diff --git a/sft/Full_new_smoe_sigmoidgating/added_tokens.json b/sft/Full_new_smoe_sigmoidgating/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/added_tokens.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97409ed874967d8d79c126c028d286e8fe8e1484 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/generation_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/latest b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/latest new file mode 100644 index 0000000000000000000000000000000000000000..41750eb6d66126b6023d0560fd5c7875c0706774 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/latest @@ -0,0 +1 @@ +global_step13312 \ No newline at end of file diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00001-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..79e7b64592f8e92b702ee82d2b70de03a02a9e44 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c807a158404fa1a6a9b83572b3d380af7bb8173d75f220f24592137cdf2193b +size 4972489328 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00002-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c5ece23cc9b5e18ffafef2adacbdac39fd07ed99 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b12ae6b60b0ab8fa2473ae22b2cce411e425dd88e6c4deffcc1530d07b4b3f0 +size 4985529648 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00003-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ed5629d3fdbe1adba565f41e234a05c9e7c23369 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:77c2129f6b9d599d187dd66ae82563edba8626ee53a80806d9359f97026e52b2 +size 248943552 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model.safetensors.index.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..aa54419fc0a3eab502aa7c4ad974dca52ed10803 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10206819456 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_0.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1e6773e944015af0e83161fa2d20fe7d469fd7f --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22271cc36f268c0b3e870b3930ac590fd40a4a3cd3a88aed74f78e5f8790aceb +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_1.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a834a7be015ebd36883cec3bb92a8657936cd0a6 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19435e9be5d4b837d96fc2e9286e23e27344bb6ad3222ef1b9d207e6b2bb8c78 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_2.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f1b991258d274ff5481ace768d5b6702d919d50 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2902ec572b1b2f1a6a78f8979353bf31953eacdc78b129cc34a9f04c1de9b8d5 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_3.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee742fbd21912a77c2d25fe5ca60af4403668637 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a011e80ba323d1fcabf31eaea4d2bc397efadb23603b4248f0067ff8ca3987 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/special_tokens_map.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/tokenizer.model b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/tokenizer_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/trainer_state.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6e8c166c161de01d07f78577b0ff23c90355bc63 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/trainer_state.json @@ -0,0 +1,226337 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.8003607395160078, + "eval_steps": 500, + "global_step": 13312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05015663, + "auxiliary_loss_mlp": 0.02215404, + "balance_loss_clip": 1.76946592, + "balance_loss_mlp": 2.42247009, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.31846269900138, + "language_loss": 2.84849024, + "learning_rate": 0.0, + "loss": 1.94356799, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 26.0, + "step": 1, + "time_per_iteration": 14.062297821044922 + }, + { + "auxiliary_loss_clip": 0.03371575, + "auxiliary_loss_mlp": 0.01459085, + "balance_loss_clip": 1.18919563, + "balance_loss_mlp": 1.61943495, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.71678092445231, + "language_loss": 1.82690942, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87521601, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 17.5, + "step": 2, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.03311525, + "auxiliary_loss_mlp": 0.014397, + "balance_loss_clip": 1.18697679, + "balance_loss_mlp": 1.61685562, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 34.59102075188436, + "language_loss": 1.57529902, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62281132, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 17.0, + "step": 3, + "time_per_iteration": 2.4145541191101074 + }, + { + "auxiliary_loss_clip": 0.03353861, + "auxiliary_loss_mlp": 0.01449549, + "balance_loss_clip": 1.15390992, + "balance_loss_mlp": 1.61571431, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.728740512395206, + "language_loss": 1.67595887, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72399294, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.375, + "step": 4, + "time_per_iteration": 2.466392993927002 + }, + { + "auxiliary_loss_clip": 0.03393634, + "auxiliary_loss_mlp": 0.01505687, + "balance_loss_clip": 1.21710527, + "balance_loss_mlp": 1.61638641, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.74196654651921, + "language_loss": 1.90851176, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95750499, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 17.75, + "step": 5, + "time_per_iteration": 2.6828246116638184 + }, + { + "auxiliary_loss_clip": 0.03361898, + "auxiliary_loss_mlp": 0.01518906, + "balance_loss_clip": 1.22441149, + "balance_loss_mlp": 1.60614848, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.32400799743486, + "language_loss": 1.6094954, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6583035, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.5, + "step": 6, + "time_per_iteration": 2.660855770111084 + }, + { + "auxiliary_loss_clip": 0.03345758, + "auxiliary_loss_mlp": 0.01485904, + "balance_loss_clip": 1.20209074, + "balance_loss_mlp": 1.60783124, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.76365346454933, + "language_loss": 1.53346825, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58178496, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.375, + "step": 7, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.03312894, + "auxiliary_loss_mlp": 0.01444018, + "balance_loss_clip": 1.16630852, + "balance_loss_mlp": 1.60320723, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.923588970831496, + "language_loss": 1.43687642, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48444545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 17.0, + "step": 8, + "time_per_iteration": 2.779961109161377 + }, + { + "auxiliary_loss_clip": 0.03360351, + "auxiliary_loss_mlp": 0.01496215, + "balance_loss_clip": 1.21144783, + "balance_loss_mlp": 1.60258842, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.084887526361417, + "language_loss": 1.49955618, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54812181, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.5, + "step": 9, + "time_per_iteration": 2.799635887145996 + }, + { + "auxiliary_loss_clip": 0.03302188, + "auxiliary_loss_mlp": 0.01477479, + "balance_loss_clip": 1.20797062, + "balance_loss_mlp": 1.6070832, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.45187310710616, + "language_loss": 1.44727731, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49507403, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 17.0, + "step": 10, + "time_per_iteration": 2.6989152431488037 + }, + { + "auxiliary_loss_clip": 0.03356835, + "auxiliary_loss_mlp": 0.01493566, + "balance_loss_clip": 1.21928966, + "balance_loss_mlp": 1.61121845, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.63867113279811, + "language_loss": 1.45021069, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.4987148, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 17.5, + "step": 11, + "time_per_iteration": 2.6820693016052246 + }, + { + "auxiliary_loss_clip": 0.0328584, + "auxiliary_loss_mlp": 0.01449969, + "balance_loss_clip": 1.17378449, + "balance_loss_mlp": 1.59900761, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.861449854609447, + "language_loss": 1.45122719, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49858522, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 16.875, + "step": 12, + "time_per_iteration": 2.631218910217285 + }, + { + "auxiliary_loss_clip": 0.03313605, + "auxiliary_loss_mlp": 0.01404342, + "balance_loss_clip": 1.14589679, + "balance_loss_mlp": 1.60898232, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.176593153687291, + "language_loss": 1.24100113, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28818083, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 17.125, + "step": 13, + "time_per_iteration": 2.6961779594421387 + }, + { + "auxiliary_loss_clip": 0.03282163, + "auxiliary_loss_mlp": 0.01472629, + "balance_loss_clip": 1.20464635, + "balance_loss_mlp": 1.60534358, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.7580183597057975, + "language_loss": 1.20611417, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25366211, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 16.75, + "step": 14, + "time_per_iteration": 2.6555092334747314 + }, + { + "auxiliary_loss_clip": 0.0326835, + "auxiliary_loss_mlp": 0.01431945, + "balance_loss_clip": 1.16815877, + "balance_loss_mlp": 1.6104542, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.4839782289009085, + "language_loss": 1.12832427, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.1753273, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 16.5, + "step": 15, + "time_per_iteration": 2.717512607574463 + }, + { + "auxiliary_loss_clip": 0.03231722, + "auxiliary_loss_mlp": 0.01412441, + "balance_loss_clip": 1.16257811, + "balance_loss_mlp": 1.59521294, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.584872954405151, + "language_loss": 1.1119349, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15837646, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 16.375, + "step": 16, + "time_per_iteration": 2.7170701026916504 + }, + { + "auxiliary_loss_clip": 0.03220058, + "auxiliary_loss_mlp": 0.0141779, + "balance_loss_clip": 1.17784595, + "balance_loss_mlp": 1.60289145, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.285773165398426, + "language_loss": 1.1253047, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17168307, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 16.125, + "step": 17, + "time_per_iteration": 2.6125564575195312 + }, + { + "auxiliary_loss_clip": 0.0315575, + "auxiliary_loss_mlp": 0.01378857, + "balance_loss_clip": 1.14730477, + "balance_loss_mlp": 1.60051179, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.8094646515897193, + "language_loss": 1.08149433, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12684035, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 15.5625, + "step": 18, + "time_per_iteration": 5.593315362930298 + }, + { + "auxiliary_loss_clip": 0.03181327, + "auxiliary_loss_mlp": 0.01400224, + "balance_loss_clip": 1.13548398, + "balance_loss_mlp": 1.59901524, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.551402579460018, + "language_loss": 1.02296436, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06877995, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 15.8125, + "step": 19, + "time_per_iteration": 2.6462903022766113 + }, + { + "auxiliary_loss_clip": 0.0312444, + "auxiliary_loss_mlp": 0.01341166, + "balance_loss_clip": 1.12096262, + "balance_loss_mlp": 1.60122275, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.049985155187145, + "language_loss": 1.16660511, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21126115, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 15.25, + "step": 20, + "time_per_iteration": 2.687962293624878 + }, + { + "auxiliary_loss_clip": 0.03111088, + "auxiliary_loss_mlp": 0.01380381, + "balance_loss_clip": 1.13109064, + "balance_loss_mlp": 1.58184814, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 8.855966691950416, + "language_loss": 1.06044388, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.1053586, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 15.3125, + "step": 21, + "time_per_iteration": 2.705784320831299 + }, + { + "auxiliary_loss_clip": 0.03006166, + "auxiliary_loss_mlp": 0.0138104, + "balance_loss_clip": 1.14758062, + "balance_loss_mlp": 1.56386232, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.503731577984969, + "language_loss": 1.05752254, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10139465, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 14.4375, + "step": 22, + "time_per_iteration": 2.714902400970459 + }, + { + "auxiliary_loss_clip": 0.02958535, + "auxiliary_loss_mlp": 0.01337723, + "balance_loss_clip": 1.12743819, + "balance_loss_mlp": 1.56545472, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8887485842740657, + "language_loss": 0.91820848, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96117103, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 13.9375, + "step": 23, + "time_per_iteration": 2.6802501678466797 + }, + { + "auxiliary_loss_clip": 0.02925568, + "auxiliary_loss_mlp": 0.0136327, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.55789983, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.888412626700388, + "language_loss": 1.08090949, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12379789, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 13.6875, + "step": 24, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.02818042, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.11892343, + "balance_loss_mlp": 1.55278993, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.5526652768314877, + "language_loss": 1.01197755, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05345201, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 12.6875, + "step": 25, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.02811065, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 1.10196424, + "balance_loss_mlp": 1.55557573, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8866965715457127, + "language_loss": 1.0650332, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10625291, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 12.5625, + "step": 26, + "time_per_iteration": 2.6561954021453857 + }, + { + "auxiliary_loss_clip": 0.02754337, + "auxiliary_loss_mlp": 0.01325989, + "balance_loss_clip": 1.12600398, + "balance_loss_mlp": 1.54593086, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 8.480879524297928, + "language_loss": 0.95465469, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99545801, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 12.0625, + "step": 27, + "time_per_iteration": 2.717332363128662 + }, + { + "auxiliary_loss_clip": 0.02732017, + "auxiliary_loss_mlp": 0.0131313, + "balance_loss_clip": 1.13174081, + "balance_loss_mlp": 1.55085063, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7582152185230338, + "language_loss": 1.06276608, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1032176, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 11.8125, + "step": 28, + "time_per_iteration": 2.6645846366882324 + }, + { + "auxiliary_loss_clip": 0.02698877, + "auxiliary_loss_mlp": 0.01319704, + "balance_loss_clip": 1.1339283, + "balance_loss_mlp": 1.5357703, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.703793609192777, + "language_loss": 1.02653611, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06672192, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 11.625, + "step": 29, + "time_per_iteration": 2.6647088527679443 + }, + { + "auxiliary_loss_clip": 0.02692806, + "auxiliary_loss_mlp": 0.01313595, + "balance_loss_clip": 1.12667465, + "balance_loss_mlp": 1.53252506, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.824163422844594, + "language_loss": 1.1929419, + "learning_rate": 2.189868360711334e-06, + "loss": 1.233006, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 11.625, + "step": 30, + "time_per_iteration": 2.6305816173553467 + }, + { + "auxiliary_loss_clip": 0.02610821, + "auxiliary_loss_mlp": 0.01338782, + "balance_loss_clip": 1.15748882, + "balance_loss_mlp": 1.51829374, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.55861683808779, + "language_loss": 1.02499342, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06448936, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 10.9375, + "step": 31, + "time_per_iteration": 2.71045184135437 + }, + { + "auxiliary_loss_clip": 0.02583705, + "auxiliary_loss_mlp": 0.01332414, + "balance_loss_clip": 1.15245557, + "balance_loss_mlp": 1.52035046, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.526137445187824, + "language_loss": 0.95697796, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99613917, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 10.625, + "step": 32, + "time_per_iteration": 2.626783847808838 + }, + { + "auxiliary_loss_clip": 0.02566919, + "auxiliary_loss_mlp": 0.01304168, + "balance_loss_clip": 1.13670313, + "balance_loss_mlp": 1.51655078, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.344933729659458, + "language_loss": 0.95465255, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99336338, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 10.5, + "step": 33, + "time_per_iteration": 2.645725727081299 + }, + { + "auxiliary_loss_clip": 0.02433039, + "auxiliary_loss_mlp": 0.013041, + "balance_loss_clip": 1.14569449, + "balance_loss_mlp": 1.48877192, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 4.808068329548225, + "language_loss": 0.91556877, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95294011, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 9.4375, + "step": 34, + "time_per_iteration": 2.7327146530151367 + }, + { + "auxiliary_loss_clip": 0.02385913, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.1172576, + "balance_loss_mlp": 1.45172572, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.948252640490764, + "language_loss": 0.76639408, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80298984, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 9.375, + "step": 35, + "time_per_iteration": 2.940739870071411 + }, + { + "auxiliary_loss_clip": 0.02360979, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 1.12769413, + "balance_loss_mlp": 1.46427846, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.1659182072135064, + "language_loss": 0.89043307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92678845, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 8.9375, + "step": 36, + "time_per_iteration": 2.693335771560669 + }, + { + "auxiliary_loss_clip": 0.02305413, + "auxiliary_loss_mlp": 0.01335093, + "balance_loss_clip": 1.18574798, + "balance_loss_mlp": 1.45221901, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.3248653771669416, + "language_loss": 0.93231332, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96871841, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 8.5, + "step": 37, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.02264412, + "auxiliary_loss_mlp": 0.01277806, + "balance_loss_clip": 1.15373349, + "balance_loss_mlp": 1.44697845, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1191864106647906, + "language_loss": 1.04275775, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07817996, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 8.1875, + "step": 38, + "time_per_iteration": 2.674187183380127 + }, + { + "auxiliary_loss_clip": 0.02234117, + "auxiliary_loss_mlp": 0.01257339, + "balance_loss_clip": 1.13164425, + "balance_loss_mlp": 1.44101977, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.2707505194681685, + "language_loss": 0.85635245, + "learning_rate": 2.358792165262154e-06, + "loss": 0.891267, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 7.9375, + "step": 39, + "time_per_iteration": 2.716417074203491 + }, + { + "auxiliary_loss_clip": 0.02209554, + "auxiliary_loss_mlp": 0.01248677, + "balance_loss_clip": 1.1173557, + "balance_loss_mlp": 1.43176007, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.874633531970748, + "language_loss": 0.90416026, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93874258, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 7.78125, + "step": 40, + "time_per_iteration": 2.621108055114746 + }, + { + "auxiliary_loss_clip": 0.02158681, + "auxiliary_loss_mlp": 0.01271709, + "balance_loss_clip": 1.15626693, + "balance_loss_mlp": 1.42207694, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.842521317695652, + "language_loss": 0.93497038, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96927428, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 7.375, + "step": 41, + "time_per_iteration": 2.66089129447937 + }, + { + "auxiliary_loss_clip": 0.0212207, + "auxiliary_loss_mlp": 0.0125263, + "balance_loss_clip": 1.14720106, + "balance_loss_mlp": 1.41368401, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.5963223670672635, + "language_loss": 0.97454929, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00829637, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 7.09375, + "step": 42, + "time_per_iteration": 2.63149094581604 + }, + { + "auxiliary_loss_clip": 0.02082851, + "auxiliary_loss_mlp": 0.01298258, + "balance_loss_clip": 1.18939614, + "balance_loss_mlp": 1.41430426, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.9545418034556814, + "language_loss": 0.97656071, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01037169, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 6.6875, + "step": 43, + "time_per_iteration": 2.7244436740875244 + }, + { + "auxiliary_loss_clip": 0.02102024, + "auxiliary_loss_mlp": 0.01311792, + "balance_loss_clip": 1.19706488, + "balance_loss_mlp": 1.4130851, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0531245010632473, + "language_loss": 0.93701768, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97115582, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 6.875, + "step": 44, + "time_per_iteration": 2.6628317832946777 + }, + { + "auxiliary_loss_clip": 0.02065563, + "auxiliary_loss_mlp": 0.01272457, + "balance_loss_clip": 1.17236853, + "balance_loss_mlp": 1.41084957, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 9.3374631511207, + "language_loss": 0.98937047, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02275062, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 6.5625, + "step": 45, + "time_per_iteration": 2.7355775833129883 + }, + { + "auxiliary_loss_clip": 0.02040064, + "auxiliary_loss_mlp": 0.01227769, + "balance_loss_clip": 1.13831401, + "balance_loss_mlp": 1.39673805, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8055823424878037, + "language_loss": 1.02792716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06060553, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 6.4375, + "step": 46, + "time_per_iteration": 2.7488839626312256 + }, + { + "auxiliary_loss_clip": 0.02002379, + "auxiliary_loss_mlp": 0.01270193, + "balance_loss_clip": 1.17773402, + "balance_loss_mlp": 1.38648152, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.971366079361506, + "language_loss": 0.88043427, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91315997, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 6.15625, + "step": 47, + "time_per_iteration": 2.845005512237549 + }, + { + "auxiliary_loss_clip": 0.01963914, + "auxiliary_loss_mlp": 0.01252908, + "balance_loss_clip": 1.16493094, + "balance_loss_mlp": 1.37624073, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.070099145794898, + "language_loss": 0.87949276, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91166103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 5.875, + "step": 48, + "time_per_iteration": 2.7514398097991943 + }, + { + "auxiliary_loss_clip": 0.01962956, + "auxiliary_loss_mlp": 0.01244481, + "balance_loss_clip": 1.15078259, + "balance_loss_mlp": 1.36602139, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.366138839739612, + "language_loss": 0.89877701, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93085134, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 6.0, + "step": 49, + "time_per_iteration": 2.743236541748047 + }, + { + "auxiliary_loss_clip": 0.01955947, + "auxiliary_loss_mlp": 0.01232227, + "balance_loss_clip": 1.14534748, + "balance_loss_mlp": 1.36045313, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.8158483763506914, + "language_loss": 0.91078663, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94266832, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 5.9375, + "step": 50, + "time_per_iteration": 2.6860456466674805 + }, + { + "auxiliary_loss_clip": 0.01953364, + "auxiliary_loss_mlp": 0.01201227, + "balance_loss_clip": 1.11778045, + "balance_loss_mlp": 1.36547732, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 3.5299735782100026, + "language_loss": 0.87144494, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90299082, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 5.875, + "step": 51, + "time_per_iteration": 2.7481534481048584 + }, + { + "auxiliary_loss_clip": 0.01909154, + "auxiliary_loss_mlp": 0.01207037, + "balance_loss_clip": 1.12707186, + "balance_loss_mlp": 1.35597348, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0262044932375836, + "language_loss": 0.95253396, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98369586, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 5.53125, + "step": 52, + "time_per_iteration": 2.8958797454833984 + }, + { + "auxiliary_loss_clip": 0.01904814, + "auxiliary_loss_mlp": 0.01243661, + "balance_loss_clip": 1.16274214, + "balance_loss_mlp": 1.35173535, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.3193539013945546, + "language_loss": 0.92261833, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95410311, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 5.53125, + "step": 53, + "time_per_iteration": 2.7579286098480225 + }, + { + "auxiliary_loss_clip": 0.01893968, + "auxiliary_loss_mlp": 0.01196907, + "balance_loss_clip": 1.11489081, + "balance_loss_mlp": 1.35535884, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.2021865200163, + "language_loss": 0.82945669, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86036545, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 5.375, + "step": 54, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.01889572, + "auxiliary_loss_mlp": 0.01211293, + "balance_loss_clip": 1.13113666, + "balance_loss_mlp": 1.34359026, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4060188817442487, + "language_loss": 0.81305432, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84406298, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.4375, + "step": 55, + "time_per_iteration": 2.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01879346, + "auxiliary_loss_mlp": 0.01199903, + "balance_loss_clip": 1.11926973, + "balance_loss_mlp": 1.33773279, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.497299649397407, + "language_loss": 0.87261844, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90341091, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.40625, + "step": 56, + "time_per_iteration": 2.7031195163726807 + }, + { + "auxiliary_loss_clip": 0.01879922, + "auxiliary_loss_mlp": 0.01161266, + "balance_loss_clip": 1.0864507, + "balance_loss_mlp": 1.33024335, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.4089458733946882, + "language_loss": 0.92949611, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95990801, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 5.5, + "step": 57, + "time_per_iteration": 2.8580281734466553 + }, + { + "auxiliary_loss_clip": 0.01843074, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_clip": 1.14395308, + "balance_loss_mlp": 1.33453596, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.105168727735643, + "language_loss": 0.99725533, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02785611, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 5.09375, + "step": 58, + "time_per_iteration": 2.687504529953003 + }, + { + "auxiliary_loss_clip": 0.01822907, + "auxiliary_loss_mlp": 0.01195384, + "balance_loss_clip": 1.12319088, + "balance_loss_mlp": 1.32094967, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1328304194940855, + "language_loss": 0.8821373, + "learning_rate": 2.625331386578098e-06, + "loss": 0.9123202, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 5.03125, + "step": 59, + "time_per_iteration": 6.997380495071411 + }, + { + "auxiliary_loss_clip": 0.01844896, + "auxiliary_loss_mlp": 0.01162144, + "balance_loss_clip": 1.08885431, + "balance_loss_mlp": 1.32932925, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.097582115586327, + "language_loss": 0.93430054, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96437097, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 5.15625, + "step": 60, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0182307, + "auxiliary_loss_mlp": 0.01172385, + "balance_loss_clip": 1.10376787, + "balance_loss_mlp": 1.31307459, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 4.241258673484683, + "language_loss": 0.90090871, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93086326, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 5.09375, + "step": 61, + "time_per_iteration": 2.707247257232666 + }, + { + "auxiliary_loss_clip": 0.01806801, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_clip": 1.07475519, + "balance_loss_mlp": 1.31002319, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 3.0487456468745586, + "language_loss": 0.88434047, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9138341, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.96875, + "step": 62, + "time_per_iteration": 2.736107587814331 + }, + { + "auxiliary_loss_clip": 0.01787131, + "auxiliary_loss_mlp": 0.01161947, + "balance_loss_clip": 1.09132755, + "balance_loss_mlp": 1.30018497, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.6509198595432406, + "language_loss": 0.96265876, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99214947, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.875, + "step": 63, + "time_per_iteration": 2.6760194301605225 + }, + { + "auxiliary_loss_clip": 0.01795174, + "auxiliary_loss_mlp": 0.01169703, + "balance_loss_clip": 1.10284996, + "balance_loss_mlp": 1.30725491, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.677484479433752, + "language_loss": 0.99141657, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02106524, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.875, + "step": 64, + "time_per_iteration": 2.675295114517212 + }, + { + "auxiliary_loss_clip": 0.01802087, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.08134842, + "balance_loss_mlp": 1.30652797, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.45939593962701, + "language_loss": 0.85358196, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88309723, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.9375, + "step": 65, + "time_per_iteration": 2.647696018218994 + }, + { + "auxiliary_loss_clip": 0.01779034, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0886445, + "balance_loss_mlp": 1.29322505, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.8561979494145033, + "language_loss": 0.85224223, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88160038, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.875, + "step": 66, + "time_per_iteration": 2.617143392562866 + }, + { + "auxiliary_loss_clip": 0.01782156, + "auxiliary_loss_mlp": 0.01152634, + "balance_loss_clip": 1.07648349, + "balance_loss_mlp": 1.29168975, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.246759082278279, + "language_loss": 0.96454394, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99389184, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 4.90625, + "step": 67, + "time_per_iteration": 2.6343421936035156 + }, + { + "auxiliary_loss_clip": 0.01753238, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.08340704, + "balance_loss_mlp": 1.28524387, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.549207131743101, + "language_loss": 0.94534445, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97443378, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 4.6875, + "step": 68, + "time_per_iteration": 2.614696741104126 + }, + { + "auxiliary_loss_clip": 0.01748377, + "auxiliary_loss_mlp": 0.01156697, + "balance_loss_clip": 1.08717394, + "balance_loss_mlp": 1.28268003, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 1.9922029239060344, + "language_loss": 0.95657748, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98562825, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.65625, + "step": 69, + "time_per_iteration": 2.6637492179870605 + }, + { + "auxiliary_loss_clip": 0.01742428, + "auxiliary_loss_mlp": 0.01160645, + "balance_loss_clip": 1.09598637, + "balance_loss_mlp": 1.2855866, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4176731159017075, + "language_loss": 0.98073572, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00976658, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 4.5625, + "step": 70, + "time_per_iteration": 2.6395556926727295 + }, + { + "auxiliary_loss_clip": 0.01748999, + "auxiliary_loss_mlp": 0.01146397, + "balance_loss_clip": 1.07673144, + "balance_loss_mlp": 1.2760632, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.71386904393857, + "language_loss": 0.93927777, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96823174, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 4.75, + "step": 71, + "time_per_iteration": 2.628272294998169 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01327632, + "balance_loss_clip": 1.28967619, + "balance_loss_mlp": 1.43997037, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4194543250518663, + "language_loss": 0.65655279, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68821681, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 4.0, + "step": 72, + "time_per_iteration": 3.104635000228882 + }, + { + "auxiliary_loss_clip": 0.01820285, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.25824571, + "balance_loss_mlp": 1.43420911, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2482458517722455, + "language_loss": 0.63711512, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66827047, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 3.859375, + "step": 73, + "time_per_iteration": 3.208836793899536 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.07382631, + "balance_loss_mlp": 1.26790953, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.4515337577309424, + "language_loss": 0.85899854, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88765126, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.5625, + "step": 74, + "time_per_iteration": 2.6287550926208496 + }, + { + "auxiliary_loss_clip": 0.01725734, + "auxiliary_loss_mlp": 0.01165418, + "balance_loss_clip": 1.09584761, + "balance_loss_mlp": 1.26750898, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.110493434952054, + "language_loss": 0.9716984, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00060987, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.5625, + "step": 75, + "time_per_iteration": 2.635618209838867 + }, + { + "auxiliary_loss_clip": 0.01704277, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.07875705, + "balance_loss_mlp": 1.26302838, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.60583579179481, + "language_loss": 0.87675405, + "learning_rate": 2.788352117317012e-06, + "loss": 0.9052462, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.4375, + "step": 76, + "time_per_iteration": 2.6379826068878174 + }, + { + "auxiliary_loss_clip": 0.01705571, + "auxiliary_loss_mlp": 0.0114831, + "balance_loss_clip": 1.07845366, + "balance_loss_mlp": 1.26138341, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.9080158042054207, + "language_loss": 0.91751724, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94605613, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.4375, + "step": 77, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01694222, + "auxiliary_loss_mlp": 0.01165235, + "balance_loss_clip": 1.09494948, + "balance_loss_mlp": 1.26167083, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1229280552318803, + "language_loss": 0.92189825, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95049286, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.3125, + "step": 78, + "time_per_iteration": 2.598590850830078 + }, + { + "auxiliary_loss_clip": 0.01690635, + "auxiliary_loss_mlp": 0.01155594, + "balance_loss_clip": 1.08735824, + "balance_loss_mlp": 1.25696921, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.280813483182965, + "language_loss": 0.82480371, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85326606, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 4.34375, + "step": 79, + "time_per_iteration": 2.6215708255767822 + }, + { + "auxiliary_loss_clip": 0.01705122, + "auxiliary_loss_mlp": 0.01133248, + "balance_loss_clip": 1.06315339, + "balance_loss_mlp": 1.26029253, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.4809717100134616, + "language_loss": 0.91311121, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94149494, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.4375, + "step": 80, + "time_per_iteration": 2.639841079711914 + }, + { + "auxiliary_loss_clip": 0.01674552, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.07254159, + "balance_loss_mlp": 1.25350285, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.165091554789383, + "language_loss": 0.94981706, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97799134, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.21875, + "step": 81, + "time_per_iteration": 2.6689717769622803 + }, + { + "auxiliary_loss_clip": 0.01688803, + "auxiliary_loss_mlp": 0.01148831, + "balance_loss_clip": 1.08269382, + "balance_loss_mlp": 1.25745821, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9914678747629226, + "language_loss": 0.96341741, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99179375, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 4.3125, + "step": 82, + "time_per_iteration": 2.629596471786499 + }, + { + "auxiliary_loss_clip": 0.01671229, + "auxiliary_loss_mlp": 0.01159801, + "balance_loss_clip": 1.09013557, + "balance_loss_mlp": 1.24528587, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.533591741594043, + "language_loss": 0.8664127, + "learning_rate": 2.84508017388607e-06, + "loss": 0.894723, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.25, + "step": 83, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01664198, + "auxiliary_loss_mlp": 0.01156919, + "balance_loss_clip": 1.08663368, + "balance_loss_mlp": 1.24647975, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.373799694341511, + "language_loss": 0.91779828, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94600952, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.1875, + "step": 84, + "time_per_iteration": 2.62187123298645 + }, + { + "auxiliary_loss_clip": 0.01645783, + "auxiliary_loss_mlp": 0.01205663, + "balance_loss_clip": 1.17075825, + "balance_loss_mlp": 1.34984684, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4266053341540552, + "language_loss": 0.62504542, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65355992, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.96875, + "step": 85, + "time_per_iteration": 3.190223217010498 + }, + { + "auxiliary_loss_clip": 0.0165122, + "auxiliary_loss_mlp": 0.01127154, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.23674285, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7428139018461835, + "language_loss": 0.90836501, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93614876, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.15625, + "step": 86, + "time_per_iteration": 2.66162109375 + }, + { + "auxiliary_loss_clip": 0.01655877, + "auxiliary_loss_mlp": 0.01161945, + "balance_loss_clip": 1.09065783, + "balance_loss_mlp": 1.24282312, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.38275425723773, + "language_loss": 0.8209877, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.125, + "step": 87, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.01644726, + "auxiliary_loss_mlp": 0.01154792, + "balance_loss_clip": 1.08617568, + "balance_loss_mlp": 1.24127626, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8918921085406437, + "language_loss": 0.95630223, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98429739, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 4.03125, + "step": 88, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01661198, + "auxiliary_loss_mlp": 0.0114963, + "balance_loss_clip": 1.08230066, + "balance_loss_mlp": 1.24101663, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1.9438908009999392, + "language_loss": 0.85920149, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88730979, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.1875, + "step": 89, + "time_per_iteration": 2.6486849784851074 + }, + { + "auxiliary_loss_clip": 0.01648909, + "auxiliary_loss_mlp": 0.01132231, + "balance_loss_clip": 1.06547391, + "balance_loss_mlp": 1.23491406, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 4.519706664825811, + "language_loss": 0.91517568, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94298708, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 4.125, + "step": 90, + "time_per_iteration": 2.658997058868408 + }, + { + "auxiliary_loss_clip": 0.01630542, + "auxiliary_loss_mlp": 0.0113282, + "balance_loss_clip": 1.06496572, + "balance_loss_mlp": 1.23102689, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.2090932400382486, + "language_loss": 0.8587057, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88633931, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 3.984375, + "step": 91, + "time_per_iteration": 2.619231939315796 + }, + { + "auxiliary_loss_clip": 0.01629785, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_clip": 1.07458866, + "balance_loss_mlp": 1.22673059, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.769705373909222, + "language_loss": 0.86930025, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89700729, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.03125, + "step": 92, + "time_per_iteration": 2.646968126296997 + }, + { + "auxiliary_loss_clip": 0.01621216, + "auxiliary_loss_mlp": 0.01179948, + "balance_loss_clip": 1.1122849, + "balance_loss_mlp": 1.21872091, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.5030178409929, + "language_loss": 0.92042911, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94844079, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 4.03125, + "step": 93, + "time_per_iteration": 2.59853196144104 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01152159, + "balance_loss_clip": 1.08120561, + "balance_loss_mlp": 1.22512126, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.2071592078672198, + "language_loss": 0.87372428, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90158784, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.09375, + "step": 94, + "time_per_iteration": 2.587707281112671 + }, + { + "auxiliary_loss_clip": 0.01562532, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.03243279, + "balance_loss_mlp": 1.30452466, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3851210442303683, + "language_loss": 0.6813519, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70765626, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.578125, + "step": 95, + "time_per_iteration": 3.067047595977783 + }, + { + "auxiliary_loss_clip": 0.01611383, + "auxiliary_loss_mlp": 0.01154317, + "balance_loss_clip": 1.08693981, + "balance_loss_mlp": 1.21303511, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.5109536438971976, + "language_loss": 0.89978027, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92743719, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 3.984375, + "step": 96, + "time_per_iteration": 2.590522289276123 + }, + { + "auxiliary_loss_clip": 0.01603776, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.08108413, + "balance_loss_mlp": 1.21597803, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.825781473558237, + "language_loss": 0.89798892, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92545933, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.875, + "step": 97, + "time_per_iteration": 2.630364179611206 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_clip": 1.07103181, + "balance_loss_mlp": 1.20754981, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.1954130163748573, + "language_loss": 0.76553786, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79283404, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.8125, + "step": 98, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01531856, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.01250362, + "balance_loss_mlp": 1.28449416, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0389188302362988, + "language_loss": 0.65464473, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68043554, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.46875, + "step": 99, + "time_per_iteration": 3.196779251098633 + }, + { + "auxiliary_loss_clip": 0.0159215, + "auxiliary_loss_mlp": 0.01143603, + "balance_loss_clip": 1.07312632, + "balance_loss_mlp": 1.20754516, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02393591458392, + "language_loss": 0.90861535, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93597281, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 3.84375, + "step": 100, + "time_per_iteration": 2.659716844558716 + }, + { + "auxiliary_loss_clip": 0.01602583, + "auxiliary_loss_mlp": 0.01150362, + "balance_loss_clip": 1.08360529, + "balance_loss_mlp": 1.21008992, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 9.149928686451464, + "language_loss": 0.91165614, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93918556, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 3.921875, + "step": 101, + "time_per_iteration": 5.522722959518433 + }, + { + "auxiliary_loss_clip": 0.01592164, + "auxiliary_loss_mlp": 0.01153598, + "balance_loss_clip": 1.08273995, + "balance_loss_mlp": 1.21078956, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.149611483260168, + "language_loss": 0.90634245, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.9338001, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.8125, + "step": 102, + "time_per_iteration": 2.7264201641082764 + }, + { + "auxiliary_loss_clip": 0.01586171, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.06986046, + "balance_loss_mlp": 1.20794034, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.4455555336324135, + "language_loss": 0.87990314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.9071129, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 3.78125, + "step": 103, + "time_per_iteration": 2.6332345008850098 + }, + { + "auxiliary_loss_clip": 0.01586169, + "auxiliary_loss_mlp": 0.01136721, + "balance_loss_clip": 1.07015502, + "balance_loss_mlp": 1.2100153, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9653003456434248, + "language_loss": 0.93796182, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96519077, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.765625, + "step": 104, + "time_per_iteration": 2.5763180255889893 + }, + { + "auxiliary_loss_clip": 0.01576682, + "auxiliary_loss_mlp": 0.01148107, + "balance_loss_clip": 1.08382916, + "balance_loss_mlp": 1.20004964, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.978383813748495, + "language_loss": 0.96302718, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99027503, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.765625, + "step": 105, + "time_per_iteration": 2.598074197769165 + }, + { + "auxiliary_loss_clip": 0.01576054, + "auxiliary_loss_mlp": 0.01157995, + "balance_loss_clip": 1.08618331, + "balance_loss_mlp": 1.20040035, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.254409296180574, + "language_loss": 0.86981636, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89715683, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 3.75, + "step": 106, + "time_per_iteration": 2.620400905609131 + }, + { + "auxiliary_loss_clip": 0.01558878, + "auxiliary_loss_mlp": 0.01142953, + "balance_loss_clip": 1.07462192, + "balance_loss_mlp": 1.18650925, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.299900982703377, + "language_loss": 0.8342824, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86130083, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 3.71875, + "step": 107, + "time_per_iteration": 2.6031439304351807 + }, + { + "auxiliary_loss_clip": 0.01473949, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01294351, + "balance_loss_mlp": 1.24969411, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9921074222226888, + "language_loss": 0.64829654, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67348593, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.25, + "step": 108, + "time_per_iteration": 3.1797876358032227 + }, + { + "auxiliary_loss_clip": 0.01549803, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_clip": 1.0634706, + "balance_loss_mlp": 1.18794155, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 3.0292528917398895, + "language_loss": 0.97705221, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00387263, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.625, + "step": 109, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01548304, + "auxiliary_loss_mlp": 0.01143686, + "balance_loss_clip": 1.07759643, + "balance_loss_mlp": 1.18955791, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7037490209774204, + "language_loss": 0.84119976, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86811972, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 110, + "time_per_iteration": 2.612900495529175 + }, + { + "auxiliary_loss_clip": 0.01543027, + "auxiliary_loss_mlp": 0.01148771, + "balance_loss_clip": 1.08287191, + "balance_loss_mlp": 1.18348098, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.0686651571732186, + "language_loss": 0.83053756, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85745549, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 111, + "time_per_iteration": 2.648775815963745 + }, + { + "auxiliary_loss_clip": 0.01543945, + "auxiliary_loss_mlp": 0.01132291, + "balance_loss_clip": 1.06906247, + "balance_loss_mlp": 1.18600404, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9360906695559799, + "language_loss": 0.94064176, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96740413, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.59375, + "step": 112, + "time_per_iteration": 2.5952305793762207 + }, + { + "auxiliary_loss_clip": 0.01547241, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.07342076, + "balance_loss_mlp": 1.18214464, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4119047199233594, + "language_loss": 0.79298341, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81983036, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.65625, + "step": 113, + "time_per_iteration": 2.524744987487793 + }, + { + "auxiliary_loss_clip": 0.01535171, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.06460583, + "balance_loss_mlp": 1.1784718, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.1108584765070924, + "language_loss": 0.93168736, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95834035, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 3.5625, + "step": 114, + "time_per_iteration": 2.6716785430908203 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.01138267, + "balance_loss_clip": 1.07828045, + "balance_loss_mlp": 1.17785645, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.266348661789013, + "language_loss": 0.94440514, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97120523, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.640625, + "step": 115, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01536673, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.052122, + "balance_loss_mlp": 1.1758287, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 12.665326776351556, + "language_loss": 0.81903678, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84558797, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.609375, + "step": 116, + "time_per_iteration": 2.577003240585327 + }, + { + "auxiliary_loss_clip": 0.01526673, + "auxiliary_loss_mlp": 0.01127935, + "balance_loss_clip": 1.06375241, + "balance_loss_mlp": 1.17504787, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.0071741256932794, + "language_loss": 0.88063896, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90718508, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.515625, + "step": 117, + "time_per_iteration": 2.611503839492798 + }, + { + "auxiliary_loss_clip": 0.01525448, + "auxiliary_loss_mlp": 0.01143736, + "balance_loss_clip": 1.07840896, + "balance_loss_mlp": 1.17308259, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5473368597875594, + "language_loss": 0.84470415, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87139601, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 3.53125, + "step": 118, + "time_per_iteration": 2.577461004257202 + }, + { + "auxiliary_loss_clip": 0.01536798, + "auxiliary_loss_mlp": 0.01163532, + "balance_loss_clip": 1.09930205, + "balance_loss_mlp": 1.1748507, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.057592918726277, + "language_loss": 0.99470234, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02170563, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.625, + "step": 119, + "time_per_iteration": 2.549661636352539 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.0111939, + "balance_loss_clip": 1.05701971, + "balance_loss_mlp": 1.16968298, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.410205702357196, + "language_loss": 0.89085704, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91742492, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.6875, + "step": 120, + "time_per_iteration": 2.583630084991455 + }, + { + "auxiliary_loss_clip": 0.01524337, + "auxiliary_loss_mlp": 0.01130091, + "balance_loss_clip": 1.06667209, + "balance_loss_mlp": 1.17169607, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.904240324338801, + "language_loss": 0.93491054, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145487, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.53125, + "step": 121, + "time_per_iteration": 2.6146788597106934 + }, + { + "auxiliary_loss_clip": 0.01523412, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_clip": 1.08382273, + "balance_loss_mlp": 1.17073464, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.352658173167552, + "language_loss": 0.90176952, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92846411, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.53125, + "step": 122, + "time_per_iteration": 2.566470146179199 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.07634664, + "balance_loss_mlp": 1.16606736, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.7249964127160764, + "language_loss": 0.92516506, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95179617, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.546875, + "step": 123, + "time_per_iteration": 2.6002941131591797 + }, + { + "auxiliary_loss_clip": 0.01517776, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.06433022, + "balance_loss_mlp": 1.1609534, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 7.583203404073904, + "language_loss": 0.71128142, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73771715, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.5625, + "step": 124, + "time_per_iteration": 2.79618763923645 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01124615, + "balance_loss_clip": 1.06157708, + "balance_loss_mlp": 1.16223335, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.4227692366027855, + "language_loss": 0.88482195, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9111228, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.4375, + "step": 125, + "time_per_iteration": 2.6131536960601807 + }, + { + "auxiliary_loss_clip": 0.0152071, + "auxiliary_loss_mlp": 0.01140137, + "balance_loss_clip": 1.07762396, + "balance_loss_mlp": 1.16211164, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.993097477973623, + "language_loss": 0.82384819, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.8504566, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.59375, + "step": 126, + "time_per_iteration": 2.595423936843872 + }, + { + "auxiliary_loss_clip": 0.01514354, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_clip": 1.077981, + "balance_loss_mlp": 1.16128385, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.7264016399601534, + "language_loss": 0.67276633, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69930243, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 3.53125, + "step": 127, + "time_per_iteration": 2.620950937271118 + }, + { + "auxiliary_loss_clip": 0.01504536, + "auxiliary_loss_mlp": 0.01128822, + "balance_loss_clip": 1.06640375, + "balance_loss_mlp": 1.16422939, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.6226937306152496, + "language_loss": 0.8815757, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90790927, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 128, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.01505804, + "auxiliary_loss_mlp": 0.01141266, + "balance_loss_clip": 1.07870471, + "balance_loss_mlp": 1.15920687, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.875185485357673, + "language_loss": 0.84581351, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87228423, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.46875, + "step": 129, + "time_per_iteration": 2.611762285232544 + }, + { + "auxiliary_loss_clip": 0.01499869, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.07122934, + "balance_loss_mlp": 1.1588279, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023668494136832, + "language_loss": 0.9742806, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061572, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 130, + "time_per_iteration": 2.599639415740967 + }, + { + "auxiliary_loss_clip": 0.01493155, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_clip": 1.07109392, + "balance_loss_mlp": 1.15518749, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1876581172480285, + "language_loss": 0.82624269, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 131, + "time_per_iteration": 2.6086065769195557 + }, + { + "auxiliary_loss_clip": 0.01502593, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.15800536, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.4868851395581677, + "language_loss": 0.82762384, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85392648, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 3.4375, + "step": 132, + "time_per_iteration": 2.673790454864502 + }, + { + "auxiliary_loss_clip": 0.01493849, + "auxiliary_loss_mlp": 0.01128197, + "balance_loss_clip": 1.06716144, + "balance_loss_mlp": 1.15264463, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.7432419346617443, + "language_loss": 0.95486552, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98108596, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.40625, + "step": 133, + "time_per_iteration": 2.6287872791290283 + }, + { + "auxiliary_loss_clip": 0.01490198, + "auxiliary_loss_mlp": 0.01125526, + "balance_loss_clip": 1.06725681, + "balance_loss_mlp": 1.16143155, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7764051426707919, + "language_loss": 0.73316634, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7593236, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.296875, + "step": 134, + "time_per_iteration": 2.6728081703186035 + }, + { + "auxiliary_loss_clip": 0.01486213, + "auxiliary_loss_mlp": 0.01130543, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.14955854, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.090234736760587, + "language_loss": 0.88808328, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91425079, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 135, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.06789732, + "balance_loss_mlp": 1.15456343, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.008171494368998, + "language_loss": 0.89123899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9174456, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.34375, + "step": 136, + "time_per_iteration": 2.555936813354492 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.01108223, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.14870429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 5.8712537379963345, + "language_loss": 0.8400104, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86595905, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.375, + "step": 137, + "time_per_iteration": 2.6225337982177734 + }, + { + "auxiliary_loss_clip": 0.01482624, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.05796409, + "balance_loss_mlp": 1.14842129, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.6861384534946333, + "language_loss": 0.90170664, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9276967, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.34375, + "step": 138, + "time_per_iteration": 2.653205156326294 + }, + { + "auxiliary_loss_clip": 0.01472312, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.0568912, + "balance_loss_mlp": 1.1478796, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.679342832062188, + "language_loss": 0.91253459, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93845713, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.234375, + "step": 139, + "time_per_iteration": 2.6182503700256348 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01123997, + "balance_loss_clip": 1.06229401, + "balance_loss_mlp": 1.154405, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.5553770836970675, + "language_loss": 0.85446793, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.34375, + "step": 140, + "time_per_iteration": 2.649454116821289 + }, + { + "auxiliary_loss_clip": 0.01476267, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.0594281, + "balance_loss_mlp": 1.14865911, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.379593217845822, + "language_loss": 0.84156519, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86751676, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.28125, + "step": 141, + "time_per_iteration": 2.608603000640869 + }, + { + "auxiliary_loss_clip": 0.01480312, + "auxiliary_loss_mlp": 0.01134333, + "balance_loss_clip": 1.07320273, + "balance_loss_mlp": 1.14624739, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3283494467369965, + "language_loss": 0.81387591, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84002233, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.34375, + "step": 142, + "time_per_iteration": 4.023308753967285 + }, + { + "auxiliary_loss_clip": 0.01378722, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00621629, + "balance_loss_mlp": 1.1918689, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0451783350372967, + "language_loss": 0.66831523, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69242978, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.8671875, + "step": 143, + "time_per_iteration": 4.718023777008057 + }, + { + "auxiliary_loss_clip": 0.01472184, + "auxiliary_loss_mlp": 0.0112263, + "balance_loss_clip": 1.06283474, + "balance_loss_mlp": 1.14625573, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2608538764922295, + "language_loss": 0.83954072, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86548889, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.25, + "step": 144, + "time_per_iteration": 2.5878453254699707 + }, + { + "auxiliary_loss_clip": 0.01457808, + "auxiliary_loss_mlp": 0.01111605, + "balance_loss_clip": 1.04890084, + "balance_loss_mlp": 1.13930941, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.241812154138119, + "language_loss": 0.88511693, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91081107, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.1875, + "step": 145, + "time_per_iteration": 2.586512565612793 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01124002, + "balance_loss_clip": 1.06153631, + "balance_loss_mlp": 1.14211285, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.792984011276012, + "language_loss": 0.85949898, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88549542, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.34375, + "step": 146, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01359324, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.00082254, + "balance_loss_mlp": 1.17825258, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8557738136673508, + "language_loss": 0.60047674, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62433958, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.8125, + "step": 147, + "time_per_iteration": 3.2522764205932617 + }, + { + "auxiliary_loss_clip": 0.01465546, + "auxiliary_loss_mlp": 0.01124118, + "balance_loss_clip": 1.06670642, + "balance_loss_mlp": 1.14550173, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8343461268862185, + "language_loss": 0.8454501, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87134671, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 3.203125, + "step": 148, + "time_per_iteration": 2.635499954223633 + }, + { + "auxiliary_loss_clip": 0.0147086, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_clip": 1.07914925, + "balance_loss_mlp": 1.14693797, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2581185064103404, + "language_loss": 0.88802874, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91416872, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.234375, + "step": 149, + "time_per_iteration": 2.5458836555480957 + }, + { + "auxiliary_loss_clip": 0.01466862, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.14131117, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.7760320197047097, + "language_loss": 0.93054724, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95633656, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 150, + "time_per_iteration": 2.648111343383789 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01109463, + "balance_loss_clip": 1.05391192, + "balance_loss_mlp": 1.13663483, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.9005080345968057, + "language_loss": 0.74303263, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76867104, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.171875, + "step": 151, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.014664, + "auxiliary_loss_mlp": 0.01125146, + "balance_loss_clip": 1.06735289, + "balance_loss_mlp": 1.14143276, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.6241423805649298, + "language_loss": 0.88251799, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90843344, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 152, + "time_per_iteration": 2.6034231185913086 + }, + { + "auxiliary_loss_clip": 0.01466383, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_clip": 1.0628314, + "balance_loss_mlp": 1.14757276, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.4062301864690196, + "language_loss": 0.83957756, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86545384, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 153, + "time_per_iteration": 2.6023271083831787 + }, + { + "auxiliary_loss_clip": 0.01456394, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.0765202, + "balance_loss_mlp": 1.13805962, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9441527650945287, + "language_loss": 0.89881843, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92474556, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.1875, + "step": 154, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01460439, + "auxiliary_loss_mlp": 0.01154617, + "balance_loss_clip": 1.09577537, + "balance_loss_mlp": 1.14094579, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0692323216259187, + "language_loss": 0.89471745, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92086804, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 155, + "time_per_iteration": 2.6336286067962646 + }, + { + "auxiliary_loss_clip": 0.01463585, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.05894589, + "balance_loss_mlp": 1.13895822, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 3.3077298720636255, + "language_loss": 0.86882627, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89462447, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.25, + "step": 156, + "time_per_iteration": 2.5539867877960205 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.01121969, + "balance_loss_clip": 1.06408143, + "balance_loss_mlp": 1.14298415, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4916444524903527, + "language_loss": 0.99553013, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02137065, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.1875, + "step": 157, + "time_per_iteration": 2.5249693393707275 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01139016, + "balance_loss_clip": 1.08146214, + "balance_loss_mlp": 1.1366899, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0302475566757225, + "language_loss": 0.8847568, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91060334, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.09375, + "step": 158, + "time_per_iteration": 2.6009252071380615 + }, + { + "auxiliary_loss_clip": 0.01452439, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_clip": 1.06555486, + "balance_loss_mlp": 1.13677907, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 4.310723443959545, + "language_loss": 0.86534697, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89111388, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.15625, + "step": 159, + "time_per_iteration": 2.6107394695281982 + }, + { + "auxiliary_loss_clip": 0.01442093, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.07340288, + "balance_loss_mlp": 1.13145089, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.790220267572532, + "language_loss": 0.86825597, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89400506, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.109375, + "step": 160, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01449537, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.06318271, + "balance_loss_mlp": 1.13704872, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.6107931748588893, + "language_loss": 0.91542315, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94109678, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.125, + "step": 161, + "time_per_iteration": 2.550865650177002 + }, + { + "auxiliary_loss_clip": 0.01454094, + "auxiliary_loss_mlp": 0.01109765, + "balance_loss_clip": 1.05488133, + "balance_loss_mlp": 1.13759339, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.2107920101940994, + "language_loss": 0.91690832, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94254684, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.15625, + "step": 162, + "time_per_iteration": 2.5527970790863037 + }, + { + "auxiliary_loss_clip": 0.01312712, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.00331306, + "balance_loss_mlp": 1.14560354, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2615279464106541, + "language_loss": 0.72354776, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74694741, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.671875, + "step": 163, + "time_per_iteration": 3.143763542175293 + }, + { + "auxiliary_loss_clip": 0.01440764, + "auxiliary_loss_mlp": 0.01113881, + "balance_loss_clip": 1.05804312, + "balance_loss_mlp": 1.13505006, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1923315312730374, + "language_loss": 0.8427155, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86826193, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0625, + "step": 164, + "time_per_iteration": 2.5536584854125977 + }, + { + "auxiliary_loss_clip": 0.01429878, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.04585135, + "balance_loss_mlp": 1.12637794, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.006756380443377, + "language_loss": 0.89215541, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91745919, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.03125, + "step": 165, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01434156, + "auxiliary_loss_mlp": 0.01127756, + "balance_loss_clip": 1.0692482, + "balance_loss_mlp": 1.12764359, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 6.432940691763592, + "language_loss": 0.80138129, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82700044, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.0625, + "step": 166, + "time_per_iteration": 2.6461095809936523 + }, + { + "auxiliary_loss_clip": 0.01438531, + "auxiliary_loss_mlp": 0.01125189, + "balance_loss_clip": 1.06749213, + "balance_loss_mlp": 1.13121533, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.334124726802297, + "language_loss": 0.9190954, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94473255, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.078125, + "step": 167, + "time_per_iteration": 2.655597448348999 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.07997894, + "balance_loss_mlp": 1.12960708, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.1870046541457873, + "language_loss": 0.90852308, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93417776, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 3.0, + "step": 168, + "time_per_iteration": 2.5387983322143555 + }, + { + "auxiliary_loss_clip": 0.01424973, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.05072391, + "balance_loss_mlp": 1.12456727, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 2.0495813916191077, + "language_loss": 0.87094414, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89626241, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 3.0, + "step": 169, + "time_per_iteration": 2.6448419094085693 + }, + { + "auxiliary_loss_clip": 0.01426284, + "auxiliary_loss_mlp": 0.01111393, + "balance_loss_clip": 1.05548358, + "balance_loss_mlp": 1.12704372, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 3.0203817486241973, + "language_loss": 0.84758192, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87295866, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 170, + "time_per_iteration": 2.5596489906311035 + }, + { + "auxiliary_loss_clip": 0.01435879, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.07963061, + "balance_loss_mlp": 1.12765205, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.124400250788896, + "language_loss": 0.89896494, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92468935, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.078125, + "step": 171, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01108406, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.1300813, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.015577645060998, + "language_loss": 0.88978243, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91516334, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.0, + "step": 172, + "time_per_iteration": 2.6193771362304688 + }, + { + "auxiliary_loss_clip": 0.01419105, + "auxiliary_loss_mlp": 0.01124801, + "balance_loss_clip": 1.06986928, + "balance_loss_mlp": 1.12354624, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6868779107262128, + "language_loss": 0.81148165, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83692074, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.953125, + "step": 173, + "time_per_iteration": 2.656935691833496 + }, + { + "auxiliary_loss_clip": 0.01430653, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.05496693, + "balance_loss_mlp": 1.12733519, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1134597687554244, + "language_loss": 0.82498932, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85036767, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 3.03125, + "step": 174, + "time_per_iteration": 2.6050753593444824 + }, + { + "auxiliary_loss_clip": 0.01425822, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_clip": 1.06984437, + "balance_loss_mlp": 1.12589645, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.6035215697191965, + "language_loss": 0.72699076, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75249052, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 3.0, + "step": 175, + "time_per_iteration": 2.6859946250915527 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.07571054, + "balance_loss_mlp": 1.12603855, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.402827576481816, + "language_loss": 0.98082507, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00642931, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 176, + "time_per_iteration": 2.5405664443969727 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01132631, + "balance_loss_clip": 1.08005941, + "balance_loss_mlp": 1.12270594, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3427037211777115, + "language_loss": 0.76749414, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79294884, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 177, + "time_per_iteration": 2.555553674697876 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.0507797, + "balance_loss_mlp": 1.12089574, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.4108248963401464, + "language_loss": 0.76824659, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79352522, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.015625, + "step": 178, + "time_per_iteration": 2.5799388885498047 + }, + { + "auxiliary_loss_clip": 0.01429506, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.05224717, + "balance_loss_mlp": 1.12586653, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1918052506036174, + "language_loss": 0.84004253, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86541891, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.03125, + "step": 179, + "time_per_iteration": 2.5387184619903564 + }, + { + "auxiliary_loss_clip": 0.01420983, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.05677247, + "balance_loss_mlp": 1.12062979, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.90488055395076, + "language_loss": 0.83719397, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86252916, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 180, + "time_per_iteration": 2.6149253845214844 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.06503046, + "balance_loss_mlp": 1.1226536, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.784573507260413, + "language_loss": 0.7774682, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80288756, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.0, + "step": 181, + "time_per_iteration": 2.5769712924957275 + }, + { + "auxiliary_loss_clip": 0.01417045, + "auxiliary_loss_mlp": 0.01131731, + "balance_loss_clip": 1.07732356, + "balance_loss_mlp": 1.11938787, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.1835165271024377, + "language_loss": 0.76440376, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78989148, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.96875, + "step": 182, + "time_per_iteration": 2.5641353130340576 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.01127012, + "balance_loss_clip": 1.07122183, + "balance_loss_mlp": 1.11758399, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.172025067133121, + "language_loss": 0.87377435, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89917147, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.953125, + "step": 183, + "time_per_iteration": 2.567457914352417 + }, + { + "auxiliary_loss_clip": 0.01415124, + "auxiliary_loss_mlp": 0.01114516, + "balance_loss_clip": 1.06397092, + "balance_loss_mlp": 1.1209594, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2669267607504255, + "language_loss": 0.86875558, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.953125, + "step": 184, + "time_per_iteration": 5.380701780319214 + }, + { + "auxiliary_loss_clip": 0.01411555, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.05308247, + "balance_loss_mlp": 1.12176847, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8448371257401488, + "language_loss": 0.83683228, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86202729, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.90625, + "step": 185, + "time_per_iteration": 2.5522208213806152 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.01109712, + "balance_loss_clip": 1.05253971, + "balance_loss_mlp": 1.11964798, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.4162416092451475, + "language_loss": 0.71111757, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73642373, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 186, + "time_per_iteration": 2.536498546600342 + }, + { + "auxiliary_loss_clip": 0.01416319, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.11923158, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 3.342492581434835, + "language_loss": 1.02028871, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04552388, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.96875, + "step": 187, + "time_per_iteration": 2.5189080238342285 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.05597997, + "balance_loss_mlp": 1.11834478, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.6787333311747052, + "language_loss": 0.75107503, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7762351, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.875, + "step": 188, + "time_per_iteration": 2.73420786857605 + }, + { + "auxiliary_loss_clip": 0.01292523, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01273942, + "balance_loss_mlp": 1.13387585, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7700467396195164, + "language_loss": 0.56216431, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5854305, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.5859375, + "step": 189, + "time_per_iteration": 3.176280975341797 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01121834, + "balance_loss_clip": 1.06742704, + "balance_loss_mlp": 1.1134795, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.292403028528975, + "language_loss": 0.94771594, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97296059, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.90625, + "step": 190, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01403317, + "auxiliary_loss_mlp": 0.01101291, + "balance_loss_clip": 1.04964972, + "balance_loss_mlp": 1.11493886, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.993049163405909, + "language_loss": 0.84462845, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8696745, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.875, + "step": 191, + "time_per_iteration": 2.569664716720581 + }, + { + "auxiliary_loss_clip": 0.01402316, + "auxiliary_loss_mlp": 0.01121031, + "balance_loss_clip": 1.0698905, + "balance_loss_mlp": 1.11087692, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0097697123850593, + "language_loss": 0.91439575, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93962914, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 192, + "time_per_iteration": 2.6416900157928467 + }, + { + "auxiliary_loss_clip": 0.0139743, + "auxiliary_loss_mlp": 0.01113461, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.11231375, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.095754720056515, + "language_loss": 0.86849445, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89360332, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.84375, + "step": 193, + "time_per_iteration": 2.569899797439575 + }, + { + "auxiliary_loss_clip": 0.01399232, + "auxiliary_loss_mlp": 0.01095137, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.10937476, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.446553756436178, + "language_loss": 0.92399615, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9489398, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 194, + "time_per_iteration": 2.6078743934631348 + }, + { + "auxiliary_loss_clip": 0.01405837, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.11522019, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.1413620570060052, + "language_loss": 0.89698559, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92208374, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 195, + "time_per_iteration": 2.5785820484161377 + }, + { + "auxiliary_loss_clip": 0.01400897, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.06345916, + "balance_loss_mlp": 1.11416054, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.0173579296668813, + "language_loss": 0.8577168, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88290232, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.875, + "step": 196, + "time_per_iteration": 2.5492773056030273 + }, + { + "auxiliary_loss_clip": 0.01397107, + "auxiliary_loss_mlp": 0.01106206, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.10991478, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.86264810097015, + "language_loss": 0.93367243, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95870566, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.875, + "step": 197, + "time_per_iteration": 2.5488431453704834 + }, + { + "auxiliary_loss_clip": 0.01394686, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.05781317, + "balance_loss_mlp": 1.1120131, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1872318454948045, + "language_loss": 0.79184073, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81688625, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.828125, + "step": 198, + "time_per_iteration": 2.6208834648132324 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06545901, + "balance_loss_mlp": 1.11265802, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 3.3720724842630663, + "language_loss": 0.88065112, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90571868, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.765625, + "step": 199, + "time_per_iteration": 2.5257043838500977 + }, + { + "auxiliary_loss_clip": 0.01403414, + "auxiliary_loss_mlp": 0.01121968, + "balance_loss_clip": 1.0658679, + "balance_loss_mlp": 1.11557496, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8432610551497841, + "language_loss": 0.81327617, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83853, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.875, + "step": 200, + "time_per_iteration": 2.593231201171875 + }, + { + "auxiliary_loss_clip": 0.01400536, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.11138511, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.758923223370522, + "language_loss": 0.87688923, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90190548, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.90625, + "step": 201, + "time_per_iteration": 2.5057122707366943 + }, + { + "auxiliary_loss_clip": 0.01401128, + "auxiliary_loss_mlp": 0.01110995, + "balance_loss_clip": 1.05751753, + "balance_loss_mlp": 1.1152513, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 3.7927516715708736, + "language_loss": 0.84123611, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86635733, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.859375, + "step": 202, + "time_per_iteration": 2.555680751800537 + }, + { + "auxiliary_loss_clip": 0.01388205, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.05639839, + "balance_loss_mlp": 1.10674798, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9040504717952067, + "language_loss": 0.90116632, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.926139, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.8125, + "step": 203, + "time_per_iteration": 2.526937484741211 + }, + { + "auxiliary_loss_clip": 0.01281494, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.03138971, + "balance_loss_mlp": 1.12054539, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0150955472927095, + "language_loss": 0.61259121, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63593745, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.609375, + "step": 204, + "time_per_iteration": 3.051469326019287 + }, + { + "auxiliary_loss_clip": 0.01398264, + "auxiliary_loss_mlp": 0.01111819, + "balance_loss_clip": 1.0593431, + "balance_loss_mlp": 1.11035323, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.269022633654934, + "language_loss": 0.91206741, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93716824, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.875, + "step": 205, + "time_per_iteration": 2.5105605125427246 + }, + { + "auxiliary_loss_clip": 0.01406073, + "auxiliary_loss_mlp": 0.01120568, + "balance_loss_clip": 1.06675649, + "balance_loss_mlp": 1.11524296, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.2813283317886497, + "language_loss": 0.89215505, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91742146, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.90625, + "step": 206, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05039215, + "balance_loss_mlp": 1.10848641, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.502758142715096, + "language_loss": 0.95368809, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97865611, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.859375, + "step": 207, + "time_per_iteration": 2.5147407054901123 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.06416512, + "balance_loss_mlp": 1.11335945, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.4565104125033232, + "language_loss": 0.75770479, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78280723, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.8125, + "step": 208, + "time_per_iteration": 2.5426721572875977 + }, + { + "auxiliary_loss_clip": 0.01382601, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.05497861, + "balance_loss_mlp": 1.10796773, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.79364384939249, + "language_loss": 0.98718858, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01208818, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 209, + "time_per_iteration": 2.607238292694092 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.05971253, + "balance_loss_mlp": 1.11020541, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 7.039976369418198, + "language_loss": 0.85444254, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87945753, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.78125, + "step": 210, + "time_per_iteration": 2.67632794380188 + }, + { + "auxiliary_loss_clip": 0.01385349, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.07042408, + "balance_loss_mlp": 1.1073029, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2334441604414783, + "language_loss": 0.97016168, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99521822, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.78125, + "step": 211, + "time_per_iteration": 2.5733633041381836 + }, + { + "auxiliary_loss_clip": 0.01394963, + "auxiliary_loss_mlp": 0.01114691, + "balance_loss_clip": 1.0616188, + "balance_loss_mlp": 1.11342549, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 3.6563211355425453, + "language_loss": 0.95188707, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97698367, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.8125, + "step": 212, + "time_per_iteration": 2.5224313735961914 + }, + { + "auxiliary_loss_clip": 0.01383511, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.10996664, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.0395830195466504, + "language_loss": 0.76049221, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78549099, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.734375, + "step": 213, + "time_per_iteration": 2.76625919342041 + }, + { + "auxiliary_loss_clip": 0.0138732, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.10833097, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 8.414558483522654, + "language_loss": 0.86754733, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89245206, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.78125, + "step": 214, + "time_per_iteration": 2.500417470932007 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01127788, + "balance_loss_clip": 1.07397687, + "balance_loss_mlp": 1.11549139, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3854037050744057, + "language_loss": 0.77357471, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79872084, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 215, + "time_per_iteration": 2.6116256713867188 + }, + { + "auxiliary_loss_clip": 0.01394912, + "auxiliary_loss_mlp": 0.01111048, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.11393261, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44498430810385, + "language_loss": 0.90545797, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93051755, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.8125, + "step": 216, + "time_per_iteration": 2.5903706550598145 + }, + { + "auxiliary_loss_clip": 0.0138678, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.05622888, + "balance_loss_mlp": 1.10772836, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.630220300857062, + "language_loss": 0.93660516, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96154928, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.78125, + "step": 217, + "time_per_iteration": 2.5109100341796875 + }, + { + "auxiliary_loss_clip": 0.01381618, + "auxiliary_loss_mlp": 0.01107152, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.10700643, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9165712032980975, + "language_loss": 0.93656206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96144974, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.75, + "step": 218, + "time_per_iteration": 2.6586077213287354 + }, + { + "auxiliary_loss_clip": 0.01376505, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.05820787, + "balance_loss_mlp": 1.10663593, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.916363531530835, + "language_loss": 0.86148179, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88633436, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.703125, + "step": 219, + "time_per_iteration": 2.584040880203247 + }, + { + "auxiliary_loss_clip": 0.01383955, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.05056047, + "balance_loss_mlp": 1.110309, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7731463199764816, + "language_loss": 0.87598741, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90083969, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.75, + "step": 220, + "time_per_iteration": 2.6294186115264893 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.10389161, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.991547522293572, + "language_loss": 0.86413074, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88890207, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.6875, + "step": 221, + "time_per_iteration": 2.606137990951538 + }, + { + "auxiliary_loss_clip": 0.0137878, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.10240269, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.017045003530743, + "language_loss": 0.92153138, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94641757, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.765625, + "step": 222, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.01377393, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.10672021, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.7127164790698606, + "language_loss": 0.95539695, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98022527, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.71875, + "step": 223, + "time_per_iteration": 2.679387092590332 + }, + { + "auxiliary_loss_clip": 0.01377947, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_clip": 1.05612004, + "balance_loss_mlp": 1.10671806, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5543531214735586, + "language_loss": 0.88022512, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90507382, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.71875, + "step": 224, + "time_per_iteration": 2.6327528953552246 + }, + { + "auxiliary_loss_clip": 0.0137715, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.10632586, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0812591886363183, + "language_loss": 0.89642018, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92121875, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 225, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01374075, + "auxiliary_loss_mlp": 0.01115854, + "balance_loss_clip": 1.06273401, + "balance_loss_mlp": 1.10547256, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.1555099546542142, + "language_loss": 0.99022663, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01512599, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.6875, + "step": 226, + "time_per_iteration": 5.38438868522644 + }, + { + "auxiliary_loss_clip": 0.0137773, + "auxiliary_loss_mlp": 0.01111487, + "balance_loss_clip": 1.0584867, + "balance_loss_mlp": 1.10696185, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 15.523681056640678, + "language_loss": 0.91210413, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93699628, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 227, + "time_per_iteration": 2.5391762256622314 + }, + { + "auxiliary_loss_clip": 0.01252818, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.00666487, + "balance_loss_mlp": 1.10911703, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.99230217192713, + "language_loss": 0.57680154, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59958327, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.4375, + "step": 228, + "time_per_iteration": 3.1981163024902344 + }, + { + "auxiliary_loss_clip": 0.0136686, + "auxiliary_loss_mlp": 0.01110654, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.10228515, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.2779006264878374, + "language_loss": 0.8759563, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90073144, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 229, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01377631, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.10486007, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.280679608747667, + "language_loss": 0.84247303, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8672685, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.734375, + "step": 230, + "time_per_iteration": 2.501218557357788 + }, + { + "auxiliary_loss_clip": 0.01375417, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.06671298, + "balance_loss_mlp": 1.10600948, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.78964280876859, + "language_loss": 0.90378422, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92870116, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.6875, + "step": 231, + "time_per_iteration": 2.541137456893921 + }, + { + "auxiliary_loss_clip": 0.01377441, + "auxiliary_loss_mlp": 0.01108629, + "balance_loss_clip": 1.05941916, + "balance_loss_mlp": 1.10821056, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.031489983297281, + "language_loss": 0.83706695, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86192763, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.6875, + "step": 232, + "time_per_iteration": 2.5444753170013428 + }, + { + "auxiliary_loss_clip": 0.0137977, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04125488, + "balance_loss_mlp": 1.10017753, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.928489064169697, + "language_loss": 0.74033689, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76505834, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.796875, + "step": 233, + "time_per_iteration": 2.5364952087402344 + }, + { + "auxiliary_loss_clip": 0.01382965, + "auxiliary_loss_mlp": 0.0112384, + "balance_loss_clip": 1.07141209, + "balance_loss_mlp": 1.10741055, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.287774019631123, + "language_loss": 0.85867143, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88373953, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 234, + "time_per_iteration": 2.532949209213257 + }, + { + "auxiliary_loss_clip": 0.01375298, + "auxiliary_loss_mlp": 0.01106064, + "balance_loss_clip": 1.05683041, + "balance_loss_mlp": 1.10759592, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6527993685177154, + "language_loss": 0.89144391, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9162575, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.671875, + "step": 235, + "time_per_iteration": 2.509592294692993 + }, + { + "auxiliary_loss_clip": 0.0137416, + "auxiliary_loss_mlp": 0.01119384, + "balance_loss_clip": 1.06874382, + "balance_loss_mlp": 1.10830367, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 4.054998173736759, + "language_loss": 0.85780042, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88273585, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.65625, + "step": 236, + "time_per_iteration": 2.744925022125244 + }, + { + "auxiliary_loss_clip": 0.0137118, + "auxiliary_loss_mlp": 0.01099258, + "balance_loss_clip": 1.04871392, + "balance_loss_mlp": 1.10178149, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.128422813257453, + "language_loss": 0.82452404, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84922838, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.6875, + "step": 237, + "time_per_iteration": 2.67307710647583 + }, + { + "auxiliary_loss_clip": 0.01369116, + "auxiliary_loss_mlp": 0.01116968, + "balance_loss_clip": 1.0643487, + "balance_loss_mlp": 1.10451889, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 3.103781307849977, + "language_loss": 0.77321362, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79807448, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.65625, + "step": 238, + "time_per_iteration": 2.4973809719085693 + }, + { + "auxiliary_loss_clip": 0.01368178, + "auxiliary_loss_mlp": 0.01112367, + "balance_loss_clip": 1.06566119, + "balance_loss_mlp": 1.10654771, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.992064896075991, + "language_loss": 0.87370872, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89851415, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.609375, + "step": 239, + "time_per_iteration": 2.554222583770752 + }, + { + "auxiliary_loss_clip": 0.01352979, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.09776592, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.2433371609956283, + "language_loss": 0.93297911, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95751429, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.5625, + "step": 240, + "time_per_iteration": 2.588529348373413 + }, + { + "auxiliary_loss_clip": 0.01362634, + "auxiliary_loss_mlp": 0.01104045, + "balance_loss_clip": 1.05736244, + "balance_loss_mlp": 1.10324717, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.299780828803648, + "language_loss": 0.85129881, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8759656, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.59375, + "step": 241, + "time_per_iteration": 2.607272148132324 + }, + { + "auxiliary_loss_clip": 0.01360778, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.10865557, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.927287768398498, + "language_loss": 0.88410223, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90887022, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.53125, + "step": 242, + "time_per_iteration": 2.522657632827759 + }, + { + "auxiliary_loss_clip": 0.013595, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.04981756, + "balance_loss_mlp": 1.10147619, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.6384412969740922, + "language_loss": 0.86817086, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89276373, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.578125, + "step": 243, + "time_per_iteration": 2.5738751888275146 + }, + { + "auxiliary_loss_clip": 0.01366378, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.05574584, + "balance_loss_mlp": 1.10421979, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.576084931358892, + "language_loss": 0.84271425, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86743093, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 244, + "time_per_iteration": 2.51370906829834 + }, + { + "auxiliary_loss_clip": 0.01374385, + "auxiliary_loss_mlp": 0.01115077, + "balance_loss_clip": 1.06403196, + "balance_loss_mlp": 1.10701251, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.2775099056278916, + "language_loss": 0.78689361, + "learning_rate": 3.54199711087864e-06, + "loss": 0.8117882, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.671875, + "step": 245, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01372772, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.10232484, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2330220282190685, + "language_loss": 0.84241545, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86717069, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 246, + "time_per_iteration": 2.565614700317383 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01097455, + "balance_loss_clip": 1.04722059, + "balance_loss_mlp": 1.10181057, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9335653980079095, + "language_loss": 0.9014703, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92611909, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 247, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01365881, + "auxiliary_loss_mlp": 0.01097755, + "balance_loss_clip": 1.04952252, + "balance_loss_mlp": 1.09689593, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.1205098484246734, + "language_loss": 0.78058362, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80521989, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.6875, + "step": 248, + "time_per_iteration": 2.5365517139434814 + }, + { + "auxiliary_loss_clip": 0.0136687, + "auxiliary_loss_mlp": 0.01105288, + "balance_loss_clip": 1.05552983, + "balance_loss_mlp": 1.10545397, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1747011613954177, + "language_loss": 0.83849227, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86321384, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.609375, + "step": 249, + "time_per_iteration": 2.6142020225524902 + }, + { + "auxiliary_loss_clip": 0.01360073, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.05806887, + "balance_loss_mlp": 1.09971058, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.2137591284686455, + "language_loss": 0.93476778, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95942914, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 2.609375, + "step": 250, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01369254, + "auxiliary_loss_mlp": 0.01114661, + "balance_loss_clip": 1.06351972, + "balance_loss_mlp": 1.10460913, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.2612141068319622, + "language_loss": 0.97030997, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99514914, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.640625, + "step": 251, + "time_per_iteration": 2.5887296199798584 + }, + { + "auxiliary_loss_clip": 0.01362288, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.05723596, + "balance_loss_mlp": 1.09872079, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.0465178965121136, + "language_loss": 0.8428089, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86748511, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.640625, + "step": 252, + "time_per_iteration": 2.5749199390411377 + }, + { + "auxiliary_loss_clip": 0.01357969, + "auxiliary_loss_mlp": 0.01114738, + "balance_loss_clip": 1.06569552, + "balance_loss_mlp": 1.10169089, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.482990993198259, + "language_loss": 0.98208833, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00681543, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.5625, + "step": 253, + "time_per_iteration": 2.5639333724975586 + }, + { + "auxiliary_loss_clip": 0.01233728, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.00855541, + "balance_loss_mlp": 1.09965372, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8505459641429172, + "language_loss": 0.55672622, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57933319, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.34375, + "step": 254, + "time_per_iteration": 3.1063449382781982 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.06687438, + "balance_loss_mlp": 1.09652638, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.4360968938917065, + "language_loss": 0.90453845, + "learning_rate": 3.567754632921479e-06, + "loss": 0.9293263, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 255, + "time_per_iteration": 2.5746912956237793 + }, + { + "auxiliary_loss_clip": 0.01358909, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.07568169, + "balance_loss_mlp": 1.09931397, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.2666703391376903, + "language_loss": 0.8562001, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8810457, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.59375, + "step": 256, + "time_per_iteration": 2.6095149517059326 + }, + { + "auxiliary_loss_clip": 0.01366413, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_clip": 1.06305718, + "balance_loss_mlp": 1.09961021, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.7442871984488386, + "language_loss": 0.71504897, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73983842, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 257, + "time_per_iteration": 2.5939691066741943 + }, + { + "auxiliary_loss_clip": 0.01357007, + "auxiliary_loss_mlp": 0.01100177, + "balance_loss_clip": 1.05087197, + "balance_loss_mlp": 1.09875202, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9522192109187282, + "language_loss": 0.94659579, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97116768, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.578125, + "step": 258, + "time_per_iteration": 2.7119739055633545 + }, + { + "auxiliary_loss_clip": 0.01356701, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.05615926, + "balance_loss_mlp": 1.09608126, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.167214789879638, + "language_loss": 0.93174207, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95635182, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.59375, + "step": 259, + "time_per_iteration": 2.6776607036590576 + }, + { + "auxiliary_loss_clip": 0.01351639, + "auxiliary_loss_mlp": 0.010988, + "balance_loss_clip": 1.05297637, + "balance_loss_mlp": 1.10035825, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.1226725879970605, + "language_loss": 0.97360909, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99811351, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 2.515625, + "step": 260, + "time_per_iteration": 2.520759105682373 + }, + { + "auxiliary_loss_clip": 0.01365989, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.06282747, + "balance_loss_mlp": 1.10060608, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3569711169381, + "language_loss": 0.87644511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90120584, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.65625, + "step": 261, + "time_per_iteration": 2.5837602615356445 + }, + { + "auxiliary_loss_clip": 0.0135711, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.05613816, + "balance_loss_mlp": 1.09709311, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.9926513495738176, + "language_loss": 0.67226446, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69688779, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.59375, + "step": 262, + "time_per_iteration": 2.5490784645080566 + }, + { + "auxiliary_loss_clip": 0.01354995, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.07145, + "balance_loss_mlp": 1.0984714, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.3019763169045637, + "language_loss": 0.68570435, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71047044, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.5625, + "step": 263, + "time_per_iteration": 2.5207104682922363 + }, + { + "auxiliary_loss_clip": 0.01355963, + "auxiliary_loss_mlp": 0.01105396, + "balance_loss_clip": 1.055686, + "balance_loss_mlp": 1.09446979, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.705792502973735, + "language_loss": 0.85120308, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87581658, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 264, + "time_per_iteration": 2.559406280517578 + }, + { + "auxiliary_loss_clip": 0.01361439, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.10003614, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.651007312001026, + "language_loss": 1.04371059, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06825411, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.625, + "step": 265, + "time_per_iteration": 2.5076427459716797 + }, + { + "auxiliary_loss_clip": 0.01364923, + "auxiliary_loss_mlp": 0.01114141, + "balance_loss_clip": 1.06266677, + "balance_loss_mlp": 1.10278761, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2797174203272705, + "language_loss": 0.75153112, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77632177, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.625, + "step": 266, + "time_per_iteration": 2.52923583984375 + }, + { + "auxiliary_loss_clip": 0.01351984, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.05321336, + "balance_loss_mlp": 1.10004377, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7047265515665009, + "language_loss": 0.90568709, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93022615, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 2.515625, + "step": 267, + "time_per_iteration": 4.033226251602173 + }, + { + "auxiliary_loss_clip": 0.01359316, + "auxiliary_loss_mlp": 0.01118854, + "balance_loss_clip": 1.07143235, + "balance_loss_mlp": 1.09878063, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.258126572730018, + "language_loss": 0.86044276, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88522446, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.609375, + "step": 268, + "time_per_iteration": 3.9120936393737793 + }, + { + "auxiliary_loss_clip": 0.01352601, + "auxiliary_loss_mlp": 0.01098281, + "balance_loss_clip": 1.05186045, + "balance_loss_mlp": 1.10092831, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.4793793476816335, + "language_loss": 0.88284534, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90735412, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 2.515625, + "step": 269, + "time_per_iteration": 2.5170347690582275 + }, + { + "auxiliary_loss_clip": 0.01357286, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.04901874, + "balance_loss_mlp": 1.09723783, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.082153756456244, + "language_loss": 0.97073388, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99530637, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.59375, + "step": 270, + "time_per_iteration": 2.4856350421905518 + }, + { + "auxiliary_loss_clip": 0.01357366, + "auxiliary_loss_mlp": 0.01117767, + "balance_loss_clip": 1.07001138, + "balance_loss_mlp": 1.10259032, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1071719511680755, + "language_loss": 0.85919821, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88394946, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.546875, + "step": 271, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01355041, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.05201519, + "balance_loss_mlp": 1.09418058, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.6330072162998523, + "language_loss": 0.81509304, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83964115, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.609375, + "step": 272, + "time_per_iteration": 2.563840389251709 + }, + { + "auxiliary_loss_clip": 0.01348825, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.07229137, + "balance_loss_mlp": 1.09649634, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4112371858801436, + "language_loss": 0.81101978, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83568847, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.515625, + "step": 273, + "time_per_iteration": 2.504791736602783 + }, + { + "auxiliary_loss_clip": 0.01348205, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.05825627, + "balance_loss_mlp": 1.0930239, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.3125197915452387, + "language_loss": 0.91599321, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94053519, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.5625, + "step": 274, + "time_per_iteration": 2.530883312225342 + }, + { + "auxiliary_loss_clip": 0.01346841, + "auxiliary_loss_mlp": 0.01110058, + "balance_loss_clip": 1.06154013, + "balance_loss_mlp": 1.09588742, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8828740595481548, + "language_loss": 0.87952697, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90409595, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 2.515625, + "step": 275, + "time_per_iteration": 2.6067841053009033 + }, + { + "auxiliary_loss_clip": 0.01349399, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.05481219, + "balance_loss_mlp": 1.09579742, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.8814357547622875, + "language_loss": 0.80717576, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83170903, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.53125, + "step": 276, + "time_per_iteration": 2.5251641273498535 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.0561676, + "balance_loss_mlp": 1.0946306, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.7238418569970533, + "language_loss": 0.81033546, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83474076, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.46875, + "step": 277, + "time_per_iteration": 2.6796398162841797 + }, + { + "auxiliary_loss_clip": 0.01338755, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.08789539, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.810922211495867, + "language_loss": 0.80307728, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82741719, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.515625, + "step": 278, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01343866, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.05634809, + "balance_loss_mlp": 1.09381282, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.7778988036026468, + "language_loss": 0.90482658, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92928004, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 279, + "time_per_iteration": 2.571439504623413 + }, + { + "auxiliary_loss_clip": 0.01348727, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.06872559, + "balance_loss_mlp": 1.09391451, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.0477743200742387, + "language_loss": 0.94153798, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96618605, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.546875, + "step": 280, + "time_per_iteration": 2.5161728858947754 + }, + { + "auxiliary_loss_clip": 0.0134865, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.05864, + "balance_loss_mlp": 1.09245062, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 3.578687135351882, + "language_loss": 0.73929775, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76385343, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 2.5625, + "step": 281, + "time_per_iteration": 2.616241931915283 + }, + { + "auxiliary_loss_clip": 0.01343434, + "auxiliary_loss_mlp": 0.0111488, + "balance_loss_clip": 1.06977129, + "balance_loss_mlp": 1.09390783, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.679798242609796, + "language_loss": 0.80207133, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82665443, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.5, + "step": 282, + "time_per_iteration": 2.5421135425567627 + }, + { + "auxiliary_loss_clip": 0.01348806, + "auxiliary_loss_mlp": 0.01117348, + "balance_loss_clip": 1.0704273, + "balance_loss_mlp": 1.09599137, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1184562475367916, + "language_loss": 0.77788174, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80254328, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.53125, + "step": 283, + "time_per_iteration": 2.516474485397339 + }, + { + "auxiliary_loss_clip": 0.01349252, + "auxiliary_loss_mlp": 0.01091995, + "balance_loss_clip": 1.04788804, + "balance_loss_mlp": 1.09700751, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1009174504018544, + "language_loss": 0.84172702, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86613953, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.515625, + "step": 284, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01339164, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_clip": 1.05216956, + "balance_loss_mlp": 1.09148788, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.014395623363928, + "language_loss": 0.96993905, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99432468, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.46875, + "step": 285, + "time_per_iteration": 2.5412731170654297 + }, + { + "auxiliary_loss_clip": 0.01342544, + "auxiliary_loss_mlp": 0.01093983, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.09407294, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.2067050643741433, + "language_loss": 0.93951917, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96388453, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.484375, + "step": 286, + "time_per_iteration": 2.5895566940307617 + }, + { + "auxiliary_loss_clip": 0.0133546, + "auxiliary_loss_mlp": 0.01090331, + "balance_loss_clip": 1.04503167, + "balance_loss_mlp": 1.08924019, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.8729510510678706, + "language_loss": 0.92157722, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94583511, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 287, + "time_per_iteration": 2.6144802570343018 + }, + { + "auxiliary_loss_clip": 0.01338793, + "auxiliary_loss_mlp": 0.01089685, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.08859432, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2271144452092564, + "language_loss": 1.02026963, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04455447, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 288, + "time_per_iteration": 2.488274097442627 + }, + { + "auxiliary_loss_clip": 0.01222501, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02000237, + "balance_loss_mlp": 1.09325862, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9131614435254132, + "language_loss": 0.63915455, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66174459, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 1.296875, + "step": 289, + "time_per_iteration": 3.222426652908325 + }, + { + "auxiliary_loss_clip": 0.01341104, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.06379664, + "balance_loss_mlp": 1.09403992, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4014361624695173, + "language_loss": 0.88569438, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91018069, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 290, + "time_per_iteration": 2.49294114112854 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01091523, + "balance_loss_clip": 1.04631877, + "balance_loss_mlp": 1.09248078, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.156562479490788, + "language_loss": 0.84578067, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87007844, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.453125, + "step": 291, + "time_per_iteration": 2.5356485843658447 + }, + { + "auxiliary_loss_clip": 0.01345108, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.05897939, + "balance_loss_mlp": 1.10042334, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6617628708439536, + "language_loss": 0.72766221, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75218308, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.453125, + "step": 292, + "time_per_iteration": 2.6524176597595215 + }, + { + "auxiliary_loss_clip": 0.01333825, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.09236324, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2014441192179866, + "language_loss": 0.8726995, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89705306, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.40625, + "step": 293, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01334314, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.05959213, + "balance_loss_mlp": 1.09177744, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.3120260424061367, + "language_loss": 0.81276119, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83714324, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.4375, + "step": 294, + "time_per_iteration": 2.568784236907959 + }, + { + "auxiliary_loss_clip": 0.01334452, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.06274807, + "balance_loss_mlp": 1.08824301, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.9227055740425705, + "language_loss": 0.83710909, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86153215, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.46875, + "step": 295, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01339817, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.09874845, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.5339269047951727, + "language_loss": 0.84620988, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87071538, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.40625, + "step": 296, + "time_per_iteration": 2.5243051052093506 + }, + { + "auxiliary_loss_clip": 0.01338756, + "auxiliary_loss_mlp": 0.01097832, + "balance_loss_clip": 1.05417752, + "balance_loss_mlp": 1.09317493, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.123858619871597, + "language_loss": 0.87729871, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.453125, + "step": 297, + "time_per_iteration": 2.5186710357666016 + }, + { + "auxiliary_loss_clip": 0.01337139, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.05713463, + "balance_loss_mlp": 1.09108877, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.170328911832355, + "language_loss": 0.88528925, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90966904, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 298, + "time_per_iteration": 2.5320143699645996 + }, + { + "auxiliary_loss_clip": 0.0133273, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.07234538, + "balance_loss_mlp": 1.09249902, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8938405886263965, + "language_loss": 0.88666737, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91117901, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.40625, + "step": 299, + "time_per_iteration": 2.588275671005249 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01105829, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.09275746, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.2936483356677253, + "language_loss": 0.64349103, + "learning_rate": 3.672392800539357e-06, + "loss": 0.66795039, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 300, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01338706, + "auxiliary_loss_mlp": 0.01105447, + "balance_loss_clip": 1.05986142, + "balance_loss_mlp": 1.09540462, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.310898752337597, + "language_loss": 0.88330823, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774977, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.4375, + "step": 301, + "time_per_iteration": 2.499481439590454 + }, + { + "auxiliary_loss_clip": 0.01214573, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.00932336, + "balance_loss_mlp": 1.08753991, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8370211186232274, + "language_loss": 0.62198341, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64437497, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 1.265625, + "step": 302, + "time_per_iteration": 3.259997844696045 + }, + { + "auxiliary_loss_clip": 0.01329895, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.06198907, + "balance_loss_mlp": 1.08938098, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.491293816938874, + "language_loss": 0.89617372, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92054749, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 2.40625, + "step": 303, + "time_per_iteration": 2.536773920059204 + }, + { + "auxiliary_loss_clip": 0.01336859, + "auxiliary_loss_mlp": 0.01114111, + "balance_loss_clip": 1.06778669, + "balance_loss_mlp": 1.09363747, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 4.887297609803561, + "language_loss": 0.80314684, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.4375, + "step": 304, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.05144823, + "balance_loss_mlp": 1.09657788, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.8235558005033383, + "language_loss": 0.82894015, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85320443, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.34375, + "step": 305, + "time_per_iteration": 2.5195910930633545 + }, + { + "auxiliary_loss_clip": 0.01332168, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_clip": 1.04993677, + "balance_loss_mlp": 1.08868921, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9087210074301977, + "language_loss": 0.90843809, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93269092, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 306, + "time_per_iteration": 2.501276969909668 + }, + { + "auxiliary_loss_clip": 0.01324982, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.08638549, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1762826783898586, + "language_loss": 0.86435306, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88850832, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.390625, + "step": 307, + "time_per_iteration": 2.6048038005828857 + }, + { + "auxiliary_loss_clip": 0.01325097, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.05817199, + "balance_loss_mlp": 1.09046888, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.221444292833677, + "language_loss": 0.71723771, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74155033, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.34375, + "step": 308, + "time_per_iteration": 2.513774871826172 + }, + { + "auxiliary_loss_clip": 0.01331987, + "auxiliary_loss_mlp": 0.01102938, + "balance_loss_clip": 1.05904555, + "balance_loss_mlp": 1.08861351, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2254161740825293, + "language_loss": 0.91952753, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94387674, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 2.4375, + "step": 309, + "time_per_iteration": 5.224750280380249 + }, + { + "auxiliary_loss_clip": 0.01338325, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.08840334, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.8056803187702135, + "language_loss": 0.72399509, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74842793, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 310, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01330325, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_clip": 1.06790328, + "balance_loss_mlp": 1.09306264, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 12.392698164772181, + "language_loss": 0.74104297, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76546776, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.375, + "step": 311, + "time_per_iteration": 2.734072208404541 + }, + { + "auxiliary_loss_clip": 0.01337963, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.06619668, + "balance_loss_mlp": 1.09045064, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.2753160661232603, + "language_loss": 0.91518372, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93965685, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.46875, + "step": 312, + "time_per_iteration": 2.5117411613464355 + }, + { + "auxiliary_loss_clip": 0.01336169, + "auxiliary_loss_mlp": 0.01112089, + "balance_loss_clip": 1.06609774, + "balance_loss_mlp": 1.09088099, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.320247917383294, + "language_loss": 0.89746982, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92195237, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.453125, + "step": 313, + "time_per_iteration": 2.4761838912963867 + }, + { + "auxiliary_loss_clip": 0.01340305, + "auxiliary_loss_mlp": 0.01098393, + "balance_loss_clip": 1.05230689, + "balance_loss_mlp": 1.09061432, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3404867001555236, + "language_loss": 0.73099983, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75538683, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 314, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.01326469, + "auxiliary_loss_mlp": 0.01103837, + "balance_loss_clip": 1.06101751, + "balance_loss_mlp": 1.08694446, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.192553769026804, + "language_loss": 0.89887041, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92317349, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 315, + "time_per_iteration": 2.5857741832733154 + }, + { + "auxiliary_loss_clip": 0.01329672, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.05170512, + "balance_loss_mlp": 1.08870411, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8364758613144732, + "language_loss": 0.80796063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83221763, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.40625, + "step": 316, + "time_per_iteration": 2.5222342014312744 + }, + { + "auxiliary_loss_clip": 0.01324399, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.05131364, + "balance_loss_mlp": 1.08633423, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.1363686538021236, + "language_loss": 0.90357143, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92776608, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.375, + "step": 317, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01319895, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.0515281, + "balance_loss_mlp": 1.0845592, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.5260192321083794, + "language_loss": 0.90939772, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93355227, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.34375, + "step": 318, + "time_per_iteration": 2.488128185272217 + }, + { + "auxiliary_loss_clip": 0.01324457, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.04706657, + "balance_loss_mlp": 1.08574772, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.626221841877022, + "language_loss": 0.93980259, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96393579, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 319, + "time_per_iteration": 2.5184502601623535 + }, + { + "auxiliary_loss_clip": 0.01205117, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_clip": 1.06586683, + "balance_loss_mlp": 1.07482553, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9345393611259016, + "language_loss": 0.59860981, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62146461, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 1.296875, + "step": 320, + "time_per_iteration": 3.0250258445739746 + }, + { + "auxiliary_loss_clip": 0.01320993, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.05827808, + "balance_loss_mlp": 1.08425927, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.0799113353921572, + "language_loss": 0.89622325, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92044175, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.375, + "step": 321, + "time_per_iteration": 2.476439952850342 + }, + { + "auxiliary_loss_clip": 0.01332068, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.07620978, + "balance_loss_mlp": 1.08993089, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.068543890023447, + "language_loss": 0.82884163, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85337007, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 2.421875, + "step": 322, + "time_per_iteration": 2.556302309036255 + }, + { + "auxiliary_loss_clip": 0.01332156, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.04828596, + "balance_loss_mlp": 1.08754158, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.2506232399398245, + "language_loss": 0.72734368, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75156873, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.453125, + "step": 323, + "time_per_iteration": 2.5033397674560547 + }, + { + "auxiliary_loss_clip": 0.01318896, + "auxiliary_loss_mlp": 0.01090622, + "balance_loss_clip": 1.04763484, + "balance_loss_mlp": 1.08184087, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.023515622890843, + "language_loss": 0.92639947, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95049465, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.375, + "step": 324, + "time_per_iteration": 2.5194544792175293 + }, + { + "auxiliary_loss_clip": 0.01328869, + "auxiliary_loss_mlp": 0.01089838, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.08943164, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 4.018466874717804, + "language_loss": 0.65336061, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67754775, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.390625, + "step": 325, + "time_per_iteration": 2.5107386112213135 + }, + { + "auxiliary_loss_clip": 0.0132709, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.093485, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.921455060851243, + "language_loss": 0.76449442, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78877723, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.34375, + "step": 326, + "time_per_iteration": 2.5080325603485107 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.06015599, + "balance_loss_mlp": 1.08845115, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1551163890972123, + "language_loss": 0.79176939, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8160091, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 327, + "time_per_iteration": 2.5449633598327637 + }, + { + "auxiliary_loss_clip": 0.01326802, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.06984949, + "balance_loss_mlp": 1.08873606, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.1574079642063246, + "language_loss": 0.80725288, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83164048, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.390625, + "step": 328, + "time_per_iteration": 2.5418970584869385 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.08396721, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.245263087715646, + "language_loss": 0.93704766, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96127105, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.40625, + "step": 329, + "time_per_iteration": 2.4910004138946533 + }, + { + "auxiliary_loss_clip": 0.01332781, + "auxiliary_loss_mlp": 0.01105781, + "balance_loss_clip": 1.06253231, + "balance_loss_mlp": 1.08930123, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.9776357674257365, + "language_loss": 0.74277973, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7671653, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 330, + "time_per_iteration": 2.51430082321167 + }, + { + "auxiliary_loss_clip": 0.01328701, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.07814097, + "balance_loss_mlp": 1.08762872, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.972763157156593, + "language_loss": 0.93870068, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96319681, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 331, + "time_per_iteration": 2.4759159088134766 + }, + { + "auxiliary_loss_clip": 0.01316192, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.04938233, + "balance_loss_mlp": 1.0853951, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.6958694906457836, + "language_loss": 0.92730892, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95136791, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 332, + "time_per_iteration": 2.49817156791687 + }, + { + "auxiliary_loss_clip": 0.01325132, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.04903162, + "balance_loss_mlp": 1.09081161, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.6289067025313777, + "language_loss": 0.75589794, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78007442, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.34375, + "step": 333, + "time_per_iteration": 2.5180609226226807 + }, + { + "auxiliary_loss_clip": 0.01323371, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.08625877, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1766901409232426, + "language_loss": 0.78768885, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81179881, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.375, + "step": 334, + "time_per_iteration": 2.614708423614502 + }, + { + "auxiliary_loss_clip": 0.01324397, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.08276975, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.4059127888346916, + "language_loss": 0.83083838, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85503072, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 2.421875, + "step": 335, + "time_per_iteration": 2.495260000228882 + }, + { + "auxiliary_loss_clip": 0.01320649, + "auxiliary_loss_mlp": 0.01090782, + "balance_loss_clip": 1.04934454, + "balance_loss_mlp": 1.08585882, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.166489879958422, + "language_loss": 0.92639577, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95051014, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.34375, + "step": 336, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01321744, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.04139614, + "balance_loss_mlp": 1.08352447, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.825762702383362, + "language_loss": 0.88474333, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90879244, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 337, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01310297, + "auxiliary_loss_mlp": 0.01101804, + "balance_loss_clip": 1.05836427, + "balance_loss_mlp": 1.08001363, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5415234153999902, + "language_loss": 0.89914495, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92326593, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 338, + "time_per_iteration": 2.5795979499816895 + }, + { + "auxiliary_loss_clip": 0.01324391, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.05742574, + "balance_loss_mlp": 1.08479571, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.047046576054304, + "language_loss": 0.84801471, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87225461, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.40625, + "step": 339, + "time_per_iteration": 2.4558403491973877 + }, + { + "auxiliary_loss_clip": 0.01326609, + "auxiliary_loss_mlp": 0.01093427, + "balance_loss_clip": 1.05001152, + "balance_loss_mlp": 1.08709431, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7544231793273473, + "language_loss": 0.88913274, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91333312, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.40625, + "step": 340, + "time_per_iteration": 2.5330188274383545 + }, + { + "auxiliary_loss_clip": 0.01323557, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.0859195, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.2340783182785975, + "language_loss": 0.88071406, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 2.375, + "step": 341, + "time_per_iteration": 2.502161979675293 + }, + { + "auxiliary_loss_clip": 0.01325847, + "auxiliary_loss_mlp": 0.01099304, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.08389783, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 3.2005009235922572, + "language_loss": 0.80293322, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82718468, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.421875, + "step": 342, + "time_per_iteration": 2.5315535068511963 + }, + { + "auxiliary_loss_clip": 0.0131301, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.05293417, + "balance_loss_mlp": 1.08132875, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.399130254204822, + "language_loss": 0.89451253, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91862881, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.3125, + "step": 343, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01325104, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_clip": 1.05342627, + "balance_loss_mlp": 1.08973229, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3234219523507296, + "language_loss": 0.78252918, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80672336, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.359375, + "step": 344, + "time_per_iteration": 2.514665365219116 + }, + { + "auxiliary_loss_clip": 0.01309596, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.05730188, + "balance_loss_mlp": 1.08079529, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8371023099908983, + "language_loss": 0.75138956, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77549529, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.28125, + "step": 345, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01318525, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.05806339, + "balance_loss_mlp": 1.08789146, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.0741733748571565, + "language_loss": 0.90269232, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92688763, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.3125, + "step": 346, + "time_per_iteration": 2.5487060546875 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01095048, + "balance_loss_clip": 1.05527973, + "balance_loss_mlp": 1.08358788, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0766581400667, + "language_loss": 0.78869188, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.3125, + "step": 347, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.01317315, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.05335259, + "balance_loss_mlp": 1.08719826, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.4234628631287927, + "language_loss": 0.71424043, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7383827, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.3125, + "step": 348, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01319638, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.0595324, + "balance_loss_mlp": 1.08435416, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 4.002924557181807, + "language_loss": 0.76819432, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79240972, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.34375, + "step": 349, + "time_per_iteration": 2.4884049892425537 + }, + { + "auxiliary_loss_clip": 0.0130292, + "auxiliary_loss_mlp": 0.0109884, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.08141851, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.9115672624672835, + "language_loss": 0.85271406, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87673163, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 350, + "time_per_iteration": 2.559812307357788 + }, + { + "auxiliary_loss_clip": 0.01311162, + "auxiliary_loss_mlp": 0.01089483, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.08571863, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.3355222976898764, + "language_loss": 0.80104828, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82505476, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.25, + "step": 351, + "time_per_iteration": 5.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01318524, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.06048024, + "balance_loss_mlp": 1.08623564, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8473493260702125, + "language_loss": 0.87258279, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89680254, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 352, + "time_per_iteration": 2.4787278175354004 + }, + { + "auxiliary_loss_clip": 0.01312545, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.06248152, + "balance_loss_mlp": 1.08574009, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8920106465676412, + "language_loss": 0.82386625, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84804279, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.265625, + "step": 353, + "time_per_iteration": 2.5428433418273926 + }, + { + "auxiliary_loss_clip": 0.01307832, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.05133069, + "balance_loss_mlp": 1.08353949, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.0636001035279694, + "language_loss": 0.8102631, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83425963, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.25, + "step": 354, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01315043, + "auxiliary_loss_mlp": 0.01092413, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.08190715, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.8065821662627575, + "language_loss": 0.80764574, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83172029, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 355, + "time_per_iteration": 2.56968355178833 + }, + { + "auxiliary_loss_clip": 0.01310125, + "auxiliary_loss_mlp": 0.01086869, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.08140039, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2488803729957, + "language_loss": 0.89553398, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91950381, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 356, + "time_per_iteration": 2.5510213375091553 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.08451605, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.7055681522526522, + "language_loss": 0.80032516, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82424533, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.234375, + "step": 357, + "time_per_iteration": 2.5834848880767822 + }, + { + "auxiliary_loss_clip": 0.01311386, + "auxiliary_loss_mlp": 0.0108216, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.08195996, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3810225918991827, + "language_loss": 0.7661376, + "learning_rate": 3.786194003461506e-06, + "loss": 0.7900731, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.296875, + "step": 358, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01308618, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.04574156, + "balance_loss_mlp": 1.08024073, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 3.004949550769694, + "language_loss": 0.88491321, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90888453, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.28125, + "step": 359, + "time_per_iteration": 2.452698230743408 + }, + { + "auxiliary_loss_clip": 0.01316066, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.05000377, + "balance_loss_mlp": 1.08438587, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.789884231725057, + "language_loss": 0.76007903, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.3125, + "step": 360, + "time_per_iteration": 2.490006685256958 + }, + { + "auxiliary_loss_clip": 0.01189834, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06634831, + "balance_loss_mlp": 1.06162107, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8685264055585812, + "language_loss": 0.64943242, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67212784, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 1.28125, + "step": 361, + "time_per_iteration": 3.1978280544281006 + }, + { + "auxiliary_loss_clip": 0.01307066, + "auxiliary_loss_mlp": 0.01088482, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.0776422, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.6839093883440213, + "language_loss": 0.78157276, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80552828, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.296875, + "step": 362, + "time_per_iteration": 2.5401153564453125 + }, + { + "auxiliary_loss_clip": 0.0131339, + "auxiliary_loss_mlp": 0.01092034, + "balance_loss_clip": 1.05171776, + "balance_loss_mlp": 1.08265781, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.163466714708112, + "language_loss": 0.92508751, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94914174, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 363, + "time_per_iteration": 2.4868171215057373 + }, + { + "auxiliary_loss_clip": 0.01307593, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_clip": 1.06270981, + "balance_loss_mlp": 1.08121252, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.137373361500905, + "language_loss": 0.89611077, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92020839, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 364, + "time_per_iteration": 2.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01313873, + "auxiliary_loss_mlp": 0.01094072, + "balance_loss_clip": 1.05232477, + "balance_loss_mlp": 1.08512843, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.0040846596101867, + "language_loss": 0.79597497, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82005441, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.28125, + "step": 365, + "time_per_iteration": 2.5358779430389404 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_clip": 1.05218291, + "balance_loss_mlp": 1.08262253, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.4198695758814126, + "language_loss": 0.84312123, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86713445, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.25, + "step": 366, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01314411, + "auxiliary_loss_mlp": 0.01089093, + "balance_loss_clip": 1.05008757, + "balance_loss_mlp": 1.08409071, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.4790438398014114, + "language_loss": 0.87009263, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89412761, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.296875, + "step": 367, + "time_per_iteration": 2.486476421356201 + }, + { + "auxiliary_loss_clip": 0.01315695, + "auxiliary_loss_mlp": 0.01094559, + "balance_loss_clip": 1.05247772, + "balance_loss_mlp": 1.08183074, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.1787846704720906, + "language_loss": 0.84725291, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87135541, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.34375, + "step": 368, + "time_per_iteration": 2.522035837173462 + }, + { + "auxiliary_loss_clip": 0.01314671, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.04559815, + "balance_loss_mlp": 1.07997978, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.205334425353566, + "language_loss": 0.75328851, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77728999, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.34375, + "step": 369, + "time_per_iteration": 2.5247385501861572 + }, + { + "auxiliary_loss_clip": 0.01309465, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.06241453, + "balance_loss_mlp": 1.08204889, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.195001895084689, + "language_loss": 0.82444763, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.84857059, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.28125, + "step": 370, + "time_per_iteration": 2.556654453277588 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.01102256, + "balance_loss_clip": 1.06186807, + "balance_loss_mlp": 1.08148122, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.701167396379405, + "language_loss": 0.81576145, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83986878, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.265625, + "step": 371, + "time_per_iteration": 2.5303707122802734 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01097647, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.08685589, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.421527930745161, + "language_loss": 0.83273733, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85685182, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 372, + "time_per_iteration": 2.528141975402832 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.01093239, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.08068216, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.9515576064335742, + "language_loss": 0.78448784, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80846798, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.234375, + "step": 373, + "time_per_iteration": 2.4879236221313477 + }, + { + "auxiliary_loss_clip": 0.01310159, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.08387947, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.577150517784044, + "language_loss": 0.77507353, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79906291, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.265625, + "step": 374, + "time_per_iteration": 2.467660665512085 + }, + { + "auxiliary_loss_clip": 0.01300907, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_clip": 1.03415811, + "balance_loss_mlp": 1.07458413, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 2.1361288872426187, + "language_loss": 0.85989249, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.8836568, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.265625, + "step": 375, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01307901, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.05767775, + "balance_loss_mlp": 1.08341241, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 5.5735447387306785, + "language_loss": 0.89170349, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91578341, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.25, + "step": 376, + "time_per_iteration": 2.53151798248291 + }, + { + "auxiliary_loss_clip": 0.01309113, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.04908752, + "balance_loss_mlp": 1.07899499, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.261190841992283, + "language_loss": 0.74947262, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77344215, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.3125, + "step": 377, + "time_per_iteration": 2.463115692138672 + }, + { + "auxiliary_loss_clip": 0.0129987, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.05009794, + "balance_loss_mlp": 1.08131123, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 9.398931100052017, + "language_loss": 0.99195766, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01586914, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 2.1875, + "step": 378, + "time_per_iteration": 2.4765851497650146 + }, + { + "auxiliary_loss_clip": 0.01180245, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.10910404, + "balance_loss_mlp": 1.06006432, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9843357397114052, + "language_loss": 0.75457036, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77759647, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.203125, + "step": 379, + "time_per_iteration": 3.113067388534546 + }, + { + "auxiliary_loss_clip": 0.01308809, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.0448581, + "balance_loss_mlp": 1.07811105, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 4.195302770466088, + "language_loss": 0.78423429, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80815697, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.3125, + "step": 380, + "time_per_iteration": 2.6457204818725586 + }, + { + "auxiliary_loss_clip": 0.01302565, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.04527259, + "balance_loss_mlp": 1.08019924, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.272240555091753, + "language_loss": 0.9679752, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99183118, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.21875, + "step": 381, + "time_per_iteration": 2.485316038131714 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.04501581, + "balance_loss_mlp": 1.08177519, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.322972014312181, + "language_loss": 0.88035834, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90432727, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.3125, + "step": 382, + "time_per_iteration": 2.5361156463623047 + }, + { + "auxiliary_loss_clip": 0.01306631, + "auxiliary_loss_mlp": 0.01099641, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.08242524, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.197151340607638, + "language_loss": 0.84830511, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87236774, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.25, + "step": 383, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.01303681, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.06673658, + "balance_loss_mlp": 1.08259249, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2992198386883116, + "language_loss": 0.83199835, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85609907, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.203125, + "step": 384, + "time_per_iteration": 2.5008413791656494 + }, + { + "auxiliary_loss_clip": 0.01303616, + "auxiliary_loss_mlp": 0.0109643, + "balance_loss_clip": 1.06030965, + "balance_loss_mlp": 1.08539534, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8570399395654076, + "language_loss": 0.89240694, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91640741, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.1875, + "step": 385, + "time_per_iteration": 2.4913859367370605 + }, + { + "auxiliary_loss_clip": 0.01306859, + "auxiliary_loss_mlp": 0.01121647, + "balance_loss_clip": 1.08397639, + "balance_loss_mlp": 1.0826149, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.2576284783670357, + "language_loss": 0.70096415, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72524917, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.234375, + "step": 386, + "time_per_iteration": 2.5017154216766357 + }, + { + "auxiliary_loss_clip": 0.01308067, + "auxiliary_loss_mlp": 0.01098351, + "balance_loss_clip": 1.06072879, + "balance_loss_mlp": 1.08460176, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9470877788533054, + "language_loss": 0.87909782, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90316188, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.234375, + "step": 387, + "time_per_iteration": 2.5142157077789307 + }, + { + "auxiliary_loss_clip": 0.01308318, + "auxiliary_loss_mlp": 0.01085815, + "balance_loss_clip": 1.04666662, + "balance_loss_mlp": 1.08291698, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.441105853176172, + "language_loss": 0.83429295, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85823429, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.25, + "step": 388, + "time_per_iteration": 2.591242790222168 + }, + { + "auxiliary_loss_clip": 0.01305661, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_clip": 1.05754054, + "balance_loss_mlp": 1.08271885, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2646980282386644, + "language_loss": 0.93823689, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96223652, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.21875, + "step": 389, + "time_per_iteration": 2.5427236557006836 + }, + { + "auxiliary_loss_clip": 0.01299094, + "auxiliary_loss_mlp": 0.01087693, + "balance_loss_clip": 1.04954624, + "balance_loss_mlp": 1.08334351, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.08298220488583, + "language_loss": 0.87901413, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90288198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.15625, + "step": 390, + "time_per_iteration": 2.53519606590271 + }, + { + "auxiliary_loss_clip": 0.01304239, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_clip": 1.05927861, + "balance_loss_mlp": 1.08334053, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.2293869448662362, + "language_loss": 0.89346433, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91746497, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.203125, + "step": 391, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01302453, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.08116579, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.022763227206087, + "language_loss": 0.86065882, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88441086, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.21875, + "step": 392, + "time_per_iteration": 4.050429105758667 + }, + { + "auxiliary_loss_clip": 0.01297975, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.08006191, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9628480690926318, + "language_loss": 0.88900077, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91284919, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.1875, + "step": 393, + "time_per_iteration": 3.9293932914733887 + }, + { + "auxiliary_loss_clip": 0.01309989, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.06449771, + "balance_loss_mlp": 1.087502, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0531375516435943, + "language_loss": 0.81400156, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83814055, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.21875, + "step": 394, + "time_per_iteration": 2.552100658416748 + }, + { + "auxiliary_loss_clip": 0.01299653, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.04611897, + "balance_loss_mlp": 1.08043575, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.0447414784698092, + "language_loss": 0.86189264, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88573563, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.1875, + "step": 395, + "time_per_iteration": 2.536823272705078 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.0590049, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9487784547172928, + "language_loss": 0.63808912, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66028047, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.15625, + "step": 396, + "time_per_iteration": 2.935506582260132 + }, + { + "auxiliary_loss_clip": 0.01296295, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_clip": 1.03252339, + "balance_loss_mlp": 1.07895613, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6168641306315172, + "language_loss": 0.83744055, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86109853, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.171875, + "step": 397, + "time_per_iteration": 2.5051028728485107 + }, + { + "auxiliary_loss_clip": 0.01302535, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05288601, + "balance_loss_mlp": 1.08300877, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.535145802301163, + "language_loss": 0.84050488, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86444056, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.1875, + "step": 398, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01300466, + "auxiliary_loss_mlp": 0.0108273, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.07864475, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.904470095612531, + "language_loss": 0.85865271, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88248467, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.21875, + "step": 399, + "time_per_iteration": 2.4674201011657715 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.08021355, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.016759933832732, + "language_loss": 0.86157769, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88546383, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.15625, + "step": 400, + "time_per_iteration": 2.554075241088867 + }, + { + "auxiliary_loss_clip": 0.01303599, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.04652512, + "balance_loss_mlp": 1.0848943, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 3.068890951588493, + "language_loss": 0.79142016, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.8152917, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.1875, + "step": 401, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01297911, + "auxiliary_loss_mlp": 0.01096359, + "balance_loss_clip": 1.05968988, + "balance_loss_mlp": 1.07987046, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.2009554384450154, + "language_loss": 0.78456193, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80850464, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.1875, + "step": 402, + "time_per_iteration": 2.5531415939331055 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.04529142, + "balance_loss_mlp": 1.07989287, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7198213535828923, + "language_loss": 0.94637424, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97023368, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 403, + "time_per_iteration": 2.4873671531677246 + }, + { + "auxiliary_loss_clip": 0.01306025, + "auxiliary_loss_mlp": 0.01095616, + "balance_loss_clip": 1.05620587, + "balance_loss_mlp": 1.07952547, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3706875621243246, + "language_loss": 0.99751151, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02152789, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 404, + "time_per_iteration": 2.5400550365448 + }, + { + "auxiliary_loss_clip": 0.01304501, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.06716657, + "balance_loss_mlp": 1.08213115, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.480197457162756, + "language_loss": 0.87603909, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90012866, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.21875, + "step": 405, + "time_per_iteration": 2.4698479175567627 + }, + { + "auxiliary_loss_clip": 0.01314075, + "auxiliary_loss_mlp": 0.01107285, + "balance_loss_clip": 1.06835127, + "balance_loss_mlp": 1.08775485, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 3.242686201363518, + "language_loss": 0.93258083, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9567945, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.265625, + "step": 406, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.01092168, + "balance_loss_clip": 1.05330622, + "balance_loss_mlp": 1.08378315, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.059728688773918, + "language_loss": 0.87446553, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89843762, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.21875, + "step": 407, + "time_per_iteration": 2.5017173290252686 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.04814506, + "balance_loss_mlp": 1.08445001, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.439524495250932, + "language_loss": 0.7404871, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76435596, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.171875, + "step": 408, + "time_per_iteration": 2.6097092628479004 + }, + { + "auxiliary_loss_clip": 0.013061, + "auxiliary_loss_mlp": 0.01096961, + "balance_loss_clip": 1.05771768, + "balance_loss_mlp": 1.08381224, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.750776221383638, + "language_loss": 0.92393035, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94796097, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.21875, + "step": 409, + "time_per_iteration": 2.5198304653167725 + }, + { + "auxiliary_loss_clip": 0.01304769, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_clip": 1.04488206, + "balance_loss_mlp": 1.0854609, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 1.9763435283924244, + "language_loss": 0.82926536, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85311788, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.1875, + "step": 410, + "time_per_iteration": 2.624333143234253 + }, + { + "auxiliary_loss_clip": 0.01307118, + "auxiliary_loss_mlp": 0.01089288, + "balance_loss_clip": 1.05164146, + "balance_loss_mlp": 1.08556843, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 4.176812441051998, + "language_loss": 0.77715993, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80112404, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.21875, + "step": 411, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01303549, + "auxiliary_loss_mlp": 0.01102238, + "balance_loss_clip": 1.06311393, + "balance_loss_mlp": 1.08078265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.1103060729449883, + "language_loss": 0.86276567, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88682353, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 412, + "time_per_iteration": 2.4968833923339844 + }, + { + "auxiliary_loss_clip": 0.01168305, + "auxiliary_loss_mlp": 0.01068817, + "balance_loss_clip": 1.05632353, + "balance_loss_mlp": 1.05478358, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8568818905087673, + "language_loss": 0.58512402, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60749531, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 1.1328125, + "step": 413, + "time_per_iteration": 3.1763217449188232 + }, + { + "auxiliary_loss_clip": 0.01296528, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.07941055, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7554792190049524, + "language_loss": 0.80704832, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83093566, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.171875, + "step": 414, + "time_per_iteration": 2.5954627990722656 + }, + { + "auxiliary_loss_clip": 0.01292737, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.07739186, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3796689224247904, + "language_loss": 0.80473328, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82859504, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.15625, + "step": 415, + "time_per_iteration": 2.471665620803833 + }, + { + "auxiliary_loss_clip": 0.0131185, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.0481931, + "balance_loss_mlp": 1.08601356, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 5.333540620494007, + "language_loss": 0.96179891, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98577416, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.25, + "step": 416, + "time_per_iteration": 2.5133068561553955 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03702867, + "balance_loss_mlp": 1.0806849, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.409464042642492, + "language_loss": 0.77541196, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79917544, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 417, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.01297091, + "auxiliary_loss_mlp": 0.01092626, + "balance_loss_clip": 1.05512297, + "balance_loss_mlp": 1.08281994, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6345521849457858, + "language_loss": 0.7689445, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 418, + "time_per_iteration": 2.6002862453460693 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.08383846, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.548681745998596, + "language_loss": 0.81088459, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83468759, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.203125, + "step": 419, + "time_per_iteration": 2.5097553730010986 + }, + { + "auxiliary_loss_clip": 0.01298642, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.0453577, + "balance_loss_mlp": 1.08236253, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9166879875817555, + "language_loss": 0.73812175, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.761962, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 2.15625, + "step": 420, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01298409, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.06086528, + "balance_loss_mlp": 1.0791508, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7246544027149788, + "language_loss": 0.78928417, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8132515, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.1875, + "step": 421, + "time_per_iteration": 2.583979845046997 + }, + { + "auxiliary_loss_clip": 0.01300301, + "auxiliary_loss_mlp": 0.01095113, + "balance_loss_clip": 1.05589294, + "balance_loss_mlp": 1.08374381, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.879256315405443, + "language_loss": 0.81915486, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84310895, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.171875, + "step": 422, + "time_per_iteration": 2.5834591388702393 + }, + { + "auxiliary_loss_clip": 0.01299282, + "auxiliary_loss_mlp": 0.01079788, + "balance_loss_clip": 1.0445497, + "balance_loss_mlp": 1.07925105, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9426129656279463, + "language_loss": 0.83468062, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85847133, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.203125, + "step": 423, + "time_per_iteration": 2.5526318550109863 + }, + { + "auxiliary_loss_clip": 0.01293361, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.04978371, + "balance_loss_mlp": 1.07668817, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.7010989411926367, + "language_loss": 0.74435121, + "learning_rate": 3.895134094768415e-06, + "loss": 0.768152, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.171875, + "step": 424, + "time_per_iteration": 2.606895923614502 + }, + { + "auxiliary_loss_clip": 0.01303473, + "auxiliary_loss_mlp": 0.01097188, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.08349586, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.227147445366898, + "language_loss": 0.83008313, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85408974, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.203125, + "step": 425, + "time_per_iteration": 2.522517442703247 + }, + { + "auxiliary_loss_clip": 0.01299491, + "auxiliary_loss_mlp": 0.01096328, + "balance_loss_clip": 1.05691719, + "balance_loss_mlp": 1.07528758, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.394258070540652, + "language_loss": 0.85481966, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87877786, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.25, + "step": 426, + "time_per_iteration": 2.5039095878601074 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.04526472, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8962322500302954, + "language_loss": 0.57186544, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5939464, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 1.1484375, + "step": 427, + "time_per_iteration": 3.2289342880249023 + }, + { + "auxiliary_loss_clip": 0.01297452, + "auxiliary_loss_mlp": 0.01095521, + "balance_loss_clip": 1.05849457, + "balance_loss_mlp": 1.0838623, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6536896946259816, + "language_loss": 0.88190198, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90583158, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.125, + "step": 428, + "time_per_iteration": 2.500389814376831 + }, + { + "auxiliary_loss_clip": 0.01290417, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.07718623, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6692033855414803, + "language_loss": 0.85672665, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88041949, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.125, + "step": 429, + "time_per_iteration": 2.605687379837036 + }, + { + "auxiliary_loss_clip": 0.01297427, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_clip": 1.04373491, + "balance_loss_mlp": 1.07673144, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.5023850128037672, + "language_loss": 0.88384748, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90764678, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.203125, + "step": 430, + "time_per_iteration": 2.593492269515991 + }, + { + "auxiliary_loss_clip": 0.01298542, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.04799962, + "balance_loss_mlp": 1.08428442, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9811912271744876, + "language_loss": 0.84202254, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86584389, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.140625, + "step": 431, + "time_per_iteration": 2.5326902866363525 + }, + { + "auxiliary_loss_clip": 0.01291302, + "auxiliary_loss_mlp": 0.01073914, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0772872, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.686150654607635, + "language_loss": 0.86775959, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89141178, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.140625, + "step": 432, + "time_per_iteration": 2.4793269634246826 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.04491723, + "balance_loss_mlp": 1.08109105, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.6953453355349684, + "language_loss": 0.76074433, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78451484, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.15625, + "step": 433, + "time_per_iteration": 2.6125545501708984 + }, + { + "auxiliary_loss_clip": 0.01296292, + "auxiliary_loss_mlp": 0.0109282, + "balance_loss_clip": 1.05312383, + "balance_loss_mlp": 1.07772529, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.2540618473103247, + "language_loss": 0.89764363, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153478, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.1875, + "step": 434, + "time_per_iteration": 5.3097922801971436 + }, + { + "auxiliary_loss_clip": 0.01297376, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.06404209, + "balance_loss_mlp": 1.08362865, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 6.328317132251919, + "language_loss": 0.7985189, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82252169, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 435, + "time_per_iteration": 3.9629530906677246 + }, + { + "auxiliary_loss_clip": 0.01291104, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.0750463, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.559504815450524, + "language_loss": 0.86357677, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739926, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.15625, + "step": 436, + "time_per_iteration": 2.479033946990967 + }, + { + "auxiliary_loss_clip": 0.01296325, + "auxiliary_loss_mlp": 0.01099771, + "balance_loss_clip": 1.06214869, + "balance_loss_mlp": 1.07964039, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.6168892141891944, + "language_loss": 0.75002837, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77398932, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.171875, + "step": 437, + "time_per_iteration": 2.508769989013672 + }, + { + "auxiliary_loss_clip": 0.01293849, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.08015561, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.3031145987765758, + "language_loss": 0.91467845, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93865746, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.140625, + "step": 438, + "time_per_iteration": 2.4693844318389893 + }, + { + "auxiliary_loss_clip": 0.01155458, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_clip": 1.05276346, + "balance_loss_mlp": 1.0448494, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.877669139368542, + "language_loss": 0.62577796, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64797509, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 1.109375, + "step": 439, + "time_per_iteration": 3.162259101867676 + }, + { + "auxiliary_loss_clip": 0.01303989, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_clip": 1.05873275, + "balance_loss_mlp": 1.08440769, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1384369611317493, + "language_loss": 0.75629139, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78031218, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.203125, + "step": 440, + "time_per_iteration": 2.5541677474975586 + }, + { + "auxiliary_loss_clip": 0.01294139, + "auxiliary_loss_mlp": 0.01082398, + "balance_loss_clip": 1.04408443, + "balance_loss_mlp": 1.08003163, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9583565981573345, + "language_loss": 0.83186466, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85563004, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 441, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01298235, + "auxiliary_loss_mlp": 0.01092726, + "balance_loss_clip": 1.05286217, + "balance_loss_mlp": 1.07855892, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.035076381127293, + "language_loss": 0.7850582, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80896777, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.203125, + "step": 442, + "time_per_iteration": 2.477555990219116 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01012445, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.04045749, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9584767110468104, + "language_loss": 0.64475185, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66633147, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 1.046875, + "step": 443, + "time_per_iteration": 2.9838714599609375 + }, + { + "auxiliary_loss_clip": 0.01297944, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.04941845, + "balance_loss_mlp": 1.08318424, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.4335650573352483, + "language_loss": 0.82707053, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85092688, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 444, + "time_per_iteration": 2.4520323276519775 + }, + { + "auxiliary_loss_clip": 0.0130195, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0440464, + "balance_loss_mlp": 1.08103406, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.6903851096875733, + "language_loss": 0.95400113, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97787213, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 445, + "time_per_iteration": 2.5113518238067627 + }, + { + "auxiliary_loss_clip": 0.01296406, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.08177555, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.416617421630428, + "language_loss": 0.91790259, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94183153, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.15625, + "step": 446, + "time_per_iteration": 2.4585111141204834 + }, + { + "auxiliary_loss_clip": 0.01293099, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.04718637, + "balance_loss_mlp": 1.08102632, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.3983095061811635, + "language_loss": 0.80024058, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82402921, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 447, + "time_per_iteration": 2.509643316268921 + }, + { + "auxiliary_loss_clip": 0.01292768, + "auxiliary_loss_mlp": 0.01072511, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.07935369, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.4579217038825423, + "language_loss": 0.86773896, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89139175, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 448, + "time_per_iteration": 2.477384328842163 + }, + { + "auxiliary_loss_clip": 0.01287268, + "auxiliary_loss_mlp": 0.01093327, + "balance_loss_clip": 1.0583508, + "balance_loss_mlp": 1.07870793, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.1426472419274503, + "language_loss": 0.88779259, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91159856, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.078125, + "step": 449, + "time_per_iteration": 2.50108003616333 + }, + { + "auxiliary_loss_clip": 0.01298718, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.08056545, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9975703664508544, + "language_loss": 0.80516291, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82902944, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 450, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.01291132, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.08217299, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.7768383062811637, + "language_loss": 0.81500483, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83869088, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.09375, + "step": 451, + "time_per_iteration": 2.530539035797119 + }, + { + "auxiliary_loss_clip": 0.01289442, + "auxiliary_loss_mlp": 0.0109125, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.08151317, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.5925691418309382, + "language_loss": 0.76994318, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79375011, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.078125, + "step": 452, + "time_per_iteration": 2.5138871669769287 + }, + { + "auxiliary_loss_clip": 0.01292925, + "auxiliary_loss_mlp": 0.01088314, + "balance_loss_clip": 1.0507158, + "balance_loss_mlp": 1.08201516, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.9334646917545748, + "language_loss": 0.73053265, + "learning_rate": 3.937730499067294e-06, + "loss": 0.754345, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.109375, + "step": 453, + "time_per_iteration": 2.5271401405334473 + }, + { + "auxiliary_loss_clip": 0.01288113, + "auxiliary_loss_mlp": 0.01086026, + "balance_loss_clip": 1.04952383, + "balance_loss_mlp": 1.08018303, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.845498968311748, + "language_loss": 0.82439983, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84814119, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 454, + "time_per_iteration": 2.6724069118499756 + }, + { + "auxiliary_loss_clip": 0.01290287, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_clip": 1.04491115, + "balance_loss_mlp": 1.0808264, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.1414002490484005, + "language_loss": 0.75815403, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78184646, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 2.09375, + "step": 455, + "time_per_iteration": 2.496913194656372 + }, + { + "auxiliary_loss_clip": 0.01290624, + "auxiliary_loss_mlp": 0.01097119, + "balance_loss_clip": 1.06114161, + "balance_loss_mlp": 1.07846022, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.102028743174525, + "language_loss": 0.80576169, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82963914, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 456, + "time_per_iteration": 2.4748263359069824 + }, + { + "auxiliary_loss_clip": 0.01286184, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_clip": 1.04152811, + "balance_loss_mlp": 1.07863176, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.479828414472028, + "language_loss": 0.81621009, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83985978, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 457, + "time_per_iteration": 2.5122945308685303 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.07828617, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0492464691581476, + "language_loss": 0.94062889, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96436661, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.109375, + "step": 458, + "time_per_iteration": 2.542919874191284 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.01093849, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.07926297, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4293190258203774, + "language_loss": 0.79353511, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81735277, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.09375, + "step": 459, + "time_per_iteration": 2.472830295562744 + }, + { + "auxiliary_loss_clip": 0.01293203, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.08543491, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8472887331493792, + "language_loss": 0.83103061, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85478914, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.078125, + "step": 460, + "time_per_iteration": 2.5376338958740234 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.03488147, + "balance_loss_mlp": 1.03798664, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5738760379538346, + "language_loss": 0.73565412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7574963, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08837891, + "router_z_loss_mlp": 1.0234375, + "step": 461, + "time_per_iteration": 3.0358285903930664 + }, + { + "auxiliary_loss_clip": 0.01289208, + "auxiliary_loss_mlp": 0.01081781, + "balance_loss_clip": 1.04735351, + "balance_loss_mlp": 1.086905, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.85425781388422, + "language_loss": 0.81291741, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83662736, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.015625, + "step": 462, + "time_per_iteration": 2.6079564094543457 + }, + { + "auxiliary_loss_clip": 0.01287586, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_clip": 1.04096127, + "balance_loss_mlp": 1.08167982, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2822341634579195, + "language_loss": 0.90235889, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92597055, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0625, + "step": 463, + "time_per_iteration": 2.4881155490875244 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01014393, + "balance_loss_clip": 1.00561893, + "balance_loss_mlp": 1.03824747, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8835585057209928, + "language_loss": 0.59031862, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61183739, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.9921875, + "step": 464, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.01299905, + "auxiliary_loss_mlp": 0.01097461, + "balance_loss_clip": 1.06081581, + "balance_loss_mlp": 1.08716702, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.8663863440598525, + "language_loss": 0.81203198, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83600569, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.125, + "step": 465, + "time_per_iteration": 2.5197718143463135 + }, + { + "auxiliary_loss_clip": 0.01286546, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.08028877, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.004656273762408, + "language_loss": 0.78560221, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80929601, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.0625, + "step": 466, + "time_per_iteration": 2.5151565074920654 + }, + { + "auxiliary_loss_clip": 0.01285777, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.05075812, + "balance_loss_mlp": 1.0816046, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.05931728393333, + "language_loss": 0.87548482, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89919734, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.03125, + "step": 467, + "time_per_iteration": 2.4994542598724365 + }, + { + "auxiliary_loss_clip": 0.01289137, + "auxiliary_loss_mlp": 0.01106554, + "balance_loss_clip": 1.06969416, + "balance_loss_mlp": 1.08202362, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.728881931821799, + "language_loss": 0.86217642, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88613331, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.0625, + "step": 468, + "time_per_iteration": 2.482377767562866 + }, + { + "auxiliary_loss_clip": 0.01287545, + "auxiliary_loss_mlp": 0.01081999, + "balance_loss_clip": 1.0447104, + "balance_loss_mlp": 1.07984936, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 3.6924571591440762, + "language_loss": 0.91605878, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.93975413, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 469, + "time_per_iteration": 2.471510648727417 + }, + { + "auxiliary_loss_clip": 0.01286876, + "auxiliary_loss_mlp": 0.01096778, + "balance_loss_clip": 1.06106234, + "balance_loss_mlp": 1.08290672, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 8.38112094971343, + "language_loss": 0.81587195, + "learning_rate": 3.96145038000181e-06, + "loss": 0.83970851, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 470, + "time_per_iteration": 2.5398614406585693 + }, + { + "auxiliary_loss_clip": 0.01286572, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.07859015, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8437898933227894, + "language_loss": 0.93147206, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551928, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.078125, + "step": 471, + "time_per_iteration": 2.5005030632019043 + }, + { + "auxiliary_loss_clip": 0.0128173, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.05885458, + "balance_loss_mlp": 1.07808042, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.89303735573371, + "language_loss": 0.757568, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78133243, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 472, + "time_per_iteration": 2.597637176513672 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.07699013, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 3.986951446490631, + "language_loss": 0.93354845, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95722055, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.125, + "step": 473, + "time_per_iteration": 2.4882545471191406 + }, + { + "auxiliary_loss_clip": 0.01293922, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.08134401, + "balance_loss_mlp": 1.08149064, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.845992674029067, + "language_loss": 0.88586211, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90995455, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.125, + "step": 474, + "time_per_iteration": 2.483210563659668 + }, + { + "auxiliary_loss_clip": 0.01284496, + "auxiliary_loss_mlp": 0.01091761, + "balance_loss_clip": 1.05559278, + "balance_loss_mlp": 1.07983565, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.474550917046853, + "language_loss": 0.78771299, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81147563, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.046875, + "step": 475, + "time_per_iteration": 2.5462486743927 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_clip": 1.06647348, + "balance_loss_mlp": 1.03907108, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9304884927077405, + "language_loss": 0.66880804, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6909551, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08935547, + "router_z_loss_mlp": 1.0, + "step": 476, + "time_per_iteration": 5.8287513256073 + }, + { + "auxiliary_loss_clip": 0.01286666, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.0796659, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.9569520931335775, + "language_loss": 0.83852398, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86220837, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 477, + "time_per_iteration": 2.5179195404052734 + }, + { + "auxiliary_loss_clip": 0.01293161, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.05164671, + "balance_loss_mlp": 1.08298135, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.2048636254017504, + "language_loss": 0.82267237, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84648502, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.09375, + "step": 478, + "time_per_iteration": 2.495760679244995 + }, + { + "auxiliary_loss_clip": 0.01283274, + "auxiliary_loss_mlp": 0.01076252, + "balance_loss_clip": 1.0409658, + "balance_loss_mlp": 1.07707858, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.28603697529264, + "language_loss": 0.81010443, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8336997, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 2.0625, + "step": 479, + "time_per_iteration": 2.491910934448242 + }, + { + "auxiliary_loss_clip": 0.01281719, + "auxiliary_loss_mlp": 0.01080307, + "balance_loss_clip": 1.04323328, + "balance_loss_mlp": 1.07729793, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.2385690137770715, + "language_loss": 0.73465097, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75827128, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.03125, + "step": 480, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.01280408, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_clip": 1.03945768, + "balance_loss_mlp": 1.07837129, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.6612342828976938, + "language_loss": 0.87719476, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90071172, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 2.03125, + "step": 481, + "time_per_iteration": 2.534792184829712 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.01367593, + "balance_loss_mlp": 1.03470159, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8259666239631118, + "language_loss": 0.66064727, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68227088, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 1.046875, + "step": 482, + "time_per_iteration": 2.8219997882843018 + }, + { + "auxiliary_loss_clip": 0.01295379, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.04014635, + "balance_loss_mlp": 1.08159328, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.373570732629757, + "language_loss": 0.78743541, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81112754, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.140625, + "step": 483, + "time_per_iteration": 2.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01293434, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.0548625, + "balance_loss_mlp": 1.08311069, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.520023812901894, + "language_loss": 0.75405324, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77789688, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.109375, + "step": 484, + "time_per_iteration": 2.466634750366211 + }, + { + "auxiliary_loss_clip": 0.01288089, + "auxiliary_loss_mlp": 0.01078618, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.08002305, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0323982063196153, + "language_loss": 0.84021544, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86388254, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.078125, + "step": 485, + "time_per_iteration": 2.511415719985962 + }, + { + "auxiliary_loss_clip": 0.01293039, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.08659554, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 1.9066132168030567, + "language_loss": 0.84465218, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86840165, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 2.0625, + "step": 486, + "time_per_iteration": 2.453583002090454 + }, + { + "auxiliary_loss_clip": 0.01284719, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.07841349, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.9228432408219163, + "language_loss": 0.8891986, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91288453, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.0625, + "step": 487, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0128758, + "auxiliary_loss_mlp": 0.01070867, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.08095598, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5260996981700456, + "language_loss": 0.87981069, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90339512, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0625, + "step": 488, + "time_per_iteration": 2.5299952030181885 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01079627, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.07794333, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1862911790042543, + "language_loss": 0.88956475, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9131943, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.046875, + "step": 489, + "time_per_iteration": 2.545240879058838 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01078157, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.07402337, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.0397830948196756, + "language_loss": 0.88539088, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90894926, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.03125, + "step": 490, + "time_per_iteration": 2.4727838039398193 + }, + { + "auxiliary_loss_clip": 0.01284238, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.07731342, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.230679327742206, + "language_loss": 0.91299963, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93665713, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 2.0625, + "step": 491, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01274874, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.0749476, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.419480988494796, + "language_loss": 0.85232413, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87577969, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0, + "step": 492, + "time_per_iteration": 2.457188844680786 + }, + { + "auxiliary_loss_clip": 0.0128558, + "auxiliary_loss_mlp": 0.01093772, + "balance_loss_clip": 1.05939209, + "balance_loss_mlp": 1.08082771, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.826333733481051, + "language_loss": 0.83989829, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86369187, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.046875, + "step": 493, + "time_per_iteration": 2.4821553230285645 + }, + { + "auxiliary_loss_clip": 0.01280126, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_clip": 1.04586005, + "balance_loss_mlp": 1.07578444, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8259196989393787, + "language_loss": 0.86575663, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88934839, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 494, + "time_per_iteration": 2.507068395614624 + }, + { + "auxiliary_loss_clip": 0.01286409, + "auxiliary_loss_mlp": 0.01082408, + "balance_loss_clip": 1.05084157, + "balance_loss_mlp": 1.07973599, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 4.414490317498679, + "language_loss": 0.86250752, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88619578, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.0625, + "step": 495, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.01274095, + "auxiliary_loss_mlp": 0.0107342, + "balance_loss_clip": 1.04144859, + "balance_loss_mlp": 1.07653904, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.893732744603442, + "language_loss": 0.6230706, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64654577, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9765625, + "step": 496, + "time_per_iteration": 2.499669313430786 + }, + { + "auxiliary_loss_clip": 0.01276388, + "auxiliary_loss_mlp": 0.01085353, + "balance_loss_clip": 1.05314219, + "balance_loss_mlp": 1.07830799, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.8423417765009742, + "language_loss": 0.88582325, + "learning_rate": 3.997414244783595e-06, + "loss": 0.90944064, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.984375, + "step": 497, + "time_per_iteration": 2.5570924282073975 + }, + { + "auxiliary_loss_clip": 0.01282787, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.07822609, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.4064142479622377, + "language_loss": 0.85174376, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87537515, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 498, + "time_per_iteration": 2.513601541519165 + }, + { + "auxiliary_loss_clip": 0.01281177, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.07829463, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 37.23719619981942, + "language_loss": 0.78152531, + "learning_rate": 4e-06, + "loss": 0.80516517, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 499, + "time_per_iteration": 2.4924824237823486 + }, + { + "auxiliary_loss_clip": 0.01282354, + "auxiliary_loss_mlp": 0.01080564, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.08037949, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.687829420060643, + "language_loss": 0.8271451, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85077423, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.015625, + "step": 500, + "time_per_iteration": 2.494333028793335 + }, + { + "auxiliary_loss_clip": 0.01274571, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.07541978, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.6096117253121447, + "language_loss": 0.88464928, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90823889, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.9921875, + "step": 501, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.01283018, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.04158127, + "balance_loss_mlp": 1.07912767, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.304054979465899, + "language_loss": 0.86586684, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88942778, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 502, + "time_per_iteration": 2.4574413299560547 + }, + { + "auxiliary_loss_clip": 0.01278734, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.07952762, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6244890775354976, + "language_loss": 0.84661186, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87017757, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9921875, + "step": 503, + "time_per_iteration": 2.4406938552856445 + }, + { + "auxiliary_loss_clip": 0.0127278, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.07727659, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6755724800263092, + "language_loss": 0.88215417, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90570992, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 504, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.01274883, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05556226, + "balance_loss_mlp": 1.07692564, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.2080583468347, + "language_loss": 0.78446162, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9765625, + "step": 505, + "time_per_iteration": 2.4724690914154053 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.00927854, + "balance_loss_mlp": 1.04092085, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8461866637376847, + "language_loss": 0.55057126, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57211095, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 0.9453125, + "step": 506, + "time_per_iteration": 3.2490124702453613 + }, + { + "auxiliary_loss_clip": 0.01274292, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.0756762, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.9034614277572226, + "language_loss": 0.83767861, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8612929, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 507, + "time_per_iteration": 2.48811674118042 + }, + { + "auxiliary_loss_clip": 0.01280318, + "auxiliary_loss_mlp": 0.01080114, + "balance_loss_clip": 1.04778421, + "balance_loss_mlp": 1.07709789, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5950154193771526, + "language_loss": 0.88689649, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91050076, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 508, + "time_per_iteration": 2.4966533184051514 + }, + { + "auxiliary_loss_clip": 0.01281637, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.07728887, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2339008285543227, + "language_loss": 0.71499902, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73845309, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 509, + "time_per_iteration": 2.5966317653656006 + }, + { + "auxiliary_loss_clip": 0.01274736, + "auxiliary_loss_mlp": 0.01072718, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.07770133, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.064360756351981, + "language_loss": 0.82369828, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8471728, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9765625, + "step": 510, + "time_per_iteration": 2.5276355743408203 + }, + { + "auxiliary_loss_clip": 0.01280977, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.04984498, + "balance_loss_mlp": 1.08235979, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.1614325499153693, + "language_loss": 0.83621502, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85985172, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 511, + "time_per_iteration": 2.503779888153076 + }, + { + "auxiliary_loss_clip": 0.01278507, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.07648492, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1059740170821515, + "language_loss": 0.82234836, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8459124, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 512, + "time_per_iteration": 2.5306975841522217 + }, + { + "auxiliary_loss_clip": 0.01276149, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0769974, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.9256325141107502, + "language_loss": 0.87030005, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89384103, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.9921875, + "step": 513, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01281572, + "auxiliary_loss_mlp": 0.01080973, + "balance_loss_clip": 1.04840553, + "balance_loss_mlp": 1.07869625, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.202753983864072, + "language_loss": 0.79141152, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81503695, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 514, + "time_per_iteration": 2.515496015548706 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.01063014, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.07966864, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.5461002634459216, + "language_loss": 0.77459693, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79799432, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 515, + "time_per_iteration": 2.481903553009033 + }, + { + "auxiliary_loss_clip": 0.01272098, + "auxiliary_loss_mlp": 0.0106896, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.07318711, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.901518391780262, + "language_loss": 0.82729101, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85070157, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9921875, + "step": 516, + "time_per_iteration": 2.699577808380127 + }, + { + "auxiliary_loss_clip": 0.01272185, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_clip": 1.03760433, + "balance_loss_mlp": 1.07659435, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.071844032637654, + "language_loss": 0.79009813, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81352293, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 517, + "time_per_iteration": 4.0190205574035645 + }, + { + "auxiliary_loss_clip": 0.01269009, + "auxiliary_loss_mlp": 0.01072314, + "balance_loss_clip": 1.04069996, + "balance_loss_mlp": 1.07610774, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.58218863781409, + "language_loss": 0.90778029, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93119347, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9296875, + "step": 518, + "time_per_iteration": 4.080751657485962 + }, + { + "auxiliary_loss_clip": 0.0128372, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.08518016, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 3.008779144342936, + "language_loss": 0.86396456, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88773847, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.984375, + "step": 519, + "time_per_iteration": 2.510267734527588 + }, + { + "auxiliary_loss_clip": 0.01278708, + "auxiliary_loss_mlp": 0.01092513, + "balance_loss_clip": 1.06092215, + "balance_loss_mlp": 1.07567024, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0313723427087216, + "language_loss": 0.87156898, + "learning_rate": 3.999983277259057e-06, + "loss": 0.8952812, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 520, + "time_per_iteration": 2.4891066551208496 + }, + { + "auxiliary_loss_clip": 0.01281744, + "auxiliary_loss_mlp": 0.01089643, + "balance_loss_clip": 1.05633557, + "balance_loss_mlp": 1.07832289, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.6802829394342778, + "language_loss": 0.89362079, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91733468, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.03125, + "step": 521, + "time_per_iteration": 2.508524179458618 + }, + { + "auxiliary_loss_clip": 0.01274208, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.07795191, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.273639697525746, + "language_loss": 0.71327078, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73684484, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9609375, + "step": 522, + "time_per_iteration": 2.49629282951355 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.05150533, + "balance_loss_mlp": 1.07655358, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1208656196394706, + "language_loss": 0.84886295, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87248302, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.015625, + "step": 523, + "time_per_iteration": 2.4674315452575684 + }, + { + "auxiliary_loss_clip": 0.01280597, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.04249442, + "balance_loss_mlp": 1.07655168, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 1.9693639011355857, + "language_loss": 0.90419745, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92775881, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.046875, + "step": 524, + "time_per_iteration": 2.480764627456665 + }, + { + "auxiliary_loss_clip": 0.01285248, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.05034757, + "balance_loss_mlp": 1.08102393, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4392367222760276, + "language_loss": 0.80040443, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8240968, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.046875, + "step": 525, + "time_per_iteration": 2.5409629344940186 + }, + { + "auxiliary_loss_clip": 0.01277675, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.05025804, + "balance_loss_mlp": 1.07571197, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.8378410017413658, + "language_loss": 0.80693865, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83054531, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.03125, + "step": 526, + "time_per_iteration": 2.4509081840515137 + }, + { + "auxiliary_loss_clip": 0.01285808, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.03655052, + "balance_loss_mlp": 1.08127069, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.27970800213601, + "language_loss": 0.81417823, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83775997, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.046875, + "step": 527, + "time_per_iteration": 2.4760756492614746 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01080634, + "balance_loss_clip": 1.04651666, + "balance_loss_mlp": 1.07408452, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.59751390244888, + "language_loss": 0.93932182, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96286595, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.0, + "step": 528, + "time_per_iteration": 2.4721155166625977 + }, + { + "auxiliary_loss_clip": 0.01273884, + "auxiliary_loss_mlp": 0.01073354, + "balance_loss_clip": 1.04083371, + "balance_loss_mlp": 1.07427406, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8844039207994492, + "language_loss": 0.84143054, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86490291, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 529, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.01278919, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.08254409, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.130233453276154, + "language_loss": 0.90547037, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92913085, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.96875, + "step": 530, + "time_per_iteration": 2.5096359252929688 + }, + { + "auxiliary_loss_clip": 0.0127291, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07199419, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.12169085676626, + "language_loss": 0.76197046, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78543139, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.015625, + "step": 531, + "time_per_iteration": 2.503265142440796 + }, + { + "auxiliary_loss_clip": 0.01272973, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.03030038, + "balance_loss_mlp": 1.07424712, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.621085079916904, + "language_loss": 0.9073056, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9306798, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 532, + "time_per_iteration": 2.506220817565918 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01010615, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0428524, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7306749876416057, + "language_loss": 0.57931173, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60079145, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.08642578, + "router_z_loss_mlp": 0.9453125, + "step": 533, + "time_per_iteration": 3.154953956604004 + }, + { + "auxiliary_loss_clip": 0.01271016, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.07378936, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8972625930530718, + "language_loss": 0.86725944, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89081717, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.96875, + "step": 534, + "time_per_iteration": 2.5384750366210938 + }, + { + "auxiliary_loss_clip": 0.01271847, + "auxiliary_loss_mlp": 0.01066511, + "balance_loss_clip": 1.03468204, + "balance_loss_mlp": 1.07573223, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.118212102173022, + "language_loss": 0.77352351, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79690707, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.9609375, + "step": 535, + "time_per_iteration": 2.517940044403076 + }, + { + "auxiliary_loss_clip": 0.01274503, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_clip": 1.05151725, + "balance_loss_mlp": 1.07644773, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.176836888233088, + "language_loss": 0.8074764, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83105373, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.984375, + "step": 536, + "time_per_iteration": 2.546128034591675 + }, + { + "auxiliary_loss_clip": 0.01275643, + "auxiliary_loss_mlp": 0.01077633, + "balance_loss_clip": 1.04361033, + "balance_loss_mlp": 1.07698941, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.3353202427960627, + "language_loss": 0.70118421, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72471696, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 537, + "time_per_iteration": 2.578101634979248 + }, + { + "auxiliary_loss_clip": 0.01274556, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.04877353, + "balance_loss_mlp": 1.08040798, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1000918694055044, + "language_loss": 0.8250435, + "learning_rate": 3.999942323804607e-06, + "loss": 0.84860539, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9375, + "step": 538, + "time_per_iteration": 2.4822683334350586 + }, + { + "auxiliary_loss_clip": 0.01280793, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_clip": 1.0458765, + "balance_loss_mlp": 1.0775007, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8128048759039839, + "language_loss": 0.78999949, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81359327, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 539, + "time_per_iteration": 2.5495705604553223 + }, + { + "auxiliary_loss_clip": 0.01274183, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_clip": 1.03284597, + "balance_loss_mlp": 1.0766232, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.6651388031929835, + "language_loss": 0.77802742, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80143911, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.9765625, + "step": 540, + "time_per_iteration": 2.5547144412994385 + }, + { + "auxiliary_loss_clip": 0.01282159, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.08122253, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.2422114385304845, + "language_loss": 0.85410464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8776263, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 541, + "time_per_iteration": 2.517545700073242 + }, + { + "auxiliary_loss_clip": 0.01271503, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.07759655, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.210152212848466, + "language_loss": 0.89072484, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91427547, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9375, + "step": 542, + "time_per_iteration": 2.437566041946411 + }, + { + "auxiliary_loss_clip": 0.01272694, + "auxiliary_loss_mlp": 0.01075801, + "balance_loss_clip": 1.04289961, + "balance_loss_mlp": 1.07649362, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3494598042187236, + "language_loss": 0.71096039, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73444533, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9609375, + "step": 543, + "time_per_iteration": 2.5121288299560547 + }, + { + "auxiliary_loss_clip": 0.0127171, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.07139826, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.6617228213889375, + "language_loss": 0.91273057, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93631637, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0, + "step": 544, + "time_per_iteration": 2.529536008834839 + }, + { + "auxiliary_loss_clip": 0.01274727, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.07790041, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.144073602630947, + "language_loss": 0.6640051, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68757957, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 545, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01272187, + "auxiliary_loss_mlp": 0.01069604, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.07393909, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.6288964335615805, + "language_loss": 0.91857421, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94199216, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.984375, + "step": 546, + "time_per_iteration": 2.4893922805786133 + }, + { + "auxiliary_loss_clip": 0.0126813, + "auxiliary_loss_mlp": 0.01071134, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.07095337, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.4455611041839127, + "language_loss": 0.82002354, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84341609, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 547, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01070995, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.07550538, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.015836198351779, + "language_loss": 0.80919325, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83261865, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9609375, + "step": 548, + "time_per_iteration": 2.501983404159546 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01079421, + "balance_loss_clip": 1.04499304, + "balance_loss_mlp": 1.07411838, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.9904289991591217, + "language_loss": 0.67330974, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69681287, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 549, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01265753, + "auxiliary_loss_mlp": 0.01075673, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.07537639, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.081726350608672, + "language_loss": 0.86137938, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88479364, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.90625, + "step": 550, + "time_per_iteration": 2.435030221939087 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01089379, + "balance_loss_clip": 1.05712056, + "balance_loss_mlp": 1.07876444, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.0024940554917534, + "language_loss": 0.81302834, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83663994, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9296875, + "step": 551, + "time_per_iteration": 2.474317789077759 + }, + { + "auxiliary_loss_clip": 0.01278525, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.0786469, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.5540153370218697, + "language_loss": 0.85907811, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88266373, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.0, + "step": 552, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.01276099, + "auxiliary_loss_mlp": 0.01077197, + "balance_loss_clip": 1.0428648, + "balance_loss_mlp": 1.07894135, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3148388677976253, + "language_loss": 0.928128, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95166099, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 553, + "time_per_iteration": 2.4860291481018066 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.01072703, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.0755136, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.234190064541142, + "language_loss": 0.78874755, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81218415, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.953125, + "step": 554, + "time_per_iteration": 2.4878416061401367 + }, + { + "auxiliary_loss_clip": 0.0126611, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_clip": 1.04838455, + "balance_loss_mlp": 1.07417822, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1365458646452424, + "language_loss": 0.82297659, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9140625, + "step": 555, + "time_per_iteration": 2.4846394062042236 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.01075464, + "balance_loss_clip": 1.04156113, + "balance_loss_mlp": 1.07390678, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.781828445596944, + "language_loss": 0.88624835, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90970379, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 556, + "time_per_iteration": 2.5788674354553223 + }, + { + "auxiliary_loss_clip": 0.01269545, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.04531527, + "balance_loss_mlp": 1.07534254, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0860752820949586, + "language_loss": 0.83492053, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85840911, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9375, + "step": 557, + "time_per_iteration": 2.5352954864501953 + }, + { + "auxiliary_loss_clip": 0.01275093, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.07979858, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 9.145612151583265, + "language_loss": 0.94169575, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96511185, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.953125, + "step": 558, + "time_per_iteration": 2.4541964530944824 + }, + { + "auxiliary_loss_clip": 0.01265501, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.07178497, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.48174106566098, + "language_loss": 0.7735827, + "learning_rate": 3.99986348919176e-06, + "loss": 0.7969684, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9375, + "step": 559, + "time_per_iteration": 5.362890005111694 + }, + { + "auxiliary_loss_clip": 0.01268387, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.07386613, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.071149038386511, + "language_loss": 0.87681198, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90028548, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.953125, + "step": 560, + "time_per_iteration": 3.9536426067352295 + }, + { + "auxiliary_loss_clip": 0.01264547, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.07323277, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2284071587683463, + "language_loss": 0.81380183, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83712727, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9140625, + "step": 561, + "time_per_iteration": 2.49826717376709 + }, + { + "auxiliary_loss_clip": 0.01263917, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.07403696, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7768341081574646, + "language_loss": 0.82018232, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84353203, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.90625, + "step": 562, + "time_per_iteration": 2.503990888595581 + }, + { + "auxiliary_loss_clip": 0.01269896, + "auxiliary_loss_mlp": 0.01075498, + "balance_loss_clip": 1.04352641, + "balance_loss_mlp": 1.07592142, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.966221896086353, + "language_loss": 0.84028983, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86374378, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9375, + "step": 563, + "time_per_iteration": 2.464571952819824 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.04866886, + "balance_loss_mlp": 1.07648492, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.359913311978066, + "language_loss": 0.94194812, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96543193, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.921875, + "step": 564, + "time_per_iteration": 2.423762798309326 + }, + { + "auxiliary_loss_clip": 0.01267204, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.07225537, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.7666153248687277, + "language_loss": 0.94089758, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96426964, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.953125, + "step": 565, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01714468, + "balance_loss_mlp": 1.04934859, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1198796781785882, + "language_loss": 0.54823005, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.569884, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.91015625, + "step": 566, + "time_per_iteration": 3.1322038173675537 + }, + { + "auxiliary_loss_clip": 0.01270043, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.0753262, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.6603630269915683, + "language_loss": 0.76780868, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79123116, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.9453125, + "step": 567, + "time_per_iteration": 2.5351951122283936 + }, + { + "auxiliary_loss_clip": 0.01261299, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.04809463, + "balance_loss_mlp": 1.07400167, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 4.563520524929296, + "language_loss": 0.80796623, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83135819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.875, + "step": 568, + "time_per_iteration": 2.558093309402466 + }, + { + "auxiliary_loss_clip": 0.01263323, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.04836476, + "balance_loss_mlp": 1.07628214, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.809578126153619, + "language_loss": 0.86777622, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89120281, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.875, + "step": 569, + "time_per_iteration": 2.500319719314575 + }, + { + "auxiliary_loss_clip": 0.01264002, + "auxiliary_loss_mlp": 0.01073079, + "balance_loss_clip": 1.04227519, + "balance_loss_mlp": 1.07425416, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8164454228173497, + "language_loss": 0.95802778, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98139858, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.8984375, + "step": 570, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.01264689, + "auxiliary_loss_mlp": 0.01080759, + "balance_loss_clip": 1.04733253, + "balance_loss_mlp": 1.07053721, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.217921822086313, + "language_loss": 0.79522127, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81867576, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9375, + "step": 571, + "time_per_iteration": 2.48317813873291 + }, + { + "auxiliary_loss_clip": 0.01265335, + "auxiliary_loss_mlp": 0.01076969, + "balance_loss_clip": 1.04490221, + "balance_loss_mlp": 1.07593679, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.3471183659940555, + "language_loss": 0.79962778, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.890625, + "step": 572, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.01270326, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.07574439, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.9544136074887903, + "language_loss": 0.84374899, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86714697, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.9453125, + "step": 573, + "time_per_iteration": 2.474212408065796 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.03460276, + "balance_loss_mlp": 1.07282329, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.553507560277694, + "language_loss": 0.76376265, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78707206, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 574, + "time_per_iteration": 2.4510116577148438 + }, + { + "auxiliary_loss_clip": 0.01264596, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.0731982, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5278817664157343, + "language_loss": 0.83801597, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86130619, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.9140625, + "step": 575, + "time_per_iteration": 2.459693193435669 + }, + { + "auxiliary_loss_clip": 0.01260171, + "auxiliary_loss_mlp": 0.01067742, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.07501364, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.241383472398266, + "language_loss": 0.83726245, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86054158, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 576, + "time_per_iteration": 2.47292423248291 + }, + { + "auxiliary_loss_clip": 0.01267718, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.04582155, + "balance_loss_mlp": 1.08247435, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.0876645490308334, + "language_loss": 0.8640908, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 577, + "time_per_iteration": 2.529500961303711 + }, + { + "auxiliary_loss_clip": 0.01262371, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0769875, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 3.2017547958107784, + "language_loss": 0.72333407, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74665576, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.859375, + "step": 578, + "time_per_iteration": 2.4868762493133545 + }, + { + "auxiliary_loss_clip": 0.01263036, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.04050565, + "balance_loss_mlp": 1.07441878, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.8544904120227406, + "language_loss": 0.77664137, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79998243, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.8828125, + "step": 579, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.07355189, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.5351053977844136, + "language_loss": 0.86927247, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89265645, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.875, + "step": 580, + "time_per_iteration": 2.505908966064453 + }, + { + "auxiliary_loss_clip": 0.01266331, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.03536677, + "balance_loss_mlp": 1.07510614, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 4.565959491833327, + "language_loss": 0.82161844, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84492135, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.9140625, + "step": 581, + "time_per_iteration": 2.4735610485076904 + }, + { + "auxiliary_loss_clip": 0.01263493, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.07712197, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2699668532214377, + "language_loss": 0.77498174, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79828823, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8671875, + "step": 582, + "time_per_iteration": 2.4596173763275146 + }, + { + "auxiliary_loss_clip": 0.01261728, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_clip": 1.04467332, + "balance_loss_mlp": 1.07715631, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.0991939318744692, + "language_loss": 0.87632537, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89969933, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 583, + "time_per_iteration": 2.46062970161438 + }, + { + "auxiliary_loss_clip": 0.01268555, + "auxiliary_loss_mlp": 0.01082553, + "balance_loss_clip": 1.05167794, + "balance_loss_mlp": 1.07587278, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3581841085942004, + "language_loss": 0.80997103, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83348215, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.921875, + "step": 584, + "time_per_iteration": 2.4776926040649414 + }, + { + "auxiliary_loss_clip": 0.01262257, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.03326654, + "balance_loss_mlp": 1.0725317, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.6245680316153743, + "language_loss": 0.92654932, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94980395, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.8984375, + "step": 585, + "time_per_iteration": 2.486678123474121 + }, + { + "auxiliary_loss_clip": 0.01262479, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.07368612, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4855014647160245, + "language_loss": 0.87484592, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89817297, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.890625, + "step": 586, + "time_per_iteration": 2.457772970199585 + }, + { + "auxiliary_loss_clip": 0.01269677, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04137754, + "balance_loss_mlp": 1.07875896, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.7854143394247532, + "language_loss": 0.76574278, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78915149, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.90625, + "step": 587, + "time_per_iteration": 2.4794015884399414 + }, + { + "auxiliary_loss_clip": 0.01269924, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02991772, + "balance_loss_mlp": 1.07701528, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6805414217886456, + "language_loss": 0.78441286, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80772316, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.9296875, + "step": 588, + "time_per_iteration": 2.4755733013153076 + }, + { + "auxiliary_loss_clip": 0.01267146, + "auxiliary_loss_mlp": 0.01071411, + "balance_loss_clip": 1.03850961, + "balance_loss_mlp": 1.07600832, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6477303031273185, + "language_loss": 0.94003904, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96342462, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9140625, + "step": 589, + "time_per_iteration": 2.515296459197998 + }, + { + "auxiliary_loss_clip": 0.01269747, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.07632184, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.4870139863099157, + "language_loss": 0.84060037, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86397475, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 590, + "time_per_iteration": 2.583080291748047 + }, + { + "auxiliary_loss_clip": 0.01259593, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.07476449, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.031404841890899, + "language_loss": 0.86889851, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89212072, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 591, + "time_per_iteration": 2.497912883758545 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01070221, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.07271862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 3.1144902928375586, + "language_loss": 0.82980722, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85315537, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.921875, + "step": 592, + "time_per_iteration": 2.463977813720703 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.03881407, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8806680605255408, + "language_loss": 0.59741807, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61892909, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.8984375, + "step": 593, + "time_per_iteration": 3.1275696754455566 + }, + { + "auxiliary_loss_clip": 0.01262803, + "auxiliary_loss_mlp": 0.01070928, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.07810974, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8853114596204945, + "language_loss": 0.87042278, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89376009, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 594, + "time_per_iteration": 2.522805690765381 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.07309461, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.3431313884364395, + "language_loss": 0.83481348, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85809088, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8984375, + "step": 595, + "time_per_iteration": 2.565220832824707 + }, + { + "auxiliary_loss_clip": 0.01261367, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.04216576, + "balance_loss_mlp": 1.07610273, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.1278930526147426, + "language_loss": 0.96185803, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98519421, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.859375, + "step": 596, + "time_per_iteration": 2.460515260696411 + }, + { + "auxiliary_loss_clip": 0.0126361, + "auxiliary_loss_mlp": 0.0107037, + "balance_loss_clip": 1.04185498, + "balance_loss_mlp": 1.07627654, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.2167421176017204, + "language_loss": 0.82718551, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85052526, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.875, + "step": 597, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.01261023, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.0784421, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.168981908539252, + "language_loss": 0.81386817, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.828125, + "step": 598, + "time_per_iteration": 2.531188726425171 + }, + { + "auxiliary_loss_clip": 0.01254264, + "auxiliary_loss_mlp": 0.0106961, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.07570839, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.9075541218278638, + "language_loss": 0.81387949, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83711827, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.7890625, + "step": 599, + "time_per_iteration": 2.511871576309204 + }, + { + "auxiliary_loss_clip": 0.01262476, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.07350755, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1528215266255604, + "language_loss": 0.86115932, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88452661, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.890625, + "step": 600, + "time_per_iteration": 2.50054669380188 + }, + { + "auxiliary_loss_clip": 0.01254617, + "auxiliary_loss_mlp": 0.01080731, + "balance_loss_clip": 1.05133438, + "balance_loss_mlp": 1.06909621, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 3.928737875146519, + "language_loss": 0.82175761, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84511113, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8515625, + "step": 601, + "time_per_iteration": 6.795202255249023 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.02666831, + "balance_loss_mlp": 1.07096183, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.2629653513719252, + "language_loss": 0.75467926, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77777481, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8203125, + "step": 602, + "time_per_iteration": 2.503629446029663 + }, + { + "auxiliary_loss_clip": 0.01252806, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.02833962, + "balance_loss_mlp": 1.07078326, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9962737747137984, + "language_loss": 0.80078572, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82388449, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 603, + "time_per_iteration": 2.568368911743164 + }, + { + "auxiliary_loss_clip": 0.01258325, + "auxiliary_loss_mlp": 0.01061531, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.07597041, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.9836566776981934, + "language_loss": 0.86801207, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8203125, + "step": 604, + "time_per_iteration": 2.496415376663208 + }, + { + "auxiliary_loss_clip": 0.01260423, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.07688427, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.252638522711271, + "language_loss": 0.81078291, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83404416, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 605, + "time_per_iteration": 2.46071457862854 + }, + { + "auxiliary_loss_clip": 0.01255946, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.04012406, + "balance_loss_mlp": 1.07317901, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2162807408147964, + "language_loss": 0.85624671, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87947738, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.828125, + "step": 606, + "time_per_iteration": 2.450775623321533 + }, + { + "auxiliary_loss_clip": 0.01262483, + "auxiliary_loss_mlp": 0.01072166, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.07551849, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1498788116147125, + "language_loss": 0.82370651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84705305, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 607, + "time_per_iteration": 2.4969747066497803 + }, + { + "auxiliary_loss_clip": 0.01255757, + "auxiliary_loss_mlp": 0.01063348, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.07488835, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 3.329641026295442, + "language_loss": 0.8315016, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8546927, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.8046875, + "step": 608, + "time_per_iteration": 2.4648640155792236 + }, + { + "auxiliary_loss_clip": 0.01260127, + "auxiliary_loss_mlp": 0.0106578, + "balance_loss_clip": 1.03533435, + "balance_loss_mlp": 1.0769459, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.072924568315734, + "language_loss": 0.82258713, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84584618, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.828125, + "step": 609, + "time_per_iteration": 2.4761714935302734 + }, + { + "auxiliary_loss_clip": 0.01266536, + "auxiliary_loss_mlp": 0.01080333, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.08229148, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.279075715646142, + "language_loss": 0.7924515, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81592017, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.84375, + "step": 610, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01265179, + "auxiliary_loss_mlp": 0.01076881, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.07819688, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.108980449215705, + "language_loss": 0.87263799, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89605856, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 611, + "time_per_iteration": 2.488800525665283 + }, + { + "auxiliary_loss_clip": 0.01257304, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.05577183, + "balance_loss_mlp": 1.0769043, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.0539399448943145, + "language_loss": 0.72783852, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75125557, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8046875, + "step": 612, + "time_per_iteration": 2.4950740337371826 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.03999329, + "balance_loss_mlp": 1.07377708, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 2.903841869182041, + "language_loss": 0.7909385, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81421661, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 613, + "time_per_iteration": 2.4849369525909424 + }, + { + "auxiliary_loss_clip": 0.01253943, + "auxiliary_loss_mlp": 0.01079095, + "balance_loss_clip": 1.05141413, + "balance_loss_mlp": 1.07326341, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.273957434397869, + "language_loss": 0.93266213, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95599246, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8125, + "step": 614, + "time_per_iteration": 2.4639992713928223 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.01075313, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.07938302, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.901964177226116, + "language_loss": 0.72534943, + "learning_rate": 3.999489768826041e-06, + "loss": 0.74873829, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.84375, + "step": 615, + "time_per_iteration": 2.601372480392456 + }, + { + "auxiliary_loss_clip": 0.01258092, + "auxiliary_loss_mlp": 0.01071353, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.07278967, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.023635364571096, + "language_loss": 0.81449711, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83779156, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 616, + "time_per_iteration": 2.5325467586517334 + }, + { + "auxiliary_loss_clip": 0.01256707, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.03643894, + "balance_loss_mlp": 1.07431316, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9753277492127743, + "language_loss": 0.67868775, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7018863, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.828125, + "step": 617, + "time_per_iteration": 2.5784177780151367 + }, + { + "auxiliary_loss_clip": 0.01263095, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0349381, + "balance_loss_mlp": 1.07892454, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.556814357499394, + "language_loss": 0.80340034, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8266772, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.84375, + "step": 618, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01261829, + "auxiliary_loss_mlp": 0.01070874, + "balance_loss_clip": 1.04059458, + "balance_loss_mlp": 1.07458091, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.355648226269084, + "language_loss": 0.91115171, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93447876, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.875, + "step": 619, + "time_per_iteration": 2.4804162979125977 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01077134, + "balance_loss_clip": 1.04871452, + "balance_loss_mlp": 1.07845378, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.218621959424752, + "language_loss": 0.94397002, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96734041, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8125, + "step": 620, + "time_per_iteration": 2.4592232704162598 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.01077616, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.07455909, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.8159025601621845, + "language_loss": 0.77105826, + "learning_rate": 3.999435623772008e-06, + "loss": 0.7944091, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 621, + "time_per_iteration": 2.53365159034729 + }, + { + "auxiliary_loss_clip": 0.01255819, + "auxiliary_loss_mlp": 0.01059811, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.07761526, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.793013868715132, + "language_loss": 0.86895752, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89211386, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 622, + "time_per_iteration": 2.472726583480835 + }, + { + "auxiliary_loss_clip": 0.01258428, + "auxiliary_loss_mlp": 0.01064577, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.07622766, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.261361439009279, + "language_loss": 0.90376818, + "learning_rate": 3.999416968866581e-06, + "loss": 0.9269982, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 623, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.0125978, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.04626298, + "balance_loss_mlp": 1.07841158, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9910669563462169, + "language_loss": 0.84149444, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86484373, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8125, + "step": 624, + "time_per_iteration": 2.4514520168304443 + }, + { + "auxiliary_loss_clip": 0.01261437, + "auxiliary_loss_mlp": 0.01067743, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.0750618, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.4867963928692554, + "language_loss": 0.66228586, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68557763, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8671875, + "step": 625, + "time_per_iteration": 2.5765273571014404 + }, + { + "auxiliary_loss_clip": 0.01253583, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.03697979, + "balance_loss_mlp": 1.07435441, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.071255255654034, + "language_loss": 0.77375329, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79696059, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7890625, + "step": 626, + "time_per_iteration": 2.5022406578063965 + }, + { + "auxiliary_loss_clip": 0.01258684, + "auxiliary_loss_mlp": 0.01074389, + "balance_loss_clip": 1.04499173, + "balance_loss_mlp": 1.07735705, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.2077512286027288, + "language_loss": 0.81357861, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83690929, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 627, + "time_per_iteration": 2.4750607013702393 + }, + { + "auxiliary_loss_clip": 0.01261632, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.07859111, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 3.546199216596373, + "language_loss": 0.88572276, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90910852, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 628, + "time_per_iteration": 2.571899890899658 + }, + { + "auxiliary_loss_clip": 0.01253553, + "auxiliary_loss_mlp": 0.01067038, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.07086658, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.488861546346732, + "language_loss": 0.79683006, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82003593, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.828125, + "step": 629, + "time_per_iteration": 2.486675262451172 + }, + { + "auxiliary_loss_clip": 0.01258011, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.07458425, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7117761504495859, + "language_loss": 0.76808703, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79134536, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.8359375, + "step": 630, + "time_per_iteration": 2.494297742843628 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.01070638, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.07651484, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.6765452133705403, + "language_loss": 0.91492796, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93826187, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.859375, + "step": 631, + "time_per_iteration": 2.4605348110198975 + }, + { + "auxiliary_loss_clip": 0.01252436, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.04560196, + "balance_loss_mlp": 1.07244229, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.669704350294595, + "language_loss": 0.9207651, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94405663, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.796875, + "step": 632, + "time_per_iteration": 2.518498659133911 + }, + { + "auxiliary_loss_clip": 0.01255106, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.07462335, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0828864645498872, + "language_loss": 0.8341018, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85723758, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8046875, + "step": 633, + "time_per_iteration": 2.5217537879943848 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.01071025, + "balance_loss_clip": 1.04153264, + "balance_loss_mlp": 1.07408428, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6987522649376106, + "language_loss": 0.69638437, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71967685, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.84375, + "step": 634, + "time_per_iteration": 2.5694239139556885 + }, + { + "auxiliary_loss_clip": 0.01127675, + "auxiliary_loss_mlp": 0.01017483, + "balance_loss_clip": 1.0106163, + "balance_loss_mlp": 1.04225707, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8852243261294688, + "language_loss": 0.61585373, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63730532, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.8515625, + "step": 635, + "time_per_iteration": 3.1059212684631348 + }, + { + "auxiliary_loss_clip": 0.01253433, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07354546, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2313569204055246, + "language_loss": 0.83721048, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86043108, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.796875, + "step": 636, + "time_per_iteration": 2.4975383281707764 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.04852867, + "balance_loss_mlp": 1.07623935, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.4018992949787847, + "language_loss": 0.79327047, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8166306, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8203125, + "step": 637, + "time_per_iteration": 2.4560744762420654 + }, + { + "auxiliary_loss_clip": 0.01258084, + "auxiliary_loss_mlp": 0.01073075, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.07309079, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.8779285506389924, + "language_loss": 0.8410306, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86434221, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 638, + "time_per_iteration": 2.504343271255493 + }, + { + "auxiliary_loss_clip": 0.01263348, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.07495832, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5416523890288976, + "language_loss": 0.70099992, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72431237, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.890625, + "step": 639, + "time_per_iteration": 2.52817964553833 + }, + { + "auxiliary_loss_clip": 0.01259266, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.04463232, + "balance_loss_mlp": 1.07514286, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.42201861797838, + "language_loss": 0.85030365, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8736524, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 640, + "time_per_iteration": 2.503262758255005 + }, + { + "auxiliary_loss_clip": 0.0126167, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04725742, + "balance_loss_mlp": 1.07574821, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3722848939528953, + "language_loss": 0.82117289, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84458065, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.859375, + "step": 641, + "time_per_iteration": 2.51052188873291 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.01008303, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.03414774, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9008353353488252, + "language_loss": 0.6540072, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67528021, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.06225586, + "router_z_loss_mlp": 0.8515625, + "step": 642, + "time_per_iteration": 4.430839538574219 + }, + { + "auxiliary_loss_clip": 0.01256856, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.07364345, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9870813050305103, + "language_loss": 0.79512584, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81832051, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8359375, + "step": 643, + "time_per_iteration": 5.386199951171875 + }, + { + "auxiliary_loss_clip": 0.01255871, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.07266629, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.074949815918338, + "language_loss": 0.82926929, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85257208, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.828125, + "step": 644, + "time_per_iteration": 2.45499587059021 + }, + { + "auxiliary_loss_clip": 0.01260265, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.07482159, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.258008571643512, + "language_loss": 0.82131916, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84458399, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.859375, + "step": 645, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.0126099, + "auxiliary_loss_mlp": 0.01070847, + "balance_loss_clip": 1.04121125, + "balance_loss_mlp": 1.07544899, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4729923618605554, + "language_loss": 0.82006776, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 646, + "time_per_iteration": 2.4771342277526855 + }, + { + "auxiliary_loss_clip": 0.01260575, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.07928514, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8327945326632593, + "language_loss": 0.81973422, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84314579, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 647, + "time_per_iteration": 2.522347927093506 + }, + { + "auxiliary_loss_clip": 0.01260388, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03965366, + "balance_loss_mlp": 1.07776546, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9222642653000834, + "language_loss": 0.84699827, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87029266, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 648, + "time_per_iteration": 2.561929941177368 + }, + { + "auxiliary_loss_clip": 0.01258218, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.07636404, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7283662397985053, + "language_loss": 0.84446943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86776626, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8203125, + "step": 649, + "time_per_iteration": 2.477027416229248 + }, + { + "auxiliary_loss_clip": 0.01259496, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.03977561, + "balance_loss_mlp": 1.07551885, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8508721849532739, + "language_loss": 0.79670662, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8200019, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.84375, + "step": 650, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.0125375, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_clip": 1.04314423, + "balance_loss_mlp": 1.07259929, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.708739352564946, + "language_loss": 0.78509629, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80836356, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 651, + "time_per_iteration": 2.4757516384124756 + }, + { + "auxiliary_loss_clip": 0.01255418, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_clip": 1.05004883, + "balance_loss_mlp": 1.0719974, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.7896665115169244, + "language_loss": 0.88031149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90369117, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 652, + "time_per_iteration": 2.4425668716430664 + }, + { + "auxiliary_loss_clip": 0.01249027, + "auxiliary_loss_mlp": 0.01069663, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.07108784, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.185528651545475, + "language_loss": 0.79044777, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363463, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.78125, + "step": 653, + "time_per_iteration": 2.5651934146881104 + }, + { + "auxiliary_loss_clip": 0.01264568, + "auxiliary_loss_mlp": 0.01070462, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.07603264, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.207303268368246, + "language_loss": 0.86304128, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88639158, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8828125, + "step": 654, + "time_per_iteration": 2.533297061920166 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01012751, + "balance_loss_clip": 1.00710094, + "balance_loss_mlp": 1.03246427, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7205066186016396, + "language_loss": 0.49900642, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5202843, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.82421875, + "step": 655, + "time_per_iteration": 3.1399919986724854 + }, + { + "auxiliary_loss_clip": 0.01251012, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.03866971, + "balance_loss_mlp": 1.07330465, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.4228021909793918, + "language_loss": 0.80845964, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83163846, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.78125, + "step": 656, + "time_per_iteration": 2.5063297748565674 + }, + { + "auxiliary_loss_clip": 0.01264211, + "auxiliary_loss_mlp": 0.0109165, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.07672703, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.8602268717749526, + "language_loss": 0.76602596, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78958458, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.875, + "step": 657, + "time_per_iteration": 2.4405555725097656 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01071967, + "balance_loss_clip": 1.04192615, + "balance_loss_mlp": 1.07452726, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.1526815744488945, + "language_loss": 0.81690443, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84020746, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.84375, + "step": 658, + "time_per_iteration": 2.5383949279785156 + }, + { + "auxiliary_loss_clip": 0.01252051, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.04091132, + "balance_loss_mlp": 1.07283425, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2075021313123777, + "language_loss": 0.91331315, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93656039, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.796875, + "step": 659, + "time_per_iteration": 2.4678854942321777 + }, + { + "auxiliary_loss_clip": 0.01259034, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.07427669, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.5412719342676215, + "language_loss": 0.79241848, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81567293, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 660, + "time_per_iteration": 2.5135834217071533 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.07534087, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6909533460123631, + "language_loss": 0.81942898, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84269351, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.796875, + "step": 661, + "time_per_iteration": 2.513702154159546 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01010967, + "balance_loss_clip": 1.00519753, + "balance_loss_mlp": 1.03039932, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9113020435813882, + "language_loss": 0.69376045, + "learning_rate": 3.998992585439272e-06, + "loss": 0.7149995, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.82421875, + "step": 662, + "time_per_iteration": 3.2435107231140137 + }, + { + "auxiliary_loss_clip": 0.01260063, + "auxiliary_loss_mlp": 0.01071537, + "balance_loss_clip": 1.04113865, + "balance_loss_mlp": 1.0779382, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.025040011333182, + "language_loss": 0.83253002, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85584599, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.8125, + "step": 663, + "time_per_iteration": 2.5213887691497803 + }, + { + "auxiliary_loss_clip": 0.01261822, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.032125, + "balance_loss_mlp": 1.07768416, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.8595031628608143, + "language_loss": 0.87538105, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89862621, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.84375, + "step": 664, + "time_per_iteration": 2.516810655593872 + }, + { + "auxiliary_loss_clip": 0.0125116, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.07347679, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.3519362819230625, + "language_loss": 0.84738994, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87050784, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.7734375, + "step": 665, + "time_per_iteration": 2.4348978996276855 + }, + { + "auxiliary_loss_clip": 0.01263346, + "auxiliary_loss_mlp": 0.01087391, + "balance_loss_clip": 1.05525231, + "balance_loss_mlp": 1.07680821, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.1279588772882687, + "language_loss": 0.81491798, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83842534, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.8671875, + "step": 666, + "time_per_iteration": 2.564187526702881 + }, + { + "auxiliary_loss_clip": 0.01252779, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.07225358, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.9939634291419526, + "language_loss": 0.87121451, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89449108, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.8046875, + "step": 667, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.07692444, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.627098567014159, + "language_loss": 0.80619991, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82938576, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7578125, + "step": 668, + "time_per_iteration": 2.441667079925537 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01011229, + "balance_loss_clip": 1.0053643, + "balance_loss_mlp": 1.02968836, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7872457900726799, + "language_loss": 0.60042131, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62164247, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.8125, + "step": 669, + "time_per_iteration": 3.200874090194702 + }, + { + "auxiliary_loss_clip": 0.01253738, + "auxiliary_loss_mlp": 0.0107276, + "balance_loss_clip": 1.0431962, + "balance_loss_mlp": 1.07228541, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7415828974469272, + "language_loss": 0.86405391, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88731897, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 670, + "time_per_iteration": 2.5169434547424316 + }, + { + "auxiliary_loss_clip": 0.0124964, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.0414381, + "balance_loss_mlp": 1.07305872, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.9261739939324196, + "language_loss": 0.752123, + "learning_rate": 3.998878276622692e-06, + "loss": 0.7753256, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.765625, + "step": 671, + "time_per_iteration": 2.514566421508789 + }, + { + "auxiliary_loss_clip": 0.01259516, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.04472136, + "balance_loss_mlp": 1.0774349, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0846907245314688, + "language_loss": 0.92279977, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94614637, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8203125, + "step": 672, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01253491, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_clip": 1.03921115, + "balance_loss_mlp": 1.07329202, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.816355722874097, + "language_loss": 0.90220857, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92545515, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.796875, + "step": 673, + "time_per_iteration": 2.450547456741333 + }, + { + "auxiliary_loss_clip": 0.01249229, + "auxiliary_loss_mlp": 0.01077482, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.07150948, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.117589951798075, + "language_loss": 0.74881005, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77207714, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.78125, + "step": 674, + "time_per_iteration": 2.5444436073303223 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01061202, + "balance_loss_clip": 1.03036261, + "balance_loss_mlp": 1.07609737, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.2422867770418797, + "language_loss": 0.78305578, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80627763, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 675, + "time_per_iteration": 2.4525954723358154 + }, + { + "auxiliary_loss_clip": 0.01252319, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.04578447, + "balance_loss_mlp": 1.07254028, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7072695919905723, + "language_loss": 0.76650077, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78981006, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.796875, + "step": 676, + "time_per_iteration": 2.530043840408325 + }, + { + "auxiliary_loss_clip": 0.01258388, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.0750767, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.3168648577819138, + "language_loss": 0.85182011, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87516803, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.828125, + "step": 677, + "time_per_iteration": 2.4390082359313965 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.03804517, + "balance_loss_mlp": 1.071486, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.7808730288109123, + "language_loss": 0.76348364, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78666306, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.78125, + "step": 678, + "time_per_iteration": 2.5151596069335938 + }, + { + "auxiliary_loss_clip": 0.01250603, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_clip": 1.03807509, + "balance_loss_mlp": 1.07162285, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9938089142752387, + "language_loss": 0.82114184, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84431279, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7890625, + "step": 679, + "time_per_iteration": 2.5701568126678467 + }, + { + "auxiliary_loss_clip": 0.01255726, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.07693028, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.893911305727382, + "language_loss": 0.76349533, + "learning_rate": 3.998757828196835e-06, + "loss": 0.7866298, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7890625, + "step": 680, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01255007, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.03305268, + "balance_loss_mlp": 1.07167506, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.7999776318515568, + "language_loss": 0.83315849, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.8563633, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 681, + "time_per_iteration": 2.5313305854797363 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.03302324, + "balance_loss_mlp": 1.07082057, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.6690976928218293, + "language_loss": 0.71312869, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73630697, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.828125, + "step": 682, + "time_per_iteration": 2.5190017223358154 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.07090235, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7744847161326498, + "language_loss": 0.72373003, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74692667, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8046875, + "step": 683, + "time_per_iteration": 2.473156690597534 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.01075324, + "balance_loss_clip": 1.04540253, + "balance_loss_mlp": 1.07707, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.316908811268422, + "language_loss": 0.81263745, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83589774, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 684, + "time_per_iteration": 5.34027099609375 + }, + { + "auxiliary_loss_clip": 0.01251905, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.07572865, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5327144156887007, + "language_loss": 0.90501672, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92825842, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.765625, + "step": 685, + "time_per_iteration": 3.918776750564575 + }, + { + "auxiliary_loss_clip": 0.01253389, + "auxiliary_loss_mlp": 0.010703, + "balance_loss_clip": 1.04002118, + "balance_loss_mlp": 1.07458997, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.0402082016953234, + "language_loss": 0.87871253, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90194941, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.7890625, + "step": 686, + "time_per_iteration": 2.481177806854248 + }, + { + "auxiliary_loss_clip": 0.01258153, + "auxiliary_loss_mlp": 0.01071669, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.07474661, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.7716861202834375, + "language_loss": 0.71645427, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73975253, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8359375, + "step": 687, + "time_per_iteration": 2.4720261096954346 + }, + { + "auxiliary_loss_clip": 0.01252382, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.04262209, + "balance_loss_mlp": 1.07918715, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.117746024922212, + "language_loss": 0.8642537, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88748431, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.734375, + "step": 688, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01249454, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07534754, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.6487514234328304, + "language_loss": 0.83326006, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85658503, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7421875, + "step": 689, + "time_per_iteration": 2.4689462184906006 + }, + { + "auxiliary_loss_clip": 0.01248134, + "auxiliary_loss_mlp": 0.01077255, + "balance_loss_clip": 1.04847789, + "balance_loss_mlp": 1.07176828, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.7821885346326607, + "language_loss": 0.68391848, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70717239, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.765625, + "step": 690, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.012458, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.04197323, + "balance_loss_mlp": 1.07094526, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.747700039366933, + "language_loss": 0.74933273, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77250373, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 691, + "time_per_iteration": 2.4566729068756104 + }, + { + "auxiliary_loss_clip": 0.01246178, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04890203, + "balance_loss_mlp": 1.07268727, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.450885846250815, + "language_loss": 0.84518701, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86843991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.734375, + "step": 692, + "time_per_iteration": 2.4667932987213135 + }, + { + "auxiliary_loss_clip": 0.01252043, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.07099986, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 9.166238009589804, + "language_loss": 0.89107299, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9143213, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.8125, + "step": 693, + "time_per_iteration": 2.4823052883148193 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.03637171, + "balance_loss_mlp": 1.07755136, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.1462970179067646, + "language_loss": 0.82179356, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84500182, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 694, + "time_per_iteration": 2.564098834991455 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.07214785, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.057768586122239, + "language_loss": 0.83656573, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85977334, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 695, + "time_per_iteration": 2.5122969150543213 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01073319, + "balance_loss_clip": 1.04270577, + "balance_loss_mlp": 1.07313716, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.138642052855673, + "language_loss": 0.8441087, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86734056, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.765625, + "step": 696, + "time_per_iteration": 2.462756872177124 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.01073791, + "balance_loss_clip": 1.04253471, + "balance_loss_mlp": 1.07146811, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.042298821772003, + "language_loss": 0.93134123, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95455778, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.765625, + "step": 697, + "time_per_iteration": 2.5189502239227295 + }, + { + "auxiliary_loss_clip": 0.0124398, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.07146859, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.2837511795811207, + "language_loss": 0.83989406, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86302388, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.71875, + "step": 698, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.01247569, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.05551505, + "balance_loss_mlp": 1.0711751, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9405760650289445, + "language_loss": 0.91369909, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93704206, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.7578125, + "step": 699, + "time_per_iteration": 2.4667766094207764 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.01842487, + "balance_loss_mlp": 1.03384757, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8964375713204716, + "language_loss": 0.67850006, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69987792, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.796875, + "step": 700, + "time_per_iteration": 3.1214911937713623 + }, + { + "auxiliary_loss_clip": 0.01254452, + "auxiliary_loss_mlp": 0.01078478, + "balance_loss_clip": 1.04695964, + "balance_loss_mlp": 1.07502532, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.6789371965697524, + "language_loss": 0.89020562, + "learning_rate": 3.998452907725016e-06, + "loss": 0.913535, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 701, + "time_per_iteration": 2.46085524559021 + }, + { + "auxiliary_loss_clip": 0.01250018, + "auxiliary_loss_mlp": 0.0107128, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.07681179, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2592774096130794, + "language_loss": 0.67494118, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69815421, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 702, + "time_per_iteration": 2.5170979499816895 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01006834, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.03296542, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8426087453226233, + "language_loss": 0.60777819, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62897617, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.80078125, + "step": 703, + "time_per_iteration": 3.155794143676758 + }, + { + "auxiliary_loss_clip": 0.01112196, + "auxiliary_loss_mlp": 0.01010352, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 1.03251982, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167549333074237, + "language_loss": 0.5776214, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.796875, + "step": 704, + "time_per_iteration": 2.95633602142334 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01070665, + "balance_loss_clip": 1.0397656, + "balance_loss_mlp": 1.07432342, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1970745802550624, + "language_loss": 0.87708455, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90031266, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 705, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.01238458, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.06876624, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7054575923778923, + "language_loss": 0.71612352, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73913229, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 706, + "time_per_iteration": 2.464270830154419 + }, + { + "auxiliary_loss_clip": 0.01243119, + "auxiliary_loss_mlp": 0.01068207, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.07029784, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0927829932503714, + "language_loss": 0.93480223, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95791554, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 707, + "time_per_iteration": 2.5087966918945312 + }, + { + "auxiliary_loss_clip": 0.01245928, + "auxiliary_loss_mlp": 0.01065311, + "balance_loss_clip": 1.03441203, + "balance_loss_mlp": 1.0676806, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.3244890877745883, + "language_loss": 0.81275034, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83586276, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 708, + "time_per_iteration": 2.557119607925415 + }, + { + "auxiliary_loss_clip": 0.01251091, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_clip": 1.04239082, + "balance_loss_mlp": 1.07195199, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.2553269788690224, + "language_loss": 0.82229173, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84553528, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.796875, + "step": 709, + "time_per_iteration": 2.4828600883483887 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.01064315, + "balance_loss_clip": 1.03389335, + "balance_loss_mlp": 1.07517564, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.534138916450152, + "language_loss": 0.85063422, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87383747, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8125, + "step": 710, + "time_per_iteration": 2.453641653060913 + }, + { + "auxiliary_loss_clip": 0.01254724, + "auxiliary_loss_mlp": 0.01070713, + "balance_loss_clip": 1.04114938, + "balance_loss_mlp": 1.07757199, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.316207411440496, + "language_loss": 0.84996349, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87321782, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7734375, + "step": 711, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01246695, + "auxiliary_loss_mlp": 0.01069917, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.07044697, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 2.000925777751644, + "language_loss": 0.85439169, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87755781, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.765625, + "step": 712, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.0107294, + "balance_loss_clip": 1.0445205, + "balance_loss_mlp": 1.0701685, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.2453781921901728, + "language_loss": 0.90829903, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9315542, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8203125, + "step": 713, + "time_per_iteration": 2.4908998012542725 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01017546, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.0288384, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8777811618173876, + "language_loss": 0.63746506, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65872955, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.80078125, + "step": 714, + "time_per_iteration": 3.158921480178833 + }, + { + "auxiliary_loss_clip": 0.01249012, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_clip": 1.05076694, + "balance_loss_mlp": 1.07545531, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.1622955343434382, + "language_loss": 0.74528754, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76858354, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 715, + "time_per_iteration": 2.5759642124176025 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.07450986, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2108029839954213, + "language_loss": 0.72630137, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74957311, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7578125, + "step": 716, + "time_per_iteration": 2.5973668098449707 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.02687418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9052113850529176, + "language_loss": 0.65557301, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67669141, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.78125, + "step": 717, + "time_per_iteration": 3.114957571029663 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 0.99780369, + "balance_loss_mlp": 1.02667391, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9880116621267147, + "language_loss": 0.58762264, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60871184, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.78125, + "step": 718, + "time_per_iteration": 2.910278797149658 + }, + { + "auxiliary_loss_clip": 0.01248398, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.04314709, + "balance_loss_mlp": 1.0758605, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8513004644505335, + "language_loss": 0.91198725, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93521935, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7265625, + "step": 719, + "time_per_iteration": 2.492509126663208 + }, + { + "auxiliary_loss_clip": 0.01244347, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.04208493, + "balance_loss_mlp": 1.06931555, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.803377327315558, + "language_loss": 0.66468138, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68783891, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 720, + "time_per_iteration": 2.6061203479766846 + }, + { + "auxiliary_loss_clip": 0.01244682, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.04895782, + "balance_loss_mlp": 1.07152998, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.8832143461121282, + "language_loss": 0.77743989, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80068195, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 721, + "time_per_iteration": 2.5255632400512695 + }, + { + "auxiliary_loss_clip": 0.01251204, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.05879569, + "balance_loss_mlp": 1.07584524, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.027898330451403, + "language_loss": 0.87873065, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90212011, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.75, + "step": 722, + "time_per_iteration": 2.536283493041992 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.01075404, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.0758208, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 30.376200688873947, + "language_loss": 0.84770942, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87099999, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 723, + "time_per_iteration": 2.5167360305786133 + }, + { + "auxiliary_loss_clip": 0.01256754, + "auxiliary_loss_mlp": 0.01076494, + "balance_loss_clip": 1.04638171, + "balance_loss_mlp": 1.07828176, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.9203333396820472, + "language_loss": 0.82793808, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85127056, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.78125, + "step": 724, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.0125067, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_clip": 1.05975556, + "balance_loss_mlp": 1.07561088, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.8200683460759586, + "language_loss": 0.79530561, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81871551, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.75, + "step": 725, + "time_per_iteration": 2.4551918506622314 + }, + { + "auxiliary_loss_clip": 0.0126067, + "auxiliary_loss_mlp": 0.01076358, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.07715642, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.8106150104808485, + "language_loss": 0.87100697, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89437729, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.8359375, + "step": 726, + "time_per_iteration": 5.350574731826782 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01011263, + "balance_loss_clip": 1.00542223, + "balance_loss_mlp": 1.02866364, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9088619113669424, + "language_loss": 0.5587045, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57988632, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.78125, + "step": 727, + "time_per_iteration": 3.1539440155029297 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01076851, + "balance_loss_clip": 1.04676282, + "balance_loss_mlp": 1.07453549, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.397861957488019, + "language_loss": 0.82248902, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84576982, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.765625, + "step": 728, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01244631, + "auxiliary_loss_mlp": 0.01068516, + "balance_loss_clip": 1.0395956, + "balance_loss_mlp": 1.07265663, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2715062050859745, + "language_loss": 0.77187145, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79500294, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.71875, + "step": 729, + "time_per_iteration": 2.5091514587402344 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_clip": 1.04502177, + "balance_loss_mlp": 1.07452357, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.258754879989397, + "language_loss": 0.9515503, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97482038, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.78125, + "step": 730, + "time_per_iteration": 2.4795522689819336 + }, + { + "auxiliary_loss_clip": 0.0124716, + "auxiliary_loss_mlp": 0.01065838, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.07000017, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.2097226025839483, + "language_loss": 0.88016784, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90329784, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.7734375, + "step": 731, + "time_per_iteration": 2.4678709506988525 + }, + { + "auxiliary_loss_clip": 0.01251191, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.07521737, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.3707184473936587, + "language_loss": 0.88656235, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90980744, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7578125, + "step": 732, + "time_per_iteration": 2.4135851860046387 + }, + { + "auxiliary_loss_clip": 0.01251086, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.04288554, + "balance_loss_mlp": 1.07443762, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.131822645051773, + "language_loss": 0.86010063, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88334322, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.765625, + "step": 733, + "time_per_iteration": 2.491278648376465 + }, + { + "auxiliary_loss_clip": 0.01256254, + "auxiliary_loss_mlp": 0.01078649, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.07624841, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.0564057381838885, + "language_loss": 0.91515708, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93850613, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 734, + "time_per_iteration": 2.451258897781372 + }, + { + "auxiliary_loss_clip": 0.01247796, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.03696656, + "balance_loss_mlp": 1.07613921, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8863467898976456, + "language_loss": 0.77831066, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8014316, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.71875, + "step": 735, + "time_per_iteration": 2.558958053588867 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.03526342, + "balance_loss_mlp": 1.06886315, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.1337917025346074, + "language_loss": 0.88456166, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90760267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 736, + "time_per_iteration": 2.5100033283233643 + }, + { + "auxiliary_loss_clip": 0.01242163, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.07473993, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.954630170969084, + "language_loss": 0.84155536, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86464787, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 737, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01251899, + "auxiliary_loss_mlp": 0.01072468, + "balance_loss_clip": 1.04077065, + "balance_loss_mlp": 1.07667851, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0376910697928947, + "language_loss": 0.8518666, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87511027, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.75, + "step": 738, + "time_per_iteration": 2.5576610565185547 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04048622, + "balance_loss_mlp": 1.03298163, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8684121686227821, + "language_loss": 0.59110028, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61268163, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.7890625, + "step": 739, + "time_per_iteration": 3.0643718242645264 + }, + { + "auxiliary_loss_clip": 0.0124678, + "auxiliary_loss_mlp": 0.01070548, + "balance_loss_clip": 1.04220033, + "balance_loss_mlp": 1.07513726, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1860888775648695, + "language_loss": 0.91622591, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93939924, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.71875, + "step": 740, + "time_per_iteration": 2.5448389053344727 + }, + { + "auxiliary_loss_clip": 0.01252276, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.07766986, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.01644947055736, + "language_loss": 0.71842492, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.7421875, + "step": 741, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01240373, + "auxiliary_loss_mlp": 0.01073056, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.07044411, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.722907957661965, + "language_loss": 0.88555831, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9086926, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.703125, + "step": 742, + "time_per_iteration": 2.6367549896240234 + }, + { + "auxiliary_loss_clip": 0.0124233, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.07209873, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0306401320231693, + "language_loss": 0.83823264, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86127412, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.703125, + "step": 743, + "time_per_iteration": 2.516587972640991 + }, + { + "auxiliary_loss_clip": 0.01249271, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.07474804, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 3.0889105946672704, + "language_loss": 0.79948521, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8226651, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.75, + "step": 744, + "time_per_iteration": 2.44805645942688 + }, + { + "auxiliary_loss_clip": 0.01243449, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.04037201, + "balance_loss_mlp": 1.07279778, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.388036535067576, + "language_loss": 0.85400093, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87710881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.703125, + "step": 745, + "time_per_iteration": 2.4790430068969727 + }, + { + "auxiliary_loss_clip": 0.01242131, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.0714339, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.1215702602167688, + "language_loss": 0.6866799, + "learning_rate": 3.997686978575302e-06, + "loss": 0.70974648, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.7109375, + "step": 746, + "time_per_iteration": 2.5645759105682373 + }, + { + "auxiliary_loss_clip": 0.01250748, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.04748797, + "balance_loss_mlp": 1.0783143, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.1376273799467547, + "language_loss": 0.68823957, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71152306, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 747, + "time_per_iteration": 2.5267317295074463 + }, + { + "auxiliary_loss_clip": 0.01248685, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.07314527, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9669744064389407, + "language_loss": 0.66721869, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69050002, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.75, + "step": 748, + "time_per_iteration": 2.4818925857543945 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.01082391, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.07779491, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.650057046326624, + "language_loss": 0.76540357, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78872949, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.7265625, + "step": 749, + "time_per_iteration": 2.4454426765441895 + }, + { + "auxiliary_loss_clip": 0.01251335, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.0770005, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.0345099055640317, + "language_loss": 0.88970172, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91298997, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 750, + "time_per_iteration": 2.458716630935669 + }, + { + "auxiliary_loss_clip": 0.01247033, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.07139015, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.3716924268159367, + "language_loss": 0.74869245, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77190608, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.7578125, + "step": 751, + "time_per_iteration": 2.5231218338012695 + }, + { + "auxiliary_loss_clip": 0.01243504, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.04573071, + "balance_loss_mlp": 1.07175446, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.2224468826240975, + "language_loss": 0.69360238, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7167744, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 752, + "time_per_iteration": 2.4620048999786377 + }, + { + "auxiliary_loss_clip": 0.01244736, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.07154715, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.984649176219999, + "language_loss": 0.91634125, + "learning_rate": 3.997554045527305e-06, + "loss": 0.9393605, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.734375, + "step": 753, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01249124, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04728031, + "balance_loss_mlp": 1.07501864, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2056938633592975, + "language_loss": 0.91197902, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93522525, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.7421875, + "step": 754, + "time_per_iteration": 2.472975492477417 + }, + { + "auxiliary_loss_clip": 0.01238249, + "auxiliary_loss_mlp": 0.0107062, + "balance_loss_clip": 1.04144955, + "balance_loss_mlp": 1.07163191, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.234660546964849, + "language_loss": 0.78528345, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80837214, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.671875, + "step": 755, + "time_per_iteration": 2.4704174995422363 + }, + { + "auxiliary_loss_clip": 0.01248815, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.05100918, + "balance_loss_mlp": 1.07416105, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9667934561660614, + "language_loss": 0.78451371, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80779994, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.75, + "step": 756, + "time_per_iteration": 2.4873547554016113 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.010118, + "balance_loss_clip": 1.00600612, + "balance_loss_mlp": 1.03558636, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8118987787253854, + "language_loss": 0.62730747, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64860779, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.828125, + "step": 757, + "time_per_iteration": 3.1292941570281982 + }, + { + "auxiliary_loss_clip": 0.01242797, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_clip": 1.04220784, + "balance_loss_mlp": 1.0731318, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5194495460848947, + "language_loss": 0.84329176, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86641645, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 758, + "time_per_iteration": 2.498905658721924 + }, + { + "auxiliary_loss_clip": 0.01237511, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.06733441, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.0933163310434963, + "language_loss": 0.88315606, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90620202, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 759, + "time_per_iteration": 2.5122711658477783 + }, + { + "auxiliary_loss_clip": 0.01248241, + "auxiliary_loss_mlp": 0.01075804, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.075526, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.170817451496144, + "language_loss": 0.73644727, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75968778, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7265625, + "step": 760, + "time_per_iteration": 2.511322021484375 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.04630077, + "balance_loss_mlp": 1.07509935, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.192184725657734, + "language_loss": 0.82177126, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84495443, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6953125, + "step": 761, + "time_per_iteration": 2.4831535816192627 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.06961203, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7986428347309282, + "language_loss": 0.79732436, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82041955, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6953125, + "step": 762, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.03112733, + "balance_loss_mlp": 1.03455913, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.008821564963746, + "language_loss": 0.58659625, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60813344, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.82421875, + "step": 763, + "time_per_iteration": 3.1429429054260254 + }, + { + "auxiliary_loss_clip": 0.01245459, + "auxiliary_loss_mlp": 0.01081866, + "balance_loss_clip": 1.05381632, + "balance_loss_mlp": 1.07288039, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8717486924500517, + "language_loss": 0.87752867, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.9008019, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.7265625, + "step": 764, + "time_per_iteration": 2.4727554321289062 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01079864, + "balance_loss_clip": 1.05192137, + "balance_loss_mlp": 1.07565248, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9426139778845304, + "language_loss": 0.86118066, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88445938, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 765, + "time_per_iteration": 2.5370731353759766 + }, + { + "auxiliary_loss_clip": 0.01248646, + "auxiliary_loss_mlp": 0.01070241, + "balance_loss_clip": 1.03912735, + "balance_loss_mlp": 1.07336497, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.0624701923152453, + "language_loss": 0.87846982, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90165865, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.75, + "step": 766, + "time_per_iteration": 2.475677013397217 + }, + { + "auxiliary_loss_clip": 0.01239894, + "auxiliary_loss_mlp": 0.01067957, + "balance_loss_clip": 1.03979921, + "balance_loss_mlp": 1.06896472, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.616885530601855, + "language_loss": 0.84314167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86622024, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 767, + "time_per_iteration": 2.465449810028076 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.01060911, + "balance_loss_clip": 1.03249097, + "balance_loss_mlp": 1.07569289, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.056931367891973, + "language_loss": 0.87013769, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89320099, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.703125, + "step": 768, + "time_per_iteration": 5.441957235336304 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.01069073, + "balance_loss_clip": 1.04184508, + "balance_loss_mlp": 1.06768477, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.1199205591749033, + "language_loss": 0.75022334, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77329946, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.703125, + "step": 769, + "time_per_iteration": 2.5294675827026367 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.03734684, + "balance_loss_mlp": 1.07084632, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.886534334963383, + "language_loss": 0.86162585, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88464236, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.671875, + "step": 770, + "time_per_iteration": 2.4646449089050293 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0385015, + "balance_loss_mlp": 1.07160687, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.8625416592988477, + "language_loss": 0.87259042, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89573061, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.765625, + "step": 771, + "time_per_iteration": 2.512622117996216 + }, + { + "auxiliary_loss_clip": 0.01246333, + "auxiliary_loss_mlp": 0.01076832, + "balance_loss_clip": 1.04726815, + "balance_loss_mlp": 1.06911707, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.3640102097360587, + "language_loss": 0.83736801, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86059964, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 772, + "time_per_iteration": 2.509572982788086 + }, + { + "auxiliary_loss_clip": 0.01243608, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.07392263, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3097217333215694, + "language_loss": 0.73399591, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75707257, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 773, + "time_per_iteration": 2.5539331436157227 + }, + { + "auxiliary_loss_clip": 0.01240234, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_clip": 1.03624654, + "balance_loss_mlp": 1.07288945, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.066531290075925, + "language_loss": 0.78523052, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80828828, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.671875, + "step": 774, + "time_per_iteration": 2.5350210666656494 + }, + { + "auxiliary_loss_clip": 0.01239038, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.03955007, + "balance_loss_mlp": 1.07101154, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.187480231527322, + "language_loss": 0.73357666, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75662553, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6796875, + "step": 775, + "time_per_iteration": 2.6102981567382812 + }, + { + "auxiliary_loss_clip": 0.01240703, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.03096998, + "balance_loss_mlp": 1.06996655, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.5904648869830247, + "language_loss": 0.77037287, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79337239, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.703125, + "step": 776, + "time_per_iteration": 2.4713642597198486 + }, + { + "auxiliary_loss_clip": 0.01236202, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03790653, + "balance_loss_mlp": 1.06914115, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9751950676431418, + "language_loss": 0.70967531, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73267508, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.671875, + "step": 777, + "time_per_iteration": 2.540151596069336 + }, + { + "auxiliary_loss_clip": 0.01242182, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.07221043, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9852588200641685, + "language_loss": 0.76756501, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79076868, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 778, + "time_per_iteration": 2.5299642086029053 + }, + { + "auxiliary_loss_clip": 0.01236882, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.06948996, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.9364819041983576, + "language_loss": 0.78900939, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81206226, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.671875, + "step": 779, + "time_per_iteration": 2.4999477863311768 + }, + { + "auxiliary_loss_clip": 0.01230899, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.03997588, + "balance_loss_mlp": 1.06776333, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.7037291099106273, + "language_loss": 0.77051055, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7934795, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 780, + "time_per_iteration": 2.54770565032959 + }, + { + "auxiliary_loss_clip": 0.01235667, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.07070863, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.6789342331958745, + "language_loss": 0.76432645, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6484375, + "step": 781, + "time_per_iteration": 2.5040361881256104 + }, + { + "auxiliary_loss_clip": 0.01245917, + "auxiliary_loss_mlp": 0.01072818, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.07423282, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2171800145032736, + "language_loss": 0.74027473, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76346207, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 782, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.01241991, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.07483578, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 6.219089205177081, + "language_loss": 0.8032757, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82630414, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.671875, + "step": 783, + "time_per_iteration": 2.4866061210632324 + }, + { + "auxiliary_loss_clip": 0.01232605, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.03417742, + "balance_loss_mlp": 1.07062817, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0172272756643816, + "language_loss": 0.81289953, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83582091, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 784, + "time_per_iteration": 2.476659059524536 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.04597473, + "balance_loss_mlp": 1.0683856, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.171254656371271, + "language_loss": 0.8076694, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83078098, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 785, + "time_per_iteration": 2.493598461151123 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0762614, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.444819858404617, + "language_loss": 0.89981294, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92284781, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.65625, + "step": 786, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.012413, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.06742501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.379381752409287, + "language_loss": 0.76639462, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78950763, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.734375, + "step": 787, + "time_per_iteration": 2.447611093521118 + }, + { + "auxiliary_loss_clip": 0.01247236, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.04431772, + "balance_loss_mlp": 1.0765723, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.4642209511959403, + "language_loss": 0.80851126, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83170098, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7109375, + "step": 788, + "time_per_iteration": 2.4679956436157227 + }, + { + "auxiliary_loss_clip": 0.01236983, + "auxiliary_loss_mlp": 0.01074337, + "balance_loss_clip": 1.04551244, + "balance_loss_mlp": 1.07285857, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.2318634793178127, + "language_loss": 0.84819949, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87131274, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.640625, + "step": 789, + "time_per_iteration": 2.4982516765594482 + }, + { + "auxiliary_loss_clip": 0.01242053, + "auxiliary_loss_mlp": 0.01066276, + "balance_loss_clip": 1.04006219, + "balance_loss_mlp": 1.07367456, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.948517450129577, + "language_loss": 0.82196069, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84504396, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6875, + "step": 790, + "time_per_iteration": 2.4380602836608887 + }, + { + "auxiliary_loss_clip": 0.01236299, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.03524029, + "balance_loss_mlp": 1.06857598, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.8806939749630054, + "language_loss": 0.88245451, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90544093, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 791, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01239952, + "auxiliary_loss_mlp": 0.010655, + "balance_loss_clip": 1.03826034, + "balance_loss_mlp": 1.07212687, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.2584516419561464, + "language_loss": 0.90245461, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92550921, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 792, + "time_per_iteration": 2.4627771377563477 + }, + { + "auxiliary_loss_clip": 0.01241845, + "auxiliary_loss_mlp": 0.01074856, + "balance_loss_clip": 1.04874945, + "balance_loss_mlp": 1.07157969, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9386484459236437, + "language_loss": 0.7310667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 793, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01067209, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.07207203, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0117940746735123, + "language_loss": 0.86102074, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88411266, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.703125, + "step": 794, + "time_per_iteration": 2.510611057281494 + }, + { + "auxiliary_loss_clip": 0.0123999, + "auxiliary_loss_mlp": 0.01074174, + "balance_loss_clip": 1.04701805, + "balance_loss_mlp": 1.06925917, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.4118642482115384, + "language_loss": 0.69812739, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72126907, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.7109375, + "step": 795, + "time_per_iteration": 2.500420093536377 + }, + { + "auxiliary_loss_clip": 0.01236981, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.0432204, + "balance_loss_mlp": 1.06999111, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.0479238599532135, + "language_loss": 0.81053579, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 796, + "time_per_iteration": 2.4838409423828125 + }, + { + "auxiliary_loss_clip": 0.0124002, + "auxiliary_loss_mlp": 0.01058331, + "balance_loss_clip": 1.03129458, + "balance_loss_mlp": 1.07190371, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.3456590334750858, + "language_loss": 0.81249642, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83547997, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6796875, + "step": 797, + "time_per_iteration": 2.466343402862549 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.0100279, + "balance_loss_clip": 0.9972828, + "balance_loss_mlp": 1.03672731, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9120921080635288, + "language_loss": 0.64447635, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66572458, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.8515625, + "step": 798, + "time_per_iteration": 3.0081863403320312 + }, + { + "auxiliary_loss_clip": 0.01243937, + "auxiliary_loss_mlp": 0.01070197, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.06894708, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 7.0153313624744005, + "language_loss": 0.90794134, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93108267, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.75, + "step": 799, + "time_per_iteration": 2.4872424602508545 + }, + { + "auxiliary_loss_clip": 0.01242621, + "auxiliary_loss_mlp": 0.01069655, + "balance_loss_clip": 1.04220068, + "balance_loss_mlp": 1.07567, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.1467314479540818, + "language_loss": 0.86701, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89013278, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 800, + "time_per_iteration": 2.477720022201538 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.0106979, + "balance_loss_clip": 1.04362369, + "balance_loss_mlp": 1.07207572, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 7.517902152046504, + "language_loss": 0.84513009, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86826313, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.71875, + "step": 801, + "time_per_iteration": 2.487889528274536 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.07289147, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.97564705550146, + "language_loss": 0.79967415, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82280934, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 802, + "time_per_iteration": 2.6496224403381348 + }, + { + "auxiliary_loss_clip": 0.01238875, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_clip": 1.03963101, + "balance_loss_mlp": 1.07270598, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.8331626885697725, + "language_loss": 0.86420751, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88725173, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 803, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01238315, + "auxiliary_loss_mlp": 0.01061166, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.07398677, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.229653749186784, + "language_loss": 0.85436332, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87735808, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 804, + "time_per_iteration": 2.458303213119507 + }, + { + "auxiliary_loss_clip": 0.01239413, + "auxiliary_loss_mlp": 0.01066878, + "balance_loss_clip": 1.04099822, + "balance_loss_mlp": 1.07286024, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.2509331098011645, + "language_loss": 0.86119306, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88425595, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 805, + "time_per_iteration": 2.5021419525146484 + }, + { + "auxiliary_loss_clip": 0.01235031, + "auxiliary_loss_mlp": 0.01067273, + "balance_loss_clip": 1.04115391, + "balance_loss_mlp": 1.06942892, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8866019303880346, + "language_loss": 0.68034315, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70336622, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.65625, + "step": 806, + "time_per_iteration": 2.4904568195343018 + }, + { + "auxiliary_loss_clip": 0.01235579, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.03847933, + "balance_loss_mlp": 1.07208037, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.221107161276338, + "language_loss": 0.7716608, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79466188, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 807, + "time_per_iteration": 2.498624563217163 + }, + { + "auxiliary_loss_clip": 0.01232532, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.06831741, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.844770488216335, + "language_loss": 0.86509991, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88814163, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.640625, + "step": 808, + "time_per_iteration": 2.444673538208008 + }, + { + "auxiliary_loss_clip": 0.01242847, + "auxiliary_loss_mlp": 0.01070908, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.07261682, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9428867449931826, + "language_loss": 0.90154302, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92468053, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 809, + "time_per_iteration": 5.353702545166016 + }, + { + "auxiliary_loss_clip": 0.01242102, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.07577538, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.12821080633451, + "language_loss": 0.84360719, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86672825, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 810, + "time_per_iteration": 3.8935022354125977 + }, + { + "auxiliary_loss_clip": 0.01240735, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.07189715, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7610993085905569, + "language_loss": 0.80875039, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8318274, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6875, + "step": 811, + "time_per_iteration": 2.5000643730163574 + }, + { + "auxiliary_loss_clip": 0.01232044, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.04991412, + "balance_loss_mlp": 1.06997907, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.0417171226218715, + "language_loss": 0.74768531, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77075845, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.625, + "step": 812, + "time_per_iteration": 2.4853179454803467 + }, + { + "auxiliary_loss_clip": 0.01233917, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.07263327, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8904091966919716, + "language_loss": 0.89845109, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92153537, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 813, + "time_per_iteration": 2.6731016635894775 + }, + { + "auxiliary_loss_clip": 0.01232344, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.07083082, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.028357820963791, + "language_loss": 0.74551463, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76842451, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.6171875, + "step": 814, + "time_per_iteration": 2.509963035583496 + }, + { + "auxiliary_loss_clip": 0.01235531, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.07073569, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.3605733083261464, + "language_loss": 0.83740532, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86043149, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6484375, + "step": 815, + "time_per_iteration": 2.5490894317626953 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.03396082, + "balance_loss_mlp": 1.07326484, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.271155414035229, + "language_loss": 0.90803105, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93103218, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6640625, + "step": 816, + "time_per_iteration": 2.5273053646087646 + }, + { + "auxiliary_loss_clip": 0.01240454, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.0732162, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 3.2321750342473603, + "language_loss": 0.79924619, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.671875, + "step": 817, + "time_per_iteration": 2.5095019340515137 + }, + { + "auxiliary_loss_clip": 0.0123455, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.04864395, + "balance_loss_mlp": 1.07184172, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8264850687392937, + "language_loss": 0.84520394, + "learning_rate": 3.996142453363656e-06, + "loss": 0.86829674, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 818, + "time_per_iteration": 2.5476157665252686 + }, + { + "auxiliary_loss_clip": 0.01243386, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.07401037, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.779535734169796, + "language_loss": 0.75307131, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77617967, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6953125, + "step": 819, + "time_per_iteration": 2.5486624240875244 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.07577193, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.1475545017813853, + "language_loss": 0.85166955, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87468207, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.640625, + "step": 820, + "time_per_iteration": 2.4565298557281494 + }, + { + "auxiliary_loss_clip": 0.0123627, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03602266, + "balance_loss_mlp": 1.07061315, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.902695357085614, + "language_loss": 0.9041872, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92716837, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.65625, + "step": 821, + "time_per_iteration": 2.5412514209747314 + }, + { + "auxiliary_loss_clip": 0.01233424, + "auxiliary_loss_mlp": 0.01073041, + "balance_loss_clip": 1.04773307, + "balance_loss_mlp": 1.06951392, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0531707528144274, + "language_loss": 0.8941884, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91725308, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.640625, + "step": 822, + "time_per_iteration": 2.5171031951904297 + }, + { + "auxiliary_loss_clip": 0.01237258, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.03295374, + "balance_loss_mlp": 1.0742538, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.060390808888412, + "language_loss": 0.67537785, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69834983, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 823, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01120581, + "auxiliary_loss_mlp": 0.01008389, + "balance_loss_clip": 1.00323892, + "balance_loss_mlp": 1.04174662, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3777513990451415, + "language_loss": 0.62206292, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64335263, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.7890625, + "step": 824, + "time_per_iteration": 3.13708758354187 + }, + { + "auxiliary_loss_clip": 0.01240025, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.07293963, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.021638376413324, + "language_loss": 0.90364408, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92674464, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.671875, + "step": 825, + "time_per_iteration": 2.519487142562866 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01064311, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.0713625, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 23.06748840114486, + "language_loss": 0.66790086, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69091535, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.65625, + "step": 826, + "time_per_iteration": 2.486837387084961 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.07166433, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.4656671498779845, + "language_loss": 0.78386623, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80685055, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.640625, + "step": 827, + "time_per_iteration": 2.517092704772949 + }, + { + "auxiliary_loss_clip": 0.0124052, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.07333767, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.8940457048653916, + "language_loss": 0.78592682, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80905491, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.671875, + "step": 828, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01227721, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.06777728, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.330577425067274, + "language_loss": 0.83493364, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 829, + "time_per_iteration": 2.5744268894195557 + }, + { + "auxiliary_loss_clip": 0.01235678, + "auxiliary_loss_mlp": 0.01073434, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.07021666, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.2375926111489743, + "language_loss": 0.75055873, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.65625, + "step": 830, + "time_per_iteration": 2.5045461654663086 + }, + { + "auxiliary_loss_clip": 0.01233457, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.06966341, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.962979792887244, + "language_loss": 0.79379636, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81679052, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 831, + "time_per_iteration": 2.5924267768859863 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01070014, + "balance_loss_clip": 1.04487276, + "balance_loss_mlp": 1.07213569, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.758266217871517, + "language_loss": 0.91538632, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93846321, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.65625, + "step": 832, + "time_per_iteration": 2.653150796890259 + }, + { + "auxiliary_loss_clip": 0.01230534, + "auxiliary_loss_mlp": 0.01081981, + "balance_loss_clip": 1.05747163, + "balance_loss_mlp": 1.07053018, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9700093948003867, + "language_loss": 0.83139837, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85452354, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 833, + "time_per_iteration": 2.73848819732666 + }, + { + "auxiliary_loss_clip": 0.0123523, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.06913459, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.433665596415918, + "language_loss": 0.8254565, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84839165, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.65625, + "step": 834, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01236789, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.07138014, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.099554255469436, + "language_loss": 0.91758966, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94059587, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 835, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.0123437, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.06699944, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.4903656252358735, + "language_loss": 0.76346481, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78652561, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.671875, + "step": 836, + "time_per_iteration": 2.4839258193969727 + }, + { + "auxiliary_loss_clip": 0.01229978, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.07100809, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1380784235063066, + "language_loss": 0.8360337, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85906136, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5859375, + "step": 837, + "time_per_iteration": 2.5140485763549805 + }, + { + "auxiliary_loss_clip": 0.01233502, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.07245386, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.225982034212064, + "language_loss": 0.73137468, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75436556, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 838, + "time_per_iteration": 2.5128419399261475 + }, + { + "auxiliary_loss_clip": 0.01229023, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.06636167, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.886796600099776, + "language_loss": 0.83328462, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85625362, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 839, + "time_per_iteration": 2.499415874481201 + }, + { + "auxiliary_loss_clip": 0.01228207, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_clip": 1.04128349, + "balance_loss_mlp": 1.06866539, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.2056506497336765, + "language_loss": 0.85777193, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8807205, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 840, + "time_per_iteration": 2.522038698196411 + }, + { + "auxiliary_loss_clip": 0.01235877, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.07246661, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.034102412822674, + "language_loss": 0.94658732, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96958393, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 841, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01234454, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.07130527, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.030819255438432, + "language_loss": 0.77387047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79687953, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6328125, + "step": 842, + "time_per_iteration": 2.6253628730773926 + }, + { + "auxiliary_loss_clip": 0.01238804, + "auxiliary_loss_mlp": 0.01067813, + "balance_loss_clip": 1.041659, + "balance_loss_mlp": 1.07278991, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.283727909175907, + "language_loss": 0.78014457, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80321074, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6640625, + "step": 843, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.01237695, + "auxiliary_loss_mlp": 0.01061566, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.07266212, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.249210505837228, + "language_loss": 0.82952344, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85251611, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.6484375, + "step": 844, + "time_per_iteration": 2.6476500034332275 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.03935087, + "balance_loss_mlp": 1.06871867, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.3236550986537368, + "language_loss": 0.76042783, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78337395, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 845, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01075672, + "balance_loss_clip": 1.04924285, + "balance_loss_mlp": 1.06694174, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.2528566199281905, + "language_loss": 0.87468004, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89773357, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 846, + "time_per_iteration": 2.5271859169006348 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.04521692, + "balance_loss_mlp": 1.06982791, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.95159927266484, + "language_loss": 0.87571466, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89872456, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 847, + "time_per_iteration": 2.4566030502319336 + }, + { + "auxiliary_loss_clip": 0.01226009, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.04489946, + "balance_loss_mlp": 1.06883907, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.141846591022022, + "language_loss": 0.81706643, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84003675, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5703125, + "step": 848, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0123182, + "auxiliary_loss_mlp": 0.01077851, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.07167053, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.898868752622741, + "language_loss": 0.87266076, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89575738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 849, + "time_per_iteration": 2.5472936630249023 + }, + { + "auxiliary_loss_clip": 0.0122487, + "auxiliary_loss_mlp": 0.01062562, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.06569946, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8637209623848903, + "language_loss": 0.83340889, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85628319, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.59375, + "step": 850, + "time_per_iteration": 2.493814468383789 + }, + { + "auxiliary_loss_clip": 0.01229016, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.03847528, + "balance_loss_mlp": 1.06816506, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.1400408414194154, + "language_loss": 0.6501807, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.609375, + "step": 851, + "time_per_iteration": 5.443026065826416 + }, + { + "auxiliary_loss_clip": 0.01228781, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.04424942, + "balance_loss_mlp": 1.0674876, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2562645326336686, + "language_loss": 0.8376134, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86061573, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 852, + "time_per_iteration": 2.4753623008728027 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.06879044, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9405819970113303, + "language_loss": 0.80252314, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82552135, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 853, + "time_per_iteration": 2.5048112869262695 + }, + { + "auxiliary_loss_clip": 0.01226539, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.03432584, + "balance_loss_mlp": 1.06710184, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8237562231360178, + "language_loss": 0.75846469, + "learning_rate": 3.995223022193999e-06, + "loss": 0.7813375, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 854, + "time_per_iteration": 2.53165602684021 + }, + { + "auxiliary_loss_clip": 0.01233418, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_clip": 1.03678393, + "balance_loss_mlp": 1.07139039, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.718422527893707, + "language_loss": 0.81173462, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83470446, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 855, + "time_per_iteration": 2.5610744953155518 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01020682, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.03902698, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0071030268205712, + "language_loss": 0.65609074, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67743033, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.7421875, + "step": 856, + "time_per_iteration": 3.0546581745147705 + }, + { + "auxiliary_loss_clip": 0.01224884, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.02823043, + "balance_loss_mlp": 1.06811357, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8111088050205955, + "language_loss": 0.76996124, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79274821, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5625, + "step": 857, + "time_per_iteration": 2.6051554679870605 + }, + { + "auxiliary_loss_clip": 0.01229705, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03130889, + "balance_loss_mlp": 1.06846082, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 3.7937823779894377, + "language_loss": 0.88893878, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91181171, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6171875, + "step": 858, + "time_per_iteration": 2.4517769813537598 + }, + { + "auxiliary_loss_clip": 0.01228685, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.03000832, + "balance_loss_mlp": 1.06902003, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.9531750101692102, + "language_loss": 0.75199753, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77484941, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 859, + "time_per_iteration": 2.5090014934539795 + }, + { + "auxiliary_loss_clip": 0.01237239, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.04280758, + "balance_loss_mlp": 1.06980002, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.092452223155828, + "language_loss": 0.90812773, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93120927, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.671875, + "step": 860, + "time_per_iteration": 2.437220335006714 + }, + { + "auxiliary_loss_clip": 0.01231057, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.03568769, + "balance_loss_mlp": 1.0717634, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.9189860758016508, + "language_loss": 0.82252973, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8454473, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.59375, + "step": 861, + "time_per_iteration": 2.50883412361145 + }, + { + "auxiliary_loss_clip": 0.01233216, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.03177071, + "balance_loss_mlp": 1.0704143, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.0352629197197762, + "language_loss": 0.78607392, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.625, + "step": 862, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01229413, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.07291067, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9841587361763113, + "language_loss": 0.88999134, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91296881, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5625, + "step": 863, + "time_per_iteration": 2.506289005279541 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.07635331, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.003999649515418, + "language_loss": 0.7575798, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.625, + "step": 864, + "time_per_iteration": 2.515944480895996 + }, + { + "auxiliary_loss_clip": 0.01236545, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.04228067, + "balance_loss_mlp": 1.07355332, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9298630836237705, + "language_loss": 0.7919569, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81501746, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6328125, + "step": 865, + "time_per_iteration": 2.485499620437622 + }, + { + "auxiliary_loss_clip": 0.0123268, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.04144871, + "balance_loss_mlp": 1.07079291, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.366131428952597, + "language_loss": 0.85700798, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88000321, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 866, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01242589, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_clip": 1.03910398, + "balance_loss_mlp": 1.0804987, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.535209572965093, + "language_loss": 0.8680315, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89111662, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 867, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01231644, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.05021977, + "balance_loss_mlp": 1.07513499, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.64188364663517, + "language_loss": 0.63562089, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65867579, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.5625, + "step": 868, + "time_per_iteration": 2.567958354949951 + }, + { + "auxiliary_loss_clip": 0.01236968, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.07263327, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1448269109564198, + "language_loss": 0.83076257, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85379148, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.6484375, + "step": 869, + "time_per_iteration": 2.5021841526031494 + }, + { + "auxiliary_loss_clip": 0.01237154, + "auxiliary_loss_mlp": 0.01057742, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.07245827, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.352948725027126, + "language_loss": 0.87544227, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89839119, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6484375, + "step": 870, + "time_per_iteration": 2.459662437438965 + }, + { + "auxiliary_loss_clip": 0.01238457, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_clip": 1.04135191, + "balance_loss_mlp": 1.07536197, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9212028950510787, + "language_loss": 0.80554998, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82860637, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6328125, + "step": 871, + "time_per_iteration": 2.4701170921325684 + }, + { + "auxiliary_loss_clip": 0.01234905, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.07576704, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.5975290841395262, + "language_loss": 0.81374049, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8367365, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.59375, + "step": 872, + "time_per_iteration": 2.4886369705200195 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.0101489, + "balance_loss_clip": 1.00952566, + "balance_loss_mlp": 1.03955865, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8879269166117758, + "language_loss": 0.61579192, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63705552, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.71875, + "step": 873, + "time_per_iteration": 2.9913430213928223 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01058247, + "balance_loss_clip": 1.03245032, + "balance_loss_mlp": 1.07107997, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8426182555123698, + "language_loss": 0.88426232, + "learning_rate": 3.994669855111643e-06, + "loss": 0.90716141, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 874, + "time_per_iteration": 2.4794461727142334 + }, + { + "auxiliary_loss_clip": 0.0123222, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.06908488, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 2.2494767595307628, + "language_loss": 0.74779439, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77073956, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 875, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.01228414, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.06905699, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.052141253618648, + "language_loss": 0.92836702, + "learning_rate": 3.99461287422531e-06, + "loss": 0.951262, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.59375, + "step": 876, + "time_per_iteration": 2.535587787628174 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01009036, + "balance_loss_clip": 1.00379074, + "balance_loss_mlp": 1.03698087, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.854570032578524, + "language_loss": 0.62934959, + "learning_rate": 3.994584270327722e-06, + "loss": 0.6505053, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.6953125, + "step": 877, + "time_per_iteration": 3.094581127166748 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.06975055, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.154366240232031, + "language_loss": 0.85691291, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 878, + "time_per_iteration": 2.5052285194396973 + }, + { + "auxiliary_loss_clip": 0.01232133, + "auxiliary_loss_mlp": 0.01063559, + "balance_loss_clip": 1.03754723, + "balance_loss_mlp": 1.06974411, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.0833089409086942, + "language_loss": 0.82790506, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85086197, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.625, + "step": 879, + "time_per_iteration": 2.564312219619751 + }, + { + "auxiliary_loss_clip": 0.01227867, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04011488, + "balance_loss_mlp": 1.06966615, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 4.271066320440391, + "language_loss": 0.84404933, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86699677, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 880, + "time_per_iteration": 2.4854133129119873 + }, + { + "auxiliary_loss_clip": 0.01233797, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.04360688, + "balance_loss_mlp": 1.07206059, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 3.515636761469604, + "language_loss": 0.87156737, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89460361, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 881, + "time_per_iteration": 2.476846933364868 + }, + { + "auxiliary_loss_clip": 0.01228751, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.06813371, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9345214626214409, + "language_loss": 0.87682849, + "learning_rate": 3.994440116339046e-06, + "loss": 0.89977539, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.609375, + "step": 882, + "time_per_iteration": 2.6449031829833984 + }, + { + "auxiliary_loss_clip": 0.01233714, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_clip": 1.03825057, + "balance_loss_mlp": 1.07030129, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.7245054008776814, + "language_loss": 0.68869275, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71168661, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6328125, + "step": 883, + "time_per_iteration": 2.620363235473633 + }, + { + "auxiliary_loss_clip": 0.01225388, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.06937146, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.9628498458506696, + "language_loss": 0.75887203, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78173113, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5546875, + "step": 884, + "time_per_iteration": 2.4948067665100098 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.06921601, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.00306560312032, + "language_loss": 0.85323638, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 885, + "time_per_iteration": 2.5159530639648438 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01068973, + "balance_loss_clip": 1.04205549, + "balance_loss_mlp": 1.06673646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6316893825734344, + "language_loss": 0.85726082, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88023585, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6171875, + "step": 886, + "time_per_iteration": 2.4650700092315674 + }, + { + "auxiliary_loss_clip": 0.01226585, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.06944001, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1517488326805214, + "language_loss": 0.89229804, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91522843, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5703125, + "step": 887, + "time_per_iteration": 2.5020337104797363 + }, + { + "auxiliary_loss_clip": 0.01227687, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.04270935, + "balance_loss_mlp": 1.06604195, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.2836036404275593, + "language_loss": 0.75076836, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77375484, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6171875, + "step": 888, + "time_per_iteration": 2.5055694580078125 + }, + { + "auxiliary_loss_clip": 0.0123028, + "auxiliary_loss_mlp": 0.0107639, + "balance_loss_clip": 1.0480895, + "balance_loss_mlp": 1.07113457, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.032914331295681, + "language_loss": 0.88330352, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90637028, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.59375, + "step": 889, + "time_per_iteration": 2.5147650241851807 + }, + { + "auxiliary_loss_clip": 0.01222875, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.02748489, + "balance_loss_mlp": 1.06732821, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9726085703824752, + "language_loss": 0.88269985, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90546036, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5546875, + "step": 890, + "time_per_iteration": 2.490300416946411 + }, + { + "auxiliary_loss_clip": 0.01225662, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.04612982, + "balance_loss_mlp": 1.06690812, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.320271972022273, + "language_loss": 0.93251556, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95548671, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 891, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.06682086, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.238230674372026, + "language_loss": 0.71759057, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74046671, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5859375, + "step": 892, + "time_per_iteration": 2.5544779300689697 + }, + { + "auxiliary_loss_clip": 0.01229119, + "auxiliary_loss_mlp": 0.01067529, + "balance_loss_clip": 1.0421617, + "balance_loss_mlp": 1.06946719, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.3204520758070037, + "language_loss": 0.82304287, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84600937, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6015625, + "step": 893, + "time_per_iteration": 5.3903117179870605 + }, + { + "auxiliary_loss_clip": 0.0122945, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.04430699, + "balance_loss_mlp": 1.0679965, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.3808217776212937, + "language_loss": 0.81695569, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83995366, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.609375, + "step": 894, + "time_per_iteration": 2.52809476852417 + }, + { + "auxiliary_loss_clip": 0.01227471, + "auxiliary_loss_mlp": 0.01065449, + "balance_loss_clip": 1.03915119, + "balance_loss_mlp": 1.06881404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.5337894710206093, + "language_loss": 0.76043701, + "learning_rate": 3.994056467679221e-06, + "loss": 0.7833662, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 895, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.01238307, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03022122, + "balance_loss_mlp": 1.07260597, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.2065839001211156, + "language_loss": 0.86456096, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88751751, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.65625, + "step": 896, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01231325, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.03495908, + "balance_loss_mlp": 1.06809413, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.1680285530564274, + "language_loss": 0.87949234, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90243232, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6328125, + "step": 897, + "time_per_iteration": 2.457918167114258 + }, + { + "auxiliary_loss_clip": 0.0122574, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.06723523, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7359050724031848, + "language_loss": 0.9035244, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9264195, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.5859375, + "step": 898, + "time_per_iteration": 2.4593143463134766 + }, + { + "auxiliary_loss_clip": 0.01234899, + "auxiliary_loss_mlp": 0.01084595, + "balance_loss_clip": 1.05808282, + "balance_loss_mlp": 1.07024622, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.958355519485596, + "language_loss": 0.91756964, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94076455, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6484375, + "step": 899, + "time_per_iteration": 2.4461729526519775 + }, + { + "auxiliary_loss_clip": 0.01225208, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.03964233, + "balance_loss_mlp": 1.06601286, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.6493739136310643, + "language_loss": 0.75594276, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77884829, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 900, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.01231903, + "auxiliary_loss_mlp": 0.01059763, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.06860638, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.2496787705299908, + "language_loss": 0.7377668, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76068342, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.6328125, + "step": 901, + "time_per_iteration": 2.49638032913208 + }, + { + "auxiliary_loss_clip": 0.01221671, + "auxiliary_loss_mlp": 0.01074944, + "balance_loss_clip": 1.04982698, + "balance_loss_mlp": 1.06662059, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.0553503619333586, + "language_loss": 0.85004938, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87301552, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 902, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01226177, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.06769705, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.0002475654879195, + "language_loss": 0.8655951, + "learning_rate": 3.993814024394569e-06, + "loss": 0.8884868, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 903, + "time_per_iteration": 2.522193670272827 + }, + { + "auxiliary_loss_clip": 0.01227512, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.06904316, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.4298091072226855, + "language_loss": 0.74835998, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77125704, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.578125, + "step": 904, + "time_per_iteration": 2.456969976425171 + }, + { + "auxiliary_loss_clip": 0.0123038, + "auxiliary_loss_mlp": 0.01073252, + "balance_loss_clip": 1.04685879, + "balance_loss_mlp": 1.06905615, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.0843949675352356, + "language_loss": 0.85750329, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8805396, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.609375, + "step": 905, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01227222, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.05317712, + "balance_loss_mlp": 1.07247257, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7937911991915148, + "language_loss": 0.74028552, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76334012, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 906, + "time_per_iteration": 2.468331813812256 + }, + { + "auxiliary_loss_clip": 0.01228766, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.03909731, + "balance_loss_mlp": 1.06858826, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.220044948377472, + "language_loss": 0.87410975, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89705634, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6015625, + "step": 907, + "time_per_iteration": 2.5177390575408936 + }, + { + "auxiliary_loss_clip": 0.01227557, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.04521942, + "balance_loss_mlp": 1.07002556, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.8689281211501179, + "language_loss": 0.86915505, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89214909, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.578125, + "step": 908, + "time_per_iteration": 2.45135498046875 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.06842148, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.409525813232516, + "language_loss": 0.89454836, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91748714, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 909, + "time_per_iteration": 2.4702625274658203 + }, + { + "auxiliary_loss_clip": 0.01231345, + "auxiliary_loss_mlp": 0.01075786, + "balance_loss_clip": 1.04773629, + "balance_loss_mlp": 1.06930447, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.4022545211155593, + "language_loss": 0.70942473, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73249602, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.625, + "step": 910, + "time_per_iteration": 2.4530797004699707 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01055797, + "balance_loss_clip": 1.03002357, + "balance_loss_mlp": 1.06815219, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.0100188286094745, + "language_loss": 0.8349818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85778737, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5625, + "step": 911, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.01224017, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.06649613, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.746196883211308, + "language_loss": 0.76096344, + "learning_rate": 3.993535491899736e-06, + "loss": 0.7839244, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 912, + "time_per_iteration": 2.4651522636413574 + }, + { + "auxiliary_loss_clip": 0.01219912, + "auxiliary_loss_mlp": 0.01052416, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.06664968, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.385296939765248, + "language_loss": 0.82667339, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84939671, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 913, + "time_per_iteration": 2.475384473800659 + }, + { + "auxiliary_loss_clip": 0.01224168, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.07065797, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.227172084037845, + "language_loss": 0.83470452, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85756505, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 914, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.03324902, + "balance_loss_mlp": 1.07264161, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.897688985464872, + "language_loss": 0.9010309, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92390066, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5625, + "step": 915, + "time_per_iteration": 2.492981433868408 + }, + { + "auxiliary_loss_clip": 0.01225584, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.03046489, + "balance_loss_mlp": 1.0708915, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.870109983937874, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91836905, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 916, + "time_per_iteration": 2.4621188640594482 + }, + { + "auxiliary_loss_clip": 0.01228011, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.04593801, + "balance_loss_mlp": 1.06942379, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7933741103180343, + "language_loss": 0.80085957, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386243, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 917, + "time_per_iteration": 2.49455189704895 + }, + { + "auxiliary_loss_clip": 0.01225592, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.06678224, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9216560267302982, + "language_loss": 0.79673612, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81957722, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 918, + "time_per_iteration": 2.504734516143799 + }, + { + "auxiliary_loss_clip": 0.01223712, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.06658053, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9394116717498289, + "language_loss": 0.89132315, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91415823, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5703125, + "step": 919, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01219042, + "auxiliary_loss_mlp": 0.01068553, + "balance_loss_clip": 1.0427916, + "balance_loss_mlp": 1.06515777, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.688355226699252, + "language_loss": 0.87421197, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 920, + "time_per_iteration": 2.536914348602295 + }, + { + "auxiliary_loss_clip": 0.01223828, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.06937671, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1255302161497704, + "language_loss": 0.65921712, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68208569, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.546875, + "step": 921, + "time_per_iteration": 2.643416166305542 + }, + { + "auxiliary_loss_clip": 0.01229793, + "auxiliary_loss_mlp": 0.0106877, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.0698204, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.143682946402907, + "language_loss": 0.71841472, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74140036, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.6015625, + "step": 922, + "time_per_iteration": 2.4544074535369873 + }, + { + "auxiliary_loss_clip": 0.0122536, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.0669136, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.766492717488127, + "language_loss": 0.82548857, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84844404, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 923, + "time_per_iteration": 2.490915536880493 + }, + { + "auxiliary_loss_clip": 0.01221243, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.06429458, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2095756655687397, + "language_loss": 0.78808558, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81097853, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5625, + "step": 924, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.0121918, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.06480467, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9513803878946447, + "language_loss": 1.02250028, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04528582, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 925, + "time_per_iteration": 2.5296268463134766 + }, + { + "auxiliary_loss_clip": 0.01220429, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02975261, + "balance_loss_mlp": 1.0634799, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.3756260245044687, + "language_loss": 0.80808276, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83084333, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 926, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01229405, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.06743848, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.4713559623940924, + "language_loss": 0.73378903, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75676566, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 927, + "time_per_iteration": 2.5607478618621826 + }, + { + "auxiliary_loss_clip": 0.01103967, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.00837731, + "balance_loss_mlp": 1.03639269, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7814837823676635, + "language_loss": 0.5989722, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62015712, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.67578125, + "step": 928, + "time_per_iteration": 3.0945305824279785 + }, + { + "auxiliary_loss_clip": 0.01223562, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.04035151, + "balance_loss_mlp": 1.06729245, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.3037954576101587, + "language_loss": 0.95011377, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97301698, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5625, + "step": 929, + "time_per_iteration": 2.527270555496216 + }, + { + "auxiliary_loss_clip": 0.01221186, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.06494856, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1540114832188553, + "language_loss": 0.71827871, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74116725, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.5625, + "step": 930, + "time_per_iteration": 2.57513689994812 + }, + { + "auxiliary_loss_clip": 0.01227654, + "auxiliary_loss_mlp": 0.01062398, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.06905401, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.336481182624628, + "language_loss": 0.85333288, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87623346, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5859375, + "step": 931, + "time_per_iteration": 2.459167957305908 + }, + { + "auxiliary_loss_clip": 0.01224553, + "auxiliary_loss_mlp": 0.01072004, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.06556344, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.9723738142749898, + "language_loss": 0.83577204, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85873753, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.59375, + "step": 932, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.01223225, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.04012322, + "balance_loss_mlp": 1.06712675, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.411257667891357, + "language_loss": 0.73405433, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5625, + "step": 933, + "time_per_iteration": 2.526521682739258 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.04328358, + "balance_loss_mlp": 1.06432819, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.577929883809357, + "language_loss": 0.86850882, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89141059, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5546875, + "step": 934, + "time_per_iteration": 5.338034391403198 + }, + { + "auxiliary_loss_clip": 0.01220003, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.06842983, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2060919587088965, + "language_loss": 0.80243224, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82532918, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 935, + "time_per_iteration": 3.8198087215423584 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01008984, + "balance_loss_clip": 1.00321388, + "balance_loss_mlp": 1.02876139, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8225714537835027, + "language_loss": 0.69179416, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71282923, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.65625, + "step": 936, + "time_per_iteration": 2.9585764408111572 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.01067113, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.06387568, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.5168182860703237, + "language_loss": 0.75900578, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78188324, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 937, + "time_per_iteration": 2.4891855716705322 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01002728, + "balance_loss_clip": 0.99738711, + "balance_loss_mlp": 1.02642298, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8631606334327763, + "language_loss": 0.64287508, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66381979, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.65625, + "step": 938, + "time_per_iteration": 3.0239782333374023 + }, + { + "auxiliary_loss_clip": 0.01226335, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.04487348, + "balance_loss_mlp": 1.06571174, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.570077538128457, + "language_loss": 0.7903074, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81329048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 939, + "time_per_iteration": 2.494706630706787 + }, + { + "auxiliary_loss_clip": 0.012214, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03868759, + "balance_loss_mlp": 1.0669229, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.950609958048397, + "language_loss": 0.73893893, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76179242, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 940, + "time_per_iteration": 2.5279061794281006 + }, + { + "auxiliary_loss_clip": 0.01220257, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_clip": 1.03776574, + "balance_loss_mlp": 1.06722569, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.9142676693922898, + "language_loss": 0.70475829, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72760499, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 941, + "time_per_iteration": 2.551604747772217 + }, + { + "auxiliary_loss_clip": 0.01218348, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.06624675, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.3045436850665917, + "language_loss": 0.80928791, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.515625, + "step": 942, + "time_per_iteration": 2.515646457672119 + }, + { + "auxiliary_loss_clip": 0.01214197, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.062042, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.7900678467193205, + "language_loss": 0.88067353, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.9033941, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 943, + "time_per_iteration": 2.674614191055298 + }, + { + "auxiliary_loss_clip": 0.01220399, + "auxiliary_loss_mlp": 0.01056577, + "balance_loss_clip": 1.03182912, + "balance_loss_mlp": 1.06757212, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.6837069047913924, + "language_loss": 0.75092185, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77369165, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5234375, + "step": 944, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01215674, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.0385294, + "balance_loss_mlp": 1.06267428, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7462690351912153, + "language_loss": 0.79321784, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8160013, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 945, + "time_per_iteration": 2.695613384246826 + }, + { + "auxiliary_loss_clip": 0.01218347, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.03628159, + "balance_loss_mlp": 1.06407309, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.1794845223078556, + "language_loss": 0.82465631, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84745914, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 946, + "time_per_iteration": 2.6081790924072266 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.06615055, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.7693395657309297, + "language_loss": 0.7904911, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8133198, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5703125, + "step": 947, + "time_per_iteration": 2.460472822189331 + }, + { + "auxiliary_loss_clip": 0.01227462, + "auxiliary_loss_mlp": 0.01065027, + "balance_loss_clip": 1.03890848, + "balance_loss_mlp": 1.06883287, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 7.046260534289203, + "language_loss": 0.85772789, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88065279, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 948, + "time_per_iteration": 2.4560892581939697 + }, + { + "auxiliary_loss_clip": 0.01217019, + "auxiliary_loss_mlp": 0.01060985, + "balance_loss_clip": 1.03374553, + "balance_loss_mlp": 1.06329989, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.8055084405958775, + "language_loss": 0.87044799, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89322805, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5390625, + "step": 949, + "time_per_iteration": 2.4843316078186035 + }, + { + "auxiliary_loss_clip": 0.01212611, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.06284809, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.230679935648155, + "language_loss": 0.79035759, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81314665, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4921875, + "step": 950, + "time_per_iteration": 2.468172311782837 + }, + { + "auxiliary_loss_clip": 0.01221984, + "auxiliary_loss_mlp": 0.01068601, + "balance_loss_clip": 1.04365039, + "balance_loss_mlp": 1.06574106, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0871877141587682, + "language_loss": 0.8244521, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 951, + "time_per_iteration": 2.5418505668640137 + }, + { + "auxiliary_loss_clip": 0.01215404, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.06129527, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.5400916768099426, + "language_loss": 0.86685216, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88963258, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5390625, + "step": 952, + "time_per_iteration": 2.513356924057007 + }, + { + "auxiliary_loss_clip": 0.0122001, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.03415656, + "balance_loss_mlp": 1.06145215, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.725154467975805, + "language_loss": 0.79043579, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81326544, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5859375, + "step": 953, + "time_per_iteration": 2.490940570831299 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.03592086, + "balance_loss_mlp": 1.06757712, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2937199779067106, + "language_loss": 0.87086606, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89373398, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5625, + "step": 954, + "time_per_iteration": 2.495039701461792 + }, + { + "auxiliary_loss_clip": 0.01221375, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_clip": 1.03707159, + "balance_loss_mlp": 1.06446028, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3514674671771933, + "language_loss": 0.87789929, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90073651, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 955, + "time_per_iteration": 2.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.06217909, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.7193659196918576, + "language_loss": 0.89682388, + "learning_rate": 3.992085650224914e-06, + "loss": 0.919631, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 956, + "time_per_iteration": 2.43306565284729 + }, + { + "auxiliary_loss_clip": 0.01212174, + "auxiliary_loss_mlp": 0.0105844, + "balance_loss_clip": 1.03232098, + "balance_loss_mlp": 1.06344521, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7667772588634594, + "language_loss": 0.75335747, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77606356, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.484375, + "step": 957, + "time_per_iteration": 2.469240665435791 + }, + { + "auxiliary_loss_clip": 0.01218166, + "auxiliary_loss_mlp": 0.01075955, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.06214452, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8974624224625587, + "language_loss": 0.79871029, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82165146, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5625, + "step": 958, + "time_per_iteration": 2.5016849040985107 + }, + { + "auxiliary_loss_clip": 0.01214009, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.06024444, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5702669091422234, + "language_loss": 0.88410264, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90686285, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.5390625, + "step": 959, + "time_per_iteration": 2.4830191135406494 + }, + { + "auxiliary_loss_clip": 0.01211651, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.0626018, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.6997220185951347, + "language_loss": 0.78556621, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083173, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4921875, + "step": 960, + "time_per_iteration": 2.569218397140503 + }, + { + "auxiliary_loss_clip": 0.01217172, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.06168103, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 4.159271492638429, + "language_loss": 0.932491, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95529813, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5546875, + "step": 961, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01215042, + "auxiliary_loss_mlp": 0.01070899, + "balance_loss_clip": 1.04411268, + "balance_loss_mlp": 1.06039667, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.532017623976099, + "language_loss": 0.6822986, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70515805, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.546875, + "step": 962, + "time_per_iteration": 2.544498920440674 + }, + { + "auxiliary_loss_clip": 0.01214012, + "auxiliary_loss_mlp": 0.01068596, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.06268489, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.445305128304827, + "language_loss": 0.88187808, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90470415, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.515625, + "step": 963, + "time_per_iteration": 2.459487199783325 + }, + { + "auxiliary_loss_clip": 0.01222623, + "auxiliary_loss_mlp": 0.01058866, + "balance_loss_clip": 1.03337944, + "balance_loss_mlp": 1.06633568, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.5656796350524473, + "language_loss": 0.84858835, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87140322, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 964, + "time_per_iteration": 2.5268235206604004 + }, + { + "auxiliary_loss_clip": 0.01216658, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_clip": 1.04157782, + "balance_loss_mlp": 1.06309247, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 2.846103019544017, + "language_loss": 0.77748007, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80032492, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5390625, + "step": 965, + "time_per_iteration": 2.4572315216064453 + }, + { + "auxiliary_loss_clip": 0.01211478, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.04424393, + "balance_loss_mlp": 1.0614084, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4479010977704463, + "language_loss": 0.80922461, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83202475, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5, + "step": 966, + "time_per_iteration": 2.4682776927948 + }, + { + "auxiliary_loss_clip": 0.01212307, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.06173599, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8643875206872442, + "language_loss": 0.76291096, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78565276, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.5, + "step": 967, + "time_per_iteration": 2.453474521636963 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.02152586, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7926144837125159, + "language_loss": 0.57362092, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59487474, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.05102539, + "router_z_loss_mlp": 0.6328125, + "step": 968, + "time_per_iteration": 2.994419574737549 + }, + { + "auxiliary_loss_clip": 0.01218807, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02865148, + "balance_loss_mlp": 1.06574845, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.057389892616485, + "language_loss": 0.82289147, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84563303, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 969, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01217673, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.03105259, + "balance_loss_mlp": 1.06392384, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1897875503845725, + "language_loss": 0.780442, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 970, + "time_per_iteration": 2.5632171630859375 + }, + { + "auxiliary_loss_clip": 0.01216631, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_clip": 1.02809155, + "balance_loss_mlp": 1.06188202, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6802242915962, + "language_loss": 0.92492616, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94764245, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 971, + "time_per_iteration": 2.4642531871795654 + }, + { + "auxiliary_loss_clip": 0.01210603, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.03439212, + "balance_loss_mlp": 1.05865097, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 3.0470884327064276, + "language_loss": 0.86133701, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88404, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 972, + "time_per_iteration": 2.5298526287078857 + }, + { + "auxiliary_loss_clip": 0.01212752, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_clip": 1.04038596, + "balance_loss_mlp": 1.0636549, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0754734138997906, + "language_loss": 0.87340444, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89617872, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4921875, + "step": 973, + "time_per_iteration": 2.5198311805725098 + }, + { + "auxiliary_loss_clip": 0.01213937, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_clip": 1.04070425, + "balance_loss_mlp": 1.06140256, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.2539468590332707, + "language_loss": 0.74868345, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77149546, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5234375, + "step": 974, + "time_per_iteration": 2.465268850326538 + }, + { + "auxiliary_loss_clip": 0.0121359, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.03308713, + "balance_loss_mlp": 1.06260133, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.7891188847385684, + "language_loss": 0.76707923, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78980577, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 975, + "time_per_iteration": 2.633850336074829 + }, + { + "auxiliary_loss_clip": 0.01216778, + "auxiliary_loss_mlp": 0.01068456, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.0621978, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0981769673049326, + "language_loss": 0.76878488, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79163718, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 976, + "time_per_iteration": 6.8309245109558105 + }, + { + "auxiliary_loss_clip": 0.01210296, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.0585494, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8109666318996334, + "language_loss": 0.87465948, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89737761, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 977, + "time_per_iteration": 2.5693395137786865 + }, + { + "auxiliary_loss_clip": 0.01213396, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.06246471, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.7886661734827753, + "language_loss": 0.79517525, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5078125, + "step": 978, + "time_per_iteration": 2.51609206199646 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.04339027, + "balance_loss_mlp": 1.06304932, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.6270410794651102, + "language_loss": 0.80902123, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.546875, + "step": 979, + "time_per_iteration": 2.527127504348755 + }, + { + "auxiliary_loss_clip": 0.01085971, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.0044651, + "balance_loss_mlp": 1.02304745, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.94528472512207, + "language_loss": 0.59059429, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61154944, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.62890625, + "step": 980, + "time_per_iteration": 2.9545915126800537 + }, + { + "auxiliary_loss_clip": 0.01210703, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.03747201, + "balance_loss_mlp": 1.0622623, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3915266710240917, + "language_loss": 0.86397457, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88672185, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.484375, + "step": 981, + "time_per_iteration": 2.4726293087005615 + }, + { + "auxiliary_loss_clip": 0.01212695, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.06214404, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 1.9485203495729437, + "language_loss": 0.79623365, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81893563, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.5078125, + "step": 982, + "time_per_iteration": 2.5271458625793457 + }, + { + "auxiliary_loss_clip": 0.01219179, + "auxiliary_loss_mlp": 0.01060762, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.06248748, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.5320957946125437, + "language_loss": 0.84376037, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86655974, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 983, + "time_per_iteration": 2.526364803314209 + }, + { + "auxiliary_loss_clip": 0.01212847, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_clip": 1.04361129, + "balance_loss_mlp": 1.06317604, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8446015864025267, + "language_loss": 0.84607553, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86887848, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.4921875, + "step": 984, + "time_per_iteration": 2.456803321838379 + }, + { + "auxiliary_loss_clip": 0.01211466, + "auxiliary_loss_mlp": 0.01059154, + "balance_loss_clip": 1.03551483, + "balance_loss_mlp": 1.06338882, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.3276500524021495, + "language_loss": 0.77875566, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80146182, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.484375, + "step": 985, + "time_per_iteration": 2.504096508026123 + }, + { + "auxiliary_loss_clip": 0.01215785, + "auxiliary_loss_mlp": 0.01061307, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.06191659, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.294716701848832, + "language_loss": 0.90598249, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92875338, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.5390625, + "step": 986, + "time_per_iteration": 2.4882049560546875 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.01062373, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.06017947, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.8181645576894256, + "language_loss": 0.7589798, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78175771, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 987, + "time_per_iteration": 2.492412805557251 + }, + { + "auxiliary_loss_clip": 0.01216653, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.03491902, + "balance_loss_mlp": 1.06059265, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.1447391932017843, + "language_loss": 0.71525705, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73802304, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 988, + "time_per_iteration": 2.6386756896972656 + }, + { + "auxiliary_loss_clip": 0.01081383, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_clip": 1.00680876, + "balance_loss_mlp": 1.01888978, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9344259157338769, + "language_loss": 0.71159971, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73253405, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.625, + "step": 989, + "time_per_iteration": 2.903996706008911 + }, + { + "auxiliary_loss_clip": 0.01219656, + "auxiliary_loss_mlp": 0.01067443, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.06221163, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.89069901477269, + "language_loss": 0.78102934, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80390036, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.578125, + "step": 990, + "time_per_iteration": 2.6252431869506836 + }, + { + "auxiliary_loss_clip": 0.01208224, + "auxiliary_loss_mlp": 0.0105602, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05700588, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.077710223302236, + "language_loss": 0.86406755, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88671005, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.515625, + "step": 991, + "time_per_iteration": 2.483853340148926 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04030573, + "balance_loss_mlp": 1.06190968, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.866628977756486, + "language_loss": 0.76876801, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79158413, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 992, + "time_per_iteration": 2.5149648189544678 + }, + { + "auxiliary_loss_clip": 0.01214781, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03701937, + "balance_loss_mlp": 1.06251192, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.726921793738851, + "language_loss": 0.74594641, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76869899, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.5234375, + "step": 993, + "time_per_iteration": 2.4739816188812256 + }, + { + "auxiliary_loss_clip": 0.01214249, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.06326771, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 3.2517233877247396, + "language_loss": 0.78911841, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81197453, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 994, + "time_per_iteration": 2.5408835411071777 + }, + { + "auxiliary_loss_clip": 0.01214677, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.05768251, + "balance_loss_mlp": 1.06170893, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.42517884603863, + "language_loss": 0.79639304, + "learning_rate": 3.99067829878596e-06, + "loss": 0.81936711, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 995, + "time_per_iteration": 2.5062758922576904 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.05969059, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.536496545288829, + "language_loss": 0.86939722, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89217806, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 996, + "time_per_iteration": 2.5236001014709473 + }, + { + "auxiliary_loss_clip": 0.01217352, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.06309104, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.013698471354103, + "language_loss": 0.88192105, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90479505, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.546875, + "step": 997, + "time_per_iteration": 2.483116626739502 + }, + { + "auxiliary_loss_clip": 0.01079761, + "auxiliary_loss_mlp": 0.01004407, + "balance_loss_clip": 0.9993524, + "balance_loss_mlp": 1.01837301, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.020759515587473, + "language_loss": 0.75442117, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77526283, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.6171875, + "step": 998, + "time_per_iteration": 3.152331590652466 + }, + { + "auxiliary_loss_clip": 0.01213812, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.0626508, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8375420281697645, + "language_loss": 0.75796127, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7807765, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 999, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01212853, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.03575778, + "balance_loss_mlp": 1.05894446, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.9091686508511199, + "language_loss": 0.82658899, + "learning_rate": 3.990489563834943e-06, + "loss": 0.8493402, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5390625, + "step": 1000, + "time_per_iteration": 2.5369935035705566 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.06143069, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.4065508827059783, + "language_loss": 0.85644853, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8791759, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5390625, + "step": 1001, + "time_per_iteration": 2.4972190856933594 + }, + { + "auxiliary_loss_clip": 0.0120879, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.0587517, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.156321640703371, + "language_loss": 0.74386394, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5, + "step": 1002, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01211576, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.04019034, + "balance_loss_mlp": 1.06015134, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.1165374575777145, + "language_loss": 0.75346643, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77624118, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1003, + "time_per_iteration": 2.508817434310913 + }, + { + "auxiliary_loss_clip": 0.01219434, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.06255794, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2578292515807603, + "language_loss": 0.70071733, + "learning_rate": 3.990337217233437e-06, + "loss": 0.723571, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 1004, + "time_per_iteration": 2.480116844177246 + }, + { + "auxiliary_loss_clip": 0.01218526, + "auxiliary_loss_mlp": 0.01073584, + "balance_loss_clip": 1.04810917, + "balance_loss_mlp": 1.06360686, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.248554137518493, + "language_loss": 0.83246684, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85538793, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 1005, + "time_per_iteration": 2.449733018875122 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00306416, + "balance_loss_mlp": 1.0157814, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8959746990508154, + "language_loss": 0.59000289, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61085355, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.6171875, + "step": 1006, + "time_per_iteration": 3.1583423614501953 + }, + { + "auxiliary_loss_clip": 0.01209886, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.03203392, + "balance_loss_mlp": 1.05658197, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.271524805944984, + "language_loss": 0.7428897, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.53125, + "step": 1007, + "time_per_iteration": 2.49139666557312 + }, + { + "auxiliary_loss_clip": 0.01212867, + "auxiliary_loss_mlp": 0.01055047, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.05897522, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8583948299039934, + "language_loss": 0.80739897, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83007812, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 1008, + "time_per_iteration": 2.4990036487579346 + }, + { + "auxiliary_loss_clip": 0.01213893, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.06254637, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.935763632111394, + "language_loss": 0.77840835, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80110532, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.515625, + "step": 1009, + "time_per_iteration": 2.4785048961639404 + }, + { + "auxiliary_loss_clip": 0.01210213, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.06082368, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1058592784097567, + "language_loss": 0.93059653, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95329368, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4921875, + "step": 1010, + "time_per_iteration": 2.507596015930176 + }, + { + "auxiliary_loss_clip": 0.01219036, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.05885124, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1716667034247843, + "language_loss": 0.71846473, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74131954, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6015625, + "step": 1011, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01214432, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.05902421, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.5871469840663535, + "language_loss": 0.87542284, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89827204, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5546875, + "step": 1012, + "time_per_iteration": 2.4876151084899902 + }, + { + "auxiliary_loss_clip": 0.01206171, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05505085, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8956263482043672, + "language_loss": 0.76679665, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78946191, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 1013, + "time_per_iteration": 2.4874446392059326 + }, + { + "auxiliary_loss_clip": 0.01215089, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.05924904, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.654718290448769, + "language_loss": 0.85651302, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87933445, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5546875, + "step": 1014, + "time_per_iteration": 2.483774423599243 + }, + { + "auxiliary_loss_clip": 0.0122011, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_clip": 1.03996944, + "balance_loss_mlp": 1.06207335, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4287988001966028, + "language_loss": 0.72807163, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75094855, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.578125, + "step": 1015, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01207162, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.0576005, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6555956389633335, + "language_loss": 0.79197502, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8147307, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4921875, + "step": 1016, + "time_per_iteration": 2.5177054405212402 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.0571332, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.934405213560846, + "language_loss": 0.76170123, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78440881, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.53125, + "step": 1017, + "time_per_iteration": 2.517730951309204 + }, + { + "auxiliary_loss_clip": 0.01220983, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.06240773, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.873264658326973, + "language_loss": 0.86145842, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436329, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 1018, + "time_per_iteration": 5.324457883834839 + }, + { + "auxiliary_loss_clip": 0.01206709, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.045012, + "balance_loss_mlp": 1.05659163, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.696758126666256, + "language_loss": 0.77535981, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79814154, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5, + "step": 1019, + "time_per_iteration": 2.453768253326416 + }, + { + "auxiliary_loss_clip": 0.01210848, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.05749679, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.8458417378275351, + "language_loss": 0.84254557, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86526895, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 1020, + "time_per_iteration": 2.5126123428344727 + }, + { + "auxiliary_loss_clip": 0.01217116, + "auxiliary_loss_mlp": 0.01060663, + "balance_loss_clip": 1.0352596, + "balance_loss_mlp": 1.06234074, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.186416819505148, + "language_loss": 0.79234397, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81512177, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1021, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.01207219, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.05748677, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.2026341390443434, + "language_loss": 0.87493509, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89765131, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.5, + "step": 1022, + "time_per_iteration": 2.441298007965088 + }, + { + "auxiliary_loss_clip": 0.01213359, + "auxiliary_loss_mlp": 0.0106856, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.06052542, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.752710779550117, + "language_loss": 0.82776564, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85058486, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 1023, + "time_per_iteration": 2.5027952194213867 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 0.99944335, + "balance_loss_mlp": 1.01796818, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8999264202466762, + "language_loss": 0.65078986, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67162001, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.0456543, + "router_z_loss_mlp": 0.609375, + "step": 1024, + "time_per_iteration": 3.0969655513763428 + }, + { + "auxiliary_loss_clip": 0.01212272, + "auxiliary_loss_mlp": 0.01066841, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.05936897, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.9303372998519377, + "language_loss": 0.88293028, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90572149, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 1025, + "time_per_iteration": 2.5229876041412354 + }, + { + "auxiliary_loss_clip": 0.01212316, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.05916524, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.519276165786755, + "language_loss": 0.84567487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86839235, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 1026, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.01212365, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.04614556, + "balance_loss_mlp": 1.05798197, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9431802827698534, + "language_loss": 0.82320756, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84604132, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 1027, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.01209611, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.05799866, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.262386050001272, + "language_loss": 0.84232426, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86500365, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1028, + "time_per_iteration": 2.4485137462615967 + }, + { + "auxiliary_loss_clip": 0.01077664, + "auxiliary_loss_mlp": 0.01009618, + "balance_loss_clip": 1.00544536, + "balance_loss_mlp": 1.01686025, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9476883841381922, + "language_loss": 0.60497737, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6258502, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.609375, + "step": 1029, + "time_per_iteration": 2.8714137077331543 + }, + { + "auxiliary_loss_clip": 0.0120304, + "auxiliary_loss_mlp": 0.01066238, + "balance_loss_clip": 1.0419786, + "balance_loss_mlp": 1.05338669, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.297452518318954, + "language_loss": 0.82309926, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84579194, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4921875, + "step": 1030, + "time_per_iteration": 2.4705348014831543 + }, + { + "auxiliary_loss_clip": 0.01214194, + "auxiliary_loss_mlp": 0.01071397, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.06025672, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.391039807046215, + "language_loss": 0.80262065, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82547653, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1031, + "time_per_iteration": 2.447964906692505 + }, + { + "auxiliary_loss_clip": 0.0121101, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.05865717, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.6245278130098144, + "language_loss": 0.77141201, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79427713, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5234375, + "step": 1032, + "time_per_iteration": 2.475891590118408 + }, + { + "auxiliary_loss_clip": 0.01205906, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_clip": 1.04350805, + "balance_loss_mlp": 1.05307126, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.949793190746779, + "language_loss": 0.89276892, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91552204, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1033, + "time_per_iteration": 2.5332658290863037 + }, + { + "auxiliary_loss_clip": 0.01212647, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03437293, + "balance_loss_mlp": 1.05739737, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.160025730572359, + "language_loss": 0.84795135, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87066996, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5546875, + "step": 1034, + "time_per_iteration": 2.507636785507202 + }, + { + "auxiliary_loss_clip": 0.01202421, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.03399241, + "balance_loss_mlp": 1.05694687, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 3.176440156188905, + "language_loss": 0.81156218, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83418697, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.453125, + "step": 1035, + "time_per_iteration": 2.624635696411133 + }, + { + "auxiliary_loss_clip": 0.01212161, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_clip": 1.04051828, + "balance_loss_mlp": 1.06080353, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.252599829484163, + "language_loss": 0.78701359, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80981934, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.515625, + "step": 1036, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01204167, + "auxiliary_loss_mlp": 0.01068533, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.05620134, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.670767972712633, + "language_loss": 0.86802149, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8907485, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1037, + "time_per_iteration": 2.506011724472046 + }, + { + "auxiliary_loss_clip": 0.01206019, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.02990723, + "balance_loss_mlp": 1.05728471, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.1914513209480933, + "language_loss": 0.81051469, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83314991, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1038, + "time_per_iteration": 2.486758232116699 + }, + { + "auxiliary_loss_clip": 0.01205947, + "auxiliary_loss_mlp": 0.01072566, + "balance_loss_clip": 1.04587555, + "balance_loss_mlp": 1.05856836, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.3663261426095965, + "language_loss": 0.85336804, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87615323, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1039, + "time_per_iteration": 2.489241123199463 + }, + { + "auxiliary_loss_clip": 0.01207559, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.05744672, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.9774289629637263, + "language_loss": 0.80853289, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83128488, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5, + "step": 1040, + "time_per_iteration": 2.480022668838501 + }, + { + "auxiliary_loss_clip": 0.01213203, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.06227219, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.535271913081881, + "language_loss": 0.69440711, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71721661, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5078125, + "step": 1041, + "time_per_iteration": 2.5417978763580322 + }, + { + "auxiliary_loss_clip": 0.01210541, + "auxiliary_loss_mlp": 0.0106006, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.05743289, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9564735382917973, + "language_loss": 0.80983013, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83253616, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.53125, + "step": 1042, + "time_per_iteration": 2.478926181793213 + }, + { + "auxiliary_loss_clip": 0.01210242, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.05925727, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.9466384226705415, + "language_loss": 0.76463902, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78732038, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.515625, + "step": 1043, + "time_per_iteration": 2.6262781620025635 + }, + { + "auxiliary_loss_clip": 0.01203702, + "auxiliary_loss_mlp": 0.01066445, + "balance_loss_clip": 1.04174471, + "balance_loss_mlp": 1.05835676, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8860277298285366, + "language_loss": 0.92454541, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94724691, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1044, + "time_per_iteration": 2.4886953830718994 + }, + { + "auxiliary_loss_clip": 0.01204359, + "auxiliary_loss_mlp": 0.01073486, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.05475259, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.9539908597303346, + "language_loss": 0.8581354, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88091385, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5, + "step": 1045, + "time_per_iteration": 2.5382347106933594 + }, + { + "auxiliary_loss_clip": 0.01203094, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.04900479, + "balance_loss_mlp": 1.05618775, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.0798822187092094, + "language_loss": 0.77675486, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.79952335, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.46875, + "step": 1046, + "time_per_iteration": 2.548157215118408 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.01074859, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.05837655, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.197016946040243, + "language_loss": 0.77317166, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79598629, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4765625, + "step": 1047, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.0121283, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.05874014, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.3103480986625753, + "language_loss": 0.7696203, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79236162, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1048, + "time_per_iteration": 2.636072874069214 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.0583266, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.2069714466600656, + "language_loss": 0.77757037, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80039394, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1049, + "time_per_iteration": 2.5173420906066895 + }, + { + "auxiliary_loss_clip": 0.01207985, + "auxiliary_loss_mlp": 0.01065489, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.05734015, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.316298014027776, + "language_loss": 0.83165503, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85438979, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5078125, + "step": 1050, + "time_per_iteration": 2.4742541313171387 + }, + { + "auxiliary_loss_clip": 0.01204381, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.05776763, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.1475970013183563, + "language_loss": 0.76909173, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79176152, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1051, + "time_per_iteration": 2.4629740715026855 + }, + { + "auxiliary_loss_clip": 0.01207556, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.05788827, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.090947022989376, + "language_loss": 0.80053556, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82331514, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4921875, + "step": 1052, + "time_per_iteration": 2.4729230403900146 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01064567, + "balance_loss_clip": 1.03911567, + "balance_loss_mlp": 1.05839717, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.21177767113968, + "language_loss": 0.78088665, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362272, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5078125, + "step": 1053, + "time_per_iteration": 2.433969736099243 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.0578481, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8421697124920164, + "language_loss": 0.84737611, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8700186, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.515625, + "step": 1054, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.01205973, + "auxiliary_loss_mlp": 0.01065192, + "balance_loss_clip": 1.04051518, + "balance_loss_mlp": 1.05870843, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.9255333357469135, + "language_loss": 0.8566432, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87935483, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4765625, + "step": 1055, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.0121179, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_clip": 1.0451932, + "balance_loss_mlp": 1.05891657, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.390503126540762, + "language_loss": 0.80966836, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83249724, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1056, + "time_per_iteration": 2.4944088459014893 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.05503476, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.70684555522199, + "language_loss": 0.81153649, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83431304, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 1057, + "time_per_iteration": 2.5327882766723633 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03401923, + "balance_loss_mlp": 1.054492, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 2.2830641052403826, + "language_loss": 0.8369416, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85947585, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.421875, + "step": 1058, + "time_per_iteration": 2.4742424488067627 + }, + { + "auxiliary_loss_clip": 0.01208572, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.05714464, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 1.9712110015930453, + "language_loss": 0.87264961, + "learning_rate": 3.988120036328651e-06, + "loss": 0.8954125, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.515625, + "step": 1059, + "time_per_iteration": 5.514882564544678 + }, + { + "auxiliary_loss_clip": 0.01213823, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.04273927, + "balance_loss_mlp": 1.06130195, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.227642611819728, + "language_loss": 0.9117676, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9345876, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 1060, + "time_per_iteration": 3.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01204952, + "auxiliary_loss_mlp": 0.01062848, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.05582809, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9159755464944204, + "language_loss": 0.87713706, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89981508, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4921875, + "step": 1061, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01213048, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.05683804, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.167309005799961, + "language_loss": 0.771905, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79469687, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5625, + "step": 1062, + "time_per_iteration": 2.5576398372650146 + }, + { + "auxiliary_loss_clip": 0.01206834, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 1.05504322, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.0414192004570872, + "language_loss": 0.86835265, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89105946, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1063, + "time_per_iteration": 2.472473382949829 + }, + { + "auxiliary_loss_clip": 0.01206458, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.05539751, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.0150359019026185, + "language_loss": 0.8051579, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82785529, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1064, + "time_per_iteration": 2.478205919265747 + }, + { + "auxiliary_loss_clip": 0.01207278, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.04409075, + "balance_loss_mlp": 1.05682254, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.254194289767691, + "language_loss": 0.84650666, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1065, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01207067, + "auxiliary_loss_mlp": 0.01055171, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.05966115, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.66169186591579, + "language_loss": 0.68201709, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70463943, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.46875, + "step": 1066, + "time_per_iteration": 2.6294829845428467 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01076738, + "balance_loss_clip": 1.05003476, + "balance_loss_mlp": 1.05877519, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.704601442813356, + "language_loss": 0.90345579, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9262861, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1067, + "time_per_iteration": 2.459721565246582 + }, + { + "auxiliary_loss_clip": 0.01207052, + "auxiliary_loss_mlp": 0.01068129, + "balance_loss_clip": 1.04364336, + "balance_loss_mlp": 1.05625772, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.8684947664405436, + "language_loss": 0.8343029, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.515625, + "step": 1068, + "time_per_iteration": 2.4611129760742188 + }, + { + "auxiliary_loss_clip": 0.01205753, + "auxiliary_loss_mlp": 0.01064379, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.05991328, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.4683216708617053, + "language_loss": 0.89402264, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91672397, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.453125, + "step": 1069, + "time_per_iteration": 2.486241340637207 + }, + { + "auxiliary_loss_clip": 0.01206711, + "auxiliary_loss_mlp": 0.01082225, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 1.05718124, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6076700233042396, + "language_loss": 0.95764256, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98053193, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5, + "step": 1070, + "time_per_iteration": 2.413357734680176 + }, + { + "auxiliary_loss_clip": 0.01209924, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.0309608, + "balance_loss_mlp": 1.05859673, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8004745601001504, + "language_loss": 0.8819589, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90463126, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.515625, + "step": 1071, + "time_per_iteration": 2.4717295169830322 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.056633, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6498592642907823, + "language_loss": 0.75996184, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78252238, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.4765625, + "step": 1072, + "time_per_iteration": 2.486936330795288 + }, + { + "auxiliary_loss_clip": 0.01207782, + "auxiliary_loss_mlp": 0.010661, + "balance_loss_clip": 1.03951669, + "balance_loss_mlp": 1.05679154, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.95165590675185, + "language_loss": 0.80415034, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82688916, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1073, + "time_per_iteration": 2.476189613342285 + }, + { + "auxiliary_loss_clip": 0.01200054, + "auxiliary_loss_mlp": 0.01059954, + "balance_loss_clip": 1.03563547, + "balance_loss_mlp": 1.05634785, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7105520573330508, + "language_loss": 0.80205524, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82465529, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4375, + "step": 1074, + "time_per_iteration": 2.499133586883545 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.03469074, + "balance_loss_mlp": 1.05560029, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.398999995550556, + "language_loss": 0.79203326, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81468183, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1075, + "time_per_iteration": 2.46777606010437 + }, + { + "auxiliary_loss_clip": 0.01207545, + "auxiliary_loss_mlp": 0.01064646, + "balance_loss_clip": 1.04086363, + "balance_loss_mlp": 1.05960226, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.7671348430420712, + "language_loss": 0.87819242, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90091443, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.484375, + "step": 1076, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01199028, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.02918351, + "balance_loss_mlp": 1.05429745, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.1388407300528534, + "language_loss": 0.80692923, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82945681, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1077, + "time_per_iteration": 2.4290995597839355 + }, + { + "auxiliary_loss_clip": 0.01211867, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.04566646, + "balance_loss_mlp": 1.05862093, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.618517400605346, + "language_loss": 0.91640681, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93924248, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.53125, + "step": 1078, + "time_per_iteration": 2.500995635986328 + }, + { + "auxiliary_loss_clip": 0.01212712, + "auxiliary_loss_mlp": 0.01062475, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.05874825, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.106125999672554, + "language_loss": 0.78772497, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81047684, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1079, + "time_per_iteration": 2.4510841369628906 + }, + { + "auxiliary_loss_clip": 0.01204732, + "auxiliary_loss_mlp": 0.01061369, + "balance_loss_clip": 1.03619218, + "balance_loss_mlp": 1.05602205, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.051955253501364, + "language_loss": 0.69555283, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7182138, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1080, + "time_per_iteration": 2.5024302005767822 + }, + { + "auxiliary_loss_clip": 0.01204586, + "auxiliary_loss_mlp": 0.01063302, + "balance_loss_clip": 1.03649211, + "balance_loss_mlp": 1.05477285, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.85895294752556, + "language_loss": 0.72094852, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74362737, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5, + "step": 1081, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01201777, + "auxiliary_loss_mlp": 0.01064533, + "balance_loss_clip": 1.03867674, + "balance_loss_mlp": 1.0554111, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6422342029105863, + "language_loss": 0.84621316, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86887628, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.46875, + "step": 1082, + "time_per_iteration": 2.459564447402954 + }, + { + "auxiliary_loss_clip": 0.01214386, + "auxiliary_loss_mlp": 0.01067955, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.05817008, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.177850298461163, + "language_loss": 0.8303026, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85312605, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5625, + "step": 1083, + "time_per_iteration": 2.504584550857544 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.05794787, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6002614807121227, + "language_loss": 0.79689312, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81960905, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.484375, + "step": 1084, + "time_per_iteration": 2.4530820846557617 + }, + { + "auxiliary_loss_clip": 0.01204762, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.05634058, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.1191367521188074, + "language_loss": 0.66211331, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68476963, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1085, + "time_per_iteration": 2.5733256340026855 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.05400848, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.9997547556569089, + "language_loss": 0.76998973, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79266769, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1086, + "time_per_iteration": 2.4958763122558594 + }, + { + "auxiliary_loss_clip": 0.01199669, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.03763306, + "balance_loss_mlp": 1.05291176, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1546414392836977, + "language_loss": 0.85154319, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87417287, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1087, + "time_per_iteration": 2.4456934928894043 + }, + { + "auxiliary_loss_clip": 0.01204231, + "auxiliary_loss_mlp": 0.01061167, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.05594206, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7775330808837086, + "language_loss": 0.77970594, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80235994, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1088, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01204134, + "auxiliary_loss_mlp": 0.01066637, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.05602646, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.9036978890371752, + "language_loss": 0.71191919, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73462689, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.484375, + "step": 1089, + "time_per_iteration": 2.4569168090820312 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.03566289, + "balance_loss_mlp": 1.05729651, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7924808842614686, + "language_loss": 0.85504186, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.8776831, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1090, + "time_per_iteration": 2.4624812602996826 + }, + { + "auxiliary_loss_clip": 0.01204567, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.05594897, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.2382380061135945, + "language_loss": 0.72027361, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74294031, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.484375, + "step": 1091, + "time_per_iteration": 2.4911999702453613 + }, + { + "auxiliary_loss_clip": 0.01201014, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05507159, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.7948943762047525, + "language_loss": 0.82525271, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8478815, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4609375, + "step": 1092, + "time_per_iteration": 2.510835886001587 + }, + { + "auxiliary_loss_clip": 0.01205888, + "auxiliary_loss_mlp": 0.01064535, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.05484402, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 4.994634192306823, + "language_loss": 0.71286589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73557013, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.515625, + "step": 1093, + "time_per_iteration": 2.528994560241699 + }, + { + "auxiliary_loss_clip": 0.01204526, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.05701041, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.8259988866114194, + "language_loss": 0.87971264, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90238965, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1094, + "time_per_iteration": 2.50201678276062 + }, + { + "auxiliary_loss_clip": 0.01205803, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_clip": 1.0350548, + "balance_loss_mlp": 1.0575459, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6349502946236962, + "language_loss": 0.81364405, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83632231, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.484375, + "step": 1095, + "time_per_iteration": 2.4947729110717773 + }, + { + "auxiliary_loss_clip": 0.01200923, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03414297, + "balance_loss_mlp": 1.05544913, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4379029944224215, + "language_loss": 0.69712919, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7197119, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.453125, + "step": 1096, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01206873, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.04451883, + "balance_loss_mlp": 1.0592947, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7715259730160258, + "language_loss": 0.77498722, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79775411, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1097, + "time_per_iteration": 2.4872820377349854 + }, + { + "auxiliary_loss_clip": 0.0120653, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03814423, + "balance_loss_mlp": 1.05785179, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.7376479388989727, + "language_loss": 0.77846545, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80116618, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.484375, + "step": 1098, + "time_per_iteration": 2.583075761795044 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.04483891, + "balance_loss_mlp": 1.05739522, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9398633669636132, + "language_loss": 0.81675154, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83951151, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1099, + "time_per_iteration": 2.446969985961914 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.02801692, + "balance_loss_mlp": 1.0519135, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0018625732470245, + "language_loss": 0.82619941, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84868616, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4375, + "step": 1100, + "time_per_iteration": 2.4545743465423584 + }, + { + "auxiliary_loss_clip": 0.01204382, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.05827951, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.362466383483127, + "language_loss": 0.73439336, + "learning_rate": 3.986273334538702e-06, + "loss": 0.7570439, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4609375, + "step": 1101, + "time_per_iteration": 6.740786790847778 + }, + { + "auxiliary_loss_clip": 0.0119874, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_clip": 1.03829539, + "balance_loss_mlp": 1.05373132, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.46656505058328, + "language_loss": 0.86047602, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88308758, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1102, + "time_per_iteration": 2.4480903148651123 + }, + { + "auxiliary_loss_clip": 0.01200394, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.05588222, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0494810685505995, + "language_loss": 0.81707513, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83965349, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1103, + "time_per_iteration": 2.4419338703155518 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.05891824, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.7865556655629211, + "language_loss": 0.82059169, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84326148, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.4453125, + "step": 1104, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01195268, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.05232382, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6089454783719872, + "language_loss": 0.80542791, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82785821, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1105, + "time_per_iteration": 2.524385929107666 + }, + { + "auxiliary_loss_clip": 0.01197193, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.05697632, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8452117827451007, + "language_loss": 0.96738935, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98996383, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.40625, + "step": 1106, + "time_per_iteration": 2.455122470855713 + }, + { + "auxiliary_loss_clip": 0.01204143, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.03436136, + "balance_loss_mlp": 1.05509543, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9568581550144768, + "language_loss": 0.82766026, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85030258, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4921875, + "step": 1107, + "time_per_iteration": 2.4554357528686523 + }, + { + "auxiliary_loss_clip": 0.01077187, + "auxiliary_loss_mlp": 0.01010186, + "balance_loss_clip": 1.0061568, + "balance_loss_mlp": 1.01696265, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8235952583150978, + "language_loss": 0.56729984, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58817357, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.6015625, + "step": 1108, + "time_per_iteration": 3.0248770713806152 + }, + { + "auxiliary_loss_clip": 0.01200435, + "auxiliary_loss_mlp": 0.01065514, + "balance_loss_clip": 1.04034865, + "balance_loss_mlp": 1.05397463, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.4203653272420693, + "language_loss": 0.72493321, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74759269, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1109, + "time_per_iteration": 2.4559717178344727 + }, + { + "auxiliary_loss_clip": 0.01197389, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.05389571, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.084593088047962, + "language_loss": 0.78256035, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1110, + "time_per_iteration": 2.4989912509918213 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.05598152, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.197430378352105, + "language_loss": 0.71290207, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73549128, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1111, + "time_per_iteration": 2.5445287227630615 + }, + { + "auxiliary_loss_clip": 0.0120524, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.03833365, + "balance_loss_mlp": 1.05788755, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8078370838130353, + "language_loss": 0.78315711, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80583429, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4765625, + "step": 1112, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01203172, + "auxiliary_loss_mlp": 0.01058254, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.05794001, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.0430507180103943, + "language_loss": 0.78819263, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1113, + "time_per_iteration": 2.4637296199798584 + }, + { + "auxiliary_loss_clip": 0.01195153, + "auxiliary_loss_mlp": 0.01053406, + "balance_loss_clip": 1.03056598, + "balance_loss_mlp": 1.05255365, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.035611213247421, + "language_loss": 0.82393003, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84641558, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.4296875, + "step": 1114, + "time_per_iteration": 2.434006452560425 + }, + { + "auxiliary_loss_clip": 0.01076, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00463712, + "balance_loss_mlp": 1.0165143, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8339607525511222, + "language_loss": 0.58126414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60211033, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.59375, + "step": 1115, + "time_per_iteration": 3.020782709121704 + }, + { + "auxiliary_loss_clip": 0.01200335, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.03427422, + "balance_loss_mlp": 1.05479646, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8263674595854464, + "language_loss": 0.91123891, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93383968, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1116, + "time_per_iteration": 2.446439504623413 + }, + { + "auxiliary_loss_clip": 0.01209259, + "auxiliary_loss_mlp": 0.01067721, + "balance_loss_clip": 1.04323506, + "balance_loss_mlp": 1.06065357, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.019283248682947, + "language_loss": 0.8709814, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89375114, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.484375, + "step": 1117, + "time_per_iteration": 2.486212968826294 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.00250196, + "balance_loss_mlp": 1.01550937, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9454776991467404, + "language_loss": 0.59798217, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6187892, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.5859375, + "step": 1118, + "time_per_iteration": 3.0197594165802 + }, + { + "auxiliary_loss_clip": 0.01201284, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.05418777, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.7568577616727468, + "language_loss": 0.83498162, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85755914, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4765625, + "step": 1119, + "time_per_iteration": 2.4535257816314697 + }, + { + "auxiliary_loss_clip": 0.01199216, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.0562222, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.8165724331790314, + "language_loss": 0.8480413, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87062794, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.4296875, + "step": 1120, + "time_per_iteration": 2.533182382583618 + }, + { + "auxiliary_loss_clip": 0.01208718, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.04269981, + "balance_loss_mlp": 1.0602659, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.032922437281707, + "language_loss": 0.78959441, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81235266, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.484375, + "step": 1121, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_clip": 1.00033593, + "balance_loss_mlp": 1.0132587, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7615352754050735, + "language_loss": 0.58346939, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60423702, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.5859375, + "step": 1122, + "time_per_iteration": 3.2087855339050293 + }, + { + "auxiliary_loss_clip": 0.0120309, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.0584271, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0430211727412098, + "language_loss": 0.71546745, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73815745, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4453125, + "step": 1123, + "time_per_iteration": 2.5017640590667725 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01057362, + "balance_loss_clip": 1.03216124, + "balance_loss_mlp": 1.05484593, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8376842720828679, + "language_loss": 0.79288971, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81548035, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1124, + "time_per_iteration": 2.4980688095092773 + }, + { + "auxiliary_loss_clip": 0.01196564, + "auxiliary_loss_mlp": 0.01054377, + "balance_loss_clip": 1.03204954, + "balance_loss_mlp": 1.05469489, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.0983993205372253, + "language_loss": 0.71198726, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73449671, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.421875, + "step": 1125, + "time_per_iteration": 2.4704325199127197 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.04247451, + "balance_loss_mlp": 1.05620742, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.9171204901367243, + "language_loss": 0.80814254, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83081663, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.421875, + "step": 1126, + "time_per_iteration": 2.5046803951263428 + }, + { + "auxiliary_loss_clip": 0.01070877, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 0.9986586, + "balance_loss_mlp": 1.01286924, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7804116507992601, + "language_loss": 0.59733766, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61807376, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.578125, + "step": 1127, + "time_per_iteration": 3.0877249240875244 + }, + { + "auxiliary_loss_clip": 0.01199514, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 1.05723238, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.13286114653412, + "language_loss": 0.81392133, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83648497, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.421875, + "step": 1128, + "time_per_iteration": 2.5406885147094727 + }, + { + "auxiliary_loss_clip": 0.01208088, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.02692807, + "balance_loss_mlp": 1.0598706, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 3.047918834731733, + "language_loss": 0.76034033, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78294069, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.484375, + "step": 1129, + "time_per_iteration": 2.486829996109009 + }, + { + "auxiliary_loss_clip": 0.01201584, + "auxiliary_loss_mlp": 0.01061333, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.05536139, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8621491947103987, + "language_loss": 0.72340226, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74603146, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4609375, + "step": 1130, + "time_per_iteration": 2.6195991039276123 + }, + { + "auxiliary_loss_clip": 0.01197626, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.05584192, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.3479224842049917, + "language_loss": 0.80624223, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82885444, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.421875, + "step": 1131, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.05550814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1673533627141652, + "language_loss": 0.8104949, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83313811, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.40625, + "step": 1132, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01199688, + "auxiliary_loss_mlp": 0.01069367, + "balance_loss_clip": 1.04525137, + "balance_loss_mlp": 1.05629563, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.450417149602266, + "language_loss": 0.63629937, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65898991, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4296875, + "step": 1133, + "time_per_iteration": 2.7164230346679688 + }, + { + "auxiliary_loss_clip": 0.01203203, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.05427325, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.5027083277203963, + "language_loss": 0.74811196, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77073789, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1134, + "time_per_iteration": 2.420506000518799 + }, + { + "auxiliary_loss_clip": 0.01201452, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.05952573, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0759609389962037, + "language_loss": 0.87245119, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89510942, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.421875, + "step": 1135, + "time_per_iteration": 2.464738607406616 + }, + { + "auxiliary_loss_clip": 0.01200809, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.05388534, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.383261313924855, + "language_loss": 0.78335494, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80591798, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.46875, + "step": 1136, + "time_per_iteration": 2.4486002922058105 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.06089664, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 3.2008110915617207, + "language_loss": 0.83941948, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86222148, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.453125, + "step": 1137, + "time_per_iteration": 2.5714635848999023 + }, + { + "auxiliary_loss_clip": 0.01199575, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.05628538, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.067587662099544, + "language_loss": 0.78669268, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80930662, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1138, + "time_per_iteration": 2.459437370300293 + }, + { + "auxiliary_loss_clip": 0.01202271, + "auxiliary_loss_mlp": 0.01059469, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.05729747, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.606905885529735, + "language_loss": 0.85683703, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87945449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1139, + "time_per_iteration": 2.5198936462402344 + }, + { + "auxiliary_loss_clip": 0.01201061, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.03297663, + "balance_loss_mlp": 1.05803108, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7528507300348692, + "language_loss": 0.74826896, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77085567, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4296875, + "step": 1140, + "time_per_iteration": 2.6609106063842773 + }, + { + "auxiliary_loss_clip": 0.01198151, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.03698146, + "balance_loss_mlp": 1.05620885, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.210262717529583, + "language_loss": 0.68083167, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70343632, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.421875, + "step": 1141, + "time_per_iteration": 2.5661122798919678 + }, + { + "auxiliary_loss_clip": 0.01205913, + "auxiliary_loss_mlp": 0.0106664, + "balance_loss_clip": 1.04098654, + "balance_loss_mlp": 1.05848837, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.82433360121009, + "language_loss": 0.79399014, + "learning_rate": 3.984342445114538e-06, + "loss": 0.8167156, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1142, + "time_per_iteration": 2.5499107837677 + }, + { + "auxiliary_loss_clip": 0.0120232, + "auxiliary_loss_mlp": 0.01061074, + "balance_loss_clip": 1.03650475, + "balance_loss_mlp": 1.05730164, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6821535193321122, + "language_loss": 0.68701231, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70964622, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1143, + "time_per_iteration": 5.380373239517212 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03670955, + "balance_loss_mlp": 1.05885804, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.8434796401844256, + "language_loss": 0.74694496, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76950091, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.375, + "step": 1144, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.01204332, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.05654943, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.296493270147659, + "language_loss": 0.91720247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93988806, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4765625, + "step": 1145, + "time_per_iteration": 2.44307017326355 + }, + { + "auxiliary_loss_clip": 0.01206887, + "auxiliary_loss_mlp": 0.01067692, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.05779576, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4650333910918865, + "language_loss": 0.82189268, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84463847, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.4921875, + "step": 1146, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.01198651, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.05755806, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5935722439127744, + "language_loss": 0.85150343, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87410891, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.4140625, + "step": 1147, + "time_per_iteration": 2.48410701751709 + }, + { + "auxiliary_loss_clip": 0.01201275, + "auxiliary_loss_mlp": 0.01064743, + "balance_loss_clip": 1.03988767, + "balance_loss_mlp": 1.05699074, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3932988353276645, + "language_loss": 0.86235052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88501072, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1148, + "time_per_iteration": 2.455441951751709 + }, + { + "auxiliary_loss_clip": 0.01199305, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.0337863, + "balance_loss_mlp": 1.05560231, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.070658514783469, + "language_loss": 0.69185412, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71442747, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4375, + "step": 1149, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.0289495, + "balance_loss_mlp": 1.05679548, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.828663566846353, + "language_loss": 0.84069788, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86328113, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4609375, + "step": 1150, + "time_per_iteration": 2.509122371673584 + }, + { + "auxiliary_loss_clip": 0.01206199, + "auxiliary_loss_mlp": 0.01058671, + "balance_loss_clip": 1.03453135, + "balance_loss_mlp": 1.06116164, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.57752822218259, + "language_loss": 0.82044697, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84309566, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1151, + "time_per_iteration": 2.420128345489502 + }, + { + "auxiliary_loss_clip": 0.01201904, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.06011868, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8279979065740934, + "language_loss": 0.85587418, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87851566, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4140625, + "step": 1152, + "time_per_iteration": 2.498180866241455 + }, + { + "auxiliary_loss_clip": 0.01198565, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03747797, + "balance_loss_mlp": 1.05767703, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1251557516582995, + "language_loss": 0.90536988, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92796487, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1153, + "time_per_iteration": 2.422480821609497 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.03035152, + "balance_loss_mlp": 1.05790865, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.190017778582164, + "language_loss": 0.81363368, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83618748, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4296875, + "step": 1154, + "time_per_iteration": 2.528118848800659 + }, + { + "auxiliary_loss_clip": 0.01202754, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_clip": 1.04476249, + "balance_loss_mlp": 1.06078768, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 32.79102955334026, + "language_loss": 0.7560131, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77872109, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.421875, + "step": 1155, + "time_per_iteration": 2.5010287761688232 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01059268, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.05511975, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.6800097473238784, + "language_loss": 0.71119213, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73374593, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1156, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.05711889, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.0301788984863918, + "language_loss": 0.75299567, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77569139, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1157, + "time_per_iteration": 2.4654574394226074 + }, + { + "auxiliary_loss_clip": 0.0119867, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.03240204, + "balance_loss_mlp": 1.0551796, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6687264459000366, + "language_loss": 0.71895158, + "learning_rate": 3.983554608032982e-06, + "loss": 0.7415098, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4375, + "step": 1158, + "time_per_iteration": 2.53495454788208 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.03764284, + "balance_loss_mlp": 1.05718327, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9777890540291267, + "language_loss": 0.79796576, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82061857, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1159, + "time_per_iteration": 2.511402130126953 + }, + { + "auxiliary_loss_clip": 0.01205534, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03938961, + "balance_loss_mlp": 1.05860782, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 5.094070474761981, + "language_loss": 0.810929, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83364576, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1160, + "time_per_iteration": 2.4580883979797363 + }, + { + "auxiliary_loss_clip": 0.01197544, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.05382752, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.8746427931419856, + "language_loss": 0.75958532, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78215194, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1161, + "time_per_iteration": 2.5046370029449463 + }, + { + "auxiliary_loss_clip": 0.01195466, + "auxiliary_loss_mlp": 0.01062077, + "balance_loss_clip": 1.03642368, + "balance_loss_mlp": 1.05299318, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.806880077375887, + "language_loss": 0.8285073, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85108274, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1162, + "time_per_iteration": 2.4779040813446045 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01055987, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.05355024, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.8779282806609423, + "language_loss": 0.79095101, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81345057, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1163, + "time_per_iteration": 2.515899181365967 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.03728819, + "balance_loss_mlp": 1.05438375, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1142628107327233, + "language_loss": 0.79552305, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81814498, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4375, + "step": 1164, + "time_per_iteration": 2.476428747177124 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.0401659, + "balance_loss_mlp": 1.05587661, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4863162511761774, + "language_loss": 0.73198837, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75463963, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4375, + "step": 1165, + "time_per_iteration": 2.5053012371063232 + }, + { + "auxiliary_loss_clip": 0.01196916, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.03225732, + "balance_loss_mlp": 1.05550849, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.690867173089168, + "language_loss": 0.81019437, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83273077, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4140625, + "step": 1166, + "time_per_iteration": 2.5378963947296143 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0534389, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.886682439277329, + "language_loss": 0.84443307, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86687052, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1167, + "time_per_iteration": 2.5244622230529785 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.05693448, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.687845484368313, + "language_loss": 0.89423364, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9168179, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1168, + "time_per_iteration": 2.49411678314209 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.05737031, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.629371766417224, + "language_loss": 0.88661098, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9093399, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.46875, + "step": 1169, + "time_per_iteration": 2.4795143604278564 + }, + { + "auxiliary_loss_clip": 0.01203226, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.05864179, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0154006947860705, + "language_loss": 0.84000075, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86272925, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4375, + "step": 1170, + "time_per_iteration": 2.501016616821289 + }, + { + "auxiliary_loss_clip": 0.01199625, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.05753505, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.945268169582358, + "language_loss": 0.75220597, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77485222, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.421875, + "step": 1171, + "time_per_iteration": 2.4456748962402344 + }, + { + "auxiliary_loss_clip": 0.01199689, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.05765915, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.2481396571627923, + "language_loss": 0.88848841, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91106689, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1172, + "time_per_iteration": 2.4970321655273438 + }, + { + "auxiliary_loss_clip": 0.01202846, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_clip": 1.02776241, + "balance_loss_mlp": 1.05584753, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6229718682058278, + "language_loss": 0.8212136, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84377271, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1173, + "time_per_iteration": 2.485822916030884 + }, + { + "auxiliary_loss_clip": 0.01200818, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.05786848, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.056745883983527, + "language_loss": 0.81825697, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.840877, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1174, + "time_per_iteration": 2.4564759731292725 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.0569849, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.925446476900023, + "language_loss": 0.8511939, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87379438, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.421875, + "step": 1175, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.0120243, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.05922508, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.9716433558257507, + "language_loss": 0.8303746, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4375, + "step": 1176, + "time_per_iteration": 2.511456251144409 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.05717707, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.3318965992312, + "language_loss": 0.74563694, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76822478, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.40625, + "step": 1177, + "time_per_iteration": 2.460102081298828 + }, + { + "auxiliary_loss_clip": 0.01207406, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.033476, + "balance_loss_mlp": 1.06167924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.2206541819979995, + "language_loss": 0.86031914, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88298053, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4609375, + "step": 1178, + "time_per_iteration": 2.4605627059936523 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_clip": 1.00349271, + "balance_loss_mlp": 1.02766943, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8386980392448491, + "language_loss": 0.63242435, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65337497, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.59375, + "step": 1179, + "time_per_iteration": 3.156688690185547 + }, + { + "auxiliary_loss_clip": 0.01207076, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.06038809, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.3853497849810945, + "language_loss": 0.83326972, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85596782, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.46875, + "step": 1180, + "time_per_iteration": 2.4823896884918213 + }, + { + "auxiliary_loss_clip": 0.01200915, + "auxiliary_loss_mlp": 0.01065839, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.05910683, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1921067510196446, + "language_loss": 0.88595563, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90862316, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.421875, + "step": 1181, + "time_per_iteration": 2.505908727645874 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.03607869, + "balance_loss_mlp": 1.05944347, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2303634282095257, + "language_loss": 0.83314365, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85575759, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4140625, + "step": 1182, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01199287, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_clip": 1.04006529, + "balance_loss_mlp": 1.06100821, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.671395976555463, + "language_loss": 0.7925818, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81523037, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3828125, + "step": 1183, + "time_per_iteration": 2.5057172775268555 + }, + { + "auxiliary_loss_clip": 0.01201972, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.05550563, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6492838430830963, + "language_loss": 0.78910172, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8117131, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.46875, + "step": 1184, + "time_per_iteration": 5.494150638580322 + }, + { + "auxiliary_loss_clip": 0.01196982, + "auxiliary_loss_mlp": 0.01057128, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.05884266, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.546293211356889, + "language_loss": 0.7696892, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79223031, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.375, + "step": 1185, + "time_per_iteration": 3.8873486518859863 + }, + { + "auxiliary_loss_clip": 0.01200052, + "auxiliary_loss_mlp": 0.01065088, + "balance_loss_clip": 1.0408771, + "balance_loss_mlp": 1.05808377, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.519913974657541, + "language_loss": 0.65896261, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68161404, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1186, + "time_per_iteration": 2.44986891746521 + }, + { + "auxiliary_loss_clip": 0.01198722, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.03234124, + "balance_loss_mlp": 1.05906928, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.0047668871213205, + "language_loss": 0.69673246, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71928233, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3984375, + "step": 1187, + "time_per_iteration": 2.517432451248169 + }, + { + "auxiliary_loss_clip": 0.01194056, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.05690861, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.6848541171122307, + "language_loss": 0.78598166, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80852079, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.375, + "step": 1188, + "time_per_iteration": 2.4682350158691406 + }, + { + "auxiliary_loss_clip": 0.01197809, + "auxiliary_loss_mlp": 0.01056931, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.0588758, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0343008635273834, + "language_loss": 0.84854662, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87109399, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.390625, + "step": 1189, + "time_per_iteration": 2.451464891433716 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.05589187, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.7193907035784557, + "language_loss": 0.77021295, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79277021, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.375, + "step": 1190, + "time_per_iteration": 2.5028254985809326 + }, + { + "auxiliary_loss_clip": 0.01200514, + "auxiliary_loss_mlp": 0.01065982, + "balance_loss_clip": 1.04018509, + "balance_loss_mlp": 1.0585537, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3385605637591302, + "language_loss": 0.75145626, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77412122, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1191, + "time_per_iteration": 2.461045980453491 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.03147578, + "balance_loss_mlp": 1.05730891, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.3332115059632583, + "language_loss": 0.7360636, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75860614, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1192, + "time_per_iteration": 2.4944753646850586 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 1.05358601, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.1652973689026176, + "language_loss": 0.7830255, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80548704, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1193, + "time_per_iteration": 2.487025737762451 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01053593, + "balance_loss_clip": 1.02786815, + "balance_loss_mlp": 1.06034899, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9678931818636167, + "language_loss": 0.85748619, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88004816, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1194, + "time_per_iteration": 2.493823766708374 + }, + { + "auxiliary_loss_clip": 0.01197363, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.05782473, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9701258602591958, + "language_loss": 0.81425989, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83685976, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3984375, + "step": 1195, + "time_per_iteration": 2.5168802738189697 + }, + { + "auxiliary_loss_clip": 0.01195742, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.02979064, + "balance_loss_mlp": 1.05720496, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.9269272748189905, + "language_loss": 0.79917538, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82164884, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3828125, + "step": 1196, + "time_per_iteration": 2.4749536514282227 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01069477, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.05655897, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 8.862292558474625, + "language_loss": 0.71015084, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73278111, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3671875, + "step": 1197, + "time_per_iteration": 2.520514726638794 + }, + { + "auxiliary_loss_clip": 0.01192449, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.02845871, + "balance_loss_mlp": 1.05429292, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0584524946763767, + "language_loss": 0.86034989, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88279593, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3828125, + "step": 1198, + "time_per_iteration": 2.441458225250244 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.05664325, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.7240513490380307, + "language_loss": 0.83822477, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8607856, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3828125, + "step": 1199, + "time_per_iteration": 2.462790012359619 + }, + { + "auxiliary_loss_clip": 0.01201627, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.06159616, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.0725431151836453, + "language_loss": 0.76464498, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78722042, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3984375, + "step": 1200, + "time_per_iteration": 2.5007636547088623 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01061794, + "balance_loss_clip": 1.0376662, + "balance_loss_mlp": 1.05783701, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.959995672067427, + "language_loss": 0.82965535, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85223711, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.390625, + "step": 1201, + "time_per_iteration": 2.4968512058258057 + }, + { + "auxiliary_loss_clip": 0.01198607, + "auxiliary_loss_mlp": 0.01059493, + "balance_loss_clip": 1.03372014, + "balance_loss_mlp": 1.05568862, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.411287508312223, + "language_loss": 0.69041032, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71299136, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1202, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01196785, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03804839, + "balance_loss_mlp": 1.05721354, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9411904343348254, + "language_loss": 0.87723774, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89984161, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3984375, + "step": 1203, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.0546416, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.819852916387131, + "language_loss": 0.7844671, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4375, + "step": 1204, + "time_per_iteration": 2.449265480041504 + }, + { + "auxiliary_loss_clip": 0.01194984, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.05605316, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8514893306986777, + "language_loss": 0.81960398, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.390625, + "step": 1205, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01067422, + "balance_loss_clip": 1.04250705, + "balance_loss_mlp": 1.05852747, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.0830735488163254, + "language_loss": 0.76702261, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78969669, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4140625, + "step": 1206, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.01193529, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.03071594, + "balance_loss_mlp": 1.05481935, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8430962541821914, + "language_loss": 0.77246201, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79495007, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3828125, + "step": 1207, + "time_per_iteration": 2.4895267486572266 + }, + { + "auxiliary_loss_clip": 0.01194673, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.02816105, + "balance_loss_mlp": 1.05703962, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 5.768853045708734, + "language_loss": 0.79723513, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81967664, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1208, + "time_per_iteration": 2.509073495864868 + }, + { + "auxiliary_loss_clip": 0.0119292, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03204679, + "balance_loss_mlp": 1.05551386, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.6873449148768063, + "language_loss": 0.78595626, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80843151, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.375, + "step": 1209, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.03409529, + "balance_loss_mlp": 1.05510461, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.6193169355932104, + "language_loss": 0.81117678, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83368045, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.390625, + "step": 1210, + "time_per_iteration": 2.4985666275024414 + }, + { + "auxiliary_loss_clip": 0.01192388, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_clip": 1.03688109, + "balance_loss_mlp": 1.0565064, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.2378435782703834, + "language_loss": 0.84350932, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.359375, + "step": 1211, + "time_per_iteration": 2.4971728324890137 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01051238, + "balance_loss_clip": 1.02931547, + "balance_loss_mlp": 1.05233216, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.2910402501943516, + "language_loss": 0.90813953, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.375, + "step": 1212, + "time_per_iteration": 2.424874782562256 + }, + { + "auxiliary_loss_clip": 0.01191621, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0286777, + "balance_loss_mlp": 1.05457211, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.346480404505952, + "language_loss": 0.7238096, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74623883, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1213, + "time_per_iteration": 2.443542003631592 + }, + { + "auxiliary_loss_clip": 0.0119423, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.05338192, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9141465843449694, + "language_loss": 0.84441102, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86686933, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1214, + "time_per_iteration": 2.500112295150757 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.03468192, + "balance_loss_mlp": 1.05678558, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.82775499028919, + "language_loss": 0.83929181, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86184609, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.40625, + "step": 1215, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01194493, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.05474758, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.8082751516232567, + "language_loss": 0.80984753, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83240259, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1216, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01196444, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.02863717, + "balance_loss_mlp": 1.05746269, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.8100743600713276, + "language_loss": 0.76112509, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78359497, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1217, + "time_per_iteration": 2.513061046600342 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0347029, + "balance_loss_mlp": 1.05546904, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0751842608938142, + "language_loss": 0.86442709, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.375, + "step": 1218, + "time_per_iteration": 2.4514572620391846 + }, + { + "auxiliary_loss_clip": 0.01193593, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.05405331, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.9036635750322874, + "language_loss": 0.86757988, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.8901403, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.3984375, + "step": 1219, + "time_per_iteration": 2.4501893520355225 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01058106, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.05260015, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.320539289810395, + "language_loss": 0.84721315, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86969984, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.375, + "step": 1220, + "time_per_iteration": 2.4651544094085693 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01062531, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.05455709, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.803787378453645, + "language_loss": 0.76840538, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.390625, + "step": 1221, + "time_per_iteration": 2.4643850326538086 + }, + { + "auxiliary_loss_clip": 0.01195957, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.0369482, + "balance_loss_mlp": 1.05698907, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 4.111967976062365, + "language_loss": 0.92201889, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94457251, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.390625, + "step": 1222, + "time_per_iteration": 2.461393117904663 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.03440046, + "balance_loss_mlp": 1.05795276, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.739326433562924, + "language_loss": 0.91106719, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9336018, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1223, + "time_per_iteration": 2.4616212844848633 + }, + { + "auxiliary_loss_clip": 0.01194512, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.05628467, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.5538951271380395, + "language_loss": 0.81946027, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84211743, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3828125, + "step": 1224, + "time_per_iteration": 2.555060386657715 + }, + { + "auxiliary_loss_clip": 0.01191919, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.05385065, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.209826315991058, + "language_loss": 0.83313572, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8555935, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.375, + "step": 1225, + "time_per_iteration": 2.5317656993865967 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.0300144, + "balance_loss_mlp": 1.05566537, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.0864455990649144, + "language_loss": 0.9037565, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92621917, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3828125, + "step": 1226, + "time_per_iteration": 5.374137878417969 + }, + { + "auxiliary_loss_clip": 0.01201048, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.05401981, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8833434676543, + "language_loss": 0.76944947, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1227, + "time_per_iteration": 2.4528942108154297 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02720916, + "balance_loss_mlp": 1.05810142, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6041059240123434, + "language_loss": 0.85634637, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87876499, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.34375, + "step": 1228, + "time_per_iteration": 2.5452229976654053 + }, + { + "auxiliary_loss_clip": 0.01194537, + "auxiliary_loss_mlp": 0.01061009, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.05448794, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 4.251776538682485, + "language_loss": 0.79688829, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81944382, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3984375, + "step": 1229, + "time_per_iteration": 2.501086711883545 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.05632436, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.028375336194412, + "language_loss": 0.78218549, + "learning_rate": 3.979771170004287e-06, + "loss": 0.8047595, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3984375, + "step": 1230, + "time_per_iteration": 2.4474098682403564 + }, + { + "auxiliary_loss_clip": 0.01193092, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.02554393, + "balance_loss_mlp": 1.05599403, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.924374124094053, + "language_loss": 0.81301343, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83543187, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1231, + "time_per_iteration": 2.4861042499542236 + }, + { + "auxiliary_loss_clip": 0.01198041, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.04277539, + "balance_loss_mlp": 1.05443811, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.4882746298902343, + "language_loss": 0.95111585, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97376096, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1232, + "time_per_iteration": 2.5074143409729004 + }, + { + "auxiliary_loss_clip": 0.01194092, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.03938031, + "balance_loss_mlp": 1.05667329, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.246534337547551, + "language_loss": 0.80640733, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82895458, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1233, + "time_per_iteration": 2.490816831588745 + }, + { + "auxiliary_loss_clip": 0.01198611, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.05483365, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.357402762223285, + "language_loss": 0.70458734, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72717696, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1234, + "time_per_iteration": 2.605139970779419 + }, + { + "auxiliary_loss_clip": 0.01195848, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.03665543, + "balance_loss_mlp": 1.05792761, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1034220776692765, + "language_loss": 0.77058101, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79313564, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3828125, + "step": 1235, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01189622, + "auxiliary_loss_mlp": 0.01053872, + "balance_loss_clip": 1.03123438, + "balance_loss_mlp": 1.05414248, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 5.584514149172867, + "language_loss": 0.82648033, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84891528, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1236, + "time_per_iteration": 2.462069511413574 + }, + { + "auxiliary_loss_clip": 0.0119681, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03385794, + "balance_loss_mlp": 1.05572712, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.2536643652174724, + "language_loss": 0.75702679, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77956861, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1237, + "time_per_iteration": 2.5572054386138916 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.03817141, + "balance_loss_mlp": 1.05427146, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.878495773650564, + "language_loss": 0.7740556, + "learning_rate": 3.979326750654053e-06, + "loss": 0.7965883, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.3828125, + "step": 1238, + "time_per_iteration": 2.5915493965148926 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.01055743, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.05435395, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.0695087378138455, + "language_loss": 0.86322856, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88576937, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.4375, + "step": 1239, + "time_per_iteration": 2.4961507320404053 + }, + { + "auxiliary_loss_clip": 0.01194884, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02401412, + "balance_loss_mlp": 1.05433989, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.179426429753772, + "language_loss": 0.89070082, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91314042, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.40625, + "step": 1240, + "time_per_iteration": 2.456801176071167 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.03325772, + "balance_loss_mlp": 1.05600643, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.2624482063672513, + "language_loss": 0.88586551, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90842468, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4140625, + "step": 1241, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.018507, + "balance_loss_mlp": 1.02113318, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9233978594431768, + "language_loss": 0.63032585, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65135366, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.59375, + "step": 1242, + "time_per_iteration": 3.1321358680725098 + }, + { + "auxiliary_loss_clip": 0.012088, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.05792046, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.8956100556858004, + "language_loss": 0.62917286, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65185821, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5078125, + "step": 1243, + "time_per_iteration": 2.5571463108062744 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01052045, + "balance_loss_clip": 1.0280956, + "balance_loss_mlp": 1.05710852, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.504235331520048, + "language_loss": 0.76465732, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78713971, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1244, + "time_per_iteration": 2.501621723175049 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03462183, + "balance_loss_mlp": 1.05684423, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 2.8968513367461495, + "language_loss": 0.69149882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.714064, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1245, + "time_per_iteration": 2.417921781539917 + }, + { + "auxiliary_loss_clip": 0.01196347, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.02768707, + "balance_loss_mlp": 1.05663347, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9272496045423029, + "language_loss": 0.88344061, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90592474, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1246, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01205457, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.05656838, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.4755370190447064, + "language_loss": 0.87921643, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90194321, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4921875, + "step": 1247, + "time_per_iteration": 2.4602389335632324 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.03502667, + "balance_loss_mlp": 1.05565107, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2039165223770194, + "language_loss": 0.6477375, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67027843, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3828125, + "step": 1248, + "time_per_iteration": 2.4408388137817383 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.0575254, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.0641418493429713, + "language_loss": 0.73964334, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76219749, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1249, + "time_per_iteration": 2.443767547607422 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01068388, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.05842972, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.607815988938315, + "language_loss": 0.81845009, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84114683, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4296875, + "step": 1250, + "time_per_iteration": 2.491236448287964 + }, + { + "auxiliary_loss_clip": 0.01197565, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.05932856, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.308634463940828, + "language_loss": 0.66713893, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68972456, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1251, + "time_per_iteration": 2.5437874794006348 + }, + { + "auxiliary_loss_clip": 0.0107681, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 0.99946529, + "balance_loss_mlp": 1.02021933, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8978558428983584, + "language_loss": 0.70356798, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72436458, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.56640625, + "step": 1252, + "time_per_iteration": 3.1170923709869385 + }, + { + "auxiliary_loss_clip": 0.01194007, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.03698599, + "balance_loss_mlp": 1.05419612, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9290655276351045, + "language_loss": 0.79516673, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81771958, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3984375, + "step": 1253, + "time_per_iteration": 2.4821414947509766 + }, + { + "auxiliary_loss_clip": 0.01199953, + "auxiliary_loss_mlp": 0.01065033, + "balance_loss_clip": 1.04125071, + "balance_loss_mlp": 1.05829906, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.5751371148477995, + "language_loss": 0.93441045, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95706034, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.421875, + "step": 1254, + "time_per_iteration": 2.4245519638061523 + }, + { + "auxiliary_loss_clip": 0.01191058, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.05566263, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.866823394820361, + "language_loss": 0.88030314, + "learning_rate": 3.97836641143877e-06, + "loss": 0.902834, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1255, + "time_per_iteration": 2.5579185485839844 + }, + { + "auxiliary_loss_clip": 0.01192242, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.04009795, + "balance_loss_mlp": 1.05518413, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.7574194703288544, + "language_loss": 0.79516619, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81773484, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.3671875, + "step": 1256, + "time_per_iteration": 2.4203784465789795 + }, + { + "auxiliary_loss_clip": 0.01074137, + "auxiliary_loss_mlp": 0.01007102, + "balance_loss_clip": 1.00378788, + "balance_loss_mlp": 1.01769829, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.8283025846018472, + "language_loss": 0.58016127, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60097361, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.5625, + "step": 1257, + "time_per_iteration": 3.1732118129730225 + }, + { + "auxiliary_loss_clip": 0.0119581, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.03272927, + "balance_loss_mlp": 1.05982757, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 3.1336739114125107, + "language_loss": 0.89859951, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92112058, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1258, + "time_per_iteration": 2.516925811767578 + }, + { + "auxiliary_loss_clip": 0.01192364, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.05663717, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.28312942247731, + "language_loss": 0.81211507, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458376, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.359375, + "step": 1259, + "time_per_iteration": 2.449533224105835 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.03593481, + "balance_loss_mlp": 1.05662787, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9172803769558988, + "language_loss": 0.75733984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.77986467, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.375, + "step": 1260, + "time_per_iteration": 2.5003559589385986 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03550828, + "balance_loss_mlp": 1.0552032, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8260195606442358, + "language_loss": 0.84695768, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86947775, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1261, + "time_per_iteration": 2.4633476734161377 + }, + { + "auxiliary_loss_clip": 0.01200376, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.03828108, + "balance_loss_mlp": 1.05969536, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.3160282321136334, + "language_loss": 0.8266682, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84928167, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.40625, + "step": 1262, + "time_per_iteration": 2.5256471633911133 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01061195, + "balance_loss_clip": 1.03703153, + "balance_loss_mlp": 1.0540688, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.4581964181262776, + "language_loss": 0.8255769, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84810972, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3828125, + "step": 1263, + "time_per_iteration": 2.470656633377075 + }, + { + "auxiliary_loss_clip": 0.01195735, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.05504882, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.324943057092889, + "language_loss": 0.7591399, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.40625, + "step": 1264, + "time_per_iteration": 2.4715359210968018 + }, + { + "auxiliary_loss_clip": 0.0119596, + "auxiliary_loss_mlp": 0.01062168, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.05711412, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.1997185871944356, + "language_loss": 0.81106204, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83364332, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.390625, + "step": 1265, + "time_per_iteration": 2.440000295639038 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03408241, + "balance_loss_mlp": 1.05631864, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.141616369936441, + "language_loss": 0.64935738, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67187923, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.390625, + "step": 1266, + "time_per_iteration": 2.495001792907715 + }, + { + "auxiliary_loss_clip": 0.01194799, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.05550349, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2514277899416606, + "language_loss": 0.79527593, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81783378, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.390625, + "step": 1267, + "time_per_iteration": 2.4763970375061035 + }, + { + "auxiliary_loss_clip": 0.01194511, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02975869, + "balance_loss_mlp": 1.05526185, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.2740159695832682, + "language_loss": 0.7253381, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74780059, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.390625, + "step": 1268, + "time_per_iteration": 3.8910977840423584 + }, + { + "auxiliary_loss_clip": 0.01192554, + "auxiliary_loss_mlp": 0.01057239, + "balance_loss_clip": 1.03447044, + "balance_loss_mlp": 1.05342031, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.163449384012833, + "language_loss": 0.81891817, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84141612, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.390625, + "step": 1269, + "time_per_iteration": 3.8643741607666016 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.05559695, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 3.2383492700687078, + "language_loss": 0.88135087, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90382218, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1270, + "time_per_iteration": 2.4746575355529785 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.0105921, + "balance_loss_clip": 1.03559494, + "balance_loss_mlp": 1.05707884, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.188682914143081, + "language_loss": 0.71113384, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73370755, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.4140625, + "step": 1271, + "time_per_iteration": 2.529632091522217 + }, + { + "auxiliary_loss_clip": 0.01189637, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_clip": 1.04351556, + "balance_loss_mlp": 1.05675423, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.9018984880968814, + "language_loss": 0.82745486, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85001469, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1272, + "time_per_iteration": 2.4950368404388428 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01061838, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.05351079, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0211474255264643, + "language_loss": 0.79951203, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82204533, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3828125, + "step": 1273, + "time_per_iteration": 2.490281105041504 + }, + { + "auxiliary_loss_clip": 0.01194744, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_clip": 1.03858376, + "balance_loss_mlp": 1.05600715, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.848359088284866, + "language_loss": 0.81545758, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83802712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1274, + "time_per_iteration": 2.499799966812134 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_clip": 1.04430115, + "balance_loss_mlp": 1.05469346, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.991418246716423, + "language_loss": 0.73099387, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75359869, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1275, + "time_per_iteration": 2.557973623275757 + }, + { + "auxiliary_loss_clip": 0.011939, + "auxiliary_loss_mlp": 0.01061514, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.05536842, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1093684912214545, + "language_loss": 0.79584897, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81840312, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.390625, + "step": 1276, + "time_per_iteration": 2.4329752922058105 + }, + { + "auxiliary_loss_clip": 0.01196395, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02838457, + "balance_loss_mlp": 1.05656397, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.623540269613024, + "language_loss": 0.59020305, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61268032, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3984375, + "step": 1277, + "time_per_iteration": 2.5318989753723145 + }, + { + "auxiliary_loss_clip": 0.01200985, + "auxiliary_loss_mlp": 0.01057464, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.05805659, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.2944749333347096, + "language_loss": 0.74846482, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77104926, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.4296875, + "step": 1278, + "time_per_iteration": 2.448615789413452 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_clip": 1.02943182, + "balance_loss_mlp": 1.05475163, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.0999470067777075, + "language_loss": 0.88656616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90898478, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1279, + "time_per_iteration": 2.4883790016174316 + }, + { + "auxiliary_loss_clip": 0.01189256, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_clip": 1.03973901, + "balance_loss_mlp": 1.05507362, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.4596954186847393, + "language_loss": 0.82899994, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1280, + "time_per_iteration": 2.459294319152832 + }, + { + "auxiliary_loss_clip": 0.01188755, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03874409, + "balance_loss_mlp": 1.05492759, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 1.9224222656998016, + "language_loss": 0.76059222, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78309786, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3359375, + "step": 1281, + "time_per_iteration": 2.453183650970459 + }, + { + "auxiliary_loss_clip": 0.0119548, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.05448353, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8937081587754587, + "language_loss": 0.75307631, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77557921, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1282, + "time_per_iteration": 2.4526116847991943 + }, + { + "auxiliary_loss_clip": 0.01190337, + "auxiliary_loss_mlp": 0.01070616, + "balance_loss_clip": 1.04734671, + "balance_loss_mlp": 1.054286, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0304459145795963, + "language_loss": 0.8428033, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86541283, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1283, + "time_per_iteration": 2.468101739883423 + }, + { + "auxiliary_loss_clip": 0.01192768, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.05560803, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.622403612740989, + "language_loss": 0.75031364, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77286887, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1284, + "time_per_iteration": 2.451749801635742 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.03947222, + "balance_loss_mlp": 1.05330253, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6448065546510353, + "language_loss": 0.75934827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78185129, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1285, + "time_per_iteration": 2.664769411087036 + }, + { + "auxiliary_loss_clip": 0.01196484, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.05862105, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8165785508620624, + "language_loss": 0.84204662, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86464012, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.375, + "step": 1286, + "time_per_iteration": 2.550670862197876 + }, + { + "auxiliary_loss_clip": 0.01196192, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.05582845, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 4.521300853065514, + "language_loss": 0.76725763, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1287, + "time_per_iteration": 2.455627918243408 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.03636777, + "balance_loss_mlp": 1.05476642, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6785000972571258, + "language_loss": 0.84509134, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86758095, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1288, + "time_per_iteration": 2.500218629837036 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.05364347, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.192533837519805, + "language_loss": 0.85769016, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88018203, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.390625, + "step": 1289, + "time_per_iteration": 2.4759440422058105 + }, + { + "auxiliary_loss_clip": 0.01189023, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.02563989, + "balance_loss_mlp": 1.05300641, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.8877463184856607, + "language_loss": 0.85053366, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87290049, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1290, + "time_per_iteration": 2.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.01185369, + "auxiliary_loss_mlp": 0.01059291, + "balance_loss_clip": 1.03541303, + "balance_loss_mlp": 1.05397463, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3980248629455834, + "language_loss": 0.90562832, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92807496, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3125, + "step": 1291, + "time_per_iteration": 2.4760262966156006 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01008288, + "balance_loss_clip": 1.00456893, + "balance_loss_mlp": 1.01656318, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.9429671936579762, + "language_loss": 0.64993972, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67073375, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.546875, + "step": 1292, + "time_per_iteration": 3.1508371829986572 + }, + { + "auxiliary_loss_clip": 0.0118873, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.03716707, + "balance_loss_mlp": 1.05293965, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7960778456946043, + "language_loss": 0.87610948, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89858699, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1293, + "time_per_iteration": 2.6359729766845703 + }, + { + "auxiliary_loss_clip": 0.01193413, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.05659533, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.312065886688882, + "language_loss": 0.85111046, + "learning_rate": 3.976081376263239e-06, + "loss": 0.873667, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3671875, + "step": 1294, + "time_per_iteration": 2.5151314735412598 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01054926, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.05702615, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.728225366024782, + "language_loss": 0.79202414, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81451285, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3671875, + "step": 1295, + "time_per_iteration": 2.459510326385498 + }, + { + "auxiliary_loss_clip": 0.01188808, + "auxiliary_loss_mlp": 0.01052849, + "balance_loss_clip": 1.02966261, + "balance_loss_mlp": 1.05383039, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.8222308711400834, + "language_loss": 0.88216382, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90458035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1296, + "time_per_iteration": 2.492892026901245 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01058016, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.05591464, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 3.2140473454082086, + "language_loss": 0.96160841, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98411804, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1297, + "time_per_iteration": 2.4668915271759033 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01054366, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.05289149, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.460261972702069, + "language_loss": 0.76087165, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78331399, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3671875, + "step": 1298, + "time_per_iteration": 2.5059781074523926 + }, + { + "auxiliary_loss_clip": 0.01192131, + "auxiliary_loss_mlp": 0.01061793, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.05696058, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.8752674736144914, + "language_loss": 0.80755305, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83009231, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3515625, + "step": 1299, + "time_per_iteration": 2.5036020278930664 + }, + { + "auxiliary_loss_clip": 0.01183493, + "auxiliary_loss_mlp": 0.01056623, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.05226159, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.1903498852009813, + "language_loss": 0.86459941, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88700056, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1300, + "time_per_iteration": 2.4866278171539307 + }, + { + "auxiliary_loss_clip": 0.0118988, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.0284245, + "balance_loss_mlp": 1.05393028, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.909902293479526, + "language_loss": 0.71778899, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74020839, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.359375, + "step": 1301, + "time_per_iteration": 2.6491336822509766 + }, + { + "auxiliary_loss_clip": 0.01196178, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.0586772, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5624281437346959, + "language_loss": 0.70860815, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1302, + "time_per_iteration": 2.635430335998535 + }, + { + "auxiliary_loss_clip": 0.01188946, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03702378, + "balance_loss_mlp": 1.05438161, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.152945758623263, + "language_loss": 0.8192755, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84177226, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.34375, + "step": 1303, + "time_per_iteration": 2.4861090183258057 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.05351233, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8425530042965788, + "language_loss": 0.7497822, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77228731, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1304, + "time_per_iteration": 2.464087963104248 + }, + { + "auxiliary_loss_clip": 0.01191658, + "auxiliary_loss_mlp": 0.0106237, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.05645108, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.696211405930565, + "language_loss": 0.76397038, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78651059, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.359375, + "step": 1305, + "time_per_iteration": 2.486093521118164 + }, + { + "auxiliary_loss_clip": 0.01192283, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.04087615, + "balance_loss_mlp": 1.05527782, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 2.2926357932273866, + "language_loss": 0.85035503, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87292361, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1306, + "time_per_iteration": 2.496265172958374 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.05652416, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.0575778567802976, + "language_loss": 0.90087706, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92322135, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.34375, + "step": 1307, + "time_per_iteration": 2.5122623443603516 + }, + { + "auxiliary_loss_clip": 0.01189263, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.03295124, + "balance_loss_mlp": 1.05417371, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8540925974151201, + "language_loss": 0.83408689, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85655046, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3515625, + "step": 1308, + "time_per_iteration": 2.4686944484710693 + }, + { + "auxiliary_loss_clip": 0.01186004, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.03177738, + "balance_loss_mlp": 1.05289674, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.6283340971904061, + "language_loss": 0.77841777, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80081415, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.328125, + "step": 1309, + "time_per_iteration": 5.444388151168823 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.05386913, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9656388899868151, + "language_loss": 0.80146122, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82402575, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.40625, + "step": 1310, + "time_per_iteration": 3.8553466796875 + }, + { + "auxiliary_loss_clip": 0.01185305, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.03067899, + "balance_loss_mlp": 1.05544043, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7115323272474947, + "language_loss": 0.73069102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75307012, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1311, + "time_per_iteration": 2.5299458503723145 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_clip": 1.04861844, + "balance_loss_mlp": 1.05650353, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9161215374898264, + "language_loss": 0.85871482, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88134789, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1312, + "time_per_iteration": 2.5490031242370605 + }, + { + "auxiliary_loss_clip": 0.01186476, + "auxiliary_loss_mlp": 0.01059916, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.0555284, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7542323177910393, + "language_loss": 0.81968379, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84214771, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3125, + "step": 1313, + "time_per_iteration": 2.507046699523926 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105996, + "balance_loss_clip": 1.03528404, + "balance_loss_mlp": 1.05271506, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.109477065223649, + "language_loss": 0.73372161, + "learning_rate": 3.97486534441264e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3984375, + "step": 1314, + "time_per_iteration": 2.4396395683288574 + }, + { + "auxiliary_loss_clip": 0.01185115, + "auxiliary_loss_mlp": 0.01058505, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.05120206, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.579996187361532, + "language_loss": 0.79460657, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81704271, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.34375, + "step": 1315, + "time_per_iteration": 2.493365526199341 + }, + { + "auxiliary_loss_clip": 0.011877, + "auxiliary_loss_mlp": 0.01060931, + "balance_loss_clip": 1.03592062, + "balance_loss_mlp": 1.05232, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9411836832725016, + "language_loss": 0.73614991, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1316, + "time_per_iteration": 2.4696316719055176 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.03940618, + "balance_loss_mlp": 1.05415511, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.862910173072837, + "language_loss": 0.65148681, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67404836, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.390625, + "step": 1317, + "time_per_iteration": 2.447847843170166 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.05774999, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.3478172138868967, + "language_loss": 0.7324174, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75502789, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1318, + "time_per_iteration": 2.497406482696533 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01057875, + "balance_loss_clip": 1.03557122, + "balance_loss_mlp": 1.05335736, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.92969491679129, + "language_loss": 0.90610284, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92856491, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3515625, + "step": 1319, + "time_per_iteration": 2.5007200241088867 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01054126, + "balance_loss_clip": 1.03086793, + "balance_loss_mlp": 1.05155873, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.95797867210378, + "language_loss": 0.79765761, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82008684, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1320, + "time_per_iteration": 2.4683783054351807 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.05700457, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6163787894008363, + "language_loss": 0.69574934, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71822894, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.34375, + "step": 1321, + "time_per_iteration": 2.466911554336548 + }, + { + "auxiliary_loss_clip": 0.01184231, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.03313756, + "balance_loss_mlp": 1.05313718, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.926313653502779, + "language_loss": 0.83559513, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.857997, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1322, + "time_per_iteration": 2.465885639190674 + }, + { + "auxiliary_loss_clip": 0.01188233, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.03544521, + "balance_loss_mlp": 1.05104756, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8863777031262867, + "language_loss": 0.90437615, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92684615, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1323, + "time_per_iteration": 2.465841293334961 + }, + { + "auxiliary_loss_clip": 0.0118735, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.05414796, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.6454981938510795, + "language_loss": 0.82583225, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84827733, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.328125, + "step": 1324, + "time_per_iteration": 2.475486993789673 + }, + { + "auxiliary_loss_clip": 0.01188398, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0255841, + "balance_loss_mlp": 1.05264676, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.416918252865386, + "language_loss": 0.79654729, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81892562, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.359375, + "step": 1325, + "time_per_iteration": 2.482555389404297 + }, + { + "auxiliary_loss_clip": 0.01190127, + "auxiliary_loss_mlp": 0.01064919, + "balance_loss_clip": 1.03989661, + "balance_loss_mlp": 1.05474687, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.170521767048619, + "language_loss": 0.8812806, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90383106, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1326, + "time_per_iteration": 2.466742753982544 + }, + { + "auxiliary_loss_clip": 0.01182901, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.02823424, + "balance_loss_mlp": 1.05014396, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.3992518634606164, + "language_loss": 0.83013594, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8524875, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.328125, + "step": 1327, + "time_per_iteration": 2.4989237785339355 + }, + { + "auxiliary_loss_clip": 0.0119143, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.03013575, + "balance_loss_mlp": 1.05436027, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.1664091533416587, + "language_loss": 0.78452092, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80697763, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.375, + "step": 1328, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01191637, + "auxiliary_loss_mlp": 0.01053331, + "balance_loss_clip": 1.02969217, + "balance_loss_mlp": 1.05460131, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.484533735051083, + "language_loss": 0.74277186, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76522154, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.375, + "step": 1329, + "time_per_iteration": 2.425388813018799 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.05096054, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.5753219993175995, + "language_loss": 0.81090498, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83336312, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3515625, + "step": 1330, + "time_per_iteration": 2.4831247329711914 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.03924823, + "balance_loss_mlp": 1.05348384, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.322034822225311, + "language_loss": 0.88790143, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91043401, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1331, + "time_per_iteration": 2.4410722255706787 + }, + { + "auxiliary_loss_clip": 0.01193336, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.03414834, + "balance_loss_mlp": 1.05288279, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.577873328737783, + "language_loss": 0.73332524, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75584114, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.40625, + "step": 1332, + "time_per_iteration": 2.6054465770721436 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.05179858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.9568005204239032, + "language_loss": 0.82994795, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1333, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01187412, + "auxiliary_loss_mlp": 0.01055323, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.05115032, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.7771179443818466, + "language_loss": 0.74698973, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76941711, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1334, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01187182, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.05457497, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.0216765528325635, + "language_loss": 0.80279201, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82527244, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1335, + "time_per_iteration": 2.538670301437378 + }, + { + "auxiliary_loss_clip": 0.01078994, + "auxiliary_loss_mlp": 0.01011272, + "balance_loss_clip": 1.00802934, + "balance_loss_mlp": 1.02308655, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7427722697577622, + "language_loss": 0.56020629, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58110893, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.5625, + "step": 1336, + "time_per_iteration": 3.125026226043701 + }, + { + "auxiliary_loss_clip": 0.01188939, + "auxiliary_loss_mlp": 0.01054834, + "balance_loss_clip": 1.0331738, + "balance_loss_mlp": 1.05371606, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.050916847484745, + "language_loss": 0.67764497, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70008272, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3515625, + "step": 1337, + "time_per_iteration": 2.506103038787842 + }, + { + "auxiliary_loss_clip": 0.01188826, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.04313135, + "balance_loss_mlp": 1.05480385, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8692893317328456, + "language_loss": 0.86701488, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88955414, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1338, + "time_per_iteration": 2.5451908111572266 + }, + { + "auxiliary_loss_clip": 0.01188004, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.0368793, + "balance_loss_mlp": 1.05142283, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.6265473040924725, + "language_loss": 0.87246621, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89494807, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.359375, + "step": 1339, + "time_per_iteration": 2.450932502746582 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02660179, + "balance_loss_mlp": 1.05106449, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.428441908593999, + "language_loss": 0.88819683, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91048771, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1340, + "time_per_iteration": 2.4539895057678223 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 0.99951285, + "balance_loss_mlp": 1.01727247, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8886760882983712, + "language_loss": 0.64806795, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66882515, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.5546875, + "step": 1341, + "time_per_iteration": 3.0034360885620117 + }, + { + "auxiliary_loss_clip": 0.01193907, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03577328, + "balance_loss_mlp": 1.05301166, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.817345215565239, + "language_loss": 0.89616883, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91871732, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1342, + "time_per_iteration": 2.479701042175293 + }, + { + "auxiliary_loss_clip": 0.01194936, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.05721259, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.7453135307928216, + "language_loss": 0.76378155, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78631246, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.375, + "step": 1343, + "time_per_iteration": 2.4969120025634766 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01008036, + "balance_loss_clip": 1.00446022, + "balance_loss_mlp": 1.01791215, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8963318804352591, + "language_loss": 0.57395822, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59476054, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.54296875, + "step": 1344, + "time_per_iteration": 2.9917871952056885 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.0105771, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.05523396, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.677643541218582, + "language_loss": 0.86665964, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88914657, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1345, + "time_per_iteration": 2.4601447582244873 + }, + { + "auxiliary_loss_clip": 0.01187459, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.05403256, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7098835991166323, + "language_loss": 0.87242532, + "learning_rate": 3.972857395313042e-06, + "loss": 0.894849, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1346, + "time_per_iteration": 2.4809892177581787 + }, + { + "auxiliary_loss_clip": 0.01185898, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.03256202, + "balance_loss_mlp": 1.05219567, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6659805361601863, + "language_loss": 0.92606491, + "learning_rate": 3.972793412113439e-06, + "loss": 0.94847363, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3359375, + "step": 1347, + "time_per_iteration": 2.4802379608154297 + }, + { + "auxiliary_loss_clip": 0.0118757, + "auxiliary_loss_mlp": 0.01057822, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.05471659, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 9.453605004454174, + "language_loss": 0.89181751, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91427147, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.328125, + "step": 1348, + "time_per_iteration": 2.4610300064086914 + }, + { + "auxiliary_loss_clip": 0.01185296, + "auxiliary_loss_mlp": 0.01056008, + "balance_loss_clip": 1.03420484, + "balance_loss_mlp": 1.05543983, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 2.4916215003739355, + "language_loss": 0.76796132, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7903744, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.296875, + "step": 1349, + "time_per_iteration": 2.4789178371429443 + }, + { + "auxiliary_loss_clip": 0.01187103, + "auxiliary_loss_mlp": 0.0105974, + "balance_loss_clip": 1.03669679, + "balance_loss_mlp": 1.05236626, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.126949034470324, + "language_loss": 0.88571703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90818548, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.34375, + "step": 1350, + "time_per_iteration": 2.43094539642334 + }, + { + "auxiliary_loss_clip": 0.01184059, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.05228257, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.044220866897066, + "language_loss": 0.82058489, + "learning_rate": 3.972536731254092e-06, + "loss": 0.843036, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1351, + "time_per_iteration": 6.688653469085693 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.02917862, + "balance_loss_mlp": 1.04863417, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.9894600711485977, + "language_loss": 0.75347674, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77585584, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.359375, + "step": 1352, + "time_per_iteration": 2.4888412952423096 + }, + { + "auxiliary_loss_clip": 0.01192461, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.03163338, + "balance_loss_mlp": 1.05483341, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.7603053493114211, + "language_loss": 0.82833469, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85081488, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1353, + "time_per_iteration": 2.522960901260376 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01010471, + "balance_loss_clip": 1.00694275, + "balance_loss_mlp": 1.01996851, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8931676068679675, + "language_loss": 0.5970993, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61793786, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.53125, + "step": 1354, + "time_per_iteration": 3.0639474391937256 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.03764629, + "balance_loss_mlp": 1.05431724, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7981329827127455, + "language_loss": 0.82785606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85033101, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1355, + "time_per_iteration": 2.4664132595062256 + }, + { + "auxiliary_loss_clip": 0.01186535, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.03619206, + "balance_loss_mlp": 1.05146575, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9123465925299232, + "language_loss": 0.70799643, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73048234, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3515625, + "step": 1356, + "time_per_iteration": 2.509061813354492 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.03169644, + "balance_loss_mlp": 1.05148005, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.53580294551395, + "language_loss": 0.70255458, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72499657, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3671875, + "step": 1357, + "time_per_iteration": 2.476951837539673 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_clip": 1.03067684, + "balance_loss_mlp": 1.05488217, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.6163823683714953, + "language_loss": 0.84186697, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86431682, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1358, + "time_per_iteration": 2.457376480102539 + }, + { + "auxiliary_loss_clip": 0.01190093, + "auxiliary_loss_mlp": 0.01056216, + "balance_loss_clip": 1.0310626, + "balance_loss_mlp": 1.05484545, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9894839389786314, + "language_loss": 1.02294087, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04540396, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3515625, + "step": 1359, + "time_per_iteration": 2.4723212718963623 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03344035, + "balance_loss_mlp": 1.0511415, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0666688933075963, + "language_loss": 0.82969773, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85212988, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1360, + "time_per_iteration": 2.5143508911132812 + }, + { + "auxiliary_loss_clip": 0.01190184, + "auxiliary_loss_mlp": 0.01062181, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.05335808, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.14797754608813, + "language_loss": 0.72352278, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74604642, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3671875, + "step": 1361, + "time_per_iteration": 2.458034038543701 + }, + { + "auxiliary_loss_clip": 0.01179057, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.04741335, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.8589819193374515, + "language_loss": 0.76781029, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79017377, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.3125, + "step": 1362, + "time_per_iteration": 2.472259759902954 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.0291419, + "balance_loss_mlp": 1.05449164, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.631594675791475, + "language_loss": 0.72409523, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74649096, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1363, + "time_per_iteration": 2.4447264671325684 + }, + { + "auxiliary_loss_clip": 0.01189235, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.03603828, + "balance_loss_mlp": 1.05607057, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 3.9166951523525464, + "language_loss": 0.77459586, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79710352, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.328125, + "step": 1364, + "time_per_iteration": 2.475141763687134 + }, + { + "auxiliary_loss_clip": 0.01190144, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.03745019, + "balance_loss_mlp": 1.05500793, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.6241179536013033, + "language_loss": 0.82025397, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1365, + "time_per_iteration": 2.493732452392578 + }, + { + "auxiliary_loss_clip": 0.0118713, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.05614781, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.3261283913074884, + "language_loss": 0.82173789, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84418333, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1366, + "time_per_iteration": 2.4809322357177734 + }, + { + "auxiliary_loss_clip": 0.01186928, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.05126381, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.8403828718649033, + "language_loss": 0.81534755, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83780599, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1367, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.03724277, + "balance_loss_mlp": 1.05413651, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3540874203263358, + "language_loss": 0.83644414, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85897589, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3828125, + "step": 1368, + "time_per_iteration": 2.453547716140747 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02694988, + "balance_loss_mlp": 1.05349994, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7360129433802456, + "language_loss": 0.81245828, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83476603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.28125, + "step": 1369, + "time_per_iteration": 2.527573585510254 + }, + { + "auxiliary_loss_clip": 0.01185735, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.02979898, + "balance_loss_mlp": 1.05528903, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.240857135161324, + "language_loss": 0.74790901, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77027786, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3046875, + "step": 1370, + "time_per_iteration": 2.5205185413360596 + }, + { + "auxiliary_loss_clip": 0.01189372, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.05480862, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6313231263601415, + "language_loss": 0.74633086, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76883852, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1371, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.01188254, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.05410123, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.0830704741847423, + "language_loss": 0.71080554, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73330408, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.34375, + "step": 1372, + "time_per_iteration": 2.574457883834839 + }, + { + "auxiliary_loss_clip": 0.0118845, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02750254, + "balance_loss_mlp": 1.05397415, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 3.137320584176607, + "language_loss": 0.88010907, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90251154, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.34375, + "step": 1373, + "time_per_iteration": 2.485727310180664 + }, + { + "auxiliary_loss_clip": 0.01186594, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.05331743, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7727067520163604, + "language_loss": 0.82349706, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84595209, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.328125, + "step": 1374, + "time_per_iteration": 2.5223724842071533 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.0032891, + "balance_loss_mlp": 1.02371156, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8248734910296001, + "language_loss": 0.60630989, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62714875, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.5234375, + "step": 1375, + "time_per_iteration": 3.0909183025360107 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.01006319, + "balance_loss_clip": 1.00221813, + "balance_loss_mlp": 1.02162504, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9071425511101782, + "language_loss": 0.62149519, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64230067, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.52734375, + "step": 1376, + "time_per_iteration": 2.991158962249756 + }, + { + "auxiliary_loss_clip": 0.01195866, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_clip": 1.04624534, + "balance_loss_mlp": 1.05995989, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9826192893196872, + "language_loss": 0.82601643, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84866917, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.359375, + "step": 1377, + "time_per_iteration": 2.5851728916168213 + }, + { + "auxiliary_loss_clip": 0.01188463, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.05601847, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.8760965133588865, + "language_loss": 0.84516692, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86762691, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1378, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.05516553, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.9551783234852504, + "language_loss": 0.87725681, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89978123, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3515625, + "step": 1379, + "time_per_iteration": 2.5428385734558105 + }, + { + "auxiliary_loss_clip": 0.01189534, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.03336358, + "balance_loss_mlp": 1.05776525, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 1.7573789229703745, + "language_loss": 0.78658688, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80904275, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1380, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.01061393, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.05878401, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2395713763978002, + "language_loss": 0.86146504, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88398302, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.3125, + "step": 1381, + "time_per_iteration": 2.470153331756592 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01060106, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.06063581, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.795546136319442, + "language_loss": 0.8817445, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90433335, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1382, + "time_per_iteration": 2.4352822303771973 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.0569818, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.6234570747150734, + "language_loss": 0.77606535, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79856908, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.34375, + "step": 1383, + "time_per_iteration": 2.45939040184021 + }, + { + "auxiliary_loss_clip": 0.01194291, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.05730414, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.1508484512905945, + "language_loss": 0.8293128, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85181862, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1384, + "time_per_iteration": 2.4773356914520264 + }, + { + "auxiliary_loss_clip": 0.01198678, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.02937245, + "balance_loss_mlp": 1.05890989, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.4890613364481893, + "language_loss": 0.84828049, + "learning_rate": 3.970306639845e-06, + "loss": 0.87081897, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3984375, + "step": 1385, + "time_per_iteration": 2.5084009170532227 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01066074, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.05825758, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.123672194513448, + "language_loss": 0.68744183, + "learning_rate": 3.970239740938835e-06, + "loss": 0.7100516, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3671875, + "step": 1386, + "time_per_iteration": 2.477592945098877 + }, + { + "auxiliary_loss_clip": 0.01191265, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03186047, + "balance_loss_mlp": 1.05579662, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7726596290820096, + "language_loss": 0.82067239, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84314626, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.359375, + "step": 1387, + "time_per_iteration": 2.529261350631714 + }, + { + "auxiliary_loss_clip": 0.01196512, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.05739772, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.385304875072474, + "language_loss": 0.77194649, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79461324, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.390625, + "step": 1388, + "time_per_iteration": 2.4517693519592285 + }, + { + "auxiliary_loss_clip": 0.01187734, + "auxiliary_loss_mlp": 0.01059717, + "balance_loss_clip": 1.0351125, + "balance_loss_mlp": 1.0574429, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.246368739161805, + "language_loss": 0.79078835, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81326282, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3046875, + "step": 1389, + "time_per_iteration": 2.4999983310699463 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.03368866, + "balance_loss_mlp": 1.05773938, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 4.533904477221136, + "language_loss": 0.87495124, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89746046, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.359375, + "step": 1390, + "time_per_iteration": 2.438126802444458 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.05621624, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6928828016377326, + "language_loss": 0.86753631, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89007682, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.34375, + "step": 1391, + "time_per_iteration": 2.5615429878234863 + }, + { + "auxiliary_loss_clip": 0.01198327, + "auxiliary_loss_mlp": 0.01071606, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.05904424, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 4.090701354718017, + "language_loss": 0.87550449, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89820385, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1392, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.03682983, + "balance_loss_mlp": 1.05556941, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.9857894096842457, + "language_loss": 0.80519998, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82771087, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1393, + "time_per_iteration": 3.9978342056274414 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01054176, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.05832088, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8413427873168604, + "language_loss": 0.84738398, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86984503, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3359375, + "step": 1394, + "time_per_iteration": 3.995389461517334 + }, + { + "auxiliary_loss_clip": 0.01193271, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.0296433, + "balance_loss_mlp": 1.05856824, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.7688902284368797, + "language_loss": 0.82957625, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85204601, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1395, + "time_per_iteration": 2.5080416202545166 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.03683722, + "balance_loss_mlp": 1.05833054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9626395114639965, + "language_loss": 0.82492781, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84750068, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3671875, + "step": 1396, + "time_per_iteration": 2.51763653755188 + }, + { + "auxiliary_loss_clip": 0.01191589, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.03253114, + "balance_loss_mlp": 1.05944824, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3756879295671367, + "language_loss": 0.7702114, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79271495, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.3203125, + "step": 1397, + "time_per_iteration": 2.522019624710083 + }, + { + "auxiliary_loss_clip": 0.01191257, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02635193, + "balance_loss_mlp": 1.05688787, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1333990758799795, + "language_loss": 0.77589226, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79831308, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.34375, + "step": 1398, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.03366995, + "balance_loss_mlp": 1.05604136, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 6.547707007931562, + "language_loss": 0.94411373, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96655744, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3125, + "step": 1399, + "time_per_iteration": 2.458564043045044 + }, + { + "auxiliary_loss_clip": 0.01192876, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.05564523, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3313569082148637, + "language_loss": 0.82052553, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84306407, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1400, + "time_per_iteration": 2.511075258255005 + }, + { + "auxiliary_loss_clip": 0.01191821, + "auxiliary_loss_mlp": 0.01061122, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.05681479, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.6029570836648723, + "language_loss": 0.86615682, + "learning_rate": 3.969227293371099e-06, + "loss": 0.8886863, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1401, + "time_per_iteration": 2.5328855514526367 + }, + { + "auxiliary_loss_clip": 0.01190636, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.05496573, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2778357332658543, + "language_loss": 0.87128234, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89382625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1402, + "time_per_iteration": 2.4695520401000977 + }, + { + "auxiliary_loss_clip": 0.0118725, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.03340352, + "balance_loss_mlp": 1.0542388, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.954964391273458, + "language_loss": 0.88680542, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90924418, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.328125, + "step": 1403, + "time_per_iteration": 2.6655161380767822 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01056388, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.05429792, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.9645692036725415, + "language_loss": 0.80325729, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82571673, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1404, + "time_per_iteration": 2.5011603832244873 + }, + { + "auxiliary_loss_clip": 0.01195719, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_clip": 1.04089534, + "balance_loss_mlp": 1.05798006, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1059643070764027, + "language_loss": 0.83845061, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86106849, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1405, + "time_per_iteration": 2.4612858295440674 + }, + { + "auxiliary_loss_clip": 0.01188265, + "auxiliary_loss_mlp": 0.01056168, + "balance_loss_clip": 1.03314888, + "balance_loss_mlp": 1.05381966, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.7581309060245893, + "language_loss": 0.80343008, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82587439, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.34375, + "step": 1406, + "time_per_iteration": 2.496676206588745 + }, + { + "auxiliary_loss_clip": 0.01192497, + "auxiliary_loss_mlp": 0.01065969, + "balance_loss_clip": 1.0421989, + "balance_loss_mlp": 1.05858994, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8241253914082192, + "language_loss": 0.79411483, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3359375, + "step": 1407, + "time_per_iteration": 2.491055727005005 + }, + { + "auxiliary_loss_clip": 0.01188371, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.03234673, + "balance_loss_mlp": 1.05521655, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 4.541456574357825, + "language_loss": 0.91929626, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94173807, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.328125, + "step": 1408, + "time_per_iteration": 2.44599986076355 + }, + { + "auxiliary_loss_clip": 0.01074059, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.02074611, + "balance_loss_mlp": 1.02193737, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8980094129226197, + "language_loss": 0.61861706, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63960779, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.5234375, + "step": 1409, + "time_per_iteration": 3.1084799766540527 + }, + { + "auxiliary_loss_clip": 0.01180993, + "auxiliary_loss_mlp": 0.01060196, + "balance_loss_clip": 1.03784466, + "balance_loss_mlp": 1.05419254, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.25814404402445, + "language_loss": 0.86819237, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89060426, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.265625, + "step": 1410, + "time_per_iteration": 2.4854791164398193 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.0309782, + "balance_loss_mlp": 1.05453801, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.048224684561652, + "language_loss": 0.74138093, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76383173, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3359375, + "step": 1411, + "time_per_iteration": 2.484879970550537 + }, + { + "auxiliary_loss_clip": 0.01068033, + "auxiliary_loss_mlp": 0.01005767, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.01640451, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9041737870208939, + "language_loss": 0.56723791, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58797586, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.515625, + "step": 1412, + "time_per_iteration": 3.003227949142456 + }, + { + "auxiliary_loss_clip": 0.01183878, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 1.05354273, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.0338814511208883, + "language_loss": 0.89084172, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91330159, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3046875, + "step": 1413, + "time_per_iteration": 2.4545698165893555 + }, + { + "auxiliary_loss_clip": 0.01186591, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03019929, + "balance_loss_mlp": 1.0562067, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1859301398641415, + "language_loss": 0.8807795, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90319026, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3046875, + "step": 1414, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01056984, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.0540117, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.82577143383273, + "language_loss": 0.77434587, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79677355, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3125, + "step": 1415, + "time_per_iteration": 2.510671615600586 + }, + { + "auxiliary_loss_clip": 0.01185616, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.05612898, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.848223104879299, + "language_loss": 0.70859981, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73111296, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.296875, + "step": 1416, + "time_per_iteration": 2.827016592025757 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.05693281, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 1.9370001986884609, + "language_loss": 0.74855268, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77108514, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1417, + "time_per_iteration": 2.51518177986145 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.03157723, + "balance_loss_mlp": 1.05394006, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.566029486363868, + "language_loss": 0.82460356, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84700227, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3046875, + "step": 1418, + "time_per_iteration": 2.4632515907287598 + }, + { + "auxiliary_loss_clip": 0.01078096, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.02221191, + "balance_loss_mlp": 1.0269177, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8662062784105238, + "language_loss": 0.56616145, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58720386, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.51171875, + "step": 1419, + "time_per_iteration": 3.0262646675109863 + }, + { + "auxiliary_loss_clip": 0.01185611, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_clip": 1.03858972, + "balance_loss_mlp": 1.05284262, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.301787344693911, + "language_loss": 0.69764268, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72012818, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.328125, + "step": 1420, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01182824, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02912498, + "balance_loss_mlp": 1.05232763, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.56579546013663, + "language_loss": 0.87886292, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90121067, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1421, + "time_per_iteration": 2.498198986053467 + }, + { + "auxiliary_loss_clip": 0.01069987, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.01909983, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7935144939089421, + "language_loss": 0.63490081, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65564084, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.5078125, + "step": 1422, + "time_per_iteration": 3.050874948501587 + }, + { + "auxiliary_loss_clip": 0.01182797, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.04182768, + "balance_loss_mlp": 1.05538559, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.040119561169685, + "language_loss": 0.83427018, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85674852, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1423, + "time_per_iteration": 2.525075674057007 + }, + { + "auxiliary_loss_clip": 0.01190455, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.05613029, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.7627385415604107, + "language_loss": 0.74945033, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77194929, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1424, + "time_per_iteration": 2.523231029510498 + }, + { + "auxiliary_loss_clip": 0.01185893, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.05510807, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9395290082560723, + "language_loss": 0.7574805, + "learning_rate": 3.96757243383196e-06, + "loss": 0.7799021, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1425, + "time_per_iteration": 2.441420793533325 + }, + { + "auxiliary_loss_clip": 0.01183386, + "auxiliary_loss_mlp": 0.01053965, + "balance_loss_clip": 1.03092194, + "balance_loss_mlp": 1.05407834, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.579491371045568, + "language_loss": 0.93504989, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95742333, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1426, + "time_per_iteration": 2.4703657627105713 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.05764198, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.235647808517122, + "language_loss": 0.75003266, + "learning_rate": 3.967432588494471e-06, + "loss": 0.772614, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.34375, + "step": 1427, + "time_per_iteration": 2.4430549144744873 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01061112, + "balance_loss_clip": 1.03907049, + "balance_loss_mlp": 1.05315089, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.3372587699614726, + "language_loss": 0.81915152, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84158677, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1428, + "time_per_iteration": 2.454441785812378 + }, + { + "auxiliary_loss_clip": 0.01189987, + "auxiliary_loss_mlp": 0.01066735, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.05586076, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.395570851050941, + "language_loss": 0.79697371, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81954098, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.34375, + "step": 1429, + "time_per_iteration": 2.5411579608917236 + }, + { + "auxiliary_loss_clip": 0.0119024, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.05773449, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.301464625204156, + "language_loss": 0.88055587, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90308148, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1430, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01184535, + "auxiliary_loss_mlp": 0.01072949, + "balance_loss_clip": 1.04995334, + "balance_loss_mlp": 1.05712664, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7504719201320615, + "language_loss": 0.81914723, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84172201, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2734375, + "step": 1431, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01184756, + "auxiliary_loss_mlp": 0.01056491, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.05376828, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.9949655353101803, + "language_loss": 0.77759397, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80000651, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1432, + "time_per_iteration": 2.5344104766845703 + }, + { + "auxiliary_loss_clip": 0.0118072, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.03497803, + "balance_loss_mlp": 1.05027151, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.2873036973179603, + "language_loss": 0.73330259, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75570011, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3046875, + "step": 1433, + "time_per_iteration": 2.4787938594818115 + }, + { + "auxiliary_loss_clip": 0.01188497, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.05464733, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.615593579271415, + "language_loss": 0.85741955, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87989259, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3359375, + "step": 1434, + "time_per_iteration": 5.500946998596191 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.05177212, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 3.0513138823403825, + "language_loss": 0.78913063, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81149966, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1435, + "time_per_iteration": 3.899777412414551 + }, + { + "auxiliary_loss_clip": 0.01070575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02010655, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8910926846424677, + "language_loss": 0.57930011, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60028332, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.5078125, + "step": 1436, + "time_per_iteration": 3.179255247116089 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.02633083, + "balance_loss_mlp": 1.05314159, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.429993259280604, + "language_loss": 0.68775386, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71010828, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.328125, + "step": 1437, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.01185365, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.02806163, + "balance_loss_mlp": 1.05388093, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.5641138848438163, + "language_loss": 0.7274068, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74976349, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3125, + "step": 1438, + "time_per_iteration": 2.4840176105499268 + }, + { + "auxiliary_loss_clip": 0.01183596, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_clip": 1.02594447, + "balance_loss_mlp": 1.05472374, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.681614476681305, + "language_loss": 0.64628494, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66861117, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2890625, + "step": 1439, + "time_per_iteration": 2.61686372756958 + }, + { + "auxiliary_loss_clip": 0.01187197, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.05638909, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.062065757985673, + "language_loss": 0.87748063, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89990479, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3125, + "step": 1440, + "time_per_iteration": 2.5116493701934814 + }, + { + "auxiliary_loss_clip": 0.01188419, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.03538251, + "balance_loss_mlp": 1.0540843, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.4102507257620363, + "language_loss": 0.83243793, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491961, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1441, + "time_per_iteration": 2.5058300495147705 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01009256, + "balance_loss_clip": 1.00525022, + "balance_loss_mlp": 1.01939523, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8461220926791603, + "language_loss": 0.60426581, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62505859, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.5078125, + "step": 1442, + "time_per_iteration": 3.1946628093719482 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01057232, + "balance_loss_clip": 1.03379524, + "balance_loss_mlp": 1.05709028, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.2809405592870835, + "language_loss": 0.79264277, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81513512, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.3515625, + "step": 1443, + "time_per_iteration": 2.477691411972046 + }, + { + "auxiliary_loss_clip": 0.01185255, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.03170311, + "balance_loss_mlp": 1.05261874, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.764762918327591, + "language_loss": 0.82248437, + "learning_rate": 3.966231856532584e-06, + "loss": 0.8448779, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1444, + "time_per_iteration": 2.584773063659668 + }, + { + "auxiliary_loss_clip": 0.01189581, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.02745867, + "balance_loss_mlp": 1.05537939, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.945627197742621, + "language_loss": 0.86856627, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89096129, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1445, + "time_per_iteration": 2.506258964538574 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01054326, + "balance_loss_clip": 1.03303528, + "balance_loss_mlp": 1.05808067, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9763924186655837, + "language_loss": 0.81639445, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8388319, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.3125, + "step": 1446, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.01005416, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.0147202, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.728477241136595, + "language_loss": 0.54725462, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56795579, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.5, + "step": 1447, + "time_per_iteration": 3.1009976863861084 + }, + { + "auxiliary_loss_clip": 0.01178637, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.03104973, + "balance_loss_mlp": 1.05198455, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.2332818090387243, + "language_loss": 0.84593046, + "learning_rate": 3.965946199367804e-06, + "loss": 0.8682456, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1448, + "time_per_iteration": 2.483792543411255 + }, + { + "auxiliary_loss_clip": 0.01185215, + "auxiliary_loss_mlp": 0.01056358, + "balance_loss_clip": 1.03386295, + "balance_loss_mlp": 1.0524509, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.099884448391289, + "language_loss": 0.80688727, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82930297, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1449, + "time_per_iteration": 2.4637081623077393 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.02862501, + "balance_loss_mlp": 1.05370414, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 4.183651889411507, + "language_loss": 0.71012592, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73244655, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1450, + "time_per_iteration": 2.6521542072296143 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.03057098, + "balance_loss_mlp": 1.05502534, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.8266796466048172, + "language_loss": 0.83492875, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85729253, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1451, + "time_per_iteration": 2.4866271018981934 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.05371869, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.850339391564711, + "language_loss": 0.74351519, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76589811, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2734375, + "step": 1452, + "time_per_iteration": 2.5450925827026367 + }, + { + "auxiliary_loss_clip": 0.01182798, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_clip": 1.03840256, + "balance_loss_mlp": 1.05121017, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.3421371051734474, + "language_loss": 0.79840016, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82084292, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1453, + "time_per_iteration": 2.49350643157959 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_clip": 1.04213262, + "balance_loss_mlp": 1.0545752, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.982640213979625, + "language_loss": 0.71298045, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73545539, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.28125, + "step": 1454, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02608728, + "balance_loss_mlp": 1.02026677, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7993884765543664, + "language_loss": 0.58655661, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6075514, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.0300293, + "router_z_loss_mlp": 0.5, + "step": 1455, + "time_per_iteration": 3.088113307952881 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05210626, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5590098662562957, + "language_loss": 0.77404714, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79646254, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3046875, + "step": 1456, + "time_per_iteration": 2.6145191192626953 + }, + { + "auxiliary_loss_clip": 0.01182283, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.02888715, + "balance_loss_mlp": 1.05235434, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.3657198267749777, + "language_loss": 0.72391665, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74625528, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1457, + "time_per_iteration": 2.6438605785369873 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.02623844, + "balance_loss_mlp": 1.05207849, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5929331180335078, + "language_loss": 0.86215973, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88442671, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1458, + "time_per_iteration": 2.539658546447754 + }, + { + "auxiliary_loss_clip": 0.01189161, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.05887103, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.660016084678777, + "language_loss": 0.80662763, + "learning_rate": 3.965154492406486e-06, + "loss": 0.8291173, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1459, + "time_per_iteration": 2.4880902767181396 + }, + { + "auxiliary_loss_clip": 0.01187526, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.05512893, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.474003232718447, + "language_loss": 0.84058738, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86300415, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.328125, + "step": 1460, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01178547, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.05051732, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.696872821623283, + "language_loss": 0.81030595, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83263445, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.28125, + "step": 1461, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.01187345, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_clip": 1.03795433, + "balance_loss_mlp": 1.05579305, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.656505593412751, + "language_loss": 0.76405656, + "learning_rate": 3.964937007276932e-06, + "loss": 0.786529, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3125, + "step": 1462, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01190578, + "auxiliary_loss_mlp": 0.01058183, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.05753493, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.4277854967530663, + "language_loss": 0.74615479, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76864231, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.328125, + "step": 1463, + "time_per_iteration": 2.46510648727417 + }, + { + "auxiliary_loss_clip": 0.01189177, + "auxiliary_loss_mlp": 0.0106376, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.05380559, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.09054267836168, + "language_loss": 0.83423382, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85676318, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3515625, + "step": 1464, + "time_per_iteration": 2.5343735218048096 + }, + { + "auxiliary_loss_clip": 0.01183588, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.05336595, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 4.267071209901202, + "language_loss": 0.78351951, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80604541, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.296875, + "step": 1465, + "time_per_iteration": 2.4745209217071533 + }, + { + "auxiliary_loss_clip": 0.01190864, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_clip": 1.0371089, + "balance_loss_mlp": 1.05628061, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8950228405880263, + "language_loss": 0.84698099, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.86948144, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.34375, + "step": 1466, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.0105874, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.05407715, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 3.8136580791310783, + "language_loss": 0.84233636, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86477506, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1467, + "time_per_iteration": 2.5413413047790527 + }, + { + "auxiliary_loss_clip": 0.01183856, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7481416698073104, + "language_loss": 0.75517243, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7775712, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1468, + "time_per_iteration": 2.496363878250122 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.05570245, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.7579385887345491, + "language_loss": 0.80601043, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82842672, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2890625, + "step": 1469, + "time_per_iteration": 2.5486512184143066 + }, + { + "auxiliary_loss_clip": 0.01187777, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.0321182, + "balance_loss_mlp": 1.05454695, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 3.202810753535508, + "language_loss": 0.77607989, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7985025, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3359375, + "step": 1470, + "time_per_iteration": 2.6632297039031982 + }, + { + "auxiliary_loss_clip": 0.01182287, + "auxiliary_loss_mlp": 0.0106647, + "balance_loss_clip": 1.04266429, + "balance_loss_mlp": 1.05412459, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.774803600242038, + "language_loss": 0.84233272, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86482024, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.28125, + "step": 1471, + "time_per_iteration": 2.5040950775146484 + }, + { + "auxiliary_loss_clip": 0.01178062, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.05459309, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6761790638208889, + "language_loss": 0.83481324, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85712093, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.234375, + "step": 1472, + "time_per_iteration": 2.5079073905944824 + }, + { + "auxiliary_loss_clip": 0.01185739, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.05491877, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.396127276436556, + "language_loss": 0.828246, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85069156, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1473, + "time_per_iteration": 2.4919679164886475 + }, + { + "auxiliary_loss_clip": 0.01183368, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.04040098, + "balance_loss_mlp": 1.05414963, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.8346488607114506, + "language_loss": 0.78871369, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81116265, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1474, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01057333, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.05450511, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.918961213895669, + "language_loss": 0.79045832, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81284976, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1475, + "time_per_iteration": 2.495753765106201 + }, + { + "auxiliary_loss_clip": 0.01184034, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.05443335, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6510632676992876, + "language_loss": 0.73973525, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76205671, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1476, + "time_per_iteration": 6.925957679748535 + }, + { + "auxiliary_loss_clip": 0.0118493, + "auxiliary_loss_mlp": 0.01060562, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.05454326, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.527991814504802, + "language_loss": 0.74644423, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76889908, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3046875, + "step": 1477, + "time_per_iteration": 2.6033589839935303 + }, + { + "auxiliary_loss_clip": 0.01181345, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.02571976, + "balance_loss_mlp": 1.05315852, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.4237564416671002, + "language_loss": 0.86488914, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88718438, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1478, + "time_per_iteration": 2.5188398361206055 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.03599334, + "balance_loss_mlp": 1.05417609, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 7.715019285918926, + "language_loss": 0.77988106, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80228484, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.28125, + "step": 1479, + "time_per_iteration": 2.50730562210083 + }, + { + "auxiliary_loss_clip": 0.01180801, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 1.05275774, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.3628139464189815, + "language_loss": 0.78267598, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80501914, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1480, + "time_per_iteration": 2.512730360031128 + }, + { + "auxiliary_loss_clip": 0.01185027, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.05357075, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 3.1949876590170825, + "language_loss": 0.66627192, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68875289, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3125, + "step": 1481, + "time_per_iteration": 2.4874138832092285 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.03040504, + "balance_loss_mlp": 1.05519605, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.9560930463008703, + "language_loss": 0.9644348, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98677909, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.2734375, + "step": 1482, + "time_per_iteration": 2.484274387359619 + }, + { + "auxiliary_loss_clip": 0.01190541, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 1.0577234, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.358614174414972, + "language_loss": 0.78436875, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80683142, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.328125, + "step": 1483, + "time_per_iteration": 2.566199779510498 + }, + { + "auxiliary_loss_clip": 0.01183147, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.04891825, + "balance_loss_mlp": 1.05463076, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.232834813834399, + "language_loss": 0.86091626, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88347292, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1484, + "time_per_iteration": 2.4742467403411865 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.04152799, + "balance_loss_mlp": 1.0570302, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7135103732453094, + "language_loss": 0.80460989, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82716757, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.34375, + "step": 1485, + "time_per_iteration": 2.5808591842651367 + }, + { + "auxiliary_loss_clip": 0.01182644, + "auxiliary_loss_mlp": 0.01059654, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.05256486, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.0833446931013144, + "language_loss": 0.8295821, + "learning_rate": 3.96317299108688e-06, + "loss": 0.852005, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1486, + "time_per_iteration": 2.5060923099517822 + }, + { + "auxiliary_loss_clip": 0.01184012, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.05506349, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6673763915473876, + "language_loss": 0.76653707, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78897893, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1487, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.01181982, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.05203557, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.360836711926668, + "language_loss": 0.83246535, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85491836, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.296875, + "step": 1488, + "time_per_iteration": 2.48189377784729 + }, + { + "auxiliary_loss_clip": 0.01180173, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.05375743, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9508187836998312, + "language_loss": 0.71647823, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73879659, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.265625, + "step": 1489, + "time_per_iteration": 2.701035737991333 + }, + { + "auxiliary_loss_clip": 0.01178824, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.05088401, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8144641128553483, + "language_loss": 0.89490288, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91722786, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1490, + "time_per_iteration": 2.676098108291626 + }, + { + "auxiliary_loss_clip": 0.01187914, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_clip": 1.0416671, + "balance_loss_mlp": 1.05264366, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.165908760559946, + "language_loss": 0.73276365, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75528657, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3515625, + "step": 1491, + "time_per_iteration": 2.5531163215637207 + }, + { + "auxiliary_loss_clip": 0.01181575, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.02984166, + "balance_loss_mlp": 1.05362582, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6884120279290091, + "language_loss": 0.77121007, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79353207, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.28125, + "step": 1492, + "time_per_iteration": 2.485531806945801 + }, + { + "auxiliary_loss_clip": 0.01180742, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.03324914, + "balance_loss_mlp": 1.05471706, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.0059524225222414, + "language_loss": 0.71168351, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73404551, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2578125, + "step": 1493, + "time_per_iteration": 2.5819149017333984 + }, + { + "auxiliary_loss_clip": 0.01184961, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.03304577, + "balance_loss_mlp": 1.05477107, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.7443337417031568, + "language_loss": 0.86910093, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89151227, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1494, + "time_per_iteration": 2.491126775741577 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.05289626, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7176751495851263, + "language_loss": 0.83065581, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1495, + "time_per_iteration": 2.463747501373291 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.05825078, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.861203767183833, + "language_loss": 0.69813877, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72057784, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1496, + "time_per_iteration": 2.4409985542297363 + }, + { + "auxiliary_loss_clip": 0.01180533, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03035557, + "balance_loss_mlp": 1.05325341, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6091347390483586, + "language_loss": 0.79913563, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82145333, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2734375, + "step": 1497, + "time_per_iteration": 2.492732048034668 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.0105809, + "balance_loss_clip": 1.03484416, + "balance_loss_mlp": 1.05299318, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.3611651581227915, + "language_loss": 0.8262192, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84866548, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3359375, + "step": 1498, + "time_per_iteration": 2.492124080657959 + }, + { + "auxiliary_loss_clip": 0.01188542, + "auxiliary_loss_mlp": 0.01061597, + "balance_loss_clip": 1.0402112, + "balance_loss_mlp": 1.05628157, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.316244908481527, + "language_loss": 0.7849865, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80748791, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 1499, + "time_per_iteration": 2.455986738204956 + }, + { + "auxiliary_loss_clip": 0.0117942, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.05351877, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.652937184766999, + "language_loss": 0.93453979, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95688522, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1500, + "time_per_iteration": 2.481450080871582 + }, + { + "auxiliary_loss_clip": 0.01182931, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.05170345, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.9257189866461966, + "language_loss": 0.74465239, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76699102, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3125, + "step": 1501, + "time_per_iteration": 2.4806344509124756 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 0.99992049, + "balance_loss_mlp": 1.02834833, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7322723529864947, + "language_loss": 0.58304042, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60384637, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.48828125, + "step": 1502, + "time_per_iteration": 3.066755771636963 + }, + { + "auxiliary_loss_clip": 0.01178455, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.02655029, + "balance_loss_mlp": 1.05134845, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.407651446444188, + "language_loss": 0.69502187, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71728474, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2734375, + "step": 1503, + "time_per_iteration": 2.608006000518799 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.0508244, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.015182939383952, + "language_loss": 0.86142361, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88378185, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.3046875, + "step": 1504, + "time_per_iteration": 2.489906072616577 + }, + { + "auxiliary_loss_clip": 0.01188306, + "auxiliary_loss_mlp": 0.01064134, + "balance_loss_clip": 1.03871906, + "balance_loss_mlp": 1.05330658, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.9466916160800904, + "language_loss": 0.72267938, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74520379, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1505, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01179818, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.05332816, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3726339000283447, + "language_loss": 0.80946511, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83180916, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.265625, + "step": 1506, + "time_per_iteration": 2.4512932300567627 + }, + { + "auxiliary_loss_clip": 0.01179114, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.0531404, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1533698580433254, + "language_loss": 0.76043189, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78271914, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.2578125, + "step": 1507, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01009923, + "balance_loss_clip": 1.00679994, + "balance_loss_mlp": 1.01922798, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7312512202665958, + "language_loss": 0.57670546, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59747648, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.03112793, + "router_z_loss_mlp": 0.48046875, + "step": 1508, + "time_per_iteration": 2.9330992698669434 + }, + { + "auxiliary_loss_clip": 0.01182207, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.0313319, + "balance_loss_mlp": 1.05309892, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.072562238387217, + "language_loss": 0.85046542, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87281442, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1509, + "time_per_iteration": 2.475606918334961 + }, + { + "auxiliary_loss_clip": 0.01189974, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.05606115, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.413703760690829, + "language_loss": 0.84302551, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3359375, + "step": 1510, + "time_per_iteration": 2.576070785522461 + }, + { + "auxiliary_loss_clip": 0.01184002, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.0335387, + "balance_loss_mlp": 1.05408144, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9204492801986277, + "language_loss": 0.85558611, + "learning_rate": 3.961289878108262e-06, + "loss": 0.8779816, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.296875, + "step": 1511, + "time_per_iteration": 2.5085484981536865 + }, + { + "auxiliary_loss_clip": 0.01181957, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.02690685, + "balance_loss_mlp": 1.05469918, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5775523407684693, + "language_loss": 0.84897017, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87127548, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2734375, + "step": 1512, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01175178, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.02888274, + "balance_loss_mlp": 1.05033123, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.9006324958480167, + "language_loss": 0.86704344, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88929009, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.25, + "step": 1513, + "time_per_iteration": 2.475271701812744 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.0536902, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6716164971548293, + "language_loss": 0.86379707, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8861233, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.2734375, + "step": 1514, + "time_per_iteration": 2.5317347049713135 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.05550981, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.9279276264910965, + "language_loss": 0.89882755, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92124808, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.265625, + "step": 1515, + "time_per_iteration": 2.5507757663726807 + }, + { + "auxiliary_loss_clip": 0.011822, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.03174293, + "balance_loss_mlp": 1.05321527, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0145121179505905, + "language_loss": 0.85567206, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87803847, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1516, + "time_per_iteration": 2.524787425994873 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.05217946, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5232376391767188, + "language_loss": 0.81104374, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83340514, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.3125, + "step": 1517, + "time_per_iteration": 2.5781173706054688 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.01068952, + "balance_loss_clip": 1.04729199, + "balance_loss_mlp": 1.05378699, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6026665805728266, + "language_loss": 0.78008473, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80262554, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3125, + "step": 1518, + "time_per_iteration": 4.000938653945923 + }, + { + "auxiliary_loss_clip": 0.01179619, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.03851235, + "balance_loss_mlp": 1.05189955, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.883609624415087, + "language_loss": 0.86375809, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88615477, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.28125, + "step": 1519, + "time_per_iteration": 3.945183277130127 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.01053198, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.05196333, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.4149150298084425, + "language_loss": 0.73425877, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75659597, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.28125, + "step": 1520, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01181506, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.0525614, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6960755220153825, + "language_loss": 0.85296613, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87533194, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2890625, + "step": 1521, + "time_per_iteration": 2.478440761566162 + }, + { + "auxiliary_loss_clip": 0.01183058, + "auxiliary_loss_mlp": 0.01057495, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.05319118, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.1543470058122876, + "language_loss": 0.83979875, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86220425, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.296875, + "step": 1522, + "time_per_iteration": 2.4761834144592285 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.03500533, + "balance_loss_mlp": 1.05125594, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.174137545904809, + "language_loss": 0.810691, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83301324, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.234375, + "step": 1523, + "time_per_iteration": 2.525385618209839 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.05365944, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.529065997296093, + "language_loss": 0.74591744, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76838291, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.296875, + "step": 1524, + "time_per_iteration": 2.4293112754821777 + }, + { + "auxiliary_loss_clip": 0.01181121, + "auxiliary_loss_mlp": 0.01060116, + "balance_loss_clip": 1.03822935, + "balance_loss_mlp": 1.05373263, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0870290485059586, + "language_loss": 0.861516, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88392842, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1525, + "time_per_iteration": 2.449774980545044 + }, + { + "auxiliary_loss_clip": 0.01181752, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.02577078, + "balance_loss_mlp": 1.05424511, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.3600448138049597, + "language_loss": 0.74690467, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76919985, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1526, + "time_per_iteration": 2.5295088291168213 + }, + { + "auxiliary_loss_clip": 0.01177679, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03058743, + "balance_loss_mlp": 1.05291057, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.970734062299861, + "language_loss": 0.7736311, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79592943, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1527, + "time_per_iteration": 2.465484142303467 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.05090261, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.9755082573034908, + "language_loss": 0.78465801, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80698651, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1528, + "time_per_iteration": 2.5257718563079834 + }, + { + "auxiliary_loss_clip": 0.01177926, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.0276351, + "balance_loss_mlp": 1.05085492, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.6736868569465813, + "language_loss": 0.76880527, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79107177, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2734375, + "step": 1529, + "time_per_iteration": 2.4417288303375244 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.0306139, + "balance_loss_mlp": 1.05037212, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.767002219307874, + "language_loss": 0.83118784, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85352623, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.296875, + "step": 1530, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01173477, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.03723454, + "balance_loss_mlp": 1.05024123, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.058190265763826, + "language_loss": 0.8408612, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86318833, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1531, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.01177383, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.02728868, + "balance_loss_mlp": 1.05083799, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.8347699676368683, + "language_loss": 0.81135088, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83361435, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1532, + "time_per_iteration": 2.506875991821289 + }, + { + "auxiliary_loss_clip": 0.01179012, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.03044105, + "balance_loss_mlp": 1.05169332, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.8650949584676202, + "language_loss": 0.83489287, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85721242, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2734375, + "step": 1533, + "time_per_iteration": 2.5279369354248047 + }, + { + "auxiliary_loss_clip": 0.01181754, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.02662432, + "balance_loss_mlp": 1.05468941, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.8226281566677605, + "language_loss": 0.89789164, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92019475, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1534, + "time_per_iteration": 2.498732089996338 + }, + { + "auxiliary_loss_clip": 0.01178154, + "auxiliary_loss_mlp": 0.01064045, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.04994035, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.6410414613778777, + "language_loss": 0.75911283, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78153479, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.28125, + "step": 1535, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.04907823, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8388387816947327, + "language_loss": 0.81344318, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83558822, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1536, + "time_per_iteration": 2.5075631141662598 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.04995418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.109198419692537, + "language_loss": 0.8921392, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91439736, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1537, + "time_per_iteration": 2.4454562664031982 + }, + { + "auxiliary_loss_clip": 0.01177438, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.03638315, + "balance_loss_mlp": 1.05164456, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.1959440535625285, + "language_loss": 0.8072964, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82966185, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2578125, + "step": 1538, + "time_per_iteration": 2.50838303565979 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.0101212, + "balance_loss_clip": 1.0091517, + "balance_loss_mlp": 1.01794529, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.74443800558722, + "language_loss": 0.57375526, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59453678, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.48046875, + "step": 1539, + "time_per_iteration": 3.16038179397583 + }, + { + "auxiliary_loss_clip": 0.01179737, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.05291581, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.903908071477431, + "language_loss": 0.67164814, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69395947, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.265625, + "step": 1540, + "time_per_iteration": 2.488809585571289 + }, + { + "auxiliary_loss_clip": 0.01178592, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02154827, + "balance_loss_mlp": 1.05285096, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.797248436862791, + "language_loss": 0.83666921, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85888791, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1541, + "time_per_iteration": 2.5406758785247803 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01009011, + "balance_loss_clip": 1.0061146, + "balance_loss_mlp": 1.01339245, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8904869203130611, + "language_loss": 0.6196329, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64032996, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.47265625, + "step": 1542, + "time_per_iteration": 3.0973262786865234 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03486192, + "balance_loss_mlp": 1.05283189, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.711071573157868, + "language_loss": 0.82672381, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84905624, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.25, + "step": 1543, + "time_per_iteration": 2.489415168762207 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.03462195, + "balance_loss_mlp": 1.05128777, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6169278883375504, + "language_loss": 0.72058821, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74287981, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1544, + "time_per_iteration": 2.7986748218536377 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.05111873, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7012123784712243, + "language_loss": 0.77617419, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79842126, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1545, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01050414, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0525856, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.736353511607615, + "language_loss": 0.74531418, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76755565, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1546, + "time_per_iteration": 2.456806182861328 + }, + { + "auxiliary_loss_clip": 0.01180806, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.02968979, + "balance_loss_mlp": 1.05292201, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.1086065935537284, + "language_loss": 0.84392273, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86624783, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1547, + "time_per_iteration": 2.5041439533233643 + }, + { + "auxiliary_loss_clip": 0.01177454, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.05125856, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 7.120670718523448, + "language_loss": 0.67616034, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6984657, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1548, + "time_per_iteration": 2.513141393661499 + }, + { + "auxiliary_loss_clip": 0.01178735, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.05175209, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.761700755369037, + "language_loss": 0.83445251, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85676992, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.265625, + "step": 1549, + "time_per_iteration": 2.4822285175323486 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.05560291, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7822943519837542, + "language_loss": 0.75744081, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77969635, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2578125, + "step": 1550, + "time_per_iteration": 2.5503265857696533 + }, + { + "auxiliary_loss_clip": 0.01179426, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.04579496, + "balance_loss_mlp": 1.05118561, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.0184762942100876, + "language_loss": 0.83272278, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85520893, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.28125, + "step": 1551, + "time_per_iteration": 2.4962081909179688 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_clip": 1.0051949, + "balance_loss_mlp": 1.01350796, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7800746873014213, + "language_loss": 0.6182366, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6389209, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.47070312, + "step": 1552, + "time_per_iteration": 3.2178378105163574 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01003668, + "balance_loss_clip": 1.00099754, + "balance_loss_mlp": 1.01257896, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8330449834122059, + "language_loss": 0.5895977, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61022902, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.46875, + "step": 1553, + "time_per_iteration": 3.220923900604248 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.05040002, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.0753391269624797, + "language_loss": 0.87452686, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89689714, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.28125, + "step": 1554, + "time_per_iteration": 2.5448763370513916 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.0310595, + "balance_loss_mlp": 1.05265594, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.2438919833216913, + "language_loss": 0.81355709, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83583468, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1555, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.00581956, + "balance_loss_mlp": 1.01259685, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8701907042199977, + "language_loss": 0.59583747, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61651003, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4609375, + "step": 1556, + "time_per_iteration": 3.0923824310302734 + }, + { + "auxiliary_loss_clip": 0.01177126, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.03518105, + "balance_loss_mlp": 1.05278862, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5891177576034032, + "language_loss": 0.84455961, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86689359, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1557, + "time_per_iteration": 2.5973968505859375 + }, + { + "auxiliary_loss_clip": 0.01175988, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03799307, + "balance_loss_mlp": 1.05065048, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.787574567308206, + "language_loss": 0.77987397, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80224895, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.25, + "step": 1558, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01178258, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.05035424, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0310113035260873, + "language_loss": 0.7998119, + "learning_rate": 3.957544040455379e-06, + "loss": 0.822142, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1559, + "time_per_iteration": 5.3233802318573 + }, + { + "auxiliary_loss_clip": 0.01172855, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.05015147, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.9877315441152976, + "language_loss": 0.76720232, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78956437, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1560, + "time_per_iteration": 3.863935947418213 + }, + { + "auxiliary_loss_clip": 0.01180546, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.05101645, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6628394684514, + "language_loss": 0.81219828, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83460152, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1561, + "time_per_iteration": 2.5050160884857178 + }, + { + "auxiliary_loss_clip": 0.01175131, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.02641547, + "balance_loss_mlp": 1.04764926, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.307547697406205, + "language_loss": 0.61553764, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63777232, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1562, + "time_per_iteration": 2.5884838104248047 + }, + { + "auxiliary_loss_clip": 0.01177686, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.0552876, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.5948914783661468, + "language_loss": 0.84981585, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87219155, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1563, + "time_per_iteration": 2.427928924560547 + }, + { + "auxiliary_loss_clip": 0.01172512, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.02767134, + "balance_loss_mlp": 1.05013323, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.8141046481233785, + "language_loss": 0.76106739, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78327298, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.21875, + "step": 1564, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01055133, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.05290008, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0134268414891388, + "language_loss": 0.7971766, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81950086, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.25, + "step": 1565, + "time_per_iteration": 2.470870018005371 + }, + { + "auxiliary_loss_clip": 0.01175133, + "auxiliary_loss_mlp": 0.01069432, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.0497129, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8353632925340597, + "language_loss": 0.75241816, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77486378, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1566, + "time_per_iteration": 2.4962053298950195 + }, + { + "auxiliary_loss_clip": 0.0117411, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.03746092, + "balance_loss_mlp": 1.04822683, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.55149440594841, + "language_loss": 0.77724433, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79957557, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1567, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01178494, + "auxiliary_loss_mlp": 0.01054706, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.05183101, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.293964487000622, + "language_loss": 0.82571244, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8480444, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.265625, + "step": 1568, + "time_per_iteration": 2.5221774578094482 + }, + { + "auxiliary_loss_clip": 0.01179838, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.02527881, + "balance_loss_mlp": 1.05191278, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 4.3822924949764515, + "language_loss": 0.7658236, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78810549, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.28125, + "step": 1569, + "time_per_iteration": 2.464019775390625 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.03004718, + "balance_loss_mlp": 1.04984534, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.11394347406088, + "language_loss": 0.86315012, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88538271, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1570, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.01177967, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.05340183, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6480791038221163, + "language_loss": 0.76531005, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78758156, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1571, + "time_per_iteration": 2.5270462036132812 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.02848995, + "balance_loss_mlp": 1.0496099, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.780883866775424, + "language_loss": 0.79518712, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81737661, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1572, + "time_per_iteration": 2.477403163909912 + }, + { + "auxiliary_loss_clip": 0.01172702, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03288805, + "balance_loss_mlp": 1.05036175, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8458711299535766, + "language_loss": 0.87948155, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90174723, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1573, + "time_per_iteration": 2.5164122581481934 + }, + { + "auxiliary_loss_clip": 0.01177194, + "auxiliary_loss_mlp": 0.01059795, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.05045378, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.770313323609274, + "language_loss": 0.81827116, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84064102, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.265625, + "step": 1574, + "time_per_iteration": 2.5540831089019775 + }, + { + "auxiliary_loss_clip": 0.01178056, + "auxiliary_loss_mlp": 0.01052269, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.05359375, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.139236970889498, + "language_loss": 0.80922085, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83152413, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1575, + "time_per_iteration": 2.4874608516693115 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01063693, + "balance_loss_clip": 1.04184198, + "balance_loss_mlp": 1.05048943, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.1107661515601, + "language_loss": 0.86745369, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88981628, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1576, + "time_per_iteration": 2.514961004257202 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.01272786, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9836929902555142, + "language_loss": 0.65832257, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67916429, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.45703125, + "step": 1577, + "time_per_iteration": 3.042998790740967 + }, + { + "auxiliary_loss_clip": 0.01175806, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.02504635, + "balance_loss_mlp": 1.05083144, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 3.158821122445177, + "language_loss": 0.79113019, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81334484, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1578, + "time_per_iteration": 2.492605447769165 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.04935408, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6941125689582233, + "language_loss": 0.77994359, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80223954, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1579, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01176838, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.0317533, + "balance_loss_mlp": 1.05228639, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.3012950697800747, + "language_loss": 0.73576474, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75807726, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2421875, + "step": 1580, + "time_per_iteration": 2.500426769256592 + }, + { + "auxiliary_loss_clip": 0.01171524, + "auxiliary_loss_mlp": 0.01053034, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 1.05162525, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.820694860574998, + "language_loss": 0.77813822, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80038381, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1581, + "time_per_iteration": 2.569086790084839 + }, + { + "auxiliary_loss_clip": 0.01177083, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.05315304, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.1718701740895443, + "language_loss": 0.86914808, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89150703, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.234375, + "step": 1582, + "time_per_iteration": 2.476386785507202 + }, + { + "auxiliary_loss_clip": 0.01178411, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.05487967, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.7496793522695477, + "language_loss": 0.66838771, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6907438, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.234375, + "step": 1583, + "time_per_iteration": 2.4433302879333496 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01052141, + "balance_loss_clip": 1.02919281, + "balance_loss_mlp": 1.0555923, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8272679383640855, + "language_loss": 0.70314872, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7254771, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.25, + "step": 1584, + "time_per_iteration": 2.5352938175201416 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.0527246, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5429491827095454, + "language_loss": 0.80649364, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82881135, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2421875, + "step": 1585, + "time_per_iteration": 2.5006139278411865 + }, + { + "auxiliary_loss_clip": 0.01179471, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.05324364, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.5763794615860258, + "language_loss": 0.7156626, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73802292, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.265625, + "step": 1586, + "time_per_iteration": 2.510941982269287 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.010121, + "balance_loss_clip": 1.00946522, + "balance_loss_mlp": 1.01272035, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8128409972345002, + "language_loss": 0.55392706, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57462931, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.453125, + "step": 1587, + "time_per_iteration": 2.8747992515563965 + }, + { + "auxiliary_loss_clip": 0.0118109, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.0345006, + "balance_loss_mlp": 1.0550952, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.813611272618652, + "language_loss": 0.81023234, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83260405, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1588, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.01178114, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.05471849, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 2.1843830695972835, + "language_loss": 0.81552076, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83785045, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1589, + "time_per_iteration": 2.4995651245117188 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.05340207, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.4497838373443381, + "language_loss": 0.65005404, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1590, + "time_per_iteration": 2.7222375869750977 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01051921, + "balance_loss_clip": 1.03036785, + "balance_loss_mlp": 1.05178595, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.714913693600035, + "language_loss": 0.83272862, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85498345, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1591, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.05280709, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.268244689889179, + "language_loss": 0.74068749, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76304293, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.265625, + "step": 1592, + "time_per_iteration": 2.446272373199463 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.0282129, + "balance_loss_mlp": 1.05028248, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.9287746031752921, + "language_loss": 0.74135411, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1593, + "time_per_iteration": 2.518038749694824 + }, + { + "auxiliary_loss_clip": 0.01175652, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.03061128, + "balance_loss_mlp": 1.05365515, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8251705146793997, + "language_loss": 0.69907188, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72134066, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.21875, + "step": 1594, + "time_per_iteration": 2.5454983711242676 + }, + { + "auxiliary_loss_clip": 0.01174594, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.03062999, + "balance_loss_mlp": 1.05023921, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.596137828422853, + "language_loss": 0.82464099, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84689802, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1595, + "time_per_iteration": 2.472062826156616 + }, + { + "auxiliary_loss_clip": 0.01176658, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.02803886, + "balance_loss_mlp": 1.05217803, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.0311987750358953, + "language_loss": 0.84673214, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86900425, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2421875, + "step": 1596, + "time_per_iteration": 2.4801599979400635 + }, + { + "auxiliary_loss_clip": 0.01181467, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.02871156, + "balance_loss_mlp": 1.05628884, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.531539932785817, + "language_loss": 0.68993127, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71225667, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1597, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01175632, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.02370429, + "balance_loss_mlp": 1.04902959, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.18946094151333, + "language_loss": 0.74929029, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77149749, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1598, + "time_per_iteration": 2.474071502685547 + }, + { + "auxiliary_loss_clip": 0.01179079, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.05284083, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6350676424235815, + "language_loss": 0.69002283, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7122978, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1599, + "time_per_iteration": 2.5599992275238037 + }, + { + "auxiliary_loss_clip": 0.01174972, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.05169392, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.013538613147854, + "language_loss": 0.840271, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8625865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1600, + "time_per_iteration": 2.4882116317749023 + }, + { + "auxiliary_loss_clip": 0.01174537, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.03162694, + "balance_loss_mlp": 1.05098653, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.038904015519863, + "language_loss": 0.8034178, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82569081, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.234375, + "step": 1601, + "time_per_iteration": 5.328745365142822 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.03031266, + "balance_loss_mlp": 1.05090928, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.183236390866488, + "language_loss": 0.82405198, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84635913, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.2734375, + "step": 1602, + "time_per_iteration": 2.4609556198120117 + }, + { + "auxiliary_loss_clip": 0.01172805, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05170703, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.882331764966583, + "language_loss": 0.62527591, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64752185, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1603, + "time_per_iteration": 2.4974379539489746 + }, + { + "auxiliary_loss_clip": 0.01178105, + "auxiliary_loss_mlp": 0.01054363, + "balance_loss_clip": 1.03049707, + "balance_loss_mlp": 1.05224609, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.347327571135852, + "language_loss": 0.71259016, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73491484, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2578125, + "step": 1604, + "time_per_iteration": 2.5012693405151367 + }, + { + "auxiliary_loss_clip": 0.01172586, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.0368669, + "balance_loss_mlp": 1.05051208, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.129697971326249, + "language_loss": 0.79487669, + "learning_rate": 3.953793790294527e-06, + "loss": 0.8171708, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.21875, + "step": 1605, + "time_per_iteration": 2.5392873287200928 + }, + { + "auxiliary_loss_clip": 0.01176232, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.04916394, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 3.698123586343809, + "language_loss": 0.74810207, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77030694, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2734375, + "step": 1606, + "time_per_iteration": 2.4922726154327393 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02391386, + "balance_loss_mlp": 1.05243278, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.649703340967918, + "language_loss": 0.75382137, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77603066, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.234375, + "step": 1607, + "time_per_iteration": 2.4787087440490723 + }, + { + "auxiliary_loss_clip": 0.0117289, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.02206647, + "balance_loss_mlp": 1.04831934, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.262571531890369, + "language_loss": 0.86648059, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88863426, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.25, + "step": 1608, + "time_per_iteration": 2.435391664505005 + }, + { + "auxiliary_loss_clip": 0.01183391, + "auxiliary_loss_mlp": 0.01056654, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.05276418, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.2277980990408297, + "language_loss": 0.70968121, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73208165, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.3046875, + "step": 1609, + "time_per_iteration": 2.599719762802124 + }, + { + "auxiliary_loss_clip": 0.01173214, + "auxiliary_loss_mlp": 0.01054271, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.04860282, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.7787270736621674, + "language_loss": 0.84566712, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86794198, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1610, + "time_per_iteration": 2.446676254272461 + }, + { + "auxiliary_loss_clip": 0.01177531, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.02781224, + "balance_loss_mlp": 1.05382621, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.0483419743874682, + "language_loss": 0.67360532, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69587982, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1611, + "time_per_iteration": 2.520211696624756 + }, + { + "auxiliary_loss_clip": 0.01177545, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.03000879, + "balance_loss_mlp": 1.05313492, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6625909003061596, + "language_loss": 0.81166416, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83394641, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2421875, + "step": 1612, + "time_per_iteration": 2.449491262435913 + }, + { + "auxiliary_loss_clip": 0.01180036, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.05431938, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.509420249413084, + "language_loss": 0.80708754, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82950538, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1613, + "time_per_iteration": 2.4753763675689697 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.03019738, + "balance_loss_mlp": 1.05074048, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.0025313344872484, + "language_loss": 0.84173608, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86399966, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2265625, + "step": 1614, + "time_per_iteration": 2.5492141246795654 + }, + { + "auxiliary_loss_clip": 0.01065917, + "auxiliary_loss_mlp": 0.010187, + "balance_loss_clip": 1.01610088, + "balance_loss_mlp": 1.019063, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7078098108364695, + "language_loss": 0.54584575, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56669194, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.46875, + "step": 1615, + "time_per_iteration": 3.1041057109832764 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00491357, + "balance_loss_mlp": 1.01844954, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637649269659756, + "language_loss": 0.5822649, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60299873, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.47460938, + "step": 1616, + "time_per_iteration": 3.215376377105713 + }, + { + "auxiliary_loss_clip": 0.01178513, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.05275226, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.690325520565165, + "language_loss": 0.69293094, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71527421, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2578125, + "step": 1617, + "time_per_iteration": 2.458017587661743 + }, + { + "auxiliary_loss_clip": 0.01176727, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.03116739, + "balance_loss_mlp": 1.05130577, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.7927692696889819, + "language_loss": 0.80748308, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8298068, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.25, + "step": 1618, + "time_per_iteration": 2.5471577644348145 + }, + { + "auxiliary_loss_clip": 0.01169902, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03077149, + "balance_loss_mlp": 1.04996848, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.5831304278494804, + "language_loss": 0.9288674, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9510712, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1953125, + "step": 1619, + "time_per_iteration": 2.515282392501831 + }, + { + "auxiliary_loss_clip": 0.01171299, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.03150594, + "balance_loss_mlp": 1.05216622, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.7974961209450113, + "language_loss": 0.88785303, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.910092, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1620, + "time_per_iteration": 2.556744337081909 + }, + { + "auxiliary_loss_clip": 0.01175309, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.05045033, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.90931759761679, + "language_loss": 0.77130795, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79362905, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.25, + "step": 1621, + "time_per_iteration": 2.491441011428833 + }, + { + "auxiliary_loss_clip": 0.01171563, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.04859447, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9170880538391684, + "language_loss": 0.77856946, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80084509, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2265625, + "step": 1622, + "time_per_iteration": 2.4379701614379883 + }, + { + "auxiliary_loss_clip": 0.01177415, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.05105746, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.9420709042223125, + "language_loss": 0.85783195, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88017344, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1623, + "time_per_iteration": 2.51741099357605 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.03498316, + "balance_loss_mlp": 1.05181813, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2833168401589656, + "language_loss": 0.80328369, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8255735, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1624, + "time_per_iteration": 2.4646618366241455 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.02450192, + "balance_loss_mlp": 1.04799926, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.7577002662180954, + "language_loss": 0.8575626, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87971735, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.234375, + "step": 1625, + "time_per_iteration": 2.452615976333618 + }, + { + "auxiliary_loss_clip": 0.01176501, + "auxiliary_loss_mlp": 0.01055325, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.05226421, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 3.258883448957912, + "language_loss": 0.8539601, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87627834, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1626, + "time_per_iteration": 2.4931013584136963 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.05541551, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.979888643217431, + "language_loss": 0.83329904, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85568601, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2578125, + "step": 1627, + "time_per_iteration": 2.5056917667388916 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.03065729, + "balance_loss_mlp": 1.0488416, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7873285490487296, + "language_loss": 0.84291327, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.2109375, + "step": 1628, + "time_per_iteration": 2.4835076332092285 + }, + { + "auxiliary_loss_clip": 0.01169153, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.04880238, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.6092149858605884, + "language_loss": 0.75609362, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77831334, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1629, + "time_per_iteration": 2.4959983825683594 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.02412319, + "balance_loss_mlp": 1.0530107, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.5982247062153871, + "language_loss": 0.78224194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1630, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.01177321, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0292381, + "balance_loss_mlp": 1.05457997, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.9134334701620013, + "language_loss": 0.86704385, + "learning_rate": 3.951604717916228e-06, + "loss": 0.8893311, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1631, + "time_per_iteration": 2.443878173828125 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01050929, + "balance_loss_clip": 1.03065109, + "balance_loss_mlp": 1.05258322, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.096430969489036, + "language_loss": 0.83111286, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85334921, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1632, + "time_per_iteration": 2.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01174956, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0368464, + "balance_loss_mlp": 1.05281615, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.5107232822128822, + "language_loss": 0.7877655, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81008065, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.21875, + "step": 1633, + "time_per_iteration": 2.447930097579956 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.02833819, + "balance_loss_mlp": 1.04989707, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.0663591821232865, + "language_loss": 0.73159611, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75378191, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1634, + "time_per_iteration": 2.460265636444092 + }, + { + "auxiliary_loss_clip": 0.01179893, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.0516957, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7516342600991868, + "language_loss": 0.72714394, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74957043, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1635, + "time_per_iteration": 2.4835710525512695 + }, + { + "auxiliary_loss_clip": 0.01177592, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.05253148, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8458745824258636, + "language_loss": 0.7819975, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80432636, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.25, + "step": 1636, + "time_per_iteration": 2.53061842918396 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.05113721, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.2976115041381386, + "language_loss": 0.70005965, + "learning_rate": 3.951092440828715e-06, + "loss": 0.722363, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1637, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.03175139, + "balance_loss_mlp": 1.05108416, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.115587702667026, + "language_loss": 0.77395654, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79622668, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2265625, + "step": 1638, + "time_per_iteration": 2.4725139141082764 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.02524579, + "balance_loss_mlp": 1.05077171, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4162008179950134, + "language_loss": 0.7263118, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74847507, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1953125, + "step": 1639, + "time_per_iteration": 2.5534512996673584 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.01943696, + "balance_loss_mlp": 1.05003214, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8280373897837945, + "language_loss": 0.88669002, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90882927, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1640, + "time_per_iteration": 2.4868786334991455 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01685774, + "balance_loss_mlp": 1.05164635, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.1859335509376527, + "language_loss": 0.8086108, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83072555, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1641, + "time_per_iteration": 2.5081584453582764 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03130805, + "balance_loss_mlp": 1.05067503, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.4983515693134417, + "language_loss": 0.85826755, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88054669, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1642, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01177694, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.05365527, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7421144196917664, + "language_loss": 0.80859929, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83091342, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1643, + "time_per_iteration": 3.9550716876983643 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01063125, + "balance_loss_clip": 1.04138088, + "balance_loss_mlp": 1.0494256, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9624417465121429, + "language_loss": 0.8262763, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84861231, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1644, + "time_per_iteration": 3.8253817558288574 + }, + { + "auxiliary_loss_clip": 0.01169448, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.02733469, + "balance_loss_mlp": 1.05048347, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7099323885745632, + "language_loss": 0.6819675, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70414758, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1645, + "time_per_iteration": 2.4549567699432373 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.0206517, + "balance_loss_mlp": 1.01924491, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9514884974425206, + "language_loss": 0.60854232, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62943053, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.46289062, + "step": 1646, + "time_per_iteration": 2.9953765869140625 + }, + { + "auxiliary_loss_clip": 0.01170253, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04880357, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.5496486678231425, + "language_loss": 0.73046064, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75266314, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2109375, + "step": 1647, + "time_per_iteration": 2.5241641998291016 + }, + { + "auxiliary_loss_clip": 0.01171762, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.04955053, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8237647662791463, + "language_loss": 0.84120429, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86348635, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.21875, + "step": 1648, + "time_per_iteration": 2.467717170715332 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01009658, + "balance_loss_clip": 1.00701165, + "balance_loss_mlp": 1.0159142, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7437092318732932, + "language_loss": 0.55674303, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57745123, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.453125, + "step": 1649, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01165781, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.02598572, + "balance_loss_mlp": 1.04597533, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.284847215884091, + "language_loss": 0.89930248, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92142689, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1650, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00105858, + "balance_loss_mlp": 1.01395106, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8031298543824162, + "language_loss": 0.63733649, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65795547, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.44140625, + "step": 1651, + "time_per_iteration": 3.217806100845337 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03318655, + "balance_loss_mlp": 1.04885435, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9462006377707899, + "language_loss": 0.88288587, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90512443, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1652, + "time_per_iteration": 2.5014448165893555 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01057611, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.05190849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9500387106757973, + "language_loss": 0.82206833, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84438825, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2265625, + "step": 1653, + "time_per_iteration": 2.4881839752197266 + }, + { + "auxiliary_loss_clip": 0.01172582, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.04984093, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0314065071494136, + "language_loss": 0.79399735, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81626815, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2265625, + "step": 1654, + "time_per_iteration": 2.5269205570220947 + }, + { + "auxiliary_loss_clip": 0.01167439, + "auxiliary_loss_mlp": 0.01055854, + "balance_loss_clip": 1.03700721, + "balance_loss_mlp": 1.05072093, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5637423809135174, + "language_loss": 0.8088094, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83104229, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.171875, + "step": 1655, + "time_per_iteration": 2.4652602672576904 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.04891443, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9082198159511756, + "language_loss": 0.80947387, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83170521, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1656, + "time_per_iteration": 2.4966416358947754 + }, + { + "auxiliary_loss_clip": 0.01170477, + "auxiliary_loss_mlp": 0.0106116, + "balance_loss_clip": 1.04066813, + "balance_loss_mlp": 1.05147541, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6268850155063674, + "language_loss": 0.88850212, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91081852, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1875, + "step": 1657, + "time_per_iteration": 2.446124792098999 + }, + { + "auxiliary_loss_clip": 0.01175951, + "auxiliary_loss_mlp": 0.01063236, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.05091214, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0057694643168302, + "language_loss": 0.84758937, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86998123, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1658, + "time_per_iteration": 2.457902669906616 + }, + { + "auxiliary_loss_clip": 0.01054631, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_clip": 1.07460773, + "balance_loss_mlp": 1.0110395, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9153195332104517, + "language_loss": 0.60843968, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62975848, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1659, + "time_per_iteration": 3.077805519104004 + }, + { + "auxiliary_loss_clip": 0.01170517, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.04999721, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8691655756599186, + "language_loss": 0.85116851, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87340325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2109375, + "step": 1660, + "time_per_iteration": 2.49082612991333 + }, + { + "auxiliary_loss_clip": 0.01171003, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.03393948, + "balance_loss_mlp": 1.05291247, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.130922035700174, + "language_loss": 0.80037123, + "learning_rate": 3.949016704705836e-06, + "loss": 0.8226431, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1796875, + "step": 1661, + "time_per_iteration": 2.4412636756896973 + }, + { + "auxiliary_loss_clip": 0.01175671, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.05002224, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.8939661728963775, + "language_loss": 0.83592767, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85818553, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2578125, + "step": 1662, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01171098, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.02972281, + "balance_loss_mlp": 1.05104828, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1063962968477, + "language_loss": 0.88696563, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90920055, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1663, + "time_per_iteration": 2.42790150642395 + }, + { + "auxiliary_loss_clip": 0.01174901, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.05225635, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6888490247303796, + "language_loss": 0.7034179, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72569644, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1664, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01173831, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.02903676, + "balance_loss_mlp": 1.0535655, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.1773983349048804, + "language_loss": 0.7878316, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81007671, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1665, + "time_per_iteration": 2.4271252155303955 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.01061559, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.05681181, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.663243771388797, + "language_loss": 0.70152062, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72392094, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.21875, + "step": 1666, + "time_per_iteration": 2.499131202697754 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03777063, + "balance_loss_mlp": 1.0506525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8121915129470096, + "language_loss": 0.791031, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8133781, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.234375, + "step": 1667, + "time_per_iteration": 2.4429264068603516 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.02810836, + "balance_loss_mlp": 1.05261493, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.9507555712476945, + "language_loss": 0.7715596, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79379785, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.203125, + "step": 1668, + "time_per_iteration": 2.5223031044006348 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.05256963, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.9809152554972944, + "language_loss": 0.77852714, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80083561, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2109375, + "step": 1669, + "time_per_iteration": 2.5082881450653076 + }, + { + "auxiliary_loss_clip": 0.01181618, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.03641593, + "balance_loss_mlp": 1.05464602, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.145889566444559, + "language_loss": 0.85461181, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87702769, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.2734375, + "step": 1670, + "time_per_iteration": 2.5235135555267334 + }, + { + "auxiliary_loss_clip": 0.01166248, + "auxiliary_loss_mlp": 0.01057789, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0501771, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5986093935623644, + "language_loss": 0.76899171, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79123211, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1671, + "time_per_iteration": 2.505441665649414 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.01598763, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7900846916321359, + "language_loss": 0.60719293, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62802076, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.43945312, + "step": 1672, + "time_per_iteration": 3.07255482673645 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.05045998, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.0407855091156377, + "language_loss": 0.77119517, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.234375, + "step": 1673, + "time_per_iteration": 2.4693222045898438 + }, + { + "auxiliary_loss_clip": 0.01171478, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.03066778, + "balance_loss_mlp": 1.04964709, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.2570599367002835, + "language_loss": 0.72829556, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75053144, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1674, + "time_per_iteration": 2.4534130096435547 + }, + { + "auxiliary_loss_clip": 0.01170516, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03047729, + "balance_loss_mlp": 1.04903197, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.043409325490185, + "language_loss": 0.79386973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81608635, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1675, + "time_per_iteration": 2.496504545211792 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_clip": 1.04449606, + "balance_loss_mlp": 1.04908013, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.0305638084579294, + "language_loss": 0.81565315, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1676, + "time_per_iteration": 2.5022919178009033 + }, + { + "auxiliary_loss_clip": 0.01174395, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.03713369, + "balance_loss_mlp": 1.05283856, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.134524944411931, + "language_loss": 0.86155027, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88388026, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2109375, + "step": 1677, + "time_per_iteration": 2.44887113571167 + }, + { + "auxiliary_loss_clip": 0.01171962, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.03452563, + "balance_loss_mlp": 1.05113602, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 5.349815535910457, + "language_loss": 0.86318195, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88545489, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2109375, + "step": 1678, + "time_per_iteration": 2.4373903274536133 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.03359675, + "balance_loss_mlp": 1.05214512, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.6897314721028867, + "language_loss": 0.89726269, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91953766, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1679, + "time_per_iteration": 2.493959903717041 + }, + { + "auxiliary_loss_clip": 0.01056795, + "auxiliary_loss_mlp": 0.01017317, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01327634, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7831657514235874, + "language_loss": 0.53018153, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55092263, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1680, + "time_per_iteration": 3.15899658203125 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.04983318, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.657625192327098, + "language_loss": 0.76889706, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79113436, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1681, + "time_per_iteration": 2.446937322616577 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.03423131, + "balance_loss_mlp": 1.04937744, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.135292201068385, + "language_loss": 0.93928307, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96162128, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.2578125, + "step": 1682, + "time_per_iteration": 2.4357759952545166 + }, + { + "auxiliary_loss_clip": 0.01172101, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.03315091, + "balance_loss_mlp": 1.05045152, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 5.112669241194533, + "language_loss": 0.87866408, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90092492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1683, + "time_per_iteration": 2.427802562713623 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.03888798, + "balance_loss_mlp": 1.05144525, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7718228637860187, + "language_loss": 0.74768114, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76997328, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1796875, + "step": 1684, + "time_per_iteration": 5.332470417022705 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01011499, + "balance_loss_clip": 1.00863802, + "balance_loss_mlp": 1.01624751, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.760003339390084, + "language_loss": 0.61090153, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6316117, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.43359375, + "step": 1685, + "time_per_iteration": 4.508171081542969 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.02769828, + "balance_loss_mlp": 1.04891801, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.3224629698824075, + "language_loss": 0.61664945, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63883317, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1686, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01173787, + "auxiliary_loss_mlp": 0.01054454, + "balance_loss_clip": 1.03238797, + "balance_loss_mlp": 1.0545882, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1992592502117443, + "language_loss": 0.81408226, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83636469, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1687, + "time_per_iteration": 2.5495810508728027 + }, + { + "auxiliary_loss_clip": 0.01173812, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.03055501, + "balance_loss_mlp": 1.0514555, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.783489688966995, + "language_loss": 0.72360015, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74585676, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1688, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.05043888, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9893327907397977, + "language_loss": 0.86880058, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8910439, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1875, + "step": 1689, + "time_per_iteration": 2.5283408164978027 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.04692245, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8972643802531153, + "language_loss": 0.88054395, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90265882, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1690, + "time_per_iteration": 2.5732247829437256 + }, + { + "auxiliary_loss_clip": 0.01170509, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.02961624, + "balance_loss_mlp": 1.04965854, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.8841763324380914, + "language_loss": 0.83124495, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85346603, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.203125, + "step": 1691, + "time_per_iteration": 2.453263282775879 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.028579, + "balance_loss_mlp": 1.05049825, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.648035623213742, + "language_loss": 0.66938514, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69161713, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1692, + "time_per_iteration": 2.5865867137908936 + }, + { + "auxiliary_loss_clip": 0.01167535, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_clip": 1.04540372, + "balance_loss_mlp": 1.0471102, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.649284734670808, + "language_loss": 0.75387824, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77622634, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1693, + "time_per_iteration": 2.499476194381714 + }, + { + "auxiliary_loss_clip": 0.01171507, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02685595, + "balance_loss_mlp": 1.04984784, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6930931596653784, + "language_loss": 0.87206519, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89427543, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1694, + "time_per_iteration": 2.483264923095703 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01052768, + "balance_loss_clip": 1.03013015, + "balance_loss_mlp": 1.05056214, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 3.1999162319303274, + "language_loss": 0.79579329, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81809288, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1695, + "time_per_iteration": 2.4574177265167236 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.04648614, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7546035908378184, + "language_loss": 0.86581397, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88805294, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1696, + "time_per_iteration": 2.4986772537231445 + }, + { + "auxiliary_loss_clip": 0.01168623, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.04927731, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.97196247739744, + "language_loss": 0.82034266, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84259629, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1953125, + "step": 1697, + "time_per_iteration": 2.483682155609131 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.0477041, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 1.9483747561194416, + "language_loss": 0.80650747, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82870358, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2265625, + "step": 1698, + "time_per_iteration": 2.4512858390808105 + }, + { + "auxiliary_loss_clip": 0.01172882, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.05113077, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 4.641294823605382, + "language_loss": 0.75680709, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77902329, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1699, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.01171952, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.02606726, + "balance_loss_mlp": 1.05093145, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.7623204527071121, + "language_loss": 0.79777479, + "learning_rate": 3.945552859553516e-06, + "loss": 0.81997555, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 1700, + "time_per_iteration": 2.4692423343658447 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.02411532, + "balance_loss_mlp": 1.04850125, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8827887870563835, + "language_loss": 0.76854098, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79070842, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1701, + "time_per_iteration": 2.5015852451324463 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 1.05213511, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1180628790190927, + "language_loss": 0.78123891, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80349147, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2421875, + "step": 1702, + "time_per_iteration": 2.4999852180480957 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01046251, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.0487566, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.3091523831758765, + "language_loss": 0.94838184, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97052652, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1953125, + "step": 1703, + "time_per_iteration": 2.4586100578308105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.01077867, + "balance_loss_mlp": 1.01462317, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8800585598511617, + "language_loss": 0.55092424, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57163775, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43554688, + "step": 1704, + "time_per_iteration": 2.998384952545166 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.02790844, + "balance_loss_mlp": 1.04962945, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 3.5257555777633174, + "language_loss": 0.83979154, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86200017, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2109375, + "step": 1705, + "time_per_iteration": 2.4242281913757324 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.01514411, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7733309182053202, + "language_loss": 0.60434854, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62497854, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.4296875, + "step": 1706, + "time_per_iteration": 3.127495765686035 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 1.05214357, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.0444921886168284, + "language_loss": 0.85967243, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.234375, + "step": 1707, + "time_per_iteration": 2.4486777782440186 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02654099, + "balance_loss_mlp": 1.04891372, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.6368034329364625, + "language_loss": 0.72840983, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75057685, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.2109375, + "step": 1708, + "time_per_iteration": 2.5019850730895996 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01054619, + "balance_loss_clip": 1.0325532, + "balance_loss_mlp": 1.0493356, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.9016884094819633, + "language_loss": 0.90944314, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93167639, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1953125, + "step": 1709, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01173643, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03357422, + "balance_loss_mlp": 1.05296373, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 3.826538703219267, + "language_loss": 0.8828221, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90510881, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1710, + "time_per_iteration": 2.533165216445923 + }, + { + "auxiliary_loss_clip": 0.01167248, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.04937959, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.824520485293549, + "language_loss": 0.79264998, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485879, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 1711, + "time_per_iteration": 2.4947102069854736 + }, + { + "auxiliary_loss_clip": 0.01171963, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.05005431, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.0689984646996016, + "language_loss": 0.73589319, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7581948, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1712, + "time_per_iteration": 2.521899461746216 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.04961872, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8460865361447714, + "language_loss": 0.86673403, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8889854, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1640625, + "step": 1713, + "time_per_iteration": 2.467824935913086 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01059926, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.04741335, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.6691144860495126, + "language_loss": 0.72610664, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74837124, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1714, + "time_per_iteration": 2.478605031967163 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03542209, + "balance_loss_mlp": 1.04920006, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.3323118637090605, + "language_loss": 0.91395295, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93626636, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2421875, + "step": 1715, + "time_per_iteration": 2.5223729610443115 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04737377, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.9302110224144968, + "language_loss": 0.75736755, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77957708, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1716, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.01171415, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.04601645, + "balance_loss_mlp": 1.04868793, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1161503252482747, + "language_loss": 0.85214567, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87454176, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1717, + "time_per_iteration": 2.500964879989624 + }, + { + "auxiliary_loss_clip": 0.01169937, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.05102515, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.0308520014155746, + "language_loss": 0.82883167, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85109091, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1875, + "step": 1718, + "time_per_iteration": 2.436836004257202 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.03058898, + "balance_loss_mlp": 1.05092025, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8725763890619624, + "language_loss": 0.73192763, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75414634, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1719, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01172065, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05197001, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.356604748076592, + "language_loss": 0.92601806, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94820189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.203125, + "step": 1720, + "time_per_iteration": 2.4628992080688477 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.04656935, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.8075298743139174, + "language_loss": 0.79416633, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81638062, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2109375, + "step": 1721, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01056525, + "balance_loss_clip": 1.03317165, + "balance_loss_mlp": 1.05172479, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.496468299898097, + "language_loss": 0.80755401, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82988858, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.25, + "step": 1722, + "time_per_iteration": 2.4676520824432373 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.04772782, + "balance_loss_mlp": 1.013726, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9564367479099696, + "language_loss": 0.67185652, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69292337, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.4296875, + "step": 1723, + "time_per_iteration": 2.8474721908569336 + }, + { + "auxiliary_loss_clip": 0.01170693, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.04747462, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.780632359822339, + "language_loss": 0.77922273, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1724, + "time_per_iteration": 2.4311840534210205 + }, + { + "auxiliary_loss_clip": 0.01175556, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.05101144, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8180629527722856, + "language_loss": 0.74894094, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77122545, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1725, + "time_per_iteration": 2.6802284717559814 + }, + { + "auxiliary_loss_clip": 0.01170353, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.02852905, + "balance_loss_mlp": 1.05098462, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.4392097975248244, + "language_loss": 0.75290418, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77510113, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1726, + "time_per_iteration": 5.461729049682617 + }, + { + "auxiliary_loss_clip": 0.01174745, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03765321, + "balance_loss_mlp": 1.0527426, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8824890959349092, + "language_loss": 0.73943913, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76178271, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1727, + "time_per_iteration": 3.883134126663208 + }, + { + "auxiliary_loss_clip": 0.01169505, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03023946, + "balance_loss_mlp": 1.04815936, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.187385195417556, + "language_loss": 0.84670323, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86891311, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1728, + "time_per_iteration": 2.4405598640441895 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.02980709, + "balance_loss_mlp": 1.05098438, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4528097766615677, + "language_loss": 0.70985407, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73207992, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1729, + "time_per_iteration": 2.465688467025757 + }, + { + "auxiliary_loss_clip": 0.01170997, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.05000722, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5788681057232625, + "language_loss": 0.81288344, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.8351925, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1730, + "time_per_iteration": 2.4582717418670654 + }, + { + "auxiliary_loss_clip": 0.01167657, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.03190255, + "balance_loss_mlp": 1.04836845, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 2.1021084439253723, + "language_loss": 0.75932384, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78151548, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1953125, + "step": 1731, + "time_per_iteration": 2.4650096893310547 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.04899907, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8082651510271561, + "language_loss": 0.82679468, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84891117, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1732, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01169252, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.02881873, + "balance_loss_mlp": 1.05052853, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.755876599624297, + "language_loss": 0.82947195, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85164732, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1733, + "time_per_iteration": 2.4426257610321045 + }, + { + "auxiliary_loss_clip": 0.01171007, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.04982805, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4971959439308336, + "language_loss": 0.76446331, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78669679, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.2109375, + "step": 1734, + "time_per_iteration": 2.4556663036346436 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.02795696, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9861442095390862, + "language_loss": 0.74962163, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1796875, + "step": 1735, + "time_per_iteration": 2.4961798191070557 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.02724743, + "balance_loss_mlp": 1.05081487, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9829662552727403, + "language_loss": 0.79049939, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8127073, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1736, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.0491184, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8105684861006923, + "language_loss": 0.70339012, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72563159, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.171875, + "step": 1737, + "time_per_iteration": 2.4789419174194336 + }, + { + "auxiliary_loss_clip": 0.01170601, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.02758932, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.004598680960266, + "language_loss": 0.81483257, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83704984, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.234375, + "step": 1738, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.0116919, + "auxiliary_loss_mlp": 0.01058357, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.04712963, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 4.442978598454381, + "language_loss": 0.750579, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77285445, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1739, + "time_per_iteration": 2.4544031620025635 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.0349865, + "balance_loss_mlp": 1.04893625, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.676051388115223, + "language_loss": 0.77279431, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79503429, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1740, + "time_per_iteration": 2.489302635192871 + }, + { + "auxiliary_loss_clip": 0.01169756, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.02820003, + "balance_loss_mlp": 1.05093944, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.1911967502326775, + "language_loss": 0.85983682, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88201964, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1741, + "time_per_iteration": 2.4571211338043213 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03091609, + "balance_loss_mlp": 1.04901385, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 4.086245960730198, + "language_loss": 0.74991679, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77216244, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1742, + "time_per_iteration": 2.4919426441192627 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_clip": 1.03914368, + "balance_loss_mlp": 1.05323386, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9550995481311175, + "language_loss": 0.87150526, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89381945, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1743, + "time_per_iteration": 2.470841884613037 + }, + { + "auxiliary_loss_clip": 0.01168292, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.02760363, + "balance_loss_mlp": 1.04964471, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.1176645115958923, + "language_loss": 0.75532508, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77750671, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1744, + "time_per_iteration": 2.4725873470306396 + }, + { + "auxiliary_loss_clip": 0.01171079, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.05184436, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.151699961275852, + "language_loss": 0.79306591, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530583, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1745, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_clip": 1.04194999, + "balance_loss_mlp": 1.047683, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.087314316255438, + "language_loss": 0.82382894, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8461262, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1746, + "time_per_iteration": 2.520306348800659 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01062461, + "balance_loss_clip": 1.04186153, + "balance_loss_mlp": 1.05198646, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.645771273172373, + "language_loss": 0.69951761, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7218436, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1747, + "time_per_iteration": 2.618581771850586 + }, + { + "auxiliary_loss_clip": 0.01176288, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04140496, + "balance_loss_mlp": 1.05136323, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.3385484358742192, + "language_loss": 0.84245849, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86484385, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1748, + "time_per_iteration": 2.539386034011841 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.02797103, + "balance_loss_mlp": 1.04729426, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.8953667439120294, + "language_loss": 0.71491921, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7370674, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1796875, + "step": 1749, + "time_per_iteration": 2.481700897216797 + }, + { + "auxiliary_loss_clip": 0.01166695, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.04953468, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.9760411129591238, + "language_loss": 0.81960011, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84187424, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1750, + "time_per_iteration": 2.4454832077026367 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.05259562, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3402404294313524, + "language_loss": 0.91871023, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94105875, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1751, + "time_per_iteration": 2.416607141494751 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.02698207, + "balance_loss_mlp": 1.04889047, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.265296057434122, + "language_loss": 0.79560149, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81774485, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1796875, + "step": 1752, + "time_per_iteration": 2.46063494682312 + }, + { + "auxiliary_loss_clip": 0.01167711, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.03202033, + "balance_loss_mlp": 1.05050862, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.1401152378303867, + "language_loss": 0.75782037, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78002656, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1753, + "time_per_iteration": 2.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01172527, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.04939532, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0790136174876546, + "language_loss": 0.84048498, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86278164, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.234375, + "step": 1754, + "time_per_iteration": 2.4683756828308105 + }, + { + "auxiliary_loss_clip": 0.01175207, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.05438888, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8736094439376645, + "language_loss": 0.68956709, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71185535, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1755, + "time_per_iteration": 2.45597243309021 + }, + { + "auxiliary_loss_clip": 0.01172827, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.03989983, + "balance_loss_mlp": 1.05102587, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 5.502613786824721, + "language_loss": 0.76718754, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78953344, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1756, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.03712726, + "balance_loss_mlp": 1.04982626, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7784869470084927, + "language_loss": 0.80162531, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82392669, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1757, + "time_per_iteration": 2.4551706314086914 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.03499317, + "balance_loss_mlp": 1.05132246, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.631431596421375, + "language_loss": 0.78800333, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81028521, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1758, + "time_per_iteration": 2.7955896854400635 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_clip": 1.02865982, + "balance_loss_mlp": 1.05364573, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.609159841262955, + "language_loss": 0.9189958, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94127536, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.234375, + "step": 1759, + "time_per_iteration": 2.4853782653808594 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.03276575, + "balance_loss_mlp": 1.04970741, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.498568213886603, + "language_loss": 0.76932353, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79161119, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.234375, + "step": 1760, + "time_per_iteration": 2.470705509185791 + }, + { + "auxiliary_loss_clip": 0.01173982, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.03825736, + "balance_loss_mlp": 1.05152941, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.349800445259612, + "language_loss": 0.89282435, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91517675, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1761, + "time_per_iteration": 2.491501569747925 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0518589, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.4794664397863877, + "language_loss": 0.78304708, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80538261, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1762, + "time_per_iteration": 2.5563831329345703 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01014008, + "balance_loss_clip": 1.0110991, + "balance_loss_mlp": 1.02000487, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.753444103392694, + "language_loss": 0.60481733, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62557811, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.421875, + "step": 1763, + "time_per_iteration": 3.2239294052124023 + }, + { + "auxiliary_loss_clip": 0.01170891, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02777529, + "balance_loss_mlp": 1.04924011, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.054980370260194, + "language_loss": 0.8010751, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82327372, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1764, + "time_per_iteration": 2.476325273513794 + }, + { + "auxiliary_loss_clip": 0.01169028, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.02745855, + "balance_loss_mlp": 1.04961264, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7621956234955212, + "language_loss": 0.7999962, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82217997, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1765, + "time_per_iteration": 2.446593999862671 + }, + { + "auxiliary_loss_clip": 0.01167126, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.03394008, + "balance_loss_mlp": 1.04794002, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.867239621884004, + "language_loss": 0.76693732, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78915727, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1766, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01170332, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.05017042, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6580981789618001, + "language_loss": 0.77319431, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1767, + "time_per_iteration": 2.542797088623047 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01000453, + "balance_loss_clip": 0.99785471, + "balance_loss_mlp": 1.01804066, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6789245534488961, + "language_loss": 0.57902765, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59963286, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.421875, + "step": 1768, + "time_per_iteration": 6.071596384048462 + }, + { + "auxiliary_loss_clip": 0.01172748, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.0219171, + "balance_loss_mlp": 1.05201912, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.446404125156032, + "language_loss": 0.86796767, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89011335, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1769, + "time_per_iteration": 2.5106868743896484 + }, + { + "auxiliary_loss_clip": 0.01175908, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.05300689, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.766851816283336, + "language_loss": 0.61890501, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64123213, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1770, + "time_per_iteration": 2.5770323276519775 + }, + { + "auxiliary_loss_clip": 0.01061292, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00058925, + "balance_loss_mlp": 1.01873469, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8864779346546747, + "language_loss": 0.57095039, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59159505, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.42578125, + "step": 1771, + "time_per_iteration": 2.957993507385254 + }, + { + "auxiliary_loss_clip": 0.01174087, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.05443954, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6398085638646198, + "language_loss": 0.88530469, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90767658, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1772, + "time_per_iteration": 2.520744562149048 + }, + { + "auxiliary_loss_clip": 0.01176768, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.05091381, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.8236986107629094, + "language_loss": 0.76021719, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78262091, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.2578125, + "step": 1773, + "time_per_iteration": 2.4228129386901855 + }, + { + "auxiliary_loss_clip": 0.01171647, + "auxiliary_loss_mlp": 0.01063224, + "balance_loss_clip": 1.04087257, + "balance_loss_mlp": 1.05147731, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1931291175477177, + "language_loss": 0.83184093, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85418963, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1774, + "time_per_iteration": 2.5613787174224854 + }, + { + "auxiliary_loss_clip": 0.01177598, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.05220413, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.683505024819064, + "language_loss": 0.76297373, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78529418, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.25, + "step": 1775, + "time_per_iteration": 2.437676429748535 + }, + { + "auxiliary_loss_clip": 0.01057587, + "auxiliary_loss_mlp": 0.01006639, + "balance_loss_clip": 1.00413537, + "balance_loss_mlp": 1.01520467, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8253045983972309, + "language_loss": 0.57443953, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59508181, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.42382812, + "step": 1776, + "time_per_iteration": 3.101378917694092 + }, + { + "auxiliary_loss_clip": 0.01176962, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.0365653, + "balance_loss_mlp": 1.05411029, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6481869723516467, + "language_loss": 0.83374244, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8561098, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2265625, + "step": 1777, + "time_per_iteration": 2.5109002590179443 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.03371584, + "balance_loss_mlp": 1.05298579, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.6420984425067013, + "language_loss": 0.87275863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501154, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1778, + "time_per_iteration": 2.503103494644165 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.032354, + "balance_loss_mlp": 1.05328, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.213225731734914, + "language_loss": 0.83970487, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86199337, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1779, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01169562, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.03086162, + "balance_loss_mlp": 1.04975557, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.4959309518827655, + "language_loss": 0.67064941, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69286621, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1780, + "time_per_iteration": 2.447756052017212 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046888, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.05183458, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.25546613947904, + "language_loss": 0.91667759, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93886495, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1781, + "time_per_iteration": 2.4367144107818604 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01046071, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 1.05302, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.202402738572802, + "language_loss": 0.79505372, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81726873, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2265625, + "step": 1782, + "time_per_iteration": 2.4340877532958984 + }, + { + "auxiliary_loss_clip": 0.01175468, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.03055024, + "balance_loss_mlp": 1.0517509, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0445491568240994, + "language_loss": 0.78994977, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81222689, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.234375, + "step": 1783, + "time_per_iteration": 2.434527635574341 + }, + { + "auxiliary_loss_clip": 0.01176375, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.02991986, + "balance_loss_mlp": 1.0529108, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8050343336808015, + "language_loss": 0.85956216, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88184798, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1784, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.01174134, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.05080986, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.0774406347184806, + "language_loss": 1.00899053, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03127265, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.234375, + "step": 1785, + "time_per_iteration": 2.46663498878479 + }, + { + "auxiliary_loss_clip": 0.01171119, + "auxiliary_loss_mlp": 0.01057254, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05306709, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4495948735276882, + "language_loss": 0.85070992, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87299371, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1796875, + "step": 1786, + "time_per_iteration": 2.505018949508667 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.04750311, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8788886178726656, + "language_loss": 0.78817046, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81046188, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1787, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01176938, + "auxiliary_loss_mlp": 0.01049821, + "balance_loss_clip": 1.02768385, + "balance_loss_mlp": 1.0517112, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.551869220071384, + "language_loss": 0.82557851, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84784609, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.25, + "step": 1788, + "time_per_iteration": 2.4807305335998535 + }, + { + "auxiliary_loss_clip": 0.01170019, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.04939878, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.778852512980128, + "language_loss": 0.77794182, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80027628, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1789, + "time_per_iteration": 2.482330322265625 + }, + { + "auxiliary_loss_clip": 0.01173111, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.05133712, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.434124451319009, + "language_loss": 0.74467903, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76702261, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.21875, + "step": 1790, + "time_per_iteration": 2.5921239852905273 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.05428767, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.5839507236364554, + "language_loss": 0.78495383, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80745554, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.234375, + "step": 1791, + "time_per_iteration": 2.5242488384246826 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01053897, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.05112934, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.8605555947944812, + "language_loss": 0.70855284, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73076522, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1792, + "time_per_iteration": 2.5260751247406006 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.02330506, + "balance_loss_mlp": 1.05109024, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.973355145299492, + "language_loss": 0.76029646, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78251767, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1793, + "time_per_iteration": 2.5037007331848145 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.03793848, + "balance_loss_mlp": 1.0537113, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7251623627880495, + "language_loss": 0.85158944, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87391031, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1794, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01180393, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.02954292, + "balance_loss_mlp": 1.05342674, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9245153565321482, + "language_loss": 0.74914879, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77148265, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.265625, + "step": 1795, + "time_per_iteration": 2.486111879348755 + }, + { + "auxiliary_loss_clip": 0.0117609, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.03863525, + "balance_loss_mlp": 1.05227423, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.917857918230487, + "language_loss": 0.8116014, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83397192, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1796, + "time_per_iteration": 2.4357504844665527 + }, + { + "auxiliary_loss_clip": 0.01177296, + "auxiliary_loss_mlp": 0.01075942, + "balance_loss_clip": 1.05260134, + "balance_loss_mlp": 1.05476594, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.4043777768562293, + "language_loss": 0.73476732, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75729972, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1797, + "time_per_iteration": 2.477867841720581 + }, + { + "auxiliary_loss_clip": 0.01172695, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.03081274, + "balance_loss_mlp": 1.05260658, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 3.1892188654982396, + "language_loss": 0.81348622, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83572453, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1798, + "time_per_iteration": 2.5060064792633057 + }, + { + "auxiliary_loss_clip": 0.011719, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03075755, + "balance_loss_mlp": 1.0508821, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4195393058725623, + "language_loss": 0.85180116, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87405908, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2109375, + "step": 1799, + "time_per_iteration": 2.4546945095062256 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.04271412, + "balance_loss_mlp": 1.0546999, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.2474252534922265, + "language_loss": 0.77365196, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79602301, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.1796875, + "step": 1800, + "time_per_iteration": 2.4650769233703613 + }, + { + "auxiliary_loss_clip": 0.01168665, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.02443254, + "balance_loss_mlp": 1.05136347, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 2.2954016650766844, + "language_loss": 0.7287963, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7509284, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1801, + "time_per_iteration": 2.5045113563537598 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.05259442, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.8364602771794378, + "language_loss": 0.66427058, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68653458, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1802, + "time_per_iteration": 2.5547947883605957 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.05202222, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7898565484043845, + "language_loss": 0.8136133, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83590758, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1803, + "time_per_iteration": 2.4758658409118652 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.031106, + "balance_loss_mlp": 1.05126929, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 2.61974519761109, + "language_loss": 0.9122982, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93452168, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1875, + "step": 1804, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.01175328, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03031349, + "balance_loss_mlp": 1.05401301, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0091269076806078, + "language_loss": 0.7623654, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78464663, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1805, + "time_per_iteration": 2.5379836559295654 + }, + { + "auxiliary_loss_clip": 0.01172079, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.0535754, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8192828849331855, + "language_loss": 0.860416, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88261837, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1806, + "time_per_iteration": 2.5523955821990967 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01053704, + "balance_loss_clip": 1.03275895, + "balance_loss_mlp": 1.05068612, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 5.439462316727856, + "language_loss": 0.80572915, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82797557, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1807, + "time_per_iteration": 2.514390230178833 + }, + { + "auxiliary_loss_clip": 0.01171878, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.05415583, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7684897552837426, + "language_loss": 0.78731525, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80954707, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.171875, + "step": 1808, + "time_per_iteration": 2.5084331035614014 + }, + { + "auxiliary_loss_clip": 0.01176105, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.03090501, + "balance_loss_mlp": 1.05633223, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6609588216066864, + "language_loss": 0.78927523, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81155634, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1953125, + "step": 1809, + "time_per_iteration": 5.368049621582031 + }, + { + "auxiliary_loss_clip": 0.01171492, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.03318286, + "balance_loss_mlp": 1.05087388, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0370215842844197, + "language_loss": 0.8468523, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86910635, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1810, + "time_per_iteration": 3.904432535171509 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.03111291, + "balance_loss_mlp": 1.05665135, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.9531179942167565, + "language_loss": 0.63677633, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6591261, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.234375, + "step": 1811, + "time_per_iteration": 2.523650646209717 + }, + { + "auxiliary_loss_clip": 0.01171345, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.02737319, + "balance_loss_mlp": 1.05139136, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.279966127836369, + "language_loss": 0.74238914, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76459008, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1812, + "time_per_iteration": 2.5579042434692383 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.05391026, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.9314487748153213, + "language_loss": 0.72647583, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74868566, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1875, + "step": 1813, + "time_per_iteration": 2.488762617111206 + }, + { + "auxiliary_loss_clip": 0.01174675, + "auxiliary_loss_mlp": 0.01051455, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.05744648, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 10.097396236718186, + "language_loss": 0.82224226, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84450358, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1814, + "time_per_iteration": 2.495798349380493 + }, + { + "auxiliary_loss_clip": 0.01176897, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.05595291, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3551509805271422, + "language_loss": 0.84218144, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86452949, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2109375, + "step": 1815, + "time_per_iteration": 2.462663173675537 + }, + { + "auxiliary_loss_clip": 0.01175955, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.03054035, + "balance_loss_mlp": 1.05833483, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3954928768695027, + "language_loss": 0.71048725, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73277813, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.171875, + "step": 1816, + "time_per_iteration": 2.465953826904297 + }, + { + "auxiliary_loss_clip": 0.01178612, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.04061651, + "balance_loss_mlp": 1.056674, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0063973144433067, + "language_loss": 0.72811669, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75053406, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1817, + "time_per_iteration": 2.5323143005371094 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_clip": 1.03167605, + "balance_loss_mlp": 1.05709267, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.767365755633268, + "language_loss": 0.67279243, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6951232, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1818, + "time_per_iteration": 2.5450243949890137 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.04123521, + "balance_loss_mlp": 1.05534315, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.0802139312896744, + "language_loss": 0.72992313, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75232661, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1819, + "time_per_iteration": 2.487644910812378 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02994883, + "balance_loss_mlp": 1.06090236, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 7.240077427900601, + "language_loss": 0.73943537, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76175541, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.203125, + "step": 1820, + "time_per_iteration": 2.5064899921417236 + }, + { + "auxiliary_loss_clip": 0.01177081, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02765381, + "balance_loss_mlp": 1.05699766, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.1677198782015887, + "language_loss": 0.82586408, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84814322, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 1821, + "time_per_iteration": 2.4487218856811523 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.0280906, + "balance_loss_mlp": 1.05549288, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.4783722356243065, + "language_loss": 0.76171732, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78395414, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1822, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01175357, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.03070986, + "balance_loss_mlp": 1.05751145, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.9066217775511896, + "language_loss": 0.79275787, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81502879, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1796875, + "step": 1823, + "time_per_iteration": 2.5665249824523926 + }, + { + "auxiliary_loss_clip": 0.01176588, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.02583015, + "balance_loss_mlp": 1.05788529, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.7066251744315906, + "language_loss": 0.79424715, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81649172, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1824, + "time_per_iteration": 2.5238118171691895 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.05610347, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.2183246130345, + "language_loss": 0.87992203, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90220273, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1825, + "time_per_iteration": 2.48294734954834 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.05362988, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8404731426595848, + "language_loss": 0.76462233, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78689909, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1826, + "time_per_iteration": 2.6397035121917725 + }, + { + "auxiliary_loss_clip": 0.01066703, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 0.9983961, + "balance_loss_mlp": 1.02257371, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8361632453995619, + "language_loss": 0.54999328, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57067442, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.44140625, + "step": 1827, + "time_per_iteration": 3.065896511077881 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01003719, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.02098036, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7348311418426204, + "language_loss": 0.55346334, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57414544, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.43359375, + "step": 1828, + "time_per_iteration": 3.0850460529327393 + }, + { + "auxiliary_loss_clip": 0.01180205, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.05754089, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.992065013624077, + "language_loss": 0.84191215, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86435115, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2265625, + "step": 1829, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01057367, + "balance_loss_clip": 1.03348923, + "balance_loss_mlp": 1.05845475, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.654516298718269, + "language_loss": 0.8878119, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91019976, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2265625, + "step": 1830, + "time_per_iteration": 2.6912100315093994 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01017752, + "balance_loss_clip": 1.01497495, + "balance_loss_mlp": 1.01824236, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6883241829767079, + "language_loss": 0.55492055, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570827, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.42773438, + "step": 1831, + "time_per_iteration": 3.075678825378418 + }, + { + "auxiliary_loss_clip": 0.01183643, + "auxiliary_loss_mlp": 0.01072422, + "balance_loss_clip": 1.04829443, + "balance_loss_mlp": 1.05867732, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.054835171188452, + "language_loss": 0.90726995, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92983055, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.25, + "step": 1832, + "time_per_iteration": 2.5084948539733887 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01015171, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01603723, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7159549093535102, + "language_loss": 0.59889859, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61963969, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.4296875, + "step": 1833, + "time_per_iteration": 3.0748977661132812 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_clip": 1.0277946, + "balance_loss_mlp": 1.05353165, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6030857455850303, + "language_loss": 0.8095156, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83177137, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1834, + "time_per_iteration": 2.452131509780884 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.0302285, + "balance_loss_mlp": 1.05899858, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.5262438386564807, + "language_loss": 0.90514123, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9274807, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2109375, + "step": 1835, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01177237, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.03445005, + "balance_loss_mlp": 1.05625033, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.0785934228774003, + "language_loss": 0.63590646, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65826416, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2109375, + "step": 1836, + "time_per_iteration": 2.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01173672, + "auxiliary_loss_mlp": 0.01057372, + "balance_loss_clip": 1.03547311, + "balance_loss_mlp": 1.05388379, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9330421575083043, + "language_loss": 0.72814602, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75045645, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1837, + "time_per_iteration": 2.594910144805908 + }, + { + "auxiliary_loss_clip": 0.01179947, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.132041599419941, + "language_loss": 0.79049784, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81289005, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1838, + "time_per_iteration": 2.4922690391540527 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.05623114, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 4.130442583787946, + "language_loss": 0.71453696, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73690271, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1839, + "time_per_iteration": 2.5151031017303467 + }, + { + "auxiliary_loss_clip": 0.01183988, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.03082371, + "balance_loss_mlp": 1.05938148, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.064820600929851, + "language_loss": 0.79099703, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1840, + "time_per_iteration": 2.487116575241089 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01053863, + "balance_loss_clip": 1.03234553, + "balance_loss_mlp": 1.05329347, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 2.0204098875762293, + "language_loss": 0.87691998, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89917111, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1796875, + "step": 1841, + "time_per_iteration": 2.5141868591308594 + }, + { + "auxiliary_loss_clip": 0.01170262, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.03126192, + "balance_loss_mlp": 1.05365491, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.3404569451138535, + "language_loss": 0.90582979, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92806768, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1842, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.0117179, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.03080761, + "balance_loss_mlp": 1.05210185, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.171204868901281, + "language_loss": 0.85597986, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87821329, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1843, + "time_per_iteration": 2.4801278114318848 + }, + { + "auxiliary_loss_clip": 0.01174004, + "auxiliary_loss_mlp": 0.01060021, + "balance_loss_clip": 1.03645349, + "balance_loss_mlp": 1.05622911, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.47825888700324, + "language_loss": 0.7494424, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77178264, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1796875, + "step": 1844, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01173241, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.0304563, + "balance_loss_mlp": 1.05405343, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0036363505702433, + "language_loss": 0.75732028, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77959603, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.1953125, + "step": 1845, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03341389, + "balance_loss_mlp": 1.05351365, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.6129010657048202, + "language_loss": 0.76336479, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7856214, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.171875, + "step": 1846, + "time_per_iteration": 2.465045928955078 + }, + { + "auxiliary_loss_clip": 0.01175917, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.03262115, + "balance_loss_mlp": 1.05392015, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.9965527726637577, + "language_loss": 0.85611343, + "learning_rate": 3.931489981933584e-06, + "loss": 0.87841111, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1847, + "time_per_iteration": 2.4493908882141113 + }, + { + "auxiliary_loss_clip": 0.01174539, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03018796, + "balance_loss_mlp": 1.05326366, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.3740806549350086, + "language_loss": 0.76464605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78692293, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.2109375, + "step": 1848, + "time_per_iteration": 2.4647536277770996 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02754378, + "balance_loss_mlp": 1.05833888, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 2.0750561163348173, + "language_loss": 0.77849847, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8007198, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1849, + "time_per_iteration": 2.514777660369873 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.03496861, + "balance_loss_mlp": 1.05422294, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.6662643697478066, + "language_loss": 0.71315688, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73548102, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1850, + "time_per_iteration": 2.4420053958892822 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.05444217, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2721050151861912, + "language_loss": 0.81174368, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83405614, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 1851, + "time_per_iteration": 5.341679811477661 + }, + { + "auxiliary_loss_clip": 0.01173679, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.03126621, + "balance_loss_mlp": 1.05519962, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.240427658931177, + "language_loss": 0.88860446, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91085827, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1852, + "time_per_iteration": 3.8281352519989014 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.03514326, + "balance_loss_mlp": 1.05636191, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0685366180695848, + "language_loss": 0.72092974, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74327302, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1853, + "time_per_iteration": 2.4896738529205322 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 0.99923038, + "balance_loss_mlp": 1.0132798, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7783537669608381, + "language_loss": 0.53647029, + "learning_rate": 3.930780749680273e-06, + "loss": 0.5570485, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.42578125, + "step": 1854, + "time_per_iteration": 3.0189781188964844 + }, + { + "auxiliary_loss_clip": 0.01184355, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02937746, + "balance_loss_mlp": 1.057657, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.006296213399466, + "language_loss": 0.8394689, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.861835, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.265625, + "step": 1855, + "time_per_iteration": 2.4908485412597656 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.0106694, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.05353498, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2091175797191815, + "language_loss": 0.82098675, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84339261, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.203125, + "step": 1856, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02631092, + "balance_loss_mlp": 1.05662763, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.9605277294776, + "language_loss": 0.8305279, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85274535, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1857, + "time_per_iteration": 2.5205907821655273 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.0279119, + "balance_loss_mlp": 1.05195725, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3309612964817923, + "language_loss": 0.83037764, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85260725, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.21875, + "step": 1858, + "time_per_iteration": 2.4567432403564453 + }, + { + "auxiliary_loss_clip": 0.01175678, + "auxiliary_loss_mlp": 0.01062921, + "balance_loss_clip": 1.04205894, + "balance_loss_mlp": 1.05549788, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.004830650729854, + "language_loss": 0.91120583, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93359184, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1859, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.02983618, + "balance_loss_mlp": 1.05344319, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.4768392741235306, + "language_loss": 0.81709313, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83934522, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1860, + "time_per_iteration": 2.4747087955474854 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.0361197, + "balance_loss_mlp": 1.05388653, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.1256274007234937, + "language_loss": 0.75203162, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77430284, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1861, + "time_per_iteration": 2.4773240089416504 + }, + { + "auxiliary_loss_clip": 0.01169857, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.03318143, + "balance_loss_mlp": 1.05338371, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.0016824982414776, + "language_loss": 0.88759935, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90982509, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1640625, + "step": 1862, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01173358, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.05597067, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1858127473987015, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89302975, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 1863, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051873, + "balance_loss_clip": 1.0283289, + "balance_loss_mlp": 1.05463171, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0887108243102976, + "language_loss": 0.64630157, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66856015, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.1953125, + "step": 1864, + "time_per_iteration": 2.4843807220458984 + }, + { + "auxiliary_loss_clip": 0.01171142, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.03169096, + "balance_loss_mlp": 1.05504417, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0715232833306874, + "language_loss": 0.73895639, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76117796, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1865, + "time_per_iteration": 2.4509596824645996 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02928007, + "balance_loss_mlp": 1.05253589, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.190736679244475, + "language_loss": 0.84019023, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86240977, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 1866, + "time_per_iteration": 2.473715305328369 + }, + { + "auxiliary_loss_clip": 0.01169711, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.02737069, + "balance_loss_mlp": 1.05260134, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5419857436109028, + "language_loss": 0.81424987, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83643156, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1867, + "time_per_iteration": 2.5367391109466553 + }, + { + "auxiliary_loss_clip": 0.01172987, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.05594015, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.5308159777425976, + "language_loss": 0.86677599, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88905597, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1868, + "time_per_iteration": 2.5044100284576416 + }, + { + "auxiliary_loss_clip": 0.01172172, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.05724931, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.333499600894065, + "language_loss": 0.68059367, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70281279, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.1484375, + "step": 1869, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01061629, + "balance_loss_clip": 1.03969407, + "balance_loss_mlp": 1.05456114, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 2.049754856307833, + "language_loss": 0.7735095, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79589236, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1870, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.03177094, + "balance_loss_mlp": 1.05264199, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8085683914823212, + "language_loss": 0.75747174, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77974463, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1871, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.05448031, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.4822417703451265, + "language_loss": 0.81949306, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84170687, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.171875, + "step": 1872, + "time_per_iteration": 2.4984912872314453 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.05497694, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7250665555581937, + "language_loss": 0.83564019, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85789096, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1796875, + "step": 1873, + "time_per_iteration": 2.480172872543335 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.05352998, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.2103217259008985, + "language_loss": 0.91925669, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9415459, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1874, + "time_per_iteration": 2.5193865299224854 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.0528394, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5656160151577971, + "language_loss": 0.7534616, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.15625, + "step": 1875, + "time_per_iteration": 2.509000062942505 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.05498421, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.875753927893446, + "language_loss": 0.71727258, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73950088, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1328125, + "step": 1876, + "time_per_iteration": 2.5222911834716797 + }, + { + "auxiliary_loss_clip": 0.01170022, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.03036463, + "balance_loss_mlp": 1.05574679, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.408917627715415, + "language_loss": 0.76760256, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78981495, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 1877, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01173931, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.05530918, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.569804002246691, + "language_loss": 0.88132238, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1878, + "time_per_iteration": 2.4562089443206787 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.02628088, + "balance_loss_mlp": 1.05382609, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2792620862185036, + "language_loss": 0.81521666, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83739763, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.171875, + "step": 1879, + "time_per_iteration": 2.515162944793701 + }, + { + "auxiliary_loss_clip": 0.01174903, + "auxiliary_loss_mlp": 0.01056113, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.05591071, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.9729184409385376, + "language_loss": 0.70101768, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72332788, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1880, + "time_per_iteration": 2.5420267581939697 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.05396068, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.7442831242084353, + "language_loss": 0.72337204, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74552047, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1881, + "time_per_iteration": 2.4648680686950684 + }, + { + "auxiliary_loss_clip": 0.01172977, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.03452694, + "balance_loss_mlp": 1.05385113, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4452990726029533, + "language_loss": 0.74243963, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76474178, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1953125, + "step": 1882, + "time_per_iteration": 2.459181547164917 + }, + { + "auxiliary_loss_clip": 0.01171271, + "auxiliary_loss_mlp": 0.01052266, + "balance_loss_clip": 1.03045106, + "balance_loss_mlp": 1.05493677, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.8641228673356873, + "language_loss": 0.79328096, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81551635, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 1883, + "time_per_iteration": 2.5236945152282715 + }, + { + "auxiliary_loss_clip": 0.01173507, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.03271067, + "balance_loss_mlp": 1.05288672, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.0524763398538193, + "language_loss": 0.77151698, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79379749, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1884, + "time_per_iteration": 2.4974489212036133 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01012749, + "balance_loss_clip": 1.0102694, + "balance_loss_mlp": 1.02156711, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7928734254501784, + "language_loss": 0.55183071, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5725978, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.42382812, + "step": 1885, + "time_per_iteration": 2.9756290912628174 + }, + { + "auxiliary_loss_clip": 0.01166272, + "auxiliary_loss_mlp": 0.01047922, + "balance_loss_clip": 1.02758515, + "balance_loss_mlp": 1.0534817, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 5.752063942495911, + "language_loss": 0.90240276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92454469, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 1886, + "time_per_iteration": 2.5031139850616455 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03018832, + "balance_loss_mlp": 1.05306387, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0267704425546036, + "language_loss": 0.85101235, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87321353, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1887, + "time_per_iteration": 2.5177412033081055 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01061982, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.05554259, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.5783153731033055, + "language_loss": 0.76168925, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1888, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.03542566, + "balance_loss_mlp": 1.05632472, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.114301103868513, + "language_loss": 0.68039739, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70275331, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.21875, + "step": 1889, + "time_per_iteration": 2.643867015838623 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.05620956, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.158184033346157, + "language_loss": 0.84414917, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86635208, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 1890, + "time_per_iteration": 2.5018270015716553 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02855039, + "balance_loss_mlp": 1.05288363, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.2859967152973373, + "language_loss": 0.65099049, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67317504, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 1891, + "time_per_iteration": 2.4870762825012207 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.05397856, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.358390081637715, + "language_loss": 0.87789619, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90005904, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1953125, + "step": 1892, + "time_per_iteration": 2.469215154647827 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.04509139, + "balance_loss_mlp": 1.05419993, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4185703679999775, + "language_loss": 0.72724342, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7496407, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 1893, + "time_per_iteration": 4.021688222885132 + }, + { + "auxiliary_loss_clip": 0.01169367, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.05175805, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.254020248775613, + "language_loss": 0.79367435, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81595069, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.171875, + "step": 1894, + "time_per_iteration": 3.9190711975097656 + }, + { + "auxiliary_loss_clip": 0.01176791, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.05530715, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.587114905294773, + "language_loss": 0.78868139, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.21875, + "step": 1895, + "time_per_iteration": 2.5924861431121826 + }, + { + "auxiliary_loss_clip": 0.0106161, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 0.99917758, + "balance_loss_mlp": 1.01840448, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8005582337036792, + "language_loss": 0.63316774, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65380025, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43359375, + "step": 1896, + "time_per_iteration": 3.143843412399292 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_clip": 1.03600097, + "balance_loss_mlp": 1.05385494, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6058527618620146, + "language_loss": 0.84707338, + "learning_rate": 3.926345380796821e-06, + "loss": 0.86936897, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.15625, + "step": 1897, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.0117262, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.03159046, + "balance_loss_mlp": 1.05385423, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3286063431421926, + "language_loss": 0.79776239, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8200193, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1875, + "step": 1898, + "time_per_iteration": 2.5186216831207275 + }, + { + "auxiliary_loss_clip": 0.01174476, + "auxiliary_loss_mlp": 0.01056562, + "balance_loss_clip": 1.03330398, + "balance_loss_mlp": 1.05247831, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.996095488823442, + "language_loss": 0.73049861, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75280899, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1899, + "time_per_iteration": 2.484767198562622 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.0167762, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9092154832512579, + "language_loss": 0.63432097, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65496433, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.4296875, + "step": 1900, + "time_per_iteration": 3.0239956378936768 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.05181098, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6715138036124124, + "language_loss": 0.78116465, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80344748, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.1875, + "step": 1901, + "time_per_iteration": 2.5007457733154297 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.05482793, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9023337273707566, + "language_loss": 0.83676988, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85908997, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1902, + "time_per_iteration": 2.4389002323150635 + }, + { + "auxiliary_loss_clip": 0.0117356, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05356252, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6958297254772137, + "language_loss": 0.77551281, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79775804, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1903, + "time_per_iteration": 2.503164768218994 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.05437744, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.553861289811236, + "language_loss": 0.75704938, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77922231, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.171875, + "step": 1904, + "time_per_iteration": 2.5097854137420654 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.05519056, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.146045336495955, + "language_loss": 0.92476678, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94702017, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1905, + "time_per_iteration": 2.4905850887298584 + }, + { + "auxiliary_loss_clip": 0.0117632, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.0496794, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.457773566764277, + "language_loss": 0.77108872, + "learning_rate": 3.925399944279861e-06, + "loss": 0.7933597, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.265625, + "step": 1906, + "time_per_iteration": 2.4469265937805176 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.03072143, + "balance_loss_mlp": 1.05375302, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.4555636334810593, + "language_loss": 0.81855345, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84082305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1907, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01173651, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.05599511, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.041607412488977, + "language_loss": 0.84798187, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87037772, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1908, + "time_per_iteration": 2.468519687652588 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.01344705, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9477470057539497, + "language_loss": 0.6100027, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63061339, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.43164062, + "step": 1909, + "time_per_iteration": 2.8313472270965576 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.05660319, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.135894642259737, + "language_loss": 0.78793955, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8102057, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1910, + "time_per_iteration": 2.4613592624664307 + }, + { + "auxiliary_loss_clip": 0.01178149, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.03005373, + "balance_loss_mlp": 1.05803406, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.035949872237615, + "language_loss": 0.76787984, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79017925, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1911, + "time_per_iteration": 2.475069761276245 + }, + { + "auxiliary_loss_clip": 0.01166349, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.03048277, + "balance_loss_mlp": 1.05284548, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.1144124150337023, + "language_loss": 0.7927531, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81493074, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 1912, + "time_per_iteration": 2.543607473373413 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_clip": 1.0369364, + "balance_loss_mlp": 1.05352569, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9322037304643997, + "language_loss": 0.7777245, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80000544, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 1913, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01169525, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.02889776, + "balance_loss_mlp": 1.05118954, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 3.783180746712747, + "language_loss": 0.70389271, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72609949, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1914, + "time_per_iteration": 2.5099785327911377 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 0.99943084, + "balance_loss_mlp": 1.01452589, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7556045547130329, + "language_loss": 0.61044526, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63105142, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.44140625, + "step": 1915, + "time_per_iteration": 3.1735148429870605 + }, + { + "auxiliary_loss_clip": 0.01172283, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_clip": 1.03273964, + "balance_loss_mlp": 1.05674434, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.822924091618307, + "language_loss": 0.9323889, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95465934, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.15625, + "step": 1916, + "time_per_iteration": 2.4806342124938965 + }, + { + "auxiliary_loss_clip": 0.01171086, + "auxiliary_loss_mlp": 0.01061893, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.05340374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8768677942494545, + "language_loss": 0.72286755, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7451973, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.171875, + "step": 1917, + "time_per_iteration": 2.519758701324463 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.05521619, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.2984335892825594, + "language_loss": 0.74389827, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76610279, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 1918, + "time_per_iteration": 2.4867136478424072 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.02394044, + "balance_loss_mlp": 1.05273843, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1981507651696193, + "language_loss": 0.86515707, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88735056, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1919, + "time_per_iteration": 2.4838428497314453 + }, + { + "auxiliary_loss_clip": 0.01168988, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.03190136, + "balance_loss_mlp": 1.05291939, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.516832715272094, + "language_loss": 0.86640596, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88864017, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.15625, + "step": 1920, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01167627, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.05360281, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.2143351457696525, + "language_loss": 0.79792106, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82007331, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 1921, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.01174597, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.03331947, + "balance_loss_mlp": 1.05358851, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 8.96706495073623, + "language_loss": 0.78418177, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8064878, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2109375, + "step": 1922, + "time_per_iteration": 2.5293705463409424 + }, + { + "auxiliary_loss_clip": 0.01174074, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.03910375, + "balance_loss_mlp": 1.05410469, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8482726295091094, + "language_loss": 0.84187758, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86422473, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.203125, + "step": 1923, + "time_per_iteration": 2.5203118324279785 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01053498, + "balance_loss_clip": 1.03074098, + "balance_loss_mlp": 1.05742192, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0576366068601666, + "language_loss": 0.80471247, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1796875, + "step": 1924, + "time_per_iteration": 2.48531436920166 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 0.99917841, + "balance_loss_mlp": 1.0154866, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.810907468185892, + "language_loss": 0.6115036, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6321063, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 3.112396478652954 + }, + { + "auxiliary_loss_clip": 0.01173159, + "auxiliary_loss_mlp": 0.01076027, + "balance_loss_clip": 1.05304384, + "balance_loss_mlp": 1.05447614, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.806943429185086, + "language_loss": 0.7482335, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77072537, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.1875, + "step": 1926, + "time_per_iteration": 2.4890315532684326 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.03873897, + "balance_loss_mlp": 1.0552361, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.429758451090488, + "language_loss": 0.73112315, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7535038, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.203125, + "step": 1927, + "time_per_iteration": 2.4673402309417725 + }, + { + "auxiliary_loss_clip": 0.0117016, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.03244913, + "balance_loss_mlp": 1.05291271, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.854021270140142, + "language_loss": 0.86824137, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89049077, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 1928, + "time_per_iteration": 2.530325412750244 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.03289056, + "balance_loss_mlp": 1.05469573, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.71243688867153, + "language_loss": 0.77567977, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79796684, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1929, + "time_per_iteration": 2.489664316177368 + }, + { + "auxiliary_loss_clip": 0.01168882, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.05385804, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6293868207273203, + "language_loss": 0.76724243, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78955561, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1484375, + "step": 1930, + "time_per_iteration": 2.5867533683776855 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03027928, + "balance_loss_mlp": 1.05313969, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9270697111110349, + "language_loss": 0.72114342, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74335825, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1931, + "time_per_iteration": 2.5218429565429688 + }, + { + "auxiliary_loss_clip": 0.01173627, + "auxiliary_loss_mlp": 0.0105412, + "balance_loss_clip": 1.03168511, + "balance_loss_mlp": 1.05528855, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5295866923660926, + "language_loss": 0.82133794, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84361541, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 1932, + "time_per_iteration": 2.4879212379455566 + }, + { + "auxiliary_loss_clip": 0.01053319, + "auxiliary_loss_mlp": 0.01007659, + "balance_loss_clip": 1.00539386, + "balance_loss_mlp": 1.0111897, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7701959329661775, + "language_loss": 0.61053753, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63114727, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.421875, + "step": 1933, + "time_per_iteration": 2.960437059402466 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03248382, + "balance_loss_mlp": 1.05259895, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.2263920275904425, + "language_loss": 0.85587192, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87813795, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1953125, + "step": 1934, + "time_per_iteration": 5.3810875415802 + }, + { + "auxiliary_loss_clip": 0.01178805, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.05852652, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.481370623449466, + "language_loss": 0.65555394, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67793274, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1935, + "time_per_iteration": 2.483814239501953 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.05533004, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8046174937009931, + "language_loss": 0.75469184, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77699012, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.2109375, + "step": 1936, + "time_per_iteration": 3.8786003589630127 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.05320179, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9600676544166102, + "language_loss": 0.84061754, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86291301, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1953125, + "step": 1937, + "time_per_iteration": 2.5084798336029053 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.05254185, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.0067941571917927, + "language_loss": 0.76479459, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78692102, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.140625, + "step": 1938, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01177239, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.02963328, + "balance_loss_mlp": 1.05566061, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.0274312317590084, + "language_loss": 0.79127967, + "learning_rate": 3.921882769138696e-06, + "loss": 0.8135649, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1939, + "time_per_iteration": 2.5020864009857178 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.05530274, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.7077039427391343, + "language_loss": 0.86712289, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88937664, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1940, + "time_per_iteration": 2.484750270843506 + }, + { + "auxiliary_loss_clip": 0.01172427, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.03699601, + "balance_loss_mlp": 1.05674481, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4506595925957548, + "language_loss": 0.75750297, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7798053, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1941, + "time_per_iteration": 2.7000842094421387 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.05215478, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.1675787105273256, + "language_loss": 0.8828994, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90516704, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.15625, + "step": 1942, + "time_per_iteration": 2.460014581680298 + }, + { + "auxiliary_loss_clip": 0.01170106, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.02839422, + "balance_loss_mlp": 1.05465341, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.688985931696262, + "language_loss": 0.67729998, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69948429, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.15625, + "step": 1943, + "time_per_iteration": 2.5899837017059326 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01046897, + "balance_loss_clip": 1.02586865, + "balance_loss_mlp": 1.05437136, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.2767867948110263, + "language_loss": 0.69852126, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72069371, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1944, + "time_per_iteration": 2.6237125396728516 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.05112338, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.1059371232711572, + "language_loss": 0.82477605, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84690094, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.15625, + "step": 1945, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.05241919, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.378189536328268, + "language_loss": 0.7640717, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.7863518, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 1946, + "time_per_iteration": 2.516782283782959 + }, + { + "auxiliary_loss_clip": 0.01169578, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03249717, + "balance_loss_mlp": 1.05597568, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.040115867247402, + "language_loss": 0.68749321, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70971209, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 1947, + "time_per_iteration": 2.443979501724243 + }, + { + "auxiliary_loss_clip": 0.01173266, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.05761504, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.892409556337103, + "language_loss": 0.84730887, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86967146, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 1948, + "time_per_iteration": 2.456883192062378 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 0.99784815, + "balance_loss_mlp": 1.01743388, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8146373030628324, + "language_loss": 0.65102834, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6716392, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.43359375, + "step": 1949, + "time_per_iteration": 3.083716630935669 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.05524707, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.7265339558443402, + "language_loss": 0.71616268, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73841476, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1328125, + "step": 1950, + "time_per_iteration": 2.5140750408172607 + }, + { + "auxiliary_loss_clip": 0.01174036, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.05524027, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 13.047142281747327, + "language_loss": 0.76811576, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79044604, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1951, + "time_per_iteration": 2.4511098861694336 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.0106409, + "balance_loss_clip": 1.04351449, + "balance_loss_mlp": 1.05736876, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 2.4689531190361858, + "language_loss": 0.75770319, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1952, + "time_per_iteration": 2.5249404907226562 + }, + { + "auxiliary_loss_clip": 0.01170041, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.05350161, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8929141854364566, + "language_loss": 0.71838403, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74068928, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1953, + "time_per_iteration": 2.5321006774902344 + }, + { + "auxiliary_loss_clip": 0.01178671, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.03186345, + "balance_loss_mlp": 1.05794597, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.5505654209141317, + "language_loss": 0.7939415, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 1954, + "time_per_iteration": 2.477182149887085 + }, + { + "auxiliary_loss_clip": 0.01174109, + "auxiliary_loss_mlp": 0.01060284, + "balance_loss_clip": 1.03859961, + "balance_loss_mlp": 1.05628419, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1305529461824344, + "language_loss": 0.85609406, + "learning_rate": 3.920148894924246e-06, + "loss": 0.878438, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1796875, + "step": 1955, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.02949762, + "balance_loss_mlp": 1.05551839, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.149612339355701, + "language_loss": 0.77626467, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79848516, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.171875, + "step": 1956, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.01169266, + "auxiliary_loss_mlp": 0.01054147, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.05667603, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 4.253665449575931, + "language_loss": 0.80333984, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 1957, + "time_per_iteration": 2.508272886276245 + }, + { + "auxiliary_loss_clip": 0.01176684, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.05895627, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 3.1587185145349737, + "language_loss": 0.77638769, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79865301, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1796875, + "step": 1958, + "time_per_iteration": 2.48563551902771 + }, + { + "auxiliary_loss_clip": 0.01174636, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.05859971, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0966272081131985, + "language_loss": 0.76906043, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79128981, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.15625, + "step": 1959, + "time_per_iteration": 2.4826674461364746 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_clip": 1.03128934, + "balance_loss_mlp": 1.05581582, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 3.13785825532277, + "language_loss": 0.69989765, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72212446, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.15625, + "step": 1960, + "time_per_iteration": 2.4965405464172363 + }, + { + "auxiliary_loss_clip": 0.01178622, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.03704309, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.5802576751796327, + "language_loss": 0.81135678, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83372575, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1961, + "time_per_iteration": 2.456537961959839 + }, + { + "auxiliary_loss_clip": 0.01167569, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.05682623, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 3.5009623449342206, + "language_loss": 0.92335653, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94558799, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.109375, + "step": 1962, + "time_per_iteration": 2.4831955432891846 + }, + { + "auxiliary_loss_clip": 0.01175087, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.05849361, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1891263418172353, + "language_loss": 0.87132198, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89361322, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1640625, + "step": 1963, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02764392, + "balance_loss_mlp": 1.05800569, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.1122466665000155, + "language_loss": 0.84163988, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86385566, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1640625, + "step": 1964, + "time_per_iteration": 2.496471405029297 + }, + { + "auxiliary_loss_clip": 0.01178376, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.06327403, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.965243610427017, + "language_loss": 0.82994169, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85229176, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1484375, + "step": 1965, + "time_per_iteration": 2.46545672416687 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05948591, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6968751772896917, + "language_loss": 0.74517393, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76741493, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 1966, + "time_per_iteration": 2.730928421020508 + }, + { + "auxiliary_loss_clip": 0.01185811, + "auxiliary_loss_mlp": 0.01055482, + "balance_loss_clip": 1.03552604, + "balance_loss_mlp": 1.0661025, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.573953561090722, + "language_loss": 0.725128, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74754095, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1953125, + "step": 1967, + "time_per_iteration": 2.459409713745117 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.02409899, + "balance_loss_mlp": 1.0596199, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.07735233424318, + "language_loss": 0.87874025, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90092969, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1968, + "time_per_iteration": 2.474860191345215 + }, + { + "auxiliary_loss_clip": 0.0117476, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.05980873, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.3710109771053904, + "language_loss": 0.66827953, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69053805, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1484375, + "step": 1969, + "time_per_iteration": 2.5025057792663574 + }, + { + "auxiliary_loss_clip": 0.01177024, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.03172874, + "balance_loss_mlp": 1.06375933, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0668162562591013, + "language_loss": 0.81199527, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83428723, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 1970, + "time_per_iteration": 2.6005184650421143 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02306354, + "balance_loss_mlp": 1.02803779, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8059191438251484, + "language_loss": 0.66145539, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68243253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.4375, + "step": 1971, + "time_per_iteration": 3.0580737590789795 + }, + { + "auxiliary_loss_clip": 0.01173379, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03112769, + "balance_loss_mlp": 1.0578413, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9720310647047086, + "language_loss": 0.79760695, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81984764, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 1972, + "time_per_iteration": 2.5330677032470703 + }, + { + "auxiliary_loss_clip": 0.01174806, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 1.06013465, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.451560144092476, + "language_loss": 0.72162819, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74390036, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1973, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.0117035, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02407408, + "balance_loss_mlp": 1.05802357, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.2680636805256897, + "language_loss": 0.71724641, + "learning_rate": 3.918065710622832e-06, + "loss": 0.73938787, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 1974, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01937568, + "balance_loss_mlp": 1.05660915, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.192039880981389, + "language_loss": 0.77186036, + "learning_rate": 3.917955341761128e-06, + "loss": 0.7939533, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.140625, + "step": 1975, + "time_per_iteration": 2.4483766555786133 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.03212273, + "balance_loss_mlp": 1.06021976, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.2667330410251596, + "language_loss": 0.7498399, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77208138, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.125, + "step": 1976, + "time_per_iteration": 3.9421374797821045 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02544367, + "balance_loss_mlp": 1.05979395, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6192257034176818, + "language_loss": 0.75191766, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77408761, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.125, + "step": 1977, + "time_per_iteration": 3.9506070613861084 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.05777454, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 7.387040580957373, + "language_loss": 0.7393533, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76161528, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.15625, + "step": 1978, + "time_per_iteration": 2.4754912853240967 + }, + { + "auxiliary_loss_clip": 0.01168854, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.05782461, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.709416576437117, + "language_loss": 0.73273945, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75491059, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 1979, + "time_per_iteration": 2.478938579559326 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.05735934, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.877436937799078, + "language_loss": 0.98387957, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00608468, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1484375, + "step": 1980, + "time_per_iteration": 2.5758843421936035 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.02580202, + "balance_loss_mlp": 1.05741775, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8930015682875676, + "language_loss": 0.85929906, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88150084, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1981, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01057237, + "balance_loss_clip": 1.03601766, + "balance_loss_mlp": 1.057832, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9242535829958574, + "language_loss": 0.85007018, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87236911, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1982, + "time_per_iteration": 2.513012409210205 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02667475, + "balance_loss_mlp": 1.05463564, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.926275276354154, + "language_loss": 0.85026526, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87239939, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 1983, + "time_per_iteration": 2.4627623558044434 + }, + { + "auxiliary_loss_clip": 0.01172266, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05581713, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.2679367356540894, + "language_loss": 0.77020949, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79243064, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1984, + "time_per_iteration": 2.466224193572998 + }, + { + "auxiliary_loss_clip": 0.01168386, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.03542554, + "balance_loss_mlp": 1.05464029, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7272493982968635, + "language_loss": 0.83323789, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85547268, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 1985, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01162278, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.02659011, + "balance_loss_mlp": 1.05230284, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9847962315308523, + "language_loss": 0.7379061, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75999391, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1015625, + "step": 1986, + "time_per_iteration": 2.4477651119232178 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03664303, + "balance_loss_mlp": 1.05418456, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0940320364759573, + "language_loss": 0.7209813, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74321216, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.125, + "step": 1987, + "time_per_iteration": 2.528564929962158 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.03256774, + "balance_loss_mlp": 1.05243921, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.544292945564917, + "language_loss": 0.72455966, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74676454, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1988, + "time_per_iteration": 2.482295274734497 + }, + { + "auxiliary_loss_clip": 0.01168039, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.05425191, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.3919568417846544, + "language_loss": 0.80848205, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83079755, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 1989, + "time_per_iteration": 2.5321335792541504 + }, + { + "auxiliary_loss_clip": 0.01171171, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.05518925, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7848130249027077, + "language_loss": 0.76000333, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78222507, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1990, + "time_per_iteration": 2.4608383178710938 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.0101675, + "balance_loss_clip": 1.01392448, + "balance_loss_mlp": 1.01813149, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8877551125762418, + "language_loss": 0.55219597, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57296449, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.41992188, + "step": 1991, + "time_per_iteration": 3.0575883388519287 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01048947, + "balance_loss_clip": 1.02905095, + "balance_loss_mlp": 1.05472517, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2244739837006797, + "language_loss": 0.78156978, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8037256, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1171875, + "step": 1992, + "time_per_iteration": 2.5395517349243164 + }, + { + "auxiliary_loss_clip": 0.01170251, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02819777, + "balance_loss_mlp": 1.0534482, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.898510109378507, + "language_loss": 0.78694016, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1993, + "time_per_iteration": 2.5264625549316406 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.05150676, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.7590613991113047, + "language_loss": 0.82287014, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8451097, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 1994, + "time_per_iteration": 2.4871127605438232 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.03551149, + "balance_loss_mlp": 1.05389762, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1035856813409786, + "language_loss": 0.87953222, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9017483, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.125, + "step": 1995, + "time_per_iteration": 2.46690034866333 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.05346155, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.783456627489481, + "language_loss": 0.74206698, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76433849, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1996, + "time_per_iteration": 2.5115768909454346 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.03138888, + "balance_loss_mlp": 1.05337763, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9342712291191904, + "language_loss": 0.88266122, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90486217, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 1997, + "time_per_iteration": 2.4716532230377197 + }, + { + "auxiliary_loss_clip": 0.01167703, + "auxiliary_loss_mlp": 0.01063842, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.05315256, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 3.8633631849497054, + "language_loss": 0.78929418, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81160963, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1998, + "time_per_iteration": 2.4798996448516846 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.05610394, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.053047413592738, + "language_loss": 0.73435485, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75654793, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1999, + "time_per_iteration": 2.5017611980438232 + }, + { + "auxiliary_loss_clip": 0.01169689, + "auxiliary_loss_mlp": 0.01065385, + "balance_loss_clip": 1.04436839, + "balance_loss_mlp": 1.05347967, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 3.6093884580795677, + "language_loss": 0.74955112, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77190185, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 2000, + "time_per_iteration": 2.5060245990753174 + }, + { + "auxiliary_loss_clip": 0.01170552, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.02997398, + "balance_loss_mlp": 1.05408299, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5368563042333518, + "language_loss": 0.84667969, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86889356, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 2001, + "time_per_iteration": 2.499922752380371 + }, + { + "auxiliary_loss_clip": 0.01168457, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.03176236, + "balance_loss_mlp": 1.05330753, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.074842616733997, + "language_loss": 0.73982531, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76202351, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 2002, + "time_per_iteration": 2.486853837966919 + }, + { + "auxiliary_loss_clip": 0.01175825, + "auxiliary_loss_mlp": 0.01058049, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.05508709, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.832741043586106, + "language_loss": 0.78091669, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80325544, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 2003, + "time_per_iteration": 2.4740982055664062 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.0521121, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.9652989098821625, + "language_loss": 0.72093791, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74310923, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2004, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.03877997, + "balance_loss_mlp": 1.0546937, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.2150760255497945, + "language_loss": 0.78260767, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80496937, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 2005, + "time_per_iteration": 2.4991190433502197 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.01496482, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9233110616682776, + "language_loss": 0.58020771, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60082525, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.8520798683166504 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 1.05345094, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7247761793975513, + "language_loss": 0.76275218, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78490144, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.125, + "step": 2007, + "time_per_iteration": 2.50325083732605 + }, + { + "auxiliary_loss_clip": 0.01170732, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.05348623, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.332475401193337, + "language_loss": 0.82973194, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85202336, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2008, + "time_per_iteration": 2.4650609493255615 + }, + { + "auxiliary_loss_clip": 0.0116834, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.03319979, + "balance_loss_mlp": 1.05225682, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.236244219024357, + "language_loss": 0.84184098, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86406672, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2009, + "time_per_iteration": 2.4602744579315186 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.01053411, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.0551877, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7312486930792712, + "language_loss": 0.83945864, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86169434, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.1484375, + "step": 2010, + "time_per_iteration": 2.480238437652588 + }, + { + "auxiliary_loss_clip": 0.01171814, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.03437304, + "balance_loss_mlp": 1.05634403, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.658807365911602, + "language_loss": 0.84157598, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8638559, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 2011, + "time_per_iteration": 2.454406499862671 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03386891, + "balance_loss_mlp": 1.055547, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.879921554869875, + "language_loss": 0.96007967, + "learning_rate": 3.913820600882834e-06, + "loss": 0.9823519, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.171875, + "step": 2012, + "time_per_iteration": 2.479583740234375 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.026914, + "balance_loss_mlp": 1.05365777, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.6055417591736036, + "language_loss": 0.80619711, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82833993, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2013, + "time_per_iteration": 2.538651704788208 + }, + { + "auxiliary_loss_clip": 0.01172968, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.05412138, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.9791821612033953, + "language_loss": 0.77157021, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79376847, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 2014, + "time_per_iteration": 2.4411396980285645 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.02509499, + "balance_loss_mlp": 1.05448556, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.028780359370303, + "language_loss": 0.86930937, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89146852, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2015, + "time_per_iteration": 2.4546844959259033 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.0268662, + "balance_loss_mlp": 1.04779112, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.0866681231001762, + "language_loss": 0.69274801, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71481836, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2016, + "time_per_iteration": 2.469177007675171 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.02042413, + "balance_loss_mlp": 1.05407953, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 3.095255398319528, + "language_loss": 0.80049825, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82262057, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.15625, + "step": 2017, + "time_per_iteration": 2.459447145462036 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.0292666, + "balance_loss_mlp": 1.05315137, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.364451122732105, + "language_loss": 0.69343489, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71563143, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2018, + "time_per_iteration": 3.919508695602417 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.05712008, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.162901456551013, + "language_loss": 0.72318506, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74541652, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 2019, + "time_per_iteration": 3.910888433456421 + }, + { + "auxiliary_loss_clip": 0.01168573, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.03615856, + "balance_loss_mlp": 1.05187333, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8061721544245042, + "language_loss": 0.92484713, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94711161, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2020, + "time_per_iteration": 2.5007998943328857 + }, + { + "auxiliary_loss_clip": 0.01168404, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.05388308, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9478588429028871, + "language_loss": 0.77149868, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79369152, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2021, + "time_per_iteration": 2.522216796875 + }, + { + "auxiliary_loss_clip": 0.01165897, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.02586901, + "balance_loss_mlp": 1.05312037, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0305604143992944, + "language_loss": 0.80324662, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82537007, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2022, + "time_per_iteration": 2.518737316131592 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01051897, + "balance_loss_clip": 1.03094029, + "balance_loss_mlp": 1.057019, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9019957932594662, + "language_loss": 0.8458122, + "learning_rate": 3.912572184769108e-06, + "loss": 0.86806649, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2023, + "time_per_iteration": 2.4534339904785156 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.02916241, + "balance_loss_mlp": 1.05421007, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2004951084054234, + "language_loss": 0.85155022, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87374109, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 2024, + "time_per_iteration": 2.436833143234253 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.02974951, + "balance_loss_mlp": 1.04884946, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.043367551334066, + "language_loss": 0.71662712, + "learning_rate": 3.912344257028954e-06, + "loss": 0.73876667, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.15625, + "step": 2025, + "time_per_iteration": 2.541215658187866 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.02564383, + "balance_loss_mlp": 1.05309796, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.0848974538483755, + "language_loss": 0.75976777, + "learning_rate": 3.912230184382286e-06, + "loss": 0.7819097, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2026, + "time_per_iteration": 2.529049873352051 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.05251837, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.6572777094172597, + "language_loss": 0.88875067, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9108817, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2027, + "time_per_iteration": 2.472158432006836 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03375518, + "balance_loss_mlp": 1.05316114, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 2.343330799439898, + "language_loss": 0.75515145, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77732611, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.125, + "step": 2028, + "time_per_iteration": 2.5286035537719727 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.05089998, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.270604294931249, + "language_loss": 0.766294, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78852487, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2029, + "time_per_iteration": 2.479799747467041 + }, + { + "auxiliary_loss_clip": 0.0116289, + "auxiliary_loss_mlp": 0.01051159, + "balance_loss_clip": 1.03113246, + "balance_loss_mlp": 1.05001879, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.2290592341985747, + "language_loss": 0.7955277, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81766814, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.125, + "step": 2030, + "time_per_iteration": 2.479250431060791 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.03301597, + "balance_loss_mlp": 1.0526309, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9595633959777694, + "language_loss": 0.74556369, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2031, + "time_per_iteration": 2.4966888427734375 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.0269599, + "balance_loss_mlp": 1.05319047, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.9774178696035418, + "language_loss": 0.75045705, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.1328125, + "step": 2032, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.04844511, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6143118682838826, + "language_loss": 0.88853258, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91053319, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0859375, + "step": 2033, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01170793, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03059459, + "balance_loss_mlp": 1.05660009, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1152048244965096, + "language_loss": 0.65517056, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67738092, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 2034, + "time_per_iteration": 2.4647884368896484 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.05399358, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.59634219760927, + "language_loss": 0.76435542, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78657782, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2035, + "time_per_iteration": 2.483016014099121 + }, + { + "auxiliary_loss_clip": 0.01169828, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.03104889, + "balance_loss_mlp": 1.0543201, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8316823187763973, + "language_loss": 0.71407682, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73628777, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2036, + "time_per_iteration": 2.476471185684204 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.0309397, + "balance_loss_mlp": 1.05532706, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.632988910709452, + "language_loss": 0.83352619, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85572863, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2037, + "time_per_iteration": 2.476040840148926 + }, + { + "auxiliary_loss_clip": 0.0117386, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.03625405, + "balance_loss_mlp": 1.05652785, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.722283338591856, + "language_loss": 0.80255699, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82487655, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2038, + "time_per_iteration": 2.5043163299560547 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.0051837, + "balance_loss_mlp": 1.01638949, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.832889593555193, + "language_loss": 0.58671033, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60737002, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.421875, + "step": 2039, + "time_per_iteration": 2.9495608806610107 + }, + { + "auxiliary_loss_clip": 0.01172242, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.0277977, + "balance_loss_mlp": 1.05559754, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.6229044060505298, + "language_loss": 0.80485016, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82706642, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.171875, + "step": 2040, + "time_per_iteration": 2.4483039379119873 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.05270815, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8235003945490114, + "language_loss": 0.82753873, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84970617, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2041, + "time_per_iteration": 2.4804372787475586 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.05399048, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7522185366152092, + "language_loss": 0.66806722, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69026893, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2042, + "time_per_iteration": 2.4683480262756348 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.03032589, + "balance_loss_mlp": 1.05184031, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.8478924147346443, + "language_loss": 0.81661081, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83877933, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2043, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01049773, + "balance_loss_clip": 1.02792168, + "balance_loss_mlp": 1.05028844, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.0920421188484095, + "language_loss": 0.8049221, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82708442, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 2044, + "time_per_iteration": 2.45843768119812 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.02577674, + "balance_loss_mlp": 1.05169511, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7057283877293323, + "language_loss": 0.7796452, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8017351, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.109375, + "step": 2045, + "time_per_iteration": 2.5117220878601074 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.04210341, + "balance_loss_mlp": 1.05461311, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.0659302798736436, + "language_loss": 0.67135215, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69371116, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 2046, + "time_per_iteration": 2.466304063796997 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03793919, + "balance_loss_mlp": 1.05408335, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3143924335245654, + "language_loss": 0.72491664, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7471717, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2047, + "time_per_iteration": 2.4625275135040283 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.0106421, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.05105257, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6458989790549132, + "language_loss": 0.76394033, + "learning_rate": 3.909702248319597e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2048, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.05322123, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.118548028298143, + "language_loss": 0.84626836, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86841822, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.125, + "step": 2049, + "time_per_iteration": 2.538325071334839 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.05051267, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 3.176509780932849, + "language_loss": 0.75351131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77569222, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.140625, + "step": 2050, + "time_per_iteration": 2.499915599822998 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054604, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.05127048, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.9728027261326873, + "language_loss": 0.80877042, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83097064, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.140625, + "step": 2051, + "time_per_iteration": 2.5018789768218994 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.03338933, + "balance_loss_mlp": 1.05348301, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7756923294305167, + "language_loss": 0.79991698, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82209337, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.109375, + "step": 2052, + "time_per_iteration": 2.4962196350097656 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.03793955, + "balance_loss_mlp": 1.0515492, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.071130498978609, + "language_loss": 0.73757279, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75983769, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2053, + "time_per_iteration": 2.4748997688293457 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.03337085, + "balance_loss_mlp": 1.04912996, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.5139588428492408, + "language_loss": 0.73835206, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76054543, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2054, + "time_per_iteration": 2.7009665966033936 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02381933, + "balance_loss_mlp": 1.04980421, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.0020033330801863, + "language_loss": 0.85107529, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87311363, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.1171875, + "step": 2055, + "time_per_iteration": 2.5038392543792725 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.03445673, + "balance_loss_mlp": 1.05093932, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9818000135561404, + "language_loss": 0.77465194, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79683125, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.140625, + "step": 2056, + "time_per_iteration": 2.5265629291534424 + }, + { + "auxiliary_loss_clip": 0.01162241, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.04937708, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9976131339644834, + "language_loss": 0.83188522, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85405934, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2057, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.0116756, + "auxiliary_loss_mlp": 0.01053922, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.05169332, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.751792200322901, + "language_loss": 0.78356105, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80577588, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2058, + "time_per_iteration": 2.5236053466796875 + }, + { + "auxiliary_loss_clip": 0.01167574, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.03086066, + "balance_loss_mlp": 1.05105174, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.1327254817813124, + "language_loss": 0.83191061, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85410988, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2059, + "time_per_iteration": 5.313246726989746 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.0344671, + "balance_loss_mlp": 1.05206418, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.990324814625926, + "language_loss": 0.81387389, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83613217, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 2060, + "time_per_iteration": 3.8617331981658936 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.02928221, + "balance_loss_mlp": 1.04859161, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.0129231677956105, + "language_loss": 0.86278749, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88492751, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2061, + "time_per_iteration": 2.4531033039093018 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.05163288, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.146204871859891, + "language_loss": 0.84992719, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87201917, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 2062, + "time_per_iteration": 2.475050449371338 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01057701, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.05348217, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.194910982672458, + "language_loss": 0.78651118, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80875909, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2063, + "time_per_iteration": 2.4638655185699463 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.05330634, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.133219584666701, + "language_loss": 0.79411167, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81636381, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1328125, + "step": 2064, + "time_per_iteration": 2.4441418647766113 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03083503, + "balance_loss_mlp": 1.04955256, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.2298036351802533, + "language_loss": 0.92358226, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9457252, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2065, + "time_per_iteration": 2.4909794330596924 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02880335, + "balance_loss_mlp": 1.05061674, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7134253508315578, + "language_loss": 0.8042016, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82636184, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.15625, + "step": 2066, + "time_per_iteration": 2.484276056289673 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00111723, + "balance_loss_mlp": 1.01144505, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8687209975293121, + "language_loss": 0.63275361, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65331256, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.41015625, + "step": 2067, + "time_per_iteration": 3.0286524295806885 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03103137, + "balance_loss_mlp": 1.05087852, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9774411847970965, + "language_loss": 0.93209147, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95427418, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.15625, + "step": 2068, + "time_per_iteration": 2.4971697330474854 + }, + { + "auxiliary_loss_clip": 0.01167817, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.053213, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9835561743386452, + "language_loss": 0.81277847, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83494884, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.1484375, + "step": 2069, + "time_per_iteration": 2.4772391319274902 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.03519261, + "balance_loss_mlp": 1.05177176, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.606173275168009, + "language_loss": 0.77390277, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79612398, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2070, + "time_per_iteration": 2.4962410926818848 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.05637431, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.418044156181854, + "language_loss": 0.80847198, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83066666, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1640625, + "step": 2071, + "time_per_iteration": 2.452148199081421 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.03063262, + "balance_loss_mlp": 1.05134583, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.802846280579791, + "language_loss": 0.77933639, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80147374, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2072, + "time_per_iteration": 2.5763509273529053 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03480363, + "balance_loss_mlp": 1.05423427, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.6278132513508976, + "language_loss": 0.74839735, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77060658, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.125, + "step": 2073, + "time_per_iteration": 2.4676945209503174 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01050014, + "balance_loss_clip": 1.02904546, + "balance_loss_mlp": 1.04915833, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9457561725453951, + "language_loss": 0.90556443, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92768592, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2074, + "time_per_iteration": 2.4873156547546387 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.05183172, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3814572559525877, + "language_loss": 0.83753067, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 2075, + "time_per_iteration": 2.500657320022583 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.05080831, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1638910845289567, + "language_loss": 0.73802024, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76022947, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2076, + "time_per_iteration": 2.5686564445495605 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.05219531, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.967733683791653, + "language_loss": 0.7551648, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77721083, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.09375, + "step": 2077, + "time_per_iteration": 2.489954710006714 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.05015802, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 4.043491061132511, + "language_loss": 0.82077563, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84293842, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1328125, + "step": 2078, + "time_per_iteration": 2.445270299911499 + }, + { + "auxiliary_loss_clip": 0.01168396, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.05372512, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.023726857078381, + "language_loss": 0.75024784, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2079, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01173002, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.05697465, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9314739831996124, + "language_loss": 0.83961046, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86190951, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2080, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.05275226, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.0357346796271307, + "language_loss": 0.84575123, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8679868, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1328125, + "step": 2081, + "time_per_iteration": 2.4380433559417725 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.05154538, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.660916229819668, + "language_loss": 0.76882648, + "learning_rate": 3.905726514814646e-06, + "loss": 0.790923, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2082, + "time_per_iteration": 2.454939842224121 + }, + { + "auxiliary_loss_clip": 0.01182882, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.03117347, + "balance_loss_mlp": 1.06035674, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.833832134330164, + "language_loss": 0.78994107, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81229836, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2265625, + "step": 2083, + "time_per_iteration": 2.4439167976379395 + }, + { + "auxiliary_loss_clip": 0.01168103, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.02855682, + "balance_loss_mlp": 1.05132031, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.376124844090109, + "language_loss": 0.89690113, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2084, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.05379784, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9059704425119062, + "language_loss": 0.79718572, + "learning_rate": 3.905371701516869e-06, + "loss": 0.81937099, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1484375, + "step": 2085, + "time_per_iteration": 2.5295538902282715 + }, + { + "auxiliary_loss_clip": 0.0116658, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.05235541, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.9580642243137214, + "language_loss": 0.88227898, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90446126, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2086, + "time_per_iteration": 2.4508614540100098 + }, + { + "auxiliary_loss_clip": 0.01162238, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.02541506, + "balance_loss_mlp": 1.05238986, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.3707303368435957, + "language_loss": 0.87088495, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89295745, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2087, + "time_per_iteration": 2.4342494010925293 + }, + { + "auxiliary_loss_clip": 0.01166252, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.03444421, + "balance_loss_mlp": 1.05230761, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 3.239876707553976, + "language_loss": 0.73480451, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75703704, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.140625, + "step": 2088, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01004858, + "balance_loss_clip": 1.00259304, + "balance_loss_mlp": 1.01231122, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.759594920780347, + "language_loss": 0.61699253, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63757795, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.4140625, + "step": 2089, + "time_per_iteration": 3.0373222827911377 + }, + { + "auxiliary_loss_clip": 0.01165987, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.05317736, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.0159960445234746, + "language_loss": 0.78266793, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80490106, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.125, + "step": 2090, + "time_per_iteration": 2.5307860374450684 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005786, + "balance_loss_clip": 1.00381935, + "balance_loss_mlp": 1.01062346, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.749206069507312, + "language_loss": 0.59394926, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61451876, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.40625, + "step": 2091, + "time_per_iteration": 2.976081609725952 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03016472, + "balance_loss_mlp": 1.0538522, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8692826570762828, + "language_loss": 0.63588953, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6580565, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2092, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.05095637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 3.3800613541528257, + "language_loss": 0.80149096, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82378066, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1875, + "step": 2093, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.05323935, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7179534274341421, + "language_loss": 0.75928843, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78160632, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2094, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01163905, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.03322637, + "balance_loss_mlp": 1.05116057, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.654740537988477, + "language_loss": 0.76833487, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79050487, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2095, + "time_per_iteration": 2.669593095779419 + }, + { + "auxiliary_loss_clip": 0.01166425, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.04330409, + "balance_loss_mlp": 1.05012596, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.7658625824396568, + "language_loss": 0.8312341, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85354173, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2096, + "time_per_iteration": 2.446169853210449 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.05236387, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.9365429623482773, + "language_loss": 0.7532599, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77547324, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 2097, + "time_per_iteration": 2.46520733833313 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.0105919, + "balance_loss_clip": 1.0399375, + "balance_loss_mlp": 1.05366278, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0415683165998004, + "language_loss": 0.8696878, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89196146, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1484375, + "step": 2098, + "time_per_iteration": 2.488985061645508 + }, + { + "auxiliary_loss_clip": 0.01171506, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_clip": 1.03984964, + "balance_loss_mlp": 1.05263424, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8810788789855342, + "language_loss": 0.69538295, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71773493, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.1875, + "step": 2099, + "time_per_iteration": 2.4791061878204346 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01058165, + "balance_loss_clip": 1.03538442, + "balance_loss_mlp": 1.05016196, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.489186386071109, + "language_loss": 0.81622505, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83848113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2100, + "time_per_iteration": 2.4970083236694336 + }, + { + "auxiliary_loss_clip": 0.01170444, + "auxiliary_loss_mlp": 0.01056399, + "balance_loss_clip": 1.03558493, + "balance_loss_mlp": 1.05375385, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 21.240028764463403, + "language_loss": 0.80653214, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82880062, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1640625, + "step": 2101, + "time_per_iteration": 5.441275596618652 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01009667, + "balance_loss_clip": 1.00753367, + "balance_loss_mlp": 1.01423335, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7055092704674581, + "language_loss": 0.57077372, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59140933, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.39648438, + "step": 2102, + "time_per_iteration": 4.4595959186553955 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.05443108, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9163731362545673, + "language_loss": 0.93033105, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9526242, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 2103, + "time_per_iteration": 2.4612908363342285 + }, + { + "auxiliary_loss_clip": 0.01160763, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01929784, + "balance_loss_mlp": 1.05146646, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.70771861982282, + "language_loss": 0.7804687, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80246699, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2104, + "time_per_iteration": 2.556351661682129 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.01056721, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.05698192, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9983303318130716, + "language_loss": 0.81274837, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83504581, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 2105, + "time_per_iteration": 2.4998059272766113 + }, + { + "auxiliary_loss_clip": 0.01177911, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.05756688, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.6618923007939728, + "language_loss": 0.83258855, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85494161, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 2106, + "time_per_iteration": 2.4816856384277344 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.02755296, + "balance_loss_mlp": 1.05664992, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.017673348074064, + "language_loss": 0.73717511, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75936514, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2107, + "time_per_iteration": 2.503575325012207 + }, + { + "auxiliary_loss_clip": 0.01166119, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.02683651, + "balance_loss_mlp": 1.05330598, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.8409726657459213, + "language_loss": 0.79492414, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81705213, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2108, + "time_per_iteration": 2.448009967803955 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.02635407, + "balance_loss_mlp": 1.05213785, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.700834997101356, + "language_loss": 0.75458848, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77675259, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2109, + "time_per_iteration": 2.463996171951294 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.02641523, + "balance_loss_mlp": 1.05309939, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 5.620565406896926, + "language_loss": 0.82876229, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85087943, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2110, + "time_per_iteration": 2.4536476135253906 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03271818, + "balance_loss_mlp": 1.0524385, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8108257578185059, + "language_loss": 0.78553301, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80775553, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.140625, + "step": 2111, + "time_per_iteration": 2.4898500442504883 + }, + { + "auxiliary_loss_clip": 0.01178398, + "auxiliary_loss_mlp": 0.01069762, + "balance_loss_clip": 1.04634905, + "balance_loss_mlp": 1.05599511, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2255287569010567, + "language_loss": 0.76852119, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.2265625, + "step": 2112, + "time_per_iteration": 2.534062623977661 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.05138493, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.901101750436338, + "language_loss": 0.85764933, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 2113, + "time_per_iteration": 2.4980924129486084 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.05287683, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.270588429793272, + "language_loss": 0.74000478, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76224494, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1328125, + "step": 2114, + "time_per_iteration": 2.422631025314331 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.03504217, + "balance_loss_mlp": 1.05601084, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7902572486589996, + "language_loss": 0.83236456, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85464966, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.1484375, + "step": 2115, + "time_per_iteration": 2.4601340293884277 + }, + { + "auxiliary_loss_clip": 0.01169954, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.05397201, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.532692301262898, + "language_loss": 0.86615002, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88845563, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2116, + "time_per_iteration": 2.5315732955932617 + }, + { + "auxiliary_loss_clip": 0.01164638, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.03062534, + "balance_loss_mlp": 1.05188024, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.8525451323112498, + "language_loss": 0.70492947, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72708428, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2117, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01168229, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_clip": 1.03346658, + "balance_loss_mlp": 1.05461121, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.4058915352959294, + "language_loss": 0.86858076, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89081407, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2118, + "time_per_iteration": 2.4760360717773438 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.0279547, + "balance_loss_mlp": 1.0518508, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7933295144796901, + "language_loss": 0.87325591, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89538383, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2119, + "time_per_iteration": 2.547213315963745 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.03024805, + "balance_loss_mlp": 1.05369782, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4444945117671018, + "language_loss": 0.8769815, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89917719, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2120, + "time_per_iteration": 2.4568872451782227 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.02553487, + "balance_loss_mlp": 1.05405664, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8558714180118523, + "language_loss": 0.75193042, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77408671, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2121, + "time_per_iteration": 2.508117437362671 + }, + { + "auxiliary_loss_clip": 0.01167335, + "auxiliary_loss_mlp": 0.01050063, + "balance_loss_clip": 1.02895081, + "balance_loss_mlp": 1.05228865, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.458066848563671, + "language_loss": 0.8294577, + "learning_rate": 3.900942242309978e-06, + "loss": 0.8516317, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2122, + "time_per_iteration": 2.4878990650177 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.02924609, + "balance_loss_mlp": 1.05379128, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.1208761223769375, + "language_loss": 0.79040462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81259328, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2123, + "time_per_iteration": 2.512085199356079 + }, + { + "auxiliary_loss_clip": 0.0117181, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.03411841, + "balance_loss_mlp": 1.05565643, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7846776317234667, + "language_loss": 0.79227948, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81456017, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 2124, + "time_per_iteration": 2.4865264892578125 + }, + { + "auxiliary_loss_clip": 0.01168084, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03267348, + "balance_loss_mlp": 1.05149364, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.8175561910153215, + "language_loss": 0.75565529, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77787793, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2125, + "time_per_iteration": 2.514455795288086 + }, + { + "auxiliary_loss_clip": 0.01166899, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.02645469, + "balance_loss_mlp": 1.05262208, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.1990589160087493, + "language_loss": 0.77811432, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2126, + "time_per_iteration": 2.556657075881958 + }, + { + "auxiliary_loss_clip": 0.01167875, + "auxiliary_loss_mlp": 0.01050746, + "balance_loss_clip": 1.03124356, + "balance_loss_mlp": 1.05559683, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.702389562623477, + "language_loss": 0.69255161, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71473777, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2127, + "time_per_iteration": 2.629990339279175 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.00326061, + "balance_loss_mlp": 1.01139402, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8552720802624753, + "language_loss": 0.62738979, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64794946, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.39257812, + "step": 2128, + "time_per_iteration": 3.1237356662750244 + }, + { + "auxiliary_loss_clip": 0.01168478, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.05287039, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.3711218915030368, + "language_loss": 0.77148604, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79365802, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2129, + "time_per_iteration": 2.4499564170837402 + }, + { + "auxiliary_loss_clip": 0.01179121, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02902186, + "balance_loss_mlp": 1.05744195, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.273395516882369, + "language_loss": 0.79321349, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81552559, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.21875, + "step": 2130, + "time_per_iteration": 2.4536893367767334 + }, + { + "auxiliary_loss_clip": 0.0116812, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.05328345, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.267455405666958, + "language_loss": 0.70879477, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73092055, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1484375, + "step": 2131, + "time_per_iteration": 2.514155149459839 + }, + { + "auxiliary_loss_clip": 0.01166691, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.03007698, + "balance_loss_mlp": 1.05375445, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2952793086030376, + "language_loss": 0.72266257, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74484742, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2132, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01163765, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.03396928, + "balance_loss_mlp": 1.05281162, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1162344308699828, + "language_loss": 0.82306767, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84525442, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2133, + "time_per_iteration": 2.488302230834961 + }, + { + "auxiliary_loss_clip": 0.01174206, + "auxiliary_loss_mlp": 0.01061813, + "balance_loss_clip": 1.03871, + "balance_loss_mlp": 1.05329132, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.538367341661163, + "language_loss": 0.79631573, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81867594, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 2134, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_clip": 1.02393413, + "balance_loss_mlp": 1.05650806, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.033800341734765, + "language_loss": 0.83015293, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85233301, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2135, + "time_per_iteration": 2.4743056297302246 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_clip": 1.03842425, + "balance_loss_mlp": 1.05173945, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.9021762622464853, + "language_loss": 0.77293968, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79520839, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.140625, + "step": 2136, + "time_per_iteration": 2.4412362575531006 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 0.99983084, + "balance_loss_mlp": 1.01248765, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8943310105061408, + "language_loss": 0.59115362, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61168963, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.39257812, + "step": 2137, + "time_per_iteration": 3.2407264709472656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.03207743, + "balance_loss_mlp": 1.04970789, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4694787743163404, + "language_loss": 0.81923193, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84140748, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.15625, + "step": 2138, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01170897, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.05353928, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.804990264663657, + "language_loss": 0.79418135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81644583, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.171875, + "step": 2139, + "time_per_iteration": 2.5321907997131348 + }, + { + "auxiliary_loss_clip": 0.01169458, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02488446, + "balance_loss_mlp": 1.05315363, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1742564972583667, + "language_loss": 0.84761363, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.86976337, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1640625, + "step": 2140, + "time_per_iteration": 2.469543933868408 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.02524316, + "balance_loss_mlp": 1.05079114, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.376703775404894, + "language_loss": 0.85850012, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88059902, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2141, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.0116884, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 1.05059922, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.411777854813752, + "language_loss": 0.68245387, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7046324, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1796875, + "step": 2142, + "time_per_iteration": 2.5327556133270264 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02702951, + "balance_loss_mlp": 1.05430341, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0295098459565692, + "language_loss": 0.82883704, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85104507, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2143, + "time_per_iteration": 4.014873743057251 + }, + { + "auxiliary_loss_clip": 0.01171398, + "auxiliary_loss_mlp": 0.01053828, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.05572712, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7367706894947552, + "language_loss": 0.81788546, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84013772, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.15625, + "step": 2144, + "time_per_iteration": 4.002255439758301 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.04864693, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 3.8817809862500727, + "language_loss": 0.78257203, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80476135, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1640625, + "step": 2145, + "time_per_iteration": 2.4952287673950195 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.02825832, + "balance_loss_mlp": 1.05031526, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.1659704609946897, + "language_loss": 0.82622325, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84839463, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 2146, + "time_per_iteration": 2.4898681640625 + }, + { + "auxiliary_loss_clip": 0.01165601, + "auxiliary_loss_mlp": 0.01051615, + "balance_loss_clip": 1.02959681, + "balance_loss_mlp": 1.05129158, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.717320122986492, + "language_loss": 0.70446974, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72664189, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 2147, + "time_per_iteration": 2.5964484214782715 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.03069305, + "balance_loss_mlp": 1.05166912, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.443887417123452, + "language_loss": 0.71685153, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73902297, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.140625, + "step": 2148, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.02681684, + "balance_loss_mlp": 1.05413008, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.666574129953403, + "language_loss": 0.79379606, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81592482, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1171875, + "step": 2149, + "time_per_iteration": 2.495443820953369 + }, + { + "auxiliary_loss_clip": 0.01167493, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.05306077, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.1379132369478313, + "language_loss": 0.76475441, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78689277, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2150, + "time_per_iteration": 2.524395704269409 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.05094671, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.417935370690141, + "language_loss": 0.70735669, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72954249, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1484375, + "step": 2151, + "time_per_iteration": 2.5213184356689453 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02502, + "balance_loss_mlp": 1.05457592, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9866869590783298, + "language_loss": 0.84050369, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86260849, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2152, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.01167192, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.05128813, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.226463520109079, + "language_loss": 0.78646791, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2153, + "time_per_iteration": 2.46975040435791 + }, + { + "auxiliary_loss_clip": 0.01163518, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.05069268, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.482522823334948, + "language_loss": 0.80135351, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82351738, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2154, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01170487, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.05522227, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0988715261553774, + "language_loss": 0.83128881, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85350406, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2155, + "time_per_iteration": 2.476299524307251 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.02961075, + "balance_loss_mlp": 1.05010283, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.842594732542889, + "language_loss": 0.76062953, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2156, + "time_per_iteration": 2.6024632453918457 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.05121815, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.9934077258859366, + "language_loss": 0.86546719, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88760191, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.109375, + "step": 2157, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.03745282, + "balance_loss_mlp": 1.04796743, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.339899004847696, + "language_loss": 0.80590808, + "learning_rate": 3.896537778333651e-06, + "loss": 0.82814288, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2158, + "time_per_iteration": 2.5332443714141846 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.03510916, + "balance_loss_mlp": 1.05294585, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.254282600322574, + "language_loss": 0.74603379, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76828635, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2159, + "time_per_iteration": 2.469038963317871 + }, + { + "auxiliary_loss_clip": 0.01158286, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0323323, + "balance_loss_mlp": 1.04777908, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.1260113568932746, + "language_loss": 0.8227706, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84488213, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2160, + "time_per_iteration": 2.516723155975342 + }, + { + "auxiliary_loss_clip": 0.01159917, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.05318654, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6308358458278915, + "language_loss": 0.81877828, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8408196, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2161, + "time_per_iteration": 2.4677131175994873 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01056265, + "balance_loss_clip": 1.03479493, + "balance_loss_mlp": 1.05035043, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.2782308625037686, + "language_loss": 0.82592809, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84810847, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2162, + "time_per_iteration": 2.5702993869781494 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.03454113, + "balance_loss_mlp": 1.04993796, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.619296712638915, + "language_loss": 0.72762972, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7498191, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2163, + "time_per_iteration": 2.4531478881835938 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.03183889, + "balance_loss_mlp": 1.05107188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0773433264348435, + "language_loss": 0.81498116, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83718032, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2164, + "time_per_iteration": 2.497072458267212 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02381766, + "balance_loss_mlp": 1.05107093, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2040156749440523, + "language_loss": 0.72564822, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.7477203, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.1328125, + "step": 2165, + "time_per_iteration": 2.515026807785034 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.05286038, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8786436091142913, + "language_loss": 0.74697578, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76912814, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1484375, + "step": 2166, + "time_per_iteration": 2.5301709175109863 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.05156064, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5708346768068926, + "language_loss": 0.83053899, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85266984, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 2167, + "time_per_iteration": 2.632035732269287 + }, + { + "auxiliary_loss_clip": 0.01163335, + "auxiliary_loss_mlp": 0.01060394, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.05201721, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.9158171210349437, + "language_loss": 0.83286303, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85510027, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2168, + "time_per_iteration": 2.4766387939453125 + }, + { + "auxiliary_loss_clip": 0.0116626, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.03075409, + "balance_loss_mlp": 1.05258656, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.304013454801214, + "language_loss": 0.80027354, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82245922, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.140625, + "step": 2169, + "time_per_iteration": 2.5185413360595703 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02329922, + "balance_loss_mlp": 1.05451608, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 4.565704621626811, + "language_loss": 0.66456163, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68668246, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2170, + "time_per_iteration": 2.5556788444519043 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.02397573, + "balance_loss_mlp": 1.05294132, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.848772151746763, + "language_loss": 0.66935396, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69145024, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2171, + "time_per_iteration": 2.553422451019287 + }, + { + "auxiliary_loss_clip": 0.01164709, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.05211711, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9479804069383955, + "language_loss": 0.71952963, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74165899, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2172, + "time_per_iteration": 2.4801840782165527 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02508521, + "balance_loss_mlp": 1.05435848, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8616776845407013, + "language_loss": 0.75547618, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77752787, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0703125, + "step": 2173, + "time_per_iteration": 2.4639194011688232 + }, + { + "auxiliary_loss_clip": 0.01165867, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.03406715, + "balance_loss_mlp": 1.05319107, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.777389952877741, + "language_loss": 0.70484382, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72705513, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.125, + "step": 2174, + "time_per_iteration": 2.4914908409118652 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01011158, + "balance_loss_clip": 1.0087378, + "balance_loss_mlp": 1.0165, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8022263951171452, + "language_loss": 0.59071571, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61137754, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.38671875, + "step": 2175, + "time_per_iteration": 3.244633913040161 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.05474329, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.247504257537708, + "language_loss": 0.79946023, + "learning_rate": 3.894300581166417e-06, + "loss": 0.8216269, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1015625, + "step": 2176, + "time_per_iteration": 2.439883232116699 + }, + { + "auxiliary_loss_clip": 0.01163907, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.02806199, + "balance_loss_mlp": 1.05234194, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8562517641565577, + "language_loss": 0.74595284, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76809454, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2177, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.024472, + "balance_loss_mlp": 1.05222929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.128567307625778, + "language_loss": 0.81855309, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84065676, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1171875, + "step": 2178, + "time_per_iteration": 2.458812713623047 + }, + { + "auxiliary_loss_clip": 0.01166111, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.02695179, + "balance_loss_mlp": 1.05466795, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.66972533149016, + "language_loss": 0.74942935, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77156973, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.109375, + "step": 2179, + "time_per_iteration": 2.4679782390594482 + }, + { + "auxiliary_loss_clip": 0.01161603, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.03079903, + "balance_loss_mlp": 1.05280709, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0560779031919636, + "language_loss": 0.84319234, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86531377, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0859375, + "step": 2180, + "time_per_iteration": 2.567873477935791 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.05700839, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.214126283525484, + "language_loss": 0.8987745, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92098325, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2181, + "time_per_iteration": 2.4802486896514893 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.0557189, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8993602522657917, + "language_loss": 0.68657839, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70867944, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.109375, + "step": 2182, + "time_per_iteration": 2.460148572921753 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.05504203, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.6442759836393277, + "language_loss": 0.78435183, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.125, + "step": 2183, + "time_per_iteration": 2.5462143421173096 + }, + { + "auxiliary_loss_clip": 0.01162472, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02375841, + "balance_loss_mlp": 1.05238128, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.788927255894662, + "language_loss": 0.85543215, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87749588, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2184, + "time_per_iteration": 3.8904993534088135 + }, + { + "auxiliary_loss_clip": 0.01165934, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.0529201, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.505672435211917, + "language_loss": 0.82206696, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84420282, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1328125, + "step": 2185, + "time_per_iteration": 5.3855485916137695 + }, + { + "auxiliary_loss_clip": 0.01168066, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.03323543, + "balance_loss_mlp": 1.05440092, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.0294565364346235, + "language_loss": 0.73037684, + "learning_rate": 3.893047635600818e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1328125, + "step": 2186, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01048235, + "balance_loss_clip": 1.02601433, + "balance_loss_mlp": 1.05449164, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.0525608711513614, + "language_loss": 0.80174023, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82388186, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.109375, + "step": 2187, + "time_per_iteration": 2.463906764984131 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01005839, + "balance_loss_clip": 1.00344312, + "balance_loss_mlp": 1.01508641, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8435449169341035, + "language_loss": 0.58977342, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61036563, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.3828125, + "step": 2188, + "time_per_iteration": 3.1052041053771973 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.05918622, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.1443848583942846, + "language_loss": 0.74199927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76420546, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2189, + "time_per_iteration": 2.5137264728546143 + }, + { + "auxiliary_loss_clip": 0.01166605, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.02894759, + "balance_loss_mlp": 1.05678558, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.7642431940848833, + "language_loss": 0.72561657, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74777287, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2190, + "time_per_iteration": 2.5053412914276123 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_clip": 1.03094649, + "balance_loss_mlp": 1.05706906, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 8.700182749243472, + "language_loss": 0.74395585, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76616025, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1171875, + "step": 2191, + "time_per_iteration": 2.507687568664551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.05689156, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0250128968483403, + "language_loss": 0.79286075, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1171875, + "step": 2192, + "time_per_iteration": 2.5068893432617188 + }, + { + "auxiliary_loss_clip": 0.01168449, + "auxiliary_loss_mlp": 0.01053422, + "balance_loss_clip": 1.03290629, + "balance_loss_mlp": 1.05564141, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9285179647135495, + "language_loss": 0.84827602, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87049472, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.125, + "step": 2193, + "time_per_iteration": 2.456409215927124 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01002145, + "balance_loss_clip": 0.99976075, + "balance_loss_mlp": 1.0179081, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7727203010194038, + "language_loss": 0.54049635, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56107628, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.37890625, + "step": 2194, + "time_per_iteration": 3.0569794178009033 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.02566671, + "balance_loss_mlp": 1.05514359, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7688784093808256, + "language_loss": 0.72086227, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74298465, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2195, + "time_per_iteration": 2.527435541152954 + }, + { + "auxiliary_loss_clip": 0.01173804, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.05663633, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.7664998702658374, + "language_loss": 0.78195536, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2196, + "time_per_iteration": 2.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02714252, + "balance_loss_mlp": 1.05638218, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.1663119445052295, + "language_loss": 0.74861938, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77078474, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1328125, + "step": 2197, + "time_per_iteration": 2.489504814147949 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.05543399, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4378795089069674, + "language_loss": 0.8011694, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82332516, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2198, + "time_per_iteration": 2.437718391418457 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04173732, + "balance_loss_mlp": 1.05483699, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4514815632850038, + "language_loss": 0.82552117, + "learning_rate": 3.891408075291425e-06, + "loss": 0.847803, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2199, + "time_per_iteration": 2.47356915473938 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.02724838, + "balance_loss_mlp": 1.05458844, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.465688895758548, + "language_loss": 0.68963099, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71178007, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2200, + "time_per_iteration": 2.5828843116760254 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.05397916, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.591612522060186, + "language_loss": 0.84600091, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86822116, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2201, + "time_per_iteration": 2.5546202659606934 + }, + { + "auxiliary_loss_clip": 0.01168797, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.05466592, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.901870031688447, + "language_loss": 0.86978126, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89200991, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2202, + "time_per_iteration": 2.509300470352173 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02364576, + "balance_loss_mlp": 1.05389142, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.3614014237187084, + "language_loss": 0.72746712, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74954367, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.109375, + "step": 2203, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01167891, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.05453348, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.5436302639516, + "language_loss": 0.73248756, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75473428, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1328125, + "step": 2204, + "time_per_iteration": 2.5298051834106445 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.05558085, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.7540271848273767, + "language_loss": 0.78627133, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80849254, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2205, + "time_per_iteration": 2.5343189239501953 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01053788, + "balance_loss_clip": 1.033391, + "balance_loss_mlp": 1.05560231, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.234297854715259, + "language_loss": 0.78748876, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80969107, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2206, + "time_per_iteration": 2.473229169845581 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.05758011, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.3028539815574494, + "language_loss": 0.73993444, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76210898, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.125, + "step": 2207, + "time_per_iteration": 2.479421854019165 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.02822387, + "balance_loss_mlp": 1.05323017, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.4105539478543454, + "language_loss": 0.84151787, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86361182, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0703125, + "step": 2208, + "time_per_iteration": 2.501969337463379 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.05553222, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9362156368998853, + "language_loss": 0.85323346, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87540877, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2209, + "time_per_iteration": 2.509761333465576 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.05628705, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.055387861012722, + "language_loss": 0.81545013, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83761609, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2210, + "time_per_iteration": 2.4920527935028076 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01003994, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.02205014, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7616894664797615, + "language_loss": 0.57984382, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004886, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3828125, + "step": 2211, + "time_per_iteration": 3.1735260486602783 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.00312614, + "balance_loss_mlp": 1.02081084, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7970523185699088, + "language_loss": 0.55364317, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57429421, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.38671875, + "step": 2212, + "time_per_iteration": 3.142056465148926 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.056463, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 4.2694742121271645, + "language_loss": 0.74779308, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77002227, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2213, + "time_per_iteration": 2.4599013328552246 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.02889609, + "balance_loss_mlp": 1.05235839, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.0343460890824927, + "language_loss": 0.79269958, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81476456, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0625, + "step": 2214, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.03062189, + "balance_loss_mlp": 1.05593503, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.0399610331480407, + "language_loss": 0.69410872, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71628523, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2215, + "time_per_iteration": 2.5798754692077637 + }, + { + "auxiliary_loss_clip": 0.01166771, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.05576539, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.4518621177772175, + "language_loss": 0.81136751, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83350337, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2216, + "time_per_iteration": 2.49601674079895 + }, + { + "auxiliary_loss_clip": 0.01166215, + "auxiliary_loss_mlp": 0.01057297, + "balance_loss_clip": 1.03668606, + "balance_loss_mlp": 1.05610895, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.5729384628186307, + "language_loss": 0.87350845, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89574361, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1015625, + "step": 2217, + "time_per_iteration": 2.435224771499634 + }, + { + "auxiliary_loss_clip": 0.01165439, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.02716112, + "balance_loss_mlp": 1.05609739, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6393181601709057, + "language_loss": 0.73460543, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2218, + "time_per_iteration": 2.4984188079833984 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02401495, + "balance_loss_mlp": 1.05406141, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.803125703936159, + "language_loss": 0.87483871, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89692807, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2219, + "time_per_iteration": 2.4761111736297607 + }, + { + "auxiliary_loss_clip": 0.01166927, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.03198123, + "balance_loss_mlp": 1.05804753, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5604165479120375, + "language_loss": 0.77241862, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79459906, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0859375, + "step": 2220, + "time_per_iteration": 2.5172770023345947 + }, + { + "auxiliary_loss_clip": 0.01158357, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.05065227, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.752699726256429, + "language_loss": 0.79361391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81564224, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.078125, + "step": 2221, + "time_per_iteration": 2.4729459285736084 + }, + { + "auxiliary_loss_clip": 0.01056162, + "auxiliary_loss_mlp": 0.01004009, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.01797867, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9620212456786271, + "language_loss": 0.6890744, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70967615, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.3828125, + "step": 2222, + "time_per_iteration": 2.9102694988250732 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.02885592, + "balance_loss_mlp": 1.05645049, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8990549263762904, + "language_loss": 0.66966134, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69180298, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1015625, + "step": 2223, + "time_per_iteration": 2.4860363006591797 + }, + { + "auxiliary_loss_clip": 0.01162257, + "auxiliary_loss_mlp": 0.01055999, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.05173874, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.0940561003244738, + "language_loss": 0.82572883, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84791142, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2224, + "time_per_iteration": 2.453310966491699 + }, + { + "auxiliary_loss_clip": 0.01167505, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.05410361, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0797940389634624, + "language_loss": 0.66006851, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68221462, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2225, + "time_per_iteration": 2.505760669708252 + }, + { + "auxiliary_loss_clip": 0.01164479, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03491461, + "balance_loss_mlp": 1.05366707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2490181158076545, + "language_loss": 0.89484501, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91703951, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2226, + "time_per_iteration": 3.827432632446289 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.03132319, + "balance_loss_mlp": 1.05492473, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0692514385202947, + "language_loss": 0.73874348, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76091796, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1171875, + "step": 2227, + "time_per_iteration": 5.469221115112305 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.02971888, + "balance_loss_mlp": 1.05582607, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.597241668203809, + "language_loss": 0.8519839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87414384, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2228, + "time_per_iteration": 2.449289560317993 + }, + { + "auxiliary_loss_clip": 0.01162737, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0275687, + "balance_loss_mlp": 1.05501461, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.700498827765594, + "language_loss": 0.8100034, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83210707, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2229, + "time_per_iteration": 2.454185962677002 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.05576682, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.350850930683171, + "language_loss": 0.73814881, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76035661, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2230, + "time_per_iteration": 2.538679838180542 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.03551102, + "balance_loss_mlp": 1.0541544, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 8.27737726970052, + "language_loss": 0.79914325, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82135391, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1171875, + "step": 2231, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05716896, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9954658779127024, + "language_loss": 0.72341192, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74558049, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2232, + "time_per_iteration": 2.5315330028533936 + }, + { + "auxiliary_loss_clip": 0.01169038, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.02664888, + "balance_loss_mlp": 1.05505097, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.53729194427275, + "language_loss": 0.65508974, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67725778, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2233, + "time_per_iteration": 2.480006694793701 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.02687883, + "balance_loss_mlp": 1.05011904, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 4.541384002557222, + "language_loss": 0.81492066, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8370105, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1171875, + "step": 2234, + "time_per_iteration": 2.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.03466105, + "balance_loss_mlp": 1.05424869, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9481483268780417, + "language_loss": 0.82361299, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84581894, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1171875, + "step": 2235, + "time_per_iteration": 2.4478979110717773 + }, + { + "auxiliary_loss_clip": 0.0116322, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.03378713, + "balance_loss_mlp": 1.05170834, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6568048404288893, + "language_loss": 0.86399209, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88618279, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2236, + "time_per_iteration": 2.534761428833008 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.05506372, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5401183277834882, + "language_loss": 0.76936173, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79150563, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2237, + "time_per_iteration": 2.454881191253662 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.02974725, + "balance_loss_mlp": 1.05312407, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.976295310563951, + "language_loss": 0.78737688, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80954033, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2238, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03057706, + "balance_loss_mlp": 1.0530107, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3078790626960246, + "language_loss": 0.67977941, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70191795, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.09375, + "step": 2239, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.05296254, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.7482132203763245, + "language_loss": 0.81085825, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83300203, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2240, + "time_per_iteration": 2.458702802658081 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.05302262, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.775061814751768, + "language_loss": 0.77491653, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79708141, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2241, + "time_per_iteration": 2.4814610481262207 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.0265156, + "balance_loss_mlp": 1.05368328, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.708340264075402, + "language_loss": 0.83106101, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85311437, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0625, + "step": 2242, + "time_per_iteration": 2.531010627746582 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.05465889, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.594763109819468, + "language_loss": 0.64927268, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67146331, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.125, + "step": 2243, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.05214143, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.6702464572283469, + "language_loss": 0.72275442, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74479383, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2244, + "time_per_iteration": 2.572275161743164 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.03339577, + "balance_loss_mlp": 1.0510093, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6251739599249553, + "language_loss": 0.86419517, + "learning_rate": 3.88550929909221e-06, + "loss": 0.886334, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1015625, + "step": 2245, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.0534606, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.986035604010071, + "language_loss": 0.79054129, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81263721, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2246, + "time_per_iteration": 2.521500825881958 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01006436, + "balance_loss_clip": 1.00378919, + "balance_loss_mlp": 1.01705432, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7498554605470831, + "language_loss": 0.60597092, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6265747, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.3671875, + "step": 2247, + "time_per_iteration": 3.209567070007324 + }, + { + "auxiliary_loss_clip": 0.0117261, + "auxiliary_loss_mlp": 0.01058621, + "balance_loss_clip": 1.03629315, + "balance_loss_mlp": 1.05673957, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.930333372025318, + "language_loss": 0.81250268, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83481503, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2248, + "time_per_iteration": 2.5274717807769775 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.02503014, + "balance_loss_mlp": 1.0515008, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.1598236051462383, + "language_loss": 0.77427459, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79628301, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0625, + "step": 2249, + "time_per_iteration": 2.475325345993042 + }, + { + "auxiliary_loss_clip": 0.01161564, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03477216, + "balance_loss_mlp": 1.05408192, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4620260499768896, + "language_loss": 0.84598488, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86813927, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0703125, + "step": 2250, + "time_per_iteration": 2.5579018592834473 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.02586317, + "balance_loss_mlp": 1.05311561, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9830962049575767, + "language_loss": 0.8213973, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84349537, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1015625, + "step": 2251, + "time_per_iteration": 2.459254503250122 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.03144348, + "balance_loss_mlp": 1.05075097, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.6927381248236872, + "language_loss": 0.85981321, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88194835, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.09375, + "step": 2252, + "time_per_iteration": 2.508246421813965 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.0100648, + "balance_loss_clip": 1.00398886, + "balance_loss_mlp": 1.01368976, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7502755191421498, + "language_loss": 0.61736262, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63793439, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.37109375, + "step": 2253, + "time_per_iteration": 3.1357691287994385 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.0357219, + "balance_loss_mlp": 1.05454588, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.033104819567641, + "language_loss": 0.89383745, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91603261, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2254, + "time_per_iteration": 2.4983997344970703 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.02786362, + "balance_loss_mlp": 1.05202925, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.0851597725495843, + "language_loss": 0.84461302, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86678338, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.140625, + "step": 2255, + "time_per_iteration": 2.4466094970703125 + }, + { + "auxiliary_loss_clip": 0.01159898, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.05059099, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.8290739743459126, + "language_loss": 0.7493006, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77136725, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.09375, + "step": 2256, + "time_per_iteration": 2.49464750289917 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.05080438, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.107811937736733, + "language_loss": 0.83023381, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85237086, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 2257, + "time_per_iteration": 2.4069128036499023 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.03272712, + "balance_loss_mlp": 1.05211377, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.2162023158830655, + "language_loss": 0.82266492, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84489298, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.15625, + "step": 2258, + "time_per_iteration": 2.4187939167022705 + }, + { + "auxiliary_loss_clip": 0.01161942, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.02890849, + "balance_loss_mlp": 1.05117583, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3528312033652434, + "language_loss": 0.82556236, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84770095, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.109375, + "step": 2259, + "time_per_iteration": 2.4182498455047607 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0313561, + "balance_loss_mlp": 1.05370188, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9951846625000045, + "language_loss": 0.73434722, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75647175, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0859375, + "step": 2260, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01160597, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.03389525, + "balance_loss_mlp": 1.05164778, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.6406640236232826, + "language_loss": 0.75450647, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77664864, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2261, + "time_per_iteration": 2.4773809909820557 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02887654, + "balance_loss_mlp": 1.05329657, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.9984757312973846, + "language_loss": 0.63141024, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1171875, + "step": 2262, + "time_per_iteration": 2.5423331260681152 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.598036861128168, + "language_loss": 0.82363462, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84568739, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2263, + "time_per_iteration": 2.472050428390503 + }, + { + "auxiliary_loss_clip": 0.01166147, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.05306447, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7757676532235749, + "language_loss": 0.87984985, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90212959, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1328125, + "step": 2264, + "time_per_iteration": 2.4857943058013916 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.02700329, + "balance_loss_mlp": 1.05115557, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 2.9904691281538693, + "language_loss": 0.7103616, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73248434, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2265, + "time_per_iteration": 2.428753614425659 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02187812, + "balance_loss_mlp": 1.05258036, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.049615390343222, + "language_loss": 0.66760135, + "learning_rate": 3.882766051566027e-06, + "loss": 0.689623, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2266, + "time_per_iteration": 2.4990508556365967 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.05220675, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7538751206895893, + "language_loss": 0.76376909, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78596711, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2267, + "time_per_iteration": 2.485907554626465 + }, + { + "auxiliary_loss_clip": 0.0116058, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.05051804, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.002795226804265, + "language_loss": 0.81781995, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83988714, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1015625, + "step": 2268, + "time_per_iteration": 3.890936851501465 + }, + { + "auxiliary_loss_clip": 0.01161581, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.0542717, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.071095479959133, + "language_loss": 0.76078153, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78285825, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2269, + "time_per_iteration": 4.03081202507019 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.02612138, + "balance_loss_mlp": 1.05518508, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.039865659244694, + "language_loss": 0.80856502, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83068502, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2270, + "time_per_iteration": 2.431426525115967 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.05227089, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.715242097566801, + "language_loss": 0.75720018, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77940053, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.125, + "step": 2271, + "time_per_iteration": 2.440701961517334 + }, + { + "auxiliary_loss_clip": 0.01161613, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.05171776, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2487551674667565, + "language_loss": 0.80084515, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82298499, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1015625, + "step": 2272, + "time_per_iteration": 2.4305598735809326 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01011943, + "balance_loss_clip": 1.00937963, + "balance_loss_mlp": 1.01818228, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7032235049035468, + "language_loss": 0.60682511, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62750536, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.37890625, + "step": 2273, + "time_per_iteration": 3.1601598262786865 + }, + { + "auxiliary_loss_clip": 0.01158579, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.05170178, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7482195510707834, + "language_loss": 0.77978206, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80184555, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2274, + "time_per_iteration": 2.448374032974243 + }, + { + "auxiliary_loss_clip": 0.01163563, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.0536654, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.152740159395537, + "language_loss": 0.78435361, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80645764, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2275, + "time_per_iteration": 2.4761078357696533 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02003431, + "balance_loss_mlp": 1.05312562, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.879456622893362, + "language_loss": 0.81436646, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0859375, + "step": 2276, + "time_per_iteration": 2.453623056411743 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.03082716, + "balance_loss_mlp": 1.05443549, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7308629221608576, + "language_loss": 0.69347179, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71571183, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.15625, + "step": 2277, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01162034, + "auxiliary_loss_mlp": 0.01051118, + "balance_loss_clip": 1.03056657, + "balance_loss_mlp": 1.05136657, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.1796180013972384, + "language_loss": 0.80487186, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2278, + "time_per_iteration": 2.478158950805664 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.02788246, + "balance_loss_mlp": 1.05658543, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.2222454745927744, + "language_loss": 0.74863833, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2279, + "time_per_iteration": 2.5930991172790527 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.05331779, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.3437990696634916, + "language_loss": 0.76614088, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78833258, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1328125, + "step": 2280, + "time_per_iteration": 2.527808666229248 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.0272876, + "balance_loss_mlp": 1.04930711, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7035700975942816, + "language_loss": 0.79808372, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82011348, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.078125, + "step": 2281, + "time_per_iteration": 2.5486884117126465 + }, + { + "auxiliary_loss_clip": 0.01167882, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.03618872, + "balance_loss_mlp": 1.05488086, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.697672260024265, + "language_loss": 0.83955061, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86178571, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2282, + "time_per_iteration": 2.4731719493865967 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.05028629, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.8152250836173982, + "language_loss": 0.73821312, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76034367, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0703125, + "step": 2283, + "time_per_iteration": 2.5041310787200928 + }, + { + "auxiliary_loss_clip": 0.01161767, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02991104, + "balance_loss_mlp": 1.05546188, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.845966051455131, + "language_loss": 0.83875519, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86085427, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2284, + "time_per_iteration": 2.489459991455078 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.05016088, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.9356174938409232, + "language_loss": 0.74778754, + "learning_rate": 3.880256934503974e-06, + "loss": 0.76991928, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 2285, + "time_per_iteration": 2.542114734649658 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.02680647, + "balance_loss_mlp": 1.05192137, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.7476035379248278, + "language_loss": 0.74461651, + "learning_rate": 3.880124162414689e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0703125, + "step": 2286, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01165905, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.02407491, + "balance_loss_mlp": 1.05466056, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4229799840234936, + "language_loss": 0.86074513, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88285446, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2287, + "time_per_iteration": 2.5267093181610107 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.05281329, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.1686670508464783, + "language_loss": 0.68304116, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70512998, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.09375, + "step": 2288, + "time_per_iteration": 2.6589176654815674 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.03410959, + "balance_loss_mlp": 1.05404294, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 3.8263362529629896, + "language_loss": 0.87251699, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89468765, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2289, + "time_per_iteration": 2.4834415912628174 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.02735722, + "balance_loss_mlp": 1.0496552, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.801469753111382, + "language_loss": 0.74045157, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76245451, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2290, + "time_per_iteration": 2.4901175498962402 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01003238, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.01923215, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7021136788609851, + "language_loss": 0.5160234, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53662229, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.375, + "step": 2291, + "time_per_iteration": 3.1141176223754883 + }, + { + "auxiliary_loss_clip": 0.01158988, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05007744, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.104305633549435, + "language_loss": 0.7090801, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73116004, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.09375, + "step": 2292, + "time_per_iteration": 2.5535075664520264 + }, + { + "auxiliary_loss_clip": 0.01160381, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.05272794, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.835181445389694, + "language_loss": 0.79774708, + "learning_rate": 3.879192761826071e-06, + "loss": 0.81979978, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.078125, + "step": 2293, + "time_per_iteration": 2.4434242248535156 + }, + { + "auxiliary_loss_clip": 0.01159833, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_clip": 1.03065419, + "balance_loss_mlp": 1.0489893, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.8100583587938566, + "language_loss": 0.78455698, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80665964, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2294, + "time_per_iteration": 2.5279018878936768 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.02679634, + "balance_loss_mlp": 1.05053687, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.844605455172751, + "language_loss": 0.80448526, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82649422, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0546875, + "step": 2295, + "time_per_iteration": 2.46471905708313 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.02604938, + "balance_loss_mlp": 1.04990947, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905081494696058, + "language_loss": 0.78027165, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80231106, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0859375, + "step": 2296, + "time_per_iteration": 2.489081859588623 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03356612, + "balance_loss_mlp": 1.05272174, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8577842545242083, + "language_loss": 0.78632545, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80845773, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2297, + "time_per_iteration": 2.479617118835449 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.03187263, + "balance_loss_mlp": 1.05133367, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.1383795008624946, + "language_loss": 0.69005466, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71213776, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2298, + "time_per_iteration": 2.4894726276397705 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03075552, + "balance_loss_mlp": 1.05287397, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7932718261070644, + "language_loss": 0.86958891, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89172935, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2299, + "time_per_iteration": 2.4343175888061523 + }, + { + "auxiliary_loss_clip": 0.01158457, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.03221393, + "balance_loss_mlp": 1.05076718, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.6477233854648015, + "language_loss": 0.7542398, + "learning_rate": 3.878257869538267e-06, + "loss": 0.7763505, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.078125, + "step": 2300, + "time_per_iteration": 2.5398943424224854 + }, + { + "auxiliary_loss_clip": 0.01160789, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.03088915, + "balance_loss_mlp": 1.05409729, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.6084363319634956, + "language_loss": 0.82612532, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8482368, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0625, + "step": 2301, + "time_per_iteration": 2.435732841491699 + }, + { + "auxiliary_loss_clip": 0.01155849, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.02461374, + "balance_loss_mlp": 1.04986811, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.0886382571109987, + "language_loss": 0.85972583, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8817209, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0625, + "step": 2302, + "time_per_iteration": 2.504011869430542 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.01009124, + "balance_loss_clip": 1.00688314, + "balance_loss_mlp": 1.0189817, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7554932596602951, + "language_loss": 0.65648526, + "learning_rate": 3.877856132957667e-06, + "loss": 0.677131, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.36328125, + "step": 2303, + "time_per_iteration": 3.2563750743865967 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 1.05022073, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.0694955360834912, + "language_loss": 0.78234196, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80427974, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2304, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01991165, + "balance_loss_mlp": 1.05225086, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.838077080535218, + "language_loss": 0.77824223, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8002485, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.09375, + "step": 2305, + "time_per_iteration": 2.468254804611206 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02732027, + "balance_loss_mlp": 1.04923558, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 3.2063314507866947, + "language_loss": 0.87484217, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89684129, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2306, + "time_per_iteration": 2.4840242862701416 + }, + { + "auxiliary_loss_clip": 0.0105475, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.01749539, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8793018572536648, + "language_loss": 0.59049129, + "learning_rate": 3.877319487288387e-06, + "loss": 0.6111598, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.37304688, + "step": 2307, + "time_per_iteration": 3.1098880767822266 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.05279016, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7539420555734833, + "language_loss": 0.79683769, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81892413, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2308, + "time_per_iteration": 2.5119385719299316 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1876242684272342, + "language_loss": 0.78186178, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80388331, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2309, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.05319023, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9671645437439387, + "language_loss": 0.67473733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69683367, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2310, + "time_per_iteration": 5.331011056900024 + }, + { + "auxiliary_loss_clip": 0.01159907, + "auxiliary_loss_mlp": 0.01051301, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.0511837, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8339330301012977, + "language_loss": 0.83962393, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86173606, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0859375, + "step": 2311, + "time_per_iteration": 2.4287211894989014 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.05262017, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.2677083380951997, + "language_loss": 0.81788063, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83999264, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2312, + "time_per_iteration": 2.5261852741241455 + }, + { + "auxiliary_loss_clip": 0.01165344, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.05353236, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.1868066623869202, + "language_loss": 0.86641061, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88851982, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1171875, + "step": 2313, + "time_per_iteration": 2.491847515106201 + }, + { + "auxiliary_loss_clip": 0.0116138, + "auxiliary_loss_mlp": 0.01052556, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.05377281, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.199884337980412, + "language_loss": 0.79629153, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8184309, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2314, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.0116003, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.02668452, + "balance_loss_mlp": 1.05130863, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.30759926974498, + "language_loss": 0.86246645, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88453007, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0859375, + "step": 2315, + "time_per_iteration": 2.4236056804656982 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_clip": 1.03192866, + "balance_loss_mlp": 1.05146074, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.162038852448813, + "language_loss": 0.77074778, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79286408, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.078125, + "step": 2316, + "time_per_iteration": 2.4574813842773438 + }, + { + "auxiliary_loss_clip": 0.01157842, + "auxiliary_loss_mlp": 0.01058721, + "balance_loss_clip": 1.03733492, + "balance_loss_mlp": 1.05045736, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6719823206156588, + "language_loss": 0.76972795, + "learning_rate": 3.875972890659349e-06, + "loss": 0.7918936, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.078125, + "step": 2317, + "time_per_iteration": 2.448096990585327 + }, + { + "auxiliary_loss_clip": 0.01162372, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.02993095, + "balance_loss_mlp": 1.05272126, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.004328537884534, + "language_loss": 0.80159998, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82372165, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2318, + "time_per_iteration": 2.5152556896209717 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.00027394, + "balance_loss_mlp": 1.01373565, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8654041988705774, + "language_loss": 0.59008324, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61061358, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.3671875, + "step": 2319, + "time_per_iteration": 3.101083993911743 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01053809, + "balance_loss_clip": 1.03365111, + "balance_loss_mlp": 1.05213809, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2307371496542356, + "language_loss": 0.65372109, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67588449, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2320, + "time_per_iteration": 2.580655336380005 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.0507009, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6249908375914148, + "language_loss": 0.70695353, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72896051, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2321, + "time_per_iteration": 2.4594380855560303 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.01055348, + "balance_loss_clip": 1.0345459, + "balance_loss_mlp": 1.04883599, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 43.01057366099128, + "language_loss": 0.86161166, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88375086, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2322, + "time_per_iteration": 2.4912750720977783 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.04840016, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.7187096085030618, + "language_loss": 0.6682983, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69038773, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2323, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0116621, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.03068066, + "balance_loss_mlp": 1.05250573, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0268681764850665, + "language_loss": 0.89011461, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91228795, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2324, + "time_per_iteration": 2.458172559738159 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01056649, + "balance_loss_clip": 1.03626466, + "balance_loss_mlp": 1.04949069, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 4.4201897818475775, + "language_loss": 0.70700991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7291714, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2325, + "time_per_iteration": 2.4608585834503174 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01055057, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 1.05384755, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8512202881484865, + "language_loss": 0.81165004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2326, + "time_per_iteration": 2.474729537963867 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 1.05092621, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.806872548679543, + "language_loss": 0.88955671, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9115777, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0390625, + "step": 2327, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.03790593, + "balance_loss_mlp": 1.05021226, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.4750320646827992, + "language_loss": 0.85236871, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87450516, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2328, + "time_per_iteration": 2.4724884033203125 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.05120313, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.653872228613324, + "language_loss": 0.74084997, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76292944, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2329, + "time_per_iteration": 2.5238442420959473 + }, + { + "auxiliary_loss_clip": 0.01156952, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.03070641, + "balance_loss_mlp": 1.04729962, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 1.840223813628444, + "language_loss": 0.77969897, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80177212, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2330, + "time_per_iteration": 2.468606948852539 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02738333, + "balance_loss_mlp": 1.0495398, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.6993483396219506, + "language_loss": 0.72030222, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74232423, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0625, + "step": 2331, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.05008936, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.585347596838152, + "language_loss": 0.72609055, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74813151, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2332, + "time_per_iteration": 2.4244635105133057 + }, + { + "auxiliary_loss_clip": 0.01047328, + "auxiliary_loss_mlp": 0.01002801, + "balance_loss_clip": 1.00048828, + "balance_loss_mlp": 1.01059568, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8290843953692559, + "language_loss": 0.56071591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58121729, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.3671875, + "step": 2333, + "time_per_iteration": 2.8934712409973145 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.05001664, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7851490004805215, + "language_loss": 0.82529652, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2334, + "time_per_iteration": 2.495786428451538 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.0293529, + "balance_loss_mlp": 1.05012262, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8251700419130605, + "language_loss": 0.81237197, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83440989, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2335, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01163426, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.02829087, + "balance_loss_mlp": 1.05328035, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.83822789048078, + "language_loss": 0.82159901, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8437475, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.1015625, + "step": 2336, + "time_per_iteration": 2.4732770919799805 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.02526581, + "balance_loss_mlp": 1.05202782, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8506426201256954, + "language_loss": 0.80081403, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82283843, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2337, + "time_per_iteration": 2.4599671363830566 + }, + { + "auxiliary_loss_clip": 0.01155582, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02911353, + "balance_loss_mlp": 1.04861474, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.2474896580124963, + "language_loss": 0.7927807, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81482291, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2338, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.05685067, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.148660398501072, + "language_loss": 0.79827893, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82039273, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2339, + "time_per_iteration": 2.4672555923461914 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.0527122, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.7979240482106922, + "language_loss": 0.6582588, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68040884, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2340, + "time_per_iteration": 2.614506483078003 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.02936912, + "balance_loss_mlp": 1.05242825, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.5431372850663334, + "language_loss": 0.78670812, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80874836, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2341, + "time_per_iteration": 2.4420077800750732 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_clip": 1.03058767, + "balance_loss_mlp": 1.05246425, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 11.570217446637303, + "language_loss": 0.80154169, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82360554, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2342, + "time_per_iteration": 2.4961190223693848 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.05673313, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9358851833739352, + "language_loss": 0.77974075, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80176884, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2343, + "time_per_iteration": 2.479679584503174 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0025475, + "balance_loss_mlp": 1.01255798, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8341361150670269, + "language_loss": 0.6155628, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63610566, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3671875, + "step": 2344, + "time_per_iteration": 3.048691987991333 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.02309346, + "balance_loss_mlp": 1.04911709, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.886714907416039, + "language_loss": 0.64591062, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6678347, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0234375, + "step": 2345, + "time_per_iteration": 2.509552240371704 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.023803, + "balance_loss_mlp": 1.05019534, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.502398022219224, + "language_loss": 0.736485, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7585566, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1171875, + "step": 2346, + "time_per_iteration": 2.4962430000305176 + }, + { + "auxiliary_loss_clip": 0.01160187, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.05144429, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.4324488814849703, + "language_loss": 0.77868927, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80075288, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2347, + "time_per_iteration": 2.4663050174713135 + }, + { + "auxiliary_loss_clip": 0.01155281, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02852941, + "balance_loss_mlp": 1.04918981, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7514485331985392, + "language_loss": 0.76446569, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78648651, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2348, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.02346444, + "balance_loss_mlp": 1.0508523, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8350283773112115, + "language_loss": 0.8686446, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89063132, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2349, + "time_per_iteration": 2.4446985721588135 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.05220377, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.7285118920158233, + "language_loss": 0.8895669, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9115696, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2350, + "time_per_iteration": 2.49725341796875 + }, + { + "auxiliary_loss_clip": 0.0115943, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.02542889, + "balance_loss_mlp": 1.05281878, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8656651100546833, + "language_loss": 0.814816, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83687228, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0703125, + "step": 2351, + "time_per_iteration": 3.941416025161743 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.05032706, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6782915885510286, + "language_loss": 0.82935351, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85132694, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0703125, + "step": 2352, + "time_per_iteration": 5.431722640991211 + }, + { + "auxiliary_loss_clip": 0.01047453, + "auxiliary_loss_mlp": 0.01006216, + "balance_loss_clip": 1.00387907, + "balance_loss_mlp": 1.01053333, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.90864091090638, + "language_loss": 0.61894125, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63947791, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.36914062, + "step": 2353, + "time_per_iteration": 3.0071640014648438 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.05024958, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.8535903324814498, + "language_loss": 0.87264848, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89466572, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2354, + "time_per_iteration": 2.4613726139068604 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02607965, + "balance_loss_mlp": 1.04953241, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9651075901387003, + "language_loss": 0.74872321, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.078125, + "step": 2355, + "time_per_iteration": 2.442379951477051 + }, + { + "auxiliary_loss_clip": 0.01047047, + "auxiliary_loss_mlp": 0.01002716, + "balance_loss_clip": 1.00052261, + "balance_loss_mlp": 1.01023293, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6790475533637321, + "language_loss": 0.5182299, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53872752, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2356, + "time_per_iteration": 2.9892258644104004 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03495359, + "balance_loss_mlp": 1.05080867, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 3.0630792396255053, + "language_loss": 0.70576489, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72786456, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2357, + "time_per_iteration": 2.421844005584717 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.05012453, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8720076771552743, + "language_loss": 0.82205695, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84416115, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.09375, + "step": 2358, + "time_per_iteration": 2.4519011974334717 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.02663624, + "balance_loss_mlp": 1.051018, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 6.439592826280342, + "language_loss": 0.7129705, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73505127, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1015625, + "step": 2359, + "time_per_iteration": 2.4797613620758057 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02374041, + "balance_loss_mlp": 1.04988599, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 5.514404455287625, + "language_loss": 0.76040578, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78239685, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2360, + "time_per_iteration": 2.4538815021514893 + }, + { + "auxiliary_loss_clip": 0.011559, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.02173233, + "balance_loss_mlp": 1.05221295, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.1535632205539135, + "language_loss": 0.8188749, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84085315, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2361, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01152529, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.02826524, + "balance_loss_mlp": 1.04964995, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.775663525053056, + "language_loss": 0.74489617, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76689464, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2362, + "time_per_iteration": 2.530163049697876 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.0265274, + "balance_loss_mlp": 1.05187464, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 4.478599792998506, + "language_loss": 0.73748112, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75952733, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2363, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.02605534, + "balance_loss_mlp": 1.05005693, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8353407682080387, + "language_loss": 0.72971261, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75172973, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2364, + "time_per_iteration": 2.5670576095581055 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.05015445, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 4.452075303519762, + "language_loss": 0.90230036, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92430955, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 1.015625, + "step": 2365, + "time_per_iteration": 2.5130062103271484 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.02735198, + "balance_loss_mlp": 1.04896259, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.956458588852685, + "language_loss": 0.65377176, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67579615, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2366, + "time_per_iteration": 2.5081095695495605 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.03640223, + "balance_loss_mlp": 1.04979372, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.190613479881076, + "language_loss": 0.80414236, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82623357, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2367, + "time_per_iteration": 2.4398317337036133 + }, + { + "auxiliary_loss_clip": 0.01158941, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.05221498, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.898581267606924, + "language_loss": 0.82619941, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84833181, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2368, + "time_per_iteration": 2.512401580810547 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.05165803, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.7587049982231675, + "language_loss": 0.86971414, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89178908, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2369, + "time_per_iteration": 2.444784164428711 + }, + { + "auxiliary_loss_clip": 0.01158905, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.04913163, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4370193327140612, + "language_loss": 0.75704634, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77906322, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2370, + "time_per_iteration": 2.527740240097046 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0384295, + "balance_loss_mlp": 1.04879546, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7319048865171518, + "language_loss": 0.82923144, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85136044, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2371, + "time_per_iteration": 2.4644808769226074 + }, + { + "auxiliary_loss_clip": 0.01158835, + "auxiliary_loss_mlp": 0.01051346, + "balance_loss_clip": 1.03171265, + "balance_loss_mlp": 1.05157602, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.956158386855541, + "language_loss": 0.82575452, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84785628, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0703125, + "step": 2372, + "time_per_iteration": 2.42240047454834 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.03175569, + "balance_loss_mlp": 1.05134308, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.19442784605527, + "language_loss": 0.8396256, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86171949, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2373, + "time_per_iteration": 2.444695472717285 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03294528, + "balance_loss_mlp": 1.05012143, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.034088541649992, + "language_loss": 0.86271042, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88476801, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.046875, + "step": 2374, + "time_per_iteration": 2.428062915802002 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03024197, + "balance_loss_mlp": 1.05125451, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 4.612229602439842, + "language_loss": 0.7919687, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2375, + "time_per_iteration": 2.526838541030884 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.05240607, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.386247922788535, + "language_loss": 0.76400912, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78615618, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2376, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01156552, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.05075741, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.9035160782842753, + "language_loss": 0.93037754, + "learning_rate": 3.867744103671717e-06, + "loss": 0.952438, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2377, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02991319, + "balance_loss_mlp": 1.05085003, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9751577144221115, + "language_loss": 0.91598773, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93807983, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.0703125, + "step": 2378, + "time_per_iteration": 2.558563470840454 + }, + { + "auxiliary_loss_clip": 0.01159674, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.051296, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.745891074970689, + "language_loss": 0.73947102, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76151079, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2379, + "time_per_iteration": 2.511359214782715 + }, + { + "auxiliary_loss_clip": 0.01156473, + "auxiliary_loss_mlp": 0.01056109, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05014992, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8640465231226504, + "language_loss": 0.79013336, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81225914, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2380, + "time_per_iteration": 2.466219663619995 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.05528164, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.3244590707621073, + "language_loss": 0.87958229, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90172088, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.078125, + "step": 2381, + "time_per_iteration": 2.4476850032806396 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.03084123, + "balance_loss_mlp": 1.0517571, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.599935932772449, + "language_loss": 0.76852649, + "learning_rate": 3.867046846740299e-06, + "loss": 0.7906065, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2382, + "time_per_iteration": 2.4389045238494873 + }, + { + "auxiliary_loss_clip": 0.01157847, + "auxiliary_loss_mlp": 0.01053474, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.05068171, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.461149819336849, + "language_loss": 0.76948071, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79159391, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0703125, + "step": 2383, + "time_per_iteration": 2.516038179397583 + }, + { + "auxiliary_loss_clip": 0.01158748, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0299803, + "balance_loss_mlp": 1.05114412, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.169581662424978, + "language_loss": 0.88202822, + "learning_rate": 3.866767448340471e-06, + "loss": 0.9041245, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.078125, + "step": 2384, + "time_per_iteration": 2.42138934135437 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.02780819, + "balance_loss_mlp": 1.05382657, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 4.175812514986151, + "language_loss": 0.79225606, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81439185, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2385, + "time_per_iteration": 2.4439244270324707 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.02692771, + "balance_loss_mlp": 1.04881537, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.9672730758223058, + "language_loss": 0.74989617, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77192366, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2386, + "time_per_iteration": 2.533304214477539 + }, + { + "auxiliary_loss_clip": 0.01159067, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02777338, + "balance_loss_mlp": 1.05180025, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5174427688568626, + "language_loss": 0.78475344, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80681831, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0703125, + "step": 2387, + "time_per_iteration": 2.4568724632263184 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.05092847, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.559937991009886, + "language_loss": 0.82087159, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84299791, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2388, + "time_per_iteration": 2.5136237144470215 + }, + { + "auxiliary_loss_clip": 0.01161514, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.05393136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.471836270672028, + "language_loss": 0.82267237, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84473729, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.078125, + "step": 2389, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03020322, + "balance_loss_mlp": 1.05032301, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.165584666776674, + "language_loss": 0.82654548, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84867263, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2390, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_clip": 1.03406334, + "balance_loss_mlp": 1.0510571, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 3.0575281215329086, + "language_loss": 0.74616158, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76828718, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.078125, + "step": 2391, + "time_per_iteration": 2.5368545055389404 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01014282, + "balance_loss_clip": 1.0121367, + "balance_loss_mlp": 1.01461065, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8732258813949081, + "language_loss": 0.61769497, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63834715, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.36328125, + "step": 2392, + "time_per_iteration": 2.9315476417541504 + }, + { + "auxiliary_loss_clip": 0.01161818, + "auxiliary_loss_mlp": 0.01056559, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.04981267, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.638581894381379, + "language_loss": 0.76172751, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78391123, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2393, + "time_per_iteration": 3.857799530029297 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_clip": 1.02908611, + "balance_loss_mlp": 1.05249143, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8778469598095298, + "language_loss": 0.76782668, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78993082, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2394, + "time_per_iteration": 3.979130983352661 + }, + { + "auxiliary_loss_clip": 0.01158023, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.02836156, + "balance_loss_mlp": 1.05062532, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.605706810552395, + "language_loss": 0.85831755, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88038385, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.078125, + "step": 2395, + "time_per_iteration": 2.652092933654785 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.03040648, + "balance_loss_mlp": 1.05241179, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5230484666362787, + "language_loss": 0.82984561, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85192204, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0546875, + "step": 2396, + "time_per_iteration": 2.4647467136383057 + }, + { + "auxiliary_loss_clip": 0.01152766, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.02691364, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.435366869769497, + "language_loss": 0.82564163, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8476299, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2397, + "time_per_iteration": 2.4151055812835693 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.05216622, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6104109289920625, + "language_loss": 0.79418427, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81627429, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2398, + "time_per_iteration": 2.4470388889312744 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.05359757, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.7500042291552433, + "language_loss": 0.64847696, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67058688, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2399, + "time_per_iteration": 2.5123891830444336 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.05306005, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.4896130870957418, + "language_loss": 0.82329226, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84531689, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2400, + "time_per_iteration": 2.4825797080993652 + }, + { + "auxiliary_loss_clip": 0.01162323, + "auxiliary_loss_mlp": 0.01052957, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.053689, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.540874002782335, + "language_loss": 0.74606794, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76822078, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0859375, + "step": 2401, + "time_per_iteration": 2.507983684539795 + }, + { + "auxiliary_loss_clip": 0.01156636, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.0269084, + "balance_loss_mlp": 1.05109596, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7568662987329828, + "language_loss": 0.80577219, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82780313, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2402, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.01156436, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.02880669, + "balance_loss_mlp": 1.05137098, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.1115432529250753, + "language_loss": 0.84918672, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87124002, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.046875, + "step": 2403, + "time_per_iteration": 2.4267525672912598 + }, + { + "auxiliary_loss_clip": 0.01155438, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.03312445, + "balance_loss_mlp": 1.04934669, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.423742001713465, + "language_loss": 0.70017314, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72226501, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2404, + "time_per_iteration": 2.487827777862549 + }, + { + "auxiliary_loss_clip": 0.01151274, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.0473218, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.368746641876408, + "language_loss": 0.72847003, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75046992, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0390625, + "step": 2405, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02879858, + "balance_loss_mlp": 1.04891181, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2064790582144473, + "language_loss": 0.73115766, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75316191, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2406, + "time_per_iteration": 2.4501168727874756 + }, + { + "auxiliary_loss_clip": 0.0115439, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.03161645, + "balance_loss_mlp": 1.04889357, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.997473868200426, + "language_loss": 0.75399184, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77606416, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2407, + "time_per_iteration": 2.482008934020996 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.02418649, + "balance_loss_mlp": 1.04607177, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.6082248834480546, + "language_loss": 0.79472804, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81668091, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0625, + "step": 2408, + "time_per_iteration": 2.4657323360443115 + }, + { + "auxiliary_loss_clip": 0.01155517, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.05088127, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1979655558708893, + "language_loss": 0.82594806, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84802014, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.046875, + "step": 2409, + "time_per_iteration": 2.450345039367676 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_clip": 1.03411365, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.954409921875598, + "language_loss": 0.74561608, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7677173, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0625, + "step": 2410, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02883387, + "balance_loss_mlp": 1.04858971, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.3844352739280597, + "language_loss": 0.81135416, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83336866, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0546875, + "step": 2411, + "time_per_iteration": 2.4708898067474365 + }, + { + "auxiliary_loss_clip": 0.0115486, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.03403103, + "balance_loss_mlp": 1.05123138, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.954560524414831, + "language_loss": 0.69816971, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7202487, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2412, + "time_per_iteration": 2.5614776611328125 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.05100143, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.1541085269745803, + "language_loss": 0.77347231, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79548067, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2413, + "time_per_iteration": 2.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01049286, + "auxiliary_loss_mlp": 0.0100403, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.01294982, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152840666775347, + "language_loss": 0.58887923, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60941237, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.36328125, + "step": 2414, + "time_per_iteration": 2.9752402305603027 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 0.99943656, + "balance_loss_mlp": 1.01240802, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8348908268898737, + "language_loss": 0.6218617, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64236534, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.36328125, + "step": 2415, + "time_per_iteration": 3.039710521697998 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_clip": 1.02637458, + "balance_loss_mlp": 1.04699647, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.8743578134099377, + "language_loss": 0.72001135, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74199259, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2416, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.0104556, + "auxiliary_loss_mlp": 0.01005813, + "balance_loss_clip": 1.00379848, + "balance_loss_mlp": 1.01002693, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.711670432605859, + "language_loss": 0.60392165, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62443542, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.35546875, + "step": 2417, + "time_per_iteration": 3.0824739933013916 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.03639972, + "balance_loss_mlp": 1.04795754, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9144560714513363, + "language_loss": 0.79237175, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8144896, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2418, + "time_per_iteration": 2.564497947692871 + }, + { + "auxiliary_loss_clip": 0.01150145, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.03267837, + "balance_loss_mlp": 1.04712582, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8755047341617508, + "language_loss": 0.72032261, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74234051, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2419, + "time_per_iteration": 2.457617998123169 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.0335021, + "balance_loss_mlp": 1.05042267, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3659429121693525, + "language_loss": 0.90125811, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92333627, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.03125, + "step": 2420, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.04868603, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.2940003535379057, + "language_loss": 0.83309549, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85520703, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0546875, + "step": 2421, + "time_per_iteration": 2.441432476043701 + }, + { + "auxiliary_loss_clip": 0.01153189, + "auxiliary_loss_mlp": 0.01053683, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.04684627, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.6167157199382733, + "language_loss": 0.81511533, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83718407, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2422, + "time_per_iteration": 2.473010540008545 + }, + { + "auxiliary_loss_clip": 0.01046424, + "auxiliary_loss_mlp": 0.01017838, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.01065397, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9226410759759552, + "language_loss": 0.63245702, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65309966, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.35742188, + "step": 2423, + "time_per_iteration": 3.0516433715820312 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.02756512, + "balance_loss_mlp": 1.05096769, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7656587875688796, + "language_loss": 0.8267172, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84872198, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.046875, + "step": 2424, + "time_per_iteration": 2.4918792247772217 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03071296, + "balance_loss_mlp": 1.04970837, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.0603730404595915, + "language_loss": 0.79317909, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81520677, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2425, + "time_per_iteration": 2.4607083797454834 + }, + { + "auxiliary_loss_clip": 0.01155851, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.05136847, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4026453111661703, + "language_loss": 0.83269531, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85473925, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2426, + "time_per_iteration": 2.4615883827209473 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.02420735, + "balance_loss_mlp": 1.05100346, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.78851961601388, + "language_loss": 0.86878085, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89073801, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0234375, + "step": 2427, + "time_per_iteration": 2.46846866607666 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.05060291, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9424277979169204, + "language_loss": 0.66795039, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69001138, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2428, + "time_per_iteration": 2.4277987480163574 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.02870345, + "balance_loss_mlp": 1.05036306, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7447652065053452, + "language_loss": 0.8363744, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2429, + "time_per_iteration": 2.5208661556243896 + }, + { + "auxiliary_loss_clip": 0.01152615, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.04804671, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.723947749216575, + "language_loss": 0.78811824, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8101294, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2430, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0115835, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03014231, + "balance_loss_mlp": 1.0529238, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.3723861833809767, + "language_loss": 0.83178174, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85385835, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2431, + "time_per_iteration": 2.468472480773926 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.03244448, + "balance_loss_mlp": 1.05131185, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7402379411604871, + "language_loss": 0.78777766, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80989814, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.078125, + "step": 2432, + "time_per_iteration": 2.4618513584136963 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.03158331, + "balance_loss_mlp": 1.04917812, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9105383938395448, + "language_loss": 0.79940903, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82146859, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2433, + "time_per_iteration": 2.4901435375213623 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.01051119, + "balance_loss_clip": 1.03149712, + "balance_loss_mlp": 1.05186844, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.8984055506020234, + "language_loss": 0.78421938, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80625868, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2434, + "time_per_iteration": 3.833007335662842 + }, + { + "auxiliary_loss_clip": 0.01046525, + "auxiliary_loss_mlp": 0.01005945, + "balance_loss_clip": 1.00356054, + "balance_loss_mlp": 1.01038933, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8674820067375166, + "language_loss": 0.58373666, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60426134, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.36132812, + "step": 2435, + "time_per_iteration": 5.911077499389648 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.02620411, + "balance_loss_mlp": 1.04662895, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.2832294661951753, + "language_loss": 0.88395989, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90589368, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2436, + "time_per_iteration": 2.440303325653076 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.05032742, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.0196076648737, + "language_loss": 0.74832988, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7703594, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2437, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.04798007, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.346268485469047, + "language_loss": 0.73932636, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76131272, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2438, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.02310383, + "balance_loss_mlp": 1.05231857, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.8289443089735578, + "language_loss": 0.74791402, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76987189, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2439, + "time_per_iteration": 2.4596338272094727 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.02872145, + "balance_loss_mlp": 1.04913521, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.106046924266039, + "language_loss": 0.74542844, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742673, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 2440, + "time_per_iteration": 2.613889217376709 + }, + { + "auxiliary_loss_clip": 0.01146734, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02964425, + "balance_loss_mlp": 1.04660702, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6151911954653986, + "language_loss": 0.83047861, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242939, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2441, + "time_per_iteration": 2.508570432662964 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.0275681, + "balance_loss_mlp": 1.04952955, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 3.362343971731744, + "language_loss": 0.71562135, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73766863, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2442, + "time_per_iteration": 2.4903416633605957 + }, + { + "auxiliary_loss_clip": 0.01160717, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.0510819, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.2762909335645043, + "language_loss": 0.80804002, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83006966, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2443, + "time_per_iteration": 2.424539089202881 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.0504694, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 2.077049554342068, + "language_loss": 0.8297509, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85179389, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2444, + "time_per_iteration": 2.4937214851379395 + }, + { + "auxiliary_loss_clip": 0.01154781, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.05025554, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.763635964291881, + "language_loss": 0.71218902, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73422623, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2445, + "time_per_iteration": 2.491645336151123 + }, + { + "auxiliary_loss_clip": 0.01045345, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.00942683, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8232649654452494, + "language_loss": 0.63138294, + "learning_rate": 3.857965866494923e-06, + "loss": 0.6521225, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.359375, + "step": 2446, + "time_per_iteration": 2.9610531330108643 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.02355385, + "balance_loss_mlp": 1.05348802, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8119571313268434, + "language_loss": 0.74937665, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7713967, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2447, + "time_per_iteration": 2.547112226486206 + }, + { + "auxiliary_loss_clip": 0.0115445, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02838051, + "balance_loss_mlp": 1.04998112, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.0554455972062744, + "language_loss": 0.85722244, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87923658, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2448, + "time_per_iteration": 2.519530773162842 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.0081377, + "balance_loss_mlp": 1.00952029, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7649510042513386, + "language_loss": 0.56836212, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58892155, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.359375, + "step": 2449, + "time_per_iteration": 3.0049068927764893 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.04850447, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.900224172693126, + "language_loss": 0.85544562, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87738931, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2450, + "time_per_iteration": 2.5826945304870605 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.05074143, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.029178420182481, + "language_loss": 0.74693608, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76899183, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2451, + "time_per_iteration": 2.4345250129699707 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.02092934, + "balance_loss_mlp": 1.04758763, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6073898366987713, + "language_loss": 0.82240498, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8442679, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2452, + "time_per_iteration": 2.468869924545288 + }, + { + "auxiliary_loss_clip": 0.01154267, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02032936, + "balance_loss_mlp": 1.05154371, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.7191329381743174, + "language_loss": 0.74021572, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76214325, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2453, + "time_per_iteration": 2.433424472808838 + }, + { + "auxiliary_loss_clip": 0.01154761, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.03048682, + "balance_loss_mlp": 1.04918802, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.171465059586897, + "language_loss": 0.76326835, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78531623, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2454, + "time_per_iteration": 2.419368028640747 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.04922831, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.006370127686132, + "language_loss": 0.8301537, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85209435, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2455, + "time_per_iteration": 2.426819324493408 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.04846048, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.442844218228049, + "language_loss": 0.83938581, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8613984, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.09375, + "step": 2456, + "time_per_iteration": 2.525296926498413 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.02459574, + "balance_loss_mlp": 1.04980254, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8839437807359896, + "language_loss": 0.84325618, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86520827, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2457, + "time_per_iteration": 2.471068859100342 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.04932761, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9862753985638202, + "language_loss": 0.75645256, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77835512, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2458, + "time_per_iteration": 2.44146466255188 + }, + { + "auxiliary_loss_clip": 0.01160318, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.0284996, + "balance_loss_mlp": 1.05119324, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.405388225865701, + "language_loss": 0.83817005, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86026746, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2459, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.0489651, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.6666731923680733, + "language_loss": 0.75856471, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78047681, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2460, + "time_per_iteration": 2.4294657707214355 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.05102873, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6904429322803973, + "language_loss": 0.81591463, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83791113, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0703125, + "step": 2461, + "time_per_iteration": 2.4993178844451904 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.05356562, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2471604819605036, + "language_loss": 0.65689576, + "learning_rate": 3.855650475213761e-06, + "loss": 0.678958, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2462, + "time_per_iteration": 2.4197235107421875 + }, + { + "auxiliary_loss_clip": 0.0115574, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.02929282, + "balance_loss_mlp": 1.05148113, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4717210360784851, + "language_loss": 0.67368174, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69572735, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0390625, + "step": 2463, + "time_per_iteration": 2.774268865585327 + }, + { + "auxiliary_loss_clip": 0.01154664, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 1.04978383, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.177919724516607, + "language_loss": 0.76567936, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78772676, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2464, + "time_per_iteration": 2.4522674083709717 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.03089297, + "balance_loss_mlp": 1.05009413, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.623144605896263, + "language_loss": 0.79623306, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81824923, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0234375, + "step": 2465, + "time_per_iteration": 2.4946794509887695 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02543747, + "balance_loss_mlp": 1.0522809, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.8838905575360925, + "language_loss": 0.76230991, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78436887, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2466, + "time_per_iteration": 2.4722483158111572 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01613474, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8013334536894682, + "language_loss": 0.60022712, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62095666, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.3671875, + "step": 2467, + "time_per_iteration": 3.0702927112579346 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.05059397, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.3345318496369405, + "language_loss": 0.87671721, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89869595, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.046875, + "step": 2468, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.0322901, + "balance_loss_mlp": 1.05078602, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 4.884804263226826, + "language_loss": 0.75884396, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78094912, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2469, + "time_per_iteration": 2.4750967025756836 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.03425384, + "balance_loss_mlp": 1.04954958, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.457578452134473, + "language_loss": 0.76183128, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78390741, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2470, + "time_per_iteration": 2.4312937259674072 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.05050206, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9398758609720104, + "language_loss": 0.72121894, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74320322, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2471, + "time_per_iteration": 2.519866466522217 + }, + { + "auxiliary_loss_clip": 0.01160204, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.0272181, + "balance_loss_mlp": 1.0499022, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.11598070664324, + "language_loss": 0.89739621, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91947466, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1015625, + "step": 2472, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_clip": 1.030123, + "balance_loss_mlp": 1.05059123, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 4.013793804030176, + "language_loss": 0.80734539, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82939184, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2473, + "time_per_iteration": 2.4329466819763184 + }, + { + "auxiliary_loss_clip": 0.0115911, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.05129409, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5981192604624526, + "language_loss": 0.77540123, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79762381, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2474, + "time_per_iteration": 2.453432083129883 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.04955983, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8103491271764227, + "language_loss": 0.82315612, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84531218, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0859375, + "step": 2475, + "time_per_iteration": 2.4591174125671387 + }, + { + "auxiliary_loss_clip": 0.01157844, + "auxiliary_loss_mlp": 0.01058234, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.05399168, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9240192853863896, + "language_loss": 0.80811602, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0390625, + "step": 2476, + "time_per_iteration": 3.810553789138794 + }, + { + "auxiliary_loss_clip": 0.01148934, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.03467607, + "balance_loss_mlp": 1.05016851, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.8396010916090604, + "language_loss": 0.77889222, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80091178, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98828125, + "step": 2477, + "time_per_iteration": 4.031312942504883 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.01018076, + "balance_loss_clip": 1.01581085, + "balance_loss_mlp": 1.01302671, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8050876444063699, + "language_loss": 0.60130364, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197196, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.35742188, + "step": 2478, + "time_per_iteration": 3.1073787212371826 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.05078554, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.232556799389181, + "language_loss": 0.70951897, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7315169, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2479, + "time_per_iteration": 2.475215435028076 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.0313679, + "balance_loss_mlp": 1.04886127, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5896653051626852, + "language_loss": 0.80748487, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2480, + "time_per_iteration": 2.4618492126464844 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.02838397, + "balance_loss_mlp": 1.05017209, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.4101793906634894, + "language_loss": 0.84132183, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2481, + "time_per_iteration": 2.437391519546509 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03046227, + "balance_loss_mlp": 1.04808569, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 3.194199563979109, + "language_loss": 0.77347398, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79551256, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.046875, + "step": 2482, + "time_per_iteration": 2.4710068702697754 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01939583, + "balance_loss_mlp": 1.05186439, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.968394626295353, + "language_loss": 0.78719991, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80922014, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2483, + "time_per_iteration": 2.5075182914733887 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.04774714, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.642113570978582, + "language_loss": 0.70521605, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72709513, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 1.0, + "step": 2484, + "time_per_iteration": 2.4810657501220703 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02049971, + "balance_loss_mlp": 1.04769683, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.5518326423103654, + "language_loss": 0.84396368, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86592442, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0703125, + "step": 2485, + "time_per_iteration": 2.47004771232605 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.04906201, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1854599778658663, + "language_loss": 0.84902173, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87102306, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2486, + "time_per_iteration": 2.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.04672825, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.4579579723442855, + "language_loss": 0.74329305, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76516318, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 1.015625, + "step": 2487, + "time_per_iteration": 2.436316967010498 + }, + { + "auxiliary_loss_clip": 0.01148703, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.02934861, + "balance_loss_mlp": 1.04707325, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.1423480103066375, + "language_loss": 0.71837348, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74034101, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2488, + "time_per_iteration": 2.649794816970825 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02780962, + "balance_loss_mlp": 1.04946375, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.5167610907777513, + "language_loss": 0.70519507, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72722483, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0703125, + "step": 2489, + "time_per_iteration": 2.416708469390869 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.02637911, + "balance_loss_mlp": 1.04785299, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 6.063777716142612, + "language_loss": 0.81789696, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83988589, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2490, + "time_per_iteration": 2.433284282684326 + }, + { + "auxiliary_loss_clip": 0.0115747, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.02357852, + "balance_loss_mlp": 1.05097246, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.781748843431282, + "language_loss": 0.79878485, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82078111, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2491, + "time_per_iteration": 2.616642475128174 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.02491403, + "balance_loss_mlp": 1.04683256, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.263792295832721, + "language_loss": 0.90779251, + "learning_rate": 3.851260581551727e-06, + "loss": 0.9297986, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.078125, + "step": 2492, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.04883122, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.7210225604175116, + "language_loss": 0.79162109, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8137427, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2493, + "time_per_iteration": 2.4228014945983887 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02299631, + "balance_loss_mlp": 1.04643607, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.8095511996528297, + "language_loss": 0.80186284, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82380015, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2494, + "time_per_iteration": 2.4774162769317627 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.04731214, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9697458415941205, + "language_loss": 0.65825832, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68021536, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.015625, + "step": 2495, + "time_per_iteration": 2.87758207321167 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 0.99992257, + "balance_loss_mlp": 1.01668406, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 1.1924806916138095, + "language_loss": 0.59488082, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61543506, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2496, + "time_per_iteration": 3.0807061195373535 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0468092, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.296903755979897, + "language_loss": 0.65457296, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0546875, + "step": 2497, + "time_per_iteration": 2.4403655529022217 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.03021121, + "balance_loss_mlp": 1.05125117, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4500790349521295, + "language_loss": 0.75247943, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77452457, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2498, + "time_per_iteration": 2.5286927223205566 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04910398, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.1627878003877257, + "language_loss": 0.72073609, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74272656, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2499, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.03001857, + "balance_loss_mlp": 1.04765654, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.7935878764928508, + "language_loss": 0.7195605, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74158442, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2500, + "time_per_iteration": 2.5504300594329834 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.04960001, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.491284008551419, + "language_loss": 0.64973354, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67184103, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.03125, + "step": 2501, + "time_per_iteration": 2.587292432785034 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03940582, + "balance_loss_mlp": 1.04861319, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0240839018319, + "language_loss": 0.83043593, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85256565, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2502, + "time_per_iteration": 2.470350980758667 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01050766, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.04702473, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.3174234065433597, + "language_loss": 0.77197748, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2503, + "time_per_iteration": 2.6598432064056396 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.04901898, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1270494317377007, + "language_loss": 0.85432625, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87628305, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2504, + "time_per_iteration": 2.7323355674743652 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04855871, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6383963769174188, + "language_loss": 0.83226919, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85418344, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.015625, + "step": 2505, + "time_per_iteration": 2.4866323471069336 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.04672468, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.268670074130615, + "language_loss": 0.7639147, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78588635, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0390625, + "step": 2506, + "time_per_iteration": 2.4266390800476074 + }, + { + "auxiliary_loss_clip": 0.01156061, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.04987144, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 4.189374997051622, + "language_loss": 0.76202261, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78401417, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2507, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.02519584, + "balance_loss_mlp": 1.04538798, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.4120052182021503, + "language_loss": 0.69041586, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71230054, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2508, + "time_per_iteration": 2.4462738037109375 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.02870142, + "balance_loss_mlp": 1.05190873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8904486830015208, + "language_loss": 0.77516425, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79719174, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2509, + "time_per_iteration": 2.47723126411438 + }, + { + "auxiliary_loss_clip": 0.01160822, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.0307281, + "balance_loss_mlp": 1.05027628, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.607083522867767, + "language_loss": 0.80497003, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82710105, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1015625, + "step": 2510, + "time_per_iteration": 2.4445176124572754 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.0336144, + "balance_loss_mlp": 1.05078745, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.033214689307001, + "language_loss": 0.73913604, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76124156, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2511, + "time_per_iteration": 2.4372222423553467 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.04880548, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.077792778828972, + "language_loss": 0.6935091, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71543926, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.03125, + "step": 2512, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01154623, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.05130434, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 3.0703205269170364, + "language_loss": 0.73833334, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76034975, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.03125, + "step": 2513, + "time_per_iteration": 2.5560262203216553 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 0.99995023, + "balance_loss_mlp": 1.01588845, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8742342414591, + "language_loss": 0.64759278, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6681329, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.36328125, + "step": 2514, + "time_per_iteration": 3.0147135257720947 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.02588964, + "balance_loss_mlp": 1.04910421, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.6951033245551597, + "language_loss": 0.73257691, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75452447, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2515, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04967082, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8637331039353218, + "language_loss": 0.76990104, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79184443, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2516, + "time_per_iteration": 2.4672725200653076 + }, + { + "auxiliary_loss_clip": 0.01049641, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.01351547, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.745436195681612, + "language_loss": 0.54673135, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56726485, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36132812, + "step": 2517, + "time_per_iteration": 3.0677855014801025 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02126849, + "balance_loss_mlp": 1.04780149, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.2326216563166983, + "language_loss": 0.78515786, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8070842, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2518, + "time_per_iteration": 3.8305110931396484 + }, + { + "auxiliary_loss_clip": 0.01159011, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.05163026, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1364726943924772, + "language_loss": 0.70153689, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72361219, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2519, + "time_per_iteration": 3.9920616149902344 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.04812384, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9802508383478334, + "language_loss": 0.79219216, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81415105, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2520, + "time_per_iteration": 2.4853925704956055 + }, + { + "auxiliary_loss_clip": 0.01155647, + "auxiliary_loss_mlp": 0.01050752, + "balance_loss_clip": 1.03090394, + "balance_loss_mlp": 1.05067897, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.075013959426641, + "language_loss": 0.74324691, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76531088, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2521, + "time_per_iteration": 2.6154706478118896 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.05273759, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.7623729867934737, + "language_loss": 0.81996739, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84203184, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.078125, + "step": 2522, + "time_per_iteration": 2.4873530864715576 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01001752, + "balance_loss_clip": 0.99982071, + "balance_loss_mlp": 1.01252866, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.824359498034346, + "language_loss": 0.57915509, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59966022, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36328125, + "step": 2523, + "time_per_iteration": 2.998990774154663 + }, + { + "auxiliary_loss_clip": 0.01153336, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.03147376, + "balance_loss_mlp": 1.04972816, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.970015434384356, + "language_loss": 0.7485956, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77063495, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2524, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.0115237, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.0488894, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8388163356316347, + "language_loss": 0.74780655, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76977956, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2525, + "time_per_iteration": 2.431143283843994 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.02432156, + "balance_loss_mlp": 1.05145812, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8962457769996104, + "language_loss": 0.79644465, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81845224, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2526, + "time_per_iteration": 2.5167391300201416 + }, + { + "auxiliary_loss_clip": 0.01151222, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.0271492, + "balance_loss_mlp": 1.05228162, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8025865198757494, + "language_loss": 0.84928662, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87124068, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9921875, + "step": 2527, + "time_per_iteration": 2.4550719261169434 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02321947, + "balance_loss_mlp": 1.04876995, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2810224367730156, + "language_loss": 0.69326001, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71518755, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.03125, + "step": 2528, + "time_per_iteration": 2.610042095184326 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0288415, + "balance_loss_mlp": 1.05137038, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.490892546855648, + "language_loss": 0.86502308, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88703495, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2529, + "time_per_iteration": 2.4695634841918945 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.04683101, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.8772276619965056, + "language_loss": 0.83002013, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85188091, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2530, + "time_per_iteration": 2.476238489151001 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.02550209, + "balance_loss_mlp": 1.04987955, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.344030506991615, + "language_loss": 0.80540878, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82738853, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2531, + "time_per_iteration": 2.443617105484009 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.03138137, + "balance_loss_mlp": 1.04895151, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0816362099746017, + "language_loss": 0.79241651, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81440473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.0078125, + "step": 2532, + "time_per_iteration": 2.5071239471435547 + }, + { + "auxiliary_loss_clip": 0.0115001, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.02694106, + "balance_loss_mlp": 1.04952455, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.8298502444413876, + "language_loss": 0.87712961, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89909488, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2533, + "time_per_iteration": 2.5262463092803955 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02932572, + "balance_loss_mlp": 1.04766071, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.2606742211331556, + "language_loss": 0.79057097, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81255192, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.03125, + "step": 2534, + "time_per_iteration": 2.4421815872192383 + }, + { + "auxiliary_loss_clip": 0.01156406, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.02177238, + "balance_loss_mlp": 1.04847312, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.8989864742133933, + "language_loss": 0.76862979, + "learning_rate": 3.844858260274702e-06, + "loss": 0.7906096, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2535, + "time_per_iteration": 2.4193530082702637 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02153718, + "balance_loss_mlp": 1.04885459, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.234687708038525, + "language_loss": 0.78185135, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80381751, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0703125, + "step": 2536, + "time_per_iteration": 2.478066921234131 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.03305459, + "balance_loss_mlp": 1.05067229, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.124557148089124, + "language_loss": 0.74979979, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77181387, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2537, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01152934, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02545929, + "balance_loss_mlp": 1.04965043, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.005826380833244, + "language_loss": 0.77631724, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79828459, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2538, + "time_per_iteration": 2.527730941772461 + }, + { + "auxiliary_loss_clip": 0.01147714, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.02308786, + "balance_loss_mlp": 1.04806781, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6961003069906246, + "language_loss": 0.89707708, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9189558, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.99609375, + "step": 2539, + "time_per_iteration": 2.485410451889038 + }, + { + "auxiliary_loss_clip": 0.01151642, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.05028892, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1834515010765627, + "language_loss": 0.93514961, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95709753, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.015625, + "step": 2540, + "time_per_iteration": 2.5399627685546875 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0266571, + "balance_loss_mlp": 1.04625463, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.9271166035098393, + "language_loss": 0.75039941, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77228808, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2541, + "time_per_iteration": 2.516559362411499 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.03025603, + "balance_loss_mlp": 1.04787207, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7480154890803248, + "language_loss": 0.81308234, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83504558, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.99609375, + "step": 2542, + "time_per_iteration": 2.4681694507598877 + }, + { + "auxiliary_loss_clip": 0.01150381, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.03213799, + "balance_loss_mlp": 1.04772067, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.009812895323552, + "language_loss": 0.77568293, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79769456, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2543, + "time_per_iteration": 2.4899120330810547 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.04692626, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.3128696364379935, + "language_loss": 0.86483204, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675725, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2544, + "time_per_iteration": 2.4774844646453857 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.03287029, + "balance_loss_mlp": 1.04675508, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0966566192890106, + "language_loss": 0.8228749, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84493077, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0546875, + "step": 2545, + "time_per_iteration": 2.4526925086975098 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.04802954, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.540509049886226, + "language_loss": 0.70711339, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72905338, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2546, + "time_per_iteration": 2.5009732246398926 + }, + { + "auxiliary_loss_clip": 0.01152354, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.02423596, + "balance_loss_mlp": 1.04967904, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5770850469719229, + "language_loss": 0.77521312, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2547, + "time_per_iteration": 2.6822421550750732 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.01047861, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 1.04904902, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0900989153424976, + "language_loss": 0.73985445, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76185566, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2548, + "time_per_iteration": 2.59080171585083 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03158915, + "balance_loss_mlp": 1.04806828, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.499185349529517, + "language_loss": 0.80589813, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82791066, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2549, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01048572, + "balance_loss_clip": 1.02951026, + "balance_loss_mlp": 1.04750037, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.687491024735964, + "language_loss": 0.74760693, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76959932, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2550, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01153822, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.02035773, + "balance_loss_mlp": 1.04903841, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.205632522725416, + "language_loss": 0.76839805, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79033309, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2551, + "time_per_iteration": 2.468886375427246 + }, + { + "auxiliary_loss_clip": 0.01045401, + "auxiliary_loss_mlp": 0.01020401, + "balance_loss_clip": 1.01873255, + "balance_loss_mlp": 1.0102303, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9464853846906186, + "language_loss": 0.56666422, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58732224, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.3515625, + "step": 2552, + "time_per_iteration": 3.0059380531311035 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.04793155, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.2490122092843947, + "language_loss": 0.88505352, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703511, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2553, + "time_per_iteration": 2.4523322582244873 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.02858269, + "balance_loss_mlp": 1.04771137, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8003580088176259, + "language_loss": 0.78462374, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80663538, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2554, + "time_per_iteration": 2.48526668548584 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03212881, + "balance_loss_mlp": 1.04941773, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.4926146542113763, + "language_loss": 0.78344929, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80551672, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2555, + "time_per_iteration": 2.4687228202819824 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053536, + "balance_loss_clip": 1.03543973, + "balance_loss_mlp": 1.04890609, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6634961059278193, + "language_loss": 0.76901627, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.7910428, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2556, + "time_per_iteration": 2.5006635189056396 + }, + { + "auxiliary_loss_clip": 0.01145988, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.02362633, + "balance_loss_mlp": 1.04657805, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8623555031997667, + "language_loss": 0.89489496, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9167788, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2557, + "time_per_iteration": 2.4434657096862793 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.0263536, + "balance_loss_mlp": 1.04834831, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.0348980361104587, + "language_loss": 0.7119934, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73397368, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2558, + "time_per_iteration": 2.490226984024048 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.04888546, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2935483083292705, + "language_loss": 0.92370701, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94570613, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2559, + "time_per_iteration": 3.885131597518921 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_clip": 1.03331971, + "balance_loss_mlp": 1.05068171, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 5.140445938018919, + "language_loss": 0.63637704, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65846419, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2560, + "time_per_iteration": 5.343889236450195 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02725124, + "balance_loss_mlp": 1.04950392, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8613162525264346, + "language_loss": 0.88230681, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90431374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2561, + "time_per_iteration": 2.4648611545562744 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.02433765, + "balance_loss_mlp": 1.0477581, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8458305826175445, + "language_loss": 0.82909077, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85096323, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 2562, + "time_per_iteration": 2.4327874183654785 + }, + { + "auxiliary_loss_clip": 0.01160792, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.02614117, + "balance_loss_mlp": 1.05274105, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8513620412223286, + "language_loss": 0.74713194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7692166, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.078125, + "step": 2563, + "time_per_iteration": 2.4246435165405273 + }, + { + "auxiliary_loss_clip": 0.01152598, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.04708791, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 4.308351588789828, + "language_loss": 0.75896233, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2564, + "time_per_iteration": 2.5528018474578857 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.03233564, + "balance_loss_mlp": 1.04782677, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.9915177170702032, + "language_loss": 0.70825899, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73026133, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2565, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.04728019, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.308308002927142, + "language_loss": 0.71535969, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73736489, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0234375, + "step": 2566, + "time_per_iteration": 2.498033285140991 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.02379811, + "balance_loss_mlp": 1.04381752, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7584763964610812, + "language_loss": 0.85129261, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87315124, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0, + "step": 2567, + "time_per_iteration": 2.46708083152771 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.0491097, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.4904852760766127, + "language_loss": 0.78025472, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80226958, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2568, + "time_per_iteration": 2.476029634475708 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.01974905, + "balance_loss_mlp": 1.04835856, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.967048361077992, + "language_loss": 0.70183134, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72373807, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2569, + "time_per_iteration": 2.4566383361816406 + }, + { + "auxiliary_loss_clip": 0.011445, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.04563344, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7954711420319855, + "language_loss": 0.76502788, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78690279, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2570, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.04811645, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 7.2402617485583525, + "language_loss": 0.77214551, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7940833, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2571, + "time_per_iteration": 2.4532222747802734 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.04835165, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.949392721600437, + "language_loss": 0.82254899, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84454399, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2572, + "time_per_iteration": 2.4919703006744385 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.02838445, + "balance_loss_mlp": 1.04827368, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.621727953381826, + "language_loss": 0.90506172, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92705798, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2573, + "time_per_iteration": 2.4679911136627197 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.03724563, + "balance_loss_mlp": 1.04919529, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7899098306423509, + "language_loss": 0.70378339, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72587025, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2574, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01150284, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 1.04641008, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.761755301023602, + "language_loss": 0.82718939, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84917951, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 2575, + "time_per_iteration": 2.4515788555145264 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.0456214, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.21774000772259, + "language_loss": 0.84661531, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86859256, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2576, + "time_per_iteration": 2.5052003860473633 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02414751, + "balance_loss_mlp": 1.04679108, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.816317520286285, + "language_loss": 0.81942815, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84135079, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2577, + "time_per_iteration": 2.5133895874023438 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.04980743, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.384736720709717, + "language_loss": 0.76260924, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78462768, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2578, + "time_per_iteration": 2.5140793323516846 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.02630556, + "balance_loss_mlp": 1.04832911, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.651100693067537, + "language_loss": 0.82420707, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84617954, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2579, + "time_per_iteration": 2.4410548210144043 + }, + { + "auxiliary_loss_clip": 0.01152359, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.03056741, + "balance_loss_mlp": 1.05137682, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6356270056083286, + "language_loss": 0.80460835, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2580, + "time_per_iteration": 2.457929849624634 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01014696, + "balance_loss_clip": 1.0128479, + "balance_loss_mlp": 1.01473403, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.84873853717235, + "language_loss": 0.58840239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60905427, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.35742188, + "step": 2581, + "time_per_iteration": 3.1725480556488037 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02615237, + "balance_loss_mlp": 1.04869819, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8637973548327127, + "language_loss": 0.85214508, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87412429, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2582, + "time_per_iteration": 2.486454963684082 + }, + { + "auxiliary_loss_clip": 0.01150766, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.04837251, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.457099081417407, + "language_loss": 0.78432047, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80638009, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0234375, + "step": 2583, + "time_per_iteration": 2.468686580657959 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.04853427, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6727812592242826, + "language_loss": 0.76121294, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78327382, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2584, + "time_per_iteration": 2.5471444129943848 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02746594, + "balance_loss_mlp": 1.04740906, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0194610159936324, + "language_loss": 0.75623107, + "learning_rate": 3.837251082205368e-06, + "loss": 0.7781868, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2585, + "time_per_iteration": 2.4448020458221436 + }, + { + "auxiliary_loss_clip": 0.01146182, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03101528, + "balance_loss_mlp": 1.04662418, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.233481730992117, + "language_loss": 0.611651, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63361114, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2586, + "time_per_iteration": 2.4375994205474854 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02814651, + "balance_loss_mlp": 1.04623449, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8194244944539537, + "language_loss": 0.8108865, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83286583, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.046875, + "step": 2587, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.04851258, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.8978014455674168, + "language_loss": 0.88844347, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91058075, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.0625, + "step": 2588, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01150101, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.03351235, + "balance_loss_mlp": 1.04859662, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.6765596364055266, + "language_loss": 0.64950025, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6715408, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.015625, + "step": 2589, + "time_per_iteration": 2.5106732845306396 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.0483036, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7614316666112095, + "language_loss": 0.82610166, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84805739, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2590, + "time_per_iteration": 2.519573211669922 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.02761662, + "balance_loss_mlp": 1.04740536, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1478399705358195, + "language_loss": 0.78919029, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81117558, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2591, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01051358, + "balance_loss_clip": 1.03271413, + "balance_loss_mlp": 1.04902434, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 1.9877262596002243, + "language_loss": 0.64780253, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66981632, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2592, + "time_per_iteration": 2.5992095470428467 + }, + { + "auxiliary_loss_clip": 0.01156577, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.03195322, + "balance_loss_mlp": 1.0518856, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.6077304694487062, + "language_loss": 0.81806099, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84015012, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2593, + "time_per_iteration": 2.4317471981048584 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02876306, + "balance_loss_mlp": 1.04862404, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.3131099691306445, + "language_loss": 0.72585857, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7478416, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0, + "step": 2594, + "time_per_iteration": 2.454946994781494 + }, + { + "auxiliary_loss_clip": 0.01145676, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.02514088, + "balance_loss_mlp": 1.0476191, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 1.980280068020953, + "language_loss": 0.8170377, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83893895, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 2595, + "time_per_iteration": 2.4859232902526855 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.04722846, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.3729637830877177, + "language_loss": 0.86587811, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88784146, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2596, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.02558839, + "balance_loss_mlp": 1.04831815, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6624104890405602, + "language_loss": 0.68610018, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70800316, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2597, + "time_per_iteration": 2.447265625 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.02210891, + "balance_loss_mlp": 1.04714298, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.638980754682227, + "language_loss": 0.79885375, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82070029, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2598, + "time_per_iteration": 2.4641571044921875 + }, + { + "auxiliary_loss_clip": 0.01141262, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02003777, + "balance_loss_mlp": 1.04484367, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.19687533686526, + "language_loss": 0.82877028, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85057342, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96484375, + "step": 2599, + "time_per_iteration": 2.419464111328125 + }, + { + "auxiliary_loss_clip": 0.01155461, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.04991198, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.412785735027946, + "language_loss": 0.81813747, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84021574, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2600, + "time_per_iteration": 2.408848524093628 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.02813435, + "balance_loss_mlp": 1.05145574, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8570517134994367, + "language_loss": 0.8869983, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90900552, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2601, + "time_per_iteration": 3.8960022926330566 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.04250216, + "balance_loss_mlp": 1.05294669, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6572791804428935, + "language_loss": 0.78657669, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80877781, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0234375, + "step": 2602, + "time_per_iteration": 5.330498456954956 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.02178836, + "balance_loss_mlp": 1.04872918, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.9481072701353659, + "language_loss": 0.73668396, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75858229, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.99609375, + "step": 2603, + "time_per_iteration": 2.4632985591888428 + }, + { + "auxiliary_loss_clip": 0.01152236, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.03205693, + "balance_loss_mlp": 1.05066442, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.4624008692922583, + "language_loss": 0.87223339, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89427507, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2604, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.02404523, + "balance_loss_mlp": 1.04892218, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.883819023069068, + "language_loss": 0.85465723, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87660539, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2605, + "time_per_iteration": 2.4958839416503906 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0274334, + "balance_loss_mlp": 1.04840827, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.4518366617864897, + "language_loss": 0.72954321, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75154853, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2606, + "time_per_iteration": 2.5142898559570312 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.05257165, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.9820673877795116, + "language_loss": 0.7643044, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78635812, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2607, + "time_per_iteration": 2.433779239654541 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0282656, + "balance_loss_mlp": 1.05097091, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7850270515341367, + "language_loss": 0.8191157, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8410849, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2608, + "time_per_iteration": 2.4599456787109375 + }, + { + "auxiliary_loss_clip": 0.0115477, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.05087662, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.762197880640894, + "language_loss": 0.72479111, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74684954, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0390625, + "step": 2609, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.02415729, + "balance_loss_mlp": 1.04881263, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8833233307981396, + "language_loss": 0.71974212, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74171209, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.046875, + "step": 2610, + "time_per_iteration": 2.468616247177124 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03119481, + "balance_loss_mlp": 1.04865789, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0486839750324117, + "language_loss": 0.72148776, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2611, + "time_per_iteration": 2.4812967777252197 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.05081797, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1526303920645153, + "language_loss": 0.70732605, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72930443, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2612, + "time_per_iteration": 2.4659905433654785 + }, + { + "auxiliary_loss_clip": 0.0115345, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.05112672, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.98698506128839, + "language_loss": 0.75649011, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77856034, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2613, + "time_per_iteration": 2.5053935050964355 + }, + { + "auxiliary_loss_clip": 0.01150247, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.03454411, + "balance_loss_mlp": 1.04870725, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7256548803860323, + "language_loss": 0.6593504, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68139917, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2614, + "time_per_iteration": 2.49568772315979 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01050381, + "balance_loss_clip": 1.02972233, + "balance_loss_mlp": 1.04979289, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1509467282749055, + "language_loss": 0.7554003, + "learning_rate": 3.832603126688072e-06, + "loss": 0.7774539, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0546875, + "step": 2615, + "time_per_iteration": 2.529383420944214 + }, + { + "auxiliary_loss_clip": 0.0115204, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.03374028, + "balance_loss_mlp": 1.05295634, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.616950748432624, + "language_loss": 0.72989607, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75194162, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9921875, + "step": 2616, + "time_per_iteration": 2.5096960067749023 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.03453839, + "balance_loss_mlp": 1.04991412, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.5663633553154774, + "language_loss": 0.72316766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74524403, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2617, + "time_per_iteration": 2.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01151577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.02453637, + "balance_loss_mlp": 1.05169988, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.0296559288157563, + "language_loss": 0.74336463, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76531827, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2618, + "time_per_iteration": 2.4584109783172607 + }, + { + "auxiliary_loss_clip": 0.01156356, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02976644, + "balance_loss_mlp": 1.05079079, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.116136233608656, + "language_loss": 0.78624105, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80832201, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0546875, + "step": 2619, + "time_per_iteration": 2.481902837753296 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.05213726, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.705564128099723, + "language_loss": 0.76632881, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78837597, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2620, + "time_per_iteration": 2.432645082473755 + }, + { + "auxiliary_loss_clip": 0.01153614, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.02901375, + "balance_loss_mlp": 1.05096626, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7942321132139696, + "language_loss": 0.70836174, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73039794, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2621, + "time_per_iteration": 2.5259244441986084 + }, + { + "auxiliary_loss_clip": 0.01156472, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.0277524, + "balance_loss_mlp": 1.05222857, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.5825564073202467, + "language_loss": 0.71880406, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74086076, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2622, + "time_per_iteration": 2.738351583480835 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02826762, + "balance_loss_mlp": 1.05162704, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7275011876813262, + "language_loss": 0.87603116, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89804244, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2623, + "time_per_iteration": 2.439276695251465 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.05301619, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7488793041913886, + "language_loss": 0.82132548, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84332693, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0078125, + "step": 2624, + "time_per_iteration": 2.5011823177337646 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.02720022, + "balance_loss_mlp": 1.0518285, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.213311097116894, + "language_loss": 0.79965818, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82170242, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2625, + "time_per_iteration": 2.469705581665039 + }, + { + "auxiliary_loss_clip": 0.01152837, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.03215635, + "balance_loss_mlp": 1.05189955, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0497226184185044, + "language_loss": 0.80393386, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82597172, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2626, + "time_per_iteration": 2.4822630882263184 + }, + { + "auxiliary_loss_clip": 0.01157567, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.02703679, + "balance_loss_mlp": 1.05660009, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8439314798963051, + "language_loss": 0.73819017, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76023501, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0078125, + "step": 2627, + "time_per_iteration": 2.5146384239196777 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01055765, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.05136025, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.581375347872909, + "language_loss": 0.84926289, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87135696, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0234375, + "step": 2628, + "time_per_iteration": 2.476461172103882 + }, + { + "auxiliary_loss_clip": 0.01152526, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02045107, + "balance_loss_mlp": 1.05181646, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9330212081502065, + "language_loss": 0.76414472, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2629, + "time_per_iteration": 2.4604575634002686 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03277516, + "balance_loss_mlp": 1.05376625, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.3335878107949624, + "language_loss": 0.73786485, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7599746, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0390625, + "step": 2630, + "time_per_iteration": 2.4556961059570312 + }, + { + "auxiliary_loss_clip": 0.01159154, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02934527, + "balance_loss_mlp": 1.05278432, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.0799062126580385, + "language_loss": 0.83732498, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85941184, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2631, + "time_per_iteration": 2.46466326713562 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 1.05072045, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8231521117013414, + "language_loss": 0.78509778, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80711424, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2632, + "time_per_iteration": 2.4678170680999756 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01058182, + "balance_loss_clip": 1.03766572, + "balance_loss_mlp": 1.05516291, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.1429957658458374, + "language_loss": 0.83250827, + "learning_rate": 3.829784322464594e-06, + "loss": 0.8546921, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2633, + "time_per_iteration": 2.4329495429992676 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.02641928, + "balance_loss_mlp": 1.05591452, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.9651575849984717, + "language_loss": 0.77401066, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79609084, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2634, + "time_per_iteration": 2.4989452362060547 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.05281138, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.178604932363088, + "language_loss": 0.89144027, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91352272, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0546875, + "step": 2635, + "time_per_iteration": 2.45926570892334 + }, + { + "auxiliary_loss_clip": 0.0115666, + "auxiliary_loss_mlp": 0.0105615, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.05145168, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.07071202721755, + "language_loss": 0.75814605, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78027415, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2636, + "time_per_iteration": 2.4601919651031494 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.05383635, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.192817266182781, + "language_loss": 0.72065628, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74272561, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.0625, + "step": 2637, + "time_per_iteration": 2.6509416103363037 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.05307317, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.9644709833035638, + "language_loss": 0.77938193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80135739, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2638, + "time_per_iteration": 2.516597032546997 + }, + { + "auxiliary_loss_clip": 0.01160159, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.05348861, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.8473853011869859, + "language_loss": 0.75521988, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77744359, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0625, + "step": 2639, + "time_per_iteration": 2.5517024993896484 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04115009, + "balance_loss_mlp": 1.0541048, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.7935559917311212, + "language_loss": 0.81487972, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83708692, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0546875, + "step": 2640, + "time_per_iteration": 2.5613112449645996 + }, + { + "auxiliary_loss_clip": 0.01152653, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.05107331, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4887809421561018, + "language_loss": 0.67051661, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69255233, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2641, + "time_per_iteration": 2.5603220462799072 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01057677, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.05338526, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.894977763056953, + "language_loss": 0.7508198, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77302957, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2642, + "time_per_iteration": 2.4783003330230713 + }, + { + "auxiliary_loss_clip": 0.01154514, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.0343703, + "balance_loss_mlp": 1.05342579, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1233146618452046, + "language_loss": 0.70096999, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72305882, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2643, + "time_per_iteration": 3.8417530059814453 + }, + { + "auxiliary_loss_clip": 0.0115474, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.05399418, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.266510625665779, + "language_loss": 0.78172421, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80374151, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2644, + "time_per_iteration": 3.918332099914551 + }, + { + "auxiliary_loss_clip": 0.01155626, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.03432608, + "balance_loss_mlp": 1.05189228, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8745538844001242, + "language_loss": 0.82203078, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84413457, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2645, + "time_per_iteration": 2.484264373779297 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01055562, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.05192447, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.974907168100252, + "language_loss": 0.69778836, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71991032, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2646, + "time_per_iteration": 2.5406665802001953 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.05206954, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5066454352116914, + "language_loss": 0.62659109, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64856541, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 2647, + "time_per_iteration": 2.442711353302002 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03226066, + "balance_loss_mlp": 1.05410099, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.1253745247586204, + "language_loss": 0.8942067, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91628385, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2648, + "time_per_iteration": 2.4649319648742676 + }, + { + "auxiliary_loss_clip": 0.01152722, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.05391204, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8651001097947648, + "language_loss": 0.91716385, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93918669, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 2649, + "time_per_iteration": 2.451261520385742 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.02802217, + "balance_loss_mlp": 1.05272281, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3226984417644028, + "language_loss": 0.71273595, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73485881, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1015625, + "step": 2650, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01153823, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.02695203, + "balance_loss_mlp": 1.05372715, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.4247432930640898, + "language_loss": 0.71116996, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73315561, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0, + "step": 2651, + "time_per_iteration": 2.467451572418213 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02912855, + "balance_loss_mlp": 1.0513978, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.909821572556346, + "language_loss": 0.7997523, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82179999, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2652, + "time_per_iteration": 2.519624948501587 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02665794, + "balance_loss_mlp": 1.05385149, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.695147262103697, + "language_loss": 0.70050812, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72250587, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2653, + "time_per_iteration": 2.439445972442627 + }, + { + "auxiliary_loss_clip": 0.01154814, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02309155, + "balance_loss_mlp": 1.05308652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.046273350718209, + "language_loss": 0.76509416, + "learning_rate": 3.826467306608095e-06, + "loss": 0.7870729, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2654, + "time_per_iteration": 2.529644012451172 + }, + { + "auxiliary_loss_clip": 0.01154147, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02750051, + "balance_loss_mlp": 1.0526185, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.961582700797155, + "language_loss": 0.8208828, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84289569, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2655, + "time_per_iteration": 2.4841158390045166 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.05125904, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.077546195878165, + "language_loss": 0.73565602, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75770259, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2656, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02766216, + "balance_loss_mlp": 1.05170095, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.884771930829773, + "language_loss": 0.77508467, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79704326, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2657, + "time_per_iteration": 2.801560401916504 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01048143, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.05459499, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6493844029380673, + "language_loss": 0.74807733, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77010089, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.99609375, + "step": 2658, + "time_per_iteration": 2.4434328079223633 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02912521, + "balance_loss_mlp": 1.05291355, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.8153435843839463, + "language_loss": 0.75194407, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77400887, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2659, + "time_per_iteration": 2.587700366973877 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.03295422, + "balance_loss_mlp": 1.05531979, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.4521775760186526, + "language_loss": 0.90417045, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92629218, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2660, + "time_per_iteration": 2.45237398147583 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.0300889, + "balance_loss_mlp": 1.05822825, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.0123178843036373, + "language_loss": 0.77552611, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79764044, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2661, + "time_per_iteration": 2.574652910232544 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_clip": 1.02913153, + "balance_loss_mlp": 1.05460262, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7348749157972516, + "language_loss": 0.74735796, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76943737, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2662, + "time_per_iteration": 2.506974935531616 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_clip": 1.03233898, + "balance_loss_mlp": 1.05416894, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.0770925688556074, + "language_loss": 0.82047677, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84257245, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2663, + "time_per_iteration": 2.459630012512207 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.02978826, + "balance_loss_mlp": 1.05576038, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5815812177362454, + "language_loss": 0.7910682, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81316602, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2664, + "time_per_iteration": 2.4978790283203125 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.8148985254226184, + "language_loss": 0.93767202, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95974499, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2665, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.02750635, + "balance_loss_mlp": 1.05352151, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.9534389472193405, + "language_loss": 0.85255575, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87460762, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.046875, + "step": 2666, + "time_per_iteration": 2.4229867458343506 + }, + { + "auxiliary_loss_clip": 0.01155877, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.02899504, + "balance_loss_mlp": 1.05404496, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.873987360542769, + "language_loss": 0.81461811, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83665401, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2667, + "time_per_iteration": 2.4989583492279053 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.03104627, + "balance_loss_mlp": 1.05707479, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.676276626789842, + "language_loss": 0.74079859, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76287973, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0078125, + "step": 2668, + "time_per_iteration": 2.463395357131958 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.05527806, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6382268793433732, + "language_loss": 0.77214229, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79424524, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2669, + "time_per_iteration": 2.5107781887054443 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01008943, + "balance_loss_clip": 1.0062964, + "balance_loss_mlp": 1.0249362, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072457077707946, + "language_loss": 0.55571371, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57640231, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.34960938, + "step": 2670, + "time_per_iteration": 2.964386463165283 + }, + { + "auxiliary_loss_clip": 0.01157188, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.05379438, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 8.31640977393562, + "language_loss": 0.77088535, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79289663, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2671, + "time_per_iteration": 2.4722845554351807 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.02684164, + "balance_loss_mlp": 1.05666459, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9636142117953166, + "language_loss": 0.64497644, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66702545, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2672, + "time_per_iteration": 2.5702145099639893 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.05270457, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.885579538712505, + "language_loss": 0.8533771, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87537158, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2673, + "time_per_iteration": 2.4754209518432617 + }, + { + "auxiliary_loss_clip": 0.01156938, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.03718424, + "balance_loss_mlp": 1.05537605, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.484212796080384, + "language_loss": 0.72797197, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75009739, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2674, + "time_per_iteration": 2.4771230220794678 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02745771, + "balance_loss_mlp": 1.05242229, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.0917218572710143, + "language_loss": 0.84550452, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86751789, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2675, + "time_per_iteration": 2.4583237171173096 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02890563, + "balance_loss_mlp": 1.0566349, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.979365293626276, + "language_loss": 0.82605797, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84813964, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0234375, + "step": 2676, + "time_per_iteration": 2.5966403484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.05701363, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.9372140801278581, + "language_loss": 0.73252106, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75459909, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2677, + "time_per_iteration": 2.459545135498047 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.05381799, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.4714871699848, + "language_loss": 0.76175338, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78375852, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2678, + "time_per_iteration": 2.6220550537109375 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.05157948, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.6939354956764687, + "language_loss": 0.70202518, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72405231, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2679, + "time_per_iteration": 2.580995559692383 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.02026391, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8161414687228778, + "language_loss": 0.51844025, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5392195, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.34375, + "step": 2680, + "time_per_iteration": 3.105682849884033 + }, + { + "auxiliary_loss_clip": 0.01155604, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02750874, + "balance_loss_mlp": 1.05157876, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.8335073832427007, + "language_loss": 0.80319828, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82523119, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2681, + "time_per_iteration": 2.4695565700531006 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01058971, + "balance_loss_clip": 1.04031444, + "balance_loss_mlp": 1.05258918, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8021457293712753, + "language_loss": 0.69142133, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71352148, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.984375, + "step": 2682, + "time_per_iteration": 2.5027854442596436 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.0559957, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8107912193408944, + "language_loss": 0.87568235, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89774084, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2683, + "time_per_iteration": 2.461944341659546 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.05452991, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.5824209574719035, + "language_loss": 0.74160969, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76372838, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2684, + "time_per_iteration": 4.005981206893921 + }, + { + "auxiliary_loss_clip": 0.01159701, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 1.05543995, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.919238603617177, + "language_loss": 0.70244128, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72452366, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2685, + "time_per_iteration": 5.387023448944092 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.0282284, + "balance_loss_mlp": 1.0518229, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8016019482814314, + "language_loss": 0.71518582, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73716336, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 2686, + "time_per_iteration": 2.5451064109802246 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.03191292, + "balance_loss_mlp": 1.05551481, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8631629169214377, + "language_loss": 0.81521869, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83730221, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2687, + "time_per_iteration": 2.4542620182037354 + }, + { + "auxiliary_loss_clip": 0.01155843, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.02327275, + "balance_loss_mlp": 1.04894984, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.8081463969498348, + "language_loss": 0.71823454, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74023592, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.0625, + "step": 2688, + "time_per_iteration": 2.493476152420044 + }, + { + "auxiliary_loss_clip": 0.0115191, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.05067098, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2392978206929555, + "language_loss": 0.76041406, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78239101, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.015625, + "step": 2689, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.05417943, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.9258973882551216, + "language_loss": 0.87260234, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89462292, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2690, + "time_per_iteration": 2.496943473815918 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05211663, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.562024048541713, + "language_loss": 0.87728393, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.89927632, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 2691, + "time_per_iteration": 2.510960817337036 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.02764988, + "balance_loss_mlp": 1.05021381, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.082856606872889, + "language_loss": 0.82327259, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84533525, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2692, + "time_per_iteration": 2.481032371520996 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02835155, + "balance_loss_mlp": 1.05069244, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.0763505181853454, + "language_loss": 0.80942917, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83149081, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2693, + "time_per_iteration": 2.493278980255127 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01054953, + "balance_loss_clip": 1.03670192, + "balance_loss_mlp": 1.05223358, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.7139740211881158, + "language_loss": 0.83639967, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85845578, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2694, + "time_per_iteration": 2.5051510334014893 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.0509156, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9608549080280004, + "language_loss": 0.69125426, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71329916, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0546875, + "step": 2695, + "time_per_iteration": 2.495098352432251 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01054922, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.05520689, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.466913217352614, + "language_loss": 0.82403111, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84617984, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2696, + "time_per_iteration": 2.484523296356201 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.05316591, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.9982919021229957, + "language_loss": 0.8852337, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90741605, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2697, + "time_per_iteration": 2.4806151390075684 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.01046149, + "balance_loss_clip": 1.02756453, + "balance_loss_mlp": 1.04989469, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4702975792509376, + "language_loss": 0.80172735, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82366014, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 2698, + "time_per_iteration": 2.532137393951416 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.02957439, + "balance_loss_mlp": 1.05167758, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5576448961090323, + "language_loss": 0.77258182, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79456997, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 2699, + "time_per_iteration": 2.514084577560425 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.0251497, + "balance_loss_mlp": 1.04891944, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.038036982956784, + "language_loss": 0.85697722, + "learning_rate": 3.81909481076994e-06, + "loss": 0.87891692, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2700, + "time_per_iteration": 2.4434289932250977 + }, + { + "auxiliary_loss_clip": 0.01147712, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.0247376, + "balance_loss_mlp": 1.04878318, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.6982179557795123, + "language_loss": 0.80378878, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82572436, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.98828125, + "step": 2701, + "time_per_iteration": 2.5267322063446045 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.02945244, + "balance_loss_mlp": 1.05514598, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.5999982166608073, + "language_loss": 0.73006868, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75212055, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2702, + "time_per_iteration": 2.44750714302063 + }, + { + "auxiliary_loss_clip": 0.01153204, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02986753, + "balance_loss_mlp": 1.05053687, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.5386207662450464, + "language_loss": 0.73164749, + "learning_rate": 3.81860891934076e-06, + "loss": 0.7536869, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0234375, + "step": 2703, + "time_per_iteration": 2.469242811203003 + }, + { + "auxiliary_loss_clip": 0.01150736, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.04765964, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9216464968932823, + "language_loss": 0.70681584, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72879231, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2704, + "time_per_iteration": 2.5236263275146484 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00091982, + "balance_loss_mlp": 1.01563144, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7797469934396678, + "language_loss": 0.53369009, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55422795, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.34765625, + "step": 2705, + "time_per_iteration": 3.0887868404388428 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.02873373, + "balance_loss_mlp": 1.05151534, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4525976943058896, + "language_loss": 0.75060308, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77264655, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2706, + "time_per_iteration": 2.439283847808838 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 1.05240536, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9153778871117788, + "language_loss": 0.7234174, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74547994, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2707, + "time_per_iteration": 2.51819109916687 + }, + { + "auxiliary_loss_clip": 0.01155215, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.05275822, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.869992791268662, + "language_loss": 0.83790398, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85995972, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2708, + "time_per_iteration": 2.4592010974884033 + }, + { + "auxiliary_loss_clip": 0.0115992, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.02768469, + "balance_loss_mlp": 1.05268705, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.162290718142945, + "language_loss": 0.86529553, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88738573, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2709, + "time_per_iteration": 2.4745054244995117 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01054439, + "balance_loss_clip": 1.0353297, + "balance_loss_mlp": 1.05096519, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.6782807127870958, + "language_loss": 0.91449893, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93659306, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2710, + "time_per_iteration": 2.4846651554107666 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.05447197, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 1.99410407833921, + "language_loss": 0.8129673, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83507168, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2711, + "time_per_iteration": 2.4878618717193604 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04737568, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7794575527068077, + "language_loss": 0.81605875, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83806038, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2712, + "time_per_iteration": 2.4479072093963623 + }, + { + "auxiliary_loss_clip": 0.01158025, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.03858864, + "balance_loss_mlp": 1.05211174, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.1959953506899774, + "language_loss": 0.76885653, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79102206, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2713, + "time_per_iteration": 2.493394374847412 + }, + { + "auxiliary_loss_clip": 0.01155185, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.05623782, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.081844956712308, + "language_loss": 0.78926778, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8114453, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 2714, + "time_per_iteration": 2.442214012145996 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01059199, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.05286288, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.259619309439112, + "language_loss": 0.78143466, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80357969, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2715, + "time_per_iteration": 2.499178409576416 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.04868412, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.0916631483814783, + "language_loss": 0.81397748, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8359617, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2716, + "time_per_iteration": 2.5004689693450928 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01057354, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.05482328, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8793848003912939, + "language_loss": 0.86203027, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88418794, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2717, + "time_per_iteration": 2.5112617015838623 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.05153894, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.652261986612604, + "language_loss": 0.76514149, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78711915, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2718, + "time_per_iteration": 2.549654245376587 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02696729, + "balance_loss_mlp": 1.05180717, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.080955072975882, + "language_loss": 0.73027492, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75229508, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2719, + "time_per_iteration": 2.4911599159240723 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.03084862, + "balance_loss_mlp": 1.0492239, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6610037254914274, + "language_loss": 0.72384167, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74585563, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2720, + "time_per_iteration": 2.4733760356903076 + }, + { + "auxiliary_loss_clip": 0.01150132, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.02789283, + "balance_loss_mlp": 1.05076206, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2797021453727893, + "language_loss": 0.75100243, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77298641, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9921875, + "step": 2721, + "time_per_iteration": 2.44942569732666 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02893853, + "balance_loss_mlp": 1.0502317, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.74959220753002, + "language_loss": 0.79254043, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81458461, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2722, + "time_per_iteration": 2.4775915145874023 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.05248678, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.0539311275727634, + "language_loss": 0.8477816, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86986339, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0625, + "step": 2723, + "time_per_iteration": 2.5084922313690186 + }, + { + "auxiliary_loss_clip": 0.01146914, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.0177772, + "balance_loss_mlp": 1.04940808, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0049787201865503, + "language_loss": 0.70883536, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73067659, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 2724, + "time_per_iteration": 2.5094263553619385 + }, + { + "auxiliary_loss_clip": 0.01150034, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.05113125, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.04326868324577, + "language_loss": 0.70914948, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73109186, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 2725, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02437937, + "balance_loss_mlp": 1.05219352, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9381311422505, + "language_loss": 0.8873682, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90929163, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2726, + "time_per_iteration": 3.983738660812378 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.05406547, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.8502717081228044, + "language_loss": 0.7439661, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76602715, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2727, + "time_per_iteration": 5.52494215965271 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.03429413, + "balance_loss_mlp": 1.05145037, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6814144838265654, + "language_loss": 0.82321334, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84523886, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9921875, + "step": 2728, + "time_per_iteration": 2.4621498584747314 + }, + { + "auxiliary_loss_clip": 0.01156146, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03131044, + "balance_loss_mlp": 1.05167341, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.4458707176630425, + "language_loss": 0.84766865, + "learning_rate": 3.814371879489633e-06, + "loss": 0.86973941, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0390625, + "step": 2729, + "time_per_iteration": 2.459495782852173 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.02661061, + "balance_loss_mlp": 1.04923487, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9327126112676087, + "language_loss": 0.72569054, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74764991, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2730, + "time_per_iteration": 2.451016902923584 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.03243709, + "balance_loss_mlp": 1.04862678, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.2141787283307854, + "language_loss": 0.74431163, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76637596, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.046875, + "step": 2731, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.0115844, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.02818894, + "balance_loss_mlp": 1.05408466, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.15833206643789, + "language_loss": 0.78783584, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.80990839, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2732, + "time_per_iteration": 2.44146728515625 + }, + { + "auxiliary_loss_clip": 0.01155842, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.05211556, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9937390498547816, + "language_loss": 0.68943298, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71150857, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0390625, + "step": 2733, + "time_per_iteration": 2.4981601238250732 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.02792621, + "balance_loss_mlp": 1.05054927, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.20018793155086, + "language_loss": 0.80626202, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8282572, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0078125, + "step": 2734, + "time_per_iteration": 2.495030641555786 + }, + { + "auxiliary_loss_clip": 0.01152713, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.03638041, + "balance_loss_mlp": 1.05143905, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 4.0691467716051175, + "language_loss": 0.82265377, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84474081, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2735, + "time_per_iteration": 2.5911896228790283 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.02556753, + "balance_loss_mlp": 1.05158913, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.5735103485950077, + "language_loss": 0.78697491, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80891526, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.984375, + "step": 2736, + "time_per_iteration": 2.4699559211730957 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.05231023, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.680513335410081, + "language_loss": 0.81409019, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83616614, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2737, + "time_per_iteration": 2.4892401695251465 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.02819467, + "balance_loss_mlp": 1.05107307, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8393773079816103, + "language_loss": 0.87291563, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89492232, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2738, + "time_per_iteration": 2.54569935798645 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.05139303, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.0122721864238438, + "language_loss": 0.72351867, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74562055, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2739, + "time_per_iteration": 2.5309460163116455 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02991343, + "balance_loss_mlp": 1.04766631, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.690107638621115, + "language_loss": 0.81735384, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8393271, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2740, + "time_per_iteration": 2.5005404949188232 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01053239, + "balance_loss_clip": 1.03176928, + "balance_loss_mlp": 1.05347896, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.8033984026588756, + "language_loss": 0.69098473, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0390625, + "step": 2741, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02588463, + "balance_loss_mlp": 1.04987025, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 2.1078448839323167, + "language_loss": 0.79967189, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2742, + "time_per_iteration": 2.4471442699432373 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.03373659, + "balance_loss_mlp": 1.05117011, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.1468697804747823, + "language_loss": 0.84769481, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86974156, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0, + "step": 2743, + "time_per_iteration": 2.459146022796631 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.05074859, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5853616537097488, + "language_loss": 0.85723281, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87925285, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 2744, + "time_per_iteration": 2.4920642375946045 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.0283947, + "balance_loss_mlp": 1.05124998, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7300129139105382, + "language_loss": 0.82973897, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85167319, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 2745, + "time_per_iteration": 2.490399122238159 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.05477679, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.19754759855213, + "language_loss": 0.76411253, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78622997, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2746, + "time_per_iteration": 2.46258282661438 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01052583, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.05164099, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5978428663850568, + "language_loss": 0.80686736, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2747, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01158238, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.02848577, + "balance_loss_mlp": 1.05559731, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.853069559467639, + "language_loss": 0.69463658, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71670008, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0234375, + "step": 2748, + "time_per_iteration": 2.4235999584198 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.03314471, + "balance_loss_mlp": 1.05482006, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.265414403061137, + "language_loss": 0.87653661, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89860809, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0078125, + "step": 2749, + "time_per_iteration": 2.4706709384918213 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.0509429, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.3451981357461444, + "language_loss": 0.79248077, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81450188, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2750, + "time_per_iteration": 2.4588990211486816 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.05188382, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7653411133265118, + "language_loss": 0.95010567, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9720822, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.99609375, + "step": 2751, + "time_per_iteration": 2.4776439666748047 + }, + { + "auxiliary_loss_clip": 0.01152135, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.02762985, + "balance_loss_mlp": 1.05480134, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9833662518999209, + "language_loss": 0.71080822, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73278749, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 2752, + "time_per_iteration": 2.4609227180480957 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 1.01785779, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7698122762266473, + "language_loss": 0.54079807, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56152999, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.33984375, + "step": 2753, + "time_per_iteration": 3.161339282989502 + }, + { + "auxiliary_loss_clip": 0.01152964, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.05254793, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.9686645345026932, + "language_loss": 0.75467873, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77662838, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2754, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01060834, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.05358946, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 3.81944507319113, + "language_loss": 0.87154973, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89376527, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0703125, + "step": 2755, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01148695, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.04862666, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.859731734913831, + "language_loss": 0.73258269, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7545948, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2756, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.02999544, + "balance_loss_mlp": 1.05331099, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6628427585054586, + "language_loss": 0.74967468, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77166092, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9609375, + "step": 2757, + "time_per_iteration": 2.5122530460357666 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01044644, + "balance_loss_clip": 1.02590466, + "balance_loss_mlp": 1.05359447, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 2.101183789218018, + "language_loss": 0.84532511, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86731303, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2758, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.01153935, + "auxiliary_loss_mlp": 0.01051485, + "balance_loss_clip": 1.03382993, + "balance_loss_mlp": 1.05355358, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 3.016772390052645, + "language_loss": 0.79003322, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81208748, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 1.0, + "step": 2759, + "time_per_iteration": 2.468798875808716 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.02953088, + "balance_loss_mlp": 1.05121255, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 4.81235802271706, + "language_loss": 0.75059134, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77259254, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2760, + "time_per_iteration": 2.459453582763672 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02139056, + "balance_loss_mlp": 1.05363011, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.843496656605, + "language_loss": 0.73409051, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75607204, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2761, + "time_per_iteration": 2.473264455795288 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.05460942, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.076392836835936, + "language_loss": 0.89255953, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91456699, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2762, + "time_per_iteration": 2.4917852878570557 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.0517025, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6634533311047424, + "language_loss": 0.87782222, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.89988291, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2763, + "time_per_iteration": 2.48002028465271 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01013596, + "balance_loss_clip": 1.01105642, + "balance_loss_mlp": 1.01786494, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7771287992078079, + "language_loss": 0.59777391, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61842799, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2764, + "time_per_iteration": 3.0722031593322754 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.03183234, + "balance_loss_mlp": 1.05292118, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8564974944455146, + "language_loss": 0.82349414, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8455686, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.015625, + "step": 2765, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.02758563, + "balance_loss_mlp": 1.05308914, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.1954568630881566, + "language_loss": 0.70029616, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72239733, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.078125, + "step": 2766, + "time_per_iteration": 2.417538642883301 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.05449462, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3642497854018174, + "language_loss": 0.88693011, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90891409, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2767, + "time_per_iteration": 2.447087287902832 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.0051651, + "balance_loss_mlp": 1.01474071, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.659533193053428, + "language_loss": 0.52894622, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54950953, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.33984375, + "step": 2768, + "time_per_iteration": 4.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01156575, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.03196931, + "balance_loss_mlp": 1.05233693, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4421243199538543, + "language_loss": 0.84964579, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87173045, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2769, + "time_per_iteration": 3.9888546466827393 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.00486565, + "balance_loss_mlp": 1.01284146, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.809970645404753, + "language_loss": 0.57417655, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59471762, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2770, + "time_per_iteration": 2.909212350845337 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01004174, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.0120976, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8642108743281017, + "language_loss": 0.5621168, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58261615, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.3359375, + "step": 2771, + "time_per_iteration": 2.9000375270843506 + }, + { + "auxiliary_loss_clip": 0.01152287, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.03741515, + "balance_loss_mlp": 1.05137527, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.4271023422086593, + "language_loss": 0.70461071, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72671425, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0078125, + "step": 2772, + "time_per_iteration": 2.45868182182312 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01052488, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.04914951, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.8764675289735346, + "language_loss": 0.86201918, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8840462, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2773, + "time_per_iteration": 2.513784885406494 + }, + { + "auxiliary_loss_clip": 0.01152492, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.02808821, + "balance_loss_mlp": 1.05230188, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.2216439453760595, + "language_loss": 0.81859678, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84058398, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2774, + "time_per_iteration": 2.4288830757141113 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.02955508, + "balance_loss_mlp": 1.05290627, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1125697386324576, + "language_loss": 0.83287829, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85492939, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0234375, + "step": 2775, + "time_per_iteration": 2.4773504734039307 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.03599668, + "balance_loss_mlp": 1.0527029, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9011936520028738, + "language_loss": 0.80721045, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82925946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 2776, + "time_per_iteration": 2.4736995697021484 + }, + { + "auxiliary_loss_clip": 0.01147621, + "auxiliary_loss_mlp": 0.01053383, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.05260348, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.7922512358148395, + "language_loss": 0.798361, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82037103, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.953125, + "step": 2777, + "time_per_iteration": 2.4625258445739746 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.0313735, + "balance_loss_mlp": 1.05002642, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.8218923631286437, + "language_loss": 0.85132945, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87332618, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 2778, + "time_per_iteration": 2.4819412231445312 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.05222583, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.6489491047564826, + "language_loss": 0.74133682, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76333386, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2779, + "time_per_iteration": 2.510207176208496 + }, + { + "auxiliary_loss_clip": 0.0115174, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.02625358, + "balance_loss_mlp": 1.05116367, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.2761441742273663, + "language_loss": 0.65382051, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67579395, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2780, + "time_per_iteration": 2.5250439643859863 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.02856088, + "balance_loss_mlp": 1.05120933, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0602280440022382, + "language_loss": 0.78563058, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80761701, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9921875, + "step": 2781, + "time_per_iteration": 2.4921979904174805 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.05227423, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.480266857331911, + "language_loss": 0.75262564, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77465487, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2782, + "time_per_iteration": 2.468590021133423 + }, + { + "auxiliary_loss_clip": 0.01159372, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.05443954, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.999958464394936, + "language_loss": 0.67841566, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70053571, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2783, + "time_per_iteration": 2.5312225818634033 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.0538497, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.209785525271013, + "language_loss": 0.70028126, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72232759, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2784, + "time_per_iteration": 2.4932820796966553 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.05120277, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9638597335511054, + "language_loss": 0.60441053, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62647516, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2785, + "time_per_iteration": 2.527010440826416 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.02681625, + "balance_loss_mlp": 1.01595187, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8343482124814343, + "language_loss": 0.588, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60878569, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.33007812, + "step": 2786, + "time_per_iteration": 3.1062281131744385 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.05108333, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.9494651562196093, + "language_loss": 0.75846571, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78044844, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2787, + "time_per_iteration": 2.51383900642395 + }, + { + "auxiliary_loss_clip": 0.0115361, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.02287841, + "balance_loss_mlp": 1.05218899, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.088538847955111, + "language_loss": 0.77615869, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79811174, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2788, + "time_per_iteration": 2.4926373958587646 + }, + { + "auxiliary_loss_clip": 0.01048965, + "auxiliary_loss_mlp": 0.01004104, + "balance_loss_clip": 1.00154078, + "balance_loss_mlp": 1.01582766, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.861309286667726, + "language_loss": 0.59360403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61413473, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.33203125, + "step": 2789, + "time_per_iteration": 2.9390883445739746 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.03262937, + "balance_loss_mlp": 1.05115533, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.8582032581880512, + "language_loss": 0.70117038, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72323185, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2790, + "time_per_iteration": 2.6337287425994873 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.03852975, + "balance_loss_mlp": 1.05254579, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9267324208283758, + "language_loss": 0.7914235, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81353921, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0234375, + "step": 2791, + "time_per_iteration": 2.4992258548736572 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.02807093, + "balance_loss_mlp": 1.05311096, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.670563786806713, + "language_loss": 0.71465087, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73666936, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2792, + "time_per_iteration": 2.5886104106903076 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.05179656, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.423044729867527, + "language_loss": 0.72166264, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74366981, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2793, + "time_per_iteration": 2.5197043418884277 + }, + { + "auxiliary_loss_clip": 0.01153184, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.05135465, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.9474647186442988, + "language_loss": 0.77305138, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79512912, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2794, + "time_per_iteration": 2.467292547225952 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.05253601, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2742759048834578, + "language_loss": 0.71613103, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7382195, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2795, + "time_per_iteration": 2.5272278785705566 + }, + { + "auxiliary_loss_clip": 0.01149377, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.02592218, + "balance_loss_mlp": 1.04932868, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.942494339721957, + "language_loss": 0.83784455, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8597846, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2796, + "time_per_iteration": 2.448528289794922 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01059215, + "balance_loss_clip": 1.03911614, + "balance_loss_mlp": 1.04904127, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6778887705488965, + "language_loss": 0.8109591, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83307993, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2797, + "time_per_iteration": 2.5044567584991455 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02766752, + "balance_loss_mlp": 1.05142093, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4189820060365406, + "language_loss": 0.74740726, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76932257, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.95703125, + "step": 2798, + "time_per_iteration": 2.4913666248321533 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03250098, + "balance_loss_mlp": 1.05462337, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.8962576537558784, + "language_loss": 0.79592311, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81796914, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 1.0, + "step": 2799, + "time_per_iteration": 2.4844021797180176 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.02597189, + "balance_loss_mlp": 1.04983997, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.7819182919151455, + "language_loss": 0.70778632, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72978926, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2800, + "time_per_iteration": 2.548715829849243 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02365637, + "balance_loss_mlp": 1.04882574, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9135359518782422, + "language_loss": 0.83549178, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85741478, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2801, + "time_per_iteration": 2.456601858139038 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.03355145, + "balance_loss_mlp": 1.04947591, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.757874152621573, + "language_loss": 0.822721, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84474415, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2802, + "time_per_iteration": 2.4426534175872803 + }, + { + "auxiliary_loss_clip": 0.01153107, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02764344, + "balance_loss_mlp": 1.05123353, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4855905624355255, + "language_loss": 0.81064272, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83265072, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2803, + "time_per_iteration": 2.5615930557250977 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02693641, + "balance_loss_mlp": 1.05246449, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2799183114600545, + "language_loss": 0.7645762, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78653532, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 2804, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01045818, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.02452028, + "balance_loss_mlp": 1.01328063, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8620881286764229, + "language_loss": 0.55414748, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57487267, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 2805, + "time_per_iteration": 3.033358573913574 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.02161169, + "balance_loss_mlp": 1.04741919, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9122963285347783, + "language_loss": 0.73038024, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75221276, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 2806, + "time_per_iteration": 2.4699463844299316 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.02712786, + "balance_loss_mlp": 1.05072176, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.9407491705316076, + "language_loss": 0.69966477, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7216025, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2807, + "time_per_iteration": 2.4583139419555664 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.03196526, + "balance_loss_mlp": 1.05013919, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.8028706291815912, + "language_loss": 0.70265883, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72467327, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9921875, + "step": 2808, + "time_per_iteration": 2.4724719524383545 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02792549, + "balance_loss_mlp": 1.05130935, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.1293629398657954, + "language_loss": 0.80103064, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8230511, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2809, + "time_per_iteration": 3.844451427459717 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.050385, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0909159229075245, + "language_loss": 0.88465077, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.9067235, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2810, + "time_per_iteration": 5.43256688117981 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.05188894, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.324870160833927, + "language_loss": 0.92483926, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94690794, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2811, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.03154814, + "balance_loss_mlp": 1.05537057, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 20.150047321728213, + "language_loss": 0.78719699, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80926931, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2812, + "time_per_iteration": 2.475893974304199 + }, + { + "auxiliary_loss_clip": 0.01154531, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.0353322, + "balance_loss_mlp": 1.05427527, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.3708558754635103, + "language_loss": 0.7492249, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7713027, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.0078125, + "step": 2813, + "time_per_iteration": 2.4622457027435303 + }, + { + "auxiliary_loss_clip": 0.01155154, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02862835, + "balance_loss_mlp": 1.05231524, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.6643465032783955, + "language_loss": 0.69000697, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71203601, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2814, + "time_per_iteration": 2.442352771759033 + }, + { + "auxiliary_loss_clip": 0.01152587, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.03032494, + "balance_loss_mlp": 1.05269694, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.3683342322522543, + "language_loss": 0.61842358, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64043844, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2815, + "time_per_iteration": 2.4859516620635986 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03179121, + "balance_loss_mlp": 1.05104065, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9947957584318596, + "language_loss": 0.81983805, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84183884, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 2816, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.03072321, + "balance_loss_mlp": 1.05379295, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.502019531770294, + "language_loss": 0.8722589, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89431584, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2817, + "time_per_iteration": 2.4906835556030273 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.03200889, + "balance_loss_mlp": 1.05302715, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7787508021643152, + "language_loss": 0.81666476, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2818, + "time_per_iteration": 2.4673428535461426 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.05154157, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 5.791836374282792, + "language_loss": 0.80712807, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8291707, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0, + "step": 2819, + "time_per_iteration": 2.43947434425354 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.00504076, + "balance_loss_mlp": 1.01552486, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9491282523447765, + "language_loss": 0.61080176, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63136268, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 2820, + "time_per_iteration": 3.008953809738159 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.031335, + "balance_loss_mlp": 1.05163527, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 2.1013484538112097, + "language_loss": 0.78625357, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.808281, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2821, + "time_per_iteration": 2.5363481044769287 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.03667343, + "balance_loss_mlp": 1.05229986, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.87583667245789, + "language_loss": 0.78450388, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80659759, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0078125, + "step": 2822, + "time_per_iteration": 2.4969065189361572 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03299582, + "balance_loss_mlp": 1.04956698, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9220487825624015, + "language_loss": 0.75016022, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77214515, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2823, + "time_per_iteration": 2.491588830947876 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.05209637, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9648811068121905, + "language_loss": 0.60514438, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62718117, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.99609375, + "step": 2824, + "time_per_iteration": 2.6178910732269287 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02030015, + "balance_loss_mlp": 1.05367076, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6856049786717988, + "language_loss": 0.73004806, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75196874, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98046875, + "step": 2825, + "time_per_iteration": 2.559774398803711 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.03321934, + "balance_loss_mlp": 1.0505774, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.7849035157466668, + "language_loss": 0.85660541, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87870789, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2826, + "time_per_iteration": 2.4860360622406006 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.0515151, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3205594057943175, + "language_loss": 0.8232255, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84528267, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2827, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.02965498, + "balance_loss_mlp": 1.05059743, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 2.393760877815214, + "language_loss": 0.73652613, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75855708, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2828, + "time_per_iteration": 2.5726237297058105 + }, + { + "auxiliary_loss_clip": 0.01046718, + "auxiliary_loss_mlp": 0.01008554, + "balance_loss_clip": 1.00625372, + "balance_loss_mlp": 1.01360035, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.76062911359866, + "language_loss": 0.56446254, + "learning_rate": 3.797643101661336e-06, + "loss": 0.5850153, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.33203125, + "step": 2829, + "time_per_iteration": 3.1035284996032715 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.02912867, + "balance_loss_mlp": 1.04916263, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7229604876305038, + "language_loss": 0.83673382, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85870743, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.99609375, + "step": 2830, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.02792013, + "balance_loss_mlp": 1.04919207, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.0065309441313337, + "language_loss": 0.77852297, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80051666, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.015625, + "step": 2831, + "time_per_iteration": 2.524578094482422 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.04948521, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1211873867699285, + "language_loss": 0.79345167, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81548154, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0234375, + "step": 2832, + "time_per_iteration": 2.459954261779785 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.02836847, + "balance_loss_mlp": 1.05050385, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.9382017652854369, + "language_loss": 0.89026237, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91225392, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2833, + "time_per_iteration": 2.4812114238739014 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02963924, + "balance_loss_mlp": 1.05124569, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.853060698790674, + "language_loss": 0.72425497, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74627328, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2834, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01156378, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.05294132, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9954265429463485, + "language_loss": 0.86434042, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88648909, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2835, + "time_per_iteration": 2.4804999828338623 + }, + { + "auxiliary_loss_clip": 0.01155592, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.03007674, + "balance_loss_mlp": 1.05081642, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1.9180646463430515, + "language_loss": 0.73242748, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.046875, + "step": 2836, + "time_per_iteration": 2.4694178104400635 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02599072, + "balance_loss_mlp": 1.05033076, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1253309510576717, + "language_loss": 0.79653537, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81858897, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0703125, + "step": 2837, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01150588, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.02550185, + "balance_loss_mlp": 1.05232143, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.19906443062711, + "language_loss": 0.83575213, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85771078, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 2838, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.05069315, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7423496230624245, + "language_loss": 0.93620354, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95814586, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2839, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.0277859, + "balance_loss_mlp": 1.05050242, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.8052720148780894, + "language_loss": 0.83847374, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86050916, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.0390625, + "step": 2840, + "time_per_iteration": 2.5449130535125732 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01047778, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.05213881, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.014300966058614, + "language_loss": 0.76390004, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78593302, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.03125, + "step": 2841, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03243482, + "balance_loss_mlp": 1.04932261, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8874127741110907, + "language_loss": 0.77000463, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79205, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2842, + "time_per_iteration": 2.5051841735839844 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 1.0497905, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.05566421297988, + "language_loss": 0.86086738, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88281423, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98828125, + "step": 2843, + "time_per_iteration": 2.4487509727478027 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02696228, + "balance_loss_mlp": 1.05090249, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.8875494657309706, + "language_loss": 0.6826812, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70464289, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 2844, + "time_per_iteration": 2.4429779052734375 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.05040824, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.8058232236820264, + "language_loss": 0.78258789, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80463862, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0, + "step": 2845, + "time_per_iteration": 2.4377951622009277 + }, + { + "auxiliary_loss_clip": 0.01151786, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.05064154, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.746386155528142, + "language_loss": 0.77959955, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 2846, + "time_per_iteration": 2.4196622371673584 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.02895534, + "balance_loss_mlp": 1.05158973, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7441395807388675, + "language_loss": 0.7942031, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81620383, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2847, + "time_per_iteration": 2.504087448120117 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.02913523, + "balance_loss_mlp": 1.04612017, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.239997254259111, + "language_loss": 0.86818451, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89015555, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2848, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.02438748, + "balance_loss_mlp": 1.05133212, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.715396677859901, + "language_loss": 0.75223613, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77421153, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2849, + "time_per_iteration": 2.4918415546417236 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.00107098, + "balance_loss_mlp": 1.01492834, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7949737728021388, + "language_loss": 0.57471085, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59522074, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.328125, + "step": 2850, + "time_per_iteration": 3.057778835296631 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.04852295, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4364727127987704, + "language_loss": 0.80988616, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83187693, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 2851, + "time_per_iteration": 3.887600898742676 + }, + { + "auxiliary_loss_clip": 0.01146778, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.04858351, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.035620688428962, + "language_loss": 0.93063158, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95253623, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2852, + "time_per_iteration": 3.920153856277466 + }, + { + "auxiliary_loss_clip": 0.01149404, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.04728949, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8406206656402175, + "language_loss": 0.69480836, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71683311, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2853, + "time_per_iteration": 2.4457037448883057 + }, + { + "auxiliary_loss_clip": 0.0115316, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03150904, + "balance_loss_mlp": 1.05059445, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.187977199847503, + "language_loss": 0.66505128, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68709248, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0234375, + "step": 2854, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01144359, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.04574227, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8257227486643586, + "language_loss": 0.89394444, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91582847, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2855, + "time_per_iteration": 2.4601552486419678 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.04792452, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.515892939250119, + "language_loss": 0.83822739, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86022681, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2856, + "time_per_iteration": 2.4747347831726074 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05112195, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9053156238546485, + "language_loss": 0.8645792, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88658297, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2857, + "time_per_iteration": 2.4460220336914062 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0105234, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.04805577, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.139076633770832, + "language_loss": 0.77919662, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80120051, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2858, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01058687, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.04760742, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.423579883765011, + "language_loss": 0.77235049, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0859375, + "step": 2859, + "time_per_iteration": 2.43471360206604 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.04920983, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 3.774880148287903, + "language_loss": 0.77179611, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79378301, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2860, + "time_per_iteration": 2.463344097137451 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.04703689, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1505291491255463, + "language_loss": 0.81964719, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84165227, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2861, + "time_per_iteration": 2.4505395889282227 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.03163123, + "balance_loss_mlp": 1.04897118, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 4.22955926449596, + "language_loss": 0.85649675, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87849623, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2862, + "time_per_iteration": 2.4392077922821045 + }, + { + "auxiliary_loss_clip": 0.01144423, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.02867651, + "balance_loss_mlp": 1.04785109, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.3146804122881037, + "language_loss": 0.77874523, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80065054, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 2863, + "time_per_iteration": 2.4745166301727295 + }, + { + "auxiliary_loss_clip": 0.01147347, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02935052, + "balance_loss_mlp": 1.04726493, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7012031973405044, + "language_loss": 0.72191179, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74386668, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2864, + "time_per_iteration": 2.496522903442383 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.04935968, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6688219876641972, + "language_loss": 0.72896975, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75101948, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2865, + "time_per_iteration": 2.468726396560669 + }, + { + "auxiliary_loss_clip": 0.01151587, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02608538, + "balance_loss_mlp": 1.05194211, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.1747822479918764, + "language_loss": 0.79011786, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81208247, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2866, + "time_per_iteration": 2.445716381072998 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.04966402, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.797659045411876, + "language_loss": 0.79865277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2867, + "time_per_iteration": 2.4745590686798096 + }, + { + "auxiliary_loss_clip": 0.0114836, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.04821014, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.717941409951427, + "language_loss": 0.79707634, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81893444, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2868, + "time_per_iteration": 2.4545693397521973 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.0538218, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9332967921770021, + "language_loss": 0.84265673, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86467719, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2869, + "time_per_iteration": 2.445429563522339 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.03226328, + "balance_loss_mlp": 1.04971075, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.3539211413688954, + "language_loss": 0.77522051, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79725653, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2870, + "time_per_iteration": 2.4975087642669678 + }, + { + "auxiliary_loss_clip": 0.01146931, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.02609706, + "balance_loss_mlp": 1.05132568, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.897031493968697, + "language_loss": 0.7680704, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.78997254, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.95703125, + "step": 2871, + "time_per_iteration": 2.4777348041534424 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02442563, + "balance_loss_mlp": 1.05061746, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.240934958328371, + "language_loss": 0.74448204, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76642466, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2872, + "time_per_iteration": 2.5021097660064697 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.05127549, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8155923086100165, + "language_loss": 0.82694656, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84881938, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 2873, + "time_per_iteration": 2.4852540493011475 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0229888, + "balance_loss_mlp": 1.049196, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0464410919173814, + "language_loss": 0.75083232, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77274048, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.984375, + "step": 2874, + "time_per_iteration": 2.440610885620117 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0515728, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9230852666364326, + "language_loss": 0.8067199, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8286736, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2875, + "time_per_iteration": 2.478473424911499 + }, + { + "auxiliary_loss_clip": 0.01153488, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02984488, + "balance_loss_mlp": 1.05083489, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.5699127680633542, + "language_loss": 0.87525117, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89728516, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2876, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.02384901, + "balance_loss_mlp": 1.05273616, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.9567138745888089, + "language_loss": 0.84561193, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86754125, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 2877, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.02179909, + "balance_loss_mlp": 1.05281305, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 3.0724129461132406, + "language_loss": 0.79527134, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81719756, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.984375, + "step": 2878, + "time_per_iteration": 2.4739902019500732 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.02450228, + "balance_loss_mlp": 1.04968572, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9694378769308076, + "language_loss": 0.70306808, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72496772, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2879, + "time_per_iteration": 2.5014665126800537 + }, + { + "auxiliary_loss_clip": 0.01151101, + "auxiliary_loss_mlp": 0.01050497, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.05038834, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4431111997211734, + "language_loss": 0.83465785, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85667384, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2880, + "time_per_iteration": 2.433776378631592 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.0250026, + "balance_loss_mlp": 1.05171311, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.135155165507549, + "language_loss": 0.80866969, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8306427, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0078125, + "step": 2881, + "time_per_iteration": 2.4944772720336914 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.05030859, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.5502275528368066, + "language_loss": 0.77372867, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79565454, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 2882, + "time_per_iteration": 2.5426836013793945 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.03276825, + "balance_loss_mlp": 1.05005169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8718611847068298, + "language_loss": 0.76652586, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78852415, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2883, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.04944682, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.945193845574475, + "language_loss": 0.85463524, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87654424, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 2884, + "time_per_iteration": 2.4708735942840576 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.02122355, + "balance_loss_mlp": 1.05114794, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6148586475999513, + "language_loss": 0.73758793, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75947917, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2885, + "time_per_iteration": 2.5266878604888916 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.05269074, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.9690054244815705, + "language_loss": 0.70377076, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72569054, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 2886, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01146959, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.04799545, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9665325510573808, + "language_loss": 0.69294798, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7148186, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98828125, + "step": 2887, + "time_per_iteration": 2.4787776470184326 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.03180945, + "balance_loss_mlp": 1.05075955, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.791000255721863, + "language_loss": 0.85391176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87590909, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 2888, + "time_per_iteration": 2.4234085083007812 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.02668667, + "balance_loss_mlp": 1.05046952, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.660213605651755, + "language_loss": 0.78465497, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80662042, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.9921875, + "step": 2889, + "time_per_iteration": 2.5042123794555664 + }, + { + "auxiliary_loss_clip": 0.01146581, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.05222893, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.9081348702485723, + "language_loss": 0.83860242, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86054766, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9453125, + "step": 2890, + "time_per_iteration": 2.4698500633239746 + }, + { + "auxiliary_loss_clip": 0.01150813, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.02880502, + "balance_loss_mlp": 1.05083108, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9935479009749588, + "language_loss": 0.82253492, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84451687, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2891, + "time_per_iteration": 2.4478886127471924 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04824781, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.3073165362682873, + "language_loss": 0.81479478, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8367548, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2892, + "time_per_iteration": 2.4094645977020264 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.03519785, + "balance_loss_mlp": 1.05379355, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.24459564009462, + "language_loss": 0.74480057, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76690638, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2893, + "time_per_iteration": 3.8296191692352295 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.05193436, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.117368029368179, + "language_loss": 0.83073241, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85268712, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2894, + "time_per_iteration": 3.9817075729370117 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.05032384, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.157907065313142, + "language_loss": 0.74051547, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76249242, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0, + "step": 2895, + "time_per_iteration": 2.461857318878174 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00467134, + "balance_loss_mlp": 1.01600659, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8715266336267762, + "language_loss": 0.6273998, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64795506, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.32421875, + "step": 2896, + "time_per_iteration": 3.1462173461914062 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.02160895, + "balance_loss_mlp": 1.04787612, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 2.3238967096174923, + "language_loss": 0.75600475, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77790749, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2897, + "time_per_iteration": 2.4974682331085205 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.02354646, + "balance_loss_mlp": 1.05000067, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9004029304223122, + "language_loss": 0.69384712, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71575105, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2898, + "time_per_iteration": 2.5650558471679688 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01049615, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.05215359, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.315885710988465, + "language_loss": 0.76069367, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78272319, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2899, + "time_per_iteration": 2.5006191730499268 + }, + { + "auxiliary_loss_clip": 0.01145178, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.02571905, + "balance_loss_mlp": 1.04929495, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9440585306650153, + "language_loss": 0.72821134, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75011557, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9609375, + "step": 2900, + "time_per_iteration": 2.5199801921844482 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.03261876, + "balance_loss_mlp": 1.04989529, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.6677330343015109, + "language_loss": 0.70085949, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72287238, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2901, + "time_per_iteration": 2.624864101409912 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.05087507, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7643324639769489, + "language_loss": 0.76549768, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78750718, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 2902, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.04885221, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.129298660499851, + "language_loss": 0.81787169, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.8399415, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2903, + "time_per_iteration": 2.436877727508545 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02255297, + "balance_loss_mlp": 1.04978609, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.1703016783079327, + "language_loss": 0.73228866, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75418955, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2904, + "time_per_iteration": 2.462775707244873 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.02719879, + "balance_loss_mlp": 1.04777265, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9374721445221084, + "language_loss": 0.64526325, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6671921, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2905, + "time_per_iteration": 2.468395233154297 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.05202341, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.804147248272645, + "language_loss": 0.79236615, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81444013, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0546875, + "step": 2906, + "time_per_iteration": 2.4632725715637207 + }, + { + "auxiliary_loss_clip": 0.01150693, + "auxiliary_loss_mlp": 0.01055346, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.05044913, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7929508882228948, + "language_loss": 0.81010377, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83216417, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2907, + "time_per_iteration": 2.4214229583740234 + }, + { + "auxiliary_loss_clip": 0.01152007, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.05040026, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.7402312811515515, + "language_loss": 0.81315112, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83517587, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2908, + "time_per_iteration": 2.4340970516204834 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.03047633, + "balance_loss_mlp": 1.04978228, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.419675279893618, + "language_loss": 0.80399191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82600915, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0078125, + "step": 2909, + "time_per_iteration": 2.4170033931732178 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.03319383, + "balance_loss_mlp": 1.05133021, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6998329053727648, + "language_loss": 0.76530939, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78737426, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2910, + "time_per_iteration": 2.457628011703491 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02901077, + "balance_loss_mlp": 1.05060935, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6502133484544155, + "language_loss": 0.87255991, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89456993, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2911, + "time_per_iteration": 2.5302672386169434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.04746377, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.260601647926804, + "language_loss": 0.89586449, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91789353, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0078125, + "step": 2912, + "time_per_iteration": 2.447650194168091 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.0302161, + "balance_loss_mlp": 1.04871392, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.8836544870459813, + "language_loss": 0.7262938, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74830252, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2913, + "time_per_iteration": 2.423595666885376 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01047113, + "balance_loss_clip": 1.02738369, + "balance_loss_mlp": 1.0522244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.243393227782369, + "language_loss": 0.68799925, + "learning_rate": 3.782887439295741e-06, + "loss": 0.70997757, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 2914, + "time_per_iteration": 2.46085262298584 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 1.05143356, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8218690011087264, + "language_loss": 0.93755293, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95961595, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.98046875, + "step": 2915, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01143068, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.02744889, + "balance_loss_mlp": 1.04722261, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8144768789670476, + "language_loss": 0.80869162, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.95703125, + "step": 2916, + "time_per_iteration": 2.4740476608276367 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03663611, + "balance_loss_mlp": 1.04854608, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.67512565222408, + "language_loss": 0.73645711, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75852591, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2917, + "time_per_iteration": 2.4484915733337402 + }, + { + "auxiliary_loss_clip": 0.01144993, + "auxiliary_loss_mlp": 0.01055794, + "balance_loss_clip": 1.03517044, + "balance_loss_mlp": 1.04897738, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 12.675743752905372, + "language_loss": 0.77019119, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79219908, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.9609375, + "step": 2918, + "time_per_iteration": 2.4723429679870605 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.0260129, + "balance_loss_mlp": 1.05131745, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.415786226656528, + "language_loss": 0.74196291, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76396644, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2919, + "time_per_iteration": 2.5049829483032227 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03080809, + "balance_loss_mlp": 1.05090559, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7754050788280298, + "language_loss": 0.74211872, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76416576, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2920, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0219686, + "balance_loss_mlp": 1.04717219, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3164139995284834, + "language_loss": 0.7949307, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81677347, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.97265625, + "step": 2921, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01153986, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.05029321, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6170497741380607, + "language_loss": 0.87493849, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89693457, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2922, + "time_per_iteration": 2.5042173862457275 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03074801, + "balance_loss_mlp": 1.04808784, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.3811708545321735, + "language_loss": 0.62097687, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64297503, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2923, + "time_per_iteration": 2.5067484378814697 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.05287814, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1344206016331797, + "language_loss": 0.80602306, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2924, + "time_per_iteration": 2.453174114227295 + }, + { + "auxiliary_loss_clip": 0.0115147, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.04809761, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 3.672968077353321, + "language_loss": 0.70954067, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73159206, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.03125, + "step": 2925, + "time_per_iteration": 2.4666385650634766 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.02538979, + "balance_loss_mlp": 1.05147243, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6622274839000213, + "language_loss": 0.71700275, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73893416, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.97265625, + "step": 2926, + "time_per_iteration": 2.50289249420166 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04857433, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.8916391197618272, + "language_loss": 0.84433806, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86627805, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0234375, + "step": 2927, + "time_per_iteration": 2.447207450866699 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.0506475, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8156588356210406, + "language_loss": 0.71879232, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74072987, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 2928, + "time_per_iteration": 2.585942029953003 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.028934, + "balance_loss_mlp": 1.05230594, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0402577824357886, + "language_loss": 0.83222824, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85421479, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9765625, + "step": 2929, + "time_per_iteration": 2.461101770401001 + }, + { + "auxiliary_loss_clip": 0.01149627, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.02298999, + "balance_loss_mlp": 1.0493536, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.817429721867852, + "language_loss": 0.7933988, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81531239, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2930, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.02671921, + "balance_loss_mlp": 1.05319881, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.194829469856105, + "language_loss": 0.76142448, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78343737, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0078125, + "step": 2931, + "time_per_iteration": 2.4907379150390625 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.05108666, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.8261445455709153, + "language_loss": 0.74740392, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7693212, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 2932, + "time_per_iteration": 2.4252588748931885 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.05086923, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.4930669650063355, + "language_loss": 0.8968839, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.9188894, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0625, + "step": 2933, + "time_per_iteration": 2.4334278106689453 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02980459, + "balance_loss_mlp": 1.05053639, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6616334836184845, + "language_loss": 0.88273364, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90468836, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9765625, + "step": 2934, + "time_per_iteration": 3.891472578048706 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02827823, + "balance_loss_mlp": 1.04972959, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.7575209177187046, + "language_loss": 0.70843625, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2935, + "time_per_iteration": 5.650984287261963 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.05251908, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2448658169111795, + "language_loss": 0.69255942, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71456659, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0546875, + "step": 2936, + "time_per_iteration": 2.4864091873168945 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.02758646, + "balance_loss_mlp": 1.05530488, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.883537128373794, + "language_loss": 0.71391022, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73591107, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.99609375, + "step": 2937, + "time_per_iteration": 2.5096240043640137 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.02461779, + "balance_loss_mlp": 1.05530524, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.165923066719211, + "language_loss": 0.7584855, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78052241, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2938, + "time_per_iteration": 2.475069284439087 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 1.05156195, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.20477923303766, + "language_loss": 0.71130306, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73326623, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2939, + "time_per_iteration": 2.4806766510009766 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.0538342, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.125031265469358, + "language_loss": 0.73781312, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7597841, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 2940, + "time_per_iteration": 2.5438694953918457 + }, + { + "auxiliary_loss_clip": 0.01154904, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.05372643, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.4976558026918703, + "language_loss": 0.85003591, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87204242, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2941, + "time_per_iteration": 2.4616622924804688 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.02687514, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.199835477442084, + "language_loss": 0.7711162, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79311877, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2942, + "time_per_iteration": 2.512493848800659 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.05181623, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.9811917296629065, + "language_loss": 0.80591762, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82790613, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2943, + "time_per_iteration": 2.4898416996002197 + }, + { + "auxiliary_loss_clip": 0.01154834, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.02953053, + "balance_loss_mlp": 1.05046725, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.9958912509352866, + "language_loss": 0.80558729, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82764459, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2944, + "time_per_iteration": 2.533968448638916 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.05239737, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.992535786356086, + "language_loss": 0.73450243, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75667548, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2945, + "time_per_iteration": 2.641890287399292 + }, + { + "auxiliary_loss_clip": 0.01152525, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.03179753, + "balance_loss_mlp": 1.05274916, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.3259800829895028, + "language_loss": 0.7778489, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79987633, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.99609375, + "step": 2946, + "time_per_iteration": 2.420511484146118 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.05060697, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9846715459481197, + "language_loss": 0.76240218, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78441978, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2947, + "time_per_iteration": 2.485795259475708 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.02725959, + "balance_loss_mlp": 1.04881549, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.7031010106606654, + "language_loss": 0.71890748, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74085903, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.99609375, + "step": 2948, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03313947, + "balance_loss_mlp": 1.05261326, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.809900152556277, + "language_loss": 0.81843233, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8404634, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.98828125, + "step": 2949, + "time_per_iteration": 2.496962547302246 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01007586, + "balance_loss_clip": 1.00552368, + "balance_loss_mlp": 1.01889789, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7669309197050882, + "language_loss": 0.64973593, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.670331, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.33007812, + "step": 2950, + "time_per_iteration": 3.1220879554748535 + }, + { + "auxiliary_loss_clip": 0.01145274, + "auxiliary_loss_mlp": 0.01049164, + "balance_loss_clip": 1.02992332, + "balance_loss_mlp": 1.04777181, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.9502306021254343, + "language_loss": 0.83540517, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85734957, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2951, + "time_per_iteration": 2.5360641479492188 + }, + { + "auxiliary_loss_clip": 0.01153398, + "auxiliary_loss_mlp": 0.01055919, + "balance_loss_clip": 1.03710794, + "balance_loss_mlp": 1.04963326, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 3.175759961241781, + "language_loss": 0.80564123, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2952, + "time_per_iteration": 2.478635787963867 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.03157318, + "balance_loss_mlp": 1.05045855, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.157061982289712, + "language_loss": 0.79982865, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82184678, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2953, + "time_per_iteration": 2.5143978595733643 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.03003526, + "balance_loss_mlp": 1.05173969, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8943960347088487, + "language_loss": 0.88006002, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90207046, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2954, + "time_per_iteration": 2.575603485107422 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.0327127, + "balance_loss_mlp": 1.05101538, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.123866524492404, + "language_loss": 0.84441978, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86644602, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2955, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.04843807, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 2.0229859139182382, + "language_loss": 0.71172267, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73364747, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2956, + "time_per_iteration": 2.4795608520507812 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.03226662, + "balance_loss_mlp": 1.04974461, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.891261769499534, + "language_loss": 0.82908547, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85109639, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9921875, + "step": 2957, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.02819514, + "balance_loss_mlp": 1.04814482, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.926043663168548, + "language_loss": 0.75286758, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7747997, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2958, + "time_per_iteration": 2.532339572906494 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.05278933, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0794730574663265, + "language_loss": 0.79558724, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.8175559, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2959, + "time_per_iteration": 2.45941424369812 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.04968762, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.284306220471852, + "language_loss": 0.52288693, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5448702, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2960, + "time_per_iteration": 2.4603421688079834 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.05185843, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9615261009939866, + "language_loss": 0.89047921, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9125818, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2961, + "time_per_iteration": 2.475848913192749 + }, + { + "auxiliary_loss_clip": 0.01151915, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.0313381, + "balance_loss_mlp": 1.04849648, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 2.2193748892921517, + "language_loss": 0.79186273, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81389749, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2962, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.011535, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.03175986, + "balance_loss_mlp": 1.0524615, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9550413638631114, + "language_loss": 0.74514943, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76719993, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2963, + "time_per_iteration": 2.4414234161376953 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.02822399, + "balance_loss_mlp": 1.05221784, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.541363360665875, + "language_loss": 0.78624183, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80828238, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2964, + "time_per_iteration": 2.502497911453247 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.05026746, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.923237578914178, + "language_loss": 0.81686175, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83892715, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2965, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01147349, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.03273785, + "balance_loss_mlp": 1.04941893, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2408088539265183, + "language_loss": 0.94580686, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96777868, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.98046875, + "step": 2966, + "time_per_iteration": 2.43082332611084 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.05002928, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.145285080590972, + "language_loss": 0.72469354, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74664342, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2967, + "time_per_iteration": 2.5735998153686523 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.02613282, + "balance_loss_mlp": 1.04889679, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.088672387523525, + "language_loss": 0.76831949, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79021615, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 2968, + "time_per_iteration": 2.437344789505005 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.0287044, + "balance_loss_mlp": 1.04982233, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.3962137266502075, + "language_loss": 0.75934523, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78129619, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2969, + "time_per_iteration": 2.5003507137298584 + }, + { + "auxiliary_loss_clip": 0.01047445, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 0.99940914, + "balance_loss_mlp": 1.01426291, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8459028719848601, + "language_loss": 0.69080526, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7112956, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.33203125, + "step": 2970, + "time_per_iteration": 3.1193249225616455 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.0229373, + "balance_loss_mlp": 1.0498271, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.0858657386647614, + "language_loss": 0.67452097, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69647527, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2971, + "time_per_iteration": 2.580946683883667 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.03059363, + "balance_loss_mlp": 1.04643905, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.713623966203784, + "language_loss": 0.89631712, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91827983, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.99609375, + "step": 2972, + "time_per_iteration": 2.491608142852783 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.05058205, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.067523530387673, + "language_loss": 0.88030291, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90236795, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2973, + "time_per_iteration": 2.4357106685638428 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03282917, + "balance_loss_mlp": 1.04979324, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1406659419236176, + "language_loss": 0.75648922, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.77848881, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2974, + "time_per_iteration": 2.484236478805542 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.02560067, + "balance_loss_mlp": 1.04925394, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.8019304252630453, + "language_loss": 0.74556506, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76749712, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 2975, + "time_per_iteration": 2.4658849239349365 + }, + { + "auxiliary_loss_clip": 0.01145454, + "auxiliary_loss_mlp": 0.0104533, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.04805982, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5963289978134585, + "language_loss": 0.73245859, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.7543664, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 2976, + "time_per_iteration": 3.921170949935913 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02455354, + "balance_loss_mlp": 1.04732931, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.5556273460638488, + "language_loss": 0.77324069, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79505193, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9375, + "step": 2977, + "time_per_iteration": 5.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.03352153, + "balance_loss_mlp": 1.05327988, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.814268655584857, + "language_loss": 0.79470795, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81672966, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 2978, + "time_per_iteration": 2.4917376041412354 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.022892, + "balance_loss_mlp": 1.04982674, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.6585859201367117, + "language_loss": 0.76166439, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78360581, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2979, + "time_per_iteration": 2.5283753871917725 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.04760695, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 2.3100878996861014, + "language_loss": 0.69246143, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7143684, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 2980, + "time_per_iteration": 2.452199935913086 + }, + { + "auxiliary_loss_clip": 0.01148553, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02703881, + "balance_loss_mlp": 1.04957294, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.6769030770257147, + "language_loss": 0.7077347, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.98828125, + "step": 2981, + "time_per_iteration": 2.453328847885132 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.03929293, + "balance_loss_mlp": 1.05124855, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.4096510966801916, + "language_loss": 0.82313269, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84522492, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0, + "step": 2982, + "time_per_iteration": 2.4727423191070557 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.04754186, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 2.0170018574221404, + "language_loss": 0.82899523, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85093689, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2983, + "time_per_iteration": 2.5544486045837402 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03234076, + "balance_loss_mlp": 1.04676509, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0025677466759175, + "language_loss": 0.84977567, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87177408, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2984, + "time_per_iteration": 2.461451530456543 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.02558494, + "balance_loss_mlp": 1.04734373, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.5972673531528874, + "language_loss": 0.89526331, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91717398, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2985, + "time_per_iteration": 2.5644643306732178 + }, + { + "auxiliary_loss_clip": 0.01142965, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02879906, + "balance_loss_mlp": 1.0478375, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9029387971382474, + "language_loss": 0.69863129, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72051299, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 2986, + "time_per_iteration": 2.4629499912261963 + }, + { + "auxiliary_loss_clip": 0.01144523, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02507591, + "balance_loss_mlp": 1.04828227, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.203273814413497, + "language_loss": 0.77872753, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80060714, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96484375, + "step": 2987, + "time_per_iteration": 2.524712562561035 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.02412844, + "balance_loss_mlp": 1.04834962, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.225668764256514, + "language_loss": 0.78012109, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.8020528, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2988, + "time_per_iteration": 2.4608163833618164 + }, + { + "auxiliary_loss_clip": 0.01048374, + "auxiliary_loss_mlp": 0.01007691, + "balance_loss_clip": 1.00570035, + "balance_loss_mlp": 1.0154314, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7961406236538413, + "language_loss": 0.62767559, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64823627, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33007812, + "step": 2989, + "time_per_iteration": 2.9831957817077637 + }, + { + "auxiliary_loss_clip": 0.01146079, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04836369, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4434429944335525, + "language_loss": 0.70464563, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72651821, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.98046875, + "step": 2990, + "time_per_iteration": 2.556100606918335 + }, + { + "auxiliary_loss_clip": 0.01146243, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.0210464, + "balance_loss_mlp": 1.04735422, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7649502456354873, + "language_loss": 0.68110204, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70295459, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 2991, + "time_per_iteration": 2.6224544048309326 + }, + { + "auxiliary_loss_clip": 0.01145545, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04794931, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5716432326573742, + "language_loss": 0.82754636, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84940296, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2992, + "time_per_iteration": 2.51824951171875 + }, + { + "auxiliary_loss_clip": 0.01138637, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.02301776, + "balance_loss_mlp": 1.04464579, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.1353598877924806, + "language_loss": 0.81958085, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84137177, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 2993, + "time_per_iteration": 2.4349074363708496 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.02889621, + "balance_loss_mlp": 1.04586673, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7254805142405878, + "language_loss": 0.78390837, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80581975, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2994, + "time_per_iteration": 2.4898691177368164 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.04966068, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.8689491925476576, + "language_loss": 0.80392146, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82584035, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2995, + "time_per_iteration": 2.4521572589874268 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.04679298, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5635152056288029, + "language_loss": 0.84467834, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86658335, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.97265625, + "step": 2996, + "time_per_iteration": 2.46993088722229 + }, + { + "auxiliary_loss_clip": 0.01139788, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.04656756, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.9197857622903793, + "language_loss": 0.88254511, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90436304, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 2997, + "time_per_iteration": 2.470113515853882 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.02435279, + "balance_loss_mlp": 1.04666877, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.5635961030192935, + "language_loss": 0.8504566, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87237728, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2998, + "time_per_iteration": 2.5252864360809326 + }, + { + "auxiliary_loss_clip": 0.0114547, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783537, + "balance_loss_mlp": 1.05022454, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8695557812200347, + "language_loss": 0.84270376, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86460871, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 2999, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.01143823, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.02577412, + "balance_loss_mlp": 1.04662383, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7700032623605295, + "language_loss": 0.74753368, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76941276, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 3000, + "time_per_iteration": 2.4800922870635986 + }, + { + "auxiliary_loss_clip": 0.01143329, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.04111791, + "balance_loss_mlp": 1.04825568, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.369063359757221, + "language_loss": 0.71625632, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73829091, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3001, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.03010964, + "balance_loss_mlp": 1.04815364, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.7240097708601225, + "language_loss": 0.87795258, + "learning_rate": 3.767096425420011e-06, + "loss": 0.89992571, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 3002, + "time_per_iteration": 2.4881784915924072 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02689672, + "balance_loss_mlp": 1.04694915, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6880476069492312, + "language_loss": 0.80563951, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8275311, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9765625, + "step": 3003, + "time_per_iteration": 2.452103614807129 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.04780829, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.4630533980116804, + "language_loss": 0.66931474, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69124347, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3004, + "time_per_iteration": 2.5085701942443848 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.04860806, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.8927608809249736, + "language_loss": 0.85172975, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87370586, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.984375, + "step": 3005, + "time_per_iteration": 2.44529128074646 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02611172, + "balance_loss_mlp": 1.04684031, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.553419886600377, + "language_loss": 0.82951266, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85135704, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94921875, + "step": 3006, + "time_per_iteration": 2.532597780227661 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.0315007, + "balance_loss_mlp": 1.04581141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6363768703600998, + "language_loss": 0.76883924, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79078454, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.98046875, + "step": 3007, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.01358199, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8067080511403597, + "language_loss": 0.56949043, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59000474, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 3008, + "time_per_iteration": 3.1923961639404297 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.04951596, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8063105677439477, + "language_loss": 0.67226636, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69423479, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3009, + "time_per_iteration": 2.467525005340576 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02993655, + "balance_loss_mlp": 1.04874969, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.842230928142314, + "language_loss": 0.75573891, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77769208, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.98828125, + "step": 3010, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.02585649, + "balance_loss_mlp": 1.04816866, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6130539386655762, + "language_loss": 0.66672593, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6885612, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3011, + "time_per_iteration": 2.461749792098999 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.0309006, + "balance_loss_mlp": 1.04706419, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.1517129990512927, + "language_loss": 0.71184897, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73375839, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3012, + "time_per_iteration": 2.7380943298339844 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.03045654, + "balance_loss_mlp": 1.05109787, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.2489260815019447, + "language_loss": 0.62039113, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64232826, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3013, + "time_per_iteration": 2.5800936222076416 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.04870379, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.5535403171237991, + "language_loss": 0.76026124, + "learning_rate": 3.764902795998309e-06, + "loss": 0.7822203, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3014, + "time_per_iteration": 2.5049405097961426 + }, + { + "auxiliary_loss_clip": 0.01151342, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.02697504, + "balance_loss_mlp": 1.05086446, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7733972454950666, + "language_loss": 0.65696967, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67894971, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3015, + "time_per_iteration": 2.52614426612854 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.0262742, + "balance_loss_mlp": 1.0490694, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7500400577379265, + "language_loss": 0.7809943, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80287266, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3016, + "time_per_iteration": 2.4736039638519287 + }, + { + "auxiliary_loss_clip": 0.01152649, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.03214788, + "balance_loss_mlp": 1.05294776, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6390488083316745, + "language_loss": 0.83498454, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85701871, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 3017, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01142751, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.0486486, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2301629944757964, + "language_loss": 0.67067724, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69249976, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3018, + "time_per_iteration": 3.950299024581909 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.04928112, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.174717508383113, + "language_loss": 0.75745898, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77930045, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 3019, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02653718, + "balance_loss_mlp": 1.05230832, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1373464597463574, + "language_loss": 0.81687438, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83882844, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3020, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02373672, + "balance_loss_mlp": 1.05124021, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.9178918869439654, + "language_loss": 0.77220714, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.96875, + "step": 3021, + "time_per_iteration": 2.4856297969818115 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.04617524, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7066661124221545, + "language_loss": 0.84841502, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87025082, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3022, + "time_per_iteration": 2.4933700561523438 + }, + { + "auxiliary_loss_clip": 0.01148694, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.0491302, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9452352079001236, + "language_loss": 0.69178426, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7136941, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3023, + "time_per_iteration": 2.495107412338257 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.04748738, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9417078000950883, + "language_loss": 0.73956865, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76145792, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3024, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.0490942, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.344564071286257, + "language_loss": 0.88167858, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90356255, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3025, + "time_per_iteration": 2.4708051681518555 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01052568, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.05046904, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.755473586939447, + "language_loss": 0.79284346, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.8148272, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3026, + "time_per_iteration": 2.482987403869629 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.03424227, + "balance_loss_mlp": 1.0502665, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6571051349992714, + "language_loss": 0.76047945, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78250599, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98828125, + "step": 3027, + "time_per_iteration": 2.4952149391174316 + }, + { + "auxiliary_loss_clip": 0.01151758, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.03055763, + "balance_loss_mlp": 1.05106115, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.7989426432275553, + "language_loss": 0.85400331, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87601155, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3028, + "time_per_iteration": 2.438113212585449 + }, + { + "auxiliary_loss_clip": 0.01144845, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.04937243, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8205418995180693, + "language_loss": 0.82655656, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84843719, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3029, + "time_per_iteration": 2.4866995811462402 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.02640462, + "balance_loss_mlp": 1.05306637, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0975281503542433, + "language_loss": 0.78150737, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80348092, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3030, + "time_per_iteration": 2.458627700805664 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.02495515, + "balance_loss_mlp": 1.05141127, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.955618442063123, + "language_loss": 0.85318518, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87512928, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.99609375, + "step": 3031, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01045881, + "balance_loss_clip": 1.02824974, + "balance_loss_mlp": 1.05232072, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8744751837074634, + "language_loss": 0.79713088, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3032, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.0260191, + "balance_loss_mlp": 1.05395341, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.0774072235136964, + "language_loss": 0.81420642, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8362143, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0234375, + "step": 3033, + "time_per_iteration": 2.47562575340271 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01006645, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01995599, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8883360043233282, + "language_loss": 0.63479006, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6553843, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.328125, + "step": 3034, + "time_per_iteration": 2.9712142944335938 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.0263083, + "balance_loss_mlp": 1.05033147, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.0132790953316113, + "language_loss": 0.79684323, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81876773, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3035, + "time_per_iteration": 2.4517030715942383 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.02792096, + "balance_loss_mlp": 1.05231702, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.217606261766961, + "language_loss": 0.84895855, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87087989, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3036, + "time_per_iteration": 2.5017378330230713 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.02524662, + "balance_loss_mlp": 1.04940438, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.216717642760365, + "language_loss": 0.79836094, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82021284, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3037, + "time_per_iteration": 2.4591338634490967 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.05208671, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.68131613553598, + "language_loss": 0.79450762, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81647676, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.9765625, + "step": 3038, + "time_per_iteration": 2.440664768218994 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.03069699, + "balance_loss_mlp": 1.05140162, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3213350225315748, + "language_loss": 0.67311364, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69506919, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3039, + "time_per_iteration": 2.573272466659546 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03323567, + "balance_loss_mlp": 1.05112875, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.9125298187860031, + "language_loss": 0.73687911, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75888336, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3040, + "time_per_iteration": 2.771242618560791 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.04849768, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.8780343880464916, + "language_loss": 0.60176188, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62363702, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3041, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.03756928, + "balance_loss_mlp": 1.05012786, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7488247873746179, + "language_loss": 0.60361505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.6256364, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3042, + "time_per_iteration": 2.7942960262298584 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.04945385, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.6831322617730042, + "language_loss": 0.8769263, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.8988626, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94921875, + "step": 3043, + "time_per_iteration": 2.524871587753296 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.05107832, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9464603469819268, + "language_loss": 0.707008, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72899425, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3044, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01055406, + "balance_loss_clip": 1.03552175, + "balance_loss_mlp": 1.04929996, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.0901220952627497, + "language_loss": 0.64385587, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66591471, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 3045, + "time_per_iteration": 2.592855453491211 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.04977548, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.998731206361719, + "language_loss": 0.79165137, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81365317, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3046, + "time_per_iteration": 2.5034587383270264 + }, + { + "auxiliary_loss_clip": 0.01146985, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.04764223, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3529268295267016, + "language_loss": 0.78991181, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81186271, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 3047, + "time_per_iteration": 2.5140535831451416 + }, + { + "auxiliary_loss_clip": 0.01145799, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02923381, + "balance_loss_mlp": 1.05111742, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5613113238500957, + "language_loss": 0.80888635, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83081341, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3048, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.02283192, + "balance_loss_mlp": 1.0502528, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8161394933049422, + "language_loss": 0.86232805, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88422966, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9765625, + "step": 3049, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.05159521, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.2703740748038066, + "language_loss": 0.77160966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79358685, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 3050, + "time_per_iteration": 2.4525256156921387 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.02683592, + "balance_loss_mlp": 1.04867804, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.010292972394078, + "language_loss": 0.99174476, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.0136615, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3051, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01145751, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.050529, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5992624239842805, + "language_loss": 0.86153144, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3052, + "time_per_iteration": 2.559396505355835 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.03267264, + "balance_loss_mlp": 1.04985499, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8182752776897229, + "language_loss": 0.73004341, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75200558, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3053, + "time_per_iteration": 2.4481074810028076 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03051662, + "balance_loss_mlp": 1.05208337, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6467304764216655, + "language_loss": 0.62212563, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64412701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 3054, + "time_per_iteration": 2.5701377391815186 + }, + { + "auxiliary_loss_clip": 0.01146023, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.04962707, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2210920593094325, + "language_loss": 0.78501689, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80690485, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3055, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.04779387, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.894881128028073, + "language_loss": 0.70218527, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72414786, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3056, + "time_per_iteration": 2.541361093521118 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.02612543, + "balance_loss_mlp": 1.05066419, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4932354373853338, + "language_loss": 0.8028152, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82474422, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3057, + "time_per_iteration": 2.4718995094299316 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.02864265, + "balance_loss_mlp": 1.04847729, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0112890674266914, + "language_loss": 0.82289785, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84491444, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 3058, + "time_per_iteration": 2.4653379917144775 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_clip": 1.02818882, + "balance_loss_mlp": 1.04893029, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.205773819593527, + "language_loss": 0.85894352, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88088906, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 3059, + "time_per_iteration": 4.0151047706604 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.02724195, + "balance_loss_mlp": 1.04931092, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.70952354928268, + "language_loss": 0.72799402, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74990445, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3060, + "time_per_iteration": 5.466471195220947 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.05253565, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7373746338425942, + "language_loss": 0.72797298, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74991357, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.98828125, + "step": 3061, + "time_per_iteration": 2.5244035720825195 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.02697313, + "balance_loss_mlp": 1.05087519, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8714044833418495, + "language_loss": 0.81622046, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83814156, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3062, + "time_per_iteration": 2.4767649173736572 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.02681041, + "balance_loss_mlp": 1.05394542, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.7582970194369052, + "language_loss": 0.72718614, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74918652, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3063, + "time_per_iteration": 2.5082144737243652 + }, + { + "auxiliary_loss_clip": 0.01146453, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.04935837, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.1216519555610183, + "language_loss": 0.65496099, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6768434, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3064, + "time_per_iteration": 2.523141622543335 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.03081298, + "balance_loss_mlp": 1.05274165, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.6163412642887947, + "language_loss": 0.68768656, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70966971, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3065, + "time_per_iteration": 2.5244293212890625 + }, + { + "auxiliary_loss_clip": 0.01151353, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.02819824, + "balance_loss_mlp": 1.05120087, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 4.932084281869228, + "language_loss": 0.72561431, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74760187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3066, + "time_per_iteration": 2.5428919792175293 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.05074954, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9825677919996112, + "language_loss": 0.82477474, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84669906, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3067, + "time_per_iteration": 2.4500880241394043 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00535476, + "balance_loss_mlp": 1.01668859, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7924805733675573, + "language_loss": 0.59706604, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61763, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32226562, + "step": 3068, + "time_per_iteration": 2.9375104904174805 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.05714762, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8708990955689164, + "language_loss": 0.76227212, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78420615, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3069, + "time_per_iteration": 2.462446451187134 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.02233863, + "balance_loss_mlp": 1.05299067, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7428293735192475, + "language_loss": 0.84803855, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86996043, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3070, + "time_per_iteration": 2.4887194633483887 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.02462363, + "balance_loss_mlp": 1.05298758, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.9722863584187038, + "language_loss": 0.77370453, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79565221, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 3071, + "time_per_iteration": 2.482213258743286 + }, + { + "auxiliary_loss_clip": 0.01152228, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.05342758, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.705053980849468, + "language_loss": 0.77691031, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79891801, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 3072, + "time_per_iteration": 2.466387987136841 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.05013216, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8173375196390826, + "language_loss": 0.8607235, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88264889, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3073, + "time_per_iteration": 2.4510810375213623 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.05339348, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2059027996031877, + "language_loss": 0.92005521, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9420172, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.97265625, + "step": 3074, + "time_per_iteration": 2.473710298538208 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.02490735, + "balance_loss_mlp": 1.05028176, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9913742546968862, + "language_loss": 0.65041798, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67233044, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3075, + "time_per_iteration": 2.533724784851074 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.053177, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.709240712607824, + "language_loss": 0.72323918, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74516779, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3076, + "time_per_iteration": 2.4544899463653564 + }, + { + "auxiliary_loss_clip": 0.01153692, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.0280292, + "balance_loss_mlp": 1.05341136, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4900368363969854, + "language_loss": 0.80860448, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83060181, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3077, + "time_per_iteration": 2.45137882232666 + }, + { + "auxiliary_loss_clip": 0.01146798, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02816749, + "balance_loss_mlp": 1.05103469, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7908770900539794, + "language_loss": 0.78764129, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8095665, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3078, + "time_per_iteration": 2.477393865585327 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02714717, + "balance_loss_mlp": 1.05057585, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8549646444276375, + "language_loss": 0.7758081, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773009, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9765625, + "step": 3079, + "time_per_iteration": 2.5069448947906494 + }, + { + "auxiliary_loss_clip": 0.01143899, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.02581406, + "balance_loss_mlp": 1.04723024, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.3452692712375893, + "language_loss": 0.81668431, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83855557, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3080, + "time_per_iteration": 2.688206911087036 + }, + { + "auxiliary_loss_clip": 0.01149652, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.02812803, + "balance_loss_mlp": 1.05079699, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.0276132956863764, + "language_loss": 0.7435087, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.7654745, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3081, + "time_per_iteration": 2.5003983974456787 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.03124547, + "balance_loss_mlp": 1.05527234, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.070281784994394, + "language_loss": 0.71532816, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73734742, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9609375, + "step": 3082, + "time_per_iteration": 2.514004707336426 + }, + { + "auxiliary_loss_clip": 0.011545, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03155267, + "balance_loss_mlp": 1.05488813, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 1.869200996989063, + "language_loss": 0.69338834, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71543807, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3083, + "time_per_iteration": 2.446418523788452 + }, + { + "auxiliary_loss_clip": 0.0114679, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03187287, + "balance_loss_mlp": 1.05216169, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 4.022344342016001, + "language_loss": 0.68854296, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71050388, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3084, + "time_per_iteration": 2.5964090824127197 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.04961908, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5883609883793584, + "language_loss": 0.77831411, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80020249, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3085, + "time_per_iteration": 2.500401020050049 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.04887915, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8880953488015286, + "language_loss": 0.73488086, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7568658, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3086, + "time_per_iteration": 2.5121798515319824 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.02949429, + "balance_loss_mlp": 1.05223882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 4.074676999617497, + "language_loss": 0.70087367, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72282737, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.953125, + "step": 3087, + "time_per_iteration": 2.469980001449585 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.05118215, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.299065028063824, + "language_loss": 0.72731185, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74929065, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3088, + "time_per_iteration": 2.4569249153137207 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02733839, + "balance_loss_mlp": 1.05015588, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.023411505730453, + "language_loss": 0.91849768, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94039273, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94140625, + "step": 3089, + "time_per_iteration": 2.5086276531219482 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02573323, + "balance_loss_mlp": 1.05124271, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.7535733421879174, + "language_loss": 0.57406759, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59596992, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.953125, + "step": 3090, + "time_per_iteration": 2.544934034347534 + }, + { + "auxiliary_loss_clip": 0.011443, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03133333, + "balance_loss_mlp": 1.04945779, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9526543189913628, + "language_loss": 0.82229531, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84423304, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3091, + "time_per_iteration": 2.5339536666870117 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03165662, + "balance_loss_mlp": 1.05212235, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0588011246991127, + "language_loss": 0.83561456, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85760063, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3092, + "time_per_iteration": 2.5091474056243896 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.05010569, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 4.142827775979207, + "language_loss": 0.93487823, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95683277, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 3093, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01146588, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.02115917, + "balance_loss_mlp": 1.05090082, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.060946690404802, + "language_loss": 0.77380008, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79564774, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3094, + "time_per_iteration": 2.4520375728607178 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03098452, + "balance_loss_mlp": 1.05099964, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6535165555915046, + "language_loss": 0.69985378, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72180283, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3095, + "time_per_iteration": 2.7395834922790527 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01045107, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05169249, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.9053555001005595, + "language_loss": 0.8077082, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82965505, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.98046875, + "step": 3096, + "time_per_iteration": 2.4506232738494873 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05086875, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.061308652340225, + "language_loss": 0.75101036, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77295941, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3097, + "time_per_iteration": 2.46639347076416 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.05196047, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.5365100966912664, + "language_loss": 0.66038394, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68231571, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3098, + "time_per_iteration": 2.46763014793396 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.02973545, + "balance_loss_mlp": 1.04978585, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6025275160282182, + "language_loss": 0.69907904, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72105503, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 3099, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.03290749, + "balance_loss_mlp": 1.04985309, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4888180158498334, + "language_loss": 0.71623552, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73823702, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 3100, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.03104091, + "balance_loss_mlp": 1.05147338, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.2181859131844757, + "language_loss": 0.80163074, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82364118, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3101, + "time_per_iteration": 4.007607936859131 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.05100489, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.082156961368248, + "language_loss": 0.76803768, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78991693, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3102, + "time_per_iteration": 5.438685894012451 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02322865, + "balance_loss_mlp": 1.04973269, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.5595226686006565, + "language_loss": 0.76962835, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79151165, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3103, + "time_per_iteration": 2.4742202758789062 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02540123, + "balance_loss_mlp": 1.05014729, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.966347666558745, + "language_loss": 0.79074025, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81264877, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3104, + "time_per_iteration": 2.4873924255371094 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.02653468, + "balance_loss_mlp": 1.05237842, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.943867006204371, + "language_loss": 0.8519029, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87382948, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3105, + "time_per_iteration": 2.488638162612915 + }, + { + "auxiliary_loss_clip": 0.01152184, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.02872288, + "balance_loss_mlp": 1.0491997, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.7838474228223986, + "language_loss": 0.86952424, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89152563, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 3106, + "time_per_iteration": 2.5103402137756348 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.05296755, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9680738799082358, + "language_loss": 0.78253353, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80451989, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 3107, + "time_per_iteration": 2.44567608833313 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.03179181, + "balance_loss_mlp": 1.05040216, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.9125203241398734, + "language_loss": 0.74114668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76316506, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3108, + "time_per_iteration": 2.5254971981048584 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.02684629, + "balance_loss_mlp": 1.05332017, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6536820415924105, + "language_loss": 0.74707133, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76903957, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98046875, + "step": 3109, + "time_per_iteration": 2.426945924758911 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.02845001, + "balance_loss_mlp": 1.05078959, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.4293009008592994, + "language_loss": 0.84324062, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86519247, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3110, + "time_per_iteration": 2.4744956493377686 + }, + { + "auxiliary_loss_clip": 0.01154611, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02472341, + "balance_loss_mlp": 1.05598927, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.633662412254079, + "language_loss": 0.84753799, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86951482, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3111, + "time_per_iteration": 2.4757230281829834 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.02408528, + "balance_loss_mlp": 1.05231404, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8513735900463348, + "language_loss": 0.76565534, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78757566, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9765625, + "step": 3112, + "time_per_iteration": 2.465552806854248 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.0516355, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8580615351340177, + "language_loss": 0.64277315, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66475397, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3113, + "time_per_iteration": 2.491805076599121 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.0528996, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.238258329288858, + "language_loss": 0.81043601, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83247173, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 3114, + "time_per_iteration": 2.4947290420532227 + }, + { + "auxiliary_loss_clip": 0.01153492, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.05319226, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2102322241331467, + "language_loss": 0.57819968, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0, + "step": 3115, + "time_per_iteration": 2.4892075061798096 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.05434299, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8141768865365742, + "language_loss": 0.71160758, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73368567, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96484375, + "step": 3116, + "time_per_iteration": 2.4705467224121094 + }, + { + "auxiliary_loss_clip": 0.01142667, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01751065, + "balance_loss_mlp": 1.04771161, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.8736078530078255, + "language_loss": 0.78733885, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80912256, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3117, + "time_per_iteration": 2.418527126312256 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02898717, + "balance_loss_mlp": 1.05421317, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.743274375857092, + "language_loss": 0.83945131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86145031, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.97265625, + "step": 3118, + "time_per_iteration": 2.5691416263580322 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.03109384, + "balance_loss_mlp": 1.0525409, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7594323212393352, + "language_loss": 0.76151264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78351927, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3119, + "time_per_iteration": 2.459648847579956 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.05181718, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.593515591831454, + "language_loss": 0.81975627, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84180319, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3120, + "time_per_iteration": 2.478870153427124 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.05178094, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.7598733043788508, + "language_loss": 0.8513701, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.873285, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3121, + "time_per_iteration": 2.5178277492523193 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.02976704, + "balance_loss_mlp": 1.05281448, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.163070382320244, + "language_loss": 0.70038795, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72237968, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 3122, + "time_per_iteration": 2.5523242950439453 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02524245, + "balance_loss_mlp": 1.05194402, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.352571744641408, + "language_loss": 0.7034744, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72541201, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9609375, + "step": 3123, + "time_per_iteration": 2.4145569801330566 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.05238771, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.0330816469172097, + "language_loss": 0.73851109, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76047611, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3124, + "time_per_iteration": 2.497352123260498 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.05275774, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9990758157966066, + "language_loss": 0.80601895, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82805508, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0, + "step": 3125, + "time_per_iteration": 2.605851411819458 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01009024, + "balance_loss_clip": 1.00697315, + "balance_loss_mlp": 1.02352476, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9386177249275542, + "language_loss": 0.63591504, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656781, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.328125, + "step": 3126, + "time_per_iteration": 3.0943961143493652 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.02366543, + "balance_loss_mlp": 1.05439222, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.7984129752859428, + "language_loss": 0.81274688, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83466977, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3127, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01054784, + "auxiliary_loss_mlp": 0.0100739, + "balance_loss_clip": 1.00543487, + "balance_loss_mlp": 1.02235639, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7620779230288282, + "language_loss": 0.6191628, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63978451, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.32421875, + "step": 3128, + "time_per_iteration": 3.1384503841400146 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.05182266, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.171302965646948, + "language_loss": 0.71237707, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73433876, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 3129, + "time_per_iteration": 2.560601234436035 + }, + { + "auxiliary_loss_clip": 0.01149923, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.05224252, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.040923932078449, + "language_loss": 0.85375232, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87576246, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3130, + "time_per_iteration": 2.4366040229797363 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.04844868, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9842347260172397, + "language_loss": 0.77227372, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0, + "step": 3131, + "time_per_iteration": 2.503112554550171 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 1.05402517, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8095346888628816, + "language_loss": 0.81244844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.834436, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.96875, + "step": 3132, + "time_per_iteration": 2.5265986919403076 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.02939904, + "balance_loss_mlp": 1.05395401, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.3595669444771135, + "language_loss": 0.79035556, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81238532, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3133, + "time_per_iteration": 2.500927209854126 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.03224421, + "balance_loss_mlp": 1.05204821, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 4.024150314183157, + "language_loss": 0.82826144, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3134, + "time_per_iteration": 2.4773380756378174 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.05027199, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4735244825899, + "language_loss": 0.82783771, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8497771, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96484375, + "step": 3135, + "time_per_iteration": 2.4957115650177 + }, + { + "auxiliary_loss_clip": 0.01149872, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.03343356, + "balance_loss_mlp": 1.0503304, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8513380433423674, + "language_loss": 0.79031271, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81233823, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9921875, + "step": 3136, + "time_per_iteration": 2.556800127029419 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.05327463, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9366242888645147, + "language_loss": 0.81049621, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 3137, + "time_per_iteration": 2.487513542175293 + }, + { + "auxiliary_loss_clip": 0.01151307, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.05406666, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5870634004860276, + "language_loss": 0.8119483, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83403158, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.97265625, + "step": 3138, + "time_per_iteration": 2.4554855823516846 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.05190897, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.760814692015778, + "language_loss": 0.636096, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6581319, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 3139, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01146092, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.03046215, + "balance_loss_mlp": 1.04812348, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.433795452320061, + "language_loss": 0.71546841, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73742986, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98046875, + "step": 3140, + "time_per_iteration": 2.4519457817077637 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 1.04848385, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.1391974719951574, + "language_loss": 0.87001872, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89196658, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98828125, + "step": 3141, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.02460694, + "balance_loss_mlp": 1.05144691, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.021325930100965, + "language_loss": 0.77418405, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79616946, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0234375, + "step": 3142, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.05104184, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6841374820722228, + "language_loss": 0.78446913, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80637825, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.97265625, + "step": 3143, + "time_per_iteration": 3.9074132442474365 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_clip": 1.03081727, + "balance_loss_mlp": 1.05069065, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 4.1822349926512485, + "language_loss": 0.71507585, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73707104, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 3144, + "time_per_iteration": 3.981715679168701 + }, + { + "auxiliary_loss_clip": 0.01152034, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.0513736, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.6203593578621893, + "language_loss": 0.73683178, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75880861, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3145, + "time_per_iteration": 2.5101706981658936 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.02783298, + "balance_loss_mlp": 1.04759097, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.6756165752276027, + "language_loss": 0.77081764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79271269, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3146, + "time_per_iteration": 2.4278056621551514 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.02811205, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.794796296308648, + "language_loss": 0.78377169, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80571997, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3147, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.0310235, + "balance_loss_mlp": 1.0499115, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.2769360880247853, + "language_loss": 0.67016155, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69212711, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3148, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01145427, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02037382, + "balance_loss_mlp": 1.04898858, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.644784357412393, + "language_loss": 0.75978655, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78161824, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3149, + "time_per_iteration": 2.4768459796905518 + }, + { + "auxiliary_loss_clip": 0.01143839, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.05033517, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.9181295874949735, + "language_loss": 0.81229341, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83420789, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3150, + "time_per_iteration": 2.42832088470459 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02886271, + "balance_loss_mlp": 1.05068374, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.066054594612055, + "language_loss": 0.84966886, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87161517, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3151, + "time_per_iteration": 2.458054542541504 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 1.04896331, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9128881662164896, + "language_loss": 0.7443462, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76635695, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.99609375, + "step": 3152, + "time_per_iteration": 2.4904792308807373 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.02937067, + "balance_loss_mlp": 1.0502255, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8661622565083957, + "language_loss": 0.75719136, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77914143, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3153, + "time_per_iteration": 2.5026283264160156 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.04962945, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.8393709351558127, + "language_loss": 0.79529279, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81725931, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 3154, + "time_per_iteration": 2.4544081687927246 + }, + { + "auxiliary_loss_clip": 0.01151316, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02919698, + "balance_loss_mlp": 1.04986668, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.673670363277482, + "language_loss": 0.72798991, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74998182, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 3155, + "time_per_iteration": 2.425431728363037 + }, + { + "auxiliary_loss_clip": 0.01145009, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.03042662, + "balance_loss_mlp": 1.04930019, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.676026678838244, + "language_loss": 0.73911691, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76105046, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3156, + "time_per_iteration": 2.4683640003204346 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.03281915, + "balance_loss_mlp": 1.05195308, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.5984593201401434, + "language_loss": 0.68251741, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70451397, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9765625, + "step": 3157, + "time_per_iteration": 2.472182512283325 + }, + { + "auxiliary_loss_clip": 0.01146139, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.02539706, + "balance_loss_mlp": 1.04914486, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.9937577865402571, + "language_loss": 0.80197155, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82386756, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3158, + "time_per_iteration": 2.4978723526000977 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02624583, + "balance_loss_mlp": 1.05201745, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.9065090881698699, + "language_loss": 0.71940476, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74138498, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 3159, + "time_per_iteration": 2.503129720687866 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.05255282, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8597759984302606, + "language_loss": 0.85071993, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8727113, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3160, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.03235734, + "balance_loss_mlp": 1.050807, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7120140162377986, + "language_loss": 0.73554128, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75746381, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3161, + "time_per_iteration": 2.5551726818084717 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.02982974, + "balance_loss_mlp": 1.05420387, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.554139282497156, + "language_loss": 0.80939364, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83137655, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3162, + "time_per_iteration": 2.609764337539673 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.02486265, + "balance_loss_mlp": 1.05257571, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 1.8884975109329094, + "language_loss": 0.75600141, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77792686, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3163, + "time_per_iteration": 2.4494824409484863 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 1.05577397, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5980783305445414, + "language_loss": 0.74197054, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76386476, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.94140625, + "step": 3164, + "time_per_iteration": 2.5901739597320557 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.02151656, + "balance_loss_mlp": 1.05402589, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5830796140792522, + "language_loss": 0.66913098, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69101042, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3165, + "time_per_iteration": 2.899500608444214 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.05282831, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1716027754337257, + "language_loss": 0.7452209, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76715726, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3166, + "time_per_iteration": 2.4325685501098633 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01017161, + "balance_loss_clip": 1.01490772, + "balance_loss_mlp": 1.02902174, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8067170187870535, + "language_loss": 0.50396568, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52476352, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.3359375, + "step": 3167, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.05208659, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7496006549093657, + "language_loss": 0.74235475, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76431435, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3168, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01004786, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.02649927, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8615778549663292, + "language_loss": 0.60097563, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6216197, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.33203125, + "step": 3169, + "time_per_iteration": 2.958176851272583 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.03371537, + "balance_loss_mlp": 1.05302989, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.550337238497042, + "language_loss": 0.77976263, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80180222, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.99609375, + "step": 3170, + "time_per_iteration": 2.5174756050109863 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.02960575, + "balance_loss_mlp": 1.05185819, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7193055204742105, + "language_loss": 0.78597021, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80789012, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3171, + "time_per_iteration": 2.4895551204681396 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.05111575, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 3.5246110250440386, + "language_loss": 0.78578937, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80772865, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3172, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.05253482, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.581476317811461, + "language_loss": 0.80126482, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82329178, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3173, + "time_per_iteration": 2.464979410171509 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.03432083, + "balance_loss_mlp": 1.05250478, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9222394249434893, + "language_loss": 0.78740567, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.8094219, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3174, + "time_per_iteration": 2.540959358215332 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.05367374, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8458147293094664, + "language_loss": 0.80757344, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82954776, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3175, + "time_per_iteration": 2.441190481185913 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03821599, + "balance_loss_mlp": 1.0521791, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.3562328324004445, + "language_loss": 0.85142022, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87347412, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3176, + "time_per_iteration": 2.4397072792053223 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.036515, + "balance_loss_mlp": 1.05395234, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.002060812172469, + "language_loss": 0.81206596, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415473, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3177, + "time_per_iteration": 2.4980266094207764 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.03073931, + "balance_loss_mlp": 1.0503974, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 1.9374450898751996, + "language_loss": 0.74628592, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3178, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.05001104, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8429055258583904, + "language_loss": 0.8167876, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83865643, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3179, + "time_per_iteration": 2.452310800552368 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02892482, + "balance_loss_mlp": 1.05279994, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.1508657656276484, + "language_loss": 0.7946887, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81664455, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3180, + "time_per_iteration": 2.451066732406616 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.02770984, + "balance_loss_mlp": 1.04780042, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.916741655382754, + "language_loss": 0.79891652, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82080674, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3181, + "time_per_iteration": 2.4310615062713623 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.03623664, + "balance_loss_mlp": 1.04858851, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7607714952320546, + "language_loss": 0.73820639, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76020634, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3182, + "time_per_iteration": 2.4712350368499756 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.03093314, + "balance_loss_mlp": 1.05187011, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8018319163421928, + "language_loss": 0.6486634, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67063105, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 3183, + "time_per_iteration": 2.440232753753662 + }, + { + "auxiliary_loss_clip": 0.01145449, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.02920759, + "balance_loss_mlp": 1.04864669, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.760716170695104, + "language_loss": 0.73234087, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7542752, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3184, + "time_per_iteration": 3.9211573600769043 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0281471, + "balance_loss_mlp": 1.04738748, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.1066155051108315, + "language_loss": 0.8784132, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9003495, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 3185, + "time_per_iteration": 5.396124601364136 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.02803612, + "balance_loss_mlp": 1.04899192, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.10102369978198, + "language_loss": 0.72667789, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74857807, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3186, + "time_per_iteration": 2.498241901397705 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02244437, + "balance_loss_mlp": 1.054919, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.264264166459479, + "language_loss": 0.83865881, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86061311, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 3187, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01051867, + "auxiliary_loss_mlp": 0.01015636, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01988959, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8634842964488614, + "language_loss": 0.55803859, + "learning_rate": 3.732018351516544e-06, + "loss": 0.5787136, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3188, + "time_per_iteration": 3.0815136432647705 + }, + { + "auxiliary_loss_clip": 0.01145462, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.04972625, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.71302722892552, + "language_loss": 0.70180511, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72381759, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.95703125, + "step": 3189, + "time_per_iteration": 2.5380465984344482 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.04853344, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.222159201352765, + "language_loss": 0.74234986, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76410198, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3190, + "time_per_iteration": 2.5862700939178467 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.04965627, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.8818377537371913, + "language_loss": 0.8394708, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86146975, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3191, + "time_per_iteration": 2.5077905654907227 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.04766488, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7694679756443132, + "language_loss": 0.89325655, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91504252, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3192, + "time_per_iteration": 2.4738776683807373 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.0531472, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.352703418633998, + "language_loss": 0.74830496, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77034831, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9765625, + "step": 3193, + "time_per_iteration": 2.47143816947937 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.04918766, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7283890992056894, + "language_loss": 0.74733245, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7692579, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9765625, + "step": 3194, + "time_per_iteration": 2.5001959800720215 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.00319958, + "balance_loss_mlp": 1.01851392, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7975785668902318, + "language_loss": 0.68455988, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70511955, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3203125, + "step": 3195, + "time_per_iteration": 3.014677047729492 + }, + { + "auxiliary_loss_clip": 0.01146296, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0254823, + "balance_loss_mlp": 1.05066323, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 1.9672517867074575, + "language_loss": 0.72712696, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74902254, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.95703125, + "step": 3196, + "time_per_iteration": 2.4855856895446777 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.0284251, + "balance_loss_mlp": 1.05643284, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8158077484015336, + "language_loss": 0.83774233, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85972691, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.953125, + "step": 3197, + "time_per_iteration": 2.4530181884765625 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02565312, + "balance_loss_mlp": 1.05036283, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.295881830513264, + "language_loss": 0.80459738, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82650983, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3198, + "time_per_iteration": 2.4882590770721436 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01050243, + "balance_loss_clip": 1.03090763, + "balance_loss_mlp": 1.04984999, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9800701307051174, + "language_loss": 0.7862891, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80827522, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3199, + "time_per_iteration": 2.507227659225464 + }, + { + "auxiliary_loss_clip": 0.01146428, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.05150342, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.05190707233933, + "language_loss": 0.83391261, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85580671, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.94921875, + "step": 3200, + "time_per_iteration": 2.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01051054, + "balance_loss_clip": 1.03286231, + "balance_loss_mlp": 1.0524931, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.0233550639398428, + "language_loss": 0.78678542, + "learning_rate": 3.729481161172443e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.95703125, + "step": 3201, + "time_per_iteration": 2.435478448867798 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.02874875, + "balance_loss_mlp": 1.05050445, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.1716175760371814, + "language_loss": 0.69168961, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71364617, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3202, + "time_per_iteration": 2.4596354961395264 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.01790023, + "balance_loss_mlp": 1.05140352, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7015130302687178, + "language_loss": 0.91123176, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93303871, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3203, + "time_per_iteration": 2.4425902366638184 + }, + { + "auxiliary_loss_clip": 0.01147002, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.03127956, + "balance_loss_mlp": 1.05008471, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.129263396651385, + "language_loss": 0.81766933, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83964062, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96875, + "step": 3204, + "time_per_iteration": 2.4466230869293213 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.03022599, + "balance_loss_mlp": 1.0497942, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.065510679734303, + "language_loss": 0.75797462, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77988648, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3205, + "time_per_iteration": 2.439906358718872 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02958953, + "balance_loss_mlp": 1.05312991, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.4125731541540465, + "language_loss": 0.83020669, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85218459, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 3206, + "time_per_iteration": 2.463888168334961 + }, + { + "auxiliary_loss_clip": 0.0104957, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.01731467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8499440783854421, + "language_loss": 0.60609913, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62663066, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 3207, + "time_per_iteration": 2.8865902423858643 + }, + { + "auxiliary_loss_clip": 0.01147085, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.02789569, + "balance_loss_mlp": 1.05069125, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4047527057594564, + "language_loss": 0.75119245, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77312136, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3208, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01146825, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.04890394, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.3372356299161696, + "language_loss": 0.60567236, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62762815, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3209, + "time_per_iteration": 2.4695677757263184 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.04981887, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9457412312791633, + "language_loss": 0.80153656, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82352048, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 3210, + "time_per_iteration": 2.6459405422210693 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.02437103, + "balance_loss_mlp": 1.04580569, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.107646167575127, + "language_loss": 0.82575119, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84755266, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3211, + "time_per_iteration": 2.454702615737915 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01012694, + "balance_loss_clip": 1.01057243, + "balance_loss_mlp": 1.01463401, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9758169311408023, + "language_loss": 0.63670558, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729511, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.31640625, + "step": 3212, + "time_per_iteration": 2.914459705352783 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.05140018, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5978218597026725, + "language_loss": 0.76514798, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78707075, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3213, + "time_per_iteration": 2.47961163520813 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.02823281, + "balance_loss_mlp": 1.04934072, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.5461953882780115, + "language_loss": 0.70799339, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72993791, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98046875, + "step": 3214, + "time_per_iteration": 2.4547488689422607 + }, + { + "auxiliary_loss_clip": 0.01142593, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.02808392, + "balance_loss_mlp": 1.0470041, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2459266127411848, + "language_loss": 0.75352395, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77541864, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3215, + "time_per_iteration": 2.4477176666259766 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.04626155, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.304207478946857, + "language_loss": 0.88559556, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90752971, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3216, + "time_per_iteration": 2.499464988708496 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.0474, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.5978066249985532, + "language_loss": 0.79762065, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8195231, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3217, + "time_per_iteration": 2.4428889751434326 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0294652, + "balance_loss_mlp": 1.0504688, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.6606972104147673, + "language_loss": 0.61408496, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63605893, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3218, + "time_per_iteration": 2.4313230514526367 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04883909, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.6811153728366703, + "language_loss": 0.80158418, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82342821, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3219, + "time_per_iteration": 2.4347593784332275 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.03010237, + "balance_loss_mlp": 1.05070114, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.032012314604138, + "language_loss": 0.85781908, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87976086, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3220, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.0477736, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.087292049011103, + "language_loss": 0.84617937, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86794209, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3221, + "time_per_iteration": 2.4601354598999023 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.05009556, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.075109928662421, + "language_loss": 0.85929954, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88121927, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3222, + "time_per_iteration": 2.433027505874634 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.02212656, + "balance_loss_mlp": 1.04663789, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 3.9278404759018053, + "language_loss": 0.78207982, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3223, + "time_per_iteration": 2.4451496601104736 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01047584, + "balance_loss_clip": 1.03013206, + "balance_loss_mlp": 1.04896808, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8200574771064912, + "language_loss": 0.75589085, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77776659, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3224, + "time_per_iteration": 2.4390981197357178 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.02274644, + "balance_loss_mlp": 1.04741263, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.092202382915022, + "language_loss": 0.71141279, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73321629, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3225, + "time_per_iteration": 2.6690707206726074 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.0279572, + "balance_loss_mlp": 1.04787326, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.058354492672399, + "language_loss": 0.6915803, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71344984, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9453125, + "step": 3226, + "time_per_iteration": 3.906217336654663 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.02562809, + "balance_loss_mlp": 1.05274427, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6131772564475266, + "language_loss": 0.76138854, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78327405, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 3227, + "time_per_iteration": 4.168737411499023 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.02547467, + "balance_loss_mlp": 1.04588878, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8539897665707572, + "language_loss": 0.69154215, + "learning_rate": 3.724176216414662e-06, + "loss": 0.7133761, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94921875, + "step": 3228, + "time_per_iteration": 2.4857404232025146 + }, + { + "auxiliary_loss_clip": 0.01142054, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02698135, + "balance_loss_mlp": 1.04929864, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9069922854616745, + "language_loss": 0.7428174, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76467812, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3229, + "time_per_iteration": 2.5357918739318848 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04832351, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6963766145995596, + "language_loss": 0.65157712, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67341059, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3230, + "time_per_iteration": 2.4796855449676514 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.0268054, + "balance_loss_mlp": 1.04652202, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8877471342298004, + "language_loss": 0.8184334, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84025759, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3231, + "time_per_iteration": 2.5315961837768555 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.02045608, + "balance_loss_mlp": 1.05067456, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.787689187471357, + "language_loss": 0.86743605, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88928306, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94921875, + "step": 3232, + "time_per_iteration": 2.4916152954101562 + }, + { + "auxiliary_loss_clip": 0.01141636, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.05008495, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5602267859616314, + "language_loss": 0.8513217, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87326247, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3233, + "time_per_iteration": 2.526118040084839 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.03603804, + "balance_loss_mlp": 1.04827857, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6631942166294669, + "language_loss": 0.89191484, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91390419, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96875, + "step": 3234, + "time_per_iteration": 2.4783849716186523 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.04675341, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.1776085062187374, + "language_loss": 0.78503513, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80690718, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3235, + "time_per_iteration": 2.4414284229278564 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.02545178, + "balance_loss_mlp": 1.05288744, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.115791514531618, + "language_loss": 0.7937218, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81560451, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.93359375, + "step": 3236, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02204323, + "balance_loss_mlp": 1.05156302, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.445233321344346, + "language_loss": 0.75936478, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78121042, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9296875, + "step": 3237, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01147227, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.03005719, + "balance_loss_mlp": 1.05079889, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.0921387862929586, + "language_loss": 0.75056225, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77250135, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96484375, + "step": 3238, + "time_per_iteration": 2.4795806407928467 + }, + { + "auxiliary_loss_clip": 0.01147117, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.05317962, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8233855681516762, + "language_loss": 0.73016453, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75208122, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94140625, + "step": 3239, + "time_per_iteration": 2.4695816040039062 + }, + { + "auxiliary_loss_clip": 0.01144581, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.03026247, + "balance_loss_mlp": 1.0505631, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.897973355517785, + "language_loss": 0.73792124, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75985241, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3240, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.05221701, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8797415358152445, + "language_loss": 0.66685343, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68873608, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94140625, + "step": 3241, + "time_per_iteration": 2.5644116401672363 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0280863, + "balance_loss_mlp": 1.05193758, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4346271942222966, + "language_loss": 0.82889283, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85078967, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3242, + "time_per_iteration": 2.476043701171875 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01010683, + "balance_loss_clip": 1.00856066, + "balance_loss_mlp": 1.02379096, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8482804620416572, + "language_loss": 0.57572454, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59637845, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.30859375, + "step": 3243, + "time_per_iteration": 3.1217525005340576 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.05099249, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.02063631868758, + "language_loss": 0.83243412, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85431218, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3244, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.05495024, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8275576625869878, + "language_loss": 0.77049786, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79245341, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3245, + "time_per_iteration": 2.5539040565490723 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.04852772, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.8639596298576055, + "language_loss": 0.84020388, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86203486, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3246, + "time_per_iteration": 2.5018341541290283 + }, + { + "auxiliary_loss_clip": 0.0114444, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.02834511, + "balance_loss_mlp": 1.04978824, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1267063345385777, + "language_loss": 0.7636531, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78555036, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9453125, + "step": 3247, + "time_per_iteration": 2.4512898921966553 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02228367, + "balance_loss_mlp": 1.05077446, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.4744510548582124, + "language_loss": 0.75330198, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77513552, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3248, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.04661679, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.9881324270373204, + "language_loss": 0.78316575, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80499399, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3249, + "time_per_iteration": 2.475385904312134 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.0316205, + "balance_loss_mlp": 1.04948914, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.839405294960197, + "language_loss": 0.73238158, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7543031, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3250, + "time_per_iteration": 2.4548323154449463 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02206779, + "balance_loss_mlp": 1.04583359, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9014920395959154, + "language_loss": 0.79582441, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8175652, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3251, + "time_per_iteration": 2.4597084522247314 + }, + { + "auxiliary_loss_clip": 0.01142571, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.04888558, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.2143497379473613, + "language_loss": 0.83534026, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85717964, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3252, + "time_per_iteration": 2.4245967864990234 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.03026652, + "balance_loss_mlp": 1.04651105, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7806404718622555, + "language_loss": 0.73870194, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76062191, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3253, + "time_per_iteration": 2.5752809047698975 + }, + { + "auxiliary_loss_clip": 0.01142809, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.02352846, + "balance_loss_mlp": 1.04619944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.833285648050628, + "language_loss": 0.76684111, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78867137, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.96484375, + "step": 3254, + "time_per_iteration": 2.533993721008301 + }, + { + "auxiliary_loss_clip": 0.01044914, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00268674, + "balance_loss_mlp": 1.01349974, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7652407497357797, + "language_loss": 0.55344874, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5739454, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.3125, + "step": 3255, + "time_per_iteration": 3.164173126220703 + }, + { + "auxiliary_loss_clip": 0.01144973, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02407217, + "balance_loss_mlp": 1.05057478, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.650975615707017, + "language_loss": 0.7066443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7285077, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3256, + "time_per_iteration": 2.496424436569214 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.04647136, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.256610935254856, + "language_loss": 0.80055118, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82237899, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3257, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05034149, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.9567741269254724, + "language_loss": 0.74843282, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77029151, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3258, + "time_per_iteration": 2.6177120208740234 + }, + { + "auxiliary_loss_clip": 0.01142767, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.01932144, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7410781544458231, + "language_loss": 0.74462247, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7664147, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3259, + "time_per_iteration": 2.54068660736084 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01923943, + "balance_loss_mlp": 1.04965675, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.380592438675979, + "language_loss": 0.77040654, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7922256, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3260, + "time_per_iteration": 2.4983303546905518 + }, + { + "auxiliary_loss_clip": 0.01143361, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.02459061, + "balance_loss_mlp": 1.0486325, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.011568492365706, + "language_loss": 0.82168972, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84354162, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3261, + "time_per_iteration": 2.52164626121521 + }, + { + "auxiliary_loss_clip": 0.01144228, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02510428, + "balance_loss_mlp": 1.05130327, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 2.1812525814986112, + "language_loss": 0.76691413, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78878343, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 3262, + "time_per_iteration": 2.513619899749756 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.05290008, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7175684177653927, + "language_loss": 0.8667773, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88867593, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3263, + "time_per_iteration": 2.49373459815979 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.04784787, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5660143494742738, + "language_loss": 0.74136549, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76319206, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9296875, + "step": 3264, + "time_per_iteration": 2.4891843795776367 + }, + { + "auxiliary_loss_clip": 0.0114591, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.03340793, + "balance_loss_mlp": 1.05435038, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 4.0742741532711975, + "language_loss": 0.78590196, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8078593, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3265, + "time_per_iteration": 2.4226529598236084 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01014864, + "balance_loss_clip": 1.01292133, + "balance_loss_mlp": 1.01652646, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7852387786228787, + "language_loss": 0.53459084, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55521357, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.30859375, + "step": 3266, + "time_per_iteration": 3.0519652366638184 + }, + { + "auxiliary_loss_clip": 0.01145434, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02263319, + "balance_loss_mlp": 1.04800785, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9728388819613873, + "language_loss": 0.80503136, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82690066, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3267, + "time_per_iteration": 2.436455011367798 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.02726591, + "balance_loss_mlp": 1.04780269, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.528633756775916, + "language_loss": 0.87031806, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89213896, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91015625, + "step": 3268, + "time_per_iteration": 5.348580360412598 + }, + { + "auxiliary_loss_clip": 0.01141651, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02865744, + "balance_loss_mlp": 1.04996669, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.7845337804652086, + "language_loss": 0.69331455, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71518886, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3269, + "time_per_iteration": 3.9386346340179443 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.03361702, + "balance_loss_mlp": 1.0530045, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 2.4386480468071086, + "language_loss": 0.80760634, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82960677, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3270, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.04726839, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.857854204827715, + "language_loss": 0.83918732, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86103886, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3271, + "time_per_iteration": 2.4522581100463867 + }, + { + "auxiliary_loss_clip": 0.01139583, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.0297302, + "balance_loss_mlp": 1.04943895, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.1376155358713835, + "language_loss": 0.80162311, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82348382, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 3272, + "time_per_iteration": 2.4968738555908203 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02766371, + "balance_loss_mlp": 1.05075002, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7855512393811417, + "language_loss": 0.80728978, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82919937, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3273, + "time_per_iteration": 2.525407552719116 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.04807115, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 5.081990879764466, + "language_loss": 0.7791425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80108881, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3274, + "time_per_iteration": 2.527858018875122 + }, + { + "auxiliary_loss_clip": 0.01141542, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03440571, + "balance_loss_mlp": 1.04765558, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.1984029701042367, + "language_loss": 0.81144857, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83338642, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9375, + "step": 3275, + "time_per_iteration": 2.451392412185669 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.02934027, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.90284229785688, + "language_loss": 0.81104618, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83295637, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3276, + "time_per_iteration": 2.462033748626709 + }, + { + "auxiliary_loss_clip": 0.01142306, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.02132106, + "balance_loss_mlp": 1.04889154, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.0909421048868126, + "language_loss": 0.89347923, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91528654, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3277, + "time_per_iteration": 2.4887003898620605 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02804041, + "balance_loss_mlp": 1.04832077, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.9974095646387573, + "language_loss": 0.62265754, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64459741, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3278, + "time_per_iteration": 2.560401201248169 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.03562284, + "balance_loss_mlp": 1.04910243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 3.1131920881239936, + "language_loss": 0.73664343, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75863284, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3279, + "time_per_iteration": 2.5036048889160156 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02816486, + "balance_loss_mlp": 1.04906511, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6310774806952162, + "language_loss": 0.82451236, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84641075, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.95703125, + "step": 3280, + "time_per_iteration": 2.499962091445923 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.05086279, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.847926035637751, + "language_loss": 0.77581155, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79770064, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3281, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.0507971, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.593504057665797, + "language_loss": 0.79502213, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81686652, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3282, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.0279994, + "balance_loss_mlp": 1.05359089, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.157912578421005, + "language_loss": 0.71937042, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7413193, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3283, + "time_per_iteration": 2.5070157051086426 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.04858577, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.741034644212953, + "language_loss": 0.78832877, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81017548, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3284, + "time_per_iteration": 2.436530113220215 + }, + { + "auxiliary_loss_clip": 0.01147439, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.02952087, + "balance_loss_mlp": 1.05069387, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0622477624774325, + "language_loss": 0.86366653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88561547, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96875, + "step": 3285, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.01143401, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02571201, + "balance_loss_mlp": 1.0520879, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.7361177014734372, + "language_loss": 0.88680863, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90866709, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3286, + "time_per_iteration": 2.472475290298462 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.03036189, + "balance_loss_mlp": 1.05260301, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2372981039860833, + "language_loss": 0.78297567, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80495083, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3287, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02674246, + "balance_loss_mlp": 1.04974318, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.197025185749627, + "language_loss": 0.81252837, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83444452, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96484375, + "step": 3288, + "time_per_iteration": 2.4107155799865723 + }, + { + "auxiliary_loss_clip": 0.01139417, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.04890108, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7615970311636253, + "language_loss": 0.72502065, + "learning_rate": 3.712015717627374e-06, + "loss": 0.74691164, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3289, + "time_per_iteration": 2.4479291439056396 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.0500598, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.0523474932115833, + "language_loss": 0.7944051, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81629974, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3290, + "time_per_iteration": 2.499950408935547 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.01002976, + "balance_loss_clip": 1.00056827, + "balance_loss_mlp": 1.01336336, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9098407078047199, + "language_loss": 0.60440773, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62489194, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.3203125, + "step": 3291, + "time_per_iteration": 3.1538305282592773 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.02639592, + "balance_loss_mlp": 1.04670751, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.151168561582294, + "language_loss": 0.81352198, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83541822, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3292, + "time_per_iteration": 2.539417028427124 + }, + { + "auxiliary_loss_clip": 0.01137712, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03051507, + "balance_loss_mlp": 1.04855824, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.212806192124084, + "language_loss": 0.82146955, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84332335, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 3293, + "time_per_iteration": 2.438809394836426 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.02988923, + "balance_loss_mlp": 1.05333924, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.10438249616411, + "language_loss": 0.61268854, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63468528, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3294, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01143209, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.02681279, + "balance_loss_mlp": 1.05004907, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.028666267444235, + "language_loss": 0.86983609, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89170212, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3295, + "time_per_iteration": 2.416771411895752 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.04786801, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 13.771873008268457, + "language_loss": 0.80491048, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82684338, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9375, + "step": 3296, + "time_per_iteration": 2.450934648513794 + }, + { + "auxiliary_loss_clip": 0.01145402, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.02637851, + "balance_loss_mlp": 1.0482688, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.0804115334054134, + "language_loss": 0.68406892, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70597816, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.97265625, + "step": 3297, + "time_per_iteration": 2.5111610889434814 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.02440548, + "balance_loss_mlp": 1.04895413, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7575465421519259, + "language_loss": 0.81232154, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83411407, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 3298, + "time_per_iteration": 2.472025156021118 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.02634668, + "balance_loss_mlp": 1.05001056, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.343960149367745, + "language_loss": 0.85115641, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.9765625, + "step": 3299, + "time_per_iteration": 2.4725356101989746 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.00097358, + "balance_loss_mlp": 1.0131526, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7731212371218976, + "language_loss": 0.53215671, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55264044, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3300, + "time_per_iteration": 3.004054069519043 + }, + { + "auxiliary_loss_clip": 0.01142157, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.04772329, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6138936044346288, + "language_loss": 0.73150593, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75344324, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9453125, + "step": 3301, + "time_per_iteration": 2.4547884464263916 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.02191293, + "balance_loss_mlp": 1.04811358, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.437382428027231, + "language_loss": 0.88445318, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90624458, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3302, + "time_per_iteration": 2.429579019546509 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.02940273, + "balance_loss_mlp": 1.04750872, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.9503370408087137, + "language_loss": 0.73907369, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76096445, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3303, + "time_per_iteration": 2.627835273742676 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.04874539, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8172241344194675, + "language_loss": 0.74761099, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76950562, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3304, + "time_per_iteration": 2.551241397857666 + }, + { + "auxiliary_loss_clip": 0.01139854, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02333784, + "balance_loss_mlp": 1.04763281, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 2.605019982075021, + "language_loss": 0.85717452, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.87896717, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3305, + "time_per_iteration": 2.432363986968994 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.04600525, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.7555780714506408, + "language_loss": 0.68014234, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70195889, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.953125, + "step": 3306, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01137681, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.02098584, + "balance_loss_mlp": 1.0453912, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4744708200758283, + "language_loss": 0.76455241, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.921875, + "step": 3307, + "time_per_iteration": 2.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.02520776, + "balance_loss_mlp": 1.04866791, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8666050855147507, + "language_loss": 0.75933248, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78115153, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3308, + "time_per_iteration": 2.483060121536255 + }, + { + "auxiliary_loss_clip": 0.01141228, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.02426159, + "balance_loss_mlp": 1.04736626, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6368693105847256, + "language_loss": 0.75640005, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7782228, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94140625, + "step": 3309, + "time_per_iteration": 3.8069632053375244 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.02869844, + "balance_loss_mlp": 1.04665506, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.6858420956549012, + "language_loss": 0.87646699, + "learning_rate": 3.707773333313917e-06, + "loss": 0.8983165, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9140625, + "step": 3310, + "time_per_iteration": 3.9299721717834473 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.02431977, + "balance_loss_mlp": 1.04637599, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.6845239503362412, + "language_loss": 0.64166129, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66346431, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3311, + "time_per_iteration": 2.5747337341308594 + }, + { + "auxiliary_loss_clip": 0.01143032, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.02559805, + "balance_loss_mlp": 1.04768658, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.462607887220823, + "language_loss": 0.74053729, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.953125, + "step": 3312, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.03060961, + "balance_loss_mlp": 1.04843581, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2841450786746016, + "language_loss": 0.83511955, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8569997, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3313, + "time_per_iteration": 2.4846627712249756 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.04944849, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.438256379955746, + "language_loss": 0.80930895, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83115256, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3314, + "time_per_iteration": 2.525754928588867 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.0280745, + "balance_loss_mlp": 1.04706359, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5137591341622172, + "language_loss": 0.87549174, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89729953, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3315, + "time_per_iteration": 2.5170931816101074 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.03032112, + "balance_loss_mlp": 1.04808092, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5984895942740787, + "language_loss": 0.71255141, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73443246, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3316, + "time_per_iteration": 2.520071029663086 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02646089, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8439111854473917, + "language_loss": 0.66260874, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68341696, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.33203125, + "step": 3317, + "time_per_iteration": 3.1460416316986084 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01052969, + "balance_loss_clip": 1.03557682, + "balance_loss_mlp": 1.04575253, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.672944172124665, + "language_loss": 0.74319738, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76515001, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3318, + "time_per_iteration": 2.6139748096466064 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.04536486, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.900050251198073, + "language_loss": 0.78860074, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81038487, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.89453125, + "step": 3319, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.04806578, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0962453666662073, + "language_loss": 0.75462162, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.93359375, + "step": 3320, + "time_per_iteration": 2.739485263824463 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02613819, + "balance_loss_mlp": 1.04714417, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.167317842134812, + "language_loss": 0.80547488, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3321, + "time_per_iteration": 2.581353187561035 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.01003433, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.01694489, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.880630206553271, + "language_loss": 0.65178835, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67231572, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.32421875, + "step": 3322, + "time_per_iteration": 2.9042704105377197 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 1.01724231, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7916622121471568, + "language_loss": 0.56975091, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028506, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.328125, + "step": 3323, + "time_per_iteration": 3.2141411304473877 + }, + { + "auxiliary_loss_clip": 0.01139547, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.04839373, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.9849201654975537, + "language_loss": 0.80526733, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82701647, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3324, + "time_per_iteration": 2.5455262660980225 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.04540765, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.8681208438308643, + "language_loss": 0.53681695, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55859387, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91796875, + "step": 3325, + "time_per_iteration": 2.581782102584839 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.02337289, + "balance_loss_mlp": 1.04565668, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0672953846254027, + "language_loss": 0.86169922, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88347936, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3326, + "time_per_iteration": 2.494718551635742 + }, + { + "auxiliary_loss_clip": 0.01138244, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.04851878, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8653522915536895, + "language_loss": 0.71835959, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74012172, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3327, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02368546, + "balance_loss_mlp": 1.04750776, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.83111198959611, + "language_loss": 0.76588571, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78772372, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3328, + "time_per_iteration": 2.5083916187286377 + }, + { + "auxiliary_loss_clip": 0.01698253, + "auxiliary_loss_mlp": 0.01552284, + "balance_loss_clip": 1.52980089, + "balance_loss_mlp": 1.56677365, + "epoch": 0.20015030813166992, + "flos": 28106162236800.0, + "grad_norm": 1.6482454448342019, + "language_loss": 1.03044438, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.7143048, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3125, + "step": 3329, + "time_per_iteration": 15.37552785873413 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.03149772, + "balance_loss_mlp": 1.0504123, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.5519947176183269, + "language_loss": 0.81297028, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.8349371, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9609375, + "step": 3330, + "time_per_iteration": 2.500103712081909 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.01994956, + "balance_loss_mlp": 1.04669356, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.032272994312633, + "language_loss": 0.76649368, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78827626, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3331, + "time_per_iteration": 2.4018712043762207 + }, + { + "auxiliary_loss_clip": 0.01141733, + "auxiliary_loss_mlp": 0.01045779, + "balance_loss_clip": 1.02819538, + "balance_loss_mlp": 1.04608667, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.6582018653132529, + "language_loss": 0.79261309, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81448817, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3332, + "time_per_iteration": 2.4550859928131104 + }, + { + "auxiliary_loss_clip": 0.01045684, + "auxiliary_loss_mlp": 0.01005368, + "balance_loss_clip": 1.0036391, + "balance_loss_mlp": 1.01433849, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9315137515082259, + "language_loss": 0.61990142, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64041197, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.31445312, + "step": 3333, + "time_per_iteration": 2.9623756408691406 + }, + { + "auxiliary_loss_clip": 0.01139681, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.04501462, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1372355522021893, + "language_loss": 0.81203878, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.8338846, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9453125, + "step": 3334, + "time_per_iteration": 2.49924373626709 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_clip": 1.02938735, + "balance_loss_mlp": 1.04878521, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.1564721635267516, + "language_loss": 0.74261904, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76455814, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3335, + "time_per_iteration": 2.634608745574951 + }, + { + "auxiliary_loss_clip": 0.01150022, + "auxiliary_loss_mlp": 0.01058924, + "balance_loss_clip": 1.04205632, + "balance_loss_mlp": 1.05375338, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6943946878944693, + "language_loss": 0.79839814, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82048762, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3336, + "time_per_iteration": 2.7025394439697266 + }, + { + "auxiliary_loss_clip": 0.01145798, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.02744317, + "balance_loss_mlp": 1.04703879, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9043375292422164, + "language_loss": 0.78031212, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80223, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 3337, + "time_per_iteration": 2.5718014240264893 + }, + { + "auxiliary_loss_clip": 0.01143827, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_clip": 1.02708244, + "balance_loss_mlp": 1.0486424, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 1.9983960159800889, + "language_loss": 0.6873948, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.70928884, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94921875, + "step": 3338, + "time_per_iteration": 2.5848047733306885 + }, + { + "auxiliary_loss_clip": 0.01143098, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.03263819, + "balance_loss_mlp": 1.04853702, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.1061075345379576, + "language_loss": 0.68823779, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71016049, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9453125, + "step": 3339, + "time_per_iteration": 2.523771047592163 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.02217603, + "balance_loss_mlp": 1.05197799, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.3080693694415872, + "language_loss": 0.66263533, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68451655, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9765625, + "step": 3340, + "time_per_iteration": 2.647495985031128 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.01570475, + "balance_loss_mlp": 1.0457145, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.8472305033219696, + "language_loss": 0.74124628, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76300496, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9609375, + "step": 3341, + "time_per_iteration": 2.511585235595703 + }, + { + "auxiliary_loss_clip": 0.01140464, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.02689481, + "balance_loss_mlp": 1.04846787, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.1698717951472326, + "language_loss": 0.71578503, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73762101, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3342, + "time_per_iteration": 2.561998128890991 + }, + { + "auxiliary_loss_clip": 0.01142187, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.02871895, + "balance_loss_mlp": 1.04746354, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.9864957062525024, + "language_loss": 0.73130047, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75317556, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3343, + "time_per_iteration": 4.046127557754517 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01050047, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.04738092, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9813453341923526, + "language_loss": 0.81026411, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83218634, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94921875, + "step": 3344, + "time_per_iteration": 2.520765542984009 + }, + { + "auxiliary_loss_clip": 0.01141139, + "auxiliary_loss_mlp": 0.01050367, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.04661858, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.7491478080862684, + "language_loss": 0.83503234, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85694736, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3345, + "time_per_iteration": 4.064355373382568 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.02689624, + "balance_loss_mlp": 1.0464828, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.723487885242635, + "language_loss": 0.67909771, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70086718, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.88671875, + "step": 3346, + "time_per_iteration": 2.521949291229248 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01048866, + "balance_loss_clip": 1.03233206, + "balance_loss_mlp": 1.04726124, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.272845003166824, + "language_loss": 0.73496711, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75686157, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3347, + "time_per_iteration": 2.5316877365112305 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_clip": 1.03179908, + "balance_loss_mlp": 1.04827023, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.7467826588499227, + "language_loss": 0.86716485, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88904649, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.92578125, + "step": 3348, + "time_per_iteration": 2.5123202800750732 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.02335036, + "balance_loss_mlp": 1.04729295, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.5886148695932183, + "language_loss": 0.71200913, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73381227, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 3349, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.01144556, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.03016067, + "balance_loss_mlp": 1.04982185, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.649154800785762, + "language_loss": 0.71079665, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73272741, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9453125, + "step": 3350, + "time_per_iteration": 2.4927315711975098 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02612305, + "balance_loss_mlp": 1.05045485, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 3.2873247390310554, + "language_loss": 0.76327842, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78518331, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.953125, + "step": 3351, + "time_per_iteration": 2.5077342987060547 + }, + { + "auxiliary_loss_clip": 0.01146641, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.02555871, + "balance_loss_mlp": 1.05069637, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.662758000066145, + "language_loss": 0.80545723, + "learning_rate": 3.699202960155748e-06, + "loss": 0.8273598, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3352, + "time_per_iteration": 2.5717766284942627 + }, + { + "auxiliary_loss_clip": 0.01146315, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.05210721, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7179856660366186, + "language_loss": 0.8027631, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82462192, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3353, + "time_per_iteration": 2.6415467262268066 + }, + { + "auxiliary_loss_clip": 0.01140403, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.02512455, + "balance_loss_mlp": 1.04978478, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.050762039112588, + "language_loss": 0.8946988, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91651917, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 3354, + "time_per_iteration": 2.4780237674713135 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01011403, + "balance_loss_clip": 1.00948358, + "balance_loss_mlp": 1.0202148, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.830112597874188, + "language_loss": 0.55839282, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57902759, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.31835938, + "step": 3355, + "time_per_iteration": 3.0224292278289795 + }, + { + "auxiliary_loss_clip": 0.01140957, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02891648, + "balance_loss_mlp": 1.05068707, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5257876958196368, + "language_loss": 0.84076762, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86262929, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3356, + "time_per_iteration": 2.510615348815918 + }, + { + "auxiliary_loss_clip": 0.01152963, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.05356848, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.9361880537925584, + "language_loss": 0.688007, + "learning_rate": 3.698175095398085e-06, + "loss": 0.70997024, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 3357, + "time_per_iteration": 2.460022211074829 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.02280617, + "balance_loss_mlp": 1.0492487, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7490617907772006, + "language_loss": 0.71748042, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73933733, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3358, + "time_per_iteration": 2.563767194747925 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01054955, + "balance_loss_clip": 1.03818202, + "balance_loss_mlp": 1.04849517, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.042998238377631, + "language_loss": 0.83104217, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85298896, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3359, + "time_per_iteration": 2.531332015991211 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01001012, + "balance_loss_clip": 0.99911654, + "balance_loss_mlp": 1.02214265, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 12.853939959466139, + "language_loss": 0.5895561, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61009508, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30859375, + "step": 3360, + "time_per_iteration": 3.0536341667175293 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.03117216, + "balance_loss_mlp": 1.05149043, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.4416015649532286, + "language_loss": 0.62138069, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64334983, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3361, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0114522, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.05156183, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 2.0025961231737526, + "language_loss": 0.75524926, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77726126, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3362, + "time_per_iteration": 2.555492639541626 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.02926481, + "balance_loss_mlp": 1.05209327, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6135185744423872, + "language_loss": 0.76400363, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78592181, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9375, + "step": 3363, + "time_per_iteration": 2.486969470977783 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_clip": 1.03686023, + "balance_loss_mlp": 1.04736471, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 2.0495916908721434, + "language_loss": 0.74606001, + "learning_rate": 3.696733380367391e-06, + "loss": 0.76800275, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9375, + "step": 3364, + "time_per_iteration": 2.58673095703125 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.03390145, + "balance_loss_mlp": 1.04865253, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.1992700083841084, + "language_loss": 0.71451771, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73647857, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3365, + "time_per_iteration": 2.522470712661743 + }, + { + "auxiliary_loss_clip": 0.01147339, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.03516757, + "balance_loss_mlp": 1.05331004, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.9561618637344158, + "language_loss": 0.85770535, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87970054, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94140625, + "step": 3366, + "time_per_iteration": 2.536529541015625 + }, + { + "auxiliary_loss_clip": 0.01143453, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.0499506, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.628387041142295, + "language_loss": 0.69651556, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7183941, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3367, + "time_per_iteration": 2.5608372688293457 + }, + { + "auxiliary_loss_clip": 0.01145892, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.0235498, + "balance_loss_mlp": 1.04696274, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.963599898430263, + "language_loss": 0.68230569, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70419657, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3368, + "time_per_iteration": 2.66802978515625 + }, + { + "auxiliary_loss_clip": 0.01143607, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.03178596, + "balance_loss_mlp": 1.0505259, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 7.849671101524798, + "language_loss": 0.77025628, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79218459, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3369, + "time_per_iteration": 2.5143446922302246 + }, + { + "auxiliary_loss_clip": 0.01145287, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.04029489, + "balance_loss_mlp": 1.04800487, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 4.298107611861754, + "language_loss": 0.65408337, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67610943, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3370, + "time_per_iteration": 2.503589630126953 + }, + { + "auxiliary_loss_clip": 0.01048919, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.02313519, + "balance_loss_mlp": 1.01856685, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6799262329378595, + "language_loss": 0.58101869, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60175562, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.3046875, + "step": 3371, + "time_per_iteration": 3.1626369953155518 + }, + { + "auxiliary_loss_clip": 0.01143688, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.0254668, + "balance_loss_mlp": 1.04866266, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.766606164011739, + "language_loss": 0.92068136, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94254309, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3372, + "time_per_iteration": 2.578045129776001 + }, + { + "auxiliary_loss_clip": 0.0114549, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.05037856, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.6491924635250923, + "language_loss": 0.78632712, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80822217, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 3373, + "time_per_iteration": 2.5762507915496826 + }, + { + "auxiliary_loss_clip": 0.01137806, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.04629672, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.8751465027713456, + "language_loss": 0.71102971, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73280156, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3374, + "time_per_iteration": 2.6212260723114014 + }, + { + "auxiliary_loss_clip": 0.01048807, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00001132, + "balance_loss_mlp": 1.01811993, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9912238676598704, + "language_loss": 0.62450445, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64501071, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.30859375, + "step": 3375, + "time_per_iteration": 3.0768048763275146 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.02722621, + "balance_loss_mlp": 1.04769731, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.6669967725054042, + "language_loss": 0.82450807, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84635985, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3376, + "time_per_iteration": 2.5632758140563965 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.02110839, + "balance_loss_mlp": 1.04692364, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.2640770034372006, + "language_loss": 0.81587797, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83771032, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3377, + "time_per_iteration": 2.6376402378082275 + }, + { + "auxiliary_loss_clip": 0.01139097, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.02786779, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 4.046949512949318, + "language_loss": 0.769104, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79095268, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3378, + "time_per_iteration": 2.532942056655884 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.02493691, + "balance_loss_mlp": 1.04772687, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9870266088444717, + "language_loss": 0.79710048, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81896979, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3379, + "time_per_iteration": 2.5187509059906006 + }, + { + "auxiliary_loss_clip": 0.01137083, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.02162337, + "balance_loss_mlp": 1.04698288, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7003196517483214, + "language_loss": 0.86949915, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.89125347, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3380, + "time_per_iteration": 2.5350420475006104 + }, + { + "auxiliary_loss_clip": 0.01143485, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.02905154, + "balance_loss_mlp": 1.05103135, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 1.9133898096862498, + "language_loss": 0.74515057, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76705158, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3381, + "time_per_iteration": 2.5428466796875 + }, + { + "auxiliary_loss_clip": 0.01143807, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.04754519, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.741042372938858, + "language_loss": 0.79304886, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81492472, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3382, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.01146625, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.02123427, + "balance_loss_mlp": 1.04849267, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.8514394244027284, + "language_loss": 0.80188596, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82376015, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3383, + "time_per_iteration": 2.5047500133514404 + }, + { + "auxiliary_loss_clip": 0.0113964, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02401257, + "balance_loss_mlp": 1.04616201, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 6.482166974991387, + "language_loss": 0.74195492, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76377177, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3384, + "time_per_iteration": 2.4931931495666504 + }, + { + "auxiliary_loss_clip": 0.01147866, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.04929996, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.292912234818254, + "language_loss": 0.76429737, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78621089, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3385, + "time_per_iteration": 3.9999845027923584 + }, + { + "auxiliary_loss_clip": 0.01139546, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.04031098, + "balance_loss_mlp": 1.04538202, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8347755395186154, + "language_loss": 0.68259251, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70457751, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3386, + "time_per_iteration": 2.525538682937622 + }, + { + "auxiliary_loss_clip": 0.01143921, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.0348835, + "balance_loss_mlp": 1.04785144, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.949323793812955, + "language_loss": 0.81000078, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83198166, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9609375, + "step": 3387, + "time_per_iteration": 4.122355222702026 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01040694, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.04754424, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.869822824167972, + "language_loss": 0.79960001, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82138139, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 3388, + "time_per_iteration": 2.498455047607422 + }, + { + "auxiliary_loss_clip": 0.01142619, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02120411, + "balance_loss_mlp": 1.04757476, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.6489636222716584, + "language_loss": 0.71810246, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.73992884, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.94921875, + "step": 3389, + "time_per_iteration": 2.4751241207122803 + }, + { + "auxiliary_loss_clip": 0.01140457, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.03108239, + "balance_loss_mlp": 1.04812241, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.7476252287205662, + "language_loss": 0.87431413, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89620328, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3390, + "time_per_iteration": 2.5229172706604004 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02673888, + "balance_loss_mlp": 1.04638386, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 3.0399462437196743, + "language_loss": 0.71092427, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73275584, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.92578125, + "step": 3391, + "time_per_iteration": 2.528003454208374 + }, + { + "auxiliary_loss_clip": 0.01137362, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02592552, + "balance_loss_mlp": 1.04483938, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.517550673127581, + "language_loss": 0.85993969, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88174999, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3392, + "time_per_iteration": 2.5080008506774902 + }, + { + "auxiliary_loss_clip": 0.01143294, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.03231716, + "balance_loss_mlp": 1.04759896, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.5067582134175779, + "language_loss": 0.80730146, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82923234, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.95703125, + "step": 3393, + "time_per_iteration": 2.5464906692504883 + }, + { + "auxiliary_loss_clip": 0.01139211, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02236915, + "balance_loss_mlp": 1.0471251, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.882536464234473, + "language_loss": 0.86276352, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88454658, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3394, + "time_per_iteration": 2.495544195175171 + }, + { + "auxiliary_loss_clip": 0.01139364, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02640033, + "balance_loss_mlp": 1.04756498, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.9880936155816324, + "language_loss": 0.83455038, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85637033, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3395, + "time_per_iteration": 2.4636099338531494 + }, + { + "auxiliary_loss_clip": 0.01144564, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.01753616, + "balance_loss_mlp": 1.04799199, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.0105247570422877, + "language_loss": 0.83632553, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85812247, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3396, + "time_per_iteration": 2.507140636444092 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.02470088, + "balance_loss_mlp": 1.04775488, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9261630392212734, + "language_loss": 0.77139032, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79321325, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91796875, + "step": 3397, + "time_per_iteration": 2.5000061988830566 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.027771, + "balance_loss_mlp": 1.0482713, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.6022565941655285, + "language_loss": 0.87048233, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89232147, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3398, + "time_per_iteration": 2.4879262447357178 + }, + { + "auxiliary_loss_clip": 0.01146457, + "auxiliary_loss_mlp": 0.01045529, + "balance_loss_clip": 1.02855396, + "balance_loss_mlp": 1.05200124, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.0446998950436273, + "language_loss": 0.77973163, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8016516, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3399, + "time_per_iteration": 2.4417104721069336 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.02180338, + "balance_loss_mlp": 1.0471437, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 1.9372936252349278, + "language_loss": 0.76201475, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78383702, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.953125, + "step": 3400, + "time_per_iteration": 2.513378858566284 + }, + { + "auxiliary_loss_clip": 0.01140509, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.02709138, + "balance_loss_mlp": 1.04937315, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6590163779918286, + "language_loss": 0.79357922, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81542361, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 3401, + "time_per_iteration": 2.5628185272216797 + }, + { + "auxiliary_loss_clip": 0.01141107, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.04659653, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.682072453203677, + "language_loss": 0.69205511, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71388066, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3402, + "time_per_iteration": 2.653932571411133 + }, + { + "auxiliary_loss_clip": 0.01144935, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.05008948, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.6906490082479086, + "language_loss": 0.81077826, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83266115, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3403, + "time_per_iteration": 2.518402099609375 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.02873933, + "balance_loss_mlp": 1.05067933, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.7308307985558895, + "language_loss": 0.83497006, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85688084, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3404, + "time_per_iteration": 2.5041427612304688 + }, + { + "auxiliary_loss_clip": 0.0114107, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.03018808, + "balance_loss_mlp": 1.04686713, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.717424757849508, + "language_loss": 0.86319768, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88507974, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3405, + "time_per_iteration": 2.5019404888153076 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02558827, + "balance_loss_mlp": 1.04664326, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.0734152439752327, + "language_loss": 0.84731919, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86912251, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3406, + "time_per_iteration": 2.508274793624878 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02601814, + "balance_loss_mlp": 1.04885817, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0579137112366332, + "language_loss": 0.68086451, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70268458, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3407, + "time_per_iteration": 2.4675915241241455 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.03039861, + "balance_loss_mlp": 1.0469842, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.4520435823789857, + "language_loss": 0.84025276, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86210054, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3408, + "time_per_iteration": 2.4996185302734375 + }, + { + "auxiliary_loss_clip": 0.01144748, + "auxiliary_loss_mlp": 0.01045034, + "balance_loss_clip": 1.02851176, + "balance_loss_mlp": 1.05156052, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.726731275915995, + "language_loss": 0.64288676, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66478455, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3409, + "time_per_iteration": 2.469758987426758 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.0295676, + "balance_loss_mlp": 1.04638147, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.319045584705984, + "language_loss": 0.80357087, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82542145, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3410, + "time_per_iteration": 2.5167293548583984 + }, + { + "auxiliary_loss_clip": 0.01140553, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05014896, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.259997857874164, + "language_loss": 0.75796056, + "learning_rate": 3.686971778678803e-06, + "loss": 0.7798292, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3411, + "time_per_iteration": 2.5411264896392822 + }, + { + "auxiliary_loss_clip": 0.01144909, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.02817273, + "balance_loss_mlp": 1.05220985, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.0004173274373183, + "language_loss": 0.73696554, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75885755, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3412, + "time_per_iteration": 2.5047144889831543 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.03015614, + "balance_loss_mlp": 1.04735541, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.0925027501904228, + "language_loss": 0.77863461, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.96484375, + "step": 3413, + "time_per_iteration": 2.5472991466522217 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.04989886, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.912987525537943, + "language_loss": 0.84719825, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86901337, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3414, + "time_per_iteration": 2.478729724884033 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.02496636, + "balance_loss_mlp": 1.04659235, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.9076108002018353, + "language_loss": 0.80448711, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82628626, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3415, + "time_per_iteration": 2.5366415977478027 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02154934, + "balance_loss_mlp": 1.04796863, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.7629792917286327, + "language_loss": 0.72893143, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75068092, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3416, + "time_per_iteration": 2.5656492710113525 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02520072, + "balance_loss_mlp": 1.04695165, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.5523210605949425, + "language_loss": 0.78623438, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80805844, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3417, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01140114, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.02948236, + "balance_loss_mlp": 1.04842472, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.178207343470702, + "language_loss": 0.87390542, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89577365, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.91796875, + "step": 3418, + "time_per_iteration": 2.4900615215301514 + }, + { + "auxiliary_loss_clip": 0.01139839, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.0251534, + "balance_loss_mlp": 1.04798996, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.115759049165993, + "language_loss": 0.62156075, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64337492, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3419, + "time_per_iteration": 2.527057647705078 + }, + { + "auxiliary_loss_clip": 0.01143982, + "auxiliary_loss_mlp": 0.0104893, + "balance_loss_clip": 1.02977359, + "balance_loss_mlp": 1.04905963, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.2865688080492466, + "language_loss": 0.86502206, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88695121, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3420, + "time_per_iteration": 2.532512664794922 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02533531, + "balance_loss_mlp": 1.04659796, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.535685660701584, + "language_loss": 0.70904821, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73084807, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91796875, + "step": 3421, + "time_per_iteration": 2.5924150943756104 + }, + { + "auxiliary_loss_clip": 0.0113664, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.02099967, + "balance_loss_mlp": 1.04581738, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 3.5707952740494235, + "language_loss": 0.70370102, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72545266, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3422, + "time_per_iteration": 2.499041795730591 + }, + { + "auxiliary_loss_clip": 0.01060302, + "auxiliary_loss_mlp": 0.01012319, + "balance_loss_clip": 1.01001859, + "balance_loss_mlp": 1.02983248, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7605512778953217, + "language_loss": 0.55499864, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57572484, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3046875, + "step": 3423, + "time_per_iteration": 3.1569108963012695 + }, + { + "auxiliary_loss_clip": 0.0114215, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.04882169, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7754304652232902, + "language_loss": 0.71701574, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73886526, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9296875, + "step": 3424, + "time_per_iteration": 2.58278751373291 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.02920699, + "balance_loss_mlp": 1.05022514, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.563470220797352, + "language_loss": 0.75031066, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77218151, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3425, + "time_per_iteration": 2.518050193786621 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01057037, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.0545603, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.8081006382856646, + "language_loss": 0.88246548, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90449566, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3426, + "time_per_iteration": 2.5141823291778564 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.02927566, + "balance_loss_mlp": 1.04961991, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.8273097367093476, + "language_loss": 0.76748925, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78934193, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3427, + "time_per_iteration": 4.068110227584839 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.03143609, + "balance_loss_mlp": 1.04978716, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.6956079848027177, + "language_loss": 0.73914266, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76106334, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3428, + "time_per_iteration": 2.5296199321746826 + }, + { + "auxiliary_loss_clip": 0.0113987, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.03188777, + "balance_loss_mlp": 1.04691577, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 3.779292361126499, + "language_loss": 0.73553443, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75743121, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3429, + "time_per_iteration": 3.979640483856201 + }, + { + "auxiliary_loss_clip": 0.01146724, + "auxiliary_loss_mlp": 0.01041423, + "balance_loss_clip": 1.0242331, + "balance_loss_mlp": 1.05180049, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.8474903397728304, + "language_loss": 0.85301876, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87490022, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3430, + "time_per_iteration": 2.532275438308716 + }, + { + "auxiliary_loss_clip": 0.0114587, + "auxiliary_loss_mlp": 0.01052093, + "balance_loss_clip": 1.03411579, + "balance_loss_mlp": 1.05116892, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.4715876867440674, + "language_loss": 0.69369543, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.715675, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3431, + "time_per_iteration": 2.4857282638549805 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.01015472, + "balance_loss_clip": 1.01329005, + "balance_loss_mlp": 1.02078724, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8322663536180677, + "language_loss": 0.60249984, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62317169, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.30859375, + "step": 3432, + "time_per_iteration": 3.250966787338257 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.03021789, + "balance_loss_mlp": 1.05125713, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7869258470827205, + "language_loss": 0.72495091, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74685854, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3433, + "time_per_iteration": 2.528576135635376 + }, + { + "auxiliary_loss_clip": 0.01143793, + "auxiliary_loss_mlp": 0.01050396, + "balance_loss_clip": 1.03295541, + "balance_loss_mlp": 1.04886997, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.715054190412472, + "language_loss": 0.8721565, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8940984, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3434, + "time_per_iteration": 2.507589101791382 + }, + { + "auxiliary_loss_clip": 0.01144514, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_clip": 1.0269376, + "balance_loss_mlp": 1.04833162, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.6274854163318595, + "language_loss": 0.69133317, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71321636, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3435, + "time_per_iteration": 2.587930679321289 + }, + { + "auxiliary_loss_clip": 0.01140929, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.0241158, + "balance_loss_mlp": 1.04983366, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7028603597643168, + "language_loss": 0.8922776, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91410363, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3436, + "time_per_iteration": 2.57295298576355 + }, + { + "auxiliary_loss_clip": 0.01144451, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.05126333, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.8990861512322268, + "language_loss": 0.76659, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78839004, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3437, + "time_per_iteration": 2.5819849967956543 + }, + { + "auxiliary_loss_clip": 0.01142266, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.04877901, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.7925672188665596, + "language_loss": 0.77611911, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79794395, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3438, + "time_per_iteration": 2.5091731548309326 + }, + { + "auxiliary_loss_clip": 0.01047915, + "auxiliary_loss_mlp": 0.01005377, + "balance_loss_clip": 1.00348175, + "balance_loss_mlp": 1.01723933, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8367234589951487, + "language_loss": 0.67141807, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69195092, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30664062, + "step": 3439, + "time_per_iteration": 3.0797181129455566 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.04791629, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.0580501207842428, + "language_loss": 0.83931267, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86111259, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94921875, + "step": 3440, + "time_per_iteration": 2.5015172958374023 + }, + { + "auxiliary_loss_clip": 0.01143016, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_clip": 1.02584338, + "balance_loss_mlp": 1.05009377, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.9416657792651912, + "language_loss": 0.84825736, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87010437, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3441, + "time_per_iteration": 2.4866137504577637 + }, + { + "auxiliary_loss_clip": 0.01140001, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.02778697, + "balance_loss_mlp": 1.0502038, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.6577892844013908, + "language_loss": 0.85889506, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88074249, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 3442, + "time_per_iteration": 2.5914649963378906 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.02305317, + "balance_loss_mlp": 1.05208063, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.9070439101703558, + "language_loss": 0.72829354, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75015128, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3443, + "time_per_iteration": 2.5210063457489014 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.02879703, + "balance_loss_mlp": 1.0496819, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.5056876708900186, + "language_loss": 0.85428166, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87612224, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.90625, + "step": 3444, + "time_per_iteration": 2.528493881225586 + }, + { + "auxiliary_loss_clip": 0.01047325, + "auxiliary_loss_mlp": 0.0100746, + "balance_loss_clip": 1.00537384, + "balance_loss_mlp": 1.01688242, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6978715278146553, + "language_loss": 0.57091653, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.5914644, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.3046875, + "step": 3445, + "time_per_iteration": 3.086552619934082 + }, + { + "auxiliary_loss_clip": 0.01140085, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03356516, + "balance_loss_mlp": 1.04968095, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5621496076246746, + "language_loss": 0.78459281, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80650306, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 3446, + "time_per_iteration": 2.4844422340393066 + }, + { + "auxiliary_loss_clip": 0.01148285, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02846456, + "balance_loss_mlp": 1.05057228, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.157476270385918, + "language_loss": 0.62436825, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64633256, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3447, + "time_per_iteration": 2.592799663543701 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.03071666, + "balance_loss_mlp": 1.04810297, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.740614876967074, + "language_loss": 0.86066437, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88256097, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3448, + "time_per_iteration": 2.5054237842559814 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.02576649, + "balance_loss_mlp": 1.04814398, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.1117492515519665, + "language_loss": 0.75452864, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77637869, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.92578125, + "step": 3449, + "time_per_iteration": 2.506657838821411 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.03194678, + "balance_loss_mlp": 1.04896426, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.7877143934577313, + "language_loss": 0.76703656, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78899819, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3450, + "time_per_iteration": 2.479090929031372 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.0302192, + "balance_loss_mlp": 1.04780531, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5227053471466307, + "language_loss": 0.822101, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84401715, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3451, + "time_per_iteration": 2.5465826988220215 + }, + { + "auxiliary_loss_clip": 0.01047156, + "auxiliary_loss_mlp": 0.01003865, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.01645589, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7930757504147553, + "language_loss": 0.56569821, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58620846, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3452, + "time_per_iteration": 2.979168653488159 + }, + { + "auxiliary_loss_clip": 0.01144097, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.02765203, + "balance_loss_mlp": 1.0492605, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.970927529953097, + "language_loss": 0.88332593, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90522313, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3453, + "time_per_iteration": 2.5404746532440186 + }, + { + "auxiliary_loss_clip": 0.01145334, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.05121803, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.6193396769615114, + "language_loss": 0.80056196, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82244939, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94140625, + "step": 3454, + "time_per_iteration": 2.536154270172119 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04881716, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.767477329453147, + "language_loss": 0.76424366, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78615135, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3455, + "time_per_iteration": 2.502450466156006 + }, + { + "auxiliary_loss_clip": 0.01141184, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.03247654, + "balance_loss_mlp": 1.04867601, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 2.1876724852466163, + "language_loss": 0.80599815, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82790661, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3456, + "time_per_iteration": 2.495405673980713 + }, + { + "auxiliary_loss_clip": 0.01147485, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_clip": 1.02447069, + "balance_loss_mlp": 1.05180097, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5859267830694757, + "language_loss": 0.77988815, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80179226, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.95703125, + "step": 3457, + "time_per_iteration": 2.5625829696655273 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01054167, + "balance_loss_clip": 1.03461635, + "balance_loss_mlp": 1.05195451, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 2.0073788397072136, + "language_loss": 0.83581042, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85784483, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.96875, + "step": 3458, + "time_per_iteration": 2.470740556716919 + }, + { + "auxiliary_loss_clip": 0.01142717, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02443111, + "balance_loss_mlp": 1.05063045, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.732611194718632, + "language_loss": 0.76041365, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78225368, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3459, + "time_per_iteration": 2.5753207206726074 + }, + { + "auxiliary_loss_clip": 0.01138446, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.02451003, + "balance_loss_mlp": 1.04829502, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.1264218253084386, + "language_loss": 0.77302521, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79482168, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3460, + "time_per_iteration": 2.498760938644409 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01050405, + "balance_loss_clip": 1.03284574, + "balance_loss_mlp": 1.04819179, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.1644839576228296, + "language_loss": 0.75785947, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77979982, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3461, + "time_per_iteration": 2.5850372314453125 + }, + { + "auxiliary_loss_clip": 0.01145604, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02645624, + "balance_loss_mlp": 1.0469749, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8484421465162717, + "language_loss": 0.88227051, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90417254, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3462, + "time_per_iteration": 2.558375358581543 + }, + { + "auxiliary_loss_clip": 0.01043601, + "auxiliary_loss_mlp": 0.0101247, + "balance_loss_clip": 1.01059818, + "balance_loss_mlp": 1.01278758, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7627714646141646, + "language_loss": 0.59057152, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6111322, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.30859375, + "step": 3463, + "time_per_iteration": 3.2280492782592773 + }, + { + "auxiliary_loss_clip": 0.01144566, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.03056765, + "balance_loss_mlp": 1.04713821, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.542529703880477, + "language_loss": 0.65831709, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68025607, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3464, + "time_per_iteration": 2.5706918239593506 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01050812, + "balance_loss_clip": 1.03160763, + "balance_loss_mlp": 1.0492928, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.848617339554035, + "language_loss": 0.83536243, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85734928, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3465, + "time_per_iteration": 2.535473585128784 + }, + { + "auxiliary_loss_clip": 0.01143191, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.02767932, + "balance_loss_mlp": 1.04802513, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 3.628659863163492, + "language_loss": 0.81463158, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.83651215, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3466, + "time_per_iteration": 2.535311222076416 + }, + { + "auxiliary_loss_clip": 0.01146517, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.02863586, + "balance_loss_mlp": 1.05303347, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.967186340276973, + "language_loss": 0.81678396, + "learning_rate": 3.675156514448716e-06, + "loss": 0.83869636, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9375, + "step": 3467, + "time_per_iteration": 2.4783830642700195 + }, + { + "auxiliary_loss_clip": 0.01142574, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02469158, + "balance_loss_mlp": 1.05200005, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.0682841758185235, + "language_loss": 0.8186093, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84045184, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3468, + "time_per_iteration": 2.5275001525878906 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.02677095, + "balance_loss_mlp": 1.05024171, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.9832892060266627, + "language_loss": 0.90227246, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92421412, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9765625, + "step": 3469, + "time_per_iteration": 3.999607563018799 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.03329682, + "balance_loss_mlp": 1.0530771, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.764094275638393, + "language_loss": 0.7643016, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78628922, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3470, + "time_per_iteration": 4.186570405960083 + }, + { + "auxiliary_loss_clip": 0.0114555, + "auxiliary_loss_mlp": 0.01048445, + "balance_loss_clip": 1.03039646, + "balance_loss_mlp": 1.05154145, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.7254586081909284, + "language_loss": 0.7592454, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78118539, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3471, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03488564, + "balance_loss_mlp": 1.04796982, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.907022336492936, + "language_loss": 0.75515926, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77719313, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3472, + "time_per_iteration": 2.555927038192749 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_clip": 1.02790928, + "balance_loss_mlp": 1.05051231, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.9877478939715982, + "language_loss": 0.84168947, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86358976, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3473, + "time_per_iteration": 2.5261759757995605 + }, + { + "auxiliary_loss_clip": 0.01043725, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00027776, + "balance_loss_mlp": 1.01290703, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8792852781400284, + "language_loss": 0.63631999, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65678006, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30859375, + "step": 3474, + "time_per_iteration": 3.025831460952759 + }, + { + "auxiliary_loss_clip": 0.01146356, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.02334285, + "balance_loss_mlp": 1.04993105, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.882119897934913, + "language_loss": 0.69867098, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72054696, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3475, + "time_per_iteration": 2.751676559448242 + }, + { + "auxiliary_loss_clip": 0.01146508, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02961278, + "balance_loss_mlp": 1.05162299, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4951270147360183, + "language_loss": 0.70032048, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72226411, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3476, + "time_per_iteration": 2.5493083000183105 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.02850533, + "balance_loss_mlp": 1.05099094, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0857679152031716, + "language_loss": 0.89590299, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91780925, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3477, + "time_per_iteration": 2.506962537765503 + }, + { + "auxiliary_loss_clip": 0.01142894, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.03005815, + "balance_loss_mlp": 1.04896593, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 4.245750786990739, + "language_loss": 0.67988396, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70179135, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9375, + "step": 3478, + "time_per_iteration": 2.57366681098938 + }, + { + "auxiliary_loss_clip": 0.01143008, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.02334023, + "balance_loss_mlp": 1.04826832, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.137628491911851, + "language_loss": 0.85035646, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87220371, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94921875, + "step": 3479, + "time_per_iteration": 2.4716267585754395 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.03156328, + "balance_loss_mlp": 1.04972577, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.2225866030569175, + "language_loss": 0.73807257, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76003599, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3480, + "time_per_iteration": 2.4856386184692383 + }, + { + "auxiliary_loss_clip": 0.01141126, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_clip": 1.03113592, + "balance_loss_mlp": 1.04844785, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.023418551380918, + "language_loss": 0.75601453, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77789831, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3481, + "time_per_iteration": 2.4812443256378174 + }, + { + "auxiliary_loss_clip": 0.01145872, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.03051996, + "balance_loss_mlp": 1.05047393, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 3.5251666716598273, + "language_loss": 0.85337639, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87531281, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3482, + "time_per_iteration": 2.521284580230713 + }, + { + "auxiliary_loss_clip": 0.01145664, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_clip": 1.02940559, + "balance_loss_mlp": 1.05097377, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.8936854891166743, + "language_loss": 0.70626152, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3483, + "time_per_iteration": 2.5876524448394775 + }, + { + "auxiliary_loss_clip": 0.01146142, + "auxiliary_loss_mlp": 0.01060474, + "balance_loss_clip": 1.04193723, + "balance_loss_mlp": 1.04891169, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8606830424584557, + "language_loss": 0.74988431, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77195048, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3484, + "time_per_iteration": 2.49701189994812 + }, + { + "auxiliary_loss_clip": 0.01143763, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.02991378, + "balance_loss_mlp": 1.05028141, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8378150509428508, + "language_loss": 0.70690203, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7288202, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3485, + "time_per_iteration": 2.5692059993743896 + }, + { + "auxiliary_loss_clip": 0.01146857, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.02712297, + "balance_loss_mlp": 1.05028093, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.9069158447471781, + "language_loss": 0.82965356, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85157764, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3486, + "time_per_iteration": 2.569308042526245 + }, + { + "auxiliary_loss_clip": 0.0114472, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.03309095, + "balance_loss_mlp": 1.04790449, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 3.843984040964354, + "language_loss": 0.8699702, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89192313, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3487, + "time_per_iteration": 2.608441114425659 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.0287739, + "balance_loss_mlp": 1.04695904, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.4377115915778713, + "language_loss": 0.72369969, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74558127, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94140625, + "step": 3488, + "time_per_iteration": 2.529233694076538 + }, + { + "auxiliary_loss_clip": 0.01144055, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.02812946, + "balance_loss_mlp": 1.04897618, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.6657941113460764, + "language_loss": 0.80726898, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82916641, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3489, + "time_per_iteration": 2.4847962856292725 + }, + { + "auxiliary_loss_clip": 0.01142088, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.0253495, + "balance_loss_mlp": 1.04718399, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7888402521564877, + "language_loss": 0.72827011, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75011659, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3490, + "time_per_iteration": 2.543064594268799 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.03437209, + "balance_loss_mlp": 1.04955435, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 5.073894522138561, + "language_loss": 0.70159817, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72350967, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3491, + "time_per_iteration": 2.4785172939300537 + }, + { + "auxiliary_loss_clip": 0.01142629, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.04678369, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 3.7459720995568557, + "language_loss": 0.7931999, + "learning_rate": 3.669817442854444e-06, + "loss": 0.8150776, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3492, + "time_per_iteration": 2.5213027000427246 + }, + { + "auxiliary_loss_clip": 0.01144565, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.04977345, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9629392465329358, + "language_loss": 0.86883962, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89069605, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3493, + "time_per_iteration": 2.499797821044922 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.04791212, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.8525794886403055, + "language_loss": 0.68810928, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70991009, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3494, + "time_per_iteration": 2.5374889373779297 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.02054656, + "balance_loss_mlp": 1.05010796, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7465496854212388, + "language_loss": 0.78900456, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81085044, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96484375, + "step": 3495, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02505755, + "balance_loss_mlp": 1.04696178, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7459726457298623, + "language_loss": 0.77192879, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79377842, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3496, + "time_per_iteration": 2.552386522293091 + }, + { + "auxiliary_loss_clip": 0.01145605, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.03297126, + "balance_loss_mlp": 1.04933989, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.0396086665216777, + "language_loss": 0.82009852, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84206975, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3497, + "time_per_iteration": 2.498359441757202 + }, + { + "auxiliary_loss_clip": 0.01146873, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_clip": 1.03091133, + "balance_loss_mlp": 1.04979134, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 2.5223195218779577, + "language_loss": 0.67314029, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69509119, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96875, + "step": 3498, + "time_per_iteration": 2.540766716003418 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.02679563, + "balance_loss_mlp": 1.04782224, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 2.2477271783909414, + "language_loss": 0.80623376, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82813752, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 3499, + "time_per_iteration": 2.5283098220825195 + }, + { + "auxiliary_loss_clip": 0.0114621, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.02612233, + "balance_loss_mlp": 1.05201602, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.776862664007905, + "language_loss": 0.78366566, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80555797, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3500, + "time_per_iteration": 2.5419158935546875 + }, + { + "auxiliary_loss_clip": 0.01142389, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02524185, + "balance_loss_mlp": 1.0480907, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.628727093990466, + "language_loss": 0.73989725, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76174867, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3501, + "time_per_iteration": 2.535419464111328 + }, + { + "auxiliary_loss_clip": 0.01140428, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02429342, + "balance_loss_mlp": 1.04671168, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6206913905571714, + "language_loss": 0.75292969, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77475226, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3502, + "time_per_iteration": 2.508277654647827 + }, + { + "auxiliary_loss_clip": 0.01141546, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.02796102, + "balance_loss_mlp": 1.0475595, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.9145063235338367, + "language_loss": 0.77090263, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7927739, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.94140625, + "step": 3503, + "time_per_iteration": 2.5607948303222656 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.02813029, + "balance_loss_mlp": 1.048738, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.3817148130730144, + "language_loss": 0.77991742, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80189341, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.9921875, + "step": 3504, + "time_per_iteration": 2.495028018951416 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.02952361, + "balance_loss_mlp": 1.0473187, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5529728217373517, + "language_loss": 0.77045631, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79238534, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 3505, + "time_per_iteration": 2.5408663749694824 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.02931666, + "balance_loss_mlp": 1.04786968, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9911708078552777, + "language_loss": 0.63704473, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65889871, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91015625, + "step": 3506, + "time_per_iteration": 2.564246892929077 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.02904439, + "balance_loss_mlp": 1.04773796, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.8633964271687153, + "language_loss": 0.81863034, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84050006, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3507, + "time_per_iteration": 2.6049435138702393 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.0299232, + "balance_loss_mlp": 1.04645514, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 2.0263301336255135, + "language_loss": 0.75496012, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77683949, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.93359375, + "step": 3508, + "time_per_iteration": 2.5366437435150146 + }, + { + "auxiliary_loss_clip": 0.01144539, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02395463, + "balance_loss_mlp": 1.04809749, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.1922875924351115, + "language_loss": 0.85395098, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87581778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3509, + "time_per_iteration": 2.4895167350769043 + }, + { + "auxiliary_loss_clip": 0.01146568, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.02503562, + "balance_loss_mlp": 1.04908204, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.5522473876542349, + "language_loss": 0.67803288, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69993746, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3510, + "time_per_iteration": 4.065294027328491 + }, + { + "auxiliary_loss_clip": 0.01143018, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02381003, + "balance_loss_mlp": 1.04653811, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.9784941086490475, + "language_loss": 0.7240749, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74591982, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96484375, + "step": 3511, + "time_per_iteration": 2.5701003074645996 + }, + { + "auxiliary_loss_clip": 0.01148402, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.05022192, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.3544542512902322, + "language_loss": 0.69737375, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71925306, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3512, + "time_per_iteration": 3.9019229412078857 + }, + { + "auxiliary_loss_clip": 0.01143526, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.04680824, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.6756724017558497, + "language_loss": 0.73159289, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.7535044, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.96875, + "step": 3513, + "time_per_iteration": 2.5643980503082275 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.02212906, + "balance_loss_mlp": 1.04916954, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.635076517146385, + "language_loss": 0.74235332, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76414299, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3514, + "time_per_iteration": 2.5240070819854736 + }, + { + "auxiliary_loss_clip": 0.01144119, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.02347541, + "balance_loss_mlp": 1.0482856, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7928371848293583, + "language_loss": 0.76707381, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78892195, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3515, + "time_per_iteration": 2.526527166366577 + }, + { + "auxiliary_loss_clip": 0.0114362, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.02517664, + "balance_loss_mlp": 1.04956555, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.8516547188762509, + "language_loss": 0.68242604, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70428967, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3516, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.01145197, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.02883935, + "balance_loss_mlp": 1.04901481, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.653683865815189, + "language_loss": 0.85012519, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87204921, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3517, + "time_per_iteration": 2.5080301761627197 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01965201, + "balance_loss_mlp": 1.04722667, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.26725319642869, + "language_loss": 0.62925792, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65104288, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3518, + "time_per_iteration": 2.5949900150299072 + }, + { + "auxiliary_loss_clip": 0.01142565, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04891765, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8284325952385483, + "language_loss": 0.88772321, + "learning_rate": 3.664006799041303e-06, + "loss": 0.90964293, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3519, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.03184235, + "balance_loss_mlp": 1.04866135, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.5988506078375424, + "language_loss": 0.81066215, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83259952, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3520, + "time_per_iteration": 2.5069239139556885 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02926779, + "balance_loss_mlp": 1.0469681, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.592359744312873, + "language_loss": 0.76163614, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78347969, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3521, + "time_per_iteration": 2.560037851333618 + }, + { + "auxiliary_loss_clip": 0.0113934, + "auxiliary_loss_mlp": 0.0104393, + "balance_loss_clip": 1.02842069, + "balance_loss_mlp": 1.04592443, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0717596449561024, + "language_loss": 0.75950933, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78134197, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.93359375, + "step": 3522, + "time_per_iteration": 2.4758715629577637 + }, + { + "auxiliary_loss_clip": 0.01141462, + "auxiliary_loss_mlp": 0.01049727, + "balance_loss_clip": 1.03176177, + "balance_loss_mlp": 1.04737353, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.026497436525855, + "language_loss": 0.70436251, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72627443, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3523, + "time_per_iteration": 2.5368640422821045 + }, + { + "auxiliary_loss_clip": 0.01140964, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.02786803, + "balance_loss_mlp": 1.04820895, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.216886450348082, + "language_loss": 0.76683456, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.7886939, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3524, + "time_per_iteration": 2.5932695865631104 + }, + { + "auxiliary_loss_clip": 0.01139634, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.04276347, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.020092904399728, + "language_loss": 0.81433582, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83615232, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3525, + "time_per_iteration": 2.5425641536712646 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04668331, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 2.1031950889850655, + "language_loss": 0.75104785, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77285308, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3526, + "time_per_iteration": 2.533210515975952 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.02466083, + "balance_loss_mlp": 1.04663801, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.9135764326712537, + "language_loss": 0.77385598, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79569542, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3527, + "time_per_iteration": 2.53898286819458 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.02797842, + "balance_loss_mlp": 1.0461328, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.981008674330079, + "language_loss": 0.78037727, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80223083, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3528, + "time_per_iteration": 2.5360231399536133 + }, + { + "auxiliary_loss_clip": 0.01138776, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.04611731, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7275367809487383, + "language_loss": 0.8170321, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83889693, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3529, + "time_per_iteration": 2.531228542327881 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.028234, + "balance_loss_mlp": 1.04647708, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.1603106904513547, + "language_loss": 0.76616383, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78802443, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3530, + "time_per_iteration": 2.5361740589141846 + }, + { + "auxiliary_loss_clip": 0.01136983, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.02593338, + "balance_loss_mlp": 1.0451746, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.3391242970409873, + "language_loss": 0.82978404, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85157299, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3531, + "time_per_iteration": 2.571411609649658 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04744506, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.416019676502894, + "language_loss": 0.73473567, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75654608, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.94140625, + "step": 3532, + "time_per_iteration": 2.473006248474121 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02485681, + "balance_loss_mlp": 1.04561734, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.7353898898315339, + "language_loss": 0.73855233, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76036394, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.92578125, + "step": 3533, + "time_per_iteration": 2.526780366897583 + }, + { + "auxiliary_loss_clip": 0.01140469, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.02833724, + "balance_loss_mlp": 1.04576015, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.8944995629732337, + "language_loss": 0.7098999, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73175949, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3534, + "time_per_iteration": 2.6947309970855713 + }, + { + "auxiliary_loss_clip": 0.01141409, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.02032161, + "balance_loss_mlp": 1.04669714, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9387778569542722, + "language_loss": 0.71567297, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73746949, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3535, + "time_per_iteration": 2.6022329330444336 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.0299238, + "balance_loss_mlp": 1.04549336, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 1.8756666540330442, + "language_loss": 0.7040931, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72592747, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 3536, + "time_per_iteration": 2.6005256175994873 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.02706444, + "balance_loss_mlp": 1.04512393, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9573194210103453, + "language_loss": 0.88217437, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90402472, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3537, + "time_per_iteration": 2.5565810203552246 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02425885, + "balance_loss_mlp": 1.0437026, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.957058885696691, + "language_loss": 0.80129743, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82304639, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3538, + "time_per_iteration": 2.5501785278320312 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.025653, + "balance_loss_mlp": 1.0446775, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.587715235485788, + "language_loss": 0.87131894, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89308405, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.90625, + "step": 3539, + "time_per_iteration": 2.5751259326934814 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02914476, + "balance_loss_mlp": 1.04718518, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.3040839486156184, + "language_loss": 0.57464051, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59648788, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3540, + "time_per_iteration": 2.4746458530426025 + }, + { + "auxiliary_loss_clip": 0.01140156, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.02330637, + "balance_loss_mlp": 1.04658604, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 5.8376417218282874, + "language_loss": 0.76062799, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78243208, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3541, + "time_per_iteration": 2.5111818313598633 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02517211, + "balance_loss_mlp": 1.04530454, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.9190227230034667, + "language_loss": 0.69458514, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71635908, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3542, + "time_per_iteration": 2.556300401687622 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01945567, + "balance_loss_mlp": 1.04443789, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.8172219669397587, + "language_loss": 0.75591409, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77760351, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 3543, + "time_per_iteration": 2.54424786567688 + }, + { + "auxiliary_loss_clip": 0.01138428, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04843175, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 2.1531603349332915, + "language_loss": 0.66787028, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68964195, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3544, + "time_per_iteration": 2.516359329223633 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.0245831, + "balance_loss_mlp": 1.04379654, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.9827170900636153, + "language_loss": 0.71089172, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73265821, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.92578125, + "step": 3545, + "time_per_iteration": 2.5377357006073 + }, + { + "auxiliary_loss_clip": 0.01138848, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_clip": 1.03095567, + "balance_loss_mlp": 1.04571509, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.730364240275379, + "language_loss": 0.72334421, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74519908, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9296875, + "step": 3546, + "time_per_iteration": 2.5640652179718018 + }, + { + "auxiliary_loss_clip": 0.0113929, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02695596, + "balance_loss_mlp": 1.0467453, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.889324350950523, + "language_loss": 0.80698627, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82881093, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3547, + "time_per_iteration": 2.527398109436035 + }, + { + "auxiliary_loss_clip": 0.01140759, + "auxiliary_loss_mlp": 0.0104395, + "balance_loss_clip": 1.02702212, + "balance_loss_mlp": 1.04538703, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 3.232228952830713, + "language_loss": 0.74496448, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76681155, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3548, + "time_per_iteration": 2.5493834018707275 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.02719641, + "balance_loss_mlp": 1.04663396, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.0441969792992265, + "language_loss": 0.74135804, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76323086, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3549, + "time_per_iteration": 2.514817476272583 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.03063631, + "balance_loss_mlp": 1.04963064, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6981522694050752, + "language_loss": 0.80653727, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82842982, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3550, + "time_per_iteration": 2.541501045227051 + }, + { + "auxiliary_loss_clip": 0.01136887, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.02883255, + "balance_loss_mlp": 1.04706621, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.615115943492657, + "language_loss": 0.88341218, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90522182, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8984375, + "step": 3551, + "time_per_iteration": 2.5310463905334473 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.02887464, + "balance_loss_mlp": 1.04430258, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.1215125327645152, + "language_loss": 0.83415043, + "learning_rate": 3.656842449140983e-06, + "loss": 0.8559624, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3552, + "time_per_iteration": 3.974120616912842 + }, + { + "auxiliary_loss_clip": 0.0113546, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_clip": 1.03164101, + "balance_loss_mlp": 1.04522753, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7556537525349103, + "language_loss": 0.76692683, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78876388, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 3553, + "time_per_iteration": 3.964289903640747 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.02520156, + "balance_loss_mlp": 1.04556942, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.6502841430946371, + "language_loss": 0.72946119, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75122207, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 3554, + "time_per_iteration": 2.5141818523406982 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02121508, + "balance_loss_mlp": 1.04672861, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9371755733444218, + "language_loss": 0.6745261, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69627374, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.91015625, + "step": 3555, + "time_per_iteration": 2.6116089820861816 + }, + { + "auxiliary_loss_clip": 0.01138406, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04564714, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.2550763051095752, + "language_loss": 0.64778429, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66956222, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3556, + "time_per_iteration": 2.553746223449707 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02893996, + "balance_loss_mlp": 1.04656768, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.6295299556205536, + "language_loss": 0.72333252, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74518251, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3557, + "time_per_iteration": 2.6562533378601074 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.02189136, + "balance_loss_mlp": 1.04716706, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6722734443717013, + "language_loss": 0.67139357, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6932168, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3558, + "time_per_iteration": 2.5435290336608887 + }, + { + "auxiliary_loss_clip": 0.01142773, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.01943386, + "balance_loss_mlp": 1.04542494, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8839208997443517, + "language_loss": 0.79702216, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81881285, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3559, + "time_per_iteration": 2.5535330772399902 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04436731, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6653874224583467, + "language_loss": 0.67549068, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69730377, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9296875, + "step": 3560, + "time_per_iteration": 2.5241451263427734 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.04846883, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8721878156787213, + "language_loss": 0.72995424, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75179613, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3561, + "time_per_iteration": 2.5678720474243164 + }, + { + "auxiliary_loss_clip": 0.01142897, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02817965, + "balance_loss_mlp": 1.04897678, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.2783713689110243, + "language_loss": 0.77110738, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79298586, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3562, + "time_per_iteration": 2.4598803520202637 + }, + { + "auxiliary_loss_clip": 0.01140561, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.02160454, + "balance_loss_mlp": 1.04795694, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5929440625910447, + "language_loss": 0.84534913, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.867136, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.92578125, + "step": 3563, + "time_per_iteration": 2.5654757022857666 + }, + { + "auxiliary_loss_clip": 0.0114087, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.02227342, + "balance_loss_mlp": 1.04757166, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.6134338415520206, + "language_loss": 0.76727796, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78907001, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.93359375, + "step": 3564, + "time_per_iteration": 2.591064214706421 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.02560401, + "balance_loss_mlp": 1.0467248, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.880454163642384, + "language_loss": 0.88260084, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90440416, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3565, + "time_per_iteration": 2.571242094039917 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01009282, + "balance_loss_clip": 1.00739813, + "balance_loss_mlp": 1.0192101, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8403524328969202, + "language_loss": 0.52300179, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54360026, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3125, + "step": 3566, + "time_per_iteration": 3.055588722229004 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02093637, + "balance_loss_mlp": 1.04677701, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.91490691342046, + "language_loss": 0.67412555, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69585192, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3567, + "time_per_iteration": 2.5511529445648193 + }, + { + "auxiliary_loss_clip": 0.01135888, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.02630615, + "balance_loss_mlp": 1.04691041, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6974661731729381, + "language_loss": 0.74437779, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.7661534, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 3568, + "time_per_iteration": 2.613090753555298 + }, + { + "auxiliary_loss_clip": 0.01137867, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.02710819, + "balance_loss_mlp": 1.04578757, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.7479940521784256, + "language_loss": 0.77864397, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80045569, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3569, + "time_per_iteration": 2.567439317703247 + }, + { + "auxiliary_loss_clip": 0.01147794, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.01981413, + "balance_loss_mlp": 1.05039883, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.3364918832975317, + "language_loss": 0.69533777, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.71719933, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3570, + "time_per_iteration": 2.489550828933716 + }, + { + "auxiliary_loss_clip": 0.01144243, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.02703631, + "balance_loss_mlp": 1.0480299, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.465398793786977, + "language_loss": 0.78108835, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80296826, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3571, + "time_per_iteration": 2.527509927749634 + }, + { + "auxiliary_loss_clip": 0.01143428, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.02556705, + "balance_loss_mlp": 1.0501976, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.5347995603010767, + "language_loss": 0.82851684, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85038722, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3572, + "time_per_iteration": 2.491955280303955 + }, + { + "auxiliary_loss_clip": 0.01144597, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.02248025, + "balance_loss_mlp": 1.04700291, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.35018592277076, + "language_loss": 0.64916813, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67100847, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3573, + "time_per_iteration": 2.5238969326019287 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.02481413, + "balance_loss_mlp": 1.04417133, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.2164535787006705, + "language_loss": 0.75577438, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77751815, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3574, + "time_per_iteration": 2.5497734546661377 + }, + { + "auxiliary_loss_clip": 0.01137499, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.01752853, + "balance_loss_mlp": 1.04568887, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.959683075701339, + "language_loss": 0.72380054, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74552631, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91796875, + "step": 3575, + "time_per_iteration": 2.539255142211914 + }, + { + "auxiliary_loss_clip": 0.01141362, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.02097976, + "balance_loss_mlp": 1.04890776, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.6473570004326006, + "language_loss": 0.68102455, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70280713, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3576, + "time_per_iteration": 2.515245199203491 + }, + { + "auxiliary_loss_clip": 0.01144679, + "auxiliary_loss_mlp": 0.01041063, + "balance_loss_clip": 1.02347922, + "balance_loss_mlp": 1.04820943, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.1450103743023936, + "language_loss": 0.88840854, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91026592, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3577, + "time_per_iteration": 2.4426753520965576 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01006787, + "balance_loss_clip": 1.00466526, + "balance_loss_mlp": 1.02252448, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8177210285410575, + "language_loss": 0.56242883, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.5830456, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32421875, + "step": 3578, + "time_per_iteration": 3.0434820652008057 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.02345788, + "balance_loss_mlp": 1.04957211, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6812319537870581, + "language_loss": 0.88500881, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90683413, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3579, + "time_per_iteration": 2.4646458625793457 + }, + { + "auxiliary_loss_clip": 0.01140846, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.04618824, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.7668055337606152, + "language_loss": 0.78238297, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80421615, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3580, + "time_per_iteration": 2.5029854774475098 + }, + { + "auxiliary_loss_clip": 0.01138764, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.02557576, + "balance_loss_mlp": 1.04757452, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.7955176576656944, + "language_loss": 0.73129165, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75310302, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3581, + "time_per_iteration": 2.503103733062744 + }, + { + "auxiliary_loss_clip": 0.01137091, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.02723205, + "balance_loss_mlp": 1.04665411, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.610409860459302, + "language_loss": 0.70739609, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72922659, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.90234375, + "step": 3582, + "time_per_iteration": 2.4840197563171387 + }, + { + "auxiliary_loss_clip": 0.01137402, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.04602027, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.8570718584923633, + "language_loss": 0.84140432, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86319172, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3583, + "time_per_iteration": 2.4435312747955322 + }, + { + "auxiliary_loss_clip": 0.01143933, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.02834046, + "balance_loss_mlp": 1.04859185, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 3.180305067245919, + "language_loss": 0.83226246, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.8541553, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3584, + "time_per_iteration": 2.521476984024048 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.02816272, + "balance_loss_mlp": 1.04518461, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.0358477693345667, + "language_loss": 0.90233314, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92416549, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.91796875, + "step": 3585, + "time_per_iteration": 2.464745283126831 + }, + { + "auxiliary_loss_clip": 0.01140925, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04832685, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.8296186032289348, + "language_loss": 0.74414444, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76597619, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3586, + "time_per_iteration": 2.5062146186828613 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.02668393, + "balance_loss_mlp": 1.04796743, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.1680236591426416, + "language_loss": 0.83055526, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85239077, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3587, + "time_per_iteration": 2.4784295558929443 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.02438986, + "balance_loss_mlp": 1.04664946, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.8176747371086701, + "language_loss": 0.75756669, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77937388, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3588, + "time_per_iteration": 2.5896053314208984 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02294254, + "balance_loss_mlp": 1.04534698, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8272464683057401, + "language_loss": 0.81006658, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83183837, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3589, + "time_per_iteration": 2.540090799331665 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.02341199, + "balance_loss_mlp": 1.04792953, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.6129530472479154, + "language_loss": 0.72591126, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74772674, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.94140625, + "step": 3590, + "time_per_iteration": 2.5113861560821533 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.02769351, + "balance_loss_mlp": 1.04830956, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.0133132975130477, + "language_loss": 0.83914638, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86106646, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96875, + "step": 3591, + "time_per_iteration": 2.488309621810913 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03167534, + "balance_loss_mlp": 1.04884136, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.271326779903827, + "language_loss": 0.69294131, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71490723, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3592, + "time_per_iteration": 2.571373462677002 + }, + { + "auxiliary_loss_clip": 0.01143219, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.03150403, + "balance_loss_mlp": 1.04881072, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.3999192225546677, + "language_loss": 0.84150124, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86343014, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3593, + "time_per_iteration": 2.4590611457824707 + }, + { + "auxiliary_loss_clip": 0.01144804, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03210783, + "balance_loss_mlp": 1.04839182, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.850380650061706, + "language_loss": 0.75163305, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77357584, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3594, + "time_per_iteration": 3.9338901042938232 + }, + { + "auxiliary_loss_clip": 0.01139476, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.02666509, + "balance_loss_mlp": 1.04763508, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 2.0680180645872057, + "language_loss": 0.80541027, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82724094, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3595, + "time_per_iteration": 3.9857921600341797 + }, + { + "auxiliary_loss_clip": 0.01146272, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.0259887, + "balance_loss_mlp": 1.04883027, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.3330392864683347, + "language_loss": 0.78089929, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80279487, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.97265625, + "step": 3596, + "time_per_iteration": 2.4515480995178223 + }, + { + "auxiliary_loss_clip": 0.01138472, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.03006268, + "balance_loss_mlp": 1.04786897, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.9545740457841054, + "language_loss": 0.83011472, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85196126, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3597, + "time_per_iteration": 2.504703998565674 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.02756798, + "balance_loss_mlp": 1.05029655, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5849845027976412, + "language_loss": 0.80171728, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82361513, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3598, + "time_per_iteration": 2.487013101577759 + }, + { + "auxiliary_loss_clip": 0.0114385, + "auxiliary_loss_mlp": 0.01045551, + "balance_loss_clip": 1.02745485, + "balance_loss_mlp": 1.0476619, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.8175927270691912, + "language_loss": 0.82054996, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.842444, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3599, + "time_per_iteration": 2.5515315532684326 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.0253613, + "balance_loss_mlp": 1.04831243, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 3.186477441139726, + "language_loss": 0.7654863, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78729272, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3600, + "time_per_iteration": 2.5067033767700195 + }, + { + "auxiliary_loss_clip": 0.01139528, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.02842712, + "balance_loss_mlp": 1.04657555, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.9514188507385115, + "language_loss": 0.80026001, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82209218, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.9296875, + "step": 3601, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.01142747, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.0306437, + "balance_loss_mlp": 1.04938436, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.8096424478422806, + "language_loss": 0.83358335, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85548466, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3602, + "time_per_iteration": 2.525151491165161 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.03065276, + "balance_loss_mlp": 1.04670155, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.6489882186888527, + "language_loss": 0.74271673, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76460266, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3603, + "time_per_iteration": 2.5083842277526855 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.02723289, + "balance_loss_mlp": 1.05022252, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.5478742891076147, + "language_loss": 0.73956323, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76139832, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3604, + "time_per_iteration": 2.5100204944610596 + }, + { + "auxiliary_loss_clip": 0.01137594, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.02598965, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 2.2268823896980376, + "language_loss": 0.80375803, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82556069, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.890625, + "step": 3605, + "time_per_iteration": 2.5182228088378906 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01010449, + "balance_loss_clip": 1.0086962, + "balance_loss_mlp": 1.02975249, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6948121220218867, + "language_loss": 0.58376318, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60450989, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.34375, + "step": 3606, + "time_per_iteration": 3.1655373573303223 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.02921534, + "balance_loss_mlp": 1.04939568, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.6754398361548613, + "language_loss": 0.73210037, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75402147, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3607, + "time_per_iteration": 2.508920431137085 + }, + { + "auxiliary_loss_clip": 0.01146221, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_clip": 1.02995718, + "balance_loss_mlp": 1.04935443, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.5718647894236053, + "language_loss": 0.76626337, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78820717, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3608, + "time_per_iteration": 2.440258502960205 + }, + { + "auxiliary_loss_clip": 0.01144868, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.02985787, + "balance_loss_mlp": 1.04866827, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.796333172920123, + "language_loss": 0.74395084, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76586002, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3609, + "time_per_iteration": 2.5326688289642334 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.04871368, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.781486329059154, + "language_loss": 0.88848329, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91040266, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3610, + "time_per_iteration": 2.4611029624938965 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.05045652, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.036787917991119, + "language_loss": 0.77587712, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79770797, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3611, + "time_per_iteration": 2.5187723636627197 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.02829766, + "balance_loss_mlp": 1.04609489, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.067133307741882, + "language_loss": 0.63197911, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65378946, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3612, + "time_per_iteration": 2.4585959911346436 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.02504194, + "balance_loss_mlp": 1.04799449, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.9312736490377453, + "language_loss": 0.75120652, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77304518, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9375, + "step": 3613, + "time_per_iteration": 2.4866983890533447 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.02900767, + "balance_loss_mlp": 1.04560208, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 3.0184875495721, + "language_loss": 0.70767504, + "learning_rate": 3.643197365185261e-06, + "loss": 0.72950327, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 3614, + "time_per_iteration": 2.4454689025878906 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_clip": 1.0288837, + "balance_loss_mlp": 1.0491401, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.8064523730299737, + "language_loss": 0.7314586, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75334036, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.92578125, + "step": 3615, + "time_per_iteration": 2.488711357116699 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02832997, + "balance_loss_mlp": 1.04751146, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.7876016160510377, + "language_loss": 0.90045536, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92239082, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3616, + "time_per_iteration": 2.4552054405212402 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02356279, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.4503731233397383, + "language_loss": 0.8111589, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83300173, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3617, + "time_per_iteration": 2.465254068374634 + }, + { + "auxiliary_loss_clip": 0.01143954, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.02928162, + "balance_loss_mlp": 1.04851139, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.7784831572545423, + "language_loss": 0.75509727, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77699506, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3618, + "time_per_iteration": 2.5263705253601074 + }, + { + "auxiliary_loss_clip": 0.0114255, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.03103614, + "balance_loss_mlp": 1.04738426, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.9247647214638754, + "language_loss": 0.69221723, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71413535, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3619, + "time_per_iteration": 2.4615654945373535 + }, + { + "auxiliary_loss_clip": 0.01145954, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.02723491, + "balance_loss_mlp": 1.04906762, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.7662634429670958, + "language_loss": 0.78337491, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80528164, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3620, + "time_per_iteration": 2.4954700469970703 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.02116966, + "balance_loss_mlp": 1.04363799, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.0129000326388695, + "language_loss": 0.79769373, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81940717, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3621, + "time_per_iteration": 2.490427255630493 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04595852, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.7548460288059653, + "language_loss": 0.87967801, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90146828, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3622, + "time_per_iteration": 2.484462022781372 + }, + { + "auxiliary_loss_clip": 0.01142961, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.02067459, + "balance_loss_mlp": 1.04766297, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 4.811459611972859, + "language_loss": 0.76945633, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79128814, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.94921875, + "step": 3623, + "time_per_iteration": 2.4476547241210938 + }, + { + "auxiliary_loss_clip": 0.0114403, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.02992439, + "balance_loss_mlp": 1.04891419, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.1152987510548615, + "language_loss": 0.84886312, + "learning_rate": 3.640974061218741e-06, + "loss": 0.8707844, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3624, + "time_per_iteration": 2.444913387298584 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.010571, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.0487287, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.345969751242133, + "language_loss": 0.77035248, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79236794, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3625, + "time_per_iteration": 2.4511115550994873 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01000008, + "balance_loss_clip": 0.99836272, + "balance_loss_mlp": 1.02361774, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8233389824181596, + "language_loss": 0.60720766, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62780088, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.35546875, + "step": 3626, + "time_per_iteration": 3.21004319190979 + }, + { + "auxiliary_loss_clip": 0.0114194, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.02858984, + "balance_loss_mlp": 1.04572678, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8978511257882154, + "language_loss": 0.90608853, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92797917, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3627, + "time_per_iteration": 2.4744250774383545 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.02346826, + "balance_loss_mlp": 1.04541492, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8495097769686537, + "language_loss": 0.73612916, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75792623, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3628, + "time_per_iteration": 2.4595446586608887 + }, + { + "auxiliary_loss_clip": 0.01137064, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02232444, + "balance_loss_mlp": 1.04432046, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.99633175048199, + "language_loss": 0.76800162, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.78976429, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3629, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01140004, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.02514172, + "balance_loss_mlp": 1.04701388, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5547294213075904, + "language_loss": 0.71320152, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73501503, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3630, + "time_per_iteration": 2.608846426010132 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.0277338, + "balance_loss_mlp": 1.04635286, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8110131954886999, + "language_loss": 0.76331747, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78508776, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3631, + "time_per_iteration": 2.53765869140625 + }, + { + "auxiliary_loss_clip": 0.01138964, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.0237397, + "balance_loss_mlp": 1.0455693, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.0710075205659906, + "language_loss": 0.74879777, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77058685, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3632, + "time_per_iteration": 2.484896421432495 + }, + { + "auxiliary_loss_clip": 0.01136054, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.04511309, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.966664682342333, + "language_loss": 0.83337629, + "learning_rate": 3.638967767095249e-06, + "loss": 0.8550964, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.91015625, + "step": 3633, + "time_per_iteration": 2.4721779823303223 + }, + { + "auxiliary_loss_clip": 0.01136294, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.03228879, + "balance_loss_mlp": 1.04592657, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.8655293845238095, + "language_loss": 0.81782126, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83966839, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3634, + "time_per_iteration": 2.5514795780181885 + }, + { + "auxiliary_loss_clip": 0.01144011, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.02468133, + "balance_loss_mlp": 1.04863131, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.010090632845536, + "language_loss": 0.75077927, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77262932, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.953125, + "step": 3635, + "time_per_iteration": 4.07889199256897 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01053987, + "balance_loss_clip": 1.03775024, + "balance_loss_mlp": 1.04744601, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.2167396678675155, + "language_loss": 0.87881035, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90072685, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3636, + "time_per_iteration": 3.9134533405303955 + }, + { + "auxiliary_loss_clip": 0.01138959, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.03406608, + "balance_loss_mlp": 1.0456109, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 1.9800006249435054, + "language_loss": 0.75948632, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78138912, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3637, + "time_per_iteration": 2.5531604290008545 + }, + { + "auxiliary_loss_clip": 0.01143812, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.02445328, + "balance_loss_mlp": 1.04728055, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 4.376345077988984, + "language_loss": 0.89677018, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91863406, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3638, + "time_per_iteration": 2.435544967651367 + }, + { + "auxiliary_loss_clip": 0.01140476, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.03377736, + "balance_loss_mlp": 1.04854274, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.382131601644944, + "language_loss": 0.89958721, + "learning_rate": 3.637627440557275e-06, + "loss": 0.9214983, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3639, + "time_per_iteration": 2.448150634765625 + }, + { + "auxiliary_loss_clip": 0.01138473, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.02972686, + "balance_loss_mlp": 1.04632282, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7796744672676124, + "language_loss": 0.79038727, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81222755, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3640, + "time_per_iteration": 2.544577121734619 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01051502, + "balance_loss_clip": 1.03291786, + "balance_loss_mlp": 1.05100346, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.046383525913898, + "language_loss": 0.72049212, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74243474, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.91796875, + "step": 3641, + "time_per_iteration": 2.465439558029175 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.05203855, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.4771917366671, + "language_loss": 0.80913448, + "learning_rate": 3.63695643883745e-06, + "loss": 0.8309828, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3642, + "time_per_iteration": 2.4598801136016846 + }, + { + "auxiliary_loss_clip": 0.01144439, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.02319944, + "balance_loss_mlp": 1.05089164, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.0352379603627684, + "language_loss": 0.71573192, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73758006, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3643, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01144262, + "auxiliary_loss_mlp": 0.01048332, + "balance_loss_clip": 1.03115392, + "balance_loss_mlp": 1.05041492, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.9224514767679763, + "language_loss": 0.68172711, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70365304, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3644, + "time_per_iteration": 2.721107244491577 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.0252583, + "balance_loss_mlp": 1.04905653, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.1869112310362504, + "language_loss": 0.77744782, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79931343, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9609375, + "step": 3645, + "time_per_iteration": 2.4838709831237793 + }, + { + "auxiliary_loss_clip": 0.01140139, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.03275371, + "balance_loss_mlp": 1.04988873, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.575077237748942, + "language_loss": 0.82405865, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84594363, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90234375, + "step": 3646, + "time_per_iteration": 2.467958927154541 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02050591, + "balance_loss_mlp": 1.04901123, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7225223193128734, + "language_loss": 0.83016759, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85191214, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3647, + "time_per_iteration": 2.4670159816741943 + }, + { + "auxiliary_loss_clip": 0.01137396, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.02991438, + "balance_loss_mlp": 1.04734278, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.5879018059409027, + "language_loss": 0.72555232, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74738657, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3648, + "time_per_iteration": 2.5572352409362793 + }, + { + "auxiliary_loss_clip": 0.01140287, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04563618, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3666125536095612, + "language_loss": 0.74363017, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76548404, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3649, + "time_per_iteration": 2.4465692043304443 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.02869856, + "balance_loss_mlp": 1.04609215, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.0558746559562953, + "language_loss": 0.86408567, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88586134, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3650, + "time_per_iteration": 2.4408226013183594 + }, + { + "auxiliary_loss_clip": 0.01137285, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.02747929, + "balance_loss_mlp": 1.04549015, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.0425834927064934, + "language_loss": 0.83693743, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85874897, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3651, + "time_per_iteration": 2.502694845199585 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02941298, + "balance_loss_mlp": 1.04595184, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8702009414404626, + "language_loss": 0.74629313, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76812911, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3652, + "time_per_iteration": 2.4422640800476074 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01011234, + "balance_loss_clip": 1.00946999, + "balance_loss_mlp": 1.0194056, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7344385056765022, + "language_loss": 0.51548386, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53612262, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.33203125, + "step": 3653, + "time_per_iteration": 3.0743935108184814 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.03215361, + "balance_loss_mlp": 1.05115473, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.781801507589209, + "language_loss": 0.75256276, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77447224, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3654, + "time_per_iteration": 2.4826300144195557 + }, + { + "auxiliary_loss_clip": 0.01143131, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.02499056, + "balance_loss_mlp": 1.04988194, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.9986760770887892, + "language_loss": 0.72757828, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74942386, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3655, + "time_per_iteration": 2.494662284851074 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.02860177, + "balance_loss_mlp": 1.04802227, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6963533722566047, + "language_loss": 0.80971813, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83156729, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3656, + "time_per_iteration": 2.465020179748535 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.02267933, + "balance_loss_mlp": 1.05085039, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.205234752003223, + "language_loss": 0.84668207, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86849183, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3657, + "time_per_iteration": 2.4626548290252686 + }, + { + "auxiliary_loss_clip": 0.01138622, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.02126312, + "balance_loss_mlp": 1.0460434, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.714181577212399, + "language_loss": 0.80485702, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.8266257, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3658, + "time_per_iteration": 2.492835521697998 + }, + { + "auxiliary_loss_clip": 0.01053481, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.00065756, + "balance_loss_mlp": 1.02029002, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.8995084923077876, + "language_loss": 0.58224851, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60280788, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.33203125, + "step": 3659, + "time_per_iteration": 3.1709213256835938 + }, + { + "auxiliary_loss_clip": 0.01140235, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.04958415, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.4575828715719177, + "language_loss": 0.74535513, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76715136, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3660, + "time_per_iteration": 2.474397897720337 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.02336597, + "balance_loss_mlp": 1.04723859, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 2.0332694306983723, + "language_loss": 0.81225419, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83404779, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91796875, + "step": 3661, + "time_per_iteration": 2.4926669597625732 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.02639949, + "balance_loss_mlp": 1.04773009, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.8682139743879211, + "language_loss": 0.73236209, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75417411, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3662, + "time_per_iteration": 2.5111234188079834 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.03333092, + "balance_loss_mlp": 1.05132473, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6440107639340105, + "language_loss": 0.77800119, + "learning_rate": 3.632243797111929e-06, + "loss": 0.79989552, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3663, + "time_per_iteration": 2.485520601272583 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.02581656, + "balance_loss_mlp": 1.05125535, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 3.566897500342904, + "language_loss": 0.80484056, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8267172, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3664, + "time_per_iteration": 2.4827098846435547 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01042617, + "balance_loss_clip": 1.02354348, + "balance_loss_mlp": 1.04959095, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.530665000734818, + "language_loss": 0.76296824, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78485775, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.96875, + "step": 3665, + "time_per_iteration": 2.5118229389190674 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_clip": 1.0282042, + "balance_loss_mlp": 1.04779172, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.7337119989610468, + "language_loss": 0.97959125, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00143182, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3666, + "time_per_iteration": 2.4461512565612793 + }, + { + "auxiliary_loss_clip": 0.01136729, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.02241421, + "balance_loss_mlp": 1.04582953, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 2.115803047817727, + "language_loss": 0.80494016, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82670087, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3667, + "time_per_iteration": 2.65198016166687 + }, + { + "auxiliary_loss_clip": 0.01144733, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.02776945, + "balance_loss_mlp": 1.04882097, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.916720089378095, + "language_loss": 0.77463895, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79655218, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3668, + "time_per_iteration": 2.459141254425049 + }, + { + "auxiliary_loss_clip": 0.0114207, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02568614, + "balance_loss_mlp": 1.05058837, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.730318389149699, + "language_loss": 0.71514869, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73699689, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3669, + "time_per_iteration": 2.550732135772705 + }, + { + "auxiliary_loss_clip": 0.01139227, + "auxiliary_loss_mlp": 0.01037839, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.04615474, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.0994504177928826, + "language_loss": 0.85294032, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87471098, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3670, + "time_per_iteration": 2.4727606773376465 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.02360499, + "balance_loss_mlp": 1.05130565, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.775856591734502, + "language_loss": 0.76796275, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.789846, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3671, + "time_per_iteration": 2.613104820251465 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.01758265, + "balance_loss_mlp": 1.0487864, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.8820912362302202, + "language_loss": 0.80472648, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82648075, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3672, + "time_per_iteration": 2.4365992546081543 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.03129566, + "balance_loss_mlp": 1.05145025, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8912849075471436, + "language_loss": 0.736193, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75811654, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3673, + "time_per_iteration": 2.4908931255340576 + }, + { + "auxiliary_loss_clip": 0.01145514, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02064395, + "balance_loss_mlp": 1.05221379, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9375944290288487, + "language_loss": 0.76505005, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78688282, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3674, + "time_per_iteration": 2.569312572479248 + }, + { + "auxiliary_loss_clip": 0.01142786, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.02518344, + "balance_loss_mlp": 1.05025005, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.0287396146216055, + "language_loss": 0.74786556, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76972854, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.92578125, + "step": 3675, + "time_per_iteration": 2.4762706756591797 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.02395034, + "balance_loss_mlp": 1.0473659, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.7527405009289938, + "language_loss": 0.80050498, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82232398, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3676, + "time_per_iteration": 2.5846786499023438 + }, + { + "auxiliary_loss_clip": 0.0114147, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.03106666, + "balance_loss_mlp": 1.0474596, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.974355382670518, + "language_loss": 0.75501895, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77690685, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3677, + "time_per_iteration": 4.02753758430481 + }, + { + "auxiliary_loss_clip": 0.01135837, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.02455878, + "balance_loss_mlp": 1.0449332, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 2.0397766719275494, + "language_loss": 0.83412457, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85589325, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3678, + "time_per_iteration": 3.9455032348632812 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.02903211, + "balance_loss_mlp": 1.04866314, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.7724652071984504, + "language_loss": 0.89272189, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91459215, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3679, + "time_per_iteration": 2.548166036605835 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_clip": 1.03517246, + "balance_loss_mlp": 1.04887235, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.4577897330130773, + "language_loss": 0.86718571, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88914388, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3680, + "time_per_iteration": 2.468712329864502 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.02809739, + "balance_loss_mlp": 1.05175805, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.0752123015423556, + "language_loss": 0.81897914, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84083802, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3681, + "time_per_iteration": 2.532210350036621 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.02076972, + "balance_loss_mlp": 1.04784071, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.44274183004677, + "language_loss": 0.79908317, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82081306, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 3682, + "time_per_iteration": 2.491135358810425 + }, + { + "auxiliary_loss_clip": 0.01140313, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04739022, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 2.2064811404605376, + "language_loss": 0.77283889, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79466248, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 3683, + "time_per_iteration": 2.503041982650757 + }, + { + "auxiliary_loss_clip": 0.01141417, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.02824235, + "balance_loss_mlp": 1.04623342, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.114071962716483, + "language_loss": 0.72779894, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74966961, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3684, + "time_per_iteration": 2.521495819091797 + }, + { + "auxiliary_loss_clip": 0.01142849, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.02598643, + "balance_loss_mlp": 1.05060613, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.9389187138945425, + "language_loss": 0.80108052, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82294679, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3685, + "time_per_iteration": 2.436958074569702 + }, + { + "auxiliary_loss_clip": 0.01135153, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.02504683, + "balance_loss_mlp": 1.04634571, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.5568750132404718, + "language_loss": 0.87128556, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89303845, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 3686, + "time_per_iteration": 2.5519070625305176 + }, + { + "auxiliary_loss_clip": 0.01138026, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.02552581, + "balance_loss_mlp": 1.04762685, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.942015126167962, + "language_loss": 0.77953136, + "learning_rate": 3.626824502298707e-06, + "loss": 0.8013379, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3687, + "time_per_iteration": 2.495084285736084 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01048705, + "balance_loss_clip": 1.03085971, + "balance_loss_mlp": 1.05057812, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8313314390802422, + "language_loss": 0.84722549, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86917698, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3688, + "time_per_iteration": 2.5029165744781494 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.02897787, + "balance_loss_mlp": 1.05005932, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.7913489877281905, + "language_loss": 0.81395769, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83588976, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3689, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.0114216, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.02758622, + "balance_loss_mlp": 1.04985952, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 2.5504206662352082, + "language_loss": 0.70040542, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72227693, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3690, + "time_per_iteration": 2.5005807876586914 + }, + { + "auxiliary_loss_clip": 0.01145048, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.04890513, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.7318147752747124, + "language_loss": 0.72394359, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74577713, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3691, + "time_per_iteration": 2.4835989475250244 + }, + { + "auxiliary_loss_clip": 0.01145815, + "auxiliary_loss_mlp": 0.01049746, + "balance_loss_clip": 1.03169739, + "balance_loss_mlp": 1.05317688, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.1843836481793057, + "language_loss": 0.71611524, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73807085, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.92578125, + "step": 3692, + "time_per_iteration": 2.515230655670166 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01044658, + "balance_loss_clip": 1.02750337, + "balance_loss_mlp": 1.05008483, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 2.7650002202849113, + "language_loss": 0.87580657, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89772147, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.96875, + "step": 3693, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.02086258, + "balance_loss_mlp": 1.04947054, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 3.031177285152565, + "language_loss": 0.85307622, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87482512, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.890625, + "step": 3694, + "time_per_iteration": 2.4828481674194336 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9517253418741858, + "language_loss": 0.69055748, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71244752, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 3695, + "time_per_iteration": 2.49957537651062 + }, + { + "auxiliary_loss_clip": 0.01141491, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02438951, + "balance_loss_mlp": 1.05095696, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4867456423055678, + "language_loss": 0.71710318, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73891842, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 3696, + "time_per_iteration": 2.5991299152374268 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.02387977, + "balance_loss_mlp": 1.0483942, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.901791440824732, + "language_loss": 0.87694812, + "learning_rate": 3.624555968803217e-06, + "loss": 0.8987658, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3697, + "time_per_iteration": 2.524841547012329 + }, + { + "auxiliary_loss_clip": 0.01134138, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.03020072, + "balance_loss_mlp": 1.04646909, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.985465494359005, + "language_loss": 0.66109681, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68289793, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3698, + "time_per_iteration": 2.6806552410125732 + }, + { + "auxiliary_loss_clip": 0.01143188, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.049245, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9701476357110561, + "language_loss": 0.82699466, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84881532, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9375, + "step": 3699, + "time_per_iteration": 2.620795965194702 + }, + { + "auxiliary_loss_clip": 0.01141189, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02789021, + "balance_loss_mlp": 1.04960978, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6593732889446324, + "language_loss": 0.79488564, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81674713, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3700, + "time_per_iteration": 2.4886739253997803 + }, + { + "auxiliary_loss_clip": 0.01148421, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_clip": 1.02639139, + "balance_loss_mlp": 1.05154204, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 7.082418544009014, + "language_loss": 0.72063768, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74257213, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96875, + "step": 3701, + "time_per_iteration": 2.7293899059295654 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.02520323, + "balance_loss_mlp": 1.04706395, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.9269634413479926, + "language_loss": 0.79704928, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81886196, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3702, + "time_per_iteration": 2.5527849197387695 + }, + { + "auxiliary_loss_clip": 0.01132932, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02487028, + "balance_loss_mlp": 1.04518211, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 2.7410709876553447, + "language_loss": 0.78632712, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80807453, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 3703, + "time_per_iteration": 2.4955005645751953 + }, + { + "auxiliary_loss_clip": 0.01140692, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.01810527, + "balance_loss_mlp": 1.0468421, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.8479834568020117, + "language_loss": 0.74212444, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7639066, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9375, + "step": 3704, + "time_per_iteration": 2.5000903606414795 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.02618146, + "balance_loss_mlp": 1.05030012, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.7361108874663713, + "language_loss": 0.64372134, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66553271, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3705, + "time_per_iteration": 2.6993744373321533 + }, + { + "auxiliary_loss_clip": 0.01064369, + "auxiliary_loss_mlp": 0.01006302, + "balance_loss_clip": 1.00454926, + "balance_loss_mlp": 1.03098035, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.353184132187748, + "language_loss": 0.65301311, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67371976, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.33398438, + "step": 3706, + "time_per_iteration": 2.9832844734191895 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02076256, + "balance_loss_mlp": 1.0461061, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 3.09427451037038, + "language_loss": 0.80608439, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82783049, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91796875, + "step": 3707, + "time_per_iteration": 2.5236454010009766 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.02184916, + "balance_loss_mlp": 1.04706407, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.0318896185848057, + "language_loss": 0.78124011, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80301505, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3708, + "time_per_iteration": 2.5254104137420654 + }, + { + "auxiliary_loss_clip": 0.01142891, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.02291107, + "balance_loss_mlp": 1.04897153, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.913582269302705, + "language_loss": 0.79989487, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82172012, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3709, + "time_per_iteration": 2.5528371334075928 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.02913201, + "balance_loss_mlp": 1.04580092, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.062693768306912, + "language_loss": 0.68752408, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70937693, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3710, + "time_per_iteration": 2.511275053024292 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.0221858, + "balance_loss_mlp": 1.04812646, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.3083581079415216, + "language_loss": 0.90696692, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92880082, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3711, + "time_per_iteration": 2.4757487773895264 + }, + { + "auxiliary_loss_clip": 0.01138091, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.03406, + "balance_loss_mlp": 1.04603434, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.758927620438821, + "language_loss": 0.89628232, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91818309, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.921875, + "step": 3712, + "time_per_iteration": 2.3870105743408203 + }, + { + "auxiliary_loss_clip": 0.01139482, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.03356993, + "balance_loss_mlp": 1.04956841, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.039950461935961, + "language_loss": 0.74859631, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77050602, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.8984375, + "step": 3713, + "time_per_iteration": 2.4336304664611816 + }, + { + "auxiliary_loss_clip": 0.01138793, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.02543497, + "balance_loss_mlp": 1.048329, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.8221921578975473, + "language_loss": 0.62592143, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64772761, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3714, + "time_per_iteration": 2.6230995655059814 + }, + { + "auxiliary_loss_clip": 0.01139199, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.04734552, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.9329837891440178, + "language_loss": 0.79052407, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81228578, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3715, + "time_per_iteration": 2.510436534881592 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03372216, + "balance_loss_mlp": 1.05021942, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.6633570096565886, + "language_loss": 0.77182817, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79375589, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3716, + "time_per_iteration": 2.4398605823516846 + }, + { + "auxiliary_loss_clip": 0.01141179, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.03046429, + "balance_loss_mlp": 1.04845762, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.028714583879474, + "language_loss": 0.79209757, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81397963, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3717, + "time_per_iteration": 2.456042766571045 + }, + { + "auxiliary_loss_clip": 0.01143546, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.04934192, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.2103373086531115, + "language_loss": 0.68029571, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70214242, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3718, + "time_per_iteration": 2.4818973541259766 + }, + { + "auxiliary_loss_clip": 0.01142458, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.02067208, + "balance_loss_mlp": 1.04784536, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.9912565029374794, + "language_loss": 0.80194163, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8237524, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9453125, + "step": 3719, + "time_per_iteration": 3.985903263092041 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01052597, + "balance_loss_clip": 1.03396416, + "balance_loss_mlp": 1.04785836, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.0930960597239707, + "language_loss": 0.86421579, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88619983, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3720, + "time_per_iteration": 3.914626359939575 + }, + { + "auxiliary_loss_clip": 0.0114136, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.01682639, + "balance_loss_mlp": 1.05105066, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.6398614781610892, + "language_loss": 0.74860299, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77035284, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 3721, + "time_per_iteration": 2.485271453857422 + }, + { + "auxiliary_loss_clip": 0.011451, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.0256865, + "balance_loss_mlp": 1.0494988, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.928465692067959, + "language_loss": 0.78943181, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81131673, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3722, + "time_per_iteration": 2.471928834915161 + }, + { + "auxiliary_loss_clip": 0.01140042, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02551126, + "balance_loss_mlp": 1.05004597, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2482737248582247, + "language_loss": 0.82315016, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84496701, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3723, + "time_per_iteration": 2.4540791511535645 + }, + { + "auxiliary_loss_clip": 0.01144828, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.02729177, + "balance_loss_mlp": 1.05062389, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.154682666342997, + "language_loss": 0.84433442, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86622941, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3724, + "time_per_iteration": 2.526204824447632 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.04889762, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.178002887638817, + "language_loss": 0.79036546, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81216478, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9140625, + "step": 3725, + "time_per_iteration": 2.513136625289917 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02271581, + "balance_loss_mlp": 1.04898071, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.6732241790302085, + "language_loss": 0.77158499, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79337394, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90625, + "step": 3726, + "time_per_iteration": 2.5645246505737305 + }, + { + "auxiliary_loss_clip": 0.01150164, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.02677917, + "balance_loss_mlp": 1.05054045, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.7042555627132296, + "language_loss": 0.72376108, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74571931, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 3727, + "time_per_iteration": 2.4437429904937744 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02140689, + "balance_loss_mlp": 1.04682648, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.2876633759350327, + "language_loss": 0.86584771, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88769633, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3728, + "time_per_iteration": 2.496020793914795 + }, + { + "auxiliary_loss_clip": 0.01143576, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02838981, + "balance_loss_mlp": 1.05045211, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.0817566504616734, + "language_loss": 0.80479026, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82670236, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9296875, + "step": 3729, + "time_per_iteration": 2.4733448028564453 + }, + { + "auxiliary_loss_clip": 0.01136706, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_clip": 1.03019357, + "balance_loss_mlp": 1.04672551, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 2.3054621640206205, + "language_loss": 0.86468041, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88651037, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3730, + "time_per_iteration": 2.5348362922668457 + }, + { + "auxiliary_loss_clip": 0.01136756, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.01917958, + "balance_loss_mlp": 1.04737782, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.75673058423422, + "language_loss": 0.73293322, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75465709, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 3731, + "time_per_iteration": 2.4397478103637695 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.02370882, + "balance_loss_mlp": 1.04893279, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.4044438539905575, + "language_loss": 0.75237334, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77418989, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3732, + "time_per_iteration": 2.476630926132202 + }, + { + "auxiliary_loss_clip": 0.01141784, + "auxiliary_loss_mlp": 0.01058138, + "balance_loss_clip": 1.04106712, + "balance_loss_mlp": 1.0494858, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.8584104659795708, + "language_loss": 0.88037199, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90237123, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3733, + "time_per_iteration": 2.4723222255706787 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.02473271, + "balance_loss_mlp": 1.04564941, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6767676579772364, + "language_loss": 0.84200239, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86380494, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3734, + "time_per_iteration": 2.5214619636535645 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03170574, + "balance_loss_mlp": 1.0513525, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6368426378189131, + "language_loss": 0.76838279, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79030693, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3735, + "time_per_iteration": 2.5025858879089355 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.032215, + "balance_loss_mlp": 1.04791164, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 3.6998773026048046, + "language_loss": 0.84505916, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86688507, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 3736, + "time_per_iteration": 2.581409454345703 + }, + { + "auxiliary_loss_clip": 0.0114079, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.02306545, + "balance_loss_mlp": 1.04848719, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 2.2208030259376192, + "language_loss": 0.86398852, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88579136, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3737, + "time_per_iteration": 2.4498212337493896 + }, + { + "auxiliary_loss_clip": 0.01141365, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.02917397, + "balance_loss_mlp": 1.0476644, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 2.434824168439142, + "language_loss": 0.79145718, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81334245, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3738, + "time_per_iteration": 2.5505504608154297 + }, + { + "auxiliary_loss_clip": 0.01140019, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.0471611, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 2.2711438439691314, + "language_loss": 0.75895345, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78076756, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3739, + "time_per_iteration": 2.458307981491089 + }, + { + "auxiliary_loss_clip": 0.01137257, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.04610491, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.9782758832921432, + "language_loss": 0.74705702, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76885068, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3740, + "time_per_iteration": 2.5424981117248535 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.04691672, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.174963459036685, + "language_loss": 0.76083958, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78261012, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3741, + "time_per_iteration": 2.4539613723754883 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02263021, + "balance_loss_mlp": 1.05022252, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.917686629559915, + "language_loss": 0.87458241, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89636862, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3742, + "time_per_iteration": 2.483146905899048 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.03403831, + "balance_loss_mlp": 1.04824293, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.0726823880461116, + "language_loss": 0.81939828, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84128648, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3743, + "time_per_iteration": 2.4786789417266846 + }, + { + "auxiliary_loss_clip": 0.01140562, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.02241504, + "balance_loss_mlp": 1.04843307, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 3.9980575521347697, + "language_loss": 0.63616955, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65796053, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.921875, + "step": 3744, + "time_per_iteration": 2.4746344089508057 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.02634597, + "balance_loss_mlp": 1.04524422, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 3.3106228370485806, + "language_loss": 0.75711048, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77891332, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3745, + "time_per_iteration": 2.4295878410339355 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02142549, + "balance_loss_mlp": 1.04637384, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.8117958881819525, + "language_loss": 0.80839783, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83013999, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3746, + "time_per_iteration": 2.4423928260803223 + }, + { + "auxiliary_loss_clip": 0.01138701, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.02903056, + "balance_loss_mlp": 1.04503584, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.508960709641407, + "language_loss": 0.86067426, + "learning_rate": 3.613121069229862e-06, + "loss": 0.8825202, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3747, + "time_per_iteration": 2.471223831176758 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.01789808, + "balance_loss_mlp": 1.04515314, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.812236682782158, + "language_loss": 0.76358509, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78529495, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.91796875, + "step": 3748, + "time_per_iteration": 2.525108575820923 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.0274291, + "balance_loss_mlp": 1.04882264, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.7339876982656162, + "language_loss": 0.79497123, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81683606, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3749, + "time_per_iteration": 2.4881162643432617 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.04609084, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6101192523185979, + "language_loss": 0.8009423, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82267606, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8828125, + "step": 3750, + "time_per_iteration": 2.4656643867492676 + }, + { + "auxiliary_loss_clip": 0.01140861, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.02733183, + "balance_loss_mlp": 1.04821157, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.418289881699729, + "language_loss": 0.81336129, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83521116, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3751, + "time_per_iteration": 2.4960029125213623 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.02242589, + "balance_loss_mlp": 1.04915667, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.757449596716865, + "language_loss": 0.83989275, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86169416, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3752, + "time_per_iteration": 2.4668636322021484 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.02375996, + "balance_loss_mlp": 1.04671109, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.7780915453784651, + "language_loss": 0.78616595, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80792689, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.90234375, + "step": 3753, + "time_per_iteration": 2.4305062294006348 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.02113724, + "balance_loss_mlp": 1.04717183, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.990408742554116, + "language_loss": 0.78284466, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80460101, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3754, + "time_per_iteration": 2.584170341491699 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.0243969, + "balance_loss_mlp": 1.04882884, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.915767444367904, + "language_loss": 0.70267534, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72444952, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 3755, + "time_per_iteration": 2.458731174468994 + }, + { + "auxiliary_loss_clip": 0.01145193, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.031744, + "balance_loss_mlp": 1.0502069, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.7446757969812783, + "language_loss": 0.77373838, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79567063, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3756, + "time_per_iteration": 2.5073161125183105 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_clip": 1.02498841, + "balance_loss_mlp": 1.05014277, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.8909279955578986, + "language_loss": 0.82552433, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.847399, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3757, + "time_per_iteration": 2.471353054046631 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.02258492, + "balance_loss_mlp": 1.04810619, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8410990661161322, + "language_loss": 0.73181808, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.7536208, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3758, + "time_per_iteration": 2.5376477241516113 + }, + { + "auxiliary_loss_clip": 0.01144551, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.02985883, + "balance_loss_mlp": 1.04991663, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.0967514749881015, + "language_loss": 0.77208662, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79399836, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3759, + "time_per_iteration": 2.447608709335327 + }, + { + "auxiliary_loss_clip": 0.01141959, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02643597, + "balance_loss_mlp": 1.04806697, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.9036057015372598, + "language_loss": 0.78638428, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80824387, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3760, + "time_per_iteration": 4.231990098953247 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01007925, + "balance_loss_clip": 1.00607765, + "balance_loss_mlp": 1.02028942, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9344871733021222, + "language_loss": 0.60090166, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62152445, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.33984375, + "step": 3761, + "time_per_iteration": 4.482504367828369 + }, + { + "auxiliary_loss_clip": 0.0114253, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.02678633, + "balance_loss_mlp": 1.0478611, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.386395888426225, + "language_loss": 0.77400732, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79587454, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3762, + "time_per_iteration": 2.5162198543548584 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02395821, + "balance_loss_mlp": 1.05073345, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.10132066013886, + "language_loss": 0.78800118, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.80984461, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3763, + "time_per_iteration": 2.4578778743743896 + }, + { + "auxiliary_loss_clip": 0.01145794, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.03583384, + "balance_loss_mlp": 1.05000031, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.8659674868358982, + "language_loss": 0.91363662, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93563628, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.95703125, + "step": 3764, + "time_per_iteration": 2.536231517791748 + }, + { + "auxiliary_loss_clip": 0.01138186, + "auxiliary_loss_mlp": 0.01054666, + "balance_loss_clip": 1.03740454, + "balance_loss_mlp": 1.04773271, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.6188972360392109, + "language_loss": 0.75211406, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77404261, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 3765, + "time_per_iteration": 2.516646146774292 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.02868426, + "balance_loss_mlp": 1.04855943, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.9315012383394614, + "language_loss": 0.89618981, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91804343, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3766, + "time_per_iteration": 2.4829306602478027 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.02568591, + "balance_loss_mlp": 1.04891181, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6662033714223943, + "language_loss": 0.74710411, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76891464, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 3767, + "time_per_iteration": 2.4989218711853027 + }, + { + "auxiliary_loss_clip": 0.011397, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.02664912, + "balance_loss_mlp": 1.04619229, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4804117361030718, + "language_loss": 0.7156831, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73752159, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3768, + "time_per_iteration": 2.5078160762786865 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.05247319, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.80046116612075, + "language_loss": 0.78268003, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80466181, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3769, + "time_per_iteration": 2.5122978687286377 + }, + { + "auxiliary_loss_clip": 0.01142038, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.02465522, + "balance_loss_mlp": 1.0467639, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.7393050758681738, + "language_loss": 0.68427956, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70612001, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3770, + "time_per_iteration": 2.557098150253296 + }, + { + "auxiliary_loss_clip": 0.01141766, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02537811, + "balance_loss_mlp": 1.04682195, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6251414008252867, + "language_loss": 0.80370939, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82554382, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3771, + "time_per_iteration": 2.5156240463256836 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.0290848, + "balance_loss_mlp": 1.04606724, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.567346312954514, + "language_loss": 0.78844583, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81025243, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 3772, + "time_per_iteration": 2.539632558822632 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01005348, + "balance_loss_clip": 1.00351191, + "balance_loss_mlp": 1.02012253, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6518085485856671, + "language_loss": 0.54334348, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56392735, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.33007812, + "step": 3773, + "time_per_iteration": 3.1463003158569336 + }, + { + "auxiliary_loss_clip": 0.01136639, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02222633, + "balance_loss_mlp": 1.04712117, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.9230264173849037, + "language_loss": 0.70101082, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72276813, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3774, + "time_per_iteration": 2.5099127292633057 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.02232277, + "balance_loss_mlp": 1.04480648, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.4369678263863057, + "language_loss": 0.74585366, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76758826, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 3775, + "time_per_iteration": 2.4441745281219482 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.02311933, + "balance_loss_mlp": 1.04534245, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.175545430509675, + "language_loss": 0.8256253, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8473829, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3776, + "time_per_iteration": 2.4418301582336426 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.02125907, + "balance_loss_mlp": 1.04619908, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.75835757539417, + "language_loss": 0.83031607, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85209382, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3777, + "time_per_iteration": 2.5585062503814697 + }, + { + "auxiliary_loss_clip": 0.01137385, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.02232909, + "balance_loss_mlp": 1.04596353, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.6678368583827288, + "language_loss": 0.72658038, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74834561, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3778, + "time_per_iteration": 2.5019333362579346 + }, + { + "auxiliary_loss_clip": 0.0113896, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.02386749, + "balance_loss_mlp": 1.04576886, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.229609453971581, + "language_loss": 0.6414392, + "learning_rate": 3.605722410602591e-06, + "loss": 0.663234, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3779, + "time_per_iteration": 2.5082859992980957 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.02794909, + "balance_loss_mlp": 1.04837573, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.9715072832436495, + "language_loss": 0.70546824, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72728658, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3780, + "time_per_iteration": 2.4703643321990967 + }, + { + "auxiliary_loss_clip": 0.01140054, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_clip": 1.02689338, + "balance_loss_mlp": 1.0489254, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 2.5454366084291133, + "language_loss": 0.89717996, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91902977, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 3781, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.0113992, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.02436364, + "balance_loss_mlp": 1.04648304, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.4601522898780805, + "language_loss": 0.7434786, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76529634, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3782, + "time_per_iteration": 2.4665582180023193 + }, + { + "auxiliary_loss_clip": 0.01136804, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02587914, + "balance_loss_mlp": 1.04467201, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.6148985015615094, + "language_loss": 0.82393098, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84571576, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3783, + "time_per_iteration": 2.4820034503936768 + }, + { + "auxiliary_loss_clip": 0.01137013, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.02310586, + "balance_loss_mlp": 1.04418266, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 2.4165791890347714, + "language_loss": 0.75874048, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78051311, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3784, + "time_per_iteration": 2.5087246894836426 + }, + { + "auxiliary_loss_clip": 0.01134704, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02303135, + "balance_loss_mlp": 1.04345798, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6490497895559066, + "language_loss": 0.70716858, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72891551, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3785, + "time_per_iteration": 2.4733574390411377 + }, + { + "auxiliary_loss_clip": 0.01051525, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00252521, + "balance_loss_mlp": 1.01740241, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8187947911361427, + "language_loss": 0.61915314, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63971269, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.34179688, + "step": 3786, + "time_per_iteration": 3.0474631786346436 + }, + { + "auxiliary_loss_clip": 0.01143523, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02488649, + "balance_loss_mlp": 1.04777002, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.6740153696427247, + "language_loss": 0.86285794, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88471758, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3787, + "time_per_iteration": 2.4331281185150146 + }, + { + "auxiliary_loss_clip": 0.01137002, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04612255, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.2844293081892826, + "language_loss": 0.72555876, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74733031, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 3788, + "time_per_iteration": 2.5378167629241943 + }, + { + "auxiliary_loss_clip": 0.01136486, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.01929688, + "balance_loss_mlp": 1.04552293, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.4737623033533587, + "language_loss": 0.67524469, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69697154, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3789, + "time_per_iteration": 2.412086248397827 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.02416384, + "balance_loss_mlp": 1.04507327, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.1501364843402335, + "language_loss": 0.76075745, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78253406, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 3790, + "time_per_iteration": 2.503600835800171 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02174211, + "balance_loss_mlp": 1.04253387, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.0794940610838397, + "language_loss": 0.90613973, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92787266, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3791, + "time_per_iteration": 2.4503557682037354 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02000308, + "balance_loss_mlp": 1.04407096, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.8390004860332834, + "language_loss": 0.82869208, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85044241, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3792, + "time_per_iteration": 2.5451550483703613 + }, + { + "auxiliary_loss_clip": 0.01045824, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.01015747, + "balance_loss_mlp": 1.01168287, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1436128607221614, + "language_loss": 0.65615487, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67673355, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.34179688, + "step": 3793, + "time_per_iteration": 2.7929015159606934 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01050177, + "balance_loss_clip": 1.03241456, + "balance_loss_mlp": 1.04557967, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.282271850248546, + "language_loss": 0.77100229, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79292452, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 3794, + "time_per_iteration": 2.4882023334503174 + }, + { + "auxiliary_loss_clip": 0.01139112, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02146518, + "balance_loss_mlp": 1.04517698, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.1931228295055716, + "language_loss": 0.80724937, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82902336, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3795, + "time_per_iteration": 2.475311279296875 + }, + { + "auxiliary_loss_clip": 0.0113575, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02747297, + "balance_loss_mlp": 1.04336488, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.8416311408581074, + "language_loss": 0.77002209, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79182816, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3796, + "time_per_iteration": 2.4734761714935303 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.02236056, + "balance_loss_mlp": 1.04312813, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.506500245398156, + "language_loss": 0.9594354, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98118514, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3797, + "time_per_iteration": 2.4146203994750977 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.04537892, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6428427275001165, + "language_loss": 0.81446218, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83624852, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3798, + "time_per_iteration": 2.490849733352661 + }, + { + "auxiliary_loss_clip": 0.01137089, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.02080309, + "balance_loss_mlp": 1.04262519, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.3515161945239833, + "language_loss": 0.78744864, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.80920684, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3799, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.0332408, + "balance_loss_mlp": 1.04381084, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.655995083326211, + "language_loss": 0.75234401, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77421868, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3800, + "time_per_iteration": 2.510788917541504 + }, + { + "auxiliary_loss_clip": 0.01137174, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.04583156, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.661997570582357, + "language_loss": 0.63433349, + "learning_rate": 3.600599647297484e-06, + "loss": 0.6560958, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3801, + "time_per_iteration": 2.503643035888672 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.02027762, + "balance_loss_mlp": 1.04721296, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7846583359688928, + "language_loss": 0.81602335, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83774745, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3802, + "time_per_iteration": 4.002788782119751 + }, + { + "auxiliary_loss_clip": 0.01138233, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04454207, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7683413549342115, + "language_loss": 0.78830242, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81015933, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3803, + "time_per_iteration": 3.9494168758392334 + }, + { + "auxiliary_loss_clip": 0.01135958, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.04115725, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.6939241338011581, + "language_loss": 0.85561395, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87740004, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3804, + "time_per_iteration": 2.4504544734954834 + }, + { + "auxiliary_loss_clip": 0.01139159, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.02744436, + "balance_loss_mlp": 1.04339862, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.1651494765134736, + "language_loss": 0.76485813, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3805, + "time_per_iteration": 2.4578893184661865 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.02560234, + "balance_loss_mlp": 1.04387915, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.4014048134005628, + "language_loss": 0.79309744, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81492996, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3806, + "time_per_iteration": 2.415726900100708 + }, + { + "auxiliary_loss_clip": 0.01139425, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.02717948, + "balance_loss_mlp": 1.04547703, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.230394288716221, + "language_loss": 0.69194484, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71377647, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3807, + "time_per_iteration": 2.6051764488220215 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.03484392, + "balance_loss_mlp": 1.04811931, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.5207266425605668, + "language_loss": 0.65717816, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67915517, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3808, + "time_per_iteration": 2.463885545730591 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.02564931, + "balance_loss_mlp": 1.04470515, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8002654314964242, + "language_loss": 0.74498177, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76677001, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3809, + "time_per_iteration": 2.4587652683258057 + }, + { + "auxiliary_loss_clip": 0.01138179, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.02695227, + "balance_loss_mlp": 1.04707646, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.6413135962032894, + "language_loss": 0.81699908, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83881009, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3810, + "time_per_iteration": 2.454545736312866 + }, + { + "auxiliary_loss_clip": 0.01135521, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.02448893, + "balance_loss_mlp": 1.04428005, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.1876822434942245, + "language_loss": 0.78671384, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8084712, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9140625, + "step": 3811, + "time_per_iteration": 2.4564197063446045 + }, + { + "auxiliary_loss_clip": 0.01135961, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.0296042, + "balance_loss_mlp": 1.04317403, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.8120535445273127, + "language_loss": 0.82811391, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84994221, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3812, + "time_per_iteration": 2.4357566833496094 + }, + { + "auxiliary_loss_clip": 0.01144518, + "auxiliary_loss_mlp": 0.01051465, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04750013, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.041111828111396, + "language_loss": 0.82337058, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84533036, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.96875, + "step": 3813, + "time_per_iteration": 2.4521987438201904 + }, + { + "auxiliary_loss_clip": 0.01139715, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.04595184, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 3.1740680187078896, + "language_loss": 0.69927102, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72113466, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3814, + "time_per_iteration": 2.5528602600097656 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.02773738, + "balance_loss_mlp": 1.04310775, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.479981906508555, + "language_loss": 0.67106915, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69285899, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3815, + "time_per_iteration": 2.4768760204315186 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01052238, + "balance_loss_clip": 1.03593004, + "balance_loss_mlp": 1.04644942, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8467960453518941, + "language_loss": 0.83103681, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85295475, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3816, + "time_per_iteration": 2.507967710494995 + }, + { + "auxiliary_loss_clip": 0.0113842, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.04643357, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.1171855882825636, + "language_loss": 0.86756372, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8893379, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3817, + "time_per_iteration": 2.4445815086364746 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03278852, + "balance_loss_mlp": 1.04829407, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.403232678237585, + "language_loss": 0.75039381, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77230936, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3818, + "time_per_iteration": 2.508527994155884 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.02426732, + "balance_loss_mlp": 1.04769611, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6537639427714739, + "language_loss": 0.74597251, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76779795, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3819, + "time_per_iteration": 2.5009493827819824 + }, + { + "auxiliary_loss_clip": 0.01138376, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 1.04632187, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.815385500594849, + "language_loss": 0.80775046, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.8295821, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3820, + "time_per_iteration": 2.5374531745910645 + }, + { + "auxiliary_loss_clip": 0.01142613, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.04725921, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.0886359367899763, + "language_loss": 0.69226766, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71411359, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3821, + "time_per_iteration": 2.4539082050323486 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.01912975, + "balance_loss_mlp": 1.0466336, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.623620301878745, + "language_loss": 0.82655883, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.84831905, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3822, + "time_per_iteration": 2.5025360584259033 + }, + { + "auxiliary_loss_clip": 0.01137437, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.02264285, + "balance_loss_mlp": 1.04520607, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.581563173789708, + "language_loss": 0.66093826, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68272227, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.921875, + "step": 3823, + "time_per_iteration": 2.500643253326416 + }, + { + "auxiliary_loss_clip": 0.0104753, + "auxiliary_loss_mlp": 0.01001124, + "balance_loss_clip": 0.99913329, + "balance_loss_mlp": 1.01448655, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8191682875264555, + "language_loss": 0.56770015, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58818674, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33203125, + "step": 3824, + "time_per_iteration": 3.1365485191345215 + }, + { + "auxiliary_loss_clip": 0.01135805, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04575276, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.487273324074565, + "language_loss": 0.72840559, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75015926, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3825, + "time_per_iteration": 2.444730758666992 + }, + { + "auxiliary_loss_clip": 0.01143286, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.02810407, + "balance_loss_mlp": 1.04978526, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8892090994393747, + "language_loss": 0.87760615, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89949936, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3826, + "time_per_iteration": 2.492682456970215 + }, + { + "auxiliary_loss_clip": 0.01142911, + "auxiliary_loss_mlp": 0.01043844, + "balance_loss_clip": 1.0258677, + "balance_loss_mlp": 1.04683542, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.6663888482282623, + "language_loss": 0.81568289, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8375504, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3827, + "time_per_iteration": 2.488593578338623 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.02750623, + "balance_loss_mlp": 1.04553437, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.8456206141648608, + "language_loss": 0.86791205, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88970977, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3828, + "time_per_iteration": 2.4386606216430664 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01049169, + "balance_loss_clip": 1.03147864, + "balance_loss_mlp": 1.04512644, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.106420485404446, + "language_loss": 0.70638877, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72826439, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.93359375, + "step": 3829, + "time_per_iteration": 2.475399971008301 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02241933, + "balance_loss_mlp": 1.05011487, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5719627508253273, + "language_loss": 0.84045994, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86223942, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3830, + "time_per_iteration": 2.4943718910217285 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.03172636, + "balance_loss_mlp": 1.04637957, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.733206127117623, + "language_loss": 0.66863495, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69051576, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3831, + "time_per_iteration": 2.6513662338256836 + }, + { + "auxiliary_loss_clip": 0.01141151, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.04735637, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.238850649877041, + "language_loss": 0.75253022, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77436894, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3832, + "time_per_iteration": 2.4889180660247803 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.02515745, + "balance_loss_mlp": 1.04709673, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.8583815246829203, + "language_loss": 0.87474239, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89656878, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.92578125, + "step": 3833, + "time_per_iteration": 2.46744966506958 + }, + { + "auxiliary_loss_clip": 0.01140821, + "auxiliary_loss_mlp": 0.01047215, + "balance_loss_clip": 1.02950096, + "balance_loss_mlp": 1.0478369, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 3.2120713643012206, + "language_loss": 0.74875945, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77063978, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3834, + "time_per_iteration": 2.485504627227783 + }, + { + "auxiliary_loss_clip": 0.0113943, + "auxiliary_loss_mlp": 0.01051682, + "balance_loss_clip": 1.03408706, + "balance_loss_mlp": 1.0484879, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.820281268490984, + "language_loss": 0.85338157, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87529278, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3835, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.01146651, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_clip": 1.03142083, + "balance_loss_mlp": 1.04814029, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.799799470431086, + "language_loss": 0.81974924, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84170854, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.984375, + "step": 3836, + "time_per_iteration": 2.464657783508301 + }, + { + "auxiliary_loss_clip": 0.0114557, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.02726793, + "balance_loss_mlp": 1.05202293, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.7793450137018207, + "language_loss": 0.79603267, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81792545, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3837, + "time_per_iteration": 2.4715559482574463 + }, + { + "auxiliary_loss_clip": 0.01055276, + "auxiliary_loss_mlp": 0.01017826, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.02046371, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9409846751082755, + "language_loss": 0.65487945, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67561042, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.34765625, + "step": 3838, + "time_per_iteration": 2.9852375984191895 + }, + { + "auxiliary_loss_clip": 0.01139351, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.03131008, + "balance_loss_mlp": 1.04721856, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.6310373190732648, + "language_loss": 0.7527796, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77465028, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3839, + "time_per_iteration": 2.4290778636932373 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.02486694, + "balance_loss_mlp": 1.0510757, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 4.016837458595543, + "language_loss": 0.68691337, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70878816, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3840, + "time_per_iteration": 2.456422805786133 + }, + { + "auxiliary_loss_clip": 0.01143425, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.04936612, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.1574295618121426, + "language_loss": 0.79412574, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81595719, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9375, + "step": 3841, + "time_per_iteration": 2.4762818813323975 + }, + { + "auxiliary_loss_clip": 0.01141641, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02557695, + "balance_loss_mlp": 1.04872346, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 5.070488540070664, + "language_loss": 0.83171731, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85354722, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3842, + "time_per_iteration": 2.4908032417297363 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.0255568, + "balance_loss_mlp": 1.04567111, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.6842769818445011, + "language_loss": 0.66523731, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68706262, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3843, + "time_per_iteration": 2.6503937244415283 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.02642977, + "balance_loss_mlp": 1.04793119, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.8910129932977493, + "language_loss": 0.77445257, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79627681, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 3844, + "time_per_iteration": 5.4645676612854 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.04744804, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6615026518232119, + "language_loss": 0.77974623, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80158317, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3845, + "time_per_iteration": 2.467289686203003 + }, + { + "auxiliary_loss_clip": 0.01133475, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.01687717, + "balance_loss_mlp": 1.04577661, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.1438137502119425, + "language_loss": 0.76064527, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78230006, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 3846, + "time_per_iteration": 2.4985382556915283 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.02638626, + "balance_loss_mlp": 1.04725194, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.4609763976845556, + "language_loss": 0.69493651, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71677887, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3847, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.01048129, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.00074661, + "balance_loss_mlp": 1.01598144, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7927409416341922, + "language_loss": 0.61051595, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63102281, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3203125, + "step": 3848, + "time_per_iteration": 2.981518030166626 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.04593349, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.568019101440284, + "language_loss": 0.7746805, + "learning_rate": 3.589320871234923e-06, + "loss": 0.79651785, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3849, + "time_per_iteration": 2.450693130493164 + }, + { + "auxiliary_loss_clip": 0.01139635, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04533124, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.9223002445017061, + "language_loss": 0.71673942, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73856068, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3850, + "time_per_iteration": 2.589395761489868 + }, + { + "auxiliary_loss_clip": 0.01137166, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.01870215, + "balance_loss_mlp": 1.04362154, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 3.8422038584857665, + "language_loss": 0.75846308, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78018856, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3851, + "time_per_iteration": 2.495729446411133 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.0234046, + "balance_loss_mlp": 1.04747272, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.914141324585442, + "language_loss": 0.69797802, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71977121, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3852, + "time_per_iteration": 2.478408098220825 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.0206399, + "balance_loss_mlp": 1.04643583, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.1861380100726144, + "language_loss": 0.67030561, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69208378, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94140625, + "step": 3853, + "time_per_iteration": 2.4445838928222656 + }, + { + "auxiliary_loss_clip": 0.01141194, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.01815248, + "balance_loss_mlp": 1.04680121, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.6671703506367506, + "language_loss": 0.79851103, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82027847, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3854, + "time_per_iteration": 2.5455782413482666 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04726899, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 3.8560715318244556, + "language_loss": 0.64987147, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67176461, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 3855, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01140829, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.02753139, + "balance_loss_mlp": 1.04570055, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.1096123404526623, + "language_loss": 0.70711654, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.72896564, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3856, + "time_per_iteration": 2.5024092197418213 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02062488, + "balance_loss_mlp": 1.0464257, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 6.089384897844753, + "language_loss": 0.76997125, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79170084, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.91015625, + "step": 3857, + "time_per_iteration": 2.5962576866149902 + }, + { + "auxiliary_loss_clip": 0.01143962, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.03018308, + "balance_loss_mlp": 1.0477798, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 3.478057752262005, + "language_loss": 0.91006696, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93199098, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.96484375, + "step": 3858, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01136894, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.04679012, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 2.1437168922033747, + "language_loss": 0.75995493, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78175128, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 3859, + "time_per_iteration": 2.485426187515259 + }, + { + "auxiliary_loss_clip": 0.01136619, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.01962614, + "balance_loss_mlp": 1.04423487, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.9055462071213993, + "language_loss": 0.84061682, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86234951, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3860, + "time_per_iteration": 2.4607324600219727 + }, + { + "auxiliary_loss_clip": 0.01137991, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.02317619, + "balance_loss_mlp": 1.04656291, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.1337823805291047, + "language_loss": 0.82972974, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85151279, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3861, + "time_per_iteration": 2.451805591583252 + }, + { + "auxiliary_loss_clip": 0.01142125, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.02702415, + "balance_loss_mlp": 1.04800034, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.8456518711772996, + "language_loss": 0.85918242, + "learning_rate": 3.586242265438576e-06, + "loss": 0.8810426, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3862, + "time_per_iteration": 2.4582395553588867 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.0468179, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.3833481647146872, + "language_loss": 0.7492758, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7710824, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8984375, + "step": 3863, + "time_per_iteration": 2.496985912322998 + }, + { + "auxiliary_loss_clip": 0.01139904, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.02723408, + "balance_loss_mlp": 1.05037498, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 2.003739732436234, + "language_loss": 0.74640852, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76823521, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 3864, + "time_per_iteration": 2.440204381942749 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.01952517, + "balance_loss_mlp": 1.0468204, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 3.940820538439298, + "language_loss": 0.70690906, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72865754, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3865, + "time_per_iteration": 2.598194122314453 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01049623, + "balance_loss_clip": 1.03091884, + "balance_loss_mlp": 1.04987264, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.9658537667403149, + "language_loss": 0.94853866, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97052193, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3866, + "time_per_iteration": 2.496276617050171 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.04817796, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.6667540210019123, + "language_loss": 0.72528732, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.74707949, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 3867, + "time_per_iteration": 2.4933414459228516 + }, + { + "auxiliary_loss_clip": 0.01140693, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.02625418, + "balance_loss_mlp": 1.04734945, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8421111702540602, + "language_loss": 0.82411921, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84596509, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.93359375, + "step": 3868, + "time_per_iteration": 2.4994540214538574 + }, + { + "auxiliary_loss_clip": 0.01136829, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.02433491, + "balance_loss_mlp": 1.04700828, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.815886356300666, + "language_loss": 0.73335075, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75512362, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3869, + "time_per_iteration": 2.4486095905303955 + }, + { + "auxiliary_loss_clip": 0.01139645, + "auxiliary_loss_mlp": 0.01047185, + "balance_loss_clip": 1.03108525, + "balance_loss_mlp": 1.04929376, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.6948965109205438, + "language_loss": 0.79564929, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81751764, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3870, + "time_per_iteration": 2.506114959716797 + }, + { + "auxiliary_loss_clip": 0.01143066, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.02574801, + "balance_loss_mlp": 1.04845953, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 3.2368167151878797, + "language_loss": 0.70599115, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72785389, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3871, + "time_per_iteration": 2.455266237258911 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.01055983, + "balance_loss_clip": 1.03674293, + "balance_loss_mlp": 1.05011845, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.2694181422477313, + "language_loss": 0.69087327, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71289003, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.95703125, + "step": 3872, + "time_per_iteration": 2.482089042663574 + }, + { + "auxiliary_loss_clip": 0.01147162, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.0271188, + "balance_loss_mlp": 1.04984593, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.4965805681858408, + "language_loss": 0.78046703, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80239034, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 3873, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.01009657, + "balance_loss_clip": 1.00777328, + "balance_loss_mlp": 1.02347898, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.841863213022928, + "language_loss": 0.60519493, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6258297, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3046875, + "step": 3874, + "time_per_iteration": 2.955524444580078 + }, + { + "auxiliary_loss_clip": 0.01142096, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.02695727, + "balance_loss_mlp": 1.04998708, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.0817330720741287, + "language_loss": 0.8082279, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83009154, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3875, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01141065, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.04931068, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.6586054731564495, + "language_loss": 0.60997009, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63178027, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.91796875, + "step": 3876, + "time_per_iteration": 2.5234174728393555 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.0104992, + "balance_loss_clip": 1.0319072, + "balance_loss_mlp": 1.05151403, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.9912662806979935, + "language_loss": 0.70357525, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72551912, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3877, + "time_per_iteration": 2.5117876529693604 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_clip": 1.02984059, + "balance_loss_mlp": 1.04846656, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.20617127152986, + "language_loss": 0.81169856, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83360565, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3878, + "time_per_iteration": 2.418745517730713 + }, + { + "auxiliary_loss_clip": 0.01145943, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.04905999, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 2.449565501872003, + "language_loss": 0.74765849, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76959032, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3879, + "time_per_iteration": 2.627453088760376 + }, + { + "auxiliary_loss_clip": 0.0113984, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02597582, + "balance_loss_mlp": 1.04611635, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.3281305870509685, + "language_loss": 0.89896512, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92079782, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3880, + "time_per_iteration": 2.529181957244873 + }, + { + "auxiliary_loss_clip": 0.01144521, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05019975, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.7300006336865508, + "language_loss": 0.72026277, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74217331, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3881, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.01138796, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02798867, + "balance_loss_mlp": 1.04610527, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5765664683306326, + "language_loss": 0.67988127, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70171714, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3882, + "time_per_iteration": 2.5134541988372803 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04020119, + "balance_loss_mlp": 1.0481658, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 3.2831975264627116, + "language_loss": 0.76596051, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78797704, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3883, + "time_per_iteration": 2.5556836128234863 + }, + { + "auxiliary_loss_clip": 0.01046918, + "auxiliary_loss_mlp": 0.01002528, + "balance_loss_clip": 1.00059688, + "balance_loss_mlp": 1.01619315, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7953130928556094, + "language_loss": 0.59102494, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6115194, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3884, + "time_per_iteration": 3.210090398788452 + }, + { + "auxiliary_loss_clip": 0.01139917, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.0196687, + "balance_loss_mlp": 1.04723644, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 3.4795297654408617, + "language_loss": 0.80128157, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82303953, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3885, + "time_per_iteration": 4.129857301712036 + }, + { + "auxiliary_loss_clip": 0.01139579, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04763317, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.392049069504846, + "language_loss": 0.88482237, + "learning_rate": 3.580531993380261e-06, + "loss": 0.9066177, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3886, + "time_per_iteration": 4.002579689025879 + }, + { + "auxiliary_loss_clip": 0.01143892, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02452922, + "balance_loss_mlp": 1.04953825, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 2.2740188667520815, + "language_loss": 0.73199034, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75384426, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3887, + "time_per_iteration": 2.5730721950531006 + }, + { + "auxiliary_loss_clip": 0.0114256, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02709508, + "balance_loss_mlp": 1.04827881, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8689872769958875, + "language_loss": 0.84098816, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86285174, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.94140625, + "step": 3888, + "time_per_iteration": 2.526090145111084 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01051088, + "balance_loss_clip": 1.03400528, + "balance_loss_mlp": 1.04775357, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 5.34722340994348, + "language_loss": 0.87174153, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89365447, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3889, + "time_per_iteration": 2.465535879135132 + }, + { + "auxiliary_loss_clip": 0.01143335, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.02659607, + "balance_loss_mlp": 1.04914057, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 4.26980733686294, + "language_loss": 0.7660414, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78790414, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.94140625, + "step": 3890, + "time_per_iteration": 2.4164645671844482 + }, + { + "auxiliary_loss_clip": 0.01144006, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.02940536, + "balance_loss_mlp": 1.05018783, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 3.12388753004446, + "language_loss": 0.73396742, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75587177, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3891, + "time_per_iteration": 2.692251443862915 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04672241, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.6638493558493535, + "language_loss": 0.82791233, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84968084, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8984375, + "step": 3892, + "time_per_iteration": 2.4657654762268066 + }, + { + "auxiliary_loss_clip": 0.01143467, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_clip": 1.02550626, + "balance_loss_mlp": 1.04892194, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 2.124834647136637, + "language_loss": 0.64928782, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67114866, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3893, + "time_per_iteration": 2.6640076637268066 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02056575, + "balance_loss_mlp": 1.04930127, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.3013698222001753, + "language_loss": 0.79011095, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81188488, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 3894, + "time_per_iteration": 2.4596238136291504 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02772284, + "balance_loss_mlp": 1.0473485, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4729608662155413, + "language_loss": 0.81608742, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83793032, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3895, + "time_per_iteration": 2.5229499340057373 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.03210139, + "balance_loss_mlp": 1.04895353, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.370345363223057, + "language_loss": 0.79861861, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82052004, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3896, + "time_per_iteration": 2.4219553470611572 + }, + { + "auxiliary_loss_clip": 0.01142956, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02507555, + "balance_loss_mlp": 1.04863656, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6083647422684384, + "language_loss": 0.83279634, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85465348, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3897, + "time_per_iteration": 2.497347593307495 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.03093636, + "balance_loss_mlp": 1.04880857, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 2.0551194275294784, + "language_loss": 0.79281437, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8147409, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3898, + "time_per_iteration": 2.4275295734405518 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.0316844, + "balance_loss_mlp": 1.05034626, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 3.329769754331659, + "language_loss": 0.73955798, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76142585, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 3899, + "time_per_iteration": 2.5017077922821045 + }, + { + "auxiliary_loss_clip": 0.01141437, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.03412604, + "balance_loss_mlp": 1.04896975, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.8374782290855665, + "language_loss": 0.75695914, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77888358, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3900, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.03224266, + "balance_loss_mlp": 1.04685295, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.9641187800197561, + "language_loss": 0.66949147, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69135845, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3901, + "time_per_iteration": 2.5052907466888428 + }, + { + "auxiliary_loss_clip": 0.01050259, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01646185, + "balance_loss_mlp": 1.01950026, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7670843237762338, + "language_loss": 0.58209252, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6027782, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.30859375, + "step": 3902, + "time_per_iteration": 3.0522701740264893 + }, + { + "auxiliary_loss_clip": 0.01140756, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.02684176, + "balance_loss_mlp": 1.04932666, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.9913375770157136, + "language_loss": 0.80411339, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82596278, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 3903, + "time_per_iteration": 2.515796184539795 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.02274299, + "balance_loss_mlp": 1.04670942, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 3.712536549247666, + "language_loss": 0.82183945, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84362817, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3904, + "time_per_iteration": 2.48119854927063 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.0104346, + "balance_loss_clip": 1.02642536, + "balance_loss_mlp": 1.05013537, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9990680719867946, + "language_loss": 0.7137326, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7355758, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3905, + "time_per_iteration": 2.494558811187744 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.04811251, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.865256832649412, + "language_loss": 0.70834756, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73007655, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3906, + "time_per_iteration": 2.5057764053344727 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.04728532, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.129912307166789, + "language_loss": 0.73542202, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75724012, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3907, + "time_per_iteration": 2.5734074115753174 + }, + { + "auxiliary_loss_clip": 0.01141507, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.03004074, + "balance_loss_mlp": 1.04927719, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.7646530569469054, + "language_loss": 0.72807813, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74996883, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3908, + "time_per_iteration": 2.438422441482544 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01002133, + "balance_loss_clip": 1.00030959, + "balance_loss_mlp": 1.01835775, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0194055540826834, + "language_loss": 0.73271406, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75322783, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.30859375, + "step": 3909, + "time_per_iteration": 2.8451788425445557 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02607155, + "balance_loss_mlp": 1.04842734, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5487453833335116, + "language_loss": 0.87906706, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9008913, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3910, + "time_per_iteration": 2.4648385047912598 + }, + { + "auxiliary_loss_clip": 0.01141916, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02370465, + "balance_loss_mlp": 1.04950166, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.1910966534760297, + "language_loss": 0.75809109, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.7799111, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3911, + "time_per_iteration": 2.4715898036956787 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02729297, + "balance_loss_mlp": 1.04807627, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.9083148186883727, + "language_loss": 0.81775904, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83955097, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 3912, + "time_per_iteration": 2.4627628326416016 + }, + { + "auxiliary_loss_clip": 0.01139949, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04939759, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.7554989092460516, + "language_loss": 0.71664345, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73854995, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 3913, + "time_per_iteration": 2.5080020427703857 + }, + { + "auxiliary_loss_clip": 0.01147528, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.02691996, + "balance_loss_mlp": 1.05220175, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7040704955860875, + "language_loss": 0.75903499, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78096056, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3914, + "time_per_iteration": 2.487429618835449 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02307451, + "balance_loss_mlp": 1.05093837, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.554647654086476, + "language_loss": 0.89353001, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9153496, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 3915, + "time_per_iteration": 2.500753402709961 + }, + { + "auxiliary_loss_clip": 0.01044736, + "auxiliary_loss_mlp": 0.01003661, + "balance_loss_clip": 1.00158656, + "balance_loss_mlp": 1.0141747, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8049654288159457, + "language_loss": 0.5935356, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61401957, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.3046875, + "step": 3916, + "time_per_iteration": 2.9926259517669678 + }, + { + "auxiliary_loss_clip": 0.01042644, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00075865, + "balance_loss_mlp": 1.01226258, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7742950949727582, + "language_loss": 0.49486533, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51532036, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.3046875, + "step": 3917, + "time_per_iteration": 3.085294723510742 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.03615093, + "balance_loss_mlp": 1.04923129, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.000752484300541, + "language_loss": 0.76012552, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78207517, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 3918, + "time_per_iteration": 2.4883201122283936 + }, + { + "auxiliary_loss_clip": 0.01145359, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.02805305, + "balance_loss_mlp": 1.04997587, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.209135495431813, + "language_loss": 0.68728662, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.709185, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 3919, + "time_per_iteration": 2.4489476680755615 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.02216101, + "balance_loss_mlp": 1.04864836, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.8210843900818243, + "language_loss": 0.70324695, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72501087, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 3920, + "time_per_iteration": 2.6011908054351807 + }, + { + "auxiliary_loss_clip": 0.01141332, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.03128195, + "balance_loss_mlp": 1.05122209, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6333300745229378, + "language_loss": 0.77596343, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79784632, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3921, + "time_per_iteration": 2.498924732208252 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.02615058, + "balance_loss_mlp": 1.05108023, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.5438781918161375, + "language_loss": 0.7561245, + "learning_rate": 3.571901895946612e-06, + "loss": 0.7779727, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3922, + "time_per_iteration": 2.467103958129883 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02583599, + "balance_loss_mlp": 1.0489881, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.3317912313524625, + "language_loss": 0.80016744, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82196772, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3923, + "time_per_iteration": 2.5075273513793945 + }, + { + "auxiliary_loss_clip": 0.01138213, + "auxiliary_loss_mlp": 0.01046068, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 1.04845715, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.9692150152538963, + "language_loss": 0.74753797, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76938081, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3924, + "time_per_iteration": 2.442448377609253 + }, + { + "auxiliary_loss_clip": 0.01141205, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.03013766, + "balance_loss_mlp": 1.04995513, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 2.1681544357284093, + "language_loss": 0.82770467, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84957814, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3925, + "time_per_iteration": 2.44718337059021 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01047882, + "balance_loss_clip": 1.03100252, + "balance_loss_mlp": 1.04645014, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.8844556004317345, + "language_loss": 0.59408414, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61594486, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91796875, + "step": 3926, + "time_per_iteration": 2.4840757846832275 + }, + { + "auxiliary_loss_clip": 0.01135063, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.02977526, + "balance_loss_mlp": 1.04721665, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.967091588265342, + "language_loss": 0.71317631, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73498082, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 3927, + "time_per_iteration": 4.117234945297241 + }, + { + "auxiliary_loss_clip": 0.01137568, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_clip": 1.0295651, + "balance_loss_mlp": 1.04787612, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8263460078369782, + "language_loss": 0.75102496, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77284467, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8984375, + "step": 3928, + "time_per_iteration": 3.9637200832366943 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01048389, + "balance_loss_clip": 1.03086567, + "balance_loss_mlp": 1.04693556, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.885999758146942, + "language_loss": 0.81520462, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83706343, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3929, + "time_per_iteration": 2.499310255050659 + }, + { + "auxiliary_loss_clip": 0.01146116, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.02998328, + "balance_loss_mlp": 1.04974854, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 4.669381706210694, + "language_loss": 0.7194528, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74139249, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3930, + "time_per_iteration": 2.4964945316314697 + }, + { + "auxiliary_loss_clip": 0.01137432, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.02245224, + "balance_loss_mlp": 1.046561, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.489267518834959, + "language_loss": 0.73764896, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7594136, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3931, + "time_per_iteration": 2.6283528804779053 + }, + { + "auxiliary_loss_clip": 0.01140852, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.02245522, + "balance_loss_mlp": 1.04971111, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.06419219579993, + "language_loss": 0.8026945, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82450092, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3932, + "time_per_iteration": 2.4901018142700195 + }, + { + "auxiliary_loss_clip": 0.01138855, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02483916, + "balance_loss_mlp": 1.05032694, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5491195596348342, + "language_loss": 0.85760093, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87938541, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8828125, + "step": 3933, + "time_per_iteration": 2.5625483989715576 + }, + { + "auxiliary_loss_clip": 0.01146232, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.02008784, + "balance_loss_mlp": 1.0532943, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0322099534023685, + "language_loss": 0.8277775, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84961879, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3934, + "time_per_iteration": 2.512068748474121 + }, + { + "auxiliary_loss_clip": 0.01141394, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.04977798, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.774494675769988, + "language_loss": 0.7864846, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80827636, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 3935, + "time_per_iteration": 2.4996352195739746 + }, + { + "auxiliary_loss_clip": 0.01138141, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.01913905, + "balance_loss_mlp": 1.04973102, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7164724890649055, + "language_loss": 0.79656923, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81830108, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3936, + "time_per_iteration": 2.4868710041046143 + }, + { + "auxiliary_loss_clip": 0.01138439, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.0192436, + "balance_loss_mlp": 1.04798818, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4334555797897097, + "language_loss": 0.78783411, + "learning_rate": 3.568283198083826e-06, + "loss": 0.80958092, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3937, + "time_per_iteration": 2.499565362930298 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02244997, + "balance_loss_mlp": 1.04970455, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.078138882715826, + "language_loss": 0.85105085, + "learning_rate": 3.568041475462147e-06, + "loss": 0.8727901, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 3938, + "time_per_iteration": 2.449214220046997 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.0285933, + "balance_loss_mlp": 1.04824734, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.4851234695326423, + "language_loss": 0.93872499, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96052349, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3939, + "time_per_iteration": 2.415891647338867 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02598429, + "balance_loss_mlp": 1.04769599, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6764835140151866, + "language_loss": 0.8238095, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84565216, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3940, + "time_per_iteration": 2.47468900680542 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01042824, + "balance_loss_clip": 1.02592003, + "balance_loss_mlp": 1.04990602, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2107440191497054, + "language_loss": 0.88986713, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91174555, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3941, + "time_per_iteration": 2.455631971359253 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.02580976, + "balance_loss_mlp": 1.04538155, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.1526885300024072, + "language_loss": 0.84676927, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86856836, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3942, + "time_per_iteration": 2.43743634223938 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.04840159, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8547641010298248, + "language_loss": 0.80905575, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83091086, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.921875, + "step": 3943, + "time_per_iteration": 2.5058658123016357 + }, + { + "auxiliary_loss_clip": 0.01143585, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.02084267, + "balance_loss_mlp": 1.04731488, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.308079684052438, + "language_loss": 0.67493033, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69675827, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3944, + "time_per_iteration": 2.4276273250579834 + }, + { + "auxiliary_loss_clip": 0.01144217, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.02116549, + "balance_loss_mlp": 1.05084419, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 2.061169456768298, + "language_loss": 0.75421506, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77604151, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3945, + "time_per_iteration": 2.474323272705078 + }, + { + "auxiliary_loss_clip": 0.01137318, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02304149, + "balance_loss_mlp": 1.0469377, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.6081639136691026, + "language_loss": 0.63469779, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65646303, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3946, + "time_per_iteration": 2.5087931156158447 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.02186346, + "balance_loss_mlp": 1.04692435, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.27613511663784, + "language_loss": 0.77508283, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79684764, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 3947, + "time_per_iteration": 2.4716949462890625 + }, + { + "auxiliary_loss_clip": 0.01141281, + "auxiliary_loss_mlp": 0.0104192, + "balance_loss_clip": 1.02496827, + "balance_loss_mlp": 1.05008841, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.6255497375782806, + "language_loss": 0.80575311, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8275851, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3948, + "time_per_iteration": 2.5750784873962402 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.02715611, + "balance_loss_mlp": 1.04736018, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.0638215262656696, + "language_loss": 0.80578661, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82761467, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3949, + "time_per_iteration": 2.512665271759033 + }, + { + "auxiliary_loss_clip": 0.01138607, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.02382135, + "balance_loss_mlp": 1.04584646, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.8976071400358168, + "language_loss": 0.73124689, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75303924, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3950, + "time_per_iteration": 2.4842302799224854 + }, + { + "auxiliary_loss_clip": 0.01135058, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.01944709, + "balance_loss_mlp": 1.04712903, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.0688047231241247, + "language_loss": 0.73064256, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75233537, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3951, + "time_per_iteration": 2.5215439796447754 + }, + { + "auxiliary_loss_clip": 0.01144126, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.0507673, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7591828710207016, + "language_loss": 0.73658371, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75842535, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3952, + "time_per_iteration": 2.550182342529297 + }, + { + "auxiliary_loss_clip": 0.0113686, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.02213275, + "balance_loss_mlp": 1.04537988, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.6791264380286672, + "language_loss": 0.71064484, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73239112, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9140625, + "step": 3953, + "time_per_iteration": 2.530381202697754 + }, + { + "auxiliary_loss_clip": 0.01140701, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.02522552, + "balance_loss_mlp": 1.04806364, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9696108021357461, + "language_loss": 0.81686246, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83869451, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3954, + "time_per_iteration": 2.491629123687744 + }, + { + "auxiliary_loss_clip": 0.01141999, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_clip": 1.02915251, + "balance_loss_mlp": 1.04870319, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.155968963382196, + "language_loss": 0.65756261, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.67945445, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3955, + "time_per_iteration": 2.4659719467163086 + }, + { + "auxiliary_loss_clip": 0.01138242, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.04739583, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.3846492045019327, + "language_loss": 0.83788121, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85979581, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3956, + "time_per_iteration": 2.48734712600708 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.02471578, + "balance_loss_mlp": 1.04647636, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.1805686912335656, + "language_loss": 0.85228634, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.8740322, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3957, + "time_per_iteration": 2.50199294090271 + }, + { + "auxiliary_loss_clip": 0.01139099, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.02686596, + "balance_loss_mlp": 1.04807806, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0218180107915757, + "language_loss": 0.70133704, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72314632, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.91015625, + "step": 3958, + "time_per_iteration": 2.4798173904418945 + }, + { + "auxiliary_loss_clip": 0.01136893, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.02530742, + "balance_loss_mlp": 1.04581285, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 3.373562251556634, + "language_loss": 0.65834582, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68014508, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 3959, + "time_per_iteration": 2.4558637142181396 + }, + { + "auxiliary_loss_clip": 0.01138452, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.02565885, + "balance_loss_mlp": 1.04832602, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7230243338870097, + "language_loss": 0.72128749, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74308968, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3960, + "time_per_iteration": 2.4831748008728027 + }, + { + "auxiliary_loss_clip": 0.01139565, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.0249052, + "balance_loss_mlp": 1.04867244, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.8711627571775973, + "language_loss": 0.74181205, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7636202, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.91015625, + "step": 3961, + "time_per_iteration": 2.5167927742004395 + }, + { + "auxiliary_loss_clip": 0.01138898, + "auxiliary_loss_mlp": 0.0105126, + "balance_loss_clip": 1.03318763, + "balance_loss_mlp": 1.04605162, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.643011810367893, + "language_loss": 0.66067994, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68258154, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 3962, + "time_per_iteration": 2.4900338649749756 + }, + { + "auxiliary_loss_clip": 0.01138484, + "auxiliary_loss_mlp": 0.01050468, + "balance_loss_clip": 1.03387976, + "balance_loss_mlp": 1.04738379, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.7740384877146562, + "language_loss": 0.74581182, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76770139, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3963, + "time_per_iteration": 2.5409018993377686 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.02899039, + "balance_loss_mlp": 1.0498383, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.0190521185084753, + "language_loss": 0.76898873, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79087293, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3964, + "time_per_iteration": 2.492861270904541 + }, + { + "auxiliary_loss_clip": 0.01137102, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.02911341, + "balance_loss_mlp": 1.04792333, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.0459212281672956, + "language_loss": 0.71593058, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73775077, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 3965, + "time_per_iteration": 2.5120911598205566 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.03158259, + "balance_loss_mlp": 1.04674065, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.8902557347099018, + "language_loss": 0.78008091, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80190015, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3966, + "time_per_iteration": 2.4576594829559326 + }, + { + "auxiliary_loss_clip": 0.01135801, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.02656794, + "balance_loss_mlp": 1.04652119, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.8460709531404, + "language_loss": 0.68860286, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71038377, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.89453125, + "step": 3967, + "time_per_iteration": 2.484840154647827 + }, + { + "auxiliary_loss_clip": 0.01137019, + "auxiliary_loss_mlp": 0.01053581, + "balance_loss_clip": 1.03739274, + "balance_loss_mlp": 1.04645443, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 2.11266161128335, + "language_loss": 0.67849773, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70040375, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3968, + "time_per_iteration": 2.441445827484131 + }, + { + "auxiliary_loss_clip": 0.01134651, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.02318573, + "balance_loss_mlp": 1.0451827, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.8948052650888014, + "language_loss": 0.76742399, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78916001, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.89453125, + "step": 3969, + "time_per_iteration": 5.413191318511963 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.04734492, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.7243772241637263, + "language_loss": 0.76300085, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78475308, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3970, + "time_per_iteration": 2.4792258739471436 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02545094, + "balance_loss_mlp": 1.04645324, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 3.3207921386663584, + "language_loss": 0.85399735, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87580258, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3971, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01017218, + "balance_loss_clip": 1.01547742, + "balance_loss_mlp": 1.02590835, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7461637295582213, + "language_loss": 0.62814003, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64887029, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.29882812, + "step": 3972, + "time_per_iteration": 3.173640012741089 + }, + { + "auxiliary_loss_clip": 0.0113938, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.02119696, + "balance_loss_mlp": 1.04922092, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.9456864585596687, + "language_loss": 0.8170895, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.8388539, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90234375, + "step": 3973, + "time_per_iteration": 2.4529452323913574 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.02849591, + "balance_loss_mlp": 1.04869485, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.6994626560625323, + "language_loss": 0.79299271, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81481481, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 3974, + "time_per_iteration": 2.5395772457122803 + }, + { + "auxiliary_loss_clip": 0.01139215, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.04858148, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.8925619228877844, + "language_loss": 0.84428573, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86606121, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 3975, + "time_per_iteration": 2.430361032485962 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.02215612, + "balance_loss_mlp": 1.0471369, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.06825719132721, + "language_loss": 0.8375293, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85925817, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87890625, + "step": 3976, + "time_per_iteration": 2.480534791946411 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01798213, + "balance_loss_mlp": 1.04606938, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.747752931490835, + "language_loss": 0.74532628, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76697731, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8671875, + "step": 3977, + "time_per_iteration": 2.4641239643096924 + }, + { + "auxiliary_loss_clip": 0.01138905, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.02818775, + "balance_loss_mlp": 1.04930067, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6638092474338306, + "language_loss": 0.72395146, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74579227, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 3978, + "time_per_iteration": 2.5007903575897217 + }, + { + "auxiliary_loss_clip": 0.01143288, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.05204654, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.0169903221822683, + "language_loss": 0.78654587, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80840027, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3979, + "time_per_iteration": 2.5006349086761475 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.02769148, + "balance_loss_mlp": 1.04762173, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6987462202935262, + "language_loss": 0.81945407, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84125668, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 3980, + "time_per_iteration": 2.5287020206451416 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.01974702, + "balance_loss_mlp": 1.04967999, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.749461413213386, + "language_loss": 0.8401112, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86183953, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 3981, + "time_per_iteration": 2.466660261154175 + }, + { + "auxiliary_loss_clip": 0.01141626, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04951072, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 3.6241006318049864, + "language_loss": 0.76872683, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79059052, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 3982, + "time_per_iteration": 2.558145046234131 + }, + { + "auxiliary_loss_clip": 0.01135351, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.02811027, + "balance_loss_mlp": 1.04844236, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 6.059829142106342, + "language_loss": 0.77878481, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80057788, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 3983, + "time_per_iteration": 2.4443132877349854 + }, + { + "auxiliary_loss_clip": 0.01136897, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.02446592, + "balance_loss_mlp": 1.04759789, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9981470653963032, + "language_loss": 0.73163629, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75341582, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3984, + "time_per_iteration": 2.491344690322876 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.02860713, + "balance_loss_mlp": 1.04674625, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.064924146489818, + "language_loss": 0.79049474, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81232572, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3985, + "time_per_iteration": 2.4587738513946533 + }, + { + "auxiliary_loss_clip": 0.01139616, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04980683, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.0182764415160563, + "language_loss": 0.73312742, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7549386, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 3986, + "time_per_iteration": 2.5608811378479004 + }, + { + "auxiliary_loss_clip": 0.0114189, + "auxiliary_loss_mlp": 0.01051013, + "balance_loss_clip": 1.03495562, + "balance_loss_mlp": 1.04923773, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.2624046500679333, + "language_loss": 0.87836051, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90028954, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.92578125, + "step": 3987, + "time_per_iteration": 2.461778402328491 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.02161288, + "balance_loss_mlp": 1.04831004, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.3750633167266306, + "language_loss": 0.8308624, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85254467, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 3988, + "time_per_iteration": 2.4527788162231445 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.04686844, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.649806875732991, + "language_loss": 0.85145879, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87320346, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 3989, + "time_per_iteration": 2.43949031829834 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02249837, + "balance_loss_mlp": 1.04763699, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.0784071273800944, + "language_loss": 0.84493041, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86665809, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 3990, + "time_per_iteration": 2.4476051330566406 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.02507186, + "balance_loss_mlp": 1.0463922, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 3.585202907729512, + "language_loss": 0.75312221, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77485824, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 3991, + "time_per_iteration": 2.502324104309082 + }, + { + "auxiliary_loss_clip": 0.01050073, + "auxiliary_loss_mlp": 0.01009423, + "balance_loss_clip": 1.00774217, + "balance_loss_mlp": 1.02049088, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8894590829003932, + "language_loss": 0.63734841, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65794337, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.296875, + "step": 3992, + "time_per_iteration": 3.0017786026000977 + }, + { + "auxiliary_loss_clip": 0.01050397, + "auxiliary_loss_mlp": 0.01010168, + "balance_loss_clip": 1.00857067, + "balance_loss_mlp": 1.02071452, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7530514643625366, + "language_loss": 0.62963343, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65023899, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.296875, + "step": 3993, + "time_per_iteration": 3.176184892654419 + }, + { + "auxiliary_loss_clip": 0.01140668, + "auxiliary_loss_mlp": 0.01047015, + "balance_loss_clip": 1.03085065, + "balance_loss_mlp": 1.05099177, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.6383486345725178, + "language_loss": 0.76938868, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79126549, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3994, + "time_per_iteration": 2.4940826892852783 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.0305258, + "balance_loss_mlp": 1.04680216, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.7751147523393542, + "language_loss": 0.78457522, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80641341, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.890625, + "step": 3995, + "time_per_iteration": 2.5075032711029053 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01003894, + "balance_loss_clip": 1.00232053, + "balance_loss_mlp": 1.01837659, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8913570860108078, + "language_loss": 0.63479292, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65530908, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.29296875, + "step": 3996, + "time_per_iteration": 3.1365764141082764 + }, + { + "auxiliary_loss_clip": 0.01137569, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.02916384, + "balance_loss_mlp": 1.04678392, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.906997418482602, + "language_loss": 0.7009505, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72278345, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3997, + "time_per_iteration": 2.464714765548706 + }, + { + "auxiliary_loss_clip": 0.01134848, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.04642928, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.9831176119326495, + "language_loss": 0.87292743, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89470112, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3998, + "time_per_iteration": 2.4639480113983154 + }, + { + "auxiliary_loss_clip": 0.01134933, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02306032, + "balance_loss_mlp": 1.04208946, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.9745565965944727, + "language_loss": 0.75798607, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77972972, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3999, + "time_per_iteration": 2.4753127098083496 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02317488, + "balance_loss_mlp": 1.04545271, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.9306579449884984, + "language_loss": 0.72642016, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74812865, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.875, + "step": 4000, + "time_per_iteration": 2.5172412395477295 + }, + { + "auxiliary_loss_clip": 0.01140243, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02025795, + "balance_loss_mlp": 1.04728866, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.4587541869300824, + "language_loss": 0.65991902, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68169051, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4001, + "time_per_iteration": 2.511198043823242 + }, + { + "auxiliary_loss_clip": 0.01131233, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.02330589, + "balance_loss_mlp": 1.0427444, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.6796652593661903, + "language_loss": 0.82567388, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84739041, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4002, + "time_per_iteration": 2.5147531032562256 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.02382851, + "balance_loss_mlp": 1.04682446, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.8902513751119636, + "language_loss": 0.82875729, + "learning_rate": 3.552202383898897e-06, + "loss": 0.8505069, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4003, + "time_per_iteration": 2.508004665374756 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0214386, + "balance_loss_mlp": 1.04608846, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0497424292602835, + "language_loss": 0.87504768, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89677334, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4004, + "time_per_iteration": 2.4581985473632812 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.04228568, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9722136456468877, + "language_loss": 0.77630293, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79812533, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4005, + "time_per_iteration": 2.556365728378296 + }, + { + "auxiliary_loss_clip": 0.01130485, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02141535, + "balance_loss_mlp": 1.04398429, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.7295620858093623, + "language_loss": 0.78973985, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81141412, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4006, + "time_per_iteration": 2.460961103439331 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.02072108, + "balance_loss_mlp": 1.04375279, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 2.2017624810959346, + "language_loss": 0.71201313, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73377299, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 4007, + "time_per_iteration": 2.5169517993927 + }, + { + "auxiliary_loss_clip": 0.01131131, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.04453456, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.805972702734942, + "language_loss": 0.75857127, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7802788, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 4008, + "time_per_iteration": 2.4489922523498535 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01940203, + "balance_loss_mlp": 1.04296207, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.356516377050019, + "language_loss": 0.73922294, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76088601, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4009, + "time_per_iteration": 2.4701087474823 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.02664948, + "balance_loss_mlp": 1.04632092, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.636895821506206, + "language_loss": 0.79938453, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82113993, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4010, + "time_per_iteration": 3.9670608043670654 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01047202, + "balance_loss_clip": 1.02923679, + "balance_loss_mlp": 1.04108143, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.295886994366384, + "language_loss": 0.70799017, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72977829, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4011, + "time_per_iteration": 3.9544472694396973 + }, + { + "auxiliary_loss_clip": 0.01131445, + "auxiliary_loss_mlp": 0.01039733, + "balance_loss_clip": 1.02429593, + "balance_loss_mlp": 1.04258561, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6166610897431488, + "language_loss": 0.69062299, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71233475, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4012, + "time_per_iteration": 2.501347303390503 + }, + { + "auxiliary_loss_clip": 0.01133874, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02145839, + "balance_loss_mlp": 1.04454589, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.0861437601678303, + "language_loss": 0.73424822, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75598073, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4013, + "time_per_iteration": 2.6360883712768555 + }, + { + "auxiliary_loss_clip": 0.01133872, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.02110672, + "balance_loss_mlp": 1.04450822, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8416541794010313, + "language_loss": 0.88554955, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9072544, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4014, + "time_per_iteration": 2.4663264751434326 + }, + { + "auxiliary_loss_clip": 0.01137985, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.02643979, + "balance_loss_mlp": 1.04453659, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.755357499792604, + "language_loss": 0.94270647, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96452308, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 4015, + "time_per_iteration": 2.470952033996582 + }, + { + "auxiliary_loss_clip": 0.01133849, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.02174377, + "balance_loss_mlp": 1.04334664, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.8402084517778015, + "language_loss": 0.82513833, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84685838, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4016, + "time_per_iteration": 2.4922966957092285 + }, + { + "auxiliary_loss_clip": 0.01127395, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02423143, + "balance_loss_mlp": 1.04197156, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 3.4212830828584386, + "language_loss": 0.69553781, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71721268, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4017, + "time_per_iteration": 2.596977710723877 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02959788, + "balance_loss_mlp": 1.04421043, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 2.0038503347112084, + "language_loss": 0.85114455, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87296432, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 4018, + "time_per_iteration": 2.440749406814575 + }, + { + "auxiliary_loss_clip": 0.01046553, + "auxiliary_loss_mlp": 0.01012788, + "balance_loss_clip": 1.0109762, + "balance_loss_mlp": 1.01676679, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8182663934779763, + "language_loss": 0.60620981, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62680322, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.296875, + "step": 4019, + "time_per_iteration": 3.112665891647339 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.04433608, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.880182475838635, + "language_loss": 0.73690915, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75863391, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4020, + "time_per_iteration": 2.5049281120300293 + }, + { + "auxiliary_loss_clip": 0.01134711, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.02199471, + "balance_loss_mlp": 1.04660118, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.9671591580269927, + "language_loss": 0.82012737, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84185052, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4021, + "time_per_iteration": 2.464092493057251 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.03036344, + "balance_loss_mlp": 1.04551053, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.9434993168468309, + "language_loss": 0.76464498, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78650689, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.921875, + "step": 4022, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01140564, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.04610109, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.893594506248005, + "language_loss": 0.75172901, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77355558, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 4023, + "time_per_iteration": 2.442469358444214 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.0288136, + "balance_loss_mlp": 1.04636168, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.7406117596406352, + "language_loss": 0.81464303, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.83643848, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4024, + "time_per_iteration": 2.45035719871521 + }, + { + "auxiliary_loss_clip": 0.01134068, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.02951503, + "balance_loss_mlp": 1.0462923, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.8550338864746303, + "language_loss": 0.85851878, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88031757, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4025, + "time_per_iteration": 2.4191699028015137 + }, + { + "auxiliary_loss_clip": 0.01136643, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.03154194, + "balance_loss_mlp": 1.04397535, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.9498897834730646, + "language_loss": 0.71243072, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73428357, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 4026, + "time_per_iteration": 2.476792812347412 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.0259757, + "balance_loss_mlp": 1.04589748, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.8853181761927913, + "language_loss": 0.64215046, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66389644, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4027, + "time_per_iteration": 2.443652868270874 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.04601741, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.5479611354975007, + "language_loss": 0.70294374, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72468793, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.91015625, + "step": 4028, + "time_per_iteration": 2.48252534866333 + }, + { + "auxiliary_loss_clip": 0.01044866, + "auxiliary_loss_mlp": 0.01007457, + "balance_loss_clip": 1.00585961, + "balance_loss_mlp": 1.01464319, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8570499142131055, + "language_loss": 0.55407649, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57459968, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.30078125, + "step": 4029, + "time_per_iteration": 3.094402551651001 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.02567101, + "balance_loss_mlp": 1.04526591, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.019101437715354, + "language_loss": 0.73829788, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76008832, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90234375, + "step": 4030, + "time_per_iteration": 2.5176522731781006 + }, + { + "auxiliary_loss_clip": 0.01135714, + "auxiliary_loss_mlp": 0.01053146, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.04541922, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.062987020241499, + "language_loss": 0.76440287, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 4031, + "time_per_iteration": 2.4774179458618164 + }, + { + "auxiliary_loss_clip": 0.01140068, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_clip": 1.02974856, + "balance_loss_mlp": 1.0464952, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 7.078640241023749, + "language_loss": 0.65947008, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68133402, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 4032, + "time_per_iteration": 2.500488519668579 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.04175007, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 3.1167913511387995, + "language_loss": 0.81353086, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83530146, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4033, + "time_per_iteration": 2.434652805328369 + }, + { + "auxiliary_loss_clip": 0.0113019, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.02310205, + "balance_loss_mlp": 1.04302979, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.0372289343003023, + "language_loss": 0.69200158, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71369547, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4034, + "time_per_iteration": 2.583693027496338 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.01868999, + "balance_loss_mlp": 1.04278564, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.4913709616978554, + "language_loss": 0.95772272, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.97941571, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4035, + "time_per_iteration": 2.4757437705993652 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01047233, + "balance_loss_clip": 1.03220701, + "balance_loss_mlp": 1.04172754, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.0212510419571794, + "language_loss": 0.77875686, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80049908, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4036, + "time_per_iteration": 2.5642547607421875 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.01043471, + "balance_loss_clip": 1.02642441, + "balance_loss_mlp": 1.04447269, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.648393445666421, + "language_loss": 0.74427915, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76606166, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4037, + "time_per_iteration": 2.4529507160186768 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.02497733, + "balance_loss_mlp": 1.04398596, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.7681997598872656, + "language_loss": 0.76223898, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78399336, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4038, + "time_per_iteration": 2.4618003368377686 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_clip": 1.02742147, + "balance_loss_mlp": 1.04415751, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.0050890767905645, + "language_loss": 0.72632921, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74812889, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 4039, + "time_per_iteration": 2.4261560440063477 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.02631593, + "balance_loss_mlp": 1.04608393, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6810247735848671, + "language_loss": 0.78330719, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80509198, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4040, + "time_per_iteration": 2.4808037281036377 + }, + { + "auxiliary_loss_clip": 0.01128006, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.01912999, + "balance_loss_mlp": 1.04237986, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8145876332629047, + "language_loss": 0.80390251, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82552278, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4041, + "time_per_iteration": 2.482576847076416 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.02769041, + "balance_loss_mlp": 1.04653025, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 4.455498217071982, + "language_loss": 0.76670969, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78848314, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4042, + "time_per_iteration": 2.4944398403167725 + }, + { + "auxiliary_loss_clip": 0.01130826, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.01815128, + "balance_loss_mlp": 1.04393744, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7591863299055037, + "language_loss": 0.8139993, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83563864, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 4043, + "time_per_iteration": 2.4965035915374756 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.02053475, + "balance_loss_mlp": 1.04298007, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.448799092011911, + "language_loss": 0.73345625, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75519013, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 4044, + "time_per_iteration": 2.42809796333313 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.0252496, + "balance_loss_mlp": 1.04730773, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.780616714891853, + "language_loss": 0.83562207, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85740674, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4045, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.02944136, + "balance_loss_mlp": 1.04542089, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 2.1598753545738663, + "language_loss": 0.86787856, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88967973, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4046, + "time_per_iteration": 2.5126357078552246 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.02526259, + "balance_loss_mlp": 1.04252553, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.9885516182116696, + "language_loss": 0.7281425, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4047, + "time_per_iteration": 2.4886271953582764 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.02243662, + "balance_loss_mlp": 1.0435816, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.411807088840578, + "language_loss": 0.72845596, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75018883, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4048, + "time_per_iteration": 2.522012710571289 + }, + { + "auxiliary_loss_clip": 0.01132229, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.02720952, + "balance_loss_mlp": 1.04504991, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 4.923738678144707, + "language_loss": 0.72984087, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75158751, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.875, + "step": 4049, + "time_per_iteration": 2.4399380683898926 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01041944, + "balance_loss_clip": 1.02654243, + "balance_loss_mlp": 1.04297137, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 6.058583880667159, + "language_loss": 0.7388249, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.760535, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4050, + "time_per_iteration": 2.4589998722076416 + }, + { + "auxiliary_loss_clip": 0.01128476, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02925062, + "balance_loss_mlp": 1.04373455, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.083460080669968, + "language_loss": 0.74948591, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77121294, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4051, + "time_per_iteration": 2.4284183979034424 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02251768, + "balance_loss_mlp": 1.04273975, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.420510968298769, + "language_loss": 0.70638204, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72805327, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4052, + "time_per_iteration": 5.468756675720215 + }, + { + "auxiliary_loss_clip": 0.01131368, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.03215313, + "balance_loss_mlp": 1.04370522, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.61331134721481, + "language_loss": 0.81265736, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83445215, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.875, + "step": 4053, + "time_per_iteration": 2.5280394554138184 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01895714, + "balance_loss_mlp": 1.04522192, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5461481286352234, + "language_loss": 0.77842951, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80013186, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4054, + "time_per_iteration": 2.424604892730713 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.02990091, + "balance_loss_mlp": 1.04097724, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.616998838355979, + "language_loss": 0.83784473, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85957456, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4055, + "time_per_iteration": 2.4814612865448 + }, + { + "auxiliary_loss_clip": 0.0113426, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02194405, + "balance_loss_mlp": 1.04221749, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.407480313131798, + "language_loss": 0.55291057, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57463974, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 4056, + "time_per_iteration": 2.5356216430664062 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02626896, + "balance_loss_mlp": 1.04361272, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.24663888381965, + "language_loss": 0.79832959, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82009363, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4057, + "time_per_iteration": 2.4915707111358643 + }, + { + "auxiliary_loss_clip": 0.01128391, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.02195764, + "balance_loss_mlp": 1.04218984, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.7432058239394113, + "language_loss": 0.78817719, + "learning_rate": 3.538605738554673e-06, + "loss": 0.80983889, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4058, + "time_per_iteration": 2.426687002182007 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.02366126, + "balance_loss_mlp": 1.04273307, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.688831116872718, + "language_loss": 0.85133582, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 4059, + "time_per_iteration": 2.499464511871338 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.04288411, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6494662829711617, + "language_loss": 0.73770267, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.75933278, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4060, + "time_per_iteration": 2.4955050945281982 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.03262711, + "balance_loss_mlp": 1.04506934, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.8597953216817902, + "language_loss": 0.73587501, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75775993, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.92578125, + "step": 4061, + "time_per_iteration": 2.5002825260162354 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.02248669, + "balance_loss_mlp": 1.04437923, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6775055914479682, + "language_loss": 0.76006806, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78173012, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8515625, + "step": 4062, + "time_per_iteration": 2.478625535964966 + }, + { + "auxiliary_loss_clip": 0.01126984, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.04376316, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.7282475931571, + "language_loss": 0.85710216, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87872803, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4063, + "time_per_iteration": 2.5161943435668945 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.02603722, + "balance_loss_mlp": 1.04589176, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 6.32752237165424, + "language_loss": 0.68127096, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70305437, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4064, + "time_per_iteration": 2.4434523582458496 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02083397, + "balance_loss_mlp": 1.04318714, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5178524812834733, + "language_loss": 0.7003206, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72204536, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4065, + "time_per_iteration": 2.513827085494995 + }, + { + "auxiliary_loss_clip": 0.01136726, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.02960134, + "balance_loss_mlp": 1.04461718, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.0517728790430048, + "language_loss": 0.83912247, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86096847, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4066, + "time_per_iteration": 2.4601314067840576 + }, + { + "auxiliary_loss_clip": 0.01053849, + "auxiliary_loss_mlp": 0.01006665, + "balance_loss_clip": 1.00455475, + "balance_loss_mlp": 1.02389407, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7387464995159381, + "language_loss": 0.52291965, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54352474, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.29882812, + "step": 4067, + "time_per_iteration": 2.9973862171173096 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.0242008, + "balance_loss_mlp": 1.04483843, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.022186633601072, + "language_loss": 0.71927387, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74101913, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4068, + "time_per_iteration": 2.4484708309173584 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.02457666, + "balance_loss_mlp": 1.04505873, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.494083672668599, + "language_loss": 0.77513826, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79687262, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4069, + "time_per_iteration": 2.5724000930786133 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.03101087, + "balance_loss_mlp": 1.04646873, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.9755919994455295, + "language_loss": 0.80163878, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82344782, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4070, + "time_per_iteration": 2.4932186603546143 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.03018379, + "balance_loss_mlp": 1.04351497, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6271146290001441, + "language_loss": 0.8410303, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86279482, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.875, + "step": 4071, + "time_per_iteration": 2.5299296379089355 + }, + { + "auxiliary_loss_clip": 0.0113627, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.02795792, + "balance_loss_mlp": 1.04406631, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.720640728536457, + "language_loss": 0.79751229, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81932867, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4072, + "time_per_iteration": 2.470327854156494 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.02906084, + "balance_loss_mlp": 1.04437995, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.615929332251483, + "language_loss": 0.70322561, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7249524, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4073, + "time_per_iteration": 2.4951980113983154 + }, + { + "auxiliary_loss_clip": 0.01129351, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.02662683, + "balance_loss_mlp": 1.04456043, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.5968867848691133, + "language_loss": 0.67692697, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69863164, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4074, + "time_per_iteration": 2.4697325229644775 + }, + { + "auxiliary_loss_clip": 0.01052266, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00504053, + "balance_loss_mlp": 1.0222578, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.896032421619399, + "language_loss": 0.68665123, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70724261, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.30078125, + "step": 4075, + "time_per_iteration": 3.1993846893310547 + }, + { + "auxiliary_loss_clip": 0.01131428, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.02548659, + "balance_loss_mlp": 1.04603517, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.243483207404797, + "language_loss": 0.79306483, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81478369, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4076, + "time_per_iteration": 2.542245388031006 + }, + { + "auxiliary_loss_clip": 0.01134594, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.02749884, + "balance_loss_mlp": 1.04342794, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.0630196459837618, + "language_loss": 0.82211018, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84390688, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 4077, + "time_per_iteration": 2.5165140628814697 + }, + { + "auxiliary_loss_clip": 0.01132098, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02695799, + "balance_loss_mlp": 1.04380083, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 12.782264679420269, + "language_loss": 0.61930454, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64107114, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4078, + "time_per_iteration": 2.5202372074127197 + }, + { + "auxiliary_loss_clip": 0.01129452, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02035594, + "balance_loss_mlp": 1.04474652, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.7044874550491866, + "language_loss": 0.75514519, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77679932, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4079, + "time_per_iteration": 2.483339309692383 + }, + { + "auxiliary_loss_clip": 0.01129188, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.04370368, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8257477744529516, + "language_loss": 0.74925131, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77097261, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 4080, + "time_per_iteration": 2.4843389987945557 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.04129529, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 2.211780780293779, + "language_loss": 0.82807517, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84972572, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4081, + "time_per_iteration": 2.4753835201263428 + }, + { + "auxiliary_loss_clip": 0.01128982, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.04313576, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 4.1574914526272515, + "language_loss": 0.73153239, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75321424, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4082, + "time_per_iteration": 2.5975396633148193 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02837873, + "balance_loss_mlp": 1.04274178, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.543838453785988, + "language_loss": 0.71628594, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73798621, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84375, + "step": 4083, + "time_per_iteration": 2.471519947052002 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.02285206, + "balance_loss_mlp": 1.04234004, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.1941070650453094, + "language_loss": 0.74700832, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76872808, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4084, + "time_per_iteration": 2.4286506175994873 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01986289, + "balance_loss_mlp": 1.04189909, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.042106499003273, + "language_loss": 0.85206825, + "learning_rate": 3.531866337826471e-06, + "loss": 0.8736847, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4085, + "time_per_iteration": 2.4283318519592285 + }, + { + "auxiliary_loss_clip": 0.01130256, + "auxiliary_loss_mlp": 0.01048422, + "balance_loss_clip": 1.03209007, + "balance_loss_mlp": 1.04266381, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.8090063737063005, + "language_loss": 0.7876097, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.80939639, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4086, + "time_per_iteration": 2.478954792022705 + }, + { + "auxiliary_loss_clip": 0.01126651, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02669752, + "balance_loss_mlp": 1.04330873, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.6669278195562474, + "language_loss": 0.75269985, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77438211, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4087, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.01132319, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.02364135, + "balance_loss_mlp": 1.04574418, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.5789657141026, + "language_loss": 0.79284519, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81457937, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8671875, + "step": 4088, + "time_per_iteration": 2.479841709136963 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.0196631, + "balance_loss_mlp": 1.04091823, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6187757849670203, + "language_loss": 0.7736612, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79523408, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.828125, + "step": 4089, + "time_per_iteration": 2.483436346054077 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02440071, + "balance_loss_mlp": 1.04232669, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 3.8690522662716416, + "language_loss": 0.81463957, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83634108, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4090, + "time_per_iteration": 2.657944917678833 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03265369, + "balance_loss_mlp": 1.04411578, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.9398667366019489, + "language_loss": 0.72874928, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75057453, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.88671875, + "step": 4091, + "time_per_iteration": 2.448307991027832 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.02166772, + "balance_loss_mlp": 1.04811478, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9209724672120978, + "language_loss": 0.76486623, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78656, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4092, + "time_per_iteration": 2.510815143585205 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.04404068, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.544549098738024, + "language_loss": 0.80905128, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83075017, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4093, + "time_per_iteration": 2.4658117294311523 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.04285693, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.9793331271335382, + "language_loss": 0.87355959, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89532292, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4094, + "time_per_iteration": 5.436578035354614 + }, + { + "auxiliary_loss_clip": 0.01055645, + "auxiliary_loss_mlp": 0.01004731, + "balance_loss_clip": 1.00285995, + "balance_loss_mlp": 1.02449679, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7591937233735362, + "language_loss": 0.57501638, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59562016, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.3125, + "step": 4095, + "time_per_iteration": 3.1966967582702637 + }, + { + "auxiliary_loss_clip": 0.01055105, + "auxiliary_loss_mlp": 0.01001708, + "balance_loss_clip": 0.99987203, + "balance_loss_mlp": 1.02336812, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.643968481445629, + "language_loss": 0.56195372, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58252186, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.31640625, + "step": 4096, + "time_per_iteration": 3.187084436416626 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.04697204, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 2.0390556104017907, + "language_loss": 0.77674699, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79844701, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4097, + "time_per_iteration": 2.5585436820983887 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.02608228, + "balance_loss_mlp": 1.04491377, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.135816170269485, + "language_loss": 0.76393569, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78572309, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.890625, + "step": 4098, + "time_per_iteration": 2.478665828704834 + }, + { + "auxiliary_loss_clip": 0.01133268, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.02568507, + "balance_loss_mlp": 1.04479909, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.152719854213413, + "language_loss": 0.68733507, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70907569, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 4099, + "time_per_iteration": 2.515821933746338 + }, + { + "auxiliary_loss_clip": 0.01124761, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02621734, + "balance_loss_mlp": 1.04163074, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.459538616056665, + "language_loss": 0.65975124, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68141258, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4100, + "time_per_iteration": 2.562962532043457 + }, + { + "auxiliary_loss_clip": 0.01051305, + "auxiliary_loss_mlp": 0.01002462, + "balance_loss_clip": 1.00055432, + "balance_loss_mlp": 1.02057505, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7078763540659354, + "language_loss": 0.61549371, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63603139, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.30859375, + "step": 4101, + "time_per_iteration": 3.1617352962493896 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.01893687, + "balance_loss_mlp": 1.04385781, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7154022892986804, + "language_loss": 0.73020113, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75183737, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4102, + "time_per_iteration": 2.5522637367248535 + }, + { + "auxiliary_loss_clip": 0.01132375, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02463281, + "balance_loss_mlp": 1.04294777, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.2979425011191528, + "language_loss": 0.75574934, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.7774744, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4103, + "time_per_iteration": 2.5117204189300537 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.02647424, + "balance_loss_mlp": 1.04096079, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.002646106823912, + "language_loss": 0.78701174, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80874026, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4104, + "time_per_iteration": 2.5791869163513184 + }, + { + "auxiliary_loss_clip": 0.011264, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02333593, + "balance_loss_mlp": 1.0411272, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.7283937272898544, + "language_loss": 0.83567655, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85735631, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.8515625, + "step": 4105, + "time_per_iteration": 2.447399854660034 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02768457, + "balance_loss_mlp": 1.04806173, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7373974977996043, + "language_loss": 0.7646578, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78643101, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4106, + "time_per_iteration": 2.519059658050537 + }, + { + "auxiliary_loss_clip": 0.01127139, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.02539492, + "balance_loss_mlp": 1.04087114, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.1665884513414513, + "language_loss": 0.72764528, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74933887, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4107, + "time_per_iteration": 2.4489266872406006 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.0454886, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.3712774609847274, + "language_loss": 0.65420353, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67600369, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4108, + "time_per_iteration": 2.5401792526245117 + }, + { + "auxiliary_loss_clip": 0.01131766, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.02961504, + "balance_loss_mlp": 1.04324555, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.174268382145969, + "language_loss": 0.72611141, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74788952, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4109, + "time_per_iteration": 2.593358278274536 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02788687, + "balance_loss_mlp": 1.04414606, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.7026194733932167, + "language_loss": 0.79302657, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81480682, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4110, + "time_per_iteration": 2.4776864051818848 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01618171, + "balance_loss_mlp": 1.04541993, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.5002063230568545, + "language_loss": 0.80653715, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.82819968, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4111, + "time_per_iteration": 2.4957237243652344 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.02517819, + "balance_loss_mlp": 1.04273677, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 2.4547784256207663, + "language_loss": 0.75205207, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77375102, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4112, + "time_per_iteration": 2.481778860092163 + }, + { + "auxiliary_loss_clip": 0.01130648, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02868617, + "balance_loss_mlp": 1.04366612, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9927491285660106, + "language_loss": 0.82454932, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.8462984, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4113, + "time_per_iteration": 2.4658617973327637 + }, + { + "auxiliary_loss_clip": 0.01129834, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.0205375, + "balance_loss_mlp": 1.0423646, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.834925175676511, + "language_loss": 0.87073094, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89239764, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4114, + "time_per_iteration": 2.4575555324554443 + }, + { + "auxiliary_loss_clip": 0.01130204, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.02274156, + "balance_loss_mlp": 1.04354906, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.804779626044085, + "language_loss": 0.753479, + "learning_rate": 3.524328457352734e-06, + "loss": 0.7751627, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4115, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01002125, + "balance_loss_clip": 1.00016963, + "balance_loss_mlp": 1.02261877, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6664049604648837, + "language_loss": 0.58203655, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60258663, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30273438, + "step": 4116, + "time_per_iteration": 3.172032117843628 + }, + { + "auxiliary_loss_clip": 0.01130845, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.01992679, + "balance_loss_mlp": 1.04510772, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.6806447251481575, + "language_loss": 0.83616889, + "learning_rate": 3.523824079451235e-06, + "loss": 0.8578285, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.85546875, + "step": 4117, + "time_per_iteration": 2.5228748321533203 + }, + { + "auxiliary_loss_clip": 0.01053619, + "auxiliary_loss_mlp": 0.0100274, + "balance_loss_clip": 1.00073707, + "balance_loss_mlp": 1.02337885, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9069522642789956, + "language_loss": 0.63507527, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65563887, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30078125, + "step": 4118, + "time_per_iteration": 2.9459333419799805 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02470684, + "balance_loss_mlp": 1.04544902, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5050779056214143, + "language_loss": 0.79252797, + "learning_rate": 3.523319470415491e-06, + "loss": 0.8142485, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4119, + "time_per_iteration": 2.438519239425659 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.02359676, + "balance_loss_mlp": 1.04430819, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.9430586352888408, + "language_loss": 0.73955107, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76124215, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4120, + "time_per_iteration": 2.4728164672851562 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.03010893, + "balance_loss_mlp": 1.0446558, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 3.4886461941998563, + "language_loss": 0.88028777, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90208006, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4121, + "time_per_iteration": 2.4117653369903564 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.02102745, + "balance_loss_mlp": 1.04516518, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.7360865086006285, + "language_loss": 0.69088298, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71260709, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4122, + "time_per_iteration": 2.484830617904663 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.02224231, + "balance_loss_mlp": 1.04380226, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.016808492688271, + "language_loss": 0.80196065, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82369387, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.890625, + "step": 4123, + "time_per_iteration": 2.43839955329895 + }, + { + "auxiliary_loss_clip": 0.01130784, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.02620113, + "balance_loss_mlp": 1.04464054, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.3250466211888745, + "language_loss": 0.74919629, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77091914, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 4124, + "time_per_iteration": 2.4909141063690186 + }, + { + "auxiliary_loss_clip": 0.01127616, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.01922846, + "balance_loss_mlp": 1.0432241, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.6909299882519486, + "language_loss": 0.73759794, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75921559, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4125, + "time_per_iteration": 2.6068458557128906 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.02914929, + "balance_loss_mlp": 1.04383993, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.376019449241759, + "language_loss": 0.69416726, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71598125, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4126, + "time_per_iteration": 2.4516806602478027 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.01047803, + "balance_loss_clip": 1.03112614, + "balance_loss_mlp": 1.04299593, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.081795572279456, + "language_loss": 0.81602275, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83780402, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4127, + "time_per_iteration": 2.482492446899414 + }, + { + "auxiliary_loss_clip": 0.01134053, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04527378, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 5.2721581441441465, + "language_loss": 0.84604752, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86784381, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.88671875, + "step": 4128, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02779055, + "balance_loss_mlp": 1.04397762, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 3.598051635390234, + "language_loss": 0.65576231, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67752188, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4129, + "time_per_iteration": 2.498321294784546 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.04308498, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 2.23477186449736, + "language_loss": 0.75251818, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77425677, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4130, + "time_per_iteration": 2.534014940261841 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.03198647, + "balance_loss_mlp": 1.04404271, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.282827015603824, + "language_loss": 0.77323985, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79505157, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4131, + "time_per_iteration": 2.3971383571624756 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02241063, + "balance_loss_mlp": 1.0428257, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5452946340590639, + "language_loss": 0.83932686, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86097032, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.84375, + "step": 4132, + "time_per_iteration": 2.552804470062256 + }, + { + "auxiliary_loss_clip": 0.01129759, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02601552, + "balance_loss_mlp": 1.04280567, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.4707160060639857, + "language_loss": 0.71077073, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73249108, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4133, + "time_per_iteration": 2.40258526802063 + }, + { + "auxiliary_loss_clip": 0.01138495, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.02486503, + "balance_loss_mlp": 1.0454644, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.206352055564895, + "language_loss": 0.61492884, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63675898, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9296875, + "step": 4134, + "time_per_iteration": 2.476027250289917 + }, + { + "auxiliary_loss_clip": 0.01133349, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.02063298, + "balance_loss_mlp": 1.04393268, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.276340033899988, + "language_loss": 0.78899026, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81069505, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4135, + "time_per_iteration": 3.9668710231781006 + }, + { + "auxiliary_loss_clip": 0.01136879, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 1.04908156, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.12923907223803, + "language_loss": 0.82729924, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84898853, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.87890625, + "step": 4136, + "time_per_iteration": 3.8651821613311768 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02919221, + "balance_loss_mlp": 1.04593039, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7063584090687087, + "language_loss": 0.70454097, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72635514, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4137, + "time_per_iteration": 2.581270456314087 + }, + { + "auxiliary_loss_clip": 0.01135031, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.0193553, + "balance_loss_mlp": 1.04428291, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.0340803052703236, + "language_loss": 0.66840076, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69010115, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4138, + "time_per_iteration": 2.438858985900879 + }, + { + "auxiliary_loss_clip": 0.01130089, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01977062, + "balance_loss_mlp": 1.0451256, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.467393625239628, + "language_loss": 0.83937073, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86102176, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4139, + "time_per_iteration": 2.4858012199401855 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.04416132, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.5320149755260415, + "language_loss": 0.7864905, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80825365, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4140, + "time_per_iteration": 2.4608240127563477 + }, + { + "auxiliary_loss_clip": 0.01058216, + "auxiliary_loss_mlp": 0.01013447, + "balance_loss_clip": 1.01150382, + "balance_loss_mlp": 1.02780879, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8230161703115366, + "language_loss": 0.60980695, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63052356, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.3046875, + "step": 4141, + "time_per_iteration": 3.1306700706481934 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02537298, + "balance_loss_mlp": 1.04692519, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.804889663143828, + "language_loss": 0.72997624, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75176597, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 4142, + "time_per_iteration": 2.60341215133667 + }, + { + "auxiliary_loss_clip": 0.011336, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02335465, + "balance_loss_mlp": 1.04601634, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 2.0852522280017873, + "language_loss": 0.80985868, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83158958, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4143, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.04291701, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.8417531415701045, + "language_loss": 0.5884496, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61008459, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4144, + "time_per_iteration": 2.5253236293792725 + }, + { + "auxiliary_loss_clip": 0.0113091, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.02135301, + "balance_loss_mlp": 1.04400194, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.2350400575734146, + "language_loss": 0.78882402, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81050527, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4145, + "time_per_iteration": 2.500868797302246 + }, + { + "auxiliary_loss_clip": 0.01141282, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.02981293, + "balance_loss_mlp": 1.04593182, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.0986803435557415, + "language_loss": 0.65651333, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.678424, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.953125, + "step": 4146, + "time_per_iteration": 2.482405424118042 + }, + { + "auxiliary_loss_clip": 0.01048172, + "auxiliary_loss_mlp": 0.01006681, + "balance_loss_clip": 1.00482178, + "balance_loss_mlp": 1.01849687, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 3.0854856510049458, + "language_loss": 0.67327654, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69382501, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.296875, + "step": 4147, + "time_per_iteration": 3.1769258975982666 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.02930617, + "balance_loss_mlp": 1.04857254, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 2.0368820911017025, + "language_loss": 0.8893261, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91115361, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4148, + "time_per_iteration": 2.5202085971832275 + }, + { + "auxiliary_loss_clip": 0.0113885, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02649307, + "balance_loss_mlp": 1.04754162, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8605307211390085, + "language_loss": 0.68053228, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70237827, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9140625, + "step": 4149, + "time_per_iteration": 2.455733060836792 + }, + { + "auxiliary_loss_clip": 0.01133288, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02291596, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 2.99652773874907, + "language_loss": 0.71235985, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73408163, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4150, + "time_per_iteration": 2.514190196990967 + }, + { + "auxiliary_loss_clip": 0.01134014, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.03256035, + "balance_loss_mlp": 1.04471052, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.862035570914478, + "language_loss": 0.72954226, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75137556, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4151, + "time_per_iteration": 2.4198975563049316 + }, + { + "auxiliary_loss_clip": 0.01141172, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.03213382, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 4.099427504771762, + "language_loss": 0.62436807, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64627266, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94140625, + "step": 4152, + "time_per_iteration": 2.563032865524292 + }, + { + "auxiliary_loss_clip": 0.01131413, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.02188039, + "balance_loss_mlp": 1.04631066, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.3735561607913596, + "language_loss": 0.77219248, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79388708, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4153, + "time_per_iteration": 2.5059967041015625 + }, + { + "auxiliary_loss_clip": 0.01132512, + "auxiliary_loss_mlp": 0.01040076, + "balance_loss_clip": 1.0242573, + "balance_loss_mlp": 1.04642224, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.164577963489155, + "language_loss": 0.76443702, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78616285, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4154, + "time_per_iteration": 2.48317551612854 + }, + { + "auxiliary_loss_clip": 0.01138697, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.02586532, + "balance_loss_mlp": 1.04451203, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.2000943153895722, + "language_loss": 0.70740849, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72924054, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 4155, + "time_per_iteration": 2.498227834701538 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.03379464, + "balance_loss_mlp": 1.04736114, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.8252469259439843, + "language_loss": 0.7499637, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77184427, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4156, + "time_per_iteration": 2.473536729812622 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.0244987, + "balance_loss_mlp": 1.04498601, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.1247768054564333, + "language_loss": 0.76757634, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78929752, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4157, + "time_per_iteration": 2.476402759552002 + }, + { + "auxiliary_loss_clip": 0.01135567, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02821517, + "balance_loss_mlp": 1.04551077, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6430173172536622, + "language_loss": 0.81497854, + "learning_rate": 3.513433506130942e-06, + "loss": 0.8367821, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4158, + "time_per_iteration": 2.4706146717071533 + }, + { + "auxiliary_loss_clip": 0.01134661, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01533163, + "balance_loss_mlp": 1.04511046, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.425058111765743, + "language_loss": 0.75573325, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77739644, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.89453125, + "step": 4159, + "time_per_iteration": 2.447530746459961 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.01928759, + "balance_loss_mlp": 1.04643881, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.3851333770237044, + "language_loss": 0.71434534, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73608989, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 4160, + "time_per_iteration": 2.4909448623657227 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01008394, + "balance_loss_clip": 1.0062604, + "balance_loss_mlp": 1.01615632, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7574731626167057, + "language_loss": 0.56755257, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58809221, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.29492188, + "step": 4161, + "time_per_iteration": 3.1169064044952393 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04854345, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.822598728260487, + "language_loss": 0.8071059, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82899845, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 4162, + "time_per_iteration": 2.4679477214813232 + }, + { + "auxiliary_loss_clip": 0.01136921, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02212739, + "balance_loss_mlp": 1.04364812, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.543272880301035, + "language_loss": 0.87439299, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89615595, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 4163, + "time_per_iteration": 2.411324977874756 + }, + { + "auxiliary_loss_clip": 0.01135069, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.02013874, + "balance_loss_mlp": 1.04609334, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.8835095650007205, + "language_loss": 0.83242726, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85414505, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4164, + "time_per_iteration": 2.4910058975219727 + }, + { + "auxiliary_loss_clip": 0.01130392, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_clip": 1.03235698, + "balance_loss_mlp": 1.04616356, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.7333709529875627, + "language_loss": 0.74548686, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76726139, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 4165, + "time_per_iteration": 2.4566714763641357 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.02819216, + "balance_loss_mlp": 1.04689348, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 5.301488379412456, + "language_loss": 0.74214685, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76400197, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4166, + "time_per_iteration": 2.462092161178589 + }, + { + "auxiliary_loss_clip": 0.01134276, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.02898526, + "balance_loss_mlp": 1.04551435, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.9752225074857819, + "language_loss": 0.82011521, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84191239, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4167, + "time_per_iteration": 2.482534885406494 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.0280689, + "balance_loss_mlp": 1.04616201, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.42679689243218, + "language_loss": 0.79602242, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81781083, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4168, + "time_per_iteration": 2.463700532913208 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.02353752, + "balance_loss_mlp": 1.04523754, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.966293758738445, + "language_loss": 0.70029891, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72211224, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9375, + "step": 4169, + "time_per_iteration": 2.6148693561553955 + }, + { + "auxiliary_loss_clip": 0.01131562, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.02926338, + "balance_loss_mlp": 1.0446701, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8138505316100015, + "language_loss": 0.77564663, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79741603, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4170, + "time_per_iteration": 2.522921562194824 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.02731323, + "balance_loss_mlp": 1.04796529, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.4512078878938404, + "language_loss": 0.76246989, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78427839, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8984375, + "step": 4171, + "time_per_iteration": 2.4322195053100586 + }, + { + "auxiliary_loss_clip": 0.01046694, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.01739454, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8497756598481241, + "language_loss": 0.60047227, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62115091, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29296875, + "step": 4172, + "time_per_iteration": 3.1110994815826416 + }, + { + "auxiliary_loss_clip": 0.01137052, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.04652381, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4442293166181488, + "language_loss": 0.78647727, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80827463, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 4173, + "time_per_iteration": 2.481062889099121 + }, + { + "auxiliary_loss_clip": 0.01140203, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.0300796, + "balance_loss_mlp": 1.05017626, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.4202296115923883, + "language_loss": 0.83543748, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85730493, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4174, + "time_per_iteration": 2.4566147327423096 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02299595, + "balance_loss_mlp": 1.04786515, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.0903096624482624, + "language_loss": 0.71291864, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73470795, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90234375, + "step": 4175, + "time_per_iteration": 2.4616360664367676 + }, + { + "auxiliary_loss_clip": 0.01138348, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.01866269, + "balance_loss_mlp": 1.0460453, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.247188920587568, + "language_loss": 0.80564427, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82739055, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4176, + "time_per_iteration": 2.525686740875244 + }, + { + "auxiliary_loss_clip": 0.01138723, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.02553427, + "balance_loss_mlp": 1.04782593, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.362252442770041, + "language_loss": 0.83099151, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8528198, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.90625, + "step": 4177, + "time_per_iteration": 5.424759387969971 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02675891, + "balance_loss_mlp": 1.04777622, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.9753996759374846, + "language_loss": 0.8209883, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84278357, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87890625, + "step": 4178, + "time_per_iteration": 2.451418161392212 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04445124, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.6671564243834505, + "language_loss": 0.75406277, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77579463, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4179, + "time_per_iteration": 2.4710347652435303 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.03050375, + "balance_loss_mlp": 1.04526711, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.189208999533023, + "language_loss": 0.70452499, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72636557, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.90625, + "step": 4180, + "time_per_iteration": 2.433922290802002 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02485168, + "balance_loss_mlp": 1.04449701, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 2.0603947372587244, + "language_loss": 0.85379761, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.875539, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4181, + "time_per_iteration": 2.4513771533966064 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01042648, + "balance_loss_clip": 1.02608991, + "balance_loss_mlp": 1.0464716, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.9568163341605829, + "language_loss": 0.67662674, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69840884, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4182, + "time_per_iteration": 2.588513135910034 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.01008874, + "balance_loss_clip": 1.00675201, + "balance_loss_mlp": 1.01742792, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8484678873575391, + "language_loss": 0.70098495, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72154456, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.296875, + "step": 4183, + "time_per_iteration": 3.0990090370178223 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02088118, + "balance_loss_mlp": 1.04070854, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.7162399200173233, + "language_loss": 0.7452544, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76694012, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4184, + "time_per_iteration": 2.4367544651031494 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.02888608, + "balance_loss_mlp": 1.04825735, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.9130230292696613, + "language_loss": 0.82872695, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85055834, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4185, + "time_per_iteration": 2.4604692459106445 + }, + { + "auxiliary_loss_clip": 0.01047588, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.01820421, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7885291752286397, + "language_loss": 0.61534387, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63585937, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.29296875, + "step": 4186, + "time_per_iteration": 2.9629924297332764 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02320647, + "balance_loss_mlp": 1.04432559, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.1070381215060308, + "language_loss": 0.79260957, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81435084, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4187, + "time_per_iteration": 2.454988479614258 + }, + { + "auxiliary_loss_clip": 0.01136483, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.03235006, + "balance_loss_mlp": 1.04733062, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5254881034867085, + "language_loss": 0.79854965, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82040906, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4188, + "time_per_iteration": 2.4807493686676025 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_clip": 1.03022218, + "balance_loss_mlp": 1.04635882, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.727912733373243, + "language_loss": 0.74509478, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76691031, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4189, + "time_per_iteration": 2.4887545108795166 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.02914619, + "balance_loss_mlp": 1.04616165, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.957544272457229, + "language_loss": 0.84454727, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86630988, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4190, + "time_per_iteration": 2.4629735946655273 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.02633452, + "balance_loss_mlp": 1.04529381, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.9468541382775664, + "language_loss": 0.75593925, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77772641, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.88671875, + "step": 4191, + "time_per_iteration": 2.451493263244629 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.0101771, + "balance_loss_clip": 1.01577878, + "balance_loss_mlp": 1.01320672, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7165761170014687, + "language_loss": 0.57155997, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59216374, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29492188, + "step": 4192, + "time_per_iteration": 3.1455304622650146 + }, + { + "auxiliary_loss_clip": 0.01132992, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.02593958, + "balance_loss_mlp": 1.04640245, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.0419031963399434, + "language_loss": 0.76306844, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78481936, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4193, + "time_per_iteration": 2.46201491355896 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.03115189, + "balance_loss_mlp": 1.04506373, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.1192679618590007, + "language_loss": 0.84261906, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86446548, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4194, + "time_per_iteration": 2.4525146484375 + }, + { + "auxiliary_loss_clip": 0.01138489, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.03404951, + "balance_loss_mlp": 1.04636192, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.488794247862028, + "language_loss": 0.88176262, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90364158, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.921875, + "step": 4195, + "time_per_iteration": 2.507788896560669 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.02323329, + "balance_loss_mlp": 1.04540074, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.8940350432545787, + "language_loss": 0.85288155, + "learning_rate": 3.503717062883053e-06, + "loss": 0.87466824, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.90625, + "step": 4196, + "time_per_iteration": 2.4843344688415527 + }, + { + "auxiliary_loss_clip": 0.01135455, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02644312, + "balance_loss_mlp": 1.0454607, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6596186150335415, + "language_loss": 0.83368516, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85546911, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4197, + "time_per_iteration": 2.480834484100342 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.02840698, + "balance_loss_mlp": 1.04775643, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.7573342641631093, + "language_loss": 0.72406292, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74593097, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9140625, + "step": 4198, + "time_per_iteration": 2.6081368923187256 + }, + { + "auxiliary_loss_clip": 0.01139571, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.03469038, + "balance_loss_mlp": 1.0462662, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9511850390779815, + "language_loss": 0.76798427, + "learning_rate": 3.50294646148888e-06, + "loss": 0.7899096, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.93359375, + "step": 4199, + "time_per_iteration": 2.463322162628174 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.02334595, + "balance_loss_mlp": 1.04600453, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.6881838085079777, + "language_loss": 0.727651, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74941385, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 4200, + "time_per_iteration": 2.586298942565918 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.03206062, + "balance_loss_mlp": 1.04300654, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.7166145531144803, + "language_loss": 0.82271791, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84454548, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.91796875, + "step": 4201, + "time_per_iteration": 2.6430721282958984 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.02960861, + "balance_loss_mlp": 1.04680324, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8945534984036327, + "language_loss": 0.74844849, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77029681, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4202, + "time_per_iteration": 2.477376699447632 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.02545786, + "balance_loss_mlp": 1.04550529, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.8769942277842264, + "language_loss": 0.73058856, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75234556, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 4203, + "time_per_iteration": 2.4526968002319336 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.02403569, + "balance_loss_mlp": 1.04434335, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.7217444479200419, + "language_loss": 0.77377844, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79553127, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90234375, + "step": 4204, + "time_per_iteration": 2.540573835372925 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04443574, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 3.2226665017353655, + "language_loss": 0.72443974, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74631095, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4205, + "time_per_iteration": 2.405823230743408 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.02304697, + "balance_loss_mlp": 1.04673433, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4419344159614245, + "language_loss": 0.75674903, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77844942, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4206, + "time_per_iteration": 2.7117254734039307 + }, + { + "auxiliary_loss_clip": 0.01134608, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02502584, + "balance_loss_mlp": 1.04381466, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.8459801280493204, + "language_loss": 0.79013956, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81190026, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4207, + "time_per_iteration": 2.4338433742523193 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.02688169, + "balance_loss_mlp": 1.04521704, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5263501886522268, + "language_loss": 0.76010746, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78184819, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4208, + "time_per_iteration": 2.4712774753570557 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.02559781, + "balance_loss_mlp": 1.04407811, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.8494822470113228, + "language_loss": 0.6965062, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.71824062, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87109375, + "step": 4209, + "time_per_iteration": 2.4723262786865234 + }, + { + "auxiliary_loss_clip": 0.01046036, + "auxiliary_loss_mlp": 0.00999993, + "balance_loss_clip": 0.99819291, + "balance_loss_mlp": 1.01643014, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7581785291884388, + "language_loss": 0.55080217, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57126248, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.296875, + "step": 4210, + "time_per_iteration": 3.141958236694336 + }, + { + "auxiliary_loss_clip": 0.0113523, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.04541481, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 2.0581011511690606, + "language_loss": 0.8021341, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82383299, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4211, + "time_per_iteration": 2.4423909187316895 + }, + { + "auxiliary_loss_clip": 0.01128499, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02402079, + "balance_loss_mlp": 1.04284227, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.6375033978461933, + "language_loss": 0.78310406, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80478293, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4212, + "time_per_iteration": 2.535416841506958 + }, + { + "auxiliary_loss_clip": 0.01131331, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.04314673, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.0206536972721088, + "language_loss": 0.53393918, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55565375, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4213, + "time_per_iteration": 2.488844871520996 + }, + { + "auxiliary_loss_clip": 0.01132972, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02566385, + "balance_loss_mlp": 1.04508567, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.6682600080383816, + "language_loss": 0.65329081, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67504859, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4214, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01043385, + "auxiliary_loss_mlp": 0.0100812, + "balance_loss_clip": 1.00630808, + "balance_loss_mlp": 1.0142169, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8479929036578698, + "language_loss": 0.58049941, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60101438, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29296875, + "step": 4215, + "time_per_iteration": 2.824084997177124 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.02636075, + "balance_loss_mlp": 1.04583967, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7693463876532338, + "language_loss": 0.83949232, + "learning_rate": 3.498570039373066e-06, + "loss": 0.86126143, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.875, + "step": 4216, + "time_per_iteration": 2.650329828262329 + }, + { + "auxiliary_loss_clip": 0.01134428, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02504706, + "balance_loss_mlp": 1.04571652, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.7652170119003572, + "language_loss": 0.80028123, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82204342, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4217, + "time_per_iteration": 2.49381160736084 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.02422011, + "balance_loss_mlp": 1.04193234, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.1701414828965464, + "language_loss": 0.75014293, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.7718327, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87890625, + "step": 4218, + "time_per_iteration": 2.4794864654541016 + }, + { + "auxiliary_loss_clip": 0.01135591, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.02462721, + "balance_loss_mlp": 1.04470503, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.8718582993796022, + "language_loss": 0.74483025, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76660055, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4219, + "time_per_iteration": 5.428370952606201 + }, + { + "auxiliary_loss_clip": 0.01137942, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.04695058, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 2.1507448030921057, + "language_loss": 0.81194967, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83385527, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4220, + "time_per_iteration": 2.454045534133911 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.03007603, + "balance_loss_mlp": 1.04596126, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.058400170489012, + "language_loss": 0.70873475, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73056173, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4221, + "time_per_iteration": 2.4728429317474365 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.02309537, + "balance_loss_mlp": 1.0444454, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 2.3290205392002847, + "language_loss": 0.62039649, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64213717, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4222, + "time_per_iteration": 2.4465436935424805 + }, + { + "auxiliary_loss_clip": 0.01137839, + "auxiliary_loss_mlp": 0.01052178, + "balance_loss_clip": 1.0352385, + "balance_loss_mlp": 1.04635429, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.6514367228652884, + "language_loss": 0.74686599, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76876616, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4223, + "time_per_iteration": 2.449887275695801 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.02001095, + "balance_loss_mlp": 1.04763556, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.7274606282993847, + "language_loss": 0.79782087, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81952935, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4224, + "time_per_iteration": 2.4809348583221436 + }, + { + "auxiliary_loss_clip": 0.01129812, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.02668035, + "balance_loss_mlp": 1.04306865, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.7388314634599362, + "language_loss": 0.77813148, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79986417, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4225, + "time_per_iteration": 2.4813735485076904 + }, + { + "auxiliary_loss_clip": 0.01135622, + "auxiliary_loss_mlp": 0.01054065, + "balance_loss_clip": 1.03661263, + "balance_loss_mlp": 1.04603362, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6070040517314534, + "language_loss": 0.84763634, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86953318, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.89453125, + "step": 4226, + "time_per_iteration": 2.4583990573883057 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.0277667, + "balance_loss_mlp": 1.04317141, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 2.4872704745527168, + "language_loss": 0.70759654, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.72934765, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8671875, + "step": 4227, + "time_per_iteration": 2.532057762145996 + }, + { + "auxiliary_loss_clip": 0.01041509, + "auxiliary_loss_mlp": 0.01000975, + "balance_loss_clip": 0.9989962, + "balance_loss_mlp": 1.01186037, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9701035361715339, + "language_loss": 0.61865914, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63908398, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.296875, + "step": 4228, + "time_per_iteration": 2.9040682315826416 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.02026105, + "balance_loss_mlp": 1.04564357, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 4.885618231754604, + "language_loss": 0.86024547, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88198459, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 4229, + "time_per_iteration": 2.404157876968384 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.02696753, + "balance_loss_mlp": 1.0466435, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.8862111092995248, + "language_loss": 0.77280557, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79459918, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4230, + "time_per_iteration": 2.4956207275390625 + }, + { + "auxiliary_loss_clip": 0.01133757, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.02980483, + "balance_loss_mlp": 1.04598594, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.9381647251913205, + "language_loss": 0.75116754, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77297449, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4231, + "time_per_iteration": 2.4570302963256836 + }, + { + "auxiliary_loss_clip": 0.0113225, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.02253127, + "balance_loss_mlp": 1.04484463, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.3236339630790916, + "language_loss": 0.74055511, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76226532, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4232, + "time_per_iteration": 2.4537932872772217 + }, + { + "auxiliary_loss_clip": 0.01134838, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.04658151, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.8521853851823955, + "language_loss": 0.86557174, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88733703, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4233, + "time_per_iteration": 2.4943323135375977 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.02570057, + "balance_loss_mlp": 1.04215169, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.5280608213400515, + "language_loss": 0.74841732, + "learning_rate": 3.493918281539737e-06, + "loss": 0.7700814, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 4234, + "time_per_iteration": 2.541349172592163 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.02661681, + "balance_loss_mlp": 1.04286838, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.542232814469661, + "language_loss": 0.7489568, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77071816, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.90625, + "step": 4235, + "time_per_iteration": 2.5059099197387695 + }, + { + "auxiliary_loss_clip": 0.01141785, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02570069, + "balance_loss_mlp": 1.04655004, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0015253194085645, + "language_loss": 0.64487904, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6667403, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 4236, + "time_per_iteration": 2.512286424636841 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.01757693, + "balance_loss_mlp": 1.04509079, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5430935122242522, + "language_loss": 0.67046815, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69211423, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 4237, + "time_per_iteration": 2.455911636352539 + }, + { + "auxiliary_loss_clip": 0.01134325, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.02916634, + "balance_loss_mlp": 1.04509199, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9754127990153556, + "language_loss": 0.74863333, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77043563, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4238, + "time_per_iteration": 2.4770114421844482 + }, + { + "auxiliary_loss_clip": 0.01136693, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.02848125, + "balance_loss_mlp": 1.04734778, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8397193389954023, + "language_loss": 0.8033936, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82522523, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4239, + "time_per_iteration": 2.5087499618530273 + }, + { + "auxiliary_loss_clip": 0.01131893, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.02457762, + "balance_loss_mlp": 1.04512548, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.749971041952711, + "language_loss": 0.77208781, + "learning_rate": 3.492363614004407e-06, + "loss": 0.7938236, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4240, + "time_per_iteration": 2.4757072925567627 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.02463925, + "balance_loss_mlp": 1.04773092, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 2.0511352101670126, + "language_loss": 0.83254647, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85438156, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.93359375, + "step": 4241, + "time_per_iteration": 2.5062708854675293 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.02761221, + "balance_loss_mlp": 1.0463624, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6663950411566644, + "language_loss": 0.73410285, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75590432, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4242, + "time_per_iteration": 2.5570173263549805 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02249646, + "balance_loss_mlp": 1.04695976, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.4092613771466453, + "language_loss": 0.72371018, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74545956, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4243, + "time_per_iteration": 2.440492868423462 + }, + { + "auxiliary_loss_clip": 0.01136318, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02507675, + "balance_loss_mlp": 1.04668963, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 2.3937572910440847, + "language_loss": 0.81865323, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84043133, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4244, + "time_per_iteration": 2.4728784561157227 + }, + { + "auxiliary_loss_clip": 0.01044231, + "auxiliary_loss_mlp": 0.01002536, + "balance_loss_clip": 1.00084293, + "balance_loss_mlp": 1.01474202, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7400094393930867, + "language_loss": 0.5777986, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5982663, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.29492188, + "step": 4245, + "time_per_iteration": 3.155487537384033 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.03248656, + "balance_loss_mlp": 1.04526567, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 1.9776048921576397, + "language_loss": 0.65246034, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67430878, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90234375, + "step": 4246, + "time_per_iteration": 2.4889461994171143 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04366493, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.748925776992144, + "language_loss": 0.81467927, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83637214, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4247, + "time_per_iteration": 2.4680213928222656 + }, + { + "auxiliary_loss_clip": 0.0114026, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.02718902, + "balance_loss_mlp": 1.04570985, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.9702547035135165, + "language_loss": 0.83062297, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85248411, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9453125, + "step": 4248, + "time_per_iteration": 2.446810245513916 + }, + { + "auxiliary_loss_clip": 0.01136577, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.02793586, + "balance_loss_mlp": 1.04672599, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.21885342952208, + "language_loss": 0.84529531, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86711109, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4249, + "time_per_iteration": 2.4372382164001465 + }, + { + "auxiliary_loss_clip": 0.01044447, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.01503897, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7531523874953217, + "language_loss": 0.56312215, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58360648, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29492188, + "step": 4250, + "time_per_iteration": 3.047654628753662 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.02139914, + "balance_loss_mlp": 1.04434705, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.1374171101673243, + "language_loss": 0.80306417, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82478344, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4251, + "time_per_iteration": 2.4866387844085693 + }, + { + "auxiliary_loss_clip": 0.01042955, + "auxiliary_loss_mlp": 0.01004928, + "balance_loss_clip": 1.00307989, + "balance_loss_mlp": 1.01383376, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7958061962206047, + "language_loss": 0.66077995, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6812588, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.29296875, + "step": 4252, + "time_per_iteration": 3.117496967315674 + }, + { + "auxiliary_loss_clip": 0.011309, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.04373813, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.169743717969613, + "language_loss": 0.73382849, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75550812, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4253, + "time_per_iteration": 2.5709948539733887 + }, + { + "auxiliary_loss_clip": 0.01134729, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.02873516, + "balance_loss_mlp": 1.04698956, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9741012093631007, + "language_loss": 0.72927308, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75106484, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4254, + "time_per_iteration": 2.509932518005371 + }, + { + "auxiliary_loss_clip": 0.01133463, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.02636361, + "balance_loss_mlp": 1.04452896, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7290530974650873, + "language_loss": 0.80863065, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.8304013, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4255, + "time_per_iteration": 2.4473092555999756 + }, + { + "auxiliary_loss_clip": 0.01133499, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.02681875, + "balance_loss_mlp": 1.04673088, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.512169748685899, + "language_loss": 0.85572308, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87749302, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4256, + "time_per_iteration": 2.500788927078247 + }, + { + "auxiliary_loss_clip": 0.01136428, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.04482555, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 4.026866255210063, + "language_loss": 0.74821836, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77006626, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4257, + "time_per_iteration": 2.4511358737945557 + }, + { + "auxiliary_loss_clip": 0.01040508, + "auxiliary_loss_mlp": 0.01009541, + "balance_loss_clip": 1.00763345, + "balance_loss_mlp": 1.01154876, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8061088541165783, + "language_loss": 0.65227318, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67277366, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.2890625, + "step": 4258, + "time_per_iteration": 2.9953789710998535 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.04548264, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.622828615893818, + "language_loss": 0.7647177, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78641111, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.859375, + "step": 4259, + "time_per_iteration": 2.5079360008239746 + }, + { + "auxiliary_loss_clip": 0.01038142, + "auxiliary_loss_mlp": 0.01004188, + "balance_loss_clip": 1.00237584, + "balance_loss_mlp": 1.0093925, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7946947905759578, + "language_loss": 0.58501768, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60544097, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.28710938, + "step": 4260, + "time_per_iteration": 4.636982202529907 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.02759969, + "balance_loss_mlp": 1.04300261, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8728817118968701, + "language_loss": 0.76659095, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.7883479, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4261, + "time_per_iteration": 3.974956750869751 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.02280843, + "balance_loss_mlp": 1.04460573, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.6516780840688012, + "language_loss": 0.8323037, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85399115, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4262, + "time_per_iteration": 2.5251948833465576 + }, + { + "auxiliary_loss_clip": 0.01136997, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.03123951, + "balance_loss_mlp": 1.04404712, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.7380780768968016, + "language_loss": 0.74153852, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76339698, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 4263, + "time_per_iteration": 2.42657208442688 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01051758, + "balance_loss_clip": 1.03533101, + "balance_loss_mlp": 1.04720163, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.7828084139599185, + "language_loss": 0.82793939, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84979165, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4264, + "time_per_iteration": 2.534097194671631 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.02506804, + "balance_loss_mlp": 1.04660988, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.7080317762970965, + "language_loss": 0.7443161, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76608008, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 4265, + "time_per_iteration": 2.51088809967041 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01679027, + "balance_loss_mlp": 1.0425024, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.644190377842657, + "language_loss": 0.8153013, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83692515, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4266, + "time_per_iteration": 2.4706335067749023 + }, + { + "auxiliary_loss_clip": 0.01128054, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.01909137, + "balance_loss_mlp": 1.04252076, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6333370834261398, + "language_loss": 0.79287028, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81450343, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4267, + "time_per_iteration": 2.4819366931915283 + }, + { + "auxiliary_loss_clip": 0.01127366, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.02480555, + "balance_loss_mlp": 1.04406714, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.7559000109968124, + "language_loss": 0.78708017, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.80876482, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4268, + "time_per_iteration": 2.4778378009796143 + }, + { + "auxiliary_loss_clip": 0.0113239, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.02198434, + "balance_loss_mlp": 1.04507172, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 2.2514359992660204, + "language_loss": 0.68120348, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70290613, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4269, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01134604, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.0249877, + "balance_loss_mlp": 1.04593778, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 4.018282830570473, + "language_loss": 0.78496158, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80672824, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4270, + "time_per_iteration": 2.418912172317505 + }, + { + "auxiliary_loss_clip": 0.01139603, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.04711556, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.0502449379686256, + "language_loss": 0.68136632, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70314038, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4271, + "time_per_iteration": 2.5410749912261963 + }, + { + "auxiliary_loss_clip": 0.01137314, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.0325973, + "balance_loss_mlp": 1.04592848, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 4.518410893879739, + "language_loss": 0.8741951, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8960675, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4272, + "time_per_iteration": 2.5022568702697754 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02734506, + "balance_loss_mlp": 1.04770613, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.953603621991432, + "language_loss": 0.81442308, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83624303, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4273, + "time_per_iteration": 2.453834295272827 + }, + { + "auxiliary_loss_clip": 0.01131691, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.02492929, + "balance_loss_mlp": 1.04724693, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.682161023261006, + "language_loss": 0.77215779, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79389334, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4274, + "time_per_iteration": 2.486238956451416 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02061856, + "balance_loss_mlp": 1.04450369, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.8548211040661395, + "language_loss": 0.8401829, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86185247, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4275, + "time_per_iteration": 2.5145719051361084 + }, + { + "auxiliary_loss_clip": 0.01133209, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.02078438, + "balance_loss_mlp": 1.04492021, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 3.0116628321367678, + "language_loss": 0.78124094, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80294812, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4276, + "time_per_iteration": 2.533989906311035 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.02723646, + "balance_loss_mlp": 1.04575086, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.750550841347414, + "language_loss": 0.79439288, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81616199, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4277, + "time_per_iteration": 2.5131442546844482 + }, + { + "auxiliary_loss_clip": 0.01134263, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.01981688, + "balance_loss_mlp": 1.04671657, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.0431628844466543, + "language_loss": 0.78804862, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80975372, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4278, + "time_per_iteration": 2.4813432693481445 + }, + { + "auxiliary_loss_clip": 0.01137794, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.02299643, + "balance_loss_mlp": 1.04657972, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.020871128069371, + "language_loss": 0.74624676, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76802039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4279, + "time_per_iteration": 2.4989213943481445 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.03472984, + "balance_loss_mlp": 1.04528475, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.295268067844067, + "language_loss": 0.85406947, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87595296, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4280, + "time_per_iteration": 2.479163408279419 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.02362585, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.2211313624852447, + "language_loss": 0.78780186, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80957377, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4281, + "time_per_iteration": 2.463003158569336 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02277303, + "balance_loss_mlp": 1.0472312, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.9444978312753, + "language_loss": 0.87356091, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89530122, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4282, + "time_per_iteration": 2.5049889087677 + }, + { + "auxiliary_loss_clip": 0.01137104, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04648709, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5754049466604292, + "language_loss": 0.70172656, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72352946, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 4283, + "time_per_iteration": 2.520315408706665 + }, + { + "auxiliary_loss_clip": 0.01132284, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.02698922, + "balance_loss_mlp": 1.04772711, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 2.712350413324169, + "language_loss": 0.80323613, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82498109, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 4284, + "time_per_iteration": 2.483292579650879 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.02538466, + "balance_loss_mlp": 1.04674387, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.1742402973432893, + "language_loss": 0.70485193, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72659695, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4285, + "time_per_iteration": 2.564211130142212 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.02282071, + "balance_loss_mlp": 1.04953337, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.328286971317511, + "language_loss": 0.58380014, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60555518, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87890625, + "step": 4286, + "time_per_iteration": 2.4425430297851562 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02702272, + "balance_loss_mlp": 1.04858327, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.6452331987585218, + "language_loss": 0.64191288, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66374773, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 4287, + "time_per_iteration": 2.470015287399292 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04739881, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.0830358142366148, + "language_loss": 0.72029591, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74209672, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4288, + "time_per_iteration": 2.4983417987823486 + }, + { + "auxiliary_loss_clip": 0.01135736, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.02263355, + "balance_loss_mlp": 1.04882312, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.9870049696680936, + "language_loss": 0.76965904, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79140055, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4289, + "time_per_iteration": 2.4997475147247314 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02311635, + "balance_loss_mlp": 1.04562807, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.946897603323323, + "language_loss": 0.85123539, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87298238, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4290, + "time_per_iteration": 2.454871416091919 + }, + { + "auxiliary_loss_clip": 0.01140117, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.03159952, + "balance_loss_mlp": 1.04959655, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.195715426849753, + "language_loss": 0.72170424, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74361074, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4291, + "time_per_iteration": 2.4512693881988525 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02436781, + "balance_loss_mlp": 1.05002344, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4805881311796423, + "language_loss": 0.80718195, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82901633, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4292, + "time_per_iteration": 2.469034433364868 + }, + { + "auxiliary_loss_clip": 0.01141659, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02767503, + "balance_loss_mlp": 1.05171072, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 7.501455001056755, + "language_loss": 0.67646754, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69833219, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4293, + "time_per_iteration": 2.5785787105560303 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.02660704, + "balance_loss_mlp": 1.04503, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.9136357435420137, + "language_loss": 0.75409257, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77581787, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4294, + "time_per_iteration": 2.5044636726379395 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.02974749, + "balance_loss_mlp": 1.04808116, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.168244565891273, + "language_loss": 0.81049722, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83233249, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4295, + "time_per_iteration": 2.5875558853149414 + }, + { + "auxiliary_loss_clip": 0.01140472, + "auxiliary_loss_mlp": 0.01046123, + "balance_loss_clip": 1.02797985, + "balance_loss_mlp": 1.04796624, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.1973562505628026, + "language_loss": 0.72515166, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74701762, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.92578125, + "step": 4296, + "time_per_iteration": 2.535693407058716 + }, + { + "auxiliary_loss_clip": 0.01138613, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.02322531, + "balance_loss_mlp": 1.04918242, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.8330269406357795, + "language_loss": 0.86766148, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88944662, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4297, + "time_per_iteration": 2.5001306533813477 + }, + { + "auxiliary_loss_clip": 0.01137068, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.03208232, + "balance_loss_mlp": 1.04755223, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 2.2622150737063955, + "language_loss": 0.84706259, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86891592, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4298, + "time_per_iteration": 2.489917278289795 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02069676, + "balance_loss_mlp": 1.04739285, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.0676974538336266, + "language_loss": 0.83596241, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85770899, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4299, + "time_per_iteration": 2.4274845123291016 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.01985788, + "balance_loss_mlp": 1.04795814, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.477231855960524, + "language_loss": 0.82685435, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84856081, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4300, + "time_per_iteration": 2.4730846881866455 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.02492332, + "balance_loss_mlp": 1.04620934, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.2046546957653077, + "language_loss": 0.67186987, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69365752, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 4301, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.02191997, + "balance_loss_mlp": 1.04805672, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.459016606739088, + "language_loss": 0.80929118, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83110034, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 4302, + "time_per_iteration": 5.438407897949219 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.02535129, + "balance_loss_mlp": 1.04789591, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 2.9925401825996545, + "language_loss": 0.92246419, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94426608, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4303, + "time_per_iteration": 2.514573574066162 + }, + { + "auxiliary_loss_clip": 0.01139372, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.020046, + "balance_loss_mlp": 1.04932761, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8070234866344623, + "language_loss": 0.67034984, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69210964, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4304, + "time_per_iteration": 2.540682315826416 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03355694, + "balance_loss_mlp": 1.04595923, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.73594521825367, + "language_loss": 0.72829735, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75018799, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4305, + "time_per_iteration": 2.580801248550415 + }, + { + "auxiliary_loss_clip": 0.01138565, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 1.04731607, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.196623082948333, + "language_loss": 0.75595653, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77775478, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4306, + "time_per_iteration": 2.44267201423645 + }, + { + "auxiliary_loss_clip": 0.01045399, + "auxiliary_loss_mlp": 0.01003539, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.01567113, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8506593293873899, + "language_loss": 0.5717386, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59222794, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.296875, + "step": 4307, + "time_per_iteration": 3.0457189083099365 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.02389181, + "balance_loss_mlp": 1.04729199, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.7695447826328226, + "language_loss": 0.71543598, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73719311, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4308, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.0113812, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.02439809, + "balance_loss_mlp": 1.04625905, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 2.097007373458932, + "language_loss": 0.84195936, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86375141, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4309, + "time_per_iteration": 2.458937883377075 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02609062, + "balance_loss_mlp": 1.04893243, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.520786669442297, + "language_loss": 0.8451637, + "learning_rate": 3.474075855228966e-06, + "loss": 0.8669641, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4310, + "time_per_iteration": 2.453946828842163 + }, + { + "auxiliary_loss_clip": 0.0113925, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.02706194, + "balance_loss_mlp": 1.04705715, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.3904067628525305, + "language_loss": 0.77478111, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79660702, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 4311, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.0248189, + "balance_loss_mlp": 1.04691362, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 3.1447136536803852, + "language_loss": 0.72220832, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74400491, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 4312, + "time_per_iteration": 2.5275332927703857 + }, + { + "auxiliary_loss_clip": 0.01134993, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.02434921, + "balance_loss_mlp": 1.04480851, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 2.2264539824076683, + "language_loss": 0.69908661, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72084355, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4313, + "time_per_iteration": 2.479011058807373 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02800131, + "balance_loss_mlp": 1.04467726, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.7186396349483555, + "language_loss": 0.80486274, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82663202, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4314, + "time_per_iteration": 2.443934679031372 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.03030992, + "balance_loss_mlp": 1.04506671, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.0498851814527863, + "language_loss": 0.6687156, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69057429, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 4315, + "time_per_iteration": 2.5375983715057373 + }, + { + "auxiliary_loss_clip": 0.01132586, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.04426146, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5303062780919283, + "language_loss": 0.7911852, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81291974, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4316, + "time_per_iteration": 2.448997735977173 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.02333546, + "balance_loss_mlp": 1.0446136, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.687308210321376, + "language_loss": 0.77601087, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79777247, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4317, + "time_per_iteration": 2.5545339584350586 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.03160882, + "balance_loss_mlp": 1.04599953, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.5535432929686883, + "language_loss": 0.77773315, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79958701, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4318, + "time_per_iteration": 2.450573682785034 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02717471, + "balance_loss_mlp": 1.04450393, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.801084946435003, + "language_loss": 0.76197278, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78376144, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4319, + "time_per_iteration": 2.5243709087371826 + }, + { + "auxiliary_loss_clip": 0.01131874, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04500592, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8511829127720039, + "language_loss": 0.76338619, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78507876, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4320, + "time_per_iteration": 2.4792070388793945 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.02205038, + "balance_loss_mlp": 1.04641151, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.7592602092397844, + "language_loss": 0.71143925, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73317981, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4321, + "time_per_iteration": 2.5381112098693848 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.02813125, + "balance_loss_mlp": 1.04517424, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.037757848326605, + "language_loss": 0.74483943, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76666641, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4322, + "time_per_iteration": 2.4379777908325195 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.03059244, + "balance_loss_mlp": 1.04368353, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.9467125010752846, + "language_loss": 0.73674595, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75856531, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4323, + "time_per_iteration": 2.517399549484253 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.0263952, + "balance_loss_mlp": 1.04524922, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.197207179409235, + "language_loss": 0.6710211, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69287789, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 4324, + "time_per_iteration": 2.478419303894043 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.02839708, + "balance_loss_mlp": 1.04456055, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.3342631450552838, + "language_loss": 0.70809424, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72985667, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8828125, + "step": 4325, + "time_per_iteration": 2.5444648265838623 + }, + { + "auxiliary_loss_clip": 0.01133012, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02139568, + "balance_loss_mlp": 1.04295206, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 2.476658211689484, + "language_loss": 0.73041123, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7521174, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4326, + "time_per_iteration": 2.5281147956848145 + }, + { + "auxiliary_loss_clip": 0.01127256, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.02424729, + "balance_loss_mlp": 1.04237306, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.820673081097861, + "language_loss": 0.8661378, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88779688, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 4327, + "time_per_iteration": 2.4929087162017822 + }, + { + "auxiliary_loss_clip": 0.01138344, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.03121042, + "balance_loss_mlp": 1.04679346, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 2.002075266566112, + "language_loss": 0.80111909, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82299662, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 4328, + "time_per_iteration": 2.451131582260132 + }, + { + "auxiliary_loss_clip": 0.0112995, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04219353, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.514483384647774, + "language_loss": 0.87428784, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89598739, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4329, + "time_per_iteration": 2.522368907928467 + }, + { + "auxiliary_loss_clip": 0.01132983, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.04585731, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.096665977126354, + "language_loss": 0.77746803, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79917884, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4330, + "time_per_iteration": 2.4771482944488525 + }, + { + "auxiliary_loss_clip": 0.01134796, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.04525268, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 2.4595446714184654, + "language_loss": 0.75248575, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77430975, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4331, + "time_per_iteration": 2.5284199714660645 + }, + { + "auxiliary_loss_clip": 0.01137533, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02158558, + "balance_loss_mlp": 1.05026567, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3491085383994963, + "language_loss": 0.69003588, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71178281, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4332, + "time_per_iteration": 2.476125478744507 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.02453184, + "balance_loss_mlp": 1.04542089, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.3270567941112854, + "language_loss": 0.79674375, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81851673, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91015625, + "step": 4333, + "time_per_iteration": 2.5234756469726562 + }, + { + "auxiliary_loss_clip": 0.01129676, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.02791548, + "balance_loss_mlp": 1.04336357, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.7608965931322442, + "language_loss": 0.80725265, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82898307, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4334, + "time_per_iteration": 2.4361026287078857 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02128482, + "balance_loss_mlp": 1.04452491, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.8337144126432974, + "language_loss": 0.80039275, + "learning_rate": 3.46747795800024e-06, + "loss": 0.822101, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4335, + "time_per_iteration": 2.5246174335479736 + }, + { + "auxiliary_loss_clip": 0.01043695, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.02024579, + "balance_loss_mlp": 1.01431763, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.849908687169067, + "language_loss": 0.60851145, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62916911, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.29296875, + "step": 4336, + "time_per_iteration": 3.0349080562591553 + }, + { + "auxiliary_loss_clip": 0.01136323, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_clip": 1.03172541, + "balance_loss_mlp": 1.04599738, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 6.860825703537795, + "language_loss": 0.77407634, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79591858, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 4337, + "time_per_iteration": 2.4549763202667236 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02755404, + "balance_loss_mlp": 1.04531193, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1553767319060646, + "language_loss": 0.74116468, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76296723, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4338, + "time_per_iteration": 2.4109654426574707 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.02495456, + "balance_loss_mlp": 1.0451895, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.414973208379154, + "language_loss": 0.80645537, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82825273, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 4339, + "time_per_iteration": 2.4671595096588135 + }, + { + "auxiliary_loss_clip": 0.01133141, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.02287948, + "balance_loss_mlp": 1.04559159, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5844023841754464, + "language_loss": 0.76694596, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78865802, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4340, + "time_per_iteration": 2.4803388118743896 + }, + { + "auxiliary_loss_clip": 0.01137352, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02185678, + "balance_loss_mlp": 1.04666209, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.5290989424491332, + "language_loss": 0.82436979, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84612167, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90625, + "step": 4341, + "time_per_iteration": 2.5263681411743164 + }, + { + "auxiliary_loss_clip": 0.01134552, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.04563117, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.4125290221035773, + "language_loss": 0.76542389, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78716314, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4342, + "time_per_iteration": 2.5043585300445557 + }, + { + "auxiliary_loss_clip": 0.01132446, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.01570523, + "balance_loss_mlp": 1.04324019, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8018778201456855, + "language_loss": 0.66747689, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68912935, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 4343, + "time_per_iteration": 2.6470234394073486 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02467799, + "balance_loss_mlp": 1.04494977, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0339901471708646, + "language_loss": 0.73817015, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75994843, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4344, + "time_per_iteration": 5.431513071060181 + }, + { + "auxiliary_loss_clip": 0.0113578, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.02257776, + "balance_loss_mlp": 1.04692459, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 3.7636245605224072, + "language_loss": 0.86394477, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88568532, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 4345, + "time_per_iteration": 2.4908552169799805 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02422452, + "balance_loss_mlp": 1.04427588, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.88977116996907, + "language_loss": 0.7612443, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78293997, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.85546875, + "step": 4346, + "time_per_iteration": 2.4966983795166016 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02547407, + "balance_loss_mlp": 1.04483962, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5621162347417301, + "language_loss": 0.75868237, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78042835, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4347, + "time_per_iteration": 2.5392181873321533 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04549503, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4125954345922265, + "language_loss": 0.73354399, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75522006, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4348, + "time_per_iteration": 2.5206878185272217 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.02286005, + "balance_loss_mlp": 1.04503882, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.8182616406273437, + "language_loss": 0.91063923, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93238091, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4349, + "time_per_iteration": 2.526134967803955 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.02663279, + "balance_loss_mlp": 1.0461632, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7312169360414529, + "language_loss": 0.79879099, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82054067, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4350, + "time_per_iteration": 2.4420506954193115 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.02388072, + "balance_loss_mlp": 1.04430401, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8647374515536046, + "language_loss": 0.62139511, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64308536, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4351, + "time_per_iteration": 2.4613640308380127 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02241123, + "balance_loss_mlp": 1.04469466, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.09308554357217, + "language_loss": 0.83596927, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85769767, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4352, + "time_per_iteration": 2.4712979793548584 + }, + { + "auxiliary_loss_clip": 0.01045226, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 0.9986006, + "balance_loss_mlp": 1.01526213, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8010954727993301, + "language_loss": 0.70645392, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72690976, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.29882812, + "step": 4353, + "time_per_iteration": 3.026418447494507 + }, + { + "auxiliary_loss_clip": 0.01132608, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.03045464, + "balance_loss_mlp": 1.04494369, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.7700850953213416, + "language_loss": 0.77393121, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79573292, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4354, + "time_per_iteration": 2.535482883453369 + }, + { + "auxiliary_loss_clip": 0.01138552, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.02826262, + "balance_loss_mlp": 1.04513574, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.1625978203859826, + "language_loss": 0.68280292, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70463413, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 4355, + "time_per_iteration": 2.5276527404785156 + }, + { + "auxiliary_loss_clip": 0.01130838, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.02207148, + "balance_loss_mlp": 1.04375613, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9702640724114775, + "language_loss": 0.67509294, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69679523, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4356, + "time_per_iteration": 2.454436779022217 + }, + { + "auxiliary_loss_clip": 0.01043638, + "auxiliary_loss_mlp": 0.01003266, + "balance_loss_clip": 1.00139415, + "balance_loss_mlp": 1.01376009, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6781381277043278, + "language_loss": 0.53156137, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55203032, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.29882812, + "step": 4357, + "time_per_iteration": 2.99239444732666 + }, + { + "auxiliary_loss_clip": 0.01138081, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02955151, + "balance_loss_mlp": 1.04608119, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.843205511563007, + "language_loss": 0.84329486, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86513096, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.921875, + "step": 4358, + "time_per_iteration": 2.511441707611084 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02476251, + "balance_loss_mlp": 1.0450834, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.1805365254718367, + "language_loss": 0.67303276, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69484085, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4359, + "time_per_iteration": 2.5318756103515625 + }, + { + "auxiliary_loss_clip": 0.0113089, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02725601, + "balance_loss_mlp": 1.04242957, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.947910834650985, + "language_loss": 0.78673261, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80846429, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4360, + "time_per_iteration": 2.4551331996917725 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04250073, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.9921513845886445, + "language_loss": 0.68169516, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70338809, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4361, + "time_per_iteration": 2.57106351852417 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.04022598, + "balance_loss_mlp": 1.04400647, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.9312179198305752, + "language_loss": 0.84310883, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86503732, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4362, + "time_per_iteration": 2.430020570755005 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.02282345, + "balance_loss_mlp": 1.04637551, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.792780117353334, + "language_loss": 0.65294504, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67468411, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4363, + "time_per_iteration": 2.546393632888794 + }, + { + "auxiliary_loss_clip": 0.01042076, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.00252998, + "balance_loss_mlp": 1.0123173, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8867533167936222, + "language_loss": 0.61098528, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63144922, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.296875, + "step": 4364, + "time_per_iteration": 3.150812864303589 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.03358722, + "balance_loss_mlp": 1.0468297, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.424942653514092, + "language_loss": 0.71549827, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73739558, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4365, + "time_per_iteration": 2.493540048599243 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01045115, + "balance_loss_clip": 1.02917075, + "balance_loss_mlp": 1.04654169, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.8316261966241354, + "language_loss": 0.76925993, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79106045, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4366, + "time_per_iteration": 2.536853313446045 + }, + { + "auxiliary_loss_clip": 0.01135829, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.02684951, + "balance_loss_mlp": 1.04666197, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 2.2091260788228975, + "language_loss": 0.75838757, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78017008, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.890625, + "step": 4367, + "time_per_iteration": 2.4576163291931152 + }, + { + "auxiliary_loss_clip": 0.01131307, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.02454233, + "balance_loss_mlp": 1.04452682, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.1913456464974392, + "language_loss": 0.69633925, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71805596, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4368, + "time_per_iteration": 2.4301586151123047 + }, + { + "auxiliary_loss_clip": 0.01130278, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.02970243, + "balance_loss_mlp": 1.04319167, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.7035150195415922, + "language_loss": 0.78589904, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80766863, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8671875, + "step": 4369, + "time_per_iteration": 2.489316701889038 + }, + { + "auxiliary_loss_clip": 0.01132105, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.02653408, + "balance_loss_mlp": 1.04431546, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 2.0413446884893047, + "language_loss": 0.83486217, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85661036, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4370, + "time_per_iteration": 2.4422430992126465 + }, + { + "auxiliary_loss_clip": 0.01136913, + "auxiliary_loss_mlp": 0.01050187, + "balance_loss_clip": 1.03060055, + "balance_loss_mlp": 1.04530215, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 2.3340239620956287, + "language_loss": 0.70963454, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73150551, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9140625, + "step": 4371, + "time_per_iteration": 2.5099778175354004 + }, + { + "auxiliary_loss_clip": 0.01043374, + "auxiliary_loss_mlp": 0.00999769, + "balance_loss_clip": 0.99784929, + "balance_loss_mlp": 1.01338005, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7657034729714577, + "language_loss": 0.56477904, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58521044, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.30078125, + "step": 4372, + "time_per_iteration": 3.244558572769165 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.02283084, + "balance_loss_mlp": 1.04335582, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.7597219251079876, + "language_loss": 0.77415234, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79583991, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4373, + "time_per_iteration": 2.517784833908081 + }, + { + "auxiliary_loss_clip": 0.01131814, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.04454422, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 4.0873872332994905, + "language_loss": 0.71538949, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73712265, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4374, + "time_per_iteration": 2.442124605178833 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_clip": 1.02435732, + "balance_loss_mlp": 1.0458709, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 2.271567992891854, + "language_loss": 0.80945283, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83121061, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4375, + "time_per_iteration": 2.4889678955078125 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.0265336, + "balance_loss_mlp": 1.04366982, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.3689389683703, + "language_loss": 0.65721256, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.67893362, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4376, + "time_per_iteration": 2.563701629638672 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.0104592, + "balance_loss_clip": 1.02940989, + "balance_loss_mlp": 1.04445267, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.8646607453842572, + "language_loss": 0.69517326, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71697748, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4377, + "time_per_iteration": 2.486117124557495 + }, + { + "auxiliary_loss_clip": 0.01134243, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.02786613, + "balance_loss_mlp": 1.04500914, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.711844873276418, + "language_loss": 0.7866202, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.80840576, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4378, + "time_per_iteration": 2.7608227729797363 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.03000844, + "balance_loss_mlp": 1.04554546, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.6216377344963004, + "language_loss": 0.76320505, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78498781, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4379, + "time_per_iteration": 2.4329168796539307 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.04633284, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.3003567904549156, + "language_loss": 0.78237861, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.8041752, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.91796875, + "step": 4380, + "time_per_iteration": 2.5423548221588135 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02419698, + "balance_loss_mlp": 1.0444113, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.288842357619654, + "language_loss": 0.63811409, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65987766, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4381, + "time_per_iteration": 2.5096213817596436 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.02148831, + "balance_loss_mlp": 1.04359913, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8729093449566216, + "language_loss": 0.82822418, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84991652, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4382, + "time_per_iteration": 2.4691555500030518 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.02902842, + "balance_loss_mlp": 1.04550982, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.126733729537993, + "language_loss": 0.69686437, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71871686, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 4383, + "time_per_iteration": 2.5923891067504883 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.02732468, + "balance_loss_mlp": 1.04591441, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.929045699346076, + "language_loss": 0.69191134, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71369672, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 4384, + "time_per_iteration": 2.5067081451416016 + }, + { + "auxiliary_loss_clip": 0.01132133, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.02319217, + "balance_loss_mlp": 1.04400492, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.1647401570854075, + "language_loss": 0.6994158, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72113448, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4385, + "time_per_iteration": 4.062510251998901 + }, + { + "auxiliary_loss_clip": 0.01138578, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02551472, + "balance_loss_mlp": 1.04978371, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 2.0926426044309543, + "language_loss": 0.85188037, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87369245, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4386, + "time_per_iteration": 3.9604547023773193 + }, + { + "auxiliary_loss_clip": 0.0113699, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02487707, + "balance_loss_mlp": 1.04755282, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.2248904155103637, + "language_loss": 0.77169371, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79347688, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4387, + "time_per_iteration": 2.472367286682129 + }, + { + "auxiliary_loss_clip": 0.01137279, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0305903, + "balance_loss_mlp": 1.04989982, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 3.996041212149396, + "language_loss": 0.76269597, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78453362, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4388, + "time_per_iteration": 2.4858386516571045 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01924086, + "balance_loss_mlp": 1.04387724, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.9510825560869567, + "language_loss": 0.86210662, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88379163, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4389, + "time_per_iteration": 2.508162260055542 + }, + { + "auxiliary_loss_clip": 0.0104392, + "auxiliary_loss_mlp": 0.01009323, + "balance_loss_clip": 1.00736833, + "balance_loss_mlp": 1.01341343, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8096176904924934, + "language_loss": 0.60333931, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6238718, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3046875, + "step": 4390, + "time_per_iteration": 3.0593924522399902 + }, + { + "auxiliary_loss_clip": 0.01135834, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.02213633, + "balance_loss_mlp": 1.04522729, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.7836890720002585, + "language_loss": 0.77702433, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79876828, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4391, + "time_per_iteration": 2.5331051349639893 + }, + { + "auxiliary_loss_clip": 0.0104332, + "auxiliary_loss_mlp": 0.01003932, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.01322889, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9020745061185262, + "language_loss": 0.58752227, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60799479, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.30078125, + "step": 4392, + "time_per_iteration": 3.047438144683838 + }, + { + "auxiliary_loss_clip": 0.01140884, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.03039694, + "balance_loss_mlp": 1.04925656, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 2.5811541881681697, + "language_loss": 0.68459845, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70647496, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 4393, + "time_per_iteration": 2.5537288188934326 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.04662204, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.8702197697463565, + "language_loss": 0.83116519, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85297221, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.93359375, + "step": 4394, + "time_per_iteration": 2.421211004257202 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.02357125, + "balance_loss_mlp": 1.04951847, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.049654769643576, + "language_loss": 0.70211649, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72397399, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9375, + "step": 4395, + "time_per_iteration": 2.522111654281616 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.04784906, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.822626622734132, + "language_loss": 0.86866504, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89038229, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4396, + "time_per_iteration": 2.4450392723083496 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.02226114, + "balance_loss_mlp": 1.01312816, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7917805441344085, + "language_loss": 0.54999918, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57066846, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4397, + "time_per_iteration": 2.8438708782196045 + }, + { + "auxiliary_loss_clip": 0.01134821, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03567195, + "balance_loss_mlp": 1.04701614, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0493441687219724, + "language_loss": 0.77840483, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80027676, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4398, + "time_per_iteration": 2.562499523162842 + }, + { + "auxiliary_loss_clip": 0.01141073, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.02262306, + "balance_loss_mlp": 1.05005002, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 2.041566803030235, + "language_loss": 0.67037976, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69219166, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4399, + "time_per_iteration": 2.487778663635254 + }, + { + "auxiliary_loss_clip": 0.01128661, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02288818, + "balance_loss_mlp": 1.04565811, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 2.1160884119586303, + "language_loss": 0.86152196, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88318777, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4400, + "time_per_iteration": 2.4837841987609863 + }, + { + "auxiliary_loss_clip": 0.01138875, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.02188635, + "balance_loss_mlp": 1.04813862, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.751022626956878, + "language_loss": 0.75779396, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77957898, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4401, + "time_per_iteration": 2.548297166824341 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.03236771, + "balance_loss_mlp": 1.04606974, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.9215434150559794, + "language_loss": 0.88267732, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90456831, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4402, + "time_per_iteration": 2.4422647953033447 + }, + { + "auxiliary_loss_clip": 0.01135603, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.03151679, + "balance_loss_mlp": 1.04594266, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8196807161845878, + "language_loss": 0.78123331, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80306977, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4403, + "time_per_iteration": 2.587623357772827 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.04440784, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9946669841411302, + "language_loss": 0.87767446, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.89943182, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 4404, + "time_per_iteration": 2.492913246154785 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.04683399, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7395093434050468, + "language_loss": 0.7593658, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78111804, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 4405, + "time_per_iteration": 2.508970260620117 + }, + { + "auxiliary_loss_clip": 0.01138042, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02166891, + "balance_loss_mlp": 1.04870844, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9681610481113616, + "language_loss": 0.69979274, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72156149, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4406, + "time_per_iteration": 2.4548041820526123 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.03255999, + "balance_loss_mlp": 1.04781294, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7455123192469384, + "language_loss": 0.83764267, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85946929, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4407, + "time_per_iteration": 2.5359292030334473 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01566172, + "balance_loss_mlp": 1.04678226, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.7942044569518307, + "language_loss": 0.76068008, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78235412, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4408, + "time_per_iteration": 2.6124041080474854 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.04918611, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8724720588087471, + "language_loss": 0.70920485, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73091388, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4409, + "time_per_iteration": 2.6539366245269775 + }, + { + "auxiliary_loss_clip": 0.01136441, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.04666233, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.7884535623295956, + "language_loss": 0.73085511, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75258988, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 4410, + "time_per_iteration": 2.545083999633789 + }, + { + "auxiliary_loss_clip": 0.01139704, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.03218508, + "balance_loss_mlp": 1.04741001, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.9280641145018393, + "language_loss": 0.73272175, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75461018, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4411, + "time_per_iteration": 2.4818248748779297 + }, + { + "auxiliary_loss_clip": 0.01137094, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.04815316, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.073752901007566, + "language_loss": 0.82294202, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84474051, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.88671875, + "step": 4412, + "time_per_iteration": 2.56634521484375 + }, + { + "auxiliary_loss_clip": 0.01134293, + "auxiliary_loss_mlp": 0.01047936, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.04541004, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.721718037322793, + "language_loss": 0.74245501, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76427728, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4413, + "time_per_iteration": 2.4994029998779297 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.01659799, + "balance_loss_mlp": 1.0160358, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8825812455559224, + "language_loss": 0.56986731, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59051728, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.30664062, + "step": 4414, + "time_per_iteration": 2.9884986877441406 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02605712, + "balance_loss_mlp": 1.04307461, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8727128035200367, + "language_loss": 0.74535894, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76705366, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4415, + "time_per_iteration": 2.5531253814697266 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.04656732, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.3504707987247917, + "language_loss": 0.86662048, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88844568, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4416, + "time_per_iteration": 2.4751384258270264 + }, + { + "auxiliary_loss_clip": 0.0113975, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02439594, + "balance_loss_mlp": 1.0492208, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6281293305848954, + "language_loss": 0.76152384, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78334266, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4417, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01135215, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.02167702, + "balance_loss_mlp": 1.04778051, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.7397383944852411, + "language_loss": 0.79984045, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82159042, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4418, + "time_per_iteration": 2.539454460144043 + }, + { + "auxiliary_loss_clip": 0.01138688, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.02983057, + "balance_loss_mlp": 1.04861307, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.7780034581995965, + "language_loss": 0.67397833, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69583082, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 4419, + "time_per_iteration": 2.461444616317749 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_clip": 1.02739358, + "balance_loss_mlp": 1.04920876, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.097903587873874, + "language_loss": 0.79365611, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81550193, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8984375, + "step": 4420, + "time_per_iteration": 2.5908427238464355 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01049212, + "balance_loss_clip": 1.02990031, + "balance_loss_mlp": 1.0493983, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.1223383047232933, + "language_loss": 0.81612432, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83803296, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.921875, + "step": 4421, + "time_per_iteration": 2.4869320392608643 + }, + { + "auxiliary_loss_clip": 0.01134642, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02027202, + "balance_loss_mlp": 1.04734015, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5724937400793966, + "language_loss": 0.65278006, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67449689, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4422, + "time_per_iteration": 2.7370638847351074 + }, + { + "auxiliary_loss_clip": 0.01138513, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02965808, + "balance_loss_mlp": 1.04750621, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.411979213410041, + "language_loss": 0.73841226, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76025832, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 4423, + "time_per_iteration": 2.5510191917419434 + }, + { + "auxiliary_loss_clip": 0.01136367, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.03017163, + "balance_loss_mlp": 1.04504442, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6317340067044743, + "language_loss": 0.77703154, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79886127, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4424, + "time_per_iteration": 2.809495449066162 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.02675951, + "balance_loss_mlp": 1.04695249, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.064218808714238, + "language_loss": 0.79345673, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 4425, + "time_per_iteration": 2.48149037361145 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_clip": 1.03138816, + "balance_loss_mlp": 1.04685736, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.774406296589384, + "language_loss": 0.80463314, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82643557, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4426, + "time_per_iteration": 2.5968613624572754 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.03180957, + "balance_loss_mlp": 1.04982209, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.8207507571493768, + "language_loss": 0.77337295, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79524601, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4427, + "time_per_iteration": 4.045380353927612 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01760316, + "balance_loss_mlp": 1.04737306, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.8400253790543033, + "language_loss": 0.76800078, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78966737, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4428, + "time_per_iteration": 4.018831491470337 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.01858354, + "balance_loss_mlp": 1.04529297, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.9075878866801723, + "language_loss": 0.83010298, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8517977, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4429, + "time_per_iteration": 2.576535940170288 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.02644563, + "balance_loss_mlp": 1.04664719, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 3.2197583620662082, + "language_loss": 0.72143924, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74320537, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87109375, + "step": 4430, + "time_per_iteration": 2.5262365341186523 + }, + { + "auxiliary_loss_clip": 0.01136153, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.04667306, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.382555523964676, + "language_loss": 0.81635833, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83814788, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4431, + "time_per_iteration": 2.5135624408721924 + }, + { + "auxiliary_loss_clip": 0.01142285, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03360736, + "balance_loss_mlp": 1.04865289, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.34486467491615, + "language_loss": 0.76153386, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78346616, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 4432, + "time_per_iteration": 2.469515562057495 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.02711606, + "balance_loss_mlp": 1.04703665, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.148919041496035, + "language_loss": 0.82521772, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84703225, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4433, + "time_per_iteration": 2.540174961090088 + }, + { + "auxiliary_loss_clip": 0.01138849, + "auxiliary_loss_mlp": 0.01048222, + "balance_loss_clip": 1.03065097, + "balance_loss_mlp": 1.04955435, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.091984027516481, + "language_loss": 0.76638913, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78825986, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4434, + "time_per_iteration": 2.549769878387451 + }, + { + "auxiliary_loss_clip": 0.01133542, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.02770376, + "balance_loss_mlp": 1.04645348, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.251252650424801, + "language_loss": 0.82632279, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84808934, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4435, + "time_per_iteration": 2.5329744815826416 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.0105698, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.04742312, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.2611652281579397, + "language_loss": 0.87278962, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89476645, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9296875, + "step": 4436, + "time_per_iteration": 2.5375254154205322 + }, + { + "auxiliary_loss_clip": 0.01136328, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.02670658, + "balance_loss_mlp": 1.04566383, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4304916595737875, + "language_loss": 0.78941, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81120378, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4437, + "time_per_iteration": 2.591017007827759 + }, + { + "auxiliary_loss_clip": 0.01134502, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.02533066, + "balance_loss_mlp": 1.04595256, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.0731379310987412, + "language_loss": 0.63412011, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65588087, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4438, + "time_per_iteration": 2.6429452896118164 + }, + { + "auxiliary_loss_clip": 0.01137556, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.02125144, + "balance_loss_mlp": 1.04869223, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8133794638407341, + "language_loss": 0.75628942, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77803481, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4439, + "time_per_iteration": 2.5296032428741455 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02052629, + "balance_loss_mlp": 1.04913759, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.7792140134846064, + "language_loss": 0.71444011, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.7362318, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9140625, + "step": 4440, + "time_per_iteration": 2.5714335441589355 + }, + { + "auxiliary_loss_clip": 0.01139576, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.02757502, + "balance_loss_mlp": 1.04816949, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.8363906583736056, + "language_loss": 0.66291904, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 4441, + "time_per_iteration": 2.522589683532715 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.02654862, + "balance_loss_mlp": 1.04803538, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5597318548365904, + "language_loss": 0.76451373, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78633213, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.89453125, + "step": 4442, + "time_per_iteration": 2.5659492015838623 + }, + { + "auxiliary_loss_clip": 0.01060214, + "auxiliary_loss_mlp": 0.0100059, + "balance_loss_clip": 0.99855101, + "balance_loss_mlp": 1.02895594, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.912864167592289, + "language_loss": 0.61270142, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63330936, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.3125, + "step": 4443, + "time_per_iteration": 3.0256776809692383 + }, + { + "auxiliary_loss_clip": 0.01140806, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01968026, + "balance_loss_mlp": 1.0495882, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.5525166591100914, + "language_loss": 0.76200545, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78377306, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91015625, + "step": 4444, + "time_per_iteration": 2.7414674758911133 + }, + { + "auxiliary_loss_clip": 0.0114013, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.04932773, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 3.16165776963455, + "language_loss": 0.80212528, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82393491, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4445, + "time_per_iteration": 2.5349111557006836 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.02232134, + "balance_loss_mlp": 1.04797101, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.3952290716593825, + "language_loss": 0.89144397, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 4446, + "time_per_iteration": 2.5512521266937256 + }, + { + "auxiliary_loss_clip": 0.01140462, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.0311892, + "balance_loss_mlp": 1.04977763, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.831363923725005, + "language_loss": 0.68259656, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70447719, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4447, + "time_per_iteration": 2.5752837657928467 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.04972827, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9862084341014827, + "language_loss": 0.82976532, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85157394, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4448, + "time_per_iteration": 2.6524059772491455 + }, + { + "auxiliary_loss_clip": 0.01137667, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.03110301, + "balance_loss_mlp": 1.04973495, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 2.185461436072074, + "language_loss": 0.84288895, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86475068, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87890625, + "step": 4449, + "time_per_iteration": 2.5167598724365234 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02892506, + "balance_loss_mlp": 1.05114913, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.9936425417360089, + "language_loss": 0.84260273, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86456501, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.97265625, + "step": 4450, + "time_per_iteration": 2.555941343307495 + }, + { + "auxiliary_loss_clip": 0.01133946, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.04674196, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.900524277018137, + "language_loss": 0.81065774, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83240664, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4451, + "time_per_iteration": 2.5289859771728516 + }, + { + "auxiliary_loss_clip": 0.01140947, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.02593148, + "balance_loss_mlp": 1.05186319, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8040621200757803, + "language_loss": 0.86401796, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88584578, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4452, + "time_per_iteration": 2.617918014526367 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.02055311, + "balance_loss_mlp": 1.05132198, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.9731948573099198, + "language_loss": 0.83129871, + "learning_rate": 3.435869031622194e-06, + "loss": 0.8531099, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4453, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.0113897, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.02936745, + "balance_loss_mlp": 1.04995108, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.62656613015929, + "language_loss": 0.79744816, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81930768, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4454, + "time_per_iteration": 2.537853717803955 + }, + { + "auxiliary_loss_clip": 0.01141821, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04989707, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.7640316216704761, + "language_loss": 0.7215519, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74339664, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4455, + "time_per_iteration": 2.5023562908172607 + }, + { + "auxiliary_loss_clip": 0.01137457, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02406991, + "balance_loss_mlp": 1.05066276, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5496021720121687, + "language_loss": 0.74044335, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76221603, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4456, + "time_per_iteration": 2.487581729888916 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.02121687, + "balance_loss_mlp": 1.04937947, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.2089309948453697, + "language_loss": 0.70965469, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73145425, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4457, + "time_per_iteration": 2.4584691524505615 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.05237103, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.29797460876898, + "language_loss": 0.79029202, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81216174, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 4458, + "time_per_iteration": 2.6079578399658203 + }, + { + "auxiliary_loss_clip": 0.01052787, + "auxiliary_loss_mlp": 0.01006207, + "balance_loss_clip": 1.00439513, + "balance_loss_mlp": 1.02259135, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8640508796264214, + "language_loss": 0.58716619, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60775614, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.30078125, + "step": 4459, + "time_per_iteration": 3.0725412368774414 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.02444053, + "balance_loss_mlp": 1.04671741, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.0778557825519055, + "language_loss": 0.85224575, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87398744, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4460, + "time_per_iteration": 2.483299732208252 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.02653205, + "balance_loss_mlp": 1.04752469, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.805871571962145, + "language_loss": 0.68256581, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70435691, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 4461, + "time_per_iteration": 2.439304828643799 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_clip": 1.03198409, + "balance_loss_mlp": 1.0470686, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5557483279788171, + "language_loss": 0.67342007, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69526774, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4462, + "time_per_iteration": 2.5081140995025635 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.02413619, + "balance_loss_mlp": 1.04865909, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.8707784514564991, + "language_loss": 0.6927141, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71449935, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4463, + "time_per_iteration": 2.5280556678771973 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02414584, + "balance_loss_mlp": 1.04812574, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4976114648735304, + "language_loss": 0.77389008, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79570508, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4464, + "time_per_iteration": 2.469650983810425 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.02215123, + "balance_loss_mlp": 1.0458076, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.844577670487785, + "language_loss": 0.70796561, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72970498, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4465, + "time_per_iteration": 2.5685060024261475 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.0100238, + "balance_loss_clip": 1.00067484, + "balance_loss_mlp": 1.01956284, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6800540965400289, + "language_loss": 0.53096056, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55148005, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.30078125, + "step": 4466, + "time_per_iteration": 3.2327654361724854 + }, + { + "auxiliary_loss_clip": 0.01133624, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.02219653, + "balance_loss_mlp": 1.04600596, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0764143418179213, + "language_loss": 0.7343837, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75611472, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4467, + "time_per_iteration": 2.5052013397216797 + }, + { + "auxiliary_loss_clip": 0.01138792, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.0278548, + "balance_loss_mlp": 1.04801464, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.5834152956256555, + "language_loss": 0.80703115, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82887346, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4468, + "time_per_iteration": 2.4547622203826904 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.01005617, + "balance_loss_clip": 1.00407946, + "balance_loss_mlp": 1.01768315, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8449159500606429, + "language_loss": 0.59532088, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61585438, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.30078125, + "step": 4469, + "time_per_iteration": 4.6310715675354 + }, + { + "auxiliary_loss_clip": 0.01137988, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.02687383, + "balance_loss_mlp": 1.04844749, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.3316897890333954, + "language_loss": 0.81785607, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83968771, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4470, + "time_per_iteration": 2.5501935482025146 + }, + { + "auxiliary_loss_clip": 0.01129268, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.02091098, + "balance_loss_mlp": 1.04484963, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6194658793917844, + "language_loss": 0.82648492, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84815365, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 4471, + "time_per_iteration": 2.559220552444458 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02042806, + "balance_loss_mlp": 1.04853129, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 8.458966217412893, + "language_loss": 0.69382554, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71553975, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 4472, + "time_per_iteration": 2.561326742172241 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02045035, + "balance_loss_mlp": 1.04783702, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.596928542569954, + "language_loss": 0.67870784, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70042771, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4473, + "time_per_iteration": 2.5437636375427246 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.02844238, + "balance_loss_mlp": 1.04768729, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.8504576821316179, + "language_loss": 0.82971931, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85149777, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4474, + "time_per_iteration": 2.474095582962036 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01046818, + "balance_loss_clip": 1.03042698, + "balance_loss_mlp": 1.04697323, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.0689967373005977, + "language_loss": 0.70303237, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72482622, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.85546875, + "step": 4475, + "time_per_iteration": 2.4865996837615967 + }, + { + "auxiliary_loss_clip": 0.01135068, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.02335167, + "balance_loss_mlp": 1.04614162, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7721029234489851, + "language_loss": 0.73711979, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75887156, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.890625, + "step": 4476, + "time_per_iteration": 2.477308988571167 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.02456927, + "balance_loss_mlp": 1.04561102, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.720914514753409, + "language_loss": 0.80110955, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8228178, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4477, + "time_per_iteration": 2.497809648513794 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.04442573, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.9038830637231319, + "language_loss": 0.64580482, + "learning_rate": 3.429074332770984e-06, + "loss": 0.66756433, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4478, + "time_per_iteration": 2.6485564708709717 + }, + { + "auxiliary_loss_clip": 0.01130767, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.02876592, + "balance_loss_mlp": 1.04380882, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8571100614964546, + "language_loss": 0.80653036, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82828909, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4479, + "time_per_iteration": 2.4851014614105225 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01043964, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04611528, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.4630797167742458, + "language_loss": 0.80834484, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83014214, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4480, + "time_per_iteration": 2.490147590637207 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.02066684, + "balance_loss_mlp": 1.04153395, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.7677898796301312, + "language_loss": 0.77612787, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79773796, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 4481, + "time_per_iteration": 2.4699158668518066 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01044694, + "balance_loss_clip": 1.02737296, + "balance_loss_mlp": 1.04591584, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 2.5981026313468525, + "language_loss": 0.74701524, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76880491, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4482, + "time_per_iteration": 2.556087017059326 + }, + { + "auxiliary_loss_clip": 0.01135034, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02198792, + "balance_loss_mlp": 1.04693186, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.852738059166697, + "language_loss": 0.72176206, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74350333, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4483, + "time_per_iteration": 2.4762344360351562 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.02635717, + "balance_loss_mlp": 1.04290676, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.626283812761087, + "language_loss": 0.87107188, + "learning_rate": 3.427438559239605e-06, + "loss": 0.8928411, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4484, + "time_per_iteration": 2.486185073852539 + }, + { + "auxiliary_loss_clip": 0.01131969, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02447212, + "balance_loss_mlp": 1.04373026, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.901905407661022, + "language_loss": 0.66389644, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68561947, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4485, + "time_per_iteration": 2.5674586296081543 + }, + { + "auxiliary_loss_clip": 0.01133447, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02371132, + "balance_loss_mlp": 1.0445261, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.8933932068842783, + "language_loss": 0.72378826, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74552536, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4486, + "time_per_iteration": 2.471036434173584 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.02257311, + "balance_loss_mlp": 1.04809284, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.8546648123058087, + "language_loss": 0.83810318, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.85986561, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 4487, + "time_per_iteration": 2.4867916107177734 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02749646, + "balance_loss_mlp": 1.0477773, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.2079504028023598, + "language_loss": 0.71220767, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73403245, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4488, + "time_per_iteration": 2.5174567699432373 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.02851868, + "balance_loss_mlp": 1.04792523, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6338784898376273, + "language_loss": 0.83736706, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85919023, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4489, + "time_per_iteration": 2.5314295291900635 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_clip": 1.03696203, + "balance_loss_mlp": 1.04693484, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.5551945574509176, + "language_loss": 0.89805245, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.91996753, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4490, + "time_per_iteration": 2.4975826740264893 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02741122, + "balance_loss_mlp": 1.04349554, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.8455290723250308, + "language_loss": 0.73354411, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75525427, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4491, + "time_per_iteration": 2.6303470134735107 + }, + { + "auxiliary_loss_clip": 0.01138617, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.02613568, + "balance_loss_mlp": 1.04974079, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 3.089516252272487, + "language_loss": 0.74379975, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7656163, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4492, + "time_per_iteration": 2.5124619007110596 + }, + { + "auxiliary_loss_clip": 0.01133231, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_clip": 1.0241406, + "balance_loss_mlp": 1.04671812, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.896651323252439, + "language_loss": 0.88740528, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.90913987, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4493, + "time_per_iteration": 2.480473756790161 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.02564538, + "balance_loss_mlp": 1.04676843, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.468971775969503, + "language_loss": 0.70976114, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73151839, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4494, + "time_per_iteration": 2.5703446865081787 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01920152, + "balance_loss_mlp": 1.04545951, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.0322990364449325, + "language_loss": 0.86294192, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88457918, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4495, + "time_per_iteration": 2.5428457260131836 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.02968764, + "balance_loss_mlp": 1.04731214, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.8698467905293557, + "language_loss": 0.76562083, + "learning_rate": 3.424161168522959e-06, + "loss": 0.7874167, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4496, + "time_per_iteration": 2.5074446201324463 + }, + { + "auxiliary_loss_clip": 0.01048323, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.01042128, + "balance_loss_mlp": 1.01925802, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7221920911850954, + "language_loss": 0.50221699, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52282125, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2890625, + "step": 4497, + "time_per_iteration": 3.110724687576294 + }, + { + "auxiliary_loss_clip": 0.01137008, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.03011322, + "balance_loss_mlp": 1.05020094, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.6519561002314052, + "language_loss": 0.72420043, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74602675, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4498, + "time_per_iteration": 2.522507429122925 + }, + { + "auxiliary_loss_clip": 0.01047265, + "auxiliary_loss_mlp": 0.0100549, + "balance_loss_clip": 1.0038569, + "balance_loss_mlp": 1.0182879, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7584910907853958, + "language_loss": 0.59222841, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61275595, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2890625, + "step": 4499, + "time_per_iteration": 3.1193060874938965 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02085209, + "balance_loss_mlp": 1.04637063, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.0468109740969576, + "language_loss": 0.7361812, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75787735, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4500, + "time_per_iteration": 2.5073533058166504 + }, + { + "auxiliary_loss_clip": 0.01130893, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04379177, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.2528800155878765, + "language_loss": 0.80392325, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82567519, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4501, + "time_per_iteration": 2.4665989875793457 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0278666, + "balance_loss_mlp": 1.04683352, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.9148884605164396, + "language_loss": 0.72832727, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75011796, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4502, + "time_per_iteration": 2.511070489883423 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0209558, + "balance_loss_mlp": 1.04282784, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.724044037192685, + "language_loss": 0.68474984, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70647895, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 4503, + "time_per_iteration": 2.6554527282714844 + }, + { + "auxiliary_loss_clip": 0.01133759, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.02425468, + "balance_loss_mlp": 1.04659927, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.0245220791315655, + "language_loss": 0.68488902, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.7066294, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4504, + "time_per_iteration": 2.4813036918640137 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.02811766, + "balance_loss_mlp": 1.05043292, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.7616188880043606, + "language_loss": 0.75553012, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77731931, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4505, + "time_per_iteration": 2.482228994369507 + }, + { + "auxiliary_loss_clip": 0.01138199, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.03271127, + "balance_loss_mlp": 1.047171, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 1.8888030992954683, + "language_loss": 0.73508286, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4506, + "time_per_iteration": 2.493534803390503 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.02390218, + "balance_loss_mlp": 1.04818904, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.012438120988393, + "language_loss": 0.80958861, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83136857, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4507, + "time_per_iteration": 2.488477945327759 + }, + { + "auxiliary_loss_clip": 0.01046128, + "auxiliary_loss_mlp": 0.01011944, + "balance_loss_clip": 1.0102514, + "balance_loss_mlp": 1.01738429, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7384209784394716, + "language_loss": 0.50892401, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52950472, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.28710938, + "step": 4508, + "time_per_iteration": 3.005894660949707 + }, + { + "auxiliary_loss_clip": 0.01129132, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.02413416, + "balance_loss_mlp": 1.04509401, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 4.914093534195162, + "language_loss": 0.74373507, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76542306, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4509, + "time_per_iteration": 2.555645227432251 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.02418542, + "balance_loss_mlp": 1.04368544, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.7859895301291084, + "language_loss": 0.71706283, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73872381, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4510, + "time_per_iteration": 2.469756841659546 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01921451, + "balance_loss_mlp": 1.04728365, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 4.171230322312489, + "language_loss": 0.70698422, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72866517, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 4511, + "time_per_iteration": 3.9261832237243652 + }, + { + "auxiliary_loss_clip": 0.01133865, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.02660656, + "balance_loss_mlp": 1.04600286, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.0859148079323564, + "language_loss": 0.80823237, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83000243, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4512, + "time_per_iteration": 2.5112404823303223 + }, + { + "auxiliary_loss_clip": 0.01128916, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02108788, + "balance_loss_mlp": 1.04543233, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.880665339674376, + "language_loss": 0.80508482, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82672697, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8359375, + "step": 4513, + "time_per_iteration": 2.5550525188446045 + }, + { + "auxiliary_loss_clip": 0.01132709, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02668297, + "balance_loss_mlp": 1.04505134, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8883190176483522, + "language_loss": 0.88062817, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90237576, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4514, + "time_per_iteration": 2.4411823749542236 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.02475166, + "balance_loss_mlp": 1.04799736, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 2.468440108941068, + "language_loss": 0.92064375, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94239157, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4515, + "time_per_iteration": 2.507073402404785 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.03202391, + "balance_loss_mlp": 1.04952395, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.5869205534481017, + "language_loss": 0.73691195, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75882661, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9140625, + "step": 4516, + "time_per_iteration": 2.4427852630615234 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.02352417, + "balance_loss_mlp": 1.0466857, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 6.588152355110397, + "language_loss": 0.76239699, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78414017, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4517, + "time_per_iteration": 2.4891836643218994 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02201402, + "balance_loss_mlp": 1.0473218, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2012309941627066, + "language_loss": 0.76785064, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78957808, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4518, + "time_per_iteration": 2.503117561340332 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0241586, + "balance_loss_mlp": 1.04699707, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.6415373198141725, + "language_loss": 0.68314338, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7048738, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4519, + "time_per_iteration": 2.573230028152466 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04631245, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.6734918677628685, + "language_loss": 0.755759, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7774297, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4520, + "time_per_iteration": 2.535546064376831 + }, + { + "auxiliary_loss_clip": 0.01138716, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.02628946, + "balance_loss_mlp": 1.0501039, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.5201661256644523, + "language_loss": 0.76219606, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78402621, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4521, + "time_per_iteration": 2.491654396057129 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_clip": 1.03102481, + "balance_loss_mlp": 1.04803133, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.3970894391693967, + "language_loss": 0.75911158, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78095901, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4522, + "time_per_iteration": 2.471673011779785 + }, + { + "auxiliary_loss_clip": 0.01137707, + "auxiliary_loss_mlp": 0.0103702, + "balance_loss_clip": 1.02042627, + "balance_loss_mlp": 1.04873872, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 5.0666434109354075, + "language_loss": 0.72895801, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75070536, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4523, + "time_per_iteration": 2.5152363777160645 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02208352, + "balance_loss_mlp": 1.04448104, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.5338044020439772, + "language_loss": 0.74324989, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76492846, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 4524, + "time_per_iteration": 2.495253562927246 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02380621, + "balance_loss_mlp": 1.04772878, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 2.881398237919427, + "language_loss": 0.76651889, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78826964, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4525, + "time_per_iteration": 2.511634111404419 + }, + { + "auxiliary_loss_clip": 0.01131035, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_clip": 1.0334518, + "balance_loss_mlp": 1.04626358, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8599028556429251, + "language_loss": 0.81914634, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84094906, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4526, + "time_per_iteration": 2.495011568069458 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02770483, + "balance_loss_mlp": 1.0466783, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 3.313629745591453, + "language_loss": 0.77007318, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79190063, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4527, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.04637635, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1845797146290784, + "language_loss": 0.81825048, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84000921, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4528, + "time_per_iteration": 2.469916582107544 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.0273608, + "balance_loss_mlp": 1.04669189, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.6672454466706952, + "language_loss": 0.77123594, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79297841, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4529, + "time_per_iteration": 2.5379140377044678 + }, + { + "auxiliary_loss_clip": 0.01133862, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_clip": 1.02900243, + "balance_loss_mlp": 1.04580855, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.4153957329893228, + "language_loss": 0.8195889, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84136933, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4530, + "time_per_iteration": 2.5363659858703613 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.02010226, + "balance_loss_mlp": 1.04630172, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.1797176655983432, + "language_loss": 0.91650689, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93820047, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4531, + "time_per_iteration": 2.508429765701294 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.03159511, + "balance_loss_mlp": 1.04611766, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.532443443519077, + "language_loss": 0.76107466, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78290069, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.88671875, + "step": 4532, + "time_per_iteration": 2.499457359313965 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.01861846, + "balance_loss_mlp": 1.04643464, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 3.1928401528407746, + "language_loss": 0.89197671, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91362166, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4533, + "time_per_iteration": 2.508202075958252 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.02118278, + "balance_loss_mlp": 1.04587626, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.096334750916122, + "language_loss": 0.7125262, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73419642, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4534, + "time_per_iteration": 2.5111024379730225 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04651427, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.9613498766130548, + "language_loss": 0.91064882, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93239939, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4535, + "time_per_iteration": 2.5509371757507324 + }, + { + "auxiliary_loss_clip": 0.01138846, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.02262712, + "balance_loss_mlp": 1.05108571, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.5906078149456282, + "language_loss": 0.72618866, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74796963, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4536, + "time_per_iteration": 2.5106241703033447 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.02302337, + "balance_loss_mlp": 1.04617631, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.839444357786457, + "language_loss": 0.7144469, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73617887, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4537, + "time_per_iteration": 2.588439464569092 + }, + { + "auxiliary_loss_clip": 0.01132537, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.02363503, + "balance_loss_mlp": 1.04501796, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.431092364938405, + "language_loss": 0.78177559, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80350113, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4538, + "time_per_iteration": 2.438603639602661 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02609527, + "balance_loss_mlp": 1.04698634, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4794812227008705, + "language_loss": 0.90038705, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92214489, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4539, + "time_per_iteration": 2.5052709579467773 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_clip": 1.02414095, + "balance_loss_mlp": 1.04627967, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.0712338481270884, + "language_loss": 0.88711655, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90885842, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.859375, + "step": 4540, + "time_per_iteration": 2.457939624786377 + }, + { + "auxiliary_loss_clip": 0.01133918, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01771343, + "balance_loss_mlp": 1.04666936, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9363402300433894, + "language_loss": 0.81993663, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84161294, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4541, + "time_per_iteration": 2.461517333984375 + }, + { + "auxiliary_loss_clip": 0.01133224, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.02484596, + "balance_loss_mlp": 1.04623377, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8882731025231656, + "language_loss": 0.7925449, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81429487, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4542, + "time_per_iteration": 2.487905979156494 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.02441418, + "balance_loss_mlp": 1.04965162, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.197105758262293, + "language_loss": 0.89471424, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91648328, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4543, + "time_per_iteration": 2.4903039932250977 + }, + { + "auxiliary_loss_clip": 0.01137887, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.02955735, + "balance_loss_mlp": 1.04841042, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.084938235366164, + "language_loss": 0.63666493, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65851355, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.89453125, + "step": 4544, + "time_per_iteration": 2.4529080390930176 + }, + { + "auxiliary_loss_clip": 0.01137894, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.03043687, + "balance_loss_mlp": 1.05032265, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.5170655618085727, + "language_loss": 0.6996637, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72151983, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4545, + "time_per_iteration": 2.6089117527008057 + }, + { + "auxiliary_loss_clip": 0.01048793, + "auxiliary_loss_mlp": 0.01019944, + "balance_loss_clip": 1.01828671, + "balance_loss_mlp": 1.01938868, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7273987605446792, + "language_loss": 0.61571473, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63640207, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.29296875, + "step": 4546, + "time_per_iteration": 3.1125431060791016 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.03199649, + "balance_loss_mlp": 1.05012798, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.9369682323358774, + "language_loss": 0.64982706, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67167711, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4547, + "time_per_iteration": 2.497563600540161 + }, + { + "auxiliary_loss_clip": 0.01132998, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.0262835, + "balance_loss_mlp": 1.04765081, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.2377196076559183, + "language_loss": 0.77178854, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.7935344, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4548, + "time_per_iteration": 2.536813259124756 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.02854848, + "balance_loss_mlp": 1.04827595, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8894391736419274, + "language_loss": 0.82382214, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84559321, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 4549, + "time_per_iteration": 2.5156633853912354 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.02744722, + "balance_loss_mlp": 1.04482448, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.2209993145005793, + "language_loss": 0.70675868, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.72853404, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4550, + "time_per_iteration": 2.4510462284088135 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.03272784, + "balance_loss_mlp": 1.04789186, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.43111621366583, + "language_loss": 0.78738058, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80917984, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8515625, + "step": 4551, + "time_per_iteration": 2.470520496368408 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01051474, + "balance_loss_clip": 1.03548765, + "balance_loss_mlp": 1.04601097, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.681171335598487, + "language_loss": 0.70585275, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72769368, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4552, + "time_per_iteration": 3.9179859161376953 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02936506, + "balance_loss_mlp": 1.04864776, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.3865688662341005, + "language_loss": 0.71857619, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7403903, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 4553, + "time_per_iteration": 4.032766342163086 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.02853942, + "balance_loss_mlp": 1.04585433, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5870570208244068, + "language_loss": 0.59154749, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61331522, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4554, + "time_per_iteration": 2.549534320831299 + }, + { + "auxiliary_loss_clip": 0.01138763, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.04893517, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7727518382715788, + "language_loss": 0.73820007, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76000404, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4555, + "time_per_iteration": 2.5162432193756104 + }, + { + "auxiliary_loss_clip": 0.01136837, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02348125, + "balance_loss_mlp": 1.04923606, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.956724452661134, + "language_loss": 0.7785511, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80031419, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4556, + "time_per_iteration": 2.5205135345458984 + }, + { + "auxiliary_loss_clip": 0.01145391, + "auxiliary_loss_mlp": 0.0105386, + "balance_loss_clip": 1.03640783, + "balance_loss_mlp": 1.04952264, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.7956202604517526, + "language_loss": 0.82272434, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84471685, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9609375, + "step": 4557, + "time_per_iteration": 2.486485719680786 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.02691972, + "balance_loss_mlp": 1.04657316, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7971714372597054, + "language_loss": 0.72697943, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74873614, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4558, + "time_per_iteration": 2.5272727012634277 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.04504418, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.1318143008079686, + "language_loss": 0.6804775, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70228577, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4559, + "time_per_iteration": 2.4787509441375732 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02750015, + "balance_loss_mlp": 1.04517901, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 3.5500966853689673, + "language_loss": 0.71847737, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74022651, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4560, + "time_per_iteration": 2.490152359008789 + }, + { + "auxiliary_loss_clip": 0.0113572, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.02642488, + "balance_loss_mlp": 1.04779601, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.7948619898284635, + "language_loss": 0.80998009, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83175689, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 4561, + "time_per_iteration": 2.554872512817383 + }, + { + "auxiliary_loss_clip": 0.01136406, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.02600157, + "balance_loss_mlp": 1.04711854, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.7370289005889625, + "language_loss": 0.7531321, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77491164, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.890625, + "step": 4562, + "time_per_iteration": 2.4925429821014404 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02173424, + "balance_loss_mlp": 1.04701662, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.598166418515773, + "language_loss": 0.74503827, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76674795, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4563, + "time_per_iteration": 2.5514259338378906 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.02798915, + "balance_loss_mlp": 1.04708612, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.8271759108968861, + "language_loss": 0.62526429, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64710456, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4564, + "time_per_iteration": 2.479156494140625 + }, + { + "auxiliary_loss_clip": 0.01136574, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.02513587, + "balance_loss_mlp": 1.04808652, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.9245884320117708, + "language_loss": 0.78135669, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80314934, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4565, + "time_per_iteration": 2.714069366455078 + }, + { + "auxiliary_loss_clip": 0.01133378, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04669619, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.3377831889988547, + "language_loss": 0.68350124, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70523381, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4566, + "time_per_iteration": 2.469357967376709 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.03211665, + "balance_loss_mlp": 1.04901338, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.7938914020631171, + "language_loss": 0.60886472, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63066101, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.84375, + "step": 4567, + "time_per_iteration": 2.5856754779815674 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.02597237, + "balance_loss_mlp": 1.04754972, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.7650663548751138, + "language_loss": 0.82787997, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84965092, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.86328125, + "step": 4568, + "time_per_iteration": 2.476353168487549 + }, + { + "auxiliary_loss_clip": 0.0113839, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.01868141, + "balance_loss_mlp": 1.05012584, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.0155686346894415, + "language_loss": 0.68656778, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7082985, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4569, + "time_per_iteration": 2.5027451515197754 + }, + { + "auxiliary_loss_clip": 0.01133852, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.01947594, + "balance_loss_mlp": 1.0464673, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.247407128453888, + "language_loss": 0.71138883, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73308867, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4570, + "time_per_iteration": 2.466845750808716 + }, + { + "auxiliary_loss_clip": 0.0104735, + "auxiliary_loss_mlp": 0.01010434, + "balance_loss_clip": 1.00881279, + "balance_loss_mlp": 1.01781416, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7344992896847644, + "language_loss": 0.55774754, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57832539, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.296875, + "step": 4571, + "time_per_iteration": 3.192523241043091 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.02805328, + "balance_loss_mlp": 1.05039406, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 3.6883594473706482, + "language_loss": 0.77785081, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79969662, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 4572, + "time_per_iteration": 2.4755914211273193 + }, + { + "auxiliary_loss_clip": 0.01129408, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.02200866, + "balance_loss_mlp": 1.04679561, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.7042315716847805, + "language_loss": 0.81357443, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83523262, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4573, + "time_per_iteration": 2.540905237197876 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02038062, + "balance_loss_mlp": 1.04580402, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7927939239771835, + "language_loss": 0.79077196, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81243324, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83984375, + "step": 4574, + "time_per_iteration": 2.451016664505005 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.02889121, + "balance_loss_mlp": 1.04886127, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.232643844604772, + "language_loss": 0.74191976, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76372731, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4575, + "time_per_iteration": 2.5744149684906006 + }, + { + "auxiliary_loss_clip": 0.01131901, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.02353263, + "balance_loss_mlp": 1.04711711, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.8105072672356382, + "language_loss": 0.71877766, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7404812, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4576, + "time_per_iteration": 2.634305715560913 + }, + { + "auxiliary_loss_clip": 0.01134924, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.04823232, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.7690392048384511, + "language_loss": 0.73200434, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75377214, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4577, + "time_per_iteration": 2.5365946292877197 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02182376, + "balance_loss_mlp": 1.04931974, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 5.099060573221768, + "language_loss": 0.75943893, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78119946, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4578, + "time_per_iteration": 2.5121536254882812 + }, + { + "auxiliary_loss_clip": 0.01135832, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 1.0475626, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 2.3614458833507603, + "language_loss": 0.66299897, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68482184, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.8828125, + "step": 4579, + "time_per_iteration": 2.5445947647094727 + }, + { + "auxiliary_loss_clip": 0.01137742, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_clip": 1.03841197, + "balance_loss_mlp": 1.04862928, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.9384727438162337, + "language_loss": 0.8013078, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82324862, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4580, + "time_per_iteration": 2.4895741939544678 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.02581632, + "balance_loss_mlp": 1.05140579, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4702192551629332, + "language_loss": 0.67702103, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.698852, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.87109375, + "step": 4581, + "time_per_iteration": 2.5905539989471436 + }, + { + "auxiliary_loss_clip": 0.01137135, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.03024602, + "balance_loss_mlp": 1.04847145, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.8568978026073784, + "language_loss": 0.78120708, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80303848, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.88671875, + "step": 4582, + "time_per_iteration": 2.467210531234741 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.02537727, + "balance_loss_mlp": 1.04905152, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 2.5358708072067406, + "language_loss": 0.84527528, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86701977, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4583, + "time_per_iteration": 2.511457920074463 + }, + { + "auxiliary_loss_clip": 0.01138165, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.04905808, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 2.037294788318467, + "language_loss": 0.67308438, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69487947, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 4584, + "time_per_iteration": 2.5193254947662354 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.02645802, + "balance_loss_mlp": 1.04761386, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.737999785464117, + "language_loss": 0.77330101, + "learning_rate": 3.399612333050327e-06, + "loss": 0.7950455, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4585, + "time_per_iteration": 2.5393707752227783 + }, + { + "auxiliary_loss_clip": 0.0114213, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.02227354, + "balance_loss_mlp": 1.0530591, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.654604836009794, + "language_loss": 0.71854031, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74035466, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4586, + "time_per_iteration": 2.534979820251465 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.01996541, + "balance_loss_mlp": 1.04988265, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5248017982775213, + "language_loss": 0.80546939, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82719147, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4587, + "time_per_iteration": 2.5424065589904785 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.02356219, + "balance_loss_mlp": 1.04939508, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 2.136921841599078, + "language_loss": 0.82694119, + "learning_rate": 3.398777478523316e-06, + "loss": 0.8486715, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4588, + "time_per_iteration": 2.467923879623413 + }, + { + "auxiliary_loss_clip": 0.01132148, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.0228622, + "balance_loss_mlp": 1.04754925, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3980423175693042, + "language_loss": 0.75352502, + "learning_rate": 3.398499087583342e-06, + "loss": 0.775231, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4589, + "time_per_iteration": 2.535837173461914 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.04686022, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7720046877472317, + "language_loss": 0.88438141, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90612471, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8515625, + "step": 4590, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.01135164, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.02946877, + "balance_loss_mlp": 1.04789972, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6299691755620427, + "language_loss": 0.7129395, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73474467, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4591, + "time_per_iteration": 2.6112425327301025 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.03268862, + "balance_loss_mlp": 1.04847574, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8477043284936983, + "language_loss": 0.80190659, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82375979, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4592, + "time_per_iteration": 2.483894109725952 + }, + { + "auxiliary_loss_clip": 0.01048363, + "auxiliary_loss_mlp": 0.01005872, + "balance_loss_clip": 1.00416684, + "balance_loss_mlp": 1.0189774, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7716758671018623, + "language_loss": 0.61627746, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63681984, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.29296875, + "step": 4593, + "time_per_iteration": 3.0616326332092285 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01045597, + "balance_loss_clip": 1.02965856, + "balance_loss_mlp": 1.04938328, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.8877557773606983, + "language_loss": 0.77589142, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79769808, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4594, + "time_per_iteration": 4.043708086013794 + }, + { + "auxiliary_loss_clip": 0.01134807, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.01769793, + "balance_loss_mlp": 1.04991734, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.7681451067423914, + "language_loss": 0.91645586, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93813777, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4595, + "time_per_iteration": 3.973101854324341 + }, + { + "auxiliary_loss_clip": 0.01138485, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.03034675, + "balance_loss_mlp": 1.05122674, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7288059110569738, + "language_loss": 0.69101036, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71286798, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4596, + "time_per_iteration": 2.509199380874634 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.0249939, + "balance_loss_mlp": 1.04883707, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.01522187594791, + "language_loss": 0.63536406, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65717971, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9140625, + "step": 4597, + "time_per_iteration": 2.5944221019744873 + }, + { + "auxiliary_loss_clip": 0.01133967, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02629232, + "balance_loss_mlp": 1.05002272, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.1842552390134586, + "language_loss": 0.86612505, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88788456, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 4598, + "time_per_iteration": 2.4870996475219727 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02586544, + "balance_loss_mlp": 1.04847229, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.0694668215518996, + "language_loss": 0.79822165, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82000202, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4599, + "time_per_iteration": 2.4923834800720215 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.02965581, + "balance_loss_mlp": 1.04958415, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.9049018096400723, + "language_loss": 0.78357869, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80543864, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 4600, + "time_per_iteration": 2.496173620223999 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01046211, + "balance_loss_clip": 1.03007007, + "balance_loss_mlp": 1.04887986, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.9474431855639402, + "language_loss": 0.73361742, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75546992, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4601, + "time_per_iteration": 2.475919246673584 + }, + { + "auxiliary_loss_clip": 0.01135661, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02883482, + "balance_loss_mlp": 1.04879355, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.9134344988482315, + "language_loss": 0.79341739, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81522876, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4602, + "time_per_iteration": 2.511716842651367 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_clip": 1.03349614, + "balance_loss_mlp": 1.04920423, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.260382216699142, + "language_loss": 0.76887643, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79079276, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4603, + "time_per_iteration": 2.4667811393737793 + }, + { + "auxiliary_loss_clip": 0.0112975, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.0252831, + "balance_loss_mlp": 1.04736543, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.7288101924316703, + "language_loss": 0.81411278, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83581114, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 4604, + "time_per_iteration": 2.4586222171783447 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01516712, + "balance_loss_mlp": 1.04756212, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.7513688477785454, + "language_loss": 0.69912565, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72079831, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4605, + "time_per_iteration": 2.5138533115386963 + }, + { + "auxiliary_loss_clip": 0.01045677, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.00033224, + "balance_loss_mlp": 1.01580858, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7252635192802935, + "language_loss": 0.57151282, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59198874, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.296875, + "step": 4606, + "time_per_iteration": 3.184955596923828 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.03234947, + "balance_loss_mlp": 1.0481658, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.0717297663627825, + "language_loss": 0.69666946, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71853042, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4607, + "time_per_iteration": 2.5373001098632812 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.04721832, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 3.332085537790215, + "language_loss": 0.6982615, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71991682, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4608, + "time_per_iteration": 2.5396809577941895 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.02185202, + "balance_loss_mlp": 1.04715931, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.8242818121189563, + "language_loss": 0.72541273, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74715054, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 4609, + "time_per_iteration": 2.5383543968200684 + }, + { + "auxiliary_loss_clip": 0.01134274, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.03226149, + "balance_loss_mlp": 1.04623055, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.2576811985082967, + "language_loss": 0.84010947, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86194062, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4610, + "time_per_iteration": 2.4456827640533447 + }, + { + "auxiliary_loss_clip": 0.01141086, + "auxiliary_loss_mlp": 0.01051097, + "balance_loss_clip": 1.03344178, + "balance_loss_mlp": 1.04996872, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.570198611472629, + "language_loss": 0.68948054, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71140236, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9140625, + "step": 4611, + "time_per_iteration": 2.5342319011688232 + }, + { + "auxiliary_loss_clip": 0.01130823, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.0212357, + "balance_loss_mlp": 1.04892015, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.354058548299899, + "language_loss": 0.73450744, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75618565, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 4612, + "time_per_iteration": 2.472200632095337 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03378379, + "balance_loss_mlp": 1.04807258, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.166254073057622, + "language_loss": 0.66736221, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68924516, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4613, + "time_per_iteration": 2.5313632488250732 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.02721334, + "balance_loss_mlp": 1.04604864, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.8826548789840187, + "language_loss": 0.79452634, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4614, + "time_per_iteration": 2.4869751930236816 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.02684534, + "balance_loss_mlp": 1.0477469, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.573597172535304, + "language_loss": 0.80251336, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.8243044, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4615, + "time_per_iteration": 2.521615505218506 + }, + { + "auxiliary_loss_clip": 0.01135416, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.04627132, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.403593727320557, + "language_loss": 0.63926548, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66105354, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4616, + "time_per_iteration": 2.439410448074341 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02593398, + "balance_loss_mlp": 1.04661143, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.8467628074440183, + "language_loss": 0.82283223, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84458935, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4617, + "time_per_iteration": 2.49495792388916 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02587175, + "balance_loss_mlp": 1.04613662, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.1015666973838942, + "language_loss": 0.76835418, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79010552, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4618, + "time_per_iteration": 2.4882123470306396 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02501273, + "balance_loss_mlp": 1.0495801, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.6700061931983001, + "language_loss": 0.84698343, + "learning_rate": 3.390122747388459e-06, + "loss": 0.868756, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4619, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.02340662, + "balance_loss_mlp": 1.04523671, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4068177028172657, + "language_loss": 0.76720011, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78886724, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 4620, + "time_per_iteration": 2.4851698875427246 + }, + { + "auxiliary_loss_clip": 0.01130943, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02126586, + "balance_loss_mlp": 1.04728413, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.4956264272783084, + "language_loss": 0.78746819, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80914462, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4621, + "time_per_iteration": 2.543513774871826 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.04871762, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 1.9988562622182164, + "language_loss": 0.87520665, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89702857, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4622, + "time_per_iteration": 2.4818174839019775 + }, + { + "auxiliary_loss_clip": 0.01133366, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 1.04635906, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.9062066208333321, + "language_loss": 0.81094646, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83274019, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4623, + "time_per_iteration": 2.509218692779541 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.02357817, + "balance_loss_mlp": 1.04981863, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 1.93503772017796, + "language_loss": 0.81099498, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83275431, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 4624, + "time_per_iteration": 2.470041513442993 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.05091214, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 3.184384520938543, + "language_loss": 0.76514304, + "learning_rate": 3.388441777121191e-06, + "loss": 0.7869125, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84765625, + "step": 4625, + "time_per_iteration": 2.4965567588806152 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02439606, + "balance_loss_mlp": 1.04835677, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 2.5511238477154095, + "language_loss": 0.70091927, + "learning_rate": 3.388161431073511e-06, + "loss": 0.7226674, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 4626, + "time_per_iteration": 2.462007522583008 + }, + { + "auxiliary_loss_clip": 0.01142353, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.05177855, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.1576082410571704, + "language_loss": 0.92738312, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94917607, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4627, + "time_per_iteration": 2.5731146335601807 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.02640903, + "balance_loss_mlp": 1.04856014, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 4.44086075484182, + "language_loss": 0.85802954, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87982047, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4628, + "time_per_iteration": 2.502816915512085 + }, + { + "auxiliary_loss_clip": 0.01136721, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.02358079, + "balance_loss_mlp": 1.05035257, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.4685731198996637, + "language_loss": 0.79003006, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81178927, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4629, + "time_per_iteration": 2.544255256652832 + }, + { + "auxiliary_loss_clip": 0.01132865, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02419138, + "balance_loss_mlp": 1.05083036, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.4531737557023054, + "language_loss": 0.84322643, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86494124, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4630, + "time_per_iteration": 2.514413833618164 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02343392, + "balance_loss_mlp": 1.04834175, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.1800575167200997, + "language_loss": 0.80845618, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83021843, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4631, + "time_per_iteration": 2.530393123626709 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.05319762, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.154319840219951, + "language_loss": 0.71817827, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74009514, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4632, + "time_per_iteration": 2.504826307296753 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.02571952, + "balance_loss_mlp": 1.05240536, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8401586776799086, + "language_loss": 0.82518554, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84694839, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4633, + "time_per_iteration": 2.484894037246704 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.02622163, + "balance_loss_mlp": 1.05006409, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.740894494158558, + "language_loss": 0.87933433, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90116417, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4634, + "time_per_iteration": 2.465115785598755 + }, + { + "auxiliary_loss_clip": 0.01139508, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02182591, + "balance_loss_mlp": 1.05175185, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5848956099548452, + "language_loss": 0.77060932, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79239166, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4635, + "time_per_iteration": 2.5032925605773926 + }, + { + "auxiliary_loss_clip": 0.01137724, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.02843595, + "balance_loss_mlp": 1.04919934, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.7277393232375848, + "language_loss": 0.65047133, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67230225, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4636, + "time_per_iteration": 4.078390121459961 + }, + { + "auxiliary_loss_clip": 0.01137292, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02249098, + "balance_loss_mlp": 1.04898095, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.3949865449269034, + "language_loss": 0.84131932, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86309206, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8828125, + "step": 4637, + "time_per_iteration": 3.9023706912994385 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.02468669, + "balance_loss_mlp": 1.04683113, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.9572077756422592, + "language_loss": 0.75880706, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78052455, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4638, + "time_per_iteration": 2.5291664600372314 + }, + { + "auxiliary_loss_clip": 0.01137756, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_clip": 1.0281812, + "balance_loss_mlp": 1.04918075, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.663593201704466, + "language_loss": 0.71469444, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73651695, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4639, + "time_per_iteration": 2.4396321773529053 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01814222, + "balance_loss_mlp": 1.0477488, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.020838508390905, + "language_loss": 0.65634811, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67805016, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4640, + "time_per_iteration": 2.524146556854248 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.02278829, + "balance_loss_mlp": 1.04838169, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.8663182251903623, + "language_loss": 0.71682954, + "learning_rate": 3.383949929609804e-06, + "loss": 0.738572, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4641, + "time_per_iteration": 2.45416522026062 + }, + { + "auxiliary_loss_clip": 0.01137426, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.02620697, + "balance_loss_mlp": 1.04805887, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.721157258136314, + "language_loss": 0.74843872, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77024734, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4642, + "time_per_iteration": 2.498901128768921 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.0241071, + "balance_loss_mlp": 1.04755557, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7771181879405247, + "language_loss": 0.85500491, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87677723, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4643, + "time_per_iteration": 2.4678151607513428 + }, + { + "auxiliary_loss_clip": 0.01135774, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04914284, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.8372365182177028, + "language_loss": 0.8320173, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85382092, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4644, + "time_per_iteration": 2.4989511966705322 + }, + { + "auxiliary_loss_clip": 0.01137034, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.02340162, + "balance_loss_mlp": 1.04927874, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.1578284197730246, + "language_loss": 0.7905547, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81232202, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4645, + "time_per_iteration": 2.444539785385132 + }, + { + "auxiliary_loss_clip": 0.01045698, + "auxiliary_loss_mlp": 0.01013694, + "balance_loss_clip": 1.01202476, + "balance_loss_mlp": 1.01603949, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7789852310638867, + "language_loss": 0.62276232, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64335632, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4646, + "time_per_iteration": 3.0487425327301025 + }, + { + "auxiliary_loss_clip": 0.01130687, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.02039671, + "balance_loss_mlp": 1.04760003, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6043045349905556, + "language_loss": 0.89379698, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91545647, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83203125, + "step": 4647, + "time_per_iteration": 2.537818193435669 + }, + { + "auxiliary_loss_clip": 0.01137315, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.02559125, + "balance_loss_mlp": 1.04848313, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6404696751402497, + "language_loss": 0.87119055, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89298457, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4648, + "time_per_iteration": 2.490755081176758 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.01993406, + "balance_loss_mlp": 1.04894495, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 4.859667262510518, + "language_loss": 0.72424746, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74599725, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4649, + "time_per_iteration": 2.551149368286133 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02275741, + "balance_loss_mlp": 1.04667568, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.198213539311656, + "language_loss": 0.80241156, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8241663, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 4650, + "time_per_iteration": 2.495481252670288 + }, + { + "auxiliary_loss_clip": 0.01043234, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.00015628, + "balance_loss_mlp": 1.01336908, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 1.2001935939690993, + "language_loss": 0.58821332, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60866392, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4651, + "time_per_iteration": 3.089278221130371 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.0239383, + "balance_loss_mlp": 1.04576242, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.6305345142383205, + "language_loss": 0.74335963, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76514173, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4652, + "time_per_iteration": 2.5034215450286865 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.02826357, + "balance_loss_mlp": 1.05137777, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.1744902530470527, + "language_loss": 0.79703641, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81889254, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4653, + "time_per_iteration": 2.654989242553711 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.02851391, + "balance_loss_mlp": 1.04782343, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.2131663157599597, + "language_loss": 0.79123974, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81304365, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4654, + "time_per_iteration": 2.4707679748535156 + }, + { + "auxiliary_loss_clip": 0.01139148, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.02370811, + "balance_loss_mlp": 1.04861951, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.2002818233708497, + "language_loss": 0.80829996, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83010256, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4655, + "time_per_iteration": 2.513359546661377 + }, + { + "auxiliary_loss_clip": 0.01135255, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0232811, + "balance_loss_mlp": 1.04709148, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5763016498426998, + "language_loss": 0.8125751, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8343333, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4656, + "time_per_iteration": 2.519552707672119 + }, + { + "auxiliary_loss_clip": 0.01135028, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.02292323, + "balance_loss_mlp": 1.04802632, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6475258015019663, + "language_loss": 0.83235347, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85410285, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4657, + "time_per_iteration": 2.533052444458008 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.0267477, + "balance_loss_mlp": 1.04885554, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.9420207304275756, + "language_loss": 0.63918132, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66097504, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4658, + "time_per_iteration": 2.577223777770996 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.03177238, + "balance_loss_mlp": 1.04906631, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.71469006603513, + "language_loss": 0.78447223, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80633128, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4659, + "time_per_iteration": 2.5102882385253906 + }, + { + "auxiliary_loss_clip": 0.01141159, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.03565836, + "balance_loss_mlp": 1.05118299, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.8275002529569282, + "language_loss": 0.79481149, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81674838, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4660, + "time_per_iteration": 2.478348731994629 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.02289653, + "balance_loss_mlp": 1.04855609, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.7763153734220711, + "language_loss": 0.80286032, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82459545, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4661, + "time_per_iteration": 2.514369249343872 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.01055451, + "balance_loss_clip": 1.03888094, + "balance_loss_mlp": 1.05259752, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5344085017366311, + "language_loss": 0.78856266, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.8105247, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4662, + "time_per_iteration": 2.6068239212036133 + }, + { + "auxiliary_loss_clip": 0.01142079, + "auxiliary_loss_mlp": 0.01052002, + "balance_loss_clip": 1.03345299, + "balance_loss_mlp": 1.04998207, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 2.3559784459233923, + "language_loss": 0.70354843, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72548926, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4663, + "time_per_iteration": 2.530852794647217 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01053723, + "balance_loss_clip": 1.03522193, + "balance_loss_mlp": 1.05016875, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.7070620658846938, + "language_loss": 0.77552772, + "learning_rate": 3.377469372935791e-06, + "loss": 0.7974633, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.8984375, + "step": 4664, + "time_per_iteration": 2.5026586055755615 + }, + { + "auxiliary_loss_clip": 0.01132144, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02688766, + "balance_loss_mlp": 1.04697514, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.9676420802042491, + "language_loss": 0.79575229, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81750983, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8515625, + "step": 4665, + "time_per_iteration": 2.496948003768921 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04934978, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 8.778135585709748, + "language_loss": 0.80523062, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82701844, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4666, + "time_per_iteration": 2.4551992416381836 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.03710806, + "balance_loss_mlp": 1.05058241, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.0519370530418493, + "language_loss": 0.84514672, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86708617, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4667, + "time_per_iteration": 2.503024101257324 + }, + { + "auxiliary_loss_clip": 0.01141868, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.05165899, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.59556786146991, + "language_loss": 0.79110259, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81296772, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90234375, + "step": 4668, + "time_per_iteration": 2.5109217166900635 + }, + { + "auxiliary_loss_clip": 0.01137821, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783513, + "balance_loss_mlp": 1.0472095, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 5.202292388628492, + "language_loss": 0.7594949, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78132337, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4669, + "time_per_iteration": 2.5443029403686523 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.03276944, + "balance_loss_mlp": 1.05060363, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.249572842905479, + "language_loss": 0.78818107, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81007588, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8828125, + "step": 4670, + "time_per_iteration": 2.4583303928375244 + }, + { + "auxiliary_loss_clip": 0.01142576, + "auxiliary_loss_mlp": 0.010505, + "balance_loss_clip": 1.03272545, + "balance_loss_mlp": 1.05169237, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.1344815005037323, + "language_loss": 0.78915119, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81108201, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4671, + "time_per_iteration": 2.576904296875 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02419102, + "balance_loss_mlp": 1.05212355, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.655300005604084, + "language_loss": 0.74891758, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77067947, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4672, + "time_per_iteration": 2.5101001262664795 + }, + { + "auxiliary_loss_clip": 0.01139664, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.0297612, + "balance_loss_mlp": 1.05017138, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.377632390973165, + "language_loss": 0.7485683, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77045226, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.89453125, + "step": 4673, + "time_per_iteration": 2.5559215545654297 + }, + { + "auxiliary_loss_clip": 0.0113758, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_clip": 1.02367294, + "balance_loss_mlp": 1.04911065, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.162495737742732, + "language_loss": 0.72274792, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74453062, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4674, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.01142202, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.02487254, + "balance_loss_mlp": 1.05152214, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.8501022214838438, + "language_loss": 0.77636325, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79821539, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.90625, + "step": 4675, + "time_per_iteration": 2.5076191425323486 + }, + { + "auxiliary_loss_clip": 0.011417, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.03059506, + "balance_loss_mlp": 1.05080581, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 4.743769816525981, + "language_loss": 0.7033428, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72524506, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4676, + "time_per_iteration": 2.4664652347564697 + }, + { + "auxiliary_loss_clip": 0.01136213, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.02637279, + "balance_loss_mlp": 1.05219054, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6504598517134752, + "language_loss": 0.70294476, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7247287, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 4677, + "time_per_iteration": 3.9926962852478027 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.02779067, + "balance_loss_mlp": 1.05172849, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.7155329144241396, + "language_loss": 0.63506716, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65694547, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.890625, + "step": 4678, + "time_per_iteration": 5.452545642852783 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.031039, + "balance_loss_mlp": 1.05193949, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.4644682748892532, + "language_loss": 0.70249045, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7243771, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4679, + "time_per_iteration": 2.557156801223755 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.02276742, + "balance_loss_mlp": 1.05024076, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.8307759218313573, + "language_loss": 0.74600148, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76779038, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4680, + "time_per_iteration": 2.478760004043579 + }, + { + "auxiliary_loss_clip": 0.01140599, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.0268507, + "balance_loss_mlp": 1.0514679, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.8069902018568411, + "language_loss": 0.77090317, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79274386, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4681, + "time_per_iteration": 2.5532946586608887 + }, + { + "auxiliary_loss_clip": 0.01142988, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02353942, + "balance_loss_mlp": 1.05301392, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 4.33574203258507, + "language_loss": 0.74047244, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76231277, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8984375, + "step": 4682, + "time_per_iteration": 2.450707197189331 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 1.04989302, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4103030378304897, + "language_loss": 0.80830532, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.8301093, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4683, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.01142223, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.02984428, + "balance_loss_mlp": 1.05146146, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.6936052100643573, + "language_loss": 0.76107442, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78297454, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4684, + "time_per_iteration": 2.4734883308410645 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.02196348, + "balance_loss_mlp": 1.04849601, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9675146174992446, + "language_loss": 0.7601878, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.7819227, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4685, + "time_per_iteration": 2.521883010864258 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.05083728, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.003036282603561, + "language_loss": 0.7616905, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78348768, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4686, + "time_per_iteration": 2.5261688232421875 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01051506, + "balance_loss_clip": 1.03319538, + "balance_loss_mlp": 1.04916072, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.230965321609006, + "language_loss": 0.63345516, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65537149, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.90625, + "step": 4687, + "time_per_iteration": 2.473508834838867 + }, + { + "auxiliary_loss_clip": 0.0114172, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05180609, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 1.9761865692880811, + "language_loss": 0.76504958, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.7869947, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4688, + "time_per_iteration": 2.4815330505371094 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.02364409, + "balance_loss_mlp": 1.04902148, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.291650314126009, + "language_loss": 0.78333032, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80508631, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4689, + "time_per_iteration": 2.464221239089966 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02566934, + "balance_loss_mlp": 1.04886627, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 2.2251394110426896, + "language_loss": 0.77819848, + "learning_rate": 3.37011026022934e-06, + "loss": 0.79999155, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87890625, + "step": 4690, + "time_per_iteration": 2.4802086353302 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.02809191, + "balance_loss_mlp": 1.04984617, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.762007121853784, + "language_loss": 0.8775022, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89933336, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.890625, + "step": 4691, + "time_per_iteration": 2.5098307132720947 + }, + { + "auxiliary_loss_clip": 0.01144357, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02714467, + "balance_loss_mlp": 1.0519383, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.144178457094415, + "language_loss": 0.81952238, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84140503, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 4692, + "time_per_iteration": 2.501150131225586 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.02284956, + "balance_loss_mlp": 1.04852128, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.7100054669520195, + "language_loss": 0.74535745, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7671268, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4693, + "time_per_iteration": 2.581108808517456 + }, + { + "auxiliary_loss_clip": 0.01139239, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04924035, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6174705324311944, + "language_loss": 0.7761777, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79793274, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4694, + "time_per_iteration": 2.479616403579712 + }, + { + "auxiliary_loss_clip": 0.01136707, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.05057073, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.0658621313481604, + "language_loss": 0.66812259, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68987906, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4695, + "time_per_iteration": 2.560234546661377 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_clip": 1.02859259, + "balance_loss_mlp": 1.05084562, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.206840044366299, + "language_loss": 0.75868189, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78057176, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4696, + "time_per_iteration": 2.484731674194336 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.05234432, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 4.801168729119655, + "language_loss": 0.62373543, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64565253, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4697, + "time_per_iteration": 2.6771903038024902 + }, + { + "auxiliary_loss_clip": 0.01131406, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02003598, + "balance_loss_mlp": 1.0468322, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.6839402690923742, + "language_loss": 0.73317522, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75484592, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4698, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.029091, + "balance_loss_mlp": 1.0463903, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 2.1160143892835275, + "language_loss": 0.74896884, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77072334, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4699, + "time_per_iteration": 2.5613014698028564 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.02369165, + "balance_loss_mlp": 1.05032122, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.187545417707515, + "language_loss": 0.80256712, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8243804, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4700, + "time_per_iteration": 2.4355719089508057 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.03461456, + "balance_loss_mlp": 1.05022645, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.7483881606912919, + "language_loss": 0.81309319, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.8349061, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 4701, + "time_per_iteration": 2.590824842453003 + }, + { + "auxiliary_loss_clip": 0.0113653, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02301347, + "balance_loss_mlp": 1.05007911, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.214271940066586, + "language_loss": 0.73758674, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75934035, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4702, + "time_per_iteration": 2.496689796447754 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.02251232, + "balance_loss_mlp": 1.05127287, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.7981890053968508, + "language_loss": 0.78189409, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.8036449, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4703, + "time_per_iteration": 2.5225300788879395 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.02923465, + "balance_loss_mlp": 1.0484302, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.6026897384097336, + "language_loss": 0.6944623, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71628278, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 4704, + "time_per_iteration": 2.5721168518066406 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.02685118, + "balance_loss_mlp": 1.05374229, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.9868129767490792, + "language_loss": 0.69884789, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.7206769, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.85546875, + "step": 4705, + "time_per_iteration": 2.532034397125244 + }, + { + "auxiliary_loss_clip": 0.01057982, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.02761459, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7396595768854823, + "language_loss": 0.59243953, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61305463, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.3046875, + "step": 4706, + "time_per_iteration": 3.1149942874908447 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.02586842, + "balance_loss_mlp": 1.05135274, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.3972451569930537, + "language_loss": 0.82227451, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84403402, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4707, + "time_per_iteration": 2.5387215614318848 + }, + { + "auxiliary_loss_clip": 0.01137999, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.02199709, + "balance_loss_mlp": 1.04914331, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.4509576382878049, + "language_loss": 0.80561262, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82739007, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4708, + "time_per_iteration": 2.5140204429626465 + }, + { + "auxiliary_loss_clip": 0.0105521, + "auxiliary_loss_mlp": 0.01000508, + "balance_loss_clip": 0.99875605, + "balance_loss_mlp": 1.02517498, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.9117312370003612, + "language_loss": 0.62801576, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64857292, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.30078125, + "step": 4709, + "time_per_iteration": 2.936171054840088 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02320743, + "balance_loss_mlp": 1.04888415, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3738384560226649, + "language_loss": 0.73850632, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76022816, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4710, + "time_per_iteration": 2.4954519271850586 + }, + { + "auxiliary_loss_clip": 0.01137863, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.03191566, + "balance_loss_mlp": 1.04925823, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.9168276099157815, + "language_loss": 0.79272872, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81460476, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.88671875, + "step": 4711, + "time_per_iteration": 2.4867448806762695 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02646244, + "balance_loss_mlp": 1.04965401, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.0504814559042064, + "language_loss": 0.71246219, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73428476, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.88671875, + "step": 4712, + "time_per_iteration": 2.575636863708496 + }, + { + "auxiliary_loss_clip": 0.01138441, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.03291881, + "balance_loss_mlp": 1.05000687, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.8055678270358249, + "language_loss": 0.82008445, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84196651, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4713, + "time_per_iteration": 2.493767499923706 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.02401519, + "balance_loss_mlp": 1.05028057, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7485744544400377, + "language_loss": 0.75356781, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77534491, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4714, + "time_per_iteration": 2.505153179168701 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02967894, + "balance_loss_mlp": 1.04942465, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4087892826571713, + "language_loss": 0.78411347, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80593348, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4715, + "time_per_iteration": 2.554814100265503 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02042472, + "balance_loss_mlp": 1.04960322, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.6801208741854476, + "language_loss": 0.73694074, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.758663, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4716, + "time_per_iteration": 2.5286571979522705 + }, + { + "auxiliary_loss_clip": 0.01139786, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.02437401, + "balance_loss_mlp": 1.04774714, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.328876822443367, + "language_loss": 0.74648547, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76830298, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4717, + "time_per_iteration": 2.46952223777771 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.02846563, + "balance_loss_mlp": 1.04963374, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.4913957575980352, + "language_loss": 0.669999, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69183862, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4718, + "time_per_iteration": 2.4831228256225586 + }, + { + "auxiliary_loss_clip": 0.01137489, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.03000975, + "balance_loss_mlp": 1.04782009, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.8756812569885382, + "language_loss": 0.72633672, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74818015, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4719, + "time_per_iteration": 4.022828102111816 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02554655, + "balance_loss_mlp": 1.04928601, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.5135010931827333, + "language_loss": 0.80621493, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82798427, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4720, + "time_per_iteration": 5.367753505706787 + }, + { + "auxiliary_loss_clip": 0.0113932, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.05115819, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.7029911565101727, + "language_loss": 0.79467577, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81651098, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4721, + "time_per_iteration": 2.50327730178833 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01929736, + "balance_loss_mlp": 1.04810679, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.0644081658079343, + "language_loss": 0.82823032, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84991974, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4722, + "time_per_iteration": 2.4968478679656982 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02364612, + "balance_loss_mlp": 1.05073261, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.6187910677092856, + "language_loss": 0.70086461, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72264171, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4723, + "time_per_iteration": 2.4899258613586426 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.02747679, + "balance_loss_mlp": 1.04938078, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.736224288784384, + "language_loss": 0.78556609, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8073647, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.859375, + "step": 4724, + "time_per_iteration": 2.496594190597534 + }, + { + "auxiliary_loss_clip": 0.01139767, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.02660346, + "balance_loss_mlp": 1.05093193, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6232572980988387, + "language_loss": 0.92404163, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94587529, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4725, + "time_per_iteration": 2.511526584625244 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.05020452, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 2.0539060112221645, + "language_loss": 0.88626051, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90809256, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4726, + "time_per_iteration": 2.5431292057037354 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.03090727, + "balance_loss_mlp": 1.05034256, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.15176079657567, + "language_loss": 0.78793001, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80981243, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.89453125, + "step": 4727, + "time_per_iteration": 2.7037220001220703 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.04985464, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.258515630996078, + "language_loss": 0.66358554, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68529654, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4728, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02941179, + "balance_loss_mlp": 1.04727221, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.756924339447767, + "language_loss": 0.75958216, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78138363, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4729, + "time_per_iteration": 2.4989402294158936 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_clip": 1.03383398, + "balance_loss_mlp": 1.05095756, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.9682162336594704, + "language_loss": 0.66691023, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68882596, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4730, + "time_per_iteration": 2.509514570236206 + }, + { + "auxiliary_loss_clip": 0.01138579, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02232122, + "balance_loss_mlp": 1.05049443, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.7814838549320247, + "language_loss": 0.74382442, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76560116, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4731, + "time_per_iteration": 2.547813653945923 + }, + { + "auxiliary_loss_clip": 0.01136629, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.04890573, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.530013147894791, + "language_loss": 0.83553517, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85723549, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 4732, + "time_per_iteration": 2.5120863914489746 + }, + { + "auxiliary_loss_clip": 0.01136161, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.03301716, + "balance_loss_mlp": 1.04855001, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.9723104549008028, + "language_loss": 0.79331958, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81518835, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4733, + "time_per_iteration": 2.5007243156433105 + }, + { + "auxiliary_loss_clip": 0.01142281, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.02999151, + "balance_loss_mlp": 1.05076027, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.3591023601535834, + "language_loss": 0.71619761, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73809481, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 4734, + "time_per_iteration": 2.482696771621704 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02036047, + "balance_loss_mlp": 1.04631829, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8927344989841068, + "language_loss": 0.73762977, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.75930858, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 4735, + "time_per_iteration": 2.4837005138397217 + }, + { + "auxiliary_loss_clip": 0.011344, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.04755783, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.630230460143418, + "language_loss": 0.79573876, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81754053, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4736, + "time_per_iteration": 2.4434666633605957 + }, + { + "auxiliary_loss_clip": 0.01139538, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.02740479, + "balance_loss_mlp": 1.05133057, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.7860738328288637, + "language_loss": 0.59551513, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61735177, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4737, + "time_per_iteration": 2.580573558807373 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.02334046, + "balance_loss_mlp": 1.04766428, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7923236486738074, + "language_loss": 0.86353856, + "learning_rate": 3.356432075047052e-06, + "loss": 0.8852607, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4738, + "time_per_iteration": 2.483482837677002 + }, + { + "auxiliary_loss_clip": 0.0113957, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_clip": 1.02778435, + "balance_loss_mlp": 1.04864287, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.438418234236932, + "language_loss": 0.89730442, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91915256, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4739, + "time_per_iteration": 2.4746406078338623 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.02450418, + "balance_loss_mlp": 1.05253863, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3849266219761887, + "language_loss": 0.7207197, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74250996, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4740, + "time_per_iteration": 2.49682879447937 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0209707, + "balance_loss_mlp": 1.04970956, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.6055473402712246, + "language_loss": 0.77937335, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80109143, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4741, + "time_per_iteration": 2.51096248626709 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.02828324, + "balance_loss_mlp": 1.04566443, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6279093143019605, + "language_loss": 0.76295173, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78477085, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4742, + "time_per_iteration": 2.462972402572632 + }, + { + "auxiliary_loss_clip": 0.01139125, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.03039074, + "balance_loss_mlp": 1.04792476, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.8587468959738758, + "language_loss": 0.5772593, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59914023, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 4743, + "time_per_iteration": 2.511903762817383 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01054233, + "balance_loss_clip": 1.03724515, + "balance_loss_mlp": 1.05195451, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 2.12515026406258, + "language_loss": 0.74454999, + "learning_rate": 3.354713944700797e-06, + "loss": 0.7665062, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 4744, + "time_per_iteration": 2.48883318901062 + }, + { + "auxiliary_loss_clip": 0.01135189, + "auxiliary_loss_mlp": 0.01043767, + "balance_loss_clip": 1.02801967, + "balance_loss_mlp": 1.04948175, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.362002737479584, + "language_loss": 0.77483714, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79662669, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 4745, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01130558, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02363896, + "balance_loss_mlp": 1.04884791, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.753549870597739, + "language_loss": 0.83101368, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85271305, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 4746, + "time_per_iteration": 2.4236245155334473 + }, + { + "auxiliary_loss_clip": 0.01138419, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.02243769, + "balance_loss_mlp": 1.04718721, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.6977094615171933, + "language_loss": 0.79818654, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81996572, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4747, + "time_per_iteration": 2.47261118888855 + }, + { + "auxiliary_loss_clip": 0.01044617, + "auxiliary_loss_mlp": 0.01004042, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.01364255, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7754058718106229, + "language_loss": 0.60505557, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62554216, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30859375, + "step": 4748, + "time_per_iteration": 3.087096691131592 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02596188, + "balance_loss_mlp": 1.04764485, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.177788697298361, + "language_loss": 0.80300528, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82477033, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4749, + "time_per_iteration": 2.4132721424102783 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.022488, + "balance_loss_mlp": 1.04882109, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.910787577049047, + "language_loss": 0.7067076, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72844481, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.86328125, + "step": 4750, + "time_per_iteration": 2.5576114654541016 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.02121782, + "balance_loss_mlp": 1.04961181, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.569446011166348, + "language_loss": 0.81798106, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.83968079, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.83203125, + "step": 4751, + "time_per_iteration": 2.5805511474609375 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.02847314, + "balance_loss_mlp": 1.04876757, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.8824724995030706, + "language_loss": 0.80753136, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82931828, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4752, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.02258289, + "balance_loss_mlp": 1.04778147, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.8943096426553439, + "language_loss": 0.78827929, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81001288, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4753, + "time_per_iteration": 2.4775567054748535 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.02559114, + "balance_loss_mlp": 1.05078959, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.205371578508451, + "language_loss": 0.89809895, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91994447, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.8984375, + "step": 4754, + "time_per_iteration": 2.486128091812134 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02616322, + "balance_loss_mlp": 1.04897058, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.932227485650823, + "language_loss": 0.8234359, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84519303, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4755, + "time_per_iteration": 2.491184711456299 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.02915466, + "balance_loss_mlp": 1.04667544, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4908389000148254, + "language_loss": 0.83846784, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86025268, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4756, + "time_per_iteration": 2.5934014320373535 + }, + { + "auxiliary_loss_clip": 0.01048134, + "auxiliary_loss_mlp": 0.01008558, + "balance_loss_clip": 1.0067457, + "balance_loss_mlp": 1.01677859, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8659269702666513, + "language_loss": 0.61012161, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63068855, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3125, + "step": 4757, + "time_per_iteration": 3.2122225761413574 + }, + { + "auxiliary_loss_clip": 0.01137202, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.0173862, + "balance_loss_mlp": 1.05204773, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.9457322051707677, + "language_loss": 0.65794766, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67965055, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4758, + "time_per_iteration": 2.60023832321167 + }, + { + "auxiliary_loss_clip": 0.01134399, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.02027392, + "balance_loss_mlp": 1.04756904, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.560843999265526, + "language_loss": 0.62950313, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65121412, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4759, + "time_per_iteration": 2.6352102756500244 + }, + { + "auxiliary_loss_clip": 0.0113658, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.05098844, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.76909488275169, + "language_loss": 0.7385608, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76035368, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4760, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.04949427, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.9401243114633073, + "language_loss": 0.72422945, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74593776, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4761, + "time_per_iteration": 4.029369592666626 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.0303421, + "balance_loss_mlp": 1.04875946, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.026540334724573, + "language_loss": 0.74605787, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76787788, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4762, + "time_per_iteration": 3.9056994915008545 + }, + { + "auxiliary_loss_clip": 0.01134836, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.02963901, + "balance_loss_mlp": 1.05027771, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.79451974437327, + "language_loss": 0.76088154, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78268445, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4763, + "time_per_iteration": 2.521223545074463 + }, + { + "auxiliary_loss_clip": 0.01133105, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.04712808, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.9430054907967222, + "language_loss": 0.76937616, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79106188, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4764, + "time_per_iteration": 2.4924814701080322 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.02354538, + "balance_loss_mlp": 1.04996395, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8973954036904035, + "language_loss": 0.71061826, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73240352, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.87109375, + "step": 4765, + "time_per_iteration": 2.509204387664795 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04705501, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5129940587619137, + "language_loss": 0.75756145, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77925038, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4766, + "time_per_iteration": 2.562422513961792 + }, + { + "auxiliary_loss_clip": 0.01135318, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01960635, + "balance_loss_mlp": 1.05073392, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.5780141248071407, + "language_loss": 0.77556801, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79727697, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 4767, + "time_per_iteration": 2.5476057529449463 + }, + { + "auxiliary_loss_clip": 0.01133832, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.04878676, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.169490874338027, + "language_loss": 0.6494413, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67119616, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4768, + "time_per_iteration": 2.4961044788360596 + }, + { + "auxiliary_loss_clip": 0.01137611, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.02807736, + "balance_loss_mlp": 1.04944301, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5253191671074575, + "language_loss": 0.70345664, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72527587, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4769, + "time_per_iteration": 2.5243568420410156 + }, + { + "auxiliary_loss_clip": 0.01136117, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.01992261, + "balance_loss_mlp": 1.04866219, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.7483868508562144, + "language_loss": 0.75552189, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77723145, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.875, + "step": 4770, + "time_per_iteration": 2.468655586242676 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02745509, + "balance_loss_mlp": 1.0500282, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 3.1666126901900107, + "language_loss": 0.6730839, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69490194, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4771, + "time_per_iteration": 2.5334818363189697 + }, + { + "auxiliary_loss_clip": 0.01046415, + "auxiliary_loss_mlp": 0.01005401, + "balance_loss_clip": 1.00367248, + "balance_loss_mlp": 1.01655006, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7694277286160668, + "language_loss": 0.56883639, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58935452, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.29882812, + "step": 4772, + "time_per_iteration": 3.0373501777648926 + }, + { + "auxiliary_loss_clip": 0.01136901, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.02567768, + "balance_loss_mlp": 1.05014777, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.6517872983988844, + "language_loss": 0.83356023, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85534406, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4773, + "time_per_iteration": 2.477537155151367 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.02159762, + "balance_loss_mlp": 1.04630029, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.6367186533355356, + "language_loss": 0.77910906, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80083102, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4774, + "time_per_iteration": 2.514545440673828 + }, + { + "auxiliary_loss_clip": 0.01136368, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.05010271, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.7122435327393783, + "language_loss": 0.73488462, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75662589, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4775, + "time_per_iteration": 2.4526851177215576 + }, + { + "auxiliary_loss_clip": 0.0113744, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.05033445, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.655187901014976, + "language_loss": 0.88345891, + "learning_rate": 3.34551940668778e-06, + "loss": 0.905213, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4776, + "time_per_iteration": 2.5487112998962402 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02170587, + "balance_loss_mlp": 1.05060029, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7920640817181568, + "language_loss": 0.74046421, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76219237, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4777, + "time_per_iteration": 2.4858744144439697 + }, + { + "auxiliary_loss_clip": 0.01143681, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.02763224, + "balance_loss_mlp": 1.05306673, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9679293284940167, + "language_loss": 0.80052459, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82240558, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4778, + "time_per_iteration": 2.536553382873535 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.03032279, + "balance_loss_mlp": 1.05058503, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.6265242751714746, + "language_loss": 0.73940611, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76121908, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4779, + "time_per_iteration": 2.5068604946136475 + }, + { + "auxiliary_loss_clip": 0.01139025, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.02791739, + "balance_loss_mlp": 1.05089593, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5791887497798731, + "language_loss": 0.76378506, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78561842, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4780, + "time_per_iteration": 2.6357336044311523 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.02789187, + "balance_loss_mlp": 1.04874134, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.8554557560955622, + "language_loss": 0.81367111, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83542168, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 4781, + "time_per_iteration": 2.484217405319214 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.02688909, + "balance_loss_mlp": 1.0511862, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.9124031057386872, + "language_loss": 0.86249948, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88433063, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4782, + "time_per_iteration": 2.4822945594787598 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.02842641, + "balance_loss_mlp": 1.05222881, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.5584901619772236, + "language_loss": 0.71195668, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4783, + "time_per_iteration": 2.4959099292755127 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.05179179, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 3.6731562407195932, + "language_loss": 0.77011871, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79189384, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4784, + "time_per_iteration": 2.55037784576416 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.04896331, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.5223386635016902, + "language_loss": 0.75859249, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.7803328, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4785, + "time_per_iteration": 2.526587724685669 + }, + { + "auxiliary_loss_clip": 0.01135192, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.02724528, + "balance_loss_mlp": 1.04946601, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.9982438427344784, + "language_loss": 0.83033895, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85211748, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4786, + "time_per_iteration": 2.5786821842193604 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01899481, + "balance_loss_mlp": 1.04868317, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.95457297040312, + "language_loss": 0.80007184, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82174993, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 4787, + "time_per_iteration": 2.4734396934509277 + }, + { + "auxiliary_loss_clip": 0.01136278, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.02397585, + "balance_loss_mlp": 1.04906642, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 2.6671828195015044, + "language_loss": 0.83666658, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85842675, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4788, + "time_per_iteration": 2.5388548374176025 + }, + { + "auxiliary_loss_clip": 0.01137234, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.05051816, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.8168797658695668, + "language_loss": 0.73769903, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75953662, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4789, + "time_per_iteration": 2.5259692668914795 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.02819657, + "balance_loss_mlp": 1.0466274, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7572733449240283, + "language_loss": 0.83982229, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86155128, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4790, + "time_per_iteration": 2.5347094535827637 + }, + { + "auxiliary_loss_clip": 0.01136016, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02267063, + "balance_loss_mlp": 1.05011547, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.8137236403798864, + "language_loss": 0.77924603, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80099815, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4791, + "time_per_iteration": 2.475328207015991 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01854002, + "balance_loss_mlp": 1.04824567, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.933659829708973, + "language_loss": 0.70760292, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72931719, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.88671875, + "step": 4792, + "time_per_iteration": 2.4705538749694824 + }, + { + "auxiliary_loss_clip": 0.01135222, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.02065361, + "balance_loss_mlp": 1.04968917, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.08648870526395, + "language_loss": 0.79392564, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81563771, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4793, + "time_per_iteration": 2.509697914123535 + }, + { + "auxiliary_loss_clip": 0.01131221, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04920101, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.6269924793239006, + "language_loss": 0.77731872, + "learning_rate": 3.340324496161797e-06, + "loss": 0.7990309, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 4794, + "time_per_iteration": 2.6943047046661377 + }, + { + "auxiliary_loss_clip": 0.01134923, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.02819395, + "balance_loss_mlp": 1.04913807, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.663854929830155, + "language_loss": 0.8254813, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84727538, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 4795, + "time_per_iteration": 2.4633255004882812 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02387166, + "balance_loss_mlp": 1.04899204, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.661730786650402, + "language_loss": 0.74650323, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76819038, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80859375, + "step": 4796, + "time_per_iteration": 2.5179266929626465 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.02334583, + "balance_loss_mlp": 1.04789257, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.8865626242662115, + "language_loss": 0.72797763, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74977362, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4797, + "time_per_iteration": 2.4910430908203125 + }, + { + "auxiliary_loss_clip": 0.01135339, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02475667, + "balance_loss_mlp": 1.04989898, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.109884297899412, + "language_loss": 0.74219149, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76395118, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4798, + "time_per_iteration": 2.472590923309326 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.02631509, + "balance_loss_mlp": 1.04689598, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7660889265500996, + "language_loss": 0.64920753, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67099464, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.87890625, + "step": 4799, + "time_per_iteration": 2.4816339015960693 + }, + { + "auxiliary_loss_clip": 0.01136164, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.02784538, + "balance_loss_mlp": 1.04912758, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 2.0794132014970272, + "language_loss": 0.82202137, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84382272, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4800, + "time_per_iteration": 2.5249674320220947 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.02312899, + "balance_loss_mlp": 1.04702258, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.639042715490093, + "language_loss": 0.90946537, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93113768, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4801, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.0221796, + "balance_loss_mlp": 1.04792547, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.176318344562637, + "language_loss": 0.73644328, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75816047, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4802, + "time_per_iteration": 4.080524444580078 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.01006047, + "balance_loss_clip": 1.00423479, + "balance_loss_mlp": 1.01114249, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7820100192493779, + "language_loss": 0.63009298, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65055525, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.2890625, + "step": 4803, + "time_per_iteration": 4.464243412017822 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.03008461, + "balance_loss_mlp": 1.04523563, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7581002683255658, + "language_loss": 0.70800668, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72975886, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4804, + "time_per_iteration": 2.4655730724334717 + }, + { + "auxiliary_loss_clip": 0.01134858, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.02464128, + "balance_loss_mlp": 1.04650438, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.8916446417141755, + "language_loss": 0.68253011, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70430195, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 4805, + "time_per_iteration": 2.53932523727417 + }, + { + "auxiliary_loss_clip": 0.01133301, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.0262022, + "balance_loss_mlp": 1.04706144, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.968490446816616, + "language_loss": 0.69469118, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71644211, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4806, + "time_per_iteration": 2.558811902999878 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.031461, + "balance_loss_mlp": 1.04788303, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.428284074184194, + "language_loss": 0.71372461, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73549926, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4807, + "time_per_iteration": 2.5614373683929443 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04677331, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.7487230864068215, + "language_loss": 0.81519878, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83695877, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4808, + "time_per_iteration": 2.4744319915771484 + }, + { + "auxiliary_loss_clip": 0.01128992, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.03418779, + "balance_loss_mlp": 1.04669142, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.636259514454852, + "language_loss": 0.78387201, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80566621, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 4809, + "time_per_iteration": 2.4998364448547363 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.02908349, + "balance_loss_mlp": 1.04490733, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.6563631129995537, + "language_loss": 0.78611737, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80792195, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4810, + "time_per_iteration": 2.4702351093292236 + }, + { + "auxiliary_loss_clip": 0.01129985, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.02966762, + "balance_loss_mlp": 1.04653728, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 2.008599276638055, + "language_loss": 0.77134252, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79309338, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4811, + "time_per_iteration": 2.502671718597412 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.03385544, + "balance_loss_mlp": 1.0460732, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3273574459957262, + "language_loss": 0.76748705, + "learning_rate": 3.335113118275117e-06, + "loss": 0.78930271, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4812, + "time_per_iteration": 2.5386435985565186 + }, + { + "auxiliary_loss_clip": 0.01038211, + "auxiliary_loss_mlp": 0.01023073, + "balance_loss_clip": 1.02121317, + "balance_loss_mlp": 1.00933552, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8452992206378583, + "language_loss": 0.60239071, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62300354, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2890625, + "step": 4813, + "time_per_iteration": 3.227616548538208 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02727079, + "balance_loss_mlp": 1.04549837, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.8826759768804342, + "language_loss": 0.81616402, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.83789915, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4814, + "time_per_iteration": 2.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_clip": 1.0297873, + "balance_loss_mlp": 1.04464495, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6532361717230013, + "language_loss": 0.72615647, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74794197, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4815, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.01129383, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.03274667, + "balance_loss_mlp": 1.04815507, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.520143184033477, + "language_loss": 0.70801306, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72978652, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4816, + "time_per_iteration": 2.5287740230560303 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.02590585, + "balance_loss_mlp": 1.04615664, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 3.3715101323822174, + "language_loss": 0.74736607, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76915157, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 4817, + "time_per_iteration": 2.4828009605407715 + }, + { + "auxiliary_loss_clip": 0.01134031, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.03637469, + "balance_loss_mlp": 1.0465169, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.754631597755812, + "language_loss": 0.76169789, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78357232, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.875, + "step": 4818, + "time_per_iteration": 2.5453133583068848 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.04606366, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8604375380991018, + "language_loss": 0.79827082, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81994408, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4819, + "time_per_iteration": 2.4516472816467285 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.02553141, + "balance_loss_mlp": 1.04452121, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.6026789889191464, + "language_loss": 0.78726941, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80905426, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.90625, + "step": 4820, + "time_per_iteration": 2.512927770614624 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.0202527, + "balance_loss_mlp": 1.04560018, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.352701358428358, + "language_loss": 0.73083222, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75253224, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4821, + "time_per_iteration": 2.4575939178466797 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.01979387, + "balance_loss_mlp": 1.04503322, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.843174914976853, + "language_loss": 0.72629523, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74796605, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.85546875, + "step": 4822, + "time_per_iteration": 2.4981486797332764 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_clip": 1.03044343, + "balance_loss_mlp": 1.04679179, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.7581642571514904, + "language_loss": 0.66571164, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68751729, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.86328125, + "step": 4823, + "time_per_iteration": 2.4363584518432617 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.02018452, + "balance_loss_mlp": 1.04382014, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 3.6840420234688684, + "language_loss": 0.80786806, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82951754, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 4824, + "time_per_iteration": 2.4978654384613037 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.02547669, + "balance_loss_mlp": 1.04512334, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.8817460080316075, + "language_loss": 0.72507697, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74683976, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4825, + "time_per_iteration": 2.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.01136872, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.01760566, + "balance_loss_mlp": 1.04886889, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.3450778905142813, + "language_loss": 0.73504382, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75676298, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4826, + "time_per_iteration": 2.4689221382141113 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.02041411, + "balance_loss_mlp": 1.04524112, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 3.139827505949132, + "language_loss": 0.68472409, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70640838, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4827, + "time_per_iteration": 2.5236809253692627 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.04921937, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8651963869616242, + "language_loss": 0.80072737, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82245356, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.87109375, + "step": 4828, + "time_per_iteration": 2.491584300994873 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.026335, + "balance_loss_mlp": 1.0482254, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 2.2252387209358666, + "language_loss": 0.80475402, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82650864, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4829, + "time_per_iteration": 2.473210334777832 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.02482176, + "balance_loss_mlp": 1.04794419, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.544892870636461, + "language_loss": 0.82288766, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84461534, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4830, + "time_per_iteration": 2.52874755859375 + }, + { + "auxiliary_loss_clip": 0.01136051, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.02992344, + "balance_loss_mlp": 1.04847991, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.366175746199002, + "language_loss": 0.78858435, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81041145, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4831, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.02356744, + "balance_loss_mlp": 1.045138, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.8105888440812088, + "language_loss": 0.74415791, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76582563, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4832, + "time_per_iteration": 2.6398987770080566 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01806784, + "balance_loss_mlp": 1.04516697, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6051950803449415, + "language_loss": 0.75986588, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78149348, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 4833, + "time_per_iteration": 2.4772675037384033 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.02108264, + "balance_loss_mlp": 1.04542434, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.807689816327527, + "language_loss": 0.64523911, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6669057, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 4834, + "time_per_iteration": 2.4944729804992676 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.01674771, + "balance_loss_mlp": 1.04650283, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5516449013863105, + "language_loss": 0.71436119, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73597211, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4835, + "time_per_iteration": 2.5122785568237305 + }, + { + "auxiliary_loss_clip": 0.01129192, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.02430248, + "balance_loss_mlp": 1.04510283, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.123413568873549, + "language_loss": 0.79669547, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81837618, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4836, + "time_per_iteration": 2.533221483230591 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.02338338, + "balance_loss_mlp": 1.04589558, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6671781935549963, + "language_loss": 0.80777872, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82946539, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4837, + "time_per_iteration": 2.4579083919525146 + }, + { + "auxiliary_loss_clip": 0.01131777, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.02170265, + "balance_loss_mlp": 1.04491532, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8624538054458508, + "language_loss": 0.67733121, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69902468, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4838, + "time_per_iteration": 2.613682270050049 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.02485621, + "balance_loss_mlp": 1.04893696, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.6135989987029238, + "language_loss": 0.71288264, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73466504, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4839, + "time_per_iteration": 2.506908416748047 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.02341795, + "balance_loss_mlp": 1.04433274, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.876317037835641, + "language_loss": 0.75619674, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77787805, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4840, + "time_per_iteration": 2.6259472370147705 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.02688372, + "balance_loss_mlp": 1.0469749, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.9955793585576265, + "language_loss": 0.60459495, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62632966, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4841, + "time_per_iteration": 2.5497686862945557 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.0215385, + "balance_loss_mlp": 1.0483892, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.7266193979009703, + "language_loss": 0.71366, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73538262, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 4842, + "time_per_iteration": 2.5817017555236816 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02188897, + "balance_loss_mlp": 1.04632473, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5806493177236067, + "language_loss": 0.72846174, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.7501446, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.85546875, + "step": 4843, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.0223223, + "balance_loss_mlp": 1.04598284, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.0237546438656393, + "language_loss": 0.5840022, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60570586, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4844, + "time_per_iteration": 3.9377825260162354 + }, + { + "auxiliary_loss_clip": 0.01136792, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.02518439, + "balance_loss_mlp": 1.04942751, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.1502970284536493, + "language_loss": 0.86360186, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88539243, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4845, + "time_per_iteration": 5.415091276168823 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.04779911, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7275133095664568, + "language_loss": 0.66684157, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.68870938, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4846, + "time_per_iteration": 2.495901584625244 + }, + { + "auxiliary_loss_clip": 0.01131044, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02700055, + "balance_loss_mlp": 1.04691291, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.7117272730106567, + "language_loss": 0.70501876, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72675455, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4847, + "time_per_iteration": 2.50537109375 + }, + { + "auxiliary_loss_clip": 0.01131589, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.01918232, + "balance_loss_mlp": 1.04682243, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 2.14972579950547, + "language_loss": 0.73494464, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75661629, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 4848, + "time_per_iteration": 2.506683111190796 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.02569222, + "balance_loss_mlp": 1.04670119, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.106691725132959, + "language_loss": 0.76689458, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.78864431, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4849, + "time_per_iteration": 2.475512742996216 + }, + { + "auxiliary_loss_clip": 0.01134647, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.02732718, + "balance_loss_mlp": 1.04683709, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7698868684834754, + "language_loss": 0.78437513, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80615485, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4850, + "time_per_iteration": 2.4774062633514404 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02231026, + "balance_loss_mlp": 1.04620552, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.7416717517415665, + "language_loss": 0.75775445, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77944064, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4851, + "time_per_iteration": 2.4973719120025635 + }, + { + "auxiliary_loss_clip": 0.01130818, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.03071558, + "balance_loss_mlp": 1.04819655, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.378687766604426, + "language_loss": 0.77111661, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79287988, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 4852, + "time_per_iteration": 2.5339767932891846 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.0296402, + "balance_loss_mlp": 1.04735672, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5345579183576068, + "language_loss": 0.78385615, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80563664, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4853, + "time_per_iteration": 2.511125087738037 + }, + { + "auxiliary_loss_clip": 0.0113401, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.04668474, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.984154109703724, + "language_loss": 0.87946999, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90119541, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4854, + "time_per_iteration": 2.4654700756073 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.03352284, + "balance_loss_mlp": 1.04678071, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.8357290509449282, + "language_loss": 0.86585724, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88770819, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.85546875, + "step": 4855, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01043355, + "auxiliary_loss_mlp": 0.01004722, + "balance_loss_clip": 1.00283837, + "balance_loss_mlp": 1.01374364, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8090362112321295, + "language_loss": 0.60199535, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6224761, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4856, + "time_per_iteration": 3.164905309677124 + }, + { + "auxiliary_loss_clip": 0.01130578, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.04626632, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 2.394144218040463, + "language_loss": 0.67995465, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70166028, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4857, + "time_per_iteration": 2.4615678787231445 + }, + { + "auxiliary_loss_clip": 0.01129998, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.0210768, + "balance_loss_mlp": 1.04613733, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 2.1807634638236566, + "language_loss": 0.83958411, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86124158, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4858, + "time_per_iteration": 2.561347723007202 + }, + { + "auxiliary_loss_clip": 0.01131346, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.02478647, + "balance_loss_mlp": 1.04746854, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0714117361066298, + "language_loss": 0.77547097, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79718083, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4859, + "time_per_iteration": 2.4801361560821533 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02764332, + "balance_loss_mlp": 1.04424477, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.0548529873010564, + "language_loss": 0.68948561, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71125209, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4860, + "time_per_iteration": 2.531022071838379 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.0267868, + "balance_loss_mlp": 1.04821134, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.771004145303475, + "language_loss": 0.75952631, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78123146, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.81640625, + "step": 4861, + "time_per_iteration": 2.619257926940918 + }, + { + "auxiliary_loss_clip": 0.01129568, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.03350759, + "balance_loss_mlp": 1.04631817, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.7230129115334698, + "language_loss": 0.91648388, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93826073, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4862, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.02336144, + "balance_loss_mlp": 1.04544663, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.061794510539927, + "language_loss": 0.73736131, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75904131, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 4863, + "time_per_iteration": 2.4478728771209717 + }, + { + "auxiliary_loss_clip": 0.01125934, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.02474487, + "balance_loss_mlp": 1.04584527, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.6779515608592832, + "language_loss": 0.78057373, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80222106, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 4864, + "time_per_iteration": 2.487544059753418 + }, + { + "auxiliary_loss_clip": 0.0113348, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.02927482, + "balance_loss_mlp": 1.04763806, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 2.699456605470703, + "language_loss": 0.81919956, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8409909, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4865, + "time_per_iteration": 2.486553192138672 + }, + { + "auxiliary_loss_clip": 0.01130825, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02523327, + "balance_loss_mlp": 1.04592669, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.8393536761495908, + "language_loss": 0.85281575, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87453377, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4866, + "time_per_iteration": 2.4981276988983154 + }, + { + "auxiliary_loss_clip": 0.01124877, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.02263868, + "balance_loss_mlp": 1.04323506, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.627734535935432, + "language_loss": 0.755858, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77747923, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 4867, + "time_per_iteration": 2.5813703536987305 + }, + { + "auxiliary_loss_clip": 0.01129928, + "auxiliary_loss_mlp": 0.01049325, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.04375887, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 4.179606236398783, + "language_loss": 0.73403615, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75582874, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4868, + "time_per_iteration": 2.48374342918396 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.01857829, + "balance_loss_mlp": 1.04520726, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3015957921166281, + "language_loss": 0.74555755, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76717293, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4869, + "time_per_iteration": 2.458434820175171 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.02390289, + "balance_loss_mlp": 1.04639244, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.465584897312906, + "language_loss": 0.76539874, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78709823, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4870, + "time_per_iteration": 2.5194873809814453 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.03040564, + "balance_loss_mlp": 1.04584765, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.259080578005736, + "language_loss": 0.67315602, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69495422, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4871, + "time_per_iteration": 2.4556169509887695 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.04283524, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.8081222369362746, + "language_loss": 0.76924586, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79089642, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4872, + "time_per_iteration": 2.575421094894409 + }, + { + "auxiliary_loss_clip": 0.01131072, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.0271883, + "balance_loss_mlp": 1.04527128, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.2968152323379347, + "language_loss": 0.72835052, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75009787, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4873, + "time_per_iteration": 2.4370815753936768 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02518392, + "balance_loss_mlp": 1.04519463, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8384173868300016, + "language_loss": 0.77871835, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80046785, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4874, + "time_per_iteration": 2.512613534927368 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.02962041, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.084283832751276, + "language_loss": 0.77047002, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79228717, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4875, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.04909277, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6806867883636405, + "language_loss": 0.69183826, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71354383, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4876, + "time_per_iteration": 2.4764888286590576 + }, + { + "auxiliary_loss_clip": 0.01128897, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.03027201, + "balance_loss_mlp": 1.04482532, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.3621737524413913, + "language_loss": 0.8195532, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84129333, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4877, + "time_per_iteration": 2.4738340377807617 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02765405, + "balance_loss_mlp": 1.04704273, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.8654341954981455, + "language_loss": 0.67843962, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70020854, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 4878, + "time_per_iteration": 2.4606332778930664 + }, + { + "auxiliary_loss_clip": 0.01130502, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02624929, + "balance_loss_mlp": 1.04562759, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8286598598322423, + "language_loss": 0.7351383, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.7568571, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 4879, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.02881706, + "balance_loss_mlp": 1.0484302, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.0641755158914634, + "language_loss": 0.65864384, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68047822, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4880, + "time_per_iteration": 2.5785939693450928 + }, + { + "auxiliary_loss_clip": 0.01130839, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.02962136, + "balance_loss_mlp": 1.04453218, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.157512175932489, + "language_loss": 0.70518327, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72694737, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4881, + "time_per_iteration": 2.4913742542266846 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.02336192, + "balance_loss_mlp": 1.0471015, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 2.112776228996839, + "language_loss": 0.83907056, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86079299, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4882, + "time_per_iteration": 2.4955010414123535 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.02495086, + "balance_loss_mlp": 1.0470233, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.198383771985309, + "language_loss": 0.71811014, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73988116, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4883, + "time_per_iteration": 2.474574089050293 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02225327, + "balance_loss_mlp": 1.04580843, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 3.497082861184858, + "language_loss": 0.92629534, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94800568, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4884, + "time_per_iteration": 2.4947426319122314 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.03032374, + "balance_loss_mlp": 1.05094171, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.2315982417854876, + "language_loss": 0.73729408, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75913155, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4885, + "time_per_iteration": 2.5076494216918945 + }, + { + "auxiliary_loss_clip": 0.01132864, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.03185511, + "balance_loss_mlp": 1.0468272, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.1964333946604135, + "language_loss": 0.85011208, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87192315, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4886, + "time_per_iteration": 3.911407232284546 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.02624702, + "balance_loss_mlp": 1.04678059, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.1393217933297657, + "language_loss": 0.77027792, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79204369, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.88671875, + "step": 4887, + "time_per_iteration": 3.906132936477661 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.04697633, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1952396364021536, + "language_loss": 0.79558414, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.8174094, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 4888, + "time_per_iteration": 2.4338221549987793 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.0173831, + "balance_loss_mlp": 1.04529762, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.4299668586503376, + "language_loss": 0.55301261, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57464457, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4889, + "time_per_iteration": 2.637645959854126 + }, + { + "auxiliary_loss_clip": 0.01134449, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.02314413, + "balance_loss_mlp": 1.0465076, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.9477461279926194, + "language_loss": 0.84309214, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86483455, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4890, + "time_per_iteration": 2.445218801498413 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.04780436, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.9951401673219091, + "language_loss": 0.72357798, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74532759, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4891, + "time_per_iteration": 2.434298515319824 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.02910721, + "balance_loss_mlp": 1.04683042, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.9834299238301316, + "language_loss": 0.77230573, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79410005, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4892, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01130172, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.04514182, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.7053650125053033, + "language_loss": 0.7846024, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80626166, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4893, + "time_per_iteration": 2.505946159362793 + }, + { + "auxiliary_loss_clip": 0.01132333, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.03252435, + "balance_loss_mlp": 1.04651928, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8389301673785101, + "language_loss": 0.85052156, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87232608, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4894, + "time_per_iteration": 2.5221872329711914 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.02120304, + "balance_loss_mlp": 1.04568195, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.595597690193387, + "language_loss": 0.9027828, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92447418, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4895, + "time_per_iteration": 2.4466798305511475 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.02434874, + "balance_loss_mlp": 1.04720199, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 3.001231056574592, + "language_loss": 0.86597103, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88773751, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4896, + "time_per_iteration": 2.459611654281616 + }, + { + "auxiliary_loss_clip": 0.01134294, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.03402412, + "balance_loss_mlp": 1.04802299, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 2.652800133974417, + "language_loss": 0.73196733, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75381136, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4897, + "time_per_iteration": 2.4981348514556885 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02546394, + "balance_loss_mlp": 1.0458895, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.867954953207583, + "language_loss": 0.73798919, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75977707, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4898, + "time_per_iteration": 2.439952850341797 + }, + { + "auxiliary_loss_clip": 0.01142949, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.02471972, + "balance_loss_mlp": 1.05136585, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.6754375338801477, + "language_loss": 0.70309317, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72495157, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9140625, + "step": 4899, + "time_per_iteration": 2.4757347106933594 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.02458405, + "balance_loss_mlp": 1.0484879, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.9063479453414416, + "language_loss": 0.79007781, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.8118515, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4900, + "time_per_iteration": 2.50555419921875 + }, + { + "auxiliary_loss_clip": 0.01131673, + "auxiliary_loss_mlp": 0.01042831, + "balance_loss_clip": 1.02720261, + "balance_loss_mlp": 1.04425764, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.709443882500664, + "language_loss": 0.80718857, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.8289336, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4901, + "time_per_iteration": 2.481768846511841 + }, + { + "auxiliary_loss_clip": 0.01127885, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.02062666, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 1.9567596526300628, + "language_loss": 0.57923675, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60086584, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4902, + "time_per_iteration": 2.491337299346924 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.02416682, + "balance_loss_mlp": 1.045946, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6713771638909152, + "language_loss": 0.75298065, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77468932, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4903, + "time_per_iteration": 2.4884400367736816 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.02560806, + "balance_loss_mlp": 1.04630995, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.8012466742437707, + "language_loss": 0.6254617, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64721614, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4904, + "time_per_iteration": 2.5288941860198975 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.02252424, + "balance_loss_mlp": 1.04603219, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.5173763027357385, + "language_loss": 0.7301079, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75183994, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 4905, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02577305, + "balance_loss_mlp": 1.0456897, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6701950888056076, + "language_loss": 0.81584871, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.8375839, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4906, + "time_per_iteration": 2.473604202270508 + }, + { + "auxiliary_loss_clip": 0.01128251, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.04443395, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9494272179492087, + "language_loss": 0.87158448, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89320892, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4907, + "time_per_iteration": 2.490842819213867 + }, + { + "auxiliary_loss_clip": 0.01135464, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.02623653, + "balance_loss_mlp": 1.04758191, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.3387997458884833, + "language_loss": 0.81563503, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83741152, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4908, + "time_per_iteration": 2.4586410522460938 + }, + { + "auxiliary_loss_clip": 0.01054339, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.03970814, + "balance_loss_mlp": 1.0157342, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7811313355607663, + "language_loss": 0.57214808, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59311211, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.38671875, + "step": 4909, + "time_per_iteration": 2.9739394187927246 + }, + { + "auxiliary_loss_clip": 0.01136234, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.02340245, + "balance_loss_mlp": 1.05156505, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.44395719574742, + "language_loss": 0.86585498, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88760138, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4910, + "time_per_iteration": 2.4779117107391357 + }, + { + "auxiliary_loss_clip": 0.01126914, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02551222, + "balance_loss_mlp": 1.04549575, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8630755123750513, + "language_loss": 0.72632295, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.74799585, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 4911, + "time_per_iteration": 2.4959700107574463 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02239108, + "balance_loss_mlp": 1.04823601, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.774615067737937, + "language_loss": 0.8988539, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92053854, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4912, + "time_per_iteration": 2.4532997608184814 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.03022075, + "balance_loss_mlp": 1.04712319, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.458226475428025, + "language_loss": 0.83448595, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85626793, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 4913, + "time_per_iteration": 2.515580654144287 + }, + { + "auxiliary_loss_clip": 0.01129704, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.02385521, + "balance_loss_mlp": 1.0438993, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.6602062940724112, + "language_loss": 0.77029538, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79198408, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4914, + "time_per_iteration": 2.457158088684082 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.02302349, + "balance_loss_mlp": 1.04553497, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.9027466376674422, + "language_loss": 0.81550008, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83717597, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 4915, + "time_per_iteration": 2.6669511795043945 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.02215445, + "balance_loss_mlp": 1.0477066, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.9544787473030132, + "language_loss": 0.84415555, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8658756, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4916, + "time_per_iteration": 2.473867416381836 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.01874673, + "balance_loss_mlp": 1.04477537, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 3.5737730841451225, + "language_loss": 0.69611692, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71773368, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4917, + "time_per_iteration": 2.5078670978546143 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.02550471, + "balance_loss_mlp": 1.04932523, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1750223310256507, + "language_loss": 0.90840054, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93015605, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 4918, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.02504885, + "balance_loss_mlp": 1.04929781, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9164121886210477, + "language_loss": 0.72399461, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74574864, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4919, + "time_per_iteration": 2.5533134937286377 + }, + { + "auxiliary_loss_clip": 0.01130751, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.02661633, + "balance_loss_mlp": 1.04704165, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7148380002351797, + "language_loss": 0.75758076, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77931356, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4920, + "time_per_iteration": 2.4288933277130127 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_clip": 1.02953875, + "balance_loss_mlp": 1.05214858, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.2591712667141075, + "language_loss": 0.68327153, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.7051155, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4921, + "time_per_iteration": 2.5978074073791504 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02580023, + "balance_loss_mlp": 1.04953861, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8781945072150448, + "language_loss": 0.74265885, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76444781, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4922, + "time_per_iteration": 2.4518954753875732 + }, + { + "auxiliary_loss_clip": 0.0113841, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.01932716, + "balance_loss_mlp": 1.04900336, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 2.178664992776949, + "language_loss": 0.76679426, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78853875, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4923, + "time_per_iteration": 2.5565848350524902 + }, + { + "auxiliary_loss_clip": 0.0113218, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.02359807, + "balance_loss_mlp": 1.04730439, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.616043641477794, + "language_loss": 0.86307567, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88479245, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4924, + "time_per_iteration": 2.5081374645233154 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.04767513, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.3983202546472309, + "language_loss": 0.8180936, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.83976275, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4925, + "time_per_iteration": 2.5473146438598633 + }, + { + "auxiliary_loss_clip": 0.01132696, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02175128, + "balance_loss_mlp": 1.04893184, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5479827750219735, + "language_loss": 0.85168374, + "learning_rate": 3.301729463727452e-06, + "loss": 0.87337816, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4926, + "time_per_iteration": 2.4603803157806396 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.02391791, + "balance_loss_mlp": 1.04658842, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.1014080951069913, + "language_loss": 0.85908806, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88081133, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4927, + "time_per_iteration": 2.4724504947662354 + }, + { + "auxiliary_loss_clip": 0.01129564, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.02434492, + "balance_loss_mlp": 1.04636681, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.8730507383843338, + "language_loss": 0.80967462, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83136487, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4928, + "time_per_iteration": 5.46146297454834 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.023211, + "balance_loss_mlp": 1.04749835, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.002605920988437, + "language_loss": 0.72472513, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7465297, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.91015625, + "step": 4929, + "time_per_iteration": 2.4938502311706543 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02631676, + "balance_loss_mlp": 1.04823208, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.429634231323073, + "language_loss": 0.72424346, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74603939, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8828125, + "step": 4930, + "time_per_iteration": 2.486492156982422 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01002568, + "balance_loss_clip": 1.00047004, + "balance_loss_mlp": 1.0186131, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8134562784526058, + "language_loss": 0.60710716, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.627729, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.41015625, + "step": 4931, + "time_per_iteration": 3.002444267272949 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 0.99926931, + "balance_loss_mlp": 1.01823413, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7413672345708404, + "language_loss": 0.52383232, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54443383, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.40625, + "step": 4932, + "time_per_iteration": 2.974777936935425 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 1.04449248, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 3.155895790893495, + "language_loss": 0.81622797, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83789599, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4933, + "time_per_iteration": 2.518906593322754 + }, + { + "auxiliary_loss_clip": 0.0112788, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.01854897, + "balance_loss_mlp": 1.04651821, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.671865304120784, + "language_loss": 0.75257647, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77419287, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4934, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.02837849, + "balance_loss_mlp": 1.04699588, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.752558919138232, + "language_loss": 0.62510157, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64690268, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4935, + "time_per_iteration": 2.462982654571533 + }, + { + "auxiliary_loss_clip": 0.01129673, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04613912, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4993711353436514, + "language_loss": 0.79789758, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81961262, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 4936, + "time_per_iteration": 2.5267326831817627 + }, + { + "auxiliary_loss_clip": 0.01132719, + "auxiliary_loss_mlp": 0.01045272, + "balance_loss_clip": 1.02854681, + "balance_loss_mlp": 1.04649782, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8807271027259396, + "language_loss": 0.74074632, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76252627, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4937, + "time_per_iteration": 2.607790946960449 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.0224793, + "balance_loss_mlp": 1.04839468, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.629632810423829, + "language_loss": 0.7804476, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80216354, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 4938, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01134705, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.02543736, + "balance_loss_mlp": 1.04814208, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.041677851061636, + "language_loss": 0.77017808, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79193771, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4939, + "time_per_iteration": 2.453615427017212 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.04958081, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5588161926919628, + "language_loss": 0.78206903, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80380619, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4940, + "time_per_iteration": 2.5125393867492676 + }, + { + "auxiliary_loss_clip": 0.01133351, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.0233798, + "balance_loss_mlp": 1.04633832, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 3.9307439231373884, + "language_loss": 0.75487554, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77661633, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4941, + "time_per_iteration": 2.5308516025543213 + }, + { + "auxiliary_loss_clip": 0.0113684, + "auxiliary_loss_mlp": 0.01050296, + "balance_loss_clip": 1.03295147, + "balance_loss_mlp": 1.04803753, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.557458362521145, + "language_loss": 0.73998737, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7618587, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.890625, + "step": 4942, + "time_per_iteration": 2.6214303970336914 + }, + { + "auxiliary_loss_clip": 0.0113696, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.03017855, + "balance_loss_mlp": 1.04778039, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.997792424787015, + "language_loss": 0.70484138, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72668344, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4943, + "time_per_iteration": 2.533313751220703 + }, + { + "auxiliary_loss_clip": 0.01137748, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.02657795, + "balance_loss_mlp": 1.04838014, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 1.9523342898428475, + "language_loss": 0.80111414, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82292169, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 4944, + "time_per_iteration": 2.464364528656006 + }, + { + "auxiliary_loss_clip": 0.01129992, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.02979231, + "balance_loss_mlp": 1.04640603, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.1633352367153105, + "language_loss": 0.83451837, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85626531, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4945, + "time_per_iteration": 2.4981510639190674 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.02353168, + "balance_loss_mlp": 1.04738569, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.2158088930062747, + "language_loss": 0.66624904, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68795776, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4946, + "time_per_iteration": 2.526228666305542 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.01880383, + "balance_loss_mlp": 1.0509392, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.7941079108563611, + "language_loss": 0.73766255, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75938767, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4947, + "time_per_iteration": 2.5380265712738037 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_clip": 1.02774215, + "balance_loss_mlp": 1.04653597, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.40735653244717, + "language_loss": 0.7330308, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75483221, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4948, + "time_per_iteration": 2.5096492767333984 + }, + { + "auxiliary_loss_clip": 0.01129361, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01824629, + "balance_loss_mlp": 1.04442465, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0973131899278825, + "language_loss": 0.84031421, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86194396, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4949, + "time_per_iteration": 2.4650402069091797 + }, + { + "auxiliary_loss_clip": 0.01129505, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02019429, + "balance_loss_mlp": 1.04509461, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.77267818675948, + "language_loss": 0.71322602, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4950, + "time_per_iteration": 2.491163969039917 + }, + { + "auxiliary_loss_clip": 0.01127031, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.02792883, + "balance_loss_mlp": 1.04543924, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 1.7996518465212372, + "language_loss": 0.82192945, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84363329, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4951, + "time_per_iteration": 2.5001299381256104 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01936841, + "balance_loss_mlp": 1.04211378, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.6816702718299763, + "language_loss": 0.73421168, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.75584191, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 4952, + "time_per_iteration": 2.4888715744018555 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.04677546, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.7548041314188605, + "language_loss": 0.83702904, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85876799, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4953, + "time_per_iteration": 2.486267566680908 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02330005, + "balance_loss_mlp": 1.04566419, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.078619348093555, + "language_loss": 0.74560732, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.7673102, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4954, + "time_per_iteration": 2.454066276550293 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.0450201, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9786208165821892, + "language_loss": 0.75643009, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77808911, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4955, + "time_per_iteration": 2.487297773361206 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.01980329, + "balance_loss_mlp": 1.04604173, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 3.347495877937089, + "language_loss": 0.72235912, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74404275, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4956, + "time_per_iteration": 2.453639507293701 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.02737164, + "balance_loss_mlp": 1.04482651, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6786835957024704, + "language_loss": 0.79504669, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81683344, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4957, + "time_per_iteration": 2.4680192470550537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.04692602, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5505958112028584, + "language_loss": 0.70515305, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.7268889, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4958, + "time_per_iteration": 2.463550090789795 + }, + { + "auxiliary_loss_clip": 0.01130665, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.02685153, + "balance_loss_mlp": 1.04660892, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.6483091075690746, + "language_loss": 0.78709656, + "learning_rate": 3.291945317082743e-06, + "loss": 0.8088336, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4959, + "time_per_iteration": 2.4896273612976074 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.03010738, + "balance_loss_mlp": 1.04477429, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.8058675414038505, + "language_loss": 0.79814601, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81990159, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4960, + "time_per_iteration": 2.4524307250976562 + }, + { + "auxiliary_loss_clip": 0.01133353, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04504156, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.8105894923901418, + "language_loss": 0.73709917, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75890362, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4961, + "time_per_iteration": 2.463160991668701 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.0466218, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.946317435202559, + "language_loss": 0.62041843, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64212298, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 4962, + "time_per_iteration": 2.4734280109405518 + }, + { + "auxiliary_loss_clip": 0.0113099, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02143037, + "balance_loss_mlp": 1.04580986, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.708438122809617, + "language_loss": 0.83075964, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85244966, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 4963, + "time_per_iteration": 2.4676647186279297 + }, + { + "auxiliary_loss_clip": 0.01132139, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.02108073, + "balance_loss_mlp": 1.04811728, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.8539744131594924, + "language_loss": 0.66537225, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68706906, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 4964, + "time_per_iteration": 2.425261974334717 + }, + { + "auxiliary_loss_clip": 0.01128116, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.01994288, + "balance_loss_mlp": 1.04498291, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.6142193033036512, + "language_loss": 0.70836121, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.72998774, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83203125, + "step": 4965, + "time_per_iteration": 2.468221664428711 + }, + { + "auxiliary_loss_clip": 0.01137695, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_clip": 1.0309124, + "balance_loss_mlp": 1.05098724, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.501073720290292, + "language_loss": 0.66185117, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68369937, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4966, + "time_per_iteration": 2.479327440261841 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.02595615, + "balance_loss_mlp": 1.04869342, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.7651343279829215, + "language_loss": 0.74186444, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76362395, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4967, + "time_per_iteration": 2.4752163887023926 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02035177, + "balance_loss_mlp": 1.04422212, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9700123684688966, + "language_loss": 0.71222222, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73386747, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8515625, + "step": 4968, + "time_per_iteration": 2.448028564453125 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.01884651, + "balance_loss_mlp": 1.04596853, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.0898000655075752, + "language_loss": 0.77127141, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79292667, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 4969, + "time_per_iteration": 2.5737853050231934 + }, + { + "auxiliary_loss_clip": 0.01131698, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.022578, + "balance_loss_mlp": 1.04641569, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.5683816051841135, + "language_loss": 0.69798505, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.71967924, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4970, + "time_per_iteration": 5.428143501281738 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04582572, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.0403310419369314, + "language_loss": 0.85269564, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.8745082, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4971, + "time_per_iteration": 2.4557158946990967 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.02905178, + "balance_loss_mlp": 1.0487361, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.8300460221108372, + "language_loss": 0.79116535, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81292605, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4972, + "time_per_iteration": 2.492119550704956 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.02457476, + "balance_loss_mlp": 1.0491786, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.9080397703774756, + "language_loss": 0.85019803, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87194014, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4973, + "time_per_iteration": 2.4409923553466797 + }, + { + "auxiliary_loss_clip": 0.01128243, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.04866779, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5302170897903997, + "language_loss": 0.77397263, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79562438, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 4974, + "time_per_iteration": 2.4786176681518555 + }, + { + "auxiliary_loss_clip": 0.01135129, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02723432, + "balance_loss_mlp": 1.04905188, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 2.0911748108299015, + "language_loss": 0.72264957, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74442089, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 4975, + "time_per_iteration": 2.5267655849456787 + }, + { + "auxiliary_loss_clip": 0.01133427, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02204621, + "balance_loss_mlp": 1.0501368, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 4.957635138610608, + "language_loss": 0.76028466, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78199953, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 4976, + "time_per_iteration": 2.46476149559021 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01039669, + "balance_loss_clip": 1.02464914, + "balance_loss_mlp": 1.04786563, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.141179611311424, + "language_loss": 0.86060619, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88231456, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 4977, + "time_per_iteration": 2.4342682361602783 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.02605033, + "balance_loss_mlp": 1.0510987, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6147948075287948, + "language_loss": 0.68286109, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7046386, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4978, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.01138133, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.053123, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.9781984123500023, + "language_loss": 0.7654568, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78720796, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4979, + "time_per_iteration": 2.4865188598632812 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.02265859, + "balance_loss_mlp": 1.04520524, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7578947600277828, + "language_loss": 0.68300819, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70469534, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4980, + "time_per_iteration": 2.6137757301330566 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02396405, + "balance_loss_mlp": 1.05068171, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.9242198828448243, + "language_loss": 0.73239923, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75411171, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 4981, + "time_per_iteration": 2.5342931747436523 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.02227712, + "balance_loss_mlp": 1.04691803, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.470312251429405, + "language_loss": 0.86429024, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8860175, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4982, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.01138194, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04975057, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.2481661066872904, + "language_loss": 0.86378068, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88557541, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4983, + "time_per_iteration": 2.4477322101593018 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.02577138, + "balance_loss_mlp": 1.0483377, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.8474343514891325, + "language_loss": 0.78286207, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80460417, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4984, + "time_per_iteration": 2.490079402923584 + }, + { + "auxiliary_loss_clip": 0.01136807, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.05052662, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.555514289558953, + "language_loss": 0.78418988, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80601943, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4985, + "time_per_iteration": 2.5188379287719727 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.02115583, + "balance_loss_mlp": 1.05010915, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 3.8074401298215905, + "language_loss": 0.72157449, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74333715, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4986, + "time_per_iteration": 2.7730660438537598 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.02523577, + "balance_loss_mlp": 1.04813981, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.7357810931981628, + "language_loss": 0.73332191, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75509989, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4987, + "time_per_iteration": 2.4857406616210938 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02162147, + "balance_loss_mlp": 1.04787469, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 2.6184059112472817, + "language_loss": 0.80173379, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82341629, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4988, + "time_per_iteration": 2.477614641189575 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.02874756, + "balance_loss_mlp": 1.04897678, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.615528223125509, + "language_loss": 0.70302641, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72481132, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4989, + "time_per_iteration": 2.4942874908447266 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02382946, + "balance_loss_mlp": 1.05045295, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 2.0547136882256654, + "language_loss": 0.85636222, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87814367, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87109375, + "step": 4990, + "time_per_iteration": 2.455134391784668 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.04822564, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.6641511475566748, + "language_loss": 0.67125142, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69296378, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4991, + "time_per_iteration": 2.4928019046783447 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.02165437, + "balance_loss_mlp": 1.0479908, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.5868946812173, + "language_loss": 0.78707612, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80883896, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4992, + "time_per_iteration": 2.5030534267425537 + }, + { + "auxiliary_loss_clip": 0.01135049, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04976213, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.8035914694742925, + "language_loss": 0.824085, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84579718, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4993, + "time_per_iteration": 2.475588083267212 + }, + { + "auxiliary_loss_clip": 0.01138101, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.02522802, + "balance_loss_mlp": 1.04808736, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.0505124462232898, + "language_loss": 0.85850489, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88031358, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4994, + "time_per_iteration": 2.47881817817688 + }, + { + "auxiliary_loss_clip": 0.0113641, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.05017769, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5183999234373478, + "language_loss": 0.8111707, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83289921, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4995, + "time_per_iteration": 2.5481183528900146 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.05089867, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7074459415862762, + "language_loss": 0.67098773, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69274354, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 4996, + "time_per_iteration": 2.6810193061828613 + }, + { + "auxiliary_loss_clip": 0.01134671, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.02392912, + "balance_loss_mlp": 1.04883564, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7509046873587113, + "language_loss": 0.75304276, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77479029, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4997, + "time_per_iteration": 2.472226858139038 + }, + { + "auxiliary_loss_clip": 0.01132042, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.02372646, + "balance_loss_mlp": 1.04816282, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.9401125864941864, + "language_loss": 0.77664721, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79835731, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83984375, + "step": 4998, + "time_per_iteration": 2.495087146759033 + }, + { + "auxiliary_loss_clip": 0.01129805, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02469468, + "balance_loss_mlp": 1.04812598, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5996751316274151, + "language_loss": 0.73429006, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75598228, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 4999, + "time_per_iteration": 2.491774082183838 + }, + { + "auxiliary_loss_clip": 0.01134839, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.02250576, + "balance_loss_mlp": 1.0498935, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6017930279588588, + "language_loss": 0.756015, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77774298, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5000, + "time_per_iteration": 2.572003126144409 + }, + { + "auxiliary_loss_clip": 0.01131295, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.02329731, + "balance_loss_mlp": 1.04886353, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.977226227337592, + "language_loss": 0.81681275, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83849311, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.82421875, + "step": 5001, + "time_per_iteration": 2.4240355491638184 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02538288, + "balance_loss_mlp": 1.05103087, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5846802536013025, + "language_loss": 0.8056432, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82741892, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 5002, + "time_per_iteration": 2.5848264694213867 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.02240372, + "balance_loss_mlp": 1.04907179, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.6918091030667293, + "language_loss": 0.71209854, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73384899, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 5003, + "time_per_iteration": 2.4672186374664307 + }, + { + "auxiliary_loss_clip": 0.01136595, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.05050564, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.8725932973877313, + "language_loss": 0.70613277, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72788501, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5004, + "time_per_iteration": 2.579941511154175 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.04977477, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.8634075898885767, + "language_loss": 0.81359464, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83539397, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5005, + "time_per_iteration": 2.4043233394622803 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04792035, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.9232502202927266, + "language_loss": 0.74906754, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77080745, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5006, + "time_per_iteration": 2.5169718265533447 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02712977, + "balance_loss_mlp": 1.04745531, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.157802275476472, + "language_loss": 0.70810544, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72982514, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5007, + "time_per_iteration": 2.500135898590088 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.02453065, + "balance_loss_mlp": 1.04947257, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.301214894203853, + "language_loss": 0.76435697, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78609765, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5008, + "time_per_iteration": 2.5071120262145996 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.02464485, + "balance_loss_mlp": 1.04823518, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.7973688674758703, + "language_loss": 0.84830707, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87002409, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5009, + "time_per_iteration": 2.531024694442749 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.0211432, + "balance_loss_mlp": 1.04830122, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9976209282841157, + "language_loss": 0.83813334, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85986781, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 5010, + "time_per_iteration": 2.4690375328063965 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02003646, + "balance_loss_mlp": 1.04724431, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 1.9597018241269177, + "language_loss": 0.85013181, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87178147, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5011, + "time_per_iteration": 2.501708745956421 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.02335644, + "balance_loss_mlp": 1.04754543, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.0524404295798013, + "language_loss": 0.71966654, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74139971, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5012, + "time_per_iteration": 3.979128360748291 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.0229032, + "balance_loss_mlp": 1.04721081, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 1.9997819947408795, + "language_loss": 0.87396109, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89565563, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 5013, + "time_per_iteration": 2.467177629470825 + }, + { + "auxiliary_loss_clip": 0.01136565, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.0200367, + "balance_loss_mlp": 1.04842985, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 3.4702040063697313, + "language_loss": 0.83367115, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85540557, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 5014, + "time_per_iteration": 2.4654901027679443 + }, + { + "auxiliary_loss_clip": 0.01128425, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.01896727, + "balance_loss_mlp": 1.0471499, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6346146355602116, + "language_loss": 0.68218327, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70380276, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5015, + "time_per_iteration": 2.4994328022003174 + }, + { + "auxiliary_loss_clip": 0.01132371, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.04864407, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.7110353723362635, + "language_loss": 0.74712509, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76883423, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5016, + "time_per_iteration": 2.5168755054473877 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 1.0498333, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.6963436015958502, + "language_loss": 0.65179884, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67351693, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5017, + "time_per_iteration": 2.543577194213867 + }, + { + "auxiliary_loss_clip": 0.01134511, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.02860379, + "balance_loss_mlp": 1.05030179, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.078433105892768, + "language_loss": 0.69045079, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71223348, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5018, + "time_per_iteration": 2.498060464859009 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.02772546, + "balance_loss_mlp": 1.04842138, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9198297669603306, + "language_loss": 0.78841144, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81011814, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5019, + "time_per_iteration": 2.4873573780059814 + }, + { + "auxiliary_loss_clip": 0.01134625, + "auxiliary_loss_mlp": 0.01041878, + "balance_loss_clip": 1.02695298, + "balance_loss_mlp": 1.05073094, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.24109756344656, + "language_loss": 0.69867152, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72043651, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5020, + "time_per_iteration": 2.493370532989502 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.04941368, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9013759847828555, + "language_loss": 0.78134364, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80310869, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8671875, + "step": 5021, + "time_per_iteration": 2.4670474529266357 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.01527357, + "balance_loss_mlp": 1.04964936, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.3821225807179696, + "language_loss": 0.76075405, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78238434, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5022, + "time_per_iteration": 2.4737884998321533 + }, + { + "auxiliary_loss_clip": 0.01133657, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.02791631, + "balance_loss_mlp": 1.04880631, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.7684005868111572, + "language_loss": 0.69896525, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72073108, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5023, + "time_per_iteration": 2.4453155994415283 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02673686, + "balance_loss_mlp": 1.04927671, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.0912728997662127, + "language_loss": 0.71588898, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73766768, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5024, + "time_per_iteration": 2.4998810291290283 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.04858792, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.6483742353836974, + "language_loss": 0.73955721, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76133543, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5025, + "time_per_iteration": 2.5167019367218018 + }, + { + "auxiliary_loss_clip": 0.0113244, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.02800322, + "balance_loss_mlp": 1.0518502, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.4799709397217862, + "language_loss": 0.67022824, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.6919747, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5026, + "time_per_iteration": 2.5326507091522217 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.02799106, + "balance_loss_mlp": 1.05083036, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.6876842646939136, + "language_loss": 0.85252607, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87429863, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5027, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01129327, + "auxiliary_loss_mlp": 0.01045705, + "balance_loss_clip": 1.03187656, + "balance_loss_mlp": 1.04739702, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.665552114762065, + "language_loss": 0.78757018, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80932051, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5028, + "time_per_iteration": 2.5677576065063477 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.0295043, + "balance_loss_mlp": 1.04922223, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 2.0260385179345346, + "language_loss": 0.76721144, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78898472, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.83203125, + "step": 5029, + "time_per_iteration": 2.611917734146118 + }, + { + "auxiliary_loss_clip": 0.01133028, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.02642775, + "balance_loss_mlp": 1.04855132, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.944959289407135, + "language_loss": 0.81868339, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84044701, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.84375, + "step": 5030, + "time_per_iteration": 2.605531930923462 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01045193, + "balance_loss_clip": 1.02982664, + "balance_loss_mlp": 1.04754734, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.748277903644489, + "language_loss": 0.69869608, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72047728, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 5031, + "time_per_iteration": 2.496833086013794 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.02103615, + "balance_loss_mlp": 1.04892659, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.8467264077922103, + "language_loss": 0.82302773, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84471118, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5032, + "time_per_iteration": 2.5062966346740723 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.02991903, + "balance_loss_mlp": 1.05332685, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.10117653020426, + "language_loss": 0.73383862, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75570583, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5033, + "time_per_iteration": 2.561467170715332 + }, + { + "auxiliary_loss_clip": 0.01130223, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03000593, + "balance_loss_mlp": 1.04782772, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.437497934350084, + "language_loss": 0.74057245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76232684, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5034, + "time_per_iteration": 2.511861801147461 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.04825819, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9672144407329994, + "language_loss": 0.71617639, + "learning_rate": 3.269209883493352e-06, + "loss": 0.73783064, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5035, + "time_per_iteration": 2.545917272567749 + }, + { + "auxiliary_loss_clip": 0.0113067, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01835537, + "balance_loss_mlp": 1.04876685, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.774174351542542, + "language_loss": 0.87232339, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89395267, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5036, + "time_per_iteration": 2.5197184085845947 + }, + { + "auxiliary_loss_clip": 0.01131426, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.02679288, + "balance_loss_mlp": 1.04866219, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.2121077897300134, + "language_loss": 0.77760899, + "learning_rate": 3.268607806688536e-06, + "loss": 0.7993536, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5037, + "time_per_iteration": 2.5372917652130127 + }, + { + "auxiliary_loss_clip": 0.01133533, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02603984, + "balance_loss_mlp": 1.04973745, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.4260021818478634, + "language_loss": 0.77920854, + "learning_rate": 3.268306696121816e-06, + "loss": 0.80095863, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5038, + "time_per_iteration": 2.4360761642456055 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.02073669, + "balance_loss_mlp": 1.04859674, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.8428508909689656, + "language_loss": 0.74134624, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76298141, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5039, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01129815, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.02426052, + "balance_loss_mlp": 1.05003977, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.8268154911840482, + "language_loss": 0.80263746, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82431436, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5040, + "time_per_iteration": 2.469822406768799 + }, + { + "auxiliary_loss_clip": 0.01131744, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.02227795, + "balance_loss_mlp": 1.05101466, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.5747579863116856, + "language_loss": 0.81914759, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8408277, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5041, + "time_per_iteration": 2.5240108966827393 + }, + { + "auxiliary_loss_clip": 0.01062494, + "auxiliary_loss_mlp": 0.01003022, + "balance_loss_clip": 1.00106716, + "balance_loss_mlp": 1.02890241, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7678965945904674, + "language_loss": 0.59521127, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61586642, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3359375, + "step": 5042, + "time_per_iteration": 3.169004440307617 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02271986, + "balance_loss_mlp": 1.05006266, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.6113397759888244, + "language_loss": 0.71136838, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73308468, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5043, + "time_per_iteration": 2.5217440128326416 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.04824769, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.6644669890018773, + "language_loss": 0.69351244, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71521056, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5044, + "time_per_iteration": 2.4741897583007812 + }, + { + "auxiliary_loss_clip": 0.01129908, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.02252388, + "balance_loss_mlp": 1.04823565, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3748845619029404, + "language_loss": 0.77210236, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79377484, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5045, + "time_per_iteration": 2.5023043155670166 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.02240646, + "balance_loss_mlp": 1.04892182, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.538768377317596, + "language_loss": 0.72444695, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74615347, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5046, + "time_per_iteration": 2.5163753032684326 + }, + { + "auxiliary_loss_clip": 0.01134062, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.04859519, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 3.2419373644374176, + "language_loss": 0.80737638, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82914352, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5047, + "time_per_iteration": 2.547245979309082 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02569556, + "balance_loss_mlp": 1.04871237, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.9357354539113198, + "language_loss": 0.72334075, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74505508, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5048, + "time_per_iteration": 2.494016170501709 + }, + { + "auxiliary_loss_clip": 0.01129755, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.02093613, + "balance_loss_mlp": 1.04574537, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.7731178616486785, + "language_loss": 0.75098324, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.7726388, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5049, + "time_per_iteration": 2.502979040145874 + }, + { + "auxiliary_loss_clip": 0.01133123, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.0226109, + "balance_loss_mlp": 1.04864645, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6762363098185904, + "language_loss": 0.8194561, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84116459, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5050, + "time_per_iteration": 2.5254666805267334 + }, + { + "auxiliary_loss_clip": 0.01132852, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.02299261, + "balance_loss_mlp": 1.04868484, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.8996577335854625, + "language_loss": 0.73712784, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7588439, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 5051, + "time_per_iteration": 2.511455774307251 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.01967764, + "balance_loss_mlp": 1.04650712, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.5939626777548828, + "language_loss": 0.76463652, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78628969, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5052, + "time_per_iteration": 2.478046417236328 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.04609728, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.8043694132732864, + "language_loss": 0.82780337, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84952009, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5053, + "time_per_iteration": 3.983353614807129 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.02661896, + "balance_loss_mlp": 1.04685903, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5364375285570075, + "language_loss": 0.70702368, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.72875059, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5054, + "time_per_iteration": 2.4379446506500244 + }, + { + "auxiliary_loss_clip": 0.01132155, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01768088, + "balance_loss_mlp": 1.04817367, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.8280069054430388, + "language_loss": 0.69543922, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71709108, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5055, + "time_per_iteration": 2.5247206687927246 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04682207, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.038005952710024, + "language_loss": 0.67502165, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69670427, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5056, + "time_per_iteration": 2.4767425060272217 + }, + { + "auxiliary_loss_clip": 0.01130078, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02434373, + "balance_loss_mlp": 1.04886115, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.5579435169669187, + "language_loss": 0.82500231, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84669387, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5057, + "time_per_iteration": 2.499105453491211 + }, + { + "auxiliary_loss_clip": 0.01129487, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02171588, + "balance_loss_mlp": 1.04686213, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 3.274565054245196, + "language_loss": 0.89040101, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91205966, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5058, + "time_per_iteration": 2.4966368675231934 + }, + { + "auxiliary_loss_clip": 0.01131903, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.04829955, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.2189779437975274, + "language_loss": 0.71709251, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73883629, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5059, + "time_per_iteration": 2.5429141521453857 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.0233928, + "balance_loss_mlp": 1.04720807, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 10.158939103063299, + "language_loss": 0.73069966, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75238669, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5060, + "time_per_iteration": 2.529862403869629 + }, + { + "auxiliary_loss_clip": 0.01132671, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.01979291, + "balance_loss_mlp": 1.04885316, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.8510962431794071, + "language_loss": 0.76926744, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79093957, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5061, + "time_per_iteration": 2.496739149093628 + }, + { + "auxiliary_loss_clip": 0.01138048, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.02312136, + "balance_loss_mlp": 1.0527482, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.264413063412747, + "language_loss": 0.82064837, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84242392, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5062, + "time_per_iteration": 2.476290702819824 + }, + { + "auxiliary_loss_clip": 0.01128914, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.04721808, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.7072945635391377, + "language_loss": 0.74737656, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76899219, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5063, + "time_per_iteration": 2.5384082794189453 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.0214901, + "balance_loss_mlp": 1.04908288, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.8176932093217915, + "language_loss": 0.84120226, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86290407, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83203125, + "step": 5064, + "time_per_iteration": 2.5108115673065186 + }, + { + "auxiliary_loss_clip": 0.01131651, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.01888871, + "balance_loss_mlp": 1.04751444, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.7759562417820063, + "language_loss": 0.75990027, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78156507, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 5065, + "time_per_iteration": 2.5061376094818115 + }, + { + "auxiliary_loss_clip": 0.01133071, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02470005, + "balance_loss_mlp": 1.04716659, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 2.0133457948817406, + "language_loss": 0.62271762, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64445394, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5066, + "time_per_iteration": 2.6000661849975586 + }, + { + "auxiliary_loss_clip": 0.01140413, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_clip": 1.03385913, + "balance_loss_mlp": 1.05344141, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.7828452375691122, + "language_loss": 0.82887459, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85077155, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5067, + "time_per_iteration": 2.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02829516, + "balance_loss_mlp": 1.04839194, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.0779895110277535, + "language_loss": 0.62978256, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65152222, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5068, + "time_per_iteration": 2.4957847595214844 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01633286, + "balance_loss_mlp": 1.04544926, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.6700683770947133, + "language_loss": 0.75058538, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77217996, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5069, + "time_per_iteration": 2.487473964691162 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01043362, + "balance_loss_clip": 1.02798414, + "balance_loss_mlp": 1.04746199, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.839652658151057, + "language_loss": 0.75732648, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7790432, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5070, + "time_per_iteration": 2.500335216522217 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.04640067, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.7318177446844936, + "language_loss": 0.81738281, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83910567, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 5071, + "time_per_iteration": 2.5726318359375 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.02507651, + "balance_loss_mlp": 1.04737437, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.5942809817556516, + "language_loss": 0.76252651, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78428996, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5072, + "time_per_iteration": 2.5147287845611572 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.03059769, + "balance_loss_mlp": 1.04904687, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.176920469303851, + "language_loss": 0.71318722, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73496878, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83203125, + "step": 5073, + "time_per_iteration": 2.4736156463623047 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_clip": 1.02974713, + "balance_loss_mlp": 1.04842663, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.146618897096623, + "language_loss": 0.7663309, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78814638, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5074, + "time_per_iteration": 2.4547433853149414 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.02015638, + "balance_loss_mlp": 1.04879379, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8636036931869358, + "language_loss": 0.73939347, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76106244, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5075, + "time_per_iteration": 2.4922661781311035 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.04769778, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.14961805392919, + "language_loss": 0.75488788, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77666509, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5076, + "time_per_iteration": 2.471381187438965 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02407038, + "balance_loss_mlp": 1.05137944, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.505999917432091, + "language_loss": 0.79183954, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81357688, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5077, + "time_per_iteration": 2.5000534057617188 + }, + { + "auxiliary_loss_clip": 0.01127394, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01739252, + "balance_loss_mlp": 1.0478642, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.9961733055656423, + "language_loss": 0.74662113, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76820433, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5078, + "time_per_iteration": 2.4746944904327393 + }, + { + "auxiliary_loss_clip": 0.01130678, + "auxiliary_loss_mlp": 0.01047379, + "balance_loss_clip": 1.03203678, + "balance_loss_mlp": 1.04787958, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.113994612729099, + "language_loss": 0.67216343, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69394398, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5079, + "time_per_iteration": 2.4575493335723877 + }, + { + "auxiliary_loss_clip": 0.01130366, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.02034521, + "balance_loss_mlp": 1.04758203, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.7438542216491464, + "language_loss": 0.80291754, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82457113, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5080, + "time_per_iteration": 2.490842342376709 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.02475858, + "balance_loss_mlp": 1.04612935, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.2926909410882903, + "language_loss": 0.80971938, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83141345, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5081, + "time_per_iteration": 2.5298712253570557 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.01952672, + "balance_loss_mlp": 1.04690182, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.691443128795128, + "language_loss": 0.71810889, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73975313, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5082, + "time_per_iteration": 2.5567750930786133 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.0468955, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9955003311475592, + "language_loss": 0.73615241, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75787055, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 5083, + "time_per_iteration": 2.5083980560302734 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.02184248, + "balance_loss_mlp": 1.04441404, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 3.7957379738132517, + "language_loss": 0.70895267, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73062611, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 5084, + "time_per_iteration": 2.477665424346924 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.01631355, + "balance_loss_mlp": 1.04818797, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.0055460894973933, + "language_loss": 0.78791595, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80958885, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5085, + "time_per_iteration": 2.475783586502075 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.02187788, + "balance_loss_mlp": 1.04529142, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.5510153728860234, + "language_loss": 0.77846372, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80010617, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5086, + "time_per_iteration": 2.514472007751465 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.04930758, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.7256556540888637, + "language_loss": 0.77121228, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79295856, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 5087, + "time_per_iteration": 2.4817616939544678 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.02346563, + "balance_loss_mlp": 1.04716742, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 2.0600622883478517, + "language_loss": 0.72582048, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74754953, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 5088, + "time_per_iteration": 2.538318395614624 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02300477, + "balance_loss_mlp": 1.04673004, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.417480227404851, + "language_loss": 0.7889666, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81070858, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 5089, + "time_per_iteration": 2.4561989307403564 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.02328289, + "balance_loss_mlp": 1.04813027, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 2.044405318996134, + "language_loss": 0.77061844, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79231811, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5090, + "time_per_iteration": 2.5215258598327637 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.02999353, + "balance_loss_mlp": 1.04693675, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.7474050348479595, + "language_loss": 0.76481628, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78662336, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5091, + "time_per_iteration": 2.535468578338623 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.01955616, + "balance_loss_mlp": 1.04671383, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.638842582319787, + "language_loss": 0.71933579, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.7410261, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 5092, + "time_per_iteration": 2.512096405029297 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.04765177, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.9362192703697652, + "language_loss": 0.8216877, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84335721, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5093, + "time_per_iteration": 2.464477300643921 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02239954, + "balance_loss_mlp": 1.04639721, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6957020618246583, + "language_loss": 0.75365555, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77531368, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5094, + "time_per_iteration": 2.5149855613708496 + }, + { + "auxiliary_loss_clip": 0.01128293, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02464378, + "balance_loss_mlp": 1.04530072, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.922814039194465, + "language_loss": 0.76033115, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78201067, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5095, + "time_per_iteration": 5.438723802566528 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02325845, + "balance_loss_mlp": 1.04581833, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7577098515851188, + "language_loss": 0.8050971, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82675582, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.81640625, + "step": 5096, + "time_per_iteration": 2.4706614017486572 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.02052069, + "balance_loss_mlp": 1.04556763, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.0672553061960586, + "language_loss": 0.8209089, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84256178, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5097, + "time_per_iteration": 2.457242250442505 + }, + { + "auxiliary_loss_clip": 0.0112984, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02227616, + "balance_loss_mlp": 1.04537082, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.9081721986815667, + "language_loss": 0.77858478, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80027401, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5098, + "time_per_iteration": 2.4709839820861816 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02057385, + "balance_loss_mlp": 1.0466584, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.9501450681008343, + "language_loss": 0.83948421, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86113107, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5099, + "time_per_iteration": 2.537771224975586 + }, + { + "auxiliary_loss_clip": 0.01130145, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 1.04364753, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 2.2273819247618376, + "language_loss": 0.85744429, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87916839, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5100, + "time_per_iteration": 2.5103259086608887 + }, + { + "auxiliary_loss_clip": 0.01129277, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.02259541, + "balance_loss_mlp": 1.04542243, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8863659276771934, + "language_loss": 0.79225194, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81392968, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5101, + "time_per_iteration": 2.4733920097351074 + }, + { + "auxiliary_loss_clip": 0.01131914, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.02869534, + "balance_loss_mlp": 1.04708326, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7393564952665503, + "language_loss": 0.79405224, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.81583011, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5102, + "time_per_iteration": 2.4608778953552246 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.02135825, + "balance_loss_mlp": 1.04940438, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7201607461659805, + "language_loss": 0.88999605, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.9117263, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.84765625, + "step": 5103, + "time_per_iteration": 2.5295228958129883 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.02549076, + "balance_loss_mlp": 1.04700959, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.6453097169103326, + "language_loss": 0.74079049, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76251674, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5104, + "time_per_iteration": 2.4923107624053955 + }, + { + "auxiliary_loss_clip": 0.01132054, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.02690291, + "balance_loss_mlp": 1.04555643, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8308515164246026, + "language_loss": 0.73333633, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75508481, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 5105, + "time_per_iteration": 2.542391777038574 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_clip": 1.03058875, + "balance_loss_mlp": 1.04582942, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 5.5167708582846515, + "language_loss": 0.8714695, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89325809, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 5106, + "time_per_iteration": 2.5054032802581787 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04750919, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.142568748510771, + "language_loss": 0.71183497, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73367596, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 5107, + "time_per_iteration": 2.4980053901672363 + }, + { + "auxiliary_loss_clip": 0.01125715, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.03372955, + "balance_loss_mlp": 1.04304433, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 1.7923615416213727, + "language_loss": 0.72302651, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74478543, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 5108, + "time_per_iteration": 2.4588091373443604 + }, + { + "auxiliary_loss_clip": 0.01129796, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.02435362, + "balance_loss_mlp": 1.04538584, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5361542639570684, + "language_loss": 0.85768104, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87937832, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5109, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.02176476, + "balance_loss_mlp": 1.04534364, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.6710196569280569, + "language_loss": 0.67220587, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69386709, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5110, + "time_per_iteration": 2.5019631385803223 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.0220511, + "balance_loss_mlp": 1.04472136, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5071731281437177, + "language_loss": 0.76981276, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79144323, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5111, + "time_per_iteration": 2.544111490249634 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.02486551, + "balance_loss_mlp": 1.04580235, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.9077726149637915, + "language_loss": 0.67174292, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69344485, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5112, + "time_per_iteration": 2.5171637535095215 + }, + { + "auxiliary_loss_clip": 0.01136791, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.04846382, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.8925702151041777, + "language_loss": 0.798181, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81996036, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 5113, + "time_per_iteration": 2.55889892578125 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.04549623, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.951625458848465, + "language_loss": 0.77243912, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79416221, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5114, + "time_per_iteration": 2.4328107833862305 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.02079093, + "balance_loss_mlp": 1.04755759, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.8985095809631356, + "language_loss": 0.62356925, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64527011, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5115, + "time_per_iteration": 2.480536699295044 + }, + { + "auxiliary_loss_clip": 0.01132859, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.02033865, + "balance_loss_mlp": 1.04663444, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 3.0190652682973176, + "language_loss": 0.82743216, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84912288, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5116, + "time_per_iteration": 2.5121662616729736 + }, + { + "auxiliary_loss_clip": 0.01131907, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.02322841, + "balance_loss_mlp": 1.04825926, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.8681947014951163, + "language_loss": 0.75772393, + "learning_rate": 3.244367924446952e-06, + "loss": 0.77942991, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5117, + "time_per_iteration": 2.48750376701355 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.05018401, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.225887232792708, + "language_loss": 0.71873093, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74044484, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5118, + "time_per_iteration": 2.4745492935180664 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.02442479, + "balance_loss_mlp": 1.04630661, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.5789952404099556, + "language_loss": 0.74312431, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76483381, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5119, + "time_per_iteration": 2.5185489654541016 + }, + { + "auxiliary_loss_clip": 0.01136122, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.04891181, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.733023320063412, + "language_loss": 0.80267692, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82455289, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 5120, + "time_per_iteration": 2.5592849254608154 + }, + { + "auxiliary_loss_clip": 0.01127219, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.03299093, + "balance_loss_mlp": 1.04384947, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.564134517039273, + "language_loss": 0.80110037, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82285464, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5121, + "time_per_iteration": 2.440516948699951 + }, + { + "auxiliary_loss_clip": 0.0113076, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01981497, + "balance_loss_mlp": 1.0480212, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5001896125792977, + "language_loss": 0.82594395, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84760171, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5122, + "time_per_iteration": 2.510576009750366 + }, + { + "auxiliary_loss_clip": 0.01050329, + "auxiliary_loss_mlp": 0.01017411, + "balance_loss_clip": 1.01562333, + "balance_loss_mlp": 1.01982307, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7473381596642288, + "language_loss": 0.58639288, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60707027, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.3046875, + "step": 5123, + "time_per_iteration": 3.2167654037475586 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.01917315, + "balance_loss_mlp": 1.04640436, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5767520801619384, + "language_loss": 0.83622873, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85793942, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.87890625, + "step": 5124, + "time_per_iteration": 2.474625587463379 + }, + { + "auxiliary_loss_clip": 0.01135515, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.04945302, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8429802725909379, + "language_loss": 0.78703862, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.80879092, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.859375, + "step": 5125, + "time_per_iteration": 2.5806493759155273 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.02050948, + "balance_loss_mlp": 1.04717779, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8928574451074776, + "language_loss": 0.6450479, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66676342, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5126, + "time_per_iteration": 2.467099666595459 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.02367234, + "balance_loss_mlp": 1.04831636, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5538294270453243, + "language_loss": 0.86619091, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88788408, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.81640625, + "step": 5127, + "time_per_iteration": 2.543095111846924 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02439952, + "balance_loss_mlp": 1.04648781, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 2.186420023793508, + "language_loss": 0.68816996, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70987189, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 5128, + "time_per_iteration": 2.525390863418579 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.02571476, + "balance_loss_mlp": 1.04763198, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.801256837086347, + "language_loss": 0.71226776, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7340306, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5129, + "time_per_iteration": 2.5417068004608154 + }, + { + "auxiliary_loss_clip": 0.01045915, + "auxiliary_loss_mlp": 0.01008464, + "balance_loss_clip": 1.00633001, + "balance_loss_mlp": 1.01580441, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.9000157132793972, + "language_loss": 0.59171313, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61225688, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.30078125, + "step": 5130, + "time_per_iteration": 3.024799108505249 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02233696, + "balance_loss_mlp": 1.0485276, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.1422150520884773, + "language_loss": 0.72951442, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75124997, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 5131, + "time_per_iteration": 2.5145480632781982 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.02222049, + "balance_loss_mlp": 1.04737425, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.759562546324366, + "language_loss": 0.71208251, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73375452, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5132, + "time_per_iteration": 2.4997506141662598 + }, + { + "auxiliary_loss_clip": 0.01128489, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.02580929, + "balance_loss_mlp": 1.04823279, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7072095629792627, + "language_loss": 0.8999784, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92166698, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5133, + "time_per_iteration": 2.4972143173217773 + }, + { + "auxiliary_loss_clip": 0.01136466, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.03186607, + "balance_loss_mlp": 1.04911399, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.8506383958840185, + "language_loss": 0.67226613, + "learning_rate": 3.239177844626102e-06, + "loss": 0.6941101, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5134, + "time_per_iteration": 2.5700669288635254 + }, + { + "auxiliary_loss_clip": 0.0113384, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.0317775, + "balance_loss_mlp": 1.04718161, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.423009332179396, + "language_loss": 0.82865155, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85046244, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5135, + "time_per_iteration": 2.4712367057800293 + }, + { + "auxiliary_loss_clip": 0.0104583, + "auxiliary_loss_mlp": 0.01004049, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.015975, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7120747448350507, + "language_loss": 0.55243868, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57293749, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.29882812, + "step": 5136, + "time_per_iteration": 3.1432137489318848 + }, + { + "auxiliary_loss_clip": 0.01132561, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04724097, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.9824711220984585, + "language_loss": 0.76057774, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78230941, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5137, + "time_per_iteration": 5.764686822891235 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.02310133, + "balance_loss_mlp": 1.04696631, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 2.0179579208290264, + "language_loss": 0.79909992, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8207891, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.84765625, + "step": 5138, + "time_per_iteration": 2.45621657371521 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.02666378, + "balance_loss_mlp": 1.04560494, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4272945699581137, + "language_loss": 0.81220984, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83396351, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 5139, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.01137198, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.1565991279061736, + "language_loss": 0.77528149, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79707557, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 5140, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01044008, + "balance_loss_clip": 1.02920234, + "balance_loss_mlp": 1.04757929, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.2023621297160156, + "language_loss": 0.78595555, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80768663, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5141, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01047349, + "balance_loss_clip": 1.03046894, + "balance_loss_mlp": 1.04716825, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.127714885761315, + "language_loss": 0.87142885, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89324611, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 5142, + "time_per_iteration": 2.4362974166870117 + }, + { + "auxiliary_loss_clip": 0.01131531, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.03220749, + "balance_loss_mlp": 1.04556274, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7972015737501748, + "language_loss": 0.7877624, + "learning_rate": 3.23642465389567e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 5143, + "time_per_iteration": 2.459317445755005 + }, + { + "auxiliary_loss_clip": 0.01130331, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.02742219, + "balance_loss_mlp": 1.04593444, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.9461458902951219, + "language_loss": 0.72098875, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74272639, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5144, + "time_per_iteration": 2.4872243404388428 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.02418947, + "balance_loss_mlp": 1.04587483, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.7305751805857612, + "language_loss": 0.74054307, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76227629, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5145, + "time_per_iteration": 2.524683952331543 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.02951622, + "balance_loss_mlp": 1.04737079, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6607552662218326, + "language_loss": 0.76461762, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78640091, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 5146, + "time_per_iteration": 2.4848198890686035 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.02213407, + "balance_loss_mlp": 1.04672074, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.385312171088194, + "language_loss": 0.66755533, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68922937, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5147, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.01135751, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02517319, + "balance_loss_mlp": 1.04931486, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0402709532397205, + "language_loss": 0.75148058, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77323824, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5148, + "time_per_iteration": 2.505180597305298 + }, + { + "auxiliary_loss_clip": 0.01139245, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02815318, + "balance_loss_mlp": 1.04876494, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.1288750992632677, + "language_loss": 0.72576058, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74759942, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 5149, + "time_per_iteration": 2.4605252742767334 + }, + { + "auxiliary_loss_clip": 0.01133233, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.02442312, + "balance_loss_mlp": 1.0457058, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.112154456836484, + "language_loss": 0.84981489, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87155974, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.875, + "step": 5150, + "time_per_iteration": 2.4866578578948975 + }, + { + "auxiliary_loss_clip": 0.01131574, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.02515531, + "balance_loss_mlp": 1.04593086, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.9529089609254688, + "language_loss": 0.79053164, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81226349, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5151, + "time_per_iteration": 2.4936540126800537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.02580595, + "balance_loss_mlp": 1.0471015, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 3.1311630498810774, + "language_loss": 0.67020154, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69196552, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5152, + "time_per_iteration": 2.429640054702759 + }, + { + "auxiliary_loss_clip": 0.01132623, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.0275166, + "balance_loss_mlp": 1.04688787, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 12.57465651148819, + "language_loss": 0.82058132, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84234464, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5153, + "time_per_iteration": 2.578856945037842 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02788973, + "balance_loss_mlp": 1.04822588, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.7956706783057126, + "language_loss": 0.73902357, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76079118, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5154, + "time_per_iteration": 2.5063655376434326 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02287924, + "balance_loss_mlp": 1.04747653, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.516871287947693, + "language_loss": 0.76051688, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78224009, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5155, + "time_per_iteration": 2.4838123321533203 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.04871869, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7492301646526522, + "language_loss": 0.7883296, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81011862, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 5156, + "time_per_iteration": 2.4420597553253174 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02894902, + "balance_loss_mlp": 1.04688191, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 3.007667649484548, + "language_loss": 0.75094402, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5157, + "time_per_iteration": 2.4922094345092773 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.04701662, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.583276716554569, + "language_loss": 0.69391131, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71560085, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5158, + "time_per_iteration": 2.5119874477386475 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.0260725, + "balance_loss_mlp": 1.04802489, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.8674515495135584, + "language_loss": 0.84731698, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86904848, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5159, + "time_per_iteration": 2.5553479194641113 + }, + { + "auxiliary_loss_clip": 0.01130577, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.0215224, + "balance_loss_mlp": 1.04617286, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.6286624468626467, + "language_loss": 0.85222661, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87390554, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5160, + "time_per_iteration": 2.4521608352661133 + }, + { + "auxiliary_loss_clip": 0.01131067, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.02385354, + "balance_loss_mlp": 1.04720986, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.1323719792042404, + "language_loss": 0.76438844, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78609127, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5161, + "time_per_iteration": 2.4705073833465576 + }, + { + "auxiliary_loss_clip": 0.01133183, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.04661226, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9681741891595628, + "language_loss": 0.81644946, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83818257, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5162, + "time_per_iteration": 2.4359090328216553 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.0231998, + "balance_loss_mlp": 1.04580498, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.6668116654420786, + "language_loss": 0.82879269, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85046029, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8359375, + "step": 5163, + "time_per_iteration": 2.536198854446411 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.02670264, + "balance_loss_mlp": 1.04848182, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.61479678935284, + "language_loss": 0.76103258, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78280413, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5164, + "time_per_iteration": 2.4736320972442627 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02492666, + "balance_loss_mlp": 1.04932189, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.73414256762253, + "language_loss": 0.74515426, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76691169, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 5165, + "time_per_iteration": 2.4788122177124023 + }, + { + "auxiliary_loss_clip": 0.01132367, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02627063, + "balance_loss_mlp": 1.0472759, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 2.461614607097325, + "language_loss": 0.75987816, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78162187, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5166, + "time_per_iteration": 2.4461371898651123 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.02939892, + "balance_loss_mlp": 1.04844868, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.4324780660218557, + "language_loss": 0.73424876, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75604147, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 5167, + "time_per_iteration": 2.4301631450653076 + }, + { + "auxiliary_loss_clip": 0.01047334, + "auxiliary_loss_mlp": 0.01006703, + "balance_loss_clip": 1.00467682, + "balance_loss_mlp": 1.01844001, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.725291341239906, + "language_loss": 0.53031516, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55085552, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.2890625, + "step": 5168, + "time_per_iteration": 3.1146020889282227 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.0269258, + "balance_loss_mlp": 1.0465318, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.782356602828545, + "language_loss": 0.78745592, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80922985, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5169, + "time_per_iteration": 2.4755852222442627 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02337217, + "balance_loss_mlp": 1.04640126, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.536235209485244, + "language_loss": 0.6414057, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66312397, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5170, + "time_per_iteration": 2.5690839290618896 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.02635252, + "balance_loss_mlp": 1.04721069, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.41080559035864, + "language_loss": 0.77698815, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79874456, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 5171, + "time_per_iteration": 2.558258295059204 + }, + { + "auxiliary_loss_clip": 0.01132946, + "auxiliary_loss_mlp": 0.01053954, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.04645526, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.9319520361735263, + "language_loss": 0.83802366, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85989261, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5172, + "time_per_iteration": 2.4601597785949707 + }, + { + "auxiliary_loss_clip": 0.01133186, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.02948654, + "balance_loss_mlp": 1.0467186, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.9586589765002733, + "language_loss": 0.84225619, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86404574, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 5173, + "time_per_iteration": 2.501840591430664 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02038455, + "balance_loss_mlp": 1.04595959, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.622637298809784, + "language_loss": 0.83323705, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85486829, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5174, + "time_per_iteration": 2.507127285003662 + }, + { + "auxiliary_loss_clip": 0.01131648, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.03155434, + "balance_loss_mlp": 1.04670012, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.3340025504670003, + "language_loss": 0.84681082, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.86859798, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5175, + "time_per_iteration": 2.4853246212005615 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_clip": 1.03029919, + "balance_loss_mlp": 1.04996502, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.6466695594130172, + "language_loss": 0.83448446, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85629338, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8359375, + "step": 5176, + "time_per_iteration": 2.4759509563446045 + }, + { + "auxiliary_loss_clip": 0.01128858, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.04442942, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.7899579393784935, + "language_loss": 0.80820966, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8299427, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5177, + "time_per_iteration": 2.5106611251831055 + }, + { + "auxiliary_loss_clip": 0.0113295, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.048877, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.9871899212943351, + "language_loss": 0.80703342, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82878101, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5178, + "time_per_iteration": 4.0482330322265625 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03310347, + "balance_loss_mlp": 1.04518402, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.8347450184704097, + "language_loss": 0.81340981, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83520925, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5179, + "time_per_iteration": 3.82991886138916 + }, + { + "auxiliary_loss_clip": 0.01132507, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0338006, + "balance_loss_mlp": 1.04824936, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.599561013411363, + "language_loss": 0.78199375, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.8038168, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5180, + "time_per_iteration": 2.4656291007995605 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.01915836, + "balance_loss_mlp": 1.04672408, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.6380256774064115, + "language_loss": 0.83046079, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85212088, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5181, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.01128181, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.02903986, + "balance_loss_mlp": 1.0464232, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.700886032828765, + "language_loss": 0.74084079, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76255929, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5182, + "time_per_iteration": 2.5913209915161133 + }, + { + "auxiliary_loss_clip": 0.01136348, + "auxiliary_loss_mlp": 0.01050649, + "balance_loss_clip": 1.03479409, + "balance_loss_mlp": 1.04858768, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8010906920491343, + "language_loss": 0.70658493, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72845489, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 5183, + "time_per_iteration": 2.4991438388824463 + }, + { + "auxiliary_loss_clip": 0.01045533, + "auxiliary_loss_mlp": 0.01014757, + "balance_loss_clip": 1.01301634, + "balance_loss_mlp": 1.01690507, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9414003998762589, + "language_loss": 0.59602594, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61662877, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.28515625, + "step": 5184, + "time_per_iteration": 3.0754520893096924 + }, + { + "auxiliary_loss_clip": 0.01130364, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.03264058, + "balance_loss_mlp": 1.04596519, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.811836993883612, + "language_loss": 0.69750082, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71927822, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5185, + "time_per_iteration": 2.435033082962036 + }, + { + "auxiliary_loss_clip": 0.01136749, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.0323875, + "balance_loss_mlp": 1.05073345, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.346024133586612, + "language_loss": 0.63920057, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66104954, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5186, + "time_per_iteration": 2.463900327682495 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.01049347, + "balance_loss_clip": 1.03219295, + "balance_loss_mlp": 1.04886758, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.108066194391345, + "language_loss": 0.86249322, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88435853, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5187, + "time_per_iteration": 2.4854979515075684 + }, + { + "auxiliary_loss_clip": 0.01129847, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.03048384, + "balance_loss_mlp": 1.0451926, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.7445298378798078, + "language_loss": 0.62983185, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.6515975, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5188, + "time_per_iteration": 2.6161019802093506 + }, + { + "auxiliary_loss_clip": 0.01135744, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_clip": 1.02961564, + "balance_loss_mlp": 1.05116081, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1633857437120256, + "language_loss": 0.8347863, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85659939, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5189, + "time_per_iteration": 2.4360432624816895 + }, + { + "auxiliary_loss_clip": 0.01129905, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.0213753, + "balance_loss_mlp": 1.04657507, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.6712014044776404, + "language_loss": 0.7916308, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81329739, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83203125, + "step": 5190, + "time_per_iteration": 2.472668170928955 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01986194, + "balance_loss_mlp": 1.04946673, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.4545499288259176, + "language_loss": 0.75318813, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77487987, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5191, + "time_per_iteration": 2.486673355102539 + }, + { + "auxiliary_loss_clip": 0.01049091, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.00025892, + "balance_loss_mlp": 1.02067924, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8451593954944295, + "language_loss": 0.63957787, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66009092, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.28515625, + "step": 5192, + "time_per_iteration": 3.1464638710021973 + }, + { + "auxiliary_loss_clip": 0.01134311, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04795599, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.6164756923867671, + "language_loss": 0.80154347, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82329667, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.86328125, + "step": 5193, + "time_per_iteration": 2.5156989097595215 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.02610445, + "balance_loss_mlp": 1.045856, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8140889441731107, + "language_loss": 0.72050476, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74224722, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.85546875, + "step": 5194, + "time_per_iteration": 2.519972801208496 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02471924, + "balance_loss_mlp": 1.04870749, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.3544515008303952, + "language_loss": 0.76475823, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78648859, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5195, + "time_per_iteration": 2.512247323989868 + }, + { + "auxiliary_loss_clip": 0.01131656, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.02718091, + "balance_loss_mlp": 1.0449183, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.3526234536893298, + "language_loss": 0.7817502, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80349314, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5196, + "time_per_iteration": 2.528002977371216 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99860841, + "balance_loss_mlp": 1.01643729, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7752479618797538, + "language_loss": 0.54834789, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56879622, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.27929688, + "step": 5197, + "time_per_iteration": 3.0728254318237305 + }, + { + "auxiliary_loss_clip": 0.01130689, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.02328372, + "balance_loss_mlp": 1.0477525, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6543672060788046, + "language_loss": 0.66300559, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68469381, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5198, + "time_per_iteration": 2.4312028884887695 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.02156413, + "balance_loss_mlp": 1.0472604, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.083859755504136, + "language_loss": 0.69763082, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71935886, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5199, + "time_per_iteration": 2.454464912414551 + }, + { + "auxiliary_loss_clip": 0.01131797, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.0336132, + "balance_loss_mlp": 1.04692471, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8982997112015956, + "language_loss": 0.79004937, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81186306, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 5200, + "time_per_iteration": 2.4382827281951904 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.02187347, + "balance_loss_mlp": 1.04621911, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.042457973745699, + "language_loss": 0.83946276, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86110914, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5201, + "time_per_iteration": 2.475511074066162 + }, + { + "auxiliary_loss_clip": 0.01134303, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.02990484, + "balance_loss_mlp": 1.04985881, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.37604325800411, + "language_loss": 0.69560832, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71741533, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84375, + "step": 5202, + "time_per_iteration": 2.4265501499176025 + }, + { + "auxiliary_loss_clip": 0.01133329, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.02737963, + "balance_loss_mlp": 1.04759419, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.800546738819683, + "language_loss": 0.84001613, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86176282, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.85546875, + "step": 5203, + "time_per_iteration": 2.480233907699585 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.03176749, + "balance_loss_mlp": 1.04697657, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.9129021624211417, + "language_loss": 0.60623944, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62803102, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5204, + "time_per_iteration": 2.50688099861145 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.02672338, + "balance_loss_mlp": 1.04707503, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.6006708998064776, + "language_loss": 0.65964866, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68135834, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5205, + "time_per_iteration": 2.4824163913726807 + }, + { + "auxiliary_loss_clip": 0.01132532, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.0476222, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.9498647702732133, + "language_loss": 0.76618874, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78794622, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84765625, + "step": 5206, + "time_per_iteration": 2.4947307109832764 + }, + { + "auxiliary_loss_clip": 0.0112786, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.02416039, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 3.088705810465425, + "language_loss": 0.83287984, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85455215, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5207, + "time_per_iteration": 2.4767825603485107 + }, + { + "auxiliary_loss_clip": 0.01128039, + "auxiliary_loss_mlp": 0.01041894, + "balance_loss_clip": 1.02784562, + "balance_loss_mlp": 1.04694057, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5219202808663073, + "language_loss": 0.71293664, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73463601, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5208, + "time_per_iteration": 2.4853296279907227 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.02534437, + "balance_loss_mlp": 1.04957032, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.8332946649412374, + "language_loss": 0.74547577, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76721835, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5209, + "time_per_iteration": 2.5162742137908936 + }, + { + "auxiliary_loss_clip": 0.0113008, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02695489, + "balance_loss_mlp": 1.04557538, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.818845882779476, + "language_loss": 0.77656835, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79827774, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84375, + "step": 5210, + "time_per_iteration": 2.4701180458068848 + }, + { + "auxiliary_loss_clip": 0.01125909, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02443743, + "balance_loss_mlp": 1.04593706, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8627745841798442, + "language_loss": 0.79177994, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81343371, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 5211, + "time_per_iteration": 2.482102870941162 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02448201, + "balance_loss_mlp": 1.04849112, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.64859412039223, + "language_loss": 0.79837513, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82005984, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5212, + "time_per_iteration": 2.460986852645874 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.03395939, + "balance_loss_mlp": 1.04740417, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.096287390218497, + "language_loss": 0.71467483, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73650539, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5213, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.01135204, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02660799, + "balance_loss_mlp": 1.05014026, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 5.183832853627301, + "language_loss": 0.77595121, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79771841, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5214, + "time_per_iteration": 2.453228712081909 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.04599309, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6576138068605464, + "language_loss": 0.82562625, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84724051, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5215, + "time_per_iteration": 2.544684886932373 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.02242613, + "balance_loss_mlp": 1.04732776, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.510877303679677, + "language_loss": 0.79557931, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81727695, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5216, + "time_per_iteration": 2.4559943675994873 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.03042984, + "balance_loss_mlp": 1.04632115, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 2.0079960226100293, + "language_loss": 0.68489361, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70668793, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.859375, + "step": 5217, + "time_per_iteration": 2.524624824523926 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02095652, + "balance_loss_mlp": 1.04952598, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.8597778329644077, + "language_loss": 0.80357039, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82527065, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5218, + "time_per_iteration": 2.437819480895996 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.025931, + "balance_loss_mlp": 1.04692423, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.311414379590861, + "language_loss": 0.68608415, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70780772, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5219, + "time_per_iteration": 2.4811697006225586 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.02585125, + "balance_loss_mlp": 1.05002093, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 1.886141735907444, + "language_loss": 0.7973401, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81906897, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.828125, + "step": 5220, + "time_per_iteration": 5.5014426708221436 + }, + { + "auxiliary_loss_clip": 0.01129795, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02036917, + "balance_loss_mlp": 1.0470016, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.696615671785811, + "language_loss": 0.72865409, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75029969, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5221, + "time_per_iteration": 2.4286248683929443 + }, + { + "auxiliary_loss_clip": 0.01129062, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.02370405, + "balance_loss_mlp": 1.0478735, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.5798649053475948, + "language_loss": 0.8195132, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84118003, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8125, + "step": 5222, + "time_per_iteration": 2.453622817993164 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.02744806, + "balance_loss_mlp": 1.04833627, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.948806511089887, + "language_loss": 0.70150459, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.723288, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5223, + "time_per_iteration": 2.442513942718506 + }, + { + "auxiliary_loss_clip": 0.01130042, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.01980042, + "balance_loss_mlp": 1.04643512, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6111281957709347, + "language_loss": 0.80361176, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82525527, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5224, + "time_per_iteration": 2.5533599853515625 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02615976, + "balance_loss_mlp": 1.05134106, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 1.9819108050216143, + "language_loss": 0.58416283, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60598099, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 5225, + "time_per_iteration": 2.493633508682251 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.01826406, + "balance_loss_mlp": 1.04575014, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.9016989590060558, + "language_loss": 0.81870753, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84028322, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5226, + "time_per_iteration": 2.455474376678467 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.0245285, + "balance_loss_mlp": 1.04804921, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 3.2929472014065864, + "language_loss": 0.73947561, + "learning_rate": 3.210546210126141e-06, + "loss": 0.7611953, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5227, + "time_per_iteration": 2.4582889080047607 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02783334, + "balance_loss_mlp": 1.04827404, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.9061545786481, + "language_loss": 0.67636049, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69811898, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5228, + "time_per_iteration": 2.572122573852539 + }, + { + "auxiliary_loss_clip": 0.01130676, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.02567399, + "balance_loss_mlp": 1.04645872, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.857425256773369, + "language_loss": 0.79938543, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82109284, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5229, + "time_per_iteration": 2.4785192012786865 + }, + { + "auxiliary_loss_clip": 0.01129346, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.02474797, + "balance_loss_mlp": 1.04716849, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8246409730399047, + "language_loss": 0.70264775, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72434002, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5230, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.02805161, + "balance_loss_mlp": 1.04486191, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.680902640440715, + "language_loss": 0.79707456, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81880474, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5231, + "time_per_iteration": 2.535352945327759 + }, + { + "auxiliary_loss_clip": 0.01129002, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.02675736, + "balance_loss_mlp": 1.04756021, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0146998384070254, + "language_loss": 0.8507638, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87248111, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5232, + "time_per_iteration": 2.5626280307769775 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01049783, + "balance_loss_clip": 1.03439283, + "balance_loss_mlp": 1.0461762, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5681064196444345, + "language_loss": 0.7984041, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82017469, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5233, + "time_per_iteration": 2.4478254318237305 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.0233047, + "balance_loss_mlp": 1.04861724, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.628646597563271, + "language_loss": 0.70788991, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72960073, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5234, + "time_per_iteration": 2.775871992111206 + }, + { + "auxiliary_loss_clip": 0.01131513, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.0205102, + "balance_loss_mlp": 1.04739237, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8519873535555593, + "language_loss": 0.72068667, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74236101, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5235, + "time_per_iteration": 2.515869617462158 + }, + { + "auxiliary_loss_clip": 0.01126993, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.02204823, + "balance_loss_mlp": 1.04428434, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.06424580772138, + "language_loss": 0.7832365, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80487001, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5236, + "time_per_iteration": 2.5591800212860107 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02609372, + "balance_loss_mlp": 1.04730821, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.44778330648976, + "language_loss": 0.75856584, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78033078, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 5237, + "time_per_iteration": 2.5414791107177734 + }, + { + "auxiliary_loss_clip": 0.01125329, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.02424169, + "balance_loss_mlp": 1.04500508, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.1889759499940813, + "language_loss": 0.79916662, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82079864, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8046875, + "step": 5238, + "time_per_iteration": 2.484102725982666 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.0100711, + "balance_loss_clip": 1.0053103, + "balance_loss_mlp": 1.01739836, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8333107882681854, + "language_loss": 0.67920464, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69972724, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.27734375, + "step": 5239, + "time_per_iteration": 3.0362496376037598 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.02197254, + "balance_loss_mlp": 1.04535258, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.0536997136778847, + "language_loss": 0.82329869, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84499264, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5240, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01039014, + "balance_loss_clip": 1.02451253, + "balance_loss_mlp": 1.04874361, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.2630790499207962, + "language_loss": 0.80981195, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83150375, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5241, + "time_per_iteration": 2.5001909732818604 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04834199, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.5804052674973608, + "language_loss": 0.74575627, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76740676, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5242, + "time_per_iteration": 2.530768871307373 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0189085, + "balance_loss_mlp": 1.04601228, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9335835713568477, + "language_loss": 0.74171245, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.7633546, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 5243, + "time_per_iteration": 2.495138168334961 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.02221215, + "balance_loss_mlp": 1.04677868, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 3.400707627247709, + "language_loss": 0.64608908, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66775823, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83203125, + "step": 5244, + "time_per_iteration": 2.4930343627929688 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.02088022, + "balance_loss_mlp": 1.04716229, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.1590647535644965, + "language_loss": 0.91464043, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93632007, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5245, + "time_per_iteration": 2.4007837772369385 + }, + { + "auxiliary_loss_clip": 0.0113079, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.02636433, + "balance_loss_mlp": 1.04643655, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 9.888646015204756, + "language_loss": 0.75272042, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77444315, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5246, + "time_per_iteration": 2.4886202812194824 + }, + { + "auxiliary_loss_clip": 0.01131208, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.02524352, + "balance_loss_mlp": 1.04602718, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.4670109155165818, + "language_loss": 0.6160199, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63773286, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5247, + "time_per_iteration": 2.567185640335083 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.02283072, + "balance_loss_mlp": 1.04756081, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.2084660310503526, + "language_loss": 0.82410538, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84581077, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5248, + "time_per_iteration": 2.52426815032959 + }, + { + "auxiliary_loss_clip": 0.01129578, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.04662156, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8083364563285407, + "language_loss": 0.85017586, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87197179, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5249, + "time_per_iteration": 2.4549005031585693 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.02245772, + "balance_loss_mlp": 1.04802227, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.8090626711780673, + "language_loss": 0.85569501, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87739837, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5250, + "time_per_iteration": 2.502629041671753 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.04532385, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 4.215523946509053, + "language_loss": 0.68559456, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70730722, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5251, + "time_per_iteration": 2.4467368125915527 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02487266, + "balance_loss_mlp": 1.04848695, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7890606859490685, + "language_loss": 0.78783, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80953479, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5252, + "time_per_iteration": 2.5056369304656982 + }, + { + "auxiliary_loss_clip": 0.01129131, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.02635264, + "balance_loss_mlp": 1.04820085, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.7467438086499925, + "language_loss": 0.74374568, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76544189, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5253, + "time_per_iteration": 2.485865592956543 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.04530692, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.6622002067810395, + "language_loss": 0.73305148, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75473285, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5254, + "time_per_iteration": 2.5044641494750977 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.02293146, + "balance_loss_mlp": 1.04714012, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9319514966089122, + "language_loss": 0.78156364, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80326211, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5255, + "time_per_iteration": 2.4380881786346436 + }, + { + "auxiliary_loss_clip": 0.01130732, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02565229, + "balance_loss_mlp": 1.04770398, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.551434599641695, + "language_loss": 0.78019011, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80192077, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.828125, + "step": 5256, + "time_per_iteration": 2.517211437225342 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.02159786, + "balance_loss_mlp": 1.04710865, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.6136648036258991, + "language_loss": 0.71117795, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73278391, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 5257, + "time_per_iteration": 2.4690449237823486 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.04662931, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.9672329013590102, + "language_loss": 0.77098101, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79265225, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5258, + "time_per_iteration": 2.4586384296417236 + }, + { + "auxiliary_loss_clip": 0.01130533, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.02291536, + "balance_loss_mlp": 1.04706669, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 4.102208009404704, + "language_loss": 0.72829109, + "learning_rate": 3.200602180731467e-06, + "loss": 0.7499727, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5259, + "time_per_iteration": 2.463867425918579 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.03382003, + "balance_loss_mlp": 1.04840684, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.940451679167918, + "language_loss": 0.66212165, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68394214, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.85546875, + "step": 5260, + "time_per_iteration": 2.498173475265503 + }, + { + "auxiliary_loss_clip": 0.01125905, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.01806808, + "balance_loss_mlp": 1.04255199, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.9564366458132632, + "language_loss": 0.72557104, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74715853, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5261, + "time_per_iteration": 4.0577170848846436 + }, + { + "auxiliary_loss_clip": 0.01040968, + "auxiliary_loss_mlp": 0.01005761, + "balance_loss_clip": 1.00365114, + "balance_loss_mlp": 1.01333809, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7495327099187281, + "language_loss": 0.50639355, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52686083, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5262, + "time_per_iteration": 5.9139063358306885 + }, + { + "auxiliary_loss_clip": 0.01133191, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.02408338, + "balance_loss_mlp": 1.04845881, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4936033884005069, + "language_loss": 0.85241222, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87412858, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.84765625, + "step": 5263, + "time_per_iteration": 2.4966084957122803 + }, + { + "auxiliary_loss_clip": 0.01127359, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.03201818, + "balance_loss_mlp": 1.04657304, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4671140059184749, + "language_loss": 0.81675243, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83848464, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5264, + "time_per_iteration": 2.5126495361328125 + }, + { + "auxiliary_loss_clip": 0.01133844, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.02107441, + "balance_loss_mlp": 1.0484283, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.6829803459821215, + "language_loss": 0.79974926, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82145512, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5265, + "time_per_iteration": 2.444263219833374 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.02361572, + "balance_loss_mlp": 1.04815876, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5672890574859826, + "language_loss": 0.74875605, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77048463, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5266, + "time_per_iteration": 2.5323407649993896 + }, + { + "auxiliary_loss_clip": 0.01131974, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02851653, + "balance_loss_mlp": 1.04640543, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.021043754719528, + "language_loss": 0.78872609, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81047654, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 5267, + "time_per_iteration": 2.4591164588928223 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01004279, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.01493907, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7322532755123746, + "language_loss": 0.57800645, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59847558, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5268, + "time_per_iteration": 3.061121702194214 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.02291262, + "balance_loss_mlp": 1.04683709, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 1.8728828385616285, + "language_loss": 0.72881675, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75051844, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5269, + "time_per_iteration": 2.4871747493743896 + }, + { + "auxiliary_loss_clip": 0.0113037, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04689598, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.0592855460289394, + "language_loss": 0.79914796, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82084477, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5270, + "time_per_iteration": 2.502607822418213 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.02582264, + "balance_loss_mlp": 1.04792333, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 1.9728362515560998, + "language_loss": 0.79207718, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.8138411, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5271, + "time_per_iteration": 2.4412505626678467 + }, + { + "auxiliary_loss_clip": 0.0113132, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02440262, + "balance_loss_mlp": 1.04685235, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.769221166791082, + "language_loss": 0.73264146, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75436121, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5272, + "time_per_iteration": 2.4992945194244385 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.02262676, + "balance_loss_mlp": 1.04613161, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 1.9537759660060814, + "language_loss": 0.69159341, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71332633, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 5273, + "time_per_iteration": 2.6510114669799805 + }, + { + "auxiliary_loss_clip": 0.01128979, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02110016, + "balance_loss_mlp": 1.04609132, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.4826309074588198, + "language_loss": 0.67691469, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69856858, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5274, + "time_per_iteration": 2.5467329025268555 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02092862, + "balance_loss_mlp": 1.04432762, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5251182195487059, + "language_loss": 0.80846918, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83006656, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5275, + "time_per_iteration": 2.511544704437256 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.0286448, + "balance_loss_mlp": 1.04539275, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.952892513614063, + "language_loss": 0.72608984, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.7478506, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5276, + "time_per_iteration": 2.5273983478546143 + }, + { + "auxiliary_loss_clip": 0.01124489, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 1.04455817, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.3590988237701342, + "language_loss": 0.77843654, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80003512, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5277, + "time_per_iteration": 2.51247501373291 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.04444003, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.8256288285105424, + "language_loss": 0.78756094, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80919981, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5278, + "time_per_iteration": 2.5376405715942383 + }, + { + "auxiliary_loss_clip": 0.01037546, + "auxiliary_loss_mlp": 0.01002993, + "balance_loss_clip": 1.0011332, + "balance_loss_mlp": 1.00972891, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8755672893463982, + "language_loss": 0.62821174, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64861709, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.27734375, + "step": 5279, + "time_per_iteration": 2.823489189147949 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.0242753, + "balance_loss_mlp": 1.04568505, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.6672726712999033, + "language_loss": 0.8099947, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83173573, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 5280, + "time_per_iteration": 2.490154981613159 + }, + { + "auxiliary_loss_clip": 0.01130309, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.02963543, + "balance_loss_mlp": 1.04713202, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.444928497123541, + "language_loss": 0.77968711, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5281, + "time_per_iteration": 2.590106248855591 + }, + { + "auxiliary_loss_clip": 0.01129621, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.02001119, + "balance_loss_mlp": 1.0464325, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6441690082428626, + "language_loss": 0.78319824, + "learning_rate": 3.193426091467179e-06, + "loss": 0.8048507, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 5282, + "time_per_iteration": 2.4879021644592285 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.02429008, + "balance_loss_mlp": 1.04685783, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.066002014025373, + "language_loss": 0.66989815, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69162953, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 5283, + "time_per_iteration": 2.4914467334747314 + }, + { + "auxiliary_loss_clip": 0.01037416, + "auxiliary_loss_mlp": 0.01002537, + "balance_loss_clip": 1.00047421, + "balance_loss_mlp": 1.00956297, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7287723120729913, + "language_loss": 0.52796859, + "learning_rate": 3.192800950261958e-06, + "loss": 0.5483681, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.27734375, + "step": 5284, + "time_per_iteration": 3.0077779293060303 + }, + { + "auxiliary_loss_clip": 0.01137201, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02314341, + "balance_loss_mlp": 1.04976773, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.732541053937659, + "language_loss": 0.7061168, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72786701, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 5285, + "time_per_iteration": 2.4796152114868164 + }, + { + "auxiliary_loss_clip": 0.0103775, + "auxiliary_loss_mlp": 0.01003604, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00987303, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8184329386673247, + "language_loss": 0.60497808, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.6253916, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27929688, + "step": 5286, + "time_per_iteration": 3.060959815979004 + }, + { + "auxiliary_loss_clip": 0.01131379, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.02701449, + "balance_loss_mlp": 1.04520202, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.8142745455991967, + "language_loss": 0.72112805, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74286544, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 5287, + "time_per_iteration": 2.480926752090454 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03083003, + "balance_loss_mlp": 1.04454064, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.8467549942081902, + "language_loss": 0.75335222, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77514231, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 5288, + "time_per_iteration": 2.4506337642669678 + }, + { + "auxiliary_loss_clip": 0.01123463, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.02344155, + "balance_loss_mlp": 1.04175711, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 2.214262263159222, + "language_loss": 0.87642509, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89802694, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8203125, + "step": 5289, + "time_per_iteration": 2.4887404441833496 + }, + { + "auxiliary_loss_clip": 0.01127988, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.04635859, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.8563377401537928, + "language_loss": 0.67677546, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69844842, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5290, + "time_per_iteration": 2.4699981212615967 + }, + { + "auxiliary_loss_clip": 0.01130209, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.02983999, + "balance_loss_mlp": 1.04348135, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.9889060202243536, + "language_loss": 0.79926544, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82102132, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 5291, + "time_per_iteration": 2.5350663661956787 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.02160883, + "balance_loss_mlp": 1.04684091, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.2851564798864694, + "language_loss": 0.79887748, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82058293, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5292, + "time_per_iteration": 2.4561853408813477 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.02173245, + "balance_loss_mlp": 1.04506028, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6321803022225574, + "language_loss": 0.74406421, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.76565492, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5293, + "time_per_iteration": 2.562264919281006 + }, + { + "auxiliary_loss_clip": 0.01127349, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02864981, + "balance_loss_mlp": 1.04655647, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.669926034583184, + "language_loss": 0.74003655, + "learning_rate": 3.189672532265379e-06, + "loss": 0.7617321, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.80859375, + "step": 5294, + "time_per_iteration": 2.511491537094116 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04616928, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.856323864882145, + "language_loss": 0.76211727, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78377414, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5295, + "time_per_iteration": 2.482302665710449 + }, + { + "auxiliary_loss_clip": 0.01134404, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.02765322, + "balance_loss_mlp": 1.04831004, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6316405915506296, + "language_loss": 0.69476807, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71653676, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5296, + "time_per_iteration": 2.4972259998321533 + }, + { + "auxiliary_loss_clip": 0.01129364, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04513788, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 2.3772504575271367, + "language_loss": 0.77559733, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79728031, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5297, + "time_per_iteration": 2.5681862831115723 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01533866, + "balance_loss_mlp": 1.04480934, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.9869765921291695, + "language_loss": 0.79451257, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81608367, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5298, + "time_per_iteration": 2.4990038871765137 + }, + { + "auxiliary_loss_clip": 0.01132136, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.04609096, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 2.132815699592654, + "language_loss": 0.7431671, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.7648803, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 5299, + "time_per_iteration": 2.4902234077453613 + }, + { + "auxiliary_loss_clip": 0.01130922, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02775824, + "balance_loss_mlp": 1.04395795, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 5.1444082132017925, + "language_loss": 0.7834971, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80523366, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5300, + "time_per_iteration": 2.476113796234131 + }, + { + "auxiliary_loss_clip": 0.01127423, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.02245879, + "balance_loss_mlp": 1.04332328, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 4.220537638442504, + "language_loss": 0.8416568, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86331153, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5301, + "time_per_iteration": 2.4672341346740723 + }, + { + "auxiliary_loss_clip": 0.01132761, + "auxiliary_loss_mlp": 0.01045513, + "balance_loss_clip": 1.0299325, + "balance_loss_mlp": 1.05064154, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4555807672502277, + "language_loss": 0.77689236, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79867512, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5302, + "time_per_iteration": 2.4480254650115967 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02197289, + "balance_loss_mlp": 1.0458461, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.6601771821563076, + "language_loss": 0.79729378, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81892729, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8046875, + "step": 5303, + "time_per_iteration": 5.451193809509277 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.04810047, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.065727829234295, + "language_loss": 0.72734123, + "learning_rate": 3.186539603020047e-06, + "loss": 0.74916923, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 5304, + "time_per_iteration": 3.835230588912964 + }, + { + "auxiliary_loss_clip": 0.01126733, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.02546668, + "balance_loss_mlp": 1.04595399, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.8866410100018438, + "language_loss": 0.71773344, + "learning_rate": 3.186226062434068e-06, + "loss": 0.73939252, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80859375, + "step": 5305, + "time_per_iteration": 2.5330212116241455 + }, + { + "auxiliary_loss_clip": 0.01129402, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.02209806, + "balance_loss_mlp": 1.0472002, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.6861128411196662, + "language_loss": 0.64708328, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66873765, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5306, + "time_per_iteration": 2.4788570404052734 + }, + { + "auxiliary_loss_clip": 0.01135221, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.02714205, + "balance_loss_mlp": 1.05026746, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.161280639112344, + "language_loss": 0.79625881, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81803662, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5307, + "time_per_iteration": 2.5614371299743652 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02412832, + "balance_loss_mlp": 1.04311657, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.727529620646192, + "language_loss": 0.77898794, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.80062222, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 5308, + "time_per_iteration": 2.4443254470825195 + }, + { + "auxiliary_loss_clip": 0.01142678, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_clip": 1.03182518, + "balance_loss_mlp": 1.05046844, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 5.1649453810283426, + "language_loss": 0.74302876, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76494527, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 5309, + "time_per_iteration": 2.494800090789795 + }, + { + "auxiliary_loss_clip": 0.0112957, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.01998436, + "balance_loss_mlp": 1.04589248, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.754429841361115, + "language_loss": 0.82606339, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84770352, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5310, + "time_per_iteration": 2.4630603790283203 + }, + { + "auxiliary_loss_clip": 0.01129012, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.02762246, + "balance_loss_mlp": 1.04536486, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4405475768569584, + "language_loss": 0.78319013, + "learning_rate": 3.184343874716412e-06, + "loss": 0.8048929, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8359375, + "step": 5311, + "time_per_iteration": 2.5892724990844727 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01040389, + "balance_loss_clip": 1.02419996, + "balance_loss_mlp": 1.04695129, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.475613964939968, + "language_loss": 0.84316272, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86487615, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 5312, + "time_per_iteration": 2.4625802040100098 + }, + { + "auxiliary_loss_clip": 0.01137215, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.02808809, + "balance_loss_mlp": 1.0480628, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.3910939905221302, + "language_loss": 0.78584075, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 5313, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.02133918, + "balance_loss_mlp": 1.04814112, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.1643333364087582, + "language_loss": 0.85868084, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88036746, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5314, + "time_per_iteration": 2.4721946716308594 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02502346, + "balance_loss_mlp": 1.04725409, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7188296838329389, + "language_loss": 0.79836512, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82008839, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5315, + "time_per_iteration": 2.512554407119751 + }, + { + "auxiliary_loss_clip": 0.01135172, + "auxiliary_loss_mlp": 0.01049715, + "balance_loss_clip": 1.03331804, + "balance_loss_mlp": 1.0493269, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 6.566744634036759, + "language_loss": 0.67652613, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69837505, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5316, + "time_per_iteration": 2.4364819526672363 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.02105474, + "balance_loss_mlp": 1.04888916, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4751284993654519, + "language_loss": 0.69336772, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71505511, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84765625, + "step": 5317, + "time_per_iteration": 2.6055562496185303 + }, + { + "auxiliary_loss_clip": 0.01043016, + "auxiliary_loss_mlp": 0.0100349, + "balance_loss_clip": 1.00143993, + "balance_loss_mlp": 1.01474404, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7259742625655435, + "language_loss": 0.53048342, + "learning_rate": 3.182145945801628e-06, + "loss": 0.5509485, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.28320312, + "step": 5318, + "time_per_iteration": 3.200087308883667 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.02311563, + "balance_loss_mlp": 1.04900801, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.839211184718713, + "language_loss": 0.83865941, + "learning_rate": 3.181831776553012e-06, + "loss": 0.8603549, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5319, + "time_per_iteration": 2.471498966217041 + }, + { + "auxiliary_loss_clip": 0.01131434, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.0279578, + "balance_loss_mlp": 1.04728413, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.3959306603032393, + "language_loss": 0.63542199, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65716517, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5320, + "time_per_iteration": 2.5526087284088135 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.02528036, + "balance_loss_mlp": 1.04970324, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.9943779690432752, + "language_loss": 0.70519614, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 5321, + "time_per_iteration": 2.5262763500213623 + }, + { + "auxiliary_loss_clip": 0.01141108, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.04030156, + "balance_loss_mlp": 1.05110431, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.2234904552907238, + "language_loss": 0.86543447, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88741434, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 5322, + "time_per_iteration": 2.4432008266448975 + }, + { + "auxiliary_loss_clip": 0.01132235, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02119207, + "balance_loss_mlp": 1.04827893, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7854648356549414, + "language_loss": 0.82820231, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.84988427, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5323, + "time_per_iteration": 2.554539680480957 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02157664, + "balance_loss_mlp": 1.04700553, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8735349940723531, + "language_loss": 0.77858555, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.8002646, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5324, + "time_per_iteration": 2.452894687652588 + }, + { + "auxiliary_loss_clip": 0.0113163, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.04770339, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.8150910160625646, + "language_loss": 0.80162597, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82328951, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5325, + "time_per_iteration": 2.5261802673339844 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02690446, + "balance_loss_mlp": 1.04872847, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.8959189814779316, + "language_loss": 0.75171864, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77346826, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5326, + "time_per_iteration": 2.5300135612487793 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02285552, + "balance_loss_mlp": 1.04836321, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4421847054475023, + "language_loss": 0.80826092, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82993662, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5327, + "time_per_iteration": 2.5393614768981934 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04888535, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5386676468863185, + "language_loss": 0.77926928, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80099857, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5328, + "time_per_iteration": 2.471806287765503 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 1.04632294, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 2.9951100938200765, + "language_loss": 0.73971635, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76145625, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 5329, + "time_per_iteration": 2.52327561378479 + }, + { + "auxiliary_loss_clip": 0.01127399, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02012336, + "balance_loss_mlp": 1.04675198, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 2.060461898980319, + "language_loss": 0.71036464, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73197591, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8046875, + "step": 5330, + "time_per_iteration": 2.4405477046966553 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01049965, + "balance_loss_clip": 1.03343058, + "balance_loss_mlp": 1.0474323, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.7909305839918348, + "language_loss": 0.80022657, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82208663, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 5331, + "time_per_iteration": 2.5934245586395264 + }, + { + "auxiliary_loss_clip": 0.01037799, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.01001608, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8366333048595008, + "language_loss": 0.57806182, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59848487, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.27734375, + "step": 5332, + "time_per_iteration": 2.9984278678894043 + }, + { + "auxiliary_loss_clip": 0.01134361, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02450585, + "balance_loss_mlp": 1.04747975, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7943987990453594, + "language_loss": 0.73309821, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75483477, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.87109375, + "step": 5333, + "time_per_iteration": 2.554401159286499 + }, + { + "auxiliary_loss_clip": 0.01133668, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.02686942, + "balance_loss_mlp": 1.04836345, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.5896288664703238, + "language_loss": 0.71050882, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.73227012, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5334, + "time_per_iteration": 2.468472957611084 + }, + { + "auxiliary_loss_clip": 0.01132404, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.02658951, + "balance_loss_mlp": 1.04644001, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9528247502362917, + "language_loss": 0.77601135, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.797755, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5335, + "time_per_iteration": 2.524211883544922 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02519548, + "balance_loss_mlp": 1.04687452, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.5197552931214375, + "language_loss": 0.68353152, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70525241, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 5336, + "time_per_iteration": 2.5674326419830322 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01045646, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.04688144, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7063748564330914, + "language_loss": 0.7895453, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81131858, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5337, + "time_per_iteration": 2.5010595321655273 + }, + { + "auxiliary_loss_clip": 0.01124535, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.02194548, + "balance_loss_mlp": 1.04505002, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.7193225847880926, + "language_loss": 0.73997593, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76157737, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5338, + "time_per_iteration": 2.4961647987365723 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.02633142, + "balance_loss_mlp": 1.04477298, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.8336519924948942, + "language_loss": 0.63149244, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65323097, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5339, + "time_per_iteration": 2.5218987464904785 + }, + { + "auxiliary_loss_clip": 0.01130495, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.02409506, + "balance_loss_mlp": 1.04546928, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 1.814332726776551, + "language_loss": 0.81917858, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84087962, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5340, + "time_per_iteration": 2.427483558654785 + }, + { + "auxiliary_loss_clip": 0.0113181, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.02847123, + "balance_loss_mlp": 1.04696941, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.7172536004624983, + "language_loss": 0.7620244, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78377569, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 5341, + "time_per_iteration": 2.4785468578338623 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02154231, + "balance_loss_mlp": 1.04897809, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.9213308470980235, + "language_loss": 0.78627086, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.80794168, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5342, + "time_per_iteration": 2.4524106979370117 + }, + { + "auxiliary_loss_clip": 0.01133398, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02290499, + "balance_loss_mlp": 1.04772902, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.762302479650767, + "language_loss": 0.74934483, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77106899, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5343, + "time_per_iteration": 2.4744415283203125 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.02040279, + "balance_loss_mlp": 1.04623377, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.6135516142824962, + "language_loss": 0.82859504, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85026079, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5344, + "time_per_iteration": 2.47578763961792 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.02407503, + "balance_loss_mlp": 1.04474425, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.07297685310976, + "language_loss": 0.79812628, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81983352, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5345, + "time_per_iteration": 5.33889365196228 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.02149296, + "balance_loss_mlp": 1.04473424, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.8810220564208493, + "language_loss": 0.83404821, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85571885, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.859375, + "step": 5346, + "time_per_iteration": 2.500577688217163 + }, + { + "auxiliary_loss_clip": 0.01131977, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02362108, + "balance_loss_mlp": 1.04492784, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4095386913443633, + "language_loss": 0.81571388, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83742809, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 5347, + "time_per_iteration": 2.4491653442382812 + }, + { + "auxiliary_loss_clip": 0.01130206, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.04715562, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.9965712334987884, + "language_loss": 0.79898697, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82067955, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5348, + "time_per_iteration": 2.471261501312256 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.04691792, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 1.9690807455187813, + "language_loss": 0.8506968, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87250197, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5349, + "time_per_iteration": 2.4376416206359863 + }, + { + "auxiliary_loss_clip": 0.01130553, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.02215409, + "balance_loss_mlp": 1.04589188, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.7092259574450879, + "language_loss": 0.80862331, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83030069, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5350, + "time_per_iteration": 2.463998794555664 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.04548049, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.7709203173786705, + "language_loss": 0.79856229, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82025862, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 5351, + "time_per_iteration": 2.5017340183258057 + }, + { + "auxiliary_loss_clip": 0.01129171, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.02396047, + "balance_loss_mlp": 1.04505897, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.701097630272038, + "language_loss": 0.75491166, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77660662, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5352, + "time_per_iteration": 2.4916653633117676 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02179837, + "balance_loss_mlp": 1.0472436, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.8428416092094815, + "language_loss": 0.8174473, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83915108, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5353, + "time_per_iteration": 2.4554946422576904 + }, + { + "auxiliary_loss_clip": 0.01127699, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.02147865, + "balance_loss_mlp": 1.04577875, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.533417142425662, + "language_loss": 0.73054826, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75219929, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5354, + "time_per_iteration": 2.521679639816284 + }, + { + "auxiliary_loss_clip": 0.01129194, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.01830053, + "balance_loss_mlp": 1.04482782, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5056594732405602, + "language_loss": 0.8349731, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.8565954, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5355, + "time_per_iteration": 2.4590871334075928 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.0299834, + "balance_loss_mlp": 1.04840243, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.2450583198173737, + "language_loss": 0.71577442, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73757267, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 5356, + "time_per_iteration": 2.4499382972717285 + }, + { + "auxiliary_loss_clip": 0.01137452, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.0196538, + "balance_loss_mlp": 1.04720378, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5072162620412968, + "language_loss": 0.68480343, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70654052, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 5357, + "time_per_iteration": 2.449125289916992 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01002103, + "balance_loss_clip": 1.00029111, + "balance_loss_mlp": 1.01435876, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7023861387911429, + "language_loss": 0.58256829, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60301042, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.27734375, + "step": 5358, + "time_per_iteration": 3.1561930179595947 + }, + { + "auxiliary_loss_clip": 0.01130123, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.02506542, + "balance_loss_mlp": 1.04423356, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 5.918956850418863, + "language_loss": 0.83524048, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85695517, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5359, + "time_per_iteration": 2.4850337505340576 + }, + { + "auxiliary_loss_clip": 0.01132117, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.019122, + "balance_loss_mlp": 1.04514802, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.5557598040672038, + "language_loss": 0.79817981, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5360, + "time_per_iteration": 2.476698637008667 + }, + { + "auxiliary_loss_clip": 0.01040711, + "auxiliary_loss_mlp": 0.00999439, + "balance_loss_clip": 0.99754351, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.750004294413456, + "language_loss": 0.5697335, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59013498, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27539062, + "step": 5361, + "time_per_iteration": 2.933368444442749 + }, + { + "auxiliary_loss_clip": 0.01129938, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.02452111, + "balance_loss_mlp": 1.04625082, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.730134050345621, + "language_loss": 0.71349204, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73518884, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5362, + "time_per_iteration": 2.508444309234619 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.04685211, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6566995758494631, + "language_loss": 0.74008292, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76178837, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8359375, + "step": 5363, + "time_per_iteration": 2.530428409576416 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.02481735, + "balance_loss_mlp": 1.04535139, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.5112112412179624, + "language_loss": 0.77012563, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79187649, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 5364, + "time_per_iteration": 2.475532054901123 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.02747917, + "balance_loss_mlp": 1.04455853, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.1198351151285992, + "language_loss": 0.77043676, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79215652, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5365, + "time_per_iteration": 2.4466004371643066 + }, + { + "auxiliary_loss_clip": 0.01133051, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03444982, + "balance_loss_mlp": 1.04861832, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5183743876703555, + "language_loss": 0.76853883, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79036558, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5366, + "time_per_iteration": 2.4716286659240723 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04463363, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.6325357922005805, + "language_loss": 0.7200039, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74173188, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5367, + "time_per_iteration": 2.4936037063598633 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.02759588, + "balance_loss_mlp": 1.04335558, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.8801069032327764, + "language_loss": 0.7456941, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76737112, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5368, + "time_per_iteration": 2.436897039413452 + }, + { + "auxiliary_loss_clip": 0.01125271, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.02592432, + "balance_loss_mlp": 1.04390144, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.5502047591083525, + "language_loss": 0.79212499, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81378186, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5369, + "time_per_iteration": 2.516191244125366 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02042747, + "balance_loss_mlp": 1.04432988, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8370527927944635, + "language_loss": 0.83173579, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85333049, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5370, + "time_per_iteration": 2.423494338989258 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.02367377, + "balance_loss_mlp": 1.04524064, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 1.743608915284185, + "language_loss": 0.83372939, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85539752, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5371, + "time_per_iteration": 2.481677532196045 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01048903, + "balance_loss_clip": 1.0323211, + "balance_loss_mlp": 1.04514813, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.043238736788368, + "language_loss": 0.88539696, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90720367, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5372, + "time_per_iteration": 2.434785842895508 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.03011537, + "balance_loss_mlp": 1.04532862, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9701661898720624, + "language_loss": 0.73064935, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75238496, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5373, + "time_per_iteration": 2.509288787841797 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.0217371, + "balance_loss_mlp": 1.04496944, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.118108535598075, + "language_loss": 0.81306481, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83469176, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5374, + "time_per_iteration": 2.43719744682312 + }, + { + "auxiliary_loss_clip": 0.01122361, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.02135515, + "balance_loss_mlp": 1.04158425, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.0253542373007223, + "language_loss": 0.87507123, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89665556, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80859375, + "step": 5375, + "time_per_iteration": 2.5192272663116455 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04312396, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8491566525281582, + "language_loss": 0.75873786, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78040886, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5376, + "time_per_iteration": 2.463103771209717 + }, + { + "auxiliary_loss_clip": 0.01123814, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.04269242, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.5890241026671568, + "language_loss": 0.67173672, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69330645, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5377, + "time_per_iteration": 2.5341343879699707 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02922571, + "balance_loss_mlp": 1.04433763, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.5071806558198568, + "language_loss": 0.7231617, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74489522, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5378, + "time_per_iteration": 2.4838621616363525 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.0174818, + "balance_loss_mlp": 1.04056036, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9527598104570445, + "language_loss": 0.82083338, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84239388, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5379, + "time_per_iteration": 2.5433154106140137 + }, + { + "auxiliary_loss_clip": 0.01127314, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.04230165, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.9705325619840932, + "language_loss": 0.78379917, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80539739, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 5380, + "time_per_iteration": 2.5306878089904785 + }, + { + "auxiliary_loss_clip": 0.0112988, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.02207887, + "balance_loss_mlp": 1.04637241, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.5992937517204726, + "language_loss": 0.76871669, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79037952, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5381, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02228761, + "balance_loss_mlp": 1.04212475, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.912812068704809, + "language_loss": 0.71864545, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74021101, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5382, + "time_per_iteration": 2.488344430923462 + }, + { + "auxiliary_loss_clip": 0.01127382, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.0424943, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.8562908675977754, + "language_loss": 0.70752692, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72914088, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5383, + "time_per_iteration": 2.5236711502075195 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0259378, + "balance_loss_mlp": 1.0442363, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.094388352971362, + "language_loss": 0.78742963, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80905938, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 5384, + "time_per_iteration": 2.4685723781585693 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.0222249, + "balance_loss_mlp": 1.04443073, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.069351852322995, + "language_loss": 0.74553645, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76720881, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 5385, + "time_per_iteration": 2.46968936920166 + }, + { + "auxiliary_loss_clip": 0.01127931, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02101183, + "balance_loss_mlp": 1.04604125, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.8196037573439483, + "language_loss": 0.72068852, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74232352, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5386, + "time_per_iteration": 2.559480667114258 + }, + { + "auxiliary_loss_clip": 0.01128094, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.02119136, + "balance_loss_mlp": 1.04176617, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.8525904099951498, + "language_loss": 0.94343817, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96508765, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 5387, + "time_per_iteration": 5.378048896789551 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.02227962, + "balance_loss_mlp": 1.04373097, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.7647642243142747, + "language_loss": 0.77544433, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79709506, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5388, + "time_per_iteration": 2.4804563522338867 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01554048, + "balance_loss_mlp": 1.04277194, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.092216766577811, + "language_loss": 0.71867704, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74025786, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5389, + "time_per_iteration": 2.5753331184387207 + }, + { + "auxiliary_loss_clip": 0.01128194, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.0233078, + "balance_loss_mlp": 1.04672205, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 2.0374979548818497, + "language_loss": 0.80883735, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83050573, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 5390, + "time_per_iteration": 2.479557991027832 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.02911294, + "balance_loss_mlp": 1.04798484, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.0682587448682384, + "language_loss": 0.72983515, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75158268, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5391, + "time_per_iteration": 2.4689247608184814 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.02728176, + "balance_loss_mlp": 1.04465139, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.6356435132494873, + "language_loss": 0.77357036, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79523861, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5392, + "time_per_iteration": 2.4942643642425537 + }, + { + "auxiliary_loss_clip": 0.01129141, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04454243, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.035025217222515, + "language_loss": 0.62445068, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64614469, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5393, + "time_per_iteration": 2.5294058322906494 + }, + { + "auxiliary_loss_clip": 0.01127178, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.02292883, + "balance_loss_mlp": 1.0455395, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.541011228274946, + "language_loss": 0.8250984, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84674609, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5394, + "time_per_iteration": 2.5204803943634033 + }, + { + "auxiliary_loss_clip": 0.01125244, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.03089094, + "balance_loss_mlp": 1.04596353, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.8431569167236632, + "language_loss": 0.81585443, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83754981, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.79296875, + "step": 5395, + "time_per_iteration": 2.481722116470337 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.02844906, + "balance_loss_mlp": 1.04834461, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.644291671680186, + "language_loss": 0.83163011, + "learning_rate": 3.157507073287417e-06, + "loss": 0.8533138, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5396, + "time_per_iteration": 2.5014734268188477 + }, + { + "auxiliary_loss_clip": 0.01133358, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.02392137, + "balance_loss_mlp": 1.04687238, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.8637158339296453, + "language_loss": 0.75718713, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77891421, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5397, + "time_per_iteration": 2.475958824157715 + }, + { + "auxiliary_loss_clip": 0.01125578, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.01953566, + "balance_loss_mlp": 1.04540443, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.571224523552484, + "language_loss": 0.66835862, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68995398, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5398, + "time_per_iteration": 2.447065830230713 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.0183022, + "balance_loss_mlp": 1.04326463, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4279244162742584, + "language_loss": 0.73232102, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75389397, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8203125, + "step": 5399, + "time_per_iteration": 2.466137409210205 + }, + { + "auxiliary_loss_clip": 0.01129831, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.02016079, + "balance_loss_mlp": 1.04749155, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.110147681467196, + "language_loss": 0.71391356, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73556215, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5400, + "time_per_iteration": 2.484243631362915 + }, + { + "auxiliary_loss_clip": 0.01128373, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.04439175, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 3.048924003265154, + "language_loss": 0.79583031, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81746894, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5401, + "time_per_iteration": 2.5695505142211914 + }, + { + "auxiliary_loss_clip": 0.01130508, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.04700303, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4209306386542333, + "language_loss": 0.87675726, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89848959, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 5402, + "time_per_iteration": 2.4811201095581055 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.04369164, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.934597728175988, + "language_loss": 0.84513289, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86672628, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5403, + "time_per_iteration": 2.418501377105713 + }, + { + "auxiliary_loss_clip": 0.01129275, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.02852631, + "balance_loss_mlp": 1.05024314, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.0001546098828955, + "language_loss": 0.87642342, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89813483, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5404, + "time_per_iteration": 2.5094971656799316 + }, + { + "auxiliary_loss_clip": 0.01125879, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.02413273, + "balance_loss_mlp": 1.04579973, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6336968005079966, + "language_loss": 0.72491479, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74656296, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5405, + "time_per_iteration": 2.4927978515625 + }, + { + "auxiliary_loss_clip": 0.01125757, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.02147698, + "balance_loss_mlp": 1.04514825, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8637721662214948, + "language_loss": 0.83356953, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85518444, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80859375, + "step": 5406, + "time_per_iteration": 2.534508228302002 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.02241969, + "balance_loss_mlp": 1.0469048, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.836635199790601, + "language_loss": 0.8826412, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90428072, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5407, + "time_per_iteration": 2.4199326038360596 + }, + { + "auxiliary_loss_clip": 0.01127405, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.04602861, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.5140887230520799, + "language_loss": 0.69643426, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71806979, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5408, + "time_per_iteration": 2.5646731853485107 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.04438102, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.6429750268405912, + "language_loss": 0.77442145, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79608637, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 5409, + "time_per_iteration": 2.450200080871582 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.03142262, + "balance_loss_mlp": 1.04331136, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.3862040562488716, + "language_loss": 0.83582234, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85758531, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5410, + "time_per_iteration": 2.5161662101745605 + }, + { + "auxiliary_loss_clip": 0.01121858, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02089429, + "balance_loss_mlp": 1.04224813, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.5577179591930796, + "language_loss": 0.71270931, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73427641, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5411, + "time_per_iteration": 2.4465057849884033 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02544606, + "balance_loss_mlp": 1.04381669, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6024997274503978, + "language_loss": 0.83103073, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85267961, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.81640625, + "step": 5412, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.01129762, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.01963782, + "balance_loss_mlp": 1.04417348, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 2.3149031646834577, + "language_loss": 0.80794364, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82959628, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5413, + "time_per_iteration": 2.483309030532837 + }, + { + "auxiliary_loss_clip": 0.01128818, + "auxiliary_loss_mlp": 0.01042957, + "balance_loss_clip": 1.02723312, + "balance_loss_mlp": 1.04606462, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.5892127721025033, + "language_loss": 0.76887989, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79059768, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5414, + "time_per_iteration": 2.4696640968322754 + }, + { + "auxiliary_loss_clip": 0.01039619, + "auxiliary_loss_mlp": 0.01008091, + "balance_loss_clip": 1.00601661, + "balance_loss_mlp": 1.01271892, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9084647328862615, + "language_loss": 0.64009887, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66057593, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.26953125, + "step": 5415, + "time_per_iteration": 2.982389450073242 + }, + { + "auxiliary_loss_clip": 0.01124624, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.02214265, + "balance_loss_mlp": 1.04286838, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 2.942597496869342, + "language_loss": 0.74265057, + "learning_rate": 3.151146171224075e-06, + "loss": 0.764265, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5416, + "time_per_iteration": 2.526956558227539 + }, + { + "auxiliary_loss_clip": 0.01039656, + "auxiliary_loss_mlp": 0.01005548, + "balance_loss_clip": 1.00335431, + "balance_loss_mlp": 1.01254702, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7736939008633222, + "language_loss": 0.57947183, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59992385, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.26953125, + "step": 5417, + "time_per_iteration": 3.1500296592712402 + }, + { + "auxiliary_loss_clip": 0.01038219, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.0002141, + "balance_loss_mlp": 1.01140058, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.9133944403169288, + "language_loss": 0.63476181, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65516579, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.26953125, + "step": 5418, + "time_per_iteration": 3.1724026203155518 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.02739, + "balance_loss_mlp": 1.0441196, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 3.240595355482155, + "language_loss": 0.69061959, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71229619, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5419, + "time_per_iteration": 2.4643847942352295 + }, + { + "auxiliary_loss_clip": 0.01125895, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.01951957, + "balance_loss_mlp": 1.04326844, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 2.1209544014848443, + "language_loss": 0.77064359, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79225302, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5420, + "time_per_iteration": 2.5241270065307617 + }, + { + "auxiliary_loss_clip": 0.01128645, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.04400003, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.4823274263144444, + "language_loss": 0.80134791, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82298517, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5421, + "time_per_iteration": 2.5376439094543457 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02359045, + "balance_loss_mlp": 1.04254711, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5045024534641303, + "language_loss": 0.75446749, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77606434, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5422, + "time_per_iteration": 2.5713820457458496 + }, + { + "auxiliary_loss_clip": 0.01128336, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02324986, + "balance_loss_mlp": 1.04553628, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.780294141224906, + "language_loss": 0.62795889, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64963388, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5423, + "time_per_iteration": 2.4667959213256836 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.04085255, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 4.488088575635961, + "language_loss": 0.74664211, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76814055, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 5424, + "time_per_iteration": 2.488187313079834 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.02231038, + "balance_loss_mlp": 1.04298568, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6359586167011877, + "language_loss": 0.76958472, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79116821, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5425, + "time_per_iteration": 2.5025157928466797 + }, + { + "auxiliary_loss_clip": 0.01127865, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.03051138, + "balance_loss_mlp": 1.04193544, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 4.663874352034687, + "language_loss": 0.78857136, + "learning_rate": 3.147959166423428e-06, + "loss": 0.8103227, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5426, + "time_per_iteration": 2.484064817428589 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.02116871, + "balance_loss_mlp": 1.04324198, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.7688447582142532, + "language_loss": 0.74363142, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76525187, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.81640625, + "step": 5427, + "time_per_iteration": 2.4785962104797363 + }, + { + "auxiliary_loss_clip": 0.0112706, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_clip": 1.02742934, + "balance_loss_mlp": 1.04290414, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.911492416062928, + "language_loss": 0.79305124, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8147524, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83984375, + "step": 5428, + "time_per_iteration": 3.9864413738250732 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.02597678, + "balance_loss_mlp": 1.04084587, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7222830625250152, + "language_loss": 0.71369523, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73534036, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5429, + "time_per_iteration": 3.8856096267700195 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.02448976, + "balance_loss_mlp": 1.04308093, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.889570703315701, + "language_loss": 0.78612322, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80775696, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5430, + "time_per_iteration": 2.4374818801879883 + }, + { + "auxiliary_loss_clip": 0.01128219, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02077675, + "balance_loss_mlp": 1.04359281, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.8594684871120744, + "language_loss": 0.83897448, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86063492, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84765625, + "step": 5431, + "time_per_iteration": 2.4513139724731445 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.02431297, + "balance_loss_mlp": 1.04116321, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.7565110160676718, + "language_loss": 0.70459324, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72619462, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5432, + "time_per_iteration": 2.529365301132202 + }, + { + "auxiliary_loss_clip": 0.01123519, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.02182746, + "balance_loss_mlp": 1.04076195, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4205622330102, + "language_loss": 0.84161848, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86321318, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5433, + "time_per_iteration": 2.4302597045898438 + }, + { + "auxiliary_loss_clip": 0.01123612, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.02132881, + "balance_loss_mlp": 1.0439055, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.4699213962063424, + "language_loss": 0.85906386, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88065541, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 5434, + "time_per_iteration": 2.496676445007324 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.02083361, + "balance_loss_mlp": 1.04468119, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.8331918492971015, + "language_loss": 0.87817061, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89981961, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5435, + "time_per_iteration": 2.51159405708313 + }, + { + "auxiliary_loss_clip": 0.0112533, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.02140474, + "balance_loss_mlp": 1.04326773, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.5496215899058443, + "language_loss": 0.76460963, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78622043, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5436, + "time_per_iteration": 2.43637752532959 + }, + { + "auxiliary_loss_clip": 0.01125315, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02040625, + "balance_loss_mlp": 1.04435849, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.5905557916714361, + "language_loss": 0.72127515, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74287689, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5437, + "time_per_iteration": 2.493673086166382 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.02236819, + "balance_loss_mlp": 1.04143524, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.6336098458574233, + "language_loss": 0.64049256, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66214842, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 5438, + "time_per_iteration": 2.5062596797943115 + }, + { + "auxiliary_loss_clip": 0.01126505, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01943088, + "balance_loss_mlp": 1.04510128, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.5452802319075516, + "language_loss": 0.74544024, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76704717, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5439, + "time_per_iteration": 2.501279830932617 + }, + { + "auxiliary_loss_clip": 0.01126727, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.02985907, + "balance_loss_mlp": 1.04374349, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.6196339079167323, + "language_loss": 0.75183308, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77355272, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5440, + "time_per_iteration": 2.507341146469116 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.03317571, + "balance_loss_mlp": 1.04308057, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9066250681455874, + "language_loss": 0.84613734, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86785924, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5441, + "time_per_iteration": 2.4737346172332764 + }, + { + "auxiliary_loss_clip": 0.01126255, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.02743292, + "balance_loss_mlp": 1.04209113, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.9602585650153952, + "language_loss": 0.8673979, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88908899, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5442, + "time_per_iteration": 2.4779980182647705 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02193677, + "balance_loss_mlp": 1.04526424, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.8849886885636646, + "language_loss": 0.77500421, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79669178, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8515625, + "step": 5443, + "time_per_iteration": 2.5263850688934326 + }, + { + "auxiliary_loss_clip": 0.01126577, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02428412, + "balance_loss_mlp": 1.04207098, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.0180593262473487, + "language_loss": 0.81630802, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83796823, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5444, + "time_per_iteration": 2.447061061859131 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.02335095, + "balance_loss_mlp": 1.04356718, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.9587875585664523, + "language_loss": 0.59421074, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61585242, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5445, + "time_per_iteration": 2.4542667865753174 + }, + { + "auxiliary_loss_clip": 0.01128674, + "auxiliary_loss_mlp": 0.01039464, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.04482532, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.043321690225375, + "language_loss": 0.88286638, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90454781, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8359375, + "step": 5446, + "time_per_iteration": 2.4518625736236572 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.02577102, + "balance_loss_mlp": 1.04609275, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.9059445881205361, + "language_loss": 0.78455317, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80631441, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87109375, + "step": 5447, + "time_per_iteration": 2.488555669784546 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.02621138, + "balance_loss_mlp": 1.04297531, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.7948266966340543, + "language_loss": 0.73349774, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75515163, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.82421875, + "step": 5448, + "time_per_iteration": 2.460759162902832 + }, + { + "auxiliary_loss_clip": 0.01125074, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.02788281, + "balance_loss_mlp": 1.04221821, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.3797343272994427, + "language_loss": 0.66896623, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69065142, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5449, + "time_per_iteration": 2.5101547241210938 + }, + { + "auxiliary_loss_clip": 0.01125182, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.02111173, + "balance_loss_mlp": 1.04373384, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.3889431777217922, + "language_loss": 0.65617704, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67778659, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5450, + "time_per_iteration": 2.4815587997436523 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.02644145, + "balance_loss_mlp": 1.04330397, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5376267502191867, + "language_loss": 0.77276003, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.7944392, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5451, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.0112906, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02348745, + "balance_loss_mlp": 1.04470944, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.4373215337565015, + "language_loss": 0.7011131, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72279859, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5452, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01121729, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01944947, + "balance_loss_mlp": 1.04188132, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.7019757848824575, + "language_loss": 0.78734571, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80890715, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5453, + "time_per_iteration": 2.493701219558716 + }, + { + "auxiliary_loss_clip": 0.01126073, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.01610184, + "balance_loss_mlp": 1.04306984, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.2894918901687333, + "language_loss": 0.75428879, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77585566, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5454, + "time_per_iteration": 2.5295286178588867 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02382326, + "balance_loss_mlp": 1.04198301, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.0725507665811826, + "language_loss": 0.77059573, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79217887, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5455, + "time_per_iteration": 2.426988124847412 + }, + { + "auxiliary_loss_clip": 0.0112837, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02304173, + "balance_loss_mlp": 1.04281068, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.669914346129418, + "language_loss": 0.74029738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76197511, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.85546875, + "step": 5456, + "time_per_iteration": 2.512131929397583 + }, + { + "auxiliary_loss_clip": 0.01126084, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03444123, + "balance_loss_mlp": 1.04250574, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.518027485126158, + "language_loss": 0.78283882, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80459797, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5457, + "time_per_iteration": 2.4819135665893555 + }, + { + "auxiliary_loss_clip": 0.0112739, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.02432334, + "balance_loss_mlp": 1.04155684, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.199350012619834, + "language_loss": 0.79332864, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81499034, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5458, + "time_per_iteration": 2.4749457836151123 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.01988721, + "balance_loss_mlp": 1.04204702, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 4.694290331797846, + "language_loss": 0.72896576, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75055289, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5459, + "time_per_iteration": 2.4506032466888428 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.02303815, + "balance_loss_mlp": 1.04444695, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8402325574836436, + "language_loss": 0.84511495, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86677814, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5460, + "time_per_iteration": 2.521491527557373 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02176023, + "balance_loss_mlp": 1.0420599, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7736363390075318, + "language_loss": 0.76822042, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78982782, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.83203125, + "step": 5461, + "time_per_iteration": 2.4919962882995605 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02015376, + "balance_loss_mlp": 1.04589903, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.6989905310418616, + "language_loss": 0.62835252, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65001822, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 5462, + "time_per_iteration": 2.6128923892974854 + }, + { + "auxiliary_loss_clip": 0.0112585, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.04426169, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.8014296603715538, + "language_loss": 0.78155506, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80315304, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5463, + "time_per_iteration": 2.5255165100097656 + }, + { + "auxiliary_loss_clip": 0.0112647, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.02001238, + "balance_loss_mlp": 1.04409099, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.049558292675733, + "language_loss": 0.7029627, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72457188, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5464, + "time_per_iteration": 2.460951089859009 + }, + { + "auxiliary_loss_clip": 0.01127719, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02505457, + "balance_loss_mlp": 1.04683673, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6142145677103121, + "language_loss": 0.72746348, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74913716, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5465, + "time_per_iteration": 2.4767887592315674 + }, + { + "auxiliary_loss_clip": 0.01128882, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.02208447, + "balance_loss_mlp": 1.04690027, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.6282981827525145, + "language_loss": 0.82756901, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.84922415, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5466, + "time_per_iteration": 2.463127613067627 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.02343404, + "balance_loss_mlp": 1.04421949, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.6977355395672606, + "language_loss": 0.79485095, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81649983, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5467, + "time_per_iteration": 2.482245683670044 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.02011502, + "balance_loss_mlp": 1.0452255, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5356074654715184, + "language_loss": 0.74795353, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76958692, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5468, + "time_per_iteration": 2.4828743934631348 + }, + { + "auxiliary_loss_clip": 0.01136832, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.02467322, + "balance_loss_mlp": 1.04996455, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.8525214053644714, + "language_loss": 0.78469932, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8064791, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5469, + "time_per_iteration": 2.455672264099121 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.02008545, + "balance_loss_mlp": 1.04602098, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.646072726718358, + "language_loss": 0.82014406, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84178579, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5470, + "time_per_iteration": 5.531651020050049 + }, + { + "auxiliary_loss_clip": 0.0112936, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.02315605, + "balance_loss_mlp": 1.04359245, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.806312825179731, + "language_loss": 0.67675972, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69843686, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5471, + "time_per_iteration": 2.7400858402252197 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.04856122, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.6357076803377442, + "language_loss": 0.65059721, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67237478, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5472, + "time_per_iteration": 2.530604124069214 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.0271014, + "balance_loss_mlp": 1.04821706, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6631612231063349, + "language_loss": 0.88497955, + "learning_rate": 3.13292213457912e-06, + "loss": 0.9067443, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 5473, + "time_per_iteration": 2.521026611328125 + }, + { + "auxiliary_loss_clip": 0.01133162, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.02669442, + "balance_loss_mlp": 1.0483191, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 2.3087074790673423, + "language_loss": 0.78349268, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80525613, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 5474, + "time_per_iteration": 2.4769628047943115 + }, + { + "auxiliary_loss_clip": 0.01047146, + "auxiliary_loss_mlp": 0.00999487, + "balance_loss_clip": 0.99740046, + "balance_loss_mlp": 1.02056372, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.888273800575083, + "language_loss": 0.60237771, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62284404, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.265625, + "step": 5475, + "time_per_iteration": 3.039971351623535 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01051514, + "balance_loss_clip": 1.03437138, + "balance_loss_mlp": 1.04512429, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5350164106808766, + "language_loss": 0.76634103, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78818846, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5476, + "time_per_iteration": 2.488698959350586 + }, + { + "auxiliary_loss_clip": 0.01131587, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.02640307, + "balance_loss_mlp": 1.04819024, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.8435246505513339, + "language_loss": 0.74520677, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76693243, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5477, + "time_per_iteration": 2.533641815185547 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.02036786, + "balance_loss_mlp": 1.04507232, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 1.9138938380730264, + "language_loss": 0.75581098, + "learning_rate": 3.131316843357713e-06, + "loss": 0.7773999, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5478, + "time_per_iteration": 2.4541866779327393 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.02218664, + "balance_loss_mlp": 1.04736805, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.6780134795902322, + "language_loss": 0.80241555, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82407916, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5479, + "time_per_iteration": 2.5348050594329834 + }, + { + "auxiliary_loss_clip": 0.01046129, + "auxiliary_loss_mlp": 0.01003977, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.01921439, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7411588561506779, + "language_loss": 0.56543052, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58593154, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.26953125, + "step": 5480, + "time_per_iteration": 3.121812343597412 + }, + { + "auxiliary_loss_clip": 0.01128951, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02871847, + "balance_loss_mlp": 1.04606879, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.656023636160042, + "language_loss": 0.77029848, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79203057, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5481, + "time_per_iteration": 2.4819936752319336 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02028024, + "balance_loss_mlp": 1.04622722, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.8057287203311059, + "language_loss": 0.78732938, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80897224, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5482, + "time_per_iteration": 2.501615285873413 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02601135, + "balance_loss_mlp": 1.04573894, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.6414395423474737, + "language_loss": 0.74055123, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76226085, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5483, + "time_per_iteration": 2.5213518142700195 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0187676, + "balance_loss_mlp": 1.04614615, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8373674608308554, + "language_loss": 0.75627816, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77788723, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5484, + "time_per_iteration": 2.543795108795166 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04699099, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.1329507570753243, + "language_loss": 0.7209897, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74267334, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5485, + "time_per_iteration": 2.4598846435546875 + }, + { + "auxiliary_loss_clip": 0.01124565, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.04448354, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.7963509228415293, + "language_loss": 0.80416954, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8258158, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5486, + "time_per_iteration": 2.5368754863739014 + }, + { + "auxiliary_loss_clip": 0.011236, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.02264309, + "balance_loss_mlp": 1.04300976, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3473245188806056, + "language_loss": 0.84351611, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86512625, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5487, + "time_per_iteration": 2.5140841007232666 + }, + { + "auxiliary_loss_clip": 0.01131842, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.02440929, + "balance_loss_mlp": 1.04636502, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.289610395509379, + "language_loss": 0.74163198, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76335323, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5488, + "time_per_iteration": 2.4159257411956787 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.02519917, + "balance_loss_mlp": 1.04548192, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.3379517114480004, + "language_loss": 0.72564352, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74732298, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5489, + "time_per_iteration": 2.4810056686401367 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.04076719, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.5348585918072235, + "language_loss": 0.88752508, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90908241, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5490, + "time_per_iteration": 2.448437452316284 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.02022719, + "balance_loss_mlp": 1.0403626, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.9493471797358817, + "language_loss": 0.83395195, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85551059, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5491, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.01126063, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.02492499, + "balance_loss_mlp": 1.04421842, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.715750342336911, + "language_loss": 0.77514994, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79680943, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5492, + "time_per_iteration": 2.4870479106903076 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.0269649, + "balance_loss_mlp": 1.04629827, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.2776411561569265, + "language_loss": 0.7450884, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76683223, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5493, + "time_per_iteration": 2.4506607055664062 + }, + { + "auxiliary_loss_clip": 0.01045286, + "auxiliary_loss_mlp": 0.01012729, + "balance_loss_clip": 1.01074982, + "balance_loss_mlp": 1.01881337, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7955029917088393, + "language_loss": 0.53910893, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55968904, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.265625, + "step": 5494, + "time_per_iteration": 3.0042550563812256 + }, + { + "auxiliary_loss_clip": 0.01124159, + "auxiliary_loss_mlp": 0.01037133, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.04378355, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.6073630563578136, + "language_loss": 0.87087989, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89249277, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5495, + "time_per_iteration": 2.4716837406158447 + }, + { + "auxiliary_loss_clip": 0.01128875, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.03133559, + "balance_loss_mlp": 1.04508138, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 3.5655917637781784, + "language_loss": 0.73526418, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75703049, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8359375, + "step": 5496, + "time_per_iteration": 2.531670570373535 + }, + { + "auxiliary_loss_clip": 0.01124295, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.01509058, + "balance_loss_mlp": 1.04384971, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.1703031984353514, + "language_loss": 0.72764325, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74917477, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5497, + "time_per_iteration": 2.5148839950561523 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.04340625, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.5654673530164307, + "language_loss": 0.80193126, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82350206, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5498, + "time_per_iteration": 2.517765522003174 + }, + { + "auxiliary_loss_clip": 0.01123393, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02397776, + "balance_loss_mlp": 1.03977811, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1435474357237405, + "language_loss": 0.76491725, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78653955, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5499, + "time_per_iteration": 2.5006067752838135 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 1.04131985, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.506886865759599, + "language_loss": 0.79332948, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81487471, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5500, + "time_per_iteration": 2.4859495162963867 + }, + { + "auxiliary_loss_clip": 0.01129022, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01607347, + "balance_loss_mlp": 1.04564214, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.164639953437845, + "language_loss": 0.66065335, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68225485, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 5501, + "time_per_iteration": 2.6189892292022705 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.04285216, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.260615362067107, + "language_loss": 0.77580702, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79748642, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5502, + "time_per_iteration": 2.4086782932281494 + }, + { + "auxiliary_loss_clip": 0.01130061, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.04632545, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 2.045089737815956, + "language_loss": 0.72346115, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74515176, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8359375, + "step": 5503, + "time_per_iteration": 2.5176749229431152 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01865053, + "balance_loss_mlp": 1.04248357, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.5402224202893484, + "language_loss": 0.75216055, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77374506, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5504, + "time_per_iteration": 2.530212879180908 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02438283, + "balance_loss_mlp": 1.04382253, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.6148817370045387, + "language_loss": 0.70049053, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72214913, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5505, + "time_per_iteration": 2.5212292671203613 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.02720845, + "balance_loss_mlp": 1.04601455, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.586520967819923, + "language_loss": 0.81541443, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83709103, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5506, + "time_per_iteration": 2.5494561195373535 + }, + { + "auxiliary_loss_clip": 0.01128621, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.02277398, + "balance_loss_mlp": 1.04704857, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.929478423939084, + "language_loss": 0.79097712, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81264055, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5507, + "time_per_iteration": 2.498659610748291 + }, + { + "auxiliary_loss_clip": 0.01123401, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.02594829, + "balance_loss_mlp": 1.04136062, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.6667627205960738, + "language_loss": 0.71733725, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73897743, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5508, + "time_per_iteration": 2.478593111038208 + }, + { + "auxiliary_loss_clip": 0.01124563, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01566064, + "balance_loss_mlp": 1.04539418, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.030813517097255, + "language_loss": 0.72023594, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74177837, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5509, + "time_per_iteration": 2.539806842803955 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.01975, + "balance_loss_mlp": 1.04503942, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.5191607241878, + "language_loss": 0.73049426, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75209701, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5510, + "time_per_iteration": 2.536083698272705 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.0429213, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.1286159820346984, + "language_loss": 0.87371129, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89530391, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5511, + "time_per_iteration": 2.4380695819854736 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.01986468, + "balance_loss_mlp": 1.04396749, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6025966363766477, + "language_loss": 0.72926772, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7507937, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5512, + "time_per_iteration": 5.464786767959595 + }, + { + "auxiliary_loss_clip": 0.01124343, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.02759719, + "balance_loss_mlp": 1.04466701, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.8365879467062751, + "language_loss": 0.72230887, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.7439692, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5513, + "time_per_iteration": 2.6175873279571533 + }, + { + "auxiliary_loss_clip": 0.01128264, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.01972222, + "balance_loss_mlp": 1.04398656, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.8557947519919487, + "language_loss": 0.68629253, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70792234, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5514, + "time_per_iteration": 2.4340810775756836 + }, + { + "auxiliary_loss_clip": 0.01127749, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.0251019, + "balance_loss_mlp": 1.04505849, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.411486097564539, + "language_loss": 0.66439879, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.6860956, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5515, + "time_per_iteration": 2.4983339309692383 + }, + { + "auxiliary_loss_clip": 0.01124572, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01960468, + "balance_loss_mlp": 1.04258537, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.4970111675637168, + "language_loss": 0.69111156, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71270084, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5516, + "time_per_iteration": 2.515367031097412 + }, + { + "auxiliary_loss_clip": 0.0112502, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.02154398, + "balance_loss_mlp": 1.04021645, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.365933570102145, + "language_loss": 0.80287617, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82448685, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 5517, + "time_per_iteration": 2.5149497985839844 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.04258931, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 2.188422581245926, + "language_loss": 0.74551105, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76709294, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5518, + "time_per_iteration": 2.450188159942627 + }, + { + "auxiliary_loss_clip": 0.01048984, + "auxiliary_loss_mlp": 0.01008888, + "balance_loss_clip": 1.00682592, + "balance_loss_mlp": 1.02244139, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6172932492598038, + "language_loss": 0.54346693, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56404567, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.265625, + "step": 5519, + "time_per_iteration": 3.167750358581543 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.0239042, + "balance_loss_mlp": 1.0434345, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 3.8105825888408855, + "language_loss": 0.78854358, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81018245, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5520, + "time_per_iteration": 2.451781988143921 + }, + { + "auxiliary_loss_clip": 0.01121269, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01835227, + "balance_loss_mlp": 1.04244733, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.656623957411012, + "language_loss": 0.76576293, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78729689, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7890625, + "step": 5521, + "time_per_iteration": 2.525865077972412 + }, + { + "auxiliary_loss_clip": 0.01126792, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.02932894, + "balance_loss_mlp": 1.04259682, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 3.3004720611075964, + "language_loss": 0.70353854, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72525376, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5522, + "time_per_iteration": 2.472001791000366 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.01739514, + "balance_loss_mlp": 1.04362595, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7154852702320889, + "language_loss": 0.74052203, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76206541, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5523, + "time_per_iteration": 2.4924776554107666 + }, + { + "auxiliary_loss_clip": 0.01122263, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.0211792, + "balance_loss_mlp": 1.04308188, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.6905303226226114, + "language_loss": 0.82272083, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84430826, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 5524, + "time_per_iteration": 2.439711332321167 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.0251627, + "balance_loss_mlp": 1.04402184, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6540586406432352, + "language_loss": 0.8307848, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85240501, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.79296875, + "step": 5525, + "time_per_iteration": 2.4927310943603516 + }, + { + "auxiliary_loss_clip": 0.01044531, + "auxiliary_loss_mlp": 0.01006175, + "balance_loss_clip": 1.00405347, + "balance_loss_mlp": 1.01804781, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7716933739699889, + "language_loss": 0.5260945, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54660153, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.265625, + "step": 5526, + "time_per_iteration": 3.0598835945129395 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.02724671, + "balance_loss_mlp": 1.04371929, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.1037159361855737, + "language_loss": 0.77490491, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79659784, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 5527, + "time_per_iteration": 2.4878480434417725 + }, + { + "auxiliary_loss_clip": 0.01126946, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.03025246, + "balance_loss_mlp": 1.04651201, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 2.9813221594214494, + "language_loss": 0.72143763, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74314719, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5528, + "time_per_iteration": 2.4562795162200928 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02256346, + "balance_loss_mlp": 1.04463542, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.7054310511699202, + "language_loss": 0.82638806, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5529, + "time_per_iteration": 2.474243640899658 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.02223659, + "balance_loss_mlp": 1.04554248, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.9738718949190572, + "language_loss": 0.69718957, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71884924, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83984375, + "step": 5530, + "time_per_iteration": 2.471686840057373 + }, + { + "auxiliary_loss_clip": 0.01127236, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02996182, + "balance_loss_mlp": 1.04500127, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.4616968900166643, + "language_loss": 0.7616601, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78338665, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5531, + "time_per_iteration": 2.473328113555908 + }, + { + "auxiliary_loss_clip": 0.01128043, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.04481292, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7553607817915955, + "language_loss": 0.73413068, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75578588, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5532, + "time_per_iteration": 2.4864931106567383 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.01321709, + "balance_loss_mlp": 1.04721618, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.2280638741168057, + "language_loss": 0.65813714, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67969465, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8203125, + "step": 5533, + "time_per_iteration": 2.5232229232788086 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.02541876, + "balance_loss_mlp": 1.04451632, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.9248590192503388, + "language_loss": 0.70790148, + "learning_rate": 3.113264663362451e-06, + "loss": 0.72957367, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5534, + "time_per_iteration": 2.418875217437744 + }, + { + "auxiliary_loss_clip": 0.01125629, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01890588, + "balance_loss_mlp": 1.04565191, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.8142926842561948, + "language_loss": 0.6684956, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69008601, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5535, + "time_per_iteration": 2.5031726360321045 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02450383, + "balance_loss_mlp": 1.04416704, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.1308907042960525, + "language_loss": 0.72915065, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75080466, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5536, + "time_per_iteration": 2.494007110595703 + }, + { + "auxiliary_loss_clip": 0.01126771, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02474046, + "balance_loss_mlp": 1.0450089, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.6653416647198893, + "language_loss": 0.81801486, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83966869, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5537, + "time_per_iteration": 2.611788272857666 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.02805638, + "balance_loss_mlp": 1.04771638, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.938500745409862, + "language_loss": 0.71606827, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73780894, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83984375, + "step": 5538, + "time_per_iteration": 2.538574695587158 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01642489, + "balance_loss_mlp": 1.04461074, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.0173985756025417, + "language_loss": 0.7442342, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76578778, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8046875, + "step": 5539, + "time_per_iteration": 2.539393424987793 + }, + { + "auxiliary_loss_clip": 0.01132315, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.03062367, + "balance_loss_mlp": 1.04543138, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.8798801752229715, + "language_loss": 0.70726681, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.72904468, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5540, + "time_per_iteration": 2.460745096206665 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.02156138, + "balance_loss_mlp": 1.04151917, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.212860979219503, + "language_loss": 0.60678709, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62837738, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5541, + "time_per_iteration": 2.643308162689209 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.0256207, + "balance_loss_mlp": 1.04428339, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.7250198470895146, + "language_loss": 0.68636936, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70806885, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 5542, + "time_per_iteration": 2.472029209136963 + }, + { + "auxiliary_loss_clip": 0.0112742, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02362895, + "balance_loss_mlp": 1.04488277, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6472310915335262, + "language_loss": 0.75526464, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77691472, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5543, + "time_per_iteration": 2.453550100326538 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02342415, + "balance_loss_mlp": 1.04834402, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.6694578175563026, + "language_loss": 0.75282717, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77452493, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5544, + "time_per_iteration": 2.486992835998535 + }, + { + "auxiliary_loss_clip": 0.01124934, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01910329, + "balance_loss_mlp": 1.04350412, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.4864809930890506, + "language_loss": 0.70886022, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73044181, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5545, + "time_per_iteration": 2.5813279151916504 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.02333164, + "balance_loss_mlp": 1.04530168, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.7150542013191912, + "language_loss": 0.69300294, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7146256, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5546, + "time_per_iteration": 2.4564788341522217 + }, + { + "auxiliary_loss_clip": 0.01129119, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04343665, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.6632006519185205, + "language_loss": 0.64804697, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66971648, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5547, + "time_per_iteration": 2.554959774017334 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.01782918, + "balance_loss_mlp": 1.0467664, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.454082693277369, + "language_loss": 0.856148, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87773478, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.8125, + "step": 5548, + "time_per_iteration": 2.451032876968384 + }, + { + "auxiliary_loss_clip": 0.01129139, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.02100003, + "balance_loss_mlp": 1.04508662, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.024965729715467, + "language_loss": 0.74754196, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76919919, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 5549, + "time_per_iteration": 2.6875991821289062 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.02362955, + "balance_loss_mlp": 1.04486775, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.8150391856089545, + "language_loss": 0.68361247, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70528769, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83984375, + "step": 5550, + "time_per_iteration": 2.640758752822876 + }, + { + "auxiliary_loss_clip": 0.0112866, + "auxiliary_loss_mlp": 0.01039899, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.04545677, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.742869766825136, + "language_loss": 0.60666394, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62834954, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.83203125, + "step": 5551, + "time_per_iteration": 2.454871654510498 + }, + { + "auxiliary_loss_clip": 0.01127389, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.0459497, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6119589143573256, + "language_loss": 0.70450759, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72618788, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5552, + "time_per_iteration": 2.4226949214935303 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02297902, + "balance_loss_mlp": 1.04462051, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.0022942324560145, + "language_loss": 0.8289907, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85063589, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.828125, + "step": 5553, + "time_per_iteration": 3.8951358795166016 + }, + { + "auxiliary_loss_clip": 0.01128647, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.04528964, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.095475541363027, + "language_loss": 0.81220448, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83385921, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83203125, + "step": 5554, + "time_per_iteration": 3.8097896575927734 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.02811968, + "balance_loss_mlp": 1.0457983, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4459560856203526, + "language_loss": 0.81277251, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83448291, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5555, + "time_per_iteration": 2.51686954498291 + }, + { + "auxiliary_loss_clip": 0.01126865, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02469552, + "balance_loss_mlp": 1.04441357, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.713035899616047, + "language_loss": 0.74563497, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76728898, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.82421875, + "step": 5556, + "time_per_iteration": 2.550630807876587 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.02459431, + "balance_loss_mlp": 1.04586554, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.4096864083862861, + "language_loss": 0.82588691, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84755093, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5557, + "time_per_iteration": 2.498108148574829 + }, + { + "auxiliary_loss_clip": 0.01129625, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.03055513, + "balance_loss_mlp": 1.04486346, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.7414701325609587, + "language_loss": 0.80056083, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82230997, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84765625, + "step": 5558, + "time_per_iteration": 2.5519607067108154 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02159929, + "balance_loss_mlp": 1.04537535, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.595273660638049, + "language_loss": 0.81953323, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84117764, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.84375, + "step": 5559, + "time_per_iteration": 2.5202248096466064 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02384293, + "balance_loss_mlp": 1.04450536, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.784570608011319, + "language_loss": 0.72027284, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74191785, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5560, + "time_per_iteration": 2.453016757965088 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03118193, + "balance_loss_mlp": 1.04679513, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 2.584817000325422, + "language_loss": 0.74888778, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77068788, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5561, + "time_per_iteration": 2.526980400085449 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02410603, + "balance_loss_mlp": 1.04610825, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.2689753945529176, + "language_loss": 0.69638503, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71806127, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5562, + "time_per_iteration": 2.483530282974243 + }, + { + "auxiliary_loss_clip": 0.01127212, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.02821374, + "balance_loss_mlp": 1.04549575, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.5595683236821118, + "language_loss": 0.65407914, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67576528, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8203125, + "step": 5563, + "time_per_iteration": 2.489734649658203 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.027843, + "balance_loss_mlp": 1.0464654, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 3.650208894964183, + "language_loss": 0.74457055, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5564, + "time_per_iteration": 2.7312686443328857 + }, + { + "auxiliary_loss_clip": 0.01049511, + "auxiliary_loss_mlp": 0.00999253, + "balance_loss_clip": 0.99735802, + "balance_loss_mlp": 1.02280784, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7800603717209338, + "language_loss": 0.55489159, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57537925, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.265625, + "step": 5565, + "time_per_iteration": 3.0266246795654297 + }, + { + "auxiliary_loss_clip": 0.01126829, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02271366, + "balance_loss_mlp": 1.04589689, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7346222757402157, + "language_loss": 0.64754677, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66918564, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80859375, + "step": 5566, + "time_per_iteration": 2.5819363594055176 + }, + { + "auxiliary_loss_clip": 0.01128946, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0259037, + "balance_loss_mlp": 1.04706717, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.73011072762743, + "language_loss": 0.77735972, + "learning_rate": 3.102564641030016e-06, + "loss": 0.7990548, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5567, + "time_per_iteration": 2.508108377456665 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.02480745, + "balance_loss_mlp": 1.04583585, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.719738804733239, + "language_loss": 0.76512182, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78683186, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5568, + "time_per_iteration": 2.4344217777252197 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02973104, + "balance_loss_mlp": 1.04528308, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.265483767853782, + "language_loss": 0.71277773, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73452842, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5569, + "time_per_iteration": 2.462592840194702 + }, + { + "auxiliary_loss_clip": 0.0112772, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.04275155, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.859999754882374, + "language_loss": 0.90291858, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9245472, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5570, + "time_per_iteration": 2.432124614715576 + }, + { + "auxiliary_loss_clip": 0.0112712, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01634383, + "balance_loss_mlp": 1.04461455, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.7333982724081918, + "language_loss": 0.80038494, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82196403, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5571, + "time_per_iteration": 2.52752947807312 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.00998336, + "balance_loss_clip": 0.99651235, + "balance_loss_mlp": 1.01880455, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.9063074837999179, + "language_loss": 0.55948162, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5799194, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5572, + "time_per_iteration": 3.0247979164123535 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.03212237, + "balance_loss_mlp": 1.04797339, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.028320341949736, + "language_loss": 0.78112698, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80290151, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5573, + "time_per_iteration": 2.5152878761291504 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_clip": 1.03143215, + "balance_loss_mlp": 1.04525197, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1279768530108503, + "language_loss": 0.72473001, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7465024, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5574, + "time_per_iteration": 2.543531656265259 + }, + { + "auxiliary_loss_clip": 0.01125319, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.017308, + "balance_loss_mlp": 1.04292774, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 2.78085640379241, + "language_loss": 0.87911499, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90068293, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.82421875, + "step": 5575, + "time_per_iteration": 2.546952724456787 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02899647, + "balance_loss_mlp": 1.04479516, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.569353520757799, + "language_loss": 0.82441479, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84619927, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5576, + "time_per_iteration": 2.414294958114624 + }, + { + "auxiliary_loss_clip": 0.01129312, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.0286808, + "balance_loss_mlp": 1.043697, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 3.008815557703919, + "language_loss": 0.73384887, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75559115, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 5577, + "time_per_iteration": 2.50136399269104 + }, + { + "auxiliary_loss_clip": 0.01131921, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.02667177, + "balance_loss_mlp": 1.04811549, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.7225109171896533, + "language_loss": 0.81555498, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8372944, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5578, + "time_per_iteration": 2.431365728378296 + }, + { + "auxiliary_loss_clip": 0.01125183, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.02277184, + "balance_loss_mlp": 1.04578936, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.8947087551065327, + "language_loss": 0.71785814, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73948246, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 5579, + "time_per_iteration": 2.4519495964050293 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.02191353, + "balance_loss_mlp": 1.0456152, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.0306401350469225, + "language_loss": 0.81084043, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83252287, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5580, + "time_per_iteration": 2.427481174468994 + }, + { + "auxiliary_loss_clip": 0.01130056, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.04496789, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.8687829543354073, + "language_loss": 0.77912092, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80078757, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5581, + "time_per_iteration": 2.5320229530334473 + }, + { + "auxiliary_loss_clip": 0.01132086, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.02735782, + "balance_loss_mlp": 1.04367673, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 5.02896087449, + "language_loss": 0.74623251, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76800376, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 5582, + "time_per_iteration": 2.421482801437378 + }, + { + "auxiliary_loss_clip": 0.0113015, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.02528524, + "balance_loss_mlp": 1.04456937, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.790512330860928, + "language_loss": 0.82143587, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84315073, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 5583, + "time_per_iteration": 2.4543566703796387 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01044573, + "balance_loss_clip": 1.02991009, + "balance_loss_mlp": 1.04491317, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9267692381394996, + "language_loss": 0.7779209, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79964256, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5584, + "time_per_iteration": 2.6100947856903076 + }, + { + "auxiliary_loss_clip": 0.01129164, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02343249, + "balance_loss_mlp": 1.04359186, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.4758908421399493, + "language_loss": 0.75978506, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78145868, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.85546875, + "step": 5585, + "time_per_iteration": 2.4898715019226074 + }, + { + "auxiliary_loss_clip": 0.01121936, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02170694, + "balance_loss_mlp": 1.04066801, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.4987207146888684, + "language_loss": 0.77731383, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79890364, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5586, + "time_per_iteration": 2.4825005531311035 + }, + { + "auxiliary_loss_clip": 0.01136236, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.03070199, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6235624689574053, + "language_loss": 0.81027555, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83212399, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8828125, + "step": 5587, + "time_per_iteration": 2.486459493637085 + }, + { + "auxiliary_loss_clip": 0.01125436, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.0270915, + "balance_loss_mlp": 1.04548144, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7952449023594161, + "language_loss": 0.67014575, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69180894, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 5588, + "time_per_iteration": 2.435070753097534 + }, + { + "auxiliary_loss_clip": 0.01130516, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02784824, + "balance_loss_mlp": 1.04568088, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.6839710852868943, + "language_loss": 0.69882601, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72057241, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5589, + "time_per_iteration": 2.548051118850708 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.02709961, + "balance_loss_mlp": 1.04461861, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 2.1328325025080987, + "language_loss": 0.66886735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69060349, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 5590, + "time_per_iteration": 2.4735047817230225 + }, + { + "auxiliary_loss_clip": 0.01126204, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.02993059, + "balance_loss_mlp": 1.04570127, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.8322479695472769, + "language_loss": 0.73409903, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75581712, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 5591, + "time_per_iteration": 2.4736244678497314 + }, + { + "auxiliary_loss_clip": 0.01127166, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.04408562, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.9183925576882788, + "language_loss": 0.69446647, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.71615982, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5592, + "time_per_iteration": 2.4232676029205322 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.02366149, + "balance_loss_mlp": 1.0442183, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.4700576130478367, + "language_loss": 0.76281321, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78444564, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5593, + "time_per_iteration": 2.4856812953948975 + }, + { + "auxiliary_loss_clip": 0.01128845, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.02703261, + "balance_loss_mlp": 1.04333365, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.2267028217655516, + "language_loss": 0.71435678, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73609149, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8515625, + "step": 5594, + "time_per_iteration": 2.437554359436035 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.02985501, + "balance_loss_mlp": 1.04690135, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.637052204404589, + "language_loss": 0.80350173, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82528448, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5595, + "time_per_iteration": 5.51651668548584 + }, + { + "auxiliary_loss_clip": 0.0112868, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.01964831, + "balance_loss_mlp": 1.04542542, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.8244163047079407, + "language_loss": 0.81611145, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83773112, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5596, + "time_per_iteration": 2.4959781169891357 + }, + { + "auxiliary_loss_clip": 0.01128091, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.02508509, + "balance_loss_mlp": 1.04461718, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.7014468319312177, + "language_loss": 0.76001227, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78168839, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5597, + "time_per_iteration": 2.4965333938598633 + }, + { + "auxiliary_loss_clip": 0.01126223, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.0233258, + "balance_loss_mlp": 1.04597533, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.8007239192940239, + "language_loss": 0.78937811, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81101304, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 5598, + "time_per_iteration": 2.587813377380371 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02036011, + "balance_loss_mlp": 1.04606342, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4664560154247552, + "language_loss": 0.64197004, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66366023, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 5599, + "time_per_iteration": 2.647618293762207 + }, + { + "auxiliary_loss_clip": 0.0113527, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.02685726, + "balance_loss_mlp": 1.0468514, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.652004853610392, + "language_loss": 0.8172245, + "learning_rate": 3.091819088459249e-06, + "loss": 0.83900994, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 5600, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01050016, + "balance_loss_clip": 1.03369582, + "balance_loss_mlp": 1.04399288, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 3.359102963412802, + "language_loss": 0.82717538, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84898043, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 5601, + "time_per_iteration": 2.4369428157806396 + }, + { + "auxiliary_loss_clip": 0.01127768, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.01911497, + "balance_loss_mlp": 1.04890418, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.6511579237160083, + "language_loss": 0.82726496, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.84887075, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5602, + "time_per_iteration": 2.463291645050049 + }, + { + "auxiliary_loss_clip": 0.01130933, + "auxiliary_loss_mlp": 0.01055384, + "balance_loss_clip": 1.04039955, + "balance_loss_mlp": 1.04712546, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.700541242008466, + "language_loss": 0.70208776, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72395098, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5603, + "time_per_iteration": 2.4309756755828857 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.02392292, + "balance_loss_mlp": 1.04724145, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.625433979180813, + "language_loss": 0.82925308, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8509745, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.86328125, + "step": 5604, + "time_per_iteration": 2.4980010986328125 + }, + { + "auxiliary_loss_clip": 0.01129789, + "auxiliary_loss_mlp": 0.01042861, + "balance_loss_clip": 1.02782226, + "balance_loss_mlp": 1.0447166, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 3.2518642032613654, + "language_loss": 0.73756403, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75929046, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 5605, + "time_per_iteration": 2.4563212394714355 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02520752, + "balance_loss_mlp": 1.04604197, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.772980532366942, + "language_loss": 0.83487791, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85660958, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 5606, + "time_per_iteration": 2.456441640853882 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02549469, + "balance_loss_mlp": 1.0414753, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.7790448991820722, + "language_loss": 0.67335433, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69499022, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5607, + "time_per_iteration": 2.4964821338653564 + }, + { + "auxiliary_loss_clip": 0.01130916, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.02694631, + "balance_loss_mlp": 1.04507923, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.032375572186737, + "language_loss": 0.71093041, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 5608, + "time_per_iteration": 2.5247933864593506 + }, + { + "auxiliary_loss_clip": 0.01128549, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0236311, + "balance_loss_mlp": 1.0446682, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.8968208773724307, + "language_loss": 0.79062563, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83984375, + "step": 5609, + "time_per_iteration": 2.439502477645874 + }, + { + "auxiliary_loss_clip": 0.01129667, + "auxiliary_loss_mlp": 0.01042877, + "balance_loss_clip": 1.02706969, + "balance_loss_mlp": 1.04544735, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 2.0456898754189354, + "language_loss": 0.82218611, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84391159, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5610, + "time_per_iteration": 2.502028226852417 + }, + { + "auxiliary_loss_clip": 0.01123686, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.04264688, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8264685829582996, + "language_loss": 0.81998217, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84162486, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5611, + "time_per_iteration": 2.4255177974700928 + }, + { + "auxiliary_loss_clip": 0.01130986, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02728975, + "balance_loss_mlp": 1.04550552, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.5753494383615703, + "language_loss": 0.79407716, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81583023, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5612, + "time_per_iteration": 2.537048578262329 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.02212596, + "balance_loss_mlp": 1.04021907, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.519050824799004, + "language_loss": 0.70024467, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72185683, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5613, + "time_per_iteration": 2.570373773574829 + }, + { + "auxiliary_loss_clip": 0.01129945, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.02203548, + "balance_loss_mlp": 1.04490113, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.6646408753448763, + "language_loss": 0.79615057, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81782216, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5614, + "time_per_iteration": 2.4379053115844727 + }, + { + "auxiliary_loss_clip": 0.01126744, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02331161, + "balance_loss_mlp": 1.04260945, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.8534958586083128, + "language_loss": 0.90879035, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93045861, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5615, + "time_per_iteration": 2.4876632690429688 + }, + { + "auxiliary_loss_clip": 0.0112252, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.04105914, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6970154369052728, + "language_loss": 0.80636102, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82798827, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5616, + "time_per_iteration": 2.476569175720215 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04379678, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.5053489219363754, + "language_loss": 0.84079826, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86255258, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 5617, + "time_per_iteration": 2.4204065799713135 + }, + { + "auxiliary_loss_clip": 0.01125211, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.02190411, + "balance_loss_mlp": 1.04171932, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.648273719366553, + "language_loss": 0.80173457, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82335079, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5618, + "time_per_iteration": 2.4789302349090576 + }, + { + "auxiliary_loss_clip": 0.01128326, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.01895535, + "balance_loss_mlp": 1.04367077, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9548255306646998, + "language_loss": 0.70458674, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72621119, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5619, + "time_per_iteration": 2.4674489498138428 + }, + { + "auxiliary_loss_clip": 0.01127452, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_clip": 1.0322814, + "balance_loss_mlp": 1.04403424, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 5.009208052913787, + "language_loss": 0.69223797, + "learning_rate": 3.085284660993821e-06, + "loss": 0.7139833, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5620, + "time_per_iteration": 2.475889205932617 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01046185, + "balance_loss_clip": 1.03159392, + "balance_loss_mlp": 1.04497766, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.0914960236262075, + "language_loss": 0.67498147, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69671446, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5621, + "time_per_iteration": 2.4732306003570557 + }, + { + "auxiliary_loss_clip": 0.01124388, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.02258897, + "balance_loss_mlp": 1.04336381, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.811430245584347, + "language_loss": 0.82714671, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84875631, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 5622, + "time_per_iteration": 2.5028531551361084 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.0279355, + "balance_loss_mlp": 1.04111528, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4271980952069887, + "language_loss": 0.73785996, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75950313, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5623, + "time_per_iteration": 2.483354091644287 + }, + { + "auxiliary_loss_clip": 0.01044412, + "auxiliary_loss_mlp": 0.01001556, + "balance_loss_clip": 0.99976796, + "balance_loss_mlp": 1.01787817, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7308868621653948, + "language_loss": 0.54898107, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56944072, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.265625, + "step": 5624, + "time_per_iteration": 3.2154293060302734 + }, + { + "auxiliary_loss_clip": 0.01128701, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.02536166, + "balance_loss_mlp": 1.04464245, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.114382300094, + "language_loss": 0.73013008, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75182486, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5625, + "time_per_iteration": 2.4632089138031006 + }, + { + "auxiliary_loss_clip": 0.01129587, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02315879, + "balance_loss_mlp": 1.04408085, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.7442247016960708, + "language_loss": 0.70501375, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72669238, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5626, + "time_per_iteration": 2.4782652854919434 + }, + { + "auxiliary_loss_clip": 0.01123049, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.04265583, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.496721640957227, + "language_loss": 0.81184483, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83341312, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5627, + "time_per_iteration": 2.48683762550354 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.02332532, + "balance_loss_mlp": 1.04643917, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.112092075284961, + "language_loss": 0.80725849, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82897604, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5628, + "time_per_iteration": 2.485978841781616 + }, + { + "auxiliary_loss_clip": 0.01125942, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.01716328, + "balance_loss_mlp": 1.04272234, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9378827683544937, + "language_loss": 0.77360773, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79518872, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 5629, + "time_per_iteration": 2.459749937057495 + }, + { + "auxiliary_loss_clip": 0.0112767, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02604353, + "balance_loss_mlp": 1.0426172, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.750727836719773, + "language_loss": 0.84873146, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87043452, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.84765625, + "step": 5630, + "time_per_iteration": 2.502168655395508 + }, + { + "auxiliary_loss_clip": 0.01128287, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02593017, + "balance_loss_mlp": 1.04496086, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.44277401951878, + "language_loss": 0.71778762, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73946661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5631, + "time_per_iteration": 2.4541988372802734 + }, + { + "auxiliary_loss_clip": 0.01044995, + "auxiliary_loss_mlp": 0.01006836, + "balance_loss_clip": 1.0050118, + "balance_loss_mlp": 1.01844144, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.84858361279948, + "language_loss": 0.56171906, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58223736, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5632, + "time_per_iteration": 3.130112409591675 + }, + { + "auxiliary_loss_clip": 0.01126092, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01925933, + "balance_loss_mlp": 1.04301071, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.4746675536042473, + "language_loss": 0.80288029, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82448882, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5633, + "time_per_iteration": 2.4772210121154785 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01671278, + "balance_loss_mlp": 1.04355168, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.3860801146544692, + "language_loss": 0.59222949, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61380345, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5634, + "time_per_iteration": 2.490783214569092 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.01930678, + "balance_loss_mlp": 1.04328096, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.758176339753219, + "language_loss": 0.92591304, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94749641, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5635, + "time_per_iteration": 2.4895272254943848 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.01594758, + "balance_loss_mlp": 1.04428411, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7397877385381144, + "language_loss": 0.74791968, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.76945299, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5636, + "time_per_iteration": 2.4868686199188232 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.02264357, + "balance_loss_mlp": 1.04291928, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.533650755617547, + "language_loss": 0.83216572, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85377115, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5637, + "time_per_iteration": 5.43249249458313 + }, + { + "auxiliary_loss_clip": 0.0112926, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.02837586, + "balance_loss_mlp": 1.04624391, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.6200031021198193, + "language_loss": 0.70037901, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72211778, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5638, + "time_per_iteration": 2.430814504623413 + }, + { + "auxiliary_loss_clip": 0.01128885, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.02993131, + "balance_loss_mlp": 1.0461942, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.644027939558444, + "language_loss": 0.80699074, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82872897, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5639, + "time_per_iteration": 2.5219810009002686 + }, + { + "auxiliary_loss_clip": 0.01129363, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.02714002, + "balance_loss_mlp": 1.044734, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.006873412015597, + "language_loss": 0.67907631, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70079535, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5640, + "time_per_iteration": 2.4252562522888184 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.02229738, + "balance_loss_mlp": 1.0432744, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.561334672972187, + "language_loss": 0.70158339, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72319156, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5641, + "time_per_iteration": 2.4703073501586914 + }, + { + "auxiliary_loss_clip": 0.01129782, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.02881122, + "balance_loss_mlp": 1.04692698, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7323035027878293, + "language_loss": 0.87336594, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89509839, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5642, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01119376, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.04361117, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.899951429632433, + "language_loss": 0.83783317, + "learning_rate": 3.077749724868924e-06, + "loss": 0.85933256, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 5643, + "time_per_iteration": 2.454176902770996 + }, + { + "auxiliary_loss_clip": 0.01122874, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02779329, + "balance_loss_mlp": 1.04303253, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6286036888414737, + "language_loss": 0.76940101, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79104799, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5644, + "time_per_iteration": 2.46893048286438 + }, + { + "auxiliary_loss_clip": 0.01124612, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.02898121, + "balance_loss_mlp": 1.04242706, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.638882451456986, + "language_loss": 0.62893367, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65061837, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5645, + "time_per_iteration": 2.4539859294891357 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02033067, + "balance_loss_mlp": 1.04122853, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.1237754414429637, + "language_loss": 0.76276195, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78429914, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5646, + "time_per_iteration": 2.4913554191589355 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.04360342, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 1.9547585113359744, + "language_loss": 0.79175937, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81348741, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.85546875, + "step": 5647, + "time_per_iteration": 2.521603584289551 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.02541864, + "balance_loss_mlp": 1.04706085, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.87789373580567, + "language_loss": 0.77358377, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79527068, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 5648, + "time_per_iteration": 2.4812231063842773 + }, + { + "auxiliary_loss_clip": 0.0104448, + "auxiliary_loss_mlp": 0.01001624, + "balance_loss_clip": 0.99964541, + "balance_loss_mlp": 1.01817107, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7825270224300925, + "language_loss": 0.56261832, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5830794, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.26171875, + "step": 5649, + "time_per_iteration": 3.1050350666046143 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.02804756, + "balance_loss_mlp": 1.0422622, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.5021179324123226, + "language_loss": 0.85269898, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87436557, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5650, + "time_per_iteration": 2.5013816356658936 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.01540327, + "balance_loss_mlp": 1.04317355, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.6954461839420942, + "language_loss": 0.70868433, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73020875, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5651, + "time_per_iteration": 2.579455852508545 + }, + { + "auxiliary_loss_clip": 0.01123721, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.02354813, + "balance_loss_mlp": 1.04347372, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.7042541017727943, + "language_loss": 0.81267643, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83428693, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5652, + "time_per_iteration": 2.4690871238708496 + }, + { + "auxiliary_loss_clip": 0.01128696, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02670693, + "balance_loss_mlp": 1.04464078, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.8642865553854127, + "language_loss": 0.77315342, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79485226, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5653, + "time_per_iteration": 2.4836156368255615 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.02225959, + "balance_loss_mlp": 1.04310441, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 4.3033812467068895, + "language_loss": 0.85072839, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87232912, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5654, + "time_per_iteration": 2.4139702320098877 + }, + { + "auxiliary_loss_clip": 0.01122836, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.02853489, + "balance_loss_mlp": 1.04074049, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.132089356193866, + "language_loss": 0.65128249, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67293918, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5655, + "time_per_iteration": 2.475292444229126 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.02781832, + "balance_loss_mlp": 1.04365194, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4436453355930483, + "language_loss": 0.76766688, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78933358, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5656, + "time_per_iteration": 2.550999879837036 + }, + { + "auxiliary_loss_clip": 0.01130894, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.04413342, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.5863892165941962, + "language_loss": 0.82438695, + "learning_rate": 3.073152647447525e-06, + "loss": 0.84608912, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5657, + "time_per_iteration": 2.4573473930358887 + }, + { + "auxiliary_loss_clip": 0.01122831, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.02675629, + "balance_loss_mlp": 1.04342616, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.6511746791476316, + "language_loss": 0.85153604, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87317222, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 5658, + "time_per_iteration": 2.505319833755493 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.0001955, + "balance_loss_mlp": 1.01611352, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8147477326465351, + "language_loss": 0.60012162, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62056863, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.26171875, + "step": 5659, + "time_per_iteration": 3.024125814437866 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.02190423, + "balance_loss_mlp": 1.04398155, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.936270792227836, + "language_loss": 0.67855251, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70013559, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 5660, + "time_per_iteration": 2.5009706020355225 + }, + { + "auxiliary_loss_clip": 0.01125634, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.0307138, + "balance_loss_mlp": 1.04558277, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.6106101267942714, + "language_loss": 0.67213613, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69384885, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80078125, + "step": 5661, + "time_per_iteration": 2.501034736633301 + }, + { + "auxiliary_loss_clip": 0.01123137, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.0241766, + "balance_loss_mlp": 1.04442382, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.9145784194305409, + "language_loss": 0.78845918, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81006938, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5662, + "time_per_iteration": 2.4689018726348877 + }, + { + "auxiliary_loss_clip": 0.01123734, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.02395773, + "balance_loss_mlp": 1.04277706, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.9415115692891318, + "language_loss": 0.73675144, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75838, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5663, + "time_per_iteration": 2.4802587032318115 + }, + { + "auxiliary_loss_clip": 0.01121343, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.02101541, + "balance_loss_mlp": 1.04342198, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 2.0753473798431608, + "language_loss": 0.85900557, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88056058, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.77734375, + "step": 5664, + "time_per_iteration": 2.46343731880188 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.0459125, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.782528704092853, + "language_loss": 0.69047546, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71208799, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.81640625, + "step": 5665, + "time_per_iteration": 2.4448721408843994 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.02006817, + "balance_loss_mlp": 1.04218054, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.296518315240935, + "language_loss": 0.72806692, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.74966413, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8203125, + "step": 5666, + "time_per_iteration": 2.4749717712402344 + }, + { + "auxiliary_loss_clip": 0.01126484, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.02236485, + "balance_loss_mlp": 1.04428983, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5083890198292058, + "language_loss": 0.73306108, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75469005, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5667, + "time_per_iteration": 2.467684030532837 + }, + { + "auxiliary_loss_clip": 0.0104148, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01518095, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8424548288565059, + "language_loss": 0.6331358, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65357018, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.26367188, + "step": 5668, + "time_per_iteration": 3.233991861343384 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.02460372, + "balance_loss_mlp": 1.04407477, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.1457172939364892, + "language_loss": 0.72030753, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74194676, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 5669, + "time_per_iteration": 2.4226186275482178 + }, + { + "auxiliary_loss_clip": 0.01127607, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02078128, + "balance_loss_mlp": 1.04468203, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9050671295461388, + "language_loss": 0.80285168, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82448041, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5670, + "time_per_iteration": 2.4354984760284424 + }, + { + "auxiliary_loss_clip": 0.01127772, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02122176, + "balance_loss_mlp": 1.04374027, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.5994061750955757, + "language_loss": 0.76886785, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79050225, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5671, + "time_per_iteration": 2.4775397777557373 + }, + { + "auxiliary_loss_clip": 0.01125342, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.02513266, + "balance_loss_mlp": 1.04437792, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.9602332848552635, + "language_loss": 0.74416959, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7658239, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5672, + "time_per_iteration": 2.5027272701263428 + }, + { + "auxiliary_loss_clip": 0.01127201, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.04523087, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.991076139860355, + "language_loss": 0.73781157, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75940639, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.8203125, + "step": 5673, + "time_per_iteration": 2.424955368041992 + }, + { + "auxiliary_loss_clip": 0.01123926, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.02243853, + "balance_loss_mlp": 1.04432535, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.774655206888726, + "language_loss": 0.79900169, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8206054, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5674, + "time_per_iteration": 2.490407705307007 + }, + { + "auxiliary_loss_clip": 0.01041345, + "auxiliary_loss_mlp": 0.01001058, + "balance_loss_clip": 0.99942493, + "balance_loss_mlp": 1.01517344, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7963469989165133, + "language_loss": 0.56096685, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58139086, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 5675, + "time_per_iteration": 3.223119020462036 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.0270282, + "balance_loss_mlp": 1.04428756, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6179892480447855, + "language_loss": 0.79029286, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81193566, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5676, + "time_per_iteration": 2.4798848628997803 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01836538, + "balance_loss_mlp": 1.0424788, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8072554320592242, + "language_loss": 0.85598934, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87755597, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5677, + "time_per_iteration": 2.4501733779907227 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.009404852791833, + "language_loss": 0.79283166, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81447315, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5678, + "time_per_iteration": 4.054651260375977 + }, + { + "auxiliary_loss_clip": 0.01123013, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.01983547, + "balance_loss_mlp": 1.04135132, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.8818653655236122, + "language_loss": 0.74546856, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76703185, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.81640625, + "step": 5679, + "time_per_iteration": 3.9024462699890137 + }, + { + "auxiliary_loss_clip": 0.01042201, + "auxiliary_loss_mlp": 0.01001255, + "balance_loss_clip": 0.99970549, + "balance_loss_mlp": 1.01624846, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7519133883291979, + "language_loss": 0.59481025, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61524487, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.25976562, + "step": 5680, + "time_per_iteration": 3.152480125427246 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.01953864, + "balance_loss_mlp": 1.04320455, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 2.208026502208574, + "language_loss": 0.7233687, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74491525, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5681, + "time_per_iteration": 2.4450337886810303 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.02798879, + "balance_loss_mlp": 1.04110432, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.0075854608407058, + "language_loss": 0.7144351, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7360431, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5682, + "time_per_iteration": 2.53000807762146 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.02446055, + "balance_loss_mlp": 1.04079127, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.4570201559150766, + "language_loss": 0.8396616, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86125666, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5683, + "time_per_iteration": 2.511646270751953 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.03189898, + "balance_loss_mlp": 1.04384482, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.5567263249521965, + "language_loss": 0.70622635, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72796011, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.82421875, + "step": 5684, + "time_per_iteration": 2.58811616897583 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02119195, + "balance_loss_mlp": 1.0428822, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.480860615854928, + "language_loss": 0.75386423, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77541268, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.78125, + "step": 5685, + "time_per_iteration": 2.485405445098877 + }, + { + "auxiliary_loss_clip": 0.01120925, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.02399325, + "balance_loss_mlp": 1.04268134, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.6707381387615057, + "language_loss": 0.70186603, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72344351, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.78125, + "step": 5686, + "time_per_iteration": 2.5536224842071533 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.02681327, + "balance_loss_mlp": 1.04087019, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.6880234800017844, + "language_loss": 0.77629769, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79793721, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5687, + "time_per_iteration": 2.4526383876800537 + }, + { + "auxiliary_loss_clip": 0.01122013, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.01947296, + "balance_loss_mlp": 1.04425466, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.7522626505921908, + "language_loss": 0.86505169, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88661563, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 5688, + "time_per_iteration": 2.457873821258545 + }, + { + "auxiliary_loss_clip": 0.01129554, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.02274323, + "balance_loss_mlp": 1.04438853, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6277101200549902, + "language_loss": 0.79875666, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82043588, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5689, + "time_per_iteration": 2.4494895935058594 + }, + { + "auxiliary_loss_clip": 0.01124588, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.04300821, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.0745412821804057, + "language_loss": 0.7351048, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75673485, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5690, + "time_per_iteration": 2.448133945465088 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02270865, + "balance_loss_mlp": 1.03998768, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.433761635396741, + "language_loss": 0.7631194, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78468573, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8046875, + "step": 5691, + "time_per_iteration": 2.479569435119629 + }, + { + "auxiliary_loss_clip": 0.01120907, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.02782106, + "balance_loss_mlp": 1.0415988, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5387604656502187, + "language_loss": 0.68159282, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70321631, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 5692, + "time_per_iteration": 2.490466356277466 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01042741, + "balance_loss_clip": 1.02682638, + "balance_loss_mlp": 1.04275179, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.6924087388900606, + "language_loss": 0.72292894, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74460298, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5693, + "time_per_iteration": 2.451026439666748 + }, + { + "auxiliary_loss_clip": 0.01122133, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.02408743, + "balance_loss_mlp": 1.0417974, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7157866574439644, + "language_loss": 0.75877678, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.78037089, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8046875, + "step": 5694, + "time_per_iteration": 2.499997615814209 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.02320051, + "balance_loss_mlp": 1.04253125, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.9697512050835562, + "language_loss": 0.79815507, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81972229, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 5695, + "time_per_iteration": 2.4279983043670654 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.0248661, + "balance_loss_mlp": 1.04168487, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.7314755849975545, + "language_loss": 0.73487073, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75648957, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5696, + "time_per_iteration": 2.507782459259033 + }, + { + "auxiliary_loss_clip": 0.01122963, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.03301835, + "balance_loss_mlp": 1.0419805, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6676891559017708, + "language_loss": 0.70874155, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73044771, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5697, + "time_per_iteration": 2.4868175983428955 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01949954, + "balance_loss_mlp": 1.04456246, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.6712097888676536, + "language_loss": 0.81875223, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84031999, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 5698, + "time_per_iteration": 2.500499725341797 + }, + { + "auxiliary_loss_clip": 0.01121288, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02712393, + "balance_loss_mlp": 1.03982306, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 1.9988541020523172, + "language_loss": 0.69163442, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71328437, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8125, + "step": 5699, + "time_per_iteration": 2.4522063732147217 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.0424068, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.0139701241951196, + "language_loss": 0.72246462, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74404591, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5700, + "time_per_iteration": 2.4942879676818848 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02018046, + "balance_loss_mlp": 1.04403377, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.026861038115517, + "language_loss": 0.81818259, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83976114, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5701, + "time_per_iteration": 2.4650135040283203 + }, + { + "auxiliary_loss_clip": 0.01124816, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.01886129, + "balance_loss_mlp": 1.04328442, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.616013756330385, + "language_loss": 0.71818215, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73975766, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5702, + "time_per_iteration": 2.446018695831299 + }, + { + "auxiliary_loss_clip": 0.01038258, + "auxiliary_loss_mlp": 0.01007974, + "balance_loss_clip": 1.00623345, + "balance_loss_mlp": 1.01261425, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.78067456401119, + "language_loss": 0.57387871, + "learning_rate": 3.057991990435309e-06, + "loss": 0.5943411, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.2578125, + "step": 5703, + "time_per_iteration": 2.9596943855285645 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.04436553, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.8868866692845514, + "language_loss": 0.74849427, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77017069, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5704, + "time_per_iteration": 2.475206136703491 + }, + { + "auxiliary_loss_clip": 0.01122188, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.0218513, + "balance_loss_mlp": 1.0432725, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.4058395538044572, + "language_loss": 0.73303944, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75461364, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5705, + "time_per_iteration": 2.435140609741211 + }, + { + "auxiliary_loss_clip": 0.0112299, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.0204711, + "balance_loss_mlp": 1.04320812, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 3.54760070735666, + "language_loss": 0.79599071, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81757367, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5706, + "time_per_iteration": 2.4922068119049072 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.04497337, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 1.9921713202453553, + "language_loss": 0.83170593, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85330999, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5707, + "time_per_iteration": 2.441812753677368 + }, + { + "auxiliary_loss_clip": 0.01126551, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.01919019, + "balance_loss_mlp": 1.04623604, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5424527465289883, + "language_loss": 0.75429368, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77589571, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5708, + "time_per_iteration": 2.448415756225586 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02046943, + "balance_loss_mlp": 1.04284358, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6552343197625845, + "language_loss": 0.81159383, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83314145, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 5709, + "time_per_iteration": 2.488879919052124 + }, + { + "auxiliary_loss_clip": 0.01125291, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.0235213, + "balance_loss_mlp": 1.04413152, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.1306910299424677, + "language_loss": 0.79152101, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81316978, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5710, + "time_per_iteration": 2.487224817276001 + }, + { + "auxiliary_loss_clip": 0.01124884, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0209142, + "balance_loss_mlp": 1.04181814, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.821164645381994, + "language_loss": 0.69994622, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72155762, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5711, + "time_per_iteration": 2.471989631652832 + }, + { + "auxiliary_loss_clip": 0.01123068, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02455926, + "balance_loss_mlp": 1.04235482, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.7360043656013842, + "language_loss": 0.68002397, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70164913, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 5712, + "time_per_iteration": 2.440960168838501 + }, + { + "auxiliary_loss_clip": 0.01036814, + "auxiliary_loss_mlp": 0.01007067, + "balance_loss_clip": 1.00550556, + "balance_loss_mlp": 1.011006, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8415582534154722, + "language_loss": 0.58101094, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60144973, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.2578125, + "step": 5713, + "time_per_iteration": 3.018573045730591 + }, + { + "auxiliary_loss_clip": 0.01122962, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.04283524, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6636797952259372, + "language_loss": 0.80745685, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82911092, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5714, + "time_per_iteration": 2.4916322231292725 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02519822, + "balance_loss_mlp": 1.04508591, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.759201097406795, + "language_loss": 0.71844554, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7401129, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5715, + "time_per_iteration": 2.468292474746704 + }, + { + "auxiliary_loss_clip": 0.01036063, + "auxiliary_loss_mlp": 0.01006756, + "balance_loss_clip": 1.00499201, + "balance_loss_mlp": 1.01020741, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8941035310387452, + "language_loss": 0.65942305, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67985129, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 5716, + "time_per_iteration": 3.101933717727661 + }, + { + "auxiliary_loss_clip": 0.0112152, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.02705014, + "balance_loss_mlp": 1.04254961, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.0405702698755657, + "language_loss": 0.74612904, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76775646, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5717, + "time_per_iteration": 2.426793098449707 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01042851, + "balance_loss_clip": 1.02894473, + "balance_loss_mlp": 1.0413748, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.6999619338826393, + "language_loss": 0.7507081, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77236706, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5718, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02563679, + "balance_loss_mlp": 1.04245746, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9991347741656986, + "language_loss": 0.63971305, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66137218, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5719, + "time_per_iteration": 2.5236892700195312 + }, + { + "auxiliary_loss_clip": 0.01124826, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.0418756, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.111950804429908, + "language_loss": 0.73612356, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75775748, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 5720, + "time_per_iteration": 5.3536376953125 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02520978, + "balance_loss_mlp": 1.04300022, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.805745396214866, + "language_loss": 0.74198145, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76362252, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5721, + "time_per_iteration": 2.4301607608795166 + }, + { + "auxiliary_loss_clip": 0.01126876, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.0286088, + "balance_loss_mlp": 1.04481733, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 3.5063882769532313, + "language_loss": 0.80132651, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82303661, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5722, + "time_per_iteration": 2.411731243133545 + }, + { + "auxiliary_loss_clip": 0.01122709, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01809728, + "balance_loss_mlp": 1.04312289, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5863267197766868, + "language_loss": 0.8194539, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84100199, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5723, + "time_per_iteration": 2.476672410964966 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.02559686, + "balance_loss_mlp": 1.0428493, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.852885568649272, + "language_loss": 0.8147676, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83640903, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5724, + "time_per_iteration": 2.4115889072418213 + }, + { + "auxiliary_loss_clip": 0.01125316, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.0210526, + "balance_loss_mlp": 1.04397368, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.759268883551978, + "language_loss": 0.6919744, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71358848, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5725, + "time_per_iteration": 2.589571714401245 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.02578139, + "balance_loss_mlp": 1.04464412, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4578739764018875, + "language_loss": 0.69519544, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71692783, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5726, + "time_per_iteration": 2.4600956439971924 + }, + { + "auxiliary_loss_clip": 0.01123936, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.02222002, + "balance_loss_mlp": 1.0427928, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.656148044371735, + "language_loss": 0.73426235, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.7558654, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5727, + "time_per_iteration": 2.5102531909942627 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02636731, + "balance_loss_mlp": 1.04398954, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.8280399137078096, + "language_loss": 0.87897557, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90064341, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5728, + "time_per_iteration": 2.4304542541503906 + }, + { + "auxiliary_loss_clip": 0.01122947, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.03106284, + "balance_loss_mlp": 1.04264569, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.0505664478102426, + "language_loss": 0.70451075, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72619152, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5729, + "time_per_iteration": 2.4979374408721924 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01952362, + "balance_loss_mlp": 1.0427525, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.7284434335955414, + "language_loss": 0.73995942, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7615242, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5730, + "time_per_iteration": 2.4471776485443115 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.02693152, + "balance_loss_mlp": 1.04263377, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.104777326243209, + "language_loss": 0.80005515, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82170659, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5731, + "time_per_iteration": 2.454735279083252 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.02953923, + "balance_loss_mlp": 1.04394484, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.039149215632527, + "language_loss": 0.78837991, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81006193, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 5732, + "time_per_iteration": 2.4177064895629883 + }, + { + "auxiliary_loss_clip": 0.01043649, + "auxiliary_loss_mlp": 0.01003776, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.01788378, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7440231134556253, + "language_loss": 0.53498071, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55545497, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.2578125, + "step": 5733, + "time_per_iteration": 3.0976667404174805 + }, + { + "auxiliary_loss_clip": 0.0112691, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.02806389, + "balance_loss_mlp": 1.04630947, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.6025085195413686, + "language_loss": 0.83345532, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85515279, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5734, + "time_per_iteration": 2.462327718734741 + }, + { + "auxiliary_loss_clip": 0.01125766, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.04382658, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.7233898634254525, + "language_loss": 0.9245038, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94610149, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5735, + "time_per_iteration": 2.600933790206909 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.02607846, + "balance_loss_mlp": 1.04662871, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.628548106881684, + "language_loss": 0.76666284, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78837371, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5736, + "time_per_iteration": 2.4607973098754883 + }, + { + "auxiliary_loss_clip": 0.0113014, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.02948046, + "balance_loss_mlp": 1.04773998, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.59823002014571, + "language_loss": 0.78745639, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80919576, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5737, + "time_per_iteration": 2.5059142112731934 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.04445243, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.0456946138928767, + "language_loss": 0.71714234, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73884267, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5738, + "time_per_iteration": 2.4374310970306396 + }, + { + "auxiliary_loss_clip": 0.01129235, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.02727044, + "balance_loss_mlp": 1.04496205, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.8999072115309161, + "language_loss": 0.81518626, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83690214, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5739, + "time_per_iteration": 2.559990406036377 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.04620492, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.6856273454827275, + "language_loss": 0.8322401, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85397214, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5740, + "time_per_iteration": 2.4684722423553467 + }, + { + "auxiliary_loss_clip": 0.01127563, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04611385, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.03424253553345, + "language_loss": 0.77135098, + "learning_rate": 3.045403886269181e-06, + "loss": 0.7930122, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8125, + "step": 5741, + "time_per_iteration": 2.48624587059021 + }, + { + "auxiliary_loss_clip": 0.01125981, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.02226019, + "balance_loss_mlp": 1.04276562, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.4993687582247586, + "language_loss": 0.77224493, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79387349, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 5742, + "time_per_iteration": 2.5046300888061523 + }, + { + "auxiliary_loss_clip": 0.01123657, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04310095, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.823337430242114, + "language_loss": 0.76346177, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78509557, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5743, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.01124183, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.02278566, + "balance_loss_mlp": 1.04435802, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.5691807126711539, + "language_loss": 0.70255435, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72416371, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5744, + "time_per_iteration": 2.497314929962158 + }, + { + "auxiliary_loss_clip": 0.01121947, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.02205133, + "balance_loss_mlp": 1.04318309, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.629619176768893, + "language_loss": 0.79692256, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81850678, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 5745, + "time_per_iteration": 2.5154099464416504 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.04556072, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.7858540966841563, + "language_loss": 0.88775939, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9094578, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5746, + "time_per_iteration": 2.436028003692627 + }, + { + "auxiliary_loss_clip": 0.01129654, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.02438855, + "balance_loss_mlp": 1.04509354, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8755596522528313, + "language_loss": 0.64010286, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66179693, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 5747, + "time_per_iteration": 2.465817451477051 + }, + { + "auxiliary_loss_clip": 0.0112633, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02193761, + "balance_loss_mlp": 1.04486203, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5413680181151455, + "language_loss": 0.72813559, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74975884, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5748, + "time_per_iteration": 2.566849946975708 + }, + { + "auxiliary_loss_clip": 0.01123147, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.020715, + "balance_loss_mlp": 1.04517043, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6451707518978071, + "language_loss": 0.75697249, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77854693, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.77734375, + "step": 5749, + "time_per_iteration": 2.5068271160125732 + }, + { + "auxiliary_loss_clip": 0.01036655, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 1.00293088, + "balance_loss_mlp": 1.01066136, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8931526891439046, + "language_loss": 0.62754983, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64796478, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.25976562, + "step": 5750, + "time_per_iteration": 2.930236577987671 + }, + { + "auxiliary_loss_clip": 0.01119501, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.01933062, + "balance_loss_mlp": 1.04268134, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.1199041216122314, + "language_loss": 0.80762947, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82915652, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5751, + "time_per_iteration": 2.4710936546325684 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.03268027, + "balance_loss_mlp": 1.04408574, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 3.882107217624466, + "language_loss": 0.83630323, + "learning_rate": 3.041749247409439e-06, + "loss": 0.85798407, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 5752, + "time_per_iteration": 2.421095132827759 + }, + { + "auxiliary_loss_clip": 0.01036836, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_clip": 1.00014234, + "balance_loss_mlp": 1.01131189, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7425573992046552, + "language_loss": 0.63106978, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.6514585, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.25585938, + "step": 5753, + "time_per_iteration": 2.960430383682251 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.0252701, + "balance_loss_mlp": 1.0433172, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7337780765213762, + "language_loss": 0.70964289, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73127007, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5754, + "time_per_iteration": 2.473090171813965 + }, + { + "auxiliary_loss_clip": 0.01126645, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01889825, + "balance_loss_mlp": 1.04436386, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 3.1958037374869357, + "language_loss": 0.72880316, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75040269, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5755, + "time_per_iteration": 2.486187219619751 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.01985335, + "balance_loss_mlp": 1.04448533, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6620890991055186, + "language_loss": 0.72366977, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74523616, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5756, + "time_per_iteration": 2.6883044242858887 + }, + { + "auxiliary_loss_clip": 0.01036738, + "auxiliary_loss_mlp": 0.01004698, + "balance_loss_clip": 1.00295758, + "balance_loss_mlp": 1.01152658, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7127234008063932, + "language_loss": 0.62522227, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64563662, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25195312, + "step": 5757, + "time_per_iteration": 3.0644619464874268 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01006374, + "balance_loss_clip": 1.00465703, + "balance_loss_mlp": 1.01123941, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8513650993905141, + "language_loss": 0.59153563, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61196613, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.25390625, + "step": 5758, + "time_per_iteration": 3.0601916313171387 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02576697, + "balance_loss_mlp": 1.04562724, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8469236817688628, + "language_loss": 0.71498728, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73664641, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5759, + "time_per_iteration": 2.4722588062286377 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.03079295, + "balance_loss_mlp": 1.04248834, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.8727439754442439, + "language_loss": 0.83008277, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85175675, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 5760, + "time_per_iteration": 2.5002012252807617 + }, + { + "auxiliary_loss_clip": 0.01035648, + "auxiliary_loss_mlp": 0.01005512, + "balance_loss_clip": 1.00358045, + "balance_loss_mlp": 1.01033783, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8745886359800412, + "language_loss": 0.5653646, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58577621, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.25390625, + "step": 5761, + "time_per_iteration": 3.0950896739959717 + }, + { + "auxiliary_loss_clip": 0.01120096, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.04127657, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.0018538772922883, + "language_loss": 0.95053494, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97212291, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 5762, + "time_per_iteration": 5.290884256362915 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.01935804, + "balance_loss_mlp": 1.0417943, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 2.194288284173203, + "language_loss": 0.69335818, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71493888, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5763, + "time_per_iteration": 2.5411787033081055 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.02862906, + "balance_loss_mlp": 1.0458554, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.0099592928074497, + "language_loss": 0.83589876, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85765183, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5764, + "time_per_iteration": 2.48040771484375 + }, + { + "auxiliary_loss_clip": 0.01123556, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04343057, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.159805793212971, + "language_loss": 0.67403859, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69560707, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5765, + "time_per_iteration": 2.502297878265381 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.02291703, + "balance_loss_mlp": 1.04937232, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 2.083918060213648, + "language_loss": 0.77861524, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80028939, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5766, + "time_per_iteration": 2.465325355529785 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.0215292, + "balance_loss_mlp": 1.04335451, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5377908130541305, + "language_loss": 0.73529994, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75687665, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5767, + "time_per_iteration": 2.4656143188476562 + }, + { + "auxiliary_loss_clip": 0.01127128, + "auxiliary_loss_mlp": 0.01040956, + "balance_loss_clip": 1.02538764, + "balance_loss_mlp": 1.04720497, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.233359981487989, + "language_loss": 0.77795279, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79963356, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.796875, + "step": 5768, + "time_per_iteration": 2.4951131343841553 + }, + { + "auxiliary_loss_clip": 0.0103542, + "auxiliary_loss_mlp": 0.01008769, + "balance_loss_clip": 1.00693345, + "balance_loss_mlp": 1.01015306, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7739728920865777, + "language_loss": 0.57404095, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59448284, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.25195312, + "step": 5769, + "time_per_iteration": 3.0867085456848145 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_clip": 1.02577174, + "balance_loss_mlp": 1.04723847, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.3808887206764244, + "language_loss": 0.85625517, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87804437, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 5770, + "time_per_iteration": 2.4296391010284424 + }, + { + "auxiliary_loss_clip": 0.0103532, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00152194, + "balance_loss_mlp": 1.01001954, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7779481231658855, + "language_loss": 0.59827816, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61866474, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 5771, + "time_per_iteration": 2.858952522277832 + }, + { + "auxiliary_loss_clip": 0.0112466, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.04478061, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 2.6949016474557475, + "language_loss": 0.71790159, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73961502, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5772, + "time_per_iteration": 2.629441976547241 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04398608, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4939658014033708, + "language_loss": 0.76165307, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78332114, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5773, + "time_per_iteration": 2.5281848907470703 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_clip": 1.02811444, + "balance_loss_mlp": 1.04447389, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.0748415381607717, + "language_loss": 0.70428938, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72599673, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5774, + "time_per_iteration": 2.4930198192596436 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.02630043, + "balance_loss_mlp": 1.04615033, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.6801460468757594, + "language_loss": 0.76410925, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78576738, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5775, + "time_per_iteration": 2.501793622970581 + }, + { + "auxiliary_loss_clip": 0.01129926, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.02778447, + "balance_loss_mlp": 1.04408336, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.2786937073337956, + "language_loss": 0.78098702, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.8027252, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5776, + "time_per_iteration": 2.547508716583252 + }, + { + "auxiliary_loss_clip": 0.01034004, + "auxiliary_loss_mlp": 0.01012403, + "balance_loss_clip": 1.01073408, + "balance_loss_mlp": 1.00864577, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8366551978688649, + "language_loss": 0.63353252, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65399659, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.25390625, + "step": 5777, + "time_per_iteration": 3.118314743041992 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.03253984, + "balance_loss_mlp": 1.04198289, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.1982821508403956, + "language_loss": 0.64399695, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66572136, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5778, + "time_per_iteration": 2.5438621044158936 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.03947175, + "balance_loss_mlp": 1.04425573, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.7264375706792277, + "language_loss": 0.71190178, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73372632, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5779, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.01128331, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04354596, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.874853063849031, + "language_loss": 0.62552947, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64729369, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5780, + "time_per_iteration": 2.5024712085723877 + }, + { + "auxiliary_loss_clip": 0.01124027, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_clip": 1.0278883, + "balance_loss_mlp": 1.04260445, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.604616792806945, + "language_loss": 0.72373253, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74539268, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5781, + "time_per_iteration": 2.471235513687134 + }, + { + "auxiliary_loss_clip": 0.01125801, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.03416181, + "balance_loss_mlp": 1.04316914, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.0942988164582266, + "language_loss": 0.76741016, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78917271, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.828125, + "step": 5782, + "time_per_iteration": 2.4831414222717285 + }, + { + "auxiliary_loss_clip": 0.01123989, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.02140737, + "balance_loss_mlp": 1.04221606, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9917493867858045, + "language_loss": 0.62131268, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64291537, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5783, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.01119293, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.01833832, + "balance_loss_mlp": 1.0410347, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.6546414102961637, + "language_loss": 0.88575971, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90727258, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 5784, + "time_per_iteration": 2.5281262397766113 + }, + { + "auxiliary_loss_clip": 0.01121731, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.02219379, + "balance_loss_mlp": 1.04283547, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.7834042756277195, + "language_loss": 0.81664282, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83822948, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 5785, + "time_per_iteration": 2.444279432296753 + }, + { + "auxiliary_loss_clip": 0.01126224, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.04558039, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.6236713309130966, + "language_loss": 0.80679643, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82843316, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5786, + "time_per_iteration": 2.506639242172241 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.01860058, + "balance_loss_mlp": 1.0443275, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5789553434659291, + "language_loss": 0.74868137, + "learning_rate": 3.030089132216836e-06, + "loss": 0.77025199, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5787, + "time_per_iteration": 2.4305543899536133 + }, + { + "auxiliary_loss_clip": 0.01122978, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.02276862, + "balance_loss_mlp": 1.04133916, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.685205733624188, + "language_loss": 0.81207466, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83367729, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.81640625, + "step": 5788, + "time_per_iteration": 2.58461332321167 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.02333927, + "balance_loss_mlp": 1.04716599, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.7599288417752579, + "language_loss": 0.85399663, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87569183, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5789, + "time_per_iteration": 2.4460527896881104 + }, + { + "auxiliary_loss_clip": 0.01127788, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.03517616, + "balance_loss_mlp": 1.04420161, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9217222904205502, + "language_loss": 0.84973574, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87151778, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5790, + "time_per_iteration": 2.4690423011779785 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.0295074, + "balance_loss_mlp": 1.04403305, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.0218239222922785, + "language_loss": 0.82098949, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8426879, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5791, + "time_per_iteration": 2.4949092864990234 + }, + { + "auxiliary_loss_clip": 0.01124824, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02386248, + "balance_loss_mlp": 1.04235744, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.7691925727921667, + "language_loss": 0.77531552, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79695195, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5792, + "time_per_iteration": 2.5464468002319336 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.04100966, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5041206153246893, + "language_loss": 0.81592953, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83745086, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5793, + "time_per_iteration": 2.454220771789551 + }, + { + "auxiliary_loss_clip": 0.01126572, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.04426205, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7524057524538565, + "language_loss": 0.76222527, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78395712, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5794, + "time_per_iteration": 2.485077142715454 + }, + { + "auxiliary_loss_clip": 0.01121136, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02165866, + "balance_loss_mlp": 1.04168189, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.2347385462744165, + "language_loss": 0.56926, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59083712, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5795, + "time_per_iteration": 2.4378490447998047 + }, + { + "auxiliary_loss_clip": 0.01121205, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02250659, + "balance_loss_mlp": 1.04285967, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.137832792929428, + "language_loss": 0.82437253, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84595084, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 5796, + "time_per_iteration": 2.5187671184539795 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.0153811, + "balance_loss_mlp": 1.043782, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.7817355656860259, + "language_loss": 0.83580989, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85730731, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5797, + "time_per_iteration": 2.518832206726074 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02017224, + "balance_loss_mlp": 1.04206371, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7199370679887815, + "language_loss": 0.73215538, + "learning_rate": 3.026414616539167e-06, + "loss": 0.7537021, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5798, + "time_per_iteration": 2.499967575073242 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.02660251, + "balance_loss_mlp": 1.04203498, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.0872044860332597, + "language_loss": 0.75936413, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78101552, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5799, + "time_per_iteration": 2.4452474117279053 + }, + { + "auxiliary_loss_clip": 0.01121272, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.0248909, + "balance_loss_mlp": 1.04197407, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7461935027983841, + "language_loss": 0.75557071, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7771703, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.79296875, + "step": 5800, + "time_per_iteration": 2.4526796340942383 + }, + { + "auxiliary_loss_clip": 0.01129939, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.02854276, + "balance_loss_mlp": 1.04578733, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.3150001070935127, + "language_loss": 0.67645729, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69820327, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5801, + "time_per_iteration": 2.644601821899414 + }, + { + "auxiliary_loss_clip": 0.01122812, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04446411, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.9000140831486088, + "language_loss": 0.76785576, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78948951, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78515625, + "step": 5802, + "time_per_iteration": 2.46921968460083 + }, + { + "auxiliary_loss_clip": 0.01118956, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.01905692, + "balance_loss_mlp": 1.04294538, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.750768588632487, + "language_loss": 0.78868455, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81021172, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 5803, + "time_per_iteration": 3.979863405227661 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02266085, + "balance_loss_mlp": 1.0410372, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.9657380954946277, + "language_loss": 0.67745399, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69905275, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8203125, + "step": 5804, + "time_per_iteration": 3.8562989234924316 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_clip": 1.03001559, + "balance_loss_mlp": 1.0454638, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 2.669385195944029, + "language_loss": 0.76021814, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78187871, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 5805, + "time_per_iteration": 2.458235263824463 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.02199244, + "balance_loss_mlp": 1.0451802, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 3.0752866237359884, + "language_loss": 0.67804134, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69965458, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5806, + "time_per_iteration": 2.4840877056121826 + }, + { + "auxiliary_loss_clip": 0.01126527, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.02523851, + "balance_loss_mlp": 1.04571056, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.4876164360326454, + "language_loss": 0.71957624, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74124348, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5807, + "time_per_iteration": 2.542815685272217 + }, + { + "auxiliary_loss_clip": 0.01123687, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02295971, + "balance_loss_mlp": 1.04158592, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.7054576034597768, + "language_loss": 0.74218416, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.7638061, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5808, + "time_per_iteration": 2.503438949584961 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 1.04479396, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.5095416937429198, + "language_loss": 0.84245461, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86416149, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5809, + "time_per_iteration": 2.4860358238220215 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.02354026, + "balance_loss_mlp": 1.04322374, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.8434153763939258, + "language_loss": 0.80251479, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82407832, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 5810, + "time_per_iteration": 2.481653928756714 + }, + { + "auxiliary_loss_clip": 0.01124044, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.04406404, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.967526444092296, + "language_loss": 0.75335366, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77499199, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5811, + "time_per_iteration": 2.534524440765381 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02543986, + "balance_loss_mlp": 1.04616523, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4977831051483896, + "language_loss": 0.80070162, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82238293, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5812, + "time_per_iteration": 2.503074884414673 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02746272, + "balance_loss_mlp": 1.04195547, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.9471141693502576, + "language_loss": 0.6923517, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71401167, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5813, + "time_per_iteration": 2.4503591060638428 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.4036318537481334, + "language_loss": 0.77007949, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79169858, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 5814, + "time_per_iteration": 2.4173405170440674 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.02692485, + "balance_loss_mlp": 1.04406822, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5090517849605465, + "language_loss": 0.84283173, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86451852, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5815, + "time_per_iteration": 2.5173141956329346 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.04368711, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.123091285603595, + "language_loss": 0.77423191, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79580915, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80859375, + "step": 5816, + "time_per_iteration": 2.413438558578491 + }, + { + "auxiliary_loss_clip": 0.01128865, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.02761126, + "balance_loss_mlp": 1.0468061, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.144763996717865, + "language_loss": 0.58441401, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60612863, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8203125, + "step": 5817, + "time_per_iteration": 2.5161447525024414 + }, + { + "auxiliary_loss_clip": 0.01042618, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_clip": 1.00957632, + "balance_loss_mlp": 1.01738954, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8658844915790124, + "language_loss": 0.59855008, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61908889, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25195312, + "step": 5818, + "time_per_iteration": 3.105595111846924 + }, + { + "auxiliary_loss_clip": 0.01123632, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02226782, + "balance_loss_mlp": 1.04561055, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 3.0068929936640103, + "language_loss": 0.83458424, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85618806, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5819, + "time_per_iteration": 2.47537899017334 + }, + { + "auxiliary_loss_clip": 0.01123279, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.01655149, + "balance_loss_mlp": 1.04359841, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 3.6330435008795483, + "language_loss": 0.70765841, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.7291975, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5820, + "time_per_iteration": 2.4817428588867188 + }, + { + "auxiliary_loss_clip": 0.01125706, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04544306, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 2.1579309336976547, + "language_loss": 0.70112801, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7227428, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.80078125, + "step": 5821, + "time_per_iteration": 2.578753709793091 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.02328706, + "balance_loss_mlp": 1.04798198, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.9634934958204076, + "language_loss": 0.73591399, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75762403, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 5822, + "time_per_iteration": 2.469041109085083 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.02150989, + "balance_loss_mlp": 1.0447278, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5203539526389718, + "language_loss": 0.78104019, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80268037, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5823, + "time_per_iteration": 2.4932196140289307 + }, + { + "auxiliary_loss_clip": 0.01038228, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 0.99964237, + "balance_loss_mlp": 1.01332808, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 1.4438996436497689, + "language_loss": 0.59237444, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61277008, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.24902344, + "step": 5824, + "time_per_iteration": 3.109966278076172 + }, + { + "auxiliary_loss_clip": 0.01125511, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.0213685, + "balance_loss_mlp": 1.04462993, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.8425293735622459, + "language_loss": 0.84740114, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86902225, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5825, + "time_per_iteration": 2.4780030250549316 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.02292657, + "balance_loss_mlp": 1.04522121, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.24584207136959, + "language_loss": 0.82778502, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84941804, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5826, + "time_per_iteration": 2.4147045612335205 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.02587962, + "balance_loss_mlp": 1.04480314, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5075773428374344, + "language_loss": 0.80714649, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.8288269, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5827, + "time_per_iteration": 2.4650330543518066 + }, + { + "auxiliary_loss_clip": 0.01123347, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.0220902, + "balance_loss_mlp": 1.04475152, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.7582821019631836, + "language_loss": 0.70936024, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73095214, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 5828, + "time_per_iteration": 2.4710564613342285 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.02666616, + "balance_loss_mlp": 1.04788435, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.628373483521701, + "language_loss": 0.79397106, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81571716, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.828125, + "step": 5829, + "time_per_iteration": 2.5081264972686768 + }, + { + "auxiliary_loss_clip": 0.01129997, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.02900994, + "balance_loss_mlp": 1.04607642, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.7135270810407168, + "language_loss": 0.72111332, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74286962, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 5830, + "time_per_iteration": 2.507263422012329 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.04352021, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.0188022258715996, + "language_loss": 0.88740343, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90896189, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5831, + "time_per_iteration": 2.4769816398620605 + }, + { + "auxiliary_loss_clip": 0.01122435, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.02560508, + "balance_loss_mlp": 1.04128802, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9377344606434141, + "language_loss": 0.78478962, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80642164, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8125, + "step": 5832, + "time_per_iteration": 2.458019971847534 + }, + { + "auxiliary_loss_clip": 0.01125835, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.02744734, + "balance_loss_mlp": 1.04360127, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.8976688118149017, + "language_loss": 0.70859557, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73029065, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 5833, + "time_per_iteration": 2.494739055633545 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.04384482, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3472514068868482, + "language_loss": 0.80878949, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83030844, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5834, + "time_per_iteration": 2.521343231201172 + }, + { + "auxiliary_loss_clip": 0.01124914, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.02823853, + "balance_loss_mlp": 1.04525888, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.219662071096021, + "language_loss": 0.83629, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.8579731, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 5835, + "time_per_iteration": 2.53587007522583 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.02932119, + "balance_loss_mlp": 1.04351568, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.120648036265282, + "language_loss": 0.76607329, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78774178, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 5836, + "time_per_iteration": 2.54390549659729 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02382731, + "balance_loss_mlp": 1.04872775, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.2292749531356986, + "language_loss": 0.77354801, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79521459, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5837, + "time_per_iteration": 2.4478273391723633 + }, + { + "auxiliary_loss_clip": 0.01123898, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02998376, + "balance_loss_mlp": 1.04441822, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.6098451794116821, + "language_loss": 0.68129408, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70298064, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5838, + "time_per_iteration": 2.505833864212036 + }, + { + "auxiliary_loss_clip": 0.01122037, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01945019, + "balance_loss_mlp": 1.04240978, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0937603738721173, + "language_loss": 0.83561182, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85717571, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5839, + "time_per_iteration": 2.4378576278686523 + }, + { + "auxiliary_loss_clip": 0.01126069, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.02775335, + "balance_loss_mlp": 1.04351032, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.6277808139419232, + "language_loss": 0.58590645, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60759622, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.828125, + "step": 5840, + "time_per_iteration": 2.4883387088775635 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02015984, + "balance_loss_mlp": 1.04445219, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.7790843018814058, + "language_loss": 0.87061596, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89222413, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5841, + "time_per_iteration": 2.5035836696624756 + }, + { + "auxiliary_loss_clip": 0.01128185, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02551615, + "balance_loss_mlp": 1.0455035, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.6842451001577108, + "language_loss": 0.74924648, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77094764, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.828125, + "step": 5842, + "time_per_iteration": 2.4677891731262207 + }, + { + "auxiliary_loss_clip": 0.01125535, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02552199, + "balance_loss_mlp": 1.04403496, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 3.45436030057014, + "language_loss": 0.68184745, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70351034, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5843, + "time_per_iteration": 2.4356935024261475 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_clip": 1.02734041, + "balance_loss_mlp": 1.04418659, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 3.71115813366519, + "language_loss": 0.65957326, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68123138, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5844, + "time_per_iteration": 2.4961743354797363 + }, + { + "auxiliary_loss_clip": 0.01124887, + "auxiliary_loss_mlp": 0.01040447, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.04466677, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.036064641334285, + "language_loss": 0.75629944, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77795279, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5845, + "time_per_iteration": 5.325402498245239 + }, + { + "auxiliary_loss_clip": 0.01123091, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.04537153, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.494167784966283, + "language_loss": 0.73075795, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75238299, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 5846, + "time_per_iteration": 2.4515323638916016 + }, + { + "auxiliary_loss_clip": 0.01127959, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.02587426, + "balance_loss_mlp": 1.04755926, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6229430725765215, + "language_loss": 0.75876832, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78045619, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5847, + "time_per_iteration": 2.4869656562805176 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02067161, + "balance_loss_mlp": 1.04212832, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.14189752244475, + "language_loss": 0.72070903, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74227905, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5848, + "time_per_iteration": 2.5580503940582275 + }, + { + "auxiliary_loss_clip": 0.01127957, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.02953017, + "balance_loss_mlp": 1.04648554, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.252970750126207, + "language_loss": 0.89321303, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91493851, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5849, + "time_per_iteration": 2.4167070388793945 + }, + { + "auxiliary_loss_clip": 0.01123705, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.04373825, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.8040734708025026, + "language_loss": 0.74810916, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76967371, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5850, + "time_per_iteration": 2.457970142364502 + }, + { + "auxiliary_loss_clip": 0.0112382, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.04618788, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.5003899492593988, + "language_loss": 0.7563765, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77794087, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 5851, + "time_per_iteration": 2.48270845413208 + }, + { + "auxiliary_loss_clip": 0.01126446, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.0219928, + "balance_loss_mlp": 1.04683256, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.074837490144385, + "language_loss": 0.87552518, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89715755, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5852, + "time_per_iteration": 2.4690029621124268 + }, + { + "auxiliary_loss_clip": 0.01122074, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04361391, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.0973347969099048, + "language_loss": 0.67880064, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70038116, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5853, + "time_per_iteration": 2.4953458309173584 + }, + { + "auxiliary_loss_clip": 0.01125495, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.0195092, + "balance_loss_mlp": 1.04545975, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.6680659623481517, + "language_loss": 0.8122859, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83388329, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5854, + "time_per_iteration": 2.4702916145324707 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.04566765, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.6003148952985655, + "language_loss": 0.73131359, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75284624, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 5855, + "time_per_iteration": 2.4895823001861572 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.02549887, + "balance_loss_mlp": 1.04334307, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 3.701560840262617, + "language_loss": 0.70894778, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73054588, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5856, + "time_per_iteration": 2.5133585929870605 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.02881038, + "balance_loss_mlp": 1.0456897, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.8086114170356375, + "language_loss": 0.60915685, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63086259, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80859375, + "step": 5857, + "time_per_iteration": 2.723238468170166 + }, + { + "auxiliary_loss_clip": 0.01123346, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.02434421, + "balance_loss_mlp": 1.04425693, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.754440516271971, + "language_loss": 0.73341751, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75504428, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5858, + "time_per_iteration": 2.509556293487549 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.02720821, + "balance_loss_mlp": 1.04428148, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.4710047028379252, + "language_loss": 0.76090813, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7825768, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5859, + "time_per_iteration": 2.584312677383423 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.0230875, + "balance_loss_mlp": 1.04828274, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.6944630123418771, + "language_loss": 0.71475387, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73646474, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5860, + "time_per_iteration": 2.5120623111724854 + }, + { + "auxiliary_loss_clip": 0.01125655, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.02506578, + "balance_loss_mlp": 1.04208136, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.10777684168558, + "language_loss": 0.6624974, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68416381, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5861, + "time_per_iteration": 2.4927096366882324 + }, + { + "auxiliary_loss_clip": 0.01123555, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.04497313, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.1064993181157843, + "language_loss": 0.66780227, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68938088, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5862, + "time_per_iteration": 2.4275379180908203 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.02132034, + "balance_loss_mlp": 1.04420304, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.0193315360348842, + "language_loss": 0.77049166, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79211187, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5863, + "time_per_iteration": 2.504391670227051 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02027822, + "balance_loss_mlp": 1.04449666, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.7341123556359297, + "language_loss": 0.75018549, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77178371, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5864, + "time_per_iteration": 2.4962751865386963 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.03110838, + "balance_loss_mlp": 1.04376507, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 1.9972182581193567, + "language_loss": 0.79051632, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81220651, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5865, + "time_per_iteration": 2.5369789600372314 + }, + { + "auxiliary_loss_clip": 0.01130515, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.02901387, + "balance_loss_mlp": 1.04835618, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 1.8375125007543296, + "language_loss": 0.81622374, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.8379811, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 5866, + "time_per_iteration": 2.497587203979492 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.02081871, + "balance_loss_mlp": 1.04493296, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.1796505180833696, + "language_loss": 0.84552217, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.867208, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.86328125, + "step": 5867, + "time_per_iteration": 2.5673649311065674 + }, + { + "auxiliary_loss_clip": 0.01126594, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02764452, + "balance_loss_mlp": 1.04441357, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.2018810166756873, + "language_loss": 0.74618357, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76788092, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5868, + "time_per_iteration": 2.4571762084960938 + }, + { + "auxiliary_loss_clip": 0.01127392, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 1.04489541, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.0366485396940615, + "language_loss": 0.61648643, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63815421, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5869, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.04286385, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.290977208251557, + "language_loss": 0.74328029, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76495212, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5870, + "time_per_iteration": 2.4636306762695312 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04412317, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.44010977521146, + "language_loss": 0.71498513, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73659372, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5871, + "time_per_iteration": 2.629002094268799 + }, + { + "auxiliary_loss_clip": 0.01120822, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01999545, + "balance_loss_mlp": 1.04340768, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.215441176085892, + "language_loss": 0.74219513, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76374042, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5872, + "time_per_iteration": 2.4672691822052 + }, + { + "auxiliary_loss_clip": 0.01121667, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02291727, + "balance_loss_mlp": 1.04295182, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.6120105579455812, + "language_loss": 0.82492435, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84651101, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5873, + "time_per_iteration": 2.549706220626831 + }, + { + "auxiliary_loss_clip": 0.01128213, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.02800715, + "balance_loss_mlp": 1.04399252, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.8495868157058504, + "language_loss": 0.6583339, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68006265, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 5874, + "time_per_iteration": 2.4949634075164795 + }, + { + "auxiliary_loss_clip": 0.01044147, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00510025, + "balance_loss_mlp": 1.01915693, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.771003921858337, + "language_loss": 0.61583531, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63634658, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.25, + "step": 5875, + "time_per_iteration": 2.9931485652923584 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.02995443, + "balance_loss_mlp": 1.04544568, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.6836782364007539, + "language_loss": 0.800933, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82261944, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5876, + "time_per_iteration": 2.443178415298462 + }, + { + "auxiliary_loss_clip": 0.01041911, + "auxiliary_loss_mlp": 0.01006634, + "balance_loss_clip": 1.00477409, + "balance_loss_mlp": 1.01663578, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 1.6287450036197537, + "language_loss": 0.5674026, + "learning_rate": 2.999887569990088e-06, + "loss": 0.587888, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.25195312, + "step": 5877, + "time_per_iteration": 3.1782116889953613 + }, + { + "auxiliary_loss_clip": 0.01124291, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.04401922, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.5579095187110108, + "language_loss": 0.71649593, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73804337, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5878, + "time_per_iteration": 2.4984474182128906 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04198527, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9384917614544617, + "language_loss": 0.78492844, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80655217, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5879, + "time_per_iteration": 2.5369913578033447 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.02870536, + "balance_loss_mlp": 1.04373121, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0656781659104917, + "language_loss": 0.63695049, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65867293, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83203125, + "step": 5880, + "time_per_iteration": 2.457787036895752 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02078009, + "balance_loss_mlp": 1.04375386, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 3.125568384757795, + "language_loss": 0.65818816, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67980647, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5881, + "time_per_iteration": 2.5198867321014404 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04197288, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3882047203281038, + "language_loss": 0.75280428, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77431458, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5882, + "time_per_iteration": 2.4526872634887695 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.02266037, + "balance_loss_mlp": 1.04543018, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.123888211837838, + "language_loss": 0.70349854, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72514224, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5883, + "time_per_iteration": 2.538865566253662 + }, + { + "auxiliary_loss_clip": 0.01129519, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0228982, + "balance_loss_mlp": 1.04584253, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.009195754637657, + "language_loss": 0.78500903, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80668598, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5884, + "time_per_iteration": 2.4410510063171387 + }, + { + "auxiliary_loss_clip": 0.0112138, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.01901007, + "balance_loss_mlp": 1.04336667, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.8922441591552446, + "language_loss": 0.75478536, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77632499, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5885, + "time_per_iteration": 2.555816650390625 + }, + { + "auxiliary_loss_clip": 0.01127447, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.04478371, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.2081606315958635, + "language_loss": 0.82679224, + "learning_rate": 2.996850368809606e-06, + "loss": 0.84848893, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.82421875, + "step": 5886, + "time_per_iteration": 2.482151985168457 + }, + { + "auxiliary_loss_clip": 0.01124743, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01717782, + "balance_loss_mlp": 1.04533887, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.4580910750403775, + "language_loss": 0.78723359, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80880398, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 5887, + "time_per_iteration": 5.388309001922607 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.0269978, + "balance_loss_mlp": 1.04226518, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 3.1093010737907867, + "language_loss": 0.65404654, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67568314, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5888, + "time_per_iteration": 2.4438626766204834 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02248812, + "balance_loss_mlp": 1.04373193, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.6702882106954304, + "language_loss": 0.76662588, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.78821993, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5889, + "time_per_iteration": 2.503023624420166 + }, + { + "auxiliary_loss_clip": 0.01125083, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.02329397, + "balance_loss_mlp": 1.0469135, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.7418080185903937, + "language_loss": 0.80142188, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82305038, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5890, + "time_per_iteration": 2.4669902324676514 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02204418, + "balance_loss_mlp": 1.04123974, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.4765808553545194, + "language_loss": 0.79590207, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81743479, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7734375, + "step": 5891, + "time_per_iteration": 2.491048812866211 + }, + { + "auxiliary_loss_clip": 0.01123501, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.02260685, + "balance_loss_mlp": 1.04425383, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.0747162768055616, + "language_loss": 0.73339593, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.7550028, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5892, + "time_per_iteration": 2.497422695159912 + }, + { + "auxiliary_loss_clip": 0.01124613, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.04473233, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 1.9338165898472526, + "language_loss": 0.66916019, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69079423, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5893, + "time_per_iteration": 2.4516420364379883 + }, + { + "auxiliary_loss_clip": 0.01123079, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04405212, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.878049090913109, + "language_loss": 0.69472313, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71633029, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5894, + "time_per_iteration": 2.479174852371216 + }, + { + "auxiliary_loss_clip": 0.01123499, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.04524636, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6954645527360779, + "language_loss": 0.74891931, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77048504, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 5895, + "time_per_iteration": 2.4786908626556396 + }, + { + "auxiliary_loss_clip": 0.01122907, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.04388869, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.0548310630504854, + "language_loss": 0.83688253, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85848963, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5896, + "time_per_iteration": 2.4765214920043945 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.03348279, + "balance_loss_mlp": 1.0444181, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.6634726813042469, + "language_loss": 0.70031154, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7220217, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 5897, + "time_per_iteration": 2.5142548084259033 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.02430916, + "balance_loss_mlp": 1.04337025, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.7331024671064506, + "language_loss": 0.82091749, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84251857, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5898, + "time_per_iteration": 2.4900712966918945 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0234853, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4876974136883365, + "language_loss": 0.73901182, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76058269, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 5899, + "time_per_iteration": 2.498659133911133 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_clip": 1.02825308, + "balance_loss_mlp": 1.04316258, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.69682390123668, + "language_loss": 0.79345262, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81510079, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 5900, + "time_per_iteration": 2.548612594604492 + }, + { + "auxiliary_loss_clip": 0.01123598, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.02556252, + "balance_loss_mlp": 1.04530048, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.7758743329418227, + "language_loss": 0.81637204, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83801091, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 5901, + "time_per_iteration": 2.6031999588012695 + }, + { + "auxiliary_loss_clip": 0.01127681, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02063251, + "balance_loss_mlp": 1.04535294, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.92677562296577, + "language_loss": 0.75667071, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77829683, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5902, + "time_per_iteration": 2.528026819229126 + }, + { + "auxiliary_loss_clip": 0.0112195, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02359962, + "balance_loss_mlp": 1.04320014, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.7304108811682997, + "language_loss": 0.70582771, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72741467, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7890625, + "step": 5903, + "time_per_iteration": 2.423454999923706 + }, + { + "auxiliary_loss_clip": 0.01124223, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.0205555, + "balance_loss_mlp": 1.04234982, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.52210089781831, + "language_loss": 0.74574983, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76734024, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5904, + "time_per_iteration": 2.462024688720703 + }, + { + "auxiliary_loss_clip": 0.0112423, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.04362941, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.0389703534000443, + "language_loss": 0.78855121, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81020248, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8046875, + "step": 5905, + "time_per_iteration": 2.418665885925293 + }, + { + "auxiliary_loss_clip": 0.0111773, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.0190388, + "balance_loss_mlp": 1.04383469, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 2.1398902938273547, + "language_loss": 0.72515827, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74664938, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 5906, + "time_per_iteration": 2.441795825958252 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.02187109, + "balance_loss_mlp": 1.04545534, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.0230910533888107, + "language_loss": 0.74762344, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.7692821, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5907, + "time_per_iteration": 2.4404122829437256 + }, + { + "auxiliary_loss_clip": 0.01123497, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01939988, + "balance_loss_mlp": 1.04492426, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7742327577799557, + "language_loss": 0.75751841, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77909136, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5908, + "time_per_iteration": 2.5631895065307617 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01961696, + "balance_loss_mlp": 1.04734707, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.7057235578436956, + "language_loss": 0.68026733, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70187092, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5909, + "time_per_iteration": 2.480511426925659 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.01937413, + "balance_loss_mlp": 1.04523396, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 3.5777269988287297, + "language_loss": 0.78628188, + "learning_rate": 2.988736221969144e-06, + "loss": 0.8078106, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 5910, + "time_per_iteration": 2.4763131141662598 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.02545595, + "balance_loss_mlp": 1.04625309, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.525011794663279, + "language_loss": 0.70639479, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72808856, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5911, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01119575, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.04294884, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 1.9668748220600272, + "language_loss": 0.87014282, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89169508, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 5912, + "time_per_iteration": 2.461251735687256 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.018556, + "balance_loss_mlp": 1.04507196, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.7619620740638822, + "language_loss": 0.7701745, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79172838, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5913, + "time_per_iteration": 2.4517738819122314 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02001143, + "balance_loss_mlp": 1.04793298, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3300117090522248, + "language_loss": 0.82507938, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84666395, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 5914, + "time_per_iteration": 2.4964141845703125 + }, + { + "auxiliary_loss_clip": 0.01124534, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.01771307, + "balance_loss_mlp": 1.04573739, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.1657623831524604, + "language_loss": 0.70703268, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72859794, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 5915, + "time_per_iteration": 2.5425658226013184 + }, + { + "auxiliary_loss_clip": 0.01120767, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.0249182, + "balance_loss_mlp": 1.04248476, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.7489130528457595, + "language_loss": 0.76365829, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78525031, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 5916, + "time_per_iteration": 2.49629545211792 + }, + { + "auxiliary_loss_clip": 0.01128234, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02360404, + "balance_loss_mlp": 1.04853928, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.96232440030472, + "language_loss": 0.88380635, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90545923, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.796875, + "step": 5917, + "time_per_iteration": 2.4549498558044434 + }, + { + "auxiliary_loss_clip": 0.01123557, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01144493, + "balance_loss_mlp": 1.04562521, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.0473051476373048, + "language_loss": 0.74389327, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76538098, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5918, + "time_per_iteration": 2.448164701461792 + }, + { + "auxiliary_loss_clip": 0.01039303, + "auxiliary_loss_mlp": 0.01015071, + "balance_loss_clip": 1.01344931, + "balance_loss_mlp": 1.01430607, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 1.0267040132589962, + "language_loss": 0.63732457, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65786839, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.25, + "step": 5919, + "time_per_iteration": 2.837815999984741 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01805615, + "balance_loss_mlp": 1.04376245, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.8747663216478503, + "language_loss": 0.73868048, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76025695, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5920, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02015972, + "balance_loss_mlp": 1.04353166, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.659561193633535, + "language_loss": 0.77124226, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79279101, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5921, + "time_per_iteration": 2.461014986038208 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.01968277, + "balance_loss_mlp": 1.04409981, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 3.1644779785561563, + "language_loss": 0.67710596, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69866371, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5922, + "time_per_iteration": 2.495504140853882 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01567268, + "balance_loss_mlp": 1.04373431, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 1.9745978513449503, + "language_loss": 0.79269004, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81421471, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5923, + "time_per_iteration": 2.4515416622161865 + }, + { + "auxiliary_loss_clip": 0.01124058, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02631271, + "balance_loss_mlp": 1.04502511, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7698063934253627, + "language_loss": 0.85475516, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87638795, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7890625, + "step": 5924, + "time_per_iteration": 2.4790685176849365 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.02344394, + "balance_loss_mlp": 1.04368067, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 1.844353158814239, + "language_loss": 0.77513188, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79672253, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 5925, + "time_per_iteration": 2.5064613819122314 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.0301789, + "balance_loss_mlp": 1.04067063, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.7016119178915972, + "language_loss": 0.75874609, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78037679, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5926, + "time_per_iteration": 2.451852798461914 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02194762, + "balance_loss_mlp": 1.04408717, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0486133546267737, + "language_loss": 0.69321811, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71483439, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5927, + "time_per_iteration": 2.4770915508270264 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.0237366, + "balance_loss_mlp": 1.0428226, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.8762651107969224, + "language_loss": 0.79633021, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81789798, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 5928, + "time_per_iteration": 4.019433259963989 + }, + { + "auxiliary_loss_clip": 0.01120965, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.02581263, + "balance_loss_mlp": 1.04338682, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4128421638180557, + "language_loss": 0.81568098, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83728826, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5929, + "time_per_iteration": 3.869184970855713 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02172661, + "balance_loss_mlp": 1.0402571, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7650523310611956, + "language_loss": 0.69981778, + "learning_rate": 2.981957928520201e-06, + "loss": 0.7213279, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5930, + "time_per_iteration": 2.418992519378662 + }, + { + "auxiliary_loss_clip": 0.01123929, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.02858853, + "balance_loss_mlp": 1.04340863, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 1.9164187115059894, + "language_loss": 0.67766178, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69933271, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5931, + "time_per_iteration": 2.4688074588775635 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.0203712, + "balance_loss_mlp": 1.04403675, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.736290109138699, + "language_loss": 0.67451715, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69607264, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5932, + "time_per_iteration": 2.4908299446105957 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.04304647, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.602576254435761, + "language_loss": 0.7878592, + "learning_rate": 2.980939897348969e-06, + "loss": 0.8093667, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 5933, + "time_per_iteration": 2.442464590072632 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.02893806, + "balance_loss_mlp": 1.04176354, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4946029259135472, + "language_loss": 0.69271672, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71436697, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5934, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02636409, + "balance_loss_mlp": 1.04726946, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.881720756405168, + "language_loss": 0.71268845, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73441839, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5935, + "time_per_iteration": 2.460548162460327 + }, + { + "auxiliary_loss_clip": 0.01124043, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.02476776, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.474293421119334, + "language_loss": 0.78293073, + "learning_rate": 2.979921531401692e-06, + "loss": 0.8045634, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5936, + "time_per_iteration": 2.4517645835876465 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.02472031, + "balance_loss_mlp": 1.04367638, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4518862241402966, + "language_loss": 0.64218014, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66379213, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 5937, + "time_per_iteration": 2.5837321281433105 + }, + { + "auxiliary_loss_clip": 0.01124449, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.04442978, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5143509931773553, + "language_loss": 0.77877963, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80041015, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5938, + "time_per_iteration": 2.4190945625305176 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02890944, + "balance_loss_mlp": 1.04582071, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.8770011073758637, + "language_loss": 0.80256367, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82424247, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5939, + "time_per_iteration": 2.5029094219207764 + }, + { + "auxiliary_loss_clip": 0.01126611, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.0213412, + "balance_loss_mlp": 1.04299128, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.6875415435298406, + "language_loss": 0.79203522, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81365997, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5940, + "time_per_iteration": 2.526545524597168 + }, + { + "auxiliary_loss_clip": 0.01124522, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.01704049, + "balance_loss_mlp": 1.0441246, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.480743427796476, + "language_loss": 0.72739166, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74895537, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5941, + "time_per_iteration": 2.4599413871765137 + }, + { + "auxiliary_loss_clip": 0.01123947, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.02546012, + "balance_loss_mlp": 1.04480743, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.979069530543237, + "language_loss": 0.64202702, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66367018, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 5942, + "time_per_iteration": 2.5174636840820312 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.04385567, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.2469009256176053, + "language_loss": 0.74055374, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76215225, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5943, + "time_per_iteration": 2.5392913818359375 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01008275, + "balance_loss_clip": 1.00640345, + "balance_loss_mlp": 1.01455188, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7872915284740177, + "language_loss": 0.60689372, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62737316, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25, + "step": 5944, + "time_per_iteration": 3.17051100730896 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02003157, + "balance_loss_mlp": 1.04313469, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.033108996495456, + "language_loss": 0.72646821, + "learning_rate": 2.976864428379655e-06, + "loss": 0.7480244, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5945, + "time_per_iteration": 2.444373846054077 + }, + { + "auxiliary_loss_clip": 0.01121962, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04313612, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7423109631574678, + "language_loss": 0.81255424, + "learning_rate": 2.976524564880326e-06, + "loss": 0.8341651, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 5946, + "time_per_iteration": 2.470513343811035 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.02808666, + "balance_loss_mlp": 1.04524601, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.9099881709146462, + "language_loss": 0.68893784, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71061212, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5947, + "time_per_iteration": 2.4653477668762207 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04441905, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.655085874443405, + "language_loss": 0.75428057, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77588153, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 5948, + "time_per_iteration": 2.4385483264923096 + }, + { + "auxiliary_loss_clip": 0.01119692, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.02650094, + "balance_loss_mlp": 1.04049134, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.354345427402619, + "language_loss": 0.70556438, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72717237, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5949, + "time_per_iteration": 2.4992663860321045 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02567744, + "balance_loss_mlp": 1.04348552, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.8941983472442732, + "language_loss": 0.77248389, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79408723, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 5950, + "time_per_iteration": 2.4295101165771484 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.02394795, + "balance_loss_mlp": 1.04274225, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5707876816938207, + "language_loss": 0.72766685, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74928057, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5951, + "time_per_iteration": 2.444349765777588 + }, + { + "auxiliary_loss_clip": 0.0112562, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.04390478, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 1.9955959935597258, + "language_loss": 0.69730532, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71895468, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5952, + "time_per_iteration": 2.49656081199646 + }, + { + "auxiliary_loss_clip": 0.01120518, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.02497923, + "balance_loss_mlp": 1.04271066, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.0583657570083416, + "language_loss": 0.69432503, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71591723, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5953, + "time_per_iteration": 2.6221721172332764 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01641417, + "balance_loss_mlp": 1.04322994, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.5429391611916807, + "language_loss": 0.66673422, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68824828, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 5954, + "time_per_iteration": 2.465116262435913 + }, + { + "auxiliary_loss_clip": 0.01123263, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02383566, + "balance_loss_mlp": 1.04475307, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.7040470297828096, + "language_loss": 0.74838006, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76998997, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 5955, + "time_per_iteration": 2.4968783855438232 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.0219382, + "balance_loss_mlp": 1.04289603, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.6820855707774873, + "language_loss": 0.76043999, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78197372, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 5956, + "time_per_iteration": 2.498699903488159 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.0214982, + "balance_loss_mlp": 1.04263568, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.7390523407913014, + "language_loss": 0.73059452, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75211895, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 5957, + "time_per_iteration": 2.4503817558288574 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.02197433, + "balance_loss_mlp": 1.04503369, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.990259024529503, + "language_loss": 0.70640051, + "learning_rate": 2.972443318242726e-06, + "loss": 0.7279774, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5958, + "time_per_iteration": 2.4611945152282715 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.0165484, + "balance_loss_mlp": 1.0413444, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.7206269565580243, + "language_loss": 0.88610697, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90757084, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 5959, + "time_per_iteration": 2.5129401683807373 + }, + { + "auxiliary_loss_clip": 0.01121057, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.02281785, + "balance_loss_mlp": 1.04400599, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.483187088646708, + "language_loss": 0.58103061, + "learning_rate": 2.971762593615679e-06, + "loss": 0.6026091, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 5960, + "time_per_iteration": 2.5110409259796143 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02201176, + "balance_loss_mlp": 1.04267251, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 1.9323395592862886, + "language_loss": 0.76102602, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78260595, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 5961, + "time_per_iteration": 2.46943736076355 + }, + { + "auxiliary_loss_clip": 0.01121367, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.01862621, + "balance_loss_mlp": 1.04458857, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8327349140058107, + "language_loss": 0.69974017, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72128505, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 5962, + "time_per_iteration": 2.5654361248016357 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0210433, + "balance_loss_mlp": 1.04321802, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5613001239774846, + "language_loss": 0.74749398, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76901346, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.75390625, + "step": 5963, + "time_per_iteration": 2.5135319232940674 + }, + { + "auxiliary_loss_clip": 0.01122172, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02149796, + "balance_loss_mlp": 1.04597044, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5825069258384938, + "language_loss": 0.78811383, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80968547, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 5964, + "time_per_iteration": 2.493169069290161 + }, + { + "auxiliary_loss_clip": 0.01124119, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.01870322, + "balance_loss_mlp": 1.04482806, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.8296471859577264, + "language_loss": 0.66694742, + "learning_rate": 2.970060137410626e-06, + "loss": 0.6885215, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5965, + "time_per_iteration": 2.4995884895324707 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.04270399, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 4.210402322068537, + "language_loss": 0.79008359, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81161171, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5966, + "time_per_iteration": 2.485438346862793 + }, + { + "auxiliary_loss_clip": 0.01121545, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02238536, + "balance_loss_mlp": 1.04341781, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 5.107721360348662, + "language_loss": 0.90911728, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93070352, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 5967, + "time_per_iteration": 2.547287702560425 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02648592, + "balance_loss_mlp": 1.04528475, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.7620117516801617, + "language_loss": 0.79739827, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.81907177, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 5968, + "time_per_iteration": 2.4543471336364746 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01052904, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.04604244, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.0044885906540424, + "language_loss": 0.83642054, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85822409, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5969, + "time_per_iteration": 2.502815008163452 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04245603, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.876228198696561, + "language_loss": 0.72377515, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74528718, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5970, + "time_per_iteration": 4.051819086074829 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.02178049, + "balance_loss_mlp": 1.0424037, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6844020581036279, + "language_loss": 0.79522693, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81676805, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5971, + "time_per_iteration": 3.8910434246063232 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.02427924, + "balance_loss_mlp": 1.0402174, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.924864359347905, + "language_loss": 0.78594625, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80753887, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5972, + "time_per_iteration": 2.4272611141204834 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.02378309, + "balance_loss_mlp": 1.04185021, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 3.2741380987368327, + "language_loss": 0.81252539, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83410573, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5973, + "time_per_iteration": 2.469438314437866 + }, + { + "auxiliary_loss_clip": 0.0103695, + "auxiliary_loss_mlp": 0.01001955, + "balance_loss_clip": 0.9999882, + "balance_loss_mlp": 1.01160312, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9181567019376142, + "language_loss": 0.56828684, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58867586, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.25390625, + "step": 5974, + "time_per_iteration": 2.918166399002075 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02548242, + "balance_loss_mlp": 1.04407859, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6252506462115286, + "language_loss": 0.68750453, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7091189, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78515625, + "step": 5975, + "time_per_iteration": 2.4578702449798584 + }, + { + "auxiliary_loss_clip": 0.01119888, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.02132642, + "balance_loss_mlp": 1.04269934, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.7542310571392548, + "language_loss": 0.79961413, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82115752, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 5976, + "time_per_iteration": 2.494723081588745 + }, + { + "auxiliary_loss_clip": 0.01119534, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.04172039, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.7409485188517788, + "language_loss": 0.79081398, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81242788, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 5977, + "time_per_iteration": 2.4949100017547607 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02151847, + "balance_loss_mlp": 1.04029524, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7920092294573908, + "language_loss": 0.80654621, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82805401, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 5978, + "time_per_iteration": 2.445866584777832 + }, + { + "auxiliary_loss_clip": 0.01122409, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.04394007, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5382295990908517, + "language_loss": 0.67741489, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69898772, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5979, + "time_per_iteration": 2.538585662841797 + }, + { + "auxiliary_loss_clip": 0.01119324, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.02478838, + "balance_loss_mlp": 1.04136634, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.3207911240165697, + "language_loss": 0.67176729, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69334549, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5980, + "time_per_iteration": 2.4896938800811768 + }, + { + "auxiliary_loss_clip": 0.01123377, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.02377748, + "balance_loss_mlp": 1.0416832, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8107777091561479, + "language_loss": 0.71148199, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73310816, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.81640625, + "step": 5981, + "time_per_iteration": 2.49064302444458 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.02387476, + "balance_loss_mlp": 1.0432086, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.7933500913622242, + "language_loss": 0.71331298, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73492229, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5982, + "time_per_iteration": 2.5167934894561768 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02361536, + "balance_loss_mlp": 1.0427959, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6761533335073455, + "language_loss": 0.75808942, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77962971, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 5983, + "time_per_iteration": 2.4915101528167725 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.02771819, + "balance_loss_mlp": 1.04474413, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.1804669018597043, + "language_loss": 0.76302433, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78472364, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5984, + "time_per_iteration": 2.436640501022339 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.02356207, + "balance_loss_mlp": 1.0420785, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.837904559260202, + "language_loss": 0.86617446, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88773406, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 5985, + "time_per_iteration": 2.476853609085083 + }, + { + "auxiliary_loss_clip": 0.0111809, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02130079, + "balance_loss_mlp": 1.04078126, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.416236209566339, + "language_loss": 0.72801065, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74955392, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 5986, + "time_per_iteration": 2.443871021270752 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02354908, + "balance_loss_mlp": 1.04230642, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.880079313238184, + "language_loss": 0.73711401, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75873649, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5987, + "time_per_iteration": 2.517045736312866 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.04161966, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 1.8583263097896845, + "language_loss": 0.69824201, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71982217, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5988, + "time_per_iteration": 2.484654426574707 + }, + { + "auxiliary_loss_clip": 0.01125207, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.02675915, + "balance_loss_mlp": 1.04297233, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.851186734533378, + "language_loss": 0.72918314, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75084746, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5989, + "time_per_iteration": 2.464378833770752 + }, + { + "auxiliary_loss_clip": 0.01120868, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.0194943, + "balance_loss_mlp": 1.04283333, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.8425061302669492, + "language_loss": 0.79664916, + "learning_rate": 2.961534094403931e-06, + "loss": 0.81818593, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.78125, + "step": 5990, + "time_per_iteration": 2.4947755336761475 + }, + { + "auxiliary_loss_clip": 0.01121243, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.04281235, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.9352260247419832, + "language_loss": 0.84225297, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86375415, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 5991, + "time_per_iteration": 2.4728991985321045 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.02490079, + "balance_loss_mlp": 1.04197788, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9640325518662143, + "language_loss": 0.75616056, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77778924, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.81640625, + "step": 5992, + "time_per_iteration": 2.4422738552093506 + }, + { + "auxiliary_loss_clip": 0.01119253, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02022302, + "balance_loss_mlp": 1.04177451, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 6.32582004359923, + "language_loss": 0.77500135, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79654288, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 5993, + "time_per_iteration": 2.4513776302337646 + }, + { + "auxiliary_loss_clip": 0.01124951, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.02281737, + "balance_loss_mlp": 1.04405534, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.9096274983436938, + "language_loss": 0.74686468, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7684797, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80859375, + "step": 5994, + "time_per_iteration": 2.4278860092163086 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.02506554, + "balance_loss_mlp": 1.04320991, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8397117218597796, + "language_loss": 0.68890274, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71053243, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5995, + "time_per_iteration": 2.462557554244995 + }, + { + "auxiliary_loss_clip": 0.01124519, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.02548289, + "balance_loss_mlp": 1.04238582, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7352965040741237, + "language_loss": 0.82057822, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84222531, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8203125, + "step": 5996, + "time_per_iteration": 2.4284703731536865 + }, + { + "auxiliary_loss_clip": 0.01119849, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.04242694, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.56212250683249, + "language_loss": 0.73570979, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75725353, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5997, + "time_per_iteration": 2.4418485164642334 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.04307055, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.1655767572067637, + "language_loss": 0.68651283, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.70807832, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5998, + "time_per_iteration": 2.435884475708008 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.01983321, + "balance_loss_mlp": 1.04494119, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.6750874406601914, + "language_loss": 0.77190387, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79348445, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5999, + "time_per_iteration": 2.415649175643921 + }, + { + "auxiliary_loss_clip": 0.01123679, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.02381015, + "balance_loss_mlp": 1.04481769, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 2.719833162653021, + "language_loss": 0.78307509, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80469108, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 6000, + "time_per_iteration": 2.450085401535034 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.02061474, + "balance_loss_mlp": 1.04283905, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6917067376727954, + "language_loss": 0.78621352, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80777717, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6001, + "time_per_iteration": 2.4247405529022217 + }, + { + "auxiliary_loss_clip": 0.01119251, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.01761651, + "balance_loss_mlp": 1.04341698, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.9017223481518102, + "language_loss": 0.83743405, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85893983, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6002, + "time_per_iteration": 2.4587790966033936 + }, + { + "auxiliary_loss_clip": 0.01117677, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.01753855, + "balance_loss_mlp": 1.04298413, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.101850625944426, + "language_loss": 0.90627617, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92775667, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6003, + "time_per_iteration": 2.450408697128296 + }, + { + "auxiliary_loss_clip": 0.01040628, + "auxiliary_loss_mlp": 0.01013073, + "balance_loss_clip": 1.01102221, + "balance_loss_mlp": 1.01496768, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8843653445723816, + "language_loss": 0.53374904, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55428606, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.25585938, + "step": 6004, + "time_per_iteration": 3.005659341812134 + }, + { + "auxiliary_loss_clip": 0.01121195, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02152824, + "balance_loss_mlp": 1.04164577, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7248099575523852, + "language_loss": 0.77609527, + "learning_rate": 2.956407517225883e-06, + "loss": 0.7976777, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.796875, + "step": 6005, + "time_per_iteration": 2.4916067123413086 + }, + { + "auxiliary_loss_clip": 0.01124405, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.04700613, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.24467290311728, + "language_loss": 0.79267776, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81428248, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 6006, + "time_per_iteration": 2.4366166591644287 + }, + { + "auxiliary_loss_clip": 0.01124848, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02309775, + "balance_loss_mlp": 1.04587984, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.7888636143213261, + "language_loss": 0.84360719, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86524487, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.7890625, + "step": 6007, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.04622328, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.0771979180574425, + "language_loss": 0.72564125, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74731576, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 6008, + "time_per_iteration": 2.4473018646240234 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02066362, + "balance_loss_mlp": 1.04255283, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.9836274680059969, + "language_loss": 0.8284781, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85002339, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 6009, + "time_per_iteration": 2.470031261444092 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.02945232, + "balance_loss_mlp": 1.04598057, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.714442270200285, + "language_loss": 0.76139152, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78308332, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6010, + "time_per_iteration": 2.446833848953247 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0236752, + "balance_loss_mlp": 1.04619896, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.905716478313633, + "language_loss": 0.82946253, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85107422, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6011, + "time_per_iteration": 2.508147716522217 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.0264287, + "balance_loss_mlp": 1.0491302, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8484903271380355, + "language_loss": 0.62762833, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64936543, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 6012, + "time_per_iteration": 5.36588454246521 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02112007, + "balance_loss_mlp": 1.04337454, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.8689670235824563, + "language_loss": 0.84111822, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86265635, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6013, + "time_per_iteration": 2.494051933288574 + }, + { + "auxiliary_loss_clip": 0.01124804, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02690601, + "balance_loss_mlp": 1.04570448, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.7351999387675028, + "language_loss": 0.91496456, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93662584, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6014, + "time_per_iteration": 2.4356749057769775 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_clip": 1.03078914, + "balance_loss_mlp": 1.04549718, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.727703603585928, + "language_loss": 0.73830914, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75999045, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6015, + "time_per_iteration": 2.4990644454956055 + }, + { + "auxiliary_loss_clip": 0.01125644, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02394915, + "balance_loss_mlp": 1.04633307, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7277224025907603, + "language_loss": 0.65316677, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67480516, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6016, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.01126131, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02522171, + "balance_loss_mlp": 1.04727304, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.744160138264151, + "language_loss": 0.72101283, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74268931, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6017, + "time_per_iteration": 2.638683795928955 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.02413559, + "balance_loss_mlp": 1.04454577, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.9120538903838002, + "language_loss": 0.73590356, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75755334, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 6018, + "time_per_iteration": 2.4477858543395996 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02013361, + "balance_loss_mlp": 1.04458487, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.754547200149591, + "language_loss": 0.69080901, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71234632, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6019, + "time_per_iteration": 2.519831657409668 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.01980555, + "balance_loss_mlp": 1.0443728, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.5467952079219929, + "language_loss": 0.76299942, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78459549, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6020, + "time_per_iteration": 2.4692177772521973 + }, + { + "auxiliary_loss_clip": 0.01125932, + "auxiliary_loss_mlp": 0.01043324, + "balance_loss_clip": 1.02814841, + "balance_loss_mlp": 1.04721653, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.938447153390643, + "language_loss": 0.73921824, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76091087, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6021, + "time_per_iteration": 2.5069808959960938 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.02059376, + "balance_loss_mlp": 1.04596186, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.8648032073369731, + "language_loss": 0.80978441, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83135605, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 6022, + "time_per_iteration": 2.4620115756988525 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.02212477, + "balance_loss_mlp": 1.04778302, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6799220656127192, + "language_loss": 0.81351119, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83508855, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6023, + "time_per_iteration": 2.4969308376312256 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01975274, + "balance_loss_mlp": 1.04494548, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8793265875700644, + "language_loss": 0.79767907, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81927156, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6024, + "time_per_iteration": 2.468369245529175 + }, + { + "auxiliary_loss_clip": 0.01119855, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.01629043, + "balance_loss_mlp": 1.04456711, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.7897574616215441, + "language_loss": 0.74720407, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7687006, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6025, + "time_per_iteration": 2.4410412311553955 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01849484, + "balance_loss_mlp": 1.04340899, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.5522426900619628, + "language_loss": 0.72055018, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74207234, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6026, + "time_per_iteration": 2.4997596740722656 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.02645707, + "balance_loss_mlp": 1.04604256, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.401846993246305, + "language_loss": 0.79332775, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81502712, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 6027, + "time_per_iteration": 2.5326383113861084 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.02576041, + "balance_loss_mlp": 1.04399586, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7548337209278033, + "language_loss": 0.67809385, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69973445, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6028, + "time_per_iteration": 2.548088788986206 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.01922584, + "balance_loss_mlp": 1.04415894, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.63067637662311, + "language_loss": 0.85700679, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.8785423, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 6029, + "time_per_iteration": 2.429720878601074 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.02412939, + "balance_loss_mlp": 1.04442835, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.6511023563359555, + "language_loss": 0.72693753, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74851942, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6030, + "time_per_iteration": 2.4299302101135254 + }, + { + "auxiliary_loss_clip": 0.01123199, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.04264557, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.02536170930057, + "language_loss": 0.73986644, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76151514, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8046875, + "step": 6031, + "time_per_iteration": 2.4376232624053955 + }, + { + "auxiliary_loss_clip": 0.01120355, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0177424, + "balance_loss_mlp": 1.04309845, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.8475328889194098, + "language_loss": 0.73286617, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75438625, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6032, + "time_per_iteration": 2.4811155796051025 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.02412748, + "balance_loss_mlp": 1.0427382, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.684246043345259, + "language_loss": 0.77953577, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80113035, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 6033, + "time_per_iteration": 2.4283456802368164 + }, + { + "auxiliary_loss_clip": 0.01040416, + "auxiliary_loss_mlp": 0.01019079, + "balance_loss_clip": 1.01733828, + "balance_loss_mlp": 1.01487339, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.786107382559835, + "language_loss": 0.64822888, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66882384, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25585938, + "step": 6034, + "time_per_iteration": 3.1253511905670166 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.02139246, + "balance_loss_mlp": 1.04131126, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.4985312456135769, + "language_loss": 0.90059769, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92213392, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6035, + "time_per_iteration": 2.4888923168182373 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.04239392, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7493433732375512, + "language_loss": 0.73526931, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7568388, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6036, + "time_per_iteration": 2.445058822631836 + }, + { + "auxiliary_loss_clip": 0.01124436, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01500189, + "balance_loss_mlp": 1.04274487, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.901551926176817, + "language_loss": 0.75938255, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78091925, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.81640625, + "step": 6037, + "time_per_iteration": 2.422229766845703 + }, + { + "auxiliary_loss_clip": 0.0111661, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0238812, + "balance_loss_mlp": 1.04227912, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6899683541385933, + "language_loss": 0.78120697, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80275297, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6038, + "time_per_iteration": 2.4582855701446533 + }, + { + "auxiliary_loss_clip": 0.0103994, + "auxiliary_loss_mlp": 0.01006466, + "balance_loss_clip": 1.00467765, + "balance_loss_mlp": 1.01452303, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8286249809211084, + "language_loss": 0.63413143, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65459549, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.25390625, + "step": 6039, + "time_per_iteration": 3.1417860984802246 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04391789, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 1.9215128015710738, + "language_loss": 0.70857447, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73013067, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6040, + "time_per_iteration": 2.505627155303955 + }, + { + "auxiliary_loss_clip": 0.0112497, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.01827383, + "balance_loss_mlp": 1.04445744, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.327350689124367, + "language_loss": 0.81322253, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83479762, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6041, + "time_per_iteration": 2.4475231170654297 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.01526928, + "balance_loss_mlp": 1.04150891, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.252727008735842, + "language_loss": 0.83721769, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85872102, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6042, + "time_per_iteration": 2.461111545562744 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.02583623, + "balance_loss_mlp": 1.04390788, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6644116234057968, + "language_loss": 0.78122932, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80283511, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6043, + "time_per_iteration": 2.477030038833618 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.02017403, + "balance_loss_mlp": 1.04266226, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 3.8032713581650515, + "language_loss": 0.65792918, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67945337, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.74609375, + "step": 6044, + "time_per_iteration": 2.471221446990967 + }, + { + "auxiliary_loss_clip": 0.01118191, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01706135, + "balance_loss_mlp": 1.04186332, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.74112377533005, + "language_loss": 0.80978471, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83127558, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6045, + "time_per_iteration": 2.482147693634033 + }, + { + "auxiliary_loss_clip": 0.01122131, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.04342091, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.7414472049280392, + "language_loss": 0.64214617, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66375309, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6046, + "time_per_iteration": 2.593209743499756 + }, + { + "auxiliary_loss_clip": 0.01119542, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02142787, + "balance_loss_mlp": 1.04214859, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.623453692259123, + "language_loss": 0.77366132, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.7952106, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6047, + "time_per_iteration": 2.4650797843933105 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.02609777, + "balance_loss_mlp": 1.04148006, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.508802610673932, + "language_loss": 0.79679012, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81846434, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8359375, + "step": 6048, + "time_per_iteration": 2.5329999923706055 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99839348, + "balance_loss_mlp": 1.0124383, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7564639677567045, + "language_loss": 0.52584642, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54622656, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25390625, + "step": 6049, + "time_per_iteration": 3.1051762104034424 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.02172136, + "balance_loss_mlp": 1.04254675, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0453292842004833, + "language_loss": 0.86365628, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88522977, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6050, + "time_per_iteration": 2.469092845916748 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04309154, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7649295268136813, + "language_loss": 0.7855531, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80711287, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6051, + "time_per_iteration": 2.425166368484497 + }, + { + "auxiliary_loss_clip": 0.0111821, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02044129, + "balance_loss_mlp": 1.04047346, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 2.0280679706971423, + "language_loss": 0.83024764, + "learning_rate": 2.940291602812822e-06, + "loss": 0.8517735, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6052, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02146947, + "balance_loss_mlp": 1.03992438, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 3.055248278017369, + "language_loss": 0.72156489, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74305683, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6053, + "time_per_iteration": 4.030078887939453 + }, + { + "auxiliary_loss_clip": 0.01038499, + "auxiliary_loss_mlp": 0.01000787, + "balance_loss_clip": 0.99893934, + "balance_loss_mlp": 1.01315093, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7695228081579073, + "language_loss": 0.61234874, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63274157, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.25390625, + "step": 6054, + "time_per_iteration": 4.498634576797485 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.0425837, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.9647165397438333, + "language_loss": 0.75846946, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78007108, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6055, + "time_per_iteration": 2.46478271484375 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.0240891, + "balance_loss_mlp": 1.04369521, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.6567803669377452, + "language_loss": 0.75263339, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7742365, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6056, + "time_per_iteration": 2.4739041328430176 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.04331231, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.0844054878938607, + "language_loss": 0.80676425, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82835501, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6057, + "time_per_iteration": 2.4778594970703125 + }, + { + "auxiliary_loss_clip": 0.01119344, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02457452, + "balance_loss_mlp": 1.04333091, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8744131952209395, + "language_loss": 0.79986346, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82144856, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6058, + "time_per_iteration": 2.5267081260681152 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02142191, + "balance_loss_mlp": 1.04207647, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.8448855765347556, + "language_loss": 0.8485254, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87007678, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6059, + "time_per_iteration": 2.4876210689544678 + }, + { + "auxiliary_loss_clip": 0.01123355, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.02527666, + "balance_loss_mlp": 1.04397857, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.4958849024653313, + "language_loss": 0.8783946, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90002865, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6060, + "time_per_iteration": 2.516439199447632 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02583957, + "balance_loss_mlp": 1.04366183, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.6600271028380824, + "language_loss": 0.67965293, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70130551, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6061, + "time_per_iteration": 2.4436440467834473 + }, + { + "auxiliary_loss_clip": 0.01127668, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.02444267, + "balance_loss_mlp": 1.04622734, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.142951671935031, + "language_loss": 0.75072217, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77239573, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 6062, + "time_per_iteration": 2.4325368404388428 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01910567, + "balance_loss_mlp": 1.04460645, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.6782897381106048, + "language_loss": 0.72632384, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74789596, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6063, + "time_per_iteration": 2.498168468475342 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.0249579, + "balance_loss_mlp": 1.04365671, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8702732296649918, + "language_loss": 0.68128121, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70288265, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6064, + "time_per_iteration": 2.4951584339141846 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01046801, + "balance_loss_clip": 1.03205502, + "balance_loss_mlp": 1.04549003, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.679838788119498, + "language_loss": 0.74604851, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76777375, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6065, + "time_per_iteration": 2.4980344772338867 + }, + { + "auxiliary_loss_clip": 0.01125488, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.02544403, + "balance_loss_mlp": 1.04464209, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.8520658730284223, + "language_loss": 0.75248677, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77415788, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6066, + "time_per_iteration": 2.5525264739990234 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02101445, + "balance_loss_mlp": 1.04115653, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.55479391525507, + "language_loss": 0.76988614, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79139876, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6067, + "time_per_iteration": 2.440595865249634 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02422583, + "balance_loss_mlp": 1.04442596, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.1532465459722574, + "language_loss": 0.70826519, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72984099, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6068, + "time_per_iteration": 2.4555468559265137 + }, + { + "auxiliary_loss_clip": 0.01123082, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.04301953, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8428063971352102, + "language_loss": 0.73987395, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76148373, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 6069, + "time_per_iteration": 2.4380593299865723 + }, + { + "auxiliary_loss_clip": 0.01124432, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.0203104, + "balance_loss_mlp": 1.04434299, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.740540431199334, + "language_loss": 0.66149801, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68309319, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 6070, + "time_per_iteration": 2.4852278232574463 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.02225685, + "balance_loss_mlp": 1.04412127, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.5531027619052142, + "language_loss": 0.74474913, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76631367, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6071, + "time_per_iteration": 2.483961820602417 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.04232538, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 2.0347636440980277, + "language_loss": 0.88132894, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90287089, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6072, + "time_per_iteration": 2.4083876609802246 + }, + { + "auxiliary_loss_clip": 0.01121735, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.02184379, + "balance_loss_mlp": 1.04389739, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.230203116909298, + "language_loss": 0.72432441, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74589849, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6073, + "time_per_iteration": 2.4769015312194824 + }, + { + "auxiliary_loss_clip": 0.01123465, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.02520275, + "balance_loss_mlp": 1.04425395, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.8811318432297164, + "language_loss": 0.66584921, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68747932, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6074, + "time_per_iteration": 2.4474194049835205 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01891208, + "balance_loss_mlp": 1.04079318, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.5068114870819531, + "language_loss": 0.72946787, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75097322, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6075, + "time_per_iteration": 2.5063765048980713 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.04484594, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.7314154698808113, + "language_loss": 0.8938573, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91555977, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 6076, + "time_per_iteration": 2.4518303871154785 + }, + { + "auxiliary_loss_clip": 0.01121617, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02438378, + "balance_loss_mlp": 1.04457617, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.2164690925931976, + "language_loss": 0.69506466, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71667087, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6077, + "time_per_iteration": 2.447659730911255 + }, + { + "auxiliary_loss_clip": 0.01043202, + "auxiliary_loss_mlp": 0.01009421, + "balance_loss_clip": 1.00758541, + "balance_loss_mlp": 1.01693892, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7520139059893192, + "language_loss": 0.61798048, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63850671, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.26171875, + "step": 6078, + "time_per_iteration": 3.1669509410858154 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02238369, + "balance_loss_mlp": 1.04217839, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.8851740765331422, + "language_loss": 0.78088033, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80244297, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6079, + "time_per_iteration": 2.4570510387420654 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02517176, + "balance_loss_mlp": 1.04497504, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.129422570654268, + "language_loss": 0.62885886, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65051121, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6080, + "time_per_iteration": 2.65580415725708 + }, + { + "auxiliary_loss_clip": 0.01122781, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.04280567, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.4061972925673385, + "language_loss": 0.67665905, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69823289, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6081, + "time_per_iteration": 2.4747202396392822 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.02287912, + "balance_loss_mlp": 1.04305673, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.9082106177767983, + "language_loss": 0.74747473, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76910245, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 6082, + "time_per_iteration": 2.5238633155822754 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01576877, + "balance_loss_mlp": 1.04598689, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.8091692998669453, + "language_loss": 0.82823056, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84978318, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8046875, + "step": 6083, + "time_per_iteration": 2.517704963684082 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.00026309, + "balance_loss_mlp": 1.01621974, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8152809684063654, + "language_loss": 0.59372437, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61416495, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25585938, + "step": 6084, + "time_per_iteration": 3.126275062561035 + }, + { + "auxiliary_loss_clip": 0.01121734, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02273536, + "balance_loss_mlp": 1.04410744, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.719357970509058, + "language_loss": 0.73096633, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75255334, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6085, + "time_per_iteration": 2.436722755432129 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01485801, + "balance_loss_mlp": 1.0447793, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 4.360512376704014, + "language_loss": 0.7831111, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80462652, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 6086, + "time_per_iteration": 2.557521104812622 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0150919, + "balance_loss_mlp": 1.0403074, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.7974113126538098, + "language_loss": 0.77105325, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79248881, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6087, + "time_per_iteration": 2.544868230819702 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01587856, + "balance_loss_mlp": 1.04190612, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 5.741725291334025, + "language_loss": 0.70710862, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72863311, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6088, + "time_per_iteration": 2.491933822631836 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 1.04569137, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.6695945607154594, + "language_loss": 0.79878473, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82043338, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 6089, + "time_per_iteration": 2.666814088821411 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01816463, + "balance_loss_mlp": 1.04267049, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.7190941707632215, + "language_loss": 0.71335226, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73486418, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 6090, + "time_per_iteration": 2.5138063430786133 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02757084, + "balance_loss_mlp": 1.04391527, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.9784029627642763, + "language_loss": 0.74276829, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76437145, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6091, + "time_per_iteration": 2.437126636505127 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.04396391, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.8707748404117035, + "language_loss": 0.72492194, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74652249, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6092, + "time_per_iteration": 2.5038540363311768 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03072441, + "balance_loss_mlp": 1.04359424, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9548617375197639, + "language_loss": 0.78251863, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.8041966, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6093, + "time_per_iteration": 2.453854560852051 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.02226686, + "balance_loss_mlp": 1.04095936, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.7535936892187265, + "language_loss": 0.74123377, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76279384, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 6094, + "time_per_iteration": 2.5953075885772705 + }, + { + "auxiliary_loss_clip": 0.01125058, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.0381875, + "balance_loss_mlp": 1.04492939, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.5564182913572622, + "language_loss": 0.79226458, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81404281, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80078125, + "step": 6095, + "time_per_iteration": 5.4338037967681885 + }, + { + "auxiliary_loss_clip": 0.01125087, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.02055264, + "balance_loss_mlp": 1.04422212, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.287741364035224, + "language_loss": 0.73586392, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75747252, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 6096, + "time_per_iteration": 3.923590660095215 + }, + { + "auxiliary_loss_clip": 0.0112257, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.0195781, + "balance_loss_mlp": 1.04206252, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.2038030169597875, + "language_loss": 0.67285162, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69441259, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6097, + "time_per_iteration": 2.4843504428863525 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.0249629, + "balance_loss_mlp": 1.04401898, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.6404590263223953, + "language_loss": 0.77676886, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79839253, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 6098, + "time_per_iteration": 2.5663979053497314 + }, + { + "auxiliary_loss_clip": 0.0111895, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.02467644, + "balance_loss_mlp": 1.04334557, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.7512654587161538, + "language_loss": 0.73807114, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.7596488, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6099, + "time_per_iteration": 2.442549705505371 + }, + { + "auxiliary_loss_clip": 0.01116483, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.03380322, + "balance_loss_mlp": 1.04073739, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.739052204204903, + "language_loss": 0.84383607, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86547315, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6100, + "time_per_iteration": 2.4783878326416016 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02046633, + "balance_loss_mlp": 1.04215789, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.450199870045222, + "language_loss": 0.70504647, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72663701, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 6101, + "time_per_iteration": 2.4591257572174072 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.02629983, + "balance_loss_mlp": 1.04228854, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0513606804107543, + "language_loss": 0.76049435, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78212953, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.79296875, + "step": 6102, + "time_per_iteration": 2.491046190261841 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04445052, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.6383228145690705, + "language_loss": 0.69930172, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72093487, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 6103, + "time_per_iteration": 2.676790952682495 + }, + { + "auxiliary_loss_clip": 0.01121704, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02213061, + "balance_loss_mlp": 1.0423454, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8701272650505458, + "language_loss": 0.71414149, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73572791, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6104, + "time_per_iteration": 2.438197374343872 + }, + { + "auxiliary_loss_clip": 0.01121263, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.04288161, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.0275913231037923, + "language_loss": 0.81653488, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83807302, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6105, + "time_per_iteration": 2.437201976776123 + }, + { + "auxiliary_loss_clip": 0.0112675, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.02255476, + "balance_loss_mlp": 1.0441767, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7477833912391936, + "language_loss": 0.81079835, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83243787, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 6106, + "time_per_iteration": 2.5447771549224854 + }, + { + "auxiliary_loss_clip": 0.01041229, + "auxiliary_loss_mlp": 0.0100622, + "balance_loss_clip": 1.00440836, + "balance_loss_mlp": 1.01511836, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6829750500510474, + "language_loss": 0.59212124, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.6125958, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.26171875, + "step": 6107, + "time_per_iteration": 3.0983083248138428 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.01756859, + "balance_loss_mlp": 1.04195333, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.5524752326282045, + "language_loss": 0.74417794, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7656877, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6108, + "time_per_iteration": 2.5146114826202393 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02218354, + "balance_loss_mlp": 1.04104972, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.0732100862766294, + "language_loss": 0.73141801, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7529856, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 6109, + "time_per_iteration": 2.4597368240356445 + }, + { + "auxiliary_loss_clip": 0.01118669, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02015734, + "balance_loss_mlp": 1.0407654, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.8280489650426288, + "language_loss": 0.53282952, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55435723, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6110, + "time_per_iteration": 2.5454814434051514 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.01949728, + "balance_loss_mlp": 1.04360104, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4515242715586747, + "language_loss": 0.8026799, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82422882, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76953125, + "step": 6111, + "time_per_iteration": 2.4838016033172607 + }, + { + "auxiliary_loss_clip": 0.01119124, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02637434, + "balance_loss_mlp": 1.04195952, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7574831080907656, + "language_loss": 0.72220403, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74380273, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6112, + "time_per_iteration": 2.590109348297119 + }, + { + "auxiliary_loss_clip": 0.01120572, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.04220295, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6166739673118746, + "language_loss": 0.85398543, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87558413, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6113, + "time_per_iteration": 2.4480674266815186 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.02228022, + "balance_loss_mlp": 1.04214144, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.8814317352542869, + "language_loss": 0.78741604, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80901164, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 6114, + "time_per_iteration": 2.4870779514312744 + }, + { + "auxiliary_loss_clip": 0.01125295, + "auxiliary_loss_mlp": 0.01044195, + "balance_loss_clip": 1.0278033, + "balance_loss_mlp": 1.04344988, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.5830307408310422, + "language_loss": 0.66854429, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69023919, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 6115, + "time_per_iteration": 2.4361841678619385 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.01953745, + "balance_loss_mlp": 1.03984118, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.406761648754093, + "language_loss": 0.76663208, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78811574, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6116, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01119646, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.04111099, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.9705222106020779, + "language_loss": 0.62811542, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64971662, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 6117, + "time_per_iteration": 2.443798065185547 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.019261, + "balance_loss_mlp": 1.04137671, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9131647495504847, + "language_loss": 0.72974634, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75126612, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6118, + "time_per_iteration": 2.531804084777832 + }, + { + "auxiliary_loss_clip": 0.01123956, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.02694678, + "balance_loss_mlp": 1.04156733, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.002097677722335, + "language_loss": 0.72413695, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7457996, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 6119, + "time_per_iteration": 2.4641144275665283 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.0179317, + "balance_loss_mlp": 1.04397964, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.84976209385018, + "language_loss": 0.79848421, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82002181, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6120, + "time_per_iteration": 2.487030029296875 + }, + { + "auxiliary_loss_clip": 0.01117761, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.02612031, + "balance_loss_mlp": 1.04084468, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8961465807450149, + "language_loss": 0.63855267, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66013169, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6121, + "time_per_iteration": 2.4573564529418945 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.02755642, + "balance_loss_mlp": 1.0431416, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.8845840511442051, + "language_loss": 0.71209222, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73374552, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6122, + "time_per_iteration": 2.5197854042053223 + }, + { + "auxiliary_loss_clip": 0.01116909, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02091694, + "balance_loss_mlp": 1.04319501, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8566190114316727, + "language_loss": 0.69493115, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71644878, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6123, + "time_per_iteration": 2.5585381984710693 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.04312396, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.995367064863914, + "language_loss": 0.73392212, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.7555719, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6124, + "time_per_iteration": 2.56925368309021 + }, + { + "auxiliary_loss_clip": 0.01121929, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02465916, + "balance_loss_mlp": 1.04337013, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.997016319446362, + "language_loss": 0.74426562, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76589334, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.7890625, + "step": 6125, + "time_per_iteration": 2.493232488632202 + }, + { + "auxiliary_loss_clip": 0.01124729, + "auxiliary_loss_mlp": 0.01046169, + "balance_loss_clip": 1.03009367, + "balance_loss_mlp": 1.04400194, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.8135805598812564, + "language_loss": 0.78254056, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80424947, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6126, + "time_per_iteration": 2.4767327308654785 + }, + { + "auxiliary_loss_clip": 0.01123227, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.02125943, + "balance_loss_mlp": 1.04164457, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.275366104968191, + "language_loss": 0.66100526, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68261528, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.81640625, + "step": 6127, + "time_per_iteration": 2.4442801475524902 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02783585, + "balance_loss_mlp": 1.04527378, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.809419798014635, + "language_loss": 0.70553637, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72722864, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6128, + "time_per_iteration": 2.6163570880889893 + }, + { + "auxiliary_loss_clip": 0.01121361, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.0266788, + "balance_loss_mlp": 1.04374862, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.366686546837111, + "language_loss": 0.75425905, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77588773, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6129, + "time_per_iteration": 2.418318510055542 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.02652466, + "balance_loss_mlp": 1.0419023, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.583632674026135, + "language_loss": 0.84801334, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86962497, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6130, + "time_per_iteration": 2.4933249950408936 + }, + { + "auxiliary_loss_clip": 0.01041681, + "auxiliary_loss_mlp": 0.010081, + "balance_loss_clip": 1.00623989, + "balance_loss_mlp": 1.01602125, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8093683158704721, + "language_loss": 0.60352623, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62402403, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2578125, + "step": 6131, + "time_per_iteration": 3.1686718463897705 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01657534, + "balance_loss_mlp": 1.04083943, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5510352980860918, + "language_loss": 0.72903317, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75052321, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6132, + "time_per_iteration": 2.54154109954834 + }, + { + "auxiliary_loss_clip": 0.01124361, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02221942, + "balance_loss_mlp": 1.04263651, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.665822939326855, + "language_loss": 0.74255228, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76417446, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.81640625, + "step": 6133, + "time_per_iteration": 2.501119375228882 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.02420318, + "balance_loss_mlp": 1.04308438, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.60564703390979, + "language_loss": 0.71415824, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73572183, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6134, + "time_per_iteration": 2.472978353500366 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.0202322, + "balance_loss_mlp": 1.04333591, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.501197032587339, + "language_loss": 0.74985242, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77141684, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.77734375, + "step": 6135, + "time_per_iteration": 2.458523750305176 + }, + { + "auxiliary_loss_clip": 0.01043215, + "auxiliary_loss_mlp": 0.01004045, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.01762199, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8063752733434837, + "language_loss": 0.5878793, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60835183, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.25585938, + "step": 6136, + "time_per_iteration": 2.9917385578155518 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.02095878, + "balance_loss_mlp": 1.04477668, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8816926848284692, + "language_loss": 0.78812146, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80970407, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6137, + "time_per_iteration": 6.900243520736694 + }, + { + "auxiliary_loss_clip": 0.01122666, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.02594304, + "balance_loss_mlp": 1.04392326, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0278297083458345, + "language_loss": 0.74142605, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6138, + "time_per_iteration": 2.5056889057159424 + }, + { + "auxiliary_loss_clip": 0.01127012, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.02365959, + "balance_loss_mlp": 1.04482222, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.957735157830462, + "language_loss": 0.64818108, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66984075, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6139, + "time_per_iteration": 2.5345380306243896 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04279661, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0918485574433734, + "language_loss": 0.71384197, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73543906, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6140, + "time_per_iteration": 2.4318323135375977 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.00999596, + "balance_loss_clip": 0.99771231, + "balance_loss_mlp": 1.01712704, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7479140823872853, + "language_loss": 0.59281325, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61323869, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.2578125, + "step": 6141, + "time_per_iteration": 3.1505937576293945 + }, + { + "auxiliary_loss_clip": 0.01122987, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02499199, + "balance_loss_mlp": 1.04369187, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.768624510630746, + "language_loss": 0.7473368, + "learning_rate": 2.909212678216192e-06, + "loss": 0.76896417, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6142, + "time_per_iteration": 2.4768457412719727 + }, + { + "auxiliary_loss_clip": 0.01119694, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.02291358, + "balance_loss_mlp": 1.04270506, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.5385068391341603, + "language_loss": 0.76985848, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79142308, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6143, + "time_per_iteration": 2.4604313373565674 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02289248, + "balance_loss_mlp": 1.04277074, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4994482416842545, + "language_loss": 0.81616801, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.83771598, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6144, + "time_per_iteration": 2.529298782348633 + }, + { + "auxiliary_loss_clip": 0.0112261, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.02425694, + "balance_loss_mlp": 1.04323006, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 1.9122738225408384, + "language_loss": 0.77019674, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79180729, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.79296875, + "step": 6145, + "time_per_iteration": 2.4642515182495117 + }, + { + "auxiliary_loss_clip": 0.01123051, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.01760387, + "balance_loss_mlp": 1.04384804, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7518336089815172, + "language_loss": 0.76903462, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79058653, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.79296875, + "step": 6146, + "time_per_iteration": 2.49208927154541 + }, + { + "auxiliary_loss_clip": 0.01125412, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02746034, + "balance_loss_mlp": 1.04481673, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7861503855196468, + "language_loss": 0.80794239, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82962638, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6147, + "time_per_iteration": 2.417968988418579 + }, + { + "auxiliary_loss_clip": 0.01120028, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.0238626, + "balance_loss_mlp": 1.04083371, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7356953572419536, + "language_loss": 0.83196342, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85353833, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.79296875, + "step": 6148, + "time_per_iteration": 2.4493086338043213 + }, + { + "auxiliary_loss_clip": 0.01118838, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.02424645, + "balance_loss_mlp": 1.04304922, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.337121678381176, + "language_loss": 0.74373478, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76530743, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6149, + "time_per_iteration": 2.4594686031341553 + }, + { + "auxiliary_loss_clip": 0.01124701, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02210915, + "balance_loss_mlp": 1.04449439, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.7562888589836316, + "language_loss": 0.70538592, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72701365, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6150, + "time_per_iteration": 2.5232975482940674 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04390609, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.6469943204532072, + "language_loss": 0.82023048, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84183264, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6151, + "time_per_iteration": 2.448066473007202 + }, + { + "auxiliary_loss_clip": 0.01036606, + "auxiliary_loss_mlp": 0.01001329, + "balance_loss_clip": 0.99951726, + "balance_loss_mlp": 1.01119328, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.838014312453704, + "language_loss": 0.63083476, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65121406, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 6152, + "time_per_iteration": 3.170707941055298 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.02347398, + "balance_loss_mlp": 1.0429337, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 1.8166659348284784, + "language_loss": 0.70360208, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72515202, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6153, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01123537, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02420688, + "balance_loss_mlp": 1.04319179, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.0600031325492107, + "language_loss": 0.72201782, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74364597, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6154, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0111958, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01624274, + "balance_loss_mlp": 1.04201758, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.8383479148193087, + "language_loss": 0.67877179, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70026708, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6155, + "time_per_iteration": 2.454582929611206 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.01518905, + "balance_loss_mlp": 1.0420723, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.7213710867444976, + "language_loss": 0.67835188, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.6998316, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6156, + "time_per_iteration": 2.456244707107544 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.02461255, + "balance_loss_mlp": 1.04180884, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7871024658649661, + "language_loss": 0.82324016, + "learning_rate": 2.904005448099916e-06, + "loss": 0.8447994, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6157, + "time_per_iteration": 2.467258930206299 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.04224074, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.319348977212497, + "language_loss": 0.76519799, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78679597, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6158, + "time_per_iteration": 2.4462850093841553 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.02276468, + "balance_loss_mlp": 1.04128695, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.3237426114128903, + "language_loss": 0.6888833, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71047246, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 6159, + "time_per_iteration": 2.444615364074707 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.02298164, + "balance_loss_mlp": 1.04054952, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7829911261722147, + "language_loss": 0.7101602, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73170245, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 6160, + "time_per_iteration": 2.4807472229003906 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01808465, + "balance_loss_mlp": 1.04033566, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.5671410195286926, + "language_loss": 0.79049259, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81194532, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6161, + "time_per_iteration": 2.445615768432617 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.02266204, + "balance_loss_mlp": 1.04217172, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.6578530571842398, + "language_loss": 0.7961942, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81776464, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6162, + "time_per_iteration": 2.474179267883301 + }, + { + "auxiliary_loss_clip": 0.01118518, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.02122831, + "balance_loss_mlp": 1.04136944, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.751569507310971, + "language_loss": 0.79592955, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81746811, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6163, + "time_per_iteration": 2.429410696029663 + }, + { + "auxiliary_loss_clip": 0.01121642, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.01815772, + "balance_loss_mlp": 1.04239571, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6995697719291154, + "language_loss": 0.68002689, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70157188, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6164, + "time_per_iteration": 2.4500439167022705 + }, + { + "auxiliary_loss_clip": 0.01125233, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.0206207, + "balance_loss_mlp": 1.04507017, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.4697759057606197, + "language_loss": 0.82807398, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.84968388, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6165, + "time_per_iteration": 2.4863715171813965 + }, + { + "auxiliary_loss_clip": 0.01125688, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02060854, + "balance_loss_mlp": 1.04388845, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.8224972170046692, + "language_loss": 0.69500774, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71663356, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.81640625, + "step": 6166, + "time_per_iteration": 2.560605049133301 + }, + { + "auxiliary_loss_clip": 0.01038031, + "auxiliary_loss_mlp": 0.01006399, + "balance_loss_clip": 1.00471771, + "balance_loss_mlp": 1.01302195, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.8093247029889314, + "language_loss": 0.56892115, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58936548, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6167, + "time_per_iteration": 2.922917127609253 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01992154, + "balance_loss_mlp": 1.04288507, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.945139483069219, + "language_loss": 0.75539452, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77691436, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6168, + "time_per_iteration": 2.4489872455596924 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.02202857, + "balance_loss_mlp": 1.04180634, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.5586684776543853, + "language_loss": 0.7432459, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76480508, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6169, + "time_per_iteration": 2.4537463188171387 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01891184, + "balance_loss_mlp": 1.04480267, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.3706540261028175, + "language_loss": 0.79311681, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81465161, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6170, + "time_per_iteration": 2.4723992347717285 + }, + { + "auxiliary_loss_clip": 0.01122845, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.02390242, + "balance_loss_mlp": 1.04451621, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.6235616399590074, + "language_loss": 0.76385272, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78546989, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6171, + "time_per_iteration": 2.5364768505096436 + }, + { + "auxiliary_loss_clip": 0.01123724, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.01663446, + "balance_loss_mlp": 1.04594254, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9768297571305458, + "language_loss": 0.80696416, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82852054, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6172, + "time_per_iteration": 2.451099395751953 + }, + { + "auxiliary_loss_clip": 0.01124197, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.0219543, + "balance_loss_mlp": 1.04385138, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.2157067962534875, + "language_loss": 0.59447742, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61609542, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 6173, + "time_per_iteration": 2.5750677585601807 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.02122533, + "balance_loss_mlp": 1.04391754, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.9248503394254857, + "language_loss": 0.81157243, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83315188, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6174, + "time_per_iteration": 2.421182155609131 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.02035165, + "balance_loss_mlp": 1.04281855, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.8542839121663495, + "language_loss": 0.79834068, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81985891, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6175, + "time_per_iteration": 2.533447027206421 + }, + { + "auxiliary_loss_clip": 0.01124428, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.03194535, + "balance_loss_mlp": 1.04644537, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.6734071315129293, + "language_loss": 0.88764346, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90935433, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6176, + "time_per_iteration": 2.486224412918091 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02174938, + "balance_loss_mlp": 1.04402244, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5809846817738957, + "language_loss": 0.73293233, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75451624, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6177, + "time_per_iteration": 2.492033004760742 + }, + { + "auxiliary_loss_clip": 0.01119881, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04359818, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8832415058442271, + "language_loss": 0.75425023, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77584344, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6178, + "time_per_iteration": 4.005537748336792 + }, + { + "auxiliary_loss_clip": 0.01123036, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.01896191, + "balance_loss_mlp": 1.04618645, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.761738877644596, + "language_loss": 0.7228415, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74440265, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6179, + "time_per_iteration": 5.333393812179565 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.01987052, + "balance_loss_mlp": 1.04356897, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.1666258639633518, + "language_loss": 0.69705212, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71862751, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6180, + "time_per_iteration": 2.4896974563598633 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.01695561, + "balance_loss_mlp": 1.04157031, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7756296340851163, + "language_loss": 0.77702844, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79851079, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6181, + "time_per_iteration": 2.4324231147766113 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.0202775, + "balance_loss_mlp": 1.04225945, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.8526172549307973, + "language_loss": 0.78767365, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80920726, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6182, + "time_per_iteration": 2.47566819190979 + }, + { + "auxiliary_loss_clip": 0.01036072, + "auxiliary_loss_mlp": 0.01008449, + "balance_loss_clip": 1.00650644, + "balance_loss_mlp": 1.01082778, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7841437663574693, + "language_loss": 0.5748502, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59529543, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25195312, + "step": 6183, + "time_per_iteration": 3.0538721084594727 + }, + { + "auxiliary_loss_clip": 0.01124733, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02772832, + "balance_loss_mlp": 1.04238844, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.1996761862640715, + "language_loss": 0.76940209, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79108441, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.82421875, + "step": 6184, + "time_per_iteration": 2.4653987884521484 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.02268612, + "balance_loss_mlp": 1.04353404, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 13.965274526936179, + "language_loss": 0.72047049, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74203539, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6185, + "time_per_iteration": 2.458340644836426 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.01282895, + "balance_loss_mlp": 1.04169369, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.55661462109525, + "language_loss": 0.7702297, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79167652, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6186, + "time_per_iteration": 2.4665393829345703 + }, + { + "auxiliary_loss_clip": 0.01125099, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.0436089, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8483894715485976, + "language_loss": 0.83475709, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85642433, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8125, + "step": 6187, + "time_per_iteration": 2.520294427871704 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02228546, + "balance_loss_mlp": 1.0421021, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 2.555128723697134, + "language_loss": 0.84544367, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86700106, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6188, + "time_per_iteration": 2.4926793575286865 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01966548, + "balance_loss_mlp": 1.04392672, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6829112555225307, + "language_loss": 0.65646267, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67802715, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7734375, + "step": 6189, + "time_per_iteration": 2.447175979614258 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.02514815, + "balance_loss_mlp": 1.04456878, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.6073714147883162, + "language_loss": 0.83948457, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.8611058, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6190, + "time_per_iteration": 2.4410126209259033 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.01949084, + "balance_loss_mlp": 1.04337156, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.3404623023220643, + "language_loss": 0.88506198, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90665835, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 6191, + "time_per_iteration": 2.452972650527954 + }, + { + "auxiliary_loss_clip": 0.01123549, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.01648057, + "balance_loss_mlp": 1.04218102, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.570395080331924, + "language_loss": 0.74228191, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76384884, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8125, + "step": 6192, + "time_per_iteration": 2.6486353874206543 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02246475, + "balance_loss_mlp": 1.0427109, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.4820365699908944, + "language_loss": 0.79760754, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81916732, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6193, + "time_per_iteration": 2.525973081588745 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.02436423, + "balance_loss_mlp": 1.043504, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7895472081978328, + "language_loss": 0.84495157, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86657262, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6194, + "time_per_iteration": 2.419099807739258 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02080166, + "balance_loss_mlp": 1.04037666, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.9207659578016463, + "language_loss": 0.77555239, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79708451, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 6195, + "time_per_iteration": 2.3995044231414795 + }, + { + "auxiliary_loss_clip": 0.01119279, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01814234, + "balance_loss_mlp": 1.0428412, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.677102671463593, + "language_loss": 0.79111922, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81263697, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 6196, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.02235723, + "balance_loss_mlp": 1.04315817, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.8393036550873767, + "language_loss": 0.8332746, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85483867, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6197, + "time_per_iteration": 2.392005443572998 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0202899, + "balance_loss_mlp": 1.04070568, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 2.267147370646453, + "language_loss": 0.64613056, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66764355, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 6198, + "time_per_iteration": 2.4624876976013184 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.02989507, + "balance_loss_mlp": 1.04129016, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 2.4815957641530084, + "language_loss": 0.7439245, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76552987, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6199, + "time_per_iteration": 2.454932689666748 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.04112601, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.569210214205425, + "language_loss": 0.80711329, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82861221, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 6200, + "time_per_iteration": 2.853854179382324 + }, + { + "auxiliary_loss_clip": 0.01118801, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.02588272, + "balance_loss_mlp": 1.04248428, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 2.046105641958108, + "language_loss": 0.60723466, + "learning_rate": 2.88868657651991e-06, + "loss": 0.6288271, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6201, + "time_per_iteration": 2.58642315864563 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01813745, + "balance_loss_mlp": 1.04334736, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.5967185311646992, + "language_loss": 0.72980845, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75135767, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6202, + "time_per_iteration": 2.461116075515747 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.04372942, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.8761852736669793, + "language_loss": 0.739654, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76120287, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6203, + "time_per_iteration": 2.4199976921081543 + }, + { + "auxiliary_loss_clip": 0.01113815, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.03933048, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6894031212763305, + "language_loss": 0.81359541, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83506644, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 6204, + "time_per_iteration": 2.527442693710327 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.02538753, + "balance_loss_mlp": 1.04287875, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.5818895271767701, + "language_loss": 0.75028086, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77190769, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6205, + "time_per_iteration": 2.515028953552246 + }, + { + "auxiliary_loss_clip": 0.01118084, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02002704, + "balance_loss_mlp": 1.04183412, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8699710225203796, + "language_loss": 0.78044879, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80197906, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.76171875, + "step": 6206, + "time_per_iteration": 2.433136224746704 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.04182768, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.1358392378140487, + "language_loss": 0.93595111, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95747221, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6207, + "time_per_iteration": 2.422592878341675 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01774943, + "balance_loss_mlp": 1.04154027, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.238385364236049, + "language_loss": 0.82666922, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84819084, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6208, + "time_per_iteration": 2.5171287059783936 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01911068, + "balance_loss_mlp": 1.04320371, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 1.7601988102738153, + "language_loss": 0.73197794, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75355148, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6209, + "time_per_iteration": 2.480943202972412 + }, + { + "auxiliary_loss_clip": 0.01120081, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02178252, + "balance_loss_mlp": 1.0430553, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.4781766070975684, + "language_loss": 0.69951272, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72108591, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6210, + "time_per_iteration": 2.5063016414642334 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01533842, + "balance_loss_mlp": 1.04171228, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.960293983782413, + "language_loss": 0.77729124, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79881245, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6211, + "time_per_iteration": 2.4845266342163086 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.0240593, + "balance_loss_mlp": 1.04219186, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.9911666037414828, + "language_loss": 0.73026669, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75187218, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6212, + "time_per_iteration": 2.615323066711426 + }, + { + "auxiliary_loss_clip": 0.01130473, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.03231955, + "balance_loss_mlp": 1.04560018, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 4.00760557025762, + "language_loss": 0.81895888, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84074175, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84765625, + "step": 6213, + "time_per_iteration": 2.4621500968933105 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.04143643, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.2631910468903014, + "language_loss": 0.7890203, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81060612, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6214, + "time_per_iteration": 2.5582997798919678 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.02859902, + "balance_loss_mlp": 1.04069364, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.7789401165216012, + "language_loss": 0.84881294, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87041962, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6215, + "time_per_iteration": 2.6216535568237305 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.02595592, + "balance_loss_mlp": 1.04088581, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 5.614431195109344, + "language_loss": 0.67669535, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69832802, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80859375, + "step": 6216, + "time_per_iteration": 2.4592814445495605 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.02500176, + "balance_loss_mlp": 1.04252148, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.041107256757408, + "language_loss": 0.65695626, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67857617, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6217, + "time_per_iteration": 2.50801420211792 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.02378845, + "balance_loss_mlp": 1.04290843, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 3.2488334570714725, + "language_loss": 0.80776107, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82938731, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80078125, + "step": 6218, + "time_per_iteration": 2.469524383544922 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.01851249, + "balance_loss_mlp": 1.04241216, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.3682227753048604, + "language_loss": 0.78710622, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80860579, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.74609375, + "step": 6219, + "time_per_iteration": 2.595862627029419 + }, + { + "auxiliary_loss_clip": 0.01119648, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.02776265, + "balance_loss_mlp": 1.0430454, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 2.1916352692915217, + "language_loss": 0.76985866, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79148126, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6220, + "time_per_iteration": 6.68864631652832 + }, + { + "auxiliary_loss_clip": 0.01120187, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02635062, + "balance_loss_mlp": 1.04149485, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.921342744454882, + "language_loss": 0.82958305, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85120487, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6221, + "time_per_iteration": 3.9474618434906006 + }, + { + "auxiliary_loss_clip": 0.0111979, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.02852452, + "balance_loss_mlp": 1.04195023, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6461952088047174, + "language_loss": 0.75817096, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.7797966, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6222, + "time_per_iteration": 2.43192720413208 + }, + { + "auxiliary_loss_clip": 0.01121141, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.0191592, + "balance_loss_mlp": 1.04333961, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.6728060456550218, + "language_loss": 0.70215583, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72370636, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.77734375, + "step": 6223, + "time_per_iteration": 2.4719529151916504 + }, + { + "auxiliary_loss_clip": 0.01123096, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.04556298, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.209456781749309, + "language_loss": 0.69100869, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71258163, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6224, + "time_per_iteration": 2.6382336616516113 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.01942348, + "balance_loss_mlp": 1.04488885, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.8205395187863704, + "language_loss": 0.69828689, + "learning_rate": 2.880303258086228e-06, + "loss": 0.71983123, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6225, + "time_per_iteration": 2.501041889190674 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.02376127, + "balance_loss_mlp": 1.04357982, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.305559014636685, + "language_loss": 0.79056358, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81214118, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 6226, + "time_per_iteration": 2.485196113586426 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.02517128, + "balance_loss_mlp": 1.04342556, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.1155280603994546, + "language_loss": 0.68059194, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70221007, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6227, + "time_per_iteration": 2.553396463394165 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02044404, + "balance_loss_mlp": 1.04391932, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.719573737271176, + "language_loss": 0.82955533, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85109973, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6228, + "time_per_iteration": 2.449979305267334 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.0244565, + "balance_loss_mlp": 1.0452075, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.610770216359874, + "language_loss": 0.74802738, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76962447, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6229, + "time_per_iteration": 2.4768621921539307 + }, + { + "auxiliary_loss_clip": 0.01121137, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.02384853, + "balance_loss_mlp": 1.04209936, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.8233250091751425, + "language_loss": 0.83350682, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85510933, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6230, + "time_per_iteration": 2.4503889083862305 + }, + { + "auxiliary_loss_clip": 0.01125186, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.02383518, + "balance_loss_mlp": 1.04665947, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8327028169227884, + "language_loss": 0.73589134, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75753438, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6231, + "time_per_iteration": 2.5793888568878174 + }, + { + "auxiliary_loss_clip": 0.01126351, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.02754807, + "balance_loss_mlp": 1.04669595, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.0748427868287536, + "language_loss": 0.72982037, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75151008, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6232, + "time_per_iteration": 2.5400028228759766 + }, + { + "auxiliary_loss_clip": 0.01120736, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.01927304, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.7557793199484253, + "language_loss": 0.77042818, + "learning_rate": 2.877504536769561e-06, + "loss": 0.791982, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6233, + "time_per_iteration": 2.6110641956329346 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.02521205, + "balance_loss_mlp": 1.04520559, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.733253645903673, + "language_loss": 0.68936831, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71100628, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6234, + "time_per_iteration": 2.4476797580718994 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.02685833, + "balance_loss_mlp": 1.04514599, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.8436539021155727, + "language_loss": 0.82329285, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84491062, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 6235, + "time_per_iteration": 2.4766016006469727 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.01823175, + "balance_loss_mlp": 1.04744995, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8082481713782126, + "language_loss": 0.77776909, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79937214, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6236, + "time_per_iteration": 2.440678596496582 + }, + { + "auxiliary_loss_clip": 0.01124108, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.03139293, + "balance_loss_mlp": 1.04308259, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0063576687211704, + "language_loss": 0.73203218, + "learning_rate": 2.876104377085234e-06, + "loss": 0.7537601, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.80859375, + "step": 6237, + "time_per_iteration": 2.5782086849212646 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02257562, + "balance_loss_mlp": 1.04084682, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.2861902523152935, + "language_loss": 0.93017888, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.9517675, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6238, + "time_per_iteration": 2.514997720718384 + }, + { + "auxiliary_loss_clip": 0.01121834, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.01821709, + "balance_loss_mlp": 1.04316592, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9811721217026943, + "language_loss": 0.71066076, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73221493, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6239, + "time_per_iteration": 2.5054962635040283 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01659262, + "balance_loss_mlp": 1.04635918, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.6550300124553972, + "language_loss": 0.6566934, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67827761, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6240, + "time_per_iteration": 2.5776519775390625 + }, + { + "auxiliary_loss_clip": 0.01124905, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01703799, + "balance_loss_mlp": 1.04560649, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.0148493018475877, + "language_loss": 0.75634778, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77791047, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 6241, + "time_per_iteration": 2.503861904144287 + }, + { + "auxiliary_loss_clip": 0.01123464, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.04321361, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.5579725641576876, + "language_loss": 0.83610159, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85773861, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.80078125, + "step": 6242, + "time_per_iteration": 2.4933042526245117 + }, + { + "auxiliary_loss_clip": 0.01122935, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.02435803, + "balance_loss_mlp": 1.04265308, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.190530656574709, + "language_loss": 0.67888391, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70049673, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6243, + "time_per_iteration": 2.543820381164551 + }, + { + "auxiliary_loss_clip": 0.01121963, + "auxiliary_loss_mlp": 0.01038078, + "balance_loss_clip": 1.02241397, + "balance_loss_mlp": 1.04404676, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7974063962239055, + "language_loss": 0.84275806, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.86435848, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6244, + "time_per_iteration": 2.4710450172424316 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02229667, + "balance_loss_mlp": 1.0436101, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.387588700969948, + "language_loss": 0.83019805, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85175467, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6245, + "time_per_iteration": 2.4594197273254395 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.02207565, + "balance_loss_mlp": 1.04337263, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 1.94802763897559, + "language_loss": 0.64043313, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66203153, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6246, + "time_per_iteration": 2.4522809982299805 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0231837, + "balance_loss_mlp": 1.04382014, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7195896287931138, + "language_loss": 0.75146973, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77310807, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6247, + "time_per_iteration": 2.4527103900909424 + }, + { + "auxiliary_loss_clip": 0.01122539, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02281308, + "balance_loss_mlp": 1.04276609, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.472354315090956, + "language_loss": 0.55157161, + "learning_rate": 2.872251199697598e-06, + "loss": 0.5731746, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6248, + "time_per_iteration": 2.4399521350860596 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.02334976, + "balance_loss_mlp": 1.04241502, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.875026035710993, + "language_loss": 0.84247208, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86404997, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6249, + "time_per_iteration": 2.529763698577881 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.018713, + "balance_loss_mlp": 1.0427655, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.7253468577749267, + "language_loss": 0.68124413, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70278323, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6250, + "time_per_iteration": 2.572439193725586 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.02791047, + "balance_loss_mlp": 1.04538727, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.0419035804756716, + "language_loss": 0.77633286, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79799771, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6251, + "time_per_iteration": 2.58437442779541 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.01910138, + "balance_loss_mlp": 1.04232824, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.137051103462404, + "language_loss": 0.58463252, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60616934, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6252, + "time_per_iteration": 2.6117262840270996 + }, + { + "auxiliary_loss_clip": 0.01124494, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.02730918, + "balance_loss_mlp": 1.04393482, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.9959533965383836, + "language_loss": 0.89689183, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91856694, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 6253, + "time_per_iteration": 2.5241925716400146 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.0227623, + "balance_loss_mlp": 1.04618073, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9568868773694639, + "language_loss": 0.76368916, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78528988, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6254, + "time_per_iteration": 2.44631028175354 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.02906847, + "balance_loss_mlp": 1.04640615, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.019237604940679, + "language_loss": 0.61830014, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6400153, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6255, + "time_per_iteration": 2.474303960800171 + }, + { + "auxiliary_loss_clip": 0.01125813, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.02204537, + "balance_loss_mlp": 1.0434109, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.4357923747979675, + "language_loss": 0.74234015, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76397753, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.82421875, + "step": 6256, + "time_per_iteration": 2.4332830905914307 + }, + { + "auxiliary_loss_clip": 0.01129168, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.03029239, + "balance_loss_mlp": 1.04842019, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.807318668329893, + "language_loss": 0.70297635, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72472662, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80859375, + "step": 6257, + "time_per_iteration": 2.600249767303467 + }, + { + "auxiliary_loss_clip": 0.01123849, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01998889, + "balance_loss_mlp": 1.04582894, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.8628634379537026, + "language_loss": 0.84647095, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86805254, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6258, + "time_per_iteration": 2.443833351135254 + }, + { + "auxiliary_loss_clip": 0.01122949, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.02936888, + "balance_loss_mlp": 1.04430962, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.514941849696829, + "language_loss": 0.81009686, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83176237, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6259, + "time_per_iteration": 2.5727832317352295 + }, + { + "auxiliary_loss_clip": 0.01130377, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02660608, + "balance_loss_mlp": 1.04775453, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.8915772167347047, + "language_loss": 0.71919596, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.74092221, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 6260, + "time_per_iteration": 2.5225539207458496 + }, + { + "auxiliary_loss_clip": 0.0112693, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.02061951, + "balance_loss_mlp": 1.04538989, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.725193491542272, + "language_loss": 0.78423822, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80586827, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 6261, + "time_per_iteration": 2.4926671981811523 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01043226, + "balance_loss_clip": 1.02784848, + "balance_loss_mlp": 1.04861188, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.7544905551461754, + "language_loss": 0.80327791, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82503211, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 6262, + "time_per_iteration": 6.861605167388916 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.01796031, + "balance_loss_mlp": 1.04471791, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.7128267856657793, + "language_loss": 0.80543715, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82698023, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6263, + "time_per_iteration": 2.6574654579162598 + }, + { + "auxiliary_loss_clip": 0.01128017, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.02705324, + "balance_loss_mlp": 1.04757583, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.7435231382382033, + "language_loss": 0.80158919, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82328904, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6264, + "time_per_iteration": 2.4326720237731934 + }, + { + "auxiliary_loss_clip": 0.01122852, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03120613, + "balance_loss_mlp": 1.04323912, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.2579254623504585, + "language_loss": 0.73604524, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75773823, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6265, + "time_per_iteration": 2.481248617172241 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.02525079, + "balance_loss_mlp": 1.04878664, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.6798839148056366, + "language_loss": 0.68685853, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70850861, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6266, + "time_per_iteration": 2.517972946166992 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.02746832, + "balance_loss_mlp": 1.04570127, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 21.71943634627446, + "language_loss": 0.6330213, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65474188, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 6267, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.01049589, + "auxiliary_loss_mlp": 0.01002617, + "balance_loss_clip": 1.00076914, + "balance_loss_mlp": 1.02342653, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7181832227527338, + "language_loss": 0.58946306, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60998511, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.26171875, + "step": 6268, + "time_per_iteration": 3.168419361114502 + }, + { + "auxiliary_loss_clip": 0.011283, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.02268982, + "balance_loss_mlp": 1.04734302, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4797604992869704, + "language_loss": 0.65026355, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67193449, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8125, + "step": 6269, + "time_per_iteration": 2.5472333431243896 + }, + { + "auxiliary_loss_clip": 0.01127949, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.02263296, + "balance_loss_mlp": 1.05022144, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.46875421159053, + "language_loss": 0.70592397, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72758961, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6270, + "time_per_iteration": 2.4763948917388916 + }, + { + "auxiliary_loss_clip": 0.01045864, + "auxiliary_loss_mlp": 0.0100198, + "balance_loss_clip": 1.00021577, + "balance_loss_mlp": 1.02014744, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7024360778923162, + "language_loss": 0.56136239, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58184087, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 6271, + "time_per_iteration": 3.0738816261291504 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.02326441, + "balance_loss_mlp": 1.04638743, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.066611127756055, + "language_loss": 0.79340166, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.81503969, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.78125, + "step": 6272, + "time_per_iteration": 2.4686055183410645 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01802933, + "balance_loss_mlp": 1.04578209, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.4641670728096365, + "language_loss": 0.74172843, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76326972, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6273, + "time_per_iteration": 2.5079009532928467 + }, + { + "auxiliary_loss_clip": 0.01124789, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.03112721, + "balance_loss_mlp": 1.04621577, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4163029825487425, + "language_loss": 0.71801323, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73972082, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6274, + "time_per_iteration": 2.460338592529297 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.02277732, + "balance_loss_mlp": 1.04794264, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.663376044288712, + "language_loss": 0.83692443, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.85857534, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6275, + "time_per_iteration": 2.48319149017334 + }, + { + "auxiliary_loss_clip": 0.01121629, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.01949656, + "balance_loss_mlp": 1.04532933, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4340123311349162, + "language_loss": 0.75342453, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77496612, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6276, + "time_per_iteration": 2.5773236751556396 + }, + { + "auxiliary_loss_clip": 0.01127758, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.02724338, + "balance_loss_mlp": 1.04667568, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.858122502551201, + "language_loss": 0.85519129, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87689614, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6277, + "time_per_iteration": 2.5827369689941406 + }, + { + "auxiliary_loss_clip": 0.01123645, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.01833546, + "balance_loss_mlp": 1.04713118, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.807350675061797, + "language_loss": 0.78055024, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80210936, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6278, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01128448, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_clip": 1.02795196, + "balance_loss_mlp": 1.04698181, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.933979010172509, + "language_loss": 0.82702643, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.84875309, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6279, + "time_per_iteration": 2.538426160812378 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.02310467, + "balance_loss_mlp": 1.04578614, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.0225623598483358, + "language_loss": 0.74985826, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77148765, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 6280, + "time_per_iteration": 2.5161032676696777 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02283478, + "balance_loss_mlp": 1.04662085, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4438938373085308, + "language_loss": 0.76017272, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78177071, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6281, + "time_per_iteration": 2.504711151123047 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.02049732, + "balance_loss_mlp": 1.04368496, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.7476205657776698, + "language_loss": 0.8391279, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86070192, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6282, + "time_per_iteration": 2.4668593406677246 + }, + { + "auxiliary_loss_clip": 0.01120742, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.02337587, + "balance_loss_mlp": 1.04434681, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.8037618077250128, + "language_loss": 0.70150751, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72309422, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6283, + "time_per_iteration": 2.481948137283325 + }, + { + "auxiliary_loss_clip": 0.0112321, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.04516089, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.804590454145544, + "language_loss": 0.76529062, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78697532, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6284, + "time_per_iteration": 2.462968349456787 + }, + { + "auxiliary_loss_clip": 0.01130082, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.0466392, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.0529722445272167, + "language_loss": 0.85851312, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88015962, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 6285, + "time_per_iteration": 2.4435150623321533 + }, + { + "auxiliary_loss_clip": 0.01125611, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.0240438, + "balance_loss_mlp": 1.04457164, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.9682053367320125, + "language_loss": 0.83967972, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86133921, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6286, + "time_per_iteration": 2.4270951747894287 + }, + { + "auxiliary_loss_clip": 0.01123272, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.0268203, + "balance_loss_mlp": 1.04474115, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.20319687907872, + "language_loss": 0.81550682, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83715904, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6287, + "time_per_iteration": 2.4504740238189697 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.02559495, + "balance_loss_mlp": 1.04340911, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.428511311582982, + "language_loss": 0.73038173, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75200516, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6288, + "time_per_iteration": 2.4988601207733154 + }, + { + "auxiliary_loss_clip": 0.01126071, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.02103162, + "balance_loss_mlp": 1.04705048, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.726028925404572, + "language_loss": 0.75453335, + "learning_rate": 2.857854239668352e-06, + "loss": 0.7761566, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6289, + "time_per_iteration": 2.5323870182037354 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.04395676, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.9121243331279245, + "language_loss": 0.7341041, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75570655, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6290, + "time_per_iteration": 2.4703667163848877 + }, + { + "auxiliary_loss_clip": 0.01126076, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02388752, + "balance_loss_mlp": 1.0441103, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.4130424762969502, + "language_loss": 0.79729307, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81895649, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8203125, + "step": 6291, + "time_per_iteration": 2.590517520904541 + }, + { + "auxiliary_loss_clip": 0.01124797, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.02038157, + "balance_loss_mlp": 1.04347014, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7851511943573266, + "language_loss": 0.76090503, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78251249, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8125, + "step": 6292, + "time_per_iteration": 2.486375570297241 + }, + { + "auxiliary_loss_clip": 0.0112214, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_clip": 1.02708387, + "balance_loss_mlp": 1.04380596, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.8744506208430416, + "language_loss": 0.69510674, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71675801, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6293, + "time_per_iteration": 2.477025032043457 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.02140629, + "balance_loss_mlp": 1.04180205, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.812028848861632, + "language_loss": 0.71631789, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73788714, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6294, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.01128463, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.02050054, + "balance_loss_mlp": 1.04522586, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0852903309957815, + "language_loss": 0.8254326, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84707516, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 6295, + "time_per_iteration": 2.4684417247772217 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.01870751, + "balance_loss_mlp": 1.04352689, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.687128097470698, + "language_loss": 0.71806532, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73963046, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6296, + "time_per_iteration": 2.515676975250244 + }, + { + "auxiliary_loss_clip": 0.01119269, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02523708, + "balance_loss_mlp": 1.04370534, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.741193546240543, + "language_loss": 0.77094543, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79253769, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6297, + "time_per_iteration": 2.4617502689361572 + }, + { + "auxiliary_loss_clip": 0.01123428, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.02042699, + "balance_loss_mlp": 1.04360187, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.034703790395703, + "language_loss": 0.79179847, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81338429, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6298, + "time_per_iteration": 2.4516994953155518 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02001977, + "balance_loss_mlp": 1.04453242, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 2.0947541210526466, + "language_loss": 0.84758198, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86914611, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6299, + "time_per_iteration": 2.4814558029174805 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02153504, + "balance_loss_mlp": 1.04462421, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.218392777517032, + "language_loss": 0.7657811, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78737932, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 6300, + "time_per_iteration": 2.4615044593811035 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.02135265, + "balance_loss_mlp": 1.04486537, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.28104869272164, + "language_loss": 0.82490808, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84657955, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.828125, + "step": 6301, + "time_per_iteration": 2.4864752292633057 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.02967012, + "balance_loss_mlp": 1.04097867, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8461206090891127, + "language_loss": 0.67669666, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69833434, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6302, + "time_per_iteration": 2.501873016357422 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02617788, + "balance_loss_mlp": 1.04561174, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9271400579859064, + "language_loss": 0.68487787, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.7064997, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6303, + "time_per_iteration": 4.003960371017456 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02055335, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.8915662489351535, + "language_loss": 0.77611423, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79765135, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6304, + "time_per_iteration": 5.393261432647705 + }, + { + "auxiliary_loss_clip": 0.01127431, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.04611588, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.1278904960845724, + "language_loss": 0.80447114, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82612252, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6305, + "time_per_iteration": 2.471761703491211 + }, + { + "auxiliary_loss_clip": 0.01041012, + "auxiliary_loss_mlp": 0.0101182, + "balance_loss_clip": 1.01011562, + "balance_loss_mlp": 1.01491702, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9794242329238577, + "language_loss": 0.64524716, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66577548, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.26171875, + "step": 6306, + "time_per_iteration": 2.9702882766723633 + }, + { + "auxiliary_loss_clip": 0.01126961, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.03371215, + "balance_loss_mlp": 1.04693508, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.6253037153644523, + "language_loss": 0.73722827, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75898677, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6307, + "time_per_iteration": 2.508127450942993 + }, + { + "auxiliary_loss_clip": 0.01124488, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.02550268, + "balance_loss_mlp": 1.04390907, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.494726737463818, + "language_loss": 0.78469551, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80634576, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6308, + "time_per_iteration": 2.453012466430664 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.04146767, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.8302348181917263, + "language_loss": 0.73083341, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75244319, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6309, + "time_per_iteration": 2.495020866394043 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.02509165, + "balance_loss_mlp": 1.04503894, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4661467923449947, + "language_loss": 0.78449893, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80611867, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6310, + "time_per_iteration": 2.466533899307251 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.02562881, + "balance_loss_mlp": 1.04319441, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.894743489836823, + "language_loss": 0.76103079, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7826463, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6311, + "time_per_iteration": 2.4859142303466797 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.01906657, + "balance_loss_mlp": 1.04379332, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4829862533126659, + "language_loss": 0.71025705, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73180288, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6312, + "time_per_iteration": 2.4632480144500732 + }, + { + "auxiliary_loss_clip": 0.01041554, + "auxiliary_loss_mlp": 0.01005886, + "balance_loss_clip": 1.00425243, + "balance_loss_mlp": 1.01538157, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7762054489660294, + "language_loss": 0.56084001, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58131444, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 6313, + "time_per_iteration": 3.0646302700042725 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.02876949, + "balance_loss_mlp": 1.04362202, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 4.480184070608776, + "language_loss": 0.7158128, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73747301, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6314, + "time_per_iteration": 2.5263309478759766 + }, + { + "auxiliary_loss_clip": 0.01126357, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02331841, + "balance_loss_mlp": 1.04427075, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.7655759267809688, + "language_loss": 0.73132306, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75297308, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6315, + "time_per_iteration": 2.4753336906433105 + }, + { + "auxiliary_loss_clip": 0.0111862, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.02454782, + "balance_loss_mlp": 1.04206967, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0286726324195477, + "language_loss": 0.71049547, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73207021, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6316, + "time_per_iteration": 2.636176824569702 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.04524136, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8086467732489355, + "language_loss": 0.65270519, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67431247, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6317, + "time_per_iteration": 2.595952033996582 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.02174878, + "balance_loss_mlp": 1.04161143, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0501625369641867, + "language_loss": 0.85361171, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87515211, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6318, + "time_per_iteration": 2.4805264472961426 + }, + { + "auxiliary_loss_clip": 0.01124758, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04483223, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.489676718863087, + "language_loss": 0.76274204, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.784392, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6319, + "time_per_iteration": 2.4780025482177734 + }, + { + "auxiliary_loss_clip": 0.01123743, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02961218, + "balance_loss_mlp": 1.04587555, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6998661229427972, + "language_loss": 0.63923568, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66091597, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6320, + "time_per_iteration": 2.4700872898101807 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02375042, + "balance_loss_mlp": 1.04365289, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.883216130529445, + "language_loss": 0.7112022, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73279351, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6321, + "time_per_iteration": 2.5686967372894287 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02372098, + "balance_loss_mlp": 1.04298186, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.720302384597662, + "language_loss": 0.74730933, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76892447, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6322, + "time_per_iteration": 2.5368685722351074 + }, + { + "auxiliary_loss_clip": 0.01121658, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.02155948, + "balance_loss_mlp": 1.04405749, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.6715016816856787, + "language_loss": 0.84910119, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87068772, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 6323, + "time_per_iteration": 2.483771562576294 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.04395103, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.3955157937634586, + "language_loss": 0.73466647, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75625694, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.79296875, + "step": 6324, + "time_per_iteration": 2.4709885120391846 + }, + { + "auxiliary_loss_clip": 0.0112564, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02020979, + "balance_loss_mlp": 1.045573, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.6580896914625747, + "language_loss": 0.84147018, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86308414, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6325, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.01122273, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01833439, + "balance_loss_mlp": 1.04476464, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.7291759572194114, + "language_loss": 0.79642469, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81796801, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6326, + "time_per_iteration": 2.4206631183624268 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.04261708, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.8040593924859922, + "language_loss": 0.72696453, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74854851, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6327, + "time_per_iteration": 2.5964794158935547 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.02165246, + "balance_loss_mlp": 1.04614949, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6287717027141382, + "language_loss": 0.83090091, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85249579, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6328, + "time_per_iteration": 2.4602181911468506 + }, + { + "auxiliary_loss_clip": 0.01120102, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.01746464, + "balance_loss_mlp": 1.04347932, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.31755328246291, + "language_loss": 0.61384171, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63536435, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6329, + "time_per_iteration": 2.5268959999084473 + }, + { + "auxiliary_loss_clip": 0.01124125, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04603863, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.7232754549878644, + "language_loss": 0.5586049, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58026338, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6330, + "time_per_iteration": 2.450221061706543 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.02255917, + "balance_loss_mlp": 1.04540074, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.7778053530951745, + "language_loss": 0.65694439, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67849582, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6331, + "time_per_iteration": 2.544187545776367 + }, + { + "auxiliary_loss_clip": 0.01126283, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.02635133, + "balance_loss_mlp": 1.04744291, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.725296368277029, + "language_loss": 0.75737906, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77905744, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6332, + "time_per_iteration": 2.443654775619507 + }, + { + "auxiliary_loss_clip": 0.01123212, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.02546334, + "balance_loss_mlp": 1.04323936, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.2212054448627425, + "language_loss": 0.81889552, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84053469, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6333, + "time_per_iteration": 2.467007637023926 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.0190227, + "balance_loss_mlp": 1.04437923, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.583221243495577, + "language_loss": 0.86192155, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88346696, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6334, + "time_per_iteration": 2.521341323852539 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.02130485, + "balance_loss_mlp": 1.04498506, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.2115670432842847, + "language_loss": 0.79179001, + "learning_rate": 2.841636505323321e-06, + "loss": 0.8133806, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6335, + "time_per_iteration": 2.4648449420928955 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02027273, + "balance_loss_mlp": 1.04485524, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.872233235491229, + "language_loss": 0.72775364, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74935251, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6336, + "time_per_iteration": 2.443255662918091 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.0180763, + "balance_loss_mlp": 1.0430727, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.9910419737037044, + "language_loss": 0.69146657, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71297657, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6337, + "time_per_iteration": 2.4838876724243164 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02016187, + "balance_loss_mlp": 1.04606009, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9033617326941272, + "language_loss": 0.63247615, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65407151, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6338, + "time_per_iteration": 2.5538294315338135 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.02492189, + "balance_loss_mlp": 1.04498446, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8718033662194862, + "language_loss": 0.69288802, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71452975, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7890625, + "step": 6339, + "time_per_iteration": 2.490813970565796 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.0256902, + "balance_loss_mlp": 1.0461787, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.5980221539464914, + "language_loss": 0.68312418, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70477575, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6340, + "time_per_iteration": 2.4576282501220703 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02026618, + "balance_loss_mlp": 1.04393721, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.141170258916756, + "language_loss": 0.89404309, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91565144, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80859375, + "step": 6341, + "time_per_iteration": 2.4688920974731445 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.02477455, + "balance_loss_mlp": 1.04559851, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.5516456894508346, + "language_loss": 0.74665564, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76832652, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6342, + "time_per_iteration": 2.4610931873321533 + }, + { + "auxiliary_loss_clip": 0.0112203, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.04325199, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.6121504127073445, + "language_loss": 0.83334327, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85490513, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6343, + "time_per_iteration": 2.490952730178833 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.04305577, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.5912858717665679, + "language_loss": 0.76965082, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79125255, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6344, + "time_per_iteration": 2.458669424057007 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.02413464, + "balance_loss_mlp": 1.04601693, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.369132092535199, + "language_loss": 0.72790027, + "learning_rate": 2.838101929752593e-06, + "loss": 0.7495544, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6345, + "time_per_iteration": 5.361874341964722 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.02172494, + "balance_loss_mlp": 1.04348969, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.723509048793367, + "language_loss": 0.69687438, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71844268, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6346, + "time_per_iteration": 3.8780832290649414 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02351856, + "balance_loss_mlp": 1.04639161, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8691929226070287, + "language_loss": 0.75860906, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78024441, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6347, + "time_per_iteration": 2.4724838733673096 + }, + { + "auxiliary_loss_clip": 0.01121549, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.04272556, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.5494744961647557, + "language_loss": 0.74775678, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76933861, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6348, + "time_per_iteration": 2.4360201358795166 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.04346061, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.012782025185047, + "language_loss": 0.86987114, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89142847, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6349, + "time_per_iteration": 2.4653983116149902 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.02576792, + "balance_loss_mlp": 1.04300261, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 3.1419886249283624, + "language_loss": 0.76335979, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78497744, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6350, + "time_per_iteration": 2.4111151695251465 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01693547, + "balance_loss_mlp": 1.04389453, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.0441694615934325, + "language_loss": 0.76182568, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78337657, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.78125, + "step": 6351, + "time_per_iteration": 2.449831485748291 + }, + { + "auxiliary_loss_clip": 0.0112512, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.0235939, + "balance_loss_mlp": 1.04464602, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6974231581634962, + "language_loss": 0.74360836, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76525676, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6352, + "time_per_iteration": 2.5342295169830322 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02103508, + "balance_loss_mlp": 1.04153097, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.834359776939538, + "language_loss": 0.64362574, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66514015, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6353, + "time_per_iteration": 2.434100866317749 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02020061, + "balance_loss_mlp": 1.04363215, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.6268216674771125, + "language_loss": 0.83035302, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85189331, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6354, + "time_per_iteration": 2.4903476238250732 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02154744, + "balance_loss_mlp": 1.04571426, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.7360324347242302, + "language_loss": 0.8071996, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82876635, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6355, + "time_per_iteration": 2.5086817741394043 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.04464841, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.7080815693685156, + "language_loss": 0.75032043, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77187097, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6356, + "time_per_iteration": 2.471919298171997 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.02251887, + "balance_loss_mlp": 1.04420352, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8091380313160346, + "language_loss": 0.81251574, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83409309, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6357, + "time_per_iteration": 2.5302257537841797 + }, + { + "auxiliary_loss_clip": 0.01127375, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02465415, + "balance_loss_mlp": 1.04773057, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 3.08273691075534, + "language_loss": 0.77903318, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80071545, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.796875, + "step": 6358, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02576303, + "balance_loss_mlp": 1.0432725, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.070211767582473, + "language_loss": 0.78700459, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80863374, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6359, + "time_per_iteration": 2.4555094242095947 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02698255, + "balance_loss_mlp": 1.04290545, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.6399902686671113, + "language_loss": 0.69392359, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.7155236, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6360, + "time_per_iteration": 2.736069440841675 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.01632452, + "balance_loss_mlp": 1.04197633, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.9168722583294633, + "language_loss": 0.78836095, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80986238, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6361, + "time_per_iteration": 2.511254072189331 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.02274048, + "balance_loss_mlp": 1.04114652, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4566170801765106, + "language_loss": 0.65315771, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67468172, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6362, + "time_per_iteration": 2.632784128189087 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.04175615, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.8527291741217293, + "language_loss": 0.82063204, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84214544, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 6363, + "time_per_iteration": 2.4478373527526855 + }, + { + "auxiliary_loss_clip": 0.01119064, + "auxiliary_loss_mlp": 0.01042512, + "balance_loss_clip": 1.02837944, + "balance_loss_mlp": 1.0446111, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.811422380776527, + "language_loss": 0.58428323, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60589898, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6364, + "time_per_iteration": 2.655128002166748 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.02496374, + "balance_loss_mlp": 1.04423463, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.1451175401130893, + "language_loss": 0.68881112, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71043533, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6365, + "time_per_iteration": 2.51526141166687 + }, + { + "auxiliary_loss_clip": 0.01121408, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02065301, + "balance_loss_mlp": 1.04057527, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 4.555943608034253, + "language_loss": 0.73442698, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75600111, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8046875, + "step": 6366, + "time_per_iteration": 2.448585033416748 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02026677, + "balance_loss_mlp": 1.04226327, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.0234001922769327, + "language_loss": 0.68829554, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70985115, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6367, + "time_per_iteration": 2.569301128387451 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.04202485, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 4.344593393004367, + "language_loss": 0.6481666, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66967463, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 6368, + "time_per_iteration": 2.4531960487365723 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02053833, + "balance_loss_mlp": 1.04277039, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.315785818077373, + "language_loss": 0.68389189, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70544434, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6369, + "time_per_iteration": 2.5403318405151367 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.02774167, + "balance_loss_mlp": 1.04172897, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7184057003296296, + "language_loss": 0.78214431, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80374157, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 6370, + "time_per_iteration": 2.4397096633911133 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02569818, + "balance_loss_mlp": 1.04368424, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.8055794910549525, + "language_loss": 0.64556968, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66716546, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6371, + "time_per_iteration": 2.5470147132873535 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.0221653, + "balance_loss_mlp": 1.04452634, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.8238449128176952, + "language_loss": 0.72682339, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7484479, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6372, + "time_per_iteration": 2.47695255279541 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.02325058, + "balance_loss_mlp": 1.04308939, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.5970403518130607, + "language_loss": 0.84758627, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.86918551, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6373, + "time_per_iteration": 2.514571189880371 + }, + { + "auxiliary_loss_clip": 0.01124014, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02784181, + "balance_loss_mlp": 1.04392529, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 4.718004058381721, + "language_loss": 0.74721354, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76888537, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6374, + "time_per_iteration": 2.5505032539367676 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.04414058, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.8586580554057472, + "language_loss": 0.75701195, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77867097, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 6375, + "time_per_iteration": 2.467555522918701 + }, + { + "auxiliary_loss_clip": 0.01122331, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02300918, + "balance_loss_mlp": 1.04375613, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.257221103761015, + "language_loss": 0.72827101, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.7498709, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6376, + "time_per_iteration": 2.4082555770874023 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.04245007, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5879949283042905, + "language_loss": 0.67586625, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69745058, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.76171875, + "step": 6377, + "time_per_iteration": 2.54896879196167 + }, + { + "auxiliary_loss_clip": 0.01124961, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.02448511, + "balance_loss_mlp": 1.04608607, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.1973025079181117, + "language_loss": 0.72991705, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75156534, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6378, + "time_per_iteration": 2.4442975521087646 + }, + { + "auxiliary_loss_clip": 0.01121801, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02755964, + "balance_loss_mlp": 1.04327178, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.6808845830991803, + "language_loss": 0.69162869, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71326876, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6379, + "time_per_iteration": 2.529088258743286 + }, + { + "auxiliary_loss_clip": 0.01121458, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02344704, + "balance_loss_mlp": 1.04552865, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.6321901167852362, + "language_loss": 0.82979369, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85139024, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6380, + "time_per_iteration": 2.4336190223693848 + }, + { + "auxiliary_loss_clip": 0.01120843, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02090406, + "balance_loss_mlp": 1.04595208, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4297951270127425, + "language_loss": 0.81347466, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83503115, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6381, + "time_per_iteration": 2.5029306411743164 + }, + { + "auxiliary_loss_clip": 0.0104681, + "auxiliary_loss_mlp": 0.01005882, + "balance_loss_clip": 1.00420141, + "balance_loss_mlp": 1.02098966, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.796129115027233, + "language_loss": 0.60459685, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.6251238, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2578125, + "step": 6382, + "time_per_iteration": 3.0525829792022705 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.02186477, + "balance_loss_mlp": 1.04358447, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.302869327575685, + "language_loss": 0.66052485, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68212986, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6383, + "time_per_iteration": 2.5166289806365967 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.01944149, + "balance_loss_mlp": 1.04657924, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 2.2385812040155932, + "language_loss": 0.74811673, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76970243, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6384, + "time_per_iteration": 2.4451465606689453 + }, + { + "auxiliary_loss_clip": 0.01120418, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02219081, + "balance_loss_mlp": 1.04429483, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4336247312181014, + "language_loss": 0.75883526, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78040409, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6385, + "time_per_iteration": 2.4994513988494873 + }, + { + "auxiliary_loss_clip": 0.01046845, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 1.0002346, + "balance_loss_mlp": 1.02044809, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9433326566144719, + "language_loss": 0.67094183, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69143105, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.265625, + "step": 6386, + "time_per_iteration": 2.938122272491455 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.01933384, + "balance_loss_mlp": 1.0465281, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7796918810721745, + "language_loss": 0.72464442, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74619704, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6387, + "time_per_iteration": 5.465053081512451 + }, + { + "auxiliary_loss_clip": 0.01120429, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.0451014, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.6321565887315352, + "language_loss": 0.81181073, + "learning_rate": 2.822867208702932e-06, + "loss": 0.8334049, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6388, + "time_per_iteration": 3.940337657928467 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.02183485, + "balance_loss_mlp": 1.04249692, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6383752800672902, + "language_loss": 0.76158738, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78311884, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6389, + "time_per_iteration": 2.4720914363861084 + }, + { + "auxiliary_loss_clip": 0.01125023, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.04541564, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5616719605863645, + "language_loss": 0.76284117, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78453434, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6390, + "time_per_iteration": 2.4576520919799805 + }, + { + "auxiliary_loss_clip": 0.01124413, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.03295112, + "balance_loss_mlp": 1.04433882, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6285452565530243, + "language_loss": 0.70119178, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72292501, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6391, + "time_per_iteration": 2.5657877922058105 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.01978421, + "balance_loss_mlp": 1.04267848, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.938766253942268, + "language_loss": 0.84100312, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86256641, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6392, + "time_per_iteration": 2.4366884231567383 + }, + { + "auxiliary_loss_clip": 0.01120524, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.0215621, + "balance_loss_mlp": 1.04348612, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.11211623143903, + "language_loss": 0.61170864, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63326931, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6393, + "time_per_iteration": 2.428238868713379 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.04589796, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 2.3555579295861775, + "language_loss": 0.71295553, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73459029, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 6394, + "time_per_iteration": 2.483506679534912 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01760483, + "balance_loss_mlp": 1.04732203, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.3366242235467047, + "language_loss": 0.81172824, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83336329, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 6395, + "time_per_iteration": 2.471301317214966 + }, + { + "auxiliary_loss_clip": 0.01126851, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.02880275, + "balance_loss_mlp": 1.04770553, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 3.9526859148826707, + "language_loss": 0.70642132, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72812212, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6396, + "time_per_iteration": 2.4121108055114746 + }, + { + "auxiliary_loss_clip": 0.01046507, + "auxiliary_loss_mlp": 0.00999241, + "balance_loss_clip": 0.99745274, + "balance_loss_mlp": 1.01972008, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8889613923167966, + "language_loss": 0.59708536, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61754286, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.26757812, + "step": 6397, + "time_per_iteration": 3.1453351974487305 + }, + { + "auxiliary_loss_clip": 0.01123309, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.0459342, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.8498202803423767, + "language_loss": 0.84868926, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87023783, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6398, + "time_per_iteration": 2.488083839416504 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.0444839, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.942979036208199, + "language_loss": 0.79634017, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81787992, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6399, + "time_per_iteration": 2.4537224769592285 + }, + { + "auxiliary_loss_clip": 0.01124087, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.02149892, + "balance_loss_mlp": 1.04439902, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8928366067789952, + "language_loss": 0.67337728, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69498605, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.796875, + "step": 6400, + "time_per_iteration": 2.434598207473755 + }, + { + "auxiliary_loss_clip": 0.0112665, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_clip": 1.030936, + "balance_loss_mlp": 1.04645705, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.6542190438860391, + "language_loss": 0.73004973, + "learning_rate": 2.81824995589303e-06, + "loss": 0.7517767, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6401, + "time_per_iteration": 2.4963061809539795 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.02329874, + "balance_loss_mlp": 1.045017, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.9430058457885813, + "language_loss": 0.71920168, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74082762, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6402, + "time_per_iteration": 2.426349639892578 + }, + { + "auxiliary_loss_clip": 0.01118079, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.02007246, + "balance_loss_mlp": 1.04232907, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.7846208976590752, + "language_loss": 0.82449806, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84602368, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6403, + "time_per_iteration": 2.4700570106506348 + }, + { + "auxiliary_loss_clip": 0.0112163, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04500651, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.8891944292176732, + "language_loss": 0.82468271, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84628773, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.765625, + "step": 6404, + "time_per_iteration": 2.481968402862549 + }, + { + "auxiliary_loss_clip": 0.01122268, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.02136576, + "balance_loss_mlp": 1.04299283, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6334992055527433, + "language_loss": 0.69588619, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71746749, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6405, + "time_per_iteration": 2.5947635173797607 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.02475476, + "balance_loss_mlp": 1.04411674, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.9268009005119906, + "language_loss": 0.79068285, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81226277, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6406, + "time_per_iteration": 2.4195396900177 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02247298, + "balance_loss_mlp": 1.04682863, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.277779532957622, + "language_loss": 0.8438794, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86551487, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 6407, + "time_per_iteration": 2.4518916606903076 + }, + { + "auxiliary_loss_clip": 0.01043854, + "auxiliary_loss_mlp": 0.01007721, + "balance_loss_clip": 1.00623727, + "balance_loss_mlp": 1.01778841, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8214817017046727, + "language_loss": 0.64868087, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66919661, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.25976562, + "step": 6408, + "time_per_iteration": 3.090940475463867 + }, + { + "auxiliary_loss_clip": 0.01123062, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.02676785, + "balance_loss_mlp": 1.04405272, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.5501960898767924, + "language_loss": 0.73628408, + "learning_rate": 2.8154059613008e-06, + "loss": 0.7579453, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6409, + "time_per_iteration": 2.4831972122192383 + }, + { + "auxiliary_loss_clip": 0.01129844, + "auxiliary_loss_mlp": 0.01049195, + "balance_loss_clip": 1.03255367, + "balance_loss_mlp": 1.04574656, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.0394333066705874, + "language_loss": 0.70208335, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72387373, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 6410, + "time_per_iteration": 2.430617332458496 + }, + { + "auxiliary_loss_clip": 0.01043682, + "auxiliary_loss_mlp": 0.01003736, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.01802111, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6794214350275563, + "language_loss": 0.60311568, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62358987, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.2578125, + "step": 6411, + "time_per_iteration": 3.1681244373321533 + }, + { + "auxiliary_loss_clip": 0.01118542, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.04146707, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9543275921913768, + "language_loss": 0.7770192, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79849613, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6412, + "time_per_iteration": 2.4670822620391846 + }, + { + "auxiliary_loss_clip": 0.01124348, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.02093506, + "balance_loss_mlp": 1.0437274, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7609162802618283, + "language_loss": 0.78148544, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80310041, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6413, + "time_per_iteration": 2.4506192207336426 + }, + { + "auxiliary_loss_clip": 0.01040458, + "auxiliary_loss_mlp": 0.01006495, + "balance_loss_clip": 1.00485027, + "balance_loss_mlp": 1.01477003, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8068957555662655, + "language_loss": 0.61344963, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63391918, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.2578125, + "step": 6414, + "time_per_iteration": 2.897420883178711 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.02712834, + "balance_loss_mlp": 1.04452538, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.3808373048749543, + "language_loss": 0.77121973, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79288626, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6415, + "time_per_iteration": 2.455246686935425 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.01916933, + "balance_loss_mlp": 1.04303658, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6468091717833364, + "language_loss": 0.79597795, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81745458, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6416, + "time_per_iteration": 2.5162863731384277 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02100587, + "balance_loss_mlp": 1.04190922, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.6816352340920986, + "language_loss": 0.7957328, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81726366, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76953125, + "step": 6417, + "time_per_iteration": 2.462679862976074 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02369118, + "balance_loss_mlp": 1.03945839, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 9.924006648688666, + "language_loss": 0.80246758, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82400978, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6418, + "time_per_iteration": 2.4485208988189697 + }, + { + "auxiliary_loss_clip": 0.01114184, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.01992905, + "balance_loss_mlp": 1.03939319, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.9958339666442106, + "language_loss": 0.79694712, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81842011, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6419, + "time_per_iteration": 2.4360008239746094 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.01972449, + "balance_loss_mlp": 1.04120576, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0553625572614678, + "language_loss": 0.67804086, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69954103, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.73828125, + "step": 6420, + "time_per_iteration": 2.489661931991577 + }, + { + "auxiliary_loss_clip": 0.01116038, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.02286029, + "balance_loss_mlp": 1.04163957, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.4512212791744576, + "language_loss": 0.81831443, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83983916, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6421, + "time_per_iteration": 2.4278934001922607 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01888454, + "balance_loss_mlp": 1.04031229, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2431145476637266, + "language_loss": 0.72079587, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74231195, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6422, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02482104, + "balance_loss_mlp": 1.0425638, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6611822537555545, + "language_loss": 0.65814191, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.6796822, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6423, + "time_per_iteration": 2.4211878776550293 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.0439117, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.965242475874499, + "language_loss": 0.68746173, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70906854, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6424, + "time_per_iteration": 2.5804436206817627 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.04261661, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3808875353222407, + "language_loss": 0.72237349, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74393135, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 6425, + "time_per_iteration": 2.4568634033203125 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.02413344, + "balance_loss_mlp": 1.0424571, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.566599175889616, + "language_loss": 0.80062914, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82223159, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6426, + "time_per_iteration": 2.5236575603485107 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.0222559, + "balance_loss_mlp": 1.04582727, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.32293087490025, + "language_loss": 0.74624443, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7678405, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6427, + "time_per_iteration": 2.467555046081543 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02530599, + "balance_loss_mlp": 1.04256904, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 1.6951631816528543, + "language_loss": 0.69630527, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71788281, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6428, + "time_per_iteration": 2.4336817264556885 + }, + { + "auxiliary_loss_clip": 0.01120968, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.02787971, + "balance_loss_mlp": 1.0427897, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.175868568260599, + "language_loss": 0.84272587, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86435586, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6429, + "time_per_iteration": 5.324048757553101 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.02358222, + "balance_loss_mlp": 1.04458523, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.0434704200334726, + "language_loss": 0.808312, + "learning_rate": 2.807931078076015e-06, + "loss": 0.82989526, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6430, + "time_per_iteration": 3.8362674713134766 + }, + { + "auxiliary_loss_clip": 0.01037896, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00019002, + "balance_loss_mlp": 1.01247668, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7147232834997996, + "language_loss": 0.58793551, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60833132, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.25390625, + "step": 6431, + "time_per_iteration": 3.1054275035858154 + }, + { + "auxiliary_loss_clip": 0.01123522, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.0213275, + "balance_loss_mlp": 1.04425848, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8418420222570902, + "language_loss": 0.78914982, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81074637, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6432, + "time_per_iteration": 2.441103458404541 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.04033065, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 3.1335187433073006, + "language_loss": 0.80734611, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82898408, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6433, + "time_per_iteration": 2.4334840774536133 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.02167201, + "balance_loss_mlp": 1.04427695, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.9920607209076013, + "language_loss": 0.70712543, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72871572, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6434, + "time_per_iteration": 2.4485912322998047 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.02259684, + "balance_loss_mlp": 1.04096544, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 3.1146547904297615, + "language_loss": 0.77674437, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79833651, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 6435, + "time_per_iteration": 2.4734902381896973 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.01877558, + "balance_loss_mlp": 1.04157901, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6626735995393465, + "language_loss": 0.79557228, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81706917, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 6436, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01115966, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01937521, + "balance_loss_mlp": 1.04099202, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7213495950653388, + "language_loss": 0.77057981, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79206884, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6437, + "time_per_iteration": 2.506342649459839 + }, + { + "auxiliary_loss_clip": 0.01118581, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.02759838, + "balance_loss_mlp": 1.0425818, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0991099349261013, + "language_loss": 0.8199805, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84157896, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6438, + "time_per_iteration": 2.4236960411071777 + }, + { + "auxiliary_loss_clip": 0.01119447, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.02413225, + "balance_loss_mlp": 1.04198575, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.4416179830694351, + "language_loss": 0.75274503, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77432954, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6439, + "time_per_iteration": 2.4746499061584473 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.01804042, + "balance_loss_mlp": 1.04231787, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4591961315755648, + "language_loss": 0.74029297, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76176178, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6440, + "time_per_iteration": 2.470442056655884 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02755535, + "balance_loss_mlp": 1.04172719, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.882594032026591, + "language_loss": 0.82420492, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84582806, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6441, + "time_per_iteration": 2.4857184886932373 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.03132594, + "balance_loss_mlp": 1.04210794, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.099147848905264, + "language_loss": 0.81835496, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83998901, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6442, + "time_per_iteration": 2.4149296283721924 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04025602, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.5694674536603201, + "language_loss": 0.83847654, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85999727, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6443, + "time_per_iteration": 2.4515957832336426 + }, + { + "auxiliary_loss_clip": 0.01039021, + "auxiliary_loss_mlp": 0.01007024, + "balance_loss_clip": 1.00551593, + "balance_loss_mlp": 1.0140909, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7719544775144753, + "language_loss": 0.50268674, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52314723, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24902344, + "step": 6444, + "time_per_iteration": 3.092834711074829 + }, + { + "auxiliary_loss_clip": 0.01115245, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02502251, + "balance_loss_mlp": 1.04225266, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.537835026490341, + "language_loss": 0.78736365, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80889541, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6445, + "time_per_iteration": 2.435347557067871 + }, + { + "auxiliary_loss_clip": 0.01115913, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02055407, + "balance_loss_mlp": 1.04211605, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.672895701432963, + "language_loss": 0.81121695, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83271456, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6446, + "time_per_iteration": 2.469536781311035 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02434087, + "balance_loss_mlp": 1.03933239, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.877585125713849, + "language_loss": 0.77093089, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79244608, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6447, + "time_per_iteration": 2.428525447845459 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.04256356, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5240627220637166, + "language_loss": 0.75767821, + "learning_rate": 2.801513277056671e-06, + "loss": 0.7791642, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6448, + "time_per_iteration": 2.4325876235961914 + }, + { + "auxiliary_loss_clip": 0.01115196, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.02023029, + "balance_loss_mlp": 1.04179466, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.6442003276819328, + "language_loss": 0.75754648, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.77903593, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6449, + "time_per_iteration": 2.435208320617676 + }, + { + "auxiliary_loss_clip": 0.0111808, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.0194999, + "balance_loss_mlp": 1.03956699, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.5394171504545016, + "language_loss": 0.78183508, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80335045, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6450, + "time_per_iteration": 2.467933177947998 + }, + { + "auxiliary_loss_clip": 0.0112145, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.02190948, + "balance_loss_mlp": 1.04104686, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.1284571270947263, + "language_loss": 0.77706474, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79863995, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6451, + "time_per_iteration": 2.513192892074585 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01337111, + "balance_loss_mlp": 1.03988457, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.5965207120841256, + "language_loss": 0.7642619, + "learning_rate": 2.800085758962812e-06, + "loss": 0.7856546, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6452, + "time_per_iteration": 2.453756809234619 + }, + { + "auxiliary_loss_clip": 0.01118677, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02721739, + "balance_loss_mlp": 1.04313231, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5417712426283914, + "language_loss": 0.79843581, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82002515, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6453, + "time_per_iteration": 2.434788465499878 + }, + { + "auxiliary_loss_clip": 0.01126032, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02257931, + "balance_loss_mlp": 1.0456028, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.779502658436086, + "language_loss": 0.71759796, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73922884, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6454, + "time_per_iteration": 2.456637382507324 + }, + { + "auxiliary_loss_clip": 0.01122488, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.02489531, + "balance_loss_mlp": 1.04253364, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.1246626443539216, + "language_loss": 0.77918947, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80081153, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6455, + "time_per_iteration": 2.4589757919311523 + }, + { + "auxiliary_loss_clip": 0.01118002, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.02040577, + "balance_loss_mlp": 1.04232621, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.6339807395025958, + "language_loss": 0.75865024, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78017759, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6456, + "time_per_iteration": 2.4390318393707275 + }, + { + "auxiliary_loss_clip": 0.01121145, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.01944995, + "balance_loss_mlp": 1.04276633, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.085241252102015, + "language_loss": 0.60518527, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62672919, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 6457, + "time_per_iteration": 2.459535837173462 + }, + { + "auxiliary_loss_clip": 0.01121291, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.02167547, + "balance_loss_mlp": 1.04195237, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.1234505206368475, + "language_loss": 0.80247247, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82405996, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6458, + "time_per_iteration": 2.425049066543579 + }, + { + "auxiliary_loss_clip": 0.01120771, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.02072167, + "balance_loss_mlp": 1.04291797, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8371533851039183, + "language_loss": 0.81683058, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83838403, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6459, + "time_per_iteration": 2.5234129428863525 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.04261899, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 3.3845315312390643, + "language_loss": 0.61609662, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63761353, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6460, + "time_per_iteration": 2.4271440505981445 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04498553, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.999840896697599, + "language_loss": 0.85928953, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88084352, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.76171875, + "step": 6461, + "time_per_iteration": 2.4874932765960693 + }, + { + "auxiliary_loss_clip": 0.01121067, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.02712059, + "balance_loss_mlp": 1.04198229, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 5.6194775515218085, + "language_loss": 0.71397054, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73559368, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6462, + "time_per_iteration": 2.4839894771575928 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02274752, + "balance_loss_mlp": 1.04190457, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.13487298932128, + "language_loss": 0.7582581, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77982807, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6463, + "time_per_iteration": 2.4897215366363525 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.0305022, + "balance_loss_mlp": 1.04482341, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9442764767857983, + "language_loss": 0.70078236, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72249663, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6464, + "time_per_iteration": 2.4519219398498535 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02258134, + "balance_loss_mlp": 1.04280329, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8350923871455525, + "language_loss": 0.69608724, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.717641, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6465, + "time_per_iteration": 2.524698495864868 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02772307, + "balance_loss_mlp": 1.04204226, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.02186972310505, + "language_loss": 0.77957165, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80120802, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6466, + "time_per_iteration": 2.4420318603515625 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.04476476, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.578436157089315, + "language_loss": 0.69438803, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71602929, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6467, + "time_per_iteration": 2.526315212249756 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.04374123, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.7189933074164316, + "language_loss": 0.83444071, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85615414, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 6468, + "time_per_iteration": 2.433612108230591 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.02720666, + "balance_loss_mlp": 1.04250181, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 5.890128393718138, + "language_loss": 0.84300733, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86460519, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6469, + "time_per_iteration": 2.501368284225464 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02871704, + "balance_loss_mlp": 1.0433706, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.6566744770772097, + "language_loss": 0.74790764, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76954335, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6470, + "time_per_iteration": 5.350924015045166 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.02632678, + "balance_loss_mlp": 1.04234362, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5254918915202156, + "language_loss": 0.74916464, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77078122, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6471, + "time_per_iteration": 5.323298215866089 + }, + { + "auxiliary_loss_clip": 0.01121653, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.0310601, + "balance_loss_mlp": 1.04548645, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.9258613787227117, + "language_loss": 0.68053186, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70220202, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6472, + "time_per_iteration": 2.453610420227051 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01046672, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.04305148, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.6233097762345425, + "language_loss": 0.76542008, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.7870928, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6473, + "time_per_iteration": 2.487966775894165 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01046447, + "balance_loss_clip": 1.03157008, + "balance_loss_mlp": 1.04532015, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.8986671727726652, + "language_loss": 0.70897496, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73067403, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6474, + "time_per_iteration": 2.4192309379577637 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.02607441, + "balance_loss_mlp": 1.04441047, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.400231739949646, + "language_loss": 0.68822956, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70984024, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 6475, + "time_per_iteration": 2.508747100830078 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.0104873, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.04747105, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 2.0025883037810055, + "language_loss": 0.76052523, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78231013, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 6476, + "time_per_iteration": 2.4432644844055176 + }, + { + "auxiliary_loss_clip": 0.01040957, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.01581097, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7803986728659921, + "language_loss": 0.58254546, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60299176, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6477, + "time_per_iteration": 3.0704691410064697 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.02279997, + "balance_loss_mlp": 1.04507279, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.75333723767605, + "language_loss": 0.77916539, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80078721, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6478, + "time_per_iteration": 2.488922357559204 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.0212301, + "balance_loss_mlp": 1.04128957, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 1.928920480761015, + "language_loss": 0.82250136, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8440311, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 6479, + "time_per_iteration": 2.4171228408813477 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.02262461, + "balance_loss_mlp": 1.04175949, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7024032073041733, + "language_loss": 0.80111545, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82266629, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6480, + "time_per_iteration": 2.4750797748565674 + }, + { + "auxiliary_loss_clip": 0.01117569, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01428056, + "balance_loss_mlp": 1.04215932, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.550121095479633, + "language_loss": 0.83083898, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85229063, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6481, + "time_per_iteration": 2.4715166091918945 + }, + { + "auxiliary_loss_clip": 0.01117656, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02437592, + "balance_loss_mlp": 1.04459131, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.557560720892756, + "language_loss": 0.75559932, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77715063, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6482, + "time_per_iteration": 2.4623568058013916 + }, + { + "auxiliary_loss_clip": 0.01119557, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01972985, + "balance_loss_mlp": 1.04252028, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 3.29893715214875, + "language_loss": 0.79150903, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81303906, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6483, + "time_per_iteration": 2.4530816078186035 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.0135119, + "balance_loss_mlp": 1.04091668, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4423872752445677, + "language_loss": 0.79842782, + "learning_rate": 2.788648211572067e-06, + "loss": 0.81989002, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6484, + "time_per_iteration": 2.511016845703125 + }, + { + "auxiliary_loss_clip": 0.01121595, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.02905726, + "balance_loss_mlp": 1.04556251, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.7756536915325172, + "language_loss": 0.78321344, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80487472, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6485, + "time_per_iteration": 2.443439245223999 + }, + { + "auxiliary_loss_clip": 0.01121432, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.02298832, + "balance_loss_mlp": 1.0427072, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.7221954850945425, + "language_loss": 0.85305119, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87464917, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6486, + "time_per_iteration": 2.5056657791137695 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.01942706, + "balance_loss_mlp": 1.04115701, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.7551040773297495, + "language_loss": 0.85345674, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87499964, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 6487, + "time_per_iteration": 2.577178478240967 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01737833, + "balance_loss_mlp": 1.04198551, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5246902220393208, + "language_loss": 0.73225224, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75375092, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.75390625, + "step": 6488, + "time_per_iteration": 2.523616075515747 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.04519773, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.5708303691917815, + "language_loss": 0.68585873, + "learning_rate": 2.786858317231779e-06, + "loss": 0.7074241, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6489, + "time_per_iteration": 2.478531837463379 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02680993, + "balance_loss_mlp": 1.04124475, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.801271673710844, + "language_loss": 0.81112868, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83269042, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 6490, + "time_per_iteration": 2.511854887008667 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.0193367, + "balance_loss_mlp": 1.04286718, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.9146492238240407, + "language_loss": 0.89305747, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91461056, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6491, + "time_per_iteration": 2.460026264190674 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02187026, + "balance_loss_mlp": 1.04215312, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8200320241713732, + "language_loss": 0.78811067, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80968064, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 6492, + "time_per_iteration": 2.529750108718872 + }, + { + "auxiliary_loss_clip": 0.01122151, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.02316093, + "balance_loss_mlp": 1.04309416, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.613220074099035, + "language_loss": 0.74635601, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76794928, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6493, + "time_per_iteration": 2.506000280380249 + }, + { + "auxiliary_loss_clip": 0.01123496, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.02160168, + "balance_loss_mlp": 1.04215276, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.9992899078543964, + "language_loss": 0.76100057, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78260159, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 6494, + "time_per_iteration": 2.4696662425994873 + }, + { + "auxiliary_loss_clip": 0.01128232, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03233767, + "balance_loss_mlp": 1.04337156, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.027559897328472, + "language_loss": 0.74284697, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76461446, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 6495, + "time_per_iteration": 2.4156551361083984 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_clip": 1.02683187, + "balance_loss_mlp": 1.04346669, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.725682312794404, + "language_loss": 0.67885542, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70049238, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6496, + "time_per_iteration": 2.530111789703369 + }, + { + "auxiliary_loss_clip": 0.01038749, + "auxiliary_loss_mlp": 0.01000219, + "balance_loss_clip": 0.99871743, + "balance_loss_mlp": 1.01313972, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6624336186281815, + "language_loss": 0.53998011, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56036979, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.25585938, + "step": 6497, + "time_per_iteration": 3.140427589416504 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.02404737, + "balance_loss_mlp": 1.04236674, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.818865741362812, + "language_loss": 0.68966502, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71124697, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6498, + "time_per_iteration": 2.4631001949310303 + }, + { + "auxiliary_loss_clip": 0.01037794, + "auxiliary_loss_mlp": 0.01003613, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.0124712, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 1.032001330091421, + "language_loss": 0.51830518, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5387193, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.25390625, + "step": 6499, + "time_per_iteration": 3.1206116676330566 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.0266552, + "balance_loss_mlp": 1.04158521, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.8695650437594764, + "language_loss": 0.73693466, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75859112, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.80859375, + "step": 6500, + "time_per_iteration": 2.5413036346435547 + }, + { + "auxiliary_loss_clip": 0.01125544, + "auxiliary_loss_mlp": 0.01041341, + "balance_loss_clip": 1.02720869, + "balance_loss_mlp": 1.04501247, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.5451317073491353, + "language_loss": 0.68355215, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70522094, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6501, + "time_per_iteration": 2.4725823402404785 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6766627212042646, + "language_loss": 0.79162323, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81320089, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6502, + "time_per_iteration": 2.4758012294769287 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02378237, + "balance_loss_mlp": 1.0435648, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.24722484247342, + "language_loss": 0.79379106, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81534874, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6503, + "time_per_iteration": 2.510356903076172 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02076924, + "balance_loss_mlp": 1.03882694, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8991979162106922, + "language_loss": 0.71695077, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73842514, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6504, + "time_per_iteration": 2.474257230758667 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02404082, + "balance_loss_mlp": 1.03938556, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4403698273396093, + "language_loss": 0.83054864, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85209668, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6505, + "time_per_iteration": 2.4917776584625244 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.02379465, + "balance_loss_mlp": 1.04268944, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9728617659661118, + "language_loss": 0.71202552, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73360288, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7578125, + "step": 6506, + "time_per_iteration": 2.4846489429473877 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.04129732, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.0442674369719547, + "language_loss": 0.74914789, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77068931, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6507, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01036092, + "auxiliary_loss_mlp": 0.01010532, + "balance_loss_clip": 1.00900638, + "balance_loss_mlp": 1.01097417, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7697412763639314, + "language_loss": 0.56554615, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58601236, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.25195312, + "step": 6508, + "time_per_iteration": 3.222599744796753 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.0256958, + "balance_loss_mlp": 1.04224479, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8903485988869968, + "language_loss": 0.7639432, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78552431, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6509, + "time_per_iteration": 2.4504122734069824 + }, + { + "auxiliary_loss_clip": 0.01119308, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02919126, + "balance_loss_mlp": 1.04120517, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.5320410479027284, + "language_loss": 0.82538676, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84704286, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.78125, + "step": 6510, + "time_per_iteration": 2.4280829429626465 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02202439, + "balance_loss_mlp": 1.04137504, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.9726874536239134, + "language_loss": 0.76478642, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78633761, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6511, + "time_per_iteration": 2.438093662261963 + }, + { + "auxiliary_loss_clip": 0.01035954, + "auxiliary_loss_mlp": 0.01004811, + "balance_loss_clip": 1.0033921, + "balance_loss_mlp": 1.01070499, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7278620231464888, + "language_loss": 0.57780313, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59821081, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.25390625, + "step": 6512, + "time_per_iteration": 6.094903230667114 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.02039289, + "balance_loss_mlp": 1.04215658, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6857291908308145, + "language_loss": 0.69891763, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72048545, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6513, + "time_per_iteration": 3.8939309120178223 + }, + { + "auxiliary_loss_clip": 0.01122702, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.02474439, + "balance_loss_mlp": 1.04184556, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.2930968868818606, + "language_loss": 0.76267236, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7842921, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 6514, + "time_per_iteration": 2.4622693061828613 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02168727, + "balance_loss_mlp": 1.04042864, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.7838082674219136, + "language_loss": 0.77452338, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79606491, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6515, + "time_per_iteration": 2.4336462020874023 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02564025, + "balance_loss_mlp": 1.03940558, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4542421972503212, + "language_loss": 0.79846406, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81998634, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 6516, + "time_per_iteration": 2.500826597213745 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01891923, + "balance_loss_mlp": 1.04082477, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 2.228742695866407, + "language_loss": 0.70205939, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72357762, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6517, + "time_per_iteration": 2.425739288330078 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.0218817, + "balance_loss_mlp": 1.03986263, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.595983335780194, + "language_loss": 0.72092575, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74247015, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6518, + "time_per_iteration": 2.559140205383301 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.0179677, + "balance_loss_mlp": 1.04041731, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.563160017416143, + "language_loss": 0.61668754, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63819885, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6519, + "time_per_iteration": 2.5673322677612305 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.02800775, + "balance_loss_mlp": 1.04341698, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.4564373100444232, + "language_loss": 0.6693083, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6909942, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6520, + "time_per_iteration": 2.487650156021118 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.01958799, + "balance_loss_mlp": 1.03966665, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.7599889377917473, + "language_loss": 0.78522319, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80671263, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6521, + "time_per_iteration": 2.418458938598633 + }, + { + "auxiliary_loss_clip": 0.0112345, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.01853049, + "balance_loss_mlp": 1.04218912, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.4256865138527353, + "language_loss": 0.70340407, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7250011, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8125, + "step": 6522, + "time_per_iteration": 2.435802936553955 + }, + { + "auxiliary_loss_clip": 0.01120666, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.02013338, + "balance_loss_mlp": 1.04137838, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.8374103087918643, + "language_loss": 0.76740485, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78895748, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6523, + "time_per_iteration": 2.4279329776763916 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.04124415, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.339335808739943, + "language_loss": 0.61661494, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63821173, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6524, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.02181363, + "balance_loss_mlp": 1.03898454, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6728206813409823, + "language_loss": 0.73940414, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76095104, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6525, + "time_per_iteration": 2.4897830486297607 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.02224112, + "balance_loss_mlp": 1.03882146, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0942212479104363, + "language_loss": 0.81385779, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83539373, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 6526, + "time_per_iteration": 2.442091226577759 + }, + { + "auxiliary_loss_clip": 0.01115953, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.02265131, + "balance_loss_mlp": 1.03931344, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.56527231709598, + "language_loss": 0.69802964, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.71955633, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6527, + "time_per_iteration": 2.465498924255371 + }, + { + "auxiliary_loss_clip": 0.01116064, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.0154264, + "balance_loss_mlp": 1.04067612, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.4439619967755983, + "language_loss": 0.82215756, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84361446, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6528, + "time_per_iteration": 2.488581418991089 + }, + { + "auxiliary_loss_clip": 0.01114295, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02282465, + "balance_loss_mlp": 1.04024255, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.4897772961790412, + "language_loss": 0.68726033, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70877492, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 6529, + "time_per_iteration": 2.5409562587738037 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.02561271, + "balance_loss_mlp": 1.04070282, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.9003920421281926, + "language_loss": 0.79728955, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.81887889, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6530, + "time_per_iteration": 2.514547109603882 + }, + { + "auxiliary_loss_clip": 0.01112608, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02267241, + "balance_loss_mlp": 1.03750181, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6221630004730245, + "language_loss": 0.75564003, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77713549, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6531, + "time_per_iteration": 2.4572982788085938 + }, + { + "auxiliary_loss_clip": 0.01038893, + "auxiliary_loss_mlp": 0.0100286, + "balance_loss_clip": 1.00127435, + "balance_loss_mlp": 1.01370025, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8170127744653651, + "language_loss": 0.60378772, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62420523, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.25195312, + "step": 6532, + "time_per_iteration": 2.929732084274292 + }, + { + "auxiliary_loss_clip": 0.01036987, + "auxiliary_loss_mlp": 0.01003862, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7837299971611431, + "language_loss": 0.55545104, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57585955, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.25195312, + "step": 6533, + "time_per_iteration": 3.1820483207702637 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.02316761, + "balance_loss_mlp": 1.04170942, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.9313522305780093, + "language_loss": 0.75972468, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78130615, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6534, + "time_per_iteration": 2.5650813579559326 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.02376163, + "balance_loss_mlp": 1.04177046, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.213634574223379, + "language_loss": 0.78067005, + "learning_rate": 2.770356507494851e-06, + "loss": 0.802279, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 6535, + "time_per_iteration": 2.447950839996338 + }, + { + "auxiliary_loss_clip": 0.01113628, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.01950026, + "balance_loss_mlp": 1.03985262, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.091132286884177, + "language_loss": 0.68613565, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70759845, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 6536, + "time_per_iteration": 2.4873242378234863 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.03908086, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.7105256577096235, + "language_loss": 0.69052541, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71199811, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 6537, + "time_per_iteration": 2.5867457389831543 + }, + { + "auxiliary_loss_clip": 0.01117392, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.02420986, + "balance_loss_mlp": 1.04011965, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6628056753547982, + "language_loss": 0.79044384, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81200254, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6538, + "time_per_iteration": 2.437757968902588 + }, + { + "auxiliary_loss_clip": 0.01034351, + "auxiliary_loss_mlp": 0.01009828, + "balance_loss_clip": 1.0084635, + "balance_loss_mlp": 1.00972295, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8042725449961473, + "language_loss": 0.61871827, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63916004, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24609375, + "step": 6539, + "time_per_iteration": 2.9012601375579834 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.02417326, + "balance_loss_mlp": 1.03897023, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.1025744829352306, + "language_loss": 0.68334043, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70487964, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6540, + "time_per_iteration": 2.617544412612915 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.02167249, + "balance_loss_mlp": 1.0387044, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.7155589252050778, + "language_loss": 0.72714561, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74864328, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6541, + "time_per_iteration": 2.5576202869415283 + }, + { + "auxiliary_loss_clip": 0.01034882, + "auxiliary_loss_mlp": 0.01010056, + "balance_loss_clip": 1.00863171, + "balance_loss_mlp": 1.0103662, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8254504926360222, + "language_loss": 0.60302341, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62347269, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24511719, + "step": 6542, + "time_per_iteration": 2.921311378479004 + }, + { + "auxiliary_loss_clip": 0.01115263, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.02204013, + "balance_loss_mlp": 1.03968477, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.9294145782355336, + "language_loss": 0.82255107, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84406084, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6543, + "time_per_iteration": 2.5267767906188965 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.03692436, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.6066266241550669, + "language_loss": 0.69336796, + "learning_rate": 2.767120621015908e-06, + "loss": 0.7148419, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6544, + "time_per_iteration": 2.5192980766296387 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.02729011, + "balance_loss_mlp": 1.03997457, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.880723151689185, + "language_loss": 0.75104976, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77266246, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6545, + "time_per_iteration": 2.5483953952789307 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.04072022, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4191511939867936, + "language_loss": 0.74600172, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76748097, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 6546, + "time_per_iteration": 2.435189962387085 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02256405, + "balance_loss_mlp": 1.03998446, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.8050093889996326, + "language_loss": 0.81520575, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83677876, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 6547, + "time_per_iteration": 2.5359435081481934 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01444387, + "balance_loss_mlp": 1.03795588, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 2.282095961224954, + "language_loss": 0.84300089, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86442673, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6548, + "time_per_iteration": 2.430497407913208 + }, + { + "auxiliary_loss_clip": 0.01110548, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01784039, + "balance_loss_mlp": 1.0382576, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5261467823901598, + "language_loss": 0.72481942, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74623168, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6549, + "time_per_iteration": 2.484938383102417 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02025914, + "balance_loss_mlp": 1.04114747, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.525417369659451, + "language_loss": 0.77678335, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79829538, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6550, + "time_per_iteration": 2.4533822536468506 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01695979, + "balance_loss_mlp": 1.03770638, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6825180459961226, + "language_loss": 0.81065381, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83207965, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6551, + "time_per_iteration": 2.4740419387817383 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.02310574, + "balance_loss_mlp": 1.03833413, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.2350138021364003, + "language_loss": 0.80241704, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82394373, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6552, + "time_per_iteration": 2.4066245555877686 + }, + { + "auxiliary_loss_clip": 0.01118032, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.02449059, + "balance_loss_mlp": 1.04108357, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.2028177738118884, + "language_loss": 0.71154666, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73311305, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 6553, + "time_per_iteration": 2.454035997390747 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.0409205, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.9276274050376605, + "language_loss": 0.63445336, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65595293, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6554, + "time_per_iteration": 5.467530250549316 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.02620983, + "balance_loss_mlp": 1.041237, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.7325305725381703, + "language_loss": 0.79567587, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81722915, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 6555, + "time_per_iteration": 3.9707608222961426 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.0209887, + "balance_loss_mlp": 1.04194546, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.8303237809157376, + "language_loss": 0.71571302, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73726678, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6556, + "time_per_iteration": 2.5013363361358643 + }, + { + "auxiliary_loss_clip": 0.01115996, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03954887, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 2.056709462434603, + "language_loss": 0.83915412, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86063957, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6557, + "time_per_iteration": 2.7162060737609863 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02195859, + "balance_loss_mlp": 1.04014397, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 3.2694171829217953, + "language_loss": 0.80285048, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8243624, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6558, + "time_per_iteration": 2.466904401779175 + }, + { + "auxiliary_loss_clip": 0.01115408, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02037418, + "balance_loss_mlp": 1.04165912, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.7254990423790144, + "language_loss": 0.71022832, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73171461, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6559, + "time_per_iteration": 2.474142551422119 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02111125, + "balance_loss_mlp": 1.04030299, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.8853849407225942, + "language_loss": 0.80391413, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82548964, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6560, + "time_per_iteration": 2.4220218658447266 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.022223, + "balance_loss_mlp": 1.04395843, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 3.2514761912447283, + "language_loss": 0.83440554, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85599601, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 6561, + "time_per_iteration": 2.458305835723877 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.02458477, + "balance_loss_mlp": 1.04098439, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.862241713271481, + "language_loss": 0.79548055, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81703943, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6562, + "time_per_iteration": 2.4390974044799805 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02301359, + "balance_loss_mlp": 1.04043949, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.512260767998718, + "language_loss": 0.81355608, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83506453, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 6563, + "time_per_iteration": 2.518843650817871 + }, + { + "auxiliary_loss_clip": 0.0111742, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.041682, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9438463538262531, + "language_loss": 0.69416577, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71574247, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6564, + "time_per_iteration": 2.446140766143799 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02079892, + "balance_loss_mlp": 1.04157352, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 3.234298893133154, + "language_loss": 0.83141822, + "learning_rate": 2.759561073299676e-06, + "loss": 0.8529489, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6565, + "time_per_iteration": 2.474611520767212 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.02002859, + "balance_loss_mlp": 1.04039359, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.7678460287206497, + "language_loss": 0.82917452, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85065943, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6566, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.02493143, + "balance_loss_mlp": 1.04225016, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.357536272997057, + "language_loss": 0.7778033, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79942119, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6567, + "time_per_iteration": 2.5020110607147217 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.02689242, + "balance_loss_mlp": 1.04026425, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 2.0625384967809546, + "language_loss": 0.80381507, + "learning_rate": 2.758480098067182e-06, + "loss": 0.8253268, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 6568, + "time_per_iteration": 2.464186429977417 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.02282655, + "balance_loss_mlp": 1.04130197, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.6625556258765348, + "language_loss": 0.84206939, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86359489, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 6569, + "time_per_iteration": 2.4947829246520996 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.02418959, + "balance_loss_mlp": 1.04450357, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.920459843417803, + "language_loss": 0.74973899, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77130127, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6570, + "time_per_iteration": 2.50211763381958 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04104555, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.649568183340291, + "language_loss": 0.79813123, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81967843, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6571, + "time_per_iteration": 2.477740526199341 + }, + { + "auxiliary_loss_clip": 0.01116017, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.02846146, + "balance_loss_mlp": 1.04203689, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.628324795196944, + "language_loss": 0.77873337, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80031145, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6572, + "time_per_iteration": 2.4463839530944824 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02636874, + "balance_loss_mlp": 1.0404911, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.6456702645470058, + "language_loss": 0.7506038, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77218664, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6573, + "time_per_iteration": 2.501692295074463 + }, + { + "auxiliary_loss_clip": 0.01114036, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02165031, + "balance_loss_mlp": 1.04046559, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.4003162240803297, + "language_loss": 0.67956495, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70104533, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 6574, + "time_per_iteration": 2.6566920280456543 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01978612, + "balance_loss_mlp": 1.04216623, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.170019312223073, + "language_loss": 0.71719187, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73873657, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6575, + "time_per_iteration": 2.463792085647583 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02859664, + "balance_loss_mlp": 1.04105997, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.0080051897694324, + "language_loss": 0.73535955, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75698036, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6576, + "time_per_iteration": 2.409817934036255 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.03979337, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.36733568983198, + "language_loss": 0.83294857, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.8544715, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7578125, + "step": 6577, + "time_per_iteration": 2.4421181678771973 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.01860428, + "balance_loss_mlp": 1.04138541, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.8530294325048984, + "language_loss": 0.89916354, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92063785, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6578, + "time_per_iteration": 2.470369577407837 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04030561, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.7408596896151103, + "language_loss": 0.77871025, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80027139, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6579, + "time_per_iteration": 2.4619040489196777 + }, + { + "auxiliary_loss_clip": 0.01119633, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01760054, + "balance_loss_mlp": 1.0407021, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.037188254408411, + "language_loss": 0.68324131, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70475388, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6580, + "time_per_iteration": 2.4363577365875244 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01499939, + "balance_loss_mlp": 1.04099488, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.613777567548473, + "language_loss": 0.58620721, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60764229, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6581, + "time_per_iteration": 2.5704734325408936 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.02479148, + "balance_loss_mlp": 1.04165769, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.015576445189345, + "language_loss": 0.698632, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72021002, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6582, + "time_per_iteration": 2.4640939235687256 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.02167404, + "balance_loss_mlp": 1.0415566, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.285451965985758, + "language_loss": 0.76454568, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78608364, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6583, + "time_per_iteration": 2.437396287918091 + }, + { + "auxiliary_loss_clip": 0.01118401, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.01708043, + "balance_loss_mlp": 1.04192805, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5685917359515968, + "language_loss": 0.65989023, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68138266, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6584, + "time_per_iteration": 2.4562485218048096 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.02262115, + "balance_loss_mlp": 1.04122627, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.6735523944320136, + "language_loss": 0.72423065, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 6585, + "time_per_iteration": 2.517333984375 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01811159, + "balance_loss_mlp": 1.04010367, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.986310622320223, + "language_loss": 0.73430967, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75579244, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6586, + "time_per_iteration": 2.513847827911377 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01738322, + "balance_loss_mlp": 1.04139459, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.2420315368265915, + "language_loss": 0.71627617, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73776209, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6587, + "time_per_iteration": 2.498534917831421 + }, + { + "auxiliary_loss_clip": 0.01038457, + "auxiliary_loss_mlp": 0.01003592, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.01416993, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9067384171744824, + "language_loss": 0.61162889, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63204944, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.01531982, + "router_z_loss_mlp": 0.2421875, + "step": 6588, + "time_per_iteration": 2.9129557609558105 + }, + { + "auxiliary_loss_clip": 0.01117429, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.0215075, + "balance_loss_mlp": 1.04087436, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.9745840784771536, + "language_loss": 0.81579673, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83732545, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6589, + "time_per_iteration": 2.487581253051758 + }, + { + "auxiliary_loss_clip": 0.01118186, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01858449, + "balance_loss_mlp": 1.04102254, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.0157149751951606, + "language_loss": 0.70171028, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72322464, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6590, + "time_per_iteration": 2.4837629795074463 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.02517259, + "balance_loss_mlp": 1.04276454, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.6568331410473631, + "language_loss": 0.76061213, + "learning_rate": 2.750184048805956e-06, + "loss": 0.7821902, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6591, + "time_per_iteration": 2.574401617050171 + }, + { + "auxiliary_loss_clip": 0.01119076, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.02803326, + "balance_loss_mlp": 1.04253912, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.7800794685008139, + "language_loss": 0.79121935, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81283081, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6592, + "time_per_iteration": 2.5065057277679443 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.01945305, + "balance_loss_mlp": 1.04020298, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.6584377020479992, + "language_loss": 0.69372392, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71518123, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6593, + "time_per_iteration": 2.691351890563965 + }, + { + "auxiliary_loss_clip": 0.01119923, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.02389932, + "balance_loss_mlp": 1.04100418, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.6545825162449217, + "language_loss": 0.77913815, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80072421, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6594, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.01038921, + "auxiliary_loss_mlp": 0.01002091, + "balance_loss_clip": 1.0005945, + "balance_loss_mlp": 1.0146898, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9454940833877284, + "language_loss": 0.63038307, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65079319, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.2421875, + "step": 6595, + "time_per_iteration": 6.018520355224609 + }, + { + "auxiliary_loss_clip": 0.01121925, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.02908421, + "balance_loss_mlp": 1.04294038, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.072222886004575, + "language_loss": 0.6329869, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65464759, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6596, + "time_per_iteration": 5.302752494812012 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02110457, + "balance_loss_mlp": 1.04157937, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0492451282774273, + "language_loss": 0.78553772, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80704355, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6597, + "time_per_iteration": 2.457028388977051 + }, + { + "auxiliary_loss_clip": 0.01121814, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.02558672, + "balance_loss_mlp": 1.04262114, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.95592503590265, + "language_loss": 0.67559552, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69721651, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6598, + "time_per_iteration": 2.4448981285095215 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02153933, + "balance_loss_mlp": 1.0411458, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.3323846151329235, + "language_loss": 0.78922117, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81074429, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6599, + "time_per_iteration": 2.4799394607543945 + }, + { + "auxiliary_loss_clip": 0.01117884, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.04196167, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.67964508136209, + "language_loss": 0.72716624, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74866593, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7578125, + "step": 6600, + "time_per_iteration": 2.4940543174743652 + }, + { + "auxiliary_loss_clip": 0.01115602, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01864827, + "balance_loss_mlp": 1.03997052, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 1.9442093512958227, + "language_loss": 0.85773253, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87920988, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6601, + "time_per_iteration": 2.4826369285583496 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02191091, + "balance_loss_mlp": 1.04298782, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.3202277168625054, + "language_loss": 0.70015699, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72178292, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6602, + "time_per_iteration": 2.4452199935913086 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02458847, + "balance_loss_mlp": 1.04225206, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.564497124514123, + "language_loss": 0.83408487, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85566461, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6603, + "time_per_iteration": 2.50046968460083 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.04076076, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.4733286794124776, + "language_loss": 0.72804213, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.74952281, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6604, + "time_per_iteration": 2.435645580291748 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.02366602, + "balance_loss_mlp": 1.0427258, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.694386771997249, + "language_loss": 0.82919562, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85070789, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 6605, + "time_per_iteration": 2.538792371749878 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.02053654, + "balance_loss_mlp": 1.04017544, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.515379376113219, + "language_loss": 0.73755872, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75901884, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 6606, + "time_per_iteration": 2.4766290187835693 + }, + { + "auxiliary_loss_clip": 0.0111968, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.0207423, + "balance_loss_mlp": 1.04279184, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.9669838489657716, + "language_loss": 0.73925817, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76079941, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6607, + "time_per_iteration": 2.550140380859375 + }, + { + "auxiliary_loss_clip": 0.01121372, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02045417, + "balance_loss_mlp": 1.04417753, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5241940789626238, + "language_loss": 0.67978024, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70133507, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6608, + "time_per_iteration": 2.70333194732666 + }, + { + "auxiliary_loss_clip": 0.01120221, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.0263803, + "balance_loss_mlp": 1.04247403, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 2.3779993769587486, + "language_loss": 0.74649572, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76812196, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6609, + "time_per_iteration": 2.4810678958892822 + }, + { + "auxiliary_loss_clip": 0.01119236, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.01909387, + "balance_loss_mlp": 1.04284418, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 4.182923272039756, + "language_loss": 0.71530509, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73682511, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6610, + "time_per_iteration": 2.483358860015869 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01881909, + "balance_loss_mlp": 1.03868747, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6591621928280806, + "language_loss": 0.7848928, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80632162, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 6611, + "time_per_iteration": 2.4707412719726562 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.0234127, + "balance_loss_mlp": 1.04496026, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7910222988347433, + "language_loss": 0.78681552, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.80838501, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6612, + "time_per_iteration": 2.552384614944458 + }, + { + "auxiliary_loss_clip": 0.01042423, + "auxiliary_loss_mlp": 0.01023175, + "balance_loss_clip": 1.02180374, + "balance_loss_mlp": 1.01794136, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8703127674216669, + "language_loss": 0.64956641, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6702224, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.24414062, + "step": 6613, + "time_per_iteration": 2.978494882583618 + }, + { + "auxiliary_loss_clip": 0.01116625, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01683927, + "balance_loss_mlp": 1.04148316, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.0550022834902797, + "language_loss": 0.71538055, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73685759, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6614, + "time_per_iteration": 2.4898061752319336 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01644266, + "balance_loss_mlp": 1.04124689, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8540793086422767, + "language_loss": 0.81317735, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83464336, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6615, + "time_per_iteration": 2.4708592891693115 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.02074313, + "balance_loss_mlp": 1.04221725, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.466828000769562, + "language_loss": 0.67015827, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69165838, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 6616, + "time_per_iteration": 2.4453790187835693 + }, + { + "auxiliary_loss_clip": 0.01120268, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02032995, + "balance_loss_mlp": 1.04185963, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.097035382924748, + "language_loss": 0.83857769, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86012185, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6617, + "time_per_iteration": 2.4740309715270996 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04305041, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6139116519566428, + "language_loss": 0.72253633, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74403095, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 6618, + "time_per_iteration": 2.451362371444702 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.04263783, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.9091502235972209, + "language_loss": 0.65847683, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6800065, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.74609375, + "step": 6619, + "time_per_iteration": 2.5479021072387695 + }, + { + "auxiliary_loss_clip": 0.01041684, + "auxiliary_loss_mlp": 0.00999907, + "balance_loss_clip": 0.99843466, + "balance_loss_mlp": 1.0170114, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7720250582246381, + "language_loss": 0.58222711, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60264301, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.24609375, + "step": 6620, + "time_per_iteration": 3.0502688884735107 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.02383971, + "balance_loss_mlp": 1.04254556, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5861085047038441, + "language_loss": 0.79551339, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81703556, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 6621, + "time_per_iteration": 2.4595162868499756 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01709294, + "balance_loss_mlp": 1.04198873, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.9955210259775171, + "language_loss": 0.78070045, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80215347, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 6622, + "time_per_iteration": 2.487805128097534 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.01685607, + "balance_loss_mlp": 1.04132223, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.5290489885204759, + "language_loss": 0.75010175, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77156758, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6623, + "time_per_iteration": 2.464571714401245 + }, + { + "auxiliary_loss_clip": 0.01116211, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.02278805, + "balance_loss_mlp": 1.04220378, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.7278538768787957, + "language_loss": 0.79535556, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81688213, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6624, + "time_per_iteration": 2.4550037384033203 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.02627707, + "balance_loss_mlp": 1.04234707, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.035642441182755, + "language_loss": 0.83558613, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85720372, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6625, + "time_per_iteration": 2.456171989440918 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.02839124, + "balance_loss_mlp": 1.04085207, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.051687002705142, + "language_loss": 0.86593187, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88750064, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6626, + "time_per_iteration": 2.4335460662841797 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.02238643, + "balance_loss_mlp": 1.04094946, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3900066005878386, + "language_loss": 0.83897698, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86049473, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7421875, + "step": 6627, + "time_per_iteration": 2.4269766807556152 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.03955984, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4398151096773946, + "language_loss": 0.82760668, + "learning_rate": 2.736806725217998e-06, + "loss": 0.8491019, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6628, + "time_per_iteration": 2.529315948486328 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.027421, + "balance_loss_mlp": 1.04130399, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.8256672588255014, + "language_loss": 0.70683473, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.72839677, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6629, + "time_per_iteration": 2.5025413036346436 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.02155161, + "balance_loss_mlp": 1.04309297, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 4.278612279497538, + "language_loss": 0.80683714, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82833099, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 6630, + "time_per_iteration": 2.4792280197143555 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01485634, + "balance_loss_mlp": 1.04143131, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.8749880656247468, + "language_loss": 0.75354141, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.7749849, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6631, + "time_per_iteration": 2.417546272277832 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.0218699, + "balance_loss_mlp": 1.04246461, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.3246230169523194, + "language_loss": 0.7156167, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73713982, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 6632, + "time_per_iteration": 2.446089744567871 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.02449358, + "balance_loss_mlp": 1.03939462, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.8450465759001686, + "language_loss": 0.74742806, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76891041, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6633, + "time_per_iteration": 2.431104898452759 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01455402, + "balance_loss_mlp": 1.03961205, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.781985159362602, + "language_loss": 0.808864, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83027852, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 6634, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.01856947, + "balance_loss_mlp": 1.04252565, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.7295196741572958, + "language_loss": 0.74605262, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.7675429, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6635, + "time_per_iteration": 2.4630682468414307 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.02930093, + "balance_loss_mlp": 1.04096711, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.9670463450002986, + "language_loss": 0.66429746, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68594521, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6636, + "time_per_iteration": 2.454789876937866 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02131867, + "balance_loss_mlp": 1.0403626, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.695302941119513, + "language_loss": 0.81410646, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83558261, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6637, + "time_per_iteration": 5.387745380401611 + }, + { + "auxiliary_loss_clip": 0.01040567, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99893045, + "balance_loss_mlp": 1.0159328, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7139106827959352, + "language_loss": 0.53211641, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55252659, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.01513672, + "router_z_loss_mlp": 0.24609375, + "step": 6638, + "time_per_iteration": 4.465191125869751 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.02260959, + "balance_loss_mlp": 1.04064405, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.9403504228046689, + "language_loss": 0.75377512, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77527201, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6639, + "time_per_iteration": 2.4947104454040527 + }, + { + "auxiliary_loss_clip": 0.01112086, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.01788926, + "balance_loss_mlp": 1.04078937, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.632879790681491, + "language_loss": 0.76217377, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78360093, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 6640, + "time_per_iteration": 2.524815320968628 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.03855717, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.5962495804033794, + "language_loss": 0.82264209, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84414506, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6641, + "time_per_iteration": 2.4753921031951904 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01891971, + "balance_loss_mlp": 1.04188418, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.015070946619467, + "language_loss": 0.7685014, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78999245, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6642, + "time_per_iteration": 2.431239604949951 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.019642, + "balance_loss_mlp": 1.03963089, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.2960488262105145, + "language_loss": 0.7247656, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74624097, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6643, + "time_per_iteration": 2.4759740829467773 + }, + { + "auxiliary_loss_clip": 0.01115242, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.0214113, + "balance_loss_mlp": 1.04014993, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.5171926718970592, + "language_loss": 0.65988386, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68139005, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6644, + "time_per_iteration": 2.437404155731201 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.0235281, + "balance_loss_mlp": 1.0386616, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.956427678643188, + "language_loss": 0.78470129, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80620331, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6645, + "time_per_iteration": 2.44826078414917 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02034187, + "balance_loss_mlp": 1.04042077, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.409098570486763, + "language_loss": 0.69889182, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72038329, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6646, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.03869605, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7478077072518943, + "language_loss": 0.72165501, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74314553, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6647, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.03874063, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4937426139380796, + "language_loss": 0.74371958, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76518434, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 6648, + "time_per_iteration": 2.4970345497131348 + }, + { + "auxiliary_loss_clip": 0.01115329, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.02300286, + "balance_loss_mlp": 1.04061389, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.209642859907432, + "language_loss": 0.66124469, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68276298, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6649, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.02280378, + "balance_loss_mlp": 1.0420115, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.57860522688022, + "language_loss": 0.75273359, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77425814, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6650, + "time_per_iteration": 2.5091254711151123 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.03905869, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.4583647344722164, + "language_loss": 0.71954048, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74104279, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 6651, + "time_per_iteration": 2.4820897579193115 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02220726, + "balance_loss_mlp": 1.03815126, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.787132664616244, + "language_loss": 0.72906494, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75055599, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6652, + "time_per_iteration": 2.4568119049072266 + }, + { + "auxiliary_loss_clip": 0.01039541, + "auxiliary_loss_mlp": 0.00999581, + "balance_loss_clip": 0.99819815, + "balance_loss_mlp": 1.01483345, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8299860195083637, + "language_loss": 0.61066198, + "learning_rate": 2.727746297241862e-06, + "loss": 0.63105321, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24707031, + "step": 6653, + "time_per_iteration": 3.0071723461151123 + }, + { + "auxiliary_loss_clip": 0.01113323, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.02607179, + "balance_loss_mlp": 1.04303741, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.127427836980077, + "language_loss": 0.67038172, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.6919046, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 6654, + "time_per_iteration": 2.442049026489258 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.02741051, + "balance_loss_mlp": 1.03887355, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.299433298478917, + "language_loss": 0.89737195, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91889656, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.74609375, + "step": 6655, + "time_per_iteration": 2.4836323261260986 + }, + { + "auxiliary_loss_clip": 0.01110377, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.01865685, + "balance_loss_mlp": 1.04077053, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5855954082229138, + "language_loss": 0.73497427, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75637716, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6953125, + "step": 6656, + "time_per_iteration": 2.5071847438812256 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.03248513, + "balance_loss_mlp": 1.04179835, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.4675228136273628, + "language_loss": 0.7344414, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75607085, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6657, + "time_per_iteration": 2.575587034225464 + }, + { + "auxiliary_loss_clip": 0.01116565, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.02432823, + "balance_loss_mlp": 1.04162562, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4527474123065993, + "language_loss": 0.79588759, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81743878, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6658, + "time_per_iteration": 2.7093567848205566 + }, + { + "auxiliary_loss_clip": 0.01115311, + "auxiliary_loss_mlp": 0.01041911, + "balance_loss_clip": 1.02917993, + "balance_loss_mlp": 1.0406971, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.8904694620172307, + "language_loss": 0.77345288, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79502499, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6659, + "time_per_iteration": 2.5323445796966553 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.01713443, + "balance_loss_mlp": 1.03853416, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.7715585064718242, + "language_loss": 0.72642064, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.7477653, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 6660, + "time_per_iteration": 2.4459004402160645 + }, + { + "auxiliary_loss_clip": 0.01113964, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.02753496, + "balance_loss_mlp": 1.04069686, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.7053131194953803, + "language_loss": 0.70897067, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73050702, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.73046875, + "step": 6661, + "time_per_iteration": 2.5339720249176025 + }, + { + "auxiliary_loss_clip": 0.011183, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02808094, + "balance_loss_mlp": 1.04304504, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.7756888608898216, + "language_loss": 0.75688839, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77848476, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6662, + "time_per_iteration": 2.4546353816986084 + }, + { + "auxiliary_loss_clip": 0.01115994, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.01979184, + "balance_loss_mlp": 1.03956914, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.0032115325237076, + "language_loss": 0.66019243, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68168688, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6663, + "time_per_iteration": 2.4437708854675293 + }, + { + "auxiliary_loss_clip": 0.01115313, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.02409601, + "balance_loss_mlp": 1.0406127, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.5671112933527542, + "language_loss": 0.85808247, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87960517, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6664, + "time_per_iteration": 2.423644781112671 + }, + { + "auxiliary_loss_clip": 0.01116399, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.02062321, + "balance_loss_mlp": 1.04155052, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 1.9940684324093096, + "language_loss": 0.84890211, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87040305, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6665, + "time_per_iteration": 2.4386377334594727 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.04381645, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.7199178144884215, + "language_loss": 0.78264785, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.8041926, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6666, + "time_per_iteration": 2.434093952178955 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02121019, + "balance_loss_mlp": 1.04240537, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.6354204552723763, + "language_loss": 0.73558462, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75712276, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6667, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.01117838, + "auxiliary_loss_mlp": 0.0104414, + "balance_loss_clip": 1.02944148, + "balance_loss_mlp": 1.04147649, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.370510933760038, + "language_loss": 0.75832677, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77994657, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6668, + "time_per_iteration": 2.475261688232422 + }, + { + "auxiliary_loss_clip": 0.0111899, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04511833, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.7348003262037657, + "language_loss": 0.82309943, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84467208, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73828125, + "step": 6669, + "time_per_iteration": 2.530458927154541 + }, + { + "auxiliary_loss_clip": 0.0103961, + "auxiliary_loss_mlp": 0.01001267, + "balance_loss_clip": 0.99993151, + "balance_loss_mlp": 1.01565075, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.698912500879513, + "language_loss": 0.53386176, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55427051, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 6670, + "time_per_iteration": 3.247837781906128 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02476037, + "balance_loss_mlp": 1.0415678, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.8543411810419943, + "language_loss": 0.88405877, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.9055897, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6671, + "time_per_iteration": 2.5657830238342285 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.02088797, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.813982967664466, + "language_loss": 0.78926146, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81076294, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 6672, + "time_per_iteration": 2.444209337234497 + }, + { + "auxiliary_loss_clip": 0.01110996, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01829374, + "balance_loss_mlp": 1.03889108, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 1.9086088279717175, + "language_loss": 0.63218224, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65360266, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 6673, + "time_per_iteration": 2.577171564102173 + }, + { + "auxiliary_loss_clip": 0.01114754, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.02027059, + "balance_loss_mlp": 1.0399313, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4529148407259798, + "language_loss": 0.80390126, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82538271, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6674, + "time_per_iteration": 2.5402464866638184 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04199886, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.6082453610380574, + "language_loss": 0.82641548, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84791422, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6675, + "time_per_iteration": 2.4605085849761963 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.0189873, + "balance_loss_mlp": 1.04002738, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.7135878896985557, + "language_loss": 0.93308246, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95454895, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6676, + "time_per_iteration": 2.496168851852417 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.02516031, + "balance_loss_mlp": 1.04100275, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 4.942241320167032, + "language_loss": 0.79622304, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81783295, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 6677, + "time_per_iteration": 2.4565844535827637 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.01977801, + "balance_loss_mlp": 1.0429368, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 3.7422980142657374, + "language_loss": 0.83766311, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85915917, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 6678, + "time_per_iteration": 4.12173318862915 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.02480578, + "balance_loss_mlp": 1.04150224, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.988298740497095, + "language_loss": 0.63948399, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66100478, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6679, + "time_per_iteration": 5.297976016998291 + }, + { + "auxiliary_loss_clip": 0.01112719, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01653934, + "balance_loss_mlp": 1.04000115, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.456514191681199, + "language_loss": 0.78654617, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80796885, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6680, + "time_per_iteration": 2.467042922973633 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.04160023, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.6886011670643926, + "language_loss": 0.75628668, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77793747, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6681, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01727676, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.058228157074571, + "language_loss": 0.64001781, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66150093, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 6682, + "time_per_iteration": 2.4423694610595703 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01953304, + "balance_loss_mlp": 1.03868985, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.6867457181896433, + "language_loss": 0.73334014, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75481766, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6683, + "time_per_iteration": 2.5543196201324463 + }, + { + "auxiliary_loss_clip": 0.01113172, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.02971554, + "balance_loss_mlp": 1.03814459, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.6157462356379846, + "language_loss": 0.73054385, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75210762, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6684, + "time_per_iteration": 2.584984302520752 + }, + { + "auxiliary_loss_clip": 0.01036703, + "auxiliary_loss_mlp": 0.01002873, + "balance_loss_clip": 1.0016098, + "balance_loss_mlp": 1.01262808, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8051502477983452, + "language_loss": 0.60442972, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62482548, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.24023438, + "step": 6685, + "time_per_iteration": 3.2001583576202393 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.0410161, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 2.1343445795660956, + "language_loss": 0.69979215, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72130144, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6686, + "time_per_iteration": 2.486487627029419 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01784527, + "balance_loss_mlp": 1.03917289, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4076322562781298, + "language_loss": 0.74622524, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76766562, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6687, + "time_per_iteration": 2.4854915142059326 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02219915, + "balance_loss_mlp": 1.04146934, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.852699339351418, + "language_loss": 0.70648831, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72802114, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 6688, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01117224, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.02558923, + "balance_loss_mlp": 1.0390867, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.7360862235805987, + "language_loss": 0.64509618, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.6666646, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6689, + "time_per_iteration": 2.5217337608337402 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.02059698, + "balance_loss_mlp": 1.03956485, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.322807889185569, + "language_loss": 0.7306338, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75214565, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6690, + "time_per_iteration": 2.421478509902954 + }, + { + "auxiliary_loss_clip": 0.01114039, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.02357256, + "balance_loss_mlp": 1.03967643, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.4867559931284213, + "language_loss": 0.74789405, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76940262, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6691, + "time_per_iteration": 2.5322606563568115 + }, + { + "auxiliary_loss_clip": 0.01119421, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.02530634, + "balance_loss_mlp": 1.04281604, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5836527032457117, + "language_loss": 0.72676492, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74834728, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6692, + "time_per_iteration": 2.486466407775879 + }, + { + "auxiliary_loss_clip": 0.01115579, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.02444792, + "balance_loss_mlp": 1.039814, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 1.7516389520719526, + "language_loss": 0.83851349, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86004555, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6693, + "time_per_iteration": 2.5068037509918213 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.02325296, + "balance_loss_mlp": 1.04313457, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.699829604816944, + "language_loss": 0.71295136, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73450321, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6694, + "time_per_iteration": 2.5704145431518555 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.03981924, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.0155422945498223, + "language_loss": 0.67754763, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69907242, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 6695, + "time_per_iteration": 2.4664762020111084 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01736653, + "balance_loss_mlp": 1.03826809, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.459399840574827, + "language_loss": 0.79355788, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81499356, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6696, + "time_per_iteration": 2.883577346801758 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.02291059, + "balance_loss_mlp": 1.04224885, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.6846278858215487, + "language_loss": 0.70899725, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73055387, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6697, + "time_per_iteration": 2.4922237396240234 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.02555108, + "balance_loss_mlp": 1.04018331, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.4926240162149162, + "language_loss": 0.61456931, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63608658, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 6698, + "time_per_iteration": 2.4892961978912354 + }, + { + "auxiliary_loss_clip": 0.01115982, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.01840675, + "balance_loss_mlp": 1.03997493, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.8414423865451628, + "language_loss": 0.76245844, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78393662, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6699, + "time_per_iteration": 2.4576990604400635 + }, + { + "auxiliary_loss_clip": 0.01110513, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.0171113, + "balance_loss_mlp": 1.03855538, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.5844300780087603, + "language_loss": 0.80345184, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82486057, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 6700, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02886689, + "balance_loss_mlp": 1.03814912, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 2.2662820598104227, + "language_loss": 0.74967611, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77129138, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6701, + "time_per_iteration": 2.5474703311920166 + }, + { + "auxiliary_loss_clip": 0.01112492, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.02598631, + "balance_loss_mlp": 1.03800225, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5176135502188826, + "language_loss": 0.65989178, + "learning_rate": 2.709938026276208e-06, + "loss": 0.6814059, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6702, + "time_per_iteration": 2.5158073902130127 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.02409053, + "balance_loss_mlp": 1.03949153, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.577366316976287, + "language_loss": 0.66134161, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68289495, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6703, + "time_per_iteration": 2.4974560737609863 + }, + { + "auxiliary_loss_clip": 0.01119665, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04285431, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.6870156282512245, + "language_loss": 0.82005399, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84160155, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6704, + "time_per_iteration": 2.5040299892425537 + }, + { + "auxiliary_loss_clip": 0.01117271, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.02110291, + "balance_loss_mlp": 1.03974569, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 2.5805971030690578, + "language_loss": 0.73468685, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75620878, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6705, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.02208292, + "balance_loss_mlp": 1.03979278, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.712587367637223, + "language_loss": 0.66288096, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68436766, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 6706, + "time_per_iteration": 2.4254331588745117 + }, + { + "auxiliary_loss_clip": 0.01114724, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.02619088, + "balance_loss_mlp": 1.03957605, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.3675174561755612, + "language_loss": 0.71328777, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73483431, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6707, + "time_per_iteration": 2.5285422801971436 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02169156, + "balance_loss_mlp": 1.03867698, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.4937460074112463, + "language_loss": 0.80080485, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82226288, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 6708, + "time_per_iteration": 2.4664134979248047 + }, + { + "auxiliary_loss_clip": 0.01117266, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.02170968, + "balance_loss_mlp": 1.03778601, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.013607365016592, + "language_loss": 0.82944471, + "learning_rate": 2.70738867321606e-06, + "loss": 0.8509779, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 6709, + "time_per_iteration": 2.461277723312378 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02454066, + "balance_loss_mlp": 1.04260051, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.4165591336273893, + "language_loss": 0.71036613, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73194492, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6710, + "time_per_iteration": 2.5579922199249268 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.02461195, + "balance_loss_mlp": 1.04049003, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 1.9864485278108117, + "language_loss": 0.85366702, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87521464, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6711, + "time_per_iteration": 2.511082410812378 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.02038825, + "balance_loss_mlp": 1.04072142, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.9069456024701996, + "language_loss": 0.76074743, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78225803, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6712, + "time_per_iteration": 2.419672727584839 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.02364349, + "balance_loss_mlp": 1.04200089, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 2.1216019240756765, + "language_loss": 0.78926992, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81082511, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6713, + "time_per_iteration": 2.520109176635742 + }, + { + "auxiliary_loss_clip": 0.01113814, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01912916, + "balance_loss_mlp": 1.03721881, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8945946455640421, + "language_loss": 0.88507473, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6714, + "time_per_iteration": 2.4076859951019287 + }, + { + "auxiliary_loss_clip": 0.01115997, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.02688611, + "balance_loss_mlp": 1.04049468, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 2.116493132238348, + "language_loss": 0.69099832, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71256685, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6715, + "time_per_iteration": 2.4805076122283936 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.01785374, + "balance_loss_mlp": 1.03944981, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 7.495764991407429, + "language_loss": 0.76919901, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79069078, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6716, + "time_per_iteration": 2.4244720935821533 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01037832, + "balance_loss_clip": 1.02485037, + "balance_loss_mlp": 1.03992844, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.8407988101654404, + "language_loss": 0.76272923, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78423738, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6717, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01036903, + "auxiliary_loss_mlp": 0.01007011, + "balance_loss_clip": 1.00571179, + "balance_loss_mlp": 1.01217222, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.940083561343906, + "language_loss": 0.60735488, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62779397, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24707031, + "step": 6718, + "time_per_iteration": 2.9391937255859375 + }, + { + "auxiliary_loss_clip": 0.01120577, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.02140856, + "balance_loss_mlp": 1.04066229, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.1744660134680776, + "language_loss": 0.74794078, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76950943, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6719, + "time_per_iteration": 2.4630534648895264 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01043036, + "balance_loss_clip": 1.02762175, + "balance_loss_mlp": 1.0402683, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.5217598497166422, + "language_loss": 0.81235194, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83395278, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6720, + "time_per_iteration": 6.786137104034424 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.01644325, + "balance_loss_mlp": 1.0376296, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.933287838521713, + "language_loss": 0.7720241, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79346573, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6721, + "time_per_iteration": 3.9910030364990234 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01460266, + "balance_loss_mlp": 1.04090941, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 2.3110658804222566, + "language_loss": 0.7264756, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74787009, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6722, + "time_per_iteration": 2.5377390384674072 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.02270842, + "balance_loss_mlp": 1.03896952, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.7096890061042316, + "language_loss": 0.65681767, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67831796, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6723, + "time_per_iteration": 2.429657220840454 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.02834117, + "balance_loss_mlp": 1.04056454, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.4515559648574707, + "language_loss": 0.74074364, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76235622, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6724, + "time_per_iteration": 2.485166072845459 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03799534, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.783988932028688, + "language_loss": 0.74764013, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76908118, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 6725, + "time_per_iteration": 2.5141966342926025 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.02024531, + "balance_loss_mlp": 1.03874183, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.8781247850607437, + "language_loss": 0.76928914, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79076171, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6726, + "time_per_iteration": 2.685609817504883 + }, + { + "auxiliary_loss_clip": 0.01115432, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.02004611, + "balance_loss_mlp": 1.03858769, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.1780936913008646, + "language_loss": 0.81682861, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83832943, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6727, + "time_per_iteration": 2.4221317768096924 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02088916, + "balance_loss_mlp": 1.0411514, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.0089286405461246, + "language_loss": 0.85300338, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87451458, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6728, + "time_per_iteration": 2.4719340801239014 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02059257, + "balance_loss_mlp": 1.04241705, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.6690883830899332, + "language_loss": 0.81804991, + "learning_rate": 2.700097580951786e-06, + "loss": 0.8395654, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6729, + "time_per_iteration": 2.4482905864715576 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.02092838, + "balance_loss_mlp": 1.04034996, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 1.841339511320202, + "language_loss": 0.72582501, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.74731869, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6730, + "time_per_iteration": 2.537121295928955 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.0216732, + "balance_loss_mlp": 1.04037821, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6090983176176454, + "language_loss": 0.67394918, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69545048, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6731, + "time_per_iteration": 2.645958423614502 + }, + { + "auxiliary_loss_clip": 0.01114135, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.02196193, + "balance_loss_mlp": 1.03986645, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.6078062973222544, + "language_loss": 0.74067897, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76216894, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6732, + "time_per_iteration": 2.5182886123657227 + }, + { + "auxiliary_loss_clip": 0.01114756, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01650739, + "balance_loss_mlp": 1.04178488, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.830865433765548, + "language_loss": 0.7690779, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79051435, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 6733, + "time_per_iteration": 2.430748701095581 + }, + { + "auxiliary_loss_clip": 0.01120623, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.04164028, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.8916182343646197, + "language_loss": 0.7649287, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78652358, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6734, + "time_per_iteration": 2.507070541381836 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.01729572, + "balance_loss_mlp": 1.04258728, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.227264135735927, + "language_loss": 0.65026176, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67178231, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7890625, + "step": 6735, + "time_per_iteration": 2.4677040576934814 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.02154267, + "balance_loss_mlp": 1.04025424, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.9551652085107198, + "language_loss": 0.83177966, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85325354, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 6736, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.01121161, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.02344942, + "balance_loss_mlp": 1.04291666, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.699075737504615, + "language_loss": 0.7520684, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77365613, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6737, + "time_per_iteration": 2.510906457901001 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.02408242, + "balance_loss_mlp": 1.04335642, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.288492776548484, + "language_loss": 0.71790028, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73945308, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6738, + "time_per_iteration": 2.514575481414795 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.01845288, + "balance_loss_mlp": 1.04022241, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.003378473366394, + "language_loss": 0.75169361, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77315164, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6739, + "time_per_iteration": 2.4737000465393066 + }, + { + "auxiliary_loss_clip": 0.01119431, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02107763, + "balance_loss_mlp": 1.04296541, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.7865413260400147, + "language_loss": 0.73943472, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76097751, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6740, + "time_per_iteration": 2.5434296131134033 + }, + { + "auxiliary_loss_clip": 0.0111643, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0190562, + "balance_loss_mlp": 1.04310441, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5350516452213203, + "language_loss": 0.77179801, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79328907, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6741, + "time_per_iteration": 2.473451852798462 + }, + { + "auxiliary_loss_clip": 0.01120883, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.04359269, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8990417013226273, + "language_loss": 0.70827335, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.72983992, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 6742, + "time_per_iteration": 2.4797537326812744 + }, + { + "auxiliary_loss_clip": 0.01121445, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.01517677, + "balance_loss_mlp": 1.04446578, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.180199258846301, + "language_loss": 0.72242743, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74393857, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6743, + "time_per_iteration": 2.409444808959961 + }, + { + "auxiliary_loss_clip": 0.0112179, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02128196, + "balance_loss_mlp": 1.04374886, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 3.287949139408167, + "language_loss": 0.70554733, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72712195, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6744, + "time_per_iteration": 2.475775957107544 + }, + { + "auxiliary_loss_clip": 0.01116341, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.02154207, + "balance_loss_mlp": 1.04163671, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.534678646828984, + "language_loss": 0.79982138, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82133317, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6745, + "time_per_iteration": 2.492379903793335 + }, + { + "auxiliary_loss_clip": 0.01120523, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.0463028, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.8557240822638386, + "language_loss": 0.66450787, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68604791, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6746, + "time_per_iteration": 2.4547531604766846 + }, + { + "auxiliary_loss_clip": 0.01118105, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01573384, + "balance_loss_mlp": 1.04319298, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.5006534813974708, + "language_loss": 0.5713616, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59283465, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6747, + "time_per_iteration": 2.627912998199463 + }, + { + "auxiliary_loss_clip": 0.01119274, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.04399908, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.605109327396707, + "language_loss": 0.8454957, + "learning_rate": 2.693161205655089e-06, + "loss": 0.8671056, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75390625, + "step": 6748, + "time_per_iteration": 2.5783345699310303 + }, + { + "auxiliary_loss_clip": 0.01120452, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02210689, + "balance_loss_mlp": 1.04356313, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.1468645636667705, + "language_loss": 0.81288636, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83444953, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6749, + "time_per_iteration": 2.433042049407959 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.02395105, + "balance_loss_mlp": 1.04512405, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.6093122324869749, + "language_loss": 0.75051296, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77209336, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6750, + "time_per_iteration": 2.500444173812866 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.01919341, + "balance_loss_mlp": 1.04114318, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.1309201825140662, + "language_loss": 0.73826647, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75980842, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6751, + "time_per_iteration": 2.4808826446533203 + }, + { + "auxiliary_loss_clip": 0.01120759, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.04471755, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 5.559089751596236, + "language_loss": 0.6666553, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68818188, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6752, + "time_per_iteration": 2.568223714828491 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.02189183, + "balance_loss_mlp": 1.04458666, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.70284971706228, + "language_loss": 0.70600617, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72761416, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 6753, + "time_per_iteration": 2.696746587753296 + }, + { + "auxiliary_loss_clip": 0.01119466, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.02145791, + "balance_loss_mlp": 1.04105067, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.1929566205477804, + "language_loss": 0.71584499, + "learning_rate": 2.690968795494699e-06, + "loss": 0.73739791, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 6754, + "time_per_iteration": 2.49405837059021 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02568889, + "balance_loss_mlp": 1.04273617, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7112877357577985, + "language_loss": 0.82864529, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85024333, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6755, + "time_per_iteration": 2.4666147232055664 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.02599001, + "balance_loss_mlp": 1.04292035, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.484337354822898, + "language_loss": 0.70812732, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72976315, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6756, + "time_per_iteration": 2.539236307144165 + }, + { + "auxiliary_loss_clip": 0.01120038, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02724361, + "balance_loss_mlp": 1.04106975, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.6617053894159006, + "language_loss": 0.79047221, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81209117, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6757, + "time_per_iteration": 2.4614784717559814 + }, + { + "auxiliary_loss_clip": 0.01121935, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.01865852, + "balance_loss_mlp": 1.04454553, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.5700268222495364, + "language_loss": 0.7851724, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.806723, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6758, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.01692557, + "balance_loss_mlp": 1.04113591, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.1344538838988454, + "language_loss": 0.88668954, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90817189, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 6759, + "time_per_iteration": 2.410628318786621 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.01986527, + "balance_loss_mlp": 1.04366982, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.0728742760332546, + "language_loss": 0.63888443, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66042268, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6760, + "time_per_iteration": 2.553819417953491 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.01796103, + "balance_loss_mlp": 1.0422858, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4242582463540345, + "language_loss": 0.75060493, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77212334, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6761, + "time_per_iteration": 2.520904302597046 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02029395, + "balance_loss_mlp": 1.04054725, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4265975037167853, + "language_loss": 0.70109248, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72254199, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6762, + "time_per_iteration": 6.884980916976929 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.01893568, + "balance_loss_mlp": 1.04316521, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 2.223786523351799, + "language_loss": 0.73175049, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75325227, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6763, + "time_per_iteration": 3.8783130645751953 + }, + { + "auxiliary_loss_clip": 0.01119915, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02200174, + "balance_loss_mlp": 1.04246914, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.725584811158307, + "language_loss": 0.6908524, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71241343, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6764, + "time_per_iteration": 2.4408676624298096 + }, + { + "auxiliary_loss_clip": 0.01123793, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02481747, + "balance_loss_mlp": 1.04485261, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.20566464671706, + "language_loss": 0.91570717, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93734777, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6765, + "time_per_iteration": 2.4904191493988037 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.02672434, + "balance_loss_mlp": 1.04374599, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.279622168201086, + "language_loss": 0.78459442, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80623996, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6766, + "time_per_iteration": 2.4594480991363525 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02457929, + "balance_loss_mlp": 1.04144108, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.9487336600068845, + "language_loss": 0.76438922, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78597391, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6767, + "time_per_iteration": 2.4127700328826904 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.0442543, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.7431301492707811, + "language_loss": 0.77572781, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79728222, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6768, + "time_per_iteration": 2.5312347412109375 + }, + { + "auxiliary_loss_clip": 0.01118014, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.01861525, + "balance_loss_mlp": 1.04248428, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.7094466648077935, + "language_loss": 0.87585759, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89735663, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6769, + "time_per_iteration": 2.434276580810547 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.02028155, + "balance_loss_mlp": 1.04659963, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.8989360481904207, + "language_loss": 0.80883789, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83036822, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 6770, + "time_per_iteration": 2.4768316745758057 + }, + { + "auxiliary_loss_clip": 0.01121746, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01699948, + "balance_loss_mlp": 1.04308331, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6240016049823844, + "language_loss": 0.80161405, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82315195, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78515625, + "step": 6771, + "time_per_iteration": 2.4864251613616943 + }, + { + "auxiliary_loss_clip": 0.01116481, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.02476382, + "balance_loss_mlp": 1.04181063, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.5515756087522081, + "language_loss": 0.76267636, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.7842294, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 6772, + "time_per_iteration": 2.5570874214172363 + }, + { + "auxiliary_loss_clip": 0.01116059, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.0259316, + "balance_loss_mlp": 1.04014397, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.6577007729475706, + "language_loss": 0.81418705, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83575237, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6773, + "time_per_iteration": 2.4311835765838623 + }, + { + "auxiliary_loss_clip": 0.01040526, + "auxiliary_loss_mlp": 0.01005684, + "balance_loss_clip": 1.00416398, + "balance_loss_mlp": 1.01639521, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8363890316728796, + "language_loss": 0.6434871, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66394925, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.24121094, + "step": 6774, + "time_per_iteration": 2.987610340118408 + }, + { + "auxiliary_loss_clip": 0.01119504, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02136981, + "balance_loss_mlp": 1.04115796, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 13.875946104557459, + "language_loss": 0.72097111, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74252421, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6775, + "time_per_iteration": 2.5014185905456543 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.02736115, + "balance_loss_mlp": 1.04123604, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.3741783359801052, + "language_loss": 0.77956975, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80116785, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6776, + "time_per_iteration": 2.484910488128662 + }, + { + "auxiliary_loss_clip": 0.0112306, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.0240761, + "balance_loss_mlp": 1.04408884, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.6337418369090404, + "language_loss": 0.79015827, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81177437, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6777, + "time_per_iteration": 2.4701852798461914 + }, + { + "auxiliary_loss_clip": 0.01039569, + "auxiliary_loss_mlp": 0.01005822, + "balance_loss_clip": 1.00424814, + "balance_loss_mlp": 1.01542926, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6828077953919364, + "language_loss": 0.5320037, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55245763, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.2421875, + "step": 6778, + "time_per_iteration": 3.117647647857666 + }, + { + "auxiliary_loss_clip": 0.01119188, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.02658224, + "balance_loss_mlp": 1.04310179, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.2984205071258272, + "language_loss": 0.82367444, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84527671, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 6779, + "time_per_iteration": 2.4653449058532715 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.02289438, + "balance_loss_mlp": 1.0422008, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.7439910283418456, + "language_loss": 0.7628178, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78437853, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 6780, + "time_per_iteration": 2.5031514167785645 + }, + { + "auxiliary_loss_clip": 0.01114202, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.04146945, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 2.107375049179959, + "language_loss": 0.65990937, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68135262, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 6781, + "time_per_iteration": 2.431759834289551 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.02233076, + "balance_loss_mlp": 1.04050446, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 2.315782733130647, + "language_loss": 0.71046883, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73201013, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6782, + "time_per_iteration": 2.567138433456421 + }, + { + "auxiliary_loss_clip": 0.01117461, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.01789367, + "balance_loss_mlp": 1.04120076, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.7193598407967954, + "language_loss": 0.82066965, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84215903, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 6783, + "time_per_iteration": 2.46891188621521 + }, + { + "auxiliary_loss_clip": 0.01116877, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.02065194, + "balance_loss_mlp": 1.04063141, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6682285001774693, + "language_loss": 0.80728561, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82880187, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6784, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01122913, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.0258038, + "balance_loss_mlp": 1.04271793, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.7628578717327703, + "language_loss": 0.65640736, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67805004, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 6785, + "time_per_iteration": 2.46173357963562 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01650357, + "balance_loss_mlp": 1.0397855, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 1.9756209352263352, + "language_loss": 0.79518569, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81664044, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6786, + "time_per_iteration": 2.430769205093384 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.02349889, + "balance_loss_mlp": 1.04094195, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.5131366331092475, + "language_loss": 0.81249726, + "learning_rate": 2.678893759192982e-06, + "loss": 0.8340168, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6787, + "time_per_iteration": 2.4589040279388428 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.01907516, + "balance_loss_mlp": 1.04059005, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9559544882723985, + "language_loss": 0.67917293, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70066231, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75390625, + "step": 6788, + "time_per_iteration": 2.4450576305389404 + }, + { + "auxiliary_loss_clip": 0.01116018, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02644002, + "balance_loss_mlp": 1.03975677, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 2.2689407766698584, + "language_loss": 0.6605472, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68211812, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6789, + "time_per_iteration": 2.6358134746551514 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02133691, + "balance_loss_mlp": 1.0408318, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.683929923970831, + "language_loss": 0.60006517, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62159079, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6790, + "time_per_iteration": 2.4339373111724854 + }, + { + "auxiliary_loss_clip": 0.01117331, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.04277873, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 3.0836688581186538, + "language_loss": 0.69763649, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71922624, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 6791, + "time_per_iteration": 2.3970839977264404 + }, + { + "auxiliary_loss_clip": 0.01036371, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 0.99973643, + "balance_loss_mlp": 1.01245427, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7479961411193888, + "language_loss": 0.59600538, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61638063, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.23828125, + "step": 6792, + "time_per_iteration": 3.0660579204559326 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.03205419, + "balance_loss_mlp": 1.04478419, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 2.1865523890186975, + "language_loss": 0.8017205, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82341087, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6793, + "time_per_iteration": 2.4930570125579834 + }, + { + "auxiliary_loss_clip": 0.01117695, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.0203917, + "balance_loss_mlp": 1.04145718, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.7948567342085118, + "language_loss": 0.85040581, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87193215, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6794, + "time_per_iteration": 2.500248670578003 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.02316737, + "balance_loss_mlp": 1.04290628, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.6403079662436217, + "language_loss": 0.79991007, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82147229, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6795, + "time_per_iteration": 2.4969587326049805 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.01903319, + "balance_loss_mlp": 1.0415107, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 3.0496031094407767, + "language_loss": 0.69604456, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7176007, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6796, + "time_per_iteration": 2.415790319442749 + }, + { + "auxiliary_loss_clip": 0.01115637, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.02436018, + "balance_loss_mlp": 1.04028058, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6154855191434097, + "language_loss": 0.77814329, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.799676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6797, + "time_per_iteration": 2.4960498809814453 + }, + { + "auxiliary_loss_clip": 0.01114842, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_clip": 1.03020835, + "balance_loss_mlp": 1.03869152, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 2.268592052790042, + "language_loss": 0.85668063, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87826383, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 6798, + "time_per_iteration": 2.4271299839019775 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.02341557, + "balance_loss_mlp": 1.04205322, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.4625848333242037, + "language_loss": 0.8396889, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86120105, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 6799, + "time_per_iteration": 2.5059525966644287 + }, + { + "auxiliary_loss_clip": 0.01118535, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_clip": 1.02861547, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.2336787226224453, + "language_loss": 0.83352369, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85514188, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6800, + "time_per_iteration": 2.441771984100342 + }, + { + "auxiliary_loss_clip": 0.01118367, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.02665734, + "balance_loss_mlp": 1.04080248, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.815509221734431, + "language_loss": 0.74838769, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76998335, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 6801, + "time_per_iteration": 2.4573957920074463 + }, + { + "auxiliary_loss_clip": 0.01118841, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.02418303, + "balance_loss_mlp": 1.04115379, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 3.5876275394170682, + "language_loss": 0.79983771, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8214165, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6802, + "time_per_iteration": 2.4583706855773926 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04269981, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 1.9920926766799116, + "language_loss": 0.75564265, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77726078, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7890625, + "step": 6803, + "time_per_iteration": 3.8293817043304443 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.02296996, + "balance_loss_mlp": 1.04163659, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.8273723177462575, + "language_loss": 0.78676009, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80831397, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6804, + "time_per_iteration": 5.276589393615723 + }, + { + "auxiliary_loss_clip": 0.01121753, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.02486575, + "balance_loss_mlp": 1.04170704, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.824409853433396, + "language_loss": 0.74958569, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77119195, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6805, + "time_per_iteration": 2.4856061935424805 + }, + { + "auxiliary_loss_clip": 0.01121334, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.02701998, + "balance_loss_mlp": 1.04323924, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.6270528279533119, + "language_loss": 0.79471934, + "learning_rate": 2.671928716175804e-06, + "loss": 0.816342, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6806, + "time_per_iteration": 2.4999823570251465 + }, + { + "auxiliary_loss_clip": 0.01120343, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.02002871, + "balance_loss_mlp": 1.04182625, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.8904572172377134, + "language_loss": 0.72131455, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74286067, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6807, + "time_per_iteration": 2.4900894165039062 + }, + { + "auxiliary_loss_clip": 0.01035827, + "auxiliary_loss_mlp": 0.01000695, + "balance_loss_clip": 0.99931204, + "balance_loss_mlp": 1.01169431, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8333385820049739, + "language_loss": 0.58798856, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60835379, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24121094, + "step": 6808, + "time_per_iteration": 3.1670446395874023 + }, + { + "auxiliary_loss_clip": 0.0111783, + "auxiliary_loss_mlp": 0.01047199, + "balance_loss_clip": 1.03397894, + "balance_loss_mlp": 1.04200959, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.6310291749342813, + "language_loss": 0.54454345, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56619376, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6809, + "time_per_iteration": 2.445084571838379 + }, + { + "auxiliary_loss_clip": 0.01117961, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0176785, + "balance_loss_mlp": 1.0413785, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.8964783600080724, + "language_loss": 0.83296275, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85445428, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6810, + "time_per_iteration": 2.507234573364258 + }, + { + "auxiliary_loss_clip": 0.01121577, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.02736187, + "balance_loss_mlp": 1.04350328, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.219108175656967, + "language_loss": 0.77739668, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79903591, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6811, + "time_per_iteration": 2.4652421474456787 + }, + { + "auxiliary_loss_clip": 0.01118877, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.01804352, + "balance_loss_mlp": 1.04151464, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.8555113442690365, + "language_loss": 0.69810557, + "learning_rate": 2.669727313417857e-06, + "loss": 0.7196303, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7734375, + "step": 6812, + "time_per_iteration": 2.4447555541992188 + }, + { + "auxiliary_loss_clip": 0.0111498, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.02644539, + "balance_loss_mlp": 1.03930998, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.4849650877087106, + "language_loss": 0.66131341, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68286985, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6813, + "time_per_iteration": 2.461461067199707 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.02209592, + "balance_loss_mlp": 1.04076779, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.8347983960230858, + "language_loss": 0.73899138, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76051652, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6814, + "time_per_iteration": 2.5444507598876953 + }, + { + "auxiliary_loss_clip": 0.01120309, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02035785, + "balance_loss_mlp": 1.04147315, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.162963447393967, + "language_loss": 0.65966797, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68121737, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6815, + "time_per_iteration": 2.4877898693084717 + }, + { + "auxiliary_loss_clip": 0.01116543, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.02371955, + "balance_loss_mlp": 1.04337275, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.6370882031659308, + "language_loss": 0.76553667, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78707623, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 6816, + "time_per_iteration": 2.5013954639434814 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02404261, + "balance_loss_mlp": 1.04302227, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.8457932880819463, + "language_loss": 0.81718624, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8387655, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6817, + "time_per_iteration": 2.407566785812378 + }, + { + "auxiliary_loss_clip": 0.01121536, + "auxiliary_loss_mlp": 0.01037881, + "balance_loss_clip": 1.02179992, + "balance_loss_mlp": 1.04166436, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7366839484469832, + "language_loss": 0.79938078, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82097495, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6818, + "time_per_iteration": 2.49364972114563 + }, + { + "auxiliary_loss_clip": 0.01114596, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02288651, + "balance_loss_mlp": 1.03982878, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.4683684500872527, + "language_loss": 0.65939564, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68090701, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6819, + "time_per_iteration": 2.5122451782226562 + }, + { + "auxiliary_loss_clip": 0.01123256, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.02191615, + "balance_loss_mlp": 1.04243147, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.4566856211473176, + "language_loss": 0.85411352, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87571383, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 6820, + "time_per_iteration": 2.4924051761627197 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.0189811, + "balance_loss_mlp": 1.04211807, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.9363068637508836, + "language_loss": 0.71033639, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73183382, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6821, + "time_per_iteration": 2.5236756801605225 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.03997672, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 2.2789873913326404, + "language_loss": 0.74732232, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76880419, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6822, + "time_per_iteration": 2.485173225402832 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02226305, + "balance_loss_mlp": 1.04145467, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.8990120981529888, + "language_loss": 0.7503438, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77188146, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6823, + "time_per_iteration": 2.46115779876709 + }, + { + "auxiliary_loss_clip": 0.01124707, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.02359605, + "balance_loss_mlp": 1.04229724, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 2.6227876605231986, + "language_loss": 0.73347652, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75511503, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6824, + "time_per_iteration": 2.504561185836792 + }, + { + "auxiliary_loss_clip": 0.01120752, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.01985359, + "balance_loss_mlp": 1.04105759, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 2.228764168551681, + "language_loss": 0.71601099, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.73756915, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6825, + "time_per_iteration": 2.476551055908203 + }, + { + "auxiliary_loss_clip": 0.01117579, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.02359533, + "balance_loss_mlp": 1.04292464, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9864880407367733, + "language_loss": 0.84743512, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86897874, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 6826, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.01117058, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.02066422, + "balance_loss_mlp": 1.0431006, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.962634793360081, + "language_loss": 0.66582263, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68734574, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 6827, + "time_per_iteration": 2.4629759788513184 + }, + { + "auxiliary_loss_clip": 0.01113749, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.01799607, + "balance_loss_mlp": 1.03989482, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3616881749334155, + "language_loss": 0.72346127, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74491906, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6828, + "time_per_iteration": 2.4807186126708984 + }, + { + "auxiliary_loss_clip": 0.01120586, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02777803, + "balance_loss_mlp": 1.0410856, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.900432401993592, + "language_loss": 0.83422399, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85585773, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6829, + "time_per_iteration": 2.4298055171966553 + }, + { + "auxiliary_loss_clip": 0.01114334, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.01940441, + "balance_loss_mlp": 1.03960419, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.5044787550344432, + "language_loss": 0.9002744, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92174798, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6830, + "time_per_iteration": 2.4607503414154053 + }, + { + "auxiliary_loss_clip": 0.01115903, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01268673, + "balance_loss_mlp": 1.04088628, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 2.455330668305064, + "language_loss": 0.65950698, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68092537, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6831, + "time_per_iteration": 2.4402008056640625 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02233577, + "balance_loss_mlp": 1.04019713, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.7503077174044546, + "language_loss": 0.69414657, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71564817, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6832, + "time_per_iteration": 2.4985976219177246 + }, + { + "auxiliary_loss_clip": 0.0111274, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02265322, + "balance_loss_mlp": 1.03861785, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.137055635154832, + "language_loss": 0.73675501, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75824058, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6833, + "time_per_iteration": 2.423802375793457 + }, + { + "auxiliary_loss_clip": 0.01120262, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.02648401, + "balance_loss_mlp": 1.04171228, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.6404428787043481, + "language_loss": 0.72538, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74699682, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6834, + "time_per_iteration": 2.5415680408477783 + }, + { + "auxiliary_loss_clip": 0.01119029, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.04038835, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.0754355899076717, + "language_loss": 0.71026015, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7318927, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6835, + "time_per_iteration": 2.4709722995758057 + }, + { + "auxiliary_loss_clip": 0.01118649, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.02521181, + "balance_loss_mlp": 1.04203069, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.9290870315127813, + "language_loss": 0.86998641, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89157486, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6836, + "time_per_iteration": 2.4478323459625244 + }, + { + "auxiliary_loss_clip": 0.01114601, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.02464342, + "balance_loss_mlp": 1.040609, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.7219230799083993, + "language_loss": 0.69017011, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71170223, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6837, + "time_per_iteration": 2.4600830078125 + }, + { + "auxiliary_loss_clip": 0.01116898, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.02394605, + "balance_loss_mlp": 1.04047167, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.7295939332860302, + "language_loss": 0.75087547, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77242649, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6838, + "time_per_iteration": 2.460449695587158 + }, + { + "auxiliary_loss_clip": 0.01115474, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02075219, + "balance_loss_mlp": 1.04058707, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.8679563507274572, + "language_loss": 0.82247162, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84398103, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 6839, + "time_per_iteration": 2.4339215755462646 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.01875496, + "balance_loss_mlp": 1.03766727, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.9294791670505813, + "language_loss": 0.80338049, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82482588, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6840, + "time_per_iteration": 2.464096784591675 + }, + { + "auxiliary_loss_clip": 0.01111724, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02119339, + "balance_loss_mlp": 1.03856099, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.7525143939260106, + "language_loss": 0.67515284, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.6966151, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 6841, + "time_per_iteration": 2.412872314453125 + }, + { + "auxiliary_loss_clip": 0.01035921, + "auxiliary_loss_mlp": 0.01010132, + "balance_loss_clip": 1.00880933, + "balance_loss_mlp": 1.01203704, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7700890610990695, + "language_loss": 0.5963515, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61681211, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23828125, + "step": 6842, + "time_per_iteration": 3.167282819747925 + }, + { + "auxiliary_loss_clip": 0.01111896, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.02044773, + "balance_loss_mlp": 1.04057288, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.121884132790859, + "language_loss": 0.69212461, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71357656, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 6843, + "time_per_iteration": 2.4664626121520996 + }, + { + "auxiliary_loss_clip": 0.01035393, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00222576, + "balance_loss_mlp": 1.01154804, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7178401469554447, + "language_loss": 0.53669417, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55708587, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23828125, + "step": 6844, + "time_per_iteration": 3.0998694896698 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.02213013, + "balance_loss_mlp": 1.03937054, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.6545259135728443, + "language_loss": 0.66114587, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68261302, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 6845, + "time_per_iteration": 6.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.0221113, + "balance_loss_mlp": 1.04133987, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.8380761864561301, + "language_loss": 0.70359266, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72507298, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 6846, + "time_per_iteration": 3.941171646118164 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02163363, + "balance_loss_mlp": 1.03892803, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.507800360258476, + "language_loss": 0.64964008, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67111951, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 6847, + "time_per_iteration": 2.5782458782196045 + }, + { + "auxiliary_loss_clip": 0.01112352, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02921724, + "balance_loss_mlp": 1.03790998, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.3239337291849294, + "language_loss": 0.70368952, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72524321, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6848, + "time_per_iteration": 2.552729606628418 + }, + { + "auxiliary_loss_clip": 0.01035603, + "auxiliary_loss_mlp": 0.00998835, + "balance_loss_clip": 0.99738103, + "balance_loss_mlp": 1.01178169, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8862972606407307, + "language_loss": 0.56235039, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58269477, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.23828125, + "step": 6849, + "time_per_iteration": 3.144639730453491 + }, + { + "auxiliary_loss_clip": 0.01112679, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.02255821, + "balance_loss_mlp": 1.04060721, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.58670522574793, + "language_loss": 0.76169646, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78318465, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 6850, + "time_per_iteration": 2.5668234825134277 + }, + { + "auxiliary_loss_clip": 0.01111269, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.02075291, + "balance_loss_mlp": 1.03937149, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.4904377439692653, + "language_loss": 0.67717403, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.69862366, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6851, + "time_per_iteration": 2.588646650314331 + }, + { + "auxiliary_loss_clip": 0.01116771, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.02792835, + "balance_loss_mlp": 1.03957748, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.5339755397297776, + "language_loss": 0.79547226, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81707215, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76953125, + "step": 6852, + "time_per_iteration": 2.4342472553253174 + }, + { + "auxiliary_loss_clip": 0.01120035, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.01818419, + "balance_loss_mlp": 1.04227197, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 3.302073757908878, + "language_loss": 0.78002989, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.80156463, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6853, + "time_per_iteration": 2.536959409713745 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.022156, + "balance_loss_mlp": 1.04021645, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.636675456410819, + "language_loss": 0.65871978, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68027961, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6854, + "time_per_iteration": 2.587641477584839 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.01973081, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.819965675297277, + "language_loss": 0.83530807, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85676759, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 6855, + "time_per_iteration": 2.5173020362854004 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.02333903, + "balance_loss_mlp": 1.03945315, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.701531451547931, + "language_loss": 0.7926302, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81411433, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 6856, + "time_per_iteration": 2.4496660232543945 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.021119, + "balance_loss_mlp": 1.04115629, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 6.346447490864035, + "language_loss": 0.79253089, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81403255, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6857, + "time_per_iteration": 2.454458236694336 + }, + { + "auxiliary_loss_clip": 0.01114343, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02169538, + "balance_loss_mlp": 1.03821683, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.6045712878894351, + "language_loss": 0.70696247, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72846603, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6858, + "time_per_iteration": 2.453808069229126 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.02162266, + "balance_loss_mlp": 1.04016411, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4836752505963042, + "language_loss": 0.59489501, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61638969, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6859, + "time_per_iteration": 2.6645431518554688 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.02283669, + "balance_loss_mlp": 1.03694463, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.017738864380765, + "language_loss": 0.73062313, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75211227, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6860, + "time_per_iteration": 2.4230310916900635 + }, + { + "auxiliary_loss_clip": 0.01111098, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02037311, + "balance_loss_mlp": 1.03779876, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.603033952512427, + "language_loss": 0.74057221, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76201528, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6861, + "time_per_iteration": 2.466261863708496 + }, + { + "auxiliary_loss_clip": 0.01111959, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.0190326, + "balance_loss_mlp": 1.04026282, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.017273954904035, + "language_loss": 0.79431915, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81575066, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6862, + "time_per_iteration": 2.4272851943969727 + }, + { + "auxiliary_loss_clip": 0.01115421, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.01946771, + "balance_loss_mlp": 1.04104841, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7023318630513873, + "language_loss": 0.76025152, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78173077, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6863, + "time_per_iteration": 2.491703987121582 + }, + { + "auxiliary_loss_clip": 0.01034073, + "auxiliary_loss_mlp": 0.01002883, + "balance_loss_clip": 1.00128579, + "balance_loss_mlp": 1.01038253, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.6998724627349664, + "language_loss": 0.52726007, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54762965, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.23632812, + "step": 6864, + "time_per_iteration": 3.05096173286438 + }, + { + "auxiliary_loss_clip": 0.01117449, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04010963, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.8277977271365335, + "language_loss": 0.72328234, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74480128, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6865, + "time_per_iteration": 2.5138418674468994 + }, + { + "auxiliary_loss_clip": 0.0103371, + "auxiliary_loss_mlp": 0.01003681, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00997901, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9175964026476935, + "language_loss": 0.66545808, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68583202, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.23730469, + "step": 6866, + "time_per_iteration": 2.965301513671875 + }, + { + "auxiliary_loss_clip": 0.01111664, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02442443, + "balance_loss_mlp": 1.03779757, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 1.9494269702964535, + "language_loss": 0.80854523, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8300401, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6867, + "time_per_iteration": 2.4153382778167725 + }, + { + "auxiliary_loss_clip": 0.01118424, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.02335191, + "balance_loss_mlp": 1.04141474, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.031901046820099, + "language_loss": 0.77580094, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.7973603, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 6868, + "time_per_iteration": 2.535595178604126 + }, + { + "auxiliary_loss_clip": 0.01033303, + "auxiliary_loss_mlp": 0.00999485, + "balance_loss_clip": 0.99789923, + "balance_loss_mlp": 1.0095768, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8413704541135547, + "language_loss": 0.5779494, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59827721, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23730469, + "step": 6869, + "time_per_iteration": 2.8164174556732178 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02101028, + "balance_loss_mlp": 1.03904057, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.6360017889096097, + "language_loss": 0.74995548, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77143168, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6870, + "time_per_iteration": 2.5370054244995117 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02757072, + "balance_loss_mlp": 1.03925085, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.504144022647526, + "language_loss": 0.83272427, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85428846, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6871, + "time_per_iteration": 2.596686601638794 + }, + { + "auxiliary_loss_clip": 0.01117357, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.02738237, + "balance_loss_mlp": 1.04108167, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 5.838045745285431, + "language_loss": 0.68951505, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71110535, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6872, + "time_per_iteration": 2.6045477390289307 + }, + { + "auxiliary_loss_clip": 0.01115693, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.02197695, + "balance_loss_mlp": 1.04050374, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.864312912622832, + "language_loss": 0.75716275, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.7786743, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6873, + "time_per_iteration": 2.4200570583343506 + }, + { + "auxiliary_loss_clip": 0.01117091, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.01943827, + "balance_loss_mlp": 1.04055679, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.671510122752512, + "language_loss": 0.82721817, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.84872413, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6874, + "time_per_iteration": 2.4689133167266846 + }, + { + "auxiliary_loss_clip": 0.01116401, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.0397824, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 2.003609916019722, + "language_loss": 0.71075761, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73227131, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6875, + "time_per_iteration": 2.4145123958587646 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02686, + "balance_loss_mlp": 1.04001451, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.617534223510663, + "language_loss": 0.82538921, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84689927, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6876, + "time_per_iteration": 2.49533748626709 + }, + { + "auxiliary_loss_clip": 0.01118483, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.03916812, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.858959916779265, + "language_loss": 0.65397477, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.6755476, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.79296875, + "step": 6877, + "time_per_iteration": 2.4231626987457275 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.01795483, + "balance_loss_mlp": 1.04000914, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 2.013643508242888, + "language_loss": 0.76686853, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78831995, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6878, + "time_per_iteration": 2.492220640182495 + }, + { + "auxiliary_loss_clip": 0.01114835, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02589822, + "balance_loss_mlp": 1.040084, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8674435899066546, + "language_loss": 0.80248523, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82403314, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 6879, + "time_per_iteration": 2.458623170852661 + }, + { + "auxiliary_loss_clip": 0.01114903, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02160931, + "balance_loss_mlp": 1.03960526, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.9200458523415633, + "language_loss": 0.84693611, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86844546, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75390625, + "step": 6880, + "time_per_iteration": 2.605189323425293 + }, + { + "auxiliary_loss_clip": 0.01116516, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.01478863, + "balance_loss_mlp": 1.04023683, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.672120688006926, + "language_loss": 0.70195448, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72340417, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6881, + "time_per_iteration": 2.4585211277008057 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.02450848, + "balance_loss_mlp": 1.04145753, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.702675342664879, + "language_loss": 0.81404376, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83554971, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6882, + "time_per_iteration": 2.451544761657715 + }, + { + "auxiliary_loss_clip": 0.01121461, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_clip": 1.0311873, + "balance_loss_mlp": 1.04304028, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 1.9410860498070561, + "language_loss": 0.69296026, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71465033, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.78515625, + "step": 6883, + "time_per_iteration": 2.4320569038391113 + }, + { + "auxiliary_loss_clip": 0.01115479, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04087615, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.7677749997866015, + "language_loss": 0.75449616, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77600354, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6884, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04013455, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.3782226444678463, + "language_loss": 0.75763476, + "learning_rate": 2.642871247413523e-06, + "loss": 0.7791822, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6885, + "time_per_iteration": 2.513087511062622 + }, + { + "auxiliary_loss_clip": 0.01117144, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.0245266, + "balance_loss_mlp": 1.0402348, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.8637223642679819, + "language_loss": 0.69820571, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71976513, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6886, + "time_per_iteration": 2.49245285987854 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.02326965, + "balance_loss_mlp": 1.04143095, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.5567308495418615, + "language_loss": 0.7542249, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77578151, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6887, + "time_per_iteration": 6.723928451538086 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.03900433, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.8847126889252832, + "language_loss": 0.69881892, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72027779, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6888, + "time_per_iteration": 3.9012765884399414 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02070749, + "balance_loss_mlp": 1.03661156, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 1.8064637161795956, + "language_loss": 0.75730169, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.7787562, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6889, + "time_per_iteration": 2.4043526649475098 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.020468, + "balance_loss_mlp": 1.04220176, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.5362774650785178, + "language_loss": 0.80159467, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82309097, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 6890, + "time_per_iteration": 2.515199661254883 + }, + { + "auxiliary_loss_clip": 0.01113118, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.02752495, + "balance_loss_mlp": 1.04047, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.56935265602887, + "language_loss": 0.74256909, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76412225, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6891, + "time_per_iteration": 2.4265213012695312 + }, + { + "auxiliary_loss_clip": 0.01120303, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.02144444, + "balance_loss_mlp": 1.04260397, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.5959140747346865, + "language_loss": 0.84173661, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86331344, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6892, + "time_per_iteration": 2.4921038150787354 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.02204704, + "balance_loss_mlp": 1.04263163, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.6122583846612435, + "language_loss": 0.70197237, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72349209, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6893, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01115822, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01573586, + "balance_loss_mlp": 1.04117119, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.3754181360448814, + "language_loss": 0.72850323, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74995577, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6894, + "time_per_iteration": 2.521559715270996 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.04199624, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.672622146105704, + "language_loss": 0.6200121, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64152598, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6895, + "time_per_iteration": 2.3899357318878174 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.02091241, + "balance_loss_mlp": 1.03973091, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.6922649240649819, + "language_loss": 0.70685059, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72832596, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 6896, + "time_per_iteration": 2.5296781063079834 + }, + { + "auxiliary_loss_clip": 0.01115349, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.02541351, + "balance_loss_mlp": 1.03898549, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.6224007586570597, + "language_loss": 0.72848749, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.7500447, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.765625, + "step": 6897, + "time_per_iteration": 2.481219530105591 + }, + { + "auxiliary_loss_clip": 0.01115287, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.02870619, + "balance_loss_mlp": 1.04093742, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 4.403783878749548, + "language_loss": 0.84646589, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86805081, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6898, + "time_per_iteration": 2.5150201320648193 + }, + { + "auxiliary_loss_clip": 0.01112871, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.01719725, + "balance_loss_mlp": 1.03681874, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.644475487803214, + "language_loss": 0.74555075, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76699305, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6899, + "time_per_iteration": 2.4348104000091553 + }, + { + "auxiliary_loss_clip": 0.0112093, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.717830619902866, + "language_loss": 0.75609112, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77767438, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6900, + "time_per_iteration": 2.5260136127471924 + }, + { + "auxiliary_loss_clip": 0.01116235, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 1.04113388, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.5866137476185087, + "language_loss": 0.80409849, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82566535, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.75, + "step": 6901, + "time_per_iteration": 2.4218883514404297 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.02178299, + "balance_loss_mlp": 1.03989518, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.8085429941764752, + "language_loss": 0.69120753, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71271133, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6902, + "time_per_iteration": 2.525836944580078 + }, + { + "auxiliary_loss_clip": 0.0111323, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04042315, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.1056004636318817, + "language_loss": 0.83287692, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85438645, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6903, + "time_per_iteration": 2.402722120285034 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02232492, + "balance_loss_mlp": 1.0413456, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.8768082111891207, + "language_loss": 0.67704409, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69863188, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.79296875, + "step": 6904, + "time_per_iteration": 2.5442733764648438 + }, + { + "auxiliary_loss_clip": 0.01119512, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.02082872, + "balance_loss_mlp": 1.04166162, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.5140892492412166, + "language_loss": 0.77502626, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79657316, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 6905, + "time_per_iteration": 2.471850633621216 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.0168426, + "balance_loss_mlp": 1.04261923, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 2.8616602480779427, + "language_loss": 0.68461335, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70611238, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6906, + "time_per_iteration": 2.501025676727295 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.02097726, + "balance_loss_mlp": 1.041152, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 3.9013632738704347, + "language_loss": 0.67466414, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69615829, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6907, + "time_per_iteration": 2.467179298400879 + }, + { + "auxiliary_loss_clip": 0.01118262, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.02107513, + "balance_loss_mlp": 1.04266894, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8641722195673653, + "language_loss": 0.77219629, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79371971, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 6908, + "time_per_iteration": 2.5124471187591553 + }, + { + "auxiliary_loss_clip": 0.01043525, + "auxiliary_loss_mlp": 0.01011962, + "balance_loss_clip": 1.01046562, + "balance_loss_mlp": 1.01946032, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7844742119516283, + "language_loss": 0.64862758, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66918248, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.24023438, + "step": 6909, + "time_per_iteration": 3.0118794441223145 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04182351, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.432390678759805, + "language_loss": 0.87292743, + "learning_rate": 2.633643828093996e-06, + "loss": 0.8944639, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6910, + "time_per_iteration": 2.4972214698791504 + }, + { + "auxiliary_loss_clip": 0.01041579, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.00033653, + "balance_loss_mlp": 1.01748466, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.808989444092677, + "language_loss": 0.6214478, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64188129, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24023438, + "step": 6911, + "time_per_iteration": 3.040469169616699 + }, + { + "auxiliary_loss_clip": 0.01126363, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.02386987, + "balance_loss_mlp": 1.04570675, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.7143139070983313, + "language_loss": 0.87920213, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90084887, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6912, + "time_per_iteration": 2.449566602706909 + }, + { + "auxiliary_loss_clip": 0.01119018, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01842034, + "balance_loss_mlp": 1.04461241, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 3.208266477782979, + "language_loss": 0.62984204, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65134311, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 6913, + "time_per_iteration": 2.4690184593200684 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.01793909, + "balance_loss_mlp": 1.04389513, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.933222600231973, + "language_loss": 0.75131822, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77279633, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 6914, + "time_per_iteration": 2.483322858810425 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04198337, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.8027192281548683, + "language_loss": 0.87621439, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89775658, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6915, + "time_per_iteration": 2.448347806930542 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.02461123, + "balance_loss_mlp": 1.0447371, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.7843871284315007, + "language_loss": 0.71427178, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.7358641, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6916, + "time_per_iteration": 2.490709066390991 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.018736, + "balance_loss_mlp": 1.04548466, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.511699121237688, + "language_loss": 0.71604288, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73761451, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6917, + "time_per_iteration": 2.471165895462036 + }, + { + "auxiliary_loss_clip": 0.01118269, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.0205791, + "balance_loss_mlp": 1.04267478, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.6845020116344738, + "language_loss": 0.80811357, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.82963598, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6918, + "time_per_iteration": 2.526092767715454 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.02276754, + "balance_loss_mlp": 1.04565763, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.4136427424617275, + "language_loss": 0.70455492, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72615993, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6919, + "time_per_iteration": 2.6142234802246094 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.02063489, + "balance_loss_mlp": 1.04595828, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 3.306135174045704, + "language_loss": 0.80995989, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83153164, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6920, + "time_per_iteration": 2.4816763401031494 + }, + { + "auxiliary_loss_clip": 0.01123811, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.02265263, + "balance_loss_mlp": 1.04559636, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 1.8850349699187139, + "language_loss": 0.66103178, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.68264937, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6921, + "time_per_iteration": 2.4444103240966797 + }, + { + "auxiliary_loss_clip": 0.01122422, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02613473, + "balance_loss_mlp": 1.04591656, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.004797667242706, + "language_loss": 0.80354667, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82517087, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6922, + "time_per_iteration": 2.4668424129486084 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.01878977, + "balance_loss_mlp": 1.04562068, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.7750243686484017, + "language_loss": 0.67461836, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69618553, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 6923, + "time_per_iteration": 2.500643014907837 + }, + { + "auxiliary_loss_clip": 0.0112335, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04540539, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.7543246434734396, + "language_loss": 0.75878662, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78045189, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6924, + "time_per_iteration": 2.5196292400360107 + }, + { + "auxiliary_loss_clip": 0.01120451, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.02382445, + "balance_loss_mlp": 1.04238617, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.7266126934206025, + "language_loss": 0.72481495, + "learning_rate": 2.62810015415423e-06, + "loss": 0.74639702, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6925, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02069342, + "balance_loss_mlp": 1.0413928, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 1.8465053152696829, + "language_loss": 0.83475816, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85628033, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6926, + "time_per_iteration": 2.5088613033294678 + }, + { + "auxiliary_loss_clip": 0.01115859, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02305889, + "balance_loss_mlp": 1.04325294, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6423809052501923, + "language_loss": 0.86620545, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88771755, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 6927, + "time_per_iteration": 2.534503936767578 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.0272553, + "balance_loss_mlp": 1.04246414, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 1.9802013979545179, + "language_loss": 0.72300684, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74461025, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6928, + "time_per_iteration": 3.88004732131958 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02459407, + "balance_loss_mlp": 1.04092073, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 1.862862690513255, + "language_loss": 0.78142846, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80298579, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6929, + "time_per_iteration": 5.323524713516235 + }, + { + "auxiliary_loss_clip": 0.01119115, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.02201128, + "balance_loss_mlp": 1.0432961, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.7470362991732848, + "language_loss": 0.71024638, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73178667, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6930, + "time_per_iteration": 2.4636495113372803 + }, + { + "auxiliary_loss_clip": 0.01116513, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.04026747, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7271533589437842, + "language_loss": 0.80665648, + "learning_rate": 2.625881181419007e-06, + "loss": 0.82817304, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6931, + "time_per_iteration": 2.4350993633270264 + }, + { + "auxiliary_loss_clip": 0.01115154, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02392721, + "balance_loss_mlp": 1.04003608, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8450466812598405, + "language_loss": 0.79109526, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81262779, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6932, + "time_per_iteration": 2.499152660369873 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01039084, + "balance_loss_clip": 1.02421236, + "balance_loss_mlp": 1.04105997, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 2.265953381144445, + "language_loss": 0.81735384, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83892173, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6933, + "time_per_iteration": 2.5096874237060547 + }, + { + "auxiliary_loss_clip": 0.01119747, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.02184963, + "balance_loss_mlp": 1.04056907, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.1923639109766144, + "language_loss": 0.76769817, + "learning_rate": 2.624771374460121e-06, + "loss": 0.78926861, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 6934, + "time_per_iteration": 2.4590814113616943 + }, + { + "auxiliary_loss_clip": 0.01120428, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02003551, + "balance_loss_mlp": 1.04396558, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.774753965654226, + "language_loss": 0.67036676, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69191271, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6935, + "time_per_iteration": 2.4111990928649902 + }, + { + "auxiliary_loss_clip": 0.01120243, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.0266757, + "balance_loss_mlp": 1.04329324, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.7357101171275504, + "language_loss": 0.73245633, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75406778, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 6936, + "time_per_iteration": 2.452911376953125 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.02102733, + "balance_loss_mlp": 1.0418582, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.8471548990860345, + "language_loss": 0.73746514, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75898361, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6937, + "time_per_iteration": 2.426177978515625 + }, + { + "auxiliary_loss_clip": 0.01116111, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.02194881, + "balance_loss_mlp": 1.04150152, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.512143650526939, + "language_loss": 0.8406328, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.8621484, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6938, + "time_per_iteration": 2.543088436126709 + }, + { + "auxiliary_loss_clip": 0.0112279, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.01959753, + "balance_loss_mlp": 1.04346168, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.0225615339435183, + "language_loss": 0.74319148, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76476645, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6939, + "time_per_iteration": 2.5119175910949707 + }, + { + "auxiliary_loss_clip": 0.01120397, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.01864552, + "balance_loss_mlp": 1.04396725, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.7048101001333908, + "language_loss": 0.7502594, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77179623, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6940, + "time_per_iteration": 2.505476474761963 + }, + { + "auxiliary_loss_clip": 0.01118418, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02621651, + "balance_loss_mlp": 1.04277742, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.6601557953990327, + "language_loss": 0.71575844, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73733509, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6941, + "time_per_iteration": 2.4826831817626953 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.02307487, + "balance_loss_mlp": 1.04215777, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 1.8824806717934597, + "language_loss": 0.73884863, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76043111, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6942, + "time_per_iteration": 2.510179281234741 + }, + { + "auxiliary_loss_clip": 0.01124355, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.02190626, + "balance_loss_mlp": 1.04450595, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.1000096782313644, + "language_loss": 0.72619486, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74780977, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6943, + "time_per_iteration": 2.437713861465454 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.01844811, + "balance_loss_mlp": 1.04391849, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.5914405962225948, + "language_loss": 0.63451827, + "learning_rate": 2.621070480118111e-06, + "loss": 0.6560756, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6944, + "time_per_iteration": 2.5866405963897705 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01747799, + "balance_loss_mlp": 1.04272938, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.6963739292171327, + "language_loss": 0.7014094, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72291017, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6945, + "time_per_iteration": 2.4984183311462402 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.02556372, + "balance_loss_mlp": 1.04024088, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.623733928455925, + "language_loss": 0.80850792, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83008766, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.76171875, + "step": 6946, + "time_per_iteration": 2.5301356315612793 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.02281451, + "balance_loss_mlp": 1.04321134, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.2176705837507784, + "language_loss": 0.77525783, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79680669, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6947, + "time_per_iteration": 2.432767391204834 + }, + { + "auxiliary_loss_clip": 0.01119017, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.0204227, + "balance_loss_mlp": 1.04268038, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 2.207686964264854, + "language_loss": 0.71242738, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73396862, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6948, + "time_per_iteration": 2.565560817718506 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.03894424, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.5189916920378803, + "language_loss": 0.77142775, + "learning_rate": 2.619219148905362e-06, + "loss": 0.7928437, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 6949, + "time_per_iteration": 2.459484338760376 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.02156091, + "balance_loss_mlp": 1.04367769, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.5094834159772865, + "language_loss": 0.81985492, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84143925, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6950, + "time_per_iteration": 2.5348877906799316 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01804328, + "balance_loss_mlp": 1.04157758, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.3221945547908684, + "language_loss": 0.76189649, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78333664, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 6951, + "time_per_iteration": 2.5055410861968994 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.01894784, + "balance_loss_mlp": 1.04247046, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.7645474682355455, + "language_loss": 0.72922826, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75078857, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6952, + "time_per_iteration": 2.499979019165039 + }, + { + "auxiliary_loss_clip": 0.01117763, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.01826596, + "balance_loss_mlp": 1.04266691, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 3.0061867681934795, + "language_loss": 0.7182008, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73970026, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6953, + "time_per_iteration": 2.4045305252075195 + }, + { + "auxiliary_loss_clip": 0.01116286, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02106667, + "balance_loss_mlp": 1.04293513, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.696123367289706, + "language_loss": 0.76163101, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78315222, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.734375, + "step": 6954, + "time_per_iteration": 2.5208778381347656 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.02243114, + "balance_loss_mlp": 1.0407306, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.123626835554744, + "language_loss": 0.84569108, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86724097, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.765625, + "step": 6955, + "time_per_iteration": 2.4470770359039307 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.02206469, + "balance_loss_mlp": 1.04131222, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.7424753883235222, + "language_loss": 0.83219767, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85370708, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6956, + "time_per_iteration": 2.53238582611084 + }, + { + "auxiliary_loss_clip": 0.01120034, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.02661586, + "balance_loss_mlp": 1.04286742, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.117667338273699, + "language_loss": 0.71621263, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73782784, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6957, + "time_per_iteration": 2.4127233028411865 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02450657, + "balance_loss_mlp": 1.0416609, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.020066118448717, + "language_loss": 0.75841641, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77995586, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 6958, + "time_per_iteration": 2.621243476867676 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.01718402, + "balance_loss_mlp": 1.04121447, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.5992923753241641, + "language_loss": 0.76712382, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78860307, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6959, + "time_per_iteration": 2.4936535358428955 + }, + { + "auxiliary_loss_clip": 0.01117896, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04106176, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.629552094504433, + "language_loss": 0.76652783, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78807288, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6960, + "time_per_iteration": 2.513699769973755 + }, + { + "auxiliary_loss_clip": 0.01111464, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01822484, + "balance_loss_mlp": 1.04088879, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.8359587043053753, + "language_loss": 0.75856298, + "learning_rate": 2.614773562290835e-06, + "loss": 0.7799859, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 6961, + "time_per_iteration": 2.4798686504364014 + }, + { + "auxiliary_loss_clip": 0.01040549, + "auxiliary_loss_mlp": 0.010007, + "balance_loss_clip": 0.99909067, + "balance_loss_mlp": 1.01660466, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7788999280449799, + "language_loss": 0.5466665, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56707895, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23925781, + "step": 6962, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.01119412, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02266252, + "balance_loss_mlp": 1.04263735, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8026406871934313, + "language_loss": 0.85487044, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87642694, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6963, + "time_per_iteration": 2.4352054595947266 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02210093, + "balance_loss_mlp": 1.04331315, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.6053381131745172, + "language_loss": 0.70357138, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72510606, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6964, + "time_per_iteration": 2.50482439994812 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.02268004, + "balance_loss_mlp": 1.04087543, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.8351593031507138, + "language_loss": 0.70862091, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73011076, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6965, + "time_per_iteration": 2.6057491302490234 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.02033257, + "balance_loss_mlp": 1.0413456, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.4950689447506187, + "language_loss": 0.7175675, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.73902673, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6966, + "time_per_iteration": 2.4892048835754395 + }, + { + "auxiliary_loss_clip": 0.01120204, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.02016091, + "balance_loss_mlp": 1.0421617, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.333720493500319, + "language_loss": 0.71266413, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73421323, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 6967, + "time_per_iteration": 2.604076862335205 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01005246, + "balance_loss_clip": 1.00366104, + "balance_loss_mlp": 1.01515508, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6722087248044618, + "language_loss": 0.46224236, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48268497, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23828125, + "step": 6968, + "time_per_iteration": 3.0401268005371094 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.02345359, + "balance_loss_mlp": 1.03981948, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.6180807795397785, + "language_loss": 0.74930859, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77086943, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6969, + "time_per_iteration": 2.5126969814300537 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.0382787, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 2.2016737043444903, + "language_loss": 0.80248457, + "learning_rate": 2.611437167992705e-06, + "loss": 0.8239547, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6970, + "time_per_iteration": 5.640556573867798 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01774538, + "balance_loss_mlp": 1.04030848, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.9623449568843938, + "language_loss": 0.82789886, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.84934866, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6971, + "time_per_iteration": 3.8554296493530273 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.02057588, + "balance_loss_mlp": 1.04049933, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.6158786040890867, + "language_loss": 0.7468822, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76836711, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6972, + "time_per_iteration": 2.474414587020874 + }, + { + "auxiliary_loss_clip": 0.01113414, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.02039838, + "balance_loss_mlp": 1.0393647, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4614195470734719, + "language_loss": 0.72808421, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74955231, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6973, + "time_per_iteration": 2.5945606231689453 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.02769673, + "balance_loss_mlp": 1.04242992, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.1718837857164464, + "language_loss": 0.74863386, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77027869, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6974, + "time_per_iteration": 2.4790663719177246 + }, + { + "auxiliary_loss_clip": 0.01112575, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01669776, + "balance_loss_mlp": 1.03879452, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 2.8466202693933265, + "language_loss": 0.72836936, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74979532, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6975, + "time_per_iteration": 2.4560608863830566 + }, + { + "auxiliary_loss_clip": 0.01114785, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.01961374, + "balance_loss_mlp": 1.04139054, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.6070899494887878, + "language_loss": 0.80725533, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82873446, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6976, + "time_per_iteration": 2.5148777961730957 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.01025549, + "balance_loss_clip": 1.0124954, + "balance_loss_mlp": 1.03755522, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.297468657248195, + "language_loss": 0.67767072, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.6990521, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6977, + "time_per_iteration": 2.4294896125793457 + }, + { + "auxiliary_loss_clip": 0.01116519, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.02193213, + "balance_loss_mlp": 1.04046345, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.6461140984259304, + "language_loss": 0.80869353, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83020747, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6978, + "time_per_iteration": 2.4688472747802734 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.02207506, + "balance_loss_mlp": 1.0377202, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.725404980402679, + "language_loss": 0.82583737, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84734344, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6979, + "time_per_iteration": 2.4702186584472656 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.0194428, + "balance_loss_mlp": 1.0388211, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.8637978278873943, + "language_loss": 0.83381826, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85528231, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6980, + "time_per_iteration": 2.5195069313049316 + }, + { + "auxiliary_loss_clip": 0.01116413, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.01975989, + "balance_loss_mlp": 1.03946161, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.9241676519266004, + "language_loss": 0.79068786, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81218135, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6981, + "time_per_iteration": 2.4457991123199463 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.02078593, + "balance_loss_mlp": 1.03806782, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.6203222993930824, + "language_loss": 0.84426481, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86570823, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 6982, + "time_per_iteration": 2.483635425567627 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.02191043, + "balance_loss_mlp": 1.03910255, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 1.9325593989695682, + "language_loss": 0.56615967, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58769286, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6983, + "time_per_iteration": 2.4729864597320557 + }, + { + "auxiliary_loss_clip": 0.01114232, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.02243733, + "balance_loss_mlp": 1.0403446, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.639890794043824, + "language_loss": 0.82404107, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84553468, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6984, + "time_per_iteration": 2.4610702991485596 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01515102, + "balance_loss_mlp": 1.03938794, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.6654879970317658, + "language_loss": 0.78883481, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81025428, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6985, + "time_per_iteration": 2.4739370346069336 + }, + { + "auxiliary_loss_clip": 0.01118591, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.02042401, + "balance_loss_mlp": 1.03950381, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 3.375844113891133, + "language_loss": 0.77833611, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.79986858, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6986, + "time_per_iteration": 2.5488531589508057 + }, + { + "auxiliary_loss_clip": 0.01111943, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01886213, + "balance_loss_mlp": 1.03984082, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.5789932508621725, + "language_loss": 0.72640669, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74783587, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 6987, + "time_per_iteration": 2.522143840789795 + }, + { + "auxiliary_loss_clip": 0.01117787, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02886939, + "balance_loss_mlp": 1.04176915, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4538200585449164, + "language_loss": 0.75399673, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77560198, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6988, + "time_per_iteration": 2.57265305519104 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02287364, + "balance_loss_mlp": 1.04034519, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.6383736622893421, + "language_loss": 0.74155712, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76311487, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6989, + "time_per_iteration": 2.4846689701080322 + }, + { + "auxiliary_loss_clip": 0.01118468, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02213192, + "balance_loss_mlp": 1.041116, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.649933968591077, + "language_loss": 0.70989478, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73144102, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6990, + "time_per_iteration": 2.5092554092407227 + }, + { + "auxiliary_loss_clip": 0.01038945, + "auxiliary_loss_mlp": 0.01004482, + "balance_loss_clip": 1.00287271, + "balance_loss_mlp": 1.01510215, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8281033043630844, + "language_loss": 0.60529578, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62573004, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23828125, + "step": 6991, + "time_per_iteration": 2.921936511993408 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04332614, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5069916983433078, + "language_loss": 0.83222365, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85379601, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6992, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.01039195, + "auxiliary_loss_mlp": 0.01003357, + "balance_loss_clip": 1.00179517, + "balance_loss_mlp": 1.01546574, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8165124973650228, + "language_loss": 0.65523541, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67566097, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.23730469, + "step": 6993, + "time_per_iteration": 3.078948736190796 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.01845777, + "balance_loss_mlp": 1.04213274, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 2.0847143106579806, + "language_loss": 0.83213866, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85370958, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6994, + "time_per_iteration": 2.42958402633667 + }, + { + "auxiliary_loss_clip": 0.01115372, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.02112424, + "balance_loss_mlp": 1.04195786, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.6590785995391892, + "language_loss": 0.78497195, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.8064667, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6995, + "time_per_iteration": 2.4311602115631104 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.01849341, + "balance_loss_mlp": 1.04043221, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.5317093362831764, + "language_loss": 0.79829741, + "learning_rate": 2.60178818232786e-06, + "loss": 0.81974673, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6996, + "time_per_iteration": 2.5032711029052734 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01837945, + "balance_loss_mlp": 1.04208779, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.3208366966184837, + "language_loss": 0.7522642, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77376509, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.765625, + "step": 6997, + "time_per_iteration": 2.4281609058380127 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.02492523, + "balance_loss_mlp": 1.03965962, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.9779533128263025, + "language_loss": 0.76193553, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78349566, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6998, + "time_per_iteration": 2.4484825134277344 + }, + { + "auxiliary_loss_clip": 0.01121567, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.02950823, + "balance_loss_mlp": 1.04302716, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.545568275541188, + "language_loss": 0.76295245, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78460807, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6999, + "time_per_iteration": 2.5371389389038086 + }, + { + "auxiliary_loss_clip": 0.01116809, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.02512717, + "balance_loss_mlp": 1.04221511, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.7925226690493865, + "language_loss": 0.64549243, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66705179, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7000, + "time_per_iteration": 2.492664337158203 + }, + { + "auxiliary_loss_clip": 0.01117436, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02347827, + "balance_loss_mlp": 1.04157186, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.6489015448559594, + "language_loss": 0.76201057, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.7835623, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7001, + "time_per_iteration": 2.4374375343322754 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.02163076, + "balance_loss_mlp": 1.04236293, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.558613926183474, + "language_loss": 0.86427414, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88578713, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7002, + "time_per_iteration": 2.4840235710144043 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.02331328, + "balance_loss_mlp": 1.04153061, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 2.8393435321353713, + "language_loss": 0.67447579, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69599748, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7003, + "time_per_iteration": 2.452779531478882 + }, + { + "auxiliary_loss_clip": 0.01120595, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02727461, + "balance_loss_mlp": 1.04151964, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 2.097012731379119, + "language_loss": 0.76887131, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79049993, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 7004, + "time_per_iteration": 2.4988765716552734 + }, + { + "auxiliary_loss_clip": 0.0111532, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02649117, + "balance_loss_mlp": 1.04101729, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.5948979245136696, + "language_loss": 0.68152726, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70309174, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7421875, + "step": 7005, + "time_per_iteration": 2.4434568881988525 + }, + { + "auxiliary_loss_clip": 0.01118015, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.02340817, + "balance_loss_mlp": 1.04088581, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.9728430752981747, + "language_loss": 0.72047079, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74202257, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7006, + "time_per_iteration": 2.4487879276275635 + }, + { + "auxiliary_loss_clip": 0.01117712, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.01978087, + "balance_loss_mlp": 1.04068065, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.688876483049264, + "language_loss": 0.70708871, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.72860485, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7007, + "time_per_iteration": 2.437270164489746 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.02416158, + "balance_loss_mlp": 1.04059708, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7353334268618703, + "language_loss": 0.82159567, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84313881, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7008, + "time_per_iteration": 2.460923194885254 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.03877473, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 2.1040552452231505, + "language_loss": 0.71574211, + "learning_rate": 2.596957889196831e-06, + "loss": 0.7372905, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7009, + "time_per_iteration": 2.501915693283081 + }, + { + "auxiliary_loss_clip": 0.01116238, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01875222, + "balance_loss_mlp": 1.03954792, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.7512785082136952, + "language_loss": 0.66407478, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68556547, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7010, + "time_per_iteration": 2.5036494731903076 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.02353597, + "balance_loss_mlp": 1.03993797, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.553770179625671, + "language_loss": 0.7243132, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74583495, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7011, + "time_per_iteration": 2.471482276916504 + }, + { + "auxiliary_loss_clip": 0.01036961, + "auxiliary_loss_mlp": 0.01009192, + "balance_loss_clip": 1.00765407, + "balance_loss_mlp": 1.01291788, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.789677431109339, + "language_loss": 0.54321265, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56367421, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.24023438, + "step": 7012, + "time_per_iteration": 7.156486511230469 + }, + { + "auxiliary_loss_clip": 0.01118573, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.01982975, + "balance_loss_mlp": 1.04137254, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3072085820070551, + "language_loss": 0.78510618, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80663049, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7013, + "time_per_iteration": 2.4873650074005127 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01945186, + "balance_loss_mlp": 1.0393039, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.8972197450653994, + "language_loss": 0.8102268, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83173645, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 7014, + "time_per_iteration": 2.4698970317840576 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.01885617, + "balance_loss_mlp": 1.039801, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.584816158328088, + "language_loss": 0.7775718, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79906625, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7015, + "time_per_iteration": 2.48061203956604 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02382123, + "balance_loss_mlp": 1.04211378, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.4014002437510662, + "language_loss": 0.82126868, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84285378, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7016, + "time_per_iteration": 2.4971818923950195 + }, + { + "auxiliary_loss_clip": 0.01119768, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.01964498, + "balance_loss_mlp": 1.04142356, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.18227993050423, + "language_loss": 0.68093193, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70246613, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7017, + "time_per_iteration": 2.4511165618896484 + }, + { + "auxiliary_loss_clip": 0.01038936, + "auxiliary_loss_mlp": 0.00999099, + "balance_loss_clip": 0.9975912, + "balance_loss_mlp": 1.01494193, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6893654540123721, + "language_loss": 0.59420347, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61458385, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24023438, + "step": 7018, + "time_per_iteration": 3.1184492111206055 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03985381, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.7697613946295114, + "language_loss": 0.75391936, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77544749, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7019, + "time_per_iteration": 2.415177822113037 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01955771, + "balance_loss_mlp": 1.04044795, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 2.151945399878188, + "language_loss": 0.69014722, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71166205, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 7020, + "time_per_iteration": 2.502906560897827 + }, + { + "auxiliary_loss_clip": 0.01115881, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.01976776, + "balance_loss_mlp": 1.04312158, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.807686142219978, + "language_loss": 0.80839896, + "learning_rate": 2.592495760867347e-06, + "loss": 0.82988656, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7021, + "time_per_iteration": 2.4480793476104736 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.04118109, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.7624230978889854, + "language_loss": 0.70018518, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.721668, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7022, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.01110409, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01816332, + "balance_loss_mlp": 1.03993058, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.4995673529455043, + "language_loss": 0.66985959, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69126534, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7023, + "time_per_iteration": 2.518887996673584 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.01872325, + "balance_loss_mlp": 1.04102015, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.5242794814383198, + "language_loss": 0.69374228, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71520281, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7024, + "time_per_iteration": 2.47479510307312 + }, + { + "auxiliary_loss_clip": 0.01115853, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.02272165, + "balance_loss_mlp": 1.0406878, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.4987089123245305, + "language_loss": 0.76659822, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78812057, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7025, + "time_per_iteration": 2.459552526473999 + }, + { + "auxiliary_loss_clip": 0.01111611, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01950371, + "balance_loss_mlp": 1.03944087, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.7650754883430373, + "language_loss": 0.79574716, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81719071, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7026, + "time_per_iteration": 2.4876604080200195 + }, + { + "auxiliary_loss_clip": 0.01039298, + "auxiliary_loss_mlp": 0.00998847, + "balance_loss_clip": 0.99741668, + "balance_loss_mlp": 1.01518142, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7186593098349721, + "language_loss": 0.6191169, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63949835, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.24121094, + "step": 7027, + "time_per_iteration": 3.1553335189819336 + }, + { + "auxiliary_loss_clip": 0.01115441, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02524519, + "balance_loss_mlp": 1.04096365, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 4.428318649676281, + "language_loss": 0.70515895, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.72670174, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7028, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.02488303, + "balance_loss_mlp": 1.04104543, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.8463743475085548, + "language_loss": 0.82555425, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84711367, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7029, + "time_per_iteration": 2.5120980739593506 + }, + { + "auxiliary_loss_clip": 0.0112087, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_clip": 1.02790523, + "balance_loss_mlp": 1.04274035, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 2.3903311172404, + "language_loss": 0.75230241, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77393407, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7030, + "time_per_iteration": 2.5118141174316406 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.03835046, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 1.9474535697331137, + "language_loss": 0.86421049, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88573444, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7031, + "time_per_iteration": 2.500140905380249 + }, + { + "auxiliary_loss_clip": 0.01115501, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02328062, + "balance_loss_mlp": 1.04060841, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.1339679402128717, + "language_loss": 0.72855937, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75009298, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 7032, + "time_per_iteration": 2.477363109588623 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.02711725, + "balance_loss_mlp": 1.0390861, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.7148750065903648, + "language_loss": 0.699175, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72074443, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7033, + "time_per_iteration": 2.5661494731903076 + }, + { + "auxiliary_loss_clip": 0.01115751, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.01895976, + "balance_loss_mlp": 1.03992891, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.8649473631938416, + "language_loss": 0.90448046, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92596424, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7034, + "time_per_iteration": 2.4802892208099365 + }, + { + "auxiliary_loss_clip": 0.01112625, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.02369416, + "balance_loss_mlp": 1.03800857, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.6052176008605175, + "language_loss": 0.77130729, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79280239, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7035, + "time_per_iteration": 2.5044498443603516 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.02682912, + "balance_loss_mlp": 1.0414331, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.9123378440021823, + "language_loss": 0.82216996, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84375703, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7036, + "time_per_iteration": 2.4178695678710938 + }, + { + "auxiliary_loss_clip": 0.01112842, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.0403924, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.6417488866700152, + "language_loss": 0.70871484, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73021322, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7037, + "time_per_iteration": 2.485499858856201 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.02303314, + "balance_loss_mlp": 1.03976059, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.5138937767155718, + "language_loss": 0.77942061, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80094922, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7038, + "time_per_iteration": 2.4569690227508545 + }, + { + "auxiliary_loss_clip": 0.01120787, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02919412, + "balance_loss_mlp": 1.04072356, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.366884859254005, + "language_loss": 0.66797423, + "learning_rate": 2.585796509770259e-06, + "loss": 0.6896261, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 7039, + "time_per_iteration": 2.441373825073242 + }, + { + "auxiliary_loss_clip": 0.01119114, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.02372193, + "balance_loss_mlp": 1.04042578, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.6082175120791662, + "language_loss": 0.75897467, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78054452, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 7040, + "time_per_iteration": 2.471653938293457 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.02067101, + "balance_loss_mlp": 1.03962982, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.477939672492119, + "language_loss": 0.65098798, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67250896, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7041, + "time_per_iteration": 2.502443313598633 + }, + { + "auxiliary_loss_clip": 0.01118281, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.02233624, + "balance_loss_mlp": 1.04045236, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.7627160436135367, + "language_loss": 0.73621082, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75775892, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7042, + "time_per_iteration": 2.6498820781707764 + }, + { + "auxiliary_loss_clip": 0.01112749, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.02020609, + "balance_loss_mlp": 1.03977966, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3177903064215164, + "language_loss": 0.82185107, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84331036, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7043, + "time_per_iteration": 2.528604745864868 + }, + { + "auxiliary_loss_clip": 0.0111836, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.02608395, + "balance_loss_mlp": 1.04329216, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.3747778329738742, + "language_loss": 0.65231359, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67390943, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 7044, + "time_per_iteration": 2.4399802684783936 + }, + { + "auxiliary_loss_clip": 0.01121384, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.03126323, + "balance_loss_mlp": 1.04322433, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.7497316034691441, + "language_loss": 0.7502315, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77191073, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 7045, + "time_per_iteration": 2.612898588180542 + }, + { + "auxiliary_loss_clip": 0.01116302, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.02141845, + "balance_loss_mlp": 1.04219389, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.1011396794876385, + "language_loss": 0.80564952, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82716572, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7046, + "time_per_iteration": 2.4105727672576904 + }, + { + "auxiliary_loss_clip": 0.01119082, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.021685, + "balance_loss_mlp": 1.04078197, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.59844067944401, + "language_loss": 0.76846749, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.7900188, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7047, + "time_per_iteration": 2.486297130584717 + }, + { + "auxiliary_loss_clip": 0.01116569, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.02102375, + "balance_loss_mlp": 1.04264975, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.8697996227798281, + "language_loss": 0.67980373, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70131224, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7048, + "time_per_iteration": 2.5031991004943848 + }, + { + "auxiliary_loss_clip": 0.01119136, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.02371609, + "balance_loss_mlp": 1.04227185, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.7311423758965327, + "language_loss": 0.7829181, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80449331, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 7049, + "time_per_iteration": 2.549767255783081 + }, + { + "auxiliary_loss_clip": 0.01121261, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02570868, + "balance_loss_mlp": 1.0433383, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.7774881318176563, + "language_loss": 0.82656097, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84816945, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 7050, + "time_per_iteration": 2.498494863510132 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.02233815, + "balance_loss_mlp": 1.0382762, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.0169322630318844, + "language_loss": 0.73429018, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75579983, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7051, + "time_per_iteration": 2.441920042037964 + }, + { + "auxiliary_loss_clip": 0.01118227, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.01983249, + "balance_loss_mlp": 1.04219055, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.4713561275118965, + "language_loss": 0.86205333, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.8835662, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7052, + "time_per_iteration": 2.511756658554077 + }, + { + "auxiliary_loss_clip": 0.01116616, + "auxiliary_loss_mlp": 0.01041035, + "balance_loss_clip": 1.02657533, + "balance_loss_mlp": 1.03951788, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4100722391624452, + "language_loss": 0.7240659, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74564236, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7053, + "time_per_iteration": 3.9099857807159424 + }, + { + "auxiliary_loss_clip": 0.01116742, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.0190872, + "balance_loss_mlp": 1.04233611, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.5741365926511655, + "language_loss": 0.82153803, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84303784, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 7054, + "time_per_iteration": 5.327679634094238 + }, + { + "auxiliary_loss_clip": 0.01040448, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01700425, + "balance_loss_mlp": 1.01674867, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7840713570529064, + "language_loss": 0.60388172, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62446928, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23632812, + "step": 7055, + "time_per_iteration": 3.0450727939605713 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.02651238, + "balance_loss_mlp": 1.04204714, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.951771931203088, + "language_loss": 0.76762712, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.78923267, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7056, + "time_per_iteration": 2.442148447036743 + }, + { + "auxiliary_loss_clip": 0.01121258, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.02375674, + "balance_loss_mlp": 1.04127979, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 2.7846662247260388, + "language_loss": 0.84346795, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86507463, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 7057, + "time_per_iteration": 2.474519968032837 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.02272737, + "balance_loss_mlp": 1.04053187, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 3.1820547358610605, + "language_loss": 0.82999814, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85157061, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 7058, + "time_per_iteration": 2.473520517349243 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01533902, + "balance_loss_mlp": 1.04417813, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.7435131696457398, + "language_loss": 0.80453449, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82600558, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7059, + "time_per_iteration": 2.4719533920288086 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01860404, + "balance_loss_mlp": 1.041839, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9429107045123646, + "language_loss": 0.70341688, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72495657, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 7060, + "time_per_iteration": 2.4377660751342773 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.01914454, + "balance_loss_mlp": 1.04378521, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.5364996273974925, + "language_loss": 0.76182258, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78335667, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7061, + "time_per_iteration": 2.486786365509033 + }, + { + "auxiliary_loss_clip": 0.01122599, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.02319098, + "balance_loss_mlp": 1.04407752, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 3.328289037638814, + "language_loss": 0.729635, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75124645, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 7062, + "time_per_iteration": 2.474193572998047 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.03964305, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.701854582957673, + "language_loss": 0.66343361, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68500221, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7063, + "time_per_iteration": 2.458003520965576 + }, + { + "auxiliary_loss_clip": 0.01115284, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.0181458, + "balance_loss_mlp": 1.04179168, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4878317325171677, + "language_loss": 0.78371775, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80518377, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7064, + "time_per_iteration": 2.5735623836517334 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02636456, + "balance_loss_mlp": 1.04172683, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.8409826195637737, + "language_loss": 0.74893892, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.7705363, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 7065, + "time_per_iteration": 2.4962844848632812 + }, + { + "auxiliary_loss_clip": 0.01119456, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.0206933, + "balance_loss_mlp": 1.04322076, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.415711347923808, + "language_loss": 0.72713453, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74868619, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7066, + "time_per_iteration": 2.551297426223755 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.02307224, + "balance_loss_mlp": 1.04031396, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.9392042625935109, + "language_loss": 0.79517603, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81675112, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 7067, + "time_per_iteration": 2.4871444702148438 + }, + { + "auxiliary_loss_clip": 0.010394, + "auxiliary_loss_mlp": 0.01005215, + "balance_loss_clip": 1.00373113, + "balance_loss_mlp": 1.01538539, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.919528911316311, + "language_loss": 0.63477993, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65522605, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.24023438, + "step": 7068, + "time_per_iteration": 3.0116004943847656 + }, + { + "auxiliary_loss_clip": 0.01119716, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02073288, + "balance_loss_mlp": 1.04235375, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.681037886347605, + "language_loss": 0.72381866, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74537772, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7734375, + "step": 7069, + "time_per_iteration": 2.5046679973602295 + }, + { + "auxiliary_loss_clip": 0.01122307, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.04424644, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 3.2712432047864852, + "language_loss": 0.79297352, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81454653, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.78125, + "step": 7070, + "time_per_iteration": 2.43115496635437 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.01863861, + "balance_loss_mlp": 1.04104066, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.8101520547589562, + "language_loss": 0.70179212, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7233097, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7071, + "time_per_iteration": 2.5141680240631104 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.01942205, + "balance_loss_mlp": 1.04123151, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.3450864635540825, + "language_loss": 0.71075511, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73227149, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7072, + "time_per_iteration": 2.489187002182007 + }, + { + "auxiliary_loss_clip": 0.01122118, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.04270983, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5399076436438217, + "language_loss": 0.81655496, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83810043, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.79296875, + "step": 7073, + "time_per_iteration": 2.5192041397094727 + }, + { + "auxiliary_loss_clip": 0.0111768, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.02097535, + "balance_loss_mlp": 1.04180706, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.1264240253054227, + "language_loss": 0.90777069, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92929167, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7074, + "time_per_iteration": 2.418611526489258 + }, + { + "auxiliary_loss_clip": 0.01122809, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.0234437, + "balance_loss_mlp": 1.04282892, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.5751331844442036, + "language_loss": 0.63971686, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66133678, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.80078125, + "step": 7075, + "time_per_iteration": 2.5064475536346436 + }, + { + "auxiliary_loss_clip": 0.01114521, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01708984, + "balance_loss_mlp": 1.04121399, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.5599863464934922, + "language_loss": 0.73547149, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75691831, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7076, + "time_per_iteration": 2.487424850463867 + }, + { + "auxiliary_loss_clip": 0.01122674, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.02565181, + "balance_loss_mlp": 1.04370356, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 1.8221025125090708, + "language_loss": 0.78215933, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80378938, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7077, + "time_per_iteration": 2.4964394569396973 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.0221417, + "balance_loss_mlp": 1.04220366, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6016827264272244, + "language_loss": 0.73013902, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75163293, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7078, + "time_per_iteration": 2.47660756111145 + }, + { + "auxiliary_loss_clip": 0.01120871, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02880406, + "balance_loss_mlp": 1.0461756, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.731645410920913, + "language_loss": 0.79469633, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81632668, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7079, + "time_per_iteration": 2.499232769012451 + }, + { + "auxiliary_loss_clip": 0.0112172, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.0181613, + "balance_loss_mlp": 1.04761243, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4705007316204746, + "language_loss": 0.72263241, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74416137, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7080, + "time_per_iteration": 2.732074499130249 + }, + { + "auxiliary_loss_clip": 0.0111869, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.02206242, + "balance_loss_mlp": 1.04246545, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.328741773172896, + "language_loss": 0.80405676, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82559955, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7081, + "time_per_iteration": 2.6035380363464355 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753235, + "balance_loss_mlp": 1.04303384, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.7894721227922463, + "language_loss": 0.81618208, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.8376382, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 7082, + "time_per_iteration": 2.444728374481201 + }, + { + "auxiliary_loss_clip": 0.01119852, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02286816, + "balance_loss_mlp": 1.04368842, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.6988843094625508, + "language_loss": 0.69388473, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71545386, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7083, + "time_per_iteration": 2.5369133949279785 + }, + { + "auxiliary_loss_clip": 0.01040302, + "auxiliary_loss_mlp": 0.00999977, + "balance_loss_clip": 0.99864787, + "balance_loss_mlp": 1.01655924, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8706759407802692, + "language_loss": 0.67112887, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69153166, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.23828125, + "step": 7084, + "time_per_iteration": 3.1631839275360107 + }, + { + "auxiliary_loss_clip": 0.01118847, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02887428, + "balance_loss_mlp": 1.04295874, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.6244995349856595, + "language_loss": 0.78095287, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80258334, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7578125, + "step": 7085, + "time_per_iteration": 2.493157148361206 + }, + { + "auxiliary_loss_clip": 0.01129017, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.03102934, + "balance_loss_mlp": 1.04819477, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.071277468695464, + "language_loss": 0.75757217, + "learning_rate": 2.568270298414995e-06, + "loss": 0.77932662, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 7086, + "time_per_iteration": 2.426295280456543 + }, + { + "auxiliary_loss_clip": 0.01119794, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.0433557, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 2.1734108107028147, + "language_loss": 0.8001647, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82173336, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7087, + "time_per_iteration": 2.46087384223938 + }, + { + "auxiliary_loss_clip": 0.01123365, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04632342, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.8444426655441133, + "language_loss": 0.6603114, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68187302, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7088, + "time_per_iteration": 2.481919527053833 + }, + { + "auxiliary_loss_clip": 0.01123249, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.02472591, + "balance_loss_mlp": 1.0449152, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 1.8812259313043718, + "language_loss": 0.68482029, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70644343, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7089, + "time_per_iteration": 2.523918628692627 + }, + { + "auxiliary_loss_clip": 0.01121302, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.02473295, + "balance_loss_mlp": 1.0418849, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.8669230196035027, + "language_loss": 0.72897398, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75057483, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7090, + "time_per_iteration": 2.4340648651123047 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.03208125, + "balance_loss_mlp": 1.04372311, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.7953532910276222, + "language_loss": 0.75347531, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77517974, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7091, + "time_per_iteration": 2.5973541736602783 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.02188134, + "balance_loss_mlp": 1.04312468, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.6821401092021848, + "language_loss": 0.82308388, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84458697, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7092, + "time_per_iteration": 2.453181266784668 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.03138983, + "balance_loss_mlp": 1.0454514, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6505279256890275, + "language_loss": 0.73916072, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76087701, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 7093, + "time_per_iteration": 2.5176479816436768 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.02482259, + "balance_loss_mlp": 1.04376769, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.5315083588078555, + "language_loss": 0.69390249, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71550524, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 7094, + "time_per_iteration": 2.489561080932617 + }, + { + "auxiliary_loss_clip": 0.01122789, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.02379799, + "balance_loss_mlp": 1.04475617, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.6055215896501054, + "language_loss": 0.81466055, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83627033, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7095, + "time_per_iteration": 6.829655647277832 + }, + { + "auxiliary_loss_clip": 0.01122192, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02972341, + "balance_loss_mlp": 1.04453826, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.7098780852895776, + "language_loss": 0.80283463, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82450223, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7096, + "time_per_iteration": 3.894577980041504 + }, + { + "auxiliary_loss_clip": 0.01125109, + "auxiliary_loss_mlp": 0.01041794, + "balance_loss_clip": 1.02754259, + "balance_loss_mlp": 1.04520798, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.947200367016257, + "language_loss": 0.65628326, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67795235, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7097, + "time_per_iteration": 2.5192034244537354 + }, + { + "auxiliary_loss_clip": 0.01117089, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.04297018, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.8194330831870058, + "language_loss": 0.74512994, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76660931, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7098, + "time_per_iteration": 2.498380661010742 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.01969302, + "balance_loss_mlp": 1.04259086, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7218259388529535, + "language_loss": 0.75169343, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77320623, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7099, + "time_per_iteration": 2.4900684356689453 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.02458942, + "balance_loss_mlp": 1.0437479, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 1.9952935228943551, + "language_loss": 0.83543229, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85704881, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7100, + "time_per_iteration": 2.467902183532715 + }, + { + "auxiliary_loss_clip": 0.01121229, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.04333866, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3501788659102136, + "language_loss": 0.82243335, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84399146, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7101, + "time_per_iteration": 2.5363035202026367 + }, + { + "auxiliary_loss_clip": 0.01124462, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.01972795, + "balance_loss_mlp": 1.04426765, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8760573998828747, + "language_loss": 0.7243284, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74591374, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7102, + "time_per_iteration": 2.443894624710083 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02012062, + "balance_loss_mlp": 1.04262853, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.799822548331586, + "language_loss": 0.82910782, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85061657, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7103, + "time_per_iteration": 2.4751625061035156 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.02205122, + "balance_loss_mlp": 1.04319, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.0452515416159227, + "language_loss": 0.73823762, + "learning_rate": 2.561545446271294e-06, + "loss": 0.759835, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 7104, + "time_per_iteration": 2.433727264404297 + }, + { + "auxiliary_loss_clip": 0.01120598, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01842821, + "balance_loss_mlp": 1.04307532, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.0713947006575713, + "language_loss": 0.75097072, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77249593, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 7105, + "time_per_iteration": 2.553220748901367 + }, + { + "auxiliary_loss_clip": 0.01123627, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02499223, + "balance_loss_mlp": 1.04497468, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.944135826622959, + "language_loss": 0.7652669, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78688908, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7106, + "time_per_iteration": 2.4320499897003174 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02062345, + "balance_loss_mlp": 1.04073668, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7002032775641, + "language_loss": 0.79748225, + "learning_rate": 2.560423964592229e-06, + "loss": 0.81899506, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7107, + "time_per_iteration": 2.5138087272644043 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.04365969, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.5777370161888564, + "language_loss": 0.67986816, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70138133, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7108, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01118179, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02417231, + "balance_loss_mlp": 1.04141963, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.697941372596268, + "language_loss": 0.71379381, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73535079, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7109, + "time_per_iteration": 2.514293909072876 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.01668775, + "balance_loss_mlp": 1.04248762, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 1.808555345827523, + "language_loss": 0.64390564, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66543221, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7734375, + "step": 7110, + "time_per_iteration": 2.507896661758423 + }, + { + "auxiliary_loss_clip": 0.01121216, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01876426, + "balance_loss_mlp": 1.04310989, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6911252843933642, + "language_loss": 0.76596475, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78751141, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7111, + "time_per_iteration": 2.5065102577209473 + }, + { + "auxiliary_loss_clip": 0.01122655, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.02051497, + "balance_loss_mlp": 1.04446638, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.6101339491766522, + "language_loss": 0.73021042, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75178432, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7112, + "time_per_iteration": 2.462275266647339 + }, + { + "auxiliary_loss_clip": 0.0111568, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02527392, + "balance_loss_mlp": 1.04112434, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.5100904202471843, + "language_loss": 0.71723974, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.7387839, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7113, + "time_per_iteration": 2.517184019088745 + }, + { + "auxiliary_loss_clip": 0.01122905, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04463625, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 4.019227207544938, + "language_loss": 0.62055492, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64222896, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7114, + "time_per_iteration": 2.4808969497680664 + }, + { + "auxiliary_loss_clip": 0.01127351, + "auxiliary_loss_mlp": 0.01044357, + "balance_loss_clip": 1.02779305, + "balance_loss_mlp": 1.045439, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.7285817614937915, + "language_loss": 0.64558339, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66730046, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8203125, + "step": 7115, + "time_per_iteration": 2.4979755878448486 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02321672, + "balance_loss_mlp": 1.04225278, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.5459011503250888, + "language_loss": 0.7331425, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75469118, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7116, + "time_per_iteration": 2.4514083862304688 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02976263, + "balance_loss_mlp": 1.04102111, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.5398002166428786, + "language_loss": 0.69214165, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.7137208, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7117, + "time_per_iteration": 2.522881269454956 + }, + { + "auxiliary_loss_clip": 0.01122059, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.02657676, + "balance_loss_mlp": 1.04530859, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.268053258549222, + "language_loss": 0.69909632, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72072423, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7118, + "time_per_iteration": 2.3870341777801514 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.02948511, + "balance_loss_mlp": 1.04353809, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.1225989928468803, + "language_loss": 0.74740356, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76902699, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7119, + "time_per_iteration": 2.5487277507781982 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.02040291, + "balance_loss_mlp": 1.04196107, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.8104905013477006, + "language_loss": 0.74987411, + "learning_rate": 2.555562005426573e-06, + "loss": 0.7713939, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7120, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01120406, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.02321029, + "balance_loss_mlp": 1.04422045, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.6187265972443616, + "language_loss": 0.77002251, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.7915923, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7121, + "time_per_iteration": 2.4686522483825684 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.02299464, + "balance_loss_mlp": 1.04225755, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 1.8413618192799084, + "language_loss": 0.85525274, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87678635, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7122, + "time_per_iteration": 2.4149863719940186 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02088189, + "balance_loss_mlp": 1.04111362, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.6495062264118223, + "language_loss": 0.81354666, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83504236, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7123, + "time_per_iteration": 2.4846510887145996 + }, + { + "auxiliary_loss_clip": 0.01116497, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.04286349, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.6842679543274752, + "language_loss": 0.81069416, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83221602, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 7124, + "time_per_iteration": 2.477781057357788 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02485132, + "balance_loss_mlp": 1.04072952, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 7.024350858631177, + "language_loss": 0.80178392, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82334554, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 7125, + "time_per_iteration": 2.466099262237549 + }, + { + "auxiliary_loss_clip": 0.01117521, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.0230993, + "balance_loss_mlp": 1.04386544, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.7536027507395449, + "language_loss": 0.74772543, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76925719, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7126, + "time_per_iteration": 2.4476282596588135 + }, + { + "auxiliary_loss_clip": 0.01118141, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.02313387, + "balance_loss_mlp": 1.04261374, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 2.2527301233175496, + "language_loss": 0.81376731, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83532357, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7127, + "time_per_iteration": 2.50627064704895 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.02554011, + "balance_loss_mlp": 1.04140556, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.7148593982179101, + "language_loss": 0.76451397, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78608435, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7128, + "time_per_iteration": 2.4261910915374756 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.02018988, + "balance_loss_mlp": 1.04154372, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.979642374109765, + "language_loss": 0.74111116, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76265121, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7129, + "time_per_iteration": 2.4977691173553467 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02102661, + "balance_loss_mlp": 1.04335773, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.7995906720856931, + "language_loss": 0.77753568, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79907238, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7130, + "time_per_iteration": 2.4983179569244385 + }, + { + "auxiliary_loss_clip": 0.01123055, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04523921, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 1.8571755273934152, + "language_loss": 0.7349695, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75659359, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7131, + "time_per_iteration": 2.5469563007354736 + }, + { + "auxiliary_loss_clip": 0.01120536, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04343748, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.0596069487020268, + "language_loss": 0.76299751, + "learning_rate": 2.551070882366973e-06, + "loss": 0.78456992, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7132, + "time_per_iteration": 2.432889223098755 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.02821565, + "balance_loss_mlp": 1.04352558, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.5221162096651724, + "language_loss": 0.78525162, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80687612, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 7133, + "time_per_iteration": 2.544379472732544 + }, + { + "auxiliary_loss_clip": 0.01120837, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02484238, + "balance_loss_mlp": 1.04305482, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.8479371259746051, + "language_loss": 0.75017452, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77177012, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7134, + "time_per_iteration": 2.416792154312134 + }, + { + "auxiliary_loss_clip": 0.01114501, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01937902, + "balance_loss_mlp": 1.04046178, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 2.2902258120670975, + "language_loss": 0.84066433, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86212909, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7135, + "time_per_iteration": 2.4513847827911377 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.02250218, + "balance_loss_mlp": 1.04050052, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.9123929145525593, + "language_loss": 0.74716437, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76866877, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7136, + "time_per_iteration": 2.5260956287384033 + }, + { + "auxiliary_loss_clip": 0.01117454, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.0243752, + "balance_loss_mlp": 1.04027987, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.9374198184766858, + "language_loss": 0.78982937, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81138408, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7137, + "time_per_iteration": 6.664285898208618 + }, + { + "auxiliary_loss_clip": 0.01123569, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.02053773, + "balance_loss_mlp": 1.04498768, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.8145904182691066, + "language_loss": 0.76599205, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78757715, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7138, + "time_per_iteration": 2.4640390872955322 + }, + { + "auxiliary_loss_clip": 0.01043511, + "auxiliary_loss_mlp": 0.01001663, + "balance_loss_clip": 1.00025678, + "balance_loss_mlp": 1.02006102, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7743592729173089, + "language_loss": 0.56193811, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58238983, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.234375, + "step": 7139, + "time_per_iteration": 2.938645362854004 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.02957499, + "balance_loss_mlp": 1.04185057, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6343660010586272, + "language_loss": 0.81107223, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83262885, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7140, + "time_per_iteration": 2.4621551036834717 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01819944, + "balance_loss_mlp": 1.04155135, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.7453668118354997, + "language_loss": 0.81973499, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84123254, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7141, + "time_per_iteration": 2.4552011489868164 + }, + { + "auxiliary_loss_clip": 0.011238, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.04469872, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.6365702711839187, + "language_loss": 0.86302745, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88466609, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7142, + "time_per_iteration": 2.466599464416504 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.0205493, + "balance_loss_mlp": 1.04692888, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.8779834210446977, + "language_loss": 0.78367496, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80520082, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7143, + "time_per_iteration": 2.528383493423462 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.02731538, + "balance_loss_mlp": 1.04566526, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.185103050312315, + "language_loss": 0.76671416, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78832245, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7144, + "time_per_iteration": 2.4433047771453857 + }, + { + "auxiliary_loss_clip": 0.01119183, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.01801622, + "balance_loss_mlp": 1.043118, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 2.969999234773645, + "language_loss": 0.73481476, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75632453, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7145, + "time_per_iteration": 2.5330073833465576 + }, + { + "auxiliary_loss_clip": 0.01120569, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.02447844, + "balance_loss_mlp": 1.04405165, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.854643653820381, + "language_loss": 0.78928959, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81088066, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7146, + "time_per_iteration": 2.4481821060180664 + }, + { + "auxiliary_loss_clip": 0.01116396, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.01590514, + "balance_loss_mlp": 1.04295409, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 1.9767254736067894, + "language_loss": 0.83134973, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85279846, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7147, + "time_per_iteration": 2.500633478164673 + }, + { + "auxiliary_loss_clip": 0.01124897, + "auxiliary_loss_mlp": 0.01039853, + "balance_loss_clip": 1.02473724, + "balance_loss_mlp": 1.04802537, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.8398177405042841, + "language_loss": 0.86894512, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89059258, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 7148, + "time_per_iteration": 2.481743097305298 + }, + { + "auxiliary_loss_clip": 0.01120854, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02405918, + "balance_loss_mlp": 1.04469061, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5258683369520107, + "language_loss": 0.77855921, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80013508, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7149, + "time_per_iteration": 2.6060431003570557 + }, + { + "auxiliary_loss_clip": 0.01118454, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02247977, + "balance_loss_mlp": 1.04456902, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.7047076849986806, + "language_loss": 0.79828095, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81983018, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7150, + "time_per_iteration": 2.4652955532073975 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02763176, + "balance_loss_mlp": 1.0467031, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.7972230644563891, + "language_loss": 0.74738395, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.76907349, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 7151, + "time_per_iteration": 2.5019421577453613 + }, + { + "auxiliary_loss_clip": 0.0112419, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02284098, + "balance_loss_mlp": 1.0458225, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.924911798883302, + "language_loss": 0.70084447, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72245419, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 7152, + "time_per_iteration": 2.456465482711792 + }, + { + "auxiliary_loss_clip": 0.01117938, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.02268243, + "balance_loss_mlp": 1.04186821, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.5367633238023177, + "language_loss": 0.71064591, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73218524, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7153, + "time_per_iteration": 2.6120920181274414 + }, + { + "auxiliary_loss_clip": 0.01120146, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01941729, + "balance_loss_mlp": 1.04342091, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.8794751780958798, + "language_loss": 0.79155993, + "learning_rate": 2.542829359113276e-06, + "loss": 0.81309307, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7154, + "time_per_iteration": 2.4222962856292725 + }, + { + "auxiliary_loss_clip": 0.0111738, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01818347, + "balance_loss_mlp": 1.04361236, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.4801057977091479, + "language_loss": 0.78793395, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80941343, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 7155, + "time_per_iteration": 2.4554193019866943 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01582694, + "balance_loss_mlp": 1.04335082, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.7176839192841982, + "language_loss": 0.88779187, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90925157, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7156, + "time_per_iteration": 2.446831464767456 + }, + { + "auxiliary_loss_clip": 0.01120931, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.01953602, + "balance_loss_mlp": 1.04361558, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.9517774058288286, + "language_loss": 0.82738447, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84893334, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7157, + "time_per_iteration": 2.5298144817352295 + }, + { + "auxiliary_loss_clip": 0.01124397, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04532623, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.8458285726729726, + "language_loss": 0.72177351, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74337494, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7158, + "time_per_iteration": 2.4691712856292725 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01796317, + "balance_loss_mlp": 1.04215837, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.077812294320108, + "language_loss": 0.82865965, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85015261, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7159, + "time_per_iteration": 2.4462857246398926 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.01887655, + "balance_loss_mlp": 1.04236865, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.094804075931644, + "language_loss": 0.83043528, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85193908, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7160, + "time_per_iteration": 2.587928533554077 + }, + { + "auxiliary_loss_clip": 0.01123066, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.02249885, + "balance_loss_mlp": 1.04402685, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 3.027641474238522, + "language_loss": 0.77379316, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79540545, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.79296875, + "step": 7161, + "time_per_iteration": 2.502628803253174 + }, + { + "auxiliary_loss_clip": 0.01119327, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.04304039, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.05136398687674, + "language_loss": 0.73137891, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75291681, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7162, + "time_per_iteration": 2.439053773880005 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01005692, + "balance_loss_clip": 1.00411832, + "balance_loss_mlp": 1.01966858, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7926335078551056, + "language_loss": 0.59016478, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61064959, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.23046875, + "step": 7163, + "time_per_iteration": 2.9588072299957275 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.02299142, + "balance_loss_mlp": 1.04035139, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6277980092745115, + "language_loss": 0.79140532, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81291205, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7164, + "time_per_iteration": 2.484001398086548 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04078794, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.8180486110770353, + "language_loss": 0.67282438, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69442934, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 7165, + "time_per_iteration": 2.533599376678467 + }, + { + "auxiliary_loss_clip": 0.01119036, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.03069651, + "balance_loss_mlp": 1.04327762, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.850302447549428, + "language_loss": 0.75248688, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77411151, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7578125, + "step": 7166, + "time_per_iteration": 2.439861536026001 + }, + { + "auxiliary_loss_clip": 0.01117069, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.0239172, + "balance_loss_mlp": 1.04362941, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.612504951400803, + "language_loss": 0.71537554, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73690969, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7167, + "time_per_iteration": 2.499190092086792 + }, + { + "auxiliary_loss_clip": 0.01111616, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.02232647, + "balance_loss_mlp": 1.03984129, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.6022700342177734, + "language_loss": 0.78459173, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80605787, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7168, + "time_per_iteration": 2.4372310638427734 + }, + { + "auxiliary_loss_clip": 0.01118326, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.02733326, + "balance_loss_mlp": 1.04224193, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.3121674941994383, + "language_loss": 0.82260263, + "learning_rate": 2.537204417416387e-06, + "loss": 0.8441996, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7169, + "time_per_iteration": 2.4545183181762695 + }, + { + "auxiliary_loss_clip": 0.01038578, + "auxiliary_loss_mlp": 0.01010207, + "balance_loss_clip": 1.00865698, + "balance_loss_mlp": 1.0153358, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6800543146405372, + "language_loss": 0.60812157, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62860942, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23242188, + "step": 7170, + "time_per_iteration": 3.2204582691192627 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02039874, + "balance_loss_mlp": 1.04148889, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 2.0659828341911615, + "language_loss": 0.76225841, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78375715, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7171, + "time_per_iteration": 2.465665817260742 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.02172232, + "balance_loss_mlp": 1.04197574, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.6834410044967325, + "language_loss": 0.77283418, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.7943542, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7172, + "time_per_iteration": 2.4916739463806152 + }, + { + "auxiliary_loss_clip": 0.0111787, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.02544653, + "balance_loss_mlp": 1.04015696, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.7953579135961333, + "language_loss": 0.76852405, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79010069, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7173, + "time_per_iteration": 2.4764745235443115 + }, + { + "auxiliary_loss_clip": 0.01115542, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01571512, + "balance_loss_mlp": 1.04070854, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.4568106417702447, + "language_loss": 0.77103329, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79247946, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7174, + "time_per_iteration": 2.4860222339630127 + }, + { + "auxiliary_loss_clip": 0.01119703, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.01815498, + "balance_loss_mlp": 1.04199743, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.4198827217143106, + "language_loss": 0.82505399, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84658062, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7175, + "time_per_iteration": 2.462977647781372 + }, + { + "auxiliary_loss_clip": 0.01121086, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.0223192, + "balance_loss_mlp": 1.04153752, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.338333143716513, + "language_loss": 0.74985862, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77143705, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 7176, + "time_per_iteration": 2.4185218811035156 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.020643, + "balance_loss_mlp": 1.03969014, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.6024853029290826, + "language_loss": 0.73364419, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75516164, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7177, + "time_per_iteration": 2.487114667892456 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.0264492, + "balance_loss_mlp": 1.04060507, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.878519248272382, + "language_loss": 0.81681836, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83848649, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 7178, + "time_per_iteration": 2.443887948989868 + }, + { + "auxiliary_loss_clip": 0.01113093, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.01988733, + "balance_loss_mlp": 1.04052329, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4541906286028654, + "language_loss": 0.83824348, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.8597073, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7179, + "time_per_iteration": 5.329441547393799 + }, + { + "auxiliary_loss_clip": 0.01114931, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.02040303, + "balance_loss_mlp": 1.03945267, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 2.045990303945265, + "language_loss": 0.75710779, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77859473, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7180, + "time_per_iteration": 2.5520315170288086 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.0240128, + "balance_loss_mlp": 1.03862667, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.7639080321754919, + "language_loss": 0.81907403, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84062529, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7181, + "time_per_iteration": 2.4059271812438965 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.02357888, + "balance_loss_mlp": 1.04020417, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.5777864051255721, + "language_loss": 0.88434547, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90591776, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7182, + "time_per_iteration": 2.463463306427002 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.01873803, + "balance_loss_mlp": 1.03918862, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.70694658333996, + "language_loss": 0.75826657, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77972138, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7183, + "time_per_iteration": 2.4562740325927734 + }, + { + "auxiliary_loss_clip": 0.01116225, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.01819921, + "balance_loss_mlp": 1.03917336, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 2.311500131527462, + "language_loss": 0.77666485, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79814982, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7184, + "time_per_iteration": 2.5283145904541016 + }, + { + "auxiliary_loss_clip": 0.01110208, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02065074, + "balance_loss_mlp": 1.03938413, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.5490664406704935, + "language_loss": 0.73325193, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75469285, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 7185, + "time_per_iteration": 2.520885467529297 + }, + { + "auxiliary_loss_clip": 0.01118704, + "auxiliary_loss_mlp": 0.01037072, + "balance_loss_clip": 1.02283251, + "balance_loss_mlp": 1.03961062, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.5540588454326, + "language_loss": 0.75974178, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78129953, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 7186, + "time_per_iteration": 2.5005605220794678 + }, + { + "auxiliary_loss_clip": 0.01116031, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.02164185, + "balance_loss_mlp": 1.03987479, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 5.067701176656461, + "language_loss": 0.76043296, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78194571, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7187, + "time_per_iteration": 2.4769227504730225 + }, + { + "auxiliary_loss_clip": 0.0103801, + "auxiliary_loss_mlp": 0.00999247, + "balance_loss_clip": 0.99792367, + "balance_loss_mlp": 1.0145607, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8526585096921939, + "language_loss": 0.68180382, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70217645, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.234375, + "step": 7188, + "time_per_iteration": 3.095301389694214 + }, + { + "auxiliary_loss_clip": 0.01112959, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03992498, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.742468102969242, + "language_loss": 0.7809816, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80246753, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7189, + "time_per_iteration": 2.4332470893859863 + }, + { + "auxiliary_loss_clip": 0.01118752, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.0263027, + "balance_loss_mlp": 1.03817415, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.8713383629003246, + "language_loss": 0.7119785, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73357898, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 7190, + "time_per_iteration": 2.494537115097046 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02275729, + "balance_loss_mlp": 1.0380528, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.5245278530879214, + "language_loss": 0.79833174, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81982064, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7191, + "time_per_iteration": 2.478376865386963 + }, + { + "auxiliary_loss_clip": 0.01113503, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.020944, + "balance_loss_mlp": 1.03872573, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.7647822638177795, + "language_loss": 0.74647141, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76794595, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7192, + "time_per_iteration": 2.4613609313964844 + }, + { + "auxiliary_loss_clip": 0.011176, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.02696347, + "balance_loss_mlp": 1.04183233, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 2.014554632256561, + "language_loss": 0.78898597, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81057584, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7193, + "time_per_iteration": 2.4220309257507324 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.02609015, + "balance_loss_mlp": 1.0394038, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.7200377707292065, + "language_loss": 0.75406849, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77562475, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7194, + "time_per_iteration": 2.466512441635132 + }, + { + "auxiliary_loss_clip": 0.01117198, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02435398, + "balance_loss_mlp": 1.04108119, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 5.005212308773382, + "language_loss": 0.60044503, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62199533, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7195, + "time_per_iteration": 2.4522454738616943 + }, + { + "auxiliary_loss_clip": 0.0111962, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02713561, + "balance_loss_mlp": 1.04041934, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.2806268233026628, + "language_loss": 0.64930809, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67092311, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 7196, + "time_per_iteration": 2.4453718662261963 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.02024436, + "balance_loss_mlp": 1.04024911, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 4.696072713783665, + "language_loss": 0.72759318, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74914396, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 7197, + "time_per_iteration": 2.500256061553955 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.03106129, + "balance_loss_mlp": 1.04246271, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.598666024351184, + "language_loss": 0.72644413, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.7480582, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 7198, + "time_per_iteration": 2.567762613296509 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03913903, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3766106050597056, + "language_loss": 0.81292808, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83442813, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7199, + "time_per_iteration": 2.4782636165618896 + }, + { + "auxiliary_loss_clip": 0.01118715, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02829099, + "balance_loss_mlp": 1.04219055, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.2182298419994346, + "language_loss": 0.68883061, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71043384, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 7200, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.02809453, + "balance_loss_mlp": 1.04055738, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 2.134839210265846, + "language_loss": 0.87135142, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89296097, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7201, + "time_per_iteration": 2.47463321685791 + }, + { + "auxiliary_loss_clip": 0.01125345, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_clip": 1.02696979, + "balance_loss_mlp": 1.04488945, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 2.16649852661544, + "language_loss": 0.64551014, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66718936, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 7202, + "time_per_iteration": 2.520963668823242 + }, + { + "auxiliary_loss_clip": 0.0111734, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01931047, + "balance_loss_mlp": 1.04092193, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.7838197935762699, + "language_loss": 0.81707418, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83856463, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.765625, + "step": 7203, + "time_per_iteration": 2.474724531173706 + }, + { + "auxiliary_loss_clip": 0.01121178, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.03284955, + "balance_loss_mlp": 1.04118741, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.864866510083204, + "language_loss": 0.81476939, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83645213, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 7204, + "time_per_iteration": 2.527064323425293 + }, + { + "auxiliary_loss_clip": 0.01117221, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02226007, + "balance_loss_mlp": 1.04050207, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.78968083236078, + "language_loss": 0.73432428, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75585294, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7205, + "time_per_iteration": 2.406350612640381 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.02414668, + "balance_loss_mlp": 1.04308569, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.6284714357196102, + "language_loss": 0.75110108, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77264041, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7206, + "time_per_iteration": 2.527343511581421 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02247548, + "balance_loss_mlp": 1.03899562, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 2.1762520186821854, + "language_loss": 0.78700626, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80852419, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7207, + "time_per_iteration": 2.4470536708831787 + }, + { + "auxiliary_loss_clip": 0.01121794, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02026772, + "balance_loss_mlp": 1.04215813, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.6893238531796995, + "language_loss": 0.81100202, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83257544, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 7208, + "time_per_iteration": 2.4634876251220703 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.04337454, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.072374936090108, + "language_loss": 0.70074689, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72228324, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7209, + "time_per_iteration": 2.4699575901031494 + }, + { + "auxiliary_loss_clip": 0.01113916, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.02102125, + "balance_loss_mlp": 1.0392952, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.533200118487429, + "language_loss": 0.81202382, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83351159, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7210, + "time_per_iteration": 2.5462334156036377 + }, + { + "auxiliary_loss_clip": 0.01114494, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.02280319, + "balance_loss_mlp": 1.03895545, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7483210767520514, + "language_loss": 0.81570554, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.83721387, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7211, + "time_per_iteration": 2.4835634231567383 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.02150583, + "balance_loss_mlp": 1.03778863, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 2.083548110229539, + "language_loss": 0.74785221, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76932836, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7212, + "time_per_iteration": 2.492600917816162 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.01990747, + "balance_loss_mlp": 1.04162407, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.6640529640233686, + "language_loss": 0.76755834, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78905809, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7213, + "time_per_iteration": 2.4060752391815186 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02070832, + "balance_loss_mlp": 1.03933454, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.5718517519296942, + "language_loss": 0.64949977, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67098659, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7214, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.01110495, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.02173781, + "balance_loss_mlp": 1.03869057, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.5916808794412316, + "language_loss": 0.71483207, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73628008, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7215, + "time_per_iteration": 2.5099971294403076 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02224684, + "balance_loss_mlp": 1.04080701, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 2.1029551712935692, + "language_loss": 0.7531544, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77464819, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7216, + "time_per_iteration": 2.496631383895874 + }, + { + "auxiliary_loss_clip": 0.01124083, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.02873421, + "balance_loss_mlp": 1.04232287, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.4885665438006086, + "language_loss": 0.75943911, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78112465, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 7217, + "time_per_iteration": 2.4563424587249756 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.02048075, + "balance_loss_mlp": 1.04149795, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 2.0012841708103677, + "language_loss": 0.73723286, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.7587418, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7218, + "time_per_iteration": 2.5055034160614014 + }, + { + "auxiliary_loss_clip": 0.01119586, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.01706386, + "balance_loss_mlp": 1.0420804, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.7121326309499156, + "language_loss": 0.68759704, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.7090981, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7219, + "time_per_iteration": 2.4480419158935547 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.02088046, + "balance_loss_mlp": 1.0424881, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.5876624694807844, + "language_loss": 0.77227521, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79378843, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7220, + "time_per_iteration": 6.918288230895996 + }, + { + "auxiliary_loss_clip": 0.01116062, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02162933, + "balance_loss_mlp": 1.04022503, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.9118836764348202, + "language_loss": 0.69684327, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71835566, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7221, + "time_per_iteration": 2.470270872116089 + }, + { + "auxiliary_loss_clip": 0.0111827, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04102325, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 2.3043912227088206, + "language_loss": 0.64915985, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67071712, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7222, + "time_per_iteration": 2.553450584411621 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.01851892, + "balance_loss_mlp": 1.0400629, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 1.98015103861908, + "language_loss": 0.73039752, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75188196, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 7223, + "time_per_iteration": 2.4311954975128174 + }, + { + "auxiliary_loss_clip": 0.01117336, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.01639247, + "balance_loss_mlp": 1.03914881, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.7516175042559776, + "language_loss": 0.93677819, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95825702, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 7224, + "time_per_iteration": 2.5507140159606934 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.02230883, + "balance_loss_mlp": 1.04143298, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.4456333860398556, + "language_loss": 0.61234355, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63386333, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7225, + "time_per_iteration": 2.4982893466949463 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02594388, + "balance_loss_mlp": 1.04326594, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.8262630970377216, + "language_loss": 0.77771807, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79930449, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75390625, + "step": 7226, + "time_per_iteration": 2.5427355766296387 + }, + { + "auxiliary_loss_clip": 0.0111488, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.01968753, + "balance_loss_mlp": 1.04169869, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.6421213218207402, + "language_loss": 0.84485722, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8663345, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7227, + "time_per_iteration": 2.450390577316284 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04135513, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.0443971193166735, + "language_loss": 0.76866895, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79018396, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7228, + "time_per_iteration": 2.5690906047821045 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02229989, + "balance_loss_mlp": 1.04278994, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.4832672479414948, + "language_loss": 0.80732882, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82887214, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7229, + "time_per_iteration": 2.552069902420044 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.02682161, + "balance_loss_mlp": 1.04213512, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 2.091517296377785, + "language_loss": 0.81964421, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84123534, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7230, + "time_per_iteration": 2.5944671630859375 + }, + { + "auxiliary_loss_clip": 0.01123399, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.03034186, + "balance_loss_mlp": 1.0445168, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.146338977702966, + "language_loss": 0.77091062, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79258955, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 7231, + "time_per_iteration": 2.460886001586914 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.02217722, + "balance_loss_mlp": 1.0421958, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.391615561962781, + "language_loss": 0.6858201, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70733297, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7232, + "time_per_iteration": 2.614415407180786 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02036333, + "balance_loss_mlp": 1.04160166, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6911603415584286, + "language_loss": 0.7200706, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74161285, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7233, + "time_per_iteration": 2.5665411949157715 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.02490747, + "balance_loss_mlp": 1.04198027, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.536262058034198, + "language_loss": 0.746382, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.7679894, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 7234, + "time_per_iteration": 2.577014207839966 + }, + { + "auxiliary_loss_clip": 0.01123093, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.02523136, + "balance_loss_mlp": 1.04223037, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 1.829117772001415, + "language_loss": 0.58860987, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61023784, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 7235, + "time_per_iteration": 2.4759557247161865 + }, + { + "auxiliary_loss_clip": 0.01116416, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01931453, + "balance_loss_mlp": 1.04053211, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.4942606531447196, + "language_loss": 0.7751596, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79664838, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7236, + "time_per_iteration": 2.6113193035125732 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.01958489, + "balance_loss_mlp": 1.04001045, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.713978383195529, + "language_loss": 0.8155449, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83701491, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7237, + "time_per_iteration": 2.4341909885406494 + }, + { + "auxiliary_loss_clip": 0.01116801, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02109957, + "balance_loss_mlp": 1.04103971, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 3.0219595130639156, + "language_loss": 0.62897265, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65049648, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7238, + "time_per_iteration": 2.5014469623565674 + }, + { + "auxiliary_loss_clip": 0.01111642, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.02848005, + "balance_loss_mlp": 1.03874493, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.5839613956475427, + "language_loss": 0.85889554, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88042951, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7239, + "time_per_iteration": 2.4976143836975098 + }, + { + "auxiliary_loss_clip": 0.01118679, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.01896167, + "balance_loss_mlp": 1.0434041, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.4715329043565741, + "language_loss": 0.7269268, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74843925, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7240, + "time_per_iteration": 2.5350124835968018 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.01897597, + "balance_loss_mlp": 1.04185855, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.6878068305061695, + "language_loss": 0.81562793, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83716333, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7241, + "time_per_iteration": 2.4924368858337402 + }, + { + "auxiliary_loss_clip": 0.01125084, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02636945, + "balance_loss_mlp": 1.04387474, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 3.067853888150903, + "language_loss": 0.79639387, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81806338, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 7242, + "time_per_iteration": 2.4884228706359863 + }, + { + "auxiliary_loss_clip": 0.01118288, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.02146733, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.2924190339180135, + "language_loss": 0.6872946, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70883644, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 7243, + "time_per_iteration": 2.428065538406372 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.01814318, + "balance_loss_mlp": 1.04141152, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6975937608840317, + "language_loss": 0.8125546, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83400726, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7244, + "time_per_iteration": 2.4931905269622803 + }, + { + "auxiliary_loss_clip": 0.01118248, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01785159, + "balance_loss_mlp": 1.0428431, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.7229772693729426, + "language_loss": 0.74017537, + "learning_rate": 2.508635271753234e-06, + "loss": 0.7616663, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7245, + "time_per_iteration": 2.4678800106048584 + }, + { + "auxiliary_loss_clip": 0.01116663, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.0248003, + "balance_loss_mlp": 1.041008, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.577710817669204, + "language_loss": 0.7671771, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78872424, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 7246, + "time_per_iteration": 2.5109541416168213 + }, + { + "auxiliary_loss_clip": 0.01118541, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02348995, + "balance_loss_mlp": 1.04209638, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.7904357433283469, + "language_loss": 0.85364228, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87520564, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7247, + "time_per_iteration": 2.4546074867248535 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.02600694, + "balance_loss_mlp": 1.0420599, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.5214849587217785, + "language_loss": 0.72576565, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74732977, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7248, + "time_per_iteration": 2.5288567543029785 + }, + { + "auxiliary_loss_clip": 0.01117005, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.02280688, + "balance_loss_mlp": 1.04225719, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.6049303411594007, + "language_loss": 0.87276042, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.8942951, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7249, + "time_per_iteration": 2.497281312942505 + }, + { + "auxiliary_loss_clip": 0.0111866, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.03053117, + "balance_loss_mlp": 1.04112244, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.835450546624213, + "language_loss": 0.81989753, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84152383, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7250, + "time_per_iteration": 2.5563321113586426 + }, + { + "auxiliary_loss_clip": 0.01124846, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.02484369, + "balance_loss_mlp": 1.04729581, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.737362510880261, + "language_loss": 0.84760177, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86923766, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7251, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01113729, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02469254, + "balance_loss_mlp": 1.03979266, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.5112002334274994, + "language_loss": 0.69018251, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71170568, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7252, + "time_per_iteration": 2.5041210651397705 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.02233779, + "balance_loss_mlp": 1.04257536, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.7846888638519947, + "language_loss": 0.83733922, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85886061, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7253, + "time_per_iteration": 2.434375047683716 + }, + { + "auxiliary_loss_clip": 0.01116361, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.02386165, + "balance_loss_mlp": 1.04254532, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4489781171091827, + "language_loss": 0.70361209, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72515762, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.73828125, + "step": 7254, + "time_per_iteration": 2.5304319858551025 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02503693, + "balance_loss_mlp": 1.04295266, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.261189856456705, + "language_loss": 0.80833256, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.82988203, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7255, + "time_per_iteration": 2.4619336128234863 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.0402987, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.6623402785544918, + "language_loss": 0.77301329, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79454327, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7256, + "time_per_iteration": 2.502201557159424 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02568853, + "balance_loss_mlp": 1.04400241, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.8521029690454978, + "language_loss": 0.76273203, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78429782, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7257, + "time_per_iteration": 2.4721548557281494 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02553642, + "balance_loss_mlp": 1.04027009, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.675034420512285, + "language_loss": 0.73065001, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75219941, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7258, + "time_per_iteration": 2.5251166820526123 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.02083004, + "balance_loss_mlp": 1.04163384, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 2.491243867162561, + "language_loss": 0.76496607, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78648162, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7259, + "time_per_iteration": 2.4948387145996094 + }, + { + "auxiliary_loss_clip": 0.01038123, + "auxiliary_loss_mlp": 0.01006149, + "balance_loss_clip": 1.00483215, + "balance_loss_mlp": 1.01505399, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7446610885032177, + "language_loss": 0.570382, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59082472, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.23144531, + "step": 7260, + "time_per_iteration": 3.023712396621704 + }, + { + "auxiliary_loss_clip": 0.01119405, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.03061068, + "balance_loss_mlp": 1.0423255, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.013500079504657, + "language_loss": 0.71356845, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.7352109, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 7261, + "time_per_iteration": 2.559830665588379 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.04076374, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 1.767533570577482, + "language_loss": 0.69423878, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71586561, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7262, + "time_per_iteration": 5.4921791553497314 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01699996, + "balance_loss_mlp": 1.04062569, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.7128833789230435, + "language_loss": 0.80033064, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82173395, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7263, + "time_per_iteration": 2.5026779174804688 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.03210425, + "balance_loss_mlp": 1.043383, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.8087965620474522, + "language_loss": 0.75092399, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77256304, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7264, + "time_per_iteration": 2.487065553665161 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02143478, + "balance_loss_mlp": 1.04089546, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 1.8571442110240568, + "language_loss": 0.61855227, + "learning_rate": 2.501098303852298e-06, + "loss": 0.6400522, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 7265, + "time_per_iteration": 2.5982677936553955 + }, + { + "auxiliary_loss_clip": 0.01112809, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01859391, + "balance_loss_mlp": 1.04026711, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.1628188735926845, + "language_loss": 0.72982574, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75126845, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7266, + "time_per_iteration": 2.4690847396850586 + }, + { + "auxiliary_loss_clip": 0.0111929, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.0240345, + "balance_loss_mlp": 1.04332638, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.2896909207829954, + "language_loss": 0.81570059, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.83727205, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7267, + "time_per_iteration": 2.463283061981201 + }, + { + "auxiliary_loss_clip": 0.01113248, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.01803577, + "balance_loss_mlp": 1.04085267, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.9116109849221483, + "language_loss": 0.74723095, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76866794, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 7268, + "time_per_iteration": 2.516263723373413 + }, + { + "auxiliary_loss_clip": 0.01121105, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04315591, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 1.9119374296408282, + "language_loss": 0.7954827, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81711417, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7269, + "time_per_iteration": 2.4647111892700195 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.02404499, + "balance_loss_mlp": 1.04315174, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 2.072373926876921, + "language_loss": 0.75031221, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77185863, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7270, + "time_per_iteration": 2.4963974952697754 + }, + { + "auxiliary_loss_clip": 0.01116927, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.01639485, + "balance_loss_mlp": 1.04269087, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.906091328168401, + "language_loss": 0.79437554, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81584334, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7271, + "time_per_iteration": 2.487238645553589 + }, + { + "auxiliary_loss_clip": 0.01039832, + "auxiliary_loss_mlp": 0.01001038, + "balance_loss_clip": 0.99965489, + "balance_loss_mlp": 1.01678514, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6948313241096988, + "language_loss": 0.54902828, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56943697, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.23046875, + "step": 7272, + "time_per_iteration": 3.1392502784729004 + }, + { + "auxiliary_loss_clip": 0.011197, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0286535, + "balance_loss_mlp": 1.04332781, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 2.967819772960297, + "language_loss": 0.70136559, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72299063, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7273, + "time_per_iteration": 2.468592643737793 + }, + { + "auxiliary_loss_clip": 0.01119234, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02515411, + "balance_loss_mlp": 1.04280722, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.832145479464728, + "language_loss": 0.75091398, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77249801, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7274, + "time_per_iteration": 2.669516086578369 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01741123, + "balance_loss_mlp": 1.04002881, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.8126381729021082, + "language_loss": 0.80507416, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82649636, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7265625, + "step": 7275, + "time_per_iteration": 2.455235481262207 + }, + { + "auxiliary_loss_clip": 0.01118348, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.02357626, + "balance_loss_mlp": 1.04496706, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.065941875742038, + "language_loss": 0.80955482, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83110607, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7276, + "time_per_iteration": 2.543306827545166 + }, + { + "auxiliary_loss_clip": 0.01122471, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 1.04409111, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.794283698167311, + "language_loss": 0.73373604, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75530994, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7277, + "time_per_iteration": 2.5931403636932373 + }, + { + "auxiliary_loss_clip": 0.01117806, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.0222764, + "balance_loss_mlp": 1.04351854, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.8969119275678887, + "language_loss": 0.72953606, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75107086, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7278, + "time_per_iteration": 2.576266288757324 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.04212785, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.6273415021791042, + "language_loss": 0.65815622, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.6796481, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7279, + "time_per_iteration": 2.4717864990234375 + }, + { + "auxiliary_loss_clip": 0.01122391, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02098393, + "balance_loss_mlp": 1.04393768, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.838486718423984, + "language_loss": 0.82088757, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84245551, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7280, + "time_per_iteration": 2.5370771884918213 + }, + { + "auxiliary_loss_clip": 0.01114089, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.0209589, + "balance_loss_mlp": 1.04176164, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.430381072646336, + "language_loss": 0.76786566, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.78934562, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 7281, + "time_per_iteration": 2.5260467529296875 + }, + { + "auxiliary_loss_clip": 0.01116043, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02582431, + "balance_loss_mlp": 1.04211211, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.8435972134321474, + "language_loss": 0.7572853, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77883214, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 7282, + "time_per_iteration": 2.5332953929901123 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01039188, + "balance_loss_clip": 1.02581239, + "balance_loss_mlp": 1.04421043, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.8874106414487752, + "language_loss": 0.8494271, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87100983, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7283, + "time_per_iteration": 2.458500623703003 + }, + { + "auxiliary_loss_clip": 0.01119709, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.02060771, + "balance_loss_mlp": 1.04216719, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.9095323636494845, + "language_loss": 0.8005324, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82207501, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7284, + "time_per_iteration": 2.5258796215057373 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01040827, + "balance_loss_clip": 1.02851903, + "balance_loss_mlp": 1.04236269, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.535068058496724, + "language_loss": 0.8028115, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82437444, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7285, + "time_per_iteration": 2.4441394805908203 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.01576853, + "balance_loss_mlp": 1.04086363, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.9937836479025883, + "language_loss": 0.75031531, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77174133, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7286, + "time_per_iteration": 2.539954423904419 + }, + { + "auxiliary_loss_clip": 0.01118753, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01642346, + "balance_loss_mlp": 1.04179096, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.7090844157721894, + "language_loss": 0.73834682, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75983447, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7287, + "time_per_iteration": 2.5056257247924805 + }, + { + "auxiliary_loss_clip": 0.01117808, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.02887869, + "balance_loss_mlp": 1.04187727, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.8325493621162303, + "language_loss": 0.82288051, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84448457, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7288, + "time_per_iteration": 2.4812850952148438 + }, + { + "auxiliary_loss_clip": 0.01117047, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02008474, + "balance_loss_mlp": 1.03895211, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.476666560822241, + "language_loss": 0.84346598, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86497366, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7289, + "time_per_iteration": 2.482379674911499 + }, + { + "auxiliary_loss_clip": 0.01119976, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.0235244, + "balance_loss_mlp": 1.04139173, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.4352131560569001, + "language_loss": 0.78107727, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80264366, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.78515625, + "step": 7290, + "time_per_iteration": 2.5521459579467773 + }, + { + "auxiliary_loss_clip": 0.01115969, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.02761197, + "balance_loss_mlp": 1.04235792, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 3.384239132873348, + "language_loss": 0.77987993, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80144495, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7291, + "time_per_iteration": 2.512519121170044 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.01792359, + "balance_loss_mlp": 1.04297888, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.5428221976657872, + "language_loss": 0.65224636, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67373765, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7292, + "time_per_iteration": 2.597714424133301 + }, + { + "auxiliary_loss_clip": 0.0111598, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.03967905, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.884679810356821, + "language_loss": 0.74216962, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76363981, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.76171875, + "step": 7293, + "time_per_iteration": 2.4943923950195312 + }, + { + "auxiliary_loss_clip": 0.01119197, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.02732337, + "balance_loss_mlp": 1.04433274, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.4110491255972684, + "language_loss": 0.78757977, + "learning_rate": 2.490156230192516e-06, + "loss": 0.8091805, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7294, + "time_per_iteration": 2.495358467102051 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.01041568, + "balance_loss_clip": 1.02864015, + "balance_loss_mlp": 1.04313052, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.7229696907351246, + "language_loss": 0.73184276, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.7534548, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7295, + "time_per_iteration": 2.4645302295684814 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.02573109, + "balance_loss_mlp": 1.042575, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.059865438640582, + "language_loss": 0.75337231, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77495956, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 7296, + "time_per_iteration": 2.46444034576416 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.01976418, + "balance_loss_mlp": 1.04255402, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.6034841999072227, + "language_loss": 0.69515687, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71666169, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7297, + "time_per_iteration": 2.4995949268341064 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.01913857, + "balance_loss_mlp": 1.04173827, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.494373898338378, + "language_loss": 0.70457232, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72604382, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7298, + "time_per_iteration": 2.574982166290283 + }, + { + "auxiliary_loss_clip": 0.01117164, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.01672888, + "balance_loss_mlp": 1.04384279, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.5912334767066174, + "language_loss": 0.7241621, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74562919, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7299, + "time_per_iteration": 2.539013385772705 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.03278041, + "balance_loss_mlp": 1.043944, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.8082969607549542, + "language_loss": 0.77112591, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79280752, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76953125, + "step": 7300, + "time_per_iteration": 2.567291259765625 + }, + { + "auxiliary_loss_clip": 0.0111673, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.04064155, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6241879676388415, + "language_loss": 0.70685148, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72839439, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7301, + "time_per_iteration": 2.497025489807129 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04512143, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.911748384222125, + "language_loss": 0.70491576, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72647995, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 7302, + "time_per_iteration": 2.5212793350219727 + }, + { + "auxiliary_loss_clip": 0.011184, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.02676439, + "balance_loss_mlp": 1.04383337, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.741042450815644, + "language_loss": 0.82304549, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84462643, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7303, + "time_per_iteration": 2.5407814979553223 + }, + { + "auxiliary_loss_clip": 0.01123737, + "auxiliary_loss_mlp": 0.01042372, + "balance_loss_clip": 1.02753651, + "balance_loss_mlp": 1.04429436, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.4492152950747412, + "language_loss": 0.68408841, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70574951, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 7304, + "time_per_iteration": 4.099287509918213 + }, + { + "auxiliary_loss_clip": 0.01116014, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.02631354, + "balance_loss_mlp": 1.04335666, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.4059546174528585, + "language_loss": 0.78115439, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80270815, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7305, + "time_per_iteration": 2.6079509258270264 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02376556, + "balance_loss_mlp": 1.04186165, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.688110038500655, + "language_loss": 0.68754542, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70908302, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7306, + "time_per_iteration": 2.4539954662323 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.0214076, + "balance_loss_mlp": 1.04369712, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.4603628541776523, + "language_loss": 0.6270709, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64862138, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 7307, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02001536, + "balance_loss_mlp": 1.04338455, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.9032558944481925, + "language_loss": 0.72409779, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74563944, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 7308, + "time_per_iteration": 2.4319982528686523 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02347863, + "balance_loss_mlp": 1.04077995, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.6404677903158766, + "language_loss": 0.76631165, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78788805, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7309, + "time_per_iteration": 2.5045857429504395 + }, + { + "auxiliary_loss_clip": 0.01115088, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.01943445, + "balance_loss_mlp": 1.04314303, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.788496009330223, + "language_loss": 0.70666951, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72814304, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7310, + "time_per_iteration": 2.4732789993286133 + }, + { + "auxiliary_loss_clip": 0.01120896, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.02293789, + "balance_loss_mlp": 1.04397106, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.1134854859852505, + "language_loss": 0.75800377, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77957869, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7311, + "time_per_iteration": 2.5372462272644043 + }, + { + "auxiliary_loss_clip": 0.01119727, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02171779, + "balance_loss_mlp": 1.04376173, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.9313159099964634, + "language_loss": 0.8127231, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83426595, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7312, + "time_per_iteration": 2.4858858585357666 + }, + { + "auxiliary_loss_clip": 0.01115776, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.02404332, + "balance_loss_mlp": 1.04030704, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.2005104401689177, + "language_loss": 0.85444236, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87597632, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7313, + "time_per_iteration": 2.493032932281494 + }, + { + "auxiliary_loss_clip": 0.01119815, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.02176809, + "balance_loss_mlp": 1.04182911, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 3.8587100296686145, + "language_loss": 0.67464912, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69619775, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7314, + "time_per_iteration": 2.4542195796966553 + }, + { + "auxiliary_loss_clip": 0.01121265, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02063417, + "balance_loss_mlp": 1.04389846, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.8025616803524547, + "language_loss": 0.76954508, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79110146, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7315, + "time_per_iteration": 2.4988253116607666 + }, + { + "auxiliary_loss_clip": 0.01117641, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01938725, + "balance_loss_mlp": 1.04280567, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.4575060004131895, + "language_loss": 0.74807358, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76957744, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7316, + "time_per_iteration": 2.530104398727417 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02928746, + "balance_loss_mlp": 1.04640257, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.8405076524150568, + "language_loss": 0.65180635, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67343318, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7317, + "time_per_iteration": 2.5233771800994873 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02082634, + "balance_loss_mlp": 1.04455566, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.7710834755986071, + "language_loss": 0.7968365, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.8183977, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7318, + "time_per_iteration": 2.4618961811065674 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.04423118, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.4932738321413537, + "language_loss": 0.79472506, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81632113, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7319, + "time_per_iteration": 2.5342819690704346 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02637124, + "balance_loss_mlp": 1.04102063, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.641668171652613, + "language_loss": 0.80221331, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82378966, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7320, + "time_per_iteration": 2.520888566970825 + }, + { + "auxiliary_loss_clip": 0.01116164, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02323556, + "balance_loss_mlp": 1.04136741, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.6986497736973376, + "language_loss": 0.69795078, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71947479, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7321, + "time_per_iteration": 2.5457892417907715 + }, + { + "auxiliary_loss_clip": 0.01039878, + "auxiliary_loss_mlp": 0.01008287, + "balance_loss_clip": 1.00702953, + "balance_loss_mlp": 1.01681685, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8741267032944617, + "language_loss": 0.56908953, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58957124, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.23046875, + "step": 7322, + "time_per_iteration": 3.164207935333252 + }, + { + "auxiliary_loss_clip": 0.01117179, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01813388, + "balance_loss_mlp": 1.04277694, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.4567737767029483, + "language_loss": 0.76075542, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78222406, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.74609375, + "step": 7323, + "time_per_iteration": 2.5279085636138916 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.02369034, + "balance_loss_mlp": 1.0409224, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.5548582319563429, + "language_loss": 0.8034448, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82499135, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7324, + "time_per_iteration": 2.4854304790496826 + }, + { + "auxiliary_loss_clip": 0.01039688, + "auxiliary_loss_mlp": 0.01006776, + "balance_loss_clip": 1.00549471, + "balance_loss_mlp": 1.01659369, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.66599266679982, + "language_loss": 0.54557002, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56603467, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.23144531, + "step": 7325, + "time_per_iteration": 3.081268787384033 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01792467, + "balance_loss_mlp": 1.04348588, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.5427042359768692, + "language_loss": 0.69823551, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71969926, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7326, + "time_per_iteration": 2.489088535308838 + }, + { + "auxiliary_loss_clip": 0.01112531, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.01634765, + "balance_loss_mlp": 1.03926969, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.4106900729498488, + "language_loss": 0.76410896, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78552604, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7327, + "time_per_iteration": 2.5099427700042725 + }, + { + "auxiliary_loss_clip": 0.01112963, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.04029953, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.92290278058118, + "language_loss": 0.83856362, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86001813, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7328, + "time_per_iteration": 2.453078269958496 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.01505983, + "balance_loss_mlp": 1.04100752, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.489103584507488, + "language_loss": 0.77842677, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79985875, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7329, + "time_per_iteration": 2.4908933639526367 + }, + { + "auxiliary_loss_clip": 0.01118689, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02069247, + "balance_loss_mlp": 1.04125428, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.7085588184823939, + "language_loss": 0.73343551, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75496078, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7734375, + "step": 7330, + "time_per_iteration": 2.463330030441284 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.01974368, + "balance_loss_mlp": 1.04176283, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.7732063146110093, + "language_loss": 0.74867487, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77015924, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7331, + "time_per_iteration": 2.5421340465545654 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797271, + "balance_loss_mlp": 1.03957462, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4577784912363292, + "language_loss": 0.76381409, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78523266, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.72265625, + "step": 7332, + "time_per_iteration": 2.5218567848205566 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02724767, + "balance_loss_mlp": 1.03985786, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.6787739774558346, + "language_loss": 0.7317301, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75326777, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.74609375, + "step": 7333, + "time_per_iteration": 2.4611384868621826 + }, + { + "auxiliary_loss_clip": 0.01113948, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.0218792, + "balance_loss_mlp": 1.04222834, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.7946296457229314, + "language_loss": 0.79795265, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81943679, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7334, + "time_per_iteration": 2.4846577644348145 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.04168534, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.170087212124324, + "language_loss": 0.7549156, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77655965, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 7335, + "time_per_iteration": 2.5086324214935303 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.02661777, + "balance_loss_mlp": 1.04259086, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.91450979477167, + "language_loss": 0.72583538, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74741697, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7336, + "time_per_iteration": 2.436680555343628 + }, + { + "auxiliary_loss_clip": 0.01118765, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.0251267, + "balance_loss_mlp": 1.04040349, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 9.267090991138677, + "language_loss": 0.62665188, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64823085, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7337, + "time_per_iteration": 2.484269618988037 + }, + { + "auxiliary_loss_clip": 0.01039049, + "auxiliary_loss_mlp": 0.0100578, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01618195, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7410103266773326, + "language_loss": 0.52670205, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54715037, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7338, + "time_per_iteration": 3.104921340942383 + }, + { + "auxiliary_loss_clip": 0.01120745, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.03303015, + "balance_loss_mlp": 1.04076958, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.7777015345810536, + "language_loss": 0.70687723, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7285586, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7339, + "time_per_iteration": 2.5172934532165527 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.02603626, + "balance_loss_mlp": 1.04237795, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.6032661325040427, + "language_loss": 0.69992614, + "learning_rate": 2.472767915429105e-06, + "loss": 0.7214449, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 7340, + "time_per_iteration": 2.4677066802978516 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01002043, + "balance_loss_clip": 1.00078511, + "balance_loss_mlp": 1.01463652, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8913600985584349, + "language_loss": 0.64017105, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66056681, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7341, + "time_per_iteration": 2.87821888923645 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.02473783, + "balance_loss_mlp": 1.04029536, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.415120536593597, + "language_loss": 0.73162079, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75314075, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7342, + "time_per_iteration": 2.6009373664855957 + }, + { + "auxiliary_loss_clip": 0.01114735, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.02390742, + "balance_loss_mlp": 1.03866804, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.834134484008718, + "language_loss": 0.7961756, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81770158, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7343, + "time_per_iteration": 2.5102362632751465 + }, + { + "auxiliary_loss_clip": 0.01113089, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.03901291, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.5507634652992637, + "language_loss": 0.76845753, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.789895, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7344, + "time_per_iteration": 2.517014741897583 + }, + { + "auxiliary_loss_clip": 0.01036094, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99887604, + "balance_loss_mlp": 1.01319945, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7920555871551813, + "language_loss": 0.63752162, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65788519, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.22949219, + "step": 7345, + "time_per_iteration": 7.267446517944336 + }, + { + "auxiliary_loss_clip": 0.01121083, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.02610314, + "balance_loss_mlp": 1.04385495, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 2.1109182100548596, + "language_loss": 0.86316586, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88477224, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7346, + "time_per_iteration": 2.5508806705474854 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.02691066, + "balance_loss_mlp": 1.04238844, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.947149735733886, + "language_loss": 0.8050105, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82659858, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7347, + "time_per_iteration": 2.474933624267578 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02367926, + "balance_loss_mlp": 1.04158723, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.6941368254206504, + "language_loss": 0.82639945, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84794509, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7348, + "time_per_iteration": 2.4525363445281982 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.02335548, + "balance_loss_mlp": 1.04179621, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.5736626646923677, + "language_loss": 0.7025882, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72415352, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7349, + "time_per_iteration": 2.511890172958374 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.02426577, + "balance_loss_mlp": 1.03973794, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6465526230005572, + "language_loss": 0.74427998, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76583976, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7350, + "time_per_iteration": 2.496570110321045 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02167273, + "balance_loss_mlp": 1.0410589, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 1.9521663807923895, + "language_loss": 0.80709779, + "learning_rate": 2.468604167463827e-06, + "loss": 0.8286112, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 7351, + "time_per_iteration": 2.432551860809326 + }, + { + "auxiliary_loss_clip": 0.01111348, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.02401161, + "balance_loss_mlp": 1.03947091, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.5082806208548023, + "language_loss": 0.73055673, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75202954, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 7352, + "time_per_iteration": 2.515235424041748 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.0184238, + "balance_loss_mlp": 1.04159904, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.8470037483547026, + "language_loss": 0.87457407, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89606094, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7353, + "time_per_iteration": 2.4880294799804688 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.02288198, + "balance_loss_mlp": 1.04091954, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0344010928875567, + "language_loss": 0.75522006, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.77674222, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7578125, + "step": 7354, + "time_per_iteration": 2.454554319381714 + }, + { + "auxiliary_loss_clip": 0.01112104, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.0235672, + "balance_loss_mlp": 1.03940272, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.7346650465528282, + "language_loss": 0.64754039, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66901928, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7355, + "time_per_iteration": 2.711973190307617 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.04187799, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.914030541413853, + "language_loss": 0.78126168, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80283082, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7356, + "time_per_iteration": 2.470214366912842 + }, + { + "auxiliary_loss_clip": 0.01118926, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.02317214, + "balance_loss_mlp": 1.0414896, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5192892311950144, + "language_loss": 0.7712661, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79282331, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7357, + "time_per_iteration": 2.461174249649048 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.02454567, + "balance_loss_mlp": 1.0424664, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.4937655647898813, + "language_loss": 0.73591524, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75747615, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7358, + "time_per_iteration": 2.556330919265747 + }, + { + "auxiliary_loss_clip": 0.01117067, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01871514, + "balance_loss_mlp": 1.0415349, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.6567493539100802, + "language_loss": 0.75616974, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77764809, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.75390625, + "step": 7359, + "time_per_iteration": 2.50827693939209 + }, + { + "auxiliary_loss_clip": 0.01116785, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01920068, + "balance_loss_mlp": 1.04107249, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.743382279224751, + "language_loss": 0.7001307, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72162896, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7360, + "time_per_iteration": 2.4941389560699463 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.0205518, + "balance_loss_mlp": 1.04113221, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.0593935576965996, + "language_loss": 0.69252694, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71403772, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7361, + "time_per_iteration": 2.4985222816467285 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.01840568, + "balance_loss_mlp": 1.04025078, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 3.464971296188532, + "language_loss": 0.82380062, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84528339, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7362, + "time_per_iteration": 2.5396664142608643 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.0216198, + "balance_loss_mlp": 1.0414443, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.6248096382426125, + "language_loss": 0.74421227, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76578033, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 7363, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01036428, + "auxiliary_loss_mlp": 0.01011165, + "balance_loss_clip": 1.00969243, + "balance_loss_mlp": 1.0129478, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6750552451063064, + "language_loss": 0.55668789, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57716382, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.234375, + "step": 7364, + "time_per_iteration": 3.1631510257720947 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01869917, + "balance_loss_mlp": 1.0388242, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.5647849634077904, + "language_loss": 0.74008644, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76153356, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7365, + "time_per_iteration": 2.5025317668914795 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.02544355, + "balance_loss_mlp": 1.0385282, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5168930353966135, + "language_loss": 0.74242592, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76394439, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7366, + "time_per_iteration": 2.4882071018218994 + }, + { + "auxiliary_loss_clip": 0.01117127, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.01911306, + "balance_loss_mlp": 1.04244351, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 1.7268166919008578, + "language_loss": 0.73934573, + "learning_rate": 2.46254397374245e-06, + "loss": 0.7608456, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 7367, + "time_per_iteration": 2.494215250015259 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.02484, + "balance_loss_mlp": 1.04093957, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.708386000191459, + "language_loss": 0.7409333, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76247275, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7368, + "time_per_iteration": 2.5647008419036865 + }, + { + "auxiliary_loss_clip": 0.01114523, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.01905274, + "balance_loss_mlp": 1.04091215, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.8689444780395545, + "language_loss": 0.79986328, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82132554, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7369, + "time_per_iteration": 2.4666872024536133 + }, + { + "auxiliary_loss_clip": 0.01112296, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01627517, + "balance_loss_mlp": 1.04060125, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.7167890006148945, + "language_loss": 0.72231519, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74372596, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7370, + "time_per_iteration": 2.5508570671081543 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.02021682, + "balance_loss_mlp": 1.03883541, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.7515847136682843, + "language_loss": 0.70318949, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72465694, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7371, + "time_per_iteration": 2.4617960453033447 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03891456, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.199744355071377, + "language_loss": 0.68163198, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70304221, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.72265625, + "step": 7372, + "time_per_iteration": 2.4743239879608154 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01702499, + "balance_loss_mlp": 1.03971767, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.696523180994532, + "language_loss": 0.83959508, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.86105639, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7373, + "time_per_iteration": 2.44077467918396 + }, + { + "auxiliary_loss_clip": 0.01038641, + "auxiliary_loss_mlp": 0.01004854, + "balance_loss_clip": 1.00331616, + "balance_loss_mlp": 1.01527071, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.8140024563186875, + "language_loss": 0.55299437, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57342935, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.234375, + "step": 7374, + "time_per_iteration": 3.1360692977905273 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.02858198, + "balance_loss_mlp": 1.04092741, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.2551701608050636, + "language_loss": 0.82651508, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84807646, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 7375, + "time_per_iteration": 2.4277329444885254 + }, + { + "auxiliary_loss_clip": 0.01116501, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.04118764, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.7856786314152562, + "language_loss": 0.83470213, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.85615796, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7376, + "time_per_iteration": 2.481781482696533 + }, + { + "auxiliary_loss_clip": 0.01114604, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.02043331, + "balance_loss_mlp": 1.04121447, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7657537697851593, + "language_loss": 0.77321744, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79469293, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7377, + "time_per_iteration": 2.4599812030792236 + }, + { + "auxiliary_loss_clip": 0.01112621, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.01692927, + "balance_loss_mlp": 1.04132032, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.8620341755948002, + "language_loss": 0.75641978, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77784032, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7378, + "time_per_iteration": 2.5178849697113037 + }, + { + "auxiliary_loss_clip": 0.01114317, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.02302647, + "balance_loss_mlp": 1.04010391, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.670150777415059, + "language_loss": 0.69005907, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71155864, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7379, + "time_per_iteration": 2.460470199584961 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.04134107, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.5614200394729, + "language_loss": 0.73110741, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75256622, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7380, + "time_per_iteration": 2.5134148597717285 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.0408597, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.5217984285789272, + "language_loss": 0.6470772, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66853309, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7381, + "time_per_iteration": 2.5547850131988525 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.02020693, + "balance_loss_mlp": 1.04110599, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.3862697145357394, + "language_loss": 0.8018291, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82332134, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 7382, + "time_per_iteration": 2.575241804122925 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.02631903, + "balance_loss_mlp": 1.04359293, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.657830016653087, + "language_loss": 0.65369737, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67527372, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7383, + "time_per_iteration": 2.530205726623535 + }, + { + "auxiliary_loss_clip": 0.01118822, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.01928902, + "balance_loss_mlp": 1.04226518, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 3.0329093562680023, + "language_loss": 0.75660288, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77811974, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7384, + "time_per_iteration": 2.5266385078430176 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02092242, + "balance_loss_mlp": 1.04284334, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.5666997146068944, + "language_loss": 0.81029254, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83182013, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7385, + "time_per_iteration": 2.4479992389678955 + }, + { + "auxiliary_loss_clip": 0.01116566, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02111292, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.6468061831775258, + "language_loss": 0.82127023, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84278667, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7386, + "time_per_iteration": 2.48417067527771 + }, + { + "auxiliary_loss_clip": 0.01120079, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.02320611, + "balance_loss_mlp": 1.04189587, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.953099317045194, + "language_loss": 0.69732893, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71890771, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7387, + "time_per_iteration": 5.494876146316528 + }, + { + "auxiliary_loss_clip": 0.01114673, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.02546382, + "balance_loss_mlp": 1.03957582, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.035383956259629, + "language_loss": 0.7170803, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73861378, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7388, + "time_per_iteration": 2.4271323680877686 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.01776195, + "balance_loss_mlp": 1.04137266, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.4848855642281624, + "language_loss": 0.6881609, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.70965117, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7389, + "time_per_iteration": 2.4847142696380615 + }, + { + "auxiliary_loss_clip": 0.01115516, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.04167664, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.0051609497188587, + "language_loss": 0.74621141, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76768672, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7390, + "time_per_iteration": 2.594834804534912 + }, + { + "auxiliary_loss_clip": 0.01116041, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.04296565, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.702415761973985, + "language_loss": 0.811364, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83285546, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7391, + "time_per_iteration": 2.4757862091064453 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02070808, + "balance_loss_mlp": 1.04341137, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.6224407429556025, + "language_loss": 0.73400211, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75551033, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7392, + "time_per_iteration": 2.423929214477539 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01705766, + "balance_loss_mlp": 1.03988051, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.5529830220947678, + "language_loss": 0.79523122, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81666124, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7393, + "time_per_iteration": 2.5162272453308105 + }, + { + "auxiliary_loss_clip": 0.01119885, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.02125716, + "balance_loss_mlp": 1.04248941, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.9165659224437794, + "language_loss": 0.8090415, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83058566, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7394, + "time_per_iteration": 2.5386714935302734 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.02772927, + "balance_loss_mlp": 1.04228508, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 3.6807348725160502, + "language_loss": 0.79471326, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81626076, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7395, + "time_per_iteration": 2.4668092727661133 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02162027, + "balance_loss_mlp": 1.04132056, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.800276006342892, + "language_loss": 0.68493867, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70642376, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7396, + "time_per_iteration": 2.463660717010498 + }, + { + "auxiliary_loss_clip": 0.01117407, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.01831245, + "balance_loss_mlp": 1.0412426, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.8246827609425533, + "language_loss": 0.81007254, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83155811, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.76171875, + "step": 7397, + "time_per_iteration": 2.4812188148498535 + }, + { + "auxiliary_loss_clip": 0.01116158, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.02078366, + "balance_loss_mlp": 1.04323518, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.7006854584246183, + "language_loss": 0.67145807, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69295466, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7398, + "time_per_iteration": 2.5075526237487793 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.02443874, + "balance_loss_mlp": 1.04204428, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.9000444103330927, + "language_loss": 0.69551516, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71702719, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 7399, + "time_per_iteration": 2.522737741470337 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.02465105, + "balance_loss_mlp": 1.0408442, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.713461165054691, + "language_loss": 0.7287724, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.75027299, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7400, + "time_per_iteration": 2.4633662700653076 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.01755965, + "balance_loss_mlp": 1.04038024, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.5216060200654076, + "language_loss": 0.85054708, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87198627, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7401, + "time_per_iteration": 2.5034339427948 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02213049, + "balance_loss_mlp": 1.04065824, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.696028331559664, + "language_loss": 0.83296156, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85441685, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7109375, + "step": 7402, + "time_per_iteration": 2.501981258392334 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01979768, + "balance_loss_mlp": 1.0420711, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.736524647333069, + "language_loss": 0.76953578, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.7910167, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7403, + "time_per_iteration": 2.4778058528900146 + }, + { + "auxiliary_loss_clip": 0.01038113, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00046086, + "balance_loss_mlp": 1.014925, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7475420058163609, + "language_loss": 0.60081208, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62121159, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.23242188, + "step": 7404, + "time_per_iteration": 3.0548532009124756 + }, + { + "auxiliary_loss_clip": 0.01118666, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.02225208, + "balance_loss_mlp": 1.04285121, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.6312624429793499, + "language_loss": 0.81696916, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.83850771, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7405, + "time_per_iteration": 2.474632978439331 + }, + { + "auxiliary_loss_clip": 0.0111153, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.01864958, + "balance_loss_mlp": 1.03843176, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.4258557139975254, + "language_loss": 0.74869186, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77011788, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 7406, + "time_per_iteration": 2.4767563343048096 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01837921, + "balance_loss_mlp": 1.03819203, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.5627122296340765, + "language_loss": 0.65510803, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67650282, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 7407, + "time_per_iteration": 2.5395827293395996 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.01691461, + "balance_loss_mlp": 1.04306138, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5061477696527659, + "language_loss": 0.67724633, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69872296, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75390625, + "step": 7408, + "time_per_iteration": 2.462306261062622 + }, + { + "auxiliary_loss_clip": 0.0111265, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01891828, + "balance_loss_mlp": 1.0386107, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4978343447976226, + "language_loss": 0.71923941, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74068785, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7409, + "time_per_iteration": 2.674224615097046 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.02100968, + "balance_loss_mlp": 1.03980279, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.031581575195052, + "language_loss": 0.64823419, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.66972494, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 7410, + "time_per_iteration": 2.524874687194824 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.0200448, + "balance_loss_mlp": 1.04309118, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.015615502497161, + "language_loss": 0.74042189, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76196671, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7411, + "time_per_iteration": 2.512510061264038 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01900446, + "balance_loss_mlp": 1.04189968, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.869475782048451, + "language_loss": 0.79242551, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81386662, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7412, + "time_per_iteration": 2.472858190536499 + }, + { + "auxiliary_loss_clip": 0.01114909, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02299762, + "balance_loss_mlp": 1.03920937, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 3.400478569187806, + "language_loss": 0.798675, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82017869, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7413, + "time_per_iteration": 2.4117238521575928 + }, + { + "auxiliary_loss_clip": 0.01112114, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.01601171, + "balance_loss_mlp": 1.04039168, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.7210919700182319, + "language_loss": 0.76510686, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7865088, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 7414, + "time_per_iteration": 2.460224151611328 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02064216, + "balance_loss_mlp": 1.04047227, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.4395051245379855, + "language_loss": 0.83344847, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85491699, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7415, + "time_per_iteration": 2.487433910369873 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02675223, + "balance_loss_mlp": 1.03786182, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.5295363489819147, + "language_loss": 0.84025514, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86175931, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7416, + "time_per_iteration": 2.4827380180358887 + }, + { + "auxiliary_loss_clip": 0.0111513, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01872349, + "balance_loss_mlp": 1.03937066, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.5840815969934987, + "language_loss": 0.8099134, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83138216, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7417, + "time_per_iteration": 2.48150897026062 + }, + { + "auxiliary_loss_clip": 0.01115498, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.02679276, + "balance_loss_mlp": 1.04055572, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.9543176040955477, + "language_loss": 0.81078619, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83233768, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7418, + "time_per_iteration": 2.489847421646118 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01647544, + "balance_loss_mlp": 1.04015303, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.586204851514133, + "language_loss": 0.77404898, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79548573, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7419, + "time_per_iteration": 2.497434377670288 + }, + { + "auxiliary_loss_clip": 0.01112333, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.02190208, + "balance_loss_mlp": 1.03983605, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.7862585645473121, + "language_loss": 0.72408056, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74554545, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 7420, + "time_per_iteration": 2.459458351135254 + }, + { + "auxiliary_loss_clip": 0.01113499, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.0166688, + "balance_loss_mlp": 1.0416131, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.6779849239209732, + "language_loss": 0.75009704, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77153254, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 7421, + "time_per_iteration": 2.51987624168396 + }, + { + "auxiliary_loss_clip": 0.01110345, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04095602, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.9054244397804427, + "language_loss": 0.76410532, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78553158, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 7422, + "time_per_iteration": 2.4755024909973145 + }, + { + "auxiliary_loss_clip": 0.0111206, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.01975894, + "balance_loss_mlp": 1.03931499, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.4448000656244153, + "language_loss": 0.65126681, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6727066, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7423, + "time_per_iteration": 2.4828243255615234 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.01719534, + "balance_loss_mlp": 1.04027271, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.510185037273786, + "language_loss": 0.78842837, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.80981761, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 7424, + "time_per_iteration": 2.4399938583374023 + }, + { + "auxiliary_loss_clip": 0.01111318, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.02010214, + "balance_loss_mlp": 1.04070699, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.3563203456934205, + "language_loss": 0.80225039, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82368374, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 7425, + "time_per_iteration": 2.5406088829040527 + }, + { + "auxiliary_loss_clip": 0.01111697, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.0190568, + "balance_loss_mlp": 1.04027843, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 2.6114514678489895, + "language_loss": 0.77294517, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79437709, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7426, + "time_per_iteration": 2.4845876693725586 + }, + { + "auxiliary_loss_clip": 0.01112123, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.01636636, + "balance_loss_mlp": 1.03881311, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.552934875151276, + "language_loss": 0.64668226, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66808361, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.734375, + "step": 7427, + "time_per_iteration": 2.540630340576172 + }, + { + "auxiliary_loss_clip": 0.01116984, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.02009046, + "balance_loss_mlp": 1.04497719, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.583763048167789, + "language_loss": 0.75103819, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77252889, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7428, + "time_per_iteration": 3.8718421459198 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.03955674, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.8476152433667956, + "language_loss": 0.77595931, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79740107, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7429, + "time_per_iteration": 5.381062984466553 + }, + { + "auxiliary_loss_clip": 0.01114674, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.01935029, + "balance_loss_mlp": 1.04038692, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.5332211966047418, + "language_loss": 0.91229695, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93376398, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7430, + "time_per_iteration": 2.4677700996398926 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02439737, + "balance_loss_mlp": 1.04052413, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.5443417480404311, + "language_loss": 0.79630744, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81785798, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7431, + "time_per_iteration": 2.567082405090332 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02438116, + "balance_loss_mlp": 1.04187393, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 2.0676923701008807, + "language_loss": 0.80376756, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82531446, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7432, + "time_per_iteration": 2.4359145164489746 + }, + { + "auxiliary_loss_clip": 0.01115042, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.02394009, + "balance_loss_mlp": 1.03957176, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.022128912320156, + "language_loss": 0.76601076, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78752482, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.75390625, + "step": 7433, + "time_per_iteration": 2.48732852935791 + }, + { + "auxiliary_loss_clip": 0.0110862, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.0233326, + "balance_loss_mlp": 1.03873658, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.6660023236153727, + "language_loss": 0.7773807, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79880381, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.69921875, + "step": 7434, + "time_per_iteration": 2.501410961151123 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.02160966, + "balance_loss_mlp": 1.04261708, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6324454169441744, + "language_loss": 0.64255738, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66406941, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7435, + "time_per_iteration": 2.506918430328369 + }, + { + "auxiliary_loss_clip": 0.01116438, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.01706398, + "balance_loss_mlp": 1.04181314, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.4705490989927619, + "language_loss": 0.83558768, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.8570472, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7436, + "time_per_iteration": 2.482273817062378 + }, + { + "auxiliary_loss_clip": 0.01114793, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02067423, + "balance_loss_mlp": 1.0400939, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.6782401052542175, + "language_loss": 0.79564971, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81713653, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7437, + "time_per_iteration": 2.519118309020996 + }, + { + "auxiliary_loss_clip": 0.01114275, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.03965664, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.5877629147247494, + "language_loss": 0.71921134, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74067998, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7438, + "time_per_iteration": 2.4918689727783203 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02114832, + "balance_loss_mlp": 1.03908634, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.558408845854645, + "language_loss": 0.67469549, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.6961813, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7439, + "time_per_iteration": 2.549445390701294 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04164815, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.6525243551580215, + "language_loss": 0.73600596, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.7575227, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7440, + "time_per_iteration": 2.487545967102051 + }, + { + "auxiliary_loss_clip": 0.01112285, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02215195, + "balance_loss_mlp": 1.03937638, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.5916362290459067, + "language_loss": 0.74376386, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76522732, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73046875, + "step": 7441, + "time_per_iteration": 2.537848472595215 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04112506, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 2.062950208020596, + "language_loss": 0.74780977, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.769364, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7442, + "time_per_iteration": 2.45829701423645 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.03977489, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.7358505546612006, + "language_loss": 0.7456758, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76718801, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7443, + "time_per_iteration": 2.604759931564331 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.01500916, + "balance_loss_mlp": 1.0379262, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.8898561004653542, + "language_loss": 0.77591091, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79730821, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7444, + "time_per_iteration": 2.5373945236206055 + }, + { + "auxiliary_loss_clip": 0.01110179, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.01813924, + "balance_loss_mlp": 1.03841698, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.3020631966175893, + "language_loss": 0.85495317, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87636125, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7445, + "time_per_iteration": 2.4707260131835938 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01741982, + "balance_loss_mlp": 1.04191256, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 3.672789877680737, + "language_loss": 0.64349431, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66496813, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7446, + "time_per_iteration": 2.4802255630493164 + }, + { + "auxiliary_loss_clip": 0.0103814, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.00128329, + "balance_loss_mlp": 1.01421368, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7477421339074387, + "language_loss": 0.50242257, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52283025, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.24023438, + "step": 7447, + "time_per_iteration": 2.9262073040008545 + }, + { + "auxiliary_loss_clip": 0.01037975, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 1.00088537, + "balance_loss_mlp": 1.01407075, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7583700928831021, + "language_loss": 0.59290731, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61330867, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.23925781, + "step": 7448, + "time_per_iteration": 3.2298059463500977 + }, + { + "auxiliary_loss_clip": 0.01112419, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.02082074, + "balance_loss_mlp": 1.03913987, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.4697324100578784, + "language_loss": 0.59226847, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61372101, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7449, + "time_per_iteration": 2.667651891708374 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01638436, + "balance_loss_mlp": 1.04082561, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.6912833904949394, + "language_loss": 0.79799938, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8194316, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 7450, + "time_per_iteration": 2.488041400909424 + }, + { + "auxiliary_loss_clip": 0.01112446, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.01900911, + "balance_loss_mlp": 1.03948057, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.443005371711525, + "language_loss": 0.79474008, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81618094, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 7451, + "time_per_iteration": 2.4184346199035645 + }, + { + "auxiliary_loss_clip": 0.01037194, + "auxiliary_loss_mlp": 0.01000693, + "balance_loss_clip": 0.99944174, + "balance_loss_mlp": 1.01323009, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 2.1611139577707608, + "language_loss": 0.62848771, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64886659, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24023438, + "step": 7452, + "time_per_iteration": 3.1637966632843018 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04087877, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.7752989444397396, + "language_loss": 0.62657529, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64809442, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7453, + "time_per_iteration": 2.4473493099212646 + }, + { + "auxiliary_loss_clip": 0.01036714, + "auxiliary_loss_mlp": 0.01000711, + "balance_loss_clip": 0.99935836, + "balance_loss_mlp": 1.01265335, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7532005340797263, + "language_loss": 0.57028639, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59066069, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.0135498, + "router_z_loss_mlp": 0.24023438, + "step": 7454, + "time_per_iteration": 2.9524526596069336 + }, + { + "auxiliary_loss_clip": 0.01111502, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.01563811, + "balance_loss_mlp": 1.03850055, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.2509965352428334, + "language_loss": 0.75078607, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7721771, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7455, + "time_per_iteration": 2.4103891849517822 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.01761508, + "balance_loss_mlp": 1.03976846, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.6579032105665654, + "language_loss": 0.76428723, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78571379, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.734375, + "step": 7456, + "time_per_iteration": 2.5631935596466064 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.02398849, + "balance_loss_mlp": 1.04312015, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.9831255862845865, + "language_loss": 0.76475745, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78626615, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.71875, + "step": 7457, + "time_per_iteration": 2.464808702468872 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01611805, + "balance_loss_mlp": 1.03910387, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9767465188311044, + "language_loss": 0.67705971, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.69848609, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7458, + "time_per_iteration": 2.4457101821899414 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.04051626, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.5619796593676711, + "language_loss": 0.72202468, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74350572, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7459, + "time_per_iteration": 2.433029890060425 + }, + { + "auxiliary_loss_clip": 0.0110945, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01676071, + "balance_loss_mlp": 1.03716815, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.8000530949283695, + "language_loss": 0.69520539, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71659082, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7460, + "time_per_iteration": 2.4872210025787354 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02144051, + "balance_loss_mlp": 1.03848231, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9864484577730697, + "language_loss": 0.77204525, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79350454, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7461, + "time_per_iteration": 2.455543279647827 + }, + { + "auxiliary_loss_clip": 0.01111999, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0180943, + "balance_loss_mlp": 1.03780031, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7106561387980361, + "language_loss": 0.67983574, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70125341, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7421875, + "step": 7462, + "time_per_iteration": 2.5366299152374268 + }, + { + "auxiliary_loss_clip": 0.01034999, + "auxiliary_loss_mlp": 0.01002003, + "balance_loss_clip": 1.00071561, + "balance_loss_mlp": 1.01134682, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7463947253576576, + "language_loss": 0.54503644, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56540644, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.23632812, + "step": 7463, + "time_per_iteration": 3.0639255046844482 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.01699638, + "balance_loss_mlp": 1.03847826, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.9527582175804243, + "language_loss": 0.75866246, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78006899, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7464, + "time_per_iteration": 2.5135347843170166 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02225554, + "balance_loss_mlp": 1.03903246, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.8117694427226085, + "language_loss": 0.73671377, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75814927, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 7465, + "time_per_iteration": 2.433394432067871 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04127038, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 1.824586312100338, + "language_loss": 0.7996276, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82117152, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7466, + "time_per_iteration": 2.5013458728790283 + }, + { + "auxiliary_loss_clip": 0.01114545, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.01967633, + "balance_loss_mlp": 1.04118383, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.612382799524426, + "language_loss": 0.80522013, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82668447, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7467, + "time_per_iteration": 2.4517929553985596 + }, + { + "auxiliary_loss_clip": 0.01109457, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03988719, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.7208509955346651, + "language_loss": 0.75153285, + "learning_rate": 2.424187775642129e-06, + "loss": 0.7729429, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7468, + "time_per_iteration": 2.4585771560668945 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01025298, + "balance_loss_clip": 1.01422918, + "balance_loss_mlp": 1.04034877, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.8721286685005696, + "language_loss": 0.7099303, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73130596, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.71875, + "step": 7469, + "time_per_iteration": 2.420208692550659 + }, + { + "auxiliary_loss_clip": 0.01114048, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.02298415, + "balance_loss_mlp": 1.04046845, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7828692415308351, + "language_loss": 0.71891844, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74041635, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7470, + "time_per_iteration": 5.381145477294922 + }, + { + "auxiliary_loss_clip": 0.01112344, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.01940536, + "balance_loss_mlp": 1.03871441, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.1026178485463274, + "language_loss": 0.76912111, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79056853, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7471, + "time_per_iteration": 3.925541400909424 + }, + { + "auxiliary_loss_clip": 0.01113353, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.02128363, + "balance_loss_mlp": 1.04100883, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.8719894830330126, + "language_loss": 0.70339048, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72485489, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7472, + "time_per_iteration": 2.5138602256774902 + }, + { + "auxiliary_loss_clip": 0.01038244, + "auxiliary_loss_mlp": 0.01015151, + "balance_loss_clip": 1.01388156, + "balance_loss_mlp": 1.01404762, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7429949026472541, + "language_loss": 0.61734539, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63787931, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2421875, + "step": 7473, + "time_per_iteration": 3.0049262046813965 + }, + { + "auxiliary_loss_clip": 0.01114767, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.02495253, + "balance_loss_mlp": 1.04087818, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.4001000632965828, + "language_loss": 0.78185022, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80337679, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7474, + "time_per_iteration": 2.4396324157714844 + }, + { + "auxiliary_loss_clip": 0.01110455, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.0225265, + "balance_loss_mlp": 1.04009926, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.704620828516005, + "language_loss": 0.72103465, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74248827, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7475, + "time_per_iteration": 2.464167356491089 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01847768, + "balance_loss_mlp": 1.03917694, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.7869016250475191, + "language_loss": 0.76343799, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.7848621, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.73046875, + "step": 7476, + "time_per_iteration": 2.529374837875366 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.02194357, + "balance_loss_mlp": 1.04036331, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.3312494175836034, + "language_loss": 0.71774453, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73927242, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7477, + "time_per_iteration": 2.4914534091949463 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01757061, + "balance_loss_mlp": 1.04089749, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.2338487326584073, + "language_loss": 0.68136394, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70283794, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7478, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.01112091, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04130244, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8288012816153718, + "language_loss": 0.89528286, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91673213, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 7479, + "time_per_iteration": 2.4738242626190186 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01970804, + "balance_loss_mlp": 1.0423162, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.1133613410879155, + "language_loss": 0.75824946, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77972436, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7480, + "time_per_iteration": 2.536190986633301 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.01906347, + "balance_loss_mlp": 1.04211199, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.1813635775429794, + "language_loss": 0.80066407, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82214987, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7481, + "time_per_iteration": 2.4618031978607178 + }, + { + "auxiliary_loss_clip": 0.01110042, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01809597, + "balance_loss_mlp": 1.04028749, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.5995355023246276, + "language_loss": 0.68636084, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70776993, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 7482, + "time_per_iteration": 2.5711851119995117 + }, + { + "auxiliary_loss_clip": 0.0111451, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.02260911, + "balance_loss_mlp": 1.04059076, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0339843826279504, + "language_loss": 0.84802616, + "learning_rate": 2.418476956872571e-06, + "loss": 0.86952293, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7483, + "time_per_iteration": 2.4510746002197266 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02177286, + "balance_loss_mlp": 1.04386485, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8187080510096723, + "language_loss": 0.80409968, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82564819, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.765625, + "step": 7484, + "time_per_iteration": 2.539834976196289 + }, + { + "auxiliary_loss_clip": 0.01118118, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01529098, + "balance_loss_mlp": 1.03992271, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.310143901315373, + "language_loss": 0.75594473, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77741385, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 7485, + "time_per_iteration": 2.408979892730713 + }, + { + "auxiliary_loss_clip": 0.01041505, + "auxiliary_loss_mlp": 0.01002218, + "balance_loss_clip": 1.00065601, + "balance_loss_mlp": 1.0170331, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7895891566174408, + "language_loss": 0.5867179, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60715508, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.24511719, + "step": 7486, + "time_per_iteration": 3.09049654006958 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.0184797, + "balance_loss_mlp": 1.04104531, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.266854053846726, + "language_loss": 0.83153397, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85298264, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7487, + "time_per_iteration": 2.431209087371826 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.01626313, + "balance_loss_mlp": 1.04103804, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5035728003068896, + "language_loss": 0.77055335, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79197478, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7488, + "time_per_iteration": 2.5085837841033936 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02446008, + "balance_loss_mlp": 1.04378915, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.6401168824150574, + "language_loss": 0.71564645, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73724437, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7489, + "time_per_iteration": 2.5106120109558105 + }, + { + "auxiliary_loss_clip": 0.01119744, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.01856422, + "balance_loss_mlp": 1.04424906, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.1685657644370853, + "language_loss": 0.6962117, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71773469, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 7490, + "time_per_iteration": 2.4383597373962402 + }, + { + "auxiliary_loss_clip": 0.01038961, + "auxiliary_loss_mlp": 0.01000463, + "balance_loss_clip": 0.99907476, + "balance_loss_mlp": 1.01472032, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 1.805652104877531, + "language_loss": 0.56691748, + "learning_rate": 2.415429723843495e-06, + "loss": 0.5873118, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.2421875, + "step": 7491, + "time_per_iteration": 3.0662994384765625 + }, + { + "auxiliary_loss_clip": 0.01111025, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.01719177, + "balance_loss_mlp": 1.03987265, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.5869212574214921, + "language_loss": 0.79462028, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81602901, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7492, + "time_per_iteration": 2.497849464416504 + }, + { + "auxiliary_loss_clip": 0.01119638, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.02474022, + "balance_loss_mlp": 1.04271042, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.074371460837293, + "language_loss": 0.92560953, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.9471873, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7493, + "time_per_iteration": 2.4717981815338135 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01000993, + "balance_loss_clip": 0.99946707, + "balance_loss_mlp": 1.01443267, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.8118074327791402, + "language_loss": 0.62908041, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64948046, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.24609375, + "step": 7494, + "time_per_iteration": 3.1021509170532227 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02063334, + "balance_loss_mlp": 1.04122376, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4599772474200656, + "language_loss": 0.81980979, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.8412739, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 7495, + "time_per_iteration": 2.528707981109619 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.01793659, + "balance_loss_mlp": 1.04069221, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.6718702145442927, + "language_loss": 0.85639864, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87785244, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7496, + "time_per_iteration": 2.5862984657287598 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.01913798, + "balance_loss_mlp": 1.04234052, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.117680053603533, + "language_loss": 0.76342994, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78490651, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7497, + "time_per_iteration": 2.4831669330596924 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01798773, + "balance_loss_mlp": 1.03939152, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.971687057549937, + "language_loss": 0.75124824, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77270365, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7498, + "time_per_iteration": 2.4243438243865967 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02853799, + "balance_loss_mlp": 1.04190993, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.8265166276024245, + "language_loss": 0.70487583, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72645926, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7499, + "time_per_iteration": 2.496595859527588 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.02090549, + "balance_loss_mlp": 1.04258835, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.819855114084185, + "language_loss": 0.76870257, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79022616, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7500, + "time_per_iteration": 2.4659407138824463 + }, + { + "auxiliary_loss_clip": 0.01114886, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.01943755, + "balance_loss_mlp": 1.04146719, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.7705256698152247, + "language_loss": 0.62966442, + "learning_rate": 2.411619265641992e-06, + "loss": 0.6511355, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7501, + "time_per_iteration": 2.474149703979492 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.02093208, + "balance_loss_mlp": 1.04161, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9049764473951474, + "language_loss": 0.84758866, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86910677, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7502, + "time_per_iteration": 2.419093370437622 + }, + { + "auxiliary_loss_clip": 0.01111337, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.02135682, + "balance_loss_mlp": 1.04026246, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.4187712379612754, + "language_loss": 0.79906255, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8205111, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 7503, + "time_per_iteration": 2.536954164505005 + }, + { + "auxiliary_loss_clip": 0.01112743, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02458, + "balance_loss_mlp": 1.04287815, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 3.706114905397956, + "language_loss": 0.80931562, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83081251, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 7504, + "time_per_iteration": 2.4356000423431396 + }, + { + "auxiliary_loss_clip": 0.01112245, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.02284479, + "balance_loss_mlp": 1.04033744, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 5.269565558405545, + "language_loss": 0.63377774, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.6552459, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71875, + "step": 7505, + "time_per_iteration": 2.4934160709381104 + }, + { + "auxiliary_loss_clip": 0.01036723, + "auxiliary_loss_mlp": 0.0101133, + "balance_loss_clip": 1.00969648, + "balance_loss_mlp": 1.01246166, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8504866778221882, + "language_loss": 0.5887711, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60925162, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2421875, + "step": 7506, + "time_per_iteration": 3.1150898933410645 + }, + { + "auxiliary_loss_clip": 0.01112738, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.02087879, + "balance_loss_mlp": 1.04194486, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.6347442617822043, + "language_loss": 0.79238498, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81385183, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7507, + "time_per_iteration": 2.484036684036255 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.019871, + "balance_loss_mlp": 1.04084098, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5108356171854629, + "language_loss": 0.7397756, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76126289, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7508, + "time_per_iteration": 2.4958505630493164 + }, + { + "auxiliary_loss_clip": 0.01112961, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.02423549, + "balance_loss_mlp": 1.04263186, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.9053667394121476, + "language_loss": 0.78955048, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81104517, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 7509, + "time_per_iteration": 2.4640209674835205 + }, + { + "auxiliary_loss_clip": 0.01114289, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01759398, + "balance_loss_mlp": 1.0420239, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.8944319049742213, + "language_loss": 0.73495883, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75640076, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7510, + "time_per_iteration": 2.462289810180664 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01856017, + "balance_loss_mlp": 1.04091644, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.9974195471898801, + "language_loss": 0.77053016, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79200888, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7511, + "time_per_iteration": 2.5831305980682373 + }, + { + "auxiliary_loss_clip": 0.01114808, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.01937711, + "balance_loss_mlp": 1.04086745, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.734048899080759, + "language_loss": 0.79124206, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.81271791, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7512, + "time_per_iteration": 6.862476587295532 + }, + { + "auxiliary_loss_clip": 0.01118735, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.02435863, + "balance_loss_mlp": 1.04064548, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 1.9681233127218394, + "language_loss": 0.87461096, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89617801, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7513, + "time_per_iteration": 2.5551092624664307 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01893246, + "balance_loss_mlp": 1.0379355, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.6638824980939535, + "language_loss": 0.67135286, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69271272, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 7514, + "time_per_iteration": 2.4804775714874268 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.01448536, + "balance_loss_mlp": 1.04221404, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.644844833078513, + "language_loss": 0.69455916, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71601617, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.734375, + "step": 7515, + "time_per_iteration": 2.530089855194092 + }, + { + "auxiliary_loss_clip": 0.01117096, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.02060795, + "balance_loss_mlp": 1.04084945, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.154684023631233, + "language_loss": 0.81658673, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83810514, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7516, + "time_per_iteration": 2.405810832977295 + }, + { + "auxiliary_loss_clip": 0.01111826, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.01940227, + "balance_loss_mlp": 1.04099917, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.5513632113186169, + "language_loss": 0.65810448, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.6795482, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 7517, + "time_per_iteration": 2.487539768218994 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.04066491, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.7604175245242084, + "language_loss": 0.63401121, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65539253, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 7518, + "time_per_iteration": 2.4280178546905518 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.02124858, + "balance_loss_mlp": 1.04022479, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4125127095428567, + "language_loss": 0.59552354, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61698353, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7519, + "time_per_iteration": 2.706774950027466 + }, + { + "auxiliary_loss_clip": 0.01114162, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.02404702, + "balance_loss_mlp": 1.04053855, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.3128892020538214, + "language_loss": 0.72288704, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74439663, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7520, + "time_per_iteration": 2.4802541732788086 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.0250659, + "balance_loss_mlp": 1.04033482, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.8726393810843218, + "language_loss": 0.75520414, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77671039, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7521, + "time_per_iteration": 2.4384777545928955 + }, + { + "auxiliary_loss_clip": 0.0111833, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.022416, + "balance_loss_mlp": 1.04222465, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.6736116772601735, + "language_loss": 0.67521721, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69675779, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7522, + "time_per_iteration": 2.4317188262939453 + }, + { + "auxiliary_loss_clip": 0.01109922, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.03857231, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.5002177443666298, + "language_loss": 0.60627949, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62771761, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 7523, + "time_per_iteration": 2.5312907695770264 + }, + { + "auxiliary_loss_clip": 0.01116524, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02556825, + "balance_loss_mlp": 1.0399549, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.0404967948828796, + "language_loss": 0.78325248, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80481124, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7524, + "time_per_iteration": 2.4078996181488037 + }, + { + "auxiliary_loss_clip": 0.01111336, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02216387, + "balance_loss_mlp": 1.03912878, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.9789251534337415, + "language_loss": 0.63518596, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65664744, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 7525, + "time_per_iteration": 2.503176212310791 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02262783, + "balance_loss_mlp": 1.04040241, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5288172547930599, + "language_loss": 0.79163349, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8131057, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7526, + "time_per_iteration": 2.4558780193328857 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.6413449131819307, + "language_loss": 0.80729342, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82871962, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 7527, + "time_per_iteration": 2.470186948776245 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.01566291, + "balance_loss_mlp": 1.039428, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.450835161887395, + "language_loss": 0.65505683, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67645425, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7528, + "time_per_iteration": 2.541700601577759 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02031136, + "balance_loss_mlp": 1.03976476, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.6649436204324595, + "language_loss": 0.7542727, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.7757026, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7529, + "time_per_iteration": 2.5726876258850098 + }, + { + "auxiliary_loss_clip": 0.01112607, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.01853299, + "balance_loss_mlp": 1.03971684, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.7825780716691442, + "language_loss": 0.73193467, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75336862, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73046875, + "step": 7530, + "time_per_iteration": 2.4584052562713623 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02193975, + "balance_loss_mlp": 1.04003453, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6012488985464985, + "language_loss": 0.75947326, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78094089, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.734375, + "step": 7531, + "time_per_iteration": 2.484959363937378 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01034859, + "balance_loss_clip": 1.02182305, + "balance_loss_mlp": 1.0382148, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4359815558452909, + "language_loss": 0.66874713, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69017947, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7532, + "time_per_iteration": 2.486598253250122 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.02322936, + "balance_loss_mlp": 1.04091084, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.0450394734969874, + "language_loss": 0.78902352, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81049943, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 7533, + "time_per_iteration": 2.4407958984375 + }, + { + "auxiliary_loss_clip": 0.01115719, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.02032459, + "balance_loss_mlp": 1.03807485, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.646532255034537, + "language_loss": 0.83279264, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85429263, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7534, + "time_per_iteration": 2.430670976638794 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.02148068, + "balance_loss_mlp": 1.03927064, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.4654832124358697, + "language_loss": 0.76578003, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78726631, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7535, + "time_per_iteration": 2.4744579792022705 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01957679, + "balance_loss_mlp": 1.03883696, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5977579258117844, + "language_loss": 0.80234635, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82375443, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 7536, + "time_per_iteration": 2.4481444358825684 + }, + { + "auxiliary_loss_clip": 0.01111518, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.0173198, + "balance_loss_mlp": 1.03711987, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.0610118763249536, + "language_loss": 0.75895774, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78037184, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7537, + "time_per_iteration": 2.430119276046753 + }, + { + "auxiliary_loss_clip": 0.01115071, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.02058339, + "balance_loss_mlp": 1.04172075, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.095176663386117, + "language_loss": 0.76420474, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78567952, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.734375, + "step": 7538, + "time_per_iteration": 2.4675159454345703 + }, + { + "auxiliary_loss_clip": 0.01041439, + "auxiliary_loss_mlp": 0.0100041, + "balance_loss_clip": 0.99908096, + "balance_loss_mlp": 1.01700771, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7965700347609973, + "language_loss": 0.62345123, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64386964, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24414062, + "step": 7539, + "time_per_iteration": 3.0961101055145264 + }, + { + "auxiliary_loss_clip": 0.01112571, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.02466285, + "balance_loss_mlp": 1.04064226, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.8102149318529874, + "language_loss": 0.65997463, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68146718, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7540, + "time_per_iteration": 2.418170928955078 + }, + { + "auxiliary_loss_clip": 0.01118532, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.02721667, + "balance_loss_mlp": 1.04177594, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.6922846601909878, + "language_loss": 0.84666622, + "learning_rate": 2.396361968778424e-06, + "loss": 0.86825818, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7541, + "time_per_iteration": 2.4960954189300537 + }, + { + "auxiliary_loss_clip": 0.01113117, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01888943, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7180151747286094, + "language_loss": 0.76435781, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78580016, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7542, + "time_per_iteration": 2.574286937713623 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.01687872, + "balance_loss_mlp": 1.04101157, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.4680148354813627, + "language_loss": 0.80267954, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82412398, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7543, + "time_per_iteration": 2.5228359699249268 + }, + { + "auxiliary_loss_clip": 0.01115681, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.02517343, + "balance_loss_mlp": 1.04107285, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.6471991367559184, + "language_loss": 0.75933033, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78086591, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7544, + "time_per_iteration": 2.4976110458374023 + }, + { + "auxiliary_loss_clip": 0.01117877, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02154779, + "balance_loss_mlp": 1.04304671, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.8438932042246456, + "language_loss": 0.75447458, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77599108, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75, + "step": 7545, + "time_per_iteration": 2.5022737979888916 + }, + { + "auxiliary_loss_clip": 0.01114305, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.01697659, + "balance_loss_mlp": 1.04100811, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5497429650402368, + "language_loss": 0.7210325, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74247307, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7546, + "time_per_iteration": 2.5246150493621826 + }, + { + "auxiliary_loss_clip": 0.01118375, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.01857507, + "balance_loss_mlp": 1.04212511, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.558937793954525, + "language_loss": 0.7557559, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77726084, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7547, + "time_per_iteration": 2.4949920177459717 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02041912, + "balance_loss_mlp": 1.04200041, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.0285954992459865, + "language_loss": 0.69878972, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72029251, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7548, + "time_per_iteration": 2.4486818313598633 + }, + { + "auxiliary_loss_clip": 0.01114191, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.04018688, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.0627316040888117, + "language_loss": 0.72691673, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74846196, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7549, + "time_per_iteration": 2.509470224380493 + }, + { + "auxiliary_loss_clip": 0.01112378, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01698172, + "balance_loss_mlp": 1.04035378, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.7136809619022837, + "language_loss": 0.65253317, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67394793, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7550, + "time_per_iteration": 2.5133440494537354 + }, + { + "auxiliary_loss_clip": 0.01113494, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.0250591, + "balance_loss_mlp": 1.04179323, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6025854653449239, + "language_loss": 0.68823695, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70974535, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 7551, + "time_per_iteration": 2.5188024044036865 + }, + { + "auxiliary_loss_clip": 0.01113711, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.02022541, + "balance_loss_mlp": 1.03923821, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.6542843637965088, + "language_loss": 0.79214859, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81361675, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7552, + "time_per_iteration": 2.4087183475494385 + }, + { + "auxiliary_loss_clip": 0.01039804, + "auxiliary_loss_mlp": 0.01010172, + "balance_loss_clip": 1.00893259, + "balance_loss_mlp": 1.01586497, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8232859688183145, + "language_loss": 0.57765305, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59815282, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.24023438, + "step": 7553, + "time_per_iteration": 4.437517881393433 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02388608, + "balance_loss_mlp": 1.03907371, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3573100009257986, + "language_loss": 0.76541936, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78688413, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71875, + "step": 7554, + "time_per_iteration": 5.404860258102417 + }, + { + "auxiliary_loss_clip": 0.01116899, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.01859498, + "balance_loss_mlp": 1.04073453, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.6663912268828156, + "language_loss": 0.77148789, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79297936, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 7555, + "time_per_iteration": 2.5254242420196533 + }, + { + "auxiliary_loss_clip": 0.01111282, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.268885764239303, + "language_loss": 0.72658741, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74803221, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7556, + "time_per_iteration": 2.5096001625061035 + }, + { + "auxiliary_loss_clip": 0.01117527, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01987422, + "balance_loss_mlp": 1.0412432, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.9256457801142723, + "language_loss": 0.63244998, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65395546, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 7557, + "time_per_iteration": 2.489269495010376 + }, + { + "auxiliary_loss_clip": 0.010384, + "auxiliary_loss_mlp": 0.01000398, + "balance_loss_clip": 0.99909872, + "balance_loss_mlp": 1.01432419, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6891763329400619, + "language_loss": 0.57655525, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59694326, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24023438, + "step": 7558, + "time_per_iteration": 2.9631850719451904 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.02106977, + "balance_loss_mlp": 1.04180217, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.9054431891281847, + "language_loss": 0.56152129, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58304584, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7559, + "time_per_iteration": 2.4718172550201416 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.02320707, + "balance_loss_mlp": 1.04311991, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 2.1225715432080863, + "language_loss": 0.72038132, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74190605, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7560, + "time_per_iteration": 2.4289052486419678 + }, + { + "auxiliary_loss_clip": 0.01118313, + "auxiliary_loss_mlp": 0.01032424, + "balance_loss_clip": 1.01870942, + "balance_loss_mlp": 1.04184937, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 1.8567895139214563, + "language_loss": 0.68786752, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70937485, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7561, + "time_per_iteration": 2.483013153076172 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.01933646, + "balance_loss_mlp": 1.04098606, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.6472040447099916, + "language_loss": 0.84813452, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.86956006, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7562, + "time_per_iteration": 2.435842752456665 + }, + { + "auxiliary_loss_clip": 0.0111239, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02132261, + "balance_loss_mlp": 1.0416292, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.8588056575997567, + "language_loss": 0.89808047, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91954148, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7563, + "time_per_iteration": 2.4962618350982666 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.02425742, + "balance_loss_mlp": 1.03999305, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.9222778596605532, + "language_loss": 0.71644425, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73795712, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7564, + "time_per_iteration": 2.4343371391296387 + }, + { + "auxiliary_loss_clip": 0.01115348, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.01898563, + "balance_loss_mlp": 1.04060352, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.0985180699884496, + "language_loss": 0.67973971, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70120943, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7565, + "time_per_iteration": 2.5114333629608154 + }, + { + "auxiliary_loss_clip": 0.0111081, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01651037, + "balance_loss_mlp": 1.03948641, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.555148092913002, + "language_loss": 0.80112624, + "learning_rate": 2.386813887534922e-06, + "loss": 0.8225264, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7566, + "time_per_iteration": 2.4678473472595215 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.01451695, + "balance_loss_mlp": 1.04058981, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.5438575571986708, + "language_loss": 0.73526263, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75669444, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7567, + "time_per_iteration": 2.4749765396118164 + }, + { + "auxiliary_loss_clip": 0.01117694, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.01924706, + "balance_loss_mlp": 1.04315984, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.4420173241258303, + "language_loss": 0.80870211, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83019841, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7568, + "time_per_iteration": 2.5098068714141846 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.02927494, + "balance_loss_mlp": 1.04110444, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.9046518074434846, + "language_loss": 0.79472029, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81635177, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7569, + "time_per_iteration": 2.5105931758880615 + }, + { + "auxiliary_loss_clip": 0.0111814, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01811135, + "balance_loss_mlp": 1.04233003, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3375300297611126, + "language_loss": 0.74826288, + "learning_rate": 2.385285337909412e-06, + "loss": 0.76976812, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7570, + "time_per_iteration": 2.5360968112945557 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01037907, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04281187, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.5540611030471656, + "language_loss": 0.74696088, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76847816, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7571, + "time_per_iteration": 2.5796499252319336 + }, + { + "auxiliary_loss_clip": 0.01110782, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.01708317, + "balance_loss_mlp": 1.04096079, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.522963408290285, + "language_loss": 0.81392241, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83532542, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 7572, + "time_per_iteration": 2.452230215072632 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02052081, + "balance_loss_mlp": 1.04266822, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.158291075293226, + "language_loss": 0.72932756, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75086331, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7573, + "time_per_iteration": 2.547351598739624 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02106202, + "balance_loss_mlp": 1.04362583, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.8799787689923733, + "language_loss": 0.74544156, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76700127, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7574, + "time_per_iteration": 2.512343406677246 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.01870358, + "balance_loss_mlp": 1.0413028, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.8832109226527793, + "language_loss": 0.7161721, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73765397, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7575, + "time_per_iteration": 2.516036033630371 + }, + { + "auxiliary_loss_clip": 0.01114571, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0174526, + "balance_loss_mlp": 1.04138458, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.7001526143902996, + "language_loss": 0.73163939, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75308996, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7576, + "time_per_iteration": 2.446596145629883 + }, + { + "auxiliary_loss_clip": 0.01114194, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.04252386, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.8829162969496007, + "language_loss": 0.66556787, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68706656, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 7577, + "time_per_iteration": 2.496425151824951 + }, + { + "auxiliary_loss_clip": 0.01119433, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.02655983, + "balance_loss_mlp": 1.04481244, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.905892479596231, + "language_loss": 0.74408162, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76568818, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.74609375, + "step": 7578, + "time_per_iteration": 2.4517569541931152 + }, + { + "auxiliary_loss_clip": 0.01117156, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.0432775, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.9332037742405612, + "language_loss": 0.70189863, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72338867, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7579, + "time_per_iteration": 2.5487825870513916 + }, + { + "auxiliary_loss_clip": 0.0111145, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02031469, + "balance_loss_mlp": 1.03969145, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.6152122780510265, + "language_loss": 0.78727221, + "learning_rate": 2.381462943170627e-06, + "loss": 0.8087157, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7580, + "time_per_iteration": 2.465355157852173 + }, + { + "auxiliary_loss_clip": 0.01115593, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.01463163, + "balance_loss_mlp": 1.04341292, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.4438503581091628, + "language_loss": 0.68864352, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71007979, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 7581, + "time_per_iteration": 2.6738851070404053 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01742125, + "balance_loss_mlp": 1.03975797, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.5604567804249607, + "language_loss": 0.73416924, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75558978, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7582, + "time_per_iteration": 2.5402657985687256 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01045735, + "balance_loss_clip": 1.03065467, + "balance_loss_mlp": 1.04215884, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.7600515256353326, + "language_loss": 0.72337949, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74501801, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7578125, + "step": 7583, + "time_per_iteration": 2.51399564743042 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.02300692, + "balance_loss_mlp": 1.04282498, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.711799016610791, + "language_loss": 0.72402817, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74558389, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 7584, + "time_per_iteration": 2.4907238483428955 + }, + { + "auxiliary_loss_clip": 0.01116974, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01922798, + "balance_loss_mlp": 1.04356861, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.4921764730017937, + "language_loss": 0.68272889, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70422149, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7585, + "time_per_iteration": 2.5741868019104004 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01928306, + "balance_loss_mlp": 1.04099321, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.3206982799231843, + "language_loss": 0.76102924, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78248823, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7586, + "time_per_iteration": 2.466991662979126 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01651812, + "balance_loss_mlp": 1.0406158, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.9114474136682882, + "language_loss": 0.77912259, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80052596, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71484375, + "step": 7587, + "time_per_iteration": 2.534231185913086 + }, + { + "auxiliary_loss_clip": 0.01118125, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.02616787, + "balance_loss_mlp": 1.03976679, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.2451216970422068, + "language_loss": 0.69211191, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71368635, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.78125, + "step": 7588, + "time_per_iteration": 2.4104104042053223 + }, + { + "auxiliary_loss_clip": 0.011124, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.02178395, + "balance_loss_mlp": 1.0401839, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.610626761932897, + "language_loss": 0.79335272, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81481898, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 7589, + "time_per_iteration": 2.465728998184204 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.01955092, + "balance_loss_mlp": 1.04108429, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.193606067712595, + "language_loss": 0.6227479, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64421678, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7590, + "time_per_iteration": 2.509962558746338 + }, + { + "auxiliary_loss_clip": 0.0111218, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02830625, + "balance_loss_mlp": 1.03874183, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 5.263909382371274, + "language_loss": 0.72727275, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74880284, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7591, + "time_per_iteration": 2.529491424560547 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.02413523, + "balance_loss_mlp": 1.04252648, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.0725698163141058, + "language_loss": 0.76985544, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79140294, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7592, + "time_per_iteration": 2.4446723461151123 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01604521, + "balance_loss_mlp": 1.04070461, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 1.9266503814961675, + "language_loss": 0.69611561, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71753979, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7593, + "time_per_iteration": 2.4879302978515625 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03803527, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.17790627040614, + "language_loss": 0.84199911, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86338425, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 7594, + "time_per_iteration": 2.464733362197876 + }, + { + "auxiliary_loss_clip": 0.01035796, + "auxiliary_loss_mlp": 0.00998737, + "balance_loss_clip": 0.99745506, + "balance_loss_mlp": 1.01167154, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7964417819777524, + "language_loss": 0.52721512, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54756045, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.2421875, + "step": 7595, + "time_per_iteration": 6.0974061489105225 + }, + { + "auxiliary_loss_clip": 0.01117501, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01843739, + "balance_loss_mlp": 1.04165292, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.1595430840247714, + "language_loss": 0.87448329, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89597577, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7596, + "time_per_iteration": 3.862628936767578 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.02698088, + "balance_loss_mlp": 1.03993344, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 2.2425847761174196, + "language_loss": 0.77131474, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79284477, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7597, + "time_per_iteration": 2.4821672439575195 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02122104, + "balance_loss_mlp": 1.04004443, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.7340388440754042, + "language_loss": 0.78560513, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80708742, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7598, + "time_per_iteration": 2.4350392818450928 + }, + { + "auxiliary_loss_clip": 0.01113148, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01735115, + "balance_loss_mlp": 1.04057133, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.435026889485133, + "language_loss": 0.71715307, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73857641, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7599, + "time_per_iteration": 2.5838844776153564 + }, + { + "auxiliary_loss_clip": 0.01108114, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.02238345, + "balance_loss_mlp": 1.03702497, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.734840239500452, + "language_loss": 0.69377261, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71520597, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7600, + "time_per_iteration": 2.4499921798706055 + }, + { + "auxiliary_loss_clip": 0.01112216, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.02584386, + "balance_loss_mlp": 1.03979039, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.9688741418230387, + "language_loss": 0.78654951, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80805302, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7601, + "time_per_iteration": 2.555522918701172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02500272, + "balance_loss_mlp": 1.04013097, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.706657696767707, + "language_loss": 0.71609282, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73760259, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73828125, + "step": 7602, + "time_per_iteration": 2.6383092403411865 + }, + { + "auxiliary_loss_clip": 0.01112609, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.01735842, + "balance_loss_mlp": 1.03901231, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.778856324344474, + "language_loss": 0.72776276, + "learning_rate": 2.372665969608729e-06, + "loss": 0.7492069, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7603, + "time_per_iteration": 2.566542387008667 + }, + { + "auxiliary_loss_clip": 0.01113258, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.02284837, + "balance_loss_mlp": 1.03945732, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.783042546573846, + "language_loss": 0.83495164, + "learning_rate": 2.372283321642383e-06, + "loss": 0.8564586, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 7604, + "time_per_iteration": 2.4322941303253174 + }, + { + "auxiliary_loss_clip": 0.0112315, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04472041, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.742561007105776, + "language_loss": 0.85827744, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87986767, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 7605, + "time_per_iteration": 2.495654582977295 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.04045463, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.9150435252301277, + "language_loss": 0.73814523, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75966263, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7606, + "time_per_iteration": 2.472698926925659 + }, + { + "auxiliary_loss_clip": 0.01115234, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02525389, + "balance_loss_mlp": 1.03985333, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 4.395321075422478, + "language_loss": 0.7975688, + "learning_rate": 2.371135293099262e-06, + "loss": 0.81911278, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7607, + "time_per_iteration": 2.500666618347168 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.0436604, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 2.5876510188713437, + "language_loss": 0.80827034, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.82987565, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7608, + "time_per_iteration": 2.454738140106201 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.02624631, + "balance_loss_mlp": 1.03830588, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.6879461416077837, + "language_loss": 0.68500757, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70654094, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7609, + "time_per_iteration": 2.567387580871582 + }, + { + "auxiliary_loss_clip": 0.01113281, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02609158, + "balance_loss_mlp": 1.03981042, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.861126687806453, + "language_loss": 0.80749559, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82902324, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7610, + "time_per_iteration": 2.5181450843811035 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.02122259, + "balance_loss_mlp": 1.04017019, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 1.991436967054915, + "language_loss": 0.82063943, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84214383, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7611, + "time_per_iteration": 2.5181667804718018 + }, + { + "auxiliary_loss_clip": 0.01117824, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.01684475, + "balance_loss_mlp": 1.04256463, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.7999257820591783, + "language_loss": 0.74032104, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76180184, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7612, + "time_per_iteration": 2.573192596435547 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.01775634, + "balance_loss_mlp": 1.03739977, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.4998899682115554, + "language_loss": 0.84958243, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87100732, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7613, + "time_per_iteration": 2.519374132156372 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01548696, + "balance_loss_mlp": 1.04007339, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.119092433129462, + "language_loss": 0.75686407, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77829111, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7614, + "time_per_iteration": 2.435258388519287 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01870763, + "balance_loss_mlp": 1.03973377, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.4729553038511707, + "language_loss": 0.74797261, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76940382, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7615, + "time_per_iteration": 2.4776275157928467 + }, + { + "auxiliary_loss_clip": 0.01037994, + "auxiliary_loss_mlp": 0.00999141, + "balance_loss_clip": 0.99766314, + "balance_loss_mlp": 1.01355577, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7822572530544061, + "language_loss": 0.57660586, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59697717, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.24414062, + "step": 7616, + "time_per_iteration": 2.9986298084259033 + }, + { + "auxiliary_loss_clip": 0.01111756, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0251503, + "balance_loss_mlp": 1.03939307, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.5412759634284317, + "language_loss": 0.70953274, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73103696, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 7617, + "time_per_iteration": 2.514575958251953 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.04211044, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.1003257335678245, + "language_loss": 0.76458549, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78607446, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7618, + "time_per_iteration": 2.431196689605713 + }, + { + "auxiliary_loss_clip": 0.01118549, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.0323689, + "balance_loss_mlp": 1.0429455, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.7069120237831286, + "language_loss": 0.76705682, + "learning_rate": 2.366541916231585e-06, + "loss": 0.788692, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7619, + "time_per_iteration": 2.491133213043213 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02378964, + "balance_loss_mlp": 1.04174709, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.9887034550999254, + "language_loss": 0.7175532, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73904121, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7620, + "time_per_iteration": 2.429659366607666 + }, + { + "auxiliary_loss_clip": 0.0111009, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.01643384, + "balance_loss_mlp": 1.03828478, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 2.3637648648526035, + "language_loss": 0.78374821, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80513632, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 7621, + "time_per_iteration": 2.69990611076355 + }, + { + "auxiliary_loss_clip": 0.01037733, + "auxiliary_loss_mlp": 0.01001998, + "balance_loss_clip": 1.00071096, + "balance_loss_mlp": 1.01315987, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7958411378428579, + "language_loss": 0.6499809, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67037821, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.24609375, + "step": 7622, + "time_per_iteration": 3.0476205348968506 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01667762, + "balance_loss_mlp": 1.04142582, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.9256202714320767, + "language_loss": 0.79611146, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81755722, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7623, + "time_per_iteration": 2.547234535217285 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.02382421, + "balance_loss_mlp": 1.04050457, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.996922752989922, + "language_loss": 0.70809233, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72962081, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75390625, + "step": 7624, + "time_per_iteration": 2.442575693130493 + }, + { + "auxiliary_loss_clip": 0.01113872, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.01944637, + "balance_loss_mlp": 1.0383656, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.876738245253823, + "language_loss": 0.7299192, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75138104, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7625, + "time_per_iteration": 2.53002667427063 + }, + { + "auxiliary_loss_clip": 0.01116016, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.02058113, + "balance_loss_mlp": 1.04226136, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 3.1470354950748716, + "language_loss": 0.78132713, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80281818, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7626, + "time_per_iteration": 2.4544708728790283 + }, + { + "auxiliary_loss_clip": 0.01117004, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02270126, + "balance_loss_mlp": 1.04142714, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.604401840334718, + "language_loss": 0.85191864, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87344688, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7627, + "time_per_iteration": 2.478867769241333 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.02293992, + "balance_loss_mlp": 1.04074025, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.506714204397822, + "language_loss": 0.69413865, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71568, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7628, + "time_per_iteration": 2.5127782821655273 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01654255, + "balance_loss_mlp": 1.04060626, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.5379008002675938, + "language_loss": 0.78294545, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.8043794, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7629, + "time_per_iteration": 2.4944000244140625 + }, + { + "auxiliary_loss_clip": 0.0111907, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.04031289, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.0009780664883223, + "language_loss": 0.79405141, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81563896, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 7630, + "time_per_iteration": 2.443598747253418 + }, + { + "auxiliary_loss_clip": 0.0111732, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.02108812, + "balance_loss_mlp": 1.03952336, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.67887072973593, + "language_loss": 0.71819407, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.73971653, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7631, + "time_per_iteration": 2.613935708999634 + }, + { + "auxiliary_loss_clip": 0.01118321, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02248299, + "balance_loss_mlp": 1.04306722, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.655938907200588, + "language_loss": 0.71337265, + "learning_rate": 2.361563500108531e-06, + "loss": 0.7349205, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7632, + "time_per_iteration": 2.4854414463043213 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.0190748, + "balance_loss_mlp": 1.04055059, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 15.51679170955813, + "language_loss": 0.69212449, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71364582, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7633, + "time_per_iteration": 2.488741874694824 + }, + { + "auxiliary_loss_clip": 0.01115341, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02366996, + "balance_loss_mlp": 1.04068875, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.4724338826500494, + "language_loss": 0.80777454, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82929468, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.74609375, + "step": 7634, + "time_per_iteration": 2.4676551818847656 + }, + { + "auxiliary_loss_clip": 0.01118954, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.0188632, + "balance_loss_mlp": 1.04032791, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.9575518559569576, + "language_loss": 0.81853092, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84005594, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 7635, + "time_per_iteration": 2.513383150100708 + }, + { + "auxiliary_loss_clip": 0.01112964, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.02696204, + "balance_loss_mlp": 1.04045606, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.4265799385965707, + "language_loss": 0.64948833, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67101824, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7636, + "time_per_iteration": 4.062237501144409 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01690328, + "balance_loss_mlp": 1.04186797, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4568741521374282, + "language_loss": 0.80726147, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82869971, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7637, + "time_per_iteration": 4.017204999923706 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.02297974, + "balance_loss_mlp": 1.0438447, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.56098785708404, + "language_loss": 0.75311542, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77469212, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7638, + "time_per_iteration": 2.4801623821258545 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.02054262, + "balance_loss_mlp": 1.04093051, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.6757486640396035, + "language_loss": 0.74225289, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76372278, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7639, + "time_per_iteration": 2.457977294921875 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.02073193, + "balance_loss_mlp": 1.0410862, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 2.7996676169839856, + "language_loss": 0.68441081, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70591819, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7640, + "time_per_iteration": 2.4815306663513184 + }, + { + "auxiliary_loss_clip": 0.01116242, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.02532363, + "balance_loss_mlp": 1.03950286, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 4.694339799219563, + "language_loss": 0.75290608, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77446091, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7641, + "time_per_iteration": 2.4738545417785645 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.0180217, + "balance_loss_mlp": 1.041008, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7266679695779108, + "language_loss": 0.74649787, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76798791, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7642, + "time_per_iteration": 2.474160671234131 + }, + { + "auxiliary_loss_clip": 0.01036998, + "auxiliary_loss_mlp": 0.00999788, + "balance_loss_clip": 0.99855977, + "balance_loss_mlp": 1.01273584, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8383581259748949, + "language_loss": 0.58191991, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60228777, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.2421875, + "step": 7643, + "time_per_iteration": 2.810622453689575 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.02267814, + "balance_loss_mlp": 1.03810704, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5583198955297553, + "language_loss": 0.92945647, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95100462, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 7644, + "time_per_iteration": 2.4740004539489746 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.02458835, + "balance_loss_mlp": 1.04016256, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 1.923875093759249, + "language_loss": 0.8283661, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.8499139, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7645, + "time_per_iteration": 2.459575891494751 + }, + { + "auxiliary_loss_clip": 0.01035246, + "auxiliary_loss_mlp": 0.00999372, + "balance_loss_clip": 0.99805516, + "balance_loss_mlp": 1.0108279, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7553504929083139, + "language_loss": 0.59931064, + "learning_rate": 2.356199538526593e-06, + "loss": 0.6196568, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.24414062, + "step": 7646, + "time_per_iteration": 3.0040318965911865 + }, + { + "auxiliary_loss_clip": 0.01116678, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.01953018, + "balance_loss_mlp": 1.04043436, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.6094604606837348, + "language_loss": 0.72804034, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74953508, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7647, + "time_per_iteration": 2.539550304412842 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02135134, + "balance_loss_mlp": 1.03845108, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.5906503149252664, + "language_loss": 0.66864169, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69013917, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7648, + "time_per_iteration": 2.538694143295288 + }, + { + "auxiliary_loss_clip": 0.01112764, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.01969171, + "balance_loss_mlp": 1.03751159, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4797855079557312, + "language_loss": 0.78785735, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80931914, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 7649, + "time_per_iteration": 2.5164248943328857 + }, + { + "auxiliary_loss_clip": 0.01113076, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.02412558, + "balance_loss_mlp": 1.03840113, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 3.1550947466117303, + "language_loss": 0.69324255, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.7147451, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7650, + "time_per_iteration": 2.5182442665100098 + }, + { + "auxiliary_loss_clip": 0.01118739, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.0245893, + "balance_loss_mlp": 1.03925538, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 1.968615763904363, + "language_loss": 0.83896518, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86054754, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 7651, + "time_per_iteration": 2.4545249938964844 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.01685548, + "balance_loss_mlp": 1.04122114, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.1703456469435944, + "language_loss": 0.75375223, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77519977, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7652, + "time_per_iteration": 2.4435648918151855 + }, + { + "auxiliary_loss_clip": 0.01113746, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01611495, + "balance_loss_mlp": 1.03735042, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.8091521205399639, + "language_loss": 0.75805604, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.77949333, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7653, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.02338028, + "balance_loss_mlp": 1.04202819, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.3598469293633584, + "language_loss": 0.6584686, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68007028, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 7654, + "time_per_iteration": 2.3942883014678955 + }, + { + "auxiliary_loss_clip": 0.01112793, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.02026534, + "balance_loss_mlp": 1.0375098, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.647085409720671, + "language_loss": 0.79088843, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81235307, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7655, + "time_per_iteration": 2.5213396549224854 + }, + { + "auxiliary_loss_clip": 0.01110004, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01755643, + "balance_loss_mlp": 1.03802609, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 2.0582079675710134, + "language_loss": 0.67502171, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69642866, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7656, + "time_per_iteration": 2.4714531898498535 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01965153, + "balance_loss_mlp": 1.03784871, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.7896797448491664, + "language_loss": 0.81050038, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83195299, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7657, + "time_per_iteration": 2.549114227294922 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.01767325, + "balance_loss_mlp": 1.03843951, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.212167065380131, + "language_loss": 0.70071685, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72216856, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7658, + "time_per_iteration": 2.4548964500427246 + }, + { + "auxiliary_loss_clip": 0.0103337, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00924027, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9542906494873047, + "language_loss": 0.62159562, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64195925, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.2421875, + "step": 7659, + "time_per_iteration": 3.194460153579712 + }, + { + "auxiliary_loss_clip": 0.01114248, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.01774633, + "balance_loss_mlp": 1.04089022, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 2.0710979138047123, + "language_loss": 0.68395913, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70541239, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 7660, + "time_per_iteration": 2.5212934017181396 + }, + { + "auxiliary_loss_clip": 0.01112449, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.02767086, + "balance_loss_mlp": 1.03826356, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7599753910943126, + "language_loss": 0.76785183, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.78939056, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7661, + "time_per_iteration": 2.504199981689453 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02109385, + "balance_loss_mlp": 1.03997183, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.743819837097498, + "language_loss": 0.74565995, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.76712227, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7662, + "time_per_iteration": 2.479710817337036 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.01835489, + "balance_loss_mlp": 1.03899062, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 2.744789888238294, + "language_loss": 0.78880358, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81031454, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7663, + "time_per_iteration": 2.433105230331421 + }, + { + "auxiliary_loss_clip": 0.01110139, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.02286935, + "balance_loss_mlp": 1.03860092, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.8568277173945746, + "language_loss": 0.73164225, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75310248, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 7664, + "time_per_iteration": 2.4182069301605225 + }, + { + "auxiliary_loss_clip": 0.01114696, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02064347, + "balance_loss_mlp": 1.040645, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.6231584574242337, + "language_loss": 0.72039741, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74187809, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7665, + "time_per_iteration": 2.4458460807800293 + }, + { + "auxiliary_loss_clip": 0.01115054, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01965141, + "balance_loss_mlp": 1.03982568, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.8683756247621939, + "language_loss": 0.78134775, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80282086, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7666, + "time_per_iteration": 2.4217963218688965 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02005553, + "balance_loss_mlp": 1.03926802, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.2927592404362929, + "language_loss": 0.73972279, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76118922, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73046875, + "step": 7667, + "time_per_iteration": 2.586657762527466 + }, + { + "auxiliary_loss_clip": 0.0111122, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01803541, + "balance_loss_mlp": 1.03743756, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.3923437909363505, + "language_loss": 0.75857067, + "learning_rate": 2.347765122572676e-06, + "loss": 0.77998888, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7668, + "time_per_iteration": 2.456688642501831 + }, + { + "auxiliary_loss_clip": 0.01112338, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01699305, + "balance_loss_mlp": 1.04143405, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 2.015120719246451, + "language_loss": 0.77794099, + "learning_rate": 2.347381587204975e-06, + "loss": 0.79935884, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 7669, + "time_per_iteration": 2.503912925720215 + }, + { + "auxiliary_loss_clip": 0.01112792, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01688588, + "balance_loss_mlp": 1.03798747, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.8162494299938103, + "language_loss": 0.82330608, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84473014, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7670, + "time_per_iteration": 2.481456995010376 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01781416, + "balance_loss_mlp": 1.03845906, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6076372414606255, + "language_loss": 0.63204038, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6534636, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7671, + "time_per_iteration": 2.4743082523345947 + }, + { + "auxiliary_loss_clip": 0.01034608, + "auxiliary_loss_mlp": 0.01007042, + "balance_loss_clip": 1.00571287, + "balance_loss_mlp": 1.01008546, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6877278401983052, + "language_loss": 0.55879581, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57921231, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24609375, + "step": 7672, + "time_per_iteration": 3.15800142288208 + }, + { + "auxiliary_loss_clip": 0.0111558, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.02255249, + "balance_loss_mlp": 1.04003441, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8329231831015789, + "language_loss": 0.70920408, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73071891, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7673, + "time_per_iteration": 2.4639430046081543 + }, + { + "auxiliary_loss_clip": 0.01112366, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.02145457, + "balance_loss_mlp": 1.04083312, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.6780898708072003, + "language_loss": 0.70402145, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72548711, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7674, + "time_per_iteration": 2.5660369396209717 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02384138, + "balance_loss_mlp": 1.03684926, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.5790047103218752, + "language_loss": 0.65408182, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67557311, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7675, + "time_per_iteration": 2.616771697998047 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.010075, + "balance_loss_clip": 1.00611675, + "balance_loss_mlp": 1.01053035, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7425701763607123, + "language_loss": 0.58600932, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60643393, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24511719, + "step": 7676, + "time_per_iteration": 3.09281325340271 + }, + { + "auxiliary_loss_clip": 0.01034023, + "auxiliary_loss_mlp": 0.01002968, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00993788, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7891273111868267, + "language_loss": 0.62684548, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64721537, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.24121094, + "step": 7677, + "time_per_iteration": 2.9087297916412354 + }, + { + "auxiliary_loss_clip": 0.01112185, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.018502, + "balance_loss_mlp": 1.03929043, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.8566258545012464, + "language_loss": 0.76442772, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78586149, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7678, + "time_per_iteration": 3.80979061126709 + }, + { + "auxiliary_loss_clip": 0.01115647, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.02148438, + "balance_loss_mlp": 1.04122365, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.9875640695173902, + "language_loss": 0.66738796, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68889523, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 7679, + "time_per_iteration": 5.473088502883911 + }, + { + "auxiliary_loss_clip": 0.01112323, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.03913581, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.9247599304086902, + "language_loss": 0.69658661, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71809065, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 7680, + "time_per_iteration": 2.435971736907959 + }, + { + "auxiliary_loss_clip": 0.01121586, + "auxiliary_loss_mlp": 0.01041647, + "balance_loss_clip": 1.02805138, + "balance_loss_mlp": 1.04467559, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 3.979685754880411, + "language_loss": 0.63813865, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65977097, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7681, + "time_per_iteration": 2.486614942550659 + }, + { + "auxiliary_loss_clip": 0.01111536, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.01790738, + "balance_loss_mlp": 1.03925776, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.518283771877835, + "language_loss": 0.66871607, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69013125, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7682, + "time_per_iteration": 2.434720516204834 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02353811, + "balance_loss_mlp": 1.03967464, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.2113144827233397, + "language_loss": 0.74337292, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76488769, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7683, + "time_per_iteration": 2.532867908477783 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02046251, + "balance_loss_mlp": 1.04082799, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7237723920320163, + "language_loss": 0.76637614, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78784502, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7684, + "time_per_iteration": 2.4763615131378174 + }, + { + "auxiliary_loss_clip": 0.01121747, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.02557588, + "balance_loss_mlp": 1.04270399, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 2.012138726469413, + "language_loss": 0.80012244, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82173628, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7685, + "time_per_iteration": 2.467780113220215 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.02434742, + "balance_loss_mlp": 1.04206526, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 2.0493507584177424, + "language_loss": 0.66546774, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68698829, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 7686, + "time_per_iteration": 2.5675110816955566 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.01730859, + "balance_loss_mlp": 1.03924084, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.0396518023333243, + "language_loss": 0.73831183, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75978148, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7687, + "time_per_iteration": 2.5077569484710693 + }, + { + "auxiliary_loss_clip": 0.01113947, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01686668, + "balance_loss_mlp": 1.04119587, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1950912061668784, + "language_loss": 0.74758142, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76902628, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 7688, + "time_per_iteration": 2.4487764835357666 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.0214963, + "balance_loss_mlp": 1.03912246, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.6667608580722473, + "language_loss": 0.78718561, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80867392, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 7689, + "time_per_iteration": 2.504210948944092 + }, + { + "auxiliary_loss_clip": 0.01118414, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.02561891, + "balance_loss_mlp": 1.04086494, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 3.5840156670541448, + "language_loss": 0.56649667, + "learning_rate": 2.339324323980964e-06, + "loss": 0.58808374, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7690, + "time_per_iteration": 2.4970550537109375 + }, + { + "auxiliary_loss_clip": 0.01113577, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02076888, + "balance_loss_mlp": 1.03844917, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.2671044925643202, + "language_loss": 0.82513797, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84662223, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7691, + "time_per_iteration": 2.4712584018707275 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01502824, + "balance_loss_mlp": 1.04124403, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.513473472081282, + "language_loss": 0.75326777, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77470076, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7692, + "time_per_iteration": 2.462574005126953 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.0225668, + "balance_loss_mlp": 1.04110909, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 4.10345040195295, + "language_loss": 0.74055338, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76209086, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7693, + "time_per_iteration": 2.578394889831543 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02170396, + "balance_loss_mlp": 1.04132485, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5049695528407014, + "language_loss": 0.85576218, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87726343, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7694, + "time_per_iteration": 2.447938919067383 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02379656, + "balance_loss_mlp": 1.04131126, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.103971064334481, + "language_loss": 0.78631961, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80785489, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7695, + "time_per_iteration": 2.510712146759033 + }, + { + "auxiliary_loss_clip": 0.01110008, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.021294, + "balance_loss_mlp": 1.0382899, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7164209999926379, + "language_loss": 0.72215033, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74359202, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7696, + "time_per_iteration": 2.427879571914673 + }, + { + "auxiliary_loss_clip": 0.01116967, + "auxiliary_loss_mlp": 0.01044874, + "balance_loss_clip": 1.03040195, + "balance_loss_mlp": 1.04200339, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.7618442658513396, + "language_loss": 0.69068033, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71229875, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75, + "step": 7697, + "time_per_iteration": 2.4759252071380615 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.0421176, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.7059169761391482, + "language_loss": 0.84603721, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.8674916, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7698, + "time_per_iteration": 2.4416439533233643 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02094316, + "balance_loss_mlp": 1.04008198, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 2.2131790671554894, + "language_loss": 0.71495068, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73643124, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7699, + "time_per_iteration": 2.477674722671509 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.02183843, + "balance_loss_mlp": 1.03854418, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.667240614809052, + "language_loss": 0.7189334, + "learning_rate": 2.335485529281996e-06, + "loss": 0.7404505, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7700, + "time_per_iteration": 2.4664909839630127 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.0229491, + "balance_loss_mlp": 1.04012191, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 1.9820544405348388, + "language_loss": 0.7245025, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74600095, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7701, + "time_per_iteration": 2.4769680500030518 + }, + { + "auxiliary_loss_clip": 0.01117689, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.02129054, + "balance_loss_mlp": 1.04037929, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 1.837243395087381, + "language_loss": 0.64583158, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.66735995, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7702, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753259, + "balance_loss_mlp": 1.03832746, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.912512853345874, + "language_loss": 0.73265111, + "learning_rate": 2.33433364213785e-06, + "loss": 0.7540592, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7703, + "time_per_iteration": 2.482374429702759 + }, + { + "auxiliary_loss_clip": 0.01119217, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01882708, + "balance_loss_mlp": 1.04163849, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.555397834218836, + "language_loss": 0.68780202, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70932484, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7704, + "time_per_iteration": 2.4661428928375244 + }, + { + "auxiliary_loss_clip": 0.01118717, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.01313281, + "balance_loss_mlp": 1.04138649, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 4.360671756910266, + "language_loss": 0.80963224, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83109009, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7705, + "time_per_iteration": 2.5129587650299072 + }, + { + "auxiliary_loss_clip": 0.01116357, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.01863885, + "balance_loss_mlp": 1.03983259, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.6860050062378817, + "language_loss": 0.77783883, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79932249, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7706, + "time_per_iteration": 2.4212512969970703 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01676846, + "balance_loss_mlp": 1.03858304, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.9896841653009631, + "language_loss": 0.69805431, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.71944684, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7707, + "time_per_iteration": 2.452716112136841 + }, + { + "auxiliary_loss_clip": 0.0111828, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.02268386, + "balance_loss_mlp": 1.03958869, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.9384057680294333, + "language_loss": 0.61103344, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63259125, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 7708, + "time_per_iteration": 2.567363739013672 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 1.0407182, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 1.9580912850569934, + "language_loss": 0.77165091, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.7931354, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7709, + "time_per_iteration": 2.532893657684326 + }, + { + "auxiliary_loss_clip": 0.01120131, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.02199614, + "balance_loss_mlp": 1.04260027, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.8889269845152723, + "language_loss": 0.76972783, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79129058, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7710, + "time_per_iteration": 2.4608266353607178 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.01812005, + "balance_loss_mlp": 1.04201198, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 8.865430766980356, + "language_loss": 0.73548961, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75701332, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 7711, + "time_per_iteration": 2.4964261054992676 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02818859, + "balance_loss_mlp": 1.04039836, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.6554647385393604, + "language_loss": 0.71667624, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73825449, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.74609375, + "step": 7712, + "time_per_iteration": 2.46760630607605 + }, + { + "auxiliary_loss_clip": 0.01121722, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02325535, + "balance_loss_mlp": 1.04231286, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 3.3767356374822053, + "language_loss": 0.72924775, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.7508505, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 7713, + "time_per_iteration": 2.501405954360962 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.0192256, + "balance_loss_mlp": 1.0397234, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.980318346106041, + "language_loss": 0.58787149, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60938716, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7714, + "time_per_iteration": 2.495403528213501 + }, + { + "auxiliary_loss_clip": 0.01113059, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.01873016, + "balance_loss_mlp": 1.03932118, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.071541116221401, + "language_loss": 0.70241058, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72386181, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7715, + "time_per_iteration": 2.4438905715942383 + }, + { + "auxiliary_loss_clip": 0.01120226, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.0211767, + "balance_loss_mlp": 1.04094183, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 2.6792778299233775, + "language_loss": 0.67974752, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70129347, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 7716, + "time_per_iteration": 2.4544179439544678 + }, + { + "auxiliary_loss_clip": 0.01119502, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.01913667, + "balance_loss_mlp": 1.04161263, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7705358267642153, + "language_loss": 0.81100738, + "learning_rate": 2.328956666474691e-06, + "loss": 0.8325364, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7717, + "time_per_iteration": 2.491530179977417 + }, + { + "auxiliary_loss_clip": 0.0111535, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.01868117, + "balance_loss_mlp": 1.04001844, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.8289041555667496, + "language_loss": 0.73165905, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75313652, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7718, + "time_per_iteration": 2.4480137825012207 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.02355695, + "balance_loss_mlp": 1.03966463, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.5484606356008148, + "language_loss": 0.70390046, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72542012, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7719, + "time_per_iteration": 2.565831422805786 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.02523875, + "balance_loss_mlp": 1.0433172, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.6620583446293502, + "language_loss": 0.86685133, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88845801, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7720, + "time_per_iteration": 5.243311166763306 + }, + { + "auxiliary_loss_clip": 0.01036993, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 0.99992698, + "balance_loss_mlp": 1.01241243, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7219170830729655, + "language_loss": 0.55086505, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57124853, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24609375, + "step": 7721, + "time_per_iteration": 4.553914785385132 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.02322233, + "balance_loss_mlp": 1.041767, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.566766868002949, + "language_loss": 0.79665279, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81818902, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7722, + "time_per_iteration": 2.445401430130005 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.01957417, + "balance_loss_mlp": 1.04163325, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5891837623192666, + "language_loss": 0.77772748, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79924428, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7723, + "time_per_iteration": 2.4992403984069824 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01748788, + "balance_loss_mlp": 1.03973961, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.5026814907271808, + "language_loss": 0.68433344, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70576787, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7724, + "time_per_iteration": 2.496286630630493 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.02032912, + "balance_loss_mlp": 1.03761983, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.246547977212262, + "language_loss": 0.67335129, + "learning_rate": 2.325883008671415e-06, + "loss": 0.6948117, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7725, + "time_per_iteration": 2.471104621887207 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.02523649, + "balance_loss_mlp": 1.03763461, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.6153664866621378, + "language_loss": 0.64700842, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.66846681, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7726, + "time_per_iteration": 2.5408668518066406 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.01916456, + "balance_loss_mlp": 1.04313767, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.8244750339479887, + "language_loss": 0.74908936, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77058876, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7727, + "time_per_iteration": 2.4853005409240723 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.03968906, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.0019169498028657, + "language_loss": 0.78683269, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80834055, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7728, + "time_per_iteration": 2.5397188663482666 + }, + { + "auxiliary_loss_clip": 0.0111559, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.02303171, + "balance_loss_mlp": 1.0405283, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 2.3286376832796343, + "language_loss": 0.76053888, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78206384, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7729, + "time_per_iteration": 2.4818129539489746 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.02543473, + "balance_loss_mlp": 1.04205704, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.578112141950269, + "language_loss": 0.79568058, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81722724, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7730, + "time_per_iteration": 2.5124597549438477 + }, + { + "auxiliary_loss_clip": 0.01113512, + "auxiliary_loss_mlp": 0.01037643, + "balance_loss_clip": 1.0245595, + "balance_loss_mlp": 1.03948402, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.5075999703309564, + "language_loss": 0.76621842, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.78772998, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7731, + "time_per_iteration": 2.4976460933685303 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.01798737, + "balance_loss_mlp": 1.0393635, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.7163179847514425, + "language_loss": 0.65824252, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67968166, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 7732, + "time_per_iteration": 2.5720393657684326 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02186668, + "balance_loss_mlp": 1.03906608, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.6101927282287454, + "language_loss": 0.72711408, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74864757, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7733, + "time_per_iteration": 2.4926271438598633 + }, + { + "auxiliary_loss_clip": 0.01036248, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.00131154, + "balance_loss_mlp": 1.01211238, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2459739814545432, + "language_loss": 0.51962316, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54001307, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2421875, + "step": 7734, + "time_per_iteration": 3.0107176303863525 + }, + { + "auxiliary_loss_clip": 0.01113986, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02194381, + "balance_loss_mlp": 1.04043412, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.036607770310226, + "language_loss": 0.75633866, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77783275, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7735, + "time_per_iteration": 2.487781286239624 + }, + { + "auxiliary_loss_clip": 0.01111506, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.02682567, + "balance_loss_mlp": 1.03985715, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 2.402877095125316, + "language_loss": 0.70207214, + "learning_rate": 2.321655439354519e-06, + "loss": 0.7235899, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7736, + "time_per_iteration": 2.4449374675750732 + }, + { + "auxiliary_loss_clip": 0.0111302, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.0199604, + "balance_loss_mlp": 1.04052627, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6375102922586726, + "language_loss": 0.72185129, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74330497, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7737, + "time_per_iteration": 2.494582176208496 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02450085, + "balance_loss_mlp": 1.04341006, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.6166748549663605, + "language_loss": 0.83362406, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85520089, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7738, + "time_per_iteration": 2.427828550338745 + }, + { + "auxiliary_loss_clip": 0.01037214, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.0132978, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7680630195464891, + "language_loss": 0.57788324, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59828281, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.24023438, + "step": 7739, + "time_per_iteration": 3.133042335510254 + }, + { + "auxiliary_loss_clip": 0.01113786, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.02543104, + "balance_loss_mlp": 1.03974605, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.823827375035505, + "language_loss": 0.8481009, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.86962008, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7740, + "time_per_iteration": 2.4921228885650635 + }, + { + "auxiliary_loss_clip": 0.0111501, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.04139423, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.5033977780241194, + "language_loss": 0.76110768, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.7826345, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 7741, + "time_per_iteration": 2.4922451972961426 + }, + { + "auxiliary_loss_clip": 0.01117905, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01915359, + "balance_loss_mlp": 1.0404247, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.7276921705055903, + "language_loss": 0.80555934, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82706094, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7742, + "time_per_iteration": 2.4906904697418213 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.02518523, + "balance_loss_mlp": 1.04049921, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.9912151117228205, + "language_loss": 0.72541988, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74698091, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7743, + "time_per_iteration": 2.4746901988983154 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01745892, + "balance_loss_mlp": 1.0409807, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.076205829431248, + "language_loss": 0.71137214, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73282433, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7744, + "time_per_iteration": 2.4928057193756104 + }, + { + "auxiliary_loss_clip": 0.01112536, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.02108455, + "balance_loss_mlp": 1.04053736, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5849641227794893, + "language_loss": 0.85084593, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87230361, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7745, + "time_per_iteration": 2.574612617492676 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02561605, + "balance_loss_mlp": 1.04127502, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.35434162506916, + "language_loss": 0.73171556, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75323439, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71484375, + "step": 7746, + "time_per_iteration": 2.5375149250030518 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02271867, + "balance_loss_mlp": 1.04081106, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.5285629366651527, + "language_loss": 0.6993416, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72082222, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7747, + "time_per_iteration": 2.792043685913086 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.01872873, + "balance_loss_mlp": 1.03958046, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.4175797777041124, + "language_loss": 0.67509431, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69653738, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7748, + "time_per_iteration": 2.625060796737671 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.02341771, + "balance_loss_mlp": 1.04018533, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.247229042591788, + "language_loss": 0.63667625, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.65823585, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 7749, + "time_per_iteration": 2.4132370948791504 + }, + { + "auxiliary_loss_clip": 0.01117494, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01702619, + "balance_loss_mlp": 1.04231274, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.928439488128299, + "language_loss": 0.74594498, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76742983, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7750, + "time_per_iteration": 2.494771718978882 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01701522, + "balance_loss_mlp": 1.0404911, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.044073047720548, + "language_loss": 0.7496438, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.77110994, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7751, + "time_per_iteration": 2.5510993003845215 + }, + { + "auxiliary_loss_clip": 0.01118875, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.01816297, + "balance_loss_mlp": 1.04188776, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.8775850665267624, + "language_loss": 0.73678327, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.7582916, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7752, + "time_per_iteration": 2.5834901332855225 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02401483, + "balance_loss_mlp": 1.041453, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.485236836866318, + "language_loss": 0.69320381, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71476793, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 7753, + "time_per_iteration": 2.522881507873535 + }, + { + "auxiliary_loss_clip": 0.01111836, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.01862359, + "balance_loss_mlp": 1.04056942, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8174540980864333, + "language_loss": 0.72607052, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74750698, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7754, + "time_per_iteration": 2.5403332710266113 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01410365, + "balance_loss_mlp": 1.04032147, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.501284890447191, + "language_loss": 0.78961611, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81104231, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7755, + "time_per_iteration": 2.4917664527893066 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.01675534, + "balance_loss_mlp": 1.03968203, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.6390600579035761, + "language_loss": 0.72281897, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74421859, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7756, + "time_per_iteration": 2.549678325653076 + }, + { + "auxiliary_loss_clip": 0.01111703, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01770794, + "balance_loss_mlp": 1.03845477, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.8004000990726714, + "language_loss": 0.78193069, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80335552, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7757, + "time_per_iteration": 2.483161687850952 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01855707, + "balance_loss_mlp": 1.04131472, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 2.024129481036371, + "language_loss": 0.66473371, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68618673, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.734375, + "step": 7758, + "time_per_iteration": 2.5083394050598145 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.0243423, + "balance_loss_mlp": 1.04062152, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.603488256474455, + "language_loss": 0.74207008, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76358092, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7759, + "time_per_iteration": 2.424461841583252 + }, + { + "auxiliary_loss_clip": 0.01113311, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.04054224, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.4805046968385447, + "language_loss": 0.77701056, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79848123, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7760, + "time_per_iteration": 2.5147666931152344 + }, + { + "auxiliary_loss_clip": 0.01109461, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.01549125, + "balance_loss_mlp": 1.03895068, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6623756387577715, + "language_loss": 0.74081796, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76219893, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7761, + "time_per_iteration": 3.816096305847168 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.01743007, + "balance_loss_mlp": 1.040905, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.9521312394592187, + "language_loss": 0.78150368, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80299413, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 7762, + "time_per_iteration": 5.593664169311523 + }, + { + "auxiliary_loss_clip": 0.01036542, + "auxiliary_loss_mlp": 0.01002344, + "balance_loss_clip": 1.00103235, + "balance_loss_mlp": 1.0128268, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7996147947039336, + "language_loss": 0.59759605, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61798495, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.23828125, + "step": 7763, + "time_per_iteration": 4.692638874053955 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.02139115, + "balance_loss_mlp": 1.03950739, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.0939196550691075, + "language_loss": 0.78502893, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80654544, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 7764, + "time_per_iteration": 2.437487840652466 + }, + { + "auxiliary_loss_clip": 0.01113145, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.02385855, + "balance_loss_mlp": 1.04100394, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.8134732296760265, + "language_loss": 0.72272134, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74421084, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7765, + "time_per_iteration": 2.4413938522338867 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.02123809, + "balance_loss_mlp": 1.03898025, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.045608669049209, + "language_loss": 0.77604026, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.79752916, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7766, + "time_per_iteration": 2.4388277530670166 + }, + { + "auxiliary_loss_clip": 0.01112932, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.02094162, + "balance_loss_mlp": 1.03921056, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 1.9270773145684021, + "language_loss": 0.65106744, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67253554, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7767, + "time_per_iteration": 2.4259531497955322 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01036202, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04137385, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.832674622819915, + "language_loss": 0.74584204, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76735973, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7768, + "time_per_iteration": 2.5001304149627686 + }, + { + "auxiliary_loss_clip": 0.01114611, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01907098, + "balance_loss_mlp": 1.04069757, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.7275432453698176, + "language_loss": 0.70713127, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72859579, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7769, + "time_per_iteration": 2.466909408569336 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02159858, + "balance_loss_mlp": 1.03928077, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.9729575937492385, + "language_loss": 0.8121224, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83360064, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.74609375, + "step": 7770, + "time_per_iteration": 2.458648204803467 + }, + { + "auxiliary_loss_clip": 0.01036054, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00070572, + "balance_loss_mlp": 1.01253605, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7993613034211892, + "language_loss": 0.5567323, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57711124, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7771, + "time_per_iteration": 3.0888803005218506 + }, + { + "auxiliary_loss_clip": 0.01111082, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.02332425, + "balance_loss_mlp": 1.03920853, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.068311261086289, + "language_loss": 0.65702665, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67849845, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7772, + "time_per_iteration": 2.5242044925689697 + }, + { + "auxiliary_loss_clip": 0.01112309, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.0201087, + "balance_loss_mlp": 1.04012156, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8148576314480773, + "language_loss": 0.63699466, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65844226, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7773, + "time_per_iteration": 2.5828921794891357 + }, + { + "auxiliary_loss_clip": 0.01114763, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02112508, + "balance_loss_mlp": 1.04050922, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 1.942265734861076, + "language_loss": 0.79793948, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.81943017, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7774, + "time_per_iteration": 2.448124647140503 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.01732183, + "balance_loss_mlp": 1.04113531, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.627446474145158, + "language_loss": 0.77884328, + "learning_rate": 2.306655024915726e-06, + "loss": 0.80031127, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7578125, + "step": 7775, + "time_per_iteration": 2.527324676513672 + }, + { + "auxiliary_loss_clip": 0.01111153, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.01650286, + "balance_loss_mlp": 1.03931999, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.8679682194131426, + "language_loss": 0.69634461, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71774852, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7776, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01113537, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02123928, + "balance_loss_mlp": 1.04122162, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.3721760360464321, + "language_loss": 0.73558104, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75704277, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.72265625, + "step": 7777, + "time_per_iteration": 2.520732879638672 + }, + { + "auxiliary_loss_clip": 0.01113463, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.0170207, + "balance_loss_mlp": 1.04067683, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.1302386072463717, + "language_loss": 0.69626892, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71770251, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7778, + "time_per_iteration": 2.514420509338379 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.04059839, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.560538067350171, + "language_loss": 0.73252767, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75406492, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7779, + "time_per_iteration": 2.5243053436279297 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02767682, + "balance_loss_mlp": 1.04009414, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5361358548392845, + "language_loss": 0.72206026, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74357915, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7780, + "time_per_iteration": 2.462562322616577 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.02259159, + "balance_loss_mlp": 1.03972697, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.377229275085917, + "language_loss": 0.73864317, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76017153, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7781, + "time_per_iteration": 2.502406358718872 + }, + { + "auxiliary_loss_clip": 0.01117462, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.04165602, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.718665338253189, + "language_loss": 0.62727809, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.64880699, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7782, + "time_per_iteration": 2.5425686836242676 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.02660906, + "balance_loss_mlp": 1.0408988, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.7203724678454408, + "language_loss": 0.62933487, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65089834, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7783, + "time_per_iteration": 2.5380141735076904 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.02271223, + "balance_loss_mlp": 1.04462993, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.164400906730855, + "language_loss": 0.67745304, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69904399, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 7784, + "time_per_iteration": 2.4520463943481445 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03855705, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.8603472350259396, + "language_loss": 0.84720063, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.8686232, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.72265625, + "step": 7785, + "time_per_iteration": 2.459446907043457 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01639259, + "balance_loss_mlp": 1.04066038, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.0359259581468154, + "language_loss": 0.77018952, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79163527, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 7786, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01110671, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01558685, + "balance_loss_mlp": 1.0400672, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.023612965965443, + "language_loss": 0.73795342, + "learning_rate": 2.302035914315856e-06, + "loss": 0.75933665, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7787, + "time_per_iteration": 2.5224268436431885 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04109263, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.7002718084162438, + "language_loss": 0.65639925, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67792457, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7788, + "time_per_iteration": 2.534850835800171 + }, + { + "auxiliary_loss_clip": 0.01110419, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.03911507, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.9511727744147118, + "language_loss": 0.63813901, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.65954381, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.71484375, + "step": 7789, + "time_per_iteration": 2.5479812622070312 + }, + { + "auxiliary_loss_clip": 0.01036451, + "auxiliary_loss_mlp": 0.01005013, + "balance_loss_clip": 1.00388098, + "balance_loss_mlp": 1.01292431, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7071467356489777, + "language_loss": 0.61922455, + "learning_rate": 2.300880877982825e-06, + "loss": 0.6396392, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7790, + "time_per_iteration": 3.1510462760925293 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01836109, + "balance_loss_mlp": 1.04223442, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5995715197713376, + "language_loss": 0.79338831, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81482148, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7791, + "time_per_iteration": 2.5008740425109863 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.02297759, + "balance_loss_mlp": 1.04113936, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.651557239680421, + "language_loss": 0.7484895, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.76998532, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7792, + "time_per_iteration": 2.4964823722839355 + }, + { + "auxiliary_loss_clip": 0.01108357, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01953983, + "balance_loss_mlp": 1.03747678, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.7412725365893262, + "language_loss": 0.6822598, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70365626, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 7793, + "time_per_iteration": 2.5480096340179443 + }, + { + "auxiliary_loss_clip": 0.01112468, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01638055, + "balance_loss_mlp": 1.04102671, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.577590367357015, + "language_loss": 0.73983628, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76124084, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.71484375, + "step": 7794, + "time_per_iteration": 2.453190803527832 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.0227052, + "balance_loss_mlp": 1.04182243, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5518603627769951, + "language_loss": 0.63617218, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65767258, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7795, + "time_per_iteration": 2.5087008476257324 + }, + { + "auxiliary_loss_clip": 0.01108593, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.01140058, + "balance_loss_mlp": 1.03883195, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.6379638897021238, + "language_loss": 0.68002474, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70134962, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 7796, + "time_per_iteration": 2.6073970794677734 + }, + { + "auxiliary_loss_clip": 0.01110063, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01876903, + "balance_loss_mlp": 1.03811777, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.6469110962479863, + "language_loss": 0.70039898, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72181356, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7797, + "time_per_iteration": 2.5202813148498535 + }, + { + "auxiliary_loss_clip": 0.01114247, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.01626134, + "balance_loss_mlp": 1.04066193, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 5.424608495577661, + "language_loss": 0.67517138, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69661522, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7798, + "time_per_iteration": 2.425443649291992 + }, + { + "auxiliary_loss_clip": 0.01033599, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99840373, + "balance_loss_mlp": 1.00991392, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9386412030406017, + "language_loss": 0.64531696, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66565025, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23730469, + "step": 7799, + "time_per_iteration": 3.2528939247131348 + }, + { + "auxiliary_loss_clip": 0.01108747, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.01308465, + "balance_loss_mlp": 1.03731787, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4163336480228355, + "language_loss": 0.72242683, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74376553, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71484375, + "step": 7800, + "time_per_iteration": 2.481309175491333 + }, + { + "auxiliary_loss_clip": 0.01109702, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.01937377, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 2.26920520557406, + "language_loss": 0.72428536, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74568903, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6953125, + "step": 7801, + "time_per_iteration": 2.491105079650879 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.02352786, + "balance_loss_mlp": 1.04097068, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.06336431229611, + "language_loss": 0.62303418, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64457649, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7802, + "time_per_iteration": 2.419229030609131 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.02101874, + "balance_loss_mlp": 1.03946614, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.7578029510137774, + "language_loss": 0.73409998, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75556695, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73828125, + "step": 7803, + "time_per_iteration": 3.984971523284912 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.02289057, + "balance_loss_mlp": 1.0363642, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 2.1225810300999384, + "language_loss": 0.77638352, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79780972, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 7804, + "time_per_iteration": 5.432345390319824 + }, + { + "auxiliary_loss_clip": 0.01108405, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01417041, + "balance_loss_mlp": 1.03702545, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8629622532391696, + "language_loss": 0.77384996, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79520065, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7805, + "time_per_iteration": 3.873565196990967 + }, + { + "auxiliary_loss_clip": 0.01119773, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02423811, + "balance_loss_mlp": 1.04193878, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.5711850680288217, + "language_loss": 0.82902926, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85060221, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7806, + "time_per_iteration": 2.554081439971924 + }, + { + "auxiliary_loss_clip": 0.0111231, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02150404, + "balance_loss_mlp": 1.03812897, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.7011762555096541, + "language_loss": 0.77454185, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79601264, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7807, + "time_per_iteration": 2.5786170959472656 + }, + { + "auxiliary_loss_clip": 0.01112504, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.01985693, + "balance_loss_mlp": 1.03987944, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.9089254292763438, + "language_loss": 0.51788038, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53933609, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7808, + "time_per_iteration": 2.4730944633483887 + }, + { + "auxiliary_loss_clip": 0.01034297, + "auxiliary_loss_mlp": 0.01010423, + "balance_loss_clip": 1.00899839, + "balance_loss_mlp": 1.01039815, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.782722095319277, + "language_loss": 0.57725239, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59769958, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.23925781, + "step": 7809, + "time_per_iteration": 2.9356954097747803 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.0237031, + "balance_loss_mlp": 1.04176784, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.074581573353579, + "language_loss": 0.72116458, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.74269235, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.74609375, + "step": 7810, + "time_per_iteration": 2.493408679962158 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.02402329, + "balance_loss_mlp": 1.040115, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 2.1541938985336992, + "language_loss": 0.8075912, + "learning_rate": 2.29279277055369e-06, + "loss": 0.82910025, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7811, + "time_per_iteration": 2.4555575847625732 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02146435, + "balance_loss_mlp": 1.04074228, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.576643907851126, + "language_loss": 0.8039701, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82546234, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7812, + "time_per_iteration": 2.4640350341796875 + }, + { + "auxiliary_loss_clip": 0.01109494, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.0166117, + "balance_loss_mlp": 1.03902435, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.5853543039664872, + "language_loss": 0.73764664, + "learning_rate": 2.292022217117477e-06, + "loss": 0.75903195, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7813, + "time_per_iteration": 2.4320507049560547 + }, + { + "auxiliary_loss_clip": 0.01110282, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01483905, + "balance_loss_mlp": 1.03869295, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.2861298905980756, + "language_loss": 0.84540617, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86679196, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7814, + "time_per_iteration": 2.4274749755859375 + }, + { + "auxiliary_loss_clip": 0.01107762, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.02381229, + "balance_loss_mlp": 1.03796697, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.8672463737050276, + "language_loss": 0.81747186, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83891666, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 7815, + "time_per_iteration": 2.4163284301757812 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.02026868, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.4869249923010917, + "language_loss": 0.77289331, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79436171, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7816, + "time_per_iteration": 2.4678542613983154 + }, + { + "auxiliary_loss_clip": 0.01033373, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 0.9998135, + "balance_loss_mlp": 1.00933015, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8340649958424211, + "language_loss": 0.5901494, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61049724, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.24023438, + "step": 7817, + "time_per_iteration": 3.0594780445098877 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01726496, + "balance_loss_mlp": 1.03904927, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7036287613919965, + "language_loss": 0.79255462, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81393164, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7818, + "time_per_iteration": 2.5072269439697266 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.02011776, + "balance_loss_mlp": 1.03705192, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.8212678437549825, + "language_loss": 0.83521211, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85663581, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7819, + "time_per_iteration": 2.4294557571411133 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.02144313, + "balance_loss_mlp": 1.0395112, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.0332467146742457, + "language_loss": 0.75860727, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78010511, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7820, + "time_per_iteration": 2.446664333343506 + }, + { + "auxiliary_loss_clip": 0.0111083, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02034664, + "balance_loss_mlp": 1.04058981, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.889014789758207, + "language_loss": 0.73767376, + "learning_rate": 2.288939561601039e-06, + "loss": 0.75911528, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 7821, + "time_per_iteration": 2.4138526916503906 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.02792668, + "balance_loss_mlp": 1.04042852, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.6752111617055698, + "language_loss": 0.88782346, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.9093343, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 7822, + "time_per_iteration": 2.5215280055999756 + }, + { + "auxiliary_loss_clip": 0.01110261, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01693165, + "balance_loss_mlp": 1.03927922, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.5082152139738452, + "language_loss": 0.79467583, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.8160727, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7823, + "time_per_iteration": 2.4513280391693115 + }, + { + "auxiliary_loss_clip": 0.01034267, + "auxiliary_loss_mlp": 0.01003747, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.01028728, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6886986665104876, + "language_loss": 0.56664526, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.5870254, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.24023438, + "step": 7824, + "time_per_iteration": 3.1640188694000244 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.02209568, + "balance_loss_mlp": 1.03935504, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.7687808389256934, + "language_loss": 0.81284839, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83433783, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7825, + "time_per_iteration": 2.4225590229034424 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01788807, + "balance_loss_mlp": 1.04160166, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.5897626143629002, + "language_loss": 0.66397595, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68542683, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7826, + "time_per_iteration": 2.512421131134033 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02366889, + "balance_loss_mlp": 1.03788161, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 2.2414984964582354, + "language_loss": 0.83768737, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.85917771, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7827, + "time_per_iteration": 2.449002504348755 + }, + { + "auxiliary_loss_clip": 0.01034449, + "auxiliary_loss_mlp": 0.01000576, + "balance_loss_clip": 0.99914598, + "balance_loss_mlp": 1.01066613, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.821565097847141, + "language_loss": 0.55694902, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57729936, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.23828125, + "step": 7828, + "time_per_iteration": 3.0819802284240723 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01605594, + "balance_loss_mlp": 1.03884375, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.9071991460911069, + "language_loss": 0.81054831, + "learning_rate": 2.285856204861245e-06, + "loss": 0.8319242, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7829, + "time_per_iteration": 2.415055513381958 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02058768, + "balance_loss_mlp": 1.04020715, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3327561380149306, + "language_loss": 0.7576915, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.77912241, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7830, + "time_per_iteration": 2.5643560886383057 + }, + { + "auxiliary_loss_clip": 0.0111195, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.01860535, + "balance_loss_mlp": 1.04144919, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 1.972485160119179, + "language_loss": 0.78818381, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80962437, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 7831, + "time_per_iteration": 2.4193694591522217 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.02204037, + "balance_loss_mlp": 1.03843021, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.7552368254682797, + "language_loss": 0.76044565, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78195733, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 7832, + "time_per_iteration": 2.5059313774108887 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.0164752, + "balance_loss_mlp": 1.03971505, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.221217846393107, + "language_loss": 0.74499595, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76638055, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7833, + "time_per_iteration": 2.473198652267456 + }, + { + "auxiliary_loss_clip": 0.01111984, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.0222286, + "balance_loss_mlp": 1.04079628, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.540147977988576, + "language_loss": 0.7563647, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77783847, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 7834, + "time_per_iteration": 2.4742865562438965 + }, + { + "auxiliary_loss_clip": 0.01110721, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.02601528, + "balance_loss_mlp": 1.04030991, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.3686611384111311, + "language_loss": 0.66174978, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68324244, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7835, + "time_per_iteration": 2.631727933883667 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01005422, + "balance_loss_clip": 1.00418234, + "balance_loss_mlp": 1.01069164, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8728088219103824, + "language_loss": 0.62162638, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64202893, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.2421875, + "step": 7836, + "time_per_iteration": 3.0448570251464844 + }, + { + "auxiliary_loss_clip": 0.01113991, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.02254474, + "balance_loss_mlp": 1.03829992, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.5467691894783375, + "language_loss": 0.69550622, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71700549, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7837, + "time_per_iteration": 2.480307102203369 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.02703786, + "balance_loss_mlp": 1.03986847, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.8364060529940534, + "language_loss": 0.66015977, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68170524, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 7838, + "time_per_iteration": 2.461975336074829 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.01755917, + "balance_loss_mlp": 1.03790629, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.9120341376079564, + "language_loss": 0.77139461, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79282629, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7839, + "time_per_iteration": 2.4788944721221924 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.0168395, + "balance_loss_mlp": 1.03794789, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 1.9130481219619113, + "language_loss": 0.72918046, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75054491, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 7840, + "time_per_iteration": 2.495239019393921 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01623714, + "balance_loss_mlp": 1.03712356, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.5808172060169028, + "language_loss": 0.74886942, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77024251, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7841, + "time_per_iteration": 2.454484224319458 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02002275, + "balance_loss_mlp": 1.03838921, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.602853925212418, + "language_loss": 0.70333457, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72475922, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 7842, + "time_per_iteration": 2.4781782627105713 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01659727, + "balance_loss_mlp": 1.04060411, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.29732654226483, + "language_loss": 0.78893888, + "learning_rate": 2.280458665756177e-06, + "loss": 0.81034797, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7843, + "time_per_iteration": 2.4125685691833496 + }, + { + "auxiliary_loss_clip": 0.01110204, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01920795, + "balance_loss_mlp": 1.03860044, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6968163407172614, + "language_loss": 0.74375969, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76517189, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 7844, + "time_per_iteration": 3.915900230407715 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.0281688, + "balance_loss_mlp": 1.03888059, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.5835392600478553, + "language_loss": 0.78286111, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80438167, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7845, + "time_per_iteration": 3.8502118587493896 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.02098632, + "balance_loss_mlp": 1.03725934, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.4155938367608039, + "language_loss": 0.7311433, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75253546, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 7846, + "time_per_iteration": 5.374008655548096 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.03715074, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.2885600176299252, + "language_loss": 0.74075842, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76212096, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7847, + "time_per_iteration": 2.5333058834075928 + }, + { + "auxiliary_loss_clip": 0.01110234, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02229548, + "balance_loss_mlp": 1.03908157, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.6263943719256755, + "language_loss": 0.80717957, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82862496, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 7848, + "time_per_iteration": 2.408688545227051 + }, + { + "auxiliary_loss_clip": 0.01115584, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01844501, + "balance_loss_mlp": 1.04345632, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.7499376956487047, + "language_loss": 0.70086265, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72232985, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7849, + "time_per_iteration": 2.453542709350586 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02144289, + "balance_loss_mlp": 1.03961349, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.1591296324254095, + "language_loss": 0.69831544, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71981823, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 7850, + "time_per_iteration": 2.421095371246338 + }, + { + "auxiliary_loss_clip": 0.01115823, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.0175221, + "balance_loss_mlp": 1.04188704, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.815710496912415, + "language_loss": 0.75220203, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.7736643, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7851, + "time_per_iteration": 2.4666483402252197 + }, + { + "auxiliary_loss_clip": 0.01112485, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02262962, + "balance_loss_mlp": 1.03831601, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.7847776856215107, + "language_loss": 0.76165771, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78314561, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7852, + "time_per_iteration": 2.415109395980835 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.01553345, + "balance_loss_mlp": 1.04077876, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.4478461916623044, + "language_loss": 0.68933171, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71073586, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 7853, + "time_per_iteration": 2.4654150009155273 + }, + { + "auxiliary_loss_clip": 0.01033922, + "auxiliary_loss_mlp": 0.00998653, + "balance_loss_clip": 0.99743122, + "balance_loss_mlp": 1.01008511, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.6983660788322832, + "language_loss": 0.50161922, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52194494, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 7854, + "time_per_iteration": 3.190991163253784 + }, + { + "auxiliary_loss_clip": 0.0111395, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.0189656, + "balance_loss_mlp": 1.04039025, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.7794050652620443, + "language_loss": 0.63844812, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.65991443, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 7855, + "time_per_iteration": 2.503614664077759 + }, + { + "auxiliary_loss_clip": 0.01111503, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.02274394, + "balance_loss_mlp": 1.0393486, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.8062233622492851, + "language_loss": 0.75802517, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7794944, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7856, + "time_per_iteration": 2.499197244644165 + }, + { + "auxiliary_loss_clip": 0.01108332, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02270377, + "balance_loss_mlp": 1.03774405, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.914023874649731, + "language_loss": 0.7484442, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76987731, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 7857, + "time_per_iteration": 2.5192370414733887 + }, + { + "auxiliary_loss_clip": 0.01109783, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.03118157, + "balance_loss_mlp": 1.03967714, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4716352183066603, + "language_loss": 0.6482265, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66975653, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7858, + "time_per_iteration": 2.5169341564178467 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02124774, + "balance_loss_mlp": 1.03680444, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.569061056560701, + "language_loss": 0.70402861, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72544539, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 7859, + "time_per_iteration": 2.4850962162017822 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.0245533, + "balance_loss_mlp": 1.03993118, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.957216681544069, + "language_loss": 0.62261212, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64414442, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7860, + "time_per_iteration": 2.435559034347534 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01039582, + "balance_loss_clip": 1.02632594, + "balance_loss_mlp": 1.03998029, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.1159962326169097, + "language_loss": 0.71988773, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.7414242, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7861, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.02440643, + "balance_loss_mlp": 1.03970647, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8695032169355525, + "language_loss": 0.85058391, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87208509, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7862, + "time_per_iteration": 2.439347505569458 + }, + { + "auxiliary_loss_clip": 0.01111085, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.03786755, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.736958967740828, + "language_loss": 0.8456251, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86704469, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7863, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01112215, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.02422917, + "balance_loss_mlp": 1.04029155, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.8450896018132297, + "language_loss": 0.65939879, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68089092, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7864, + "time_per_iteration": 2.430302381515503 + }, + { + "auxiliary_loss_clip": 0.01111041, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03911948, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.898956112201793, + "language_loss": 0.65435767, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67577726, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 7865, + "time_per_iteration": 2.4585866928100586 + }, + { + "auxiliary_loss_clip": 0.01107492, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.02195215, + "balance_loss_mlp": 1.0378449, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 2.8918998215840244, + "language_loss": 0.74357843, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76499236, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 7866, + "time_per_iteration": 2.4264490604400635 + }, + { + "auxiliary_loss_clip": 0.01111501, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01718307, + "balance_loss_mlp": 1.03777552, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 3.2754467592530476, + "language_loss": 0.8285951, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85000992, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7867, + "time_per_iteration": 2.4925811290740967 + }, + { + "auxiliary_loss_clip": 0.011073, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.01655674, + "balance_loss_mlp": 1.03702307, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.5927913973026295, + "language_loss": 0.79137915, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81274265, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7868, + "time_per_iteration": 2.454094171524048 + }, + { + "auxiliary_loss_clip": 0.01114352, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01608682, + "balance_loss_mlp": 1.03858244, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 2.558281214251347, + "language_loss": 0.74588537, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76732659, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7869, + "time_per_iteration": 2.4809184074401855 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.02767277, + "balance_loss_mlp": 1.04122782, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.571794234452096, + "language_loss": 0.73950672, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76107442, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7870, + "time_per_iteration": 2.4553706645965576 + }, + { + "auxiliary_loss_clip": 0.01117025, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04082036, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.9039581815830153, + "language_loss": 0.81513011, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83668333, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7871, + "time_per_iteration": 2.5156424045562744 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.02067888, + "balance_loss_mlp": 1.03990555, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.6438263319482285, + "language_loss": 0.75679815, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77825779, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7872, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.01110565, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02088451, + "balance_loss_mlp": 1.03784847, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.7923349992019921, + "language_loss": 0.67857021, + "learning_rate": 2.268885542903428e-06, + "loss": 0.700019, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7873, + "time_per_iteration": 2.6532957553863525 + }, + { + "auxiliary_loss_clip": 0.01113022, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.01881886, + "balance_loss_mlp": 1.04162037, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6289748569468698, + "language_loss": 0.72085869, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74230838, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 7874, + "time_per_iteration": 2.474073648452759 + }, + { + "auxiliary_loss_clip": 0.01113429, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.02190506, + "balance_loss_mlp": 1.03987253, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.446593699000123, + "language_loss": 0.65108937, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67257631, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7875, + "time_per_iteration": 2.4433648586273193 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.01718342, + "balance_loss_mlp": 1.04219341, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.56524610984038, + "language_loss": 0.81091076, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83237696, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7876, + "time_per_iteration": 2.540485143661499 + }, + { + "auxiliary_loss_clip": 0.01108757, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02232385, + "balance_loss_mlp": 1.0358628, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7859307736041579, + "language_loss": 0.7925123, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81395495, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7877, + "time_per_iteration": 2.627589225769043 + }, + { + "auxiliary_loss_clip": 0.01110689, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01751554, + "balance_loss_mlp": 1.03852785, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.8692095295200843, + "language_loss": 0.70723194, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72864318, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 7878, + "time_per_iteration": 2.535684108734131 + }, + { + "auxiliary_loss_clip": 0.01108668, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.01792121, + "balance_loss_mlp": 1.03918552, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.811278524460759, + "language_loss": 0.75030494, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77169836, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7879, + "time_per_iteration": 2.518188953399658 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.01000904, + "balance_loss_clip": 0.99943775, + "balance_loss_mlp": 1.01098931, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7286317750961989, + "language_loss": 0.6135056, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63386428, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.01464844, + "router_z_loss_mlp": 0.24023438, + "step": 7880, + "time_per_iteration": 3.0518951416015625 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.01922059, + "balance_loss_mlp": 1.03901792, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5146846775966347, + "language_loss": 0.6795128, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70095479, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 7881, + "time_per_iteration": 2.5058367252349854 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01614857, + "balance_loss_mlp": 1.03997886, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.916106799054198, + "language_loss": 0.77455914, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79595923, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7882, + "time_per_iteration": 2.475503921508789 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01675582, + "balance_loss_mlp": 1.03993428, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.505527482540033, + "language_loss": 0.7617712, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78320408, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7883, + "time_per_iteration": 2.5051398277282715 + }, + { + "auxiliary_loss_clip": 0.01111273, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01724708, + "balance_loss_mlp": 1.03893495, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7576670192685107, + "language_loss": 0.71994746, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74135715, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 7884, + "time_per_iteration": 2.4406635761260986 + }, + { + "auxiliary_loss_clip": 0.01117273, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02024651, + "balance_loss_mlp": 1.04002821, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.026641651540024, + "language_loss": 0.82025737, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84177154, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7885, + "time_per_iteration": 2.463895797729492 + }, + { + "auxiliary_loss_clip": 0.01115601, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04353762, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.728500395905687, + "language_loss": 0.73431885, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75582302, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7886, + "time_per_iteration": 3.8351001739501953 + }, + { + "auxiliary_loss_clip": 0.01116571, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03938007, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.1265145819393667, + "language_loss": 0.73465097, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75616348, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7887, + "time_per_iteration": 5.437266111373901 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01562774, + "balance_loss_mlp": 1.03885245, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.895223723891788, + "language_loss": 0.77138984, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79275852, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7888, + "time_per_iteration": 3.8908259868621826 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02004313, + "balance_loss_mlp": 1.04045427, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.663584432705133, + "language_loss": 0.72822642, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74968517, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7889, + "time_per_iteration": 2.5004560947418213 + }, + { + "auxiliary_loss_clip": 0.01036118, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.00301266, + "balance_loss_mlp": 1.0120219, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 1.138520548555467, + "language_loss": 0.5608511, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58125609, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24121094, + "step": 7890, + "time_per_iteration": 3.116922378540039 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02009606, + "balance_loss_mlp": 1.04115105, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.185015538438359, + "language_loss": 0.6552254, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67672396, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.734375, + "step": 7891, + "time_per_iteration": 2.475003242492676 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04182184, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.136023484028619, + "language_loss": 0.70221758, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72375906, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7892, + "time_per_iteration": 2.45662260055542 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01003564, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.01211762, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8097608885887184, + "language_loss": 0.5861572, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60655481, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24121094, + "step": 7893, + "time_per_iteration": 3.1652448177337646 + }, + { + "auxiliary_loss_clip": 0.01114001, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04149461, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.8991850536849317, + "language_loss": 0.77645361, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79793239, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7894, + "time_per_iteration": 2.4849085807800293 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.01912403, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.6188047164673534, + "language_loss": 0.74456996, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76601076, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7895, + "time_per_iteration": 2.456735372543335 + }, + { + "auxiliary_loss_clip": 0.01110765, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01553416, + "balance_loss_mlp": 1.03990245, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.9073077974003343, + "language_loss": 0.82539713, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84679627, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 7896, + "time_per_iteration": 2.5201456546783447 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.0161047, + "balance_loss_mlp": 1.03953171, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.922550471395919, + "language_loss": 0.75487721, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77628207, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 7897, + "time_per_iteration": 2.42526912689209 + }, + { + "auxiliary_loss_clip": 0.01116598, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.02316415, + "balance_loss_mlp": 1.04003334, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.1696415620255145, + "language_loss": 0.63682836, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65835488, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7898, + "time_per_iteration": 2.443390369415283 + }, + { + "auxiliary_loss_clip": 0.01111767, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02263284, + "balance_loss_mlp": 1.03901982, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.0733269605967997, + "language_loss": 0.6999402, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72141939, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7899, + "time_per_iteration": 2.5906245708465576 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02314603, + "balance_loss_mlp": 1.04054523, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.8534573860401393, + "language_loss": 0.68523431, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70675093, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7900, + "time_per_iteration": 2.5417144298553467 + }, + { + "auxiliary_loss_clip": 0.01112761, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02096641, + "balance_loss_mlp": 1.03979492, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9751823447072345, + "language_loss": 0.70783907, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72930533, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7901, + "time_per_iteration": 2.5215682983398438 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02833235, + "balance_loss_mlp": 1.0420568, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.7245601487210742, + "language_loss": 0.73674953, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75831395, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7902, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.02169394, + "balance_loss_mlp": 1.03990698, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.6802974507725348, + "language_loss": 0.68601072, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70743585, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 7903, + "time_per_iteration": 2.44101619720459 + }, + { + "auxiliary_loss_clip": 0.01108361, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03901863, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4630263980427167, + "language_loss": 0.7225582, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74396306, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7904, + "time_per_iteration": 2.469230890274048 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.01894665, + "balance_loss_mlp": 1.03912354, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.669936371268517, + "language_loss": 0.86257637, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88397133, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7905, + "time_per_iteration": 2.442215919494629 + }, + { + "auxiliary_loss_clip": 0.0110692, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01750422, + "balance_loss_mlp": 1.03796053, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.6116801799731275, + "language_loss": 0.82223809, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84360093, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 7906, + "time_per_iteration": 2.503708600997925 + }, + { + "auxiliary_loss_clip": 0.01033043, + "auxiliary_loss_mlp": 0.01004824, + "balance_loss_clip": 1.00345886, + "balance_loss_mlp": 1.00910616, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6702574149317626, + "language_loss": 0.59028685, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61066544, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.23925781, + "step": 7907, + "time_per_iteration": 3.156270980834961 + }, + { + "auxiliary_loss_clip": 0.01112242, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.02349377, + "balance_loss_mlp": 1.04145598, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.9115330257313565, + "language_loss": 0.81044137, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83192551, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7908, + "time_per_iteration": 2.4719884395599365 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.01944923, + "balance_loss_mlp": 1.04349983, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.7275790068018955, + "language_loss": 0.73515987, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75665224, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 7909, + "time_per_iteration": 2.4672436714172363 + }, + { + "auxiliary_loss_clip": 0.01110088, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.01887441, + "balance_loss_mlp": 1.03941047, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.618978075546398, + "language_loss": 0.75284743, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77426249, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 7910, + "time_per_iteration": 2.498745918273926 + }, + { + "auxiliary_loss_clip": 0.0110873, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03872323, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.8146975429148502, + "language_loss": 0.78950047, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81085479, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7911, + "time_per_iteration": 2.4530739784240723 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01709199, + "balance_loss_mlp": 1.03904319, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.5788116451196046, + "language_loss": 0.75611186, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77754539, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7912, + "time_per_iteration": 2.468348741531372 + }, + { + "auxiliary_loss_clip": 0.01107815, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.02094245, + "balance_loss_mlp": 1.03746927, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.4305595105203048, + "language_loss": 0.74305665, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76446521, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7913, + "time_per_iteration": 2.4857094287872314 + }, + { + "auxiliary_loss_clip": 0.01111637, + "auxiliary_loss_mlp": 0.01026142, + "balance_loss_clip": 1.01336265, + "balance_loss_mlp": 1.04057527, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 1.9652679728787295, + "language_loss": 0.72320372, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74458152, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7914, + "time_per_iteration": 2.4559848308563232 + }, + { + "auxiliary_loss_clip": 0.01114052, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.0191946, + "balance_loss_mlp": 1.04203475, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 1.960460869956429, + "language_loss": 0.64513958, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66659272, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7915, + "time_per_iteration": 2.4528729915618896 + }, + { + "auxiliary_loss_clip": 0.01106319, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.020257, + "balance_loss_mlp": 1.03847694, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.6765568872542898, + "language_loss": 0.76760435, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.7889936, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 7916, + "time_per_iteration": 2.4544637203216553 + }, + { + "auxiliary_loss_clip": 0.01109831, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01720083, + "balance_loss_mlp": 1.038872, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.7964770898598468, + "language_loss": 0.64513361, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66652668, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7917, + "time_per_iteration": 2.4966535568237305 + }, + { + "auxiliary_loss_clip": 0.01033431, + "auxiliary_loss_mlp": 0.01003778, + "balance_loss_clip": 1.00258541, + "balance_loss_mlp": 1.00975943, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8336021747517385, + "language_loss": 0.6568867, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67725885, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.23632812, + "step": 7918, + "time_per_iteration": 3.0902352333068848 + }, + { + "auxiliary_loss_clip": 0.01111138, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.01937342, + "balance_loss_mlp": 1.03909731, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.7210259476746916, + "language_loss": 0.6884234, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70985305, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7919, + "time_per_iteration": 2.451730728149414 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02188087, + "balance_loss_mlp": 1.03897047, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.5380536315740185, + "language_loss": 0.74750632, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7689606, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7920, + "time_per_iteration": 2.5365359783172607 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.01854539, + "balance_loss_mlp": 1.04174948, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4190261222987137, + "language_loss": 0.77478063, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79626137, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7921, + "time_per_iteration": 2.4841856956481934 + }, + { + "auxiliary_loss_clip": 0.01112061, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02113843, + "balance_loss_mlp": 1.03917885, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.531083685843196, + "language_loss": 0.78213, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80360126, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7922, + "time_per_iteration": 2.537930965423584 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02360809, + "balance_loss_mlp": 1.04113102, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.7101716924021442, + "language_loss": 0.72932559, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75085688, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7923, + "time_per_iteration": 2.4527640342712402 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03808331, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 2.125534979901623, + "language_loss": 0.81915551, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84058034, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 7924, + "time_per_iteration": 2.480109930038452 + }, + { + "auxiliary_loss_clip": 0.01120558, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.0246644, + "balance_loss_mlp": 1.04359889, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.7710398873833821, + "language_loss": 0.80079067, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82237971, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7925, + "time_per_iteration": 2.4877142906188965 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.02343702, + "balance_loss_mlp": 1.03800642, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 2.066985409764694, + "language_loss": 0.72263825, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74410343, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7926, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.01623797, + "balance_loss_mlp": 1.04205072, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 8.404578303652414, + "language_loss": 0.68589562, + "learning_rate": 2.248031062546432e-06, + "loss": 0.7073611, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7927, + "time_per_iteration": 2.4860117435455322 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.0138253, + "balance_loss_mlp": 1.04121518, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.5906069345122125, + "language_loss": 0.68003678, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70140767, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 7928, + "time_per_iteration": 3.917212724685669 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.0160315, + "balance_loss_mlp": 1.04099739, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.0359036820122762, + "language_loss": 0.79055941, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81197274, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7929, + "time_per_iteration": 5.38159441947937 + }, + { + "auxiliary_loss_clip": 0.01109888, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02098787, + "balance_loss_mlp": 1.04033756, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.8427147864954625, + "language_loss": 0.6634798, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68490613, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 7930, + "time_per_iteration": 4.1562559604644775 + }, + { + "auxiliary_loss_clip": 0.01110022, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03929853, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.7695493738399266, + "language_loss": 0.80279613, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82420039, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7931, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.01112785, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01667237, + "balance_loss_mlp": 1.04009867, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.714860616709588, + "language_loss": 0.75956833, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78099489, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7932, + "time_per_iteration": 2.4789490699768066 + }, + { + "auxiliary_loss_clip": 0.0111028, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02110386, + "balance_loss_mlp": 1.04108882, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 2.3368480026304748, + "language_loss": 0.79639196, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81783438, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7933, + "time_per_iteration": 2.4574432373046875 + }, + { + "auxiliary_loss_clip": 0.01116858, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02131057, + "balance_loss_mlp": 1.04114437, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.7879612820388389, + "language_loss": 0.73776019, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.759287, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 7934, + "time_per_iteration": 2.4703593254089355 + }, + { + "auxiliary_loss_clip": 0.0111259, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.01611567, + "balance_loss_mlp": 1.03858674, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.719427707895152, + "language_loss": 0.7973842, + "learning_rate": 2.244939121664211e-06, + "loss": 0.81879967, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 7935, + "time_per_iteration": 2.459326982498169 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02566767, + "balance_loss_mlp": 1.04244995, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.7712234775739364, + "language_loss": 0.71105671, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73264545, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 7936, + "time_per_iteration": 2.599914312362671 + }, + { + "auxiliary_loss_clip": 0.01112402, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.01529551, + "balance_loss_mlp": 1.03864932, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.8731818732430927, + "language_loss": 0.68026948, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.7016744, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7937, + "time_per_iteration": 2.4884297847747803 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.00999711, + "balance_loss_clip": 0.99838793, + "balance_loss_mlp": 1.01120472, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7133873095384958, + "language_loss": 0.56401992, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58437109, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2421875, + "step": 7938, + "time_per_iteration": 3.27707576751709 + }, + { + "auxiliary_loss_clip": 0.01113753, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.01889467, + "balance_loss_mlp": 1.04162848, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.6305385471502185, + "language_loss": 0.88721037, + "learning_rate": 2.243392927839317e-06, + "loss": 0.9086687, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7939, + "time_per_iteration": 2.503838300704956 + }, + { + "auxiliary_loss_clip": 0.01110311, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02037096, + "balance_loss_mlp": 1.03832293, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 2.146362570276984, + "language_loss": 0.76661658, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.78804338, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7940, + "time_per_iteration": 2.4230127334594727 + }, + { + "auxiliary_loss_clip": 0.01109098, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.0194304, + "balance_loss_mlp": 1.03975916, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.568994035010224, + "language_loss": 0.84892023, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87032247, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 7941, + "time_per_iteration": 2.4640510082244873 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.02263689, + "balance_loss_mlp": 1.04307771, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 2.0154740266117104, + "language_loss": 0.75996536, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78149283, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7942, + "time_per_iteration": 2.4304351806640625 + }, + { + "auxiliary_loss_clip": 0.01113984, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.02646661, + "balance_loss_mlp": 1.0415473, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.8198127192389717, + "language_loss": 0.64578187, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66730648, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7943, + "time_per_iteration": 2.469884157180786 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.02318239, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.6437441778624493, + "language_loss": 0.73638076, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75789517, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7944, + "time_per_iteration": 2.462620258331299 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02092481, + "balance_loss_mlp": 1.04105759, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.2015870606275785, + "language_loss": 0.67936689, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70085418, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7945, + "time_per_iteration": 2.498506784439087 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02483487, + "balance_loss_mlp": 1.03805077, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.8282867356700874, + "language_loss": 0.75330615, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77477872, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7946, + "time_per_iteration": 2.5168514251708984 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.02759135, + "balance_loss_mlp": 1.04283607, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7491504350819331, + "language_loss": 0.79312646, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81470287, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 7947, + "time_per_iteration": 2.5980498790740967 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02433419, + "balance_loss_mlp": 1.0381552, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.7633094448758173, + "language_loss": 0.73717982, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75862265, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 7948, + "time_per_iteration": 2.446190357208252 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02050555, + "balance_loss_mlp": 1.04240656, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.5048270934573464, + "language_loss": 0.77945703, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80094588, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7949, + "time_per_iteration": 2.4999916553497314 + }, + { + "auxiliary_loss_clip": 0.01107805, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.02077556, + "balance_loss_mlp": 1.0387454, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.112378262987889, + "language_loss": 0.74019569, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.7616021, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 7950, + "time_per_iteration": 2.4387645721435547 + }, + { + "auxiliary_loss_clip": 0.01110159, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02609253, + "balance_loss_mlp": 1.03978574, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.7104198942075015, + "language_loss": 0.74135828, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76285648, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.703125, + "step": 7951, + "time_per_iteration": 2.579258680343628 + }, + { + "auxiliary_loss_clip": 0.01112662, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03915167, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 1.8112920130665326, + "language_loss": 0.79960251, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82105488, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7952, + "time_per_iteration": 2.5007214546203613 + }, + { + "auxiliary_loss_clip": 0.01114258, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.02687836, + "balance_loss_mlp": 1.04040217, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.7026148138194093, + "language_loss": 0.78196061, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80350602, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7953, + "time_per_iteration": 2.4699995517730713 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.02362204, + "balance_loss_mlp": 1.0405128, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.2363441879819224, + "language_loss": 0.84142399, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86293399, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7954, + "time_per_iteration": 2.41294527053833 + }, + { + "auxiliary_loss_clip": 0.01109876, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02254462, + "balance_loss_mlp": 1.03839588, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.442835840236476, + "language_loss": 0.70588672, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72734004, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 7955, + "time_per_iteration": 2.4867892265319824 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.02507281, + "balance_loss_mlp": 1.03925073, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5835230785797205, + "language_loss": 0.817267, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83875084, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7956, + "time_per_iteration": 2.4756619930267334 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.04097366, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.8961411498697718, + "language_loss": 0.84901869, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87047327, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 7957, + "time_per_iteration": 2.4848859310150146 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.02065289, + "balance_loss_mlp": 1.0396328, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.5799276625975138, + "language_loss": 0.79682672, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81826073, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7958, + "time_per_iteration": 2.439040422439575 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02074742, + "balance_loss_mlp": 1.03806448, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 2.0401185124291406, + "language_loss": 0.82728368, + "learning_rate": 2.235659762404047e-06, + "loss": 0.8487246, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 7959, + "time_per_iteration": 2.500182867050171 + }, + { + "auxiliary_loss_clip": 0.01108176, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.0219152, + "balance_loss_mlp": 1.04054058, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.3853858164000292, + "language_loss": 0.7333414, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75475383, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 7960, + "time_per_iteration": 2.4852850437164307 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03937268, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.8739024393884087, + "language_loss": 0.77067018, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79211915, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 7961, + "time_per_iteration": 2.482361316680908 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01817513, + "balance_loss_mlp": 1.03838158, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.629700477315198, + "language_loss": 0.77528512, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.7966699, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7962, + "time_per_iteration": 2.427537679672241 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01034413, + "balance_loss_clip": 1.02196801, + "balance_loss_mlp": 1.04174328, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.5913499246445781, + "language_loss": 0.64895082, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67042321, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7963, + "time_per_iteration": 2.51082181930542 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.01748848, + "balance_loss_mlp": 1.03972077, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.658229101322456, + "language_loss": 0.77974397, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80115253, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7964, + "time_per_iteration": 2.6512372493743896 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01617479, + "balance_loss_mlp": 1.03944111, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.7558149312417117, + "language_loss": 0.76227248, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78371561, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7965, + "time_per_iteration": 2.4919536113739014 + }, + { + "auxiliary_loss_clip": 0.01108501, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.01957512, + "balance_loss_mlp": 1.0382036, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 2.251400870531799, + "language_loss": 0.74590349, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76730978, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 7966, + "time_per_iteration": 2.4254770278930664 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.0169003, + "balance_loss_mlp": 1.03785586, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.521959054408531, + "language_loss": 0.72728515, + "learning_rate": 2.232565488801655e-06, + "loss": 0.74866927, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7967, + "time_per_iteration": 2.522883892059326 + }, + { + "auxiliary_loss_clip": 0.01103831, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01433849, + "balance_loss_mlp": 1.0371958, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 2.344774601020355, + "language_loss": 0.79174602, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81304824, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 7968, + "time_per_iteration": 2.4777579307556152 + }, + { + "auxiliary_loss_clip": 0.01035385, + "auxiliary_loss_mlp": 0.01007575, + "balance_loss_clip": 1.00622833, + "balance_loss_mlp": 1.0118711, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7636022901302345, + "language_loss": 0.62258303, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64301264, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.23535156, + "step": 7969, + "time_per_iteration": 4.618057012557983 + }, + { + "auxiliary_loss_clip": 0.01107101, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01555026, + "balance_loss_mlp": 1.04000521, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.5307915717866403, + "language_loss": 0.77086926, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79221207, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 7970, + "time_per_iteration": 2.469363212585449 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.01962733, + "balance_loss_mlp": 1.03676999, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.595425961628827, + "language_loss": 0.70320344, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72459716, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 7971, + "time_per_iteration": 5.436426401138306 + }, + { + "auxiliary_loss_clip": 0.01107204, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01228452, + "balance_loss_mlp": 1.03725302, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.2757793979028687, + "language_loss": 0.79909688, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82042515, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 7972, + "time_per_iteration": 2.4788928031921387 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01220274, + "balance_loss_mlp": 1.03801394, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.154896563362021, + "language_loss": 0.69762838, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71897495, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 7973, + "time_per_iteration": 2.462674140930176 + }, + { + "auxiliary_loss_clip": 0.01108438, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.01943266, + "balance_loss_mlp": 1.0401777, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.7300676969557445, + "language_loss": 0.78652924, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80792892, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 7974, + "time_per_iteration": 2.523935079574585 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.01004075, + "balance_loss_clip": 1.00275135, + "balance_loss_mlp": 1.01174331, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7575595850509929, + "language_loss": 0.54076326, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56115806, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23632812, + "step": 7975, + "time_per_iteration": 3.120290756225586 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.01946688, + "balance_loss_mlp": 1.03872228, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.0952625936259226, + "language_loss": 0.90246761, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92393565, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7976, + "time_per_iteration": 2.4177215099334717 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.02070153, + "balance_loss_mlp": 1.03989267, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.1692733838107148, + "language_loss": 0.73631197, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75779295, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7977, + "time_per_iteration": 2.478994846343994 + }, + { + "auxiliary_loss_clip": 0.01105095, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.02095962, + "balance_loss_mlp": 1.03737617, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5189317692466735, + "language_loss": 0.78386033, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80524224, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 7978, + "time_per_iteration": 2.441770315170288 + }, + { + "auxiliary_loss_clip": 0.01110092, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03895688, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.9080949377976553, + "language_loss": 0.89561266, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91704339, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7979, + "time_per_iteration": 2.5005874633789062 + }, + { + "auxiliary_loss_clip": 0.01111373, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03977728, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.512941625260848, + "language_loss": 0.77104276, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79248011, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 7980, + "time_per_iteration": 2.514702320098877 + }, + { + "auxiliary_loss_clip": 0.01112304, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.0177083, + "balance_loss_mlp": 1.03812611, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.6709892763913308, + "language_loss": 0.71718562, + "learning_rate": 2.227149156404295e-06, + "loss": 0.738626, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7981, + "time_per_iteration": 2.606919050216675 + }, + { + "auxiliary_loss_clip": 0.01107255, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01743317, + "balance_loss_mlp": 1.03878653, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.7550369517172573, + "language_loss": 0.70141387, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72278404, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 7982, + "time_per_iteration": 2.4303736686706543 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.01558483, + "balance_loss_mlp": 1.03694749, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 2.256566494766253, + "language_loss": 0.70977259, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73106241, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 7983, + "time_per_iteration": 2.520749092102051 + }, + { + "auxiliary_loss_clip": 0.01032541, + "auxiliary_loss_mlp": 0.01011047, + "balance_loss_clip": 1.00992036, + "balance_loss_mlp": 1.00916195, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8049867321392653, + "language_loss": 0.59458363, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.6150195, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.234375, + "step": 7984, + "time_per_iteration": 3.0019614696502686 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.01912713, + "balance_loss_mlp": 1.0376364, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.5803111762139084, + "language_loss": 0.66603255, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68742514, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7985, + "time_per_iteration": 2.459381341934204 + }, + { + "auxiliary_loss_clip": 0.01108889, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02279973, + "balance_loss_mlp": 1.03655791, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.8105960725352928, + "language_loss": 0.70750952, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72895944, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7986, + "time_per_iteration": 2.412890911102295 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.02515244, + "balance_loss_mlp": 1.03964305, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.571002176109277, + "language_loss": 0.78704774, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.80857182, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7987, + "time_per_iteration": 2.464531898498535 + }, + { + "auxiliary_loss_clip": 0.01107017, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02410507, + "balance_loss_mlp": 1.03615475, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.8312114483143844, + "language_loss": 0.75309592, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77453303, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7988, + "time_per_iteration": 2.4185469150543213 + }, + { + "auxiliary_loss_clip": 0.01113071, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01667249, + "balance_loss_mlp": 1.04115009, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.9770525324174564, + "language_loss": 0.78992975, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81135416, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7989, + "time_per_iteration": 2.4614450931549072 + }, + { + "auxiliary_loss_clip": 0.01113161, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.02273488, + "balance_loss_mlp": 1.03810394, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.6525338075260034, + "language_loss": 0.73414218, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75564027, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7990, + "time_per_iteration": 2.562366008758545 + }, + { + "auxiliary_loss_clip": 0.01032695, + "auxiliary_loss_mlp": 0.0100018, + "balance_loss_clip": 0.99901813, + "balance_loss_mlp": 1.00915992, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 1.0595345338831614, + "language_loss": 0.59085703, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61118573, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 7991, + "time_per_iteration": 3.1877033710479736 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.01626837, + "balance_loss_mlp": 1.03751063, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.8662124275999659, + "language_loss": 0.67495418, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69632453, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7992, + "time_per_iteration": 2.5135016441345215 + }, + { + "auxiliary_loss_clip": 0.01109706, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01969361, + "balance_loss_mlp": 1.03664112, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6211148746347477, + "language_loss": 0.76493919, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78636301, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7993, + "time_per_iteration": 2.5075619220733643 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.02267301, + "balance_loss_mlp": 1.03899574, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.5028541481112037, + "language_loss": 0.78277898, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80421537, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7994, + "time_per_iteration": 2.4792723655700684 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.02010691, + "balance_loss_mlp": 1.03752637, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.8792905950371066, + "language_loss": 0.79627287, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81768769, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7995, + "time_per_iteration": 2.4605226516723633 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.01676297, + "balance_loss_mlp": 1.03693795, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.8681673839648991, + "language_loss": 0.8255161, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84688872, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7996, + "time_per_iteration": 2.4627599716186523 + }, + { + "auxiliary_loss_clip": 0.01108595, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01562989, + "balance_loss_mlp": 1.03879523, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.619215200240117, + "language_loss": 0.80642337, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82779169, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7997, + "time_per_iteration": 2.450486660003662 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02174699, + "balance_loss_mlp": 1.03695917, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.8605056175819474, + "language_loss": 0.72481054, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74624306, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7998, + "time_per_iteration": 2.484501361846924 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.01893795, + "balance_loss_mlp": 1.03890526, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.7021894106986095, + "language_loss": 0.71182632, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73325378, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7999, + "time_per_iteration": 2.5011837482452393 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02758801, + "balance_loss_mlp": 1.03862715, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 2.087936802810397, + "language_loss": 0.71136171, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73289621, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 8000, + "time_per_iteration": 2.473083019256592 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02291203, + "balance_loss_mlp": 1.03987443, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.2945806687832948, + "language_loss": 0.75104553, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77252746, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8001, + "time_per_iteration": 2.6078953742980957 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.02701581, + "balance_loss_mlp": 1.03889596, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.8640621993165467, + "language_loss": 0.81407833, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83560812, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8002, + "time_per_iteration": 2.4381091594696045 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.02415216, + "balance_loss_mlp": 1.04037309, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7407260367663493, + "language_loss": 0.71673185, + "learning_rate": 2.218634381467819e-06, + "loss": 0.7382561, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8003, + "time_per_iteration": 2.5028979778289795 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.04041362, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.9713418243952783, + "language_loss": 0.82751715, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84901035, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8004, + "time_per_iteration": 2.4438235759735107 + }, + { + "auxiliary_loss_clip": 0.0112055, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.02951062, + "balance_loss_mlp": 1.04235947, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.0081127141146964, + "language_loss": 0.77780354, + "learning_rate": 2.217860109695239e-06, + "loss": 0.7994566, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 8005, + "time_per_iteration": 2.4440789222717285 + }, + { + "auxiliary_loss_clip": 0.01109918, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0218395, + "balance_loss_mlp": 1.03705537, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 3.988142696329101, + "language_loss": 0.70656502, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72801799, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 8006, + "time_per_iteration": 2.4627490043640137 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.02357328, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.9148811651735764, + "language_loss": 0.70463514, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72611892, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8007, + "time_per_iteration": 2.4923551082611084 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02225244, + "balance_loss_mlp": 1.03924334, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.0099977087556202, + "language_loss": 0.71720552, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7386902, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8008, + "time_per_iteration": 2.443068742752075 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01046807, + "balance_loss_clip": 1.0317508, + "balance_loss_mlp": 1.03984571, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.7155117192574523, + "language_loss": 0.60448718, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62610233, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.74609375, + "step": 8009, + "time_per_iteration": 2.4860730171203613 + }, + { + "auxiliary_loss_clip": 0.01041953, + "auxiliary_loss_mlp": 0.01003034, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.01788867, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.861211973736155, + "language_loss": 0.61329502, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.6337449, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.24121094, + "step": 8010, + "time_per_iteration": 3.073617935180664 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01045892, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.04191947, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 2.200850795507016, + "language_loss": 0.73003197, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75164282, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.734375, + "step": 8011, + "time_per_iteration": 3.875464677810669 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.02398205, + "balance_loss_mlp": 1.03922546, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.7669872730797296, + "language_loss": 0.79906964, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82052571, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8012, + "time_per_iteration": 5.410374164581299 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01047469, + "balance_loss_clip": 1.03336632, + "balance_loss_mlp": 1.04086518, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.5982967759080098, + "language_loss": 0.73816693, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75978434, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 8013, + "time_per_iteration": 4.00807785987854 + }, + { + "auxiliary_loss_clip": 0.01112131, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.02043676, + "balance_loss_mlp": 1.04102039, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 4.768803838152643, + "language_loss": 0.90554619, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92699754, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8014, + "time_per_iteration": 2.4615042209625244 + }, + { + "auxiliary_loss_clip": 0.01116604, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.02456379, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 3.0531094865391073, + "language_loss": 0.74407947, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76562929, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 8015, + "time_per_iteration": 2.434838056564331 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.02588332, + "balance_loss_mlp": 1.04072225, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.017951331310383, + "language_loss": 0.8059243, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82750583, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 8016, + "time_per_iteration": 2.513319492340088 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.02206254, + "balance_loss_mlp": 1.04101717, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 2.4127244097624847, + "language_loss": 0.76781118, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.78925556, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 8017, + "time_per_iteration": 2.4602606296539307 + }, + { + "auxiliary_loss_clip": 0.011107, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.04151559, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.9887798442379552, + "language_loss": 0.80156118, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82299387, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 8018, + "time_per_iteration": 2.5529282093048096 + }, + { + "auxiliary_loss_clip": 0.01113443, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.01667559, + "balance_loss_mlp": 1.04109669, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.7653706812529009, + "language_loss": 0.75843483, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.77986348, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8019, + "time_per_iteration": 2.4978489875793457 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.02286255, + "balance_loss_mlp": 1.03955722, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.7828460534537498, + "language_loss": 0.78554976, + "learning_rate": 2.212052026199701e-06, + "loss": 0.80704254, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 8020, + "time_per_iteration": 2.503870725631714 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.02043533, + "balance_loss_mlp": 1.04134321, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.4275685595470207, + "language_loss": 0.69718045, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71865243, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8021, + "time_per_iteration": 2.4298038482666016 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.01797438, + "balance_loss_mlp": 1.0407902, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.6547112313669838, + "language_loss": 0.62773043, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.64921963, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8022, + "time_per_iteration": 2.4862682819366455 + }, + { + "auxiliary_loss_clip": 0.01109497, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01683092, + "balance_loss_mlp": 1.03976464, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.257171661165274, + "language_loss": 0.66345549, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68484527, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8023, + "time_per_iteration": 2.4498074054718018 + }, + { + "auxiliary_loss_clip": 0.01109691, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.02181077, + "balance_loss_mlp": 1.0379076, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 2.6609441563285485, + "language_loss": 0.76680458, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78825533, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8024, + "time_per_iteration": 2.5641326904296875 + }, + { + "auxiliary_loss_clip": 0.01111982, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.01926339, + "balance_loss_mlp": 1.03856826, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.4456982310337658, + "language_loss": 0.75299227, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77445179, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8025, + "time_per_iteration": 2.4700748920440674 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.02159774, + "balance_loss_mlp": 1.04015994, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.85740453148256, + "language_loss": 0.71010149, + "learning_rate": 2.209728283441112e-06, + "loss": 0.7315712, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8026, + "time_per_iteration": 2.451942205429077 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02739012, + "balance_loss_mlp": 1.04088664, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.002376238963681, + "language_loss": 0.74738306, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76897156, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.75, + "step": 8027, + "time_per_iteration": 2.511625051498413 + }, + { + "auxiliary_loss_clip": 0.01116324, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.02260458, + "balance_loss_mlp": 1.0418303, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.8015680699639052, + "language_loss": 0.6744982, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69602323, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 8028, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01114464, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.02490783, + "balance_loss_mlp": 1.04192257, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.8869203156454395, + "language_loss": 0.73063505, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75217235, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7265625, + "step": 8029, + "time_per_iteration": 2.4256598949432373 + }, + { + "auxiliary_loss_clip": 0.01114009, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.01691651, + "balance_loss_mlp": 1.03949094, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 1.9568889088417416, + "language_loss": 0.85374999, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87520409, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8030, + "time_per_iteration": 2.4838480949401855 + }, + { + "auxiliary_loss_clip": 0.01111314, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.01917291, + "balance_loss_mlp": 1.03858352, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.946134860300181, + "language_loss": 0.74173188, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76316977, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8031, + "time_per_iteration": 2.475564956665039 + }, + { + "auxiliary_loss_clip": 0.01118074, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.03023958, + "balance_loss_mlp": 1.04181576, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.8194651882134072, + "language_loss": 0.71833324, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73996472, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 8032, + "time_per_iteration": 2.5389230251312256 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.02326274, + "balance_loss_mlp": 1.03896618, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.5190699612157064, + "language_loss": 0.74008, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76156777, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8033, + "time_per_iteration": 2.497344493865967 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.02273428, + "balance_loss_mlp": 1.04200494, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.7070178882470917, + "language_loss": 0.82929307, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85084462, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 8034, + "time_per_iteration": 2.504986524581909 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.04048431, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 2.2841237596844493, + "language_loss": 0.79519325, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81662393, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 8035, + "time_per_iteration": 2.497851610183716 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.02656746, + "balance_loss_mlp": 1.04139149, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.7925521800027493, + "language_loss": 0.69359076, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71516669, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7421875, + "step": 8036, + "time_per_iteration": 2.6260759830474854 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.0211308, + "balance_loss_mlp": 1.03983057, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 2.034912964838748, + "language_loss": 0.72518653, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74665534, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8037, + "time_per_iteration": 2.4452965259552 + }, + { + "auxiliary_loss_clip": 0.01120439, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.02053404, + "balance_loss_mlp": 1.04226792, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.369475157435804, + "language_loss": 0.69122416, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71278501, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 8038, + "time_per_iteration": 2.4694747924804688 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.02201188, + "balance_loss_mlp": 1.04133189, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.4952565926757524, + "language_loss": 0.78972542, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8112368, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 8039, + "time_per_iteration": 2.5778839588165283 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.01874638, + "balance_loss_mlp": 1.04335415, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.6799663014860025, + "language_loss": 0.76981616, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79131073, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8040, + "time_per_iteration": 2.4846322536468506 + }, + { + "auxiliary_loss_clip": 0.01116146, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.02283335, + "balance_loss_mlp": 1.04120946, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.5584368035119462, + "language_loss": 0.75443131, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77597177, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 8041, + "time_per_iteration": 2.5853140354156494 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.02178383, + "balance_loss_mlp": 1.04486728, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.8135207231669344, + "language_loss": 0.66745925, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68897855, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 8042, + "time_per_iteration": 2.5322182178497314 + }, + { + "auxiliary_loss_clip": 0.01040325, + "auxiliary_loss_mlp": 0.00998367, + "balance_loss_clip": 0.99714488, + "balance_loss_mlp": 1.0165081, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.687656922942032, + "language_loss": 0.58557642, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60596335, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 8043, + "time_per_iteration": 3.1435444355010986 + }, + { + "auxiliary_loss_clip": 0.01115264, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.01982713, + "balance_loss_mlp": 1.04060805, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 1.8614249809437893, + "language_loss": 0.71973354, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.7412324, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8044, + "time_per_iteration": 2.4688329696655273 + }, + { + "auxiliary_loss_clip": 0.01113296, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.01702118, + "balance_loss_mlp": 1.04181921, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.358705165779184, + "language_loss": 0.75938857, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78084195, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.71484375, + "step": 8045, + "time_per_iteration": 2.455991506576538 + }, + { + "auxiliary_loss_clip": 0.01117445, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.04251719, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.8505124624812508, + "language_loss": 0.69661564, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71819568, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 8046, + "time_per_iteration": 2.480437994003296 + }, + { + "auxiliary_loss_clip": 0.01113741, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.04073739, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 3.209923694390607, + "language_loss": 0.819103, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84060085, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8047, + "time_per_iteration": 2.4875996112823486 + }, + { + "auxiliary_loss_clip": 0.01111465, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.01802719, + "balance_loss_mlp": 1.04047942, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.602624612336977, + "language_loss": 0.80215144, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82358307, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7109375, + "step": 8048, + "time_per_iteration": 2.5097532272338867 + }, + { + "auxiliary_loss_clip": 0.0111735, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.02204585, + "balance_loss_mlp": 1.0415504, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.5504815305200743, + "language_loss": 0.81360143, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83514082, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8049, + "time_per_iteration": 2.5025296211242676 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.02170801, + "balance_loss_mlp": 1.04200411, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.7765572151997517, + "language_loss": 0.72636938, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74782485, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8050, + "time_per_iteration": 2.4983279705047607 + }, + { + "auxiliary_loss_clip": 0.01039152, + "auxiliary_loss_mlp": 0.01005399, + "balance_loss_clip": 1.00414741, + "balance_loss_mlp": 1.01505625, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7015070380534334, + "language_loss": 0.56459856, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58504415, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24121094, + "step": 8051, + "time_per_iteration": 3.1124837398529053 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.0198456, + "balance_loss_mlp": 1.04258502, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.416646260203107, + "language_loss": 0.7510823, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77258313, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8052, + "time_per_iteration": 3.970653772354126 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.01953709, + "balance_loss_mlp": 1.04124272, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 3.0848333967382855, + "language_loss": 0.65859687, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68007052, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8053, + "time_per_iteration": 2.489314079284668 + }, + { + "auxiliary_loss_clip": 0.01108306, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.01981306, + "balance_loss_mlp": 1.03776336, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.8753990029707186, + "language_loss": 0.6933912, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71480489, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8054, + "time_per_iteration": 4.118170976638794 + }, + { + "auxiliary_loss_clip": 0.01110556, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.02187181, + "balance_loss_mlp": 1.03860784, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.7081803235265158, + "language_loss": 0.69577026, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.7172299, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8055, + "time_per_iteration": 3.932403326034546 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.0229013, + "balance_loss_mlp": 1.04260492, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.9345474086324397, + "language_loss": 0.631603, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65312105, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8056, + "time_per_iteration": 2.4628608226776123 + }, + { + "auxiliary_loss_clip": 0.01110953, + "auxiliary_loss_mlp": 0.0103397, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03856075, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.6727278675155979, + "language_loss": 0.67380416, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69525343, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 8057, + "time_per_iteration": 2.5488758087158203 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.02286661, + "balance_loss_mlp": 1.03944063, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.62294394814829, + "language_loss": 0.81633735, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83780485, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8058, + "time_per_iteration": 2.4864389896392822 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.02458692, + "balance_loss_mlp": 1.04142284, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.5675258134335472, + "language_loss": 0.79917222, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82072222, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8059, + "time_per_iteration": 2.4964730739593506 + }, + { + "auxiliary_loss_clip": 0.01117834, + "auxiliary_loss_mlp": 0.0104156, + "balance_loss_clip": 1.02709424, + "balance_loss_mlp": 1.04217446, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.4233986338774347, + "language_loss": 0.66882968, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69042355, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8060, + "time_per_iteration": 2.6209259033203125 + }, + { + "auxiliary_loss_clip": 0.01116591, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.04357326, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.8494683744964096, + "language_loss": 0.67328548, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69485319, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8061, + "time_per_iteration": 2.460986614227295 + }, + { + "auxiliary_loss_clip": 0.01116735, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.0274322, + "balance_loss_mlp": 1.04356933, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 2.133282380017761, + "language_loss": 0.82559311, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84717953, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8062, + "time_per_iteration": 2.453993320465088 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.022277, + "balance_loss_mlp": 1.04087675, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.7643008090816974, + "language_loss": 0.7443378, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76581317, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8063, + "time_per_iteration": 2.4603588581085205 + }, + { + "auxiliary_loss_clip": 0.01113086, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.01978183, + "balance_loss_mlp": 1.04069591, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.6491790763512546, + "language_loss": 0.78826106, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80972517, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8064, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.02426863, + "balance_loss_mlp": 1.04178667, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.866783501124255, + "language_loss": 0.79383814, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81530446, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8065, + "time_per_iteration": 2.445235013961792 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03714252, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 2.505071872189949, + "language_loss": 0.76120496, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78258789, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 8066, + "time_per_iteration": 2.484790325164795 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.02022457, + "balance_loss_mlp": 1.04121971, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.8377201756800503, + "language_loss": 0.7205655, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74201524, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8067, + "time_per_iteration": 2.4876203536987305 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02231634, + "balance_loss_mlp": 1.04024172, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.0010459311949393, + "language_loss": 0.79434109, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81582052, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8068, + "time_per_iteration": 2.4537808895111084 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02171147, + "balance_loss_mlp": 1.0385673, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4177927500996443, + "language_loss": 0.8413924, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86282146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8069, + "time_per_iteration": 2.4553275108337402 + }, + { + "auxiliary_loss_clip": 0.0110935, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02042496, + "balance_loss_mlp": 1.03913558, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.6522403411207847, + "language_loss": 0.77863526, + "learning_rate": 2.192678959687493e-06, + "loss": 0.8000586, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8070, + "time_per_iteration": 2.5032036304473877 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01985812, + "balance_loss_mlp": 1.0400399, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.1929202067055993, + "language_loss": 0.78031409, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80175334, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8071, + "time_per_iteration": 2.4315407276153564 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.01822925, + "balance_loss_mlp": 1.03733289, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.7778798626181176, + "language_loss": 0.72204757, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74345779, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71875, + "step": 8072, + "time_per_iteration": 2.510474920272827 + }, + { + "auxiliary_loss_clip": 0.01116993, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.02580357, + "balance_loss_mlp": 1.04254019, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.999761551965867, + "language_loss": 0.8779549, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89952314, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8073, + "time_per_iteration": 2.4295654296875 + }, + { + "auxiliary_loss_clip": 0.01106811, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.01805508, + "balance_loss_mlp": 1.03857493, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.702758380167849, + "language_loss": 0.60793108, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62931222, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8074, + "time_per_iteration": 2.641831636428833 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02540481, + "balance_loss_mlp": 1.03871894, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.6649133015556126, + "language_loss": 0.73151296, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75302958, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8075, + "time_per_iteration": 2.4624290466308594 + }, + { + "auxiliary_loss_clip": 0.01108632, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.02328563, + "balance_loss_mlp": 1.04028702, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.6285965401893183, + "language_loss": 0.82012558, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84156799, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 8076, + "time_per_iteration": 2.902468681335449 + }, + { + "auxiliary_loss_clip": 0.01112144, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.01899099, + "balance_loss_mlp": 1.0407958, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.793912725367087, + "language_loss": 0.86204815, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88350475, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7109375, + "step": 8077, + "time_per_iteration": 2.4470572471618652 + }, + { + "auxiliary_loss_clip": 0.01035955, + "auxiliary_loss_mlp": 0.01003512, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.01168394, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9017192941717106, + "language_loss": 0.58489066, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60528529, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.24316406, + "step": 8078, + "time_per_iteration": 3.061302661895752 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.01750946, + "balance_loss_mlp": 1.04146993, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.8290534457206422, + "language_loss": 0.72197151, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.7434293, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8079, + "time_per_iteration": 2.545018434524536 + }, + { + "auxiliary_loss_clip": 0.0111477, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01643038, + "balance_loss_mlp": 1.04235518, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.180592453343409, + "language_loss": 0.79515052, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81659681, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8080, + "time_per_iteration": 2.4793026447296143 + }, + { + "auxiliary_loss_clip": 0.01111199, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.01659858, + "balance_loss_mlp": 1.03938115, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.102088815710231, + "language_loss": 0.83866465, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86007756, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8081, + "time_per_iteration": 2.4615542888641357 + }, + { + "auxiliary_loss_clip": 0.0110941, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01766098, + "balance_loss_mlp": 1.03858256, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.4514708090647532, + "language_loss": 0.83281112, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85422719, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.70703125, + "step": 8082, + "time_per_iteration": 2.506359100341797 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01930749, + "balance_loss_mlp": 1.04239488, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.0513098734750153, + "language_loss": 0.87210095, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89353603, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8083, + "time_per_iteration": 2.4269142150878906 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.0264957, + "balance_loss_mlp": 1.03958535, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6599209376706838, + "language_loss": 0.8107174, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83220273, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 8084, + "time_per_iteration": 2.451949119567871 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.02148795, + "balance_loss_mlp": 1.04034543, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 2.346430029405153, + "language_loss": 0.68347323, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70495236, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8085, + "time_per_iteration": 2.499215841293335 + }, + { + "auxiliary_loss_clip": 0.0111142, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.0276444, + "balance_loss_mlp": 1.04064536, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.46412171762657, + "language_loss": 0.77375883, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79528093, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8086, + "time_per_iteration": 2.541616678237915 + }, + { + "auxiliary_loss_clip": 0.01111956, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.01655173, + "balance_loss_mlp": 1.04059958, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 1.9494281519542558, + "language_loss": 0.69733107, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71874988, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8087, + "time_per_iteration": 2.5694613456726074 + }, + { + "auxiliary_loss_clip": 0.01115057, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.02278614, + "balance_loss_mlp": 1.03913963, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.610275852133116, + "language_loss": 0.72411895, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.7456407, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8088, + "time_per_iteration": 2.5770511627197266 + }, + { + "auxiliary_loss_clip": 0.01111259, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04033983, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.6468852838011347, + "language_loss": 0.7557345, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77722251, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8089, + "time_per_iteration": 2.4625489711761475 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.0165205, + "balance_loss_mlp": 1.04078937, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.5811587339913937, + "language_loss": 0.83939755, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86083972, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8090, + "time_per_iteration": 2.500293731689453 + }, + { + "auxiliary_loss_clip": 0.01107626, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.02016521, + "balance_loss_mlp": 1.03945088, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.6075799019512609, + "language_loss": 0.76256877, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78398097, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 8091, + "time_per_iteration": 2.465998411178589 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.01714182, + "balance_loss_mlp": 1.03904068, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.4690121920213544, + "language_loss": 0.80391169, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82532316, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8092, + "time_per_iteration": 2.509016513824463 + }, + { + "auxiliary_loss_clip": 0.01113066, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.01679361, + "balance_loss_mlp": 1.040061, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4222056252501818, + "language_loss": 0.71696734, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73839879, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8093, + "time_per_iteration": 2.47951078414917 + }, + { + "auxiliary_loss_clip": 0.01109125, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03917289, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.5524869827771763, + "language_loss": 0.67529863, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69671166, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8094, + "time_per_iteration": 3.9874253273010254 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.01933527, + "balance_loss_mlp": 1.04218793, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 1.8480915023468016, + "language_loss": 0.66936231, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.69086242, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8095, + "time_per_iteration": 2.477593183517456 + }, + { + "auxiliary_loss_clip": 0.01112855, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.01847899, + "balance_loss_mlp": 1.04048705, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.265808316415622, + "language_loss": 0.78996563, + "learning_rate": 2.182597630229345e-06, + "loss": 0.8114239, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.72265625, + "step": 8096, + "time_per_iteration": 5.404834985733032 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01872253, + "balance_loss_mlp": 1.03737998, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.7396987354687747, + "language_loss": 0.67313123, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69453126, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8097, + "time_per_iteration": 2.450967788696289 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.02071154, + "balance_loss_mlp": 1.03922939, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.4534902730904964, + "language_loss": 0.71347374, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73490155, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8098, + "time_per_iteration": 2.4994144439697266 + }, + { + "auxiliary_loss_clip": 0.01116904, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02274871, + "balance_loss_mlp": 1.04109979, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.7962943745015671, + "language_loss": 0.66037756, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68191803, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8099, + "time_per_iteration": 2.624321222305298 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03698707, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.6079322443898665, + "language_loss": 0.66464651, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68605012, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8100, + "time_per_iteration": 2.52364182472229 + }, + { + "auxiliary_loss_clip": 0.01108299, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.01802635, + "balance_loss_mlp": 1.03990841, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3375285332360751, + "language_loss": 0.76606798, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78745818, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 8101, + "time_per_iteration": 2.5515174865722656 + }, + { + "auxiliary_loss_clip": 0.01037344, + "auxiliary_loss_mlp": 0.01004126, + "balance_loss_clip": 1.00279069, + "balance_loss_mlp": 1.01343942, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6857117323737989, + "language_loss": 0.52317238, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54358709, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 8102, + "time_per_iteration": 3.2370035648345947 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02098215, + "balance_loss_mlp": 1.03864419, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.066543814817077, + "language_loss": 0.73703957, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75847828, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8103, + "time_per_iteration": 2.401146650314331 + }, + { + "auxiliary_loss_clip": 0.01113681, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02845609, + "balance_loss_mlp": 1.04083562, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 2.0729106414348686, + "language_loss": 0.62816393, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.64972341, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8104, + "time_per_iteration": 2.489887237548828 + }, + { + "auxiliary_loss_clip": 0.01111014, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.01661348, + "balance_loss_mlp": 1.04093325, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 2.098514623938467, + "language_loss": 0.68962336, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71102965, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8105, + "time_per_iteration": 2.521994113922119 + }, + { + "auxiliary_loss_clip": 0.01106075, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.0371716, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.8440715600711883, + "language_loss": 0.73333305, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75468934, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8106, + "time_per_iteration": 2.471409797668457 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.01994157, + "balance_loss_mlp": 1.04300117, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.861183691551934, + "language_loss": 0.77122629, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.79273301, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 8107, + "time_per_iteration": 2.4802913665771484 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.01563621, + "balance_loss_mlp": 1.04061639, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.543990493512169, + "language_loss": 0.75148052, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77284884, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8108, + "time_per_iteration": 2.4680538177490234 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01803327, + "balance_loss_mlp": 1.04023099, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.75674444511609, + "language_loss": 0.73340857, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75479364, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8109, + "time_per_iteration": 2.4528889656066895 + }, + { + "auxiliary_loss_clip": 0.01108152, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0175966, + "balance_loss_mlp": 1.0391928, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.7970671112238439, + "language_loss": 0.78590822, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80729276, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8110, + "time_per_iteration": 2.4653971195220947 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.02730024, + "balance_loss_mlp": 1.04083896, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.8027530171186463, + "language_loss": 0.72216076, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74368215, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8111, + "time_per_iteration": 2.4242806434631348 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.02137125, + "balance_loss_mlp": 1.04143023, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5451794032223725, + "language_loss": 0.75719351, + "learning_rate": 2.17639139220597e-06, + "loss": 0.77864289, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 8112, + "time_per_iteration": 2.4681711196899414 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.04125154, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.5422638957013077, + "language_loss": 0.75012642, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77164471, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8113, + "time_per_iteration": 2.458070993423462 + }, + { + "auxiliary_loss_clip": 0.0103493, + "auxiliary_loss_mlp": 0.00999333, + "balance_loss_clip": 0.99799174, + "balance_loss_mlp": 1.01145339, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.779968435998717, + "language_loss": 0.48876739, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50911003, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.23535156, + "step": 8114, + "time_per_iteration": 2.964735507965088 + }, + { + "auxiliary_loss_clip": 0.01112827, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.02769804, + "balance_loss_mlp": 1.04015875, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.346675786458265, + "language_loss": 0.76713175, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78867507, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8115, + "time_per_iteration": 2.5008208751678467 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.02220368, + "balance_loss_mlp": 1.0430454, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.9741706409780697, + "language_loss": 0.72150338, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8116, + "time_per_iteration": 2.471170425415039 + }, + { + "auxiliary_loss_clip": 0.01108955, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.02506459, + "balance_loss_mlp": 1.03951752, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.626628974836948, + "language_loss": 0.63457322, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65604323, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 8117, + "time_per_iteration": 2.4408295154571533 + }, + { + "auxiliary_loss_clip": 0.01106242, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.02082098, + "balance_loss_mlp": 1.03648984, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.7937040821955612, + "language_loss": 0.79223609, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81363392, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8118, + "time_per_iteration": 2.4724843502044678 + }, + { + "auxiliary_loss_clip": 0.01111434, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.02359247, + "balance_loss_mlp": 1.03926289, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 2.8027989615224427, + "language_loss": 0.63472134, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65620571, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8119, + "time_per_iteration": 2.478968381881714 + }, + { + "auxiliary_loss_clip": 0.01111182, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.02006578, + "balance_loss_mlp": 1.04054463, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.9034604660173908, + "language_loss": 0.72397757, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74541688, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8120, + "time_per_iteration": 2.5204596519470215 + }, + { + "auxiliary_loss_clip": 0.01109957, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02122211, + "balance_loss_mlp": 1.03855026, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.5930525886491658, + "language_loss": 0.63636339, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65780938, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 8121, + "time_per_iteration": 2.5647690296173096 + }, + { + "auxiliary_loss_clip": 0.01113983, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02521062, + "balance_loss_mlp": 1.04131413, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.870740841609923, + "language_loss": 0.82433021, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84585893, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8122, + "time_per_iteration": 2.4753966331481934 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.02019167, + "balance_loss_mlp": 1.04063094, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 2.206764356510625, + "language_loss": 0.85308874, + "learning_rate": 2.172123606640866e-06, + "loss": 0.8745693, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 8123, + "time_per_iteration": 2.5124545097351074 + }, + { + "auxiliary_loss_clip": 0.01111875, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.02075016, + "balance_loss_mlp": 1.03892267, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 2.940858316224804, + "language_loss": 0.85766631, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87911713, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 8124, + "time_per_iteration": 2.5632708072662354 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02157593, + "balance_loss_mlp": 1.04022837, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.663608167377633, + "language_loss": 0.79223049, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81370318, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8125, + "time_per_iteration": 2.4487855434417725 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.03887916, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.7973571608225063, + "language_loss": 0.72273839, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74416542, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8126, + "time_per_iteration": 2.437833309173584 + }, + { + "auxiliary_loss_clip": 0.01111271, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.0383321, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6636646152839605, + "language_loss": 0.68598747, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70743197, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8127, + "time_per_iteration": 2.593252420425415 + }, + { + "auxiliary_loss_clip": 0.01111716, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.02271378, + "balance_loss_mlp": 1.03772545, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 2.237259843406747, + "language_loss": 0.76160932, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78308904, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 8128, + "time_per_iteration": 2.4540648460388184 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.02170467, + "balance_loss_mlp": 1.03979826, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.8007841393953645, + "language_loss": 0.75974828, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78120208, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 8129, + "time_per_iteration": 2.4460771083831787 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01829541, + "balance_loss_mlp": 1.03739452, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.2474332482435684, + "language_loss": 0.64869368, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67009449, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8130, + "time_per_iteration": 2.4403305053710938 + }, + { + "auxiliary_loss_clip": 0.01104742, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.03528643, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 2.48357292354413, + "language_loss": 0.71885133, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74023575, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8131, + "time_per_iteration": 2.4774324893951416 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.02447748, + "balance_loss_mlp": 1.04011512, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.6326145167913504, + "language_loss": 0.69524658, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.7167576, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8132, + "time_per_iteration": 2.5888383388519287 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.03793633, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.374551885233197, + "language_loss": 0.70177239, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72313869, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8133, + "time_per_iteration": 2.5105628967285156 + }, + { + "auxiliary_loss_clip": 0.01108745, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086735, + "balance_loss_mlp": 1.03843439, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.701581568458854, + "language_loss": 0.70707083, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72849363, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8134, + "time_per_iteration": 2.4894602298736572 + }, + { + "auxiliary_loss_clip": 0.01114154, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.02214789, + "balance_loss_mlp": 1.04088461, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.0967568848691105, + "language_loss": 0.80384946, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82534719, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8135, + "time_per_iteration": 2.453099489212036 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.01895332, + "balance_loss_mlp": 1.03636014, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.7196560423786724, + "language_loss": 0.74302435, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.7643888, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8136, + "time_per_iteration": 3.877336025238037 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01817274, + "balance_loss_mlp": 1.03903699, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 2.212302237726986, + "language_loss": 0.73165262, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8137, + "time_per_iteration": 5.387110471725464 + }, + { + "auxiliary_loss_clip": 0.01109302, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01846206, + "balance_loss_mlp": 1.03721762, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 1.8416541749331667, + "language_loss": 0.74448442, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.76589316, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8138, + "time_per_iteration": 3.9045798778533936 + }, + { + "auxiliary_loss_clip": 0.01108399, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.02114367, + "balance_loss_mlp": 1.039101, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.5284975125240874, + "language_loss": 0.74403191, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76545048, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8139, + "time_per_iteration": 2.4808132648468018 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.03792441, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.7092479760411836, + "language_loss": 0.61867124, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64010978, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8140, + "time_per_iteration": 2.4676973819732666 + }, + { + "auxiliary_loss_clip": 0.01110437, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.02696478, + "balance_loss_mlp": 1.03864169, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 11.553990271771063, + "language_loss": 0.82090259, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84241331, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8141, + "time_per_iteration": 2.4469456672668457 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.02006459, + "balance_loss_mlp": 1.04014516, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.575169950356119, + "language_loss": 0.72470534, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74617255, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8142, + "time_per_iteration": 2.5793039798736572 + }, + { + "auxiliary_loss_clip": 0.01105636, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.02019358, + "balance_loss_mlp": 1.03645492, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.7422772510583273, + "language_loss": 0.66720849, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.68858832, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 8143, + "time_per_iteration": 2.529869556427002 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01746714, + "balance_loss_mlp": 1.03620982, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.6744857165672533, + "language_loss": 0.75076014, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77209973, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 8144, + "time_per_iteration": 2.5917482376098633 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.0158155, + "balance_loss_mlp": 1.0373745, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.7401505251342857, + "language_loss": 0.75606745, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77742517, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8145, + "time_per_iteration": 2.4766342639923096 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.0224849, + "balance_loss_mlp": 1.03849018, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.7624340526507305, + "language_loss": 0.79901314, + "learning_rate": 2.163197525984761e-06, + "loss": 0.820476, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8146, + "time_per_iteration": 2.461480140686035 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.01866233, + "balance_loss_mlp": 1.03510666, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.6218674355963285, + "language_loss": 0.74327677, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76462203, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8147, + "time_per_iteration": 2.4981865882873535 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01614654, + "balance_loss_mlp": 1.0397613, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.4473724892456126, + "language_loss": 0.83147472, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8528533, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8148, + "time_per_iteration": 2.4251036643981934 + }, + { + "auxiliary_loss_clip": 0.01104505, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.01701021, + "balance_loss_mlp": 1.03808641, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.6244569398372493, + "language_loss": 0.73749536, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75881934, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 8149, + "time_per_iteration": 2.4356369972229004 + }, + { + "auxiliary_loss_clip": 0.01112401, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01667118, + "balance_loss_mlp": 1.03944612, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 2.7362049095417516, + "language_loss": 0.75515091, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.77657855, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8150, + "time_per_iteration": 2.4834423065185547 + }, + { + "auxiliary_loss_clip": 0.01111432, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.0169735, + "balance_loss_mlp": 1.04018414, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.027803048960678, + "language_loss": 0.72891176, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75032675, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8151, + "time_per_iteration": 2.448648691177368 + }, + { + "auxiliary_loss_clip": 0.01033992, + "auxiliary_loss_mlp": 0.01002772, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.01003349, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8338756787223442, + "language_loss": 0.54366148, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.5640291, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.24023438, + "step": 8152, + "time_per_iteration": 3.0414862632751465 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.03726649, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8071588573161568, + "language_loss": 0.61403525, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.6354419, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8153, + "time_per_iteration": 2.6923155784606934 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02371526, + "balance_loss_mlp": 1.03589535, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.4691031789751592, + "language_loss": 0.76673591, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78815919, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 8154, + "time_per_iteration": 2.490353584289551 + }, + { + "auxiliary_loss_clip": 0.01034079, + "auxiliary_loss_mlp": 0.01008709, + "balance_loss_clip": 1.00767767, + "balance_loss_mlp": 1.01043367, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9669855284605297, + "language_loss": 0.67019808, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69062597, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.23632812, + "step": 8155, + "time_per_iteration": 3.1443841457366943 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01594031, + "balance_loss_mlp": 1.03842843, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 2.3165784732113965, + "language_loss": 0.76883155, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79019058, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 8156, + "time_per_iteration": 2.4431064128875732 + }, + { + "auxiliary_loss_clip": 0.01107345, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01737309, + "balance_loss_mlp": 1.03692055, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.1340841853754084, + "language_loss": 0.83395588, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85532445, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 8157, + "time_per_iteration": 2.478027582168579 + }, + { + "auxiliary_loss_clip": 0.01108499, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.01971316, + "balance_loss_mlp": 1.03797531, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.799550006100146, + "language_loss": 0.79893947, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8203451, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8158, + "time_per_iteration": 2.453590154647827 + }, + { + "auxiliary_loss_clip": 0.0111001, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.01731563, + "balance_loss_mlp": 1.03768444, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 2.6065217447562015, + "language_loss": 0.69529265, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71669614, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 8159, + "time_per_iteration": 2.531371593475342 + }, + { + "auxiliary_loss_clip": 0.01106025, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.01975548, + "balance_loss_mlp": 1.03706563, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 1.8340548446534848, + "language_loss": 0.73084885, + "learning_rate": 2.157762645250854e-06, + "loss": 0.7522344, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8160, + "time_per_iteration": 2.4504506587982178 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.02510881, + "balance_loss_mlp": 1.03650105, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.9580885379656197, + "language_loss": 0.71372044, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73520112, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8161, + "time_per_iteration": 2.4428305625915527 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03813958, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.8633116916333885, + "language_loss": 0.67950338, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70090652, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8162, + "time_per_iteration": 2.478804349899292 + }, + { + "auxiliary_loss_clip": 0.01110496, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.01964319, + "balance_loss_mlp": 1.03701675, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.7117590070355053, + "language_loss": 0.63264233, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65408272, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8163, + "time_per_iteration": 2.474439859390259 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.03680897, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 5.481003364843308, + "language_loss": 0.76853907, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.78988826, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 8164, + "time_per_iteration": 2.4202303886413574 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.01487494, + "balance_loss_mlp": 1.03511751, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.943812351193686, + "language_loss": 0.76509839, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78644335, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8165, + "time_per_iteration": 2.4495608806610107 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.01949036, + "balance_loss_mlp": 1.03724587, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5511500992998777, + "language_loss": 0.77538848, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79677534, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8166, + "time_per_iteration": 2.431838274002075 + }, + { + "auxiliary_loss_clip": 0.01035489, + "auxiliary_loss_mlp": 0.00999269, + "balance_loss_clip": 0.99796408, + "balance_loss_mlp": 1.01166928, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7997768420675069, + "language_loss": 0.54261303, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56296062, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23828125, + "step": 8167, + "time_per_iteration": 3.1150460243225098 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.0184176, + "balance_loss_mlp": 1.03619838, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.5337625100343173, + "language_loss": 0.85566431, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.8770228, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8168, + "time_per_iteration": 2.4139063358306885 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01706409, + "balance_loss_mlp": 1.03805184, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6015963996367162, + "language_loss": 0.73052484, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75186759, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8169, + "time_per_iteration": 2.45638370513916 + }, + { + "auxiliary_loss_clip": 0.01104357, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01673138, + "balance_loss_mlp": 1.03472865, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.6971136818289634, + "language_loss": 0.78070778, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80203593, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8170, + "time_per_iteration": 2.4314279556274414 + }, + { + "auxiliary_loss_clip": 0.01108102, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.02121162, + "balance_loss_mlp": 1.03809822, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 3.6606474387116363, + "language_loss": 0.75769788, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.77911079, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8171, + "time_per_iteration": 2.4608027935028076 + }, + { + "auxiliary_loss_clip": 0.01109941, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.03800821, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 2.121204048765929, + "language_loss": 0.81676465, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83820748, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 8172, + "time_per_iteration": 2.44052791595459 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.00996712, + "balance_loss_clip": 0.99551356, + "balance_loss_mlp": 1.0111028, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6914312886696967, + "language_loss": 0.53323382, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55354571, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.234375, + "step": 8173, + "time_per_iteration": 3.0708565711975098 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.01985621, + "balance_loss_mlp": 1.0374558, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.811286975884668, + "language_loss": 0.62879664, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65021324, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8174, + "time_per_iteration": 2.4336249828338623 + }, + { + "auxiliary_loss_clip": 0.01106845, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.02208483, + "balance_loss_mlp": 1.03750002, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.779537870111139, + "language_loss": 0.69111979, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71253598, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8175, + "time_per_iteration": 2.4554460048675537 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.03808653, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.5246237839161791, + "language_loss": 0.74398279, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76537168, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8176, + "time_per_iteration": 2.4888904094696045 + }, + { + "auxiliary_loss_clip": 0.01107276, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.0297358, + "balance_loss_mlp": 1.03694725, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.7568126082203932, + "language_loss": 0.69846892, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.71996421, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8177, + "time_per_iteration": 3.8634564876556396 + }, + { + "auxiliary_loss_clip": 0.01035127, + "auxiliary_loss_mlp": 0.00999453, + "balance_loss_clip": 0.99834442, + "balance_loss_mlp": 1.01137829, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6749706589091774, + "language_loss": 0.46188164, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48222741, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23828125, + "step": 8178, + "time_per_iteration": 3.0891001224517822 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02311015, + "balance_loss_mlp": 1.03835034, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.813634772504209, + "language_loss": 0.66008747, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68155658, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8179, + "time_per_iteration": 5.296982049942017 + }, + { + "auxiliary_loss_clip": 0.01111217, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.021873, + "balance_loss_mlp": 1.03712761, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8426949121819989, + "language_loss": 0.70288503, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72435522, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 8180, + "time_per_iteration": 3.9257376194000244 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.02467656, + "balance_loss_mlp": 1.03577447, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.68068912028803, + "language_loss": 0.83982801, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86125004, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8181, + "time_per_iteration": 2.464665174484253 + }, + { + "auxiliary_loss_clip": 0.01104535, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.03746653, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.0240623883749724, + "language_loss": 0.72286201, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74421656, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8182, + "time_per_iteration": 2.5358242988586426 + }, + { + "auxiliary_loss_clip": 0.01108049, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.01904118, + "balance_loss_mlp": 1.03814411, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.2040850478726357, + "language_loss": 0.72828728, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74968582, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8183, + "time_per_iteration": 2.484051465988159 + }, + { + "auxiliary_loss_clip": 0.01110545, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.0178628, + "balance_loss_mlp": 1.03733599, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 1.6157316160481727, + "language_loss": 0.77338606, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79480493, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8184, + "time_per_iteration": 2.4630794525146484 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.03868532, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.7266312313882144, + "language_loss": 0.71020061, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73163593, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8185, + "time_per_iteration": 2.4472904205322266 + }, + { + "auxiliary_loss_clip": 0.01109756, + "auxiliary_loss_mlp": 0.01037838, + "balance_loss_clip": 1.02348495, + "balance_loss_mlp": 1.03818357, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 2.357724154899576, + "language_loss": 0.75007719, + "learning_rate": 2.147666215108831e-06, + "loss": 0.7715531, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 8186, + "time_per_iteration": 2.497887372970581 + }, + { + "auxiliary_loss_clip": 0.01108113, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.0240649, + "balance_loss_mlp": 1.03769946, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.2731376810200947, + "language_loss": 0.67426246, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69571328, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8187, + "time_per_iteration": 2.4402377605438232 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.02205503, + "balance_loss_mlp": 1.03659558, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.3838016666023416, + "language_loss": 0.66984355, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69125152, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8188, + "time_per_iteration": 2.4889986515045166 + }, + { + "auxiliary_loss_clip": 0.01108628, + "auxiliary_loss_mlp": 0.01027775, + "balance_loss_clip": 1.01627779, + "balance_loss_mlp": 1.03854966, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.5428848144341532, + "language_loss": 0.7457763, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76714027, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8189, + "time_per_iteration": 2.4837827682495117 + }, + { + "auxiliary_loss_clip": 0.011062, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.01975584, + "balance_loss_mlp": 1.03744173, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5888967888129601, + "language_loss": 0.64360684, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66499019, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8190, + "time_per_iteration": 2.606388807296753 + }, + { + "auxiliary_loss_clip": 0.01107034, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.01528418, + "balance_loss_mlp": 1.0383538, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.9368790872615624, + "language_loss": 0.71231604, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73366261, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8191, + "time_per_iteration": 2.4383578300476074 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.02162957, + "balance_loss_mlp": 1.03718042, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5667911589112589, + "language_loss": 0.71698356, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.7383846, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8192, + "time_per_iteration": 2.6127231121063232 + }, + { + "auxiliary_loss_clip": 0.01033253, + "auxiliary_loss_mlp": 0.01011533, + "balance_loss_clip": 1.01047826, + "balance_loss_mlp": 1.00980878, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7610920789142134, + "language_loss": 0.52138889, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54183674, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.234375, + "step": 8193, + "time_per_iteration": 3.1151235103607178 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.02470672, + "balance_loss_mlp": 1.03862, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.5012892842908303, + "language_loss": 0.77071059, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79214686, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 8194, + "time_per_iteration": 2.4766407012939453 + }, + { + "auxiliary_loss_clip": 0.01104661, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.01783228, + "balance_loss_mlp": 1.03554666, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.9786600447906189, + "language_loss": 0.70556259, + "learning_rate": 2.144170401915341e-06, + "loss": 0.7269032, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 8195, + "time_per_iteration": 2.489412784576416 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01537156, + "balance_loss_mlp": 1.0380609, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.8494849345903903, + "language_loss": 0.81095743, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83231419, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8196, + "time_per_iteration": 2.5489988327026367 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.02033019, + "balance_loss_mlp": 1.03709757, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.848981865854384, + "language_loss": 0.7100687, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.73149174, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8197, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03815627, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.7362069513061655, + "language_loss": 0.84122622, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86259645, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8198, + "time_per_iteration": 2.4596786499023438 + }, + { + "auxiliary_loss_clip": 0.01110423, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.02139831, + "balance_loss_mlp": 1.03913713, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.767623263247313, + "language_loss": 0.76214266, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78359395, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8199, + "time_per_iteration": 2.413482189178467 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.02215028, + "balance_loss_mlp": 1.03712904, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.555242231339172, + "language_loss": 0.59918249, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62063873, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8200, + "time_per_iteration": 2.515371561050415 + }, + { + "auxiliary_loss_clip": 0.01101467, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02128601, + "balance_loss_mlp": 1.03560054, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4972351372180894, + "language_loss": 0.78781515, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.80916464, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 8201, + "time_per_iteration": 2.4688665866851807 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.01858091, + "balance_loss_mlp": 1.03761029, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.1515546014570766, + "language_loss": 0.67352241, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69496673, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8202, + "time_per_iteration": 2.6021947860717773 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01781058, + "balance_loss_mlp": 1.03682148, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 3.4273755266911845, + "language_loss": 0.75192142, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77328843, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 8203, + "time_per_iteration": 2.501173496246338 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.0214237, + "balance_loss_mlp": 1.03780818, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.0656815740777152, + "language_loss": 0.80908394, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.83049649, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8204, + "time_per_iteration": 2.481666088104248 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.02180493, + "balance_loss_mlp": 1.03788805, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 2.2280647806743183, + "language_loss": 0.65550953, + "learning_rate": 2.140285646139455e-06, + "loss": 0.67689598, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 8205, + "time_per_iteration": 2.439408302307129 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.02083468, + "balance_loss_mlp": 1.03837705, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 1.7727903919462147, + "language_loss": 0.67009246, + "learning_rate": 2.139897141060744e-06, + "loss": 0.69156778, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 8206, + "time_per_iteration": 2.4607954025268555 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.01473176, + "balance_loss_mlp": 1.03630567, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.822649710507408, + "language_loss": 0.76363301, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78496289, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 8207, + "time_per_iteration": 2.508553981781006 + }, + { + "auxiliary_loss_clip": 0.01109244, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.03869963, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.308112072386131, + "language_loss": 0.59984541, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62126362, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8208, + "time_per_iteration": 2.505990982055664 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.01622033, + "balance_loss_mlp": 1.03816974, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 2.3772506823576407, + "language_loss": 0.7851491, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80653256, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8209, + "time_per_iteration": 2.4622652530670166 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.03630066, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.7984719462813816, + "language_loss": 0.78806269, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80942488, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8210, + "time_per_iteration": 2.4884698390960693 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.01888382, + "balance_loss_mlp": 1.0381912, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.2650712316686903, + "language_loss": 0.81229484, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83373135, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8211, + "time_per_iteration": 2.4839043617248535 + }, + { + "auxiliary_loss_clip": 0.01109974, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.03911519, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.6136684102444665, + "language_loss": 0.91496241, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93642217, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8212, + "time_per_iteration": 2.5103862285614014 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.01925647, + "balance_loss_mlp": 1.036484, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.7787072133843917, + "language_loss": 0.64901662, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.670403, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8213, + "time_per_iteration": 2.460123300552368 + }, + { + "auxiliary_loss_clip": 0.01106125, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.01356125, + "balance_loss_mlp": 1.03668904, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.9389339120527038, + "language_loss": 0.75199962, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77333331, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69140625, + "step": 8214, + "time_per_iteration": 2.5719900131225586 + }, + { + "auxiliary_loss_clip": 0.01109359, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.02212512, + "balance_loss_mlp": 1.03959298, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.828808325177945, + "language_loss": 0.84395385, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86540014, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8215, + "time_per_iteration": 2.468804121017456 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.01656199, + "balance_loss_mlp": 1.03478694, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.6051587100805058, + "language_loss": 0.82859147, + "learning_rate": 2.136011800934292e-06, + "loss": 0.84988439, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 8216, + "time_per_iteration": 2.5819287300109863 + }, + { + "auxiliary_loss_clip": 0.01107134, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.01918006, + "balance_loss_mlp": 1.03821325, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.4383830441547378, + "language_loss": 0.74774921, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76913321, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8217, + "time_per_iteration": 2.4628379344940186 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.01928544, + "balance_loss_mlp": 1.03777707, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.733886360732455, + "language_loss": 0.78829861, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80966723, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 8218, + "time_per_iteration": 2.4809412956237793 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.02269292, + "balance_loss_mlp": 1.03510332, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.0240627965271187, + "language_loss": 0.76301086, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78438151, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 8219, + "time_per_iteration": 3.8202009201049805 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.03764367, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6690505128843895, + "language_loss": 0.6190055, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64042592, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8220, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01106287, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.01656425, + "balance_loss_mlp": 1.03672814, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.7319378421104112, + "language_loss": 0.72381485, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74517179, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8221, + "time_per_iteration": 5.506774187088013 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.02521193, + "balance_loss_mlp": 1.04006767, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.681203667545881, + "language_loss": 0.79131603, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81275266, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 8222, + "time_per_iteration": 2.491175889968872 + }, + { + "auxiliary_loss_clip": 0.01108448, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.02147555, + "balance_loss_mlp": 1.03941715, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.3506903054927015, + "language_loss": 0.73205507, + "learning_rate": 2.133291755093088e-06, + "loss": 0.75348878, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69140625, + "step": 8223, + "time_per_iteration": 2.4359662532806396 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.02264762, + "balance_loss_mlp": 1.03850269, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.7533498543998463, + "language_loss": 0.75144434, + "learning_rate": 2.132903156780144e-06, + "loss": 0.7729032, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 8224, + "time_per_iteration": 2.5716288089752197 + }, + { + "auxiliary_loss_clip": 0.01111376, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01807868, + "balance_loss_mlp": 1.04080439, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.086998261136206, + "language_loss": 0.63982892, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66124696, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8225, + "time_per_iteration": 2.524048089981079 + }, + { + "auxiliary_loss_clip": 0.01107484, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.0206579, + "balance_loss_mlp": 1.03766608, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 1.839126557537864, + "language_loss": 0.76359057, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78499651, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8226, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02578914, + "balance_loss_mlp": 1.03735518, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.6377261486682646, + "language_loss": 0.71156305, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73305476, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8227, + "time_per_iteration": 2.4763920307159424 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.03914213, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.614424212368193, + "language_loss": 0.71484196, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73631173, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8228, + "time_per_iteration": 2.550083637237549 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.01829386, + "balance_loss_mlp": 1.03837276, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.6200219454444607, + "language_loss": 0.83788311, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85925281, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8229, + "time_per_iteration": 2.474684238433838 + }, + { + "auxiliary_loss_clip": 0.01108289, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.02103567, + "balance_loss_mlp": 1.03685689, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.055489394198818, + "language_loss": 0.75105131, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.77248526, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8230, + "time_per_iteration": 2.506950616836548 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.01497638, + "balance_loss_mlp": 1.03868175, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.703005059233118, + "language_loss": 0.79713035, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.8184967, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8231, + "time_per_iteration": 2.4176137447357178 + }, + { + "auxiliary_loss_clip": 0.01035427, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.01191425, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7419788553124401, + "language_loss": 0.60237485, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62275773, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 8232, + "time_per_iteration": 3.183783531188965 + }, + { + "auxiliary_loss_clip": 0.0111307, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.02119923, + "balance_loss_mlp": 1.03889871, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.7147216218758814, + "language_loss": 0.69257128, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71405244, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 8233, + "time_per_iteration": 2.477755546569824 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.02111769, + "balance_loss_mlp": 1.03714275, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 3.246275947254348, + "language_loss": 0.6678468, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68926585, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.6953125, + "step": 8234, + "time_per_iteration": 2.5594117641448975 + }, + { + "auxiliary_loss_clip": 0.0103478, + "auxiliary_loss_mlp": 0.01003988, + "balance_loss_clip": 1.00288522, + "balance_loss_mlp": 1.01140106, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8288840425421409, + "language_loss": 0.57987183, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60025948, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.234375, + "step": 8235, + "time_per_iteration": 3.0041370391845703 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02345991, + "balance_loss_mlp": 1.03770208, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.4917768542550827, + "language_loss": 0.76824737, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.78971112, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8236, + "time_per_iteration": 2.498105525970459 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.01948881, + "balance_loss_mlp": 1.03860247, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.8006519774313887, + "language_loss": 0.72554326, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74694312, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8237, + "time_per_iteration": 2.487849473953247 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.02024627, + "balance_loss_mlp": 1.03722131, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.8061825502363815, + "language_loss": 0.75687563, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77825987, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8238, + "time_per_iteration": 2.4926116466522217 + }, + { + "auxiliary_loss_clip": 0.01110283, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02057362, + "balance_loss_mlp": 1.03765702, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.197202607879525, + "language_loss": 0.73434591, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.75579149, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8239, + "time_per_iteration": 2.4181203842163086 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.02266932, + "balance_loss_mlp": 1.03704619, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.4131176994917936, + "language_loss": 0.78344893, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80492562, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.73046875, + "step": 8240, + "time_per_iteration": 2.479642391204834 + }, + { + "auxiliary_loss_clip": 0.01104608, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.02865601, + "balance_loss_mlp": 1.03746533, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.0234307188816993, + "language_loss": 0.85579056, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87724495, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8241, + "time_per_iteration": 2.4081263542175293 + }, + { + "auxiliary_loss_clip": 0.01106442, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02337933, + "balance_loss_mlp": 1.03813624, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.761079127200854, + "language_loss": 0.77041149, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79183173, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8242, + "time_per_iteration": 2.4439215660095215 + }, + { + "auxiliary_loss_clip": 0.01106589, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821673, + "balance_loss_mlp": 1.03676701, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.7216813067847012, + "language_loss": 0.67493725, + "learning_rate": 2.125518848090833e-06, + "loss": 0.6963132, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8243, + "time_per_iteration": 2.4888081550598145 + }, + { + "auxiliary_loss_clip": 0.01107757, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01805878, + "balance_loss_mlp": 1.03910422, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.8355775234908949, + "language_loss": 0.68218768, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70357001, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8244, + "time_per_iteration": 2.481220245361328 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02201915, + "balance_loss_mlp": 1.03828287, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8414695050792438, + "language_loss": 0.74998277, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77143466, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8245, + "time_per_iteration": 2.459244728088379 + }, + { + "auxiliary_loss_clip": 0.01105994, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01628923, + "balance_loss_mlp": 1.03797877, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 3.047248940663427, + "language_loss": 0.81496358, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83631527, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 8246, + "time_per_iteration": 2.54664945602417 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03858495, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.7095262667552558, + "language_loss": 0.83750397, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85899985, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8247, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.01897812, + "balance_loss_mlp": 1.04011726, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.0177325188605018, + "language_loss": 0.83758432, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85900903, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 8248, + "time_per_iteration": 2.490619659423828 + }, + { + "auxiliary_loss_clip": 0.01109734, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.01518941, + "balance_loss_mlp": 1.03800774, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.055191909263014, + "language_loss": 0.73715985, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75853992, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8249, + "time_per_iteration": 2.5232534408569336 + }, + { + "auxiliary_loss_clip": 0.0111234, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.02504992, + "balance_loss_mlp": 1.04018188, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.9063816639589337, + "language_loss": 0.76176995, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78327698, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8250, + "time_per_iteration": 2.5368192195892334 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.02170718, + "balance_loss_mlp": 1.03792036, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.615677709430237, + "language_loss": 0.69986647, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72129565, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8251, + "time_per_iteration": 2.4543070793151855 + }, + { + "auxiliary_loss_clip": 0.01108023, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.01586699, + "balance_loss_mlp": 1.03890181, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.8749041446582064, + "language_loss": 0.79864365, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82000297, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8252, + "time_per_iteration": 2.4386792182922363 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01384854, + "balance_loss_mlp": 1.03821409, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7863838823967775, + "language_loss": 0.80688357, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.82825357, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.70703125, + "step": 8253, + "time_per_iteration": 2.440727710723877 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01863384, + "balance_loss_mlp": 1.03654194, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.548882190492268, + "language_loss": 0.67088544, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69224173, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 8254, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.011067, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.02257323, + "balance_loss_mlp": 1.03522658, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.5646536445016186, + "language_loss": 0.73859739, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76002055, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8255, + "time_per_iteration": 2.478703498840332 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.02208281, + "balance_loss_mlp": 1.0362165, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.8563521426834817, + "language_loss": 0.81378329, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.8351903, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8256, + "time_per_iteration": 2.4312291145324707 + }, + { + "auxiliary_loss_clip": 0.01105024, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.01246178, + "balance_loss_mlp": 1.03679466, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.8572652078491616, + "language_loss": 0.80710369, + "learning_rate": 2.120076673368901e-06, + "loss": 0.82840347, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8257, + "time_per_iteration": 2.4589884281158447 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.02173841, + "balance_loss_mlp": 1.03759003, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 2.788575980623821, + "language_loss": 0.66533971, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68681228, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 8258, + "time_per_iteration": 2.477653741836548 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01674283, + "balance_loss_mlp": 1.03566313, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 2.207120440649978, + "language_loss": 0.77672231, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79804647, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8259, + "time_per_iteration": 2.482516050338745 + }, + { + "auxiliary_loss_clip": 0.01107983, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.01802468, + "balance_loss_mlp": 1.03903294, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.536511866358609, + "language_loss": 0.78612608, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80751413, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8260, + "time_per_iteration": 4.0255560874938965 + }, + { + "auxiliary_loss_clip": 0.0110786, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01785684, + "balance_loss_mlp": 1.03662324, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 4.674193904345997, + "language_loss": 0.76227403, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78365964, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8261, + "time_per_iteration": 2.537996530532837 + }, + { + "auxiliary_loss_clip": 0.01104467, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01622844, + "balance_loss_mlp": 1.03667367, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.9998040798137362, + "language_loss": 0.89328134, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91460943, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8262, + "time_per_iteration": 5.405071020126343 + }, + { + "auxiliary_loss_clip": 0.01104636, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.0172143, + "balance_loss_mlp": 1.03765512, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4087924984120455, + "language_loss": 0.73918653, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76052761, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8263, + "time_per_iteration": 3.9610228538513184 + }, + { + "auxiliary_loss_clip": 0.01112691, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.0196991, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.641620630884259, + "language_loss": 0.69445115, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71591461, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71875, + "step": 8264, + "time_per_iteration": 2.4799907207489014 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01585722, + "balance_loss_mlp": 1.03470981, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.3808235907294704, + "language_loss": 0.64915001, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67049909, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8265, + "time_per_iteration": 2.491708517074585 + }, + { + "auxiliary_loss_clip": 0.01034788, + "auxiliary_loss_mlp": 0.01001781, + "balance_loss_clip": 1.00064886, + "balance_loss_mlp": 1.01169205, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8684712318419048, + "language_loss": 0.53446817, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55483389, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23144531, + "step": 8266, + "time_per_iteration": 3.1343002319335938 + }, + { + "auxiliary_loss_clip": 0.01104137, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01567531, + "balance_loss_mlp": 1.03706813, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 3.469499482915289, + "language_loss": 0.79616332, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81748462, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8267, + "time_per_iteration": 2.5316126346588135 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.01856148, + "balance_loss_mlp": 1.03869104, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.5132671844419434, + "language_loss": 0.74805677, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76947474, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8268, + "time_per_iteration": 2.5102896690368652 + }, + { + "auxiliary_loss_clip": 0.0110689, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.01998329, + "balance_loss_mlp": 1.0366255, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.9572065929893177, + "language_loss": 0.67818397, + "learning_rate": 2.115411240328073e-06, + "loss": 0.6995914, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8269, + "time_per_iteration": 2.7194817066192627 + }, + { + "auxiliary_loss_clip": 0.0110431, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01968217, + "balance_loss_mlp": 1.03744197, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.6139896668987463, + "language_loss": 0.85450721, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87587237, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 8270, + "time_per_iteration": 2.4423561096191406 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01834702, + "balance_loss_mlp": 1.03857064, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.6811398863814482, + "language_loss": 0.71087623, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73225504, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.69921875, + "step": 8271, + "time_per_iteration": 2.54892635345459 + }, + { + "auxiliary_loss_clip": 0.01109407, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.03880143, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.4557340389451365, + "language_loss": 0.7848624, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80625331, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8272, + "time_per_iteration": 2.462470054626465 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.02275074, + "balance_loss_mlp": 1.03950167, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.5057831430835686, + "language_loss": 0.66278791, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68423879, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8273, + "time_per_iteration": 2.6735026836395264 + }, + { + "auxiliary_loss_clip": 0.01109746, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.01891851, + "balance_loss_mlp": 1.03968048, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.871691944459235, + "language_loss": 0.77977264, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80118477, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8274, + "time_per_iteration": 2.462465763092041 + }, + { + "auxiliary_loss_clip": 0.01110748, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.01864374, + "balance_loss_mlp": 1.03865933, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 2.0388244744713724, + "language_loss": 0.75829184, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77971983, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 8275, + "time_per_iteration": 2.6034398078918457 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01974416, + "balance_loss_mlp": 1.03761268, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9341151140441402, + "language_loss": 0.8392635, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.86071479, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8276, + "time_per_iteration": 2.435999870300293 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01664937, + "balance_loss_mlp": 1.03633988, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3535075156355831, + "language_loss": 0.70188868, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72319949, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 8277, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.01952052, + "balance_loss_mlp": 1.03669858, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.773647946812319, + "language_loss": 0.82609779, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84747648, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8278, + "time_per_iteration": 2.4459898471832275 + }, + { + "auxiliary_loss_clip": 0.01108155, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.01992559, + "balance_loss_mlp": 1.03671384, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 1.8017237706358624, + "language_loss": 0.6784246, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69983023, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8279, + "time_per_iteration": 2.4793283939361572 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.0204277, + "balance_loss_mlp": 1.03561902, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 1.9740212049853438, + "language_loss": 0.70469928, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72610998, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8280, + "time_per_iteration": 2.427778482437134 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.02028, + "balance_loss_mlp": 1.03475237, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.6232736941666084, + "language_loss": 0.64461923, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66599762, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8281, + "time_per_iteration": 2.511054515838623 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.02175605, + "balance_loss_mlp": 1.03830338, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 1.82873470978674, + "language_loss": 0.72714734, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.74859279, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8282, + "time_per_iteration": 2.417059898376465 + }, + { + "auxiliary_loss_clip": 0.01103243, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.01804423, + "balance_loss_mlp": 1.03591275, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.6753255120783885, + "language_loss": 0.73373008, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75505757, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 8283, + "time_per_iteration": 2.531747341156006 + }, + { + "auxiliary_loss_clip": 0.01106082, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.02446926, + "balance_loss_mlp": 1.03696167, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.6101503544989328, + "language_loss": 0.78866243, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81009555, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8284, + "time_per_iteration": 2.4609432220458984 + }, + { + "auxiliary_loss_clip": 0.01113439, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.02398884, + "balance_loss_mlp": 1.0390476, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.8191212695174297, + "language_loss": 0.73705399, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75856948, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 8285, + "time_per_iteration": 2.5364696979522705 + }, + { + "auxiliary_loss_clip": 0.01112037, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.01872683, + "balance_loss_mlp": 1.0420599, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.6445235471758528, + "language_loss": 0.74477649, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76621962, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 8286, + "time_per_iteration": 2.4888620376586914 + }, + { + "auxiliary_loss_clip": 0.01112849, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02713549, + "balance_loss_mlp": 1.04156506, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7365216069979077, + "language_loss": 0.85467643, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87620533, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8287, + "time_per_iteration": 2.5058188438415527 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.01267338, + "balance_loss_mlp": 1.03729916, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.6348463305948138, + "language_loss": 0.72363204, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74496502, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8288, + "time_per_iteration": 2.528475046157837 + }, + { + "auxiliary_loss_clip": 0.0111456, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.02319193, + "balance_loss_mlp": 1.04041409, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.900373689725773, + "language_loss": 0.80002087, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82154852, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7421875, + "step": 8289, + "time_per_iteration": 2.4667603969573975 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0199995, + "balance_loss_mlp": 1.03680038, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.15669041751919, + "language_loss": 0.73524791, + "learning_rate": 2.107245231409784e-06, + "loss": 0.7566489, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8290, + "time_per_iteration": 2.4318900108337402 + }, + { + "auxiliary_loss_clip": 0.01112096, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.02232039, + "balance_loss_mlp": 1.04070783, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.4681011524205945, + "language_loss": 0.84016359, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86165774, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7109375, + "step": 8291, + "time_per_iteration": 2.502545118331909 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02307272, + "balance_loss_mlp": 1.04216146, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.910804847598398, + "language_loss": 0.67084122, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69238442, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 8292, + "time_per_iteration": 2.4527781009674072 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.01933742, + "balance_loss_mlp": 1.03864646, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.7642237687107358, + "language_loss": 0.67300534, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69440567, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8293, + "time_per_iteration": 2.4598476886749268 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.01615214, + "balance_loss_mlp": 1.03958893, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.987515516196069, + "language_loss": 0.8202461, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84163427, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 8294, + "time_per_iteration": 2.4827442169189453 + }, + { + "auxiliary_loss_clip": 0.01110277, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02032816, + "balance_loss_mlp": 1.03937042, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.7471179574646651, + "language_loss": 0.73073918, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.7521857, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8295, + "time_per_iteration": 2.4712820053100586 + }, + { + "auxiliary_loss_clip": 0.01108254, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.02435029, + "balance_loss_mlp": 1.03895998, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.9200384732673381, + "language_loss": 0.673262, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69471127, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 8296, + "time_per_iteration": 2.45139479637146 + }, + { + "auxiliary_loss_clip": 0.01111689, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.03996015, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.713618634115876, + "language_loss": 0.64634776, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66780269, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8297, + "time_per_iteration": 2.5514614582061768 + }, + { + "auxiliary_loss_clip": 0.0110753, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.02121472, + "balance_loss_mlp": 1.03931689, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.9440676372274848, + "language_loss": 0.69621831, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71762383, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 8298, + "time_per_iteration": 2.4699370861053467 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.02421331, + "balance_loss_mlp": 1.03804398, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.087380746796303, + "language_loss": 0.84278095, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86422026, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8299, + "time_per_iteration": 2.4820563793182373 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02327859, + "balance_loss_mlp": 1.03978848, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 5.591354549929027, + "language_loss": 0.69272447, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71423382, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8300, + "time_per_iteration": 2.473634719848633 + }, + { + "auxiliary_loss_clip": 0.01037164, + "auxiliary_loss_mlp": 0.01003582, + "balance_loss_clip": 1.00239551, + "balance_loss_mlp": 1.01397431, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7592353305728455, + "language_loss": 0.51136976, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.5317772, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23242188, + "step": 8301, + "time_per_iteration": 3.1719589233398438 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.02670741, + "balance_loss_mlp": 1.03841138, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.9297901828770159, + "language_loss": 0.84423494, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86569905, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6796875, + "step": 8302, + "time_per_iteration": 3.8624472618103027 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.0157299, + "balance_loss_mlp": 1.03963566, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.7245012471823244, + "language_loss": 0.68831706, + "learning_rate": 2.102189175590024e-06, + "loss": 0.70967424, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8303, + "time_per_iteration": 2.4496121406555176 + }, + { + "auxiliary_loss_clip": 0.01111721, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01871395, + "balance_loss_mlp": 1.0395093, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.8500063703376581, + "language_loss": 0.72523201, + "learning_rate": 2.101800220681144e-06, + "loss": 0.7466675, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8304, + "time_per_iteration": 5.351519346237183 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.02633858, + "balance_loss_mlp": 1.03971672, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.113610055263332, + "language_loss": 0.81011766, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83160275, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8305, + "time_per_iteration": 3.9764394760131836 + }, + { + "auxiliary_loss_clip": 0.0103618, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.00061762, + "balance_loss_mlp": 1.01301277, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7225706425993785, + "language_loss": 0.56916559, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58954537, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23144531, + "step": 8306, + "time_per_iteration": 3.1952388286590576 + }, + { + "auxiliary_loss_clip": 0.01114208, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.02323711, + "balance_loss_mlp": 1.04268515, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.791967653711514, + "language_loss": 0.82407033, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84558392, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8307, + "time_per_iteration": 2.4501423835754395 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01987445, + "balance_loss_mlp": 1.03845966, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 2.0869484891217973, + "language_loss": 0.60544026, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.62686026, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8308, + "time_per_iteration": 2.5023903846740723 + }, + { + "auxiliary_loss_clip": 0.01106463, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0179193, + "balance_loss_mlp": 1.03760242, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5917355796130328, + "language_loss": 0.74632615, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76769423, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8309, + "time_per_iteration": 2.473018169403076 + }, + { + "auxiliary_loss_clip": 0.01109782, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02136922, + "balance_loss_mlp": 1.03926158, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.147167346860859, + "language_loss": 0.80117911, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82262021, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8310, + "time_per_iteration": 2.4172844886779785 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.02339089, + "balance_loss_mlp": 1.04019213, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.6036366291386785, + "language_loss": 0.70938641, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73086882, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 8311, + "time_per_iteration": 2.4804234504699707 + }, + { + "auxiliary_loss_clip": 0.01111462, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02229297, + "balance_loss_mlp": 1.04154408, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.923283457940722, + "language_loss": 0.77138013, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79283684, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8312, + "time_per_iteration": 2.4233593940734863 + }, + { + "auxiliary_loss_clip": 0.01111451, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.04093099, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7466795572602452, + "language_loss": 0.84205925, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86349666, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8313, + "time_per_iteration": 2.509953260421753 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.01633728, + "balance_loss_mlp": 1.03987491, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 2.119225345296983, + "language_loss": 0.80887723, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83028746, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8314, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01041835, + "balance_loss_clip": 1.02763736, + "balance_loss_mlp": 1.0418222, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 2.4967995028767778, + "language_loss": 0.79017889, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81173497, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8315, + "time_per_iteration": 2.4926230907440186 + }, + { + "auxiliary_loss_clip": 0.01110205, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.02123618, + "balance_loss_mlp": 1.04051793, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 2.5792388666411274, + "language_loss": 0.73983908, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76128173, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8316, + "time_per_iteration": 2.692228317260742 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.02058125, + "balance_loss_mlp": 1.04118443, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.4190232020266644, + "language_loss": 0.81204319, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83346593, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 8317, + "time_per_iteration": 2.4997825622558594 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04001343, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.649167878849496, + "language_loss": 0.83189869, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85339868, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8318, + "time_per_iteration": 2.516118049621582 + }, + { + "auxiliary_loss_clip": 0.01111509, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.04068375, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.8062739344487506, + "language_loss": 0.81684446, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83826375, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 8319, + "time_per_iteration": 2.4977705478668213 + }, + { + "auxiliary_loss_clip": 0.01112348, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.01469707, + "balance_loss_mlp": 1.04046464, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.7611824883833367, + "language_loss": 0.71951354, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74090493, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8320, + "time_per_iteration": 2.5664663314819336 + }, + { + "auxiliary_loss_clip": 0.01116964, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.02903366, + "balance_loss_mlp": 1.03925049, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 3.538267489088781, + "language_loss": 0.76840645, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79001242, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 8321, + "time_per_iteration": 2.5154004096984863 + }, + { + "auxiliary_loss_clip": 0.01113289, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.0255599, + "balance_loss_mlp": 1.04125774, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.9154758393965534, + "language_loss": 0.82959068, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85111117, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8322, + "time_per_iteration": 2.4235384464263916 + }, + { + "auxiliary_loss_clip": 0.01114951, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.02180934, + "balance_loss_mlp": 1.04190695, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.1453827228353166, + "language_loss": 0.73670769, + "learning_rate": 2.094409360775228e-06, + "loss": 0.7582072, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8323, + "time_per_iteration": 2.495490312576294 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.02152205, + "balance_loss_mlp": 1.04043198, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.517177144462768, + "language_loss": 0.69255745, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71402115, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8324, + "time_per_iteration": 2.534043550491333 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03958941, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 1.9198571129878061, + "language_loss": 0.72153628, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.7429831, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8325, + "time_per_iteration": 2.4783544540405273 + }, + { + "auxiliary_loss_clip": 0.01114311, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.0237087, + "balance_loss_mlp": 1.04212904, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.5620326365302057, + "language_loss": 0.73494631, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7564733, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.72265625, + "step": 8326, + "time_per_iteration": 2.4836461544036865 + }, + { + "auxiliary_loss_clip": 0.01110122, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.0189389, + "balance_loss_mlp": 1.03965449, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5385455876451686, + "language_loss": 0.78168696, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80310273, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8327, + "time_per_iteration": 2.477095127105713 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02261126, + "balance_loss_mlp": 1.04402947, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.31963767631444, + "language_loss": 0.88008773, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90161747, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8328, + "time_per_iteration": 2.479931116104126 + }, + { + "auxiliary_loss_clip": 0.01116123, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.02290463, + "balance_loss_mlp": 1.0408715, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.0106246059801482, + "language_loss": 0.74407351, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76559395, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 8329, + "time_per_iteration": 2.480037212371826 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.04276633, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.2897047741072063, + "language_loss": 0.79602063, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81747818, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 8330, + "time_per_iteration": 2.529446601867676 + }, + { + "auxiliary_loss_clip": 0.0103803, + "auxiliary_loss_mlp": 0.01000333, + "balance_loss_clip": 0.99922389, + "balance_loss_mlp": 1.01505685, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7317803530986337, + "language_loss": 0.56073356, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58111727, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23046875, + "step": 8331, + "time_per_iteration": 2.89511775970459 + }, + { + "auxiliary_loss_clip": 0.01110931, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01504326, + "balance_loss_mlp": 1.041206, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.865515028785386, + "language_loss": 0.65518546, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67656446, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8332, + "time_per_iteration": 2.497129201889038 + }, + { + "auxiliary_loss_clip": 0.01109356, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.02261496, + "balance_loss_mlp": 1.0400846, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.477043934406584, + "language_loss": 0.74687374, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.76831466, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8333, + "time_per_iteration": 2.506769895553589 + }, + { + "auxiliary_loss_clip": 0.01114084, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.02242804, + "balance_loss_mlp": 1.04128885, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 3.419508092200526, + "language_loss": 0.80619013, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82768065, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 8334, + "time_per_iteration": 2.4492759704589844 + }, + { + "auxiliary_loss_clip": 0.01038411, + "auxiliary_loss_mlp": 0.00996695, + "balance_loss_clip": 0.99557459, + "balance_loss_mlp": 1.01541471, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8938151962133672, + "language_loss": 0.62658346, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64693451, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.23046875, + "step": 8335, + "time_per_iteration": 3.044527530670166 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.01548398, + "balance_loss_mlp": 1.03883338, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.39366543335018, + "language_loss": 0.79443586, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81579578, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8336, + "time_per_iteration": 2.5133562088012695 + }, + { + "auxiliary_loss_clip": 0.01111717, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01791, + "balance_loss_mlp": 1.0402261, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.7464580749308463, + "language_loss": 0.80139911, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82282722, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8337, + "time_per_iteration": 2.4671413898468018 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.0201329, + "balance_loss_mlp": 1.03992128, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.859552309481282, + "language_loss": 0.79314995, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.8146314, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8338, + "time_per_iteration": 2.4763965606689453 + }, + { + "auxiliary_loss_clip": 0.01112164, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.0178982, + "balance_loss_mlp": 1.0390203, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6104717001039177, + "language_loss": 0.85006964, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87150526, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8339, + "time_per_iteration": 2.507951259613037 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.02476954, + "balance_loss_mlp": 1.03943646, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.484784321746097, + "language_loss": 0.70492387, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72641325, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 8340, + "time_per_iteration": 2.5271620750427246 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02178025, + "balance_loss_mlp": 1.04153883, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.9114275861555547, + "language_loss": 0.77793235, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79945439, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 8341, + "time_per_iteration": 2.467557430267334 + }, + { + "auxiliary_loss_clip": 0.01116354, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.02543771, + "balance_loss_mlp": 1.04048502, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.478803711535475, + "language_loss": 0.8961392, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91769934, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 8342, + "time_per_iteration": 2.454822063446045 + }, + { + "auxiliary_loss_clip": 0.01110124, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02272439, + "balance_loss_mlp": 1.03894877, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 3.1772216639919906, + "language_loss": 0.76625615, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.7877177, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8343, + "time_per_iteration": 2.485499143600464 + }, + { + "auxiliary_loss_clip": 0.0110844, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.03967083, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 2.1220779506727574, + "language_loss": 0.67086864, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69223046, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8344, + "time_per_iteration": 3.88729190826416 + }, + { + "auxiliary_loss_clip": 0.01111927, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.02143502, + "balance_loss_mlp": 1.03998613, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.9395231632627998, + "language_loss": 0.75212955, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77359062, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8345, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.01769578, + "balance_loss_mlp": 1.04121828, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 1.95370753247372, + "language_loss": 0.78477418, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80621803, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71484375, + "step": 8346, + "time_per_iteration": 5.420297861099243 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02408957, + "balance_loss_mlp": 1.03860831, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.6533044146295508, + "language_loss": 0.69167304, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71313995, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8347, + "time_per_iteration": 2.5022430419921875 + }, + { + "auxiliary_loss_clip": 0.01112834, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.02136123, + "balance_loss_mlp": 1.03990984, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.8545802319259819, + "language_loss": 0.71527761, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73674989, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8348, + "time_per_iteration": 2.491255760192871 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02089548, + "balance_loss_mlp": 1.04003596, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.6664488621380107, + "language_loss": 0.73957872, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76099503, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8349, + "time_per_iteration": 2.478173017501831 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.01726353, + "balance_loss_mlp": 1.03897953, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.0979883436616915, + "language_loss": 0.63680947, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65822613, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8350, + "time_per_iteration": 2.407949686050415 + }, + { + "auxiliary_loss_clip": 0.01035777, + "auxiliary_loss_mlp": 0.01011664, + "balance_loss_clip": 1.01064515, + "balance_loss_mlp": 1.01269341, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 1.0786206787107346, + "language_loss": 0.59814817, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6186226, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.23144531, + "step": 8351, + "time_per_iteration": 3.199061393737793 + }, + { + "auxiliary_loss_clip": 0.01111613, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.0395788, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 2.3062568387149365, + "language_loss": 0.75367033, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77513033, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8352, + "time_per_iteration": 2.506408214569092 + }, + { + "auxiliary_loss_clip": 0.01113074, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.04205072, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6126052392954302, + "language_loss": 0.71743786, + "learning_rate": 2.082736990429464e-06, + "loss": 0.73889434, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8353, + "time_per_iteration": 2.469383478164673 + }, + { + "auxiliary_loss_clip": 0.01115894, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.02279735, + "balance_loss_mlp": 1.04492378, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 3.986170886248432, + "language_loss": 0.73818904, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75971609, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8354, + "time_per_iteration": 2.510967254638672 + }, + { + "auxiliary_loss_clip": 0.01111051, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.02324271, + "balance_loss_mlp": 1.04122615, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.6375075569861386, + "language_loss": 0.72198367, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74346024, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 8355, + "time_per_iteration": 2.5355918407440186 + }, + { + "auxiliary_loss_clip": 0.01113009, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.0234164, + "balance_loss_mlp": 1.04037476, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.5634548911110102, + "language_loss": 0.81171584, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83321553, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8356, + "time_per_iteration": 2.5366694927215576 + }, + { + "auxiliary_loss_clip": 0.01115056, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0234828, + "balance_loss_mlp": 1.03943825, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.216032444638608, + "language_loss": 0.76043326, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78196621, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7578125, + "step": 8357, + "time_per_iteration": 2.4454803466796875 + }, + { + "auxiliary_loss_clip": 0.01112875, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.0196929, + "balance_loss_mlp": 1.04054666, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.6874014883711121, + "language_loss": 0.75969183, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78116012, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 8358, + "time_per_iteration": 2.4932358264923096 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02163708, + "balance_loss_mlp": 1.04097748, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.322067399050787, + "language_loss": 0.72372258, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74518377, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8359, + "time_per_iteration": 2.500152826309204 + }, + { + "auxiliary_loss_clip": 0.01111655, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.02361679, + "balance_loss_mlp": 1.04144287, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6242275025336705, + "language_loss": 0.77095789, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79243731, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8360, + "time_per_iteration": 2.5194928646087646 + }, + { + "auxiliary_loss_clip": 0.01111322, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.04179871, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.6325944972725464, + "language_loss": 0.76545495, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78689528, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8361, + "time_per_iteration": 2.4667415618896484 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.01973319, + "balance_loss_mlp": 1.03841019, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.6123805658340187, + "language_loss": 0.84681976, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.86826181, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8362, + "time_per_iteration": 2.5463051795959473 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02232695, + "balance_loss_mlp": 1.03756952, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.4590070504225026, + "language_loss": 0.78211838, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80355728, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8363, + "time_per_iteration": 2.5163207054138184 + }, + { + "auxiliary_loss_clip": 0.0110737, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.04016399, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 3.0044110074814627, + "language_loss": 0.75747573, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77885795, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 8364, + "time_per_iteration": 2.490145444869995 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01749849, + "balance_loss_mlp": 1.03816295, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5639014752994398, + "language_loss": 0.69354087, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.7149018, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8365, + "time_per_iteration": 2.473787307739258 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.03982782, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.442330503817835, + "language_loss": 0.73213601, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75362265, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 8366, + "time_per_iteration": 2.549877405166626 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.02064812, + "balance_loss_mlp": 1.04103982, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4509464249778803, + "language_loss": 0.78301162, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80443466, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 8367, + "time_per_iteration": 2.495147705078125 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.0180459, + "balance_loss_mlp": 1.03853226, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.842981496070619, + "language_loss": 0.69923592, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72062624, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8368, + "time_per_iteration": 2.4830057621002197 + }, + { + "auxiliary_loss_clip": 0.01035945, + "auxiliary_loss_mlp": 0.01007176, + "balance_loss_clip": 1.00621665, + "balance_loss_mlp": 1.01321661, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8570502115037558, + "language_loss": 0.63344997, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65388119, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.22851562, + "step": 8369, + "time_per_iteration": 3.0224173069000244 + }, + { + "auxiliary_loss_clip": 0.0110829, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01973677, + "balance_loss_mlp": 1.03877878, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.153532760870157, + "language_loss": 0.60134995, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62274879, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8370, + "time_per_iteration": 2.570244073867798 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02094817, + "balance_loss_mlp": 1.03846478, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.5686803599666441, + "language_loss": 0.68485558, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.7063123, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8371, + "time_per_iteration": 2.5606741905212402 + }, + { + "auxiliary_loss_clip": 0.01110798, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.01710284, + "balance_loss_mlp": 1.04021561, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 2.6972353884187776, + "language_loss": 0.67238319, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.6937995, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8372, + "time_per_iteration": 2.5703678131103516 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 2.7198935997293683, + "language_loss": 0.66590893, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68735898, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8373, + "time_per_iteration": 2.526221513748169 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01558208, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.6286907446961802, + "language_loss": 0.74674404, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76809293, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8374, + "time_per_iteration": 2.488349199295044 + }, + { + "auxiliary_loss_clip": 0.01111709, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.5485355079726564, + "language_loss": 0.67947745, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70096987, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8375, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.02008343, + "balance_loss_mlp": 1.04047072, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.8481066708574578, + "language_loss": 0.78526819, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.8067522, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 8376, + "time_per_iteration": 2.468104124069214 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01704502, + "balance_loss_mlp": 1.03864694, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.8611372201727234, + "language_loss": 0.59723544, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61867571, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8377, + "time_per_iteration": 2.5277962684631348 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.02114892, + "balance_loss_mlp": 1.03836918, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9462161897860946, + "language_loss": 0.76360452, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78503865, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8378, + "time_per_iteration": 2.448323965072632 + }, + { + "auxiliary_loss_clip": 0.01109358, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.02211046, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.6531450393233522, + "language_loss": 0.74565625, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.7670989, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8379, + "time_per_iteration": 2.5036356449127197 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.01952767, + "balance_loss_mlp": 1.04144955, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 5.059413081923233, + "language_loss": 0.6692574, + "learning_rate": 2.072229431544548e-06, + "loss": 0.6906693, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8380, + "time_per_iteration": 2.524144411087036 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01879573, + "balance_loss_mlp": 1.03999329, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.7991215942112995, + "language_loss": 0.63869506, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66009307, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8381, + "time_per_iteration": 2.5605592727661133 + }, + { + "auxiliary_loss_clip": 0.01108854, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.02296555, + "balance_loss_mlp": 1.04009557, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.6170974847944384, + "language_loss": 0.67252153, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69396263, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8382, + "time_per_iteration": 2.5227982997894287 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.02614903, + "balance_loss_mlp": 1.04075313, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 2.0398701191748, + "language_loss": 0.62190729, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64346862, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8383, + "time_per_iteration": 2.43418288230896 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02198625, + "balance_loss_mlp": 1.03885436, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 3.355380782185913, + "language_loss": 0.67041314, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69182235, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 8384, + "time_per_iteration": 2.450605630874634 + }, + { + "auxiliary_loss_clip": 0.01112035, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.0412066, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.6534299501213623, + "language_loss": 0.70829523, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.72977579, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 8385, + "time_per_iteration": 3.9600095748901367 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01027134, + "balance_loss_clip": 1.0147717, + "balance_loss_mlp": 1.03961098, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 2.2280411323646687, + "language_loss": 0.83021009, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85154909, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8386, + "time_per_iteration": 2.5137035846710205 + }, + { + "auxiliary_loss_clip": 0.01109584, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03921139, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.4630184477724049, + "language_loss": 0.66776884, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.6892125, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8387, + "time_per_iteration": 5.38523268699646 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01029754, + "balance_loss_clip": 1.01780963, + "balance_loss_mlp": 1.04077113, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.3874005116173278, + "language_loss": 0.80059648, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82199681, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8388, + "time_per_iteration": 3.938295364379883 + }, + { + "auxiliary_loss_clip": 0.01109371, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.0181793, + "balance_loss_mlp": 1.03903794, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.6549702991910453, + "language_loss": 0.69832838, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.71972561, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8389, + "time_per_iteration": 2.514204978942871 + }, + { + "auxiliary_loss_clip": 0.01110176, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02127957, + "balance_loss_mlp": 1.03844476, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.5923484046165255, + "language_loss": 0.69297862, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71441251, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8390, + "time_per_iteration": 2.517423152923584 + }, + { + "auxiliary_loss_clip": 0.01034589, + "auxiliary_loss_mlp": 0.01005008, + "balance_loss_clip": 1.00389957, + "balance_loss_mlp": 1.0117954, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8182221752596884, + "language_loss": 0.52977288, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55016881, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22851562, + "step": 8391, + "time_per_iteration": 2.8990061283111572 + }, + { + "auxiliary_loss_clip": 0.01034773, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 0.99989092, + "balance_loss_mlp": 1.01217151, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8813101083301623, + "language_loss": 0.60678625, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62714356, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.2265625, + "step": 8392, + "time_per_iteration": 2.91495680809021 + }, + { + "auxiliary_loss_clip": 0.01106534, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.03893185, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.5806327501196855, + "language_loss": 0.84691715, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86831182, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8393, + "time_per_iteration": 2.5033814907073975 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.01680708, + "balance_loss_mlp": 1.04046786, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.96195836984414, + "language_loss": 0.50628948, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.52768016, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8394, + "time_per_iteration": 2.492766857147217 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01906633, + "balance_loss_mlp": 1.03773594, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.6061893361767445, + "language_loss": 0.75181741, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7732237, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8395, + "time_per_iteration": 2.4661927223205566 + }, + { + "auxiliary_loss_clip": 0.01107947, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.01859236, + "balance_loss_mlp": 1.03834832, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.243385214175979, + "language_loss": 0.67677552, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69816345, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8396, + "time_per_iteration": 2.416499376296997 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.01623356, + "balance_loss_mlp": 1.0404129, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.7915756184866887, + "language_loss": 0.79064161, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81201625, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 8397, + "time_per_iteration": 2.5530309677124023 + }, + { + "auxiliary_loss_clip": 0.01107401, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.03848135, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.652903699623706, + "language_loss": 0.66017222, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68154037, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8398, + "time_per_iteration": 2.4544124603271484 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.017313, + "balance_loss_mlp": 1.0395267, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.639047703672107, + "language_loss": 0.71633506, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73772013, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8399, + "time_per_iteration": 2.5301358699798584 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02533388, + "balance_loss_mlp": 1.03947675, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.6970917460172408, + "language_loss": 0.81506133, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83655393, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8400, + "time_per_iteration": 2.4705498218536377 + }, + { + "auxiliary_loss_clip": 0.01109099, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01716256, + "balance_loss_mlp": 1.03942847, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.8569234799708698, + "language_loss": 0.79040837, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81179667, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8401, + "time_per_iteration": 2.4791224002838135 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03751659, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.5775455049866824, + "language_loss": 0.69999743, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72139227, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8402, + "time_per_iteration": 2.5591325759887695 + }, + { + "auxiliary_loss_clip": 0.01105942, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.01515996, + "balance_loss_mlp": 1.03572834, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.822367858534602, + "language_loss": 0.68917859, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71050715, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 8403, + "time_per_iteration": 2.5292510986328125 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.02275133, + "balance_loss_mlp": 1.03929162, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.4593040849849852, + "language_loss": 0.85396838, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87537992, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8404, + "time_per_iteration": 2.4852187633514404 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.01565218, + "balance_loss_mlp": 1.03806567, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.5717367434630007, + "language_loss": 0.75364089, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77499014, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8405, + "time_per_iteration": 2.4850387573242188 + }, + { + "auxiliary_loss_clip": 0.01109835, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.01514542, + "balance_loss_mlp": 1.0388459, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.5541955318463554, + "language_loss": 0.72983336, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75121522, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8406, + "time_per_iteration": 2.59979510307312 + }, + { + "auxiliary_loss_clip": 0.01102813, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03577971, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.7094740961502104, + "language_loss": 0.76863986, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.7899577, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 8407, + "time_per_iteration": 2.527543067932129 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.01535106, + "balance_loss_mlp": 1.03706717, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.6525886874932982, + "language_loss": 0.63115776, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65251827, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8408, + "time_per_iteration": 2.53218150138855 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01871967, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.695436010833495, + "language_loss": 0.63705122, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65843707, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8409, + "time_per_iteration": 2.4916255474090576 + }, + { + "auxiliary_loss_clip": 0.01105638, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01410186, + "balance_loss_mlp": 1.03845859, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3247049855298083, + "language_loss": 0.70876539, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73007584, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 8410, + "time_per_iteration": 2.527935266494751 + }, + { + "auxiliary_loss_clip": 0.01107655, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02100372, + "balance_loss_mlp": 1.03812361, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5323244298402565, + "language_loss": 0.79243749, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81385016, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8411, + "time_per_iteration": 2.4926035404205322 + }, + { + "auxiliary_loss_clip": 0.01107995, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02357578, + "balance_loss_mlp": 1.03764153, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.7118743762511017, + "language_loss": 0.81584603, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83729643, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8412, + "time_per_iteration": 2.4696593284606934 + }, + { + "auxiliary_loss_clip": 0.0110966, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.02057767, + "balance_loss_mlp": 1.04071307, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 2.1036912411500555, + "language_loss": 0.80586725, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82728952, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8413, + "time_per_iteration": 2.4840738773345947 + }, + { + "auxiliary_loss_clip": 0.01111974, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.01959252, + "balance_loss_mlp": 1.04003644, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.7598991939758672, + "language_loss": 0.80167186, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82311857, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8414, + "time_per_iteration": 2.4437410831451416 + }, + { + "auxiliary_loss_clip": 0.01106268, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.0174123, + "balance_loss_mlp": 1.03536403, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.1880801569958486, + "language_loss": 0.62188816, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64324927, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8415, + "time_per_iteration": 2.617699384689331 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03840709, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.5996951654726725, + "language_loss": 0.81836188, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.8397311, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8416, + "time_per_iteration": 2.484717607498169 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.02253819, + "balance_loss_mlp": 1.04098511, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.7782267995500585, + "language_loss": 0.79110944, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81252885, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 8417, + "time_per_iteration": 2.544739246368408 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01789641, + "balance_loss_mlp": 1.03713858, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.8205649281423022, + "language_loss": 0.62930262, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65063727, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 8418, + "time_per_iteration": 2.4795963764190674 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02232397, + "balance_loss_mlp": 1.03859878, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.1933090002480182, + "language_loss": 0.77840686, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79984379, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8419, + "time_per_iteration": 2.491931915283203 + }, + { + "auxiliary_loss_clip": 0.0110836, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.01950645, + "balance_loss_mlp": 1.0373354, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.7154546366730201, + "language_loss": 0.77258635, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79399723, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8420, + "time_per_iteration": 2.5963363647460938 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.03782094, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.679092087125118, + "language_loss": 0.77511621, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79658306, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8421, + "time_per_iteration": 2.4954135417938232 + }, + { + "auxiliary_loss_clip": 0.01105449, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03668654, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4641430762434493, + "language_loss": 0.66987717, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69122434, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8422, + "time_per_iteration": 2.4802937507629395 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.01998544, + "balance_loss_mlp": 1.04081178, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.8050040320885787, + "language_loss": 0.81599188, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83741009, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8423, + "time_per_iteration": 2.591792345046997 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.01859319, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.8632464802837558, + "language_loss": 0.74227667, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76368636, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8424, + "time_per_iteration": 2.5076076984405518 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.02120495, + "balance_loss_mlp": 1.03742146, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.6339612294396895, + "language_loss": 0.71546394, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73685586, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8425, + "time_per_iteration": 2.570103406906128 + }, + { + "auxiliary_loss_clip": 0.01108568, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.02325118, + "balance_loss_mlp": 1.0379858, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.6987499343502257, + "language_loss": 0.78614688, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80758357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8426, + "time_per_iteration": 2.4616403579711914 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.02312577, + "balance_loss_mlp": 1.03994358, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.818748758654822, + "language_loss": 0.77855921, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80002636, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8427, + "time_per_iteration": 3.908625364303589 + }, + { + "auxiliary_loss_clip": 0.0110433, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.01382565, + "balance_loss_mlp": 1.03709817, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.8142719003609429, + "language_loss": 0.71444368, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73574793, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8428, + "time_per_iteration": 2.4540021419525146 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.01983786, + "balance_loss_mlp": 1.03622389, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6344761677930288, + "language_loss": 0.82693905, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84830469, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 8429, + "time_per_iteration": 3.977104902267456 + }, + { + "auxiliary_loss_clip": 0.01113682, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02267253, + "balance_loss_mlp": 1.04074979, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 2.1730745276419485, + "language_loss": 0.73167485, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75317407, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8430, + "time_per_iteration": 4.066487073898315 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 1.03904748, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.7614160050819483, + "language_loss": 0.76304209, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78445041, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8431, + "time_per_iteration": 2.459061861038208 + }, + { + "auxiliary_loss_clip": 0.01107362, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.0388869, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4179396940955034, + "language_loss": 0.72168291, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74307233, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8432, + "time_per_iteration": 2.4937191009521484 + }, + { + "auxiliary_loss_clip": 0.01040308, + "auxiliary_loss_mlp": 0.0100546, + "balance_loss_clip": 1.00428617, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7612043046384747, + "language_loss": 0.63704848, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65750623, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22753906, + "step": 8433, + "time_per_iteration": 3.10312819480896 + }, + { + "auxiliary_loss_clip": 0.01109071, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02276051, + "balance_loss_mlp": 1.0391171, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 1.7667352609332163, + "language_loss": 0.77104461, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79249096, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8434, + "time_per_iteration": 2.4761765003204346 + }, + { + "auxiliary_loss_clip": 0.01110101, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01997447, + "balance_loss_mlp": 1.03937244, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.7167508969307774, + "language_loss": 0.71062863, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73205119, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8435, + "time_per_iteration": 2.476259231567383 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.04086459, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 2.1519666669040407, + "language_loss": 0.71635526, + "learning_rate": 2.050429942372112e-06, + "loss": 0.73781812, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8436, + "time_per_iteration": 2.4717278480529785 + }, + { + "auxiliary_loss_clip": 0.0111073, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01621652, + "balance_loss_mlp": 1.04132712, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5051036444651287, + "language_loss": 0.8370682, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85846984, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8437, + "time_per_iteration": 2.51187801361084 + }, + { + "auxiliary_loss_clip": 0.01107572, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.01340485, + "balance_loss_mlp": 1.03941774, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.8339895444539178, + "language_loss": 0.80925703, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83058763, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8438, + "time_per_iteration": 2.5101053714752197 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.023947, + "balance_loss_mlp": 1.04053128, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.458277190934999, + "language_loss": 0.79797888, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81948024, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8439, + "time_per_iteration": 2.5196681022644043 + }, + { + "auxiliary_loss_clip": 0.01107511, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02176344, + "balance_loss_mlp": 1.03948164, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5054968059802218, + "language_loss": 0.7129699, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73437822, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8440, + "time_per_iteration": 2.482475757598877 + }, + { + "auxiliary_loss_clip": 0.01110635, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.01877761, + "balance_loss_mlp": 1.03933895, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6937518353915977, + "language_loss": 0.70555139, + "learning_rate": 2.048483229511158e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8441, + "time_per_iteration": 2.5299065113067627 + }, + { + "auxiliary_loss_clip": 0.01113885, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.0219456, + "balance_loss_mlp": 1.04142308, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.8980066327338418, + "language_loss": 0.63670987, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65819889, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8442, + "time_per_iteration": 2.4623775482177734 + }, + { + "auxiliary_loss_clip": 0.01108296, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.016011, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.5153774279484464, + "language_loss": 0.7150898, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73644972, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 8443, + "time_per_iteration": 2.586273670196533 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02046299, + "balance_loss_mlp": 1.03887248, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.223488951652841, + "language_loss": 0.61766541, + "learning_rate": 2.047315179614607e-06, + "loss": 0.63911152, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8444, + "time_per_iteration": 2.5941321849823 + }, + { + "auxiliary_loss_clip": 0.01107921, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02056909, + "balance_loss_mlp": 1.0380075, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.7476957798256931, + "language_loss": 0.6370405, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65844774, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 8445, + "time_per_iteration": 2.622295379638672 + }, + { + "auxiliary_loss_clip": 0.01042597, + "auxiliary_loss_mlp": 0.01005213, + "balance_loss_clip": 1.00411069, + "balance_loss_mlp": 1.019732, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8272934825203048, + "language_loss": 0.61873507, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.6392132, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.22851562, + "step": 8446, + "time_per_iteration": 3.106067180633545 + }, + { + "auxiliary_loss_clip": 0.01107421, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01656127, + "balance_loss_mlp": 1.03849411, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.6783761303243148, + "language_loss": 0.80458808, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82595056, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8447, + "time_per_iteration": 2.483449935913086 + }, + { + "auxiliary_loss_clip": 0.01109683, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.01955903, + "balance_loss_mlp": 1.04166472, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.6097524760484219, + "language_loss": 0.70526159, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72667593, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 8448, + "time_per_iteration": 2.5377211570739746 + }, + { + "auxiliary_loss_clip": 0.01108561, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.01906157, + "balance_loss_mlp": 1.04054332, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.775058362169557, + "language_loss": 0.72186208, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74325454, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8449, + "time_per_iteration": 2.6247637271881104 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01862371, + "balance_loss_mlp": 1.0373019, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4717194557779922, + "language_loss": 0.72751403, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74887294, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8450, + "time_per_iteration": 2.5097148418426514 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02104533, + "balance_loss_mlp": 1.04217696, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.631370100986613, + "language_loss": 0.7704621, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.7919184, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8451, + "time_per_iteration": 2.5109496116638184 + }, + { + "auxiliary_loss_clip": 0.01109885, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.02502477, + "balance_loss_mlp": 1.03928411, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.7784899256909827, + "language_loss": 0.8518312, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8732987, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8452, + "time_per_iteration": 2.4603476524353027 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.04209125, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.2856093940760274, + "language_loss": 0.78046912, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80198884, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8453, + "time_per_iteration": 2.450873613357544 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02200532, + "balance_loss_mlp": 1.03973246, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.6556718901191125, + "language_loss": 0.7626555, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78406799, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8454, + "time_per_iteration": 2.4831783771514893 + }, + { + "auxiliary_loss_clip": 0.01110162, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03985167, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.7440679508015728, + "language_loss": 0.89345592, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91488367, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8455, + "time_per_iteration": 2.48486590385437 + }, + { + "auxiliary_loss_clip": 0.01116133, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.02230144, + "balance_loss_mlp": 1.04198599, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 2.029385394187206, + "language_loss": 0.62613618, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64765751, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8456, + "time_per_iteration": 2.476060390472412 + }, + { + "auxiliary_loss_clip": 0.01038842, + "auxiliary_loss_mlp": 0.00998694, + "balance_loss_clip": 0.99766272, + "balance_loss_mlp": 1.01592362, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.816065361839575, + "language_loss": 0.62538505, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64576042, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.22949219, + "step": 8457, + "time_per_iteration": 2.9627416133880615 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.02069306, + "balance_loss_mlp": 1.04062462, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.5574868486202833, + "language_loss": 0.67412502, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69556904, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8458, + "time_per_iteration": 2.4851465225219727 + }, + { + "auxiliary_loss_clip": 0.01109854, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01840341, + "balance_loss_mlp": 1.03811622, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.6253676139168076, + "language_loss": 0.77861875, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80003208, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8459, + "time_per_iteration": 2.5043020248413086 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.04386926, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.213093169353168, + "language_loss": 0.81109118, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83262426, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8460, + "time_per_iteration": 2.4239838123321533 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.02565289, + "balance_loss_mlp": 1.03999329, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.5640945155523684, + "language_loss": 0.6866132, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70810497, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8461, + "time_per_iteration": 2.469954490661621 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03908265, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.5611830538381608, + "language_loss": 0.76059598, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.7819975, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8462, + "time_per_iteration": 2.4907591342926025 + }, + { + "auxiliary_loss_clip": 0.01111001, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.02376187, + "balance_loss_mlp": 1.04031515, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 1.977849325123916, + "language_loss": 0.8121528, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83362508, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 8463, + "time_per_iteration": 2.460604190826416 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.02527571, + "balance_loss_mlp": 1.03999758, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7045720874408852, + "language_loss": 0.7630803, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78454363, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8464, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01037487, + "auxiliary_loss_mlp": 0.01005228, + "balance_loss_clip": 1.00426793, + "balance_loss_mlp": 1.01476121, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.687733273493157, + "language_loss": 0.59352195, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61394918, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.2265625, + "step": 8465, + "time_per_iteration": 3.1989307403564453 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.02263045, + "balance_loss_mlp": 1.03822207, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.7579634525926484, + "language_loss": 0.79857922, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81999815, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8466, + "time_per_iteration": 2.472186326980591 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.01950181, + "balance_loss_mlp": 1.03679371, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.5999387152583837, + "language_loss": 0.78222281, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80359334, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8467, + "time_per_iteration": 2.4692180156707764 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.02041364, + "balance_loss_mlp": 1.03994191, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.7540939283261232, + "language_loss": 0.7467652, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76815927, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8468, + "time_per_iteration": 3.8722333908081055 + }, + { + "auxiliary_loss_clip": 0.01107691, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01877546, + "balance_loss_mlp": 1.03856027, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.7320149470681812, + "language_loss": 0.77835757, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79974556, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8469, + "time_per_iteration": 2.4514496326446533 + }, + { + "auxiliary_loss_clip": 0.01112445, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.0221895, + "balance_loss_mlp": 1.04265046, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5013208791161945, + "language_loss": 0.69422746, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71570665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8470, + "time_per_iteration": 2.5658817291259766 + }, + { + "auxiliary_loss_clip": 0.01112957, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01997817, + "balance_loss_mlp": 1.04058552, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.018231732442679, + "language_loss": 0.73409355, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75555384, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8471, + "time_per_iteration": 5.355906009674072 + }, + { + "auxiliary_loss_clip": 0.01036047, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00242805, + "balance_loss_mlp": 1.01322865, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7572542385247485, + "language_loss": 0.58153868, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60193354, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22851562, + "step": 8472, + "time_per_iteration": 3.0752861499786377 + }, + { + "auxiliary_loss_clip": 0.01111139, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.02100456, + "balance_loss_mlp": 1.04138827, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.783541878810952, + "language_loss": 0.69200397, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71344012, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 8473, + "time_per_iteration": 2.4832053184509277 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.02144074, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 2.2073606957030143, + "language_loss": 0.85564739, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87707734, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8474, + "time_per_iteration": 2.5068845748901367 + }, + { + "auxiliary_loss_clip": 0.01110669, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01944494, + "balance_loss_mlp": 1.03983307, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.014074019348489, + "language_loss": 0.64659619, + "learning_rate": 2.035244457765222e-06, + "loss": 0.66802263, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8475, + "time_per_iteration": 2.4363739490509033 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.02557325, + "balance_loss_mlp": 1.04094887, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 4.024838672705198, + "language_loss": 0.81962836, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84116852, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 8476, + "time_per_iteration": 2.448249578475952 + }, + { + "auxiliary_loss_clip": 0.01111186, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.02019382, + "balance_loss_mlp": 1.03794646, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.9611523426566915, + "language_loss": 0.81148994, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83295757, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.734375, + "step": 8477, + "time_per_iteration": 2.470248222351074 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.01584899, + "balance_loss_mlp": 1.03962493, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.8342280591951767, + "language_loss": 0.61682522, + "learning_rate": 2.034076248204082e-06, + "loss": 0.6382364, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8478, + "time_per_iteration": 2.4439172744750977 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01037275, + "balance_loss_clip": 1.02540779, + "balance_loss_mlp": 1.03930426, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.4883331760724325, + "language_loss": 0.65860271, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.6800639, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 8479, + "time_per_iteration": 2.4965710639953613 + }, + { + "auxiliary_loss_clip": 0.01107177, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01974487, + "balance_loss_mlp": 1.0389936, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.620468938265791, + "language_loss": 0.69455707, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71594626, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 8480, + "time_per_iteration": 2.4500057697296143 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.01871157, + "balance_loss_mlp": 1.03733814, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.6808533459383284, + "language_loss": 0.79027826, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81168693, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8481, + "time_per_iteration": 2.507157564163208 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.02170324, + "balance_loss_mlp": 1.03702283, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.5080021873745288, + "language_loss": 0.83429766, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85568231, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 8482, + "time_per_iteration": 2.4544076919555664 + }, + { + "auxiliary_loss_clip": 0.0111291, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.01925349, + "balance_loss_mlp": 1.03990221, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.7853243252822575, + "language_loss": 0.85625446, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87771249, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 8483, + "time_per_iteration": 2.519747734069824 + }, + { + "auxiliary_loss_clip": 0.01107969, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03712344, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.7164607290812173, + "language_loss": 0.83208412, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85348231, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8484, + "time_per_iteration": 2.4549949169158936 + }, + { + "auxiliary_loss_clip": 0.01109177, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0154798, + "balance_loss_mlp": 1.03849459, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.0216137506651983, + "language_loss": 0.81388122, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83525884, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8485, + "time_per_iteration": 2.4612390995025635 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02036929, + "balance_loss_mlp": 1.03675199, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.1191716083834025, + "language_loss": 0.73653662, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.7578969, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 8486, + "time_per_iteration": 2.426042318344116 + }, + { + "auxiliary_loss_clip": 0.01112031, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.01824152, + "balance_loss_mlp": 1.03990436, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4808929350883289, + "language_loss": 0.69956315, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72099566, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 8487, + "time_per_iteration": 2.5032570362091064 + }, + { + "auxiliary_loss_clip": 0.01108669, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.01987231, + "balance_loss_mlp": 1.04012084, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 1.9552461936614123, + "language_loss": 0.72984374, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75126404, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 8488, + "time_per_iteration": 2.454589605331421 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02002835, + "balance_loss_mlp": 1.03795087, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.814097723080907, + "language_loss": 0.69584548, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71725714, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8489, + "time_per_iteration": 2.4295358657836914 + }, + { + "auxiliary_loss_clip": 0.01108544, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01800227, + "balance_loss_mlp": 1.03788161, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.8877500438207433, + "language_loss": 0.72447532, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.7458632, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8490, + "time_per_iteration": 2.484398603439331 + }, + { + "auxiliary_loss_clip": 0.01105533, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.01649261, + "balance_loss_mlp": 1.03803921, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.594832362291185, + "language_loss": 0.80287743, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82421523, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 8491, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.0155549, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.311833139697555, + "language_loss": 0.79033649, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81164801, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 8492, + "time_per_iteration": 2.4697651863098145 + }, + { + "auxiliary_loss_clip": 0.01114847, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.02560329, + "balance_loss_mlp": 1.04234147, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 2.1680982451379607, + "language_loss": 0.77821648, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79974937, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 8493, + "time_per_iteration": 2.490349054336548 + }, + { + "auxiliary_loss_clip": 0.01109447, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.01608634, + "balance_loss_mlp": 1.03989387, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.213061013784994, + "language_loss": 0.83690828, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85829687, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8494, + "time_per_iteration": 2.4604976177215576 + }, + { + "auxiliary_loss_clip": 0.01112511, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.02189648, + "balance_loss_mlp": 1.04180336, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.8678450133518327, + "language_loss": 0.79117751, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81263626, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.70703125, + "step": 8495, + "time_per_iteration": 2.5202648639678955 + }, + { + "auxiliary_loss_clip": 0.01109453, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02276945, + "balance_loss_mlp": 1.04033172, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.5685043948688704, + "language_loss": 0.78221929, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80366194, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8496, + "time_per_iteration": 2.499793767929077 + }, + { + "auxiliary_loss_clip": 0.01105005, + "auxiliary_loss_mlp": 0.01026512, + "balance_loss_clip": 1.01508582, + "balance_loss_mlp": 1.03803635, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.9336450862291243, + "language_loss": 0.7876817, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8089968, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 8497, + "time_per_iteration": 2.450246572494507 + }, + { + "auxiliary_loss_clip": 0.01106851, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.0203619, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.6296784083005205, + "language_loss": 0.8186121, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84000313, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8498, + "time_per_iteration": 2.4860284328460693 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01559711, + "balance_loss_mlp": 1.03989053, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.9511970266493632, + "language_loss": 0.71084464, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73219806, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 8499, + "time_per_iteration": 2.488870859146118 + }, + { + "auxiliary_loss_clip": 0.01108699, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01899862, + "balance_loss_mlp": 1.03962827, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.470448999091522, + "language_loss": 0.72600758, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74740595, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8500, + "time_per_iteration": 2.554612874984741 + }, + { + "auxiliary_loss_clip": 0.01113166, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.01870334, + "balance_loss_mlp": 1.03988254, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.631045408977224, + "language_loss": 0.63011086, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65156412, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8501, + "time_per_iteration": 2.4470977783203125 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.03708565, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7479031643347964, + "language_loss": 0.8759163, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89734155, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8502, + "time_per_iteration": 2.4252443313598633 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.0349071, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.3950925269756227, + "language_loss": 0.82526219, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84663093, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8503, + "time_per_iteration": 2.5170319080352783 + }, + { + "auxiliary_loss_clip": 0.01038121, + "auxiliary_loss_mlp": 0.01001996, + "balance_loss_clip": 1.00103021, + "balance_loss_mlp": 1.01512361, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8658208518316733, + "language_loss": 0.63857049, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65897167, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.23046875, + "step": 8504, + "time_per_iteration": 3.098529577255249 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.01815391, + "balance_loss_mlp": 1.03960776, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 3.195489539056655, + "language_loss": 0.84326482, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86465514, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 8505, + "time_per_iteration": 2.5145134925842285 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0399797, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.9725783043316722, + "language_loss": 0.75117159, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77251446, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 8506, + "time_per_iteration": 2.529463052749634 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.01951551, + "balance_loss_mlp": 1.03808045, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.6477689192158658, + "language_loss": 0.58288801, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60429621, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8507, + "time_per_iteration": 2.515449047088623 + }, + { + "auxiliary_loss_clip": 0.01111035, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02690697, + "balance_loss_mlp": 1.04132211, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.6046089096743523, + "language_loss": 0.85276306, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87427151, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8508, + "time_per_iteration": 2.4760663509368896 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.02051985, + "balance_loss_mlp": 1.03969765, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.6014168180464263, + "language_loss": 0.72123772, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74267876, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8509, + "time_per_iteration": 2.5354809761047363 + }, + { + "auxiliary_loss_clip": 0.01107381, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.02060962, + "balance_loss_mlp": 1.03980041, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.6675565589278303, + "language_loss": 0.75862014, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78001392, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8510, + "time_per_iteration": 3.945136785507202 + }, + { + "auxiliary_loss_clip": 0.01108162, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.02163482, + "balance_loss_mlp": 1.04065561, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.6646040073598372, + "language_loss": 0.71192694, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73334503, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 8511, + "time_per_iteration": 2.541703701019287 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01763797, + "balance_loss_mlp": 1.03958058, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.060947746528677, + "language_loss": 0.66430634, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68565977, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 8512, + "time_per_iteration": 5.427145481109619 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.02326632, + "balance_loss_mlp": 1.03883505, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 2.433145093070313, + "language_loss": 0.66578728, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.6872499, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8513, + "time_per_iteration": 3.935227870941162 + }, + { + "auxiliary_loss_clip": 0.01106032, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02099788, + "balance_loss_mlp": 1.03927946, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.0509279474405115, + "language_loss": 0.69136906, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71276104, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 8514, + "time_per_iteration": 2.5390119552612305 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.01599109, + "balance_loss_mlp": 1.03685427, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.6362442678403473, + "language_loss": 0.66014814, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68144739, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 8515, + "time_per_iteration": 2.492664098739624 + }, + { + "auxiliary_loss_clip": 0.01103893, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.02031612, + "balance_loss_mlp": 1.03691602, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.89314496105325, + "language_loss": 0.74966168, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77101815, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8516, + "time_per_iteration": 2.5428519248962402 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02181602, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.7790403014833382, + "language_loss": 0.77862155, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80007005, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8517, + "time_per_iteration": 2.4259724617004395 + }, + { + "auxiliary_loss_clip": 0.01110887, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01897407, + "balance_loss_mlp": 1.03983212, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7905284866787141, + "language_loss": 0.73672384, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75814688, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8518, + "time_per_iteration": 2.5707037448883057 + }, + { + "auxiliary_loss_clip": 0.01107458, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.02557039, + "balance_loss_mlp": 1.03892565, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.6752140453085944, + "language_loss": 0.78055197, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80200136, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8519, + "time_per_iteration": 2.417372226715088 + }, + { + "auxiliary_loss_clip": 0.01109296, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02049518, + "balance_loss_mlp": 1.04082775, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.573776111474748, + "language_loss": 0.79204106, + "learning_rate": 2.017720274652497e-06, + "loss": 0.8134582, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8520, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.01112541, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.02623105, + "balance_loss_mlp": 1.03924751, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.6319482307550086, + "language_loss": 0.81403995, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83556241, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8521, + "time_per_iteration": 2.4723713397979736 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03599286, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.90297827684807, + "language_loss": 0.68368387, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70504206, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8522, + "time_per_iteration": 2.516411066055298 + }, + { + "auxiliary_loss_clip": 0.01115928, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02506292, + "balance_loss_mlp": 1.04201221, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 2.718510344621862, + "language_loss": 0.6155864, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63715655, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.73828125, + "step": 8523, + "time_per_iteration": 2.524775266647339 + }, + { + "auxiliary_loss_clip": 0.01110788, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.0238173, + "balance_loss_mlp": 1.04113579, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.0609816781673884, + "language_loss": 0.78066456, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80212736, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 8524, + "time_per_iteration": 2.526226043701172 + }, + { + "auxiliary_loss_clip": 0.01109029, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02350545, + "balance_loss_mlp": 1.0413003, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.8496964430325211, + "language_loss": 0.75055063, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77199042, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 8525, + "time_per_iteration": 2.432555913925171 + }, + { + "auxiliary_loss_clip": 0.01112941, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.02385902, + "balance_loss_mlp": 1.04111516, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.559913373859493, + "language_loss": 0.74452645, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76602304, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8526, + "time_per_iteration": 2.6282670497894287 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02799845, + "balance_loss_mlp": 1.04028583, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.490779495017149, + "language_loss": 0.65322489, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67473614, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8527, + "time_per_iteration": 2.467350482940674 + }, + { + "auxiliary_loss_clip": 0.01108518, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02852428, + "balance_loss_mlp": 1.04277444, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.5603597457219889, + "language_loss": 0.74514449, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76662612, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 8528, + "time_per_iteration": 2.513795852661133 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02236843, + "balance_loss_mlp": 1.03608227, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.756255656529514, + "language_loss": 0.83061087, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85200721, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8529, + "time_per_iteration": 2.4574379920959473 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.02045822, + "balance_loss_mlp": 1.03895748, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.6787234743344808, + "language_loss": 0.73559862, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75699604, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8530, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01039899, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.00278807, + "balance_loss_mlp": 1.01703906, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7465649329198393, + "language_loss": 0.60806251, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.6285013, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.22851562, + "step": 8531, + "time_per_iteration": 3.1615967750549316 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.01732779, + "balance_loss_mlp": 1.04014051, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6561974446519532, + "language_loss": 0.76540768, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78680408, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8532, + "time_per_iteration": 2.4836883544921875 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.01894033, + "balance_loss_mlp": 1.03866601, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.847315245703251, + "language_loss": 0.67183244, + "learning_rate": 2.012657420152597e-06, + "loss": 0.6932264, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8533, + "time_per_iteration": 2.6025052070617676 + }, + { + "auxiliary_loss_clip": 0.01112515, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.02333999, + "balance_loss_mlp": 1.04080868, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.8363553974693196, + "language_loss": 0.81724054, + "learning_rate": 2.01226796603315e-06, + "loss": 0.83873212, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8534, + "time_per_iteration": 2.465374231338501 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02167177, + "balance_loss_mlp": 1.0399549, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.5787063577136407, + "language_loss": 0.63588178, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65734923, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 8535, + "time_per_iteration": 2.50287127494812 + }, + { + "auxiliary_loss_clip": 0.01111823, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01434922, + "balance_loss_mlp": 1.04166365, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.5428442042942097, + "language_loss": 0.69746888, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71885574, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8536, + "time_per_iteration": 2.459897041320801 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.04082823, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.3299626101952784, + "language_loss": 0.71215963, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73363328, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8537, + "time_per_iteration": 2.4840991497039795 + }, + { + "auxiliary_loss_clip": 0.01111456, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.02013016, + "balance_loss_mlp": 1.03927016, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 6.302946358508802, + "language_loss": 0.80441952, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82586539, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8538, + "time_per_iteration": 2.4378812313079834 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.01880276, + "balance_loss_mlp": 1.03764546, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.8808034234185624, + "language_loss": 0.78517324, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80656898, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8539, + "time_per_iteration": 2.5144600868225098 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02025044, + "balance_loss_mlp": 1.04009342, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.5130664168284647, + "language_loss": 0.75880563, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78025699, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8540, + "time_per_iteration": 2.55734920501709 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.04176068, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.8219986700547555, + "language_loss": 0.74552548, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76700193, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73046875, + "step": 8541, + "time_per_iteration": 2.432055711746216 + }, + { + "auxiliary_loss_clip": 0.01110326, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02005482, + "balance_loss_mlp": 1.03941679, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.945278300015613, + "language_loss": 0.70215029, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72358692, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8542, + "time_per_iteration": 2.5227723121643066 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.01403403, + "balance_loss_mlp": 1.04146171, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.83289507202946, + "language_loss": 0.78898811, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.8103835, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8543, + "time_per_iteration": 2.4559075832366943 + }, + { + "auxiliary_loss_clip": 0.0111214, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02426672, + "balance_loss_mlp": 1.04161441, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.9171309591761885, + "language_loss": 0.68051696, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70201409, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8544, + "time_per_iteration": 2.5344274044036865 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02671301, + "balance_loss_mlp": 1.04096842, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.2205990317105395, + "language_loss": 0.7225253, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74405491, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8545, + "time_per_iteration": 2.4303176403045654 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02431881, + "balance_loss_mlp": 1.03957486, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.967971348268394, + "language_loss": 0.81898367, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84048629, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8546, + "time_per_iteration": 2.4504597187042236 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02099776, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.6545588723955058, + "language_loss": 0.73301136, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75446492, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8547, + "time_per_iteration": 2.4682819843292236 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_clip": 1.03010488, + "balance_loss_mlp": 1.03783822, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.620202866362127, + "language_loss": 0.73577881, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75729811, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8548, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.01110019, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02117133, + "balance_loss_mlp": 1.03852081, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.506476906057379, + "language_loss": 0.82239324, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84383494, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8549, + "time_per_iteration": 2.433605194091797 + }, + { + "auxiliary_loss_clip": 0.01110043, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.02292621, + "balance_loss_mlp": 1.04096317, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.8131541317091766, + "language_loss": 0.72331119, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.7447629, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8550, + "time_per_iteration": 2.4659972190856934 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02173352, + "balance_loss_mlp": 1.0404501, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.6035097357113468, + "language_loss": 0.75497758, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77646863, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 8551, + "time_per_iteration": 2.453734874725342 + }, + { + "auxiliary_loss_clip": 0.01108366, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.01805425, + "balance_loss_mlp": 1.04017091, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6015349884444547, + "language_loss": 0.69001007, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71140003, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8552, + "time_per_iteration": 3.9047505855560303 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01927972, + "balance_loss_mlp": 1.03868091, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.7916575293353634, + "language_loss": 0.74736363, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76878798, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8553, + "time_per_iteration": 2.5039455890655518 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.0397613, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.707634664835445, + "language_loss": 0.68126231, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70271206, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8554, + "time_per_iteration": 5.488779544830322 + }, + { + "auxiliary_loss_clip": 0.01112685, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.02173042, + "balance_loss_mlp": 1.03879559, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 2.3217393931515846, + "language_loss": 0.73303884, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75452876, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.73828125, + "step": 8555, + "time_per_iteration": 3.866107940673828 + }, + { + "auxiliary_loss_clip": 0.01111396, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.02278817, + "balance_loss_mlp": 1.04023397, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.3509367679077124, + "language_loss": 0.74724478, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76871467, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 8556, + "time_per_iteration": 2.423941135406494 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.03695798, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7510489074761373, + "language_loss": 0.86147487, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88286483, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8557, + "time_per_iteration": 2.4232289791107178 + }, + { + "auxiliary_loss_clip": 0.01105513, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03741109, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.4648111070630687, + "language_loss": 0.89026904, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91165608, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8558, + "time_per_iteration": 2.4937002658843994 + }, + { + "auxiliary_loss_clip": 0.01106843, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.03844643, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.959206520418211, + "language_loss": 0.65027267, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67166239, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8559, + "time_per_iteration": 2.4625425338745117 + }, + { + "auxiliary_loss_clip": 0.01109462, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02255082, + "balance_loss_mlp": 1.04041696, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.5416961138531182, + "language_loss": 0.62973124, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65117842, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8560, + "time_per_iteration": 2.509413719177246 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.01798463, + "balance_loss_mlp": 1.03850913, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.5387222778191898, + "language_loss": 0.69879884, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72017759, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 8561, + "time_per_iteration": 2.4802825450897217 + }, + { + "auxiliary_loss_clip": 0.01108154, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01894569, + "balance_loss_mlp": 1.03752971, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5731273846161422, + "language_loss": 0.66646934, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68785918, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.70703125, + "step": 8562, + "time_per_iteration": 2.505180835723877 + }, + { + "auxiliary_loss_clip": 0.01110444, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.01824713, + "balance_loss_mlp": 1.03924227, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.6680045222139546, + "language_loss": 0.77707577, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79848886, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8563, + "time_per_iteration": 2.4935452938079834 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.01932585, + "balance_loss_mlp": 1.03827047, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.1629374301288284, + "language_loss": 0.82324845, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84471083, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 8564, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01112048, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.0223794, + "balance_loss_mlp": 1.03859615, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.7207643570499924, + "language_loss": 0.73255235, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75402838, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8565, + "time_per_iteration": 2.471970558166504 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.03977931, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.8782058792026062, + "language_loss": 0.683079, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70455092, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 8566, + "time_per_iteration": 2.4981720447540283 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03583431, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 2.0482874573832177, + "language_loss": 0.78111541, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80249971, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 8567, + "time_per_iteration": 2.490272045135498 + }, + { + "auxiliary_loss_clip": 0.01113521, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.02054214, + "balance_loss_mlp": 1.04046249, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.0737995601061274, + "language_loss": 0.790721, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81219578, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8568, + "time_per_iteration": 2.602315902709961 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01885664, + "balance_loss_mlp": 1.03637588, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.0499636702484945, + "language_loss": 0.90935498, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93073106, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8569, + "time_per_iteration": 2.430600643157959 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.03865302, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.6639049645433037, + "language_loss": 0.76229095, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78369409, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8570, + "time_per_iteration": 2.48988676071167 + }, + { + "auxiliary_loss_clip": 0.01108277, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.02357769, + "balance_loss_mlp": 1.03741157, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.5896565556148876, + "language_loss": 0.7375021, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75895989, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8571, + "time_per_iteration": 2.618849754333496 + }, + { + "auxiliary_loss_clip": 0.01035305, + "auxiliary_loss_mlp": 0.00998776, + "balance_loss_clip": 0.99780464, + "balance_loss_mlp": 1.0127461, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7780004501915253, + "language_loss": 0.52940249, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54974329, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.22558594, + "step": 8572, + "time_per_iteration": 3.1418654918670654 + }, + { + "auxiliary_loss_clip": 0.01108043, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.02087331, + "balance_loss_mlp": 1.04004169, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.7275406058075027, + "language_loss": 0.76217729, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78358561, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8573, + "time_per_iteration": 2.4757239818573 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01627111, + "balance_loss_mlp": 1.03679562, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.9279490614808483, + "language_loss": 0.77039665, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79174697, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8574, + "time_per_iteration": 2.478935718536377 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.01714277, + "balance_loss_mlp": 1.03757906, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.6824577114627284, + "language_loss": 0.85421538, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87558043, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 8575, + "time_per_iteration": 2.4811151027679443 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01971185, + "balance_loss_mlp": 1.03703451, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.6692718685381052, + "language_loss": 0.76704675, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78844833, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8576, + "time_per_iteration": 2.490389108657837 + }, + { + "auxiliary_loss_clip": 0.011134, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01836872, + "balance_loss_mlp": 1.03960061, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 3.052053268886893, + "language_loss": 0.75463682, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77608645, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8577, + "time_per_iteration": 2.416757583618164 + }, + { + "auxiliary_loss_clip": 0.0111005, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02441728, + "balance_loss_mlp": 1.0376997, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.834882992604573, + "language_loss": 0.80803275, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.82950842, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8578, + "time_per_iteration": 2.517292022705078 + }, + { + "auxiliary_loss_clip": 0.01104508, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02046442, + "balance_loss_mlp": 1.0357188, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.7011032882300805, + "language_loss": 0.76299787, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78436846, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8579, + "time_per_iteration": 2.4907805919647217 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.01890254, + "balance_loss_mlp": 1.03864014, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.5884760036798964, + "language_loss": 0.79018867, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81159854, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8580, + "time_per_iteration": 2.490298271179199 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.0237354, + "balance_loss_mlp": 1.03874159, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.2420547036898277, + "language_loss": 0.72657341, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74805963, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8581, + "time_per_iteration": 2.419196367263794 + }, + { + "auxiliary_loss_clip": 0.01107618, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01844954, + "balance_loss_mlp": 1.03848028, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.260602789840083, + "language_loss": 0.74468267, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76606196, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8582, + "time_per_iteration": 2.4235429763793945 + }, + { + "auxiliary_loss_clip": 0.01107491, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.02201486, + "balance_loss_mlp": 1.03820109, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 3.661326019284234, + "language_loss": 0.66308093, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68449032, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 8583, + "time_per_iteration": 2.483489990234375 + }, + { + "auxiliary_loss_clip": 0.0111088, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.02315259, + "balance_loss_mlp": 1.04015112, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4772972874821377, + "language_loss": 0.75878769, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78025782, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8584, + "time_per_iteration": 2.469770908355713 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.026057, + "balance_loss_mlp": 1.03763115, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 1.908038470800245, + "language_loss": 0.78773153, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80921382, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 8585, + "time_per_iteration": 2.4765405654907227 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01976502, + "balance_loss_mlp": 1.03624129, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.394419079152278, + "language_loss": 0.81022364, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83157325, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 8586, + "time_per_iteration": 2.45131254196167 + }, + { + "auxiliary_loss_clip": 0.01107797, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.02106369, + "balance_loss_mlp": 1.03754663, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 2.0375667228771572, + "language_loss": 0.71716821, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73858047, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.703125, + "step": 8587, + "time_per_iteration": 2.464603900909424 + }, + { + "auxiliary_loss_clip": 0.0103385, + "auxiliary_loss_mlp": 0.01011507, + "balance_loss_clip": 1.01052976, + "balance_loss_mlp": 1.01128352, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7317367951541988, + "language_loss": 0.57798368, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59843719, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.22558594, + "step": 8588, + "time_per_iteration": 3.0708353519439697 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.02560759, + "balance_loss_mlp": 1.03631115, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.9433685436573729, + "language_loss": 0.7553345, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77678907, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8589, + "time_per_iteration": 2.4392945766448975 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02110088, + "balance_loss_mlp": 1.03822279, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 2.018268520776434, + "language_loss": 0.67597556, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69738752, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8590, + "time_per_iteration": 2.480978012084961 + }, + { + "auxiliary_loss_clip": 0.01034536, + "auxiliary_loss_mlp": 0.01003309, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.01181984, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.7844517010344912, + "language_loss": 0.5593977, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57977605, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.2265625, + "step": 8591, + "time_per_iteration": 3.0380799770355225 + }, + { + "auxiliary_loss_clip": 0.01101472, + "auxiliary_loss_mlp": 0.01023222, + "balance_loss_clip": 1.01192665, + "balance_loss_mlp": 1.03659964, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5513724058155185, + "language_loss": 0.81425416, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83550113, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 8592, + "time_per_iteration": 2.4280107021331787 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01023209, + "balance_loss_clip": 1.01141334, + "balance_loss_mlp": 1.04046106, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.8100942034895195, + "language_loss": 0.83394146, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85524404, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 8593, + "time_per_iteration": 3.9351704120635986 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.02004552, + "balance_loss_mlp": 1.04028952, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.576203753972958, + "language_loss": 0.68724298, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.70866162, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8594, + "time_per_iteration": 2.547206163406372 + }, + { + "auxiliary_loss_clip": 0.01105211, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.03660214, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.9981153431236998, + "language_loss": 0.77706152, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79838419, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8595, + "time_per_iteration": 2.5214362144470215 + }, + { + "auxiliary_loss_clip": 0.01107198, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.02278233, + "balance_loss_mlp": 1.03896379, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.5236872991766963, + "language_loss": 0.64860648, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67003053, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8596, + "time_per_iteration": 5.460975885391235 + }, + { + "auxiliary_loss_clip": 0.01109553, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.01603329, + "balance_loss_mlp": 1.04030609, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.6129264208414336, + "language_loss": 0.75417203, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77556598, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.6953125, + "step": 8597, + "time_per_iteration": 2.477386236190796 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01356828, + "balance_loss_mlp": 1.03728151, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.684107970499364, + "language_loss": 0.80853873, + "learning_rate": 1.987342579847403e-06, + "loss": 0.82987666, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8598, + "time_per_iteration": 2.5056118965148926 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.02550411, + "balance_loss_mlp": 1.03853858, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.5161151475530301, + "language_loss": 0.75315893, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77462423, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8599, + "time_per_iteration": 2.4907233715057373 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03874612, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 5.031269669902368, + "language_loss": 0.72193408, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74333239, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8600, + "time_per_iteration": 2.4958672523498535 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01924503, + "balance_loss_mlp": 1.03902841, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.5543027238719596, + "language_loss": 0.74527812, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76667523, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8601, + "time_per_iteration": 2.4545562267303467 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.02540207, + "balance_loss_mlp": 1.03855383, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 1.930843678841908, + "language_loss": 0.83770829, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85918051, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 8602, + "time_per_iteration": 2.478315591812134 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.01727891, + "balance_loss_mlp": 1.03919971, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.739467426965746, + "language_loss": 0.74487793, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76627421, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8603, + "time_per_iteration": 2.541987180709839 + }, + { + "auxiliary_loss_clip": 0.01110457, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02172458, + "balance_loss_mlp": 1.04043818, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.0493295845447435, + "language_loss": 0.72732627, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74876976, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8604, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01113997, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03878832, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.890584135418456, + "language_loss": 0.85098851, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87245226, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8605, + "time_per_iteration": 2.469414472579956 + }, + { + "auxiliary_loss_clip": 0.01107307, + "auxiliary_loss_mlp": 0.01024655, + "balance_loss_clip": 1.01271009, + "balance_loss_mlp": 1.03827572, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4962077074735805, + "language_loss": 0.64887142, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67019105, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8606, + "time_per_iteration": 2.5391039848327637 + }, + { + "auxiliary_loss_clip": 0.01108829, + "auxiliary_loss_mlp": 0.01027754, + "balance_loss_clip": 1.0153147, + "balance_loss_mlp": 1.04041243, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.6359731326945595, + "language_loss": 0.77811146, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79947728, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8607, + "time_per_iteration": 2.4382975101470947 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02053022, + "balance_loss_mlp": 1.0399344, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 3.5447610791610638, + "language_loss": 0.72232366, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74377209, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8608, + "time_per_iteration": 2.511740207672119 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.01927149, + "balance_loss_mlp": 1.04073501, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8799970026389359, + "language_loss": 0.86513162, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88661158, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 8609, + "time_per_iteration": 2.453684091567993 + }, + { + "auxiliary_loss_clip": 0.01108892, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888371, + "balance_loss_mlp": 1.03858495, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.565375500859336, + "language_loss": 0.73396695, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75536072, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 8610, + "time_per_iteration": 2.5529308319091797 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01813269, + "balance_loss_mlp": 1.04202247, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.8297114771569651, + "language_loss": 0.67358816, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69506592, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 8611, + "time_per_iteration": 2.4198501110076904 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.01808488, + "balance_loss_mlp": 1.0382731, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.316941620789411, + "language_loss": 0.77502143, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79641283, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 8612, + "time_per_iteration": 2.4943206310272217 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.02186632, + "balance_loss_mlp": 1.03938198, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.9039649692993772, + "language_loss": 0.8192755, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84072244, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 8613, + "time_per_iteration": 2.434479236602783 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02059531, + "balance_loss_mlp": 1.04346251, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.1674567731422987, + "language_loss": 0.66747862, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68896699, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8614, + "time_per_iteration": 2.4598941802978516 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.01736188, + "balance_loss_mlp": 1.04048586, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.9388641649707037, + "language_loss": 0.86660814, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88803345, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8615, + "time_per_iteration": 2.434614419937134 + }, + { + "auxiliary_loss_clip": 0.01110692, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02877903, + "balance_loss_mlp": 1.04087663, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.572223272426788, + "language_loss": 0.80601507, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82753074, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8616, + "time_per_iteration": 2.489898920059204 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.02928019, + "balance_loss_mlp": 1.04558134, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.6322050900799092, + "language_loss": 0.7524333, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77405852, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8617, + "time_per_iteration": 2.4741597175598145 + }, + { + "auxiliary_loss_clip": 0.0111036, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.01977718, + "balance_loss_mlp": 1.03946304, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.8314484463575909, + "language_loss": 0.70137858, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72280991, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8618, + "time_per_iteration": 2.4596426486968994 + }, + { + "auxiliary_loss_clip": 0.01036764, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01408625, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9556911586994957, + "language_loss": 0.67222798, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69260818, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8619, + "time_per_iteration": 3.0123016834259033 + }, + { + "auxiliary_loss_clip": 0.01107081, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.02055597, + "balance_loss_mlp": 1.03924203, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.197431442121674, + "language_loss": 0.79314506, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81454414, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 8620, + "time_per_iteration": 2.445173740386963 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.02625203, + "balance_loss_mlp": 1.03989077, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.0514402600561765, + "language_loss": 0.81893396, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84040135, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8621, + "time_per_iteration": 2.4382779598236084 + }, + { + "auxiliary_loss_clip": 0.01109273, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.02073121, + "balance_loss_mlp": 1.0391438, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.9740999547408657, + "language_loss": 0.65540636, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67682284, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 8622, + "time_per_iteration": 2.494173288345337 + }, + { + "auxiliary_loss_clip": 0.01114132, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.02528644, + "balance_loss_mlp": 1.04077148, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.975231537474399, + "language_loss": 0.60350323, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62503201, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8623, + "time_per_iteration": 2.427819013595581 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.01995301, + "balance_loss_mlp": 1.03832614, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7021073046505133, + "language_loss": 0.76074666, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78215921, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8624, + "time_per_iteration": 2.4636356830596924 + }, + { + "auxiliary_loss_clip": 0.01109665, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.02441311, + "balance_loss_mlp": 1.03890038, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.7326139645058456, + "language_loss": 0.71175325, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73321491, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8625, + "time_per_iteration": 2.4977569580078125 + }, + { + "auxiliary_loss_clip": 0.01110816, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.02223408, + "balance_loss_mlp": 1.03980732, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.8950159086376122, + "language_loss": 0.67929721, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70074677, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 8626, + "time_per_iteration": 2.4934957027435303 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01889873, + "balance_loss_mlp": 1.03984976, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.322377605069906, + "language_loss": 0.70487207, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72627008, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8627, + "time_per_iteration": 2.445827007293701 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02539158, + "balance_loss_mlp": 1.04147446, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 1.9255563847501656, + "language_loss": 0.73209083, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75361323, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 8628, + "time_per_iteration": 2.500955581665039 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.02039731, + "balance_loss_mlp": 1.04147768, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 3.3927220028721994, + "language_loss": 0.77245331, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79388249, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8629, + "time_per_iteration": 2.4560301303863525 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.0179081, + "balance_loss_mlp": 1.04206562, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.1928775386787187, + "language_loss": 0.74820137, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76964092, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8630, + "time_per_iteration": 2.496370792388916 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02210796, + "balance_loss_mlp": 1.03882229, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.6137116253106134, + "language_loss": 0.80663669, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82809031, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8631, + "time_per_iteration": 2.4534530639648438 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.01893413, + "balance_loss_mlp": 1.04085588, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5022963557810187, + "language_loss": 0.74575752, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76720965, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8632, + "time_per_iteration": 2.5295352935791016 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01810372, + "balance_loss_mlp": 1.03738809, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.784064079335437, + "language_loss": 0.78812337, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80948019, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8633, + "time_per_iteration": 2.4241905212402344 + }, + { + "auxiliary_loss_clip": 0.01109914, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.01873302, + "balance_loss_mlp": 1.03893745, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.7026702061892323, + "language_loss": 0.80149853, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82290852, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8634, + "time_per_iteration": 2.4851884841918945 + }, + { + "auxiliary_loss_clip": 0.01108415, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.02305627, + "balance_loss_mlp": 1.04024315, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.4600796720036056, + "language_loss": 0.68628252, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70771807, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8635, + "time_per_iteration": 3.921346426010132 + }, + { + "auxiliary_loss_clip": 0.01113121, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.02083683, + "balance_loss_mlp": 1.04083443, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.6781612563386181, + "language_loss": 0.7704699, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79193652, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8636, + "time_per_iteration": 2.45908260345459 + }, + { + "auxiliary_loss_clip": 0.01112314, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.01861811, + "balance_loss_mlp": 1.04090476, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 1.9891179602637588, + "language_loss": 0.71459377, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73602873, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8637, + "time_per_iteration": 5.353722810745239 + }, + { + "auxiliary_loss_clip": 0.01108688, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.01964426, + "balance_loss_mlp": 1.0394423, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 3.7284266214304576, + "language_loss": 0.75943041, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78084332, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8638, + "time_per_iteration": 3.902477741241455 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.03863966, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 2.006346025426826, + "language_loss": 0.74846971, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76985711, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8639, + "time_per_iteration": 2.453634738922119 + }, + { + "auxiliary_loss_clip": 0.01109964, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01604497, + "balance_loss_mlp": 1.04051375, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6163455561126134, + "language_loss": 0.77538067, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79676771, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8640, + "time_per_iteration": 2.482334613800049 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01883161, + "balance_loss_mlp": 1.04175985, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.623082815057782, + "language_loss": 0.65734208, + "learning_rate": 1.97059670234927e-06, + "loss": 0.67874962, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 8641, + "time_per_iteration": 2.4567995071411133 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.02142978, + "balance_loss_mlp": 1.04105425, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.8491224599980307, + "language_loss": 0.76197445, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78340614, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8642, + "time_per_iteration": 2.5128276348114014 + }, + { + "auxiliary_loss_clip": 0.01109094, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02001774, + "balance_loss_mlp": 1.04037452, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.4733024685255247, + "language_loss": 0.83179498, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85320538, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8643, + "time_per_iteration": 2.5094587802886963 + }, + { + "auxiliary_loss_clip": 0.0111188, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.02856052, + "balance_loss_mlp": 1.03983521, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.5341454697133152, + "language_loss": 0.70307451, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72461337, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8644, + "time_per_iteration": 2.5111963748931885 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.01967788, + "balance_loss_mlp": 1.03966331, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.8635414079348847, + "language_loss": 0.80144334, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82286364, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 8645, + "time_per_iteration": 2.529616117477417 + }, + { + "auxiliary_loss_clip": 0.01109035, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01647151, + "balance_loss_mlp": 1.03836131, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.899493861617854, + "language_loss": 0.78147799, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80286086, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8646, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01112803, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02101934, + "balance_loss_mlp": 1.04184628, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.8109153766187511, + "language_loss": 0.66239858, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68386012, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8647, + "time_per_iteration": 2.4503657817840576 + }, + { + "auxiliary_loss_clip": 0.01113411, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01834917, + "balance_loss_mlp": 1.04010677, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 4.112424605735972, + "language_loss": 0.71817285, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73963439, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 8648, + "time_per_iteration": 2.49595308303833 + }, + { + "auxiliary_loss_clip": 0.01112873, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.01714706, + "balance_loss_mlp": 1.0411458, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.6438613988660609, + "language_loss": 0.64412069, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66555232, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8649, + "time_per_iteration": 2.4781436920166016 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.02025771, + "balance_loss_mlp": 1.04224229, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.8268985026448872, + "language_loss": 0.70691884, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72843516, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8650, + "time_per_iteration": 2.4350762367248535 + }, + { + "auxiliary_loss_clip": 0.01108729, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.01559973, + "balance_loss_mlp": 1.03854239, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.6557672224542628, + "language_loss": 0.7709741, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79234493, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8651, + "time_per_iteration": 2.4439852237701416 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.02171111, + "balance_loss_mlp": 1.04384518, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.7772284952150523, + "language_loss": 0.78304142, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80455399, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8652, + "time_per_iteration": 2.4581267833709717 + }, + { + "auxiliary_loss_clip": 0.01114617, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01683807, + "balance_loss_mlp": 1.04281044, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.0186078989624017, + "language_loss": 0.7027083, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72416592, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8653, + "time_per_iteration": 2.4945242404937744 + }, + { + "auxiliary_loss_clip": 0.01114383, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02703571, + "balance_loss_mlp": 1.04092932, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.6276924489714153, + "language_loss": 0.78420818, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80575949, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8654, + "time_per_iteration": 2.4857122898101807 + }, + { + "auxiliary_loss_clip": 0.01117815, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.02450645, + "balance_loss_mlp": 1.04275405, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 2.316843494652732, + "language_loss": 0.8424964, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86405897, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 8655, + "time_per_iteration": 2.48307728767395 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01860702, + "balance_loss_mlp": 1.04225183, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 3.712191764961765, + "language_loss": 0.65503991, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67645752, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8656, + "time_per_iteration": 2.442760705947876 + }, + { + "auxiliary_loss_clip": 0.01114044, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.04263127, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 2.4919467158509385, + "language_loss": 0.73240453, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.753842, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 8657, + "time_per_iteration": 2.5198535919189453 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.02064037, + "balance_loss_mlp": 1.042382, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.757060291742625, + "language_loss": 0.71675289, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73821175, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6953125, + "step": 8658, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.0111093, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.01868176, + "balance_loss_mlp": 1.0400281, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.6795003925123537, + "language_loss": 0.83473611, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85616386, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8659, + "time_per_iteration": 2.462956428527832 + }, + { + "auxiliary_loss_clip": 0.01119845, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04351366, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9135176980647008, + "language_loss": 0.75763941, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77923, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 8660, + "time_per_iteration": 2.4544646739959717 + }, + { + "auxiliary_loss_clip": 0.01111893, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.02199721, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.7715737398241405, + "language_loss": 0.78001404, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80147564, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8661, + "time_per_iteration": 2.4456324577331543 + }, + { + "auxiliary_loss_clip": 0.01113873, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01818848, + "balance_loss_mlp": 1.0404228, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.805356331270093, + "language_loss": 0.70643514, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72788274, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8662, + "time_per_iteration": 2.5272181034088135 + }, + { + "auxiliary_loss_clip": 0.01110335, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.01835203, + "balance_loss_mlp": 1.04033709, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.669754729528693, + "language_loss": 0.6935755, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71500456, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.69921875, + "step": 8663, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.01113011, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.04173064, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 2.618720199838109, + "language_loss": 0.76771712, + "learning_rate": 1.961640376626072e-06, + "loss": 0.7891587, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8664, + "time_per_iteration": 2.519645929336548 + }, + { + "auxiliary_loss_clip": 0.01111987, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.02387905, + "balance_loss_mlp": 1.04057467, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 1.987870026093088, + "language_loss": 0.76193488, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78342199, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8665, + "time_per_iteration": 2.4501259326934814 + }, + { + "auxiliary_loss_clip": 0.01111359, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02113414, + "balance_loss_mlp": 1.04135728, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.609030555811117, + "language_loss": 0.71689177, + "learning_rate": 1.960861599474586e-06, + "loss": 0.73833793, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8666, + "time_per_iteration": 2.4961183071136475 + }, + { + "auxiliary_loss_clip": 0.01119663, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.04257357, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.081998488723945, + "language_loss": 0.68599117, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.7075423, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 8667, + "time_per_iteration": 2.4216842651367188 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02247858, + "balance_loss_mlp": 1.03913903, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.3811752682570164, + "language_loss": 0.81006289, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83148932, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8668, + "time_per_iteration": 2.5712640285491943 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.0184648, + "balance_loss_mlp": 1.0413909, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.7130530435254507, + "language_loss": 0.63821161, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65964901, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8669, + "time_per_iteration": 2.485560894012451 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.02779722, + "balance_loss_mlp": 1.0434041, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5472632399176471, + "language_loss": 0.66420943, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68575811, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8670, + "time_per_iteration": 2.5161590576171875 + }, + { + "auxiliary_loss_clip": 0.01107902, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02204931, + "balance_loss_mlp": 1.04005504, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.0274420083477436, + "language_loss": 0.7666502, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78807229, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 8671, + "time_per_iteration": 2.4505884647369385 + }, + { + "auxiliary_loss_clip": 0.01117202, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.022246, + "balance_loss_mlp": 1.0442729, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 6.168212064153821, + "language_loss": 0.78184325, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80337209, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8672, + "time_per_iteration": 2.501171350479126 + }, + { + "auxiliary_loss_clip": 0.0110814, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.01958418, + "balance_loss_mlp": 1.03945541, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8428028532242804, + "language_loss": 0.72013724, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74153554, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8673, + "time_per_iteration": 2.4188430309295654 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01467764, + "balance_loss_mlp": 1.04007983, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.5425888836045836, + "language_loss": 0.75258517, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77397001, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8674, + "time_per_iteration": 2.4615721702575684 + }, + { + "auxiliary_loss_clip": 0.01112251, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02111149, + "balance_loss_mlp": 1.03926849, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.4005630002003198, + "language_loss": 0.86177206, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88324457, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8675, + "time_per_iteration": 2.4192757606506348 + }, + { + "auxiliary_loss_clip": 0.01036097, + "auxiliary_loss_mlp": 0.00999914, + "balance_loss_clip": 0.99874002, + "balance_loss_mlp": 1.01361609, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8810836824461878, + "language_loss": 0.6315189, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65187901, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22460938, + "step": 8676, + "time_per_iteration": 4.428101062774658 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.0180341, + "balance_loss_mlp": 1.04064405, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.671918865817182, + "language_loss": 0.68830431, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70970994, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8677, + "time_per_iteration": 2.54658579826355 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.01739907, + "balance_loss_mlp": 1.03994, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.5408434392952677, + "language_loss": 0.65516353, + "learning_rate": 1.956189065367086e-06, + "loss": 0.6765672, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8678, + "time_per_iteration": 2.4848899841308594 + }, + { + "auxiliary_loss_clip": 0.01115921, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02607715, + "balance_loss_mlp": 1.04188991, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.860112109233836, + "language_loss": 0.69020754, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71176565, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8679, + "time_per_iteration": 5.267160654067993 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02467108, + "balance_loss_mlp": 1.04272938, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.7057222009225053, + "language_loss": 0.66956079, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69107741, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8680, + "time_per_iteration": 3.938239574432373 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02248955, + "balance_loss_mlp": 1.04123902, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.8837479968625288, + "language_loss": 0.83069575, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85217923, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8681, + "time_per_iteration": 2.475834369659424 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.03964293, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.7236617199536146, + "language_loss": 0.77448237, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79592931, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8682, + "time_per_iteration": 2.484111785888672 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.03124917, + "balance_loss_mlp": 1.041852, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.4820765209382558, + "language_loss": 0.68982363, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71137834, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8683, + "time_per_iteration": 2.579467535018921 + }, + { + "auxiliary_loss_clip": 0.01112188, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.02706265, + "balance_loss_mlp": 1.04016137, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.598693343235541, + "language_loss": 0.7622329, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78375584, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8684, + "time_per_iteration": 2.4642298221588135 + }, + { + "auxiliary_loss_clip": 0.01107617, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.01886606, + "balance_loss_mlp": 1.03845632, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.6077803987399797, + "language_loss": 0.75887376, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.7802639, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8685, + "time_per_iteration": 2.4533908367156982 + }, + { + "auxiliary_loss_clip": 0.01113803, + "auxiliary_loss_mlp": 0.01038269, + "balance_loss_clip": 1.02562094, + "balance_loss_mlp": 1.0427258, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.88354393014551, + "language_loss": 0.80851054, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83003128, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8686, + "time_per_iteration": 2.430154323577881 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02474344, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.664143868034185, + "language_loss": 0.70208037, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72351515, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8687, + "time_per_iteration": 2.510512590408325 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02171767, + "balance_loss_mlp": 1.03840709, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0206883326938407, + "language_loss": 0.82963884, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85104954, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 8688, + "time_per_iteration": 2.4092836380004883 + }, + { + "auxiliary_loss_clip": 0.0110979, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.02107966, + "balance_loss_mlp": 1.04007506, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.711188417076446, + "language_loss": 0.73736638, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75880128, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8689, + "time_per_iteration": 2.4741477966308594 + }, + { + "auxiliary_loss_clip": 0.01109408, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.02147067, + "balance_loss_mlp": 1.04056704, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8604688899774438, + "language_loss": 0.82882619, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85025889, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8690, + "time_per_iteration": 2.4194648265838623 + }, + { + "auxiliary_loss_clip": 0.01110389, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02476192, + "balance_loss_mlp": 1.03937626, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.3332187959772246, + "language_loss": 0.79397631, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81546217, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8691, + "time_per_iteration": 2.52500319480896 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.02794003, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 1.8556717943569576, + "language_loss": 0.7679857, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78953838, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8692, + "time_per_iteration": 2.4420764446258545 + }, + { + "auxiliary_loss_clip": 0.0110865, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.02139628, + "balance_loss_mlp": 1.04145277, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.6990103355094375, + "language_loss": 0.72441196, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74582422, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 8693, + "time_per_iteration": 2.551316261291504 + }, + { + "auxiliary_loss_clip": 0.01114591, + "auxiliary_loss_mlp": 0.01042549, + "balance_loss_clip": 1.02841115, + "balance_loss_mlp": 1.04073966, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.780524663497215, + "language_loss": 0.81990045, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84147185, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 8694, + "time_per_iteration": 2.4666013717651367 + }, + { + "auxiliary_loss_clip": 0.01036217, + "auxiliary_loss_mlp": 0.01006918, + "balance_loss_clip": 1.00584531, + "balance_loss_mlp": 1.01379716, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.771665075265138, + "language_loss": 0.55743444, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57786584, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8695, + "time_per_iteration": 3.116420269012451 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02121711, + "balance_loss_mlp": 1.04176521, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.693403101851131, + "language_loss": 0.7333045, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75476253, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8696, + "time_per_iteration": 2.437974452972412 + }, + { + "auxiliary_loss_clip": 0.01112043, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.01840782, + "balance_loss_mlp": 1.04123831, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.6647399718358808, + "language_loss": 0.7097398, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73116946, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8697, + "time_per_iteration": 2.5316948890686035 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.0171392, + "balance_loss_mlp": 1.04016519, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.6518576838111187, + "language_loss": 0.80392116, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82528424, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8698, + "time_per_iteration": 2.4515864849090576 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.0218327, + "balance_loss_mlp": 1.04055512, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.702568194733703, + "language_loss": 0.74550211, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76695091, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8699, + "time_per_iteration": 2.508180856704712 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.01700819, + "balance_loss_mlp": 1.04079318, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.4994824070372519, + "language_loss": 0.73465139, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75609958, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 8700, + "time_per_iteration": 2.455620765686035 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.02276719, + "balance_loss_mlp": 1.0418222, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.82733314477648, + "language_loss": 0.66863132, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69013548, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8701, + "time_per_iteration": 2.5278706550598145 + }, + { + "auxiliary_loss_clip": 0.01107483, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03844106, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.0326391886622686, + "language_loss": 0.66616488, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68758386, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8702, + "time_per_iteration": 2.474238872528076 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.02040434, + "balance_loss_mlp": 1.04128182, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.9248840397651374, + "language_loss": 0.7671175, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78856003, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8703, + "time_per_iteration": 2.466836929321289 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.02648616, + "balance_loss_mlp": 1.04065156, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.7352924521395576, + "language_loss": 0.76380461, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78537536, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75390625, + "step": 8704, + "time_per_iteration": 2.566021680831909 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.0272727, + "balance_loss_mlp": 1.04157901, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.6527680542100833, + "language_loss": 0.7804389, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80193096, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8705, + "time_per_iteration": 2.4414021968841553 + }, + { + "auxiliary_loss_clip": 0.01113477, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.01545918, + "balance_loss_mlp": 1.04121351, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 1.9173845394592544, + "language_loss": 0.69808084, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.7195006, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8706, + "time_per_iteration": 2.4252305030822754 + }, + { + "auxiliary_loss_clip": 0.01033927, + "auxiliary_loss_mlp": 0.00999849, + "balance_loss_clip": 0.99876386, + "balance_loss_mlp": 1.01179016, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6804801593959132, + "language_loss": 0.52532774, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.5456655, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.22167969, + "step": 8707, + "time_per_iteration": 3.142758369445801 + }, + { + "auxiliary_loss_clip": 0.01109991, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02262449, + "balance_loss_mlp": 1.03904724, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.7383881327323734, + "language_loss": 0.74716955, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76862097, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8708, + "time_per_iteration": 2.4591562747955322 + }, + { + "auxiliary_loss_clip": 0.01109127, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.01402545, + "balance_loss_mlp": 1.04014444, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.691977522935515, + "language_loss": 0.77432841, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79568058, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8709, + "time_per_iteration": 2.480982780456543 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.01927257, + "balance_loss_mlp": 1.03814077, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.9878514646446084, + "language_loss": 0.8357569, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85724527, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 8710, + "time_per_iteration": 2.4901626110076904 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.01440704, + "balance_loss_mlp": 1.03936791, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.6699101384293633, + "language_loss": 0.69427162, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71561891, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8711, + "time_per_iteration": 2.476573944091797 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01879799, + "balance_loss_mlp": 1.03732038, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.8448951706521464, + "language_loss": 0.83195686, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85335994, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8712, + "time_per_iteration": 2.4485836029052734 + }, + { + "auxiliary_loss_clip": 0.01111097, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.02286506, + "balance_loss_mlp": 1.03859973, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.7709353735200277, + "language_loss": 0.69517416, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71665198, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8713, + "time_per_iteration": 2.496649980545044 + }, + { + "auxiliary_loss_clip": 0.01112233, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.01934421, + "balance_loss_mlp": 1.03752589, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.61615049353435, + "language_loss": 0.76978022, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79123831, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 8714, + "time_per_iteration": 2.42134428024292 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.01703143, + "balance_loss_mlp": 1.04200637, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.883747352805191, + "language_loss": 0.75953126, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78097725, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8715, + "time_per_iteration": 2.453313112258911 + }, + { + "auxiliary_loss_clip": 0.01106451, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01807356, + "balance_loss_mlp": 1.0377413, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.4951701207047352, + "language_loss": 0.7078892, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.72926366, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8716, + "time_per_iteration": 2.536285638809204 + }, + { + "auxiliary_loss_clip": 0.01107976, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.02394176, + "balance_loss_mlp": 1.03838778, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.055978260271784, + "language_loss": 0.86706465, + "learning_rate": 1.941005113841926e-06, + "loss": 0.88849956, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 8717, + "time_per_iteration": 2.5015134811401367 + }, + { + "auxiliary_loss_clip": 0.01108796, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.01737654, + "balance_loss_mlp": 1.03882921, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.8178940063432978, + "language_loss": 0.60516441, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.6265465, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 8718, + "time_per_iteration": 4.028836488723755 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.01830447, + "balance_loss_mlp": 1.04012215, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.7437517815053911, + "language_loss": 0.71897364, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74041677, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8719, + "time_per_iteration": 2.455796003341675 + }, + { + "auxiliary_loss_clip": 0.01106409, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0163976, + "balance_loss_mlp": 1.03797865, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.705660803101178, + "language_loss": 0.72716737, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.74851096, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 8720, + "time_per_iteration": 2.445131301879883 + }, + { + "auxiliary_loss_clip": 0.01110289, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.01948094, + "balance_loss_mlp": 1.04000795, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6022030744217663, + "language_loss": 0.70251679, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72394347, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8721, + "time_per_iteration": 5.4637322425842285 + }, + { + "auxiliary_loss_clip": 0.01106478, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.01883805, + "balance_loss_mlp": 1.03700781, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.710812698690052, + "language_loss": 0.86623824, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88761353, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8722, + "time_per_iteration": 2.4582130908966064 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.01850319, + "balance_loss_mlp": 1.03929901, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.6752601944842513, + "language_loss": 0.79654807, + "learning_rate": 1.938669401384247e-06, + "loss": 0.8179481, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.68359375, + "step": 8723, + "time_per_iteration": 2.4436798095703125 + }, + { + "auxiliary_loss_clip": 0.0111223, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02165055, + "balance_loss_mlp": 1.04074168, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.2643940307400054, + "language_loss": 0.74980783, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77128434, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8724, + "time_per_iteration": 2.4523351192474365 + }, + { + "auxiliary_loss_clip": 0.01114812, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03920281, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.7907307804166401, + "language_loss": 0.70031178, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72179961, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8725, + "time_per_iteration": 2.548102617263794 + }, + { + "auxiliary_loss_clip": 0.01033499, + "auxiliary_loss_mlp": 0.00998708, + "balance_loss_clip": 0.99755734, + "balance_loss_mlp": 1.01092362, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7538969042021075, + "language_loss": 0.55637997, + "learning_rate": 1.937501576352568e-06, + "loss": 0.576702, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2265625, + "step": 8726, + "time_per_iteration": 3.055438995361328 + }, + { + "auxiliary_loss_clip": 0.01033831, + "auxiliary_loss_mlp": 0.00998072, + "balance_loss_clip": 0.99698144, + "balance_loss_mlp": 1.01147294, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.8042859023243575, + "language_loss": 0.58400142, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60432053, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.22460938, + "step": 8727, + "time_per_iteration": 3.071913719177246 + }, + { + "auxiliary_loss_clip": 0.0111222, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.01701272, + "balance_loss_mlp": 1.03976107, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3114988788354258, + "language_loss": 0.70559728, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72702408, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8728, + "time_per_iteration": -0.15050816535949707 + }, + { + "auxiliary_loss_clip": 0.01108011, + "auxiliary_loss_mlp": 0.01026221, + "balance_loss_clip": 1.01421666, + "balance_loss_mlp": 1.03783965, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.5256282262341387, + "language_loss": 0.6966821, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71802437, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8729, + "time_per_iteration": 2.470921039581299 + }, + { + "auxiliary_loss_clip": 0.0111289, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.04002178, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.7430499295764175, + "language_loss": 0.83498538, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85642672, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8730, + "time_per_iteration": 2.447209358215332 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.02034974, + "balance_loss_mlp": 1.03944659, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.372255604306618, + "language_loss": 0.79440451, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81583822, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8731, + "time_per_iteration": 2.4764487743377686 + }, + { + "auxiliary_loss_clip": 0.01104468, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02373886, + "balance_loss_mlp": 1.03691411, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.577877427677953, + "language_loss": 0.83057785, + "learning_rate": 1.935165990676312e-06, + "loss": 0.8519851, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8732, + "time_per_iteration": 2.4856929779052734 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.02020669, + "balance_loss_mlp": 1.03737712, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.6308728168221684, + "language_loss": 0.77874607, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80013925, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8733, + "time_per_iteration": 2.440887212753296 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.02521539, + "balance_loss_mlp": 1.04069221, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.8154235824744323, + "language_loss": 0.81740808, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83892411, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 8734, + "time_per_iteration": 2.4394965171813965 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01828647, + "balance_loss_mlp": 1.03909111, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3786944232239873, + "language_loss": 0.76792759, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78930354, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8735, + "time_per_iteration": 2.5392351150512695 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.03907919, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.5767625018953106, + "language_loss": 0.80153042, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.8229425, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 8736, + "time_per_iteration": 2.470860242843628 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.01979208, + "balance_loss_mlp": 1.04068267, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.2098484474485716, + "language_loss": 0.69838667, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71982265, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8737, + "time_per_iteration": 2.5947840213775635 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.01885569, + "balance_loss_mlp": 1.0369395, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4975240773091183, + "language_loss": 0.77464664, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79602897, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8738, + "time_per_iteration": 2.4910526275634766 + }, + { + "auxiliary_loss_clip": 0.01034294, + "auxiliary_loss_mlp": 0.01014673, + "balance_loss_clip": 1.01349294, + "balance_loss_mlp": 1.01161027, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7501251002484244, + "language_loss": 0.54472572, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56521541, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8739, + "time_per_iteration": 3.0936102867126465 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.02058792, + "balance_loss_mlp": 1.03920436, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 4.076584700627864, + "language_loss": 0.847902, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86931044, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8740, + "time_per_iteration": 2.5510640144348145 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.02204442, + "balance_loss_mlp": 1.0391773, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.9479054855450806, + "language_loss": 0.69464219, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71606612, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8741, + "time_per_iteration": 2.4474291801452637 + }, + { + "auxiliary_loss_clip": 0.01112521, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01657915, + "balance_loss_mlp": 1.04100168, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.7696604002482594, + "language_loss": 0.6591152, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68053448, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 8742, + "time_per_iteration": 2.4151360988616943 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.02191377, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.337521906395912, + "language_loss": 0.63094312, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65242094, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 8743, + "time_per_iteration": 2.4369444847106934 + }, + { + "auxiliary_loss_clip": 0.01033192, + "auxiliary_loss_mlp": 0.01006558, + "balance_loss_clip": 1.00549126, + "balance_loss_mlp": 1.01085198, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7737212884291378, + "language_loss": 0.54199207, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56238955, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.22363281, + "step": 8744, + "time_per_iteration": 3.1759095191955566 + }, + { + "auxiliary_loss_clip": 0.01114357, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.01773655, + "balance_loss_mlp": 1.04095125, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.20739797588364, + "language_loss": 0.75574982, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77720833, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8745, + "time_per_iteration": 2.447798728942871 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02291942, + "balance_loss_mlp": 1.03964972, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.9635902719056224, + "language_loss": 0.80408484, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.82552993, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8746, + "time_per_iteration": 2.4415667057037354 + }, + { + "auxiliary_loss_clip": 0.01107231, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.02002132, + "balance_loss_mlp": 1.03842771, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.8094795225841998, + "language_loss": 0.75289273, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77429175, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8747, + "time_per_iteration": 2.4909451007843018 + }, + { + "auxiliary_loss_clip": 0.01103122, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03701103, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.3964471896172554, + "language_loss": 0.82515085, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84647602, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 8748, + "time_per_iteration": 2.4266607761383057 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.01948202, + "balance_loss_mlp": 1.03713202, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9711847853488498, + "language_loss": 0.80562335, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82703364, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8749, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01108885, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.02076626, + "balance_loss_mlp": 1.04021406, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.712765899743528, + "language_loss": 0.72119522, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74262118, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8750, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.01105706, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.0166955, + "balance_loss_mlp": 1.03688407, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3484208983844765, + "language_loss": 0.76440692, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78575456, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8751, + "time_per_iteration": 2.49141788482666 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.01973987, + "balance_loss_mlp": 1.03969383, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.3930828226372818, + "language_loss": 0.75950229, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78088653, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 8752, + "time_per_iteration": 2.4891488552093506 + }, + { + "auxiliary_loss_clip": 0.01110452, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.01418078, + "balance_loss_mlp": 1.03927755, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.4497375157025647, + "language_loss": 0.6776315, + "learning_rate": 1.926992158720058e-06, + "loss": 0.69901145, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8753, + "time_per_iteration": 2.5364086627960205 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.02072024, + "balance_loss_mlp": 1.04052699, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4822261150811287, + "language_loss": 0.83834231, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.85975981, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 8754, + "time_per_iteration": 2.4782354831695557 + }, + { + "auxiliary_loss_clip": 0.01108303, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821733, + "balance_loss_mlp": 1.03804278, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.116384687985529, + "language_loss": 0.8708753, + "learning_rate": 1.926213760058522e-06, + "loss": 0.8922683, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8755, + "time_per_iteration": 2.426422357559204 + }, + { + "auxiliary_loss_clip": 0.01031717, + "auxiliary_loss_mlp": 0.01000414, + "balance_loss_clip": 0.99934119, + "balance_loss_mlp": 1.0092082, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7185760813251492, + "language_loss": 0.58853483, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60885608, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8756, + "time_per_iteration": 3.1429710388183594 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02052546, + "balance_loss_mlp": 1.03787899, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 4.297833550953773, + "language_loss": 0.70166421, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72309285, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8757, + "time_per_iteration": 2.4352152347564697 + }, + { + "auxiliary_loss_clip": 0.01108207, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01828289, + "balance_loss_mlp": 1.03741014, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.637312529409449, + "language_loss": 0.8773526, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89874113, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8758, + "time_per_iteration": 2.4447832107543945 + }, + { + "auxiliary_loss_clip": 0.0110992, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02048147, + "balance_loss_mlp": 1.03790975, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.3883962898678874, + "language_loss": 0.76014191, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78157705, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8759, + "time_per_iteration": 2.4818501472473145 + }, + { + "auxiliary_loss_clip": 0.01102801, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.01460838, + "balance_loss_mlp": 1.0357269, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.9978294175433113, + "language_loss": 0.71896535, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.74025965, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8760, + "time_per_iteration": 3.8544509410858154 + }, + { + "auxiliary_loss_clip": 0.01113013, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.02161074, + "balance_loss_mlp": 1.03947306, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 1.9164441807727424, + "language_loss": 0.76221085, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78368914, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8761, + "time_per_iteration": 2.43031907081604 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.01879597, + "balance_loss_mlp": 1.03958154, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.712095639698782, + "language_loss": 0.70643085, + "learning_rate": 1.923489453654373e-06, + "loss": 0.7278201, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8762, + "time_per_iteration": 5.321688652038574 + }, + { + "auxiliary_loss_clip": 0.01031212, + "auxiliary_loss_mlp": 0.0100382, + "balance_loss_clip": 1.00266957, + "balance_loss_mlp": 1.00896931, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9468786857883086, + "language_loss": 0.65414345, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67449379, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22265625, + "step": 8763, + "time_per_iteration": 4.360533237457275 + }, + { + "auxiliary_loss_clip": 0.0110798, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03798556, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.6073395480000416, + "language_loss": 0.70771408, + "learning_rate": 1.922711106286265e-06, + "loss": 0.72911114, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 8764, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.0110759, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.01640153, + "balance_loss_mlp": 1.03704798, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.6766716538329436, + "language_loss": 0.74135405, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76272404, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8765, + "time_per_iteration": 2.4344265460968018 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.0194571, + "balance_loss_mlp": 1.03650451, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4935943977467754, + "language_loss": 0.85193348, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87336564, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8766, + "time_per_iteration": 2.52951979637146 + }, + { + "auxiliary_loss_clip": 0.0111099, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0204674, + "balance_loss_mlp": 1.03980124, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7377061989269131, + "language_loss": 0.79036993, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8118161, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8767, + "time_per_iteration": 2.4478976726531982 + }, + { + "auxiliary_loss_clip": 0.0110965, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.01897943, + "balance_loss_mlp": 1.03842282, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.871676480421452, + "language_loss": 0.73691523, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75833523, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8768, + "time_per_iteration": 2.464952230453491 + }, + { + "auxiliary_loss_clip": 0.01106727, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.02222896, + "balance_loss_mlp": 1.03777611, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 3.4895191769574354, + "language_loss": 0.74093413, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76233703, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8769, + "time_per_iteration": 2.4464261531829834 + }, + { + "auxiliary_loss_clip": 0.01108124, + "auxiliary_loss_mlp": 0.01036366, + "balance_loss_clip": 1.02372384, + "balance_loss_mlp": 1.03890908, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.6831893733690892, + "language_loss": 0.7382611, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75970602, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8770, + "time_per_iteration": 2.4870028495788574 + }, + { + "auxiliary_loss_clip": 0.01110065, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01798415, + "balance_loss_mlp": 1.03966439, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.642757388746556, + "language_loss": 0.68108106, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70248735, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8771, + "time_per_iteration": 2.5180561542510986 + }, + { + "auxiliary_loss_clip": 0.01106371, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.02130556, + "balance_loss_mlp": 1.03755426, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.8518077177131755, + "language_loss": 0.76476532, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78617108, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8772, + "time_per_iteration": 2.491196870803833 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.0271337, + "balance_loss_mlp": 1.03862512, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.8756798124264933, + "language_loss": 0.65986812, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68137372, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8773, + "time_per_iteration": 2.464393138885498 + }, + { + "auxiliary_loss_clip": 0.01109322, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.02802014, + "balance_loss_mlp": 1.03791332, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.5758079694219151, + "language_loss": 0.86029238, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88178039, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.71484375, + "step": 8774, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.01105827, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01860261, + "balance_loss_mlp": 1.03663182, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.5254562165137588, + "language_loss": 0.79877412, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82013589, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8775, + "time_per_iteration": 2.454387664794922 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.02507758, + "balance_loss_mlp": 1.03681672, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7390352493983339, + "language_loss": 0.83807105, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85949761, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8776, + "time_per_iteration": 2.5026144981384277 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03759074, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.6658876230443522, + "language_loss": 0.68375832, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8777, + "time_per_iteration": 2.417186975479126 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.0258069, + "balance_loss_mlp": 1.04009652, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 2.132165937202497, + "language_loss": 0.82494706, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84640491, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8778, + "time_per_iteration": 2.487772226333618 + }, + { + "auxiliary_loss_clip": 0.01110776, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02037513, + "balance_loss_mlp": 1.04014647, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 2.126071455139116, + "language_loss": 0.79359961, + "learning_rate": 1.916873882856013e-06, + "loss": 0.8150422, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8779, + "time_per_iteration": 2.4676833152770996 + }, + { + "auxiliary_loss_clip": 0.01102313, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01942825, + "balance_loss_mlp": 1.03535295, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.916693496001438, + "language_loss": 0.7667526, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78808951, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 8780, + "time_per_iteration": 2.489880323410034 + }, + { + "auxiliary_loss_clip": 0.01113237, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01724982, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.5814481661794648, + "language_loss": 0.69506466, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71650016, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8781, + "time_per_iteration": 2.570308208465576 + }, + { + "auxiliary_loss_clip": 0.01105161, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02130246, + "balance_loss_mlp": 1.03748012, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.5392288400315197, + "language_loss": 0.72434068, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74571753, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 8782, + "time_per_iteration": 2.4902799129486084 + }, + { + "auxiliary_loss_clip": 0.01104346, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01557565, + "balance_loss_mlp": 1.03629112, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.9147695733655095, + "language_loss": 0.68684381, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70816237, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8783, + "time_per_iteration": 2.4489378929138184 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.02257824, + "balance_loss_mlp": 1.04052663, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.8253305439767769, + "language_loss": 0.69502926, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71655798, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 8784, + "time_per_iteration": 2.55877947807312 + }, + { + "auxiliary_loss_clip": 0.0111041, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01936626, + "balance_loss_mlp": 1.03718495, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.137542562274274, + "language_loss": 0.75317723, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77460963, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8785, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01858115, + "balance_loss_mlp": 1.03923512, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.5356836172740989, + "language_loss": 0.8301636, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85157377, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8786, + "time_per_iteration": 2.530207872390747 + }, + { + "auxiliary_loss_clip": 0.01102608, + "auxiliary_loss_mlp": 0.01023798, + "balance_loss_clip": 1.01268828, + "balance_loss_mlp": 1.03662145, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 6.419117505425037, + "language_loss": 0.8292653, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85052931, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 8787, + "time_per_iteration": 2.450303792953491 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.01601219, + "balance_loss_mlp": 1.03739762, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.657610649379585, + "language_loss": 0.83385652, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85517776, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8788, + "time_per_iteration": 2.4752538204193115 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.02353776, + "balance_loss_mlp": 1.04022217, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.6616469699693164, + "language_loss": 0.7467941, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.76823682, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.671875, + "step": 8789, + "time_per_iteration": 2.5324580669403076 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.01999021, + "balance_loss_mlp": 1.03898668, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.4692396487834778, + "language_loss": 0.69505095, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71647108, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8790, + "time_per_iteration": 2.51625919342041 + }, + { + "auxiliary_loss_clip": 0.01104373, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01602292, + "balance_loss_mlp": 1.03740895, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5973748463846205, + "language_loss": 0.78992987, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81125033, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 8791, + "time_per_iteration": 2.4552273750305176 + }, + { + "auxiliary_loss_clip": 0.01108186, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01800871, + "balance_loss_mlp": 1.04050541, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.8738977568036352, + "language_loss": 0.66256213, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68394351, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8792, + "time_per_iteration": 2.485501527786255 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.02362621, + "balance_loss_mlp": 1.03610563, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 2.0158719758485226, + "language_loss": 0.79919344, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82057893, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8793, + "time_per_iteration": 2.4918789863586426 + }, + { + "auxiliary_loss_clip": 0.01108596, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02393866, + "balance_loss_mlp": 1.03883982, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 1.8030848585204593, + "language_loss": 0.84791529, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86936802, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8794, + "time_per_iteration": 2.451828718185425 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01894772, + "balance_loss_mlp": 1.03798628, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 1.927550813134725, + "language_loss": 0.67570889, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69714004, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8795, + "time_per_iteration": 2.4460599422454834 + }, + { + "auxiliary_loss_clip": 0.01107843, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03754616, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.883468232968509, + "language_loss": 0.80662012, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82799256, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8796, + "time_per_iteration": 2.4592626094818115 + }, + { + "auxiliary_loss_clip": 0.01112299, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.01978111, + "balance_loss_mlp": 1.04186153, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.9732503530858911, + "language_loss": 0.69071984, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71216959, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8797, + "time_per_iteration": 2.4451231956481934 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02128911, + "balance_loss_mlp": 1.03739119, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.7017381786261847, + "language_loss": 0.82339096, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84474969, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 8798, + "time_per_iteration": 2.4694111347198486 + }, + { + "auxiliary_loss_clip": 0.01111092, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.02102065, + "balance_loss_mlp": 1.03840899, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 2.0619187329461575, + "language_loss": 0.70591879, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72737336, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8799, + "time_per_iteration": 2.456692695617676 + }, + { + "auxiliary_loss_clip": 0.01104599, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.02132988, + "balance_loss_mlp": 1.03975451, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.8240531153484045, + "language_loss": 0.69601536, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71738708, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 8800, + "time_per_iteration": 2.490417242050171 + }, + { + "auxiliary_loss_clip": 0.01036269, + "auxiliary_loss_mlp": 0.01012691, + "balance_loss_clip": 1.01148117, + "balance_loss_mlp": 1.01404071, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.998441198923784, + "language_loss": 0.57013941, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59062898, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.22265625, + "step": 8801, + "time_per_iteration": 4.385375022888184 + }, + { + "auxiliary_loss_clip": 0.01109021, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.02168214, + "balance_loss_mlp": 1.03874719, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5128121202389628, + "language_loss": 0.63942313, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66085106, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 8802, + "time_per_iteration": 2.5486578941345215 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01684964, + "balance_loss_mlp": 1.03677487, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.7172902320691381, + "language_loss": 0.68250531, + "learning_rate": 1.907535821289003e-06, + "loss": 0.70384604, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8803, + "time_per_iteration": 2.576460361480713 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02233613, + "balance_loss_mlp": 1.03654003, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6769492859989101, + "language_loss": 0.76551962, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78689635, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 8804, + "time_per_iteration": 4.018502473831177 + }, + { + "auxiliary_loss_clip": 0.0103564, + "auxiliary_loss_mlp": 0.01005394, + "balance_loss_clip": 1.00417256, + "balance_loss_mlp": 1.01327515, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.749734320345171, + "language_loss": 0.53018034, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55059063, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.22363281, + "step": 8805, + "time_per_iteration": 4.599541902542114 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01001215, + "balance_loss_clip": 0.99995738, + "balance_loss_mlp": 1.0124402, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7381494507925852, + "language_loss": 0.63778675, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65814722, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22460938, + "step": 8806, + "time_per_iteration": 3.067852735519409 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03770947, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.9894097123133165, + "language_loss": 0.72397399, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74542046, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 8807, + "time_per_iteration": 2.4303808212280273 + }, + { + "auxiliary_loss_clip": 0.0110442, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03735805, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 4.619049711580288, + "language_loss": 0.69640231, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71773779, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8808, + "time_per_iteration": 2.418649435043335 + }, + { + "auxiliary_loss_clip": 0.01105928, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.01817942, + "balance_loss_mlp": 1.03796387, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.7756221154666856, + "language_loss": 0.8668943, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88825089, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8809, + "time_per_iteration": 2.413883686065674 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.0209322, + "balance_loss_mlp": 1.03908372, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.608353260814621, + "language_loss": 0.64362073, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66509026, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8810, + "time_per_iteration": 2.6121585369110107 + }, + { + "auxiliary_loss_clip": 0.0110573, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.02372456, + "balance_loss_mlp": 1.03820479, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.5055977388002117, + "language_loss": 0.68083066, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70224369, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8811, + "time_per_iteration": 2.4806406497955322 + }, + { + "auxiliary_loss_clip": 0.010328, + "auxiliary_loss_mlp": 0.00998698, + "balance_loss_clip": 0.99739295, + "balance_loss_mlp": 1.01059103, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6652461754552681, + "language_loss": 0.53400505, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.5543201, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22265625, + "step": 8812, + "time_per_iteration": 3.175478458404541 + }, + { + "auxiliary_loss_clip": 0.01032825, + "auxiliary_loss_mlp": 0.01000267, + "balance_loss_clip": 0.99906272, + "balance_loss_mlp": 1.01074851, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7207460213448722, + "language_loss": 0.56372511, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58405602, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.22070312, + "step": 8813, + "time_per_iteration": 3.1315269470214844 + }, + { + "auxiliary_loss_clip": 0.01102589, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.01751852, + "balance_loss_mlp": 1.03824615, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.5478508872520975, + "language_loss": 0.81618506, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.8375001, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 8814, + "time_per_iteration": 2.431269884109497 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04241931, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.5843849623618003, + "language_loss": 0.84997016, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.8713944, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8815, + "time_per_iteration": 2.531074285507202 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01968288, + "balance_loss_mlp": 1.03940964, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.126267576495584, + "language_loss": 0.66768968, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68905437, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8816, + "time_per_iteration": 2.525468111038208 + }, + { + "auxiliary_loss_clip": 0.01107527, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.0210259, + "balance_loss_mlp": 1.03860188, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.7854125043951103, + "language_loss": 0.72206688, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74347246, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8817, + "time_per_iteration": 2.6937406063079834 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.01921499, + "balance_loss_mlp": 1.03620088, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6863401200151742, + "language_loss": 0.6522249, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67360961, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 8818, + "time_per_iteration": 2.509539842605591 + }, + { + "auxiliary_loss_clip": 0.0110849, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.01462412, + "balance_loss_mlp": 1.0393914, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.999877555758676, + "language_loss": 0.75154972, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77290833, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8819, + "time_per_iteration": 2.473130702972412 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.0214107, + "balance_loss_mlp": 1.03858495, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.27674417450437, + "language_loss": 0.82333302, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84477413, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 8820, + "time_per_iteration": 2.4328434467315674 + }, + { + "auxiliary_loss_clip": 0.01106236, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.02030122, + "balance_loss_mlp": 1.03725612, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 2.049749716635941, + "language_loss": 0.72593045, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74730772, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 8821, + "time_per_iteration": 2.508608102798462 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.0363605, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.3923419148404492, + "language_loss": 0.73939008, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76070547, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8822, + "time_per_iteration": 2.4427592754364014 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.02312553, + "balance_loss_mlp": 1.03773904, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6902308577802683, + "language_loss": 0.67477053, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69620097, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8823, + "time_per_iteration": 2.5047175884246826 + }, + { + "auxiliary_loss_clip": 0.0110955, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.02113247, + "balance_loss_mlp": 1.03756142, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.5189625554392572, + "language_loss": 0.69347805, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71491873, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8824, + "time_per_iteration": 2.4358925819396973 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01879227, + "balance_loss_mlp": 1.03755724, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 2.2315847136946956, + "language_loss": 0.75412273, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77547044, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 8825, + "time_per_iteration": 2.480656385421753 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.02433622, + "balance_loss_mlp": 1.03730893, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.0577399670241125, + "language_loss": 0.85668242, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87810326, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8826, + "time_per_iteration": 2.422227621078491 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01760268, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.3501660325975628, + "language_loss": 0.64042354, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66176176, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 8827, + "time_per_iteration": 2.461434841156006 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.03835428, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.5699076783392119, + "language_loss": 0.60028976, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62176144, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8828, + "time_per_iteration": 2.621673107147217 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03909802, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.7449235888895405, + "language_loss": 0.81386358, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83527148, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8829, + "time_per_iteration": 2.472055673599243 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.02196574, + "balance_loss_mlp": 1.03871477, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.483207387046285, + "language_loss": 0.78292549, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80433053, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 8830, + "time_per_iteration": 2.4544272422790527 + }, + { + "auxiliary_loss_clip": 0.01106311, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01877975, + "balance_loss_mlp": 1.03778768, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.0257257472461525, + "language_loss": 0.80643964, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82781464, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8831, + "time_per_iteration": 2.4307594299316406 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.03561974, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 2.026603228036347, + "language_loss": 0.73146117, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75278628, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8832, + "time_per_iteration": 2.429567813873291 + }, + { + "auxiliary_loss_clip": 0.01111675, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02162194, + "balance_loss_mlp": 1.04065752, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.9229428073701915, + "language_loss": 0.75382435, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77528179, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8833, + "time_per_iteration": 2.4731011390686035 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.01733804, + "balance_loss_mlp": 1.03697777, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.9718581367947616, + "language_loss": 0.73314357, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75452387, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8834, + "time_per_iteration": 2.476289987564087 + }, + { + "auxiliary_loss_clip": 0.01113252, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.01980758, + "balance_loss_mlp": 1.03958392, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 2.0084943443028975, + "language_loss": 0.77603996, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79750997, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 8835, + "time_per_iteration": 2.512998104095459 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.02318025, + "balance_loss_mlp": 1.03647518, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.8374817013403106, + "language_loss": 0.72753531, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74896735, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8836, + "time_per_iteration": 2.4509310722351074 + }, + { + "auxiliary_loss_clip": 0.01108843, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03784788, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 2.66525227198108, + "language_loss": 0.80936503, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83078802, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8837, + "time_per_iteration": 2.471662759780884 + }, + { + "auxiliary_loss_clip": 0.0110708, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01615477, + "balance_loss_mlp": 1.03874159, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.8452061032611426, + "language_loss": 0.85926068, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88061881, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8838, + "time_per_iteration": 2.4360713958740234 + }, + { + "auxiliary_loss_clip": 0.011058, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.01779366, + "balance_loss_mlp": 1.03785229, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.8224224127823847, + "language_loss": 0.7208544, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74220788, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8839, + "time_per_iteration": 2.4806606769561768 + }, + { + "auxiliary_loss_clip": 0.01106476, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02167034, + "balance_loss_mlp": 1.03606987, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.889571361745381, + "language_loss": 0.76674354, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78814822, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8840, + "time_per_iteration": 2.47389817237854 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.01798964, + "balance_loss_mlp": 1.03678751, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 1.9758748106511805, + "language_loss": 0.77377498, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79517406, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8841, + "time_per_iteration": 2.4235799312591553 + }, + { + "auxiliary_loss_clip": 0.0103176, + "auxiliary_loss_mlp": 0.01011801, + "balance_loss_clip": 1.01060319, + "balance_loss_mlp": 1.00937963, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6971901974616477, + "language_loss": 0.56793272, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.5883683, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.22363281, + "step": 8842, + "time_per_iteration": 3.1749658584594727 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.03839254, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7048374639197847, + "language_loss": 0.73877072, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76025677, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8843, + "time_per_iteration": 3.7764668464660645 + }, + { + "auxiliary_loss_clip": 0.01031369, + "auxiliary_loss_mlp": 0.01005783, + "balance_loss_clip": 1.00454903, + "balance_loss_mlp": 1.0092088, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8754586803272454, + "language_loss": 0.61063367, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63100517, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.22265625, + "step": 8844, + "time_per_iteration": 3.1397178173065186 + }, + { + "auxiliary_loss_clip": 0.01031644, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 0.9997676, + "balance_loss_mlp": 1.00950778, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.9433503667086528, + "language_loss": 0.62195891, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64228451, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22167969, + "step": 8845, + "time_per_iteration": 3.0431036949157715 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.01908851, + "balance_loss_mlp": 1.0369339, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.021195915673457, + "language_loss": 0.7583214, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77972758, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 8846, + "time_per_iteration": 5.309458017349243 + }, + { + "auxiliary_loss_clip": 0.01106825, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01994586, + "balance_loss_mlp": 1.03744686, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5634287795910362, + "language_loss": 0.75384724, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.775231, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 8847, + "time_per_iteration": 2.4939441680908203 + }, + { + "auxiliary_loss_clip": 0.01104626, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.01720405, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.798053797011527, + "language_loss": 0.87663037, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89797276, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8848, + "time_per_iteration": 2.417572498321533 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03765666, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.6565378723095834, + "language_loss": 0.74641025, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76784182, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.703125, + "step": 8849, + "time_per_iteration": 2.4509243965148926 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01655602, + "balance_loss_mlp": 1.03593016, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 2.164126567755358, + "language_loss": 0.79812169, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81949031, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8850, + "time_per_iteration": 2.45766544342041 + }, + { + "auxiliary_loss_clip": 0.01104904, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.01615, + "balance_loss_mlp": 1.03538489, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4483393548737078, + "language_loss": 0.54913849, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57047582, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8851, + "time_per_iteration": 2.607548713684082 + }, + { + "auxiliary_loss_clip": 0.01110841, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.01821876, + "balance_loss_mlp": 1.03916895, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.7052679387317837, + "language_loss": 0.68385565, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70526993, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 8852, + "time_per_iteration": 2.4444568157196045 + }, + { + "auxiliary_loss_clip": 0.01031832, + "auxiliary_loss_mlp": 0.00999979, + "balance_loss_clip": 0.9987337, + "balance_loss_mlp": 1.00941014, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8061011864926959, + "language_loss": 0.62881088, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64912903, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.22460938, + "step": 8853, + "time_per_iteration": 3.0409493446350098 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01691902, + "balance_loss_mlp": 1.03633368, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.2642894326377196, + "language_loss": 0.79002404, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81140411, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8854, + "time_per_iteration": 2.4175822734832764 + }, + { + "auxiliary_loss_clip": 0.01103338, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01717186, + "balance_loss_mlp": 1.03635907, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.6616394070358602, + "language_loss": 0.73815715, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75947511, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 8855, + "time_per_iteration": 2.5298781394958496 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.01806259, + "balance_loss_mlp": 1.03597307, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.9409120124024815, + "language_loss": 0.64495003, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66629064, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8856, + "time_per_iteration": 2.483076333999634 + }, + { + "auxiliary_loss_clip": 0.01111855, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02166665, + "balance_loss_mlp": 1.03986931, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.6437419686120303, + "language_loss": 0.77630389, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79777324, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8857, + "time_per_iteration": 2.534383773803711 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.03602767, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.590488147317335, + "language_loss": 0.71136224, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73275089, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8858, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.02001405, + "balance_loss_mlp": 1.03818965, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.5574852735183802, + "language_loss": 0.69423437, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71565467, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8859, + "time_per_iteration": 2.5293610095977783 + }, + { + "auxiliary_loss_clip": 0.01104952, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.01622272, + "balance_loss_mlp": 1.03947163, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.5500879507245162, + "language_loss": 0.69682205, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71814591, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 8860, + "time_per_iteration": 2.453315019607544 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.02027583, + "balance_loss_mlp": 1.03789401, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.830505462704671, + "language_loss": 0.78035998, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80173862, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8861, + "time_per_iteration": 2.4910025596618652 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.02470744, + "balance_loss_mlp": 1.03778684, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 3.045684614472066, + "language_loss": 0.85532111, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87677932, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8862, + "time_per_iteration": 2.4594204425811768 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02181435, + "balance_loss_mlp": 1.03708994, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 2.155580167277434, + "language_loss": 0.61776686, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63920593, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 8863, + "time_per_iteration": 2.431844472885132 + }, + { + "auxiliary_loss_clip": 0.01107834, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.02161896, + "balance_loss_mlp": 1.03979647, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.808986842092349, + "language_loss": 0.73174077, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7531504, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8864, + "time_per_iteration": 2.481052875518799 + }, + { + "auxiliary_loss_clip": 0.01106149, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.02406275, + "balance_loss_mlp": 1.03704095, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.770075213018519, + "language_loss": 0.64782691, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66924965, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8865, + "time_per_iteration": 2.5422523021698 + }, + { + "auxiliary_loss_clip": 0.01106424, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.0374155, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.6788966461131323, + "language_loss": 0.78194928, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80331147, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8866, + "time_per_iteration": 2.4783847332000732 + }, + { + "auxiliary_loss_clip": 0.01106298, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02127266, + "balance_loss_mlp": 1.03756702, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 2.4645319902700136, + "language_loss": 0.73618174, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75758052, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8867, + "time_per_iteration": 2.4607431888580322 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.03957081, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.616723113347984, + "language_loss": 0.72235525, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7438435, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8868, + "time_per_iteration": 2.6005828380584717 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02044368, + "balance_loss_mlp": 1.03801441, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.8848687711222403, + "language_loss": 0.78688312, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80828476, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8869, + "time_per_iteration": 2.527679681777954 + }, + { + "auxiliary_loss_clip": 0.01112421, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.02396262, + "balance_loss_mlp": 1.03942657, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.8336580730917733, + "language_loss": 0.75656843, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.7780599, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8870, + "time_per_iteration": 2.408651113510132 + }, + { + "auxiliary_loss_clip": 0.01112864, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.024266, + "balance_loss_mlp": 1.04069293, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.8439379115111716, + "language_loss": 0.75255805, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77406549, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8871, + "time_per_iteration": 2.501173257827759 + }, + { + "auxiliary_loss_clip": 0.01109454, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02014637, + "balance_loss_mlp": 1.03973055, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.7881983016452072, + "language_loss": 0.72249746, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74391973, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8872, + "time_per_iteration": 2.4058215618133545 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04132104, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.7441588702127815, + "language_loss": 0.65051317, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67199636, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8873, + "time_per_iteration": 2.4598374366760254 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.02549779, + "balance_loss_mlp": 1.03951979, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 7.037025883542546, + "language_loss": 0.80012232, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82158732, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8874, + "time_per_iteration": 2.43198299407959 + }, + { + "auxiliary_loss_clip": 0.0110808, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.01874542, + "balance_loss_mlp": 1.03897262, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 2.558835697133273, + "language_loss": 0.70077014, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72216594, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8875, + "time_per_iteration": 2.4746885299682617 + }, + { + "auxiliary_loss_clip": 0.01033299, + "auxiliary_loss_mlp": 0.01014121, + "balance_loss_clip": 1.01290536, + "balance_loss_mlp": 1.01076412, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7274620052615154, + "language_loss": 0.59653223, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61700642, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.22460938, + "step": 8876, + "time_per_iteration": 3.1654725074768066 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.01971292, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.894052458703423, + "language_loss": 0.74833322, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76972401, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8877, + "time_per_iteration": 2.4836068153381348 + }, + { + "auxiliary_loss_clip": 0.01032923, + "auxiliary_loss_mlp": 0.0100501, + "balance_loss_clip": 1.00374663, + "balance_loss_mlp": 1.01051378, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7537185456157387, + "language_loss": 0.57229304, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59267235, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22460938, + "step": 8878, + "time_per_iteration": 2.9712772369384766 + }, + { + "auxiliary_loss_clip": 0.01111898, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02161908, + "balance_loss_mlp": 1.04023981, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.4246995459674998, + "language_loss": 0.72007561, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74154353, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8879, + "time_per_iteration": 2.5073280334472656 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.01706791, + "balance_loss_mlp": 1.03980017, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.331544880776984, + "language_loss": 0.8328526, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85425603, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8880, + "time_per_iteration": 2.4154322147369385 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.02200413, + "balance_loss_mlp": 1.03857374, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.3819058164028981, + "language_loss": 0.79567689, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81707799, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8881, + "time_per_iteration": 2.4748446941375732 + }, + { + "auxiliary_loss_clip": 0.01032611, + "auxiliary_loss_mlp": 0.00999583, + "balance_loss_clip": 0.99825948, + "balance_loss_mlp": 1.01026177, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7951386121617492, + "language_loss": 0.59243226, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61275423, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.22363281, + "step": 8882, + "time_per_iteration": 3.0554563999176025 + }, + { + "auxiliary_loss_clip": 0.01032284, + "auxiliary_loss_mlp": 0.01002778, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.00965989, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8657705918333868, + "language_loss": 0.63714904, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65749967, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2265625, + "step": 8883, + "time_per_iteration": 2.8666210174560547 + }, + { + "auxiliary_loss_clip": 0.01112111, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01528406, + "balance_loss_mlp": 1.04020667, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.5638154038033334, + "language_loss": 0.82000816, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84141463, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8884, + "time_per_iteration": 3.910738706588745 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0165143, + "balance_loss_mlp": 1.03923178, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.647799538914853, + "language_loss": 0.7224586, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74379575, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 8885, + "time_per_iteration": 2.4330668449401855 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01816654, + "balance_loss_mlp": 1.03816104, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 1.9571098005847307, + "language_loss": 0.78834218, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80977666, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8886, + "time_per_iteration": 2.4285924434661865 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01453424, + "balance_loss_mlp": 1.03859282, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.4869737557636773, + "language_loss": 0.74745071, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76882267, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 8887, + "time_per_iteration": 5.458622932434082 + }, + { + "auxiliary_loss_clip": 0.01106415, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01351762, + "balance_loss_mlp": 1.03839684, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 1.9580001729257437, + "language_loss": 0.68680072, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.70812452, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8888, + "time_per_iteration": 3.871016263961792 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.02426004, + "balance_loss_mlp": 1.03957748, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.039365083298093, + "language_loss": 0.77427757, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79580867, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 8889, + "time_per_iteration": 2.4321072101593018 + }, + { + "auxiliary_loss_clip": 0.01107574, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.02382183, + "balance_loss_mlp": 1.03896809, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.7896399215033527, + "language_loss": 0.68882942, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71026921, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 8890, + "time_per_iteration": 2.4512557983398438 + }, + { + "auxiliary_loss_clip": 0.01112757, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.02532864, + "balance_loss_mlp": 1.03882933, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 3.075420511300943, + "language_loss": 0.77339637, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79491955, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73828125, + "step": 8891, + "time_per_iteration": 2.4134135246276855 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01904237, + "balance_loss_mlp": 1.03873932, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5298342127178157, + "language_loss": 0.73841035, + "learning_rate": 1.872926414425699e-06, + "loss": 0.75977939, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 8892, + "time_per_iteration": 2.4843709468841553 + }, + { + "auxiliary_loss_clip": 0.0110608, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01874948, + "balance_loss_mlp": 1.03663301, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.5614617741562322, + "language_loss": 0.88069522, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90206861, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8893, + "time_per_iteration": 2.445389747619629 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.01968718, + "balance_loss_mlp": 1.03617978, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.5898186397759002, + "language_loss": 0.72623652, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74758679, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8894, + "time_per_iteration": 2.475914239883423 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01774812, + "balance_loss_mlp": 1.03794241, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 2.053516557339631, + "language_loss": 0.74730217, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.7686727, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 8895, + "time_per_iteration": 2.4524707794189453 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01558769, + "balance_loss_mlp": 1.03688455, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.7004701648033584, + "language_loss": 0.76999986, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79134524, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8896, + "time_per_iteration": 2.4727749824523926 + }, + { + "auxiliary_loss_clip": 0.01105321, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.01200807, + "balance_loss_mlp": 1.03771544, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.674513516034323, + "language_loss": 0.78698516, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80828726, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8897, + "time_per_iteration": 2.437924861907959 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.015589, + "balance_loss_mlp": 1.04029751, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.8516386867396797, + "language_loss": 0.75758165, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77897102, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8898, + "time_per_iteration": 2.4490232467651367 + }, + { + "auxiliary_loss_clip": 0.0103315, + "auxiliary_loss_mlp": 0.00997269, + "balance_loss_clip": 0.99616033, + "balance_loss_mlp": 1.01073837, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8534656988697606, + "language_loss": 0.58027738, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60058159, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22460938, + "step": 8899, + "time_per_iteration": 3.2222988605499268 + }, + { + "auxiliary_loss_clip": 0.01105996, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.01625824, + "balance_loss_mlp": 1.03779793, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.754025350675293, + "language_loss": 0.69734174, + "learning_rate": 1.869817171696868e-06, + "loss": 0.7186892, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8900, + "time_per_iteration": 2.5348854064941406 + }, + { + "auxiliary_loss_clip": 0.01109931, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.01857448, + "balance_loss_mlp": 1.03874683, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.712056344952118, + "language_loss": 0.71436262, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73577476, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8901, + "time_per_iteration": 2.486694097518921 + }, + { + "auxiliary_loss_clip": 0.01108252, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01632655, + "balance_loss_mlp": 1.03779531, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.0243685582186477, + "language_loss": 0.77403963, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79541337, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8902, + "time_per_iteration": 2.4521291255950928 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.02103007, + "balance_loss_mlp": 1.03727639, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.5596437382067054, + "language_loss": 0.69763452, + "learning_rate": 1.868651286721281e-06, + "loss": 0.71899128, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 8903, + "time_per_iteration": 2.4639296531677246 + }, + { + "auxiliary_loss_clip": 0.01111291, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.02613187, + "balance_loss_mlp": 1.03885889, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.4813880450748405, + "language_loss": 0.71867597, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74017799, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 8904, + "time_per_iteration": 2.518556833267212 + }, + { + "auxiliary_loss_clip": 0.01109721, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.0223856, + "balance_loss_mlp": 1.03955388, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.7385404274740348, + "language_loss": 0.73125184, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75270438, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8905, + "time_per_iteration": 2.481398582458496 + }, + { + "auxiliary_loss_clip": 0.01103053, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02244139, + "balance_loss_mlp": 1.03704035, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4036286343955833, + "language_loss": 0.83569062, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85705423, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 8906, + "time_per_iteration": 2.4822022914886475 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.02053666, + "balance_loss_mlp": 1.03906655, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 3.1110381495397688, + "language_loss": 0.74120319, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76264668, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8907, + "time_per_iteration": 2.4488067626953125 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.01639736, + "balance_loss_mlp": 1.03933167, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.8326240405987804, + "language_loss": 0.77272546, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79410505, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8908, + "time_per_iteration": 2.5009818077087402 + }, + { + "auxiliary_loss_clip": 0.01111027, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.02252579, + "balance_loss_mlp": 1.039222, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 9.969716540759343, + "language_loss": 0.7407465, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.7622152, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8909, + "time_per_iteration": 2.4272916316986084 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02267265, + "balance_loss_mlp": 1.04071856, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 1.9518435489791055, + "language_loss": 0.841941, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86339062, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8910, + "time_per_iteration": 2.4678404331207275 + }, + { + "auxiliary_loss_clip": 0.01109272, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.0175302, + "balance_loss_mlp": 1.03802073, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.5065365564315203, + "language_loss": 0.81728303, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83868158, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8911, + "time_per_iteration": 2.482515335083008 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02217102, + "balance_loss_mlp": 1.03894281, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.8795354415042287, + "language_loss": 0.6902765, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71169335, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 8912, + "time_per_iteration": 2.489625930786133 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02269232, + "balance_loss_mlp": 1.04099894, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.778457710383864, + "language_loss": 0.71355128, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73501396, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8913, + "time_per_iteration": 2.4120781421661377 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.02322149, + "balance_loss_mlp": 1.04114628, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.8082872891744106, + "language_loss": 0.72335684, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7448622, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8914, + "time_per_iteration": 2.466946840286255 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.02528632, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.2402764225711915, + "language_loss": 0.70448041, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72602755, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8915, + "time_per_iteration": 2.5281713008880615 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.02335119, + "balance_loss_mlp": 1.03934813, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 4.884439280571106, + "language_loss": 0.75188339, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77333617, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 8916, + "time_per_iteration": 2.4901540279388428 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.01685333, + "balance_loss_mlp": 1.03908181, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.001008974250462, + "language_loss": 0.72230595, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74370885, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8917, + "time_per_iteration": 2.5355899333953857 + }, + { + "auxiliary_loss_clip": 0.01109638, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02460611, + "balance_loss_mlp": 1.04033589, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.185479233449534, + "language_loss": 0.71158117, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73305333, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8918, + "time_per_iteration": 2.497854709625244 + }, + { + "auxiliary_loss_clip": 0.011106, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.02387154, + "balance_loss_mlp": 1.04111099, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.4281907235735687, + "language_loss": 0.75156265, + "learning_rate": 1.862434000299067e-06, + "loss": 0.7730338, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8919, + "time_per_iteration": 2.4522061347961426 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.0192163, + "balance_loss_mlp": 1.0374527, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.9146697385716565, + "language_loss": 0.71194351, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73334807, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8920, + "time_per_iteration": 2.4363694190979004 + }, + { + "auxiliary_loss_clip": 0.01106889, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01886892, + "balance_loss_mlp": 1.03738046, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.417495166440162, + "language_loss": 0.68572164, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.7071088, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8921, + "time_per_iteration": 2.659815788269043 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.02357066, + "balance_loss_mlp": 1.04096341, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.806007791508249, + "language_loss": 0.81778204, + "learning_rate": 1.86126840594594e-06, + "loss": 0.83925164, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8922, + "time_per_iteration": 2.4896881580352783 + }, + { + "auxiliary_loss_clip": 0.01109712, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01601934, + "balance_loss_mlp": 1.03847456, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9048762186543056, + "language_loss": 0.76640022, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78778023, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 8923, + "time_per_iteration": 2.46250319480896 + }, + { + "auxiliary_loss_clip": 0.01112498, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02061963, + "balance_loss_mlp": 1.04007745, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.372230243923659, + "language_loss": 0.70459902, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72606242, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8924, + "time_per_iteration": 2.5744879245758057 + }, + { + "auxiliary_loss_clip": 0.0111402, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.02251387, + "balance_loss_mlp": 1.04109585, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.82023886715655, + "language_loss": 0.86756319, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.88906515, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8925, + "time_per_iteration": 2.4910149574279785 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.01847553, + "balance_loss_mlp": 1.03855276, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.7557992545857284, + "language_loss": 0.77842706, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.79984611, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8926, + "time_per_iteration": 3.935426950454712 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.02207375, + "balance_loss_mlp": 1.04045248, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.9312965019913735, + "language_loss": 0.66655087, + "learning_rate": 1.85932585410148e-06, + "loss": 0.68797243, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8927, + "time_per_iteration": 2.547527313232422 + }, + { + "auxiliary_loss_clip": 0.01109886, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.01575708, + "balance_loss_mlp": 1.03839135, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.6954569855299475, + "language_loss": 0.73241496, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75379729, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8928, + "time_per_iteration": 2.432772636413574 + }, + { + "auxiliary_loss_clip": 0.01109785, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01908278, + "balance_loss_mlp": 1.03883481, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.7056756537874223, + "language_loss": 0.62998128, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65139198, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 8929, + "time_per_iteration": 5.517207145690918 + }, + { + "auxiliary_loss_clip": 0.01109689, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.03864491, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7096435666181475, + "language_loss": 0.65986609, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68127799, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8930, + "time_per_iteration": 4.042668581008911 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01673484, + "balance_loss_mlp": 1.03648782, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4058068619041801, + "language_loss": 0.66875708, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69010699, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8931, + "time_per_iteration": 2.4965057373046875 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04157209, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.7390938861026815, + "language_loss": 0.75847304, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77991474, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 8932, + "time_per_iteration": 2.4885287284851074 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.01999855, + "balance_loss_mlp": 1.04103768, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.8276755120836934, + "language_loss": 0.66255939, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68400073, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8933, + "time_per_iteration": 2.545335292816162 + }, + { + "auxiliary_loss_clip": 0.01106255, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.02077079, + "balance_loss_mlp": 1.03900647, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.6337429593741761, + "language_loss": 0.82865143, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85004205, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8934, + "time_per_iteration": 2.503974437713623 + }, + { + "auxiliary_loss_clip": 0.0110502, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02293336, + "balance_loss_mlp": 1.03738618, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.7935675007471827, + "language_loss": 0.79473621, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81614518, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 8935, + "time_per_iteration": 2.4432904720306396 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.02450609, + "balance_loss_mlp": 1.03854251, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.6092738011459846, + "language_loss": 0.83558774, + "learning_rate": 1.855829598084659e-06, + "loss": 0.857054, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8936, + "time_per_iteration": 2.5320403575897217 + }, + { + "auxiliary_loss_clip": 0.01106939, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.0173173, + "balance_loss_mlp": 1.03860474, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2642552304862777, + "language_loss": 0.72749949, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74886072, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 8937, + "time_per_iteration": 2.6381869316101074 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.01589561, + "balance_loss_mlp": 1.03737688, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.79948851304012, + "language_loss": 0.81773913, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83913368, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 8938, + "time_per_iteration": 2.4865500926971436 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.0231539, + "balance_loss_mlp": 1.04058433, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.3721010649860403, + "language_loss": 0.80348092, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82498878, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 8939, + "time_per_iteration": 2.4440550804138184 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.01002274, + "balance_loss_clip": 1.00117719, + "balance_loss_mlp": 1.01246023, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7105496368182959, + "language_loss": 0.52484262, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54521012, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.22070312, + "step": 8940, + "time_per_iteration": 3.091242790222168 + }, + { + "auxiliary_loss_clip": 0.01107473, + "auxiliary_loss_mlp": 0.01029266, + "balance_loss_clip": 1.01732159, + "balance_loss_mlp": 1.03880298, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 1.7538523818266185, + "language_loss": 0.71252179, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73388922, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8941, + "time_per_iteration": 2.497748613357544 + }, + { + "auxiliary_loss_clip": 0.01106467, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01807404, + "balance_loss_mlp": 1.03906739, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.7257322220940274, + "language_loss": 0.7928313, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81420016, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8942, + "time_per_iteration": 2.5012340545654297 + }, + { + "auxiliary_loss_clip": 0.01113441, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.02305436, + "balance_loss_mlp": 1.04004788, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.6646036710876846, + "language_loss": 0.69918364, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72068322, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8943, + "time_per_iteration": 2.5815587043762207 + }, + { + "auxiliary_loss_clip": 0.01032313, + "auxiliary_loss_mlp": 0.0099905, + "balance_loss_clip": 0.99804258, + "balance_loss_mlp": 1.01022053, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8193486791235207, + "language_loss": 0.59579939, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61611301, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22070312, + "step": 8944, + "time_per_iteration": 3.0560412406921387 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.02264094, + "balance_loss_mlp": 1.0415678, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.048508714437824, + "language_loss": 0.77503264, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79658085, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 8945, + "time_per_iteration": 2.4893672466278076 + }, + { + "auxiliary_loss_clip": 0.01109506, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03820658, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.7269314210534699, + "language_loss": 0.68465722, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70609617, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8946, + "time_per_iteration": 2.4605491161346436 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.03090715, + "balance_loss_mlp": 1.03953493, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.7416668567009066, + "language_loss": 0.76750016, + "learning_rate": 1.851556998731498e-06, + "loss": 0.78902936, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8947, + "time_per_iteration": 2.547470808029175 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01731312, + "balance_loss_mlp": 1.03834343, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.559080956726188, + "language_loss": 0.60268521, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62406987, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8948, + "time_per_iteration": 2.486721992492676 + }, + { + "auxiliary_loss_clip": 0.01112593, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0221529, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.6883046071040144, + "language_loss": 0.7951721, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.816643, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8949, + "time_per_iteration": 2.504025936126709 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.01955473, + "balance_loss_mlp": 1.03890014, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.5394027339965872, + "language_loss": 0.77871096, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80011374, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8950, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.04001009, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.7709921726317892, + "language_loss": 0.72630781, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74770463, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8951, + "time_per_iteration": 2.5027382373809814 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.01503229, + "balance_loss_mlp": 1.03817379, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.739294207658579, + "language_loss": 0.75148916, + "learning_rate": 1.849615132097085e-06, + "loss": 0.7728591, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8952, + "time_per_iteration": 2.423635244369507 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01504064, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.5972619646266322, + "language_loss": 0.79724902, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81862247, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8953, + "time_per_iteration": 2.532107353210449 + }, + { + "auxiliary_loss_clip": 0.01106301, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01993775, + "balance_loss_mlp": 1.03857923, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 2.0280242140271336, + "language_loss": 0.80724108, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82863653, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.67578125, + "step": 8954, + "time_per_iteration": 2.404942512512207 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.04119825, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.327007095214437, + "language_loss": 0.76461661, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78604227, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8955, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02414417, + "balance_loss_mlp": 1.04121125, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.5710344626373696, + "language_loss": 0.7823422, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80381584, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 8956, + "time_per_iteration": 2.484722375869751 + }, + { + "auxiliary_loss_clip": 0.0103322, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.01120663, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8559223539778376, + "language_loss": 0.63550651, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65586865, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.22070312, + "step": 8957, + "time_per_iteration": 3.065546751022339 + }, + { + "auxiliary_loss_clip": 0.01032349, + "auxiliary_loss_mlp": 0.01008296, + "balance_loss_clip": 1.00706863, + "balance_loss_mlp": 1.01029825, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7038941855074313, + "language_loss": 0.5158186, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53622508, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8958, + "time_per_iteration": 3.0705761909484863 + }, + { + "auxiliary_loss_clip": 0.01115886, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01678383, + "balance_loss_mlp": 1.04319501, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.5948521762422991, + "language_loss": 0.77216792, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79363346, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8959, + "time_per_iteration": 2.4907429218292236 + }, + { + "auxiliary_loss_clip": 0.01109786, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0190165, + "balance_loss_mlp": 1.03810203, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.0946376118717493, + "language_loss": 0.83630693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85772204, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8960, + "time_per_iteration": 2.4251809120178223 + }, + { + "auxiliary_loss_clip": 0.01112347, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01780403, + "balance_loss_mlp": 1.0417726, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.575363596920687, + "language_loss": 0.78489578, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80632377, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8961, + "time_per_iteration": 2.5358235836029053 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01921666, + "balance_loss_mlp": 1.04004741, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.7764783659945997, + "language_loss": 0.84602159, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86745036, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8962, + "time_per_iteration": 2.462369918823242 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.00998189, + "balance_loss_clip": 0.99696141, + "balance_loss_mlp": 1.01020229, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7323858189394533, + "language_loss": 0.54189092, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56219494, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8963, + "time_per_iteration": 3.000844717025757 + }, + { + "auxiliary_loss_clip": 0.01031141, + "auxiliary_loss_mlp": 0.00998281, + "balance_loss_clip": 0.99717277, + "balance_loss_mlp": 1.00911307, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8055122078658323, + "language_loss": 0.63433194, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65462613, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22070312, + "step": 8964, + "time_per_iteration": 3.241182565689087 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01782298, + "balance_loss_mlp": 1.03918004, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.532843563745025, + "language_loss": 0.69958258, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72101814, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 8965, + "time_per_iteration": 2.524223804473877 + }, + { + "auxiliary_loss_clip": 0.01114315, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.04133582, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.362623955664157, + "language_loss": 0.81848061, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83997512, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 8966, + "time_per_iteration": 2.477625608444214 + }, + { + "auxiliary_loss_clip": 0.01110928, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.01913619, + "balance_loss_mlp": 1.04063606, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.8348280049509287, + "language_loss": 0.72713602, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74856687, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8967, + "time_per_iteration": 2.419088125228882 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.01884913, + "balance_loss_mlp": 1.03676677, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.8042691798262989, + "language_loss": 0.81596529, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83735478, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8968, + "time_per_iteration": 3.8650004863739014 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0168612, + "balance_loss_mlp": 1.0391978, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.5993373110169542, + "language_loss": 0.73938435, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76080179, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8969, + "time_per_iteration": 2.485146999359131 + }, + { + "auxiliary_loss_clip": 0.01111919, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.01886833, + "balance_loss_mlp": 1.03785658, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.3553854013154907, + "language_loss": 0.82165599, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84310412, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8970, + "time_per_iteration": 2.4504613876342773 + }, + { + "auxiliary_loss_clip": 0.01109668, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02005112, + "balance_loss_mlp": 1.03989851, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.5328161731771237, + "language_loss": 0.75619417, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77762067, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8971, + "time_per_iteration": 5.434189558029175 + }, + { + "auxiliary_loss_clip": 0.01030677, + "auxiliary_loss_mlp": 0.00999826, + "balance_loss_clip": 0.99856228, + "balance_loss_mlp": 1.00854254, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8757990223887638, + "language_loss": 0.60310632, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62341136, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22167969, + "step": 8972, + "time_per_iteration": 3.070239782333374 + }, + { + "auxiliary_loss_clip": 0.01109336, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.03085351, + "balance_loss_mlp": 1.0389235, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.4916710753135305, + "language_loss": 0.78427428, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80580956, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8973, + "time_per_iteration": 2.4841833114624023 + }, + { + "auxiliary_loss_clip": 0.01116334, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02265668, + "balance_loss_mlp": 1.03959453, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.2712479958365304, + "language_loss": 0.73893452, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76046824, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 8974, + "time_per_iteration": 2.5056395530700684 + }, + { + "auxiliary_loss_clip": 0.01029707, + "auxiliary_loss_mlp": 0.01005081, + "balance_loss_clip": 1.00388896, + "balance_loss_mlp": 1.00777423, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7339193766969773, + "language_loss": 0.51197326, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53232116, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21972656, + "step": 8975, + "time_per_iteration": 3.0552287101745605 + }, + { + "auxiliary_loss_clip": 0.01110098, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.02723336, + "balance_loss_mlp": 1.03983927, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.5397959415241314, + "language_loss": 0.71919322, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74069834, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8976, + "time_per_iteration": 2.5368118286132812 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.02300107, + "balance_loss_mlp": 1.03994215, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 2.148603673983975, + "language_loss": 0.70274073, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.72421718, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 8977, + "time_per_iteration": 2.4685816764831543 + }, + { + "auxiliary_loss_clip": 0.01113255, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.02102757, + "balance_loss_mlp": 1.04169548, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.656094242871676, + "language_loss": 0.7241326, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.7456063, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8978, + "time_per_iteration": 2.4495601654052734 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.01778078, + "balance_loss_mlp": 1.04137743, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.582100330429111, + "language_loss": 0.73947239, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76098353, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 8979, + "time_per_iteration": 2.467693328857422 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.03538978, + "balance_loss_mlp": 1.04216337, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.0456901795615656, + "language_loss": 0.76959479, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79127216, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 8980, + "time_per_iteration": 2.5299665927886963 + }, + { + "auxiliary_loss_clip": 0.01111255, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.02332902, + "balance_loss_mlp": 1.0388093, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.6658662418671077, + "language_loss": 0.81773221, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.83920264, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 8981, + "time_per_iteration": 2.593594789505005 + }, + { + "auxiliary_loss_clip": 0.01113866, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03922904, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.7978808319720327, + "language_loss": 0.66842318, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68988544, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.74609375, + "step": 8982, + "time_per_iteration": 2.5118813514709473 + }, + { + "auxiliary_loss_clip": 0.01110986, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.02715898, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.4560866330096367, + "language_loss": 0.82442951, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84593606, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8983, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02204108, + "balance_loss_mlp": 1.03799057, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7289170608138429, + "language_loss": 0.7078771, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72933447, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8984, + "time_per_iteration": 2.4523980617523193 + }, + { + "auxiliary_loss_clip": 0.01115801, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.04127955, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.7555929792269789, + "language_loss": 0.80110276, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82260621, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7421875, + "step": 8985, + "time_per_iteration": 2.446753740310669 + }, + { + "auxiliary_loss_clip": 0.01104654, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.01560616, + "balance_loss_mlp": 1.03796721, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 2.3719765019392844, + "language_loss": 0.78840292, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80973768, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.66796875, + "step": 8986, + "time_per_iteration": 2.5318102836608887 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01879263, + "balance_loss_mlp": 1.03847885, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.6989773263518806, + "language_loss": 0.77060419, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79201555, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8987, + "time_per_iteration": 2.524240732192993 + }, + { + "auxiliary_loss_clip": 0.01109666, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.0178858, + "balance_loss_mlp": 1.03889561, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 2.580263640738581, + "language_loss": 0.71292162, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73433048, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8988, + "time_per_iteration": 2.4638671875 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.0228405, + "balance_loss_mlp": 1.03822088, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.2630612952232827, + "language_loss": 0.67666376, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69813585, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.71875, + "step": 8989, + "time_per_iteration": 2.508855104446411 + }, + { + "auxiliary_loss_clip": 0.01111455, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02386594, + "balance_loss_mlp": 1.03881633, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5798861838358007, + "language_loss": 0.77628905, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79777759, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8990, + "time_per_iteration": 2.489483118057251 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.01594377, + "balance_loss_mlp": 1.03673029, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.5931818725193578, + "language_loss": 0.69039345, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71173859, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8991, + "time_per_iteration": 2.4418294429779053 + }, + { + "auxiliary_loss_clip": 0.01109673, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.02171683, + "balance_loss_mlp": 1.03739381, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.7624988623501092, + "language_loss": 0.7614572, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78290933, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8992, + "time_per_iteration": 2.4845540523529053 + }, + { + "auxiliary_loss_clip": 0.01110684, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.03731656, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.6314606707027304, + "language_loss": 0.76393229, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78536171, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8993, + "time_per_iteration": 2.4074175357818604 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.02129519, + "balance_loss_mlp": 1.03785443, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6731423627794038, + "language_loss": 0.70444834, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72585857, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8994, + "time_per_iteration": 2.5207760334014893 + }, + { + "auxiliary_loss_clip": 0.01110631, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.01729715, + "balance_loss_mlp": 1.03817177, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.7966588085871025, + "language_loss": 0.74846065, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.76987815, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8995, + "time_per_iteration": 2.468820095062256 + }, + { + "auxiliary_loss_clip": 0.01107091, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02315211, + "balance_loss_mlp": 1.0381844, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.845320286189123, + "language_loss": 0.73867524, + "learning_rate": 1.832533059471282e-06, + "loss": 0.7600975, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8996, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.01105028, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02183414, + "balance_loss_mlp": 1.03760076, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.7779086932858201, + "language_loss": 0.73281908, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75420916, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8997, + "time_per_iteration": 2.433438301086426 + }, + { + "auxiliary_loss_clip": 0.01109644, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02052116, + "balance_loss_mlp": 1.03904319, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.01233035965423, + "language_loss": 0.71775877, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73919159, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8998, + "time_per_iteration": 2.4791901111602783 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.03724909, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.596226887866337, + "language_loss": 0.70601052, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72746068, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 8999, + "time_per_iteration": 2.6774816513061523 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.0222373, + "balance_loss_mlp": 1.03789854, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.5727427903087716, + "language_loss": 0.80433559, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.8257547, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 9000, + "time_per_iteration": 2.4608795642852783 + }, + { + "auxiliary_loss_clip": 0.0110639, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.01646805, + "balance_loss_mlp": 1.03770971, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.4688376580267075, + "language_loss": 0.72885478, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75021398, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 9001, + "time_per_iteration": 2.469433069229126 + }, + { + "auxiliary_loss_clip": 0.01112566, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.0213902, + "balance_loss_mlp": 1.03844023, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.257759724972284, + "language_loss": 0.85127461, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87275422, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 9002, + "time_per_iteration": 2.4405739307403564 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.02020574, + "balance_loss_mlp": 1.0384078, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7125809204353786, + "language_loss": 0.77755821, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.79894257, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 9003, + "time_per_iteration": 2.451507806777954 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.01557827, + "balance_loss_mlp": 1.03640234, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 2.168361582224207, + "language_loss": 0.69784325, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71918762, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 9004, + "time_per_iteration": 2.613961935043335 + }, + { + "auxiliary_loss_clip": 0.01028073, + "auxiliary_loss_mlp": 0.01010119, + "balance_loss_clip": 1.00899816, + "balance_loss_mlp": 1.00624812, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9677352946959291, + "language_loss": 0.59124619, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61162812, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21875, + "step": 9005, + "time_per_iteration": 3.175964832305908 + }, + { + "auxiliary_loss_clip": 0.01110665, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02183771, + "balance_loss_mlp": 1.03938627, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 1.6968329328942213, + "language_loss": 0.77685302, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79829788, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 9006, + "time_per_iteration": 2.455742359161377 + }, + { + "auxiliary_loss_clip": 0.01104494, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.02205062, + "balance_loss_mlp": 1.03625751, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.624690870596759, + "language_loss": 0.82998371, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.8513571, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.68359375, + "step": 9007, + "time_per_iteration": 2.4356093406677246 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03761423, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.1377427178959434, + "language_loss": 0.67209023, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69347185, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9008, + "time_per_iteration": 2.5489509105682373 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.01810145, + "balance_loss_mlp": 1.03802204, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.189253604566193, + "language_loss": 0.74129766, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76273382, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 9009, + "time_per_iteration": 3.8252077102661133 + }, + { + "auxiliary_loss_clip": 0.01110449, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.02109861, + "balance_loss_mlp": 1.03791738, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.141173328238238, + "language_loss": 0.87482637, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89627492, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 9010, + "time_per_iteration": 2.4628190994262695 + }, + { + "auxiliary_loss_clip": 0.01106778, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02007151, + "balance_loss_mlp": 1.03684556, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.9800903494769417, + "language_loss": 0.64830345, + "learning_rate": 1.826712372694122e-06, + "loss": 0.66969872, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9011, + "time_per_iteration": 2.530303955078125 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02463341, + "balance_loss_mlp": 1.03945065, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 3.61342010762258, + "language_loss": 0.79000378, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81146884, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9012, + "time_per_iteration": 5.477705240249634 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01720369, + "balance_loss_mlp": 1.0364089, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.7419259634167055, + "language_loss": 0.74031919, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76168299, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9013, + "time_per_iteration": 3.8720171451568604 + }, + { + "auxiliary_loss_clip": 0.01109547, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.01657844, + "balance_loss_mlp": 1.0367403, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.040050456437719, + "language_loss": 0.72289932, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74429148, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 9014, + "time_per_iteration": 2.436251640319824 + }, + { + "auxiliary_loss_clip": 0.01108382, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01889062, + "balance_loss_mlp": 1.03802454, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.601636110073364, + "language_loss": 0.80585766, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82725561, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9015, + "time_per_iteration": 2.4523091316223145 + }, + { + "auxiliary_loss_clip": 0.01112438, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.02298188, + "balance_loss_mlp": 1.03929543, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 3.6814275573944717, + "language_loss": 0.81413746, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83562374, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9016, + "time_per_iteration": 2.4310686588287354 + }, + { + "auxiliary_loss_clip": 0.01107219, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.03753281, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 2.1017981350927646, + "language_loss": 0.81103092, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83240461, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9017, + "time_per_iteration": 2.427536725997925 + }, + { + "auxiliary_loss_clip": 0.01104389, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.01719928, + "balance_loss_mlp": 1.03666961, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.7397815948262747, + "language_loss": 0.77372575, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79506552, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9018, + "time_per_iteration": 2.4533066749572754 + }, + { + "auxiliary_loss_clip": 0.01107196, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.02436996, + "balance_loss_mlp": 1.03481603, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.448924926163926, + "language_loss": 0.66352963, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68497658, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9019, + "time_per_iteration": 2.6830832958221436 + }, + { + "auxiliary_loss_clip": 0.01103655, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.0157038, + "balance_loss_mlp": 1.03604794, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5485094933207573, + "language_loss": 0.69635725, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71766162, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9020, + "time_per_iteration": 2.5516250133514404 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.02415812, + "balance_loss_mlp": 1.03544152, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.4647880942088878, + "language_loss": 0.80443847, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.825822, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 9021, + "time_per_iteration": 2.52411150932312 + }, + { + "auxiliary_loss_clip": 0.01107355, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.02107835, + "balance_loss_mlp": 1.03812504, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.483970922248673, + "language_loss": 0.78272343, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80413187, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9022, + "time_per_iteration": 2.4745841026306152 + }, + { + "auxiliary_loss_clip": 0.01104936, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02170706, + "balance_loss_mlp": 1.03559494, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.6624827413591161, + "language_loss": 0.82107073, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84245884, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9023, + "time_per_iteration": 2.4953298568725586 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01670718, + "balance_loss_mlp": 1.03815961, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.8210142178846183, + "language_loss": 0.71515894, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73651719, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9024, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01107389, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01878023, + "balance_loss_mlp": 1.03640127, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.659326462636006, + "language_loss": 0.64976329, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67114621, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9025, + "time_per_iteration": 2.512734889984131 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01776159, + "balance_loss_mlp": 1.0378685, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 6.402510966233504, + "language_loss": 0.74099922, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76238489, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9026, + "time_per_iteration": 2.42434024810791 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01040251, + "balance_loss_clip": 1.02587438, + "balance_loss_mlp": 1.03585124, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.637995325273745, + "language_loss": 0.78638506, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80786121, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 9027, + "time_per_iteration": 2.488490104675293 + }, + { + "auxiliary_loss_clip": 0.01029187, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.00479341, + "balance_loss_mlp": 1.00745916, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7366554152868067, + "language_loss": 0.56548405, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58583641, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21679688, + "step": 9028, + "time_per_iteration": 3.0799479484558105 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03760409, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.289578054979344, + "language_loss": 0.7793408, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80073547, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 9029, + "time_per_iteration": 2.454566478729248 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03734791, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.5369423730734595, + "language_loss": 0.83306921, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85440123, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 9030, + "time_per_iteration": 2.4675095081329346 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.01676893, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5422544284751551, + "language_loss": 0.74720484, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.76854396, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9031, + "time_per_iteration": 2.4871413707733154 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01883435, + "balance_loss_mlp": 1.03710687, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.9031998711979703, + "language_loss": 0.85544586, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87678427, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9032, + "time_per_iteration": 2.492750406265259 + }, + { + "auxiliary_loss_clip": 0.01110136, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.01670289, + "balance_loss_mlp": 1.03757548, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.71218946587007, + "language_loss": 0.73568988, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.75708508, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9033, + "time_per_iteration": 2.458281993865967 + }, + { + "auxiliary_loss_clip": 0.01106249, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02057564, + "balance_loss_mlp": 1.03709424, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6976408638259588, + "language_loss": 0.75797909, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77937472, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9034, + "time_per_iteration": 2.491690158843994 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.01932836, + "balance_loss_mlp": 1.03710067, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.7098309272106547, + "language_loss": 0.84488094, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86624634, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9035, + "time_per_iteration": 2.4352262020111084 + }, + { + "auxiliary_loss_clip": 0.01028064, + "auxiliary_loss_mlp": 0.01007827, + "balance_loss_clip": 1.00669503, + "balance_loss_mlp": 1.00628209, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7231810753813949, + "language_loss": 0.55908412, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57944304, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.21777344, + "step": 9036, + "time_per_iteration": 3.041694402694702 + }, + { + "auxiliary_loss_clip": 0.01108199, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02147961, + "balance_loss_mlp": 1.03686309, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.5099374695532384, + "language_loss": 0.75264686, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77407253, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9037, + "time_per_iteration": 2.4950051307678223 + }, + { + "auxiliary_loss_clip": 0.01106194, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.02301288, + "balance_loss_mlp": 1.03557479, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.5216693219084618, + "language_loss": 0.66438931, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68580532, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9038, + "time_per_iteration": 2.559807777404785 + }, + { + "auxiliary_loss_clip": 0.01103453, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.0184598, + "balance_loss_mlp": 1.03513312, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.8787316560909988, + "language_loss": 0.78100199, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80233729, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9039, + "time_per_iteration": 2.4654388427734375 + }, + { + "auxiliary_loss_clip": 0.01106931, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02066386, + "balance_loss_mlp": 1.03744531, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.8309305249268624, + "language_loss": 0.76449573, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78589433, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9040, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.0102829, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.00074422, + "balance_loss_mlp": 1.0065496, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6649082596858222, + "language_loss": 0.52501261, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54531443, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.21777344, + "step": 9041, + "time_per_iteration": 3.0513055324554443 + }, + { + "auxiliary_loss_clip": 0.01105303, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.02383089, + "balance_loss_mlp": 1.03610432, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.5670483715805776, + "language_loss": 0.76206207, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78348053, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9042, + "time_per_iteration": 2.4679293632507324 + }, + { + "auxiliary_loss_clip": 0.01102475, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.02001429, + "balance_loss_mlp": 1.03483939, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.637929025007711, + "language_loss": 0.67479855, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69613945, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 9043, + "time_per_iteration": 2.469393730163574 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.019876, + "balance_loss_mlp": 1.03556848, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.6229792564391676, + "language_loss": 0.8417449, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86307919, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9044, + "time_per_iteration": 2.4827311038970947 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.01375592, + "balance_loss_mlp": 1.03744245, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 4.385221285903045, + "language_loss": 0.6211096, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.6424917, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 9045, + "time_per_iteration": 2.5340473651885986 + }, + { + "auxiliary_loss_clip": 0.01108322, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01814771, + "balance_loss_mlp": 1.03780746, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.4286240482824728, + "language_loss": 0.69942701, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72081935, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9046, + "time_per_iteration": 2.4620296955108643 + }, + { + "auxiliary_loss_clip": 0.01104565, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01592338, + "balance_loss_mlp": 1.03681147, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 2.1944623143587667, + "language_loss": 0.77171725, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79304034, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9047, + "time_per_iteration": 2.4618160724639893 + }, + { + "auxiliary_loss_clip": 0.01107988, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0232358, + "balance_loss_mlp": 1.03817999, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7709524835714412, + "language_loss": 0.72530591, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74673903, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9048, + "time_per_iteration": 2.43306827545166 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.01999831, + "balance_loss_mlp": 1.03979266, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.1212679973875805, + "language_loss": 0.93380594, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95521486, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 9049, + "time_per_iteration": 2.4344465732574463 + }, + { + "auxiliary_loss_clip": 0.01102747, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01810968, + "balance_loss_mlp": 1.0347991, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.8375314287256255, + "language_loss": 0.73678643, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.75810736, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9050, + "time_per_iteration": 2.491992473602295 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.01630008, + "balance_loss_mlp": 1.0390985, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 1.7129729573051025, + "language_loss": 0.67238903, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69376987, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9051, + "time_per_iteration": 3.862109661102295 + }, + { + "auxiliary_loss_clip": 0.01104183, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.0174253, + "balance_loss_mlp": 1.03553367, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.6461015999412698, + "language_loss": 0.67748392, + "learning_rate": 1.810810185460011e-06, + "loss": 0.6988188, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9052, + "time_per_iteration": 2.5398967266082764 + }, + { + "auxiliary_loss_clip": 0.01108274, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.01914227, + "balance_loss_mlp": 1.03725493, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.7506645402052365, + "language_loss": 0.92625535, + "learning_rate": 1.810422473773436e-06, + "loss": 0.94765407, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9053, + "time_per_iteration": 2.4675142765045166 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02233112, + "balance_loss_mlp": 1.03685415, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.7890591975918206, + "language_loss": 0.83447516, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85590339, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9054, + "time_per_iteration": 5.314599275588989 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02034652, + "balance_loss_mlp": 1.04010189, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.3459133888285564, + "language_loss": 0.68981498, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.71124029, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9055, + "time_per_iteration": 3.926511287689209 + }, + { + "auxiliary_loss_clip": 0.01028465, + "auxiliary_loss_mlp": 0.00999723, + "balance_loss_clip": 0.99868602, + "balance_loss_mlp": 1.00688159, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7309752042107527, + "language_loss": 0.57659, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59687185, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.21582031, + "step": 9056, + "time_per_iteration": 3.0622963905334473 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01665783, + "balance_loss_mlp": 1.03565168, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.7313106745452744, + "language_loss": 0.69337952, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71474266, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9057, + "time_per_iteration": 2.4510855674743652 + }, + { + "auxiliary_loss_clip": 0.01106022, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.02320337, + "balance_loss_mlp": 1.03730392, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.1714933584662615, + "language_loss": 0.7508406, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77226055, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9058, + "time_per_iteration": 2.526362419128418 + }, + { + "auxiliary_loss_clip": 0.01028725, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00100732, + "balance_loss_mlp": 1.00713301, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7971345769694276, + "language_loss": 0.62662959, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64693761, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.21582031, + "step": 9059, + "time_per_iteration": 3.1505026817321777 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.01862383, + "balance_loss_mlp": 1.03710485, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.9373576881408119, + "language_loss": 0.791785, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81314969, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9060, + "time_per_iteration": 2.4754552841186523 + }, + { + "auxiliary_loss_clip": 0.01106659, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.02058554, + "balance_loss_mlp": 1.03625464, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.604299719110434, + "language_loss": 0.7939564, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81535506, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9061, + "time_per_iteration": 2.556467056274414 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.0152173, + "balance_loss_mlp": 1.03701198, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.7809339372629867, + "language_loss": 0.87091219, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89222574, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9062, + "time_per_iteration": 2.4758143424987793 + }, + { + "auxiliary_loss_clip": 0.01111266, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.01959074, + "balance_loss_mlp": 1.03804517, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.9589069040824287, + "language_loss": 0.82366961, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84511185, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 9063, + "time_per_iteration": 2.4351277351379395 + }, + { + "auxiliary_loss_clip": 0.01106592, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01580811, + "balance_loss_mlp": 1.0372479, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.809751627458355, + "language_loss": 0.63477433, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65612566, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9064, + "time_per_iteration": 2.5002574920654297 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01844406, + "balance_loss_mlp": 1.0378474, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.5950372697964212, + "language_loss": 0.79787326, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81927347, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9065, + "time_per_iteration": 2.485886335372925 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.01893747, + "balance_loss_mlp": 1.03695667, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 1.9866274876050938, + "language_loss": 0.78143919, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80277526, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9066, + "time_per_iteration": 2.4608097076416016 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.0161345, + "balance_loss_mlp": 1.03510523, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.7709941680506742, + "language_loss": 0.75842655, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.7797848, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9067, + "time_per_iteration": 2.4940598011016846 + }, + { + "auxiliary_loss_clip": 0.01114286, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02152777, + "balance_loss_mlp": 1.0393995, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.2574843156274, + "language_loss": 0.63637972, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65788054, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 9068, + "time_per_iteration": 2.570791244506836 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.03860283, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.608624941379858, + "language_loss": 0.7232843, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74469984, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9069, + "time_per_iteration": 2.49194073677063 + }, + { + "auxiliary_loss_clip": 0.01105915, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.01882815, + "balance_loss_mlp": 1.03988457, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.7038570560603954, + "language_loss": 0.74060583, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76196355, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9070, + "time_per_iteration": 2.4085381031036377 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02016675, + "balance_loss_mlp": 1.035869, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.9518916968876514, + "language_loss": 0.60487843, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62623858, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9071, + "time_per_iteration": 2.4736368656158447 + }, + { + "auxiliary_loss_clip": 0.01029891, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.00851762, + "balance_loss_mlp": 1.00855255, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.702361481728272, + "language_loss": 0.57095647, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59135079, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21386719, + "step": 9072, + "time_per_iteration": 3.1778738498687744 + }, + { + "auxiliary_loss_clip": 0.01104044, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.01834023, + "balance_loss_mlp": 1.03754437, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.6497532443668452, + "language_loss": 0.69947577, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72081387, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 9073, + "time_per_iteration": 2.414483070373535 + }, + { + "auxiliary_loss_clip": 0.01102116, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.02133226, + "balance_loss_mlp": 1.03575385, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.7860657423568516, + "language_loss": 0.71207851, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73342335, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 9074, + "time_per_iteration": 2.5126519203186035 + }, + { + "auxiliary_loss_clip": 0.01105462, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02541733, + "balance_loss_mlp": 1.03713095, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.7043380827263428, + "language_loss": 0.68845975, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70988691, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 9075, + "time_per_iteration": 2.4271233081817627 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.03828716, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 2.0277857780736155, + "language_loss": 0.804497, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.82584435, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66796875, + "step": 9076, + "time_per_iteration": 2.5117785930633545 + }, + { + "auxiliary_loss_clip": 0.01105415, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01892447, + "balance_loss_mlp": 1.03663969, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.583996751680831, + "language_loss": 0.80426413, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82562208, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9077, + "time_per_iteration": 2.4544837474823 + }, + { + "auxiliary_loss_clip": 0.0110649, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01880729, + "balance_loss_mlp": 1.03688538, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.9788210228225505, + "language_loss": 0.67737269, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69873917, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9078, + "time_per_iteration": 2.5323657989501953 + }, + { + "auxiliary_loss_clip": 0.01107395, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02072752, + "balance_loss_mlp": 1.03703523, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.8696943679753917, + "language_loss": 0.80740905, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82881159, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9079, + "time_per_iteration": 2.458158493041992 + }, + { + "auxiliary_loss_clip": 0.01110307, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01966298, + "balance_loss_mlp": 1.03775454, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.6840905516778153, + "language_loss": 0.75812018, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77954829, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9080, + "time_per_iteration": 2.4955971240997314 + }, + { + "auxiliary_loss_clip": 0.01108419, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.02007222, + "balance_loss_mlp": 1.03805685, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.4851521305720627, + "language_loss": 0.83080792, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85222232, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9081, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01722193, + "balance_loss_mlp": 1.03842843, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.5408403844848193, + "language_loss": 0.69658768, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.71798551, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9082, + "time_per_iteration": 2.472858428955078 + }, + { + "auxiliary_loss_clip": 0.01102277, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.01434886, + "balance_loss_mlp": 1.03546321, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.7415454834760362, + "language_loss": 0.66599333, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68727982, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 9083, + "time_per_iteration": 2.5756945610046387 + }, + { + "auxiliary_loss_clip": 0.01104147, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.01772594, + "balance_loss_mlp": 1.03678334, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.6516896910486423, + "language_loss": 0.78909004, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81042337, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 9084, + "time_per_iteration": 2.5361523628234863 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02093244, + "balance_loss_mlp": 1.03781819, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 2.0163372032767826, + "language_loss": 0.74970639, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77111256, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9085, + "time_per_iteration": 2.461916208267212 + }, + { + "auxiliary_loss_clip": 0.01107723, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01702785, + "balance_loss_mlp": 1.03705621, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 1.6682732441654566, + "language_loss": 0.74792248, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76929021, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9086, + "time_per_iteration": 2.530505657196045 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01707602, + "balance_loss_mlp": 1.03592753, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.5861549378759865, + "language_loss": 0.76987553, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79121786, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9087, + "time_per_iteration": 2.4786858558654785 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.03895903, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.736831801992395, + "language_loss": 0.77471095, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79616833, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9088, + "time_per_iteration": 2.450409173965454 + }, + { + "auxiliary_loss_clip": 0.01030156, + "auxiliary_loss_mlp": 0.01001999, + "balance_loss_clip": 1.0009743, + "balance_loss_mlp": 1.0086112, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7273835392783513, + "language_loss": 0.57771385, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59803545, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21484375, + "step": 9089, + "time_per_iteration": 3.1002800464630127 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.01875257, + "balance_loss_mlp": 1.03710759, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.6935215277859987, + "language_loss": 0.76448178, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78586286, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9090, + "time_per_iteration": 2.5178091526031494 + }, + { + "auxiliary_loss_clip": 0.0110913, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.0362854, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 2.128546091443876, + "language_loss": 0.73422724, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75567162, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9091, + "time_per_iteration": 2.4523463249206543 + }, + { + "auxiliary_loss_clip": 0.0110893, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02313828, + "balance_loss_mlp": 1.03835773, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.850730557544026, + "language_loss": 0.77855682, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.79999787, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9092, + "time_per_iteration": 2.463998556137085 + }, + { + "auxiliary_loss_clip": 0.01110185, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.01975393, + "balance_loss_mlp": 1.03879404, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 1.992080116269468, + "language_loss": 0.74526983, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76669919, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 9093, + "time_per_iteration": 3.8121659755706787 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.03643596, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 1.8684331289519012, + "language_loss": 0.69012475, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71159303, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9094, + "time_per_iteration": 2.406708240509033 + }, + { + "auxiliary_loss_clip": 0.0110964, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.02463508, + "balance_loss_mlp": 1.0408746, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.1943674750228426, + "language_loss": 0.68355155, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70500696, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6875, + "step": 9095, + "time_per_iteration": 2.4663615226745605 + }, + { + "auxiliary_loss_clip": 0.0110876, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.0244838, + "balance_loss_mlp": 1.04013026, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.544968347193232, + "language_loss": 0.66645032, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6878978, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9096, + "time_per_iteration": 5.378362417221069 + }, + { + "auxiliary_loss_clip": 0.01030132, + "auxiliary_loss_mlp": 0.00998409, + "balance_loss_clip": 0.99731266, + "balance_loss_mlp": 1.00865221, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7389922300516351, + "language_loss": 0.57573926, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59602463, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.21484375, + "step": 9097, + "time_per_iteration": 3.168614387512207 + }, + { + "auxiliary_loss_clip": 0.01030189, + "auxiliary_loss_mlp": 0.01002061, + "balance_loss_clip": 1.00105369, + "balance_loss_mlp": 1.00863671, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9052213801384115, + "language_loss": 0.64790761, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66823018, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.21484375, + "step": 9098, + "time_per_iteration": 3.01711106300354 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02399004, + "balance_loss_mlp": 1.03762555, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.9907442514686344, + "language_loss": 0.73179287, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75324905, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9099, + "time_per_iteration": 2.50752592086792 + }, + { + "auxiliary_loss_clip": 0.01105594, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03749669, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9036037415187144, + "language_loss": 0.72414565, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74548817, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 9100, + "time_per_iteration": 2.5455925464630127 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01381898, + "balance_loss_mlp": 1.03679228, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.608228209483335, + "language_loss": 0.67675304, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69807637, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9101, + "time_per_iteration": 2.638460397720337 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01515532, + "balance_loss_mlp": 1.03816807, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.6461027740418694, + "language_loss": 0.78004694, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80140156, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9102, + "time_per_iteration": 2.515669822692871 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.02434635, + "balance_loss_mlp": 1.04091179, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.641023318874669, + "language_loss": 0.72358656, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74505031, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9103, + "time_per_iteration": 2.516160249710083 + }, + { + "auxiliary_loss_clip": 0.0110583, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01571035, + "balance_loss_mlp": 1.03704, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3192542299458547, + "language_loss": 0.65333968, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.674676, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9104, + "time_per_iteration": 2.8076846599578857 + }, + { + "auxiliary_loss_clip": 0.01110613, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.0188477, + "balance_loss_mlp": 1.03879666, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.7582225342351636, + "language_loss": 0.81346989, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83489728, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9105, + "time_per_iteration": 2.4436333179473877 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.01836777, + "balance_loss_mlp": 1.03727031, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.5498107295674015, + "language_loss": 0.80534816, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82670921, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9106, + "time_per_iteration": 2.5293564796447754 + }, + { + "auxiliary_loss_clip": 0.01108965, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.02120996, + "balance_loss_mlp": 1.03986609, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7454593746340303, + "language_loss": 0.69378364, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71519959, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9107, + "time_per_iteration": 2.423023223876953 + }, + { + "auxiliary_loss_clip": 0.01110146, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.01680255, + "balance_loss_mlp": 1.03831339, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.6483473327352183, + "language_loss": 0.63088882, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65228057, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 9108, + "time_per_iteration": 2.4629247188568115 + }, + { + "auxiliary_loss_clip": 0.01106827, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.01526904, + "balance_loss_mlp": 1.03832912, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6809972098624877, + "language_loss": 0.74894333, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77027702, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 9109, + "time_per_iteration": 2.445711851119995 + }, + { + "auxiliary_loss_clip": 0.01105646, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.02130747, + "balance_loss_mlp": 1.03783536, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.9460400321268034, + "language_loss": 0.77668434, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79807919, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 9110, + "time_per_iteration": 2.4724810123443604 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.02193475, + "balance_loss_mlp": 1.03772378, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.7745449116751173, + "language_loss": 0.71189445, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73327577, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9111, + "time_per_iteration": 2.5220110416412354 + }, + { + "auxiliary_loss_clip": 0.01108238, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.02525544, + "balance_loss_mlp": 1.03890049, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5754245119869974, + "language_loss": 0.71029758, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73174989, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 9112, + "time_per_iteration": 2.4876022338867188 + }, + { + "auxiliary_loss_clip": 0.01108992, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.0176518, + "balance_loss_mlp": 1.03795052, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.4321144529101946, + "language_loss": 0.88027447, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90165925, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 9113, + "time_per_iteration": 2.4495129585266113 + }, + { + "auxiliary_loss_clip": 0.01110892, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.01863575, + "balance_loss_mlp": 1.04015231, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.4380357531145453, + "language_loss": 0.73040199, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75182521, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9114, + "time_per_iteration": 2.49124813079834 + }, + { + "auxiliary_loss_clip": 0.0110468, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03658402, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.7175878836105734, + "language_loss": 0.72105908, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74242127, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9115, + "time_per_iteration": 2.4818665981292725 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.01915491, + "balance_loss_mlp": 1.03801298, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.8153830213846445, + "language_loss": 0.7222048, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74362183, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9116, + "time_per_iteration": 2.4857382774353027 + }, + { + "auxiliary_loss_clip": 0.01108168, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.02634406, + "balance_loss_mlp": 1.03931904, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 2.1442712779415025, + "language_loss": 0.76391387, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78538126, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9117, + "time_per_iteration": 2.481539726257324 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.02069592, + "balance_loss_mlp": 1.03559899, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.6184993035700161, + "language_loss": 0.62667149, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64801455, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9118, + "time_per_iteration": 2.582087516784668 + }, + { + "auxiliary_loss_clip": 0.01112715, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01705241, + "balance_loss_mlp": 1.04148602, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 2.080656601028848, + "language_loss": 0.79054701, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81197661, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9119, + "time_per_iteration": 2.431641101837158 + }, + { + "auxiliary_loss_clip": 0.01106769, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.02143443, + "balance_loss_mlp": 1.0393101, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.6818671426073972, + "language_loss": 0.82585561, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84724402, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9120, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02538443, + "balance_loss_mlp": 1.03979588, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.7397757233914666, + "language_loss": 0.80841327, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82989895, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9121, + "time_per_iteration": 2.449951171875 + }, + { + "auxiliary_loss_clip": 0.01108531, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02164376, + "balance_loss_mlp": 1.03663361, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 2.0253856212842662, + "language_loss": 0.61077833, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63220894, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9122, + "time_per_iteration": 2.4943363666534424 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.02135706, + "balance_loss_mlp": 1.03908038, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7986157880414966, + "language_loss": 0.71862841, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74002087, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.68359375, + "step": 9123, + "time_per_iteration": 2.4815285205841064 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.01702476, + "balance_loss_mlp": 1.03875828, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 1.9471016807647592, + "language_loss": 0.83393133, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.8552959, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9124, + "time_per_iteration": 2.442490816116333 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02254486, + "balance_loss_mlp": 1.04040182, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.9388864941150135, + "language_loss": 0.79954362, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82098156, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9125, + "time_per_iteration": 2.5117273330688477 + }, + { + "auxiliary_loss_clip": 0.01109363, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.01844716, + "balance_loss_mlp": 1.03870225, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.35248102892353, + "language_loss": 0.74499249, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76639402, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9126, + "time_per_iteration": 2.481576442718506 + }, + { + "auxiliary_loss_clip": 0.01110687, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.01794803, + "balance_loss_mlp": 1.03789783, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.4816786154583212, + "language_loss": 0.66715956, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68857968, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9127, + "time_per_iteration": 2.462186574935913 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02183664, + "balance_loss_mlp": 1.03737557, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.7392555793748137, + "language_loss": 0.83598024, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85740006, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9128, + "time_per_iteration": 2.4559218883514404 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01948333, + "balance_loss_mlp": 1.03735828, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.8252742071628254, + "language_loss": 0.74370325, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76509559, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 9129, + "time_per_iteration": 2.443394422531128 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.0391345, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.8843985474075557, + "language_loss": 0.6325981, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65401739, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9130, + "time_per_iteration": 2.424933433532715 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02319455, + "balance_loss_mlp": 1.03782725, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 2.1259011139704804, + "language_loss": 0.62936115, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65081537, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9131, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.01109907, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01925564, + "balance_loss_mlp": 1.03880227, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.7299030045344002, + "language_loss": 0.74452615, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76594955, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9132, + "time_per_iteration": 2.456127166748047 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.01656091, + "balance_loss_mlp": 1.03589082, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.6111198761107228, + "language_loss": 0.8129831, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83432209, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.703125, + "step": 9133, + "time_per_iteration": 2.490236759185791 + }, + { + "auxiliary_loss_clip": 0.01106997, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02131128, + "balance_loss_mlp": 1.03802598, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.7268592344479874, + "language_loss": 0.70094633, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72235036, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9134, + "time_per_iteration": 3.827064275741577 + }, + { + "auxiliary_loss_clip": 0.01110087, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02014494, + "balance_loss_mlp": 1.03806603, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 1.744868024388231, + "language_loss": 0.61109304, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63251662, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 9135, + "time_per_iteration": 2.730273723602295 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02089787, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.7368953039767876, + "language_loss": 0.72582811, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74728173, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9136, + "time_per_iteration": 2.483704090118408 + }, + { + "auxiliary_loss_clip": 0.01111013, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.02256799, + "balance_loss_mlp": 1.03636873, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 3.852349726597511, + "language_loss": 0.68771708, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70918733, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 9137, + "time_per_iteration": 5.456461191177368 + }, + { + "auxiliary_loss_clip": 0.01031834, + "auxiliary_loss_mlp": 0.01007044, + "balance_loss_clip": 1.00602436, + "balance_loss_mlp": 1.01015878, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.9040496486989937, + "language_loss": 0.6527245, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.21679688, + "step": 9138, + "time_per_iteration": 4.559895753860474 + }, + { + "auxiliary_loss_clip": 0.01109871, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.01828778, + "balance_loss_mlp": 1.03911173, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.6793798945838962, + "language_loss": 0.74981934, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7712279, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9139, + "time_per_iteration": 2.4897236824035645 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03827238, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.7331605634368676, + "language_loss": 0.71274745, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73412126, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9140, + "time_per_iteration": 2.416760206222534 + }, + { + "auxiliary_loss_clip": 0.01105846, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.02035391, + "balance_loss_mlp": 1.03625703, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.6373657351429003, + "language_loss": 0.76304853, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78444046, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9141, + "time_per_iteration": 2.495957612991333 + }, + { + "auxiliary_loss_clip": 0.01104653, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01651192, + "balance_loss_mlp": 1.03816998, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.8000642859490852, + "language_loss": 0.74711812, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.76845098, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 9142, + "time_per_iteration": 2.476701259613037 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.01920366, + "balance_loss_mlp": 1.04044414, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 3.087747357168804, + "language_loss": 0.76516807, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78662473, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 9143, + "time_per_iteration": 2.4777820110321045 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.0188787, + "balance_loss_mlp": 1.03639066, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 4.124964872446098, + "language_loss": 0.79934669, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82070994, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 9144, + "time_per_iteration": 2.470946788787842 + }, + { + "auxiliary_loss_clip": 0.01109215, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.0163275, + "balance_loss_mlp": 1.03886819, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 2.259125962742438, + "language_loss": 0.71273595, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73411608, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9145, + "time_per_iteration": 2.5155293941497803 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03797007, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.760392083970442, + "language_loss": 0.70398986, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72534567, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 9146, + "time_per_iteration": 2.5837745666503906 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.01897275, + "balance_loss_mlp": 1.03747129, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.7328002119898687, + "language_loss": 0.6403445, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66168791, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 9147, + "time_per_iteration": 2.5004754066467285 + }, + { + "auxiliary_loss_clip": 0.01110115, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01714349, + "balance_loss_mlp": 1.04033351, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.3129813772985854, + "language_loss": 0.80632472, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82771873, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9148, + "time_per_iteration": 2.4941914081573486 + }, + { + "auxiliary_loss_clip": 0.01109987, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.0224669, + "balance_loss_mlp": 1.04013515, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.5952381042001647, + "language_loss": 0.78739786, + "learning_rate": 1.773237789559453e-06, + "loss": 0.80884099, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69921875, + "step": 9149, + "time_per_iteration": 2.5276949405670166 + }, + { + "auxiliary_loss_clip": 0.01108964, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.01695323, + "balance_loss_mlp": 1.03880644, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.0296810240639847, + "language_loss": 0.72119236, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74257326, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9150, + "time_per_iteration": 2.4646284580230713 + }, + { + "auxiliary_loss_clip": 0.01110946, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01450694, + "balance_loss_mlp": 1.03812099, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.6901514106805953, + "language_loss": 0.74800563, + "learning_rate": 1.772463906245477e-06, + "loss": 0.76939499, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9151, + "time_per_iteration": 2.4528467655181885 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.01572907, + "balance_loss_mlp": 1.03945291, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.835684303690663, + "language_loss": 0.76049578, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78186262, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9152, + "time_per_iteration": 2.4587628841400146 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.01691318, + "balance_loss_mlp": 1.03700173, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.7890824738540096, + "language_loss": 0.82162666, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84296966, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 9153, + "time_per_iteration": 2.490391492843628 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.02004111, + "balance_loss_mlp": 1.03787208, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7732052023343188, + "language_loss": 0.74143934, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76283687, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9154, + "time_per_iteration": 2.5304152965545654 + }, + { + "auxiliary_loss_clip": 0.01113689, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02184761, + "balance_loss_mlp": 1.04016376, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.4983591953206352, + "language_loss": 0.7257731, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74726045, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9155, + "time_per_iteration": 2.4642586708068848 + }, + { + "auxiliary_loss_clip": 0.01033812, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 1.00080609, + "balance_loss_mlp": 1.01202416, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7480439065154532, + "language_loss": 0.55414248, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57449913, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.21777344, + "step": 9156, + "time_per_iteration": 3.184554100036621 + }, + { + "auxiliary_loss_clip": 0.0110658, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01741338, + "balance_loss_mlp": 1.0373919, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.690497670143624, + "language_loss": 0.82608092, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84744143, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9157, + "time_per_iteration": 2.4718377590179443 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01764512, + "balance_loss_mlp": 1.04140961, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.5846917450647138, + "language_loss": 0.75262648, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77409017, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9158, + "time_per_iteration": 2.483400583267212 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01863742, + "balance_loss_mlp": 1.0392096, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.6248211907364027, + "language_loss": 0.69624805, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71761608, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 9159, + "time_per_iteration": 2.5159049034118652 + }, + { + "auxiliary_loss_clip": 0.01110817, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01913416, + "balance_loss_mlp": 1.03923249, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.7392637683079002, + "language_loss": 0.67766821, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.69909644, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9160, + "time_per_iteration": 2.5915122032165527 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.02106166, + "balance_loss_mlp": 1.03855252, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.9414097965551829, + "language_loss": 0.71404171, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7354309, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9161, + "time_per_iteration": 2.4698691368103027 + }, + { + "auxiliary_loss_clip": 0.0110819, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.02365494, + "balance_loss_mlp": 1.03864145, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 2.0077015754602985, + "language_loss": 0.69346386, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71490568, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9162, + "time_per_iteration": 2.514615297317505 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.02538323, + "balance_loss_mlp": 1.03850245, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6272332912595904, + "language_loss": 0.8531208, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87457901, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 9163, + "time_per_iteration": 2.55450439453125 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02065635, + "balance_loss_mlp": 1.0394969, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.5452929110279412, + "language_loss": 0.8063103, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.8277117, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9164, + "time_per_iteration": 2.477283239364624 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0152092, + "balance_loss_mlp": 1.04160368, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.8276675469309818, + "language_loss": 0.73409986, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75550359, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9165, + "time_per_iteration": 2.4870002269744873 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01700497, + "balance_loss_mlp": 1.03732443, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.8849650051461906, + "language_loss": 0.79019225, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81153595, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9166, + "time_per_iteration": 2.435049295425415 + }, + { + "auxiliary_loss_clip": 0.01108748, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.01596665, + "balance_loss_mlp": 1.03822398, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.033929506473001, + "language_loss": 0.76165509, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78302646, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9167, + "time_per_iteration": 2.474677562713623 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01631832, + "balance_loss_mlp": 1.03744709, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.261050601267758, + "language_loss": 0.79845661, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.81980425, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9168, + "time_per_iteration": 2.484435796737671 + }, + { + "auxiliary_loss_clip": 0.01110227, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.03901529, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.641322965099804, + "language_loss": 0.68934894, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71080542, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9169, + "time_per_iteration": 2.5206069946289062 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.014714, + "balance_loss_mlp": 1.03545678, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.0185216192280553, + "language_loss": 0.85350084, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87478477, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 9170, + "time_per_iteration": 2.4762823581695557 + }, + { + "auxiliary_loss_clip": 0.01031617, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.00143194, + "balance_loss_mlp": 1.00984073, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7807167648980764, + "language_loss": 0.5990442, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61938488, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21777344, + "step": 9171, + "time_per_iteration": 3.0934739112854004 + }, + { + "auxiliary_loss_clip": 0.01106302, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.02024603, + "balance_loss_mlp": 1.03768301, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.4242208217777272, + "language_loss": 0.701002, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72239029, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9172, + "time_per_iteration": 2.482672929763794 + }, + { + "auxiliary_loss_clip": 0.01104259, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.0176115, + "balance_loss_mlp": 1.03602123, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.708440744181033, + "language_loss": 0.75790203, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77924281, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9173, + "time_per_iteration": 2.476710557937622 + }, + { + "auxiliary_loss_clip": 0.01104019, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.02019644, + "balance_loss_mlp": 1.0371182, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.5740431144983165, + "language_loss": 0.74457419, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.76594019, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 9174, + "time_per_iteration": 2.4599406719207764 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.0173409, + "balance_loss_mlp": 1.03827941, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.784111045924148, + "language_loss": 0.72615731, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74753261, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9175, + "time_per_iteration": 2.5028982162475586 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02403331, + "balance_loss_mlp": 1.0378927, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8209397746213287, + "language_loss": 0.69452918, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71596849, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9176, + "time_per_iteration": 3.852022171020508 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01612973, + "balance_loss_mlp": 1.03734601, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.7630507090786165, + "language_loss": 0.70797551, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.7293011, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9177, + "time_per_iteration": 2.507990837097168 + }, + { + "auxiliary_loss_clip": 0.01109377, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01644588, + "balance_loss_mlp": 1.03980064, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.556329351454275, + "language_loss": 0.80197215, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82334423, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 9178, + "time_per_iteration": 2.4645802974700928 + }, + { + "auxiliary_loss_clip": 0.01110368, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.02744687, + "balance_loss_mlp": 1.03942454, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.5358645892565401, + "language_loss": 0.74621391, + "learning_rate": 1.761633217089826e-06, + "loss": 0.7677213, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9179, + "time_per_iteration": 4.023995399475098 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02213681, + "balance_loss_mlp": 1.0385108, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.8924336027697886, + "language_loss": 0.70433038, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72574437, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 9180, + "time_per_iteration": 4.060170650482178 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.02197158, + "balance_loss_mlp": 1.03808069, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.9150410275355574, + "language_loss": 0.66870642, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69012666, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9181, + "time_per_iteration": 2.4741644859313965 + }, + { + "auxiliary_loss_clip": 0.01109873, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01514149, + "balance_loss_mlp": 1.03774214, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9118124234638791, + "language_loss": 0.79398257, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81536245, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9182, + "time_per_iteration": 2.4744672775268555 + }, + { + "auxiliary_loss_clip": 0.01107607, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.0145787, + "balance_loss_mlp": 1.03817368, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.7815316362256517, + "language_loss": 0.82710314, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.84845054, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9183, + "time_per_iteration": 2.4999542236328125 + }, + { + "auxiliary_loss_clip": 0.01106614, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01474106, + "balance_loss_mlp": 1.03841662, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.3300741669264389, + "language_loss": 0.67200708, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69333941, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9184, + "time_per_iteration": 2.4747231006622314 + }, + { + "auxiliary_loss_clip": 0.01107758, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.01336932, + "balance_loss_mlp": 1.03818047, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.521307728440283, + "language_loss": 0.76197934, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78331435, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9185, + "time_per_iteration": 2.534573793411255 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.02194285, + "balance_loss_mlp": 1.0396924, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6519250451143856, + "language_loss": 0.7376985, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75913298, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9186, + "time_per_iteration": 2.5148305892944336 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.02137351, + "balance_loss_mlp": 1.04041481, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.3297788732806275, + "language_loss": 0.6611231, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68256783, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.7109375, + "step": 9187, + "time_per_iteration": 2.4953529834747314 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01550388, + "balance_loss_mlp": 1.0389905, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.699111440652827, + "language_loss": 0.77629888, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79766524, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9188, + "time_per_iteration": 2.4593770503997803 + }, + { + "auxiliary_loss_clip": 0.01105648, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.01829576, + "balance_loss_mlp": 1.03729725, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.837373875573988, + "language_loss": 0.81666493, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83803099, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 9189, + "time_per_iteration": 2.514223098754883 + }, + { + "auxiliary_loss_clip": 0.01105635, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.0153625, + "balance_loss_mlp": 1.03796136, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.3687672594772107, + "language_loss": 0.76419669, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78553367, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 9190, + "time_per_iteration": 2.4991939067840576 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.01837981, + "balance_loss_mlp": 1.03823757, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 3.1168017297152484, + "language_loss": 0.78959441, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81102753, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9191, + "time_per_iteration": 2.447239875793457 + }, + { + "auxiliary_loss_clip": 0.01106392, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01799178, + "balance_loss_mlp": 1.03781414, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.1697062429363427, + "language_loss": 0.68734175, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70870626, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 9192, + "time_per_iteration": 2.424194812774658 + }, + { + "auxiliary_loss_clip": 0.01104657, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.0210079, + "balance_loss_mlp": 1.03741503, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.580245881596358, + "language_loss": 0.77429307, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79565763, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 9193, + "time_per_iteration": 2.486544370651245 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02199149, + "balance_loss_mlp": 1.03775311, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.6936547327162281, + "language_loss": 0.78554469, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80694956, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9194, + "time_per_iteration": 2.446010112762451 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01032697, + "balance_loss_clip": 1.01982856, + "balance_loss_mlp": 1.03737998, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.6547854303314034, + "language_loss": 0.69580936, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71725023, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 9195, + "time_per_iteration": 2.633622407913208 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.01669717, + "balance_loss_mlp": 1.0401336, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.085899367605988, + "language_loss": 0.73877811, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76022422, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 9196, + "time_per_iteration": 2.4477953910827637 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.0202986, + "balance_loss_mlp": 1.03845131, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5760086547957552, + "language_loss": 0.76767844, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78907609, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9197, + "time_per_iteration": 2.4946064949035645 + }, + { + "auxiliary_loss_clip": 0.01104392, + "auxiliary_loss_mlp": 0.0102516, + "balance_loss_clip": 1.01429963, + "balance_loss_mlp": 1.03566051, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.6045583807501234, + "language_loss": 0.76419538, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78549087, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6875, + "step": 9198, + "time_per_iteration": 2.7027511596679688 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.0140028, + "balance_loss_mlp": 1.03652, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.7911524754161214, + "language_loss": 0.79089695, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81220573, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 9199, + "time_per_iteration": 2.5071682929992676 + }, + { + "auxiliary_loss_clip": 0.01106031, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03667951, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.789754163992573, + "language_loss": 0.64116317, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66252816, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9200, + "time_per_iteration": 2.453810214996338 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.01444292, + "balance_loss_mlp": 1.03949916, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.54627322023295, + "language_loss": 0.66172588, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.6831286, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9201, + "time_per_iteration": 2.5050048828125 + }, + { + "auxiliary_loss_clip": 0.01110041, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.02000964, + "balance_loss_mlp": 1.04039264, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.1300156031813624, + "language_loss": 0.60931027, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63073778, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9202, + "time_per_iteration": 2.454374074935913 + }, + { + "auxiliary_loss_clip": 0.01105546, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.01677179, + "balance_loss_mlp": 1.0374378, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.6333926311503897, + "language_loss": 0.64007318, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66141224, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9203, + "time_per_iteration": 2.520813226699829 + }, + { + "auxiliary_loss_clip": 0.01106796, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03710103, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5876710884236471, + "language_loss": 0.63839149, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65975416, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9204, + "time_per_iteration": 2.519796371459961 + }, + { + "auxiliary_loss_clip": 0.01103569, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01617515, + "balance_loss_mlp": 1.0357914, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.7042490030554438, + "language_loss": 0.77431834, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79562324, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 9205, + "time_per_iteration": 2.5149800777435303 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01799703, + "balance_loss_mlp": 1.03753543, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.5447277527142993, + "language_loss": 0.72338134, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74473095, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9206, + "time_per_iteration": 2.6088132858276367 + }, + { + "auxiliary_loss_clip": 0.01107088, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01891243, + "balance_loss_mlp": 1.03847539, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.9679878300179545, + "language_loss": 0.75601065, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77738333, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 9207, + "time_per_iteration": 2.4550647735595703 + }, + { + "auxiliary_loss_clip": 0.01112139, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.01861894, + "balance_loss_mlp": 1.03909707, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.4900859433120055, + "language_loss": 0.61790574, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6393466, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 9208, + "time_per_iteration": 2.4474070072174072 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01908827, + "balance_loss_mlp": 1.03917742, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.138498398763569, + "language_loss": 0.64059991, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66200066, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9209, + "time_per_iteration": 2.49118709564209 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02030945, + "balance_loss_mlp": 1.03779769, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.9091325066097349, + "language_loss": 0.8244276, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84582424, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9210, + "time_per_iteration": 2.479508876800537 + }, + { + "auxiliary_loss_clip": 0.01104462, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.01554608, + "balance_loss_mlp": 1.03640354, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.9903415105614328, + "language_loss": 0.72810864, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74942476, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9211, + "time_per_iteration": 2.56174635887146 + }, + { + "auxiliary_loss_clip": 0.01109862, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.02101886, + "balance_loss_mlp": 1.03795481, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.687820261734967, + "language_loss": 0.66492426, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68636549, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9212, + "time_per_iteration": 2.4493961334228516 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.01846039, + "balance_loss_mlp": 1.03564453, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.478127311181698, + "language_loss": 0.51676697, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53816813, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 9213, + "time_per_iteration": 2.5872037410736084 + }, + { + "auxiliary_loss_clip": 0.01111386, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.01868105, + "balance_loss_mlp": 1.03979373, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.9151587743929102, + "language_loss": 0.8548407, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87626791, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9214, + "time_per_iteration": 2.4696502685546875 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.01570582, + "balance_loss_mlp": 1.03970075, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.700191688942819, + "language_loss": 0.70016778, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72152174, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9215, + "time_per_iteration": 2.50022029876709 + }, + { + "auxiliary_loss_clip": 0.01109258, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.01724386, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.5266679061001223, + "language_loss": 0.73124695, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75263906, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9216, + "time_per_iteration": 2.4683403968811035 + }, + { + "auxiliary_loss_clip": 0.01105693, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.01916385, + "balance_loss_mlp": 1.03830385, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.9596921442179602, + "language_loss": 0.71501839, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73638952, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 9217, + "time_per_iteration": 2.542431592941284 + }, + { + "auxiliary_loss_clip": 0.01103432, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.01331282, + "balance_loss_mlp": 1.03553486, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.8113809838055568, + "language_loss": 0.7838676, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80515093, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9218, + "time_per_iteration": 3.8476054668426514 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.02095163, + "balance_loss_mlp": 1.03540277, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 2.0355993872839675, + "language_loss": 0.72591358, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74734467, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9219, + "time_per_iteration": 2.4924545288085938 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01039554, + "balance_loss_clip": 1.02701962, + "balance_loss_mlp": 1.03986812, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.546677051774663, + "language_loss": 0.71403503, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73554587, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 9220, + "time_per_iteration": 2.4362480640411377 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.01424217, + "balance_loss_mlp": 1.03777957, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6357699921116782, + "language_loss": 0.79294407, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81426674, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9221, + "time_per_iteration": 5.3692920207977295 + }, + { + "auxiliary_loss_clip": 0.01107012, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.01745725, + "balance_loss_mlp": 1.03750253, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.7434924477802918, + "language_loss": 0.83865321, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86002505, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9222, + "time_per_iteration": 2.5054023265838623 + }, + { + "auxiliary_loss_clip": 0.0111308, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.02092493, + "balance_loss_mlp": 1.04003119, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.7723513069494143, + "language_loss": 0.75498754, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77646095, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9223, + "time_per_iteration": 2.5140554904937744 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.0155921, + "balance_loss_mlp": 1.03917074, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.798104527740367, + "language_loss": 0.81975842, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84112704, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9224, + "time_per_iteration": 2.5273303985595703 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01040562, + "balance_loss_clip": 1.02769315, + "balance_loss_mlp": 1.0393647, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.764116317399656, + "language_loss": 0.5700891, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59160185, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9225, + "time_per_iteration": 2.4379100799560547 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.02342129, + "balance_loss_mlp": 1.03836024, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5085866030732613, + "language_loss": 0.67495418, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69641924, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9226, + "time_per_iteration": 2.4891088008880615 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.01961827, + "balance_loss_mlp": 1.03644681, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4051697234065024, + "language_loss": 0.74315172, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76454705, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9227, + "time_per_iteration": 2.4678173065185547 + }, + { + "auxiliary_loss_clip": 0.01114145, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01826084, + "balance_loss_mlp": 1.04228091, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 2.5448731753452405, + "language_loss": 0.73452151, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75597215, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9228, + "time_per_iteration": 2.4851813316345215 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01803231, + "balance_loss_mlp": 1.03902888, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 2.153919283771507, + "language_loss": 0.76069826, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.7821005, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9229, + "time_per_iteration": 2.4682509899139404 + }, + { + "auxiliary_loss_clip": 0.01110192, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.02623343, + "balance_loss_mlp": 1.03956127, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.3529022003633056, + "language_loss": 0.68695533, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70845366, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 9230, + "time_per_iteration": 2.4558916091918945 + }, + { + "auxiliary_loss_clip": 0.01110086, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.02054107, + "balance_loss_mlp": 1.03759503, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.0513203800368327, + "language_loss": 0.67574155, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69717568, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9231, + "time_per_iteration": 2.4816246032714844 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.02160072, + "balance_loss_mlp": 1.03941798, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.5458592279354035, + "language_loss": 0.77953124, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80101693, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9232, + "time_per_iteration": 2.5161256790161133 + }, + { + "auxiliary_loss_clip": 0.01106102, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02622199, + "balance_loss_mlp": 1.03777027, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.5305081634070101, + "language_loss": 0.82585824, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84729433, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 9233, + "time_per_iteration": 2.513498306274414 + }, + { + "auxiliary_loss_clip": 0.01112184, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.02717805, + "balance_loss_mlp": 1.03902006, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.1768956460608053, + "language_loss": 0.75171268, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77322543, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 9234, + "time_per_iteration": 2.4618585109710693 + }, + { + "auxiliary_loss_clip": 0.01105123, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.01663136, + "balance_loss_mlp": 1.03685272, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.1362991517660146, + "language_loss": 0.64992738, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.6712625, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9235, + "time_per_iteration": 2.4449851512908936 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.01977587, + "balance_loss_mlp": 1.03794515, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.8479272776295672, + "language_loss": 0.67863953, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70005023, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9236, + "time_per_iteration": 2.4798662662506104 + }, + { + "auxiliary_loss_clip": 0.01104311, + "auxiliary_loss_mlp": 0.01024908, + "balance_loss_clip": 1.0127244, + "balance_loss_mlp": 1.03731084, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 3.129052058582791, + "language_loss": 0.86174095, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88303316, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9237, + "time_per_iteration": 2.4789483547210693 + }, + { + "auxiliary_loss_clip": 0.01104495, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.02062178, + "balance_loss_mlp": 1.03669763, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.712591160520522, + "language_loss": 0.73281908, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75419307, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9238, + "time_per_iteration": 2.4812166690826416 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.01997221, + "balance_loss_mlp": 1.03750467, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.5735650405734192, + "language_loss": 0.78268331, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80410492, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9239, + "time_per_iteration": 2.6846883296966553 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.01580429, + "balance_loss_mlp": 1.03730011, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.4802036052022307, + "language_loss": 0.79760826, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.81896698, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9240, + "time_per_iteration": 2.4733242988586426 + }, + { + "auxiliary_loss_clip": 0.01107185, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02035236, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.5810234034759716, + "language_loss": 0.6520583, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67345387, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9241, + "time_per_iteration": 2.4733994007110596 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.03843307, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.9354963557050642, + "language_loss": 0.72742647, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74883944, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9242, + "time_per_iteration": 2.439195394515991 + }, + { + "auxiliary_loss_clip": 0.01109113, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02429366, + "balance_loss_mlp": 1.03737354, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.6615305539564786, + "language_loss": 0.63989079, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66135651, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9243, + "time_per_iteration": 2.5009653568267822 + }, + { + "auxiliary_loss_clip": 0.01109943, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01894689, + "balance_loss_mlp": 1.03998828, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.8112849174534187, + "language_loss": 0.75149089, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77290273, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9244, + "time_per_iteration": 2.475520610809326 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.01693511, + "balance_loss_mlp": 1.03605533, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.1432873648263473, + "language_loss": 0.74578094, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76708734, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9245, + "time_per_iteration": 2.45875883102417 + }, + { + "auxiliary_loss_clip": 0.01111156, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.01832068, + "balance_loss_mlp": 1.03885865, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.0585608296199, + "language_loss": 0.79468071, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81610441, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 9246, + "time_per_iteration": 2.5065393447875977 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03894639, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.99088564820557, + "language_loss": 0.73864704, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76005793, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9247, + "time_per_iteration": 2.535578489303589 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.02047944, + "balance_loss_mlp": 1.03822637, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.9448346084731214, + "language_loss": 0.76161623, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78303373, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9248, + "time_per_iteration": 2.4247324466705322 + }, + { + "auxiliary_loss_clip": 0.01031453, + "auxiliary_loss_mlp": 0.01002871, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00995636, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8418132845618771, + "language_loss": 0.59482312, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61516631, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21484375, + "step": 9249, + "time_per_iteration": 3.1760778427124023 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.01514411, + "balance_loss_mlp": 1.03505003, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8510226601540976, + "language_loss": 0.79942709, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82074124, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9250, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01107715, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.01952708, + "balance_loss_mlp": 1.03654897, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.627943398678235, + "language_loss": 0.68456143, + "learning_rate": 1.733816187358836e-06, + "loss": 0.70596004, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9251, + "time_per_iteration": 2.4627792835235596 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01753998, + "balance_loss_mlp": 1.03680301, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.9270315036455492, + "language_loss": 0.75472188, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77608371, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9252, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02020955, + "balance_loss_mlp": 1.0379473, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.5243167641625328, + "language_loss": 0.72841972, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74984354, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9253, + "time_per_iteration": 2.545469045639038 + }, + { + "auxiliary_loss_clip": 0.01108615, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01835763, + "balance_loss_mlp": 1.03873754, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.7630844010149394, + "language_loss": 0.8319999, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85338461, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 9254, + "time_per_iteration": 2.4762439727783203 + }, + { + "auxiliary_loss_clip": 0.01028463, + "auxiliary_loss_mlp": 0.00998119, + "balance_loss_clip": 0.99699229, + "balance_loss_mlp": 1.00661826, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.880020971367601, + "language_loss": 0.64831799, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66858381, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21875, + "step": 9255, + "time_per_iteration": 2.894592523574829 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.02330816, + "balance_loss_mlp": 1.04103208, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.9305562864951415, + "language_loss": 0.69224131, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71367919, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9256, + "time_per_iteration": 2.489379644393921 + }, + { + "auxiliary_loss_clip": 0.01102517, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01805019, + "balance_loss_mlp": 1.03555584, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.531147439374393, + "language_loss": 0.75793779, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77925408, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 9257, + "time_per_iteration": 2.484574794769287 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02477169, + "balance_loss_mlp": 1.03559875, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 4.5210433992726635, + "language_loss": 0.61403644, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63546175, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 9258, + "time_per_iteration": 2.4358863830566406 + }, + { + "auxiliary_loss_clip": 0.0110731, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01988161, + "balance_loss_mlp": 1.0372082, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.630618195357818, + "language_loss": 0.79231477, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81371492, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9259, + "time_per_iteration": 3.931269884109497 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.01859045, + "balance_loss_mlp": 1.03665948, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.9981692343252953, + "language_loss": 0.81332636, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83469915, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9260, + "time_per_iteration": 2.5092766284942627 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02450144, + "balance_loss_mlp": 1.03862071, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4782542821591422, + "language_loss": 0.68771613, + "learning_rate": 1.729956725348256e-06, + "loss": 0.70917082, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 9261, + "time_per_iteration": 2.5739381313323975 + }, + { + "auxiliary_loss_clip": 0.01027391, + "auxiliary_loss_mlp": 0.01004087, + "balance_loss_clip": 1.00296021, + "balance_loss_mlp": 1.00587916, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7282105219345391, + "language_loss": 0.61132908, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63164389, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21484375, + "step": 9262, + "time_per_iteration": 5.870652675628662 + }, + { + "auxiliary_loss_clip": 0.01108355, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.02145159, + "balance_loss_mlp": 1.0379622, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.6754840031905727, + "language_loss": 0.64504874, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66646421, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9263, + "time_per_iteration": 3.9533426761627197 + }, + { + "auxiliary_loss_clip": 0.01107431, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02047563, + "balance_loss_mlp": 1.03795195, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.058460487271679, + "language_loss": 0.73137188, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75277007, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9264, + "time_per_iteration": 2.493511199951172 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.02008092, + "balance_loss_mlp": 1.04015422, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.9025948017547305, + "language_loss": 0.75953865, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78095955, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9265, + "time_per_iteration": 2.4533309936523438 + }, + { + "auxiliary_loss_clip": 0.01103692, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.01909113, + "balance_loss_mlp": 1.03774786, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.366142740242795, + "language_loss": 0.7096293, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73096645, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 9266, + "time_per_iteration": 2.5045597553253174 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.02204871, + "balance_loss_mlp": 1.03720617, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7291111077620351, + "language_loss": 0.681355, + "learning_rate": 1.727641538728533e-06, + "loss": 0.7027576, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9267, + "time_per_iteration": 2.5197811126708984 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02367473, + "balance_loss_mlp": 1.03763127, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.9159467095237732, + "language_loss": 0.74278724, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76417124, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 9268, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01105844, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.0188365, + "balance_loss_mlp": 1.03773642, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 2.490438410193009, + "language_loss": 0.7539283, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77528816, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 9269, + "time_per_iteration": 2.5165016651153564 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01847041, + "balance_loss_mlp": 1.0366416, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.5593232015543566, + "language_loss": 0.82527506, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84663379, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 9270, + "time_per_iteration": 2.495546579360962 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.01927948, + "balance_loss_mlp": 1.03695226, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.4402155421947485, + "language_loss": 0.79217434, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81356287, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9271, + "time_per_iteration": 2.5050055980682373 + }, + { + "auxiliary_loss_clip": 0.01107417, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.01938963, + "balance_loss_mlp": 1.03778744, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.994384891359262, + "language_loss": 0.90424085, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92562819, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9272, + "time_per_iteration": 2.455949068069458 + }, + { + "auxiliary_loss_clip": 0.0110516, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01864076, + "balance_loss_mlp": 1.03754234, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.979276269767202, + "language_loss": 0.83862162, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85997909, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9273, + "time_per_iteration": 2.4802021980285645 + }, + { + "auxiliary_loss_clip": 0.01108902, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02752197, + "balance_loss_mlp": 1.03908944, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.0454885443684905, + "language_loss": 0.73996758, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76146781, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 9274, + "time_per_iteration": 2.4761173725128174 + }, + { + "auxiliary_loss_clip": 0.01116526, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.02121544, + "balance_loss_mlp": 1.04015088, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.9773966002159824, + "language_loss": 0.78126067, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.8027705, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 9275, + "time_per_iteration": 2.4496877193450928 + }, + { + "auxiliary_loss_clip": 0.01106389, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.03767419, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.6885485925360224, + "language_loss": 0.74829316, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76965177, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9276, + "time_per_iteration": 2.413726806640625 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.03508329, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.7672131346084554, + "language_loss": 0.75013113, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77147532, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9277, + "time_per_iteration": 2.4982142448425293 + }, + { + "auxiliary_loss_clip": 0.01102538, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02155328, + "balance_loss_mlp": 1.03504467, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.8714980055762023, + "language_loss": 0.71817064, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73952222, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9278, + "time_per_iteration": 2.4389007091522217 + }, + { + "auxiliary_loss_clip": 0.01109043, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.02185118, + "balance_loss_mlp": 1.0372287, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.6538282955120047, + "language_loss": 0.75750679, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77894545, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 9279, + "time_per_iteration": 2.5255484580993652 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01767397, + "balance_loss_mlp": 1.03544426, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 2.2545627368714034, + "language_loss": 0.67431748, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69566512, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9280, + "time_per_iteration": 2.5258350372314453 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02595139, + "balance_loss_mlp": 1.03626418, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.676674952402485, + "language_loss": 0.72964156, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75109941, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 9281, + "time_per_iteration": 2.505610466003418 + }, + { + "auxiliary_loss_clip": 0.01106676, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.01922011, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 2.9649443100281627, + "language_loss": 0.75254506, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77392066, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9282, + "time_per_iteration": 2.444455623626709 + }, + { + "auxiliary_loss_clip": 0.01104903, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.01507115, + "balance_loss_mlp": 1.03695285, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.6849195839549764, + "language_loss": 0.66588777, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68720585, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9283, + "time_per_iteration": 2.4668378829956055 + }, + { + "auxiliary_loss_clip": 0.01105958, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.01500154, + "balance_loss_mlp": 1.03703356, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 2.7565054625366305, + "language_loss": 0.8290503, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85036725, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9284, + "time_per_iteration": 2.430774688720703 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.01653099, + "balance_loss_mlp": 1.03554368, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.3933521300057836, + "language_loss": 0.85047686, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87182522, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9285, + "time_per_iteration": 2.4788479804992676 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03531575, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 3.198131799092361, + "language_loss": 0.73653531, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75790572, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9286, + "time_per_iteration": 2.439715623855591 + }, + { + "auxiliary_loss_clip": 0.01107925, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01946068, + "balance_loss_mlp": 1.03781044, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.684452503968906, + "language_loss": 0.74169838, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76308966, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 9287, + "time_per_iteration": 2.534813642501831 + }, + { + "auxiliary_loss_clip": 0.01112227, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.01780486, + "balance_loss_mlp": 1.03982437, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 2.339953652318452, + "language_loss": 0.75018406, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77161086, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9288, + "time_per_iteration": 2.470242977142334 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.0241785, + "balance_loss_mlp": 1.03922033, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 1.8804248151935914, + "language_loss": 0.77241838, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79388785, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9289, + "time_per_iteration": 2.5357422828674316 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01918483, + "balance_loss_mlp": 1.03802335, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.7341259817318901, + "language_loss": 0.61310709, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63455033, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.74609375, + "step": 9290, + "time_per_iteration": 2.479149580001831 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01678467, + "balance_loss_mlp": 1.03602409, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 1.9512495779204855, + "language_loss": 0.67988908, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70124876, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9291, + "time_per_iteration": 2.4684019088745117 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.0363071, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 2.2522167745355524, + "language_loss": 0.83802187, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.85943532, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9292, + "time_per_iteration": 2.550994873046875 + }, + { + "auxiliary_loss_clip": 0.01104675, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.02180171, + "balance_loss_mlp": 1.03674221, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 1.8368239448999808, + "language_loss": 0.73363894, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75502205, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9293, + "time_per_iteration": 2.5334718227386475 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.02023864, + "balance_loss_mlp": 1.03715324, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.6770372644425844, + "language_loss": 0.7251429, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.7465046, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9294, + "time_per_iteration": 2.4782567024230957 + }, + { + "auxiliary_loss_clip": 0.01107679, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01867914, + "balance_loss_mlp": 1.03769052, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.2895769976939437, + "language_loss": 0.68138099, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70276403, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 9295, + "time_per_iteration": 2.433671474456787 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.01949131, + "balance_loss_mlp": 1.03873825, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.7275865639530346, + "language_loss": 0.80619705, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82760113, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 9296, + "time_per_iteration": 2.4831361770629883 + }, + { + "auxiliary_loss_clip": 0.01106832, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.0200243, + "balance_loss_mlp": 1.03788233, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.8948732644892212, + "language_loss": 0.65465128, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67603648, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9297, + "time_per_iteration": 2.4711036682128906 + }, + { + "auxiliary_loss_clip": 0.01109853, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02202153, + "balance_loss_mlp": 1.03785491, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.6800872146948855, + "language_loss": 0.7513994, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77284867, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9298, + "time_per_iteration": 2.421066999435425 + }, + { + "auxiliary_loss_clip": 0.01031879, + "auxiliary_loss_mlp": 0.01018081, + "balance_loss_clip": 1.01695406, + "balance_loss_mlp": 1.01014686, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6830476030131911, + "language_loss": 0.52463478, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54513437, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21777344, + "step": 9299, + "time_per_iteration": 3.096731424331665 + }, + { + "auxiliary_loss_clip": 0.0110307, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01799822, + "balance_loss_mlp": 1.03608131, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.8758260689947703, + "language_loss": 0.68378884, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70511478, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9300, + "time_per_iteration": 2.5355281829833984 + }, + { + "auxiliary_loss_clip": 0.01107824, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.02798903, + "balance_loss_mlp": 1.0372839, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 1.868740801794004, + "language_loss": 0.81233132, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.83381754, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9301, + "time_per_iteration": 3.9131312370300293 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01026194, + "balance_loss_clip": 1.01370668, + "balance_loss_mlp": 1.03488898, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.564037719481304, + "language_loss": 0.67297423, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69427967, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9302, + "time_per_iteration": 2.484609365463257 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.01249897, + "balance_loss_mlp": 1.03721702, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 2.803806869845176, + "language_loss": 0.70999819, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73134387, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9303, + "time_per_iteration": 2.442859649658203 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.01791978, + "balance_loss_mlp": 1.03930676, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.540239070281283, + "language_loss": 0.72772652, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74906886, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 9304, + "time_per_iteration": 5.429321765899658 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01613426, + "balance_loss_mlp": 1.03523278, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.8535856395803625, + "language_loss": 0.77888674, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80021197, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9305, + "time_per_iteration": 3.8705790042877197 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.01390815, + "balance_loss_mlp": 1.03657615, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.7045399129758072, + "language_loss": 0.69334519, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7146163, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 9306, + "time_per_iteration": 2.4669442176818848 + }, + { + "auxiliary_loss_clip": 0.01030152, + "auxiliary_loss_mlp": 0.01003605, + "balance_loss_clip": 1.0025028, + "balance_loss_mlp": 1.00838459, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9104128938879268, + "language_loss": 0.60324359, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62358117, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9307, + "time_per_iteration": 3.167161703109741 + }, + { + "auxiliary_loss_clip": 0.01105033, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01972127, + "balance_loss_mlp": 1.03697395, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.9188877301503315, + "language_loss": 0.73981357, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76117194, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 9308, + "time_per_iteration": 2.544931650161743 + }, + { + "auxiliary_loss_clip": 0.01107282, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01833069, + "balance_loss_mlp": 1.03571653, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.8987333438245737, + "language_loss": 0.69393057, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71531588, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9309, + "time_per_iteration": 2.5008022785186768 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.01787376, + "balance_loss_mlp": 1.03872681, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 2.0715816525821458, + "language_loss": 0.75254035, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77394807, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.703125, + "step": 9310, + "time_per_iteration": 2.5096590518951416 + }, + { + "auxiliary_loss_clip": 0.01111521, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01953197, + "balance_loss_mlp": 1.03922331, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 4.006602699764322, + "language_loss": 0.69449794, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71593851, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9311, + "time_per_iteration": 2.5238418579101562 + }, + { + "auxiliary_loss_clip": 0.01103209, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01616526, + "balance_loss_mlp": 1.03474474, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8631623558730779, + "language_loss": 0.72497612, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74628901, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9312, + "time_per_iteration": 2.4980969429016113 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01871908, + "balance_loss_mlp": 1.03834271, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.9916809517025356, + "language_loss": 0.89106059, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91243219, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9313, + "time_per_iteration": 2.43849515914917 + }, + { + "auxiliary_loss_clip": 0.01107396, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.03886163, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.848557040479868, + "language_loss": 0.77809632, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79951894, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 9314, + "time_per_iteration": 2.4745004177093506 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01027591, + "balance_loss_clip": 1.0153954, + "balance_loss_mlp": 1.03497362, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.6135281246099127, + "language_loss": 0.7005592, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72187185, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9315, + "time_per_iteration": 2.523815631866455 + }, + { + "auxiliary_loss_clip": 0.0110827, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.02225423, + "balance_loss_mlp": 1.03666615, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 2.163442884097896, + "language_loss": 0.66467899, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68611002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9316, + "time_per_iteration": 2.530667304992676 + }, + { + "auxiliary_loss_clip": 0.01106878, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.01796496, + "balance_loss_mlp": 1.03770351, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.3805446029838624, + "language_loss": 0.86762506, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88900781, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69140625, + "step": 9317, + "time_per_iteration": 2.469134569168091 + }, + { + "auxiliary_loss_clip": 0.01108894, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03657329, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7151693589962669, + "language_loss": 0.77363193, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79507434, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9318, + "time_per_iteration": 2.4952752590179443 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02351773, + "balance_loss_mlp": 1.03302336, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.698102214619228, + "language_loss": 0.75956237, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.7809301, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9319, + "time_per_iteration": 2.479919910430908 + }, + { + "auxiliary_loss_clip": 0.01104648, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03689611, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.554434910389292, + "language_loss": 0.85508537, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9320, + "time_per_iteration": 2.511880874633789 + }, + { + "auxiliary_loss_clip": 0.01030962, + "auxiliary_loss_mlp": 0.00999706, + "balance_loss_clip": 0.99860352, + "balance_loss_mlp": 1.00918674, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7458732992694707, + "language_loss": 0.52630556, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54661226, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9321, + "time_per_iteration": 2.8576598167419434 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.01938033, + "balance_loss_mlp": 1.03744757, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.4865751697326912, + "language_loss": 0.74422431, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76557928, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9322, + "time_per_iteration": 2.480198383331299 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01982379, + "balance_loss_mlp": 1.03641856, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.8343710411867171, + "language_loss": 0.73661906, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.75800848, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9323, + "time_per_iteration": 2.5517938137054443 + }, + { + "auxiliary_loss_clip": 0.01109096, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.01386333, + "balance_loss_mlp": 1.03797293, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.5108510359489868, + "language_loss": 0.61287946, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63423753, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9324, + "time_per_iteration": 2.4675137996673584 + }, + { + "auxiliary_loss_clip": 0.01106981, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.01650345, + "balance_loss_mlp": 1.03693414, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 2.2169286979326768, + "language_loss": 0.87785721, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89921808, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9325, + "time_per_iteration": 2.4160819053649902 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01805806, + "balance_loss_mlp": 1.03765607, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6383695475184654, + "language_loss": 0.74048722, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76188105, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9326, + "time_per_iteration": 2.463094711303711 + }, + { + "auxiliary_loss_clip": 0.01112046, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.01620328, + "balance_loss_mlp": 1.0386548, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 3.3443611641012674, + "language_loss": 0.78365433, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80506855, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9327, + "time_per_iteration": 2.445756673812866 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.01842213, + "balance_loss_mlp": 1.03914046, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 2.5559440694427478, + "language_loss": 0.78508025, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80648255, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 9328, + "time_per_iteration": 2.5156970024108887 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01594675, + "balance_loss_mlp": 1.03623605, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.528557811702872, + "language_loss": 0.73765361, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.7589978, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9329, + "time_per_iteration": 2.4843335151672363 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.02325511, + "balance_loss_mlp": 1.03798938, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.6466003553704387, + "language_loss": 0.83545572, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85692906, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9330, + "time_per_iteration": 2.482752561569214 + }, + { + "auxiliary_loss_clip": 0.01031116, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.00147378, + "balance_loss_mlp": 1.0092634, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7161961657295335, + "language_loss": 0.57873559, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59907156, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21875, + "step": 9331, + "time_per_iteration": 3.063901662826538 + }, + { + "auxiliary_loss_clip": 0.01108686, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03850377, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.694841283599879, + "language_loss": 0.82141155, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84282017, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9332, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01113328, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02233052, + "balance_loss_mlp": 1.03915834, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7394490434662164, + "language_loss": 0.8172127, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83871055, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9333, + "time_per_iteration": 2.4251558780670166 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.01541877, + "balance_loss_mlp": 1.03641915, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.5456564302164297, + "language_loss": 0.73111224, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.7524507, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9334, + "time_per_iteration": 2.5241355895996094 + }, + { + "auxiliary_loss_clip": 0.01108924, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.02525675, + "balance_loss_mlp": 1.03886223, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7664531017043277, + "language_loss": 0.71317977, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73464751, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9335, + "time_per_iteration": 2.4215545654296875 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01659274, + "balance_loss_mlp": 1.0381881, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7059405915097856, + "language_loss": 0.76673937, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78812212, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9336, + "time_per_iteration": 2.456911087036133 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.0203793, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.253598480453168, + "language_loss": 0.644315, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66570294, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9337, + "time_per_iteration": 2.4435572624206543 + }, + { + "auxiliary_loss_clip": 0.01029918, + "auxiliary_loss_mlp": 0.01004848, + "balance_loss_clip": 1.00367343, + "balance_loss_mlp": 1.00804543, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9905116764848269, + "language_loss": 0.62572861, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64607626, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21875, + "step": 9338, + "time_per_iteration": 3.039401054382324 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01862049, + "balance_loss_mlp": 1.03832674, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.7660421922814409, + "language_loss": 0.65246809, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67388076, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9339, + "time_per_iteration": 2.5356857776641846 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.0203191, + "balance_loss_mlp": 1.03761101, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 3.5768294087083317, + "language_loss": 0.69863123, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72002614, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9340, + "time_per_iteration": 2.4699902534484863 + }, + { + "auxiliary_loss_clip": 0.01105107, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01596177, + "balance_loss_mlp": 1.03900409, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.8075752300654697, + "language_loss": 0.77621818, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.7975471, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9341, + "time_per_iteration": 2.456268072128296 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01759195, + "balance_loss_mlp": 1.03572893, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.9728763199974049, + "language_loss": 0.79315615, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81452906, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9342, + "time_per_iteration": 2.4534597396850586 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01735401, + "balance_loss_mlp": 1.03851485, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.593835689079262, + "language_loss": 0.76322573, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78464609, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9343, + "time_per_iteration": 3.8814024925231934 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.0245204, + "balance_loss_mlp": 1.03978682, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.5945215839270617, + "language_loss": 0.68185151, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70332778, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9344, + "time_per_iteration": 2.4659440517425537 + }, + { + "auxiliary_loss_clip": 0.01109593, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.02364254, + "balance_loss_mlp": 1.0381155, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.2863999357797202, + "language_loss": 0.66754413, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68900704, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9345, + "time_per_iteration": 2.5232093334198 + }, + { + "auxiliary_loss_clip": 0.01109525, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01317143, + "balance_loss_mlp": 1.03883803, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.8616054032141576, + "language_loss": 0.87347126, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89481902, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.70703125, + "step": 9346, + "time_per_iteration": 3.9651877880096436 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.02200019, + "balance_loss_mlp": 1.03657687, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.36966351637476, + "language_loss": 0.59370089, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61511469, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9347, + "time_per_iteration": 3.9802420139312744 + }, + { + "auxiliary_loss_clip": 0.01108812, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.02319539, + "balance_loss_mlp": 1.03742838, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.4273405009541107, + "language_loss": 0.68972194, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71118426, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7109375, + "step": 9348, + "time_per_iteration": 2.4413368701934814 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.01292634, + "balance_loss_mlp": 1.03695107, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9093659081457641, + "language_loss": 0.79040921, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81179428, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9349, + "time_per_iteration": 2.4354894161224365 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01527977, + "balance_loss_mlp": 1.03902698, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.4504118343525207, + "language_loss": 0.67282045, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69420421, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9350, + "time_per_iteration": 2.548351287841797 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02408016, + "balance_loss_mlp": 1.03830576, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.1113714103165884, + "language_loss": 0.78716242, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80863774, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9351, + "time_per_iteration": 2.4350974559783936 + }, + { + "auxiliary_loss_clip": 0.01112089, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01989186, + "balance_loss_mlp": 1.03818786, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.498970106789848, + "language_loss": 0.58875829, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.6102035, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 9352, + "time_per_iteration": 2.4637343883514404 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01988828, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2149782460758531, + "language_loss": 0.71828997, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73964, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 9353, + "time_per_iteration": 2.4747259616851807 + }, + { + "auxiliary_loss_clip": 0.01110024, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01584542, + "balance_loss_mlp": 1.03763878, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 5.092816610198626, + "language_loss": 0.75717902, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77856535, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9354, + "time_per_iteration": 2.412938356399536 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.02066851, + "balance_loss_mlp": 1.03783214, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 2.4650169046981434, + "language_loss": 0.72549778, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74694556, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 9355, + "time_per_iteration": 2.484099864959717 + }, + { + "auxiliary_loss_clip": 0.01108801, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.01700521, + "balance_loss_mlp": 1.03818929, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.8617046290731056, + "language_loss": 0.73371327, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75510186, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9356, + "time_per_iteration": 2.465129852294922 + }, + { + "auxiliary_loss_clip": 0.0110695, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.02280545, + "balance_loss_mlp": 1.03822494, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.9991704999969526, + "language_loss": 0.82985485, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85127592, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9357, + "time_per_iteration": 2.41115665435791 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797318, + "balance_loss_mlp": 1.03697777, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 1.9946457873090748, + "language_loss": 0.720213, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9358, + "time_per_iteration": 2.4276978969573975 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.03677905, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.1174896987661755, + "language_loss": 0.77650487, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79795527, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9359, + "time_per_iteration": 2.5595555305480957 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.0177722, + "balance_loss_mlp": 1.03723145, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.6788321894876823, + "language_loss": 0.70193481, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.7233184, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9360, + "time_per_iteration": 2.485053062438965 + }, + { + "auxiliary_loss_clip": 0.01030911, + "auxiliary_loss_mlp": 0.01001933, + "balance_loss_clip": 1.0007472, + "balance_loss_mlp": 1.00916827, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7762895856423075, + "language_loss": 0.55579072, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57611912, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21679688, + "step": 9361, + "time_per_iteration": 3.025913953781128 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.02153039, + "balance_loss_mlp": 1.03833425, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.3888397041491727, + "language_loss": 0.8183462, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83973688, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9362, + "time_per_iteration": 2.5037269592285156 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02248108, + "balance_loss_mlp": 1.03707612, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5280416781125297, + "language_loss": 0.74536633, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.7667737, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9363, + "time_per_iteration": 2.617192268371582 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.01944757, + "balance_loss_mlp": 1.03815794, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.6569550766143035, + "language_loss": 0.83350259, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85492432, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9364, + "time_per_iteration": 2.5304059982299805 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.0222224, + "balance_loss_mlp": 1.03869832, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.211298310091642, + "language_loss": 0.64659059, + "learning_rate": 1.689881739637642e-06, + "loss": 0.66800475, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9365, + "time_per_iteration": 2.4514007568359375 + }, + { + "auxiliary_loss_clip": 0.01114055, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.03817499, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 3.047674915648226, + "language_loss": 0.81461316, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83611768, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 9366, + "time_per_iteration": 2.4486207962036133 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.03850698, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4263654905382444, + "language_loss": 0.73047578, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75187254, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 9367, + "time_per_iteration": 2.4800310134887695 + }, + { + "auxiliary_loss_clip": 0.01030227, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.00840044, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6249011108272925, + "language_loss": 0.5348472, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55517572, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21875, + "step": 9368, + "time_per_iteration": 3.1797282695770264 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02317202, + "balance_loss_mlp": 1.03974152, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.7643271699947485, + "language_loss": 0.69015235, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71159542, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9369, + "time_per_iteration": 2.4736390113830566 + }, + { + "auxiliary_loss_clip": 0.01105862, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.03527367, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.7859826045223857, + "language_loss": 0.7540313, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77542865, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9370, + "time_per_iteration": 2.5553858280181885 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.01930332, + "balance_loss_mlp": 1.0373863, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 3.078957924920332, + "language_loss": 0.75699127, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77842218, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9371, + "time_per_iteration": 2.4327011108398438 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02386189, + "balance_loss_mlp": 1.03729022, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 2.3308389897051702, + "language_loss": 0.76292467, + "learning_rate": 1.687188770067285e-06, + "loss": 0.7843473, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9372, + "time_per_iteration": 2.447720766067505 + }, + { + "auxiliary_loss_clip": 0.01106021, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.02006888, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.0572116747420224, + "language_loss": 0.72010261, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.74148726, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9373, + "time_per_iteration": 2.4268109798431396 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01839924, + "balance_loss_mlp": 1.03994441, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3770492627250617, + "language_loss": 0.82499874, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84642255, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9374, + "time_per_iteration": 2.49582576751709 + }, + { + "auxiliary_loss_clip": 0.0110343, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.0145762, + "balance_loss_mlp": 1.03463507, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.5156995265370945, + "language_loss": 0.66020733, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68151033, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9375, + "time_per_iteration": 2.516523599624634 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.02434063, + "balance_loss_mlp": 1.03792977, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 5.168267369431286, + "language_loss": 0.80860347, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83006191, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 9376, + "time_per_iteration": 2.4961087703704834 + }, + { + "auxiliary_loss_clip": 0.01110113, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03650188, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.331404975713729, + "language_loss": 0.69354665, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71498632, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9377, + "time_per_iteration": 2.6732125282287598 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01828349, + "balance_loss_mlp": 1.03818166, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3430474289029712, + "language_loss": 0.74622703, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76757109, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9378, + "time_per_iteration": 2.4836812019348145 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03538918, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.4002466182561366, + "language_loss": 0.81976169, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84122968, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9379, + "time_per_iteration": 2.4185829162597656 + }, + { + "auxiliary_loss_clip": 0.01106862, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.01691699, + "balance_loss_mlp": 1.03549135, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.697413775835763, + "language_loss": 0.71534967, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73671436, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9380, + "time_per_iteration": 2.5077950954437256 + }, + { + "auxiliary_loss_clip": 0.01110271, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.02068686, + "balance_loss_mlp": 1.03794408, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 3.2105212283898905, + "language_loss": 0.74216485, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.7636112, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9381, + "time_per_iteration": 2.4029319286346436 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01963091, + "balance_loss_mlp": 1.03806376, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 3.316310717009383, + "language_loss": 0.72300208, + "learning_rate": 1.683342680176499e-06, + "loss": 0.7444247, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 9382, + "time_per_iteration": 2.501958131790161 + }, + { + "auxiliary_loss_clip": 0.01028829, + "auxiliary_loss_mlp": 0.00999503, + "balance_loss_clip": 0.99848998, + "balance_loss_mlp": 1.00756264, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7363360341332579, + "language_loss": 0.54461426, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5648976, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21289062, + "step": 9383, + "time_per_iteration": 3.2148938179016113 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01627028, + "balance_loss_mlp": 1.03699017, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.8140556963544339, + "language_loss": 0.71018171, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73159087, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 9384, + "time_per_iteration": 2.442484140396118 + }, + { + "auxiliary_loss_clip": 0.0110745, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.01751578, + "balance_loss_mlp": 1.03652072, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 7.95557819766849, + "language_loss": 0.76225626, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78363794, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9385, + "time_per_iteration": 3.928744316101074 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02080739, + "balance_loss_mlp": 1.0359602, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.157193633028955, + "language_loss": 0.82184142, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84322798, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9386, + "time_per_iteration": 2.397623062133789 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.0220114, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.006582014999343, + "language_loss": 0.6989364, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72041589, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 9387, + "time_per_iteration": 5.281404733657837 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.02143192, + "balance_loss_mlp": 1.03790522, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.551891117692425, + "language_loss": 0.74553275, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76697552, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9388, + "time_per_iteration": 4.091272830963135 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.01970327, + "balance_loss_mlp": 1.03551602, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.6063296237871756, + "language_loss": 0.82072294, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.8420645, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 9389, + "time_per_iteration": 2.4588046073913574 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.01995528, + "balance_loss_mlp": 1.03775918, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 1.8781979731175902, + "language_loss": 0.64145517, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66289902, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9390, + "time_per_iteration": 2.4152185916900635 + }, + { + "auxiliary_loss_clip": 0.01108689, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.01928711, + "balance_loss_mlp": 1.0396266, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.6485981004433565, + "language_loss": 0.91899133, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94038832, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9391, + "time_per_iteration": 2.4316937923431396 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.03941607, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 1.8545056387285421, + "language_loss": 0.60528994, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62679285, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9392, + "time_per_iteration": 2.524616003036499 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.01412547, + "balance_loss_mlp": 1.03683674, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 1.8891326454378248, + "language_loss": 0.81002814, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83136976, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 9393, + "time_per_iteration": 2.5394442081451416 + }, + { + "auxiliary_loss_clip": 0.01109875, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.0175252, + "balance_loss_mlp": 1.03945863, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6361233529041357, + "language_loss": 0.87129962, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89269751, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9394, + "time_per_iteration": 2.4735207557678223 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.04019666, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 2.1407868955990232, + "language_loss": 0.84850395, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.8699013, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 9395, + "time_per_iteration": 2.457840919494629 + }, + { + "auxiliary_loss_clip": 0.01029319, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 0.99876004, + "balance_loss_mlp": 1.00789344, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.857023745969297, + "language_loss": 0.58308172, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60337436, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21484375, + "step": 9396, + "time_per_iteration": 3.073537588119507 + }, + { + "auxiliary_loss_clip": 0.01110535, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.248812637940723, + "language_loss": 0.70105237, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72246206, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.72265625, + "step": 9397, + "time_per_iteration": 2.4962973594665527 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02008653, + "balance_loss_mlp": 1.03723562, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.751232513493423, + "language_loss": 0.66376907, + "learning_rate": 1.67719144001275e-06, + "loss": 0.68519312, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9398, + "time_per_iteration": 2.4747612476348877 + }, + { + "auxiliary_loss_clip": 0.01027927, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 0.99962217, + "balance_loss_mlp": 1.00642622, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.8050196413226386, + "language_loss": 0.58135325, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60164046, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9399, + "time_per_iteration": 3.043860912322998 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.01959336, + "balance_loss_mlp": 1.03663015, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8022721102148394, + "language_loss": 0.72654182, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.74797827, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 9400, + "time_per_iteration": 2.46345853805542 + }, + { + "auxiliary_loss_clip": 0.01112209, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02223074, + "balance_loss_mlp": 1.03858781, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.2275961694321254, + "language_loss": 0.61034292, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63182896, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 9401, + "time_per_iteration": 2.4518327713012695 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.01590967, + "balance_loss_mlp": 1.03578329, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8211208041554372, + "language_loss": 0.81334603, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.8346827, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9402, + "time_per_iteration": 2.4201457500457764 + }, + { + "auxiliary_loss_clip": 0.0110456, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.02154684, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.4814077209882908, + "language_loss": 0.77969164, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80106944, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9403, + "time_per_iteration": 2.5353829860687256 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01558244, + "balance_loss_mlp": 1.03666544, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.6092170779922605, + "language_loss": 0.68699729, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70834613, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9404, + "time_per_iteration": 2.4321181774139404 + }, + { + "auxiliary_loss_clip": 0.01102774, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01951897, + "balance_loss_mlp": 1.03503776, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.484491546437136, + "language_loss": 0.66842878, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.68976498, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 9405, + "time_per_iteration": 2.440232992172241 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.02001476, + "balance_loss_mlp": 1.03823268, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.9824391842040467, + "language_loss": 0.74238181, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76374286, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9406, + "time_per_iteration": 2.4748172760009766 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02006197, + "balance_loss_mlp": 1.03640151, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.7875183280919196, + "language_loss": 0.79345733, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81487745, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9407, + "time_per_iteration": 2.507815361022949 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.0192194, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.520930632215419, + "language_loss": 0.70626116, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.7276209, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 9408, + "time_per_iteration": 2.62674880027771 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.02018738, + "balance_loss_mlp": 1.03758848, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.0177540820880377, + "language_loss": 0.81701803, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83840877, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9409, + "time_per_iteration": 2.4532053470611572 + }, + { + "auxiliary_loss_clip": 0.01105936, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01446199, + "balance_loss_mlp": 1.03632855, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.7583452820695855, + "language_loss": 0.77886415, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80018914, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9410, + "time_per_iteration": 2.441938877105713 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.02188444, + "balance_loss_mlp": 1.0371294, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.4716186369957405, + "language_loss": 0.83512276, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85653877, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9411, + "time_per_iteration": 2.4718945026397705 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01870787, + "balance_loss_mlp": 1.03809881, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.235812012909735, + "language_loss": 0.67052126, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69195151, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9412, + "time_per_iteration": 2.4114651679992676 + }, + { + "auxiliary_loss_clip": 0.01102875, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03637409, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.4642683426161254, + "language_loss": 0.58723432, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60854244, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6640625, + "step": 9413, + "time_per_iteration": 2.5274460315704346 + }, + { + "auxiliary_loss_clip": 0.01102994, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03515315, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.4689493119012975, + "language_loss": 0.69065028, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71196759, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9414, + "time_per_iteration": 2.4249722957611084 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.0162462, + "balance_loss_mlp": 1.03464198, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 2.330719071721026, + "language_loss": 0.78351963, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80479658, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 9415, + "time_per_iteration": 2.4853508472442627 + }, + { + "auxiliary_loss_clip": 0.01027693, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.00167274, + "balance_loss_mlp": 1.00642896, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.686572948711127, + "language_loss": 0.49232727, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51263154, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.21289062, + "step": 9416, + "time_per_iteration": 3.1817550659179688 + }, + { + "auxiliary_loss_clip": 0.01106414, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.02036452, + "balance_loss_mlp": 1.03713977, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.6874553076405654, + "language_loss": 0.62577593, + "learning_rate": 1.6698909172706e-06, + "loss": 0.6471678, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9417, + "time_per_iteration": 2.5856666564941406 + }, + { + "auxiliary_loss_clip": 0.01107822, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03606224, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.797784660701456, + "language_loss": 0.68931323, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71070051, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9418, + "time_per_iteration": 2.4920060634613037 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.01873779, + "balance_loss_mlp": 1.035465, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.9782803688051387, + "language_loss": 0.64613676, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66751719, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9419, + "time_per_iteration": 2.5130629539489746 + }, + { + "auxiliary_loss_clip": 0.01028877, + "auxiliary_loss_mlp": 0.01005663, + "balance_loss_clip": 1.00455463, + "balance_loss_mlp": 1.00721812, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7373486000439856, + "language_loss": 0.59778821, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61813354, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.21679688, + "step": 9420, + "time_per_iteration": 3.1712303161621094 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.01874661, + "balance_loss_mlp": 1.03477347, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.7745364781392496, + "language_loss": 0.74103463, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76235008, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 9421, + "time_per_iteration": 2.4926223754882812 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.02292371, + "balance_loss_mlp": 1.03705812, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.8540803425049197, + "language_loss": 0.72345394, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74490201, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9422, + "time_per_iteration": 2.4081509113311768 + }, + { + "auxiliary_loss_clip": 0.01103997, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.02378821, + "balance_loss_mlp": 1.03694618, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.7305682094853587, + "language_loss": 0.81321973, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83460754, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 9423, + "time_per_iteration": 2.4871041774749756 + }, + { + "auxiliary_loss_clip": 0.01102932, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.0210824, + "balance_loss_mlp": 1.0354147, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.656660590859511, + "language_loss": 0.8069616, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82832569, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 9424, + "time_per_iteration": 2.4634275436401367 + }, + { + "auxiliary_loss_clip": 0.01111676, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.01844072, + "balance_loss_mlp": 1.03887486, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.8161233698436283, + "language_loss": 0.78745866, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80889738, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9425, + "time_per_iteration": 2.5064780712127686 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.01658988, + "balance_loss_mlp": 1.03674626, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.8642193992685885, + "language_loss": 0.5897873, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61113673, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 9426, + "time_per_iteration": 2.4720263481140137 + }, + { + "auxiliary_loss_clip": 0.01110856, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.01804113, + "balance_loss_mlp": 1.03823078, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 2.0557394177022768, + "language_loss": 0.81685758, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83826721, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 9427, + "time_per_iteration": 3.872758388519287 + }, + { + "auxiliary_loss_clip": 0.01104828, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02394485, + "balance_loss_mlp": 1.03744185, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.8776390907485432, + "language_loss": 0.86198628, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88339949, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9428, + "time_per_iteration": 2.4911303520202637 + }, + { + "auxiliary_loss_clip": 0.01112998, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.02427602, + "balance_loss_mlp": 1.04080331, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.1518083513194552, + "language_loss": 0.74125421, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.7627511, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9429, + "time_per_iteration": 3.9635231494903564 + }, + { + "auxiliary_loss_clip": 0.01109434, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.02127612, + "balance_loss_mlp": 1.03756118, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.7976574461964, + "language_loss": 0.7496838, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77112365, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9430, + "time_per_iteration": 3.8817877769470215 + }, + { + "auxiliary_loss_clip": 0.01106735, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02152157, + "balance_loss_mlp": 1.03621042, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 2.3751678803775285, + "language_loss": 0.7272107, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74861568, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9431, + "time_per_iteration": 2.51401948928833 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01810944, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.9291254540879526, + "language_loss": 0.73248518, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75378448, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 9432, + "time_per_iteration": 2.4319839477539062 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.02087343, + "balance_loss_mlp": 1.03681755, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.5888571716641233, + "language_loss": 0.77957594, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80097634, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 9433, + "time_per_iteration": 2.5169765949249268 + }, + { + "auxiliary_loss_clip": 0.01111851, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01645875, + "balance_loss_mlp": 1.03870261, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.7704673621088174, + "language_loss": 0.63839334, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65982234, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 9434, + "time_per_iteration": 2.4372098445892334 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.01495695, + "balance_loss_mlp": 1.03529072, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.879777953851778, + "language_loss": 0.66724491, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68854052, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9435, + "time_per_iteration": 2.5156021118164062 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.02028716, + "balance_loss_mlp": 1.03599691, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.3893571871291595, + "language_loss": 0.71398699, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73535293, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9436, + "time_per_iteration": 2.4815714359283447 + }, + { + "auxiliary_loss_clip": 0.01109121, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.01723647, + "balance_loss_mlp": 1.03756368, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.6654091498260946, + "language_loss": 0.73988926, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76127845, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9437, + "time_per_iteration": 2.5965943336486816 + }, + { + "auxiliary_loss_clip": 0.01112439, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04159832, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 2.439390833366172, + "language_loss": 0.60905057, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63049889, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9438, + "time_per_iteration": 2.512578010559082 + }, + { + "auxiliary_loss_clip": 0.01105416, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.01714146, + "balance_loss_mlp": 1.03543329, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.6600048607148805, + "language_loss": 0.75087392, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77221704, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9439, + "time_per_iteration": 2.531489133834839 + }, + { + "auxiliary_loss_clip": 0.01107772, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0159471, + "balance_loss_mlp": 1.03828883, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8930047517001285, + "language_loss": 0.8361944, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.857566, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 9440, + "time_per_iteration": 2.4386231899261475 + }, + { + "auxiliary_loss_clip": 0.01111147, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.02123022, + "balance_loss_mlp": 1.03704751, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.0023123091206467, + "language_loss": 0.7550447, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77650005, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9441, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.01108262, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02022874, + "balance_loss_mlp": 1.0381217, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 2.003106565766755, + "language_loss": 0.83199525, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85340512, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9442, + "time_per_iteration": 2.4066359996795654 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.0388906, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 2.099488848818881, + "language_loss": 0.74606907, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76741344, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 9443, + "time_per_iteration": 2.4699020385742188 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.02184737, + "balance_loss_mlp": 1.03892851, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 1.9353911334921245, + "language_loss": 0.77443373, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79587436, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9444, + "time_per_iteration": 2.418164014816284 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.02498603, + "balance_loss_mlp": 1.03886068, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6369546772732781, + "language_loss": 0.80673003, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82821453, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9445, + "time_per_iteration": 2.4474682807922363 + }, + { + "auxiliary_loss_clip": 0.01105393, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.0146122, + "balance_loss_mlp": 1.03579414, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.310891415120181, + "language_loss": 0.70843911, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72976023, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9446, + "time_per_iteration": 2.5338428020477295 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.018489, + "balance_loss_mlp": 1.03815663, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.8305308972685952, + "language_loss": 0.7354359, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75686181, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9447, + "time_per_iteration": 2.5152740478515625 + }, + { + "auxiliary_loss_clip": 0.01110587, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.0382061, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 2.262443693729548, + "language_loss": 0.74931812, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77074468, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9448, + "time_per_iteration": 2.468688488006592 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.0227201, + "balance_loss_mlp": 1.04175234, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.1518179799978356, + "language_loss": 0.76137841, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78289551, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9449, + "time_per_iteration": 2.510693311691284 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02046347, + "balance_loss_mlp": 1.03867984, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.6592475910366993, + "language_loss": 0.74742198, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76886022, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9450, + "time_per_iteration": 2.5034866333007812 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02404845, + "balance_loss_mlp": 1.04081213, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 3.8340234675809017, + "language_loss": 0.67216206, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69364059, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.71875, + "step": 9451, + "time_per_iteration": 2.503805637359619 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.01743114, + "balance_loss_mlp": 1.03788531, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 1.8009184427821863, + "language_loss": 0.71697223, + "learning_rate": 1.656454488573026e-06, + "loss": 0.7384392, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 9452, + "time_per_iteration": 2.4519643783569336 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01947021, + "balance_loss_mlp": 1.03679395, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.6525298490216664, + "language_loss": 0.70272237, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72409141, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9453, + "time_per_iteration": 2.5260796546936035 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.02190745, + "balance_loss_mlp": 1.03889799, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 2.2860746429720833, + "language_loss": 0.69546616, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71690989, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9454, + "time_per_iteration": 2.457736015319824 + }, + { + "auxiliary_loss_clip": 0.01103936, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.01837158, + "balance_loss_mlp": 1.03616297, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.8998375571155763, + "language_loss": 0.60430771, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.6256448, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9455, + "time_per_iteration": 2.506091594696045 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01808953, + "balance_loss_mlp": 1.0424788, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 2.102932497256003, + "language_loss": 0.72914851, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75062263, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9456, + "time_per_iteration": 2.439221143722534 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.02125049, + "balance_loss_mlp": 1.03915823, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.5692423529190727, + "language_loss": 0.76402628, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78543633, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.69140625, + "step": 9457, + "time_per_iteration": 2.475327491760254 + }, + { + "auxiliary_loss_clip": 0.01110625, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02338171, + "balance_loss_mlp": 1.03828931, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.8808926225586853, + "language_loss": 0.66305089, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68452305, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9458, + "time_per_iteration": 2.5271642208099365 + }, + { + "auxiliary_loss_clip": 0.01111416, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01424456, + "balance_loss_mlp": 1.03845215, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.21557799175144, + "language_loss": 0.67912495, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70051199, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 9459, + "time_per_iteration": 2.534374237060547 + }, + { + "auxiliary_loss_clip": 0.0111268, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.01942194, + "balance_loss_mlp": 1.04046702, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 3.4353012744759335, + "language_loss": 0.77999187, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.8014406, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9460, + "time_per_iteration": 2.434570789337158 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.02221131, + "balance_loss_mlp": 1.03767824, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7026913094631195, + "language_loss": 0.71950358, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74095166, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9461, + "time_per_iteration": 2.5527231693267822 + }, + { + "auxiliary_loss_clip": 0.01109281, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.03814745, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.8717094069028617, + "language_loss": 0.72976351, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75116074, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9462, + "time_per_iteration": 2.422624111175537 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01590848, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8933127595424433, + "language_loss": 0.7326529, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75395983, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 9463, + "time_per_iteration": 2.466491460800171 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.01810765, + "balance_loss_mlp": 1.03583968, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.7491308846328846, + "language_loss": 0.74368691, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76505989, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9464, + "time_per_iteration": 2.406031370162964 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.02382255, + "balance_loss_mlp": 1.03892159, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.714079864723851, + "language_loss": 0.84333247, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86480176, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9465, + "time_per_iteration": 2.514777183532715 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.01757169, + "balance_loss_mlp": 1.03546405, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.8589721720108319, + "language_loss": 0.7226572, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74398845, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9466, + "time_per_iteration": 2.475188732147217 + }, + { + "auxiliary_loss_clip": 0.01029497, + "auxiliary_loss_mlp": 0.01005385, + "balance_loss_clip": 1.00427043, + "balance_loss_mlp": 1.0077517, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7081654133828948, + "language_loss": 0.55354679, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57389557, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21777344, + "step": 9467, + "time_per_iteration": 3.185729742050171 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01861966, + "balance_loss_mlp": 1.03861189, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.2495356407271854, + "language_loss": 0.63680357, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65824717, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9468, + "time_per_iteration": 2.4373323917388916 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01587772, + "balance_loss_mlp": 1.03801632, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.8525378978069993, + "language_loss": 0.79367, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81503832, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9469, + "time_per_iteration": 3.8166728019714355 + }, + { + "auxiliary_loss_clip": 0.0111246, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.02810884, + "balance_loss_mlp": 1.03860152, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 1.9923541987272968, + "language_loss": 0.69606256, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71760333, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9470, + "time_per_iteration": 2.4572556018829346 + }, + { + "auxiliary_loss_clip": 0.01107845, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01882184, + "balance_loss_mlp": 1.03729832, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.5518202279497855, + "language_loss": 0.74791551, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76931024, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9471, + "time_per_iteration": 3.926091432571411 + }, + { + "auxiliary_loss_clip": 0.01108882, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02006447, + "balance_loss_mlp": 1.03928542, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9616270612820847, + "language_loss": 0.57270539, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59412026, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9472, + "time_per_iteration": 3.8452813625335693 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01886702, + "balance_loss_mlp": 1.03722382, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.803122156723958, + "language_loss": 0.73615265, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75750041, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 9473, + "time_per_iteration": 2.4637346267700195 + }, + { + "auxiliary_loss_clip": 0.01028797, + "auxiliary_loss_mlp": 0.01004803, + "balance_loss_clip": 1.00359905, + "balance_loss_mlp": 1.00722575, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6476817486149063, + "language_loss": 0.57596511, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59630114, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21679688, + "step": 9474, + "time_per_iteration": 3.09342622756958 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02095199, + "balance_loss_mlp": 1.03955841, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.7127367690076127, + "language_loss": 0.53624213, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55767071, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6875, + "step": 9475, + "time_per_iteration": 2.6103556156158447 + }, + { + "auxiliary_loss_clip": 0.01110194, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.01969719, + "balance_loss_mlp": 1.03914022, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5220537573313933, + "language_loss": 0.79891974, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82034773, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9476, + "time_per_iteration": 2.5519871711730957 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.02248454, + "balance_loss_mlp": 1.04071283, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 2.93922823935367, + "language_loss": 0.66361278, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68509227, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7265625, + "step": 9477, + "time_per_iteration": 2.556461811065674 + }, + { + "auxiliary_loss_clip": 0.01107946, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.01737881, + "balance_loss_mlp": 1.03697014, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.8188873629652118, + "language_loss": 0.70921832, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73060012, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9478, + "time_per_iteration": 2.5022385120391846 + }, + { + "auxiliary_loss_clip": 0.01104521, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.01615286, + "balance_loss_mlp": 1.03824937, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.5933810632151244, + "language_loss": 0.69647413, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71779716, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9479, + "time_per_iteration": 2.544422149658203 + }, + { + "auxiliary_loss_clip": 0.01103959, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01729965, + "balance_loss_mlp": 1.03753138, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4338626650619826, + "language_loss": 0.71364439, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.7349726, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9480, + "time_per_iteration": 2.5680878162384033 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.01615977, + "balance_loss_mlp": 1.03689599, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.894404055389402, + "language_loss": 0.71927261, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74063098, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9481, + "time_per_iteration": 2.4576737880706787 + }, + { + "auxiliary_loss_clip": 0.01108109, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.03819919, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6819252466037764, + "language_loss": 0.78134334, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80273211, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9482, + "time_per_iteration": 2.4244532585144043 + }, + { + "auxiliary_loss_clip": 0.01107032, + "auxiliary_loss_mlp": 0.01026772, + "balance_loss_clip": 1.01462436, + "balance_loss_mlp": 1.0372206, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.1918431398286686, + "language_loss": 0.77641654, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79775453, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9483, + "time_per_iteration": 2.4840755462646484 + }, + { + "auxiliary_loss_clip": 0.01106594, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.0217644, + "balance_loss_mlp": 1.037377, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 2.4281256207615702, + "language_loss": 0.8098467, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.8312493, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9484, + "time_per_iteration": 2.4726784229278564 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01798165, + "balance_loss_mlp": 1.03656316, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 9.175896769478262, + "language_loss": 0.60516417, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62655002, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9485, + "time_per_iteration": 2.5423014163970947 + }, + { + "auxiliary_loss_clip": 0.01107039, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03619039, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.6367482229195742, + "language_loss": 0.65350515, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67491084, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9486, + "time_per_iteration": 2.4597506523132324 + }, + { + "auxiliary_loss_clip": 0.01028731, + "auxiliary_loss_mlp": 0.01001408, + "balance_loss_clip": 1.00001299, + "balance_loss_mlp": 1.0072422, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6639559744347447, + "language_loss": 0.48005819, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50035954, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21484375, + "step": 9487, + "time_per_iteration": 3.139495849609375 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.02199435, + "balance_loss_mlp": 1.03726935, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 3.049670437576873, + "language_loss": 0.86058694, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88200867, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9488, + "time_per_iteration": 2.474616289138794 + }, + { + "auxiliary_loss_clip": 0.01111409, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.02065694, + "balance_loss_mlp": 1.03814459, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.4447763000600118, + "language_loss": 0.79057854, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81202483, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 9489, + "time_per_iteration": 2.5065059661865234 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01978409, + "balance_loss_mlp": 1.03869939, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7186115243718623, + "language_loss": 0.69906354, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72046351, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.703125, + "step": 9490, + "time_per_iteration": 2.431102752685547 + }, + { + "auxiliary_loss_clip": 0.01107746, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03836775, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.5472180668734579, + "language_loss": 0.76222062, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78358686, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9491, + "time_per_iteration": 2.4962759017944336 + }, + { + "auxiliary_loss_clip": 0.01028502, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.00276494, + "balance_loss_mlp": 1.00699997, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7944597612251223, + "language_loss": 0.57379556, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59412122, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.21484375, + "step": 9492, + "time_per_iteration": 3.0417838096618652 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02009118, + "balance_loss_mlp": 1.03813028, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.7217254573804663, + "language_loss": 0.71475661, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73617041, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 9493, + "time_per_iteration": 2.4304161071777344 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01807487, + "balance_loss_mlp": 1.0384568, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5364295350921338, + "language_loss": 0.77778745, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.7992059, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 9494, + "time_per_iteration": 2.495940923690796 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.02015769, + "balance_loss_mlp": 1.03685784, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.275602748234112, + "language_loss": 0.80153453, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82298625, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 9495, + "time_per_iteration": 2.464423418045044 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.02393782, + "balance_loss_mlp": 1.04061937, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 3.463558707959815, + "language_loss": 0.66745138, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68901181, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 9496, + "time_per_iteration": 2.460413694381714 + }, + { + "auxiliary_loss_clip": 0.01113845, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.02361047, + "balance_loss_mlp": 1.03911281, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.3847499053839067, + "language_loss": 0.6960094, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71752012, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 9497, + "time_per_iteration": 2.4051928520202637 + }, + { + "auxiliary_loss_clip": 0.01111626, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01712704, + "balance_loss_mlp": 1.03815341, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.8796088723274103, + "language_loss": 0.81200778, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83343083, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 9498, + "time_per_iteration": 2.4764246940612793 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02085817, + "balance_loss_mlp": 1.03874803, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.7968018947144153, + "language_loss": 0.66237068, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68383479, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9499, + "time_per_iteration": 2.4842209815979004 + }, + { + "auxiliary_loss_clip": 0.01109681, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01994455, + "balance_loss_mlp": 1.03881264, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 2.341189176641991, + "language_loss": 0.71659786, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73802078, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9500, + "time_per_iteration": 2.474968671798706 + }, + { + "auxiliary_loss_clip": 0.01113264, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.01911259, + "balance_loss_mlp": 1.03896177, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.7510176581013566, + "language_loss": 0.76148939, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78294659, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 9501, + "time_per_iteration": 2.481982707977295 + }, + { + "auxiliary_loss_clip": 0.01112022, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.03827071, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.6683693962706503, + "language_loss": 0.75252867, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.7739566, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.73828125, + "step": 9502, + "time_per_iteration": 2.4645891189575195 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.01413548, + "balance_loss_mlp": 1.03776038, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 3.8399261830524076, + "language_loss": 0.82397389, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84531981, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9503, + "time_per_iteration": 2.4945871829986572 + }, + { + "auxiliary_loss_clip": 0.01108893, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.02267456, + "balance_loss_mlp": 1.03824139, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.09557851646671, + "language_loss": 0.85872537, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.8801657, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9504, + "time_per_iteration": 2.3861567974090576 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01232708, + "balance_loss_mlp": 1.03753018, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.9315555303189194, + "language_loss": 0.75182885, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.7731415, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9505, + "time_per_iteration": 2.462536573410034 + }, + { + "auxiliary_loss_clip": 0.01108197, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.01970994, + "balance_loss_mlp": 1.03717351, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.6115496885789637, + "language_loss": 0.81918782, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84058261, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.7109375, + "step": 9506, + "time_per_iteration": 2.467022180557251 + }, + { + "auxiliary_loss_clip": 0.01106598, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01546264, + "balance_loss_mlp": 1.03684521, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6660041805363315, + "language_loss": 0.77144134, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79278708, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9507, + "time_per_iteration": 2.4672694206237793 + }, + { + "auxiliary_loss_clip": 0.01110344, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.02186108, + "balance_loss_mlp": 1.03726792, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 2.45367934924197, + "language_loss": 0.68435538, + "learning_rate": 1.63498965540751e-06, + "loss": 0.7058183, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9508, + "time_per_iteration": 2.464097261428833 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01629877, + "balance_loss_mlp": 1.03722239, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.0052906721639836, + "language_loss": 0.79419613, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81559134, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 9509, + "time_per_iteration": 2.504023551940918 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.03664279, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.839099502620817, + "language_loss": 0.7265448, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74793911, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9510, + "time_per_iteration": 3.815577507019043 + }, + { + "auxiliary_loss_clip": 0.01107423, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01810002, + "balance_loss_mlp": 1.03668678, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.3819155223826083, + "language_loss": 0.69395494, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71533018, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9511, + "time_per_iteration": 2.5445902347564697 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02426028, + "balance_loss_mlp": 1.03781009, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.8672218842214499, + "language_loss": 0.61565816, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63710779, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9512, + "time_per_iteration": 3.8341665267944336 + }, + { + "auxiliary_loss_clip": 0.01106641, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.0164628, + "balance_loss_mlp": 1.03667331, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 4.170405845803043, + "language_loss": 0.7586627, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78001529, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9513, + "time_per_iteration": 5.299789667129517 + }, + { + "auxiliary_loss_clip": 0.01028017, + "auxiliary_loss_mlp": 0.00999308, + "balance_loss_clip": 0.99809855, + "balance_loss_mlp": 1.00645494, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8876641821203675, + "language_loss": 0.6684342, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68870747, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21582031, + "step": 9514, + "time_per_iteration": 3.0201942920684814 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.0104014, + "balance_loss_clip": 1.02696776, + "balance_loss_mlp": 1.04034257, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.046774799271973, + "language_loss": 0.81059563, + "learning_rate": 1.63230955093099e-06, + "loss": 0.8321448, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 9515, + "time_per_iteration": 2.440838575363159 + }, + { + "auxiliary_loss_clip": 0.01104804, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.01469994, + "balance_loss_mlp": 1.03602076, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.8601231206296425, + "language_loss": 0.86125237, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88257068, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9516, + "time_per_iteration": 2.477764368057251 + }, + { + "auxiliary_loss_clip": 0.01104974, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01654696, + "balance_loss_mlp": 1.03561044, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.8026555789133811, + "language_loss": 0.87531322, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89665627, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9517, + "time_per_iteration": 2.425889253616333 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01804841, + "balance_loss_mlp": 1.03662252, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.765867586501473, + "language_loss": 0.8479656, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.86934435, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9518, + "time_per_iteration": 2.515908718109131 + }, + { + "auxiliary_loss_clip": 0.01102718, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01909649, + "balance_loss_mlp": 1.03518391, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.8620909672026127, + "language_loss": 0.7880826, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80942279, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 9519, + "time_per_iteration": 2.400693893432617 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.03658307, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.5438950427184228, + "language_loss": 0.82970679, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85106778, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9520, + "time_per_iteration": 2.5011074542999268 + }, + { + "auxiliary_loss_clip": 0.01110791, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.0268625, + "balance_loss_mlp": 1.03927732, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.123220131944119, + "language_loss": 0.71853209, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74003959, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9521, + "time_per_iteration": 2.4156429767608643 + }, + { + "auxiliary_loss_clip": 0.01106899, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.0362848, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.6921576366095024, + "language_loss": 0.77830148, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.79970586, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9522, + "time_per_iteration": 2.5682153701782227 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.02081728, + "balance_loss_mlp": 1.03628266, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.565759699688635, + "language_loss": 0.71671265, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73805845, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9523, + "time_per_iteration": 2.402622938156128 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01511574, + "balance_loss_mlp": 1.03639328, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.6537237547017787, + "language_loss": 0.70046443, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72177982, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9524, + "time_per_iteration": 2.478745698928833 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.01944458, + "balance_loss_mlp": 1.03783476, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.431879051430598, + "language_loss": 0.65079439, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67217362, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9525, + "time_per_iteration": 2.5722320079803467 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.01825702, + "balance_loss_mlp": 1.03385937, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.7621674355193322, + "language_loss": 0.72353703, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74485326, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 9526, + "time_per_iteration": 2.5182504653930664 + }, + { + "auxiliary_loss_clip": 0.01102827, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03617597, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6243804380597333, + "language_loss": 0.80131519, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.8226589, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9527, + "time_per_iteration": 2.556168556213379 + }, + { + "auxiliary_loss_clip": 0.01104789, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.02399302, + "balance_loss_mlp": 1.03633451, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.8731920412295517, + "language_loss": 0.71818352, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.7395997, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 9528, + "time_per_iteration": 2.502045154571533 + }, + { + "auxiliary_loss_clip": 0.01107269, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02113199, + "balance_loss_mlp": 1.03742957, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.9532280974694858, + "language_loss": 0.853854, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.87526155, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9529, + "time_per_iteration": 2.468146324157715 + }, + { + "auxiliary_loss_clip": 0.01028852, + "auxiliary_loss_mlp": 0.0100185, + "balance_loss_clip": 1.0006398, + "balance_loss_mlp": 1.00712085, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7632636876236247, + "language_loss": 0.56091511, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58122212, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21777344, + "step": 9530, + "time_per_iteration": 2.955796003341675 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.01241684, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.605800582107851, + "language_loss": 0.66667211, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68801141, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.71875, + "step": 9531, + "time_per_iteration": 2.4874041080474854 + }, + { + "auxiliary_loss_clip": 0.01107074, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.0189929, + "balance_loss_mlp": 1.0362972, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.577990064326961, + "language_loss": 0.75677073, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77815616, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9532, + "time_per_iteration": 2.653745651245117 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01779997, + "balance_loss_mlp": 1.03636467, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 3.4857041080787696, + "language_loss": 0.78726482, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80862474, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9533, + "time_per_iteration": 2.5444183349609375 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02156746, + "balance_loss_mlp": 1.03771889, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 2.5155449858561036, + "language_loss": 0.8564285, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87781423, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9534, + "time_per_iteration": 2.611769199371338 + }, + { + "auxiliary_loss_clip": 0.01108602, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02241731, + "balance_loss_mlp": 1.03833961, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.7913378128419626, + "language_loss": 0.74880809, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.7702536, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9535, + "time_per_iteration": 2.5294063091278076 + }, + { + "auxiliary_loss_clip": 0.01113223, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.01927602, + "balance_loss_mlp": 1.04021287, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.60935564318513, + "language_loss": 0.70712042, + "learning_rate": 1.624273356614346e-06, + "loss": 0.72857165, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73046875, + "step": 9536, + "time_per_iteration": 2.5115044116973877 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.01741457, + "balance_loss_mlp": 1.03604972, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.9605571924010112, + "language_loss": 0.69843078, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71977001, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 9537, + "time_per_iteration": 2.485203266143799 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.02179384, + "balance_loss_mlp": 1.03693986, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.9885156073739136, + "language_loss": 0.6257112, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64711761, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9538, + "time_per_iteration": 2.5242531299591064 + }, + { + "auxiliary_loss_clip": 0.01106895, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02131939, + "balance_loss_mlp": 1.03750122, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.847251631174476, + "language_loss": 0.83067656, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85208571, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9539, + "time_per_iteration": 2.4557297229766846 + }, + { + "auxiliary_loss_clip": 0.01108422, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02155805, + "balance_loss_mlp": 1.03672779, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 1.9303873756935568, + "language_loss": 0.73266071, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75408518, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9540, + "time_per_iteration": 2.449195384979248 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.01486361, + "balance_loss_mlp": 1.03386962, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.7719156274309316, + "language_loss": 0.80036277, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82164454, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 9541, + "time_per_iteration": 2.4807605743408203 + }, + { + "auxiliary_loss_clip": 0.01109647, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02247286, + "balance_loss_mlp": 1.03748846, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.3537030152809817, + "language_loss": 0.64358872, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66503674, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9542, + "time_per_iteration": 2.417178153991699 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01681268, + "balance_loss_mlp": 1.03586972, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.222303069950764, + "language_loss": 0.82983625, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.85118151, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9543, + "time_per_iteration": 2.4162886142730713 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01729405, + "balance_loss_mlp": 1.0378089, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.297441344794182, + "language_loss": 0.73850191, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75992632, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9544, + "time_per_iteration": 2.4531123638153076 + }, + { + "auxiliary_loss_clip": 0.01110237, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.01687646, + "balance_loss_mlp": 1.03741252, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.106910148542404, + "language_loss": 0.75869375, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78009301, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 9545, + "time_per_iteration": 2.446340799331665 + }, + { + "auxiliary_loss_clip": 0.01109663, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02086437, + "balance_loss_mlp": 1.03903508, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.6841481616941998, + "language_loss": 0.56267381, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58410275, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9546, + "time_per_iteration": 2.5431458950042725 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02027345, + "balance_loss_mlp": 1.0375458, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.2354008467729236, + "language_loss": 0.76396316, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78538299, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9547, + "time_per_iteration": 2.399355173110962 + }, + { + "auxiliary_loss_clip": 0.01108464, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01847458, + "balance_loss_mlp": 1.03692102, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 3.5736288481687457, + "language_loss": 0.74030554, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76169997, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9548, + "time_per_iteration": 2.438188314437866 + }, + { + "auxiliary_loss_clip": 0.01107619, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.02162778, + "balance_loss_mlp": 1.03630018, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.070673757769185, + "language_loss": 0.6898725, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71128839, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 9549, + "time_per_iteration": 2.4443182945251465 + }, + { + "auxiliary_loss_clip": 0.01109324, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.01756024, + "balance_loss_mlp": 1.0398941, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.5143454441571018, + "language_loss": 0.79360747, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81499219, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 9550, + "time_per_iteration": 2.570117473602295 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01844966, + "balance_loss_mlp": 1.03862011, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 1.8121895379081407, + "language_loss": 0.67906272, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70047116, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71484375, + "step": 9551, + "time_per_iteration": 2.423403024673462 + }, + { + "auxiliary_loss_clip": 0.01109924, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.02370107, + "balance_loss_mlp": 1.03743887, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 1.628701607162486, + "language_loss": 0.71362531, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.73508722, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9552, + "time_per_iteration": 3.886622428894043 + }, + { + "auxiliary_loss_clip": 0.01109635, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.01972914, + "balance_loss_mlp": 1.03975332, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.7228318188262413, + "language_loss": 0.79922652, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82063985, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9553, + "time_per_iteration": 2.431051731109619 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01900589, + "balance_loss_mlp": 1.03611398, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.015136287210995, + "language_loss": 0.83396381, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85536349, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.73046875, + "step": 9554, + "time_per_iteration": 3.823064088821411 + }, + { + "auxiliary_loss_clip": 0.0111382, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.02347982, + "balance_loss_mlp": 1.04021072, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4846822756962552, + "language_loss": 0.70777845, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72928381, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9555, + "time_per_iteration": 5.333508491516113 + }, + { + "auxiliary_loss_clip": 0.01109263, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03861225, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.115239569910986, + "language_loss": 0.72206348, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7434299, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9556, + "time_per_iteration": 2.4479689598083496 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02174449, + "balance_loss_mlp": 1.03852749, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.5580789907924004, + "language_loss": 0.73779786, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.75923818, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9557, + "time_per_iteration": 2.53330397605896 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.02114749, + "balance_loss_mlp": 1.03805625, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.551535187819687, + "language_loss": 0.67825913, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69967735, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9558, + "time_per_iteration": 2.4345078468322754 + }, + { + "auxiliary_loss_clip": 0.01115654, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.02509618, + "balance_loss_mlp": 1.03993464, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.018077791857229, + "language_loss": 0.71494532, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73649883, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 9559, + "time_per_iteration": 2.4112660884857178 + }, + { + "auxiliary_loss_clip": 0.01109449, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01856709, + "balance_loss_mlp": 1.03951454, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.8277860809166269, + "language_loss": 0.79002881, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81141782, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.69921875, + "step": 9560, + "time_per_iteration": 2.461737871170044 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.01649547, + "balance_loss_mlp": 1.03796887, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.312922307701609, + "language_loss": 0.64114952, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66254199, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9561, + "time_per_iteration": 2.4589121341705322 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.02403021, + "balance_loss_mlp": 1.04126084, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.619271715020599, + "language_loss": 0.71404445, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73558629, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 9562, + "time_per_iteration": 2.4472360610961914 + }, + { + "auxiliary_loss_clip": 0.01108014, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.0256269, + "balance_loss_mlp": 1.03870499, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.47664891140277, + "language_loss": 0.84212148, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86357129, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9563, + "time_per_iteration": 2.448540449142456 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.02251637, + "balance_loss_mlp": 1.03915787, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 2.1518785584706266, + "language_loss": 0.57469738, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59618628, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9564, + "time_per_iteration": 2.455137252807617 + }, + { + "auxiliary_loss_clip": 0.01104038, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.01645529, + "balance_loss_mlp": 1.03663075, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.7205024550895016, + "language_loss": 0.75828826, + "learning_rate": 1.613186112465078e-06, + "loss": 0.7796101, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9565, + "time_per_iteration": 2.4293572902679443 + }, + { + "auxiliary_loss_clip": 0.01030195, + "auxiliary_loss_mlp": 0.01000571, + "balance_loss_clip": 0.9991762, + "balance_loss_mlp": 1.00864065, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7426631899706556, + "language_loss": 0.60724127, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62754893, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21582031, + "step": 9566, + "time_per_iteration": 3.156651496887207 + }, + { + "auxiliary_loss_clip": 0.01109259, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.02127385, + "balance_loss_mlp": 1.03952003, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.8230299531471923, + "language_loss": 0.7537874, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77521175, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9567, + "time_per_iteration": 2.414881706237793 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01692748, + "balance_loss_mlp": 1.03808224, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.5717614086198337, + "language_loss": 0.74559051, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76697284, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9568, + "time_per_iteration": 2.458827495574951 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.0394876, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.7630953099139652, + "language_loss": 0.70951653, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73092568, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 9569, + "time_per_iteration": 2.4545505046844482 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.02746797, + "balance_loss_mlp": 1.04058015, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 1.9393871177420576, + "language_loss": 0.55699342, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57852268, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9570, + "time_per_iteration": 2.478793144226074 + }, + { + "auxiliary_loss_clip": 0.01106131, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.02648425, + "balance_loss_mlp": 1.03744042, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6217673569741213, + "language_loss": 0.64154774, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.6629895, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9571, + "time_per_iteration": 2.4446957111358643 + }, + { + "auxiliary_loss_clip": 0.01108191, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.02213967, + "balance_loss_mlp": 1.03693449, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5404037339802243, + "language_loss": 0.67144608, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69287848, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9572, + "time_per_iteration": 2.739871025085449 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.01920414, + "balance_loss_mlp": 1.03968024, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 2.3042557910685897, + "language_loss": 0.72336781, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74477673, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9573, + "time_per_iteration": 2.446484088897705 + }, + { + "auxiliary_loss_clip": 0.01105342, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01808691, + "balance_loss_mlp": 1.03999066, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.9447567655956284, + "language_loss": 0.76657987, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78792316, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 9574, + "time_per_iteration": 2.620338201522827 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01712155, + "balance_loss_mlp": 1.03815711, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.450005891087765, + "language_loss": 0.66523874, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.6866771, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 9575, + "time_per_iteration": 2.4487204551696777 + }, + { + "auxiliary_loss_clip": 0.01106224, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02086747, + "balance_loss_mlp": 1.03883016, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5135571903226765, + "language_loss": 0.79637057, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81775701, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 9576, + "time_per_iteration": 2.499525547027588 + }, + { + "auxiliary_loss_clip": 0.01106499, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.0181545, + "balance_loss_mlp": 1.038414, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.624550594516776, + "language_loss": 0.69612324, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71748459, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 9577, + "time_per_iteration": 2.4342739582061768 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.02076983, + "balance_loss_mlp": 1.03889465, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.7262479676640925, + "language_loss": 0.66394711, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68538755, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7265625, + "step": 9578, + "time_per_iteration": 2.452836513519287 + }, + { + "auxiliary_loss_clip": 0.01104785, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.01713598, + "balance_loss_mlp": 1.03683639, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.5955641210398863, + "language_loss": 0.72130096, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74263626, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9579, + "time_per_iteration": 2.4709668159484863 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01880956, + "balance_loss_mlp": 1.03966045, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.099656741464949, + "language_loss": 0.64655066, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66800898, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9580, + "time_per_iteration": 2.5071680545806885 + }, + { + "auxiliary_loss_clip": 0.01108728, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02024293, + "balance_loss_mlp": 1.03776896, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.9172914104456789, + "language_loss": 0.8563143, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87773246, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9581, + "time_per_iteration": 2.459761142730713 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.02792597, + "balance_loss_mlp": 1.04308629, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.0860755056974627, + "language_loss": 0.67691463, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69849521, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9582, + "time_per_iteration": 2.461245536804199 + }, + { + "auxiliary_loss_clip": 0.01030428, + "auxiliary_loss_mlp": 0.01000716, + "balance_loss_clip": 0.99950552, + "balance_loss_mlp": 1.00893497, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6389163922736963, + "language_loss": 0.57233906, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59265041, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21484375, + "step": 9583, + "time_per_iteration": 3.212454080581665 + }, + { + "auxiliary_loss_clip": 0.01108245, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01740384, + "balance_loss_mlp": 1.0381434, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.8641226876424317, + "language_loss": 0.82294947, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84432399, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 9584, + "time_per_iteration": 2.445197582244873 + }, + { + "auxiliary_loss_clip": 0.0102928, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 0.99972469, + "balance_loss_mlp": 1.00788319, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6211358186522926, + "language_loss": 0.49536344, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51566589, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9585, + "time_per_iteration": 3.1135380268096924 + }, + { + "auxiliary_loss_clip": 0.01103387, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.01523662, + "balance_loss_mlp": 1.0356468, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 2.0469276219055037, + "language_loss": 0.84745687, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86875856, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9586, + "time_per_iteration": 2.4322049617767334 + }, + { + "auxiliary_loss_clip": 0.01107042, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.02099502, + "balance_loss_mlp": 1.0371176, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.708349469848261, + "language_loss": 0.79935288, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82076108, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9587, + "time_per_iteration": 2.420388698577881 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.01772523, + "balance_loss_mlp": 1.03791797, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.476870264659234, + "language_loss": 0.65978181, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68115664, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9588, + "time_per_iteration": 2.470181941986084 + }, + { + "auxiliary_loss_clip": 0.0110785, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.02095485, + "balance_loss_mlp": 1.03747165, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.7939970430826904, + "language_loss": 0.78344554, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80486423, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9589, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01023642, + "balance_loss_clip": 1.01255536, + "balance_loss_mlp": 1.03513849, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.899286870644745, + "language_loss": 0.79484087, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81609809, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 9590, + "time_per_iteration": 2.4738223552703857 + }, + { + "auxiliary_loss_clip": 0.01107337, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.01693606, + "balance_loss_mlp": 1.03926742, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.6468651932641252, + "language_loss": 0.63016611, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65151715, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9591, + "time_per_iteration": 2.4630722999572754 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02349293, + "balance_loss_mlp": 1.03998208, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6611744555405081, + "language_loss": 0.77684325, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.7982983, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9592, + "time_per_iteration": 2.4990251064300537 + }, + { + "auxiliary_loss_clip": 0.01029258, + "auxiliary_loss_mlp": 0.01002299, + "balance_loss_clip": 1.00114298, + "balance_loss_mlp": 1.00790858, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7302836874791289, + "language_loss": 0.59611464, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61643022, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21386719, + "step": 9593, + "time_per_iteration": 3.1885087490081787 + }, + { + "auxiliary_loss_clip": 0.01110729, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.0265801, + "balance_loss_mlp": 1.03883052, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 2.3535875138052806, + "language_loss": 0.7131753, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73468006, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9594, + "time_per_iteration": 3.89677357673645 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.02087677, + "balance_loss_mlp": 1.03755784, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.9084853230861274, + "language_loss": 0.71146429, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73285961, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.69921875, + "step": 9595, + "time_per_iteration": 2.438798666000366 + }, + { + "auxiliary_loss_clip": 0.01106901, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.01726389, + "balance_loss_mlp": 1.03756046, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.7843520689138646, + "language_loss": 0.69750065, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71886092, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9596, + "time_per_iteration": 3.8589518070220947 + }, + { + "auxiliary_loss_clip": 0.01111865, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.02235997, + "balance_loss_mlp": 1.03845882, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.3208716765708974, + "language_loss": 0.67437601, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69585705, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 9597, + "time_per_iteration": 4.025861501693726 + }, + { + "auxiliary_loss_clip": 0.01107063, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.02220011, + "balance_loss_mlp": 1.03775668, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 2.263151487781109, + "language_loss": 0.81492549, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83633393, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 9598, + "time_per_iteration": 2.4457364082336426 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01787877, + "balance_loss_mlp": 1.03758776, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.482456402920166, + "language_loss": 0.72767603, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74906087, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9599, + "time_per_iteration": 2.440633773803711 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.0178082, + "balance_loss_mlp": 1.03569376, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 1.8193310631715605, + "language_loss": 0.77990794, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80122316, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9600, + "time_per_iteration": 2.4627256393432617 + }, + { + "auxiliary_loss_clip": 0.01108817, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.0210135, + "balance_loss_mlp": 1.03849137, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5552976085447456, + "language_loss": 0.72505343, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74647534, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9601, + "time_per_iteration": 2.5040857791900635 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.02177262, + "balance_loss_mlp": 1.03861833, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.6061208919603027, + "language_loss": 0.68449026, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7058996, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 9602, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03666043, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4427131087039327, + "language_loss": 0.72969544, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75102556, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9603, + "time_per_iteration": 2.4821383953094482 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01787269, + "balance_loss_mlp": 1.03815305, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.760798848795816, + "language_loss": 0.76811421, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78947246, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9604, + "time_per_iteration": 2.4963274002075195 + }, + { + "auxiliary_loss_clip": 0.01108714, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.01986611, + "balance_loss_mlp": 1.03805828, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.8255502953236893, + "language_loss": 0.83589303, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85731399, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9605, + "time_per_iteration": 2.420722484588623 + }, + { + "auxiliary_loss_clip": 0.01115788, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.01826096, + "balance_loss_mlp": 1.041394, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.6448412923605056, + "language_loss": 0.78043878, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80192173, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 9606, + "time_per_iteration": 2.47755765914917 + }, + { + "auxiliary_loss_clip": 0.01105815, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.02192283, + "balance_loss_mlp": 1.03780627, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.6466821062116115, + "language_loss": 0.74067813, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76207221, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9607, + "time_per_iteration": 2.473158597946167 + }, + { + "auxiliary_loss_clip": 0.01107935, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01862538, + "balance_loss_mlp": 1.03809416, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.6703318324983303, + "language_loss": 0.69666326, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71804941, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9608, + "time_per_iteration": 2.457597255706787 + }, + { + "auxiliary_loss_clip": 0.01109603, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01841307, + "balance_loss_mlp": 1.03859639, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.7239529426914375, + "language_loss": 0.76340568, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78481352, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9609, + "time_per_iteration": 2.478379964828491 + }, + { + "auxiliary_loss_clip": 0.01106636, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01672637, + "balance_loss_mlp": 1.03600001, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.8185868001057917, + "language_loss": 0.77262604, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79397655, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.70703125, + "step": 9610, + "time_per_iteration": 2.4817564487457275 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01526141, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.0354514470011327, + "language_loss": 0.68514067, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70650387, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9611, + "time_per_iteration": 2.401411771774292 + }, + { + "auxiliary_loss_clip": 0.01104847, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01431727, + "balance_loss_mlp": 1.03594267, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.8201815228945446, + "language_loss": 0.82796168, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.84927702, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9612, + "time_per_iteration": 2.4473085403442383 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01915455, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.6350469107350603, + "language_loss": 0.79244345, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81382918, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9613, + "time_per_iteration": 2.427710771560669 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03584552, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.8237036529741348, + "language_loss": 0.77103758, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79242271, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 9614, + "time_per_iteration": 2.44856595993042 + }, + { + "auxiliary_loss_clip": 0.01108473, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01954651, + "balance_loss_mlp": 1.03704453, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.4290592896418093, + "language_loss": 0.8083241, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.829723, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71484375, + "step": 9615, + "time_per_iteration": 2.387230396270752 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01979423, + "balance_loss_mlp": 1.0372864, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.467111790124014, + "language_loss": 0.67172909, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69313097, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9616, + "time_per_iteration": 2.5091681480407715 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.01983774, + "balance_loss_mlp": 1.03701568, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.7373937933185963, + "language_loss": 0.77820861, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79956603, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9617, + "time_per_iteration": 2.434692144393921 + }, + { + "auxiliary_loss_clip": 0.01105528, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01911426, + "balance_loss_mlp": 1.03798401, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.4913926039582375, + "language_loss": 0.75064909, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77201837, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9618, + "time_per_iteration": 2.5143377780914307 + }, + { + "auxiliary_loss_clip": 0.01103572, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01855421, + "balance_loss_mlp": 1.03614712, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5244275331123438, + "language_loss": 0.81895173, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84028757, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9619, + "time_per_iteration": 2.436741828918457 + }, + { + "auxiliary_loss_clip": 0.01106581, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.02319741, + "balance_loss_mlp": 1.03689742, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 2.8855702259785874, + "language_loss": 0.7266885, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.7481066, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9620, + "time_per_iteration": 2.508638858795166 + }, + { + "auxiliary_loss_clip": 0.01106937, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.01994872, + "balance_loss_mlp": 1.0379591, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.4901469929607327, + "language_loss": 0.77143538, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79282016, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9621, + "time_per_iteration": 2.4620673656463623 + }, + { + "auxiliary_loss_clip": 0.01106096, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01862931, + "balance_loss_mlp": 1.03550279, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.5105026325174375, + "language_loss": 0.70597667, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72735131, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9622, + "time_per_iteration": 2.509505033493042 + }, + { + "auxiliary_loss_clip": 0.01028849, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.00028539, + "balance_loss_mlp": 1.007653, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7726155153830789, + "language_loss": 0.55941814, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57972187, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21191406, + "step": 9623, + "time_per_iteration": 3.0823814868927 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.02344918, + "balance_loss_mlp": 1.03692317, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.2221143081246373, + "language_loss": 0.71056175, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73201978, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9624, + "time_per_iteration": 2.5265705585479736 + }, + { + "auxiliary_loss_clip": 0.01107503, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.02552414, + "balance_loss_mlp": 1.03862953, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.222167937534436, + "language_loss": 0.82642812, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84788311, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9625, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.02198672, + "balance_loss_mlp": 1.03701115, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.455235974234194, + "language_loss": 0.69956779, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72096288, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9626, + "time_per_iteration": 2.4975287914276123 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02480352, + "balance_loss_mlp": 1.03568482, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.93553238886208, + "language_loss": 0.71862161, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.7400226, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9627, + "time_per_iteration": 2.5138702392578125 + }, + { + "auxiliary_loss_clip": 0.01104177, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.01737726, + "balance_loss_mlp": 1.03599048, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.727007676436273, + "language_loss": 0.8414377, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86276901, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9628, + "time_per_iteration": 2.4851796627044678 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.01394033, + "balance_loss_mlp": 1.03516388, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.6873428245402236, + "language_loss": 0.71942705, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74071914, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9629, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01108734, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02181387, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 1.9628574132847711, + "language_loss": 0.74576336, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76719439, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9630, + "time_per_iteration": 2.454810380935669 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.01682508, + "balance_loss_mlp": 1.03553247, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.6371763310429226, + "language_loss": 0.79325604, + "learning_rate": 1.587999618060523e-06, + "loss": 0.814556, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9631, + "time_per_iteration": 2.440864324569702 + }, + { + "auxiliary_loss_clip": 0.01104911, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01800966, + "balance_loss_mlp": 1.03596497, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.6037309933130668, + "language_loss": 0.75137591, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77272546, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9632, + "time_per_iteration": 2.4771668910980225 + }, + { + "auxiliary_loss_clip": 0.01106006, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.01517081, + "balance_loss_mlp": 1.03731871, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 2.4626986888140716, + "language_loss": 0.79077435, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81211185, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9633, + "time_per_iteration": 2.448436737060547 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.02447283, + "balance_loss_mlp": 1.04036343, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7086543878642706, + "language_loss": 0.77430606, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79583752, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75390625, + "step": 9634, + "time_per_iteration": 2.4811017513275146 + }, + { + "auxiliary_loss_clip": 0.01108474, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.1301414361920843, + "language_loss": 0.63183784, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65330267, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9635, + "time_per_iteration": 3.8360743522644043 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.02431154, + "balance_loss_mlp": 1.03854156, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.5921207664968484, + "language_loss": 0.76923883, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79065627, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 9636, + "time_per_iteration": 2.4524970054626465 + }, + { + "auxiliary_loss_clip": 0.01101976, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.03643167, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.6428369167222547, + "language_loss": 0.68367255, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70501596, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 9637, + "time_per_iteration": 3.9001073837280273 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.0181725, + "balance_loss_mlp": 1.03622174, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.3860817889930326, + "language_loss": 0.72291076, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74429405, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9638, + "time_per_iteration": 3.8099658489227295 + }, + { + "auxiliary_loss_clip": 0.01106068, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.01981568, + "balance_loss_mlp": 1.03809261, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 2.0300843650533387, + "language_loss": 0.72111142, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.7424823, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9639, + "time_per_iteration": 3.9071426391601562 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02291572, + "balance_loss_mlp": 1.03860509, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.0103274032155163, + "language_loss": 0.69715077, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71857667, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9640, + "time_per_iteration": 2.433104991912842 + }, + { + "auxiliary_loss_clip": 0.01113005, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.02583635, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.7872404958031884, + "language_loss": 0.77623034, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79774475, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 9641, + "time_per_iteration": 2.4301722049713135 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01930058, + "balance_loss_mlp": 1.0378499, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8500908876117999, + "language_loss": 0.73673463, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75810528, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 9642, + "time_per_iteration": 2.49660325050354 + }, + { + "auxiliary_loss_clip": 0.01104964, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.01718903, + "balance_loss_mlp": 1.03625488, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.696347443177098, + "language_loss": 0.73574042, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75708383, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9643, + "time_per_iteration": 2.485637903213501 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.03768921, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.9990943096580656, + "language_loss": 0.67527819, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69667518, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9644, + "time_per_iteration": 2.487901449203491 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01664853, + "balance_loss_mlp": 1.03988528, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.232135453826953, + "language_loss": 0.85353506, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87495703, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9645, + "time_per_iteration": 2.4591071605682373 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.01782739, + "balance_loss_mlp": 1.03902066, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.87513340954769, + "language_loss": 0.7528075, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77419043, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9646, + "time_per_iteration": 2.5096170902252197 + }, + { + "auxiliary_loss_clip": 0.01111341, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02068663, + "balance_loss_mlp": 1.03949249, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.666102030467492, + "language_loss": 0.5938943, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61534685, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9647, + "time_per_iteration": 2.5928401947021484 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.0388217, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.5329184941218248, + "language_loss": 0.84261942, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86405849, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9648, + "time_per_iteration": 2.460245132446289 + }, + { + "auxiliary_loss_clip": 0.01027507, + "auxiliary_loss_mlp": 0.00998956, + "balance_loss_clip": 0.99765694, + "balance_loss_mlp": 1.00610447, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8404119708733213, + "language_loss": 0.62959844, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64986312, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9649, + "time_per_iteration": 3.1300153732299805 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.02015436, + "balance_loss_mlp": 1.03649998, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 2.3310983541006434, + "language_loss": 0.82039601, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84174502, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9650, + "time_per_iteration": 2.4216153621673584 + }, + { + "auxiliary_loss_clip": 0.0111056, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.02018833, + "balance_loss_mlp": 1.0376749, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.3176650701334442, + "language_loss": 0.77372313, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79515636, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9651, + "time_per_iteration": 2.4731314182281494 + }, + { + "auxiliary_loss_clip": 0.01110796, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.02309239, + "balance_loss_mlp": 1.03978133, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 2.0034024707617575, + "language_loss": 0.74143803, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76290905, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9652, + "time_per_iteration": 2.426095485687256 + }, + { + "auxiliary_loss_clip": 0.01106661, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.01887703, + "balance_loss_mlp": 1.03536129, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 1.9100146686462136, + "language_loss": 0.76669693, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78807956, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9653, + "time_per_iteration": 2.479843854904175 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01702428, + "balance_loss_mlp": 1.03746295, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.3557465918911578, + "language_loss": 0.74466497, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76604843, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.69921875, + "step": 9654, + "time_per_iteration": 2.4389872550964355 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01033574, + "balance_loss_clip": 1.02185535, + "balance_loss_mlp": 1.0386194, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.67229579578488, + "language_loss": 0.70335853, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72474813, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9655, + "time_per_iteration": 2.4667346477508545 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.025087, + "balance_loss_mlp": 1.03787553, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 3.1924669760277666, + "language_loss": 0.69441068, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71591568, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9656, + "time_per_iteration": 2.47267746925354 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.02040219, + "balance_loss_mlp": 1.0376507, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.8802574367017126, + "language_loss": 0.71315479, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73451304, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9657, + "time_per_iteration": 2.411862850189209 + }, + { + "auxiliary_loss_clip": 0.01110384, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.03748548, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.139189937245848, + "language_loss": 0.70763719, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72907501, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9658, + "time_per_iteration": 2.4618098735809326 + }, + { + "auxiliary_loss_clip": 0.01029117, + "auxiliary_loss_mlp": 0.01007613, + "balance_loss_clip": 1.00649261, + "balance_loss_mlp": 1.00762427, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6568503671216013, + "language_loss": 0.53557444, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.5559417, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21484375, + "step": 9659, + "time_per_iteration": 3.081292152404785 + }, + { + "auxiliary_loss_clip": 0.01109597, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02564979, + "balance_loss_mlp": 1.0386076, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.325531986819307, + "language_loss": 0.62134814, + "learning_rate": 1.576954100136366e-06, + "loss": 0.6428259, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 9660, + "time_per_iteration": 2.5101215839385986 + }, + { + "auxiliary_loss_clip": 0.01107552, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02121592, + "balance_loss_mlp": 1.03510964, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.644077336412447, + "language_loss": 0.65339613, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67481142, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9661, + "time_per_iteration": 2.495326042175293 + }, + { + "auxiliary_loss_clip": 0.01101624, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.01473665, + "balance_loss_mlp": 1.03630924, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.4453410326473544, + "language_loss": 0.74667752, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76795, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 9662, + "time_per_iteration": 2.4072024822235107 + }, + { + "auxiliary_loss_clip": 0.0102818, + "auxiliary_loss_mlp": 0.01003249, + "balance_loss_clip": 1.0020808, + "balance_loss_mlp": 1.00680053, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8844058515803096, + "language_loss": 0.58421201, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60452628, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9663, + "time_per_iteration": 3.128176689147949 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.02058566, + "balance_loss_mlp": 1.03855336, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2307426037080558, + "language_loss": 0.82198572, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84339249, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 9664, + "time_per_iteration": 2.4268438816070557 + }, + { + "auxiliary_loss_clip": 0.01106716, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.0119977, + "balance_loss_mlp": 1.03471017, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.6499573770914204, + "language_loss": 0.81283242, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.8341471, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9665, + "time_per_iteration": 2.539750337600708 + }, + { + "auxiliary_loss_clip": 0.01113083, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.0180558, + "balance_loss_mlp": 1.03968716, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.6493862237198238, + "language_loss": 0.81106472, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83251882, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 9666, + "time_per_iteration": 2.4637341499328613 + }, + { + "auxiliary_loss_clip": 0.01105376, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.02047861, + "balance_loss_mlp": 1.03734851, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.772076851837157, + "language_loss": 0.79902422, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82039976, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9667, + "time_per_iteration": 2.4630167484283447 + }, + { + "auxiliary_loss_clip": 0.01113135, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01935887, + "balance_loss_mlp": 1.03786182, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.5126376316707284, + "language_loss": 0.78524494, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80671084, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 9668, + "time_per_iteration": 2.4933431148529053 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.01976347, + "balance_loss_mlp": 1.03882718, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.2917193824708395, + "language_loss": 0.6405921, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66200924, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9669, + "time_per_iteration": 2.711413860321045 + }, + { + "auxiliary_loss_clip": 0.01109059, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.02185786, + "balance_loss_mlp": 1.03847837, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.7201818199144705, + "language_loss": 0.73401237, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75544822, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9670, + "time_per_iteration": 2.481351375579834 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02327895, + "balance_loss_mlp": 1.03665125, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.1547601144280693, + "language_loss": 0.79159272, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81303054, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 9671, + "time_per_iteration": 2.481765031814575 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02253985, + "balance_loss_mlp": 1.04052281, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.8667318330129747, + "language_loss": 0.60387075, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62536901, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9672, + "time_per_iteration": 2.4585747718811035 + }, + { + "auxiliary_loss_clip": 0.01106042, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.01597953, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.9986212138203583, + "language_loss": 0.81078732, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83212423, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9673, + "time_per_iteration": 2.4950785636901855 + }, + { + "auxiliary_loss_clip": 0.01113708, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.02148843, + "balance_loss_mlp": 1.03956604, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.7057299891387632, + "language_loss": 0.87750065, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.89898866, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 9674, + "time_per_iteration": 2.440136432647705 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02156651, + "balance_loss_mlp": 1.03789747, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.5021502044615473, + "language_loss": 0.78512001, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80653995, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.7109375, + "step": 9675, + "time_per_iteration": 2.474719285964966 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.0162822, + "balance_loss_mlp": 1.03912115, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.1292944862371486, + "language_loss": 0.70189106, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72326887, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9676, + "time_per_iteration": 2.435563325881958 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.01728702, + "balance_loss_mlp": 1.03810883, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.2453262518267216, + "language_loss": 0.63408953, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65548283, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9677, + "time_per_iteration": 3.852684736251831 + }, + { + "auxiliary_loss_clip": 0.01029913, + "auxiliary_loss_mlp": 0.01005476, + "balance_loss_clip": 1.00418842, + "balance_loss_mlp": 1.00840485, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8082693819649737, + "language_loss": 0.54284507, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56319892, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21484375, + "step": 9678, + "time_per_iteration": 3.1727702617645264 + }, + { + "auxiliary_loss_clip": 0.01029364, + "auxiliary_loss_mlp": 0.01, + "balance_loss_clip": 0.99879593, + "balance_loss_mlp": 1.00781882, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7323225743115229, + "language_loss": 0.56212348, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58241719, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21484375, + "step": 9679, + "time_per_iteration": 4.407592296600342 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01719165, + "balance_loss_mlp": 1.03636777, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.5677269140843855, + "language_loss": 0.65393043, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67527747, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9680, + "time_per_iteration": 3.854875087738037 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01755846, + "balance_loss_mlp": 1.0371331, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.7974099210270778, + "language_loss": 0.83398807, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85535139, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9681, + "time_per_iteration": 3.906952381134033 + }, + { + "auxiliary_loss_clip": 0.01107734, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01704884, + "balance_loss_mlp": 1.03765953, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.7009206287297167, + "language_loss": 0.75691867, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77829218, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9682, + "time_per_iteration": 2.4177029132843018 + }, + { + "auxiliary_loss_clip": 0.01109999, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.01696062, + "balance_loss_mlp": 1.03816795, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.1225270667604, + "language_loss": 0.75228214, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77367556, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9683, + "time_per_iteration": 2.483633279800415 + }, + { + "auxiliary_loss_clip": 0.01110877, + "auxiliary_loss_mlp": 0.01029498, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03809631, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7411447986789845, + "language_loss": 0.74026191, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76166564, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9684, + "time_per_iteration": 2.445389986038208 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.02327847, + "balance_loss_mlp": 1.03914332, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.480778861643935, + "language_loss": 0.77930081, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80076301, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9685, + "time_per_iteration": 2.4822564125061035 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.02525425, + "balance_loss_mlp": 1.0376507, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.6531366373498986, + "language_loss": 0.75214118, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77360046, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9686, + "time_per_iteration": 2.441162109375 + }, + { + "auxiliary_loss_clip": 0.0102947, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_clip": 1.00720644, + "balance_loss_mlp": 1.00800455, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8335448804232356, + "language_loss": 0.57427585, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59465551, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9687, + "time_per_iteration": 2.887495279312134 + }, + { + "auxiliary_loss_clip": 0.01106071, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01534319, + "balance_loss_mlp": 1.03597438, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.808127013520305, + "language_loss": 0.69851446, + "learning_rate": 1.566302259738727e-06, + "loss": 0.7198627, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 9688, + "time_per_iteration": 2.475397825241089 + }, + { + "auxiliary_loss_clip": 0.01108083, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.03770781, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.8185672100752224, + "language_loss": 0.65197223, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67338014, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9689, + "time_per_iteration": 2.461808204650879 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0186801, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.5648827403998262, + "language_loss": 0.73213816, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75352979, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 9690, + "time_per_iteration": 2.459392786026001 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01889586, + "balance_loss_mlp": 1.03849721, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.9110650477929338, + "language_loss": 0.76118016, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78261399, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9691, + "time_per_iteration": 2.454533338546753 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02115119, + "balance_loss_mlp": 1.03619492, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.7126808977143095, + "language_loss": 0.80746913, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82888305, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 9692, + "time_per_iteration": 2.501497268676758 + }, + { + "auxiliary_loss_clip": 0.01027994, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00103843, + "balance_loss_mlp": 1.0067246, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7602984909294345, + "language_loss": 0.56910902, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.5894115, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.21289062, + "step": 9693, + "time_per_iteration": 3.0237975120544434 + }, + { + "auxiliary_loss_clip": 0.01106474, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.02250707, + "balance_loss_mlp": 1.03660345, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 2.266427213008104, + "language_loss": 0.79537672, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81677842, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69921875, + "step": 9694, + "time_per_iteration": 2.4761908054351807 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.02250743, + "balance_loss_mlp": 1.03815889, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.3946621855299897, + "language_loss": 0.75905991, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.7804361, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9695, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.01028568, + "auxiliary_loss_mlp": 0.01000024, + "balance_loss_clip": 0.99865955, + "balance_loss_mlp": 1.00692177, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7688369043614423, + "language_loss": 0.54971713, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57000303, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.21679688, + "step": 9696, + "time_per_iteration": 3.1397409439086914 + }, + { + "auxiliary_loss_clip": 0.01108342, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.01588464, + "balance_loss_mlp": 1.03907263, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.461981122956424, + "language_loss": 0.7641257, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78548938, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9697, + "time_per_iteration": 2.4391984939575195 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.01940477, + "balance_loss_mlp": 1.03790259, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.5880971870479619, + "language_loss": 0.77744102, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79886687, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9698, + "time_per_iteration": 2.5576770305633545 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.02412117, + "balance_loss_mlp": 1.03847361, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.8122014087406897, + "language_loss": 0.83381891, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85527027, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9699, + "time_per_iteration": 2.5637032985687256 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.01663673, + "balance_loss_mlp": 1.03769052, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.315377539273772, + "language_loss": 0.66859722, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68997276, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 9700, + "time_per_iteration": 2.471012592315674 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.0199858, + "balance_loss_mlp": 1.03591251, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5256356872175616, + "language_loss": 0.713889, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73527479, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9701, + "time_per_iteration": 2.4697649478912354 + }, + { + "auxiliary_loss_clip": 0.01104917, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01984477, + "balance_loss_mlp": 1.03625238, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.810379708827147, + "language_loss": 0.85387969, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87524706, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9702, + "time_per_iteration": 2.481027841567993 + }, + { + "auxiliary_loss_clip": 0.01100783, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.02206278, + "balance_loss_mlp": 1.0346241, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4628982512923412, + "language_loss": 0.77776694, + "learning_rate": 1.560601200301392e-06, + "loss": 0.79911131, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9703, + "time_per_iteration": 2.435124397277832 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01736951, + "balance_loss_mlp": 1.03907001, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.7159930715569567, + "language_loss": 0.71405482, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73546076, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9704, + "time_per_iteration": 2.4737584590911865 + }, + { + "auxiliary_loss_clip": 0.01107118, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02001369, + "balance_loss_mlp": 1.03844225, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 2.155391395554278, + "language_loss": 0.814731, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83611786, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9705, + "time_per_iteration": 2.456681966781616 + }, + { + "auxiliary_loss_clip": 0.01105829, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.03706515, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 2.7067870421451805, + "language_loss": 0.80659604, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82794762, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9706, + "time_per_iteration": 2.497509717941284 + }, + { + "auxiliary_loss_clip": 0.01104424, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.01860809, + "balance_loss_mlp": 1.03667164, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.0481497339382084, + "language_loss": 0.74599034, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.7673524, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6796875, + "step": 9707, + "time_per_iteration": 2.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.01103427, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.01876628, + "balance_loss_mlp": 1.03624749, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.608372812838098, + "language_loss": 0.81249726, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83383656, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9708, + "time_per_iteration": 2.492741584777832 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01569307, + "balance_loss_mlp": 1.03903604, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.7521527331614153, + "language_loss": 0.78249604, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80383801, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9709, + "time_per_iteration": 2.476956844329834 + }, + { + "auxiliary_loss_clip": 0.01027997, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00083506, + "balance_loss_mlp": 1.0065155, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7691792257321526, + "language_loss": 0.56582153, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58612299, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.21484375, + "step": 9710, + "time_per_iteration": 3.0814101696014404 + }, + { + "auxiliary_loss_clip": 0.0110345, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.0211035, + "balance_loss_mlp": 1.03715682, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.5515305439757483, + "language_loss": 0.65762496, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67898679, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9711, + "time_per_iteration": 2.4872825145721436 + }, + { + "auxiliary_loss_clip": 0.01112071, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02147532, + "balance_loss_mlp": 1.03822017, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.6429842517443687, + "language_loss": 0.78599298, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80746061, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9712, + "time_per_iteration": 2.442077398300171 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01025498, + "balance_loss_clip": 1.01343966, + "balance_loss_mlp": 1.03646183, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.7240347174541215, + "language_loss": 0.73268932, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.7539975, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9713, + "time_per_iteration": 2.459120750427246 + }, + { + "auxiliary_loss_clip": 0.01110691, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01647365, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.8470967199163717, + "language_loss": 0.69391453, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71531737, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9714, + "time_per_iteration": 2.4558205604553223 + }, + { + "auxiliary_loss_clip": 0.01106219, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01737761, + "balance_loss_mlp": 1.03574395, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.7342681115417722, + "language_loss": 0.79977894, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82114303, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9715, + "time_per_iteration": 2.426506757736206 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01666617, + "balance_loss_mlp": 1.037099, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 4.9488403812071535, + "language_loss": 0.72778314, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74913943, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9716, + "time_per_iteration": 2.44687819480896 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.01595879, + "balance_loss_mlp": 1.03716838, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.6127648254863816, + "language_loss": 0.74810076, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76941431, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9717, + "time_per_iteration": 2.460857629776001 + }, + { + "auxiliary_loss_clip": 0.01105902, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02414012, + "balance_loss_mlp": 1.03733993, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.202005488151785, + "language_loss": 0.7997486, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82117224, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9718, + "time_per_iteration": 2.4178881645202637 + }, + { + "auxiliary_loss_clip": 0.01106549, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.01891065, + "balance_loss_mlp": 1.03846669, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.4800218219438264, + "language_loss": 0.67422116, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69560701, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 9719, + "time_per_iteration": 3.8449153900146484 + }, + { + "auxiliary_loss_clip": 0.01107677, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.01631081, + "balance_loss_mlp": 1.03717732, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.1638863024999484, + "language_loss": 0.75937355, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78073382, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9720, + "time_per_iteration": 2.521005630493164 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02579594, + "balance_loss_mlp": 1.03623533, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.5774446570210707, + "language_loss": 0.83079016, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85223192, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9721, + "time_per_iteration": 3.9998085498809814 + }, + { + "auxiliary_loss_clip": 0.01030301, + "auxiliary_loss_mlp": 0.01007637, + "balance_loss_clip": 1.00636697, + "balance_loss_mlp": 1.00867438, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9369686939257119, + "language_loss": 0.71297473, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73335409, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21679688, + "step": 9722, + "time_per_iteration": 4.55988335609436 + }, + { + "auxiliary_loss_clip": 0.01104254, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.0219183, + "balance_loss_mlp": 1.03621197, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.3592007880272097, + "language_loss": 0.89236099, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91374058, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9723, + "time_per_iteration": 3.8671655654907227 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.02042723, + "balance_loss_mlp": 1.03859067, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4227647539631216, + "language_loss": 0.68610382, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70749187, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 9724, + "time_per_iteration": 2.428325653076172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.0221715, + "balance_loss_mlp": 1.04199743, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.8750713541003288, + "language_loss": 0.86348903, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88497603, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9725, + "time_per_iteration": 2.4113223552703857 + }, + { + "auxiliary_loss_clip": 0.01112675, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.02615535, + "balance_loss_mlp": 1.04008734, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.9888550356442254, + "language_loss": 0.82856494, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85007912, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9726, + "time_per_iteration": 2.4277760982513428 + }, + { + "auxiliary_loss_clip": 0.01108752, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02550149, + "balance_loss_mlp": 1.03925705, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8720162128796731, + "language_loss": 0.66911906, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69057649, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 9727, + "time_per_iteration": 2.4941296577453613 + }, + { + "auxiliary_loss_clip": 0.011109, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.02927577, + "balance_loss_mlp": 1.04078412, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.755089310778911, + "language_loss": 0.81880605, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84033632, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9728, + "time_per_iteration": 2.504457950592041 + }, + { + "auxiliary_loss_clip": 0.01105423, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.03857303, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.9458365932895556, + "language_loss": 0.78459418, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80601627, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 9729, + "time_per_iteration": 2.4906978607177734 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.01629853, + "balance_loss_mlp": 1.03705525, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 2.2265789157985205, + "language_loss": 0.70611644, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72748184, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9730, + "time_per_iteration": 2.5273194313049316 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.02182508, + "balance_loss_mlp": 1.04057014, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 2.222037907468424, + "language_loss": 0.78473902, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80622888, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9731, + "time_per_iteration": 2.4710583686828613 + }, + { + "auxiliary_loss_clip": 0.0110815, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03908777, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 1.7845208257427057, + "language_loss": 0.69966131, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72106874, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9732, + "time_per_iteration": 2.4975006580352783 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.01913929, + "balance_loss_mlp": 1.04045916, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.73190032828597, + "language_loss": 0.52698147, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54844654, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 9733, + "time_per_iteration": 2.485399007797241 + }, + { + "auxiliary_loss_clip": 0.01111159, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.04071164, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 6.263677136925273, + "language_loss": 0.87694037, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89835489, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9734, + "time_per_iteration": 2.472288131713867 + }, + { + "auxiliary_loss_clip": 0.01103403, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02119839, + "balance_loss_mlp": 1.03833449, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.513447931139509, + "language_loss": 0.72063559, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.7419939, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 9735, + "time_per_iteration": 2.4491236209869385 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.02809381, + "balance_loss_mlp": 1.04022026, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.443961120173282, + "language_loss": 0.74189854, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76345086, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9736, + "time_per_iteration": 2.419142961502075 + }, + { + "auxiliary_loss_clip": 0.01106138, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.0379591, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 2.2236691167379083, + "language_loss": 0.70181298, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72319508, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9737, + "time_per_iteration": 2.6583194732666016 + }, + { + "auxiliary_loss_clip": 0.01106196, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.02428091, + "balance_loss_mlp": 1.03835154, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7203982017599655, + "language_loss": 0.82579291, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84721613, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 9738, + "time_per_iteration": 2.4531257152557373 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02323246, + "balance_loss_mlp": 1.04034615, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.7945048569600959, + "language_loss": 0.68588519, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70738328, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9739, + "time_per_iteration": 2.456914186477661 + }, + { + "auxiliary_loss_clip": 0.01108939, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.01416099, + "balance_loss_mlp": 1.03718436, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 3.661868392990544, + "language_loss": 0.58782631, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60918605, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9740, + "time_per_iteration": 2.4507863521575928 + }, + { + "auxiliary_loss_clip": 0.01106066, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01462674, + "balance_loss_mlp": 1.03621328, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 2.5503677599504138, + "language_loss": 0.74937272, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77070647, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9741, + "time_per_iteration": 2.4589905738830566 + }, + { + "auxiliary_loss_clip": 0.01110252, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01856065, + "balance_loss_mlp": 1.04028082, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 5.17192355324585, + "language_loss": 0.75760782, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77901655, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9742, + "time_per_iteration": 2.4604122638702393 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03765917, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.843175426453247, + "language_loss": 0.74955082, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77090788, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9743, + "time_per_iteration": 2.4604763984680176 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.0171833, + "balance_loss_mlp": 1.03878045, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.7092789137699793, + "language_loss": 0.81049299, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83184063, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9744, + "time_per_iteration": 2.516517162322998 + }, + { + "auxiliary_loss_clip": 0.0111328, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01756728, + "balance_loss_mlp": 1.04009771, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.7947324983718902, + "language_loss": 0.71260583, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73403156, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.73046875, + "step": 9745, + "time_per_iteration": 2.5095736980438232 + }, + { + "auxiliary_loss_clip": 0.01029472, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 0.99980211, + "balance_loss_mlp": 1.00798225, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7288291603374486, + "language_loss": 0.5328598, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55316496, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9746, + "time_per_iteration": 3.1588006019592285 + }, + { + "auxiliary_loss_clip": 0.01111789, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.01581621, + "balance_loss_mlp": 1.04034877, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.1076565833563743, + "language_loss": 0.73041242, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75181913, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9747, + "time_per_iteration": 2.529571533203125 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.02153921, + "balance_loss_mlp": 1.03954244, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 2.1114805581962934, + "language_loss": 0.81232262, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83376622, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9748, + "time_per_iteration": 2.4205257892608643 + }, + { + "auxiliary_loss_clip": 0.01108981, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.01904488, + "balance_loss_mlp": 1.03803837, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.6594717662282998, + "language_loss": 0.71928638, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74069047, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9749, + "time_per_iteration": 2.4881033897399902 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01869583, + "balance_loss_mlp": 1.04076529, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.0326510096801056, + "language_loss": 0.7436285, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.76502097, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9750, + "time_per_iteration": 2.414621353149414 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02279603, + "balance_loss_mlp": 1.03921914, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.743949260258008, + "language_loss": 0.71048808, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73192453, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9751, + "time_per_iteration": 2.4829182624816895 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.02050161, + "balance_loss_mlp": 1.04000425, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.8642101544605258, + "language_loss": 0.74632239, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76778823, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9752, + "time_per_iteration": 2.4715142250061035 + }, + { + "auxiliary_loss_clip": 0.01107296, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.01895666, + "balance_loss_mlp": 1.0386945, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7856678678755609, + "language_loss": 0.77179754, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79318273, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9753, + "time_per_iteration": 2.443422794342041 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01026457, + "balance_loss_clip": 1.01546574, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.9587413882718219, + "language_loss": 0.70530736, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.72662538, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9754, + "time_per_iteration": 2.409973621368408 + }, + { + "auxiliary_loss_clip": 0.01107928, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03827429, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.747136336565704, + "language_loss": 0.72055626, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74194646, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9755, + "time_per_iteration": 2.4600584506988525 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00899053, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7394274912640315, + "language_loss": 0.5697751, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59011161, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21484375, + "step": 9756, + "time_per_iteration": 3.0282156467437744 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114117, + "balance_loss_mlp": 1.03774321, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.7702895540430315, + "language_loss": 0.76155764, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78293204, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 9757, + "time_per_iteration": 2.5391111373901367 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01004494, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00899124, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8655305518018972, + "language_loss": 0.60531819, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62566704, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21484375, + "step": 9758, + "time_per_iteration": 3.0623366832733154 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01773787, + "balance_loss_mlp": 1.03982747, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.3357598656034897, + "language_loss": 0.71766979, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73910952, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 9759, + "time_per_iteration": 2.474400043487549 + }, + { + "auxiliary_loss_clip": 0.0111074, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.0208931, + "balance_loss_mlp": 1.04039979, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5007272591007914, + "language_loss": 0.73244017, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.7538712, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9760, + "time_per_iteration": 4.081261396408081 + }, + { + "auxiliary_loss_clip": 0.01106401, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01556969, + "balance_loss_mlp": 1.03715563, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.8805423527385174, + "language_loss": 0.72491598, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74625897, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9761, + "time_per_iteration": 2.42621111869812 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.0167706, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.837534804487864, + "language_loss": 0.74821299, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.76959074, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9762, + "time_per_iteration": 3.899322032928467 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01706839, + "balance_loss_mlp": 1.03918064, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.367882310541282, + "language_loss": 0.72223246, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74359524, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9763, + "time_per_iteration": 4.356280326843262 + }, + { + "auxiliary_loss_clip": 0.01105096, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01809907, + "balance_loss_mlp": 1.03675938, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.4976833867772195, + "language_loss": 0.79729784, + "learning_rate": 1.53745602625755e-06, + "loss": 0.81864572, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9764, + "time_per_iteration": 3.9194676876068115 + }, + { + "auxiliary_loss_clip": 0.01108839, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.0202508, + "balance_loss_mlp": 1.03856993, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 2.0111563944475908, + "language_loss": 0.78612924, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.80754542, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9765, + "time_per_iteration": 2.53273344039917 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03891206, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8843759319265088, + "language_loss": 0.83718032, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.8585422, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 9766, + "time_per_iteration": 2.467556953430176 + }, + { + "auxiliary_loss_clip": 0.01110103, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.02024257, + "balance_loss_mlp": 1.03847504, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 2.6418409503909674, + "language_loss": 0.69825381, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71967709, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.71875, + "step": 9767, + "time_per_iteration": 2.514695405960083 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02162123, + "balance_loss_mlp": 1.03721809, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.7100990150928812, + "language_loss": 0.6345011, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65592575, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9768, + "time_per_iteration": 2.510586738586426 + }, + { + "auxiliary_loss_clip": 0.01029111, + "auxiliary_loss_mlp": 0.00999867, + "balance_loss_clip": 0.9985916, + "balance_loss_mlp": 1.00760961, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7128870586180143, + "language_loss": 0.53924322, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.559533, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21484375, + "step": 9769, + "time_per_iteration": 3.0710904598236084 + }, + { + "auxiliary_loss_clip": 0.01104834, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.02148712, + "balance_loss_mlp": 1.03672135, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4641633186547043, + "language_loss": 0.70532131, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.7267043, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9770, + "time_per_iteration": 2.516707420349121 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.0193336, + "balance_loss_mlp": 1.03782773, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 3.691664094278214, + "language_loss": 0.67488074, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69626534, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9771, + "time_per_iteration": 2.4816172122955322 + }, + { + "auxiliary_loss_clip": 0.01107891, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02041984, + "balance_loss_mlp": 1.03628254, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.6051808895674682, + "language_loss": 0.65752995, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.67894471, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9772, + "time_per_iteration": 2.5371270179748535 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.02478576, + "balance_loss_mlp": 1.03915095, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.7393863773768459, + "language_loss": 0.74272907, + "learning_rate": 1.534046611017519e-06, + "loss": 0.7642293, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9773, + "time_per_iteration": 2.4879984855651855 + }, + { + "auxiliary_loss_clip": 0.01108784, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.02513528, + "balance_loss_mlp": 1.03829455, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.707979121748391, + "language_loss": 0.53293657, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55440396, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9774, + "time_per_iteration": 2.5072500705718994 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.02257991, + "balance_loss_mlp": 1.03880942, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.48971225310605, + "language_loss": 0.65312964, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.6745823, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9775, + "time_per_iteration": 2.5655953884124756 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.02168775, + "balance_loss_mlp": 1.03676975, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.785458151895031, + "language_loss": 0.73554152, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.7569468, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9776, + "time_per_iteration": 2.54707932472229 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02136469, + "balance_loss_mlp": 1.0373795, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.328878154900185, + "language_loss": 0.74400878, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76541013, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9777, + "time_per_iteration": 2.5013017654418945 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03745651, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.542678345734907, + "language_loss": 0.74238187, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76371026, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 9778, + "time_per_iteration": 2.548445224761963 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.01765513, + "balance_loss_mlp": 1.03588045, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8670942886874708, + "language_loss": 0.70107329, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72240877, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9779, + "time_per_iteration": 2.440385341644287 + }, + { + "auxiliary_loss_clip": 0.01109422, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.02029324, + "balance_loss_mlp": 1.03690886, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8860885981569304, + "language_loss": 0.67181754, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69323874, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 9780, + "time_per_iteration": 2.5105738639831543 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02308023, + "balance_loss_mlp": 1.03872418, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 3.148071574180809, + "language_loss": 0.72608495, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74752629, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9781, + "time_per_iteration": 2.4174652099609375 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.01946235, + "balance_loss_mlp": 1.03562713, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.4505377017032317, + "language_loss": 0.70405555, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72542155, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9782, + "time_per_iteration": 2.4488813877105713 + }, + { + "auxiliary_loss_clip": 0.01108141, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.02064204, + "balance_loss_mlp": 1.03547001, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 3.528130932430564, + "language_loss": 0.70414114, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72555161, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 9783, + "time_per_iteration": 2.411940813064575 + }, + { + "auxiliary_loss_clip": 0.01109132, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.0186553, + "balance_loss_mlp": 1.03764033, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 2.8122189742296952, + "language_loss": 0.6903708, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71178293, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 9784, + "time_per_iteration": 2.4809060096740723 + }, + { + "auxiliary_loss_clip": 0.01107726, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.02227104, + "balance_loss_mlp": 1.03585327, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.976987554101205, + "language_loss": 0.69485259, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71627975, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9785, + "time_per_iteration": 2.5458383560180664 + }, + { + "auxiliary_loss_clip": 0.0110444, + "auxiliary_loss_mlp": 0.01026297, + "balance_loss_clip": 1.01477504, + "balance_loss_mlp": 1.03624511, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.0068567513814375, + "language_loss": 0.77542102, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79672837, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 9786, + "time_per_iteration": 2.4269275665283203 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02173829, + "balance_loss_mlp": 1.03681958, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.4388452349288328, + "language_loss": 0.79175329, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81317246, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 9787, + "time_per_iteration": 2.441265344619751 + }, + { + "auxiliary_loss_clip": 0.01106621, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.03677058, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.596428038291934, + "language_loss": 0.66514194, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68651974, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9788, + "time_per_iteration": 2.4632344245910645 + }, + { + "auxiliary_loss_clip": 0.01106001, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01877332, + "balance_loss_mlp": 1.03787911, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.066265402471891, + "language_loss": 0.79951847, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82088816, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9789, + "time_per_iteration": 2.4486775398254395 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.01579237, + "balance_loss_mlp": 1.03722358, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.510117689081276, + "language_loss": 0.70817208, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72950107, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9790, + "time_per_iteration": 2.474634885787964 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02120566, + "balance_loss_mlp": 1.0374043, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.9043586619327855, + "language_loss": 0.83184004, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85322857, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9791, + "time_per_iteration": 2.4930591583251953 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.02189827, + "balance_loss_mlp": 1.04020619, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5964011084944127, + "language_loss": 0.76287472, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78432798, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9792, + "time_per_iteration": 2.572164297103882 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.0383172, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 1.954465265842666, + "language_loss": 0.69085598, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71227252, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.72265625, + "step": 9793, + "time_per_iteration": 2.440532684326172 + }, + { + "auxiliary_loss_clip": 0.01105715, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02001381, + "balance_loss_mlp": 1.03754866, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 2.2945820531528547, + "language_loss": 0.60200524, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.6233902, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9794, + "time_per_iteration": 2.4281349182128906 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02265191, + "balance_loss_mlp": 1.03800488, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.8105141483242522, + "language_loss": 0.65209466, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67352962, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9795, + "time_per_iteration": 2.4471983909606934 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02589679, + "balance_loss_mlp": 1.03751063, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.6472816848345888, + "language_loss": 0.74171197, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76313925, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9796, + "time_per_iteration": 2.4404211044311523 + }, + { + "auxiliary_loss_clip": 0.0110878, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01761484, + "balance_loss_mlp": 1.04002237, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.4898681844876358, + "language_loss": 0.83064574, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85202533, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9797, + "time_per_iteration": 2.487971544265747 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03718829, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.804693100831568, + "language_loss": 0.78741366, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.80876774, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 9798, + "time_per_iteration": 2.4391119480133057 + }, + { + "auxiliary_loss_clip": 0.01104678, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.01994312, + "balance_loss_mlp": 1.03718722, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.097614269824193, + "language_loss": 0.74100447, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76236397, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9799, + "time_per_iteration": 2.444185972213745 + }, + { + "auxiliary_loss_clip": 0.01110656, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01852775, + "balance_loss_mlp": 1.03889656, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.9705578864506654, + "language_loss": 0.76078779, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78221321, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 9800, + "time_per_iteration": 2.4564571380615234 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.02804899, + "balance_loss_mlp": 1.03771484, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.9698106702703237, + "language_loss": 0.78824806, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8097297, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9801, + "time_per_iteration": 2.439195156097412 + }, + { + "auxiliary_loss_clip": 0.01109337, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.01721966, + "balance_loss_mlp": 1.03768528, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.596016426383407, + "language_loss": 0.65912932, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68051648, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 9802, + "time_per_iteration": 3.8562896251678467 + }, + { + "auxiliary_loss_clip": 0.01106914, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.01552522, + "balance_loss_mlp": 1.03888416, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.5756682227023782, + "language_loss": 0.78167737, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.8030206, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9803, + "time_per_iteration": 2.4531607627868652 + }, + { + "auxiliary_loss_clip": 0.01108754, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.02300191, + "balance_loss_mlp": 1.03849792, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.5070835087317231, + "language_loss": 0.7292577, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75070107, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9804, + "time_per_iteration": 3.909280776977539 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03996158, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.9252543926260512, + "language_loss": 0.7480545, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76943576, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9805, + "time_per_iteration": 3.92484712600708 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01807201, + "balance_loss_mlp": 1.03880858, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 2.2161041024358736, + "language_loss": 0.7798723, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8013162, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9806, + "time_per_iteration": 3.958747625350952 + }, + { + "auxiliary_loss_clip": 0.01106773, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.03678048, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 2.028844636014754, + "language_loss": 0.77013928, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79151416, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9807, + "time_per_iteration": 2.437091827392578 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01834261, + "balance_loss_mlp": 1.040411, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 2.123691808114849, + "language_loss": 0.74406278, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76549083, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9808, + "time_per_iteration": 2.4456939697265625 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.01794887, + "balance_loss_mlp": 1.03927052, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 1.9040797268830973, + "language_loss": 0.71715617, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.73858464, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 9809, + "time_per_iteration": 2.4555907249450684 + }, + { + "auxiliary_loss_clip": 0.01111034, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.02066374, + "balance_loss_mlp": 1.03881156, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 2.6575599068105262, + "language_loss": 0.81872356, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84017026, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9810, + "time_per_iteration": 2.546018600463867 + }, + { + "auxiliary_loss_clip": 0.01105843, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.01815283, + "balance_loss_mlp": 1.03850091, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.679981614097192, + "language_loss": 0.8076582, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8290174, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 9811, + "time_per_iteration": 2.432685613632202 + }, + { + "auxiliary_loss_clip": 0.01110453, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01449776, + "balance_loss_mlp": 1.03924918, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.903117615206719, + "language_loss": 0.76666933, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78804982, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9812, + "time_per_iteration": 2.45906138420105 + }, + { + "auxiliary_loss_clip": 0.01107232, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01794147, + "balance_loss_mlp": 1.03932881, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 3.543593991514859, + "language_loss": 0.70407474, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72543478, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9813, + "time_per_iteration": 2.417073965072632 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01969576, + "balance_loss_mlp": 1.03846037, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.496524946754694, + "language_loss": 0.72230315, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74370211, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9814, + "time_per_iteration": 2.527130365371704 + }, + { + "auxiliary_loss_clip": 0.0110797, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.01768374, + "balance_loss_mlp": 1.03837872, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.8734717265521494, + "language_loss": 0.78583348, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80721277, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9815, + "time_per_iteration": 2.4397730827331543 + }, + { + "auxiliary_loss_clip": 0.01114156, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02197468, + "balance_loss_mlp": 1.03963876, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 2.0868241481245415, + "language_loss": 0.7557171, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.7772131, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9816, + "time_per_iteration": 2.443861484527588 + }, + { + "auxiliary_loss_clip": 0.01110119, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.02063835, + "balance_loss_mlp": 1.04108596, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.234392841889587, + "language_loss": 0.81303239, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83446503, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9817, + "time_per_iteration": 2.4248719215393066 + }, + { + "auxiliary_loss_clip": 0.01109425, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.02112818, + "balance_loss_mlp": 1.03941548, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.5738429375950187, + "language_loss": 0.76401961, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78544545, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9818, + "time_per_iteration": 2.445507526397705 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.01861358, + "balance_loss_mlp": 1.03634679, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.8418500679377416, + "language_loss": 0.66351467, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68486011, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9819, + "time_per_iteration": 2.4585890769958496 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01972878, + "balance_loss_mlp": 1.03955185, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.5583203498776486, + "language_loss": 0.77830237, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.79972136, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9820, + "time_per_iteration": 2.452444314956665 + }, + { + "auxiliary_loss_clip": 0.01033068, + "auxiliary_loss_mlp": 0.01003995, + "balance_loss_clip": 1.00268924, + "balance_loss_mlp": 1.01099396, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9230258023741272, + "language_loss": 0.65167463, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67204523, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22070312, + "step": 9821, + "time_per_iteration": 3.0410289764404297 + }, + { + "auxiliary_loss_clip": 0.01106857, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.0159936, + "balance_loss_mlp": 1.03887093, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.8405567429237777, + "language_loss": 0.61040848, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63175792, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9822, + "time_per_iteration": 2.4597485065460205 + }, + { + "auxiliary_loss_clip": 0.01110158, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.02598214, + "balance_loss_mlp": 1.03878164, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.8163106241475082, + "language_loss": 0.82910824, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.850595, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9823, + "time_per_iteration": 2.4342074394226074 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.02025676, + "balance_loss_mlp": 1.03823602, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 1.9061097186750977, + "language_loss": 0.73051912, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75191379, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9824, + "time_per_iteration": 2.474583387374878 + }, + { + "auxiliary_loss_clip": 0.01115754, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.02118862, + "balance_loss_mlp": 1.03907609, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.117093757339989, + "language_loss": 0.82486725, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84637952, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 9825, + "time_per_iteration": 2.4499030113220215 + }, + { + "auxiliary_loss_clip": 0.01104731, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01781034, + "balance_loss_mlp": 1.03812122, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7674632389005596, + "language_loss": 0.77194965, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79328513, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9826, + "time_per_iteration": 2.490628480911255 + }, + { + "auxiliary_loss_clip": 0.01106346, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.01523519, + "balance_loss_mlp": 1.03757071, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.8211120400501501, + "language_loss": 0.72350824, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74483871, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9827, + "time_per_iteration": 2.496574640274048 + }, + { + "auxiliary_loss_clip": 0.01107742, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01782858, + "balance_loss_mlp": 1.0374589, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.7932913826709562, + "language_loss": 0.79741728, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81878424, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.703125, + "step": 9828, + "time_per_iteration": 2.51045298576355 + }, + { + "auxiliary_loss_clip": 0.01111624, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02125204, + "balance_loss_mlp": 1.03958178, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.271428998540672, + "language_loss": 0.88056707, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90202534, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9829, + "time_per_iteration": 2.4169514179229736 + }, + { + "auxiliary_loss_clip": 0.01031439, + "auxiliary_loss_mlp": 0.00999905, + "balance_loss_clip": 0.99870729, + "balance_loss_mlp": 1.00956726, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7537251091943264, + "language_loss": 0.57855141, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59886479, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.21875, + "step": 9830, + "time_per_iteration": 2.996295928955078 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.01852536, + "balance_loss_mlp": 1.04140687, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.0665850759749813, + "language_loss": 0.76163888, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78313708, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9831, + "time_per_iteration": 2.461068868637085 + }, + { + "auxiliary_loss_clip": 0.01105452, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01565278, + "balance_loss_mlp": 1.03923118, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.602158251769988, + "language_loss": 0.7790612, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80039072, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 9832, + "time_per_iteration": 2.4806432723999023 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.01554728, + "balance_loss_mlp": 1.03533232, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.7748958571682212, + "language_loss": 0.83552635, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85685176, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9833, + "time_per_iteration": 2.436558961868286 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03818607, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.512608687160236, + "language_loss": 0.74505258, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76645797, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9834, + "time_per_iteration": 2.497488260269165 + }, + { + "auxiliary_loss_clip": 0.01107604, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.01680338, + "balance_loss_mlp": 1.03707302, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.15246332260658, + "language_loss": 0.78111219, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.8024776, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9835, + "time_per_iteration": 2.428570032119751 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.02080131, + "balance_loss_mlp": 1.03782153, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.790579015547894, + "language_loss": 0.74016017, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76157123, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9836, + "time_per_iteration": 2.4571895599365234 + }, + { + "auxiliary_loss_clip": 0.01108454, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01332974, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.0887710674316335, + "language_loss": 0.81834614, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.83968431, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9837, + "time_per_iteration": 2.425869941711426 + }, + { + "auxiliary_loss_clip": 0.01109463, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.01760364, + "balance_loss_mlp": 1.03828216, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.6633412669476784, + "language_loss": 0.79169023, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81309044, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9838, + "time_per_iteration": 2.480945348739624 + }, + { + "auxiliary_loss_clip": 0.01109443, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03814876, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.9639883281700399, + "language_loss": 0.6955409, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.7169646, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9839, + "time_per_iteration": 2.445032835006714 + }, + { + "auxiliary_loss_clip": 0.01109116, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02804112, + "balance_loss_mlp": 1.03763521, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.156057098485451, + "language_loss": 0.65970773, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68120515, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9840, + "time_per_iteration": 2.4208333492279053 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.01920867, + "balance_loss_mlp": 1.03765261, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.6889823147578333, + "language_loss": 0.81775278, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83916378, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9841, + "time_per_iteration": 2.485783576965332 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.0175122, + "balance_loss_mlp": 1.03609967, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5545668932192243, + "language_loss": 0.68891448, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71024531, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9842, + "time_per_iteration": 2.4090652465820312 + }, + { + "auxiliary_loss_clip": 0.01108304, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.03697038, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.8995177421561278, + "language_loss": 0.8258518, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84725767, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9843, + "time_per_iteration": 2.456085443496704 + }, + { + "auxiliary_loss_clip": 0.01107968, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01810205, + "balance_loss_mlp": 1.03701758, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.3414678440212953, + "language_loss": 0.81883448, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84022528, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9844, + "time_per_iteration": 3.834216833114624 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01842773, + "balance_loss_mlp": 1.04004455, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.8185302816606077, + "language_loss": 0.74449736, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76591957, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9845, + "time_per_iteration": 2.409029960632324 + }, + { + "auxiliary_loss_clip": 0.01108139, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01324248, + "balance_loss_mlp": 1.03682494, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 2.2228008907542027, + "language_loss": 0.63848257, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65982717, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9846, + "time_per_iteration": 3.994704246520996 + }, + { + "auxiliary_loss_clip": 0.01100388, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.01529002, + "balance_loss_mlp": 1.03501678, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.7549171077463366, + "language_loss": 0.76315683, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78442299, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9847, + "time_per_iteration": 3.815723419189453 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01892579, + "balance_loss_mlp": 1.03723776, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.7391013556086516, + "language_loss": 0.6229955, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.6443814, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9848, + "time_per_iteration": 3.9868550300598145 + }, + { + "auxiliary_loss_clip": 0.01108795, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02532125, + "balance_loss_mlp": 1.03819513, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.784596822173483, + "language_loss": 0.75762534, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77908659, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9849, + "time_per_iteration": 2.4613027572631836 + }, + { + "auxiliary_loss_clip": 0.01108412, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02006471, + "balance_loss_mlp": 1.0370928, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.6562680086624124, + "language_loss": 0.75594199, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77735424, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9850, + "time_per_iteration": 2.5371382236480713 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01676035, + "balance_loss_mlp": 1.03672051, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7347218503083297, + "language_loss": 0.7573396, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.7786963, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9851, + "time_per_iteration": 2.4500503540039062 + }, + { + "auxiliary_loss_clip": 0.01106705, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.0179739, + "balance_loss_mlp": 1.03609896, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.818113501506117, + "language_loss": 0.70232719, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72369695, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 9852, + "time_per_iteration": 2.50327205657959 + }, + { + "auxiliary_loss_clip": 0.01112321, + "auxiliary_loss_mlp": 0.01040222, + "balance_loss_clip": 1.02710271, + "balance_loss_mlp": 1.03861785, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 38.24844963287624, + "language_loss": 0.8025564, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82408178, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 9853, + "time_per_iteration": 2.443661689758301 + }, + { + "auxiliary_loss_clip": 0.01103448, + "auxiliary_loss_mlp": 0.01025904, + "balance_loss_clip": 1.01480556, + "balance_loss_mlp": 1.03603673, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.502563314800498, + "language_loss": 0.67641807, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69771153, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9854, + "time_per_iteration": 2.5323755741119385 + }, + { + "auxiliary_loss_clip": 0.01105063, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01371408, + "balance_loss_mlp": 1.03610444, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.6522001385368033, + "language_loss": 0.88777542, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90908301, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9855, + "time_per_iteration": 2.4309167861938477 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01936722, + "balance_loss_mlp": 1.03590918, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.7115668008760792, + "language_loss": 0.86635554, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88768005, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 9856, + "time_per_iteration": 2.464066743850708 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02256656, + "balance_loss_mlp": 1.03562045, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 2.1473398743532153, + "language_loss": 0.77584958, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79724526, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 9857, + "time_per_iteration": 2.4102070331573486 + }, + { + "auxiliary_loss_clip": 0.01109396, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03954232, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 1.9751188115052367, + "language_loss": 0.64351666, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66494453, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9858, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01884377, + "balance_loss_mlp": 1.03700852, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.9049315760209506, + "language_loss": 0.77045393, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79180634, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 9859, + "time_per_iteration": 2.478782892227173 + }, + { + "auxiliary_loss_clip": 0.01110235, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.02082372, + "balance_loss_mlp": 1.03882456, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.1565186381803194, + "language_loss": 0.75153667, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77297652, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9860, + "time_per_iteration": 2.4513912200927734 + }, + { + "auxiliary_loss_clip": 0.01106266, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02277529, + "balance_loss_mlp": 1.03840578, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6305970530500205, + "language_loss": 0.76227921, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78367937, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 9861, + "time_per_iteration": 2.474095344543457 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.0131923, + "balance_loss_mlp": 1.03595328, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.8413108938997076, + "language_loss": 0.70368218, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72496319, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 9862, + "time_per_iteration": 2.539903402328491 + }, + { + "auxiliary_loss_clip": 0.0110657, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01868796, + "balance_loss_mlp": 1.03706694, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.8355876983877193, + "language_loss": 0.77771485, + "learning_rate": 1.500032899685832e-06, + "loss": 0.7990849, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9863, + "time_per_iteration": 2.4712796211242676 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.02583861, + "balance_loss_mlp": 1.03730559, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.8648903136261632, + "language_loss": 0.70763469, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72908574, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9864, + "time_per_iteration": 2.52478289604187 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.02201343, + "balance_loss_mlp": 1.03711009, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 2.2141122969684655, + "language_loss": 0.67234761, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69376296, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 9865, + "time_per_iteration": 2.4957449436187744 + }, + { + "auxiliary_loss_clip": 0.0110929, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01892328, + "balance_loss_mlp": 1.03758049, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 1.8936144812420768, + "language_loss": 0.78334385, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.8047536, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 9866, + "time_per_iteration": 2.4394681453704834 + }, + { + "auxiliary_loss_clip": 0.01105609, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.01867127, + "balance_loss_mlp": 1.03786838, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 1.98454003485575, + "language_loss": 0.72037029, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.7417264, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9867, + "time_per_iteration": 2.5107383728027344 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01795018, + "balance_loss_mlp": 1.03910947, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.538584883762445, + "language_loss": 0.66726553, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68864822, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9868, + "time_per_iteration": 2.5143752098083496 + }, + { + "auxiliary_loss_clip": 0.01107645, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03726101, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5720110660148519, + "language_loss": 0.75083476, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77224427, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9869, + "time_per_iteration": 2.4784231185913086 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.0103956, + "balance_loss_clip": 1.02690041, + "balance_loss_mlp": 1.0391326, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.6442009630814416, + "language_loss": 0.74131197, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76281238, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9870, + "time_per_iteration": 2.8396053314208984 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.04010868, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 1.9765481299651093, + "language_loss": 0.71421361, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7355839, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9871, + "time_per_iteration": 2.460695505142212 + }, + { + "auxiliary_loss_clip": 0.01110046, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.03879905, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 1.9723601672672642, + "language_loss": 0.74131697, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76274526, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9872, + "time_per_iteration": 2.4877848625183105 + }, + { + "auxiliary_loss_clip": 0.01111497, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02359688, + "balance_loss_mlp": 1.03926826, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.142318153174813, + "language_loss": 0.78675568, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.80824012, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 9873, + "time_per_iteration": 2.4480934143066406 + }, + { + "auxiliary_loss_clip": 0.01109102, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02197838, + "balance_loss_mlp": 1.03843832, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5306423792742176, + "language_loss": 0.85011673, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87155473, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9874, + "time_per_iteration": 2.5098774433135986 + }, + { + "auxiliary_loss_clip": 0.01030749, + "auxiliary_loss_mlp": 0.01001619, + "balance_loss_clip": 1.00044489, + "balance_loss_mlp": 1.00908446, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.6973173617166174, + "language_loss": 0.60004687, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62037057, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21679688, + "step": 9875, + "time_per_iteration": 3.1099135875701904 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.01980412, + "balance_loss_mlp": 1.0373013, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.0699471238582943, + "language_loss": 0.77501059, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.7964499, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9876, + "time_per_iteration": 2.466031551361084 + }, + { + "auxiliary_loss_clip": 0.01103172, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01652348, + "balance_loss_mlp": 1.03654408, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.5589386174362272, + "language_loss": 0.75830436, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77961862, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9877, + "time_per_iteration": 2.4772722721099854 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.01892304, + "balance_loss_mlp": 1.03673589, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 4.77912842405454, + "language_loss": 0.81212896, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83353043, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9878, + "time_per_iteration": 2.511408805847168 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.02462077, + "balance_loss_mlp": 1.0378468, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7027842827521733, + "language_loss": 0.71123505, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73268974, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9879, + "time_per_iteration": 2.6537530422210693 + }, + { + "auxiliary_loss_clip": 0.01107077, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01973653, + "balance_loss_mlp": 1.03814936, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.4837097454893722, + "language_loss": 0.5739696, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59536058, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9880, + "time_per_iteration": 2.487082004547119 + }, + { + "auxiliary_loss_clip": 0.01107055, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.02435601, + "balance_loss_mlp": 1.03724837, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.7845732450958962, + "language_loss": 0.76980609, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79124796, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9881, + "time_per_iteration": 2.5019240379333496 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01437569, + "balance_loss_mlp": 1.03524506, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.214394269583833, + "language_loss": 0.82820934, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84953332, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9882, + "time_per_iteration": 2.4258036613464355 + }, + { + "auxiliary_loss_clip": 0.01106542, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.02321863, + "balance_loss_mlp": 1.03781402, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.5324902309588855, + "language_loss": 0.79348171, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81489801, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9883, + "time_per_iteration": 2.4191815853118896 + }, + { + "auxiliary_loss_clip": 0.01112982, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0214777, + "balance_loss_mlp": 1.03999424, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.7967272432241739, + "language_loss": 0.74134135, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.7628206, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9884, + "time_per_iteration": 2.4599032402038574 + }, + { + "auxiliary_loss_clip": 0.01112156, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.0181067, + "balance_loss_mlp": 1.04232001, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 3.4474311080183964, + "language_loss": 0.6639331, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68535531, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 9885, + "time_per_iteration": 3.940159797668457 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02392602, + "balance_loss_mlp": 1.03891098, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.562196250157405, + "language_loss": 0.77456462, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79600191, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9886, + "time_per_iteration": 2.4958837032318115 + }, + { + "auxiliary_loss_clip": 0.01029578, + "auxiliary_loss_mlp": 0.009997, + "balance_loss_clip": 0.99845427, + "balance_loss_mlp": 1.00789237, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8479500751523403, + "language_loss": 0.64580774, + "learning_rate": 1.490988081420423e-06, + "loss": 0.6661005, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21679688, + "step": 9887, + "time_per_iteration": 4.312393426895142 + }, + { + "auxiliary_loss_clip": 0.01106228, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01911473, + "balance_loss_mlp": 1.03743696, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.9767325567336362, + "language_loss": 0.69172513, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71310121, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9888, + "time_per_iteration": 3.8631362915039062 + }, + { + "auxiliary_loss_clip": 0.01108213, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5956528037649322, + "language_loss": 0.79466522, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81605208, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9889, + "time_per_iteration": 4.0321431159973145 + }, + { + "auxiliary_loss_clip": 0.01105021, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03607225, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.529319229595301, + "language_loss": 0.70732993, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72869068, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9890, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01110328, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.01994216, + "balance_loss_mlp": 1.03921902, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 2.2570879506032933, + "language_loss": 0.69334114, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71477234, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9891, + "time_per_iteration": 2.4280505180358887 + }, + { + "auxiliary_loss_clip": 0.01106776, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02376795, + "balance_loss_mlp": 1.03809762, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.752140694177181, + "language_loss": 0.53531826, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55674696, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9892, + "time_per_iteration": 2.4815757274627686 + }, + { + "auxiliary_loss_clip": 0.01030384, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00095928, + "balance_loss_mlp": 1.00874603, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6588951163028871, + "language_loss": 0.54535234, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56567693, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21679688, + "step": 9893, + "time_per_iteration": 3.1101529598236084 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.01892543, + "balance_loss_mlp": 1.03811431, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.588107459430707, + "language_loss": 0.74231315, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76368201, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9894, + "time_per_iteration": 2.4519400596618652 + }, + { + "auxiliary_loss_clip": 0.01106074, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.02005649, + "balance_loss_mlp": 1.03685939, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.6911288792838162, + "language_loss": 0.77848423, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79986584, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9895, + "time_per_iteration": 2.524150848388672 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.01923883, + "balance_loss_mlp": 1.0391717, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6627914614590094, + "language_loss": 0.79355633, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.814978, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9896, + "time_per_iteration": 2.450514078140259 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.0379312, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.56691412182982, + "language_loss": 0.83697438, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.8583793, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9897, + "time_per_iteration": 2.499427556991577 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02043986, + "balance_loss_mlp": 1.04021525, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.7628400615055348, + "language_loss": 0.70908117, + "learning_rate": 1.486846243389939e-06, + "loss": 0.7305249, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9898, + "time_per_iteration": 2.450711488723755 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.02481782, + "balance_loss_mlp": 1.03905582, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.840239375448059, + "language_loss": 0.64112437, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66265255, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 9899, + "time_per_iteration": 2.5394744873046875 + }, + { + "auxiliary_loss_clip": 0.01109128, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01592183, + "balance_loss_mlp": 1.04008675, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.879978941191363, + "language_loss": 0.71715653, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.73851436, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9900, + "time_per_iteration": 2.4623067378997803 + }, + { + "auxiliary_loss_clip": 0.01107194, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.01654577, + "balance_loss_mlp": 1.03926349, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.9859766918367532, + "language_loss": 0.84489024, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86625552, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9901, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.01028301, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.00249529, + "balance_loss_mlp": 1.00672269, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8098587011957621, + "language_loss": 0.58273184, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60305208, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.21582031, + "step": 9902, + "time_per_iteration": 2.9000015258789062 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01876402, + "balance_loss_mlp": 1.03859127, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 3.08671627053405, + "language_loss": 0.77136552, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79277885, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9903, + "time_per_iteration": 2.5076375007629395 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.04097402, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.7111155417857251, + "language_loss": 0.77616894, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79764313, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9904, + "time_per_iteration": 2.5716845989227295 + }, + { + "auxiliary_loss_clip": 0.01110151, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02405953, + "balance_loss_mlp": 1.03790653, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 2.2036474032145192, + "language_loss": 0.72382712, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74529308, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9905, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01789069, + "balance_loss_mlp": 1.03853083, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.6203597758298474, + "language_loss": 0.69817066, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71957242, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9906, + "time_per_iteration": 2.4373247623443604 + }, + { + "auxiliary_loss_clip": 0.01110789, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.02259731, + "balance_loss_mlp": 1.03987217, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.782354761153575, + "language_loss": 0.7491982, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77065903, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9907, + "time_per_iteration": 2.5548195838928223 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.02353668, + "balance_loss_mlp": 1.0388813, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.601142913290667, + "language_loss": 0.67155874, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69300842, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9908, + "time_per_iteration": 2.5727956295013428 + }, + { + "auxiliary_loss_clip": 0.01108392, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.01753211, + "balance_loss_mlp": 1.03904438, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 5.1100613292928365, + "language_loss": 0.76492268, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78630114, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9909, + "time_per_iteration": 2.459608554840088 + }, + { + "auxiliary_loss_clip": 0.01028544, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_clip": 1.00416493, + "balance_loss_mlp": 1.00715542, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9275868367088792, + "language_loss": 0.73427647, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75461495, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21386719, + "step": 9910, + "time_per_iteration": 3.1051745414733887 + }, + { + "auxiliary_loss_clip": 0.01108818, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.01981187, + "balance_loss_mlp": 1.03741884, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.6458105124951614, + "language_loss": 0.69844317, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71986043, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9911, + "time_per_iteration": 2.4647021293640137 + }, + { + "auxiliary_loss_clip": 0.01116428, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.02031708, + "balance_loss_mlp": 1.04145718, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.8630263408862686, + "language_loss": 0.65476716, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.6762681, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 9912, + "time_per_iteration": 2.4077272415161133 + }, + { + "auxiliary_loss_clip": 0.01108551, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02385449, + "balance_loss_mlp": 1.03806984, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.0476871057930772, + "language_loss": 0.73610109, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75755352, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9913, + "time_per_iteration": 2.5155045986175537 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.0178144, + "balance_loss_mlp": 1.03791463, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.0765652786465885, + "language_loss": 0.79696703, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81841141, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 9914, + "time_per_iteration": 2.4950027465820312 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.0189786, + "balance_loss_mlp": 1.03856075, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.9745402695948293, + "language_loss": 0.67218065, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69356596, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9915, + "time_per_iteration": 2.43723726272583 + }, + { + "auxiliary_loss_clip": 0.01107786, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.02319074, + "balance_loss_mlp": 1.03634763, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.613453800947639, + "language_loss": 0.78928566, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81071782, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9916, + "time_per_iteration": 2.456350088119507 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.01933253, + "balance_loss_mlp": 1.03744936, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.7690461818627004, + "language_loss": 0.82394695, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84535682, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9917, + "time_per_iteration": 2.469238758087158 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.02383804, + "balance_loss_mlp": 1.03837276, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.817824058021054, + "language_loss": 0.77982944, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.8012656, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9918, + "time_per_iteration": 2.4436004161834717 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02182257, + "balance_loss_mlp": 1.0390811, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.422582146168897, + "language_loss": 0.78566158, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80710077, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9919, + "time_per_iteration": 2.5787289142608643 + }, + { + "auxiliary_loss_clip": 0.01107781, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.02094173, + "balance_loss_mlp": 1.0381664, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.9239790966111896, + "language_loss": 0.77425951, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79567927, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 9920, + "time_per_iteration": 2.4440083503723145 + }, + { + "auxiliary_loss_clip": 0.01113744, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.01843953, + "balance_loss_mlp": 1.04212332, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.2435260632361733, + "language_loss": 0.82452321, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84596634, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9921, + "time_per_iteration": 2.456138849258423 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.02021408, + "balance_loss_mlp": 1.0373764, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.9967408520895134, + "language_loss": 0.80682462, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82823324, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9922, + "time_per_iteration": 2.4144599437713623 + }, + { + "auxiliary_loss_clip": 0.0110795, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01693821, + "balance_loss_mlp": 1.03790641, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.7485306495183626, + "language_loss": 0.77080536, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79218084, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9923, + "time_per_iteration": 2.489145517349243 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02174497, + "balance_loss_mlp": 1.04084301, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.7680593419575392, + "language_loss": 0.75725371, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.77876449, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9924, + "time_per_iteration": 2.4216740131378174 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.0190742, + "balance_loss_mlp": 1.03832626, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 3.198852886281723, + "language_loss": 0.6646719, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68604732, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 9925, + "time_per_iteration": 2.4475882053375244 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.01986837, + "balance_loss_mlp": 1.04157531, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.375187864026988, + "language_loss": 0.71979719, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74121475, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9926, + "time_per_iteration": 2.4132394790649414 + }, + { + "auxiliary_loss_clip": 0.01111749, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.02014971, + "balance_loss_mlp": 1.03978753, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.812838696961727, + "language_loss": 0.70522958, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.7266798, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9927, + "time_per_iteration": 4.071920156478882 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.01779175, + "balance_loss_mlp": 1.03788543, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.756068652476383, + "language_loss": 0.63428164, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65571564, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 9928, + "time_per_iteration": 2.616556406021118 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.01774395, + "balance_loss_mlp": 1.0362494, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.5985801618436777, + "language_loss": 0.69484866, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71619892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9929, + "time_per_iteration": 3.929401397705078 + }, + { + "auxiliary_loss_clip": 0.01106506, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.02314126, + "balance_loss_mlp": 1.0390749, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.8723634053132125, + "language_loss": 0.7651577, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78656977, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9930, + "time_per_iteration": 3.9201314449310303 + }, + { + "auxiliary_loss_clip": 0.01114181, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01845503, + "balance_loss_mlp": 1.04086351, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.7493285690141849, + "language_loss": 0.69032001, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71178329, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9931, + "time_per_iteration": 3.879492998123169 + }, + { + "auxiliary_loss_clip": 0.01027027, + "auxiliary_loss_mlp": 0.00997139, + "balance_loss_clip": 0.99597675, + "balance_loss_mlp": 1.00581264, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8633082810339764, + "language_loss": 0.64247859, + "learning_rate": 1.474059168257065e-06, + "loss": 0.6627202, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21289062, + "step": 9932, + "time_per_iteration": 2.985929489135742 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01604247, + "balance_loss_mlp": 1.03876853, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.8784919283093424, + "language_loss": 0.74257267, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76396132, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9933, + "time_per_iteration": 2.4789366722106934 + }, + { + "auxiliary_loss_clip": 0.01027236, + "auxiliary_loss_mlp": 0.00998624, + "balance_loss_clip": 0.997509, + "balance_loss_mlp": 1.00592136, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6667374312128803, + "language_loss": 0.51967168, + "learning_rate": 1.473307699867203e-06, + "loss": 0.53993034, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21289062, + "step": 9934, + "time_per_iteration": 3.181849956512451 + }, + { + "auxiliary_loss_clip": 0.01027661, + "auxiliary_loss_mlp": 0.00997349, + "balance_loss_clip": 0.99616891, + "balance_loss_mlp": 1.00641167, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8444164965298677, + "language_loss": 0.54164159, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56189167, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.21289062, + "step": 9935, + "time_per_iteration": 2.997821807861328 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.02102828, + "balance_loss_mlp": 1.03731823, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5699606989571269, + "language_loss": 0.65541828, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67684245, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9936, + "time_per_iteration": 2.533317804336548 + }, + { + "auxiliary_loss_clip": 0.01110253, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.02026367, + "balance_loss_mlp": 1.03937888, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.0123537966767797, + "language_loss": 0.67731905, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69874215, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9937, + "time_per_iteration": 2.4379465579986572 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.02101064, + "balance_loss_mlp": 1.03899479, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 3.133342754143776, + "language_loss": 0.77174151, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79320574, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9938, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.01110044, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01708603, + "balance_loss_mlp": 1.03813004, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.6192850653818303, + "language_loss": 0.75987661, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78127742, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9939, + "time_per_iteration": 2.477731227874756 + }, + { + "auxiliary_loss_clip": 0.01113496, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01565337, + "balance_loss_mlp": 1.03811717, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.2637964874634124, + "language_loss": 0.6840167, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70545495, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 9940, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.02150035, + "balance_loss_mlp": 1.03630126, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3031499437689418, + "language_loss": 0.70227146, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72364092, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9941, + "time_per_iteration": 2.644956111907959 + }, + { + "auxiliary_loss_clip": 0.01103617, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02042711, + "balance_loss_mlp": 1.0345757, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 2.0310172288776456, + "language_loss": 0.77255404, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79392433, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 9942, + "time_per_iteration": 2.4575772285461426 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.02815676, + "balance_loss_mlp": 1.03664815, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.8744137632140625, + "language_loss": 0.7585178, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78000808, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9943, + "time_per_iteration": 2.4413061141967773 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.03699136, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.7396443017276344, + "language_loss": 0.61821425, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.63956803, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 9944, + "time_per_iteration": 2.569403886795044 + }, + { + "auxiliary_loss_clip": 0.01110079, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0391618, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6935047887113677, + "language_loss": 0.72621685, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74766988, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9945, + "time_per_iteration": 2.5811283588409424 + }, + { + "auxiliary_loss_clip": 0.0110514, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02017426, + "balance_loss_mlp": 1.03536916, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 2.0883326121528443, + "language_loss": 0.67156124, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69294119, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9946, + "time_per_iteration": 2.513643503189087 + }, + { + "auxiliary_loss_clip": 0.01111839, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.02280676, + "balance_loss_mlp": 1.03886974, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.0799446912413386, + "language_loss": 0.88996196, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91144222, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9947, + "time_per_iteration": 2.4069466590881348 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.02501893, + "balance_loss_mlp": 1.03634834, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.664735448435926, + "language_loss": 0.72050726, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74189186, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 9948, + "time_per_iteration": 2.474961280822754 + }, + { + "auxiliary_loss_clip": 0.01107668, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01749587, + "balance_loss_mlp": 1.03676891, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8018456141940389, + "language_loss": 0.89439249, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91578257, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 9949, + "time_per_iteration": 2.455151319503784 + }, + { + "auxiliary_loss_clip": 0.0110613, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.01458669, + "balance_loss_mlp": 1.03746963, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 1.9594093526491967, + "language_loss": 0.70425475, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72557819, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9950, + "time_per_iteration": 2.479177474975586 + }, + { + "auxiliary_loss_clip": 0.01106992, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.01816094, + "balance_loss_mlp": 1.03653646, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.7839667170115563, + "language_loss": 0.78153586, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.8029145, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9951, + "time_per_iteration": 2.4318583011627197 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.02885103, + "balance_loss_mlp": 1.03666139, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 6.7296631151691235, + "language_loss": 0.73816681, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.75967014, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9952, + "time_per_iteration": 2.4669008255004883 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01705003, + "balance_loss_mlp": 1.03699803, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.1100044837404264, + "language_loss": 0.78595901, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8073597, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9953, + "time_per_iteration": 2.432607650756836 + }, + { + "auxiliary_loss_clip": 0.01106295, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.01883411, + "balance_loss_mlp": 1.03698087, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.6558066102502929, + "language_loss": 0.69747621, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71884394, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 9954, + "time_per_iteration": 2.5316383838653564 + }, + { + "auxiliary_loss_clip": 0.01106341, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.01923835, + "balance_loss_mlp": 1.03664923, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.7741106098423227, + "language_loss": 0.73212743, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75350201, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9955, + "time_per_iteration": 2.457697629928589 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.01816237, + "balance_loss_mlp": 1.03694773, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.8276717412391432, + "language_loss": 0.68681955, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70819867, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9956, + "time_per_iteration": 2.5265135765075684 + }, + { + "auxiliary_loss_clip": 0.01109542, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0191431, + "balance_loss_mlp": 1.03873038, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.224432093074028, + "language_loss": 0.73662853, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75803757, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9957, + "time_per_iteration": 2.4384164810180664 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.01433289, + "balance_loss_mlp": 1.03838789, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.875022862600817, + "language_loss": 0.84732842, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86864293, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9958, + "time_per_iteration": 2.501417636871338 + }, + { + "auxiliary_loss_clip": 0.01109112, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02396715, + "balance_loss_mlp": 1.03740525, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 2.024494152709453, + "language_loss": 0.66685295, + "learning_rate": 1.463921122471864e-06, + "loss": 0.6883148, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9959, + "time_per_iteration": 2.471848726272583 + }, + { + "auxiliary_loss_clip": 0.01108718, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.0389334, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6260957561310903, + "language_loss": 0.83360457, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85498953, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9960, + "time_per_iteration": 2.4651761054992676 + }, + { + "auxiliary_loss_clip": 0.01106018, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01716197, + "balance_loss_mlp": 1.03686321, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.466008615140069, + "language_loss": 0.79505813, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81641018, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9961, + "time_per_iteration": 2.475454568862915 + }, + { + "auxiliary_loss_clip": 0.01106184, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.01849759, + "balance_loss_mlp": 1.03730237, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.756927001005791, + "language_loss": 0.67329001, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69465899, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9962, + "time_per_iteration": 2.489084005355835 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02303684, + "balance_loss_mlp": 1.03722596, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.365980621399165, + "language_loss": 0.74311382, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76453781, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9963, + "time_per_iteration": 2.4947874546051025 + }, + { + "auxiliary_loss_clip": 0.01105091, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.02042198, + "balance_loss_mlp": 1.03652799, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 2.111691032145124, + "language_loss": 0.68214118, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70352018, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 9964, + "time_per_iteration": 2.595745086669922 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01635242, + "balance_loss_mlp": 1.03745115, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.9069133835925212, + "language_loss": 0.77044344, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79177749, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9965, + "time_per_iteration": 2.447580337524414 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.01833928, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.8284726106569544, + "language_loss": 0.77189291, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79326117, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9966, + "time_per_iteration": 2.450202226638794 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.01775706, + "balance_loss_mlp": 1.03827262, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.4816211966309663, + "language_loss": 0.73338163, + "learning_rate": 1.460920090376422e-06, + "loss": 0.7547425, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9967, + "time_per_iteration": 2.5361080169677734 + }, + { + "auxiliary_loss_clip": 0.01113043, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.03907526, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 1.98552880835617, + "language_loss": 0.68667233, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70816314, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9968, + "time_per_iteration": 2.4201669692993164 + }, + { + "auxiliary_loss_clip": 0.01107383, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03702521, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.5069000727815525, + "language_loss": 0.79169899, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.8131187, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9969, + "time_per_iteration": 3.9278953075408936 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.01999974, + "balance_loss_mlp": 1.03598189, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 2.0663897132059588, + "language_loss": 0.81023246, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83162344, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9970, + "time_per_iteration": 2.4416465759277344 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.02511787, + "balance_loss_mlp": 1.0377593, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.8664927797599988, + "language_loss": 0.62176776, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64326209, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.71875, + "step": 9971, + "time_per_iteration": 3.8846518993377686 + }, + { + "auxiliary_loss_clip": 0.01102408, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01776624, + "balance_loss_mlp": 1.03571367, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.8563043542024344, + "language_loss": 0.79314888, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81446773, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9972, + "time_per_iteration": 3.901256561279297 + }, + { + "auxiliary_loss_clip": 0.01112588, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.03817391, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.1896098539024176, + "language_loss": 0.76205128, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78352362, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9973, + "time_per_iteration": 3.9424259662628174 + }, + { + "auxiliary_loss_clip": 0.01106987, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.02021337, + "balance_loss_mlp": 1.0362227, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.3034108647788933, + "language_loss": 0.64969486, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67109704, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9974, + "time_per_iteration": 2.4875805377960205 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.02270579, + "balance_loss_mlp": 1.03728855, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4500461001521425, + "language_loss": 0.74434048, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76576418, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9975, + "time_per_iteration": 2.4895670413970947 + }, + { + "auxiliary_loss_clip": 0.01107892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01572204, + "balance_loss_mlp": 1.03760493, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.7933529759094704, + "language_loss": 0.76735765, + "learning_rate": 1.457545493441611e-06, + "loss": 0.78872299, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9976, + "time_per_iteration": 2.5056304931640625 + }, + { + "auxiliary_loss_clip": 0.01107614, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02620029, + "balance_loss_mlp": 1.03780508, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.4460752586196857, + "language_loss": 0.74817264, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76964188, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9977, + "time_per_iteration": 2.496149778366089 + }, + { + "auxiliary_loss_clip": 0.01107436, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01962721, + "balance_loss_mlp": 1.03684258, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.6882301956293941, + "language_loss": 0.68553925, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70693398, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9978, + "time_per_iteration": 2.483567714691162 + }, + { + "auxiliary_loss_clip": 0.01113427, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.02087975, + "balance_loss_mlp": 1.04072738, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.78777966355448, + "language_loss": 0.81153774, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83300972, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9979, + "time_per_iteration": 2.413935899734497 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.02026439, + "balance_loss_mlp": 1.03630424, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.7401896529481804, + "language_loss": 0.6957618, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71711338, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 9980, + "time_per_iteration": 2.4312682151794434 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.01947856, + "balance_loss_mlp": 1.03764093, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 3.8237519537086238, + "language_loss": 0.68642873, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70786041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9981, + "time_per_iteration": 2.4180452823638916 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.02439916, + "balance_loss_mlp": 1.03752363, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 3.017374403618408, + "language_loss": 0.78579712, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80722106, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 9982, + "time_per_iteration": 2.5378241539001465 + }, + { + "auxiliary_loss_clip": 0.01107415, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02182055, + "balance_loss_mlp": 1.03862381, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.4959053225865697, + "language_loss": 0.72973263, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7511524, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9983, + "time_per_iteration": 2.4516336917877197 + }, + { + "auxiliary_loss_clip": 0.01105736, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01893497, + "balance_loss_mlp": 1.03546536, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.0437339372279775, + "language_loss": 0.77803969, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79941273, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9984, + "time_per_iteration": 2.4639358520507812 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02200651, + "balance_loss_mlp": 1.03833842, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.564540000062254, + "language_loss": 0.83254963, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85397083, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9985, + "time_per_iteration": 2.584782361984253 + }, + { + "auxiliary_loss_clip": 0.01107675, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.02535367, + "balance_loss_mlp": 1.03886271, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.8232812965365295, + "language_loss": 0.71257466, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73402393, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9986, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.01824546, + "balance_loss_mlp": 1.04022861, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 2.2582190453585653, + "language_loss": 0.71791571, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73933005, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9987, + "time_per_iteration": 2.4961001873016357 + }, + { + "auxiliary_loss_clip": 0.01106291, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02245712, + "balance_loss_mlp": 1.03697586, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.6101111043143586, + "language_loss": 0.84407473, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86548263, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9988, + "time_per_iteration": 2.435049533843994 + }, + { + "auxiliary_loss_clip": 0.01105215, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.02113748, + "balance_loss_mlp": 1.03617096, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6559701651537184, + "language_loss": 0.65416402, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67555285, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9989, + "time_per_iteration": 2.4359869956970215 + }, + { + "auxiliary_loss_clip": 0.01105185, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01960373, + "balance_loss_mlp": 1.03680921, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.811706113820645, + "language_loss": 0.80521321, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82657802, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9990, + "time_per_iteration": 2.44775128364563 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02215016, + "balance_loss_mlp": 1.03804195, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.6786296180827829, + "language_loss": 0.82789129, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84932715, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9991, + "time_per_iteration": 2.464409112930298 + }, + { + "auxiliary_loss_clip": 0.01103829, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02240098, + "balance_loss_mlp": 1.03611255, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 2.5638990933503587, + "language_loss": 0.82719564, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84857893, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9992, + "time_per_iteration": 2.4012389183044434 + }, + { + "auxiliary_loss_clip": 0.01105302, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02181458, + "balance_loss_mlp": 1.03721142, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 2.724325433103902, + "language_loss": 0.6668725, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.6882664, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9993, + "time_per_iteration": 2.431534767150879 + }, + { + "auxiliary_loss_clip": 0.01105757, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01941192, + "balance_loss_mlp": 1.03631759, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.313639381360734, + "language_loss": 0.81478924, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83616555, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9994, + "time_per_iteration": 2.410637140274048 + }, + { + "auxiliary_loss_clip": 0.01102128, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01506472, + "balance_loss_mlp": 1.03755724, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.8133737963871297, + "language_loss": 0.72619045, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74747109, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 9995, + "time_per_iteration": 2.462024450302124 + }, + { + "auxiliary_loss_clip": 0.01106573, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02083576, + "balance_loss_mlp": 1.03584194, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 2.19390066880666, + "language_loss": 0.80974549, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83114165, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9996, + "time_per_iteration": 2.4826295375823975 + }, + { + "auxiliary_loss_clip": 0.01106517, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.02458513, + "balance_loss_mlp": 1.03807008, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 3.1537087962017814, + "language_loss": 0.78669906, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80813521, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9997, + "time_per_iteration": 2.4731595516204834 + }, + { + "auxiliary_loss_clip": 0.01107621, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.01708388, + "balance_loss_mlp": 1.03646445, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 3.7238695953955263, + "language_loss": 0.73005414, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75143456, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 9998, + "time_per_iteration": 2.4839541912078857 + }, + { + "auxiliary_loss_clip": 0.01103199, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.0148679, + "balance_loss_mlp": 1.03565955, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.5405076955909784, + "language_loss": 0.721259, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74255872, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9999, + "time_per_iteration": 2.5177793502807617 + }, + { + "auxiliary_loss_clip": 0.01108153, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.02320766, + "balance_loss_mlp": 1.03617668, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 14.582740501304201, + "language_loss": 0.78332782, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80476433, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 10000, + "time_per_iteration": 2.5176899433135986 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.02002978, + "balance_loss_mlp": 1.03898025, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.9333747533908545, + "language_loss": 0.77681154, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79825819, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10001, + "time_per_iteration": 2.4608781337738037 + }, + { + "auxiliary_loss_clip": 0.01109986, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.01821423, + "balance_loss_mlp": 1.03775978, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 2.0376201380828642, + "language_loss": 0.58534205, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60675359, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 10002, + "time_per_iteration": 2.573974847793579 + }, + { + "auxiliary_loss_clip": 0.01110624, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02163339, + "balance_loss_mlp": 1.0399766, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.4763500532767482, + "language_loss": 0.77651924, + "learning_rate": 1.447431741055314e-06, + "loss": 0.7979821, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.70703125, + "step": 10003, + "time_per_iteration": 2.507904291152954 + }, + { + "auxiliary_loss_clip": 0.01109401, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.01595616, + "balance_loss_mlp": 1.03869998, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.341725474955548, + "language_loss": 0.77185351, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79323137, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 10004, + "time_per_iteration": 2.4672906398773193 + }, + { + "auxiliary_loss_clip": 0.01106632, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.01697397, + "balance_loss_mlp": 1.03765237, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.6533707293679005, + "language_loss": 0.72357887, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74493855, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10005, + "time_per_iteration": 2.481327533721924 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01724029, + "balance_loss_mlp": 1.03719342, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 1.9903847661444378, + "language_loss": 0.74641156, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76774085, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 10006, + "time_per_iteration": 2.4176204204559326 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.02387798, + "balance_loss_mlp": 1.03541553, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.3154709076008726, + "language_loss": 0.73940712, + "learning_rate": 1.445934699732685e-06, + "loss": 0.76082402, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 10007, + "time_per_iteration": 2.4568898677825928 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01026535, + "balance_loss_clip": 1.01488209, + "balance_loss_mlp": 1.03767657, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 2.0163179080147065, + "language_loss": 0.70129442, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72261548, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10008, + "time_per_iteration": 2.4591152667999268 + }, + { + "auxiliary_loss_clip": 0.011063, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.01420045, + "balance_loss_mlp": 1.0375886, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.5735106118568272, + "language_loss": 0.76055562, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78187764, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 10009, + "time_per_iteration": 2.5413200855255127 + }, + { + "auxiliary_loss_clip": 0.01106971, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.02157235, + "balance_loss_mlp": 1.03784704, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.2862690220983257, + "language_loss": 0.74194181, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76334661, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10010, + "time_per_iteration": 3.888418436050415 + }, + { + "auxiliary_loss_clip": 0.01029006, + "auxiliary_loss_mlp": 0.00995965, + "balance_loss_clip": 0.99489832, + "balance_loss_mlp": 1.00768209, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.7964241921308365, + "language_loss": 0.55079472, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57104445, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.21289062, + "step": 10011, + "time_per_iteration": 3.125993251800537 + }, + { + "auxiliary_loss_clip": 0.01106744, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.02382302, + "balance_loss_mlp": 1.03751755, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.3952150015846279, + "language_loss": 0.62033314, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64175516, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10012, + "time_per_iteration": 3.9947257041931152 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.01210856, + "balance_loss_mlp": 1.03870738, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.6026031648611754, + "language_loss": 0.74765098, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.76895565, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 10013, + "time_per_iteration": 3.9350314140319824 + }, + { + "auxiliary_loss_clip": 0.01100697, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.01671076, + "balance_loss_mlp": 1.03607368, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.7871112945652055, + "language_loss": 0.81346315, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83474994, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10014, + "time_per_iteration": 3.929865837097168 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.01355481, + "balance_loss_mlp": 1.03631175, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.3916523900358202, + "language_loss": 0.72577333, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74703777, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10015, + "time_per_iteration": 2.487917184829712 + }, + { + "auxiliary_loss_clip": 0.0102817, + "auxiliary_loss_mlp": 0.0100004, + "balance_loss_clip": 0.99888366, + "balance_loss_mlp": 1.00701785, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8145782570930438, + "language_loss": 0.54800987, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.5682919, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.2109375, + "step": 10016, + "time_per_iteration": 2.952225923538208 + }, + { + "auxiliary_loss_clip": 0.01105304, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01721644, + "balance_loss_mlp": 1.03722167, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.4974922822650143, + "language_loss": 0.82952374, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85086936, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10017, + "time_per_iteration": 2.4482316970825195 + }, + { + "auxiliary_loss_clip": 0.01105754, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.01691723, + "balance_loss_mlp": 1.03885603, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7894712759587756, + "language_loss": 0.83787656, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85921562, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10018, + "time_per_iteration": 2.570969820022583 + }, + { + "auxiliary_loss_clip": 0.01109615, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.02348518, + "balance_loss_mlp": 1.03740263, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.651779624626633, + "language_loss": 0.78134441, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80280334, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 10019, + "time_per_iteration": 2.4765312671661377 + }, + { + "auxiliary_loss_clip": 0.01103799, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01586819, + "balance_loss_mlp": 1.03516555, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.523816764872001, + "language_loss": 0.73855495, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75986886, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 10020, + "time_per_iteration": 2.530351161956787 + }, + { + "auxiliary_loss_clip": 0.01106179, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.02080584, + "balance_loss_mlp": 1.03670871, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.5471183793037282, + "language_loss": 0.64036959, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66176355, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10021, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01103767, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.01892638, + "balance_loss_mlp": 1.03523266, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.4551090911481597, + "language_loss": 0.80527318, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.8266294, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 10022, + "time_per_iteration": 2.504343032836914 + }, + { + "auxiliary_loss_clip": 0.01110275, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.01720786, + "balance_loss_mlp": 1.03836441, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.6380547321516945, + "language_loss": 0.66718352, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68858099, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 10023, + "time_per_iteration": 2.550156593322754 + }, + { + "auxiliary_loss_clip": 0.01106872, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01878142, + "balance_loss_mlp": 1.03709006, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 3.9256623345472397, + "language_loss": 0.74829918, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76967561, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10024, + "time_per_iteration": 2.5556838512420654 + }, + { + "auxiliary_loss_clip": 0.01106267, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.02027631, + "balance_loss_mlp": 1.03824794, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.6728401649111677, + "language_loss": 0.7330395, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75442922, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 10025, + "time_per_iteration": 2.513984441757202 + }, + { + "auxiliary_loss_clip": 0.01110825, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02113652, + "balance_loss_mlp": 1.03738081, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.650368099581338, + "language_loss": 0.67278063, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69423193, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 10026, + "time_per_iteration": 2.542365550994873 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.0200423, + "balance_loss_mlp": 1.03320062, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 2.2752496975382908, + "language_loss": 0.80318093, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82447666, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10027, + "time_per_iteration": 2.444352626800537 + }, + { + "auxiliary_loss_clip": 0.01109574, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02199113, + "balance_loss_mlp": 1.03832877, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 2.211735765604233, + "language_loss": 0.71043503, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73187542, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 10028, + "time_per_iteration": 2.479518413543701 + }, + { + "auxiliary_loss_clip": 0.01108344, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02431321, + "balance_loss_mlp": 1.03661895, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.6910926571719251, + "language_loss": 0.8391934, + "learning_rate": 1.437707005721669e-06, + "loss": 0.8606472, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 10029, + "time_per_iteration": 2.4701409339904785 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.03613794, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.6909986386379736, + "language_loss": 0.7958231, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81720573, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10030, + "time_per_iteration": 2.4561784267425537 + }, + { + "auxiliary_loss_clip": 0.01105406, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.0201565, + "balance_loss_mlp": 1.03732789, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.5628951432606517, + "language_loss": 0.71363872, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73501384, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10031, + "time_per_iteration": 2.512300729751587 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.01967287, + "balance_loss_mlp": 1.03754997, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.6597240808951284, + "language_loss": 0.73467577, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75611174, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 10032, + "time_per_iteration": 2.566749334335327 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.02004528, + "balance_loss_mlp": 1.04087365, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.6790483076068066, + "language_loss": 0.68394065, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70537835, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10033, + "time_per_iteration": 2.4334018230438232 + }, + { + "auxiliary_loss_clip": 0.01108457, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.02034903, + "balance_loss_mlp": 1.03930712, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 1.9672909213981986, + "language_loss": 0.76032668, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78174067, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10034, + "time_per_iteration": 2.430638074874878 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03813863, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 2.463845452157716, + "language_loss": 0.74406719, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76545924, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10035, + "time_per_iteration": 2.4784040451049805 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.01751912, + "balance_loss_mlp": 1.03727365, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.5741870761115437, + "language_loss": 0.86713034, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88846624, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10036, + "time_per_iteration": 2.4385178089141846 + }, + { + "auxiliary_loss_clip": 0.01107298, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.0369339, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.0234995174732067, + "language_loss": 0.69894731, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72035396, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10037, + "time_per_iteration": 2.4603824615478516 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.01800978, + "balance_loss_mlp": 1.03922844, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.7516523293698103, + "language_loss": 0.85487103, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87623459, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 10038, + "time_per_iteration": 2.478269100189209 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02303505, + "balance_loss_mlp": 1.03736269, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 1.859562825285256, + "language_loss": 0.76468384, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78612161, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10039, + "time_per_iteration": 2.4567699432373047 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.02047873, + "balance_loss_mlp": 1.03606224, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.5744012931929299, + "language_loss": 0.70843172, + "learning_rate": 1.433597019260301e-06, + "loss": 0.72979593, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10040, + "time_per_iteration": 2.491757392883301 + }, + { + "auxiliary_loss_clip": 0.01112027, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.01729393, + "balance_loss_mlp": 1.03952897, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 2.4316928211832045, + "language_loss": 0.78400159, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80543524, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7265625, + "step": 10041, + "time_per_iteration": 2.452766180038452 + }, + { + "auxiliary_loss_clip": 0.0110643, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01892138, + "balance_loss_mlp": 1.03821898, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.6317318935059701, + "language_loss": 0.75574881, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77711999, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 10042, + "time_per_iteration": 2.421757459640503 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.01626205, + "balance_loss_mlp": 1.03584445, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 2.3271703550981138, + "language_loss": 0.84446549, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86578321, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10043, + "time_per_iteration": 2.5310654640197754 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.02648067, + "balance_loss_mlp": 1.03705609, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.9621351051557316, + "language_loss": 0.69924289, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72073138, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 10044, + "time_per_iteration": 2.4903266429901123 + }, + { + "auxiliary_loss_clip": 0.01110997, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0184021, + "balance_loss_mlp": 1.03954315, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 2.0335535035690557, + "language_loss": 0.77986026, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80128312, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 10045, + "time_per_iteration": 2.4881081581115723 + }, + { + "auxiliary_loss_clip": 0.0110549, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.01878381, + "balance_loss_mlp": 1.03781128, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.5706793221026767, + "language_loss": 0.76730686, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.7886765, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 10046, + "time_per_iteration": 2.4508702754974365 + }, + { + "auxiliary_loss_clip": 0.01104935, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.01866424, + "balance_loss_mlp": 1.03633487, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.5559732700373865, + "language_loss": 0.86937988, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89072925, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10047, + "time_per_iteration": 2.465775489807129 + }, + { + "auxiliary_loss_clip": 0.01105881, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.01808965, + "balance_loss_mlp": 1.03915882, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.5346026168560238, + "language_loss": 0.75463951, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77599597, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 10048, + "time_per_iteration": 2.5098941326141357 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.01039349, + "balance_loss_clip": 1.0247221, + "balance_loss_mlp": 1.03979802, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.285441895193273, + "language_loss": 0.66271615, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68426633, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 10049, + "time_per_iteration": 2.537810802459717 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.02244711, + "balance_loss_mlp": 1.03769147, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.5404607265151984, + "language_loss": 0.66999722, + "learning_rate": 1.429862922631336e-06, + "loss": 0.69139338, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10050, + "time_per_iteration": 2.5025947093963623 + }, + { + "auxiliary_loss_clip": 0.01108275, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01961958, + "balance_loss_mlp": 1.03837466, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.5982455651349325, + "language_loss": 0.69730866, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.718714, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10051, + "time_per_iteration": 2.5584428310394287 + }, + { + "auxiliary_loss_clip": 0.01103115, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.02025664, + "balance_loss_mlp": 1.03470123, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.883115508781388, + "language_loss": 0.64664817, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66800326, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10052, + "time_per_iteration": 3.8776209354400635 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.02243757, + "balance_loss_mlp": 1.03725076, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.6187947947661157, + "language_loss": 0.68885666, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71027684, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 10053, + "time_per_iteration": 3.8864493370056152 + }, + { + "auxiliary_loss_clip": 0.0102793, + "auxiliary_loss_mlp": 0.01006986, + "balance_loss_clip": 1.00581133, + "balance_loss_mlp": 1.00684035, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7454166517190239, + "language_loss": 0.6043961, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62474525, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.2109375, + "step": 10054, + "time_per_iteration": 4.507344961166382 + }, + { + "auxiliary_loss_clip": 0.01104586, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01997089, + "balance_loss_mlp": 1.03684747, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.6844086395494355, + "language_loss": 0.85636723, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87774247, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 10055, + "time_per_iteration": 3.930811643600464 + }, + { + "auxiliary_loss_clip": 0.01110141, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.02585101, + "balance_loss_mlp": 1.04008687, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.2914523857580353, + "language_loss": 0.73531651, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75681424, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.69921875, + "step": 10056, + "time_per_iteration": 2.424492835998535 + }, + { + "auxiliary_loss_clip": 0.01105735, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.02206218, + "balance_loss_mlp": 1.03815937, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.6647683047258863, + "language_loss": 0.80205089, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82344675, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 10057, + "time_per_iteration": 2.4988396167755127 + }, + { + "auxiliary_loss_clip": 0.01104511, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01725817, + "balance_loss_mlp": 1.0369792, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.656202002056598, + "language_loss": 0.75172931, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.7730782, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 10058, + "time_per_iteration": 2.4108166694641113 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.01627767, + "balance_loss_mlp": 1.03582406, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 1.75224691919055, + "language_loss": 0.71103948, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73235136, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10059, + "time_per_iteration": 2.4859349727630615 + }, + { + "auxiliary_loss_clip": 0.01105606, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01686525, + "balance_loss_mlp": 1.03741932, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.587856969701262, + "language_loss": 0.76134253, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78269112, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10060, + "time_per_iteration": 2.473043918609619 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.03757381, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.1588277388437276, + "language_loss": 0.73414183, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75552368, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10061, + "time_per_iteration": 2.440943956375122 + }, + { + "auxiliary_loss_clip": 0.01105712, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.01750207, + "balance_loss_mlp": 1.03634655, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 2.0041380833930145, + "language_loss": 0.67225152, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69359946, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 10062, + "time_per_iteration": 2.4789950847625732 + }, + { + "auxiliary_loss_clip": 0.01105607, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.02592254, + "balance_loss_mlp": 1.03717685, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.163401547344872, + "language_loss": 0.71361917, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73505676, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10063, + "time_per_iteration": 2.43302321434021 + }, + { + "auxiliary_loss_clip": 0.01102028, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.01968336, + "balance_loss_mlp": 1.03561401, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.6164006934985269, + "language_loss": 0.84802878, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86936641, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10064, + "time_per_iteration": 2.447003126144409 + }, + { + "auxiliary_loss_clip": 0.0110348, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02206945, + "balance_loss_mlp": 1.0354557, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.2435880628396587, + "language_loss": 0.79335666, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81472868, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10065, + "time_per_iteration": 2.437286376953125 + }, + { + "auxiliary_loss_clip": 0.01111102, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.01874626, + "balance_loss_mlp": 1.03979814, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 1.9931239622384858, + "language_loss": 0.78788042, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80931091, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 10066, + "time_per_iteration": 2.4346959590911865 + }, + { + "auxiliary_loss_clip": 0.01104198, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01523578, + "balance_loss_mlp": 1.03645194, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.5823653049215993, + "language_loss": 0.73320723, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75452518, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 10067, + "time_per_iteration": 2.5625689029693604 + }, + { + "auxiliary_loss_clip": 0.01107587, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.01783991, + "balance_loss_mlp": 1.03971481, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.6116431503881068, + "language_loss": 0.68952775, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.7109015, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10068, + "time_per_iteration": 2.5137228965759277 + }, + { + "auxiliary_loss_clip": 0.01103779, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.01877117, + "balance_loss_mlp": 1.03444147, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 2.4473752710004586, + "language_loss": 0.86667287, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.8880173, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10069, + "time_per_iteration": 2.4172072410583496 + }, + { + "auxiliary_loss_clip": 0.01103834, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01405478, + "balance_loss_mlp": 1.03583956, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.4672457121748899, + "language_loss": 0.83270586, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85400122, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10070, + "time_per_iteration": 2.464062452316284 + }, + { + "auxiliary_loss_clip": 0.01106279, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.02120876, + "balance_loss_mlp": 1.03630137, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.5142081514734282, + "language_loss": 0.86056209, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88196522, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10071, + "time_per_iteration": 2.435492515563965 + }, + { + "auxiliary_loss_clip": 0.01108912, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.02138042, + "balance_loss_mlp": 1.03817403, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.7615317101181058, + "language_loss": 0.7703979, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79183424, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 10072, + "time_per_iteration": 2.5326199531555176 + }, + { + "auxiliary_loss_clip": 0.01104713, + "auxiliary_loss_mlp": 0.01026829, + "balance_loss_clip": 1.01362085, + "balance_loss_mlp": 1.03505397, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.81020475903248, + "language_loss": 0.74383593, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76515132, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 10073, + "time_per_iteration": 2.4809958934783936 + }, + { + "auxiliary_loss_clip": 0.01026997, + "auxiliary_loss_mlp": 0.01002422, + "balance_loss_clip": 1.00124216, + "balance_loss_mlp": 1.005988, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7588463064410728, + "language_loss": 0.55220222, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57249641, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2109375, + "step": 10074, + "time_per_iteration": 3.101125717163086 + }, + { + "auxiliary_loss_clip": 0.01105722, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01752663, + "balance_loss_mlp": 1.03776407, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.8033827655021575, + "language_loss": 0.81893396, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.84028631, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10075, + "time_per_iteration": 2.468269109725952 + }, + { + "auxiliary_loss_clip": 0.01105409, + "auxiliary_loss_mlp": 0.01024158, + "balance_loss_clip": 1.01174855, + "balance_loss_mlp": 1.03608966, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 2.0602815760014392, + "language_loss": 0.78272569, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80402136, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10076, + "time_per_iteration": 2.4932310581207275 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02113056, + "balance_loss_mlp": 1.0365119, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.7482408044671829, + "language_loss": 0.72032154, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74172914, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10077, + "time_per_iteration": 2.4521970748901367 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01777112, + "balance_loss_mlp": 1.03759694, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 2.2939968580618215, + "language_loss": 0.55467492, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57604587, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10078, + "time_per_iteration": 2.4789669513702393 + }, + { + "auxiliary_loss_clip": 0.01106991, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.02026904, + "balance_loss_mlp": 1.0364964, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 2.206511673730914, + "language_loss": 0.70283198, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72423112, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10079, + "time_per_iteration": 2.494340181350708 + }, + { + "auxiliary_loss_clip": 0.01105474, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.02605653, + "balance_loss_mlp": 1.03662014, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.7147155998392456, + "language_loss": 0.62479711, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64623356, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10080, + "time_per_iteration": 2.4511730670928955 + }, + { + "auxiliary_loss_clip": 0.01107796, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.01679373, + "balance_loss_mlp": 1.03799117, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.8405271272242842, + "language_loss": 0.71136117, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73273432, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10081, + "time_per_iteration": 2.455698251724243 + }, + { + "auxiliary_loss_clip": 0.01105313, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01634336, + "balance_loss_mlp": 1.03703296, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.6262436392400634, + "language_loss": 0.69449544, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71583533, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10082, + "time_per_iteration": 2.52297043800354 + }, + { + "auxiliary_loss_clip": 0.01106177, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01712012, + "balance_loss_mlp": 1.03799009, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.4063428250351147, + "language_loss": 0.65709507, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67844832, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10083, + "time_per_iteration": 2.464259147644043 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.01852512, + "balance_loss_mlp": 1.03483891, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 2.2500443264419423, + "language_loss": 0.74058753, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76193094, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 10084, + "time_per_iteration": 2.4634742736816406 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.02070093, + "balance_loss_mlp": 1.03634107, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.43129197416672, + "language_loss": 0.72011673, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74150407, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 10085, + "time_per_iteration": 2.4218525886535645 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.02473903, + "balance_loss_mlp": 1.03681493, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.1595465216971834, + "language_loss": 0.76514173, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78655005, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10086, + "time_per_iteration": 2.466749429702759 + }, + { + "auxiliary_loss_clip": 0.01103719, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01887345, + "balance_loss_mlp": 1.03720832, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.355452455492161, + "language_loss": 0.72577417, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74712074, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10087, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01101232, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.01915646, + "balance_loss_mlp": 1.03517973, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.707331111485516, + "language_loss": 0.83679116, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.85810244, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 10088, + "time_per_iteration": 2.490476369857788 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.01454818, + "balance_loss_mlp": 1.03563654, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.1730607876548924, + "language_loss": 0.7139647, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73526812, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 10089, + "time_per_iteration": 2.4656596183776855 + }, + { + "auxiliary_loss_clip": 0.0110663, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.0252049, + "balance_loss_mlp": 1.0383575, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 1.8527545733498374, + "language_loss": 0.82743609, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84886503, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 10090, + "time_per_iteration": 2.4523448944091797 + }, + { + "auxiliary_loss_clip": 0.01112391, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.0230329, + "balance_loss_mlp": 1.03768897, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.0611786286574514, + "language_loss": 0.75486428, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77635133, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 10091, + "time_per_iteration": 2.412745475769043 + }, + { + "auxiliary_loss_clip": 0.01103456, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02224016, + "balance_loss_mlp": 1.03655899, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 2.008159335053083, + "language_loss": 0.79580414, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81718373, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 10092, + "time_per_iteration": 2.4787280559539795 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.02313423, + "balance_loss_mlp": 1.03692424, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.8882550633479742, + "language_loss": 0.76085305, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78229213, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10093, + "time_per_iteration": 3.8885409832000732 + }, + { + "auxiliary_loss_clip": 0.01103337, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.02215874, + "balance_loss_mlp": 1.03691947, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 2.3186576779301387, + "language_loss": 0.87448221, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89585286, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 10094, + "time_per_iteration": 2.4714174270629883 + }, + { + "auxiliary_loss_clip": 0.01105151, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01992464, + "balance_loss_mlp": 1.03669322, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.5923423583427312, + "language_loss": 0.71694756, + "learning_rate": 1.413086446353919e-06, + "loss": 0.73832405, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10095, + "time_per_iteration": 3.852285861968994 + }, + { + "auxiliary_loss_clip": 0.01105359, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01727819, + "balance_loss_mlp": 1.036134, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.6817389817846544, + "language_loss": 0.76919025, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.7905336, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 10096, + "time_per_iteration": 3.9708244800567627 + }, + { + "auxiliary_loss_clip": 0.01107233, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.0291853, + "balance_loss_mlp": 1.03734136, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 1.7249712415107992, + "language_loss": 0.79864824, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82012963, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69921875, + "step": 10097, + "time_per_iteration": 2.4229838848114014 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01920676, + "balance_loss_mlp": 1.03555632, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.4260099040951442, + "language_loss": 0.67338455, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69472301, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10098, + "time_per_iteration": 3.9603915214538574 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.03617251, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.8973033677095168, + "language_loss": 0.80694121, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82829416, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 10099, + "time_per_iteration": 2.409189462661743 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.02195358, + "balance_loss_mlp": 1.03531229, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 2.230451803545553, + "language_loss": 0.70439708, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72582722, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10100, + "time_per_iteration": 2.484339952468872 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02344251, + "balance_loss_mlp": 1.03890038, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.5791187964785582, + "language_loss": 0.70447475, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72591841, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10101, + "time_per_iteration": 2.4309775829315186 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.02154672, + "balance_loss_mlp": 1.03490043, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.6995748618566444, + "language_loss": 0.69606161, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71741861, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10102, + "time_per_iteration": 2.524376630783081 + }, + { + "auxiliary_loss_clip": 0.01107251, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.02220285, + "balance_loss_mlp": 1.0382359, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.7952265928760782, + "language_loss": 0.73694891, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75835967, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10103, + "time_per_iteration": 2.4625236988067627 + }, + { + "auxiliary_loss_clip": 0.01110432, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.02295542, + "balance_loss_mlp": 1.03862011, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.6961753672547197, + "language_loss": 0.76819229, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.7896592, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 10104, + "time_per_iteration": 2.467879056930542 + }, + { + "auxiliary_loss_clip": 0.01028848, + "auxiliary_loss_mlp": 0.01014471, + "balance_loss_clip": 1.01331425, + "balance_loss_mlp": 1.00746071, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7111703190831327, + "language_loss": 0.56059039, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58102357, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21484375, + "step": 10105, + "time_per_iteration": 3.066772222518921 + }, + { + "auxiliary_loss_clip": 0.01028964, + "auxiliary_loss_mlp": 0.01012366, + "balance_loss_clip": 1.01119196, + "balance_loss_mlp": 1.00750494, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7555703523663572, + "language_loss": 0.56791615, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58832943, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21484375, + "step": 10106, + "time_per_iteration": 3.0346710681915283 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.02136517, + "balance_loss_mlp": 1.03558111, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.556971911912289, + "language_loss": 0.68647003, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70783293, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10107, + "time_per_iteration": 2.5070221424102783 + }, + { + "auxiliary_loss_clip": 0.0110868, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.01801395, + "balance_loss_mlp": 1.03806663, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 2.0591355858624594, + "language_loss": 0.81006205, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83145273, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 10108, + "time_per_iteration": 2.449876070022583 + }, + { + "auxiliary_loss_clip": 0.01109814, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.01798475, + "balance_loss_mlp": 1.03772831, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.6885620074685026, + "language_loss": 0.70979893, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10109, + "time_per_iteration": 2.569441318511963 + }, + { + "auxiliary_loss_clip": 0.01101619, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01598346, + "balance_loss_mlp": 1.0354414, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.5138210455097567, + "language_loss": 0.80043399, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82172269, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10110, + "time_per_iteration": 2.5667614936828613 + }, + { + "auxiliary_loss_clip": 0.01107667, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01691723, + "balance_loss_mlp": 1.03725386, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.0834448443085463, + "language_loss": 0.7047748, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72614574, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 10111, + "time_per_iteration": 2.449047565460205 + }, + { + "auxiliary_loss_clip": 0.01107266, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.01765776, + "balance_loss_mlp": 1.03687668, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.8731958384235612, + "language_loss": 0.65437806, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67575473, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10112, + "time_per_iteration": 2.5956103801727295 + }, + { + "auxiliary_loss_clip": 0.01028267, + "auxiliary_loss_mlp": 0.00997544, + "balance_loss_clip": 0.99637622, + "balance_loss_mlp": 1.00686228, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6257418493150695, + "language_loss": 0.49600247, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51626056, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 10113, + "time_per_iteration": 3.0929043292999268 + }, + { + "auxiliary_loss_clip": 0.01027496, + "auxiliary_loss_mlp": 0.01000577, + "balance_loss_clip": 0.99939102, + "balance_loss_mlp": 1.0062747, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8371205862323671, + "language_loss": 0.56964719, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58992791, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21289062, + "step": 10114, + "time_per_iteration": 2.9712812900543213 + }, + { + "auxiliary_loss_clip": 0.01109587, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01581991, + "balance_loss_mlp": 1.03810143, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.7367632173905274, + "language_loss": 0.69756359, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.71895409, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 10115, + "time_per_iteration": 2.4941470623016357 + }, + { + "auxiliary_loss_clip": 0.01106631, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.01550055, + "balance_loss_mlp": 1.03715134, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 18.577098805589706, + "language_loss": 0.72356099, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74490488, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10116, + "time_per_iteration": 2.448673725128174 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.02089953, + "balance_loss_mlp": 1.03622699, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.8751462040451332, + "language_loss": 0.53553987, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55695611, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10117, + "time_per_iteration": 2.592958927154541 + }, + { + "auxiliary_loss_clip": 0.01106561, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.01340544, + "balance_loss_mlp": 1.03709269, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.965088318697828, + "language_loss": 0.69835466, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.71967459, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10118, + "time_per_iteration": 2.4184305667877197 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.01545143, + "balance_loss_mlp": 1.03855991, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.4929706938116498, + "language_loss": 0.74641609, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.7677654, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10119, + "time_per_iteration": 2.4534857273101807 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.02189887, + "balance_loss_mlp": 1.03641152, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.5799518634527623, + "language_loss": 0.67427665, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69565779, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10120, + "time_per_iteration": 2.439384937286377 + }, + { + "auxiliary_loss_clip": 0.01109214, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.01977515, + "balance_loss_mlp": 1.03793478, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.74219428879995, + "language_loss": 0.74141055, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76282924, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10121, + "time_per_iteration": 2.506490707397461 + }, + { + "auxiliary_loss_clip": 0.01103145, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.01530576, + "balance_loss_mlp": 1.03512359, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.7909457152882267, + "language_loss": 0.80599827, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82729572, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10122, + "time_per_iteration": 2.422988176345825 + }, + { + "auxiliary_loss_clip": 0.01107244, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01846766, + "balance_loss_mlp": 1.03843355, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.4671658127927028, + "language_loss": 0.55411458, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57549489, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10123, + "time_per_iteration": 2.6203012466430664 + }, + { + "auxiliary_loss_clip": 0.01104564, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.01975298, + "balance_loss_mlp": 1.03711987, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.7982570079112092, + "language_loss": 0.73612612, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.75749022, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10124, + "time_per_iteration": 2.465306282043457 + }, + { + "auxiliary_loss_clip": 0.01105892, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01791847, + "balance_loss_mlp": 1.03691709, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 3.6424543705255648, + "language_loss": 0.66014802, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.681508, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10125, + "time_per_iteration": 2.4767675399780273 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.01943719, + "balance_loss_mlp": 1.03736734, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 2.3623427434848066, + "language_loss": 0.76202977, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78339827, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10126, + "time_per_iteration": 2.469557046890259 + }, + { + "auxiliary_loss_clip": 0.01109286, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01583672, + "balance_loss_mlp": 1.03710127, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.6026801218546036, + "language_loss": 0.71315622, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73453724, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 10127, + "time_per_iteration": 2.463219404220581 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02028704, + "balance_loss_mlp": 1.03909373, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.879650268865683, + "language_loss": 0.72776711, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74921077, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 10128, + "time_per_iteration": 2.4591028690338135 + }, + { + "auxiliary_loss_clip": 0.01105059, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.03707957, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 4.4407298106903585, + "language_loss": 0.73322678, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75457883, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10129, + "time_per_iteration": 2.463595151901245 + }, + { + "auxiliary_loss_clip": 0.01104701, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.02273047, + "balance_loss_mlp": 1.03612173, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.3648179669909797, + "language_loss": 0.65579844, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67719507, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10130, + "time_per_iteration": 2.638197183609009 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.01360381, + "balance_loss_mlp": 1.03529549, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.5719208851669182, + "language_loss": 0.77160382, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79286647, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10131, + "time_per_iteration": 2.4989805221557617 + }, + { + "auxiliary_loss_clip": 0.01104899, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01976359, + "balance_loss_mlp": 1.03746176, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.8809693968510182, + "language_loss": 0.76772207, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.78907526, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.67578125, + "step": 10132, + "time_per_iteration": 2.4471144676208496 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.02116609, + "balance_loss_mlp": 1.035465, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.8031513435586903, + "language_loss": 0.75461888, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77594435, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 10133, + "time_per_iteration": 2.4543044567108154 + }, + { + "auxiliary_loss_clip": 0.01104667, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01921129, + "balance_loss_mlp": 1.03661132, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.617219095446177, + "language_loss": 0.63404942, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.65541649, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10134, + "time_per_iteration": 2.4968786239624023 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01722288, + "balance_loss_mlp": 1.03555775, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7042888689612277, + "language_loss": 0.78689611, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80820185, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10135, + "time_per_iteration": 3.8730435371398926 + }, + { + "auxiliary_loss_clip": 0.01105216, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01840401, + "balance_loss_mlp": 1.03660417, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 2.4676554523748115, + "language_loss": 0.72265971, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.7440083, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10136, + "time_per_iteration": 2.721339464187622 + }, + { + "auxiliary_loss_clip": 0.01108039, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.01704586, + "balance_loss_mlp": 1.0386939, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 2.10435735907629, + "language_loss": 0.74540055, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76677233, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10137, + "time_per_iteration": 3.969383716583252 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.027542, + "balance_loss_mlp": 1.03668833, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.7200645743924223, + "language_loss": 0.80628771, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82775462, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10138, + "time_per_iteration": 3.9027063846588135 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01837766, + "balance_loss_mlp": 1.03564954, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5498019522052684, + "language_loss": 0.80843186, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.82972997, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 10139, + "time_per_iteration": 3.9400634765625 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.02206182, + "balance_loss_mlp": 1.03840351, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.0925165633907254, + "language_loss": 0.8375181, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85895312, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10140, + "time_per_iteration": 2.4656758308410645 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.02315855, + "balance_loss_mlp": 1.03600419, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 2.1792852747623557, + "language_loss": 0.75585604, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77726358, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10141, + "time_per_iteration": 2.574366331100464 + }, + { + "auxiliary_loss_clip": 0.01104603, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.01502669, + "balance_loss_mlp": 1.03598619, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.9409433083757806, + "language_loss": 0.76637286, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78768879, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10142, + "time_per_iteration": 2.4868385791778564 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.02058291, + "balance_loss_mlp": 1.03676569, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.745652179186059, + "language_loss": 0.76381373, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.7851907, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 10143, + "time_per_iteration": 2.5635735988616943 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.01801276, + "balance_loss_mlp": 1.03715992, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.7435526117723426, + "language_loss": 0.74993449, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77130264, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10144, + "time_per_iteration": 2.4298861026763916 + }, + { + "auxiliary_loss_clip": 0.01107837, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01527548, + "balance_loss_mlp": 1.03741479, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.307147766408813, + "language_loss": 0.72727025, + "learning_rate": 1.394498830235383e-06, + "loss": 0.74862915, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10145, + "time_per_iteration": 2.694578170776367 + }, + { + "auxiliary_loss_clip": 0.01104204, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01882768, + "balance_loss_mlp": 1.036484, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 7.584582797643419, + "language_loss": 0.69428813, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71563041, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6796875, + "step": 10146, + "time_per_iteration": 2.4656052589416504 + }, + { + "auxiliary_loss_clip": 0.01102864, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.01665735, + "balance_loss_mlp": 1.03688705, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.601297479877826, + "language_loss": 0.76745832, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78876168, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 10147, + "time_per_iteration": 2.5520474910736084 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.01436126, + "balance_loss_mlp": 1.03525686, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.740411663388647, + "language_loss": 0.78028274, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80158353, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10148, + "time_per_iteration": 2.4648149013519287 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.03736377, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.1220511331050758, + "language_loss": 0.53903639, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56048727, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 10149, + "time_per_iteration": 2.566124200820923 + }, + { + "auxiliary_loss_clip": 0.01101762, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01953661, + "balance_loss_mlp": 1.03660202, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.8365676346298867, + "language_loss": 0.80172944, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82305747, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 10150, + "time_per_iteration": 2.5030646324157715 + }, + { + "auxiliary_loss_clip": 0.01108008, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.02272308, + "balance_loss_mlp": 1.0384438, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.5453703107618904, + "language_loss": 0.69006532, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.7114979, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10151, + "time_per_iteration": 2.5013327598571777 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.01780438, + "balance_loss_mlp": 1.0351758, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.64819750933, + "language_loss": 0.70659781, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.7279191, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 10152, + "time_per_iteration": 2.519719362258911 + }, + { + "auxiliary_loss_clip": 0.01107575, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.02031481, + "balance_loss_mlp": 1.03778815, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 2.061001889975494, + "language_loss": 0.77937526, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80077732, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10153, + "time_per_iteration": 2.4679317474365234 + }, + { + "auxiliary_loss_clip": 0.01106601, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.01775157, + "balance_loss_mlp": 1.03693819, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.6071348715593325, + "language_loss": 0.79040915, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8117736, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10154, + "time_per_iteration": 2.4811360836029053 + }, + { + "auxiliary_loss_clip": 0.01102999, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 1.03598225, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.696167937827746, + "language_loss": 0.70110655, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72242928, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 10155, + "time_per_iteration": 2.4926087856292725 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01622927, + "balance_loss_mlp": 1.03898025, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5701440613704458, + "language_loss": 0.7118175, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73317862, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 10156, + "time_per_iteration": 2.529212236404419 + }, + { + "auxiliary_loss_clip": 0.01103012, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.01777911, + "balance_loss_mlp": 1.0372014, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.5875405214127527, + "language_loss": 0.67776453, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69909376, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 10157, + "time_per_iteration": 2.4632043838500977 + }, + { + "auxiliary_loss_clip": 0.01103689, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.01682067, + "balance_loss_mlp": 1.03470659, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 1.8568219075391552, + "language_loss": 0.72478032, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74610317, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10158, + "time_per_iteration": 2.419174909591675 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.02036619, + "balance_loss_mlp": 1.03673482, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.8610687942781703, + "language_loss": 0.69770175, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71908361, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 10159, + "time_per_iteration": 2.5595028400421143 + }, + { + "auxiliary_loss_clip": 0.01106993, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.01866579, + "balance_loss_mlp": 1.03715146, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.8623845683480673, + "language_loss": 0.79084963, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81223011, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10160, + "time_per_iteration": 2.4194223880767822 + }, + { + "auxiliary_loss_clip": 0.01029586, + "auxiliary_loss_mlp": 0.0100036, + "balance_loss_clip": 0.99904329, + "balance_loss_mlp": 1.00828457, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8176802836469281, + "language_loss": 0.61464268, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63494217, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.21289062, + "step": 10161, + "time_per_iteration": 3.204864501953125 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.02142394, + "balance_loss_mlp": 1.03706193, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.7743481380342319, + "language_loss": 0.76395631, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78536499, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 10162, + "time_per_iteration": 2.4414381980895996 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.01866198, + "balance_loss_mlp": 1.0372498, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 2.0274139033268077, + "language_loss": 0.71609962, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73745424, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10163, + "time_per_iteration": 2.541321039199829 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.01797926, + "balance_loss_mlp": 1.03580725, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 14.54042933705356, + "language_loss": 0.59390211, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61521178, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 10164, + "time_per_iteration": 2.4755120277404785 + }, + { + "auxiliary_loss_clip": 0.01103552, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.01823997, + "balance_loss_mlp": 1.03602457, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.7214680551340567, + "language_loss": 0.75950801, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.7808392, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10165, + "time_per_iteration": 2.491528034210205 + }, + { + "auxiliary_loss_clip": 0.01105154, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01582241, + "balance_loss_mlp": 1.03982759, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.4553973070214548, + "language_loss": 0.78996694, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81129807, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 10166, + "time_per_iteration": 2.4699227809906006 + }, + { + "auxiliary_loss_clip": 0.01105985, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.01734531, + "balance_loss_mlp": 1.03734827, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 3.097252625024806, + "language_loss": 0.67920876, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.70056236, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10167, + "time_per_iteration": 2.5190818309783936 + }, + { + "auxiliary_loss_clip": 0.01104165, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01872551, + "balance_loss_mlp": 1.03759277, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.7954202202348515, + "language_loss": 0.78805661, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.80939388, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 10168, + "time_per_iteration": 2.4622983932495117 + }, + { + "auxiliary_loss_clip": 0.01110572, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02078724, + "balance_loss_mlp": 1.03739679, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 4.090256272371363, + "language_loss": 0.85369581, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87514555, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 10169, + "time_per_iteration": 2.4625487327575684 + }, + { + "auxiliary_loss_clip": 0.01101901, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.01722717, + "balance_loss_mlp": 1.03553629, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 2.5520669740881727, + "language_loss": 0.78887564, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81017315, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10170, + "time_per_iteration": 2.6308984756469727 + }, + { + "auxiliary_loss_clip": 0.01108241, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.02221966, + "balance_loss_mlp": 1.03567076, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 1.8675504682209607, + "language_loss": 0.69072127, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71215916, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 10171, + "time_per_iteration": 2.4951138496398926 + }, + { + "auxiliary_loss_clip": 0.01106531, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.02225816, + "balance_loss_mlp": 1.03609705, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.5834424948906107, + "language_loss": 0.78990817, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81132996, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 10172, + "time_per_iteration": 2.512971878051758 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.02060866, + "balance_loss_mlp": 1.03796673, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.895061103662262, + "language_loss": 0.66887462, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69031352, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 10173, + "time_per_iteration": 2.450739860534668 + }, + { + "auxiliary_loss_clip": 0.01107875, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.02124667, + "balance_loss_mlp": 1.03863525, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.9715957300151092, + "language_loss": 0.5560292, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57744104, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 10174, + "time_per_iteration": 2.4200756549835205 + }, + { + "auxiliary_loss_clip": 0.01111305, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.01991677, + "balance_loss_mlp": 1.04081392, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.8852329096028353, + "language_loss": 0.66003776, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68147486, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10175, + "time_per_iteration": 2.4889590740203857 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01023113, + "balance_loss_clip": 1.01236653, + "balance_loss_mlp": 1.03501439, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.092985999457116, + "language_loss": 0.82515383, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84641147, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.67578125, + "step": 10176, + "time_per_iteration": 2.506054639816284 + }, + { + "auxiliary_loss_clip": 0.01107676, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.02346945, + "balance_loss_mlp": 1.03832841, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 4.162493341668284, + "language_loss": 0.76968575, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79112923, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 10177, + "time_per_iteration": 3.941509962081909 + }, + { + "auxiliary_loss_clip": 0.01104435, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.02179456, + "balance_loss_mlp": 1.03604686, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.082789690638706, + "language_loss": 0.75353473, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77491367, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 10178, + "time_per_iteration": 3.827141523361206 + }, + { + "auxiliary_loss_clip": 0.01107456, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01814103, + "balance_loss_mlp": 1.03866005, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.6048823215389816, + "language_loss": 0.6671313, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.68850946, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 10179, + "time_per_iteration": 2.467815637588501 + }, + { + "auxiliary_loss_clip": 0.01107829, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.02061653, + "balance_loss_mlp": 1.03923988, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.8410866190884951, + "language_loss": 0.84216881, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86356938, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 10180, + "time_per_iteration": 5.375430583953857 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01756501, + "balance_loss_mlp": 1.03854799, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.5429296840981428, + "language_loss": 0.77451497, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79587466, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 10181, + "time_per_iteration": 2.493150234222412 + }, + { + "auxiliary_loss_clip": 0.01106153, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.01597941, + "balance_loss_mlp": 1.03749657, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.8534348182131113, + "language_loss": 0.80704159, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82837868, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 10182, + "time_per_iteration": 2.5022473335266113 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01557016, + "balance_loss_mlp": 1.03581071, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.6380700202040888, + "language_loss": 0.83158624, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85283822, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.640625, + "step": 10183, + "time_per_iteration": 2.489943265914917 + }, + { + "auxiliary_loss_clip": 0.01029447, + "auxiliary_loss_mlp": 0.01003231, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00785327, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7013408754852208, + "language_loss": 0.62862837, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64895517, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21582031, + "step": 10184, + "time_per_iteration": 3.1942267417907715 + }, + { + "auxiliary_loss_clip": 0.01105776, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.01558959, + "balance_loss_mlp": 1.03836298, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 2.39384281866501, + "language_loss": 0.82134175, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84266812, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 10185, + "time_per_iteration": 2.441663980484009 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.0174942, + "balance_loss_mlp": 1.03989947, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 5.230764283030459, + "language_loss": 0.74637246, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76777852, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 10186, + "time_per_iteration": 2.494351387023926 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01786828, + "balance_loss_mlp": 1.0364244, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5640192087821545, + "language_loss": 0.78181458, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80312312, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 10187, + "time_per_iteration": 2.4529902935028076 + }, + { + "auxiliary_loss_clip": 0.01101994, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.01668978, + "balance_loss_mlp": 1.03424489, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.5585408172838955, + "language_loss": 0.82932627, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85063195, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10188, + "time_per_iteration": 2.4779062271118164 + }, + { + "auxiliary_loss_clip": 0.0110417, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.0162673, + "balance_loss_mlp": 1.03603601, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 2.027411293701354, + "language_loss": 0.75284189, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77416623, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10189, + "time_per_iteration": 2.4187629222869873 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03746915, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.8604795542963726, + "language_loss": 0.74147457, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76284146, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10190, + "time_per_iteration": 2.4838945865631104 + }, + { + "auxiliary_loss_clip": 0.01105194, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.01819181, + "balance_loss_mlp": 1.03696406, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.6214214182316076, + "language_loss": 0.68505728, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70641267, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10191, + "time_per_iteration": 2.4871902465820312 + }, + { + "auxiliary_loss_clip": 0.01105112, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01925695, + "balance_loss_mlp": 1.03581357, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.21006786046543, + "language_loss": 0.73561746, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75698042, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10192, + "time_per_iteration": 2.491898536682129 + }, + { + "auxiliary_loss_clip": 0.0110379, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01606202, + "balance_loss_mlp": 1.03735423, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.8299896919962644, + "language_loss": 0.83299625, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85430956, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 10193, + "time_per_iteration": 2.459218740463257 + }, + { + "auxiliary_loss_clip": 0.0110509, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01823401, + "balance_loss_mlp": 1.03667831, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.3362331987729554, + "language_loss": 0.69596869, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.71732187, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 10194, + "time_per_iteration": 2.49104642868042 + }, + { + "auxiliary_loss_clip": 0.01026973, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00231493, + "balance_loss_mlp": 1.0056808, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.9308202048927251, + "language_loss": 0.58683991, + "learning_rate": 1.375968615326149e-06, + "loss": 0.607144, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21289062, + "step": 10195, + "time_per_iteration": 2.8671669960021973 + }, + { + "auxiliary_loss_clip": 0.01105637, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02292037, + "balance_loss_mlp": 1.03803897, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 1.927442212334356, + "language_loss": 0.69738579, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71879274, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10196, + "time_per_iteration": 2.4702036380767822 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.02748811, + "balance_loss_mlp": 1.0374887, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 2.920952429136396, + "language_loss": 0.71311784, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73454869, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10197, + "time_per_iteration": 2.5032567977905273 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.02622151, + "balance_loss_mlp": 1.03691006, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.885953700600687, + "language_loss": 0.78852749, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.80997241, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10198, + "time_per_iteration": 2.460963010787964 + }, + { + "auxiliary_loss_clip": 0.01107653, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.02179098, + "balance_loss_mlp": 1.037503, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.406384953747787, + "language_loss": 0.7426461, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76405835, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10199, + "time_per_iteration": 2.4598445892333984 + }, + { + "auxiliary_loss_clip": 0.01108284, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03748035, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.5460485143525171, + "language_loss": 0.62069702, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64216447, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 10200, + "time_per_iteration": 2.509960651397705 + }, + { + "auxiliary_loss_clip": 0.01104748, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01977301, + "balance_loss_mlp": 1.03736472, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.743695857232765, + "language_loss": 0.68367881, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70503902, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 10201, + "time_per_iteration": 2.451493740081787 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01627123, + "balance_loss_mlp": 1.03434098, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 2.0127297199841747, + "language_loss": 0.83613813, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.8574273, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10202, + "time_per_iteration": 2.469893217086792 + }, + { + "auxiliary_loss_clip": 0.01027559, + "auxiliary_loss_mlp": 0.01002547, + "balance_loss_clip": 1.00130165, + "balance_loss_mlp": 1.00619066, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 1.0897383842290518, + "language_loss": 0.67103815, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69133925, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21386719, + "step": 10203, + "time_per_iteration": 3.1407535076141357 + }, + { + "auxiliary_loss_clip": 0.01104451, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.01798368, + "balance_loss_mlp": 1.03650403, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 2.800089510822399, + "language_loss": 0.61266363, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63400525, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10204, + "time_per_iteration": 2.683048963546753 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.0147351, + "balance_loss_mlp": 1.03410578, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.8212112064426345, + "language_loss": 0.72582424, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74708724, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10205, + "time_per_iteration": 2.5465259552001953 + }, + { + "auxiliary_loss_clip": 0.01100873, + "auxiliary_loss_mlp": 0.01023206, + "balance_loss_clip": 1.01134467, + "balance_loss_mlp": 1.03527784, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.5866781109951742, + "language_loss": 0.75862819, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.77986902, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 10206, + "time_per_iteration": 2.484109401702881 + }, + { + "auxiliary_loss_clip": 0.01105453, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.01753414, + "balance_loss_mlp": 1.03659487, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 1.9470428402611015, + "language_loss": 0.75471091, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77606046, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10207, + "time_per_iteration": 2.4940414428710938 + }, + { + "auxiliary_loss_clip": 0.01105401, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.02424574, + "balance_loss_mlp": 1.03734899, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.0213582004112336, + "language_loss": 0.82293832, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84434605, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10208, + "time_per_iteration": 2.4401795864105225 + }, + { + "auxiliary_loss_clip": 0.01109978, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01718831, + "balance_loss_mlp": 1.03974009, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 2.3284175830302365, + "language_loss": 0.72680509, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74820334, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10209, + "time_per_iteration": 2.5886876583099365 + }, + { + "auxiliary_loss_clip": 0.01104268, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02388787, + "balance_loss_mlp": 1.0384059, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6658761229718997, + "language_loss": 0.74108303, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76248324, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 10210, + "time_per_iteration": 2.521304130554199 + }, + { + "auxiliary_loss_clip": 0.01027276, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00248551, + "balance_loss_mlp": 1.00609028, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8595111756805056, + "language_loss": 0.65022087, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67053032, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21191406, + "step": 10211, + "time_per_iteration": 3.2215003967285156 + }, + { + "auxiliary_loss_clip": 0.01104002, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02478802, + "balance_loss_mlp": 1.03655624, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.6436955201310604, + "language_loss": 0.75708187, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77849603, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 10212, + "time_per_iteration": 2.4642996788024902 + }, + { + "auxiliary_loss_clip": 0.01106038, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.0206902, + "balance_loss_mlp": 1.03837156, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.5692336608665938, + "language_loss": 0.74044585, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76183337, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10213, + "time_per_iteration": 2.5178582668304443 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.01943398, + "balance_loss_mlp": 1.03825283, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.5485308182437552, + "language_loss": 0.73049855, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75191492, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10214, + "time_per_iteration": 2.4716460704803467 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01949024, + "balance_loss_mlp": 1.03701019, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.7742338479763222, + "language_loss": 0.74487185, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76626021, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10215, + "time_per_iteration": 2.507734537124634 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.02031636, + "balance_loss_mlp": 1.03673744, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.8655230442391189, + "language_loss": 0.78393024, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80528927, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 10216, + "time_per_iteration": 2.479534864425659 + }, + { + "auxiliary_loss_clip": 0.01104623, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01762557, + "balance_loss_mlp": 1.0373491, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.5637363675830254, + "language_loss": 0.80079889, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82214725, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 10217, + "time_per_iteration": 2.4395620822906494 + }, + { + "auxiliary_loss_clip": 0.01105204, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.01397753, + "balance_loss_mlp": 1.0361073, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.424574581231863, + "language_loss": 0.78246987, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80378485, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 10218, + "time_per_iteration": 3.851706027984619 + }, + { + "auxiliary_loss_clip": 0.01105535, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01911902, + "balance_loss_mlp": 1.038481, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.517262895370751, + "language_loss": 0.81908238, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84044778, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10219, + "time_per_iteration": 2.5016467571258545 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.01553547, + "balance_loss_mlp": 1.03468263, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.8306132213683777, + "language_loss": 0.66681564, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.6881398, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10220, + "time_per_iteration": 4.022945404052734 + }, + { + "auxiliary_loss_clip": 0.01101764, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.01547968, + "balance_loss_mlp": 1.03572094, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.9547432893761034, + "language_loss": 0.71545637, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73674214, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 10221, + "time_per_iteration": 3.984619617462158 + }, + { + "auxiliary_loss_clip": 0.01102691, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01677299, + "balance_loss_mlp": 1.03571272, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.6401613716258656, + "language_loss": 0.79416037, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81546843, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10222, + "time_per_iteration": 2.482626438140869 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03836894, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 4.215986026899438, + "language_loss": 0.76034737, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78172994, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 10223, + "time_per_iteration": 3.9831442832946777 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01629829, + "balance_loss_mlp": 1.03779078, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.127618755426409, + "language_loss": 0.78459811, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80590385, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10224, + "time_per_iteration": 2.43497896194458 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01025021, + "balance_loss_clip": 1.01433933, + "balance_loss_mlp": 1.03477085, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.3024527007974456, + "language_loss": 0.66392958, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68517995, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 10225, + "time_per_iteration": 2.845883369445801 + }, + { + "auxiliary_loss_clip": 0.01107388, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01713014, + "balance_loss_mlp": 1.03713298, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.1585029045138415, + "language_loss": 0.63199341, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65335715, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 10226, + "time_per_iteration": 2.555772304534912 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.01831996, + "balance_loss_mlp": 1.03651762, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.704417913895937, + "language_loss": 0.75513506, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77648973, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 10227, + "time_per_iteration": 2.536123514175415 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.022928, + "balance_loss_mlp": 1.03667367, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 1.8551652476106548, + "language_loss": 0.61097801, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.63240612, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 10228, + "time_per_iteration": 2.419962167739868 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.01778531, + "balance_loss_mlp": 1.03672791, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3329712414655954, + "language_loss": 0.74049234, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76182348, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10229, + "time_per_iteration": 2.506852626800537 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.0202601, + "balance_loss_mlp": 1.03880942, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.7175132302354088, + "language_loss": 0.77996862, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80135846, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 10230, + "time_per_iteration": 2.477675199508667 + }, + { + "auxiliary_loss_clip": 0.0110355, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.01371837, + "balance_loss_mlp": 1.03570461, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.6621971423553226, + "language_loss": 0.72935748, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75064254, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 10231, + "time_per_iteration": 2.561504602432251 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03801215, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.5773460676573843, + "language_loss": 0.6960876, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71744496, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 10232, + "time_per_iteration": 2.5435595512390137 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.0169704, + "balance_loss_mlp": 1.03564286, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9335513310183938, + "language_loss": 0.91684914, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.9381339, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 10233, + "time_per_iteration": 2.464128017425537 + }, + { + "auxiliary_loss_clip": 0.01104077, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.02258682, + "balance_loss_mlp": 1.03727007, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.693429608694219, + "language_loss": 0.71381217, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73518384, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.66796875, + "step": 10234, + "time_per_iteration": 2.484847068786621 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01926732, + "balance_loss_mlp": 1.03558159, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 1.9863568991468559, + "language_loss": 0.66966361, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69102716, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10235, + "time_per_iteration": 2.499189853668213 + }, + { + "auxiliary_loss_clip": 0.01106455, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.01746273, + "balance_loss_mlp": 1.0364213, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 2.269392311324668, + "language_loss": 0.81321824, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83457547, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10236, + "time_per_iteration": 2.467374086380005 + }, + { + "auxiliary_loss_clip": 0.01106752, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.03654408, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.4348801753525875, + "language_loss": 0.80595863, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82732141, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.703125, + "step": 10237, + "time_per_iteration": 2.4921953678131104 + }, + { + "auxiliary_loss_clip": 0.01103597, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.02533984, + "balance_loss_mlp": 1.03659725, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.8067050747817437, + "language_loss": 0.7606861, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78209013, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10238, + "time_per_iteration": 2.5077149868011475 + }, + { + "auxiliary_loss_clip": 0.01027367, + "auxiliary_loss_mlp": 0.01004239, + "balance_loss_clip": 1.00305295, + "balance_loss_mlp": 1.00621736, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7636645723592903, + "language_loss": 0.57658124, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.5968973, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21191406, + "step": 10239, + "time_per_iteration": 3.0781197547912598 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.02284706, + "balance_loss_mlp": 1.03747571, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 2.10217205787335, + "language_loss": 0.77644312, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79785573, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10240, + "time_per_iteration": 2.4440581798553467 + }, + { + "auxiliary_loss_clip": 0.01107517, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02069855, + "balance_loss_mlp": 1.03847337, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.3418662553679495, + "language_loss": 0.72875106, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75015438, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10241, + "time_per_iteration": 2.440458059310913 + }, + { + "auxiliary_loss_clip": 0.01102041, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.01599121, + "balance_loss_mlp": 1.03596628, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.629664240741642, + "language_loss": 0.71536696, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.73665738, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 10242, + "time_per_iteration": 2.465280771255493 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.01760554, + "balance_loss_mlp": 1.03812838, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 1.7806476568458218, + "language_loss": 0.72179866, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74313986, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10243, + "time_per_iteration": 2.4706227779388428 + }, + { + "auxiliary_loss_clip": 0.01026424, + "auxiliary_loss_mlp": 0.01003264, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00526905, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7683330535495017, + "language_loss": 0.5684256, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58872247, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.2109375, + "step": 10244, + "time_per_iteration": 3.108367919921875 + }, + { + "auxiliary_loss_clip": 0.01104886, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.01563978, + "balance_loss_mlp": 1.03710341, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.7714653532708287, + "language_loss": 0.63837689, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65970469, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 10245, + "time_per_iteration": 2.5604476928710938 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.01664448, + "balance_loss_mlp": 1.03589809, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.7050556240908794, + "language_loss": 0.78958333, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81088758, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10246, + "time_per_iteration": 2.6499507427215576 + }, + { + "auxiliary_loss_clip": 0.01109766, + "auxiliary_loss_mlp": 0.01044472, + "balance_loss_clip": 1.03114414, + "balance_loss_mlp": 1.03871059, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.2268806206076586, + "language_loss": 0.87346923, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89501166, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10247, + "time_per_iteration": 2.53155517578125 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.01996708, + "balance_loss_mlp": 1.03957379, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 2.019293099257412, + "language_loss": 0.80015755, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82155472, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 10248, + "time_per_iteration": 2.565202236175537 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.01873195, + "balance_loss_mlp": 1.03529978, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.9553906460889976, + "language_loss": 0.8661859, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.88749832, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 10249, + "time_per_iteration": 2.5155153274536133 + }, + { + "auxiliary_loss_clip": 0.01104366, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.01438522, + "balance_loss_mlp": 1.03663516, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 1.9896565394121724, + "language_loss": 0.6859656, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70727801, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 10250, + "time_per_iteration": 2.6529786586761475 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01025601, + "balance_loss_clip": 1.01488972, + "balance_loss_mlp": 1.03547108, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.9258007321652242, + "language_loss": 0.74149621, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76273632, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 10251, + "time_per_iteration": 2.5420565605163574 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.01518464, + "balance_loss_mlp": 1.03652811, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.78922632869985, + "language_loss": 0.68291706, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70423007, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 10252, + "time_per_iteration": 2.5236093997955322 + }, + { + "auxiliary_loss_clip": 0.01026564, + "auxiliary_loss_mlp": 0.01001879, + "balance_loss_clip": 1.0006336, + "balance_loss_mlp": 1.00562644, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8837133823521999, + "language_loss": 0.57868779, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.5989722, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.20898438, + "step": 10253, + "time_per_iteration": 3.103968858718872 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01609635, + "balance_loss_mlp": 1.03543723, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.4349605702857906, + "language_loss": 0.79628026, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81759632, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10254, + "time_per_iteration": 2.4770078659057617 + }, + { + "auxiliary_loss_clip": 0.01107997, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0167551, + "balance_loss_mlp": 1.03706634, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.765232531729237, + "language_loss": 0.80340689, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82477272, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7109375, + "step": 10255, + "time_per_iteration": 2.4666266441345215 + }, + { + "auxiliary_loss_clip": 0.01108694, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.0197928, + "balance_loss_mlp": 1.03867257, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 1.7468186030679946, + "language_loss": 0.65269709, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.6741125, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 10256, + "time_per_iteration": 2.514446973800659 + }, + { + "auxiliary_loss_clip": 0.01103556, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.0371418, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.5636561949397187, + "language_loss": 0.71758097, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73889816, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10257, + "time_per_iteration": 2.4575183391571045 + }, + { + "auxiliary_loss_clip": 0.01108721, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01625657, + "balance_loss_mlp": 1.03909421, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 2.0856421908029192, + "language_loss": 0.72058862, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74196231, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10258, + "time_per_iteration": 2.4590466022491455 + }, + { + "auxiliary_loss_clip": 0.01105581, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01888824, + "balance_loss_mlp": 1.03705239, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.864696001888572, + "language_loss": 0.63946176, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66082585, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10259, + "time_per_iteration": 2.472621202468872 + }, + { + "auxiliary_loss_clip": 0.01103568, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.02020216, + "balance_loss_mlp": 1.03760934, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.8983508996193146, + "language_loss": 0.71194589, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73330671, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 10260, + "time_per_iteration": 3.8351244926452637 + }, + { + "auxiliary_loss_clip": 0.01112265, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.0169692, + "balance_loss_mlp": 1.04087448, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 1.8640894588611543, + "language_loss": 0.68213212, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70356077, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 10261, + "time_per_iteration": 2.4846863746643066 + }, + { + "auxiliary_loss_clip": 0.01103737, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_clip": 1.02671063, + "balance_loss_mlp": 1.03602839, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.7606752411550333, + "language_loss": 0.71393299, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73534954, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 10262, + "time_per_iteration": 3.8463478088378906 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.03685451, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.9300485767677382, + "language_loss": 0.70171946, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72308946, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10263, + "time_per_iteration": 3.8719136714935303 + }, + { + "auxiliary_loss_clip": 0.01106014, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.01676631, + "balance_loss_mlp": 1.03678763, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 8.265893448617778, + "language_loss": 0.75888687, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78023094, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10264, + "time_per_iteration": 3.9576797485351562 + }, + { + "auxiliary_loss_clip": 0.01105756, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01682043, + "balance_loss_mlp": 1.03773212, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.621461269637815, + "language_loss": 0.85138124, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87273085, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10265, + "time_per_iteration": 2.4204261302948 + }, + { + "auxiliary_loss_clip": 0.01103728, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.02098346, + "balance_loss_mlp": 1.03761029, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 3.6073790517357995, + "language_loss": 0.642869, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66423583, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10266, + "time_per_iteration": 2.5135982036590576 + }, + { + "auxiliary_loss_clip": 0.0110251, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.03433692, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.7504973624629372, + "language_loss": 0.75734687, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77869165, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10267, + "time_per_iteration": 2.4403936862945557 + }, + { + "auxiliary_loss_clip": 0.01107183, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.0146544, + "balance_loss_mlp": 1.0368762, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.5812909664018504, + "language_loss": 0.74722588, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.7685672, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10268, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.01108432, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.01477456, + "balance_loss_mlp": 1.03709388, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.6692354192517487, + "language_loss": 0.75483018, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77618486, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71484375, + "step": 10269, + "time_per_iteration": 2.427558660507202 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01025701, + "balance_loss_clip": 1.01379776, + "balance_loss_mlp": 1.03455544, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.2351967956552987, + "language_loss": 0.76565802, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78693628, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 10270, + "time_per_iteration": 2.441521644592285 + }, + { + "auxiliary_loss_clip": 0.01106104, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.01432252, + "balance_loss_mlp": 1.03741896, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.7948450339640445, + "language_loss": 0.82511967, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84644157, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10271, + "time_per_iteration": 2.427300453186035 + }, + { + "auxiliary_loss_clip": 0.01105866, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01937735, + "balance_loss_mlp": 1.03691125, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 2.1955459228647687, + "language_loss": 0.76878774, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79016083, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10272, + "time_per_iteration": 2.4983582496643066 + }, + { + "auxiliary_loss_clip": 0.01028751, + "auxiliary_loss_mlp": 0.0099819, + "balance_loss_clip": 0.99684906, + "balance_loss_mlp": 1.00760865, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8101209602428692, + "language_loss": 0.59128773, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61155713, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.21191406, + "step": 10273, + "time_per_iteration": 2.9302847385406494 + }, + { + "auxiliary_loss_clip": 0.01103173, + "auxiliary_loss_mlp": 0.01026931, + "balance_loss_clip": 1.01496243, + "balance_loss_mlp": 1.03603625, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.336605024454028, + "language_loss": 0.72963846, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75093955, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 10274, + "time_per_iteration": 2.4481325149536133 + }, + { + "auxiliary_loss_clip": 0.01104274, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.01643395, + "balance_loss_mlp": 1.03684974, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 3.0133252214936372, + "language_loss": 0.77358514, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79490566, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10275, + "time_per_iteration": 2.4196648597717285 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.01589358, + "balance_loss_mlp": 1.03650546, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.6767450105474386, + "language_loss": 0.79291052, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81423116, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10276, + "time_per_iteration": 2.5229239463806152 + }, + { + "auxiliary_loss_clip": 0.01107984, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02068734, + "balance_loss_mlp": 1.0383606, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 2.1695107159415525, + "language_loss": 0.8092519, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83066452, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10277, + "time_per_iteration": 2.419820547103882 + }, + { + "auxiliary_loss_clip": 0.01107683, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03688812, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.6341046500403578, + "language_loss": 0.81401992, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83541107, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10278, + "time_per_iteration": 2.451904058456421 + }, + { + "auxiliary_loss_clip": 0.01104247, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.02002394, + "balance_loss_mlp": 1.03586221, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.4680885836846245, + "language_loss": 0.73827434, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75963408, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10279, + "time_per_iteration": 2.4702413082122803 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01637602, + "balance_loss_mlp": 1.03415704, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.5792662413822172, + "language_loss": 0.7052443, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72653878, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10280, + "time_per_iteration": 2.439377784729004 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.02039266, + "balance_loss_mlp": 1.03673506, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.433650263791477, + "language_loss": 0.72634661, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74772483, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10281, + "time_per_iteration": 2.4201571941375732 + }, + { + "auxiliary_loss_clip": 0.01100944, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.0367198, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.5669625672401193, + "language_loss": 0.76539791, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78671277, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 10282, + "time_per_iteration": 2.4729509353637695 + }, + { + "auxiliary_loss_clip": 0.01105858, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01784086, + "balance_loss_mlp": 1.03611851, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.557918367732971, + "language_loss": 0.69140053, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71277922, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.69921875, + "step": 10283, + "time_per_iteration": 2.4644439220428467 + }, + { + "auxiliary_loss_clip": 0.01110819, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.01680923, + "balance_loss_mlp": 1.03751874, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.5819420485757947, + "language_loss": 0.74983263, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77124047, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 10284, + "time_per_iteration": 2.4563488960266113 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02219248, + "balance_loss_mlp": 1.03690124, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4660610214457293, + "language_loss": 0.75491369, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77626395, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 10285, + "time_per_iteration": 2.4554288387298584 + }, + { + "auxiliary_loss_clip": 0.01103991, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.0199194, + "balance_loss_mlp": 1.03520298, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.5161367182822474, + "language_loss": 0.7299751, + "learning_rate": 1.342396663517503e-06, + "loss": 0.751333, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10286, + "time_per_iteration": 2.487755060195923 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01025604, + "balance_loss_clip": 1.01424325, + "balance_loss_mlp": 1.03537941, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 2.03959974890174, + "language_loss": 0.75874734, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78003013, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10287, + "time_per_iteration": 2.4449198246002197 + }, + { + "auxiliary_loss_clip": 0.0110312, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.01973581, + "balance_loss_mlp": 1.03618407, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.6506833358218813, + "language_loss": 0.72823429, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.74957871, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10288, + "time_per_iteration": 2.469217538833618 + }, + { + "auxiliary_loss_clip": 0.01099107, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.03515327, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4866118467097145, + "language_loss": 0.72703552, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74835199, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 10289, + "time_per_iteration": 2.6342008113861084 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03557706, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.5657368356700847, + "language_loss": 0.79090887, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81226832, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10290, + "time_per_iteration": 2.4762990474700928 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.01883626, + "balance_loss_mlp": 1.03720856, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6315677183830801, + "language_loss": 0.81586653, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83724689, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10291, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01103179, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01962161, + "balance_loss_mlp": 1.0360167, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.907685541449211, + "language_loss": 0.77654225, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.7978884, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10292, + "time_per_iteration": 2.4810614585876465 + }, + { + "auxiliary_loss_clip": 0.01111234, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.02459502, + "balance_loss_mlp": 1.03891051, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.9028504578301217, + "language_loss": 0.737167, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75866383, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 10293, + "time_per_iteration": 2.516528844833374 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02110207, + "balance_loss_mlp": 1.03902757, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 2.0122354574742602, + "language_loss": 0.83089775, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.85229266, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 10294, + "time_per_iteration": 2.499441623687744 + }, + { + "auxiliary_loss_clip": 0.01106207, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.01833987, + "balance_loss_mlp": 1.03719449, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.183209160789612, + "language_loss": 0.70951724, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.73088086, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10295, + "time_per_iteration": 2.4442856311798096 + }, + { + "auxiliary_loss_clip": 0.01105622, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.02100849, + "balance_loss_mlp": 1.03787184, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.6245756110977043, + "language_loss": 0.70113528, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72252154, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10296, + "time_per_iteration": 2.636453866958618 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.03864932, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 2.076478179664887, + "language_loss": 0.71677291, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73819137, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 10297, + "time_per_iteration": 2.487703800201416 + }, + { + "auxiliary_loss_clip": 0.01028294, + "auxiliary_loss_mlp": 0.00997518, + "balance_loss_clip": 0.9962309, + "balance_loss_mlp": 1.00701296, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8802858185205813, + "language_loss": 0.64150029, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66175842, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21289062, + "step": 10298, + "time_per_iteration": 2.959296226501465 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.03661466, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.6984885948159927, + "language_loss": 0.74105954, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76244044, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10299, + "time_per_iteration": 2.450899124145508 + }, + { + "auxiliary_loss_clip": 0.01109628, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.02075887, + "balance_loss_mlp": 1.0376761, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.8344519767478165, + "language_loss": 0.68278986, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70421433, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 10300, + "time_per_iteration": 2.4547624588012695 + }, + { + "auxiliary_loss_clip": 0.01108413, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.0215776, + "balance_loss_mlp": 1.03889441, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.8244494071351975, + "language_loss": 0.66936946, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69078887, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10301, + "time_per_iteration": 2.467451810836792 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.018013, + "balance_loss_mlp": 1.03542924, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 2.0193419698977317, + "language_loss": 0.73042768, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.75176305, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10302, + "time_per_iteration": 4.012500762939453 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01923048, + "balance_loss_mlp": 1.03690219, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.8664060987198585, + "language_loss": 0.80371857, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82508844, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10303, + "time_per_iteration": 2.437244176864624 + }, + { + "auxiliary_loss_clip": 0.01107499, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.01827395, + "balance_loss_mlp": 1.03684223, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.5617333087985545, + "language_loss": 0.76300073, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78439015, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10304, + "time_per_iteration": 3.8231778144836426 + }, + { + "auxiliary_loss_clip": 0.01110648, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.02169371, + "balance_loss_mlp": 1.03864741, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 2.062841626901626, + "language_loss": 0.77207863, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79352599, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 10305, + "time_per_iteration": 5.318151473999023 + }, + { + "auxiliary_loss_clip": 0.01111243, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.01918018, + "balance_loss_mlp": 1.03931832, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6773478766205938, + "language_loss": 0.78826416, + "learning_rate": 1.335045524968045e-06, + "loss": 0.80970484, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 10306, + "time_per_iteration": 2.4717702865600586 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01026237, + "balance_loss_clip": 1.01576495, + "balance_loss_mlp": 1.03520381, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.579957954838489, + "language_loss": 0.79917157, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82043117, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 10307, + "time_per_iteration": 2.51257586479187 + }, + { + "auxiliary_loss_clip": 0.01027759, + "auxiliary_loss_mlp": 0.00997846, + "balance_loss_clip": 0.99666041, + "balance_loss_mlp": 1.00661421, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8254095728079679, + "language_loss": 0.59419918, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61445522, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.2109375, + "step": 10308, + "time_per_iteration": 3.087841510772705 + }, + { + "auxiliary_loss_clip": 0.01102523, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01634467, + "balance_loss_mlp": 1.0360744, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.8503774737615284, + "language_loss": 0.67855436, + "learning_rate": 1.333943721384037e-06, + "loss": 0.69984901, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 10309, + "time_per_iteration": 2.516601800918579 + }, + { + "auxiliary_loss_clip": 0.01105412, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.01924789, + "balance_loss_mlp": 1.03811872, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.5770368221477629, + "language_loss": 0.71985435, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74122059, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 10310, + "time_per_iteration": 2.4543659687042236 + }, + { + "auxiliary_loss_clip": 0.01109202, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01870525, + "balance_loss_mlp": 1.03908801, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 1.8624693813193853, + "language_loss": 0.78939658, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81080884, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10311, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.01957762, + "balance_loss_mlp": 1.03495574, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.9506851073512315, + "language_loss": 0.72994781, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75132203, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10312, + "time_per_iteration": 2.4388468265533447 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01035173, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.0381484, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 2.1707252036738502, + "language_loss": 0.71927798, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.7407068, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10313, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.01108842, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01780641, + "balance_loss_mlp": 1.03789592, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8177190018334353, + "language_loss": 0.78071815, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80211347, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10314, + "time_per_iteration": 2.4607138633728027 + }, + { + "auxiliary_loss_clip": 0.01105035, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.02113914, + "balance_loss_mlp": 1.03498077, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.7685018834569248, + "language_loss": 0.78155088, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80293512, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 10315, + "time_per_iteration": 2.428445816040039 + }, + { + "auxiliary_loss_clip": 0.01109232, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02145982, + "balance_loss_mlp": 1.03922391, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 2.596321726125175, + "language_loss": 0.76265639, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78408277, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 10316, + "time_per_iteration": 2.463766098022461 + }, + { + "auxiliary_loss_clip": 0.01105873, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.01823497, + "balance_loss_mlp": 1.0344758, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 1.9163692596467958, + "language_loss": 0.77438551, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79575109, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 10317, + "time_per_iteration": 2.468884229660034 + }, + { + "auxiliary_loss_clip": 0.01027239, + "auxiliary_loss_mlp": 0.00999035, + "balance_loss_clip": 0.99786037, + "balance_loss_mlp": 1.00593257, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6919425802260456, + "language_loss": 0.59057474, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61083746, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21289062, + "step": 10318, + "time_per_iteration": 3.090552568435669 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.02163601, + "balance_loss_mlp": 1.03937101, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.6841357417658411, + "language_loss": 0.77685571, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79827732, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10319, + "time_per_iteration": 2.4693212509155273 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01963234, + "balance_loss_mlp": 1.03732896, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 2.3469109769721377, + "language_loss": 0.66256416, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68392456, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 10320, + "time_per_iteration": 2.447006940841675 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.01918244, + "balance_loss_mlp": 1.03645897, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.6640363363170714, + "language_loss": 0.76396954, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78529894, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10321, + "time_per_iteration": 2.439819574356079 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.01653743, + "balance_loss_mlp": 1.03596795, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.7446721342838176, + "language_loss": 0.73165452, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75296265, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10322, + "time_per_iteration": 2.4455277919769287 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.01024456, + "balance_loss_clip": 1.01309574, + "balance_loss_mlp": 1.03739095, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 2.5506684456453157, + "language_loss": 0.73217744, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.75347304, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10323, + "time_per_iteration": 2.4893054962158203 + }, + { + "auxiliary_loss_clip": 0.01114414, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086651, + "balance_loss_mlp": 1.04062796, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.3064550645164354, + "language_loss": 0.58989835, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.61137784, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 10324, + "time_per_iteration": 2.4318976402282715 + }, + { + "auxiliary_loss_clip": 0.01108806, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.01746607, + "balance_loss_mlp": 1.03886914, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 2.054520538169497, + "language_loss": 0.76530892, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78669918, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 10325, + "time_per_iteration": 2.4457478523254395 + }, + { + "auxiliary_loss_clip": 0.01107557, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01502085, + "balance_loss_mlp": 1.03696799, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.7674606629656198, + "language_loss": 0.72749656, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74884826, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 10326, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02126646, + "balance_loss_mlp": 1.03798246, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 3.158836515239834, + "language_loss": 0.73515177, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.75657719, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10327, + "time_per_iteration": 2.4524545669555664 + }, + { + "auxiliary_loss_clip": 0.01109109, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01832175, + "balance_loss_mlp": 1.03808546, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.016240551650266, + "language_loss": 0.7945962, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81599987, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 10328, + "time_per_iteration": 2.4385621547698975 + }, + { + "auxiliary_loss_clip": 0.01108206, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02380335, + "balance_loss_mlp": 1.03790045, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.9358397907066565, + "language_loss": 0.77753472, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.79898405, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10329, + "time_per_iteration": 2.462999105453491 + }, + { + "auxiliary_loss_clip": 0.01028614, + "auxiliary_loss_mlp": 0.01005403, + "balance_loss_clip": 1.00426447, + "balance_loss_mlp": 1.00714183, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8271913018767197, + "language_loss": 0.62140441, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64174461, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21484375, + "step": 10330, + "time_per_iteration": 3.0160677433013916 + }, + { + "auxiliary_loss_clip": 0.01111605, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.02014053, + "balance_loss_mlp": 1.03902602, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.119882521955809, + "language_loss": 0.77734917, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79879665, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10331, + "time_per_iteration": 2.489560842514038 + }, + { + "auxiliary_loss_clip": 0.01110147, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.0218091, + "balance_loss_mlp": 1.0385623, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 2.1826239313183486, + "language_loss": 0.67408252, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69552743, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 10332, + "time_per_iteration": 2.425645112991333 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.01532817, + "balance_loss_mlp": 1.03766382, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.6359189592805878, + "language_loss": 0.76677281, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78811944, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10333, + "time_per_iteration": 2.4364230632781982 + }, + { + "auxiliary_loss_clip": 0.01105905, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.03827369, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.0485781293514793, + "language_loss": 0.69575661, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71710348, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 10334, + "time_per_iteration": 2.4257168769836426 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01766801, + "balance_loss_mlp": 1.03944373, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 2.0078352045306507, + "language_loss": 0.70201457, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72337818, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 10335, + "time_per_iteration": 2.47383451461792 + }, + { + "auxiliary_loss_clip": 0.01102603, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.01889277, + "balance_loss_mlp": 1.03563762, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.47099412595651, + "language_loss": 0.80045199, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82178366, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 10336, + "time_per_iteration": 2.476863145828247 + }, + { + "auxiliary_loss_clip": 0.01103545, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.01593423, + "balance_loss_mlp": 1.03639817, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.8768203229000895, + "language_loss": 0.73504305, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75635779, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10337, + "time_per_iteration": 2.4732797145843506 + }, + { + "auxiliary_loss_clip": 0.01109544, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01764417, + "balance_loss_mlp": 1.03801644, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 1.8614452301224431, + "language_loss": 0.63164204, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65304667, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 10338, + "time_per_iteration": 2.4973182678222656 + }, + { + "auxiliary_loss_clip": 0.01106095, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.01755667, + "balance_loss_mlp": 1.03789639, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 2.390170977530988, + "language_loss": 0.71337169, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73472571, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 10339, + "time_per_iteration": 2.47871994972229 + }, + { + "auxiliary_loss_clip": 0.01103361, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.01571906, + "balance_loss_mlp": 1.03734398, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.5831202152699189, + "language_loss": 0.69323343, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71454197, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 10340, + "time_per_iteration": 2.445570707321167 + }, + { + "auxiliary_loss_clip": 0.0110187, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.0154407, + "balance_loss_mlp": 1.03529525, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 3.3727615102843513, + "language_loss": 0.68661916, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70790917, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 10341, + "time_per_iteration": 2.439035415649414 + }, + { + "auxiliary_loss_clip": 0.01107972, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01605105, + "balance_loss_mlp": 1.03783154, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 2.06494623621423, + "language_loss": 0.81278366, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83415759, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10342, + "time_per_iteration": 2.445607900619507 + }, + { + "auxiliary_loss_clip": 0.01027883, + "auxiliary_loss_mlp": 0.01003385, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00662279, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7785995287469357, + "language_loss": 0.57325292, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59356558, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21289062, + "step": 10343, + "time_per_iteration": 4.364051342010498 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01023841, + "balance_loss_clip": 1.01308846, + "balance_loss_mlp": 1.03629875, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.780281281905301, + "language_loss": 0.72907692, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75033712, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 10344, + "time_per_iteration": 2.4766275882720947 + }, + { + "auxiliary_loss_clip": 0.01107045, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02209353, + "balance_loss_mlp": 1.03881705, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.738872083076136, + "language_loss": 0.59990644, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62131059, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 10345, + "time_per_iteration": 2.541123390197754 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.01948929, + "balance_loss_mlp": 1.0357126, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.9219019260210024, + "language_loss": 0.78273392, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80409932, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10346, + "time_per_iteration": 5.315351724624634 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01602221, + "balance_loss_mlp": 1.03580999, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.6418210301478282, + "language_loss": 0.71802652, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73937929, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 10347, + "time_per_iteration": 2.497929334640503 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.01028399, + "balance_loss_clip": 1.01597118, + "balance_loss_mlp": 1.03503013, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.625266135857152, + "language_loss": 0.71975756, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74106789, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 10348, + "time_per_iteration": 3.9235568046569824 + }, + { + "auxiliary_loss_clip": 0.01026634, + "auxiliary_loss_mlp": 0.01006199, + "balance_loss_clip": 1.00494766, + "balance_loss_mlp": 1.00541496, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8371335682612564, + "language_loss": 0.54224485, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56257325, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21289062, + "step": 10349, + "time_per_iteration": 3.0496747493743896 + }, + { + "auxiliary_loss_clip": 0.0110532, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.01496863, + "balance_loss_mlp": 1.03663087, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.1582584328539594, + "language_loss": 0.69793445, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71925557, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10350, + "time_per_iteration": 2.470149278640747 + }, + { + "auxiliary_loss_clip": 0.0110629, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02183747, + "balance_loss_mlp": 1.03684473, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.9147448057982832, + "language_loss": 0.56816912, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.58957094, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10351, + "time_per_iteration": 2.505211114883423 + }, + { + "auxiliary_loss_clip": 0.01026374, + "auxiliary_loss_mlp": 0.00993206, + "balance_loss_clip": 0.99188894, + "balance_loss_mlp": 1.00534272, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8115156894720258, + "language_loss": 0.61159444, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63179016, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.2109375, + "step": 10352, + "time_per_iteration": 3.021286725997925 + }, + { + "auxiliary_loss_clip": 0.01101568, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.017488, + "balance_loss_mlp": 1.0351944, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.081556495777929, + "language_loss": 0.81940329, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84071267, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 10353, + "time_per_iteration": 2.496713638305664 + }, + { + "auxiliary_loss_clip": 0.010991, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.01638615, + "balance_loss_mlp": 1.03455448, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.5710771766627751, + "language_loss": 0.7576375, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77890158, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 10354, + "time_per_iteration": 2.4855527877807617 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.01658213, + "balance_loss_mlp": 1.03609419, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.4655004554762274, + "language_loss": 0.78727663, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80859846, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10355, + "time_per_iteration": 2.445819616317749 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.03856397, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.5925757296601037, + "language_loss": 0.78048426, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80185014, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 10356, + "time_per_iteration": 2.4893651008605957 + }, + { + "auxiliary_loss_clip": 0.01110459, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.01882744, + "balance_loss_mlp": 1.0377419, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.1577973787104923, + "language_loss": 0.67252231, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69394588, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 10357, + "time_per_iteration": 2.4467334747314453 + }, + { + "auxiliary_loss_clip": 0.01111299, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.01744306, + "balance_loss_mlp": 1.03907299, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.813144519953157, + "language_loss": 0.75561357, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.77703738, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 10358, + "time_per_iteration": 2.516791343688965 + }, + { + "auxiliary_loss_clip": 0.01104161, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.01916623, + "balance_loss_mlp": 1.03473985, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.219435804709828, + "language_loss": 0.82639635, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84775025, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10359, + "time_per_iteration": 2.4310834407806396 + }, + { + "auxiliary_loss_clip": 0.01102353, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.02763474, + "balance_loss_mlp": 1.03537011, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.303439975038256, + "language_loss": 0.73551476, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75693059, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 10360, + "time_per_iteration": 2.4032440185546875 + }, + { + "auxiliary_loss_clip": 0.01104376, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.02152276, + "balance_loss_mlp": 1.03514135, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.1872491258589877, + "language_loss": 0.78007793, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.8014614, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 10361, + "time_per_iteration": 2.432612419128418 + }, + { + "auxiliary_loss_clip": 0.01105247, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.01694417, + "balance_loss_mlp": 1.03777361, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 2.0406207393391322, + "language_loss": 0.67669165, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69802934, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10362, + "time_per_iteration": 2.4279119968414307 + }, + { + "auxiliary_loss_clip": 0.01105655, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01772738, + "balance_loss_mlp": 1.03628147, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 1.866995951195316, + "language_loss": 0.67914844, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.70050412, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10363, + "time_per_iteration": 2.5570461750030518 + }, + { + "auxiliary_loss_clip": 0.01107735, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.02156806, + "balance_loss_mlp": 1.03598118, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.8313003501061587, + "language_loss": 0.86500871, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88643348, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 10364, + "time_per_iteration": 2.4293837547302246 + }, + { + "auxiliary_loss_clip": 0.01025186, + "auxiliary_loss_mlp": 0.01006976, + "balance_loss_clip": 1.00571883, + "balance_loss_mlp": 1.00405002, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.884662336082659, + "language_loss": 0.60777593, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62809759, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.2109375, + "step": 10365, + "time_per_iteration": 3.0822458267211914 + }, + { + "auxiliary_loss_clip": 0.01111747, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.02530479, + "balance_loss_mlp": 1.03808904, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 2.2798464083102576, + "language_loss": 0.75205708, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77356946, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73828125, + "step": 10366, + "time_per_iteration": 2.441955804824829 + }, + { + "auxiliary_loss_clip": 0.01108704, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02833033, + "balance_loss_mlp": 1.03776455, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 2.0199414320321725, + "language_loss": 0.76469356, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78618896, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 10367, + "time_per_iteration": 2.477055072784424 + }, + { + "auxiliary_loss_clip": 0.01105026, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.02271938, + "balance_loss_mlp": 1.03831315, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.4367646696128493, + "language_loss": 0.78561807, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80701321, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 10368, + "time_per_iteration": 2.4565787315368652 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.01959956, + "balance_loss_mlp": 1.03783059, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.854629496919494, + "language_loss": 0.68463397, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70603514, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 10369, + "time_per_iteration": 2.495943069458008 + }, + { + "auxiliary_loss_clip": 0.01107955, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.02470601, + "balance_loss_mlp": 1.03846693, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.0672458586121922, + "language_loss": 0.87758917, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.89904487, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 10370, + "time_per_iteration": 2.4028708934783936 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.01450515, + "balance_loss_mlp": 1.03551197, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.4687473894600929, + "language_loss": 0.65925562, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68054819, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 10371, + "time_per_iteration": 2.4908487796783447 + }, + { + "auxiliary_loss_clip": 0.01099208, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01929259, + "balance_loss_mlp": 1.03462815, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.308988821713543, + "language_loss": 0.77547729, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79676664, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6484375, + "step": 10372, + "time_per_iteration": 2.5180232524871826 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01769924, + "balance_loss_mlp": 1.03540146, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.565588018128666, + "language_loss": 0.77423698, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79559469, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10373, + "time_per_iteration": 2.4661612510681152 + }, + { + "auxiliary_loss_clip": 0.01101212, + "auxiliary_loss_mlp": 0.01025569, + "balance_loss_clip": 1.01429188, + "balance_loss_mlp": 1.03523397, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.4815417355827754, + "language_loss": 0.69228935, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71355724, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10374, + "time_per_iteration": 2.473937511444092 + }, + { + "auxiliary_loss_clip": 0.0110711, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.01916742, + "balance_loss_mlp": 1.03731394, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.723426878177341, + "language_loss": 0.77033317, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79171526, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 10375, + "time_per_iteration": 2.437490463256836 + }, + { + "auxiliary_loss_clip": 0.01104528, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.0170275, + "balance_loss_mlp": 1.0379982, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.4731613062232216, + "language_loss": 0.70344281, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72477418, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10376, + "time_per_iteration": 2.6377809047698975 + }, + { + "auxiliary_loss_clip": 0.01108576, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03811753, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 2.3241172647924837, + "language_loss": 0.76568282, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78706658, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10377, + "time_per_iteration": 2.479133367538452 + }, + { + "auxiliary_loss_clip": 0.01106151, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.01860189, + "balance_loss_mlp": 1.03780174, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 1.547563238627933, + "language_loss": 0.67949808, + "learning_rate": 1.308665737227052e-06, + "loss": 0.7008518, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.68359375, + "step": 10378, + "time_per_iteration": 2.4531919956207275 + }, + { + "auxiliary_loss_clip": 0.01104298, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01901162, + "balance_loss_mlp": 1.03584397, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.7868825896573544, + "language_loss": 0.76539075, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78674352, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10379, + "time_per_iteration": 2.489495277404785 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01567876, + "balance_loss_mlp": 1.0352664, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.3567066837187596, + "language_loss": 0.79495847, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81626451, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10380, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.01105137, + "auxiliary_loss_mlp": 0.01028452, + "balance_loss_clip": 1.01738906, + "balance_loss_mlp": 1.03878844, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.5683522336983957, + "language_loss": 0.79919797, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82053387, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10381, + "time_per_iteration": 2.4719154834747314 + }, + { + "auxiliary_loss_clip": 0.01104983, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.02026606, + "balance_loss_mlp": 1.03598738, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.2093057050572606, + "language_loss": 0.74530953, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76668167, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10382, + "time_per_iteration": 2.4555060863494873 + }, + { + "auxiliary_loss_clip": 0.01102662, + "auxiliary_loss_mlp": 0.01025503, + "balance_loss_clip": 1.01423788, + "balance_loss_mlp": 1.03613257, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.3920284041280475, + "language_loss": 0.78429455, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80557621, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10383, + "time_per_iteration": 2.5131173133850098 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.01732409, + "balance_loss_mlp": 1.03612638, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 2.28937629159475, + "language_loss": 0.7478832, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.76920247, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10384, + "time_per_iteration": 2.441364049911499 + }, + { + "auxiliary_loss_clip": 0.01107606, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.01817775, + "balance_loss_mlp": 1.03742898, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 2.8855056380065993, + "language_loss": 0.66313016, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68451071, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 10385, + "time_per_iteration": 3.859321117401123 + }, + { + "auxiliary_loss_clip": 0.01027145, + "auxiliary_loss_mlp": 0.01001461, + "balance_loss_clip": 1.00013185, + "balance_loss_mlp": 1.0058732, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7546932463540804, + "language_loss": 0.62028766, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64057362, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.21289062, + "step": 10386, + "time_per_iteration": 3.106778860092163 + }, + { + "auxiliary_loss_clip": 0.01105241, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.01933956, + "balance_loss_mlp": 1.03560019, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.5221123793522247, + "language_loss": 0.7170524, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.73842406, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10387, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.02493882, + "balance_loss_mlp": 1.03753424, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.0526196711418345, + "language_loss": 0.65366501, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67515868, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 10388, + "time_per_iteration": 5.378544330596924 + }, + { + "auxiliary_loss_clip": 0.01104574, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.01566386, + "balance_loss_mlp": 1.03606319, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.6446610432064326, + "language_loss": 0.79204857, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81336558, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10389, + "time_per_iteration": 3.85504150390625 + }, + { + "auxiliary_loss_clip": 0.01103741, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.02053928, + "balance_loss_mlp": 1.03604019, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.9237323307273804, + "language_loss": 0.60423774, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62559879, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 10390, + "time_per_iteration": 2.4648008346557617 + }, + { + "auxiliary_loss_clip": 0.01107504, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.02080107, + "balance_loss_mlp": 1.03688002, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.88087186985586, + "language_loss": 0.77647173, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79787791, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 10391, + "time_per_iteration": 2.4204020500183105 + }, + { + "auxiliary_loss_clip": 0.01107712, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.01849914, + "balance_loss_mlp": 1.03854263, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.9064599500175736, + "language_loss": 0.64700288, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.6683929, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 10392, + "time_per_iteration": 2.6868064403533936 + }, + { + "auxiliary_loss_clip": 0.01108711, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.0189693, + "balance_loss_mlp": 1.03795576, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.715075150061653, + "language_loss": 0.76449108, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78589016, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 10393, + "time_per_iteration": 2.5002684593200684 + }, + { + "auxiliary_loss_clip": 0.01109321, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.02361488, + "balance_loss_mlp": 1.03849423, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7032519811811655, + "language_loss": 0.82738161, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84883797, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 10394, + "time_per_iteration": 2.5074119567871094 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01968956, + "balance_loss_mlp": 1.03777504, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.7635560366961225, + "language_loss": 0.75053072, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77194268, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10395, + "time_per_iteration": 2.4207139015197754 + }, + { + "auxiliary_loss_clip": 0.01106696, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.01774108, + "balance_loss_mlp": 1.03590536, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.116778231139036, + "language_loss": 0.72623551, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74760246, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 10396, + "time_per_iteration": 2.4098753929138184 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.02295291, + "balance_loss_mlp": 1.03761959, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.639382305953213, + "language_loss": 0.75850725, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.7799207, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10397, + "time_per_iteration": 2.437908887863159 + }, + { + "auxiliary_loss_clip": 0.01104633, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.0192287, + "balance_loss_mlp": 1.03570378, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.1037822697926667, + "language_loss": 0.74630761, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.76766837, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10398, + "time_per_iteration": 2.5268969535827637 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.0180074, + "balance_loss_mlp": 1.03535593, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.7918693005970583, + "language_loss": 0.74092543, + "learning_rate": 1.300997001489483e-06, + "loss": 0.7623167, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 10399, + "time_per_iteration": 2.4791572093963623 + }, + { + "auxiliary_loss_clip": 0.01107905, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03819537, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.731383234573371, + "language_loss": 0.74527764, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76667941, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 10400, + "time_per_iteration": 2.473951816558838 + }, + { + "auxiliary_loss_clip": 0.01026565, + "auxiliary_loss_mlp": 0.01000492, + "balance_loss_clip": 0.99915105, + "balance_loss_mlp": 1.00554299, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8444247043206139, + "language_loss": 0.5648914, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58516198, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.20996094, + "step": 10401, + "time_per_iteration": 3.129333019256592 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.0204252, + "balance_loss_mlp": 1.03666401, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.0092602513975977, + "language_loss": 0.82945538, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85085875, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10402, + "time_per_iteration": 2.460231304168701 + }, + { + "auxiliary_loss_clip": 0.01104333, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01907122, + "balance_loss_mlp": 1.03590369, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 1.9961648351421997, + "language_loss": 0.69392562, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71528035, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10403, + "time_per_iteration": 2.512580156326294 + }, + { + "auxiliary_loss_clip": 0.01107476, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01649678, + "balance_loss_mlp": 1.03631687, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.605243006168547, + "language_loss": 0.71813661, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73950982, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 10404, + "time_per_iteration": 2.5337743759155273 + }, + { + "auxiliary_loss_clip": 0.0110666, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.02410626, + "balance_loss_mlp": 1.03739667, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 2.1209903153707637, + "language_loss": 0.69724202, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71867102, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10405, + "time_per_iteration": 2.429565191268921 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.01888895, + "balance_loss_mlp": 1.03722537, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.5758155671533136, + "language_loss": 0.79004002, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81141788, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 10406, + "time_per_iteration": 2.468031167984009 + }, + { + "auxiliary_loss_clip": 0.01107697, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.0206759, + "balance_loss_mlp": 1.03848672, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 2.3254582384945546, + "language_loss": 0.68920648, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71061373, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 10407, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.01103441, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.0164783, + "balance_loss_mlp": 1.03711939, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.6489273629254082, + "language_loss": 0.85259062, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87390488, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10408, + "time_per_iteration": 2.5326271057128906 + }, + { + "auxiliary_loss_clip": 0.01103218, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.02121651, + "balance_loss_mlp": 1.03541374, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.6409440677958231, + "language_loss": 0.79910547, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.82046419, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10409, + "time_per_iteration": 2.442823886871338 + }, + { + "auxiliary_loss_clip": 0.01102769, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.01786542, + "balance_loss_mlp": 1.03510618, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.1270589511309, + "language_loss": 0.69238424, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71370828, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10410, + "time_per_iteration": 2.5218586921691895 + }, + { + "auxiliary_loss_clip": 0.01102703, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.01665211, + "balance_loss_mlp": 1.03720927, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.7979777871745755, + "language_loss": 0.67414671, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69545317, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10411, + "time_per_iteration": 2.4738645553588867 + }, + { + "auxiliary_loss_clip": 0.0110494, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.02416134, + "balance_loss_mlp": 1.03532887, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.6084905019023508, + "language_loss": 0.69372767, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71513689, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10412, + "time_per_iteration": 2.5545077323913574 + }, + { + "auxiliary_loss_clip": 0.01102021, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.02027464, + "balance_loss_mlp": 1.03490543, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.551813331878434, + "language_loss": 0.69730282, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.718638, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10413, + "time_per_iteration": 2.4613993167877197 + }, + { + "auxiliary_loss_clip": 0.01107528, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01831901, + "balance_loss_mlp": 1.03475976, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.3187128472961347, + "language_loss": 0.80297446, + "learning_rate": 1.295526482316796e-06, + "loss": 0.82436854, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 10414, + "time_per_iteration": 2.4762308597564697 + }, + { + "auxiliary_loss_clip": 0.01106139, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.02244806, + "balance_loss_mlp": 1.03826272, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.6885486405610761, + "language_loss": 0.74565107, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76705372, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10415, + "time_per_iteration": 2.469125270843506 + }, + { + "auxiliary_loss_clip": 0.01103919, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.01612878, + "balance_loss_mlp": 1.03637624, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.6561914595998568, + "language_loss": 0.74751735, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.7688328, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 10416, + "time_per_iteration": 2.5993549823760986 + }, + { + "auxiliary_loss_clip": 0.0110123, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01813984, + "balance_loss_mlp": 1.03624392, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.5562931530598996, + "language_loss": 0.84521848, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86652553, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 10417, + "time_per_iteration": 2.555704355239868 + }, + { + "auxiliary_loss_clip": 0.01105248, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.01628423, + "balance_loss_mlp": 1.03636765, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.453683898924351, + "language_loss": 0.56929493, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59063208, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10418, + "time_per_iteration": 2.443615198135376 + }, + { + "auxiliary_loss_clip": 0.01108601, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.01904798, + "balance_loss_mlp": 1.03636181, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.7897891411455675, + "language_loss": 0.84952247, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.8709265, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 10419, + "time_per_iteration": 2.432539224624634 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.02055252, + "balance_loss_mlp": 1.03868783, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.768912237267882, + "language_loss": 0.64837831, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66978431, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10420, + "time_per_iteration": 2.5192198753356934 + }, + { + "auxiliary_loss_clip": 0.01105751, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.01782894, + "balance_loss_mlp": 1.03548038, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.9559815455742504, + "language_loss": 0.86093545, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88229704, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10421, + "time_per_iteration": 2.454472303390503 + }, + { + "auxiliary_loss_clip": 0.01106789, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.02009797, + "balance_loss_mlp": 1.03760505, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.5701422758472687, + "language_loss": 0.79219615, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81358123, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10422, + "time_per_iteration": 2.4565389156341553 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01024803, + "balance_loss_clip": 1.01235723, + "balance_loss_mlp": 1.03458548, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.6493252664986784, + "language_loss": 0.74391955, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76519012, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 10423, + "time_per_iteration": 2.4744317531585693 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.0154779, + "balance_loss_mlp": 1.03445518, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.8573410403622042, + "language_loss": 0.77685475, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79814792, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10424, + "time_per_iteration": 2.459156036376953 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.01498699, + "balance_loss_mlp": 1.03587162, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.722847581119462, + "language_loss": 0.6881507, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.70946336, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.671875, + "step": 10425, + "time_per_iteration": 2.4837486743927 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.0183543, + "balance_loss_mlp": 1.0359807, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.5803855338986545, + "language_loss": 0.7465167, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76780665, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 10426, + "time_per_iteration": 2.514317274093628 + }, + { + "auxiliary_loss_clip": 0.01105959, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.01989794, + "balance_loss_mlp": 1.03667617, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.372305042134179, + "language_loss": 0.80499035, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82637042, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10427, + "time_per_iteration": 3.906360149383545 + }, + { + "auxiliary_loss_clip": 0.01106724, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.01931798, + "balance_loss_mlp": 1.03726578, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 1.7157059050483638, + "language_loss": 0.68742979, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70881307, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 10428, + "time_per_iteration": 2.4357380867004395 + }, + { + "auxiliary_loss_clip": 0.01105018, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.02134943, + "balance_loss_mlp": 1.03779614, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.9089213874225204, + "language_loss": 0.71640742, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73778033, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 10429, + "time_per_iteration": 3.8758704662323 + }, + { + "auxiliary_loss_clip": 0.01108797, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02144098, + "balance_loss_mlp": 1.03881693, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.5765983769123613, + "language_loss": 0.79904956, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82047486, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10430, + "time_per_iteration": 5.480989217758179 + }, + { + "auxiliary_loss_clip": 0.01027432, + "auxiliary_loss_mlp": 0.01010431, + "balance_loss_clip": 1.00904214, + "balance_loss_mlp": 1.00618088, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7689165216290166, + "language_loss": 0.59162331, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.6120019, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.21289062, + "step": 10431, + "time_per_iteration": 3.1698784828186035 + }, + { + "auxiliary_loss_clip": 0.0102736, + "auxiliary_loss_mlp": 0.01007095, + "balance_loss_clip": 1.00575376, + "balance_loss_mlp": 1.00630832, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8815125854573025, + "language_loss": 0.63825411, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.6585986, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.2109375, + "step": 10432, + "time_per_iteration": 3.1316046714782715 + }, + { + "auxiliary_loss_clip": 0.01101622, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.02075207, + "balance_loss_mlp": 1.03523922, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.6684665767860385, + "language_loss": 0.6480633, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.66939294, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 10433, + "time_per_iteration": 2.530367851257324 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02095246, + "balance_loss_mlp": 1.03838599, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 1.999112171650009, + "language_loss": 0.61930764, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64073694, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 10434, + "time_per_iteration": 2.4613072872161865 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01025848, + "balance_loss_clip": 1.01420164, + "balance_loss_mlp": 1.03523064, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.7052209762713233, + "language_loss": 0.84669697, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86799175, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 10435, + "time_per_iteration": 2.516956090927124 + }, + { + "auxiliary_loss_clip": 0.01027112, + "auxiliary_loss_mlp": 0.01006345, + "balance_loss_clip": 1.00503409, + "balance_loss_mlp": 1.00594997, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7299742143913254, + "language_loss": 0.61572838, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63606298, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.2109375, + "step": 10436, + "time_per_iteration": 3.1023128032684326 + }, + { + "auxiliary_loss_clip": 0.01107216, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02201223, + "balance_loss_mlp": 1.03899169, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.5188433692104768, + "language_loss": 0.77361041, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79503125, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 10437, + "time_per_iteration": 2.5252764225006104 + }, + { + "auxiliary_loss_clip": 0.01026138, + "auxiliary_loss_mlp": 0.01003989, + "balance_loss_clip": 1.0027318, + "balance_loss_mlp": 1.00493383, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7219307652334395, + "language_loss": 0.5436241, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56392533, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.21289062, + "step": 10438, + "time_per_iteration": 3.043013572692871 + }, + { + "auxiliary_loss_clip": 0.01102529, + "auxiliary_loss_mlp": 0.01041098, + "balance_loss_clip": 1.02895069, + "balance_loss_mlp": 1.03441381, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 2.0343389960160163, + "language_loss": 0.84072959, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86216581, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10439, + "time_per_iteration": 2.5371646881103516 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.0274682, + "balance_loss_mlp": 1.03541088, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.0182472461440057, + "language_loss": 0.8041876, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.8256427, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10440, + "time_per_iteration": 2.4601192474365234 + }, + { + "auxiliary_loss_clip": 0.01099453, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.01833498, + "balance_loss_mlp": 1.03509974, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.4716906489338055, + "language_loss": 0.74504089, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 10441, + "time_per_iteration": 2.5412392616271973 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0175643, + "balance_loss_mlp": 1.03450918, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.707965956451768, + "language_loss": 0.72134054, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74265343, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 10442, + "time_per_iteration": 2.537446975708008 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01785576, + "balance_loss_mlp": 1.03555417, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.5665674956365474, + "language_loss": 0.71364504, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73497498, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10443, + "time_per_iteration": 2.49980092048645 + }, + { + "auxiliary_loss_clip": 0.01103341, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.01374125, + "balance_loss_mlp": 1.03619695, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.9169292083366938, + "language_loss": 0.72973317, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75102174, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 10444, + "time_per_iteration": 2.474400520324707 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.0155499, + "balance_loss_mlp": 1.03607392, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.8659138317245392, + "language_loss": 0.72426593, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74556732, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 10445, + "time_per_iteration": 2.4486618041992188 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.03417051, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.6334831062955149, + "language_loss": 0.69040692, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71172386, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10446, + "time_per_iteration": 2.4619648456573486 + }, + { + "auxiliary_loss_clip": 0.01108513, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.01822352, + "balance_loss_mlp": 1.03651023, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 1.946229669067864, + "language_loss": 0.74025476, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.76164913, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 10447, + "time_per_iteration": 2.449399948120117 + }, + { + "auxiliary_loss_clip": 0.01025063, + "auxiliary_loss_mlp": 0.00998572, + "balance_loss_clip": 0.99728459, + "balance_loss_mlp": 1.00378299, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6772794879542157, + "language_loss": 0.52363139, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54386771, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21289062, + "step": 10448, + "time_per_iteration": 2.9426791667938232 + }, + { + "auxiliary_loss_clip": 0.01106244, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.02860117, + "balance_loss_mlp": 1.03656423, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.1057349931562275, + "language_loss": 0.91307616, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93455029, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10449, + "time_per_iteration": 2.4679763317108154 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.03486931, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.5272379639764508, + "language_loss": 0.60454214, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62585294, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10450, + "time_per_iteration": 2.665226459503174 + }, + { + "auxiliary_loss_clip": 0.01101695, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.01926398, + "balance_loss_mlp": 1.03620005, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.5565304478998412, + "language_loss": 0.76683152, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.78815556, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 10451, + "time_per_iteration": 2.4581120014190674 + }, + { + "auxiliary_loss_clip": 0.01105178, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.0352962, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.577387753245048, + "language_loss": 0.77243423, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79380023, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10452, + "time_per_iteration": 2.4569571018218994 + }, + { + "auxiliary_loss_clip": 0.01105275, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.01977849, + "balance_loss_mlp": 1.03737903, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.9445051684642027, + "language_loss": 0.72382963, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74520093, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 10453, + "time_per_iteration": 2.4979004859924316 + }, + { + "auxiliary_loss_clip": 0.01102123, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.01643896, + "balance_loss_mlp": 1.03324366, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.6809278534400005, + "language_loss": 0.80429286, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82560074, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10454, + "time_per_iteration": 2.409714937210083 + }, + { + "auxiliary_loss_clip": 0.01102175, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.01829922, + "balance_loss_mlp": 1.03586721, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.824800115863982, + "language_loss": 0.82303673, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84435654, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10455, + "time_per_iteration": 2.4712390899658203 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.01754522, + "balance_loss_mlp": 1.03569484, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 3.44693783643537, + "language_loss": 0.81578875, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83711159, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10456, + "time_per_iteration": 2.48745059967041 + }, + { + "auxiliary_loss_clip": 0.01107634, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.01755691, + "balance_loss_mlp": 1.03793502, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5367705166393795, + "language_loss": 0.72127652, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74265301, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10457, + "time_per_iteration": 2.451204776763916 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01816964, + "balance_loss_mlp": 1.03497529, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 2.138119380312756, + "language_loss": 0.79647571, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81785357, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 10458, + "time_per_iteration": 2.4522323608398438 + }, + { + "auxiliary_loss_clip": 0.01107535, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01934147, + "balance_loss_mlp": 1.03738856, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.5072940054720605, + "language_loss": 0.60961497, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63100201, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10459, + "time_per_iteration": 2.5262553691864014 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.01796818, + "balance_loss_mlp": 1.03684652, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.7541268062536184, + "language_loss": 0.7885046, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80985153, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 10460, + "time_per_iteration": 2.4601290225982666 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.01438367, + "balance_loss_mlp": 1.03575253, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.7189888907813877, + "language_loss": 0.73800498, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.75929219, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10461, + "time_per_iteration": 2.4365780353546143 + }, + { + "auxiliary_loss_clip": 0.01100652, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.02188754, + "balance_loss_mlp": 1.03492045, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.7518850371883825, + "language_loss": 0.70340359, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72474349, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 10462, + "time_per_iteration": 2.4497246742248535 + }, + { + "auxiliary_loss_clip": 0.01098069, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01851106, + "balance_loss_mlp": 1.03555751, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.8426896444846728, + "language_loss": 0.71998221, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74125123, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 10463, + "time_per_iteration": 2.519118070602417 + }, + { + "auxiliary_loss_clip": 0.01104354, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.02407098, + "balance_loss_mlp": 1.03894711, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 2.0251276075815507, + "language_loss": 0.72917801, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.75057971, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10464, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01101197, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.01696074, + "balance_loss_mlp": 1.03590441, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.8688314099913752, + "language_loss": 0.69353777, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71483117, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 10465, + "time_per_iteration": 2.420706033706665 + }, + { + "auxiliary_loss_clip": 0.01025681, + "auxiliary_loss_mlp": 0.01003212, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00449264, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6783887533402703, + "language_loss": 0.59743875, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.6177277, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21191406, + "step": 10466, + "time_per_iteration": 3.1529486179351807 + }, + { + "auxiliary_loss_clip": 0.01098875, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.01821828, + "balance_loss_mlp": 1.03199136, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 1.895578491152679, + "language_loss": 0.64383173, + "learning_rate": 1.276245767820154e-06, + "loss": 0.66511035, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 10467, + "time_per_iteration": 2.596909761428833 + }, + { + "auxiliary_loss_clip": 0.01025676, + "auxiliary_loss_mlp": 0.00999758, + "balance_loss_clip": 0.9984706, + "balance_loss_mlp": 1.00462031, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7946860251086647, + "language_loss": 0.569076, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58933038, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.2109375, + "step": 10468, + "time_per_iteration": 4.298036336898804 + }, + { + "auxiliary_loss_clip": 0.01025761, + "auxiliary_loss_mlp": 0.01000379, + "balance_loss_clip": 0.99905533, + "balance_loss_mlp": 1.00460362, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7346247861969195, + "language_loss": 0.580616, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.6008774, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2109375, + "step": 10469, + "time_per_iteration": 3.013350009918213 + }, + { + "auxiliary_loss_clip": 0.01026242, + "auxiliary_loss_mlp": 0.0100094, + "balance_loss_clip": 0.99966449, + "balance_loss_mlp": 1.00510228, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6786572594163077, + "language_loss": 0.5214479, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54171979, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21191406, + "step": 10470, + "time_per_iteration": 3.1025776863098145 + }, + { + "auxiliary_loss_clip": 0.01101792, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01853275, + "balance_loss_mlp": 1.03531003, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.7821374773378207, + "language_loss": 0.7444669, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76578748, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10471, + "time_per_iteration": 5.750757455825806 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.0161432, + "balance_loss_mlp": 1.03594935, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.928248423372208, + "language_loss": 0.62892604, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65023524, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 10472, + "time_per_iteration": 2.4507625102996826 + }, + { + "auxiliary_loss_clip": 0.01108224, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.02030277, + "balance_loss_mlp": 1.03887987, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.6696696732656569, + "language_loss": 0.69374871, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71515167, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 10473, + "time_per_iteration": 3.954071283340454 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.01492906, + "balance_loss_mlp": 1.03493738, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.5555016558834316, + "language_loss": 0.74785316, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76912427, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10474, + "time_per_iteration": 2.4985709190368652 + }, + { + "auxiliary_loss_clip": 0.0110251, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01681423, + "balance_loss_mlp": 1.03494573, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.565073448719141, + "language_loss": 0.66372955, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68503714, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 10475, + "time_per_iteration": 2.511357307434082 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.03441048, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 2.080975026928719, + "language_loss": 0.9029789, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92426246, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 10476, + "time_per_iteration": 2.4218876361846924 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.02379751, + "balance_loss_mlp": 1.03476787, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.6697359788083987, + "language_loss": 0.75050914, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.771873, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10477, + "time_per_iteration": 2.455409049987793 + }, + { + "auxiliary_loss_clip": 0.01101367, + "auxiliary_loss_mlp": 0.01026543, + "balance_loss_clip": 1.01468766, + "balance_loss_mlp": 1.0337708, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.9554844868820769, + "language_loss": 0.70427382, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72555292, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10478, + "time_per_iteration": 2.448185443878174 + }, + { + "auxiliary_loss_clip": 0.01107518, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03713453, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.8380864968685287, + "language_loss": 0.67054832, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69193918, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10479, + "time_per_iteration": 2.4200356006622314 + }, + { + "auxiliary_loss_clip": 0.01103494, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.0211798, + "balance_loss_mlp": 1.03659678, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 3.6551699512461067, + "language_loss": 0.73471272, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75608122, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 10480, + "time_per_iteration": 2.4555039405822754 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03487301, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 2.336956908643113, + "language_loss": 0.78874803, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81010389, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 10481, + "time_per_iteration": 2.4346415996551514 + }, + { + "auxiliary_loss_clip": 0.01026096, + "auxiliary_loss_mlp": 0.01005078, + "balance_loss_clip": 1.00391531, + "balance_loss_mlp": 1.0049262, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.9177955201810194, + "language_loss": 0.61818945, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63850117, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21191406, + "step": 10482, + "time_per_iteration": 2.812809705734253 + }, + { + "auxiliary_loss_clip": 0.01108769, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.01766491, + "balance_loss_mlp": 1.03617549, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.913772314034849, + "language_loss": 0.83037972, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85177374, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10483, + "time_per_iteration": 2.401224374771118 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.02013469, + "balance_loss_mlp": 1.03428078, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.5146236246766749, + "language_loss": 0.72939026, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75068009, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 10484, + "time_per_iteration": 2.5125913619995117 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.01511419, + "balance_loss_mlp": 1.03390932, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.8710303376286184, + "language_loss": 0.74698818, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.7682761, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10485, + "time_per_iteration": 2.4874563217163086 + }, + { + "auxiliary_loss_clip": 0.01107856, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.01911783, + "balance_loss_mlp": 1.03676295, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.9819800910053105, + "language_loss": 0.81547624, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83686674, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 10486, + "time_per_iteration": 2.4926888942718506 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.0249896, + "balance_loss_mlp": 1.03641152, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 2.1821850164901675, + "language_loss": 0.63638449, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65777874, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10487, + "time_per_iteration": 2.408770799636841 + }, + { + "auxiliary_loss_clip": 0.01103897, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02434158, + "balance_loss_mlp": 1.03714716, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.4517629521514586, + "language_loss": 0.67256761, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69396502, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 10488, + "time_per_iteration": 2.484377861022949 + }, + { + "auxiliary_loss_clip": 0.01105074, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01726294, + "balance_loss_mlp": 1.03574753, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.9513019958263491, + "language_loss": 0.67263639, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69397372, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.69140625, + "step": 10489, + "time_per_iteration": 2.4636588096618652 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.0174365, + "balance_loss_mlp": 1.03723645, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.966397981441809, + "language_loss": 0.69455999, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71595961, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 10490, + "time_per_iteration": 2.4483461380004883 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03623903, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 2.505536440046342, + "language_loss": 0.78477776, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80616874, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10491, + "time_per_iteration": 2.4928994178771973 + }, + { + "auxiliary_loss_clip": 0.01104065, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02038789, + "balance_loss_mlp": 1.03679323, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.9616523750971206, + "language_loss": 0.55806887, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.57942659, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10492, + "time_per_iteration": 2.5040977001190186 + }, + { + "auxiliary_loss_clip": 0.01105591, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.023283, + "balance_loss_mlp": 1.03620148, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 2.2691030779407693, + "language_loss": 0.63968873, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66110241, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10493, + "time_per_iteration": 2.501648187637329 + }, + { + "auxiliary_loss_clip": 0.01103602, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.03536439, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.6404154470274028, + "language_loss": 0.82711017, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84844351, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10494, + "time_per_iteration": 2.488478183746338 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.01908565, + "balance_loss_mlp": 1.03702521, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.6064300635789628, + "language_loss": 0.792678, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81405473, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 10495, + "time_per_iteration": 2.607936143875122 + }, + { + "auxiliary_loss_clip": 0.01104478, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.01999319, + "balance_loss_mlp": 1.0356319, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.9868473037750025, + "language_loss": 0.69977289, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72114241, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10496, + "time_per_iteration": 2.4172658920288086 + }, + { + "auxiliary_loss_clip": 0.01106703, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.03729558, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.5454831155818307, + "language_loss": 0.80091369, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82230574, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10497, + "time_per_iteration": 2.417558193206787 + }, + { + "auxiliary_loss_clip": 0.01101019, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01958823, + "balance_loss_mlp": 1.03501368, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.8690299301257927, + "language_loss": 0.74428982, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76560622, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 10498, + "time_per_iteration": 2.452404737472534 + }, + { + "auxiliary_loss_clip": 0.01104382, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.01755679, + "balance_loss_mlp": 1.03578484, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 1.8689991492998164, + "language_loss": 0.69558024, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71691775, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10499, + "time_per_iteration": 2.4273722171783447 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01036719, + "balance_loss_clip": 1.02513218, + "balance_loss_mlp": 1.03703976, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 2.6400614385639294, + "language_loss": 0.70014846, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72153533, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 10500, + "time_per_iteration": 2.4538466930389404 + }, + { + "auxiliary_loss_clip": 0.01106013, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02079344, + "balance_loss_mlp": 1.03783047, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 3.0415450485464937, + "language_loss": 0.74062467, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76200593, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10501, + "time_per_iteration": 2.4303436279296875 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01901984, + "balance_loss_mlp": 1.03665447, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.6794546939174708, + "language_loss": 0.75353241, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77488828, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10502, + "time_per_iteration": 2.4563441276550293 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.03212154, + "balance_loss_mlp": 1.03856277, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 2.067886001099209, + "language_loss": 0.85457253, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87611616, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 10503, + "time_per_iteration": 2.454007148742676 + }, + { + "auxiliary_loss_clip": 0.01104787, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.01844788, + "balance_loss_mlp": 1.0356003, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 1.7756005126280807, + "language_loss": 0.86549926, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88685179, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 10504, + "time_per_iteration": 2.452439546585083 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.02137482, + "balance_loss_mlp": 1.03827763, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.5631411561519288, + "language_loss": 0.76411223, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78556228, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 10505, + "time_per_iteration": 2.4167821407318115 + }, + { + "auxiliary_loss_clip": 0.01107106, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.01840425, + "balance_loss_mlp": 1.03718579, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.80507675782724, + "language_loss": 0.81566548, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.83704925, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10506, + "time_per_iteration": 2.475015163421631 + }, + { + "auxiliary_loss_clip": 0.01108071, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.0174973, + "balance_loss_mlp": 1.03848529, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.7792905066974667, + "language_loss": 0.74235427, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76372921, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10507, + "time_per_iteration": 2.4495646953582764 + }, + { + "auxiliary_loss_clip": 0.01109877, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02270663, + "balance_loss_mlp": 1.03861022, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.7094804545962832, + "language_loss": 0.6781255, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69957411, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10508, + "time_per_iteration": 2.4817588329315186 + }, + { + "auxiliary_loss_clip": 0.01105487, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.01696706, + "balance_loss_mlp": 1.03691339, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.6822434485138316, + "language_loss": 0.70602268, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72736001, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 10509, + "time_per_iteration": 2.511807680130005 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.01614881, + "balance_loss_mlp": 1.03634882, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.6779333049559604, + "language_loss": 0.79419941, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81551743, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10510, + "time_per_iteration": 3.890570640563965 + }, + { + "auxiliary_loss_clip": 0.01107002, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.01917839, + "balance_loss_mlp": 1.03689122, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.4507580648571856, + "language_loss": 0.70762742, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72901082, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 10511, + "time_per_iteration": 2.502631902694702 + }, + { + "auxiliary_loss_clip": 0.01102983, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.02093077, + "balance_loss_mlp": 1.03624094, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.7129276808255165, + "language_loss": 0.80193913, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82328945, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 10512, + "time_per_iteration": 2.4500255584716797 + }, + { + "auxiliary_loss_clip": 0.01108015, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01859283, + "balance_loss_mlp": 1.03887498, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.9936938479118853, + "language_loss": 0.70610952, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72750223, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 10513, + "time_per_iteration": 5.2415876388549805 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.01782441, + "balance_loss_mlp": 1.03923917, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.9330841856618928, + "language_loss": 0.66179729, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68320632, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 10514, + "time_per_iteration": 3.9086010456085205 + }, + { + "auxiliary_loss_clip": 0.01102729, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.01901603, + "balance_loss_mlp": 1.0355525, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.657544375063904, + "language_loss": 0.74582148, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76715326, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10515, + "time_per_iteration": 2.4754388332366943 + }, + { + "auxiliary_loss_clip": 0.01103002, + "auxiliary_loss_mlp": 0.01026215, + "balance_loss_clip": 1.01485467, + "balance_loss_mlp": 1.03710318, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.8087331085143223, + "language_loss": 0.89853811, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91983026, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 10516, + "time_per_iteration": 2.431255578994751 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.01684928, + "balance_loss_mlp": 1.04077578, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.8110008690321133, + "language_loss": 0.81904936, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84049344, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 10517, + "time_per_iteration": 2.418457508087158 + }, + { + "auxiliary_loss_clip": 0.01105413, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02295017, + "balance_loss_mlp": 1.03746212, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.9810559885321721, + "language_loss": 0.77525067, + "learning_rate": 1.257765386189541e-06, + "loss": 0.7966513, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10518, + "time_per_iteration": 2.480358839035034 + }, + { + "auxiliary_loss_clip": 0.01102761, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.0210377, + "balance_loss_mlp": 1.03653479, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.4836154875686243, + "language_loss": 0.85232532, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87367767, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10519, + "time_per_iteration": 2.539891242980957 + }, + { + "auxiliary_loss_clip": 0.01101558, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114785, + "balance_loss_mlp": 1.03623748, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.6381683069265482, + "language_loss": 0.71834314, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.73968256, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10520, + "time_per_iteration": 2.4911139011383057 + }, + { + "auxiliary_loss_clip": 0.0110337, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01956213, + "balance_loss_mlp": 1.03599596, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.7138425612253112, + "language_loss": 0.7110256, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73236692, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10521, + "time_per_iteration": 2.42466402053833 + }, + { + "auxiliary_loss_clip": 0.01107506, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.01895845, + "balance_loss_mlp": 1.03792214, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.6701833516110784, + "language_loss": 0.71829087, + "learning_rate": 1.256319016853377e-06, + "loss": 0.7396822, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10522, + "time_per_iteration": 2.456470012664795 + }, + { + "auxiliary_loss_clip": 0.01105444, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01849365, + "balance_loss_mlp": 1.03691065, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.752428604035476, + "language_loss": 0.81730425, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83866215, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 10523, + "time_per_iteration": 2.4390153884887695 + }, + { + "auxiliary_loss_clip": 0.01104755, + "auxiliary_loss_mlp": 0.01025919, + "balance_loss_clip": 1.01428986, + "balance_loss_mlp": 1.03734088, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.695654876532073, + "language_loss": 0.73930323, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76060998, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10524, + "time_per_iteration": 2.4376304149627686 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.02231896, + "balance_loss_mlp": 1.03718793, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 4.405789883496385, + "language_loss": 0.84463608, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86610419, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 10525, + "time_per_iteration": 2.4973292350769043 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.01023105, + "balance_loss_clip": 1.01178622, + "balance_loss_mlp": 1.03544807, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 3.1585625796827212, + "language_loss": 0.66817802, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68943405, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10526, + "time_per_iteration": 2.431757688522339 + }, + { + "auxiliary_loss_clip": 0.01111651, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.02308118, + "balance_loss_mlp": 1.03971434, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 2.135799005467542, + "language_loss": 0.7367599, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75824016, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 10527, + "time_per_iteration": 2.473468065261841 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01803195, + "balance_loss_mlp": 1.03822732, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 1.98632215188849, + "language_loss": 0.71867841, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74001735, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 10528, + "time_per_iteration": 2.428516387939453 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.01593244, + "balance_loss_mlp": 1.03575611, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.649115399957431, + "language_loss": 0.66042399, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68174052, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 10529, + "time_per_iteration": 2.4110963344573975 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01853514, + "balance_loss_mlp": 1.03828883, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.3567719196586134, + "language_loss": 0.75553149, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.7769407, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10530, + "time_per_iteration": 2.47843074798584 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.01745164, + "balance_loss_mlp": 1.04016328, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 2.740073625004777, + "language_loss": 0.73872888, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.76011956, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10531, + "time_per_iteration": 2.4678969383239746 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01666689, + "balance_loss_mlp": 1.03636086, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.9880072875831147, + "language_loss": 0.79408121, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81539547, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 10532, + "time_per_iteration": 2.481036901473999 + }, + { + "auxiliary_loss_clip": 0.01102051, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02170849, + "balance_loss_mlp": 1.03580236, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 2.7000401748576817, + "language_loss": 0.74374038, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76508451, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10533, + "time_per_iteration": 2.4607644081115723 + }, + { + "auxiliary_loss_clip": 0.01110909, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.02163792, + "balance_loss_mlp": 1.03844595, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.750255656428334, + "language_loss": 0.76894259, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79039878, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 10534, + "time_per_iteration": 2.4279823303222656 + }, + { + "auxiliary_loss_clip": 0.01106846, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.02312016, + "balance_loss_mlp": 1.03899598, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.5411023230298349, + "language_loss": 0.85583681, + "learning_rate": 1.251621437204777e-06, + "loss": 0.8772521, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 10535, + "time_per_iteration": 2.4824087619781494 + }, + { + "auxiliary_loss_clip": 0.01106839, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01961398, + "balance_loss_mlp": 1.03782022, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 2.0534992057606285, + "language_loss": 0.76360321, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78498983, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10536, + "time_per_iteration": 2.530451774597168 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.01741064, + "balance_loss_mlp": 1.03990674, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.9627877064999752, + "language_loss": 0.60015184, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62151325, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10537, + "time_per_iteration": 2.5151615142822266 + }, + { + "auxiliary_loss_clip": 0.01026622, + "auxiliary_loss_mlp": 0.01001054, + "balance_loss_clip": 0.999695, + "balance_loss_mlp": 1.00554442, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7708037183825521, + "language_loss": 0.52472723, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54500401, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.2109375, + "step": 10538, + "time_per_iteration": 3.165985584259033 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.0205456, + "balance_loss_mlp": 1.0376327, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.8519204835949576, + "language_loss": 0.83039713, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85181737, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10539, + "time_per_iteration": 2.4390335083007812 + }, + { + "auxiliary_loss_clip": 0.01107427, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.01580071, + "balance_loss_mlp": 1.03738523, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.637138612539208, + "language_loss": 0.86837506, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88973361, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10540, + "time_per_iteration": 2.4831221103668213 + }, + { + "auxiliary_loss_clip": 0.01100728, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.01740217, + "balance_loss_mlp": 1.03550363, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.5901447763785947, + "language_loss": 0.7268725, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74815792, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65234375, + "step": 10541, + "time_per_iteration": 2.479461908340454 + }, + { + "auxiliary_loss_clip": 0.01109283, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01819396, + "balance_loss_mlp": 1.03717303, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.6143323692331166, + "language_loss": 0.84712064, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86852765, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 10542, + "time_per_iteration": 2.54823899269104 + }, + { + "auxiliary_loss_clip": 0.01105497, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.01760697, + "balance_loss_mlp": 1.03709495, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.6553786281241991, + "language_loss": 0.77977955, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80114251, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.68359375, + "step": 10543, + "time_per_iteration": 2.3880414962768555 + }, + { + "auxiliary_loss_clip": 0.0110064, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.02324414, + "balance_loss_mlp": 1.03599632, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.6753324610621851, + "language_loss": 0.73382592, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75517762, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 10544, + "time_per_iteration": 2.4576821327209473 + }, + { + "auxiliary_loss_clip": 0.01108095, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.02458596, + "balance_loss_mlp": 1.0366528, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 2.0297826320587844, + "language_loss": 0.68563735, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70708686, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 10545, + "time_per_iteration": 2.4281883239746094 + }, + { + "auxiliary_loss_clip": 0.01102093, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01794934, + "balance_loss_mlp": 1.03507733, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.000384025401953, + "language_loss": 0.71141988, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73274392, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 10546, + "time_per_iteration": 2.4097115993499756 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.02085662, + "balance_loss_mlp": 1.03665507, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.3382755401261122, + "language_loss": 0.77992189, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80125231, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 10547, + "time_per_iteration": 2.4647274017333984 + }, + { + "auxiliary_loss_clip": 0.01107664, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.02003956, + "balance_loss_mlp": 1.03658104, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7405007308500737, + "language_loss": 0.63246721, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.6538651, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 10548, + "time_per_iteration": 2.4153497219085693 + }, + { + "auxiliary_loss_clip": 0.01103941, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03657913, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.5666269720045418, + "language_loss": 0.61767489, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.63898623, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 10549, + "time_per_iteration": 2.4682185649871826 + }, + { + "auxiliary_loss_clip": 0.01102967, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01777053, + "balance_loss_mlp": 1.03553009, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.7174833177104423, + "language_loss": 0.73910511, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.76041675, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.671875, + "step": 10550, + "time_per_iteration": 2.484928607940674 + }, + { + "auxiliary_loss_clip": 0.01026139, + "auxiliary_loss_mlp": 0.00996982, + "balance_loss_clip": 0.99562275, + "balance_loss_mlp": 1.00515223, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6918927993882659, + "language_loss": 0.57716167, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59739286, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.20996094, + "step": 10551, + "time_per_iteration": 3.0650179386138916 + }, + { + "auxiliary_loss_clip": 0.01103158, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.01508236, + "balance_loss_mlp": 1.03589559, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.7335763595284734, + "language_loss": 0.67098165, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69227403, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10552, + "time_per_iteration": 3.8182289600372314 + }, + { + "auxiliary_loss_clip": 0.01105164, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.01642919, + "balance_loss_mlp": 1.03475296, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.910004275661171, + "language_loss": 0.8218025, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84313941, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10553, + "time_per_iteration": 2.449106216430664 + }, + { + "auxiliary_loss_clip": 0.0110533, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02142692, + "balance_loss_mlp": 1.03581154, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 1.776645744912539, + "language_loss": 0.5519408, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.5733304, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 10554, + "time_per_iteration": 4.085347652435303 + }, + { + "auxiliary_loss_clip": 0.01105981, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.01777518, + "balance_loss_mlp": 1.03750849, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1.7092991458226663, + "language_loss": 0.70511019, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72646892, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10555, + "time_per_iteration": 3.8290207386016846 + }, + { + "auxiliary_loss_clip": 0.01026207, + "auxiliary_loss_mlp": 0.01000287, + "balance_loss_clip": 0.99898165, + "balance_loss_mlp": 1.00531995, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.8961338606309752, + "language_loss": 0.55477089, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57503581, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.20898438, + "step": 10556, + "time_per_iteration": 4.450624227523804 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.03497851, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.9064112571580962, + "language_loss": 0.68177021, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70314467, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 10557, + "time_per_iteration": 2.486895799636841 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.01968968, + "balance_loss_mlp": 1.03706682, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.9232562930576766, + "language_loss": 0.70448172, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72583079, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10558, + "time_per_iteration": 2.40922212600708 + }, + { + "auxiliary_loss_clip": 0.0110235, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.01924789, + "balance_loss_mlp": 1.03492951, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.4566517765841722, + "language_loss": 0.78202355, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80335712, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10559, + "time_per_iteration": 2.44706130027771 + }, + { + "auxiliary_loss_clip": 0.01108267, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.02512479, + "balance_loss_mlp": 1.03806639, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 2.1761339392195467, + "language_loss": 0.68320858, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70466453, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 10560, + "time_per_iteration": 2.4409596920013428 + }, + { + "auxiliary_loss_clip": 0.01104015, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.02302957, + "balance_loss_mlp": 1.03592563, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.67836467156634, + "language_loss": 0.7699995, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.7913872, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10561, + "time_per_iteration": 2.5039145946502686 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.01994574, + "balance_loss_mlp": 1.0352478, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 4.009586317175133, + "language_loss": 0.72008455, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74144948, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 10562, + "time_per_iteration": 2.472137212753296 + }, + { + "auxiliary_loss_clip": 0.01107214, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.01749265, + "balance_loss_mlp": 1.03718257, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.5105421382487267, + "language_loss": 0.80683196, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.82820606, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10563, + "time_per_iteration": 2.4413557052612305 + }, + { + "auxiliary_loss_clip": 0.01108821, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.02480066, + "balance_loss_mlp": 1.03883505, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.110536240738381, + "language_loss": 0.80818796, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.82964349, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 10564, + "time_per_iteration": 2.4266111850738525 + }, + { + "auxiliary_loss_clip": 0.01107128, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.02003133, + "balance_loss_mlp": 1.03857136, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.6172553063068438, + "language_loss": 0.72285914, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74424613, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10565, + "time_per_iteration": 2.534834623336792 + }, + { + "auxiliary_loss_clip": 0.01106685, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01762271, + "balance_loss_mlp": 1.03696799, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.0506297866150467, + "language_loss": 0.69144678, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71281761, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 10566, + "time_per_iteration": 2.411491632461548 + }, + { + "auxiliary_loss_clip": 0.01102305, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03648448, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.5548948412040506, + "language_loss": 0.69706547, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71838397, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 10567, + "time_per_iteration": 2.48917293548584 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.02129579, + "balance_loss_mlp": 1.03807187, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 2.2646303551803753, + "language_loss": 0.84620178, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86756414, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10568, + "time_per_iteration": 2.4403724670410156 + }, + { + "auxiliary_loss_clip": 0.0110714, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02603149, + "balance_loss_mlp": 1.03773642, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.746273347856982, + "language_loss": 0.83601934, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.8574751, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10569, + "time_per_iteration": 2.5299484729766846 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.01576304, + "balance_loss_mlp": 1.03676128, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.7093099643488843, + "language_loss": 0.69269961, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71400905, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 10570, + "time_per_iteration": 2.4609997272491455 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03430879, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.6697776111718718, + "language_loss": 0.65798032, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67937338, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10571, + "time_per_iteration": 2.5261099338531494 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.01701999, + "balance_loss_mlp": 1.03919911, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.7838042943408632, + "language_loss": 0.71219468, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73359203, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 10572, + "time_per_iteration": 2.4292333126068115 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.02092671, + "balance_loss_mlp": 1.03459537, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 2.8044296408111657, + "language_loss": 0.81269503, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83402801, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 10573, + "time_per_iteration": 2.5012412071228027 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.01897848, + "balance_loss_mlp": 1.0366838, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.5014218063812952, + "language_loss": 0.68932259, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71069181, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69921875, + "step": 10574, + "time_per_iteration": 2.668290853500366 + }, + { + "auxiliary_loss_clip": 0.01104073, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.01580226, + "balance_loss_mlp": 1.03717065, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.2372840416556476, + "language_loss": 0.86855853, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88987547, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10575, + "time_per_iteration": 2.4198617935180664 + }, + { + "auxiliary_loss_clip": 0.01104492, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.02124405, + "balance_loss_mlp": 1.03752255, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.7217722573852687, + "language_loss": 0.72000861, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74137974, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66796875, + "step": 10576, + "time_per_iteration": 2.4883639812469482 + }, + { + "auxiliary_loss_clip": 0.01106159, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.01569581, + "balance_loss_mlp": 1.03626978, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.7446831979165325, + "language_loss": 0.69537437, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71671677, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10577, + "time_per_iteration": 2.4888103008270264 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.01802576, + "balance_loss_mlp": 1.03562689, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.9869814787183224, + "language_loss": 0.72090602, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74223644, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10578, + "time_per_iteration": 2.5746970176696777 + }, + { + "auxiliary_loss_clip": 0.01025564, + "auxiliary_loss_mlp": 0.0100215, + "balance_loss_clip": 1.00076127, + "balance_loss_mlp": 1.00480723, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7033646347458022, + "language_loss": 0.54444003, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56471717, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.20800781, + "step": 10579, + "time_per_iteration": 3.1232736110687256 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.01612449, + "balance_loss_mlp": 1.0368464, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.7171447811267215, + "language_loss": 0.77475232, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79608917, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10580, + "time_per_iteration": 2.461869239807129 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.0138669, + "balance_loss_mlp": 1.03540814, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.098056730123376, + "language_loss": 0.67005563, + "learning_rate": 1.235037946268301e-06, + "loss": 0.69134021, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10581, + "time_per_iteration": 2.4425008296966553 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01949584, + "balance_loss_mlp": 1.03480268, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.3074505079001684, + "language_loss": 0.68299043, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70432162, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 10582, + "time_per_iteration": 2.4763622283935547 + }, + { + "auxiliary_loss_clip": 0.01106848, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.02355933, + "balance_loss_mlp": 1.03695726, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 2.267802402035549, + "language_loss": 0.84247005, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.8638941, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 10583, + "time_per_iteration": 2.4797277450561523 + }, + { + "auxiliary_loss_clip": 0.0110538, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03860188, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.5650473008286672, + "language_loss": 0.7515592, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77290452, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 10584, + "time_per_iteration": 2.430316209793091 + }, + { + "auxiliary_loss_clip": 0.01106996, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.02420747, + "balance_loss_mlp": 1.03688443, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.9066305180241776, + "language_loss": 0.72856915, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.75001895, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 10585, + "time_per_iteration": 2.4419803619384766 + }, + { + "auxiliary_loss_clip": 0.01105577, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.01756358, + "balance_loss_mlp": 1.03718138, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.8332276657421747, + "language_loss": 0.82785809, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.8492018, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 10586, + "time_per_iteration": 2.421600341796875 + }, + { + "auxiliary_loss_clip": 0.01103874, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.01523161, + "balance_loss_mlp": 1.03603029, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 4.704421092048837, + "language_loss": 0.72570878, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74701393, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10587, + "time_per_iteration": 2.472022533416748 + }, + { + "auxiliary_loss_clip": 0.0110564, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.01379192, + "balance_loss_mlp": 1.03764784, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 1.7915085469286844, + "language_loss": 0.76668859, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.7879954, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 10588, + "time_per_iteration": 2.4190168380737305 + }, + { + "auxiliary_loss_clip": 0.01102746, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.01552689, + "balance_loss_mlp": 1.03755879, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.3970993827847034, + "language_loss": 0.79966116, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82096231, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 10589, + "time_per_iteration": 2.4743268489837646 + }, + { + "auxiliary_loss_clip": 0.01104028, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.01623118, + "balance_loss_mlp": 1.03771806, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.311775126826065, + "language_loss": 0.67541653, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69673812, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 10590, + "time_per_iteration": 2.450011730194092 + }, + { + "auxiliary_loss_clip": 0.0111127, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.01871789, + "balance_loss_mlp": 1.03779423, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6364871188688683, + "language_loss": 0.79574269, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.8171702, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 10591, + "time_per_iteration": 2.4351706504821777 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.0155071, + "balance_loss_mlp": 1.03807092, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.6582489812189014, + "language_loss": 0.8898353, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91114426, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 10592, + "time_per_iteration": 2.4826667308807373 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.01484966, + "balance_loss_mlp": 1.03570986, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.2873763192716858, + "language_loss": 0.68307251, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70434421, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 10593, + "time_per_iteration": 3.870232105255127 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.01887894, + "balance_loss_mlp": 1.03454375, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.9223941478023494, + "language_loss": 0.63311636, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.6544379, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10594, + "time_per_iteration": 2.5456788539886475 + }, + { + "auxiliary_loss_clip": 0.01026012, + "auxiliary_loss_mlp": 0.00997701, + "balance_loss_clip": 0.99638408, + "balance_loss_mlp": 1.0052495, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7601242064241133, + "language_loss": 0.54636633, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56660342, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20703125, + "step": 10595, + "time_per_iteration": 3.1794607639312744 + }, + { + "auxiliary_loss_clip": 0.01107322, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.03732097, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.701168717319966, + "language_loss": 0.6690321, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69044465, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 10596, + "time_per_iteration": 5.259617328643799 + }, + { + "auxiliary_loss_clip": 0.01105102, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.03692877, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.3148419368361686, + "language_loss": 0.78864521, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80998278, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10597, + "time_per_iteration": 3.8967549800872803 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.01746917, + "balance_loss_mlp": 1.03798401, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.7226875807463897, + "language_loss": 0.7490381, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77037644, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 10598, + "time_per_iteration": 2.426950693130493 + }, + { + "auxiliary_loss_clip": 0.01105339, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.02002072, + "balance_loss_mlp": 1.03712225, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.827624008719727, + "language_loss": 0.68324673, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70461518, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 10599, + "time_per_iteration": 2.3905580043792725 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.0198344, + "balance_loss_mlp": 1.03601742, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.01568733519361, + "language_loss": 0.80380464, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82518673, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10600, + "time_per_iteration": 2.390493631362915 + }, + { + "auxiliary_loss_clip": 0.01102518, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.01701963, + "balance_loss_mlp": 1.03515601, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.447681041520347, + "language_loss": 0.79922855, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82053661, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 10601, + "time_per_iteration": 2.4929754734039307 + }, + { + "auxiliary_loss_clip": 0.01105771, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.01460528, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 1.837610857942547, + "language_loss": 0.66878605, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69010651, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 10602, + "time_per_iteration": 2.467132091522217 + }, + { + "auxiliary_loss_clip": 0.01101843, + "auxiliary_loss_mlp": 0.01022562, + "balance_loss_clip": 1.01127887, + "balance_loss_mlp": 1.03530014, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.9668253771039714, + "language_loss": 0.79456556, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81580961, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 10603, + "time_per_iteration": 2.439401149749756 + }, + { + "auxiliary_loss_clip": 0.01103337, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01447129, + "balance_loss_mlp": 1.03550994, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.0023670291582034, + "language_loss": 0.76751029, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78881085, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 10604, + "time_per_iteration": 2.4105138778686523 + }, + { + "auxiliary_loss_clip": 0.01108604, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.02013552, + "balance_loss_mlp": 1.03714681, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.780209303316883, + "language_loss": 0.77448142, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79589069, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 10605, + "time_per_iteration": 2.4292843341827393 + }, + { + "auxiliary_loss_clip": 0.01105536, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01408625, + "balance_loss_mlp": 1.03642416, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.9363320912621251, + "language_loss": 0.65341508, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67473698, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 10606, + "time_per_iteration": 2.427497625350952 + }, + { + "auxiliary_loss_clip": 0.01101905, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.02189994, + "balance_loss_mlp": 1.03686523, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.731945960339434, + "language_loss": 0.75044298, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77179325, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10607, + "time_per_iteration": 2.446707248687744 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.03733909, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.7077896003554156, + "language_loss": 0.65732801, + "learning_rate": 1.225332659627278e-06, + "loss": 0.67875481, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10608, + "time_per_iteration": 2.7172274589538574 + }, + { + "auxiliary_loss_clip": 0.01026098, + "auxiliary_loss_mlp": 0.01010909, + "balance_loss_clip": 1.00953197, + "balance_loss_mlp": 1.00546312, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7342720172803939, + "language_loss": 0.51933324, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53970337, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.20703125, + "step": 10609, + "time_per_iteration": 3.038902759552002 + }, + { + "auxiliary_loss_clip": 0.01099294, + "auxiliary_loss_mlp": 0.01023726, + "balance_loss_clip": 1.01322937, + "balance_loss_mlp": 1.03415811, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.5171992119631734, + "language_loss": 0.74632645, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76755667, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65234375, + "step": 10610, + "time_per_iteration": 2.472832202911377 + }, + { + "auxiliary_loss_clip": 0.0102568, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00496507, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8614298288544585, + "language_loss": 0.63198531, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65227467, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.20703125, + "step": 10611, + "time_per_iteration": 3.118346691131592 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.02351391, + "balance_loss_mlp": 1.03604293, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.12180371585039, + "language_loss": 0.72335958, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74475813, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 10612, + "time_per_iteration": 2.5017549991607666 + }, + { + "auxiliary_loss_clip": 0.01025775, + "auxiliary_loss_mlp": 0.01000915, + "balance_loss_clip": 0.99957991, + "balance_loss_mlp": 1.00507379, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7184948556551517, + "language_loss": 0.57873541, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.5990023, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.20703125, + "step": 10613, + "time_per_iteration": 2.9799587726593018 + }, + { + "auxiliary_loss_clip": 0.01107464, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.01890481, + "balance_loss_mlp": 1.03777075, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7105248760789145, + "language_loss": 0.75128651, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77267975, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10614, + "time_per_iteration": 2.491565465927124 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.02311897, + "balance_loss_mlp": 1.03886855, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.84751826433944, + "language_loss": 0.79666638, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81808209, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10615, + "time_per_iteration": 2.492230176925659 + }, + { + "auxiliary_loss_clip": 0.0102549, + "auxiliary_loss_mlp": 0.01004342, + "balance_loss_clip": 1.00296533, + "balance_loss_mlp": 1.00477338, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6538614969335592, + "language_loss": 0.55591351, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57621187, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.20703125, + "step": 10616, + "time_per_iteration": 3.1426796913146973 + }, + { + "auxiliary_loss_clip": 0.01103937, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.01841819, + "balance_loss_mlp": 1.03616679, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.9069966042725246, + "language_loss": 0.83733106, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.85867131, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10617, + "time_per_iteration": 2.4153995513916016 + }, + { + "auxiliary_loss_clip": 0.01104997, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.0218631, + "balance_loss_mlp": 1.03582323, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.8815450583884574, + "language_loss": 0.87111914, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89251137, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 10618, + "time_per_iteration": 2.4547295570373535 + }, + { + "auxiliary_loss_clip": 0.01108351, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.02284503, + "balance_loss_mlp": 1.03887093, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.8631596079726758, + "language_loss": 0.73287827, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75429678, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6953125, + "step": 10619, + "time_per_iteration": 2.4028847217559814 + }, + { + "auxiliary_loss_clip": 0.01110376, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02090454, + "balance_loss_mlp": 1.03807545, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 1.9227827130097541, + "language_loss": 0.76158774, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78302789, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 10620, + "time_per_iteration": 2.4420766830444336 + }, + { + "auxiliary_loss_clip": 0.01104115, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.01349616, + "balance_loss_mlp": 1.03697598, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 2.4243704084161806, + "language_loss": 0.70476806, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.7260617, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 10621, + "time_per_iteration": 2.474518299102783 + }, + { + "auxiliary_loss_clip": 0.01098545, + "auxiliary_loss_mlp": 0.0102422, + "balance_loss_clip": 1.01336575, + "balance_loss_mlp": 1.03505826, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.540938795838808, + "language_loss": 0.77551067, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79673827, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 10622, + "time_per_iteration": 2.4603724479675293 + }, + { + "auxiliary_loss_clip": 0.01102358, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.0161562, + "balance_loss_mlp": 1.0359875, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.7317763854255814, + "language_loss": 0.7494216, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77071846, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10623, + "time_per_iteration": 2.4120795726776123 + }, + { + "auxiliary_loss_clip": 0.01098287, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.01879287, + "balance_loss_mlp": 1.03354859, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.6666297183957082, + "language_loss": 0.76487082, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78614771, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 10624, + "time_per_iteration": 2.4929676055908203 + }, + { + "auxiliary_loss_clip": 0.0110372, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01597917, + "balance_loss_mlp": 1.03606153, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.639287347980187, + "language_loss": 0.80685896, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82816517, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 10625, + "time_per_iteration": 2.4569015502929688 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.03810406, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.744297621070212, + "language_loss": 0.72630143, + "learning_rate": 1.218874349031654e-06, + "loss": 0.74767131, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10626, + "time_per_iteration": 2.441058397293091 + }, + { + "auxiliary_loss_clip": 0.01104529, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.01703739, + "balance_loss_mlp": 1.03571403, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.7246902612727075, + "language_loss": 0.72518885, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74652737, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10627, + "time_per_iteration": 2.40901780128479 + }, + { + "auxiliary_loss_clip": 0.01108886, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.0163188, + "balance_loss_mlp": 1.03729248, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 2.244776770999307, + "language_loss": 0.67281765, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69420648, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 10628, + "time_per_iteration": 2.5263736248016357 + }, + { + "auxiliary_loss_clip": 0.01100861, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01536465, + "balance_loss_mlp": 1.03674936, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.8036287880835562, + "language_loss": 0.67833781, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.69961035, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 10629, + "time_per_iteration": 2.477262258529663 + }, + { + "auxiliary_loss_clip": 0.01110269, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02683949, + "balance_loss_mlp": 1.03733897, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.4737896174832923, + "language_loss": 0.75127286, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77277935, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 10630, + "time_per_iteration": 2.4760096073150635 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.01984, + "balance_loss_mlp": 1.03617334, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.5423208876827523, + "language_loss": 0.70398533, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.7253077, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 10631, + "time_per_iteration": 2.452275514602661 + }, + { + "auxiliary_loss_clip": 0.01023775, + "auxiliary_loss_mlp": 0.00996899, + "balance_loss_clip": 0.99556983, + "balance_loss_mlp": 1.00307584, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7719101864922713, + "language_loss": 0.63005149, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.6502583, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.20703125, + "step": 10632, + "time_per_iteration": 3.1005401611328125 + }, + { + "auxiliary_loss_clip": 0.01101477, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.01642942, + "balance_loss_mlp": 1.03553295, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 2.062081508069593, + "language_loss": 0.66411757, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68541509, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 10633, + "time_per_iteration": 2.4561798572540283 + }, + { + "auxiliary_loss_clip": 0.0110405, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.03670645, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 2.980251338642478, + "language_loss": 0.81779587, + "learning_rate": 1.216007064569225e-06, + "loss": 0.8391099, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 10634, + "time_per_iteration": 2.4740054607391357 + }, + { + "auxiliary_loss_clip": 0.01104597, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.01801491, + "balance_loss_mlp": 1.03732753, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.7668249879195463, + "language_loss": 0.75268984, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77403939, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 10635, + "time_per_iteration": 3.8579487800598145 + }, + { + "auxiliary_loss_clip": 0.01103838, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.03555012, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.8856871240472837, + "language_loss": 0.71665233, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73799634, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 10636, + "time_per_iteration": 2.4976108074188232 + }, + { + "auxiliary_loss_clip": 0.01106058, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01808596, + "balance_loss_mlp": 1.03683591, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 4.067899624402538, + "language_loss": 0.7341159, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75547898, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 10637, + "time_per_iteration": 2.4985272884368896 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.0159924, + "balance_loss_mlp": 1.03592014, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.8415469934331217, + "language_loss": 0.77680337, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79814142, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 10638, + "time_per_iteration": 5.310981035232544 + }, + { + "auxiliary_loss_clip": 0.01102761, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.016675, + "balance_loss_mlp": 1.0358299, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 3.6995147498561636, + "language_loss": 0.81817627, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83948827, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 10639, + "time_per_iteration": 3.956713914871216 + }, + { + "auxiliary_loss_clip": 0.01024264, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.00012457, + "balance_loss_mlp": 1.00365281, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8122323276395823, + "language_loss": 0.59012806, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61038566, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.20605469, + "step": 10640, + "time_per_iteration": 3.01208758354187 + }, + { + "auxiliary_loss_clip": 0.01100429, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.03550696, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.7939599084799007, + "language_loss": 0.78193939, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80319822, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 10641, + "time_per_iteration": 2.399609327316284 + }, + { + "auxiliary_loss_clip": 0.01108702, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.03676474, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 22.013815914762134, + "language_loss": 0.63092768, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65236264, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 10642, + "time_per_iteration": 2.4959514141082764 + }, + { + "auxiliary_loss_clip": 0.01024704, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.00239646, + "balance_loss_mlp": 1.00392115, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.944530378795617, + "language_loss": 0.55960983, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57989401, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20800781, + "step": 10643, + "time_per_iteration": 2.9914019107818604 + }, + { + "auxiliary_loss_clip": 0.01108117, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01332903, + "balance_loss_mlp": 1.03745127, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 2.5171801924474764, + "language_loss": 0.77069736, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79202974, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 10644, + "time_per_iteration": 2.437391996383667 + }, + { + "auxiliary_loss_clip": 0.01104463, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.01893854, + "balance_loss_mlp": 1.03780031, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.4086380930188218, + "language_loss": 0.82438183, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84574032, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 10645, + "time_per_iteration": 2.4806745052337646 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.0219593, + "balance_loss_mlp": 1.03747869, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.670748165032705, + "language_loss": 0.73261863, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75405383, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10646, + "time_per_iteration": 2.515089988708496 + }, + { + "auxiliary_loss_clip": 0.01105459, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.01905167, + "balance_loss_mlp": 1.03657353, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.6918825179848747, + "language_loss": 0.79892278, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82028854, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 10647, + "time_per_iteration": 2.423576593399048 + }, + { + "auxiliary_loss_clip": 0.01102623, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.03732038, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 2.4485135437848724, + "language_loss": 0.75737441, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77869105, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 10648, + "time_per_iteration": 2.528726100921631 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.01779962, + "balance_loss_mlp": 1.03479123, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 1.7767786509202286, + "language_loss": 0.78653902, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80786711, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10649, + "time_per_iteration": 2.4528145790100098 + }, + { + "auxiliary_loss_clip": 0.01103744, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.02100444, + "balance_loss_mlp": 1.03651512, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.6464773742271148, + "language_loss": 0.75819784, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77956951, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 10650, + "time_per_iteration": 2.4333670139312744 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01803422, + "balance_loss_mlp": 1.0351758, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.4678123102603653, + "language_loss": 0.70750296, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72883749, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 10651, + "time_per_iteration": 2.4399240016937256 + }, + { + "auxiliary_loss_clip": 0.01104316, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02509403, + "balance_loss_mlp": 1.03600538, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.264038346674132, + "language_loss": 0.63932753, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.66074908, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 10652, + "time_per_iteration": 2.4656026363372803 + }, + { + "auxiliary_loss_clip": 0.01104729, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01769614, + "balance_loss_mlp": 1.03726971, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.2063618593971586, + "language_loss": 0.79597425, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81731927, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10653, + "time_per_iteration": 2.4099206924438477 + }, + { + "auxiliary_loss_clip": 0.01113277, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.03203726, + "balance_loss_mlp": 1.03744364, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.172692455677744, + "language_loss": 0.69950652, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72109628, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 10654, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01108717, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.02347147, + "balance_loss_mlp": 1.03717566, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.7648347923503578, + "language_loss": 0.72763705, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74908626, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 10655, + "time_per_iteration": 2.4311604499816895 + }, + { + "auxiliary_loss_clip": 0.01106611, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.0368948, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.5980795641640981, + "language_loss": 0.83070755, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.85210717, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 10656, + "time_per_iteration": 2.5178308486938477 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.02264667, + "balance_loss_mlp": 1.03502929, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.258129795094631, + "language_loss": 0.72108161, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74245739, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 10657, + "time_per_iteration": 2.422863483428955 + }, + { + "auxiliary_loss_clip": 0.01103006, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.02086604, + "balance_loss_mlp": 1.03499269, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.427174353089587, + "language_loss": 0.7728945, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79424977, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10658, + "time_per_iteration": 2.449277877807617 + }, + { + "auxiliary_loss_clip": 0.01108084, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.02629066, + "balance_loss_mlp": 1.03781724, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 1.5608188078670746, + "language_loss": 0.7607885, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78226012, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10659, + "time_per_iteration": 2.4464104175567627 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.02172303, + "balance_loss_mlp": 1.03568363, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.6810966877518245, + "language_loss": 0.78276753, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10660, + "time_per_iteration": 2.463932752609253 + }, + { + "auxiliary_loss_clip": 0.01110744, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.023785, + "balance_loss_mlp": 1.03830671, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 2.1049933789165727, + "language_loss": 0.68227595, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70375443, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 10661, + "time_per_iteration": 2.4437673091888428 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02566016, + "balance_loss_mlp": 1.0374167, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.4944389143541703, + "language_loss": 0.75839317, + "learning_rate": 1.205986598033362e-06, + "loss": 0.77979672, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10662, + "time_per_iteration": 2.4985625743865967 + }, + { + "auxiliary_loss_clip": 0.01102338, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.01905084, + "balance_loss_mlp": 1.03421175, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.8768391350540305, + "language_loss": 0.69502836, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.71635342, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10663, + "time_per_iteration": 2.479556083679199 + }, + { + "auxiliary_loss_clip": 0.01106696, + "auxiliary_loss_mlp": 0.01040197, + "balance_loss_clip": 1.02654743, + "balance_loss_mlp": 1.03724718, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.235560561918587, + "language_loss": 0.68056524, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70203424, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 10664, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01101883, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01797128, + "balance_loss_mlp": 1.03587985, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.8443375686405623, + "language_loss": 0.66447258, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68578362, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10665, + "time_per_iteration": 2.4581611156463623 + }, + { + "auxiliary_loss_clip": 0.01102013, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.01801181, + "balance_loss_mlp": 1.03565812, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.9911859706917303, + "language_loss": 0.64523447, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66655302, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10666, + "time_per_iteration": 2.4770736694335938 + }, + { + "auxiliary_loss_clip": 0.01105742, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.01933265, + "balance_loss_mlp": 1.03609776, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.666384333420834, + "language_loss": 0.7067616, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.72813338, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10667, + "time_per_iteration": 2.407938003540039 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.02077127, + "balance_loss_mlp": 1.03901672, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.2700946721922874, + "language_loss": 0.77413416, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79562223, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 10668, + "time_per_iteration": 2.421332836151123 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01034123, + "balance_loss_clip": 1.02199399, + "balance_loss_mlp": 1.0376507, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.6100109548180268, + "language_loss": 0.67520595, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69660217, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 10669, + "time_per_iteration": 2.426586866378784 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.02510333, + "balance_loss_mlp": 1.03997803, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.7319389151723867, + "language_loss": 0.78258085, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80408102, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 10670, + "time_per_iteration": 2.469668388366699 + }, + { + "auxiliary_loss_clip": 0.01108443, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.02200222, + "balance_loss_mlp": 1.03697228, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.2181025019747445, + "language_loss": 0.88322049, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90465117, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 10671, + "time_per_iteration": 2.391927480697632 + }, + { + "auxiliary_loss_clip": 0.01103513, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.01708126, + "balance_loss_mlp": 1.03752613, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.4861712883005815, + "language_loss": 0.69451904, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71583843, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 10672, + "time_per_iteration": 2.4214959144592285 + }, + { + "auxiliary_loss_clip": 0.01109224, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.01554513, + "balance_loss_mlp": 1.03705025, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.748656888764651, + "language_loss": 0.7392627, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76064527, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10673, + "time_per_iteration": 2.4611282348632812 + }, + { + "auxiliary_loss_clip": 0.01104131, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.0213933, + "balance_loss_mlp": 1.03559685, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 1.56139787015984, + "language_loss": 0.69352114, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71490324, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 10674, + "time_per_iteration": 2.5161702632904053 + }, + { + "auxiliary_loss_clip": 0.01109387, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01650524, + "balance_loss_mlp": 1.03618658, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 1.8510668186633226, + "language_loss": 0.66126549, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68265229, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 10675, + "time_per_iteration": 2.4155290126800537 + }, + { + "auxiliary_loss_clip": 0.01106276, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.03823316, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 2.2027244466364486, + "language_loss": 0.66607732, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68748927, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10676, + "time_per_iteration": 2.490659713745117 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.01685786, + "balance_loss_mlp": 1.03876162, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 2.097581634404412, + "language_loss": 0.75956476, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.7809552, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 10677, + "time_per_iteration": 3.9567973613739014 + }, + { + "auxiliary_loss_clip": 0.01026179, + "auxiliary_loss_mlp": 0.00997901, + "balance_loss_clip": 0.99666101, + "balance_loss_mlp": 1.00533533, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.8065212839738138, + "language_loss": 0.60730147, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62754226, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.20898438, + "step": 10678, + "time_per_iteration": 3.13420033454895 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02209604, + "balance_loss_mlp": 1.03742027, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.6963549464247227, + "language_loss": 0.67299467, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69436979, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 10679, + "time_per_iteration": 3.808528423309326 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.01666307, + "balance_loss_mlp": 1.03855729, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.6996500318605585, + "language_loss": 0.72910142, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75048327, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10680, + "time_per_iteration": 3.8477213382720947 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.01612723, + "balance_loss_mlp": 1.03545952, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.7403495519820134, + "language_loss": 0.67876667, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70006758, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 10681, + "time_per_iteration": 3.919956922531128 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01748252, + "balance_loss_mlp": 1.03434682, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.5976000618825759, + "language_loss": 0.74644732, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76775151, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10682, + "time_per_iteration": 2.4222958087921143 + }, + { + "auxiliary_loss_clip": 0.01099045, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.01960802, + "balance_loss_mlp": 1.0343014, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.48329541818395, + "language_loss": 0.79282379, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81412017, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 10683, + "time_per_iteration": 2.4635698795318604 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.0240351, + "balance_loss_mlp": 1.03679943, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.6292181520500175, + "language_loss": 0.67376101, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69518769, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 10684, + "time_per_iteration": 2.41907000541687 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01595724, + "balance_loss_mlp": 1.03616428, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.2028301911766976, + "language_loss": 0.71436971, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73569536, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 10685, + "time_per_iteration": 2.471905469894409 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.0360837, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.599317002960078, + "language_loss": 0.75343961, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77478087, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 10686, + "time_per_iteration": 2.4540653228759766 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.02034068, + "balance_loss_mlp": 1.03813434, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.1016215045747684, + "language_loss": 0.6875909, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70900756, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 10687, + "time_per_iteration": 2.4797768592834473 + }, + { + "auxiliary_loss_clip": 0.01107085, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.01875103, + "balance_loss_mlp": 1.03763437, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 2.024359307432863, + "language_loss": 0.66338682, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68476355, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 10688, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.01103677, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.01751018, + "balance_loss_mlp": 1.03563595, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.8678327137671962, + "language_loss": 0.73044169, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75177526, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10689, + "time_per_iteration": 2.442413806915283 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.01890218, + "balance_loss_mlp": 1.03588271, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.3454318131191485, + "language_loss": 0.72232103, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74363381, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.66015625, + "step": 10690, + "time_per_iteration": 2.471299648284912 + }, + { + "auxiliary_loss_clip": 0.01102076, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01679587, + "balance_loss_mlp": 1.03589702, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.751175955717072, + "language_loss": 0.77973688, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80104017, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 10691, + "time_per_iteration": 2.405625581741333 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02516222, + "balance_loss_mlp": 1.03734851, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.7365524827328973, + "language_loss": 0.74180853, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76325125, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 10692, + "time_per_iteration": 2.4545161724090576 + }, + { + "auxiliary_loss_clip": 0.01105895, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.02066517, + "balance_loss_mlp": 1.03752697, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.783950417735838, + "language_loss": 0.61135745, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63273877, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 10693, + "time_per_iteration": 2.443671464920044 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.01348996, + "balance_loss_mlp": 1.03660131, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.580411610275865, + "language_loss": 0.59667271, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61800897, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10694, + "time_per_iteration": 2.539658308029175 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.0150162, + "balance_loss_mlp": 1.03852546, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.3391279253609552, + "language_loss": 0.79716361, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.81850976, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 10695, + "time_per_iteration": 2.4294402599334717 + }, + { + "auxiliary_loss_clip": 0.01105962, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.02474165, + "balance_loss_mlp": 1.03641272, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 2.3258756947072112, + "language_loss": 0.73518264, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75661093, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10696, + "time_per_iteration": 2.502713441848755 + }, + { + "auxiliary_loss_clip": 0.01102941, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.01575983, + "balance_loss_mlp": 1.03764093, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 2.7414253907465636, + "language_loss": 0.7579782, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77927744, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10697, + "time_per_iteration": 2.4447250366210938 + }, + { + "auxiliary_loss_clip": 0.01102432, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01867306, + "balance_loss_mlp": 1.03607345, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.4042502284177218, + "language_loss": 0.6627214, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68404424, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10698, + "time_per_iteration": 2.585150718688965 + }, + { + "auxiliary_loss_clip": 0.01025803, + "auxiliary_loss_mlp": 0.01004446, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00514603, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8344250970478979, + "language_loss": 0.63460743, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65490991, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.20703125, + "step": 10699, + "time_per_iteration": 3.024700403213501 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.01535618, + "balance_loss_mlp": 1.03705359, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.912981795070525, + "language_loss": 0.6912387, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71253234, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 10700, + "time_per_iteration": 2.4683825969696045 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01447868, + "balance_loss_mlp": 1.03590679, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.7070737124865907, + "language_loss": 0.73354918, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75484824, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 10701, + "time_per_iteration": 2.4831302165985107 + }, + { + "auxiliary_loss_clip": 0.01105062, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.01542449, + "balance_loss_mlp": 1.03474069, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 1.878097796503538, + "language_loss": 0.81941777, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84075147, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10702, + "time_per_iteration": 2.468240261077881 + }, + { + "auxiliary_loss_clip": 0.01101591, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03552771, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 1.8640854274416083, + "language_loss": 0.74179298, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76314807, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 10703, + "time_per_iteration": 2.413569688796997 + }, + { + "auxiliary_loss_clip": 0.01025343, + "auxiliary_loss_mlp": 0.00998028, + "balance_loss_clip": 0.99697268, + "balance_loss_mlp": 1.0046978, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6508795205779913, + "language_loss": 0.54642779, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56666148, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20703125, + "step": 10704, + "time_per_iteration": 3.0236172676086426 + }, + { + "auxiliary_loss_clip": 0.01102168, + "auxiliary_loss_mlp": 0.01022828, + "balance_loss_clip": 1.01258826, + "balance_loss_mlp": 1.03476024, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.5730519252717787, + "language_loss": 0.76976264, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79101259, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.67578125, + "step": 10705, + "time_per_iteration": 2.455488443374634 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.02475905, + "balance_loss_mlp": 1.0358727, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.7440813911831818, + "language_loss": 0.7908684, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81227219, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 10706, + "time_per_iteration": 2.449542760848999 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01926923, + "balance_loss_mlp": 1.0355916, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.1755935090023164, + "language_loss": 0.80497181, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82632756, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10707, + "time_per_iteration": 2.416238784790039 + }, + { + "auxiliary_loss_clip": 0.01103614, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.01691961, + "balance_loss_mlp": 1.03542554, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.7933979371525552, + "language_loss": 0.85400867, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87533092, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10708, + "time_per_iteration": 2.4596221446990967 + }, + { + "auxiliary_loss_clip": 0.01113539, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.02577186, + "balance_loss_mlp": 1.03982544, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.314624765830387, + "language_loss": 0.65632617, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67785281, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 10709, + "time_per_iteration": 2.414792776107788 + }, + { + "auxiliary_loss_clip": 0.01101587, + "auxiliary_loss_mlp": 0.01027315, + "balance_loss_clip": 1.01617527, + "balance_loss_mlp": 1.03474462, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 2.1321707309255196, + "language_loss": 0.80428755, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.8255766, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 10710, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01101022, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.01573479, + "balance_loss_mlp": 1.03475547, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.6745994206662376, + "language_loss": 0.66166174, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68294716, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10711, + "time_per_iteration": 2.502237319946289 + }, + { + "auxiliary_loss_clip": 0.0110763, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.0150764, + "balance_loss_mlp": 1.03871155, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.56251052314253, + "language_loss": 0.78744113, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80879122, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 10712, + "time_per_iteration": 2.4865529537200928 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.0215044, + "balance_loss_mlp": 1.03622448, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.5577972768959576, + "language_loss": 0.82686722, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84827155, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 10713, + "time_per_iteration": 2.4358584880828857 + }, + { + "auxiliary_loss_clip": 0.01101375, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.02368593, + "balance_loss_mlp": 1.03669071, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.4453495865190145, + "language_loss": 0.78343773, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80480266, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10714, + "time_per_iteration": 2.4839279651641846 + }, + { + "auxiliary_loss_clip": 0.01102157, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.02021146, + "balance_loss_mlp": 1.03580499, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.5121330908882218, + "language_loss": 0.81442875, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83576298, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10715, + "time_per_iteration": 2.4751946926116943 + }, + { + "auxiliary_loss_clip": 0.01103061, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01841354, + "balance_loss_mlp": 1.03416896, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 1.858940461069926, + "language_loss": 0.81107575, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83240604, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10716, + "time_per_iteration": 2.506404161453247 + }, + { + "auxiliary_loss_clip": 0.01106307, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02118576, + "balance_loss_mlp": 1.03650761, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.7227597977263103, + "language_loss": 0.77839047, + "learning_rate": 1.186372540666424e-06, + "loss": 0.79979855, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 10717, + "time_per_iteration": 2.4654810428619385 + }, + { + "auxiliary_loss_clip": 0.01102271, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.01759315, + "balance_loss_mlp": 1.03718793, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.6109335148111539, + "language_loss": 0.68141425, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70272195, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 10718, + "time_per_iteration": 3.9740819931030273 + }, + { + "auxiliary_loss_clip": 0.0102484, + "auxiliary_loss_mlp": 0.01004792, + "balance_loss_clip": 1.00373709, + "balance_loss_mlp": 1.00415778, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7588040526175028, + "language_loss": 0.49665093, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51694727, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20703125, + "step": 10719, + "time_per_iteration": 3.2171850204467773 + }, + { + "auxiliary_loss_clip": 0.01108486, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03927541, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 2.0805005783182415, + "language_loss": 0.78263915, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80406547, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10720, + "time_per_iteration": 2.434246301651001 + }, + { + "auxiliary_loss_clip": 0.01102308, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.02056015, + "balance_loss_mlp": 1.03513026, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.6971626147342385, + "language_loss": 0.76729137, + "learning_rate": 1.18494967730604e-06, + "loss": 0.78864217, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 10721, + "time_per_iteration": 5.301208734512329 + }, + { + "auxiliary_loss_clip": 0.01102301, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.01885331, + "balance_loss_mlp": 1.03417397, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.4666147768058, + "language_loss": 0.73236001, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75369453, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 10722, + "time_per_iteration": 3.918328046798706 + }, + { + "auxiliary_loss_clip": 0.01101304, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01602662, + "balance_loss_mlp": 1.03587341, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 2.1714179391362074, + "language_loss": 0.78181046, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80309272, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 10723, + "time_per_iteration": 2.485879421234131 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.01842475, + "balance_loss_mlp": 1.03774381, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.8069876028647023, + "language_loss": 0.58755672, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60894638, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 10724, + "time_per_iteration": 2.478766679763794 + }, + { + "auxiliary_loss_clip": 0.0110091, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.02043772, + "balance_loss_mlp": 1.03629243, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.6726755912827203, + "language_loss": 0.83442616, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85574543, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 10725, + "time_per_iteration": 2.4473166465759277 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02134895, + "balance_loss_mlp": 1.03365588, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.7120227863307491, + "language_loss": 0.82104886, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84239936, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10726, + "time_per_iteration": 2.4571003913879395 + }, + { + "auxiliary_loss_clip": 0.01106369, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.01877189, + "balance_loss_mlp": 1.03662455, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 3.203326603634113, + "language_loss": 0.80919254, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83056766, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10727, + "time_per_iteration": 2.4684529304504395 + }, + { + "auxiliary_loss_clip": 0.01109129, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.01939309, + "balance_loss_mlp": 1.03661084, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.8311253143889514, + "language_loss": 0.7950902, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81649995, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 10728, + "time_per_iteration": 2.420926094055176 + }, + { + "auxiliary_loss_clip": 0.0110447, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02040493, + "balance_loss_mlp": 1.03509378, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.688837212564324, + "language_loss": 0.74242163, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76379651, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10729, + "time_per_iteration": 2.5284883975982666 + }, + { + "auxiliary_loss_clip": 0.01105519, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.02004814, + "balance_loss_mlp": 1.03606546, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.7461235371462989, + "language_loss": 0.66486406, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68624759, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10730, + "time_per_iteration": 2.472608804702759 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.02041864, + "balance_loss_mlp": 1.03703654, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.5067900334591022, + "language_loss": 0.63581085, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65719867, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 10731, + "time_per_iteration": 2.446270704269409 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.0236578, + "balance_loss_mlp": 1.03637123, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.5914748736963724, + "language_loss": 0.67864686, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70003414, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10732, + "time_per_iteration": 2.4132513999938965 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02102399, + "balance_loss_mlp": 1.03576565, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.9402611085528685, + "language_loss": 0.75528163, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77662778, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 10733, + "time_per_iteration": 2.481633186340332 + }, + { + "auxiliary_loss_clip": 0.01109224, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.02552605, + "balance_loss_mlp": 1.03813672, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.733255021176503, + "language_loss": 0.65421891, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.67569232, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 10734, + "time_per_iteration": 2.458852529525757 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02868783, + "balance_loss_mlp": 1.03828883, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.35360500847906, + "language_loss": 0.7390331, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76045489, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 10735, + "time_per_iteration": 2.4310169219970703 + }, + { + "auxiliary_loss_clip": 0.01106342, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.02140272, + "balance_loss_mlp": 1.0381664, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.7357542776809323, + "language_loss": 0.74936789, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.77076226, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10736, + "time_per_iteration": 2.4535531997680664 + }, + { + "auxiliary_loss_clip": 0.01108598, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.01834321, + "balance_loss_mlp": 1.03880417, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 7.331374953548985, + "language_loss": 0.70983565, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.73123091, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10737, + "time_per_iteration": 2.455932855606079 + }, + { + "auxiliary_loss_clip": 0.01024539, + "auxiliary_loss_mlp": 0.0100647, + "balance_loss_clip": 1.00536776, + "balance_loss_mlp": 1.0038693, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7756134851395411, + "language_loss": 0.58466899, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.6049791, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20703125, + "step": 10738, + "time_per_iteration": 3.11362624168396 + }, + { + "auxiliary_loss_clip": 0.01101864, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01832366, + "balance_loss_mlp": 1.03569365, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.6796977264879835, + "language_loss": 0.7432248, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76454461, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 10739, + "time_per_iteration": 2.575263261795044 + }, + { + "auxiliary_loss_clip": 0.01106876, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.01638103, + "balance_loss_mlp": 1.03678012, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.7287512893442607, + "language_loss": 0.71253389, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.7338922, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10740, + "time_per_iteration": 2.4456567764282227 + }, + { + "auxiliary_loss_clip": 0.01024391, + "auxiliary_loss_mlp": 0.01001433, + "balance_loss_clip": 1.00019932, + "balance_loss_mlp": 1.00384283, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6634055191842134, + "language_loss": 0.55304271, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57330096, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20507812, + "step": 10741, + "time_per_iteration": 3.084655284881592 + }, + { + "auxiliary_loss_clip": 0.01100994, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.01909518, + "balance_loss_mlp": 1.03560328, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 4.469668504909254, + "language_loss": 0.80574667, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82705534, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 10742, + "time_per_iteration": 2.4683938026428223 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.01656711, + "balance_loss_mlp": 1.03513217, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.5091720275231448, + "language_loss": 0.81898236, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84026313, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10743, + "time_per_iteration": 2.4860422611236572 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01716936, + "balance_loss_mlp": 1.03643143, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.8283751590876323, + "language_loss": 0.72072589, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74204403, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10744, + "time_per_iteration": 2.401154041290283 + }, + { + "auxiliary_loss_clip": 0.01103143, + "auxiliary_loss_mlp": 0.01024823, + "balance_loss_clip": 1.01339674, + "balance_loss_mlp": 1.0362134, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.6832996887385467, + "language_loss": 0.66680956, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68808925, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10745, + "time_per_iteration": 2.648923635482788 + }, + { + "auxiliary_loss_clip": 0.01104749, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01873684, + "balance_loss_mlp": 1.03738117, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.3663753891536206, + "language_loss": 0.7367624, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75811714, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10746, + "time_per_iteration": 2.414886713027954 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.02040744, + "balance_loss_mlp": 1.03723145, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4238954510434034, + "language_loss": 0.66682059, + "learning_rate": 1.175713157660413e-06, + "loss": 0.6881963, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6875, + "step": 10747, + "time_per_iteration": 2.5016472339630127 + }, + { + "auxiliary_loss_clip": 0.01103964, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02568781, + "balance_loss_mlp": 1.03684711, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.577013961139599, + "language_loss": 0.66913009, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69053674, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10748, + "time_per_iteration": 2.442237615585327 + }, + { + "auxiliary_loss_clip": 0.0110688, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.0270108, + "balance_loss_mlp": 1.03662395, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.8120464443443396, + "language_loss": 0.76339692, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78486234, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10749, + "time_per_iteration": 2.4924192428588867 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.01845694, + "balance_loss_mlp": 1.03520691, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.7469795758698337, + "language_loss": 0.77112448, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79247028, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 10750, + "time_per_iteration": 2.4771273136138916 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.01831079, + "balance_loss_mlp": 1.0382905, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.0857387723701817, + "language_loss": 0.68225217, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70365262, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 10751, + "time_per_iteration": 2.5023088455200195 + }, + { + "auxiliary_loss_clip": 0.01103858, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.01730776, + "balance_loss_mlp": 1.03535843, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.6570772228110922, + "language_loss": 0.70823848, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.72957194, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10752, + "time_per_iteration": 2.4542946815490723 + }, + { + "auxiliary_loss_clip": 0.01106954, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.02100945, + "balance_loss_mlp": 1.03708041, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.7443402746921521, + "language_loss": 0.7799257, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80133951, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 10753, + "time_per_iteration": 2.414531707763672 + }, + { + "auxiliary_loss_clip": 0.01102943, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02636194, + "balance_loss_mlp": 1.03694177, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.5945794385803833, + "language_loss": 0.85284775, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87425733, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 10754, + "time_per_iteration": 2.4596917629241943 + }, + { + "auxiliary_loss_clip": 0.01102766, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.02160048, + "balance_loss_mlp": 1.03613544, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 1.9678569539088453, + "language_loss": 0.59384984, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61520755, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10755, + "time_per_iteration": 2.439668893814087 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.01808381, + "balance_loss_mlp": 1.03532171, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 5.126423165663523, + "language_loss": 0.67684507, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.69817215, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 10756, + "time_per_iteration": 2.405700206756592 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.019063, + "balance_loss_mlp": 1.03889596, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 3.0387860554111574, + "language_loss": 0.74348402, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76490277, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 10757, + "time_per_iteration": 2.4515702724456787 + }, + { + "auxiliary_loss_clip": 0.01102078, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01911271, + "balance_loss_mlp": 1.03616095, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.6276488646407918, + "language_loss": 0.74483991, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76616573, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10758, + "time_per_iteration": 2.4118669033050537 + }, + { + "auxiliary_loss_clip": 0.01104769, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.01735008, + "balance_loss_mlp": 1.03648281, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.54772879655888, + "language_loss": 0.67891282, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70025849, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 10759, + "time_per_iteration": 2.487632989883423 + }, + { + "auxiliary_loss_clip": 0.01106799, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.01868701, + "balance_loss_mlp": 1.03644943, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.881795853492405, + "language_loss": 0.75285017, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77422583, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10760, + "time_per_iteration": 3.9272162914276123 + }, + { + "auxiliary_loss_clip": 0.01100222, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.01540494, + "balance_loss_mlp": 1.03383064, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.5052354500877283, + "language_loss": 0.65392292, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67519075, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10761, + "time_per_iteration": 2.6993539333343506 + }, + { + "auxiliary_loss_clip": 0.01106456, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.03747368, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.1055667385281316, + "language_loss": 0.69732755, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71869099, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10762, + "time_per_iteration": 2.4523587226867676 + }, + { + "auxiliary_loss_clip": 0.0110606, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.02048469, + "balance_loss_mlp": 1.03625226, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.1633807412884343, + "language_loss": 0.82723743, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.8486222, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 10763, + "time_per_iteration": 5.290219306945801 + }, + { + "auxiliary_loss_clip": 0.01024866, + "auxiliary_loss_mlp": 0.01000313, + "balance_loss_clip": 0.99911511, + "balance_loss_mlp": 1.00423336, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7101546065504528, + "language_loss": 0.57767004, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59792185, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 10764, + "time_per_iteration": 4.603821277618408 + }, + { + "auxiliary_loss_clip": 0.01102286, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02277756, + "balance_loss_mlp": 1.03570485, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.8570193979841765, + "language_loss": 0.60458118, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62595057, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 10765, + "time_per_iteration": 2.545964002609253 + }, + { + "auxiliary_loss_clip": 0.01102593, + "auxiliary_loss_mlp": 0.01026242, + "balance_loss_clip": 1.01510835, + "balance_loss_mlp": 1.03606391, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.9785388674295172, + "language_loss": 0.63237435, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65366268, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10766, + "time_per_iteration": 2.4889070987701416 + }, + { + "auxiliary_loss_clip": 0.01103393, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0176481, + "balance_loss_mlp": 1.03667796, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.6243256535427835, + "language_loss": 0.75656283, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77789199, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10767, + "time_per_iteration": 2.506972551345825 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01970649, + "balance_loss_mlp": 1.03562641, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 2.0284924931052406, + "language_loss": 0.77826148, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79959053, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 10768, + "time_per_iteration": 2.4127895832061768 + }, + { + "auxiliary_loss_clip": 0.01102155, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.01648641, + "balance_loss_mlp": 1.03510022, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.6952390655728202, + "language_loss": 0.71920127, + "learning_rate": 1.167914135250663e-06, + "loss": 0.74050355, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10769, + "time_per_iteration": 2.4743292331695557 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01034947, + "balance_loss_clip": 1.02368212, + "balance_loss_mlp": 1.03668594, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9257555417687353, + "language_loss": 0.71907532, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74043512, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 10770, + "time_per_iteration": 2.423251152038574 + }, + { + "auxiliary_loss_clip": 0.01107379, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.01506996, + "balance_loss_mlp": 1.03676248, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.7207965836379309, + "language_loss": 0.73562384, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75697601, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10771, + "time_per_iteration": 2.498911142349243 + }, + { + "auxiliary_loss_clip": 0.01104798, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.01807988, + "balance_loss_mlp": 1.03799939, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.8994664849870517, + "language_loss": 0.7373805, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.75872564, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 10772, + "time_per_iteration": 2.4090960025787354 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.0196557, + "balance_loss_mlp": 1.03508711, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.4911839819427335, + "language_loss": 0.83115339, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85245723, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65625, + "step": 10773, + "time_per_iteration": 2.4857256412506104 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.01634526, + "balance_loss_mlp": 1.03509998, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.4644145421555252, + "language_loss": 0.78116065, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80243969, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 10774, + "time_per_iteration": 2.4285647869110107 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.02481055, + "balance_loss_mlp": 1.0360589, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.0390391270124986, + "language_loss": 0.68541199, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.70683241, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10775, + "time_per_iteration": 2.451076030731201 + }, + { + "auxiliary_loss_clip": 0.01106496, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.01700842, + "balance_loss_mlp": 1.03669178, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.740664481421832, + "language_loss": 0.65512002, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67647052, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 10776, + "time_per_iteration": 2.457409381866455 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.02126646, + "balance_loss_mlp": 1.03623533, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.790324273409248, + "language_loss": 0.78897285, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81036025, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10777, + "time_per_iteration": 2.425045967102051 + }, + { + "auxiliary_loss_clip": 0.01105443, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01792097, + "balance_loss_mlp": 1.03808999, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 2.190301315300799, + "language_loss": 0.73786491, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75921857, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10778, + "time_per_iteration": 2.459921360015869 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.01919854, + "balance_loss_mlp": 1.033885, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4021781865585379, + "language_loss": 0.77758849, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.79889071, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10779, + "time_per_iteration": 2.4944956302642822 + }, + { + "auxiliary_loss_clip": 0.0102552, + "auxiliary_loss_mlp": 0.0100081, + "balance_loss_clip": 0.99959451, + "balance_loss_mlp": 1.00497544, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7236484274239682, + "language_loss": 0.59404081, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61430413, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20507812, + "step": 10780, + "time_per_iteration": 3.0612237453460693 + }, + { + "auxiliary_loss_clip": 0.01101259, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01804209, + "balance_loss_mlp": 1.03493273, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 1.958027941262836, + "language_loss": 0.79607379, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81737804, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10781, + "time_per_iteration": 2.5239641666412354 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.01997757, + "balance_loss_mlp": 1.03791904, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 1.9679764489100238, + "language_loss": 0.78864902, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81005824, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 10782, + "time_per_iteration": 2.4253900051116943 + }, + { + "auxiliary_loss_clip": 0.01106515, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.01932609, + "balance_loss_mlp": 1.03778386, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.985749633483, + "language_loss": 0.63785768, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.65923923, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10783, + "time_per_iteration": 2.5159454345703125 + }, + { + "auxiliary_loss_clip": 0.01108311, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.01933455, + "balance_loss_mlp": 1.03791237, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 2.3442009274857387, + "language_loss": 0.88642716, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90783715, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10784, + "time_per_iteration": 2.4753408432006836 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03565788, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 12.15646159907571, + "language_loss": 0.73281801, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75415385, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 10785, + "time_per_iteration": 2.4413681030273438 + }, + { + "auxiliary_loss_clip": 0.01101717, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.01844823, + "balance_loss_mlp": 1.03633511, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.367601959382758, + "language_loss": 0.69167411, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71298921, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 10786, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01922917, + "balance_loss_mlp": 1.03379738, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.7579718485158407, + "language_loss": 0.71124583, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73256522, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10787, + "time_per_iteration": 2.5158114433288574 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.017735, + "balance_loss_mlp": 1.03644013, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.051362245275849, + "language_loss": 0.84114212, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86250919, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10788, + "time_per_iteration": 2.4534499645233154 + }, + { + "auxiliary_loss_clip": 0.01105049, + "auxiliary_loss_mlp": 0.01028079, + "balance_loss_clip": 1.01589584, + "balance_loss_mlp": 1.03690362, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.7919339269161743, + "language_loss": 0.76950663, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79083782, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10789, + "time_per_iteration": 2.483477830886841 + }, + { + "auxiliary_loss_clip": 0.01100294, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01661515, + "balance_loss_mlp": 1.03485107, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.6011584419095646, + "language_loss": 0.76170266, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78298742, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 10790, + "time_per_iteration": 2.589041233062744 + }, + { + "auxiliary_loss_clip": 0.01106166, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03979826, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.041315075509779, + "language_loss": 0.59891582, + "learning_rate": 1.160130384362823e-06, + "loss": 0.6202994, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 10791, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.03552938, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.6472225462276555, + "language_loss": 0.86154032, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88286591, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10792, + "time_per_iteration": 2.446188449859619 + }, + { + "auxiliary_loss_clip": 0.01108514, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02510905, + "balance_loss_mlp": 1.03797722, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.3897847361162396, + "language_loss": 0.78055567, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80201161, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 10793, + "time_per_iteration": 2.5302352905273438 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.01562476, + "balance_loss_mlp": 1.03950644, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.9288429134844602, + "language_loss": 0.75000489, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.77136773, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10794, + "time_per_iteration": 2.487550735473633 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.03439832, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.7036979096858527, + "language_loss": 0.70159793, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72292763, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10795, + "time_per_iteration": 2.5075082778930664 + }, + { + "auxiliary_loss_clip": 0.01107904, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.0199964, + "balance_loss_mlp": 1.03854239, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.7755045878876892, + "language_loss": 0.54152012, + "learning_rate": 1.158363494676679e-06, + "loss": 0.56292963, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 10796, + "time_per_iteration": 2.4778566360473633 + }, + { + "auxiliary_loss_clip": 0.01104118, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.01583314, + "balance_loss_mlp": 1.03535151, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 2.2433372918176917, + "language_loss": 0.77806747, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79938054, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 10797, + "time_per_iteration": 2.4779365062713623 + }, + { + "auxiliary_loss_clip": 0.01101065, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.0163343, + "balance_loss_mlp": 1.03683209, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.205335755673093, + "language_loss": 0.70565605, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72694176, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 10798, + "time_per_iteration": 2.4684252738952637 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01698995, + "balance_loss_mlp": 1.03464842, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.6813115922747512, + "language_loss": 0.76955473, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79084826, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10799, + "time_per_iteration": 2.5210940837860107 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.01997542, + "balance_loss_mlp": 1.03435063, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.8153395402518349, + "language_loss": 0.7160871, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.7374649, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 10800, + "time_per_iteration": 2.482504367828369 + }, + { + "auxiliary_loss_clip": 0.01026126, + "auxiliary_loss_mlp": 0.01002417, + "balance_loss_clip": 1.00117147, + "balance_loss_mlp": 1.00504756, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7657069555877785, + "language_loss": 0.60286164, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62314713, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.2109375, + "step": 10801, + "time_per_iteration": 3.226260185241699 + }, + { + "auxiliary_loss_clip": 0.01110608, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.02356291, + "balance_loss_mlp": 1.04023898, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.8073883235159445, + "language_loss": 0.78302824, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80449581, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10802, + "time_per_iteration": 3.919212579727173 + }, + { + "auxiliary_loss_clip": 0.01103206, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.03446245, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6305174461496863, + "language_loss": 0.74483562, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76622605, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10803, + "time_per_iteration": 2.4627156257629395 + }, + { + "auxiliary_loss_clip": 0.01104558, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.01895285, + "balance_loss_mlp": 1.03586698, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 2.1376614082682104, + "language_loss": 0.70056975, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72192574, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10804, + "time_per_iteration": 3.848759889602661 + }, + { + "auxiliary_loss_clip": 0.01103321, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.01596284, + "balance_loss_mlp": 1.03562534, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.6605919162215552, + "language_loss": 0.72852522, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74984628, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10805, + "time_per_iteration": 3.8869080543518066 + }, + { + "auxiliary_loss_clip": 0.01103949, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.0191083, + "balance_loss_mlp": 1.03534186, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.4377517316486816, + "language_loss": 0.66010499, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68145156, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 10806, + "time_per_iteration": 3.955326557159424 + }, + { + "auxiliary_loss_clip": 0.01104962, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01891875, + "balance_loss_mlp": 1.0347352, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.0043448276690743, + "language_loss": 0.79282916, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.81419313, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10807, + "time_per_iteration": 2.39217209815979 + }, + { + "auxiliary_loss_clip": 0.01025408, + "auxiliary_loss_mlp": 0.0099987, + "balance_loss_clip": 0.99852294, + "balance_loss_mlp": 1.0043627, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.8116161107359111, + "language_loss": 0.58930409, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60955691, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.2109375, + "step": 10808, + "time_per_iteration": 3.230355739593506 + }, + { + "auxiliary_loss_clip": 0.0110383, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.01633191, + "balance_loss_mlp": 1.03880036, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.7314499567585588, + "language_loss": 0.63442683, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65574473, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 10809, + "time_per_iteration": 2.5621047019958496 + }, + { + "auxiliary_loss_clip": 0.01103232, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01940036, + "balance_loss_mlp": 1.03731823, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.7915412750630062, + "language_loss": 0.81444794, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83578873, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 10810, + "time_per_iteration": 2.504213571548462 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.02178955, + "balance_loss_mlp": 1.03596044, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6107612285139954, + "language_loss": 0.71639317, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73773706, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 10811, + "time_per_iteration": 2.4460504055023193 + }, + { + "auxiliary_loss_clip": 0.01102886, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01604342, + "balance_loss_mlp": 1.03864026, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.696628622759694, + "language_loss": 0.78028226, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.80158031, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 10812, + "time_per_iteration": 2.4838054180145264 + }, + { + "auxiliary_loss_clip": 0.01103233, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.03522503, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.7227870996833219, + "language_loss": 0.85212648, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87352425, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10813, + "time_per_iteration": 2.4507973194122314 + }, + { + "auxiliary_loss_clip": 0.01103984, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.03622413, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.630969137195917, + "language_loss": 0.80210257, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82343483, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10814, + "time_per_iteration": 2.4843356609344482 + }, + { + "auxiliary_loss_clip": 0.0110736, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.03762007, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.5728804424803877, + "language_loss": 0.65147841, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67289424, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 10815, + "time_per_iteration": 2.6453187465667725 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02031219, + "balance_loss_mlp": 1.0368166, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.1717658748812925, + "language_loss": 0.75344497, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77486801, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 10816, + "time_per_iteration": 2.4386065006256104 + }, + { + "auxiliary_loss_clip": 0.01102422, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.0204587, + "balance_loss_mlp": 1.0362556, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.7229503928288044, + "language_loss": 0.7330451, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75439066, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10817, + "time_per_iteration": 2.4583981037139893 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.02363098, + "balance_loss_mlp": 1.03521729, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.4667825090725979, + "language_loss": 0.71944672, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74084473, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 10818, + "time_per_iteration": 2.862744092941284 + }, + { + "auxiliary_loss_clip": 0.01107713, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01682127, + "balance_loss_mlp": 1.03837013, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 1.8855888512315708, + "language_loss": 0.65002698, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67139268, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10819, + "time_per_iteration": 2.500066041946411 + }, + { + "auxiliary_loss_clip": 0.01105945, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.01954389, + "balance_loss_mlp": 1.03696644, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.9280601319833375, + "language_loss": 0.83383453, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85521388, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 10820, + "time_per_iteration": 2.5053653717041016 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01781201, + "balance_loss_mlp": 1.03561532, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.4467285300705166, + "language_loss": 0.78197402, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80333835, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10821, + "time_per_iteration": 2.431373357772827 + }, + { + "auxiliary_loss_clip": 0.01101047, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01875234, + "balance_loss_mlp": 1.03584325, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.457845041613161, + "language_loss": 0.80133367, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82263708, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65234375, + "step": 10822, + "time_per_iteration": 2.460176706314087 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01815748, + "balance_loss_mlp": 1.0353092, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.750587835143927, + "language_loss": 0.87001264, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89133477, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10823, + "time_per_iteration": 2.4293131828308105 + }, + { + "auxiliary_loss_clip": 0.01105612, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01709723, + "balance_loss_mlp": 1.03624296, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.6365898296789787, + "language_loss": 0.66641533, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68776393, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10824, + "time_per_iteration": 2.4835896492004395 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01628542, + "balance_loss_mlp": 1.03520238, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 4.8089783891514974, + "language_loss": 0.87194103, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89328843, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 10825, + "time_per_iteration": 2.4161195755004883 + }, + { + "auxiliary_loss_clip": 0.01106101, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.01528192, + "balance_loss_mlp": 1.03501797, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.113023109439822, + "language_loss": 0.72701895, + "learning_rate": 1.147778970474885e-06, + "loss": 0.74836403, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 10826, + "time_per_iteration": 2.4384891986846924 + }, + { + "auxiliary_loss_clip": 0.01103778, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01812768, + "balance_loss_mlp": 1.03663278, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.8815234967356322, + "language_loss": 0.69047898, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71180868, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10827, + "time_per_iteration": 2.4236016273498535 + }, + { + "auxiliary_loss_clip": 0.01103468, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.01466322, + "balance_loss_mlp": 1.03472352, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.912124303976498, + "language_loss": 0.76917899, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79047537, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10828, + "time_per_iteration": 2.501492500305176 + }, + { + "auxiliary_loss_clip": 0.01103546, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.01449549, + "balance_loss_mlp": 1.03721857, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.7405898865071652, + "language_loss": 0.89106113, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91235244, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 10829, + "time_per_iteration": 2.4867043495178223 + }, + { + "auxiliary_loss_clip": 0.01025679, + "auxiliary_loss_mlp": 0.01006089, + "balance_loss_clip": 1.004879, + "balance_loss_mlp": 1.00477648, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.638409366999194, + "language_loss": 0.5535605, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57387817, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20898438, + "step": 10830, + "time_per_iteration": 3.2332394123077393 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.01880741, + "balance_loss_mlp": 1.0351963, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.8294925765604486, + "language_loss": 0.74714524, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76852524, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10831, + "time_per_iteration": 2.4678196907043457 + }, + { + "auxiliary_loss_clip": 0.01026675, + "auxiliary_loss_mlp": 0.01000885, + "balance_loss_clip": 0.99961585, + "balance_loss_mlp": 1.0056181, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6414585196656494, + "language_loss": 0.51052123, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53079689, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2109375, + "step": 10832, + "time_per_iteration": 3.188751697540283 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.03423023, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.076228287586058, + "language_loss": 0.83391213, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85527885, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10833, + "time_per_iteration": 2.462529182434082 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.01900601, + "balance_loss_mlp": 1.03788352, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0595405323959817, + "language_loss": 0.83691829, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85829365, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10834, + "time_per_iteration": 2.4130232334136963 + }, + { + "auxiliary_loss_clip": 0.01105953, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02089036, + "balance_loss_mlp": 1.03688574, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.593058398275777, + "language_loss": 0.76863015, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79001933, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10835, + "time_per_iteration": 2.562690019607544 + }, + { + "auxiliary_loss_clip": 0.01107145, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02025032, + "balance_loss_mlp": 1.03849971, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.513196810995274, + "language_loss": 0.7734859, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79488003, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 10836, + "time_per_iteration": 2.4830451011657715 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01907551, + "balance_loss_mlp": 1.03746104, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 3.377184093609282, + "language_loss": 0.82293916, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84430802, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 10837, + "time_per_iteration": 2.449313163757324 + }, + { + "auxiliary_loss_clip": 0.01101636, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.01881361, + "balance_loss_mlp": 1.0366106, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.879635988028464, + "language_loss": 0.59214962, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61348593, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.65234375, + "step": 10838, + "time_per_iteration": 2.7190330028533936 + }, + { + "auxiliary_loss_clip": 0.01025807, + "auxiliary_loss_mlp": 0.01001457, + "balance_loss_clip": 1.00031853, + "balance_loss_mlp": 1.00479698, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7299756161535264, + "language_loss": 0.60843396, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62870657, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.2109375, + "step": 10839, + "time_per_iteration": 3.0971086025238037 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.01444113, + "balance_loss_mlp": 1.03583503, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.8264384192259977, + "language_loss": 0.68170393, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70297927, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10840, + "time_per_iteration": 2.5938761234283447 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.01783288, + "balance_loss_mlp": 1.03470433, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.0940212881125433, + "language_loss": 0.73375624, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75506657, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 10841, + "time_per_iteration": 2.5096652507781982 + }, + { + "auxiliary_loss_clip": 0.01104442, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.02016115, + "balance_loss_mlp": 1.03598571, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.444320911302732, + "language_loss": 0.6237874, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64515036, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10842, + "time_per_iteration": 2.4958693981170654 + }, + { + "auxiliary_loss_clip": 0.01026129, + "auxiliary_loss_mlp": 0.00998688, + "balance_loss_clip": 0.99753761, + "balance_loss_mlp": 1.00511324, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8080147467318853, + "language_loss": 0.56082183, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58107001, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2109375, + "step": 10843, + "time_per_iteration": 4.309800863265991 + }, + { + "auxiliary_loss_clip": 0.01107299, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02050614, + "balance_loss_mlp": 1.03615665, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.64501007109248, + "language_loss": 0.82562542, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84702992, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 10844, + "time_per_iteration": 2.4669365882873535 + }, + { + "auxiliary_loss_clip": 0.01103507, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.0143199, + "balance_loss_mlp": 1.03550506, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.063344534700721, + "language_loss": 0.60069621, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62200063, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 10845, + "time_per_iteration": 2.5032777786254883 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01482248, + "balance_loss_mlp": 1.03591549, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 2.814601439051778, + "language_loss": 0.79261941, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81391656, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 10846, + "time_per_iteration": 5.227022171020508 + }, + { + "auxiliary_loss_clip": 0.01025994, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00005949, + "balance_loss_mlp": 1.00516367, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7222516480670771, + "language_loss": 0.60183281, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.6221053, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20898438, + "step": 10847, + "time_per_iteration": 3.1712331771850586 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.0252527, + "balance_loss_mlp": 1.03767812, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.5760338552649935, + "language_loss": 0.81001323, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83146203, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 10848, + "time_per_iteration": 3.9554522037506104 + }, + { + "auxiliary_loss_clip": 0.01102504, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02334166, + "balance_loss_mlp": 1.03565013, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.095194559726116, + "language_loss": 0.75025082, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.77162468, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10849, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.01685405, + "balance_loss_mlp": 1.03644204, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.5413673094352514, + "language_loss": 0.68062961, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70193124, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 10850, + "time_per_iteration": 2.5665318965911865 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.01025486, + "balance_loss_clip": 1.01378012, + "balance_loss_mlp": 1.03723216, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 3.429236792588671, + "language_loss": 0.66494656, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68622386, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10851, + "time_per_iteration": 2.4702751636505127 + }, + { + "auxiliary_loss_clip": 0.01106489, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01643169, + "balance_loss_mlp": 1.03662848, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.15849365590988, + "language_loss": 0.74028027, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76162481, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 10852, + "time_per_iteration": 2.509229898452759 + }, + { + "auxiliary_loss_clip": 0.01108111, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.01606905, + "balance_loss_mlp": 1.03742135, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 7.224145946580318, + "language_loss": 0.66702747, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68839788, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 10853, + "time_per_iteration": 2.4382283687591553 + }, + { + "auxiliary_loss_clip": 0.01026122, + "auxiliary_loss_mlp": 0.00996827, + "balance_loss_clip": 0.99562865, + "balance_loss_mlp": 1.00521636, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7308751423910714, + "language_loss": 0.62970364, + "learning_rate": 1.137926314758634e-06, + "loss": 0.64993316, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20898438, + "step": 10854, + "time_per_iteration": 3.1691970825195312 + }, + { + "auxiliary_loss_clip": 0.01104802, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.01792085, + "balance_loss_mlp": 1.03625202, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.8459663187588897, + "language_loss": 0.77826589, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79962003, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 10855, + "time_per_iteration": 2.5133306980133057 + }, + { + "auxiliary_loss_clip": 0.01099784, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.01462901, + "balance_loss_mlp": 1.03466463, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.7863182329630984, + "language_loss": 0.79166549, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81292474, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 10856, + "time_per_iteration": 2.521003007888794 + }, + { + "auxiliary_loss_clip": 0.01103089, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01608038, + "balance_loss_mlp": 1.03572774, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.7280049220035325, + "language_loss": 0.73561788, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75694042, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 10857, + "time_per_iteration": 2.524388074874878 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01941204, + "balance_loss_mlp": 1.03619289, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.287513574647506, + "language_loss": 0.62553668, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.64687705, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 10858, + "time_per_iteration": 2.484292507171631 + }, + { + "auxiliary_loss_clip": 0.01100147, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.0192945, + "balance_loss_mlp": 1.03388333, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.70957243248878, + "language_loss": 0.78181291, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80311966, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10859, + "time_per_iteration": 2.4208006858825684 + }, + { + "auxiliary_loss_clip": 0.011057, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.01820755, + "balance_loss_mlp": 1.03562379, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.5618141301411743, + "language_loss": 0.67899007, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70035207, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10860, + "time_per_iteration": 2.516052722930908 + }, + { + "auxiliary_loss_clip": 0.01106777, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03654599, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 2.1862353937135732, + "language_loss": 0.66182673, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68316036, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.703125, + "step": 10861, + "time_per_iteration": 2.4207851886749268 + }, + { + "auxiliary_loss_clip": 0.01105314, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.01948178, + "balance_loss_mlp": 1.03619254, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.7782678366068123, + "language_loss": 0.6507051, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67208546, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 10862, + "time_per_iteration": 2.5804362297058105 + }, + { + "auxiliary_loss_clip": 0.0110242, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.01891708, + "balance_loss_mlp": 1.03588247, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.5241362686221158, + "language_loss": 0.77193171, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79326159, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 10863, + "time_per_iteration": 2.438044309616089 + }, + { + "auxiliary_loss_clip": 0.01103508, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.0157485, + "balance_loss_mlp": 1.03619623, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.8164803813000403, + "language_loss": 0.7466498, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76795435, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10864, + "time_per_iteration": 2.4771134853363037 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.03580821, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.7514895317957062, + "language_loss": 0.8600319, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88139296, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 10865, + "time_per_iteration": 2.5002095699310303 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.02430654, + "balance_loss_mlp": 1.03760266, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 3.5499069425062832, + "language_loss": 0.81403613, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83548248, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 10866, + "time_per_iteration": 2.546633005142212 + }, + { + "auxiliary_loss_clip": 0.01102409, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01524687, + "balance_loss_mlp": 1.03626192, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.4960309400225926, + "language_loss": 0.82321596, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.8445099, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 10867, + "time_per_iteration": 2.4704959392547607 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.01578307, + "balance_loss_mlp": 1.03344285, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 1.873401062188488, + "language_loss": 0.81152415, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.8328166, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10868, + "time_per_iteration": 2.462496519088745 + }, + { + "auxiliary_loss_clip": 0.01106253, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.0159198, + "balance_loss_mlp": 1.03690481, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.9591239016591335, + "language_loss": 0.79279351, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81413788, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10869, + "time_per_iteration": 2.4351487159729004 + }, + { + "auxiliary_loss_clip": 0.0110718, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.03880501, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 2.040320648065678, + "language_loss": 0.71729898, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.73871845, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 10870, + "time_per_iteration": 2.5223138332366943 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.02319491, + "balance_loss_mlp": 1.0377295, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.147023101303994, + "language_loss": 0.74992102, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.77132088, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 10871, + "time_per_iteration": 2.469367265701294 + }, + { + "auxiliary_loss_clip": 0.01099729, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01870823, + "balance_loss_mlp": 1.03503919, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.7849990892484822, + "language_loss": 0.55615103, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.5774473, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 10872, + "time_per_iteration": 2.4673538208007812 + }, + { + "auxiliary_loss_clip": 0.0110205, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.0230695, + "balance_loss_mlp": 1.03632164, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.8219619398900448, + "language_loss": 0.75073338, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77209741, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10873, + "time_per_iteration": 2.4655163288116455 + }, + { + "auxiliary_loss_clip": 0.01104694, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.0166738, + "balance_loss_mlp": 1.03662491, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.5897958047644043, + "language_loss": 0.75623226, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77756387, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10874, + "time_per_iteration": 2.5224883556365967 + }, + { + "auxiliary_loss_clip": 0.01103714, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.0198009, + "balance_loss_mlp": 1.03682685, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 2.7439070637520064, + "language_loss": 0.81423092, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83558643, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 10875, + "time_per_iteration": 2.4869980812072754 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.02635086, + "balance_loss_mlp": 1.03504491, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.6810546720157804, + "language_loss": 0.70045686, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72186041, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10876, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.02289736, + "balance_loss_mlp": 1.03575683, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 1.8217122109555075, + "language_loss": 0.7932229, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81460166, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10877, + "time_per_iteration": 2.402308940887451 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01635098, + "balance_loss_mlp": 1.0359726, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 2.189241924086369, + "language_loss": 0.7987535, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82006603, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10878, + "time_per_iteration": 2.4780471324920654 + }, + { + "auxiliary_loss_clip": 0.01100458, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.0154438, + "balance_loss_mlp": 1.03370023, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 1.8666542226247762, + "language_loss": 0.84453034, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86581039, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 10879, + "time_per_iteration": 2.4143741130828857 + }, + { + "auxiliary_loss_clip": 0.01104945, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01845217, + "balance_loss_mlp": 1.03493488, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.8543762869506004, + "language_loss": 0.71946406, + "learning_rate": 1.128800362199601e-06, + "loss": 0.74081963, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 10880, + "time_per_iteration": 2.430192708969116 + }, + { + "auxiliary_loss_clip": 0.01100358, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01899612, + "balance_loss_mlp": 1.03472471, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.7514865003733433, + "language_loss": 0.84385759, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86516607, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 10881, + "time_per_iteration": 2.4801900386810303 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.01909828, + "balance_loss_mlp": 1.03612447, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.8305344772437837, + "language_loss": 0.77706677, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79844439, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 10882, + "time_per_iteration": 2.4523637294769287 + }, + { + "auxiliary_loss_clip": 0.01106717, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.01632619, + "balance_loss_mlp": 1.03733766, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.6779149142362604, + "language_loss": 0.82394373, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84530222, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10883, + "time_per_iteration": 2.4265058040618896 + }, + { + "auxiliary_loss_clip": 0.01108268, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.02163935, + "balance_loss_mlp": 1.0390712, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.382020741579914, + "language_loss": 0.85506725, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87649274, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10884, + "time_per_iteration": 2.4697301387786865 + }, + { + "auxiliary_loss_clip": 0.0110574, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.02454472, + "balance_loss_mlp": 1.03698301, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 10.527351582586146, + "language_loss": 0.80486369, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82628304, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 10885, + "time_per_iteration": 3.9415979385375977 + }, + { + "auxiliary_loss_clip": 0.01103256, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.01527846, + "balance_loss_mlp": 1.03751159, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.8132591830137343, + "language_loss": 0.72155404, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74285644, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10886, + "time_per_iteration": 2.474519729614258 + }, + { + "auxiliary_loss_clip": 0.01101162, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.01850116, + "balance_loss_mlp": 1.03500915, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.738538225206424, + "language_loss": 0.78089505, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80220145, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 10887, + "time_per_iteration": 2.4567511081695557 + }, + { + "auxiliary_loss_clip": 0.01102786, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.01913893, + "balance_loss_mlp": 1.03559566, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 4.496679975000023, + "language_loss": 0.78967035, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81100464, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10888, + "time_per_iteration": 5.19985818862915 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.03421295, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5919708412571818, + "language_loss": 0.66247272, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68375087, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 10889, + "time_per_iteration": 2.5679409503936768 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.01675916, + "balance_loss_mlp": 1.03473425, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4966214179852624, + "language_loss": 0.79874986, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82006663, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10890, + "time_per_iteration": 3.9007346630096436 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01683879, + "balance_loss_mlp": 1.03386474, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.4806412573813494, + "language_loss": 0.65136874, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67268395, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 10891, + "time_per_iteration": 2.4762353897094727 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.01963139, + "balance_loss_mlp": 1.03584743, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.713176232540202, + "language_loss": 0.79329646, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81463599, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.67578125, + "step": 10892, + "time_per_iteration": 2.483430862426758 + }, + { + "auxiliary_loss_clip": 0.01108627, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.02300262, + "balance_loss_mlp": 1.03826213, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.927118370280093, + "language_loss": 0.77688205, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.79831409, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 10893, + "time_per_iteration": 2.468653440475464 + }, + { + "auxiliary_loss_clip": 0.01107027, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.01425672, + "balance_loss_mlp": 1.03698456, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6133414191995223, + "language_loss": 0.7036956, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72503132, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10894, + "time_per_iteration": 2.454615592956543 + }, + { + "auxiliary_loss_clip": 0.01106124, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.02128386, + "balance_loss_mlp": 1.03594112, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 4.213583210390945, + "language_loss": 0.63007772, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65147251, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 10895, + "time_per_iteration": 2.4314959049224854 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01881742, + "balance_loss_mlp": 1.03532076, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 3.6304048273042717, + "language_loss": 0.7897135, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81104541, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 10896, + "time_per_iteration": 2.4550769329071045 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03664804, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.4344785444999102, + "language_loss": 0.70384824, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72518563, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 10897, + "time_per_iteration": 2.478304624557495 + }, + { + "auxiliary_loss_clip": 0.01103619, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.0175761, + "balance_loss_mlp": 1.03478158, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.7387642279992266, + "language_loss": 0.75401318, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77533734, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10898, + "time_per_iteration": 2.4487948417663574 + }, + { + "auxiliary_loss_clip": 0.01105468, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02299678, + "balance_loss_mlp": 1.03773856, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.1990983943767555, + "language_loss": 0.73518318, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75658637, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10899, + "time_per_iteration": 2.4304370880126953 + }, + { + "auxiliary_loss_clip": 0.0110359, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.01528871, + "balance_loss_mlp": 1.0364536, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.1275272720256293, + "language_loss": 0.55958188, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58088267, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10900, + "time_per_iteration": 2.484473943710327 + }, + { + "auxiliary_loss_clip": 0.01104316, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.01939833, + "balance_loss_mlp": 1.03653932, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.8846923286778847, + "language_loss": 0.76933706, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79070109, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 10901, + "time_per_iteration": 2.4382822513580322 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01434255, + "balance_loss_mlp": 1.03516734, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.753856944987035, + "language_loss": 0.73216426, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75344282, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 10902, + "time_per_iteration": 2.49745774269104 + }, + { + "auxiliary_loss_clip": 0.01102831, + "auxiliary_loss_mlp": 0.01026395, + "balance_loss_clip": 1.01536822, + "balance_loss_mlp": 1.03652823, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.6638199342391367, + "language_loss": 0.67729247, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69858468, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10903, + "time_per_iteration": 2.457672595977783 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.02201295, + "balance_loss_mlp": 1.03555727, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.8258125512154932, + "language_loss": 0.66961503, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.6910224, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10904, + "time_per_iteration": 2.544079065322876 + }, + { + "auxiliary_loss_clip": 0.01103937, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.02261209, + "balance_loss_mlp": 1.03435302, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.9965123681804708, + "language_loss": 0.90475762, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92615068, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10905, + "time_per_iteration": 2.4607133865356445 + }, + { + "auxiliary_loss_clip": 0.01100631, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01922941, + "balance_loss_mlp": 1.03564942, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 2.060465882995779, + "language_loss": 0.75227022, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77358085, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10906, + "time_per_iteration": 2.489344358444214 + }, + { + "auxiliary_loss_clip": 0.01108555, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02571845, + "balance_loss_mlp": 1.03796065, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.673517900647209, + "language_loss": 0.74337453, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76483834, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 10907, + "time_per_iteration": 2.4216673374176025 + }, + { + "auxiliary_loss_clip": 0.01102218, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.01732743, + "balance_loss_mlp": 1.03423953, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4101718899089066, + "language_loss": 0.72367519, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74500179, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 10908, + "time_per_iteration": 2.531003952026367 + }, + { + "auxiliary_loss_clip": 0.01105598, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.01962876, + "balance_loss_mlp": 1.03744864, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.1513013799426868, + "language_loss": 0.81017995, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83156013, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10909, + "time_per_iteration": 2.4130208492279053 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.02167928, + "balance_loss_mlp": 1.0363667, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.0430689174515098, + "language_loss": 0.63840532, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.65980697, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 10910, + "time_per_iteration": 2.4513769149780273 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.0173471, + "balance_loss_mlp": 1.03761017, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 3.983049569871041, + "language_loss": 0.76120275, + "learning_rate": 1.117948625548313e-06, + "loss": 0.78261906, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 10911, + "time_per_iteration": 2.421567440032959 + }, + { + "auxiliary_loss_clip": 0.01098552, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.01696563, + "balance_loss_mlp": 1.03389096, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 2.6100669832011048, + "language_loss": 0.75670731, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77797198, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 10912, + "time_per_iteration": 2.4657318592071533 + }, + { + "auxiliary_loss_clip": 0.01111745, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02153468, + "balance_loss_mlp": 1.03876007, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.5787420401710588, + "language_loss": 0.77322382, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79468495, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 10913, + "time_per_iteration": 2.4153146743774414 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.01626611, + "balance_loss_mlp": 1.03441024, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.9167212276506074, + "language_loss": 0.70828009, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.72954357, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65234375, + "step": 10914, + "time_per_iteration": 2.4597549438476562 + }, + { + "auxiliary_loss_clip": 0.01103262, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.01875639, + "balance_loss_mlp": 1.03628445, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.851270541448462, + "language_loss": 0.73936331, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76070333, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 10915, + "time_per_iteration": 2.4307053089141846 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.02053595, + "balance_loss_mlp": 1.034675, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.6584707758046542, + "language_loss": 0.79572797, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.8170594, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 10916, + "time_per_iteration": 2.4956743717193604 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.02357495, + "balance_loss_mlp": 1.03500533, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 1.9383516308380546, + "language_loss": 0.76153994, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78291869, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10917, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.01101411, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.01656687, + "balance_loss_mlp": 1.03484607, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.8282774447422543, + "language_loss": 0.69401765, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71531153, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10918, + "time_per_iteration": 2.500551462173462 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.03552103, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.7922194863374643, + "language_loss": 0.76487136, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 10919, + "time_per_iteration": 2.4698684215545654 + }, + { + "auxiliary_loss_clip": 0.01026665, + "auxiliary_loss_mlp": 0.01002269, + "balance_loss_clip": 1.00099361, + "balance_loss_mlp": 1.00584173, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7207598722602275, + "language_loss": 0.5307852, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55107456, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.20898438, + "step": 10920, + "time_per_iteration": 3.0821664333343506 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01682281, + "balance_loss_mlp": 1.03579378, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.482616976222016, + "language_loss": 0.65204817, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.6733548, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10921, + "time_per_iteration": 2.515620231628418 + }, + { + "auxiliary_loss_clip": 0.01100913, + "auxiliary_loss_mlp": 0.0103275, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03486526, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.8313420178351358, + "language_loss": 0.81071579, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83205247, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66015625, + "step": 10922, + "time_per_iteration": 2.454880714416504 + }, + { + "auxiliary_loss_clip": 0.01105049, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.02074158, + "balance_loss_mlp": 1.03742886, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.376840972990548, + "language_loss": 0.71632755, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73770583, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10923, + "time_per_iteration": 2.5216050148010254 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.01902556, + "balance_loss_mlp": 1.03784943, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.9379255151150183, + "language_loss": 0.80668283, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82805216, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10924, + "time_per_iteration": 2.420976400375366 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01838779, + "balance_loss_mlp": 1.03694773, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.6223500631493692, + "language_loss": 0.72360754, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74493784, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 10925, + "time_per_iteration": 2.48442006111145 + }, + { + "auxiliary_loss_clip": 0.011034, + "auxiliary_loss_mlp": 0.01026622, + "balance_loss_clip": 1.01515365, + "balance_loss_mlp": 1.03553128, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.3888033375770266, + "language_loss": 0.72365135, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74495161, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 10926, + "time_per_iteration": 2.426408052444458 + }, + { + "auxiliary_loss_clip": 0.01104746, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.01850533, + "balance_loss_mlp": 1.03482258, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.681586343154767, + "language_loss": 0.72273743, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74409401, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10927, + "time_per_iteration": 3.80648136138916 + }, + { + "auxiliary_loss_clip": 0.01026322, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.00027585, + "balance_loss_mlp": 1.00539577, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7330380682962492, + "language_loss": 0.64455849, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66483754, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.20898438, + "step": 10928, + "time_per_iteration": 3.092785120010376 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01669884, + "balance_loss_mlp": 1.03520453, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.7549487521997071, + "language_loss": 0.77955842, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80085671, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 10929, + "time_per_iteration": 4.023591041564941 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.0353775, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.6371374390238511, + "language_loss": 0.65487254, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67621183, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 10930, + "time_per_iteration": 3.8790106773376465 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.01679564, + "balance_loss_mlp": 1.03432441, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.5199914554797245, + "language_loss": 0.70439506, + "learning_rate": 1.110964538515258e-06, + "loss": 0.72570413, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10931, + "time_per_iteration": 3.8428475856781006 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02384758, + "balance_loss_mlp": 1.03632128, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.0176400266990147, + "language_loss": 0.68914682, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.71055984, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 10932, + "time_per_iteration": 2.427386999130249 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.01668537, + "balance_loss_mlp": 1.03434443, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 2.41406977097007, + "language_loss": 0.80051857, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82180607, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10933, + "time_per_iteration": 2.5989818572998047 + }, + { + "auxiliary_loss_clip": 0.01106278, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.01818609, + "balance_loss_mlp": 1.03827631, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.7962352646576603, + "language_loss": 0.73653376, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75789738, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10934, + "time_per_iteration": 2.513033390045166 + }, + { + "auxiliary_loss_clip": 0.01101364, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.01991725, + "balance_loss_mlp": 1.03564167, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.5095272560756583, + "language_loss": 0.7590912, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78041971, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 10935, + "time_per_iteration": 2.7678496837615967 + }, + { + "auxiliary_loss_clip": 0.0110481, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02380824, + "balance_loss_mlp": 1.03610992, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.8441545252151383, + "language_loss": 0.78123999, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.8026585, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 10936, + "time_per_iteration": 2.5077192783355713 + }, + { + "auxiliary_loss_clip": 0.01099758, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.01789331, + "balance_loss_mlp": 1.03462768, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 2.0488788051519777, + "language_loss": 0.68872631, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71001554, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10937, + "time_per_iteration": 2.5001776218414307 + }, + { + "auxiliary_loss_clip": 0.01102833, + "auxiliary_loss_mlp": 0.01027511, + "balance_loss_clip": 1.01570368, + "balance_loss_mlp": 1.03619266, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.29220645619057, + "language_loss": 0.68323117, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70453459, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10938, + "time_per_iteration": 2.4366493225097656 + }, + { + "auxiliary_loss_clip": 0.01105738, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01890206, + "balance_loss_mlp": 1.03749824, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 2.075102589417424, + "language_loss": 0.71458369, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73595071, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 10939, + "time_per_iteration": 2.4688596725463867 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.01679969, + "balance_loss_mlp": 1.03710163, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.217107584857945, + "language_loss": 0.77532256, + "learning_rate": 1.107826092473037e-06, + "loss": 0.7966767, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10940, + "time_per_iteration": 2.425093412399292 + }, + { + "auxiliary_loss_clip": 0.0110778, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01946735, + "balance_loss_mlp": 1.03589988, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 2.046264853980575, + "language_loss": 0.68482137, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70621532, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 10941, + "time_per_iteration": 2.5489418506622314 + }, + { + "auxiliary_loss_clip": 0.01100409, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01428187, + "balance_loss_mlp": 1.03322697, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.8711951914026155, + "language_loss": 0.68390548, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70516968, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10942, + "time_per_iteration": 2.486746072769165 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02216208, + "balance_loss_mlp": 1.03639185, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.0678514729005544, + "language_loss": 0.71317995, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73462105, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 10943, + "time_per_iteration": 2.4520316123962402 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.01820219, + "balance_loss_mlp": 1.03616333, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.7679689812851298, + "language_loss": 0.59513438, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61645675, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10944, + "time_per_iteration": 2.5190436840057373 + }, + { + "auxiliary_loss_clip": 0.01108265, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.02253008, + "balance_loss_mlp": 1.03664446, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.558618410146096, + "language_loss": 0.72308242, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74451864, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 10945, + "time_per_iteration": 2.463829755783081 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.01497078, + "balance_loss_mlp": 1.03548717, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.6917792730430523, + "language_loss": 0.70766807, + "learning_rate": 1.105735316926046e-06, + "loss": 0.7289511, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 10946, + "time_per_iteration": 2.6370081901550293 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.0187701, + "balance_loss_mlp": 1.03649974, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.9998217553522297, + "language_loss": 0.81970888, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84106112, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10947, + "time_per_iteration": 2.44291090965271 + }, + { + "auxiliary_loss_clip": 0.01105119, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03552985, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5893547671126769, + "language_loss": 0.77298671, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79432225, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 10948, + "time_per_iteration": 2.52156400680542 + }, + { + "auxiliary_loss_clip": 0.01103491, + "auxiliary_loss_mlp": 0.01024697, + "balance_loss_clip": 1.01318693, + "balance_loss_mlp": 1.03675056, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.5781773720774923, + "language_loss": 0.79309839, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81438029, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 10949, + "time_per_iteration": 2.4466731548309326 + }, + { + "auxiliary_loss_clip": 0.01025722, + "auxiliary_loss_mlp": 0.00999404, + "balance_loss_clip": 0.99824774, + "balance_loss_mlp": 1.0049113, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7326202101084998, + "language_loss": 0.61823738, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63848865, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.20800781, + "step": 10950, + "time_per_iteration": 3.121711015701294 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.0340389, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.039519263453104, + "language_loss": 0.67086935, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69214934, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 10951, + "time_per_iteration": 2.4204366207122803 + }, + { + "auxiliary_loss_clip": 0.01103981, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01967382, + "balance_loss_mlp": 1.03702927, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.3948057696634335, + "language_loss": 0.76445824, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.7858094, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10952, + "time_per_iteration": 2.5405352115631104 + }, + { + "auxiliary_loss_clip": 0.01101736, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.0193491, + "balance_loss_mlp": 1.03628421, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.8480440869895376, + "language_loss": 0.73304069, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75436854, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10953, + "time_per_iteration": 2.4275546073913574 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.02360368, + "balance_loss_mlp": 1.0364027, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.01659222308535, + "language_loss": 0.78839052, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80978262, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 10954, + "time_per_iteration": 2.515486478805542 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.01928055, + "balance_loss_mlp": 1.0370729, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 4.542628698192554, + "language_loss": 0.69261253, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71397316, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10955, + "time_per_iteration": 2.4162137508392334 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.02065945, + "balance_loss_mlp": 1.03447628, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 1.9435823457200367, + "language_loss": 0.8063699, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82767057, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 10956, + "time_per_iteration": 2.501207113265991 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.02636909, + "balance_loss_mlp": 1.03677917, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.2587354412030365, + "language_loss": 0.8126533, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83407611, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 10957, + "time_per_iteration": 2.4624950885772705 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01656473, + "balance_loss_mlp": 1.03620899, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.8981628531368988, + "language_loss": 0.76096463, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78225374, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 10958, + "time_per_iteration": 2.6494197845458984 + }, + { + "auxiliary_loss_clip": 0.01101191, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01818335, + "balance_loss_mlp": 1.03651094, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.5449360693578584, + "language_loss": 0.7480197, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.76932859, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 10959, + "time_per_iteration": 2.427396535873413 + }, + { + "auxiliary_loss_clip": 0.01102895, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.03627992, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.5048251142631304, + "language_loss": 0.64632499, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66762793, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10960, + "time_per_iteration": 2.4602410793304443 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01920843, + "balance_loss_mlp": 1.03767896, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.0928832268916064, + "language_loss": 0.81810492, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83950472, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 10961, + "time_per_iteration": 2.409662961959839 + }, + { + "auxiliary_loss_clip": 0.01105671, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.01721096, + "balance_loss_mlp": 1.03837204, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.6316286919602636, + "language_loss": 0.73185778, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.7532025, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10962, + "time_per_iteration": 2.5012168884277344 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.016325, + "balance_loss_mlp": 1.03553998, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 2.292666509682468, + "language_loss": 0.7991221, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.8204354, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 10963, + "time_per_iteration": 2.4411072731018066 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.01545739, + "balance_loss_mlp": 1.03585351, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.6740266575713383, + "language_loss": 0.78245199, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.8037318, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10964, + "time_per_iteration": 2.5599732398986816 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.02130592, + "balance_loss_mlp": 1.03435874, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.7118472944354244, + "language_loss": 0.74207413, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76342809, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6875, + "step": 10965, + "time_per_iteration": 2.471712112426758 + }, + { + "auxiliary_loss_clip": 0.01107005, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.01877689, + "balance_loss_mlp": 1.03634071, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 1.7378396373661993, + "language_loss": 0.73264408, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75402158, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 10966, + "time_per_iteration": 2.436239004135132 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.01798916, + "balance_loss_mlp": 1.03512931, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.8788551125386406, + "language_loss": 0.77065092, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79197645, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10967, + "time_per_iteration": 2.4717586040496826 + }, + { + "auxiliary_loss_clip": 0.01027072, + "auxiliary_loss_mlp": 0.01001789, + "balance_loss_clip": 1.00063896, + "balance_loss_mlp": 1.00624704, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6916872612313274, + "language_loss": 0.48437804, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50466669, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20898438, + "step": 10968, + "time_per_iteration": 4.5336384773254395 + }, + { + "auxiliary_loss_clip": 0.01103459, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.01853621, + "balance_loss_mlp": 1.03579104, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.9909395686766433, + "language_loss": 0.79144681, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10969, + "time_per_iteration": 2.4394266605377197 + }, + { + "auxiliary_loss_clip": 0.01101468, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.0165081, + "balance_loss_mlp": 1.03489542, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 1.9980021115439661, + "language_loss": 0.65425408, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.6755445, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 10970, + "time_per_iteration": 2.421241521835327 + }, + { + "auxiliary_loss_clip": 0.01103326, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0361867, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.9709453771316594, + "language_loss": 0.76396167, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78525639, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10971, + "time_per_iteration": 5.2941343784332275 + }, + { + "auxiliary_loss_clip": 0.01105265, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.02074528, + "balance_loss_mlp": 1.03658032, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.436761152631742, + "language_loss": 0.70031983, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72169238, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10972, + "time_per_iteration": 2.434751033782959 + }, + { + "auxiliary_loss_clip": 0.01104063, + "auxiliary_loss_mlp": 0.01026316, + "balance_loss_clip": 1.01401901, + "balance_loss_mlp": 1.03706002, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 2.0552877724786347, + "language_loss": 0.55426097, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.5755648, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 10973, + "time_per_iteration": 3.9870107173919678 + }, + { + "auxiliary_loss_clip": 0.01108369, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0224663, + "balance_loss_mlp": 1.0379895, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 1.9173473771897223, + "language_loss": 0.78754056, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.80896568, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.703125, + "step": 10974, + "time_per_iteration": 2.413245916366577 + }, + { + "auxiliary_loss_clip": 0.01104385, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01999879, + "balance_loss_mlp": 1.03666687, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.1994599169674016, + "language_loss": 0.69061923, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71197647, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10975, + "time_per_iteration": 2.488288164138794 + }, + { + "auxiliary_loss_clip": 0.01101915, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.01519537, + "balance_loss_mlp": 1.03476441, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.699075414788055, + "language_loss": 0.7082206, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72950304, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 10976, + "time_per_iteration": 2.4436802864074707 + }, + { + "auxiliary_loss_clip": 0.01099428, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01890898, + "balance_loss_mlp": 1.03462744, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.7471383506629494, + "language_loss": 0.6767379, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69804019, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 10977, + "time_per_iteration": 2.4598448276519775 + }, + { + "auxiliary_loss_clip": 0.01108053, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.01457834, + "balance_loss_mlp": 1.03748345, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.0162776681697476, + "language_loss": 0.81473112, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83608478, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 10978, + "time_per_iteration": 2.4228336811065674 + }, + { + "auxiliary_loss_clip": 0.01107046, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.02570164, + "balance_loss_mlp": 1.03726959, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 3.1339976235635527, + "language_loss": 0.6725859, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69403446, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10979, + "time_per_iteration": 2.450756549835205 + }, + { + "auxiliary_loss_clip": 0.01102975, + "auxiliary_loss_mlp": 0.01023928, + "balance_loss_clip": 1.01215005, + "balance_loss_mlp": 1.0349319, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.827162971921963, + "language_loss": 0.72720212, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.74847114, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10980, + "time_per_iteration": 2.406029462814331 + }, + { + "auxiliary_loss_clip": 0.01098591, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.02074146, + "balance_loss_mlp": 1.03450036, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.5521957632844796, + "language_loss": 0.72807193, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74937057, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 10981, + "time_per_iteration": 2.5201127529144287 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.02119243, + "balance_loss_mlp": 1.03583837, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 1.966625481577904, + "language_loss": 0.69085824, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71223581, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10982, + "time_per_iteration": 2.5098371505737305 + }, + { + "auxiliary_loss_clip": 0.01101832, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.01473927, + "balance_loss_mlp": 1.03688765, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.50117340695368, + "language_loss": 0.69566637, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71694636, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10983, + "time_per_iteration": 2.4642090797424316 + }, + { + "auxiliary_loss_clip": 0.01103785, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.01571345, + "balance_loss_mlp": 1.03580856, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.6650782937776725, + "language_loss": 0.70871687, + "learning_rate": 1.092522205413239e-06, + "loss": 0.73002636, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 10984, + "time_per_iteration": 2.545948028564453 + }, + { + "auxiliary_loss_clip": 0.01099312, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02120149, + "balance_loss_mlp": 1.03464043, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.583849922965693, + "language_loss": 0.83839536, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85971612, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 10985, + "time_per_iteration": 2.5026867389678955 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.02026308, + "balance_loss_mlp": 1.03746915, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.805092368411813, + "language_loss": 0.73806614, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.75944197, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10986, + "time_per_iteration": 2.4697890281677246 + }, + { + "auxiliary_loss_clip": 0.01100417, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01450825, + "balance_loss_mlp": 1.03609347, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.6327019217005077, + "language_loss": 0.78796637, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.80923104, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 10987, + "time_per_iteration": 2.417971611022949 + }, + { + "auxiliary_loss_clip": 0.01026194, + "auxiliary_loss_mlp": 0.01004542, + "balance_loss_clip": 1.0033257, + "balance_loss_mlp": 1.00560772, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8165641821952351, + "language_loss": 0.54130733, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56161469, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20605469, + "step": 10988, + "time_per_iteration": 3.158214807510376 + }, + { + "auxiliary_loss_clip": 0.01103971, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.01902199, + "balance_loss_mlp": 1.03813577, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.5008723881290433, + "language_loss": 0.77463698, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79597014, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 10989, + "time_per_iteration": 2.531778573989868 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.02086616, + "balance_loss_mlp": 1.0393579, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.9100821463598359, + "language_loss": 0.77224958, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7936244, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 10990, + "time_per_iteration": 2.393866539001465 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.0154345, + "balance_loss_mlp": 1.03490543, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 1.959228938394804, + "language_loss": 0.60573477, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62704802, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 10991, + "time_per_iteration": 2.421860933303833 + }, + { + "auxiliary_loss_clip": 0.01106108, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.01960719, + "balance_loss_mlp": 1.03634095, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.508745269820261, + "language_loss": 0.68313217, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70451266, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10992, + "time_per_iteration": 2.438251495361328 + }, + { + "auxiliary_loss_clip": 0.01105003, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.0152688, + "balance_loss_mlp": 1.03565395, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 2.0506508317322036, + "language_loss": 0.87773001, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89905262, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 10993, + "time_per_iteration": 2.4813613891601562 + }, + { + "auxiliary_loss_clip": 0.01109842, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01642656, + "balance_loss_mlp": 1.03765821, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.679596565938276, + "language_loss": 0.66940713, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69080102, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 10994, + "time_per_iteration": 2.464946985244751 + }, + { + "auxiliary_loss_clip": 0.0110627, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.01818299, + "balance_loss_mlp": 1.03735578, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.7600806197216516, + "language_loss": 0.76505876, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78642476, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 10995, + "time_per_iteration": 2.443978786468506 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.01477861, + "balance_loss_mlp": 1.03649068, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.907480349708707, + "language_loss": 0.74543679, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76673216, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 10996, + "time_per_iteration": 2.437030076980591 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.01963055, + "balance_loss_mlp": 1.03681195, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.9556097783969382, + "language_loss": 0.68673009, + "learning_rate": 1.088013301487126e-06, + "loss": 0.70807999, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10997, + "time_per_iteration": 2.4747731685638428 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.01583838, + "balance_loss_mlp": 1.03762627, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 1.9530622490500587, + "language_loss": 0.68974924, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.71109343, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.69921875, + "step": 10998, + "time_per_iteration": 2.407527208328247 + }, + { + "auxiliary_loss_clip": 0.01026246, + "auxiliary_loss_mlp": 0.0100257, + "balance_loss_clip": 1.00145519, + "balance_loss_mlp": 1.00553703, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6545620134591473, + "language_loss": 0.5117774, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53206557, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.20703125, + "step": 10999, + "time_per_iteration": 3.0084383487701416 + }, + { + "auxiliary_loss_clip": 0.01105663, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.01696038, + "balance_loss_mlp": 1.03627193, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.367912839089916, + "language_loss": 0.71249658, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73383313, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 11000, + "time_per_iteration": 2.426126480102539 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.02214289, + "balance_loss_mlp": 1.03528404, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.5935854519622277, + "language_loss": 0.65334332, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67467409, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11001, + "time_per_iteration": 2.586193323135376 + }, + { + "auxiliary_loss_clip": 0.01103282, + "auxiliary_loss_mlp": 0.01027047, + "balance_loss_clip": 1.01568055, + "balance_loss_mlp": 1.03593278, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.922146655127119, + "language_loss": 0.73242342, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75372672, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 11002, + "time_per_iteration": 2.4588327407836914 + }, + { + "auxiliary_loss_clip": 0.01101069, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.02064466, + "balance_loss_mlp": 1.03539062, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 2.0738499312562215, + "language_loss": 0.78606766, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.80740356, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11003, + "time_per_iteration": 2.470768928527832 + }, + { + "auxiliary_loss_clip": 0.01105808, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.02114952, + "balance_loss_mlp": 1.03701353, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.8055156139018678, + "language_loss": 0.68872547, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71012425, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11004, + "time_per_iteration": 2.4174275398254395 + }, + { + "auxiliary_loss_clip": 0.01105956, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02177572, + "balance_loss_mlp": 1.0356009, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.2557237333346687, + "language_loss": 0.69553763, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71694571, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 11005, + "time_per_iteration": 2.482177495956421 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01488411, + "balance_loss_mlp": 1.03609776, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.5704694842406037, + "language_loss": 0.78232396, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80360937, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11006, + "time_per_iteration": 2.4723048210144043 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.03596103, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5007948972384493, + "language_loss": 0.75993907, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78127748, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 11007, + "time_per_iteration": 2.4790470600128174 + }, + { + "auxiliary_loss_clip": 0.01105175, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01685643, + "balance_loss_mlp": 1.0384593, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.9253644062666073, + "language_loss": 0.78290129, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80423415, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11008, + "time_per_iteration": 2.4340806007385254 + }, + { + "auxiliary_loss_clip": 0.0110631, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.01736474, + "balance_loss_mlp": 1.03573239, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.8127446377472742, + "language_loss": 0.81780791, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83917022, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 11009, + "time_per_iteration": 2.4623091220855713 + }, + { + "auxiliary_loss_clip": 0.01026257, + "auxiliary_loss_mlp": 0.01005514, + "balance_loss_clip": 1.0043757, + "balance_loss_mlp": 1.00541437, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9788733414804485, + "language_loss": 0.67425871, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69457638, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20898438, + "step": 11010, + "time_per_iteration": 4.397435188293457 + }, + { + "auxiliary_loss_clip": 0.01104702, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01645815, + "balance_loss_mlp": 1.03598547, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.7882832526705355, + "language_loss": 0.71199936, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73333406, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11011, + "time_per_iteration": 2.4273481369018555 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.02204967, + "balance_loss_mlp": 1.03780639, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.7492667107704147, + "language_loss": 0.72528613, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74665654, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11012, + "time_per_iteration": 2.467482566833496 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.02084911, + "balance_loss_mlp": 1.03615665, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.7384195449369746, + "language_loss": 0.795021, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8163144, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 11013, + "time_per_iteration": 3.923494577407837 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01767898, + "balance_loss_mlp": 1.03644931, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 1.886371512022625, + "language_loss": 0.7088536, + "learning_rate": 1.082125865538971e-06, + "loss": 0.73017514, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11014, + "time_per_iteration": 2.439049482345581 + }, + { + "auxiliary_loss_clip": 0.01100918, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.02039468, + "balance_loss_mlp": 1.03656077, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 2.1131368988088504, + "language_loss": 0.76709092, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78840733, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 11015, + "time_per_iteration": 3.900524616241455 + }, + { + "auxiliary_loss_clip": 0.01101265, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.01506257, + "balance_loss_mlp": 1.03580058, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.7610046970273479, + "language_loss": 0.82307482, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.8443557, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 11016, + "time_per_iteration": 2.4373061656951904 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.01933837, + "balance_loss_mlp": 1.03373432, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 1.888497767792011, + "language_loss": 0.6969018, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71824282, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 11017, + "time_per_iteration": 2.4477572441101074 + }, + { + "auxiliary_loss_clip": 0.0110184, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02280068, + "balance_loss_mlp": 1.03520179, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.7526472003474178, + "language_loss": 0.77214134, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79350269, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11018, + "time_per_iteration": 2.6970436573028564 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.0244143, + "balance_loss_mlp": 1.03411186, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 1.9966965859861308, + "language_loss": 0.83007133, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85143745, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 11019, + "time_per_iteration": 2.429482936859131 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01789916, + "balance_loss_mlp": 1.0355196, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.956066495989637, + "language_loss": 0.71813512, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73942614, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11020, + "time_per_iteration": 2.4736745357513428 + }, + { + "auxiliary_loss_clip": 0.01107397, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.01639438, + "balance_loss_mlp": 1.03652906, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.7164682336590185, + "language_loss": 0.72276735, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74413311, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 11021, + "time_per_iteration": 2.477529525756836 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.02115703, + "balance_loss_mlp": 1.03667212, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.3400531031028873, + "language_loss": 0.83128953, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85265589, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11022, + "time_per_iteration": 2.507936716079712 + }, + { + "auxiliary_loss_clip": 0.0110951, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01896095, + "balance_loss_mlp": 1.03662038, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.599884159549939, + "language_loss": 0.73365414, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.75506973, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 11023, + "time_per_iteration": 2.4137043952941895 + }, + { + "auxiliary_loss_clip": 0.0110089, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.0178678, + "balance_loss_mlp": 1.03488147, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.7959862106394333, + "language_loss": 0.74551922, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.76681882, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11024, + "time_per_iteration": 2.475996255874634 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01788878, + "balance_loss_mlp": 1.0374223, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.1748664614868214, + "language_loss": 0.69700897, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71836132, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 11025, + "time_per_iteration": 2.4363040924072266 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.02345753, + "balance_loss_mlp": 1.03844023, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.7368551034909252, + "language_loss": 0.78647238, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.8078779, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 11026, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01807094, + "balance_loss_mlp": 1.03734887, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.6695781674460857, + "language_loss": 0.7642892, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78561032, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11027, + "time_per_iteration": 2.4259533882141113 + }, + { + "auxiliary_loss_clip": 0.01104358, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.03656745, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.1060132685452335, + "language_loss": 0.69903147, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72043025, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 11028, + "time_per_iteration": 2.4627115726470947 + }, + { + "auxiliary_loss_clip": 0.01102349, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.02125263, + "balance_loss_mlp": 1.03578711, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 1.8773152280466259, + "language_loss": 0.7926842, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.8140226, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.66796875, + "step": 11029, + "time_per_iteration": 2.4524929523468018 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.01583755, + "balance_loss_mlp": 1.03504181, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.11014761642944, + "language_loss": 0.76041275, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78173411, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11030, + "time_per_iteration": 2.4383111000061035 + }, + { + "auxiliary_loss_clip": 0.01109452, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.01958811, + "balance_loss_mlp": 1.03813887, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.37714698139957, + "language_loss": 0.74753916, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.76894963, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 11031, + "time_per_iteration": 2.4041976928710938 + }, + { + "auxiliary_loss_clip": 0.01103516, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.01942098, + "balance_loss_mlp": 1.03516042, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.9220695320455494, + "language_loss": 0.74872231, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77007186, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11032, + "time_per_iteration": 2.3847768306732178 + }, + { + "auxiliary_loss_clip": 0.01102597, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01723075, + "balance_loss_mlp": 1.03578007, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.77863211463492, + "language_loss": 0.80295861, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82426751, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 11033, + "time_per_iteration": 2.4669265747070312 + }, + { + "auxiliary_loss_clip": 0.01101844, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.01801026, + "balance_loss_mlp": 1.03441966, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 2.0583190629929957, + "language_loss": 0.80057156, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82188958, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11034, + "time_per_iteration": 2.4563634395599365 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.01526368, + "balance_loss_mlp": 1.03502083, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.5719715577747368, + "language_loss": 0.75545985, + "learning_rate": 1.074867045054166e-06, + "loss": 0.7767145, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 11035, + "time_per_iteration": 2.513399600982666 + }, + { + "auxiliary_loss_clip": 0.01103249, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.0156064, + "balance_loss_mlp": 1.0342617, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.7970498153302146, + "language_loss": 0.83235633, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85366178, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 11036, + "time_per_iteration": 2.519704580307007 + }, + { + "auxiliary_loss_clip": 0.01027101, + "auxiliary_loss_mlp": 0.01001243, + "balance_loss_clip": 0.99994338, + "balance_loss_mlp": 1.00646234, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7769560833184769, + "language_loss": 0.52306348, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54334688, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.20703125, + "step": 11037, + "time_per_iteration": 3.0515010356903076 + }, + { + "auxiliary_loss_clip": 0.01103588, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02405727, + "balance_loss_mlp": 1.03591716, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.842185877925078, + "language_loss": 0.79099, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81238985, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 11038, + "time_per_iteration": 2.5139565467834473 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02515566, + "balance_loss_mlp": 1.03648806, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.8255445121908285, + "language_loss": 0.64082795, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66223598, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 11039, + "time_per_iteration": 2.623331308364868 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.03601968, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.7210825984121325, + "language_loss": 0.63687986, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.65823758, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 11040, + "time_per_iteration": 2.472255229949951 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01893258, + "balance_loss_mlp": 1.03372359, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 1.9713653362611905, + "language_loss": 0.71843195, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73973382, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11041, + "time_per_iteration": 2.4769115447998047 + }, + { + "auxiliary_loss_clip": 0.01102253, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02647161, + "balance_loss_mlp": 1.03540432, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 8.010243162338005, + "language_loss": 0.61716807, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63857865, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11042, + "time_per_iteration": 2.50669264793396 + }, + { + "auxiliary_loss_clip": 0.01105298, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.01413536, + "balance_loss_mlp": 1.03500068, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.00393235647331, + "language_loss": 0.68282115, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70414734, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 11043, + "time_per_iteration": 2.518275499343872 + }, + { + "auxiliary_loss_clip": 0.01099626, + "auxiliary_loss_mlp": 0.01024503, + "balance_loss_clip": 1.01448393, + "balance_loss_mlp": 1.03639984, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.6123860278714182, + "language_loss": 0.83758092, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.85882223, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6328125, + "step": 11044, + "time_per_iteration": 2.505173444747925 + }, + { + "auxiliary_loss_clip": 0.01102203, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.01769567, + "balance_loss_mlp": 1.03553414, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 1.9292668184213282, + "language_loss": 0.69679981, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71812069, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 11045, + "time_per_iteration": 2.4917290210723877 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.01372421, + "balance_loss_mlp": 1.03785038, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.4259906887756533, + "language_loss": 0.6473543, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66867244, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11046, + "time_per_iteration": 2.4937326908111572 + }, + { + "auxiliary_loss_clip": 0.01101037, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.01489711, + "balance_loss_mlp": 1.03506637, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.4622045705244888, + "language_loss": 0.71289897, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73417372, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11047, + "time_per_iteration": 2.6626944541931152 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01909184, + "balance_loss_mlp": 1.03837025, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.803867578656826, + "language_loss": 0.77093923, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79230267, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 11048, + "time_per_iteration": 2.3982088565826416 + }, + { + "auxiliary_loss_clip": 0.01026262, + "auxiliary_loss_mlp": 0.00999443, + "balance_loss_clip": 0.99813193, + "balance_loss_mlp": 1.00559723, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.8140473421231088, + "language_loss": 0.55041039, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57066745, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.20703125, + "step": 11049, + "time_per_iteration": 3.0340354442596436 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01654625, + "balance_loss_mlp": 1.03627372, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.7091488805060655, + "language_loss": 0.64489448, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66618788, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 11050, + "time_per_iteration": 2.5083260536193848 + }, + { + "auxiliary_loss_clip": 0.01099461, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.01615942, + "balance_loss_mlp": 1.03481436, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.10593076125299, + "language_loss": 0.78783518, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80909896, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11051, + "time_per_iteration": 2.462937116622925 + }, + { + "auxiliary_loss_clip": 0.01103355, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.01975441, + "balance_loss_mlp": 1.03752089, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.6490502352967844, + "language_loss": 0.85132825, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87267327, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11052, + "time_per_iteration": 3.808241128921509 + }, + { + "auxiliary_loss_clip": 0.01106566, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.02213919, + "balance_loss_mlp": 1.03723979, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.202728029810485, + "language_loss": 0.75382364, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.77524137, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11053, + "time_per_iteration": 2.4659061431884766 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01024974, + "balance_loss_clip": 1.0138042, + "balance_loss_mlp": 1.03446698, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.6434507479308733, + "language_loss": 0.79397607, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81521785, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11054, + "time_per_iteration": 2.4667155742645264 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.0186491, + "balance_loss_mlp": 1.03520536, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.4981555869580738, + "language_loss": 0.74050117, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76180458, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11055, + "time_per_iteration": 3.8726584911346436 + }, + { + "auxiliary_loss_clip": 0.01104209, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.02240944, + "balance_loss_mlp": 1.0363059, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.7483359396792508, + "language_loss": 0.72639185, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.74778068, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 11056, + "time_per_iteration": 3.913365364074707 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.01610804, + "balance_loss_mlp": 1.03553987, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 2.080468005748717, + "language_loss": 0.69644797, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71773851, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11057, + "time_per_iteration": 2.4554696083068848 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01761794, + "balance_loss_mlp": 1.0374651, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 2.7208836045736753, + "language_loss": 0.80084372, + "learning_rate": 1.066934663776291e-06, + "loss": 0.8221786, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11058, + "time_per_iteration": 2.4723973274230957 + }, + { + "auxiliary_loss_clip": 0.01026201, + "auxiliary_loss_mlp": 0.00999951, + "balance_loss_clip": 0.99850267, + "balance_loss_mlp": 1.00571644, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.8197408377002003, + "language_loss": 0.62637091, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64663243, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.01446533, + "router_z_loss_mlp": 0.20507812, + "step": 11059, + "time_per_iteration": 2.9666504859924316 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.0232594, + "balance_loss_mlp": 1.03411603, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.4258342030978963, + "language_loss": 0.78922415, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81056285, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 11060, + "time_per_iteration": 2.453782796859741 + }, + { + "auxiliary_loss_clip": 0.01105175, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.02221072, + "balance_loss_mlp": 1.03826928, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.8106435880803493, + "language_loss": 0.78883487, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81023228, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 11061, + "time_per_iteration": 2.4411821365356445 + }, + { + "auxiliary_loss_clip": 0.01103137, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.01606226, + "balance_loss_mlp": 1.03756928, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.176969604984505, + "language_loss": 0.57041669, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59171724, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 11062, + "time_per_iteration": 2.389374256134033 + }, + { + "auxiliary_loss_clip": 0.0110523, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.01472592, + "balance_loss_mlp": 1.03483319, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.8021007966116196, + "language_loss": 0.75658429, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.77791005, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11063, + "time_per_iteration": 2.4186158180236816 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.02248394, + "balance_loss_mlp": 1.0370208, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.3058140700355754, + "language_loss": 0.7048496, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72622377, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11064, + "time_per_iteration": 2.5101113319396973 + }, + { + "auxiliary_loss_clip": 0.01024924, + "auxiliary_loss_mlp": 0.00997873, + "balance_loss_clip": 0.9965679, + "balance_loss_mlp": 1.00459087, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8487322656758325, + "language_loss": 0.63019937, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65042734, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.203125, + "step": 11065, + "time_per_iteration": 3.006619691848755 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.01580477, + "balance_loss_mlp": 1.03731883, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.6667915109143088, + "language_loss": 0.62019926, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64150941, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11066, + "time_per_iteration": 2.468318223953247 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.01430988, + "balance_loss_mlp": 1.03479779, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.7106058760764156, + "language_loss": 0.70056629, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72186363, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11067, + "time_per_iteration": 2.524820566177368 + }, + { + "auxiliary_loss_clip": 0.0102549, + "auxiliary_loss_mlp": 0.00996129, + "balance_loss_clip": 0.99466848, + "balance_loss_mlp": 1.00504017, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9061893644507588, + "language_loss": 0.72102368, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74123991, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.20507812, + "step": 11068, + "time_per_iteration": 3.0193986892700195 + }, + { + "auxiliary_loss_clip": 0.01025049, + "auxiliary_loss_mlp": 0.00996802, + "balance_loss_clip": 0.99540693, + "balance_loss_mlp": 1.00446737, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7087489248971819, + "language_loss": 0.57800353, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59822208, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.20605469, + "step": 11069, + "time_per_iteration": 3.2124764919281006 + }, + { + "auxiliary_loss_clip": 0.0102455, + "auxiliary_loss_mlp": 0.01000321, + "balance_loss_clip": 0.99886698, + "balance_loss_mlp": 1.0041275, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7763166900295557, + "language_loss": 0.63506204, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65531075, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.20410156, + "step": 11070, + "time_per_iteration": 3.1373214721679688 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.01531315, + "balance_loss_mlp": 1.03435302, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.7781228106405071, + "language_loss": 0.58826381, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60954237, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 11071, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.01105196, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01662362, + "balance_loss_mlp": 1.03853655, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 2.462730868248946, + "language_loss": 0.72873962, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75006455, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6640625, + "step": 11072, + "time_per_iteration": 2.457197427749634 + }, + { + "auxiliary_loss_clip": 0.01103868, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01775575, + "balance_loss_mlp": 1.03849804, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 2.0960284851890183, + "language_loss": 0.70686483, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11073, + "time_per_iteration": 2.621063709259033 + }, + { + "auxiliary_loss_clip": 0.01105664, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.01962924, + "balance_loss_mlp": 1.03680611, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 21.254891604302284, + "language_loss": 0.56184697, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58321697, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11074, + "time_per_iteration": 2.417592763900757 + }, + { + "auxiliary_loss_clip": 0.01103413, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.01795745, + "balance_loss_mlp": 1.03713965, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.4493539029409879, + "language_loss": 0.72269762, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74402475, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11075, + "time_per_iteration": 2.5621016025543213 + }, + { + "auxiliary_loss_clip": 0.01099577, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.02400899, + "balance_loss_mlp": 1.03572047, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.47592254117705, + "language_loss": 0.6616652, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.6830132, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11076, + "time_per_iteration": 2.618560552597046 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.0359937, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.613817606590062, + "language_loss": 0.75271714, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77408653, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11077, + "time_per_iteration": 2.487748146057129 + }, + { + "auxiliary_loss_clip": 0.01102302, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.01956344, + "balance_loss_mlp": 1.03536868, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.863663819937869, + "language_loss": 0.66703588, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68836671, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11078, + "time_per_iteration": 2.4835610389709473 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01786661, + "balance_loss_mlp": 1.03568316, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 1.962622549544945, + "language_loss": 0.69805777, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71939325, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 11079, + "time_per_iteration": 2.4517362117767334 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01027325, + "balance_loss_clip": 1.01656055, + "balance_loss_mlp": 1.03579783, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.4504303029583365, + "language_loss": 0.80272287, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82400304, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11080, + "time_per_iteration": 2.496086835861206 + }, + { + "auxiliary_loss_clip": 0.01098572, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.0184257, + "balance_loss_mlp": 1.03518367, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.7747670262807855, + "language_loss": 0.78175783, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80303317, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11081, + "time_per_iteration": 2.4947092533111572 + }, + { + "auxiliary_loss_clip": 0.01104079, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.03641224, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.7096575045073308, + "language_loss": 0.79757982, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.81893063, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11082, + "time_per_iteration": 2.496314287185669 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02490342, + "balance_loss_mlp": 1.03489673, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.4408084093775566, + "language_loss": 0.83964407, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86100918, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11083, + "time_per_iteration": 2.441714286804199 + }, + { + "auxiliary_loss_clip": 0.0110885, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02278709, + "balance_loss_mlp": 1.03879905, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.210335279184582, + "language_loss": 0.85422742, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87566352, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 11084, + "time_per_iteration": 2.4179892539978027 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.01672292, + "balance_loss_mlp": 1.03759933, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.2800746471584135, + "language_loss": 0.73236918, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75372517, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 11085, + "time_per_iteration": 2.4865758419036865 + }, + { + "auxiliary_loss_clip": 0.01102626, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.03617859, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.9088871569878003, + "language_loss": 0.80301607, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82433486, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 11086, + "time_per_iteration": 2.444645643234253 + }, + { + "auxiliary_loss_clip": 0.01101849, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.03593099, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 2.0449845091934753, + "language_loss": 0.74311554, + "learning_rate": 1.056959663258702e-06, + "loss": 0.7644341, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11087, + "time_per_iteration": 2.483962059020996 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01872563, + "balance_loss_mlp": 1.03587329, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.5673899455217954, + "language_loss": 0.64753473, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.66886115, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11088, + "time_per_iteration": 2.4562034606933594 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.01734924, + "balance_loss_mlp": 1.03735042, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 1.8332928045753645, + "language_loss": 0.64570332, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66704261, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11089, + "time_per_iteration": 2.4386065006256104 + }, + { + "auxiliary_loss_clip": 0.01099875, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.02059364, + "balance_loss_mlp": 1.03447926, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.1527148838753236, + "language_loss": 0.80835247, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.82966793, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 11090, + "time_per_iteration": 2.394827365875244 + }, + { + "auxiliary_loss_clip": 0.01105547, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02039266, + "balance_loss_mlp": 1.03684211, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 3.4302717941928806, + "language_loss": 0.7762655, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79763907, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 11091, + "time_per_iteration": 2.4357736110687256 + }, + { + "auxiliary_loss_clip": 0.01101701, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01943851, + "balance_loss_mlp": 1.03544581, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.7415157953091596, + "language_loss": 0.79347867, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81480247, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 11092, + "time_per_iteration": 2.4493799209594727 + }, + { + "auxiliary_loss_clip": 0.01024657, + "auxiliary_loss_mlp": 0.01010054, + "balance_loss_clip": 1.00873661, + "balance_loss_mlp": 1.00410509, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7618033983707613, + "language_loss": 0.57674438, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.5970915, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20507812, + "step": 11093, + "time_per_iteration": 3.060945510864258 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01859891, + "balance_loss_mlp": 1.03614676, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 3.0734338086465733, + "language_loss": 0.76404822, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78536654, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 11094, + "time_per_iteration": 3.8702232837677 + }, + { + "auxiliary_loss_clip": 0.01102539, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.03533387, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.7962253370500996, + "language_loss": 0.73604453, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75740582, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11095, + "time_per_iteration": 2.5393593311309814 + }, + { + "auxiliary_loss_clip": 0.01104214, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03839517, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.498006768699264, + "language_loss": 0.73841417, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75976729, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11096, + "time_per_iteration": 5.295018672943115 + }, + { + "auxiliary_loss_clip": 0.01101592, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.02000785, + "balance_loss_mlp": 1.03659046, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 2.1197138558836652, + "language_loss": 0.64377868, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66510427, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 11097, + "time_per_iteration": 2.4755849838256836 + }, + { + "auxiliary_loss_clip": 0.01105023, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.01810169, + "balance_loss_mlp": 1.03657043, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.8367279267646714, + "language_loss": 0.75293523, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77428448, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11098, + "time_per_iteration": 3.8889780044555664 + }, + { + "auxiliary_loss_clip": 0.01105898, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03809619, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.6239497270406267, + "language_loss": 0.74629354, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76766318, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 11099, + "time_per_iteration": 2.499155282974243 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03467488, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.8612331810201734, + "language_loss": 0.78086853, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80220115, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 11100, + "time_per_iteration": 2.4822754859924316 + }, + { + "auxiliary_loss_clip": 0.01101826, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.02265465, + "balance_loss_mlp": 1.03608942, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 2.199541930312583, + "language_loss": 0.60234034, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62369883, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11101, + "time_per_iteration": 2.470005750656128 + }, + { + "auxiliary_loss_clip": 0.0110769, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02202892, + "balance_loss_mlp": 1.03702366, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.6927482018220132, + "language_loss": 0.711254, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73267794, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11102, + "time_per_iteration": 2.5034313201904297 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.01774669, + "balance_loss_mlp": 1.03638661, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.4777736440637246, + "language_loss": 0.84276104, + "learning_rate": 1.051469068021034e-06, + "loss": 0.8640939, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 11103, + "time_per_iteration": 2.430427074432373 + }, + { + "auxiliary_loss_clip": 0.01104082, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.01338315, + "balance_loss_mlp": 1.03620505, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.187100835254228, + "language_loss": 0.77906835, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80035502, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 11104, + "time_per_iteration": 2.431415557861328 + }, + { + "auxiliary_loss_clip": 0.01105832, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.01554644, + "balance_loss_mlp": 1.03741312, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.7588653188886298, + "language_loss": 0.58123207, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60256052, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 11105, + "time_per_iteration": 2.5778300762176514 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.01864648, + "balance_loss_mlp": 1.0369904, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.5584285162619382, + "language_loss": 0.73263156, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75401342, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 11106, + "time_per_iteration": 2.502669334411621 + }, + { + "auxiliary_loss_clip": 0.01102707, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.01568878, + "balance_loss_mlp": 1.03582263, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.612792210414072, + "language_loss": 0.77103424, + "learning_rate": 1.0500978558659e-06, + "loss": 0.7923367, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11107, + "time_per_iteration": 2.4632906913757324 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01636124, + "balance_loss_mlp": 1.03531408, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.2458320549685267, + "language_loss": 0.89908957, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92035359, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 11108, + "time_per_iteration": 2.4730093479156494 + }, + { + "auxiliary_loss_clip": 0.01099015, + "auxiliary_loss_mlp": 0.01022867, + "balance_loss_clip": 1.01253176, + "balance_loss_mlp": 1.03418517, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.3985533807105044, + "language_loss": 0.82679069, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84800953, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 11109, + "time_per_iteration": 2.580944061279297 + }, + { + "auxiliary_loss_clip": 0.01102598, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.01666141, + "balance_loss_mlp": 1.03557515, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 1.8119039289749856, + "language_loss": 0.69528979, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71660185, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11110, + "time_per_iteration": 2.5149457454681396 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.01944864, + "balance_loss_mlp": 1.03632832, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.7594532626452621, + "language_loss": 0.7338779, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75525975, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11111, + "time_per_iteration": 2.481405258178711 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.01508117, + "balance_loss_mlp": 1.03355026, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 3.2736780286979488, + "language_loss": 0.64989609, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.6711359, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 11112, + "time_per_iteration": 2.452441930770874 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.01734865, + "balance_loss_mlp": 1.03562021, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 1.7892928589109056, + "language_loss": 0.63786232, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65917462, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11113, + "time_per_iteration": 2.4086506366729736 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.02059317, + "balance_loss_mlp": 1.03552222, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8981901836856618, + "language_loss": 0.66016996, + "learning_rate": 1.047699621879422e-06, + "loss": 0.6814909, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 11114, + "time_per_iteration": 2.4347803592681885 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.02107906, + "balance_loss_mlp": 1.03480756, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.6451209195544332, + "language_loss": 0.78455061, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80589175, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11115, + "time_per_iteration": 2.478957414627075 + }, + { + "auxiliary_loss_clip": 0.01101464, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.01758313, + "balance_loss_mlp": 1.03418374, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.607299826888502, + "language_loss": 0.79468185, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81598711, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11116, + "time_per_iteration": 2.5263917446136475 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01905489, + "balance_loss_mlp": 1.03642249, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.793058561798458, + "language_loss": 0.79410267, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81546414, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 11117, + "time_per_iteration": 2.4854443073272705 + }, + { + "auxiliary_loss_clip": 0.01105696, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.01769769, + "balance_loss_mlp": 1.03675961, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 1.507325638356248, + "language_loss": 0.65411663, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67548382, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 11118, + "time_per_iteration": 2.472377300262451 + }, + { + "auxiliary_loss_clip": 0.01100857, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03583932, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 2.967647334244501, + "language_loss": 0.68711627, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70841289, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11119, + "time_per_iteration": 2.4728288650512695 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.01497746, + "balance_loss_mlp": 1.03634501, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.5996077334078893, + "language_loss": 0.66828573, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.68957436, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11120, + "time_per_iteration": 2.546515941619873 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01742911, + "balance_loss_mlp": 1.03602421, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.762800604873663, + "language_loss": 0.72149706, + "learning_rate": 1.045303157347638e-06, + "loss": 0.7428214, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 11121, + "time_per_iteration": 2.477660894393921 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.02209687, + "balance_loss_mlp": 1.0351814, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.849050741943763, + "language_loss": 0.70147824, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72285533, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11122, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.02377343, + "balance_loss_mlp": 1.03655457, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.6701786551201399, + "language_loss": 0.71671915, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73810941, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11123, + "time_per_iteration": 2.4819095134735107 + }, + { + "auxiliary_loss_clip": 0.01108577, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.02273631, + "balance_loss_mlp": 1.0392499, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.6154595834142065, + "language_loss": 0.79180294, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81324089, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 11124, + "time_per_iteration": 2.4734344482421875 + }, + { + "auxiliary_loss_clip": 0.0110496, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.02444232, + "balance_loss_mlp": 1.03757286, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.7495803882819345, + "language_loss": 0.74282473, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76423579, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11125, + "time_per_iteration": 2.444687843322754 + }, + { + "auxiliary_loss_clip": 0.01105662, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.0227133, + "balance_loss_mlp": 1.03771114, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.3220485163353035, + "language_loss": 0.66047573, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68187803, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 11126, + "time_per_iteration": 2.508352756500244 + }, + { + "auxiliary_loss_clip": 0.01100528, + "auxiliary_loss_mlp": 0.01023616, + "balance_loss_clip": 1.01235676, + "balance_loss_mlp": 1.03333449, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 3.2519975932516094, + "language_loss": 0.71248001, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73372149, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11127, + "time_per_iteration": 2.4746527671813965 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.01952958, + "balance_loss_mlp": 1.03555894, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 2.0140192417842235, + "language_loss": 0.80290639, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82429767, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 11128, + "time_per_iteration": 2.476914644241333 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.01713467, + "balance_loss_mlp": 1.03555393, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 2.0449515592271967, + "language_loss": 0.81091756, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83224577, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11129, + "time_per_iteration": 2.457526922225952 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.02288556, + "balance_loss_mlp": 1.0350548, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.6204282208074086, + "language_loss": 0.70266747, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72398651, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11130, + "time_per_iteration": 2.5508627891540527 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02409458, + "balance_loss_mlp": 1.03609157, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.5850862701658837, + "language_loss": 0.70004213, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72139168, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11131, + "time_per_iteration": 2.442675828933716 + }, + { + "auxiliary_loss_clip": 0.01103504, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.01562405, + "balance_loss_mlp": 1.03573704, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.456945083607925, + "language_loss": 0.65068108, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67200017, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 11132, + "time_per_iteration": 2.4112234115600586 + }, + { + "auxiliary_loss_clip": 0.01105597, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03693044, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.6075137482523445, + "language_loss": 0.74700105, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76840317, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11133, + "time_per_iteration": 2.4416236877441406 + }, + { + "auxiliary_loss_clip": 0.01109475, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.01720238, + "balance_loss_mlp": 1.03926897, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.3633346892670266, + "language_loss": 0.66337103, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68477046, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 11134, + "time_per_iteration": 2.4672107696533203 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.02098703, + "balance_loss_mlp": 1.03965247, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.8392889149756566, + "language_loss": 0.77132189, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79278213, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 11135, + "time_per_iteration": 2.4986488819122314 + }, + { + "auxiliary_loss_clip": 0.01101077, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01987875, + "balance_loss_mlp": 1.03573108, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.766175864119674, + "language_loss": 0.74168599, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76301408, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11136, + "time_per_iteration": 3.892975091934204 + }, + { + "auxiliary_loss_clip": 0.01108465, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0182538, + "balance_loss_mlp": 1.03819919, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.743373004526595, + "language_loss": 0.62210536, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.643498, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11137, + "time_per_iteration": 2.4584341049194336 + }, + { + "auxiliary_loss_clip": 0.01102957, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.03640008, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 2.2042949503897837, + "language_loss": 0.65724766, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.67858124, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11138, + "time_per_iteration": 4.031554460525513 + }, + { + "auxiliary_loss_clip": 0.01099165, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.02175677, + "balance_loss_mlp": 1.03467035, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.5685975938909107, + "language_loss": 0.73056483, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75188804, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 11139, + "time_per_iteration": 2.490262746810913 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.01842916, + "balance_loss_mlp": 1.0357821, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 3.192057111781844, + "language_loss": 0.70166105, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72294366, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 11140, + "time_per_iteration": 3.9318604469299316 + }, + { + "auxiliary_loss_clip": 0.01104563, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.01471996, + "balance_loss_mlp": 1.03500891, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 3.669311669995305, + "language_loss": 0.75779974, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77911294, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11141, + "time_per_iteration": 2.516190767288208 + }, + { + "auxiliary_loss_clip": 0.01103882, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.0225563, + "balance_loss_mlp": 1.03589845, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7275630939402262, + "language_loss": 0.82025433, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84164113, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11142, + "time_per_iteration": 2.477917432785034 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.0102729, + "balance_loss_clip": 1.01569653, + "balance_loss_mlp": 1.0331465, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.5656493432889642, + "language_loss": 0.70054591, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72180939, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 11143, + "time_per_iteration": 2.496873617172241 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.02056444, + "balance_loss_mlp": 1.03546476, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.5042984772488615, + "language_loss": 0.69867527, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71998119, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 11144, + "time_per_iteration": 2.498004674911499 + }, + { + "auxiliary_loss_clip": 0.01101313, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.01553071, + "balance_loss_mlp": 1.03556204, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.7755943554148508, + "language_loss": 0.74376822, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76505524, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11145, + "time_per_iteration": 2.482536554336548 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.01503921, + "balance_loss_mlp": 1.03620577, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.7672711788536422, + "language_loss": 0.70669931, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72802681, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 11146, + "time_per_iteration": 2.480379819869995 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.01786661, + "balance_loss_mlp": 1.03490484, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.8854886897083816, + "language_loss": 0.7791847, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80045938, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 11147, + "time_per_iteration": 2.4453067779541016 + }, + { + "auxiliary_loss_clip": 0.01104074, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.02177358, + "balance_loss_mlp": 1.03823161, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.9489637728749547, + "language_loss": 0.70395339, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72533029, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11148, + "time_per_iteration": 2.4539880752563477 + }, + { + "auxiliary_loss_clip": 0.0110278, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.0209502, + "balance_loss_mlp": 1.03479636, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.6874385150714277, + "language_loss": 0.70091569, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72227037, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11149, + "time_per_iteration": 2.5368881225585938 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01025738, + "balance_loss_clip": 1.01511049, + "balance_loss_mlp": 1.03425717, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.0651183620740405, + "language_loss": 0.7356782, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75695598, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6796875, + "step": 11150, + "time_per_iteration": 2.489635944366455 + }, + { + "auxiliary_loss_clip": 0.01104117, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02237308, + "balance_loss_mlp": 1.03698301, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.8066986470751747, + "language_loss": 0.7880882, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80947053, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11151, + "time_per_iteration": 2.470423698425293 + }, + { + "auxiliary_loss_clip": 0.01028384, + "auxiliary_loss_mlp": 0.01010518, + "balance_loss_clip": 1.00938594, + "balance_loss_mlp": 1.00781679, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.8172638110433008, + "language_loss": 0.55524588, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57563496, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.20507812, + "step": 11152, + "time_per_iteration": 3.123234510421753 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.03580236, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.6208942555378636, + "language_loss": 0.80739468, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82873851, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11153, + "time_per_iteration": 2.511383533477783 + }, + { + "auxiliary_loss_clip": 0.01102109, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.0203166, + "balance_loss_mlp": 1.03519535, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.5741743783633508, + "language_loss": 0.76160783, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78294122, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 11154, + "time_per_iteration": 2.453047513961792 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.02030683, + "balance_loss_mlp": 1.03847241, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.4962510781515113, + "language_loss": 0.75975895, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78116906, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 11155, + "time_per_iteration": 2.507368564605713 + }, + { + "auxiliary_loss_clip": 0.01105615, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.03825569, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.76266123008703, + "language_loss": 0.81881839, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84021568, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 11156, + "time_per_iteration": 2.467165946960449 + }, + { + "auxiliary_loss_clip": 0.0110068, + "auxiliary_loss_mlp": 0.01025682, + "balance_loss_clip": 1.01425576, + "balance_loss_mlp": 1.03438997, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.4473397037337237, + "language_loss": 0.74570251, + "learning_rate": 1.033006600114165e-06, + "loss": 0.7669661, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11157, + "time_per_iteration": 2.4674718379974365 + }, + { + "auxiliary_loss_clip": 0.01105952, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.02370262, + "balance_loss_mlp": 1.03829253, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.9350697498335474, + "language_loss": 0.7444576, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76587129, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11158, + "time_per_iteration": 2.4784538745880127 + }, + { + "auxiliary_loss_clip": 0.01108128, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.0180732, + "balance_loss_mlp": 1.0385921, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 2.077178366394848, + "language_loss": 0.81668246, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83806634, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 11159, + "time_per_iteration": 2.476008653640747 + }, + { + "auxiliary_loss_clip": 0.01102735, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.01749814, + "balance_loss_mlp": 1.0353272, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.6091286648822523, + "language_loss": 0.7708782, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79219836, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 11160, + "time_per_iteration": 2.4390769004821777 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.01520884, + "balance_loss_mlp": 1.03558326, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.9005005299223583, + "language_loss": 0.73766249, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.7589463, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11161, + "time_per_iteration": 2.5078043937683105 + }, + { + "auxiliary_loss_clip": 0.01105932, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.0230794, + "balance_loss_mlp": 1.03523338, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.6945637244101817, + "language_loss": 0.67987847, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70129251, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 11162, + "time_per_iteration": 2.5096116065979004 + }, + { + "auxiliary_loss_clip": 0.01102024, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.02136803, + "balance_loss_mlp": 1.03582597, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.8281474305298504, + "language_loss": 0.70357502, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72492194, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11163, + "time_per_iteration": 2.4419682025909424 + }, + { + "auxiliary_loss_clip": 0.01101063, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.01849425, + "balance_loss_mlp": 1.03680897, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.5206709527115365, + "language_loss": 0.75686288, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.7781713, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 11164, + "time_per_iteration": 2.540302276611328 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.01537251, + "balance_loss_mlp": 1.03613234, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 2.0013900075408424, + "language_loss": 0.64903474, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67032778, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 11165, + "time_per_iteration": 2.50164532661438 + }, + { + "auxiliary_loss_clip": 0.01101735, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.0188911, + "balance_loss_mlp": 1.03648162, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.824490258261556, + "language_loss": 0.71357495, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73489726, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 11166, + "time_per_iteration": 2.4522786140441895 + }, + { + "auxiliary_loss_clip": 0.01102027, + "auxiliary_loss_mlp": 0.0102352, + "balance_loss_clip": 1.01324987, + "balance_loss_mlp": 1.0378294, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.8136989987191092, + "language_loss": 0.77263552, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79389095, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.640625, + "step": 11167, + "time_per_iteration": 2.5255751609802246 + }, + { + "auxiliary_loss_clip": 0.01101953, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02347469, + "balance_loss_mlp": 1.03458977, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 5.373120607190098, + "language_loss": 0.69078279, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71215004, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 11168, + "time_per_iteration": 2.5593607425689697 + }, + { + "auxiliary_loss_clip": 0.01105965, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02552032, + "balance_loss_mlp": 1.036659, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 1.891897557253962, + "language_loss": 0.73191148, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.7533567, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 11169, + "time_per_iteration": 2.4835712909698486 + }, + { + "auxiliary_loss_clip": 0.01104514, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.01917934, + "balance_loss_mlp": 1.03605962, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.050492769021052, + "language_loss": 0.76193798, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78330112, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11170, + "time_per_iteration": 2.41772723197937 + }, + { + "auxiliary_loss_clip": 0.01106509, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01618147, + "balance_loss_mlp": 1.03668404, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 4.365942833040682, + "language_loss": 0.74738538, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.768731, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 11171, + "time_per_iteration": 2.443998336791992 + }, + { + "auxiliary_loss_clip": 0.01104887, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.02605891, + "balance_loss_mlp": 1.03686571, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 6.401963753530839, + "language_loss": 0.86554527, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88697314, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 11172, + "time_per_iteration": 2.449519395828247 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02077198, + "balance_loss_mlp": 1.03432322, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.5214923385952612, + "language_loss": 0.63705564, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65839112, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11173, + "time_per_iteration": 2.4728994369506836 + }, + { + "auxiliary_loss_clip": 0.01108562, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.02201891, + "balance_loss_mlp": 1.03632855, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.8011577390317584, + "language_loss": 0.71934807, + "learning_rate": 1.02721637475002e-06, + "loss": 0.74078608, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 11174, + "time_per_iteration": 2.4150753021240234 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.01791573, + "balance_loss_mlp": 1.03507197, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 1.9034241424773972, + "language_loss": 0.68639195, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70769107, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11175, + "time_per_iteration": 2.4914746284484863 + }, + { + "auxiliary_loss_clip": 0.01101682, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02143312, + "balance_loss_mlp": 1.0366466, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.444826411678876, + "language_loss": 0.73786706, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.7592091, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 11176, + "time_per_iteration": 2.4306447505950928 + }, + { + "auxiliary_loss_clip": 0.01104157, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.01596642, + "balance_loss_mlp": 1.035748, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.6959341450848686, + "language_loss": 0.72810507, + "learning_rate": 1.026195675108182e-06, + "loss": 0.74942982, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 11177, + "time_per_iteration": 2.498624086380005 + }, + { + "auxiliary_loss_clip": 0.01103405, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.01617265, + "balance_loss_mlp": 1.0354104, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.080774174197305, + "language_loss": 0.76790631, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78922629, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11178, + "time_per_iteration": 3.880969524383545 + }, + { + "auxiliary_loss_clip": 0.01105896, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.01926351, + "balance_loss_mlp": 1.0375278, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.9975121194491492, + "language_loss": 0.69893503, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72029757, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 11179, + "time_per_iteration": 2.4223077297210693 + }, + { + "auxiliary_loss_clip": 0.01102153, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03676152, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.6665783443252085, + "language_loss": 0.74105644, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76236838, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11180, + "time_per_iteration": 3.958832263946533 + }, + { + "auxiliary_loss_clip": 0.01102807, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.01544547, + "balance_loss_mlp": 1.03720415, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.5017770160927022, + "language_loss": 0.75209451, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77339292, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 11181, + "time_per_iteration": 3.970757484436035 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01025138, + "balance_loss_clip": 1.01424241, + "balance_loss_mlp": 1.03628325, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 1.9826713327422718, + "language_loss": 0.74716818, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76845884, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.67578125, + "step": 11182, + "time_per_iteration": 2.4164199829101562 + }, + { + "auxiliary_loss_clip": 0.01098753, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.0213666, + "balance_loss_mlp": 1.03483748, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.6492155923055305, + "language_loss": 0.69678056, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71808994, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11183, + "time_per_iteration": 2.4825363159179688 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.02102327, + "balance_loss_mlp": 1.0350728, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6819294722428546, + "language_loss": 0.77619171, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79753804, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 11184, + "time_per_iteration": 2.4742484092712402 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01960111, + "balance_loss_mlp": 1.03978956, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 1.9600702886656058, + "language_loss": 0.65830189, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.67975819, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 11185, + "time_per_iteration": 2.6265766620635986 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01555133, + "balance_loss_mlp": 1.03508019, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.6086008561996032, + "language_loss": 0.8077392, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82903898, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11186, + "time_per_iteration": 2.5254018306732178 + }, + { + "auxiliary_loss_clip": 0.0110242, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.02216411, + "balance_loss_mlp": 1.03798425, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.4050560740555764, + "language_loss": 0.8022958, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82364446, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.64453125, + "step": 11187, + "time_per_iteration": 2.492206335067749 + }, + { + "auxiliary_loss_clip": 0.01108961, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.02235556, + "balance_loss_mlp": 1.03917003, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.0955662178616663, + "language_loss": 0.70936477, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73080474, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11188, + "time_per_iteration": 2.4696547985076904 + }, + { + "auxiliary_loss_clip": 0.01100609, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01867819, + "balance_loss_mlp": 1.03614163, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.690433236478768, + "language_loss": 0.7567057, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77800977, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11189, + "time_per_iteration": 2.502394676208496 + }, + { + "auxiliary_loss_clip": 0.01105784, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.01342869, + "balance_loss_mlp": 1.03580916, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.0624308015957666, + "language_loss": 0.75735819, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.7786814, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 11190, + "time_per_iteration": 2.4117863178253174 + }, + { + "auxiliary_loss_clip": 0.01101643, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.01830935, + "balance_loss_mlp": 1.03503203, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.479637189299754, + "language_loss": 0.77305663, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79437912, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11191, + "time_per_iteration": 2.471383571624756 + }, + { + "auxiliary_loss_clip": 0.01101045, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.01542521, + "balance_loss_mlp": 1.03620696, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.9484073900919987, + "language_loss": 0.86244619, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88372666, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 11192, + "time_per_iteration": 2.541471481323242 + }, + { + "auxiliary_loss_clip": 0.01105869, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.02166843, + "balance_loss_mlp": 1.03793001, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 1.7778605034576032, + "language_loss": 0.76010567, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78150332, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11193, + "time_per_iteration": 2.4631118774414062 + }, + { + "auxiliary_loss_clip": 0.01104222, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03698504, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.7482449519435526, + "language_loss": 0.78450751, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80587071, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11194, + "time_per_iteration": 2.4163994789123535 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01602221, + "balance_loss_mlp": 1.03523183, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.439402985037115, + "language_loss": 0.89769554, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.91899562, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 11195, + "time_per_iteration": 2.4890894889831543 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.01964474, + "balance_loss_mlp": 1.03698754, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 3.8315256645626468, + "language_loss": 0.7259835, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74732834, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 11196, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01026268, + "auxiliary_loss_mlp": 0.00997949, + "balance_loss_clip": 0.99669737, + "balance_loss_mlp": 1.00585961, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7827982838834083, + "language_loss": 0.56530619, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58554828, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.20410156, + "step": 11197, + "time_per_iteration": 2.9888203144073486 + }, + { + "auxiliary_loss_clip": 0.01103429, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.01289546, + "balance_loss_mlp": 1.03899539, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.0080706986846635, + "language_loss": 0.75471473, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77598602, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 11198, + "time_per_iteration": 2.4266445636749268 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.01493251, + "balance_loss_mlp": 1.03564501, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.2277183364076674, + "language_loss": 0.8092168, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83052027, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 11199, + "time_per_iteration": 2.4250686168670654 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.01972258, + "balance_loss_mlp": 1.03520989, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.7815929608142598, + "language_loss": 0.71828485, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73964423, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 11200, + "time_per_iteration": 2.555952787399292 + }, + { + "auxiliary_loss_clip": 0.01106738, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.0200175, + "balance_loss_mlp": 1.03832173, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.625800733182769, + "language_loss": 0.6466803, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66806769, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11201, + "time_per_iteration": 2.8149640560150146 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.01998901, + "balance_loss_mlp": 1.03641796, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.7955061796431357, + "language_loss": 0.63162857, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65301323, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 11202, + "time_per_iteration": 2.437077045440674 + }, + { + "auxiliary_loss_clip": 0.01103951, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.01568675, + "balance_loss_mlp": 1.03587484, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.8620640282713015, + "language_loss": 0.74766082, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76896715, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 11203, + "time_per_iteration": 2.457798719406128 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01710498, + "balance_loss_mlp": 1.037606, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.7246428938805878, + "language_loss": 0.67498362, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69637865, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 11204, + "time_per_iteration": 2.4272255897521973 + }, + { + "auxiliary_loss_clip": 0.01110127, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.01991844, + "balance_loss_mlp": 1.03929329, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.5939578102801788, + "language_loss": 0.7447291, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76615399, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 11205, + "time_per_iteration": 2.4560165405273438 + }, + { + "auxiliary_loss_clip": 0.0109994, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.03492117, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.6470910861724577, + "language_loss": 0.71854442, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73985064, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 11206, + "time_per_iteration": 2.5040676593780518 + }, + { + "auxiliary_loss_clip": 0.01111631, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.02033639, + "balance_loss_mlp": 1.03923798, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 6.529945029855453, + "language_loss": 0.67127562, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69271767, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 11207, + "time_per_iteration": 2.5161397457122803 + }, + { + "auxiliary_loss_clip": 0.01102629, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.0182395, + "balance_loss_mlp": 1.03757155, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.4663080715904675, + "language_loss": 0.73317289, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.75449866, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 11208, + "time_per_iteration": 2.4350569248199463 + }, + { + "auxiliary_loss_clip": 0.01104929, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.019835, + "balance_loss_mlp": 1.03649032, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 1.8859944640341983, + "language_loss": 0.75882745, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78020674, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6875, + "step": 11209, + "time_per_iteration": 2.4393579959869385 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.01881683, + "balance_loss_mlp": 1.03629994, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.6804143170759391, + "language_loss": 0.66519487, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68649894, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 11210, + "time_per_iteration": 2.4730069637298584 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.01682603, + "balance_loss_mlp": 1.03569078, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.1720353274754154, + "language_loss": 0.79894733, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82022631, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11211, + "time_per_iteration": 2.468639850616455 + }, + { + "auxiliary_loss_clip": 0.01101219, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.01841807, + "balance_loss_mlp": 1.03608012, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.549232637169743, + "language_loss": 0.76512897, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78644192, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 11212, + "time_per_iteration": 2.478450059890747 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01728141, + "balance_loss_mlp": 1.03573346, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.6801358890975542, + "language_loss": 0.77888572, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.80022377, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 11213, + "time_per_iteration": 2.4666621685028076 + }, + { + "auxiliary_loss_clip": 0.01107053, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.0228132, + "balance_loss_mlp": 1.03760529, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 2.4257892231901765, + "language_loss": 0.67633986, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69776428, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11214, + "time_per_iteration": 2.452108860015869 + }, + { + "auxiliary_loss_clip": 0.01103571, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.02189624, + "balance_loss_mlp": 1.03610945, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.6441501997597023, + "language_loss": 0.72691011, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74828005, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11215, + "time_per_iteration": 2.615023374557495 + }, + { + "auxiliary_loss_clip": 0.01105661, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.02168989, + "balance_loss_mlp": 1.03667545, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 3.085424201902257, + "language_loss": 0.67325628, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69464171, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 11216, + "time_per_iteration": 2.567662477493286 + }, + { + "auxiliary_loss_clip": 0.01026395, + "auxiliary_loss_mlp": 0.01001456, + "balance_loss_clip": 1.00016236, + "balance_loss_mlp": 1.00580978, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6744353438462242, + "language_loss": 0.56309336, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58337194, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.20605469, + "step": 11217, + "time_per_iteration": 3.1818552017211914 + }, + { + "auxiliary_loss_clip": 0.01102202, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.0162183, + "balance_loss_mlp": 1.0352273, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.9712085707776, + "language_loss": 0.74490952, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76621616, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11218, + "time_per_iteration": 2.4742777347564697 + }, + { + "auxiliary_loss_clip": 0.01105482, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.03671169, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6873790300129339, + "language_loss": 0.66097057, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68245506, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 11219, + "time_per_iteration": 3.9712955951690674 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.01901901, + "balance_loss_mlp": 1.03550935, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.9354673669636624, + "language_loss": 0.74431932, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.76568097, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 11220, + "time_per_iteration": 2.4782400131225586 + }, + { + "auxiliary_loss_clip": 0.01104541, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01871443, + "balance_loss_mlp": 1.03673649, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.753572378422806, + "language_loss": 0.70208532, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72344136, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 11221, + "time_per_iteration": 3.8499643802642822 + }, + { + "auxiliary_loss_clip": 0.01104329, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.01634061, + "balance_loss_mlp": 1.03750563, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 2.083478811055199, + "language_loss": 0.58038485, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60170209, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 11222, + "time_per_iteration": 3.796449661254883 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02047682, + "balance_loss_mlp": 1.03731191, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 2.9048479494136266, + "language_loss": 0.76680332, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.7881813, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 11223, + "time_per_iteration": 3.932152271270752 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.02009189, + "balance_loss_mlp": 1.03799176, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.7350565617477662, + "language_loss": 0.75261784, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77401286, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11224, + "time_per_iteration": 2.4370362758636475 + }, + { + "auxiliary_loss_clip": 0.0109934, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.01388621, + "balance_loss_mlp": 1.03474987, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.6433655631752735, + "language_loss": 0.63031125, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.6515485, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 11225, + "time_per_iteration": 2.472139835357666 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.02051783, + "balance_loss_mlp": 1.03548217, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.7438523279987848, + "language_loss": 0.64443898, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66574085, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 11226, + "time_per_iteration": 2.3997251987457275 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.02009439, + "balance_loss_mlp": 1.03833103, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.0520128582030406, + "language_loss": 0.71177256, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73316324, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11227, + "time_per_iteration": 2.4354188442230225 + }, + { + "auxiliary_loss_clip": 0.01102719, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01566148, + "balance_loss_mlp": 1.03702497, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 1.9773279438432965, + "language_loss": 0.7113992, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73270661, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 11228, + "time_per_iteration": 2.4065871238708496 + }, + { + "auxiliary_loss_clip": 0.01026271, + "auxiliary_loss_mlp": 0.01000743, + "balance_loss_clip": 0.99950963, + "balance_loss_mlp": 1.00561559, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7600669046292114, + "language_loss": 0.53283465, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55310482, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20703125, + "step": 11229, + "time_per_iteration": 3.113936424255371 + }, + { + "auxiliary_loss_clip": 0.01102392, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01778507, + "balance_loss_mlp": 1.03599358, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.668368696112623, + "language_loss": 0.80301458, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82433373, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11230, + "time_per_iteration": 2.481586456298828 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01025242, + "balance_loss_clip": 1.01433396, + "balance_loss_mlp": 1.03644145, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.5254643295267571, + "language_loss": 0.66080362, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.68207115, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11231, + "time_per_iteration": 2.4348020553588867 + }, + { + "auxiliary_loss_clip": 0.01109126, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.02254665, + "balance_loss_mlp": 1.03815401, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 1.8895861738799862, + "language_loss": 0.66976327, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.69121504, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 11232, + "time_per_iteration": 2.565011501312256 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01529849, + "balance_loss_mlp": 1.03667426, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.7997945281360064, + "language_loss": 0.72617656, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74746865, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 11233, + "time_per_iteration": 2.451127767562866 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.01986313, + "balance_loss_mlp": 1.035604, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.728016441920487, + "language_loss": 0.76981372, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79116529, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 11234, + "time_per_iteration": 2.560873031616211 + }, + { + "auxiliary_loss_clip": 0.01103068, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.01688838, + "balance_loss_mlp": 1.03655529, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5233618386668848, + "language_loss": 0.7516101, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77292997, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11235, + "time_per_iteration": 2.473658323287964 + }, + { + "auxiliary_loss_clip": 0.01025939, + "auxiliary_loss_mlp": 0.00999916, + "balance_loss_clip": 0.99873585, + "balance_loss_mlp": 1.00544596, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7849137447209698, + "language_loss": 0.51408035, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53433889, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.20507812, + "step": 11236, + "time_per_iteration": 2.993544340133667 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01443923, + "balance_loss_mlp": 1.03596473, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.2221952993281335, + "language_loss": 0.75521564, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77652001, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6640625, + "step": 11237, + "time_per_iteration": 2.4348740577697754 + }, + { + "auxiliary_loss_clip": 0.01105842, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.02153206, + "balance_loss_mlp": 1.03944969, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 2.1736628297595466, + "language_loss": 0.77503932, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79642648, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11238, + "time_per_iteration": 2.526988983154297 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.0188483, + "balance_loss_mlp": 1.03556848, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.9142971498049255, + "language_loss": 0.66731274, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68868375, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 11239, + "time_per_iteration": 2.4696223735809326 + }, + { + "auxiliary_loss_clip": 0.0110246, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01447082, + "balance_loss_mlp": 1.03745294, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.923743651844225, + "language_loss": 0.82995439, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85124326, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 11240, + "time_per_iteration": 2.446572780609131 + }, + { + "auxiliary_loss_clip": 0.01111011, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.01959419, + "balance_loss_mlp": 1.03869963, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 3.6442496224808933, + "language_loss": 0.74812031, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76956552, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 11241, + "time_per_iteration": 2.423372268676758 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.02011645, + "balance_loss_mlp": 1.03697479, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 2.14563763168323, + "language_loss": 0.80052149, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82188863, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11242, + "time_per_iteration": 2.434990167617798 + }, + { + "auxiliary_loss_clip": 0.01101563, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.01693797, + "balance_loss_mlp": 1.034922, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.9802154508142344, + "language_loss": 0.72626722, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74756432, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11243, + "time_per_iteration": 2.453474283218384 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02346909, + "balance_loss_mlp": 1.03676426, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.7073695655292809, + "language_loss": 0.72612441, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74750745, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11244, + "time_per_iteration": 2.5105230808258057 + }, + { + "auxiliary_loss_clip": 0.01106398, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02668309, + "balance_loss_mlp": 1.03746104, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.8087707557146027, + "language_loss": 0.85335118, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87480211, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 11245, + "time_per_iteration": 2.445233106613159 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02013278, + "balance_loss_mlp": 1.03733289, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 2.3183444790940766, + "language_loss": 0.73646373, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75787258, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 11246, + "time_per_iteration": 2.4863364696502686 + }, + { + "auxiliary_loss_clip": 0.01102215, + "auxiliary_loss_mlp": 0.01025917, + "balance_loss_clip": 1.01430011, + "balance_loss_mlp": 1.0346514, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 2.2448543978250437, + "language_loss": 0.88085318, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90213448, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11247, + "time_per_iteration": 2.4308738708496094 + }, + { + "auxiliary_loss_clip": 0.01025674, + "auxiliary_loss_mlp": 0.01003402, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.0053699, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8266217559963673, + "language_loss": 0.54048848, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56077927, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.203125, + "step": 11248, + "time_per_iteration": 3.076478958129883 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.01805067, + "balance_loss_mlp": 1.03580928, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.69579760819699, + "language_loss": 0.73396099, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75523973, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 11249, + "time_per_iteration": 2.47476863861084 + }, + { + "auxiliary_loss_clip": 0.01103589, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.01811373, + "balance_loss_mlp": 1.03449488, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.3334311767034035, + "language_loss": 0.73674285, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.75808907, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 11250, + "time_per_iteration": 2.427795171737671 + }, + { + "auxiliary_loss_clip": 0.01104705, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.01378322, + "balance_loss_mlp": 1.03724456, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.7440150220700932, + "language_loss": 0.75326031, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77455616, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 11251, + "time_per_iteration": 2.453015089035034 + }, + { + "auxiliary_loss_clip": 0.01103045, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01890135, + "balance_loss_mlp": 1.03677213, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 4.794996819995717, + "language_loss": 0.7030319, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.7243697, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 11252, + "time_per_iteration": 2.507655382156372 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.01872027, + "balance_loss_mlp": 1.03790915, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.7295296864842329, + "language_loss": 0.66713816, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68848813, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 11253, + "time_per_iteration": 2.495661735534668 + }, + { + "auxiliary_loss_clip": 0.01106169, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.03755689, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.5712995070705533, + "language_loss": 0.77059627, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79200101, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11254, + "time_per_iteration": 2.5303773880004883 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.01923668, + "balance_loss_mlp": 1.0353651, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.008694221276799, + "language_loss": 0.72041488, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74175906, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11255, + "time_per_iteration": 2.4310834407806396 + }, + { + "auxiliary_loss_clip": 0.01103491, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.02294099, + "balance_loss_mlp": 1.03527474, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.4309429012787533, + "language_loss": 0.75107753, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77246231, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 11256, + "time_per_iteration": 2.5040993690490723 + }, + { + "auxiliary_loss_clip": 0.01105082, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.01898217, + "balance_loss_mlp": 1.03802788, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.256626523492283, + "language_loss": 0.64639592, + "learning_rate": 9.991007116408965e-07, + "loss": 0.6677562, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11257, + "time_per_iteration": 2.4259557723999023 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.03422582, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.4043820681784667, + "language_loss": 0.75555968, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77681983, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 11258, + "time_per_iteration": 2.4665939807891846 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.01757717, + "balance_loss_mlp": 1.03654146, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.6503685315767886, + "language_loss": 0.66716135, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68845475, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 11259, + "time_per_iteration": 2.527073383331299 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01027196, + "balance_loss_clip": 1.0155077, + "balance_loss_mlp": 1.03676665, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.9058137848386902, + "language_loss": 0.85316312, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87447935, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11260, + "time_per_iteration": 2.457331418991089 + }, + { + "auxiliary_loss_clip": 0.01104097, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.02011061, + "balance_loss_mlp": 1.03481388, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 1.992894296567027, + "language_loss": 0.77366221, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79502773, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 11261, + "time_per_iteration": 3.8098442554473877 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.0210495, + "balance_loss_mlp": 1.03392744, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.8771294723417649, + "language_loss": 0.87785065, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89920282, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11262, + "time_per_iteration": 2.4098682403564453 + }, + { + "auxiliary_loss_clip": 0.0110654, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.01738644, + "balance_loss_mlp": 1.03773284, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.144843606745404, + "language_loss": 0.73935968, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76072079, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11263, + "time_per_iteration": 3.836534261703491 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.020401, + "balance_loss_mlp": 1.03613746, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 3.0336865802259716, + "language_loss": 0.67681348, + "learning_rate": 9.967413644401016e-07, + "loss": 0.6982075, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11264, + "time_per_iteration": 3.8063998222351074 + }, + { + "auxiliary_loss_clip": 0.01104469, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02429914, + "balance_loss_mlp": 1.03774631, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 1.9309030757319006, + "language_loss": 0.72956276, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75097328, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11265, + "time_per_iteration": 3.905475616455078 + }, + { + "auxiliary_loss_clip": 0.01101535, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.02002645, + "balance_loss_mlp": 1.03592122, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.5488311970116568, + "language_loss": 0.61298478, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63431406, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11266, + "time_per_iteration": 2.4533629417419434 + }, + { + "auxiliary_loss_clip": 0.01105454, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02384639, + "balance_loss_mlp": 1.03653467, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 2.0018325327863455, + "language_loss": 0.70975608, + "learning_rate": 9.957307860391976e-07, + "loss": 0.73116899, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11267, + "time_per_iteration": 2.4130048751831055 + }, + { + "auxiliary_loss_clip": 0.01102815, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01627374, + "balance_loss_mlp": 1.03553224, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.995940802920633, + "language_loss": 0.71196496, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73327303, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11268, + "time_per_iteration": 2.5001561641693115 + }, + { + "auxiliary_loss_clip": 0.01106446, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01621413, + "balance_loss_mlp": 1.03911674, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.4505290648176117, + "language_loss": 0.76658797, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78793591, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11269, + "time_per_iteration": 2.450594902038574 + }, + { + "auxiliary_loss_clip": 0.0110441, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.02046824, + "balance_loss_mlp": 1.03552103, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 1.9037033189032353, + "language_loss": 0.74434447, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76572257, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11270, + "time_per_iteration": 2.4480292797088623 + }, + { + "auxiliary_loss_clip": 0.01105285, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.02421904, + "balance_loss_mlp": 1.03799176, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.765961836580733, + "language_loss": 0.72747099, + "learning_rate": 9.94383881378756e-07, + "loss": 0.74889576, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 11271, + "time_per_iteration": 2.466099739074707 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.02158785, + "balance_loss_mlp": 1.0367682, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.5327741783409103, + "language_loss": 0.67725623, + "learning_rate": 9.94047250514387e-07, + "loss": 0.69863486, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11272, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.01107233, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.02115774, + "balance_loss_mlp": 1.03756714, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 2.19334323210367, + "language_loss": 0.73699766, + "learning_rate": 9.937106577958481e-07, + "loss": 0.75841612, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 11273, + "time_per_iteration": 2.40608286857605 + }, + { + "auxiliary_loss_clip": 0.01101569, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.02462888, + "balance_loss_mlp": 1.03617656, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 2.20814425061036, + "language_loss": 0.70081609, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72219741, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 11274, + "time_per_iteration": 2.476304769515991 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.01921475, + "balance_loss_mlp": 1.03662062, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.6447665363620352, + "language_loss": 0.65597254, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67733622, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 11275, + "time_per_iteration": 2.4458420276641846 + }, + { + "auxiliary_loss_clip": 0.01103666, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.02177751, + "balance_loss_mlp": 1.03688347, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 2.26567322463042, + "language_loss": 0.72724402, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74860573, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 11276, + "time_per_iteration": 2.506394624710083 + }, + { + "auxiliary_loss_clip": 0.01103474, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01724589, + "balance_loss_mlp": 1.03681684, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.7387203972635623, + "language_loss": 0.76835978, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78968847, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 11277, + "time_per_iteration": 2.4156947135925293 + }, + { + "auxiliary_loss_clip": 0.01106329, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.01580894, + "balance_loss_mlp": 1.03709924, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 3.843343867942956, + "language_loss": 0.83494425, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85628355, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 11278, + "time_per_iteration": 2.4242331981658936 + }, + { + "auxiliary_loss_clip": 0.01100898, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.01966131, + "balance_loss_mlp": 1.03655803, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.546828654628467, + "language_loss": 0.70229775, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72361231, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 11279, + "time_per_iteration": 2.4774818420410156 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.01785898, + "balance_loss_mlp": 1.03606427, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.8996542277217034, + "language_loss": 0.74191052, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76324993, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 11280, + "time_per_iteration": 2.4954020977020264 + }, + { + "auxiliary_loss_clip": 0.01106782, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.01768732, + "balance_loss_mlp": 1.03710222, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.8728658209175957, + "language_loss": 0.70118409, + "learning_rate": 9.910192908287104e-07, + "loss": 0.7225517, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 11281, + "time_per_iteration": 2.4171640872955322 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.01519203, + "balance_loss_mlp": 1.03611064, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.563642265820809, + "language_loss": 0.63874096, + "learning_rate": 9.906830419968217e-07, + "loss": 0.66000628, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11282, + "time_per_iteration": 2.5364012718200684 + }, + { + "auxiliary_loss_clip": 0.0110743, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02427554, + "balance_loss_mlp": 1.03683639, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5622929992593626, + "language_loss": 0.74648255, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76792598, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11283, + "time_per_iteration": 2.5009424686431885 + }, + { + "auxiliary_loss_clip": 0.01101134, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.01622117, + "balance_loss_mlp": 1.03523421, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.6182405596102953, + "language_loss": 0.5701533, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59144115, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11284, + "time_per_iteration": 2.5896449089050293 + }, + { + "auxiliary_loss_clip": 0.01101588, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01796126, + "balance_loss_mlp": 1.03485477, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.4677100655448485, + "language_loss": 0.75404185, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77535391, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 11285, + "time_per_iteration": 2.53873872756958 + }, + { + "auxiliary_loss_clip": 0.01102067, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01661038, + "balance_loss_mlp": 1.03747129, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.8021221276720163, + "language_loss": 0.66290027, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68420148, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 11286, + "time_per_iteration": 2.498288631439209 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03434348, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.2344222526167083, + "language_loss": 0.52489305, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54620832, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 11287, + "time_per_iteration": 2.470860719680786 + }, + { + "auxiliary_loss_clip": 0.01102428, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.01842999, + "balance_loss_mlp": 1.0358603, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 2.2748309661133086, + "language_loss": 0.77437216, + "learning_rate": 9.886663531930655e-07, + "loss": 0.7956934, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11288, + "time_per_iteration": 2.507276773452759 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.02353239, + "balance_loss_mlp": 1.03752971, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.9600358072539563, + "language_loss": 0.73192465, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75333238, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 11289, + "time_per_iteration": 2.466587781906128 + }, + { + "auxiliary_loss_clip": 0.01103364, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.02300215, + "balance_loss_mlp": 1.0357126, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.567844133932764, + "language_loss": 0.80266666, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82405412, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 11290, + "time_per_iteration": 2.5057084560394287 + }, + { + "auxiliary_loss_clip": 0.01100237, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.01677918, + "balance_loss_mlp": 1.03600717, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 2.2351562454410034, + "language_loss": 0.75014412, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77142644, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11291, + "time_per_iteration": 2.4530417919158936 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.02219784, + "balance_loss_mlp": 1.03691578, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.8299710869537638, + "language_loss": 0.75613016, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77753186, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11292, + "time_per_iteration": 2.560930013656616 + }, + { + "auxiliary_loss_clip": 0.01103978, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01666081, + "balance_loss_mlp": 1.03636706, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 1.9135755383501691, + "language_loss": 0.83619392, + "learning_rate": 9.869868336945556e-07, + "loss": 0.85752094, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11293, + "time_per_iteration": 2.442145824432373 + }, + { + "auxiliary_loss_clip": 0.01111617, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02419984, + "balance_loss_mlp": 1.03933525, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.319599838777995, + "language_loss": 0.79377204, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81526375, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 11294, + "time_per_iteration": 2.487916946411133 + }, + { + "auxiliary_loss_clip": 0.0110334, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.0165689, + "balance_loss_mlp": 1.0358336, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.670516322497649, + "language_loss": 0.79154253, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81285346, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 11295, + "time_per_iteration": 2.466892957687378 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.03505814, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.698673678539366, + "language_loss": 0.71407616, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73535442, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.640625, + "step": 11296, + "time_per_iteration": 2.482555866241455 + }, + { + "auxiliary_loss_clip": 0.01101606, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.01736212, + "balance_loss_mlp": 1.03510296, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.753920624789111, + "language_loss": 0.70683616, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72814304, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11297, + "time_per_iteration": 2.466238021850586 + }, + { + "auxiliary_loss_clip": 0.01106999, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.01821899, + "balance_loss_mlp": 1.03667176, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.1069890127028974, + "language_loss": 0.66267467, + "learning_rate": 9.853082745349918e-07, + "loss": 0.6840536, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11298, + "time_per_iteration": 2.424710273742676 + }, + { + "auxiliary_loss_clip": 0.01103908, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01871479, + "balance_loss_mlp": 1.03633463, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.7026224144439064, + "language_loss": 0.71526003, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73659307, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.67578125, + "step": 11299, + "time_per_iteration": 2.4778668880462646 + }, + { + "auxiliary_loss_clip": 0.01105656, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.02016139, + "balance_loss_mlp": 1.03812611, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.4081485921140142, + "language_loss": 0.77155232, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79293001, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11300, + "time_per_iteration": 2.492253541946411 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.01748598, + "balance_loss_mlp": 1.03599048, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.7968797031135182, + "language_loss": 0.62885916, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65018791, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11301, + "time_per_iteration": 2.397135019302368 + }, + { + "auxiliary_loss_clip": 0.01102494, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.01690459, + "balance_loss_mlp": 1.03594089, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.724284284453245, + "language_loss": 0.82755935, + "learning_rate": 9.839661197207525e-07, + "loss": 0.84886515, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11302, + "time_per_iteration": 2.472766399383545 + }, + { + "auxiliary_loss_clip": 0.01106208, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.01926029, + "balance_loss_mlp": 1.03716099, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 2.1762222349963176, + "language_loss": 0.69784915, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71922374, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11303, + "time_per_iteration": 3.805736780166626 + }, + { + "auxiliary_loss_clip": 0.0110718, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01801562, + "balance_loss_mlp": 1.03717601, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.3868097803445383, + "language_loss": 0.69926792, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72063893, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 11304, + "time_per_iteration": 2.4878110885620117 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.0188539, + "balance_loss_mlp": 1.03924417, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.7487345535411407, + "language_loss": 0.72523355, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74662066, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11305, + "time_per_iteration": 3.969510316848755 + }, + { + "auxiliary_loss_clip": 0.01103346, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.01541042, + "balance_loss_mlp": 1.03585541, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.9643394158396053, + "language_loss": 0.65558803, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67689657, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11306, + "time_per_iteration": 5.400679111480713 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.01505661, + "balance_loss_mlp": 1.03540945, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.6667606428941142, + "language_loss": 0.79942191, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82072073, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 11307, + "time_per_iteration": 2.430248260498047 + }, + { + "auxiliary_loss_clip": 0.01103369, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01866865, + "balance_loss_mlp": 1.03694439, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.5435492505708737, + "language_loss": 0.88790625, + "learning_rate": 9.819540435969066e-07, + "loss": 0.90924048, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 11308, + "time_per_iteration": 2.456007242202759 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02374792, + "balance_loss_mlp": 1.03597665, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.037595188669874, + "language_loss": 0.71198809, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73340213, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 11309, + "time_per_iteration": 2.444063901901245 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.02223217, + "balance_loss_mlp": 1.03636754, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.0611426595675915, + "language_loss": 0.84300488, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86438966, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 11310, + "time_per_iteration": 2.4817349910736084 + }, + { + "auxiliary_loss_clip": 0.01102101, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.01696813, + "balance_loss_mlp": 1.03708959, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.1934331981692963, + "language_loss": 0.82783055, + "learning_rate": 9.80948526522792e-07, + "loss": 0.84913009, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11311, + "time_per_iteration": 2.4103691577911377 + }, + { + "auxiliary_loss_clip": 0.01107302, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.01871729, + "balance_loss_mlp": 1.03547812, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 2.5662813310714268, + "language_loss": 0.76297283, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78436768, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 11312, + "time_per_iteration": 2.5150935649871826 + }, + { + "auxiliary_loss_clip": 0.01027323, + "auxiliary_loss_mlp": 0.01002804, + "balance_loss_clip": 1.00166547, + "balance_loss_mlp": 1.00670671, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6868398662733849, + "language_loss": 0.57254708, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59284842, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20605469, + "step": 11313, + "time_per_iteration": 3.1505696773529053 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01572418, + "balance_loss_mlp": 1.03516006, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.7918563854588148, + "language_loss": 0.68882596, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71013784, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 11314, + "time_per_iteration": 2.5254998207092285 + }, + { + "auxiliary_loss_clip": 0.01099909, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 1.03417087, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7481645051595534, + "language_loss": 0.81398594, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83526987, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 11315, + "time_per_iteration": 2.453127861022949 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.03766704, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6730986060802988, + "language_loss": 0.69740957, + "learning_rate": 9.792734377526718e-07, + "loss": 0.7187236, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11316, + "time_per_iteration": 2.483550548553467 + }, + { + "auxiliary_loss_clip": 0.01103992, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.0155412, + "balance_loss_mlp": 1.03765678, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.178074033436339, + "language_loss": 0.66859937, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68990576, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11317, + "time_per_iteration": 2.4059898853302 + }, + { + "auxiliary_loss_clip": 0.01106005, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.02807629, + "balance_loss_mlp": 1.0385282, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.4508017405477542, + "language_loss": 0.75009024, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77154613, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11318, + "time_per_iteration": 2.499570608139038 + }, + { + "auxiliary_loss_clip": 0.01097899, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.01697898, + "balance_loss_mlp": 1.03418541, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 2.7236911079158985, + "language_loss": 0.6802513, + "learning_rate": 9.782688488616143e-07, + "loss": 0.7015121, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 11319, + "time_per_iteration": 2.4078075885772705 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.01037234, + "balance_loss_clip": 1.02501535, + "balance_loss_mlp": 1.03571796, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.8193525574873417, + "language_loss": 0.76578677, + "learning_rate": 9.779340633692945e-07, + "loss": 0.7871753, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 11320, + "time_per_iteration": 2.4763078689575195 + }, + { + "auxiliary_loss_clip": 0.011026, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01627135, + "balance_loss_mlp": 1.0357213, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 2.0578108779297732, + "language_loss": 0.74360389, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76491284, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11321, + "time_per_iteration": 2.4495351314544678 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01034068, + "balance_loss_clip": 1.02243876, + "balance_loss_mlp": 1.03807235, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8874116924899373, + "language_loss": 0.72533345, + "learning_rate": 9.772646086678758e-07, + "loss": 0.74673104, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11322, + "time_per_iteration": 2.4374794960021973 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.01677608, + "balance_loss_mlp": 1.03495407, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.6803003695181602, + "language_loss": 0.78470093, + "learning_rate": 9.769299394841638e-07, + "loss": 0.8060168, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 11323, + "time_per_iteration": 2.4333457946777344 + }, + { + "auxiliary_loss_clip": 0.01027457, + "auxiliary_loss_mlp": 0.01001857, + "balance_loss_clip": 1.00065899, + "balance_loss_mlp": 1.00677872, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7788248321760284, + "language_loss": 0.57097274, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59126586, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 11324, + "time_per_iteration": 2.87032413482666 + }, + { + "auxiliary_loss_clip": 0.01104753, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.02192771, + "balance_loss_mlp": 1.03705823, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.797689988899455, + "language_loss": 0.68072367, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70211285, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11325, + "time_per_iteration": 2.4791805744171143 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.01901543, + "balance_loss_mlp": 1.03593659, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 9.902559035776392, + "language_loss": 0.7025001, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72388709, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 11326, + "time_per_iteration": 2.411768913269043 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.01868427, + "balance_loss_mlp": 1.03564632, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.7689960274485943, + "language_loss": 0.72761798, + "learning_rate": 9.75591650825392e-07, + "loss": 0.7489562, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11327, + "time_per_iteration": 2.4436709880828857 + }, + { + "auxiliary_loss_clip": 0.01101261, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.01918912, + "balance_loss_mlp": 1.03561234, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 2.3861554573552533, + "language_loss": 0.77319372, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79451698, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11328, + "time_per_iteration": 2.427549123764038 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01029367, + "balance_loss_clip": 1.01756525, + "balance_loss_mlp": 1.03677118, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 3.828786564380187, + "language_loss": 0.64639735, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66773969, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 11329, + "time_per_iteration": 2.4063379764556885 + }, + { + "auxiliary_loss_clip": 0.01106328, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.01713562, + "balance_loss_mlp": 1.03745294, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 1.9960449149160304, + "language_loss": 0.79504317, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81639957, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 11330, + "time_per_iteration": 2.4729740619659424 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01765263, + "balance_loss_mlp": 1.03767729, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 3.982985267736798, + "language_loss": 0.63851273, + "learning_rate": 9.742539836972665e-07, + "loss": 0.6598652, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 11331, + "time_per_iteration": 2.4589385986328125 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02241278, + "balance_loss_mlp": 1.03761506, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.9198633310725437, + "language_loss": 0.7197634, + "learning_rate": 9.739196641245148e-07, + "loss": 0.7411564, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 11332, + "time_per_iteration": 2.48699951171875 + }, + { + "auxiliary_loss_clip": 0.01105323, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.0197432, + "balance_loss_mlp": 1.03659022, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 1.8849624776188914, + "language_loss": 0.75043106, + "learning_rate": 9.735853834608326e-07, + "loss": 0.77180523, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 11333, + "time_per_iteration": 2.4035282135009766 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.01664138, + "balance_loss_mlp": 1.03870749, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.3964934580500172, + "language_loss": 0.71910471, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74048996, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 11334, + "time_per_iteration": 2.514709234237671 + }, + { + "auxiliary_loss_clip": 0.01102183, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01966393, + "balance_loss_mlp": 1.03584528, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.6647407719870675, + "language_loss": 0.85981625, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88114882, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11335, + "time_per_iteration": 2.566171169281006 + }, + { + "auxiliary_loss_clip": 0.0109703, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01659656, + "balance_loss_mlp": 1.03387475, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 2.956835270100481, + "language_loss": 0.81945407, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84069812, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 11336, + "time_per_iteration": 2.50917911529541 + }, + { + "auxiliary_loss_clip": 0.01100635, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.01822007, + "balance_loss_mlp": 1.03596747, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.8358807203128344, + "language_loss": 0.81945646, + "learning_rate": 9.72248650150294e-07, + "loss": 0.84075427, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 11337, + "time_per_iteration": 2.448796510696411 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.0160563, + "balance_loss_mlp": 1.03479064, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.722806796595651, + "language_loss": 0.72469616, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74596059, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11338, + "time_per_iteration": 2.517240047454834 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.03771722, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4508555916130568, + "language_loss": 0.77669561, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79806578, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11339, + "time_per_iteration": 2.436663866043091 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.02076244, + "balance_loss_mlp": 1.03804171, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.0295293442554483, + "language_loss": 0.70622659, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72760439, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11340, + "time_per_iteration": 2.5092625617980957 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.02090895, + "balance_loss_mlp": 1.03815854, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.203520540229157, + "language_loss": 0.82961929, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85103399, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 11341, + "time_per_iteration": 2.470651626586914 + }, + { + "auxiliary_loss_clip": 0.01106072, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.02358067, + "balance_loss_mlp": 1.03685653, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.764627541247337, + "language_loss": 0.68348753, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70491731, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 11342, + "time_per_iteration": 2.5127713680267334 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.01706433, + "balance_loss_mlp": 1.03569162, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.7075903323008321, + "language_loss": 0.74946058, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77077055, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 11343, + "time_per_iteration": 2.5146141052246094 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.02227187, + "balance_loss_mlp": 1.0377264, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6017732799578648, + "language_loss": 0.79690164, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81828856, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11344, + "time_per_iteration": 3.9397521018981934 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.0200969, + "balance_loss_mlp": 1.03575659, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.3863241768064416, + "language_loss": 0.66377771, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68512809, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11345, + "time_per_iteration": 2.5208473205566406 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01854682, + "balance_loss_mlp": 1.03741777, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.4472974915932637, + "language_loss": 0.64573473, + "learning_rate": 9.692432813238054e-07, + "loss": 0.66710401, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 11346, + "time_per_iteration": 3.8512396812438965 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.03745544, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.5968577060390179, + "language_loss": 0.7844069, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80576706, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11347, + "time_per_iteration": 3.8028361797332764 + }, + { + "auxiliary_loss_clip": 0.01026659, + "auxiliary_loss_mlp": 0.01001661, + "balance_loss_clip": 1.00046301, + "balance_loss_mlp": 1.0059818, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7216727103538496, + "language_loss": 0.5250113, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54529452, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 11348, + "time_per_iteration": 4.506226539611816 + }, + { + "auxiliary_loss_clip": 0.01101236, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02144003, + "balance_loss_mlp": 1.03572845, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.7906697802801645, + "language_loss": 0.79596829, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81730622, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11349, + "time_per_iteration": 2.4514377117156982 + }, + { + "auxiliary_loss_clip": 0.01111621, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01438189, + "balance_loss_mlp": 1.03865266, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 9.523032245657118, + "language_loss": 0.74000543, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76140821, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 11350, + "time_per_iteration": 2.6128787994384766 + }, + { + "auxiliary_loss_clip": 0.01102473, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.01656938, + "balance_loss_mlp": 1.03694868, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.583319505093848, + "language_loss": 0.79434985, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81566036, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 11351, + "time_per_iteration": 2.4813127517700195 + }, + { + "auxiliary_loss_clip": 0.01102481, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.02167511, + "balance_loss_mlp": 1.03581142, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.5951575368956712, + "language_loss": 0.73410577, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75546265, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 11352, + "time_per_iteration": 2.471541166305542 + }, + { + "auxiliary_loss_clip": 0.01105327, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.03617918, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.5725908722190713, + "language_loss": 0.80191058, + "learning_rate": 9.669079606018814e-07, + "loss": 0.8233099, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11353, + "time_per_iteration": 2.5034008026123047 + }, + { + "auxiliary_loss_clip": 0.01103178, + "auxiliary_loss_mlp": 0.01024386, + "balance_loss_clip": 1.01242352, + "balance_loss_mlp": 1.03601313, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.984510532707265, + "language_loss": 0.78228319, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80355877, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11354, + "time_per_iteration": 2.4608607292175293 + }, + { + "auxiliary_loss_clip": 0.01102222, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.01630878, + "balance_loss_mlp": 1.03619695, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.0028339846466445, + "language_loss": 0.61692381, + "learning_rate": 9.662410784947599e-07, + "loss": 0.63822126, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 11355, + "time_per_iteration": 2.40678071975708 + }, + { + "auxiliary_loss_clip": 0.01101274, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.0152607, + "balance_loss_mlp": 1.03438973, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 1.9183183626079316, + "language_loss": 0.81905627, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84033597, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11356, + "time_per_iteration": 2.4604368209838867 + }, + { + "auxiliary_loss_clip": 0.01106625, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.01654649, + "balance_loss_mlp": 1.03872633, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 2.562660672921637, + "language_loss": 0.78667843, + "learning_rate": 9.655743531886052e-07, + "loss": 0.8080312, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11357, + "time_per_iteration": 2.4570956230163574 + }, + { + "auxiliary_loss_clip": 0.01027055, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.00311232, + "balance_loss_mlp": 1.00636482, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8170905749226814, + "language_loss": 0.59669131, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61700559, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.20703125, + "step": 11358, + "time_per_iteration": 3.1206090450286865 + }, + { + "auxiliary_loss_clip": 0.01108785, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.0243305, + "balance_loss_mlp": 1.03812075, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.7308298657289736, + "language_loss": 0.78347307, + "learning_rate": 9.64907784784544e-07, + "loss": 0.804928, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 11359, + "time_per_iteration": 2.4206995964050293 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.01964045, + "balance_loss_mlp": 1.03594446, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9738432775453243, + "language_loss": 0.81637627, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83771473, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 11360, + "time_per_iteration": 2.476433038711548 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.02083063, + "balance_loss_mlp": 1.03856695, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.86444446180785, + "language_loss": 0.75634044, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77774101, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 11361, + "time_per_iteration": 2.4659223556518555 + }, + { + "auxiliary_loss_clip": 0.010268, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00227582, + "balance_loss_mlp": 1.00611186, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8682819030103981, + "language_loss": 0.59711051, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61741436, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.20703125, + "step": 11362, + "time_per_iteration": 3.127232074737549 + }, + { + "auxiliary_loss_clip": 0.01104869, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.0179255, + "balance_loss_mlp": 1.03573108, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.8459010350172913, + "language_loss": 0.74898708, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77034211, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 11363, + "time_per_iteration": 2.4112236499786377 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.02511919, + "balance_loss_mlp": 1.03508842, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.6445368972435976, + "language_loss": 0.89400429, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91540277, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 11364, + "time_per_iteration": 2.4431772232055664 + }, + { + "auxiliary_loss_clip": 0.01101882, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.01950884, + "balance_loss_mlp": 1.03680646, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 3.2328498112003503, + "language_loss": 0.88372034, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90504611, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 11365, + "time_per_iteration": 2.4429502487182617 + }, + { + "auxiliary_loss_clip": 0.01110566, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.02396965, + "balance_loss_mlp": 1.03944576, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.0793788072414734, + "language_loss": 0.81185693, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83332664, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 11366, + "time_per_iteration": 2.472283363342285 + }, + { + "auxiliary_loss_clip": 0.01103514, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.0166235, + "balance_loss_mlp": 1.03517795, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.7001262791469558, + "language_loss": 0.76775587, + "learning_rate": 9.622430822110062e-07, + "loss": 0.789078, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 11367, + "time_per_iteration": 2.4591305255889893 + }, + { + "auxiliary_loss_clip": 0.01105081, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.0234282, + "balance_loss_mlp": 1.03755784, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.4398909959276744, + "language_loss": 0.68965262, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71106088, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11368, + "time_per_iteration": 2.477160692214966 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.01824105, + "balance_loss_mlp": 1.03536785, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.9865162675168815, + "language_loss": 0.73352474, + "learning_rate": 9.615772998335261e-07, + "loss": 0.7548461, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11369, + "time_per_iteration": 2.4527742862701416 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.03507197, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.995405258990165, + "language_loss": 0.78393018, + "learning_rate": 9.612444677041138e-07, + "loss": 0.80523407, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11370, + "time_per_iteration": 2.443544864654541 + }, + { + "auxiliary_loss_clip": 0.01026342, + "auxiliary_loss_mlp": 0.01001936, + "balance_loss_clip": 1.00067234, + "balance_loss_mlp": 1.00567722, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7476131007411569, + "language_loss": 0.59831941, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61860228, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.20703125, + "step": 11371, + "time_per_iteration": 2.9889161586761475 + }, + { + "auxiliary_loss_clip": 0.01099697, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01723075, + "balance_loss_mlp": 1.03550124, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.4552904214885107, + "language_loss": 0.63685644, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65813392, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11372, + "time_per_iteration": 2.424954891204834 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01025041, + "balance_loss_clip": 1.01319766, + "balance_loss_mlp": 1.03525615, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.4781124923613422, + "language_loss": 0.71735704, + "learning_rate": 9.602462077046375e-07, + "loss": 0.73862189, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11373, + "time_per_iteration": 2.474728584289551 + }, + { + "auxiliary_loss_clip": 0.01026667, + "auxiliary_loss_mlp": 0.01000459, + "balance_loss_clip": 0.99917108, + "balance_loss_mlp": 1.00602746, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2229800972978824, + "language_loss": 0.56697685, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58724803, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.20703125, + "step": 11374, + "time_per_iteration": 3.22890567779541 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.01422763, + "balance_loss_mlp": 1.03807116, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.6218199942773524, + "language_loss": 0.73614061, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75747472, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11375, + "time_per_iteration": 2.461625814437866 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01968646, + "balance_loss_mlp": 1.036448, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.6159856732267652, + "language_loss": 0.70548576, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72682095, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11376, + "time_per_iteration": 2.4842541217803955 + }, + { + "auxiliary_loss_clip": 0.01104932, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.03640866, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.0252780909145756, + "language_loss": 0.7449975, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76642299, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11377, + "time_per_iteration": 2.5335726737976074 + }, + { + "auxiliary_loss_clip": 0.01026236, + "auxiliary_loss_mlp": 0.01001308, + "balance_loss_clip": 1.00019324, + "balance_loss_mlp": 1.00557923, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7232069780958926, + "language_loss": 0.56829667, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58857214, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.20703125, + "step": 11378, + "time_per_iteration": 3.137204885482788 + }, + { + "auxiliary_loss_clip": 0.01103234, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.01959991, + "balance_loss_mlp": 1.03537726, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.644108790952547, + "language_loss": 0.78129804, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80265266, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11379, + "time_per_iteration": 2.496009349822998 + }, + { + "auxiliary_loss_clip": 0.01098608, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.01664054, + "balance_loss_mlp": 1.03623796, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 2.007866180272703, + "language_loss": 0.68494868, + "learning_rate": 9.57918314925988e-07, + "loss": 0.70620382, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.625, + "step": 11380, + "time_per_iteration": 2.406384229660034 + }, + { + "auxiliary_loss_clip": 0.01101488, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.01939452, + "balance_loss_mlp": 1.03453815, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.132022624853322, + "language_loss": 0.78171045, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80304098, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 11381, + "time_per_iteration": 2.4570810794830322 + }, + { + "auxiliary_loss_clip": 0.01025143, + "auxiliary_loss_mlp": 0.01003104, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00454473, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8747752326004012, + "language_loss": 0.67185926, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69214177, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20605469, + "step": 11382, + "time_per_iteration": 2.90439510345459 + }, + { + "auxiliary_loss_clip": 0.0102608, + "auxiliary_loss_mlp": 0.01001227, + "balance_loss_clip": 1.00005233, + "balance_loss_mlp": 1.00557017, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8179080284964599, + "language_loss": 0.58123773, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60151082, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.20507812, + "step": 11383, + "time_per_iteration": 3.0904266834259033 + }, + { + "auxiliary_loss_clip": 0.01100892, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.0144496, + "balance_loss_mlp": 1.03393197, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 6.398458171268355, + "language_loss": 0.7963292, + "learning_rate": 9.565889595521517e-07, + "loss": 0.81759197, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 11384, + "time_per_iteration": 2.56005859375 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02203345, + "balance_loss_mlp": 1.03471613, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.1545219517049135, + "language_loss": 0.7672773, + "learning_rate": 9.562567195928187e-07, + "loss": 0.7886613, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 11385, + "time_per_iteration": 2.442094326019287 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.02137756, + "balance_loss_mlp": 1.03792572, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.0113901870570534, + "language_loss": 0.84306657, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86452568, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 11386, + "time_per_iteration": 3.9225666522979736 + }, + { + "auxiliary_loss_clip": 0.01105442, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.02474415, + "balance_loss_mlp": 1.0376749, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.7843660394813035, + "language_loss": 0.83315331, + "learning_rate": 9.555923584232984e-07, + "loss": 0.854568, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 11387, + "time_per_iteration": 2.4256067276000977 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.01747251, + "balance_loss_mlp": 1.03419471, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.6307550098034056, + "language_loss": 0.72258627, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74388194, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11388, + "time_per_iteration": 3.991851806640625 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01023785, + "balance_loss_clip": 1.01292491, + "balance_loss_mlp": 1.03534198, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.8327013595289872, + "language_loss": 0.62769783, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64894605, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 11389, + "time_per_iteration": 4.031615495681763 + }, + { + "auxiliary_loss_clip": 0.0102484, + "auxiliary_loss_mlp": 0.00998817, + "balance_loss_clip": 0.99756575, + "balance_loss_mlp": 1.00428033, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7254408078879129, + "language_loss": 0.56007105, + "learning_rate": 9.54596113730818e-07, + "loss": 0.5803076, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.20605469, + "step": 11390, + "time_per_iteration": 4.692908048629761 + }, + { + "auxiliary_loss_clip": 0.01103708, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.02001452, + "balance_loss_mlp": 1.03709829, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 2.011305237937575, + "language_loss": 0.8772974, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89865273, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 11391, + "time_per_iteration": 2.4319207668304443 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.02271295, + "balance_loss_mlp": 1.03695166, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.6650143278886758, + "language_loss": 0.79346359, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81487215, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 11392, + "time_per_iteration": 2.501056671142578 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01641881, + "balance_loss_mlp": 1.03576994, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.2005866152358977, + "language_loss": 0.70957869, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73086905, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 11393, + "time_per_iteration": 2.404430627822876 + }, + { + "auxiliary_loss_clip": 0.0110549, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.01724112, + "balance_loss_mlp": 1.03636444, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.6151771222215205, + "language_loss": 0.64394313, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66529727, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 11394, + "time_per_iteration": 2.4956462383270264 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.02175093, + "balance_loss_mlp": 1.03593922, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 2.3582380826263303, + "language_loss": 0.80521697, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82661504, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11395, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01105245, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01755834, + "balance_loss_mlp": 1.03777242, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.764971527643648, + "language_loss": 0.73285419, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75421178, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 11396, + "time_per_iteration": 2.568514823913574 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01480818, + "balance_loss_mlp": 1.03660202, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 5.148870058421947, + "language_loss": 0.79048425, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81182146, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11397, + "time_per_iteration": 2.4331774711608887 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01026103, + "balance_loss_clip": 1.01383626, + "balance_loss_mlp": 1.03412771, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 2.4689910585067616, + "language_loss": 0.71553206, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73682612, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 11398, + "time_per_iteration": 2.5442934036254883 + }, + { + "auxiliary_loss_clip": 0.0110016, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.02194667, + "balance_loss_mlp": 1.03415036, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.6631285015848603, + "language_loss": 0.70751739, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72885031, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 11399, + "time_per_iteration": 2.4914610385894775 + }, + { + "auxiliary_loss_clip": 0.01104852, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01515758, + "balance_loss_mlp": 1.03707409, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5329347602462005, + "language_loss": 0.7047379, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72605371, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 11400, + "time_per_iteration": 2.5048537254333496 + }, + { + "auxiliary_loss_clip": 0.01113165, + "auxiliary_loss_mlp": 0.01035214, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.0384146, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.826720269595169, + "language_loss": 0.78065717, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80214089, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 11401, + "time_per_iteration": 2.441246747970581 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01028091, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.03616953, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 2.0377910237961925, + "language_loss": 0.75284612, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77414942, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 11402, + "time_per_iteration": 2.4716646671295166 + }, + { + "auxiliary_loss_clip": 0.01105094, + "auxiliary_loss_mlp": 0.01037038, + "balance_loss_clip": 1.0245446, + "balance_loss_mlp": 1.03575242, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.633024747176301, + "language_loss": 0.7278834, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74930477, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11403, + "time_per_iteration": 2.4483251571655273 + }, + { + "auxiliary_loss_clip": 0.01101831, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.01826835, + "balance_loss_mlp": 1.03608656, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.2661790169676284, + "language_loss": 0.81050408, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83181787, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 11404, + "time_per_iteration": 2.4682669639587402 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.02251863, + "balance_loss_mlp": 1.03651369, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3732837819876964, + "language_loss": 0.77531087, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79668367, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11405, + "time_per_iteration": 2.4672837257385254 + }, + { + "auxiliary_loss_clip": 0.01024197, + "auxiliary_loss_mlp": 0.01001171, + "balance_loss_clip": 1.00008011, + "balance_loss_mlp": 1.00375617, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7944486320456374, + "language_loss": 0.60998279, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63023651, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11406, + "time_per_iteration": 3.146902084350586 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 1.03420663, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.8632160242742672, + "language_loss": 0.76916838, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79052973, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 11407, + "time_per_iteration": 2.4350507259368896 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02232862, + "balance_loss_mlp": 1.03708422, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.5660412243788153, + "language_loss": 0.71399796, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73542058, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 11408, + "time_per_iteration": 2.5331506729125977 + }, + { + "auxiliary_loss_clip": 0.01104047, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.01318479, + "balance_loss_mlp": 1.03469181, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.9585659451981918, + "language_loss": 0.69841951, + "learning_rate": 9.482948631780087e-07, + "loss": 0.7197156, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 11409, + "time_per_iteration": 2.406949520111084 + }, + { + "auxiliary_loss_clip": 0.01098382, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01965857, + "balance_loss_mlp": 1.03563976, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.5737480053745323, + "language_loss": 0.78358257, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80487025, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 11410, + "time_per_iteration": 2.5127828121185303 + }, + { + "auxiliary_loss_clip": 0.0110556, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.02162552, + "balance_loss_mlp": 1.03487253, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 2.0456589939951852, + "language_loss": 0.71620971, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73761249, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 11411, + "time_per_iteration": 2.456273317337036 + }, + { + "auxiliary_loss_clip": 0.01105032, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.01804924, + "balance_loss_mlp": 1.03671002, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 1.870472752363858, + "language_loss": 0.696311, + "learning_rate": 9.473012427332654e-07, + "loss": 0.7176733, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.68359375, + "step": 11412, + "time_per_iteration": 2.4815471172332764 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.01843774, + "balance_loss_mlp": 1.03616846, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 2.8759639216310364, + "language_loss": 0.72033083, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74167705, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 11413, + "time_per_iteration": 2.3763904571533203 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.01989388, + "balance_loss_mlp": 1.03653979, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.7019599587889749, + "language_loss": 0.73731822, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75868088, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11414, + "time_per_iteration": 2.4849958419799805 + }, + { + "auxiliary_loss_clip": 0.0110805, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.01831794, + "balance_loss_mlp": 1.03832841, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.1354792795106396, + "language_loss": 0.86471385, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88610065, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 11415, + "time_per_iteration": 2.419379711151123 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.02393508, + "balance_loss_mlp": 1.03672004, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.5997351133047528, + "language_loss": 0.67188251, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69331551, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 11416, + "time_per_iteration": 2.544360876083374 + }, + { + "auxiliary_loss_clip": 0.01101411, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02020359, + "balance_loss_mlp": 1.03366458, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.3938350999013296, + "language_loss": 0.75928599, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78062129, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 11417, + "time_per_iteration": 2.420132637023926 + }, + { + "auxiliary_loss_clip": 0.01103442, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03592944, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.7730588079343717, + "language_loss": 0.77459234, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79593164, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11418, + "time_per_iteration": 2.4872171878814697 + }, + { + "auxiliary_loss_clip": 0.01103813, + "auxiliary_loss_mlp": 0.01026249, + "balance_loss_clip": 1.01471543, + "balance_loss_mlp": 1.03681958, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 2.2244412162236924, + "language_loss": 0.76546735, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78676796, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11419, + "time_per_iteration": 2.5004422664642334 + }, + { + "auxiliary_loss_clip": 0.01101876, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.02173305, + "balance_loss_mlp": 1.03602588, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.9820381057917913, + "language_loss": 0.71707082, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73841834, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11420, + "time_per_iteration": 2.480562925338745 + }, + { + "auxiliary_loss_clip": 0.01101218, + "auxiliary_loss_mlp": 0.01023861, + "balance_loss_clip": 1.01320374, + "balance_loss_mlp": 1.03455591, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.3356950077180587, + "language_loss": 0.7420696, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76332039, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 11421, + "time_per_iteration": 2.532064199447632 + }, + { + "auxiliary_loss_clip": 0.01100357, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.01827979, + "balance_loss_mlp": 1.03495026, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.7634473864986122, + "language_loss": 0.77061129, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79191291, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11422, + "time_per_iteration": 2.494222402572632 + }, + { + "auxiliary_loss_clip": 0.01105572, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.02336359, + "balance_loss_mlp": 1.03689635, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 2.896334528061073, + "language_loss": 0.77752495, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79894149, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11423, + "time_per_iteration": 2.4580142498016357 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01655424, + "balance_loss_mlp": 1.03794348, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.4732024211582577, + "language_loss": 0.72956997, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75091726, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 11424, + "time_per_iteration": 2.5055267810821533 + }, + { + "auxiliary_loss_clip": 0.01105305, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.01724076, + "balance_loss_mlp": 1.03695333, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.7308743196557235, + "language_loss": 0.65175045, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67309034, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 11425, + "time_per_iteration": 2.470486640930176 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01023971, + "balance_loss_clip": 1.01246786, + "balance_loss_mlp": 1.03693807, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.3720059089078416, + "language_loss": 0.71447921, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73574442, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11426, + "time_per_iteration": 2.5032618045806885 + }, + { + "auxiliary_loss_clip": 0.01102828, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.01463187, + "balance_loss_mlp": 1.03570724, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.2242612174106737, + "language_loss": 0.85695207, + "learning_rate": 9.423385362769136e-07, + "loss": 0.8782419, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11427, + "time_per_iteration": 2.4124362468719482 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.01630831, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.5166850198696897, + "language_loss": 0.75723726, + "learning_rate": 9.420080095646909e-07, + "loss": 0.77854395, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11428, + "time_per_iteration": 3.971212387084961 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.02289069, + "balance_loss_mlp": 1.03649604, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 2.165798768763756, + "language_loss": 0.73242265, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75384891, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 11429, + "time_per_iteration": 2.4732346534729004 + }, + { + "auxiliary_loss_clip": 0.01107151, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01595795, + "balance_loss_mlp": 1.03874505, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 2.494094152353352, + "language_loss": 0.83109355, + "learning_rate": 9.413470765102643e-07, + "loss": 0.8524434, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 11430, + "time_per_iteration": 3.9374120235443115 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.02065516, + "balance_loss_mlp": 1.03498435, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.0537474499977746, + "language_loss": 0.700809, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72215664, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11431, + "time_per_iteration": 5.39936375617981 + }, + { + "auxiliary_loss_clip": 0.0110521, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.02002013, + "balance_loss_mlp": 1.03624368, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.9154257852528767, + "language_loss": 0.79996437, + "learning_rate": 9.406863040327355e-07, + "loss": 0.82133788, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11432, + "time_per_iteration": 2.5091586112976074 + }, + { + "auxiliary_loss_clip": 0.0110135, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.01545095, + "balance_loss_mlp": 1.03639221, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.5073442194689934, + "language_loss": 0.67916226, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70044488, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 11433, + "time_per_iteration": 2.4911651611328125 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02483046, + "balance_loss_mlp": 1.03957868, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.9834703858650884, + "language_loss": 0.72955799, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75100172, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11434, + "time_per_iteration": 2.601761817932129 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01488328, + "balance_loss_mlp": 1.03820884, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.6345537065528275, + "language_loss": 0.80520904, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82652032, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11435, + "time_per_iteration": 2.4691109657287598 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01895833, + "balance_loss_mlp": 1.03661776, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.919181748670558, + "language_loss": 0.8081519, + "learning_rate": 9.393652412092538e-07, + "loss": 0.82952142, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11436, + "time_per_iteration": 2.4831182956695557 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.01780939, + "balance_loss_mlp": 1.03531957, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 2.0171807255350056, + "language_loss": 0.82209235, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84335649, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 11437, + "time_per_iteration": 2.476003646850586 + }, + { + "auxiliary_loss_clip": 0.01111133, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03871989, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 2.5574373753550894, + "language_loss": 0.77940321, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80086446, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 11438, + "time_per_iteration": 2.502321720123291 + }, + { + "auxiliary_loss_clip": 0.01098247, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.02097225, + "balance_loss_mlp": 1.03480375, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.5853093369472568, + "language_loss": 0.72395837, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74526674, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 11439, + "time_per_iteration": 2.4871983528137207 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.0148648, + "balance_loss_mlp": 1.0368948, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.9510407430553642, + "language_loss": 0.75392562, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77523124, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 11440, + "time_per_iteration": 2.444061040878296 + }, + { + "auxiliary_loss_clip": 0.01100078, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.02096558, + "balance_loss_mlp": 1.03482723, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.5583446762430218, + "language_loss": 0.71741056, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73873532, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 11441, + "time_per_iteration": 2.42561936378479 + }, + { + "auxiliary_loss_clip": 0.01107766, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.01703668, + "balance_loss_mlp": 1.03738022, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6223718684669892, + "language_loss": 0.66661596, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68800044, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 11442, + "time_per_iteration": 2.527100086212158 + }, + { + "auxiliary_loss_clip": 0.01104807, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.01938581, + "balance_loss_mlp": 1.03825164, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 1.9334719769408109, + "language_loss": 0.69233751, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71369326, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 11443, + "time_per_iteration": 2.4346165657043457 + }, + { + "auxiliary_loss_clip": 0.01109303, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.02084899, + "balance_loss_mlp": 1.04012263, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.4614285926013768, + "language_loss": 0.76507717, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78650534, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 11444, + "time_per_iteration": 2.508368968963623 + }, + { + "auxiliary_loss_clip": 0.01101207, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03577399, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 1.8080804951596867, + "language_loss": 0.76652426, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78781474, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11445, + "time_per_iteration": 2.4379546642303467 + }, + { + "auxiliary_loss_clip": 0.01025524, + "auxiliary_loss_mlp": 0.01005058, + "balance_loss_clip": 1.00386608, + "balance_loss_mlp": 1.00513721, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8196174893111461, + "language_loss": 0.58379793, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60410374, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.20410156, + "step": 11446, + "time_per_iteration": 3.09559965133667 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.01536548, + "balance_loss_mlp": 1.03552115, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.5108741045715646, + "language_loss": 0.75743663, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77877045, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 11447, + "time_per_iteration": 2.4388415813446045 + }, + { + "auxiliary_loss_clip": 0.01105525, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02256274, + "balance_loss_mlp": 1.035833, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.3058905142845, + "language_loss": 0.73110414, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75250638, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 11448, + "time_per_iteration": 2.490492820739746 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01841772, + "balance_loss_mlp": 1.03607249, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.6148138238236993, + "language_loss": 0.74589622, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76727676, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11449, + "time_per_iteration": 2.5052759647369385 + }, + { + "auxiliary_loss_clip": 0.01102717, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02249944, + "balance_loss_mlp": 1.03643203, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.8257091472853513, + "language_loss": 0.69832647, + "learning_rate": 9.34746594224679e-07, + "loss": 0.71969366, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11450, + "time_per_iteration": 2.4648208618164062 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02027011, + "balance_loss_mlp": 1.03613949, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 2.0456347390181366, + "language_loss": 0.76224291, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78366196, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 11451, + "time_per_iteration": 2.430615186691284 + }, + { + "auxiliary_loss_clip": 0.01106472, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.01746345, + "balance_loss_mlp": 1.03748226, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.5920883527953233, + "language_loss": 0.69262952, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71398771, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 11452, + "time_per_iteration": 2.5010976791381836 + }, + { + "auxiliary_loss_clip": 0.01103078, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.0362519, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.7710041973258575, + "language_loss": 0.72149074, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74287325, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.66796875, + "step": 11453, + "time_per_iteration": 2.450064182281494 + }, + { + "auxiliary_loss_clip": 0.01025423, + "auxiliary_loss_mlp": 0.00997723, + "balance_loss_clip": 0.99648923, + "balance_loss_mlp": 1.0050149, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7858760559038386, + "language_loss": 0.50753725, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52776867, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20410156, + "step": 11454, + "time_per_iteration": 2.9117000102996826 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.01732993, + "balance_loss_mlp": 1.03662014, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.7206646308115936, + "language_loss": 0.75241423, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77371156, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 11455, + "time_per_iteration": 2.485668897628784 + }, + { + "auxiliary_loss_clip": 0.01106397, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.02456009, + "balance_loss_mlp": 1.03585863, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.149117194105129, + "language_loss": 0.72609061, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74753392, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 11456, + "time_per_iteration": 2.393894672393799 + }, + { + "auxiliary_loss_clip": 0.01102522, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.01892924, + "balance_loss_mlp": 1.03732562, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.623007735916198, + "language_loss": 0.80938387, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83071315, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 11457, + "time_per_iteration": 2.495333194732666 + }, + { + "auxiliary_loss_clip": 0.01108692, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01552272, + "balance_loss_mlp": 1.03859973, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5000729460202227, + "language_loss": 0.76153016, + "learning_rate": 9.321109198922301e-07, + "loss": 0.7828989, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11458, + "time_per_iteration": 2.4778497219085693 + }, + { + "auxiliary_loss_clip": 0.01104516, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.01787341, + "balance_loss_mlp": 1.03653932, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.658523232455535, + "language_loss": 0.68647993, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70782083, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11459, + "time_per_iteration": 2.418846845626831 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.01640558, + "balance_loss_mlp": 1.03759336, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.5707154761187223, + "language_loss": 0.68636, + "learning_rate": 9.314524060039221e-07, + "loss": 0.7076816, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.671875, + "step": 11460, + "time_per_iteration": 2.5109915733337402 + }, + { + "auxiliary_loss_clip": 0.01108621, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 1.03564703, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.7818403559528928, + "language_loss": 0.76981837, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79124033, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 11461, + "time_per_iteration": 2.467684268951416 + }, + { + "auxiliary_loss_clip": 0.01103615, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.02204204, + "balance_loss_mlp": 1.03618026, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.6409609736690487, + "language_loss": 0.6973418, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71872014, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11462, + "time_per_iteration": 2.411785125732422 + }, + { + "auxiliary_loss_clip": 0.01106527, + "auxiliary_loss_mlp": 0.01025599, + "balance_loss_clip": 1.0134095, + "balance_loss_mlp": 1.03735316, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4661487687088357, + "language_loss": 0.87139171, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89271295, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11463, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.010984, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.03459322, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.8830832637485666, + "language_loss": 0.68394661, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70520842, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 11464, + "time_per_iteration": 2.4330556392669678 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.0248543, + "balance_loss_mlp": 1.0371387, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.8974270807015088, + "language_loss": 0.65594816, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67736936, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 11465, + "time_per_iteration": 2.495144844055176 + }, + { + "auxiliary_loss_clip": 0.01106695, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.02248406, + "balance_loss_mlp": 1.03674364, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.5240764354372476, + "language_loss": 0.72628653, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74769986, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 11466, + "time_per_iteration": 2.4766881465911865 + }, + { + "auxiliary_loss_clip": 0.0110566, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01822066, + "balance_loss_mlp": 1.03696775, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.714596281960388, + "language_loss": 0.71770334, + "learning_rate": 9.291488844121995e-07, + "loss": 0.73905998, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11467, + "time_per_iteration": 2.4112367630004883 + }, + { + "auxiliary_loss_clip": 0.0110697, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02008295, + "balance_loss_mlp": 1.0355289, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 2.163503550286246, + "language_loss": 0.81232512, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83372813, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 11468, + "time_per_iteration": 2.466501474380493 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02028728, + "balance_loss_mlp": 1.03816724, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.7452296141639345, + "language_loss": 0.65893084, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68033552, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11469, + "time_per_iteration": 3.9587156772613525 + }, + { + "auxiliary_loss_clip": 0.01024995, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.00072718, + "balance_loss_mlp": 1.0044626, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.79151835418889, + "language_loss": 0.55171818, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57198697, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20507812, + "step": 11470, + "time_per_iteration": 2.9345321655273438 + }, + { + "auxiliary_loss_clip": 0.01101343, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.0206759, + "balance_loss_mlp": 1.03692126, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 2.531274937243883, + "language_loss": 0.77590048, + "learning_rate": 9.278334794344715e-07, + "loss": 0.79722488, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 11471, + "time_per_iteration": 3.9249086380004883 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.02015519, + "balance_loss_mlp": 1.03743219, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.771316633109537, + "language_loss": 0.78440964, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80578208, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 11472, + "time_per_iteration": 3.877894401550293 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.01826715, + "balance_loss_mlp": 1.03419447, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5889671799486909, + "language_loss": 0.76273483, + "learning_rate": 9.271760208357024e-07, + "loss": 0.7840333, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11473, + "time_per_iteration": 3.895129680633545 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.03657973, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 3.23937327376226, + "language_loss": 0.75285846, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77425253, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 11474, + "time_per_iteration": 2.4117770195007324 + }, + { + "auxiliary_loss_clip": 0.0110508, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.02095246, + "balance_loss_mlp": 1.03775465, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.4990231187907213, + "language_loss": 0.74082041, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76220077, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11475, + "time_per_iteration": 2.5168709754943848 + }, + { + "auxiliary_loss_clip": 0.01105263, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.01619959, + "balance_loss_mlp": 1.03732133, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.0309056655134587, + "language_loss": 0.88638115, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90771919, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 11476, + "time_per_iteration": 2.4443247318267822 + }, + { + "auxiliary_loss_clip": 0.01102042, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.02020097, + "balance_loss_mlp": 1.03498316, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.3153464082970854, + "language_loss": 0.70150822, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72284913, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11477, + "time_per_iteration": 2.5622828006744385 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.02615404, + "balance_loss_mlp": 1.03800416, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.5064065757946925, + "language_loss": 0.68533587, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70681655, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 11478, + "time_per_iteration": 2.46543288230896 + }, + { + "auxiliary_loss_clip": 0.01105606, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.02038169, + "balance_loss_mlp": 1.03681922, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.887360413937171, + "language_loss": 0.7609849, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78236568, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11479, + "time_per_iteration": 2.568270683288574 + }, + { + "auxiliary_loss_clip": 0.01107631, + "auxiliary_loss_mlp": 0.01027498, + "balance_loss_clip": 1.01501036, + "balance_loss_mlp": 1.03848529, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.594697323523918, + "language_loss": 0.78643298, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80778426, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 11480, + "time_per_iteration": 2.4369962215423584 + }, + { + "auxiliary_loss_clip": 0.01104582, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01478815, + "balance_loss_mlp": 1.03820038, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.618219832411148, + "language_loss": 0.75485682, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77617109, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 11481, + "time_per_iteration": 2.5970773696899414 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01031414, + "balance_loss_clip": 1.01932073, + "balance_loss_mlp": 1.03630292, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.6722041595175992, + "language_loss": 0.6924783, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71382856, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11482, + "time_per_iteration": 2.4690396785736084 + }, + { + "auxiliary_loss_clip": 0.01104337, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.01601243, + "balance_loss_mlp": 1.0365622, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.9391931338657746, + "language_loss": 0.82797402, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84929538, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 11483, + "time_per_iteration": 2.479827880859375 + }, + { + "auxiliary_loss_clip": 0.01105727, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03651834, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.252246315768351, + "language_loss": 0.65228778, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67362666, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 11484, + "time_per_iteration": 2.4820756912231445 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.0234673, + "balance_loss_mlp": 1.03430891, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.5009595972061287, + "language_loss": 0.73750043, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75885451, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11485, + "time_per_iteration": 2.5609304904937744 + }, + { + "auxiliary_loss_clip": 0.0110609, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.02159464, + "balance_loss_mlp": 1.03691673, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.703754025392432, + "language_loss": 0.85226732, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87366807, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11486, + "time_per_iteration": 2.422380208969116 + }, + { + "auxiliary_loss_clip": 0.01105383, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.02200925, + "balance_loss_mlp": 1.03721333, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.4802712098189896, + "language_loss": 0.72739094, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74878728, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 11487, + "time_per_iteration": 2.4903013706207275 + }, + { + "auxiliary_loss_clip": 0.01025937, + "auxiliary_loss_mlp": 0.01011443, + "balance_loss_clip": 1.01028049, + "balance_loss_mlp": 1.00546408, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8945179331036194, + "language_loss": 0.66639161, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68676543, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20507812, + "step": 11488, + "time_per_iteration": 3.0653343200683594 + }, + { + "auxiliary_loss_clip": 0.01107886, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03628397, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.7595875611490563, + "language_loss": 0.7471655, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76857275, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 11489, + "time_per_iteration": 2.5286636352539062 + }, + { + "auxiliary_loss_clip": 0.01107539, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02164459, + "balance_loss_mlp": 1.03755021, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.9530912954904702, + "language_loss": 0.62219006, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64361048, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 11490, + "time_per_iteration": 2.420513868331909 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.01746964, + "balance_loss_mlp": 1.03648567, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.749287596246761, + "language_loss": 0.72922885, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75057352, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 11491, + "time_per_iteration": 2.481513738632202 + }, + { + "auxiliary_loss_clip": 0.01101839, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02441788, + "balance_loss_mlp": 1.03524041, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.8381710188845477, + "language_loss": 0.7008509, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72224045, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 11492, + "time_per_iteration": 2.506946325302124 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02075076, + "balance_loss_mlp": 1.03607428, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.9892003988580658, + "language_loss": 0.74623132, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76763535, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 11493, + "time_per_iteration": 2.485933780670166 + }, + { + "auxiliary_loss_clip": 0.011046, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.01704955, + "balance_loss_mlp": 1.03709757, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.750158272708012, + "language_loss": 0.74326122, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76459777, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 11494, + "time_per_iteration": 2.4338111877441406 + }, + { + "auxiliary_loss_clip": 0.01102928, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.01822948, + "balance_loss_mlp": 1.03593969, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.7715754861715476, + "language_loss": 0.68369365, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70502561, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11495, + "time_per_iteration": 2.5815460681915283 + }, + { + "auxiliary_loss_clip": 0.01103437, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.01754475, + "balance_loss_mlp": 1.03441787, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.7834368050418072, + "language_loss": 0.73899794, + "learning_rate": 9.196269679734119e-07, + "loss": 0.7603299, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11496, + "time_per_iteration": 2.4315319061279297 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03553581, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6258579372444952, + "language_loss": 0.79742873, + "learning_rate": 9.19299238803515e-07, + "loss": 0.81874031, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 11497, + "time_per_iteration": 2.4571430683135986 + }, + { + "auxiliary_loss_clip": 0.01107463, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.0240463, + "balance_loss_mlp": 1.03682327, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5194582434001807, + "language_loss": 0.80841976, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82985806, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 11498, + "time_per_iteration": 2.4500298500061035 + }, + { + "auxiliary_loss_clip": 0.01100372, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.0208571, + "balance_loss_mlp": 1.03421736, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.486917569830455, + "language_loss": 0.86061001, + "learning_rate": 9.186439034169915e-07, + "loss": 0.8819443, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 11499, + "time_per_iteration": 2.5612852573394775 + }, + { + "auxiliary_loss_clip": 0.01101921, + "auxiliary_loss_mlp": 0.01027697, + "balance_loss_clip": 1.01606178, + "balance_loss_mlp": 1.03633177, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.5487466201601385, + "language_loss": 0.75228941, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77358556, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11500, + "time_per_iteration": 2.443873405456543 + }, + { + "auxiliary_loss_clip": 0.01106604, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.0196985, + "balance_loss_mlp": 1.03778219, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.800321839469313, + "language_loss": 0.76985884, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79124504, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11501, + "time_per_iteration": 2.5296645164489746 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.0208025, + "balance_loss_mlp": 1.03735363, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.9287376377715924, + "language_loss": 0.73522556, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75662971, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 11502, + "time_per_iteration": 2.476379632949829 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.02137995, + "balance_loss_mlp": 1.03706694, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 1.914547972677127, + "language_loss": 0.73439324, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75582325, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 11503, + "time_per_iteration": 2.477112293243408 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.01900172, + "balance_loss_mlp": 1.03558373, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 2.2572840313297067, + "language_loss": 0.77144331, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79278374, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11504, + "time_per_iteration": 2.434324026107788 + }, + { + "auxiliary_loss_clip": 0.01103184, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.0178746, + "balance_loss_mlp": 1.03509164, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.6879501017402825, + "language_loss": 0.73243099, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75375593, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 11505, + "time_per_iteration": 2.4869065284729004 + }, + { + "auxiliary_loss_clip": 0.01102379, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.02147281, + "balance_loss_mlp": 1.03443623, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.792057287093782, + "language_loss": 0.87782943, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89919269, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11506, + "time_per_iteration": 2.4522695541381836 + }, + { + "auxiliary_loss_clip": 0.01102604, + "auxiliary_loss_mlp": 0.01026179, + "balance_loss_clip": 1.01465774, + "balance_loss_mlp": 1.03585625, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.803306813867866, + "language_loss": 0.69775116, + "learning_rate": 9.160242030697856e-07, + "loss": 0.71903902, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11507, + "time_per_iteration": 2.5447754859924316 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02132535, + "balance_loss_mlp": 1.03596449, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 2.005563924492128, + "language_loss": 0.76869601, + "learning_rate": 9.156969253661538e-07, + "loss": 0.7900908, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 11508, + "time_per_iteration": 2.4350826740264893 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01029985, + "balance_loss_clip": 1.01885688, + "balance_loss_mlp": 1.03575826, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 3.1306614754136217, + "language_loss": 0.75215411, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77345216, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 11509, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01104564, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.02108383, + "balance_loss_mlp": 1.03770804, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.4724116040863566, + "language_loss": 0.64134341, + "learning_rate": 9.150424933219425e-07, + "loss": 0.6627177, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11510, + "time_per_iteration": 2.45000958442688 + }, + { + "auxiliary_loss_clip": 0.01109479, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.01910424, + "balance_loss_mlp": 1.03804469, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 4.327241358216876, + "language_loss": 0.75543642, + "learning_rate": 9.147153390061788e-07, + "loss": 0.7768575, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 11511, + "time_per_iteration": 3.932948350906372 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02240431, + "balance_loss_mlp": 1.03698862, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 2.3102277566791614, + "language_loss": 0.62639916, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64777517, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 11512, + "time_per_iteration": 2.50154185295105 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01848567, + "balance_loss_mlp": 1.03483152, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.6663402692023492, + "language_loss": 0.8328855, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85422838, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 11513, + "time_per_iteration": 3.906061887741089 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01814294, + "balance_loss_mlp": 1.03563786, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4134932862806329, + "language_loss": 0.77965999, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80097437, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11514, + "time_per_iteration": 3.914891481399536 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.02256036, + "balance_loss_mlp": 1.03478587, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.8450575688706539, + "language_loss": 0.74720532, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76859605, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 11515, + "time_per_iteration": 3.975337505340576 + }, + { + "auxiliary_loss_clip": 0.01101876, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.02331054, + "balance_loss_mlp": 1.03631759, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.249358111886257, + "language_loss": 0.53926551, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56063116, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11516, + "time_per_iteration": 2.4912428855895996 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.01989245, + "balance_loss_mlp": 1.03666639, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.6422617041097631, + "language_loss": 0.72871542, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75004637, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 11517, + "time_per_iteration": 2.478013277053833 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.02440846, + "balance_loss_mlp": 1.03657305, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.7574015499880917, + "language_loss": 0.76101017, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78243387, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 11518, + "time_per_iteration": 2.4453186988830566 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02049708, + "balance_loss_mlp": 1.03722334, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.3039874531903892, + "language_loss": 0.64442092, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66583401, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 11519, + "time_per_iteration": 2.6372623443603516 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.02304852, + "balance_loss_mlp": 1.03542209, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.9115708642987976, + "language_loss": 0.6239593, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64534283, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11520, + "time_per_iteration": 2.4893410205841064 + }, + { + "auxiliary_loss_clip": 0.01111126, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.02313781, + "balance_loss_mlp": 1.03751791, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.8081030789169619, + "language_loss": 0.77767199, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79915196, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 11521, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01107789, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.01773214, + "balance_loss_mlp": 1.03593922, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.8501694912434254, + "language_loss": 0.81979275, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84117287, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 11522, + "time_per_iteration": 2.423020124435425 + }, + { + "auxiliary_loss_clip": 0.01105276, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.01828539, + "balance_loss_mlp": 1.03786206, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.3469897873257555, + "language_loss": 0.76728314, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78863752, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11523, + "time_per_iteration": 2.549304246902466 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.02063847, + "balance_loss_mlp": 1.03536248, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 2.1482280608330355, + "language_loss": 0.68315476, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70449388, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11524, + "time_per_iteration": 2.443089723587036 + }, + { + "auxiliary_loss_clip": 0.0110548, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.02197385, + "balance_loss_mlp": 1.03614259, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.8981913764440181, + "language_loss": 0.64524782, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66664684, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 11525, + "time_per_iteration": 2.504351854324341 + }, + { + "auxiliary_loss_clip": 0.01106067, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.02695775, + "balance_loss_mlp": 1.0376687, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 1.9735788084293737, + "language_loss": 0.70338595, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72483742, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11526, + "time_per_iteration": 2.4542391300201416 + }, + { + "auxiliary_loss_clip": 0.01100987, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01756251, + "balance_loss_mlp": 1.03445363, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.4787934463099037, + "language_loss": 0.76685685, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78815675, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11527, + "time_per_iteration": 2.510793685913086 + }, + { + "auxiliary_loss_clip": 0.0110112, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01780045, + "balance_loss_mlp": 1.03496742, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.562329187830164, + "language_loss": 0.79614961, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81745368, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 11528, + "time_per_iteration": 2.465226173400879 + }, + { + "auxiliary_loss_clip": 0.01099854, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.01909447, + "balance_loss_mlp": 1.03573501, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.4331100909898178, + "language_loss": 0.76051259, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78181458, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11529, + "time_per_iteration": 2.5549967288970947 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.02312326, + "balance_loss_mlp": 1.0351932, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.7201137592726918, + "language_loss": 0.72201979, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74337578, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 11530, + "time_per_iteration": 2.531743049621582 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0381999, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.645987038290128, + "language_loss": 0.7850855, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80656147, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 11531, + "time_per_iteration": 2.500711679458618 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.01911616, + "balance_loss_mlp": 1.0353266, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.5275750432937483, + "language_loss": 0.69725084, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71856636, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 11532, + "time_per_iteration": 2.527376174926758 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.01601446, + "balance_loss_mlp": 1.0371834, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.8600077248097753, + "language_loss": 0.6705901, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69193786, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11533, + "time_per_iteration": 2.518920421600342 + }, + { + "auxiliary_loss_clip": 0.01105686, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.01975226, + "balance_loss_mlp": 1.03683567, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 3.0139531095823893, + "language_loss": 0.58712631, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60850418, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11534, + "time_per_iteration": 2.4710326194763184 + }, + { + "auxiliary_loss_clip": 0.01101215, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.01689124, + "balance_loss_mlp": 1.03428173, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.05674594042133, + "language_loss": 0.71339464, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73469722, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 11535, + "time_per_iteration": 2.4800782203674316 + }, + { + "auxiliary_loss_clip": 0.01024678, + "auxiliary_loss_mlp": 0.01001067, + "balance_loss_clip": 0.99988097, + "balance_loss_mlp": 1.00423908, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7160519901112068, + "language_loss": 0.59069979, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61095721, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.20507812, + "step": 11536, + "time_per_iteration": 3.175150156021118 + }, + { + "auxiliary_loss_clip": 0.01107914, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.01796234, + "balance_loss_mlp": 1.03721535, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.8374101085934587, + "language_loss": 0.72543836, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74682426, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11537, + "time_per_iteration": 2.4590697288513184 + }, + { + "auxiliary_loss_clip": 0.01024524, + "auxiliary_loss_mlp": 0.00999962, + "balance_loss_clip": 0.99879992, + "balance_loss_mlp": 1.0041244, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7454400182413451, + "language_loss": 0.55605686, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57630169, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20410156, + "step": 11538, + "time_per_iteration": 3.05582332611084 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.02029145, + "balance_loss_mlp": 1.03483129, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.5310037982769402, + "language_loss": 0.77299392, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79428679, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 11539, + "time_per_iteration": 2.478339433670044 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.01524878, + "balance_loss_mlp": 1.03527951, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.7857614206632793, + "language_loss": 0.64559513, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66688484, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11540, + "time_per_iteration": 2.5308845043182373 + }, + { + "auxiliary_loss_clip": 0.01102212, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.01658714, + "balance_loss_mlp": 1.03576088, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.5373758397394544, + "language_loss": 0.8667385, + "learning_rate": 9.049199018987437e-07, + "loss": 0.88804066, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11541, + "time_per_iteration": 2.5364692211151123 + }, + { + "auxiliary_loss_clip": 0.01103258, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.02015162, + "balance_loss_mlp": 1.03593302, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.7924323447912938, + "language_loss": 0.84049714, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86184859, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11542, + "time_per_iteration": 2.4829962253570557 + }, + { + "auxiliary_loss_clip": 0.01104055, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.01595759, + "balance_loss_mlp": 1.0352869, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.8414334065280868, + "language_loss": 0.75269711, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77403086, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 11543, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01102342, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.01960182, + "balance_loss_mlp": 1.03582442, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.6661945850864863, + "language_loss": 0.76122248, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78255928, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11544, + "time_per_iteration": 2.461024761199951 + }, + { + "auxiliary_loss_clip": 0.01105964, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.03684866, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.7008976535157667, + "language_loss": 0.71218264, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73358029, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 11545, + "time_per_iteration": 2.4178249835968018 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01646137, + "balance_loss_mlp": 1.03581667, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.933857108829042, + "language_loss": 0.79382741, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81509542, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11546, + "time_per_iteration": 2.467369794845581 + }, + { + "auxiliary_loss_clip": 0.01105153, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01755857, + "balance_loss_mlp": 1.03803396, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 2.1784420231587562, + "language_loss": 0.78471816, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80606019, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11547, + "time_per_iteration": 2.5005674362182617 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.02177894, + "balance_loss_mlp": 1.03765762, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.1600607182563323, + "language_loss": 0.81004536, + "learning_rate": 9.026396651834834e-07, + "loss": 0.83142352, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11548, + "time_per_iteration": 2.467039108276367 + }, + { + "auxiliary_loss_clip": 0.01024313, + "auxiliary_loss_mlp": 0.01001552, + "balance_loss_clip": 1.00046158, + "balance_loss_mlp": 1.003824, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6998312619688671, + "language_loss": 0.53725159, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55751026, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11549, + "time_per_iteration": 3.049893617630005 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01926637, + "balance_loss_mlp": 1.03490329, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.4134834791230244, + "language_loss": 0.7344752, + "learning_rate": 9.01988543302e-07, + "loss": 0.75581068, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11550, + "time_per_iteration": 2.5287935733795166 + }, + { + "auxiliary_loss_clip": 0.01105894, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.02367878, + "balance_loss_mlp": 1.03701949, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.8969044968976483, + "language_loss": 0.73992145, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76133573, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 11551, + "time_per_iteration": 2.4404563903808594 + }, + { + "auxiliary_loss_clip": 0.01104938, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.02617919, + "balance_loss_mlp": 1.03671432, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.6277950876042102, + "language_loss": 0.84549385, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86692244, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11552, + "time_per_iteration": 2.498476028442383 + }, + { + "auxiliary_loss_clip": 0.01102767, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.02255046, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.5310970869840324, + "language_loss": 0.67400169, + "learning_rate": 9.010121727859117e-07, + "loss": 0.6953721, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11553, + "time_per_iteration": 3.92946720123291 + }, + { + "auxiliary_loss_clip": 0.01107649, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.01543725, + "balance_loss_mlp": 1.03727949, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.5363855656738201, + "language_loss": 0.79580885, + "learning_rate": 9.006867992782195e-07, + "loss": 0.8171674, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 11554, + "time_per_iteration": 2.469681978225708 + }, + { + "auxiliary_loss_clip": 0.01103857, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.0172801, + "balance_loss_mlp": 1.03479338, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.7519879066783155, + "language_loss": 0.72581065, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74713933, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 11555, + "time_per_iteration": 3.862004280090332 + }, + { + "auxiliary_loss_clip": 0.01100586, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.01587296, + "balance_loss_mlp": 1.0338273, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.9852142507231525, + "language_loss": 0.78025049, + "learning_rate": 9.000361773333705e-07, + "loss": 0.8015281, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 11556, + "time_per_iteration": 5.454412937164307 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.0232873, + "balance_loss_mlp": 1.03403139, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.79871624128239, + "language_loss": 0.60282063, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62418664, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11557, + "time_per_iteration": 2.5056674480438232 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02463782, + "balance_loss_mlp": 1.03539312, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.6476789256185396, + "language_loss": 0.8537513, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87512511, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11558, + "time_per_iteration": 2.456141948699951 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01679111, + "balance_loss_mlp": 1.03618479, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.6025671858040744, + "language_loss": 0.70371419, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72505903, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11559, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01674509, + "balance_loss_mlp": 1.03588152, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.5297645646514304, + "language_loss": 0.78975582, + "learning_rate": 8.987354340711921e-07, + "loss": 0.8110559, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11560, + "time_per_iteration": 2.504146099090576 + }, + { + "auxiliary_loss_clip": 0.01101416, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.01942587, + "balance_loss_mlp": 1.03616834, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.666384056444463, + "language_loss": 0.76987702, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79119992, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 11561, + "time_per_iteration": 2.480802536010742 + }, + { + "auxiliary_loss_clip": 0.0109923, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.01806235, + "balance_loss_mlp": 1.03331923, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.7391531347439242, + "language_loss": 0.78634578, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80764008, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 11562, + "time_per_iteration": 2.438997268676758 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.02100253, + "balance_loss_mlp": 1.03525412, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 1.9230268961820236, + "language_loss": 0.69259918, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71396333, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11563, + "time_per_iteration": 2.4467828273773193 + }, + { + "auxiliary_loss_clip": 0.01097161, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01491845, + "balance_loss_mlp": 1.03383183, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.153945918609724, + "language_loss": 0.73383999, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75506866, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 11564, + "time_per_iteration": 2.4219517707824707 + }, + { + "auxiliary_loss_clip": 0.01117667, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.02188849, + "balance_loss_mlp": 1.04055667, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.987939257518994, + "language_loss": 0.71758306, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73912156, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 11565, + "time_per_iteration": 2.5249881744384766 + }, + { + "auxiliary_loss_clip": 0.01024476, + "auxiliary_loss_mlp": 0.01001909, + "balance_loss_clip": 1.00083661, + "balance_loss_mlp": 1.00426412, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9231095353674287, + "language_loss": 0.58470231, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60496616, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20214844, + "step": 11566, + "time_per_iteration": 2.9420695304870605 + }, + { + "auxiliary_loss_clip": 0.0110462, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.01639366, + "balance_loss_mlp": 1.03440809, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.9048250540658576, + "language_loss": 0.74568522, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76701856, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 11567, + "time_per_iteration": 2.4744651317596436 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.03473878, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.342733224210211, + "language_loss": 0.76978123, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79112065, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 11568, + "time_per_iteration": 2.5342469215393066 + }, + { + "auxiliary_loss_clip": 0.01103163, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.02255452, + "balance_loss_mlp": 1.03756905, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 4.390531062594107, + "language_loss": 0.72720057, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74857014, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11569, + "time_per_iteration": 2.44547438621521 + }, + { + "auxiliary_loss_clip": 0.01104961, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.03668261, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 2.456023744681467, + "language_loss": 0.77213609, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79346788, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 11570, + "time_per_iteration": 2.539635419845581 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.02089262, + "balance_loss_mlp": 1.03544307, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.0491810853886125, + "language_loss": 0.74309134, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76446825, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11571, + "time_per_iteration": 2.5310707092285156 + }, + { + "auxiliary_loss_clip": 0.0109878, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.01863599, + "balance_loss_mlp": 1.03522277, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 2.2800160570301395, + "language_loss": 0.74539, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76667869, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 11572, + "time_per_iteration": 2.454315423965454 + }, + { + "auxiliary_loss_clip": 0.01101105, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.01692498, + "balance_loss_mlp": 1.03309405, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.919471935586269, + "language_loss": 0.7033447, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72464669, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 11573, + "time_per_iteration": 2.6062417030334473 + }, + { + "auxiliary_loss_clip": 0.01108794, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.02028072, + "balance_loss_mlp": 1.03887129, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.9750506885077386, + "language_loss": 0.74985647, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77127224, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 11574, + "time_per_iteration": 2.4739365577697754 + }, + { + "auxiliary_loss_clip": 0.01105022, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.01920092, + "balance_loss_mlp": 1.03574729, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.6163956776113584, + "language_loss": 0.74427664, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76563859, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 11575, + "time_per_iteration": 2.4526143074035645 + }, + { + "auxiliary_loss_clip": 0.01103541, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.01851606, + "balance_loss_mlp": 1.03498685, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 2.202817220265, + "language_loss": 0.78680444, + "learning_rate": 8.935391505179966e-07, + "loss": 0.80815148, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11576, + "time_per_iteration": 2.489030122756958 + }, + { + "auxiliary_loss_clip": 0.01104629, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.0206064, + "balance_loss_mlp": 1.03426623, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.167216169901492, + "language_loss": 0.56448716, + "learning_rate": 8.932147389081985e-07, + "loss": 0.5858531, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.703125, + "step": 11577, + "time_per_iteration": 2.402588367462158 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01020474, + "balance_loss_clip": 1.01061571, + "balance_loss_mlp": 1.0344727, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3300447055766056, + "language_loss": 0.76633966, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78752244, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.6328125, + "step": 11578, + "time_per_iteration": 2.5856926441192627 + }, + { + "auxiliary_loss_clip": 0.01103837, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02003193, + "balance_loss_mlp": 1.03707981, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.9898977429274547, + "language_loss": 0.7948364, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81619179, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 11579, + "time_per_iteration": 2.4593424797058105 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01026526, + "balance_loss_clip": 1.0148437, + "balance_loss_mlp": 1.03269458, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.7711043261793566, + "language_loss": 0.72253591, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74377942, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 11580, + "time_per_iteration": 2.5214614868164062 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01617932, + "balance_loss_mlp": 1.03608978, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 1.861307366576084, + "language_loss": 0.65531254, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67666024, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11581, + "time_per_iteration": 2.519068479537964 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01883006, + "balance_loss_mlp": 1.03555655, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.390157722365771, + "language_loss": 0.76223433, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78356332, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11582, + "time_per_iteration": 2.444866418838501 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.03478706, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.876033269945707, + "language_loss": 0.69968796, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72099912, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 11583, + "time_per_iteration": 2.430619239807129 + }, + { + "auxiliary_loss_clip": 0.01105097, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.01950979, + "balance_loss_mlp": 1.03693569, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 2.37757967168826, + "language_loss": 0.82697153, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84833741, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 11584, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01105057, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02336848, + "balance_loss_mlp": 1.03613901, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.430393804317416, + "language_loss": 0.79577053, + "learning_rate": 8.906209579615107e-07, + "loss": 0.8171798, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11585, + "time_per_iteration": 2.4488959312438965 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.02046049, + "balance_loss_mlp": 1.03464603, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.5234092919525861, + "language_loss": 0.77759147, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79889989, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 11586, + "time_per_iteration": 2.4705069065093994 + }, + { + "auxiliary_loss_clip": 0.01096075, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.02000952, + "balance_loss_mlp": 1.03367376, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.7766488711687052, + "language_loss": 0.78765887, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80892575, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 11587, + "time_per_iteration": 2.4538965225219727 + }, + { + "auxiliary_loss_clip": 0.01100978, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.01916003, + "balance_loss_mlp": 1.03608429, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.0434006837874885, + "language_loss": 0.72847271, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74979115, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 11588, + "time_per_iteration": 2.452421188354492 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.0102536, + "balance_loss_clip": 1.01462507, + "balance_loss_mlp": 1.03454709, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.6358395354491653, + "language_loss": 0.75110734, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77237165, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 11589, + "time_per_iteration": 2.466801643371582 + }, + { + "auxiliary_loss_clip": 0.01102838, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.01815021, + "balance_loss_mlp": 1.03571272, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.8223612278895884, + "language_loss": 0.63479555, + "learning_rate": 8.890012116726012e-07, + "loss": 0.6561197, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11590, + "time_per_iteration": 2.547621011734009 + }, + { + "auxiliary_loss_clip": 0.0102506, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99851686, + "balance_loss_mlp": 1.00460005, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7464434837595778, + "language_loss": 0.61278826, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63303614, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20507812, + "step": 11591, + "time_per_iteration": 3.138062000274658 + }, + { + "auxiliary_loss_clip": 0.01106658, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.02039623, + "balance_loss_mlp": 1.03897679, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 2.149264324135608, + "language_loss": 0.69040775, + "learning_rate": 8.883536079753582e-07, + "loss": 0.7118023, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11592, + "time_per_iteration": 2.4973015785217285 + }, + { + "auxiliary_loss_clip": 0.01102777, + "auxiliary_loss_mlp": 0.01027086, + "balance_loss_clip": 1.01633334, + "balance_loss_mlp": 1.03731585, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.7113840138583603, + "language_loss": 0.62385631, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64515489, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 11593, + "time_per_iteration": 2.5094406604766846 + }, + { + "auxiliary_loss_clip": 0.01098813, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.01652873, + "balance_loss_mlp": 1.03533387, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.6455172692601516, + "language_loss": 0.54323792, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56449699, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11594, + "time_per_iteration": 3.896481513977051 + }, + { + "auxiliary_loss_clip": 0.01100941, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.01524472, + "balance_loss_mlp": 1.03542423, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.382773789064297, + "language_loss": 0.77469057, + "learning_rate": 8.87382518613248e-07, + "loss": 0.79595929, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 11595, + "time_per_iteration": 2.4667396545410156 + }, + { + "auxiliary_loss_clip": 0.01107354, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01850617, + "balance_loss_mlp": 1.03804874, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.2493761025640957, + "language_loss": 0.71796727, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73934615, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11596, + "time_per_iteration": 3.921229839324951 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.0197432, + "balance_loss_mlp": 1.03878427, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.6145547078757287, + "language_loss": 0.76072466, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78209841, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 11597, + "time_per_iteration": 3.8901522159576416 + }, + { + "auxiliary_loss_clip": 0.01101534, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02247071, + "balance_loss_mlp": 1.03553581, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.8362035763244782, + "language_loss": 0.74662215, + "learning_rate": 8.864118089662267e-07, + "loss": 0.76797849, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 11598, + "time_per_iteration": 3.8907439708709717 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.02021837, + "balance_loss_mlp": 1.03667629, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.7078147721602885, + "language_loss": 0.89751863, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91890037, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11599, + "time_per_iteration": 2.508460760116577 + }, + { + "auxiliary_loss_clip": 0.01107859, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.02237415, + "balance_loss_mlp": 1.03705978, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.217668834863667, + "language_loss": 0.69431078, + "learning_rate": 8.85764880317974e-07, + "loss": 0.7157408, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 11600, + "time_per_iteration": 2.4692399501800537 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.01923847, + "balance_loss_mlp": 1.03319108, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 2.0745134651859853, + "language_loss": 0.76886988, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79018807, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 11601, + "time_per_iteration": 2.5153214931488037 + }, + { + "auxiliary_loss_clip": 0.0109772, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.02020907, + "balance_loss_mlp": 1.03365159, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.7793101834620162, + "language_loss": 0.72061765, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74190778, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 11602, + "time_per_iteration": 2.4385433197021484 + }, + { + "auxiliary_loss_clip": 0.01102254, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.0355022, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.295891013382411, + "language_loss": 0.76406467, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78542626, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 11603, + "time_per_iteration": 2.451995611190796 + }, + { + "auxiliary_loss_clip": 0.01102122, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.01861763, + "balance_loss_mlp": 1.03604972, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 3.2492511864977476, + "language_loss": 0.62036002, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64168406, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 11604, + "time_per_iteration": 2.4743845462799072 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.02031875, + "balance_loss_mlp": 1.03493071, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.371593906069345, + "language_loss": 0.81601393, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83737808, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 11605, + "time_per_iteration": 2.4963574409484863 + }, + { + "auxiliary_loss_clip": 0.01102471, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01794219, + "balance_loss_mlp": 1.03550363, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.5505350039714891, + "language_loss": 0.70039761, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72171599, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11606, + "time_per_iteration": 2.464792490005493 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.0165329, + "balance_loss_mlp": 1.03639364, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 2.811539216798812, + "language_loss": 0.8241694, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84550416, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11607, + "time_per_iteration": 2.4532179832458496 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01842856, + "balance_loss_mlp": 1.03726959, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.8001657478638917, + "language_loss": 0.7874788, + "learning_rate": 8.831788567821265e-07, + "loss": 0.80886829, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 11608, + "time_per_iteration": 2.47961688041687 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.02052379, + "balance_loss_mlp": 1.03606093, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.8111202994770392, + "language_loss": 0.89970839, + "learning_rate": 8.828557942863357e-07, + "loss": 0.9210583, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11609, + "time_per_iteration": 2.408423900604248 + }, + { + "auxiliary_loss_clip": 0.01104617, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.01522803, + "balance_loss_mlp": 1.03529525, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 2.1159011349331607, + "language_loss": 0.63904428, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66036618, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 11610, + "time_per_iteration": 2.4653687477111816 + }, + { + "auxiliary_loss_clip": 0.01100567, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.01883924, + "balance_loss_mlp": 1.03393793, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.6195278662998478, + "language_loss": 0.84689248, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86819756, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11611, + "time_per_iteration": 2.5322601795196533 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.01864767, + "balance_loss_mlp": 1.03619266, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.902997346306539, + "language_loss": 0.71074033, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73209023, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 11612, + "time_per_iteration": 2.432530641555786 + }, + { + "auxiliary_loss_clip": 0.01100621, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.01988053, + "balance_loss_mlp": 1.03486013, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.5615931118386375, + "language_loss": 0.80995202, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83126897, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 11613, + "time_per_iteration": 2.429049253463745 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.01840007, + "balance_loss_mlp": 1.03550696, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 1.8186173474764362, + "language_loss": 0.75323808, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77453518, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11614, + "time_per_iteration": 2.469871997833252 + }, + { + "auxiliary_loss_clip": 0.01102382, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.01817775, + "balance_loss_mlp": 1.03613019, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.4207105527318125, + "language_loss": 0.77124798, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79257029, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11615, + "time_per_iteration": 2.4482977390289307 + }, + { + "auxiliary_loss_clip": 0.01098585, + "auxiliary_loss_mlp": 0.01025272, + "balance_loss_clip": 1.01378596, + "balance_loss_mlp": 1.03474522, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.152403248821291, + "language_loss": 0.73121244, + "learning_rate": 8.80595543643797e-07, + "loss": 0.752451, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 11616, + "time_per_iteration": 2.4637510776519775 + }, + { + "auxiliary_loss_clip": 0.01102545, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02162004, + "balance_loss_mlp": 1.03698003, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.620744160430393, + "language_loss": 0.84509301, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86644858, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11617, + "time_per_iteration": 2.4850711822509766 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.02358341, + "balance_loss_mlp": 1.03734601, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.8091395621454884, + "language_loss": 0.59596443, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61737734, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11618, + "time_per_iteration": 2.4457621574401855 + }, + { + "auxiliary_loss_clip": 0.0110188, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.02075553, + "balance_loss_mlp": 1.03564835, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.7259844025825606, + "language_loss": 0.82820493, + "learning_rate": 8.796275012710903e-07, + "loss": 0.84954393, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11619, + "time_per_iteration": 2.4546103477478027 + }, + { + "auxiliary_loss_clip": 0.01097255, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.01878548, + "balance_loss_mlp": 1.0334444, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 1.7065049310483924, + "language_loss": 0.67252052, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69378352, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 11620, + "time_per_iteration": 2.6086742877960205 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.0171926, + "balance_loss_mlp": 1.03403723, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.0144848908668607, + "language_loss": 0.72543484, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74674302, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 11621, + "time_per_iteration": 2.4109437465667725 + }, + { + "auxiliary_loss_clip": 0.01104286, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.02206206, + "balance_loss_mlp": 1.03532565, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.8967396853715839, + "language_loss": 0.68434918, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70572865, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 11622, + "time_per_iteration": 2.4823949337005615 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.01771569, + "balance_loss_mlp": 1.03294408, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.7255143974519898, + "language_loss": 0.62549627, + "learning_rate": 8.783373729494721e-07, + "loss": 0.6467514, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 11623, + "time_per_iteration": 2.4188036918640137 + }, + { + "auxiliary_loss_clip": 0.01104383, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.0135262, + "balance_loss_mlp": 1.03467298, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.7388598441341108, + "language_loss": 0.60939074, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63069075, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 11624, + "time_per_iteration": 2.5913877487182617 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02349353, + "balance_loss_mlp": 1.03341901, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6753967170861992, + "language_loss": 0.78502715, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80640858, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11625, + "time_per_iteration": 2.4710693359375 + }, + { + "auxiliary_loss_clip": 0.01098526, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.01916766, + "balance_loss_mlp": 1.03475714, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.9082516770255042, + "language_loss": 0.66193223, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68321669, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 11626, + "time_per_iteration": 2.4523563385009766 + }, + { + "auxiliary_loss_clip": 0.01102348, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.01857281, + "balance_loss_mlp": 1.03522182, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.7406688014675167, + "language_loss": 0.7007491, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72207904, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 11627, + "time_per_iteration": 2.474536895751953 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01025817, + "balance_loss_clip": 1.01572561, + "balance_loss_mlp": 1.03557801, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.7260870632652867, + "language_loss": 0.62484425, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64608836, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62890625, + "step": 11628, + "time_per_iteration": 2.466815710067749 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.01934004, + "balance_loss_mlp": 1.03518367, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.3991163930052757, + "language_loss": 0.68365383, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70498693, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11629, + "time_per_iteration": 2.5539638996124268 + }, + { + "auxiliary_loss_clip": 0.01102664, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.02066183, + "balance_loss_mlp": 1.03708851, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.6822972614586869, + "language_loss": 0.73017991, + "learning_rate": 8.760812863992337e-07, + "loss": 0.75153255, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 11630, + "time_per_iteration": 2.4794862270355225 + }, + { + "auxiliary_loss_clip": 0.01100869, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.0236398, + "balance_loss_mlp": 1.03656542, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.6007473169297173, + "language_loss": 0.7410804, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76243627, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 11631, + "time_per_iteration": 2.4957640171051025 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.01692176, + "balance_loss_mlp": 1.03722155, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.1507086916172153, + "language_loss": 0.8977077, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91906154, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11632, + "time_per_iteration": 2.44950008392334 + }, + { + "auxiliary_loss_clip": 0.01105644, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02223396, + "balance_loss_mlp": 1.03784966, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.6745752563732321, + "language_loss": 0.79724801, + "learning_rate": 8.751150312056792e-07, + "loss": 0.81863928, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 11633, + "time_per_iteration": 2.4414355754852295 + }, + { + "auxiliary_loss_clip": 0.0110496, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01837361, + "balance_loss_mlp": 1.03629565, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.8513742632089842, + "language_loss": 0.6695196, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69087964, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11634, + "time_per_iteration": 2.512799024581909 + }, + { + "auxiliary_loss_clip": 0.01025073, + "auxiliary_loss_mlp": 0.01006178, + "balance_loss_clip": 1.0051055, + "balance_loss_mlp": 1.00443375, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.7055663228963396, + "language_loss": 0.53125268, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55156517, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20703125, + "step": 11635, + "time_per_iteration": 3.1653506755828857 + }, + { + "auxiliary_loss_clip": 0.01100006, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.01810038, + "balance_loss_mlp": 1.03436577, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.634854939073058, + "language_loss": 0.82167876, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84297502, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11636, + "time_per_iteration": 3.8652594089508057 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.01429462, + "balance_loss_mlp": 1.03523159, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 2.0826416356932764, + "language_loss": 0.83018386, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85146558, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11637, + "time_per_iteration": 2.471907615661621 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.02207518, + "balance_loss_mlp": 1.03530073, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 2.0103377322341807, + "language_loss": 0.67541957, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69675255, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 11638, + "time_per_iteration": 3.8712992668151855 + }, + { + "auxiliary_loss_clip": 0.01105589, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.03554988, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 3.4273717366145293, + "language_loss": 0.78027046, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80163682, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 11639, + "time_per_iteration": 4.004430532455444 + }, + { + "auxiliary_loss_clip": 0.01104922, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02339602, + "balance_loss_mlp": 1.03788579, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.1417598387130807, + "language_loss": 0.82320189, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84460831, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 11640, + "time_per_iteration": 3.938671588897705 + }, + { + "auxiliary_loss_clip": 0.01099361, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01750684, + "balance_loss_mlp": 1.03294611, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.651631828879974, + "language_loss": 0.7513082, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77258819, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11641, + "time_per_iteration": 2.5288925170898438 + }, + { + "auxiliary_loss_clip": 0.01103165, + "auxiliary_loss_mlp": 0.01021586, + "balance_loss_clip": 1.00946224, + "balance_loss_mlp": 1.03693998, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9866198731885556, + "language_loss": 0.78112102, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80236852, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11642, + "time_per_iteration": 2.4836714267730713 + }, + { + "auxiliary_loss_clip": 0.01106745, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01997852, + "balance_loss_mlp": 1.03653657, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 3.5463939994986524, + "language_loss": 0.75054216, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77194417, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 11643, + "time_per_iteration": 2.5334367752075195 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.01483929, + "balance_loss_mlp": 1.03516173, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.4977944271718722, + "language_loss": 0.60428506, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62557411, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11644, + "time_per_iteration": 2.549466609954834 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.03423524, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.654773912405309, + "language_loss": 0.8168875, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83814859, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 11645, + "time_per_iteration": 2.5374014377593994 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01025535, + "balance_loss_clip": 1.01418638, + "balance_loss_mlp": 1.03470361, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.9559227219413697, + "language_loss": 0.6827392, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70399988, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11646, + "time_per_iteration": 2.4721925258636475 + }, + { + "auxiliary_loss_clip": 0.01101074, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.02236462, + "balance_loss_mlp": 1.03534269, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.4678938287912224, + "language_loss": 0.71031594, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73166132, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 11647, + "time_per_iteration": 2.5134873390197754 + }, + { + "auxiliary_loss_clip": 0.01105174, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.02156842, + "balance_loss_mlp": 1.03716385, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.703178589128687, + "language_loss": 0.71102858, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73241514, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 11648, + "time_per_iteration": 2.5937957763671875 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.01577532, + "balance_loss_mlp": 1.03368604, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.6329252584498772, + "language_loss": 0.77452666, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79579538, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 11649, + "time_per_iteration": 2.517803907394409 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.02211761, + "balance_loss_mlp": 1.03584242, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.77714876620496, + "language_loss": 0.78475487, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80609971, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11650, + "time_per_iteration": 2.40120530128479 + }, + { + "auxiliary_loss_clip": 0.0109906, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.01452041, + "balance_loss_mlp": 1.03317046, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.032619640135715, + "language_loss": 0.78585541, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80710149, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 11651, + "time_per_iteration": 2.4667370319366455 + }, + { + "auxiliary_loss_clip": 0.01104452, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.02046514, + "balance_loss_mlp": 1.03640985, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.5029723936879913, + "language_loss": 0.69227219, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71363091, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6796875, + "step": 11652, + "time_per_iteration": 2.4276230335235596 + }, + { + "auxiliary_loss_clip": 0.01102036, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.01670551, + "balance_loss_mlp": 1.03644037, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3694191346433118, + "language_loss": 0.74200094, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76330066, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11653, + "time_per_iteration": 2.515753984451294 + }, + { + "auxiliary_loss_clip": 0.01102535, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.01641536, + "balance_loss_mlp": 1.03555512, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.227987433936512, + "language_loss": 0.70499587, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72631419, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 11654, + "time_per_iteration": 2.4286937713623047 + }, + { + "auxiliary_loss_clip": 0.01104582, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01751626, + "balance_loss_mlp": 1.03610516, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.4203729950028063, + "language_loss": 0.73474562, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75609636, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 11655, + "time_per_iteration": 2.4598588943481445 + }, + { + "auxiliary_loss_clip": 0.01110167, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.0236547, + "balance_loss_mlp": 1.03842175, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.832010728467088, + "language_loss": 0.69950438, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72097301, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 11656, + "time_per_iteration": 2.4319212436676025 + }, + { + "auxiliary_loss_clip": 0.01100001, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01765096, + "balance_loss_mlp": 1.03448498, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.5232296652544484, + "language_loss": 0.77772856, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79901063, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65625, + "step": 11657, + "time_per_iteration": 2.5232887268066406 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.01827931, + "balance_loss_mlp": 1.03665113, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 2.426278289678233, + "language_loss": 0.77859247, + "learning_rate": 8.670778654208797e-07, + "loss": 0.79998142, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 11658, + "time_per_iteration": 2.5308613777160645 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.01329541, + "balance_loss_mlp": 1.03391457, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 2.3274246978175803, + "language_loss": 0.82637346, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84760237, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11659, + "time_per_iteration": 2.4660232067108154 + }, + { + "auxiliary_loss_clip": 0.01100216, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.01545429, + "balance_loss_mlp": 1.0353384, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.9444226757743717, + "language_loss": 0.69085199, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71211863, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 11660, + "time_per_iteration": 2.425694227218628 + }, + { + "auxiliary_loss_clip": 0.01102737, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.03413391, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 1.7015787806945502, + "language_loss": 0.80871427, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83006787, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11661, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01967359, + "balance_loss_mlp": 1.033602, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 3.059809361896724, + "language_loss": 0.78862965, + "learning_rate": 8.657944056600579e-07, + "loss": 0.80994064, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 11662, + "time_per_iteration": 2.5052289962768555 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.01188052, + "balance_loss_mlp": 1.03489375, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 1.922970255639485, + "language_loss": 0.8358953, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85715592, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11663, + "time_per_iteration": 2.4594573974609375 + }, + { + "auxiliary_loss_clip": 0.0102523, + "auxiliary_loss_mlp": 0.01007606, + "balance_loss_clip": 1.00651574, + "balance_loss_mlp": 1.00453377, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8204387591217913, + "language_loss": 0.53774929, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55807763, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20703125, + "step": 11664, + "time_per_iteration": 3.0331904888153076 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01900291, + "balance_loss_mlp": 1.03650737, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.4272507893526143, + "language_loss": 0.78843081, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80978715, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 11665, + "time_per_iteration": 2.4934439659118652 + }, + { + "auxiliary_loss_clip": 0.01099902, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01650083, + "balance_loss_mlp": 1.03361416, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.269849765653923, + "language_loss": 0.77034938, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79162872, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11666, + "time_per_iteration": 2.4027786254882812 + }, + { + "auxiliary_loss_clip": 0.01101042, + "auxiliary_loss_mlp": 0.01026786, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.0356462, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.878568521742783, + "language_loss": 0.81238604, + "learning_rate": 8.641910487569695e-07, + "loss": 0.8336643, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 11667, + "time_per_iteration": 2.4780242443084717 + }, + { + "auxiliary_loss_clip": 0.0110046, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.0237546, + "balance_loss_mlp": 1.03487873, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.0547760249868685, + "language_loss": 0.65335631, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67471707, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11668, + "time_per_iteration": 2.447939395904541 + }, + { + "auxiliary_loss_clip": 0.01103124, + "auxiliary_loss_mlp": 0.01022731, + "balance_loss_clip": 1.01117384, + "balance_loss_mlp": 1.03469038, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.272329624033439, + "language_loss": 0.76275986, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78401846, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 11669, + "time_per_iteration": 2.4600765705108643 + }, + { + "auxiliary_loss_clip": 0.01025535, + "auxiliary_loss_mlp": 0.01006318, + "balance_loss_clip": 1.00525713, + "balance_loss_mlp": 1.00477183, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6922095034682588, + "language_loss": 0.54468822, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56500673, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20703125, + "step": 11670, + "time_per_iteration": 3.1504855155944824 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.0203141, + "balance_loss_mlp": 1.03460622, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.9909569240580678, + "language_loss": 0.81605625, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83737886, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11671, + "time_per_iteration": 2.475792169570923 + }, + { + "auxiliary_loss_clip": 0.011038, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.01734638, + "balance_loss_mlp": 1.03691864, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.023044603900928, + "language_loss": 0.75000024, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77132618, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11672, + "time_per_iteration": 2.4228410720825195 + }, + { + "auxiliary_loss_clip": 0.0110057, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01794934, + "balance_loss_mlp": 1.0343281, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.708219397381251, + "language_loss": 0.87053084, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89183801, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 11673, + "time_per_iteration": 2.4504873752593994 + }, + { + "auxiliary_loss_clip": 0.01098005, + "auxiliary_loss_mlp": 0.01024449, + "balance_loss_clip": 1.01308239, + "balance_loss_mlp": 1.0342052, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 2.1494737009789935, + "language_loss": 0.72768337, + "learning_rate": 8.619481583723399e-07, + "loss": 0.74890792, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 11674, + "time_per_iteration": 2.397975444793701 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.01708126, + "balance_loss_mlp": 1.03694451, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.5674244409742963, + "language_loss": 0.72100163, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74228311, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 11675, + "time_per_iteration": 2.4895689487457275 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.01024344, + "balance_loss_clip": 1.01256597, + "balance_loss_mlp": 1.03593993, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.517132712975458, + "language_loss": 0.50993675, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53120697, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11676, + "time_per_iteration": 2.4392223358154297 + }, + { + "auxiliary_loss_clip": 0.01024806, + "auxiliary_loss_mlp": 0.01002084, + "balance_loss_clip": 1.00087988, + "balance_loss_mlp": 1.00406504, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7321379163768023, + "language_loss": 0.59195387, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61222279, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.20703125, + "step": 11677, + "time_per_iteration": 3.125434398651123 + }, + { + "auxiliary_loss_clip": 0.01103207, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.01498294, + "balance_loss_mlp": 1.03543353, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.2320710813331304, + "language_loss": 0.62693989, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64823759, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 11678, + "time_per_iteration": 4.006704330444336 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.02100134, + "balance_loss_mlp": 1.0344584, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.8460467241007361, + "language_loss": 0.79242504, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81375194, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11679, + "time_per_iteration": 2.4555304050445557 + }, + { + "auxiliary_loss_clip": 0.01101096, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.02535808, + "balance_loss_mlp": 1.03703773, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.331847817004221, + "language_loss": 0.70253718, + "learning_rate": 8.600273637882567e-07, + "loss": 0.7239061, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 11680, + "time_per_iteration": 3.8396050930023193 + }, + { + "auxiliary_loss_clip": 0.01105234, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01941895, + "balance_loss_mlp": 1.03682303, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.6980564631311013, + "language_loss": 0.74690676, + "learning_rate": 8.597073825843446e-07, + "loss": 0.76827282, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 11681, + "time_per_iteration": 5.31316614151001 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.01963735, + "balance_loss_mlp": 1.03458714, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.4988427000417734, + "language_loss": 0.76605582, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78736782, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 11682, + "time_per_iteration": 2.4792110919952393 + }, + { + "auxiliary_loss_clip": 0.01103891, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03589272, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 1.8311880743600102, + "language_loss": 0.73361951, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75499648, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 11683, + "time_per_iteration": 2.434879779815674 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01577854, + "balance_loss_mlp": 1.03725612, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.7668169003154093, + "language_loss": 0.71169794, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73300993, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 11684, + "time_per_iteration": 2.486572742462158 + }, + { + "auxiliary_loss_clip": 0.01101245, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.01969957, + "balance_loss_mlp": 1.03529143, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8432235400728463, + "language_loss": 0.72046304, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74179095, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11685, + "time_per_iteration": 2.5009102821350098 + }, + { + "auxiliary_loss_clip": 0.01101202, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.01923084, + "balance_loss_mlp": 1.03449953, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.7057605239318525, + "language_loss": 0.84865069, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86996263, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 11686, + "time_per_iteration": 2.447744846343994 + }, + { + "auxiliary_loss_clip": 0.01025709, + "auxiliary_loss_mlp": 0.01003132, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00512934, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.988856355007654, + "language_loss": 0.69923353, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71952194, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20605469, + "step": 11687, + "time_per_iteration": 3.1910674571990967 + }, + { + "auxiliary_loss_clip": 0.01103018, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.0161128, + "balance_loss_mlp": 1.03627849, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.17247822122661, + "language_loss": 0.77656871, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79787552, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11688, + "time_per_iteration": 2.472559928894043 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01951802, + "balance_loss_mlp": 1.0350461, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.0685575537033207, + "language_loss": 0.68521178, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70653796, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11689, + "time_per_iteration": 2.4660775661468506 + }, + { + "auxiliary_loss_clip": 0.01103667, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02360475, + "balance_loss_mlp": 1.03517842, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.110320581130951, + "language_loss": 0.79499185, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81638682, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11690, + "time_per_iteration": 2.510883331298828 + }, + { + "auxiliary_loss_clip": 0.01102324, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.01994324, + "balance_loss_mlp": 1.03600478, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.5848883111705174, + "language_loss": 0.76091731, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78225756, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 11691, + "time_per_iteration": 2.4371836185455322 + }, + { + "auxiliary_loss_clip": 0.01102138, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.01367295, + "balance_loss_mlp": 1.03720069, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.7363845404220049, + "language_loss": 0.81481391, + "learning_rate": 8.561904458502429e-07, + "loss": 0.8360818, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 11692, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01099945, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.01414728, + "balance_loss_mlp": 1.03468466, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.5395445178386533, + "language_loss": 0.76162529, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78288329, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 11693, + "time_per_iteration": 2.452014923095703 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.01904464, + "balance_loss_mlp": 1.03672361, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.51123653242133, + "language_loss": 0.68433905, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70565528, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 11694, + "time_per_iteration": 2.6905438899993896 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.01921415, + "balance_loss_mlp": 1.03518784, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.685426816457134, + "language_loss": 0.75926757, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78059149, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11695, + "time_per_iteration": 2.4197287559509277 + }, + { + "auxiliary_loss_clip": 0.01104949, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.02692449, + "balance_loss_mlp": 1.03704011, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.1895380825721595, + "language_loss": 0.73749006, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75892979, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11696, + "time_per_iteration": 2.438162088394165 + }, + { + "auxiliary_loss_clip": 0.01102914, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01507568, + "balance_loss_mlp": 1.03577912, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.6020001034841755, + "language_loss": 0.75352108, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77482289, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11697, + "time_per_iteration": 2.50844669342041 + }, + { + "auxiliary_loss_clip": 0.01107405, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02019286, + "balance_loss_mlp": 1.03933907, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.763301186005729, + "language_loss": 0.8075971, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82900751, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6796875, + "step": 11698, + "time_per_iteration": 2.4794504642486572 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01978219, + "balance_loss_mlp": 1.03481781, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.4318828234621686, + "language_loss": 0.84606147, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86740685, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11699, + "time_per_iteration": 2.466271162033081 + }, + { + "auxiliary_loss_clip": 0.01102469, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03670907, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 2.1706968176821326, + "language_loss": 0.79156339, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81287259, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 11700, + "time_per_iteration": 2.4769561290740967 + }, + { + "auxiliary_loss_clip": 0.01102749, + "auxiliary_loss_mlp": 0.01023696, + "balance_loss_clip": 1.01164412, + "balance_loss_mlp": 1.0352428, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.698709640635861, + "language_loss": 0.74290204, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76416653, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 11701, + "time_per_iteration": 2.5410683155059814 + }, + { + "auxiliary_loss_clip": 0.01105173, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.01449096, + "balance_loss_mlp": 1.03693128, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.241875664618386, + "language_loss": 0.83804989, + "learning_rate": 8.529977844159769e-07, + "loss": 0.8593747, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 11702, + "time_per_iteration": 2.4136838912963867 + }, + { + "auxiliary_loss_clip": 0.01102777, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01974845, + "balance_loss_mlp": 1.03585792, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 17.73315944125735, + "language_loss": 0.60806382, + "learning_rate": 8.526787572922738e-07, + "loss": 0.62940544, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 11703, + "time_per_iteration": 2.4728925228118896 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.01538706, + "balance_loss_mlp": 1.03344357, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.86622111466138, + "language_loss": 0.60721993, + "learning_rate": 8.523597736751067e-07, + "loss": 0.62849051, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11704, + "time_per_iteration": 2.5538487434387207 + }, + { + "auxiliary_loss_clip": 0.01097343, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.02000296, + "balance_loss_mlp": 1.03398025, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.6367819423893837, + "language_loss": 0.70355535, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72483432, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11705, + "time_per_iteration": 2.5196011066436768 + }, + { + "auxiliary_loss_clip": 0.01100052, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.01636624, + "balance_loss_mlp": 1.03497076, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.637724615159266, + "language_loss": 0.61509889, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63637763, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 11706, + "time_per_iteration": 2.4852991104125977 + }, + { + "auxiliary_loss_clip": 0.01102393, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01521683, + "balance_loss_mlp": 1.03553593, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.2484984676875563, + "language_loss": 0.68121183, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70250034, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11707, + "time_per_iteration": 2.4560024738311768 + }, + { + "auxiliary_loss_clip": 0.01097433, + "auxiliary_loss_mlp": 0.01027441, + "balance_loss_clip": 1.0162648, + "balance_loss_mlp": 1.03335011, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.7446259905587083, + "language_loss": 0.76487923, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78612804, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 11708, + "time_per_iteration": 2.515327215194702 + }, + { + "auxiliary_loss_clip": 0.01099228, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.01421952, + "balance_loss_mlp": 1.03512418, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.893368388386225, + "language_loss": 0.72055292, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74179482, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 11709, + "time_per_iteration": 2.431182384490967 + }, + { + "auxiliary_loss_clip": 0.01099189, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.01399827, + "balance_loss_mlp": 1.03375983, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.079430411231168, + "language_loss": 0.79054451, + "learning_rate": 8.504467862866267e-07, + "loss": 0.81178856, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 11710, + "time_per_iteration": 2.3997299671173096 + }, + { + "auxiliary_loss_clip": 0.01104493, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.01852989, + "balance_loss_mlp": 1.03760147, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.6049139638931622, + "language_loss": 0.77447236, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79582191, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 11711, + "time_per_iteration": 2.4934744834899902 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.01442647, + "balance_loss_mlp": 1.03375506, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.37459605810246, + "language_loss": 0.73933756, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76056558, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.64453125, + "step": 11712, + "time_per_iteration": 2.4414384365081787 + }, + { + "auxiliary_loss_clip": 0.01025006, + "auxiliary_loss_mlp": 0.01006413, + "balance_loss_clip": 1.00531662, + "balance_loss_mlp": 1.00448298, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8819337057085826, + "language_loss": 0.64707136, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66738558, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20507812, + "step": 11713, + "time_per_iteration": 3.1559205055236816 + }, + { + "auxiliary_loss_clip": 0.0109808, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.01370883, + "balance_loss_mlp": 1.03258777, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.337022160062714, + "language_loss": 0.72537225, + "learning_rate": 8.49172333023225e-07, + "loss": 0.74659657, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11714, + "time_per_iteration": 2.5274534225463867 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.01889992, + "balance_loss_mlp": 1.03500628, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.5791768588768047, + "language_loss": 0.79251838, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81382746, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11715, + "time_per_iteration": 2.423422336578369 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.02010405, + "balance_loss_mlp": 1.03536105, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.2156697071751204, + "language_loss": 0.71082246, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73216307, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11716, + "time_per_iteration": 2.407350540161133 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.0173167, + "balance_loss_mlp": 1.03666377, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 1.9148933155218295, + "language_loss": 0.66782308, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68916631, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 11717, + "time_per_iteration": 2.525740146636963 + }, + { + "auxiliary_loss_clip": 0.01102186, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.01697445, + "balance_loss_mlp": 1.03591442, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4782257349417278, + "language_loss": 0.7415244, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76283002, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11718, + "time_per_iteration": 2.5084335803985596 + }, + { + "auxiliary_loss_clip": 0.01100672, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.02055478, + "balance_loss_mlp": 1.03563166, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.542276447013311, + "language_loss": 0.79529881, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81661767, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11719, + "time_per_iteration": 3.982532024383545 + }, + { + "auxiliary_loss_clip": 0.011017, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02105546, + "balance_loss_mlp": 1.03649902, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.7315117799773545, + "language_loss": 0.65495813, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67630363, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11720, + "time_per_iteration": 2.613939046859741 + }, + { + "auxiliary_loss_clip": 0.01103943, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.01631165, + "balance_loss_mlp": 1.03645182, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.050842345880835, + "language_loss": 0.79890549, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82022321, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11721, + "time_per_iteration": 3.8472952842712402 + }, + { + "auxiliary_loss_clip": 0.01025354, + "auxiliary_loss_mlp": 0.0100049, + "balance_loss_clip": 0.99931604, + "balance_loss_mlp": 1.00471067, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7603746797437617, + "language_loss": 0.64777911, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66803753, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.20703125, + "step": 11722, + "time_per_iteration": 4.5988264083862305 + }, + { + "auxiliary_loss_clip": 0.01101223, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.02053356, + "balance_loss_mlp": 1.03678107, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.538856016334547, + "language_loss": 0.65742815, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67875266, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.64453125, + "step": 11723, + "time_per_iteration": 3.925845146179199 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01027901, + "balance_loss_clip": 1.01623046, + "balance_loss_mlp": 1.03655779, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.8916483795909507, + "language_loss": 0.81127882, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83259565, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11724, + "time_per_iteration": 2.4720969200134277 + }, + { + "auxiliary_loss_clip": 0.01100772, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.03349257, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 2.093101088286717, + "language_loss": 0.72902447, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75037366, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11725, + "time_per_iteration": 2.4452946186065674 + }, + { + "auxiliary_loss_clip": 0.01103396, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.01771188, + "balance_loss_mlp": 1.03560305, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.2997258543703847, + "language_loss": 0.78231096, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80363971, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11726, + "time_per_iteration": 2.4585561752319336 + }, + { + "auxiliary_loss_clip": 0.0110172, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.01683593, + "balance_loss_mlp": 1.0345757, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.8306322081887336, + "language_loss": 0.70494819, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72624636, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11727, + "time_per_iteration": 2.4121358394622803 + }, + { + "auxiliary_loss_clip": 0.01093352, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.01340246, + "balance_loss_mlp": 1.03211212, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.6678218850336868, + "language_loss": 0.69096273, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7121315, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.61328125, + "step": 11728, + "time_per_iteration": 2.5468525886535645 + }, + { + "auxiliary_loss_clip": 0.0110237, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.01771235, + "balance_loss_mlp": 1.03713453, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 3.1019246116397774, + "language_loss": 0.73087037, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75218427, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11729, + "time_per_iteration": 2.4827933311462402 + }, + { + "auxiliary_loss_clip": 0.01106229, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.02045989, + "balance_loss_mlp": 1.03594112, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.545567199994104, + "language_loss": 0.77897024, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80036438, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11730, + "time_per_iteration": 2.493704080581665 + }, + { + "auxiliary_loss_clip": 0.0110104, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01871312, + "balance_loss_mlp": 1.03360641, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.123896450725626, + "language_loss": 0.62706244, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64837468, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11731, + "time_per_iteration": 2.459735631942749 + }, + { + "auxiliary_loss_clip": 0.01103723, + "auxiliary_loss_mlp": 0.01025352, + "balance_loss_clip": 1.01358604, + "balance_loss_mlp": 1.03539586, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.3898643418724888, + "language_loss": 0.74733448, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76862514, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 11732, + "time_per_iteration": 2.4383316040039062 + }, + { + "auxiliary_loss_clip": 0.01100804, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.0146327, + "balance_loss_mlp": 1.03631091, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.6140204941030658, + "language_loss": 0.70913476, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73040134, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 11733, + "time_per_iteration": 2.463106632232666 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.01916051, + "balance_loss_mlp": 1.0332557, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.8693202683913837, + "language_loss": 0.73223364, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75351965, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 11734, + "time_per_iteration": 2.44874906539917 + }, + { + "auxiliary_loss_clip": 0.01105433, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02211308, + "balance_loss_mlp": 1.03561354, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.1460182030345423, + "language_loss": 0.69040471, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71180052, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 11735, + "time_per_iteration": 2.3848354816436768 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.02173638, + "balance_loss_mlp": 1.03526986, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.0775841009488105, + "language_loss": 0.72464728, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74602348, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 11736, + "time_per_iteration": 2.4738998413085938 + }, + { + "auxiliary_loss_clip": 0.01101906, + "auxiliary_loss_mlp": 0.01022502, + "balance_loss_clip": 1.01198161, + "balance_loss_mlp": 1.03716493, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.8965770447194195, + "language_loss": 0.69242585, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71366996, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11737, + "time_per_iteration": 2.486891031265259 + }, + { + "auxiliary_loss_clip": 0.01104553, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.02426147, + "balance_loss_mlp": 1.03642035, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 3.675344969023003, + "language_loss": 0.6783061, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69971591, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11738, + "time_per_iteration": 2.553422212600708 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.01684439, + "balance_loss_mlp": 1.03525221, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.822626738464323, + "language_loss": 0.75158858, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77288795, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11739, + "time_per_iteration": 2.7234206199645996 + }, + { + "auxiliary_loss_clip": 0.01096979, + "auxiliary_loss_mlp": 0.01024687, + "balance_loss_clip": 1.01327837, + "balance_loss_mlp": 1.03416896, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.6386606118434162, + "language_loss": 0.71622884, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73744547, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 11740, + "time_per_iteration": 2.5212409496307373 + }, + { + "auxiliary_loss_clip": 0.01101026, + "auxiliary_loss_mlp": 0.01025615, + "balance_loss_clip": 1.01523209, + "balance_loss_mlp": 1.03545165, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6725006196923968, + "language_loss": 0.81998235, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84124875, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 11741, + "time_per_iteration": 2.492769241333008 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.01662874, + "balance_loss_mlp": 1.03408957, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 3.596961466154263, + "language_loss": 0.78171599, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80300617, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11742, + "time_per_iteration": 2.4635274410247803 + }, + { + "auxiliary_loss_clip": 0.01105195, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.01668274, + "balance_loss_mlp": 1.03600883, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.573979132261771, + "language_loss": 0.64315516, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66449654, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 11743, + "time_per_iteration": 2.5026400089263916 + }, + { + "auxiliary_loss_clip": 0.01102792, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01475716, + "balance_loss_mlp": 1.03465271, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 2.3718798915613846, + "language_loss": 0.65446359, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67576003, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 11744, + "time_per_iteration": 2.458536386489868 + }, + { + "auxiliary_loss_clip": 0.01099117, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.0185678, + "balance_loss_mlp": 1.03479218, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.9180320114034342, + "language_loss": 0.6355719, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65686405, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 11745, + "time_per_iteration": 2.462301254272461 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.01734483, + "balance_loss_mlp": 1.03486073, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.843467279794647, + "language_loss": 0.71770209, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73900437, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 11746, + "time_per_iteration": 2.528543472290039 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.01882029, + "balance_loss_mlp": 1.03566575, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.4097258428408725, + "language_loss": 0.79373205, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81506121, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11747, + "time_per_iteration": 2.479778528213501 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.01698387, + "balance_loss_mlp": 1.03513098, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.7869226712906443, + "language_loss": 0.65377176, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67504573, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 11748, + "time_per_iteration": 2.4946584701538086 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.02267504, + "balance_loss_mlp": 1.03426147, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.990623957456742, + "language_loss": 0.79503167, + "learning_rate": 8.380507360077003e-07, + "loss": 0.8163904, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11749, + "time_per_iteration": 2.4612464904785156 + }, + { + "auxiliary_loss_clip": 0.01024671, + "auxiliary_loss_mlp": 0.010023, + "balance_loss_clip": 1.0011971, + "balance_loss_mlp": 1.00396466, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.788003911856545, + "language_loss": 0.54088426, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56115395, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20703125, + "step": 11750, + "time_per_iteration": 2.998089075088501 + }, + { + "auxiliary_loss_clip": 0.01103221, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01806545, + "balance_loss_mlp": 1.03667092, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 2.4248797762244725, + "language_loss": 0.7843067, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80563688, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11751, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01096512, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01685357, + "balance_loss_mlp": 1.03329563, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 1.7553518859924266, + "language_loss": 0.67958248, + "learning_rate": 8.370999604364634e-07, + "loss": 0.70082432, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 11752, + "time_per_iteration": 2.4724245071411133 + }, + { + "auxiliary_loss_clip": 0.01100964, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.02317405, + "balance_loss_mlp": 1.03582311, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.8550758527521567, + "language_loss": 0.76533222, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78668916, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 11753, + "time_per_iteration": 2.5033509731292725 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01026978, + "balance_loss_clip": 1.01595759, + "balance_loss_mlp": 1.03566098, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.8063663453491996, + "language_loss": 0.710163, + "learning_rate": 8.364663305220405e-07, + "loss": 0.73142445, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 11754, + "time_per_iteration": 2.47737979888916 + }, + { + "auxiliary_loss_clip": 0.01101217, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.02061772, + "balance_loss_mlp": 1.03515744, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.555791916243094, + "language_loss": 0.89167392, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91301078, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 11755, + "time_per_iteration": 2.4300765991210938 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.0210979, + "balance_loss_mlp": 1.03451729, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.6305430191953068, + "language_loss": 0.79877228, + "learning_rate": 8.358328770928678e-07, + "loss": 0.82009578, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 11756, + "time_per_iteration": 2.455738067626953 + }, + { + "auxiliary_loss_clip": 0.01025525, + "auxiliary_loss_mlp": 0.00998571, + "balance_loss_clip": 0.99742049, + "balance_loss_mlp": 1.00500059, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8167477619249136, + "language_loss": 0.60323715, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62347817, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 11757, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.0194478, + "balance_loss_mlp": 1.03670585, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 2.9383193028665335, + "language_loss": 0.80605227, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82738924, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66796875, + "step": 11758, + "time_per_iteration": 2.438985824584961 + }, + { + "auxiliary_loss_clip": 0.01097896, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02111876, + "balance_loss_mlp": 1.03326845, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 2.302594291056757, + "language_loss": 0.77111626, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79242271, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 11759, + "time_per_iteration": 2.6082146167755127 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.01839471, + "balance_loss_mlp": 1.03407705, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.8203560783968598, + "language_loss": 0.67900372, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70030731, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 11760, + "time_per_iteration": 2.4875950813293457 + }, + { + "auxiliary_loss_clip": 0.01103064, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.02223873, + "balance_loss_mlp": 1.03625393, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.8036620557159548, + "language_loss": 0.80104721, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82241285, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11761, + "time_per_iteration": 3.7999839782714844 + }, + { + "auxiliary_loss_clip": 0.01101999, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.03520203, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.4050467781095697, + "language_loss": 0.74975789, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77108127, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 11762, + "time_per_iteration": 2.408281087875366 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.01743984, + "balance_loss_mlp": 1.03630018, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.750455145965042, + "language_loss": 0.76771009, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78900343, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 11763, + "time_per_iteration": 3.8708484172821045 + }, + { + "auxiliary_loss_clip": 0.01101144, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.03537869, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.2813098001672527, + "language_loss": 0.78606045, + "learning_rate": 8.333008301499453e-07, + "loss": 0.8074019, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 11764, + "time_per_iteration": 3.926267623901367 + }, + { + "auxiliary_loss_clip": 0.0110389, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.02440739, + "balance_loss_mlp": 1.03585315, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.4902481922059967, + "language_loss": 0.79271352, + "learning_rate": 8.32984523242167e-07, + "loss": 0.8141185, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 11765, + "time_per_iteration": 3.9003517627716064 + }, + { + "auxiliary_loss_clip": 0.01097952, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.01503086, + "balance_loss_mlp": 1.03383851, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.6100965300159724, + "language_loss": 0.68550825, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70674151, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 11766, + "time_per_iteration": 2.4833571910858154 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.02058399, + "balance_loss_mlp": 1.03390872, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.8537677939151296, + "language_loss": 0.63282174, + "learning_rate": 8.323520421986352e-07, + "loss": 0.65415275, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11767, + "time_per_iteration": 2.4963812828063965 + }, + { + "auxiliary_loss_clip": 0.01100427, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.0175842, + "balance_loss_mlp": 1.03403151, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.4756633405104822, + "language_loss": 0.52592945, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54722404, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11768, + "time_per_iteration": 2.5584144592285156 + }, + { + "auxiliary_loss_clip": 0.01098381, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.01779723, + "balance_loss_mlp": 1.03422117, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 2.0331888903396296, + "language_loss": 0.75885397, + "learning_rate": 8.317197382644119e-07, + "loss": 0.78012145, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 11769, + "time_per_iteration": 2.474039077758789 + }, + { + "auxiliary_loss_clip": 0.01025061, + "auxiliary_loss_mlp": 0.01004429, + "balance_loss_clip": 1.00333822, + "balance_loss_mlp": 1.00454879, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8547700200374695, + "language_loss": 0.6197865, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64008141, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11770, + "time_per_iteration": 2.9852561950683594 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.02135992, + "balance_loss_mlp": 1.03515804, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.6682974029871904, + "language_loss": 0.76099932, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78236079, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 11771, + "time_per_iteration": 2.4772582054138184 + }, + { + "auxiliary_loss_clip": 0.01096997, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.01532149, + "balance_loss_mlp": 1.03349578, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.5504616161071019, + "language_loss": 0.71518672, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73641628, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 11772, + "time_per_iteration": 2.443416118621826 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.01823425, + "balance_loss_mlp": 1.03437555, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 2.392750359759926, + "language_loss": 0.69805288, + "learning_rate": 8.30455662107496e-07, + "loss": 0.7193799, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 11773, + "time_per_iteration": 2.4619219303131104 + }, + { + "auxiliary_loss_clip": 0.01101421, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02130961, + "balance_loss_mlp": 1.03520298, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.496714779410967, + "language_loss": 0.70210946, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72344756, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 11774, + "time_per_iteration": 2.446824073791504 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.03544569, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.5148638748080412, + "language_loss": 0.74460763, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76586962, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62890625, + "step": 11775, + "time_per_iteration": 2.482792854309082 + }, + { + "auxiliary_loss_clip": 0.01102892, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.01527333, + "balance_loss_mlp": 1.03621781, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.8403672382430083, + "language_loss": 0.86566663, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88696229, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11776, + "time_per_iteration": 2.425718069076538 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.01670027, + "balance_loss_mlp": 1.03438497, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.5328522694355011, + "language_loss": 0.74733853, + "learning_rate": 8.291922955383641e-07, + "loss": 0.76860321, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 11777, + "time_per_iteration": 2.4531426429748535 + }, + { + "auxiliary_loss_clip": 0.01106707, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.01738548, + "balance_loss_mlp": 1.0374651, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.558875929872249, + "language_loss": 0.82017881, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84153724, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 11778, + "time_per_iteration": 2.4829678535461426 + }, + { + "auxiliary_loss_clip": 0.01097091, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.0185461, + "balance_loss_mlp": 1.03495932, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.514152254548671, + "language_loss": 0.84892875, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87018347, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62109375, + "step": 11779, + "time_per_iteration": 2.484011173248291 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.03680944, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.0385221770512474, + "language_loss": 0.71657723, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73797828, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11780, + "time_per_iteration": 2.5964436531066895 + }, + { + "auxiliary_loss_clip": 0.0110027, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.01806879, + "balance_loss_mlp": 1.03559303, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4808752741388003, + "language_loss": 0.72866988, + "learning_rate": 8.279296393235256e-07, + "loss": 0.74996495, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 11781, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.01100497, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.02093256, + "balance_loss_mlp": 1.03541338, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.5571808268796947, + "language_loss": 0.77223784, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79355758, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11782, + "time_per_iteration": 2.4219703674316406 + }, + { + "auxiliary_loss_clip": 0.01098336, + "auxiliary_loss_mlp": 0.01025106, + "balance_loss_clip": 1.0149796, + "balance_loss_mlp": 1.03362107, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 3.8090510781636273, + "language_loss": 0.69602305, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71725744, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 11783, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.01103454, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.01744008, + "balance_loss_mlp": 1.03593731, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.6689397610891612, + "language_loss": 0.79052562, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81184721, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 11784, + "time_per_iteration": 2.443634271621704 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.01634157, + "balance_loss_mlp": 1.03534245, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 2.217987534439464, + "language_loss": 0.77291393, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79419351, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11785, + "time_per_iteration": 2.4818367958068848 + }, + { + "auxiliary_loss_clip": 0.01100759, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.01869774, + "balance_loss_mlp": 1.03610969, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.6474825992078, + "language_loss": 0.77668089, + "learning_rate": 8.26352319157738e-07, + "loss": 0.7979871, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 11786, + "time_per_iteration": 2.4843997955322266 + }, + { + "auxiliary_loss_clip": 0.01103028, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.01389718, + "balance_loss_mlp": 1.03586793, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.2462918540494865, + "language_loss": 0.78872836, + "learning_rate": 8.260369885912526e-07, + "loss": 0.8100096, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11787, + "time_per_iteration": 2.5082507133483887 + }, + { + "auxiliary_loss_clip": 0.01100945, + "auxiliary_loss_mlp": 0.01027499, + "balance_loss_clip": 1.01635325, + "balance_loss_mlp": 1.03544235, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.6974940078994716, + "language_loss": 0.76277357, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78405803, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11788, + "time_per_iteration": 2.4395945072174072 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.01596761, + "balance_loss_mlp": 1.03661728, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.1698748278708644, + "language_loss": 0.67896038, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70029634, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 11789, + "time_per_iteration": 2.4851551055908203 + }, + { + "auxiliary_loss_clip": 0.01102295, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.01540208, + "balance_loss_mlp": 1.0347805, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.6812027162903995, + "language_loss": 0.77360779, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79490614, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11790, + "time_per_iteration": 2.42874813079834 + }, + { + "auxiliary_loss_clip": 0.01105386, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01652074, + "balance_loss_mlp": 1.03555274, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 2.1572989383917864, + "language_loss": 0.70921314, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73055279, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 11791, + "time_per_iteration": 2.5331575870513916 + }, + { + "auxiliary_loss_clip": 0.01101819, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.01735473, + "balance_loss_mlp": 1.03576159, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 2.1052262710476968, + "language_loss": 0.81886566, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84017277, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11792, + "time_per_iteration": 2.4568569660186768 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.01563632, + "balance_loss_mlp": 1.03356898, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 1.890918416074432, + "language_loss": 0.64758253, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66888559, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 11793, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.011012, + "auxiliary_loss_mlp": 0.01027317, + "balance_loss_clip": 1.01664209, + "balance_loss_mlp": 1.03602946, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.7723759797175505, + "language_loss": 0.70710409, + "learning_rate": 8.238309217655133e-07, + "loss": 0.7283892, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11794, + "time_per_iteration": 2.4677059650421143 + }, + { + "auxiliary_loss_clip": 0.01102435, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01953435, + "balance_loss_mlp": 1.03833604, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.7023757586214014, + "language_loss": 0.75844228, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77976608, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 11795, + "time_per_iteration": 2.451152801513672 + }, + { + "auxiliary_loss_clip": 0.01100363, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01907492, + "balance_loss_mlp": 1.03473902, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.342459713927466, + "language_loss": 0.74982113, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77112198, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11796, + "time_per_iteration": 2.402853012084961 + }, + { + "auxiliary_loss_clip": 0.01106679, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.0188055, + "balance_loss_mlp": 1.03671682, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.6427166102656843, + "language_loss": 0.74295354, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76433539, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 11797, + "time_per_iteration": 2.4772911071777344 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01778328, + "balance_loss_mlp": 1.03599036, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.5744211833149133, + "language_loss": 0.79336572, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81467617, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11798, + "time_per_iteration": 2.470794677734375 + }, + { + "auxiliary_loss_clip": 0.0110133, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.02051234, + "balance_loss_mlp": 1.03513288, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.8971965021381223, + "language_loss": 0.66774857, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68908381, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11799, + "time_per_iteration": 2.4620981216430664 + }, + { + "auxiliary_loss_clip": 0.01102381, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.01489425, + "balance_loss_mlp": 1.03516233, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.8304913592304672, + "language_loss": 0.81343234, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83472508, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11800, + "time_per_iteration": 2.5046193599700928 + }, + { + "auxiliary_loss_clip": 0.01097772, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01481259, + "balance_loss_mlp": 1.03336954, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.8277069049900614, + "language_loss": 0.8660984, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88733006, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.64453125, + "step": 11801, + "time_per_iteration": 2.4158272743225098 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01996124, + "balance_loss_mlp": 1.03366089, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.7026819201034897, + "language_loss": 0.76157814, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78289014, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11802, + "time_per_iteration": 2.4612386226654053 + }, + { + "auxiliary_loss_clip": 0.01102987, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.02519536, + "balance_loss_mlp": 1.03526998, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 3.23871820936019, + "language_loss": 0.81726915, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83866572, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 11803, + "time_per_iteration": 3.975581407546997 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.01518524, + "balance_loss_mlp": 1.03458929, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.0140842961231047, + "language_loss": 0.67451382, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69580579, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11804, + "time_per_iteration": 2.4457967281341553 + }, + { + "auxiliary_loss_clip": 0.01096545, + "auxiliary_loss_mlp": 0.01024221, + "balance_loss_clip": 1.01398039, + "balance_loss_mlp": 1.03440809, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.740193690303794, + "language_loss": 0.78362393, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80483156, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 11805, + "time_per_iteration": 3.905280590057373 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.01693034, + "balance_loss_mlp": 1.03327656, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 3.0979433949045125, + "language_loss": 0.78634393, + "learning_rate": 8.200541796403667e-07, + "loss": 0.8076216, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 11806, + "time_per_iteration": 5.279039144515991 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.02109098, + "balance_loss_mlp": 1.03536928, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.519109679125039, + "language_loss": 0.56458282, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58590662, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 11807, + "time_per_iteration": 2.4814159870147705 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02743721, + "balance_loss_mlp": 1.03456068, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.0844100679096407, + "language_loss": 0.68413723, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70556688, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 11808, + "time_per_iteration": 2.425276279449463 + }, + { + "auxiliary_loss_clip": 0.01102073, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.01882625, + "balance_loss_mlp": 1.03456879, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.9066636961835672, + "language_loss": 0.71175826, + "learning_rate": 8.191110000362513e-07, + "loss": 0.733078, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.67578125, + "step": 11809, + "time_per_iteration": 2.4811971187591553 + }, + { + "auxiliary_loss_clip": 0.01025844, + "auxiliary_loss_mlp": 0.00998682, + "balance_loss_clip": 0.99747771, + "balance_loss_mlp": 1.00508428, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7498079844660932, + "language_loss": 0.59492218, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61516744, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.20703125, + "step": 11810, + "time_per_iteration": 3.1407463550567627 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.02154016, + "balance_loss_mlp": 1.03702438, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.5762923305466447, + "language_loss": 0.73988348, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76122749, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 11811, + "time_per_iteration": 2.4921576976776123 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.01024065, + "balance_loss_clip": 1.0132947, + "balance_loss_mlp": 1.03462029, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.6755141879364293, + "language_loss": 0.83260751, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85382551, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 11812, + "time_per_iteration": 2.4486024379730225 + }, + { + "auxiliary_loss_clip": 0.0110213, + "auxiliary_loss_mlp": 0.01025057, + "balance_loss_clip": 1.01317763, + "balance_loss_mlp": 1.0353713, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.6424398905568702, + "language_loss": 0.69810915, + "learning_rate": 8.178540541983716e-07, + "loss": 0.71938103, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 11813, + "time_per_iteration": 2.4982481002807617 + }, + { + "auxiliary_loss_clip": 0.01096572, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01479709, + "balance_loss_mlp": 1.03272831, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.8324166675871492, + "language_loss": 0.81685358, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83807397, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 11814, + "time_per_iteration": 2.4432296752929688 + }, + { + "auxiliary_loss_clip": 0.01100828, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01498675, + "balance_loss_mlp": 1.03533602, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 2.0936967568296594, + "language_loss": 0.75861955, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77989352, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 11815, + "time_per_iteration": 2.49507474899292 + }, + { + "auxiliary_loss_clip": 0.01098556, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.0192579, + "balance_loss_mlp": 1.03366482, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.6038639171669453, + "language_loss": 0.78608739, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80737698, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11816, + "time_per_iteration": 2.415172815322876 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02450991, + "balance_loss_mlp": 1.03593814, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.8614231241085628, + "language_loss": 0.8662678, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88764292, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 11817, + "time_per_iteration": 2.4507339000701904 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01768219, + "balance_loss_mlp": 1.03457141, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 3.9427784620989437, + "language_loss": 0.84360695, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86488771, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11818, + "time_per_iteration": 2.451037883758545 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.01552582, + "balance_loss_mlp": 1.03239679, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.103555241678178, + "language_loss": 0.75971746, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78098345, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 11819, + "time_per_iteration": 2.4669997692108154 + }, + { + "auxiliary_loss_clip": 0.01101813, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.01483393, + "balance_loss_mlp": 1.03531337, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.7430720805927078, + "language_loss": 0.70564902, + "learning_rate": 8.156561252835883e-07, + "loss": 0.7269485, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 11820, + "time_per_iteration": 2.454805612564087 + }, + { + "auxiliary_loss_clip": 0.01100228, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.01536262, + "balance_loss_mlp": 1.03519297, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.9533750259905485, + "language_loss": 0.75224185, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77350932, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 11821, + "time_per_iteration": 2.4534716606140137 + }, + { + "auxiliary_loss_clip": 0.01025147, + "auxiliary_loss_mlp": 0.01000031, + "balance_loss_clip": 0.99898189, + "balance_loss_mlp": 1.00464201, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7907699295335275, + "language_loss": 0.55060166, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57085341, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20507812, + "step": 11822, + "time_per_iteration": 3.0831096172332764 + }, + { + "auxiliary_loss_clip": 0.01095485, + "auxiliary_loss_mlp": 0.01025121, + "balance_loss_clip": 1.01383758, + "balance_loss_mlp": 1.03307807, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 1.9650661666731581, + "language_loss": 0.60139519, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62260121, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.625, + "step": 11823, + "time_per_iteration": 2.5066399574279785 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01848853, + "balance_loss_mlp": 1.03523636, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.216168272824083, + "language_loss": 0.71333873, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73463774, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11824, + "time_per_iteration": 2.4382858276367188 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01859653, + "balance_loss_mlp": 1.03347003, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.655325791752312, + "language_loss": 0.7270785, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74832082, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.62109375, + "step": 11825, + "time_per_iteration": 2.46207332611084 + }, + { + "auxiliary_loss_clip": 0.01102509, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.01849043, + "balance_loss_mlp": 1.0350585, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.7037190958225141, + "language_loss": 0.79228491, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81361139, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11826, + "time_per_iteration": 2.4977200031280518 + }, + { + "auxiliary_loss_clip": 0.01097466, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.02055109, + "balance_loss_mlp": 1.03329957, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.8095370005527254, + "language_loss": 0.83191311, + "learning_rate": 8.134603969799527e-07, + "loss": 0.8531996, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11827, + "time_per_iteration": 2.5329458713531494 + }, + { + "auxiliary_loss_clip": 0.01100333, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01757264, + "balance_loss_mlp": 1.03426528, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 27.265917209893804, + "language_loss": 0.62289751, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64419734, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11828, + "time_per_iteration": 2.601370096206665 + }, + { + "auxiliary_loss_clip": 0.01099233, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01993763, + "balance_loss_mlp": 1.03395164, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.4399488675180274, + "language_loss": 0.72070241, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74200517, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 11829, + "time_per_iteration": 2.534470796585083 + }, + { + "auxiliary_loss_clip": 0.01098293, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.01942253, + "balance_loss_mlp": 1.0337075, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.7046572375419429, + "language_loss": 0.80539268, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82667816, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11830, + "time_per_iteration": 2.4941787719726562 + }, + { + "auxiliary_loss_clip": 0.01100972, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03516912, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.6897013308211777, + "language_loss": 0.84117299, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86251217, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11831, + "time_per_iteration": 2.4908971786499023 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.01750183, + "balance_loss_mlp": 1.03430891, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.068922809184691, + "language_loss": 0.76956964, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79086405, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11832, + "time_per_iteration": 2.4407291412353516 + }, + { + "auxiliary_loss_clip": 0.01024653, + "auxiliary_loss_mlp": 0.0100495, + "balance_loss_clip": 1.00390673, + "balance_loss_mlp": 1.00420582, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7451484693360029, + "language_loss": 0.56659162, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58688766, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 11833, + "time_per_iteration": 2.9816091060638428 + }, + { + "auxiliary_loss_clip": 0.01097454, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.02226698, + "balance_loss_mlp": 1.0331707, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.6073221071178434, + "language_loss": 0.70877647, + "learning_rate": 8.11266873367315e-07, + "loss": 0.7300818, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11834, + "time_per_iteration": 2.478980541229248 + }, + { + "auxiliary_loss_clip": 0.01103011, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.0181601, + "balance_loss_mlp": 1.03596425, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.9914740179798254, + "language_loss": 0.79722375, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81855053, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11835, + "time_per_iteration": 2.479388952255249 + }, + { + "auxiliary_loss_clip": 0.01096967, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.0149014, + "balance_loss_mlp": 1.03320408, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.6571407536951757, + "language_loss": 0.7602039, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78142941, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 11836, + "time_per_iteration": 2.4998624324798584 + }, + { + "auxiliary_loss_clip": 0.01098563, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01905715, + "balance_loss_mlp": 1.03251767, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.7551754985161803, + "language_loss": 0.70438159, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72567105, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11837, + "time_per_iteration": 2.4985547065734863 + }, + { + "auxiliary_loss_clip": 0.01103208, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.02022099, + "balance_loss_mlp": 1.03518689, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.8053810542915782, + "language_loss": 0.61668026, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63803786, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 11838, + "time_per_iteration": 2.4703662395477295 + }, + { + "auxiliary_loss_clip": 0.01101169, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01982808, + "balance_loss_mlp": 1.03559279, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.6637536928847556, + "language_loss": 0.67472559, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69604766, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11839, + "time_per_iteration": 2.535466194152832 + }, + { + "auxiliary_loss_clip": 0.01101981, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01871097, + "balance_loss_mlp": 1.03578651, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 1.8263370197913231, + "language_loss": 0.84035689, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86167389, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 11840, + "time_per_iteration": 2.4370462894439697 + }, + { + "auxiliary_loss_clip": 0.01104281, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.01743066, + "balance_loss_mlp": 1.03657627, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 2.0089671894900243, + "language_loss": 0.76980072, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79113054, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 11841, + "time_per_iteration": 2.4408881664276123 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.01616335, + "balance_loss_mlp": 1.03546906, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.1117001145117595, + "language_loss": 0.74941587, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77071977, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11842, + "time_per_iteration": 2.4360697269439697 + }, + { + "auxiliary_loss_clip": 0.01024411, + "auxiliary_loss_mlp": 0.01002483, + "balance_loss_clip": 1.00127351, + "balance_loss_mlp": 1.00402236, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.784591330387751, + "language_loss": 0.61587965, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63614863, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20410156, + "step": 11843, + "time_per_iteration": 3.0296053886413574 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.01671815, + "balance_loss_mlp": 1.03480315, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.734640870802516, + "language_loss": 0.80089492, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82216763, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 11844, + "time_per_iteration": 3.887108325958252 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.01913476, + "balance_loss_mlp": 1.03288889, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.1905334361731326, + "language_loss": 0.78714418, + "learning_rate": 8.078243718677873e-07, + "loss": 0.80843902, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11845, + "time_per_iteration": 2.410975456237793 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.02026939, + "balance_loss_mlp": 1.03620291, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 1.893878343442594, + "language_loss": 0.76888061, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79019481, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 11846, + "time_per_iteration": 3.931493043899536 + }, + { + "auxiliary_loss_clip": 0.01102634, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.0170877, + "balance_loss_mlp": 1.03620863, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.9372499520787854, + "language_loss": 0.58303821, + "learning_rate": 8.071990497380421e-07, + "loss": 0.6043539, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11847, + "time_per_iteration": 3.8361809253692627 + }, + { + "auxiliary_loss_clip": 0.01097288, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01974046, + "balance_loss_mlp": 1.03439856, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.4312853577961298, + "language_loss": 0.71475565, + "learning_rate": 8.068864565139395e-07, + "loss": 0.7360397, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.62890625, + "step": 11848, + "time_per_iteration": 3.985182523727417 + }, + { + "auxiliary_loss_clip": 0.01025097, + "auxiliary_loss_mlp": 0.01002394, + "balance_loss_clip": 1.00141037, + "balance_loss_mlp": 1.00462532, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8575731984951991, + "language_loss": 0.63123107, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65150595, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 11849, + "time_per_iteration": 3.0350046157836914 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.02040517, + "balance_loss_mlp": 1.03554058, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.4965357236983527, + "language_loss": 0.63742816, + "learning_rate": 8.0626140580654e-07, + "loss": 0.65877146, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 11850, + "time_per_iteration": 2.6502671241760254 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.0185765, + "balance_loss_mlp": 1.03538823, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.4672764564322482, + "language_loss": 0.69679284, + "learning_rate": 8.05948948346946e-07, + "loss": 0.71810615, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11851, + "time_per_iteration": 2.495501756668091 + }, + { + "auxiliary_loss_clip": 0.01100247, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.02083778, + "balance_loss_mlp": 1.03549206, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.4895655159302474, + "language_loss": 0.83113164, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85244817, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11852, + "time_per_iteration": 2.510340929031372 + }, + { + "auxiliary_loss_clip": 0.01103516, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.0193572, + "balance_loss_mlp": 1.03595805, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.258616053920704, + "language_loss": 0.73188543, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75323689, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11853, + "time_per_iteration": 2.4003355503082275 + }, + { + "auxiliary_loss_clip": 0.01096006, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.0173521, + "balance_loss_mlp": 1.03375578, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.9420602082674068, + "language_loss": 0.92091542, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94214988, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 11854, + "time_per_iteration": 2.4623403549194336 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.01910615, + "balance_loss_mlp": 1.0353142, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 2.0934387752470403, + "language_loss": 0.79594553, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81724572, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11855, + "time_per_iteration": 2.442281484603882 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02083373, + "balance_loss_mlp": 1.0347116, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.6650252891937876, + "language_loss": 0.72577047, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74711072, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66015625, + "step": 11856, + "time_per_iteration": 2.47229266166687 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01880825, + "balance_loss_mlp": 1.03564286, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.6411446267606922, + "language_loss": 0.70082289, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72215885, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 11857, + "time_per_iteration": 2.4524147510528564 + }, + { + "auxiliary_loss_clip": 0.01098237, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.01744556, + "balance_loss_mlp": 1.03391576, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.116428788246258, + "language_loss": 0.85496008, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87623537, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 11858, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01795506, + "balance_loss_mlp": 1.03623009, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.608889007430339, + "language_loss": 0.80293894, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82428539, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11859, + "time_per_iteration": 2.4199166297912598 + }, + { + "auxiliary_loss_clip": 0.010985, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02047169, + "balance_loss_mlp": 1.03418899, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.1635938409015476, + "language_loss": 0.68921995, + "learning_rate": 8.031388701659456e-07, + "loss": 0.710522, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 11860, + "time_per_iteration": 2.779348373413086 + }, + { + "auxiliary_loss_clip": 0.01101605, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.01661134, + "balance_loss_mlp": 1.03528762, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.9453238784757083, + "language_loss": 0.64468431, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66598678, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11861, + "time_per_iteration": 2.4438693523406982 + }, + { + "auxiliary_loss_clip": 0.01106949, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01536191, + "balance_loss_mlp": 1.03813672, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.665544522358975, + "language_loss": 0.67246974, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69381201, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 11862, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.01099005, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.02357876, + "balance_loss_mlp": 1.03554285, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 2.1581150638153117, + "language_loss": 0.66787547, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68920541, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 11863, + "time_per_iteration": 2.508451223373413 + }, + { + "auxiliary_loss_clip": 0.01107413, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.02361131, + "balance_loss_mlp": 1.03781486, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 9.155363012323315, + "language_loss": 0.65499818, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67643076, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 11864, + "time_per_iteration": 2.4946515560150146 + }, + { + "auxiliary_loss_clip": 0.0110368, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.01631117, + "balance_loss_mlp": 1.03640735, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.87343338578939, + "language_loss": 0.85730636, + "learning_rate": 8.015793035467697e-07, + "loss": 0.87862539, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11865, + "time_per_iteration": 2.42283296585083 + }, + { + "auxiliary_loss_clip": 0.01100738, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.01554251, + "balance_loss_mlp": 1.03419256, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 1.8472790526640706, + "language_loss": 0.74752319, + "learning_rate": 8.012675265083304e-07, + "loss": 0.76880735, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11866, + "time_per_iteration": 2.4545392990112305 + }, + { + "auxiliary_loss_clip": 0.01104452, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.02232265, + "balance_loss_mlp": 1.03757143, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 2.6643205457919477, + "language_loss": 0.70109868, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72248805, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11867, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.01518393, + "balance_loss_mlp": 1.03465641, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 6.705448377548921, + "language_loss": 0.71701014, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73825878, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11868, + "time_per_iteration": 2.4669320583343506 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.01509547, + "balance_loss_mlp": 1.03705835, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.2157289852805278, + "language_loss": 0.65810573, + "learning_rate": 8.003324681766286e-07, + "loss": 0.67943501, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 11869, + "time_per_iteration": 2.452075242996216 + }, + { + "auxiliary_loss_clip": 0.01100077, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.01321864, + "balance_loss_mlp": 1.03367877, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5172430207890026, + "language_loss": 0.77797884, + "learning_rate": 8.000208730333298e-07, + "loss": 0.79922271, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 11870, + "time_per_iteration": 2.497041940689087 + }, + { + "auxiliary_loss_clip": 0.01101931, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.0176903, + "balance_loss_mlp": 1.03650808, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.6309688506128002, + "language_loss": 0.80767673, + "learning_rate": 7.997093233933597e-07, + "loss": 0.82899404, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 11871, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.01102602, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02396536, + "balance_loss_mlp": 1.03430688, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.5882335500802451, + "language_loss": 0.78899664, + "learning_rate": 7.993978192685331e-07, + "loss": 0.8103888, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 11872, + "time_per_iteration": 2.4607558250427246 + }, + { + "auxiliary_loss_clip": 0.01102685, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.035676, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.27961967349627, + "language_loss": 0.84102201, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86231267, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11873, + "time_per_iteration": 2.4343557357788086 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.01584864, + "balance_loss_mlp": 1.03362751, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.9049541609511427, + "language_loss": 0.86355829, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88479608, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 11874, + "time_per_iteration": 2.4541850090026855 + }, + { + "auxiliary_loss_clip": 0.01102173, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01449037, + "balance_loss_mlp": 1.0344789, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.8939539946065194, + "language_loss": 0.82938111, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85066295, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 11875, + "time_per_iteration": 2.4051244258880615 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01805401, + "balance_loss_mlp": 1.03582454, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.8810853083413022, + "language_loss": 0.69459707, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71597898, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 11876, + "time_per_iteration": 2.461815595626831 + }, + { + "auxiliary_loss_clip": 0.0110347, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01702094, + "balance_loss_mlp": 1.0361371, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 1.9368833564249184, + "language_loss": 0.78070778, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80203062, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 11877, + "time_per_iteration": 2.420319080352783 + }, + { + "auxiliary_loss_clip": 0.01100487, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.02102709, + "balance_loss_mlp": 1.0355581, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 6.763182431425842, + "language_loss": 0.69534928, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71667153, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 11878, + "time_per_iteration": 2.544290781021118 + }, + { + "auxiliary_loss_clip": 0.01100118, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.02254677, + "balance_loss_mlp": 1.03579926, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 1.7269032775367334, + "language_loss": 0.679344, + "learning_rate": 7.972185658107535e-07, + "loss": 0.70067525, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 11879, + "time_per_iteration": 2.4966022968292236 + }, + { + "auxiliary_loss_clip": 0.0110079, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01867926, + "balance_loss_mlp": 1.03534412, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.8974430539108489, + "language_loss": 0.68789601, + "learning_rate": 7.969074262321646e-07, + "loss": 0.70921516, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 11880, + "time_per_iteration": 2.502960205078125 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.02314401, + "balance_loss_mlp": 1.03362322, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.4282585669500105, + "language_loss": 0.80370951, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82507718, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 11881, + "time_per_iteration": 2.470723867416382 + }, + { + "auxiliary_loss_clip": 0.01100316, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01772344, + "balance_loss_mlp": 1.03443766, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.561021120261205, + "language_loss": 0.63214886, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65343523, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.66015625, + "step": 11882, + "time_per_iteration": 2.509657859802246 + }, + { + "auxiliary_loss_clip": 0.01104591, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01627707, + "balance_loss_mlp": 1.03739905, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 2.019106640227393, + "language_loss": 0.68898022, + "learning_rate": 7.959742812719304e-07, + "loss": 0.71029788, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 11883, + "time_per_iteration": 2.443070650100708 + }, + { + "auxiliary_loss_clip": 0.01101954, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.02256155, + "balance_loss_mlp": 1.03674269, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.8254173167373133, + "language_loss": 0.77734333, + "learning_rate": 7.956633242496788e-07, + "loss": 0.7987051, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 11884, + "time_per_iteration": 2.498660087585449 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01801157, + "balance_loss_mlp": 1.03647792, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.2601581794211456, + "language_loss": 0.73881954, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76020128, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 11885, + "time_per_iteration": 2.4516425132751465 + }, + { + "auxiliary_loss_clip": 0.01024577, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00262451, + "balance_loss_mlp": 1.00405157, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8858821646270937, + "language_loss": 0.66354322, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68382668, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 11886, + "time_per_iteration": 4.428006649017334 + }, + { + "auxiliary_loss_clip": 0.01101529, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.01769543, + "balance_loss_mlp": 1.03508115, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 2.6640943514117006, + "language_loss": 0.75138283, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77269423, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11887, + "time_per_iteration": 2.449885129928589 + }, + { + "auxiliary_loss_clip": 0.01102615, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.01408505, + "balance_loss_mlp": 1.03539872, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.6754616856197402, + "language_loss": 0.71326733, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73454678, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11888, + "time_per_iteration": 3.880155086517334 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01867044, + "balance_loss_mlp": 1.03440201, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 1.7956471800089868, + "language_loss": 0.84206235, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86340851, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 11889, + "time_per_iteration": 3.8910415172576904 + }, + { + "auxiliary_loss_clip": 0.01102924, + "auxiliary_loss_mlp": 0.01024297, + "balance_loss_clip": 1.0128237, + "balance_loss_mlp": 1.0358007, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 2.5861869043572994, + "language_loss": 0.75895607, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78022826, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11890, + "time_per_iteration": 3.942615270614624 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.01834106, + "balance_loss_mlp": 1.03469455, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.4697874617816058, + "language_loss": 0.74033976, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76163059, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 11891, + "time_per_iteration": 2.5003371238708496 + }, + { + "auxiliary_loss_clip": 0.01101426, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.02201867, + "balance_loss_mlp": 1.03495193, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 2.2548483440838676, + "language_loss": 0.68382698, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70517445, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11892, + "time_per_iteration": 2.431938409805298 + }, + { + "auxiliary_loss_clip": 0.0110488, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02009463, + "balance_loss_mlp": 1.03543699, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.391594593507675, + "language_loss": 0.73810261, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75948846, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 11893, + "time_per_iteration": 2.487308979034424 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.01721692, + "balance_loss_mlp": 1.03571689, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.3568611580959016, + "language_loss": 0.65677148, + "learning_rate": 7.925562677431185e-07, + "loss": 0.6781069, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11894, + "time_per_iteration": 2.4283459186553955 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01771545, + "balance_loss_mlp": 1.03522325, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6791953890758138, + "language_loss": 0.77629852, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79762185, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 11895, + "time_per_iteration": 2.478421926498413 + }, + { + "auxiliary_loss_clip": 0.01103559, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01749945, + "balance_loss_mlp": 1.03565729, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 2.101834953638121, + "language_loss": 0.69718951, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71852922, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 11896, + "time_per_iteration": 2.4343297481536865 + }, + { + "auxiliary_loss_clip": 0.01102918, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.03019083, + "balance_loss_mlp": 1.03482461, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.363966655291517, + "language_loss": 0.86399305, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88544941, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11897, + "time_per_iteration": 2.4417433738708496 + }, + { + "auxiliary_loss_clip": 0.01100281, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.02097273, + "balance_loss_mlp": 1.0358789, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.1825882164427015, + "language_loss": 0.77925879, + "learning_rate": 7.913147264997304e-07, + "loss": 0.8005845, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 11898, + "time_per_iteration": 2.4770331382751465 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01695776, + "balance_loss_mlp": 1.03606868, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.8319920355445916, + "language_loss": 0.73037088, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75171709, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11899, + "time_per_iteration": 2.4661285877227783 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.02177763, + "balance_loss_mlp": 1.03431213, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 3.247812809543318, + "language_loss": 0.76076663, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78210765, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11900, + "time_per_iteration": 2.4811995029449463 + }, + { + "auxiliary_loss_clip": 0.01103689, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.01656795, + "balance_loss_mlp": 1.03645658, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.955266248567226, + "language_loss": 0.80275625, + "learning_rate": 7.903840517773886e-07, + "loss": 0.82407176, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 11901, + "time_per_iteration": 2.423145294189453 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.01728368, + "balance_loss_mlp": 1.0356729, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 2.026904555565968, + "language_loss": 0.81071323, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83205605, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 11902, + "time_per_iteration": 2.459885835647583 + }, + { + "auxiliary_loss_clip": 0.01101351, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01437926, + "balance_loss_mlp": 1.03461826, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 1.7500024281838862, + "language_loss": 0.68114519, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70241332, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 11903, + "time_per_iteration": 2.407540798187256 + }, + { + "auxiliary_loss_clip": 0.01098245, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01664829, + "balance_loss_mlp": 1.03362346, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.6395674800408413, + "language_loss": 0.76098162, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78224206, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11904, + "time_per_iteration": 2.4763503074645996 + }, + { + "auxiliary_loss_clip": 0.01102193, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02438211, + "balance_loss_mlp": 1.03558111, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.193780720610546, + "language_loss": 0.72085339, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74224472, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 11905, + "time_per_iteration": 2.42999267578125 + }, + { + "auxiliary_loss_clip": 0.01099839, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.01815557, + "balance_loss_mlp": 1.03396761, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.8001319449198983, + "language_loss": 0.78033888, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80163181, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 11906, + "time_per_iteration": 2.483344078063965 + }, + { + "auxiliary_loss_clip": 0.01024215, + "auxiliary_loss_mlp": 0.01002687, + "balance_loss_clip": 1.0016793, + "balance_loss_mlp": 1.00366879, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.733638122069069, + "language_loss": 0.55290663, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57317567, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20507812, + "step": 11907, + "time_per_iteration": 2.9801692962646484 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.01846945, + "balance_loss_mlp": 1.03456974, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.7110812642484816, + "language_loss": 0.69928622, + "learning_rate": 7.882140833804593e-07, + "loss": 0.72059584, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11908, + "time_per_iteration": 2.4816782474517822 + }, + { + "auxiliary_loss_clip": 0.01102562, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.01918399, + "balance_loss_mlp": 1.03589427, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.7432604153438784, + "language_loss": 0.71158898, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73293138, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66796875, + "step": 11909, + "time_per_iteration": 2.463728189468384 + }, + { + "auxiliary_loss_clip": 0.01102467, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01932335, + "balance_loss_mlp": 1.0351603, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 2.4467362846605014, + "language_loss": 0.75301147, + "learning_rate": 7.875945057930144e-07, + "loss": 0.7743445, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11910, + "time_per_iteration": 2.552417755126953 + }, + { + "auxiliary_loss_clip": 0.01101078, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.02067399, + "balance_loss_mlp": 1.03550065, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.495993401769944, + "language_loss": 0.7667104, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78802884, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.65625, + "step": 11911, + "time_per_iteration": 2.441070079803467 + }, + { + "auxiliary_loss_clip": 0.01101901, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.01610255, + "balance_loss_mlp": 1.03523242, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.748429659384578, + "language_loss": 0.58908474, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61038435, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 11912, + "time_per_iteration": 2.857440948486328 + }, + { + "auxiliary_loss_clip": 0.01101647, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02072561, + "balance_loss_mlp": 1.03633833, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 2.5901065267477907, + "language_loss": 0.77851343, + "learning_rate": 7.866654842502376e-07, + "loss": 0.79985595, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11913, + "time_per_iteration": 2.4704270362854004 + }, + { + "auxiliary_loss_clip": 0.01097344, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.01518047, + "balance_loss_mlp": 1.03362048, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.6674872832299297, + "language_loss": 0.7374261, + "learning_rate": 7.863559024065234e-07, + "loss": 0.75865406, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 11914, + "time_per_iteration": 2.4930355548858643 + }, + { + "auxiliary_loss_clip": 0.01097032, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03384876, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.6897507669283607, + "language_loss": 0.74089867, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76215488, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 11915, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01101198, + "auxiliary_loss_mlp": 0.01026687, + "balance_loss_clip": 1.01569629, + "balance_loss_mlp": 1.03444886, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.8754792377471143, + "language_loss": 0.81102198, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83230084, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11916, + "time_per_iteration": 2.459618330001831 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.01945496, + "balance_loss_mlp": 1.03565669, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.9464707558133532, + "language_loss": 0.68163168, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70295465, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11917, + "time_per_iteration": 2.4127745628356934 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01660061, + "balance_loss_mlp": 1.0357312, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.6865560164096236, + "language_loss": 0.75851363, + "learning_rate": 7.851180353640896e-07, + "loss": 0.77981341, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11918, + "time_per_iteration": 2.4734885692596436 + }, + { + "auxiliary_loss_clip": 0.01024332, + "auxiliary_loss_mlp": 0.00998276, + "balance_loss_clip": 0.99721545, + "balance_loss_mlp": 1.00387406, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6281271868389183, + "language_loss": 0.53900385, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55922985, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20507812, + "step": 11919, + "time_per_iteration": 3.0739991664886475 + }, + { + "auxiliary_loss_clip": 0.01103551, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.0166924, + "balance_loss_mlp": 1.03664875, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 1.814886397013554, + "language_loss": 0.69109583, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71240735, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11920, + "time_per_iteration": 2.544965982437134 + }, + { + "auxiliary_loss_clip": 0.01101615, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.02296519, + "balance_loss_mlp": 1.03518677, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 2.316743559144869, + "language_loss": 0.74621791, + "learning_rate": 7.841901187598678e-07, + "loss": 0.76758158, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11921, + "time_per_iteration": 2.526437282562256 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01924133, + "balance_loss_mlp": 1.0359118, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.001999520631163, + "language_loss": 0.75461966, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77600539, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 11922, + "time_per_iteration": 2.4796934127807617 + }, + { + "auxiliary_loss_clip": 0.0102356, + "auxiliary_loss_mlp": 0.01004637, + "balance_loss_clip": 1.0036118, + "balance_loss_mlp": 1.00325036, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7529363745673505, + "language_loss": 0.55118704, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57146901, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.203125, + "step": 11923, + "time_per_iteration": 2.8653676509857178 + }, + { + "auxiliary_loss_clip": 0.01101474, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.02191389, + "balance_loss_mlp": 1.03463423, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.5657552163313224, + "language_loss": 0.7707153, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79207051, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 11924, + "time_per_iteration": 2.4798498153686523 + }, + { + "auxiliary_loss_clip": 0.01099287, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.01563597, + "balance_loss_mlp": 1.03447676, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.8554693193395075, + "language_loss": 0.68279767, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70405436, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11925, + "time_per_iteration": 2.456970453262329 + }, + { + "auxiliary_loss_clip": 0.01094381, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.0189774, + "balance_loss_mlp": 1.03209913, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.4556850136555692, + "language_loss": 0.77406371, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79530406, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62109375, + "step": 11926, + "time_per_iteration": 2.47904109954834 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01756239, + "balance_loss_mlp": 1.03482664, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 1.9978148890029475, + "language_loss": 0.77397847, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79532105, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 11927, + "time_per_iteration": 2.4695799350738525 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.01752985, + "balance_loss_mlp": 1.03606367, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.633304495033459, + "language_loss": 0.69208646, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71338403, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.640625, + "step": 11928, + "time_per_iteration": 3.8939363956451416 + }, + { + "auxiliary_loss_clip": 0.01097285, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.01755297, + "balance_loss_mlp": 1.03416717, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.8722089290497335, + "language_loss": 0.65309197, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67434746, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 11929, + "time_per_iteration": 2.6483962535858154 + }, + { + "auxiliary_loss_clip": 0.01101349, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.01543295, + "balance_loss_mlp": 1.03426468, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 1.9043937603193066, + "language_loss": 0.69810534, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71938944, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11930, + "time_per_iteration": 3.861077308654785 + }, + { + "auxiliary_loss_clip": 0.01099761, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.01666808, + "balance_loss_mlp": 1.0326252, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.6949604037705792, + "language_loss": 0.80755305, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82882911, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 11931, + "time_per_iteration": 5.3606438636779785 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.02060318, + "balance_loss_mlp": 1.03377175, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.0368865181542843, + "language_loss": 0.78136313, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8026641, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11932, + "time_per_iteration": 2.4471938610076904 + }, + { + "auxiliary_loss_clip": 0.01098455, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.01697493, + "balance_loss_mlp": 1.03404713, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 2.4237059069381446, + "language_loss": 0.75071502, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77197808, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 11933, + "time_per_iteration": 2.5671815872192383 + }, + { + "auxiliary_loss_clip": 0.01107402, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.03681624, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.920076286079433, + "language_loss": 0.69595957, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71737969, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 11934, + "time_per_iteration": 2.4200711250305176 + }, + { + "auxiliary_loss_clip": 0.0109937, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.02228904, + "balance_loss_mlp": 1.03382134, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.1353095308292858, + "language_loss": 0.86605275, + "learning_rate": 7.798653327195237e-07, + "loss": 0.8873809, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 11935, + "time_per_iteration": 2.4989066123962402 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01663518, + "balance_loss_mlp": 1.03355277, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.5482941622525788, + "language_loss": 0.73668665, + "learning_rate": 7.795567660576388e-07, + "loss": 0.75797164, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11936, + "time_per_iteration": 2.5941200256347656 + }, + { + "auxiliary_loss_clip": 0.01023485, + "auxiliary_loss_mlp": 0.00998978, + "balance_loss_clip": 0.99795878, + "balance_loss_mlp": 1.00320697, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7612162175951352, + "language_loss": 0.5594666, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57969117, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.203125, + "step": 11937, + "time_per_iteration": 3.0358333587646484 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.0194416, + "balance_loss_mlp": 1.03557646, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.9834308333096748, + "language_loss": 0.54777831, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56913126, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11938, + "time_per_iteration": 2.498337984085083 + }, + { + "auxiliary_loss_clip": 0.01096235, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01791584, + "balance_loss_mlp": 1.03201962, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.6763116198702877, + "language_loss": 0.76891506, + "learning_rate": 7.786313437947527e-07, + "loss": 0.79016298, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11939, + "time_per_iteration": 2.4648613929748535 + }, + { + "auxiliary_loss_clip": 0.0102339, + "auxiliary_loss_mlp": 0.01004556, + "balance_loss_clip": 1.00347769, + "balance_loss_mlp": 1.003003, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7581492176008457, + "language_loss": 0.61391574, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63419521, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.20410156, + "step": 11940, + "time_per_iteration": 3.0383803844451904 + }, + { + "auxiliary_loss_clip": 0.0109722, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.01903307, + "balance_loss_mlp": 1.03327632, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.5272164711726817, + "language_loss": 0.58784437, + "learning_rate": 7.780146271721097e-07, + "loss": 0.60911918, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11941, + "time_per_iteration": 2.5290164947509766 + }, + { + "auxiliary_loss_clip": 0.01100557, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.0178616, + "balance_loss_mlp": 1.03522658, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 1.9189885732421792, + "language_loss": 0.79849315, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81979108, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 11942, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02638674, + "balance_loss_mlp": 1.03522158, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.0597149659122636, + "language_loss": 0.66328835, + "learning_rate": 7.773980959006968e-07, + "loss": 0.6846866, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 11943, + "time_per_iteration": 2.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.01097892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01696157, + "balance_loss_mlp": 1.03440082, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.9764370465475432, + "language_loss": 0.79013598, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81140125, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6328125, + "step": 11944, + "time_per_iteration": 2.445718765258789 + }, + { + "auxiliary_loss_clip": 0.01102899, + "auxiliary_loss_mlp": 0.01036625, + "balance_loss_clip": 1.02368522, + "balance_loss_mlp": 1.03503132, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.260846776642364, + "language_loss": 0.62923992, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65063506, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 11945, + "time_per_iteration": 2.4455084800720215 + }, + { + "auxiliary_loss_clip": 0.01023274, + "auxiliary_loss_mlp": 0.01000772, + "balance_loss_clip": 0.99966967, + "balance_loss_mlp": 1.00287986, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7012511616617018, + "language_loss": 0.51091176, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53115225, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20410156, + "step": 11946, + "time_per_iteration": 2.993520498275757 + }, + { + "auxiliary_loss_clip": 0.0110474, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.0228188, + "balance_loss_mlp": 1.03633344, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.5511387132101104, + "language_loss": 0.74426639, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76567119, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 11947, + "time_per_iteration": 2.5280697345733643 + }, + { + "auxiliary_loss_clip": 0.01098111, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.01682568, + "balance_loss_mlp": 1.03252912, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.7377460165223417, + "language_loss": 0.72264934, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74391532, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 11948, + "time_per_iteration": 2.404911994934082 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.0260042, + "balance_loss_mlp": 1.0358839, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.5225277290119825, + "language_loss": 0.71613109, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73755664, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11949, + "time_per_iteration": 2.4918761253356934 + }, + { + "auxiliary_loss_clip": 0.01099737, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01983905, + "balance_loss_mlp": 1.03520155, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.917874476636917, + "language_loss": 0.75913876, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78044307, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11950, + "time_per_iteration": 2.4783732891082764 + }, + { + "auxiliary_loss_clip": 0.01104047, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01741719, + "balance_loss_mlp": 1.03651667, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.3664494047814872, + "language_loss": 0.67457062, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69590974, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 11951, + "time_per_iteration": 2.4524526596069336 + }, + { + "auxiliary_loss_clip": 0.01106378, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.03777874, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.7288194945229958, + "language_loss": 0.78023463, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80159694, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 11952, + "time_per_iteration": 2.49094295501709 + }, + { + "auxiliary_loss_clip": 0.01103687, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.02052677, + "balance_loss_mlp": 1.03563547, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 1.7793096783925773, + "language_loss": 0.74963003, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77099729, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11953, + "time_per_iteration": 2.500009298324585 + }, + { + "auxiliary_loss_clip": 0.01102038, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.0169332, + "balance_loss_mlp": 1.03495383, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.7832624252992626, + "language_loss": 0.72971594, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75102234, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11954, + "time_per_iteration": 2.4608652591705322 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02251804, + "balance_loss_mlp": 1.03668714, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.601255234350909, + "language_loss": 0.74186033, + "learning_rate": 7.737028058829425e-07, + "loss": 0.7632345, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11955, + "time_per_iteration": 2.474217176437378 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.01877582, + "balance_loss_mlp": 1.0353359, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.6751832358498482, + "language_loss": 0.73376679, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75508881, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11956, + "time_per_iteration": 2.5315232276916504 + }, + { + "auxiliary_loss_clip": 0.01101581, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01688766, + "balance_loss_mlp": 1.03342509, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 2.7995163806109407, + "language_loss": 0.7065621, + "learning_rate": 7.730875746869987e-07, + "loss": 0.72786307, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 11957, + "time_per_iteration": 2.479146957397461 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.02427661, + "balance_loss_mlp": 1.03408146, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.9581401117139001, + "language_loss": 0.73586559, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75725639, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 11958, + "time_per_iteration": 2.50201416015625 + }, + { + "auxiliary_loss_clip": 0.01099164, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.0223763, + "balance_loss_mlp": 1.03451216, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.602205422840009, + "language_loss": 0.84252381, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86385846, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 11959, + "time_per_iteration": 2.4619383811950684 + }, + { + "auxiliary_loss_clip": 0.01107021, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.0378958, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.9033832243828488, + "language_loss": 0.81933033, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84070033, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 11960, + "time_per_iteration": 2.4611432552337646 + }, + { + "auxiliary_loss_clip": 0.01100369, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.01926565, + "balance_loss_mlp": 1.03601289, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.6005750914484573, + "language_loss": 0.77382779, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79514658, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.64453125, + "step": 11961, + "time_per_iteration": 2.490257978439331 + }, + { + "auxiliary_loss_clip": 0.01096696, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01970601, + "balance_loss_mlp": 1.03359604, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.493885754938081, + "language_loss": 0.75197971, + "learning_rate": 7.715503110824326e-07, + "loss": 0.7732504, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 11962, + "time_per_iteration": 2.444990873336792 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.01888692, + "balance_loss_mlp": 1.03441834, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.6695078722173347, + "language_loss": 0.75041807, + "learning_rate": 7.712429980637001e-07, + "loss": 0.7717514, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 11963, + "time_per_iteration": 2.4661693572998047 + }, + { + "auxiliary_loss_clip": 0.01105424, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02045298, + "balance_loss_mlp": 1.03614235, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 8.488875605489067, + "language_loss": 0.80680382, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82819521, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 11964, + "time_per_iteration": 2.400843620300293 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.01819539, + "balance_loss_mlp": 1.03335524, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.6851421500357613, + "language_loss": 0.74987501, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77116108, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11965, + "time_per_iteration": 2.455549955368042 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.01950407, + "balance_loss_mlp": 1.03589249, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.6225024257282918, + "language_loss": 0.77548587, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79684699, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11966, + "time_per_iteration": 2.4651193618774414 + }, + { + "auxiliary_loss_clip": 0.01101346, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.01745796, + "balance_loss_mlp": 1.03470814, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.9626871533411263, + "language_loss": 0.72638512, + "learning_rate": 7.700142120511619e-07, + "loss": 0.74769109, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11967, + "time_per_iteration": 2.4732322692871094 + }, + { + "auxiliary_loss_clip": 0.01098168, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01812136, + "balance_loss_mlp": 1.03623199, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.8100027522509434, + "language_loss": 0.81220973, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83347309, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 11968, + "time_per_iteration": 2.4276745319366455 + }, + { + "auxiliary_loss_clip": 0.01098632, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.01651216, + "balance_loss_mlp": 1.03366137, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 2.0102886054893268, + "language_loss": 0.76459819, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78586376, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 11969, + "time_per_iteration": 3.861771821975708 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01781702, + "balance_loss_mlp": 1.03329253, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.7792544853917616, + "language_loss": 0.70936543, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73067832, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 11970, + "time_per_iteration": 2.421149253845215 + }, + { + "auxiliary_loss_clip": 0.01023909, + "auxiliary_loss_mlp": 0.00997715, + "balance_loss_clip": 0.99661201, + "balance_loss_mlp": 1.00355303, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9239284754087862, + "language_loss": 0.60847962, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62869585, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.203125, + "step": 11971, + "time_per_iteration": 4.394974231719971 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.02012718, + "balance_loss_mlp": 1.03647828, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.2193219685375647, + "language_loss": 0.79842031, + "learning_rate": 7.684792790494105e-07, + "loss": 0.8198278, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 11972, + "time_per_iteration": 3.8465628623962402 + }, + { + "auxiliary_loss_clip": 0.01104665, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.02426565, + "balance_loss_mlp": 1.0365268, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.5934226335424646, + "language_loss": 0.75385857, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77527189, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11973, + "time_per_iteration": 3.967134475708008 + }, + { + "auxiliary_loss_clip": 0.0102351, + "auxiliary_loss_mlp": 0.00997992, + "balance_loss_clip": 0.9969967, + "balance_loss_mlp": 1.00313878, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8568599946371717, + "language_loss": 0.57251143, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59272635, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20410156, + "step": 11974, + "time_per_iteration": 2.9041314125061035 + }, + { + "auxiliary_loss_clip": 0.0110113, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02024031, + "balance_loss_mlp": 1.03228343, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.0540036125086623, + "language_loss": 0.61555636, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63689601, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11975, + "time_per_iteration": 2.5565595626831055 + }, + { + "auxiliary_loss_clip": 0.0110015, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.01964951, + "balance_loss_mlp": 1.03378308, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.7485061825333017, + "language_loss": 0.67644596, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69775921, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11976, + "time_per_iteration": 2.4791998863220215 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.0175333, + "balance_loss_mlp": 1.03462696, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.9984197913928563, + "language_loss": 0.67032665, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69162977, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 11977, + "time_per_iteration": 2.4562158584594727 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01912272, + "balance_loss_mlp": 1.03690076, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.7897602101317545, + "language_loss": 0.75156534, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77293086, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 11978, + "time_per_iteration": 2.6318418979644775 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01953471, + "balance_loss_mlp": 1.03316569, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 2.125023403243126, + "language_loss": 0.78794968, + "learning_rate": 7.663323345468908e-07, + "loss": 0.80925471, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 11979, + "time_per_iteration": 2.4805469512939453 + }, + { + "auxiliary_loss_clip": 0.01103342, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_clip": 1.0148797, + "balance_loss_mlp": 1.03659976, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.7429736369489133, + "language_loss": 0.65073323, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67203552, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11980, + "time_per_iteration": 2.530036211013794 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.02152729, + "balance_loss_mlp": 1.03610325, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.8302790091648973, + "language_loss": 0.67421222, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69560248, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6796875, + "step": 11981, + "time_per_iteration": 2.5009641647338867 + }, + { + "auxiliary_loss_clip": 0.01103608, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02081633, + "balance_loss_mlp": 1.03605318, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.9266732225953629, + "language_loss": 0.73759854, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75896388, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11982, + "time_per_iteration": 2.4776506423950195 + }, + { + "auxiliary_loss_clip": 0.01101459, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.02592814, + "balance_loss_mlp": 1.03360009, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.8065417430122819, + "language_loss": 0.66113031, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68251604, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 11983, + "time_per_iteration": 2.441363573074341 + }, + { + "auxiliary_loss_clip": 0.01103087, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.0197134, + "balance_loss_mlp": 1.03643811, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.5519388922028943, + "language_loss": 0.66470373, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68604994, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 11984, + "time_per_iteration": 2.4713308811187744 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.02291024, + "balance_loss_mlp": 1.03795314, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.7856287402136095, + "language_loss": 0.73836136, + "learning_rate": 7.644939207017771e-07, + "loss": 0.75978738, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 11985, + "time_per_iteration": 2.4582014083862305 + }, + { + "auxiliary_loss_clip": 0.01101196, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.01717734, + "balance_loss_mlp": 1.03589368, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.7243225094685473, + "language_loss": 0.62891448, + "learning_rate": 7.641876823032977e-07, + "loss": 0.65020913, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 11986, + "time_per_iteration": 2.525557279586792 + }, + { + "auxiliary_loss_clip": 0.01104045, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.020702, + "balance_loss_mlp": 1.03693676, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.5922220046222206, + "language_loss": 0.72103626, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74241722, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.671875, + "step": 11987, + "time_per_iteration": 2.4586973190307617 + }, + { + "auxiliary_loss_clip": 0.01104181, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01796532, + "balance_loss_mlp": 1.03563333, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.7226788638874178, + "language_loss": 0.78616083, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80750442, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11988, + "time_per_iteration": 2.425905227661133 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01888466, + "balance_loss_mlp": 1.03406453, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 3.553932459688601, + "language_loss": 0.78784275, + "learning_rate": 7.632692483270618e-07, + "loss": 0.80915058, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11989, + "time_per_iteration": 2.58890700340271 + }, + { + "auxiliary_loss_clip": 0.01100086, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.01997149, + "balance_loss_mlp": 1.03511739, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 6.030003130937093, + "language_loss": 0.82572663, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84704268, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 11990, + "time_per_iteration": 2.422929048538208 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.02463794, + "balance_loss_mlp": 1.03581941, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 2.2646719383287746, + "language_loss": 0.76148689, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78286314, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 11991, + "time_per_iteration": 2.439966917037964 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.0163815, + "balance_loss_mlp": 1.03555298, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 2.0383069832544263, + "language_loss": 0.72644949, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74772066, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 11992, + "time_per_iteration": 2.508730173110962 + }, + { + "auxiliary_loss_clip": 0.01103062, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01668572, + "balance_loss_mlp": 1.0353663, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.8344706583489365, + "language_loss": 0.66479945, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68611324, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11993, + "time_per_iteration": 2.496220350265503 + }, + { + "auxiliary_loss_clip": 0.01101133, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01959491, + "balance_loss_mlp": 1.03491402, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.3726873705189786, + "language_loss": 0.65635949, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67768013, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11994, + "time_per_iteration": 2.481267213821411 + }, + { + "auxiliary_loss_clip": 0.01102846, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01626778, + "balance_loss_mlp": 1.03604794, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.7186394121352693, + "language_loss": 0.66596985, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68728906, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 11995, + "time_per_iteration": 2.4427177906036377 + }, + { + "auxiliary_loss_clip": 0.01099622, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.01687467, + "balance_loss_mlp": 1.0355916, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.85436447909986, + "language_loss": 0.79713655, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81842726, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.640625, + "step": 11996, + "time_per_iteration": 2.459115505218506 + }, + { + "auxiliary_loss_clip": 0.01103225, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0208993, + "balance_loss_mlp": 1.03676772, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 5.051284745258933, + "language_loss": 0.81384039, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83519638, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11997, + "time_per_iteration": 2.5205626487731934 + }, + { + "auxiliary_loss_clip": 0.01103756, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.02231431, + "balance_loss_mlp": 1.03483105, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 1.8313827039335897, + "language_loss": 0.67091608, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69230151, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11998, + "time_per_iteration": 2.431267023086548 + }, + { + "auxiliary_loss_clip": 0.01102391, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.02113891, + "balance_loss_mlp": 1.03637111, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.8599790081910679, + "language_loss": 0.72658986, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74793291, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66015625, + "step": 11999, + "time_per_iteration": 2.434900999069214 + }, + { + "auxiliary_loss_clip": 0.01103894, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.01531434, + "balance_loss_mlp": 1.03644443, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.3549521640831843, + "language_loss": 0.83203346, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85334623, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12000, + "time_per_iteration": 2.4203250408172607 + }, + { + "auxiliary_loss_clip": 0.011045, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.02320933, + "balance_loss_mlp": 1.03663993, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.6620327129342116, + "language_loss": 0.77455056, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79594404, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12001, + "time_per_iteration": 2.446817636489868 + }, + { + "auxiliary_loss_clip": 0.01103076, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02331209, + "balance_loss_mlp": 1.0377841, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.7518200734535594, + "language_loss": 0.81436306, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83574152, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12002, + "time_per_iteration": 2.4679903984069824 + }, + { + "auxiliary_loss_clip": 0.01104088, + "auxiliary_loss_mlp": 0.01027156, + "balance_loss_clip": 1.01510406, + "balance_loss_mlp": 1.03641772, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 2.283155803599373, + "language_loss": 0.62498772, + "learning_rate": 7.589888089035462e-07, + "loss": 0.6463002, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12003, + "time_per_iteration": 2.58776593208313 + }, + { + "auxiliary_loss_clip": 0.01102937, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.03539622, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.560985107334089, + "language_loss": 0.68500596, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70635808, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12004, + "time_per_iteration": 2.3969027996063232 + }, + { + "auxiliary_loss_clip": 0.01025027, + "auxiliary_loss_mlp": 0.0099804, + "balance_loss_clip": 0.99700272, + "balance_loss_mlp": 1.00448203, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8643975392958543, + "language_loss": 0.54278243, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56301308, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20507812, + "step": 12005, + "time_per_iteration": 2.9869492053985596 + }, + { + "auxiliary_loss_clip": 0.01102163, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.02052271, + "balance_loss_mlp": 1.03582788, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.4997790369746062, + "language_loss": 0.62904799, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65039825, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 12006, + "time_per_iteration": 2.6116576194763184 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.02048635, + "balance_loss_mlp": 1.0356214, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.702113645244825, + "language_loss": 0.92155731, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94289511, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12007, + "time_per_iteration": 2.4609286785125732 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.0176847, + "balance_loss_mlp": 1.03450811, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 2.0030110165156088, + "language_loss": 0.64172041, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66304755, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12008, + "time_per_iteration": 2.4176084995269775 + }, + { + "auxiliary_loss_clip": 0.0110518, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.01998281, + "balance_loss_mlp": 1.03712559, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 1.9142767312180562, + "language_loss": 0.78281379, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80419028, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12009, + "time_per_iteration": 2.486860752105713 + }, + { + "auxiliary_loss_clip": 0.01105579, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.0216608, + "balance_loss_mlp": 1.03696656, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.8228551130543398, + "language_loss": 0.63638747, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65778881, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 12010, + "time_per_iteration": 2.4727206230163574 + }, + { + "auxiliary_loss_clip": 0.01102553, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.0161804, + "balance_loss_mlp": 1.03521693, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.6555622208181195, + "language_loss": 0.77546549, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79676855, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 12011, + "time_per_iteration": 3.9832870960235596 + }, + { + "auxiliary_loss_clip": 0.01101603, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.02180326, + "balance_loss_mlp": 1.03652728, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.6788129204959028, + "language_loss": 0.79040414, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81174862, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12012, + "time_per_iteration": 2.5008320808410645 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01893425, + "balance_loss_mlp": 1.03754234, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7808047508810358, + "language_loss": 0.75740772, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77876568, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 12013, + "time_per_iteration": 3.9566152095794678 + }, + { + "auxiliary_loss_clip": 0.01102634, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.01445651, + "balance_loss_mlp": 1.03621209, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.471281729007001, + "language_loss": 0.75965142, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78093076, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 12014, + "time_per_iteration": 3.9748001098632812 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03614628, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.614960921439624, + "language_loss": 0.86782753, + "learning_rate": 7.553272008637346e-07, + "loss": 0.8891927, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 12015, + "time_per_iteration": 3.9988059997558594 + }, + { + "auxiliary_loss_clip": 0.01100793, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02386725, + "balance_loss_mlp": 1.03534532, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.879880951075302, + "language_loss": 0.77969182, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80105108, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12016, + "time_per_iteration": 2.45281982421875 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.03527737, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.444882690983208, + "language_loss": 0.77545393, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79679, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12017, + "time_per_iteration": 2.4577410221099854 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.01503491, + "balance_loss_mlp": 1.03520453, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 2.637627355867151, + "language_loss": 0.73314553, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75438797, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12018, + "time_per_iteration": 2.4559662342071533 + }, + { + "auxiliary_loss_clip": 0.01099343, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.02060056, + "balance_loss_mlp": 1.03665912, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 2.5981107828035026, + "language_loss": 0.77910566, + "learning_rate": 7.541081742032347e-07, + "loss": 0.80041099, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 12019, + "time_per_iteration": 2.4371070861816406 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01027432, + "balance_loss_clip": 1.01560664, + "balance_loss_mlp": 1.0350244, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.6489444745204735, + "language_loss": 0.73905075, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76032901, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12020, + "time_per_iteration": 2.5431694984436035 + }, + { + "auxiliary_loss_clip": 0.01103343, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.02115512, + "balance_loss_mlp": 1.03456461, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.6675263064788628, + "language_loss": 0.77169615, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79305232, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6875, + "step": 12021, + "time_per_iteration": 2.483078718185425 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0206815, + "balance_loss_mlp": 1.03491306, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 2.1826099193920374, + "language_loss": 0.68331528, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70464146, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12022, + "time_per_iteration": 2.454972982406616 + }, + { + "auxiliary_loss_clip": 0.01101398, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.01967645, + "balance_loss_mlp": 1.03452194, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7453912487460392, + "language_loss": 0.69111204, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71244639, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 12023, + "time_per_iteration": 2.4790570735931396 + }, + { + "auxiliary_loss_clip": 0.01098672, + "auxiliary_loss_mlp": 0.01028619, + "balance_loss_clip": 1.0168705, + "balance_loss_mlp": 1.03245616, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.6879551293116275, + "language_loss": 0.71159554, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73286849, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12024, + "time_per_iteration": 2.5031228065490723 + }, + { + "auxiliary_loss_clip": 0.0110197, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.02403879, + "balance_loss_mlp": 1.03651297, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.7827113324832673, + "language_loss": 0.75502241, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77639341, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 12025, + "time_per_iteration": 2.540117025375366 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.01867485, + "balance_loss_mlp": 1.03641152, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.1646639083011, + "language_loss": 0.7686342, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78994411, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12026, + "time_per_iteration": 2.619121551513672 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.02681398, + "balance_loss_mlp": 1.0340333, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 1.96713718815872, + "language_loss": 0.67575908, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69715375, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 12027, + "time_per_iteration": 2.5705184936523438 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.03727841, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.266596078102469, + "language_loss": 0.78860784, + "learning_rate": 7.513681291370469e-07, + "loss": 0.8099677, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12028, + "time_per_iteration": 2.521543502807617 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.0140934, + "balance_loss_mlp": 1.03353393, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.7215623884299298, + "language_loss": 0.81997663, + "learning_rate": 7.510639162726e-07, + "loss": 0.84123576, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12029, + "time_per_iteration": 2.518493890762329 + }, + { + "auxiliary_loss_clip": 0.01024828, + "auxiliary_loss_mlp": 0.01005824, + "balance_loss_clip": 1.00497139, + "balance_loss_mlp": 1.00435281, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8108297905714709, + "language_loss": 0.61798579, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63829231, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.20507812, + "step": 12030, + "time_per_iteration": 3.3008005619049072 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.01557982, + "balance_loss_mlp": 1.0335412, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.9017157177210717, + "language_loss": 0.78060263, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80185157, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12031, + "time_per_iteration": 2.410015106201172 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01835394, + "balance_loss_mlp": 1.03571391, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.6817131275089614, + "language_loss": 0.81817293, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83951116, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12032, + "time_per_iteration": 2.4944539070129395 + }, + { + "auxiliary_loss_clip": 0.01105541, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.0208137, + "balance_loss_mlp": 1.03620064, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.8666102600772807, + "language_loss": 0.74966335, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77104622, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 12033, + "time_per_iteration": 2.4195306301116943 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.01661193, + "balance_loss_mlp": 1.03378749, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.53895157270623, + "language_loss": 0.74960071, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77086604, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65234375, + "step": 12034, + "time_per_iteration": 2.4611551761627197 + }, + { + "auxiliary_loss_clip": 0.01099874, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.01580346, + "balance_loss_mlp": 1.03402519, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.7101429729597608, + "language_loss": 0.80541229, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82667649, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 12035, + "time_per_iteration": 2.4735255241394043 + }, + { + "auxiliary_loss_clip": 0.01102988, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.01914811, + "balance_loss_mlp": 1.03593981, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 1.6708890033016828, + "language_loss": 0.60718334, + "learning_rate": 7.489357529411326e-07, + "loss": 0.6285218, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12036, + "time_per_iteration": 2.4652183055877686 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.01916969, + "balance_loss_mlp": 1.03397477, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.8946488922724685, + "language_loss": 0.67484653, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69612211, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 12037, + "time_per_iteration": 2.439401388168335 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.03511119, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 1.8161270520180812, + "language_loss": 0.72444439, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74580336, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12038, + "time_per_iteration": 2.498206853866577 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01683688, + "balance_loss_mlp": 1.0365181, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.9105264736762722, + "language_loss": 0.72119117, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74251521, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 12039, + "time_per_iteration": 2.427929401397705 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02339756, + "balance_loss_mlp": 1.03659403, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 2.0387115182112567, + "language_loss": 0.75838852, + "learning_rate": 7.477207030458513e-07, + "loss": 0.77978736, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 12040, + "time_per_iteration": 2.4932591915130615 + }, + { + "auxiliary_loss_clip": 0.01100807, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.02060628, + "balance_loss_mlp": 1.03361833, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.913740912847533, + "language_loss": 0.76230586, + "learning_rate": 7.474170592596301e-07, + "loss": 0.7836442, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12041, + "time_per_iteration": 2.393092393875122 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.01658726, + "balance_loss_mlp": 1.03313875, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.0516689414632348, + "language_loss": 0.63410985, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65540266, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 12042, + "time_per_iteration": 2.4641988277435303 + }, + { + "auxiliary_loss_clip": 0.01105282, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.02106047, + "balance_loss_mlp": 1.03651488, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 2.5235443155533486, + "language_loss": 0.83237529, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85376412, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12043, + "time_per_iteration": 2.433598041534424 + }, + { + "auxiliary_loss_clip": 0.0110258, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.01696599, + "balance_loss_mlp": 1.03478646, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.7620410881767092, + "language_loss": 0.64035821, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66168237, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12044, + "time_per_iteration": 2.4627864360809326 + }, + { + "auxiliary_loss_clip": 0.0110401, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.01967335, + "balance_loss_mlp": 1.03717875, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.4978020202204398, + "language_loss": 0.81621009, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83756578, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12045, + "time_per_iteration": 2.4192216396331787 + }, + { + "auxiliary_loss_clip": 0.01098967, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0214541, + "balance_loss_mlp": 1.0345459, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.5204011665835366, + "language_loss": 0.71989012, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74121284, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 12046, + "time_per_iteration": 2.4425227642059326 + }, + { + "auxiliary_loss_clip": 0.01101516, + "auxiliary_loss_mlp": 0.01028832, + "balance_loss_clip": 1.01617825, + "balance_loss_mlp": 1.03457832, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.7863177787262001, + "language_loss": 0.71125013, + "learning_rate": 7.455961944046553e-07, + "loss": 0.7325536, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12047, + "time_per_iteration": 2.4461426734924316 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.02217817, + "balance_loss_mlp": 1.03864622, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6964330206566038, + "language_loss": 0.69839394, + "learning_rate": 7.45292883346627e-07, + "loss": 0.71981764, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 12048, + "time_per_iteration": 2.500828981399536 + }, + { + "auxiliary_loss_clip": 0.01024144, + "auxiliary_loss_mlp": 0.01003374, + "balance_loss_clip": 1.00239074, + "balance_loss_mlp": 1.00373721, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8243567714089579, + "language_loss": 0.5377422, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55801743, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20410156, + "step": 12049, + "time_per_iteration": 3.04441499710083 + }, + { + "auxiliary_loss_clip": 0.01107642, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01611245, + "balance_loss_mlp": 1.0363996, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 3.707915690527614, + "language_loss": 0.59357387, + "learning_rate": 7.446864039779258e-07, + "loss": 0.61495221, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12050, + "time_per_iteration": 2.4253971576690674 + }, + { + "auxiliary_loss_clip": 0.01024067, + "auxiliary_loss_mlp": 0.01001921, + "balance_loss_clip": 1.00082481, + "balance_loss_mlp": 1.00360942, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7294493469822053, + "language_loss": 0.53312981, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55338979, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20507812, + "step": 12051, + "time_per_iteration": 3.049221992492676 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.01975131, + "balance_loss_mlp": 1.03494263, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.5472193855827432, + "language_loss": 0.72156775, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74287981, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 12052, + "time_per_iteration": 2.46797776222229 + }, + { + "auxiliary_loss_clip": 0.01102918, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.01620138, + "balance_loss_mlp": 1.03667867, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 2.0462685374624088, + "language_loss": 0.74402982, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76534927, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 12053, + "time_per_iteration": 3.902531862258911 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.01761603, + "balance_loss_mlp": 1.03548145, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 2.1030984660426792, + "language_loss": 0.78042889, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80174804, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 12054, + "time_per_iteration": 2.4352877140045166 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.02155614, + "balance_loss_mlp": 1.03527296, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.826077293282499, + "language_loss": 0.68607175, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70742142, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12055, + "time_per_iteration": 3.8767430782318115 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.0167743, + "balance_loss_mlp": 1.03378785, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.7289479887024157, + "language_loss": 0.73999792, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76127023, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12056, + "time_per_iteration": 5.455943822860718 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.01225948, + "balance_loss_mlp": 1.03432655, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.6012339855962578, + "language_loss": 0.70800096, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72922009, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 12057, + "time_per_iteration": 2.5277090072631836 + }, + { + "auxiliary_loss_clip": 0.01104249, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.02146614, + "balance_loss_mlp": 1.03651786, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.9271030457531089, + "language_loss": 0.6256361, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64701855, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12058, + "time_per_iteration": 2.4183826446533203 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01710391, + "balance_loss_mlp": 1.03729975, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.9655611905409667, + "language_loss": 0.74991, + "learning_rate": 7.419596044262535e-07, + "loss": 0.7712611, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 12059, + "time_per_iteration": 2.4240307807922363 + }, + { + "auxiliary_loss_clip": 0.01098542, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.01989508, + "balance_loss_mlp": 1.03418756, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.73148336462866, + "language_loss": 0.79305416, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81434691, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12060, + "time_per_iteration": 2.493459463119507 + }, + { + "auxiliary_loss_clip": 0.0110292, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.01606321, + "balance_loss_mlp": 1.03522062, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 2.354515481339918, + "language_loss": 0.76317465, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78448856, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12061, + "time_per_iteration": 2.4897234439849854 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.01707315, + "balance_loss_mlp": 1.03607178, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.707041727604455, + "language_loss": 0.81039721, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83168906, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12062, + "time_per_iteration": 2.4312822818756104 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.01713991, + "balance_loss_mlp": 1.03735328, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 1.9940387151474506, + "language_loss": 0.68844217, + "learning_rate": 7.407489333471262e-07, + "loss": 0.70981008, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 12063, + "time_per_iteration": 2.5078516006469727 + }, + { + "auxiliary_loss_clip": 0.01099308, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.01788342, + "balance_loss_mlp": 1.03523588, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.3500136523009691, + "language_loss": 0.69967401, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72096425, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 12064, + "time_per_iteration": 2.4525294303894043 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0188607, + "balance_loss_mlp": 1.03744543, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 2.2336703023596716, + "language_loss": 0.90039599, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92173982, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12065, + "time_per_iteration": 2.4503257274627686 + }, + { + "auxiliary_loss_clip": 0.01023945, + "auxiliary_loss_mlp": 0.00999171, + "balance_loss_clip": 0.99806815, + "balance_loss_mlp": 1.00351691, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6543765045930707, + "language_loss": 0.56138921, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58162034, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20507812, + "step": 12066, + "time_per_iteration": 3.203951120376587 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01663494, + "balance_loss_mlp": 1.03434396, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.9431934533116317, + "language_loss": 0.76573753, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78701746, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12067, + "time_per_iteration": 2.5001325607299805 + }, + { + "auxiliary_loss_clip": 0.01024325, + "auxiliary_loss_mlp": 0.01000445, + "balance_loss_clip": 0.99928838, + "balance_loss_mlp": 1.00393391, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7268496108336204, + "language_loss": 0.57092577, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59117347, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.20410156, + "step": 12068, + "time_per_iteration": 2.961564779281616 + }, + { + "auxiliary_loss_clip": 0.01023519, + "auxiliary_loss_mlp": 0.00997832, + "balance_loss_clip": 0.99668139, + "balance_loss_mlp": 1.00306845, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6592626191043454, + "language_loss": 0.55426753, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57448101, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 12069, + "time_per_iteration": 3.111906051635742 + }, + { + "auxiliary_loss_clip": 0.01098503, + "auxiliary_loss_mlp": 0.01026099, + "balance_loss_clip": 1.015275, + "balance_loss_mlp": 1.03479362, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.8188254561357684, + "language_loss": 0.79876685, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82001287, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 12070, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.0109711, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.01860952, + "balance_loss_mlp": 1.03523922, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 2.135024516193614, + "language_loss": 0.72267014, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74394208, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62109375, + "step": 12071, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01099686, + "auxiliary_loss_mlp": 0.01034521, + "balance_loss_clip": 1.02348769, + "balance_loss_mlp": 1.03501189, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.7654284044796786, + "language_loss": 0.6994983, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72084033, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 12072, + "time_per_iteration": 2.430056571960449 + }, + { + "auxiliary_loss_clip": 0.01103966, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01702607, + "balance_loss_mlp": 1.03472924, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.7824187520349677, + "language_loss": 0.78317153, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80450368, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12073, + "time_per_iteration": 2.479287624359131 + }, + { + "auxiliary_loss_clip": 0.01100141, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01472855, + "balance_loss_mlp": 1.03557312, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.6619094478292162, + "language_loss": 0.70389605, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72516435, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 12074, + "time_per_iteration": 2.674909830093384 + }, + { + "auxiliary_loss_clip": 0.01103212, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.01514673, + "balance_loss_mlp": 1.03562987, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.6427266790682502, + "language_loss": 0.7405411, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76184535, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12075, + "time_per_iteration": 2.4879863262176514 + }, + { + "auxiliary_loss_clip": 0.01102234, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01842904, + "balance_loss_mlp": 1.03551388, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.5060189576698704, + "language_loss": 0.635382, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65671116, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 12076, + "time_per_iteration": 2.817375659942627 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01025872, + "balance_loss_clip": 1.0139389, + "balance_loss_mlp": 1.03412342, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 2.5366204857105332, + "language_loss": 0.79249585, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81376213, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 12077, + "time_per_iteration": 2.513556480407715 + }, + { + "auxiliary_loss_clip": 0.01023637, + "auxiliary_loss_mlp": 0.01002866, + "balance_loss_clip": 1.00172806, + "balance_loss_mlp": 1.00323439, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8858624910390671, + "language_loss": 0.64977288, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67003787, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20507812, + "step": 12078, + "time_per_iteration": 3.0679736137390137 + }, + { + "auxiliary_loss_clip": 0.01023707, + "auxiliary_loss_mlp": 0.01000415, + "balance_loss_clip": 0.99934798, + "balance_loss_mlp": 1.0032717, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7121161567572437, + "language_loss": 0.59267461, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61291581, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20507812, + "step": 12079, + "time_per_iteration": 3.201369524002075 + }, + { + "auxiliary_loss_clip": 0.01099969, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.01752567, + "balance_loss_mlp": 1.03409278, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.8820513707228834, + "language_loss": 0.65003538, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67133677, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 12080, + "time_per_iteration": 2.4735429286956787 + }, + { + "auxiliary_loss_clip": 0.01101349, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.01415968, + "balance_loss_mlp": 1.0338223, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 5.946673694238332, + "language_loss": 0.699211, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72049093, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12081, + "time_per_iteration": 2.4283978939056396 + }, + { + "auxiliary_loss_clip": 0.01104797, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01758754, + "balance_loss_mlp": 1.03677154, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 4.042667093911173, + "language_loss": 0.81073087, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83206874, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12082, + "time_per_iteration": 2.4518401622772217 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.02072203, + "balance_loss_mlp": 1.03555846, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.773588814829077, + "language_loss": 0.76834166, + "learning_rate": 7.347070528479158e-07, + "loss": 0.78975332, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 12083, + "time_per_iteration": 2.4874460697174072 + }, + { + "auxiliary_loss_clip": 0.01106226, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.01915908, + "balance_loss_mlp": 1.03815079, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.6288025457613526, + "language_loss": 0.72911334, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75049186, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12084, + "time_per_iteration": 2.4936935901641846 + }, + { + "auxiliary_loss_clip": 0.01104738, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.01855981, + "balance_loss_mlp": 1.03661275, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.683298254553577, + "language_loss": 0.77603686, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79739684, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 12085, + "time_per_iteration": 2.461860418319702 + }, + { + "auxiliary_loss_clip": 0.01101384, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02104557, + "balance_loss_mlp": 1.03391504, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.8314526850775286, + "language_loss": 0.72461057, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74595284, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12086, + "time_per_iteration": 2.4804890155792236 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02163863, + "balance_loss_mlp": 1.03661227, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 2.0882270871339492, + "language_loss": 0.69382304, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71518683, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12087, + "time_per_iteration": 2.472632884979248 + }, + { + "auxiliary_loss_clip": 0.01105347, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.02104521, + "balance_loss_mlp": 1.03732419, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 2.250412175179094, + "language_loss": 0.79011619, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81150979, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12088, + "time_per_iteration": 2.455793857574463 + }, + { + "auxiliary_loss_clip": 0.01103631, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.02433753, + "balance_loss_mlp": 1.03484094, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.695956180050093, + "language_loss": 0.73965418, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76105028, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 12089, + "time_per_iteration": 2.4252777099609375 + }, + { + "auxiliary_loss_clip": 0.01103186, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.0215764, + "balance_loss_mlp": 1.03553808, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.809103044869338, + "language_loss": 0.70920813, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73057657, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12090, + "time_per_iteration": 2.500497817993164 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.01645172, + "balance_loss_mlp": 1.03472519, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.7365025485289893, + "language_loss": 0.7741468, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79547042, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 12091, + "time_per_iteration": 2.5417003631591797 + }, + { + "auxiliary_loss_clip": 0.01100865, + "auxiliary_loss_mlp": 0.01028025, + "balance_loss_clip": 1.01566255, + "balance_loss_mlp": 1.03411698, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 3.1465600327537304, + "language_loss": 0.71302813, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73431706, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 12092, + "time_per_iteration": 2.4790890216827393 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.01770473, + "balance_loss_mlp": 1.03515983, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.8134968044947444, + "language_loss": 0.6129632, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63427377, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 12093, + "time_per_iteration": 2.531416654586792 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01923263, + "balance_loss_mlp": 1.03584278, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.5414395566200807, + "language_loss": 0.75677824, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77811199, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12094, + "time_per_iteration": 3.885373592376709 + }, + { + "auxiliary_loss_clip": 0.01099162, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.01925766, + "balance_loss_mlp": 1.03378463, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.9126635522388606, + "language_loss": 0.84773397, + "learning_rate": 7.310911308504808e-07, + "loss": 0.8690294, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12095, + "time_per_iteration": 2.429746150970459 + }, + { + "auxiliary_loss_clip": 0.01101056, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02127481, + "balance_loss_mlp": 1.03374481, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.7505444036152586, + "language_loss": 0.78038371, + "learning_rate": 7.307901165066479e-07, + "loss": 0.80173397, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12096, + "time_per_iteration": 3.8615665435791016 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.0200038, + "balance_loss_mlp": 1.03728688, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 2.3221692333246655, + "language_loss": 0.7232452, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74459803, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12097, + "time_per_iteration": 3.8505306243896484 + }, + { + "auxiliary_loss_clip": 0.01104342, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02190495, + "balance_loss_mlp": 1.03669655, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 2.177278264782312, + "language_loss": 0.7672922, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78868425, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 12098, + "time_per_iteration": 4.021664142608643 + }, + { + "auxiliary_loss_clip": 0.01102665, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.01946902, + "balance_loss_mlp": 1.03345513, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.8124975199898956, + "language_loss": 0.6742186, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69556803, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 12099, + "time_per_iteration": 2.8312809467315674 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.02005613, + "balance_loss_mlp": 1.0350759, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 4.666251767932542, + "language_loss": 0.72614902, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74754786, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 12100, + "time_per_iteration": 2.48777437210083 + }, + { + "auxiliary_loss_clip": 0.01103782, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.02218103, + "balance_loss_mlp": 1.03623843, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.4296037667662786, + "language_loss": 0.74749982, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76887369, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12101, + "time_per_iteration": 2.460813045501709 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.01962399, + "balance_loss_mlp": 1.03687561, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.6471267556293203, + "language_loss": 0.82180774, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84314322, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 12102, + "time_per_iteration": 2.486891031265259 + }, + { + "auxiliary_loss_clip": 0.01101993, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02160573, + "balance_loss_mlp": 1.03577983, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.238789262926412, + "language_loss": 0.81434906, + "learning_rate": 7.286843643386495e-07, + "loss": 0.8356986, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12103, + "time_per_iteration": 2.414008855819702 + }, + { + "auxiliary_loss_clip": 0.01102157, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.01403213, + "balance_loss_mlp": 1.03556037, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 2.300581534767291, + "language_loss": 0.66380107, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68508548, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 12104, + "time_per_iteration": 2.4741268157958984 + }, + { + "auxiliary_loss_clip": 0.01099619, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.01674151, + "balance_loss_mlp": 1.03588009, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.8448719986078481, + "language_loss": 0.65691745, + "learning_rate": 7.280831545667611e-07, + "loss": 0.67819774, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 12105, + "time_per_iteration": 2.5147173404693604 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.02052665, + "balance_loss_mlp": 1.03698063, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.269554332821791, + "language_loss": 0.75712693, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77848709, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12106, + "time_per_iteration": 2.435525417327881 + }, + { + "auxiliary_loss_clip": 0.01105516, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.01580417, + "balance_loss_mlp": 1.03651524, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.027947954090959, + "language_loss": 0.70116639, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72250462, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12107, + "time_per_iteration": 2.5302398204803467 + }, + { + "auxiliary_loss_clip": 0.01101241, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.02056551, + "balance_loss_mlp": 1.03459477, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.520569075146339, + "language_loss": 0.75155759, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77289176, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12108, + "time_per_iteration": 2.7630767822265625 + }, + { + "auxiliary_loss_clip": 0.01102209, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.03495109, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.5886355104574046, + "language_loss": 0.66785181, + "learning_rate": 7.268813138887124e-07, + "loss": 0.68916261, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 12109, + "time_per_iteration": 2.5576727390289307 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.01853728, + "balance_loss_mlp": 1.03609085, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 1.9794357831275327, + "language_loss": 0.62950575, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65083742, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 12110, + "time_per_iteration": 2.44002366065979 + }, + { + "auxiliary_loss_clip": 0.01102169, + "auxiliary_loss_mlp": 0.01026996, + "balance_loss_clip": 1.01408529, + "balance_loss_mlp": 1.03304601, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.7658774771753212, + "language_loss": 0.58043802, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60172975, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 12111, + "time_per_iteration": 2.6210787296295166 + }, + { + "auxiliary_loss_clip": 0.01106335, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01827931, + "balance_loss_mlp": 1.03801906, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 1.9352527589955661, + "language_loss": 0.73992717, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76129776, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 12112, + "time_per_iteration": 2.4524636268615723 + }, + { + "auxiliary_loss_clip": 0.01099679, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.01777458, + "balance_loss_mlp": 1.03403258, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.0053906619330006, + "language_loss": 0.67298758, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69427931, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12113, + "time_per_iteration": 2.4597878456115723 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03391302, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.6626035833227917, + "language_loss": 0.73243928, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75376785, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 12114, + "time_per_iteration": 2.4250495433807373 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03370285, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.0029156408767714, + "language_loss": 0.68175685, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70304716, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12115, + "time_per_iteration": 2.4954171180725098 + }, + { + "auxiliary_loss_clip": 0.01103561, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.01567912, + "balance_loss_mlp": 1.03449523, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.5749182133229294, + "language_loss": 0.59722745, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61854202, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12116, + "time_per_iteration": 2.5029101371765137 + }, + { + "auxiliary_loss_clip": 0.01100013, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.01579905, + "balance_loss_mlp": 1.03508806, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.7186518000931694, + "language_loss": 0.72523415, + "learning_rate": 7.2447995054705e-07, + "loss": 0.74651456, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 12117, + "time_per_iteration": 2.4426584243774414 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01024568, + "balance_loss_clip": 1.01234937, + "balance_loss_mlp": 1.03475642, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 2.143264936763247, + "language_loss": 0.69296616, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71423018, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12118, + "time_per_iteration": 2.5339369773864746 + }, + { + "auxiliary_loss_clip": 0.01097686, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.02279413, + "balance_loss_mlp": 1.03442514, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.6909309126085614, + "language_loss": 0.84203392, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86335295, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 12119, + "time_per_iteration": 2.3954200744628906 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.01645637, + "balance_loss_mlp": 1.03579891, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.2822251390786312, + "language_loss": 0.82164419, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84294862, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12120, + "time_per_iteration": 2.4175772666931152 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02358377, + "balance_loss_mlp": 1.03648496, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 1.8056895427232635, + "language_loss": 0.78642154, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80782175, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 12121, + "time_per_iteration": 2.406684160232544 + }, + { + "auxiliary_loss_clip": 0.01100839, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01706123, + "balance_loss_mlp": 1.03222573, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.5367306608153926, + "language_loss": 0.6915673, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71286988, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 12122, + "time_per_iteration": 2.533647060394287 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.01871347, + "balance_loss_mlp": 1.03240955, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.8487795313278665, + "language_loss": 0.8722074, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89347732, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12123, + "time_per_iteration": 2.4654133319854736 + }, + { + "auxiliary_loss_clip": 0.01099535, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.01967263, + "balance_loss_mlp": 1.03390992, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 2.1267005511199604, + "language_loss": 0.8275702, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84887826, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12124, + "time_per_iteration": 2.5298664569854736 + }, + { + "auxiliary_loss_clip": 0.01100083, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01730633, + "balance_loss_mlp": 1.0351851, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.8446613007140906, + "language_loss": 0.67240703, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69369495, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12125, + "time_per_iteration": 2.4683637619018555 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01760745, + "balance_loss_mlp": 1.03575897, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.8041889285235344, + "language_loss": 0.74976206, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77111757, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12126, + "time_per_iteration": 2.4857234954833984 + }, + { + "auxiliary_loss_clip": 0.01023798, + "auxiliary_loss_mlp": 0.01004495, + "balance_loss_clip": 1.00342834, + "balance_loss_mlp": 1.0033108, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8154544542721714, + "language_loss": 0.58675981, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60704273, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20507812, + "step": 12127, + "time_per_iteration": 2.9716975688934326 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.0168165, + "balance_loss_mlp": 1.03571177, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 1.9593385701209045, + "language_loss": 0.69048452, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71176225, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 12128, + "time_per_iteration": 2.5162582397460938 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.02043474, + "balance_loss_mlp": 1.03561521, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.0485847355558953, + "language_loss": 0.65249133, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67384678, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12129, + "time_per_iteration": 2.487868547439575 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01426673, + "balance_loss_mlp": 1.03446507, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.5377483717802485, + "language_loss": 0.74676943, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76801908, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 12130, + "time_per_iteration": 2.5030577182769775 + }, + { + "auxiliary_loss_clip": 0.01100647, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.0347085, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.617355369468953, + "language_loss": 0.6962043, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71750402, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12131, + "time_per_iteration": 2.4352428913116455 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03647351, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.4743863900351697, + "language_loss": 0.77282, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79412544, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 12132, + "time_per_iteration": 2.495375156402588 + }, + { + "auxiliary_loss_clip": 0.0110199, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.02124524, + "balance_loss_mlp": 1.03552151, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.217572112042413, + "language_loss": 0.79134017, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81268471, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 12133, + "time_per_iteration": 2.403266668319702 + }, + { + "auxiliary_loss_clip": 0.01100314, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01922512, + "balance_loss_mlp": 1.03376698, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.8655920091949136, + "language_loss": 0.7224102, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74372262, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12134, + "time_per_iteration": 2.510369300842285 + }, + { + "auxiliary_loss_clip": 0.01103467, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02342796, + "balance_loss_mlp": 1.03683078, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.8815102601218348, + "language_loss": 0.71485353, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73624468, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12135, + "time_per_iteration": 2.4513211250305176 + }, + { + "auxiliary_loss_clip": 0.01102275, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.02063513, + "balance_loss_mlp": 1.03478527, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.3479540644405645, + "language_loss": 0.62245309, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64379901, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 12136, + "time_per_iteration": 3.9409608840942383 + }, + { + "auxiliary_loss_clip": 0.01099061, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02353823, + "balance_loss_mlp": 1.03336811, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.8075029483736118, + "language_loss": 0.74606574, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76740515, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12137, + "time_per_iteration": 2.536616086959839 + }, + { + "auxiliary_loss_clip": 0.01104966, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.02242422, + "balance_loss_mlp": 1.03774345, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.6283862626280647, + "language_loss": 0.74377739, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76516545, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12138, + "time_per_iteration": 3.9735019207000732 + }, + { + "auxiliary_loss_clip": 0.01097337, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01875067, + "balance_loss_mlp": 1.03234982, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.062700649659985, + "language_loss": 0.71971607, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74098563, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12139, + "time_per_iteration": 4.020869731903076 + }, + { + "auxiliary_loss_clip": 0.01097707, + "auxiliary_loss_mlp": 0.01025679, + "balance_loss_clip": 1.01508093, + "balance_loss_mlp": 1.03471375, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.3852703912405009, + "language_loss": 0.73432374, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75555754, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62890625, + "step": 12140, + "time_per_iteration": 4.02800989151001 + }, + { + "auxiliary_loss_clip": 0.01101201, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01883268, + "balance_loss_mlp": 1.03433836, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.6478138849846053, + "language_loss": 0.55289412, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57421893, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12141, + "time_per_iteration": 2.7540974617004395 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01024438, + "balance_loss_clip": 1.01330972, + "balance_loss_mlp": 1.0348109, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.4560422495968448, + "language_loss": 0.72527927, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74651062, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 12142, + "time_per_iteration": 2.5032155513763428 + }, + { + "auxiliary_loss_clip": 0.01100592, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.0208838, + "balance_loss_mlp": 1.03534031, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.7431177644397007, + "language_loss": 0.73784506, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75917029, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12143, + "time_per_iteration": 2.4508650302886963 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.01959443, + "balance_loss_mlp": 1.03553247, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.3241470315915786, + "language_loss": 0.66688013, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68821418, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12144, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.01101867, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.01570094, + "balance_loss_mlp": 1.03569078, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.6911946286278683, + "language_loss": 0.79302132, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81431764, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12145, + "time_per_iteration": 2.4418389797210693 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.0172739, + "balance_loss_mlp": 1.03604698, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.6533125281544103, + "language_loss": 0.91145337, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93274218, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12146, + "time_per_iteration": 2.4392800331115723 + }, + { + "auxiliary_loss_clip": 0.01098845, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.01512456, + "balance_loss_mlp": 1.03589582, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 3.9977008079887275, + "language_loss": 0.61903286, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64027882, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 12147, + "time_per_iteration": 2.4647200107574463 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.02376306, + "balance_loss_mlp": 1.03584671, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.8253831896260186, + "language_loss": 0.75063682, + "learning_rate": 7.152039586086693e-07, + "loss": 0.7720145, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12148, + "time_per_iteration": 2.4266207218170166 + }, + { + "auxiliary_loss_clip": 0.01024253, + "auxiliary_loss_mlp": 0.01006124, + "balance_loss_clip": 1.00514054, + "balance_loss_mlp": 1.0036819, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.6830351523119454, + "language_loss": 0.56657213, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58687592, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 12149, + "time_per_iteration": 3.027615785598755 + }, + { + "auxiliary_loss_clip": 0.01101256, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.03406572, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.6835156550315518, + "language_loss": 0.73653138, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75784624, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12150, + "time_per_iteration": 2.4099485874176025 + }, + { + "auxiliary_loss_clip": 0.01103316, + "auxiliary_loss_mlp": 0.01027257, + "balance_loss_clip": 1.01468682, + "balance_loss_mlp": 1.03478301, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 1.944560081629452, + "language_loss": 0.84078568, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86209142, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12151, + "time_per_iteration": 2.4708986282348633 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.0218451, + "balance_loss_mlp": 1.0358156, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.670544008969589, + "language_loss": 0.77620661, + "learning_rate": 7.14010459655127e-07, + "loss": 0.79757774, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12152, + "time_per_iteration": 2.4695539474487305 + }, + { + "auxiliary_loss_clip": 0.01103894, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.01692247, + "balance_loss_mlp": 1.03786087, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.5663619490166691, + "language_loss": 0.79568756, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81701493, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12153, + "time_per_iteration": 2.533879280090332 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.0184778, + "balance_loss_mlp": 1.03624892, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.621227897072943, + "language_loss": 0.67485428, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69620812, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 12154, + "time_per_iteration": 2.418184995651245 + }, + { + "auxiliary_loss_clip": 0.01102596, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.01756167, + "balance_loss_mlp": 1.03488839, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.9151300415152432, + "language_loss": 0.65747088, + "learning_rate": 7.131158474313128e-07, + "loss": 0.67879438, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 12155, + "time_per_iteration": 2.4923956394195557 + }, + { + "auxiliary_loss_clip": 0.01096922, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.01884151, + "balance_loss_mlp": 1.03208816, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.6880646162483905, + "language_loss": 0.81661636, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83788967, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 12156, + "time_per_iteration": 2.4129483699798584 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0193615, + "balance_loss_mlp": 1.03432953, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.405459413664416, + "language_loss": 0.75240982, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77369863, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12157, + "time_per_iteration": 2.4383459091186523 + }, + { + "auxiliary_loss_clip": 0.0109587, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01831448, + "balance_loss_mlp": 1.03320694, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.0421552799554457, + "language_loss": 0.72894901, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75019395, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62890625, + "step": 12158, + "time_per_iteration": 2.409529209136963 + }, + { + "auxiliary_loss_clip": 0.01103494, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.01861429, + "balance_loss_mlp": 1.03654337, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.5794059929341078, + "language_loss": 0.85767531, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87901425, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 12159, + "time_per_iteration": 2.5144267082214355 + }, + { + "auxiliary_loss_clip": 0.01104084, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01700473, + "balance_loss_mlp": 1.03464055, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.076806919622798, + "language_loss": 0.73464298, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75597978, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12160, + "time_per_iteration": 2.405029535293579 + }, + { + "auxiliary_loss_clip": 0.0110368, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.01743793, + "balance_loss_mlp": 1.0356549, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 1.9196235781681743, + "language_loss": 0.72528148, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74661607, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12161, + "time_per_iteration": 2.4075698852539062 + }, + { + "auxiliary_loss_clip": 0.01107154, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.01682591, + "balance_loss_mlp": 1.03725171, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 5.707259461998225, + "language_loss": 0.69178545, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71315575, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 12162, + "time_per_iteration": 2.5137577056884766 + }, + { + "auxiliary_loss_clip": 0.01103934, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.01783824, + "balance_loss_mlp": 1.03625202, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.8703565147701806, + "language_loss": 0.66851526, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68985772, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12163, + "time_per_iteration": 2.4703001976013184 + }, + { + "auxiliary_loss_clip": 0.0110019, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.01822889, + "balance_loss_mlp": 1.03375793, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.4832431428317139, + "language_loss": 0.68488622, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70619065, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12164, + "time_per_iteration": 2.4578616619110107 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.0206039, + "balance_loss_mlp": 1.0372684, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.4717257929564707, + "language_loss": 0.72854477, + "learning_rate": 7.101369803195391e-07, + "loss": 0.74985963, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 12165, + "time_per_iteration": 2.451599359512329 + }, + { + "auxiliary_loss_clip": 0.01102834, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.03535652, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.8716087020467311, + "language_loss": 0.76773065, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78909522, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12166, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.02040911, + "balance_loss_mlp": 1.03687727, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 2.0545527072080945, + "language_loss": 0.79531485, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81665695, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 12167, + "time_per_iteration": 2.46749210357666 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.02647865, + "balance_loss_mlp": 1.03602624, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.668118675469295, + "language_loss": 0.76923746, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79063153, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12168, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.02010727, + "balance_loss_mlp": 1.03531849, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.6642312861866588, + "language_loss": 0.81803644, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83939904, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12169, + "time_per_iteration": 2.4575917720794678 + }, + { + "auxiliary_loss_clip": 0.0110358, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.03600168, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.6489053706369026, + "language_loss": 0.69867074, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72006512, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12170, + "time_per_iteration": 2.5548336505889893 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.01778316, + "balance_loss_mlp": 1.03440404, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.8957173811976022, + "language_loss": 0.69379872, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71510202, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 12171, + "time_per_iteration": 2.4312360286712646 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02222395, + "balance_loss_mlp": 1.03613734, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 3.1521599416176582, + "language_loss": 0.65645874, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67781472, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12172, + "time_per_iteration": 2.5476059913635254 + }, + { + "auxiliary_loss_clip": 0.01103925, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01507545, + "balance_loss_mlp": 1.03686643, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.237216797653005, + "language_loss": 0.6100843, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63139474, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12173, + "time_per_iteration": 2.4594876766204834 + }, + { + "auxiliary_loss_clip": 0.0110106, + "auxiliary_loss_mlp": 0.01025966, + "balance_loss_clip": 1.0147481, + "balance_loss_mlp": 1.03543413, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 1.8253545093146943, + "language_loss": 0.73704946, + "learning_rate": 7.074601815494243e-07, + "loss": 0.75831974, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12174, + "time_per_iteration": 2.515566349029541 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01025641, + "balance_loss_clip": 1.01454306, + "balance_loss_mlp": 1.03585482, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.5591268998445824, + "language_loss": 0.80786538, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82911384, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 12175, + "time_per_iteration": 2.5457139015197754 + }, + { + "auxiliary_loss_clip": 0.01101358, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.01556993, + "balance_loss_mlp": 1.03506994, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 1.8633750273009067, + "language_loss": 0.76524568, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78653067, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 12176, + "time_per_iteration": 2.4949843883514404 + }, + { + "auxiliary_loss_clip": 0.01101151, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01911664, + "balance_loss_mlp": 1.03668857, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 2.0429703759451074, + "language_loss": 0.76661092, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78792465, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 12177, + "time_per_iteration": 2.5137908458709717 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.02391255, + "balance_loss_mlp": 1.03224051, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.015813751432838, + "language_loss": 0.74164724, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76296735, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 12178, + "time_per_iteration": 3.7930397987365723 + }, + { + "auxiliary_loss_clip": 0.01102574, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01727104, + "balance_loss_mlp": 1.03461027, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 3.902615906398373, + "language_loss": 0.82204944, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84336722, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12179, + "time_per_iteration": 2.4926083087921143 + }, + { + "auxiliary_loss_clip": 0.01096766, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.0208199, + "balance_loss_mlp": 1.03491974, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.7358162194967635, + "language_loss": 0.74350899, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76479512, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 12180, + "time_per_iteration": 3.9542806148529053 + }, + { + "auxiliary_loss_clip": 0.01102785, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.01565659, + "balance_loss_mlp": 1.03372073, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.8090406286045437, + "language_loss": 0.78966725, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81097823, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12181, + "time_per_iteration": 5.370461940765381 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.01984382, + "balance_loss_mlp": 1.03627169, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.6926303414905466, + "language_loss": 0.71438134, + "learning_rate": 7.050841375089506e-07, + "loss": 0.7357372, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 12182, + "time_per_iteration": 2.395366668701172 + }, + { + "auxiliary_loss_clip": 0.01104144, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.02144599, + "balance_loss_mlp": 1.03678739, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.516043869338468, + "language_loss": 0.71126986, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73264194, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12183, + "time_per_iteration": 2.5406055450439453 + }, + { + "auxiliary_loss_clip": 0.0110482, + "auxiliary_loss_mlp": 0.01036116, + "balance_loss_clip": 1.02371871, + "balance_loss_mlp": 1.03739989, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.782462638135082, + "language_loss": 0.72453171, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74594104, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12184, + "time_per_iteration": 2.4203481674194336 + }, + { + "auxiliary_loss_clip": 0.01023657, + "auxiliary_loss_mlp": 0.01014002, + "balance_loss_clip": 1.01300097, + "balance_loss_mlp": 1.00301158, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.763876847553094, + "language_loss": 0.65218687, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67256343, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20703125, + "step": 12185, + "time_per_iteration": 3.0270133018493652 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.01434445, + "balance_loss_mlp": 1.03290069, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.938744837028, + "language_loss": 0.807504, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82877648, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12186, + "time_per_iteration": 2.4389822483062744 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.01886177, + "balance_loss_mlp": 1.03473353, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.9074219827171814, + "language_loss": 0.73762989, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75896305, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 12187, + "time_per_iteration": 2.4973368644714355 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.02201378, + "balance_loss_mlp": 1.03718829, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.717563808128471, + "language_loss": 0.88947159, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91085368, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12188, + "time_per_iteration": 2.4411849975585938 + }, + { + "auxiliary_loss_clip": 0.01103922, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01875281, + "balance_loss_mlp": 1.03507185, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 1.8794202002209792, + "language_loss": 0.7421574, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76350546, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12189, + "time_per_iteration": 2.4856882095336914 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.01568341, + "balance_loss_mlp": 1.03474796, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.5825056379011793, + "language_loss": 0.82314098, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84444714, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12190, + "time_per_iteration": 2.456019878387451 + }, + { + "auxiliary_loss_clip": 0.01102905, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.02160442, + "balance_loss_mlp": 1.03589582, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.732792680094222, + "language_loss": 0.71868473, + "learning_rate": 7.024148446550204e-07, + "loss": 0.74005568, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 12191, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01103846, + "auxiliary_loss_mlp": 0.01033545, + "balance_loss_clip": 1.02112985, + "balance_loss_mlp": 1.03651261, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5577440951602006, + "language_loss": 0.69461203, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71598595, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12192, + "time_per_iteration": 2.509345531463623 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01683998, + "balance_loss_mlp": 1.03492808, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.567853507336265, + "language_loss": 0.73125577, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75254017, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12193, + "time_per_iteration": 2.5061562061309814 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01551533, + "balance_loss_mlp": 1.03417039, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 4.194654550291271, + "language_loss": 0.76709831, + "learning_rate": 7.015259656476911e-07, + "loss": 0.78839254, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 12194, + "time_per_iteration": 2.429858446121216 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01026816, + "balance_loss_clip": 1.01485932, + "balance_loss_mlp": 1.03564095, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.8657268793695219, + "language_loss": 0.70426142, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72554034, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12195, + "time_per_iteration": 2.47605299949646 + }, + { + "auxiliary_loss_clip": 0.01103283, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02542019, + "balance_loss_mlp": 1.0363059, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.8748815414700573, + "language_loss": 0.72009385, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74149585, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12196, + "time_per_iteration": 2.4170355796813965 + }, + { + "auxiliary_loss_clip": 0.01100598, + "auxiliary_loss_mlp": 0.0102618, + "balance_loss_clip": 1.01400244, + "balance_loss_mlp": 1.03541434, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.6057850533210987, + "language_loss": 0.71647477, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73774254, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 12197, + "time_per_iteration": 2.5049266815185547 + }, + { + "auxiliary_loss_clip": 0.01103625, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0253787, + "balance_loss_mlp": 1.03410459, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 1.8231283851018831, + "language_loss": 0.78448522, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80590379, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 12198, + "time_per_iteration": 2.4223878383636475 + }, + { + "auxiliary_loss_clip": 0.0110209, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.01910758, + "balance_loss_mlp": 1.03584075, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.9413444885935378, + "language_loss": 0.74405611, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76537967, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12199, + "time_per_iteration": 2.503514528274536 + }, + { + "auxiliary_loss_clip": 0.01106436, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02240086, + "balance_loss_mlp": 1.03749204, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.410343838162529, + "language_loss": 0.76916027, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79057044, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 12200, + "time_per_iteration": 2.385646104812622 + }, + { + "auxiliary_loss_clip": 0.0110137, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.02033889, + "balance_loss_mlp": 1.03535485, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 1.9712263454849892, + "language_loss": 0.61337197, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63470274, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 12201, + "time_per_iteration": 2.494711399078369 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01022943, + "balance_loss_clip": 1.01186204, + "balance_loss_mlp": 1.03445053, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.770832212268843, + "language_loss": 0.52208602, + "learning_rate": 6.991577889352264e-07, + "loss": 0.54330868, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12202, + "time_per_iteration": 2.5508878231048584 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01555753, + "balance_loss_mlp": 1.03535819, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.8712183341846977, + "language_loss": 0.68450284, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70577991, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 12203, + "time_per_iteration": 2.455225944519043 + }, + { + "auxiliary_loss_clip": 0.01104999, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.02551746, + "balance_loss_mlp": 1.03558648, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.0115937343101176, + "language_loss": 0.66122192, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68264639, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 12204, + "time_per_iteration": 2.4275307655334473 + }, + { + "auxiliary_loss_clip": 0.01100701, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.01770449, + "balance_loss_mlp": 1.036098, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.1044017909422434, + "language_loss": 0.77165949, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79295337, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 12205, + "time_per_iteration": 2.465723752975464 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01719475, + "balance_loss_mlp": 1.03765106, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.633398371679339, + "language_loss": 0.79663754, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81794107, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12206, + "time_per_iteration": 2.4295356273651123 + }, + { + "auxiliary_loss_clip": 0.01101572, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01490951, + "balance_loss_mlp": 1.03436399, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.938197948270063, + "language_loss": 0.71248126, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73377299, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12207, + "time_per_iteration": 2.533963918685913 + }, + { + "auxiliary_loss_clip": 0.01023391, + "auxiliary_loss_mlp": 0.01000694, + "balance_loss_clip": 0.99954408, + "balance_loss_mlp": 1.0029676, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7861729617868648, + "language_loss": 0.54826534, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56850618, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 12208, + "time_per_iteration": 3.1204357147216797 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.01652431, + "balance_loss_mlp": 1.03539574, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.435103992793476, + "language_loss": 0.80251199, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82378662, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65234375, + "step": 12209, + "time_per_iteration": 2.4724159240722656 + }, + { + "auxiliary_loss_clip": 0.01097718, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.01657128, + "balance_loss_mlp": 1.03298545, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.8625549043469913, + "language_loss": 0.78958344, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81083614, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12210, + "time_per_iteration": 2.455946445465088 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.01407206, + "balance_loss_mlp": 1.03534794, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.6735159974751206, + "language_loss": 0.7608707, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78213215, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12211, + "time_per_iteration": 2.4627277851104736 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.01444197, + "balance_loss_mlp": 1.03521109, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.9025360413194936, + "language_loss": 0.71490365, + "learning_rate": 6.962020082425748e-07, + "loss": 0.73618519, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12212, + "time_per_iteration": 2.446685552597046 + }, + { + "auxiliary_loss_clip": 0.01103728, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01991129, + "balance_loss_mlp": 1.03784096, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.9034635106886582, + "language_loss": 0.68719161, + "learning_rate": 6.959067019092766e-07, + "loss": 0.70854366, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12213, + "time_per_iteration": 2.4991095066070557 + }, + { + "auxiliary_loss_clip": 0.01023626, + "auxiliary_loss_mlp": 0.01002854, + "balance_loss_clip": 1.00172174, + "balance_loss_mlp": 1.00317287, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7248810226626392, + "language_loss": 0.54344672, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56371152, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.20507812, + "step": 12214, + "time_per_iteration": 2.920579433441162 + }, + { + "auxiliary_loss_clip": 0.01103211, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01514435, + "balance_loss_mlp": 1.03471541, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 1.9617721107193735, + "language_loss": 0.70233238, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72362781, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.68359375, + "step": 12215, + "time_per_iteration": 2.4825196266174316 + }, + { + "auxiliary_loss_clip": 0.01098919, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.01648164, + "balance_loss_mlp": 1.03576207, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.7008791597621735, + "language_loss": 0.72984588, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75110614, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 12216, + "time_per_iteration": 2.419165849685669 + }, + { + "auxiliary_loss_clip": 0.01106239, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03503752, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.6841898563593931, + "language_loss": 0.7813915, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80280441, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 12217, + "time_per_iteration": 2.519476890563965 + }, + { + "auxiliary_loss_clip": 0.01097824, + "auxiliary_loss_mlp": 0.01025415, + "balance_loss_clip": 1.01500201, + "balance_loss_mlp": 1.03322065, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 2.0430723318586814, + "language_loss": 0.77478087, + "learning_rate": 6.94430912236911e-07, + "loss": 0.7960133, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.64453125, + "step": 12218, + "time_per_iteration": 2.4323973655700684 + }, + { + "auxiliary_loss_clip": 0.0109922, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.0175488, + "balance_loss_mlp": 1.03478718, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 2.4653490702635223, + "language_loss": 0.72245163, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74373412, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12219, + "time_per_iteration": 3.851811408996582 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01675463, + "balance_loss_mlp": 1.03373814, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.7840681188410097, + "language_loss": 0.7480529, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76930463, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 12220, + "time_per_iteration": 2.450587511062622 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01934493, + "balance_loss_mlp": 1.03515816, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.5828657801363317, + "language_loss": 0.65927309, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68060255, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12221, + "time_per_iteration": 3.9862098693847656 + }, + { + "auxiliary_loss_clip": 0.01100484, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01736093, + "balance_loss_mlp": 1.03518033, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.690484446007973, + "language_loss": 0.69146597, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71275526, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 12222, + "time_per_iteration": 3.9009041786193848 + }, + { + "auxiliary_loss_clip": 0.01099444, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.01703954, + "balance_loss_mlp": 1.03489995, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.6021663525354104, + "language_loss": 0.65751356, + "learning_rate": 6.92956360247217e-07, + "loss": 0.67878354, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 12223, + "time_per_iteration": 3.9320757389068604 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01641989, + "balance_loss_mlp": 1.03491271, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.7805598542267875, + "language_loss": 0.72150576, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74280441, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 12224, + "time_per_iteration": 2.424764394760132 + }, + { + "auxiliary_loss_clip": 0.01102425, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.0356946, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.667305857067153, + "language_loss": 0.72422898, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74555409, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 12225, + "time_per_iteration": 2.555699110031128 + }, + { + "auxiliary_loss_clip": 0.0110455, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.01911092, + "balance_loss_mlp": 1.03528094, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.6574802149125882, + "language_loss": 0.76740652, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78877175, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 12226, + "time_per_iteration": 2.417281150817871 + }, + { + "auxiliary_loss_clip": 0.0110041, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.01671648, + "balance_loss_mlp": 1.03516448, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.6412947887343436, + "language_loss": 0.66742253, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68870974, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12227, + "time_per_iteration": 2.493746280670166 + }, + { + "auxiliary_loss_clip": 0.01101958, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.02254581, + "balance_loss_mlp": 1.03482342, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.3969319271399194, + "language_loss": 0.63719964, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65855956, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12228, + "time_per_iteration": 2.4691944122314453 + }, + { + "auxiliary_loss_clip": 0.01099398, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.02261627, + "balance_loss_mlp": 1.03371692, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 2.005632249261944, + "language_loss": 0.63364494, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65497524, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 12229, + "time_per_iteration": 2.44689679145813 + }, + { + "auxiliary_loss_clip": 0.01104076, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.02215624, + "balance_loss_mlp": 1.03622568, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.6720920493620766, + "language_loss": 0.73554301, + "learning_rate": 6.908940694298726e-07, + "loss": 0.7569316, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12230, + "time_per_iteration": 2.471467971801758 + }, + { + "auxiliary_loss_clip": 0.01102648, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.01664519, + "balance_loss_mlp": 1.03582287, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 1.9806878096831662, + "language_loss": 0.71668804, + "learning_rate": 6.90599654932332e-07, + "loss": 0.73800141, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 12231, + "time_per_iteration": 2.473133087158203 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02288222, + "balance_loss_mlp": 1.03647971, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 3.941316401522165, + "language_loss": 0.64094537, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66234899, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 12232, + "time_per_iteration": 2.4203951358795166 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01851058, + "balance_loss_mlp": 1.03487468, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.8455770572081356, + "language_loss": 0.75458562, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77591407, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12233, + "time_per_iteration": 2.4624409675598145 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.01549006, + "balance_loss_mlp": 1.03507233, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.614964377536134, + "language_loss": 0.73402774, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75531423, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12234, + "time_per_iteration": 2.4193742275238037 + }, + { + "auxiliary_loss_clip": 0.01103947, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.01924026, + "balance_loss_mlp": 1.03720987, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 4.5713288626894455, + "language_loss": 0.59835577, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61970031, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 12235, + "time_per_iteration": 2.5044472217559814 + }, + { + "auxiliary_loss_clip": 0.01101342, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.01521957, + "balance_loss_mlp": 1.03657699, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.0497651121742115, + "language_loss": 0.8565346, + "learning_rate": 6.891283274567259e-07, + "loss": 0.87781453, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12236, + "time_per_iteration": 2.3936641216278076 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.01669908, + "balance_loss_mlp": 1.03538775, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.8090519272371215, + "language_loss": 0.69331872, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71462798, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12237, + "time_per_iteration": 2.473252296447754 + }, + { + "auxiliary_loss_clip": 0.01101452, + "auxiliary_loss_mlp": 0.01025644, + "balance_loss_clip": 1.01471233, + "balance_loss_mlp": 1.03477573, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.6472611180309946, + "language_loss": 0.72134531, + "learning_rate": 6.885401443470839e-07, + "loss": 0.7426163, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 12238, + "time_per_iteration": 2.423517942428589 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.01703119, + "balance_loss_mlp": 1.03515995, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.7391094576956916, + "language_loss": 0.72675085, + "learning_rate": 6.882461273827205e-07, + "loss": 0.7481029, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 12239, + "time_per_iteration": 2.523238182067871 + }, + { + "auxiliary_loss_clip": 0.01098843, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01849365, + "balance_loss_mlp": 1.03532851, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.5041553602452318, + "language_loss": 0.78892875, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81021476, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 12240, + "time_per_iteration": 2.4987194538116455 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.02145731, + "balance_loss_mlp": 1.03596234, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.7320565425934242, + "language_loss": 0.83208013, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85342342, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 12241, + "time_per_iteration": 2.499547004699707 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.01507461, + "balance_loss_mlp": 1.03403616, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.8298064214189858, + "language_loss": 0.78645867, + "learning_rate": 6.873643749852484e-07, + "loss": 0.8077209, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12242, + "time_per_iteration": 2.4207592010498047 + }, + { + "auxiliary_loss_clip": 0.01102156, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.0180552, + "balance_loss_mlp": 1.0359714, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.9546604159013963, + "language_loss": 0.79385024, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81516558, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12243, + "time_per_iteration": 2.51019024848938 + }, + { + "auxiliary_loss_clip": 0.01102378, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.02083445, + "balance_loss_mlp": 1.03466713, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 1.9125543259943414, + "language_loss": 0.74100977, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76236194, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12244, + "time_per_iteration": 2.4030749797821045 + }, + { + "auxiliary_loss_clip": 0.01101314, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01773405, + "balance_loss_mlp": 1.03416705, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.7798055097675247, + "language_loss": 0.6942178, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71552444, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12245, + "time_per_iteration": 2.4875071048736572 + }, + { + "auxiliary_loss_clip": 0.01098192, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.01933169, + "balance_loss_mlp": 1.03475428, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.5087221257099204, + "language_loss": 0.73185629, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75314939, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.63671875, + "step": 12246, + "time_per_iteration": 2.4394288063049316 + }, + { + "auxiliary_loss_clip": 0.01096401, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.01489758, + "balance_loss_mlp": 1.0334698, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.1784937379902787, + "language_loss": 0.73557955, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75680184, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 12247, + "time_per_iteration": 2.4587297439575195 + }, + { + "auxiliary_loss_clip": 0.01101638, + "auxiliary_loss_mlp": 0.01027969, + "balance_loss_clip": 1.01730013, + "balance_loss_mlp": 1.03827024, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.48643381660021, + "language_loss": 0.7409212, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76221728, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 12248, + "time_per_iteration": 2.4140796661376953 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.02237701, + "balance_loss_mlp": 1.03480268, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 4.381127457761843, + "language_loss": 0.72677851, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74813205, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12249, + "time_per_iteration": 2.4724795818328857 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.01859331, + "balance_loss_mlp": 1.03676438, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.708422030858321, + "language_loss": 0.77026933, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79160416, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12250, + "time_per_iteration": 2.4324309825897217 + }, + { + "auxiliary_loss_clip": 0.01102594, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.01680052, + "balance_loss_mlp": 1.0353688, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 2.1441444373175687, + "language_loss": 0.71412712, + "learning_rate": 6.8472180686052e-07, + "loss": 0.7354399, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12251, + "time_per_iteration": 2.4759652614593506 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01828933, + "balance_loss_mlp": 1.03470254, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.4418314268019194, + "language_loss": 0.65489835, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67619503, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12252, + "time_per_iteration": 2.8028664588928223 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01736534, + "balance_loss_mlp": 1.03739333, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.657771200645772, + "language_loss": 0.79182792, + "learning_rate": 6.841351178440884e-07, + "loss": 0.8131668, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12253, + "time_per_iteration": 2.472512722015381 + }, + { + "auxiliary_loss_clip": 0.01096622, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 1.03384531, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.145672565702914, + "language_loss": 0.75874883, + "learning_rate": 6.83841848176905e-07, + "loss": 0.77999103, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 12254, + "time_per_iteration": 2.419156074523926 + }, + { + "auxiliary_loss_clip": 0.01101466, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.02074361, + "balance_loss_mlp": 1.03581631, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.333279522964119, + "language_loss": 0.68892902, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71026909, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12255, + "time_per_iteration": 2.456407308578491 + }, + { + "auxiliary_loss_clip": 0.01101847, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01738834, + "balance_loss_mlp": 1.03577256, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 2.0115502306535404, + "language_loss": 0.7508868, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77219987, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12256, + "time_per_iteration": 2.4806578159332275 + }, + { + "auxiliary_loss_clip": 0.01103736, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01691759, + "balance_loss_mlp": 1.0363915, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.5936534045043864, + "language_loss": 0.73533136, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75665981, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12257, + "time_per_iteration": 2.5967447757720947 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02328479, + "balance_loss_mlp": 1.0344913, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.4666060569830273, + "language_loss": 0.78067857, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80202311, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 12258, + "time_per_iteration": 2.454329252243042 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.0180105, + "balance_loss_mlp": 1.03685117, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.3867663760940814, + "language_loss": 0.66167754, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68302274, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12259, + "time_per_iteration": 2.517813205718994 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.02026582, + "balance_loss_mlp": 1.03576601, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.584231595020614, + "language_loss": 0.73625088, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75759482, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 12260, + "time_per_iteration": 2.5023396015167236 + }, + { + "auxiliary_loss_clip": 0.0110407, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.01717019, + "balance_loss_mlp": 1.03662717, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.604192195943769, + "language_loss": 0.73533583, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75666034, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 12261, + "time_per_iteration": 3.906297445297241 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.02218294, + "balance_loss_mlp": 1.03563118, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 2.303167962152087, + "language_loss": 0.66901404, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69041032, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12262, + "time_per_iteration": 2.4535868167877197 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01726305, + "balance_loss_mlp": 1.03487778, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.8236008971372257, + "language_loss": 0.88766813, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90899056, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12263, + "time_per_iteration": 4.029206037521362 + }, + { + "auxiliary_loss_clip": 0.01096266, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.01636577, + "balance_loss_mlp": 1.03449428, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 2.309256872894793, + "language_loss": 0.67259324, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69382036, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6171875, + "step": 12264, + "time_per_iteration": 3.8689637184143066 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.01562762, + "balance_loss_mlp": 1.03491688, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 2.4971579087814066, + "language_loss": 0.80039012, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82164693, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12265, + "time_per_iteration": 4.065499782562256 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.01788533, + "balance_loss_mlp": 1.03421259, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.6219065104687562, + "language_loss": 0.74228191, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76361895, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 12266, + "time_per_iteration": 2.469236373901367 + }, + { + "auxiliary_loss_clip": 0.01103845, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02281022, + "balance_loss_mlp": 1.03720498, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.5661834210732133, + "language_loss": 0.73517638, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75656438, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12267, + "time_per_iteration": 2.504617214202881 + }, + { + "auxiliary_loss_clip": 0.01100734, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.03550386, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 1.9413990473639766, + "language_loss": 0.82913959, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85046864, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 12268, + "time_per_iteration": 2.4835684299468994 + }, + { + "auxiliary_loss_clip": 0.01098968, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.02494073, + "balance_loss_mlp": 1.034657, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.7133544019503224, + "language_loss": 0.7298789, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75123322, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 12269, + "time_per_iteration": 2.467454195022583 + }, + { + "auxiliary_loss_clip": 0.01104784, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.01989651, + "balance_loss_mlp": 1.03631639, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.1055066962392095, + "language_loss": 0.69917566, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72054565, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 12270, + "time_per_iteration": 2.5774502754211426 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.03567266, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.7971813672192163, + "language_loss": 0.69534814, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71663284, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 12271, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.01102484, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.01770949, + "balance_loss_mlp": 1.03490114, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 1.9993430148747984, + "language_loss": 0.68443513, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70575643, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12272, + "time_per_iteration": 2.514380693435669 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01809931, + "balance_loss_mlp": 1.03528929, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.683058960031114, + "language_loss": 0.77877617, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80006814, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12273, + "time_per_iteration": 2.4802489280700684 + }, + { + "auxiliary_loss_clip": 0.01098973, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01958835, + "balance_loss_mlp": 1.03365088, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.8227934716103082, + "language_loss": 0.83283198, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85413539, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 12274, + "time_per_iteration": 2.4196221828460693 + }, + { + "auxiliary_loss_clip": 0.01106787, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.01393938, + "balance_loss_mlp": 1.03611016, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.788699432283416, + "language_loss": 0.7346586, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75599259, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 12275, + "time_per_iteration": 2.4947471618652344 + }, + { + "auxiliary_loss_clip": 0.01106269, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.02698684, + "balance_loss_mlp": 1.03702235, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.805676108210034, + "language_loss": 0.73670596, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75816184, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12276, + "time_per_iteration": 2.460041046142578 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01706874, + "balance_loss_mlp": 1.03511322, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.2438661310985544, + "language_loss": 0.77184784, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79316336, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 12277, + "time_per_iteration": 2.5230605602264404 + }, + { + "auxiliary_loss_clip": 0.01101926, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.02326107, + "balance_loss_mlp": 1.03710866, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.8274458620386211, + "language_loss": 0.78436172, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80572963, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 12278, + "time_per_iteration": 2.4361507892608643 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.01922441, + "balance_loss_mlp": 1.03554451, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.0940191805387722, + "language_loss": 0.72178644, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74312687, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 12279, + "time_per_iteration": 2.454338312149048 + }, + { + "auxiliary_loss_clip": 0.01102728, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02146792, + "balance_loss_mlp": 1.03468275, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.207094607312378, + "language_loss": 0.85757834, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87894535, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12280, + "time_per_iteration": 2.4340832233428955 + }, + { + "auxiliary_loss_clip": 0.01103222, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.016675, + "balance_loss_mlp": 1.03586102, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 2.186067036515089, + "language_loss": 0.72367251, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74499011, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12281, + "time_per_iteration": 2.4844117164611816 + }, + { + "auxiliary_loss_clip": 0.01102088, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.02173972, + "balance_loss_mlp": 1.03446507, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.9159466937607454, + "language_loss": 0.6074115, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62876809, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 12282, + "time_per_iteration": 2.4337880611419678 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.01902103, + "balance_loss_mlp": 1.03598022, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 2.224847577186844, + "language_loss": 0.67914271, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70049471, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12283, + "time_per_iteration": 2.5443530082702637 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02461326, + "balance_loss_mlp": 1.03717303, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.587417277679554, + "language_loss": 0.76002008, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78141761, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12284, + "time_per_iteration": 2.561150312423706 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.03535485, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.8705632629894415, + "language_loss": 0.69351077, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71484059, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 12285, + "time_per_iteration": 2.5044779777526855 + }, + { + "auxiliary_loss_clip": 0.01105253, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.01810813, + "balance_loss_mlp": 1.03483808, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 2.818148859522571, + "language_loss": 0.79595774, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81731659, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 12286, + "time_per_iteration": 2.462742328643799 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.01436138, + "balance_loss_mlp": 1.03468239, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 2.0998689615756616, + "language_loss": 0.65484864, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67611259, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 12287, + "time_per_iteration": 2.5399184226989746 + }, + { + "auxiliary_loss_clip": 0.01097159, + "auxiliary_loss_mlp": 0.01026905, + "balance_loss_clip": 1.01564598, + "balance_loss_mlp": 1.03362429, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.862112231817168, + "language_loss": 0.76542664, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78666735, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 12288, + "time_per_iteration": 2.483729362487793 + }, + { + "auxiliary_loss_clip": 0.01107844, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.0192709, + "balance_loss_mlp": 1.03781092, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.6167864576536901, + "language_loss": 0.58242345, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60381913, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 12289, + "time_per_iteration": 2.653754472732544 + }, + { + "auxiliary_loss_clip": 0.01027818, + "auxiliary_loss_mlp": 0.010066, + "balance_loss_clip": 1.00555122, + "balance_loss_mlp": 1.00722313, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.657884434351233, + "language_loss": 0.49320006, + "learning_rate": 6.733174657205287e-07, + "loss": 0.5135442, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20605469, + "step": 12290, + "time_per_iteration": 3.161417007446289 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.03600287, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 1.8618109210971494, + "language_loss": 0.66936404, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69070697, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12291, + "time_per_iteration": 2.504171371459961 + }, + { + "auxiliary_loss_clip": 0.01026631, + "auxiliary_loss_mlp": 0.01008045, + "balance_loss_clip": 1.00700212, + "balance_loss_mlp": 1.00617576, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9921278078436683, + "language_loss": 0.60870874, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6290555, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12292, + "time_per_iteration": 2.740140676498413 + }, + { + "auxiliary_loss_clip": 0.0110263, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.0190208, + "balance_loss_mlp": 1.03666413, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 2.0283775750990447, + "language_loss": 0.67287552, + "learning_rate": 6.724433697406191e-07, + "loss": 0.6942023, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 12293, + "time_per_iteration": 2.5637433528900146 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01897979, + "balance_loss_mlp": 1.03533363, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.7680717845070275, + "language_loss": 0.83443105, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85575891, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12294, + "time_per_iteration": 2.452796697616577 + }, + { + "auxiliary_loss_clip": 0.0109896, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01865005, + "balance_loss_mlp": 1.03435683, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.704234892939925, + "language_loss": 0.72765625, + "learning_rate": 6.718608907743337e-07, + "loss": 0.74895406, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 12295, + "time_per_iteration": 2.532444953918457 + }, + { + "auxiliary_loss_clip": 0.01099527, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.0250864, + "balance_loss_mlp": 1.03585625, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.6789172360591735, + "language_loss": 0.78772449, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8090868, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 12296, + "time_per_iteration": 2.5699706077575684 + }, + { + "auxiliary_loss_clip": 0.01102686, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.01921093, + "balance_loss_mlp": 1.03617287, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.8636543361899776, + "language_loss": 0.66520232, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68654692, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 12297, + "time_per_iteration": 2.5840320587158203 + }, + { + "auxiliary_loss_clip": 0.01103197, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.0230639, + "balance_loss_mlp": 1.03605783, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 2.2038505631105054, + "language_loss": 0.68769479, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70908344, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 12298, + "time_per_iteration": 2.4649643898010254 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.01767373, + "balance_loss_mlp": 1.0349468, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.7869505814548332, + "language_loss": 0.74577737, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76709521, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12299, + "time_per_iteration": 2.4275574684143066 + }, + { + "auxiliary_loss_clip": 0.01025983, + "auxiliary_loss_mlp": 0.01002146, + "balance_loss_clip": 1.00116849, + "balance_loss_mlp": 1.00569797, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7180686194551699, + "language_loss": 0.60861343, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62889469, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 12300, + "time_per_iteration": 3.1263675689697266 + }, + { + "auxiliary_loss_clip": 0.01102982, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.01645398, + "balance_loss_mlp": 1.03720665, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.4253075505979764, + "language_loss": 0.80278659, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82410145, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 12301, + "time_per_iteration": 2.46708345413208 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.01725399, + "balance_loss_mlp": 1.0343194, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.5951843205733178, + "language_loss": 0.73313689, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75442266, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 12302, + "time_per_iteration": 2.532886028289795 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.02475905, + "balance_loss_mlp": 1.03542805, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 1.7925873497266347, + "language_loss": 0.7409184, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76231015, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 12303, + "time_per_iteration": 3.842045545578003 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01026474, + "balance_loss_clip": 1.01557863, + "balance_loss_mlp": 1.03380299, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.7395112572263238, + "language_loss": 0.54232901, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56358361, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12304, + "time_per_iteration": 2.5310745239257812 + }, + { + "auxiliary_loss_clip": 0.01102065, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01740789, + "balance_loss_mlp": 1.03500128, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.9555871557250795, + "language_loss": 0.841694, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86300987, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12305, + "time_per_iteration": 3.97141170501709 + }, + { + "auxiliary_loss_clip": 0.01025514, + "auxiliary_loss_mlp": 0.00998213, + "balance_loss_clip": 0.99718779, + "balance_loss_mlp": 1.0049659, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8695449825144963, + "language_loss": 0.57674229, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59697956, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20507812, + "step": 12306, + "time_per_iteration": 4.480564117431641 + }, + { + "auxiliary_loss_clip": 0.01105578, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.01955891, + "balance_loss_mlp": 1.03752124, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 1.94634660943293, + "language_loss": 0.81800246, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83937716, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12307, + "time_per_iteration": 3.964345932006836 + }, + { + "auxiliary_loss_clip": 0.01099571, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.01743591, + "balance_loss_mlp": 1.03615248, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7625756479889783, + "language_loss": 0.69852555, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71980846, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 12308, + "time_per_iteration": 2.426374912261963 + }, + { + "auxiliary_loss_clip": 0.01097458, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01394367, + "balance_loss_mlp": 1.03327668, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.8311869299558743, + "language_loss": 0.81359291, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83482039, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12309, + "time_per_iteration": 2.47933292388916 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.02118754, + "balance_loss_mlp": 1.03444481, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.7272186323130432, + "language_loss": 0.72933966, + "learning_rate": 6.674987259277692e-07, + "loss": 0.7507059, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 12310, + "time_per_iteration": 2.458360195159912 + }, + { + "auxiliary_loss_clip": 0.01105362, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.02455902, + "balance_loss_mlp": 1.03706884, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.8138497569314165, + "language_loss": 0.8816393, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90306449, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 12311, + "time_per_iteration": 2.4193923473358154 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.01557398, + "balance_loss_mlp": 1.0338285, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.5281974655269193, + "language_loss": 0.80203426, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82330477, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 12312, + "time_per_iteration": 2.5062949657440186 + }, + { + "auxiliary_loss_clip": 0.01100667, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.0153985, + "balance_loss_mlp": 1.03547597, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 2.0496860461073676, + "language_loss": 0.7839551, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80522901, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 12313, + "time_per_iteration": 2.4662246704101562 + }, + { + "auxiliary_loss_clip": 0.01102693, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.03396571, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.00903442682859, + "language_loss": 0.78872943, + "learning_rate": 6.663374005191937e-07, + "loss": 0.81011879, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12314, + "time_per_iteration": 2.446385622024536 + }, + { + "auxiliary_loss_clip": 0.01024604, + "auxiliary_loss_mlp": 0.01003964, + "balance_loss_clip": 1.00296831, + "balance_loss_mlp": 1.00410616, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8412651667201435, + "language_loss": 0.55169189, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57197762, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20507812, + "step": 12315, + "time_per_iteration": 3.0314457416534424 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.02031732, + "balance_loss_mlp": 1.03454709, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.5280075701489741, + "language_loss": 0.79192966, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81322497, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 12316, + "time_per_iteration": 2.5997025966644287 + }, + { + "auxiliary_loss_clip": 0.0109893, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.01758349, + "balance_loss_mlp": 1.03257847, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.6312870183183517, + "language_loss": 0.74777615, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76906157, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12317, + "time_per_iteration": 2.409041404724121 + }, + { + "auxiliary_loss_clip": 0.01097259, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.02258456, + "balance_loss_mlp": 1.03415799, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.5381739579945533, + "language_loss": 0.81140697, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83272064, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.62890625, + "step": 12318, + "time_per_iteration": 2.483341932296753 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01801968, + "balance_loss_mlp": 1.03532875, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 2.10976565284071, + "language_loss": 0.76717627, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78850329, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12319, + "time_per_iteration": 2.4090797901153564 + }, + { + "auxiliary_loss_clip": 0.01100157, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.02052474, + "balance_loss_mlp": 1.03510928, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.728045021553726, + "language_loss": 0.64247096, + "learning_rate": 6.64596929804897e-07, + "loss": 0.6637848, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 12320, + "time_per_iteration": 2.4777369499206543 + }, + { + "auxiliary_loss_clip": 0.01104796, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.02257681, + "balance_loss_mlp": 1.03554249, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.5603662317591307, + "language_loss": 0.83399361, + "learning_rate": 6.643070285235288e-07, + "loss": 0.8553896, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12321, + "time_per_iteration": 2.5069942474365234 + }, + { + "auxiliary_loss_clip": 0.01106734, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.03052354, + "balance_loss_mlp": 1.03583789, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 1.897257666550991, + "language_loss": 0.71964365, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74114925, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 12322, + "time_per_iteration": 2.4930129051208496 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.02137196, + "balance_loss_mlp": 1.03622496, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.870315243792337, + "language_loss": 0.64078039, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66212809, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12323, + "time_per_iteration": 2.4777188301086426 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.0141499, + "balance_loss_mlp": 1.03480208, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.4950637015537451, + "language_loss": 0.75935167, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78064305, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12324, + "time_per_iteration": 2.5902748107910156 + }, + { + "auxiliary_loss_clip": 0.01099826, + "auxiliary_loss_mlp": 0.01023896, + "balance_loss_clip": 1.01248217, + "balance_loss_mlp": 1.0326978, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.7779845069008868, + "language_loss": 0.74595994, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76719713, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12325, + "time_per_iteration": 2.428908586502075 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.01949763, + "balance_loss_mlp": 1.03505337, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.8169030049946526, + "language_loss": 0.68363488, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70501333, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 12326, + "time_per_iteration": 2.4834694862365723 + }, + { + "auxiliary_loss_clip": 0.01101938, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.0159893, + "balance_loss_mlp": 1.03513253, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 2.058459084269704, + "language_loss": 0.89730138, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91859686, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 12327, + "time_per_iteration": 2.4705865383148193 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.01925874, + "balance_loss_mlp": 1.03504896, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.6496511439188377, + "language_loss": 0.85582221, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87715065, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12328, + "time_per_iteration": 2.440108060836792 + }, + { + "auxiliary_loss_clip": 0.0109826, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01708829, + "balance_loss_mlp": 1.03355885, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.676741332984265, + "language_loss": 0.66687691, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68815577, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 12329, + "time_per_iteration": 2.433601140975952 + }, + { + "auxiliary_loss_clip": 0.01105654, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.03583872, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.8984380479185268, + "language_loss": 0.66488492, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68623304, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 12330, + "time_per_iteration": 2.5116231441497803 + }, + { + "auxiliary_loss_clip": 0.01108565, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.01889706, + "balance_loss_mlp": 1.03731847, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 1.9345159720147296, + "language_loss": 0.85613048, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87754583, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12331, + "time_per_iteration": 2.4270429611206055 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.01889229, + "balance_loss_mlp": 1.03435421, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 1.9091499126857316, + "language_loss": 0.69466591, + "learning_rate": 6.611214597199364e-07, + "loss": 0.7160027, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12332, + "time_per_iteration": 2.422391176223755 + }, + { + "auxiliary_loss_clip": 0.01102435, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.02290845, + "balance_loss_mlp": 1.03556943, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 2.2157206056702097, + "language_loss": 0.63370979, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65509146, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 12333, + "time_per_iteration": 2.505436420440674 + }, + { + "auxiliary_loss_clip": 0.01099765, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.02141094, + "balance_loss_mlp": 1.03644109, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.6374577716994534, + "language_loss": 0.71271133, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73404145, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 12334, + "time_per_iteration": 2.5002856254577637 + }, + { + "auxiliary_loss_clip": 0.01101856, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.01598334, + "balance_loss_mlp": 1.03537202, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.9057001714532567, + "language_loss": 0.82662481, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84791493, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 12335, + "time_per_iteration": 2.420285701751709 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.01862848, + "balance_loss_mlp": 1.03514791, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.6318734852412082, + "language_loss": 0.74709713, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76842761, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12336, + "time_per_iteration": 2.395914077758789 + }, + { + "auxiliary_loss_clip": 0.01105209, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.01698947, + "balance_loss_mlp": 1.03675711, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 2.0074082890204803, + "language_loss": 0.73073846, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75207937, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 12337, + "time_per_iteration": 2.4017410278320312 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02384233, + "balance_loss_mlp": 1.0375526, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.582069295944861, + "language_loss": 0.76476055, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78612792, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 12338, + "time_per_iteration": 2.469158172607422 + }, + { + "auxiliary_loss_clip": 0.01098771, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.01728797, + "balance_loss_mlp": 1.03412902, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7521644726075343, + "language_loss": 0.73067641, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75194031, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 12339, + "time_per_iteration": 2.4999265670776367 + }, + { + "auxiliary_loss_clip": 0.01103048, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01544917, + "balance_loss_mlp": 1.03630698, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.733265242117768, + "language_loss": 0.79821277, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81951618, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 12340, + "time_per_iteration": 2.5067059993743896 + }, + { + "auxiliary_loss_clip": 0.0110211, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02174389, + "balance_loss_mlp": 1.0347414, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.853046258672694, + "language_loss": 0.75634474, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77770519, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12341, + "time_per_iteration": 2.419905662536621 + }, + { + "auxiliary_loss_clip": 0.01095271, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.01980829, + "balance_loss_mlp": 1.03472114, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.6930413865654552, + "language_loss": 0.80139267, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82265526, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.60546875, + "step": 12342, + "time_per_iteration": 2.5155606269836426 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.01028992, + "balance_loss_clip": 1.01748824, + "balance_loss_mlp": 1.03512716, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.6721865826322508, + "language_loss": 0.77694213, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79823846, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12343, + "time_per_iteration": 2.8234310150146484 + }, + { + "auxiliary_loss_clip": 0.01097938, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02027309, + "balance_loss_mlp": 1.03249693, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.7204142149055508, + "language_loss": 0.67798221, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69927979, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12344, + "time_per_iteration": 3.9860341548919678 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0204283, + "balance_loss_mlp": 1.03553951, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.3379030417701423, + "language_loss": 0.81033051, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83169097, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 12345, + "time_per_iteration": 2.457531213760376 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02450442, + "balance_loss_mlp": 1.0354228, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9598348009853668, + "language_loss": 0.71018803, + "learning_rate": 6.570759861612988e-07, + "loss": 0.73157895, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12346, + "time_per_iteration": 3.8033220767974854 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01851344, + "balance_loss_mlp": 1.03597689, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.5893772785658562, + "language_loss": 0.73678845, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75811887, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12347, + "time_per_iteration": 3.922349691390991 + }, + { + "auxiliary_loss_clip": 0.0110556, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.01888108, + "balance_loss_mlp": 1.03616238, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7507272785973695, + "language_loss": 0.80773383, + "learning_rate": 6.564988754473642e-07, + "loss": 0.82909453, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 12348, + "time_per_iteration": 3.8946139812469482 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.01724422, + "balance_loss_mlp": 1.03434706, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.9451806865791765, + "language_loss": 0.72609961, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74738705, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 12349, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.01105402, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01891899, + "balance_loss_mlp": 1.03512514, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 1.884291217596135, + "language_loss": 0.78724527, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80861974, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 12350, + "time_per_iteration": 2.500523567199707 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.02290213, + "balance_loss_mlp": 1.03446043, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 1.7431994876148182, + "language_loss": 0.74992573, + "learning_rate": 6.556335914965343e-07, + "loss": 0.7712701, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 12351, + "time_per_iteration": 2.570344924926758 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01024897, + "balance_loss_clip": 1.01363814, + "balance_loss_mlp": 1.03487992, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 1.8775764813546454, + "language_loss": 0.81292212, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83417821, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12352, + "time_per_iteration": 2.4442734718322754 + }, + { + "auxiliary_loss_clip": 0.01103269, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02641034, + "balance_loss_mlp": 1.0369432, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.9024946732776964, + "language_loss": 0.71716195, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73857349, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12353, + "time_per_iteration": 2.4757235050201416 + }, + { + "auxiliary_loss_clip": 0.01102245, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.02075243, + "balance_loss_mlp": 1.03749537, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.5592881493961996, + "language_loss": 0.72042692, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74176657, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12354, + "time_per_iteration": 2.4892525672912598 + }, + { + "auxiliary_loss_clip": 0.01024379, + "auxiliary_loss_mlp": 0.01002171, + "balance_loss_clip": 1.00115824, + "balance_loss_mlp": 1.0041914, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.7195367720859078, + "language_loss": 0.595505, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61577046, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 12355, + "time_per_iteration": 3.1565847396850586 + }, + { + "auxiliary_loss_clip": 0.01101716, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.01746547, + "balance_loss_mlp": 1.03509939, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.5856742175038152, + "language_loss": 0.67546952, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69677925, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12356, + "time_per_iteration": 2.4489800930023193 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.02305126, + "balance_loss_mlp": 1.03400218, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 2.760673613642481, + "language_loss": 0.72485077, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74622905, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 12357, + "time_per_iteration": 2.721644401550293 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.0175041, + "balance_loss_mlp": 1.03629148, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 2.0039134107579395, + "language_loss": 0.65105826, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67233098, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 12358, + "time_per_iteration": 2.4294605255126953 + }, + { + "auxiliary_loss_clip": 0.01104584, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.02243936, + "balance_loss_mlp": 1.03652191, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.8081229014020102, + "language_loss": 0.80658948, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82799089, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 12359, + "time_per_iteration": 2.4662840366363525 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01539159, + "balance_loss_mlp": 1.03399527, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.9929370638459747, + "language_loss": 0.68443716, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7057122, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12360, + "time_per_iteration": 2.39972186088562 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.01742589, + "balance_loss_mlp": 1.03408909, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6105929709695739, + "language_loss": 0.72354007, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74482894, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12361, + "time_per_iteration": 2.53438663482666 + }, + { + "auxiliary_loss_clip": 0.01103295, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.03592443, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.9313349058571254, + "language_loss": 0.55937529, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58072412, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12362, + "time_per_iteration": 2.44446063041687 + }, + { + "auxiliary_loss_clip": 0.01103216, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.01801753, + "balance_loss_mlp": 1.03700173, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.6965020963152944, + "language_loss": 0.77103531, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79236591, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12363, + "time_per_iteration": 2.4665377140045166 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01916444, + "balance_loss_mlp": 1.03546381, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.5848696782031413, + "language_loss": 0.781322, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80262709, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12364, + "time_per_iteration": 2.4513514041900635 + }, + { + "auxiliary_loss_clip": 0.01099072, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.02075863, + "balance_loss_mlp": 1.03311908, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.566466537213553, + "language_loss": 0.78534245, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80665576, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12365, + "time_per_iteration": 2.5116143226623535 + }, + { + "auxiliary_loss_clip": 0.01103544, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01939702, + "balance_loss_mlp": 1.03469706, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.5212918722481565, + "language_loss": 0.76719224, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78854513, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 12366, + "time_per_iteration": 2.714674711227417 + }, + { + "auxiliary_loss_clip": 0.0109921, + "auxiliary_loss_mlp": 0.01030031, + "balance_loss_clip": 1.01957047, + "balance_loss_mlp": 1.03603196, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.8098497154463502, + "language_loss": 0.7116037, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73289615, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 12367, + "time_per_iteration": 2.4605956077575684 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.02303672, + "balance_loss_mlp": 1.03570354, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5537878615409826, + "language_loss": 0.74737108, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76876128, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 12368, + "time_per_iteration": 2.4532225131988525 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02200878, + "balance_loss_mlp": 1.03582263, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 2.6437968867522397, + "language_loss": 0.69177192, + "learning_rate": 6.50451533054207e-07, + "loss": 0.7130875, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.63671875, + "step": 12369, + "time_per_iteration": 2.6095521450042725 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01569033, + "balance_loss_mlp": 1.03491139, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.8225441721973505, + "language_loss": 0.75607926, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77735746, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12370, + "time_per_iteration": 2.3974015712738037 + }, + { + "auxiliary_loss_clip": 0.01101812, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03682232, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.5003725500414622, + "language_loss": 0.78235525, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80370772, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 12371, + "time_per_iteration": 2.587583303451538 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.01351357, + "balance_loss_mlp": 1.03449976, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.5904858963552928, + "language_loss": 0.69456738, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71582228, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12372, + "time_per_iteration": 2.46589732170105 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.01876593, + "balance_loss_mlp": 1.03523791, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.0303622627769, + "language_loss": 0.74881828, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77012408, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12373, + "time_per_iteration": 2.429455518722534 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.01918495, + "balance_loss_mlp": 1.03517175, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.8905423318011396, + "language_loss": 0.77127612, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79263097, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 12374, + "time_per_iteration": 2.467027425765991 + }, + { + "auxiliary_loss_clip": 0.01101807, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.02173197, + "balance_loss_mlp": 1.03450108, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.0275286605601903, + "language_loss": 0.76452887, + "learning_rate": 6.487278616990774e-07, + "loss": 0.7858817, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12375, + "time_per_iteration": 2.4504282474517822 + }, + { + "auxiliary_loss_clip": 0.01098205, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.01930046, + "balance_loss_mlp": 1.03446364, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.8957308287031664, + "language_loss": 0.77052188, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79180074, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.63671875, + "step": 12376, + "time_per_iteration": 2.426997423171997 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.01435566, + "balance_loss_mlp": 1.03490746, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.648771332644217, + "language_loss": 0.79147625, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81276488, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 12377, + "time_per_iteration": 2.5062367916107178 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03595889, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 1.8728399382870544, + "language_loss": 0.67017269, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69150138, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 12378, + "time_per_iteration": 2.81579327583313 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.02170622, + "balance_loss_mlp": 1.0356729, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.6381441755645296, + "language_loss": 0.71693718, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73832107, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 12379, + "time_per_iteration": 2.5361573696136475 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.02015245, + "balance_loss_mlp": 1.0342983, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.779117116222904, + "language_loss": 0.6545527, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67588407, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12380, + "time_per_iteration": 2.4233927726745605 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.0180037, + "balance_loss_mlp": 1.0361979, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 1.8649656788405269, + "language_loss": 0.78407371, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80541134, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 12381, + "time_per_iteration": 2.4858570098876953 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.01844049, + "balance_loss_mlp": 1.03706956, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.020102032989411, + "language_loss": 0.726803, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74816334, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12382, + "time_per_iteration": 2.4412596225738525 + }, + { + "auxiliary_loss_clip": 0.01025583, + "auxiliary_loss_mlp": 0.0100093, + "balance_loss_clip": 0.99986947, + "balance_loss_mlp": 1.00515175, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6497921539673587, + "language_loss": 0.5464738, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56673896, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20507812, + "step": 12383, + "time_per_iteration": 3.222402811050415 + }, + { + "auxiliary_loss_clip": 0.01100878, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.0151608, + "balance_loss_mlp": 1.03370833, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 1.9786543947489503, + "language_loss": 0.76230276, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78357792, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 12384, + "time_per_iteration": 2.455353021621704 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.01782155, + "balance_loss_mlp": 1.03452992, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 2.0782969884363816, + "language_loss": 0.79298764, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81428415, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 12385, + "time_per_iteration": 2.4987757205963135 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01921868, + "balance_loss_mlp": 1.03626704, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.6771558108044815, + "language_loss": 0.8143934, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83578682, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 12386, + "time_per_iteration": 3.998560667037964 + }, + { + "auxiliary_loss_clip": 0.01099591, + "auxiliary_loss_mlp": 0.01026498, + "balance_loss_clip": 1.0154351, + "balance_loss_mlp": 1.03524506, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.7576352250203031, + "language_loss": 0.71226764, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73352849, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 12387, + "time_per_iteration": 2.4593608379364014 + }, + { + "auxiliary_loss_clip": 0.01101935, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01612639, + "balance_loss_mlp": 1.03501618, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.162095578178006, + "language_loss": 0.7053076, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72660351, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12388, + "time_per_iteration": 3.8914287090301514 + }, + { + "auxiliary_loss_clip": 0.01102008, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01879215, + "balance_loss_mlp": 1.03539407, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5805660577109513, + "language_loss": 0.84949243, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87081456, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12389, + "time_per_iteration": 4.039583683013916 + }, + { + "auxiliary_loss_clip": 0.01100859, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.01907206, + "balance_loss_mlp": 1.03446209, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.992620566185106, + "language_loss": 0.79385233, + "learning_rate": 6.444267588104526e-07, + "loss": 0.8151679, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12390, + "time_per_iteration": 3.9466896057128906 + }, + { + "auxiliary_loss_clip": 0.01104503, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01669955, + "balance_loss_mlp": 1.03669739, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.730347550558291, + "language_loss": 0.84698212, + "learning_rate": 6.441404294400014e-07, + "loss": 0.86831707, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12391, + "time_per_iteration": 2.493415117263794 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01861715, + "balance_loss_mlp": 1.03483033, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.8306369594039993, + "language_loss": 0.73786843, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75916982, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 12392, + "time_per_iteration": 2.431533098220825 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.02171087, + "balance_loss_mlp": 1.03509498, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.6456666698641875, + "language_loss": 0.76718521, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78850538, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 12393, + "time_per_iteration": 2.473604917526245 + }, + { + "auxiliary_loss_clip": 0.01102478, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.02133441, + "balance_loss_mlp": 1.03579187, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.8111060695117658, + "language_loss": 0.72828883, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74965185, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 12394, + "time_per_iteration": 2.4453284740448 + }, + { + "auxiliary_loss_clip": 0.01103102, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.02332675, + "balance_loss_mlp": 1.03558517, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.668528755901744, + "language_loss": 0.81820607, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83958995, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12395, + "time_per_iteration": 2.4907712936401367 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.02062798, + "balance_loss_mlp": 1.03574276, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 5.4481505993838475, + "language_loss": 0.70923871, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73061752, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12396, + "time_per_iteration": 2.5307369232177734 + }, + { + "auxiliary_loss_clip": 0.01103961, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.0171988, + "balance_loss_mlp": 1.03555429, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.799312565551718, + "language_loss": 0.6829254, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70425701, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 12397, + "time_per_iteration": 2.5126614570617676 + }, + { + "auxiliary_loss_clip": 0.01101329, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.02871311, + "balance_loss_mlp": 1.03490043, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 2.004729126431997, + "language_loss": 0.76321107, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7846328, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12398, + "time_per_iteration": 2.4119622707366943 + }, + { + "auxiliary_loss_clip": 0.01098608, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.01461804, + "balance_loss_mlp": 1.03464854, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.6814125292484552, + "language_loss": 0.77809334, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79934478, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 12399, + "time_per_iteration": 2.4987549781799316 + }, + { + "auxiliary_loss_clip": 0.01097189, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.02345836, + "balance_loss_mlp": 1.03396916, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 1.9741218645460363, + "language_loss": 0.73963678, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76095283, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 12400, + "time_per_iteration": 2.4242513179779053 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.01808882, + "balance_loss_mlp": 1.03357267, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.8638807707826066, + "language_loss": 0.81975746, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84103811, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12401, + "time_per_iteration": 2.5451955795288086 + }, + { + "auxiliary_loss_clip": 0.01100279, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02252901, + "balance_loss_mlp": 1.03501511, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.845084112452823, + "language_loss": 0.65197337, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67331183, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12402, + "time_per_iteration": 2.4577367305755615 + }, + { + "auxiliary_loss_clip": 0.01099262, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.01831102, + "balance_loss_mlp": 1.0342567, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.6576964620220311, + "language_loss": 0.73214388, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75342572, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 12403, + "time_per_iteration": 2.620654821395874 + }, + { + "auxiliary_loss_clip": 0.01024907, + "auxiliary_loss_mlp": 0.01005223, + "balance_loss_clip": 1.00420368, + "balance_loss_mlp": 1.00436723, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8255474672184773, + "language_loss": 0.58760434, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60790563, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20507812, + "step": 12404, + "time_per_iteration": 2.8954858779907227 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02357709, + "balance_loss_mlp": 1.03306055, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.6032592804273305, + "language_loss": 0.77657819, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79792619, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 12405, + "time_per_iteration": 2.470407247543335 + }, + { + "auxiliary_loss_clip": 0.01100531, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.02069402, + "balance_loss_mlp": 1.03543913, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.5461856417653022, + "language_loss": 0.69148755, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71281415, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12406, + "time_per_iteration": 2.4450039863586426 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.02102149, + "balance_loss_mlp": 1.03617549, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.505466725953553, + "language_loss": 0.64742386, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66883421, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 12407, + "time_per_iteration": 2.4332051277160645 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01944911, + "balance_loss_mlp": 1.03494692, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.652287891377431, + "language_loss": 0.72460616, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74595749, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 12408, + "time_per_iteration": 2.4836978912353516 + }, + { + "auxiliary_loss_clip": 0.01106452, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.02128339, + "balance_loss_mlp": 1.03751123, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.7143768507331778, + "language_loss": 0.72858518, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74999118, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 12409, + "time_per_iteration": 2.515709400177002 + }, + { + "auxiliary_loss_clip": 0.01098264, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.02187228, + "balance_loss_mlp": 1.03468859, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.710421587761424, + "language_loss": 0.6618892, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68319571, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 12410, + "time_per_iteration": 2.449406147003174 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.01910925, + "balance_loss_mlp": 1.034163, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.213506116293379, + "language_loss": 0.84104359, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86234152, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 12411, + "time_per_iteration": 2.4625163078308105 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01740742, + "balance_loss_mlp": 1.03515697, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.6692936053556306, + "language_loss": 0.7766965, + "learning_rate": 6.381394060744339e-07, + "loss": 0.79802704, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12412, + "time_per_iteration": 2.4557554721832275 + }, + { + "auxiliary_loss_clip": 0.01102723, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.02409029, + "balance_loss_mlp": 1.03520751, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.834679176534713, + "language_loss": 0.6225034, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64388311, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 12413, + "time_per_iteration": 2.559657573699951 + }, + { + "auxiliary_loss_clip": 0.01024964, + "auxiliary_loss_mlp": 0.01002262, + "balance_loss_clip": 1.00121295, + "balance_loss_mlp": 1.00439072, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7203793484361629, + "language_loss": 0.54924321, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56951547, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20605469, + "step": 12414, + "time_per_iteration": 3.0637338161468506 + }, + { + "auxiliary_loss_clip": 0.01101199, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01846027, + "balance_loss_mlp": 1.03334022, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.860182659182016, + "language_loss": 0.54804456, + "learning_rate": 6.372839737918154e-07, + "loss": 0.56936157, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 12415, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.02022743, + "balance_loss_mlp": 1.0359658, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.6660939393048266, + "language_loss": 0.74985796, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77120394, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 12416, + "time_per_iteration": 2.514845132827759 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02302361, + "balance_loss_mlp": 1.03556323, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.4814223642956346, + "language_loss": 0.69489551, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71625924, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 12417, + "time_per_iteration": 2.6574227809906006 + }, + { + "auxiliary_loss_clip": 0.01104674, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01790345, + "balance_loss_mlp": 1.03659248, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.767590849665872, + "language_loss": 0.73728597, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75863612, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 12418, + "time_per_iteration": 2.462244987487793 + }, + { + "auxiliary_loss_clip": 0.01103226, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01586497, + "balance_loss_mlp": 1.03675175, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.574966460677448, + "language_loss": 0.69369054, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71499598, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12419, + "time_per_iteration": 2.4568960666656494 + }, + { + "auxiliary_loss_clip": 0.01095857, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.02151561, + "balance_loss_mlp": 1.03342533, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.6640874245133943, + "language_loss": 0.74578714, + "learning_rate": 6.358592869514216e-07, + "loss": 0.76707137, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 12420, + "time_per_iteration": 2.5238821506500244 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01943922, + "balance_loss_mlp": 1.03683901, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.6177707150337377, + "language_loss": 0.67195189, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69331217, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12421, + "time_per_iteration": 2.4293341636657715 + }, + { + "auxiliary_loss_clip": 0.01104487, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.01912296, + "balance_loss_mlp": 1.03556955, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.5639142011030407, + "language_loss": 0.72440511, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74577618, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 12422, + "time_per_iteration": 2.509237766265869 + }, + { + "auxiliary_loss_clip": 0.01100612, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.01877022, + "balance_loss_mlp": 1.03514779, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 9.98591332499941, + "language_loss": 0.74842906, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76974201, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12423, + "time_per_iteration": 2.5110628604888916 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.0102642, + "balance_loss_clip": 1.01536298, + "balance_loss_mlp": 1.03383863, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.296938244989713, + "language_loss": 0.67754054, + "learning_rate": 6.347204685245929e-07, + "loss": 0.6987977, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 12424, + "time_per_iteration": 2.4905362129211426 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.02293932, + "balance_loss_mlp": 1.03707027, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.7754548837213033, + "language_loss": 0.74119371, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76259774, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 12425, + "time_per_iteration": 2.5686028003692627 + }, + { + "auxiliary_loss_clip": 0.01101237, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.0353744, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 2.326605643233434, + "language_loss": 0.69533008, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71663666, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12426, + "time_per_iteration": 2.4671969413757324 + }, + { + "auxiliary_loss_clip": 0.01097868, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.024755, + "balance_loss_mlp": 1.03396261, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.6460733379816328, + "language_loss": 0.65486181, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67620206, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 12427, + "time_per_iteration": 2.459057092666626 + }, + { + "auxiliary_loss_clip": 0.01102337, + "auxiliary_loss_mlp": 0.01027971, + "balance_loss_clip": 1.01555538, + "balance_loss_mlp": 1.03570294, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.7506429909383225, + "language_loss": 0.74639595, + "learning_rate": 6.335824784423118e-07, + "loss": 0.767699, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12428, + "time_per_iteration": 3.809513807296753 + }, + { + "auxiliary_loss_clip": 0.0110597, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01710534, + "balance_loss_mlp": 1.0359993, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 2.159964503285926, + "language_loss": 0.58328772, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60465252, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 12429, + "time_per_iteration": 2.420081377029419 + }, + { + "auxiliary_loss_clip": 0.01105592, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.02002382, + "balance_loss_mlp": 1.03655839, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.8822181590488856, + "language_loss": 0.60539925, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62677801, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 12430, + "time_per_iteration": 3.917961359024048 + }, + { + "auxiliary_loss_clip": 0.01102089, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.0366466, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.4375442916697652, + "language_loss": 0.75408334, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77540565, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12431, + "time_per_iteration": 3.8775863647460938 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01829195, + "balance_loss_mlp": 1.03413606, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.7750987800998057, + "language_loss": 0.75931549, + "learning_rate": 6.32445317395021e-07, + "loss": 0.78063631, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12432, + "time_per_iteration": 2.4008095264434814 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.02375436, + "balance_loss_mlp": 1.03559935, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 4.600278612020183, + "language_loss": 0.69874978, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72017932, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 12433, + "time_per_iteration": 3.879322052001953 + }, + { + "auxiliary_loss_clip": 0.01103347, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.01847744, + "balance_loss_mlp": 1.03535938, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.4972431185118094, + "language_loss": 0.67169416, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69303912, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 12434, + "time_per_iteration": 2.45617938041687 + }, + { + "auxiliary_loss_clip": 0.01096539, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03466129, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.5115766265302155, + "language_loss": 0.7984153, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81966752, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 12435, + "time_per_iteration": 2.4689295291900635 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.0165174, + "balance_loss_mlp": 1.03551531, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.9192190166141703, + "language_loss": 0.685781, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70710182, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 12436, + "time_per_iteration": 2.5397560596466064 + }, + { + "auxiliary_loss_clip": 0.01105286, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02078748, + "balance_loss_mlp": 1.0353477, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 2.523256251254823, + "language_loss": 0.70543289, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72681236, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 12437, + "time_per_iteration": 2.5217578411102295 + }, + { + "auxiliary_loss_clip": 0.01096987, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.0336585, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.7258668993948156, + "language_loss": 0.6710937, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69234937, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 12438, + "time_per_iteration": 2.4754526615142822 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.01032026, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.03440166, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6057176452605648, + "language_loss": 0.80471182, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82604539, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12439, + "time_per_iteration": 2.4217841625213623 + }, + { + "auxiliary_loss_clip": 0.01100863, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02369952, + "balance_loss_mlp": 1.034688, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 4.3324890021257065, + "language_loss": 0.70790303, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72926295, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12440, + "time_per_iteration": 2.4390249252319336 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.01672912, + "balance_loss_mlp": 1.03678477, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.6196156366406493, + "language_loss": 0.74209476, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76338577, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12441, + "time_per_iteration": 2.452240467071533 + }, + { + "auxiliary_loss_clip": 0.01104655, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.02294469, + "balance_loss_mlp": 1.0354284, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.0647572412884223, + "language_loss": 0.82613641, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84753811, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 12442, + "time_per_iteration": 2.4386143684387207 + }, + { + "auxiliary_loss_clip": 0.01104883, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01779294, + "balance_loss_mlp": 1.03697157, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.8278548482074275, + "language_loss": 0.62552464, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64687717, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12443, + "time_per_iteration": 2.501383066177368 + }, + { + "auxiliary_loss_clip": 0.01099555, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.01601326, + "balance_loss_mlp": 1.03445029, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.2374686087677365, + "language_loss": 0.71498984, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73625755, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 12444, + "time_per_iteration": 2.4533753395080566 + }, + { + "auxiliary_loss_clip": 0.01101788, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01813221, + "balance_loss_mlp": 1.03564715, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.421192180726323, + "language_loss": 0.68887877, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71019721, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12445, + "time_per_iteration": 2.4437148571014404 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.01025898, + "balance_loss_clip": 1.01420927, + "balance_loss_mlp": 1.0338124, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.6018226461169682, + "language_loss": 0.73926389, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76051313, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 12446, + "time_per_iteration": 2.4290761947631836 + }, + { + "auxiliary_loss_clip": 0.01107586, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01841235, + "balance_loss_mlp": 1.03757131, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 1.8678016899713992, + "language_loss": 0.73009384, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75148046, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 12447, + "time_per_iteration": 2.4282591342926025 + }, + { + "auxiliary_loss_clip": 0.01100481, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01952291, + "balance_loss_mlp": 1.03436816, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.453147122317507, + "language_loss": 0.71330941, + "learning_rate": 6.279049773470109e-07, + "loss": 0.73462141, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 12448, + "time_per_iteration": 2.492389678955078 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02396417, + "balance_loss_mlp": 1.03592634, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.681801443430281, + "language_loss": 0.73694456, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75834262, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12449, + "time_per_iteration": 2.458009958267212 + }, + { + "auxiliary_loss_clip": 0.01107992, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.02537513, + "balance_loss_mlp": 1.03796268, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 4.253717763971936, + "language_loss": 0.6114825, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63294089, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 12450, + "time_per_iteration": 2.50168776512146 + }, + { + "auxiliary_loss_clip": 0.01095887, + "auxiliary_loss_mlp": 0.01024791, + "balance_loss_clip": 1.01391912, + "balance_loss_mlp": 1.03296888, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 2.2078562652579445, + "language_loss": 0.70491904, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72612584, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 12451, + "time_per_iteration": 2.4641804695129395 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.01757646, + "balance_loss_mlp": 1.03587961, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 1.9404174586148812, + "language_loss": 0.80036032, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82171035, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 12452, + "time_per_iteration": 2.472050189971924 + }, + { + "auxiliary_loss_clip": 0.0111029, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.0199945, + "balance_loss_mlp": 1.03968, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 1.9353512881851993, + "language_loss": 0.71305573, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73448426, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 12453, + "time_per_iteration": 2.4257922172546387 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.01754212, + "balance_loss_mlp": 1.03703308, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.4891462542899447, + "language_loss": 0.74149597, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76283646, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12454, + "time_per_iteration": 2.468405246734619 + }, + { + "auxiliary_loss_clip": 0.01025662, + "auxiliary_loss_mlp": 0.01006028, + "balance_loss_clip": 1.00498486, + "balance_loss_mlp": 1.00516868, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7310384566009501, + "language_loss": 0.59401155, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61432838, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12455, + "time_per_iteration": 3.1971945762634277 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.03604209, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 4.934936184310134, + "language_loss": 0.79615253, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81745934, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 12456, + "time_per_iteration": 2.4296135902404785 + }, + { + "auxiliary_loss_clip": 0.0102509, + "auxiliary_loss_mlp": 0.01006564, + "balance_loss_clip": 1.00547349, + "balance_loss_mlp": 1.00471401, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.83989134398578, + "language_loss": 0.61468804, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63500464, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20410156, + "step": 12457, + "time_per_iteration": 2.974139928817749 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888442, + "balance_loss_mlp": 1.03782308, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 2.8899809171397686, + "language_loss": 0.6718834, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69325089, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 12458, + "time_per_iteration": 2.439760684967041 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03471386, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 4.815239058798898, + "language_loss": 0.79521596, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81650984, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 12459, + "time_per_iteration": 2.4311044216156006 + }, + { + "auxiliary_loss_clip": 0.01099542, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01964533, + "balance_loss_mlp": 1.03522098, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.5166660138964414, + "language_loss": 0.80542082, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82673144, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 12460, + "time_per_iteration": 2.467636823654175 + }, + { + "auxiliary_loss_clip": 0.01102889, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.01772666, + "balance_loss_mlp": 1.03595591, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.8187946605999095, + "language_loss": 0.8621248, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88345432, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 12461, + "time_per_iteration": 2.445946216583252 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.01919913, + "balance_loss_mlp": 1.03504705, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 4.4069049168427235, + "language_loss": 0.69474328, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71604145, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12462, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01101104, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.01925659, + "balance_loss_mlp": 1.03515553, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.161569960012567, + "language_loss": 0.70565915, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72698486, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 12463, + "time_per_iteration": 2.4890224933624268 + }, + { + "auxiliary_loss_clip": 0.01103139, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.01886845, + "balance_loss_mlp": 1.03757596, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.815437502169393, + "language_loss": 0.77414626, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79547787, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12464, + "time_per_iteration": 2.5218935012817383 + }, + { + "auxiliary_loss_clip": 0.01097602, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.01478601, + "balance_loss_mlp": 1.03183138, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6191901183341268, + "language_loss": 0.78242761, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80366582, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12465, + "time_per_iteration": 2.438852071762085 + }, + { + "auxiliary_loss_clip": 0.0111071, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.02102232, + "balance_loss_mlp": 1.03788424, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.367319558994289, + "language_loss": 0.73263687, + "learning_rate": 6.22813018144422e-07, + "loss": 0.75408894, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 12466, + "time_per_iteration": 2.4159023761749268 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.02088344, + "balance_loss_mlp": 1.03596592, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.1301146092024004, + "language_loss": 0.66439664, + "learning_rate": 6.22530627064209e-07, + "loss": 0.6857549, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12467, + "time_per_iteration": 2.476149320602417 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01941586, + "balance_loss_mlp": 1.03570294, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.3152910875520982, + "language_loss": 0.76111352, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78245205, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12468, + "time_per_iteration": 2.536062717437744 + }, + { + "auxiliary_loss_clip": 0.01101389, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01648641, + "balance_loss_mlp": 1.03586531, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.6980590171523238, + "language_loss": 0.69451874, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71582359, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 12469, + "time_per_iteration": 3.8304295539855957 + }, + { + "auxiliary_loss_clip": 0.01102636, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.01960647, + "balance_loss_mlp": 1.03584695, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.8066582872371235, + "language_loss": 0.68950933, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71085578, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 12470, + "time_per_iteration": 2.478144645690918 + }, + { + "auxiliary_loss_clip": 0.01105048, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.0178256, + "balance_loss_mlp": 1.03487074, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 2.8963816737460606, + "language_loss": 0.74823713, + "learning_rate": 6.214015851881793e-07, + "loss": 0.76960123, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 12471, + "time_per_iteration": 3.9513978958129883 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.01577377, + "balance_loss_mlp": 1.03611624, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 1.9482854997068855, + "language_loss": 0.76652914, + "learning_rate": 6.211194553838929e-07, + "loss": 0.78784305, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12472, + "time_per_iteration": 3.9247841835021973 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01829672, + "balance_loss_mlp": 1.03378856, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.4581749540086286, + "language_loss": 0.84420872, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86550772, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 12473, + "time_per_iteration": 2.460721492767334 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.0219121, + "balance_loss_mlp": 1.03714895, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 1.9225859728755545, + "language_loss": 0.73670536, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75813174, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 12474, + "time_per_iteration": 3.8605411052703857 + }, + { + "auxiliary_loss_clip": 0.01106384, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.02101982, + "balance_loss_mlp": 1.03537238, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 1.6563775017925497, + "language_loss": 0.74591839, + "learning_rate": 6.202733797375492e-07, + "loss": 0.7673192, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 12475, + "time_per_iteration": 2.42132830619812 + }, + { + "auxiliary_loss_clip": 0.01108313, + "auxiliary_loss_mlp": 0.01039073, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.0368228, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 3.53790302868858, + "language_loss": 0.80186552, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82333934, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 12476, + "time_per_iteration": 2.4238805770874023 + }, + { + "auxiliary_loss_clip": 0.01101438, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.01843047, + "balance_loss_mlp": 1.03465772, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.8885808312532115, + "language_loss": 0.77860969, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79992652, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12477, + "time_per_iteration": 2.4582700729370117 + }, + { + "auxiliary_loss_clip": 0.01025103, + "auxiliary_loss_mlp": 0.01003277, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00471592, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8084596961185327, + "language_loss": 0.54396832, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56425214, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20410156, + "step": 12478, + "time_per_iteration": 3.0614583492279053 + }, + { + "auxiliary_loss_clip": 0.01100592, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.02021098, + "balance_loss_mlp": 1.0357542, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.732296797104268, + "language_loss": 0.80400872, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82533485, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12479, + "time_per_iteration": 2.4517574310302734 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.0231421, + "balance_loss_mlp": 1.03617644, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.2068384473951386, + "language_loss": 0.62537003, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64678824, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 12480, + "time_per_iteration": 2.464008092880249 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.02187634, + "balance_loss_mlp": 1.03546023, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.8758461375908144, + "language_loss": 0.77756959, + "learning_rate": 6.185826413564512e-07, + "loss": 0.79889536, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12481, + "time_per_iteration": 2.457960367202759 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.03513408, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.6027939437318084, + "language_loss": 0.70975888, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73110282, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12482, + "time_per_iteration": 2.505486011505127 + }, + { + "auxiliary_loss_clip": 0.01103914, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.02004409, + "balance_loss_mlp": 1.03608012, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.6593432935882615, + "language_loss": 0.70126545, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72262096, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12483, + "time_per_iteration": 2.5372493267059326 + }, + { + "auxiliary_loss_clip": 0.01101463, + "auxiliary_loss_mlp": 0.01025502, + "balance_loss_clip": 1.01488626, + "balance_loss_mlp": 1.03527784, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 1.8314217473162897, + "language_loss": 0.74355495, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76482463, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 12484, + "time_per_iteration": 2.482421636581421 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.01453543, + "balance_loss_mlp": 1.03498316, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.0535325266367153, + "language_loss": 0.84864926, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86992133, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 12485, + "time_per_iteration": 2.446956157684326 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03467631, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.4660860594284646, + "language_loss": 0.77995837, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80127156, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12486, + "time_per_iteration": 2.517058849334717 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01999187, + "balance_loss_mlp": 1.03714168, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 1.8190391114760833, + "language_loss": 0.72836137, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74975049, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 12487, + "time_per_iteration": 2.5011062622070312 + }, + { + "auxiliary_loss_clip": 0.01102568, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.01789975, + "balance_loss_mlp": 1.03470707, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.9101645594404746, + "language_loss": 0.67258334, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69390637, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12488, + "time_per_iteration": 2.4733595848083496 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.01859736, + "balance_loss_mlp": 1.03543329, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.5954829957007908, + "language_loss": 0.77207714, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79340684, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 12489, + "time_per_iteration": 2.41869854927063 + }, + { + "auxiliary_loss_clip": 0.01100051, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.01907802, + "balance_loss_mlp": 1.03533888, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.8920646114871729, + "language_loss": 0.75356829, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77486563, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 12490, + "time_per_iteration": 2.530346155166626 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01762867, + "balance_loss_mlp": 1.03632128, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.6040694673861557, + "language_loss": 0.78232539, + "learning_rate": 6.157689358715527e-07, + "loss": 0.8036443, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12491, + "time_per_iteration": 2.445436954498291 + }, + { + "auxiliary_loss_clip": 0.01097554, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01876545, + "balance_loss_mlp": 1.03334594, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 2.0707908886127813, + "language_loss": 0.76477361, + "learning_rate": 6.154878538430899e-07, + "loss": 0.7860415, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 12492, + "time_per_iteration": 2.4592933654785156 + }, + { + "auxiliary_loss_clip": 0.01098246, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01735926, + "balance_loss_mlp": 1.03225935, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 2.019943812075004, + "language_loss": 0.71320605, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73447198, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 12493, + "time_per_iteration": 2.420647621154785 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01421213, + "balance_loss_mlp": 1.03696609, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.620130382276632, + "language_loss": 0.80576169, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82705963, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12494, + "time_per_iteration": 2.4511101245880127 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03543544, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 2.1793596151447208, + "language_loss": 0.78629243, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80761278, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12495, + "time_per_iteration": 2.4220409393310547 + }, + { + "auxiliary_loss_clip": 0.01101733, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.02359903, + "balance_loss_mlp": 1.0354476, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.0360130649256183, + "language_loss": 0.70592833, + "learning_rate": 6.143640508441898e-07, + "loss": 0.72730023, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12496, + "time_per_iteration": 2.4752755165100098 + }, + { + "auxiliary_loss_clip": 0.01102064, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01929259, + "balance_loss_mlp": 1.03579581, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.644722371980129, + "language_loss": 0.77970195, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80102611, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 12497, + "time_per_iteration": 2.4557857513427734 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02162886, + "balance_loss_mlp": 1.03516352, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.5625953994029207, + "language_loss": 0.7667886, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78815353, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12498, + "time_per_iteration": 2.4923367500305176 + }, + { + "auxiliary_loss_clip": 0.01100471, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.03681421, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.712775881225065, + "language_loss": 0.74015152, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76146924, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 12499, + "time_per_iteration": 2.42573618888855 + }, + { + "auxiliary_loss_clip": 0.0109767, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.01657331, + "balance_loss_mlp": 1.03243327, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.6175707927072787, + "language_loss": 0.7927863, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81403816, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 12500, + "time_per_iteration": 2.4984662532806396 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.01993728, + "balance_loss_mlp": 1.03732872, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 2.410232320418393, + "language_loss": 0.73039198, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75182259, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 12501, + "time_per_iteration": 2.4204771518707275 + }, + { + "auxiliary_loss_clip": 0.01098599, + "auxiliary_loss_mlp": 0.01025182, + "balance_loss_clip": 1.01324964, + "balance_loss_mlp": 1.0327723, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.6630444702124707, + "language_loss": 0.7825129, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80375075, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12502, + "time_per_iteration": 2.4997878074645996 + }, + { + "auxiliary_loss_clip": 0.01102781, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.01995397, + "balance_loss_mlp": 1.03561115, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.7088747693103663, + "language_loss": 0.70608878, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72743809, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12503, + "time_per_iteration": 2.407897472381592 + }, + { + "auxiliary_loss_clip": 0.01024599, + "auxiliary_loss_mlp": 0.01000364, + "balance_loss_clip": 0.99929094, + "balance_loss_mlp": 1.00410652, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9807627668089319, + "language_loss": 0.63942432, + "learning_rate": 6.121189676133903e-07, + "loss": 0.65967393, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20507812, + "step": 12504, + "time_per_iteration": 2.995584726333618 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.01842678, + "balance_loss_mlp": 1.03316665, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 2.135704139669575, + "language_loss": 0.68474889, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70601666, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 12505, + "time_per_iteration": 2.5871872901916504 + }, + { + "auxiliary_loss_clip": 0.01024908, + "auxiliary_loss_mlp": 0.00998595, + "balance_loss_clip": 0.99765915, + "balance_loss_mlp": 1.00445008, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6625472273588794, + "language_loss": 0.5508914, + "learning_rate": 6.11558222878809e-07, + "loss": 0.57112646, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20507812, + "step": 12506, + "time_per_iteration": 3.1377921104431152 + }, + { + "auxiliary_loss_clip": 0.01105218, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.03739369, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.061903152831647, + "language_loss": 0.78302479, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80440837, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12507, + "time_per_iteration": 2.4135823249816895 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.01779842, + "balance_loss_mlp": 1.03669238, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.6731769986850884, + "language_loss": 0.71181047, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73311001, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12508, + "time_per_iteration": 2.4572551250457764 + }, + { + "auxiliary_loss_clip": 0.01099119, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.01687193, + "balance_loss_mlp": 1.03434396, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.7139417588852437, + "language_loss": 0.71999872, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74127567, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 12509, + "time_per_iteration": 2.432441473007202 + }, + { + "auxiliary_loss_clip": 0.01107542, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.02306163, + "balance_loss_mlp": 1.03668177, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.5641902395179517, + "language_loss": 0.61837184, + "learning_rate": 6.104373652928785e-07, + "loss": 0.63981068, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 12510, + "time_per_iteration": 2.483800172805786 + }, + { + "auxiliary_loss_clip": 0.01098004, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.01876235, + "balance_loss_mlp": 1.03506911, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.6552475399559823, + "language_loss": 0.81871247, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83999723, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 12511, + "time_per_iteration": 3.882760524749756 + }, + { + "auxiliary_loss_clip": 0.01105136, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.02165818, + "balance_loss_mlp": 1.03586888, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8295208531594718, + "language_loss": 0.7603333, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78172445, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12512, + "time_per_iteration": 2.38800048828125 + }, + { + "auxiliary_loss_clip": 0.01094203, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.01646113, + "balance_loss_mlp": 1.03219318, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.6286622984961852, + "language_loss": 0.82186234, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84307897, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 12513, + "time_per_iteration": 3.94989013671875 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.02108955, + "balance_loss_mlp": 1.03550029, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 1.990000011048308, + "language_loss": 0.75192893, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77329987, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 12514, + "time_per_iteration": 3.8526079654693604 + }, + { + "auxiliary_loss_clip": 0.01096596, + "auxiliary_loss_mlp": 0.01029324, + "balance_loss_clip": 1.01869035, + "balance_loss_mlp": 1.03373909, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.7973618299480842, + "language_loss": 0.68311769, + "learning_rate": 6.090374789680271e-07, + "loss": 0.70437688, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 12515, + "time_per_iteration": 2.394958257675171 + }, + { + "auxiliary_loss_clip": 0.01101823, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.0206188, + "balance_loss_mlp": 1.03523326, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 2.066116424023424, + "language_loss": 0.70559716, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72693491, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12516, + "time_per_iteration": 3.9556925296783447 + }, + { + "auxiliary_loss_clip": 0.01099405, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.01887429, + "balance_loss_mlp": 1.0354656, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 2.2302621688638764, + "language_loss": 0.8934896, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91479456, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.640625, + "step": 12517, + "time_per_iteration": 2.47792387008667 + }, + { + "auxiliary_loss_clip": 0.01103304, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.02259493, + "balance_loss_mlp": 1.03562522, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.6178525628265004, + "language_loss": 0.74129748, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76267111, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 12518, + "time_per_iteration": 2.462576150894165 + }, + { + "auxiliary_loss_clip": 0.01024303, + "auxiliary_loss_mlp": 0.00999013, + "balance_loss_clip": 0.99803591, + "balance_loss_mlp": 1.00396061, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7063379492670796, + "language_loss": 0.55728912, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57752228, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 12519, + "time_per_iteration": 3.1375198364257812 + }, + { + "auxiliary_loss_clip": 0.01099253, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.02270663, + "balance_loss_mlp": 1.03384554, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.4310986441379439, + "language_loss": 0.7804352, + "learning_rate": 6.07638911279029e-07, + "loss": 0.80175972, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65625, + "step": 12520, + "time_per_iteration": 2.456511974334717 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.02335954, + "balance_loss_mlp": 1.03329098, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 4.550524012485904, + "language_loss": 0.74427485, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76560634, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 12521, + "time_per_iteration": 2.492000102996826 + }, + { + "auxiliary_loss_clip": 0.01106943, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01836157, + "balance_loss_mlp": 1.03684282, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.9335055849585505, + "language_loss": 0.67128062, + "learning_rate": 6.070798537185016e-07, + "loss": 0.6926614, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 12522, + "time_per_iteration": 2.4961695671081543 + }, + { + "auxiliary_loss_clip": 0.01105031, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.02371216, + "balance_loss_mlp": 1.03653431, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 2.7941692603753565, + "language_loss": 0.78211427, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80351675, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 12523, + "time_per_iteration": 2.507122039794922 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01675415, + "balance_loss_mlp": 1.03509927, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 2.0548195739736603, + "language_loss": 0.80642009, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82770348, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 12524, + "time_per_iteration": 2.470827579498291 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.0362587, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.5342669106186173, + "language_loss": 0.7387985, + "learning_rate": 6.062416635517326e-07, + "loss": 0.76010329, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12525, + "time_per_iteration": 2.506251335144043 + }, + { + "auxiliary_loss_clip": 0.01100462, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01732588, + "balance_loss_mlp": 1.03503311, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.881783434485301, + "language_loss": 0.71693766, + "learning_rate": 6.059623725182641e-07, + "loss": 0.73822856, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12526, + "time_per_iteration": 2.4697048664093018 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01025174, + "balance_loss_clip": 1.01402175, + "balance_loss_mlp": 1.0336082, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 2.5161959473083675, + "language_loss": 0.71867061, + "learning_rate": 6.056831343468414e-07, + "loss": 0.73990887, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12527, + "time_per_iteration": 2.544797658920288 + }, + { + "auxiliary_loss_clip": 0.01099923, + "auxiliary_loss_mlp": 0.01025133, + "balance_loss_clip": 1.01430297, + "balance_loss_mlp": 1.03523958, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.8815008802332143, + "language_loss": 0.80829144, + "learning_rate": 6.054039490480539e-07, + "loss": 0.82954198, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12528, + "time_per_iteration": 2.4095561504364014 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.0207355, + "balance_loss_mlp": 1.03425789, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 1.941676529480235, + "language_loss": 0.84620762, + "learning_rate": 6.051248166324892e-07, + "loss": 0.86754632, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12529, + "time_per_iteration": 2.4949631690979004 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0194391, + "balance_loss_mlp": 1.03682232, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.741456594396521, + "language_loss": 0.73868054, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76005387, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 12530, + "time_per_iteration": 2.414186954498291 + }, + { + "auxiliary_loss_clip": 0.01024012, + "auxiliary_loss_mlp": 0.01001757, + "balance_loss_clip": 1.00077367, + "balance_loss_mlp": 1.00382376, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8225360852867398, + "language_loss": 0.63598192, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65623963, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 12531, + "time_per_iteration": 2.9014906883239746 + }, + { + "auxiliary_loss_clip": 0.0110411, + "auxiliary_loss_mlp": 0.0102764, + "balance_loss_clip": 1.0154748, + "balance_loss_mlp": 1.03552723, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 2.4431425943596876, + "language_loss": 0.69780314, + "learning_rate": 6.042877367909633e-07, + "loss": 0.71912062, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12532, + "time_per_iteration": 2.4260380268096924 + }, + { + "auxiliary_loss_clip": 0.01097275, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.01846051, + "balance_loss_mlp": 1.03496122, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.5569948577505761, + "language_loss": 0.77583849, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79709774, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62109375, + "step": 12533, + "time_per_iteration": 2.454207181930542 + }, + { + "auxiliary_loss_clip": 0.01024523, + "auxiliary_loss_mlp": 0.01002703, + "balance_loss_clip": 1.00167179, + "balance_loss_mlp": 1.00402224, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7822513714763298, + "language_loss": 0.57376039, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59403265, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20507812, + "step": 12534, + "time_per_iteration": 3.077544927597046 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.0102723, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03252482, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.4171340268037091, + "language_loss": 0.71380311, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73505425, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12535, + "time_per_iteration": 2.526242971420288 + }, + { + "auxiliary_loss_clip": 0.01100548, + "auxiliary_loss_mlp": 0.01028567, + "balance_loss_clip": 1.01628292, + "balance_loss_mlp": 1.03258336, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.6321998046367074, + "language_loss": 0.80901384, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83030498, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12536, + "time_per_iteration": 2.472864866256714 + }, + { + "auxiliary_loss_clip": 0.01096541, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.01334548, + "balance_loss_mlp": 1.03342223, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 1.9374714714672148, + "language_loss": 0.74261057, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76382011, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 12537, + "time_per_iteration": 2.5162243843078613 + }, + { + "auxiliary_loss_clip": 0.01101972, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.01921487, + "balance_loss_mlp": 1.03531504, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.6037731039814345, + "language_loss": 0.74178267, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76311255, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12538, + "time_per_iteration": 2.3771462440490723 + }, + { + "auxiliary_loss_clip": 0.01102251, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01944685, + "balance_loss_mlp": 1.03522778, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.599430575608072, + "language_loss": 0.6738885, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69522071, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12539, + "time_per_iteration": 2.4771296977996826 + }, + { + "auxiliary_loss_clip": 0.01099836, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.03530288, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.910954039527726, + "language_loss": 0.74824083, + "learning_rate": 6.020578533797229e-07, + "loss": 0.7695052, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 12540, + "time_per_iteration": 2.4341037273406982 + }, + { + "auxiliary_loss_clip": 0.01102106, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.01606107, + "balance_loss_mlp": 1.03418863, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 1.9945629348385325, + "language_loss": 0.72719324, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74849451, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 12541, + "time_per_iteration": 2.393623113632202 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01907074, + "balance_loss_mlp": 1.03469777, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 2.0115318030709277, + "language_loss": 0.72047889, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74178648, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 12542, + "time_per_iteration": 2.635145902633667 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.01399565, + "balance_loss_mlp": 1.03344584, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.9065173152707051, + "language_loss": 0.84603345, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86726964, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12543, + "time_per_iteration": 2.428612232208252 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01840305, + "balance_loss_mlp": 1.03965712, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.1119731634282766, + "language_loss": 0.73896754, + "learning_rate": 6.009441835784927e-07, + "loss": 0.76030856, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 12544, + "time_per_iteration": 2.4670307636260986 + }, + { + "auxiliary_loss_clip": 0.01102346, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0182302, + "balance_loss_mlp": 1.03597724, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 2.101942602107972, + "language_loss": 0.6828922, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70420957, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12545, + "time_per_iteration": 2.459852933883667 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01913524, + "balance_loss_mlp": 1.0337708, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.8429570719628683, + "language_loss": 0.68578523, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70709527, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12546, + "time_per_iteration": 2.420004367828369 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.0202651, + "balance_loss_mlp": 1.03451025, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.2369205909253917, + "language_loss": 0.73266494, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75400406, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 12547, + "time_per_iteration": 2.4736859798431396 + }, + { + "auxiliary_loss_clip": 0.01101024, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.01246762, + "balance_loss_mlp": 1.03465009, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.7870453962384887, + "language_loss": 0.67817152, + "learning_rate": 5.998313626146099e-07, + "loss": 0.699431, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 12548, + "time_per_iteration": 2.443042755126953 + }, + { + "auxiliary_loss_clip": 0.01103041, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02093458, + "balance_loss_mlp": 1.03505886, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.7833036384787766, + "language_loss": 0.87229598, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89365441, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12549, + "time_per_iteration": 2.4908969402313232 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.0338217, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.9918391310756007, + "language_loss": 0.76892895, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79025269, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 12550, + "time_per_iteration": 2.5220580101013184 + }, + { + "auxiliary_loss_clip": 0.01101116, + "auxiliary_loss_mlp": 0.01025163, + "balance_loss_clip": 1.01411855, + "balance_loss_mlp": 1.0339551, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.4369467492375085, + "language_loss": 0.69346207, + "learning_rate": 5.98997304347386e-07, + "loss": 0.7147249, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 12551, + "time_per_iteration": 2.517190933227539 + }, + { + "auxiliary_loss_clip": 0.0110311, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.01450872, + "balance_loss_mlp": 1.03722537, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 1.8744654131641019, + "language_loss": 0.86030054, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88159549, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 12552, + "time_per_iteration": 2.402366876602173 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01788902, + "balance_loss_mlp": 1.03502691, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 2.4665346108502533, + "language_loss": 0.78498495, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80629647, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12553, + "time_per_iteration": 3.900495767593384 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.02062142, + "balance_loss_mlp": 1.03664863, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.2168137149261518, + "language_loss": 0.62832999, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64969027, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12554, + "time_per_iteration": 2.517960786819458 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.01661491, + "balance_loss_mlp": 1.03377116, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.582375661492136, + "language_loss": 0.73297715, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75425136, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 12555, + "time_per_iteration": 3.861729145050049 + }, + { + "auxiliary_loss_clip": 0.01105045, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.01683104, + "balance_loss_mlp": 1.03707051, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 2.3601676718523956, + "language_loss": 0.78618932, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80752885, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12556, + "time_per_iteration": 3.837012529373169 + }, + { + "auxiliary_loss_clip": 0.01024032, + "auxiliary_loss_mlp": 0.0100246, + "balance_loss_clip": 1.00142884, + "balance_loss_mlp": 1.00368142, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.708139285400587, + "language_loss": 0.50455654, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52482152, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 12557, + "time_per_iteration": 4.464947462081909 + }, + { + "auxiliary_loss_clip": 0.01105013, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01530528, + "balance_loss_mlp": 1.03735423, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.6820502805276656, + "language_loss": 0.71426684, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73559499, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12558, + "time_per_iteration": 2.4628171920776367 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.01590014, + "balance_loss_mlp": 1.0340848, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.7073621929136382, + "language_loss": 0.80198216, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82324797, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12559, + "time_per_iteration": 2.5296967029571533 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.01577234, + "balance_loss_mlp": 1.03483188, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.6276492932782158, + "language_loss": 0.78893793, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81024003, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12560, + "time_per_iteration": 2.5170834064483643 + }, + { + "auxiliary_loss_clip": 0.01098646, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02261329, + "balance_loss_mlp": 1.03458691, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.7060183642703433, + "language_loss": 0.70997584, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73130047, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12561, + "time_per_iteration": 2.491224765777588 + }, + { + "auxiliary_loss_clip": 0.01099644, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03315794, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.5670310978935318, + "language_loss": 0.75664687, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77792597, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12562, + "time_per_iteration": 2.5043649673461914 + }, + { + "auxiliary_loss_clip": 0.0110067, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.01738989, + "balance_loss_mlp": 1.03472567, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 2.5989481487272426, + "language_loss": 0.75632036, + "learning_rate": 5.956658554770371e-07, + "loss": 0.77762067, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12563, + "time_per_iteration": 2.461552143096924 + }, + { + "auxiliary_loss_clip": 0.0110889, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.02043235, + "balance_loss_mlp": 1.03629291, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.463791742652493, + "language_loss": 0.67465413, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69608808, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7265625, + "step": 12564, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.01104188, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.02057433, + "balance_loss_mlp": 1.03584766, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.2259446193296943, + "language_loss": 0.68585801, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70722699, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12565, + "time_per_iteration": 2.473606586456299 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.01519513, + "balance_loss_mlp": 1.0340724, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.6339568808166163, + "language_loss": 0.7538799, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77519131, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 12566, + "time_per_iteration": 2.4602677822113037 + }, + { + "auxiliary_loss_clip": 0.01105793, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.01922655, + "balance_loss_mlp": 1.03676701, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 2.446271815399535, + "language_loss": 0.73930967, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76069355, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 12567, + "time_per_iteration": 2.482639789581299 + }, + { + "auxiliary_loss_clip": 0.01100485, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.01764655, + "balance_loss_mlp": 1.03495455, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.8407945721596504, + "language_loss": 0.62615836, + "learning_rate": 5.942800139684073e-07, + "loss": 0.6474514, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 12568, + "time_per_iteration": 2.5483205318450928 + }, + { + "auxiliary_loss_clip": 0.01101205, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.01934648, + "balance_loss_mlp": 1.03582668, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 1.9963818018777864, + "language_loss": 0.66748881, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68880689, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12569, + "time_per_iteration": 2.659467935562134 + }, + { + "auxiliary_loss_clip": 0.01105651, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.01936173, + "balance_loss_mlp": 1.03600824, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.6607243680943589, + "language_loss": 0.67248321, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69386601, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 12570, + "time_per_iteration": 2.4708566665649414 + }, + { + "auxiliary_loss_clip": 0.01102793, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02071762, + "balance_loss_mlp": 1.0357399, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.8220604458329166, + "language_loss": 0.7152952, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73665303, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12571, + "time_per_iteration": 2.7677295207977295 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.01681685, + "balance_loss_mlp": 1.03468394, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.6120967066403376, + "language_loss": 0.73383725, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75515598, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 12572, + "time_per_iteration": 2.457766056060791 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.02069271, + "balance_loss_mlp": 1.03641462, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 1.9236315061860603, + "language_loss": 0.76293039, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78429818, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12573, + "time_per_iteration": 2.419971466064453 + }, + { + "auxiliary_loss_clip": 0.01105728, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.02172422, + "balance_loss_mlp": 1.03609872, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.703385006523425, + "language_loss": 0.69107687, + "learning_rate": 5.926187633398527e-07, + "loss": 0.7124694, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 12574, + "time_per_iteration": 2.4180386066436768 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.02082789, + "balance_loss_mlp": 1.03441286, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.2423644939518423, + "language_loss": 0.7207917, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74212122, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 12575, + "time_per_iteration": 2.455258846282959 + }, + { + "auxiliary_loss_clip": 0.01098947, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02219176, + "balance_loss_mlp": 1.03365374, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.02730026321769, + "language_loss": 0.72025073, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74157435, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 12576, + "time_per_iteration": 2.4121248722076416 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.0213263, + "balance_loss_mlp": 1.03535473, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 1.8488663808999763, + "language_loss": 0.67365032, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69499528, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 12577, + "time_per_iteration": 2.4330592155456543 + }, + { + "auxiliary_loss_clip": 0.0110105, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02031672, + "balance_loss_mlp": 1.03520989, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.669663040088463, + "language_loss": 0.78626776, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80759561, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12578, + "time_per_iteration": 2.4133589267730713 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01851618, + "balance_loss_mlp": 1.03413773, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.4105288225039079, + "language_loss": 0.75553155, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77685523, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12579, + "time_per_iteration": 2.465855836868286 + }, + { + "auxiliary_loss_clip": 0.01107073, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.01840019, + "balance_loss_mlp": 1.03599763, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 1.9246022226121349, + "language_loss": 0.62678003, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64816135, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 12580, + "time_per_iteration": 2.5613341331481934 + }, + { + "auxiliary_loss_clip": 0.01102863, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02075255, + "balance_loss_mlp": 1.03644252, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.7113026290728908, + "language_loss": 0.74942124, + "learning_rate": 5.906830660110691e-07, + "loss": 0.7707727, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12581, + "time_per_iteration": 2.4502360820770264 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.01828623, + "balance_loss_mlp": 1.03389621, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 2.005641504780856, + "language_loss": 0.6295954, + "learning_rate": 5.904067515031412e-07, + "loss": 0.6509093, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12582, + "time_per_iteration": 2.4572439193725586 + }, + { + "auxiliary_loss_clip": 0.01023883, + "auxiliary_loss_mlp": 0.01000227, + "balance_loss_clip": 0.99921417, + "balance_loss_mlp": 1.00362778, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9810901823792554, + "language_loss": 0.60704458, + "learning_rate": 5.901304904471307e-07, + "loss": 0.6272856, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.203125, + "step": 12583, + "time_per_iteration": 2.7996931076049805 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.02372694, + "balance_loss_mlp": 1.03601849, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.0250696621760413, + "language_loss": 0.78582263, + "learning_rate": 5.898542828535125e-07, + "loss": 0.80721629, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12584, + "time_per_iteration": 2.400280475616455 + }, + { + "auxiliary_loss_clip": 0.01099872, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01651096, + "balance_loss_mlp": 1.03562188, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.69321954136788, + "language_loss": 0.77584487, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79711974, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 12585, + "time_per_iteration": 2.4472086429595947 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.02196097, + "balance_loss_mlp": 1.03907382, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.6081546851080741, + "language_loss": 0.82765162, + "learning_rate": 5.893020280953493e-07, + "loss": 0.84908152, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 12586, + "time_per_iteration": 2.4276626110076904 + }, + { + "auxiliary_loss_clip": 0.01106519, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01965213, + "balance_loss_mlp": 1.03753841, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 1.873036053279186, + "language_loss": 0.83275306, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85412443, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.69140625, + "step": 12587, + "time_per_iteration": 2.4600062370300293 + }, + { + "auxiliary_loss_clip": 0.01100482, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.01739788, + "balance_loss_mlp": 1.03461528, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.6080398539976855, + "language_loss": 0.71293926, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73423636, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12588, + "time_per_iteration": 2.460441827774048 + }, + { + "auxiliary_loss_clip": 0.01105135, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.01644826, + "balance_loss_mlp": 1.03631103, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.7772907750031848, + "language_loss": 0.68223751, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70357823, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12589, + "time_per_iteration": 2.4796125888824463 + }, + { + "auxiliary_loss_clip": 0.0110204, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.01805186, + "balance_loss_mlp": 1.0352689, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 2.5553015061472326, + "language_loss": 0.91916406, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94048315, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12590, + "time_per_iteration": 2.4198997020721436 + }, + { + "auxiliary_loss_clip": 0.01098826, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01576114, + "balance_loss_mlp": 1.03452909, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.7917701509519888, + "language_loss": 0.65428317, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67554283, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 12591, + "time_per_iteration": 2.56341814994812 + }, + { + "auxiliary_loss_clip": 0.01102228, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.01525116, + "balance_loss_mlp": 1.03707504, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.511094647527582, + "language_loss": 0.73406184, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75534725, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12592, + "time_per_iteration": 2.474759340286255 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.02331686, + "balance_loss_mlp": 1.03412235, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.1319710484730074, + "language_loss": 0.7111423, + "learning_rate": 5.873708220461522e-07, + "loss": 0.7325182, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12593, + "time_per_iteration": 2.4273533821105957 + }, + { + "auxiliary_loss_clip": 0.01104658, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.01837587, + "balance_loss_mlp": 1.03637433, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 1.845375608838855, + "language_loss": 0.66037387, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68172151, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 12594, + "time_per_iteration": 3.838972568511963 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.02056789, + "balance_loss_mlp": 1.03537512, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.5603399133411295, + "language_loss": 0.80766582, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82903558, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 12595, + "time_per_iteration": 2.4764091968536377 + }, + { + "auxiliary_loss_clip": 0.01102369, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03633952, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.8048420186435026, + "language_loss": 0.71071315, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73204786, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12596, + "time_per_iteration": 3.9183623790740967 + }, + { + "auxiliary_loss_clip": 0.01100386, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01918721, + "balance_loss_mlp": 1.0357343, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.5321566367759303, + "language_loss": 0.80469054, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82599676, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12597, + "time_per_iteration": 3.962346076965332 + }, + { + "auxiliary_loss_clip": 0.01108273, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.01784658, + "balance_loss_mlp": 1.03885663, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.9840297783183698, + "language_loss": 0.83408284, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85547137, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 12598, + "time_per_iteration": 2.4496231079101562 + }, + { + "auxiliary_loss_clip": 0.01101103, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.01322365, + "balance_loss_mlp": 1.03596723, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.806795486884082, + "language_loss": 0.62383306, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64508563, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12599, + "time_per_iteration": 3.9129881858825684 + }, + { + "auxiliary_loss_clip": 0.01106489, + "auxiliary_loss_mlp": 0.01033726, + "balance_loss_clip": 1.02027953, + "balance_loss_mlp": 1.03697991, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.7350879991531523, + "language_loss": 0.62593752, + "learning_rate": 5.854422407815161e-07, + "loss": 0.6473397, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 12600, + "time_per_iteration": 2.3905975818634033 + }, + { + "auxiliary_loss_clip": 0.01100395, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01535034, + "balance_loss_mlp": 1.03529775, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.9463870297593193, + "language_loss": 0.66116518, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68244064, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12601, + "time_per_iteration": 2.4491307735443115 + }, + { + "auxiliary_loss_clip": 0.01099051, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.02029419, + "balance_loss_mlp": 1.03474712, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.5554220634885219, + "language_loss": 0.67926621, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70056915, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 12602, + "time_per_iteration": 2.421680450439453 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.0201081, + "balance_loss_mlp": 1.03649664, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.7612852584963323, + "language_loss": 0.67052841, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69189405, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12603, + "time_per_iteration": 2.4140625 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03334355, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 1.9150574683213546, + "language_loss": 0.61476982, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63605225, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 12604, + "time_per_iteration": 2.4143993854522705 + }, + { + "auxiliary_loss_clip": 0.0110333, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.02098989, + "balance_loss_mlp": 1.03802633, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.9881999977626783, + "language_loss": 0.80013704, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82149595, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12605, + "time_per_iteration": 2.4703023433685303 + }, + { + "auxiliary_loss_clip": 0.01105019, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.01616216, + "balance_loss_mlp": 1.03563833, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.509488145051598, + "language_loss": 0.78940737, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81074387, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 12606, + "time_per_iteration": 2.4461817741394043 + }, + { + "auxiliary_loss_clip": 0.01098445, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.01545978, + "balance_loss_mlp": 1.03513336, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.3687592276329898, + "language_loss": 0.73185945, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75309968, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6328125, + "step": 12607, + "time_per_iteration": 2.4908721446990967 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.01622605, + "balance_loss_mlp": 1.03433669, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 2.4968443331698635, + "language_loss": 0.75006789, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7713939, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12608, + "time_per_iteration": 2.408450126647949 + }, + { + "auxiliary_loss_clip": 0.01100229, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.02053344, + "balance_loss_mlp": 1.03477442, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.5859201905014537, + "language_loss": 0.71409112, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73541617, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12609, + "time_per_iteration": 2.498211622238159 + }, + { + "auxiliary_loss_clip": 0.01102343, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.01692224, + "balance_loss_mlp": 1.03438187, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 2.844859157672467, + "language_loss": 0.81682944, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83815098, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12610, + "time_per_iteration": 2.432453155517578 + }, + { + "auxiliary_loss_clip": 0.01105711, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01812613, + "balance_loss_mlp": 1.03694248, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.6924171050782333, + "language_loss": 0.70433235, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72569001, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 12611, + "time_per_iteration": 2.500880479812622 + }, + { + "auxiliary_loss_clip": 0.01102293, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01576972, + "balance_loss_mlp": 1.03632438, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.4523660094894448, + "language_loss": 0.70939386, + "learning_rate": 5.821422184318893e-07, + "loss": 0.7306906, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 12612, + "time_per_iteration": 2.4539196491241455 + }, + { + "auxiliary_loss_clip": 0.01104666, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02454031, + "balance_loss_mlp": 1.03628385, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.557484420274363, + "language_loss": 0.59628952, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61769485, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 12613, + "time_per_iteration": 2.5192790031433105 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.02141631, + "balance_loss_mlp": 1.03547192, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5699815827869172, + "language_loss": 0.59917688, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62053764, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12614, + "time_per_iteration": 2.5326051712036133 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01423693, + "balance_loss_mlp": 1.03418064, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.965283793201321, + "language_loss": 0.73299825, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75428724, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 12615, + "time_per_iteration": 2.4653162956237793 + }, + { + "auxiliary_loss_clip": 0.01023549, + "auxiliary_loss_mlp": 0.01002988, + "balance_loss_clip": 1.00207007, + "balance_loss_mlp": 1.00344896, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8444154589468232, + "language_loss": 0.67707115, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69733649, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.20117188, + "step": 12616, + "time_per_iteration": 3.0754714012145996 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03608048, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.7978643606873037, + "language_loss": 0.84971976, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87112409, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12617, + "time_per_iteration": 2.448288917541504 + }, + { + "auxiliary_loss_clip": 0.01103847, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.02366078, + "balance_loss_mlp": 1.03641772, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.68696985331022, + "language_loss": 0.75189435, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77328306, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 12618, + "time_per_iteration": 2.3945305347442627 + }, + { + "auxiliary_loss_clip": 0.01106053, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.02109587, + "balance_loss_mlp": 1.03541541, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.6724320695855646, + "language_loss": 0.77528578, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79668283, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 12619, + "time_per_iteration": 2.5116357803344727 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.0217483, + "balance_loss_mlp": 1.03520966, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.9430951948294126, + "language_loss": 0.8248623, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84621245, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12620, + "time_per_iteration": 2.4870126247406006 + }, + { + "auxiliary_loss_clip": 0.0110498, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02316654, + "balance_loss_mlp": 1.03550339, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.128247483649562, + "language_loss": 0.82510465, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84651691, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 12621, + "time_per_iteration": 2.407888412475586 + }, + { + "auxiliary_loss_clip": 0.01103126, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.01944077, + "balance_loss_mlp": 1.03546882, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 1.905238128524311, + "language_loss": 0.73415148, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75549692, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12622, + "time_per_iteration": 2.392918348312378 + }, + { + "auxiliary_loss_clip": 0.01023365, + "auxiliary_loss_mlp": 0.01001846, + "balance_loss_clip": 1.00088048, + "balance_loss_mlp": 1.00314832, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8354315652196721, + "language_loss": 0.60838234, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62863445, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20214844, + "step": 12623, + "time_per_iteration": 3.0560390949249268 + }, + { + "auxiliary_loss_clip": 0.0110073, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.02278328, + "balance_loss_mlp": 1.03662705, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 3.5930861717067653, + "language_loss": 0.66990733, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69125187, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 12624, + "time_per_iteration": 2.41662335395813 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01397121, + "balance_loss_mlp": 1.03495693, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.9214697173160005, + "language_loss": 0.75980389, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78105658, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 12625, + "time_per_iteration": 2.430710792541504 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.02228665, + "balance_loss_mlp": 1.0369575, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.779846333652066, + "language_loss": 0.6279074, + "learning_rate": 5.783019789020977e-07, + "loss": 0.64926815, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 12626, + "time_per_iteration": 2.504363775253296 + }, + { + "auxiliary_loss_clip": 0.01104327, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02497673, + "balance_loss_mlp": 1.03691292, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.3505107376172782, + "language_loss": 0.73657954, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75800049, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 12627, + "time_per_iteration": 2.4584648609161377 + }, + { + "auxiliary_loss_clip": 0.01104059, + "auxiliary_loss_mlp": 0.01025855, + "balance_loss_clip": 1.01399338, + "balance_loss_mlp": 1.03618145, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 1.9976061083215328, + "language_loss": 0.68754119, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70884025, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12628, + "time_per_iteration": 2.430168390274048 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038199, + "balance_loss_clip": 1.02543187, + "balance_loss_mlp": 1.03944075, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 1.8845310767470707, + "language_loss": 0.63146746, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65296274, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 12629, + "time_per_iteration": 2.4647164344787598 + }, + { + "auxiliary_loss_clip": 0.01096357, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.01772952, + "balance_loss_mlp": 1.03295267, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.5320581360916075, + "language_loss": 0.77814519, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79940444, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 12630, + "time_per_iteration": 2.4695019721984863 + }, + { + "auxiliary_loss_clip": 0.01023993, + "auxiliary_loss_mlp": 0.01002903, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00382364, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8096499014530706, + "language_loss": 0.61483628, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63510519, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 12631, + "time_per_iteration": 3.0936625003814697 + }, + { + "auxiliary_loss_clip": 0.01109676, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.02038503, + "balance_loss_mlp": 1.0382787, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.648732197394605, + "language_loss": 0.73976278, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76120412, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12632, + "time_per_iteration": 2.480149745941162 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01905131, + "balance_loss_mlp": 1.03713107, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.9673738745547358, + "language_loss": 0.74681813, + "learning_rate": 5.763858198074154e-07, + "loss": 0.76816922, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12633, + "time_per_iteration": 2.4051129817962646 + }, + { + "auxiliary_loss_clip": 0.01102602, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.0182507, + "balance_loss_mlp": 1.03637874, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 1.9622807663436381, + "language_loss": 0.73751974, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75883526, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 12634, + "time_per_iteration": 2.4096055030822754 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03842199, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.6454406828041275, + "language_loss": 0.64365327, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66502792, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 12635, + "time_per_iteration": 2.50323224067688 + }, + { + "auxiliary_loss_clip": 0.01105903, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.01841593, + "balance_loss_mlp": 1.03627133, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.7052959170016264, + "language_loss": 0.68446481, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70583028, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 12636, + "time_per_iteration": 3.86566424369812 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.02063966, + "balance_loss_mlp": 1.03834045, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.977358934255135, + "language_loss": 0.81089514, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83228207, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 12637, + "time_per_iteration": 2.4373815059661865 + }, + { + "auxiliary_loss_clip": 0.01100493, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.02041268, + "balance_loss_mlp": 1.0336833, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.8305503551265345, + "language_loss": 0.66367668, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68500262, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 12638, + "time_per_iteration": 3.9780218601226807 + }, + { + "auxiliary_loss_clip": 0.01104273, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.03667748, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.213704137729046, + "language_loss": 0.65462083, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67600346, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 12639, + "time_per_iteration": 3.9062952995300293 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01027427, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03570402, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.267632288408512, + "language_loss": 0.6999557, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72124958, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12640, + "time_per_iteration": 2.4175524711608887 + }, + { + "auxiliary_loss_clip": 0.01105941, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03658712, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.786840701662577, + "language_loss": 0.6698308, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69122016, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12641, + "time_per_iteration": 3.921182632446289 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.02081525, + "balance_loss_mlp": 1.03358555, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.0392329057559433, + "language_loss": 0.66791224, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68927062, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 12642, + "time_per_iteration": 2.437264919281006 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.01733255, + "balance_loss_mlp": 1.0378716, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.817654182769989, + "language_loss": 0.75470227, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77600896, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12643, + "time_per_iteration": 2.454752206802368 + }, + { + "auxiliary_loss_clip": 0.01104004, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.03661776, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.71435715776806, + "language_loss": 0.78663039, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80799764, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12644, + "time_per_iteration": 2.464467763900757 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.01553774, + "balance_loss_mlp": 1.03507113, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.487007417540331, + "language_loss": 0.80469275, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82598233, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 12645, + "time_per_iteration": 2.46242094039917 + }, + { + "auxiliary_loss_clip": 0.01106779, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01639724, + "balance_loss_mlp": 1.03778565, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.4295948404829946, + "language_loss": 0.72978055, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75113386, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 12646, + "time_per_iteration": 2.471769332885742 + }, + { + "auxiliary_loss_clip": 0.01100614, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.02084398, + "balance_loss_mlp": 1.0357635, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 2.0171184972904426, + "language_loss": 0.67350507, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69483244, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 12647, + "time_per_iteration": 2.4212889671325684 + }, + { + "auxiliary_loss_clip": 0.01023895, + "auxiliary_loss_mlp": 0.01003551, + "balance_loss_clip": 1.00256717, + "balance_loss_mlp": 1.00363588, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6700081607219286, + "language_loss": 0.48957998, + "learning_rate": 5.722886764566415e-07, + "loss": 0.50985444, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.203125, + "step": 12648, + "time_per_iteration": 2.992032766342163 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.02124202, + "balance_loss_mlp": 1.03481627, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.457089881735221, + "language_loss": 0.76486385, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78617918, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 12649, + "time_per_iteration": 2.4250268936157227 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.0177089, + "balance_loss_mlp": 1.03462923, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.4982970493787315, + "language_loss": 0.68732083, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70861167, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12650, + "time_per_iteration": 2.488598585128784 + }, + { + "auxiliary_loss_clip": 0.0102378, + "auxiliary_loss_mlp": 0.00999701, + "balance_loss_clip": 0.99867612, + "balance_loss_mlp": 1.00336099, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7616307552749029, + "language_loss": 0.62742424, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64765906, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20410156, + "step": 12651, + "time_per_iteration": 3.0423130989074707 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02096677, + "balance_loss_mlp": 1.03473544, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.3759590164717375, + "language_loss": 0.71249425, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73381495, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12652, + "time_per_iteration": 2.4702324867248535 + }, + { + "auxiliary_loss_clip": 0.01103881, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.0212419, + "balance_loss_mlp": 1.03609121, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.2736870535871354, + "language_loss": 0.80135083, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82272291, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12653, + "time_per_iteration": 2.452436685562134 + }, + { + "auxiliary_loss_clip": 0.01106986, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.01928127, + "balance_loss_mlp": 1.03704405, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.5498044874704002, + "language_loss": 0.80112356, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82249987, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.69921875, + "step": 12654, + "time_per_iteration": 2.48616886138916 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.02255547, + "balance_loss_mlp": 1.03588712, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.061909970432495, + "language_loss": 0.79397112, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81536764, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12655, + "time_per_iteration": 2.421402931213379 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.01572394, + "balance_loss_mlp": 1.03430891, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.6410708258422424, + "language_loss": 0.68456256, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70580149, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.640625, + "step": 12656, + "time_per_iteration": 2.5130324363708496 + }, + { + "auxiliary_loss_clip": 0.01103079, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.01629603, + "balance_loss_mlp": 1.03264689, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 1.9462034213744268, + "language_loss": 0.73116565, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75248253, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 12657, + "time_per_iteration": 2.400148391723633 + }, + { + "auxiliary_loss_clip": 0.01024109, + "auxiliary_loss_mlp": 0.01000104, + "balance_loss_clip": 0.99908441, + "balance_loss_mlp": 1.0036025, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8561186291133048, + "language_loss": 0.64938498, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66962707, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20507812, + "step": 12658, + "time_per_iteration": 3.001168727874756 + }, + { + "auxiliary_loss_clip": 0.01098421, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.01878452, + "balance_loss_mlp": 1.03460932, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.8302909281614124, + "language_loss": 0.79259527, + "learning_rate": 5.692918445605293e-07, + "loss": 0.8138839, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12659, + "time_per_iteration": 2.4172587394714355 + }, + { + "auxiliary_loss_clip": 0.01099076, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.01360416, + "balance_loss_mlp": 1.03339934, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.5401617612635332, + "language_loss": 0.68613267, + "learning_rate": 5.690197306063209e-07, + "loss": 0.70737445, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12660, + "time_per_iteration": 2.486931085586548 + }, + { + "auxiliary_loss_clip": 0.01102403, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01855946, + "balance_loss_mlp": 1.03502679, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.63464824040793, + "language_loss": 0.70508969, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72641468, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12661, + "time_per_iteration": 2.5559232234954834 + }, + { + "auxiliary_loss_clip": 0.01099871, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01578677, + "balance_loss_mlp": 1.03281772, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.568031869440725, + "language_loss": 0.8346833, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85595322, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 12662, + "time_per_iteration": 2.5182721614837646 + }, + { + "auxiliary_loss_clip": 0.01101806, + "auxiliary_loss_mlp": 0.01040729, + "balance_loss_clip": 1.02870047, + "balance_loss_mlp": 1.03437781, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7160561629790159, + "language_loss": 0.68380648, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70523185, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12663, + "time_per_iteration": 2.415670871734619 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.01368248, + "balance_loss_mlp": 1.03619945, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.8370977086816516, + "language_loss": 0.70325685, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72451836, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12664, + "time_per_iteration": 2.4670281410217285 + }, + { + "auxiliary_loss_clip": 0.01107046, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.02551007, + "balance_loss_mlp": 1.03717303, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 2.4295435457248575, + "language_loss": 0.79482126, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81627178, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 12665, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01101745, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.02022529, + "balance_loss_mlp": 1.03754544, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.6635534140237522, + "language_loss": 0.88047594, + "learning_rate": 5.673881867632959e-07, + "loss": 0.90180439, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 12666, + "time_per_iteration": 2.51179575920105 + }, + { + "auxiliary_loss_clip": 0.0110239, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.0205127, + "balance_loss_mlp": 1.03515267, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 1.9417407111979526, + "language_loss": 0.8323909, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85374105, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12667, + "time_per_iteration": 2.4148190021514893 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02489078, + "balance_loss_mlp": 1.03628147, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.6916218117768351, + "language_loss": 0.78259969, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80396825, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 12668, + "time_per_iteration": 2.4754624366760254 + }, + { + "auxiliary_loss_clip": 0.01100404, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.0345788, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.7878935447004587, + "language_loss": 0.63670552, + "learning_rate": 5.6657314808718e-07, + "loss": 0.65801817, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12669, + "time_per_iteration": 2.406334638595581 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.023, + "balance_loss_mlp": 1.03439915, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.8779652791388421, + "language_loss": 0.66191423, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68330294, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12670, + "time_per_iteration": 2.479275703430176 + }, + { + "auxiliary_loss_clip": 0.01103769, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.02112961, + "balance_loss_mlp": 1.0352459, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.5352589226081985, + "language_loss": 0.73205262, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75341749, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 12671, + "time_per_iteration": 2.43534517288208 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01777697, + "balance_loss_mlp": 1.03416443, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 2.4136368104172607, + "language_loss": 0.73309898, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75437379, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12672, + "time_per_iteration": 2.4863340854644775 + }, + { + "auxiliary_loss_clip": 0.01023134, + "auxiliary_loss_mlp": 0.01002705, + "balance_loss_clip": 1.00170374, + "balance_loss_mlp": 1.00292134, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7636907167661546, + "language_loss": 0.56764495, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58790326, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20214844, + "step": 12673, + "time_per_iteration": 3.0046093463897705 + }, + { + "auxiliary_loss_clip": 0.01103698, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.01493824, + "balance_loss_mlp": 1.03571641, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.9214444027294126, + "language_loss": 0.74586606, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76717991, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 12674, + "time_per_iteration": 2.4860613346099854 + }, + { + "auxiliary_loss_clip": 0.01099933, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.01634872, + "balance_loss_mlp": 1.03547001, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 1.9445116324740603, + "language_loss": 0.72109187, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74236214, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 12675, + "time_per_iteration": 2.4733452796936035 + }, + { + "auxiliary_loss_clip": 0.01099705, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01858139, + "balance_loss_mlp": 1.03579307, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.3582627114091417, + "language_loss": 0.72836524, + "learning_rate": 5.646732941057936e-07, + "loss": 0.74966055, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12676, + "time_per_iteration": 2.464700698852539 + }, + { + "auxiliary_loss_clip": 0.01108509, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01853633, + "balance_loss_mlp": 1.0366993, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.4246183918605055, + "language_loss": 0.54263771, + "learning_rate": 5.644021040227927e-07, + "loss": 0.56403106, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 12677, + "time_per_iteration": 2.3858957290649414 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.0166893, + "balance_loss_mlp": 1.03496563, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.7484196878623104, + "language_loss": 0.78978539, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81108367, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12678, + "time_per_iteration": 3.8235199451446533 + }, + { + "auxiliary_loss_clip": 0.01103703, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01842141, + "balance_loss_mlp": 1.0358417, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 1.880562321588857, + "language_loss": 0.7751689, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79651058, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 12679, + "time_per_iteration": 2.406036615371704 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.01288462, + "balance_loss_mlp": 1.03522229, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.3855129030202036, + "language_loss": 0.79996926, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82124078, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12680, + "time_per_iteration": 3.8292644023895264 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01448607, + "balance_loss_mlp": 1.03598523, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.8104724953691376, + "language_loss": 0.62750268, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64880306, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 12681, + "time_per_iteration": 3.884755849838257 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01962304, + "balance_loss_mlp": 1.03471422, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.0185008739532946, + "language_loss": 0.76076877, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78207386, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12682, + "time_per_iteration": 3.9090828895568848 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.01027158, + "balance_loss_clip": 1.01630437, + "balance_loss_mlp": 1.0342561, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.71259737430395, + "language_loss": 0.68134248, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70260167, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12683, + "time_per_iteration": 2.4623308181762695 + }, + { + "auxiliary_loss_clip": 0.01103084, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.01777172, + "balance_loss_mlp": 1.03596735, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.1249879118259285, + "language_loss": 0.83107448, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85240012, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12684, + "time_per_iteration": 2.4951984882354736 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.02493775, + "balance_loss_mlp": 1.03559566, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 1.7953521206718834, + "language_loss": 0.82664561, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84805232, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 12685, + "time_per_iteration": 2.397047519683838 + }, + { + "auxiliary_loss_clip": 0.01104402, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01402688, + "balance_loss_mlp": 1.03638494, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.8540410766605766, + "language_loss": 0.77068198, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79198045, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12686, + "time_per_iteration": 2.484248399734497 + }, + { + "auxiliary_loss_clip": 0.01104273, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.02545214, + "balance_loss_mlp": 1.03546059, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.6147280683220673, + "language_loss": 0.71894288, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74037153, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 12687, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01101986, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.02494013, + "balance_loss_mlp": 1.03586364, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 1.7893122270206685, + "language_loss": 0.64678234, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66817671, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 12688, + "time_per_iteration": 2.473334550857544 + }, + { + "auxiliary_loss_clip": 0.01100145, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.01613188, + "balance_loss_mlp": 1.03599501, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.824140660658097, + "language_loss": 0.70988876, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73116207, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12689, + "time_per_iteration": 2.389702320098877 + }, + { + "auxiliary_loss_clip": 0.01106966, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.02357674, + "balance_loss_mlp": 1.03706014, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.6778934175859046, + "language_loss": 0.69599509, + "learning_rate": 5.608815905436238e-07, + "loss": 0.7174232, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 12690, + "time_per_iteration": 2.4964652061462402 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03643334, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.627452026729889, + "language_loss": 0.69135779, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71268374, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12691, + "time_per_iteration": 2.577179431915283 + }, + { + "auxiliary_loss_clip": 0.0109925, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02197647, + "balance_loss_mlp": 1.03412747, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.5885842386967668, + "language_loss": 0.81694877, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83826768, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 12692, + "time_per_iteration": 2.4633901119232178 + }, + { + "auxiliary_loss_clip": 0.01107736, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.02030492, + "balance_loss_mlp": 1.03841257, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.217828968535983, + "language_loss": 0.76950878, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79089856, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6953125, + "step": 12693, + "time_per_iteration": 2.4009978771209717 + }, + { + "auxiliary_loss_clip": 0.01103157, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01835942, + "balance_loss_mlp": 1.03572786, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.173871462173048, + "language_loss": 0.73079503, + "learning_rate": 5.598002100115933e-07, + "loss": 0.75213093, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12694, + "time_per_iteration": 2.462535858154297 + }, + { + "auxiliary_loss_clip": 0.01098607, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.01585007, + "balance_loss_mlp": 1.03326893, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.6266641771767514, + "language_loss": 0.70343757, + "learning_rate": 5.595300013842625e-07, + "loss": 0.7246989, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12695, + "time_per_iteration": 2.436309576034546 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.03454077, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.4700012541303298, + "language_loss": 0.72275102, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74406242, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 12696, + "time_per_iteration": 2.473132371902466 + }, + { + "auxiliary_loss_clip": 0.01101016, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.02113307, + "balance_loss_mlp": 1.03426933, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.376546359844648, + "language_loss": 0.71416759, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73551357, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12697, + "time_per_iteration": 2.397484064102173 + }, + { + "auxiliary_loss_clip": 0.01103465, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.0194068, + "balance_loss_mlp": 1.03697562, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.8987571133249672, + "language_loss": 0.66587389, + "learning_rate": 5.587197032798461e-07, + "loss": 0.6872173, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12698, + "time_per_iteration": 2.4368910789489746 + }, + { + "auxiliary_loss_clip": 0.01099721, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01665354, + "balance_loss_mlp": 1.03326559, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.574933939339682, + "language_loss": 0.72529495, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74657655, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12699, + "time_per_iteration": 2.449216842651367 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.02182543, + "balance_loss_mlp": 1.03466082, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.9214661095744658, + "language_loss": 0.73283732, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75416017, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12700, + "time_per_iteration": 2.614281177520752 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.01834226, + "balance_loss_mlp": 1.03391504, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 3.347573177390183, + "language_loss": 0.68935323, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71066546, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12701, + "time_per_iteration": 2.407780170440674 + }, + { + "auxiliary_loss_clip": 0.01102757, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01846802, + "balance_loss_mlp": 1.0361433, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 2.4924220366961145, + "language_loss": 0.64379907, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66512668, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12702, + "time_per_iteration": 2.444377899169922 + }, + { + "auxiliary_loss_clip": 0.01101798, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.02000129, + "balance_loss_mlp": 1.0348711, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.9476964019276684, + "language_loss": 0.65595478, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67728704, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 12703, + "time_per_iteration": 2.4628920555114746 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01023498, + "balance_loss_clip": 1.01204157, + "balance_loss_mlp": 1.03720379, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.7669844217588608, + "language_loss": 0.83665591, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85792065, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 12704, + "time_per_iteration": 2.448728561401367 + }, + { + "auxiliary_loss_clip": 0.01104257, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.01903248, + "balance_loss_mlp": 1.03712642, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.3938959354870066, + "language_loss": 0.67689544, + "learning_rate": 5.568309210527469e-07, + "loss": 0.69824535, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12705, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01100722, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.01429725, + "balance_loss_mlp": 1.03554821, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.691823975978675, + "language_loss": 0.74275041, + "learning_rate": 5.565613138389427e-07, + "loss": 0.7640174, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12706, + "time_per_iteration": 2.4732961654663086 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.01650715, + "balance_loss_mlp": 1.03575993, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.8728916449529083, + "language_loss": 0.7829448, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80425096, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 12707, + "time_per_iteration": 2.44852352142334 + }, + { + "auxiliary_loss_clip": 0.01100823, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.01379025, + "balance_loss_mlp": 1.03446913, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.8180584415058063, + "language_loss": 0.79873604, + "learning_rate": 5.560222636275751e-07, + "loss": 0.81999826, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12708, + "time_per_iteration": 2.427623987197876 + }, + { + "auxiliary_loss_clip": 0.01024337, + "auxiliary_loss_mlp": 0.01003138, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00414193, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8188309305581064, + "language_loss": 0.56423205, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58450681, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20214844, + "step": 12709, + "time_per_iteration": 3.0471227169036865 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.01926708, + "balance_loss_mlp": 1.03640699, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.7746105056549126, + "language_loss": 0.63412935, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65551722, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 12710, + "time_per_iteration": 2.436523199081421 + }, + { + "auxiliary_loss_clip": 0.01104937, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.01748586, + "balance_loss_mlp": 1.03611827, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.4696813182375994, + "language_loss": 0.64710927, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66846383, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12711, + "time_per_iteration": 2.413130760192871 + }, + { + "auxiliary_loss_clip": 0.01100872, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.02089453, + "balance_loss_mlp": 1.03438199, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.5961757403151435, + "language_loss": 0.72854543, + "learning_rate": 5.549448203559293e-07, + "loss": 0.74988031, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12712, + "time_per_iteration": 2.4923083782196045 + }, + { + "auxiliary_loss_clip": 0.01100743, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.01644421, + "balance_loss_mlp": 1.03588057, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.512862256571613, + "language_loss": 0.8010205, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82230103, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12713, + "time_per_iteration": 2.4570553302764893 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.01756477, + "balance_loss_mlp": 1.03663445, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.127063992718731, + "language_loss": 0.83558553, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85692835, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 12714, + "time_per_iteration": 2.4317142963409424 + }, + { + "auxiliary_loss_clip": 0.01105545, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.0259099, + "balance_loss_mlp": 1.03701067, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.592380808570538, + "language_loss": 0.72868395, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75012302, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 12715, + "time_per_iteration": 2.43247389793396 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.01739025, + "balance_loss_mlp": 1.03394234, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.7023765879093384, + "language_loss": 0.63293636, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65423584, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 12716, + "time_per_iteration": 2.519078016281128 + }, + { + "auxiliary_loss_clip": 0.0110555, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.02227485, + "balance_loss_mlp": 1.03597593, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.4875164699453862, + "language_loss": 0.79791009, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81931472, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 12717, + "time_per_iteration": 2.429151773452759 + }, + { + "auxiliary_loss_clip": 0.01102712, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.0218451, + "balance_loss_mlp": 1.03612757, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.2673772679539486, + "language_loss": 0.66456509, + "learning_rate": 5.53330299551638e-07, + "loss": 0.6859256, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12718, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.02124047, + "balance_loss_mlp": 1.03456593, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.8849716661729419, + "language_loss": 0.77913976, + "learning_rate": 5.530614046939286e-07, + "loss": 0.8004452, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12719, + "time_per_iteration": 3.9749484062194824 + }, + { + "auxiliary_loss_clip": 0.01102309, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01369143, + "balance_loss_mlp": 1.03523517, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.683095995258743, + "language_loss": 0.69655412, + "learning_rate": 5.527925647042754e-07, + "loss": 0.71783549, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12720, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.01102352, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01990581, + "balance_loss_mlp": 1.03623235, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.6712048084567594, + "language_loss": 0.73724437, + "learning_rate": 5.52523779592875e-07, + "loss": 0.75858283, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12721, + "time_per_iteration": 3.8811776638031006 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01676273, + "balance_loss_mlp": 1.03572047, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.8878016684824361, + "language_loss": 0.73512298, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75642979, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12722, + "time_per_iteration": 3.989180564880371 + }, + { + "auxiliary_loss_clip": 0.01101721, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02355647, + "balance_loss_mlp": 1.03481197, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.7865929213753133, + "language_loss": 0.7357918, + "learning_rate": 5.519863740455912e-07, + "loss": 0.75715715, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 12723, + "time_per_iteration": 2.5361814498901367 + }, + { + "auxiliary_loss_clip": 0.01101913, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.01642966, + "balance_loss_mlp": 1.03334272, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.812040578255397, + "language_loss": 0.73211122, + "learning_rate": 5.517177536300881e-07, + "loss": 0.7534129, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 12724, + "time_per_iteration": 3.9343338012695312 + }, + { + "auxiliary_loss_clip": 0.01099657, + "auxiliary_loss_mlp": 0.01024123, + "balance_loss_clip": 1.01270843, + "balance_loss_mlp": 1.03587949, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.9420758894203383, + "language_loss": 0.8370254, + "learning_rate": 5.514491881335935e-07, + "loss": 0.85826313, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 12725, + "time_per_iteration": 2.378312587738037 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02064514, + "balance_loss_mlp": 1.03584003, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.7077077280444313, + "language_loss": 0.77513289, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79648137, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 12726, + "time_per_iteration": 2.489109992980957 + }, + { + "auxiliary_loss_clip": 0.01103068, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.02161503, + "balance_loss_mlp": 1.03531957, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.5743856699934278, + "language_loss": 0.7073437, + "learning_rate": 5.509122219383615e-07, + "loss": 0.7287063, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 12727, + "time_per_iteration": 2.4679818153381348 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01024446, + "balance_loss_clip": 1.01324618, + "balance_loss_mlp": 1.03295493, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.683634898596646, + "language_loss": 0.79648662, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81770217, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 12728, + "time_per_iteration": 2.5594372749328613 + }, + { + "auxiliary_loss_clip": 0.01104269, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.01803839, + "balance_loss_mlp": 1.03638935, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.9251474152175339, + "language_loss": 0.55158925, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57293093, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12729, + "time_per_iteration": 2.4821853637695312 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.01739013, + "balance_loss_mlp": 1.03482783, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 2.177670439939341, + "language_loss": 0.77752316, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79883277, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12730, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.01107568, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02540636, + "balance_loss_mlp": 1.03940296, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 2.6215650166042854, + "language_loss": 0.68980086, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71125555, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12731, + "time_per_iteration": 2.4075534343719482 + }, + { + "auxiliary_loss_clip": 0.01103331, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.01775599, + "balance_loss_mlp": 1.03526866, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.0647561779987598, + "language_loss": 0.69921666, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72054529, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12732, + "time_per_iteration": 2.4208905696868896 + }, + { + "auxiliary_loss_clip": 0.01103869, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.01700664, + "balance_loss_mlp": 1.03542268, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.4500082329987547, + "language_loss": 0.78334171, + "learning_rate": 5.493026424675653e-07, + "loss": 0.8046757, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 12733, + "time_per_iteration": 2.4912784099578857 + }, + { + "auxiliary_loss_clip": 0.01101688, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.02012134, + "balance_loss_mlp": 1.03670192, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7158100423573102, + "language_loss": 0.77660191, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79793251, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12734, + "time_per_iteration": 2.500473737716675 + }, + { + "auxiliary_loss_clip": 0.01105167, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01641846, + "balance_loss_mlp": 1.03554702, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.8249591988641765, + "language_loss": 0.72925597, + "learning_rate": 5.48766555953535e-07, + "loss": 0.7505976, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12735, + "time_per_iteration": 2.477151870727539 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01950526, + "balance_loss_mlp": 1.03448582, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.762755938447221, + "language_loss": 0.72515297, + "learning_rate": 5.484985952378145e-07, + "loss": 0.7464757, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12736, + "time_per_iteration": 2.5486631393432617 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.02288127, + "balance_loss_mlp": 1.03783011, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 1.7485103277952745, + "language_loss": 0.77891874, + "learning_rate": 5.482306895631728e-07, + "loss": 0.80035985, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69921875, + "step": 12737, + "time_per_iteration": 2.4112277030944824 + }, + { + "auxiliary_loss_clip": 0.01101521, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.01596594, + "balance_loss_mlp": 1.0340569, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.6956859838498979, + "language_loss": 0.76673079, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78802776, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 12738, + "time_per_iteration": 2.4841501712799072 + }, + { + "auxiliary_loss_clip": 0.01104744, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.01677346, + "balance_loss_mlp": 1.03617144, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.8494809749417094, + "language_loss": 0.62757778, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64891523, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 12739, + "time_per_iteration": 2.5342459678649902 + }, + { + "auxiliary_loss_clip": 0.01104187, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02121282, + "balance_loss_mlp": 1.03702021, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 1.9457756189181725, + "language_loss": 0.79532218, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81670547, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12740, + "time_per_iteration": 2.464242458343506 + }, + { + "auxiliary_loss_clip": 0.01101878, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.01806879, + "balance_loss_mlp": 1.03497076, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.8538704286256995, + "language_loss": 0.65541816, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67673558, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12741, + "time_per_iteration": 2.6027071475982666 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.0102683, + "balance_loss_clip": 1.01482606, + "balance_loss_mlp": 1.03617609, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.883849175475749, + "language_loss": 0.75741291, + "learning_rate": 5.468919871616386e-07, + "loss": 0.77869809, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 12742, + "time_per_iteration": 2.483158588409424 + }, + { + "auxiliary_loss_clip": 0.0109981, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.01749492, + "balance_loss_mlp": 1.03572869, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.3603011041168136, + "language_loss": 0.76397032, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78525507, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 12743, + "time_per_iteration": 2.4432547092437744 + }, + { + "auxiliary_loss_clip": 0.01100504, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.0345211, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.9126072304780652, + "language_loss": 0.749053, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77033567, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12744, + "time_per_iteration": 2.4553682804107666 + }, + { + "auxiliary_loss_clip": 0.01103322, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.01541495, + "balance_loss_mlp": 1.03609204, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.243657219575693, + "language_loss": 0.70895386, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73026311, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12745, + "time_per_iteration": 2.4222021102905273 + }, + { + "auxiliary_loss_clip": 0.01102421, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.02016246, + "balance_loss_mlp": 1.03536963, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.2580014777322264, + "language_loss": 0.7671814, + "learning_rate": 5.458220170154896e-07, + "loss": 0.78853154, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 12746, + "time_per_iteration": 2.4328715801239014 + }, + { + "auxiliary_loss_clip": 0.01024805, + "auxiliary_loss_mlp": 0.01002921, + "balance_loss_clip": 1.00194991, + "balance_loss_mlp": 1.00455523, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6617058093404249, + "language_loss": 0.56800187, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58827913, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20214844, + "step": 12747, + "time_per_iteration": 3.0698306560516357 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.01804721, + "balance_loss_mlp": 1.03344798, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.7111315539475358, + "language_loss": 0.72324377, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74450713, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 12748, + "time_per_iteration": 2.506683588027954 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01763535, + "balance_loss_mlp": 1.03435397, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 3.145698976514515, + "language_loss": 0.6893121, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71061194, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 12749, + "time_per_iteration": 2.389932155609131 + }, + { + "auxiliary_loss_clip": 0.01101373, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.01423216, + "balance_loss_mlp": 1.034747, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.5718921115117155, + "language_loss": 0.73633575, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75761336, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12750, + "time_per_iteration": 2.5167572498321533 + }, + { + "auxiliary_loss_clip": 0.01098567, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.01658773, + "balance_loss_mlp": 1.0340786, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 3.4547507974534937, + "language_loss": 0.75537312, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77663291, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12751, + "time_per_iteration": 2.476710081100464 + }, + { + "auxiliary_loss_clip": 0.01103164, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.0242238, + "balance_loss_mlp": 1.03732014, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 2.104179028478291, + "language_loss": 0.61111033, + "learning_rate": 5.442187162761537e-07, + "loss": 0.6325025, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 12752, + "time_per_iteration": 2.483185291290283 + }, + { + "auxiliary_loss_clip": 0.01103162, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.0219605, + "balance_loss_mlp": 1.03612447, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.7425308356363913, + "language_loss": 0.69364887, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71501917, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12753, + "time_per_iteration": 2.462432384490967 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.02463651, + "balance_loss_mlp": 1.03598034, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.935870889400166, + "language_loss": 0.62185645, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64324296, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12754, + "time_per_iteration": 2.518746852874756 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.03773046, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.3055221195996065, + "language_loss": 0.79792452, + "learning_rate": 5.434178110152401e-07, + "loss": 0.81923139, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12755, + "time_per_iteration": 2.4429805278778076 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.01758885, + "balance_loss_mlp": 1.03660679, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.7360812888518318, + "language_loss": 0.70129168, + "learning_rate": 5.431509530489242e-07, + "loss": 0.7225951, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12756, + "time_per_iteration": 2.4959518909454346 + }, + { + "auxiliary_loss_clip": 0.01101968, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.02353144, + "balance_loss_mlp": 1.03610778, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 2.2706673014793766, + "language_loss": 0.70277941, + "learning_rate": 5.428841503264706e-07, + "loss": 0.7241478, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12757, + "time_per_iteration": 2.503958225250244 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.02101266, + "balance_loss_mlp": 1.03675115, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.9695287063261235, + "language_loss": 0.75929737, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78066456, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12758, + "time_per_iteration": 2.490203857421875 + }, + { + "auxiliary_loss_clip": 0.01098808, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.02119648, + "balance_loss_mlp": 1.03469872, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.6224114327929111, + "language_loss": 0.76120728, + "learning_rate": 5.423507106536156e-07, + "loss": 0.7825197, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12759, + "time_per_iteration": 2.462779998779297 + }, + { + "auxiliary_loss_clip": 0.0109933, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.01611102, + "balance_loss_mlp": 1.03285909, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.0831597822945738, + "language_loss": 0.68447405, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70573699, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 12760, + "time_per_iteration": 2.66218900680542 + }, + { + "auxiliary_loss_clip": 0.01102506, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.01719248, + "balance_loss_mlp": 1.03628325, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.530930371771359, + "language_loss": 0.79041481, + "learning_rate": 5.418174920775871e-07, + "loss": 0.8117305, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12761, + "time_per_iteration": 3.9318642616271973 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.0160147, + "balance_loss_mlp": 1.03551531, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.7398225752644456, + "language_loss": 0.66273689, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68400806, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12762, + "time_per_iteration": 2.423274040222168 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01513004, + "balance_loss_mlp": 1.03505349, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.6795407868504282, + "language_loss": 0.73981798, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76111412, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12763, + "time_per_iteration": 3.845613718032837 + }, + { + "auxiliary_loss_clip": 0.01102131, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.02071738, + "balance_loss_mlp": 1.0367074, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.585918390915768, + "language_loss": 0.70586705, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72721243, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12764, + "time_per_iteration": 3.981903314590454 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.01529598, + "balance_loss_mlp": 1.03549826, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.533836649562743, + "language_loss": 0.69619727, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71747363, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12765, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01097446, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.03249931, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.7341921361954618, + "language_loss": 0.60877311, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63005078, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65234375, + "step": 12766, + "time_per_iteration": 3.856095790863037 + }, + { + "auxiliary_loss_clip": 0.01024204, + "auxiliary_loss_mlp": 0.01001208, + "balance_loss_clip": 1.00024879, + "balance_loss_mlp": 1.00405478, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7388978362538794, + "language_loss": 0.60806286, + "learning_rate": 5.402191637390803e-07, + "loss": 0.628317, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 12767, + "time_per_iteration": 3.1863934993743896 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.01363397, + "balance_loss_mlp": 1.03486204, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.9841724465329964, + "language_loss": 0.69505453, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71628356, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 12768, + "time_per_iteration": 2.423121452331543 + }, + { + "auxiliary_loss_clip": 0.01107565, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.02206123, + "balance_loss_mlp": 1.03830612, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.9774662095092985, + "language_loss": 0.70799577, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7294122, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 12769, + "time_per_iteration": 2.4947516918182373 + }, + { + "auxiliary_loss_clip": 0.01103148, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.0148201, + "balance_loss_mlp": 1.03540611, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 1.823298760542139, + "language_loss": 0.80289495, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82419586, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 12770, + "time_per_iteration": 2.4479711055755615 + }, + { + "auxiliary_loss_clip": 0.01098048, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01986027, + "balance_loss_mlp": 1.0342977, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.520087647586923, + "language_loss": 0.78579485, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80707848, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12771, + "time_per_iteration": 2.5589637756347656 + }, + { + "auxiliary_loss_clip": 0.01100406, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.01595068, + "balance_loss_mlp": 1.03527427, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.305591039584481, + "language_loss": 0.68094563, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70222068, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 12772, + "time_per_iteration": 2.443350076675415 + }, + { + "auxiliary_loss_clip": 0.01097286, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.01294541, + "balance_loss_mlp": 1.03465271, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.6667227683698287, + "language_loss": 0.73345917, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75466973, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 12773, + "time_per_iteration": 2.5177359580993652 + }, + { + "auxiliary_loss_clip": 0.01097604, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01624548, + "balance_loss_mlp": 1.03416824, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.8287819749313907, + "language_loss": 0.8077029, + "learning_rate": 5.383569661510512e-07, + "loss": 0.82894701, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 12774, + "time_per_iteration": 2.4638662338256836 + }, + { + "auxiliary_loss_clip": 0.01102122, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03650451, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.5141235793881351, + "language_loss": 0.6951592, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71648353, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12775, + "time_per_iteration": 2.529325246810913 + }, + { + "auxiliary_loss_clip": 0.01024296, + "auxiliary_loss_mlp": 0.0099915, + "balance_loss_clip": 0.9981491, + "balance_loss_mlp": 1.00394726, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.6956565563059588, + "language_loss": 0.56836295, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58859742, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.203125, + "step": 12776, + "time_per_iteration": 3.10646915435791 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02058029, + "balance_loss_mlp": 1.03510964, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.821518021735027, + "language_loss": 0.74034452, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76166189, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12777, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.01100992, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02257133, + "balance_loss_mlp": 1.03636885, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.340152185552387, + "language_loss": 0.70033187, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72167766, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 12778, + "time_per_iteration": 2.4316253662109375 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.01741004, + "balance_loss_mlp": 1.03619504, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.7229591710828633, + "language_loss": 0.70021391, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72150636, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12779, + "time_per_iteration": 2.4962258338928223 + }, + { + "auxiliary_loss_clip": 0.01102633, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.01549852, + "balance_loss_mlp": 1.03702402, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.5025489085425099, + "language_loss": 0.58335769, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60465509, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 12780, + "time_per_iteration": 2.827277898788452 + }, + { + "auxiliary_loss_clip": 0.0110525, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.02303374, + "balance_loss_mlp": 1.03533888, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.7175154048047394, + "language_loss": 0.68096447, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70238441, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69921875, + "step": 12781, + "time_per_iteration": 2.450493574142456 + }, + { + "auxiliary_loss_clip": 0.01099247, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.0181365, + "balance_loss_mlp": 1.03411829, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.4930277529018858, + "language_loss": 0.79351133, + "learning_rate": 5.362320660762016e-07, + "loss": 0.814798, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12782, + "time_per_iteration": 2.473785638809204 + }, + { + "auxiliary_loss_clip": 0.01101943, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.01648557, + "balance_loss_mlp": 1.03457451, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 3.89329070185187, + "language_loss": 0.6701203, + "learning_rate": 5.35966703239153e-07, + "loss": 0.6914283, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12783, + "time_per_iteration": 2.496005058288574 + }, + { + "auxiliary_loss_clip": 0.0110336, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.02068949, + "balance_loss_mlp": 1.0368228, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 2.317173566412315, + "language_loss": 0.68567002, + "learning_rate": 5.357013959183938e-07, + "loss": 0.70703208, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12784, + "time_per_iteration": 2.4193952083587646 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.01677442, + "balance_loss_mlp": 1.03561044, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 2.4397788203349546, + "language_loss": 0.80600178, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82728577, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 12785, + "time_per_iteration": 2.4642157554626465 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.01812065, + "balance_loss_mlp": 1.03647351, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.5675219455195206, + "language_loss": 0.77255261, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79388458, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12786, + "time_per_iteration": 2.6608307361602783 + }, + { + "auxiliary_loss_clip": 0.0109997, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.01797938, + "balance_loss_mlp": 1.03441632, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 2.029037446974208, + "language_loss": 0.58857298, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60986358, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12787, + "time_per_iteration": 2.5195324420928955 + }, + { + "auxiliary_loss_clip": 0.01098338, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01488662, + "balance_loss_mlp": 1.03323674, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.5842728148921028, + "language_loss": 0.75863254, + "learning_rate": 5.346407219994292e-07, + "loss": 0.77988023, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 12788, + "time_per_iteration": 2.427560567855835 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02540421, + "balance_loss_mlp": 1.03627038, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.6525125671595142, + "language_loss": 0.66358525, + "learning_rate": 5.343756924109821e-07, + "loss": 0.6849938, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12789, + "time_per_iteration": 2.484055280685425 + }, + { + "auxiliary_loss_clip": 0.01103699, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02053142, + "balance_loss_mlp": 1.03660512, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.730155675117843, + "language_loss": 0.68648386, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70785522, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12790, + "time_per_iteration": 2.5284645557403564 + }, + { + "auxiliary_loss_clip": 0.01101213, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02015567, + "balance_loss_mlp": 1.03384793, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.6904473195565226, + "language_loss": 0.68665707, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70798862, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12791, + "time_per_iteration": 2.4484951496124268 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01032034, + "balance_loss_clip": 1.02082801, + "balance_loss_mlp": 1.03512239, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.7979814428541672, + "language_loss": 0.79704869, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81836575, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 12792, + "time_per_iteration": 2.397611141204834 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.01751399, + "balance_loss_mlp": 1.04006386, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.8065104235700298, + "language_loss": 0.72902393, + "learning_rate": 5.333161299238673e-07, + "loss": 0.7504257, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 12793, + "time_per_iteration": 2.445250988006592 + }, + { + "auxiliary_loss_clip": 0.01102182, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.02147722, + "balance_loss_mlp": 1.0359565, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.7477925933074476, + "language_loss": 0.63753021, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65888512, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12794, + "time_per_iteration": 2.609574317932129 + }, + { + "auxiliary_loss_clip": 0.01107559, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01955318, + "balance_loss_mlp": 1.03873158, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.4386826522149643, + "language_loss": 0.76442081, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78581011, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 12795, + "time_per_iteration": 2.491729736328125 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.01503158, + "balance_loss_mlp": 1.03450465, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.5564929317372034, + "language_loss": 0.71727788, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73856628, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12796, + "time_per_iteration": 2.4555909633636475 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.01027607, + "balance_loss_clip": 1.01607311, + "balance_loss_mlp": 1.03496242, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 2.1139443574880574, + "language_loss": 0.65011704, + "learning_rate": 5.32257457305499e-07, + "loss": 0.671413, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12797, + "time_per_iteration": 2.4375650882720947 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.01997042, + "balance_loss_mlp": 1.03497744, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.7406268375676737, + "language_loss": 0.91516721, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93651593, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 12798, + "time_per_iteration": 2.4546101093292236 + }, + { + "auxiliary_loss_clip": 0.0110163, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01569748, + "balance_loss_mlp": 1.03515077, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.9252292535695115, + "language_loss": 0.82239765, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84368521, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12799, + "time_per_iteration": 2.44386625289917 + }, + { + "auxiliary_loss_clip": 0.01103323, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.01621604, + "balance_loss_mlp": 1.03646576, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.0094364967525262, + "language_loss": 0.77591789, + "learning_rate": 5.314640372045045e-07, + "loss": 0.79723239, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12800, + "time_per_iteration": 2.397705316543579 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01594234, + "balance_loss_mlp": 1.03569245, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.660947128359647, + "language_loss": 0.83736777, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85871899, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 12801, + "time_per_iteration": 2.4850664138793945 + }, + { + "auxiliary_loss_clip": 0.01101531, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.01908183, + "balance_loss_mlp": 1.03610682, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.968794932529363, + "language_loss": 0.72192085, + "learning_rate": 5.30935368888947e-07, + "loss": 0.7432459, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12802, + "time_per_iteration": 2.423994779586792 + }, + { + "auxiliary_loss_clip": 0.01101064, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01767504, + "balance_loss_mlp": 1.03590822, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7968472672418645, + "language_loss": 0.75812244, + "learning_rate": 5.306711182867747e-07, + "loss": 0.77942592, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12803, + "time_per_iteration": 3.8298709392547607 + }, + { + "auxiliary_loss_clip": 0.0102415, + "auxiliary_loss_mlp": 0.00999256, + "balance_loss_clip": 0.99821299, + "balance_loss_mlp": 1.00390053, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7302044850934681, + "language_loss": 0.55831051, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57854456, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 12804, + "time_per_iteration": 3.058547258377075 + }, + { + "auxiliary_loss_clip": 0.01024727, + "auxiliary_loss_mlp": 0.01002741, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.00439858, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.9747386199890918, + "language_loss": 0.54020375, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56047845, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.203125, + "step": 12805, + "time_per_iteration": 4.5421671867370605 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.02145565, + "balance_loss_mlp": 1.03835249, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 2.1701782975166, + "language_loss": 0.72961175, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75100303, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12806, + "time_per_iteration": 3.833503246307373 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02383482, + "balance_loss_mlp": 1.03555238, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 3.0939147131077878, + "language_loss": 0.75202084, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77340138, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12807, + "time_per_iteration": 2.453640937805176 + }, + { + "auxiliary_loss_clip": 0.01107207, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03789043, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.25264240922501, + "language_loss": 0.79834819, + "learning_rate": 5.293507012327218e-07, + "loss": 0.81975937, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 12808, + "time_per_iteration": 3.863776206970215 + }, + { + "auxiliary_loss_clip": 0.01106296, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.02278161, + "balance_loss_mlp": 1.03690052, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 1.7718685431414871, + "language_loss": 0.79037017, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81178522, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 12809, + "time_per_iteration": 2.50119948387146 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.01414251, + "balance_loss_mlp": 1.03431511, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.5273739274998572, + "language_loss": 0.70192695, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72315288, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 12810, + "time_per_iteration": 2.4800918102264404 + }, + { + "auxiliary_loss_clip": 0.01104583, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01865101, + "balance_loss_mlp": 1.03556144, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.2614131210478465, + "language_loss": 0.78612316, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80748516, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12811, + "time_per_iteration": 2.404200792312622 + }, + { + "auxiliary_loss_clip": 0.01024644, + "auxiliary_loss_mlp": 0.01001291, + "balance_loss_clip": 1.00025964, + "balance_loss_mlp": 1.00433743, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8119263300614926, + "language_loss": 0.56688583, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58714521, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 12812, + "time_per_iteration": 3.1152541637420654 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.0352596, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.6865104586503614, + "language_loss": 0.7190448, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74042261, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12813, + "time_per_iteration": 2.4738786220550537 + }, + { + "auxiliary_loss_clip": 0.01102625, + "auxiliary_loss_mlp": 0.01029531, + "balance_loss_clip": 1.01718116, + "balance_loss_mlp": 1.03553629, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.808315927971449, + "language_loss": 0.66342986, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68475139, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 12814, + "time_per_iteration": 2.454023599624634 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.02005756, + "balance_loss_mlp": 1.03535891, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.9935067754667941, + "language_loss": 0.65677094, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67810559, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12815, + "time_per_iteration": 2.453657627105713 + }, + { + "auxiliary_loss_clip": 0.01101554, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01935291, + "balance_loss_mlp": 1.03516507, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.1548232448255566, + "language_loss": 0.6524539, + "learning_rate": 5.272409343590322e-07, + "loss": 0.6737839, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12816, + "time_per_iteration": 2.407606840133667 + }, + { + "auxiliary_loss_clip": 0.01104205, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.03735924, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.105850100227776, + "language_loss": 0.71998227, + "learning_rate": 5.26977464707133e-07, + "loss": 0.74136674, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 12817, + "time_per_iteration": 2.4196791648864746 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.02102351, + "balance_loss_mlp": 1.03574193, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.9485299894899226, + "language_loss": 0.61153173, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63287747, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12818, + "time_per_iteration": 2.422590970993042 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.01726627, + "balance_loss_mlp": 1.036057, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.7189181201095014, + "language_loss": 0.67140901, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69269538, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6484375, + "step": 12819, + "time_per_iteration": 2.445463180541992 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01026789, + "balance_loss_clip": 1.0150826, + "balance_loss_mlp": 1.03642428, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.8084191100945337, + "language_loss": 0.57428622, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59558845, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12820, + "time_per_iteration": 2.4313409328460693 + }, + { + "auxiliary_loss_clip": 0.01101387, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01476407, + "balance_loss_mlp": 1.03471613, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.656188868997019, + "language_loss": 0.80691266, + "learning_rate": 5.259241447710343e-07, + "loss": 0.82819176, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12821, + "time_per_iteration": 2.495997190475464 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01740658, + "balance_loss_mlp": 1.0356462, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.1643932163706388, + "language_loss": 0.68480009, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70611471, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12822, + "time_per_iteration": 2.390167236328125 + }, + { + "auxiliary_loss_clip": 0.01101193, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.02059174, + "balance_loss_mlp": 1.03602922, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.6982430970073337, + "language_loss": 0.72335845, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74469054, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12823, + "time_per_iteration": 2.492733955383301 + }, + { + "auxiliary_loss_clip": 0.01108942, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02433515, + "balance_loss_mlp": 1.03756452, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.8295063286437603, + "language_loss": 0.76613212, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78759968, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 12824, + "time_per_iteration": 2.4176483154296875 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.01542997, + "balance_loss_mlp": 1.03651464, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 3.0696602507520603, + "language_loss": 0.72657233, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74786729, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12825, + "time_per_iteration": 2.451836109161377 + }, + { + "auxiliary_loss_clip": 0.0109918, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.02191544, + "balance_loss_mlp": 1.03549349, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.391969266660785, + "language_loss": 0.73613906, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75745583, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 12826, + "time_per_iteration": 2.4155168533325195 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.03364134, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.6262733051040712, + "language_loss": 0.81322646, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83455837, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 12827, + "time_per_iteration": 2.459195852279663 + }, + { + "auxiliary_loss_clip": 0.01024065, + "auxiliary_loss_mlp": 0.00999839, + "balance_loss_clip": 0.99881953, + "balance_loss_mlp": 1.00367689, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8804510230026851, + "language_loss": 0.55191517, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57215428, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20410156, + "step": 12828, + "time_per_iteration": 3.203558921813965 + }, + { + "auxiliary_loss_clip": 0.01099176, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.0158478, + "balance_loss_mlp": 1.03485942, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.7353204568908176, + "language_loss": 0.69503725, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71629542, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 12829, + "time_per_iteration": 2.447021722793579 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.01863086, + "balance_loss_mlp": 1.036901, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 4.262950048849265, + "language_loss": 0.79446471, + "learning_rate": 5.235574458679579e-07, + "loss": 0.8158378, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12830, + "time_per_iteration": 2.3964903354644775 + }, + { + "auxiliary_loss_clip": 0.01106244, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.0183301, + "balance_loss_mlp": 1.03630996, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.6021673475847413, + "language_loss": 0.78127801, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80265611, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 12831, + "time_per_iteration": 2.5234055519104004 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.01434898, + "balance_loss_mlp": 1.03424239, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.5450896985633467, + "language_loss": 0.60894483, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63020408, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 12832, + "time_per_iteration": 2.492701530456543 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.02072477, + "balance_loss_mlp": 1.03454924, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.7425232320118673, + "language_loss": 0.79137206, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81273079, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 12833, + "time_per_iteration": 2.497288942337036 + }, + { + "auxiliary_loss_clip": 0.01024056, + "auxiliary_loss_mlp": 0.01003026, + "balance_loss_clip": 1.00204265, + "balance_loss_mlp": 1.00360727, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8686662344719275, + "language_loss": 0.55410403, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57437485, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 12834, + "time_per_iteration": 3.03043532371521 + }, + { + "auxiliary_loss_clip": 0.01103044, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.01647997, + "balance_loss_mlp": 1.03643119, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.117345370793711, + "language_loss": 0.72845638, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74977142, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 12835, + "time_per_iteration": 2.446268320083618 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.02434242, + "balance_loss_mlp": 1.03385723, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.6490070086393855, + "language_loss": 0.70007384, + "learning_rate": 5.219821655586814e-07, + "loss": 0.7214548, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 12836, + "time_per_iteration": 2.4494271278381348 + }, + { + "auxiliary_loss_clip": 0.01100539, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03515959, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.6293860419166157, + "language_loss": 0.59337658, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61469114, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12837, + "time_per_iteration": 2.5418989658355713 + }, + { + "auxiliary_loss_clip": 0.01023613, + "auxiliary_loss_mlp": 0.00999355, + "balance_loss_clip": 0.99835348, + "balance_loss_mlp": 1.00311017, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8631972633412854, + "language_loss": 0.5581463, + "learning_rate": 5.214575203887666e-07, + "loss": 0.578376, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20507812, + "step": 12838, + "time_per_iteration": 3.0269720554351807 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.01461673, + "balance_loss_mlp": 1.03597295, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.4471669974150347, + "language_loss": 0.69294447, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71420795, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 12839, + "time_per_iteration": 2.4177730083465576 + }, + { + "auxiliary_loss_clip": 0.01099889, + "auxiliary_loss_mlp": 0.01025095, + "balance_loss_clip": 1.01409793, + "balance_loss_mlp": 1.03574765, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.7653669822284475, + "language_loss": 0.79856348, + "learning_rate": 5.209330994847647e-07, + "loss": 0.81981325, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 12840, + "time_per_iteration": 2.5179991722106934 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01843691, + "balance_loss_mlp": 1.0361371, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.7784222568456114, + "language_loss": 0.79938293, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82070708, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 12841, + "time_per_iteration": 2.5245449542999268 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03537869, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.4922109541948092, + "language_loss": 0.76314819, + "learning_rate": 5.204089029262208e-07, + "loss": 0.7844606, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 12842, + "time_per_iteration": 2.5023560523986816 + }, + { + "auxiliary_loss_clip": 0.01104825, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02228308, + "balance_loss_mlp": 1.03711128, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 2.1043616353717525, + "language_loss": 0.68631554, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70770752, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12843, + "time_per_iteration": 2.493771553039551 + }, + { + "auxiliary_loss_clip": 0.01103415, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.01814365, + "balance_loss_mlp": 1.03442502, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 2.427096295958664, + "language_loss": 0.73946643, + "learning_rate": 5.198849307926465e-07, + "loss": 0.76079392, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 12844, + "time_per_iteration": 3.8521201610565186 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01715553, + "balance_loss_mlp": 1.03452098, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.3727417180259405, + "language_loss": 0.7147876, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73607367, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 12845, + "time_per_iteration": 2.480782985687256 + }, + { + "auxiliary_loss_clip": 0.01098431, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01647639, + "balance_loss_mlp": 1.03456306, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.8692274529253097, + "language_loss": 0.64329362, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66455245, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 12846, + "time_per_iteration": 3.939861297607422 + }, + { + "auxiliary_loss_clip": 0.01024017, + "auxiliary_loss_mlp": 0.00999429, + "balance_loss_clip": 0.99847573, + "balance_loss_mlp": 1.00368702, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7797260608055787, + "language_loss": 0.61791992, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63815439, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 12847, + "time_per_iteration": 4.373151063919067 + }, + { + "auxiliary_loss_clip": 0.01099082, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.01329207, + "balance_loss_mlp": 1.03237033, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.8104305553743092, + "language_loss": 0.78874886, + "learning_rate": 5.188376601182732e-07, + "loss": 0.80998737, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 12848, + "time_per_iteration": 2.4621658325195312 + }, + { + "auxiliary_loss_clip": 0.01104725, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.02015185, + "balance_loss_mlp": 1.03665447, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.8870380118998122, + "language_loss": 0.73187292, + "learning_rate": 5.185759828394261e-07, + "loss": 0.75323451, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 12849, + "time_per_iteration": 2.423586368560791 + }, + { + "auxiliary_loss_clip": 0.01099584, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.01813483, + "balance_loss_mlp": 1.03409362, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.7816955634054865, + "language_loss": 0.78761244, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80890489, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12850, + "time_per_iteration": 3.8340566158294678 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.02208018, + "balance_loss_mlp": 1.03316355, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5411131818733386, + "language_loss": 0.79572296, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81707186, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12851, + "time_per_iteration": 2.4925901889801025 + }, + { + "auxiliary_loss_clip": 0.01100454, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.01792979, + "balance_loss_mlp": 1.03538489, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.50632412923142, + "language_loss": 0.73631006, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75761741, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6484375, + "step": 12852, + "time_per_iteration": 2.4682977199554443 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02231097, + "balance_loss_mlp": 1.03296447, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.8447801118424108, + "language_loss": 0.8239882, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84530675, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 12853, + "time_per_iteration": 2.4569756984710693 + }, + { + "auxiliary_loss_clip": 0.01023792, + "auxiliary_loss_mlp": 0.0099718, + "balance_loss_clip": 0.99619693, + "balance_loss_mlp": 1.00358176, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.9862475584721329, + "language_loss": 0.54506302, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56527275, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 12854, + "time_per_iteration": 3.091365098953247 + }, + { + "auxiliary_loss_clip": 0.01103537, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.0352025, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.475002899268902, + "language_loss": 0.71589357, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73721445, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 12855, + "time_per_iteration": 2.563339948654175 + }, + { + "auxiliary_loss_clip": 0.01100584, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.01721025, + "balance_loss_mlp": 1.03491831, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.6853102907434419, + "language_loss": 0.67508936, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69639283, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 12856, + "time_per_iteration": 2.4246950149536133 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.0153966, + "balance_loss_mlp": 1.03492117, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.6664497759881594, + "language_loss": 0.78636038, + "learning_rate": 5.164845877686162e-07, + "loss": 0.8076548, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12857, + "time_per_iteration": 2.4259722232818604 + }, + { + "auxiliary_loss_clip": 0.01099797, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.0170691, + "balance_loss_mlp": 1.03505707, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 2.4693745762825627, + "language_loss": 0.78503597, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80632401, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 12858, + "time_per_iteration": 2.414808988571167 + }, + { + "auxiliary_loss_clip": 0.0110013, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01887894, + "balance_loss_mlp": 1.0332911, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.1506807950165716, + "language_loss": 0.76832533, + "learning_rate": 5.159623013532591e-07, + "loss": 0.78963083, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12859, + "time_per_iteration": 2.4226794242858887 + }, + { + "auxiliary_loss_clip": 0.01098676, + "auxiliary_loss_mlp": 0.01027748, + "balance_loss_clip": 1.01765668, + "balance_loss_mlp": 1.03636694, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.3976193464700644, + "language_loss": 0.67598879, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69725305, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62109375, + "step": 12860, + "time_per_iteration": 2.4838390350341797 + }, + { + "auxiliary_loss_clip": 0.01102762, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02416456, + "balance_loss_mlp": 1.03352654, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.447865183826217, + "language_loss": 0.7403549, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76174939, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 12861, + "time_per_iteration": 2.4177722930908203 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01744413, + "balance_loss_mlp": 1.03674173, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.5943042288451297, + "language_loss": 0.74818659, + "learning_rate": 5.15179293816405e-07, + "loss": 0.76953417, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 12862, + "time_per_iteration": 2.502509832382202 + }, + { + "auxiliary_loss_clip": 0.01098685, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01839459, + "balance_loss_mlp": 1.03460789, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.605143243310102, + "language_loss": 0.82941031, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85068727, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12863, + "time_per_iteration": 2.435492753982544 + }, + { + "auxiliary_loss_clip": 0.01099256, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.01841044, + "balance_loss_mlp": 1.03421164, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.686286227621035, + "language_loss": 0.73311162, + "learning_rate": 5.146575702980898e-07, + "loss": 0.7544024, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12864, + "time_per_iteration": 2.4345412254333496 + }, + { + "auxiliary_loss_clip": 0.01100211, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.02182722, + "balance_loss_mlp": 1.03336382, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.7236073313381683, + "language_loss": 0.82668412, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84801233, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 12865, + "time_per_iteration": 2.489175796508789 + }, + { + "auxiliary_loss_clip": 0.01106204, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.01882756, + "balance_loss_mlp": 1.03688681, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 1.9919400131202358, + "language_loss": 0.71579105, + "learning_rate": 5.141360720771077e-07, + "loss": 0.73717141, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 12866, + "time_per_iteration": 2.4729628562927246 + }, + { + "auxiliary_loss_clip": 0.01103336, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.01476479, + "balance_loss_mlp": 1.03699803, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.21518020983948, + "language_loss": 0.64429164, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66559094, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12867, + "time_per_iteration": 2.3936469554901123 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02214408, + "balance_loss_mlp": 1.03422713, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.4977465030205475, + "language_loss": 0.70845938, + "learning_rate": 5.136147992325595e-07, + "loss": 0.72978157, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 12868, + "time_per_iteration": 2.5017075538635254 + }, + { + "auxiliary_loss_clip": 0.01103278, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.0157187, + "balance_loss_mlp": 1.03648961, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.3483431493436506, + "language_loss": 0.78185302, + "learning_rate": 5.133542473511578e-07, + "loss": 0.803159, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 12869, + "time_per_iteration": 2.4156572818756104 + }, + { + "auxiliary_loss_clip": 0.01095592, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.01597238, + "balance_loss_mlp": 1.0325917, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 2.073469705859901, + "language_loss": 0.73596758, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75719839, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 12870, + "time_per_iteration": 2.517237663269043 + }, + { + "auxiliary_loss_clip": 0.01101602, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.01848328, + "balance_loss_mlp": 1.03500986, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.174151142441679, + "language_loss": 0.75611806, + "learning_rate": 5.12833312719501e-07, + "loss": 0.77743572, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 12871, + "time_per_iteration": 2.400402069091797 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0195806, + "balance_loss_mlp": 1.03400016, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.515902079714309, + "language_loss": 0.69396317, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71526158, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 12872, + "time_per_iteration": 2.454831838607788 + }, + { + "auxiliary_loss_clip": 0.01101254, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01894081, + "balance_loss_mlp": 1.03436494, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.1128263848604303, + "language_loss": 0.85076445, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87209249, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 12873, + "time_per_iteration": 2.413208484649658 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.02108884, + "balance_loss_mlp": 1.03663659, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.3833664106096357, + "language_loss": 0.65228915, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67364746, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12874, + "time_per_iteration": 2.511897563934326 + }, + { + "auxiliary_loss_clip": 0.01101804, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01711988, + "balance_loss_mlp": 1.03627491, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.672939756784885, + "language_loss": 0.62344849, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64475727, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12875, + "time_per_iteration": 2.4547970294952393 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.01926494, + "balance_loss_mlp": 1.03329086, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.7114118176893034, + "language_loss": 0.65592134, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67723036, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12876, + "time_per_iteration": 2.507066011428833 + }, + { + "auxiliary_loss_clip": 0.01097976, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.01795566, + "balance_loss_mlp": 1.03334641, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.056913252626623, + "language_loss": 0.71540773, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73668182, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 12877, + "time_per_iteration": 2.4201643466949463 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.0190177, + "balance_loss_mlp": 1.03517962, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.8044293280530723, + "language_loss": 0.82859612, + "learning_rate": 5.110118184224736e-07, + "loss": 0.84995025, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 12878, + "time_per_iteration": 2.4779839515686035 + }, + { + "auxiliary_loss_clip": 0.01101355, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01875401, + "balance_loss_mlp": 1.03469586, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.7446777293969558, + "language_loss": 0.73307019, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75439632, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 12879, + "time_per_iteration": 2.4160289764404297 + }, + { + "auxiliary_loss_clip": 0.01095247, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01504445, + "balance_loss_mlp": 1.03218174, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 2.0530344125877824, + "language_loss": 0.79587936, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81709713, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 12880, + "time_per_iteration": 2.5343987941741943 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.02097631, + "balance_loss_mlp": 1.03499806, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.5022230028348473, + "language_loss": 0.69992185, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72123551, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 12881, + "time_per_iteration": 2.4520153999328613 + }, + { + "auxiliary_loss_clip": 0.01105007, + "auxiliary_loss_mlp": 0.01035783, + "balance_loss_clip": 1.02330136, + "balance_loss_mlp": 1.0357368, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.221505140298077, + "language_loss": 0.84215307, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86356097, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 12882, + "time_per_iteration": 2.4450690746307373 + }, + { + "auxiliary_loss_clip": 0.01023891, + "auxiliary_loss_mlp": 0.01012882, + "balance_loss_clip": 1.01180887, + "balance_loss_mlp": 1.00356591, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.8021199290846766, + "language_loss": 0.6040681, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62443578, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.203125, + "step": 12883, + "time_per_iteration": 3.0097620487213135 + }, + { + "auxiliary_loss_clip": 0.01103604, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.02008712, + "balance_loss_mlp": 1.03575671, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.7162492869747636, + "language_loss": 0.72789645, + "learning_rate": 5.094527395086416e-07, + "loss": 0.7492559, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 12884, + "time_per_iteration": 2.4377074241638184 + }, + { + "auxiliary_loss_clip": 0.01100524, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.02003515, + "balance_loss_mlp": 1.0354799, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 3.230363758289503, + "language_loss": 0.80970025, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83101392, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 12885, + "time_per_iteration": 2.4225785732269287 + }, + { + "auxiliary_loss_clip": 0.01098684, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02272451, + "balance_loss_mlp": 1.03376412, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.8035422481179095, + "language_loss": 0.64108509, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66240609, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 12886, + "time_per_iteration": 3.857712507247925 + }, + { + "auxiliary_loss_clip": 0.01099608, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.0177393, + "balance_loss_mlp": 1.03219748, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 2.0473331213234327, + "language_loss": 0.69581932, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71709955, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 12887, + "time_per_iteration": 2.423344373703003 + }, + { + "auxiliary_loss_clip": 0.01097443, + "auxiliary_loss_mlp": 0.01028368, + "balance_loss_clip": 1.01763892, + "balance_loss_mlp": 1.03330386, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.7264815005579048, + "language_loss": 0.70614457, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72740269, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 12888, + "time_per_iteration": 3.8539748191833496 + }, + { + "auxiliary_loss_clip": 0.01101208, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.01708579, + "balance_loss_mlp": 1.03361416, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 2.628922406260807, + "language_loss": 0.81764227, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83894438, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12889, + "time_per_iteration": 3.9081172943115234 + }, + { + "auxiliary_loss_clip": 0.01100926, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01818752, + "balance_loss_mlp": 1.0352304, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 1.7934757747385575, + "language_loss": 0.79690224, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81820571, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12890, + "time_per_iteration": 2.4259889125823975 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.02038956, + "balance_loss_mlp": 1.03786349, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 1.8078298047405903, + "language_loss": 0.6619277, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68330312, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12891, + "time_per_iteration": 2.421792984008789 + }, + { + "auxiliary_loss_clip": 0.01100105, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.01623988, + "balance_loss_mlp": 1.03472996, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.4457356185681014, + "language_loss": 0.78705311, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80833197, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 12892, + "time_per_iteration": 4.022496223449707 + }, + { + "auxiliary_loss_clip": 0.0110464, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01451278, + "balance_loss_mlp": 1.03709579, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 1.95553815104522, + "language_loss": 0.6747188, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69602484, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12893, + "time_per_iteration": 2.4064764976501465 + }, + { + "auxiliary_loss_clip": 0.010241, + "auxiliary_loss_mlp": 0.01003293, + "balance_loss_clip": 1.0022974, + "balance_loss_mlp": 1.00361943, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8057156528399092, + "language_loss": 0.58470869, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60498261, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20507812, + "step": 12894, + "time_per_iteration": 3.0993287563323975 + }, + { + "auxiliary_loss_clip": 0.01103557, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01871347, + "balance_loss_mlp": 1.03571117, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 1.980811218300561, + "language_loss": 0.78687382, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80821562, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12895, + "time_per_iteration": 2.4280591011047363 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.01484489, + "balance_loss_mlp": 1.03593993, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.9018795725509905, + "language_loss": 0.67731452, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69859904, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12896, + "time_per_iteration": 2.461527109146118 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.0224123, + "balance_loss_mlp": 1.0342598, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.7046546419810793, + "language_loss": 0.69181269, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71314216, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12897, + "time_per_iteration": 2.4287121295928955 + }, + { + "auxiliary_loss_clip": 0.01103573, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02193165, + "balance_loss_mlp": 1.03643906, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.6934570873388384, + "language_loss": 0.75021553, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77159327, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12898, + "time_per_iteration": 2.476008415222168 + }, + { + "auxiliary_loss_clip": 0.01101597, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01574945, + "balance_loss_mlp": 1.03492308, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.107133651932301, + "language_loss": 0.70084441, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72214341, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66796875, + "step": 12899, + "time_per_iteration": 2.4085845947265625 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01829982, + "balance_loss_mlp": 1.03514957, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 1.8299634820170116, + "language_loss": 0.74540645, + "learning_rate": 5.053051493286453e-07, + "loss": 0.76671344, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12900, + "time_per_iteration": 2.463158369064331 + }, + { + "auxiliary_loss_clip": 0.01097147, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.02339911, + "balance_loss_mlp": 1.03308296, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.7671979453264242, + "language_loss": 0.77766836, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79898179, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12901, + "time_per_iteration": 2.487149715423584 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03825235, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.6889669978659576, + "language_loss": 0.77270627, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79401928, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 12902, + "time_per_iteration": 2.522047758102417 + }, + { + "auxiliary_loss_clip": 0.01099422, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.01794279, + "balance_loss_mlp": 1.03434253, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.6966870042115003, + "language_loss": 0.73324692, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75452751, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 12903, + "time_per_iteration": 2.4301648139953613 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.01404119, + "balance_loss_mlp": 1.03656173, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.1229192794074025, + "language_loss": 0.76073396, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78201139, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 12904, + "time_per_iteration": 2.4397873878479004 + }, + { + "auxiliary_loss_clip": 0.01096338, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.01313651, + "balance_loss_mlp": 1.03401458, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.282889081568611, + "language_loss": 0.68131924, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70252246, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 12905, + "time_per_iteration": 2.444009780883789 + }, + { + "auxiliary_loss_clip": 0.01098458, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.01636112, + "balance_loss_mlp": 1.03590798, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.6520534873626833, + "language_loss": 0.67321658, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69447428, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 12906, + "time_per_iteration": 2.5024046897888184 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.02074146, + "balance_loss_mlp": 1.03560805, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 3.183876280395432, + "language_loss": 0.81314665, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83447266, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 12907, + "time_per_iteration": 2.3983490467071533 + }, + { + "auxiliary_loss_clip": 0.01098064, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.02001524, + "balance_loss_mlp": 1.0352093, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.1955762882014604, + "language_loss": 0.67891413, + "learning_rate": 5.032367929052685e-07, + "loss": 0.70020467, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 12908, + "time_per_iteration": 2.4205586910247803 + }, + { + "auxiliary_loss_clip": 0.0110402, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.02561998, + "balance_loss_mlp": 1.0367105, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.5072254351199776, + "language_loss": 0.70509684, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72650868, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12909, + "time_per_iteration": 2.411200523376465 + }, + { + "auxiliary_loss_clip": 0.01098463, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.02122903, + "balance_loss_mlp": 1.03443766, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.8009791603999328, + "language_loss": 0.677131, + "learning_rate": 5.027202711775324e-07, + "loss": 0.69843423, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12910, + "time_per_iteration": 2.4990389347076416 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.01995671, + "balance_loss_mlp": 1.03720117, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.6715228881797681, + "language_loss": 0.71815217, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73948646, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 12911, + "time_per_iteration": 2.4534413814544678 + }, + { + "auxiliary_loss_clip": 0.01105044, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01966429, + "balance_loss_mlp": 1.03769255, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.415333717110697, + "language_loss": 0.63629675, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65766907, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12912, + "time_per_iteration": 2.485800266265869 + }, + { + "auxiliary_loss_clip": 0.01024108, + "auxiliary_loss_mlp": 0.00998178, + "balance_loss_clip": 0.99713534, + "balance_loss_mlp": 1.00357115, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 1.0865465621016743, + "language_loss": 0.53211093, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55233377, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12913, + "time_per_iteration": 3.1158273220062256 + }, + { + "auxiliary_loss_clip": 0.01102849, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.02007604, + "balance_loss_mlp": 1.03618884, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 2.955130949159741, + "language_loss": 0.62075317, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64210051, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12914, + "time_per_iteration": 2.4749767780303955 + }, + { + "auxiliary_loss_clip": 0.01099375, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03413486, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.8057060992355358, + "language_loss": 0.82471168, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84603214, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65234375, + "step": 12915, + "time_per_iteration": 2.439039468765259 + }, + { + "auxiliary_loss_clip": 0.01103501, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.01918375, + "balance_loss_mlp": 1.03486073, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.6623901678084019, + "language_loss": 0.7471149, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76846689, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12916, + "time_per_iteration": 2.494717836380005 + }, + { + "auxiliary_loss_clip": 0.01099429, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.01588905, + "balance_loss_mlp": 1.03332853, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.7727217475878263, + "language_loss": 0.65696949, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67823803, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 12917, + "time_per_iteration": 2.718024969100952 + }, + { + "auxiliary_loss_clip": 0.01100019, + "auxiliary_loss_mlp": 0.01034508, + "balance_loss_clip": 1.02317166, + "balance_loss_mlp": 1.03343606, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.709981739113561, + "language_loss": 0.64356208, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66490734, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12918, + "time_per_iteration": 2.5265743732452393 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.02063048, + "balance_loss_mlp": 1.03485835, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.1453981037999386, + "language_loss": 0.73354542, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75485885, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12919, + "time_per_iteration": 2.436053514480591 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01979434, + "balance_loss_mlp": 1.03540444, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.524282476401475, + "language_loss": 0.79217321, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81351054, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12920, + "time_per_iteration": 2.4638402462005615 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.01851654, + "balance_loss_mlp": 1.03587747, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.5839883144130948, + "language_loss": 0.70594597, + "learning_rate": 4.998834633291829e-07, + "loss": 0.72726625, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12921, + "time_per_iteration": 2.4318997859954834 + }, + { + "auxiliary_loss_clip": 0.01103624, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01492715, + "balance_loss_mlp": 1.03501809, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.7058717810568553, + "language_loss": 0.76330459, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78461355, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 12922, + "time_per_iteration": 2.470374345779419 + }, + { + "auxiliary_loss_clip": 0.01102145, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01991987, + "balance_loss_mlp": 1.03619885, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.882909865169764, + "language_loss": 0.80363363, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82497096, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 12923, + "time_per_iteration": 2.488701343536377 + }, + { + "auxiliary_loss_clip": 0.01102496, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.02036023, + "balance_loss_mlp": 1.03716397, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 1.9867390382218033, + "language_loss": 0.92483282, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94617379, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12924, + "time_per_iteration": 2.452601194381714 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.01750469, + "balance_loss_mlp": 1.03356338, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 1.980221846763212, + "language_loss": 0.65940827, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68069565, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 12925, + "time_per_iteration": 2.4850525856018066 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.02330625, + "balance_loss_mlp": 1.03621173, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.7141356167818045, + "language_loss": 0.71911299, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74049789, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 12926, + "time_per_iteration": 2.4577598571777344 + }, + { + "auxiliary_loss_clip": 0.01103729, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.01420105, + "balance_loss_mlp": 1.03604841, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.8312057216887105, + "language_loss": 0.65467525, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67598033, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 12927, + "time_per_iteration": 2.4614973068237305 + }, + { + "auxiliary_loss_clip": 0.01101116, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03512836, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.745612038393379, + "language_loss": 0.72182518, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74320054, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 12928, + "time_per_iteration": 3.8557302951812744 + }, + { + "auxiliary_loss_clip": 0.01097726, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01759386, + "balance_loss_mlp": 1.03366995, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6060667874854504, + "language_loss": 0.73954302, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76081246, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12929, + "time_per_iteration": 2.4402310848236084 + }, + { + "auxiliary_loss_clip": 0.01101677, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.03536963, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.8904453576029416, + "language_loss": 0.77982825, + "learning_rate": 4.975675577495377e-07, + "loss": 0.80113542, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12930, + "time_per_iteration": 3.86580491065979 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.01922631, + "balance_loss_mlp": 1.03665566, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.7459832422973112, + "language_loss": 0.79347777, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81480014, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 12931, + "time_per_iteration": 3.8444814682006836 + }, + { + "auxiliary_loss_clip": 0.01023847, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99943125, + "balance_loss_mlp": 1.00351429, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8066088266331831, + "language_loss": 0.59735709, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61760002, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.203125, + "step": 12932, + "time_per_iteration": 3.025099039077759 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.01822686, + "balance_loss_mlp": 1.03569841, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.4815322595088087, + "language_loss": 0.76235545, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78367525, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12933, + "time_per_iteration": 4.019074440002441 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02023768, + "balance_loss_mlp": 1.03603268, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.0481953339666026, + "language_loss": 0.73607898, + "learning_rate": 4.965397472402215e-07, + "loss": 0.7574268, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 12934, + "time_per_iteration": 2.4444801807403564 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.01344395, + "balance_loss_mlp": 1.03648293, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.8918830226491183, + "language_loss": 0.70461309, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72590244, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12935, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.02545476, + "balance_loss_mlp": 1.03640771, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.5340308714380857, + "language_loss": 0.83742738, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85883445, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 12936, + "time_per_iteration": 2.4495856761932373 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.01662064, + "balance_loss_mlp": 1.03480935, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 2.0135584494243255, + "language_loss": 0.67168462, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69298995, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6875, + "step": 12937, + "time_per_iteration": 2.4478330612182617 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01624966, + "balance_loss_mlp": 1.03470469, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.648957424958238, + "language_loss": 0.868128, + "learning_rate": 4.955128489126777e-07, + "loss": 0.88942349, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12938, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.01101697, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.01571488, + "balance_loss_mlp": 1.03527653, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 1.8176002406557528, + "language_loss": 0.85162985, + "learning_rate": 4.95256266932218e-07, + "loss": 0.8729248, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12939, + "time_per_iteration": 2.465057611465454 + }, + { + "auxiliary_loss_clip": 0.01097955, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.0204885, + "balance_loss_mlp": 1.03464723, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.9198356417092663, + "language_loss": 0.68793273, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70922846, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 12940, + "time_per_iteration": 2.4191107749938965 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01024954, + "balance_loss_clip": 1.01418972, + "balance_loss_mlp": 1.03387284, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.6186124498470607, + "language_loss": 0.77783638, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79909098, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 12941, + "time_per_iteration": 2.5182301998138428 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02010357, + "balance_loss_mlp": 1.03493428, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.252237972252455, + "language_loss": 0.73223758, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75360417, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12942, + "time_per_iteration": 2.5156443119049072 + }, + { + "auxiliary_loss_clip": 0.01097922, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.02004635, + "balance_loss_mlp": 1.03366685, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 3.1295555400179653, + "language_loss": 0.6771059, + "learning_rate": 4.942305097079751e-07, + "loss": 0.69839656, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 12943, + "time_per_iteration": 2.4742066860198975 + }, + { + "auxiliary_loss_clip": 0.01023917, + "auxiliary_loss_mlp": 0.00999519, + "balance_loss_clip": 0.99852365, + "balance_loss_mlp": 1.00365448, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7816270653723761, + "language_loss": 0.5855267, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60576105, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 12944, + "time_per_iteration": 3.1933257579803467 + }, + { + "auxiliary_loss_clip": 0.01103658, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.0226059, + "balance_loss_mlp": 1.03550398, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.222805879365814, + "language_loss": 0.6770618, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69845027, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12945, + "time_per_iteration": 2.4619064331054688 + }, + { + "auxiliary_loss_clip": 0.01101979, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.01970923, + "balance_loss_mlp": 1.03608465, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.9340302005475807, + "language_loss": 0.69121152, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71254241, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 12946, + "time_per_iteration": 2.459763526916504 + }, + { + "auxiliary_loss_clip": 0.0110194, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01551938, + "balance_loss_mlp": 1.03543854, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 2.351828874315234, + "language_loss": 0.65289766, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67419076, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12947, + "time_per_iteration": 2.4477789402008057 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.0360136, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.161531176276814, + "language_loss": 0.65099561, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67233521, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 12948, + "time_per_iteration": 2.4290242195129395 + }, + { + "auxiliary_loss_clip": 0.01100958, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.01845825, + "balance_loss_mlp": 1.03465629, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.6859142998281702, + "language_loss": 0.74930477, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77061522, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 12949, + "time_per_iteration": 2.4495837688446045 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.03724563, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.519597637019559, + "language_loss": 0.68952882, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71091413, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12950, + "time_per_iteration": 2.4255573749542236 + }, + { + "auxiliary_loss_clip": 0.01099665, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.01623845, + "balance_loss_mlp": 1.0328927, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.6317845562293505, + "language_loss": 0.71912777, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74040663, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 12951, + "time_per_iteration": 2.481668710708618 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.017977, + "balance_loss_mlp": 1.03646922, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.6634043770166038, + "language_loss": 0.65471166, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67600083, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 12952, + "time_per_iteration": 2.4531540870666504 + }, + { + "auxiliary_loss_clip": 0.01093756, + "auxiliary_loss_mlp": 0.01026755, + "balance_loss_clip": 1.01577616, + "balance_loss_mlp": 1.03269386, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.5845487757509182, + "language_loss": 0.81134123, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83254635, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.609375, + "step": 12953, + "time_per_iteration": 2.463089942932129 + }, + { + "auxiliary_loss_clip": 0.01106842, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01938844, + "balance_loss_mlp": 1.03845859, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 3.927672519957359, + "language_loss": 0.77081442, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79219466, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 12954, + "time_per_iteration": 2.407898187637329 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.0102569, + "balance_loss_clip": 1.01457942, + "balance_loss_mlp": 1.03563237, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.509444537933962, + "language_loss": 0.72937489, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7506628, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 12955, + "time_per_iteration": 2.4522764682769775 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.0172863, + "balance_loss_mlp": 1.03384817, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.439262912645897, + "language_loss": 0.68722045, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70851612, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12956, + "time_per_iteration": 2.4333713054656982 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.01781666, + "balance_loss_mlp": 1.03511453, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.517896090927025, + "language_loss": 0.76230508, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78359848, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 12957, + "time_per_iteration": 2.503735065460205 + }, + { + "auxiliary_loss_clip": 0.01100381, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.01647925, + "balance_loss_mlp": 1.03468633, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.5979731248356082, + "language_loss": 0.77661026, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79789662, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 12958, + "time_per_iteration": 2.470494270324707 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.02521682, + "balance_loss_mlp": 1.03665078, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.8919094933835359, + "language_loss": 0.71729428, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73870701, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12959, + "time_per_iteration": 2.4404170513153076 + }, + { + "auxiliary_loss_clip": 0.01099647, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.02000022, + "balance_loss_mlp": 1.03415179, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 2.165413341103088, + "language_loss": 0.7770282, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79834014, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12960, + "time_per_iteration": 2.4568068981170654 + }, + { + "auxiliary_loss_clip": 0.01103395, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02233458, + "balance_loss_mlp": 1.03674865, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.9020069613466535, + "language_loss": 0.75351453, + "learning_rate": 4.896259167586385e-07, + "loss": 0.7748946, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12961, + "time_per_iteration": 2.4992313385009766 + }, + { + "auxiliary_loss_clip": 0.0109806, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02412987, + "balance_loss_mlp": 1.03634429, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.602325654578752, + "language_loss": 0.73415077, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75548315, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 12962, + "time_per_iteration": 2.4623515605926514 + }, + { + "auxiliary_loss_clip": 0.01100833, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.01642597, + "balance_loss_mlp": 1.03580284, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 4.864590611193701, + "language_loss": 0.6971066, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71839404, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 12963, + "time_per_iteration": 2.4501214027404785 + }, + { + "auxiliary_loss_clip": 0.01102284, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.01936793, + "balance_loss_mlp": 1.0372932, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.8027321276281432, + "language_loss": 0.63654995, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65788043, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 12964, + "time_per_iteration": 2.491323709487915 + }, + { + "auxiliary_loss_clip": 0.01102129, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.02017188, + "balance_loss_mlp": 1.03510189, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.518939457679847, + "language_loss": 0.7682904, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78962815, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 12965, + "time_per_iteration": 2.528763771057129 + }, + { + "auxiliary_loss_clip": 0.01094543, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.01648712, + "balance_loss_mlp": 1.03374982, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 2.0628769649637273, + "language_loss": 0.73018187, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75139767, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.609375, + "step": 12966, + "time_per_iteration": 2.484900951385498 + }, + { + "auxiliary_loss_clip": 0.01098331, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01554668, + "balance_loss_mlp": 1.03585887, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.5008219463106178, + "language_loss": 0.73900837, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76024604, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.625, + "step": 12967, + "time_per_iteration": 2.478590250015259 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.01025264, + "balance_loss_clip": 1.01353419, + "balance_loss_mlp": 1.03645122, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.127947897129968, + "language_loss": 0.72439355, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74567437, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12968, + "time_per_iteration": 2.4800057411193848 + }, + { + "auxiliary_loss_clip": 0.01099689, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02313745, + "balance_loss_mlp": 1.0356847, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 2.4882382801625114, + "language_loss": 0.6027385, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62408233, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12969, + "time_per_iteration": 3.8256025314331055 + }, + { + "auxiliary_loss_clip": 0.01095828, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.01716805, + "balance_loss_mlp": 1.03350222, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.7218656768380223, + "language_loss": 0.70345086, + "learning_rate": 4.873305754846811e-07, + "loss": 0.7246843, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62109375, + "step": 12970, + "time_per_iteration": 2.4424326419830322 + }, + { + "auxiliary_loss_clip": 0.01102147, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01964164, + "balance_loss_mlp": 1.03676975, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.5981872425492996, + "language_loss": 0.72214878, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74348849, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 12971, + "time_per_iteration": 4.000694990158081 + }, + { + "auxiliary_loss_clip": 0.01105251, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.01978493, + "balance_loss_mlp": 1.03616154, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.9065262783110748, + "language_loss": 0.74722421, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76859379, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 12972, + "time_per_iteration": 3.843189001083374 + }, + { + "auxiliary_loss_clip": 0.01099808, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.01216161, + "balance_loss_mlp": 1.03417563, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 2.146033088576411, + "language_loss": 0.71397805, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73521698, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12973, + "time_per_iteration": 2.4355766773223877 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.01677775, + "balance_loss_mlp": 1.03420782, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 2.289500877533027, + "language_loss": 0.77711248, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79836202, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12974, + "time_per_iteration": 2.4445388317108154 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.01833785, + "balance_loss_mlp": 1.03401935, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.507070733985586, + "language_loss": 0.69106656, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71234584, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12975, + "time_per_iteration": 4.026258230209351 + }, + { + "auxiliary_loss_clip": 0.01098461, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01754749, + "balance_loss_mlp": 1.03443432, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 3.483605083933044, + "language_loss": 0.81612706, + "learning_rate": 4.858029287593739e-07, + "loss": 0.83739734, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12976, + "time_per_iteration": 2.39786696434021 + }, + { + "auxiliary_loss_clip": 0.01102312, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.01467419, + "balance_loss_mlp": 1.03479075, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.6747970494866666, + "language_loss": 0.6597501, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68103826, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 12977, + "time_per_iteration": 2.509279489517212 + }, + { + "auxiliary_loss_clip": 0.01097395, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01837647, + "balance_loss_mlp": 1.03446436, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 2.0224689564916236, + "language_loss": 0.74458158, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76584208, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 12978, + "time_per_iteration": 2.5191776752471924 + }, + { + "auxiliary_loss_clip": 0.01103093, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.01786351, + "balance_loss_mlp": 1.03624713, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 2.0922083765089523, + "language_loss": 0.62049854, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64182818, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 12979, + "time_per_iteration": 2.5099925994873047 + }, + { + "auxiliary_loss_clip": 0.01099974, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.01567745, + "balance_loss_mlp": 1.03531623, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.9372520913604323, + "language_loss": 0.77348953, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79476345, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 12980, + "time_per_iteration": 2.4801688194274902 + }, + { + "auxiliary_loss_clip": 0.01102229, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.02032888, + "balance_loss_mlp": 1.03540671, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 3.6673789740050484, + "language_loss": 0.78181487, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80315006, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 12981, + "time_per_iteration": 2.4743919372558594 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01932621, + "balance_loss_mlp": 1.03766203, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.7572805466494936, + "language_loss": 0.7283631, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74971128, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 12982, + "time_per_iteration": 2.464043140411377 + }, + { + "auxiliary_loss_clip": 0.0109892, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.01735628, + "balance_loss_mlp": 1.03321373, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.447832651307714, + "language_loss": 0.73497742, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75624776, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66015625, + "step": 12983, + "time_per_iteration": 2.5320849418640137 + }, + { + "auxiliary_loss_clip": 0.01098957, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01825762, + "balance_loss_mlp": 1.03431869, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.994731335047155, + "language_loss": 0.7493751, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77065802, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12984, + "time_per_iteration": 2.4252982139587402 + }, + { + "auxiliary_loss_clip": 0.01098022, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02072072, + "balance_loss_mlp": 1.03346229, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 1.7760899084313728, + "language_loss": 0.81298089, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83427656, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12985, + "time_per_iteration": 2.442458391189575 + }, + { + "auxiliary_loss_clip": 0.01100867, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.01956177, + "balance_loss_mlp": 1.03591645, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.639777449127703, + "language_loss": 0.77087915, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79219496, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12986, + "time_per_iteration": 2.4363291263580322 + }, + { + "auxiliary_loss_clip": 0.01100757, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.02032864, + "balance_loss_mlp": 1.03434944, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 2.42025665629093, + "language_loss": 0.73686159, + "learning_rate": 4.830076132284859e-07, + "loss": 0.75818527, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 12987, + "time_per_iteration": 2.542191505432129 + }, + { + "auxiliary_loss_clip": 0.01023759, + "auxiliary_loss_mlp": 0.01001114, + "balance_loss_clip": 1.00014293, + "balance_loss_mlp": 1.0034368, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7329422119144833, + "language_loss": 0.55088633, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57113504, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.203125, + "step": 12988, + "time_per_iteration": 3.1061744689941406 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.02383065, + "balance_loss_mlp": 1.03418314, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.47954830996045, + "language_loss": 0.80945504, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83076429, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 12989, + "time_per_iteration": 2.4808456897735596 + }, + { + "auxiliary_loss_clip": 0.01096337, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.03300154, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.5469006395559106, + "language_loss": 0.70564306, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72688901, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 12990, + "time_per_iteration": 2.443657636642456 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.01631021, + "balance_loss_mlp": 1.03604221, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 1.8688564219914294, + "language_loss": 0.77437395, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79567397, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 12991, + "time_per_iteration": 2.4350147247314453 + }, + { + "auxiliary_loss_clip": 0.01098523, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.0333643, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.6335805671408214, + "language_loss": 0.66026002, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68157601, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 12992, + "time_per_iteration": 2.689131259918213 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.01868546, + "balance_loss_mlp": 1.03544474, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.671791427999923, + "language_loss": 0.6139763, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63529098, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 12993, + "time_per_iteration": 2.4541869163513184 + }, + { + "auxiliary_loss_clip": 0.01098832, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01672089, + "balance_loss_mlp": 1.03474307, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.5259935915170835, + "language_loss": 0.68686914, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70814335, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 12994, + "time_per_iteration": 2.4765658378601074 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.01025809, + "balance_loss_clip": 1.01448953, + "balance_loss_mlp": 1.03380036, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.852574283053805, + "language_loss": 0.68799579, + "learning_rate": 4.809790276082335e-07, + "loss": 0.70923519, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 12995, + "time_per_iteration": 2.4536421298980713 + }, + { + "auxiliary_loss_clip": 0.01095783, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.01633954, + "balance_loss_mlp": 1.03263307, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.6880507432835572, + "language_loss": 0.74965352, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77087927, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 12996, + "time_per_iteration": 2.5054454803466797 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.01969695, + "balance_loss_mlp": 1.03568673, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.5743234501120424, + "language_loss": 0.6912725, + "learning_rate": 4.804724570252167e-07, + "loss": 0.71263158, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 12997, + "time_per_iteration": 2.4369044303894043 + }, + { + "auxiliary_loss_clip": 0.01104469, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.01893187, + "balance_loss_mlp": 1.03557801, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.8652008126435036, + "language_loss": 0.82176995, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84312725, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12998, + "time_per_iteration": 2.486489772796631 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.01885128, + "balance_loss_mlp": 1.03346038, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 1.8319036090536944, + "language_loss": 0.74508494, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76638746, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12999, + "time_per_iteration": 2.4737162590026855 + }, + { + "auxiliary_loss_clip": 0.01103401, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.02182698, + "balance_loss_mlp": 1.03589118, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.563923642471339, + "language_loss": 0.84530002, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86667389, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 13000, + "time_per_iteration": 2.4414126873016357 + }, + { + "auxiliary_loss_clip": 0.01101696, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.01921082, + "balance_loss_mlp": 1.03525925, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.979765622408292, + "language_loss": 0.65926194, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68058491, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 13001, + "time_per_iteration": 2.459602117538452 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.02068496, + "balance_loss_mlp": 1.0349071, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.7956850599557053, + "language_loss": 0.6699869, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69131166, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13002, + "time_per_iteration": 2.476304769515991 + }, + { + "auxiliary_loss_clip": 0.01104712, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02302575, + "balance_loss_mlp": 1.0376792, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.585481392916345, + "language_loss": 0.7332117, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75461578, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 13003, + "time_per_iteration": 2.4720077514648438 + }, + { + "auxiliary_loss_clip": 0.01103208, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.0201664, + "balance_loss_mlp": 1.03717935, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.569897666611527, + "language_loss": 0.62077022, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64211631, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 13004, + "time_per_iteration": 2.441561222076416 + }, + { + "auxiliary_loss_clip": 0.01094018, + "auxiliary_loss_mlp": 0.01029156, + "balance_loss_clip": 1.01888061, + "balance_loss_mlp": 1.03251433, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 2.201816457690377, + "language_loss": 0.82857859, + "learning_rate": 4.784484802864403e-07, + "loss": 0.84981036, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6171875, + "step": 13005, + "time_per_iteration": 2.463477373123169 + }, + { + "auxiliary_loss_clip": 0.01098144, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.01770329, + "balance_loss_mlp": 1.033494, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.897683871126404, + "language_loss": 0.72580653, + "learning_rate": 4.781957427316432e-07, + "loss": 0.7470839, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 13006, + "time_per_iteration": 2.465083122253418 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.01830435, + "balance_loss_mlp": 1.03508401, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.6366399269872012, + "language_loss": 0.7201829, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74150085, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13007, + "time_per_iteration": 2.459080934524536 + }, + { + "auxiliary_loss_clip": 0.01100835, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.0149343, + "balance_loss_mlp": 1.03300202, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 2.036752007618824, + "language_loss": 0.68872929, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71001077, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13008, + "time_per_iteration": 2.4224483966827393 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.01692426, + "balance_loss_mlp": 1.03345668, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.640160857289297, + "language_loss": 0.69686973, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71814674, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6484375, + "step": 13009, + "time_per_iteration": 2.5025076866149902 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01588941, + "balance_loss_mlp": 1.03301144, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 1.5960610923342113, + "language_loss": 0.81570321, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83694947, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 13010, + "time_per_iteration": 2.4285366535186768 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02153432, + "balance_loss_mlp": 1.03357911, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.6142519346757356, + "language_loss": 0.62225044, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64354062, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 13011, + "time_per_iteration": 3.926089286804199 + }, + { + "auxiliary_loss_clip": 0.01099415, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.01533031, + "balance_loss_mlp": 1.03547144, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.6368138696323526, + "language_loss": 0.6998511, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72109526, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.640625, + "step": 13012, + "time_per_iteration": 2.4826955795288086 + }, + { + "auxiliary_loss_clip": 0.01023537, + "auxiliary_loss_mlp": 0.01008113, + "balance_loss_clip": 1.00711727, + "balance_loss_mlp": 1.00321245, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7065375253302547, + "language_loss": 0.55039519, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57071167, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 13013, + "time_per_iteration": 4.645312786102295 + }, + { + "auxiliary_loss_clip": 0.01104842, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.02101803, + "balance_loss_mlp": 1.03746831, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.6705985916443649, + "language_loss": 0.65102112, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67239481, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 13014, + "time_per_iteration": 3.8477301597595215 + }, + { + "auxiliary_loss_clip": 0.01023801, + "auxiliary_loss_mlp": 0.01001816, + "balance_loss_clip": 1.00088012, + "balance_loss_mlp": 1.00337434, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.727505311889394, + "language_loss": 0.58472216, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60497832, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20507812, + "step": 13015, + "time_per_iteration": 3.1371023654937744 + }, + { + "auxiliary_loss_clip": 0.01098459, + "auxiliary_loss_mlp": 0.01028458, + "balance_loss_clip": 1.01759779, + "balance_loss_mlp": 1.03401864, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.8961534099857338, + "language_loss": 0.7447719, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76604104, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 13016, + "time_per_iteration": 2.434140682220459 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01715088, + "balance_loss_mlp": 1.03527248, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.7784650318460415, + "language_loss": 0.75034481, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77166569, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6640625, + "step": 13017, + "time_per_iteration": 3.9943692684173584 + }, + { + "auxiliary_loss_clip": 0.01099632, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01680803, + "balance_loss_mlp": 1.03296256, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 1.8349392879241557, + "language_loss": 0.75123864, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77252591, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13018, + "time_per_iteration": 2.4067063331604004 + }, + { + "auxiliary_loss_clip": 0.01098611, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.01444817, + "balance_loss_mlp": 1.03329933, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4416632846342243, + "language_loss": 0.77156466, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79281342, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13019, + "time_per_iteration": 2.463075876235962 + }, + { + "auxiliary_loss_clip": 0.01097045, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.01522064, + "balance_loss_mlp": 1.03263474, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.5659008205546523, + "language_loss": 0.67608422, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69731897, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13020, + "time_per_iteration": 2.4952075481414795 + }, + { + "auxiliary_loss_clip": 0.01100425, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01770771, + "balance_loss_mlp": 1.03600883, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.0993447559615905, + "language_loss": 0.6252991, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64659011, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 13021, + "time_per_iteration": 2.4579381942749023 + }, + { + "auxiliary_loss_clip": 0.01097567, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01984227, + "balance_loss_mlp": 1.03425419, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.6887151004822496, + "language_loss": 0.69123161, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71251345, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 13022, + "time_per_iteration": 2.4774861335754395 + }, + { + "auxiliary_loss_clip": 0.01023146, + "auxiliary_loss_mlp": 0.01003513, + "balance_loss_clip": 1.00248182, + "balance_loss_mlp": 1.00289679, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6410994514398879, + "language_loss": 0.56181228, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58207887, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 13023, + "time_per_iteration": 3.216150999069214 + }, + { + "auxiliary_loss_clip": 0.01092363, + "auxiliary_loss_mlp": 0.01026003, + "balance_loss_clip": 1.01569128, + "balance_loss_mlp": 1.03068089, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.5988888518402644, + "language_loss": 0.67096663, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69215035, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6171875, + "step": 13024, + "time_per_iteration": 2.4942939281463623 + }, + { + "auxiliary_loss_clip": 0.01101952, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.01470125, + "balance_loss_mlp": 1.0346812, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.8279349963305433, + "language_loss": 0.77768403, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79896855, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 13025, + "time_per_iteration": 2.4907360076904297 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.01925731, + "balance_loss_mlp": 1.0364809, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.6346779993689489, + "language_loss": 0.78158247, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80289435, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13026, + "time_per_iteration": 2.510455846786499 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01344812, + "balance_loss_mlp": 1.03539133, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 1.8424561314636239, + "language_loss": 0.75538385, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77662009, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13027, + "time_per_iteration": 2.4143946170806885 + }, + { + "auxiliary_loss_clip": 0.01100205, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.01554847, + "balance_loss_mlp": 1.03557467, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.6598203189142682, + "language_loss": 0.70306528, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72433376, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13028, + "time_per_iteration": 2.4640142917633057 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.0228107, + "balance_loss_mlp": 1.03549385, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 1.953273334391897, + "language_loss": 0.69041282, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71179456, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13029, + "time_per_iteration": 2.4038736820220947 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01026447, + "balance_loss_clip": 1.01434183, + "balance_loss_mlp": 1.03425694, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.7164794542717685, + "language_loss": 0.81022191, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83150411, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13030, + "time_per_iteration": 2.5112462043762207 + }, + { + "auxiliary_loss_clip": 0.01102526, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01789546, + "balance_loss_mlp": 1.03459156, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.6569423927401024, + "language_loss": 0.70443982, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72575903, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 13031, + "time_per_iteration": 2.5177314281463623 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.03390551, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 2.080929287511114, + "language_loss": 0.78692496, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80826724, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 13032, + "time_per_iteration": 2.438286066055298 + }, + { + "auxiliary_loss_clip": 0.01104134, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.02238297, + "balance_loss_mlp": 1.03637064, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.498040083098191, + "language_loss": 0.62467206, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.64605498, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13033, + "time_per_iteration": 2.4075143337249756 + }, + { + "auxiliary_loss_clip": 0.01099306, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01454878, + "balance_loss_mlp": 1.03466129, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.5229312558567987, + "language_loss": 0.71800756, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.7392652, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 13034, + "time_per_iteration": 2.426010847091675 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01777768, + "balance_loss_mlp": 1.03581548, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.6809698816895169, + "language_loss": 0.72046518, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.74179089, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13035, + "time_per_iteration": 2.417221784591675 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.01936555, + "balance_loss_mlp": 1.0355823, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.9215035774038787, + "language_loss": 0.66247499, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.6838097, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13036, + "time_per_iteration": 2.4644551277160645 + }, + { + "auxiliary_loss_clip": 0.01105291, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.01995683, + "balance_loss_mlp": 1.03585243, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.2777930341142945, + "language_loss": 0.72937357, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75074923, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 13037, + "time_per_iteration": 2.449385404586792 + }, + { + "auxiliary_loss_clip": 0.01097375, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.0175482, + "balance_loss_mlp": 1.03236222, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.4737781125187808, + "language_loss": 0.60029399, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62156355, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 13038, + "time_per_iteration": 2.467207193374634 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01027526, + "balance_loss_clip": 1.01651084, + "balance_loss_mlp": 1.03378868, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.8286159549617163, + "language_loss": 0.68401051, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70526278, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13039, + "time_per_iteration": 2.576012372970581 + }, + { + "auxiliary_loss_clip": 0.01095371, + "auxiliary_loss_mlp": 0.01021071, + "balance_loss_clip": 1.01055706, + "balance_loss_mlp": 1.03193581, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.8627494716028734, + "language_loss": 0.68923277, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71039724, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13040, + "time_per_iteration": 2.5061099529266357 + }, + { + "auxiliary_loss_clip": 0.01100843, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.01966667, + "balance_loss_mlp": 1.03439748, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.5445420563280179, + "language_loss": 0.67223978, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.6935609, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13041, + "time_per_iteration": 2.4612159729003906 + }, + { + "auxiliary_loss_clip": 0.01023594, + "auxiliary_loss_mlp": 0.01001116, + "balance_loss_clip": 1.00019228, + "balance_loss_mlp": 1.00344205, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6599910887916006, + "language_loss": 0.57391232, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59415942, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.20117188, + "step": 13042, + "time_per_iteration": 3.0452370643615723 + }, + { + "auxiliary_loss_clip": 0.0110195, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.01545572, + "balance_loss_mlp": 1.03551662, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.3220034153225235, + "language_loss": 0.83760583, + "learning_rate": 4.688851018730369e-07, + "loss": 0.85889781, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13043, + "time_per_iteration": 2.4752867221832275 + }, + { + "auxiliary_loss_clip": 0.01097987, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.01391542, + "balance_loss_mlp": 1.03412688, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3727755929331091, + "language_loss": 0.88437784, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.905608, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13044, + "time_per_iteration": 2.4991369247436523 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01857805, + "balance_loss_mlp": 1.03586638, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 2.298673788206572, + "language_loss": 0.79098254, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81233072, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 13045, + "time_per_iteration": 2.4472832679748535 + }, + { + "auxiliary_loss_clip": 0.01098057, + "auxiliary_loss_mlp": 0.01027036, + "balance_loss_clip": 1.0161643, + "balance_loss_mlp": 1.03325605, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.3934452663009353, + "language_loss": 0.72286654, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.7441175, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13046, + "time_per_iteration": 2.487778425216675 + }, + { + "auxiliary_loss_clip": 0.01098961, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.0218997, + "balance_loss_mlp": 1.03507853, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.566633263646869, + "language_loss": 0.63192189, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65325058, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 13047, + "time_per_iteration": 2.5349674224853516 + }, + { + "auxiliary_loss_clip": 0.01098768, + "auxiliary_loss_mlp": 0.01024033, + "balance_loss_clip": 1.01313078, + "balance_loss_mlp": 1.03501678, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.5581126874211093, + "language_loss": 0.73077911, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75200713, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 13048, + "time_per_iteration": 2.4880495071411133 + }, + { + "auxiliary_loss_clip": 0.01105114, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01921093, + "balance_loss_mlp": 1.03758121, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.6312152451554587, + "language_loss": 0.74826312, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76962638, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13049, + "time_per_iteration": 2.477346658706665 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01565409, + "balance_loss_mlp": 1.0343411, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.741709533193149, + "language_loss": 0.72563767, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.746952, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 13050, + "time_per_iteration": 2.4637062549591064 + }, + { + "auxiliary_loss_clip": 0.01099539, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.0217284, + "balance_loss_mlp": 1.03509378, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.325466593852248, + "language_loss": 0.73197848, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75330985, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13051, + "time_per_iteration": 2.4410598278045654 + }, + { + "auxiliary_loss_clip": 0.01102687, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.03567302, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.1693731979967965, + "language_loss": 0.72507489, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74643779, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 13052, + "time_per_iteration": 2.580509901046753 + }, + { + "auxiliary_loss_clip": 0.01096936, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.0346005, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 1.7569531144927393, + "language_loss": 0.69126081, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71252745, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.625, + "step": 13053, + "time_per_iteration": 3.805539131164551 + }, + { + "auxiliary_loss_clip": 0.01098051, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.01637769, + "balance_loss_mlp": 1.03426385, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.052215222925797, + "language_loss": 0.70214486, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72339875, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13054, + "time_per_iteration": 2.4813599586486816 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.02268767, + "balance_loss_mlp": 1.03524971, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.891443504325583, + "language_loss": 0.75708246, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77845711, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 13055, + "time_per_iteration": 3.9307680130004883 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.01664162, + "balance_loss_mlp": 1.03725183, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 6.321454082407856, + "language_loss": 0.74865484, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77000654, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 13056, + "time_per_iteration": 4.0152342319488525 + }, + { + "auxiliary_loss_clip": 0.01101822, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.01566076, + "balance_loss_mlp": 1.03655851, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.5631013098906712, + "language_loss": 0.70461977, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72591043, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13057, + "time_per_iteration": 2.5022852420806885 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.01983142, + "balance_loss_mlp": 1.03427744, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 2.087059911869826, + "language_loss": 0.7686438, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78996599, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 13058, + "time_per_iteration": 3.913203716278076 + }, + { + "auxiliary_loss_clip": 0.01101711, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01822972, + "balance_loss_mlp": 1.03652596, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.620822282702505, + "language_loss": 0.70728242, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72859639, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 13059, + "time_per_iteration": 2.4571406841278076 + }, + { + "auxiliary_loss_clip": 0.01104562, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.02604127, + "balance_loss_mlp": 1.03527403, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.7516949433985336, + "language_loss": 0.76551163, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78694499, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 13060, + "time_per_iteration": 2.452622413635254 + }, + { + "auxiliary_loss_clip": 0.0110109, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.01882577, + "balance_loss_mlp": 1.03516376, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.1122245234180923, + "language_loss": 0.77249229, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79381275, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 13061, + "time_per_iteration": 2.4392404556274414 + }, + { + "auxiliary_loss_clip": 0.01100348, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.01744044, + "balance_loss_mlp": 1.03367698, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 2.075148531111265, + "language_loss": 0.73844373, + "learning_rate": 4.641348194799164e-07, + "loss": 0.75974035, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13062, + "time_per_iteration": 2.542872428894043 + }, + { + "auxiliary_loss_clip": 0.01097942, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.01824331, + "balance_loss_mlp": 1.03418064, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.4437360757682784, + "language_loss": 0.68408203, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70535302, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13063, + "time_per_iteration": 2.468757390975952 + }, + { + "auxiliary_loss_clip": 0.01102772, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.02163374, + "balance_loss_mlp": 1.03934288, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 2.216322061173653, + "language_loss": 0.7278775, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74923611, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13064, + "time_per_iteration": 2.629014730453491 + }, + { + "auxiliary_loss_clip": 0.01101508, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.01990271, + "balance_loss_mlp": 1.03406608, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.7428353830367498, + "language_loss": 0.67990673, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70123595, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13065, + "time_per_iteration": 2.440537691116333 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.01917148, + "balance_loss_mlp": 1.03686762, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.9043114354962565, + "language_loss": 0.76035756, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78167951, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13066, + "time_per_iteration": 2.4779815673828125 + }, + { + "auxiliary_loss_clip": 0.01023361, + "auxiliary_loss_mlp": 0.0100262, + "balance_loss_clip": 1.00158274, + "balance_loss_mlp": 1.00318313, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7064057313548338, + "language_loss": 0.53389549, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55415535, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20214844, + "step": 13067, + "time_per_iteration": 3.158377170562744 + }, + { + "auxiliary_loss_clip": 0.01099339, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01784921, + "balance_loss_mlp": 1.03333259, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.7780609677400445, + "language_loss": 0.67590213, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69719583, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66015625, + "step": 13068, + "time_per_iteration": 2.4604732990264893 + }, + { + "auxiliary_loss_clip": 0.01099845, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.01966739, + "balance_loss_mlp": 1.03644729, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.9961392096486945, + "language_loss": 0.67999709, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70130128, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 13069, + "time_per_iteration": 2.470776319503784 + }, + { + "auxiliary_loss_clip": 0.01102413, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.03625858, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.6342789712373722, + "language_loss": 0.76993471, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79126477, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 13070, + "time_per_iteration": 2.4821813106536865 + }, + { + "auxiliary_loss_clip": 0.01097348, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01948428, + "balance_loss_mlp": 1.0329771, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.5497406441787502, + "language_loss": 0.65501463, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67629051, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 13071, + "time_per_iteration": 2.4392311573028564 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.02051842, + "balance_loss_mlp": 1.03337324, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.7465471589650208, + "language_loss": 0.74096799, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76227987, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13072, + "time_per_iteration": 2.4858996868133545 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01730585, + "balance_loss_mlp": 1.03666794, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.0042152052909206, + "language_loss": 0.71074873, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73207319, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13073, + "time_per_iteration": 2.454535961151123 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.0218128, + "balance_loss_mlp": 1.03427434, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.677206170034674, + "language_loss": 0.76719201, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78852749, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 13074, + "time_per_iteration": 2.4688336849212646 + }, + { + "auxiliary_loss_clip": 0.01097672, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01888216, + "balance_loss_mlp": 1.03478217, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.626029190410932, + "language_loss": 0.74981356, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77109224, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 13075, + "time_per_iteration": 2.4762327671051025 + }, + { + "auxiliary_loss_clip": 0.01100533, + "auxiliary_loss_mlp": 0.01024172, + "balance_loss_clip": 1.01356792, + "balance_loss_mlp": 1.03545177, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.7567110977428382, + "language_loss": 0.6898433, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71109033, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65234375, + "step": 13076, + "time_per_iteration": 2.5244879722595215 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.01502383, + "balance_loss_mlp": 1.0347321, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.2025280596790395, + "language_loss": 0.80192757, + "learning_rate": 4.603994445488282e-07, + "loss": 0.8231774, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 13077, + "time_per_iteration": 2.491744041442871 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.01986599, + "balance_loss_mlp": 1.03536844, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.490748661053691, + "language_loss": 0.70515674, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.72648406, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 13078, + "time_per_iteration": 2.555865526199341 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.01984668, + "balance_loss_mlp": 1.0353632, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.565975595125152, + "language_loss": 0.81306797, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83437216, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13079, + "time_per_iteration": 2.49438738822937 + }, + { + "auxiliary_loss_clip": 0.01096305, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01586151, + "balance_loss_mlp": 1.03352332, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.6630658201399222, + "language_loss": 0.68445063, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70568061, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 13080, + "time_per_iteration": 2.5388312339782715 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.01981521, + "balance_loss_mlp": 1.03463578, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.6317908200800284, + "language_loss": 0.69513613, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71644235, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13081, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.01100243, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.02367032, + "balance_loss_mlp": 1.03415251, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.6215934459039671, + "language_loss": 0.68073553, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70208842, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 13082, + "time_per_iteration": 2.47454833984375 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.02276242, + "balance_loss_mlp": 1.03449476, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.8334733344878817, + "language_loss": 0.66071731, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68205309, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 13083, + "time_per_iteration": 2.4937517642974854 + }, + { + "auxiliary_loss_clip": 0.01103443, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01840496, + "balance_loss_mlp": 1.03549075, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.042540926509675, + "language_loss": 0.74778521, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76912796, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 13084, + "time_per_iteration": 2.4672179222106934 + }, + { + "auxiliary_loss_clip": 0.01098876, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.01814008, + "balance_loss_mlp": 1.03493166, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 2.928531982319309, + "language_loss": 0.70411515, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72539198, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13085, + "time_per_iteration": 2.427304267883301 + }, + { + "auxiliary_loss_clip": 0.01103417, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.01565087, + "balance_loss_mlp": 1.03474259, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.8136957772733184, + "language_loss": 0.72376126, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74507606, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 13086, + "time_per_iteration": 2.480523109436035 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.0102774, + "balance_loss_clip": 1.01654005, + "balance_loss_mlp": 1.03270912, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.9014411249537477, + "language_loss": 0.74928933, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77054405, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13087, + "time_per_iteration": 2.469919204711914 + }, + { + "auxiliary_loss_clip": 0.01097848, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.02146316, + "balance_loss_mlp": 1.03391075, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 3.8678035141678913, + "language_loss": 0.71336555, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73466659, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13088, + "time_per_iteration": 2.5597689151763916 + }, + { + "auxiliary_loss_clip": 0.01023649, + "auxiliary_loss_mlp": 0.01006009, + "balance_loss_clip": 1.00502574, + "balance_loss_mlp": 1.00346375, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6844618253743016, + "language_loss": 0.55505019, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57534683, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 13089, + "time_per_iteration": 3.174372673034668 + }, + { + "auxiliary_loss_clip": 0.01023353, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 1.00036299, + "balance_loss_mlp": 1.00316393, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7253731939477448, + "language_loss": 0.49957851, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51982558, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20214844, + "step": 13090, + "time_per_iteration": 3.1464152336120605 + }, + { + "auxiliary_loss_clip": 0.01097486, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.03442216, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 2.0009020702147624, + "language_loss": 0.83693981, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.8581785, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 13091, + "time_per_iteration": 2.5320253372192383 + }, + { + "auxiliary_loss_clip": 0.01023736, + "auxiliary_loss_mlp": 0.01003239, + "balance_loss_clip": 1.00224388, + "balance_loss_mlp": 1.00338745, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7117957030218485, + "language_loss": 0.63994247, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66021222, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 13092, + "time_per_iteration": 3.083390474319458 + }, + { + "auxiliary_loss_clip": 0.01102492, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.01996171, + "balance_loss_mlp": 1.03640008, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 3.478229156670452, + "language_loss": 0.79910231, + "learning_rate": 4.564295240788285e-07, + "loss": 0.82044232, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13093, + "time_per_iteration": 2.4508519172668457 + }, + { + "auxiliary_loss_clip": 0.01097319, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01696348, + "balance_loss_mlp": 1.03387761, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 2.289206273735693, + "language_loss": 0.7536335, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77488482, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13094, + "time_per_iteration": 2.423264980316162 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02510548, + "balance_loss_mlp": 1.03562438, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6408632577371567, + "language_loss": 0.79475707, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81613529, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 13095, + "time_per_iteration": 3.9224746227264404 + }, + { + "auxiliary_loss_clip": 0.01099901, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.01915216, + "balance_loss_mlp": 1.03335738, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.020167585783757, + "language_loss": 0.67747319, + "learning_rate": 4.556868310016715e-07, + "loss": 0.69878036, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13096, + "time_per_iteration": 4.006121635437012 + }, + { + "auxiliary_loss_clip": 0.01093799, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.01535416, + "balance_loss_mlp": 1.03172147, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.5298468077201632, + "language_loss": 0.70352769, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72471642, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.09716797, + "router_z_loss_mlp": 0.625, + "step": 13097, + "time_per_iteration": 4.101962327957153 + }, + { + "auxiliary_loss_clip": 0.01102049, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02018285, + "balance_loss_mlp": 1.0361073, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.6655151068519558, + "language_loss": 0.80427504, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82560897, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13098, + "time_per_iteration": 2.46547532081604 + }, + { + "auxiliary_loss_clip": 0.01098922, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.01834953, + "balance_loss_mlp": 1.03521609, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.645167890556634, + "language_loss": 0.74275064, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76402998, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13099, + "time_per_iteration": 2.485710620880127 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.01486731, + "balance_loss_mlp": 1.03532815, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.60096052488611, + "language_loss": 0.78410721, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80536783, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13100, + "time_per_iteration": 4.035876750946045 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01028363, + "balance_loss_clip": 1.01570272, + "balance_loss_mlp": 1.03587461, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.2959557681189895, + "language_loss": 0.66067588, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68201947, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 13101, + "time_per_iteration": 2.4304044246673584 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.01638031, + "balance_loss_mlp": 1.03366089, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.576742328174997, + "language_loss": 0.7767005, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79796594, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13102, + "time_per_iteration": 2.587104320526123 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02300668, + "balance_loss_mlp": 1.03498983, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 2.03801984289661, + "language_loss": 0.82200575, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84333879, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 13103, + "time_per_iteration": 2.4504380226135254 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02108872, + "balance_loss_mlp": 1.03538537, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 1.9382935639553287, + "language_loss": 0.80800354, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82936251, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13104, + "time_per_iteration": 2.4761226177215576 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03486192, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.5580110951181336, + "language_loss": 0.74400711, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76534271, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13105, + "time_per_iteration": 2.458893060684204 + }, + { + "auxiliary_loss_clip": 0.01102329, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.0220865, + "balance_loss_mlp": 1.03494358, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.6914912151610795, + "language_loss": 0.75718057, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77854228, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 13106, + "time_per_iteration": 2.4740750789642334 + }, + { + "auxiliary_loss_clip": 0.01101506, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.02129924, + "balance_loss_mlp": 1.03471053, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.2970900789620767, + "language_loss": 0.73269242, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75403154, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 13107, + "time_per_iteration": 2.465049982070923 + }, + { + "auxiliary_loss_clip": 0.01098914, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.02140641, + "balance_loss_mlp": 1.03424203, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.8872299288056482, + "language_loss": 0.73182052, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75314188, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13108, + "time_per_iteration": 2.4711079597473145 + }, + { + "auxiliary_loss_clip": 0.01023267, + "auxiliary_loss_mlp": 0.01002041, + "balance_loss_clip": 1.00098598, + "balance_loss_mlp": 1.00313234, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 1.661709536618796, + "language_loss": 0.60381085, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62406397, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20117188, + "step": 13109, + "time_per_iteration": 3.0089924335479736 + }, + { + "auxiliary_loss_clip": 0.01097142, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.01739979, + "balance_loss_mlp": 1.03491497, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.5824275736461375, + "language_loss": 0.71883583, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.7400893, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.62109375, + "step": 13110, + "time_per_iteration": 2.465576171875 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.01631093, + "balance_loss_mlp": 1.03485966, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3860317758339384, + "language_loss": 0.75074577, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77199543, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 13111, + "time_per_iteration": 2.4993157386779785 + }, + { + "auxiliary_loss_clip": 0.01098161, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.02001154, + "balance_loss_mlp": 1.03351355, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 1.94564104551391, + "language_loss": 0.61333418, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63463187, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13112, + "time_per_iteration": 2.43581485748291 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01515722, + "balance_loss_mlp": 1.0338614, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.7382192958818077, + "language_loss": 0.67246455, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69373184, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13113, + "time_per_iteration": 2.4511425495147705 + }, + { + "auxiliary_loss_clip": 0.01098431, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.0168004, + "balance_loss_mlp": 1.03448272, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 12.027787303417453, + "language_loss": 0.58199584, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60325825, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13114, + "time_per_iteration": 2.3941895961761475 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.01636708, + "balance_loss_mlp": 1.03487444, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.6461122480026786, + "language_loss": 0.66887224, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69017321, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 13115, + "time_per_iteration": 2.4768731594085693 + }, + { + "auxiliary_loss_clip": 0.01099861, + "auxiliary_loss_mlp": 0.01025915, + "balance_loss_clip": 1.01442361, + "balance_loss_mlp": 1.03510892, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.909649629635062, + "language_loss": 0.8859247, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90718246, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13116, + "time_per_iteration": 2.4047675132751465 + }, + { + "auxiliary_loss_clip": 0.01106955, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.01909757, + "balance_loss_mlp": 1.0356214, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 1.7003920490690876, + "language_loss": 0.72708535, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74847412, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 13117, + "time_per_iteration": 2.4341704845428467 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_clip": 1.01247823, + "balance_loss_mlp": 1.03483558, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.4826682639516906, + "language_loss": 0.79875678, + "learning_rate": 4.502565355654926e-07, + "loss": 0.81996524, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 13118, + "time_per_iteration": 2.394805431365967 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.01643777, + "balance_loss_mlp": 1.03507447, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.7945164673922278, + "language_loss": 0.73091543, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75218379, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13119, + "time_per_iteration": 2.460057258605957 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01023623, + "balance_loss_clip": 1.01194072, + "balance_loss_mlp": 1.03509176, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.261657596478895, + "language_loss": 0.71529341, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.73654413, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13120, + "time_per_iteration": 2.455064058303833 + }, + { + "auxiliary_loss_clip": 0.01100545, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01978469, + "balance_loss_mlp": 1.03517127, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.4332103532117941, + "language_loss": 0.78814548, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.8094641, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13121, + "time_per_iteration": 2.599400281906128 + }, + { + "auxiliary_loss_clip": 0.01098409, + "auxiliary_loss_mlp": 0.01026067, + "balance_loss_clip": 1.01478994, + "balance_loss_mlp": 1.03450656, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3967660183368626, + "language_loss": 0.80094564, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.8221904, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13122, + "time_per_iteration": 2.4992713928222656 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01022634, + "balance_loss_clip": 1.01157165, + "balance_loss_mlp": 1.03481627, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 2.145985677381676, + "language_loss": 0.77920961, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80044621, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13123, + "time_per_iteration": 2.4735960960388184 + }, + { + "auxiliary_loss_clip": 0.01104198, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.01790643, + "balance_loss_mlp": 1.0362196, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.856299947344871, + "language_loss": 0.6726073, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69394577, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13124, + "time_per_iteration": 2.4079813957214355 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01025655, + "balance_loss_clip": 1.01391327, + "balance_loss_mlp": 1.03546476, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 2.605711353354914, + "language_loss": 0.72957736, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75086713, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13125, + "time_per_iteration": 2.5052480697631836 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.0155673, + "balance_loss_mlp": 1.03235054, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 2.154516730399549, + "language_loss": 0.72528452, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74655998, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13126, + "time_per_iteration": 2.4527993202209473 + }, + { + "auxiliary_loss_clip": 0.01102896, + "auxiliary_loss_mlp": 0.01026431, + "balance_loss_clip": 1.01474881, + "balance_loss_mlp": 1.03575099, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 2.0791406277804567, + "language_loss": 0.76886559, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79015887, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 13127, + "time_per_iteration": 2.4405977725982666 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.0155077, + "balance_loss_mlp": 1.03493667, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.7461753139665992, + "language_loss": 0.85763645, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87888473, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 13128, + "time_per_iteration": 2.474844455718994 + }, + { + "auxiliary_loss_clip": 0.01098818, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.02045906, + "balance_loss_mlp": 1.03425086, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.0767433694769175, + "language_loss": 0.68800604, + "learning_rate": 4.475520477290904e-07, + "loss": 0.70930469, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 13129, + "time_per_iteration": 2.5359485149383545 + }, + { + "auxiliary_loss_clip": 0.01022991, + "auxiliary_loss_mlp": 0.01001965, + "balance_loss_clip": 1.00090396, + "balance_loss_mlp": 1.00285482, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7130558400515205, + "language_loss": 0.61589611, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63614571, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 13130, + "time_per_iteration": 3.0489916801452637 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01027204, + "balance_loss_clip": 1.01583779, + "balance_loss_mlp": 1.03717756, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.6182422451860332, + "language_loss": 0.73774695, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.7590515, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13131, + "time_per_iteration": 2.475581169128418 + }, + { + "auxiliary_loss_clip": 0.01108004, + "auxiliary_loss_mlp": 0.01025009, + "balance_loss_clip": 1.01216388, + "balance_loss_mlp": 1.034796, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.199372765286003, + "language_loss": 0.68987596, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71120608, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 13132, + "time_per_iteration": 2.4295406341552734 + }, + { + "auxiliary_loss_clip": 0.01102436, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02129269, + "balance_loss_mlp": 1.03545117, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.1121460507768406, + "language_loss": 0.62110436, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64246017, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13133, + "time_per_iteration": 2.483172655105591 + }, + { + "auxiliary_loss_clip": 0.01105396, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.02418268, + "balance_loss_mlp": 1.03652048, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.3671306381438817, + "language_loss": 0.79635763, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81778735, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 13134, + "time_per_iteration": 2.4047813415527344 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01651824, + "balance_loss_mlp": 1.03489256, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.729726812184161, + "language_loss": 0.79917061, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82045782, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 13135, + "time_per_iteration": 2.4462645053863525 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.01702976, + "balance_loss_mlp": 1.03459549, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.7066786377957706, + "language_loss": 0.72467506, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74595881, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13136, + "time_per_iteration": 3.8541600704193115 + }, + { + "auxiliary_loss_clip": 0.01107278, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.02079642, + "balance_loss_mlp": 1.03614569, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 1.8157038606560463, + "language_loss": 0.70418733, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72559059, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 13137, + "time_per_iteration": 2.451396942138672 + }, + { + "auxiliary_loss_clip": 0.01099987, + "auxiliary_loss_mlp": 0.01033834, + "balance_loss_clip": 1.02095389, + "balance_loss_mlp": 1.03527665, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.748688408488967, + "language_loss": 0.74126804, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.7626062, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6484375, + "step": 13138, + "time_per_iteration": 3.8486387729644775 + }, + { + "auxiliary_loss_clip": 0.01099719, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01726758, + "balance_loss_mlp": 1.03461611, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.0347678051570046, + "language_loss": 0.68777812, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70906031, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13139, + "time_per_iteration": 3.8628947734832764 + }, + { + "auxiliary_loss_clip": 0.01022998, + "auxiliary_loss_mlp": 0.01001993, + "balance_loss_clip": 1.00090218, + "balance_loss_mlp": 1.00276661, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8639772352746394, + "language_loss": 0.60299456, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62324452, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20214844, + "step": 13140, + "time_per_iteration": 3.12382435798645 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.01973701, + "balance_loss_mlp": 1.03487992, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.6042755472834633, + "language_loss": 0.7596916, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78102267, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13141, + "time_per_iteration": 2.5595388412475586 + }, + { + "auxiliary_loss_clip": 0.0110272, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.02109766, + "balance_loss_mlp": 1.0354681, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 2.061867815111243, + "language_loss": 0.68504715, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70640367, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13142, + "time_per_iteration": 3.9543938636779785 + }, + { + "auxiliary_loss_clip": 0.0102319, + "auxiliary_loss_mlp": 0.01000022, + "balance_loss_clip": 0.99900836, + "balance_loss_mlp": 1.00304079, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8198553177673825, + "language_loss": 0.60004789, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62028003, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20117188, + "step": 13143, + "time_per_iteration": 2.863976001739502 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.01715136, + "balance_loss_mlp": 1.03638124, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.589507938557268, + "language_loss": 0.74556917, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76690018, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13144, + "time_per_iteration": 2.5839059352874756 + }, + { + "auxiliary_loss_clip": 0.01103839, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.03596044, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.7274125688221094, + "language_loss": 0.83230376, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85366857, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13145, + "time_per_iteration": 2.468961000442505 + }, + { + "auxiliary_loss_clip": 0.01094904, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.01889074, + "balance_loss_mlp": 1.03252196, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.7706663213688858, + "language_loss": 0.72783786, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.74908125, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 13146, + "time_per_iteration": 2.483905076980591 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03308654, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.9329251437189798, + "language_loss": 0.75868392, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77997577, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6796875, + "step": 13147, + "time_per_iteration": 2.444445848464966 + }, + { + "auxiliary_loss_clip": 0.01098948, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.02420568, + "balance_loss_mlp": 1.03389215, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.8432803916429288, + "language_loss": 0.71830833, + "learning_rate": 4.428974443697087e-07, + "loss": 0.7396633, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 13148, + "time_per_iteration": 2.4763596057891846 + }, + { + "auxiliary_loss_clip": 0.01099876, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.0174942, + "balance_loss_mlp": 1.03280914, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 2.2200316445748536, + "language_loss": 0.71857107, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73986256, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 13149, + "time_per_iteration": 2.5340046882629395 + }, + { + "auxiliary_loss_clip": 0.01103652, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01735091, + "balance_loss_mlp": 1.03590095, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.727710817995862, + "language_loss": 0.65459621, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67593414, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 13150, + "time_per_iteration": 2.4871621131896973 + }, + { + "auxiliary_loss_clip": 0.01098617, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01808667, + "balance_loss_mlp": 1.03340101, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 2.2316729864145035, + "language_loss": 0.69869459, + "learning_rate": 4.421644538650231e-07, + "loss": 0.71997708, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13151, + "time_per_iteration": 2.429283857345581 + }, + { + "auxiliary_loss_clip": 0.01102592, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02360559, + "balance_loss_mlp": 1.03463364, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.3770531341531196, + "language_loss": 0.70089221, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72227693, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13152, + "time_per_iteration": 2.6216795444488525 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.01488543, + "balance_loss_mlp": 1.03391027, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.7997753431488441, + "language_loss": 0.72821844, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74947822, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13153, + "time_per_iteration": 2.432175636291504 + }, + { + "auxiliary_loss_clip": 0.01098332, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.01356864, + "balance_loss_mlp": 1.03256023, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.8282420637025174, + "language_loss": 0.78883809, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81006831, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13154, + "time_per_iteration": 2.466029167175293 + }, + { + "auxiliary_loss_clip": 0.01105447, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01705313, + "balance_loss_mlp": 1.03479743, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 1.8238344908904138, + "language_loss": 0.70285016, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72420764, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 13155, + "time_per_iteration": 2.446547746658325 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01582754, + "balance_loss_mlp": 1.03381193, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 2.6081718094801003, + "language_loss": 0.7679953, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78926998, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 13156, + "time_per_iteration": 2.475921154022217 + }, + { + "auxiliary_loss_clip": 0.01099115, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01664567, + "balance_loss_mlp": 1.03353715, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.636282955731654, + "language_loss": 0.65013611, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67141205, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13157, + "time_per_iteration": 2.504150867462158 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.01937079, + "balance_loss_mlp": 1.03502417, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.6940743634270539, + "language_loss": 0.73872387, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76006109, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 13158, + "time_per_iteration": 2.4976253509521484 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.018332, + "balance_loss_mlp": 1.03334785, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 1.9043976356667784, + "language_loss": 0.6686908, + "learning_rate": 4.40212412422309e-07, + "loss": 0.68994868, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 13159, + "time_per_iteration": 2.4071156978607178 + }, + { + "auxiliary_loss_clip": 0.01098959, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.01733327, + "balance_loss_mlp": 1.03454971, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.8560150384461531, + "language_loss": 0.67281532, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69408834, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13160, + "time_per_iteration": 2.4779374599456787 + }, + { + "auxiliary_loss_clip": 0.01092943, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.01608515, + "balance_loss_mlp": 1.03147316, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 1.960219382824367, + "language_loss": 0.72932816, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75051731, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.61328125, + "step": 13161, + "time_per_iteration": 2.393747091293335 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01512957, + "balance_loss_mlp": 1.03505635, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 2.030740934223021, + "language_loss": 0.73477876, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75603908, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13162, + "time_per_iteration": 2.476824998855591 + }, + { + "auxiliary_loss_clip": 0.01102454, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01868761, + "balance_loss_mlp": 1.03506005, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.6298606745864626, + "language_loss": 0.72000325, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74132919, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13163, + "time_per_iteration": 2.4319183826446533 + }, + { + "auxiliary_loss_clip": 0.01101866, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.01945353, + "balance_loss_mlp": 1.03639102, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.9265161616003688, + "language_loss": 0.69604623, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.71738136, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13164, + "time_per_iteration": 2.4604907035827637 + }, + { + "auxiliary_loss_clip": 0.01098403, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.01869857, + "balance_loss_mlp": 1.03331554, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.9521377640863393, + "language_loss": 0.66389132, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68517423, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13165, + "time_per_iteration": 2.4393765926361084 + }, + { + "auxiliary_loss_clip": 0.01093623, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.01379871, + "balance_loss_mlp": 1.03140879, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.870206675725358, + "language_loss": 0.72397065, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74515086, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62109375, + "step": 13166, + "time_per_iteration": 2.42858624458313 + }, + { + "auxiliary_loss_clip": 0.01098429, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 1.03284669, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.6243880882538562, + "language_loss": 0.77239472, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79367137, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 13167, + "time_per_iteration": 2.4857194423675537 + }, + { + "auxiliary_loss_clip": 0.0109987, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0192678, + "balance_loss_mlp": 1.03484404, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6932683776062956, + "language_loss": 0.84575874, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86705655, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13168, + "time_per_iteration": 2.5257365703582764 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.01633573, + "balance_loss_mlp": 1.03501356, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.7075722391650643, + "language_loss": 0.72710097, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74838775, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 13169, + "time_per_iteration": 2.4436428546905518 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.03484845, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.8243232954035, + "language_loss": 0.67037463, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69172621, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13170, + "time_per_iteration": 2.624098777770996 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.01022631, + "balance_loss_clip": 1.01188445, + "balance_loss_mlp": 1.03370655, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 2.145643776900154, + "language_loss": 0.70821196, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72943711, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 13171, + "time_per_iteration": 2.4759225845336914 + }, + { + "auxiliary_loss_clip": 0.01097813, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.01555896, + "balance_loss_mlp": 1.03287041, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.7808114898510692, + "language_loss": 0.66749847, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68874633, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13172, + "time_per_iteration": 2.5700619220733643 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.01951575, + "balance_loss_mlp": 1.03532124, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.881471397827846, + "language_loss": 0.79114199, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81245905, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13173, + "time_per_iteration": 2.4757769107818604 + }, + { + "auxiliary_loss_clip": 0.01098601, + "auxiliary_loss_mlp": 0.01026298, + "balance_loss_clip": 1.01604629, + "balance_loss_mlp": 1.03356767, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8257169099577297, + "language_loss": 0.7678805, + "learning_rate": 4.365625413419365e-07, + "loss": 0.7891295, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 13174, + "time_per_iteration": 2.478116989135742 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.02046227, + "balance_loss_mlp": 1.03321493, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.6179511988960908, + "language_loss": 0.71719491, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73848224, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13175, + "time_per_iteration": 2.528700590133667 + }, + { + "auxiliary_loss_clip": 0.01099648, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.01570737, + "balance_loss_mlp": 1.03435004, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.9378539521552467, + "language_loss": 0.59763598, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61890721, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 13176, + "time_per_iteration": 2.4653594493865967 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01922417, + "balance_loss_mlp": 1.03675985, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.8690026492537037, + "language_loss": 0.73695058, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75826943, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 13177, + "time_per_iteration": 2.439019203186035 + }, + { + "auxiliary_loss_clip": 0.01098632, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.02159739, + "balance_loss_mlp": 1.0353229, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 2.5597015980871656, + "language_loss": 0.63997006, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66128141, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13178, + "time_per_iteration": 3.8768224716186523 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.0197531, + "balance_loss_mlp": 1.03276765, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.4086658766608762, + "language_loss": 0.68400067, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70528185, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65625, + "step": 13179, + "time_per_iteration": 2.5326123237609863 + }, + { + "auxiliary_loss_clip": 0.01099366, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.03448081, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 3.8313461513968408, + "language_loss": 0.74134624, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76263011, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 13180, + "time_per_iteration": 3.892685651779175 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.02059281, + "balance_loss_mlp": 1.03637862, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.6414763504058936, + "language_loss": 0.81435031, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.8357203, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13181, + "time_per_iteration": 3.8623433113098145 + }, + { + "auxiliary_loss_clip": 0.01098541, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.03413761, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.7875723421609098, + "language_loss": 0.77434945, + "learning_rate": 4.346213957372895e-07, + "loss": 0.7956599, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 13182, + "time_per_iteration": 2.4663844108581543 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.01979494, + "balance_loss_mlp": 1.03470898, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 2.7996855635820013, + "language_loss": 0.74354494, + "learning_rate": 4.34379019557056e-07, + "loss": 0.7649107, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 13183, + "time_per_iteration": 2.490994930267334 + }, + { + "auxiliary_loss_clip": 0.0109888, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.01273239, + "balance_loss_mlp": 1.03439891, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.6595627925509142, + "language_loss": 0.68164527, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70288026, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 13184, + "time_per_iteration": 4.066596508026123 + }, + { + "auxiliary_loss_clip": 0.01102689, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.01719308, + "balance_loss_mlp": 1.03515947, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.6953007662822652, + "language_loss": 0.70649928, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72781253, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 13185, + "time_per_iteration": 2.5168697834014893 + }, + { + "auxiliary_loss_clip": 0.01101927, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.01530719, + "balance_loss_mlp": 1.03461063, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.0010064491427335, + "language_loss": 0.65568876, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67698145, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13186, + "time_per_iteration": 2.4313526153564453 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.01786041, + "balance_loss_mlp": 1.03488398, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 2.458790452958655, + "language_loss": 0.76782525, + "learning_rate": 4.334101086130408e-07, + "loss": 0.78911316, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 13187, + "time_per_iteration": 2.4705545902252197 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.01525056, + "balance_loss_mlp": 1.03388309, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.052216881515836, + "language_loss": 0.72776371, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74900717, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13188, + "time_per_iteration": 2.473217010498047 + }, + { + "auxiliary_loss_clip": 0.01102244, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.02387083, + "balance_loss_mlp": 1.03462553, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.027455817824797, + "language_loss": 0.62665582, + "learning_rate": 4.329260095357725e-07, + "loss": 0.64804399, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13189, + "time_per_iteration": 2.442365884780884 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01028169, + "balance_loss_clip": 1.01705313, + "balance_loss_mlp": 1.03361034, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.5304062276018793, + "language_loss": 0.72505867, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74632961, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13190, + "time_per_iteration": 2.5228397846221924 + }, + { + "auxiliary_loss_clip": 0.01096381, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.03499353, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.8037952214110713, + "language_loss": 0.73300159, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75423628, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6171875, + "step": 13191, + "time_per_iteration": 2.5402090549468994 + }, + { + "auxiliary_loss_clip": 0.01099659, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.02452767, + "balance_loss_mlp": 1.03368807, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.9478523410400206, + "language_loss": 0.69033474, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71169555, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 13192, + "time_per_iteration": 2.488039970397949 + }, + { + "auxiliary_loss_clip": 0.01100062, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.02173603, + "balance_loss_mlp": 1.03413558, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.5635403333357274, + "language_loss": 0.75213289, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77346253, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13193, + "time_per_iteration": 2.464966297149658 + }, + { + "auxiliary_loss_clip": 0.01100043, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.02102232, + "balance_loss_mlp": 1.03474998, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.8781856147923044, + "language_loss": 0.72225535, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74359465, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.65234375, + "step": 13194, + "time_per_iteration": 2.55106520652771 + }, + { + "auxiliary_loss_clip": 0.01104878, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01911819, + "balance_loss_mlp": 1.03578103, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 1.9952958516123638, + "language_loss": 0.69781977, + "learning_rate": 4.314751387639517e-07, + "loss": 0.71917635, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69140625, + "step": 13195, + "time_per_iteration": 2.4327144622802734 + }, + { + "auxiliary_loss_clip": 0.01100264, + "auxiliary_loss_mlp": 0.01025694, + "balance_loss_clip": 1.0142858, + "balance_loss_mlp": 1.03533435, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.5235615459382654, + "language_loss": 0.77706164, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79832125, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13196, + "time_per_iteration": 2.4901678562164307 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.02258193, + "balance_loss_mlp": 1.03656614, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.803068943631605, + "language_loss": 0.68970078, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71107984, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 13197, + "time_per_iteration": 2.5378594398498535 + }, + { + "auxiliary_loss_clip": 0.01098819, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.01860309, + "balance_loss_mlp": 1.03417861, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.7643596771229297, + "language_loss": 0.64804506, + "learning_rate": 4.30750506215646e-07, + "loss": 0.66933215, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13198, + "time_per_iteration": 2.534534215927124 + }, + { + "auxiliary_loss_clip": 0.01103865, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01787245, + "balance_loss_mlp": 1.03533065, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.0561177660493453, + "language_loss": 0.72203559, + "learning_rate": 4.30509081032864e-07, + "loss": 0.743379, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 13199, + "time_per_iteration": 2.409954071044922 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01794064, + "balance_loss_mlp": 1.03514385, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 2.5680157152450933, + "language_loss": 0.80811197, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82940185, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 13200, + "time_per_iteration": 2.4604108333587646 + }, + { + "auxiliary_loss_clip": 0.01098579, + "auxiliary_loss_mlp": 0.01028539, + "balance_loss_clip": 1.01745248, + "balance_loss_mlp": 1.0353868, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.627584700503655, + "language_loss": 0.77191329, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.7931844, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 13201, + "time_per_iteration": 2.428744077682495 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.01721644, + "balance_loss_mlp": 1.03374922, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.4615967760668465, + "language_loss": 0.67071187, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69197702, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13202, + "time_per_iteration": 2.4896771907806396 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.02063727, + "balance_loss_mlp": 1.03468037, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.816192931663621, + "language_loss": 0.74804997, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.7693783, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13203, + "time_per_iteration": 2.451380729675293 + }, + { + "auxiliary_loss_clip": 0.01100879, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01872849, + "balance_loss_mlp": 1.03399134, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 2.0709813366174807, + "language_loss": 0.6622262, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68353653, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13204, + "time_per_iteration": 2.4800636768341064 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.01835084, + "balance_loss_mlp": 1.03296351, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3281882721679232, + "language_loss": 0.7925297, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81380415, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 13205, + "time_per_iteration": 2.4787778854370117 + }, + { + "auxiliary_loss_clip": 0.01096536, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.01493573, + "balance_loss_mlp": 1.03291297, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.7942191439670012, + "language_loss": 0.77874231, + "learning_rate": 4.28820771692858e-07, + "loss": 0.7999717, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13206, + "time_per_iteration": 2.499706983566284 + }, + { + "auxiliary_loss_clip": 0.01104173, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.0172863, + "balance_loss_mlp": 1.03587031, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 1.8397672987802902, + "language_loss": 0.79237318, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81370986, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 13207, + "time_per_iteration": 2.4636006355285645 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.02048755, + "balance_loss_mlp": 1.03468966, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.9530235791320048, + "language_loss": 0.84002006, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86133885, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13208, + "time_per_iteration": 2.5083847045898438 + }, + { + "auxiliary_loss_clip": 0.01022967, + "auxiliary_loss_mlp": 0.00999733, + "balance_loss_clip": 0.99883288, + "balance_loss_mlp": 1.00283718, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7192767006915639, + "language_loss": 0.58359563, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60382259, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20117188, + "step": 13209, + "time_per_iteration": 3.1166725158691406 + }, + { + "auxiliary_loss_clip": 0.01105651, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01602447, + "balance_loss_mlp": 1.03716731, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 6.276461119543849, + "language_loss": 0.62636811, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.64770591, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 13210, + "time_per_iteration": 2.5011911392211914 + }, + { + "auxiliary_loss_clip": 0.01100308, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01959693, + "balance_loss_mlp": 1.03402996, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5861692556571285, + "language_loss": 0.68948948, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71079856, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 13211, + "time_per_iteration": 2.5030434131622314 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.02349782, + "balance_loss_mlp": 1.03480554, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.5459525414339919, + "language_loss": 0.72359824, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.7449888, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 13212, + "time_per_iteration": 2.513190984725952 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.01388764, + "balance_loss_mlp": 1.03500962, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.663831013986619, + "language_loss": 0.80758727, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82881892, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13213, + "time_per_iteration": 2.4620864391326904 + }, + { + "auxiliary_loss_clip": 0.01106094, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01679885, + "balance_loss_mlp": 1.03663135, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.2825802582203476, + "language_loss": 0.68319535, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70455045, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 13214, + "time_per_iteration": 2.4502992630004883 + }, + { + "auxiliary_loss_clip": 0.0109771, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01793051, + "balance_loss_mlp": 1.03391325, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.8169772357963099, + "language_loss": 0.72712231, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74838698, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 13215, + "time_per_iteration": 2.4472222328186035 + }, + { + "auxiliary_loss_clip": 0.01101234, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.01751041, + "balance_loss_mlp": 1.03695011, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5004674854133762, + "language_loss": 0.78918624, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.81048369, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13216, + "time_per_iteration": 2.5075128078460693 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.01874018, + "balance_loss_mlp": 1.03463197, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.6163941804337032, + "language_loss": 0.73908085, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76038563, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 13217, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01097689, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.02317882, + "balance_loss_mlp": 1.03437877, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.7536489091121308, + "language_loss": 0.74128562, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76260298, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 13218, + "time_per_iteration": 2.4378395080566406 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02162278, + "balance_loss_mlp": 1.03341603, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.8944799290168057, + "language_loss": 0.83180892, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85316575, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13219, + "time_per_iteration": 2.4046013355255127 + }, + { + "auxiliary_loss_clip": 0.01104407, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.02093291, + "balance_loss_mlp": 1.03578758, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.8955600034556859, + "language_loss": 0.7588414, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.78022164, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 13220, + "time_per_iteration": 3.8154995441436768 + }, + { + "auxiliary_loss_clip": 0.01104021, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.02123189, + "balance_loss_mlp": 1.03558075, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.997206331366737, + "language_loss": 0.72682828, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74820095, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 13221, + "time_per_iteration": 4.05722188949585 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.01026356, + "balance_loss_clip": 1.01540709, + "balance_loss_mlp": 1.03402424, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 1.8234441442382394, + "language_loss": 0.7454437, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76667982, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13222, + "time_per_iteration": 3.82381534576416 + }, + { + "auxiliary_loss_clip": 0.01023305, + "auxiliary_loss_mlp": 0.01001588, + "balance_loss_clip": 1.00058103, + "balance_loss_mlp": 1.00324297, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7682356775746639, + "language_loss": 0.67054129, + "learning_rate": 4.247327522443993e-07, + "loss": 0.6907903, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 13223, + "time_per_iteration": 2.910489797592163 + }, + { + "auxiliary_loss_clip": 0.01098555, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01635742, + "balance_loss_mlp": 1.03264594, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.726476210042691, + "language_loss": 0.7146225, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73589438, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13224, + "time_per_iteration": 2.4960734844207764 + }, + { + "auxiliary_loss_clip": 0.01023049, + "auxiliary_loss_mlp": 0.00999614, + "balance_loss_clip": 0.9985711, + "balance_loss_mlp": 1.00314784, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6952095607048513, + "language_loss": 0.55011863, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57034522, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.19921875, + "step": 13225, + "time_per_iteration": 4.49747109413147 + }, + { + "auxiliary_loss_clip": 0.01097582, + "auxiliary_loss_mlp": 0.01023614, + "balance_loss_clip": 1.01296818, + "balance_loss_mlp": 1.03373742, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 1.932116603626369, + "language_loss": 0.64920199, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67041391, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13226, + "time_per_iteration": 2.492919921875 + }, + { + "auxiliary_loss_clip": 0.01102867, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02517581, + "balance_loss_mlp": 1.03584349, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 3.097889886505811, + "language_loss": 0.70084739, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72224164, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 13227, + "time_per_iteration": 2.55519700050354 + }, + { + "auxiliary_loss_clip": 0.01096905, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.01701736, + "balance_loss_mlp": 1.03388405, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.6685312506793168, + "language_loss": 0.69431317, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71555269, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62890625, + "step": 13228, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.02095747, + "balance_loss_mlp": 1.03423619, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.445985556067254, + "language_loss": 0.70922631, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73056132, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 13229, + "time_per_iteration": 2.6479508876800537 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.01903689, + "balance_loss_mlp": 1.03793633, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.7589665184565293, + "language_loss": 0.71889889, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74027318, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 13230, + "time_per_iteration": 2.5062637329101562 + }, + { + "auxiliary_loss_clip": 0.01022715, + "auxiliary_loss_mlp": 0.01000194, + "balance_loss_clip": 0.9992041, + "balance_loss_mlp": 1.00273073, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8959170552781407, + "language_loss": 0.63557678, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65580589, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 13231, + "time_per_iteration": 3.082951784133911 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.01489472, + "balance_loss_mlp": 1.03479195, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.513210283199707, + "language_loss": 0.69656473, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.71782291, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 13232, + "time_per_iteration": 2.440912961959839 + }, + { + "auxiliary_loss_clip": 0.01099299, + "auxiliary_loss_mlp": 0.01027268, + "balance_loss_clip": 1.01572859, + "balance_loss_mlp": 1.03366399, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.5695652916232832, + "language_loss": 0.77775937, + "learning_rate": 4.223360961792952e-07, + "loss": 0.79902506, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13233, + "time_per_iteration": 2.5125248432159424 + }, + { + "auxiliary_loss_clip": 0.01100048, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.01780808, + "balance_loss_mlp": 1.03384972, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 2.4242376989153183, + "language_loss": 0.78652054, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80781317, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 13234, + "time_per_iteration": 2.468038558959961 + }, + { + "auxiliary_loss_clip": 0.0109831, + "auxiliary_loss_mlp": 0.01023983, + "balance_loss_clip": 1.01374316, + "balance_loss_mlp": 1.03441608, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.5905892668664205, + "language_loss": 0.70050478, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72172773, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 13235, + "time_per_iteration": 2.493274450302124 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.01598716, + "balance_loss_mlp": 1.03456783, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.4327288828899616, + "language_loss": 0.6766414, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.69792509, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13236, + "time_per_iteration": 2.484380006790161 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01026353, + "balance_loss_clip": 1.01505828, + "balance_loss_mlp": 1.03377748, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.7411950179861415, + "language_loss": 0.75172085, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77296317, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13237, + "time_per_iteration": 2.4766552448272705 + }, + { + "auxiliary_loss_clip": 0.01101102, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01983571, + "balance_loss_mlp": 1.03548527, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 1.9189361259680966, + "language_loss": 0.71440208, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73573601, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 13238, + "time_per_iteration": 2.456925392150879 + }, + { + "auxiliary_loss_clip": 0.01099911, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01485622, + "balance_loss_mlp": 1.0334146, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.8980651664510928, + "language_loss": 0.73918056, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.76044405, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13239, + "time_per_iteration": 2.443584680557251 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.02093184, + "balance_loss_mlp": 1.03495479, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.6947466268706028, + "language_loss": 0.69046456, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71182698, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 13240, + "time_per_iteration": 2.4764912128448486 + }, + { + "auxiliary_loss_clip": 0.01022946, + "auxiliary_loss_mlp": 0.00999029, + "balance_loss_clip": 0.99800378, + "balance_loss_mlp": 1.00302553, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8878157964624488, + "language_loss": 0.58645731, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60667706, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.19921875, + "step": 13241, + "time_per_iteration": 2.843022108078003 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.01764321, + "balance_loss_mlp": 1.03436446, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.8253771110110306, + "language_loss": 0.64276886, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66404617, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13242, + "time_per_iteration": 2.689807653427124 + }, + { + "auxiliary_loss_clip": 0.01100947, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01986313, + "balance_loss_mlp": 1.03429365, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 2.1932509816632235, + "language_loss": 0.75971556, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78104436, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 13243, + "time_per_iteration": 2.48710298538208 + }, + { + "auxiliary_loss_clip": 0.01101282, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.01662683, + "balance_loss_mlp": 1.03448629, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.748658102628615, + "language_loss": 0.7998516, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.8211475, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 13244, + "time_per_iteration": 2.475694179534912 + }, + { + "auxiliary_loss_clip": 0.01103079, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.01651323, + "balance_loss_mlp": 1.03422666, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 1.9995558497633756, + "language_loss": 0.67953658, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70085227, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 13245, + "time_per_iteration": 2.4532089233398438 + }, + { + "auxiliary_loss_clip": 0.01101276, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.02030444, + "balance_loss_mlp": 1.03515017, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.3612442472486292, + "language_loss": 0.78971922, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81105065, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13246, + "time_per_iteration": 2.440587282180786 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.0149411, + "balance_loss_mlp": 1.0336647, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.6847390016039745, + "language_loss": 0.66190958, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68319428, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13247, + "time_per_iteration": 2.487718343734741 + }, + { + "auxiliary_loss_clip": 0.01097373, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.01829112, + "balance_loss_mlp": 1.03314734, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.8639636552336234, + "language_loss": 0.71457285, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73583645, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 13248, + "time_per_iteration": 2.474893093109131 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01695275, + "balance_loss_mlp": 1.03439724, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.0200427415060416, + "language_loss": 0.7616542, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78296602, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13249, + "time_per_iteration": 2.48595929145813 + }, + { + "auxiliary_loss_clip": 0.01099446, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01975679, + "balance_loss_mlp": 1.03507221, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.126182284443467, + "language_loss": 0.61335742, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63466233, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13250, + "time_per_iteration": 2.4277217388153076 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01375568, + "balance_loss_mlp": 1.03543615, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.322541545659239, + "language_loss": 0.72526091, + "learning_rate": 4.180371972938206e-07, + "loss": 0.7465288, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 13251, + "time_per_iteration": 2.4575724601745605 + }, + { + "auxiliary_loss_clip": 0.01103859, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.0152247, + "balance_loss_mlp": 1.03521776, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.8469414013396577, + "language_loss": 0.72915018, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75046682, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 13252, + "time_per_iteration": 2.4559550285339355 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.01549911, + "balance_loss_mlp": 1.03332281, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.6873706589511155, + "language_loss": 0.66239917, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68363321, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13253, + "time_per_iteration": 2.538630962371826 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.02353644, + "balance_loss_mlp": 1.03758895, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.5649254820848235, + "language_loss": 0.67826599, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69967413, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 13254, + "time_per_iteration": 2.4423506259918213 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.02485597, + "balance_loss_mlp": 1.03396869, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.8439634807377834, + "language_loss": 0.69335532, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71470052, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13255, + "time_per_iteration": 2.4770302772521973 + }, + { + "auxiliary_loss_clip": 0.01098301, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.01932132, + "balance_loss_mlp": 1.03357673, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.0839299199597576, + "language_loss": 0.79384631, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81513351, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13256, + "time_per_iteration": 2.435065507888794 + }, + { + "auxiliary_loss_clip": 0.01100559, + "auxiliary_loss_mlp": 0.01025356, + "balance_loss_clip": 1.01379228, + "balance_loss_mlp": 1.03476715, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.7632548016359857, + "language_loss": 0.65341133, + "learning_rate": 4.166085475424315e-07, + "loss": 0.67467046, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13257, + "time_per_iteration": 2.4952993392944336 + }, + { + "auxiliary_loss_clip": 0.01106098, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02211094, + "balance_loss_mlp": 1.03727162, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 4.269740157114163, + "language_loss": 0.72265047, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74405068, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 13258, + "time_per_iteration": 2.422609806060791 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.03580558, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.7787889345265135, + "language_loss": 0.68876815, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.7101053, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 13259, + "time_per_iteration": 2.454787015914917 + }, + { + "auxiliary_loss_clip": 0.01098869, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01853442, + "balance_loss_mlp": 1.03478527, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.6665251005798685, + "language_loss": 0.73773205, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75901318, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13260, + "time_per_iteration": 2.491205930709839 + }, + { + "auxiliary_loss_clip": 0.01097155, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.01763427, + "balance_loss_mlp": 1.03306603, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.7740121958206554, + "language_loss": 0.78436148, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80561745, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13261, + "time_per_iteration": 3.8501453399658203 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.02008629, + "balance_loss_mlp": 1.035748, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.4361813599632072, + "language_loss": 0.75999635, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78126872, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.61328125, + "step": 13262, + "time_per_iteration": 2.4577090740203857 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.01496387, + "balance_loss_mlp": 1.03712356, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.1431092546500103, + "language_loss": 0.7052893, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7266413, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 13263, + "time_per_iteration": 3.8635799884796143 + }, + { + "auxiliary_loss_clip": 0.01106881, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.021523, + "balance_loss_mlp": 1.03610015, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.9663243641140786, + "language_loss": 0.71254778, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73395979, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 13264, + "time_per_iteration": 3.8191962242126465 + }, + { + "auxiliary_loss_clip": 0.01098223, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.0220921, + "balance_loss_mlp": 1.03471494, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.6219090858782177, + "language_loss": 0.76819849, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.78951454, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 13265, + "time_per_iteration": 2.4498677253723145 + }, + { + "auxiliary_loss_clip": 0.0110135, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03420353, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.8502756325316978, + "language_loss": 0.75627744, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77757037, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 13266, + "time_per_iteration": 2.4424939155578613 + }, + { + "auxiliary_loss_clip": 0.01097761, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.01613104, + "balance_loss_mlp": 1.03291893, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 1.5381451690373484, + "language_loss": 0.83917278, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86042428, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13267, + "time_per_iteration": 4.030280113220215 + }, + { + "auxiliary_loss_clip": 0.01099973, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.02045047, + "balance_loss_mlp": 1.03457189, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.7026811780981197, + "language_loss": 0.75749743, + "learning_rate": 4.139949716968223e-07, + "loss": 0.77881831, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13268, + "time_per_iteration": 2.4395506381988525 + }, + { + "auxiliary_loss_clip": 0.01101025, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.01612282, + "balance_loss_mlp": 1.0355351, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.5399567563780987, + "language_loss": 0.77794158, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.7992276, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13269, + "time_per_iteration": 2.4894964694976807 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.01972592, + "balance_loss_mlp": 1.03359246, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.7083868858848195, + "language_loss": 0.82055652, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84182805, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13270, + "time_per_iteration": 2.4561750888824463 + }, + { + "auxiliary_loss_clip": 0.01101524, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.0165925, + "balance_loss_mlp": 1.03613377, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.6705229084811413, + "language_loss": 0.595366, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61666214, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13271, + "time_per_iteration": 2.4954357147216797 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.01961303, + "balance_loss_mlp": 1.03635263, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.5850933882113063, + "language_loss": 0.73206866, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75342935, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 13272, + "time_per_iteration": 2.509640693664551 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01630759, + "balance_loss_mlp": 1.03711224, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 2.009910797942707, + "language_loss": 0.71586084, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73716193, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13273, + "time_per_iteration": 2.488239049911499 + }, + { + "auxiliary_loss_clip": 0.01103696, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01880777, + "balance_loss_mlp": 1.03583157, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.887561029731992, + "language_loss": 0.7577731, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77911627, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 13274, + "time_per_iteration": 2.462188482284546 + }, + { + "auxiliary_loss_clip": 0.01095507, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01827025, + "balance_loss_mlp": 1.03438878, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3212411504254853, + "language_loss": 0.77607358, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79731625, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.61328125, + "step": 13275, + "time_per_iteration": 2.514090061187744 + }, + { + "auxiliary_loss_clip": 0.0110285, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01921892, + "balance_loss_mlp": 1.0345515, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.829043802525264, + "language_loss": 0.64052433, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66187114, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 13276, + "time_per_iteration": 2.5371670722961426 + }, + { + "auxiliary_loss_clip": 0.01097788, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.01740062, + "balance_loss_mlp": 1.03425479, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.5295363534394828, + "language_loss": 0.60448158, + "learning_rate": 4.118620036501945e-07, + "loss": 0.62574387, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 13277, + "time_per_iteration": 2.4880197048187256 + }, + { + "auxiliary_loss_clip": 0.01105128, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01911783, + "balance_loss_mlp": 1.0375464, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.008631814369184, + "language_loss": 0.79715037, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81850541, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 13278, + "time_per_iteration": 2.4780664443969727 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02485633, + "balance_loss_mlp": 1.03559947, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.9965492403610876, + "language_loss": 0.6323722, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65378356, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13279, + "time_per_iteration": 2.4683034420013428 + }, + { + "auxiliary_loss_clip": 0.01095285, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.01601911, + "balance_loss_mlp": 1.03389192, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.6504787755208947, + "language_loss": 0.70773625, + "learning_rate": 4.111520979802825e-07, + "loss": 0.72895384, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.61328125, + "step": 13280, + "time_per_iteration": 2.4923903942108154 + }, + { + "auxiliary_loss_clip": 0.01103118, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01919234, + "balance_loss_mlp": 1.03547907, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.6234618647236767, + "language_loss": 0.62751859, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64886189, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13281, + "time_per_iteration": 2.5414252281188965 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.02107787, + "balance_loss_mlp": 1.03481054, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.7029379552600752, + "language_loss": 0.80491292, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82629329, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 13282, + "time_per_iteration": 2.4520959854125977 + }, + { + "auxiliary_loss_clip": 0.01101884, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01801062, + "balance_loss_mlp": 1.03421319, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.8947522031030082, + "language_loss": 0.7154727, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73679316, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 13283, + "time_per_iteration": 2.4246160984039307 + }, + { + "auxiliary_loss_clip": 0.01098743, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01833189, + "balance_loss_mlp": 1.03302252, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8968441964994822, + "language_loss": 0.7347362, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75601751, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13284, + "time_per_iteration": 2.402165174484253 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.02016854, + "balance_loss_mlp": 1.03526652, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.5742258488227296, + "language_loss": 0.70226932, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72357547, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 13285, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.01627612, + "balance_loss_mlp": 1.03425968, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.5373042942121937, + "language_loss": 0.73492497, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75618953, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 13286, + "time_per_iteration": 2.435335874557495 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.03493309, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 2.2146294120164876, + "language_loss": 0.74433863, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.76566875, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13287, + "time_per_iteration": 2.4583966732025146 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.0171752, + "balance_loss_mlp": 1.03461981, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 2.4603095156491457, + "language_loss": 0.61630833, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63758349, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 13288, + "time_per_iteration": 2.5024874210357666 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.0102498, + "balance_loss_clip": 1.0142808, + "balance_loss_mlp": 1.03618968, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.398551145532932, + "language_loss": 0.70419228, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72544491, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 13289, + "time_per_iteration": 2.5227341651916504 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.01975262, + "balance_loss_mlp": 1.03570962, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 3.476010785150094, + "language_loss": 0.62750173, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64881819, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.63671875, + "step": 13290, + "time_per_iteration": 2.413945436477661 + }, + { + "auxiliary_loss_clip": 0.01103234, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.0152936, + "balance_loss_mlp": 1.03642523, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 3.9151132007409513, + "language_loss": 0.71637499, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73768604, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 13291, + "time_per_iteration": 2.4885568618774414 + }, + { + "auxiliary_loss_clip": 0.01097167, + "auxiliary_loss_mlp": 0.01025771, + "balance_loss_clip": 1.01506627, + "balance_loss_mlp": 1.03270483, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.5485118073746154, + "language_loss": 0.6335237, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65475303, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13292, + "time_per_iteration": 2.4616239070892334 + }, + { + "auxiliary_loss_clip": 0.0109979, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.0185678, + "balance_loss_mlp": 1.0349102, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.555240655733236, + "language_loss": 0.56249213, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58378512, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13293, + "time_per_iteration": 2.5668938159942627 + }, + { + "auxiliary_loss_clip": 0.01101281, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02145505, + "balance_loss_mlp": 1.03606391, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.987312394872763, + "language_loss": 0.71444452, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73578554, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13294, + "time_per_iteration": 2.693946361541748 + }, + { + "auxiliary_loss_clip": 0.01101257, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.0196135, + "balance_loss_mlp": 1.03554058, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.7329593206167035, + "language_loss": 0.72202832, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74334961, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13295, + "time_per_iteration": 2.440544605255127 + }, + { + "auxiliary_loss_clip": 0.01096658, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01995802, + "balance_loss_mlp": 1.03398633, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 2.1702200839393395, + "language_loss": 0.76480281, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78607565, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 13296, + "time_per_iteration": 2.4405605792999268 + }, + { + "auxiliary_loss_clip": 0.01023152, + "auxiliary_loss_mlp": 0.01002637, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00325012, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6861737124330846, + "language_loss": 0.60802543, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62828332, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.19921875, + "step": 13297, + "time_per_iteration": 3.11775541305542 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.01829767, + "balance_loss_mlp": 1.03441751, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 1.8885665209520346, + "language_loss": 0.70239675, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72368801, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 13298, + "time_per_iteration": 2.4225876331329346 + }, + { + "auxiliary_loss_clip": 0.01103672, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02003491, + "balance_loss_mlp": 1.0351696, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 1.9550250872317747, + "language_loss": 0.75762308, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77898747, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 13299, + "time_per_iteration": 2.4788718223571777 + }, + { + "auxiliary_loss_clip": 0.01098072, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.01662207, + "balance_loss_mlp": 1.03498912, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.7719100438283584, + "language_loss": 0.77760887, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79886186, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 13300, + "time_per_iteration": 2.4796881675720215 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.01584315, + "balance_loss_mlp": 1.03424072, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 2.002040406516657, + "language_loss": 0.63432777, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65562272, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 13301, + "time_per_iteration": 2.4858558177948 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02241445, + "balance_loss_mlp": 1.0356431, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.6279257021355094, + "language_loss": 0.71502745, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73635173, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 13302, + "time_per_iteration": 2.4388864040374756 + }, + { + "auxiliary_loss_clip": 0.01104001, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.01885068, + "balance_loss_mlp": 1.03659701, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 1.833344875316907, + "language_loss": 0.83622801, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85757518, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13303, + "time_per_iteration": 3.86017107963562 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.02020907, + "balance_loss_mlp": 1.03435075, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.6100512541022713, + "language_loss": 0.5873881, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60867614, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 13304, + "time_per_iteration": 2.5343167781829834 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.01525831, + "balance_loss_mlp": 1.03274465, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.7664004546927765, + "language_loss": 0.69075799, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71204102, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 13305, + "time_per_iteration": 3.846991777420044 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.01026107, + "balance_loss_clip": 1.0154916, + "balance_loss_mlp": 1.03421581, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5410766724401597, + "language_loss": 0.6923117, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71355259, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13306, + "time_per_iteration": 3.824300527572632 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02248406, + "balance_loss_mlp": 1.03599465, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.5349326427116308, + "language_loss": 0.69361722, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71497267, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13307, + "time_per_iteration": 2.641338348388672 + }, + { + "auxiliary_loss_clip": 0.01100663, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01857102, + "balance_loss_mlp": 1.03444958, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 1.9021997746458712, + "language_loss": 0.76933712, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79063845, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 13308, + "time_per_iteration": 2.449411630630493 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03450274, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.9120896372435958, + "language_loss": 0.78702182, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80834526, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 13309, + "time_per_iteration": 4.006925106048584 + }, + { + "auxiliary_loss_clip": 0.01023336, + "auxiliary_loss_mlp": 0.0100397, + "balance_loss_clip": 1.00295115, + "balance_loss_mlp": 1.00320923, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9338382930636256, + "language_loss": 0.64702326, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66729629, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 13310, + "time_per_iteration": 2.975738525390625 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.01034343, + "balance_loss_clip": 1.02319705, + "balance_loss_mlp": 1.03539252, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 4.652395781854749, + "language_loss": 0.82905459, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85040295, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13311, + "time_per_iteration": 2.5010745525360107 + }, + { + "auxiliary_loss_clip": 0.01100391, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.0205307, + "balance_loss_mlp": 1.03499472, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.136696844503174, + "language_loss": 0.6653198, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68664443, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13312, + "time_per_iteration": 2.413475275039673 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 287185980, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.1194871478254305e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/training_args.bin b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b9a73eb97a1ef37776f0d97a0590d802e6f8d5a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a0c59c7a64d6e018f6d41a91f3e718772a260e91597586a7ce64cd9f7d3d0c6 +size 7992 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/zero_to_fp32.py b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-13312/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/added_tokens.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97409ed874967d8d79c126c028d286e8fe8e1484 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/generation_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/latest b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/latest new file mode 100644 index 0000000000000000000000000000000000000000..e4087b037c4d90a88f08b57160ddc65e74a0c271 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/latest @@ -0,0 +1 @@ +global_step16632 \ No newline at end of file diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00001-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5b7321e207c9353462ae136d4ede44f47689416 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7722796541d52a39377cd2531b8fc433ef6a093a1f99987883d2012a4e94dc7 +size 4972489328 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00002-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bdaa4fc2ab26723da186e277a0048acfef73ad4b --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:616b9978b385c3409b052f6f7f74f22fa04dce5c37b3688f820876c5b2c4f4ba +size 4985529648 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00003-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c65459b5ca40d134a874af93f90b967622398fc9 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921925205fa91bb7815f7d11a90db2d2a7b7e9c4e710f02241163cf3cb7e133d +size 248943552 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model.safetensors.index.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..aa54419fc0a3eab502aa7c4ad974dca52ed10803 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10206819456 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_0.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1e6773e944015af0e83161fa2d20fe7d469fd7f --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22271cc36f268c0b3e870b3930ac590fd40a4a3cd3a88aed74f78e5f8790aceb +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_1.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a834a7be015ebd36883cec3bb92a8657936cd0a6 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19435e9be5d4b837d96fc2e9286e23e27344bb6ad3222ef1b9d207e6b2bb8c78 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_2.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f1b991258d274ff5481ace768d5b6702d919d50 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2902ec572b1b2f1a6a78f8979353bf31953eacdc78b129cc34a9f04c1de9b8d5 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_3.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee742fbd21912a77c2d25fe5ca60af4403668637 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a011e80ba323d1fcabf31eaea4d2bc397efadb23603b4248f0067ff8ca3987 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/special_tokens_map.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/tokenizer.model b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/tokenizer_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/trainer_state.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5a2709716089ddbb40ce650e94a1aad83fd4ecaf --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/trainer_state.json @@ -0,0 +1,282777 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05015663, + "auxiliary_loss_mlp": 0.02215404, + "balance_loss_clip": 1.76946592, + "balance_loss_mlp": 2.42247009, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.31846269900138, + "language_loss": 2.84849024, + "learning_rate": 0.0, + "loss": 1.94356799, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 26.0, + "step": 1, + "time_per_iteration": 14.062297821044922 + }, + { + "auxiliary_loss_clip": 0.03371575, + "auxiliary_loss_mlp": 0.01459085, + "balance_loss_clip": 1.18919563, + "balance_loss_mlp": 1.61943495, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.71678092445231, + "language_loss": 1.82690942, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87521601, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 17.5, + "step": 2, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.03311525, + "auxiliary_loss_mlp": 0.014397, + "balance_loss_clip": 1.18697679, + "balance_loss_mlp": 1.61685562, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 34.59102075188436, + "language_loss": 1.57529902, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62281132, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 17.0, + "step": 3, + "time_per_iteration": 2.4145541191101074 + }, + { + "auxiliary_loss_clip": 0.03353861, + "auxiliary_loss_mlp": 0.01449549, + "balance_loss_clip": 1.15390992, + "balance_loss_mlp": 1.61571431, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.728740512395206, + "language_loss": 1.67595887, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72399294, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.375, + "step": 4, + "time_per_iteration": 2.466392993927002 + }, + { + "auxiliary_loss_clip": 0.03393634, + "auxiliary_loss_mlp": 0.01505687, + "balance_loss_clip": 1.21710527, + "balance_loss_mlp": 1.61638641, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.74196654651921, + "language_loss": 1.90851176, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95750499, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 17.75, + "step": 5, + "time_per_iteration": 2.6828246116638184 + }, + { + "auxiliary_loss_clip": 0.03361898, + "auxiliary_loss_mlp": 0.01518906, + "balance_loss_clip": 1.22441149, + "balance_loss_mlp": 1.60614848, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.32400799743486, + "language_loss": 1.6094954, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6583035, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.5, + "step": 6, + "time_per_iteration": 2.660855770111084 + }, + { + "auxiliary_loss_clip": 0.03345758, + "auxiliary_loss_mlp": 0.01485904, + "balance_loss_clip": 1.20209074, + "balance_loss_mlp": 1.60783124, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.76365346454933, + "language_loss": 1.53346825, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58178496, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.375, + "step": 7, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.03312894, + "auxiliary_loss_mlp": 0.01444018, + "balance_loss_clip": 1.16630852, + "balance_loss_mlp": 1.60320723, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.923588970831496, + "language_loss": 1.43687642, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48444545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 17.0, + "step": 8, + "time_per_iteration": 2.779961109161377 + }, + { + "auxiliary_loss_clip": 0.03360351, + "auxiliary_loss_mlp": 0.01496215, + "balance_loss_clip": 1.21144783, + "balance_loss_mlp": 1.60258842, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.084887526361417, + "language_loss": 1.49955618, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54812181, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.5, + "step": 9, + "time_per_iteration": 2.799635887145996 + }, + { + "auxiliary_loss_clip": 0.03302188, + "auxiliary_loss_mlp": 0.01477479, + "balance_loss_clip": 1.20797062, + "balance_loss_mlp": 1.6070832, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.45187310710616, + "language_loss": 1.44727731, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49507403, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 17.0, + "step": 10, + "time_per_iteration": 2.6989152431488037 + }, + { + "auxiliary_loss_clip": 0.03356835, + "auxiliary_loss_mlp": 0.01493566, + "balance_loss_clip": 1.21928966, + "balance_loss_mlp": 1.61121845, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.63867113279811, + "language_loss": 1.45021069, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.4987148, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 17.5, + "step": 11, + "time_per_iteration": 2.6820693016052246 + }, + { + "auxiliary_loss_clip": 0.0328584, + "auxiliary_loss_mlp": 0.01449969, + "balance_loss_clip": 1.17378449, + "balance_loss_mlp": 1.59900761, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.861449854609447, + "language_loss": 1.45122719, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49858522, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 16.875, + "step": 12, + "time_per_iteration": 2.631218910217285 + }, + { + "auxiliary_loss_clip": 0.03313605, + "auxiliary_loss_mlp": 0.01404342, + "balance_loss_clip": 1.14589679, + "balance_loss_mlp": 1.60898232, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.176593153687291, + "language_loss": 1.24100113, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28818083, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 17.125, + "step": 13, + "time_per_iteration": 2.6961779594421387 + }, + { + "auxiliary_loss_clip": 0.03282163, + "auxiliary_loss_mlp": 0.01472629, + "balance_loss_clip": 1.20464635, + "balance_loss_mlp": 1.60534358, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.7580183597057975, + "language_loss": 1.20611417, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25366211, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 16.75, + "step": 14, + "time_per_iteration": 2.6555092334747314 + }, + { + "auxiliary_loss_clip": 0.0326835, + "auxiliary_loss_mlp": 0.01431945, + "balance_loss_clip": 1.16815877, + "balance_loss_mlp": 1.6104542, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.4839782289009085, + "language_loss": 1.12832427, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.1753273, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 16.5, + "step": 15, + "time_per_iteration": 2.717512607574463 + }, + { + "auxiliary_loss_clip": 0.03231722, + "auxiliary_loss_mlp": 0.01412441, + "balance_loss_clip": 1.16257811, + "balance_loss_mlp": 1.59521294, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.584872954405151, + "language_loss": 1.1119349, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15837646, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 16.375, + "step": 16, + "time_per_iteration": 2.7170701026916504 + }, + { + "auxiliary_loss_clip": 0.03220058, + "auxiliary_loss_mlp": 0.0141779, + "balance_loss_clip": 1.17784595, + "balance_loss_mlp": 1.60289145, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.285773165398426, + "language_loss": 1.1253047, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17168307, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 16.125, + "step": 17, + "time_per_iteration": 2.6125564575195312 + }, + { + "auxiliary_loss_clip": 0.0315575, + "auxiliary_loss_mlp": 0.01378857, + "balance_loss_clip": 1.14730477, + "balance_loss_mlp": 1.60051179, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.8094646515897193, + "language_loss": 1.08149433, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12684035, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 15.5625, + "step": 18, + "time_per_iteration": 5.593315362930298 + }, + { + "auxiliary_loss_clip": 0.03181327, + "auxiliary_loss_mlp": 0.01400224, + "balance_loss_clip": 1.13548398, + "balance_loss_mlp": 1.59901524, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.551402579460018, + "language_loss": 1.02296436, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06877995, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 15.8125, + "step": 19, + "time_per_iteration": 2.6462903022766113 + }, + { + "auxiliary_loss_clip": 0.0312444, + "auxiliary_loss_mlp": 0.01341166, + "balance_loss_clip": 1.12096262, + "balance_loss_mlp": 1.60122275, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.049985155187145, + "language_loss": 1.16660511, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21126115, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 15.25, + "step": 20, + "time_per_iteration": 2.687962293624878 + }, + { + "auxiliary_loss_clip": 0.03111088, + "auxiliary_loss_mlp": 0.01380381, + "balance_loss_clip": 1.13109064, + "balance_loss_mlp": 1.58184814, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 8.855966691950416, + "language_loss": 1.06044388, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.1053586, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 15.3125, + "step": 21, + "time_per_iteration": 2.705784320831299 + }, + { + "auxiliary_loss_clip": 0.03006166, + "auxiliary_loss_mlp": 0.0138104, + "balance_loss_clip": 1.14758062, + "balance_loss_mlp": 1.56386232, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.503731577984969, + "language_loss": 1.05752254, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10139465, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 14.4375, + "step": 22, + "time_per_iteration": 2.714902400970459 + }, + { + "auxiliary_loss_clip": 0.02958535, + "auxiliary_loss_mlp": 0.01337723, + "balance_loss_clip": 1.12743819, + "balance_loss_mlp": 1.56545472, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8887485842740657, + "language_loss": 0.91820848, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96117103, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 13.9375, + "step": 23, + "time_per_iteration": 2.6802501678466797 + }, + { + "auxiliary_loss_clip": 0.02925568, + "auxiliary_loss_mlp": 0.0136327, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.55789983, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.888412626700388, + "language_loss": 1.08090949, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12379789, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 13.6875, + "step": 24, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.02818042, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.11892343, + "balance_loss_mlp": 1.55278993, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.5526652768314877, + "language_loss": 1.01197755, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05345201, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 12.6875, + "step": 25, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.02811065, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 1.10196424, + "balance_loss_mlp": 1.55557573, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8866965715457127, + "language_loss": 1.0650332, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10625291, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 12.5625, + "step": 26, + "time_per_iteration": 2.6561954021453857 + }, + { + "auxiliary_loss_clip": 0.02754337, + "auxiliary_loss_mlp": 0.01325989, + "balance_loss_clip": 1.12600398, + "balance_loss_mlp": 1.54593086, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 8.480879524297928, + "language_loss": 0.95465469, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99545801, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 12.0625, + "step": 27, + "time_per_iteration": 2.717332363128662 + }, + { + "auxiliary_loss_clip": 0.02732017, + "auxiliary_loss_mlp": 0.0131313, + "balance_loss_clip": 1.13174081, + "balance_loss_mlp": 1.55085063, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7582152185230338, + "language_loss": 1.06276608, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1032176, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 11.8125, + "step": 28, + "time_per_iteration": 2.6645846366882324 + }, + { + "auxiliary_loss_clip": 0.02698877, + "auxiliary_loss_mlp": 0.01319704, + "balance_loss_clip": 1.1339283, + "balance_loss_mlp": 1.5357703, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.703793609192777, + "language_loss": 1.02653611, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06672192, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 11.625, + "step": 29, + "time_per_iteration": 2.6647088527679443 + }, + { + "auxiliary_loss_clip": 0.02692806, + "auxiliary_loss_mlp": 0.01313595, + "balance_loss_clip": 1.12667465, + "balance_loss_mlp": 1.53252506, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.824163422844594, + "language_loss": 1.1929419, + "learning_rate": 2.189868360711334e-06, + "loss": 1.233006, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 11.625, + "step": 30, + "time_per_iteration": 2.6305816173553467 + }, + { + "auxiliary_loss_clip": 0.02610821, + "auxiliary_loss_mlp": 0.01338782, + "balance_loss_clip": 1.15748882, + "balance_loss_mlp": 1.51829374, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.55861683808779, + "language_loss": 1.02499342, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06448936, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 10.9375, + "step": 31, + "time_per_iteration": 2.71045184135437 + }, + { + "auxiliary_loss_clip": 0.02583705, + "auxiliary_loss_mlp": 0.01332414, + "balance_loss_clip": 1.15245557, + "balance_loss_mlp": 1.52035046, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.526137445187824, + "language_loss": 0.95697796, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99613917, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 10.625, + "step": 32, + "time_per_iteration": 2.626783847808838 + }, + { + "auxiliary_loss_clip": 0.02566919, + "auxiliary_loss_mlp": 0.01304168, + "balance_loss_clip": 1.13670313, + "balance_loss_mlp": 1.51655078, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.344933729659458, + "language_loss": 0.95465255, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99336338, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 10.5, + "step": 33, + "time_per_iteration": 2.645725727081299 + }, + { + "auxiliary_loss_clip": 0.02433039, + "auxiliary_loss_mlp": 0.013041, + "balance_loss_clip": 1.14569449, + "balance_loss_mlp": 1.48877192, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 4.808068329548225, + "language_loss": 0.91556877, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95294011, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 9.4375, + "step": 34, + "time_per_iteration": 2.7327146530151367 + }, + { + "auxiliary_loss_clip": 0.02385913, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.1172576, + "balance_loss_mlp": 1.45172572, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.948252640490764, + "language_loss": 0.76639408, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80298984, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 9.375, + "step": 35, + "time_per_iteration": 2.940739870071411 + }, + { + "auxiliary_loss_clip": 0.02360979, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 1.12769413, + "balance_loss_mlp": 1.46427846, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.1659182072135064, + "language_loss": 0.89043307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92678845, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 8.9375, + "step": 36, + "time_per_iteration": 2.693335771560669 + }, + { + "auxiliary_loss_clip": 0.02305413, + "auxiliary_loss_mlp": 0.01335093, + "balance_loss_clip": 1.18574798, + "balance_loss_mlp": 1.45221901, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.3248653771669416, + "language_loss": 0.93231332, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96871841, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 8.5, + "step": 37, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.02264412, + "auxiliary_loss_mlp": 0.01277806, + "balance_loss_clip": 1.15373349, + "balance_loss_mlp": 1.44697845, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1191864106647906, + "language_loss": 1.04275775, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07817996, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 8.1875, + "step": 38, + "time_per_iteration": 2.674187183380127 + }, + { + "auxiliary_loss_clip": 0.02234117, + "auxiliary_loss_mlp": 0.01257339, + "balance_loss_clip": 1.13164425, + "balance_loss_mlp": 1.44101977, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.2707505194681685, + "language_loss": 0.85635245, + "learning_rate": 2.358792165262154e-06, + "loss": 0.891267, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 7.9375, + "step": 39, + "time_per_iteration": 2.716417074203491 + }, + { + "auxiliary_loss_clip": 0.02209554, + "auxiliary_loss_mlp": 0.01248677, + "balance_loss_clip": 1.1173557, + "balance_loss_mlp": 1.43176007, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.874633531970748, + "language_loss": 0.90416026, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93874258, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 7.78125, + "step": 40, + "time_per_iteration": 2.621108055114746 + }, + { + "auxiliary_loss_clip": 0.02158681, + "auxiliary_loss_mlp": 0.01271709, + "balance_loss_clip": 1.15626693, + "balance_loss_mlp": 1.42207694, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.842521317695652, + "language_loss": 0.93497038, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96927428, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 7.375, + "step": 41, + "time_per_iteration": 2.66089129447937 + }, + { + "auxiliary_loss_clip": 0.0212207, + "auxiliary_loss_mlp": 0.0125263, + "balance_loss_clip": 1.14720106, + "balance_loss_mlp": 1.41368401, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.5963223670672635, + "language_loss": 0.97454929, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00829637, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 7.09375, + "step": 42, + "time_per_iteration": 2.63149094581604 + }, + { + "auxiliary_loss_clip": 0.02082851, + "auxiliary_loss_mlp": 0.01298258, + "balance_loss_clip": 1.18939614, + "balance_loss_mlp": 1.41430426, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.9545418034556814, + "language_loss": 0.97656071, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01037169, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 6.6875, + "step": 43, + "time_per_iteration": 2.7244436740875244 + }, + { + "auxiliary_loss_clip": 0.02102024, + "auxiliary_loss_mlp": 0.01311792, + "balance_loss_clip": 1.19706488, + "balance_loss_mlp": 1.4130851, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0531245010632473, + "language_loss": 0.93701768, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97115582, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 6.875, + "step": 44, + "time_per_iteration": 2.6628317832946777 + }, + { + "auxiliary_loss_clip": 0.02065563, + "auxiliary_loss_mlp": 0.01272457, + "balance_loss_clip": 1.17236853, + "balance_loss_mlp": 1.41084957, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 9.3374631511207, + "language_loss": 0.98937047, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02275062, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 6.5625, + "step": 45, + "time_per_iteration": 2.7355775833129883 + }, + { + "auxiliary_loss_clip": 0.02040064, + "auxiliary_loss_mlp": 0.01227769, + "balance_loss_clip": 1.13831401, + "balance_loss_mlp": 1.39673805, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8055823424878037, + "language_loss": 1.02792716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06060553, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 6.4375, + "step": 46, + "time_per_iteration": 2.7488839626312256 + }, + { + "auxiliary_loss_clip": 0.02002379, + "auxiliary_loss_mlp": 0.01270193, + "balance_loss_clip": 1.17773402, + "balance_loss_mlp": 1.38648152, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.971366079361506, + "language_loss": 0.88043427, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91315997, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 6.15625, + "step": 47, + "time_per_iteration": 2.845005512237549 + }, + { + "auxiliary_loss_clip": 0.01963914, + "auxiliary_loss_mlp": 0.01252908, + "balance_loss_clip": 1.16493094, + "balance_loss_mlp": 1.37624073, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.070099145794898, + "language_loss": 0.87949276, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91166103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 5.875, + "step": 48, + "time_per_iteration": 2.7514398097991943 + }, + { + "auxiliary_loss_clip": 0.01962956, + "auxiliary_loss_mlp": 0.01244481, + "balance_loss_clip": 1.15078259, + "balance_loss_mlp": 1.36602139, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.366138839739612, + "language_loss": 0.89877701, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93085134, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 6.0, + "step": 49, + "time_per_iteration": 2.743236541748047 + }, + { + "auxiliary_loss_clip": 0.01955947, + "auxiliary_loss_mlp": 0.01232227, + "balance_loss_clip": 1.14534748, + "balance_loss_mlp": 1.36045313, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.8158483763506914, + "language_loss": 0.91078663, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94266832, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 5.9375, + "step": 50, + "time_per_iteration": 2.6860456466674805 + }, + { + "auxiliary_loss_clip": 0.01953364, + "auxiliary_loss_mlp": 0.01201227, + "balance_loss_clip": 1.11778045, + "balance_loss_mlp": 1.36547732, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 3.5299735782100026, + "language_loss": 0.87144494, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90299082, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 5.875, + "step": 51, + "time_per_iteration": 2.7481534481048584 + }, + { + "auxiliary_loss_clip": 0.01909154, + "auxiliary_loss_mlp": 0.01207037, + "balance_loss_clip": 1.12707186, + "balance_loss_mlp": 1.35597348, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0262044932375836, + "language_loss": 0.95253396, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98369586, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 5.53125, + "step": 52, + "time_per_iteration": 2.8958797454833984 + }, + { + "auxiliary_loss_clip": 0.01904814, + "auxiliary_loss_mlp": 0.01243661, + "balance_loss_clip": 1.16274214, + "balance_loss_mlp": 1.35173535, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.3193539013945546, + "language_loss": 0.92261833, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95410311, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 5.53125, + "step": 53, + "time_per_iteration": 2.7579286098480225 + }, + { + "auxiliary_loss_clip": 0.01893968, + "auxiliary_loss_mlp": 0.01196907, + "balance_loss_clip": 1.11489081, + "balance_loss_mlp": 1.35535884, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.2021865200163, + "language_loss": 0.82945669, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86036545, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 5.375, + "step": 54, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.01889572, + "auxiliary_loss_mlp": 0.01211293, + "balance_loss_clip": 1.13113666, + "balance_loss_mlp": 1.34359026, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4060188817442487, + "language_loss": 0.81305432, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84406298, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.4375, + "step": 55, + "time_per_iteration": 2.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01879346, + "auxiliary_loss_mlp": 0.01199903, + "balance_loss_clip": 1.11926973, + "balance_loss_mlp": 1.33773279, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.497299649397407, + "language_loss": 0.87261844, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90341091, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.40625, + "step": 56, + "time_per_iteration": 2.7031195163726807 + }, + { + "auxiliary_loss_clip": 0.01879922, + "auxiliary_loss_mlp": 0.01161266, + "balance_loss_clip": 1.0864507, + "balance_loss_mlp": 1.33024335, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.4089458733946882, + "language_loss": 0.92949611, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95990801, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 5.5, + "step": 57, + "time_per_iteration": 2.8580281734466553 + }, + { + "auxiliary_loss_clip": 0.01843074, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_clip": 1.14395308, + "balance_loss_mlp": 1.33453596, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.105168727735643, + "language_loss": 0.99725533, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02785611, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 5.09375, + "step": 58, + "time_per_iteration": 2.687504529953003 + }, + { + "auxiliary_loss_clip": 0.01822907, + "auxiliary_loss_mlp": 0.01195384, + "balance_loss_clip": 1.12319088, + "balance_loss_mlp": 1.32094967, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1328304194940855, + "language_loss": 0.8821373, + "learning_rate": 2.625331386578098e-06, + "loss": 0.9123202, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 5.03125, + "step": 59, + "time_per_iteration": 6.997380495071411 + }, + { + "auxiliary_loss_clip": 0.01844896, + "auxiliary_loss_mlp": 0.01162144, + "balance_loss_clip": 1.08885431, + "balance_loss_mlp": 1.32932925, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.097582115586327, + "language_loss": 0.93430054, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96437097, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 5.15625, + "step": 60, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0182307, + "auxiliary_loss_mlp": 0.01172385, + "balance_loss_clip": 1.10376787, + "balance_loss_mlp": 1.31307459, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 4.241258673484683, + "language_loss": 0.90090871, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93086326, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 5.09375, + "step": 61, + "time_per_iteration": 2.707247257232666 + }, + { + "auxiliary_loss_clip": 0.01806801, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_clip": 1.07475519, + "balance_loss_mlp": 1.31002319, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 3.0487456468745586, + "language_loss": 0.88434047, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9138341, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.96875, + "step": 62, + "time_per_iteration": 2.736107587814331 + }, + { + "auxiliary_loss_clip": 0.01787131, + "auxiliary_loss_mlp": 0.01161947, + "balance_loss_clip": 1.09132755, + "balance_loss_mlp": 1.30018497, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.6509198595432406, + "language_loss": 0.96265876, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99214947, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.875, + "step": 63, + "time_per_iteration": 2.6760194301605225 + }, + { + "auxiliary_loss_clip": 0.01795174, + "auxiliary_loss_mlp": 0.01169703, + "balance_loss_clip": 1.10284996, + "balance_loss_mlp": 1.30725491, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.677484479433752, + "language_loss": 0.99141657, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02106524, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.875, + "step": 64, + "time_per_iteration": 2.675295114517212 + }, + { + "auxiliary_loss_clip": 0.01802087, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.08134842, + "balance_loss_mlp": 1.30652797, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.45939593962701, + "language_loss": 0.85358196, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88309723, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.9375, + "step": 65, + "time_per_iteration": 2.647696018218994 + }, + { + "auxiliary_loss_clip": 0.01779034, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0886445, + "balance_loss_mlp": 1.29322505, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.8561979494145033, + "language_loss": 0.85224223, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88160038, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.875, + "step": 66, + "time_per_iteration": 2.617143392562866 + }, + { + "auxiliary_loss_clip": 0.01782156, + "auxiliary_loss_mlp": 0.01152634, + "balance_loss_clip": 1.07648349, + "balance_loss_mlp": 1.29168975, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.246759082278279, + "language_loss": 0.96454394, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99389184, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 4.90625, + "step": 67, + "time_per_iteration": 2.6343421936035156 + }, + { + "auxiliary_loss_clip": 0.01753238, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.08340704, + "balance_loss_mlp": 1.28524387, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.549207131743101, + "language_loss": 0.94534445, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97443378, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 4.6875, + "step": 68, + "time_per_iteration": 2.614696741104126 + }, + { + "auxiliary_loss_clip": 0.01748377, + "auxiliary_loss_mlp": 0.01156697, + "balance_loss_clip": 1.08717394, + "balance_loss_mlp": 1.28268003, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 1.9922029239060344, + "language_loss": 0.95657748, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98562825, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.65625, + "step": 69, + "time_per_iteration": 2.6637492179870605 + }, + { + "auxiliary_loss_clip": 0.01742428, + "auxiliary_loss_mlp": 0.01160645, + "balance_loss_clip": 1.09598637, + "balance_loss_mlp": 1.2855866, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4176731159017075, + "language_loss": 0.98073572, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00976658, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 4.5625, + "step": 70, + "time_per_iteration": 2.6395556926727295 + }, + { + "auxiliary_loss_clip": 0.01748999, + "auxiliary_loss_mlp": 0.01146397, + "balance_loss_clip": 1.07673144, + "balance_loss_mlp": 1.2760632, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.71386904393857, + "language_loss": 0.93927777, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96823174, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 4.75, + "step": 71, + "time_per_iteration": 2.628272294998169 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01327632, + "balance_loss_clip": 1.28967619, + "balance_loss_mlp": 1.43997037, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4194543250518663, + "language_loss": 0.65655279, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68821681, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 4.0, + "step": 72, + "time_per_iteration": 3.104635000228882 + }, + { + "auxiliary_loss_clip": 0.01820285, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.25824571, + "balance_loss_mlp": 1.43420911, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2482458517722455, + "language_loss": 0.63711512, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66827047, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 3.859375, + "step": 73, + "time_per_iteration": 3.208836793899536 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.07382631, + "balance_loss_mlp": 1.26790953, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.4515337577309424, + "language_loss": 0.85899854, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88765126, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.5625, + "step": 74, + "time_per_iteration": 2.6287550926208496 + }, + { + "auxiliary_loss_clip": 0.01725734, + "auxiliary_loss_mlp": 0.01165418, + "balance_loss_clip": 1.09584761, + "balance_loss_mlp": 1.26750898, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.110493434952054, + "language_loss": 0.9716984, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00060987, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.5625, + "step": 75, + "time_per_iteration": 2.635618209838867 + }, + { + "auxiliary_loss_clip": 0.01704277, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.07875705, + "balance_loss_mlp": 1.26302838, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.60583579179481, + "language_loss": 0.87675405, + "learning_rate": 2.788352117317012e-06, + "loss": 0.9052462, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.4375, + "step": 76, + "time_per_iteration": 2.6379826068878174 + }, + { + "auxiliary_loss_clip": 0.01705571, + "auxiliary_loss_mlp": 0.0114831, + "balance_loss_clip": 1.07845366, + "balance_loss_mlp": 1.26138341, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.9080158042054207, + "language_loss": 0.91751724, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94605613, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.4375, + "step": 77, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01694222, + "auxiliary_loss_mlp": 0.01165235, + "balance_loss_clip": 1.09494948, + "balance_loss_mlp": 1.26167083, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1229280552318803, + "language_loss": 0.92189825, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95049286, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.3125, + "step": 78, + "time_per_iteration": 2.598590850830078 + }, + { + "auxiliary_loss_clip": 0.01690635, + "auxiliary_loss_mlp": 0.01155594, + "balance_loss_clip": 1.08735824, + "balance_loss_mlp": 1.25696921, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.280813483182965, + "language_loss": 0.82480371, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85326606, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 4.34375, + "step": 79, + "time_per_iteration": 2.6215708255767822 + }, + { + "auxiliary_loss_clip": 0.01705122, + "auxiliary_loss_mlp": 0.01133248, + "balance_loss_clip": 1.06315339, + "balance_loss_mlp": 1.26029253, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.4809717100134616, + "language_loss": 0.91311121, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94149494, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.4375, + "step": 80, + "time_per_iteration": 2.639841079711914 + }, + { + "auxiliary_loss_clip": 0.01674552, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.07254159, + "balance_loss_mlp": 1.25350285, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.165091554789383, + "language_loss": 0.94981706, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97799134, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.21875, + "step": 81, + "time_per_iteration": 2.6689717769622803 + }, + { + "auxiliary_loss_clip": 0.01688803, + "auxiliary_loss_mlp": 0.01148831, + "balance_loss_clip": 1.08269382, + "balance_loss_mlp": 1.25745821, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9914678747629226, + "language_loss": 0.96341741, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99179375, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 4.3125, + "step": 82, + "time_per_iteration": 2.629596471786499 + }, + { + "auxiliary_loss_clip": 0.01671229, + "auxiliary_loss_mlp": 0.01159801, + "balance_loss_clip": 1.09013557, + "balance_loss_mlp": 1.24528587, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.533591741594043, + "language_loss": 0.8664127, + "learning_rate": 2.84508017388607e-06, + "loss": 0.894723, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.25, + "step": 83, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01664198, + "auxiliary_loss_mlp": 0.01156919, + "balance_loss_clip": 1.08663368, + "balance_loss_mlp": 1.24647975, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.373799694341511, + "language_loss": 0.91779828, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94600952, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.1875, + "step": 84, + "time_per_iteration": 2.62187123298645 + }, + { + "auxiliary_loss_clip": 0.01645783, + "auxiliary_loss_mlp": 0.01205663, + "balance_loss_clip": 1.17075825, + "balance_loss_mlp": 1.34984684, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4266053341540552, + "language_loss": 0.62504542, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65355992, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.96875, + "step": 85, + "time_per_iteration": 3.190223217010498 + }, + { + "auxiliary_loss_clip": 0.0165122, + "auxiliary_loss_mlp": 0.01127154, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.23674285, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7428139018461835, + "language_loss": 0.90836501, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93614876, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.15625, + "step": 86, + "time_per_iteration": 2.66162109375 + }, + { + "auxiliary_loss_clip": 0.01655877, + "auxiliary_loss_mlp": 0.01161945, + "balance_loss_clip": 1.09065783, + "balance_loss_mlp": 1.24282312, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.38275425723773, + "language_loss": 0.8209877, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.125, + "step": 87, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.01644726, + "auxiliary_loss_mlp": 0.01154792, + "balance_loss_clip": 1.08617568, + "balance_loss_mlp": 1.24127626, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8918921085406437, + "language_loss": 0.95630223, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98429739, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 4.03125, + "step": 88, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01661198, + "auxiliary_loss_mlp": 0.0114963, + "balance_loss_clip": 1.08230066, + "balance_loss_mlp": 1.24101663, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1.9438908009999392, + "language_loss": 0.85920149, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88730979, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.1875, + "step": 89, + "time_per_iteration": 2.6486849784851074 + }, + { + "auxiliary_loss_clip": 0.01648909, + "auxiliary_loss_mlp": 0.01132231, + "balance_loss_clip": 1.06547391, + "balance_loss_mlp": 1.23491406, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 4.519706664825811, + "language_loss": 0.91517568, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94298708, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 4.125, + "step": 90, + "time_per_iteration": 2.658997058868408 + }, + { + "auxiliary_loss_clip": 0.01630542, + "auxiliary_loss_mlp": 0.0113282, + "balance_loss_clip": 1.06496572, + "balance_loss_mlp": 1.23102689, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.2090932400382486, + "language_loss": 0.8587057, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88633931, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 3.984375, + "step": 91, + "time_per_iteration": 2.619231939315796 + }, + { + "auxiliary_loss_clip": 0.01629785, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_clip": 1.07458866, + "balance_loss_mlp": 1.22673059, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.769705373909222, + "language_loss": 0.86930025, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89700729, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.03125, + "step": 92, + "time_per_iteration": 2.646968126296997 + }, + { + "auxiliary_loss_clip": 0.01621216, + "auxiliary_loss_mlp": 0.01179948, + "balance_loss_clip": 1.1122849, + "balance_loss_mlp": 1.21872091, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.5030178409929, + "language_loss": 0.92042911, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94844079, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 4.03125, + "step": 93, + "time_per_iteration": 2.59853196144104 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01152159, + "balance_loss_clip": 1.08120561, + "balance_loss_mlp": 1.22512126, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.2071592078672198, + "language_loss": 0.87372428, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90158784, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.09375, + "step": 94, + "time_per_iteration": 2.587707281112671 + }, + { + "auxiliary_loss_clip": 0.01562532, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.03243279, + "balance_loss_mlp": 1.30452466, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3851210442303683, + "language_loss": 0.6813519, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70765626, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.578125, + "step": 95, + "time_per_iteration": 3.067047595977783 + }, + { + "auxiliary_loss_clip": 0.01611383, + "auxiliary_loss_mlp": 0.01154317, + "balance_loss_clip": 1.08693981, + "balance_loss_mlp": 1.21303511, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.5109536438971976, + "language_loss": 0.89978027, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92743719, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 3.984375, + "step": 96, + "time_per_iteration": 2.590522289276123 + }, + { + "auxiliary_loss_clip": 0.01603776, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.08108413, + "balance_loss_mlp": 1.21597803, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.825781473558237, + "language_loss": 0.89798892, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92545933, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.875, + "step": 97, + "time_per_iteration": 2.630364179611206 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_clip": 1.07103181, + "balance_loss_mlp": 1.20754981, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.1954130163748573, + "language_loss": 0.76553786, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79283404, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.8125, + "step": 98, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01531856, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.01250362, + "balance_loss_mlp": 1.28449416, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0389188302362988, + "language_loss": 0.65464473, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68043554, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.46875, + "step": 99, + "time_per_iteration": 3.196779251098633 + }, + { + "auxiliary_loss_clip": 0.0159215, + "auxiliary_loss_mlp": 0.01143603, + "balance_loss_clip": 1.07312632, + "balance_loss_mlp": 1.20754516, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02393591458392, + "language_loss": 0.90861535, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93597281, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 3.84375, + "step": 100, + "time_per_iteration": 2.659716844558716 + }, + { + "auxiliary_loss_clip": 0.01602583, + "auxiliary_loss_mlp": 0.01150362, + "balance_loss_clip": 1.08360529, + "balance_loss_mlp": 1.21008992, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 9.149928686451464, + "language_loss": 0.91165614, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93918556, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 3.921875, + "step": 101, + "time_per_iteration": 5.522722959518433 + }, + { + "auxiliary_loss_clip": 0.01592164, + "auxiliary_loss_mlp": 0.01153598, + "balance_loss_clip": 1.08273995, + "balance_loss_mlp": 1.21078956, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.149611483260168, + "language_loss": 0.90634245, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.9338001, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.8125, + "step": 102, + "time_per_iteration": 2.7264201641082764 + }, + { + "auxiliary_loss_clip": 0.01586171, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.06986046, + "balance_loss_mlp": 1.20794034, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.4455555336324135, + "language_loss": 0.87990314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.9071129, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 3.78125, + "step": 103, + "time_per_iteration": 2.6332345008850098 + }, + { + "auxiliary_loss_clip": 0.01586169, + "auxiliary_loss_mlp": 0.01136721, + "balance_loss_clip": 1.07015502, + "balance_loss_mlp": 1.2100153, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9653003456434248, + "language_loss": 0.93796182, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96519077, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.765625, + "step": 104, + "time_per_iteration": 2.5763180255889893 + }, + { + "auxiliary_loss_clip": 0.01576682, + "auxiliary_loss_mlp": 0.01148107, + "balance_loss_clip": 1.08382916, + "balance_loss_mlp": 1.20004964, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.978383813748495, + "language_loss": 0.96302718, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99027503, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.765625, + "step": 105, + "time_per_iteration": 2.598074197769165 + }, + { + "auxiliary_loss_clip": 0.01576054, + "auxiliary_loss_mlp": 0.01157995, + "balance_loss_clip": 1.08618331, + "balance_loss_mlp": 1.20040035, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.254409296180574, + "language_loss": 0.86981636, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89715683, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 3.75, + "step": 106, + "time_per_iteration": 2.620400905609131 + }, + { + "auxiliary_loss_clip": 0.01558878, + "auxiliary_loss_mlp": 0.01142953, + "balance_loss_clip": 1.07462192, + "balance_loss_mlp": 1.18650925, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.299900982703377, + "language_loss": 0.8342824, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86130083, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 3.71875, + "step": 107, + "time_per_iteration": 2.6031439304351807 + }, + { + "auxiliary_loss_clip": 0.01473949, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01294351, + "balance_loss_mlp": 1.24969411, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9921074222226888, + "language_loss": 0.64829654, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67348593, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.25, + "step": 108, + "time_per_iteration": 3.1797876358032227 + }, + { + "auxiliary_loss_clip": 0.01549803, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_clip": 1.0634706, + "balance_loss_mlp": 1.18794155, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 3.0292528917398895, + "language_loss": 0.97705221, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00387263, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.625, + "step": 109, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01548304, + "auxiliary_loss_mlp": 0.01143686, + "balance_loss_clip": 1.07759643, + "balance_loss_mlp": 1.18955791, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7037490209774204, + "language_loss": 0.84119976, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86811972, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 110, + "time_per_iteration": 2.612900495529175 + }, + { + "auxiliary_loss_clip": 0.01543027, + "auxiliary_loss_mlp": 0.01148771, + "balance_loss_clip": 1.08287191, + "balance_loss_mlp": 1.18348098, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.0686651571732186, + "language_loss": 0.83053756, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85745549, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 111, + "time_per_iteration": 2.648775815963745 + }, + { + "auxiliary_loss_clip": 0.01543945, + "auxiliary_loss_mlp": 0.01132291, + "balance_loss_clip": 1.06906247, + "balance_loss_mlp": 1.18600404, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9360906695559799, + "language_loss": 0.94064176, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96740413, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.59375, + "step": 112, + "time_per_iteration": 2.5952305793762207 + }, + { + "auxiliary_loss_clip": 0.01547241, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.07342076, + "balance_loss_mlp": 1.18214464, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4119047199233594, + "language_loss": 0.79298341, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81983036, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.65625, + "step": 113, + "time_per_iteration": 2.524744987487793 + }, + { + "auxiliary_loss_clip": 0.01535171, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.06460583, + "balance_loss_mlp": 1.1784718, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.1108584765070924, + "language_loss": 0.93168736, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95834035, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 3.5625, + "step": 114, + "time_per_iteration": 2.6716785430908203 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.01138267, + "balance_loss_clip": 1.07828045, + "balance_loss_mlp": 1.17785645, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.266348661789013, + "language_loss": 0.94440514, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97120523, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.640625, + "step": 115, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01536673, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.052122, + "balance_loss_mlp": 1.1758287, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 12.665326776351556, + "language_loss": 0.81903678, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84558797, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.609375, + "step": 116, + "time_per_iteration": 2.577003240585327 + }, + { + "auxiliary_loss_clip": 0.01526673, + "auxiliary_loss_mlp": 0.01127935, + "balance_loss_clip": 1.06375241, + "balance_loss_mlp": 1.17504787, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.0071741256932794, + "language_loss": 0.88063896, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90718508, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.515625, + "step": 117, + "time_per_iteration": 2.611503839492798 + }, + { + "auxiliary_loss_clip": 0.01525448, + "auxiliary_loss_mlp": 0.01143736, + "balance_loss_clip": 1.07840896, + "balance_loss_mlp": 1.17308259, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5473368597875594, + "language_loss": 0.84470415, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87139601, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 3.53125, + "step": 118, + "time_per_iteration": 2.577461004257202 + }, + { + "auxiliary_loss_clip": 0.01536798, + "auxiliary_loss_mlp": 0.01163532, + "balance_loss_clip": 1.09930205, + "balance_loss_mlp": 1.1748507, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.057592918726277, + "language_loss": 0.99470234, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02170563, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.625, + "step": 119, + "time_per_iteration": 2.549661636352539 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.0111939, + "balance_loss_clip": 1.05701971, + "balance_loss_mlp": 1.16968298, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.410205702357196, + "language_loss": 0.89085704, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91742492, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.6875, + "step": 120, + "time_per_iteration": 2.583630084991455 + }, + { + "auxiliary_loss_clip": 0.01524337, + "auxiliary_loss_mlp": 0.01130091, + "balance_loss_clip": 1.06667209, + "balance_loss_mlp": 1.17169607, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.904240324338801, + "language_loss": 0.93491054, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145487, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.53125, + "step": 121, + "time_per_iteration": 2.6146788597106934 + }, + { + "auxiliary_loss_clip": 0.01523412, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_clip": 1.08382273, + "balance_loss_mlp": 1.17073464, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.352658173167552, + "language_loss": 0.90176952, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92846411, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.53125, + "step": 122, + "time_per_iteration": 2.566470146179199 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.07634664, + "balance_loss_mlp": 1.16606736, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.7249964127160764, + "language_loss": 0.92516506, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95179617, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.546875, + "step": 123, + "time_per_iteration": 2.6002941131591797 + }, + { + "auxiliary_loss_clip": 0.01517776, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.06433022, + "balance_loss_mlp": 1.1609534, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 7.583203404073904, + "language_loss": 0.71128142, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73771715, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.5625, + "step": 124, + "time_per_iteration": 2.79618763923645 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01124615, + "balance_loss_clip": 1.06157708, + "balance_loss_mlp": 1.16223335, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.4227692366027855, + "language_loss": 0.88482195, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9111228, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.4375, + "step": 125, + "time_per_iteration": 2.6131536960601807 + }, + { + "auxiliary_loss_clip": 0.0152071, + "auxiliary_loss_mlp": 0.01140137, + "balance_loss_clip": 1.07762396, + "balance_loss_mlp": 1.16211164, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.993097477973623, + "language_loss": 0.82384819, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.8504566, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.59375, + "step": 126, + "time_per_iteration": 2.595423936843872 + }, + { + "auxiliary_loss_clip": 0.01514354, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_clip": 1.077981, + "balance_loss_mlp": 1.16128385, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.7264016399601534, + "language_loss": 0.67276633, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69930243, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 3.53125, + "step": 127, + "time_per_iteration": 2.620950937271118 + }, + { + "auxiliary_loss_clip": 0.01504536, + "auxiliary_loss_mlp": 0.01128822, + "balance_loss_clip": 1.06640375, + "balance_loss_mlp": 1.16422939, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.6226937306152496, + "language_loss": 0.8815757, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90790927, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 128, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.01505804, + "auxiliary_loss_mlp": 0.01141266, + "balance_loss_clip": 1.07870471, + "balance_loss_mlp": 1.15920687, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.875185485357673, + "language_loss": 0.84581351, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87228423, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.46875, + "step": 129, + "time_per_iteration": 2.611762285232544 + }, + { + "auxiliary_loss_clip": 0.01499869, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.07122934, + "balance_loss_mlp": 1.1588279, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023668494136832, + "language_loss": 0.9742806, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061572, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 130, + "time_per_iteration": 2.599639415740967 + }, + { + "auxiliary_loss_clip": 0.01493155, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_clip": 1.07109392, + "balance_loss_mlp": 1.15518749, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1876581172480285, + "language_loss": 0.82624269, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 131, + "time_per_iteration": 2.6086065769195557 + }, + { + "auxiliary_loss_clip": 0.01502593, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.15800536, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.4868851395581677, + "language_loss": 0.82762384, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85392648, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 3.4375, + "step": 132, + "time_per_iteration": 2.673790454864502 + }, + { + "auxiliary_loss_clip": 0.01493849, + "auxiliary_loss_mlp": 0.01128197, + "balance_loss_clip": 1.06716144, + "balance_loss_mlp": 1.15264463, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.7432419346617443, + "language_loss": 0.95486552, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98108596, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.40625, + "step": 133, + "time_per_iteration": 2.6287872791290283 + }, + { + "auxiliary_loss_clip": 0.01490198, + "auxiliary_loss_mlp": 0.01125526, + "balance_loss_clip": 1.06725681, + "balance_loss_mlp": 1.16143155, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7764051426707919, + "language_loss": 0.73316634, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7593236, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.296875, + "step": 134, + "time_per_iteration": 2.6728081703186035 + }, + { + "auxiliary_loss_clip": 0.01486213, + "auxiliary_loss_mlp": 0.01130543, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.14955854, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.090234736760587, + "language_loss": 0.88808328, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91425079, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 135, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.06789732, + "balance_loss_mlp": 1.15456343, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.008171494368998, + "language_loss": 0.89123899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9174456, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.34375, + "step": 136, + "time_per_iteration": 2.555936813354492 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.01108223, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.14870429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 5.8712537379963345, + "language_loss": 0.8400104, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86595905, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.375, + "step": 137, + "time_per_iteration": 2.6225337982177734 + }, + { + "auxiliary_loss_clip": 0.01482624, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.05796409, + "balance_loss_mlp": 1.14842129, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.6861384534946333, + "language_loss": 0.90170664, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9276967, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.34375, + "step": 138, + "time_per_iteration": 2.653205156326294 + }, + { + "auxiliary_loss_clip": 0.01472312, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.0568912, + "balance_loss_mlp": 1.1478796, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.679342832062188, + "language_loss": 0.91253459, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93845713, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.234375, + "step": 139, + "time_per_iteration": 2.6182503700256348 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01123997, + "balance_loss_clip": 1.06229401, + "balance_loss_mlp": 1.154405, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.5553770836970675, + "language_loss": 0.85446793, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.34375, + "step": 140, + "time_per_iteration": 2.649454116821289 + }, + { + "auxiliary_loss_clip": 0.01476267, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.0594281, + "balance_loss_mlp": 1.14865911, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.379593217845822, + "language_loss": 0.84156519, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86751676, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.28125, + "step": 141, + "time_per_iteration": 2.608603000640869 + }, + { + "auxiliary_loss_clip": 0.01480312, + "auxiliary_loss_mlp": 0.01134333, + "balance_loss_clip": 1.07320273, + "balance_loss_mlp": 1.14624739, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3283494467369965, + "language_loss": 0.81387591, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84002233, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.34375, + "step": 142, + "time_per_iteration": 4.023308753967285 + }, + { + "auxiliary_loss_clip": 0.01378722, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00621629, + "balance_loss_mlp": 1.1918689, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0451783350372967, + "language_loss": 0.66831523, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69242978, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.8671875, + "step": 143, + "time_per_iteration": 4.718023777008057 + }, + { + "auxiliary_loss_clip": 0.01472184, + "auxiliary_loss_mlp": 0.0112263, + "balance_loss_clip": 1.06283474, + "balance_loss_mlp": 1.14625573, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2608538764922295, + "language_loss": 0.83954072, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86548889, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.25, + "step": 144, + "time_per_iteration": 2.5878453254699707 + }, + { + "auxiliary_loss_clip": 0.01457808, + "auxiliary_loss_mlp": 0.01111605, + "balance_loss_clip": 1.04890084, + "balance_loss_mlp": 1.13930941, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.241812154138119, + "language_loss": 0.88511693, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91081107, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.1875, + "step": 145, + "time_per_iteration": 2.586512565612793 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01124002, + "balance_loss_clip": 1.06153631, + "balance_loss_mlp": 1.14211285, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.792984011276012, + "language_loss": 0.85949898, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88549542, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.34375, + "step": 146, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01359324, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.00082254, + "balance_loss_mlp": 1.17825258, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8557738136673508, + "language_loss": 0.60047674, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62433958, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.8125, + "step": 147, + "time_per_iteration": 3.2522764205932617 + }, + { + "auxiliary_loss_clip": 0.01465546, + "auxiliary_loss_mlp": 0.01124118, + "balance_loss_clip": 1.06670642, + "balance_loss_mlp": 1.14550173, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8343461268862185, + "language_loss": 0.8454501, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87134671, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 3.203125, + "step": 148, + "time_per_iteration": 2.635499954223633 + }, + { + "auxiliary_loss_clip": 0.0147086, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_clip": 1.07914925, + "balance_loss_mlp": 1.14693797, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2581185064103404, + "language_loss": 0.88802874, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91416872, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.234375, + "step": 149, + "time_per_iteration": 2.5458836555480957 + }, + { + "auxiliary_loss_clip": 0.01466862, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.14131117, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.7760320197047097, + "language_loss": 0.93054724, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95633656, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 150, + "time_per_iteration": 2.648111343383789 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01109463, + "balance_loss_clip": 1.05391192, + "balance_loss_mlp": 1.13663483, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.9005080345968057, + "language_loss": 0.74303263, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76867104, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.171875, + "step": 151, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.014664, + "auxiliary_loss_mlp": 0.01125146, + "balance_loss_clip": 1.06735289, + "balance_loss_mlp": 1.14143276, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.6241423805649298, + "language_loss": 0.88251799, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90843344, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 152, + "time_per_iteration": 2.6034231185913086 + }, + { + "auxiliary_loss_clip": 0.01466383, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_clip": 1.0628314, + "balance_loss_mlp": 1.14757276, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.4062301864690196, + "language_loss": 0.83957756, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86545384, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 153, + "time_per_iteration": 2.6023271083831787 + }, + { + "auxiliary_loss_clip": 0.01456394, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.0765202, + "balance_loss_mlp": 1.13805962, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9441527650945287, + "language_loss": 0.89881843, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92474556, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.1875, + "step": 154, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01460439, + "auxiliary_loss_mlp": 0.01154617, + "balance_loss_clip": 1.09577537, + "balance_loss_mlp": 1.14094579, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0692323216259187, + "language_loss": 0.89471745, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92086804, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 155, + "time_per_iteration": 2.6336286067962646 + }, + { + "auxiliary_loss_clip": 0.01463585, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.05894589, + "balance_loss_mlp": 1.13895822, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 3.3077298720636255, + "language_loss": 0.86882627, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89462447, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.25, + "step": 156, + "time_per_iteration": 2.5539867877960205 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.01121969, + "balance_loss_clip": 1.06408143, + "balance_loss_mlp": 1.14298415, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4916444524903527, + "language_loss": 0.99553013, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02137065, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.1875, + "step": 157, + "time_per_iteration": 2.5249693393707275 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01139016, + "balance_loss_clip": 1.08146214, + "balance_loss_mlp": 1.1366899, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0302475566757225, + "language_loss": 0.8847568, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91060334, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.09375, + "step": 158, + "time_per_iteration": 2.6009252071380615 + }, + { + "auxiliary_loss_clip": 0.01452439, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_clip": 1.06555486, + "balance_loss_mlp": 1.13677907, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 4.310723443959545, + "language_loss": 0.86534697, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89111388, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.15625, + "step": 159, + "time_per_iteration": 2.6107394695281982 + }, + { + "auxiliary_loss_clip": 0.01442093, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.07340288, + "balance_loss_mlp": 1.13145089, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.790220267572532, + "language_loss": 0.86825597, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89400506, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.109375, + "step": 160, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01449537, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.06318271, + "balance_loss_mlp": 1.13704872, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.6107931748588893, + "language_loss": 0.91542315, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94109678, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.125, + "step": 161, + "time_per_iteration": 2.550865650177002 + }, + { + "auxiliary_loss_clip": 0.01454094, + "auxiliary_loss_mlp": 0.01109765, + "balance_loss_clip": 1.05488133, + "balance_loss_mlp": 1.13759339, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.2107920101940994, + "language_loss": 0.91690832, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94254684, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.15625, + "step": 162, + "time_per_iteration": 2.5527970790863037 + }, + { + "auxiliary_loss_clip": 0.01312712, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.00331306, + "balance_loss_mlp": 1.14560354, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2615279464106541, + "language_loss": 0.72354776, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74694741, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.671875, + "step": 163, + "time_per_iteration": 3.143763542175293 + }, + { + "auxiliary_loss_clip": 0.01440764, + "auxiliary_loss_mlp": 0.01113881, + "balance_loss_clip": 1.05804312, + "balance_loss_mlp": 1.13505006, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1923315312730374, + "language_loss": 0.8427155, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86826193, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0625, + "step": 164, + "time_per_iteration": 2.5536584854125977 + }, + { + "auxiliary_loss_clip": 0.01429878, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.04585135, + "balance_loss_mlp": 1.12637794, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.006756380443377, + "language_loss": 0.89215541, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91745919, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.03125, + "step": 165, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01434156, + "auxiliary_loss_mlp": 0.01127756, + "balance_loss_clip": 1.0692482, + "balance_loss_mlp": 1.12764359, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 6.432940691763592, + "language_loss": 0.80138129, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82700044, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.0625, + "step": 166, + "time_per_iteration": 2.6461095809936523 + }, + { + "auxiliary_loss_clip": 0.01438531, + "auxiliary_loss_mlp": 0.01125189, + "balance_loss_clip": 1.06749213, + "balance_loss_mlp": 1.13121533, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.334124726802297, + "language_loss": 0.9190954, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94473255, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.078125, + "step": 167, + "time_per_iteration": 2.655597448348999 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.07997894, + "balance_loss_mlp": 1.12960708, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.1870046541457873, + "language_loss": 0.90852308, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93417776, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 3.0, + "step": 168, + "time_per_iteration": 2.5387983322143555 + }, + { + "auxiliary_loss_clip": 0.01424973, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.05072391, + "balance_loss_mlp": 1.12456727, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 2.0495813916191077, + "language_loss": 0.87094414, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89626241, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 3.0, + "step": 169, + "time_per_iteration": 2.6448419094085693 + }, + { + "auxiliary_loss_clip": 0.01426284, + "auxiliary_loss_mlp": 0.01111393, + "balance_loss_clip": 1.05548358, + "balance_loss_mlp": 1.12704372, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 3.0203817486241973, + "language_loss": 0.84758192, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87295866, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 170, + "time_per_iteration": 2.5596489906311035 + }, + { + "auxiliary_loss_clip": 0.01435879, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.07963061, + "balance_loss_mlp": 1.12765205, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.124400250788896, + "language_loss": 0.89896494, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92468935, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.078125, + "step": 171, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01108406, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.1300813, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.015577645060998, + "language_loss": 0.88978243, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91516334, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.0, + "step": 172, + "time_per_iteration": 2.6193771362304688 + }, + { + "auxiliary_loss_clip": 0.01419105, + "auxiliary_loss_mlp": 0.01124801, + "balance_loss_clip": 1.06986928, + "balance_loss_mlp": 1.12354624, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6868779107262128, + "language_loss": 0.81148165, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83692074, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.953125, + "step": 173, + "time_per_iteration": 2.656935691833496 + }, + { + "auxiliary_loss_clip": 0.01430653, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.05496693, + "balance_loss_mlp": 1.12733519, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1134597687554244, + "language_loss": 0.82498932, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85036767, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 3.03125, + "step": 174, + "time_per_iteration": 2.6050753593444824 + }, + { + "auxiliary_loss_clip": 0.01425822, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_clip": 1.06984437, + "balance_loss_mlp": 1.12589645, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.6035215697191965, + "language_loss": 0.72699076, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75249052, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 3.0, + "step": 175, + "time_per_iteration": 2.6859946250915527 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.07571054, + "balance_loss_mlp": 1.12603855, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.402827576481816, + "language_loss": 0.98082507, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00642931, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 176, + "time_per_iteration": 2.5405664443969727 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01132631, + "balance_loss_clip": 1.08005941, + "balance_loss_mlp": 1.12270594, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3427037211777115, + "language_loss": 0.76749414, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79294884, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 177, + "time_per_iteration": 2.555553674697876 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.0507797, + "balance_loss_mlp": 1.12089574, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.4108248963401464, + "language_loss": 0.76824659, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79352522, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.015625, + "step": 178, + "time_per_iteration": 2.5799388885498047 + }, + { + "auxiliary_loss_clip": 0.01429506, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.05224717, + "balance_loss_mlp": 1.12586653, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1918052506036174, + "language_loss": 0.84004253, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86541891, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.03125, + "step": 179, + "time_per_iteration": 2.5387184619903564 + }, + { + "auxiliary_loss_clip": 0.01420983, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.05677247, + "balance_loss_mlp": 1.12062979, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.90488055395076, + "language_loss": 0.83719397, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86252916, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 180, + "time_per_iteration": 2.6149253845214844 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.06503046, + "balance_loss_mlp": 1.1226536, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.784573507260413, + "language_loss": 0.7774682, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80288756, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.0, + "step": 181, + "time_per_iteration": 2.5769712924957275 + }, + { + "auxiliary_loss_clip": 0.01417045, + "auxiliary_loss_mlp": 0.01131731, + "balance_loss_clip": 1.07732356, + "balance_loss_mlp": 1.11938787, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.1835165271024377, + "language_loss": 0.76440376, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78989148, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.96875, + "step": 182, + "time_per_iteration": 2.5641353130340576 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.01127012, + "balance_loss_clip": 1.07122183, + "balance_loss_mlp": 1.11758399, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.172025067133121, + "language_loss": 0.87377435, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89917147, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.953125, + "step": 183, + "time_per_iteration": 2.567457914352417 + }, + { + "auxiliary_loss_clip": 0.01415124, + "auxiliary_loss_mlp": 0.01114516, + "balance_loss_clip": 1.06397092, + "balance_loss_mlp": 1.1209594, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2669267607504255, + "language_loss": 0.86875558, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.953125, + "step": 184, + "time_per_iteration": 5.380701780319214 + }, + { + "auxiliary_loss_clip": 0.01411555, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.05308247, + "balance_loss_mlp": 1.12176847, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8448371257401488, + "language_loss": 0.83683228, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86202729, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.90625, + "step": 185, + "time_per_iteration": 2.5522208213806152 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.01109712, + "balance_loss_clip": 1.05253971, + "balance_loss_mlp": 1.11964798, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.4162416092451475, + "language_loss": 0.71111757, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73642373, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 186, + "time_per_iteration": 2.536498546600342 + }, + { + "auxiliary_loss_clip": 0.01416319, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.11923158, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 3.342492581434835, + "language_loss": 1.02028871, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04552388, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.96875, + "step": 187, + "time_per_iteration": 2.5189080238342285 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.05597997, + "balance_loss_mlp": 1.11834478, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.6787333311747052, + "language_loss": 0.75107503, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7762351, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.875, + "step": 188, + "time_per_iteration": 2.73420786857605 + }, + { + "auxiliary_loss_clip": 0.01292523, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01273942, + "balance_loss_mlp": 1.13387585, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7700467396195164, + "language_loss": 0.56216431, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5854305, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.5859375, + "step": 189, + "time_per_iteration": 3.176280975341797 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01121834, + "balance_loss_clip": 1.06742704, + "balance_loss_mlp": 1.1134795, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.292403028528975, + "language_loss": 0.94771594, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97296059, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.90625, + "step": 190, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01403317, + "auxiliary_loss_mlp": 0.01101291, + "balance_loss_clip": 1.04964972, + "balance_loss_mlp": 1.11493886, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.993049163405909, + "language_loss": 0.84462845, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8696745, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.875, + "step": 191, + "time_per_iteration": 2.569664716720581 + }, + { + "auxiliary_loss_clip": 0.01402316, + "auxiliary_loss_mlp": 0.01121031, + "balance_loss_clip": 1.0698905, + "balance_loss_mlp": 1.11087692, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0097697123850593, + "language_loss": 0.91439575, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93962914, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 192, + "time_per_iteration": 2.6416900157928467 + }, + { + "auxiliary_loss_clip": 0.0139743, + "auxiliary_loss_mlp": 0.01113461, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.11231375, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.095754720056515, + "language_loss": 0.86849445, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89360332, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.84375, + "step": 193, + "time_per_iteration": 2.569899797439575 + }, + { + "auxiliary_loss_clip": 0.01399232, + "auxiliary_loss_mlp": 0.01095137, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.10937476, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.446553756436178, + "language_loss": 0.92399615, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9489398, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 194, + "time_per_iteration": 2.6078743934631348 + }, + { + "auxiliary_loss_clip": 0.01405837, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.11522019, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.1413620570060052, + "language_loss": 0.89698559, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92208374, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 195, + "time_per_iteration": 2.5785820484161377 + }, + { + "auxiliary_loss_clip": 0.01400897, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.06345916, + "balance_loss_mlp": 1.11416054, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.0173579296668813, + "language_loss": 0.8577168, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88290232, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.875, + "step": 196, + "time_per_iteration": 2.5492773056030273 + }, + { + "auxiliary_loss_clip": 0.01397107, + "auxiliary_loss_mlp": 0.01106206, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.10991478, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.86264810097015, + "language_loss": 0.93367243, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95870566, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.875, + "step": 197, + "time_per_iteration": 2.5488431453704834 + }, + { + "auxiliary_loss_clip": 0.01394686, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.05781317, + "balance_loss_mlp": 1.1120131, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1872318454948045, + "language_loss": 0.79184073, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81688625, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.828125, + "step": 198, + "time_per_iteration": 2.6208834648132324 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06545901, + "balance_loss_mlp": 1.11265802, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 3.3720724842630663, + "language_loss": 0.88065112, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90571868, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.765625, + "step": 199, + "time_per_iteration": 2.5257043838500977 + }, + { + "auxiliary_loss_clip": 0.01403414, + "auxiliary_loss_mlp": 0.01121968, + "balance_loss_clip": 1.0658679, + "balance_loss_mlp": 1.11557496, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8432610551497841, + "language_loss": 0.81327617, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83853, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.875, + "step": 200, + "time_per_iteration": 2.593231201171875 + }, + { + "auxiliary_loss_clip": 0.01400536, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.11138511, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.758923223370522, + "language_loss": 0.87688923, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90190548, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.90625, + "step": 201, + "time_per_iteration": 2.5057122707366943 + }, + { + "auxiliary_loss_clip": 0.01401128, + "auxiliary_loss_mlp": 0.01110995, + "balance_loss_clip": 1.05751753, + "balance_loss_mlp": 1.1152513, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 3.7927516715708736, + "language_loss": 0.84123611, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86635733, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.859375, + "step": 202, + "time_per_iteration": 2.555680751800537 + }, + { + "auxiliary_loss_clip": 0.01388205, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.05639839, + "balance_loss_mlp": 1.10674798, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9040504717952067, + "language_loss": 0.90116632, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.926139, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.8125, + "step": 203, + "time_per_iteration": 2.526937484741211 + }, + { + "auxiliary_loss_clip": 0.01281494, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.03138971, + "balance_loss_mlp": 1.12054539, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0150955472927095, + "language_loss": 0.61259121, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63593745, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.609375, + "step": 204, + "time_per_iteration": 3.051469326019287 + }, + { + "auxiliary_loss_clip": 0.01398264, + "auxiliary_loss_mlp": 0.01111819, + "balance_loss_clip": 1.0593431, + "balance_loss_mlp": 1.11035323, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.269022633654934, + "language_loss": 0.91206741, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93716824, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.875, + "step": 205, + "time_per_iteration": 2.5105605125427246 + }, + { + "auxiliary_loss_clip": 0.01406073, + "auxiliary_loss_mlp": 0.01120568, + "balance_loss_clip": 1.06675649, + "balance_loss_mlp": 1.11524296, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.2813283317886497, + "language_loss": 0.89215505, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91742146, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.90625, + "step": 206, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05039215, + "balance_loss_mlp": 1.10848641, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.502758142715096, + "language_loss": 0.95368809, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97865611, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.859375, + "step": 207, + "time_per_iteration": 2.5147407054901123 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.06416512, + "balance_loss_mlp": 1.11335945, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.4565104125033232, + "language_loss": 0.75770479, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78280723, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.8125, + "step": 208, + "time_per_iteration": 2.5426721572875977 + }, + { + "auxiliary_loss_clip": 0.01382601, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.05497861, + "balance_loss_mlp": 1.10796773, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.79364384939249, + "language_loss": 0.98718858, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01208818, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 209, + "time_per_iteration": 2.607238292694092 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.05971253, + "balance_loss_mlp": 1.11020541, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 7.039976369418198, + "language_loss": 0.85444254, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87945753, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.78125, + "step": 210, + "time_per_iteration": 2.67632794380188 + }, + { + "auxiliary_loss_clip": 0.01385349, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.07042408, + "balance_loss_mlp": 1.1073029, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2334441604414783, + "language_loss": 0.97016168, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99521822, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.78125, + "step": 211, + "time_per_iteration": 2.5733633041381836 + }, + { + "auxiliary_loss_clip": 0.01394963, + "auxiliary_loss_mlp": 0.01114691, + "balance_loss_clip": 1.0616188, + "balance_loss_mlp": 1.11342549, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 3.6563211355425453, + "language_loss": 0.95188707, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97698367, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.8125, + "step": 212, + "time_per_iteration": 2.5224313735961914 + }, + { + "auxiliary_loss_clip": 0.01383511, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.10996664, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.0395830195466504, + "language_loss": 0.76049221, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78549099, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.734375, + "step": 213, + "time_per_iteration": 2.76625919342041 + }, + { + "auxiliary_loss_clip": 0.0138732, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.10833097, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 8.414558483522654, + "language_loss": 0.86754733, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89245206, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.78125, + "step": 214, + "time_per_iteration": 2.500417470932007 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01127788, + "balance_loss_clip": 1.07397687, + "balance_loss_mlp": 1.11549139, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3854037050744057, + "language_loss": 0.77357471, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79872084, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 215, + "time_per_iteration": 2.6116256713867188 + }, + { + "auxiliary_loss_clip": 0.01394912, + "auxiliary_loss_mlp": 0.01111048, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.11393261, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44498430810385, + "language_loss": 0.90545797, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93051755, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.8125, + "step": 216, + "time_per_iteration": 2.5903706550598145 + }, + { + "auxiliary_loss_clip": 0.0138678, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.05622888, + "balance_loss_mlp": 1.10772836, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.630220300857062, + "language_loss": 0.93660516, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96154928, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.78125, + "step": 217, + "time_per_iteration": 2.5109100341796875 + }, + { + "auxiliary_loss_clip": 0.01381618, + "auxiliary_loss_mlp": 0.01107152, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.10700643, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9165712032980975, + "language_loss": 0.93656206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96144974, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.75, + "step": 218, + "time_per_iteration": 2.6586077213287354 + }, + { + "auxiliary_loss_clip": 0.01376505, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.05820787, + "balance_loss_mlp": 1.10663593, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.916363531530835, + "language_loss": 0.86148179, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88633436, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.703125, + "step": 219, + "time_per_iteration": 2.584040880203247 + }, + { + "auxiliary_loss_clip": 0.01383955, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.05056047, + "balance_loss_mlp": 1.110309, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7731463199764816, + "language_loss": 0.87598741, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90083969, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.75, + "step": 220, + "time_per_iteration": 2.6294186115264893 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.10389161, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.991547522293572, + "language_loss": 0.86413074, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88890207, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.6875, + "step": 221, + "time_per_iteration": 2.606137990951538 + }, + { + "auxiliary_loss_clip": 0.0137878, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.10240269, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.017045003530743, + "language_loss": 0.92153138, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94641757, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.765625, + "step": 222, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.01377393, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.10672021, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.7127164790698606, + "language_loss": 0.95539695, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98022527, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.71875, + "step": 223, + "time_per_iteration": 2.679387092590332 + }, + { + "auxiliary_loss_clip": 0.01377947, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_clip": 1.05612004, + "balance_loss_mlp": 1.10671806, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5543531214735586, + "language_loss": 0.88022512, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90507382, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.71875, + "step": 224, + "time_per_iteration": 2.6327528953552246 + }, + { + "auxiliary_loss_clip": 0.0137715, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.10632586, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0812591886363183, + "language_loss": 0.89642018, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92121875, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 225, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01374075, + "auxiliary_loss_mlp": 0.01115854, + "balance_loss_clip": 1.06273401, + "balance_loss_mlp": 1.10547256, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.1555099546542142, + "language_loss": 0.99022663, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01512599, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.6875, + "step": 226, + "time_per_iteration": 5.38438868522644 + }, + { + "auxiliary_loss_clip": 0.0137773, + "auxiliary_loss_mlp": 0.01111487, + "balance_loss_clip": 1.0584867, + "balance_loss_mlp": 1.10696185, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 15.523681056640678, + "language_loss": 0.91210413, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93699628, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 227, + "time_per_iteration": 2.5391762256622314 + }, + { + "auxiliary_loss_clip": 0.01252818, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.00666487, + "balance_loss_mlp": 1.10911703, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.99230217192713, + "language_loss": 0.57680154, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59958327, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.4375, + "step": 228, + "time_per_iteration": 3.1981163024902344 + }, + { + "auxiliary_loss_clip": 0.0136686, + "auxiliary_loss_mlp": 0.01110654, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.10228515, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.2779006264878374, + "language_loss": 0.8759563, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90073144, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 229, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01377631, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.10486007, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.280679608747667, + "language_loss": 0.84247303, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8672685, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.734375, + "step": 230, + "time_per_iteration": 2.501218557357788 + }, + { + "auxiliary_loss_clip": 0.01375417, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.06671298, + "balance_loss_mlp": 1.10600948, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.78964280876859, + "language_loss": 0.90378422, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92870116, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.6875, + "step": 231, + "time_per_iteration": 2.541137456893921 + }, + { + "auxiliary_loss_clip": 0.01377441, + "auxiliary_loss_mlp": 0.01108629, + "balance_loss_clip": 1.05941916, + "balance_loss_mlp": 1.10821056, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.031489983297281, + "language_loss": 0.83706695, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86192763, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.6875, + "step": 232, + "time_per_iteration": 2.5444753170013428 + }, + { + "auxiliary_loss_clip": 0.0137977, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04125488, + "balance_loss_mlp": 1.10017753, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.928489064169697, + "language_loss": 0.74033689, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76505834, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.796875, + "step": 233, + "time_per_iteration": 2.5364952087402344 + }, + { + "auxiliary_loss_clip": 0.01382965, + "auxiliary_loss_mlp": 0.0112384, + "balance_loss_clip": 1.07141209, + "balance_loss_mlp": 1.10741055, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.287774019631123, + "language_loss": 0.85867143, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88373953, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 234, + "time_per_iteration": 2.532949209213257 + }, + { + "auxiliary_loss_clip": 0.01375298, + "auxiliary_loss_mlp": 0.01106064, + "balance_loss_clip": 1.05683041, + "balance_loss_mlp": 1.10759592, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6527993685177154, + "language_loss": 0.89144391, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9162575, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.671875, + "step": 235, + "time_per_iteration": 2.509592294692993 + }, + { + "auxiliary_loss_clip": 0.0137416, + "auxiliary_loss_mlp": 0.01119384, + "balance_loss_clip": 1.06874382, + "balance_loss_mlp": 1.10830367, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 4.054998173736759, + "language_loss": 0.85780042, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88273585, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.65625, + "step": 236, + "time_per_iteration": 2.744925022125244 + }, + { + "auxiliary_loss_clip": 0.0137118, + "auxiliary_loss_mlp": 0.01099258, + "balance_loss_clip": 1.04871392, + "balance_loss_mlp": 1.10178149, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.128422813257453, + "language_loss": 0.82452404, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84922838, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.6875, + "step": 237, + "time_per_iteration": 2.67307710647583 + }, + { + "auxiliary_loss_clip": 0.01369116, + "auxiliary_loss_mlp": 0.01116968, + "balance_loss_clip": 1.0643487, + "balance_loss_mlp": 1.10451889, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 3.103781307849977, + "language_loss": 0.77321362, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79807448, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.65625, + "step": 238, + "time_per_iteration": 2.4973809719085693 + }, + { + "auxiliary_loss_clip": 0.01368178, + "auxiliary_loss_mlp": 0.01112367, + "balance_loss_clip": 1.06566119, + "balance_loss_mlp": 1.10654771, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.992064896075991, + "language_loss": 0.87370872, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89851415, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.609375, + "step": 239, + "time_per_iteration": 2.554222583770752 + }, + { + "auxiliary_loss_clip": 0.01352979, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.09776592, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.2433371609956283, + "language_loss": 0.93297911, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95751429, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.5625, + "step": 240, + "time_per_iteration": 2.588529348373413 + }, + { + "auxiliary_loss_clip": 0.01362634, + "auxiliary_loss_mlp": 0.01104045, + "balance_loss_clip": 1.05736244, + "balance_loss_mlp": 1.10324717, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.299780828803648, + "language_loss": 0.85129881, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8759656, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.59375, + "step": 241, + "time_per_iteration": 2.607272148132324 + }, + { + "auxiliary_loss_clip": 0.01360778, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.10865557, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.927287768398498, + "language_loss": 0.88410223, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90887022, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.53125, + "step": 242, + "time_per_iteration": 2.522657632827759 + }, + { + "auxiliary_loss_clip": 0.013595, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.04981756, + "balance_loss_mlp": 1.10147619, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.6384412969740922, + "language_loss": 0.86817086, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89276373, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.578125, + "step": 243, + "time_per_iteration": 2.5738751888275146 + }, + { + "auxiliary_loss_clip": 0.01366378, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.05574584, + "balance_loss_mlp": 1.10421979, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.576084931358892, + "language_loss": 0.84271425, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86743093, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 244, + "time_per_iteration": 2.51370906829834 + }, + { + "auxiliary_loss_clip": 0.01374385, + "auxiliary_loss_mlp": 0.01115077, + "balance_loss_clip": 1.06403196, + "balance_loss_mlp": 1.10701251, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.2775099056278916, + "language_loss": 0.78689361, + "learning_rate": 3.54199711087864e-06, + "loss": 0.8117882, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.671875, + "step": 245, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01372772, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.10232484, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2330220282190685, + "language_loss": 0.84241545, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86717069, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 246, + "time_per_iteration": 2.565614700317383 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01097455, + "balance_loss_clip": 1.04722059, + "balance_loss_mlp": 1.10181057, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9335653980079095, + "language_loss": 0.9014703, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92611909, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 247, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01365881, + "auxiliary_loss_mlp": 0.01097755, + "balance_loss_clip": 1.04952252, + "balance_loss_mlp": 1.09689593, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.1205098484246734, + "language_loss": 0.78058362, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80521989, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.6875, + "step": 248, + "time_per_iteration": 2.5365517139434814 + }, + { + "auxiliary_loss_clip": 0.0136687, + "auxiliary_loss_mlp": 0.01105288, + "balance_loss_clip": 1.05552983, + "balance_loss_mlp": 1.10545397, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1747011613954177, + "language_loss": 0.83849227, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86321384, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.609375, + "step": 249, + "time_per_iteration": 2.6142020225524902 + }, + { + "auxiliary_loss_clip": 0.01360073, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.05806887, + "balance_loss_mlp": 1.09971058, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.2137591284686455, + "language_loss": 0.93476778, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95942914, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 2.609375, + "step": 250, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01369254, + "auxiliary_loss_mlp": 0.01114661, + "balance_loss_clip": 1.06351972, + "balance_loss_mlp": 1.10460913, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.2612141068319622, + "language_loss": 0.97030997, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99514914, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.640625, + "step": 251, + "time_per_iteration": 2.5887296199798584 + }, + { + "auxiliary_loss_clip": 0.01362288, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.05723596, + "balance_loss_mlp": 1.09872079, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.0465178965121136, + "language_loss": 0.8428089, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86748511, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.640625, + "step": 252, + "time_per_iteration": 2.5749199390411377 + }, + { + "auxiliary_loss_clip": 0.01357969, + "auxiliary_loss_mlp": 0.01114738, + "balance_loss_clip": 1.06569552, + "balance_loss_mlp": 1.10169089, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.482990993198259, + "language_loss": 0.98208833, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00681543, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.5625, + "step": 253, + "time_per_iteration": 2.5639333724975586 + }, + { + "auxiliary_loss_clip": 0.01233728, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.00855541, + "balance_loss_mlp": 1.09965372, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8505459641429172, + "language_loss": 0.55672622, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57933319, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.34375, + "step": 254, + "time_per_iteration": 3.1063449382781982 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.06687438, + "balance_loss_mlp": 1.09652638, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.4360968938917065, + "language_loss": 0.90453845, + "learning_rate": 3.567754632921479e-06, + "loss": 0.9293263, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 255, + "time_per_iteration": 2.5746912956237793 + }, + { + "auxiliary_loss_clip": 0.01358909, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.07568169, + "balance_loss_mlp": 1.09931397, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.2666703391376903, + "language_loss": 0.8562001, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8810457, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.59375, + "step": 256, + "time_per_iteration": 2.6095149517059326 + }, + { + "auxiliary_loss_clip": 0.01366413, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_clip": 1.06305718, + "balance_loss_mlp": 1.09961021, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.7442871984488386, + "language_loss": 0.71504897, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73983842, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 257, + "time_per_iteration": 2.5939691066741943 + }, + { + "auxiliary_loss_clip": 0.01357007, + "auxiliary_loss_mlp": 0.01100177, + "balance_loss_clip": 1.05087197, + "balance_loss_mlp": 1.09875202, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9522192109187282, + "language_loss": 0.94659579, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97116768, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.578125, + "step": 258, + "time_per_iteration": 2.7119739055633545 + }, + { + "auxiliary_loss_clip": 0.01356701, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.05615926, + "balance_loss_mlp": 1.09608126, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.167214789879638, + "language_loss": 0.93174207, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95635182, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.59375, + "step": 259, + "time_per_iteration": 2.6776607036590576 + }, + { + "auxiliary_loss_clip": 0.01351639, + "auxiliary_loss_mlp": 0.010988, + "balance_loss_clip": 1.05297637, + "balance_loss_mlp": 1.10035825, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.1226725879970605, + "language_loss": 0.97360909, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99811351, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 2.515625, + "step": 260, + "time_per_iteration": 2.520759105682373 + }, + { + "auxiliary_loss_clip": 0.01365989, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.06282747, + "balance_loss_mlp": 1.10060608, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3569711169381, + "language_loss": 0.87644511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90120584, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.65625, + "step": 261, + "time_per_iteration": 2.5837602615356445 + }, + { + "auxiliary_loss_clip": 0.0135711, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.05613816, + "balance_loss_mlp": 1.09709311, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.9926513495738176, + "language_loss": 0.67226446, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69688779, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.59375, + "step": 262, + "time_per_iteration": 2.5490784645080566 + }, + { + "auxiliary_loss_clip": 0.01354995, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.07145, + "balance_loss_mlp": 1.0984714, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.3019763169045637, + "language_loss": 0.68570435, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71047044, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.5625, + "step": 263, + "time_per_iteration": 2.5207104682922363 + }, + { + "auxiliary_loss_clip": 0.01355963, + "auxiliary_loss_mlp": 0.01105396, + "balance_loss_clip": 1.055686, + "balance_loss_mlp": 1.09446979, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.705792502973735, + "language_loss": 0.85120308, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87581658, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 264, + "time_per_iteration": 2.559406280517578 + }, + { + "auxiliary_loss_clip": 0.01361439, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.10003614, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.651007312001026, + "language_loss": 1.04371059, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06825411, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.625, + "step": 265, + "time_per_iteration": 2.5076427459716797 + }, + { + "auxiliary_loss_clip": 0.01364923, + "auxiliary_loss_mlp": 0.01114141, + "balance_loss_clip": 1.06266677, + "balance_loss_mlp": 1.10278761, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2797174203272705, + "language_loss": 0.75153112, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77632177, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.625, + "step": 266, + "time_per_iteration": 2.52923583984375 + }, + { + "auxiliary_loss_clip": 0.01351984, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.05321336, + "balance_loss_mlp": 1.10004377, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7047265515665009, + "language_loss": 0.90568709, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93022615, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 2.515625, + "step": 267, + "time_per_iteration": 4.033226251602173 + }, + { + "auxiliary_loss_clip": 0.01359316, + "auxiliary_loss_mlp": 0.01118854, + "balance_loss_clip": 1.07143235, + "balance_loss_mlp": 1.09878063, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.258126572730018, + "language_loss": 0.86044276, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88522446, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.609375, + "step": 268, + "time_per_iteration": 3.9120936393737793 + }, + { + "auxiliary_loss_clip": 0.01352601, + "auxiliary_loss_mlp": 0.01098281, + "balance_loss_clip": 1.05186045, + "balance_loss_mlp": 1.10092831, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.4793793476816335, + "language_loss": 0.88284534, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90735412, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 2.515625, + "step": 269, + "time_per_iteration": 2.5170347690582275 + }, + { + "auxiliary_loss_clip": 0.01357286, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.04901874, + "balance_loss_mlp": 1.09723783, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.082153756456244, + "language_loss": 0.97073388, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99530637, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.59375, + "step": 270, + "time_per_iteration": 2.4856350421905518 + }, + { + "auxiliary_loss_clip": 0.01357366, + "auxiliary_loss_mlp": 0.01117767, + "balance_loss_clip": 1.07001138, + "balance_loss_mlp": 1.10259032, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1071719511680755, + "language_loss": 0.85919821, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88394946, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.546875, + "step": 271, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01355041, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.05201519, + "balance_loss_mlp": 1.09418058, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.6330072162998523, + "language_loss": 0.81509304, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83964115, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.609375, + "step": 272, + "time_per_iteration": 2.563840389251709 + }, + { + "auxiliary_loss_clip": 0.01348825, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.07229137, + "balance_loss_mlp": 1.09649634, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4112371858801436, + "language_loss": 0.81101978, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83568847, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.515625, + "step": 273, + "time_per_iteration": 2.504791736602783 + }, + { + "auxiliary_loss_clip": 0.01348205, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.05825627, + "balance_loss_mlp": 1.0930239, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.3125197915452387, + "language_loss": 0.91599321, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94053519, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.5625, + "step": 274, + "time_per_iteration": 2.530883312225342 + }, + { + "auxiliary_loss_clip": 0.01346841, + "auxiliary_loss_mlp": 0.01110058, + "balance_loss_clip": 1.06154013, + "balance_loss_mlp": 1.09588742, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8828740595481548, + "language_loss": 0.87952697, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90409595, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 2.515625, + "step": 275, + "time_per_iteration": 2.6067841053009033 + }, + { + "auxiliary_loss_clip": 0.01349399, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.05481219, + "balance_loss_mlp": 1.09579742, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.8814357547622875, + "language_loss": 0.80717576, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83170903, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.53125, + "step": 276, + "time_per_iteration": 2.5251641273498535 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.0561676, + "balance_loss_mlp": 1.0946306, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.7238418569970533, + "language_loss": 0.81033546, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83474076, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.46875, + "step": 277, + "time_per_iteration": 2.6796398162841797 + }, + { + "auxiliary_loss_clip": 0.01338755, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.08789539, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.810922211495867, + "language_loss": 0.80307728, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82741719, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.515625, + "step": 278, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01343866, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.05634809, + "balance_loss_mlp": 1.09381282, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.7778988036026468, + "language_loss": 0.90482658, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92928004, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 279, + "time_per_iteration": 2.571439504623413 + }, + { + "auxiliary_loss_clip": 0.01348727, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.06872559, + "balance_loss_mlp": 1.09391451, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.0477743200742387, + "language_loss": 0.94153798, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96618605, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.546875, + "step": 280, + "time_per_iteration": 2.5161728858947754 + }, + { + "auxiliary_loss_clip": 0.0134865, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.05864, + "balance_loss_mlp": 1.09245062, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 3.578687135351882, + "language_loss": 0.73929775, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76385343, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 2.5625, + "step": 281, + "time_per_iteration": 2.616241931915283 + }, + { + "auxiliary_loss_clip": 0.01343434, + "auxiliary_loss_mlp": 0.0111488, + "balance_loss_clip": 1.06977129, + "balance_loss_mlp": 1.09390783, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.679798242609796, + "language_loss": 0.80207133, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82665443, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.5, + "step": 282, + "time_per_iteration": 2.5421135425567627 + }, + { + "auxiliary_loss_clip": 0.01348806, + "auxiliary_loss_mlp": 0.01117348, + "balance_loss_clip": 1.0704273, + "balance_loss_mlp": 1.09599137, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1184562475367916, + "language_loss": 0.77788174, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80254328, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.53125, + "step": 283, + "time_per_iteration": 2.516474485397339 + }, + { + "auxiliary_loss_clip": 0.01349252, + "auxiliary_loss_mlp": 0.01091995, + "balance_loss_clip": 1.04788804, + "balance_loss_mlp": 1.09700751, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1009174504018544, + "language_loss": 0.84172702, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86613953, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.515625, + "step": 284, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01339164, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_clip": 1.05216956, + "balance_loss_mlp": 1.09148788, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.014395623363928, + "language_loss": 0.96993905, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99432468, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.46875, + "step": 285, + "time_per_iteration": 2.5412731170654297 + }, + { + "auxiliary_loss_clip": 0.01342544, + "auxiliary_loss_mlp": 0.01093983, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.09407294, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.2067050643741433, + "language_loss": 0.93951917, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96388453, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.484375, + "step": 286, + "time_per_iteration": 2.5895566940307617 + }, + { + "auxiliary_loss_clip": 0.0133546, + "auxiliary_loss_mlp": 0.01090331, + "balance_loss_clip": 1.04503167, + "balance_loss_mlp": 1.08924019, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.8729510510678706, + "language_loss": 0.92157722, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94583511, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 287, + "time_per_iteration": 2.6144802570343018 + }, + { + "auxiliary_loss_clip": 0.01338793, + "auxiliary_loss_mlp": 0.01089685, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.08859432, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2271144452092564, + "language_loss": 1.02026963, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04455447, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 288, + "time_per_iteration": 2.488274097442627 + }, + { + "auxiliary_loss_clip": 0.01222501, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02000237, + "balance_loss_mlp": 1.09325862, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9131614435254132, + "language_loss": 0.63915455, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66174459, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 1.296875, + "step": 289, + "time_per_iteration": 3.222426652908325 + }, + { + "auxiliary_loss_clip": 0.01341104, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.06379664, + "balance_loss_mlp": 1.09403992, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4014361624695173, + "language_loss": 0.88569438, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91018069, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 290, + "time_per_iteration": 2.49294114112854 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01091523, + "balance_loss_clip": 1.04631877, + "balance_loss_mlp": 1.09248078, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.156562479490788, + "language_loss": 0.84578067, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87007844, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.453125, + "step": 291, + "time_per_iteration": 2.5356485843658447 + }, + { + "auxiliary_loss_clip": 0.01345108, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.05897939, + "balance_loss_mlp": 1.10042334, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6617628708439536, + "language_loss": 0.72766221, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75218308, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.453125, + "step": 292, + "time_per_iteration": 2.6524176597595215 + }, + { + "auxiliary_loss_clip": 0.01333825, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.09236324, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2014441192179866, + "language_loss": 0.8726995, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89705306, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.40625, + "step": 293, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01334314, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.05959213, + "balance_loss_mlp": 1.09177744, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.3120260424061367, + "language_loss": 0.81276119, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83714324, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.4375, + "step": 294, + "time_per_iteration": 2.568784236907959 + }, + { + "auxiliary_loss_clip": 0.01334452, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.06274807, + "balance_loss_mlp": 1.08824301, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.9227055740425705, + "language_loss": 0.83710909, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86153215, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.46875, + "step": 295, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01339817, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.09874845, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.5339269047951727, + "language_loss": 0.84620988, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87071538, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.40625, + "step": 296, + "time_per_iteration": 2.5243051052093506 + }, + { + "auxiliary_loss_clip": 0.01338756, + "auxiliary_loss_mlp": 0.01097832, + "balance_loss_clip": 1.05417752, + "balance_loss_mlp": 1.09317493, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.123858619871597, + "language_loss": 0.87729871, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.453125, + "step": 297, + "time_per_iteration": 2.5186710357666016 + }, + { + "auxiliary_loss_clip": 0.01337139, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.05713463, + "balance_loss_mlp": 1.09108877, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.170328911832355, + "language_loss": 0.88528925, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90966904, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 298, + "time_per_iteration": 2.5320143699645996 + }, + { + "auxiliary_loss_clip": 0.0133273, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.07234538, + "balance_loss_mlp": 1.09249902, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8938405886263965, + "language_loss": 0.88666737, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91117901, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.40625, + "step": 299, + "time_per_iteration": 2.588275671005249 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01105829, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.09275746, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.2936483356677253, + "language_loss": 0.64349103, + "learning_rate": 3.672392800539357e-06, + "loss": 0.66795039, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 300, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01338706, + "auxiliary_loss_mlp": 0.01105447, + "balance_loss_clip": 1.05986142, + "balance_loss_mlp": 1.09540462, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.310898752337597, + "language_loss": 0.88330823, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774977, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.4375, + "step": 301, + "time_per_iteration": 2.499481439590454 + }, + { + "auxiliary_loss_clip": 0.01214573, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.00932336, + "balance_loss_mlp": 1.08753991, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8370211186232274, + "language_loss": 0.62198341, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64437497, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 1.265625, + "step": 302, + "time_per_iteration": 3.259997844696045 + }, + { + "auxiliary_loss_clip": 0.01329895, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.06198907, + "balance_loss_mlp": 1.08938098, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.491293816938874, + "language_loss": 0.89617372, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92054749, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 2.40625, + "step": 303, + "time_per_iteration": 2.536773920059204 + }, + { + "auxiliary_loss_clip": 0.01336859, + "auxiliary_loss_mlp": 0.01114111, + "balance_loss_clip": 1.06778669, + "balance_loss_mlp": 1.09363747, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 4.887297609803561, + "language_loss": 0.80314684, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.4375, + "step": 304, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.05144823, + "balance_loss_mlp": 1.09657788, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.8235558005033383, + "language_loss": 0.82894015, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85320443, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.34375, + "step": 305, + "time_per_iteration": 2.5195910930633545 + }, + { + "auxiliary_loss_clip": 0.01332168, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_clip": 1.04993677, + "balance_loss_mlp": 1.08868921, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9087210074301977, + "language_loss": 0.90843809, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93269092, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 306, + "time_per_iteration": 2.501276969909668 + }, + { + "auxiliary_loss_clip": 0.01324982, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.08638549, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1762826783898586, + "language_loss": 0.86435306, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88850832, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.390625, + "step": 307, + "time_per_iteration": 2.6048038005828857 + }, + { + "auxiliary_loss_clip": 0.01325097, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.05817199, + "balance_loss_mlp": 1.09046888, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.221444292833677, + "language_loss": 0.71723771, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74155033, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.34375, + "step": 308, + "time_per_iteration": 2.513774871826172 + }, + { + "auxiliary_loss_clip": 0.01331987, + "auxiliary_loss_mlp": 0.01102938, + "balance_loss_clip": 1.05904555, + "balance_loss_mlp": 1.08861351, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2254161740825293, + "language_loss": 0.91952753, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94387674, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 2.4375, + "step": 309, + "time_per_iteration": 5.224750280380249 + }, + { + "auxiliary_loss_clip": 0.01338325, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.08840334, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.8056803187702135, + "language_loss": 0.72399509, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74842793, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 310, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01330325, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_clip": 1.06790328, + "balance_loss_mlp": 1.09306264, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 12.392698164772181, + "language_loss": 0.74104297, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76546776, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.375, + "step": 311, + "time_per_iteration": 2.734072208404541 + }, + { + "auxiliary_loss_clip": 0.01337963, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.06619668, + "balance_loss_mlp": 1.09045064, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.2753160661232603, + "language_loss": 0.91518372, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93965685, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.46875, + "step": 312, + "time_per_iteration": 2.5117411613464355 + }, + { + "auxiliary_loss_clip": 0.01336169, + "auxiliary_loss_mlp": 0.01112089, + "balance_loss_clip": 1.06609774, + "balance_loss_mlp": 1.09088099, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.320247917383294, + "language_loss": 0.89746982, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92195237, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.453125, + "step": 313, + "time_per_iteration": 2.4761838912963867 + }, + { + "auxiliary_loss_clip": 0.01340305, + "auxiliary_loss_mlp": 0.01098393, + "balance_loss_clip": 1.05230689, + "balance_loss_mlp": 1.09061432, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3404867001555236, + "language_loss": 0.73099983, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75538683, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 314, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.01326469, + "auxiliary_loss_mlp": 0.01103837, + "balance_loss_clip": 1.06101751, + "balance_loss_mlp": 1.08694446, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.192553769026804, + "language_loss": 0.89887041, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92317349, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 315, + "time_per_iteration": 2.5857741832733154 + }, + { + "auxiliary_loss_clip": 0.01329672, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.05170512, + "balance_loss_mlp": 1.08870411, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8364758613144732, + "language_loss": 0.80796063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83221763, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.40625, + "step": 316, + "time_per_iteration": 2.5222342014312744 + }, + { + "auxiliary_loss_clip": 0.01324399, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.05131364, + "balance_loss_mlp": 1.08633423, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.1363686538021236, + "language_loss": 0.90357143, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92776608, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.375, + "step": 317, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01319895, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.0515281, + "balance_loss_mlp": 1.0845592, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.5260192321083794, + "language_loss": 0.90939772, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93355227, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.34375, + "step": 318, + "time_per_iteration": 2.488128185272217 + }, + { + "auxiliary_loss_clip": 0.01324457, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.04706657, + "balance_loss_mlp": 1.08574772, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.626221841877022, + "language_loss": 0.93980259, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96393579, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 319, + "time_per_iteration": 2.5184502601623535 + }, + { + "auxiliary_loss_clip": 0.01205117, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_clip": 1.06586683, + "balance_loss_mlp": 1.07482553, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9345393611259016, + "language_loss": 0.59860981, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62146461, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 1.296875, + "step": 320, + "time_per_iteration": 3.0250258445739746 + }, + { + "auxiliary_loss_clip": 0.01320993, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.05827808, + "balance_loss_mlp": 1.08425927, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.0799113353921572, + "language_loss": 0.89622325, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92044175, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.375, + "step": 321, + "time_per_iteration": 2.476439952850342 + }, + { + "auxiliary_loss_clip": 0.01332068, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.07620978, + "balance_loss_mlp": 1.08993089, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.068543890023447, + "language_loss": 0.82884163, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85337007, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 2.421875, + "step": 322, + "time_per_iteration": 2.556302309036255 + }, + { + "auxiliary_loss_clip": 0.01332156, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.04828596, + "balance_loss_mlp": 1.08754158, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.2506232399398245, + "language_loss": 0.72734368, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75156873, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.453125, + "step": 323, + "time_per_iteration": 2.5033397674560547 + }, + { + "auxiliary_loss_clip": 0.01318896, + "auxiliary_loss_mlp": 0.01090622, + "balance_loss_clip": 1.04763484, + "balance_loss_mlp": 1.08184087, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.023515622890843, + "language_loss": 0.92639947, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95049465, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.375, + "step": 324, + "time_per_iteration": 2.5194544792175293 + }, + { + "auxiliary_loss_clip": 0.01328869, + "auxiliary_loss_mlp": 0.01089838, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.08943164, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 4.018466874717804, + "language_loss": 0.65336061, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67754775, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.390625, + "step": 325, + "time_per_iteration": 2.5107386112213135 + }, + { + "auxiliary_loss_clip": 0.0132709, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.093485, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.921455060851243, + "language_loss": 0.76449442, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78877723, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.34375, + "step": 326, + "time_per_iteration": 2.5080325603485107 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.06015599, + "balance_loss_mlp": 1.08845115, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1551163890972123, + "language_loss": 0.79176939, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8160091, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 327, + "time_per_iteration": 2.5449633598327637 + }, + { + "auxiliary_loss_clip": 0.01326802, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.06984949, + "balance_loss_mlp": 1.08873606, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.1574079642063246, + "language_loss": 0.80725288, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83164048, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.390625, + "step": 328, + "time_per_iteration": 2.5418970584869385 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.08396721, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.245263087715646, + "language_loss": 0.93704766, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96127105, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.40625, + "step": 329, + "time_per_iteration": 2.4910004138946533 + }, + { + "auxiliary_loss_clip": 0.01332781, + "auxiliary_loss_mlp": 0.01105781, + "balance_loss_clip": 1.06253231, + "balance_loss_mlp": 1.08930123, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.9776357674257365, + "language_loss": 0.74277973, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7671653, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 330, + "time_per_iteration": 2.51430082321167 + }, + { + "auxiliary_loss_clip": 0.01328701, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.07814097, + "balance_loss_mlp": 1.08762872, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.972763157156593, + "language_loss": 0.93870068, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96319681, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 331, + "time_per_iteration": 2.4759159088134766 + }, + { + "auxiliary_loss_clip": 0.01316192, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.04938233, + "balance_loss_mlp": 1.0853951, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.6958694906457836, + "language_loss": 0.92730892, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95136791, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 332, + "time_per_iteration": 2.49817156791687 + }, + { + "auxiliary_loss_clip": 0.01325132, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.04903162, + "balance_loss_mlp": 1.09081161, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.6289067025313777, + "language_loss": 0.75589794, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78007442, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.34375, + "step": 333, + "time_per_iteration": 2.5180609226226807 + }, + { + "auxiliary_loss_clip": 0.01323371, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.08625877, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1766901409232426, + "language_loss": 0.78768885, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81179881, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.375, + "step": 334, + "time_per_iteration": 2.614708423614502 + }, + { + "auxiliary_loss_clip": 0.01324397, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.08276975, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.4059127888346916, + "language_loss": 0.83083838, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85503072, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 2.421875, + "step": 335, + "time_per_iteration": 2.495260000228882 + }, + { + "auxiliary_loss_clip": 0.01320649, + "auxiliary_loss_mlp": 0.01090782, + "balance_loss_clip": 1.04934454, + "balance_loss_mlp": 1.08585882, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.166489879958422, + "language_loss": 0.92639577, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95051014, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.34375, + "step": 336, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01321744, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.04139614, + "balance_loss_mlp": 1.08352447, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.825762702383362, + "language_loss": 0.88474333, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90879244, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 337, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01310297, + "auxiliary_loss_mlp": 0.01101804, + "balance_loss_clip": 1.05836427, + "balance_loss_mlp": 1.08001363, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5415234153999902, + "language_loss": 0.89914495, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92326593, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 338, + "time_per_iteration": 2.5795979499816895 + }, + { + "auxiliary_loss_clip": 0.01324391, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.05742574, + "balance_loss_mlp": 1.08479571, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.047046576054304, + "language_loss": 0.84801471, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87225461, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.40625, + "step": 339, + "time_per_iteration": 2.4558403491973877 + }, + { + "auxiliary_loss_clip": 0.01326609, + "auxiliary_loss_mlp": 0.01093427, + "balance_loss_clip": 1.05001152, + "balance_loss_mlp": 1.08709431, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7544231793273473, + "language_loss": 0.88913274, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91333312, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.40625, + "step": 340, + "time_per_iteration": 2.5330188274383545 + }, + { + "auxiliary_loss_clip": 0.01323557, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.0859195, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.2340783182785975, + "language_loss": 0.88071406, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 2.375, + "step": 341, + "time_per_iteration": 2.502161979675293 + }, + { + "auxiliary_loss_clip": 0.01325847, + "auxiliary_loss_mlp": 0.01099304, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.08389783, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 3.2005009235922572, + "language_loss": 0.80293322, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82718468, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.421875, + "step": 342, + "time_per_iteration": 2.5315535068511963 + }, + { + "auxiliary_loss_clip": 0.0131301, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.05293417, + "balance_loss_mlp": 1.08132875, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.399130254204822, + "language_loss": 0.89451253, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91862881, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.3125, + "step": 343, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01325104, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_clip": 1.05342627, + "balance_loss_mlp": 1.08973229, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3234219523507296, + "language_loss": 0.78252918, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80672336, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.359375, + "step": 344, + "time_per_iteration": 2.514665365219116 + }, + { + "auxiliary_loss_clip": 0.01309596, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.05730188, + "balance_loss_mlp": 1.08079529, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8371023099908983, + "language_loss": 0.75138956, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77549529, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.28125, + "step": 345, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01318525, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.05806339, + "balance_loss_mlp": 1.08789146, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.0741733748571565, + "language_loss": 0.90269232, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92688763, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.3125, + "step": 346, + "time_per_iteration": 2.5487060546875 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01095048, + "balance_loss_clip": 1.05527973, + "balance_loss_mlp": 1.08358788, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0766581400667, + "language_loss": 0.78869188, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.3125, + "step": 347, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.01317315, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.05335259, + "balance_loss_mlp": 1.08719826, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.4234628631287927, + "language_loss": 0.71424043, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7383827, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.3125, + "step": 348, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01319638, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.0595324, + "balance_loss_mlp": 1.08435416, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 4.002924557181807, + "language_loss": 0.76819432, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79240972, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.34375, + "step": 349, + "time_per_iteration": 2.4884049892425537 + }, + { + "auxiliary_loss_clip": 0.0130292, + "auxiliary_loss_mlp": 0.0109884, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.08141851, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.9115672624672835, + "language_loss": 0.85271406, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87673163, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 350, + "time_per_iteration": 2.559812307357788 + }, + { + "auxiliary_loss_clip": 0.01311162, + "auxiliary_loss_mlp": 0.01089483, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.08571863, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.3355222976898764, + "language_loss": 0.80104828, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82505476, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.25, + "step": 351, + "time_per_iteration": 5.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01318524, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.06048024, + "balance_loss_mlp": 1.08623564, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8473493260702125, + "language_loss": 0.87258279, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89680254, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 352, + "time_per_iteration": 2.4787278175354004 + }, + { + "auxiliary_loss_clip": 0.01312545, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.06248152, + "balance_loss_mlp": 1.08574009, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8920106465676412, + "language_loss": 0.82386625, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84804279, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.265625, + "step": 353, + "time_per_iteration": 2.5428433418273926 + }, + { + "auxiliary_loss_clip": 0.01307832, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.05133069, + "balance_loss_mlp": 1.08353949, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.0636001035279694, + "language_loss": 0.8102631, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83425963, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.25, + "step": 354, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01315043, + "auxiliary_loss_mlp": 0.01092413, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.08190715, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.8065821662627575, + "language_loss": 0.80764574, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83172029, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 355, + "time_per_iteration": 2.56968355178833 + }, + { + "auxiliary_loss_clip": 0.01310125, + "auxiliary_loss_mlp": 0.01086869, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.08140039, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2488803729957, + "language_loss": 0.89553398, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91950381, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 356, + "time_per_iteration": 2.5510213375091553 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.08451605, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.7055681522526522, + "language_loss": 0.80032516, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82424533, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.234375, + "step": 357, + "time_per_iteration": 2.5834848880767822 + }, + { + "auxiliary_loss_clip": 0.01311386, + "auxiliary_loss_mlp": 0.0108216, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.08195996, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3810225918991827, + "language_loss": 0.7661376, + "learning_rate": 3.786194003461506e-06, + "loss": 0.7900731, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.296875, + "step": 358, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01308618, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.04574156, + "balance_loss_mlp": 1.08024073, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 3.004949550769694, + "language_loss": 0.88491321, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90888453, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.28125, + "step": 359, + "time_per_iteration": 2.452698230743408 + }, + { + "auxiliary_loss_clip": 0.01316066, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.05000377, + "balance_loss_mlp": 1.08438587, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.789884231725057, + "language_loss": 0.76007903, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.3125, + "step": 360, + "time_per_iteration": 2.490006685256958 + }, + { + "auxiliary_loss_clip": 0.01189834, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06634831, + "balance_loss_mlp": 1.06162107, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8685264055585812, + "language_loss": 0.64943242, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67212784, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 1.28125, + "step": 361, + "time_per_iteration": 3.1978280544281006 + }, + { + "auxiliary_loss_clip": 0.01307066, + "auxiliary_loss_mlp": 0.01088482, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.0776422, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.6839093883440213, + "language_loss": 0.78157276, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80552828, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.296875, + "step": 362, + "time_per_iteration": 2.5401153564453125 + }, + { + "auxiliary_loss_clip": 0.0131339, + "auxiliary_loss_mlp": 0.01092034, + "balance_loss_clip": 1.05171776, + "balance_loss_mlp": 1.08265781, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.163466714708112, + "language_loss": 0.92508751, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94914174, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 363, + "time_per_iteration": 2.4868171215057373 + }, + { + "auxiliary_loss_clip": 0.01307593, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_clip": 1.06270981, + "balance_loss_mlp": 1.08121252, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.137373361500905, + "language_loss": 0.89611077, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92020839, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 364, + "time_per_iteration": 2.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01313873, + "auxiliary_loss_mlp": 0.01094072, + "balance_loss_clip": 1.05232477, + "balance_loss_mlp": 1.08512843, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.0040846596101867, + "language_loss": 0.79597497, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82005441, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.28125, + "step": 365, + "time_per_iteration": 2.5358779430389404 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_clip": 1.05218291, + "balance_loss_mlp": 1.08262253, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.4198695758814126, + "language_loss": 0.84312123, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86713445, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.25, + "step": 366, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01314411, + "auxiliary_loss_mlp": 0.01089093, + "balance_loss_clip": 1.05008757, + "balance_loss_mlp": 1.08409071, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.4790438398014114, + "language_loss": 0.87009263, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89412761, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.296875, + "step": 367, + "time_per_iteration": 2.486476421356201 + }, + { + "auxiliary_loss_clip": 0.01315695, + "auxiliary_loss_mlp": 0.01094559, + "balance_loss_clip": 1.05247772, + "balance_loss_mlp": 1.08183074, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.1787846704720906, + "language_loss": 0.84725291, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87135541, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.34375, + "step": 368, + "time_per_iteration": 2.522035837173462 + }, + { + "auxiliary_loss_clip": 0.01314671, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.04559815, + "balance_loss_mlp": 1.07997978, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.205334425353566, + "language_loss": 0.75328851, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77728999, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.34375, + "step": 369, + "time_per_iteration": 2.5247385501861572 + }, + { + "auxiliary_loss_clip": 0.01309465, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.06241453, + "balance_loss_mlp": 1.08204889, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.195001895084689, + "language_loss": 0.82444763, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.84857059, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.28125, + "step": 370, + "time_per_iteration": 2.556654453277588 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.01102256, + "balance_loss_clip": 1.06186807, + "balance_loss_mlp": 1.08148122, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.701167396379405, + "language_loss": 0.81576145, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83986878, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.265625, + "step": 371, + "time_per_iteration": 2.5303707122802734 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01097647, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.08685589, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.421527930745161, + "language_loss": 0.83273733, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85685182, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 372, + "time_per_iteration": 2.528141975402832 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.01093239, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.08068216, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.9515576064335742, + "language_loss": 0.78448784, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80846798, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.234375, + "step": 373, + "time_per_iteration": 2.4879236221313477 + }, + { + "auxiliary_loss_clip": 0.01310159, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.08387947, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.577150517784044, + "language_loss": 0.77507353, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79906291, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.265625, + "step": 374, + "time_per_iteration": 2.467660665512085 + }, + { + "auxiliary_loss_clip": 0.01300907, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_clip": 1.03415811, + "balance_loss_mlp": 1.07458413, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 2.1361288872426187, + "language_loss": 0.85989249, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.8836568, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.265625, + "step": 375, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01307901, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.05767775, + "balance_loss_mlp": 1.08341241, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 5.5735447387306785, + "language_loss": 0.89170349, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91578341, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.25, + "step": 376, + "time_per_iteration": 2.53151798248291 + }, + { + "auxiliary_loss_clip": 0.01309113, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.04908752, + "balance_loss_mlp": 1.07899499, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.261190841992283, + "language_loss": 0.74947262, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77344215, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.3125, + "step": 377, + "time_per_iteration": 2.463115692138672 + }, + { + "auxiliary_loss_clip": 0.0129987, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.05009794, + "balance_loss_mlp": 1.08131123, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 9.398931100052017, + "language_loss": 0.99195766, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01586914, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 2.1875, + "step": 378, + "time_per_iteration": 2.4765851497650146 + }, + { + "auxiliary_loss_clip": 0.01180245, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.10910404, + "balance_loss_mlp": 1.06006432, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9843357397114052, + "language_loss": 0.75457036, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77759647, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.203125, + "step": 379, + "time_per_iteration": 3.113067388534546 + }, + { + "auxiliary_loss_clip": 0.01308809, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.0448581, + "balance_loss_mlp": 1.07811105, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 4.195302770466088, + "language_loss": 0.78423429, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80815697, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.3125, + "step": 380, + "time_per_iteration": 2.6457204818725586 + }, + { + "auxiliary_loss_clip": 0.01302565, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.04527259, + "balance_loss_mlp": 1.08019924, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.272240555091753, + "language_loss": 0.9679752, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99183118, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.21875, + "step": 381, + "time_per_iteration": 2.485316038131714 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.04501581, + "balance_loss_mlp": 1.08177519, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.322972014312181, + "language_loss": 0.88035834, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90432727, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.3125, + "step": 382, + "time_per_iteration": 2.5361156463623047 + }, + { + "auxiliary_loss_clip": 0.01306631, + "auxiliary_loss_mlp": 0.01099641, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.08242524, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.197151340607638, + "language_loss": 0.84830511, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87236774, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.25, + "step": 383, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.01303681, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.06673658, + "balance_loss_mlp": 1.08259249, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2992198386883116, + "language_loss": 0.83199835, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85609907, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.203125, + "step": 384, + "time_per_iteration": 2.5008413791656494 + }, + { + "auxiliary_loss_clip": 0.01303616, + "auxiliary_loss_mlp": 0.0109643, + "balance_loss_clip": 1.06030965, + "balance_loss_mlp": 1.08539534, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8570399395654076, + "language_loss": 0.89240694, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91640741, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.1875, + "step": 385, + "time_per_iteration": 2.4913859367370605 + }, + { + "auxiliary_loss_clip": 0.01306859, + "auxiliary_loss_mlp": 0.01121647, + "balance_loss_clip": 1.08397639, + "balance_loss_mlp": 1.0826149, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.2576284783670357, + "language_loss": 0.70096415, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72524917, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.234375, + "step": 386, + "time_per_iteration": 2.5017154216766357 + }, + { + "auxiliary_loss_clip": 0.01308067, + "auxiliary_loss_mlp": 0.01098351, + "balance_loss_clip": 1.06072879, + "balance_loss_mlp": 1.08460176, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9470877788533054, + "language_loss": 0.87909782, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90316188, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.234375, + "step": 387, + "time_per_iteration": 2.5142157077789307 + }, + { + "auxiliary_loss_clip": 0.01308318, + "auxiliary_loss_mlp": 0.01085815, + "balance_loss_clip": 1.04666662, + "balance_loss_mlp": 1.08291698, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.441105853176172, + "language_loss": 0.83429295, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85823429, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.25, + "step": 388, + "time_per_iteration": 2.591242790222168 + }, + { + "auxiliary_loss_clip": 0.01305661, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_clip": 1.05754054, + "balance_loss_mlp": 1.08271885, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2646980282386644, + "language_loss": 0.93823689, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96223652, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.21875, + "step": 389, + "time_per_iteration": 2.5427236557006836 + }, + { + "auxiliary_loss_clip": 0.01299094, + "auxiliary_loss_mlp": 0.01087693, + "balance_loss_clip": 1.04954624, + "balance_loss_mlp": 1.08334351, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.08298220488583, + "language_loss": 0.87901413, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90288198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.15625, + "step": 390, + "time_per_iteration": 2.53519606590271 + }, + { + "auxiliary_loss_clip": 0.01304239, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_clip": 1.05927861, + "balance_loss_mlp": 1.08334053, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.2293869448662362, + "language_loss": 0.89346433, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91746497, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.203125, + "step": 391, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01302453, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.08116579, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.022763227206087, + "language_loss": 0.86065882, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88441086, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.21875, + "step": 392, + "time_per_iteration": 4.050429105758667 + }, + { + "auxiliary_loss_clip": 0.01297975, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.08006191, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9628480690926318, + "language_loss": 0.88900077, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91284919, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.1875, + "step": 393, + "time_per_iteration": 3.9293932914733887 + }, + { + "auxiliary_loss_clip": 0.01309989, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.06449771, + "balance_loss_mlp": 1.087502, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0531375516435943, + "language_loss": 0.81400156, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83814055, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.21875, + "step": 394, + "time_per_iteration": 2.552100658416748 + }, + { + "auxiliary_loss_clip": 0.01299653, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.04611897, + "balance_loss_mlp": 1.08043575, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.0447414784698092, + "language_loss": 0.86189264, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88573563, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.1875, + "step": 395, + "time_per_iteration": 2.536823272705078 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.0590049, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9487784547172928, + "language_loss": 0.63808912, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66028047, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.15625, + "step": 396, + "time_per_iteration": 2.935506582260132 + }, + { + "auxiliary_loss_clip": 0.01296295, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_clip": 1.03252339, + "balance_loss_mlp": 1.07895613, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6168641306315172, + "language_loss": 0.83744055, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86109853, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.171875, + "step": 397, + "time_per_iteration": 2.5051028728485107 + }, + { + "auxiliary_loss_clip": 0.01302535, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05288601, + "balance_loss_mlp": 1.08300877, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.535145802301163, + "language_loss": 0.84050488, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86444056, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.1875, + "step": 398, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01300466, + "auxiliary_loss_mlp": 0.0108273, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.07864475, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.904470095612531, + "language_loss": 0.85865271, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88248467, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.21875, + "step": 399, + "time_per_iteration": 2.4674201011657715 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.08021355, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.016759933832732, + "language_loss": 0.86157769, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88546383, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.15625, + "step": 400, + "time_per_iteration": 2.554075241088867 + }, + { + "auxiliary_loss_clip": 0.01303599, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.04652512, + "balance_loss_mlp": 1.0848943, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 3.068890951588493, + "language_loss": 0.79142016, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.8152917, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.1875, + "step": 401, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01297911, + "auxiliary_loss_mlp": 0.01096359, + "balance_loss_clip": 1.05968988, + "balance_loss_mlp": 1.07987046, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.2009554384450154, + "language_loss": 0.78456193, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80850464, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.1875, + "step": 402, + "time_per_iteration": 2.5531415939331055 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.04529142, + "balance_loss_mlp": 1.07989287, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7198213535828923, + "language_loss": 0.94637424, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97023368, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 403, + "time_per_iteration": 2.4873671531677246 + }, + { + "auxiliary_loss_clip": 0.01306025, + "auxiliary_loss_mlp": 0.01095616, + "balance_loss_clip": 1.05620587, + "balance_loss_mlp": 1.07952547, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3706875621243246, + "language_loss": 0.99751151, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02152789, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 404, + "time_per_iteration": 2.5400550365448 + }, + { + "auxiliary_loss_clip": 0.01304501, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.06716657, + "balance_loss_mlp": 1.08213115, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.480197457162756, + "language_loss": 0.87603909, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90012866, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.21875, + "step": 405, + "time_per_iteration": 2.4698479175567627 + }, + { + "auxiliary_loss_clip": 0.01314075, + "auxiliary_loss_mlp": 0.01107285, + "balance_loss_clip": 1.06835127, + "balance_loss_mlp": 1.08775485, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 3.242686201363518, + "language_loss": 0.93258083, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9567945, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.265625, + "step": 406, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.01092168, + "balance_loss_clip": 1.05330622, + "balance_loss_mlp": 1.08378315, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.059728688773918, + "language_loss": 0.87446553, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89843762, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.21875, + "step": 407, + "time_per_iteration": 2.5017173290252686 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.04814506, + "balance_loss_mlp": 1.08445001, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.439524495250932, + "language_loss": 0.7404871, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76435596, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.171875, + "step": 408, + "time_per_iteration": 2.6097092628479004 + }, + { + "auxiliary_loss_clip": 0.013061, + "auxiliary_loss_mlp": 0.01096961, + "balance_loss_clip": 1.05771768, + "balance_loss_mlp": 1.08381224, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.750776221383638, + "language_loss": 0.92393035, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94796097, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.21875, + "step": 409, + "time_per_iteration": 2.5198304653167725 + }, + { + "auxiliary_loss_clip": 0.01304769, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_clip": 1.04488206, + "balance_loss_mlp": 1.0854609, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 1.9763435283924244, + "language_loss": 0.82926536, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85311788, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.1875, + "step": 410, + "time_per_iteration": 2.624333143234253 + }, + { + "auxiliary_loss_clip": 0.01307118, + "auxiliary_loss_mlp": 0.01089288, + "balance_loss_clip": 1.05164146, + "balance_loss_mlp": 1.08556843, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 4.176812441051998, + "language_loss": 0.77715993, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80112404, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.21875, + "step": 411, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01303549, + "auxiliary_loss_mlp": 0.01102238, + "balance_loss_clip": 1.06311393, + "balance_loss_mlp": 1.08078265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.1103060729449883, + "language_loss": 0.86276567, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88682353, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 412, + "time_per_iteration": 2.4968833923339844 + }, + { + "auxiliary_loss_clip": 0.01168305, + "auxiliary_loss_mlp": 0.01068817, + "balance_loss_clip": 1.05632353, + "balance_loss_mlp": 1.05478358, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8568818905087673, + "language_loss": 0.58512402, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60749531, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 1.1328125, + "step": 413, + "time_per_iteration": 3.1763217449188232 + }, + { + "auxiliary_loss_clip": 0.01296528, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.07941055, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7554792190049524, + "language_loss": 0.80704832, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83093566, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.171875, + "step": 414, + "time_per_iteration": 2.5954627990722656 + }, + { + "auxiliary_loss_clip": 0.01292737, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.07739186, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3796689224247904, + "language_loss": 0.80473328, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82859504, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.15625, + "step": 415, + "time_per_iteration": 2.471665620803833 + }, + { + "auxiliary_loss_clip": 0.0131185, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.0481931, + "balance_loss_mlp": 1.08601356, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 5.333540620494007, + "language_loss": 0.96179891, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98577416, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.25, + "step": 416, + "time_per_iteration": 2.5133068561553955 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03702867, + "balance_loss_mlp": 1.0806849, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.409464042642492, + "language_loss": 0.77541196, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79917544, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 417, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.01297091, + "auxiliary_loss_mlp": 0.01092626, + "balance_loss_clip": 1.05512297, + "balance_loss_mlp": 1.08281994, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6345521849457858, + "language_loss": 0.7689445, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 418, + "time_per_iteration": 2.6002862453460693 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.08383846, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.548681745998596, + "language_loss": 0.81088459, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83468759, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.203125, + "step": 419, + "time_per_iteration": 2.5097553730010986 + }, + { + "auxiliary_loss_clip": 0.01298642, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.0453577, + "balance_loss_mlp": 1.08236253, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9166879875817555, + "language_loss": 0.73812175, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.761962, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 2.15625, + "step": 420, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01298409, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.06086528, + "balance_loss_mlp": 1.0791508, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7246544027149788, + "language_loss": 0.78928417, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8132515, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.1875, + "step": 421, + "time_per_iteration": 2.583979845046997 + }, + { + "auxiliary_loss_clip": 0.01300301, + "auxiliary_loss_mlp": 0.01095113, + "balance_loss_clip": 1.05589294, + "balance_loss_mlp": 1.08374381, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.879256315405443, + "language_loss": 0.81915486, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84310895, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.171875, + "step": 422, + "time_per_iteration": 2.5834591388702393 + }, + { + "auxiliary_loss_clip": 0.01299282, + "auxiliary_loss_mlp": 0.01079788, + "balance_loss_clip": 1.0445497, + "balance_loss_mlp": 1.07925105, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9426129656279463, + "language_loss": 0.83468062, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85847133, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.203125, + "step": 423, + "time_per_iteration": 2.5526318550109863 + }, + { + "auxiliary_loss_clip": 0.01293361, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.04978371, + "balance_loss_mlp": 1.07668817, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.7010989411926367, + "language_loss": 0.74435121, + "learning_rate": 3.895134094768415e-06, + "loss": 0.768152, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.171875, + "step": 424, + "time_per_iteration": 2.606895923614502 + }, + { + "auxiliary_loss_clip": 0.01303473, + "auxiliary_loss_mlp": 0.01097188, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.08349586, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.227147445366898, + "language_loss": 0.83008313, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85408974, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.203125, + "step": 425, + "time_per_iteration": 2.522517442703247 + }, + { + "auxiliary_loss_clip": 0.01299491, + "auxiliary_loss_mlp": 0.01096328, + "balance_loss_clip": 1.05691719, + "balance_loss_mlp": 1.07528758, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.394258070540652, + "language_loss": 0.85481966, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87877786, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.25, + "step": 426, + "time_per_iteration": 2.5039095878601074 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.04526472, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8962322500302954, + "language_loss": 0.57186544, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5939464, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 1.1484375, + "step": 427, + "time_per_iteration": 3.2289342880249023 + }, + { + "auxiliary_loss_clip": 0.01297452, + "auxiliary_loss_mlp": 0.01095521, + "balance_loss_clip": 1.05849457, + "balance_loss_mlp": 1.0838623, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6536896946259816, + "language_loss": 0.88190198, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90583158, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.125, + "step": 428, + "time_per_iteration": 2.500389814376831 + }, + { + "auxiliary_loss_clip": 0.01290417, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.07718623, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6692033855414803, + "language_loss": 0.85672665, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88041949, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.125, + "step": 429, + "time_per_iteration": 2.605687379837036 + }, + { + "auxiliary_loss_clip": 0.01297427, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_clip": 1.04373491, + "balance_loss_mlp": 1.07673144, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.5023850128037672, + "language_loss": 0.88384748, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90764678, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.203125, + "step": 430, + "time_per_iteration": 2.593492269515991 + }, + { + "auxiliary_loss_clip": 0.01298542, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.04799962, + "balance_loss_mlp": 1.08428442, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9811912271744876, + "language_loss": 0.84202254, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86584389, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.140625, + "step": 431, + "time_per_iteration": 2.5326902866363525 + }, + { + "auxiliary_loss_clip": 0.01291302, + "auxiliary_loss_mlp": 0.01073914, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0772872, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.686150654607635, + "language_loss": 0.86775959, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89141178, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.140625, + "step": 432, + "time_per_iteration": 2.4793269634246826 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.04491723, + "balance_loss_mlp": 1.08109105, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.6953453355349684, + "language_loss": 0.76074433, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78451484, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.15625, + "step": 433, + "time_per_iteration": 2.6125545501708984 + }, + { + "auxiliary_loss_clip": 0.01296292, + "auxiliary_loss_mlp": 0.0109282, + "balance_loss_clip": 1.05312383, + "balance_loss_mlp": 1.07772529, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.2540618473103247, + "language_loss": 0.89764363, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153478, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.1875, + "step": 434, + "time_per_iteration": 5.3097922801971436 + }, + { + "auxiliary_loss_clip": 0.01297376, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.06404209, + "balance_loss_mlp": 1.08362865, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 6.328317132251919, + "language_loss": 0.7985189, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82252169, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 435, + "time_per_iteration": 3.9629530906677246 + }, + { + "auxiliary_loss_clip": 0.01291104, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.0750463, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.559504815450524, + "language_loss": 0.86357677, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739926, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.15625, + "step": 436, + "time_per_iteration": 2.479033946990967 + }, + { + "auxiliary_loss_clip": 0.01296325, + "auxiliary_loss_mlp": 0.01099771, + "balance_loss_clip": 1.06214869, + "balance_loss_mlp": 1.07964039, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.6168892141891944, + "language_loss": 0.75002837, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77398932, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.171875, + "step": 437, + "time_per_iteration": 2.508769989013672 + }, + { + "auxiliary_loss_clip": 0.01293849, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.08015561, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.3031145987765758, + "language_loss": 0.91467845, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93865746, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.140625, + "step": 438, + "time_per_iteration": 2.4693844318389893 + }, + { + "auxiliary_loss_clip": 0.01155458, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_clip": 1.05276346, + "balance_loss_mlp": 1.0448494, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.877669139368542, + "language_loss": 0.62577796, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64797509, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 1.109375, + "step": 439, + "time_per_iteration": 3.162259101867676 + }, + { + "auxiliary_loss_clip": 0.01303989, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_clip": 1.05873275, + "balance_loss_mlp": 1.08440769, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1384369611317493, + "language_loss": 0.75629139, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78031218, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.203125, + "step": 440, + "time_per_iteration": 2.5541677474975586 + }, + { + "auxiliary_loss_clip": 0.01294139, + "auxiliary_loss_mlp": 0.01082398, + "balance_loss_clip": 1.04408443, + "balance_loss_mlp": 1.08003163, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9583565981573345, + "language_loss": 0.83186466, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85563004, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 441, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01298235, + "auxiliary_loss_mlp": 0.01092726, + "balance_loss_clip": 1.05286217, + "balance_loss_mlp": 1.07855892, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.035076381127293, + "language_loss": 0.7850582, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80896777, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.203125, + "step": 442, + "time_per_iteration": 2.477555990219116 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01012445, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.04045749, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9584767110468104, + "language_loss": 0.64475185, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66633147, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 1.046875, + "step": 443, + "time_per_iteration": 2.9838714599609375 + }, + { + "auxiliary_loss_clip": 0.01297944, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.04941845, + "balance_loss_mlp": 1.08318424, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.4335650573352483, + "language_loss": 0.82707053, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85092688, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 444, + "time_per_iteration": 2.4520323276519775 + }, + { + "auxiliary_loss_clip": 0.0130195, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0440464, + "balance_loss_mlp": 1.08103406, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.6903851096875733, + "language_loss": 0.95400113, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97787213, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 445, + "time_per_iteration": 2.5113518238067627 + }, + { + "auxiliary_loss_clip": 0.01296406, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.08177555, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.416617421630428, + "language_loss": 0.91790259, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94183153, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.15625, + "step": 446, + "time_per_iteration": 2.4585111141204834 + }, + { + "auxiliary_loss_clip": 0.01293099, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.04718637, + "balance_loss_mlp": 1.08102632, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.3983095061811635, + "language_loss": 0.80024058, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82402921, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 447, + "time_per_iteration": 2.509643316268921 + }, + { + "auxiliary_loss_clip": 0.01292768, + "auxiliary_loss_mlp": 0.01072511, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.07935369, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.4579217038825423, + "language_loss": 0.86773896, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89139175, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 448, + "time_per_iteration": 2.477384328842163 + }, + { + "auxiliary_loss_clip": 0.01287268, + "auxiliary_loss_mlp": 0.01093327, + "balance_loss_clip": 1.0583508, + "balance_loss_mlp": 1.07870793, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.1426472419274503, + "language_loss": 0.88779259, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91159856, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.078125, + "step": 449, + "time_per_iteration": 2.50108003616333 + }, + { + "auxiliary_loss_clip": 0.01298718, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.08056545, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9975703664508544, + "language_loss": 0.80516291, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82902944, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 450, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.01291132, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.08217299, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.7768383062811637, + "language_loss": 0.81500483, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83869088, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.09375, + "step": 451, + "time_per_iteration": 2.530539035797119 + }, + { + "auxiliary_loss_clip": 0.01289442, + "auxiliary_loss_mlp": 0.0109125, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.08151317, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.5925691418309382, + "language_loss": 0.76994318, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79375011, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.078125, + "step": 452, + "time_per_iteration": 2.5138871669769287 + }, + { + "auxiliary_loss_clip": 0.01292925, + "auxiliary_loss_mlp": 0.01088314, + "balance_loss_clip": 1.0507158, + "balance_loss_mlp": 1.08201516, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.9334646917545748, + "language_loss": 0.73053265, + "learning_rate": 3.937730499067294e-06, + "loss": 0.754345, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.109375, + "step": 453, + "time_per_iteration": 2.5271401405334473 + }, + { + "auxiliary_loss_clip": 0.01288113, + "auxiliary_loss_mlp": 0.01086026, + "balance_loss_clip": 1.04952383, + "balance_loss_mlp": 1.08018303, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.845498968311748, + "language_loss": 0.82439983, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84814119, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 454, + "time_per_iteration": 2.6724069118499756 + }, + { + "auxiliary_loss_clip": 0.01290287, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_clip": 1.04491115, + "balance_loss_mlp": 1.0808264, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.1414002490484005, + "language_loss": 0.75815403, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78184646, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 2.09375, + "step": 455, + "time_per_iteration": 2.496913194656372 + }, + { + "auxiliary_loss_clip": 0.01290624, + "auxiliary_loss_mlp": 0.01097119, + "balance_loss_clip": 1.06114161, + "balance_loss_mlp": 1.07846022, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.102028743174525, + "language_loss": 0.80576169, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82963914, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 456, + "time_per_iteration": 2.4748263359069824 + }, + { + "auxiliary_loss_clip": 0.01286184, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_clip": 1.04152811, + "balance_loss_mlp": 1.07863176, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.479828414472028, + "language_loss": 0.81621009, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83985978, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 457, + "time_per_iteration": 2.5122945308685303 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.07828617, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0492464691581476, + "language_loss": 0.94062889, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96436661, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.109375, + "step": 458, + "time_per_iteration": 2.542919874191284 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.01093849, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.07926297, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4293190258203774, + "language_loss": 0.79353511, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81735277, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.09375, + "step": 459, + "time_per_iteration": 2.472830295562744 + }, + { + "auxiliary_loss_clip": 0.01293203, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.08543491, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8472887331493792, + "language_loss": 0.83103061, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85478914, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.078125, + "step": 460, + "time_per_iteration": 2.5376338958740234 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.03488147, + "balance_loss_mlp": 1.03798664, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5738760379538346, + "language_loss": 0.73565412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7574963, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08837891, + "router_z_loss_mlp": 1.0234375, + "step": 461, + "time_per_iteration": 3.0358285903930664 + }, + { + "auxiliary_loss_clip": 0.01289208, + "auxiliary_loss_mlp": 0.01081781, + "balance_loss_clip": 1.04735351, + "balance_loss_mlp": 1.086905, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.85425781388422, + "language_loss": 0.81291741, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83662736, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.015625, + "step": 462, + "time_per_iteration": 2.6079564094543457 + }, + { + "auxiliary_loss_clip": 0.01287586, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_clip": 1.04096127, + "balance_loss_mlp": 1.08167982, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2822341634579195, + "language_loss": 0.90235889, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92597055, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0625, + "step": 463, + "time_per_iteration": 2.4881155490875244 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01014393, + "balance_loss_clip": 1.00561893, + "balance_loss_mlp": 1.03824747, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8835585057209928, + "language_loss": 0.59031862, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61183739, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.9921875, + "step": 464, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.01299905, + "auxiliary_loss_mlp": 0.01097461, + "balance_loss_clip": 1.06081581, + "balance_loss_mlp": 1.08716702, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.8663863440598525, + "language_loss": 0.81203198, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83600569, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.125, + "step": 465, + "time_per_iteration": 2.5197718143463135 + }, + { + "auxiliary_loss_clip": 0.01286546, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.08028877, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.004656273762408, + "language_loss": 0.78560221, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80929601, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.0625, + "step": 466, + "time_per_iteration": 2.5151565074920654 + }, + { + "auxiliary_loss_clip": 0.01285777, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.05075812, + "balance_loss_mlp": 1.0816046, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.05931728393333, + "language_loss": 0.87548482, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89919734, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.03125, + "step": 467, + "time_per_iteration": 2.4994542598724365 + }, + { + "auxiliary_loss_clip": 0.01289137, + "auxiliary_loss_mlp": 0.01106554, + "balance_loss_clip": 1.06969416, + "balance_loss_mlp": 1.08202362, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.728881931821799, + "language_loss": 0.86217642, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88613331, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.0625, + "step": 468, + "time_per_iteration": 2.482377767562866 + }, + { + "auxiliary_loss_clip": 0.01287545, + "auxiliary_loss_mlp": 0.01081999, + "balance_loss_clip": 1.0447104, + "balance_loss_mlp": 1.07984936, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 3.6924571591440762, + "language_loss": 0.91605878, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.93975413, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 469, + "time_per_iteration": 2.471510648727417 + }, + { + "auxiliary_loss_clip": 0.01286876, + "auxiliary_loss_mlp": 0.01096778, + "balance_loss_clip": 1.06106234, + "balance_loss_mlp": 1.08290672, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 8.38112094971343, + "language_loss": 0.81587195, + "learning_rate": 3.96145038000181e-06, + "loss": 0.83970851, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 470, + "time_per_iteration": 2.5398614406585693 + }, + { + "auxiliary_loss_clip": 0.01286572, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.07859015, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8437898933227894, + "language_loss": 0.93147206, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551928, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.078125, + "step": 471, + "time_per_iteration": 2.5005030632019043 + }, + { + "auxiliary_loss_clip": 0.0128173, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.05885458, + "balance_loss_mlp": 1.07808042, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.89303735573371, + "language_loss": 0.757568, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78133243, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 472, + "time_per_iteration": 2.597637176513672 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.07699013, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 3.986951446490631, + "language_loss": 0.93354845, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95722055, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.125, + "step": 473, + "time_per_iteration": 2.4882545471191406 + }, + { + "auxiliary_loss_clip": 0.01293922, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.08134401, + "balance_loss_mlp": 1.08149064, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.845992674029067, + "language_loss": 0.88586211, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90995455, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.125, + "step": 474, + "time_per_iteration": 2.483210563659668 + }, + { + "auxiliary_loss_clip": 0.01284496, + "auxiliary_loss_mlp": 0.01091761, + "balance_loss_clip": 1.05559278, + "balance_loss_mlp": 1.07983565, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.474550917046853, + "language_loss": 0.78771299, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81147563, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.046875, + "step": 475, + "time_per_iteration": 2.5462486743927 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_clip": 1.06647348, + "balance_loss_mlp": 1.03907108, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9304884927077405, + "language_loss": 0.66880804, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6909551, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08935547, + "router_z_loss_mlp": 1.0, + "step": 476, + "time_per_iteration": 5.8287513256073 + }, + { + "auxiliary_loss_clip": 0.01286666, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.0796659, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.9569520931335775, + "language_loss": 0.83852398, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86220837, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 477, + "time_per_iteration": 2.5179195404052734 + }, + { + "auxiliary_loss_clip": 0.01293161, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.05164671, + "balance_loss_mlp": 1.08298135, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.2048636254017504, + "language_loss": 0.82267237, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84648502, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.09375, + "step": 478, + "time_per_iteration": 2.495760679244995 + }, + { + "auxiliary_loss_clip": 0.01283274, + "auxiliary_loss_mlp": 0.01076252, + "balance_loss_clip": 1.0409658, + "balance_loss_mlp": 1.07707858, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.28603697529264, + "language_loss": 0.81010443, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8336997, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 2.0625, + "step": 479, + "time_per_iteration": 2.491910934448242 + }, + { + "auxiliary_loss_clip": 0.01281719, + "auxiliary_loss_mlp": 0.01080307, + "balance_loss_clip": 1.04323328, + "balance_loss_mlp": 1.07729793, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.2385690137770715, + "language_loss": 0.73465097, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75827128, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.03125, + "step": 480, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.01280408, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_clip": 1.03945768, + "balance_loss_mlp": 1.07837129, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.6612342828976938, + "language_loss": 0.87719476, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90071172, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 2.03125, + "step": 481, + "time_per_iteration": 2.534792184829712 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.01367593, + "balance_loss_mlp": 1.03470159, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8259666239631118, + "language_loss": 0.66064727, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68227088, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 1.046875, + "step": 482, + "time_per_iteration": 2.8219997882843018 + }, + { + "auxiliary_loss_clip": 0.01295379, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.04014635, + "balance_loss_mlp": 1.08159328, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.373570732629757, + "language_loss": 0.78743541, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81112754, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.140625, + "step": 483, + "time_per_iteration": 2.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01293434, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.0548625, + "balance_loss_mlp": 1.08311069, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.520023812901894, + "language_loss": 0.75405324, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77789688, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.109375, + "step": 484, + "time_per_iteration": 2.466634750366211 + }, + { + "auxiliary_loss_clip": 0.01288089, + "auxiliary_loss_mlp": 0.01078618, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.08002305, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0323982063196153, + "language_loss": 0.84021544, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86388254, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.078125, + "step": 485, + "time_per_iteration": 2.511415719985962 + }, + { + "auxiliary_loss_clip": 0.01293039, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.08659554, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 1.9066132168030567, + "language_loss": 0.84465218, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86840165, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 2.0625, + "step": 486, + "time_per_iteration": 2.453583002090454 + }, + { + "auxiliary_loss_clip": 0.01284719, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.07841349, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.9228432408219163, + "language_loss": 0.8891986, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91288453, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.0625, + "step": 487, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0128758, + "auxiliary_loss_mlp": 0.01070867, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.08095598, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5260996981700456, + "language_loss": 0.87981069, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90339512, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0625, + "step": 488, + "time_per_iteration": 2.5299952030181885 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01079627, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.07794333, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1862911790042543, + "language_loss": 0.88956475, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9131943, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.046875, + "step": 489, + "time_per_iteration": 2.545240879058838 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01078157, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.07402337, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.0397830948196756, + "language_loss": 0.88539088, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90894926, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.03125, + "step": 490, + "time_per_iteration": 2.4727838039398193 + }, + { + "auxiliary_loss_clip": 0.01284238, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.07731342, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.230679327742206, + "language_loss": 0.91299963, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93665713, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 2.0625, + "step": 491, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01274874, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.0749476, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.419480988494796, + "language_loss": 0.85232413, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87577969, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0, + "step": 492, + "time_per_iteration": 2.457188844680786 + }, + { + "auxiliary_loss_clip": 0.0128558, + "auxiliary_loss_mlp": 0.01093772, + "balance_loss_clip": 1.05939209, + "balance_loss_mlp": 1.08082771, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.826333733481051, + "language_loss": 0.83989829, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86369187, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.046875, + "step": 493, + "time_per_iteration": 2.4821553230285645 + }, + { + "auxiliary_loss_clip": 0.01280126, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_clip": 1.04586005, + "balance_loss_mlp": 1.07578444, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8259196989393787, + "language_loss": 0.86575663, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88934839, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 494, + "time_per_iteration": 2.507068395614624 + }, + { + "auxiliary_loss_clip": 0.01286409, + "auxiliary_loss_mlp": 0.01082408, + "balance_loss_clip": 1.05084157, + "balance_loss_mlp": 1.07973599, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 4.414490317498679, + "language_loss": 0.86250752, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88619578, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.0625, + "step": 495, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.01274095, + "auxiliary_loss_mlp": 0.0107342, + "balance_loss_clip": 1.04144859, + "balance_loss_mlp": 1.07653904, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.893732744603442, + "language_loss": 0.6230706, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64654577, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9765625, + "step": 496, + "time_per_iteration": 2.499669313430786 + }, + { + "auxiliary_loss_clip": 0.01276388, + "auxiliary_loss_mlp": 0.01085353, + "balance_loss_clip": 1.05314219, + "balance_loss_mlp": 1.07830799, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.8423417765009742, + "language_loss": 0.88582325, + "learning_rate": 3.997414244783595e-06, + "loss": 0.90944064, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.984375, + "step": 497, + "time_per_iteration": 2.5570924282073975 + }, + { + "auxiliary_loss_clip": 0.01282787, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.07822609, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.4064142479622377, + "language_loss": 0.85174376, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87537515, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 498, + "time_per_iteration": 2.513601541519165 + }, + { + "auxiliary_loss_clip": 0.01281177, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.07829463, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 37.23719619981942, + "language_loss": 0.78152531, + "learning_rate": 4e-06, + "loss": 0.80516517, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 499, + "time_per_iteration": 2.4924824237823486 + }, + { + "auxiliary_loss_clip": 0.01282354, + "auxiliary_loss_mlp": 0.01080564, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.08037949, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.687829420060643, + "language_loss": 0.8271451, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85077423, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.015625, + "step": 500, + "time_per_iteration": 2.494333028793335 + }, + { + "auxiliary_loss_clip": 0.01274571, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.07541978, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.6096117253121447, + "language_loss": 0.88464928, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90823889, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.9921875, + "step": 501, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.01283018, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.04158127, + "balance_loss_mlp": 1.07912767, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.304054979465899, + "language_loss": 0.86586684, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88942778, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 502, + "time_per_iteration": 2.4574413299560547 + }, + { + "auxiliary_loss_clip": 0.01278734, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.07952762, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6244890775354976, + "language_loss": 0.84661186, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87017757, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9921875, + "step": 503, + "time_per_iteration": 2.4406938552856445 + }, + { + "auxiliary_loss_clip": 0.0127278, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.07727659, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6755724800263092, + "language_loss": 0.88215417, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90570992, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 504, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.01274883, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05556226, + "balance_loss_mlp": 1.07692564, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.2080583468347, + "language_loss": 0.78446162, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9765625, + "step": 505, + "time_per_iteration": 2.4724690914154053 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.00927854, + "balance_loss_mlp": 1.04092085, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8461866637376847, + "language_loss": 0.55057126, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57211095, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 0.9453125, + "step": 506, + "time_per_iteration": 3.2490124702453613 + }, + { + "auxiliary_loss_clip": 0.01274292, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.0756762, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.9034614277572226, + "language_loss": 0.83767861, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8612929, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 507, + "time_per_iteration": 2.48811674118042 + }, + { + "auxiliary_loss_clip": 0.01280318, + "auxiliary_loss_mlp": 0.01080114, + "balance_loss_clip": 1.04778421, + "balance_loss_mlp": 1.07709789, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5950154193771526, + "language_loss": 0.88689649, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91050076, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 508, + "time_per_iteration": 2.4966533184051514 + }, + { + "auxiliary_loss_clip": 0.01281637, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.07728887, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2339008285543227, + "language_loss": 0.71499902, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73845309, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 509, + "time_per_iteration": 2.5966317653656006 + }, + { + "auxiliary_loss_clip": 0.01274736, + "auxiliary_loss_mlp": 0.01072718, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.07770133, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.064360756351981, + "language_loss": 0.82369828, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8471728, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9765625, + "step": 510, + "time_per_iteration": 2.5276355743408203 + }, + { + "auxiliary_loss_clip": 0.01280977, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.04984498, + "balance_loss_mlp": 1.08235979, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.1614325499153693, + "language_loss": 0.83621502, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85985172, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 511, + "time_per_iteration": 2.503779888153076 + }, + { + "auxiliary_loss_clip": 0.01278507, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.07648492, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1059740170821515, + "language_loss": 0.82234836, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8459124, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 512, + "time_per_iteration": 2.5306975841522217 + }, + { + "auxiliary_loss_clip": 0.01276149, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0769974, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.9256325141107502, + "language_loss": 0.87030005, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89384103, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.9921875, + "step": 513, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01281572, + "auxiliary_loss_mlp": 0.01080973, + "balance_loss_clip": 1.04840553, + "balance_loss_mlp": 1.07869625, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.202753983864072, + "language_loss": 0.79141152, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81503695, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 514, + "time_per_iteration": 2.515496015548706 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.01063014, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.07966864, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.5461002634459216, + "language_loss": 0.77459693, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79799432, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 515, + "time_per_iteration": 2.481903553009033 + }, + { + "auxiliary_loss_clip": 0.01272098, + "auxiliary_loss_mlp": 0.0106896, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.07318711, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.901518391780262, + "language_loss": 0.82729101, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85070157, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9921875, + "step": 516, + "time_per_iteration": 2.699577808380127 + }, + { + "auxiliary_loss_clip": 0.01272185, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_clip": 1.03760433, + "balance_loss_mlp": 1.07659435, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.071844032637654, + "language_loss": 0.79009813, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81352293, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 517, + "time_per_iteration": 4.0190205574035645 + }, + { + "auxiliary_loss_clip": 0.01269009, + "auxiliary_loss_mlp": 0.01072314, + "balance_loss_clip": 1.04069996, + "balance_loss_mlp": 1.07610774, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.58218863781409, + "language_loss": 0.90778029, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93119347, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9296875, + "step": 518, + "time_per_iteration": 4.080751657485962 + }, + { + "auxiliary_loss_clip": 0.0128372, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.08518016, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 3.008779144342936, + "language_loss": 0.86396456, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88773847, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.984375, + "step": 519, + "time_per_iteration": 2.510267734527588 + }, + { + "auxiliary_loss_clip": 0.01278708, + "auxiliary_loss_mlp": 0.01092513, + "balance_loss_clip": 1.06092215, + "balance_loss_mlp": 1.07567024, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0313723427087216, + "language_loss": 0.87156898, + "learning_rate": 3.999983277259057e-06, + "loss": 0.8952812, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 520, + "time_per_iteration": 2.4891066551208496 + }, + { + "auxiliary_loss_clip": 0.01281744, + "auxiliary_loss_mlp": 0.01089643, + "balance_loss_clip": 1.05633557, + "balance_loss_mlp": 1.07832289, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.6802829394342778, + "language_loss": 0.89362079, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91733468, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.03125, + "step": 521, + "time_per_iteration": 2.508524179458618 + }, + { + "auxiliary_loss_clip": 0.01274208, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.07795191, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.273639697525746, + "language_loss": 0.71327078, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73684484, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9609375, + "step": 522, + "time_per_iteration": 2.49629282951355 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.05150533, + "balance_loss_mlp": 1.07655358, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1208656196394706, + "language_loss": 0.84886295, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87248302, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.015625, + "step": 523, + "time_per_iteration": 2.4674315452575684 + }, + { + "auxiliary_loss_clip": 0.01280597, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.04249442, + "balance_loss_mlp": 1.07655168, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 1.9693639011355857, + "language_loss": 0.90419745, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92775881, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.046875, + "step": 524, + "time_per_iteration": 2.480764627456665 + }, + { + "auxiliary_loss_clip": 0.01285248, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.05034757, + "balance_loss_mlp": 1.08102393, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4392367222760276, + "language_loss": 0.80040443, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8240968, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.046875, + "step": 525, + "time_per_iteration": 2.5409629344940186 + }, + { + "auxiliary_loss_clip": 0.01277675, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.05025804, + "balance_loss_mlp": 1.07571197, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.8378410017413658, + "language_loss": 0.80693865, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83054531, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.03125, + "step": 526, + "time_per_iteration": 2.4509081840515137 + }, + { + "auxiliary_loss_clip": 0.01285808, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.03655052, + "balance_loss_mlp": 1.08127069, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.27970800213601, + "language_loss": 0.81417823, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83775997, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.046875, + "step": 527, + "time_per_iteration": 2.4760756492614746 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01080634, + "balance_loss_clip": 1.04651666, + "balance_loss_mlp": 1.07408452, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.59751390244888, + "language_loss": 0.93932182, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96286595, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.0, + "step": 528, + "time_per_iteration": 2.4721155166625977 + }, + { + "auxiliary_loss_clip": 0.01273884, + "auxiliary_loss_mlp": 0.01073354, + "balance_loss_clip": 1.04083371, + "balance_loss_mlp": 1.07427406, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8844039207994492, + "language_loss": 0.84143054, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86490291, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 529, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.01278919, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.08254409, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.130233453276154, + "language_loss": 0.90547037, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92913085, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.96875, + "step": 530, + "time_per_iteration": 2.5096359252929688 + }, + { + "auxiliary_loss_clip": 0.0127291, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07199419, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.12169085676626, + "language_loss": 0.76197046, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78543139, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.015625, + "step": 531, + "time_per_iteration": 2.503265142440796 + }, + { + "auxiliary_loss_clip": 0.01272973, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.03030038, + "balance_loss_mlp": 1.07424712, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.621085079916904, + "language_loss": 0.9073056, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9306798, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 532, + "time_per_iteration": 2.506220817565918 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01010615, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0428524, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7306749876416057, + "language_loss": 0.57931173, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60079145, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.08642578, + "router_z_loss_mlp": 0.9453125, + "step": 533, + "time_per_iteration": 3.154953956604004 + }, + { + "auxiliary_loss_clip": 0.01271016, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.07378936, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8972625930530718, + "language_loss": 0.86725944, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89081717, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.96875, + "step": 534, + "time_per_iteration": 2.5384750366210938 + }, + { + "auxiliary_loss_clip": 0.01271847, + "auxiliary_loss_mlp": 0.01066511, + "balance_loss_clip": 1.03468204, + "balance_loss_mlp": 1.07573223, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.118212102173022, + "language_loss": 0.77352351, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79690707, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.9609375, + "step": 535, + "time_per_iteration": 2.517940044403076 + }, + { + "auxiliary_loss_clip": 0.01274503, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_clip": 1.05151725, + "balance_loss_mlp": 1.07644773, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.176836888233088, + "language_loss": 0.8074764, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83105373, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.984375, + "step": 536, + "time_per_iteration": 2.546128034591675 + }, + { + "auxiliary_loss_clip": 0.01275643, + "auxiliary_loss_mlp": 0.01077633, + "balance_loss_clip": 1.04361033, + "balance_loss_mlp": 1.07698941, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.3353202427960627, + "language_loss": 0.70118421, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72471696, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 537, + "time_per_iteration": 2.578101634979248 + }, + { + "auxiliary_loss_clip": 0.01274556, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.04877353, + "balance_loss_mlp": 1.08040798, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1000918694055044, + "language_loss": 0.8250435, + "learning_rate": 3.999942323804607e-06, + "loss": 0.84860539, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9375, + "step": 538, + "time_per_iteration": 2.4822683334350586 + }, + { + "auxiliary_loss_clip": 0.01280793, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_clip": 1.0458765, + "balance_loss_mlp": 1.0775007, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8128048759039839, + "language_loss": 0.78999949, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81359327, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 539, + "time_per_iteration": 2.5495705604553223 + }, + { + "auxiliary_loss_clip": 0.01274183, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_clip": 1.03284597, + "balance_loss_mlp": 1.0766232, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.6651388031929835, + "language_loss": 0.77802742, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80143911, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.9765625, + "step": 540, + "time_per_iteration": 2.5547144412994385 + }, + { + "auxiliary_loss_clip": 0.01282159, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.08122253, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.2422114385304845, + "language_loss": 0.85410464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8776263, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 541, + "time_per_iteration": 2.517545700073242 + }, + { + "auxiliary_loss_clip": 0.01271503, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.07759655, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.210152212848466, + "language_loss": 0.89072484, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91427547, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9375, + "step": 542, + "time_per_iteration": 2.437566041946411 + }, + { + "auxiliary_loss_clip": 0.01272694, + "auxiliary_loss_mlp": 0.01075801, + "balance_loss_clip": 1.04289961, + "balance_loss_mlp": 1.07649362, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3494598042187236, + "language_loss": 0.71096039, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73444533, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9609375, + "step": 543, + "time_per_iteration": 2.5121288299560547 + }, + { + "auxiliary_loss_clip": 0.0127171, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.07139826, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.6617228213889375, + "language_loss": 0.91273057, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93631637, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0, + "step": 544, + "time_per_iteration": 2.529536008834839 + }, + { + "auxiliary_loss_clip": 0.01274727, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.07790041, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.144073602630947, + "language_loss": 0.6640051, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68757957, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 545, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01272187, + "auxiliary_loss_mlp": 0.01069604, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.07393909, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.6288964335615805, + "language_loss": 0.91857421, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94199216, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.984375, + "step": 546, + "time_per_iteration": 2.4893922805786133 + }, + { + "auxiliary_loss_clip": 0.0126813, + "auxiliary_loss_mlp": 0.01071134, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.07095337, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.4455611041839127, + "language_loss": 0.82002354, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84341609, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 547, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01070995, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.07550538, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.015836198351779, + "language_loss": 0.80919325, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83261865, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9609375, + "step": 548, + "time_per_iteration": 2.501983404159546 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01079421, + "balance_loss_clip": 1.04499304, + "balance_loss_mlp": 1.07411838, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.9904289991591217, + "language_loss": 0.67330974, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69681287, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 549, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01265753, + "auxiliary_loss_mlp": 0.01075673, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.07537639, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.081726350608672, + "language_loss": 0.86137938, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88479364, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.90625, + "step": 550, + "time_per_iteration": 2.435030221939087 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01089379, + "balance_loss_clip": 1.05712056, + "balance_loss_mlp": 1.07876444, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.0024940554917534, + "language_loss": 0.81302834, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83663994, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9296875, + "step": 551, + "time_per_iteration": 2.474317789077759 + }, + { + "auxiliary_loss_clip": 0.01278525, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.0786469, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.5540153370218697, + "language_loss": 0.85907811, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88266373, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.0, + "step": 552, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.01276099, + "auxiliary_loss_mlp": 0.01077197, + "balance_loss_clip": 1.0428648, + "balance_loss_mlp": 1.07894135, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3148388677976253, + "language_loss": 0.928128, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95166099, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 553, + "time_per_iteration": 2.4860291481018066 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.01072703, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.0755136, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.234190064541142, + "language_loss": 0.78874755, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81218415, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.953125, + "step": 554, + "time_per_iteration": 2.4878416061401367 + }, + { + "auxiliary_loss_clip": 0.0126611, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_clip": 1.04838455, + "balance_loss_mlp": 1.07417822, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1365458646452424, + "language_loss": 0.82297659, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9140625, + "step": 555, + "time_per_iteration": 2.4846394062042236 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.01075464, + "balance_loss_clip": 1.04156113, + "balance_loss_mlp": 1.07390678, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.781828445596944, + "language_loss": 0.88624835, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90970379, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 556, + "time_per_iteration": 2.5788674354553223 + }, + { + "auxiliary_loss_clip": 0.01269545, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.04531527, + "balance_loss_mlp": 1.07534254, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0860752820949586, + "language_loss": 0.83492053, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85840911, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9375, + "step": 557, + "time_per_iteration": 2.5352954864501953 + }, + { + "auxiliary_loss_clip": 0.01275093, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.07979858, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 9.145612151583265, + "language_loss": 0.94169575, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96511185, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.953125, + "step": 558, + "time_per_iteration": 2.4541964530944824 + }, + { + "auxiliary_loss_clip": 0.01265501, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.07178497, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.48174106566098, + "language_loss": 0.7735827, + "learning_rate": 3.99986348919176e-06, + "loss": 0.7969684, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9375, + "step": 559, + "time_per_iteration": 5.362890005111694 + }, + { + "auxiliary_loss_clip": 0.01268387, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.07386613, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.071149038386511, + "language_loss": 0.87681198, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90028548, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.953125, + "step": 560, + "time_per_iteration": 3.9536426067352295 + }, + { + "auxiliary_loss_clip": 0.01264547, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.07323277, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2284071587683463, + "language_loss": 0.81380183, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83712727, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9140625, + "step": 561, + "time_per_iteration": 2.49826717376709 + }, + { + "auxiliary_loss_clip": 0.01263917, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.07403696, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7768341081574646, + "language_loss": 0.82018232, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84353203, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.90625, + "step": 562, + "time_per_iteration": 2.503990888595581 + }, + { + "auxiliary_loss_clip": 0.01269896, + "auxiliary_loss_mlp": 0.01075498, + "balance_loss_clip": 1.04352641, + "balance_loss_mlp": 1.07592142, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.966221896086353, + "language_loss": 0.84028983, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86374378, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9375, + "step": 563, + "time_per_iteration": 2.464571952819824 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.04866886, + "balance_loss_mlp": 1.07648492, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.359913311978066, + "language_loss": 0.94194812, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96543193, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.921875, + "step": 564, + "time_per_iteration": 2.423762798309326 + }, + { + "auxiliary_loss_clip": 0.01267204, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.07225537, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.7666153248687277, + "language_loss": 0.94089758, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96426964, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.953125, + "step": 565, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01714468, + "balance_loss_mlp": 1.04934859, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1198796781785882, + "language_loss": 0.54823005, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.569884, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.91015625, + "step": 566, + "time_per_iteration": 3.1322038173675537 + }, + { + "auxiliary_loss_clip": 0.01270043, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.0753262, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.6603630269915683, + "language_loss": 0.76780868, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79123116, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.9453125, + "step": 567, + "time_per_iteration": 2.5351951122283936 + }, + { + "auxiliary_loss_clip": 0.01261299, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.04809463, + "balance_loss_mlp": 1.07400167, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 4.563520524929296, + "language_loss": 0.80796623, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83135819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.875, + "step": 568, + "time_per_iteration": 2.558093309402466 + }, + { + "auxiliary_loss_clip": 0.01263323, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.04836476, + "balance_loss_mlp": 1.07628214, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.809578126153619, + "language_loss": 0.86777622, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89120281, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.875, + "step": 569, + "time_per_iteration": 2.500319719314575 + }, + { + "auxiliary_loss_clip": 0.01264002, + "auxiliary_loss_mlp": 0.01073079, + "balance_loss_clip": 1.04227519, + "balance_loss_mlp": 1.07425416, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8164454228173497, + "language_loss": 0.95802778, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98139858, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.8984375, + "step": 570, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.01264689, + "auxiliary_loss_mlp": 0.01080759, + "balance_loss_clip": 1.04733253, + "balance_loss_mlp": 1.07053721, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.217921822086313, + "language_loss": 0.79522127, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81867576, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9375, + "step": 571, + "time_per_iteration": 2.48317813873291 + }, + { + "auxiliary_loss_clip": 0.01265335, + "auxiliary_loss_mlp": 0.01076969, + "balance_loss_clip": 1.04490221, + "balance_loss_mlp": 1.07593679, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.3471183659940555, + "language_loss": 0.79962778, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.890625, + "step": 572, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.01270326, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.07574439, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.9544136074887903, + "language_loss": 0.84374899, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86714697, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.9453125, + "step": 573, + "time_per_iteration": 2.474212408065796 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.03460276, + "balance_loss_mlp": 1.07282329, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.553507560277694, + "language_loss": 0.76376265, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78707206, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 574, + "time_per_iteration": 2.4510116577148438 + }, + { + "auxiliary_loss_clip": 0.01264596, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.0731982, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5278817664157343, + "language_loss": 0.83801597, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86130619, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.9140625, + "step": 575, + "time_per_iteration": 2.459693193435669 + }, + { + "auxiliary_loss_clip": 0.01260171, + "auxiliary_loss_mlp": 0.01067742, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.07501364, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.241383472398266, + "language_loss": 0.83726245, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86054158, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 576, + "time_per_iteration": 2.47292423248291 + }, + { + "auxiliary_loss_clip": 0.01267718, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.04582155, + "balance_loss_mlp": 1.08247435, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.0876645490308334, + "language_loss": 0.8640908, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 577, + "time_per_iteration": 2.529500961303711 + }, + { + "auxiliary_loss_clip": 0.01262371, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0769875, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 3.2017547958107784, + "language_loss": 0.72333407, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74665576, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.859375, + "step": 578, + "time_per_iteration": 2.4868762493133545 + }, + { + "auxiliary_loss_clip": 0.01263036, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.04050565, + "balance_loss_mlp": 1.07441878, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.8544904120227406, + "language_loss": 0.77664137, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79998243, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.8828125, + "step": 579, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.07355189, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.5351053977844136, + "language_loss": 0.86927247, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89265645, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.875, + "step": 580, + "time_per_iteration": 2.505908966064453 + }, + { + "auxiliary_loss_clip": 0.01266331, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.03536677, + "balance_loss_mlp": 1.07510614, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 4.565959491833327, + "language_loss": 0.82161844, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84492135, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.9140625, + "step": 581, + "time_per_iteration": 2.4735610485076904 + }, + { + "auxiliary_loss_clip": 0.01263493, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.07712197, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2699668532214377, + "language_loss": 0.77498174, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79828823, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8671875, + "step": 582, + "time_per_iteration": 2.4596173763275146 + }, + { + "auxiliary_loss_clip": 0.01261728, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_clip": 1.04467332, + "balance_loss_mlp": 1.07715631, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.0991939318744692, + "language_loss": 0.87632537, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89969933, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 583, + "time_per_iteration": 2.46062970161438 + }, + { + "auxiliary_loss_clip": 0.01268555, + "auxiliary_loss_mlp": 0.01082553, + "balance_loss_clip": 1.05167794, + "balance_loss_mlp": 1.07587278, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3581841085942004, + "language_loss": 0.80997103, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83348215, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.921875, + "step": 584, + "time_per_iteration": 2.4776926040649414 + }, + { + "auxiliary_loss_clip": 0.01262257, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.03326654, + "balance_loss_mlp": 1.0725317, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.6245680316153743, + "language_loss": 0.92654932, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94980395, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.8984375, + "step": 585, + "time_per_iteration": 2.486678123474121 + }, + { + "auxiliary_loss_clip": 0.01262479, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.07368612, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4855014647160245, + "language_loss": 0.87484592, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89817297, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.890625, + "step": 586, + "time_per_iteration": 2.457772970199585 + }, + { + "auxiliary_loss_clip": 0.01269677, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04137754, + "balance_loss_mlp": 1.07875896, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.7854143394247532, + "language_loss": 0.76574278, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78915149, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.90625, + "step": 587, + "time_per_iteration": 2.4794015884399414 + }, + { + "auxiliary_loss_clip": 0.01269924, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02991772, + "balance_loss_mlp": 1.07701528, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6805414217886456, + "language_loss": 0.78441286, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80772316, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.9296875, + "step": 588, + "time_per_iteration": 2.4755733013153076 + }, + { + "auxiliary_loss_clip": 0.01267146, + "auxiliary_loss_mlp": 0.01071411, + "balance_loss_clip": 1.03850961, + "balance_loss_mlp": 1.07600832, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6477303031273185, + "language_loss": 0.94003904, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96342462, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9140625, + "step": 589, + "time_per_iteration": 2.515296459197998 + }, + { + "auxiliary_loss_clip": 0.01269747, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.07632184, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.4870139863099157, + "language_loss": 0.84060037, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86397475, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 590, + "time_per_iteration": 2.583080291748047 + }, + { + "auxiliary_loss_clip": 0.01259593, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.07476449, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.031404841890899, + "language_loss": 0.86889851, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89212072, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 591, + "time_per_iteration": 2.497912883758545 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01070221, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.07271862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 3.1144902928375586, + "language_loss": 0.82980722, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85315537, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.921875, + "step": 592, + "time_per_iteration": 2.463977813720703 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.03881407, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8806680605255408, + "language_loss": 0.59741807, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61892909, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.8984375, + "step": 593, + "time_per_iteration": 3.1275696754455566 + }, + { + "auxiliary_loss_clip": 0.01262803, + "auxiliary_loss_mlp": 0.01070928, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.07810974, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8853114596204945, + "language_loss": 0.87042278, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89376009, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 594, + "time_per_iteration": 2.522805690765381 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.07309461, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.3431313884364395, + "language_loss": 0.83481348, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85809088, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8984375, + "step": 595, + "time_per_iteration": 2.565220832824707 + }, + { + "auxiliary_loss_clip": 0.01261367, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.04216576, + "balance_loss_mlp": 1.07610273, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.1278930526147426, + "language_loss": 0.96185803, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98519421, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.859375, + "step": 596, + "time_per_iteration": 2.460515260696411 + }, + { + "auxiliary_loss_clip": 0.0126361, + "auxiliary_loss_mlp": 0.0107037, + "balance_loss_clip": 1.04185498, + "balance_loss_mlp": 1.07627654, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.2167421176017204, + "language_loss": 0.82718551, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85052526, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.875, + "step": 597, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.01261023, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.0784421, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.168981908539252, + "language_loss": 0.81386817, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.828125, + "step": 598, + "time_per_iteration": 2.531188726425171 + }, + { + "auxiliary_loss_clip": 0.01254264, + "auxiliary_loss_mlp": 0.0106961, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.07570839, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.9075541218278638, + "language_loss": 0.81387949, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83711827, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.7890625, + "step": 599, + "time_per_iteration": 2.511871576309204 + }, + { + "auxiliary_loss_clip": 0.01262476, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.07350755, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1528215266255604, + "language_loss": 0.86115932, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88452661, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.890625, + "step": 600, + "time_per_iteration": 2.50054669380188 + }, + { + "auxiliary_loss_clip": 0.01254617, + "auxiliary_loss_mlp": 0.01080731, + "balance_loss_clip": 1.05133438, + "balance_loss_mlp": 1.06909621, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 3.928737875146519, + "language_loss": 0.82175761, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84511113, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8515625, + "step": 601, + "time_per_iteration": 6.795202255249023 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.02666831, + "balance_loss_mlp": 1.07096183, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.2629653513719252, + "language_loss": 0.75467926, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77777481, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8203125, + "step": 602, + "time_per_iteration": 2.503629446029663 + }, + { + "auxiliary_loss_clip": 0.01252806, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.02833962, + "balance_loss_mlp": 1.07078326, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9962737747137984, + "language_loss": 0.80078572, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82388449, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 603, + "time_per_iteration": 2.568368911743164 + }, + { + "auxiliary_loss_clip": 0.01258325, + "auxiliary_loss_mlp": 0.01061531, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.07597041, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.9836566776981934, + "language_loss": 0.86801207, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8203125, + "step": 604, + "time_per_iteration": 2.496415376663208 + }, + { + "auxiliary_loss_clip": 0.01260423, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.07688427, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.252638522711271, + "language_loss": 0.81078291, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83404416, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 605, + "time_per_iteration": 2.46071457862854 + }, + { + "auxiliary_loss_clip": 0.01255946, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.04012406, + "balance_loss_mlp": 1.07317901, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2162807408147964, + "language_loss": 0.85624671, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87947738, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.828125, + "step": 606, + "time_per_iteration": 2.450775623321533 + }, + { + "auxiliary_loss_clip": 0.01262483, + "auxiliary_loss_mlp": 0.01072166, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.07551849, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1498788116147125, + "language_loss": 0.82370651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84705305, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 607, + "time_per_iteration": 2.4969747066497803 + }, + { + "auxiliary_loss_clip": 0.01255757, + "auxiliary_loss_mlp": 0.01063348, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.07488835, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 3.329641026295442, + "language_loss": 0.8315016, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8546927, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.8046875, + "step": 608, + "time_per_iteration": 2.4648640155792236 + }, + { + "auxiliary_loss_clip": 0.01260127, + "auxiliary_loss_mlp": 0.0106578, + "balance_loss_clip": 1.03533435, + "balance_loss_mlp": 1.0769459, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.072924568315734, + "language_loss": 0.82258713, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84584618, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.828125, + "step": 609, + "time_per_iteration": 2.4761714935302734 + }, + { + "auxiliary_loss_clip": 0.01266536, + "auxiliary_loss_mlp": 0.01080333, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.08229148, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.279075715646142, + "language_loss": 0.7924515, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81592017, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.84375, + "step": 610, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01265179, + "auxiliary_loss_mlp": 0.01076881, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.07819688, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.108980449215705, + "language_loss": 0.87263799, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89605856, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 611, + "time_per_iteration": 2.488800525665283 + }, + { + "auxiliary_loss_clip": 0.01257304, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.05577183, + "balance_loss_mlp": 1.0769043, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.0539399448943145, + "language_loss": 0.72783852, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75125557, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8046875, + "step": 612, + "time_per_iteration": 2.4950740337371826 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.03999329, + "balance_loss_mlp": 1.07377708, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 2.903841869182041, + "language_loss": 0.7909385, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81421661, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 613, + "time_per_iteration": 2.4849369525909424 + }, + { + "auxiliary_loss_clip": 0.01253943, + "auxiliary_loss_mlp": 0.01079095, + "balance_loss_clip": 1.05141413, + "balance_loss_mlp": 1.07326341, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.273957434397869, + "language_loss": 0.93266213, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95599246, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8125, + "step": 614, + "time_per_iteration": 2.4639992713928223 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.01075313, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.07938302, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.901964177226116, + "language_loss": 0.72534943, + "learning_rate": 3.999489768826041e-06, + "loss": 0.74873829, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.84375, + "step": 615, + "time_per_iteration": 2.601372480392456 + }, + { + "auxiliary_loss_clip": 0.01258092, + "auxiliary_loss_mlp": 0.01071353, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.07278967, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.023635364571096, + "language_loss": 0.81449711, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83779156, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 616, + "time_per_iteration": 2.5325467586517334 + }, + { + "auxiliary_loss_clip": 0.01256707, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.03643894, + "balance_loss_mlp": 1.07431316, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9753277492127743, + "language_loss": 0.67868775, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7018863, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.828125, + "step": 617, + "time_per_iteration": 2.5784177780151367 + }, + { + "auxiliary_loss_clip": 0.01263095, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0349381, + "balance_loss_mlp": 1.07892454, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.556814357499394, + "language_loss": 0.80340034, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8266772, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.84375, + "step": 618, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01261829, + "auxiliary_loss_mlp": 0.01070874, + "balance_loss_clip": 1.04059458, + "balance_loss_mlp": 1.07458091, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.355648226269084, + "language_loss": 0.91115171, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93447876, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.875, + "step": 619, + "time_per_iteration": 2.4804162979125977 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01077134, + "balance_loss_clip": 1.04871452, + "balance_loss_mlp": 1.07845378, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.218621959424752, + "language_loss": 0.94397002, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96734041, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8125, + "step": 620, + "time_per_iteration": 2.4592232704162598 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.01077616, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.07455909, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.8159025601621845, + "language_loss": 0.77105826, + "learning_rate": 3.999435623772008e-06, + "loss": 0.7944091, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 621, + "time_per_iteration": 2.53365159034729 + }, + { + "auxiliary_loss_clip": 0.01255819, + "auxiliary_loss_mlp": 0.01059811, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.07761526, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.793013868715132, + "language_loss": 0.86895752, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89211386, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 622, + "time_per_iteration": 2.472726583480835 + }, + { + "auxiliary_loss_clip": 0.01258428, + "auxiliary_loss_mlp": 0.01064577, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.07622766, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.261361439009279, + "language_loss": 0.90376818, + "learning_rate": 3.999416968866581e-06, + "loss": 0.9269982, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 623, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.0125978, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.04626298, + "balance_loss_mlp": 1.07841158, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9910669563462169, + "language_loss": 0.84149444, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86484373, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8125, + "step": 624, + "time_per_iteration": 2.4514520168304443 + }, + { + "auxiliary_loss_clip": 0.01261437, + "auxiliary_loss_mlp": 0.01067743, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.0750618, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.4867963928692554, + "language_loss": 0.66228586, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68557763, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8671875, + "step": 625, + "time_per_iteration": 2.5765273571014404 + }, + { + "auxiliary_loss_clip": 0.01253583, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.03697979, + "balance_loss_mlp": 1.07435441, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.071255255654034, + "language_loss": 0.77375329, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79696059, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7890625, + "step": 626, + "time_per_iteration": 2.5022406578063965 + }, + { + "auxiliary_loss_clip": 0.01258684, + "auxiliary_loss_mlp": 0.01074389, + "balance_loss_clip": 1.04499173, + "balance_loss_mlp": 1.07735705, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.2077512286027288, + "language_loss": 0.81357861, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83690929, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 627, + "time_per_iteration": 2.4750607013702393 + }, + { + "auxiliary_loss_clip": 0.01261632, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.07859111, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 3.546199216596373, + "language_loss": 0.88572276, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90910852, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 628, + "time_per_iteration": 2.571899890899658 + }, + { + "auxiliary_loss_clip": 0.01253553, + "auxiliary_loss_mlp": 0.01067038, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.07086658, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.488861546346732, + "language_loss": 0.79683006, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82003593, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.828125, + "step": 629, + "time_per_iteration": 2.486675262451172 + }, + { + "auxiliary_loss_clip": 0.01258011, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.07458425, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7117761504495859, + "language_loss": 0.76808703, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79134536, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.8359375, + "step": 630, + "time_per_iteration": 2.494297742843628 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.01070638, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.07651484, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.6765452133705403, + "language_loss": 0.91492796, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93826187, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.859375, + "step": 631, + "time_per_iteration": 2.4605348110198975 + }, + { + "auxiliary_loss_clip": 0.01252436, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.04560196, + "balance_loss_mlp": 1.07244229, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.669704350294595, + "language_loss": 0.9207651, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94405663, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.796875, + "step": 632, + "time_per_iteration": 2.518498659133911 + }, + { + "auxiliary_loss_clip": 0.01255106, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.07462335, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0828864645498872, + "language_loss": 0.8341018, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85723758, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8046875, + "step": 633, + "time_per_iteration": 2.5217537879943848 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.01071025, + "balance_loss_clip": 1.04153264, + "balance_loss_mlp": 1.07408428, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6987522649376106, + "language_loss": 0.69638437, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71967685, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.84375, + "step": 634, + "time_per_iteration": 2.5694239139556885 + }, + { + "auxiliary_loss_clip": 0.01127675, + "auxiliary_loss_mlp": 0.01017483, + "balance_loss_clip": 1.0106163, + "balance_loss_mlp": 1.04225707, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8852243261294688, + "language_loss": 0.61585373, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63730532, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.8515625, + "step": 635, + "time_per_iteration": 3.1059212684631348 + }, + { + "auxiliary_loss_clip": 0.01253433, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07354546, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2313569204055246, + "language_loss": 0.83721048, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86043108, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.796875, + "step": 636, + "time_per_iteration": 2.4975383281707764 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.04852867, + "balance_loss_mlp": 1.07623935, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.4018992949787847, + "language_loss": 0.79327047, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8166306, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8203125, + "step": 637, + "time_per_iteration": 2.4560744762420654 + }, + { + "auxiliary_loss_clip": 0.01258084, + "auxiliary_loss_mlp": 0.01073075, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.07309079, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.8779285506389924, + "language_loss": 0.8410306, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86434221, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 638, + "time_per_iteration": 2.504343271255493 + }, + { + "auxiliary_loss_clip": 0.01263348, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.07495832, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5416523890288976, + "language_loss": 0.70099992, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72431237, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.890625, + "step": 639, + "time_per_iteration": 2.52817964553833 + }, + { + "auxiliary_loss_clip": 0.01259266, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.04463232, + "balance_loss_mlp": 1.07514286, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.42201861797838, + "language_loss": 0.85030365, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8736524, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 640, + "time_per_iteration": 2.503262758255005 + }, + { + "auxiliary_loss_clip": 0.0126167, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04725742, + "balance_loss_mlp": 1.07574821, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3722848939528953, + "language_loss": 0.82117289, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84458065, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.859375, + "step": 641, + "time_per_iteration": 2.51052188873291 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.01008303, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.03414774, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9008353353488252, + "language_loss": 0.6540072, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67528021, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.06225586, + "router_z_loss_mlp": 0.8515625, + "step": 642, + "time_per_iteration": 4.430839538574219 + }, + { + "auxiliary_loss_clip": 0.01256856, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.07364345, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9870813050305103, + "language_loss": 0.79512584, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81832051, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8359375, + "step": 643, + "time_per_iteration": 5.386199951171875 + }, + { + "auxiliary_loss_clip": 0.01255871, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.07266629, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.074949815918338, + "language_loss": 0.82926929, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85257208, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.828125, + "step": 644, + "time_per_iteration": 2.45499587059021 + }, + { + "auxiliary_loss_clip": 0.01260265, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.07482159, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.258008571643512, + "language_loss": 0.82131916, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84458399, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.859375, + "step": 645, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.0126099, + "auxiliary_loss_mlp": 0.01070847, + "balance_loss_clip": 1.04121125, + "balance_loss_mlp": 1.07544899, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4729923618605554, + "language_loss": 0.82006776, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 646, + "time_per_iteration": 2.4771342277526855 + }, + { + "auxiliary_loss_clip": 0.01260575, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.07928514, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8327945326632593, + "language_loss": 0.81973422, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84314579, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 647, + "time_per_iteration": 2.522347927093506 + }, + { + "auxiliary_loss_clip": 0.01260388, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03965366, + "balance_loss_mlp": 1.07776546, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9222642653000834, + "language_loss": 0.84699827, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87029266, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 648, + "time_per_iteration": 2.561929941177368 + }, + { + "auxiliary_loss_clip": 0.01258218, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.07636404, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7283662397985053, + "language_loss": 0.84446943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86776626, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8203125, + "step": 649, + "time_per_iteration": 2.477027416229248 + }, + { + "auxiliary_loss_clip": 0.01259496, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.03977561, + "balance_loss_mlp": 1.07551885, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8508721849532739, + "language_loss": 0.79670662, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8200019, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.84375, + "step": 650, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.0125375, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_clip": 1.04314423, + "balance_loss_mlp": 1.07259929, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.708739352564946, + "language_loss": 0.78509629, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80836356, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 651, + "time_per_iteration": 2.4757516384124756 + }, + { + "auxiliary_loss_clip": 0.01255418, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_clip": 1.05004883, + "balance_loss_mlp": 1.0719974, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.7896665115169244, + "language_loss": 0.88031149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90369117, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 652, + "time_per_iteration": 2.4425668716430664 + }, + { + "auxiliary_loss_clip": 0.01249027, + "auxiliary_loss_mlp": 0.01069663, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.07108784, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.185528651545475, + "language_loss": 0.79044777, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363463, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.78125, + "step": 653, + "time_per_iteration": 2.5651934146881104 + }, + { + "auxiliary_loss_clip": 0.01264568, + "auxiliary_loss_mlp": 0.01070462, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.07603264, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.207303268368246, + "language_loss": 0.86304128, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88639158, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8828125, + "step": 654, + "time_per_iteration": 2.533297061920166 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01012751, + "balance_loss_clip": 1.00710094, + "balance_loss_mlp": 1.03246427, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7205066186016396, + "language_loss": 0.49900642, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5202843, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.82421875, + "step": 655, + "time_per_iteration": 3.1399919986724854 + }, + { + "auxiliary_loss_clip": 0.01251012, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.03866971, + "balance_loss_mlp": 1.07330465, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.4228021909793918, + "language_loss": 0.80845964, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83163846, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.78125, + "step": 656, + "time_per_iteration": 2.5063297748565674 + }, + { + "auxiliary_loss_clip": 0.01264211, + "auxiliary_loss_mlp": 0.0109165, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.07672703, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.8602268717749526, + "language_loss": 0.76602596, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78958458, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.875, + "step": 657, + "time_per_iteration": 2.4405555725097656 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01071967, + "balance_loss_clip": 1.04192615, + "balance_loss_mlp": 1.07452726, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.1526815744488945, + "language_loss": 0.81690443, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84020746, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.84375, + "step": 658, + "time_per_iteration": 2.5383949279785156 + }, + { + "auxiliary_loss_clip": 0.01252051, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.04091132, + "balance_loss_mlp": 1.07283425, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2075021313123777, + "language_loss": 0.91331315, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93656039, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.796875, + "step": 659, + "time_per_iteration": 2.4678854942321777 + }, + { + "auxiliary_loss_clip": 0.01259034, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.07427669, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.5412719342676215, + "language_loss": 0.79241848, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81567293, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 660, + "time_per_iteration": 2.5135834217071533 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.07534087, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6909533460123631, + "language_loss": 0.81942898, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84269351, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.796875, + "step": 661, + "time_per_iteration": 2.513702154159546 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01010967, + "balance_loss_clip": 1.00519753, + "balance_loss_mlp": 1.03039932, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9113020435813882, + "language_loss": 0.69376045, + "learning_rate": 3.998992585439272e-06, + "loss": 0.7149995, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.82421875, + "step": 662, + "time_per_iteration": 3.2435107231140137 + }, + { + "auxiliary_loss_clip": 0.01260063, + "auxiliary_loss_mlp": 0.01071537, + "balance_loss_clip": 1.04113865, + "balance_loss_mlp": 1.0779382, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.025040011333182, + "language_loss": 0.83253002, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85584599, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.8125, + "step": 663, + "time_per_iteration": 2.5213887691497803 + }, + { + "auxiliary_loss_clip": 0.01261822, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.032125, + "balance_loss_mlp": 1.07768416, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.8595031628608143, + "language_loss": 0.87538105, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89862621, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.84375, + "step": 664, + "time_per_iteration": 2.516810655593872 + }, + { + "auxiliary_loss_clip": 0.0125116, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.07347679, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.3519362819230625, + "language_loss": 0.84738994, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87050784, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.7734375, + "step": 665, + "time_per_iteration": 2.4348978996276855 + }, + { + "auxiliary_loss_clip": 0.01263346, + "auxiliary_loss_mlp": 0.01087391, + "balance_loss_clip": 1.05525231, + "balance_loss_mlp": 1.07680821, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.1279588772882687, + "language_loss": 0.81491798, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83842534, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.8671875, + "step": 666, + "time_per_iteration": 2.564187526702881 + }, + { + "auxiliary_loss_clip": 0.01252779, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.07225358, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.9939634291419526, + "language_loss": 0.87121451, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89449108, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.8046875, + "step": 667, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.07692444, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.627098567014159, + "language_loss": 0.80619991, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82938576, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7578125, + "step": 668, + "time_per_iteration": 2.441667079925537 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01011229, + "balance_loss_clip": 1.0053643, + "balance_loss_mlp": 1.02968836, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7872457900726799, + "language_loss": 0.60042131, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62164247, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.8125, + "step": 669, + "time_per_iteration": 3.200874090194702 + }, + { + "auxiliary_loss_clip": 0.01253738, + "auxiliary_loss_mlp": 0.0107276, + "balance_loss_clip": 1.0431962, + "balance_loss_mlp": 1.07228541, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7415828974469272, + "language_loss": 0.86405391, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88731897, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 670, + "time_per_iteration": 2.5169434547424316 + }, + { + "auxiliary_loss_clip": 0.0124964, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.0414381, + "balance_loss_mlp": 1.07305872, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.9261739939324196, + "language_loss": 0.752123, + "learning_rate": 3.998878276622692e-06, + "loss": 0.7753256, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.765625, + "step": 671, + "time_per_iteration": 2.514566421508789 + }, + { + "auxiliary_loss_clip": 0.01259516, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.04472136, + "balance_loss_mlp": 1.0774349, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0846907245314688, + "language_loss": 0.92279977, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94614637, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8203125, + "step": 672, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01253491, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_clip": 1.03921115, + "balance_loss_mlp": 1.07329202, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.816355722874097, + "language_loss": 0.90220857, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92545515, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.796875, + "step": 673, + "time_per_iteration": 2.450547456741333 + }, + { + "auxiliary_loss_clip": 0.01249229, + "auxiliary_loss_mlp": 0.01077482, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.07150948, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.117589951798075, + "language_loss": 0.74881005, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77207714, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.78125, + "step": 674, + "time_per_iteration": 2.5444436073303223 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01061202, + "balance_loss_clip": 1.03036261, + "balance_loss_mlp": 1.07609737, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.2422867770418797, + "language_loss": 0.78305578, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80627763, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 675, + "time_per_iteration": 2.4525954723358154 + }, + { + "auxiliary_loss_clip": 0.01252319, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.04578447, + "balance_loss_mlp": 1.07254028, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7072695919905723, + "language_loss": 0.76650077, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78981006, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.796875, + "step": 676, + "time_per_iteration": 2.530043840408325 + }, + { + "auxiliary_loss_clip": 0.01258388, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.0750767, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.3168648577819138, + "language_loss": 0.85182011, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87516803, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.828125, + "step": 677, + "time_per_iteration": 2.4390082359313965 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.03804517, + "balance_loss_mlp": 1.071486, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.7808730288109123, + "language_loss": 0.76348364, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78666306, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.78125, + "step": 678, + "time_per_iteration": 2.5151596069335938 + }, + { + "auxiliary_loss_clip": 0.01250603, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_clip": 1.03807509, + "balance_loss_mlp": 1.07162285, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9938089142752387, + "language_loss": 0.82114184, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84431279, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7890625, + "step": 679, + "time_per_iteration": 2.5701568126678467 + }, + { + "auxiliary_loss_clip": 0.01255726, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.07693028, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.893911305727382, + "language_loss": 0.76349533, + "learning_rate": 3.998757828196835e-06, + "loss": 0.7866298, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7890625, + "step": 680, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01255007, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.03305268, + "balance_loss_mlp": 1.07167506, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.7999776318515568, + "language_loss": 0.83315849, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.8563633, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 681, + "time_per_iteration": 2.5313305854797363 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.03302324, + "balance_loss_mlp": 1.07082057, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.6690976928218293, + "language_loss": 0.71312869, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73630697, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.828125, + "step": 682, + "time_per_iteration": 2.5190017223358154 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.07090235, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7744847161326498, + "language_loss": 0.72373003, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74692667, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8046875, + "step": 683, + "time_per_iteration": 2.473156690597534 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.01075324, + "balance_loss_clip": 1.04540253, + "balance_loss_mlp": 1.07707, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.316908811268422, + "language_loss": 0.81263745, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83589774, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 684, + "time_per_iteration": 5.34027099609375 + }, + { + "auxiliary_loss_clip": 0.01251905, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.07572865, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5327144156887007, + "language_loss": 0.90501672, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92825842, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.765625, + "step": 685, + "time_per_iteration": 3.918776750564575 + }, + { + "auxiliary_loss_clip": 0.01253389, + "auxiliary_loss_mlp": 0.010703, + "balance_loss_clip": 1.04002118, + "balance_loss_mlp": 1.07458997, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.0402082016953234, + "language_loss": 0.87871253, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90194941, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.7890625, + "step": 686, + "time_per_iteration": 2.481177806854248 + }, + { + "auxiliary_loss_clip": 0.01258153, + "auxiliary_loss_mlp": 0.01071669, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.07474661, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.7716861202834375, + "language_loss": 0.71645427, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73975253, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8359375, + "step": 687, + "time_per_iteration": 2.4720261096954346 + }, + { + "auxiliary_loss_clip": 0.01252382, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.04262209, + "balance_loss_mlp": 1.07918715, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.117746024922212, + "language_loss": 0.8642537, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88748431, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.734375, + "step": 688, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01249454, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07534754, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.6487514234328304, + "language_loss": 0.83326006, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85658503, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7421875, + "step": 689, + "time_per_iteration": 2.4689462184906006 + }, + { + "auxiliary_loss_clip": 0.01248134, + "auxiliary_loss_mlp": 0.01077255, + "balance_loss_clip": 1.04847789, + "balance_loss_mlp": 1.07176828, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.7821885346326607, + "language_loss": 0.68391848, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70717239, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.765625, + "step": 690, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.012458, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.04197323, + "balance_loss_mlp": 1.07094526, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.747700039366933, + "language_loss": 0.74933273, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77250373, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 691, + "time_per_iteration": 2.4566729068756104 + }, + { + "auxiliary_loss_clip": 0.01246178, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04890203, + "balance_loss_mlp": 1.07268727, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.450885846250815, + "language_loss": 0.84518701, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86843991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.734375, + "step": 692, + "time_per_iteration": 2.4667932987213135 + }, + { + "auxiliary_loss_clip": 0.01252043, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.07099986, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 9.166238009589804, + "language_loss": 0.89107299, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9143213, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.8125, + "step": 693, + "time_per_iteration": 2.4823052883148193 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.03637171, + "balance_loss_mlp": 1.07755136, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.1462970179067646, + "language_loss": 0.82179356, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84500182, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 694, + "time_per_iteration": 2.564098834991455 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.07214785, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.057768586122239, + "language_loss": 0.83656573, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85977334, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 695, + "time_per_iteration": 2.5122969150543213 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01073319, + "balance_loss_clip": 1.04270577, + "balance_loss_mlp": 1.07313716, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.138642052855673, + "language_loss": 0.8441087, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86734056, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.765625, + "step": 696, + "time_per_iteration": 2.462756872177124 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.01073791, + "balance_loss_clip": 1.04253471, + "balance_loss_mlp": 1.07146811, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.042298821772003, + "language_loss": 0.93134123, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95455778, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.765625, + "step": 697, + "time_per_iteration": 2.5189502239227295 + }, + { + "auxiliary_loss_clip": 0.0124398, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.07146859, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.2837511795811207, + "language_loss": 0.83989406, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86302388, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.71875, + "step": 698, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.01247569, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.05551505, + "balance_loss_mlp": 1.0711751, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9405760650289445, + "language_loss": 0.91369909, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93704206, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.7578125, + "step": 699, + "time_per_iteration": 2.4667766094207764 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.01842487, + "balance_loss_mlp": 1.03384757, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8964375713204716, + "language_loss": 0.67850006, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69987792, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.796875, + "step": 700, + "time_per_iteration": 3.1214911937713623 + }, + { + "auxiliary_loss_clip": 0.01254452, + "auxiliary_loss_mlp": 0.01078478, + "balance_loss_clip": 1.04695964, + "balance_loss_mlp": 1.07502532, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.6789371965697524, + "language_loss": 0.89020562, + "learning_rate": 3.998452907725016e-06, + "loss": 0.913535, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 701, + "time_per_iteration": 2.46085524559021 + }, + { + "auxiliary_loss_clip": 0.01250018, + "auxiliary_loss_mlp": 0.0107128, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.07681179, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2592774096130794, + "language_loss": 0.67494118, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69815421, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 702, + "time_per_iteration": 2.5170979499816895 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01006834, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.03296542, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8426087453226233, + "language_loss": 0.60777819, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62897617, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.80078125, + "step": 703, + "time_per_iteration": 3.155794143676758 + }, + { + "auxiliary_loss_clip": 0.01112196, + "auxiliary_loss_mlp": 0.01010352, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 1.03251982, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167549333074237, + "language_loss": 0.5776214, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.796875, + "step": 704, + "time_per_iteration": 2.95633602142334 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01070665, + "balance_loss_clip": 1.0397656, + "balance_loss_mlp": 1.07432342, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1970745802550624, + "language_loss": 0.87708455, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90031266, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 705, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.01238458, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.06876624, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7054575923778923, + "language_loss": 0.71612352, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73913229, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 706, + "time_per_iteration": 2.464270830154419 + }, + { + "auxiliary_loss_clip": 0.01243119, + "auxiliary_loss_mlp": 0.01068207, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.07029784, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0927829932503714, + "language_loss": 0.93480223, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95791554, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 707, + "time_per_iteration": 2.5087966918945312 + }, + { + "auxiliary_loss_clip": 0.01245928, + "auxiliary_loss_mlp": 0.01065311, + "balance_loss_clip": 1.03441203, + "balance_loss_mlp": 1.0676806, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.3244890877745883, + "language_loss": 0.81275034, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83586276, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 708, + "time_per_iteration": 2.557119607925415 + }, + { + "auxiliary_loss_clip": 0.01251091, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_clip": 1.04239082, + "balance_loss_mlp": 1.07195199, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.2553269788690224, + "language_loss": 0.82229173, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84553528, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.796875, + "step": 709, + "time_per_iteration": 2.4828600883483887 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.01064315, + "balance_loss_clip": 1.03389335, + "balance_loss_mlp": 1.07517564, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.534138916450152, + "language_loss": 0.85063422, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87383747, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8125, + "step": 710, + "time_per_iteration": 2.453641653060913 + }, + { + "auxiliary_loss_clip": 0.01254724, + "auxiliary_loss_mlp": 0.01070713, + "balance_loss_clip": 1.04114938, + "balance_loss_mlp": 1.07757199, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.316207411440496, + "language_loss": 0.84996349, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87321782, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7734375, + "step": 711, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01246695, + "auxiliary_loss_mlp": 0.01069917, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.07044697, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 2.000925777751644, + "language_loss": 0.85439169, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87755781, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.765625, + "step": 712, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.0107294, + "balance_loss_clip": 1.0445205, + "balance_loss_mlp": 1.0701685, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.2453781921901728, + "language_loss": 0.90829903, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9315542, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8203125, + "step": 713, + "time_per_iteration": 2.4908998012542725 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01017546, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.0288384, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8777811618173876, + "language_loss": 0.63746506, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65872955, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.80078125, + "step": 714, + "time_per_iteration": 3.158921480178833 + }, + { + "auxiliary_loss_clip": 0.01249012, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_clip": 1.05076694, + "balance_loss_mlp": 1.07545531, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.1622955343434382, + "language_loss": 0.74528754, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76858354, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 715, + "time_per_iteration": 2.5759642124176025 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.07450986, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2108029839954213, + "language_loss": 0.72630137, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74957311, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7578125, + "step": 716, + "time_per_iteration": 2.5973668098449707 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.02687418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9052113850529176, + "language_loss": 0.65557301, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67669141, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.78125, + "step": 717, + "time_per_iteration": 3.114957571029663 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 0.99780369, + "balance_loss_mlp": 1.02667391, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9880116621267147, + "language_loss": 0.58762264, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60871184, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.78125, + "step": 718, + "time_per_iteration": 2.910278797149658 + }, + { + "auxiliary_loss_clip": 0.01248398, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.04314709, + "balance_loss_mlp": 1.0758605, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8513004644505335, + "language_loss": 0.91198725, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93521935, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7265625, + "step": 719, + "time_per_iteration": 2.492509126663208 + }, + { + "auxiliary_loss_clip": 0.01244347, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.04208493, + "balance_loss_mlp": 1.06931555, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.803377327315558, + "language_loss": 0.66468138, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68783891, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 720, + "time_per_iteration": 2.6061203479766846 + }, + { + "auxiliary_loss_clip": 0.01244682, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.04895782, + "balance_loss_mlp": 1.07152998, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.8832143461121282, + "language_loss": 0.77743989, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80068195, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 721, + "time_per_iteration": 2.5255632400512695 + }, + { + "auxiliary_loss_clip": 0.01251204, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.05879569, + "balance_loss_mlp": 1.07584524, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.027898330451403, + "language_loss": 0.87873065, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90212011, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.75, + "step": 722, + "time_per_iteration": 2.536283493041992 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.01075404, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.0758208, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 30.376200688873947, + "language_loss": 0.84770942, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87099999, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 723, + "time_per_iteration": 2.5167360305786133 + }, + { + "auxiliary_loss_clip": 0.01256754, + "auxiliary_loss_mlp": 0.01076494, + "balance_loss_clip": 1.04638171, + "balance_loss_mlp": 1.07828176, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.9203333396820472, + "language_loss": 0.82793808, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85127056, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.78125, + "step": 724, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.0125067, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_clip": 1.05975556, + "balance_loss_mlp": 1.07561088, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.8200683460759586, + "language_loss": 0.79530561, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81871551, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.75, + "step": 725, + "time_per_iteration": 2.4551918506622314 + }, + { + "auxiliary_loss_clip": 0.0126067, + "auxiliary_loss_mlp": 0.01076358, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.07715642, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.8106150104808485, + "language_loss": 0.87100697, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89437729, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.8359375, + "step": 726, + "time_per_iteration": 5.350574731826782 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01011263, + "balance_loss_clip": 1.00542223, + "balance_loss_mlp": 1.02866364, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9088619113669424, + "language_loss": 0.5587045, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57988632, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.78125, + "step": 727, + "time_per_iteration": 3.1539440155029297 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01076851, + "balance_loss_clip": 1.04676282, + "balance_loss_mlp": 1.07453549, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.397861957488019, + "language_loss": 0.82248902, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84576982, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.765625, + "step": 728, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01244631, + "auxiliary_loss_mlp": 0.01068516, + "balance_loss_clip": 1.0395956, + "balance_loss_mlp": 1.07265663, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2715062050859745, + "language_loss": 0.77187145, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79500294, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.71875, + "step": 729, + "time_per_iteration": 2.5091514587402344 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_clip": 1.04502177, + "balance_loss_mlp": 1.07452357, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.258754879989397, + "language_loss": 0.9515503, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97482038, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.78125, + "step": 730, + "time_per_iteration": 2.4795522689819336 + }, + { + "auxiliary_loss_clip": 0.0124716, + "auxiliary_loss_mlp": 0.01065838, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.07000017, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.2097226025839483, + "language_loss": 0.88016784, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90329784, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.7734375, + "step": 731, + "time_per_iteration": 2.4678709506988525 + }, + { + "auxiliary_loss_clip": 0.01251191, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.07521737, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.3707184473936587, + "language_loss": 0.88656235, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90980744, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7578125, + "step": 732, + "time_per_iteration": 2.4135851860046387 + }, + { + "auxiliary_loss_clip": 0.01251086, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.04288554, + "balance_loss_mlp": 1.07443762, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.131822645051773, + "language_loss": 0.86010063, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88334322, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.765625, + "step": 733, + "time_per_iteration": 2.491278648376465 + }, + { + "auxiliary_loss_clip": 0.01256254, + "auxiliary_loss_mlp": 0.01078649, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.07624841, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.0564057381838885, + "language_loss": 0.91515708, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93850613, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 734, + "time_per_iteration": 2.451258897781372 + }, + { + "auxiliary_loss_clip": 0.01247796, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.03696656, + "balance_loss_mlp": 1.07613921, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8863467898976456, + "language_loss": 0.77831066, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8014316, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.71875, + "step": 735, + "time_per_iteration": 2.558958053588867 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.03526342, + "balance_loss_mlp": 1.06886315, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.1337917025346074, + "language_loss": 0.88456166, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90760267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 736, + "time_per_iteration": 2.5100033283233643 + }, + { + "auxiliary_loss_clip": 0.01242163, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.07473993, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.954630170969084, + "language_loss": 0.84155536, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86464787, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 737, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01251899, + "auxiliary_loss_mlp": 0.01072468, + "balance_loss_clip": 1.04077065, + "balance_loss_mlp": 1.07667851, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0376910697928947, + "language_loss": 0.8518666, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87511027, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.75, + "step": 738, + "time_per_iteration": 2.5576610565185547 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04048622, + "balance_loss_mlp": 1.03298163, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8684121686227821, + "language_loss": 0.59110028, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61268163, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.7890625, + "step": 739, + "time_per_iteration": 3.0643718242645264 + }, + { + "auxiliary_loss_clip": 0.0124678, + "auxiliary_loss_mlp": 0.01070548, + "balance_loss_clip": 1.04220033, + "balance_loss_mlp": 1.07513726, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1860888775648695, + "language_loss": 0.91622591, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93939924, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.71875, + "step": 740, + "time_per_iteration": 2.5448389053344727 + }, + { + "auxiliary_loss_clip": 0.01252276, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.07766986, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.01644947055736, + "language_loss": 0.71842492, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.7421875, + "step": 741, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01240373, + "auxiliary_loss_mlp": 0.01073056, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.07044411, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.722907957661965, + "language_loss": 0.88555831, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9086926, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.703125, + "step": 742, + "time_per_iteration": 2.6367549896240234 + }, + { + "auxiliary_loss_clip": 0.0124233, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.07209873, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0306401320231693, + "language_loss": 0.83823264, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86127412, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.703125, + "step": 743, + "time_per_iteration": 2.516587972640991 + }, + { + "auxiliary_loss_clip": 0.01249271, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.07474804, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 3.0889105946672704, + "language_loss": 0.79948521, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8226651, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.75, + "step": 744, + "time_per_iteration": 2.44805645942688 + }, + { + "auxiliary_loss_clip": 0.01243449, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.04037201, + "balance_loss_mlp": 1.07279778, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.388036535067576, + "language_loss": 0.85400093, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87710881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.703125, + "step": 745, + "time_per_iteration": 2.4790430068969727 + }, + { + "auxiliary_loss_clip": 0.01242131, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.0714339, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.1215702602167688, + "language_loss": 0.6866799, + "learning_rate": 3.997686978575302e-06, + "loss": 0.70974648, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.7109375, + "step": 746, + "time_per_iteration": 2.5645759105682373 + }, + { + "auxiliary_loss_clip": 0.01250748, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.04748797, + "balance_loss_mlp": 1.0783143, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.1376273799467547, + "language_loss": 0.68823957, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71152306, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 747, + "time_per_iteration": 2.5267317295074463 + }, + { + "auxiliary_loss_clip": 0.01248685, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.07314527, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9669744064389407, + "language_loss": 0.66721869, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69050002, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.75, + "step": 748, + "time_per_iteration": 2.4818925857543945 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.01082391, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.07779491, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.650057046326624, + "language_loss": 0.76540357, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78872949, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.7265625, + "step": 749, + "time_per_iteration": 2.4454426765441895 + }, + { + "auxiliary_loss_clip": 0.01251335, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.0770005, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.0345099055640317, + "language_loss": 0.88970172, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91298997, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 750, + "time_per_iteration": 2.458716630935669 + }, + { + "auxiliary_loss_clip": 0.01247033, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.07139015, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.3716924268159367, + "language_loss": 0.74869245, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77190608, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.7578125, + "step": 751, + "time_per_iteration": 2.5231218338012695 + }, + { + "auxiliary_loss_clip": 0.01243504, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.04573071, + "balance_loss_mlp": 1.07175446, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.2224468826240975, + "language_loss": 0.69360238, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7167744, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 752, + "time_per_iteration": 2.4620048999786377 + }, + { + "auxiliary_loss_clip": 0.01244736, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.07154715, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.984649176219999, + "language_loss": 0.91634125, + "learning_rate": 3.997554045527305e-06, + "loss": 0.9393605, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.734375, + "step": 753, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01249124, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04728031, + "balance_loss_mlp": 1.07501864, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2056938633592975, + "language_loss": 0.91197902, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93522525, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.7421875, + "step": 754, + "time_per_iteration": 2.472975492477417 + }, + { + "auxiliary_loss_clip": 0.01238249, + "auxiliary_loss_mlp": 0.0107062, + "balance_loss_clip": 1.04144955, + "balance_loss_mlp": 1.07163191, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.234660546964849, + "language_loss": 0.78528345, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80837214, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.671875, + "step": 755, + "time_per_iteration": 2.4704174995422363 + }, + { + "auxiliary_loss_clip": 0.01248815, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.05100918, + "balance_loss_mlp": 1.07416105, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9667934561660614, + "language_loss": 0.78451371, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80779994, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.75, + "step": 756, + "time_per_iteration": 2.4873547554016113 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.010118, + "balance_loss_clip": 1.00600612, + "balance_loss_mlp": 1.03558636, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8118987787253854, + "language_loss": 0.62730747, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64860779, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.828125, + "step": 757, + "time_per_iteration": 3.1292941570281982 + }, + { + "auxiliary_loss_clip": 0.01242797, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_clip": 1.04220784, + "balance_loss_mlp": 1.0731318, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5194495460848947, + "language_loss": 0.84329176, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86641645, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 758, + "time_per_iteration": 2.498905658721924 + }, + { + "auxiliary_loss_clip": 0.01237511, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.06733441, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.0933163310434963, + "language_loss": 0.88315606, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90620202, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 759, + "time_per_iteration": 2.5122711658477783 + }, + { + "auxiliary_loss_clip": 0.01248241, + "auxiliary_loss_mlp": 0.01075804, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.075526, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.170817451496144, + "language_loss": 0.73644727, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75968778, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7265625, + "step": 760, + "time_per_iteration": 2.511322021484375 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.04630077, + "balance_loss_mlp": 1.07509935, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.192184725657734, + "language_loss": 0.82177126, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84495443, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6953125, + "step": 761, + "time_per_iteration": 2.4831535816192627 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.06961203, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7986428347309282, + "language_loss": 0.79732436, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82041955, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6953125, + "step": 762, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.03112733, + "balance_loss_mlp": 1.03455913, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.008821564963746, + "language_loss": 0.58659625, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60813344, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.82421875, + "step": 763, + "time_per_iteration": 3.1429429054260254 + }, + { + "auxiliary_loss_clip": 0.01245459, + "auxiliary_loss_mlp": 0.01081866, + "balance_loss_clip": 1.05381632, + "balance_loss_mlp": 1.07288039, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8717486924500517, + "language_loss": 0.87752867, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.9008019, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.7265625, + "step": 764, + "time_per_iteration": 2.4727554321289062 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01079864, + "balance_loss_clip": 1.05192137, + "balance_loss_mlp": 1.07565248, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9426139778845304, + "language_loss": 0.86118066, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88445938, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 765, + "time_per_iteration": 2.5370731353759766 + }, + { + "auxiliary_loss_clip": 0.01248646, + "auxiliary_loss_mlp": 0.01070241, + "balance_loss_clip": 1.03912735, + "balance_loss_mlp": 1.07336497, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.0624701923152453, + "language_loss": 0.87846982, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90165865, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.75, + "step": 766, + "time_per_iteration": 2.475677013397217 + }, + { + "auxiliary_loss_clip": 0.01239894, + "auxiliary_loss_mlp": 0.01067957, + "balance_loss_clip": 1.03979921, + "balance_loss_mlp": 1.06896472, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.616885530601855, + "language_loss": 0.84314167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86622024, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 767, + "time_per_iteration": 2.465449810028076 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.01060911, + "balance_loss_clip": 1.03249097, + "balance_loss_mlp": 1.07569289, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.056931367891973, + "language_loss": 0.87013769, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89320099, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.703125, + "step": 768, + "time_per_iteration": 5.441957235336304 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.01069073, + "balance_loss_clip": 1.04184508, + "balance_loss_mlp": 1.06768477, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.1199205591749033, + "language_loss": 0.75022334, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77329946, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.703125, + "step": 769, + "time_per_iteration": 2.5294675827026367 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.03734684, + "balance_loss_mlp": 1.07084632, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.886534334963383, + "language_loss": 0.86162585, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88464236, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.671875, + "step": 770, + "time_per_iteration": 2.4646449089050293 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0385015, + "balance_loss_mlp": 1.07160687, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.8625416592988477, + "language_loss": 0.87259042, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89573061, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.765625, + "step": 771, + "time_per_iteration": 2.512622117996216 + }, + { + "auxiliary_loss_clip": 0.01246333, + "auxiliary_loss_mlp": 0.01076832, + "balance_loss_clip": 1.04726815, + "balance_loss_mlp": 1.06911707, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.3640102097360587, + "language_loss": 0.83736801, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86059964, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 772, + "time_per_iteration": 2.509572982788086 + }, + { + "auxiliary_loss_clip": 0.01243608, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.07392263, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3097217333215694, + "language_loss": 0.73399591, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75707257, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 773, + "time_per_iteration": 2.5539331436157227 + }, + { + "auxiliary_loss_clip": 0.01240234, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_clip": 1.03624654, + "balance_loss_mlp": 1.07288945, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.066531290075925, + "language_loss": 0.78523052, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80828828, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.671875, + "step": 774, + "time_per_iteration": 2.5350210666656494 + }, + { + "auxiliary_loss_clip": 0.01239038, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.03955007, + "balance_loss_mlp": 1.07101154, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.187480231527322, + "language_loss": 0.73357666, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75662553, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6796875, + "step": 775, + "time_per_iteration": 2.6102981567382812 + }, + { + "auxiliary_loss_clip": 0.01240703, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.03096998, + "balance_loss_mlp": 1.06996655, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.5904648869830247, + "language_loss": 0.77037287, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79337239, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.703125, + "step": 776, + "time_per_iteration": 2.4713642597198486 + }, + { + "auxiliary_loss_clip": 0.01236202, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03790653, + "balance_loss_mlp": 1.06914115, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9751950676431418, + "language_loss": 0.70967531, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73267508, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.671875, + "step": 777, + "time_per_iteration": 2.540151596069336 + }, + { + "auxiliary_loss_clip": 0.01242182, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.07221043, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9852588200641685, + "language_loss": 0.76756501, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79076868, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 778, + "time_per_iteration": 2.5299642086029053 + }, + { + "auxiliary_loss_clip": 0.01236882, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.06948996, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.9364819041983576, + "language_loss": 0.78900939, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81206226, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.671875, + "step": 779, + "time_per_iteration": 2.4999477863311768 + }, + { + "auxiliary_loss_clip": 0.01230899, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.03997588, + "balance_loss_mlp": 1.06776333, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.7037291099106273, + "language_loss": 0.77051055, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7934795, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 780, + "time_per_iteration": 2.54770565032959 + }, + { + "auxiliary_loss_clip": 0.01235667, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.07070863, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.6789342331958745, + "language_loss": 0.76432645, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6484375, + "step": 781, + "time_per_iteration": 2.5040361881256104 + }, + { + "auxiliary_loss_clip": 0.01245917, + "auxiliary_loss_mlp": 0.01072818, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.07423282, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2171800145032736, + "language_loss": 0.74027473, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76346207, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 782, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.01241991, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.07483578, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 6.219089205177081, + "language_loss": 0.8032757, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82630414, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.671875, + "step": 783, + "time_per_iteration": 2.4866061210632324 + }, + { + "auxiliary_loss_clip": 0.01232605, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.03417742, + "balance_loss_mlp": 1.07062817, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0172272756643816, + "language_loss": 0.81289953, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83582091, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 784, + "time_per_iteration": 2.476659059524536 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.04597473, + "balance_loss_mlp": 1.0683856, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.171254656371271, + "language_loss": 0.8076694, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83078098, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 785, + "time_per_iteration": 2.493598461151123 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0762614, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.444819858404617, + "language_loss": 0.89981294, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92284781, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.65625, + "step": 786, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.012413, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.06742501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.379381752409287, + "language_loss": 0.76639462, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78950763, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.734375, + "step": 787, + "time_per_iteration": 2.447611093521118 + }, + { + "auxiliary_loss_clip": 0.01247236, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.04431772, + "balance_loss_mlp": 1.0765723, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.4642209511959403, + "language_loss": 0.80851126, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83170098, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7109375, + "step": 788, + "time_per_iteration": 2.4679956436157227 + }, + { + "auxiliary_loss_clip": 0.01236983, + "auxiliary_loss_mlp": 0.01074337, + "balance_loss_clip": 1.04551244, + "balance_loss_mlp": 1.07285857, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.2318634793178127, + "language_loss": 0.84819949, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87131274, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.640625, + "step": 789, + "time_per_iteration": 2.4982516765594482 + }, + { + "auxiliary_loss_clip": 0.01242053, + "auxiliary_loss_mlp": 0.01066276, + "balance_loss_clip": 1.04006219, + "balance_loss_mlp": 1.07367456, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.948517450129577, + "language_loss": 0.82196069, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84504396, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6875, + "step": 790, + "time_per_iteration": 2.4380602836608887 + }, + { + "auxiliary_loss_clip": 0.01236299, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.03524029, + "balance_loss_mlp": 1.06857598, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.8806939749630054, + "language_loss": 0.88245451, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90544093, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 791, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01239952, + "auxiliary_loss_mlp": 0.010655, + "balance_loss_clip": 1.03826034, + "balance_loss_mlp": 1.07212687, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.2584516419561464, + "language_loss": 0.90245461, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92550921, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 792, + "time_per_iteration": 2.4627771377563477 + }, + { + "auxiliary_loss_clip": 0.01241845, + "auxiliary_loss_mlp": 0.01074856, + "balance_loss_clip": 1.04874945, + "balance_loss_mlp": 1.07157969, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9386484459236437, + "language_loss": 0.7310667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 793, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01067209, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.07207203, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0117940746735123, + "language_loss": 0.86102074, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88411266, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.703125, + "step": 794, + "time_per_iteration": 2.510611057281494 + }, + { + "auxiliary_loss_clip": 0.0123999, + "auxiliary_loss_mlp": 0.01074174, + "balance_loss_clip": 1.04701805, + "balance_loss_mlp": 1.06925917, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.4118642482115384, + "language_loss": 0.69812739, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72126907, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.7109375, + "step": 795, + "time_per_iteration": 2.500420093536377 + }, + { + "auxiliary_loss_clip": 0.01236981, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.0432204, + "balance_loss_mlp": 1.06999111, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.0479238599532135, + "language_loss": 0.81053579, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 796, + "time_per_iteration": 2.4838409423828125 + }, + { + "auxiliary_loss_clip": 0.0124002, + "auxiliary_loss_mlp": 0.01058331, + "balance_loss_clip": 1.03129458, + "balance_loss_mlp": 1.07190371, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.3456590334750858, + "language_loss": 0.81249642, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83547997, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6796875, + "step": 797, + "time_per_iteration": 2.466343402862549 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.0100279, + "balance_loss_clip": 0.9972828, + "balance_loss_mlp": 1.03672731, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9120921080635288, + "language_loss": 0.64447635, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66572458, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.8515625, + "step": 798, + "time_per_iteration": 3.0081863403320312 + }, + { + "auxiliary_loss_clip": 0.01243937, + "auxiliary_loss_mlp": 0.01070197, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.06894708, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 7.0153313624744005, + "language_loss": 0.90794134, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93108267, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.75, + "step": 799, + "time_per_iteration": 2.4872424602508545 + }, + { + "auxiliary_loss_clip": 0.01242621, + "auxiliary_loss_mlp": 0.01069655, + "balance_loss_clip": 1.04220068, + "balance_loss_mlp": 1.07567, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.1467314479540818, + "language_loss": 0.86701, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89013278, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 800, + "time_per_iteration": 2.477720022201538 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.0106979, + "balance_loss_clip": 1.04362369, + "balance_loss_mlp": 1.07207572, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 7.517902152046504, + "language_loss": 0.84513009, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86826313, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.71875, + "step": 801, + "time_per_iteration": 2.487889528274536 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.07289147, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.97564705550146, + "language_loss": 0.79967415, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82280934, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 802, + "time_per_iteration": 2.6496224403381348 + }, + { + "auxiliary_loss_clip": 0.01238875, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_clip": 1.03963101, + "balance_loss_mlp": 1.07270598, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.8331626885697725, + "language_loss": 0.86420751, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88725173, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 803, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01238315, + "auxiliary_loss_mlp": 0.01061166, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.07398677, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.229653749186784, + "language_loss": 0.85436332, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87735808, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 804, + "time_per_iteration": 2.458303213119507 + }, + { + "auxiliary_loss_clip": 0.01239413, + "auxiliary_loss_mlp": 0.01066878, + "balance_loss_clip": 1.04099822, + "balance_loss_mlp": 1.07286024, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.2509331098011645, + "language_loss": 0.86119306, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88425595, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 805, + "time_per_iteration": 2.5021419525146484 + }, + { + "auxiliary_loss_clip": 0.01235031, + "auxiliary_loss_mlp": 0.01067273, + "balance_loss_clip": 1.04115391, + "balance_loss_mlp": 1.06942892, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8866019303880346, + "language_loss": 0.68034315, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70336622, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.65625, + "step": 806, + "time_per_iteration": 2.4904568195343018 + }, + { + "auxiliary_loss_clip": 0.01235579, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.03847933, + "balance_loss_mlp": 1.07208037, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.221107161276338, + "language_loss": 0.7716608, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79466188, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 807, + "time_per_iteration": 2.498624563217163 + }, + { + "auxiliary_loss_clip": 0.01232532, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.06831741, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.844770488216335, + "language_loss": 0.86509991, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88814163, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.640625, + "step": 808, + "time_per_iteration": 2.444673538208008 + }, + { + "auxiliary_loss_clip": 0.01242847, + "auxiliary_loss_mlp": 0.01070908, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.07261682, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9428867449931826, + "language_loss": 0.90154302, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92468053, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 809, + "time_per_iteration": 5.353702545166016 + }, + { + "auxiliary_loss_clip": 0.01242102, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.07577538, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.12821080633451, + "language_loss": 0.84360719, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86672825, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 810, + "time_per_iteration": 3.8935022354125977 + }, + { + "auxiliary_loss_clip": 0.01240735, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.07189715, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7610993085905569, + "language_loss": 0.80875039, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8318274, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6875, + "step": 811, + "time_per_iteration": 2.5000643730163574 + }, + { + "auxiliary_loss_clip": 0.01232044, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.04991412, + "balance_loss_mlp": 1.06997907, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.0417171226218715, + "language_loss": 0.74768531, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77075845, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.625, + "step": 812, + "time_per_iteration": 2.4853179454803467 + }, + { + "auxiliary_loss_clip": 0.01233917, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.07263327, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8904091966919716, + "language_loss": 0.89845109, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92153537, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 813, + "time_per_iteration": 2.6731016635894775 + }, + { + "auxiliary_loss_clip": 0.01232344, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.07083082, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.028357820963791, + "language_loss": 0.74551463, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76842451, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.6171875, + "step": 814, + "time_per_iteration": 2.509963035583496 + }, + { + "auxiliary_loss_clip": 0.01235531, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.07073569, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.3605733083261464, + "language_loss": 0.83740532, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86043149, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6484375, + "step": 815, + "time_per_iteration": 2.5490894317626953 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.03396082, + "balance_loss_mlp": 1.07326484, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.271155414035229, + "language_loss": 0.90803105, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93103218, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6640625, + "step": 816, + "time_per_iteration": 2.5273053646087646 + }, + { + "auxiliary_loss_clip": 0.01240454, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.0732162, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 3.2321750342473603, + "language_loss": 0.79924619, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.671875, + "step": 817, + "time_per_iteration": 2.5095019340515137 + }, + { + "auxiliary_loss_clip": 0.0123455, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.04864395, + "balance_loss_mlp": 1.07184172, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8264850687392937, + "language_loss": 0.84520394, + "learning_rate": 3.996142453363656e-06, + "loss": 0.86829674, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 818, + "time_per_iteration": 2.5476157665252686 + }, + { + "auxiliary_loss_clip": 0.01243386, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.07401037, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.779535734169796, + "language_loss": 0.75307131, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77617967, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6953125, + "step": 819, + "time_per_iteration": 2.5486624240875244 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.07577193, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.1475545017813853, + "language_loss": 0.85166955, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87468207, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.640625, + "step": 820, + "time_per_iteration": 2.4565298557281494 + }, + { + "auxiliary_loss_clip": 0.0123627, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03602266, + "balance_loss_mlp": 1.07061315, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.902695357085614, + "language_loss": 0.9041872, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92716837, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.65625, + "step": 821, + "time_per_iteration": 2.5412514209747314 + }, + { + "auxiliary_loss_clip": 0.01233424, + "auxiliary_loss_mlp": 0.01073041, + "balance_loss_clip": 1.04773307, + "balance_loss_mlp": 1.06951392, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0531707528144274, + "language_loss": 0.8941884, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91725308, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.640625, + "step": 822, + "time_per_iteration": 2.5171031951904297 + }, + { + "auxiliary_loss_clip": 0.01237258, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.03295374, + "balance_loss_mlp": 1.0742538, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.060390808888412, + "language_loss": 0.67537785, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69834983, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 823, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01120581, + "auxiliary_loss_mlp": 0.01008389, + "balance_loss_clip": 1.00323892, + "balance_loss_mlp": 1.04174662, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3777513990451415, + "language_loss": 0.62206292, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64335263, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.7890625, + "step": 824, + "time_per_iteration": 3.13708758354187 + }, + { + "auxiliary_loss_clip": 0.01240025, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.07293963, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.021638376413324, + "language_loss": 0.90364408, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92674464, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.671875, + "step": 825, + "time_per_iteration": 2.519487142562866 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01064311, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.0713625, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 23.06748840114486, + "language_loss": 0.66790086, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69091535, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.65625, + "step": 826, + "time_per_iteration": 2.486837387084961 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.07166433, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.4656671498779845, + "language_loss": 0.78386623, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80685055, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.640625, + "step": 827, + "time_per_iteration": 2.517092704772949 + }, + { + "auxiliary_loss_clip": 0.0124052, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.07333767, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.8940457048653916, + "language_loss": 0.78592682, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80905491, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.671875, + "step": 828, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01227721, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.06777728, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.330577425067274, + "language_loss": 0.83493364, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 829, + "time_per_iteration": 2.5744268894195557 + }, + { + "auxiliary_loss_clip": 0.01235678, + "auxiliary_loss_mlp": 0.01073434, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.07021666, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.2375926111489743, + "language_loss": 0.75055873, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.65625, + "step": 830, + "time_per_iteration": 2.5045461654663086 + }, + { + "auxiliary_loss_clip": 0.01233457, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.06966341, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.962979792887244, + "language_loss": 0.79379636, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81679052, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 831, + "time_per_iteration": 2.5924267768859863 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01070014, + "balance_loss_clip": 1.04487276, + "balance_loss_mlp": 1.07213569, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.758266217871517, + "language_loss": 0.91538632, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93846321, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.65625, + "step": 832, + "time_per_iteration": 2.653150796890259 + }, + { + "auxiliary_loss_clip": 0.01230534, + "auxiliary_loss_mlp": 0.01081981, + "balance_loss_clip": 1.05747163, + "balance_loss_mlp": 1.07053018, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9700093948003867, + "language_loss": 0.83139837, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85452354, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 833, + "time_per_iteration": 2.73848819732666 + }, + { + "auxiliary_loss_clip": 0.0123523, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.06913459, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.433665596415918, + "language_loss": 0.8254565, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84839165, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.65625, + "step": 834, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01236789, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.07138014, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.099554255469436, + "language_loss": 0.91758966, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94059587, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 835, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.0123437, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.06699944, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.4903656252358735, + "language_loss": 0.76346481, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78652561, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.671875, + "step": 836, + "time_per_iteration": 2.4839258193969727 + }, + { + "auxiliary_loss_clip": 0.01229978, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.07100809, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1380784235063066, + "language_loss": 0.8360337, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85906136, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5859375, + "step": 837, + "time_per_iteration": 2.5140485763549805 + }, + { + "auxiliary_loss_clip": 0.01233502, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.07245386, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.225982034212064, + "language_loss": 0.73137468, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75436556, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 838, + "time_per_iteration": 2.5128419399261475 + }, + { + "auxiliary_loss_clip": 0.01229023, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.06636167, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.886796600099776, + "language_loss": 0.83328462, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85625362, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 839, + "time_per_iteration": 2.499415874481201 + }, + { + "auxiliary_loss_clip": 0.01228207, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_clip": 1.04128349, + "balance_loss_mlp": 1.06866539, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.2056506497336765, + "language_loss": 0.85777193, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8807205, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 840, + "time_per_iteration": 2.522038698196411 + }, + { + "auxiliary_loss_clip": 0.01235877, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.07246661, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.034102412822674, + "language_loss": 0.94658732, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96958393, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 841, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01234454, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.07130527, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.030819255438432, + "language_loss": 0.77387047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79687953, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6328125, + "step": 842, + "time_per_iteration": 2.6253628730773926 + }, + { + "auxiliary_loss_clip": 0.01238804, + "auxiliary_loss_mlp": 0.01067813, + "balance_loss_clip": 1.041659, + "balance_loss_mlp": 1.07278991, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.283727909175907, + "language_loss": 0.78014457, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80321074, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6640625, + "step": 843, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.01237695, + "auxiliary_loss_mlp": 0.01061566, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.07266212, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.249210505837228, + "language_loss": 0.82952344, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85251611, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.6484375, + "step": 844, + "time_per_iteration": 2.6476500034332275 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.03935087, + "balance_loss_mlp": 1.06871867, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.3236550986537368, + "language_loss": 0.76042783, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78337395, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 845, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01075672, + "balance_loss_clip": 1.04924285, + "balance_loss_mlp": 1.06694174, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.2528566199281905, + "language_loss": 0.87468004, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89773357, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 846, + "time_per_iteration": 2.5271859169006348 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.04521692, + "balance_loss_mlp": 1.06982791, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.95159927266484, + "language_loss": 0.87571466, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89872456, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 847, + "time_per_iteration": 2.4566030502319336 + }, + { + "auxiliary_loss_clip": 0.01226009, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.04489946, + "balance_loss_mlp": 1.06883907, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.141846591022022, + "language_loss": 0.81706643, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84003675, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5703125, + "step": 848, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0123182, + "auxiliary_loss_mlp": 0.01077851, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.07167053, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.898868752622741, + "language_loss": 0.87266076, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89575738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 849, + "time_per_iteration": 2.5472936630249023 + }, + { + "auxiliary_loss_clip": 0.0122487, + "auxiliary_loss_mlp": 0.01062562, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.06569946, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8637209623848903, + "language_loss": 0.83340889, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85628319, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.59375, + "step": 850, + "time_per_iteration": 2.493814468383789 + }, + { + "auxiliary_loss_clip": 0.01229016, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.03847528, + "balance_loss_mlp": 1.06816506, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.1400408414194154, + "language_loss": 0.6501807, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.609375, + "step": 851, + "time_per_iteration": 5.443026065826416 + }, + { + "auxiliary_loss_clip": 0.01228781, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.04424942, + "balance_loss_mlp": 1.0674876, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2562645326336686, + "language_loss": 0.8376134, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86061573, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 852, + "time_per_iteration": 2.4753623008728027 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.06879044, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9405819970113303, + "language_loss": 0.80252314, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82552135, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 853, + "time_per_iteration": 2.5048112869262695 + }, + { + "auxiliary_loss_clip": 0.01226539, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.03432584, + "balance_loss_mlp": 1.06710184, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8237562231360178, + "language_loss": 0.75846469, + "learning_rate": 3.995223022193999e-06, + "loss": 0.7813375, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 854, + "time_per_iteration": 2.53165602684021 + }, + { + "auxiliary_loss_clip": 0.01233418, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_clip": 1.03678393, + "balance_loss_mlp": 1.07139039, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.718422527893707, + "language_loss": 0.81173462, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83470446, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 855, + "time_per_iteration": 2.5610744953155518 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01020682, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.03902698, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0071030268205712, + "language_loss": 0.65609074, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67743033, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.7421875, + "step": 856, + "time_per_iteration": 3.0546581745147705 + }, + { + "auxiliary_loss_clip": 0.01224884, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.02823043, + "balance_loss_mlp": 1.06811357, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8111088050205955, + "language_loss": 0.76996124, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79274821, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5625, + "step": 857, + "time_per_iteration": 2.6051554679870605 + }, + { + "auxiliary_loss_clip": 0.01229705, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03130889, + "balance_loss_mlp": 1.06846082, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 3.7937823779894377, + "language_loss": 0.88893878, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91181171, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6171875, + "step": 858, + "time_per_iteration": 2.4517769813537598 + }, + { + "auxiliary_loss_clip": 0.01228685, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.03000832, + "balance_loss_mlp": 1.06902003, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.9531750101692102, + "language_loss": 0.75199753, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77484941, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 859, + "time_per_iteration": 2.5090014934539795 + }, + { + "auxiliary_loss_clip": 0.01237239, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.04280758, + "balance_loss_mlp": 1.06980002, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.092452223155828, + "language_loss": 0.90812773, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93120927, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.671875, + "step": 860, + "time_per_iteration": 2.437220335006714 + }, + { + "auxiliary_loss_clip": 0.01231057, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.03568769, + "balance_loss_mlp": 1.0717634, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.9189860758016508, + "language_loss": 0.82252973, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8454473, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.59375, + "step": 861, + "time_per_iteration": 2.50883412361145 + }, + { + "auxiliary_loss_clip": 0.01233216, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.03177071, + "balance_loss_mlp": 1.0704143, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.0352629197197762, + "language_loss": 0.78607392, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.625, + "step": 862, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01229413, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.07291067, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9841587361763113, + "language_loss": 0.88999134, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91296881, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5625, + "step": 863, + "time_per_iteration": 2.506289005279541 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.07635331, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.003999649515418, + "language_loss": 0.7575798, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.625, + "step": 864, + "time_per_iteration": 2.515944480895996 + }, + { + "auxiliary_loss_clip": 0.01236545, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.04228067, + "balance_loss_mlp": 1.07355332, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9298630836237705, + "language_loss": 0.7919569, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81501746, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6328125, + "step": 865, + "time_per_iteration": 2.485499620437622 + }, + { + "auxiliary_loss_clip": 0.0123268, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.04144871, + "balance_loss_mlp": 1.07079291, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.366131428952597, + "language_loss": 0.85700798, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88000321, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 866, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01242589, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_clip": 1.03910398, + "balance_loss_mlp": 1.0804987, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.535209572965093, + "language_loss": 0.8680315, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89111662, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 867, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01231644, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.05021977, + "balance_loss_mlp": 1.07513499, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.64188364663517, + "language_loss": 0.63562089, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65867579, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.5625, + "step": 868, + "time_per_iteration": 2.567958354949951 + }, + { + "auxiliary_loss_clip": 0.01236968, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.07263327, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1448269109564198, + "language_loss": 0.83076257, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85379148, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.6484375, + "step": 869, + "time_per_iteration": 2.5021841526031494 + }, + { + "auxiliary_loss_clip": 0.01237154, + "auxiliary_loss_mlp": 0.01057742, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.07245827, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.352948725027126, + "language_loss": 0.87544227, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89839119, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6484375, + "step": 870, + "time_per_iteration": 2.459662437438965 + }, + { + "auxiliary_loss_clip": 0.01238457, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_clip": 1.04135191, + "balance_loss_mlp": 1.07536197, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9212028950510787, + "language_loss": 0.80554998, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82860637, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6328125, + "step": 871, + "time_per_iteration": 2.4701170921325684 + }, + { + "auxiliary_loss_clip": 0.01234905, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.07576704, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.5975290841395262, + "language_loss": 0.81374049, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8367365, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.59375, + "step": 872, + "time_per_iteration": 2.4886369705200195 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.0101489, + "balance_loss_clip": 1.00952566, + "balance_loss_mlp": 1.03955865, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8879269166117758, + "language_loss": 0.61579192, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63705552, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.71875, + "step": 873, + "time_per_iteration": 2.9913430213928223 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01058247, + "balance_loss_clip": 1.03245032, + "balance_loss_mlp": 1.07107997, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8426182555123698, + "language_loss": 0.88426232, + "learning_rate": 3.994669855111643e-06, + "loss": 0.90716141, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 874, + "time_per_iteration": 2.4794461727142334 + }, + { + "auxiliary_loss_clip": 0.0123222, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.06908488, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 2.2494767595307628, + "language_loss": 0.74779439, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77073956, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 875, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.01228414, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.06905699, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.052141253618648, + "language_loss": 0.92836702, + "learning_rate": 3.99461287422531e-06, + "loss": 0.951262, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.59375, + "step": 876, + "time_per_iteration": 2.535587787628174 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01009036, + "balance_loss_clip": 1.00379074, + "balance_loss_mlp": 1.03698087, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.854570032578524, + "language_loss": 0.62934959, + "learning_rate": 3.994584270327722e-06, + "loss": 0.6505053, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.6953125, + "step": 877, + "time_per_iteration": 3.094581127166748 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.06975055, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.154366240232031, + "language_loss": 0.85691291, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 878, + "time_per_iteration": 2.5052285194396973 + }, + { + "auxiliary_loss_clip": 0.01232133, + "auxiliary_loss_mlp": 0.01063559, + "balance_loss_clip": 1.03754723, + "balance_loss_mlp": 1.06974411, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.0833089409086942, + "language_loss": 0.82790506, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85086197, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.625, + "step": 879, + "time_per_iteration": 2.564312219619751 + }, + { + "auxiliary_loss_clip": 0.01227867, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04011488, + "balance_loss_mlp": 1.06966615, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 4.271066320440391, + "language_loss": 0.84404933, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86699677, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 880, + "time_per_iteration": 2.4854133129119873 + }, + { + "auxiliary_loss_clip": 0.01233797, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.04360688, + "balance_loss_mlp": 1.07206059, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 3.515636761469604, + "language_loss": 0.87156737, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89460361, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 881, + "time_per_iteration": 2.476846933364868 + }, + { + "auxiliary_loss_clip": 0.01228751, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.06813371, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9345214626214409, + "language_loss": 0.87682849, + "learning_rate": 3.994440116339046e-06, + "loss": 0.89977539, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.609375, + "step": 882, + "time_per_iteration": 2.6449031829833984 + }, + { + "auxiliary_loss_clip": 0.01233714, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_clip": 1.03825057, + "balance_loss_mlp": 1.07030129, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.7245054008776814, + "language_loss": 0.68869275, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71168661, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6328125, + "step": 883, + "time_per_iteration": 2.620363235473633 + }, + { + "auxiliary_loss_clip": 0.01225388, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.06937146, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.9628498458506696, + "language_loss": 0.75887203, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78173113, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5546875, + "step": 884, + "time_per_iteration": 2.4948067665100098 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.06921601, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.00306560312032, + "language_loss": 0.85323638, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 885, + "time_per_iteration": 2.5159530639648438 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01068973, + "balance_loss_clip": 1.04205549, + "balance_loss_mlp": 1.06673646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6316893825734344, + "language_loss": 0.85726082, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88023585, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6171875, + "step": 886, + "time_per_iteration": 2.4650700092315674 + }, + { + "auxiliary_loss_clip": 0.01226585, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.06944001, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1517488326805214, + "language_loss": 0.89229804, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91522843, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5703125, + "step": 887, + "time_per_iteration": 2.5020337104797363 + }, + { + "auxiliary_loss_clip": 0.01227687, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.04270935, + "balance_loss_mlp": 1.06604195, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.2836036404275593, + "language_loss": 0.75076836, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77375484, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6171875, + "step": 888, + "time_per_iteration": 2.5055694580078125 + }, + { + "auxiliary_loss_clip": 0.0123028, + "auxiliary_loss_mlp": 0.0107639, + "balance_loss_clip": 1.0480895, + "balance_loss_mlp": 1.07113457, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.032914331295681, + "language_loss": 0.88330352, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90637028, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.59375, + "step": 889, + "time_per_iteration": 2.5147650241851807 + }, + { + "auxiliary_loss_clip": 0.01222875, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.02748489, + "balance_loss_mlp": 1.06732821, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9726085703824752, + "language_loss": 0.88269985, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90546036, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5546875, + "step": 890, + "time_per_iteration": 2.490300416946411 + }, + { + "auxiliary_loss_clip": 0.01225662, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.04612982, + "balance_loss_mlp": 1.06690812, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.320271972022273, + "language_loss": 0.93251556, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95548671, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 891, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.06682086, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.238230674372026, + "language_loss": 0.71759057, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74046671, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5859375, + "step": 892, + "time_per_iteration": 2.5544779300689697 + }, + { + "auxiliary_loss_clip": 0.01229119, + "auxiliary_loss_mlp": 0.01067529, + "balance_loss_clip": 1.0421617, + "balance_loss_mlp": 1.06946719, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.3204520758070037, + "language_loss": 0.82304287, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84600937, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6015625, + "step": 893, + "time_per_iteration": 5.3903117179870605 + }, + { + "auxiliary_loss_clip": 0.0122945, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.04430699, + "balance_loss_mlp": 1.0679965, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.3808217776212937, + "language_loss": 0.81695569, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83995366, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.609375, + "step": 894, + "time_per_iteration": 2.52809476852417 + }, + { + "auxiliary_loss_clip": 0.01227471, + "auxiliary_loss_mlp": 0.01065449, + "balance_loss_clip": 1.03915119, + "balance_loss_mlp": 1.06881404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.5337894710206093, + "language_loss": 0.76043701, + "learning_rate": 3.994056467679221e-06, + "loss": 0.7833662, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 895, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.01238307, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03022122, + "balance_loss_mlp": 1.07260597, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.2065839001211156, + "language_loss": 0.86456096, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88751751, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.65625, + "step": 896, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01231325, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.03495908, + "balance_loss_mlp": 1.06809413, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.1680285530564274, + "language_loss": 0.87949234, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90243232, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6328125, + "step": 897, + "time_per_iteration": 2.457918167114258 + }, + { + "auxiliary_loss_clip": 0.0122574, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.06723523, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7359050724031848, + "language_loss": 0.9035244, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9264195, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.5859375, + "step": 898, + "time_per_iteration": 2.4593143463134766 + }, + { + "auxiliary_loss_clip": 0.01234899, + "auxiliary_loss_mlp": 0.01084595, + "balance_loss_clip": 1.05808282, + "balance_loss_mlp": 1.07024622, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.958355519485596, + "language_loss": 0.91756964, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94076455, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6484375, + "step": 899, + "time_per_iteration": 2.4461729526519775 + }, + { + "auxiliary_loss_clip": 0.01225208, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.03964233, + "balance_loss_mlp": 1.06601286, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.6493739136310643, + "language_loss": 0.75594276, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77884829, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 900, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.01231903, + "auxiliary_loss_mlp": 0.01059763, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.06860638, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.2496787705299908, + "language_loss": 0.7377668, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76068342, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.6328125, + "step": 901, + "time_per_iteration": 2.49638032913208 + }, + { + "auxiliary_loss_clip": 0.01221671, + "auxiliary_loss_mlp": 0.01074944, + "balance_loss_clip": 1.04982698, + "balance_loss_mlp": 1.06662059, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.0553503619333586, + "language_loss": 0.85004938, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87301552, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 902, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01226177, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.06769705, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.0002475654879195, + "language_loss": 0.8655951, + "learning_rate": 3.993814024394569e-06, + "loss": 0.8884868, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 903, + "time_per_iteration": 2.522193670272827 + }, + { + "auxiliary_loss_clip": 0.01227512, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.06904316, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.4298091072226855, + "language_loss": 0.74835998, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77125704, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.578125, + "step": 904, + "time_per_iteration": 2.456969976425171 + }, + { + "auxiliary_loss_clip": 0.0123038, + "auxiliary_loss_mlp": 0.01073252, + "balance_loss_clip": 1.04685879, + "balance_loss_mlp": 1.06905615, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.0843949675352356, + "language_loss": 0.85750329, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8805396, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.609375, + "step": 905, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01227222, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.05317712, + "balance_loss_mlp": 1.07247257, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7937911991915148, + "language_loss": 0.74028552, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76334012, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 906, + "time_per_iteration": 2.468331813812256 + }, + { + "auxiliary_loss_clip": 0.01228766, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.03909731, + "balance_loss_mlp": 1.06858826, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.220044948377472, + "language_loss": 0.87410975, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89705634, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6015625, + "step": 907, + "time_per_iteration": 2.5177390575408936 + }, + { + "auxiliary_loss_clip": 0.01227557, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.04521942, + "balance_loss_mlp": 1.07002556, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.8689281211501179, + "language_loss": 0.86915505, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89214909, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.578125, + "step": 908, + "time_per_iteration": 2.45135498046875 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.06842148, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.409525813232516, + "language_loss": 0.89454836, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91748714, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 909, + "time_per_iteration": 2.4702625274658203 + }, + { + "auxiliary_loss_clip": 0.01231345, + "auxiliary_loss_mlp": 0.01075786, + "balance_loss_clip": 1.04773629, + "balance_loss_mlp": 1.06930447, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.4022545211155593, + "language_loss": 0.70942473, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73249602, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.625, + "step": 910, + "time_per_iteration": 2.4530797004699707 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01055797, + "balance_loss_clip": 1.03002357, + "balance_loss_mlp": 1.06815219, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.0100188286094745, + "language_loss": 0.8349818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85778737, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5625, + "step": 911, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.01224017, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.06649613, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.746196883211308, + "language_loss": 0.76096344, + "learning_rate": 3.993535491899736e-06, + "loss": 0.7839244, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 912, + "time_per_iteration": 2.4651522636413574 + }, + { + "auxiliary_loss_clip": 0.01219912, + "auxiliary_loss_mlp": 0.01052416, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.06664968, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.385296939765248, + "language_loss": 0.82667339, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84939671, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 913, + "time_per_iteration": 2.475384473800659 + }, + { + "auxiliary_loss_clip": 0.01224168, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.07065797, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.227172084037845, + "language_loss": 0.83470452, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85756505, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 914, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.03324902, + "balance_loss_mlp": 1.07264161, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.897688985464872, + "language_loss": 0.9010309, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92390066, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5625, + "step": 915, + "time_per_iteration": 2.492981433868408 + }, + { + "auxiliary_loss_clip": 0.01225584, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.03046489, + "balance_loss_mlp": 1.0708915, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.870109983937874, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91836905, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 916, + "time_per_iteration": 2.4621188640594482 + }, + { + "auxiliary_loss_clip": 0.01228011, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.04593801, + "balance_loss_mlp": 1.06942379, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7933741103180343, + "language_loss": 0.80085957, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386243, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 917, + "time_per_iteration": 2.49455189704895 + }, + { + "auxiliary_loss_clip": 0.01225592, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.06678224, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9216560267302982, + "language_loss": 0.79673612, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81957722, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 918, + "time_per_iteration": 2.504734516143799 + }, + { + "auxiliary_loss_clip": 0.01223712, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.06658053, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9394116717498289, + "language_loss": 0.89132315, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91415823, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5703125, + "step": 919, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01219042, + "auxiliary_loss_mlp": 0.01068553, + "balance_loss_clip": 1.0427916, + "balance_loss_mlp": 1.06515777, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.688355226699252, + "language_loss": 0.87421197, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 920, + "time_per_iteration": 2.536914348602295 + }, + { + "auxiliary_loss_clip": 0.01223828, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.06937671, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1255302161497704, + "language_loss": 0.65921712, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68208569, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.546875, + "step": 921, + "time_per_iteration": 2.643416166305542 + }, + { + "auxiliary_loss_clip": 0.01229793, + "auxiliary_loss_mlp": 0.0106877, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.0698204, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.143682946402907, + "language_loss": 0.71841472, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74140036, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.6015625, + "step": 922, + "time_per_iteration": 2.4544074535369873 + }, + { + "auxiliary_loss_clip": 0.0122536, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.0669136, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.766492717488127, + "language_loss": 0.82548857, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84844404, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 923, + "time_per_iteration": 2.490915536880493 + }, + { + "auxiliary_loss_clip": 0.01221243, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.06429458, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2095756655687397, + "language_loss": 0.78808558, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81097853, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5625, + "step": 924, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.0121918, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.06480467, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9513803878946447, + "language_loss": 1.02250028, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04528582, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 925, + "time_per_iteration": 2.5296268463134766 + }, + { + "auxiliary_loss_clip": 0.01220429, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02975261, + "balance_loss_mlp": 1.0634799, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.3756260245044687, + "language_loss": 0.80808276, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83084333, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 926, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01229405, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.06743848, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.4713559623940924, + "language_loss": 0.73378903, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75676566, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 927, + "time_per_iteration": 2.5607478618621826 + }, + { + "auxiliary_loss_clip": 0.01103967, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.00837731, + "balance_loss_mlp": 1.03639269, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7814837823676635, + "language_loss": 0.5989722, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62015712, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.67578125, + "step": 928, + "time_per_iteration": 3.0945305824279785 + }, + { + "auxiliary_loss_clip": 0.01223562, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.04035151, + "balance_loss_mlp": 1.06729245, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.3037954576101587, + "language_loss": 0.95011377, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97301698, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5625, + "step": 929, + "time_per_iteration": 2.527270555496216 + }, + { + "auxiliary_loss_clip": 0.01221186, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.06494856, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1540114832188553, + "language_loss": 0.71827871, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74116725, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.5625, + "step": 930, + "time_per_iteration": 2.57513689994812 + }, + { + "auxiliary_loss_clip": 0.01227654, + "auxiliary_loss_mlp": 0.01062398, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.06905401, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.336481182624628, + "language_loss": 0.85333288, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87623346, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5859375, + "step": 931, + "time_per_iteration": 2.459167957305908 + }, + { + "auxiliary_loss_clip": 0.01224553, + "auxiliary_loss_mlp": 0.01072004, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.06556344, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.9723738142749898, + "language_loss": 0.83577204, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85873753, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.59375, + "step": 932, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.01223225, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.04012322, + "balance_loss_mlp": 1.06712675, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.411257667891357, + "language_loss": 0.73405433, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5625, + "step": 933, + "time_per_iteration": 2.526521682739258 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.04328358, + "balance_loss_mlp": 1.06432819, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.577929883809357, + "language_loss": 0.86850882, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89141059, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5546875, + "step": 934, + "time_per_iteration": 5.338034391403198 + }, + { + "auxiliary_loss_clip": 0.01220003, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.06842983, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2060919587088965, + "language_loss": 0.80243224, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82532918, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 935, + "time_per_iteration": 3.8198087215423584 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01008984, + "balance_loss_clip": 1.00321388, + "balance_loss_mlp": 1.02876139, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8225714537835027, + "language_loss": 0.69179416, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71282923, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.65625, + "step": 936, + "time_per_iteration": 2.9585764408111572 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.01067113, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.06387568, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.5168182860703237, + "language_loss": 0.75900578, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78188324, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 937, + "time_per_iteration": 2.4891855716705322 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01002728, + "balance_loss_clip": 0.99738711, + "balance_loss_mlp": 1.02642298, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8631606334327763, + "language_loss": 0.64287508, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66381979, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.65625, + "step": 938, + "time_per_iteration": 3.0239782333374023 + }, + { + "auxiliary_loss_clip": 0.01226335, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.04487348, + "balance_loss_mlp": 1.06571174, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.570077538128457, + "language_loss": 0.7903074, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81329048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 939, + "time_per_iteration": 2.494706630706787 + }, + { + "auxiliary_loss_clip": 0.012214, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03868759, + "balance_loss_mlp": 1.0669229, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.950609958048397, + "language_loss": 0.73893893, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76179242, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 940, + "time_per_iteration": 2.5279061794281006 + }, + { + "auxiliary_loss_clip": 0.01220257, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_clip": 1.03776574, + "balance_loss_mlp": 1.06722569, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.9142676693922898, + "language_loss": 0.70475829, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72760499, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 941, + "time_per_iteration": 2.551604747772217 + }, + { + "auxiliary_loss_clip": 0.01218348, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.06624675, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.3045436850665917, + "language_loss": 0.80928791, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.515625, + "step": 942, + "time_per_iteration": 2.515646457672119 + }, + { + "auxiliary_loss_clip": 0.01214197, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.062042, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.7900678467193205, + "language_loss": 0.88067353, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.9033941, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 943, + "time_per_iteration": 2.674614191055298 + }, + { + "auxiliary_loss_clip": 0.01220399, + "auxiliary_loss_mlp": 0.01056577, + "balance_loss_clip": 1.03182912, + "balance_loss_mlp": 1.06757212, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.6837069047913924, + "language_loss": 0.75092185, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77369165, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5234375, + "step": 944, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01215674, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.0385294, + "balance_loss_mlp": 1.06267428, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7462690351912153, + "language_loss": 0.79321784, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8160013, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 945, + "time_per_iteration": 2.695613384246826 + }, + { + "auxiliary_loss_clip": 0.01218347, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.03628159, + "balance_loss_mlp": 1.06407309, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.1794845223078556, + "language_loss": 0.82465631, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84745914, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 946, + "time_per_iteration": 2.6081790924072266 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.06615055, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.7693395657309297, + "language_loss": 0.7904911, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8133198, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5703125, + "step": 947, + "time_per_iteration": 2.460472822189331 + }, + { + "auxiliary_loss_clip": 0.01227462, + "auxiliary_loss_mlp": 0.01065027, + "balance_loss_clip": 1.03890848, + "balance_loss_mlp": 1.06883287, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 7.046260534289203, + "language_loss": 0.85772789, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88065279, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 948, + "time_per_iteration": 2.4560892581939697 + }, + { + "auxiliary_loss_clip": 0.01217019, + "auxiliary_loss_mlp": 0.01060985, + "balance_loss_clip": 1.03374553, + "balance_loss_mlp": 1.06329989, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.8055084405958775, + "language_loss": 0.87044799, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89322805, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5390625, + "step": 949, + "time_per_iteration": 2.4843316078186035 + }, + { + "auxiliary_loss_clip": 0.01212611, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.06284809, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.230679935648155, + "language_loss": 0.79035759, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81314665, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4921875, + "step": 950, + "time_per_iteration": 2.468172311782837 + }, + { + "auxiliary_loss_clip": 0.01221984, + "auxiliary_loss_mlp": 0.01068601, + "balance_loss_clip": 1.04365039, + "balance_loss_mlp": 1.06574106, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0871877141587682, + "language_loss": 0.8244521, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 951, + "time_per_iteration": 2.5418505668640137 + }, + { + "auxiliary_loss_clip": 0.01215404, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.06129527, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.5400916768099426, + "language_loss": 0.86685216, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88963258, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5390625, + "step": 952, + "time_per_iteration": 2.513356924057007 + }, + { + "auxiliary_loss_clip": 0.0122001, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.03415656, + "balance_loss_mlp": 1.06145215, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.725154467975805, + "language_loss": 0.79043579, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81326544, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5859375, + "step": 953, + "time_per_iteration": 2.490940570831299 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.03592086, + "balance_loss_mlp": 1.06757712, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2937199779067106, + "language_loss": 0.87086606, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89373398, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5625, + "step": 954, + "time_per_iteration": 2.495039701461792 + }, + { + "auxiliary_loss_clip": 0.01221375, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_clip": 1.03707159, + "balance_loss_mlp": 1.06446028, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3514674671771933, + "language_loss": 0.87789929, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90073651, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 955, + "time_per_iteration": 2.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.06217909, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.7193659196918576, + "language_loss": 0.89682388, + "learning_rate": 3.992085650224914e-06, + "loss": 0.919631, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 956, + "time_per_iteration": 2.43306565284729 + }, + { + "auxiliary_loss_clip": 0.01212174, + "auxiliary_loss_mlp": 0.0105844, + "balance_loss_clip": 1.03232098, + "balance_loss_mlp": 1.06344521, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7667772588634594, + "language_loss": 0.75335747, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77606356, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.484375, + "step": 957, + "time_per_iteration": 2.469240665435791 + }, + { + "auxiliary_loss_clip": 0.01218166, + "auxiliary_loss_mlp": 0.01075955, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.06214452, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8974624224625587, + "language_loss": 0.79871029, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82165146, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5625, + "step": 958, + "time_per_iteration": 2.5016849040985107 + }, + { + "auxiliary_loss_clip": 0.01214009, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.06024444, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5702669091422234, + "language_loss": 0.88410264, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90686285, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.5390625, + "step": 959, + "time_per_iteration": 2.4830191135406494 + }, + { + "auxiliary_loss_clip": 0.01211651, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.0626018, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.6997220185951347, + "language_loss": 0.78556621, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083173, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4921875, + "step": 960, + "time_per_iteration": 2.569218397140503 + }, + { + "auxiliary_loss_clip": 0.01217172, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.06168103, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 4.159271492638429, + "language_loss": 0.932491, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95529813, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5546875, + "step": 961, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01215042, + "auxiliary_loss_mlp": 0.01070899, + "balance_loss_clip": 1.04411268, + "balance_loss_mlp": 1.06039667, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.532017623976099, + "language_loss": 0.6822986, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70515805, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.546875, + "step": 962, + "time_per_iteration": 2.544498920440674 + }, + { + "auxiliary_loss_clip": 0.01214012, + "auxiliary_loss_mlp": 0.01068596, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.06268489, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.445305128304827, + "language_loss": 0.88187808, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90470415, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.515625, + "step": 963, + "time_per_iteration": 2.459487199783325 + }, + { + "auxiliary_loss_clip": 0.01222623, + "auxiliary_loss_mlp": 0.01058866, + "balance_loss_clip": 1.03337944, + "balance_loss_mlp": 1.06633568, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.5656796350524473, + "language_loss": 0.84858835, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87140322, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 964, + "time_per_iteration": 2.5268235206604004 + }, + { + "auxiliary_loss_clip": 0.01216658, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_clip": 1.04157782, + "balance_loss_mlp": 1.06309247, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 2.846103019544017, + "language_loss": 0.77748007, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80032492, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5390625, + "step": 965, + "time_per_iteration": 2.4572315216064453 + }, + { + "auxiliary_loss_clip": 0.01211478, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.04424393, + "balance_loss_mlp": 1.0614084, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4479010977704463, + "language_loss": 0.80922461, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83202475, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5, + "step": 966, + "time_per_iteration": 2.4682776927948 + }, + { + "auxiliary_loss_clip": 0.01212307, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.06173599, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8643875206872442, + "language_loss": 0.76291096, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78565276, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.5, + "step": 967, + "time_per_iteration": 2.453474521636963 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.02152586, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7926144837125159, + "language_loss": 0.57362092, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59487474, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.05102539, + "router_z_loss_mlp": 0.6328125, + "step": 968, + "time_per_iteration": 2.994419574737549 + }, + { + "auxiliary_loss_clip": 0.01218807, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02865148, + "balance_loss_mlp": 1.06574845, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.057389892616485, + "language_loss": 0.82289147, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84563303, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 969, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01217673, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.03105259, + "balance_loss_mlp": 1.06392384, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1897875503845725, + "language_loss": 0.780442, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 970, + "time_per_iteration": 2.5632171630859375 + }, + { + "auxiliary_loss_clip": 0.01216631, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_clip": 1.02809155, + "balance_loss_mlp": 1.06188202, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6802242915962, + "language_loss": 0.92492616, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94764245, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 971, + "time_per_iteration": 2.4642531871795654 + }, + { + "auxiliary_loss_clip": 0.01210603, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.03439212, + "balance_loss_mlp": 1.05865097, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 3.0470884327064276, + "language_loss": 0.86133701, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88404, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 972, + "time_per_iteration": 2.5298526287078857 + }, + { + "auxiliary_loss_clip": 0.01212752, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_clip": 1.04038596, + "balance_loss_mlp": 1.0636549, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0754734138997906, + "language_loss": 0.87340444, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89617872, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4921875, + "step": 973, + "time_per_iteration": 2.5198311805725098 + }, + { + "auxiliary_loss_clip": 0.01213937, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_clip": 1.04070425, + "balance_loss_mlp": 1.06140256, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.2539468590332707, + "language_loss": 0.74868345, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77149546, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5234375, + "step": 974, + "time_per_iteration": 2.465268850326538 + }, + { + "auxiliary_loss_clip": 0.0121359, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.03308713, + "balance_loss_mlp": 1.06260133, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.7891188847385684, + "language_loss": 0.76707923, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78980577, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 975, + "time_per_iteration": 2.633850336074829 + }, + { + "auxiliary_loss_clip": 0.01216778, + "auxiliary_loss_mlp": 0.01068456, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.0621978, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0981769673049326, + "language_loss": 0.76878488, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79163718, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 976, + "time_per_iteration": 6.8309245109558105 + }, + { + "auxiliary_loss_clip": 0.01210296, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.0585494, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8109666318996334, + "language_loss": 0.87465948, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89737761, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 977, + "time_per_iteration": 2.5693395137786865 + }, + { + "auxiliary_loss_clip": 0.01213396, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.06246471, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.7886661734827753, + "language_loss": 0.79517525, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5078125, + "step": 978, + "time_per_iteration": 2.51609206199646 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.04339027, + "balance_loss_mlp": 1.06304932, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.6270410794651102, + "language_loss": 0.80902123, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.546875, + "step": 979, + "time_per_iteration": 2.527127504348755 + }, + { + "auxiliary_loss_clip": 0.01085971, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.0044651, + "balance_loss_mlp": 1.02304745, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.94528472512207, + "language_loss": 0.59059429, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61154944, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.62890625, + "step": 980, + "time_per_iteration": 2.9545915126800537 + }, + { + "auxiliary_loss_clip": 0.01210703, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.03747201, + "balance_loss_mlp": 1.0622623, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3915266710240917, + "language_loss": 0.86397457, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88672185, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.484375, + "step": 981, + "time_per_iteration": 2.4726293087005615 + }, + { + "auxiliary_loss_clip": 0.01212695, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.06214404, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 1.9485203495729437, + "language_loss": 0.79623365, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81893563, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.5078125, + "step": 982, + "time_per_iteration": 2.5271458625793457 + }, + { + "auxiliary_loss_clip": 0.01219179, + "auxiliary_loss_mlp": 0.01060762, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.06248748, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.5320957946125437, + "language_loss": 0.84376037, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86655974, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 983, + "time_per_iteration": 2.526364803314209 + }, + { + "auxiliary_loss_clip": 0.01212847, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_clip": 1.04361129, + "balance_loss_mlp": 1.06317604, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8446015864025267, + "language_loss": 0.84607553, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86887848, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.4921875, + "step": 984, + "time_per_iteration": 2.456803321838379 + }, + { + "auxiliary_loss_clip": 0.01211466, + "auxiliary_loss_mlp": 0.01059154, + "balance_loss_clip": 1.03551483, + "balance_loss_mlp": 1.06338882, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.3276500524021495, + "language_loss": 0.77875566, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80146182, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.484375, + "step": 985, + "time_per_iteration": 2.504096508026123 + }, + { + "auxiliary_loss_clip": 0.01215785, + "auxiliary_loss_mlp": 0.01061307, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.06191659, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.294716701848832, + "language_loss": 0.90598249, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92875338, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.5390625, + "step": 986, + "time_per_iteration": 2.4882049560546875 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.01062373, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.06017947, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.8181645576894256, + "language_loss": 0.7589798, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78175771, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 987, + "time_per_iteration": 2.492412805557251 + }, + { + "auxiliary_loss_clip": 0.01216653, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.03491902, + "balance_loss_mlp": 1.06059265, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.1447391932017843, + "language_loss": 0.71525705, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73802304, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 988, + "time_per_iteration": 2.6386756896972656 + }, + { + "auxiliary_loss_clip": 0.01081383, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_clip": 1.00680876, + "balance_loss_mlp": 1.01888978, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9344259157338769, + "language_loss": 0.71159971, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73253405, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.625, + "step": 989, + "time_per_iteration": 2.903996706008911 + }, + { + "auxiliary_loss_clip": 0.01219656, + "auxiliary_loss_mlp": 0.01067443, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.06221163, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.89069901477269, + "language_loss": 0.78102934, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80390036, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.578125, + "step": 990, + "time_per_iteration": 2.6252431869506836 + }, + { + "auxiliary_loss_clip": 0.01208224, + "auxiliary_loss_mlp": 0.0105602, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05700588, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.077710223302236, + "language_loss": 0.86406755, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88671005, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.515625, + "step": 991, + "time_per_iteration": 2.483853340148926 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04030573, + "balance_loss_mlp": 1.06190968, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.866628977756486, + "language_loss": 0.76876801, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79158413, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 992, + "time_per_iteration": 2.5149648189544678 + }, + { + "auxiliary_loss_clip": 0.01214781, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03701937, + "balance_loss_mlp": 1.06251192, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.726921793738851, + "language_loss": 0.74594641, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76869899, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.5234375, + "step": 993, + "time_per_iteration": 2.4739816188812256 + }, + { + "auxiliary_loss_clip": 0.01214249, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.06326771, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 3.2517233877247396, + "language_loss": 0.78911841, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81197453, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 994, + "time_per_iteration": 2.5408835411071777 + }, + { + "auxiliary_loss_clip": 0.01214677, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.05768251, + "balance_loss_mlp": 1.06170893, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.42517884603863, + "language_loss": 0.79639304, + "learning_rate": 3.99067829878596e-06, + "loss": 0.81936711, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 995, + "time_per_iteration": 2.5062758922576904 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.05969059, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.536496545288829, + "language_loss": 0.86939722, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89217806, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 996, + "time_per_iteration": 2.5236001014709473 + }, + { + "auxiliary_loss_clip": 0.01217352, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.06309104, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.013698471354103, + "language_loss": 0.88192105, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90479505, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.546875, + "step": 997, + "time_per_iteration": 2.483116626739502 + }, + { + "auxiliary_loss_clip": 0.01079761, + "auxiliary_loss_mlp": 0.01004407, + "balance_loss_clip": 0.9993524, + "balance_loss_mlp": 1.01837301, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.020759515587473, + "language_loss": 0.75442117, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77526283, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.6171875, + "step": 998, + "time_per_iteration": 3.152331590652466 + }, + { + "auxiliary_loss_clip": 0.01213812, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.0626508, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8375420281697645, + "language_loss": 0.75796127, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7807765, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 999, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01212853, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.03575778, + "balance_loss_mlp": 1.05894446, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.9091686508511199, + "language_loss": 0.82658899, + "learning_rate": 3.990489563834943e-06, + "loss": 0.8493402, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5390625, + "step": 1000, + "time_per_iteration": 2.5369935035705566 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.06143069, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.4065508827059783, + "language_loss": 0.85644853, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8791759, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5390625, + "step": 1001, + "time_per_iteration": 2.4972190856933594 + }, + { + "auxiliary_loss_clip": 0.0120879, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.0587517, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.156321640703371, + "language_loss": 0.74386394, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5, + "step": 1002, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01211576, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.04019034, + "balance_loss_mlp": 1.06015134, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.1165374575777145, + "language_loss": 0.75346643, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77624118, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1003, + "time_per_iteration": 2.508817434310913 + }, + { + "auxiliary_loss_clip": 0.01219434, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.06255794, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2578292515807603, + "language_loss": 0.70071733, + "learning_rate": 3.990337217233437e-06, + "loss": 0.723571, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 1004, + "time_per_iteration": 2.480116844177246 + }, + { + "auxiliary_loss_clip": 0.01218526, + "auxiliary_loss_mlp": 0.01073584, + "balance_loss_clip": 1.04810917, + "balance_loss_mlp": 1.06360686, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.248554137518493, + "language_loss": 0.83246684, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85538793, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 1005, + "time_per_iteration": 2.449733018875122 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00306416, + "balance_loss_mlp": 1.0157814, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8959746990508154, + "language_loss": 0.59000289, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61085355, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.6171875, + "step": 1006, + "time_per_iteration": 3.1583423614501953 + }, + { + "auxiliary_loss_clip": 0.01209886, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.03203392, + "balance_loss_mlp": 1.05658197, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.271524805944984, + "language_loss": 0.7428897, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.53125, + "step": 1007, + "time_per_iteration": 2.49139666557312 + }, + { + "auxiliary_loss_clip": 0.01212867, + "auxiliary_loss_mlp": 0.01055047, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.05897522, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8583948299039934, + "language_loss": 0.80739897, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83007812, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 1008, + "time_per_iteration": 2.4990036487579346 + }, + { + "auxiliary_loss_clip": 0.01213893, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.06254637, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.935763632111394, + "language_loss": 0.77840835, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80110532, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.515625, + "step": 1009, + "time_per_iteration": 2.4785048961639404 + }, + { + "auxiliary_loss_clip": 0.01210213, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.06082368, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1058592784097567, + "language_loss": 0.93059653, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95329368, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4921875, + "step": 1010, + "time_per_iteration": 2.507596015930176 + }, + { + "auxiliary_loss_clip": 0.01219036, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.05885124, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1716667034247843, + "language_loss": 0.71846473, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74131954, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6015625, + "step": 1011, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01214432, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.05902421, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.5871469840663535, + "language_loss": 0.87542284, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89827204, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5546875, + "step": 1012, + "time_per_iteration": 2.4876151084899902 + }, + { + "auxiliary_loss_clip": 0.01206171, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05505085, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8956263482043672, + "language_loss": 0.76679665, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78946191, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 1013, + "time_per_iteration": 2.4874446392059326 + }, + { + "auxiliary_loss_clip": 0.01215089, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.05924904, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.654718290448769, + "language_loss": 0.85651302, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87933445, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5546875, + "step": 1014, + "time_per_iteration": 2.483774423599243 + }, + { + "auxiliary_loss_clip": 0.0122011, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_clip": 1.03996944, + "balance_loss_mlp": 1.06207335, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4287988001966028, + "language_loss": 0.72807163, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75094855, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.578125, + "step": 1015, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01207162, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.0576005, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6555956389633335, + "language_loss": 0.79197502, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8147307, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4921875, + "step": 1016, + "time_per_iteration": 2.5177054405212402 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.0571332, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.934405213560846, + "language_loss": 0.76170123, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78440881, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.53125, + "step": 1017, + "time_per_iteration": 2.517730951309204 + }, + { + "auxiliary_loss_clip": 0.01220983, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.06240773, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.873264658326973, + "language_loss": 0.86145842, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436329, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 1018, + "time_per_iteration": 5.324457883834839 + }, + { + "auxiliary_loss_clip": 0.01206709, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.045012, + "balance_loss_mlp": 1.05659163, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.696758126666256, + "language_loss": 0.77535981, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79814154, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5, + "step": 1019, + "time_per_iteration": 2.453768253326416 + }, + { + "auxiliary_loss_clip": 0.01210848, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.05749679, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.8458417378275351, + "language_loss": 0.84254557, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86526895, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 1020, + "time_per_iteration": 2.5126123428344727 + }, + { + "auxiliary_loss_clip": 0.01217116, + "auxiliary_loss_mlp": 0.01060663, + "balance_loss_clip": 1.0352596, + "balance_loss_mlp": 1.06234074, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.186416819505148, + "language_loss": 0.79234397, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81512177, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1021, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.01207219, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.05748677, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.2026341390443434, + "language_loss": 0.87493509, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89765131, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.5, + "step": 1022, + "time_per_iteration": 2.441298007965088 + }, + { + "auxiliary_loss_clip": 0.01213359, + "auxiliary_loss_mlp": 0.0106856, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.06052542, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.752710779550117, + "language_loss": 0.82776564, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85058486, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 1023, + "time_per_iteration": 2.5027952194213867 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 0.99944335, + "balance_loss_mlp": 1.01796818, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8999264202466762, + "language_loss": 0.65078986, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67162001, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.0456543, + "router_z_loss_mlp": 0.609375, + "step": 1024, + "time_per_iteration": 3.0969655513763428 + }, + { + "auxiliary_loss_clip": 0.01212272, + "auxiliary_loss_mlp": 0.01066841, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.05936897, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.9303372998519377, + "language_loss": 0.88293028, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90572149, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 1025, + "time_per_iteration": 2.5229876041412354 + }, + { + "auxiliary_loss_clip": 0.01212316, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.05916524, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.519276165786755, + "language_loss": 0.84567487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86839235, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 1026, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.01212365, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.04614556, + "balance_loss_mlp": 1.05798197, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9431802827698534, + "language_loss": 0.82320756, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84604132, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 1027, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.01209611, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.05799866, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.262386050001272, + "language_loss": 0.84232426, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86500365, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1028, + "time_per_iteration": 2.4485137462615967 + }, + { + "auxiliary_loss_clip": 0.01077664, + "auxiliary_loss_mlp": 0.01009618, + "balance_loss_clip": 1.00544536, + "balance_loss_mlp": 1.01686025, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9476883841381922, + "language_loss": 0.60497737, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6258502, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.609375, + "step": 1029, + "time_per_iteration": 2.8714137077331543 + }, + { + "auxiliary_loss_clip": 0.0120304, + "auxiliary_loss_mlp": 0.01066238, + "balance_loss_clip": 1.0419786, + "balance_loss_mlp": 1.05338669, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.297452518318954, + "language_loss": 0.82309926, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84579194, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4921875, + "step": 1030, + "time_per_iteration": 2.4705348014831543 + }, + { + "auxiliary_loss_clip": 0.01214194, + "auxiliary_loss_mlp": 0.01071397, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.06025672, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.391039807046215, + "language_loss": 0.80262065, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82547653, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1031, + "time_per_iteration": 2.447964906692505 + }, + { + "auxiliary_loss_clip": 0.0121101, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.05865717, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.6245278130098144, + "language_loss": 0.77141201, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79427713, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5234375, + "step": 1032, + "time_per_iteration": 2.475891590118408 + }, + { + "auxiliary_loss_clip": 0.01205906, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_clip": 1.04350805, + "balance_loss_mlp": 1.05307126, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.949793190746779, + "language_loss": 0.89276892, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91552204, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1033, + "time_per_iteration": 2.5332658290863037 + }, + { + "auxiliary_loss_clip": 0.01212647, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03437293, + "balance_loss_mlp": 1.05739737, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.160025730572359, + "language_loss": 0.84795135, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87066996, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5546875, + "step": 1034, + "time_per_iteration": 2.507636785507202 + }, + { + "auxiliary_loss_clip": 0.01202421, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.03399241, + "balance_loss_mlp": 1.05694687, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 3.176440156188905, + "language_loss": 0.81156218, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83418697, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.453125, + "step": 1035, + "time_per_iteration": 2.624635696411133 + }, + { + "auxiliary_loss_clip": 0.01212161, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_clip": 1.04051828, + "balance_loss_mlp": 1.06080353, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.252599829484163, + "language_loss": 0.78701359, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80981934, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.515625, + "step": 1036, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01204167, + "auxiliary_loss_mlp": 0.01068533, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.05620134, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.670767972712633, + "language_loss": 0.86802149, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8907485, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1037, + "time_per_iteration": 2.506011724472046 + }, + { + "auxiliary_loss_clip": 0.01206019, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.02990723, + "balance_loss_mlp": 1.05728471, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.1914513209480933, + "language_loss": 0.81051469, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83314991, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1038, + "time_per_iteration": 2.486758232116699 + }, + { + "auxiliary_loss_clip": 0.01205947, + "auxiliary_loss_mlp": 0.01072566, + "balance_loss_clip": 1.04587555, + "balance_loss_mlp": 1.05856836, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.3663261426095965, + "language_loss": 0.85336804, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87615323, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1039, + "time_per_iteration": 2.489241123199463 + }, + { + "auxiliary_loss_clip": 0.01207559, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.05744672, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.9774289629637263, + "language_loss": 0.80853289, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83128488, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5, + "step": 1040, + "time_per_iteration": 2.480022668838501 + }, + { + "auxiliary_loss_clip": 0.01213203, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.06227219, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.535271913081881, + "language_loss": 0.69440711, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71721661, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5078125, + "step": 1041, + "time_per_iteration": 2.5417978763580322 + }, + { + "auxiliary_loss_clip": 0.01210541, + "auxiliary_loss_mlp": 0.0106006, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.05743289, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9564735382917973, + "language_loss": 0.80983013, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83253616, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.53125, + "step": 1042, + "time_per_iteration": 2.478926181793213 + }, + { + "auxiliary_loss_clip": 0.01210242, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.05925727, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.9466384226705415, + "language_loss": 0.76463902, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78732038, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.515625, + "step": 1043, + "time_per_iteration": 2.6262781620025635 + }, + { + "auxiliary_loss_clip": 0.01203702, + "auxiliary_loss_mlp": 0.01066445, + "balance_loss_clip": 1.04174471, + "balance_loss_mlp": 1.05835676, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8860277298285366, + "language_loss": 0.92454541, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94724691, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1044, + "time_per_iteration": 2.4886953830718994 + }, + { + "auxiliary_loss_clip": 0.01204359, + "auxiliary_loss_mlp": 0.01073486, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.05475259, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.9539908597303346, + "language_loss": 0.8581354, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88091385, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5, + "step": 1045, + "time_per_iteration": 2.5382347106933594 + }, + { + "auxiliary_loss_clip": 0.01203094, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.04900479, + "balance_loss_mlp": 1.05618775, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.0798822187092094, + "language_loss": 0.77675486, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.79952335, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.46875, + "step": 1046, + "time_per_iteration": 2.548157215118408 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.01074859, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.05837655, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.197016946040243, + "language_loss": 0.77317166, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79598629, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4765625, + "step": 1047, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.0121283, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.05874014, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.3103480986625753, + "language_loss": 0.7696203, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79236162, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1048, + "time_per_iteration": 2.636072874069214 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.0583266, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.2069714466600656, + "language_loss": 0.77757037, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80039394, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1049, + "time_per_iteration": 2.5173420906066895 + }, + { + "auxiliary_loss_clip": 0.01207985, + "auxiliary_loss_mlp": 0.01065489, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.05734015, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.316298014027776, + "language_loss": 0.83165503, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85438979, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5078125, + "step": 1050, + "time_per_iteration": 2.4742541313171387 + }, + { + "auxiliary_loss_clip": 0.01204381, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.05776763, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.1475970013183563, + "language_loss": 0.76909173, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79176152, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1051, + "time_per_iteration": 2.4629740715026855 + }, + { + "auxiliary_loss_clip": 0.01207556, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.05788827, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.090947022989376, + "language_loss": 0.80053556, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82331514, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4921875, + "step": 1052, + "time_per_iteration": 2.4729230403900146 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01064567, + "balance_loss_clip": 1.03911567, + "balance_loss_mlp": 1.05839717, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.21177767113968, + "language_loss": 0.78088665, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362272, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5078125, + "step": 1053, + "time_per_iteration": 2.433969736099243 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.0578481, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8421697124920164, + "language_loss": 0.84737611, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8700186, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.515625, + "step": 1054, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.01205973, + "auxiliary_loss_mlp": 0.01065192, + "balance_loss_clip": 1.04051518, + "balance_loss_mlp": 1.05870843, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.9255333357469135, + "language_loss": 0.8566432, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87935483, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4765625, + "step": 1055, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.0121179, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_clip": 1.0451932, + "balance_loss_mlp": 1.05891657, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.390503126540762, + "language_loss": 0.80966836, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83249724, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1056, + "time_per_iteration": 2.4944088459014893 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.05503476, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.70684555522199, + "language_loss": 0.81153649, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83431304, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 1057, + "time_per_iteration": 2.5327882766723633 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03401923, + "balance_loss_mlp": 1.054492, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 2.2830641052403826, + "language_loss": 0.8369416, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85947585, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.421875, + "step": 1058, + "time_per_iteration": 2.4742424488067627 + }, + { + "auxiliary_loss_clip": 0.01208572, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.05714464, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 1.9712110015930453, + "language_loss": 0.87264961, + "learning_rate": 3.988120036328651e-06, + "loss": 0.8954125, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.515625, + "step": 1059, + "time_per_iteration": 5.514882564544678 + }, + { + "auxiliary_loss_clip": 0.01213823, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.04273927, + "balance_loss_mlp": 1.06130195, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.227642611819728, + "language_loss": 0.9117676, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9345876, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 1060, + "time_per_iteration": 3.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01204952, + "auxiliary_loss_mlp": 0.01062848, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.05582809, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9159755464944204, + "language_loss": 0.87713706, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89981508, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4921875, + "step": 1061, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01213048, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.05683804, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.167309005799961, + "language_loss": 0.771905, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79469687, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5625, + "step": 1062, + "time_per_iteration": 2.5576398372650146 + }, + { + "auxiliary_loss_clip": 0.01206834, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 1.05504322, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.0414192004570872, + "language_loss": 0.86835265, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89105946, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1063, + "time_per_iteration": 2.472473382949829 + }, + { + "auxiliary_loss_clip": 0.01206458, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.05539751, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.0150359019026185, + "language_loss": 0.8051579, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82785529, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1064, + "time_per_iteration": 2.478205919265747 + }, + { + "auxiliary_loss_clip": 0.01207278, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.04409075, + "balance_loss_mlp": 1.05682254, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.254194289767691, + "language_loss": 0.84650666, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1065, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01207067, + "auxiliary_loss_mlp": 0.01055171, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.05966115, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.66169186591579, + "language_loss": 0.68201709, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70463943, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.46875, + "step": 1066, + "time_per_iteration": 2.6294829845428467 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01076738, + "balance_loss_clip": 1.05003476, + "balance_loss_mlp": 1.05877519, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.704601442813356, + "language_loss": 0.90345579, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9262861, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1067, + "time_per_iteration": 2.459721565246582 + }, + { + "auxiliary_loss_clip": 0.01207052, + "auxiliary_loss_mlp": 0.01068129, + "balance_loss_clip": 1.04364336, + "balance_loss_mlp": 1.05625772, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.8684947664405436, + "language_loss": 0.8343029, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.515625, + "step": 1068, + "time_per_iteration": 2.4611129760742188 + }, + { + "auxiliary_loss_clip": 0.01205753, + "auxiliary_loss_mlp": 0.01064379, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.05991328, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.4683216708617053, + "language_loss": 0.89402264, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91672397, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.453125, + "step": 1069, + "time_per_iteration": 2.486241340637207 + }, + { + "auxiliary_loss_clip": 0.01206711, + "auxiliary_loss_mlp": 0.01082225, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 1.05718124, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6076700233042396, + "language_loss": 0.95764256, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98053193, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5, + "step": 1070, + "time_per_iteration": 2.413357734680176 + }, + { + "auxiliary_loss_clip": 0.01209924, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.0309608, + "balance_loss_mlp": 1.05859673, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8004745601001504, + "language_loss": 0.8819589, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90463126, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.515625, + "step": 1071, + "time_per_iteration": 2.4717295169830322 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.056633, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6498592642907823, + "language_loss": 0.75996184, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78252238, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.4765625, + "step": 1072, + "time_per_iteration": 2.486936330795288 + }, + { + "auxiliary_loss_clip": 0.01207782, + "auxiliary_loss_mlp": 0.010661, + "balance_loss_clip": 1.03951669, + "balance_loss_mlp": 1.05679154, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.95165590675185, + "language_loss": 0.80415034, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82688916, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1073, + "time_per_iteration": 2.476189613342285 + }, + { + "auxiliary_loss_clip": 0.01200054, + "auxiliary_loss_mlp": 0.01059954, + "balance_loss_clip": 1.03563547, + "balance_loss_mlp": 1.05634785, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7105520573330508, + "language_loss": 0.80205524, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82465529, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4375, + "step": 1074, + "time_per_iteration": 2.499133586883545 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.03469074, + "balance_loss_mlp": 1.05560029, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.398999995550556, + "language_loss": 0.79203326, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81468183, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1075, + "time_per_iteration": 2.46777606010437 + }, + { + "auxiliary_loss_clip": 0.01207545, + "auxiliary_loss_mlp": 0.01064646, + "balance_loss_clip": 1.04086363, + "balance_loss_mlp": 1.05960226, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.7671348430420712, + "language_loss": 0.87819242, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90091443, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.484375, + "step": 1076, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01199028, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.02918351, + "balance_loss_mlp": 1.05429745, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.1388407300528534, + "language_loss": 0.80692923, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82945681, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1077, + "time_per_iteration": 2.4290995597839355 + }, + { + "auxiliary_loss_clip": 0.01211867, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.04566646, + "balance_loss_mlp": 1.05862093, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.618517400605346, + "language_loss": 0.91640681, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93924248, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.53125, + "step": 1078, + "time_per_iteration": 2.500995635986328 + }, + { + "auxiliary_loss_clip": 0.01212712, + "auxiliary_loss_mlp": 0.01062475, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.05874825, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.106125999672554, + "language_loss": 0.78772497, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81047684, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1079, + "time_per_iteration": 2.4510841369628906 + }, + { + "auxiliary_loss_clip": 0.01204732, + "auxiliary_loss_mlp": 0.01061369, + "balance_loss_clip": 1.03619218, + "balance_loss_mlp": 1.05602205, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.051955253501364, + "language_loss": 0.69555283, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7182138, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1080, + "time_per_iteration": 2.5024302005767822 + }, + { + "auxiliary_loss_clip": 0.01204586, + "auxiliary_loss_mlp": 0.01063302, + "balance_loss_clip": 1.03649211, + "balance_loss_mlp": 1.05477285, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.85895294752556, + "language_loss": 0.72094852, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74362737, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5, + "step": 1081, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01201777, + "auxiliary_loss_mlp": 0.01064533, + "balance_loss_clip": 1.03867674, + "balance_loss_mlp": 1.0554111, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6422342029105863, + "language_loss": 0.84621316, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86887628, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.46875, + "step": 1082, + "time_per_iteration": 2.459564447402954 + }, + { + "auxiliary_loss_clip": 0.01214386, + "auxiliary_loss_mlp": 0.01067955, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.05817008, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.177850298461163, + "language_loss": 0.8303026, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85312605, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5625, + "step": 1083, + "time_per_iteration": 2.504584550857544 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.05794787, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6002614807121227, + "language_loss": 0.79689312, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81960905, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.484375, + "step": 1084, + "time_per_iteration": 2.4530820846557617 + }, + { + "auxiliary_loss_clip": 0.01204762, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.05634058, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.1191367521188074, + "language_loss": 0.66211331, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68476963, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1085, + "time_per_iteration": 2.5733256340026855 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.05400848, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.9997547556569089, + "language_loss": 0.76998973, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79266769, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1086, + "time_per_iteration": 2.4958763122558594 + }, + { + "auxiliary_loss_clip": 0.01199669, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.03763306, + "balance_loss_mlp": 1.05291176, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1546414392836977, + "language_loss": 0.85154319, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87417287, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1087, + "time_per_iteration": 2.4456934928894043 + }, + { + "auxiliary_loss_clip": 0.01204231, + "auxiliary_loss_mlp": 0.01061167, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.05594206, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7775330808837086, + "language_loss": 0.77970594, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80235994, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1088, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01204134, + "auxiliary_loss_mlp": 0.01066637, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.05602646, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.9036978890371752, + "language_loss": 0.71191919, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73462689, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.484375, + "step": 1089, + "time_per_iteration": 2.4569168090820312 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.03566289, + "balance_loss_mlp": 1.05729651, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7924808842614686, + "language_loss": 0.85504186, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.8776831, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1090, + "time_per_iteration": 2.4624812602996826 + }, + { + "auxiliary_loss_clip": 0.01204567, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.05594897, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.2382380061135945, + "language_loss": 0.72027361, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74294031, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.484375, + "step": 1091, + "time_per_iteration": 2.4911999702453613 + }, + { + "auxiliary_loss_clip": 0.01201014, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05507159, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.7948943762047525, + "language_loss": 0.82525271, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8478815, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4609375, + "step": 1092, + "time_per_iteration": 2.510835886001587 + }, + { + "auxiliary_loss_clip": 0.01205888, + "auxiliary_loss_mlp": 0.01064535, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.05484402, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 4.994634192306823, + "language_loss": 0.71286589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73557013, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.515625, + "step": 1093, + "time_per_iteration": 2.528994560241699 + }, + { + "auxiliary_loss_clip": 0.01204526, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.05701041, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.8259988866114194, + "language_loss": 0.87971264, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90238965, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1094, + "time_per_iteration": 2.50201678276062 + }, + { + "auxiliary_loss_clip": 0.01205803, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_clip": 1.0350548, + "balance_loss_mlp": 1.0575459, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6349502946236962, + "language_loss": 0.81364405, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83632231, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.484375, + "step": 1095, + "time_per_iteration": 2.4947729110717773 + }, + { + "auxiliary_loss_clip": 0.01200923, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03414297, + "balance_loss_mlp": 1.05544913, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4379029944224215, + "language_loss": 0.69712919, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7197119, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.453125, + "step": 1096, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01206873, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.04451883, + "balance_loss_mlp": 1.0592947, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7715259730160258, + "language_loss": 0.77498722, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79775411, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1097, + "time_per_iteration": 2.4872820377349854 + }, + { + "auxiliary_loss_clip": 0.0120653, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03814423, + "balance_loss_mlp": 1.05785179, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.7376479388989727, + "language_loss": 0.77846545, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80116618, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.484375, + "step": 1098, + "time_per_iteration": 2.583075761795044 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.04483891, + "balance_loss_mlp": 1.05739522, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9398633669636132, + "language_loss": 0.81675154, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83951151, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1099, + "time_per_iteration": 2.446969985961914 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.02801692, + "balance_loss_mlp": 1.0519135, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0018625732470245, + "language_loss": 0.82619941, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84868616, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4375, + "step": 1100, + "time_per_iteration": 2.4545743465423584 + }, + { + "auxiliary_loss_clip": 0.01204382, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.05827951, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.362466383483127, + "language_loss": 0.73439336, + "learning_rate": 3.986273334538702e-06, + "loss": 0.7570439, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4609375, + "step": 1101, + "time_per_iteration": 6.740786790847778 + }, + { + "auxiliary_loss_clip": 0.0119874, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_clip": 1.03829539, + "balance_loss_mlp": 1.05373132, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.46656505058328, + "language_loss": 0.86047602, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88308758, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1102, + "time_per_iteration": 2.4480903148651123 + }, + { + "auxiliary_loss_clip": 0.01200394, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.05588222, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0494810685505995, + "language_loss": 0.81707513, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83965349, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1103, + "time_per_iteration": 2.4419338703155518 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.05891824, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.7865556655629211, + "language_loss": 0.82059169, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84326148, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.4453125, + "step": 1104, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01195268, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.05232382, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6089454783719872, + "language_loss": 0.80542791, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82785821, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1105, + "time_per_iteration": 2.524385929107666 + }, + { + "auxiliary_loss_clip": 0.01197193, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.05697632, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8452117827451007, + "language_loss": 0.96738935, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98996383, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.40625, + "step": 1106, + "time_per_iteration": 2.455122470855713 + }, + { + "auxiliary_loss_clip": 0.01204143, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.03436136, + "balance_loss_mlp": 1.05509543, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9568581550144768, + "language_loss": 0.82766026, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85030258, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4921875, + "step": 1107, + "time_per_iteration": 2.4554357528686523 + }, + { + "auxiliary_loss_clip": 0.01077187, + "auxiliary_loss_mlp": 0.01010186, + "balance_loss_clip": 1.0061568, + "balance_loss_mlp": 1.01696265, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8235952583150978, + "language_loss": 0.56729984, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58817357, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.6015625, + "step": 1108, + "time_per_iteration": 3.0248770713806152 + }, + { + "auxiliary_loss_clip": 0.01200435, + "auxiliary_loss_mlp": 0.01065514, + "balance_loss_clip": 1.04034865, + "balance_loss_mlp": 1.05397463, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.4203653272420693, + "language_loss": 0.72493321, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74759269, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1109, + "time_per_iteration": 2.4559717178344727 + }, + { + "auxiliary_loss_clip": 0.01197389, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.05389571, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.084593088047962, + "language_loss": 0.78256035, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1110, + "time_per_iteration": 2.4989912509918213 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.05598152, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.197430378352105, + "language_loss": 0.71290207, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73549128, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1111, + "time_per_iteration": 2.5445287227630615 + }, + { + "auxiliary_loss_clip": 0.0120524, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.03833365, + "balance_loss_mlp": 1.05788755, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8078370838130353, + "language_loss": 0.78315711, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80583429, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4765625, + "step": 1112, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01203172, + "auxiliary_loss_mlp": 0.01058254, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.05794001, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.0430507180103943, + "language_loss": 0.78819263, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1113, + "time_per_iteration": 2.4637296199798584 + }, + { + "auxiliary_loss_clip": 0.01195153, + "auxiliary_loss_mlp": 0.01053406, + "balance_loss_clip": 1.03056598, + "balance_loss_mlp": 1.05255365, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.035611213247421, + "language_loss": 0.82393003, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84641558, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.4296875, + "step": 1114, + "time_per_iteration": 2.434006452560425 + }, + { + "auxiliary_loss_clip": 0.01076, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00463712, + "balance_loss_mlp": 1.0165143, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8339607525511222, + "language_loss": 0.58126414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60211033, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.59375, + "step": 1115, + "time_per_iteration": 3.020782709121704 + }, + { + "auxiliary_loss_clip": 0.01200335, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.03427422, + "balance_loss_mlp": 1.05479646, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8263674595854464, + "language_loss": 0.91123891, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93383968, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1116, + "time_per_iteration": 2.446439504623413 + }, + { + "auxiliary_loss_clip": 0.01209259, + "auxiliary_loss_mlp": 0.01067721, + "balance_loss_clip": 1.04323506, + "balance_loss_mlp": 1.06065357, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.019283248682947, + "language_loss": 0.8709814, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89375114, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.484375, + "step": 1117, + "time_per_iteration": 2.486212968826294 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.00250196, + "balance_loss_mlp": 1.01550937, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9454776991467404, + "language_loss": 0.59798217, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6187892, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.5859375, + "step": 1118, + "time_per_iteration": 3.0197594165802 + }, + { + "auxiliary_loss_clip": 0.01201284, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.05418777, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.7568577616727468, + "language_loss": 0.83498162, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85755914, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4765625, + "step": 1119, + "time_per_iteration": 2.4535257816314697 + }, + { + "auxiliary_loss_clip": 0.01199216, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.0562222, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.8165724331790314, + "language_loss": 0.8480413, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87062794, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.4296875, + "step": 1120, + "time_per_iteration": 2.533182382583618 + }, + { + "auxiliary_loss_clip": 0.01208718, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.04269981, + "balance_loss_mlp": 1.0602659, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.032922437281707, + "language_loss": 0.78959441, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81235266, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.484375, + "step": 1121, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_clip": 1.00033593, + "balance_loss_mlp": 1.0132587, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7615352754050735, + "language_loss": 0.58346939, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60423702, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.5859375, + "step": 1122, + "time_per_iteration": 3.2087855339050293 + }, + { + "auxiliary_loss_clip": 0.0120309, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.0584271, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0430211727412098, + "language_loss": 0.71546745, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73815745, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4453125, + "step": 1123, + "time_per_iteration": 2.5017640590667725 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01057362, + "balance_loss_clip": 1.03216124, + "balance_loss_mlp": 1.05484593, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8376842720828679, + "language_loss": 0.79288971, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81548035, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1124, + "time_per_iteration": 2.4980688095092773 + }, + { + "auxiliary_loss_clip": 0.01196564, + "auxiliary_loss_mlp": 0.01054377, + "balance_loss_clip": 1.03204954, + "balance_loss_mlp": 1.05469489, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.0983993205372253, + "language_loss": 0.71198726, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73449671, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.421875, + "step": 1125, + "time_per_iteration": 2.4704325199127197 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.04247451, + "balance_loss_mlp": 1.05620742, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.9171204901367243, + "language_loss": 0.80814254, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83081663, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.421875, + "step": 1126, + "time_per_iteration": 2.5046803951263428 + }, + { + "auxiliary_loss_clip": 0.01070877, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 0.9986586, + "balance_loss_mlp": 1.01286924, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7804116507992601, + "language_loss": 0.59733766, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61807376, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.578125, + "step": 1127, + "time_per_iteration": 3.0877249240875244 + }, + { + "auxiliary_loss_clip": 0.01199514, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 1.05723238, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.13286114653412, + "language_loss": 0.81392133, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83648497, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.421875, + "step": 1128, + "time_per_iteration": 2.5406885147094727 + }, + { + "auxiliary_loss_clip": 0.01208088, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.02692807, + "balance_loss_mlp": 1.0598706, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 3.047918834731733, + "language_loss": 0.76034033, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78294069, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.484375, + "step": 1129, + "time_per_iteration": 2.486829996109009 + }, + { + "auxiliary_loss_clip": 0.01201584, + "auxiliary_loss_mlp": 0.01061333, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.05536139, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8621491947103987, + "language_loss": 0.72340226, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74603146, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4609375, + "step": 1130, + "time_per_iteration": 2.6195991039276123 + }, + { + "auxiliary_loss_clip": 0.01197626, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.05584192, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.3479224842049917, + "language_loss": 0.80624223, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82885444, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.421875, + "step": 1131, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.05550814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1673533627141652, + "language_loss": 0.8104949, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83313811, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.40625, + "step": 1132, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01199688, + "auxiliary_loss_mlp": 0.01069367, + "balance_loss_clip": 1.04525137, + "balance_loss_mlp": 1.05629563, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.450417149602266, + "language_loss": 0.63629937, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65898991, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4296875, + "step": 1133, + "time_per_iteration": 2.7164230346679688 + }, + { + "auxiliary_loss_clip": 0.01203203, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.05427325, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.5027083277203963, + "language_loss": 0.74811196, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77073789, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1134, + "time_per_iteration": 2.420506000518799 + }, + { + "auxiliary_loss_clip": 0.01201452, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.05952573, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0759609389962037, + "language_loss": 0.87245119, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89510942, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.421875, + "step": 1135, + "time_per_iteration": 2.464738607406616 + }, + { + "auxiliary_loss_clip": 0.01200809, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.05388534, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.383261313924855, + "language_loss": 0.78335494, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80591798, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.46875, + "step": 1136, + "time_per_iteration": 2.4486002922058105 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.06089664, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 3.2008110915617207, + "language_loss": 0.83941948, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86222148, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.453125, + "step": 1137, + "time_per_iteration": 2.5714635848999023 + }, + { + "auxiliary_loss_clip": 0.01199575, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.05628538, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.067587662099544, + "language_loss": 0.78669268, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80930662, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1138, + "time_per_iteration": 2.459437370300293 + }, + { + "auxiliary_loss_clip": 0.01202271, + "auxiliary_loss_mlp": 0.01059469, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.05729747, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.606905885529735, + "language_loss": 0.85683703, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87945449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1139, + "time_per_iteration": 2.5198936462402344 + }, + { + "auxiliary_loss_clip": 0.01201061, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.03297663, + "balance_loss_mlp": 1.05803108, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7528507300348692, + "language_loss": 0.74826896, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77085567, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4296875, + "step": 1140, + "time_per_iteration": 2.6609106063842773 + }, + { + "auxiliary_loss_clip": 0.01198151, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.03698146, + "balance_loss_mlp": 1.05620885, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.210262717529583, + "language_loss": 0.68083167, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70343632, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.421875, + "step": 1141, + "time_per_iteration": 2.5661122798919678 + }, + { + "auxiliary_loss_clip": 0.01205913, + "auxiliary_loss_mlp": 0.0106664, + "balance_loss_clip": 1.04098654, + "balance_loss_mlp": 1.05848837, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.82433360121009, + "language_loss": 0.79399014, + "learning_rate": 3.984342445114538e-06, + "loss": 0.8167156, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1142, + "time_per_iteration": 2.5499107837677 + }, + { + "auxiliary_loss_clip": 0.0120232, + "auxiliary_loss_mlp": 0.01061074, + "balance_loss_clip": 1.03650475, + "balance_loss_mlp": 1.05730164, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6821535193321122, + "language_loss": 0.68701231, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70964622, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1143, + "time_per_iteration": 5.380373239517212 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03670955, + "balance_loss_mlp": 1.05885804, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.8434796401844256, + "language_loss": 0.74694496, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76950091, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.375, + "step": 1144, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.01204332, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.05654943, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.296493270147659, + "language_loss": 0.91720247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93988806, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4765625, + "step": 1145, + "time_per_iteration": 2.44307017326355 + }, + { + "auxiliary_loss_clip": 0.01206887, + "auxiliary_loss_mlp": 0.01067692, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.05779576, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4650333910918865, + "language_loss": 0.82189268, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84463847, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.4921875, + "step": 1146, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.01198651, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.05755806, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5935722439127744, + "language_loss": 0.85150343, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87410891, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.4140625, + "step": 1147, + "time_per_iteration": 2.48410701751709 + }, + { + "auxiliary_loss_clip": 0.01201275, + "auxiliary_loss_mlp": 0.01064743, + "balance_loss_clip": 1.03988767, + "balance_loss_mlp": 1.05699074, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3932988353276645, + "language_loss": 0.86235052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88501072, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1148, + "time_per_iteration": 2.455441951751709 + }, + { + "auxiliary_loss_clip": 0.01199305, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.0337863, + "balance_loss_mlp": 1.05560231, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.070658514783469, + "language_loss": 0.69185412, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71442747, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4375, + "step": 1149, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.0289495, + "balance_loss_mlp": 1.05679548, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.828663566846353, + "language_loss": 0.84069788, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86328113, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4609375, + "step": 1150, + "time_per_iteration": 2.509122371673584 + }, + { + "auxiliary_loss_clip": 0.01206199, + "auxiliary_loss_mlp": 0.01058671, + "balance_loss_clip": 1.03453135, + "balance_loss_mlp": 1.06116164, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.57752822218259, + "language_loss": 0.82044697, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84309566, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1151, + "time_per_iteration": 2.420128345489502 + }, + { + "auxiliary_loss_clip": 0.01201904, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.06011868, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8279979065740934, + "language_loss": 0.85587418, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87851566, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4140625, + "step": 1152, + "time_per_iteration": 2.498180866241455 + }, + { + "auxiliary_loss_clip": 0.01198565, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03747797, + "balance_loss_mlp": 1.05767703, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1251557516582995, + "language_loss": 0.90536988, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92796487, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1153, + "time_per_iteration": 2.422480821609497 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.03035152, + "balance_loss_mlp": 1.05790865, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.190017778582164, + "language_loss": 0.81363368, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83618748, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4296875, + "step": 1154, + "time_per_iteration": 2.528118848800659 + }, + { + "auxiliary_loss_clip": 0.01202754, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_clip": 1.04476249, + "balance_loss_mlp": 1.06078768, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 32.79102955334026, + "language_loss": 0.7560131, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77872109, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.421875, + "step": 1155, + "time_per_iteration": 2.5010287761688232 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01059268, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.05511975, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.6800097473238784, + "language_loss": 0.71119213, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73374593, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1156, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.05711889, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.0301788984863918, + "language_loss": 0.75299567, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77569139, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1157, + "time_per_iteration": 2.4654574394226074 + }, + { + "auxiliary_loss_clip": 0.0119867, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.03240204, + "balance_loss_mlp": 1.0551796, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6687264459000366, + "language_loss": 0.71895158, + "learning_rate": 3.983554608032982e-06, + "loss": 0.7415098, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4375, + "step": 1158, + "time_per_iteration": 2.53495454788208 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.03764284, + "balance_loss_mlp": 1.05718327, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9777890540291267, + "language_loss": 0.79796576, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82061857, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1159, + "time_per_iteration": 2.511402130126953 + }, + { + "auxiliary_loss_clip": 0.01205534, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03938961, + "balance_loss_mlp": 1.05860782, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 5.094070474761981, + "language_loss": 0.810929, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83364576, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1160, + "time_per_iteration": 2.4580883979797363 + }, + { + "auxiliary_loss_clip": 0.01197544, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.05382752, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.8746427931419856, + "language_loss": 0.75958532, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78215194, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1161, + "time_per_iteration": 2.5046370029449463 + }, + { + "auxiliary_loss_clip": 0.01195466, + "auxiliary_loss_mlp": 0.01062077, + "balance_loss_clip": 1.03642368, + "balance_loss_mlp": 1.05299318, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.806880077375887, + "language_loss": 0.8285073, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85108274, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1162, + "time_per_iteration": 2.4779040813446045 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01055987, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.05355024, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.8779282806609423, + "language_loss": 0.79095101, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81345057, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1163, + "time_per_iteration": 2.515899181365967 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.03728819, + "balance_loss_mlp": 1.05438375, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1142628107327233, + "language_loss": 0.79552305, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81814498, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4375, + "step": 1164, + "time_per_iteration": 2.476428747177124 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.0401659, + "balance_loss_mlp": 1.05587661, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4863162511761774, + "language_loss": 0.73198837, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75463963, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4375, + "step": 1165, + "time_per_iteration": 2.5053012371063232 + }, + { + "auxiliary_loss_clip": 0.01196916, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.03225732, + "balance_loss_mlp": 1.05550849, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.690867173089168, + "language_loss": 0.81019437, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83273077, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4140625, + "step": 1166, + "time_per_iteration": 2.5378963947296143 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0534389, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.886682439277329, + "language_loss": 0.84443307, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86687052, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1167, + "time_per_iteration": 2.5244622230529785 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.05693448, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.687845484368313, + "language_loss": 0.89423364, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9168179, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1168, + "time_per_iteration": 2.49411678314209 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.05737031, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.629371766417224, + "language_loss": 0.88661098, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9093399, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.46875, + "step": 1169, + "time_per_iteration": 2.4795143604278564 + }, + { + "auxiliary_loss_clip": 0.01203226, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.05864179, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0154006947860705, + "language_loss": 0.84000075, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86272925, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4375, + "step": 1170, + "time_per_iteration": 2.501016616821289 + }, + { + "auxiliary_loss_clip": 0.01199625, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.05753505, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.945268169582358, + "language_loss": 0.75220597, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77485222, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.421875, + "step": 1171, + "time_per_iteration": 2.4456748962402344 + }, + { + "auxiliary_loss_clip": 0.01199689, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.05765915, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.2481396571627923, + "language_loss": 0.88848841, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91106689, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1172, + "time_per_iteration": 2.4970321655273438 + }, + { + "auxiliary_loss_clip": 0.01202846, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_clip": 1.02776241, + "balance_loss_mlp": 1.05584753, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6229718682058278, + "language_loss": 0.8212136, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84377271, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1173, + "time_per_iteration": 2.485822916030884 + }, + { + "auxiliary_loss_clip": 0.01200818, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.05786848, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.056745883983527, + "language_loss": 0.81825697, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.840877, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1174, + "time_per_iteration": 2.4564759731292725 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.0569849, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.925446476900023, + "language_loss": 0.8511939, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87379438, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.421875, + "step": 1175, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.0120243, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.05922508, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.9716433558257507, + "language_loss": 0.8303746, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4375, + "step": 1176, + "time_per_iteration": 2.511456251144409 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.05717707, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.3318965992312, + "language_loss": 0.74563694, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76822478, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.40625, + "step": 1177, + "time_per_iteration": 2.460102081298828 + }, + { + "auxiliary_loss_clip": 0.01207406, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.033476, + "balance_loss_mlp": 1.06167924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.2206541819979995, + "language_loss": 0.86031914, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88298053, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4609375, + "step": 1178, + "time_per_iteration": 2.4605627059936523 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_clip": 1.00349271, + "balance_loss_mlp": 1.02766943, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8386980392448491, + "language_loss": 0.63242435, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65337497, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.59375, + "step": 1179, + "time_per_iteration": 3.156688690185547 + }, + { + "auxiliary_loss_clip": 0.01207076, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.06038809, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.3853497849810945, + "language_loss": 0.83326972, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85596782, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.46875, + "step": 1180, + "time_per_iteration": 2.4823896884918213 + }, + { + "auxiliary_loss_clip": 0.01200915, + "auxiliary_loss_mlp": 0.01065839, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.05910683, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1921067510196446, + "language_loss": 0.88595563, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90862316, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.421875, + "step": 1181, + "time_per_iteration": 2.505908727645874 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.03607869, + "balance_loss_mlp": 1.05944347, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2303634282095257, + "language_loss": 0.83314365, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85575759, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4140625, + "step": 1182, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01199287, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_clip": 1.04006529, + "balance_loss_mlp": 1.06100821, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.671395976555463, + "language_loss": 0.7925818, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81523037, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3828125, + "step": 1183, + "time_per_iteration": 2.5057172775268555 + }, + { + "auxiliary_loss_clip": 0.01201972, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.05550563, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6492838430830963, + "language_loss": 0.78910172, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8117131, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.46875, + "step": 1184, + "time_per_iteration": 5.494150638580322 + }, + { + "auxiliary_loss_clip": 0.01196982, + "auxiliary_loss_mlp": 0.01057128, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.05884266, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.546293211356889, + "language_loss": 0.7696892, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79223031, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.375, + "step": 1185, + "time_per_iteration": 3.8873486518859863 + }, + { + "auxiliary_loss_clip": 0.01200052, + "auxiliary_loss_mlp": 0.01065088, + "balance_loss_clip": 1.0408771, + "balance_loss_mlp": 1.05808377, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.519913974657541, + "language_loss": 0.65896261, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68161404, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1186, + "time_per_iteration": 2.44986891746521 + }, + { + "auxiliary_loss_clip": 0.01198722, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.03234124, + "balance_loss_mlp": 1.05906928, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.0047668871213205, + "language_loss": 0.69673246, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71928233, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3984375, + "step": 1187, + "time_per_iteration": 2.517432451248169 + }, + { + "auxiliary_loss_clip": 0.01194056, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.05690861, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.6848541171122307, + "language_loss": 0.78598166, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80852079, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.375, + "step": 1188, + "time_per_iteration": 2.4682350158691406 + }, + { + "auxiliary_loss_clip": 0.01197809, + "auxiliary_loss_mlp": 0.01056931, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.0588758, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0343008635273834, + "language_loss": 0.84854662, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87109399, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.390625, + "step": 1189, + "time_per_iteration": 2.451464891433716 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.05589187, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.7193907035784557, + "language_loss": 0.77021295, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79277021, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.375, + "step": 1190, + "time_per_iteration": 2.5028254985809326 + }, + { + "auxiliary_loss_clip": 0.01200514, + "auxiliary_loss_mlp": 0.01065982, + "balance_loss_clip": 1.04018509, + "balance_loss_mlp": 1.0585537, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3385605637591302, + "language_loss": 0.75145626, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77412122, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1191, + "time_per_iteration": 2.461045980453491 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.03147578, + "balance_loss_mlp": 1.05730891, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.3332115059632583, + "language_loss": 0.7360636, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75860614, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1192, + "time_per_iteration": 2.4944753646850586 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 1.05358601, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.1652973689026176, + "language_loss": 0.7830255, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80548704, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1193, + "time_per_iteration": 2.487025737762451 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01053593, + "balance_loss_clip": 1.02786815, + "balance_loss_mlp": 1.06034899, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9678931818636167, + "language_loss": 0.85748619, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88004816, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1194, + "time_per_iteration": 2.493823766708374 + }, + { + "auxiliary_loss_clip": 0.01197363, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.05782473, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9701258602591958, + "language_loss": 0.81425989, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83685976, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3984375, + "step": 1195, + "time_per_iteration": 2.5168802738189697 + }, + { + "auxiliary_loss_clip": 0.01195742, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.02979064, + "balance_loss_mlp": 1.05720496, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.9269272748189905, + "language_loss": 0.79917538, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82164884, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3828125, + "step": 1196, + "time_per_iteration": 2.4749536514282227 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01069477, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.05655897, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 8.862292558474625, + "language_loss": 0.71015084, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73278111, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3671875, + "step": 1197, + "time_per_iteration": 2.520514726638794 + }, + { + "auxiliary_loss_clip": 0.01192449, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.02845871, + "balance_loss_mlp": 1.05429292, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0584524946763767, + "language_loss": 0.86034989, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88279593, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3828125, + "step": 1198, + "time_per_iteration": 2.441458225250244 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.05664325, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.7240513490380307, + "language_loss": 0.83822477, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8607856, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3828125, + "step": 1199, + "time_per_iteration": 2.462790012359619 + }, + { + "auxiliary_loss_clip": 0.01201627, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.06159616, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.0725431151836453, + "language_loss": 0.76464498, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78722042, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3984375, + "step": 1200, + "time_per_iteration": 2.5007636547088623 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01061794, + "balance_loss_clip": 1.0376662, + "balance_loss_mlp": 1.05783701, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.959995672067427, + "language_loss": 0.82965535, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85223711, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.390625, + "step": 1201, + "time_per_iteration": 2.4968512058258057 + }, + { + "auxiliary_loss_clip": 0.01198607, + "auxiliary_loss_mlp": 0.01059493, + "balance_loss_clip": 1.03372014, + "balance_loss_mlp": 1.05568862, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.411287508312223, + "language_loss": 0.69041032, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71299136, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1202, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01196785, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03804839, + "balance_loss_mlp": 1.05721354, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9411904343348254, + "language_loss": 0.87723774, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89984161, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3984375, + "step": 1203, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.0546416, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.819852916387131, + "language_loss": 0.7844671, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4375, + "step": 1204, + "time_per_iteration": 2.449265480041504 + }, + { + "auxiliary_loss_clip": 0.01194984, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.05605316, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8514893306986777, + "language_loss": 0.81960398, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.390625, + "step": 1205, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01067422, + "balance_loss_clip": 1.04250705, + "balance_loss_mlp": 1.05852747, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.0830735488163254, + "language_loss": 0.76702261, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78969669, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4140625, + "step": 1206, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.01193529, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.03071594, + "balance_loss_mlp": 1.05481935, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8430962541821914, + "language_loss": 0.77246201, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79495007, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3828125, + "step": 1207, + "time_per_iteration": 2.4895267486572266 + }, + { + "auxiliary_loss_clip": 0.01194673, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.02816105, + "balance_loss_mlp": 1.05703962, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 5.768853045708734, + "language_loss": 0.79723513, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81967664, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1208, + "time_per_iteration": 2.509073495864868 + }, + { + "auxiliary_loss_clip": 0.0119292, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03204679, + "balance_loss_mlp": 1.05551386, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.6873449148768063, + "language_loss": 0.78595626, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80843151, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.375, + "step": 1209, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.03409529, + "balance_loss_mlp": 1.05510461, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.6193169355932104, + "language_loss": 0.81117678, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83368045, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.390625, + "step": 1210, + "time_per_iteration": 2.4985666275024414 + }, + { + "auxiliary_loss_clip": 0.01192388, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_clip": 1.03688109, + "balance_loss_mlp": 1.0565064, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.2378435782703834, + "language_loss": 0.84350932, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.359375, + "step": 1211, + "time_per_iteration": 2.4971728324890137 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01051238, + "balance_loss_clip": 1.02931547, + "balance_loss_mlp": 1.05233216, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.2910402501943516, + "language_loss": 0.90813953, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.375, + "step": 1212, + "time_per_iteration": 2.424874782562256 + }, + { + "auxiliary_loss_clip": 0.01191621, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0286777, + "balance_loss_mlp": 1.05457211, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.346480404505952, + "language_loss": 0.7238096, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74623883, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1213, + "time_per_iteration": 2.443542003631592 + }, + { + "auxiliary_loss_clip": 0.0119423, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.05338192, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9141465843449694, + "language_loss": 0.84441102, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86686933, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1214, + "time_per_iteration": 2.500112295150757 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.03468192, + "balance_loss_mlp": 1.05678558, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.82775499028919, + "language_loss": 0.83929181, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86184609, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.40625, + "step": 1215, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01194493, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.05474758, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.8082751516232567, + "language_loss": 0.80984753, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83240259, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1216, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01196444, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.02863717, + "balance_loss_mlp": 1.05746269, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.8100743600713276, + "language_loss": 0.76112509, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78359497, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1217, + "time_per_iteration": 2.513061046600342 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0347029, + "balance_loss_mlp": 1.05546904, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0751842608938142, + "language_loss": 0.86442709, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.375, + "step": 1218, + "time_per_iteration": 2.4514572620391846 + }, + { + "auxiliary_loss_clip": 0.01193593, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.05405331, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.9036635750322874, + "language_loss": 0.86757988, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.8901403, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.3984375, + "step": 1219, + "time_per_iteration": 2.4501893520355225 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01058106, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.05260015, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.320539289810395, + "language_loss": 0.84721315, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86969984, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.375, + "step": 1220, + "time_per_iteration": 2.4651544094085693 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01062531, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.05455709, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.803787378453645, + "language_loss": 0.76840538, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.390625, + "step": 1221, + "time_per_iteration": 2.4643850326538086 + }, + { + "auxiliary_loss_clip": 0.01195957, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.0369482, + "balance_loss_mlp": 1.05698907, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 4.111967976062365, + "language_loss": 0.92201889, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94457251, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.390625, + "step": 1222, + "time_per_iteration": 2.461393117904663 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.03440046, + "balance_loss_mlp": 1.05795276, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.739326433562924, + "language_loss": 0.91106719, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9336018, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1223, + "time_per_iteration": 2.4616212844848633 + }, + { + "auxiliary_loss_clip": 0.01194512, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.05628467, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.5538951271380395, + "language_loss": 0.81946027, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84211743, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3828125, + "step": 1224, + "time_per_iteration": 2.555060386657715 + }, + { + "auxiliary_loss_clip": 0.01191919, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.05385065, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.209826315991058, + "language_loss": 0.83313572, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8555935, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.375, + "step": 1225, + "time_per_iteration": 2.5317656993865967 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.0300144, + "balance_loss_mlp": 1.05566537, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.0864455990649144, + "language_loss": 0.9037565, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92621917, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3828125, + "step": 1226, + "time_per_iteration": 5.374137878417969 + }, + { + "auxiliary_loss_clip": 0.01201048, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.05401981, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8833434676543, + "language_loss": 0.76944947, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1227, + "time_per_iteration": 2.4528942108154297 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02720916, + "balance_loss_mlp": 1.05810142, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6041059240123434, + "language_loss": 0.85634637, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87876499, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.34375, + "step": 1228, + "time_per_iteration": 2.5452229976654053 + }, + { + "auxiliary_loss_clip": 0.01194537, + "auxiliary_loss_mlp": 0.01061009, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.05448794, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 4.251776538682485, + "language_loss": 0.79688829, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81944382, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3984375, + "step": 1229, + "time_per_iteration": 2.501086711883545 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.05632436, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.028375336194412, + "language_loss": 0.78218549, + "learning_rate": 3.979771170004287e-06, + "loss": 0.8047595, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3984375, + "step": 1230, + "time_per_iteration": 2.4474098682403564 + }, + { + "auxiliary_loss_clip": 0.01193092, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.02554393, + "balance_loss_mlp": 1.05599403, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.924374124094053, + "language_loss": 0.81301343, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83543187, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1231, + "time_per_iteration": 2.4861042499542236 + }, + { + "auxiliary_loss_clip": 0.01198041, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.04277539, + "balance_loss_mlp": 1.05443811, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.4882746298902343, + "language_loss": 0.95111585, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97376096, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1232, + "time_per_iteration": 2.5074143409729004 + }, + { + "auxiliary_loss_clip": 0.01194092, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.03938031, + "balance_loss_mlp": 1.05667329, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.246534337547551, + "language_loss": 0.80640733, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82895458, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1233, + "time_per_iteration": 2.490816831588745 + }, + { + "auxiliary_loss_clip": 0.01198611, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.05483365, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.357402762223285, + "language_loss": 0.70458734, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72717696, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1234, + "time_per_iteration": 2.605139970779419 + }, + { + "auxiliary_loss_clip": 0.01195848, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.03665543, + "balance_loss_mlp": 1.05792761, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1034220776692765, + "language_loss": 0.77058101, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79313564, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3828125, + "step": 1235, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01189622, + "auxiliary_loss_mlp": 0.01053872, + "balance_loss_clip": 1.03123438, + "balance_loss_mlp": 1.05414248, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 5.584514149172867, + "language_loss": 0.82648033, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84891528, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1236, + "time_per_iteration": 2.462069511413574 + }, + { + "auxiliary_loss_clip": 0.0119681, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03385794, + "balance_loss_mlp": 1.05572712, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.2536643652174724, + "language_loss": 0.75702679, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77956861, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1237, + "time_per_iteration": 2.5572054386138916 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.03817141, + "balance_loss_mlp": 1.05427146, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.878495773650564, + "language_loss": 0.7740556, + "learning_rate": 3.979326750654053e-06, + "loss": 0.7965883, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.3828125, + "step": 1238, + "time_per_iteration": 2.5915493965148926 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.01055743, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.05435395, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.0695087378138455, + "language_loss": 0.86322856, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88576937, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.4375, + "step": 1239, + "time_per_iteration": 2.4961507320404053 + }, + { + "auxiliary_loss_clip": 0.01194884, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02401412, + "balance_loss_mlp": 1.05433989, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.179426429753772, + "language_loss": 0.89070082, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91314042, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.40625, + "step": 1240, + "time_per_iteration": 2.456801176071167 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.03325772, + "balance_loss_mlp": 1.05600643, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.2624482063672513, + "language_loss": 0.88586551, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90842468, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4140625, + "step": 1241, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.018507, + "balance_loss_mlp": 1.02113318, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9233978594431768, + "language_loss": 0.63032585, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65135366, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.59375, + "step": 1242, + "time_per_iteration": 3.1321358680725098 + }, + { + "auxiliary_loss_clip": 0.012088, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.05792046, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.8956100556858004, + "language_loss": 0.62917286, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65185821, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5078125, + "step": 1243, + "time_per_iteration": 2.5571463108062744 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01052045, + "balance_loss_clip": 1.0280956, + "balance_loss_mlp": 1.05710852, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.504235331520048, + "language_loss": 0.76465732, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78713971, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1244, + "time_per_iteration": 2.501621723175049 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03462183, + "balance_loss_mlp": 1.05684423, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 2.8968513367461495, + "language_loss": 0.69149882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.714064, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1245, + "time_per_iteration": 2.417921781539917 + }, + { + "auxiliary_loss_clip": 0.01196347, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.02768707, + "balance_loss_mlp": 1.05663347, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9272496045423029, + "language_loss": 0.88344061, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90592474, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1246, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01205457, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.05656838, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.4755370190447064, + "language_loss": 0.87921643, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90194321, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4921875, + "step": 1247, + "time_per_iteration": 2.4602389335632324 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.03502667, + "balance_loss_mlp": 1.05565107, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2039165223770194, + "language_loss": 0.6477375, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67027843, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3828125, + "step": 1248, + "time_per_iteration": 2.4408388137817383 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.0575254, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.0641418493429713, + "language_loss": 0.73964334, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76219749, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1249, + "time_per_iteration": 2.443767547607422 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01068388, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.05842972, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.607815988938315, + "language_loss": 0.81845009, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84114683, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4296875, + "step": 1250, + "time_per_iteration": 2.491236448287964 + }, + { + "auxiliary_loss_clip": 0.01197565, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.05932856, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.308634463940828, + "language_loss": 0.66713893, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68972456, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1251, + "time_per_iteration": 2.5437874794006348 + }, + { + "auxiliary_loss_clip": 0.0107681, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 0.99946529, + "balance_loss_mlp": 1.02021933, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8978558428983584, + "language_loss": 0.70356798, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72436458, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.56640625, + "step": 1252, + "time_per_iteration": 3.1170923709869385 + }, + { + "auxiliary_loss_clip": 0.01194007, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.03698599, + "balance_loss_mlp": 1.05419612, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9290655276351045, + "language_loss": 0.79516673, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81771958, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3984375, + "step": 1253, + "time_per_iteration": 2.4821414947509766 + }, + { + "auxiliary_loss_clip": 0.01199953, + "auxiliary_loss_mlp": 0.01065033, + "balance_loss_clip": 1.04125071, + "balance_loss_mlp": 1.05829906, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.5751371148477995, + "language_loss": 0.93441045, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95706034, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.421875, + "step": 1254, + "time_per_iteration": 2.4245519638061523 + }, + { + "auxiliary_loss_clip": 0.01191058, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.05566263, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.866823394820361, + "language_loss": 0.88030314, + "learning_rate": 3.97836641143877e-06, + "loss": 0.902834, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1255, + "time_per_iteration": 2.5579185485839844 + }, + { + "auxiliary_loss_clip": 0.01192242, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.04009795, + "balance_loss_mlp": 1.05518413, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.7574194703288544, + "language_loss": 0.79516619, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81773484, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.3671875, + "step": 1256, + "time_per_iteration": 2.4203784465789795 + }, + { + "auxiliary_loss_clip": 0.01074137, + "auxiliary_loss_mlp": 0.01007102, + "balance_loss_clip": 1.00378788, + "balance_loss_mlp": 1.01769829, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.8283025846018472, + "language_loss": 0.58016127, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60097361, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.5625, + "step": 1257, + "time_per_iteration": 3.1732118129730225 + }, + { + "auxiliary_loss_clip": 0.0119581, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.03272927, + "balance_loss_mlp": 1.05982757, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 3.1336739114125107, + "language_loss": 0.89859951, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92112058, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1258, + "time_per_iteration": 2.516925811767578 + }, + { + "auxiliary_loss_clip": 0.01192364, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.05663717, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.28312942247731, + "language_loss": 0.81211507, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458376, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.359375, + "step": 1259, + "time_per_iteration": 2.449533224105835 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.03593481, + "balance_loss_mlp": 1.05662787, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9172803769558988, + "language_loss": 0.75733984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.77986467, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.375, + "step": 1260, + "time_per_iteration": 2.5003559589385986 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03550828, + "balance_loss_mlp": 1.0552032, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8260195606442358, + "language_loss": 0.84695768, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86947775, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1261, + "time_per_iteration": 2.4633476734161377 + }, + { + "auxiliary_loss_clip": 0.01200376, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.03828108, + "balance_loss_mlp": 1.05969536, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.3160282321136334, + "language_loss": 0.8266682, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84928167, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.40625, + "step": 1262, + "time_per_iteration": 2.5256471633911133 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01061195, + "balance_loss_clip": 1.03703153, + "balance_loss_mlp": 1.0540688, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.4581964181262776, + "language_loss": 0.8255769, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84810972, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3828125, + "step": 1263, + "time_per_iteration": 2.470656633377075 + }, + { + "auxiliary_loss_clip": 0.01195735, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.05504882, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.324943057092889, + "language_loss": 0.7591399, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.40625, + "step": 1264, + "time_per_iteration": 2.4715359210968018 + }, + { + "auxiliary_loss_clip": 0.0119596, + "auxiliary_loss_mlp": 0.01062168, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.05711412, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.1997185871944356, + "language_loss": 0.81106204, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83364332, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.390625, + "step": 1265, + "time_per_iteration": 2.440000295639038 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03408241, + "balance_loss_mlp": 1.05631864, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.141616369936441, + "language_loss": 0.64935738, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67187923, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.390625, + "step": 1266, + "time_per_iteration": 2.495001792907715 + }, + { + "auxiliary_loss_clip": 0.01194799, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.05550349, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2514277899416606, + "language_loss": 0.79527593, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81783378, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.390625, + "step": 1267, + "time_per_iteration": 2.4763970375061035 + }, + { + "auxiliary_loss_clip": 0.01194511, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02975869, + "balance_loss_mlp": 1.05526185, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.2740159695832682, + "language_loss": 0.7253381, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74780059, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.390625, + "step": 1268, + "time_per_iteration": 3.8910977840423584 + }, + { + "auxiliary_loss_clip": 0.01192554, + "auxiliary_loss_mlp": 0.01057239, + "balance_loss_clip": 1.03447044, + "balance_loss_mlp": 1.05342031, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.163449384012833, + "language_loss": 0.81891817, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84141612, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.390625, + "step": 1269, + "time_per_iteration": 3.8643741607666016 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.05559695, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 3.2383492700687078, + "language_loss": 0.88135087, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90382218, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1270, + "time_per_iteration": 2.4746575355529785 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.0105921, + "balance_loss_clip": 1.03559494, + "balance_loss_mlp": 1.05707884, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.188682914143081, + "language_loss": 0.71113384, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73370755, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.4140625, + "step": 1271, + "time_per_iteration": 2.529632091522217 + }, + { + "auxiliary_loss_clip": 0.01189637, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_clip": 1.04351556, + "balance_loss_mlp": 1.05675423, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.9018984880968814, + "language_loss": 0.82745486, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85001469, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1272, + "time_per_iteration": 2.4950368404388428 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01061838, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.05351079, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0211474255264643, + "language_loss": 0.79951203, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82204533, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3828125, + "step": 1273, + "time_per_iteration": 2.490281105041504 + }, + { + "auxiliary_loss_clip": 0.01194744, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_clip": 1.03858376, + "balance_loss_mlp": 1.05600715, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.848359088284866, + "language_loss": 0.81545758, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83802712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1274, + "time_per_iteration": 2.499799966812134 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_clip": 1.04430115, + "balance_loss_mlp": 1.05469346, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.991418246716423, + "language_loss": 0.73099387, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75359869, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1275, + "time_per_iteration": 2.557973623275757 + }, + { + "auxiliary_loss_clip": 0.011939, + "auxiliary_loss_mlp": 0.01061514, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.05536842, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1093684912214545, + "language_loss": 0.79584897, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81840312, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.390625, + "step": 1276, + "time_per_iteration": 2.4329752922058105 + }, + { + "auxiliary_loss_clip": 0.01196395, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02838457, + "balance_loss_mlp": 1.05656397, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.623540269613024, + "language_loss": 0.59020305, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61268032, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3984375, + "step": 1277, + "time_per_iteration": 2.5318989753723145 + }, + { + "auxiliary_loss_clip": 0.01200985, + "auxiliary_loss_mlp": 0.01057464, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.05805659, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.2944749333347096, + "language_loss": 0.74846482, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77104926, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.4296875, + "step": 1278, + "time_per_iteration": 2.448615789413452 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_clip": 1.02943182, + "balance_loss_mlp": 1.05475163, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.0999470067777075, + "language_loss": 0.88656616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90898478, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1279, + "time_per_iteration": 2.4883790016174316 + }, + { + "auxiliary_loss_clip": 0.01189256, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_clip": 1.03973901, + "balance_loss_mlp": 1.05507362, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.4596954186847393, + "language_loss": 0.82899994, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1280, + "time_per_iteration": 2.459294319152832 + }, + { + "auxiliary_loss_clip": 0.01188755, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03874409, + "balance_loss_mlp": 1.05492759, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 1.9224222656998016, + "language_loss": 0.76059222, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78309786, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3359375, + "step": 1281, + "time_per_iteration": 2.453183650970459 + }, + { + "auxiliary_loss_clip": 0.0119548, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.05448353, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8937081587754587, + "language_loss": 0.75307631, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77557921, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1282, + "time_per_iteration": 2.4526116847991943 + }, + { + "auxiliary_loss_clip": 0.01190337, + "auxiliary_loss_mlp": 0.01070616, + "balance_loss_clip": 1.04734671, + "balance_loss_mlp": 1.054286, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0304459145795963, + "language_loss": 0.8428033, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86541283, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1283, + "time_per_iteration": 2.468101739883423 + }, + { + "auxiliary_loss_clip": 0.01192768, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.05560803, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.622403612740989, + "language_loss": 0.75031364, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77286887, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1284, + "time_per_iteration": 2.451749801635742 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.03947222, + "balance_loss_mlp": 1.05330253, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6448065546510353, + "language_loss": 0.75934827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78185129, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1285, + "time_per_iteration": 2.664769411087036 + }, + { + "auxiliary_loss_clip": 0.01196484, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.05862105, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8165785508620624, + "language_loss": 0.84204662, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86464012, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.375, + "step": 1286, + "time_per_iteration": 2.550670862197876 + }, + { + "auxiliary_loss_clip": 0.01196192, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.05582845, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 4.521300853065514, + "language_loss": 0.76725763, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1287, + "time_per_iteration": 2.455627918243408 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.03636777, + "balance_loss_mlp": 1.05476642, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6785000972571258, + "language_loss": 0.84509134, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86758095, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1288, + "time_per_iteration": 2.500218629837036 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.05364347, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.192533837519805, + "language_loss": 0.85769016, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88018203, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.390625, + "step": 1289, + "time_per_iteration": 2.4759440422058105 + }, + { + "auxiliary_loss_clip": 0.01189023, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.02563989, + "balance_loss_mlp": 1.05300641, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.8877463184856607, + "language_loss": 0.85053366, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87290049, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1290, + "time_per_iteration": 2.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.01185369, + "auxiliary_loss_mlp": 0.01059291, + "balance_loss_clip": 1.03541303, + "balance_loss_mlp": 1.05397463, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3980248629455834, + "language_loss": 0.90562832, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92807496, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3125, + "step": 1291, + "time_per_iteration": 2.4760262966156006 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01008288, + "balance_loss_clip": 1.00456893, + "balance_loss_mlp": 1.01656318, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.9429671936579762, + "language_loss": 0.64993972, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67073375, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.546875, + "step": 1292, + "time_per_iteration": 3.1508371829986572 + }, + { + "auxiliary_loss_clip": 0.0118873, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.03716707, + "balance_loss_mlp": 1.05293965, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7960778456946043, + "language_loss": 0.87610948, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89858699, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1293, + "time_per_iteration": 2.6359729766845703 + }, + { + "auxiliary_loss_clip": 0.01193413, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.05659533, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.312065886688882, + "language_loss": 0.85111046, + "learning_rate": 3.976081376263239e-06, + "loss": 0.873667, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3671875, + "step": 1294, + "time_per_iteration": 2.5151314735412598 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01054926, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.05702615, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.728225366024782, + "language_loss": 0.79202414, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81451285, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3671875, + "step": 1295, + "time_per_iteration": 2.459510326385498 + }, + { + "auxiliary_loss_clip": 0.01188808, + "auxiliary_loss_mlp": 0.01052849, + "balance_loss_clip": 1.02966261, + "balance_loss_mlp": 1.05383039, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.8222308711400834, + "language_loss": 0.88216382, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90458035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1296, + "time_per_iteration": 2.492892026901245 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01058016, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.05591464, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 3.2140473454082086, + "language_loss": 0.96160841, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98411804, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1297, + "time_per_iteration": 2.4668915271759033 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01054366, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.05289149, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.460261972702069, + "language_loss": 0.76087165, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78331399, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3671875, + "step": 1298, + "time_per_iteration": 2.5059781074523926 + }, + { + "auxiliary_loss_clip": 0.01192131, + "auxiliary_loss_mlp": 0.01061793, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.05696058, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.8752674736144914, + "language_loss": 0.80755305, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83009231, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3515625, + "step": 1299, + "time_per_iteration": 2.5036020278930664 + }, + { + "auxiliary_loss_clip": 0.01183493, + "auxiliary_loss_mlp": 0.01056623, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.05226159, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.1903498852009813, + "language_loss": 0.86459941, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88700056, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1300, + "time_per_iteration": 2.4866278171539307 + }, + { + "auxiliary_loss_clip": 0.0118988, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.0284245, + "balance_loss_mlp": 1.05393028, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.909902293479526, + "language_loss": 0.71778899, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74020839, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.359375, + "step": 1301, + "time_per_iteration": 2.6491336822509766 + }, + { + "auxiliary_loss_clip": 0.01196178, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.0586772, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5624281437346959, + "language_loss": 0.70860815, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1302, + "time_per_iteration": 2.635430335998535 + }, + { + "auxiliary_loss_clip": 0.01188946, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03702378, + "balance_loss_mlp": 1.05438161, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.152945758623263, + "language_loss": 0.8192755, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84177226, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.34375, + "step": 1303, + "time_per_iteration": 2.4861090183258057 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.05351233, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8425530042965788, + "language_loss": 0.7497822, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77228731, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1304, + "time_per_iteration": 2.464087963104248 + }, + { + "auxiliary_loss_clip": 0.01191658, + "auxiliary_loss_mlp": 0.0106237, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.05645108, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.696211405930565, + "language_loss": 0.76397038, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78651059, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.359375, + "step": 1305, + "time_per_iteration": 2.486093521118164 + }, + { + "auxiliary_loss_clip": 0.01192283, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.04087615, + "balance_loss_mlp": 1.05527782, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 2.2926357932273866, + "language_loss": 0.85035503, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87292361, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1306, + "time_per_iteration": 2.496265172958374 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.05652416, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.0575778567802976, + "language_loss": 0.90087706, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92322135, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.34375, + "step": 1307, + "time_per_iteration": 2.5122623443603516 + }, + { + "auxiliary_loss_clip": 0.01189263, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.03295124, + "balance_loss_mlp": 1.05417371, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8540925974151201, + "language_loss": 0.83408689, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85655046, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3515625, + "step": 1308, + "time_per_iteration": 2.4686944484710693 + }, + { + "auxiliary_loss_clip": 0.01186004, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.03177738, + "balance_loss_mlp": 1.05289674, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.6283340971904061, + "language_loss": 0.77841777, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80081415, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.328125, + "step": 1309, + "time_per_iteration": 5.444388151168823 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.05386913, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9656388899868151, + "language_loss": 0.80146122, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82402575, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.40625, + "step": 1310, + "time_per_iteration": 3.8553466796875 + }, + { + "auxiliary_loss_clip": 0.01185305, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.03067899, + "balance_loss_mlp": 1.05544043, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7115323272474947, + "language_loss": 0.73069102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75307012, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1311, + "time_per_iteration": 2.5299458503723145 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_clip": 1.04861844, + "balance_loss_mlp": 1.05650353, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9161215374898264, + "language_loss": 0.85871482, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88134789, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1312, + "time_per_iteration": 2.5490031242370605 + }, + { + "auxiliary_loss_clip": 0.01186476, + "auxiliary_loss_mlp": 0.01059916, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.0555284, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7542323177910393, + "language_loss": 0.81968379, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84214771, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3125, + "step": 1313, + "time_per_iteration": 2.507046699523926 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105996, + "balance_loss_clip": 1.03528404, + "balance_loss_mlp": 1.05271506, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.109477065223649, + "language_loss": 0.73372161, + "learning_rate": 3.97486534441264e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3984375, + "step": 1314, + "time_per_iteration": 2.4396395683288574 + }, + { + "auxiliary_loss_clip": 0.01185115, + "auxiliary_loss_mlp": 0.01058505, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.05120206, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.579996187361532, + "language_loss": 0.79460657, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81704271, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.34375, + "step": 1315, + "time_per_iteration": 2.493365526199341 + }, + { + "auxiliary_loss_clip": 0.011877, + "auxiliary_loss_mlp": 0.01060931, + "balance_loss_clip": 1.03592062, + "balance_loss_mlp": 1.05232, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9411836832725016, + "language_loss": 0.73614991, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1316, + "time_per_iteration": 2.4696316719055176 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.03940618, + "balance_loss_mlp": 1.05415511, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.862910173072837, + "language_loss": 0.65148681, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67404836, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.390625, + "step": 1317, + "time_per_iteration": 2.447847843170166 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.05774999, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.3478172138868967, + "language_loss": 0.7324174, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75502789, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1318, + "time_per_iteration": 2.497406482696533 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01057875, + "balance_loss_clip": 1.03557122, + "balance_loss_mlp": 1.05335736, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.92969491679129, + "language_loss": 0.90610284, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92856491, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3515625, + "step": 1319, + "time_per_iteration": 2.5007200241088867 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01054126, + "balance_loss_clip": 1.03086793, + "balance_loss_mlp": 1.05155873, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.95797867210378, + "language_loss": 0.79765761, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82008684, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1320, + "time_per_iteration": 2.4683783054351807 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.05700457, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6163787894008363, + "language_loss": 0.69574934, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71822894, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.34375, + "step": 1321, + "time_per_iteration": 2.466911554336548 + }, + { + "auxiliary_loss_clip": 0.01184231, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.03313756, + "balance_loss_mlp": 1.05313718, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.926313653502779, + "language_loss": 0.83559513, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.857997, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1322, + "time_per_iteration": 2.465885639190674 + }, + { + "auxiliary_loss_clip": 0.01188233, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.03544521, + "balance_loss_mlp": 1.05104756, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8863777031262867, + "language_loss": 0.90437615, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92684615, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1323, + "time_per_iteration": 2.465841293334961 + }, + { + "auxiliary_loss_clip": 0.0118735, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.05414796, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.6454981938510795, + "language_loss": 0.82583225, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84827733, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.328125, + "step": 1324, + "time_per_iteration": 2.475486993789673 + }, + { + "auxiliary_loss_clip": 0.01188398, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0255841, + "balance_loss_mlp": 1.05264676, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.416918252865386, + "language_loss": 0.79654729, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81892562, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.359375, + "step": 1325, + "time_per_iteration": 2.482555389404297 + }, + { + "auxiliary_loss_clip": 0.01190127, + "auxiliary_loss_mlp": 0.01064919, + "balance_loss_clip": 1.03989661, + "balance_loss_mlp": 1.05474687, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.170521767048619, + "language_loss": 0.8812806, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90383106, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1326, + "time_per_iteration": 2.466742753982544 + }, + { + "auxiliary_loss_clip": 0.01182901, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.02823424, + "balance_loss_mlp": 1.05014396, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.3992518634606164, + "language_loss": 0.83013594, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8524875, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.328125, + "step": 1327, + "time_per_iteration": 2.4989237785339355 + }, + { + "auxiliary_loss_clip": 0.0119143, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.03013575, + "balance_loss_mlp": 1.05436027, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.1664091533416587, + "language_loss": 0.78452092, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80697763, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.375, + "step": 1328, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01191637, + "auxiliary_loss_mlp": 0.01053331, + "balance_loss_clip": 1.02969217, + "balance_loss_mlp": 1.05460131, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.484533735051083, + "language_loss": 0.74277186, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76522154, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.375, + "step": 1329, + "time_per_iteration": 2.425388813018799 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.05096054, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.5753219993175995, + "language_loss": 0.81090498, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83336312, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3515625, + "step": 1330, + "time_per_iteration": 2.4831247329711914 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.03924823, + "balance_loss_mlp": 1.05348384, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.322034822225311, + "language_loss": 0.88790143, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91043401, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1331, + "time_per_iteration": 2.4410722255706787 + }, + { + "auxiliary_loss_clip": 0.01193336, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.03414834, + "balance_loss_mlp": 1.05288279, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.577873328737783, + "language_loss": 0.73332524, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75584114, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.40625, + "step": 1332, + "time_per_iteration": 2.6054465770721436 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.05179858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.9568005204239032, + "language_loss": 0.82994795, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1333, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01187412, + "auxiliary_loss_mlp": 0.01055323, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.05115032, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.7771179443818466, + "language_loss": 0.74698973, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76941711, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1334, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01187182, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.05457497, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.0216765528325635, + "language_loss": 0.80279201, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82527244, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1335, + "time_per_iteration": 2.538670301437378 + }, + { + "auxiliary_loss_clip": 0.01078994, + "auxiliary_loss_mlp": 0.01011272, + "balance_loss_clip": 1.00802934, + "balance_loss_mlp": 1.02308655, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7427722697577622, + "language_loss": 0.56020629, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58110893, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.5625, + "step": 1336, + "time_per_iteration": 3.125026226043701 + }, + { + "auxiliary_loss_clip": 0.01188939, + "auxiliary_loss_mlp": 0.01054834, + "balance_loss_clip": 1.0331738, + "balance_loss_mlp": 1.05371606, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.050916847484745, + "language_loss": 0.67764497, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70008272, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3515625, + "step": 1337, + "time_per_iteration": 2.506103038787842 + }, + { + "auxiliary_loss_clip": 0.01188826, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.04313135, + "balance_loss_mlp": 1.05480385, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8692893317328456, + "language_loss": 0.86701488, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88955414, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1338, + "time_per_iteration": 2.5451908111572266 + }, + { + "auxiliary_loss_clip": 0.01188004, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.0368793, + "balance_loss_mlp": 1.05142283, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.6265473040924725, + "language_loss": 0.87246621, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89494807, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.359375, + "step": 1339, + "time_per_iteration": 2.450932502746582 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02660179, + "balance_loss_mlp": 1.05106449, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.428441908593999, + "language_loss": 0.88819683, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91048771, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1340, + "time_per_iteration": 2.4539895057678223 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 0.99951285, + "balance_loss_mlp": 1.01727247, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8886760882983712, + "language_loss": 0.64806795, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66882515, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.5546875, + "step": 1341, + "time_per_iteration": 3.0034360885620117 + }, + { + "auxiliary_loss_clip": 0.01193907, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03577328, + "balance_loss_mlp": 1.05301166, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.817345215565239, + "language_loss": 0.89616883, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91871732, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1342, + "time_per_iteration": 2.479701042175293 + }, + { + "auxiliary_loss_clip": 0.01194936, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.05721259, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.7453135307928216, + "language_loss": 0.76378155, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78631246, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.375, + "step": 1343, + "time_per_iteration": 2.4969120025634766 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01008036, + "balance_loss_clip": 1.00446022, + "balance_loss_mlp": 1.01791215, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8963318804352591, + "language_loss": 0.57395822, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59476054, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.54296875, + "step": 1344, + "time_per_iteration": 2.9917871952056885 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.0105771, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.05523396, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.677643541218582, + "language_loss": 0.86665964, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88914657, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1345, + "time_per_iteration": 2.4601447582244873 + }, + { + "auxiliary_loss_clip": 0.01187459, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.05403256, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7098835991166323, + "language_loss": 0.87242532, + "learning_rate": 3.972857395313042e-06, + "loss": 0.894849, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1346, + "time_per_iteration": 2.4809892177581787 + }, + { + "auxiliary_loss_clip": 0.01185898, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.03256202, + "balance_loss_mlp": 1.05219567, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6659805361601863, + "language_loss": 0.92606491, + "learning_rate": 3.972793412113439e-06, + "loss": 0.94847363, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3359375, + "step": 1347, + "time_per_iteration": 2.4802379608154297 + }, + { + "auxiliary_loss_clip": 0.0118757, + "auxiliary_loss_mlp": 0.01057822, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.05471659, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 9.453605004454174, + "language_loss": 0.89181751, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91427147, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.328125, + "step": 1348, + "time_per_iteration": 2.4610300064086914 + }, + { + "auxiliary_loss_clip": 0.01185296, + "auxiliary_loss_mlp": 0.01056008, + "balance_loss_clip": 1.03420484, + "balance_loss_mlp": 1.05543983, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 2.4916215003739355, + "language_loss": 0.76796132, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7903744, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.296875, + "step": 1349, + "time_per_iteration": 2.4789178371429443 + }, + { + "auxiliary_loss_clip": 0.01187103, + "auxiliary_loss_mlp": 0.0105974, + "balance_loss_clip": 1.03669679, + "balance_loss_mlp": 1.05236626, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.126949034470324, + "language_loss": 0.88571703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90818548, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.34375, + "step": 1350, + "time_per_iteration": 2.43094539642334 + }, + { + "auxiliary_loss_clip": 0.01184059, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.05228257, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.044220866897066, + "language_loss": 0.82058489, + "learning_rate": 3.972536731254092e-06, + "loss": 0.843036, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1351, + "time_per_iteration": 6.688653469085693 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.02917862, + "balance_loss_mlp": 1.04863417, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.9894600711485977, + "language_loss": 0.75347674, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77585584, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.359375, + "step": 1352, + "time_per_iteration": 2.4888412952423096 + }, + { + "auxiliary_loss_clip": 0.01192461, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.03163338, + "balance_loss_mlp": 1.05483341, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.7603053493114211, + "language_loss": 0.82833469, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85081488, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1353, + "time_per_iteration": 2.522960901260376 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01010471, + "balance_loss_clip": 1.00694275, + "balance_loss_mlp": 1.01996851, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8931676068679675, + "language_loss": 0.5970993, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61793786, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.53125, + "step": 1354, + "time_per_iteration": 3.0639474391937256 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.03764629, + "balance_loss_mlp": 1.05431724, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7981329827127455, + "language_loss": 0.82785606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85033101, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1355, + "time_per_iteration": 2.4664132595062256 + }, + { + "auxiliary_loss_clip": 0.01186535, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.03619206, + "balance_loss_mlp": 1.05146575, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9123465925299232, + "language_loss": 0.70799643, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73048234, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3515625, + "step": 1356, + "time_per_iteration": 2.509061813354492 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.03169644, + "balance_loss_mlp": 1.05148005, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.53580294551395, + "language_loss": 0.70255458, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72499657, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3671875, + "step": 1357, + "time_per_iteration": 2.476951837539673 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_clip": 1.03067684, + "balance_loss_mlp": 1.05488217, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.6163823683714953, + "language_loss": 0.84186697, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86431682, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1358, + "time_per_iteration": 2.457376480102539 + }, + { + "auxiliary_loss_clip": 0.01190093, + "auxiliary_loss_mlp": 0.01056216, + "balance_loss_clip": 1.0310626, + "balance_loss_mlp": 1.05484545, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9894839389786314, + "language_loss": 1.02294087, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04540396, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3515625, + "step": 1359, + "time_per_iteration": 2.4723212718963623 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03344035, + "balance_loss_mlp": 1.0511415, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0666688933075963, + "language_loss": 0.82969773, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85212988, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1360, + "time_per_iteration": 2.5143508911132812 + }, + { + "auxiliary_loss_clip": 0.01190184, + "auxiliary_loss_mlp": 0.01062181, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.05335808, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.14797754608813, + "language_loss": 0.72352278, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74604642, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3671875, + "step": 1361, + "time_per_iteration": 2.458034038543701 + }, + { + "auxiliary_loss_clip": 0.01179057, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.04741335, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.8589819193374515, + "language_loss": 0.76781029, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79017377, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.3125, + "step": 1362, + "time_per_iteration": 2.472259759902954 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.0291419, + "balance_loss_mlp": 1.05449164, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.631594675791475, + "language_loss": 0.72409523, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74649096, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1363, + "time_per_iteration": 2.4447264671325684 + }, + { + "auxiliary_loss_clip": 0.01189235, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.03603828, + "balance_loss_mlp": 1.05607057, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 3.9166951523525464, + "language_loss": 0.77459586, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79710352, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.328125, + "step": 1364, + "time_per_iteration": 2.475141763687134 + }, + { + "auxiliary_loss_clip": 0.01190144, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.03745019, + "balance_loss_mlp": 1.05500793, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.6241179536013033, + "language_loss": 0.82025397, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1365, + "time_per_iteration": 2.493732452392578 + }, + { + "auxiliary_loss_clip": 0.0118713, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.05614781, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.3261283913074884, + "language_loss": 0.82173789, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84418333, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1366, + "time_per_iteration": 2.4809322357177734 + }, + { + "auxiliary_loss_clip": 0.01186928, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.05126381, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.8403828718649033, + "language_loss": 0.81534755, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83780599, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1367, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.03724277, + "balance_loss_mlp": 1.05413651, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3540874203263358, + "language_loss": 0.83644414, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85897589, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3828125, + "step": 1368, + "time_per_iteration": 2.453547716140747 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02694988, + "balance_loss_mlp": 1.05349994, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7360129433802456, + "language_loss": 0.81245828, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83476603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.28125, + "step": 1369, + "time_per_iteration": 2.527573585510254 + }, + { + "auxiliary_loss_clip": 0.01185735, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.02979898, + "balance_loss_mlp": 1.05528903, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.240857135161324, + "language_loss": 0.74790901, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77027786, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3046875, + "step": 1370, + "time_per_iteration": 2.5205185413360596 + }, + { + "auxiliary_loss_clip": 0.01189372, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.05480862, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6313231263601415, + "language_loss": 0.74633086, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76883852, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1371, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.01188254, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.05410123, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.0830704741847423, + "language_loss": 0.71080554, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73330408, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.34375, + "step": 1372, + "time_per_iteration": 2.574457883834839 + }, + { + "auxiliary_loss_clip": 0.0118845, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02750254, + "balance_loss_mlp": 1.05397415, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 3.137320584176607, + "language_loss": 0.88010907, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90251154, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.34375, + "step": 1373, + "time_per_iteration": 2.485727310180664 + }, + { + "auxiliary_loss_clip": 0.01186594, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.05331743, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7727067520163604, + "language_loss": 0.82349706, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84595209, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.328125, + "step": 1374, + "time_per_iteration": 2.5223724842071533 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.0032891, + "balance_loss_mlp": 1.02371156, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8248734910296001, + "language_loss": 0.60630989, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62714875, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.5234375, + "step": 1375, + "time_per_iteration": 3.0909183025360107 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.01006319, + "balance_loss_clip": 1.00221813, + "balance_loss_mlp": 1.02162504, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9071425511101782, + "language_loss": 0.62149519, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64230067, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.52734375, + "step": 1376, + "time_per_iteration": 2.991158962249756 + }, + { + "auxiliary_loss_clip": 0.01195866, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_clip": 1.04624534, + "balance_loss_mlp": 1.05995989, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9826192893196872, + "language_loss": 0.82601643, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84866917, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.359375, + "step": 1377, + "time_per_iteration": 2.5851728916168213 + }, + { + "auxiliary_loss_clip": 0.01188463, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.05601847, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.8760965133588865, + "language_loss": 0.84516692, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86762691, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1378, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.05516553, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.9551783234852504, + "language_loss": 0.87725681, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89978123, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3515625, + "step": 1379, + "time_per_iteration": 2.5428385734558105 + }, + { + "auxiliary_loss_clip": 0.01189534, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.03336358, + "balance_loss_mlp": 1.05776525, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 1.7573789229703745, + "language_loss": 0.78658688, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80904275, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1380, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.01061393, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.05878401, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2395713763978002, + "language_loss": 0.86146504, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88398302, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.3125, + "step": 1381, + "time_per_iteration": 2.470153331756592 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01060106, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.06063581, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.795546136319442, + "language_loss": 0.8817445, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90433335, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1382, + "time_per_iteration": 2.4352822303771973 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.0569818, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.6234570747150734, + "language_loss": 0.77606535, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79856908, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.34375, + "step": 1383, + "time_per_iteration": 2.45939040184021 + }, + { + "auxiliary_loss_clip": 0.01194291, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.05730414, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.1508484512905945, + "language_loss": 0.8293128, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85181862, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1384, + "time_per_iteration": 2.4773356914520264 + }, + { + "auxiliary_loss_clip": 0.01198678, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.02937245, + "balance_loss_mlp": 1.05890989, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.4890613364481893, + "language_loss": 0.84828049, + "learning_rate": 3.970306639845e-06, + "loss": 0.87081897, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3984375, + "step": 1385, + "time_per_iteration": 2.5084009170532227 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01066074, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.05825758, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.123672194513448, + "language_loss": 0.68744183, + "learning_rate": 3.970239740938835e-06, + "loss": 0.7100516, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3671875, + "step": 1386, + "time_per_iteration": 2.477592945098877 + }, + { + "auxiliary_loss_clip": 0.01191265, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03186047, + "balance_loss_mlp": 1.05579662, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7726596290820096, + "language_loss": 0.82067239, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84314626, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.359375, + "step": 1387, + "time_per_iteration": 2.529261350631714 + }, + { + "auxiliary_loss_clip": 0.01196512, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.05739772, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.385304875072474, + "language_loss": 0.77194649, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79461324, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.390625, + "step": 1388, + "time_per_iteration": 2.4517693519592285 + }, + { + "auxiliary_loss_clip": 0.01187734, + "auxiliary_loss_mlp": 0.01059717, + "balance_loss_clip": 1.0351125, + "balance_loss_mlp": 1.0574429, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.246368739161805, + "language_loss": 0.79078835, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81326282, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3046875, + "step": 1389, + "time_per_iteration": 2.4999983310699463 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.03368866, + "balance_loss_mlp": 1.05773938, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 4.533904477221136, + "language_loss": 0.87495124, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89746046, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.359375, + "step": 1390, + "time_per_iteration": 2.438126802444458 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.05621624, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6928828016377326, + "language_loss": 0.86753631, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89007682, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.34375, + "step": 1391, + "time_per_iteration": 2.5615429878234863 + }, + { + "auxiliary_loss_clip": 0.01198327, + "auxiliary_loss_mlp": 0.01071606, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.05904424, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 4.090701354718017, + "language_loss": 0.87550449, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89820385, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1392, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.03682983, + "balance_loss_mlp": 1.05556941, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.9857894096842457, + "language_loss": 0.80519998, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82771087, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1393, + "time_per_iteration": 3.9978342056274414 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01054176, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.05832088, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8413427873168604, + "language_loss": 0.84738398, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86984503, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3359375, + "step": 1394, + "time_per_iteration": 3.995389461517334 + }, + { + "auxiliary_loss_clip": 0.01193271, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.0296433, + "balance_loss_mlp": 1.05856824, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.7688902284368797, + "language_loss": 0.82957625, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85204601, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1395, + "time_per_iteration": 2.5080416202545166 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.03683722, + "balance_loss_mlp": 1.05833054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9626395114639965, + "language_loss": 0.82492781, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84750068, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3671875, + "step": 1396, + "time_per_iteration": 2.51763653755188 + }, + { + "auxiliary_loss_clip": 0.01191589, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.03253114, + "balance_loss_mlp": 1.05944824, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3756879295671367, + "language_loss": 0.7702114, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79271495, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.3203125, + "step": 1397, + "time_per_iteration": 2.522019624710083 + }, + { + "auxiliary_loss_clip": 0.01191257, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02635193, + "balance_loss_mlp": 1.05688787, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1333990758799795, + "language_loss": 0.77589226, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79831308, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.34375, + "step": 1398, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.03366995, + "balance_loss_mlp": 1.05604136, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 6.547707007931562, + "language_loss": 0.94411373, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96655744, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3125, + "step": 1399, + "time_per_iteration": 2.458564043045044 + }, + { + "auxiliary_loss_clip": 0.01192876, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.05564523, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3313569082148637, + "language_loss": 0.82052553, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84306407, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1400, + "time_per_iteration": 2.511075258255005 + }, + { + "auxiliary_loss_clip": 0.01191821, + "auxiliary_loss_mlp": 0.01061122, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.05681479, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.6029570836648723, + "language_loss": 0.86615682, + "learning_rate": 3.969227293371099e-06, + "loss": 0.8886863, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1401, + "time_per_iteration": 2.5328855514526367 + }, + { + "auxiliary_loss_clip": 0.01190636, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.05496573, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2778357332658543, + "language_loss": 0.87128234, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89382625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1402, + "time_per_iteration": 2.4695520401000977 + }, + { + "auxiliary_loss_clip": 0.0118725, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.03340352, + "balance_loss_mlp": 1.0542388, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.954964391273458, + "language_loss": 0.88680542, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90924418, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.328125, + "step": 1403, + "time_per_iteration": 2.6655161380767822 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01056388, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.05429792, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.9645692036725415, + "language_loss": 0.80325729, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82571673, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1404, + "time_per_iteration": 2.5011603832244873 + }, + { + "auxiliary_loss_clip": 0.01195719, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_clip": 1.04089534, + "balance_loss_mlp": 1.05798006, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1059643070764027, + "language_loss": 0.83845061, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86106849, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1405, + "time_per_iteration": 2.4612858295440674 + }, + { + "auxiliary_loss_clip": 0.01188265, + "auxiliary_loss_mlp": 0.01056168, + "balance_loss_clip": 1.03314888, + "balance_loss_mlp": 1.05381966, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.7581309060245893, + "language_loss": 0.80343008, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82587439, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.34375, + "step": 1406, + "time_per_iteration": 2.496676206588745 + }, + { + "auxiliary_loss_clip": 0.01192497, + "auxiliary_loss_mlp": 0.01065969, + "balance_loss_clip": 1.0421989, + "balance_loss_mlp": 1.05858994, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8241253914082192, + "language_loss": 0.79411483, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3359375, + "step": 1407, + "time_per_iteration": 2.491055727005005 + }, + { + "auxiliary_loss_clip": 0.01188371, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.03234673, + "balance_loss_mlp": 1.05521655, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 4.541456574357825, + "language_loss": 0.91929626, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94173807, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.328125, + "step": 1408, + "time_per_iteration": 2.44599986076355 + }, + { + "auxiliary_loss_clip": 0.01074059, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.02074611, + "balance_loss_mlp": 1.02193737, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8980094129226197, + "language_loss": 0.61861706, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63960779, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.5234375, + "step": 1409, + "time_per_iteration": 3.1084799766540527 + }, + { + "auxiliary_loss_clip": 0.01180993, + "auxiliary_loss_mlp": 0.01060196, + "balance_loss_clip": 1.03784466, + "balance_loss_mlp": 1.05419254, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.25814404402445, + "language_loss": 0.86819237, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89060426, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.265625, + "step": 1410, + "time_per_iteration": 2.4854791164398193 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.0309782, + "balance_loss_mlp": 1.05453801, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.048224684561652, + "language_loss": 0.74138093, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76383173, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3359375, + "step": 1411, + "time_per_iteration": 2.484879970550537 + }, + { + "auxiliary_loss_clip": 0.01068033, + "auxiliary_loss_mlp": 0.01005767, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.01640451, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9041737870208939, + "language_loss": 0.56723791, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58797586, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.515625, + "step": 1412, + "time_per_iteration": 3.003227949142456 + }, + { + "auxiliary_loss_clip": 0.01183878, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 1.05354273, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.0338814511208883, + "language_loss": 0.89084172, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91330159, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3046875, + "step": 1413, + "time_per_iteration": 2.4545698165893555 + }, + { + "auxiliary_loss_clip": 0.01186591, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03019929, + "balance_loss_mlp": 1.0562067, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1859301398641415, + "language_loss": 0.8807795, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90319026, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3046875, + "step": 1414, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01056984, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.0540117, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.82577143383273, + "language_loss": 0.77434587, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79677355, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3125, + "step": 1415, + "time_per_iteration": 2.510671615600586 + }, + { + "auxiliary_loss_clip": 0.01185616, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.05612898, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.848223104879299, + "language_loss": 0.70859981, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73111296, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.296875, + "step": 1416, + "time_per_iteration": 2.827016592025757 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.05693281, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 1.9370001986884609, + "language_loss": 0.74855268, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77108514, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1417, + "time_per_iteration": 2.51518177986145 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.03157723, + "balance_loss_mlp": 1.05394006, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.566029486363868, + "language_loss": 0.82460356, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84700227, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3046875, + "step": 1418, + "time_per_iteration": 2.4632515907287598 + }, + { + "auxiliary_loss_clip": 0.01078096, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.02221191, + "balance_loss_mlp": 1.0269177, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8662062784105238, + "language_loss": 0.56616145, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58720386, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.51171875, + "step": 1419, + "time_per_iteration": 3.0262646675109863 + }, + { + "auxiliary_loss_clip": 0.01185611, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_clip": 1.03858972, + "balance_loss_mlp": 1.05284262, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.301787344693911, + "language_loss": 0.69764268, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72012818, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.328125, + "step": 1420, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01182824, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02912498, + "balance_loss_mlp": 1.05232763, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.56579546013663, + "language_loss": 0.87886292, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90121067, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1421, + "time_per_iteration": 2.498198986053467 + }, + { + "auxiliary_loss_clip": 0.01069987, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.01909983, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7935144939089421, + "language_loss": 0.63490081, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65564084, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.5078125, + "step": 1422, + "time_per_iteration": 3.050874948501587 + }, + { + "auxiliary_loss_clip": 0.01182797, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.04182768, + "balance_loss_mlp": 1.05538559, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.040119561169685, + "language_loss": 0.83427018, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85674852, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1423, + "time_per_iteration": 2.525075674057007 + }, + { + "auxiliary_loss_clip": 0.01190455, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.05613029, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.7627385415604107, + "language_loss": 0.74945033, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77194929, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1424, + "time_per_iteration": 2.523231029510498 + }, + { + "auxiliary_loss_clip": 0.01185893, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.05510807, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9395290082560723, + "language_loss": 0.7574805, + "learning_rate": 3.96757243383196e-06, + "loss": 0.7799021, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1425, + "time_per_iteration": 2.441420793533325 + }, + { + "auxiliary_loss_clip": 0.01183386, + "auxiliary_loss_mlp": 0.01053965, + "balance_loss_clip": 1.03092194, + "balance_loss_mlp": 1.05407834, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.579491371045568, + "language_loss": 0.93504989, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95742333, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1426, + "time_per_iteration": 2.4703657627105713 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.05764198, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.235647808517122, + "language_loss": 0.75003266, + "learning_rate": 3.967432588494471e-06, + "loss": 0.772614, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.34375, + "step": 1427, + "time_per_iteration": 2.4430549144744873 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01061112, + "balance_loss_clip": 1.03907049, + "balance_loss_mlp": 1.05315089, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.3372587699614726, + "language_loss": 0.81915152, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84158677, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1428, + "time_per_iteration": 2.454441785812378 + }, + { + "auxiliary_loss_clip": 0.01189987, + "auxiliary_loss_mlp": 0.01066735, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.05586076, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.395570851050941, + "language_loss": 0.79697371, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81954098, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.34375, + "step": 1429, + "time_per_iteration": 2.5411579608917236 + }, + { + "auxiliary_loss_clip": 0.0119024, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.05773449, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.301464625204156, + "language_loss": 0.88055587, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90308148, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1430, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01184535, + "auxiliary_loss_mlp": 0.01072949, + "balance_loss_clip": 1.04995334, + "balance_loss_mlp": 1.05712664, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7504719201320615, + "language_loss": 0.81914723, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84172201, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2734375, + "step": 1431, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01184756, + "auxiliary_loss_mlp": 0.01056491, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.05376828, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.9949655353101803, + "language_loss": 0.77759397, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80000651, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1432, + "time_per_iteration": 2.5344104766845703 + }, + { + "auxiliary_loss_clip": 0.0118072, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.03497803, + "balance_loss_mlp": 1.05027151, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.2873036973179603, + "language_loss": 0.73330259, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75570011, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3046875, + "step": 1433, + "time_per_iteration": 2.4787938594818115 + }, + { + "auxiliary_loss_clip": 0.01188497, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.05464733, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.615593579271415, + "language_loss": 0.85741955, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87989259, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3359375, + "step": 1434, + "time_per_iteration": 5.500946998596191 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.05177212, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 3.0513138823403825, + "language_loss": 0.78913063, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81149966, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1435, + "time_per_iteration": 3.899777412414551 + }, + { + "auxiliary_loss_clip": 0.01070575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02010655, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8910926846424677, + "language_loss": 0.57930011, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60028332, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.5078125, + "step": 1436, + "time_per_iteration": 3.179255247116089 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.02633083, + "balance_loss_mlp": 1.05314159, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.429993259280604, + "language_loss": 0.68775386, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71010828, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.328125, + "step": 1437, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.01185365, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.02806163, + "balance_loss_mlp": 1.05388093, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.5641138848438163, + "language_loss": 0.7274068, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74976349, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3125, + "step": 1438, + "time_per_iteration": 2.4840176105499268 + }, + { + "auxiliary_loss_clip": 0.01183596, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_clip": 1.02594447, + "balance_loss_mlp": 1.05472374, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.681614476681305, + "language_loss": 0.64628494, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66861117, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2890625, + "step": 1439, + "time_per_iteration": 2.61686372756958 + }, + { + "auxiliary_loss_clip": 0.01187197, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.05638909, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.062065757985673, + "language_loss": 0.87748063, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89990479, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3125, + "step": 1440, + "time_per_iteration": 2.5116493701934814 + }, + { + "auxiliary_loss_clip": 0.01188419, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.03538251, + "balance_loss_mlp": 1.0540843, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.4102507257620363, + "language_loss": 0.83243793, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491961, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1441, + "time_per_iteration": 2.5058300495147705 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01009256, + "balance_loss_clip": 1.00525022, + "balance_loss_mlp": 1.01939523, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8461220926791603, + "language_loss": 0.60426581, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62505859, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.5078125, + "step": 1442, + "time_per_iteration": 3.1946628093719482 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01057232, + "balance_loss_clip": 1.03379524, + "balance_loss_mlp": 1.05709028, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.2809405592870835, + "language_loss": 0.79264277, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81513512, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.3515625, + "step": 1443, + "time_per_iteration": 2.477691411972046 + }, + { + "auxiliary_loss_clip": 0.01185255, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.03170311, + "balance_loss_mlp": 1.05261874, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.764762918327591, + "language_loss": 0.82248437, + "learning_rate": 3.966231856532584e-06, + "loss": 0.8448779, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1444, + "time_per_iteration": 2.584773063659668 + }, + { + "auxiliary_loss_clip": 0.01189581, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.02745867, + "balance_loss_mlp": 1.05537939, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.945627197742621, + "language_loss": 0.86856627, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89096129, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1445, + "time_per_iteration": 2.506258964538574 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01054326, + "balance_loss_clip": 1.03303528, + "balance_loss_mlp": 1.05808067, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9763924186655837, + "language_loss": 0.81639445, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8388319, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.3125, + "step": 1446, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.01005416, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.0147202, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.728477241136595, + "language_loss": 0.54725462, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56795579, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.5, + "step": 1447, + "time_per_iteration": 3.1009976863861084 + }, + { + "auxiliary_loss_clip": 0.01178637, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.03104973, + "balance_loss_mlp": 1.05198455, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.2332818090387243, + "language_loss": 0.84593046, + "learning_rate": 3.965946199367804e-06, + "loss": 0.8682456, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1448, + "time_per_iteration": 2.483792543411255 + }, + { + "auxiliary_loss_clip": 0.01185215, + "auxiliary_loss_mlp": 0.01056358, + "balance_loss_clip": 1.03386295, + "balance_loss_mlp": 1.0524509, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.099884448391289, + "language_loss": 0.80688727, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82930297, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1449, + "time_per_iteration": 2.4637081623077393 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.02862501, + "balance_loss_mlp": 1.05370414, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 4.183651889411507, + "language_loss": 0.71012592, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73244655, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1450, + "time_per_iteration": 2.6521542072296143 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.03057098, + "balance_loss_mlp": 1.05502534, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.8266796466048172, + "language_loss": 0.83492875, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85729253, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1451, + "time_per_iteration": 2.4866271018981934 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.05371869, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.850339391564711, + "language_loss": 0.74351519, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76589811, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2734375, + "step": 1452, + "time_per_iteration": 2.5450925827026367 + }, + { + "auxiliary_loss_clip": 0.01182798, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_clip": 1.03840256, + "balance_loss_mlp": 1.05121017, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.3421371051734474, + "language_loss": 0.79840016, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82084292, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1453, + "time_per_iteration": 2.49350643157959 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_clip": 1.04213262, + "balance_loss_mlp": 1.0545752, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.982640213979625, + "language_loss": 0.71298045, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73545539, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.28125, + "step": 1454, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02608728, + "balance_loss_mlp": 1.02026677, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7993884765543664, + "language_loss": 0.58655661, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6075514, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.0300293, + "router_z_loss_mlp": 0.5, + "step": 1455, + "time_per_iteration": 3.088113307952881 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05210626, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5590098662562957, + "language_loss": 0.77404714, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79646254, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3046875, + "step": 1456, + "time_per_iteration": 2.6145191192626953 + }, + { + "auxiliary_loss_clip": 0.01182283, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.02888715, + "balance_loss_mlp": 1.05235434, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.3657198267749777, + "language_loss": 0.72391665, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74625528, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1457, + "time_per_iteration": 2.6438605785369873 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.02623844, + "balance_loss_mlp": 1.05207849, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5929331180335078, + "language_loss": 0.86215973, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88442671, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1458, + "time_per_iteration": 2.539658546447754 + }, + { + "auxiliary_loss_clip": 0.01189161, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.05887103, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.660016084678777, + "language_loss": 0.80662763, + "learning_rate": 3.965154492406486e-06, + "loss": 0.8291173, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1459, + "time_per_iteration": 2.4880902767181396 + }, + { + "auxiliary_loss_clip": 0.01187526, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.05512893, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.474003232718447, + "language_loss": 0.84058738, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86300415, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.328125, + "step": 1460, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01178547, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.05051732, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.696872821623283, + "language_loss": 0.81030595, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83263445, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.28125, + "step": 1461, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.01187345, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_clip": 1.03795433, + "balance_loss_mlp": 1.05579305, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.656505593412751, + "language_loss": 0.76405656, + "learning_rate": 3.964937007276932e-06, + "loss": 0.786529, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3125, + "step": 1462, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01190578, + "auxiliary_loss_mlp": 0.01058183, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.05753493, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.4277854967530663, + "language_loss": 0.74615479, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76864231, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.328125, + "step": 1463, + "time_per_iteration": 2.46510648727417 + }, + { + "auxiliary_loss_clip": 0.01189177, + "auxiliary_loss_mlp": 0.0106376, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.05380559, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.09054267836168, + "language_loss": 0.83423382, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85676318, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3515625, + "step": 1464, + "time_per_iteration": 2.5343735218048096 + }, + { + "auxiliary_loss_clip": 0.01183588, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.05336595, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 4.267071209901202, + "language_loss": 0.78351951, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80604541, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.296875, + "step": 1465, + "time_per_iteration": 2.4745209217071533 + }, + { + "auxiliary_loss_clip": 0.01190864, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_clip": 1.0371089, + "balance_loss_mlp": 1.05628061, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8950228405880263, + "language_loss": 0.84698099, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.86948144, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.34375, + "step": 1466, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.0105874, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.05407715, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 3.8136580791310783, + "language_loss": 0.84233636, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86477506, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1467, + "time_per_iteration": 2.5413413047790527 + }, + { + "auxiliary_loss_clip": 0.01183856, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7481416698073104, + "language_loss": 0.75517243, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7775712, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1468, + "time_per_iteration": 2.496363878250122 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.05570245, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.7579385887345491, + "language_loss": 0.80601043, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82842672, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2890625, + "step": 1469, + "time_per_iteration": 2.5486512184143066 + }, + { + "auxiliary_loss_clip": 0.01187777, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.0321182, + "balance_loss_mlp": 1.05454695, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 3.202810753535508, + "language_loss": 0.77607989, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7985025, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3359375, + "step": 1470, + "time_per_iteration": 2.6632297039031982 + }, + { + "auxiliary_loss_clip": 0.01182287, + "auxiliary_loss_mlp": 0.0106647, + "balance_loss_clip": 1.04266429, + "balance_loss_mlp": 1.05412459, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.774803600242038, + "language_loss": 0.84233272, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86482024, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.28125, + "step": 1471, + "time_per_iteration": 2.5040950775146484 + }, + { + "auxiliary_loss_clip": 0.01178062, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.05459309, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6761790638208889, + "language_loss": 0.83481324, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85712093, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.234375, + "step": 1472, + "time_per_iteration": 2.5079073905944824 + }, + { + "auxiliary_loss_clip": 0.01185739, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.05491877, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.396127276436556, + "language_loss": 0.828246, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85069156, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1473, + "time_per_iteration": 2.4919679164886475 + }, + { + "auxiliary_loss_clip": 0.01183368, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.04040098, + "balance_loss_mlp": 1.05414963, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.8346488607114506, + "language_loss": 0.78871369, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81116265, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1474, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01057333, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.05450511, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.918961213895669, + "language_loss": 0.79045832, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81284976, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1475, + "time_per_iteration": 2.495753765106201 + }, + { + "auxiliary_loss_clip": 0.01184034, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.05443335, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6510632676992876, + "language_loss": 0.73973525, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76205671, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1476, + "time_per_iteration": 6.925957679748535 + }, + { + "auxiliary_loss_clip": 0.0118493, + "auxiliary_loss_mlp": 0.01060562, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.05454326, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.527991814504802, + "language_loss": 0.74644423, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76889908, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3046875, + "step": 1477, + "time_per_iteration": 2.6033589839935303 + }, + { + "auxiliary_loss_clip": 0.01181345, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.02571976, + "balance_loss_mlp": 1.05315852, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.4237564416671002, + "language_loss": 0.86488914, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88718438, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1478, + "time_per_iteration": 2.5188398361206055 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.03599334, + "balance_loss_mlp": 1.05417609, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 7.715019285918926, + "language_loss": 0.77988106, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80228484, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.28125, + "step": 1479, + "time_per_iteration": 2.50730562210083 + }, + { + "auxiliary_loss_clip": 0.01180801, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 1.05275774, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.3628139464189815, + "language_loss": 0.78267598, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80501914, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1480, + "time_per_iteration": 2.512730360031128 + }, + { + "auxiliary_loss_clip": 0.01185027, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.05357075, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 3.1949876590170825, + "language_loss": 0.66627192, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68875289, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3125, + "step": 1481, + "time_per_iteration": 2.4874138832092285 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.03040504, + "balance_loss_mlp": 1.05519605, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.9560930463008703, + "language_loss": 0.9644348, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98677909, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.2734375, + "step": 1482, + "time_per_iteration": 2.484274387359619 + }, + { + "auxiliary_loss_clip": 0.01190541, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 1.0577234, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.358614174414972, + "language_loss": 0.78436875, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80683142, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.328125, + "step": 1483, + "time_per_iteration": 2.566199779510498 + }, + { + "auxiliary_loss_clip": 0.01183147, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.04891825, + "balance_loss_mlp": 1.05463076, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.232834813834399, + "language_loss": 0.86091626, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88347292, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1484, + "time_per_iteration": 2.4742467403411865 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.04152799, + "balance_loss_mlp": 1.0570302, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7135103732453094, + "language_loss": 0.80460989, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82716757, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.34375, + "step": 1485, + "time_per_iteration": 2.5808591842651367 + }, + { + "auxiliary_loss_clip": 0.01182644, + "auxiliary_loss_mlp": 0.01059654, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.05256486, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.0833446931013144, + "language_loss": 0.8295821, + "learning_rate": 3.96317299108688e-06, + "loss": 0.852005, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1486, + "time_per_iteration": 2.5060923099517822 + }, + { + "auxiliary_loss_clip": 0.01184012, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.05506349, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6673763915473876, + "language_loss": 0.76653707, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78897893, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1487, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.01181982, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.05203557, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.360836711926668, + "language_loss": 0.83246535, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85491836, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.296875, + "step": 1488, + "time_per_iteration": 2.48189377784729 + }, + { + "auxiliary_loss_clip": 0.01180173, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.05375743, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9508187836998312, + "language_loss": 0.71647823, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73879659, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.265625, + "step": 1489, + "time_per_iteration": 2.701035737991333 + }, + { + "auxiliary_loss_clip": 0.01178824, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.05088401, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8144641128553483, + "language_loss": 0.89490288, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91722786, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1490, + "time_per_iteration": 2.676098108291626 + }, + { + "auxiliary_loss_clip": 0.01187914, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_clip": 1.0416671, + "balance_loss_mlp": 1.05264366, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.165908760559946, + "language_loss": 0.73276365, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75528657, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3515625, + "step": 1491, + "time_per_iteration": 2.5531163215637207 + }, + { + "auxiliary_loss_clip": 0.01181575, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.02984166, + "balance_loss_mlp": 1.05362582, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6884120279290091, + "language_loss": 0.77121007, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79353207, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.28125, + "step": 1492, + "time_per_iteration": 2.485531806945801 + }, + { + "auxiliary_loss_clip": 0.01180742, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.03324914, + "balance_loss_mlp": 1.05471706, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.0059524225222414, + "language_loss": 0.71168351, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73404551, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2578125, + "step": 1493, + "time_per_iteration": 2.5819149017333984 + }, + { + "auxiliary_loss_clip": 0.01184961, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.03304577, + "balance_loss_mlp": 1.05477107, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.7443337417031568, + "language_loss": 0.86910093, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89151227, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1494, + "time_per_iteration": 2.491126775741577 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.05289626, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7176751495851263, + "language_loss": 0.83065581, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1495, + "time_per_iteration": 2.463747501373291 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.05825078, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.861203767183833, + "language_loss": 0.69813877, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72057784, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1496, + "time_per_iteration": 2.4409985542297363 + }, + { + "auxiliary_loss_clip": 0.01180533, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03035557, + "balance_loss_mlp": 1.05325341, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6091347390483586, + "language_loss": 0.79913563, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82145333, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2734375, + "step": 1497, + "time_per_iteration": 2.492732048034668 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.0105809, + "balance_loss_clip": 1.03484416, + "balance_loss_mlp": 1.05299318, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.3611651581227915, + "language_loss": 0.8262192, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84866548, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3359375, + "step": 1498, + "time_per_iteration": 2.492124080657959 + }, + { + "auxiliary_loss_clip": 0.01188542, + "auxiliary_loss_mlp": 0.01061597, + "balance_loss_clip": 1.0402112, + "balance_loss_mlp": 1.05628157, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.316244908481527, + "language_loss": 0.7849865, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80748791, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 1499, + "time_per_iteration": 2.455986738204956 + }, + { + "auxiliary_loss_clip": 0.0117942, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.05351877, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.652937184766999, + "language_loss": 0.93453979, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95688522, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1500, + "time_per_iteration": 2.481450080871582 + }, + { + "auxiliary_loss_clip": 0.01182931, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.05170345, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.9257189866461966, + "language_loss": 0.74465239, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76699102, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3125, + "step": 1501, + "time_per_iteration": 2.4806344509124756 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 0.99992049, + "balance_loss_mlp": 1.02834833, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7322723529864947, + "language_loss": 0.58304042, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60384637, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.48828125, + "step": 1502, + "time_per_iteration": 3.066755771636963 + }, + { + "auxiliary_loss_clip": 0.01178455, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.02655029, + "balance_loss_mlp": 1.05134845, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.407651446444188, + "language_loss": 0.69502187, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71728474, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2734375, + "step": 1503, + "time_per_iteration": 2.608006000518799 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.0508244, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.015182939383952, + "language_loss": 0.86142361, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88378185, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.3046875, + "step": 1504, + "time_per_iteration": 2.489906072616577 + }, + { + "auxiliary_loss_clip": 0.01188306, + "auxiliary_loss_mlp": 0.01064134, + "balance_loss_clip": 1.03871906, + "balance_loss_mlp": 1.05330658, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.9466916160800904, + "language_loss": 0.72267938, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74520379, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1505, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01179818, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.05332816, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3726339000283447, + "language_loss": 0.80946511, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83180916, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.265625, + "step": 1506, + "time_per_iteration": 2.4512932300567627 + }, + { + "auxiliary_loss_clip": 0.01179114, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.0531404, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1533698580433254, + "language_loss": 0.76043189, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78271914, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.2578125, + "step": 1507, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01009923, + "balance_loss_clip": 1.00679994, + "balance_loss_mlp": 1.01922798, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7312512202665958, + "language_loss": 0.57670546, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59747648, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.03112793, + "router_z_loss_mlp": 0.48046875, + "step": 1508, + "time_per_iteration": 2.9330992698669434 + }, + { + "auxiliary_loss_clip": 0.01182207, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.0313319, + "balance_loss_mlp": 1.05309892, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.072562238387217, + "language_loss": 0.85046542, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87281442, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1509, + "time_per_iteration": 2.475606918334961 + }, + { + "auxiliary_loss_clip": 0.01189974, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.05606115, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.413703760690829, + "language_loss": 0.84302551, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3359375, + "step": 1510, + "time_per_iteration": 2.576070785522461 + }, + { + "auxiliary_loss_clip": 0.01184002, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.0335387, + "balance_loss_mlp": 1.05408144, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9204492801986277, + "language_loss": 0.85558611, + "learning_rate": 3.961289878108262e-06, + "loss": 0.8779816, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.296875, + "step": 1511, + "time_per_iteration": 2.5085484981536865 + }, + { + "auxiliary_loss_clip": 0.01181957, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.02690685, + "balance_loss_mlp": 1.05469918, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5775523407684693, + "language_loss": 0.84897017, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87127548, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2734375, + "step": 1512, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01175178, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.02888274, + "balance_loss_mlp": 1.05033123, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.9006324958480167, + "language_loss": 0.86704344, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88929009, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.25, + "step": 1513, + "time_per_iteration": 2.475271701812744 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.0536902, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6716164971548293, + "language_loss": 0.86379707, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8861233, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.2734375, + "step": 1514, + "time_per_iteration": 2.5317347049713135 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.05550981, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.9279276264910965, + "language_loss": 0.89882755, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92124808, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.265625, + "step": 1515, + "time_per_iteration": 2.5507757663726807 + }, + { + "auxiliary_loss_clip": 0.011822, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.03174293, + "balance_loss_mlp": 1.05321527, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0145121179505905, + "language_loss": 0.85567206, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87803847, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1516, + "time_per_iteration": 2.524787425994873 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.05217946, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5232376391767188, + "language_loss": 0.81104374, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83340514, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.3125, + "step": 1517, + "time_per_iteration": 2.5781173706054688 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.01068952, + "balance_loss_clip": 1.04729199, + "balance_loss_mlp": 1.05378699, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6026665805728266, + "language_loss": 0.78008473, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80262554, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3125, + "step": 1518, + "time_per_iteration": 4.000938653945923 + }, + { + "auxiliary_loss_clip": 0.01179619, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.03851235, + "balance_loss_mlp": 1.05189955, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.883609624415087, + "language_loss": 0.86375809, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88615477, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.28125, + "step": 1519, + "time_per_iteration": 3.945183277130127 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.01053198, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.05196333, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.4149150298084425, + "language_loss": 0.73425877, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75659597, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.28125, + "step": 1520, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01181506, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.0525614, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6960755220153825, + "language_loss": 0.85296613, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87533194, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2890625, + "step": 1521, + "time_per_iteration": 2.478440761566162 + }, + { + "auxiliary_loss_clip": 0.01183058, + "auxiliary_loss_mlp": 0.01057495, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.05319118, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.1543470058122876, + "language_loss": 0.83979875, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86220425, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.296875, + "step": 1522, + "time_per_iteration": 2.4761834144592285 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.03500533, + "balance_loss_mlp": 1.05125594, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.174137545904809, + "language_loss": 0.810691, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83301324, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.234375, + "step": 1523, + "time_per_iteration": 2.525385618209839 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.05365944, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.529065997296093, + "language_loss": 0.74591744, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76838291, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.296875, + "step": 1524, + "time_per_iteration": 2.4293112754821777 + }, + { + "auxiliary_loss_clip": 0.01181121, + "auxiliary_loss_mlp": 0.01060116, + "balance_loss_clip": 1.03822935, + "balance_loss_mlp": 1.05373263, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0870290485059586, + "language_loss": 0.861516, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88392842, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1525, + "time_per_iteration": 2.449774980545044 + }, + { + "auxiliary_loss_clip": 0.01181752, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.02577078, + "balance_loss_mlp": 1.05424511, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.3600448138049597, + "language_loss": 0.74690467, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76919985, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1526, + "time_per_iteration": 2.5295088291168213 + }, + { + "auxiliary_loss_clip": 0.01177679, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03058743, + "balance_loss_mlp": 1.05291057, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.970734062299861, + "language_loss": 0.7736311, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79592943, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1527, + "time_per_iteration": 2.465484142303467 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.05090261, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.9755082573034908, + "language_loss": 0.78465801, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80698651, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1528, + "time_per_iteration": 2.5257718563079834 + }, + { + "auxiliary_loss_clip": 0.01177926, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.0276351, + "balance_loss_mlp": 1.05085492, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.6736868569465813, + "language_loss": 0.76880527, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79107177, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2734375, + "step": 1529, + "time_per_iteration": 2.4417288303375244 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.0306139, + "balance_loss_mlp": 1.05037212, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.767002219307874, + "language_loss": 0.83118784, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85352623, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.296875, + "step": 1530, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01173477, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.03723454, + "balance_loss_mlp": 1.05024123, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.058190265763826, + "language_loss": 0.8408612, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86318833, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1531, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.01177383, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.02728868, + "balance_loss_mlp": 1.05083799, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.8347699676368683, + "language_loss": 0.81135088, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83361435, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1532, + "time_per_iteration": 2.506875991821289 + }, + { + "auxiliary_loss_clip": 0.01179012, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.03044105, + "balance_loss_mlp": 1.05169332, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.8650949584676202, + "language_loss": 0.83489287, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85721242, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2734375, + "step": 1533, + "time_per_iteration": 2.5279369354248047 + }, + { + "auxiliary_loss_clip": 0.01181754, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.02662432, + "balance_loss_mlp": 1.05468941, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.8226281566677605, + "language_loss": 0.89789164, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92019475, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1534, + "time_per_iteration": 2.498732089996338 + }, + { + "auxiliary_loss_clip": 0.01178154, + "auxiliary_loss_mlp": 0.01064045, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.04994035, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.6410414613778777, + "language_loss": 0.75911283, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78153479, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.28125, + "step": 1535, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.04907823, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8388387816947327, + "language_loss": 0.81344318, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83558822, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1536, + "time_per_iteration": 2.5075631141662598 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.04995418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.109198419692537, + "language_loss": 0.8921392, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91439736, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1537, + "time_per_iteration": 2.4454562664031982 + }, + { + "auxiliary_loss_clip": 0.01177438, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.03638315, + "balance_loss_mlp": 1.05164456, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.1959440535625285, + "language_loss": 0.8072964, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82966185, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2578125, + "step": 1538, + "time_per_iteration": 2.50838303565979 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.0101212, + "balance_loss_clip": 1.0091517, + "balance_loss_mlp": 1.01794529, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.74443800558722, + "language_loss": 0.57375526, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59453678, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.48046875, + "step": 1539, + "time_per_iteration": 3.16038179397583 + }, + { + "auxiliary_loss_clip": 0.01179737, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.05291581, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.903908071477431, + "language_loss": 0.67164814, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69395947, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.265625, + "step": 1540, + "time_per_iteration": 2.488809585571289 + }, + { + "auxiliary_loss_clip": 0.01178592, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02154827, + "balance_loss_mlp": 1.05285096, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.797248436862791, + "language_loss": 0.83666921, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85888791, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1541, + "time_per_iteration": 2.5406758785247803 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01009011, + "balance_loss_clip": 1.0061146, + "balance_loss_mlp": 1.01339245, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8904869203130611, + "language_loss": 0.6196329, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64032996, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.47265625, + "step": 1542, + "time_per_iteration": 3.0973262786865234 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03486192, + "balance_loss_mlp": 1.05283189, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.711071573157868, + "language_loss": 0.82672381, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84905624, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.25, + "step": 1543, + "time_per_iteration": 2.489415168762207 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.03462195, + "balance_loss_mlp": 1.05128777, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6169278883375504, + "language_loss": 0.72058821, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74287981, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1544, + "time_per_iteration": 2.7986748218536377 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.05111873, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7012123784712243, + "language_loss": 0.77617419, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79842126, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1545, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01050414, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0525856, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.736353511607615, + "language_loss": 0.74531418, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76755565, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1546, + "time_per_iteration": 2.456806182861328 + }, + { + "auxiliary_loss_clip": 0.01180806, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.02968979, + "balance_loss_mlp": 1.05292201, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.1086065935537284, + "language_loss": 0.84392273, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86624783, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1547, + "time_per_iteration": 2.5041439533233643 + }, + { + "auxiliary_loss_clip": 0.01177454, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.05125856, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 7.120670718523448, + "language_loss": 0.67616034, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6984657, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1548, + "time_per_iteration": 2.513141393661499 + }, + { + "auxiliary_loss_clip": 0.01178735, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.05175209, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.761700755369037, + "language_loss": 0.83445251, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85676992, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.265625, + "step": 1549, + "time_per_iteration": 2.4822285175323486 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.05560291, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7822943519837542, + "language_loss": 0.75744081, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77969635, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2578125, + "step": 1550, + "time_per_iteration": 2.5503265857696533 + }, + { + "auxiliary_loss_clip": 0.01179426, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.04579496, + "balance_loss_mlp": 1.05118561, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.0184762942100876, + "language_loss": 0.83272278, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85520893, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.28125, + "step": 1551, + "time_per_iteration": 2.4962081909179688 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_clip": 1.0051949, + "balance_loss_mlp": 1.01350796, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7800746873014213, + "language_loss": 0.6182366, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6389209, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.47070312, + "step": 1552, + "time_per_iteration": 3.2178378105163574 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01003668, + "balance_loss_clip": 1.00099754, + "balance_loss_mlp": 1.01257896, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8330449834122059, + "language_loss": 0.5895977, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61022902, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.46875, + "step": 1553, + "time_per_iteration": 3.220923900604248 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.05040002, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.0753391269624797, + "language_loss": 0.87452686, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89689714, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.28125, + "step": 1554, + "time_per_iteration": 2.5448763370513916 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.0310595, + "balance_loss_mlp": 1.05265594, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.2438919833216913, + "language_loss": 0.81355709, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83583468, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1555, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.00581956, + "balance_loss_mlp": 1.01259685, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8701907042199977, + "language_loss": 0.59583747, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61651003, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4609375, + "step": 1556, + "time_per_iteration": 3.0923824310302734 + }, + { + "auxiliary_loss_clip": 0.01177126, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.03518105, + "balance_loss_mlp": 1.05278862, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5891177576034032, + "language_loss": 0.84455961, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86689359, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1557, + "time_per_iteration": 2.5973968505859375 + }, + { + "auxiliary_loss_clip": 0.01175988, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03799307, + "balance_loss_mlp": 1.05065048, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.787574567308206, + "language_loss": 0.77987397, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80224895, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.25, + "step": 1558, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01178258, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.05035424, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0310113035260873, + "language_loss": 0.7998119, + "learning_rate": 3.957544040455379e-06, + "loss": 0.822142, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1559, + "time_per_iteration": 5.3233802318573 + }, + { + "auxiliary_loss_clip": 0.01172855, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.05015147, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.9877315441152976, + "language_loss": 0.76720232, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78956437, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1560, + "time_per_iteration": 3.863935947418213 + }, + { + "auxiliary_loss_clip": 0.01180546, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.05101645, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6628394684514, + "language_loss": 0.81219828, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83460152, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1561, + "time_per_iteration": 2.5050160884857178 + }, + { + "auxiliary_loss_clip": 0.01175131, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.02641547, + "balance_loss_mlp": 1.04764926, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.307547697406205, + "language_loss": 0.61553764, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63777232, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1562, + "time_per_iteration": 2.5884838104248047 + }, + { + "auxiliary_loss_clip": 0.01177686, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.0552876, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.5948914783661468, + "language_loss": 0.84981585, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87219155, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1563, + "time_per_iteration": 2.427928924560547 + }, + { + "auxiliary_loss_clip": 0.01172512, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.02767134, + "balance_loss_mlp": 1.05013323, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.8141046481233785, + "language_loss": 0.76106739, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78327298, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.21875, + "step": 1564, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01055133, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.05290008, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0134268414891388, + "language_loss": 0.7971766, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81950086, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.25, + "step": 1565, + "time_per_iteration": 2.470870018005371 + }, + { + "auxiliary_loss_clip": 0.01175133, + "auxiliary_loss_mlp": 0.01069432, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.0497129, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8353632925340597, + "language_loss": 0.75241816, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77486378, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1566, + "time_per_iteration": 2.4962053298950195 + }, + { + "auxiliary_loss_clip": 0.0117411, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.03746092, + "balance_loss_mlp": 1.04822683, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.55149440594841, + "language_loss": 0.77724433, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79957557, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1567, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01178494, + "auxiliary_loss_mlp": 0.01054706, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.05183101, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.293964487000622, + "language_loss": 0.82571244, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8480444, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.265625, + "step": 1568, + "time_per_iteration": 2.5221774578094482 + }, + { + "auxiliary_loss_clip": 0.01179838, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.02527881, + "balance_loss_mlp": 1.05191278, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 4.3822924949764515, + "language_loss": 0.7658236, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78810549, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.28125, + "step": 1569, + "time_per_iteration": 2.464019775390625 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.03004718, + "balance_loss_mlp": 1.04984534, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.11394347406088, + "language_loss": 0.86315012, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88538271, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1570, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.01177967, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.05340183, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6480791038221163, + "language_loss": 0.76531005, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78758156, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1571, + "time_per_iteration": 2.5270462036132812 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.02848995, + "balance_loss_mlp": 1.0496099, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.780883866775424, + "language_loss": 0.79518712, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81737661, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1572, + "time_per_iteration": 2.477403163909912 + }, + { + "auxiliary_loss_clip": 0.01172702, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03288805, + "balance_loss_mlp": 1.05036175, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8458711299535766, + "language_loss": 0.87948155, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90174723, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1573, + "time_per_iteration": 2.5164122581481934 + }, + { + "auxiliary_loss_clip": 0.01177194, + "auxiliary_loss_mlp": 0.01059795, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.05045378, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.770313323609274, + "language_loss": 0.81827116, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84064102, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.265625, + "step": 1574, + "time_per_iteration": 2.5540831089019775 + }, + { + "auxiliary_loss_clip": 0.01178056, + "auxiliary_loss_mlp": 0.01052269, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.05359375, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.139236970889498, + "language_loss": 0.80922085, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83152413, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1575, + "time_per_iteration": 2.4874608516693115 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01063693, + "balance_loss_clip": 1.04184198, + "balance_loss_mlp": 1.05048943, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.1107661515601, + "language_loss": 0.86745369, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88981628, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1576, + "time_per_iteration": 2.514961004257202 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.01272786, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9836929902555142, + "language_loss": 0.65832257, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67916429, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.45703125, + "step": 1577, + "time_per_iteration": 3.042998790740967 + }, + { + "auxiliary_loss_clip": 0.01175806, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.02504635, + "balance_loss_mlp": 1.05083144, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 3.158821122445177, + "language_loss": 0.79113019, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81334484, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1578, + "time_per_iteration": 2.492605447769165 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.04935408, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6941125689582233, + "language_loss": 0.77994359, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80223954, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1579, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01176838, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.0317533, + "balance_loss_mlp": 1.05228639, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.3012950697800747, + "language_loss": 0.73576474, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75807726, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2421875, + "step": 1580, + "time_per_iteration": 2.500426769256592 + }, + { + "auxiliary_loss_clip": 0.01171524, + "auxiliary_loss_mlp": 0.01053034, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 1.05162525, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.820694860574998, + "language_loss": 0.77813822, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80038381, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1581, + "time_per_iteration": 2.569086790084839 + }, + { + "auxiliary_loss_clip": 0.01177083, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.05315304, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.1718701740895443, + "language_loss": 0.86914808, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89150703, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.234375, + "step": 1582, + "time_per_iteration": 2.476386785507202 + }, + { + "auxiliary_loss_clip": 0.01178411, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.05487967, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.7496793522695477, + "language_loss": 0.66838771, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6907438, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.234375, + "step": 1583, + "time_per_iteration": 2.4433302879333496 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01052141, + "balance_loss_clip": 1.02919281, + "balance_loss_mlp": 1.0555923, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8272679383640855, + "language_loss": 0.70314872, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7254771, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.25, + "step": 1584, + "time_per_iteration": 2.5352938175201416 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.0527246, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5429491827095454, + "language_loss": 0.80649364, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82881135, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2421875, + "step": 1585, + "time_per_iteration": 2.5006139278411865 + }, + { + "auxiliary_loss_clip": 0.01179471, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.05324364, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.5763794615860258, + "language_loss": 0.7156626, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73802292, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.265625, + "step": 1586, + "time_per_iteration": 2.510941982269287 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.010121, + "balance_loss_clip": 1.00946522, + "balance_loss_mlp": 1.01272035, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8128409972345002, + "language_loss": 0.55392706, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57462931, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.453125, + "step": 1587, + "time_per_iteration": 2.8747992515563965 + }, + { + "auxiliary_loss_clip": 0.0118109, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.0345006, + "balance_loss_mlp": 1.0550952, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.813611272618652, + "language_loss": 0.81023234, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83260405, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1588, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.01178114, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.05471849, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 2.1843830695972835, + "language_loss": 0.81552076, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83785045, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1589, + "time_per_iteration": 2.4995651245117188 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.05340207, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.4497838373443381, + "language_loss": 0.65005404, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1590, + "time_per_iteration": 2.7222375869750977 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01051921, + "balance_loss_clip": 1.03036785, + "balance_loss_mlp": 1.05178595, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.714913693600035, + "language_loss": 0.83272862, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85498345, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1591, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.05280709, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.268244689889179, + "language_loss": 0.74068749, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76304293, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.265625, + "step": 1592, + "time_per_iteration": 2.446272373199463 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.0282129, + "balance_loss_mlp": 1.05028248, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.9287746031752921, + "language_loss": 0.74135411, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1593, + "time_per_iteration": 2.518038749694824 + }, + { + "auxiliary_loss_clip": 0.01175652, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.03061128, + "balance_loss_mlp": 1.05365515, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8251705146793997, + "language_loss": 0.69907188, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72134066, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.21875, + "step": 1594, + "time_per_iteration": 2.5454983711242676 + }, + { + "auxiliary_loss_clip": 0.01174594, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.03062999, + "balance_loss_mlp": 1.05023921, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.596137828422853, + "language_loss": 0.82464099, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84689802, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1595, + "time_per_iteration": 2.472062826156616 + }, + { + "auxiliary_loss_clip": 0.01176658, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.02803886, + "balance_loss_mlp": 1.05217803, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.0311987750358953, + "language_loss": 0.84673214, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86900425, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2421875, + "step": 1596, + "time_per_iteration": 2.4801599979400635 + }, + { + "auxiliary_loss_clip": 0.01181467, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.02871156, + "balance_loss_mlp": 1.05628884, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.531539932785817, + "language_loss": 0.68993127, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71225667, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1597, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01175632, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.02370429, + "balance_loss_mlp": 1.04902959, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.18946094151333, + "language_loss": 0.74929029, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77149749, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1598, + "time_per_iteration": 2.474071502685547 + }, + { + "auxiliary_loss_clip": 0.01179079, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.05284083, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6350676424235815, + "language_loss": 0.69002283, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7122978, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1599, + "time_per_iteration": 2.5599992275238037 + }, + { + "auxiliary_loss_clip": 0.01174972, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.05169392, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.013538613147854, + "language_loss": 0.840271, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8625865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1600, + "time_per_iteration": 2.4882116317749023 + }, + { + "auxiliary_loss_clip": 0.01174537, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.03162694, + "balance_loss_mlp": 1.05098653, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.038904015519863, + "language_loss": 0.8034178, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82569081, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.234375, + "step": 1601, + "time_per_iteration": 5.328745365142822 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.03031266, + "balance_loss_mlp": 1.05090928, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.183236390866488, + "language_loss": 0.82405198, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84635913, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.2734375, + "step": 1602, + "time_per_iteration": 2.4609556198120117 + }, + { + "auxiliary_loss_clip": 0.01172805, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05170703, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.882331764966583, + "language_loss": 0.62527591, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64752185, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1603, + "time_per_iteration": 2.4974379539489746 + }, + { + "auxiliary_loss_clip": 0.01178105, + "auxiliary_loss_mlp": 0.01054363, + "balance_loss_clip": 1.03049707, + "balance_loss_mlp": 1.05224609, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.347327571135852, + "language_loss": 0.71259016, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73491484, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2578125, + "step": 1604, + "time_per_iteration": 2.5012693405151367 + }, + { + "auxiliary_loss_clip": 0.01172586, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.0368669, + "balance_loss_mlp": 1.05051208, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.129697971326249, + "language_loss": 0.79487669, + "learning_rate": 3.953793790294527e-06, + "loss": 0.8171708, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.21875, + "step": 1605, + "time_per_iteration": 2.5392873287200928 + }, + { + "auxiliary_loss_clip": 0.01176232, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.04916394, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 3.698123586343809, + "language_loss": 0.74810207, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77030694, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2734375, + "step": 1606, + "time_per_iteration": 2.4922726154327393 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02391386, + "balance_loss_mlp": 1.05243278, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.649703340967918, + "language_loss": 0.75382137, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77603066, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.234375, + "step": 1607, + "time_per_iteration": 2.4787087440490723 + }, + { + "auxiliary_loss_clip": 0.0117289, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.02206647, + "balance_loss_mlp": 1.04831934, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.262571531890369, + "language_loss": 0.86648059, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88863426, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.25, + "step": 1608, + "time_per_iteration": 2.435391664505005 + }, + { + "auxiliary_loss_clip": 0.01183391, + "auxiliary_loss_mlp": 0.01056654, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.05276418, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.2277980990408297, + "language_loss": 0.70968121, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73208165, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.3046875, + "step": 1609, + "time_per_iteration": 2.599719762802124 + }, + { + "auxiliary_loss_clip": 0.01173214, + "auxiliary_loss_mlp": 0.01054271, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.04860282, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.7787270736621674, + "language_loss": 0.84566712, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86794198, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1610, + "time_per_iteration": 2.446676254272461 + }, + { + "auxiliary_loss_clip": 0.01177531, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.02781224, + "balance_loss_mlp": 1.05382621, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.0483419743874682, + "language_loss": 0.67360532, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69587982, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1611, + "time_per_iteration": 2.520211696624756 + }, + { + "auxiliary_loss_clip": 0.01177545, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.03000879, + "balance_loss_mlp": 1.05313492, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6625909003061596, + "language_loss": 0.81166416, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83394641, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2421875, + "step": 1612, + "time_per_iteration": 2.449491262435913 + }, + { + "auxiliary_loss_clip": 0.01180036, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.05431938, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.509420249413084, + "language_loss": 0.80708754, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82950538, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1613, + "time_per_iteration": 2.4753763675689697 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.03019738, + "balance_loss_mlp": 1.05074048, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.0025313344872484, + "language_loss": 0.84173608, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86399966, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2265625, + "step": 1614, + "time_per_iteration": 2.5492141246795654 + }, + { + "auxiliary_loss_clip": 0.01065917, + "auxiliary_loss_mlp": 0.010187, + "balance_loss_clip": 1.01610088, + "balance_loss_mlp": 1.019063, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7078098108364695, + "language_loss": 0.54584575, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56669194, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.46875, + "step": 1615, + "time_per_iteration": 3.1041057109832764 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00491357, + "balance_loss_mlp": 1.01844954, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637649269659756, + "language_loss": 0.5822649, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60299873, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.47460938, + "step": 1616, + "time_per_iteration": 3.215376377105713 + }, + { + "auxiliary_loss_clip": 0.01178513, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.05275226, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.690325520565165, + "language_loss": 0.69293094, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71527421, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2578125, + "step": 1617, + "time_per_iteration": 2.458017587661743 + }, + { + "auxiliary_loss_clip": 0.01176727, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.03116739, + "balance_loss_mlp": 1.05130577, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.7927692696889819, + "language_loss": 0.80748308, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8298068, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.25, + "step": 1618, + "time_per_iteration": 2.5471577644348145 + }, + { + "auxiliary_loss_clip": 0.01169902, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03077149, + "balance_loss_mlp": 1.04996848, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.5831304278494804, + "language_loss": 0.9288674, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9510712, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1953125, + "step": 1619, + "time_per_iteration": 2.515282392501831 + }, + { + "auxiliary_loss_clip": 0.01171299, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.03150594, + "balance_loss_mlp": 1.05216622, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.7974961209450113, + "language_loss": 0.88785303, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.910092, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1620, + "time_per_iteration": 2.556744337081909 + }, + { + "auxiliary_loss_clip": 0.01175309, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.05045033, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.90931759761679, + "language_loss": 0.77130795, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79362905, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.25, + "step": 1621, + "time_per_iteration": 2.491441011428833 + }, + { + "auxiliary_loss_clip": 0.01171563, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.04859447, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9170880538391684, + "language_loss": 0.77856946, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80084509, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2265625, + "step": 1622, + "time_per_iteration": 2.4379701614379883 + }, + { + "auxiliary_loss_clip": 0.01177415, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.05105746, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.9420709042223125, + "language_loss": 0.85783195, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88017344, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1623, + "time_per_iteration": 2.51741099357605 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.03498316, + "balance_loss_mlp": 1.05181813, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2833168401589656, + "language_loss": 0.80328369, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8255735, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1624, + "time_per_iteration": 2.4646618366241455 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.02450192, + "balance_loss_mlp": 1.04799926, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.7577002662180954, + "language_loss": 0.8575626, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87971735, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.234375, + "step": 1625, + "time_per_iteration": 2.452615976333618 + }, + { + "auxiliary_loss_clip": 0.01176501, + "auxiliary_loss_mlp": 0.01055325, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.05226421, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 3.258883448957912, + "language_loss": 0.8539601, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87627834, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1626, + "time_per_iteration": 2.4931013584136963 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.05541551, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.979888643217431, + "language_loss": 0.83329904, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85568601, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2578125, + "step": 1627, + "time_per_iteration": 2.5056917667388916 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.03065729, + "balance_loss_mlp": 1.0488416, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7873285490487296, + "language_loss": 0.84291327, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.2109375, + "step": 1628, + "time_per_iteration": 2.4835076332092285 + }, + { + "auxiliary_loss_clip": 0.01169153, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.04880238, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.6092149858605884, + "language_loss": 0.75609362, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77831334, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1629, + "time_per_iteration": 2.4959983825683594 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.02412319, + "balance_loss_mlp": 1.0530107, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.5982247062153871, + "language_loss": 0.78224194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1630, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.01177321, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0292381, + "balance_loss_mlp": 1.05457997, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.9134334701620013, + "language_loss": 0.86704385, + "learning_rate": 3.951604717916228e-06, + "loss": 0.8893311, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1631, + "time_per_iteration": 2.443878173828125 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01050929, + "balance_loss_clip": 1.03065109, + "balance_loss_mlp": 1.05258322, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.096430969489036, + "language_loss": 0.83111286, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85334921, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1632, + "time_per_iteration": 2.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01174956, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0368464, + "balance_loss_mlp": 1.05281615, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.5107232822128822, + "language_loss": 0.7877655, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81008065, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.21875, + "step": 1633, + "time_per_iteration": 2.447930097579956 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.02833819, + "balance_loss_mlp": 1.04989707, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.0663591821232865, + "language_loss": 0.73159611, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75378191, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1634, + "time_per_iteration": 2.460265636444092 + }, + { + "auxiliary_loss_clip": 0.01179893, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.0516957, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7516342600991868, + "language_loss": 0.72714394, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74957043, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1635, + "time_per_iteration": 2.4835710525512695 + }, + { + "auxiliary_loss_clip": 0.01177592, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.05253148, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8458745824258636, + "language_loss": 0.7819975, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80432636, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.25, + "step": 1636, + "time_per_iteration": 2.53061842918396 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.05113721, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.2976115041381386, + "language_loss": 0.70005965, + "learning_rate": 3.951092440828715e-06, + "loss": 0.722363, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1637, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.03175139, + "balance_loss_mlp": 1.05108416, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.115587702667026, + "language_loss": 0.77395654, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79622668, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2265625, + "step": 1638, + "time_per_iteration": 2.4725139141082764 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.02524579, + "balance_loss_mlp": 1.05077171, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4162008179950134, + "language_loss": 0.7263118, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74847507, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1953125, + "step": 1639, + "time_per_iteration": 2.5534512996673584 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.01943696, + "balance_loss_mlp": 1.05003214, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8280373897837945, + "language_loss": 0.88669002, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90882927, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1640, + "time_per_iteration": 2.4868786334991455 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01685774, + "balance_loss_mlp": 1.05164635, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.1859335509376527, + "language_loss": 0.8086108, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83072555, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1641, + "time_per_iteration": 2.5081584453582764 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03130805, + "balance_loss_mlp": 1.05067503, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.4983515693134417, + "language_loss": 0.85826755, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88054669, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1642, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01177694, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.05365527, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7421144196917664, + "language_loss": 0.80859929, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83091342, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1643, + "time_per_iteration": 3.9550716876983643 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01063125, + "balance_loss_clip": 1.04138088, + "balance_loss_mlp": 1.0494256, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9624417465121429, + "language_loss": 0.8262763, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84861231, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1644, + "time_per_iteration": 3.8253817558288574 + }, + { + "auxiliary_loss_clip": 0.01169448, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.02733469, + "balance_loss_mlp": 1.05048347, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7099323885745632, + "language_loss": 0.6819675, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70414758, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1645, + "time_per_iteration": 2.4549567699432373 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.0206517, + "balance_loss_mlp": 1.01924491, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9514884974425206, + "language_loss": 0.60854232, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62943053, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.46289062, + "step": 1646, + "time_per_iteration": 2.9953765869140625 + }, + { + "auxiliary_loss_clip": 0.01170253, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04880357, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.5496486678231425, + "language_loss": 0.73046064, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75266314, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2109375, + "step": 1647, + "time_per_iteration": 2.5241641998291016 + }, + { + "auxiliary_loss_clip": 0.01171762, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.04955053, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8237647662791463, + "language_loss": 0.84120429, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86348635, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.21875, + "step": 1648, + "time_per_iteration": 2.467717170715332 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01009658, + "balance_loss_clip": 1.00701165, + "balance_loss_mlp": 1.0159142, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7437092318732932, + "language_loss": 0.55674303, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57745123, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.453125, + "step": 1649, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01165781, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.02598572, + "balance_loss_mlp": 1.04597533, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.284847215884091, + "language_loss": 0.89930248, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92142689, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1650, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00105858, + "balance_loss_mlp": 1.01395106, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8031298543824162, + "language_loss": 0.63733649, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65795547, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.44140625, + "step": 1651, + "time_per_iteration": 3.217806100845337 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03318655, + "balance_loss_mlp": 1.04885435, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9462006377707899, + "language_loss": 0.88288587, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90512443, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1652, + "time_per_iteration": 2.5014448165893555 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01057611, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.05190849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9500387106757973, + "language_loss": 0.82206833, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84438825, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2265625, + "step": 1653, + "time_per_iteration": 2.4881839752197266 + }, + { + "auxiliary_loss_clip": 0.01172582, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.04984093, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0314065071494136, + "language_loss": 0.79399735, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81626815, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2265625, + "step": 1654, + "time_per_iteration": 2.5269205570220947 + }, + { + "auxiliary_loss_clip": 0.01167439, + "auxiliary_loss_mlp": 0.01055854, + "balance_loss_clip": 1.03700721, + "balance_loss_mlp": 1.05072093, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5637423809135174, + "language_loss": 0.8088094, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83104229, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.171875, + "step": 1655, + "time_per_iteration": 2.4652602672576904 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.04891443, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9082198159511756, + "language_loss": 0.80947387, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83170521, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1656, + "time_per_iteration": 2.4966416358947754 + }, + { + "auxiliary_loss_clip": 0.01170477, + "auxiliary_loss_mlp": 0.0106116, + "balance_loss_clip": 1.04066813, + "balance_loss_mlp": 1.05147541, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6268850155063674, + "language_loss": 0.88850212, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91081852, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1875, + "step": 1657, + "time_per_iteration": 2.446124792098999 + }, + { + "auxiliary_loss_clip": 0.01175951, + "auxiliary_loss_mlp": 0.01063236, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.05091214, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0057694643168302, + "language_loss": 0.84758937, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86998123, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1658, + "time_per_iteration": 2.457902669906616 + }, + { + "auxiliary_loss_clip": 0.01054631, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_clip": 1.07460773, + "balance_loss_mlp": 1.0110395, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9153195332104517, + "language_loss": 0.60843968, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62975848, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1659, + "time_per_iteration": 3.077805519104004 + }, + { + "auxiliary_loss_clip": 0.01170517, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.04999721, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8691655756599186, + "language_loss": 0.85116851, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87340325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2109375, + "step": 1660, + "time_per_iteration": 2.49082612991333 + }, + { + "auxiliary_loss_clip": 0.01171003, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.03393948, + "balance_loss_mlp": 1.05291247, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.130922035700174, + "language_loss": 0.80037123, + "learning_rate": 3.949016704705836e-06, + "loss": 0.8226431, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1796875, + "step": 1661, + "time_per_iteration": 2.4412636756896973 + }, + { + "auxiliary_loss_clip": 0.01175671, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.05002224, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.8939661728963775, + "language_loss": 0.83592767, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85818553, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2578125, + "step": 1662, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01171098, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.02972281, + "balance_loss_mlp": 1.05104828, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1063962968477, + "language_loss": 0.88696563, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90920055, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1663, + "time_per_iteration": 2.42790150642395 + }, + { + "auxiliary_loss_clip": 0.01174901, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.05225635, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6888490247303796, + "language_loss": 0.7034179, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72569644, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1664, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01173831, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.02903676, + "balance_loss_mlp": 1.0535655, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.1773983349048804, + "language_loss": 0.7878316, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81007671, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1665, + "time_per_iteration": 2.4271252155303955 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.01061559, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.05681181, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.663243771388797, + "language_loss": 0.70152062, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72392094, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.21875, + "step": 1666, + "time_per_iteration": 2.499131202697754 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03777063, + "balance_loss_mlp": 1.0506525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8121915129470096, + "language_loss": 0.791031, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8133781, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.234375, + "step": 1667, + "time_per_iteration": 2.4429264068603516 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.02810836, + "balance_loss_mlp": 1.05261493, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.9507555712476945, + "language_loss": 0.7715596, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79379785, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.203125, + "step": 1668, + "time_per_iteration": 2.5223031044006348 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.05256963, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.9809152554972944, + "language_loss": 0.77852714, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80083561, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2109375, + "step": 1669, + "time_per_iteration": 2.5082881450653076 + }, + { + "auxiliary_loss_clip": 0.01181618, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.03641593, + "balance_loss_mlp": 1.05464602, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.145889566444559, + "language_loss": 0.85461181, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87702769, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.2734375, + "step": 1670, + "time_per_iteration": 2.5235135555267334 + }, + { + "auxiliary_loss_clip": 0.01166248, + "auxiliary_loss_mlp": 0.01057789, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0501771, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5986093935623644, + "language_loss": 0.76899171, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79123211, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1671, + "time_per_iteration": 2.505441665649414 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.01598763, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7900846916321359, + "language_loss": 0.60719293, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62802076, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.43945312, + "step": 1672, + "time_per_iteration": 3.07255482673645 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.05045998, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.0407855091156377, + "language_loss": 0.77119517, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.234375, + "step": 1673, + "time_per_iteration": 2.4693222045898438 + }, + { + "auxiliary_loss_clip": 0.01171478, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.03066778, + "balance_loss_mlp": 1.04964709, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.2570599367002835, + "language_loss": 0.72829556, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75053144, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1674, + "time_per_iteration": 2.4534130096435547 + }, + { + "auxiliary_loss_clip": 0.01170516, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03047729, + "balance_loss_mlp": 1.04903197, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.043409325490185, + "language_loss": 0.79386973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81608635, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1675, + "time_per_iteration": 2.496504545211792 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_clip": 1.04449606, + "balance_loss_mlp": 1.04908013, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.0305638084579294, + "language_loss": 0.81565315, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1676, + "time_per_iteration": 2.5022919178009033 + }, + { + "auxiliary_loss_clip": 0.01174395, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.03713369, + "balance_loss_mlp": 1.05283856, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.134524944411931, + "language_loss": 0.86155027, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88388026, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2109375, + "step": 1677, + "time_per_iteration": 2.44887113571167 + }, + { + "auxiliary_loss_clip": 0.01171962, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.03452563, + "balance_loss_mlp": 1.05113602, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 5.349815535910457, + "language_loss": 0.86318195, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88545489, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2109375, + "step": 1678, + "time_per_iteration": 2.4373903274536133 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.03359675, + "balance_loss_mlp": 1.05214512, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.6897314721028867, + "language_loss": 0.89726269, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91953766, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1679, + "time_per_iteration": 2.493959903717041 + }, + { + "auxiliary_loss_clip": 0.01056795, + "auxiliary_loss_mlp": 0.01017317, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01327634, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7831657514235874, + "language_loss": 0.53018153, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55092263, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1680, + "time_per_iteration": 3.15899658203125 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.04983318, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.657625192327098, + "language_loss": 0.76889706, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79113436, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1681, + "time_per_iteration": 2.446937322616577 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.03423131, + "balance_loss_mlp": 1.04937744, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.135292201068385, + "language_loss": 0.93928307, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96162128, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.2578125, + "step": 1682, + "time_per_iteration": 2.4357759952545166 + }, + { + "auxiliary_loss_clip": 0.01172101, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.03315091, + "balance_loss_mlp": 1.05045152, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 5.112669241194533, + "language_loss": 0.87866408, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90092492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1683, + "time_per_iteration": 2.427802562713623 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.03888798, + "balance_loss_mlp": 1.05144525, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7718228637860187, + "language_loss": 0.74768114, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76997328, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1796875, + "step": 1684, + "time_per_iteration": 5.332470417022705 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01011499, + "balance_loss_clip": 1.00863802, + "balance_loss_mlp": 1.01624751, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.760003339390084, + "language_loss": 0.61090153, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6316117, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.43359375, + "step": 1685, + "time_per_iteration": 4.508171081542969 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.02769828, + "balance_loss_mlp": 1.04891801, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.3224629698824075, + "language_loss": 0.61664945, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63883317, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1686, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01173787, + "auxiliary_loss_mlp": 0.01054454, + "balance_loss_clip": 1.03238797, + "balance_loss_mlp": 1.0545882, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1992592502117443, + "language_loss": 0.81408226, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83636469, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1687, + "time_per_iteration": 2.5495810508728027 + }, + { + "auxiliary_loss_clip": 0.01173812, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.03055501, + "balance_loss_mlp": 1.0514555, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.783489688966995, + "language_loss": 0.72360015, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74585676, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1688, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.05043888, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9893327907397977, + "language_loss": 0.86880058, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8910439, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1875, + "step": 1689, + "time_per_iteration": 2.5283408164978027 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.04692245, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8972643802531153, + "language_loss": 0.88054395, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90265882, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1690, + "time_per_iteration": 2.5732247829437256 + }, + { + "auxiliary_loss_clip": 0.01170509, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.02961624, + "balance_loss_mlp": 1.04965854, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.8841763324380914, + "language_loss": 0.83124495, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85346603, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.203125, + "step": 1691, + "time_per_iteration": 2.453263282775879 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.028579, + "balance_loss_mlp": 1.05049825, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.648035623213742, + "language_loss": 0.66938514, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69161713, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1692, + "time_per_iteration": 2.5865867137908936 + }, + { + "auxiliary_loss_clip": 0.01167535, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_clip": 1.04540372, + "balance_loss_mlp": 1.0471102, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.649284734670808, + "language_loss": 0.75387824, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77622634, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1693, + "time_per_iteration": 2.499476194381714 + }, + { + "auxiliary_loss_clip": 0.01171507, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02685595, + "balance_loss_mlp": 1.04984784, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6930931596653784, + "language_loss": 0.87206519, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89427543, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1694, + "time_per_iteration": 2.483264923095703 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01052768, + "balance_loss_clip": 1.03013015, + "balance_loss_mlp": 1.05056214, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 3.1999162319303274, + "language_loss": 0.79579329, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81809288, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1695, + "time_per_iteration": 2.4574177265167236 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.04648614, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7546035908378184, + "language_loss": 0.86581397, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88805294, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1696, + "time_per_iteration": 2.4986772537231445 + }, + { + "auxiliary_loss_clip": 0.01168623, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.04927731, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.97196247739744, + "language_loss": 0.82034266, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84259629, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1953125, + "step": 1697, + "time_per_iteration": 2.483682155609131 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.0477041, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 1.9483747561194416, + "language_loss": 0.80650747, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82870358, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2265625, + "step": 1698, + "time_per_iteration": 2.4512858390808105 + }, + { + "auxiliary_loss_clip": 0.01172882, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.05113077, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 4.641294823605382, + "language_loss": 0.75680709, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77902329, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1699, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.01171952, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.02606726, + "balance_loss_mlp": 1.05093145, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.7623204527071121, + "language_loss": 0.79777479, + "learning_rate": 3.945552859553516e-06, + "loss": 0.81997555, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 1700, + "time_per_iteration": 2.4692423343658447 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.02411532, + "balance_loss_mlp": 1.04850125, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8827887870563835, + "language_loss": 0.76854098, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79070842, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1701, + "time_per_iteration": 2.5015852451324463 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 1.05213511, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1180628790190927, + "language_loss": 0.78123891, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80349147, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2421875, + "step": 1702, + "time_per_iteration": 2.4999852180480957 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01046251, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.0487566, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.3091523831758765, + "language_loss": 0.94838184, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97052652, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1953125, + "step": 1703, + "time_per_iteration": 2.4586100578308105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.01077867, + "balance_loss_mlp": 1.01462317, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8800585598511617, + "language_loss": 0.55092424, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57163775, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43554688, + "step": 1704, + "time_per_iteration": 2.998384952545166 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.02790844, + "balance_loss_mlp": 1.04962945, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 3.5257555777633174, + "language_loss": 0.83979154, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86200017, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2109375, + "step": 1705, + "time_per_iteration": 2.4242281913757324 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.01514411, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7733309182053202, + "language_loss": 0.60434854, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62497854, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.4296875, + "step": 1706, + "time_per_iteration": 3.127495765686035 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 1.05214357, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.0444921886168284, + "language_loss": 0.85967243, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.234375, + "step": 1707, + "time_per_iteration": 2.4486777782440186 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02654099, + "balance_loss_mlp": 1.04891372, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.6368034329364625, + "language_loss": 0.72840983, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75057685, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.2109375, + "step": 1708, + "time_per_iteration": 2.5019850730895996 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01054619, + "balance_loss_clip": 1.0325532, + "balance_loss_mlp": 1.0493356, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.9016884094819633, + "language_loss": 0.90944314, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93167639, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1953125, + "step": 1709, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01173643, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03357422, + "balance_loss_mlp": 1.05296373, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 3.826538703219267, + "language_loss": 0.8828221, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90510881, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1710, + "time_per_iteration": 2.533165216445923 + }, + { + "auxiliary_loss_clip": 0.01167248, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.04937959, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.824520485293549, + "language_loss": 0.79264998, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485879, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 1711, + "time_per_iteration": 2.4947102069854736 + }, + { + "auxiliary_loss_clip": 0.01171963, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.05005431, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.0689984646996016, + "language_loss": 0.73589319, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7581948, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1712, + "time_per_iteration": 2.521899461746216 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.04961872, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8460865361447714, + "language_loss": 0.86673403, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8889854, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1640625, + "step": 1713, + "time_per_iteration": 2.467824935913086 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01059926, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.04741335, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.6691144860495126, + "language_loss": 0.72610664, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74837124, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1714, + "time_per_iteration": 2.478605031967163 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03542209, + "balance_loss_mlp": 1.04920006, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.3323118637090605, + "language_loss": 0.91395295, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93626636, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2421875, + "step": 1715, + "time_per_iteration": 2.5223729610443115 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04737377, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.9302110224144968, + "language_loss": 0.75736755, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77957708, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1716, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.01171415, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.04601645, + "balance_loss_mlp": 1.04868793, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1161503252482747, + "language_loss": 0.85214567, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87454176, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1717, + "time_per_iteration": 2.500964879989624 + }, + { + "auxiliary_loss_clip": 0.01169937, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.05102515, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.0308520014155746, + "language_loss": 0.82883167, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85109091, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1875, + "step": 1718, + "time_per_iteration": 2.436836004257202 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.03058898, + "balance_loss_mlp": 1.05092025, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8725763890619624, + "language_loss": 0.73192763, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75414634, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1719, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01172065, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05197001, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.356604748076592, + "language_loss": 0.92601806, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94820189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.203125, + "step": 1720, + "time_per_iteration": 2.4628992080688477 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.04656935, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.8075298743139174, + "language_loss": 0.79416633, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81638062, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2109375, + "step": 1721, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01056525, + "balance_loss_clip": 1.03317165, + "balance_loss_mlp": 1.05172479, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.496468299898097, + "language_loss": 0.80755401, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82988858, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.25, + "step": 1722, + "time_per_iteration": 2.4676520824432373 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.04772782, + "balance_loss_mlp": 1.013726, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9564367479099696, + "language_loss": 0.67185652, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69292337, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.4296875, + "step": 1723, + "time_per_iteration": 2.8474721908569336 + }, + { + "auxiliary_loss_clip": 0.01170693, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.04747462, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.780632359822339, + "language_loss": 0.77922273, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1724, + "time_per_iteration": 2.4311840534210205 + }, + { + "auxiliary_loss_clip": 0.01175556, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.05101144, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8180629527722856, + "language_loss": 0.74894094, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77122545, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1725, + "time_per_iteration": 2.6802284717559814 + }, + { + "auxiliary_loss_clip": 0.01170353, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.02852905, + "balance_loss_mlp": 1.05098462, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.4392097975248244, + "language_loss": 0.75290418, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77510113, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1726, + "time_per_iteration": 5.461729049682617 + }, + { + "auxiliary_loss_clip": 0.01174745, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03765321, + "balance_loss_mlp": 1.0527426, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8824890959349092, + "language_loss": 0.73943913, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76178271, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1727, + "time_per_iteration": 3.883134126663208 + }, + { + "auxiliary_loss_clip": 0.01169505, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03023946, + "balance_loss_mlp": 1.04815936, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.187385195417556, + "language_loss": 0.84670323, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86891311, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1728, + "time_per_iteration": 2.4405598640441895 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.02980709, + "balance_loss_mlp": 1.05098438, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4528097766615677, + "language_loss": 0.70985407, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73207992, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1729, + "time_per_iteration": 2.465688467025757 + }, + { + "auxiliary_loss_clip": 0.01170997, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.05000722, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5788681057232625, + "language_loss": 0.81288344, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.8351925, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1730, + "time_per_iteration": 2.4582717418670654 + }, + { + "auxiliary_loss_clip": 0.01167657, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.03190255, + "balance_loss_mlp": 1.04836845, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 2.1021084439253723, + "language_loss": 0.75932384, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78151548, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1953125, + "step": 1731, + "time_per_iteration": 2.4650096893310547 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.04899907, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8082651510271561, + "language_loss": 0.82679468, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84891117, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1732, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01169252, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.02881873, + "balance_loss_mlp": 1.05052853, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.755876599624297, + "language_loss": 0.82947195, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85164732, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1733, + "time_per_iteration": 2.4426257610321045 + }, + { + "auxiliary_loss_clip": 0.01171007, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.04982805, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4971959439308336, + "language_loss": 0.76446331, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78669679, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.2109375, + "step": 1734, + "time_per_iteration": 2.4556663036346436 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.02795696, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9861442095390862, + "language_loss": 0.74962163, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1796875, + "step": 1735, + "time_per_iteration": 2.4961798191070557 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.02724743, + "balance_loss_mlp": 1.05081487, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9829662552727403, + "language_loss": 0.79049939, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8127073, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1736, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.0491184, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8105684861006923, + "language_loss": 0.70339012, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72563159, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.171875, + "step": 1737, + "time_per_iteration": 2.4789419174194336 + }, + { + "auxiliary_loss_clip": 0.01170601, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.02758932, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.004598680960266, + "language_loss": 0.81483257, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83704984, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.234375, + "step": 1738, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.0116919, + "auxiliary_loss_mlp": 0.01058357, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.04712963, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 4.442978598454381, + "language_loss": 0.750579, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77285445, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1739, + "time_per_iteration": 2.4544031620025635 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.0349865, + "balance_loss_mlp": 1.04893625, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.676051388115223, + "language_loss": 0.77279431, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79503429, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1740, + "time_per_iteration": 2.489302635192871 + }, + { + "auxiliary_loss_clip": 0.01169756, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.02820003, + "balance_loss_mlp": 1.05093944, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.1911967502326775, + "language_loss": 0.85983682, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88201964, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1741, + "time_per_iteration": 2.4571211338043213 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03091609, + "balance_loss_mlp": 1.04901385, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 4.086245960730198, + "language_loss": 0.74991679, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77216244, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1742, + "time_per_iteration": 2.4919426441192627 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_clip": 1.03914368, + "balance_loss_mlp": 1.05323386, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9550995481311175, + "language_loss": 0.87150526, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89381945, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1743, + "time_per_iteration": 2.470841884613037 + }, + { + "auxiliary_loss_clip": 0.01168292, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.02760363, + "balance_loss_mlp": 1.04964471, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.1176645115958923, + "language_loss": 0.75532508, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77750671, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1744, + "time_per_iteration": 2.4725873470306396 + }, + { + "auxiliary_loss_clip": 0.01171079, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.05184436, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.151699961275852, + "language_loss": 0.79306591, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530583, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1745, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_clip": 1.04194999, + "balance_loss_mlp": 1.047683, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.087314316255438, + "language_loss": 0.82382894, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8461262, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1746, + "time_per_iteration": 2.520306348800659 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01062461, + "balance_loss_clip": 1.04186153, + "balance_loss_mlp": 1.05198646, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.645771273172373, + "language_loss": 0.69951761, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7218436, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1747, + "time_per_iteration": 2.618581771850586 + }, + { + "auxiliary_loss_clip": 0.01176288, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04140496, + "balance_loss_mlp": 1.05136323, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.3385484358742192, + "language_loss": 0.84245849, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86484385, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1748, + "time_per_iteration": 2.539386034011841 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.02797103, + "balance_loss_mlp": 1.04729426, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.8953667439120294, + "language_loss": 0.71491921, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7370674, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1796875, + "step": 1749, + "time_per_iteration": 2.481700897216797 + }, + { + "auxiliary_loss_clip": 0.01166695, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.04953468, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.9760411129591238, + "language_loss": 0.81960011, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84187424, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1750, + "time_per_iteration": 2.4454832077026367 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.05259562, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3402404294313524, + "language_loss": 0.91871023, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94105875, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1751, + "time_per_iteration": 2.416607141494751 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.02698207, + "balance_loss_mlp": 1.04889047, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.265296057434122, + "language_loss": 0.79560149, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81774485, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1796875, + "step": 1752, + "time_per_iteration": 2.46063494682312 + }, + { + "auxiliary_loss_clip": 0.01167711, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.03202033, + "balance_loss_mlp": 1.05050862, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.1401152378303867, + "language_loss": 0.75782037, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78002656, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1753, + "time_per_iteration": 2.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01172527, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.04939532, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0790136174876546, + "language_loss": 0.84048498, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86278164, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.234375, + "step": 1754, + "time_per_iteration": 2.4683756828308105 + }, + { + "auxiliary_loss_clip": 0.01175207, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.05438888, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8736094439376645, + "language_loss": 0.68956709, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71185535, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1755, + "time_per_iteration": 2.45597243309021 + }, + { + "auxiliary_loss_clip": 0.01172827, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.03989983, + "balance_loss_mlp": 1.05102587, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 5.502613786824721, + "language_loss": 0.76718754, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78953344, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1756, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.03712726, + "balance_loss_mlp": 1.04982626, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7784869470084927, + "language_loss": 0.80162531, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82392669, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1757, + "time_per_iteration": 2.4551706314086914 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.03499317, + "balance_loss_mlp": 1.05132246, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.631431596421375, + "language_loss": 0.78800333, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81028521, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1758, + "time_per_iteration": 2.7955896854400635 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_clip": 1.02865982, + "balance_loss_mlp": 1.05364573, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.609159841262955, + "language_loss": 0.9189958, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94127536, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.234375, + "step": 1759, + "time_per_iteration": 2.4853782653808594 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.03276575, + "balance_loss_mlp": 1.04970741, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.498568213886603, + "language_loss": 0.76932353, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79161119, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.234375, + "step": 1760, + "time_per_iteration": 2.470705509185791 + }, + { + "auxiliary_loss_clip": 0.01173982, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.03825736, + "balance_loss_mlp": 1.05152941, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.349800445259612, + "language_loss": 0.89282435, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91517675, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1761, + "time_per_iteration": 2.491501569747925 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0518589, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.4794664397863877, + "language_loss": 0.78304708, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80538261, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1762, + "time_per_iteration": 2.5563831329345703 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01014008, + "balance_loss_clip": 1.0110991, + "balance_loss_mlp": 1.02000487, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.753444103392694, + "language_loss": 0.60481733, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62557811, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.421875, + "step": 1763, + "time_per_iteration": 3.2239294052124023 + }, + { + "auxiliary_loss_clip": 0.01170891, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02777529, + "balance_loss_mlp": 1.04924011, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.054980370260194, + "language_loss": 0.8010751, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82327372, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1764, + "time_per_iteration": 2.476325273513794 + }, + { + "auxiliary_loss_clip": 0.01169028, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.02745855, + "balance_loss_mlp": 1.04961264, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7621956234955212, + "language_loss": 0.7999962, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82217997, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1765, + "time_per_iteration": 2.446593999862671 + }, + { + "auxiliary_loss_clip": 0.01167126, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.03394008, + "balance_loss_mlp": 1.04794002, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.867239621884004, + "language_loss": 0.76693732, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78915727, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1766, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01170332, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.05017042, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6580981789618001, + "language_loss": 0.77319431, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1767, + "time_per_iteration": 2.542797088623047 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01000453, + "balance_loss_clip": 0.99785471, + "balance_loss_mlp": 1.01804066, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6789245534488961, + "language_loss": 0.57902765, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59963286, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.421875, + "step": 1768, + "time_per_iteration": 6.071596384048462 + }, + { + "auxiliary_loss_clip": 0.01172748, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.0219171, + "balance_loss_mlp": 1.05201912, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.446404125156032, + "language_loss": 0.86796767, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89011335, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1769, + "time_per_iteration": 2.5106868743896484 + }, + { + "auxiliary_loss_clip": 0.01175908, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.05300689, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.766851816283336, + "language_loss": 0.61890501, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64123213, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1770, + "time_per_iteration": 2.5770323276519775 + }, + { + "auxiliary_loss_clip": 0.01061292, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00058925, + "balance_loss_mlp": 1.01873469, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8864779346546747, + "language_loss": 0.57095039, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59159505, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.42578125, + "step": 1771, + "time_per_iteration": 2.957993507385254 + }, + { + "auxiliary_loss_clip": 0.01174087, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.05443954, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6398085638646198, + "language_loss": 0.88530469, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90767658, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1772, + "time_per_iteration": 2.520744562149048 + }, + { + "auxiliary_loss_clip": 0.01176768, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.05091381, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.8236986107629094, + "language_loss": 0.76021719, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78262091, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.2578125, + "step": 1773, + "time_per_iteration": 2.4228129386901855 + }, + { + "auxiliary_loss_clip": 0.01171647, + "auxiliary_loss_mlp": 0.01063224, + "balance_loss_clip": 1.04087257, + "balance_loss_mlp": 1.05147731, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1931291175477177, + "language_loss": 0.83184093, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85418963, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1774, + "time_per_iteration": 2.5613787174224854 + }, + { + "auxiliary_loss_clip": 0.01177598, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.05220413, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.683505024819064, + "language_loss": 0.76297373, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78529418, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.25, + "step": 1775, + "time_per_iteration": 2.437676429748535 + }, + { + "auxiliary_loss_clip": 0.01057587, + "auxiliary_loss_mlp": 0.01006639, + "balance_loss_clip": 1.00413537, + "balance_loss_mlp": 1.01520467, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8253045983972309, + "language_loss": 0.57443953, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59508181, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.42382812, + "step": 1776, + "time_per_iteration": 3.101378917694092 + }, + { + "auxiliary_loss_clip": 0.01176962, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.0365653, + "balance_loss_mlp": 1.05411029, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6481869723516467, + "language_loss": 0.83374244, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8561098, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2265625, + "step": 1777, + "time_per_iteration": 2.5109002590179443 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.03371584, + "balance_loss_mlp": 1.05298579, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.6420984425067013, + "language_loss": 0.87275863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501154, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1778, + "time_per_iteration": 2.503103494644165 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.032354, + "balance_loss_mlp": 1.05328, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.213225731734914, + "language_loss": 0.83970487, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86199337, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1779, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01169562, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.03086162, + "balance_loss_mlp": 1.04975557, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.4959309518827655, + "language_loss": 0.67064941, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69286621, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1780, + "time_per_iteration": 2.447756052017212 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046888, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.05183458, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.25546613947904, + "language_loss": 0.91667759, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93886495, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1781, + "time_per_iteration": 2.4367144107818604 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01046071, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 1.05302, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.202402738572802, + "language_loss": 0.79505372, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81726873, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2265625, + "step": 1782, + "time_per_iteration": 2.4340877532958984 + }, + { + "auxiliary_loss_clip": 0.01175468, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.03055024, + "balance_loss_mlp": 1.0517509, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0445491568240994, + "language_loss": 0.78994977, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81222689, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.234375, + "step": 1783, + "time_per_iteration": 2.434527635574341 + }, + { + "auxiliary_loss_clip": 0.01176375, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.02991986, + "balance_loss_mlp": 1.0529108, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8050343336808015, + "language_loss": 0.85956216, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88184798, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1784, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.01174134, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.05080986, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.0774406347184806, + "language_loss": 1.00899053, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03127265, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.234375, + "step": 1785, + "time_per_iteration": 2.46663498878479 + }, + { + "auxiliary_loss_clip": 0.01171119, + "auxiliary_loss_mlp": 0.01057254, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05306709, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4495948735276882, + "language_loss": 0.85070992, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87299371, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1796875, + "step": 1786, + "time_per_iteration": 2.505018949508667 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.04750311, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8788886178726656, + "language_loss": 0.78817046, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81046188, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1787, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01176938, + "auxiliary_loss_mlp": 0.01049821, + "balance_loss_clip": 1.02768385, + "balance_loss_mlp": 1.0517112, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.551869220071384, + "language_loss": 0.82557851, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84784609, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.25, + "step": 1788, + "time_per_iteration": 2.4807305335998535 + }, + { + "auxiliary_loss_clip": 0.01170019, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.04939878, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.778852512980128, + "language_loss": 0.77794182, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80027628, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1789, + "time_per_iteration": 2.482330322265625 + }, + { + "auxiliary_loss_clip": 0.01173111, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.05133712, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.434124451319009, + "language_loss": 0.74467903, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76702261, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.21875, + "step": 1790, + "time_per_iteration": 2.5921239852905273 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.05428767, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.5839507236364554, + "language_loss": 0.78495383, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80745554, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.234375, + "step": 1791, + "time_per_iteration": 2.5242488384246826 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01053897, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.05112934, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.8605555947944812, + "language_loss": 0.70855284, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73076522, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1792, + "time_per_iteration": 2.5260751247406006 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.02330506, + "balance_loss_mlp": 1.05109024, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.973355145299492, + "language_loss": 0.76029646, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78251767, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1793, + "time_per_iteration": 2.5037007331848145 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.03793848, + "balance_loss_mlp": 1.0537113, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7251623627880495, + "language_loss": 0.85158944, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87391031, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1794, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01180393, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.02954292, + "balance_loss_mlp": 1.05342674, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9245153565321482, + "language_loss": 0.74914879, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77148265, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.265625, + "step": 1795, + "time_per_iteration": 2.486111879348755 + }, + { + "auxiliary_loss_clip": 0.0117609, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.03863525, + "balance_loss_mlp": 1.05227423, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.917857918230487, + "language_loss": 0.8116014, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83397192, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1796, + "time_per_iteration": 2.4357504844665527 + }, + { + "auxiliary_loss_clip": 0.01177296, + "auxiliary_loss_mlp": 0.01075942, + "balance_loss_clip": 1.05260134, + "balance_loss_mlp": 1.05476594, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.4043777768562293, + "language_loss": 0.73476732, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75729972, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1797, + "time_per_iteration": 2.477867841720581 + }, + { + "auxiliary_loss_clip": 0.01172695, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.03081274, + "balance_loss_mlp": 1.05260658, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 3.1892188654982396, + "language_loss": 0.81348622, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83572453, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1798, + "time_per_iteration": 2.5060064792633057 + }, + { + "auxiliary_loss_clip": 0.011719, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03075755, + "balance_loss_mlp": 1.0508821, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4195393058725623, + "language_loss": 0.85180116, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87405908, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2109375, + "step": 1799, + "time_per_iteration": 2.4546945095062256 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.04271412, + "balance_loss_mlp": 1.0546999, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.2474252534922265, + "language_loss": 0.77365196, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79602301, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.1796875, + "step": 1800, + "time_per_iteration": 2.4650769233703613 + }, + { + "auxiliary_loss_clip": 0.01168665, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.02443254, + "balance_loss_mlp": 1.05136347, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 2.2954016650766844, + "language_loss": 0.7287963, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7509284, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1801, + "time_per_iteration": 2.5045113563537598 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.05259442, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.8364602771794378, + "language_loss": 0.66427058, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68653458, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1802, + "time_per_iteration": 2.5547947883605957 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.05202222, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7898565484043845, + "language_loss": 0.8136133, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83590758, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1803, + "time_per_iteration": 2.4758658409118652 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.031106, + "balance_loss_mlp": 1.05126929, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 2.61974519761109, + "language_loss": 0.9122982, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93452168, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1875, + "step": 1804, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.01175328, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03031349, + "balance_loss_mlp": 1.05401301, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0091269076806078, + "language_loss": 0.7623654, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78464663, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1805, + "time_per_iteration": 2.5379836559295654 + }, + { + "auxiliary_loss_clip": 0.01172079, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.0535754, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8192828849331855, + "language_loss": 0.860416, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88261837, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1806, + "time_per_iteration": 2.5523955821990967 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01053704, + "balance_loss_clip": 1.03275895, + "balance_loss_mlp": 1.05068612, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 5.439462316727856, + "language_loss": 0.80572915, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82797557, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1807, + "time_per_iteration": 2.514390230178833 + }, + { + "auxiliary_loss_clip": 0.01171878, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.05415583, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7684897552837426, + "language_loss": 0.78731525, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80954707, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.171875, + "step": 1808, + "time_per_iteration": 2.5084331035614014 + }, + { + "auxiliary_loss_clip": 0.01176105, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.03090501, + "balance_loss_mlp": 1.05633223, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6609588216066864, + "language_loss": 0.78927523, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81155634, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1953125, + "step": 1809, + "time_per_iteration": 5.368049621582031 + }, + { + "auxiliary_loss_clip": 0.01171492, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.03318286, + "balance_loss_mlp": 1.05087388, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0370215842844197, + "language_loss": 0.8468523, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86910635, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1810, + "time_per_iteration": 3.904432535171509 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.03111291, + "balance_loss_mlp": 1.05665135, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.9531179942167565, + "language_loss": 0.63677633, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6591261, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.234375, + "step": 1811, + "time_per_iteration": 2.523650646209717 + }, + { + "auxiliary_loss_clip": 0.01171345, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.02737319, + "balance_loss_mlp": 1.05139136, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.279966127836369, + "language_loss": 0.74238914, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76459008, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1812, + "time_per_iteration": 2.5579042434692383 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.05391026, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.9314487748153213, + "language_loss": 0.72647583, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74868566, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1875, + "step": 1813, + "time_per_iteration": 2.488762617111206 + }, + { + "auxiliary_loss_clip": 0.01174675, + "auxiliary_loss_mlp": 0.01051455, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.05744648, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 10.097396236718186, + "language_loss": 0.82224226, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84450358, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1814, + "time_per_iteration": 2.495798349380493 + }, + { + "auxiliary_loss_clip": 0.01176897, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.05595291, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3551509805271422, + "language_loss": 0.84218144, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86452949, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2109375, + "step": 1815, + "time_per_iteration": 2.462663173675537 + }, + { + "auxiliary_loss_clip": 0.01175955, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.03054035, + "balance_loss_mlp": 1.05833483, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3954928768695027, + "language_loss": 0.71048725, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73277813, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.171875, + "step": 1816, + "time_per_iteration": 2.465953826904297 + }, + { + "auxiliary_loss_clip": 0.01178612, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.04061651, + "balance_loss_mlp": 1.056674, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0063973144433067, + "language_loss": 0.72811669, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75053406, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1817, + "time_per_iteration": 2.5323143005371094 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_clip": 1.03167605, + "balance_loss_mlp": 1.05709267, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.767365755633268, + "language_loss": 0.67279243, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6951232, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1818, + "time_per_iteration": 2.5450243949890137 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.04123521, + "balance_loss_mlp": 1.05534315, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.0802139312896744, + "language_loss": 0.72992313, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75232661, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1819, + "time_per_iteration": 2.487644910812378 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02994883, + "balance_loss_mlp": 1.06090236, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 7.240077427900601, + "language_loss": 0.73943537, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76175541, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.203125, + "step": 1820, + "time_per_iteration": 2.5064899921417236 + }, + { + "auxiliary_loss_clip": 0.01177081, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02765381, + "balance_loss_mlp": 1.05699766, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.1677198782015887, + "language_loss": 0.82586408, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84814322, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 1821, + "time_per_iteration": 2.4487218856811523 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.0280906, + "balance_loss_mlp": 1.05549288, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.4783722356243065, + "language_loss": 0.76171732, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78395414, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1822, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01175357, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.03070986, + "balance_loss_mlp": 1.05751145, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.9066217775511896, + "language_loss": 0.79275787, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81502879, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1796875, + "step": 1823, + "time_per_iteration": 2.5665249824523926 + }, + { + "auxiliary_loss_clip": 0.01176588, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.02583015, + "balance_loss_mlp": 1.05788529, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.7066251744315906, + "language_loss": 0.79424715, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81649172, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1824, + "time_per_iteration": 2.5238118171691895 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.05610347, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.2183246130345, + "language_loss": 0.87992203, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90220273, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1825, + "time_per_iteration": 2.48294734954834 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.05362988, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8404731426595848, + "language_loss": 0.76462233, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78689909, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1826, + "time_per_iteration": 2.6397035121917725 + }, + { + "auxiliary_loss_clip": 0.01066703, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 0.9983961, + "balance_loss_mlp": 1.02257371, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8361632453995619, + "language_loss": 0.54999328, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57067442, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.44140625, + "step": 1827, + "time_per_iteration": 3.065896511077881 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01003719, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.02098036, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7348311418426204, + "language_loss": 0.55346334, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57414544, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.43359375, + "step": 1828, + "time_per_iteration": 3.0850460529327393 + }, + { + "auxiliary_loss_clip": 0.01180205, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.05754089, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.992065013624077, + "language_loss": 0.84191215, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86435115, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2265625, + "step": 1829, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01057367, + "balance_loss_clip": 1.03348923, + "balance_loss_mlp": 1.05845475, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.654516298718269, + "language_loss": 0.8878119, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91019976, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2265625, + "step": 1830, + "time_per_iteration": 2.6912100315093994 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01017752, + "balance_loss_clip": 1.01497495, + "balance_loss_mlp": 1.01824236, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6883241829767079, + "language_loss": 0.55492055, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570827, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.42773438, + "step": 1831, + "time_per_iteration": 3.075678825378418 + }, + { + "auxiliary_loss_clip": 0.01183643, + "auxiliary_loss_mlp": 0.01072422, + "balance_loss_clip": 1.04829443, + "balance_loss_mlp": 1.05867732, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.054835171188452, + "language_loss": 0.90726995, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92983055, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.25, + "step": 1832, + "time_per_iteration": 2.5084948539733887 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01015171, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01603723, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7159549093535102, + "language_loss": 0.59889859, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61963969, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.4296875, + "step": 1833, + "time_per_iteration": 3.0748977661132812 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_clip": 1.0277946, + "balance_loss_mlp": 1.05353165, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6030857455850303, + "language_loss": 0.8095156, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83177137, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1834, + "time_per_iteration": 2.452131509780884 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.0302285, + "balance_loss_mlp": 1.05899858, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.5262438386564807, + "language_loss": 0.90514123, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9274807, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2109375, + "step": 1835, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01177237, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.03445005, + "balance_loss_mlp": 1.05625033, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.0785934228774003, + "language_loss": 0.63590646, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65826416, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2109375, + "step": 1836, + "time_per_iteration": 2.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01173672, + "auxiliary_loss_mlp": 0.01057372, + "balance_loss_clip": 1.03547311, + "balance_loss_mlp": 1.05388379, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9330421575083043, + "language_loss": 0.72814602, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75045645, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1837, + "time_per_iteration": 2.594910144805908 + }, + { + "auxiliary_loss_clip": 0.01179947, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.132041599419941, + "language_loss": 0.79049784, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81289005, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1838, + "time_per_iteration": 2.4922690391540527 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.05623114, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 4.130442583787946, + "language_loss": 0.71453696, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73690271, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1839, + "time_per_iteration": 2.5151031017303467 + }, + { + "auxiliary_loss_clip": 0.01183988, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.03082371, + "balance_loss_mlp": 1.05938148, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.064820600929851, + "language_loss": 0.79099703, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1840, + "time_per_iteration": 2.487116575241089 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01053863, + "balance_loss_clip": 1.03234553, + "balance_loss_mlp": 1.05329347, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 2.0204098875762293, + "language_loss": 0.87691998, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89917111, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1796875, + "step": 1841, + "time_per_iteration": 2.5141868591308594 + }, + { + "auxiliary_loss_clip": 0.01170262, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.03126192, + "balance_loss_mlp": 1.05365491, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.3404569451138535, + "language_loss": 0.90582979, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92806768, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1842, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.0117179, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.03080761, + "balance_loss_mlp": 1.05210185, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.171204868901281, + "language_loss": 0.85597986, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87821329, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1843, + "time_per_iteration": 2.4801278114318848 + }, + { + "auxiliary_loss_clip": 0.01174004, + "auxiliary_loss_mlp": 0.01060021, + "balance_loss_clip": 1.03645349, + "balance_loss_mlp": 1.05622911, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.47825888700324, + "language_loss": 0.7494424, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77178264, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1796875, + "step": 1844, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01173241, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.0304563, + "balance_loss_mlp": 1.05405343, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0036363505702433, + "language_loss": 0.75732028, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77959603, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.1953125, + "step": 1845, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03341389, + "balance_loss_mlp": 1.05351365, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.6129010657048202, + "language_loss": 0.76336479, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7856214, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.171875, + "step": 1846, + "time_per_iteration": 2.465045928955078 + }, + { + "auxiliary_loss_clip": 0.01175917, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.03262115, + "balance_loss_mlp": 1.05392015, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.9965527726637577, + "language_loss": 0.85611343, + "learning_rate": 3.931489981933584e-06, + "loss": 0.87841111, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1847, + "time_per_iteration": 2.4493908882141113 + }, + { + "auxiliary_loss_clip": 0.01174539, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03018796, + "balance_loss_mlp": 1.05326366, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.3740806549350086, + "language_loss": 0.76464605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78692293, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.2109375, + "step": 1848, + "time_per_iteration": 2.4647536277770996 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02754378, + "balance_loss_mlp": 1.05833888, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 2.0750561163348173, + "language_loss": 0.77849847, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8007198, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1849, + "time_per_iteration": 2.514777660369873 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.03496861, + "balance_loss_mlp": 1.05422294, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.6662643697478066, + "language_loss": 0.71315688, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73548102, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1850, + "time_per_iteration": 2.4420053958892822 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.05444217, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2721050151861912, + "language_loss": 0.81174368, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83405614, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 1851, + "time_per_iteration": 5.341679811477661 + }, + { + "auxiliary_loss_clip": 0.01173679, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.03126621, + "balance_loss_mlp": 1.05519962, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.240427658931177, + "language_loss": 0.88860446, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91085827, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1852, + "time_per_iteration": 3.8281352519989014 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.03514326, + "balance_loss_mlp": 1.05636191, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0685366180695848, + "language_loss": 0.72092974, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74327302, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1853, + "time_per_iteration": 2.4896738529205322 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 0.99923038, + "balance_loss_mlp": 1.0132798, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7783537669608381, + "language_loss": 0.53647029, + "learning_rate": 3.930780749680273e-06, + "loss": 0.5570485, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.42578125, + "step": 1854, + "time_per_iteration": 3.0189781188964844 + }, + { + "auxiliary_loss_clip": 0.01184355, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02937746, + "balance_loss_mlp": 1.057657, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.006296213399466, + "language_loss": 0.8394689, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.861835, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.265625, + "step": 1855, + "time_per_iteration": 2.4908485412597656 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.0106694, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.05353498, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2091175797191815, + "language_loss": 0.82098675, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84339261, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.203125, + "step": 1856, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02631092, + "balance_loss_mlp": 1.05662763, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.9605277294776, + "language_loss": 0.8305279, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85274535, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1857, + "time_per_iteration": 2.5205907821655273 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.0279119, + "balance_loss_mlp": 1.05195725, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3309612964817923, + "language_loss": 0.83037764, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85260725, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.21875, + "step": 1858, + "time_per_iteration": 2.4567432403564453 + }, + { + "auxiliary_loss_clip": 0.01175678, + "auxiliary_loss_mlp": 0.01062921, + "balance_loss_clip": 1.04205894, + "balance_loss_mlp": 1.05549788, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.004830650729854, + "language_loss": 0.91120583, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93359184, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1859, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.02983618, + "balance_loss_mlp": 1.05344319, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.4768392741235306, + "language_loss": 0.81709313, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83934522, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1860, + "time_per_iteration": 2.4747087955474854 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.0361197, + "balance_loss_mlp": 1.05388653, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.1256274007234937, + "language_loss": 0.75203162, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77430284, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1861, + "time_per_iteration": 2.4773240089416504 + }, + { + "auxiliary_loss_clip": 0.01169857, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.03318143, + "balance_loss_mlp": 1.05338371, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.0016824982414776, + "language_loss": 0.88759935, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90982509, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1640625, + "step": 1862, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01173358, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.05597067, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1858127473987015, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89302975, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 1863, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051873, + "balance_loss_clip": 1.0283289, + "balance_loss_mlp": 1.05463171, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0887108243102976, + "language_loss": 0.64630157, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66856015, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.1953125, + "step": 1864, + "time_per_iteration": 2.4843807220458984 + }, + { + "auxiliary_loss_clip": 0.01171142, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.03169096, + "balance_loss_mlp": 1.05504417, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0715232833306874, + "language_loss": 0.73895639, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76117796, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1865, + "time_per_iteration": 2.4509596824645996 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02928007, + "balance_loss_mlp": 1.05253589, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.190736679244475, + "language_loss": 0.84019023, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86240977, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 1866, + "time_per_iteration": 2.473715305328369 + }, + { + "auxiliary_loss_clip": 0.01169711, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.02737069, + "balance_loss_mlp": 1.05260134, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5419857436109028, + "language_loss": 0.81424987, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83643156, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1867, + "time_per_iteration": 2.5367391109466553 + }, + { + "auxiliary_loss_clip": 0.01172987, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.05594015, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.5308159777425976, + "language_loss": 0.86677599, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88905597, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1868, + "time_per_iteration": 2.5044100284576416 + }, + { + "auxiliary_loss_clip": 0.01172172, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.05724931, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.333499600894065, + "language_loss": 0.68059367, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70281279, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.1484375, + "step": 1869, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01061629, + "balance_loss_clip": 1.03969407, + "balance_loss_mlp": 1.05456114, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 2.049754856307833, + "language_loss": 0.7735095, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79589236, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1870, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.03177094, + "balance_loss_mlp": 1.05264199, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8085683914823212, + "language_loss": 0.75747174, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77974463, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1871, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.05448031, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.4822417703451265, + "language_loss": 0.81949306, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84170687, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.171875, + "step": 1872, + "time_per_iteration": 2.4984912872314453 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.05497694, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7250665555581937, + "language_loss": 0.83564019, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85789096, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1796875, + "step": 1873, + "time_per_iteration": 2.480172872543335 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.05352998, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.2103217259008985, + "language_loss": 0.91925669, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9415459, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1874, + "time_per_iteration": 2.5193865299224854 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.0528394, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5656160151577971, + "language_loss": 0.7534616, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.15625, + "step": 1875, + "time_per_iteration": 2.509000062942505 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.05498421, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.875753927893446, + "language_loss": 0.71727258, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73950088, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1328125, + "step": 1876, + "time_per_iteration": 2.5222911834716797 + }, + { + "auxiliary_loss_clip": 0.01170022, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.03036463, + "balance_loss_mlp": 1.05574679, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.408917627715415, + "language_loss": 0.76760256, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78981495, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 1877, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01173931, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.05530918, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.569804002246691, + "language_loss": 0.88132238, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1878, + "time_per_iteration": 2.4562089443206787 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.02628088, + "balance_loss_mlp": 1.05382609, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2792620862185036, + "language_loss": 0.81521666, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83739763, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.171875, + "step": 1879, + "time_per_iteration": 2.515162944793701 + }, + { + "auxiliary_loss_clip": 0.01174903, + "auxiliary_loss_mlp": 0.01056113, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.05591071, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.9729184409385376, + "language_loss": 0.70101768, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72332788, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1880, + "time_per_iteration": 2.5420267581939697 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.05396068, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.7442831242084353, + "language_loss": 0.72337204, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74552047, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1881, + "time_per_iteration": 2.4648680686950684 + }, + { + "auxiliary_loss_clip": 0.01172977, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.03452694, + "balance_loss_mlp": 1.05385113, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4452990726029533, + "language_loss": 0.74243963, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76474178, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1953125, + "step": 1882, + "time_per_iteration": 2.459181547164917 + }, + { + "auxiliary_loss_clip": 0.01171271, + "auxiliary_loss_mlp": 0.01052266, + "balance_loss_clip": 1.03045106, + "balance_loss_mlp": 1.05493677, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.8641228673356873, + "language_loss": 0.79328096, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81551635, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 1883, + "time_per_iteration": 2.5236945152282715 + }, + { + "auxiliary_loss_clip": 0.01173507, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.03271067, + "balance_loss_mlp": 1.05288672, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.0524763398538193, + "language_loss": 0.77151698, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79379749, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1884, + "time_per_iteration": 2.4974489212036133 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01012749, + "balance_loss_clip": 1.0102694, + "balance_loss_mlp": 1.02156711, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7928734254501784, + "language_loss": 0.55183071, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5725978, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.42382812, + "step": 1885, + "time_per_iteration": 2.9756290912628174 + }, + { + "auxiliary_loss_clip": 0.01166272, + "auxiliary_loss_mlp": 0.01047922, + "balance_loss_clip": 1.02758515, + "balance_loss_mlp": 1.0534817, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 5.752063942495911, + "language_loss": 0.90240276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92454469, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 1886, + "time_per_iteration": 2.5031139850616455 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03018832, + "balance_loss_mlp": 1.05306387, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0267704425546036, + "language_loss": 0.85101235, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87321353, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1887, + "time_per_iteration": 2.5177412033081055 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01061982, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.05554259, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.5783153731033055, + "language_loss": 0.76168925, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1888, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.03542566, + "balance_loss_mlp": 1.05632472, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.114301103868513, + "language_loss": 0.68039739, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70275331, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.21875, + "step": 1889, + "time_per_iteration": 2.643867015838623 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.05620956, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.158184033346157, + "language_loss": 0.84414917, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86635208, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 1890, + "time_per_iteration": 2.5018270015716553 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02855039, + "balance_loss_mlp": 1.05288363, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.2859967152973373, + "language_loss": 0.65099049, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67317504, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 1891, + "time_per_iteration": 2.4870762825012207 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.05397856, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.358390081637715, + "language_loss": 0.87789619, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90005904, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1953125, + "step": 1892, + "time_per_iteration": 2.469215154647827 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.04509139, + "balance_loss_mlp": 1.05419993, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4185703679999775, + "language_loss": 0.72724342, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7496407, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 1893, + "time_per_iteration": 4.021688222885132 + }, + { + "auxiliary_loss_clip": 0.01169367, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.05175805, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.254020248775613, + "language_loss": 0.79367435, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81595069, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.171875, + "step": 1894, + "time_per_iteration": 3.9190711975097656 + }, + { + "auxiliary_loss_clip": 0.01176791, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.05530715, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.587114905294773, + "language_loss": 0.78868139, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.21875, + "step": 1895, + "time_per_iteration": 2.5924861431121826 + }, + { + "auxiliary_loss_clip": 0.0106161, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 0.99917758, + "balance_loss_mlp": 1.01840448, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8005582337036792, + "language_loss": 0.63316774, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65380025, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43359375, + "step": 1896, + "time_per_iteration": 3.143843412399292 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_clip": 1.03600097, + "balance_loss_mlp": 1.05385494, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6058527618620146, + "language_loss": 0.84707338, + "learning_rate": 3.926345380796821e-06, + "loss": 0.86936897, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.15625, + "step": 1897, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.0117262, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.03159046, + "balance_loss_mlp": 1.05385423, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3286063431421926, + "language_loss": 0.79776239, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8200193, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1875, + "step": 1898, + "time_per_iteration": 2.5186216831207275 + }, + { + "auxiliary_loss_clip": 0.01174476, + "auxiliary_loss_mlp": 0.01056562, + "balance_loss_clip": 1.03330398, + "balance_loss_mlp": 1.05247831, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.996095488823442, + "language_loss": 0.73049861, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75280899, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1899, + "time_per_iteration": 2.484767198562622 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.0167762, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9092154832512579, + "language_loss": 0.63432097, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65496433, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.4296875, + "step": 1900, + "time_per_iteration": 3.0239956378936768 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.05181098, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6715138036124124, + "language_loss": 0.78116465, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80344748, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.1875, + "step": 1901, + "time_per_iteration": 2.5007457733154297 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.05482793, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9023337273707566, + "language_loss": 0.83676988, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85908997, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1902, + "time_per_iteration": 2.4389002323150635 + }, + { + "auxiliary_loss_clip": 0.0117356, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05356252, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6958297254772137, + "language_loss": 0.77551281, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79775804, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1903, + "time_per_iteration": 2.503164768218994 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.05437744, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.553861289811236, + "language_loss": 0.75704938, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77922231, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.171875, + "step": 1904, + "time_per_iteration": 2.5097854137420654 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.05519056, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.146045336495955, + "language_loss": 0.92476678, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94702017, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1905, + "time_per_iteration": 2.4905850887298584 + }, + { + "auxiliary_loss_clip": 0.0117632, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.0496794, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.457773566764277, + "language_loss": 0.77108872, + "learning_rate": 3.925399944279861e-06, + "loss": 0.7933597, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.265625, + "step": 1906, + "time_per_iteration": 2.4469265937805176 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.03072143, + "balance_loss_mlp": 1.05375302, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.4555636334810593, + "language_loss": 0.81855345, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84082305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1907, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01173651, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.05599511, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.041607412488977, + "language_loss": 0.84798187, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87037772, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1908, + "time_per_iteration": 2.468519687652588 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.01344705, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9477470057539497, + "language_loss": 0.6100027, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63061339, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.43164062, + "step": 1909, + "time_per_iteration": 2.8313472270965576 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.05660319, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.135894642259737, + "language_loss": 0.78793955, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8102057, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1910, + "time_per_iteration": 2.4613592624664307 + }, + { + "auxiliary_loss_clip": 0.01178149, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.03005373, + "balance_loss_mlp": 1.05803406, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.035949872237615, + "language_loss": 0.76787984, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79017925, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1911, + "time_per_iteration": 2.475069761276245 + }, + { + "auxiliary_loss_clip": 0.01166349, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.03048277, + "balance_loss_mlp": 1.05284548, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.1144124150337023, + "language_loss": 0.7927531, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81493074, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 1912, + "time_per_iteration": 2.543607473373413 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_clip": 1.0369364, + "balance_loss_mlp": 1.05352569, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9322037304643997, + "language_loss": 0.7777245, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80000544, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 1913, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01169525, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.02889776, + "balance_loss_mlp": 1.05118954, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 3.783180746712747, + "language_loss": 0.70389271, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72609949, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1914, + "time_per_iteration": 2.5099785327911377 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 0.99943084, + "balance_loss_mlp": 1.01452589, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7556045547130329, + "language_loss": 0.61044526, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63105142, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.44140625, + "step": 1915, + "time_per_iteration": 3.1735148429870605 + }, + { + "auxiliary_loss_clip": 0.01172283, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_clip": 1.03273964, + "balance_loss_mlp": 1.05674434, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.822924091618307, + "language_loss": 0.9323889, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95465934, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.15625, + "step": 1916, + "time_per_iteration": 2.4806342124938965 + }, + { + "auxiliary_loss_clip": 0.01171086, + "auxiliary_loss_mlp": 0.01061893, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.05340374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8768677942494545, + "language_loss": 0.72286755, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7451973, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.171875, + "step": 1917, + "time_per_iteration": 2.519758701324463 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.05521619, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.2984335892825594, + "language_loss": 0.74389827, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76610279, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 1918, + "time_per_iteration": 2.4867136478424072 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.02394044, + "balance_loss_mlp": 1.05273843, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1981507651696193, + "language_loss": 0.86515707, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88735056, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1919, + "time_per_iteration": 2.4838428497314453 + }, + { + "auxiliary_loss_clip": 0.01168988, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.03190136, + "balance_loss_mlp": 1.05291939, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.516832715272094, + "language_loss": 0.86640596, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88864017, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.15625, + "step": 1920, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01167627, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.05360281, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.2143351457696525, + "language_loss": 0.79792106, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82007331, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 1921, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.01174597, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.03331947, + "balance_loss_mlp": 1.05358851, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 8.96706495073623, + "language_loss": 0.78418177, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8064878, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2109375, + "step": 1922, + "time_per_iteration": 2.5293705463409424 + }, + { + "auxiliary_loss_clip": 0.01174074, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.03910375, + "balance_loss_mlp": 1.05410469, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8482726295091094, + "language_loss": 0.84187758, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86422473, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.203125, + "step": 1923, + "time_per_iteration": 2.5203118324279785 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01053498, + "balance_loss_clip": 1.03074098, + "balance_loss_mlp": 1.05742192, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0576366068601666, + "language_loss": 0.80471247, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1796875, + "step": 1924, + "time_per_iteration": 2.48531436920166 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 0.99917841, + "balance_loss_mlp": 1.0154866, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.810907468185892, + "language_loss": 0.6115036, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6321063, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 3.112396478652954 + }, + { + "auxiliary_loss_clip": 0.01173159, + "auxiliary_loss_mlp": 0.01076027, + "balance_loss_clip": 1.05304384, + "balance_loss_mlp": 1.05447614, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.806943429185086, + "language_loss": 0.7482335, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77072537, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.1875, + "step": 1926, + "time_per_iteration": 2.4890315532684326 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.03873897, + "balance_loss_mlp": 1.0552361, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.429758451090488, + "language_loss": 0.73112315, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7535038, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.203125, + "step": 1927, + "time_per_iteration": 2.4673402309417725 + }, + { + "auxiliary_loss_clip": 0.0117016, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.03244913, + "balance_loss_mlp": 1.05291271, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.854021270140142, + "language_loss": 0.86824137, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89049077, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 1928, + "time_per_iteration": 2.530325412750244 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.03289056, + "balance_loss_mlp": 1.05469573, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.71243688867153, + "language_loss": 0.77567977, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79796684, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1929, + "time_per_iteration": 2.489664316177368 + }, + { + "auxiliary_loss_clip": 0.01168882, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.05385804, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6293868207273203, + "language_loss": 0.76724243, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78955561, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1484375, + "step": 1930, + "time_per_iteration": 2.5867533683776855 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03027928, + "balance_loss_mlp": 1.05313969, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9270697111110349, + "language_loss": 0.72114342, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74335825, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1931, + "time_per_iteration": 2.5218429565429688 + }, + { + "auxiliary_loss_clip": 0.01173627, + "auxiliary_loss_mlp": 0.0105412, + "balance_loss_clip": 1.03168511, + "balance_loss_mlp": 1.05528855, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5295866923660926, + "language_loss": 0.82133794, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84361541, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 1932, + "time_per_iteration": 2.4879212379455566 + }, + { + "auxiliary_loss_clip": 0.01053319, + "auxiliary_loss_mlp": 0.01007659, + "balance_loss_clip": 1.00539386, + "balance_loss_mlp": 1.0111897, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7701959329661775, + "language_loss": 0.61053753, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63114727, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.421875, + "step": 1933, + "time_per_iteration": 2.960437059402466 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03248382, + "balance_loss_mlp": 1.05259895, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.2263920275904425, + "language_loss": 0.85587192, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87813795, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1953125, + "step": 1934, + "time_per_iteration": 5.3810875415802 + }, + { + "auxiliary_loss_clip": 0.01178805, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.05852652, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.481370623449466, + "language_loss": 0.65555394, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67793274, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1935, + "time_per_iteration": 2.483814239501953 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.05533004, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8046174937009931, + "language_loss": 0.75469184, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77699012, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.2109375, + "step": 1936, + "time_per_iteration": 3.8786003589630127 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.05320179, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9600676544166102, + "language_loss": 0.84061754, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86291301, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1953125, + "step": 1937, + "time_per_iteration": 2.5084798336029053 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.05254185, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.0067941571917927, + "language_loss": 0.76479459, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78692102, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.140625, + "step": 1938, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01177239, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.02963328, + "balance_loss_mlp": 1.05566061, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.0274312317590084, + "language_loss": 0.79127967, + "learning_rate": 3.921882769138696e-06, + "loss": 0.8135649, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1939, + "time_per_iteration": 2.5020864009857178 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.05530274, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.7077039427391343, + "language_loss": 0.86712289, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88937664, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1940, + "time_per_iteration": 2.484750270843506 + }, + { + "auxiliary_loss_clip": 0.01172427, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.03699601, + "balance_loss_mlp": 1.05674481, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4506595925957548, + "language_loss": 0.75750297, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7798053, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1941, + "time_per_iteration": 2.7000842094421387 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.05215478, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.1675787105273256, + "language_loss": 0.8828994, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90516704, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.15625, + "step": 1942, + "time_per_iteration": 2.460014581680298 + }, + { + "auxiliary_loss_clip": 0.01170106, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.02839422, + "balance_loss_mlp": 1.05465341, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.688985931696262, + "language_loss": 0.67729998, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69948429, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.15625, + "step": 1943, + "time_per_iteration": 2.5899837017059326 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01046897, + "balance_loss_clip": 1.02586865, + "balance_loss_mlp": 1.05437136, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.2767867948110263, + "language_loss": 0.69852126, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72069371, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1944, + "time_per_iteration": 2.6237125396728516 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.05112338, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.1059371232711572, + "language_loss": 0.82477605, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84690094, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.15625, + "step": 1945, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.05241919, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.378189536328268, + "language_loss": 0.7640717, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.7863518, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 1946, + "time_per_iteration": 2.516782283782959 + }, + { + "auxiliary_loss_clip": 0.01169578, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03249717, + "balance_loss_mlp": 1.05597568, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.040115867247402, + "language_loss": 0.68749321, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70971209, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 1947, + "time_per_iteration": 2.443979501724243 + }, + { + "auxiliary_loss_clip": 0.01173266, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.05761504, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.892409556337103, + "language_loss": 0.84730887, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86967146, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 1948, + "time_per_iteration": 2.456883192062378 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 0.99784815, + "balance_loss_mlp": 1.01743388, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8146373030628324, + "language_loss": 0.65102834, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6716392, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.43359375, + "step": 1949, + "time_per_iteration": 3.083716630935669 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.05524707, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.7265339558443402, + "language_loss": 0.71616268, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73841476, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1328125, + "step": 1950, + "time_per_iteration": 2.5140750408172607 + }, + { + "auxiliary_loss_clip": 0.01174036, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.05524027, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 13.047142281747327, + "language_loss": 0.76811576, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79044604, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1951, + "time_per_iteration": 2.4511098861694336 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.0106409, + "balance_loss_clip": 1.04351449, + "balance_loss_mlp": 1.05736876, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 2.4689531190361858, + "language_loss": 0.75770319, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1952, + "time_per_iteration": 2.5249404907226562 + }, + { + "auxiliary_loss_clip": 0.01170041, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.05350161, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8929141854364566, + "language_loss": 0.71838403, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74068928, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1953, + "time_per_iteration": 2.5321006774902344 + }, + { + "auxiliary_loss_clip": 0.01178671, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.03186345, + "balance_loss_mlp": 1.05794597, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.5505654209141317, + "language_loss": 0.7939415, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 1954, + "time_per_iteration": 2.477182149887085 + }, + { + "auxiliary_loss_clip": 0.01174109, + "auxiliary_loss_mlp": 0.01060284, + "balance_loss_clip": 1.03859961, + "balance_loss_mlp": 1.05628419, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1305529461824344, + "language_loss": 0.85609406, + "learning_rate": 3.920148894924246e-06, + "loss": 0.878438, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1796875, + "step": 1955, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.02949762, + "balance_loss_mlp": 1.05551839, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.149612339355701, + "language_loss": 0.77626467, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79848516, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.171875, + "step": 1956, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.01169266, + "auxiliary_loss_mlp": 0.01054147, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.05667603, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 4.253665449575931, + "language_loss": 0.80333984, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 1957, + "time_per_iteration": 2.508272886276245 + }, + { + "auxiliary_loss_clip": 0.01176684, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.05895627, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 3.1587185145349737, + "language_loss": 0.77638769, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79865301, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1796875, + "step": 1958, + "time_per_iteration": 2.48563551902771 + }, + { + "auxiliary_loss_clip": 0.01174636, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.05859971, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0966272081131985, + "language_loss": 0.76906043, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79128981, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.15625, + "step": 1959, + "time_per_iteration": 2.4826674461364746 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_clip": 1.03128934, + "balance_loss_mlp": 1.05581582, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 3.13785825532277, + "language_loss": 0.69989765, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72212446, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.15625, + "step": 1960, + "time_per_iteration": 2.4965405464172363 + }, + { + "auxiliary_loss_clip": 0.01178622, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.03704309, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.5802576751796327, + "language_loss": 0.81135678, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83372575, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1961, + "time_per_iteration": 2.456537961959839 + }, + { + "auxiliary_loss_clip": 0.01167569, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.05682623, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 3.5009623449342206, + "language_loss": 0.92335653, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94558799, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.109375, + "step": 1962, + "time_per_iteration": 2.4831955432891846 + }, + { + "auxiliary_loss_clip": 0.01175087, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.05849361, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1891263418172353, + "language_loss": 0.87132198, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89361322, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1640625, + "step": 1963, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02764392, + "balance_loss_mlp": 1.05800569, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.1122466665000155, + "language_loss": 0.84163988, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86385566, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1640625, + "step": 1964, + "time_per_iteration": 2.496471405029297 + }, + { + "auxiliary_loss_clip": 0.01178376, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.06327403, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.965243610427017, + "language_loss": 0.82994169, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85229176, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1484375, + "step": 1965, + "time_per_iteration": 2.46545672416687 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05948591, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6968751772896917, + "language_loss": 0.74517393, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76741493, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 1966, + "time_per_iteration": 2.730928421020508 + }, + { + "auxiliary_loss_clip": 0.01185811, + "auxiliary_loss_mlp": 0.01055482, + "balance_loss_clip": 1.03552604, + "balance_loss_mlp": 1.0661025, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.573953561090722, + "language_loss": 0.725128, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74754095, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1953125, + "step": 1967, + "time_per_iteration": 2.459409713745117 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.02409899, + "balance_loss_mlp": 1.0596199, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.07735233424318, + "language_loss": 0.87874025, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90092969, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1968, + "time_per_iteration": 2.474860191345215 + }, + { + "auxiliary_loss_clip": 0.0117476, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.05980873, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.3710109771053904, + "language_loss": 0.66827953, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69053805, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1484375, + "step": 1969, + "time_per_iteration": 2.5025057792663574 + }, + { + "auxiliary_loss_clip": 0.01177024, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.03172874, + "balance_loss_mlp": 1.06375933, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0668162562591013, + "language_loss": 0.81199527, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83428723, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 1970, + "time_per_iteration": 2.6005184650421143 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02306354, + "balance_loss_mlp": 1.02803779, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8059191438251484, + "language_loss": 0.66145539, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68243253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.4375, + "step": 1971, + "time_per_iteration": 3.0580737590789795 + }, + { + "auxiliary_loss_clip": 0.01173379, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03112769, + "balance_loss_mlp": 1.0578413, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9720310647047086, + "language_loss": 0.79760695, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81984764, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 1972, + "time_per_iteration": 2.5330677032470703 + }, + { + "auxiliary_loss_clip": 0.01174806, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 1.06013465, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.451560144092476, + "language_loss": 0.72162819, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74390036, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1973, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.0117035, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02407408, + "balance_loss_mlp": 1.05802357, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.2680636805256897, + "language_loss": 0.71724641, + "learning_rate": 3.918065710622832e-06, + "loss": 0.73938787, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 1974, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01937568, + "balance_loss_mlp": 1.05660915, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.192039880981389, + "language_loss": 0.77186036, + "learning_rate": 3.917955341761128e-06, + "loss": 0.7939533, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.140625, + "step": 1975, + "time_per_iteration": 2.4483766555786133 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.03212273, + "balance_loss_mlp": 1.06021976, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.2667330410251596, + "language_loss": 0.7498399, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77208138, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.125, + "step": 1976, + "time_per_iteration": 3.9421374797821045 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02544367, + "balance_loss_mlp": 1.05979395, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6192257034176818, + "language_loss": 0.75191766, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77408761, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.125, + "step": 1977, + "time_per_iteration": 3.9506070613861084 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.05777454, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 7.387040580957373, + "language_loss": 0.7393533, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76161528, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.15625, + "step": 1978, + "time_per_iteration": 2.4754912853240967 + }, + { + "auxiliary_loss_clip": 0.01168854, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.05782461, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.709416576437117, + "language_loss": 0.73273945, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75491059, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 1979, + "time_per_iteration": 2.478938579559326 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.05735934, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.877436937799078, + "language_loss": 0.98387957, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00608468, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1484375, + "step": 1980, + "time_per_iteration": 2.5758843421936035 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.02580202, + "balance_loss_mlp": 1.05741775, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8930015682875676, + "language_loss": 0.85929906, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88150084, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1981, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01057237, + "balance_loss_clip": 1.03601766, + "balance_loss_mlp": 1.057832, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9242535829958574, + "language_loss": 0.85007018, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87236911, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1982, + "time_per_iteration": 2.513012409210205 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02667475, + "balance_loss_mlp": 1.05463564, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.926275276354154, + "language_loss": 0.85026526, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87239939, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 1983, + "time_per_iteration": 2.4627623558044434 + }, + { + "auxiliary_loss_clip": 0.01172266, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05581713, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.2679367356540894, + "language_loss": 0.77020949, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79243064, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1984, + "time_per_iteration": 2.466224193572998 + }, + { + "auxiliary_loss_clip": 0.01168386, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.03542554, + "balance_loss_mlp": 1.05464029, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7272493982968635, + "language_loss": 0.83323789, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85547268, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 1985, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01162278, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.02659011, + "balance_loss_mlp": 1.05230284, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9847962315308523, + "language_loss": 0.7379061, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75999391, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1015625, + "step": 1986, + "time_per_iteration": 2.4477651119232178 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03664303, + "balance_loss_mlp": 1.05418456, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0940320364759573, + "language_loss": 0.7209813, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74321216, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.125, + "step": 1987, + "time_per_iteration": 2.528564929962158 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.03256774, + "balance_loss_mlp": 1.05243921, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.544292945564917, + "language_loss": 0.72455966, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74676454, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1988, + "time_per_iteration": 2.482295274734497 + }, + { + "auxiliary_loss_clip": 0.01168039, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.05425191, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.3919568417846544, + "language_loss": 0.80848205, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83079755, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 1989, + "time_per_iteration": 2.5321335792541504 + }, + { + "auxiliary_loss_clip": 0.01171171, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.05518925, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7848130249027077, + "language_loss": 0.76000333, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78222507, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1990, + "time_per_iteration": 2.4608383178710938 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.0101675, + "balance_loss_clip": 1.01392448, + "balance_loss_mlp": 1.01813149, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8877551125762418, + "language_loss": 0.55219597, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57296449, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.41992188, + "step": 1991, + "time_per_iteration": 3.0575883388519287 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01048947, + "balance_loss_clip": 1.02905095, + "balance_loss_mlp": 1.05472517, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2244739837006797, + "language_loss": 0.78156978, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8037256, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1171875, + "step": 1992, + "time_per_iteration": 2.5395517349243164 + }, + { + "auxiliary_loss_clip": 0.01170251, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02819777, + "balance_loss_mlp": 1.0534482, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.898510109378507, + "language_loss": 0.78694016, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1993, + "time_per_iteration": 2.5264625549316406 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.05150676, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.7590613991113047, + "language_loss": 0.82287014, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8451097, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 1994, + "time_per_iteration": 2.4871127605438232 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.03551149, + "balance_loss_mlp": 1.05389762, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1035856813409786, + "language_loss": 0.87953222, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9017483, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.125, + "step": 1995, + "time_per_iteration": 2.46690034866333 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.05346155, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.783456627489481, + "language_loss": 0.74206698, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76433849, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1996, + "time_per_iteration": 2.5115768909454346 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.03138888, + "balance_loss_mlp": 1.05337763, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9342712291191904, + "language_loss": 0.88266122, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90486217, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 1997, + "time_per_iteration": 2.4716532230377197 + }, + { + "auxiliary_loss_clip": 0.01167703, + "auxiliary_loss_mlp": 0.01063842, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.05315256, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 3.8633631849497054, + "language_loss": 0.78929418, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81160963, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1998, + "time_per_iteration": 2.4798996448516846 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.05610394, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.053047413592738, + "language_loss": 0.73435485, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75654793, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1999, + "time_per_iteration": 2.5017611980438232 + }, + { + "auxiliary_loss_clip": 0.01169689, + "auxiliary_loss_mlp": 0.01065385, + "balance_loss_clip": 1.04436839, + "balance_loss_mlp": 1.05347967, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 3.6093884580795677, + "language_loss": 0.74955112, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77190185, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 2000, + "time_per_iteration": 2.5060245990753174 + }, + { + "auxiliary_loss_clip": 0.01170552, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.02997398, + "balance_loss_mlp": 1.05408299, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5368563042333518, + "language_loss": 0.84667969, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86889356, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 2001, + "time_per_iteration": 2.499922752380371 + }, + { + "auxiliary_loss_clip": 0.01168457, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.03176236, + "balance_loss_mlp": 1.05330753, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.074842616733997, + "language_loss": 0.73982531, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76202351, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 2002, + "time_per_iteration": 2.486853837966919 + }, + { + "auxiliary_loss_clip": 0.01175825, + "auxiliary_loss_mlp": 0.01058049, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.05508709, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.832741043586106, + "language_loss": 0.78091669, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80325544, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 2003, + "time_per_iteration": 2.4740982055664062 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.0521121, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.9652989098821625, + "language_loss": 0.72093791, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74310923, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2004, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.03877997, + "balance_loss_mlp": 1.0546937, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.2150760255497945, + "language_loss": 0.78260767, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80496937, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 2005, + "time_per_iteration": 2.4991190433502197 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.01496482, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9233110616682776, + "language_loss": 0.58020771, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60082525, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.8520798683166504 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 1.05345094, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7247761793975513, + "language_loss": 0.76275218, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78490144, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.125, + "step": 2007, + "time_per_iteration": 2.50325083732605 + }, + { + "auxiliary_loss_clip": 0.01170732, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.05348623, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.332475401193337, + "language_loss": 0.82973194, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85202336, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2008, + "time_per_iteration": 2.4650609493255615 + }, + { + "auxiliary_loss_clip": 0.0116834, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.03319979, + "balance_loss_mlp": 1.05225682, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.236244219024357, + "language_loss": 0.84184098, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86406672, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2009, + "time_per_iteration": 2.4602744579315186 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.01053411, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.0551877, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7312486930792712, + "language_loss": 0.83945864, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86169434, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.1484375, + "step": 2010, + "time_per_iteration": 2.480238437652588 + }, + { + "auxiliary_loss_clip": 0.01171814, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.03437304, + "balance_loss_mlp": 1.05634403, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.658807365911602, + "language_loss": 0.84157598, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8638559, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 2011, + "time_per_iteration": 2.454406499862671 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03386891, + "balance_loss_mlp": 1.055547, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.879921554869875, + "language_loss": 0.96007967, + "learning_rate": 3.913820600882834e-06, + "loss": 0.9823519, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.171875, + "step": 2012, + "time_per_iteration": 2.479583740234375 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.026914, + "balance_loss_mlp": 1.05365777, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.6055417591736036, + "language_loss": 0.80619711, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82833993, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2013, + "time_per_iteration": 2.538651704788208 + }, + { + "auxiliary_loss_clip": 0.01172968, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.05412138, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.9791821612033953, + "language_loss": 0.77157021, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79376847, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 2014, + "time_per_iteration": 2.4411396980285645 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.02509499, + "balance_loss_mlp": 1.05448556, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.028780359370303, + "language_loss": 0.86930937, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89146852, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2015, + "time_per_iteration": 2.4546844959259033 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.0268662, + "balance_loss_mlp": 1.04779112, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.0866681231001762, + "language_loss": 0.69274801, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71481836, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2016, + "time_per_iteration": 2.469177007675171 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.02042413, + "balance_loss_mlp": 1.05407953, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 3.095255398319528, + "language_loss": 0.80049825, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82262057, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.15625, + "step": 2017, + "time_per_iteration": 2.459447145462036 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.0292666, + "balance_loss_mlp": 1.05315137, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.364451122732105, + "language_loss": 0.69343489, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71563143, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2018, + "time_per_iteration": 3.919508695602417 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.05712008, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.162901456551013, + "language_loss": 0.72318506, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74541652, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 2019, + "time_per_iteration": 3.910888433456421 + }, + { + "auxiliary_loss_clip": 0.01168573, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.03615856, + "balance_loss_mlp": 1.05187333, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8061721544245042, + "language_loss": 0.92484713, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94711161, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2020, + "time_per_iteration": 2.5007998943328857 + }, + { + "auxiliary_loss_clip": 0.01168404, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.05388308, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9478588429028871, + "language_loss": 0.77149868, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79369152, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2021, + "time_per_iteration": 2.522216796875 + }, + { + "auxiliary_loss_clip": 0.01165897, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.02586901, + "balance_loss_mlp": 1.05312037, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0305604143992944, + "language_loss": 0.80324662, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82537007, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2022, + "time_per_iteration": 2.518737316131592 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01051897, + "balance_loss_clip": 1.03094029, + "balance_loss_mlp": 1.057019, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9019957932594662, + "language_loss": 0.8458122, + "learning_rate": 3.912572184769108e-06, + "loss": 0.86806649, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2023, + "time_per_iteration": 2.4534339904785156 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.02916241, + "balance_loss_mlp": 1.05421007, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2004951084054234, + "language_loss": 0.85155022, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87374109, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 2024, + "time_per_iteration": 2.436833143234253 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.02974951, + "balance_loss_mlp": 1.04884946, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.043367551334066, + "language_loss": 0.71662712, + "learning_rate": 3.912344257028954e-06, + "loss": 0.73876667, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.15625, + "step": 2025, + "time_per_iteration": 2.541215658187866 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.02564383, + "balance_loss_mlp": 1.05309796, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.0848974538483755, + "language_loss": 0.75976777, + "learning_rate": 3.912230184382286e-06, + "loss": 0.7819097, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2026, + "time_per_iteration": 2.529049873352051 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.05251837, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.6572777094172597, + "language_loss": 0.88875067, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9108817, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2027, + "time_per_iteration": 2.472158432006836 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03375518, + "balance_loss_mlp": 1.05316114, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 2.343330799439898, + "language_loss": 0.75515145, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77732611, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.125, + "step": 2028, + "time_per_iteration": 2.5286035537719727 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.05089998, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.270604294931249, + "language_loss": 0.766294, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78852487, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2029, + "time_per_iteration": 2.479799747467041 + }, + { + "auxiliary_loss_clip": 0.0116289, + "auxiliary_loss_mlp": 0.01051159, + "balance_loss_clip": 1.03113246, + "balance_loss_mlp": 1.05001879, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.2290592341985747, + "language_loss": 0.7955277, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81766814, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.125, + "step": 2030, + "time_per_iteration": 2.479250431060791 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.03301597, + "balance_loss_mlp": 1.0526309, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9595633959777694, + "language_loss": 0.74556369, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2031, + "time_per_iteration": 2.4966888427734375 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.0269599, + "balance_loss_mlp": 1.05319047, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.9774178696035418, + "language_loss": 0.75045705, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.1328125, + "step": 2032, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.04844511, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6143118682838826, + "language_loss": 0.88853258, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91053319, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0859375, + "step": 2033, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01170793, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03059459, + "balance_loss_mlp": 1.05660009, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1152048244965096, + "language_loss": 0.65517056, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67738092, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 2034, + "time_per_iteration": 2.4647884368896484 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.05399358, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.59634219760927, + "language_loss": 0.76435542, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78657782, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2035, + "time_per_iteration": 2.483016014099121 + }, + { + "auxiliary_loss_clip": 0.01169828, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.03104889, + "balance_loss_mlp": 1.0543201, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8316823187763973, + "language_loss": 0.71407682, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73628777, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2036, + "time_per_iteration": 2.476471185684204 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.0309397, + "balance_loss_mlp": 1.05532706, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.632988910709452, + "language_loss": 0.83352619, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85572863, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2037, + "time_per_iteration": 2.476040840148926 + }, + { + "auxiliary_loss_clip": 0.0117386, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.03625405, + "balance_loss_mlp": 1.05652785, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.722283338591856, + "language_loss": 0.80255699, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82487655, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2038, + "time_per_iteration": 2.5043163299560547 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.0051837, + "balance_loss_mlp": 1.01638949, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.832889593555193, + "language_loss": 0.58671033, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60737002, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.421875, + "step": 2039, + "time_per_iteration": 2.9495608806610107 + }, + { + "auxiliary_loss_clip": 0.01172242, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.0277977, + "balance_loss_mlp": 1.05559754, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.6229044060505298, + "language_loss": 0.80485016, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82706642, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.171875, + "step": 2040, + "time_per_iteration": 2.4483039379119873 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.05270815, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8235003945490114, + "language_loss": 0.82753873, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84970617, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2041, + "time_per_iteration": 2.4804372787475586 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.05399048, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7522185366152092, + "language_loss": 0.66806722, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69026893, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2042, + "time_per_iteration": 2.4683480262756348 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.03032589, + "balance_loss_mlp": 1.05184031, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.8478924147346443, + "language_loss": 0.81661081, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83877933, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2043, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01049773, + "balance_loss_clip": 1.02792168, + "balance_loss_mlp": 1.05028844, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.0920421188484095, + "language_loss": 0.8049221, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82708442, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 2044, + "time_per_iteration": 2.45843768119812 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.02577674, + "balance_loss_mlp": 1.05169511, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7057283877293323, + "language_loss": 0.7796452, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8017351, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.109375, + "step": 2045, + "time_per_iteration": 2.5117220878601074 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.04210341, + "balance_loss_mlp": 1.05461311, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.0659302798736436, + "language_loss": 0.67135215, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69371116, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 2046, + "time_per_iteration": 2.466304063796997 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03793919, + "balance_loss_mlp": 1.05408335, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3143924335245654, + "language_loss": 0.72491664, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7471717, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2047, + "time_per_iteration": 2.4625275135040283 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.0106421, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.05105257, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6458989790549132, + "language_loss": 0.76394033, + "learning_rate": 3.909702248319597e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2048, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.05322123, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.118548028298143, + "language_loss": 0.84626836, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86841822, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.125, + "step": 2049, + "time_per_iteration": 2.538325071334839 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.05051267, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 3.176509780932849, + "language_loss": 0.75351131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77569222, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.140625, + "step": 2050, + "time_per_iteration": 2.499915599822998 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054604, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.05127048, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.9728027261326873, + "language_loss": 0.80877042, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83097064, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.140625, + "step": 2051, + "time_per_iteration": 2.5018789768218994 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.03338933, + "balance_loss_mlp": 1.05348301, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7756923294305167, + "language_loss": 0.79991698, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82209337, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.109375, + "step": 2052, + "time_per_iteration": 2.4962196350097656 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.03793955, + "balance_loss_mlp": 1.0515492, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.071130498978609, + "language_loss": 0.73757279, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75983769, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2053, + "time_per_iteration": 2.4748997688293457 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.03337085, + "balance_loss_mlp": 1.04912996, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.5139588428492408, + "language_loss": 0.73835206, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76054543, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2054, + "time_per_iteration": 2.7009665966033936 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02381933, + "balance_loss_mlp": 1.04980421, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.0020033330801863, + "language_loss": 0.85107529, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87311363, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.1171875, + "step": 2055, + "time_per_iteration": 2.5038392543792725 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.03445673, + "balance_loss_mlp": 1.05093932, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9818000135561404, + "language_loss": 0.77465194, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79683125, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.140625, + "step": 2056, + "time_per_iteration": 2.5265629291534424 + }, + { + "auxiliary_loss_clip": 0.01162241, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.04937708, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9976131339644834, + "language_loss": 0.83188522, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85405934, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2057, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.0116756, + "auxiliary_loss_mlp": 0.01053922, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.05169332, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.751792200322901, + "language_loss": 0.78356105, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80577588, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2058, + "time_per_iteration": 2.5236053466796875 + }, + { + "auxiliary_loss_clip": 0.01167574, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.03086066, + "balance_loss_mlp": 1.05105174, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.1327254817813124, + "language_loss": 0.83191061, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85410988, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2059, + "time_per_iteration": 5.313246726989746 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.0344671, + "balance_loss_mlp": 1.05206418, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.990324814625926, + "language_loss": 0.81387389, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83613217, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 2060, + "time_per_iteration": 3.8617331981658936 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.02928221, + "balance_loss_mlp": 1.04859161, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.0129231677956105, + "language_loss": 0.86278749, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88492751, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2061, + "time_per_iteration": 2.4531033039093018 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.05163288, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.146204871859891, + "language_loss": 0.84992719, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87201917, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 2062, + "time_per_iteration": 2.475050449371338 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01057701, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.05348217, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.194910982672458, + "language_loss": 0.78651118, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80875909, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2063, + "time_per_iteration": 2.4638655185699463 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.05330634, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.133219584666701, + "language_loss": 0.79411167, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81636381, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1328125, + "step": 2064, + "time_per_iteration": 2.4441418647766113 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03083503, + "balance_loss_mlp": 1.04955256, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.2298036351802533, + "language_loss": 0.92358226, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9457252, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2065, + "time_per_iteration": 2.4909794330596924 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02880335, + "balance_loss_mlp": 1.05061674, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7134253508315578, + "language_loss": 0.8042016, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82636184, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.15625, + "step": 2066, + "time_per_iteration": 2.484276056289673 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00111723, + "balance_loss_mlp": 1.01144505, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8687209975293121, + "language_loss": 0.63275361, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65331256, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.41015625, + "step": 2067, + "time_per_iteration": 3.0286524295806885 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03103137, + "balance_loss_mlp": 1.05087852, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9774411847970965, + "language_loss": 0.93209147, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95427418, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.15625, + "step": 2068, + "time_per_iteration": 2.4971697330474854 + }, + { + "auxiliary_loss_clip": 0.01167817, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.053213, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9835561743386452, + "language_loss": 0.81277847, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83494884, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.1484375, + "step": 2069, + "time_per_iteration": 2.4772391319274902 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.03519261, + "balance_loss_mlp": 1.05177176, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.606173275168009, + "language_loss": 0.77390277, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79612398, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2070, + "time_per_iteration": 2.4962410926818848 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.05637431, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.418044156181854, + "language_loss": 0.80847198, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83066666, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1640625, + "step": 2071, + "time_per_iteration": 2.452148199081421 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.03063262, + "balance_loss_mlp": 1.05134583, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.802846280579791, + "language_loss": 0.77933639, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80147374, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2072, + "time_per_iteration": 2.5763509273529053 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03480363, + "balance_loss_mlp": 1.05423427, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.6278132513508976, + "language_loss": 0.74839735, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77060658, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.125, + "step": 2073, + "time_per_iteration": 2.4676945209503174 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01050014, + "balance_loss_clip": 1.02904546, + "balance_loss_mlp": 1.04915833, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9457561725453951, + "language_loss": 0.90556443, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92768592, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2074, + "time_per_iteration": 2.4873156547546387 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.05183172, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3814572559525877, + "language_loss": 0.83753067, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 2075, + "time_per_iteration": 2.500657320022583 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.05080831, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1638910845289567, + "language_loss": 0.73802024, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76022947, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2076, + "time_per_iteration": 2.5686564445495605 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.05219531, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.967733683791653, + "language_loss": 0.7551648, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77721083, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.09375, + "step": 2077, + "time_per_iteration": 2.489954710006714 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.05015802, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 4.043491061132511, + "language_loss": 0.82077563, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84293842, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1328125, + "step": 2078, + "time_per_iteration": 2.445270299911499 + }, + { + "auxiliary_loss_clip": 0.01168396, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.05372512, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.023726857078381, + "language_loss": 0.75024784, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2079, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01173002, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.05697465, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9314739831996124, + "language_loss": 0.83961046, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86190951, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2080, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.05275226, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.0357346796271307, + "language_loss": 0.84575123, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8679868, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1328125, + "step": 2081, + "time_per_iteration": 2.4380433559417725 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.05154538, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.660916229819668, + "language_loss": 0.76882648, + "learning_rate": 3.905726514814646e-06, + "loss": 0.790923, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2082, + "time_per_iteration": 2.454939842224121 + }, + { + "auxiliary_loss_clip": 0.01182882, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.03117347, + "balance_loss_mlp": 1.06035674, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.833832134330164, + "language_loss": 0.78994107, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81229836, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2265625, + "step": 2083, + "time_per_iteration": 2.4439167976379395 + }, + { + "auxiliary_loss_clip": 0.01168103, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.02855682, + "balance_loss_mlp": 1.05132031, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.376124844090109, + "language_loss": 0.89690113, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2084, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.05379784, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9059704425119062, + "language_loss": 0.79718572, + "learning_rate": 3.905371701516869e-06, + "loss": 0.81937099, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1484375, + "step": 2085, + "time_per_iteration": 2.5295538902282715 + }, + { + "auxiliary_loss_clip": 0.0116658, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.05235541, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.9580642243137214, + "language_loss": 0.88227898, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90446126, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2086, + "time_per_iteration": 2.4508614540100098 + }, + { + "auxiliary_loss_clip": 0.01162238, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.02541506, + "balance_loss_mlp": 1.05238986, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.3707303368435957, + "language_loss": 0.87088495, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89295745, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2087, + "time_per_iteration": 2.4342494010925293 + }, + { + "auxiliary_loss_clip": 0.01166252, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.03444421, + "balance_loss_mlp": 1.05230761, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 3.239876707553976, + "language_loss": 0.73480451, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75703704, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.140625, + "step": 2088, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01004858, + "balance_loss_clip": 1.00259304, + "balance_loss_mlp": 1.01231122, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.759594920780347, + "language_loss": 0.61699253, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63757795, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.4140625, + "step": 2089, + "time_per_iteration": 3.0373222827911377 + }, + { + "auxiliary_loss_clip": 0.01165987, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.05317736, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.0159960445234746, + "language_loss": 0.78266793, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80490106, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.125, + "step": 2090, + "time_per_iteration": 2.5307860374450684 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005786, + "balance_loss_clip": 1.00381935, + "balance_loss_mlp": 1.01062346, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.749206069507312, + "language_loss": 0.59394926, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61451876, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.40625, + "step": 2091, + "time_per_iteration": 2.976081609725952 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03016472, + "balance_loss_mlp": 1.0538522, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8692826570762828, + "language_loss": 0.63588953, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6580565, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2092, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.05095637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 3.3800613541528257, + "language_loss": 0.80149096, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82378066, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1875, + "step": 2093, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.05323935, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7179534274341421, + "language_loss": 0.75928843, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78160632, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2094, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01163905, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.03322637, + "balance_loss_mlp": 1.05116057, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.654740537988477, + "language_loss": 0.76833487, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79050487, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2095, + "time_per_iteration": 2.669593095779419 + }, + { + "auxiliary_loss_clip": 0.01166425, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.04330409, + "balance_loss_mlp": 1.05012596, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.7658625824396568, + "language_loss": 0.8312341, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85354173, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2096, + "time_per_iteration": 2.446169853210449 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.05236387, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.9365429623482773, + "language_loss": 0.7532599, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77547324, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 2097, + "time_per_iteration": 2.46520733833313 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.0105919, + "balance_loss_clip": 1.0399375, + "balance_loss_mlp": 1.05366278, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0415683165998004, + "language_loss": 0.8696878, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89196146, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1484375, + "step": 2098, + "time_per_iteration": 2.488985061645508 + }, + { + "auxiliary_loss_clip": 0.01171506, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_clip": 1.03984964, + "balance_loss_mlp": 1.05263424, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8810788789855342, + "language_loss": 0.69538295, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71773493, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.1875, + "step": 2099, + "time_per_iteration": 2.4791061878204346 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01058165, + "balance_loss_clip": 1.03538442, + "balance_loss_mlp": 1.05016196, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.489186386071109, + "language_loss": 0.81622505, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83848113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2100, + "time_per_iteration": 2.4970083236694336 + }, + { + "auxiliary_loss_clip": 0.01170444, + "auxiliary_loss_mlp": 0.01056399, + "balance_loss_clip": 1.03558493, + "balance_loss_mlp": 1.05375385, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 21.240028764463403, + "language_loss": 0.80653214, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82880062, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1640625, + "step": 2101, + "time_per_iteration": 5.441275596618652 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01009667, + "balance_loss_clip": 1.00753367, + "balance_loss_mlp": 1.01423335, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7055092704674581, + "language_loss": 0.57077372, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59140933, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.39648438, + "step": 2102, + "time_per_iteration": 4.4595959186553955 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.05443108, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9163731362545673, + "language_loss": 0.93033105, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9526242, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 2103, + "time_per_iteration": 2.4612908363342285 + }, + { + "auxiliary_loss_clip": 0.01160763, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01929784, + "balance_loss_mlp": 1.05146646, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.70771861982282, + "language_loss": 0.7804687, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80246699, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2104, + "time_per_iteration": 2.556351661682129 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.01056721, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.05698192, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9983303318130716, + "language_loss": 0.81274837, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83504581, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 2105, + "time_per_iteration": 2.4998059272766113 + }, + { + "auxiliary_loss_clip": 0.01177911, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.05756688, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.6618923007939728, + "language_loss": 0.83258855, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85494161, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 2106, + "time_per_iteration": 2.4816856384277344 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.02755296, + "balance_loss_mlp": 1.05664992, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.017673348074064, + "language_loss": 0.73717511, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75936514, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2107, + "time_per_iteration": 2.503575325012207 + }, + { + "auxiliary_loss_clip": 0.01166119, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.02683651, + "balance_loss_mlp": 1.05330598, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.8409726657459213, + "language_loss": 0.79492414, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81705213, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2108, + "time_per_iteration": 2.448009967803955 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.02635407, + "balance_loss_mlp": 1.05213785, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.700834997101356, + "language_loss": 0.75458848, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77675259, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2109, + "time_per_iteration": 2.463996171951294 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.02641523, + "balance_loss_mlp": 1.05309939, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 5.620565406896926, + "language_loss": 0.82876229, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85087943, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2110, + "time_per_iteration": 2.4536476135253906 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03271818, + "balance_loss_mlp": 1.0524385, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8108257578185059, + "language_loss": 0.78553301, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80775553, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.140625, + "step": 2111, + "time_per_iteration": 2.4898500442504883 + }, + { + "auxiliary_loss_clip": 0.01178398, + "auxiliary_loss_mlp": 0.01069762, + "balance_loss_clip": 1.04634905, + "balance_loss_mlp": 1.05599511, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2255287569010567, + "language_loss": 0.76852119, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.2265625, + "step": 2112, + "time_per_iteration": 2.534062623977661 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.05138493, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.901101750436338, + "language_loss": 0.85764933, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 2113, + "time_per_iteration": 2.4980924129486084 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.05287683, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.270588429793272, + "language_loss": 0.74000478, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76224494, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1328125, + "step": 2114, + "time_per_iteration": 2.422631025314331 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.03504217, + "balance_loss_mlp": 1.05601084, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7902572486589996, + "language_loss": 0.83236456, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85464966, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.1484375, + "step": 2115, + "time_per_iteration": 2.4601340293884277 + }, + { + "auxiliary_loss_clip": 0.01169954, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.05397201, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.532692301262898, + "language_loss": 0.86615002, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88845563, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2116, + "time_per_iteration": 2.5315732955932617 + }, + { + "auxiliary_loss_clip": 0.01164638, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.03062534, + "balance_loss_mlp": 1.05188024, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.8525451323112498, + "language_loss": 0.70492947, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72708428, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2117, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01168229, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_clip": 1.03346658, + "balance_loss_mlp": 1.05461121, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.4058915352959294, + "language_loss": 0.86858076, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89081407, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2118, + "time_per_iteration": 2.4760360717773438 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.0279547, + "balance_loss_mlp": 1.0518508, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7933295144796901, + "language_loss": 0.87325591, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89538383, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2119, + "time_per_iteration": 2.547213315963745 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.03024805, + "balance_loss_mlp": 1.05369782, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4444945117671018, + "language_loss": 0.8769815, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89917719, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2120, + "time_per_iteration": 2.4568872451782227 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.02553487, + "balance_loss_mlp": 1.05405664, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8558714180118523, + "language_loss": 0.75193042, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77408671, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2121, + "time_per_iteration": 2.508117437362671 + }, + { + "auxiliary_loss_clip": 0.01167335, + "auxiliary_loss_mlp": 0.01050063, + "balance_loss_clip": 1.02895081, + "balance_loss_mlp": 1.05228865, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.458066848563671, + "language_loss": 0.8294577, + "learning_rate": 3.900942242309978e-06, + "loss": 0.8516317, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2122, + "time_per_iteration": 2.4878990650177 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.02924609, + "balance_loss_mlp": 1.05379128, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.1208761223769375, + "language_loss": 0.79040462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81259328, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2123, + "time_per_iteration": 2.512085199356079 + }, + { + "auxiliary_loss_clip": 0.0117181, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.03411841, + "balance_loss_mlp": 1.05565643, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7846776317234667, + "language_loss": 0.79227948, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81456017, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 2124, + "time_per_iteration": 2.4865264892578125 + }, + { + "auxiliary_loss_clip": 0.01168084, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03267348, + "balance_loss_mlp": 1.05149364, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.8175561910153215, + "language_loss": 0.75565529, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77787793, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2125, + "time_per_iteration": 2.514455795288086 + }, + { + "auxiliary_loss_clip": 0.01166899, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.02645469, + "balance_loss_mlp": 1.05262208, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.1990589160087493, + "language_loss": 0.77811432, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2126, + "time_per_iteration": 2.556657075881958 + }, + { + "auxiliary_loss_clip": 0.01167875, + "auxiliary_loss_mlp": 0.01050746, + "balance_loss_clip": 1.03124356, + "balance_loss_mlp": 1.05559683, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.702389562623477, + "language_loss": 0.69255161, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71473777, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2127, + "time_per_iteration": 2.629990339279175 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.00326061, + "balance_loss_mlp": 1.01139402, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8552720802624753, + "language_loss": 0.62738979, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64794946, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.39257812, + "step": 2128, + "time_per_iteration": 3.1237356662750244 + }, + { + "auxiliary_loss_clip": 0.01168478, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.05287039, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.3711218915030368, + "language_loss": 0.77148604, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79365802, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2129, + "time_per_iteration": 2.4499564170837402 + }, + { + "auxiliary_loss_clip": 0.01179121, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02902186, + "balance_loss_mlp": 1.05744195, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.273395516882369, + "language_loss": 0.79321349, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81552559, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.21875, + "step": 2130, + "time_per_iteration": 2.4536893367767334 + }, + { + "auxiliary_loss_clip": 0.0116812, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.05328345, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.267455405666958, + "language_loss": 0.70879477, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73092055, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1484375, + "step": 2131, + "time_per_iteration": 2.514155149459839 + }, + { + "auxiliary_loss_clip": 0.01166691, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.03007698, + "balance_loss_mlp": 1.05375445, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2952793086030376, + "language_loss": 0.72266257, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74484742, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2132, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01163765, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.03396928, + "balance_loss_mlp": 1.05281162, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1162344308699828, + "language_loss": 0.82306767, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84525442, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2133, + "time_per_iteration": 2.488302230834961 + }, + { + "auxiliary_loss_clip": 0.01174206, + "auxiliary_loss_mlp": 0.01061813, + "balance_loss_clip": 1.03871, + "balance_loss_mlp": 1.05329132, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.538367341661163, + "language_loss": 0.79631573, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81867594, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 2134, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_clip": 1.02393413, + "balance_loss_mlp": 1.05650806, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.033800341734765, + "language_loss": 0.83015293, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85233301, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2135, + "time_per_iteration": 2.4743056297302246 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_clip": 1.03842425, + "balance_loss_mlp": 1.05173945, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.9021762622464853, + "language_loss": 0.77293968, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79520839, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.140625, + "step": 2136, + "time_per_iteration": 2.4412362575531006 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 0.99983084, + "balance_loss_mlp": 1.01248765, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8943310105061408, + "language_loss": 0.59115362, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61168963, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.39257812, + "step": 2137, + "time_per_iteration": 3.2407264709472656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.03207743, + "balance_loss_mlp": 1.04970789, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4694787743163404, + "language_loss": 0.81923193, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84140748, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.15625, + "step": 2138, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01170897, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.05353928, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.804990264663657, + "language_loss": 0.79418135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81644583, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.171875, + "step": 2139, + "time_per_iteration": 2.5321907997131348 + }, + { + "auxiliary_loss_clip": 0.01169458, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02488446, + "balance_loss_mlp": 1.05315363, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1742564972583667, + "language_loss": 0.84761363, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.86976337, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1640625, + "step": 2140, + "time_per_iteration": 2.469543933868408 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.02524316, + "balance_loss_mlp": 1.05079114, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.376703775404894, + "language_loss": 0.85850012, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88059902, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2141, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.0116884, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 1.05059922, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.411777854813752, + "language_loss": 0.68245387, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7046324, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1796875, + "step": 2142, + "time_per_iteration": 2.5327556133270264 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02702951, + "balance_loss_mlp": 1.05430341, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0295098459565692, + "language_loss": 0.82883704, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85104507, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2143, + "time_per_iteration": 4.014873743057251 + }, + { + "auxiliary_loss_clip": 0.01171398, + "auxiliary_loss_mlp": 0.01053828, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.05572712, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7367706894947552, + "language_loss": 0.81788546, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84013772, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.15625, + "step": 2144, + "time_per_iteration": 4.002255439758301 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.04864693, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 3.8817809862500727, + "language_loss": 0.78257203, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80476135, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1640625, + "step": 2145, + "time_per_iteration": 2.4952287673950195 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.02825832, + "balance_loss_mlp": 1.05031526, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.1659704609946897, + "language_loss": 0.82622325, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84839463, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 2146, + "time_per_iteration": 2.4898681640625 + }, + { + "auxiliary_loss_clip": 0.01165601, + "auxiliary_loss_mlp": 0.01051615, + "balance_loss_clip": 1.02959681, + "balance_loss_mlp": 1.05129158, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.717320122986492, + "language_loss": 0.70446974, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72664189, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 2147, + "time_per_iteration": 2.5964484214782715 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.03069305, + "balance_loss_mlp": 1.05166912, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.443887417123452, + "language_loss": 0.71685153, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73902297, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.140625, + "step": 2148, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.02681684, + "balance_loss_mlp": 1.05413008, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.666574129953403, + "language_loss": 0.79379606, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81592482, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1171875, + "step": 2149, + "time_per_iteration": 2.495443820953369 + }, + { + "auxiliary_loss_clip": 0.01167493, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.05306077, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.1379132369478313, + "language_loss": 0.76475441, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78689277, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2150, + "time_per_iteration": 2.524395704269409 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.05094671, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.417935370690141, + "language_loss": 0.70735669, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72954249, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1484375, + "step": 2151, + "time_per_iteration": 2.5213184356689453 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02502, + "balance_loss_mlp": 1.05457592, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9866869590783298, + "language_loss": 0.84050369, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86260849, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2152, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.01167192, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.05128813, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.226463520109079, + "language_loss": 0.78646791, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2153, + "time_per_iteration": 2.46975040435791 + }, + { + "auxiliary_loss_clip": 0.01163518, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.05069268, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.482522823334948, + "language_loss": 0.80135351, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82351738, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2154, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01170487, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.05522227, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0988715261553774, + "language_loss": 0.83128881, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85350406, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2155, + "time_per_iteration": 2.476299524307251 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.02961075, + "balance_loss_mlp": 1.05010283, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.842594732542889, + "language_loss": 0.76062953, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2156, + "time_per_iteration": 2.6024632453918457 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.05121815, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.9934077258859366, + "language_loss": 0.86546719, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88760191, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.109375, + "step": 2157, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.03745282, + "balance_loss_mlp": 1.04796743, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.339899004847696, + "language_loss": 0.80590808, + "learning_rate": 3.896537778333651e-06, + "loss": 0.82814288, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2158, + "time_per_iteration": 2.5332443714141846 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.03510916, + "balance_loss_mlp": 1.05294585, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.254282600322574, + "language_loss": 0.74603379, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76828635, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2159, + "time_per_iteration": 2.469038963317871 + }, + { + "auxiliary_loss_clip": 0.01158286, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0323323, + "balance_loss_mlp": 1.04777908, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.1260113568932746, + "language_loss": 0.8227706, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84488213, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2160, + "time_per_iteration": 2.516723155975342 + }, + { + "auxiliary_loss_clip": 0.01159917, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.05318654, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6308358458278915, + "language_loss": 0.81877828, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8408196, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2161, + "time_per_iteration": 2.4677131175994873 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01056265, + "balance_loss_clip": 1.03479493, + "balance_loss_mlp": 1.05035043, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.2782308625037686, + "language_loss": 0.82592809, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84810847, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2162, + "time_per_iteration": 2.5702993869781494 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.03454113, + "balance_loss_mlp": 1.04993796, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.619296712638915, + "language_loss": 0.72762972, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7498191, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2163, + "time_per_iteration": 2.4531478881835938 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.03183889, + "balance_loss_mlp": 1.05107188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0773433264348435, + "language_loss": 0.81498116, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83718032, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2164, + "time_per_iteration": 2.497072458267212 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02381766, + "balance_loss_mlp": 1.05107093, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2040156749440523, + "language_loss": 0.72564822, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.7477203, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.1328125, + "step": 2165, + "time_per_iteration": 2.515026807785034 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.05286038, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8786436091142913, + "language_loss": 0.74697578, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76912814, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1484375, + "step": 2166, + "time_per_iteration": 2.5301709175109863 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.05156064, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5708346768068926, + "language_loss": 0.83053899, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85266984, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 2167, + "time_per_iteration": 2.632035732269287 + }, + { + "auxiliary_loss_clip": 0.01163335, + "auxiliary_loss_mlp": 0.01060394, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.05201721, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.9158171210349437, + "language_loss": 0.83286303, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85510027, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2168, + "time_per_iteration": 2.4766387939453125 + }, + { + "auxiliary_loss_clip": 0.0116626, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.03075409, + "balance_loss_mlp": 1.05258656, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.304013454801214, + "language_loss": 0.80027354, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82245922, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.140625, + "step": 2169, + "time_per_iteration": 2.5185413360595703 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02329922, + "balance_loss_mlp": 1.05451608, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 4.565704621626811, + "language_loss": 0.66456163, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68668246, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2170, + "time_per_iteration": 2.5556788444519043 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.02397573, + "balance_loss_mlp": 1.05294132, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.848772151746763, + "language_loss": 0.66935396, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69145024, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2171, + "time_per_iteration": 2.553422451019287 + }, + { + "auxiliary_loss_clip": 0.01164709, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.05211711, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9479804069383955, + "language_loss": 0.71952963, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74165899, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2172, + "time_per_iteration": 2.4801840782165527 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02508521, + "balance_loss_mlp": 1.05435848, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8616776845407013, + "language_loss": 0.75547618, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77752787, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0703125, + "step": 2173, + "time_per_iteration": 2.4639194011688232 + }, + { + "auxiliary_loss_clip": 0.01165867, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.03406715, + "balance_loss_mlp": 1.05319107, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.777389952877741, + "language_loss": 0.70484382, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72705513, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.125, + "step": 2174, + "time_per_iteration": 2.4914908409118652 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01011158, + "balance_loss_clip": 1.0087378, + "balance_loss_mlp": 1.0165, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8022263951171452, + "language_loss": 0.59071571, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61137754, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.38671875, + "step": 2175, + "time_per_iteration": 3.244633913040161 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.05474329, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.247504257537708, + "language_loss": 0.79946023, + "learning_rate": 3.894300581166417e-06, + "loss": 0.8216269, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1015625, + "step": 2176, + "time_per_iteration": 2.439883232116699 + }, + { + "auxiliary_loss_clip": 0.01163907, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.02806199, + "balance_loss_mlp": 1.05234194, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8562517641565577, + "language_loss": 0.74595284, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76809454, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2177, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.024472, + "balance_loss_mlp": 1.05222929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.128567307625778, + "language_loss": 0.81855309, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84065676, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1171875, + "step": 2178, + "time_per_iteration": 2.458812713623047 + }, + { + "auxiliary_loss_clip": 0.01166111, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.02695179, + "balance_loss_mlp": 1.05466795, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.66972533149016, + "language_loss": 0.74942935, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77156973, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.109375, + "step": 2179, + "time_per_iteration": 2.4679782390594482 + }, + { + "auxiliary_loss_clip": 0.01161603, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.03079903, + "balance_loss_mlp": 1.05280709, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0560779031919636, + "language_loss": 0.84319234, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86531377, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0859375, + "step": 2180, + "time_per_iteration": 2.567873477935791 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.05700839, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.214126283525484, + "language_loss": 0.8987745, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92098325, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2181, + "time_per_iteration": 2.4802486896514893 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.0557189, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8993602522657917, + "language_loss": 0.68657839, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70867944, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.109375, + "step": 2182, + "time_per_iteration": 2.460148572921753 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.05504203, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.6442759836393277, + "language_loss": 0.78435183, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.125, + "step": 2183, + "time_per_iteration": 2.5462143421173096 + }, + { + "auxiliary_loss_clip": 0.01162472, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02375841, + "balance_loss_mlp": 1.05238128, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.788927255894662, + "language_loss": 0.85543215, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87749588, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2184, + "time_per_iteration": 3.8904993534088135 + }, + { + "auxiliary_loss_clip": 0.01165934, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.0529201, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.505672435211917, + "language_loss": 0.82206696, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84420282, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1328125, + "step": 2185, + "time_per_iteration": 5.3855485916137695 + }, + { + "auxiliary_loss_clip": 0.01168066, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.03323543, + "balance_loss_mlp": 1.05440092, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.0294565364346235, + "language_loss": 0.73037684, + "learning_rate": 3.893047635600818e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1328125, + "step": 2186, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01048235, + "balance_loss_clip": 1.02601433, + "balance_loss_mlp": 1.05449164, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.0525608711513614, + "language_loss": 0.80174023, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82388186, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.109375, + "step": 2187, + "time_per_iteration": 2.463906764984131 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01005839, + "balance_loss_clip": 1.00344312, + "balance_loss_mlp": 1.01508641, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8435449169341035, + "language_loss": 0.58977342, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61036563, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.3828125, + "step": 2188, + "time_per_iteration": 3.1052041053771973 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.05918622, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.1443848583942846, + "language_loss": 0.74199927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76420546, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2189, + "time_per_iteration": 2.5137264728546143 + }, + { + "auxiliary_loss_clip": 0.01166605, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.02894759, + "balance_loss_mlp": 1.05678558, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.7642431940848833, + "language_loss": 0.72561657, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74777287, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2190, + "time_per_iteration": 2.5053412914276123 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_clip": 1.03094649, + "balance_loss_mlp": 1.05706906, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 8.700182749243472, + "language_loss": 0.74395585, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76616025, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1171875, + "step": 2191, + "time_per_iteration": 2.507687568664551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.05689156, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0250128968483403, + "language_loss": 0.79286075, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1171875, + "step": 2192, + "time_per_iteration": 2.5068893432617188 + }, + { + "auxiliary_loss_clip": 0.01168449, + "auxiliary_loss_mlp": 0.01053422, + "balance_loss_clip": 1.03290629, + "balance_loss_mlp": 1.05564141, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9285179647135495, + "language_loss": 0.84827602, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87049472, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.125, + "step": 2193, + "time_per_iteration": 2.456409215927124 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01002145, + "balance_loss_clip": 0.99976075, + "balance_loss_mlp": 1.0179081, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7727203010194038, + "language_loss": 0.54049635, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56107628, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.37890625, + "step": 2194, + "time_per_iteration": 3.0569794178009033 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.02566671, + "balance_loss_mlp": 1.05514359, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7688784093808256, + "language_loss": 0.72086227, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74298465, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2195, + "time_per_iteration": 2.527435541152954 + }, + { + "auxiliary_loss_clip": 0.01173804, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.05663633, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.7664998702658374, + "language_loss": 0.78195536, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2196, + "time_per_iteration": 2.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02714252, + "balance_loss_mlp": 1.05638218, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.1663119445052295, + "language_loss": 0.74861938, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77078474, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1328125, + "step": 2197, + "time_per_iteration": 2.489504814147949 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.05543399, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4378795089069674, + "language_loss": 0.8011694, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82332516, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2198, + "time_per_iteration": 2.437718391418457 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04173732, + "balance_loss_mlp": 1.05483699, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4514815632850038, + "language_loss": 0.82552117, + "learning_rate": 3.891408075291425e-06, + "loss": 0.847803, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2199, + "time_per_iteration": 2.47356915473938 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.02724838, + "balance_loss_mlp": 1.05458844, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.465688895758548, + "language_loss": 0.68963099, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71178007, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2200, + "time_per_iteration": 2.5828843116760254 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.05397916, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.591612522060186, + "language_loss": 0.84600091, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86822116, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2201, + "time_per_iteration": 2.5546202659606934 + }, + { + "auxiliary_loss_clip": 0.01168797, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.05466592, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.901870031688447, + "language_loss": 0.86978126, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89200991, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2202, + "time_per_iteration": 2.509300470352173 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02364576, + "balance_loss_mlp": 1.05389142, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.3614014237187084, + "language_loss": 0.72746712, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74954367, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.109375, + "step": 2203, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01167891, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.05453348, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.5436302639516, + "language_loss": 0.73248756, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75473428, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1328125, + "step": 2204, + "time_per_iteration": 2.5298051834106445 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.05558085, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.7540271848273767, + "language_loss": 0.78627133, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80849254, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2205, + "time_per_iteration": 2.5343189239501953 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01053788, + "balance_loss_clip": 1.033391, + "balance_loss_mlp": 1.05560231, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.234297854715259, + "language_loss": 0.78748876, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80969107, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2206, + "time_per_iteration": 2.473229169845581 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.05758011, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.3028539815574494, + "language_loss": 0.73993444, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76210898, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.125, + "step": 2207, + "time_per_iteration": 2.479421854019165 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.02822387, + "balance_loss_mlp": 1.05323017, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.4105539478543454, + "language_loss": 0.84151787, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86361182, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0703125, + "step": 2208, + "time_per_iteration": 2.501969337463379 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.05553222, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9362156368998853, + "language_loss": 0.85323346, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87540877, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2209, + "time_per_iteration": 2.509761333465576 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.05628705, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.055387861012722, + "language_loss": 0.81545013, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83761609, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2210, + "time_per_iteration": 2.4920527935028076 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01003994, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.02205014, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7616894664797615, + "language_loss": 0.57984382, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004886, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3828125, + "step": 2211, + "time_per_iteration": 3.1735260486602783 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.00312614, + "balance_loss_mlp": 1.02081084, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7970523185699088, + "language_loss": 0.55364317, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57429421, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.38671875, + "step": 2212, + "time_per_iteration": 3.142056465148926 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.056463, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 4.2694742121271645, + "language_loss": 0.74779308, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77002227, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2213, + "time_per_iteration": 2.4599013328552246 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.02889609, + "balance_loss_mlp": 1.05235839, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.0343460890824927, + "language_loss": 0.79269958, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81476456, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0625, + "step": 2214, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.03062189, + "balance_loss_mlp": 1.05593503, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.0399610331480407, + "language_loss": 0.69410872, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71628523, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2215, + "time_per_iteration": 2.5798754692077637 + }, + { + "auxiliary_loss_clip": 0.01166771, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.05576539, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.4518621177772175, + "language_loss": 0.81136751, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83350337, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2216, + "time_per_iteration": 2.49601674079895 + }, + { + "auxiliary_loss_clip": 0.01166215, + "auxiliary_loss_mlp": 0.01057297, + "balance_loss_clip": 1.03668606, + "balance_loss_mlp": 1.05610895, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.5729384628186307, + "language_loss": 0.87350845, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89574361, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1015625, + "step": 2217, + "time_per_iteration": 2.435224771499634 + }, + { + "auxiliary_loss_clip": 0.01165439, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.02716112, + "balance_loss_mlp": 1.05609739, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6393181601709057, + "language_loss": 0.73460543, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2218, + "time_per_iteration": 2.4984188079833984 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02401495, + "balance_loss_mlp": 1.05406141, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.803125703936159, + "language_loss": 0.87483871, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89692807, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2219, + "time_per_iteration": 2.4761111736297607 + }, + { + "auxiliary_loss_clip": 0.01166927, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.03198123, + "balance_loss_mlp": 1.05804753, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5604165479120375, + "language_loss": 0.77241862, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79459906, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0859375, + "step": 2220, + "time_per_iteration": 2.5172770023345947 + }, + { + "auxiliary_loss_clip": 0.01158357, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.05065227, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.752699726256429, + "language_loss": 0.79361391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81564224, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.078125, + "step": 2221, + "time_per_iteration": 2.4729459285736084 + }, + { + "auxiliary_loss_clip": 0.01056162, + "auxiliary_loss_mlp": 0.01004009, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.01797867, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9620212456786271, + "language_loss": 0.6890744, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70967615, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.3828125, + "step": 2222, + "time_per_iteration": 2.9102694988250732 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.02885592, + "balance_loss_mlp": 1.05645049, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8990549263762904, + "language_loss": 0.66966134, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69180298, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1015625, + "step": 2223, + "time_per_iteration": 2.4860363006591797 + }, + { + "auxiliary_loss_clip": 0.01162257, + "auxiliary_loss_mlp": 0.01055999, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.05173874, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.0940561003244738, + "language_loss": 0.82572883, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84791142, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2224, + "time_per_iteration": 2.453310966491699 + }, + { + "auxiliary_loss_clip": 0.01167505, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.05410361, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0797940389634624, + "language_loss": 0.66006851, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68221462, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2225, + "time_per_iteration": 2.505760669708252 + }, + { + "auxiliary_loss_clip": 0.01164479, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03491461, + "balance_loss_mlp": 1.05366707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2490181158076545, + "language_loss": 0.89484501, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91703951, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2226, + "time_per_iteration": 3.827432632446289 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.03132319, + "balance_loss_mlp": 1.05492473, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0692514385202947, + "language_loss": 0.73874348, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76091796, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1171875, + "step": 2227, + "time_per_iteration": 5.469221115112305 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.02971888, + "balance_loss_mlp": 1.05582607, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.597241668203809, + "language_loss": 0.8519839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87414384, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2228, + "time_per_iteration": 2.449289560317993 + }, + { + "auxiliary_loss_clip": 0.01162737, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0275687, + "balance_loss_mlp": 1.05501461, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.700498827765594, + "language_loss": 0.8100034, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83210707, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2229, + "time_per_iteration": 2.454185962677002 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.05576682, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.350850930683171, + "language_loss": 0.73814881, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76035661, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2230, + "time_per_iteration": 2.538679838180542 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.03551102, + "balance_loss_mlp": 1.0541544, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 8.27737726970052, + "language_loss": 0.79914325, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82135391, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1171875, + "step": 2231, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05716896, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9954658779127024, + "language_loss": 0.72341192, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74558049, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2232, + "time_per_iteration": 2.5315330028533936 + }, + { + "auxiliary_loss_clip": 0.01169038, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.02664888, + "balance_loss_mlp": 1.05505097, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.53729194427275, + "language_loss": 0.65508974, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67725778, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2233, + "time_per_iteration": 2.480006694793701 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.02687883, + "balance_loss_mlp": 1.05011904, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 4.541384002557222, + "language_loss": 0.81492066, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8370105, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1171875, + "step": 2234, + "time_per_iteration": 2.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.03466105, + "balance_loss_mlp": 1.05424869, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9481483268780417, + "language_loss": 0.82361299, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84581894, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1171875, + "step": 2235, + "time_per_iteration": 2.4478979110717773 + }, + { + "auxiliary_loss_clip": 0.0116322, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.03378713, + "balance_loss_mlp": 1.05170834, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6568048404288893, + "language_loss": 0.86399209, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88618279, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2236, + "time_per_iteration": 2.534761428833008 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.05506372, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5401183277834882, + "language_loss": 0.76936173, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79150563, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2237, + "time_per_iteration": 2.454881191253662 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.02974725, + "balance_loss_mlp": 1.05312407, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.976295310563951, + "language_loss": 0.78737688, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80954033, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2238, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03057706, + "balance_loss_mlp": 1.0530107, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3078790626960246, + "language_loss": 0.67977941, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70191795, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.09375, + "step": 2239, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.05296254, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.7482132203763245, + "language_loss": 0.81085825, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83300203, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2240, + "time_per_iteration": 2.458702802658081 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.05302262, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.775061814751768, + "language_loss": 0.77491653, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79708141, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2241, + "time_per_iteration": 2.4814610481262207 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.0265156, + "balance_loss_mlp": 1.05368328, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.708340264075402, + "language_loss": 0.83106101, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85311437, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0625, + "step": 2242, + "time_per_iteration": 2.531010627746582 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.05465889, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.594763109819468, + "language_loss": 0.64927268, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67146331, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.125, + "step": 2243, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.05214143, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.6702464572283469, + "language_loss": 0.72275442, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74479383, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2244, + "time_per_iteration": 2.572275161743164 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.03339577, + "balance_loss_mlp": 1.0510093, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6251739599249553, + "language_loss": 0.86419517, + "learning_rate": 3.88550929909221e-06, + "loss": 0.886334, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1015625, + "step": 2245, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.0534606, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.986035604010071, + "language_loss": 0.79054129, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81263721, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2246, + "time_per_iteration": 2.521500825881958 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01006436, + "balance_loss_clip": 1.00378919, + "balance_loss_mlp": 1.01705432, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7498554605470831, + "language_loss": 0.60597092, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6265747, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.3671875, + "step": 2247, + "time_per_iteration": 3.209567070007324 + }, + { + "auxiliary_loss_clip": 0.0117261, + "auxiliary_loss_mlp": 0.01058621, + "balance_loss_clip": 1.03629315, + "balance_loss_mlp": 1.05673957, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.930333372025318, + "language_loss": 0.81250268, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83481503, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2248, + "time_per_iteration": 2.5274717807769775 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.02503014, + "balance_loss_mlp": 1.0515008, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.1598236051462383, + "language_loss": 0.77427459, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79628301, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0625, + "step": 2249, + "time_per_iteration": 2.475325345993042 + }, + { + "auxiliary_loss_clip": 0.01161564, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03477216, + "balance_loss_mlp": 1.05408192, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4620260499768896, + "language_loss": 0.84598488, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86813927, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0703125, + "step": 2250, + "time_per_iteration": 2.5579018592834473 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.02586317, + "balance_loss_mlp": 1.05311561, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9830962049575767, + "language_loss": 0.8213973, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84349537, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1015625, + "step": 2251, + "time_per_iteration": 2.459254503250122 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.03144348, + "balance_loss_mlp": 1.05075097, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.6927381248236872, + "language_loss": 0.85981321, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88194835, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.09375, + "step": 2252, + "time_per_iteration": 2.508246421813965 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.0100648, + "balance_loss_clip": 1.00398886, + "balance_loss_mlp": 1.01368976, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7502755191421498, + "language_loss": 0.61736262, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63793439, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.37109375, + "step": 2253, + "time_per_iteration": 3.1357691287994385 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.0357219, + "balance_loss_mlp": 1.05454588, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.033104819567641, + "language_loss": 0.89383745, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91603261, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2254, + "time_per_iteration": 2.4983997344970703 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.02786362, + "balance_loss_mlp": 1.05202925, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.0851597725495843, + "language_loss": 0.84461302, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86678338, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.140625, + "step": 2255, + "time_per_iteration": 2.4466094970703125 + }, + { + "auxiliary_loss_clip": 0.01159898, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.05059099, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.8290739743459126, + "language_loss": 0.7493006, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77136725, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.09375, + "step": 2256, + "time_per_iteration": 2.49464750289917 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.05080438, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.107811937736733, + "language_loss": 0.83023381, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85237086, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 2257, + "time_per_iteration": 2.4069128036499023 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.03272712, + "balance_loss_mlp": 1.05211377, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.2162023158830655, + "language_loss": 0.82266492, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84489298, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.15625, + "step": 2258, + "time_per_iteration": 2.4187939167022705 + }, + { + "auxiliary_loss_clip": 0.01161942, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.02890849, + "balance_loss_mlp": 1.05117583, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3528312033652434, + "language_loss": 0.82556236, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84770095, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.109375, + "step": 2259, + "time_per_iteration": 2.4182498455047607 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0313561, + "balance_loss_mlp": 1.05370188, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9951846625000045, + "language_loss": 0.73434722, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75647175, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0859375, + "step": 2260, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01160597, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.03389525, + "balance_loss_mlp": 1.05164778, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.6406640236232826, + "language_loss": 0.75450647, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77664864, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2261, + "time_per_iteration": 2.4773809909820557 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02887654, + "balance_loss_mlp": 1.05329657, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.9984757312973846, + "language_loss": 0.63141024, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1171875, + "step": 2262, + "time_per_iteration": 2.5423331260681152 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.598036861128168, + "language_loss": 0.82363462, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84568739, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2263, + "time_per_iteration": 2.472050428390503 + }, + { + "auxiliary_loss_clip": 0.01166147, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.05306447, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7757676532235749, + "language_loss": 0.87984985, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90212959, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1328125, + "step": 2264, + "time_per_iteration": 2.4857943058013916 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.02700329, + "balance_loss_mlp": 1.05115557, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 2.9904691281538693, + "language_loss": 0.7103616, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73248434, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2265, + "time_per_iteration": 2.428753614425659 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02187812, + "balance_loss_mlp": 1.05258036, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.049615390343222, + "language_loss": 0.66760135, + "learning_rate": 3.882766051566027e-06, + "loss": 0.689623, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2266, + "time_per_iteration": 2.4990508556365967 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.05220675, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7538751206895893, + "language_loss": 0.76376909, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78596711, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2267, + "time_per_iteration": 2.485907554626465 + }, + { + "auxiliary_loss_clip": 0.0116058, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.05051804, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.002795226804265, + "language_loss": 0.81781995, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83988714, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1015625, + "step": 2268, + "time_per_iteration": 3.890936851501465 + }, + { + "auxiliary_loss_clip": 0.01161581, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.0542717, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.071095479959133, + "language_loss": 0.76078153, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78285825, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2269, + "time_per_iteration": 4.03081202507019 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.02612138, + "balance_loss_mlp": 1.05518508, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.039865659244694, + "language_loss": 0.80856502, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83068502, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2270, + "time_per_iteration": 2.431426525115967 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.05227089, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.715242097566801, + "language_loss": 0.75720018, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77940053, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.125, + "step": 2271, + "time_per_iteration": 2.440701961517334 + }, + { + "auxiliary_loss_clip": 0.01161613, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.05171776, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2487551674667565, + "language_loss": 0.80084515, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82298499, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1015625, + "step": 2272, + "time_per_iteration": 2.4305598735809326 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01011943, + "balance_loss_clip": 1.00937963, + "balance_loss_mlp": 1.01818228, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7032235049035468, + "language_loss": 0.60682511, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62750536, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.37890625, + "step": 2273, + "time_per_iteration": 3.1601598262786865 + }, + { + "auxiliary_loss_clip": 0.01158579, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.05170178, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7482195510707834, + "language_loss": 0.77978206, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80184555, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2274, + "time_per_iteration": 2.448374032974243 + }, + { + "auxiliary_loss_clip": 0.01163563, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.0536654, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.152740159395537, + "language_loss": 0.78435361, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80645764, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2275, + "time_per_iteration": 2.4761078357696533 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02003431, + "balance_loss_mlp": 1.05312562, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.879456622893362, + "language_loss": 0.81436646, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0859375, + "step": 2276, + "time_per_iteration": 2.453623056411743 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.03082716, + "balance_loss_mlp": 1.05443549, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7308629221608576, + "language_loss": 0.69347179, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71571183, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.15625, + "step": 2277, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01162034, + "auxiliary_loss_mlp": 0.01051118, + "balance_loss_clip": 1.03056657, + "balance_loss_mlp": 1.05136657, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.1796180013972384, + "language_loss": 0.80487186, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2278, + "time_per_iteration": 2.478158950805664 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.02788246, + "balance_loss_mlp": 1.05658543, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.2222454745927744, + "language_loss": 0.74863833, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2279, + "time_per_iteration": 2.5930991172790527 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.05331779, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.3437990696634916, + "language_loss": 0.76614088, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78833258, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1328125, + "step": 2280, + "time_per_iteration": 2.527808666229248 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.0272876, + "balance_loss_mlp": 1.04930711, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7035700975942816, + "language_loss": 0.79808372, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82011348, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.078125, + "step": 2281, + "time_per_iteration": 2.5486884117126465 + }, + { + "auxiliary_loss_clip": 0.01167882, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.03618872, + "balance_loss_mlp": 1.05488086, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.697672260024265, + "language_loss": 0.83955061, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86178571, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2282, + "time_per_iteration": 2.4731719493865967 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.05028629, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.8152250836173982, + "language_loss": 0.73821312, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76034367, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0703125, + "step": 2283, + "time_per_iteration": 2.5041310787200928 + }, + { + "auxiliary_loss_clip": 0.01161767, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02991104, + "balance_loss_mlp": 1.05546188, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.845966051455131, + "language_loss": 0.83875519, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86085427, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2284, + "time_per_iteration": 2.489459991455078 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.05016088, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.9356174938409232, + "language_loss": 0.74778754, + "learning_rate": 3.880256934503974e-06, + "loss": 0.76991928, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 2285, + "time_per_iteration": 2.542114734649658 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.02680647, + "balance_loss_mlp": 1.05192137, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.7476035379248278, + "language_loss": 0.74461651, + "learning_rate": 3.880124162414689e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0703125, + "step": 2286, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01165905, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.02407491, + "balance_loss_mlp": 1.05466056, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4229799840234936, + "language_loss": 0.86074513, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88285446, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2287, + "time_per_iteration": 2.5267093181610107 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.05281329, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.1686670508464783, + "language_loss": 0.68304116, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70512998, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.09375, + "step": 2288, + "time_per_iteration": 2.6589176654815674 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.03410959, + "balance_loss_mlp": 1.05404294, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 3.8263362529629896, + "language_loss": 0.87251699, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89468765, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2289, + "time_per_iteration": 2.4834415912628174 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.02735722, + "balance_loss_mlp": 1.0496552, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.801469753111382, + "language_loss": 0.74045157, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76245451, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2290, + "time_per_iteration": 2.4901175498962402 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01003238, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.01923215, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7021136788609851, + "language_loss": 0.5160234, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53662229, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.375, + "step": 2291, + "time_per_iteration": 3.1141176223754883 + }, + { + "auxiliary_loss_clip": 0.01158988, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05007744, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.104305633549435, + "language_loss": 0.7090801, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73116004, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.09375, + "step": 2292, + "time_per_iteration": 2.5535075664520264 + }, + { + "auxiliary_loss_clip": 0.01160381, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.05272794, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.835181445389694, + "language_loss": 0.79774708, + "learning_rate": 3.879192761826071e-06, + "loss": 0.81979978, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.078125, + "step": 2293, + "time_per_iteration": 2.4434242248535156 + }, + { + "auxiliary_loss_clip": 0.01159833, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_clip": 1.03065419, + "balance_loss_mlp": 1.0489893, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.8100583587938566, + "language_loss": 0.78455698, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80665964, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2294, + "time_per_iteration": 2.5279018878936768 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.02679634, + "balance_loss_mlp": 1.05053687, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.844605455172751, + "language_loss": 0.80448526, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82649422, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0546875, + "step": 2295, + "time_per_iteration": 2.46471905708313 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.02604938, + "balance_loss_mlp": 1.04990947, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905081494696058, + "language_loss": 0.78027165, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80231106, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0859375, + "step": 2296, + "time_per_iteration": 2.489081859588623 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03356612, + "balance_loss_mlp": 1.05272174, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8577842545242083, + "language_loss": 0.78632545, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80845773, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2297, + "time_per_iteration": 2.479617118835449 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.03187263, + "balance_loss_mlp": 1.05133367, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.1383795008624946, + "language_loss": 0.69005466, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71213776, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2298, + "time_per_iteration": 2.4894726276397705 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03075552, + "balance_loss_mlp": 1.05287397, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7932718261070644, + "language_loss": 0.86958891, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89172935, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2299, + "time_per_iteration": 2.4343175888061523 + }, + { + "auxiliary_loss_clip": 0.01158457, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.03221393, + "balance_loss_mlp": 1.05076718, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.6477233854648015, + "language_loss": 0.7542398, + "learning_rate": 3.878257869538267e-06, + "loss": 0.7763505, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.078125, + "step": 2300, + "time_per_iteration": 2.5398943424224854 + }, + { + "auxiliary_loss_clip": 0.01160789, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.03088915, + "balance_loss_mlp": 1.05409729, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.6084363319634956, + "language_loss": 0.82612532, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8482368, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0625, + "step": 2301, + "time_per_iteration": 2.435732841491699 + }, + { + "auxiliary_loss_clip": 0.01155849, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.02461374, + "balance_loss_mlp": 1.04986811, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.0886382571109987, + "language_loss": 0.85972583, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8817209, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0625, + "step": 2302, + "time_per_iteration": 2.504011869430542 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.01009124, + "balance_loss_clip": 1.00688314, + "balance_loss_mlp": 1.0189817, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7554932596602951, + "language_loss": 0.65648526, + "learning_rate": 3.877856132957667e-06, + "loss": 0.677131, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.36328125, + "step": 2303, + "time_per_iteration": 3.2563750743865967 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 1.05022073, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.0694955360834912, + "language_loss": 0.78234196, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80427974, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2304, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01991165, + "balance_loss_mlp": 1.05225086, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.838077080535218, + "language_loss": 0.77824223, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8002485, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.09375, + "step": 2305, + "time_per_iteration": 2.468254804611206 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02732027, + "balance_loss_mlp": 1.04923558, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 3.2063314507866947, + "language_loss": 0.87484217, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89684129, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2306, + "time_per_iteration": 2.4840242862701416 + }, + { + "auxiliary_loss_clip": 0.0105475, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.01749539, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8793018572536648, + "language_loss": 0.59049129, + "learning_rate": 3.877319487288387e-06, + "loss": 0.6111598, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.37304688, + "step": 2307, + "time_per_iteration": 3.1098880767822266 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.05279016, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7539420555734833, + "language_loss": 0.79683769, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81892413, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2308, + "time_per_iteration": 2.5119385719299316 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1876242684272342, + "language_loss": 0.78186178, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80388331, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2309, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.05319023, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9671645437439387, + "language_loss": 0.67473733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69683367, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2310, + "time_per_iteration": 5.331011056900024 + }, + { + "auxiliary_loss_clip": 0.01159907, + "auxiliary_loss_mlp": 0.01051301, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.0511837, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8339330301012977, + "language_loss": 0.83962393, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86173606, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0859375, + "step": 2311, + "time_per_iteration": 2.4287211894989014 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.05262017, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.2677083380951997, + "language_loss": 0.81788063, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83999264, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2312, + "time_per_iteration": 2.5261852741241455 + }, + { + "auxiliary_loss_clip": 0.01165344, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.05353236, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.1868066623869202, + "language_loss": 0.86641061, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88851982, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1171875, + "step": 2313, + "time_per_iteration": 2.491847515106201 + }, + { + "auxiliary_loss_clip": 0.0116138, + "auxiliary_loss_mlp": 0.01052556, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.05377281, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.199884337980412, + "language_loss": 0.79629153, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8184309, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2314, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.0116003, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.02668452, + "balance_loss_mlp": 1.05130863, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.30759926974498, + "language_loss": 0.86246645, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88453007, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0859375, + "step": 2315, + "time_per_iteration": 2.4236056804656982 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_clip": 1.03192866, + "balance_loss_mlp": 1.05146074, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.162038852448813, + "language_loss": 0.77074778, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79286408, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.078125, + "step": 2316, + "time_per_iteration": 2.4574813842773438 + }, + { + "auxiliary_loss_clip": 0.01157842, + "auxiliary_loss_mlp": 0.01058721, + "balance_loss_clip": 1.03733492, + "balance_loss_mlp": 1.05045736, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6719823206156588, + "language_loss": 0.76972795, + "learning_rate": 3.875972890659349e-06, + "loss": 0.7918936, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.078125, + "step": 2317, + "time_per_iteration": 2.448096990585327 + }, + { + "auxiliary_loss_clip": 0.01162372, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.02993095, + "balance_loss_mlp": 1.05272126, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.004328537884534, + "language_loss": 0.80159998, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82372165, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2318, + "time_per_iteration": 2.5152556896209717 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.00027394, + "balance_loss_mlp": 1.01373565, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8654041988705774, + "language_loss": 0.59008324, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61061358, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.3671875, + "step": 2319, + "time_per_iteration": 3.101083993911743 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01053809, + "balance_loss_clip": 1.03365111, + "balance_loss_mlp": 1.05213809, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2307371496542356, + "language_loss": 0.65372109, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67588449, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2320, + "time_per_iteration": 2.580655336380005 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.0507009, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6249908375914148, + "language_loss": 0.70695353, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72896051, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2321, + "time_per_iteration": 2.4594380855560303 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.01055348, + "balance_loss_clip": 1.0345459, + "balance_loss_mlp": 1.04883599, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 43.01057366099128, + "language_loss": 0.86161166, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88375086, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2322, + "time_per_iteration": 2.4912750720977783 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.04840016, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.7187096085030618, + "language_loss": 0.6682983, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69038773, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2323, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0116621, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.03068066, + "balance_loss_mlp": 1.05250573, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0268681764850665, + "language_loss": 0.89011461, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91228795, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2324, + "time_per_iteration": 2.458172559738159 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01056649, + "balance_loss_clip": 1.03626466, + "balance_loss_mlp": 1.04949069, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 4.4201897818475775, + "language_loss": 0.70700991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7291714, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2325, + "time_per_iteration": 2.4608585834503174 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01055057, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 1.05384755, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8512202881484865, + "language_loss": 0.81165004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2326, + "time_per_iteration": 2.474729537963867 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 1.05092621, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.806872548679543, + "language_loss": 0.88955671, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9115777, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0390625, + "step": 2327, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.03790593, + "balance_loss_mlp": 1.05021226, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.4750320646827992, + "language_loss": 0.85236871, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87450516, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2328, + "time_per_iteration": 2.4724884033203125 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.05120313, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.653872228613324, + "language_loss": 0.74084997, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76292944, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2329, + "time_per_iteration": 2.5238442420959473 + }, + { + "auxiliary_loss_clip": 0.01156952, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.03070641, + "balance_loss_mlp": 1.04729962, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 1.840223813628444, + "language_loss": 0.77969897, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80177212, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2330, + "time_per_iteration": 2.468606948852539 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02738333, + "balance_loss_mlp": 1.0495398, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.6993483396219506, + "language_loss": 0.72030222, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74232423, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0625, + "step": 2331, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.05008936, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.585347596838152, + "language_loss": 0.72609055, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74813151, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2332, + "time_per_iteration": 2.4244635105133057 + }, + { + "auxiliary_loss_clip": 0.01047328, + "auxiliary_loss_mlp": 0.01002801, + "balance_loss_clip": 1.00048828, + "balance_loss_mlp": 1.01059568, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8290843953692559, + "language_loss": 0.56071591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58121729, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.3671875, + "step": 2333, + "time_per_iteration": 2.8934712409973145 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.05001664, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7851490004805215, + "language_loss": 0.82529652, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2334, + "time_per_iteration": 2.495786428451538 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.0293529, + "balance_loss_mlp": 1.05012262, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8251700419130605, + "language_loss": 0.81237197, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83440989, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2335, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01163426, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.02829087, + "balance_loss_mlp": 1.05328035, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.83822789048078, + "language_loss": 0.82159901, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8437475, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.1015625, + "step": 2336, + "time_per_iteration": 2.4732770919799805 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.02526581, + "balance_loss_mlp": 1.05202782, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8506426201256954, + "language_loss": 0.80081403, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82283843, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2337, + "time_per_iteration": 2.4599671363830566 + }, + { + "auxiliary_loss_clip": 0.01155582, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02911353, + "balance_loss_mlp": 1.04861474, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.2474896580124963, + "language_loss": 0.7927807, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81482291, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2338, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.05685067, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.148660398501072, + "language_loss": 0.79827893, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82039273, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2339, + "time_per_iteration": 2.4672555923461914 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.0527122, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.7979240482106922, + "language_loss": 0.6582588, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68040884, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2340, + "time_per_iteration": 2.614506483078003 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.02936912, + "balance_loss_mlp": 1.05242825, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.5431372850663334, + "language_loss": 0.78670812, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80874836, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2341, + "time_per_iteration": 2.4420077800750732 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_clip": 1.03058767, + "balance_loss_mlp": 1.05246425, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 11.570217446637303, + "language_loss": 0.80154169, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82360554, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2342, + "time_per_iteration": 2.4961190223693848 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.05673313, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9358851833739352, + "language_loss": 0.77974075, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80176884, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2343, + "time_per_iteration": 2.479679584503174 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0025475, + "balance_loss_mlp": 1.01255798, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8341361150670269, + "language_loss": 0.6155628, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63610566, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3671875, + "step": 2344, + "time_per_iteration": 3.048691987991333 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.02309346, + "balance_loss_mlp": 1.04911709, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.886714907416039, + "language_loss": 0.64591062, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6678347, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0234375, + "step": 2345, + "time_per_iteration": 2.509552240371704 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.023803, + "balance_loss_mlp": 1.05019534, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.502398022219224, + "language_loss": 0.736485, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7585566, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1171875, + "step": 2346, + "time_per_iteration": 2.4962430000305176 + }, + { + "auxiliary_loss_clip": 0.01160187, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.05144429, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.4324488814849703, + "language_loss": 0.77868927, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80075288, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2347, + "time_per_iteration": 2.4663050174713135 + }, + { + "auxiliary_loss_clip": 0.01155281, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02852941, + "balance_loss_mlp": 1.04918981, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7514485331985392, + "language_loss": 0.76446569, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78648651, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2348, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.02346444, + "balance_loss_mlp": 1.0508523, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8350283773112115, + "language_loss": 0.8686446, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89063132, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2349, + "time_per_iteration": 2.4446985721588135 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.05220377, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.7285118920158233, + "language_loss": 0.8895669, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9115696, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2350, + "time_per_iteration": 2.49725341796875 + }, + { + "auxiliary_loss_clip": 0.0115943, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.02542889, + "balance_loss_mlp": 1.05281878, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8656651100546833, + "language_loss": 0.814816, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83687228, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0703125, + "step": 2351, + "time_per_iteration": 3.941416025161743 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.05032706, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6782915885510286, + "language_loss": 0.82935351, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85132694, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0703125, + "step": 2352, + "time_per_iteration": 5.431722640991211 + }, + { + "auxiliary_loss_clip": 0.01047453, + "auxiliary_loss_mlp": 0.01006216, + "balance_loss_clip": 1.00387907, + "balance_loss_mlp": 1.01053333, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.90864091090638, + "language_loss": 0.61894125, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63947791, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.36914062, + "step": 2353, + "time_per_iteration": 3.0071640014648438 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.05024958, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.8535903324814498, + "language_loss": 0.87264848, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89466572, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2354, + "time_per_iteration": 2.4613726139068604 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02607965, + "balance_loss_mlp": 1.04953241, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9651075901387003, + "language_loss": 0.74872321, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.078125, + "step": 2355, + "time_per_iteration": 2.442379951477051 + }, + { + "auxiliary_loss_clip": 0.01047047, + "auxiliary_loss_mlp": 0.01002716, + "balance_loss_clip": 1.00052261, + "balance_loss_mlp": 1.01023293, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6790475533637321, + "language_loss": 0.5182299, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53872752, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2356, + "time_per_iteration": 2.9892258644104004 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03495359, + "balance_loss_mlp": 1.05080867, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 3.0630792396255053, + "language_loss": 0.70576489, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72786456, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2357, + "time_per_iteration": 2.421844005584717 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.05012453, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8720076771552743, + "language_loss": 0.82205695, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84416115, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.09375, + "step": 2358, + "time_per_iteration": 2.4519011974334717 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.02663624, + "balance_loss_mlp": 1.051018, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 6.439592826280342, + "language_loss": 0.7129705, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73505127, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1015625, + "step": 2359, + "time_per_iteration": 2.4797613620758057 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02374041, + "balance_loss_mlp": 1.04988599, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 5.514404455287625, + "language_loss": 0.76040578, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78239685, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2360, + "time_per_iteration": 2.4538815021514893 + }, + { + "auxiliary_loss_clip": 0.011559, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.02173233, + "balance_loss_mlp": 1.05221295, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.1535632205539135, + "language_loss": 0.8188749, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84085315, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2361, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01152529, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.02826524, + "balance_loss_mlp": 1.04964995, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.775663525053056, + "language_loss": 0.74489617, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76689464, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2362, + "time_per_iteration": 2.530163049697876 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.0265274, + "balance_loss_mlp": 1.05187464, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 4.478599792998506, + "language_loss": 0.73748112, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75952733, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2363, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.02605534, + "balance_loss_mlp": 1.05005693, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8353407682080387, + "language_loss": 0.72971261, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75172973, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2364, + "time_per_iteration": 2.5670576095581055 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.05015445, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 4.452075303519762, + "language_loss": 0.90230036, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92430955, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 1.015625, + "step": 2365, + "time_per_iteration": 2.5130062103271484 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.02735198, + "balance_loss_mlp": 1.04896259, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.956458588852685, + "language_loss": 0.65377176, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67579615, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2366, + "time_per_iteration": 2.5081095695495605 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.03640223, + "balance_loss_mlp": 1.04979372, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.190613479881076, + "language_loss": 0.80414236, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82623357, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2367, + "time_per_iteration": 2.4398317337036133 + }, + { + "auxiliary_loss_clip": 0.01158941, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.05221498, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.898581267606924, + "language_loss": 0.82619941, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84833181, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2368, + "time_per_iteration": 2.512401580810547 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.05165803, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.7587049982231675, + "language_loss": 0.86971414, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89178908, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2369, + "time_per_iteration": 2.444784164428711 + }, + { + "auxiliary_loss_clip": 0.01158905, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.04913163, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4370193327140612, + "language_loss": 0.75704634, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77906322, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2370, + "time_per_iteration": 2.527740240097046 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0384295, + "balance_loss_mlp": 1.04879546, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7319048865171518, + "language_loss": 0.82923144, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85136044, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2371, + "time_per_iteration": 2.4644808769226074 + }, + { + "auxiliary_loss_clip": 0.01158835, + "auxiliary_loss_mlp": 0.01051346, + "balance_loss_clip": 1.03171265, + "balance_loss_mlp": 1.05157602, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.956158386855541, + "language_loss": 0.82575452, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84785628, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0703125, + "step": 2372, + "time_per_iteration": 2.42240047454834 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.03175569, + "balance_loss_mlp": 1.05134308, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.19442784605527, + "language_loss": 0.8396256, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86171949, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2373, + "time_per_iteration": 2.444695472717285 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03294528, + "balance_loss_mlp": 1.05012143, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.034088541649992, + "language_loss": 0.86271042, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88476801, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.046875, + "step": 2374, + "time_per_iteration": 2.428062915802002 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03024197, + "balance_loss_mlp": 1.05125451, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 4.612229602439842, + "language_loss": 0.7919687, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2375, + "time_per_iteration": 2.526838541030884 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.05240607, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.386247922788535, + "language_loss": 0.76400912, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78615618, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2376, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01156552, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.05075741, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.9035160782842753, + "language_loss": 0.93037754, + "learning_rate": 3.867744103671717e-06, + "loss": 0.952438, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2377, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02991319, + "balance_loss_mlp": 1.05085003, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9751577144221115, + "language_loss": 0.91598773, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93807983, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.0703125, + "step": 2378, + "time_per_iteration": 2.558563470840454 + }, + { + "auxiliary_loss_clip": 0.01159674, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.051296, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.745891074970689, + "language_loss": 0.73947102, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76151079, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2379, + "time_per_iteration": 2.511359214782715 + }, + { + "auxiliary_loss_clip": 0.01156473, + "auxiliary_loss_mlp": 0.01056109, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05014992, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8640465231226504, + "language_loss": 0.79013336, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81225914, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2380, + "time_per_iteration": 2.466219663619995 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.05528164, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.3244590707621073, + "language_loss": 0.87958229, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90172088, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.078125, + "step": 2381, + "time_per_iteration": 2.4476850032806396 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.03084123, + "balance_loss_mlp": 1.0517571, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.599935932772449, + "language_loss": 0.76852649, + "learning_rate": 3.867046846740299e-06, + "loss": 0.7906065, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2382, + "time_per_iteration": 2.4389045238494873 + }, + { + "auxiliary_loss_clip": 0.01157847, + "auxiliary_loss_mlp": 0.01053474, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.05068171, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.461149819336849, + "language_loss": 0.76948071, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79159391, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0703125, + "step": 2383, + "time_per_iteration": 2.516038179397583 + }, + { + "auxiliary_loss_clip": 0.01158748, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0299803, + "balance_loss_mlp": 1.05114412, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.169581662424978, + "language_loss": 0.88202822, + "learning_rate": 3.866767448340471e-06, + "loss": 0.9041245, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.078125, + "step": 2384, + "time_per_iteration": 2.42138934135437 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.02780819, + "balance_loss_mlp": 1.05382657, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 4.175812514986151, + "language_loss": 0.79225606, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81439185, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2385, + "time_per_iteration": 2.4439244270324707 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.02692771, + "balance_loss_mlp": 1.04881537, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.9672730758223058, + "language_loss": 0.74989617, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77192366, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2386, + "time_per_iteration": 2.533304214477539 + }, + { + "auxiliary_loss_clip": 0.01159067, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02777338, + "balance_loss_mlp": 1.05180025, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5174427688568626, + "language_loss": 0.78475344, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80681831, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0703125, + "step": 2387, + "time_per_iteration": 2.4568724632263184 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.05092847, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.559937991009886, + "language_loss": 0.82087159, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84299791, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2388, + "time_per_iteration": 2.5136237144470215 + }, + { + "auxiliary_loss_clip": 0.01161514, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.05393136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.471836270672028, + "language_loss": 0.82267237, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84473729, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.078125, + "step": 2389, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03020322, + "balance_loss_mlp": 1.05032301, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.165584666776674, + "language_loss": 0.82654548, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84867263, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2390, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_clip": 1.03406334, + "balance_loss_mlp": 1.0510571, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 3.0575281215329086, + "language_loss": 0.74616158, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76828718, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.078125, + "step": 2391, + "time_per_iteration": 2.5368545055389404 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01014282, + "balance_loss_clip": 1.0121367, + "balance_loss_mlp": 1.01461065, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8732258813949081, + "language_loss": 0.61769497, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63834715, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.36328125, + "step": 2392, + "time_per_iteration": 2.9315476417541504 + }, + { + "auxiliary_loss_clip": 0.01161818, + "auxiliary_loss_mlp": 0.01056559, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.04981267, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.638581894381379, + "language_loss": 0.76172751, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78391123, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2393, + "time_per_iteration": 3.857799530029297 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_clip": 1.02908611, + "balance_loss_mlp": 1.05249143, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8778469598095298, + "language_loss": 0.76782668, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78993082, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2394, + "time_per_iteration": 3.979130983352661 + }, + { + "auxiliary_loss_clip": 0.01158023, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.02836156, + "balance_loss_mlp": 1.05062532, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.605706810552395, + "language_loss": 0.85831755, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88038385, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.078125, + "step": 2395, + "time_per_iteration": 2.652092933654785 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.03040648, + "balance_loss_mlp": 1.05241179, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5230484666362787, + "language_loss": 0.82984561, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85192204, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0546875, + "step": 2396, + "time_per_iteration": 2.4647467136383057 + }, + { + "auxiliary_loss_clip": 0.01152766, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.02691364, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.435366869769497, + "language_loss": 0.82564163, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8476299, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2397, + "time_per_iteration": 2.4151055812835693 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.05216622, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6104109289920625, + "language_loss": 0.79418427, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81627429, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2398, + "time_per_iteration": 2.4470388889312744 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.05359757, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.7500042291552433, + "language_loss": 0.64847696, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67058688, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2399, + "time_per_iteration": 2.5123891830444336 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.05306005, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.4896130870957418, + "language_loss": 0.82329226, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84531689, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2400, + "time_per_iteration": 2.4825797080993652 + }, + { + "auxiliary_loss_clip": 0.01162323, + "auxiliary_loss_mlp": 0.01052957, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.053689, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.540874002782335, + "language_loss": 0.74606794, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76822078, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0859375, + "step": 2401, + "time_per_iteration": 2.507983684539795 + }, + { + "auxiliary_loss_clip": 0.01156636, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.0269084, + "balance_loss_mlp": 1.05109596, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7568662987329828, + "language_loss": 0.80577219, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82780313, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2402, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.01156436, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.02880669, + "balance_loss_mlp": 1.05137098, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.1115432529250753, + "language_loss": 0.84918672, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87124002, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.046875, + "step": 2403, + "time_per_iteration": 2.4267525672912598 + }, + { + "auxiliary_loss_clip": 0.01155438, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.03312445, + "balance_loss_mlp": 1.04934669, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.423742001713465, + "language_loss": 0.70017314, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72226501, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2404, + "time_per_iteration": 2.487827777862549 + }, + { + "auxiliary_loss_clip": 0.01151274, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.0473218, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.368746641876408, + "language_loss": 0.72847003, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75046992, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0390625, + "step": 2405, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02879858, + "balance_loss_mlp": 1.04891181, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2064790582144473, + "language_loss": 0.73115766, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75316191, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2406, + "time_per_iteration": 2.4501168727874756 + }, + { + "auxiliary_loss_clip": 0.0115439, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.03161645, + "balance_loss_mlp": 1.04889357, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.997473868200426, + "language_loss": 0.75399184, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77606416, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2407, + "time_per_iteration": 2.482008934020996 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.02418649, + "balance_loss_mlp": 1.04607177, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.6082248834480546, + "language_loss": 0.79472804, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81668091, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0625, + "step": 2408, + "time_per_iteration": 2.4657323360443115 + }, + { + "auxiliary_loss_clip": 0.01155517, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.05088127, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1979655558708893, + "language_loss": 0.82594806, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84802014, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.046875, + "step": 2409, + "time_per_iteration": 2.450345039367676 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_clip": 1.03411365, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.954409921875598, + "language_loss": 0.74561608, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7677173, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0625, + "step": 2410, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02883387, + "balance_loss_mlp": 1.04858971, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.3844352739280597, + "language_loss": 0.81135416, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83336866, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0546875, + "step": 2411, + "time_per_iteration": 2.4708898067474365 + }, + { + "auxiliary_loss_clip": 0.0115486, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.03403103, + "balance_loss_mlp": 1.05123138, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.954560524414831, + "language_loss": 0.69816971, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7202487, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2412, + "time_per_iteration": 2.5614776611328125 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.05100143, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.1541085269745803, + "language_loss": 0.77347231, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79548067, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2413, + "time_per_iteration": 2.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01049286, + "auxiliary_loss_mlp": 0.0100403, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.01294982, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152840666775347, + "language_loss": 0.58887923, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60941237, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.36328125, + "step": 2414, + "time_per_iteration": 2.9752402305603027 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 0.99943656, + "balance_loss_mlp": 1.01240802, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8348908268898737, + "language_loss": 0.6218617, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64236534, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.36328125, + "step": 2415, + "time_per_iteration": 3.039710521697998 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_clip": 1.02637458, + "balance_loss_mlp": 1.04699647, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.8743578134099377, + "language_loss": 0.72001135, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74199259, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2416, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.0104556, + "auxiliary_loss_mlp": 0.01005813, + "balance_loss_clip": 1.00379848, + "balance_loss_mlp": 1.01002693, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.711670432605859, + "language_loss": 0.60392165, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62443542, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.35546875, + "step": 2417, + "time_per_iteration": 3.0824739933013916 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.03639972, + "balance_loss_mlp": 1.04795754, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9144560714513363, + "language_loss": 0.79237175, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8144896, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2418, + "time_per_iteration": 2.564497947692871 + }, + { + "auxiliary_loss_clip": 0.01150145, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.03267837, + "balance_loss_mlp": 1.04712582, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8755047341617508, + "language_loss": 0.72032261, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74234051, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2419, + "time_per_iteration": 2.457617998123169 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.0335021, + "balance_loss_mlp": 1.05042267, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3659429121693525, + "language_loss": 0.90125811, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92333627, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.03125, + "step": 2420, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.04868603, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.2940003535379057, + "language_loss": 0.83309549, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85520703, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0546875, + "step": 2421, + "time_per_iteration": 2.441432476043701 + }, + { + "auxiliary_loss_clip": 0.01153189, + "auxiliary_loss_mlp": 0.01053683, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.04684627, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.6167157199382733, + "language_loss": 0.81511533, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83718407, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2422, + "time_per_iteration": 2.473010540008545 + }, + { + "auxiliary_loss_clip": 0.01046424, + "auxiliary_loss_mlp": 0.01017838, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.01065397, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9226410759759552, + "language_loss": 0.63245702, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65309966, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.35742188, + "step": 2423, + "time_per_iteration": 3.0516433715820312 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.02756512, + "balance_loss_mlp": 1.05096769, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7656587875688796, + "language_loss": 0.8267172, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84872198, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.046875, + "step": 2424, + "time_per_iteration": 2.4918792247772217 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03071296, + "balance_loss_mlp": 1.04970837, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.0603730404595915, + "language_loss": 0.79317909, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81520677, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2425, + "time_per_iteration": 2.4607083797454834 + }, + { + "auxiliary_loss_clip": 0.01155851, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.05136847, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4026453111661703, + "language_loss": 0.83269531, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85473925, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2426, + "time_per_iteration": 2.4615883827209473 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.02420735, + "balance_loss_mlp": 1.05100346, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.78851961601388, + "language_loss": 0.86878085, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89073801, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0234375, + "step": 2427, + "time_per_iteration": 2.46846866607666 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.05060291, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9424277979169204, + "language_loss": 0.66795039, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69001138, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2428, + "time_per_iteration": 2.4277987480163574 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.02870345, + "balance_loss_mlp": 1.05036306, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7447652065053452, + "language_loss": 0.8363744, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2429, + "time_per_iteration": 2.5208661556243896 + }, + { + "auxiliary_loss_clip": 0.01152615, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.04804671, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.723947749216575, + "language_loss": 0.78811824, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8101294, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2430, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0115835, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03014231, + "balance_loss_mlp": 1.0529238, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.3723861833809767, + "language_loss": 0.83178174, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85385835, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2431, + "time_per_iteration": 2.468472480773926 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.03244448, + "balance_loss_mlp": 1.05131185, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7402379411604871, + "language_loss": 0.78777766, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80989814, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.078125, + "step": 2432, + "time_per_iteration": 2.4618513584136963 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.03158331, + "balance_loss_mlp": 1.04917812, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9105383938395448, + "language_loss": 0.79940903, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82146859, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2433, + "time_per_iteration": 2.4901435375213623 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.01051119, + "balance_loss_clip": 1.03149712, + "balance_loss_mlp": 1.05186844, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.8984055506020234, + "language_loss": 0.78421938, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80625868, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2434, + "time_per_iteration": 3.833007335662842 + }, + { + "auxiliary_loss_clip": 0.01046525, + "auxiliary_loss_mlp": 0.01005945, + "balance_loss_clip": 1.00356054, + "balance_loss_mlp": 1.01038933, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8674820067375166, + "language_loss": 0.58373666, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60426134, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.36132812, + "step": 2435, + "time_per_iteration": 5.911077499389648 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.02620411, + "balance_loss_mlp": 1.04662895, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.2832294661951753, + "language_loss": 0.88395989, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90589368, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2436, + "time_per_iteration": 2.440303325653076 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.05032742, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.0196076648737, + "language_loss": 0.74832988, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7703594, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2437, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.04798007, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.346268485469047, + "language_loss": 0.73932636, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76131272, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2438, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.02310383, + "balance_loss_mlp": 1.05231857, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.8289443089735578, + "language_loss": 0.74791402, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76987189, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2439, + "time_per_iteration": 2.4596338272094727 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.02872145, + "balance_loss_mlp": 1.04913521, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.106046924266039, + "language_loss": 0.74542844, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742673, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 2440, + "time_per_iteration": 2.613889217376709 + }, + { + "auxiliary_loss_clip": 0.01146734, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02964425, + "balance_loss_mlp": 1.04660702, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6151911954653986, + "language_loss": 0.83047861, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242939, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2441, + "time_per_iteration": 2.508570432662964 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.0275681, + "balance_loss_mlp": 1.04952955, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 3.362343971731744, + "language_loss": 0.71562135, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73766863, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2442, + "time_per_iteration": 2.4903416633605957 + }, + { + "auxiliary_loss_clip": 0.01160717, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.0510819, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.2762909335645043, + "language_loss": 0.80804002, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83006966, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2443, + "time_per_iteration": 2.424539089202881 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.0504694, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 2.077049554342068, + "language_loss": 0.8297509, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85179389, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2444, + "time_per_iteration": 2.4937214851379395 + }, + { + "auxiliary_loss_clip": 0.01154781, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.05025554, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.763635964291881, + "language_loss": 0.71218902, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73422623, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2445, + "time_per_iteration": 2.491645336151123 + }, + { + "auxiliary_loss_clip": 0.01045345, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.00942683, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8232649654452494, + "language_loss": 0.63138294, + "learning_rate": 3.857965866494923e-06, + "loss": 0.6521225, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.359375, + "step": 2446, + "time_per_iteration": 2.9610531330108643 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.02355385, + "balance_loss_mlp": 1.05348802, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8119571313268434, + "language_loss": 0.74937665, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7713967, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2447, + "time_per_iteration": 2.547112226486206 + }, + { + "auxiliary_loss_clip": 0.0115445, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02838051, + "balance_loss_mlp": 1.04998112, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.0554455972062744, + "language_loss": 0.85722244, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87923658, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2448, + "time_per_iteration": 2.519530773162842 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.0081377, + "balance_loss_mlp": 1.00952029, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7649510042513386, + "language_loss": 0.56836212, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58892155, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.359375, + "step": 2449, + "time_per_iteration": 3.0049068927764893 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.04850447, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.900224172693126, + "language_loss": 0.85544562, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87738931, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2450, + "time_per_iteration": 2.5826945304870605 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.05074143, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.029178420182481, + "language_loss": 0.74693608, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76899183, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2451, + "time_per_iteration": 2.4345250129699707 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.02092934, + "balance_loss_mlp": 1.04758763, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6073898366987713, + "language_loss": 0.82240498, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8442679, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2452, + "time_per_iteration": 2.468869924545288 + }, + { + "auxiliary_loss_clip": 0.01154267, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02032936, + "balance_loss_mlp": 1.05154371, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.7191329381743174, + "language_loss": 0.74021572, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76214325, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2453, + "time_per_iteration": 2.433424472808838 + }, + { + "auxiliary_loss_clip": 0.01154761, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.03048682, + "balance_loss_mlp": 1.04918802, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.171465059586897, + "language_loss": 0.76326835, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78531623, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2454, + "time_per_iteration": 2.419368028640747 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.04922831, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.006370127686132, + "language_loss": 0.8301537, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85209435, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2455, + "time_per_iteration": 2.426819324493408 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.04846048, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.442844218228049, + "language_loss": 0.83938581, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8613984, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.09375, + "step": 2456, + "time_per_iteration": 2.525296926498413 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.02459574, + "balance_loss_mlp": 1.04980254, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8839437807359896, + "language_loss": 0.84325618, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86520827, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2457, + "time_per_iteration": 2.471068859100342 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.04932761, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9862753985638202, + "language_loss": 0.75645256, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77835512, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2458, + "time_per_iteration": 2.44146466255188 + }, + { + "auxiliary_loss_clip": 0.01160318, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.0284996, + "balance_loss_mlp": 1.05119324, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.405388225865701, + "language_loss": 0.83817005, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86026746, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2459, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.0489651, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.6666731923680733, + "language_loss": 0.75856471, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78047681, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2460, + "time_per_iteration": 2.4294657707214355 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.05102873, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6904429322803973, + "language_loss": 0.81591463, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83791113, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0703125, + "step": 2461, + "time_per_iteration": 2.4993178844451904 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.05356562, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2471604819605036, + "language_loss": 0.65689576, + "learning_rate": 3.855650475213761e-06, + "loss": 0.678958, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2462, + "time_per_iteration": 2.4197235107421875 + }, + { + "auxiliary_loss_clip": 0.0115574, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.02929282, + "balance_loss_mlp": 1.05148113, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4717210360784851, + "language_loss": 0.67368174, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69572735, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0390625, + "step": 2463, + "time_per_iteration": 2.774268865585327 + }, + { + "auxiliary_loss_clip": 0.01154664, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 1.04978383, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.177919724516607, + "language_loss": 0.76567936, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78772676, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2464, + "time_per_iteration": 2.4522674083709717 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.03089297, + "balance_loss_mlp": 1.05009413, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.623144605896263, + "language_loss": 0.79623306, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81824923, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0234375, + "step": 2465, + "time_per_iteration": 2.4946794509887695 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02543747, + "balance_loss_mlp": 1.0522809, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.8838905575360925, + "language_loss": 0.76230991, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78436887, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2466, + "time_per_iteration": 2.4722483158111572 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01613474, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8013334536894682, + "language_loss": 0.60022712, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62095666, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.3671875, + "step": 2467, + "time_per_iteration": 3.0702927112579346 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.05059397, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.3345318496369405, + "language_loss": 0.87671721, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89869595, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.046875, + "step": 2468, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.0322901, + "balance_loss_mlp": 1.05078602, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 4.884804263226826, + "language_loss": 0.75884396, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78094912, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2469, + "time_per_iteration": 2.4750967025756836 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.03425384, + "balance_loss_mlp": 1.04954958, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.457578452134473, + "language_loss": 0.76183128, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78390741, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2470, + "time_per_iteration": 2.4312937259674072 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.05050206, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9398758609720104, + "language_loss": 0.72121894, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74320322, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2471, + "time_per_iteration": 2.519866466522217 + }, + { + "auxiliary_loss_clip": 0.01160204, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.0272181, + "balance_loss_mlp": 1.0499022, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.11598070664324, + "language_loss": 0.89739621, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91947466, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1015625, + "step": 2472, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_clip": 1.030123, + "balance_loss_mlp": 1.05059123, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 4.013793804030176, + "language_loss": 0.80734539, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82939184, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2473, + "time_per_iteration": 2.4329466819763184 + }, + { + "auxiliary_loss_clip": 0.0115911, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.05129409, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5981192604624526, + "language_loss": 0.77540123, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79762381, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2474, + "time_per_iteration": 2.453432083129883 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.04955983, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8103491271764227, + "language_loss": 0.82315612, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84531218, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0859375, + "step": 2475, + "time_per_iteration": 2.4591174125671387 + }, + { + "auxiliary_loss_clip": 0.01157844, + "auxiliary_loss_mlp": 0.01058234, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.05399168, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9240192853863896, + "language_loss": 0.80811602, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0390625, + "step": 2476, + "time_per_iteration": 3.810553789138794 + }, + { + "auxiliary_loss_clip": 0.01148934, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.03467607, + "balance_loss_mlp": 1.05016851, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.8396010916090604, + "language_loss": 0.77889222, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80091178, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98828125, + "step": 2477, + "time_per_iteration": 4.031312942504883 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.01018076, + "balance_loss_clip": 1.01581085, + "balance_loss_mlp": 1.01302671, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8050876444063699, + "language_loss": 0.60130364, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197196, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.35742188, + "step": 2478, + "time_per_iteration": 3.1073787212371826 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.05078554, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.232556799389181, + "language_loss": 0.70951897, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7315169, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2479, + "time_per_iteration": 2.475215435028076 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.0313679, + "balance_loss_mlp": 1.04886127, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5896653051626852, + "language_loss": 0.80748487, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2480, + "time_per_iteration": 2.4618492126464844 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.02838397, + "balance_loss_mlp": 1.05017209, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.4101793906634894, + "language_loss": 0.84132183, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2481, + "time_per_iteration": 2.437391519546509 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03046227, + "balance_loss_mlp": 1.04808569, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 3.194199563979109, + "language_loss": 0.77347398, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79551256, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.046875, + "step": 2482, + "time_per_iteration": 2.4710068702697754 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01939583, + "balance_loss_mlp": 1.05186439, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.968394626295353, + "language_loss": 0.78719991, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80922014, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2483, + "time_per_iteration": 2.5075182914733887 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.04774714, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.642113570978582, + "language_loss": 0.70521605, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72709513, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 1.0, + "step": 2484, + "time_per_iteration": 2.4810657501220703 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02049971, + "balance_loss_mlp": 1.04769683, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.5518326423103654, + "language_loss": 0.84396368, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86592442, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0703125, + "step": 2485, + "time_per_iteration": 2.47004771232605 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.04906201, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1854599778658663, + "language_loss": 0.84902173, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87102306, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2486, + "time_per_iteration": 2.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.04672825, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.4579579723442855, + "language_loss": 0.74329305, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76516318, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 1.015625, + "step": 2487, + "time_per_iteration": 2.436316967010498 + }, + { + "auxiliary_loss_clip": 0.01148703, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.02934861, + "balance_loss_mlp": 1.04707325, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.1423480103066375, + "language_loss": 0.71837348, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74034101, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2488, + "time_per_iteration": 2.649794816970825 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02780962, + "balance_loss_mlp": 1.04946375, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.5167610907777513, + "language_loss": 0.70519507, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72722483, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0703125, + "step": 2489, + "time_per_iteration": 2.416708469390869 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.02637911, + "balance_loss_mlp": 1.04785299, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 6.063777716142612, + "language_loss": 0.81789696, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83988589, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2490, + "time_per_iteration": 2.433284282684326 + }, + { + "auxiliary_loss_clip": 0.0115747, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.02357852, + "balance_loss_mlp": 1.05097246, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.781748843431282, + "language_loss": 0.79878485, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82078111, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2491, + "time_per_iteration": 2.616642475128174 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.02491403, + "balance_loss_mlp": 1.04683256, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.263792295832721, + "language_loss": 0.90779251, + "learning_rate": 3.851260581551727e-06, + "loss": 0.9297986, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.078125, + "step": 2492, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.04883122, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.7210225604175116, + "language_loss": 0.79162109, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8137427, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2493, + "time_per_iteration": 2.4228014945983887 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02299631, + "balance_loss_mlp": 1.04643607, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.8095511996528297, + "language_loss": 0.80186284, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82380015, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2494, + "time_per_iteration": 2.4774162769317627 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.04731214, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9697458415941205, + "language_loss": 0.65825832, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68021536, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.015625, + "step": 2495, + "time_per_iteration": 2.87758207321167 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 0.99992257, + "balance_loss_mlp": 1.01668406, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 1.1924806916138095, + "language_loss": 0.59488082, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61543506, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2496, + "time_per_iteration": 3.0807061195373535 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0468092, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.296903755979897, + "language_loss": 0.65457296, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0546875, + "step": 2497, + "time_per_iteration": 2.4403655529022217 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.03021121, + "balance_loss_mlp": 1.05125117, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4500790349521295, + "language_loss": 0.75247943, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77452457, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2498, + "time_per_iteration": 2.5286927223205566 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04910398, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.1627878003877257, + "language_loss": 0.72073609, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74272656, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2499, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.03001857, + "balance_loss_mlp": 1.04765654, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.7935878764928508, + "language_loss": 0.7195605, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74158442, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2500, + "time_per_iteration": 2.5504300594329834 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.04960001, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.491284008551419, + "language_loss": 0.64973354, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67184103, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.03125, + "step": 2501, + "time_per_iteration": 2.587292432785034 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03940582, + "balance_loss_mlp": 1.04861319, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0240839018319, + "language_loss": 0.83043593, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85256565, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2502, + "time_per_iteration": 2.470350980758667 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01050766, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.04702473, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.3174234065433597, + "language_loss": 0.77197748, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2503, + "time_per_iteration": 2.6598432064056396 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.04901898, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1270494317377007, + "language_loss": 0.85432625, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87628305, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2504, + "time_per_iteration": 2.7323355674743652 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04855871, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6383963769174188, + "language_loss": 0.83226919, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85418344, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.015625, + "step": 2505, + "time_per_iteration": 2.4866323471069336 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.04672468, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.268670074130615, + "language_loss": 0.7639147, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78588635, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0390625, + "step": 2506, + "time_per_iteration": 2.4266390800476074 + }, + { + "auxiliary_loss_clip": 0.01156061, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.04987144, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 4.189374997051622, + "language_loss": 0.76202261, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78401417, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2507, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.02519584, + "balance_loss_mlp": 1.04538798, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.4120052182021503, + "language_loss": 0.69041586, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71230054, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2508, + "time_per_iteration": 2.4462738037109375 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.02870142, + "balance_loss_mlp": 1.05190873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8904486830015208, + "language_loss": 0.77516425, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79719174, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2509, + "time_per_iteration": 2.47723126411438 + }, + { + "auxiliary_loss_clip": 0.01160822, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.0307281, + "balance_loss_mlp": 1.05027628, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.607083522867767, + "language_loss": 0.80497003, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82710105, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1015625, + "step": 2510, + "time_per_iteration": 2.4445176124572754 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.0336144, + "balance_loss_mlp": 1.05078745, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.033214689307001, + "language_loss": 0.73913604, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76124156, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2511, + "time_per_iteration": 2.4372222423553467 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.04880548, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.077792778828972, + "language_loss": 0.6935091, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71543926, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.03125, + "step": 2512, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01154623, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.05130434, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 3.0703205269170364, + "language_loss": 0.73833334, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76034975, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.03125, + "step": 2513, + "time_per_iteration": 2.5560262203216553 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 0.99995023, + "balance_loss_mlp": 1.01588845, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8742342414591, + "language_loss": 0.64759278, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6681329, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.36328125, + "step": 2514, + "time_per_iteration": 3.0147135257720947 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.02588964, + "balance_loss_mlp": 1.04910421, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.6951033245551597, + "language_loss": 0.73257691, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75452447, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2515, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04967082, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8637331039353218, + "language_loss": 0.76990104, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79184443, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2516, + "time_per_iteration": 2.4672725200653076 + }, + { + "auxiliary_loss_clip": 0.01049641, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.01351547, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.745436195681612, + "language_loss": 0.54673135, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56726485, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36132812, + "step": 2517, + "time_per_iteration": 3.0677855014801025 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02126849, + "balance_loss_mlp": 1.04780149, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.2326216563166983, + "language_loss": 0.78515786, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8070842, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2518, + "time_per_iteration": 3.8305110931396484 + }, + { + "auxiliary_loss_clip": 0.01159011, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.05163026, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1364726943924772, + "language_loss": 0.70153689, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72361219, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2519, + "time_per_iteration": 3.9920616149902344 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.04812384, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9802508383478334, + "language_loss": 0.79219216, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81415105, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2520, + "time_per_iteration": 2.4853925704956055 + }, + { + "auxiliary_loss_clip": 0.01155647, + "auxiliary_loss_mlp": 0.01050752, + "balance_loss_clip": 1.03090394, + "balance_loss_mlp": 1.05067897, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.075013959426641, + "language_loss": 0.74324691, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76531088, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2521, + "time_per_iteration": 2.6154706478118896 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.05273759, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.7623729867934737, + "language_loss": 0.81996739, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84203184, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.078125, + "step": 2522, + "time_per_iteration": 2.4873530864715576 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01001752, + "balance_loss_clip": 0.99982071, + "balance_loss_mlp": 1.01252866, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.824359498034346, + "language_loss": 0.57915509, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59966022, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36328125, + "step": 2523, + "time_per_iteration": 2.998990774154663 + }, + { + "auxiliary_loss_clip": 0.01153336, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.03147376, + "balance_loss_mlp": 1.04972816, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.970015434384356, + "language_loss": 0.7485956, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77063495, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2524, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.0115237, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.0488894, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8388163356316347, + "language_loss": 0.74780655, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76977956, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2525, + "time_per_iteration": 2.431143283843994 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.02432156, + "balance_loss_mlp": 1.05145812, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8962457769996104, + "language_loss": 0.79644465, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81845224, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2526, + "time_per_iteration": 2.5167391300201416 + }, + { + "auxiliary_loss_clip": 0.01151222, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.0271492, + "balance_loss_mlp": 1.05228162, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8025865198757494, + "language_loss": 0.84928662, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87124068, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9921875, + "step": 2527, + "time_per_iteration": 2.4550719261169434 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02321947, + "balance_loss_mlp": 1.04876995, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2810224367730156, + "language_loss": 0.69326001, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71518755, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.03125, + "step": 2528, + "time_per_iteration": 2.610042095184326 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0288415, + "balance_loss_mlp": 1.05137038, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.490892546855648, + "language_loss": 0.86502308, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88703495, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2529, + "time_per_iteration": 2.4695634841918945 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.04683101, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.8772276619965056, + "language_loss": 0.83002013, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85188091, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2530, + "time_per_iteration": 2.476238489151001 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.02550209, + "balance_loss_mlp": 1.04987955, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.344030506991615, + "language_loss": 0.80540878, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82738853, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2531, + "time_per_iteration": 2.443617105484009 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.03138137, + "balance_loss_mlp": 1.04895151, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0816362099746017, + "language_loss": 0.79241651, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81440473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.0078125, + "step": 2532, + "time_per_iteration": 2.5071239471435547 + }, + { + "auxiliary_loss_clip": 0.0115001, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.02694106, + "balance_loss_mlp": 1.04952455, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.8298502444413876, + "language_loss": 0.87712961, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89909488, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2533, + "time_per_iteration": 2.5262463092803955 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02932572, + "balance_loss_mlp": 1.04766071, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.2606742211331556, + "language_loss": 0.79057097, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81255192, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.03125, + "step": 2534, + "time_per_iteration": 2.4421815872192383 + }, + { + "auxiliary_loss_clip": 0.01156406, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.02177238, + "balance_loss_mlp": 1.04847312, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.8989864742133933, + "language_loss": 0.76862979, + "learning_rate": 3.844858260274702e-06, + "loss": 0.7906096, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2535, + "time_per_iteration": 2.4193530082702637 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02153718, + "balance_loss_mlp": 1.04885459, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.234687708038525, + "language_loss": 0.78185135, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80381751, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0703125, + "step": 2536, + "time_per_iteration": 2.478066921234131 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.03305459, + "balance_loss_mlp": 1.05067229, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.124557148089124, + "language_loss": 0.74979979, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77181387, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2537, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01152934, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02545929, + "balance_loss_mlp": 1.04965043, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.005826380833244, + "language_loss": 0.77631724, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79828459, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2538, + "time_per_iteration": 2.527730941772461 + }, + { + "auxiliary_loss_clip": 0.01147714, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.02308786, + "balance_loss_mlp": 1.04806781, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6961003069906246, + "language_loss": 0.89707708, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9189558, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.99609375, + "step": 2539, + "time_per_iteration": 2.485410451889038 + }, + { + "auxiliary_loss_clip": 0.01151642, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.05028892, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1834515010765627, + "language_loss": 0.93514961, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95709753, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.015625, + "step": 2540, + "time_per_iteration": 2.5399627685546875 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0266571, + "balance_loss_mlp": 1.04625463, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.9271166035098393, + "language_loss": 0.75039941, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77228808, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2541, + "time_per_iteration": 2.516559362411499 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.03025603, + "balance_loss_mlp": 1.04787207, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7480154890803248, + "language_loss": 0.81308234, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83504558, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.99609375, + "step": 2542, + "time_per_iteration": 2.4681694507598877 + }, + { + "auxiliary_loss_clip": 0.01150381, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.03213799, + "balance_loss_mlp": 1.04772067, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.009812895323552, + "language_loss": 0.77568293, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79769456, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2543, + "time_per_iteration": 2.4899120330810547 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.04692626, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.3128696364379935, + "language_loss": 0.86483204, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675725, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2544, + "time_per_iteration": 2.4774844646453857 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.03287029, + "balance_loss_mlp": 1.04675508, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0966566192890106, + "language_loss": 0.8228749, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84493077, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0546875, + "step": 2545, + "time_per_iteration": 2.4526925086975098 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.04802954, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.540509049886226, + "language_loss": 0.70711339, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72905338, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2546, + "time_per_iteration": 2.5009732246398926 + }, + { + "auxiliary_loss_clip": 0.01152354, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.02423596, + "balance_loss_mlp": 1.04967904, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5770850469719229, + "language_loss": 0.77521312, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2547, + "time_per_iteration": 2.6822421550750732 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.01047861, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 1.04904902, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0900989153424976, + "language_loss": 0.73985445, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76185566, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2548, + "time_per_iteration": 2.59080171585083 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03158915, + "balance_loss_mlp": 1.04806828, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.499185349529517, + "language_loss": 0.80589813, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82791066, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2549, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01048572, + "balance_loss_clip": 1.02951026, + "balance_loss_mlp": 1.04750037, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.687491024735964, + "language_loss": 0.74760693, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76959932, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2550, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01153822, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.02035773, + "balance_loss_mlp": 1.04903841, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.205632522725416, + "language_loss": 0.76839805, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79033309, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2551, + "time_per_iteration": 2.468886375427246 + }, + { + "auxiliary_loss_clip": 0.01045401, + "auxiliary_loss_mlp": 0.01020401, + "balance_loss_clip": 1.01873255, + "balance_loss_mlp": 1.0102303, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9464853846906186, + "language_loss": 0.56666422, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58732224, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.3515625, + "step": 2552, + "time_per_iteration": 3.0059380531311035 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.04793155, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.2490122092843947, + "language_loss": 0.88505352, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703511, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2553, + "time_per_iteration": 2.4523322582244873 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.02858269, + "balance_loss_mlp": 1.04771137, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8003580088176259, + "language_loss": 0.78462374, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80663538, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2554, + "time_per_iteration": 2.48526668548584 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03212881, + "balance_loss_mlp": 1.04941773, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.4926146542113763, + "language_loss": 0.78344929, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80551672, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2555, + "time_per_iteration": 2.4687228202819824 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053536, + "balance_loss_clip": 1.03543973, + "balance_loss_mlp": 1.04890609, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6634961059278193, + "language_loss": 0.76901627, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.7910428, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2556, + "time_per_iteration": 2.5006635189056396 + }, + { + "auxiliary_loss_clip": 0.01145988, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.02362633, + "balance_loss_mlp": 1.04657805, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8623555031997667, + "language_loss": 0.89489496, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9167788, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2557, + "time_per_iteration": 2.4434657096862793 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.0263536, + "balance_loss_mlp": 1.04834831, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.0348980361104587, + "language_loss": 0.7119934, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73397368, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2558, + "time_per_iteration": 2.490226984024048 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.04888546, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2935483083292705, + "language_loss": 0.92370701, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94570613, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2559, + "time_per_iteration": 3.885131597518921 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_clip": 1.03331971, + "balance_loss_mlp": 1.05068171, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 5.140445938018919, + "language_loss": 0.63637704, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65846419, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2560, + "time_per_iteration": 5.343889236450195 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02725124, + "balance_loss_mlp": 1.04950392, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8613162525264346, + "language_loss": 0.88230681, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90431374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2561, + "time_per_iteration": 2.4648611545562744 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.02433765, + "balance_loss_mlp": 1.0477581, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8458305826175445, + "language_loss": 0.82909077, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85096323, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 2562, + "time_per_iteration": 2.4327874183654785 + }, + { + "auxiliary_loss_clip": 0.01160792, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.02614117, + "balance_loss_mlp": 1.05274105, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8513620412223286, + "language_loss": 0.74713194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7692166, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.078125, + "step": 2563, + "time_per_iteration": 2.4246435165405273 + }, + { + "auxiliary_loss_clip": 0.01152598, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.04708791, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 4.308351588789828, + "language_loss": 0.75896233, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2564, + "time_per_iteration": 2.5528018474578857 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.03233564, + "balance_loss_mlp": 1.04782677, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.9915177170702032, + "language_loss": 0.70825899, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73026133, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2565, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.04728019, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.308308002927142, + "language_loss": 0.71535969, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73736489, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0234375, + "step": 2566, + "time_per_iteration": 2.498033285140991 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.02379811, + "balance_loss_mlp": 1.04381752, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7584763964610812, + "language_loss": 0.85129261, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87315124, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0, + "step": 2567, + "time_per_iteration": 2.46708083152771 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.0491097, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.4904852760766127, + "language_loss": 0.78025472, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80226958, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2568, + "time_per_iteration": 2.476029634475708 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.01974905, + "balance_loss_mlp": 1.04835856, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.967048361077992, + "language_loss": 0.70183134, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72373807, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2569, + "time_per_iteration": 2.4566383361816406 + }, + { + "auxiliary_loss_clip": 0.011445, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.04563344, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7954711420319855, + "language_loss": 0.76502788, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78690279, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2570, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.04811645, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 7.2402617485583525, + "language_loss": 0.77214551, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7940833, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2571, + "time_per_iteration": 2.4532222747802734 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.04835165, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.949392721600437, + "language_loss": 0.82254899, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84454399, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2572, + "time_per_iteration": 2.4919703006744385 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.02838445, + "balance_loss_mlp": 1.04827368, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.621727953381826, + "language_loss": 0.90506172, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92705798, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2573, + "time_per_iteration": 2.4679911136627197 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.03724563, + "balance_loss_mlp": 1.04919529, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7899098306423509, + "language_loss": 0.70378339, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72587025, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2574, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01150284, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 1.04641008, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.761755301023602, + "language_loss": 0.82718939, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84917951, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 2575, + "time_per_iteration": 2.4515788555145264 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.0456214, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.21774000772259, + "language_loss": 0.84661531, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86859256, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2576, + "time_per_iteration": 2.5052003860473633 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02414751, + "balance_loss_mlp": 1.04679108, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.816317520286285, + "language_loss": 0.81942815, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84135079, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2577, + "time_per_iteration": 2.5133895874023438 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.04980743, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.384736720709717, + "language_loss": 0.76260924, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78462768, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2578, + "time_per_iteration": 2.5140793323516846 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.02630556, + "balance_loss_mlp": 1.04832911, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.651100693067537, + "language_loss": 0.82420707, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84617954, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2579, + "time_per_iteration": 2.4410548210144043 + }, + { + "auxiliary_loss_clip": 0.01152359, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.03056741, + "balance_loss_mlp": 1.05137682, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6356270056083286, + "language_loss": 0.80460835, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2580, + "time_per_iteration": 2.457929849624634 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01014696, + "balance_loss_clip": 1.0128479, + "balance_loss_mlp": 1.01473403, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.84873853717235, + "language_loss": 0.58840239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60905427, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.35742188, + "step": 2581, + "time_per_iteration": 3.1725480556488037 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02615237, + "balance_loss_mlp": 1.04869819, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8637973548327127, + "language_loss": 0.85214508, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87412429, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2582, + "time_per_iteration": 2.486454963684082 + }, + { + "auxiliary_loss_clip": 0.01150766, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.04837251, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.457099081417407, + "language_loss": 0.78432047, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80638009, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0234375, + "step": 2583, + "time_per_iteration": 2.468686580657959 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.04853427, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6727812592242826, + "language_loss": 0.76121294, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78327382, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2584, + "time_per_iteration": 2.5471444129943848 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02746594, + "balance_loss_mlp": 1.04740906, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0194610159936324, + "language_loss": 0.75623107, + "learning_rate": 3.837251082205368e-06, + "loss": 0.7781868, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2585, + "time_per_iteration": 2.4448020458221436 + }, + { + "auxiliary_loss_clip": 0.01146182, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03101528, + "balance_loss_mlp": 1.04662418, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.233481730992117, + "language_loss": 0.611651, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63361114, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2586, + "time_per_iteration": 2.4375994205474854 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02814651, + "balance_loss_mlp": 1.04623449, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8194244944539537, + "language_loss": 0.8108865, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83286583, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.046875, + "step": 2587, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.04851258, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.8978014455674168, + "language_loss": 0.88844347, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91058075, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.0625, + "step": 2588, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01150101, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.03351235, + "balance_loss_mlp": 1.04859662, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.6765596364055266, + "language_loss": 0.64950025, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6715408, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.015625, + "step": 2589, + "time_per_iteration": 2.5106732845306396 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.0483036, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7614316666112095, + "language_loss": 0.82610166, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84805739, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2590, + "time_per_iteration": 2.519573211669922 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.02761662, + "balance_loss_mlp": 1.04740536, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1478399705358195, + "language_loss": 0.78919029, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81117558, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2591, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01051358, + "balance_loss_clip": 1.03271413, + "balance_loss_mlp": 1.04902434, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 1.9877262596002243, + "language_loss": 0.64780253, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66981632, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2592, + "time_per_iteration": 2.5992095470428467 + }, + { + "auxiliary_loss_clip": 0.01156577, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.03195322, + "balance_loss_mlp": 1.0518856, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.6077304694487062, + "language_loss": 0.81806099, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84015012, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2593, + "time_per_iteration": 2.4317471981048584 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02876306, + "balance_loss_mlp": 1.04862404, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.3131099691306445, + "language_loss": 0.72585857, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7478416, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0, + "step": 2594, + "time_per_iteration": 2.454946994781494 + }, + { + "auxiliary_loss_clip": 0.01145676, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.02514088, + "balance_loss_mlp": 1.0476191, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 1.980280068020953, + "language_loss": 0.8170377, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83893895, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 2595, + "time_per_iteration": 2.4859232902526855 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.04722846, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.3729637830877177, + "language_loss": 0.86587811, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88784146, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2596, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.02558839, + "balance_loss_mlp": 1.04831815, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6624104890405602, + "language_loss": 0.68610018, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70800316, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2597, + "time_per_iteration": 2.447265625 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.02210891, + "balance_loss_mlp": 1.04714298, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.638980754682227, + "language_loss": 0.79885375, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82070029, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2598, + "time_per_iteration": 2.4641571044921875 + }, + { + "auxiliary_loss_clip": 0.01141262, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02003777, + "balance_loss_mlp": 1.04484367, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.19687533686526, + "language_loss": 0.82877028, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85057342, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96484375, + "step": 2599, + "time_per_iteration": 2.419464111328125 + }, + { + "auxiliary_loss_clip": 0.01155461, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.04991198, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.412785735027946, + "language_loss": 0.81813747, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84021574, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2600, + "time_per_iteration": 2.408848524093628 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.02813435, + "balance_loss_mlp": 1.05145574, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8570517134994367, + "language_loss": 0.8869983, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90900552, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2601, + "time_per_iteration": 3.8960022926330566 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.04250216, + "balance_loss_mlp": 1.05294669, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6572791804428935, + "language_loss": 0.78657669, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80877781, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0234375, + "step": 2602, + "time_per_iteration": 5.330498456954956 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.02178836, + "balance_loss_mlp": 1.04872918, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.9481072701353659, + "language_loss": 0.73668396, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75858229, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.99609375, + "step": 2603, + "time_per_iteration": 2.4632985591888428 + }, + { + "auxiliary_loss_clip": 0.01152236, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.03205693, + "balance_loss_mlp": 1.05066442, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.4624008692922583, + "language_loss": 0.87223339, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89427507, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2604, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.02404523, + "balance_loss_mlp": 1.04892218, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.883819023069068, + "language_loss": 0.85465723, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87660539, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2605, + "time_per_iteration": 2.4958839416503906 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0274334, + "balance_loss_mlp": 1.04840827, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.4518366617864897, + "language_loss": 0.72954321, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75154853, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2606, + "time_per_iteration": 2.5142898559570312 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.05257165, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.9820673877795116, + "language_loss": 0.7643044, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78635812, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2607, + "time_per_iteration": 2.433779239654541 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0282656, + "balance_loss_mlp": 1.05097091, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7850270515341367, + "language_loss": 0.8191157, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8410849, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2608, + "time_per_iteration": 2.4599456787109375 + }, + { + "auxiliary_loss_clip": 0.0115477, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.05087662, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.762197880640894, + "language_loss": 0.72479111, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74684954, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0390625, + "step": 2609, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.02415729, + "balance_loss_mlp": 1.04881263, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8833233307981396, + "language_loss": 0.71974212, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74171209, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.046875, + "step": 2610, + "time_per_iteration": 2.468616247177124 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03119481, + "balance_loss_mlp": 1.04865789, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0486839750324117, + "language_loss": 0.72148776, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2611, + "time_per_iteration": 2.4812967777252197 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.05081797, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1526303920645153, + "language_loss": 0.70732605, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72930443, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2612, + "time_per_iteration": 2.4659905433654785 + }, + { + "auxiliary_loss_clip": 0.0115345, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.05112672, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.98698506128839, + "language_loss": 0.75649011, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77856034, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2613, + "time_per_iteration": 2.5053935050964355 + }, + { + "auxiliary_loss_clip": 0.01150247, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.03454411, + "balance_loss_mlp": 1.04870725, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7256548803860323, + "language_loss": 0.6593504, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68139917, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2614, + "time_per_iteration": 2.49568772315979 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01050381, + "balance_loss_clip": 1.02972233, + "balance_loss_mlp": 1.04979289, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1509467282749055, + "language_loss": 0.7554003, + "learning_rate": 3.832603126688072e-06, + "loss": 0.7774539, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0546875, + "step": 2615, + "time_per_iteration": 2.529383420944214 + }, + { + "auxiliary_loss_clip": 0.0115204, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.03374028, + "balance_loss_mlp": 1.05295634, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.616950748432624, + "language_loss": 0.72989607, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75194162, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9921875, + "step": 2616, + "time_per_iteration": 2.5096960067749023 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.03453839, + "balance_loss_mlp": 1.04991412, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.5663633553154774, + "language_loss": 0.72316766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74524403, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2617, + "time_per_iteration": 2.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01151577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.02453637, + "balance_loss_mlp": 1.05169988, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.0296559288157563, + "language_loss": 0.74336463, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76531827, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2618, + "time_per_iteration": 2.4584109783172607 + }, + { + "auxiliary_loss_clip": 0.01156356, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02976644, + "balance_loss_mlp": 1.05079079, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.116136233608656, + "language_loss": 0.78624105, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80832201, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0546875, + "step": 2619, + "time_per_iteration": 2.481902837753296 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.05213726, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.705564128099723, + "language_loss": 0.76632881, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78837597, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2620, + "time_per_iteration": 2.432645082473755 + }, + { + "auxiliary_loss_clip": 0.01153614, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.02901375, + "balance_loss_mlp": 1.05096626, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7942321132139696, + "language_loss": 0.70836174, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73039794, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2621, + "time_per_iteration": 2.5259244441986084 + }, + { + "auxiliary_loss_clip": 0.01156472, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.0277524, + "balance_loss_mlp": 1.05222857, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.5825564073202467, + "language_loss": 0.71880406, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74086076, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2622, + "time_per_iteration": 2.738351583480835 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02826762, + "balance_loss_mlp": 1.05162704, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7275011876813262, + "language_loss": 0.87603116, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89804244, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2623, + "time_per_iteration": 2.439276695251465 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.05301619, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7488793041913886, + "language_loss": 0.82132548, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84332693, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0078125, + "step": 2624, + "time_per_iteration": 2.5011823177337646 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.02720022, + "balance_loss_mlp": 1.0518285, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.213311097116894, + "language_loss": 0.79965818, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82170242, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2625, + "time_per_iteration": 2.469705581665039 + }, + { + "auxiliary_loss_clip": 0.01152837, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.03215635, + "balance_loss_mlp": 1.05189955, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0497226184185044, + "language_loss": 0.80393386, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82597172, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2626, + "time_per_iteration": 2.4822630882263184 + }, + { + "auxiliary_loss_clip": 0.01157567, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.02703679, + "balance_loss_mlp": 1.05660009, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8439314798963051, + "language_loss": 0.73819017, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76023501, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0078125, + "step": 2627, + "time_per_iteration": 2.5146384239196777 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01055765, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.05136025, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.581375347872909, + "language_loss": 0.84926289, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87135696, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0234375, + "step": 2628, + "time_per_iteration": 2.476461172103882 + }, + { + "auxiliary_loss_clip": 0.01152526, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02045107, + "balance_loss_mlp": 1.05181646, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9330212081502065, + "language_loss": 0.76414472, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2629, + "time_per_iteration": 2.4604575634002686 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03277516, + "balance_loss_mlp": 1.05376625, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.3335878107949624, + "language_loss": 0.73786485, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7599746, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0390625, + "step": 2630, + "time_per_iteration": 2.4556961059570312 + }, + { + "auxiliary_loss_clip": 0.01159154, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02934527, + "balance_loss_mlp": 1.05278432, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.0799062126580385, + "language_loss": 0.83732498, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85941184, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2631, + "time_per_iteration": 2.46466326713562 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 1.05072045, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8231521117013414, + "language_loss": 0.78509778, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80711424, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2632, + "time_per_iteration": 2.4678170680999756 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01058182, + "balance_loss_clip": 1.03766572, + "balance_loss_mlp": 1.05516291, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.1429957658458374, + "language_loss": 0.83250827, + "learning_rate": 3.829784322464594e-06, + "loss": 0.8546921, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2633, + "time_per_iteration": 2.4329495429992676 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.02641928, + "balance_loss_mlp": 1.05591452, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.9651575849984717, + "language_loss": 0.77401066, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79609084, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2634, + "time_per_iteration": 2.4989452362060547 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.05281138, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.178604932363088, + "language_loss": 0.89144027, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91352272, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0546875, + "step": 2635, + "time_per_iteration": 2.45926570892334 + }, + { + "auxiliary_loss_clip": 0.0115666, + "auxiliary_loss_mlp": 0.0105615, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.05145168, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.07071202721755, + "language_loss": 0.75814605, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78027415, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2636, + "time_per_iteration": 2.4601919651031494 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.05383635, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.192817266182781, + "language_loss": 0.72065628, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74272561, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.0625, + "step": 2637, + "time_per_iteration": 2.6509416103363037 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.05307317, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.9644709833035638, + "language_loss": 0.77938193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80135739, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2638, + "time_per_iteration": 2.516597032546997 + }, + { + "auxiliary_loss_clip": 0.01160159, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.05348861, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.8473853011869859, + "language_loss": 0.75521988, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77744359, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0625, + "step": 2639, + "time_per_iteration": 2.5517024993896484 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04115009, + "balance_loss_mlp": 1.0541048, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.7935559917311212, + "language_loss": 0.81487972, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83708692, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0546875, + "step": 2640, + "time_per_iteration": 2.5613112449645996 + }, + { + "auxiliary_loss_clip": 0.01152653, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.05107331, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4887809421561018, + "language_loss": 0.67051661, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69255233, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2641, + "time_per_iteration": 2.5603220462799072 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01057677, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.05338526, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.894977763056953, + "language_loss": 0.7508198, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77302957, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2642, + "time_per_iteration": 2.4783003330230713 + }, + { + "auxiliary_loss_clip": 0.01154514, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.0343703, + "balance_loss_mlp": 1.05342579, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1233146618452046, + "language_loss": 0.70096999, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72305882, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2643, + "time_per_iteration": 3.8417530059814453 + }, + { + "auxiliary_loss_clip": 0.0115474, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.05399418, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.266510625665779, + "language_loss": 0.78172421, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80374151, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2644, + "time_per_iteration": 3.918332099914551 + }, + { + "auxiliary_loss_clip": 0.01155626, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.03432608, + "balance_loss_mlp": 1.05189228, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8745538844001242, + "language_loss": 0.82203078, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84413457, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2645, + "time_per_iteration": 2.484264373779297 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01055562, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.05192447, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.974907168100252, + "language_loss": 0.69778836, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71991032, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2646, + "time_per_iteration": 2.5406665802001953 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.05206954, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5066454352116914, + "language_loss": 0.62659109, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64856541, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 2647, + "time_per_iteration": 2.442711353302002 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03226066, + "balance_loss_mlp": 1.05410099, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.1253745247586204, + "language_loss": 0.8942067, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91628385, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2648, + "time_per_iteration": 2.4649319648742676 + }, + { + "auxiliary_loss_clip": 0.01152722, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.05391204, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8651001097947648, + "language_loss": 0.91716385, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93918669, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 2649, + "time_per_iteration": 2.451261520385742 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.02802217, + "balance_loss_mlp": 1.05272281, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3226984417644028, + "language_loss": 0.71273595, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73485881, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1015625, + "step": 2650, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01153823, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.02695203, + "balance_loss_mlp": 1.05372715, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.4247432930640898, + "language_loss": 0.71116996, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73315561, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0, + "step": 2651, + "time_per_iteration": 2.467451572418213 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02912855, + "balance_loss_mlp": 1.0513978, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.909821572556346, + "language_loss": 0.7997523, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82179999, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2652, + "time_per_iteration": 2.519624948501587 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02665794, + "balance_loss_mlp": 1.05385149, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.695147262103697, + "language_loss": 0.70050812, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72250587, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2653, + "time_per_iteration": 2.439445972442627 + }, + { + "auxiliary_loss_clip": 0.01154814, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02309155, + "balance_loss_mlp": 1.05308652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.046273350718209, + "language_loss": 0.76509416, + "learning_rate": 3.826467306608095e-06, + "loss": 0.7870729, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2654, + "time_per_iteration": 2.529644012451172 + }, + { + "auxiliary_loss_clip": 0.01154147, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02750051, + "balance_loss_mlp": 1.0526185, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.961582700797155, + "language_loss": 0.8208828, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84289569, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2655, + "time_per_iteration": 2.4841158390045166 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.05125904, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.077546195878165, + "language_loss": 0.73565602, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75770259, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2656, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02766216, + "balance_loss_mlp": 1.05170095, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.884771930829773, + "language_loss": 0.77508467, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79704326, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2657, + "time_per_iteration": 2.801560401916504 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01048143, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.05459499, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6493844029380673, + "language_loss": 0.74807733, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77010089, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.99609375, + "step": 2658, + "time_per_iteration": 2.4434328079223633 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02912521, + "balance_loss_mlp": 1.05291355, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.8153435843839463, + "language_loss": 0.75194407, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77400887, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2659, + "time_per_iteration": 2.587700366973877 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.03295422, + "balance_loss_mlp": 1.05531979, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.4521775760186526, + "language_loss": 0.90417045, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92629218, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2660, + "time_per_iteration": 2.45237398147583 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.0300889, + "balance_loss_mlp": 1.05822825, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.0123178843036373, + "language_loss": 0.77552611, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79764044, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2661, + "time_per_iteration": 2.574652910232544 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_clip": 1.02913153, + "balance_loss_mlp": 1.05460262, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7348749157972516, + "language_loss": 0.74735796, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76943737, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2662, + "time_per_iteration": 2.506974935531616 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_clip": 1.03233898, + "balance_loss_mlp": 1.05416894, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.0770925688556074, + "language_loss": 0.82047677, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84257245, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2663, + "time_per_iteration": 2.459630012512207 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.02978826, + "balance_loss_mlp": 1.05576038, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5815812177362454, + "language_loss": 0.7910682, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81316602, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2664, + "time_per_iteration": 2.4978790283203125 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.8148985254226184, + "language_loss": 0.93767202, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95974499, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2665, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.02750635, + "balance_loss_mlp": 1.05352151, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.9534389472193405, + "language_loss": 0.85255575, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87460762, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.046875, + "step": 2666, + "time_per_iteration": 2.4229867458343506 + }, + { + "auxiliary_loss_clip": 0.01155877, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.02899504, + "balance_loss_mlp": 1.05404496, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.873987360542769, + "language_loss": 0.81461811, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83665401, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2667, + "time_per_iteration": 2.4989583492279053 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.03104627, + "balance_loss_mlp": 1.05707479, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.676276626789842, + "language_loss": 0.74079859, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76287973, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0078125, + "step": 2668, + "time_per_iteration": 2.463395357131958 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.05527806, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6382268793433732, + "language_loss": 0.77214229, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79424524, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2669, + "time_per_iteration": 2.5107781887054443 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01008943, + "balance_loss_clip": 1.0062964, + "balance_loss_mlp": 1.0249362, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072457077707946, + "language_loss": 0.55571371, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57640231, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.34960938, + "step": 2670, + "time_per_iteration": 2.964386463165283 + }, + { + "auxiliary_loss_clip": 0.01157188, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.05379438, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 8.31640977393562, + "language_loss": 0.77088535, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79289663, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2671, + "time_per_iteration": 2.4722845554351807 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.02684164, + "balance_loss_mlp": 1.05666459, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9636142117953166, + "language_loss": 0.64497644, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66702545, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2672, + "time_per_iteration": 2.5702145099639893 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.05270457, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.885579538712505, + "language_loss": 0.8533771, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87537158, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2673, + "time_per_iteration": 2.4754209518432617 + }, + { + "auxiliary_loss_clip": 0.01156938, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.03718424, + "balance_loss_mlp": 1.05537605, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.484212796080384, + "language_loss": 0.72797197, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75009739, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2674, + "time_per_iteration": 2.4771230220794678 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02745771, + "balance_loss_mlp": 1.05242229, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.0917218572710143, + "language_loss": 0.84550452, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86751789, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2675, + "time_per_iteration": 2.4583237171173096 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02890563, + "balance_loss_mlp": 1.0566349, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.979365293626276, + "language_loss": 0.82605797, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84813964, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0234375, + "step": 2676, + "time_per_iteration": 2.5966403484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.05701363, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.9372140801278581, + "language_loss": 0.73252106, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75459909, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2677, + "time_per_iteration": 2.459545135498047 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.05381799, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.4714871699848, + "language_loss": 0.76175338, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78375852, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2678, + "time_per_iteration": 2.6220550537109375 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.05157948, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.6939354956764687, + "language_loss": 0.70202518, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72405231, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2679, + "time_per_iteration": 2.580995559692383 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.02026391, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8161414687228778, + "language_loss": 0.51844025, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5392195, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.34375, + "step": 2680, + "time_per_iteration": 3.105682849884033 + }, + { + "auxiliary_loss_clip": 0.01155604, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02750874, + "balance_loss_mlp": 1.05157876, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.8335073832427007, + "language_loss": 0.80319828, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82523119, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2681, + "time_per_iteration": 2.4695565700531006 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01058971, + "balance_loss_clip": 1.04031444, + "balance_loss_mlp": 1.05258918, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8021457293712753, + "language_loss": 0.69142133, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71352148, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.984375, + "step": 2682, + "time_per_iteration": 2.5027854442596436 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.0559957, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8107912193408944, + "language_loss": 0.87568235, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89774084, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2683, + "time_per_iteration": 2.461944341659546 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.05452991, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.5824209574719035, + "language_loss": 0.74160969, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76372838, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2684, + "time_per_iteration": 4.005981206893921 + }, + { + "auxiliary_loss_clip": 0.01159701, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 1.05543995, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.919238603617177, + "language_loss": 0.70244128, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72452366, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2685, + "time_per_iteration": 5.387023448944092 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.0282284, + "balance_loss_mlp": 1.0518229, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8016019482814314, + "language_loss": 0.71518582, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73716336, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 2686, + "time_per_iteration": 2.5451064109802246 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.03191292, + "balance_loss_mlp": 1.05551481, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8631629169214377, + "language_loss": 0.81521869, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83730221, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2687, + "time_per_iteration": 2.4542620182037354 + }, + { + "auxiliary_loss_clip": 0.01155843, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.02327275, + "balance_loss_mlp": 1.04894984, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.8081463969498348, + "language_loss": 0.71823454, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74023592, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.0625, + "step": 2688, + "time_per_iteration": 2.493476152420044 + }, + { + "auxiliary_loss_clip": 0.0115191, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.05067098, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2392978206929555, + "language_loss": 0.76041406, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78239101, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.015625, + "step": 2689, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.05417943, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.9258973882551216, + "language_loss": 0.87260234, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89462292, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2690, + "time_per_iteration": 2.496943473815918 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05211663, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.562024048541713, + "language_loss": 0.87728393, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.89927632, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 2691, + "time_per_iteration": 2.510960817337036 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.02764988, + "balance_loss_mlp": 1.05021381, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.082856606872889, + "language_loss": 0.82327259, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84533525, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2692, + "time_per_iteration": 2.481032371520996 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02835155, + "balance_loss_mlp": 1.05069244, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.0763505181853454, + "language_loss": 0.80942917, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83149081, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2693, + "time_per_iteration": 2.493278980255127 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01054953, + "balance_loss_clip": 1.03670192, + "balance_loss_mlp": 1.05223358, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.7139740211881158, + "language_loss": 0.83639967, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85845578, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2694, + "time_per_iteration": 2.5051510334014893 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.0509156, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9608549080280004, + "language_loss": 0.69125426, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71329916, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0546875, + "step": 2695, + "time_per_iteration": 2.495098352432251 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01054922, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.05520689, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.466913217352614, + "language_loss": 0.82403111, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84617984, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2696, + "time_per_iteration": 2.484523296356201 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.05316591, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.9982919021229957, + "language_loss": 0.8852337, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90741605, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2697, + "time_per_iteration": 2.4806151390075684 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.01046149, + "balance_loss_clip": 1.02756453, + "balance_loss_mlp": 1.04989469, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4702975792509376, + "language_loss": 0.80172735, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82366014, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 2698, + "time_per_iteration": 2.532137393951416 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.02957439, + "balance_loss_mlp": 1.05167758, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5576448961090323, + "language_loss": 0.77258182, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79456997, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 2699, + "time_per_iteration": 2.514084577560425 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.0251497, + "balance_loss_mlp": 1.04891944, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.038036982956784, + "language_loss": 0.85697722, + "learning_rate": 3.81909481076994e-06, + "loss": 0.87891692, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2700, + "time_per_iteration": 2.4434289932250977 + }, + { + "auxiliary_loss_clip": 0.01147712, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.0247376, + "balance_loss_mlp": 1.04878318, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.6982179557795123, + "language_loss": 0.80378878, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82572436, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.98828125, + "step": 2701, + "time_per_iteration": 2.5267322063446045 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.02945244, + "balance_loss_mlp": 1.05514598, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.5999982166608073, + "language_loss": 0.73006868, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75212055, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2702, + "time_per_iteration": 2.44750714302063 + }, + { + "auxiliary_loss_clip": 0.01153204, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02986753, + "balance_loss_mlp": 1.05053687, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.5386207662450464, + "language_loss": 0.73164749, + "learning_rate": 3.81860891934076e-06, + "loss": 0.7536869, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0234375, + "step": 2703, + "time_per_iteration": 2.469242811203003 + }, + { + "auxiliary_loss_clip": 0.01150736, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.04765964, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9216464968932823, + "language_loss": 0.70681584, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72879231, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2704, + "time_per_iteration": 2.5236263275146484 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00091982, + "balance_loss_mlp": 1.01563144, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7797469934396678, + "language_loss": 0.53369009, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55422795, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.34765625, + "step": 2705, + "time_per_iteration": 3.0887868404388428 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.02873373, + "balance_loss_mlp": 1.05151534, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4525976943058896, + "language_loss": 0.75060308, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77264655, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2706, + "time_per_iteration": 2.439283847808838 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 1.05240536, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9153778871117788, + "language_loss": 0.7234174, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74547994, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2707, + "time_per_iteration": 2.51819109916687 + }, + { + "auxiliary_loss_clip": 0.01155215, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.05275822, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.869992791268662, + "language_loss": 0.83790398, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85995972, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2708, + "time_per_iteration": 2.4592010974884033 + }, + { + "auxiliary_loss_clip": 0.0115992, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.02768469, + "balance_loss_mlp": 1.05268705, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.162290718142945, + "language_loss": 0.86529553, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88738573, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2709, + "time_per_iteration": 2.4745054244995117 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01054439, + "balance_loss_clip": 1.0353297, + "balance_loss_mlp": 1.05096519, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.6782807127870958, + "language_loss": 0.91449893, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93659306, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2710, + "time_per_iteration": 2.4846651554107666 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.05447197, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 1.99410407833921, + "language_loss": 0.8129673, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83507168, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2711, + "time_per_iteration": 2.4878618717193604 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04737568, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7794575527068077, + "language_loss": 0.81605875, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83806038, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2712, + "time_per_iteration": 2.4479072093963623 + }, + { + "auxiliary_loss_clip": 0.01158025, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.03858864, + "balance_loss_mlp": 1.05211174, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.1959953506899774, + "language_loss": 0.76885653, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79102206, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2713, + "time_per_iteration": 2.493394374847412 + }, + { + "auxiliary_loss_clip": 0.01155185, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.05623782, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.081844956712308, + "language_loss": 0.78926778, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8114453, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 2714, + "time_per_iteration": 2.442214012145996 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01059199, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.05286288, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.259619309439112, + "language_loss": 0.78143466, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80357969, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2715, + "time_per_iteration": 2.499178409576416 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.04868412, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.0916631483814783, + "language_loss": 0.81397748, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8359617, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2716, + "time_per_iteration": 2.5004689693450928 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01057354, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.05482328, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8793848003912939, + "language_loss": 0.86203027, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88418794, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2717, + "time_per_iteration": 2.5112617015838623 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.05153894, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.652261986612604, + "language_loss": 0.76514149, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78711915, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2718, + "time_per_iteration": 2.549654245376587 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02696729, + "balance_loss_mlp": 1.05180717, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.080955072975882, + "language_loss": 0.73027492, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75229508, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2719, + "time_per_iteration": 2.4911599159240723 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.03084862, + "balance_loss_mlp": 1.0492239, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6610037254914274, + "language_loss": 0.72384167, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74585563, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2720, + "time_per_iteration": 2.4733760356903076 + }, + { + "auxiliary_loss_clip": 0.01150132, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.02789283, + "balance_loss_mlp": 1.05076206, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2797021453727893, + "language_loss": 0.75100243, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77298641, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9921875, + "step": 2721, + "time_per_iteration": 2.44942569732666 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02893853, + "balance_loss_mlp": 1.0502317, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.74959220753002, + "language_loss": 0.79254043, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81458461, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2722, + "time_per_iteration": 2.4775915145874023 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.05248678, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.0539311275727634, + "language_loss": 0.8477816, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86986339, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0625, + "step": 2723, + "time_per_iteration": 2.5084922313690186 + }, + { + "auxiliary_loss_clip": 0.01146914, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.0177772, + "balance_loss_mlp": 1.04940808, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0049787201865503, + "language_loss": 0.70883536, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73067659, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 2724, + "time_per_iteration": 2.5094263553619385 + }, + { + "auxiliary_loss_clip": 0.01150034, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.05113125, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.04326868324577, + "language_loss": 0.70914948, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73109186, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 2725, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02437937, + "balance_loss_mlp": 1.05219352, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9381311422505, + "language_loss": 0.8873682, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90929163, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2726, + "time_per_iteration": 3.983738660812378 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.05406547, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.8502717081228044, + "language_loss": 0.7439661, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76602715, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2727, + "time_per_iteration": 5.52494215965271 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.03429413, + "balance_loss_mlp": 1.05145037, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6814144838265654, + "language_loss": 0.82321334, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84523886, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9921875, + "step": 2728, + "time_per_iteration": 2.4621498584747314 + }, + { + "auxiliary_loss_clip": 0.01156146, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03131044, + "balance_loss_mlp": 1.05167341, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.4458707176630425, + "language_loss": 0.84766865, + "learning_rate": 3.814371879489633e-06, + "loss": 0.86973941, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0390625, + "step": 2729, + "time_per_iteration": 2.459495782852173 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.02661061, + "balance_loss_mlp": 1.04923487, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9327126112676087, + "language_loss": 0.72569054, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74764991, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2730, + "time_per_iteration": 2.451016902923584 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.03243709, + "balance_loss_mlp": 1.04862678, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.2141787283307854, + "language_loss": 0.74431163, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76637596, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.046875, + "step": 2731, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.0115844, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.02818894, + "balance_loss_mlp": 1.05408466, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.15833206643789, + "language_loss": 0.78783584, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.80990839, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2732, + "time_per_iteration": 2.44146728515625 + }, + { + "auxiliary_loss_clip": 0.01155842, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.05211556, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9937390498547816, + "language_loss": 0.68943298, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71150857, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0390625, + "step": 2733, + "time_per_iteration": 2.4981601238250732 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.02792621, + "balance_loss_mlp": 1.05054927, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.20018793155086, + "language_loss": 0.80626202, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8282572, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0078125, + "step": 2734, + "time_per_iteration": 2.495030641555786 + }, + { + "auxiliary_loss_clip": 0.01152713, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.03638041, + "balance_loss_mlp": 1.05143905, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 4.0691467716051175, + "language_loss": 0.82265377, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84474081, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2735, + "time_per_iteration": 2.5911896228790283 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.02556753, + "balance_loss_mlp": 1.05158913, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.5735103485950077, + "language_loss": 0.78697491, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80891526, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.984375, + "step": 2736, + "time_per_iteration": 2.4699559211730957 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.05231023, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.680513335410081, + "language_loss": 0.81409019, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83616614, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2737, + "time_per_iteration": 2.4892401695251465 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.02819467, + "balance_loss_mlp": 1.05107307, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8393773079816103, + "language_loss": 0.87291563, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89492232, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2738, + "time_per_iteration": 2.54569935798645 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.05139303, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.0122721864238438, + "language_loss": 0.72351867, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74562055, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2739, + "time_per_iteration": 2.5309460163116455 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02991343, + "balance_loss_mlp": 1.04766631, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.690107638621115, + "language_loss": 0.81735384, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8393271, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2740, + "time_per_iteration": 2.5005404949188232 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01053239, + "balance_loss_clip": 1.03176928, + "balance_loss_mlp": 1.05347896, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.8033984026588756, + "language_loss": 0.69098473, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0390625, + "step": 2741, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02588463, + "balance_loss_mlp": 1.04987025, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 2.1078448839323167, + "language_loss": 0.79967189, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2742, + "time_per_iteration": 2.4471442699432373 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.03373659, + "balance_loss_mlp": 1.05117011, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.1468697804747823, + "language_loss": 0.84769481, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86974156, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0, + "step": 2743, + "time_per_iteration": 2.459146022796631 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.05074859, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5853616537097488, + "language_loss": 0.85723281, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87925285, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 2744, + "time_per_iteration": 2.4920642375946045 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.0283947, + "balance_loss_mlp": 1.05124998, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7300129139105382, + "language_loss": 0.82973897, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85167319, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 2745, + "time_per_iteration": 2.490399122238159 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.05477679, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.19754759855213, + "language_loss": 0.76411253, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78622997, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2746, + "time_per_iteration": 2.46258282661438 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01052583, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.05164099, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5978428663850568, + "language_loss": 0.80686736, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2747, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01158238, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.02848577, + "balance_loss_mlp": 1.05559731, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.853069559467639, + "language_loss": 0.69463658, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71670008, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0234375, + "step": 2748, + "time_per_iteration": 2.4235999584198 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.03314471, + "balance_loss_mlp": 1.05482006, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.265414403061137, + "language_loss": 0.87653661, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89860809, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0078125, + "step": 2749, + "time_per_iteration": 2.4706709384918213 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.0509429, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.3451981357461444, + "language_loss": 0.79248077, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81450188, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2750, + "time_per_iteration": 2.4588990211486816 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.05188382, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7653411133265118, + "language_loss": 0.95010567, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9720822, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.99609375, + "step": 2751, + "time_per_iteration": 2.4776439666748047 + }, + { + "auxiliary_loss_clip": 0.01152135, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.02762985, + "balance_loss_mlp": 1.05480134, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9833662518999209, + "language_loss": 0.71080822, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73278749, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 2752, + "time_per_iteration": 2.4609227180480957 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 1.01785779, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7698122762266473, + "language_loss": 0.54079807, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56152999, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.33984375, + "step": 2753, + "time_per_iteration": 3.161339282989502 + }, + { + "auxiliary_loss_clip": 0.01152964, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.05254793, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.9686645345026932, + "language_loss": 0.75467873, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77662838, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2754, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01060834, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.05358946, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 3.81944507319113, + "language_loss": 0.87154973, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89376527, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0703125, + "step": 2755, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01148695, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.04862666, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.859731734913831, + "language_loss": 0.73258269, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7545948, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2756, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.02999544, + "balance_loss_mlp": 1.05331099, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6628427585054586, + "language_loss": 0.74967468, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77166092, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9609375, + "step": 2757, + "time_per_iteration": 2.5122530460357666 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01044644, + "balance_loss_clip": 1.02590466, + "balance_loss_mlp": 1.05359447, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 2.101183789218018, + "language_loss": 0.84532511, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86731303, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2758, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.01153935, + "auxiliary_loss_mlp": 0.01051485, + "balance_loss_clip": 1.03382993, + "balance_loss_mlp": 1.05355358, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 3.016772390052645, + "language_loss": 0.79003322, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81208748, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 1.0, + "step": 2759, + "time_per_iteration": 2.468798875808716 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.02953088, + "balance_loss_mlp": 1.05121255, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 4.81235802271706, + "language_loss": 0.75059134, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77259254, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2760, + "time_per_iteration": 2.459453582763672 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02139056, + "balance_loss_mlp": 1.05363011, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.843496656605, + "language_loss": 0.73409051, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75607204, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2761, + "time_per_iteration": 2.473264455795288 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.05460942, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.076392836835936, + "language_loss": 0.89255953, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91456699, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2762, + "time_per_iteration": 2.4917852878570557 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.0517025, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6634533311047424, + "language_loss": 0.87782222, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.89988291, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2763, + "time_per_iteration": 2.48002028465271 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01013596, + "balance_loss_clip": 1.01105642, + "balance_loss_mlp": 1.01786494, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7771287992078079, + "language_loss": 0.59777391, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61842799, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2764, + "time_per_iteration": 3.0722031593322754 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.03183234, + "balance_loss_mlp": 1.05292118, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8564974944455146, + "language_loss": 0.82349414, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8455686, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.015625, + "step": 2765, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.02758563, + "balance_loss_mlp": 1.05308914, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.1954568630881566, + "language_loss": 0.70029616, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72239733, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.078125, + "step": 2766, + "time_per_iteration": 2.417538642883301 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.05449462, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3642497854018174, + "language_loss": 0.88693011, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90891409, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2767, + "time_per_iteration": 2.447087287902832 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.0051651, + "balance_loss_mlp": 1.01474071, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.659533193053428, + "language_loss": 0.52894622, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54950953, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.33984375, + "step": 2768, + "time_per_iteration": 4.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01156575, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.03196931, + "balance_loss_mlp": 1.05233693, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4421243199538543, + "language_loss": 0.84964579, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87173045, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2769, + "time_per_iteration": 3.9888546466827393 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.00486565, + "balance_loss_mlp": 1.01284146, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.809970645404753, + "language_loss": 0.57417655, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59471762, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2770, + "time_per_iteration": 2.909212350845337 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01004174, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.0120976, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8642108743281017, + "language_loss": 0.5621168, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58261615, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.3359375, + "step": 2771, + "time_per_iteration": 2.9000375270843506 + }, + { + "auxiliary_loss_clip": 0.01152287, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.03741515, + "balance_loss_mlp": 1.05137527, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.4271023422086593, + "language_loss": 0.70461071, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72671425, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0078125, + "step": 2772, + "time_per_iteration": 2.45868182182312 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01052488, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.04914951, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.8764675289735346, + "language_loss": 0.86201918, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8840462, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2773, + "time_per_iteration": 2.513784885406494 + }, + { + "auxiliary_loss_clip": 0.01152492, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.02808821, + "balance_loss_mlp": 1.05230188, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.2216439453760595, + "language_loss": 0.81859678, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84058398, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2774, + "time_per_iteration": 2.4288830757141113 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.02955508, + "balance_loss_mlp": 1.05290627, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1125697386324576, + "language_loss": 0.83287829, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85492939, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0234375, + "step": 2775, + "time_per_iteration": 2.4773504734039307 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.03599668, + "balance_loss_mlp": 1.0527029, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9011936520028738, + "language_loss": 0.80721045, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82925946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 2776, + "time_per_iteration": 2.4736995697021484 + }, + { + "auxiliary_loss_clip": 0.01147621, + "auxiliary_loss_mlp": 0.01053383, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.05260348, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.7922512358148395, + "language_loss": 0.798361, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82037103, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.953125, + "step": 2777, + "time_per_iteration": 2.4625258445739746 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.0313735, + "balance_loss_mlp": 1.05002642, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.8218923631286437, + "language_loss": 0.85132945, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87332618, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 2778, + "time_per_iteration": 2.4819412231445312 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.05222583, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.6489491047564826, + "language_loss": 0.74133682, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76333386, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2779, + "time_per_iteration": 2.510207176208496 + }, + { + "auxiliary_loss_clip": 0.0115174, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.02625358, + "balance_loss_mlp": 1.05116367, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.2761441742273663, + "language_loss": 0.65382051, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67579395, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2780, + "time_per_iteration": 2.5250439643859863 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.02856088, + "balance_loss_mlp": 1.05120933, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0602280440022382, + "language_loss": 0.78563058, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80761701, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9921875, + "step": 2781, + "time_per_iteration": 2.4921979904174805 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.05227423, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.480266857331911, + "language_loss": 0.75262564, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77465487, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2782, + "time_per_iteration": 2.468590021133423 + }, + { + "auxiliary_loss_clip": 0.01159372, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.05443954, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.999958464394936, + "language_loss": 0.67841566, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70053571, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2783, + "time_per_iteration": 2.5312225818634033 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.0538497, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.209785525271013, + "language_loss": 0.70028126, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72232759, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2784, + "time_per_iteration": 2.4932820796966553 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.05120277, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9638597335511054, + "language_loss": 0.60441053, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62647516, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2785, + "time_per_iteration": 2.527010440826416 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.02681625, + "balance_loss_mlp": 1.01595187, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8343482124814343, + "language_loss": 0.588, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60878569, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.33007812, + "step": 2786, + "time_per_iteration": 3.1062281131744385 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.05108333, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.9494651562196093, + "language_loss": 0.75846571, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78044844, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2787, + "time_per_iteration": 2.51383900642395 + }, + { + "auxiliary_loss_clip": 0.0115361, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.02287841, + "balance_loss_mlp": 1.05218899, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.088538847955111, + "language_loss": 0.77615869, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79811174, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2788, + "time_per_iteration": 2.4926373958587646 + }, + { + "auxiliary_loss_clip": 0.01048965, + "auxiliary_loss_mlp": 0.01004104, + "balance_loss_clip": 1.00154078, + "balance_loss_mlp": 1.01582766, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.861309286667726, + "language_loss": 0.59360403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61413473, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.33203125, + "step": 2789, + "time_per_iteration": 2.9390883445739746 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.03262937, + "balance_loss_mlp": 1.05115533, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.8582032581880512, + "language_loss": 0.70117038, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72323185, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2790, + "time_per_iteration": 2.6337287425994873 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.03852975, + "balance_loss_mlp": 1.05254579, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9267324208283758, + "language_loss": 0.7914235, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81353921, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0234375, + "step": 2791, + "time_per_iteration": 2.4992258548736572 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.02807093, + "balance_loss_mlp": 1.05311096, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.670563786806713, + "language_loss": 0.71465087, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73666936, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2792, + "time_per_iteration": 2.5886104106903076 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.05179656, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.423044729867527, + "language_loss": 0.72166264, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74366981, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2793, + "time_per_iteration": 2.5197043418884277 + }, + { + "auxiliary_loss_clip": 0.01153184, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.05135465, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.9474647186442988, + "language_loss": 0.77305138, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79512912, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2794, + "time_per_iteration": 2.467292547225952 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.05253601, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2742759048834578, + "language_loss": 0.71613103, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7382195, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2795, + "time_per_iteration": 2.5272278785705566 + }, + { + "auxiliary_loss_clip": 0.01149377, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.02592218, + "balance_loss_mlp": 1.04932868, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.942494339721957, + "language_loss": 0.83784455, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8597846, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2796, + "time_per_iteration": 2.448528289794922 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01059215, + "balance_loss_clip": 1.03911614, + "balance_loss_mlp": 1.04904127, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6778887705488965, + "language_loss": 0.8109591, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83307993, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2797, + "time_per_iteration": 2.5044567584991455 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02766752, + "balance_loss_mlp": 1.05142093, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4189820060365406, + "language_loss": 0.74740726, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76932257, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.95703125, + "step": 2798, + "time_per_iteration": 2.4913666248321533 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03250098, + "balance_loss_mlp": 1.05462337, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.8962576537558784, + "language_loss": 0.79592311, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81796914, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 1.0, + "step": 2799, + "time_per_iteration": 2.4844021797180176 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.02597189, + "balance_loss_mlp": 1.04983997, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.7819182919151455, + "language_loss": 0.70778632, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72978926, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2800, + "time_per_iteration": 2.548715829849243 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02365637, + "balance_loss_mlp": 1.04882574, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9135359518782422, + "language_loss": 0.83549178, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85741478, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2801, + "time_per_iteration": 2.456601858139038 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.03355145, + "balance_loss_mlp": 1.04947591, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.757874152621573, + "language_loss": 0.822721, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84474415, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2802, + "time_per_iteration": 2.4426534175872803 + }, + { + "auxiliary_loss_clip": 0.01153107, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02764344, + "balance_loss_mlp": 1.05123353, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4855905624355255, + "language_loss": 0.81064272, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83265072, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2803, + "time_per_iteration": 2.5615930557250977 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02693641, + "balance_loss_mlp": 1.05246449, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2799183114600545, + "language_loss": 0.7645762, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78653532, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 2804, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01045818, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.02452028, + "balance_loss_mlp": 1.01328063, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8620881286764229, + "language_loss": 0.55414748, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57487267, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 2805, + "time_per_iteration": 3.033358573913574 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.02161169, + "balance_loss_mlp": 1.04741919, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9122963285347783, + "language_loss": 0.73038024, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75221276, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 2806, + "time_per_iteration": 2.4699463844299316 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.02712786, + "balance_loss_mlp": 1.05072176, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.9407491705316076, + "language_loss": 0.69966477, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7216025, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2807, + "time_per_iteration": 2.4583139419555664 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.03196526, + "balance_loss_mlp": 1.05013919, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.8028706291815912, + "language_loss": 0.70265883, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72467327, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9921875, + "step": 2808, + "time_per_iteration": 2.4724719524383545 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02792549, + "balance_loss_mlp": 1.05130935, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.1293629398657954, + "language_loss": 0.80103064, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8230511, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2809, + "time_per_iteration": 3.844451427459717 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.050385, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0909159229075245, + "language_loss": 0.88465077, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.9067235, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2810, + "time_per_iteration": 5.43256688117981 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.05188894, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.324870160833927, + "language_loss": 0.92483926, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94690794, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2811, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.03154814, + "balance_loss_mlp": 1.05537057, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 20.150047321728213, + "language_loss": 0.78719699, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80926931, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2812, + "time_per_iteration": 2.475893974304199 + }, + { + "auxiliary_loss_clip": 0.01154531, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.0353322, + "balance_loss_mlp": 1.05427527, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.3708558754635103, + "language_loss": 0.7492249, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7713027, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.0078125, + "step": 2813, + "time_per_iteration": 2.4622457027435303 + }, + { + "auxiliary_loss_clip": 0.01155154, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02862835, + "balance_loss_mlp": 1.05231524, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.6643465032783955, + "language_loss": 0.69000697, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71203601, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2814, + "time_per_iteration": 2.442352771759033 + }, + { + "auxiliary_loss_clip": 0.01152587, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.03032494, + "balance_loss_mlp": 1.05269694, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.3683342322522543, + "language_loss": 0.61842358, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64043844, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2815, + "time_per_iteration": 2.4859516620635986 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03179121, + "balance_loss_mlp": 1.05104065, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9947957584318596, + "language_loss": 0.81983805, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84183884, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 2816, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.03072321, + "balance_loss_mlp": 1.05379295, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.502019531770294, + "language_loss": 0.8722589, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89431584, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2817, + "time_per_iteration": 2.4906835556030273 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.03200889, + "balance_loss_mlp": 1.05302715, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7787508021643152, + "language_loss": 0.81666476, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2818, + "time_per_iteration": 2.4673428535461426 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.05154157, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 5.791836374282792, + "language_loss": 0.80712807, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8291707, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0, + "step": 2819, + "time_per_iteration": 2.43947434425354 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.00504076, + "balance_loss_mlp": 1.01552486, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9491282523447765, + "language_loss": 0.61080176, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63136268, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 2820, + "time_per_iteration": 3.008953809738159 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.031335, + "balance_loss_mlp": 1.05163527, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 2.1013484538112097, + "language_loss": 0.78625357, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.808281, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2821, + "time_per_iteration": 2.5363481044769287 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.03667343, + "balance_loss_mlp": 1.05229986, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.87583667245789, + "language_loss": 0.78450388, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80659759, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0078125, + "step": 2822, + "time_per_iteration": 2.4969065189361572 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03299582, + "balance_loss_mlp": 1.04956698, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9220487825624015, + "language_loss": 0.75016022, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77214515, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2823, + "time_per_iteration": 2.491588830947876 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.05209637, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9648811068121905, + "language_loss": 0.60514438, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62718117, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.99609375, + "step": 2824, + "time_per_iteration": 2.6178910732269287 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02030015, + "balance_loss_mlp": 1.05367076, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6856049786717988, + "language_loss": 0.73004806, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75196874, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98046875, + "step": 2825, + "time_per_iteration": 2.559774398803711 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.03321934, + "balance_loss_mlp": 1.0505774, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.7849035157466668, + "language_loss": 0.85660541, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87870789, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2826, + "time_per_iteration": 2.4860360622406006 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.0515151, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3205594057943175, + "language_loss": 0.8232255, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84528267, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2827, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.02965498, + "balance_loss_mlp": 1.05059743, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 2.393760877815214, + "language_loss": 0.73652613, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75855708, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2828, + "time_per_iteration": 2.5726237297058105 + }, + { + "auxiliary_loss_clip": 0.01046718, + "auxiliary_loss_mlp": 0.01008554, + "balance_loss_clip": 1.00625372, + "balance_loss_mlp": 1.01360035, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.76062911359866, + "language_loss": 0.56446254, + "learning_rate": 3.797643101661336e-06, + "loss": 0.5850153, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.33203125, + "step": 2829, + "time_per_iteration": 3.1035284996032715 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.02912867, + "balance_loss_mlp": 1.04916263, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7229604876305038, + "language_loss": 0.83673382, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85870743, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.99609375, + "step": 2830, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.02792013, + "balance_loss_mlp": 1.04919207, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.0065309441313337, + "language_loss": 0.77852297, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80051666, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.015625, + "step": 2831, + "time_per_iteration": 2.524578094482422 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.04948521, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1211873867699285, + "language_loss": 0.79345167, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81548154, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0234375, + "step": 2832, + "time_per_iteration": 2.459954261779785 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.02836847, + "balance_loss_mlp": 1.05050385, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.9382017652854369, + "language_loss": 0.89026237, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91225392, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2833, + "time_per_iteration": 2.4812114238739014 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02963924, + "balance_loss_mlp": 1.05124569, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.853060698790674, + "language_loss": 0.72425497, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74627328, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2834, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01156378, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.05294132, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9954265429463485, + "language_loss": 0.86434042, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88648909, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2835, + "time_per_iteration": 2.4804999828338623 + }, + { + "auxiliary_loss_clip": 0.01155592, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.03007674, + "balance_loss_mlp": 1.05081642, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1.9180646463430515, + "language_loss": 0.73242748, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.046875, + "step": 2836, + "time_per_iteration": 2.4694178104400635 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02599072, + "balance_loss_mlp": 1.05033076, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1253309510576717, + "language_loss": 0.79653537, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81858897, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0703125, + "step": 2837, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01150588, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.02550185, + "balance_loss_mlp": 1.05232143, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.19906443062711, + "language_loss": 0.83575213, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85771078, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 2838, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.05069315, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7423496230624245, + "language_loss": 0.93620354, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95814586, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2839, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.0277859, + "balance_loss_mlp": 1.05050242, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.8052720148780894, + "language_loss": 0.83847374, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86050916, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.0390625, + "step": 2840, + "time_per_iteration": 2.5449130535125732 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01047778, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.05213881, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.014300966058614, + "language_loss": 0.76390004, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78593302, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.03125, + "step": 2841, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03243482, + "balance_loss_mlp": 1.04932261, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8874127741110907, + "language_loss": 0.77000463, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79205, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2842, + "time_per_iteration": 2.5051841735839844 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 1.0497905, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.05566421297988, + "language_loss": 0.86086738, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88281423, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98828125, + "step": 2843, + "time_per_iteration": 2.4487509727478027 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02696228, + "balance_loss_mlp": 1.05090249, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.8875494657309706, + "language_loss": 0.6826812, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70464289, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 2844, + "time_per_iteration": 2.4429779052734375 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.05040824, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.8058232236820264, + "language_loss": 0.78258789, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80463862, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0, + "step": 2845, + "time_per_iteration": 2.4377951622009277 + }, + { + "auxiliary_loss_clip": 0.01151786, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.05064154, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.746386155528142, + "language_loss": 0.77959955, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 2846, + "time_per_iteration": 2.4196622371673584 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.02895534, + "balance_loss_mlp": 1.05158973, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7441395807388675, + "language_loss": 0.7942031, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81620383, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2847, + "time_per_iteration": 2.504087448120117 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.02913523, + "balance_loss_mlp": 1.04612017, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.239997254259111, + "language_loss": 0.86818451, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89015555, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2848, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.02438748, + "balance_loss_mlp": 1.05133212, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.715396677859901, + "language_loss": 0.75223613, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77421153, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2849, + "time_per_iteration": 2.4918415546417236 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.00107098, + "balance_loss_mlp": 1.01492834, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7949737728021388, + "language_loss": 0.57471085, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59522074, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.328125, + "step": 2850, + "time_per_iteration": 3.057778835296631 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.04852295, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4364727127987704, + "language_loss": 0.80988616, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83187693, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 2851, + "time_per_iteration": 3.887600898742676 + }, + { + "auxiliary_loss_clip": 0.01146778, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.04858351, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.035620688428962, + "language_loss": 0.93063158, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95253623, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2852, + "time_per_iteration": 3.920153856277466 + }, + { + "auxiliary_loss_clip": 0.01149404, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.04728949, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8406206656402175, + "language_loss": 0.69480836, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71683311, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2853, + "time_per_iteration": 2.4457037448883057 + }, + { + "auxiliary_loss_clip": 0.0115316, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03150904, + "balance_loss_mlp": 1.05059445, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.187977199847503, + "language_loss": 0.66505128, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68709248, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0234375, + "step": 2854, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01144359, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.04574227, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8257227486643586, + "language_loss": 0.89394444, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91582847, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2855, + "time_per_iteration": 2.4601552486419678 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.04792452, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.515892939250119, + "language_loss": 0.83822739, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86022681, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2856, + "time_per_iteration": 2.4747347831726074 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05112195, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9053156238546485, + "language_loss": 0.8645792, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88658297, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2857, + "time_per_iteration": 2.4460220336914062 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0105234, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.04805577, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.139076633770832, + "language_loss": 0.77919662, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80120051, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2858, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01058687, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.04760742, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.423579883765011, + "language_loss": 0.77235049, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0859375, + "step": 2859, + "time_per_iteration": 2.43471360206604 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.04920983, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 3.774880148287903, + "language_loss": 0.77179611, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79378301, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2860, + "time_per_iteration": 2.463344097137451 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.04703689, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1505291491255463, + "language_loss": 0.81964719, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84165227, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2861, + "time_per_iteration": 2.4505395889282227 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.03163123, + "balance_loss_mlp": 1.04897118, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 4.22955926449596, + "language_loss": 0.85649675, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87849623, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2862, + "time_per_iteration": 2.4392077922821045 + }, + { + "auxiliary_loss_clip": 0.01144423, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.02867651, + "balance_loss_mlp": 1.04785109, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.3146804122881037, + "language_loss": 0.77874523, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80065054, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 2863, + "time_per_iteration": 2.4745166301727295 + }, + { + "auxiliary_loss_clip": 0.01147347, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02935052, + "balance_loss_mlp": 1.04726493, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7012031973405044, + "language_loss": 0.72191179, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74386668, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2864, + "time_per_iteration": 2.496522903442383 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.04935968, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6688219876641972, + "language_loss": 0.72896975, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75101948, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2865, + "time_per_iteration": 2.468726396560669 + }, + { + "auxiliary_loss_clip": 0.01151587, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02608538, + "balance_loss_mlp": 1.05194211, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.1747822479918764, + "language_loss": 0.79011786, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81208247, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2866, + "time_per_iteration": 2.445716381072998 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.04966402, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.797659045411876, + "language_loss": 0.79865277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2867, + "time_per_iteration": 2.4745590686798096 + }, + { + "auxiliary_loss_clip": 0.0114836, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.04821014, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.717941409951427, + "language_loss": 0.79707634, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81893444, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2868, + "time_per_iteration": 2.4545693397521973 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.0538218, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9332967921770021, + "language_loss": 0.84265673, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86467719, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2869, + "time_per_iteration": 2.445429563522339 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.03226328, + "balance_loss_mlp": 1.04971075, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.3539211413688954, + "language_loss": 0.77522051, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79725653, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2870, + "time_per_iteration": 2.4975087642669678 + }, + { + "auxiliary_loss_clip": 0.01146931, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.02609706, + "balance_loss_mlp": 1.05132568, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.897031493968697, + "language_loss": 0.7680704, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.78997254, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.95703125, + "step": 2871, + "time_per_iteration": 2.4777348041534424 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02442563, + "balance_loss_mlp": 1.05061746, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.240934958328371, + "language_loss": 0.74448204, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76642466, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2872, + "time_per_iteration": 2.5021097660064697 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.05127549, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8155923086100165, + "language_loss": 0.82694656, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84881938, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 2873, + "time_per_iteration": 2.4852540493011475 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0229888, + "balance_loss_mlp": 1.049196, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0464410919173814, + "language_loss": 0.75083232, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77274048, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.984375, + "step": 2874, + "time_per_iteration": 2.440610885620117 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0515728, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9230852666364326, + "language_loss": 0.8067199, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8286736, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2875, + "time_per_iteration": 2.478473424911499 + }, + { + "auxiliary_loss_clip": 0.01153488, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02984488, + "balance_loss_mlp": 1.05083489, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.5699127680633542, + "language_loss": 0.87525117, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89728516, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2876, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.02384901, + "balance_loss_mlp": 1.05273616, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.9567138745888089, + "language_loss": 0.84561193, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86754125, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 2877, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.02179909, + "balance_loss_mlp": 1.05281305, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 3.0724129461132406, + "language_loss": 0.79527134, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81719756, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.984375, + "step": 2878, + "time_per_iteration": 2.4739902019500732 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.02450228, + "balance_loss_mlp": 1.04968572, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9694378769308076, + "language_loss": 0.70306808, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72496772, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2879, + "time_per_iteration": 2.5014665126800537 + }, + { + "auxiliary_loss_clip": 0.01151101, + "auxiliary_loss_mlp": 0.01050497, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.05038834, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4431111997211734, + "language_loss": 0.83465785, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85667384, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2880, + "time_per_iteration": 2.433776378631592 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.0250026, + "balance_loss_mlp": 1.05171311, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.135155165507549, + "language_loss": 0.80866969, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8306427, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0078125, + "step": 2881, + "time_per_iteration": 2.4944772720336914 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.05030859, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.5502275528368066, + "language_loss": 0.77372867, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79565454, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 2882, + "time_per_iteration": 2.5426836013793945 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.03276825, + "balance_loss_mlp": 1.05005169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8718611847068298, + "language_loss": 0.76652586, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78852415, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2883, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.04944682, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.945193845574475, + "language_loss": 0.85463524, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87654424, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 2884, + "time_per_iteration": 2.4708735942840576 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.02122355, + "balance_loss_mlp": 1.05114794, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6148586475999513, + "language_loss": 0.73758793, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75947917, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2885, + "time_per_iteration": 2.5266878604888916 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.05269074, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.9690054244815705, + "language_loss": 0.70377076, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72569054, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 2886, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01146959, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.04799545, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9665325510573808, + "language_loss": 0.69294798, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7148186, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98828125, + "step": 2887, + "time_per_iteration": 2.4787776470184326 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.03180945, + "balance_loss_mlp": 1.05075955, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.791000255721863, + "language_loss": 0.85391176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87590909, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 2888, + "time_per_iteration": 2.4234085083007812 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.02668667, + "balance_loss_mlp": 1.05046952, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.660213605651755, + "language_loss": 0.78465497, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80662042, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.9921875, + "step": 2889, + "time_per_iteration": 2.5042123794555664 + }, + { + "auxiliary_loss_clip": 0.01146581, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.05222893, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.9081348702485723, + "language_loss": 0.83860242, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86054766, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9453125, + "step": 2890, + "time_per_iteration": 2.4698500633239746 + }, + { + "auxiliary_loss_clip": 0.01150813, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.02880502, + "balance_loss_mlp": 1.05083108, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9935479009749588, + "language_loss": 0.82253492, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84451687, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2891, + "time_per_iteration": 2.4478886127471924 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04824781, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.3073165362682873, + "language_loss": 0.81479478, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8367548, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2892, + "time_per_iteration": 2.4094645977020264 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.03519785, + "balance_loss_mlp": 1.05379355, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.24459564009462, + "language_loss": 0.74480057, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76690638, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2893, + "time_per_iteration": 3.8296191692352295 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.05193436, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.117368029368179, + "language_loss": 0.83073241, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85268712, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2894, + "time_per_iteration": 3.9817075729370117 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.05032384, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.157907065313142, + "language_loss": 0.74051547, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76249242, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0, + "step": 2895, + "time_per_iteration": 2.461857318878174 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00467134, + "balance_loss_mlp": 1.01600659, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8715266336267762, + "language_loss": 0.6273998, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64795506, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.32421875, + "step": 2896, + "time_per_iteration": 3.1462173461914062 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.02160895, + "balance_loss_mlp": 1.04787612, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 2.3238967096174923, + "language_loss": 0.75600475, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77790749, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2897, + "time_per_iteration": 2.4974682331085205 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.02354646, + "balance_loss_mlp": 1.05000067, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9004029304223122, + "language_loss": 0.69384712, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71575105, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2898, + "time_per_iteration": 2.5650558471679688 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01049615, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.05215359, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.315885710988465, + "language_loss": 0.76069367, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78272319, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2899, + "time_per_iteration": 2.5006191730499268 + }, + { + "auxiliary_loss_clip": 0.01145178, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.02571905, + "balance_loss_mlp": 1.04929495, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9440585306650153, + "language_loss": 0.72821134, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75011557, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9609375, + "step": 2900, + "time_per_iteration": 2.5199801921844482 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.03261876, + "balance_loss_mlp": 1.04989529, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.6677330343015109, + "language_loss": 0.70085949, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72287238, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2901, + "time_per_iteration": 2.624864101409912 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.05087507, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7643324639769489, + "language_loss": 0.76549768, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78750718, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 2902, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.04885221, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.129298660499851, + "language_loss": 0.81787169, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.8399415, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2903, + "time_per_iteration": 2.436877727508545 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02255297, + "balance_loss_mlp": 1.04978609, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.1703016783079327, + "language_loss": 0.73228866, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75418955, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2904, + "time_per_iteration": 2.462775707244873 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.02719879, + "balance_loss_mlp": 1.04777265, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9374721445221084, + "language_loss": 0.64526325, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6671921, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2905, + "time_per_iteration": 2.468395233154297 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.05202341, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.804147248272645, + "language_loss": 0.79236615, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81444013, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0546875, + "step": 2906, + "time_per_iteration": 2.4632725715637207 + }, + { + "auxiliary_loss_clip": 0.01150693, + "auxiliary_loss_mlp": 0.01055346, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.05044913, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7929508882228948, + "language_loss": 0.81010377, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83216417, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2907, + "time_per_iteration": 2.4214229583740234 + }, + { + "auxiliary_loss_clip": 0.01152007, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.05040026, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.7402312811515515, + "language_loss": 0.81315112, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83517587, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2908, + "time_per_iteration": 2.4340970516204834 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.03047633, + "balance_loss_mlp": 1.04978228, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.419675279893618, + "language_loss": 0.80399191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82600915, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0078125, + "step": 2909, + "time_per_iteration": 2.4170033931732178 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.03319383, + "balance_loss_mlp": 1.05133021, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6998329053727648, + "language_loss": 0.76530939, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78737426, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2910, + "time_per_iteration": 2.457628011703491 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02901077, + "balance_loss_mlp": 1.05060935, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6502133484544155, + "language_loss": 0.87255991, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89456993, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2911, + "time_per_iteration": 2.5302672386169434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.04746377, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.260601647926804, + "language_loss": 0.89586449, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91789353, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0078125, + "step": 2912, + "time_per_iteration": 2.447650194168091 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.0302161, + "balance_loss_mlp": 1.04871392, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.8836544870459813, + "language_loss": 0.7262938, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74830252, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2913, + "time_per_iteration": 2.423595666885376 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01047113, + "balance_loss_clip": 1.02738369, + "balance_loss_mlp": 1.0522244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.243393227782369, + "language_loss": 0.68799925, + "learning_rate": 3.782887439295741e-06, + "loss": 0.70997757, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 2914, + "time_per_iteration": 2.46085262298584 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 1.05143356, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8218690011087264, + "language_loss": 0.93755293, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95961595, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.98046875, + "step": 2915, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01143068, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.02744889, + "balance_loss_mlp": 1.04722261, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8144768789670476, + "language_loss": 0.80869162, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.95703125, + "step": 2916, + "time_per_iteration": 2.4740476608276367 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03663611, + "balance_loss_mlp": 1.04854608, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.67512565222408, + "language_loss": 0.73645711, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75852591, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2917, + "time_per_iteration": 2.4484915733337402 + }, + { + "auxiliary_loss_clip": 0.01144993, + "auxiliary_loss_mlp": 0.01055794, + "balance_loss_clip": 1.03517044, + "balance_loss_mlp": 1.04897738, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 12.675743752905372, + "language_loss": 0.77019119, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79219908, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.9609375, + "step": 2918, + "time_per_iteration": 2.4723429679870605 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.0260129, + "balance_loss_mlp": 1.05131745, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.415786226656528, + "language_loss": 0.74196291, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76396644, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2919, + "time_per_iteration": 2.5049829483032227 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03080809, + "balance_loss_mlp": 1.05090559, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7754050788280298, + "language_loss": 0.74211872, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76416576, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2920, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0219686, + "balance_loss_mlp": 1.04717219, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3164139995284834, + "language_loss": 0.7949307, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81677347, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.97265625, + "step": 2921, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01153986, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.05029321, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6170497741380607, + "language_loss": 0.87493849, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89693457, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2922, + "time_per_iteration": 2.5042173862457275 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03074801, + "balance_loss_mlp": 1.04808784, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.3811708545321735, + "language_loss": 0.62097687, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64297503, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2923, + "time_per_iteration": 2.5067484378814697 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.05287814, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1344206016331797, + "language_loss": 0.80602306, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2924, + "time_per_iteration": 2.453174114227295 + }, + { + "auxiliary_loss_clip": 0.0115147, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.04809761, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 3.672968077353321, + "language_loss": 0.70954067, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73159206, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.03125, + "step": 2925, + "time_per_iteration": 2.4666385650634766 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.02538979, + "balance_loss_mlp": 1.05147243, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6622274839000213, + "language_loss": 0.71700275, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73893416, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.97265625, + "step": 2926, + "time_per_iteration": 2.50289249420166 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04857433, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.8916391197618272, + "language_loss": 0.84433806, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86627805, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0234375, + "step": 2927, + "time_per_iteration": 2.447207450866699 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.0506475, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8156588356210406, + "language_loss": 0.71879232, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74072987, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 2928, + "time_per_iteration": 2.585942029953003 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.028934, + "balance_loss_mlp": 1.05230594, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0402577824357886, + "language_loss": 0.83222824, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85421479, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9765625, + "step": 2929, + "time_per_iteration": 2.461101770401001 + }, + { + "auxiliary_loss_clip": 0.01149627, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.02298999, + "balance_loss_mlp": 1.0493536, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.817429721867852, + "language_loss": 0.7933988, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81531239, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2930, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.02671921, + "balance_loss_mlp": 1.05319881, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.194829469856105, + "language_loss": 0.76142448, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78343737, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0078125, + "step": 2931, + "time_per_iteration": 2.4907379150390625 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.05108666, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.8261445455709153, + "language_loss": 0.74740392, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7693212, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 2932, + "time_per_iteration": 2.4252588748931885 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.05086923, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.4930669650063355, + "language_loss": 0.8968839, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.9188894, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0625, + "step": 2933, + "time_per_iteration": 2.4334278106689453 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02980459, + "balance_loss_mlp": 1.05053639, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6616334836184845, + "language_loss": 0.88273364, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90468836, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9765625, + "step": 2934, + "time_per_iteration": 3.891472578048706 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02827823, + "balance_loss_mlp": 1.04972959, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.7575209177187046, + "language_loss": 0.70843625, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2935, + "time_per_iteration": 5.650984287261963 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.05251908, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2448658169111795, + "language_loss": 0.69255942, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71456659, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0546875, + "step": 2936, + "time_per_iteration": 2.4864091873168945 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.02758646, + "balance_loss_mlp": 1.05530488, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.883537128373794, + "language_loss": 0.71391022, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73591107, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.99609375, + "step": 2937, + "time_per_iteration": 2.5096240043640137 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.02461779, + "balance_loss_mlp": 1.05530524, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.165923066719211, + "language_loss": 0.7584855, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78052241, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2938, + "time_per_iteration": 2.475069284439087 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 1.05156195, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.20477923303766, + "language_loss": 0.71130306, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73326623, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2939, + "time_per_iteration": 2.4806766510009766 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.0538342, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.125031265469358, + "language_loss": 0.73781312, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7597841, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 2940, + "time_per_iteration": 2.5438694953918457 + }, + { + "auxiliary_loss_clip": 0.01154904, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.05372643, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.4976558026918703, + "language_loss": 0.85003591, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87204242, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2941, + "time_per_iteration": 2.4616622924804688 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.02687514, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.199835477442084, + "language_loss": 0.7711162, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79311877, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2942, + "time_per_iteration": 2.512493848800659 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.05181623, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.9811917296629065, + "language_loss": 0.80591762, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82790613, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2943, + "time_per_iteration": 2.4898416996002197 + }, + { + "auxiliary_loss_clip": 0.01154834, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.02953053, + "balance_loss_mlp": 1.05046725, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.9958912509352866, + "language_loss": 0.80558729, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82764459, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2944, + "time_per_iteration": 2.533968448638916 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.05239737, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.992535786356086, + "language_loss": 0.73450243, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75667548, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2945, + "time_per_iteration": 2.641890287399292 + }, + { + "auxiliary_loss_clip": 0.01152525, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.03179753, + "balance_loss_mlp": 1.05274916, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.3259800829895028, + "language_loss": 0.7778489, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79987633, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.99609375, + "step": 2946, + "time_per_iteration": 2.420511484146118 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.05060697, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9846715459481197, + "language_loss": 0.76240218, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78441978, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2947, + "time_per_iteration": 2.485795259475708 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.02725959, + "balance_loss_mlp": 1.04881549, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.7031010106606654, + "language_loss": 0.71890748, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74085903, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.99609375, + "step": 2948, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03313947, + "balance_loss_mlp": 1.05261326, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.809900152556277, + "language_loss": 0.81843233, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8404634, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.98828125, + "step": 2949, + "time_per_iteration": 2.496962547302246 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01007586, + "balance_loss_clip": 1.00552368, + "balance_loss_mlp": 1.01889789, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7669309197050882, + "language_loss": 0.64973593, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.670331, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.33007812, + "step": 2950, + "time_per_iteration": 3.1220879554748535 + }, + { + "auxiliary_loss_clip": 0.01145274, + "auxiliary_loss_mlp": 0.01049164, + "balance_loss_clip": 1.02992332, + "balance_loss_mlp": 1.04777181, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.9502306021254343, + "language_loss": 0.83540517, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85734957, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2951, + "time_per_iteration": 2.5360641479492188 + }, + { + "auxiliary_loss_clip": 0.01153398, + "auxiliary_loss_mlp": 0.01055919, + "balance_loss_clip": 1.03710794, + "balance_loss_mlp": 1.04963326, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 3.175759961241781, + "language_loss": 0.80564123, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2952, + "time_per_iteration": 2.478635787963867 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.03157318, + "balance_loss_mlp": 1.05045855, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.157061982289712, + "language_loss": 0.79982865, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82184678, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2953, + "time_per_iteration": 2.5143978595733643 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.03003526, + "balance_loss_mlp": 1.05173969, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8943960347088487, + "language_loss": 0.88006002, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90207046, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2954, + "time_per_iteration": 2.575603485107422 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.0327127, + "balance_loss_mlp": 1.05101538, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.123866524492404, + "language_loss": 0.84441978, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86644602, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2955, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.04843807, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 2.0229859139182382, + "language_loss": 0.71172267, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73364747, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2956, + "time_per_iteration": 2.4795608520507812 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.03226662, + "balance_loss_mlp": 1.04974461, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.891261769499534, + "language_loss": 0.82908547, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85109639, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9921875, + "step": 2957, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.02819514, + "balance_loss_mlp": 1.04814482, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.926043663168548, + "language_loss": 0.75286758, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7747997, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2958, + "time_per_iteration": 2.532339572906494 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.05278933, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0794730574663265, + "language_loss": 0.79558724, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.8175559, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2959, + "time_per_iteration": 2.45941424369812 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.04968762, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.284306220471852, + "language_loss": 0.52288693, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5448702, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2960, + "time_per_iteration": 2.4603421688079834 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.05185843, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9615261009939866, + "language_loss": 0.89047921, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9125818, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2961, + "time_per_iteration": 2.475848913192749 + }, + { + "auxiliary_loss_clip": 0.01151915, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.0313381, + "balance_loss_mlp": 1.04849648, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 2.2193748892921517, + "language_loss": 0.79186273, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81389749, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2962, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.011535, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.03175986, + "balance_loss_mlp": 1.0524615, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9550413638631114, + "language_loss": 0.74514943, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76719993, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2963, + "time_per_iteration": 2.4414234161376953 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.02822399, + "balance_loss_mlp": 1.05221784, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.541363360665875, + "language_loss": 0.78624183, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80828238, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2964, + "time_per_iteration": 2.502497911453247 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.05026746, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.923237578914178, + "language_loss": 0.81686175, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83892715, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2965, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01147349, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.03273785, + "balance_loss_mlp": 1.04941893, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2408088539265183, + "language_loss": 0.94580686, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96777868, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.98046875, + "step": 2966, + "time_per_iteration": 2.43082332611084 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.05002928, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.145285080590972, + "language_loss": 0.72469354, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74664342, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2967, + "time_per_iteration": 2.5735998153686523 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.02613282, + "balance_loss_mlp": 1.04889679, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.088672387523525, + "language_loss": 0.76831949, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79021615, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 2968, + "time_per_iteration": 2.437344789505005 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.0287044, + "balance_loss_mlp": 1.04982233, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.3962137266502075, + "language_loss": 0.75934523, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78129619, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2969, + "time_per_iteration": 2.5003507137298584 + }, + { + "auxiliary_loss_clip": 0.01047445, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 0.99940914, + "balance_loss_mlp": 1.01426291, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8459028719848601, + "language_loss": 0.69080526, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7112956, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.33203125, + "step": 2970, + "time_per_iteration": 3.1193249225616455 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.0229373, + "balance_loss_mlp": 1.0498271, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.0858657386647614, + "language_loss": 0.67452097, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69647527, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2971, + "time_per_iteration": 2.580946683883667 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.03059363, + "balance_loss_mlp": 1.04643905, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.713623966203784, + "language_loss": 0.89631712, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91827983, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.99609375, + "step": 2972, + "time_per_iteration": 2.491608142852783 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.05058205, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.067523530387673, + "language_loss": 0.88030291, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90236795, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2973, + "time_per_iteration": 2.4357106685638428 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03282917, + "balance_loss_mlp": 1.04979324, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1406659419236176, + "language_loss": 0.75648922, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.77848881, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2974, + "time_per_iteration": 2.484236478805542 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.02560067, + "balance_loss_mlp": 1.04925394, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.8019304252630453, + "language_loss": 0.74556506, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76749712, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 2975, + "time_per_iteration": 2.4658849239349365 + }, + { + "auxiliary_loss_clip": 0.01145454, + "auxiliary_loss_mlp": 0.0104533, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.04805982, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5963289978134585, + "language_loss": 0.73245859, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.7543664, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 2976, + "time_per_iteration": 3.921170949935913 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02455354, + "balance_loss_mlp": 1.04732931, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.5556273460638488, + "language_loss": 0.77324069, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79505193, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9375, + "step": 2977, + "time_per_iteration": 5.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.03352153, + "balance_loss_mlp": 1.05327988, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.814268655584857, + "language_loss": 0.79470795, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81672966, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 2978, + "time_per_iteration": 2.4917376041412354 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.022892, + "balance_loss_mlp": 1.04982674, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.6585859201367117, + "language_loss": 0.76166439, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78360581, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2979, + "time_per_iteration": 2.5283753871917725 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.04760695, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 2.3100878996861014, + "language_loss": 0.69246143, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7143684, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 2980, + "time_per_iteration": 2.452199935913086 + }, + { + "auxiliary_loss_clip": 0.01148553, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02703881, + "balance_loss_mlp": 1.04957294, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.6769030770257147, + "language_loss": 0.7077347, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.98828125, + "step": 2981, + "time_per_iteration": 2.453328847885132 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.03929293, + "balance_loss_mlp": 1.05124855, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.4096510966801916, + "language_loss": 0.82313269, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84522492, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0, + "step": 2982, + "time_per_iteration": 2.4727423191070557 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.04754186, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 2.0170018574221404, + "language_loss": 0.82899523, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85093689, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2983, + "time_per_iteration": 2.5544486045837402 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03234076, + "balance_loss_mlp": 1.04676509, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0025677466759175, + "language_loss": 0.84977567, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87177408, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2984, + "time_per_iteration": 2.461451530456543 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.02558494, + "balance_loss_mlp": 1.04734373, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.5972673531528874, + "language_loss": 0.89526331, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91717398, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2985, + "time_per_iteration": 2.5644643306732178 + }, + { + "auxiliary_loss_clip": 0.01142965, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02879906, + "balance_loss_mlp": 1.0478375, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9029387971382474, + "language_loss": 0.69863129, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72051299, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 2986, + "time_per_iteration": 2.4629499912261963 + }, + { + "auxiliary_loss_clip": 0.01144523, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02507591, + "balance_loss_mlp": 1.04828227, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.203273814413497, + "language_loss": 0.77872753, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80060714, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96484375, + "step": 2987, + "time_per_iteration": 2.524712562561035 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.02412844, + "balance_loss_mlp": 1.04834962, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.225668764256514, + "language_loss": 0.78012109, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.8020528, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2988, + "time_per_iteration": 2.4608163833618164 + }, + { + "auxiliary_loss_clip": 0.01048374, + "auxiliary_loss_mlp": 0.01007691, + "balance_loss_clip": 1.00570035, + "balance_loss_mlp": 1.0154314, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7961406236538413, + "language_loss": 0.62767559, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64823627, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33007812, + "step": 2989, + "time_per_iteration": 2.9831957817077637 + }, + { + "auxiliary_loss_clip": 0.01146079, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04836369, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4434429944335525, + "language_loss": 0.70464563, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72651821, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.98046875, + "step": 2990, + "time_per_iteration": 2.556100606918335 + }, + { + "auxiliary_loss_clip": 0.01146243, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.0210464, + "balance_loss_mlp": 1.04735422, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7649502456354873, + "language_loss": 0.68110204, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70295459, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 2991, + "time_per_iteration": 2.6224544048309326 + }, + { + "auxiliary_loss_clip": 0.01145545, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04794931, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5716432326573742, + "language_loss": 0.82754636, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84940296, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2992, + "time_per_iteration": 2.51824951171875 + }, + { + "auxiliary_loss_clip": 0.01138637, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.02301776, + "balance_loss_mlp": 1.04464579, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.1353598877924806, + "language_loss": 0.81958085, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84137177, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 2993, + "time_per_iteration": 2.4349074363708496 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.02889621, + "balance_loss_mlp": 1.04586673, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7254805142405878, + "language_loss": 0.78390837, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80581975, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2994, + "time_per_iteration": 2.4898691177368164 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.04966068, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.8689491925476576, + "language_loss": 0.80392146, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82584035, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2995, + "time_per_iteration": 2.4521572589874268 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.04679298, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5635152056288029, + "language_loss": 0.84467834, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86658335, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.97265625, + "step": 2996, + "time_per_iteration": 2.46993088722229 + }, + { + "auxiliary_loss_clip": 0.01139788, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.04656756, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.9197857622903793, + "language_loss": 0.88254511, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90436304, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 2997, + "time_per_iteration": 2.470113515853882 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.02435279, + "balance_loss_mlp": 1.04666877, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.5635961030192935, + "language_loss": 0.8504566, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87237728, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2998, + "time_per_iteration": 2.5252864360809326 + }, + { + "auxiliary_loss_clip": 0.0114547, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783537, + "balance_loss_mlp": 1.05022454, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8695557812200347, + "language_loss": 0.84270376, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86460871, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 2999, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.01143823, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.02577412, + "balance_loss_mlp": 1.04662383, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7700032623605295, + "language_loss": 0.74753368, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76941276, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 3000, + "time_per_iteration": 2.4800922870635986 + }, + { + "auxiliary_loss_clip": 0.01143329, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.04111791, + "balance_loss_mlp": 1.04825568, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.369063359757221, + "language_loss": 0.71625632, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73829091, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3001, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.03010964, + "balance_loss_mlp": 1.04815364, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.7240097708601225, + "language_loss": 0.87795258, + "learning_rate": 3.767096425420011e-06, + "loss": 0.89992571, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 3002, + "time_per_iteration": 2.4881784915924072 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02689672, + "balance_loss_mlp": 1.04694915, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6880476069492312, + "language_loss": 0.80563951, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8275311, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9765625, + "step": 3003, + "time_per_iteration": 2.452103614807129 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.04780829, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.4630533980116804, + "language_loss": 0.66931474, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69124347, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3004, + "time_per_iteration": 2.5085701942443848 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.04860806, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.8927608809249736, + "language_loss": 0.85172975, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87370586, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.984375, + "step": 3005, + "time_per_iteration": 2.44529128074646 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02611172, + "balance_loss_mlp": 1.04684031, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.553419886600377, + "language_loss": 0.82951266, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85135704, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94921875, + "step": 3006, + "time_per_iteration": 2.532597780227661 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.0315007, + "balance_loss_mlp": 1.04581141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6363768703600998, + "language_loss": 0.76883924, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79078454, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.98046875, + "step": 3007, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.01358199, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8067080511403597, + "language_loss": 0.56949043, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59000474, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 3008, + "time_per_iteration": 3.1923961639404297 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.04951596, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8063105677439477, + "language_loss": 0.67226636, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69423479, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3009, + "time_per_iteration": 2.467525005340576 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02993655, + "balance_loss_mlp": 1.04874969, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.842230928142314, + "language_loss": 0.75573891, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77769208, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.98828125, + "step": 3010, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.02585649, + "balance_loss_mlp": 1.04816866, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6130539386655762, + "language_loss": 0.66672593, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6885612, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3011, + "time_per_iteration": 2.461749792098999 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.0309006, + "balance_loss_mlp": 1.04706419, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.1517129990512927, + "language_loss": 0.71184897, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73375839, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3012, + "time_per_iteration": 2.7380943298339844 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.03045654, + "balance_loss_mlp": 1.05109787, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.2489260815019447, + "language_loss": 0.62039113, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64232826, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3013, + "time_per_iteration": 2.5800936222076416 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.04870379, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.5535403171237991, + "language_loss": 0.76026124, + "learning_rate": 3.764902795998309e-06, + "loss": 0.7822203, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3014, + "time_per_iteration": 2.5049405097961426 + }, + { + "auxiliary_loss_clip": 0.01151342, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.02697504, + "balance_loss_mlp": 1.05086446, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7733972454950666, + "language_loss": 0.65696967, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67894971, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3015, + "time_per_iteration": 2.52614426612854 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.0262742, + "balance_loss_mlp": 1.0490694, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7500400577379265, + "language_loss": 0.7809943, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80287266, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3016, + "time_per_iteration": 2.4736039638519287 + }, + { + "auxiliary_loss_clip": 0.01152649, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.03214788, + "balance_loss_mlp": 1.05294776, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6390488083316745, + "language_loss": 0.83498454, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85701871, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 3017, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01142751, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.0486486, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2301629944757964, + "language_loss": 0.67067724, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69249976, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3018, + "time_per_iteration": 3.950299024581909 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.04928112, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.174717508383113, + "language_loss": 0.75745898, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77930045, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 3019, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02653718, + "balance_loss_mlp": 1.05230832, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1373464597463574, + "language_loss": 0.81687438, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83882844, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3020, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02373672, + "balance_loss_mlp": 1.05124021, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.9178918869439654, + "language_loss": 0.77220714, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.96875, + "step": 3021, + "time_per_iteration": 2.4856297969818115 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.04617524, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7066661124221545, + "language_loss": 0.84841502, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87025082, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3022, + "time_per_iteration": 2.4933700561523438 + }, + { + "auxiliary_loss_clip": 0.01148694, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.0491302, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9452352079001236, + "language_loss": 0.69178426, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7136941, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3023, + "time_per_iteration": 2.495107412338257 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.04748738, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9417078000950883, + "language_loss": 0.73956865, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76145792, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3024, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.0490942, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.344564071286257, + "language_loss": 0.88167858, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90356255, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3025, + "time_per_iteration": 2.4708051681518555 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01052568, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.05046904, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.755473586939447, + "language_loss": 0.79284346, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.8148272, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3026, + "time_per_iteration": 2.482987403869629 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.03424227, + "balance_loss_mlp": 1.0502665, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6571051349992714, + "language_loss": 0.76047945, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78250599, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98828125, + "step": 3027, + "time_per_iteration": 2.4952149391174316 + }, + { + "auxiliary_loss_clip": 0.01151758, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.03055763, + "balance_loss_mlp": 1.05106115, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.7989426432275553, + "language_loss": 0.85400331, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87601155, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3028, + "time_per_iteration": 2.438113212585449 + }, + { + "auxiliary_loss_clip": 0.01144845, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.04937243, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8205418995180693, + "language_loss": 0.82655656, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84843719, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3029, + "time_per_iteration": 2.4866995811462402 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.02640462, + "balance_loss_mlp": 1.05306637, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0975281503542433, + "language_loss": 0.78150737, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80348092, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3030, + "time_per_iteration": 2.458627700805664 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.02495515, + "balance_loss_mlp": 1.05141127, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.955618442063123, + "language_loss": 0.85318518, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87512928, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.99609375, + "step": 3031, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01045881, + "balance_loss_clip": 1.02824974, + "balance_loss_mlp": 1.05232072, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8744751837074634, + "language_loss": 0.79713088, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3032, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.0260191, + "balance_loss_mlp": 1.05395341, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.0774072235136964, + "language_loss": 0.81420642, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8362143, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0234375, + "step": 3033, + "time_per_iteration": 2.47562575340271 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01006645, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01995599, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8883360043233282, + "language_loss": 0.63479006, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6553843, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.328125, + "step": 3034, + "time_per_iteration": 2.9712142944335938 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.0263083, + "balance_loss_mlp": 1.05033147, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.0132790953316113, + "language_loss": 0.79684323, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81876773, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3035, + "time_per_iteration": 2.4517030715942383 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.02792096, + "balance_loss_mlp": 1.05231702, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.217606261766961, + "language_loss": 0.84895855, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87087989, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3036, + "time_per_iteration": 2.5017378330230713 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.02524662, + "balance_loss_mlp": 1.04940438, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.216717642760365, + "language_loss": 0.79836094, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82021284, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3037, + "time_per_iteration": 2.4591338634490967 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.05208671, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.68131613553598, + "language_loss": 0.79450762, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81647676, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.9765625, + "step": 3038, + "time_per_iteration": 2.440664768218994 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.03069699, + "balance_loss_mlp": 1.05140162, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3213350225315748, + "language_loss": 0.67311364, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69506919, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3039, + "time_per_iteration": 2.573272466659546 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03323567, + "balance_loss_mlp": 1.05112875, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.9125298187860031, + "language_loss": 0.73687911, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75888336, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3040, + "time_per_iteration": 2.771242618560791 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.04849768, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.8780343880464916, + "language_loss": 0.60176188, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62363702, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3041, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.03756928, + "balance_loss_mlp": 1.05012786, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7488247873746179, + "language_loss": 0.60361505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.6256364, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3042, + "time_per_iteration": 2.7942960262298584 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.04945385, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.6831322617730042, + "language_loss": 0.8769263, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.8988626, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94921875, + "step": 3043, + "time_per_iteration": 2.524871587753296 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.05107832, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9464603469819268, + "language_loss": 0.707008, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72899425, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3044, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01055406, + "balance_loss_clip": 1.03552175, + "balance_loss_mlp": 1.04929996, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.0901220952627497, + "language_loss": 0.64385587, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66591471, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 3045, + "time_per_iteration": 2.592855453491211 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.04977548, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.998731206361719, + "language_loss": 0.79165137, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81365317, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3046, + "time_per_iteration": 2.5034587383270264 + }, + { + "auxiliary_loss_clip": 0.01146985, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.04764223, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3529268295267016, + "language_loss": 0.78991181, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81186271, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 3047, + "time_per_iteration": 2.5140535831451416 + }, + { + "auxiliary_loss_clip": 0.01145799, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02923381, + "balance_loss_mlp": 1.05111742, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5613113238500957, + "language_loss": 0.80888635, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83081341, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3048, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.02283192, + "balance_loss_mlp": 1.0502528, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8161394933049422, + "language_loss": 0.86232805, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88422966, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9765625, + "step": 3049, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.05159521, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.2703740748038066, + "language_loss": 0.77160966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79358685, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 3050, + "time_per_iteration": 2.4525256156921387 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.02683592, + "balance_loss_mlp": 1.04867804, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.010292972394078, + "language_loss": 0.99174476, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.0136615, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3051, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01145751, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.050529, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5992624239842805, + "language_loss": 0.86153144, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3052, + "time_per_iteration": 2.559396505355835 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.03267264, + "balance_loss_mlp": 1.04985499, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8182752776897229, + "language_loss": 0.73004341, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75200558, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3053, + "time_per_iteration": 2.4481074810028076 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03051662, + "balance_loss_mlp": 1.05208337, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6467304764216655, + "language_loss": 0.62212563, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64412701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 3054, + "time_per_iteration": 2.5701377391815186 + }, + { + "auxiliary_loss_clip": 0.01146023, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.04962707, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2210920593094325, + "language_loss": 0.78501689, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80690485, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3055, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.04779387, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.894881128028073, + "language_loss": 0.70218527, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72414786, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3056, + "time_per_iteration": 2.541361093521118 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.02612543, + "balance_loss_mlp": 1.05066419, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4932354373853338, + "language_loss": 0.8028152, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82474422, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3057, + "time_per_iteration": 2.4718995094299316 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.02864265, + "balance_loss_mlp": 1.04847729, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0112890674266914, + "language_loss": 0.82289785, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84491444, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 3058, + "time_per_iteration": 2.4653379917144775 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_clip": 1.02818882, + "balance_loss_mlp": 1.04893029, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.205773819593527, + "language_loss": 0.85894352, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88088906, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 3059, + "time_per_iteration": 4.0151047706604 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.02724195, + "balance_loss_mlp": 1.04931092, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.70952354928268, + "language_loss": 0.72799402, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74990445, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3060, + "time_per_iteration": 5.466471195220947 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.05253565, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7373746338425942, + "language_loss": 0.72797298, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74991357, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.98828125, + "step": 3061, + "time_per_iteration": 2.5244035720825195 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.02697313, + "balance_loss_mlp": 1.05087519, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8714044833418495, + "language_loss": 0.81622046, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83814156, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3062, + "time_per_iteration": 2.4767649173736572 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.02681041, + "balance_loss_mlp": 1.05394542, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.7582970194369052, + "language_loss": 0.72718614, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74918652, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3063, + "time_per_iteration": 2.5082144737243652 + }, + { + "auxiliary_loss_clip": 0.01146453, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.04935837, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.1216519555610183, + "language_loss": 0.65496099, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6768434, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3064, + "time_per_iteration": 2.523141622543335 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.03081298, + "balance_loss_mlp": 1.05274165, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.6163412642887947, + "language_loss": 0.68768656, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70966971, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3065, + "time_per_iteration": 2.5244293212890625 + }, + { + "auxiliary_loss_clip": 0.01151353, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.02819824, + "balance_loss_mlp": 1.05120087, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 4.932084281869228, + "language_loss": 0.72561431, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74760187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3066, + "time_per_iteration": 2.5428919792175293 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.05074954, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9825677919996112, + "language_loss": 0.82477474, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84669906, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3067, + "time_per_iteration": 2.4500880241394043 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00535476, + "balance_loss_mlp": 1.01668859, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7924805733675573, + "language_loss": 0.59706604, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61763, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32226562, + "step": 3068, + "time_per_iteration": 2.9375104904174805 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.05714762, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8708990955689164, + "language_loss": 0.76227212, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78420615, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3069, + "time_per_iteration": 2.462446451187134 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.02233863, + "balance_loss_mlp": 1.05299067, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7428293735192475, + "language_loss": 0.84803855, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86996043, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3070, + "time_per_iteration": 2.4887194633483887 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.02462363, + "balance_loss_mlp": 1.05298758, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.9722863584187038, + "language_loss": 0.77370453, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79565221, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 3071, + "time_per_iteration": 2.482213258743286 + }, + { + "auxiliary_loss_clip": 0.01152228, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.05342758, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.705053980849468, + "language_loss": 0.77691031, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79891801, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 3072, + "time_per_iteration": 2.466387987136841 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.05013216, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8173375196390826, + "language_loss": 0.8607235, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88264889, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3073, + "time_per_iteration": 2.4510810375213623 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.05339348, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2059027996031877, + "language_loss": 0.92005521, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9420172, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.97265625, + "step": 3074, + "time_per_iteration": 2.473710298538208 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.02490735, + "balance_loss_mlp": 1.05028176, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9913742546968862, + "language_loss": 0.65041798, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67233044, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3075, + "time_per_iteration": 2.533724784851074 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.053177, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.709240712607824, + "language_loss": 0.72323918, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74516779, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3076, + "time_per_iteration": 2.4544899463653564 + }, + { + "auxiliary_loss_clip": 0.01153692, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.0280292, + "balance_loss_mlp": 1.05341136, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4900368363969854, + "language_loss": 0.80860448, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83060181, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3077, + "time_per_iteration": 2.45137882232666 + }, + { + "auxiliary_loss_clip": 0.01146798, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02816749, + "balance_loss_mlp": 1.05103469, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7908770900539794, + "language_loss": 0.78764129, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8095665, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3078, + "time_per_iteration": 2.477393865585327 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02714717, + "balance_loss_mlp": 1.05057585, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8549646444276375, + "language_loss": 0.7758081, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773009, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9765625, + "step": 3079, + "time_per_iteration": 2.5069448947906494 + }, + { + "auxiliary_loss_clip": 0.01143899, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.02581406, + "balance_loss_mlp": 1.04723024, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.3452692712375893, + "language_loss": 0.81668431, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83855557, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3080, + "time_per_iteration": 2.688206911087036 + }, + { + "auxiliary_loss_clip": 0.01149652, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.02812803, + "balance_loss_mlp": 1.05079699, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.0276132956863764, + "language_loss": 0.7435087, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.7654745, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3081, + "time_per_iteration": 2.5003983974456787 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.03124547, + "balance_loss_mlp": 1.05527234, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.070281784994394, + "language_loss": 0.71532816, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73734742, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9609375, + "step": 3082, + "time_per_iteration": 2.514004707336426 + }, + { + "auxiliary_loss_clip": 0.011545, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03155267, + "balance_loss_mlp": 1.05488813, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 1.869200996989063, + "language_loss": 0.69338834, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71543807, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3083, + "time_per_iteration": 2.446418523788452 + }, + { + "auxiliary_loss_clip": 0.0114679, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03187287, + "balance_loss_mlp": 1.05216169, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 4.022344342016001, + "language_loss": 0.68854296, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71050388, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3084, + "time_per_iteration": 2.5964090824127197 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.04961908, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5883609883793584, + "language_loss": 0.77831411, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80020249, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3085, + "time_per_iteration": 2.500401020050049 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.04887915, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8880953488015286, + "language_loss": 0.73488086, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7568658, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3086, + "time_per_iteration": 2.5121798515319824 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.02949429, + "balance_loss_mlp": 1.05223882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 4.074676999617497, + "language_loss": 0.70087367, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72282737, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.953125, + "step": 3087, + "time_per_iteration": 2.469980001449585 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.05118215, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.299065028063824, + "language_loss": 0.72731185, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74929065, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3088, + "time_per_iteration": 2.4569249153137207 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02733839, + "balance_loss_mlp": 1.05015588, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.023411505730453, + "language_loss": 0.91849768, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94039273, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94140625, + "step": 3089, + "time_per_iteration": 2.5086276531219482 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02573323, + "balance_loss_mlp": 1.05124271, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.7535733421879174, + "language_loss": 0.57406759, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59596992, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.953125, + "step": 3090, + "time_per_iteration": 2.544934034347534 + }, + { + "auxiliary_loss_clip": 0.011443, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03133333, + "balance_loss_mlp": 1.04945779, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9526543189913628, + "language_loss": 0.82229531, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84423304, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3091, + "time_per_iteration": 2.5339536666870117 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03165662, + "balance_loss_mlp": 1.05212235, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0588011246991127, + "language_loss": 0.83561456, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85760063, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3092, + "time_per_iteration": 2.5091474056243896 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.05010569, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 4.142827775979207, + "language_loss": 0.93487823, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95683277, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 3093, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01146588, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.02115917, + "balance_loss_mlp": 1.05090082, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.060946690404802, + "language_loss": 0.77380008, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79564774, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3094, + "time_per_iteration": 2.4520375728607178 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03098452, + "balance_loss_mlp": 1.05099964, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6535165555915046, + "language_loss": 0.69985378, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72180283, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3095, + "time_per_iteration": 2.7395834922790527 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01045107, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05169249, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.9053555001005595, + "language_loss": 0.8077082, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82965505, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.98046875, + "step": 3096, + "time_per_iteration": 2.4506232738494873 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05086875, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.061308652340225, + "language_loss": 0.75101036, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77295941, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3097, + "time_per_iteration": 2.46639347076416 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.05196047, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.5365100966912664, + "language_loss": 0.66038394, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68231571, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3098, + "time_per_iteration": 2.46763014793396 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.02973545, + "balance_loss_mlp": 1.04978585, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6025275160282182, + "language_loss": 0.69907904, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72105503, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 3099, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.03290749, + "balance_loss_mlp": 1.04985309, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4888180158498334, + "language_loss": 0.71623552, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73823702, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 3100, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.03104091, + "balance_loss_mlp": 1.05147338, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.2181859131844757, + "language_loss": 0.80163074, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82364118, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3101, + "time_per_iteration": 4.007607936859131 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.05100489, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.082156961368248, + "language_loss": 0.76803768, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78991693, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3102, + "time_per_iteration": 5.438685894012451 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02322865, + "balance_loss_mlp": 1.04973269, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.5595226686006565, + "language_loss": 0.76962835, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79151165, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3103, + "time_per_iteration": 2.4742202758789062 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02540123, + "balance_loss_mlp": 1.05014729, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.966347666558745, + "language_loss": 0.79074025, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81264877, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3104, + "time_per_iteration": 2.4873924255371094 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.02653468, + "balance_loss_mlp": 1.05237842, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.943867006204371, + "language_loss": 0.8519029, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87382948, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3105, + "time_per_iteration": 2.488638162612915 + }, + { + "auxiliary_loss_clip": 0.01152184, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.02872288, + "balance_loss_mlp": 1.0491997, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.7838474228223986, + "language_loss": 0.86952424, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89152563, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 3106, + "time_per_iteration": 2.5103402137756348 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.05296755, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9680738799082358, + "language_loss": 0.78253353, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80451989, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 3107, + "time_per_iteration": 2.44567608833313 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.03179181, + "balance_loss_mlp": 1.05040216, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.9125203241398734, + "language_loss": 0.74114668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76316506, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3108, + "time_per_iteration": 2.5254971981048584 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.02684629, + "balance_loss_mlp": 1.05332017, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6536820415924105, + "language_loss": 0.74707133, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76903957, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98046875, + "step": 3109, + "time_per_iteration": 2.426945924758911 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.02845001, + "balance_loss_mlp": 1.05078959, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.4293009008592994, + "language_loss": 0.84324062, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86519247, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3110, + "time_per_iteration": 2.4744956493377686 + }, + { + "auxiliary_loss_clip": 0.01154611, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02472341, + "balance_loss_mlp": 1.05598927, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.633662412254079, + "language_loss": 0.84753799, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86951482, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3111, + "time_per_iteration": 2.4757230281829834 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.02408528, + "balance_loss_mlp": 1.05231404, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8513735900463348, + "language_loss": 0.76565534, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78757566, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9765625, + "step": 3112, + "time_per_iteration": 2.465552806854248 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.0516355, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8580615351340177, + "language_loss": 0.64277315, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66475397, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3113, + "time_per_iteration": 2.491805076599121 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.0528996, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.238258329288858, + "language_loss": 0.81043601, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83247173, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 3114, + "time_per_iteration": 2.4947290420532227 + }, + { + "auxiliary_loss_clip": 0.01153492, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.05319226, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2102322241331467, + "language_loss": 0.57819968, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0, + "step": 3115, + "time_per_iteration": 2.4892075061798096 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.05434299, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8141768865365742, + "language_loss": 0.71160758, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73368567, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96484375, + "step": 3116, + "time_per_iteration": 2.4705467224121094 + }, + { + "auxiliary_loss_clip": 0.01142667, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01751065, + "balance_loss_mlp": 1.04771161, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.8736078530078255, + "language_loss": 0.78733885, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80912256, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3117, + "time_per_iteration": 2.418527126312256 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02898717, + "balance_loss_mlp": 1.05421317, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.743274375857092, + "language_loss": 0.83945131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86145031, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.97265625, + "step": 3118, + "time_per_iteration": 2.5691416263580322 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.03109384, + "balance_loss_mlp": 1.0525409, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7594323212393352, + "language_loss": 0.76151264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78351927, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3119, + "time_per_iteration": 2.459648847579956 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.05181718, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.593515591831454, + "language_loss": 0.81975627, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84180319, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3120, + "time_per_iteration": 2.478870153427124 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.05178094, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.7598733043788508, + "language_loss": 0.8513701, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.873285, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3121, + "time_per_iteration": 2.5178277492523193 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.02976704, + "balance_loss_mlp": 1.05281448, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.163070382320244, + "language_loss": 0.70038795, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72237968, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 3122, + "time_per_iteration": 2.5523242950439453 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02524245, + "balance_loss_mlp": 1.05194402, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.352571744641408, + "language_loss": 0.7034744, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72541201, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9609375, + "step": 3123, + "time_per_iteration": 2.4145569801330566 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.05238771, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.0330816469172097, + "language_loss": 0.73851109, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76047611, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3124, + "time_per_iteration": 2.497352123260498 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.05275774, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9990758157966066, + "language_loss": 0.80601895, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82805508, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0, + "step": 3125, + "time_per_iteration": 2.605851411819458 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01009024, + "balance_loss_clip": 1.00697315, + "balance_loss_mlp": 1.02352476, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9386177249275542, + "language_loss": 0.63591504, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656781, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.328125, + "step": 3126, + "time_per_iteration": 3.0943961143493652 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.02366543, + "balance_loss_mlp": 1.05439222, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.7984129752859428, + "language_loss": 0.81274688, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83466977, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3127, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01054784, + "auxiliary_loss_mlp": 0.0100739, + "balance_loss_clip": 1.00543487, + "balance_loss_mlp": 1.02235639, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7620779230288282, + "language_loss": 0.6191628, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63978451, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.32421875, + "step": 3128, + "time_per_iteration": 3.1384503841400146 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.05182266, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.171302965646948, + "language_loss": 0.71237707, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73433876, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 3129, + "time_per_iteration": 2.560601234436035 + }, + { + "auxiliary_loss_clip": 0.01149923, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.05224252, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.040923932078449, + "language_loss": 0.85375232, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87576246, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3130, + "time_per_iteration": 2.4366040229797363 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.04844868, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9842347260172397, + "language_loss": 0.77227372, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0, + "step": 3131, + "time_per_iteration": 2.503112554550171 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 1.05402517, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8095346888628816, + "language_loss": 0.81244844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.834436, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.96875, + "step": 3132, + "time_per_iteration": 2.5265986919403076 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.02939904, + "balance_loss_mlp": 1.05395401, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.3595669444771135, + "language_loss": 0.79035556, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81238532, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3133, + "time_per_iteration": 2.500927209854126 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.03224421, + "balance_loss_mlp": 1.05204821, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 4.024150314183157, + "language_loss": 0.82826144, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3134, + "time_per_iteration": 2.4773380756378174 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.05027199, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4735244825899, + "language_loss": 0.82783771, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8497771, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96484375, + "step": 3135, + "time_per_iteration": 2.4957115650177 + }, + { + "auxiliary_loss_clip": 0.01149872, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.03343356, + "balance_loss_mlp": 1.0503304, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8513380433423674, + "language_loss": 0.79031271, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81233823, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9921875, + "step": 3136, + "time_per_iteration": 2.556800127029419 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.05327463, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9366242888645147, + "language_loss": 0.81049621, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 3137, + "time_per_iteration": 2.487513542175293 + }, + { + "auxiliary_loss_clip": 0.01151307, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.05406666, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5870634004860276, + "language_loss": 0.8119483, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83403158, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.97265625, + "step": 3138, + "time_per_iteration": 2.4554855823516846 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.05190897, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.760814692015778, + "language_loss": 0.636096, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6581319, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 3139, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01146092, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.03046215, + "balance_loss_mlp": 1.04812348, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.433795452320061, + "language_loss": 0.71546841, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73742986, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98046875, + "step": 3140, + "time_per_iteration": 2.4519457817077637 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 1.04848385, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.1391974719951574, + "language_loss": 0.87001872, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89196658, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98828125, + "step": 3141, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.02460694, + "balance_loss_mlp": 1.05144691, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.021325930100965, + "language_loss": 0.77418405, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79616946, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0234375, + "step": 3142, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.05104184, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6841374820722228, + "language_loss": 0.78446913, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80637825, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.97265625, + "step": 3143, + "time_per_iteration": 3.9074132442474365 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_clip": 1.03081727, + "balance_loss_mlp": 1.05069065, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 4.1822349926512485, + "language_loss": 0.71507585, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73707104, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 3144, + "time_per_iteration": 3.981715679168701 + }, + { + "auxiliary_loss_clip": 0.01152034, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.0513736, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.6203593578621893, + "language_loss": 0.73683178, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75880861, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3145, + "time_per_iteration": 2.5101706981658936 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.02783298, + "balance_loss_mlp": 1.04759097, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.6756165752276027, + "language_loss": 0.77081764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79271269, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3146, + "time_per_iteration": 2.4278056621551514 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.02811205, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.794796296308648, + "language_loss": 0.78377169, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80571997, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3147, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.0310235, + "balance_loss_mlp": 1.0499115, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.2769360880247853, + "language_loss": 0.67016155, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69212711, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3148, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01145427, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02037382, + "balance_loss_mlp": 1.04898858, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.644784357412393, + "language_loss": 0.75978655, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78161824, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3149, + "time_per_iteration": 2.4768459796905518 + }, + { + "auxiliary_loss_clip": 0.01143839, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.05033517, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.9181295874949735, + "language_loss": 0.81229341, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83420789, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3150, + "time_per_iteration": 2.42832088470459 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02886271, + "balance_loss_mlp": 1.05068374, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.066054594612055, + "language_loss": 0.84966886, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87161517, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3151, + "time_per_iteration": 2.458054542541504 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 1.04896331, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9128881662164896, + "language_loss": 0.7443462, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76635695, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.99609375, + "step": 3152, + "time_per_iteration": 2.4904792308807373 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.02937067, + "balance_loss_mlp": 1.0502255, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8661622565083957, + "language_loss": 0.75719136, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77914143, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3153, + "time_per_iteration": 2.5026283264160156 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.04962945, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.8393709351558127, + "language_loss": 0.79529279, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81725931, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 3154, + "time_per_iteration": 2.4544081687927246 + }, + { + "auxiliary_loss_clip": 0.01151316, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02919698, + "balance_loss_mlp": 1.04986668, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.673670363277482, + "language_loss": 0.72798991, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74998182, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 3155, + "time_per_iteration": 2.425431728363037 + }, + { + "auxiliary_loss_clip": 0.01145009, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.03042662, + "balance_loss_mlp": 1.04930019, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.676026678838244, + "language_loss": 0.73911691, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76105046, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3156, + "time_per_iteration": 2.4683640003204346 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.03281915, + "balance_loss_mlp": 1.05195308, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.5984593201401434, + "language_loss": 0.68251741, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70451397, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9765625, + "step": 3157, + "time_per_iteration": 2.472182512283325 + }, + { + "auxiliary_loss_clip": 0.01146139, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.02539706, + "balance_loss_mlp": 1.04914486, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.9937577865402571, + "language_loss": 0.80197155, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82386756, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3158, + "time_per_iteration": 2.4978723526000977 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02624583, + "balance_loss_mlp": 1.05201745, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.9065090881698699, + "language_loss": 0.71940476, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74138498, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 3159, + "time_per_iteration": 2.503129720687866 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.05255282, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8597759984302606, + "language_loss": 0.85071993, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8727113, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3160, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.03235734, + "balance_loss_mlp": 1.050807, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7120140162377986, + "language_loss": 0.73554128, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75746381, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3161, + "time_per_iteration": 2.5551726818084717 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.02982974, + "balance_loss_mlp": 1.05420387, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.554139282497156, + "language_loss": 0.80939364, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83137655, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3162, + "time_per_iteration": 2.609764337539673 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.02486265, + "balance_loss_mlp": 1.05257571, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 1.8884975109329094, + "language_loss": 0.75600141, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77792686, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3163, + "time_per_iteration": 2.4494824409484863 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 1.05577397, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5980783305445414, + "language_loss": 0.74197054, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76386476, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.94140625, + "step": 3164, + "time_per_iteration": 2.5901739597320557 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.02151656, + "balance_loss_mlp": 1.05402589, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5830796140792522, + "language_loss": 0.66913098, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69101042, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3165, + "time_per_iteration": 2.899500608444214 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.05282831, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1716027754337257, + "language_loss": 0.7452209, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76715726, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3166, + "time_per_iteration": 2.4325685501098633 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01017161, + "balance_loss_clip": 1.01490772, + "balance_loss_mlp": 1.02902174, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8067170187870535, + "language_loss": 0.50396568, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52476352, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.3359375, + "step": 3167, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.05208659, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7496006549093657, + "language_loss": 0.74235475, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76431435, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3168, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01004786, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.02649927, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8615778549663292, + "language_loss": 0.60097563, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6216197, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.33203125, + "step": 3169, + "time_per_iteration": 2.958176851272583 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.03371537, + "balance_loss_mlp": 1.05302989, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.550337238497042, + "language_loss": 0.77976263, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80180222, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.99609375, + "step": 3170, + "time_per_iteration": 2.5174756050109863 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.02960575, + "balance_loss_mlp": 1.05185819, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7193055204742105, + "language_loss": 0.78597021, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80789012, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3171, + "time_per_iteration": 2.4895551204681396 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.05111575, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 3.5246110250440386, + "language_loss": 0.78578937, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80772865, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3172, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.05253482, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.581476317811461, + "language_loss": 0.80126482, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82329178, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3173, + "time_per_iteration": 2.464979410171509 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.03432083, + "balance_loss_mlp": 1.05250478, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9222394249434893, + "language_loss": 0.78740567, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.8094219, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3174, + "time_per_iteration": 2.540959358215332 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.05367374, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8458147293094664, + "language_loss": 0.80757344, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82954776, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3175, + "time_per_iteration": 2.441190481185913 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03821599, + "balance_loss_mlp": 1.0521791, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.3562328324004445, + "language_loss": 0.85142022, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87347412, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3176, + "time_per_iteration": 2.4397072792053223 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.036515, + "balance_loss_mlp": 1.05395234, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.002060812172469, + "language_loss": 0.81206596, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415473, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3177, + "time_per_iteration": 2.4980266094207764 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.03073931, + "balance_loss_mlp": 1.0503974, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 1.9374450898751996, + "language_loss": 0.74628592, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3178, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.05001104, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8429055258583904, + "language_loss": 0.8167876, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83865643, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3179, + "time_per_iteration": 2.452310800552368 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02892482, + "balance_loss_mlp": 1.05279994, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.1508657656276484, + "language_loss": 0.7946887, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81664455, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3180, + "time_per_iteration": 2.451066732406616 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.02770984, + "balance_loss_mlp": 1.04780042, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.916741655382754, + "language_loss": 0.79891652, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82080674, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3181, + "time_per_iteration": 2.4310615062713623 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.03623664, + "balance_loss_mlp": 1.04858851, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7607714952320546, + "language_loss": 0.73820639, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76020634, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3182, + "time_per_iteration": 2.4712350368499756 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.03093314, + "balance_loss_mlp": 1.05187011, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8018319163421928, + "language_loss": 0.6486634, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67063105, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 3183, + "time_per_iteration": 2.440232753753662 + }, + { + "auxiliary_loss_clip": 0.01145449, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.02920759, + "balance_loss_mlp": 1.04864669, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.760716170695104, + "language_loss": 0.73234087, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7542752, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3184, + "time_per_iteration": 3.9211573600769043 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0281471, + "balance_loss_mlp": 1.04738748, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.1066155051108315, + "language_loss": 0.8784132, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9003495, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 3185, + "time_per_iteration": 5.396124601364136 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.02803612, + "balance_loss_mlp": 1.04899192, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.10102369978198, + "language_loss": 0.72667789, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74857807, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3186, + "time_per_iteration": 2.498241901397705 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02244437, + "balance_loss_mlp": 1.054919, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.264264166459479, + "language_loss": 0.83865881, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86061311, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 3187, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01051867, + "auxiliary_loss_mlp": 0.01015636, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01988959, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8634842964488614, + "language_loss": 0.55803859, + "learning_rate": 3.732018351516544e-06, + "loss": 0.5787136, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3188, + "time_per_iteration": 3.0815136432647705 + }, + { + "auxiliary_loss_clip": 0.01145462, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.04972625, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.71302722892552, + "language_loss": 0.70180511, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72381759, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.95703125, + "step": 3189, + "time_per_iteration": 2.5380465984344482 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.04853344, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.222159201352765, + "language_loss": 0.74234986, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76410198, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3190, + "time_per_iteration": 2.5862700939178467 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.04965627, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.8818377537371913, + "language_loss": 0.8394708, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86146975, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3191, + "time_per_iteration": 2.5077905654907227 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.04766488, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7694679756443132, + "language_loss": 0.89325655, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91504252, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3192, + "time_per_iteration": 2.4738776683807373 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.0531472, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.352703418633998, + "language_loss": 0.74830496, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77034831, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9765625, + "step": 3193, + "time_per_iteration": 2.47143816947937 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.04918766, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7283890992056894, + "language_loss": 0.74733245, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7692579, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9765625, + "step": 3194, + "time_per_iteration": 2.5001959800720215 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.00319958, + "balance_loss_mlp": 1.01851392, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7975785668902318, + "language_loss": 0.68455988, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70511955, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3203125, + "step": 3195, + "time_per_iteration": 3.014677047729492 + }, + { + "auxiliary_loss_clip": 0.01146296, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0254823, + "balance_loss_mlp": 1.05066323, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 1.9672517867074575, + "language_loss": 0.72712696, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74902254, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.95703125, + "step": 3196, + "time_per_iteration": 2.4855856895446777 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.0284251, + "balance_loss_mlp": 1.05643284, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8158077484015336, + "language_loss": 0.83774233, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85972691, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.953125, + "step": 3197, + "time_per_iteration": 2.4530181884765625 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02565312, + "balance_loss_mlp": 1.05036283, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.295881830513264, + "language_loss": 0.80459738, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82650983, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3198, + "time_per_iteration": 2.4882590770721436 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01050243, + "balance_loss_clip": 1.03090763, + "balance_loss_mlp": 1.04984999, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9800701307051174, + "language_loss": 0.7862891, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80827522, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3199, + "time_per_iteration": 2.507227659225464 + }, + { + "auxiliary_loss_clip": 0.01146428, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.05150342, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.05190707233933, + "language_loss": 0.83391261, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85580671, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.94921875, + "step": 3200, + "time_per_iteration": 2.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01051054, + "balance_loss_clip": 1.03286231, + "balance_loss_mlp": 1.0524931, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.0233550639398428, + "language_loss": 0.78678542, + "learning_rate": 3.729481161172443e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.95703125, + "step": 3201, + "time_per_iteration": 2.435478448867798 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.02874875, + "balance_loss_mlp": 1.05050445, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.1716175760371814, + "language_loss": 0.69168961, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71364617, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3202, + "time_per_iteration": 2.4596354961395264 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.01790023, + "balance_loss_mlp": 1.05140352, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7015130302687178, + "language_loss": 0.91123176, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93303871, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3203, + "time_per_iteration": 2.4425902366638184 + }, + { + "auxiliary_loss_clip": 0.01147002, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.03127956, + "balance_loss_mlp": 1.05008471, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.129263396651385, + "language_loss": 0.81766933, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83964062, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96875, + "step": 3204, + "time_per_iteration": 2.4466230869293213 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.03022599, + "balance_loss_mlp": 1.0497942, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.065510679734303, + "language_loss": 0.75797462, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77988648, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3205, + "time_per_iteration": 2.439906358718872 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02958953, + "balance_loss_mlp": 1.05312991, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.4125731541540465, + "language_loss": 0.83020669, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85218459, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 3206, + "time_per_iteration": 2.463888168334961 + }, + { + "auxiliary_loss_clip": 0.0104957, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.01731467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8499440783854421, + "language_loss": 0.60609913, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62663066, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 3207, + "time_per_iteration": 2.8865902423858643 + }, + { + "auxiliary_loss_clip": 0.01147085, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.02789569, + "balance_loss_mlp": 1.05069125, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4047527057594564, + "language_loss": 0.75119245, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77312136, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3208, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01146825, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.04890394, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.3372356299161696, + "language_loss": 0.60567236, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62762815, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3209, + "time_per_iteration": 2.4695677757263184 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.04981887, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9457412312791633, + "language_loss": 0.80153656, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82352048, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 3210, + "time_per_iteration": 2.6459405422210693 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.02437103, + "balance_loss_mlp": 1.04580569, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.107646167575127, + "language_loss": 0.82575119, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84755266, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3211, + "time_per_iteration": 2.454702615737915 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01012694, + "balance_loss_clip": 1.01057243, + "balance_loss_mlp": 1.01463401, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9758169311408023, + "language_loss": 0.63670558, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729511, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.31640625, + "step": 3212, + "time_per_iteration": 2.914459705352783 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.05140018, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5978218597026725, + "language_loss": 0.76514798, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78707075, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3213, + "time_per_iteration": 2.47961163520813 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.02823281, + "balance_loss_mlp": 1.04934072, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.5461953882780115, + "language_loss": 0.70799339, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72993791, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98046875, + "step": 3214, + "time_per_iteration": 2.4547488689422607 + }, + { + "auxiliary_loss_clip": 0.01142593, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.02808392, + "balance_loss_mlp": 1.0470041, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2459266127411848, + "language_loss": 0.75352395, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77541864, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3215, + "time_per_iteration": 2.4477176666259766 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.04626155, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.304207478946857, + "language_loss": 0.88559556, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90752971, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3216, + "time_per_iteration": 2.499464988708496 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.0474, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.5978066249985532, + "language_loss": 0.79762065, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8195231, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3217, + "time_per_iteration": 2.4428889751434326 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0294652, + "balance_loss_mlp": 1.0504688, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.6606972104147673, + "language_loss": 0.61408496, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63605893, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3218, + "time_per_iteration": 2.4313230514526367 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04883909, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.6811153728366703, + "language_loss": 0.80158418, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82342821, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3219, + "time_per_iteration": 2.4347593784332275 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.03010237, + "balance_loss_mlp": 1.05070114, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.032012314604138, + "language_loss": 0.85781908, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87976086, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3220, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.0477736, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.087292049011103, + "language_loss": 0.84617937, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86794209, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3221, + "time_per_iteration": 2.4601354598999023 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.05009556, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.075109928662421, + "language_loss": 0.85929954, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88121927, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3222, + "time_per_iteration": 2.433027505874634 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.02212656, + "balance_loss_mlp": 1.04663789, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 3.9278404759018053, + "language_loss": 0.78207982, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3223, + "time_per_iteration": 2.4451496601104736 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01047584, + "balance_loss_clip": 1.03013206, + "balance_loss_mlp": 1.04896808, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8200574771064912, + "language_loss": 0.75589085, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77776659, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3224, + "time_per_iteration": 2.4390981197357178 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.02274644, + "balance_loss_mlp": 1.04741263, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.092202382915022, + "language_loss": 0.71141279, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73321629, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3225, + "time_per_iteration": 2.6690707206726074 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.0279572, + "balance_loss_mlp": 1.04787326, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.058354492672399, + "language_loss": 0.6915803, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71344984, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9453125, + "step": 3226, + "time_per_iteration": 3.906217336654663 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.02562809, + "balance_loss_mlp": 1.05274427, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6131772564475266, + "language_loss": 0.76138854, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78327405, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 3227, + "time_per_iteration": 4.168737411499023 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.02547467, + "balance_loss_mlp": 1.04588878, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8539897665707572, + "language_loss": 0.69154215, + "learning_rate": 3.724176216414662e-06, + "loss": 0.7133761, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94921875, + "step": 3228, + "time_per_iteration": 2.4857404232025146 + }, + { + "auxiliary_loss_clip": 0.01142054, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02698135, + "balance_loss_mlp": 1.04929864, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9069922854616745, + "language_loss": 0.7428174, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76467812, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3229, + "time_per_iteration": 2.5357918739318848 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04832351, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6963766145995596, + "language_loss": 0.65157712, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67341059, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3230, + "time_per_iteration": 2.4796855449676514 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.0268054, + "balance_loss_mlp": 1.04652202, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8877471342298004, + "language_loss": 0.8184334, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84025759, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3231, + "time_per_iteration": 2.5315961837768555 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.02045608, + "balance_loss_mlp": 1.05067456, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.787689187471357, + "language_loss": 0.86743605, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88928306, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94921875, + "step": 3232, + "time_per_iteration": 2.4916152954101562 + }, + { + "auxiliary_loss_clip": 0.01141636, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.05008495, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5602267859616314, + "language_loss": 0.8513217, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87326247, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3233, + "time_per_iteration": 2.526118040084839 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.03603804, + "balance_loss_mlp": 1.04827857, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6631942166294669, + "language_loss": 0.89191484, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91390419, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96875, + "step": 3234, + "time_per_iteration": 2.4783849716186523 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.04675341, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.1776085062187374, + "language_loss": 0.78503513, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80690718, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3235, + "time_per_iteration": 2.4414284229278564 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.02545178, + "balance_loss_mlp": 1.05288744, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.115791514531618, + "language_loss": 0.7937218, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81560451, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.93359375, + "step": 3236, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02204323, + "balance_loss_mlp": 1.05156302, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.445233321344346, + "language_loss": 0.75936478, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78121042, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9296875, + "step": 3237, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01147227, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.03005719, + "balance_loss_mlp": 1.05079889, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.0921387862929586, + "language_loss": 0.75056225, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77250135, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96484375, + "step": 3238, + "time_per_iteration": 2.4795806407928467 + }, + { + "auxiliary_loss_clip": 0.01147117, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.05317962, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8233855681516762, + "language_loss": 0.73016453, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75208122, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94140625, + "step": 3239, + "time_per_iteration": 2.4695816040039062 + }, + { + "auxiliary_loss_clip": 0.01144581, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.03026247, + "balance_loss_mlp": 1.0505631, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.897973355517785, + "language_loss": 0.73792124, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75985241, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3240, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.05221701, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8797415358152445, + "language_loss": 0.66685343, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68873608, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94140625, + "step": 3241, + "time_per_iteration": 2.5644116401672363 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0280863, + "balance_loss_mlp": 1.05193758, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4346271942222966, + "language_loss": 0.82889283, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85078967, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3242, + "time_per_iteration": 2.476043701171875 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01010683, + "balance_loss_clip": 1.00856066, + "balance_loss_mlp": 1.02379096, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8482804620416572, + "language_loss": 0.57572454, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59637845, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.30859375, + "step": 3243, + "time_per_iteration": 3.1217525005340576 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.05099249, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.02063631868758, + "language_loss": 0.83243412, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85431218, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3244, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.05495024, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8275576625869878, + "language_loss": 0.77049786, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79245341, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3245, + "time_per_iteration": 2.5539040565490723 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.04852772, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.8639596298576055, + "language_loss": 0.84020388, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86203486, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3246, + "time_per_iteration": 2.5018341541290283 + }, + { + "auxiliary_loss_clip": 0.0114444, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.02834511, + "balance_loss_mlp": 1.04978824, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1267063345385777, + "language_loss": 0.7636531, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78555036, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9453125, + "step": 3247, + "time_per_iteration": 2.4512898921966553 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02228367, + "balance_loss_mlp": 1.05077446, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.4744510548582124, + "language_loss": 0.75330198, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77513552, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3248, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.04661679, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.9881324270373204, + "language_loss": 0.78316575, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80499399, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3249, + "time_per_iteration": 2.475385904312134 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.0316205, + "balance_loss_mlp": 1.04948914, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.839405294960197, + "language_loss": 0.73238158, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7543031, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3250, + "time_per_iteration": 2.4548323154449463 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02206779, + "balance_loss_mlp": 1.04583359, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9014920395959154, + "language_loss": 0.79582441, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8175652, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3251, + "time_per_iteration": 2.4597084522247314 + }, + { + "auxiliary_loss_clip": 0.01142571, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.04888558, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.2143497379473613, + "language_loss": 0.83534026, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85717964, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3252, + "time_per_iteration": 2.4245967864990234 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.03026652, + "balance_loss_mlp": 1.04651105, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7806404718622555, + "language_loss": 0.73870194, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76062191, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3253, + "time_per_iteration": 2.5752809047698975 + }, + { + "auxiliary_loss_clip": 0.01142809, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.02352846, + "balance_loss_mlp": 1.04619944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.833285648050628, + "language_loss": 0.76684111, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78867137, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.96484375, + "step": 3254, + "time_per_iteration": 2.533993721008301 + }, + { + "auxiliary_loss_clip": 0.01044914, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00268674, + "balance_loss_mlp": 1.01349974, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7652407497357797, + "language_loss": 0.55344874, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5739454, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.3125, + "step": 3255, + "time_per_iteration": 3.164173126220703 + }, + { + "auxiliary_loss_clip": 0.01144973, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02407217, + "balance_loss_mlp": 1.05057478, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.650975615707017, + "language_loss": 0.7066443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7285077, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3256, + "time_per_iteration": 2.496424436569214 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.04647136, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.256610935254856, + "language_loss": 0.80055118, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82237899, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3257, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05034149, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.9567741269254724, + "language_loss": 0.74843282, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77029151, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3258, + "time_per_iteration": 2.6177120208740234 + }, + { + "auxiliary_loss_clip": 0.01142767, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.01932144, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7410781544458231, + "language_loss": 0.74462247, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7664147, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3259, + "time_per_iteration": 2.54068660736084 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01923943, + "balance_loss_mlp": 1.04965675, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.380592438675979, + "language_loss": 0.77040654, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7922256, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3260, + "time_per_iteration": 2.4983303546905518 + }, + { + "auxiliary_loss_clip": 0.01143361, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.02459061, + "balance_loss_mlp": 1.0486325, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.011568492365706, + "language_loss": 0.82168972, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84354162, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3261, + "time_per_iteration": 2.52164626121521 + }, + { + "auxiliary_loss_clip": 0.01144228, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02510428, + "balance_loss_mlp": 1.05130327, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 2.1812525814986112, + "language_loss": 0.76691413, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78878343, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 3262, + "time_per_iteration": 2.513619899749756 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.05290008, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7175684177653927, + "language_loss": 0.8667773, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88867593, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3263, + "time_per_iteration": 2.49373459815979 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.04784787, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5660143494742738, + "language_loss": 0.74136549, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76319206, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9296875, + "step": 3264, + "time_per_iteration": 2.4891843795776367 + }, + { + "auxiliary_loss_clip": 0.0114591, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.03340793, + "balance_loss_mlp": 1.05435038, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 4.0742741532711975, + "language_loss": 0.78590196, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8078593, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3265, + "time_per_iteration": 2.4226529598236084 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01014864, + "balance_loss_clip": 1.01292133, + "balance_loss_mlp": 1.01652646, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7852387786228787, + "language_loss": 0.53459084, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55521357, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.30859375, + "step": 3266, + "time_per_iteration": 3.0519652366638184 + }, + { + "auxiliary_loss_clip": 0.01145434, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02263319, + "balance_loss_mlp": 1.04800785, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9728388819613873, + "language_loss": 0.80503136, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82690066, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3267, + "time_per_iteration": 2.436455011367798 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.02726591, + "balance_loss_mlp": 1.04780269, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.528633756775916, + "language_loss": 0.87031806, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89213896, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91015625, + "step": 3268, + "time_per_iteration": 5.348580360412598 + }, + { + "auxiliary_loss_clip": 0.01141651, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02865744, + "balance_loss_mlp": 1.04996669, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.7845337804652086, + "language_loss": 0.69331455, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71518886, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3269, + "time_per_iteration": 3.9386346340179443 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.03361702, + "balance_loss_mlp": 1.0530045, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 2.4386480468071086, + "language_loss": 0.80760634, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82960677, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3270, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.04726839, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.857854204827715, + "language_loss": 0.83918732, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86103886, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3271, + "time_per_iteration": 2.4522581100463867 + }, + { + "auxiliary_loss_clip": 0.01139583, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.0297302, + "balance_loss_mlp": 1.04943895, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.1376155358713835, + "language_loss": 0.80162311, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82348382, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 3272, + "time_per_iteration": 2.4968738555908203 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02766371, + "balance_loss_mlp": 1.05075002, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7855512393811417, + "language_loss": 0.80728978, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82919937, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3273, + "time_per_iteration": 2.525407552719116 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.04807115, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 5.081990879764466, + "language_loss": 0.7791425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80108881, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3274, + "time_per_iteration": 2.527858018875122 + }, + { + "auxiliary_loss_clip": 0.01141542, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03440571, + "balance_loss_mlp": 1.04765558, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.1984029701042367, + "language_loss": 0.81144857, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83338642, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9375, + "step": 3275, + "time_per_iteration": 2.451392412185669 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.02934027, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.90284229785688, + "language_loss": 0.81104618, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83295637, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3276, + "time_per_iteration": 2.462033748626709 + }, + { + "auxiliary_loss_clip": 0.01142306, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.02132106, + "balance_loss_mlp": 1.04889154, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.0909421048868126, + "language_loss": 0.89347923, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91528654, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3277, + "time_per_iteration": 2.4887003898620605 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02804041, + "balance_loss_mlp": 1.04832077, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.9974095646387573, + "language_loss": 0.62265754, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64459741, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3278, + "time_per_iteration": 2.560401201248169 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.03562284, + "balance_loss_mlp": 1.04910243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 3.1131920881239936, + "language_loss": 0.73664343, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75863284, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3279, + "time_per_iteration": 2.5036048889160156 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02816486, + "balance_loss_mlp": 1.04906511, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6310774806952162, + "language_loss": 0.82451236, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84641075, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.95703125, + "step": 3280, + "time_per_iteration": 2.499962091445923 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.05086279, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.847926035637751, + "language_loss": 0.77581155, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79770064, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3281, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.0507971, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.593504057665797, + "language_loss": 0.79502213, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81686652, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3282, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.0279994, + "balance_loss_mlp": 1.05359089, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.157912578421005, + "language_loss": 0.71937042, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7413193, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3283, + "time_per_iteration": 2.5070157051086426 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.04858577, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.741034644212953, + "language_loss": 0.78832877, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81017548, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3284, + "time_per_iteration": 2.436530113220215 + }, + { + "auxiliary_loss_clip": 0.01147439, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.02952087, + "balance_loss_mlp": 1.05069387, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0622477624774325, + "language_loss": 0.86366653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88561547, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96875, + "step": 3285, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.01143401, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02571201, + "balance_loss_mlp": 1.0520879, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.7361177014734372, + "language_loss": 0.88680863, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90866709, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3286, + "time_per_iteration": 2.472475290298462 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.03036189, + "balance_loss_mlp": 1.05260301, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2372981039860833, + "language_loss": 0.78297567, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80495083, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3287, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02674246, + "balance_loss_mlp": 1.04974318, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.197025185749627, + "language_loss": 0.81252837, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83444452, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96484375, + "step": 3288, + "time_per_iteration": 2.4107155799865723 + }, + { + "auxiliary_loss_clip": 0.01139417, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.04890108, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7615970311636253, + "language_loss": 0.72502065, + "learning_rate": 3.712015717627374e-06, + "loss": 0.74691164, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3289, + "time_per_iteration": 2.4479291439056396 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.0500598, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.0523474932115833, + "language_loss": 0.7944051, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81629974, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3290, + "time_per_iteration": 2.499950408935547 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.01002976, + "balance_loss_clip": 1.00056827, + "balance_loss_mlp": 1.01336336, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9098407078047199, + "language_loss": 0.60440773, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62489194, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.3203125, + "step": 3291, + "time_per_iteration": 3.1538305282592773 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.02639592, + "balance_loss_mlp": 1.04670751, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.151168561582294, + "language_loss": 0.81352198, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83541822, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3292, + "time_per_iteration": 2.539417028427124 + }, + { + "auxiliary_loss_clip": 0.01137712, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03051507, + "balance_loss_mlp": 1.04855824, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.212806192124084, + "language_loss": 0.82146955, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84332335, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 3293, + "time_per_iteration": 2.438809394836426 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.02988923, + "balance_loss_mlp": 1.05333924, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.10438249616411, + "language_loss": 0.61268854, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63468528, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3294, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01143209, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.02681279, + "balance_loss_mlp": 1.05004907, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.028666267444235, + "language_loss": 0.86983609, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89170212, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3295, + "time_per_iteration": 2.416771411895752 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.04786801, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 13.771873008268457, + "language_loss": 0.80491048, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82684338, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9375, + "step": 3296, + "time_per_iteration": 2.450934648513794 + }, + { + "auxiliary_loss_clip": 0.01145402, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.02637851, + "balance_loss_mlp": 1.0482688, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.0804115334054134, + "language_loss": 0.68406892, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70597816, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.97265625, + "step": 3297, + "time_per_iteration": 2.5111610889434814 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.02440548, + "balance_loss_mlp": 1.04895413, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7575465421519259, + "language_loss": 0.81232154, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83411407, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 3298, + "time_per_iteration": 2.472025156021118 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.02634668, + "balance_loss_mlp": 1.05001056, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.343960149367745, + "language_loss": 0.85115641, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.9765625, + "step": 3299, + "time_per_iteration": 2.4725356101989746 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.00097358, + "balance_loss_mlp": 1.0131526, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7731212371218976, + "language_loss": 0.53215671, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55264044, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3300, + "time_per_iteration": 3.004054069519043 + }, + { + "auxiliary_loss_clip": 0.01142157, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.04772329, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6138936044346288, + "language_loss": 0.73150593, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75344324, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9453125, + "step": 3301, + "time_per_iteration": 2.4547884464263916 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.02191293, + "balance_loss_mlp": 1.04811358, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.437382428027231, + "language_loss": 0.88445318, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90624458, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3302, + "time_per_iteration": 2.429579019546509 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.02940273, + "balance_loss_mlp": 1.04750872, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.9503370408087137, + "language_loss": 0.73907369, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76096445, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3303, + "time_per_iteration": 2.627835273742676 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.04874539, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8172241344194675, + "language_loss": 0.74761099, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76950562, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3304, + "time_per_iteration": 2.551241397857666 + }, + { + "auxiliary_loss_clip": 0.01139854, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02333784, + "balance_loss_mlp": 1.04763281, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 2.605019982075021, + "language_loss": 0.85717452, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.87896717, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3305, + "time_per_iteration": 2.432363986968994 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.04600525, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.7555780714506408, + "language_loss": 0.68014234, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70195889, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.953125, + "step": 3306, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01137681, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.02098584, + "balance_loss_mlp": 1.0453912, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4744708200758283, + "language_loss": 0.76455241, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.921875, + "step": 3307, + "time_per_iteration": 2.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.02520776, + "balance_loss_mlp": 1.04866791, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8666050855147507, + "language_loss": 0.75933248, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78115153, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3308, + "time_per_iteration": 2.483060121536255 + }, + { + "auxiliary_loss_clip": 0.01141228, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.02426159, + "balance_loss_mlp": 1.04736626, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6368693105847256, + "language_loss": 0.75640005, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7782228, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94140625, + "step": 3309, + "time_per_iteration": 3.8069632053375244 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.02869844, + "balance_loss_mlp": 1.04665506, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.6858420956549012, + "language_loss": 0.87646699, + "learning_rate": 3.707773333313917e-06, + "loss": 0.8983165, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9140625, + "step": 3310, + "time_per_iteration": 3.9299721717834473 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.02431977, + "balance_loss_mlp": 1.04637599, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.6845239503362412, + "language_loss": 0.64166129, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66346431, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3311, + "time_per_iteration": 2.5747337341308594 + }, + { + "auxiliary_loss_clip": 0.01143032, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.02559805, + "balance_loss_mlp": 1.04768658, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.462607887220823, + "language_loss": 0.74053729, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.953125, + "step": 3312, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.03060961, + "balance_loss_mlp": 1.04843581, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2841450786746016, + "language_loss": 0.83511955, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8569997, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3313, + "time_per_iteration": 2.4846627712249756 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.04944849, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.438256379955746, + "language_loss": 0.80930895, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83115256, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3314, + "time_per_iteration": 2.525754928588867 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.0280745, + "balance_loss_mlp": 1.04706359, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5137591341622172, + "language_loss": 0.87549174, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89729953, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3315, + "time_per_iteration": 2.5170931816101074 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.03032112, + "balance_loss_mlp": 1.04808092, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5984895942740787, + "language_loss": 0.71255141, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73443246, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3316, + "time_per_iteration": 2.520071029663086 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02646089, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8439111854473917, + "language_loss": 0.66260874, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68341696, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.33203125, + "step": 3317, + "time_per_iteration": 3.1460416316986084 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01052969, + "balance_loss_clip": 1.03557682, + "balance_loss_mlp": 1.04575253, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.672944172124665, + "language_loss": 0.74319738, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76515001, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3318, + "time_per_iteration": 2.6139748096466064 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.04536486, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.900050251198073, + "language_loss": 0.78860074, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81038487, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.89453125, + "step": 3319, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.04806578, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0962453666662073, + "language_loss": 0.75462162, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.93359375, + "step": 3320, + "time_per_iteration": 2.739485263824463 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02613819, + "balance_loss_mlp": 1.04714417, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.167317842134812, + "language_loss": 0.80547488, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3321, + "time_per_iteration": 2.581353187561035 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.01003433, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.01694489, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.880630206553271, + "language_loss": 0.65178835, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67231572, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.32421875, + "step": 3322, + "time_per_iteration": 2.9042704105377197 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 1.01724231, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7916622121471568, + "language_loss": 0.56975091, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028506, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.328125, + "step": 3323, + "time_per_iteration": 3.2141411304473877 + }, + { + "auxiliary_loss_clip": 0.01139547, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.04839373, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.9849201654975537, + "language_loss": 0.80526733, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82701647, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3324, + "time_per_iteration": 2.5455262660980225 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.04540765, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.8681208438308643, + "language_loss": 0.53681695, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55859387, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91796875, + "step": 3325, + "time_per_iteration": 2.581782102584839 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.02337289, + "balance_loss_mlp": 1.04565668, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0672953846254027, + "language_loss": 0.86169922, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88347936, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3326, + "time_per_iteration": 2.494718551635742 + }, + { + "auxiliary_loss_clip": 0.01138244, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.04851878, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8653522915536895, + "language_loss": 0.71835959, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74012172, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3327, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02368546, + "balance_loss_mlp": 1.04750776, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.83111198959611, + "language_loss": 0.76588571, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78772372, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3328, + "time_per_iteration": 2.5083916187286377 + }, + { + "auxiliary_loss_clip": 0.01698253, + "auxiliary_loss_mlp": 0.01552284, + "balance_loss_clip": 1.52980089, + "balance_loss_mlp": 1.56677365, + "epoch": 0.20015030813166992, + "flos": 28106162236800.0, + "grad_norm": 1.6482454448342019, + "language_loss": 1.03044438, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.7143048, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3125, + "step": 3329, + "time_per_iteration": 15.37552785873413 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.03149772, + "balance_loss_mlp": 1.0504123, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.5519947176183269, + "language_loss": 0.81297028, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.8349371, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9609375, + "step": 3330, + "time_per_iteration": 2.500103712081909 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.01994956, + "balance_loss_mlp": 1.04669356, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.032272994312633, + "language_loss": 0.76649368, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78827626, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3331, + "time_per_iteration": 2.4018712043762207 + }, + { + "auxiliary_loss_clip": 0.01141733, + "auxiliary_loss_mlp": 0.01045779, + "balance_loss_clip": 1.02819538, + "balance_loss_mlp": 1.04608667, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.6582018653132529, + "language_loss": 0.79261309, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81448817, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3332, + "time_per_iteration": 2.4550859928131104 + }, + { + "auxiliary_loss_clip": 0.01045684, + "auxiliary_loss_mlp": 0.01005368, + "balance_loss_clip": 1.0036391, + "balance_loss_mlp": 1.01433849, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9315137515082259, + "language_loss": 0.61990142, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64041197, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.31445312, + "step": 3333, + "time_per_iteration": 2.9623756408691406 + }, + { + "auxiliary_loss_clip": 0.01139681, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.04501462, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1372355522021893, + "language_loss": 0.81203878, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.8338846, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9453125, + "step": 3334, + "time_per_iteration": 2.49924373626709 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_clip": 1.02938735, + "balance_loss_mlp": 1.04878521, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.1564721635267516, + "language_loss": 0.74261904, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76455814, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3335, + "time_per_iteration": 2.634608745574951 + }, + { + "auxiliary_loss_clip": 0.01150022, + "auxiliary_loss_mlp": 0.01058924, + "balance_loss_clip": 1.04205632, + "balance_loss_mlp": 1.05375338, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6943946878944693, + "language_loss": 0.79839814, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82048762, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3336, + "time_per_iteration": 2.7025394439697266 + }, + { + "auxiliary_loss_clip": 0.01145798, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.02744317, + "balance_loss_mlp": 1.04703879, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9043375292422164, + "language_loss": 0.78031212, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80223, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 3337, + "time_per_iteration": 2.5718014240264893 + }, + { + "auxiliary_loss_clip": 0.01143827, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_clip": 1.02708244, + "balance_loss_mlp": 1.0486424, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 1.9983960159800889, + "language_loss": 0.6873948, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.70928884, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94921875, + "step": 3338, + "time_per_iteration": 2.5848047733306885 + }, + { + "auxiliary_loss_clip": 0.01143098, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.03263819, + "balance_loss_mlp": 1.04853702, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.1061075345379576, + "language_loss": 0.68823779, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71016049, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9453125, + "step": 3339, + "time_per_iteration": 2.523771047592163 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.02217603, + "balance_loss_mlp": 1.05197799, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.3080693694415872, + "language_loss": 0.66263533, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68451655, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9765625, + "step": 3340, + "time_per_iteration": 2.647495985031128 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.01570475, + "balance_loss_mlp": 1.0457145, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.8472305033219696, + "language_loss": 0.74124628, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76300496, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9609375, + "step": 3341, + "time_per_iteration": 2.511585235595703 + }, + { + "auxiliary_loss_clip": 0.01140464, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.02689481, + "balance_loss_mlp": 1.04846787, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.1698717951472326, + "language_loss": 0.71578503, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73762101, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3342, + "time_per_iteration": 2.561998128890991 + }, + { + "auxiliary_loss_clip": 0.01142187, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.02871895, + "balance_loss_mlp": 1.04746354, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.9864957062525024, + "language_loss": 0.73130047, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75317556, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3343, + "time_per_iteration": 4.046127557754517 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01050047, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.04738092, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9813453341923526, + "language_loss": 0.81026411, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83218634, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94921875, + "step": 3344, + "time_per_iteration": 2.520765542984009 + }, + { + "auxiliary_loss_clip": 0.01141139, + "auxiliary_loss_mlp": 0.01050367, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.04661858, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.7491478080862684, + "language_loss": 0.83503234, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85694736, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3345, + "time_per_iteration": 4.064355373382568 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.02689624, + "balance_loss_mlp": 1.0464828, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.723487885242635, + "language_loss": 0.67909771, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70086718, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.88671875, + "step": 3346, + "time_per_iteration": 2.521949291229248 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01048866, + "balance_loss_clip": 1.03233206, + "balance_loss_mlp": 1.04726124, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.272845003166824, + "language_loss": 0.73496711, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75686157, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3347, + "time_per_iteration": 2.5316877365112305 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_clip": 1.03179908, + "balance_loss_mlp": 1.04827023, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.7467826588499227, + "language_loss": 0.86716485, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88904649, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.92578125, + "step": 3348, + "time_per_iteration": 2.5123202800750732 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.02335036, + "balance_loss_mlp": 1.04729295, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.5886148695932183, + "language_loss": 0.71200913, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73381227, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 3349, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.01144556, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.03016067, + "balance_loss_mlp": 1.04982185, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.649154800785762, + "language_loss": 0.71079665, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73272741, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9453125, + "step": 3350, + "time_per_iteration": 2.4927315711975098 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02612305, + "balance_loss_mlp": 1.05045485, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 3.2873247390310554, + "language_loss": 0.76327842, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78518331, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.953125, + "step": 3351, + "time_per_iteration": 2.5077342987060547 + }, + { + "auxiliary_loss_clip": 0.01146641, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.02555871, + "balance_loss_mlp": 1.05069637, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.662758000066145, + "language_loss": 0.80545723, + "learning_rate": 3.699202960155748e-06, + "loss": 0.8273598, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3352, + "time_per_iteration": 2.5717766284942627 + }, + { + "auxiliary_loss_clip": 0.01146315, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.05210721, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7179856660366186, + "language_loss": 0.8027631, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82462192, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3353, + "time_per_iteration": 2.6415467262268066 + }, + { + "auxiliary_loss_clip": 0.01140403, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.02512455, + "balance_loss_mlp": 1.04978478, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.050762039112588, + "language_loss": 0.8946988, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91651917, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 3354, + "time_per_iteration": 2.4780237674713135 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01011403, + "balance_loss_clip": 1.00948358, + "balance_loss_mlp": 1.0202148, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.830112597874188, + "language_loss": 0.55839282, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57902759, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.31835938, + "step": 3355, + "time_per_iteration": 3.0224292278289795 + }, + { + "auxiliary_loss_clip": 0.01140957, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02891648, + "balance_loss_mlp": 1.05068707, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5257876958196368, + "language_loss": 0.84076762, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86262929, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3356, + "time_per_iteration": 2.510615348815918 + }, + { + "auxiliary_loss_clip": 0.01152963, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.05356848, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.9361880537925584, + "language_loss": 0.688007, + "learning_rate": 3.698175095398085e-06, + "loss": 0.70997024, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 3357, + "time_per_iteration": 2.460022211074829 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.02280617, + "balance_loss_mlp": 1.0492487, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7490617907772006, + "language_loss": 0.71748042, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73933733, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3358, + "time_per_iteration": 2.563767194747925 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01054955, + "balance_loss_clip": 1.03818202, + "balance_loss_mlp": 1.04849517, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.042998238377631, + "language_loss": 0.83104217, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85298896, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3359, + "time_per_iteration": 2.531332015991211 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01001012, + "balance_loss_clip": 0.99911654, + "balance_loss_mlp": 1.02214265, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 12.853939959466139, + "language_loss": 0.5895561, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61009508, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30859375, + "step": 3360, + "time_per_iteration": 3.0536341667175293 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.03117216, + "balance_loss_mlp": 1.05149043, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.4416015649532286, + "language_loss": 0.62138069, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64334983, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3361, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0114522, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.05156183, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 2.0025961231737526, + "language_loss": 0.75524926, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77726126, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3362, + "time_per_iteration": 2.555492639541626 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.02926481, + "balance_loss_mlp": 1.05209327, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6135185744423872, + "language_loss": 0.76400363, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78592181, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9375, + "step": 3363, + "time_per_iteration": 2.486969470977783 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_clip": 1.03686023, + "balance_loss_mlp": 1.04736471, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 2.0495916908721434, + "language_loss": 0.74606001, + "learning_rate": 3.696733380367391e-06, + "loss": 0.76800275, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9375, + "step": 3364, + "time_per_iteration": 2.58673095703125 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.03390145, + "balance_loss_mlp": 1.04865253, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.1992700083841084, + "language_loss": 0.71451771, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73647857, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3365, + "time_per_iteration": 2.522470712661743 + }, + { + "auxiliary_loss_clip": 0.01147339, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.03516757, + "balance_loss_mlp": 1.05331004, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.9561618637344158, + "language_loss": 0.85770535, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87970054, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94140625, + "step": 3366, + "time_per_iteration": 2.536529541015625 + }, + { + "auxiliary_loss_clip": 0.01143453, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.0499506, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.628387041142295, + "language_loss": 0.69651556, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7183941, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3367, + "time_per_iteration": 2.5608372688293457 + }, + { + "auxiliary_loss_clip": 0.01145892, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.0235498, + "balance_loss_mlp": 1.04696274, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.963599898430263, + "language_loss": 0.68230569, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70419657, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3368, + "time_per_iteration": 2.66802978515625 + }, + { + "auxiliary_loss_clip": 0.01143607, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.03178596, + "balance_loss_mlp": 1.0505259, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 7.849671101524798, + "language_loss": 0.77025628, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79218459, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3369, + "time_per_iteration": 2.5143446922302246 + }, + { + "auxiliary_loss_clip": 0.01145287, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.04029489, + "balance_loss_mlp": 1.04800487, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 4.298107611861754, + "language_loss": 0.65408337, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67610943, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3370, + "time_per_iteration": 2.503589630126953 + }, + { + "auxiliary_loss_clip": 0.01048919, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.02313519, + "balance_loss_mlp": 1.01856685, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6799262329378595, + "language_loss": 0.58101869, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60175562, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.3046875, + "step": 3371, + "time_per_iteration": 3.1626369953155518 + }, + { + "auxiliary_loss_clip": 0.01143688, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.0254668, + "balance_loss_mlp": 1.04866266, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.766606164011739, + "language_loss": 0.92068136, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94254309, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3372, + "time_per_iteration": 2.578045129776001 + }, + { + "auxiliary_loss_clip": 0.0114549, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.05037856, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.6491924635250923, + "language_loss": 0.78632712, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80822217, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 3373, + "time_per_iteration": 2.5762507915496826 + }, + { + "auxiliary_loss_clip": 0.01137806, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.04629672, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.8751465027713456, + "language_loss": 0.71102971, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73280156, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3374, + "time_per_iteration": 2.6212260723114014 + }, + { + "auxiliary_loss_clip": 0.01048807, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00001132, + "balance_loss_mlp": 1.01811993, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9912238676598704, + "language_loss": 0.62450445, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64501071, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.30859375, + "step": 3375, + "time_per_iteration": 3.0768048763275146 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.02722621, + "balance_loss_mlp": 1.04769731, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.6669967725054042, + "language_loss": 0.82450807, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84635985, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3376, + "time_per_iteration": 2.5632758140563965 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.02110839, + "balance_loss_mlp": 1.04692364, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.2640770034372006, + "language_loss": 0.81587797, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83771032, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3377, + "time_per_iteration": 2.6376402378082275 + }, + { + "auxiliary_loss_clip": 0.01139097, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.02786779, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 4.046949512949318, + "language_loss": 0.769104, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79095268, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3378, + "time_per_iteration": 2.532942056655884 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.02493691, + "balance_loss_mlp": 1.04772687, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9870266088444717, + "language_loss": 0.79710048, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81896979, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3379, + "time_per_iteration": 2.5187509059906006 + }, + { + "auxiliary_loss_clip": 0.01137083, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.02162337, + "balance_loss_mlp": 1.04698288, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7003196517483214, + "language_loss": 0.86949915, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.89125347, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3380, + "time_per_iteration": 2.5350420475006104 + }, + { + "auxiliary_loss_clip": 0.01143485, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.02905154, + "balance_loss_mlp": 1.05103135, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 1.9133898096862498, + "language_loss": 0.74515057, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76705158, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3381, + "time_per_iteration": 2.5428466796875 + }, + { + "auxiliary_loss_clip": 0.01143807, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.04754519, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.741042372938858, + "language_loss": 0.79304886, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81492472, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3382, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.01146625, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.02123427, + "balance_loss_mlp": 1.04849267, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.8514394244027284, + "language_loss": 0.80188596, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82376015, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3383, + "time_per_iteration": 2.5047500133514404 + }, + { + "auxiliary_loss_clip": 0.0113964, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02401257, + "balance_loss_mlp": 1.04616201, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 6.482166974991387, + "language_loss": 0.74195492, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76377177, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3384, + "time_per_iteration": 2.4931931495666504 + }, + { + "auxiliary_loss_clip": 0.01147866, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.04929996, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.292912234818254, + "language_loss": 0.76429737, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78621089, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3385, + "time_per_iteration": 3.9999845027923584 + }, + { + "auxiliary_loss_clip": 0.01139546, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.04031098, + "balance_loss_mlp": 1.04538202, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8347755395186154, + "language_loss": 0.68259251, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70457751, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3386, + "time_per_iteration": 2.525538682937622 + }, + { + "auxiliary_loss_clip": 0.01143921, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.0348835, + "balance_loss_mlp": 1.04785144, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.949323793812955, + "language_loss": 0.81000078, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83198166, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9609375, + "step": 3387, + "time_per_iteration": 4.122355222702026 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01040694, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.04754424, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.869822824167972, + "language_loss": 0.79960001, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82138139, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 3388, + "time_per_iteration": 2.498455047607422 + }, + { + "auxiliary_loss_clip": 0.01142619, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02120411, + "balance_loss_mlp": 1.04757476, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.6489636222716584, + "language_loss": 0.71810246, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.73992884, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.94921875, + "step": 3389, + "time_per_iteration": 2.4751241207122803 + }, + { + "auxiliary_loss_clip": 0.01140457, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.03108239, + "balance_loss_mlp": 1.04812241, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.7476252287205662, + "language_loss": 0.87431413, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89620328, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3390, + "time_per_iteration": 2.5229172706604004 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02673888, + "balance_loss_mlp": 1.04638386, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 3.0399462437196743, + "language_loss": 0.71092427, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73275584, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.92578125, + "step": 3391, + "time_per_iteration": 2.528003454208374 + }, + { + "auxiliary_loss_clip": 0.01137362, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02592552, + "balance_loss_mlp": 1.04483938, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.517550673127581, + "language_loss": 0.85993969, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88174999, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3392, + "time_per_iteration": 2.5080008506774902 + }, + { + "auxiliary_loss_clip": 0.01143294, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.03231716, + "balance_loss_mlp": 1.04759896, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.5067582134175779, + "language_loss": 0.80730146, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82923234, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.95703125, + "step": 3393, + "time_per_iteration": 2.5464906692504883 + }, + { + "auxiliary_loss_clip": 0.01139211, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02236915, + "balance_loss_mlp": 1.0471251, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.882536464234473, + "language_loss": 0.86276352, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88454658, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3394, + "time_per_iteration": 2.495544195175171 + }, + { + "auxiliary_loss_clip": 0.01139364, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02640033, + "balance_loss_mlp": 1.04756498, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.9880936155816324, + "language_loss": 0.83455038, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85637033, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3395, + "time_per_iteration": 2.4636099338531494 + }, + { + "auxiliary_loss_clip": 0.01144564, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.01753616, + "balance_loss_mlp": 1.04799199, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.0105247570422877, + "language_loss": 0.83632553, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85812247, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3396, + "time_per_iteration": 2.507140636444092 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.02470088, + "balance_loss_mlp": 1.04775488, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9261630392212734, + "language_loss": 0.77139032, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79321325, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91796875, + "step": 3397, + "time_per_iteration": 2.5000061988830566 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.027771, + "balance_loss_mlp": 1.0482713, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.6022565941655285, + "language_loss": 0.87048233, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89232147, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3398, + "time_per_iteration": 2.4879262447357178 + }, + { + "auxiliary_loss_clip": 0.01146457, + "auxiliary_loss_mlp": 0.01045529, + "balance_loss_clip": 1.02855396, + "balance_loss_mlp": 1.05200124, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.0446998950436273, + "language_loss": 0.77973163, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8016516, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3399, + "time_per_iteration": 2.4417104721069336 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.02180338, + "balance_loss_mlp": 1.0471437, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 1.9372936252349278, + "language_loss": 0.76201475, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78383702, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.953125, + "step": 3400, + "time_per_iteration": 2.513378858566284 + }, + { + "auxiliary_loss_clip": 0.01140509, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.02709138, + "balance_loss_mlp": 1.04937315, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6590163779918286, + "language_loss": 0.79357922, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81542361, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 3401, + "time_per_iteration": 2.5628185272216797 + }, + { + "auxiliary_loss_clip": 0.01141107, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.04659653, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.682072453203677, + "language_loss": 0.69205511, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71388066, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3402, + "time_per_iteration": 2.653932571411133 + }, + { + "auxiliary_loss_clip": 0.01144935, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.05008948, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.6906490082479086, + "language_loss": 0.81077826, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83266115, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3403, + "time_per_iteration": 2.518402099609375 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.02873933, + "balance_loss_mlp": 1.05067933, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.7308307985558895, + "language_loss": 0.83497006, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85688084, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3404, + "time_per_iteration": 2.5041427612304688 + }, + { + "auxiliary_loss_clip": 0.0114107, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.03018808, + "balance_loss_mlp": 1.04686713, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.717424757849508, + "language_loss": 0.86319768, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88507974, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3405, + "time_per_iteration": 2.5019404888153076 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02558827, + "balance_loss_mlp": 1.04664326, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.0734152439752327, + "language_loss": 0.84731919, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86912251, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3406, + "time_per_iteration": 2.508274793624878 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02601814, + "balance_loss_mlp": 1.04885817, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0579137112366332, + "language_loss": 0.68086451, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70268458, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3407, + "time_per_iteration": 2.4675915241241455 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.03039861, + "balance_loss_mlp": 1.0469842, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.4520435823789857, + "language_loss": 0.84025276, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86210054, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3408, + "time_per_iteration": 2.4996185302734375 + }, + { + "auxiliary_loss_clip": 0.01144748, + "auxiliary_loss_mlp": 0.01045034, + "balance_loss_clip": 1.02851176, + "balance_loss_mlp": 1.05156052, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.726731275915995, + "language_loss": 0.64288676, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66478455, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3409, + "time_per_iteration": 2.469758987426758 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.0295676, + "balance_loss_mlp": 1.04638147, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.319045584705984, + "language_loss": 0.80357087, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82542145, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3410, + "time_per_iteration": 2.5167293548583984 + }, + { + "auxiliary_loss_clip": 0.01140553, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05014896, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.259997857874164, + "language_loss": 0.75796056, + "learning_rate": 3.686971778678803e-06, + "loss": 0.7798292, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3411, + "time_per_iteration": 2.5411264896392822 + }, + { + "auxiliary_loss_clip": 0.01144909, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.02817273, + "balance_loss_mlp": 1.05220985, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.0004173274373183, + "language_loss": 0.73696554, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75885755, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3412, + "time_per_iteration": 2.5047144889831543 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.03015614, + "balance_loss_mlp": 1.04735541, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.0925027501904228, + "language_loss": 0.77863461, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.96484375, + "step": 3413, + "time_per_iteration": 2.5472991466522217 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.04989886, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.912987525537943, + "language_loss": 0.84719825, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86901337, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3414, + "time_per_iteration": 2.478729724884033 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.02496636, + "balance_loss_mlp": 1.04659235, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.9076108002018353, + "language_loss": 0.80448711, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82628626, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3415, + "time_per_iteration": 2.5366415977478027 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02154934, + "balance_loss_mlp": 1.04796863, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.7629792917286327, + "language_loss": 0.72893143, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75068092, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3416, + "time_per_iteration": 2.5656492710113525 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02520072, + "balance_loss_mlp": 1.04695165, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.5523210605949425, + "language_loss": 0.78623438, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80805844, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3417, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01140114, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.02948236, + "balance_loss_mlp": 1.04842472, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.178207343470702, + "language_loss": 0.87390542, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89577365, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.91796875, + "step": 3418, + "time_per_iteration": 2.4900615215301514 + }, + { + "auxiliary_loss_clip": 0.01139839, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.0251534, + "balance_loss_mlp": 1.04798996, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.115759049165993, + "language_loss": 0.62156075, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64337492, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3419, + "time_per_iteration": 2.527057647705078 + }, + { + "auxiliary_loss_clip": 0.01143982, + "auxiliary_loss_mlp": 0.0104893, + "balance_loss_clip": 1.02977359, + "balance_loss_mlp": 1.04905963, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.2865688080492466, + "language_loss": 0.86502206, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88695121, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3420, + "time_per_iteration": 2.532512664794922 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02533531, + "balance_loss_mlp": 1.04659796, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.535685660701584, + "language_loss": 0.70904821, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73084807, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91796875, + "step": 3421, + "time_per_iteration": 2.5924150943756104 + }, + { + "auxiliary_loss_clip": 0.0113664, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.02099967, + "balance_loss_mlp": 1.04581738, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 3.5707952740494235, + "language_loss": 0.70370102, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72545266, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3422, + "time_per_iteration": 2.499041795730591 + }, + { + "auxiliary_loss_clip": 0.01060302, + "auxiliary_loss_mlp": 0.01012319, + "balance_loss_clip": 1.01001859, + "balance_loss_mlp": 1.02983248, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7605512778953217, + "language_loss": 0.55499864, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57572484, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3046875, + "step": 3423, + "time_per_iteration": 3.1569108963012695 + }, + { + "auxiliary_loss_clip": 0.0114215, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.04882169, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7754304652232902, + "language_loss": 0.71701574, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73886526, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9296875, + "step": 3424, + "time_per_iteration": 2.58278751373291 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.02920699, + "balance_loss_mlp": 1.05022514, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.563470220797352, + "language_loss": 0.75031066, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77218151, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3425, + "time_per_iteration": 2.518050193786621 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01057037, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.0545603, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.8081006382856646, + "language_loss": 0.88246548, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90449566, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3426, + "time_per_iteration": 2.5141823291778564 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.02927566, + "balance_loss_mlp": 1.04961991, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.8273097367093476, + "language_loss": 0.76748925, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78934193, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3427, + "time_per_iteration": 4.068110227584839 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.03143609, + "balance_loss_mlp": 1.04978716, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.6956079848027177, + "language_loss": 0.73914266, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76106334, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3428, + "time_per_iteration": 2.5296199321746826 + }, + { + "auxiliary_loss_clip": 0.0113987, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.03188777, + "balance_loss_mlp": 1.04691577, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 3.779292361126499, + "language_loss": 0.73553443, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75743121, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3429, + "time_per_iteration": 3.979640483856201 + }, + { + "auxiliary_loss_clip": 0.01146724, + "auxiliary_loss_mlp": 0.01041423, + "balance_loss_clip": 1.0242331, + "balance_loss_mlp": 1.05180049, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.8474903397728304, + "language_loss": 0.85301876, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87490022, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3430, + "time_per_iteration": 2.532275438308716 + }, + { + "auxiliary_loss_clip": 0.0114587, + "auxiliary_loss_mlp": 0.01052093, + "balance_loss_clip": 1.03411579, + "balance_loss_mlp": 1.05116892, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.4715876867440674, + "language_loss": 0.69369543, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.715675, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3431, + "time_per_iteration": 2.4857282638549805 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.01015472, + "balance_loss_clip": 1.01329005, + "balance_loss_mlp": 1.02078724, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8322663536180677, + "language_loss": 0.60249984, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62317169, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.30859375, + "step": 3432, + "time_per_iteration": 3.250966787338257 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.03021789, + "balance_loss_mlp": 1.05125713, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7869258470827205, + "language_loss": 0.72495091, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74685854, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3433, + "time_per_iteration": 2.528576135635376 + }, + { + "auxiliary_loss_clip": 0.01143793, + "auxiliary_loss_mlp": 0.01050396, + "balance_loss_clip": 1.03295541, + "balance_loss_mlp": 1.04886997, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.715054190412472, + "language_loss": 0.8721565, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8940984, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3434, + "time_per_iteration": 2.507589101791382 + }, + { + "auxiliary_loss_clip": 0.01144514, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_clip": 1.0269376, + "balance_loss_mlp": 1.04833162, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.6274854163318595, + "language_loss": 0.69133317, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71321636, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3435, + "time_per_iteration": 2.587930679321289 + }, + { + "auxiliary_loss_clip": 0.01140929, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.0241158, + "balance_loss_mlp": 1.04983366, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7028603597643168, + "language_loss": 0.8922776, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91410363, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3436, + "time_per_iteration": 2.57295298576355 + }, + { + "auxiliary_loss_clip": 0.01144451, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.05126333, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.8990861512322268, + "language_loss": 0.76659, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78839004, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3437, + "time_per_iteration": 2.5819849967956543 + }, + { + "auxiliary_loss_clip": 0.01142266, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.04877901, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.7925672188665596, + "language_loss": 0.77611911, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79794395, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3438, + "time_per_iteration": 2.5091731548309326 + }, + { + "auxiliary_loss_clip": 0.01047915, + "auxiliary_loss_mlp": 0.01005377, + "balance_loss_clip": 1.00348175, + "balance_loss_mlp": 1.01723933, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8367234589951487, + "language_loss": 0.67141807, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69195092, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30664062, + "step": 3439, + "time_per_iteration": 3.0797181129455566 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.04791629, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.0580501207842428, + "language_loss": 0.83931267, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86111259, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94921875, + "step": 3440, + "time_per_iteration": 2.5015172958374023 + }, + { + "auxiliary_loss_clip": 0.01143016, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_clip": 1.02584338, + "balance_loss_mlp": 1.05009377, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.9416657792651912, + "language_loss": 0.84825736, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87010437, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3441, + "time_per_iteration": 2.4866137504577637 + }, + { + "auxiliary_loss_clip": 0.01140001, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.02778697, + "balance_loss_mlp": 1.0502038, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.6577892844013908, + "language_loss": 0.85889506, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88074249, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 3442, + "time_per_iteration": 2.5914649963378906 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.02305317, + "balance_loss_mlp": 1.05208063, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.9070439101703558, + "language_loss": 0.72829354, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75015128, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3443, + "time_per_iteration": 2.5210063457489014 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.02879703, + "balance_loss_mlp": 1.0496819, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.5056876708900186, + "language_loss": 0.85428166, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87612224, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.90625, + "step": 3444, + "time_per_iteration": 2.528493881225586 + }, + { + "auxiliary_loss_clip": 0.01047325, + "auxiliary_loss_mlp": 0.0100746, + "balance_loss_clip": 1.00537384, + "balance_loss_mlp": 1.01688242, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6978715278146553, + "language_loss": 0.57091653, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.5914644, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.3046875, + "step": 3445, + "time_per_iteration": 3.086552619934082 + }, + { + "auxiliary_loss_clip": 0.01140085, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03356516, + "balance_loss_mlp": 1.04968095, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5621496076246746, + "language_loss": 0.78459281, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80650306, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 3446, + "time_per_iteration": 2.4844422340393066 + }, + { + "auxiliary_loss_clip": 0.01148285, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02846456, + "balance_loss_mlp": 1.05057228, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.157476270385918, + "language_loss": 0.62436825, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64633256, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3447, + "time_per_iteration": 2.592799663543701 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.03071666, + "balance_loss_mlp": 1.04810297, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.740614876967074, + "language_loss": 0.86066437, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88256097, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3448, + "time_per_iteration": 2.5054237842559814 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.02576649, + "balance_loss_mlp": 1.04814398, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.1117492515519665, + "language_loss": 0.75452864, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77637869, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.92578125, + "step": 3449, + "time_per_iteration": 2.506657838821411 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.03194678, + "balance_loss_mlp": 1.04896426, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.7877143934577313, + "language_loss": 0.76703656, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78899819, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3450, + "time_per_iteration": 2.479090929031372 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.0302192, + "balance_loss_mlp": 1.04780531, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5227053471466307, + "language_loss": 0.822101, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84401715, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3451, + "time_per_iteration": 2.5465826988220215 + }, + { + "auxiliary_loss_clip": 0.01047156, + "auxiliary_loss_mlp": 0.01003865, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.01645589, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7930757504147553, + "language_loss": 0.56569821, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58620846, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3452, + "time_per_iteration": 2.979168653488159 + }, + { + "auxiliary_loss_clip": 0.01144097, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.02765203, + "balance_loss_mlp": 1.0492605, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.970927529953097, + "language_loss": 0.88332593, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90522313, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3453, + "time_per_iteration": 2.5404746532440186 + }, + { + "auxiliary_loss_clip": 0.01145334, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.05121803, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.6193396769615114, + "language_loss": 0.80056196, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82244939, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94140625, + "step": 3454, + "time_per_iteration": 2.536154270172119 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04881716, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.767477329453147, + "language_loss": 0.76424366, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78615135, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3455, + "time_per_iteration": 2.502450466156006 + }, + { + "auxiliary_loss_clip": 0.01141184, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.03247654, + "balance_loss_mlp": 1.04867601, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 2.1876724852466163, + "language_loss": 0.80599815, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82790661, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3456, + "time_per_iteration": 2.495405673980713 + }, + { + "auxiliary_loss_clip": 0.01147485, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_clip": 1.02447069, + "balance_loss_mlp": 1.05180097, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5859267830694757, + "language_loss": 0.77988815, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80179226, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.95703125, + "step": 3457, + "time_per_iteration": 2.5625829696655273 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01054167, + "balance_loss_clip": 1.03461635, + "balance_loss_mlp": 1.05195451, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 2.0073788397072136, + "language_loss": 0.83581042, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85784483, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.96875, + "step": 3458, + "time_per_iteration": 2.470740556716919 + }, + { + "auxiliary_loss_clip": 0.01142717, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02443111, + "balance_loss_mlp": 1.05063045, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.732611194718632, + "language_loss": 0.76041365, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78225368, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3459, + "time_per_iteration": 2.5753207206726074 + }, + { + "auxiliary_loss_clip": 0.01138446, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.02451003, + "balance_loss_mlp": 1.04829502, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.1264218253084386, + "language_loss": 0.77302521, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79482168, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3460, + "time_per_iteration": 2.498760938644409 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01050405, + "balance_loss_clip": 1.03284574, + "balance_loss_mlp": 1.04819179, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.1644839576228296, + "language_loss": 0.75785947, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77979982, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3461, + "time_per_iteration": 2.5850372314453125 + }, + { + "auxiliary_loss_clip": 0.01145604, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02645624, + "balance_loss_mlp": 1.0469749, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8484421465162717, + "language_loss": 0.88227051, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90417254, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3462, + "time_per_iteration": 2.558375358581543 + }, + { + "auxiliary_loss_clip": 0.01043601, + "auxiliary_loss_mlp": 0.0101247, + "balance_loss_clip": 1.01059818, + "balance_loss_mlp": 1.01278758, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7627714646141646, + "language_loss": 0.59057152, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6111322, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.30859375, + "step": 3463, + "time_per_iteration": 3.2280492782592773 + }, + { + "auxiliary_loss_clip": 0.01144566, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.03056765, + "balance_loss_mlp": 1.04713821, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.542529703880477, + "language_loss": 0.65831709, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68025607, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3464, + "time_per_iteration": 2.5706918239593506 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01050812, + "balance_loss_clip": 1.03160763, + "balance_loss_mlp": 1.0492928, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.848617339554035, + "language_loss": 0.83536243, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85734928, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3465, + "time_per_iteration": 2.535473585128784 + }, + { + "auxiliary_loss_clip": 0.01143191, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.02767932, + "balance_loss_mlp": 1.04802513, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 3.628659863163492, + "language_loss": 0.81463158, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.83651215, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3466, + "time_per_iteration": 2.535311222076416 + }, + { + "auxiliary_loss_clip": 0.01146517, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.02863586, + "balance_loss_mlp": 1.05303347, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.967186340276973, + "language_loss": 0.81678396, + "learning_rate": 3.675156514448716e-06, + "loss": 0.83869636, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9375, + "step": 3467, + "time_per_iteration": 2.4783830642700195 + }, + { + "auxiliary_loss_clip": 0.01142574, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02469158, + "balance_loss_mlp": 1.05200005, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.0682841758185235, + "language_loss": 0.8186093, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84045184, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3468, + "time_per_iteration": 2.5275001525878906 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.02677095, + "balance_loss_mlp": 1.05024171, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.9832892060266627, + "language_loss": 0.90227246, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92421412, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9765625, + "step": 3469, + "time_per_iteration": 3.999607563018799 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.03329682, + "balance_loss_mlp": 1.0530771, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.764094275638393, + "language_loss": 0.7643016, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78628922, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3470, + "time_per_iteration": 4.186570405960083 + }, + { + "auxiliary_loss_clip": 0.0114555, + "auxiliary_loss_mlp": 0.01048445, + "balance_loss_clip": 1.03039646, + "balance_loss_mlp": 1.05154145, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.7254586081909284, + "language_loss": 0.7592454, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78118539, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3471, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03488564, + "balance_loss_mlp": 1.04796982, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.907022336492936, + "language_loss": 0.75515926, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77719313, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3472, + "time_per_iteration": 2.555927038192749 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_clip": 1.02790928, + "balance_loss_mlp": 1.05051231, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.9877478939715982, + "language_loss": 0.84168947, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86358976, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3473, + "time_per_iteration": 2.5261759757995605 + }, + { + "auxiliary_loss_clip": 0.01043725, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00027776, + "balance_loss_mlp": 1.01290703, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8792852781400284, + "language_loss": 0.63631999, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65678006, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30859375, + "step": 3474, + "time_per_iteration": 3.025831460952759 + }, + { + "auxiliary_loss_clip": 0.01146356, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.02334285, + "balance_loss_mlp": 1.04993105, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.882119897934913, + "language_loss": 0.69867098, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72054696, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3475, + "time_per_iteration": 2.751676559448242 + }, + { + "auxiliary_loss_clip": 0.01146508, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02961278, + "balance_loss_mlp": 1.05162299, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4951270147360183, + "language_loss": 0.70032048, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72226411, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3476, + "time_per_iteration": 2.5493083000183105 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.02850533, + "balance_loss_mlp": 1.05099094, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0857679152031716, + "language_loss": 0.89590299, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91780925, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3477, + "time_per_iteration": 2.506962537765503 + }, + { + "auxiliary_loss_clip": 0.01142894, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.03005815, + "balance_loss_mlp": 1.04896593, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 4.245750786990739, + "language_loss": 0.67988396, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70179135, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9375, + "step": 3478, + "time_per_iteration": 2.57366681098938 + }, + { + "auxiliary_loss_clip": 0.01143008, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.02334023, + "balance_loss_mlp": 1.04826832, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.137628491911851, + "language_loss": 0.85035646, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87220371, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94921875, + "step": 3479, + "time_per_iteration": 2.4716267585754395 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.03156328, + "balance_loss_mlp": 1.04972577, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.2225866030569175, + "language_loss": 0.73807257, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76003599, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3480, + "time_per_iteration": 2.4856386184692383 + }, + { + "auxiliary_loss_clip": 0.01141126, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_clip": 1.03113592, + "balance_loss_mlp": 1.04844785, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.023418551380918, + "language_loss": 0.75601453, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77789831, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3481, + "time_per_iteration": 2.4812443256378174 + }, + { + "auxiliary_loss_clip": 0.01145872, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.03051996, + "balance_loss_mlp": 1.05047393, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 3.5251666716598273, + "language_loss": 0.85337639, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87531281, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3482, + "time_per_iteration": 2.521284580230713 + }, + { + "auxiliary_loss_clip": 0.01145664, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_clip": 1.02940559, + "balance_loss_mlp": 1.05097377, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.8936854891166743, + "language_loss": 0.70626152, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3483, + "time_per_iteration": 2.5876524448394775 + }, + { + "auxiliary_loss_clip": 0.01146142, + "auxiliary_loss_mlp": 0.01060474, + "balance_loss_clip": 1.04193723, + "balance_loss_mlp": 1.04891169, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8606830424584557, + "language_loss": 0.74988431, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77195048, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3484, + "time_per_iteration": 2.49701189994812 + }, + { + "auxiliary_loss_clip": 0.01143763, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.02991378, + "balance_loss_mlp": 1.05028141, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8378150509428508, + "language_loss": 0.70690203, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7288202, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3485, + "time_per_iteration": 2.5692059993743896 + }, + { + "auxiliary_loss_clip": 0.01146857, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.02712297, + "balance_loss_mlp": 1.05028093, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.9069158447471781, + "language_loss": 0.82965356, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85157764, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3486, + "time_per_iteration": 2.569308042526245 + }, + { + "auxiliary_loss_clip": 0.0114472, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.03309095, + "balance_loss_mlp": 1.04790449, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 3.843984040964354, + "language_loss": 0.8699702, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89192313, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3487, + "time_per_iteration": 2.608441114425659 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.0287739, + "balance_loss_mlp": 1.04695904, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.4377115915778713, + "language_loss": 0.72369969, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74558127, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94140625, + "step": 3488, + "time_per_iteration": 2.529233694076538 + }, + { + "auxiliary_loss_clip": 0.01144055, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.02812946, + "balance_loss_mlp": 1.04897618, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.6657941113460764, + "language_loss": 0.80726898, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82916641, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3489, + "time_per_iteration": 2.4847962856292725 + }, + { + "auxiliary_loss_clip": 0.01142088, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.0253495, + "balance_loss_mlp": 1.04718399, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7888402521564877, + "language_loss": 0.72827011, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75011659, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3490, + "time_per_iteration": 2.543064594268799 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.03437209, + "balance_loss_mlp": 1.04955435, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 5.073894522138561, + "language_loss": 0.70159817, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72350967, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3491, + "time_per_iteration": 2.4785172939300537 + }, + { + "auxiliary_loss_clip": 0.01142629, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.04678369, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 3.7459720995568557, + "language_loss": 0.7931999, + "learning_rate": 3.669817442854444e-06, + "loss": 0.8150776, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3492, + "time_per_iteration": 2.5213027000427246 + }, + { + "auxiliary_loss_clip": 0.01144565, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.04977345, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9629392465329358, + "language_loss": 0.86883962, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89069605, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3493, + "time_per_iteration": 2.499797821044922 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.04791212, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.8525794886403055, + "language_loss": 0.68810928, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70991009, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3494, + "time_per_iteration": 2.5374889373779297 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.02054656, + "balance_loss_mlp": 1.05010796, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7465496854212388, + "language_loss": 0.78900456, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81085044, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96484375, + "step": 3495, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02505755, + "balance_loss_mlp": 1.04696178, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7459726457298623, + "language_loss": 0.77192879, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79377842, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3496, + "time_per_iteration": 2.552386522293091 + }, + { + "auxiliary_loss_clip": 0.01145605, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.03297126, + "balance_loss_mlp": 1.04933989, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.0396086665216777, + "language_loss": 0.82009852, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84206975, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3497, + "time_per_iteration": 2.498359441757202 + }, + { + "auxiliary_loss_clip": 0.01146873, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_clip": 1.03091133, + "balance_loss_mlp": 1.04979134, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 2.5223195218779577, + "language_loss": 0.67314029, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69509119, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96875, + "step": 3498, + "time_per_iteration": 2.540766716003418 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.02679563, + "balance_loss_mlp": 1.04782224, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 2.2477271783909414, + "language_loss": 0.80623376, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82813752, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 3499, + "time_per_iteration": 2.5283098220825195 + }, + { + "auxiliary_loss_clip": 0.0114621, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.02612233, + "balance_loss_mlp": 1.05201602, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.776862664007905, + "language_loss": 0.78366566, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80555797, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3500, + "time_per_iteration": 2.5419158935546875 + }, + { + "auxiliary_loss_clip": 0.01142389, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02524185, + "balance_loss_mlp": 1.0480907, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.628727093990466, + "language_loss": 0.73989725, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76174867, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3501, + "time_per_iteration": 2.535419464111328 + }, + { + "auxiliary_loss_clip": 0.01140428, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02429342, + "balance_loss_mlp": 1.04671168, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6206913905571714, + "language_loss": 0.75292969, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77475226, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3502, + "time_per_iteration": 2.508277654647827 + }, + { + "auxiliary_loss_clip": 0.01141546, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.02796102, + "balance_loss_mlp": 1.0475595, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.9145063235338367, + "language_loss": 0.77090263, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7927739, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.94140625, + "step": 3503, + "time_per_iteration": 2.5607948303222656 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.02813029, + "balance_loss_mlp": 1.048738, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.3817148130730144, + "language_loss": 0.77991742, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80189341, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.9921875, + "step": 3504, + "time_per_iteration": 2.495028018951416 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.02952361, + "balance_loss_mlp": 1.0473187, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5529728217373517, + "language_loss": 0.77045631, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79238534, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 3505, + "time_per_iteration": 2.5408663749694824 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.02931666, + "balance_loss_mlp": 1.04786968, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9911708078552777, + "language_loss": 0.63704473, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65889871, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91015625, + "step": 3506, + "time_per_iteration": 2.564246892929077 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.02904439, + "balance_loss_mlp": 1.04773796, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.8633964271687153, + "language_loss": 0.81863034, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84050006, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3507, + "time_per_iteration": 2.6049435138702393 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.0299232, + "balance_loss_mlp": 1.04645514, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 2.0263301336255135, + "language_loss": 0.75496012, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77683949, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.93359375, + "step": 3508, + "time_per_iteration": 2.5366437435150146 + }, + { + "auxiliary_loss_clip": 0.01144539, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02395463, + "balance_loss_mlp": 1.04809749, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.1922875924351115, + "language_loss": 0.85395098, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87581778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3509, + "time_per_iteration": 2.4895167350769043 + }, + { + "auxiliary_loss_clip": 0.01146568, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.02503562, + "balance_loss_mlp": 1.04908204, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.5522473876542349, + "language_loss": 0.67803288, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69993746, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3510, + "time_per_iteration": 4.065294027328491 + }, + { + "auxiliary_loss_clip": 0.01143018, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02381003, + "balance_loss_mlp": 1.04653811, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.9784941086490475, + "language_loss": 0.7240749, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74591982, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96484375, + "step": 3511, + "time_per_iteration": 2.5701003074645996 + }, + { + "auxiliary_loss_clip": 0.01148402, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.05022192, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.3544542512902322, + "language_loss": 0.69737375, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71925306, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3512, + "time_per_iteration": 3.9019229412078857 + }, + { + "auxiliary_loss_clip": 0.01143526, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.04680824, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.6756724017558497, + "language_loss": 0.73159289, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.7535044, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.96875, + "step": 3513, + "time_per_iteration": 2.5643980503082275 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.02212906, + "balance_loss_mlp": 1.04916954, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.635076517146385, + "language_loss": 0.74235332, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76414299, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3514, + "time_per_iteration": 2.5240070819854736 + }, + { + "auxiliary_loss_clip": 0.01144119, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.02347541, + "balance_loss_mlp": 1.0482856, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7928371848293583, + "language_loss": 0.76707381, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78892195, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3515, + "time_per_iteration": 2.526527166366577 + }, + { + "auxiliary_loss_clip": 0.0114362, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.02517664, + "balance_loss_mlp": 1.04956555, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.8516547188762509, + "language_loss": 0.68242604, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70428967, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3516, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.01145197, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.02883935, + "balance_loss_mlp": 1.04901481, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.653683865815189, + "language_loss": 0.85012519, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87204921, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3517, + "time_per_iteration": 2.5080301761627197 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01965201, + "balance_loss_mlp": 1.04722667, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.26725319642869, + "language_loss": 0.62925792, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65104288, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3518, + "time_per_iteration": 2.5949900150299072 + }, + { + "auxiliary_loss_clip": 0.01142565, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04891765, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8284325952385483, + "language_loss": 0.88772321, + "learning_rate": 3.664006799041303e-06, + "loss": 0.90964293, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3519, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.03184235, + "balance_loss_mlp": 1.04866135, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.5988506078375424, + "language_loss": 0.81066215, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83259952, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3520, + "time_per_iteration": 2.5069239139556885 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02926779, + "balance_loss_mlp": 1.0469681, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.592359744312873, + "language_loss": 0.76163614, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78347969, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3521, + "time_per_iteration": 2.560037851333618 + }, + { + "auxiliary_loss_clip": 0.0113934, + "auxiliary_loss_mlp": 0.0104393, + "balance_loss_clip": 1.02842069, + "balance_loss_mlp": 1.04592443, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0717596449561024, + "language_loss": 0.75950933, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78134197, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.93359375, + "step": 3522, + "time_per_iteration": 2.4758715629577637 + }, + { + "auxiliary_loss_clip": 0.01141462, + "auxiliary_loss_mlp": 0.01049727, + "balance_loss_clip": 1.03176177, + "balance_loss_mlp": 1.04737353, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.026497436525855, + "language_loss": 0.70436251, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72627443, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3523, + "time_per_iteration": 2.5368640422821045 + }, + { + "auxiliary_loss_clip": 0.01140964, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.02786803, + "balance_loss_mlp": 1.04820895, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.216886450348082, + "language_loss": 0.76683456, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.7886939, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3524, + "time_per_iteration": 2.5932695865631104 + }, + { + "auxiliary_loss_clip": 0.01139634, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.04276347, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.020092904399728, + "language_loss": 0.81433582, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83615232, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3525, + "time_per_iteration": 2.5425641536712646 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04668331, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 2.1031950889850655, + "language_loss": 0.75104785, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77285308, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3526, + "time_per_iteration": 2.533210515975952 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.02466083, + "balance_loss_mlp": 1.04663801, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.9135764326712537, + "language_loss": 0.77385598, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79569542, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3527, + "time_per_iteration": 2.53898286819458 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.02797842, + "balance_loss_mlp": 1.0461328, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.981008674330079, + "language_loss": 0.78037727, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80223083, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3528, + "time_per_iteration": 2.5360231399536133 + }, + { + "auxiliary_loss_clip": 0.01138776, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.04611731, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7275367809487383, + "language_loss": 0.8170321, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83889693, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3529, + "time_per_iteration": 2.531228542327881 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.028234, + "balance_loss_mlp": 1.04647708, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.1603106904513547, + "language_loss": 0.76616383, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78802443, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3530, + "time_per_iteration": 2.5361740589141846 + }, + { + "auxiliary_loss_clip": 0.01136983, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.02593338, + "balance_loss_mlp": 1.0451746, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.3391242970409873, + "language_loss": 0.82978404, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85157299, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3531, + "time_per_iteration": 2.571411609649658 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04744506, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.416019676502894, + "language_loss": 0.73473567, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75654608, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.94140625, + "step": 3532, + "time_per_iteration": 2.473006248474121 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02485681, + "balance_loss_mlp": 1.04561734, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.7353898898315339, + "language_loss": 0.73855233, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76036394, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.92578125, + "step": 3533, + "time_per_iteration": 2.526780366897583 + }, + { + "auxiliary_loss_clip": 0.01140469, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.02833724, + "balance_loss_mlp": 1.04576015, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.8944995629732337, + "language_loss": 0.7098999, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73175949, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3534, + "time_per_iteration": 2.6947309970855713 + }, + { + "auxiliary_loss_clip": 0.01141409, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.02032161, + "balance_loss_mlp": 1.04669714, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9387778569542722, + "language_loss": 0.71567297, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73746949, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3535, + "time_per_iteration": 2.6022329330444336 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.0299238, + "balance_loss_mlp": 1.04549336, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 1.8756666540330442, + "language_loss": 0.7040931, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72592747, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 3536, + "time_per_iteration": 2.6005256175994873 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.02706444, + "balance_loss_mlp": 1.04512393, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9573194210103453, + "language_loss": 0.88217437, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90402472, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3537, + "time_per_iteration": 2.5565810203552246 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02425885, + "balance_loss_mlp": 1.0437026, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.957058885696691, + "language_loss": 0.80129743, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82304639, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3538, + "time_per_iteration": 2.5501785278320312 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.025653, + "balance_loss_mlp": 1.0446775, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.587715235485788, + "language_loss": 0.87131894, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89308405, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.90625, + "step": 3539, + "time_per_iteration": 2.5751259326934814 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02914476, + "balance_loss_mlp": 1.04718518, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.3040839486156184, + "language_loss": 0.57464051, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59648788, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3540, + "time_per_iteration": 2.4746458530426025 + }, + { + "auxiliary_loss_clip": 0.01140156, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.02330637, + "balance_loss_mlp": 1.04658604, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 5.8376417218282874, + "language_loss": 0.76062799, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78243208, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3541, + "time_per_iteration": 2.5111818313598633 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02517211, + "balance_loss_mlp": 1.04530454, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.9190227230034667, + "language_loss": 0.69458514, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71635908, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3542, + "time_per_iteration": 2.556300401687622 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01945567, + "balance_loss_mlp": 1.04443789, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.8172219669397587, + "language_loss": 0.75591409, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77760351, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 3543, + "time_per_iteration": 2.54424786567688 + }, + { + "auxiliary_loss_clip": 0.01138428, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04843175, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 2.1531603349332915, + "language_loss": 0.66787028, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68964195, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3544, + "time_per_iteration": 2.516359329223633 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.0245831, + "balance_loss_mlp": 1.04379654, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.9827170900636153, + "language_loss": 0.71089172, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73265821, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.92578125, + "step": 3545, + "time_per_iteration": 2.5377357006073 + }, + { + "auxiliary_loss_clip": 0.01138848, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_clip": 1.03095567, + "balance_loss_mlp": 1.04571509, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.730364240275379, + "language_loss": 0.72334421, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74519908, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9296875, + "step": 3546, + "time_per_iteration": 2.5640652179718018 + }, + { + "auxiliary_loss_clip": 0.0113929, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02695596, + "balance_loss_mlp": 1.0467453, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.889324350950523, + "language_loss": 0.80698627, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82881093, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3547, + "time_per_iteration": 2.527398109436035 + }, + { + "auxiliary_loss_clip": 0.01140759, + "auxiliary_loss_mlp": 0.0104395, + "balance_loss_clip": 1.02702212, + "balance_loss_mlp": 1.04538703, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 3.232228952830713, + "language_loss": 0.74496448, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76681155, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3548, + "time_per_iteration": 2.5493834018707275 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.02719641, + "balance_loss_mlp": 1.04663396, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.0441969792992265, + "language_loss": 0.74135804, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76323086, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3549, + "time_per_iteration": 2.514817476272583 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.03063631, + "balance_loss_mlp": 1.04963064, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6981522694050752, + "language_loss": 0.80653727, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82842982, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3550, + "time_per_iteration": 2.541501045227051 + }, + { + "auxiliary_loss_clip": 0.01136887, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.02883255, + "balance_loss_mlp": 1.04706621, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.615115943492657, + "language_loss": 0.88341218, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90522182, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8984375, + "step": 3551, + "time_per_iteration": 2.5310463905334473 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.02887464, + "balance_loss_mlp": 1.04430258, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.1215125327645152, + "language_loss": 0.83415043, + "learning_rate": 3.656842449140983e-06, + "loss": 0.8559624, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3552, + "time_per_iteration": 3.974120616912842 + }, + { + "auxiliary_loss_clip": 0.0113546, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_clip": 1.03164101, + "balance_loss_mlp": 1.04522753, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7556537525349103, + "language_loss": 0.76692683, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78876388, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 3553, + "time_per_iteration": 3.964289903640747 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.02520156, + "balance_loss_mlp": 1.04556942, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.6502841430946371, + "language_loss": 0.72946119, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75122207, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 3554, + "time_per_iteration": 2.5141818523406982 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02121508, + "balance_loss_mlp": 1.04672861, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9371755733444218, + "language_loss": 0.6745261, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69627374, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.91015625, + "step": 3555, + "time_per_iteration": 2.6116089820861816 + }, + { + "auxiliary_loss_clip": 0.01138406, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04564714, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.2550763051095752, + "language_loss": 0.64778429, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66956222, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3556, + "time_per_iteration": 2.553746223449707 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02893996, + "balance_loss_mlp": 1.04656768, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.6295299556205536, + "language_loss": 0.72333252, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74518251, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3557, + "time_per_iteration": 2.6562533378601074 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.02189136, + "balance_loss_mlp": 1.04716706, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6722734443717013, + "language_loss": 0.67139357, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6932168, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3558, + "time_per_iteration": 2.5435290336608887 + }, + { + "auxiliary_loss_clip": 0.01142773, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.01943386, + "balance_loss_mlp": 1.04542494, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8839208997443517, + "language_loss": 0.79702216, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81881285, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3559, + "time_per_iteration": 2.5535330772399902 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04436731, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6653874224583467, + "language_loss": 0.67549068, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69730377, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9296875, + "step": 3560, + "time_per_iteration": 2.5241451263427734 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.04846883, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8721878156787213, + "language_loss": 0.72995424, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75179613, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3561, + "time_per_iteration": 2.5678720474243164 + }, + { + "auxiliary_loss_clip": 0.01142897, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02817965, + "balance_loss_mlp": 1.04897678, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.2783713689110243, + "language_loss": 0.77110738, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79298586, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3562, + "time_per_iteration": 2.4598803520202637 + }, + { + "auxiliary_loss_clip": 0.01140561, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.02160454, + "balance_loss_mlp": 1.04795694, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5929440625910447, + "language_loss": 0.84534913, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.867136, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.92578125, + "step": 3563, + "time_per_iteration": 2.5654757022857666 + }, + { + "auxiliary_loss_clip": 0.0114087, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.02227342, + "balance_loss_mlp": 1.04757166, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.6134338415520206, + "language_loss": 0.76727796, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78907001, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.93359375, + "step": 3564, + "time_per_iteration": 2.591064214706421 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.02560401, + "balance_loss_mlp": 1.0467248, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.880454163642384, + "language_loss": 0.88260084, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90440416, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3565, + "time_per_iteration": 2.571242094039917 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01009282, + "balance_loss_clip": 1.00739813, + "balance_loss_mlp": 1.0192101, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8403524328969202, + "language_loss": 0.52300179, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54360026, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3125, + "step": 3566, + "time_per_iteration": 3.055588722229004 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02093637, + "balance_loss_mlp": 1.04677701, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.91490691342046, + "language_loss": 0.67412555, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69585192, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3567, + "time_per_iteration": 2.5511529445648193 + }, + { + "auxiliary_loss_clip": 0.01135888, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.02630615, + "balance_loss_mlp": 1.04691041, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6974661731729381, + "language_loss": 0.74437779, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.7661534, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 3568, + "time_per_iteration": 2.613090753555298 + }, + { + "auxiliary_loss_clip": 0.01137867, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.02710819, + "balance_loss_mlp": 1.04578757, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.7479940521784256, + "language_loss": 0.77864397, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80045569, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3569, + "time_per_iteration": 2.567439317703247 + }, + { + "auxiliary_loss_clip": 0.01147794, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.01981413, + "balance_loss_mlp": 1.05039883, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.3364918832975317, + "language_loss": 0.69533777, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.71719933, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3570, + "time_per_iteration": 2.489550828933716 + }, + { + "auxiliary_loss_clip": 0.01144243, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.02703631, + "balance_loss_mlp": 1.0480299, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.465398793786977, + "language_loss": 0.78108835, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80296826, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3571, + "time_per_iteration": 2.527509927749634 + }, + { + "auxiliary_loss_clip": 0.01143428, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.02556705, + "balance_loss_mlp": 1.0501976, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.5347995603010767, + "language_loss": 0.82851684, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85038722, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3572, + "time_per_iteration": 2.491955280303955 + }, + { + "auxiliary_loss_clip": 0.01144597, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.02248025, + "balance_loss_mlp": 1.04700291, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.35018592277076, + "language_loss": 0.64916813, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67100847, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3573, + "time_per_iteration": 2.5238969326019287 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.02481413, + "balance_loss_mlp": 1.04417133, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.2164535787006705, + "language_loss": 0.75577438, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77751815, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3574, + "time_per_iteration": 2.5497734546661377 + }, + { + "auxiliary_loss_clip": 0.01137499, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.01752853, + "balance_loss_mlp": 1.04568887, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.959683075701339, + "language_loss": 0.72380054, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74552631, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91796875, + "step": 3575, + "time_per_iteration": 2.539255142211914 + }, + { + "auxiliary_loss_clip": 0.01141362, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.02097976, + "balance_loss_mlp": 1.04890776, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.6473570004326006, + "language_loss": 0.68102455, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70280713, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3576, + "time_per_iteration": 2.515245199203491 + }, + { + "auxiliary_loss_clip": 0.01144679, + "auxiliary_loss_mlp": 0.01041063, + "balance_loss_clip": 1.02347922, + "balance_loss_mlp": 1.04820943, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.1450103743023936, + "language_loss": 0.88840854, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91026592, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3577, + "time_per_iteration": 2.4426753520965576 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01006787, + "balance_loss_clip": 1.00466526, + "balance_loss_mlp": 1.02252448, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8177210285410575, + "language_loss": 0.56242883, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.5830456, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32421875, + "step": 3578, + "time_per_iteration": 3.0434820652008057 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.02345788, + "balance_loss_mlp": 1.04957211, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6812319537870581, + "language_loss": 0.88500881, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90683413, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3579, + "time_per_iteration": 2.4646458625793457 + }, + { + "auxiliary_loss_clip": 0.01140846, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.04618824, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.7668055337606152, + "language_loss": 0.78238297, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80421615, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3580, + "time_per_iteration": 2.5029854774475098 + }, + { + "auxiliary_loss_clip": 0.01138764, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.02557576, + "balance_loss_mlp": 1.04757452, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.7955176576656944, + "language_loss": 0.73129165, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75310302, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3581, + "time_per_iteration": 2.503103733062744 + }, + { + "auxiliary_loss_clip": 0.01137091, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.02723205, + "balance_loss_mlp": 1.04665411, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.610409860459302, + "language_loss": 0.70739609, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72922659, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.90234375, + "step": 3582, + "time_per_iteration": 2.4840197563171387 + }, + { + "auxiliary_loss_clip": 0.01137402, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.04602027, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.8570718584923633, + "language_loss": 0.84140432, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86319172, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3583, + "time_per_iteration": 2.4435312747955322 + }, + { + "auxiliary_loss_clip": 0.01143933, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.02834046, + "balance_loss_mlp": 1.04859185, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 3.180305067245919, + "language_loss": 0.83226246, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.8541553, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3584, + "time_per_iteration": 2.521476984024048 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.02816272, + "balance_loss_mlp": 1.04518461, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.0358477693345667, + "language_loss": 0.90233314, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92416549, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.91796875, + "step": 3585, + "time_per_iteration": 2.464745283126831 + }, + { + "auxiliary_loss_clip": 0.01140925, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04832685, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.8296186032289348, + "language_loss": 0.74414444, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76597619, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3586, + "time_per_iteration": 2.5062146186828613 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.02668393, + "balance_loss_mlp": 1.04796743, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.1680236591426416, + "language_loss": 0.83055526, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85239077, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3587, + "time_per_iteration": 2.4784295558929443 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.02438986, + "balance_loss_mlp": 1.04664946, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.8176747371086701, + "language_loss": 0.75756669, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77937388, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3588, + "time_per_iteration": 2.5896053314208984 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02294254, + "balance_loss_mlp": 1.04534698, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8272464683057401, + "language_loss": 0.81006658, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83183837, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3589, + "time_per_iteration": 2.540090799331665 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.02341199, + "balance_loss_mlp": 1.04792953, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.6129530472479154, + "language_loss": 0.72591126, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74772674, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.94140625, + "step": 3590, + "time_per_iteration": 2.5113861560821533 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.02769351, + "balance_loss_mlp": 1.04830956, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.0133132975130477, + "language_loss": 0.83914638, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86106646, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96875, + "step": 3591, + "time_per_iteration": 2.488309621810913 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03167534, + "balance_loss_mlp": 1.04884136, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.271326779903827, + "language_loss": 0.69294131, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71490723, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3592, + "time_per_iteration": 2.571373462677002 + }, + { + "auxiliary_loss_clip": 0.01143219, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.03150403, + "balance_loss_mlp": 1.04881072, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.3999192225546677, + "language_loss": 0.84150124, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86343014, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3593, + "time_per_iteration": 2.4590611457824707 + }, + { + "auxiliary_loss_clip": 0.01144804, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03210783, + "balance_loss_mlp": 1.04839182, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.850380650061706, + "language_loss": 0.75163305, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77357584, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3594, + "time_per_iteration": 3.9338901042938232 + }, + { + "auxiliary_loss_clip": 0.01139476, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.02666509, + "balance_loss_mlp": 1.04763508, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 2.0680180645872057, + "language_loss": 0.80541027, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82724094, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3595, + "time_per_iteration": 3.9857921600341797 + }, + { + "auxiliary_loss_clip": 0.01146272, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.0259887, + "balance_loss_mlp": 1.04883027, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.3330392864683347, + "language_loss": 0.78089929, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80279487, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.97265625, + "step": 3596, + "time_per_iteration": 2.4515480995178223 + }, + { + "auxiliary_loss_clip": 0.01138472, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.03006268, + "balance_loss_mlp": 1.04786897, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.9545740457841054, + "language_loss": 0.83011472, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85196126, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3597, + "time_per_iteration": 2.504703998565674 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.02756798, + "balance_loss_mlp": 1.05029655, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5849845027976412, + "language_loss": 0.80171728, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82361513, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3598, + "time_per_iteration": 2.487013101577759 + }, + { + "auxiliary_loss_clip": 0.0114385, + "auxiliary_loss_mlp": 0.01045551, + "balance_loss_clip": 1.02745485, + "balance_loss_mlp": 1.0476619, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.8175927270691912, + "language_loss": 0.82054996, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.842444, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3599, + "time_per_iteration": 2.5515315532684326 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.0253613, + "balance_loss_mlp": 1.04831243, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 3.186477441139726, + "language_loss": 0.7654863, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78729272, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3600, + "time_per_iteration": 2.5067033767700195 + }, + { + "auxiliary_loss_clip": 0.01139528, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.02842712, + "balance_loss_mlp": 1.04657555, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.9514188507385115, + "language_loss": 0.80026001, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82209218, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.9296875, + "step": 3601, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.01142747, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.0306437, + "balance_loss_mlp": 1.04938436, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.8096424478422806, + "language_loss": 0.83358335, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85548466, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3602, + "time_per_iteration": 2.525151491165161 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.03065276, + "balance_loss_mlp": 1.04670155, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.6489882186888527, + "language_loss": 0.74271673, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76460266, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3603, + "time_per_iteration": 2.5083842277526855 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.02723289, + "balance_loss_mlp": 1.05022252, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.5478742891076147, + "language_loss": 0.73956323, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76139832, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3604, + "time_per_iteration": 2.5100204944610596 + }, + { + "auxiliary_loss_clip": 0.01137594, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.02598965, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 2.2268823896980376, + "language_loss": 0.80375803, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82556069, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.890625, + "step": 3605, + "time_per_iteration": 2.5182228088378906 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01010449, + "balance_loss_clip": 1.0086962, + "balance_loss_mlp": 1.02975249, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6948121220218867, + "language_loss": 0.58376318, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60450989, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.34375, + "step": 3606, + "time_per_iteration": 3.1655373573303223 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.02921534, + "balance_loss_mlp": 1.04939568, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.6754398361548613, + "language_loss": 0.73210037, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75402147, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3607, + "time_per_iteration": 2.508920431137085 + }, + { + "auxiliary_loss_clip": 0.01146221, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_clip": 1.02995718, + "balance_loss_mlp": 1.04935443, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.5718647894236053, + "language_loss": 0.76626337, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78820717, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3608, + "time_per_iteration": 2.440258502960205 + }, + { + "auxiliary_loss_clip": 0.01144868, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.02985787, + "balance_loss_mlp": 1.04866827, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.796333172920123, + "language_loss": 0.74395084, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76586002, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3609, + "time_per_iteration": 2.5326688289642334 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.04871368, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.781486329059154, + "language_loss": 0.88848329, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91040266, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3610, + "time_per_iteration": 2.4611029624938965 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.05045652, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.036787917991119, + "language_loss": 0.77587712, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79770797, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3611, + "time_per_iteration": 2.5187723636627197 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.02829766, + "balance_loss_mlp": 1.04609489, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.067133307741882, + "language_loss": 0.63197911, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65378946, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3612, + "time_per_iteration": 2.4585959911346436 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.02504194, + "balance_loss_mlp": 1.04799449, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.9312736490377453, + "language_loss": 0.75120652, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77304518, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9375, + "step": 3613, + "time_per_iteration": 2.4866983890533447 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.02900767, + "balance_loss_mlp": 1.04560208, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 3.0184875495721, + "language_loss": 0.70767504, + "learning_rate": 3.643197365185261e-06, + "loss": 0.72950327, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 3614, + "time_per_iteration": 2.4454689025878906 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_clip": 1.0288837, + "balance_loss_mlp": 1.0491401, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.8064523730299737, + "language_loss": 0.7314586, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75334036, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.92578125, + "step": 3615, + "time_per_iteration": 2.488711357116699 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02832997, + "balance_loss_mlp": 1.04751146, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.7876016160510377, + "language_loss": 0.90045536, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92239082, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3616, + "time_per_iteration": 2.4552054405212402 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02356279, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.4503731233397383, + "language_loss": 0.8111589, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83300173, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3617, + "time_per_iteration": 2.465254068374634 + }, + { + "auxiliary_loss_clip": 0.01143954, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.02928162, + "balance_loss_mlp": 1.04851139, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.7784831572545423, + "language_loss": 0.75509727, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77699506, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3618, + "time_per_iteration": 2.5263705253601074 + }, + { + "auxiliary_loss_clip": 0.0114255, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.03103614, + "balance_loss_mlp": 1.04738426, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.9247647214638754, + "language_loss": 0.69221723, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71413535, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3619, + "time_per_iteration": 2.4615654945373535 + }, + { + "auxiliary_loss_clip": 0.01145954, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.02723491, + "balance_loss_mlp": 1.04906762, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.7662634429670958, + "language_loss": 0.78337491, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80528164, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3620, + "time_per_iteration": 2.4954700469970703 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.02116966, + "balance_loss_mlp": 1.04363799, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.0129000326388695, + "language_loss": 0.79769373, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81940717, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3621, + "time_per_iteration": 2.490427255630493 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04595852, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.7548460288059653, + "language_loss": 0.87967801, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90146828, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3622, + "time_per_iteration": 2.484462022781372 + }, + { + "auxiliary_loss_clip": 0.01142961, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.02067459, + "balance_loss_mlp": 1.04766297, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 4.811459611972859, + "language_loss": 0.76945633, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79128814, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.94921875, + "step": 3623, + "time_per_iteration": 2.4476547241210938 + }, + { + "auxiliary_loss_clip": 0.0114403, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.02992439, + "balance_loss_mlp": 1.04891419, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.1152987510548615, + "language_loss": 0.84886312, + "learning_rate": 3.640974061218741e-06, + "loss": 0.8707844, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3624, + "time_per_iteration": 2.444913387298584 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.010571, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.0487287, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.345969751242133, + "language_loss": 0.77035248, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79236794, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3625, + "time_per_iteration": 2.4511115550994873 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01000008, + "balance_loss_clip": 0.99836272, + "balance_loss_mlp": 1.02361774, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8233389824181596, + "language_loss": 0.60720766, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62780088, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.35546875, + "step": 3626, + "time_per_iteration": 3.21004319190979 + }, + { + "auxiliary_loss_clip": 0.0114194, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.02858984, + "balance_loss_mlp": 1.04572678, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8978511257882154, + "language_loss": 0.90608853, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92797917, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3627, + "time_per_iteration": 2.4744250774383545 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.02346826, + "balance_loss_mlp": 1.04541492, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8495097769686537, + "language_loss": 0.73612916, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75792623, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3628, + "time_per_iteration": 2.4595446586608887 + }, + { + "auxiliary_loss_clip": 0.01137064, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02232444, + "balance_loss_mlp": 1.04432046, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.99633175048199, + "language_loss": 0.76800162, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.78976429, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3629, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01140004, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.02514172, + "balance_loss_mlp": 1.04701388, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5547294213075904, + "language_loss": 0.71320152, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73501503, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3630, + "time_per_iteration": 2.608846426010132 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.0277338, + "balance_loss_mlp": 1.04635286, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8110131954886999, + "language_loss": 0.76331747, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78508776, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3631, + "time_per_iteration": 2.53765869140625 + }, + { + "auxiliary_loss_clip": 0.01138964, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.0237397, + "balance_loss_mlp": 1.0455693, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.0710075205659906, + "language_loss": 0.74879777, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77058685, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3632, + "time_per_iteration": 2.484896421432495 + }, + { + "auxiliary_loss_clip": 0.01136054, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.04511309, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.966664682342333, + "language_loss": 0.83337629, + "learning_rate": 3.638967767095249e-06, + "loss": 0.8550964, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.91015625, + "step": 3633, + "time_per_iteration": 2.4721779823303223 + }, + { + "auxiliary_loss_clip": 0.01136294, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.03228879, + "balance_loss_mlp": 1.04592657, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.8655293845238095, + "language_loss": 0.81782126, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83966839, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3634, + "time_per_iteration": 2.5514795780181885 + }, + { + "auxiliary_loss_clip": 0.01144011, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.02468133, + "balance_loss_mlp": 1.04863131, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.010090632845536, + "language_loss": 0.75077927, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77262932, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.953125, + "step": 3635, + "time_per_iteration": 4.07889199256897 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01053987, + "balance_loss_clip": 1.03775024, + "balance_loss_mlp": 1.04744601, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.2167396678675155, + "language_loss": 0.87881035, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90072685, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3636, + "time_per_iteration": 3.9134533405303955 + }, + { + "auxiliary_loss_clip": 0.01138959, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.03406608, + "balance_loss_mlp": 1.0456109, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 1.9800006249435054, + "language_loss": 0.75948632, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78138912, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3637, + "time_per_iteration": 2.5531604290008545 + }, + { + "auxiliary_loss_clip": 0.01143812, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.02445328, + "balance_loss_mlp": 1.04728055, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 4.376345077988984, + "language_loss": 0.89677018, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91863406, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3638, + "time_per_iteration": 2.435544967651367 + }, + { + "auxiliary_loss_clip": 0.01140476, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.03377736, + "balance_loss_mlp": 1.04854274, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.382131601644944, + "language_loss": 0.89958721, + "learning_rate": 3.637627440557275e-06, + "loss": 0.9214983, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3639, + "time_per_iteration": 2.448150634765625 + }, + { + "auxiliary_loss_clip": 0.01138473, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.02972686, + "balance_loss_mlp": 1.04632282, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7796744672676124, + "language_loss": 0.79038727, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81222755, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3640, + "time_per_iteration": 2.544577121734619 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01051502, + "balance_loss_clip": 1.03291786, + "balance_loss_mlp": 1.05100346, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.046383525913898, + "language_loss": 0.72049212, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74243474, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.91796875, + "step": 3641, + "time_per_iteration": 2.465439558029175 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.05203855, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.4771917366671, + "language_loss": 0.80913448, + "learning_rate": 3.63695643883745e-06, + "loss": 0.8309828, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3642, + "time_per_iteration": 2.4598801136016846 + }, + { + "auxiliary_loss_clip": 0.01144439, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.02319944, + "balance_loss_mlp": 1.05089164, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.0352379603627684, + "language_loss": 0.71573192, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73758006, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3643, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01144262, + "auxiliary_loss_mlp": 0.01048332, + "balance_loss_clip": 1.03115392, + "balance_loss_mlp": 1.05041492, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.9224514767679763, + "language_loss": 0.68172711, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70365304, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3644, + "time_per_iteration": 2.721107244491577 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.0252583, + "balance_loss_mlp": 1.04905653, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.1869112310362504, + "language_loss": 0.77744782, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79931343, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9609375, + "step": 3645, + "time_per_iteration": 2.4838709831237793 + }, + { + "auxiliary_loss_clip": 0.01140139, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.03275371, + "balance_loss_mlp": 1.04988873, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.575077237748942, + "language_loss": 0.82405865, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84594363, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90234375, + "step": 3646, + "time_per_iteration": 2.467958927154541 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02050591, + "balance_loss_mlp": 1.04901123, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7225223193128734, + "language_loss": 0.83016759, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85191214, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3647, + "time_per_iteration": 2.4670159816741943 + }, + { + "auxiliary_loss_clip": 0.01137396, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.02991438, + "balance_loss_mlp": 1.04734278, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.5879018059409027, + "language_loss": 0.72555232, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74738657, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3648, + "time_per_iteration": 2.5572352409362793 + }, + { + "auxiliary_loss_clip": 0.01140287, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04563618, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3666125536095612, + "language_loss": 0.74363017, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76548404, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3649, + "time_per_iteration": 2.4465692043304443 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.02869856, + "balance_loss_mlp": 1.04609215, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.0558746559562953, + "language_loss": 0.86408567, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88586134, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3650, + "time_per_iteration": 2.4408226013183594 + }, + { + "auxiliary_loss_clip": 0.01137285, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.02747929, + "balance_loss_mlp": 1.04549015, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.0425834927064934, + "language_loss": 0.83693743, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85874897, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3651, + "time_per_iteration": 2.502694845199585 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02941298, + "balance_loss_mlp": 1.04595184, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8702009414404626, + "language_loss": 0.74629313, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76812911, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3652, + "time_per_iteration": 2.4422640800476074 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01011234, + "balance_loss_clip": 1.00946999, + "balance_loss_mlp": 1.0194056, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7344385056765022, + "language_loss": 0.51548386, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53612262, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.33203125, + "step": 3653, + "time_per_iteration": 3.0743935108184814 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.03215361, + "balance_loss_mlp": 1.05115473, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.781801507589209, + "language_loss": 0.75256276, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77447224, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3654, + "time_per_iteration": 2.4826300144195557 + }, + { + "auxiliary_loss_clip": 0.01143131, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.02499056, + "balance_loss_mlp": 1.04988194, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.9986760770887892, + "language_loss": 0.72757828, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74942386, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3655, + "time_per_iteration": 2.494662284851074 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.02860177, + "balance_loss_mlp": 1.04802227, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6963533722566047, + "language_loss": 0.80971813, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83156729, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3656, + "time_per_iteration": 2.465020179748535 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.02267933, + "balance_loss_mlp": 1.05085039, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.205234752003223, + "language_loss": 0.84668207, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86849183, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3657, + "time_per_iteration": 2.4626548290252686 + }, + { + "auxiliary_loss_clip": 0.01138622, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.02126312, + "balance_loss_mlp": 1.0460434, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.714181577212399, + "language_loss": 0.80485702, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.8266257, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3658, + "time_per_iteration": 2.492835521697998 + }, + { + "auxiliary_loss_clip": 0.01053481, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.00065756, + "balance_loss_mlp": 1.02029002, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.8995084923077876, + "language_loss": 0.58224851, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60280788, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.33203125, + "step": 3659, + "time_per_iteration": 3.1709213256835938 + }, + { + "auxiliary_loss_clip": 0.01140235, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.04958415, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.4575828715719177, + "language_loss": 0.74535513, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76715136, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3660, + "time_per_iteration": 2.474397897720337 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.02336597, + "balance_loss_mlp": 1.04723859, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 2.0332694306983723, + "language_loss": 0.81225419, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83404779, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91796875, + "step": 3661, + "time_per_iteration": 2.4926669597625732 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.02639949, + "balance_loss_mlp": 1.04773009, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.8682139743879211, + "language_loss": 0.73236209, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75417411, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3662, + "time_per_iteration": 2.5111234188079834 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.03333092, + "balance_loss_mlp": 1.05132473, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6440107639340105, + "language_loss": 0.77800119, + "learning_rate": 3.632243797111929e-06, + "loss": 0.79989552, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3663, + "time_per_iteration": 2.485520601272583 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.02581656, + "balance_loss_mlp": 1.05125535, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 3.566897500342904, + "language_loss": 0.80484056, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8267172, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3664, + "time_per_iteration": 2.4827098846435547 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01042617, + "balance_loss_clip": 1.02354348, + "balance_loss_mlp": 1.04959095, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.530665000734818, + "language_loss": 0.76296824, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78485775, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.96875, + "step": 3665, + "time_per_iteration": 2.5118229389190674 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_clip": 1.0282042, + "balance_loss_mlp": 1.04779172, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.7337119989610468, + "language_loss": 0.97959125, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00143182, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3666, + "time_per_iteration": 2.4461512565612793 + }, + { + "auxiliary_loss_clip": 0.01136729, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.02241421, + "balance_loss_mlp": 1.04582953, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 2.115803047817727, + "language_loss": 0.80494016, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82670087, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3667, + "time_per_iteration": 2.65198016166687 + }, + { + "auxiliary_loss_clip": 0.01144733, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.02776945, + "balance_loss_mlp": 1.04882097, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.916720089378095, + "language_loss": 0.77463895, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79655218, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3668, + "time_per_iteration": 2.459141254425049 + }, + { + "auxiliary_loss_clip": 0.0114207, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02568614, + "balance_loss_mlp": 1.05058837, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.730318389149699, + "language_loss": 0.71514869, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73699689, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3669, + "time_per_iteration": 2.550732135772705 + }, + { + "auxiliary_loss_clip": 0.01139227, + "auxiliary_loss_mlp": 0.01037839, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.04615474, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.0994504177928826, + "language_loss": 0.85294032, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87471098, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3670, + "time_per_iteration": 2.4727606773376465 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.02360499, + "balance_loss_mlp": 1.05130565, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.775856591734502, + "language_loss": 0.76796275, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.789846, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3671, + "time_per_iteration": 2.613104820251465 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.01758265, + "balance_loss_mlp": 1.0487864, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.8820912362302202, + "language_loss": 0.80472648, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82648075, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3672, + "time_per_iteration": 2.4365992546081543 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.03129566, + "balance_loss_mlp": 1.05145025, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8912849075471436, + "language_loss": 0.736193, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75811654, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3673, + "time_per_iteration": 2.4908931255340576 + }, + { + "auxiliary_loss_clip": 0.01145514, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02064395, + "balance_loss_mlp": 1.05221379, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9375944290288487, + "language_loss": 0.76505005, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78688282, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3674, + "time_per_iteration": 2.569312572479248 + }, + { + "auxiliary_loss_clip": 0.01142786, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.02518344, + "balance_loss_mlp": 1.05025005, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.0287396146216055, + "language_loss": 0.74786556, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76972854, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.92578125, + "step": 3675, + "time_per_iteration": 2.4762706756591797 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.02395034, + "balance_loss_mlp": 1.0473659, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.7527405009289938, + "language_loss": 0.80050498, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82232398, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3676, + "time_per_iteration": 2.5846786499023438 + }, + { + "auxiliary_loss_clip": 0.0114147, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.03106666, + "balance_loss_mlp": 1.0474596, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.974355382670518, + "language_loss": 0.75501895, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77690685, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3677, + "time_per_iteration": 4.02753758430481 + }, + { + "auxiliary_loss_clip": 0.01135837, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.02455878, + "balance_loss_mlp": 1.0449332, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 2.0397766719275494, + "language_loss": 0.83412457, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85589325, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3678, + "time_per_iteration": 3.9455032348632812 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.02903211, + "balance_loss_mlp": 1.04866314, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.7724652071984504, + "language_loss": 0.89272189, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91459215, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3679, + "time_per_iteration": 2.548166036605835 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_clip": 1.03517246, + "balance_loss_mlp": 1.04887235, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.4577897330130773, + "language_loss": 0.86718571, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88914388, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3680, + "time_per_iteration": 2.468712329864502 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.02809739, + "balance_loss_mlp": 1.05175805, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.0752123015423556, + "language_loss": 0.81897914, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84083802, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3681, + "time_per_iteration": 2.532210350036621 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.02076972, + "balance_loss_mlp": 1.04784071, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.44274183004677, + "language_loss": 0.79908317, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82081306, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 3682, + "time_per_iteration": 2.491135358810425 + }, + { + "auxiliary_loss_clip": 0.01140313, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04739022, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 2.2064811404605376, + "language_loss": 0.77283889, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79466248, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 3683, + "time_per_iteration": 2.503041982650757 + }, + { + "auxiliary_loss_clip": 0.01141417, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.02824235, + "balance_loss_mlp": 1.04623342, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.114071962716483, + "language_loss": 0.72779894, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74966961, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3684, + "time_per_iteration": 2.521495819091797 + }, + { + "auxiliary_loss_clip": 0.01142849, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.02598643, + "balance_loss_mlp": 1.05060613, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.9389187138945425, + "language_loss": 0.80108052, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82294679, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3685, + "time_per_iteration": 2.436958074569702 + }, + { + "auxiliary_loss_clip": 0.01135153, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.02504683, + "balance_loss_mlp": 1.04634571, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.5568750132404718, + "language_loss": 0.87128556, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89303845, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 3686, + "time_per_iteration": 2.5519070625305176 + }, + { + "auxiliary_loss_clip": 0.01138026, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.02552581, + "balance_loss_mlp": 1.04762685, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.942015126167962, + "language_loss": 0.77953136, + "learning_rate": 3.626824502298707e-06, + "loss": 0.8013379, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3687, + "time_per_iteration": 2.495084285736084 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01048705, + "balance_loss_clip": 1.03085971, + "balance_loss_mlp": 1.05057812, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8313314390802422, + "language_loss": 0.84722549, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86917698, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3688, + "time_per_iteration": 2.5029165744781494 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.02897787, + "balance_loss_mlp": 1.05005932, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.7913489877281905, + "language_loss": 0.81395769, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83588976, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3689, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.0114216, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.02758622, + "balance_loss_mlp": 1.04985952, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 2.5504206662352082, + "language_loss": 0.70040542, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72227693, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3690, + "time_per_iteration": 2.5005807876586914 + }, + { + "auxiliary_loss_clip": 0.01145048, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.04890513, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.7318147752747124, + "language_loss": 0.72394359, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74577713, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3691, + "time_per_iteration": 2.4835989475250244 + }, + { + "auxiliary_loss_clip": 0.01145815, + "auxiliary_loss_mlp": 0.01049746, + "balance_loss_clip": 1.03169739, + "balance_loss_mlp": 1.05317688, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.1843836481793057, + "language_loss": 0.71611524, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73807085, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.92578125, + "step": 3692, + "time_per_iteration": 2.515230655670166 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01044658, + "balance_loss_clip": 1.02750337, + "balance_loss_mlp": 1.05008483, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 2.7650002202849113, + "language_loss": 0.87580657, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89772147, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.96875, + "step": 3693, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.02086258, + "balance_loss_mlp": 1.04947054, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 3.031177285152565, + "language_loss": 0.85307622, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87482512, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.890625, + "step": 3694, + "time_per_iteration": 2.4828481674194336 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9517253418741858, + "language_loss": 0.69055748, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71244752, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 3695, + "time_per_iteration": 2.49957537651062 + }, + { + "auxiliary_loss_clip": 0.01141491, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02438951, + "balance_loss_mlp": 1.05095696, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4867456423055678, + "language_loss": 0.71710318, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73891842, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 3696, + "time_per_iteration": 2.5991299152374268 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.02387977, + "balance_loss_mlp": 1.0483942, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.901791440824732, + "language_loss": 0.87694812, + "learning_rate": 3.624555968803217e-06, + "loss": 0.8987658, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3697, + "time_per_iteration": 2.524841547012329 + }, + { + "auxiliary_loss_clip": 0.01134138, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.03020072, + "balance_loss_mlp": 1.04646909, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.985465494359005, + "language_loss": 0.66109681, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68289793, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3698, + "time_per_iteration": 2.6806552410125732 + }, + { + "auxiliary_loss_clip": 0.01143188, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.049245, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9701476357110561, + "language_loss": 0.82699466, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84881532, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9375, + "step": 3699, + "time_per_iteration": 2.620795965194702 + }, + { + "auxiliary_loss_clip": 0.01141189, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02789021, + "balance_loss_mlp": 1.04960978, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6593732889446324, + "language_loss": 0.79488564, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81674713, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3700, + "time_per_iteration": 2.4886739253997803 + }, + { + "auxiliary_loss_clip": 0.01148421, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_clip": 1.02639139, + "balance_loss_mlp": 1.05154204, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 7.082418544009014, + "language_loss": 0.72063768, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74257213, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96875, + "step": 3701, + "time_per_iteration": 2.7293899059295654 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.02520323, + "balance_loss_mlp": 1.04706395, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.9269634413479926, + "language_loss": 0.79704928, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81886196, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3702, + "time_per_iteration": 2.5527849197387695 + }, + { + "auxiliary_loss_clip": 0.01132932, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02487028, + "balance_loss_mlp": 1.04518211, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 2.7410709876553447, + "language_loss": 0.78632712, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80807453, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 3703, + "time_per_iteration": 2.4955005645751953 + }, + { + "auxiliary_loss_clip": 0.01140692, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.01810527, + "balance_loss_mlp": 1.0468421, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.8479834568020117, + "language_loss": 0.74212444, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7639066, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9375, + "step": 3704, + "time_per_iteration": 2.5000903606414795 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.02618146, + "balance_loss_mlp": 1.05030012, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.7361108874663713, + "language_loss": 0.64372134, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66553271, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3705, + "time_per_iteration": 2.6993744373321533 + }, + { + "auxiliary_loss_clip": 0.01064369, + "auxiliary_loss_mlp": 0.01006302, + "balance_loss_clip": 1.00454926, + "balance_loss_mlp": 1.03098035, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.353184132187748, + "language_loss": 0.65301311, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67371976, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.33398438, + "step": 3706, + "time_per_iteration": 2.9832844734191895 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02076256, + "balance_loss_mlp": 1.0461061, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 3.09427451037038, + "language_loss": 0.80608439, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82783049, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91796875, + "step": 3707, + "time_per_iteration": 2.5236454010009766 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.02184916, + "balance_loss_mlp": 1.04706407, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.0318896185848057, + "language_loss": 0.78124011, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80301505, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3708, + "time_per_iteration": 2.5254104137420654 + }, + { + "auxiliary_loss_clip": 0.01142891, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.02291107, + "balance_loss_mlp": 1.04897153, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.913582269302705, + "language_loss": 0.79989487, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82172012, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3709, + "time_per_iteration": 2.5528371334075928 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.02913201, + "balance_loss_mlp": 1.04580092, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.062693768306912, + "language_loss": 0.68752408, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70937693, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3710, + "time_per_iteration": 2.511275053024292 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.0221858, + "balance_loss_mlp": 1.04812646, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.3083581079415216, + "language_loss": 0.90696692, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92880082, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3711, + "time_per_iteration": 2.4757487773895264 + }, + { + "auxiliary_loss_clip": 0.01138091, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.03406, + "balance_loss_mlp": 1.04603434, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.758927620438821, + "language_loss": 0.89628232, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91818309, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.921875, + "step": 3712, + "time_per_iteration": 2.3870105743408203 + }, + { + "auxiliary_loss_clip": 0.01139482, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.03356993, + "balance_loss_mlp": 1.04956841, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.039950461935961, + "language_loss": 0.74859631, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77050602, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.8984375, + "step": 3713, + "time_per_iteration": 2.4336304664611816 + }, + { + "auxiliary_loss_clip": 0.01138793, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.02543497, + "balance_loss_mlp": 1.048329, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.8221921578975473, + "language_loss": 0.62592143, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64772761, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3714, + "time_per_iteration": 2.6230995655059814 + }, + { + "auxiliary_loss_clip": 0.01139199, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.04734552, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.9329837891440178, + "language_loss": 0.79052407, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81228578, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3715, + "time_per_iteration": 2.510436534881592 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03372216, + "balance_loss_mlp": 1.05021942, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.6633570096565886, + "language_loss": 0.77182817, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79375589, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3716, + "time_per_iteration": 2.4398605823516846 + }, + { + "auxiliary_loss_clip": 0.01141179, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.03046429, + "balance_loss_mlp": 1.04845762, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.028714583879474, + "language_loss": 0.79209757, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81397963, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3717, + "time_per_iteration": 2.456042766571045 + }, + { + "auxiliary_loss_clip": 0.01143546, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.04934192, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.2103373086531115, + "language_loss": 0.68029571, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70214242, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3718, + "time_per_iteration": 2.4818973541259766 + }, + { + "auxiliary_loss_clip": 0.01142458, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.02067208, + "balance_loss_mlp": 1.04784536, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.9912565029374794, + "language_loss": 0.80194163, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8237524, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9453125, + "step": 3719, + "time_per_iteration": 3.985903263092041 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01052597, + "balance_loss_clip": 1.03396416, + "balance_loss_mlp": 1.04785836, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.0930960597239707, + "language_loss": 0.86421579, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88619983, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3720, + "time_per_iteration": 3.914626359939575 + }, + { + "auxiliary_loss_clip": 0.0114136, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.01682639, + "balance_loss_mlp": 1.05105066, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.6398614781610892, + "language_loss": 0.74860299, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77035284, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 3721, + "time_per_iteration": 2.485271453857422 + }, + { + "auxiliary_loss_clip": 0.011451, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.0256865, + "balance_loss_mlp": 1.0494988, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.928465692067959, + "language_loss": 0.78943181, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81131673, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3722, + "time_per_iteration": 2.471928834915161 + }, + { + "auxiliary_loss_clip": 0.01140042, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02551126, + "balance_loss_mlp": 1.05004597, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2482737248582247, + "language_loss": 0.82315016, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84496701, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3723, + "time_per_iteration": 2.4540791511535645 + }, + { + "auxiliary_loss_clip": 0.01144828, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.02729177, + "balance_loss_mlp": 1.05062389, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.154682666342997, + "language_loss": 0.84433442, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86622941, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3724, + "time_per_iteration": 2.526204824447632 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.04889762, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.178002887638817, + "language_loss": 0.79036546, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81216478, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9140625, + "step": 3725, + "time_per_iteration": 2.513136625289917 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02271581, + "balance_loss_mlp": 1.04898071, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.6732241790302085, + "language_loss": 0.77158499, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79337394, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90625, + "step": 3726, + "time_per_iteration": 2.5645246505737305 + }, + { + "auxiliary_loss_clip": 0.01150164, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.02677917, + "balance_loss_mlp": 1.05054045, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.7042555627132296, + "language_loss": 0.72376108, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74571931, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 3727, + "time_per_iteration": 2.4437429904937744 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02140689, + "balance_loss_mlp": 1.04682648, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.2876633759350327, + "language_loss": 0.86584771, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88769633, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3728, + "time_per_iteration": 2.496020793914795 + }, + { + "auxiliary_loss_clip": 0.01143576, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02838981, + "balance_loss_mlp": 1.05045211, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.0817566504616734, + "language_loss": 0.80479026, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82670236, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9296875, + "step": 3729, + "time_per_iteration": 2.4733448028564453 + }, + { + "auxiliary_loss_clip": 0.01136706, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_clip": 1.03019357, + "balance_loss_mlp": 1.04672551, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 2.3054621640206205, + "language_loss": 0.86468041, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88651037, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3730, + "time_per_iteration": 2.5348362922668457 + }, + { + "auxiliary_loss_clip": 0.01136756, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.01917958, + "balance_loss_mlp": 1.04737782, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.75673058423422, + "language_loss": 0.73293322, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75465709, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 3731, + "time_per_iteration": 2.4397478103637695 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.02370882, + "balance_loss_mlp": 1.04893279, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.4044438539905575, + "language_loss": 0.75237334, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77418989, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3732, + "time_per_iteration": 2.476630926132202 + }, + { + "auxiliary_loss_clip": 0.01141784, + "auxiliary_loss_mlp": 0.01058138, + "balance_loss_clip": 1.04106712, + "balance_loss_mlp": 1.0494858, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.8584104659795708, + "language_loss": 0.88037199, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90237123, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3733, + "time_per_iteration": 2.4723222255706787 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.02473271, + "balance_loss_mlp": 1.04564941, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6767676579772364, + "language_loss": 0.84200239, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86380494, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3734, + "time_per_iteration": 2.5214619636535645 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03170574, + "balance_loss_mlp": 1.0513525, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6368426378189131, + "language_loss": 0.76838279, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79030693, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3735, + "time_per_iteration": 2.5025858879089355 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.032215, + "balance_loss_mlp": 1.04791164, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 3.6998773026048046, + "language_loss": 0.84505916, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86688507, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 3736, + "time_per_iteration": 2.581409454345703 + }, + { + "auxiliary_loss_clip": 0.0114079, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.02306545, + "balance_loss_mlp": 1.04848719, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 2.2208030259376192, + "language_loss": 0.86398852, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88579136, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3737, + "time_per_iteration": 2.4498212337493896 + }, + { + "auxiliary_loss_clip": 0.01141365, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.02917397, + "balance_loss_mlp": 1.0476644, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 2.434824168439142, + "language_loss": 0.79145718, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81334245, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3738, + "time_per_iteration": 2.5505504608154297 + }, + { + "auxiliary_loss_clip": 0.01140019, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.0471611, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 2.2711438439691314, + "language_loss": 0.75895345, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78076756, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3739, + "time_per_iteration": 2.458307981491089 + }, + { + "auxiliary_loss_clip": 0.01137257, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.04610491, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.9782758832921432, + "language_loss": 0.74705702, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76885068, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3740, + "time_per_iteration": 2.5424981117248535 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.04691672, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.174963459036685, + "language_loss": 0.76083958, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78261012, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3741, + "time_per_iteration": 2.4539613723754883 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02263021, + "balance_loss_mlp": 1.05022252, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.917686629559915, + "language_loss": 0.87458241, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89636862, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3742, + "time_per_iteration": 2.483146905899048 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.03403831, + "balance_loss_mlp": 1.04824293, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.0726823880461116, + "language_loss": 0.81939828, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84128648, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3743, + "time_per_iteration": 2.4786789417266846 + }, + { + "auxiliary_loss_clip": 0.01140562, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.02241504, + "balance_loss_mlp": 1.04843307, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 3.9980575521347697, + "language_loss": 0.63616955, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65796053, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.921875, + "step": 3744, + "time_per_iteration": 2.4746344089508057 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.02634597, + "balance_loss_mlp": 1.04524422, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 3.3106228370485806, + "language_loss": 0.75711048, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77891332, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3745, + "time_per_iteration": 2.4295878410339355 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02142549, + "balance_loss_mlp": 1.04637384, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.8117958881819525, + "language_loss": 0.80839783, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83013999, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3746, + "time_per_iteration": 2.4423928260803223 + }, + { + "auxiliary_loss_clip": 0.01138701, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.02903056, + "balance_loss_mlp": 1.04503584, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.508960709641407, + "language_loss": 0.86067426, + "learning_rate": 3.613121069229862e-06, + "loss": 0.8825202, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3747, + "time_per_iteration": 2.471223831176758 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.01789808, + "balance_loss_mlp": 1.04515314, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.812236682782158, + "language_loss": 0.76358509, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78529495, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.91796875, + "step": 3748, + "time_per_iteration": 2.525108575820923 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.0274291, + "balance_loss_mlp": 1.04882264, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.7339876982656162, + "language_loss": 0.79497123, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81683606, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3749, + "time_per_iteration": 2.4881162643432617 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.04609084, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6101192523185979, + "language_loss": 0.8009423, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82267606, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8828125, + "step": 3750, + "time_per_iteration": 2.4656643867492676 + }, + { + "auxiliary_loss_clip": 0.01140861, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.02733183, + "balance_loss_mlp": 1.04821157, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.418289881699729, + "language_loss": 0.81336129, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83521116, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3751, + "time_per_iteration": 2.4960029125213623 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.02242589, + "balance_loss_mlp": 1.04915667, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.757449596716865, + "language_loss": 0.83989275, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86169416, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3752, + "time_per_iteration": 2.4668636322021484 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.02375996, + "balance_loss_mlp": 1.04671109, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.7780915453784651, + "language_loss": 0.78616595, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80792689, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.90234375, + "step": 3753, + "time_per_iteration": 2.4305062294006348 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.02113724, + "balance_loss_mlp": 1.04717183, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.990408742554116, + "language_loss": 0.78284466, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80460101, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3754, + "time_per_iteration": 2.584170341491699 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.0243969, + "balance_loss_mlp": 1.04882884, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.915767444367904, + "language_loss": 0.70267534, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72444952, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 3755, + "time_per_iteration": 2.458731174468994 + }, + { + "auxiliary_loss_clip": 0.01145193, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.031744, + "balance_loss_mlp": 1.0502069, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.7446757969812783, + "language_loss": 0.77373838, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79567063, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3756, + "time_per_iteration": 2.5073161125183105 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_clip": 1.02498841, + "balance_loss_mlp": 1.05014277, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.8909279955578986, + "language_loss": 0.82552433, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.847399, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3757, + "time_per_iteration": 2.471353054046631 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.02258492, + "balance_loss_mlp": 1.04810619, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8410990661161322, + "language_loss": 0.73181808, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.7536208, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3758, + "time_per_iteration": 2.5376477241516113 + }, + { + "auxiliary_loss_clip": 0.01144551, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.02985883, + "balance_loss_mlp": 1.04991663, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.0967514749881015, + "language_loss": 0.77208662, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79399836, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3759, + "time_per_iteration": 2.447608709335327 + }, + { + "auxiliary_loss_clip": 0.01141959, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02643597, + "balance_loss_mlp": 1.04806697, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.9036057015372598, + "language_loss": 0.78638428, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80824387, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3760, + "time_per_iteration": 4.231990098953247 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01007925, + "balance_loss_clip": 1.00607765, + "balance_loss_mlp": 1.02028942, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9344871733021222, + "language_loss": 0.60090166, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62152445, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.33984375, + "step": 3761, + "time_per_iteration": 4.482504367828369 + }, + { + "auxiliary_loss_clip": 0.0114253, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.02678633, + "balance_loss_mlp": 1.0478611, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.386395888426225, + "language_loss": 0.77400732, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79587454, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3762, + "time_per_iteration": 2.5162198543548584 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02395821, + "balance_loss_mlp": 1.05073345, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.10132066013886, + "language_loss": 0.78800118, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.80984461, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3763, + "time_per_iteration": 2.4578778743743896 + }, + { + "auxiliary_loss_clip": 0.01145794, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.03583384, + "balance_loss_mlp": 1.05000031, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.8659674868358982, + "language_loss": 0.91363662, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93563628, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.95703125, + "step": 3764, + "time_per_iteration": 2.536231517791748 + }, + { + "auxiliary_loss_clip": 0.01138186, + "auxiliary_loss_mlp": 0.01054666, + "balance_loss_clip": 1.03740454, + "balance_loss_mlp": 1.04773271, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.6188972360392109, + "language_loss": 0.75211406, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77404261, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 3765, + "time_per_iteration": 2.516646146774292 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.02868426, + "balance_loss_mlp": 1.04855943, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.9315012383394614, + "language_loss": 0.89618981, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91804343, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3766, + "time_per_iteration": 2.4829306602478027 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.02568591, + "balance_loss_mlp": 1.04891181, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6662033714223943, + "language_loss": 0.74710411, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76891464, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 3767, + "time_per_iteration": 2.4989218711853027 + }, + { + "auxiliary_loss_clip": 0.011397, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.02664912, + "balance_loss_mlp": 1.04619229, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4804117361030718, + "language_loss": 0.7156831, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73752159, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3768, + "time_per_iteration": 2.5078160762786865 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.05247319, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.80046116612075, + "language_loss": 0.78268003, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80466181, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3769, + "time_per_iteration": 2.5122978687286377 + }, + { + "auxiliary_loss_clip": 0.01142038, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.02465522, + "balance_loss_mlp": 1.0467639, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.7393050758681738, + "language_loss": 0.68427956, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70612001, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3770, + "time_per_iteration": 2.557098150253296 + }, + { + "auxiliary_loss_clip": 0.01141766, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02537811, + "balance_loss_mlp": 1.04682195, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6251414008252867, + "language_loss": 0.80370939, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82554382, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3771, + "time_per_iteration": 2.5156240463256836 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.0290848, + "balance_loss_mlp": 1.04606724, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.567346312954514, + "language_loss": 0.78844583, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81025243, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 3772, + "time_per_iteration": 2.539632558822632 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01005348, + "balance_loss_clip": 1.00351191, + "balance_loss_mlp": 1.02012253, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6518085485856671, + "language_loss": 0.54334348, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56392735, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.33007812, + "step": 3773, + "time_per_iteration": 3.1463003158569336 + }, + { + "auxiliary_loss_clip": 0.01136639, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02222633, + "balance_loss_mlp": 1.04712117, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.9230264173849037, + "language_loss": 0.70101082, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72276813, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3774, + "time_per_iteration": 2.5099127292633057 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.02232277, + "balance_loss_mlp": 1.04480648, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.4369678263863057, + "language_loss": 0.74585366, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76758826, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 3775, + "time_per_iteration": 2.4441745281219482 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.02311933, + "balance_loss_mlp": 1.04534245, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.175545430509675, + "language_loss": 0.8256253, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8473829, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3776, + "time_per_iteration": 2.4418301582336426 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.02125907, + "balance_loss_mlp": 1.04619908, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.75835757539417, + "language_loss": 0.83031607, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85209382, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3777, + "time_per_iteration": 2.5585062503814697 + }, + { + "auxiliary_loss_clip": 0.01137385, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.02232909, + "balance_loss_mlp": 1.04596353, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.6678368583827288, + "language_loss": 0.72658038, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74834561, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3778, + "time_per_iteration": 2.5019333362579346 + }, + { + "auxiliary_loss_clip": 0.0113896, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.02386749, + "balance_loss_mlp": 1.04576886, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.229609453971581, + "language_loss": 0.6414392, + "learning_rate": 3.605722410602591e-06, + "loss": 0.663234, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3779, + "time_per_iteration": 2.5082859992980957 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.02794909, + "balance_loss_mlp": 1.04837573, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.9715072832436495, + "language_loss": 0.70546824, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72728658, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3780, + "time_per_iteration": 2.4703643321990967 + }, + { + "auxiliary_loss_clip": 0.01140054, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_clip": 1.02689338, + "balance_loss_mlp": 1.0489254, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 2.5454366084291133, + "language_loss": 0.89717996, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91902977, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 3781, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.0113992, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.02436364, + "balance_loss_mlp": 1.04648304, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.4601522898780805, + "language_loss": 0.7434786, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76529634, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3782, + "time_per_iteration": 2.4665582180023193 + }, + { + "auxiliary_loss_clip": 0.01136804, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02587914, + "balance_loss_mlp": 1.04467201, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.6148985015615094, + "language_loss": 0.82393098, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84571576, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3783, + "time_per_iteration": 2.4820034503936768 + }, + { + "auxiliary_loss_clip": 0.01137013, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.02310586, + "balance_loss_mlp": 1.04418266, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 2.4165791890347714, + "language_loss": 0.75874048, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78051311, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3784, + "time_per_iteration": 2.5087246894836426 + }, + { + "auxiliary_loss_clip": 0.01134704, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02303135, + "balance_loss_mlp": 1.04345798, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6490497895559066, + "language_loss": 0.70716858, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72891551, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3785, + "time_per_iteration": 2.4733574390411377 + }, + { + "auxiliary_loss_clip": 0.01051525, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00252521, + "balance_loss_mlp": 1.01740241, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8187947911361427, + "language_loss": 0.61915314, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63971269, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.34179688, + "step": 3786, + "time_per_iteration": 3.0474631786346436 + }, + { + "auxiliary_loss_clip": 0.01143523, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02488649, + "balance_loss_mlp": 1.04777002, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.6740153696427247, + "language_loss": 0.86285794, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88471758, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3787, + "time_per_iteration": 2.4331281185150146 + }, + { + "auxiliary_loss_clip": 0.01137002, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04612255, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.2844293081892826, + "language_loss": 0.72555876, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74733031, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 3788, + "time_per_iteration": 2.5378167629241943 + }, + { + "auxiliary_loss_clip": 0.01136486, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.01929688, + "balance_loss_mlp": 1.04552293, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.4737623033533587, + "language_loss": 0.67524469, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69697154, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3789, + "time_per_iteration": 2.412086248397827 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.02416384, + "balance_loss_mlp": 1.04507327, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.1501364843402335, + "language_loss": 0.76075745, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78253406, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 3790, + "time_per_iteration": 2.503600835800171 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02174211, + "balance_loss_mlp": 1.04253387, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.0794940610838397, + "language_loss": 0.90613973, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92787266, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3791, + "time_per_iteration": 2.4503557682037354 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02000308, + "balance_loss_mlp": 1.04407096, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.8390004860332834, + "language_loss": 0.82869208, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85044241, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3792, + "time_per_iteration": 2.5451550483703613 + }, + { + "auxiliary_loss_clip": 0.01045824, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.01015747, + "balance_loss_mlp": 1.01168287, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1436128607221614, + "language_loss": 0.65615487, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67673355, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.34179688, + "step": 3793, + "time_per_iteration": 2.7929015159606934 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01050177, + "balance_loss_clip": 1.03241456, + "balance_loss_mlp": 1.04557967, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.282271850248546, + "language_loss": 0.77100229, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79292452, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 3794, + "time_per_iteration": 2.4882023334503174 + }, + { + "auxiliary_loss_clip": 0.01139112, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02146518, + "balance_loss_mlp": 1.04517698, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.1931228295055716, + "language_loss": 0.80724937, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82902336, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3795, + "time_per_iteration": 2.475311279296875 + }, + { + "auxiliary_loss_clip": 0.0113575, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02747297, + "balance_loss_mlp": 1.04336488, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.8416311408581074, + "language_loss": 0.77002209, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79182816, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3796, + "time_per_iteration": 2.4734761714935303 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.02236056, + "balance_loss_mlp": 1.04312813, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.506500245398156, + "language_loss": 0.9594354, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98118514, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3797, + "time_per_iteration": 2.4146203994750977 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.04537892, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6428427275001165, + "language_loss": 0.81446218, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83624852, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3798, + "time_per_iteration": 2.490849733352661 + }, + { + "auxiliary_loss_clip": 0.01137089, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.02080309, + "balance_loss_mlp": 1.04262519, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.3515161945239833, + "language_loss": 0.78744864, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.80920684, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3799, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.0332408, + "balance_loss_mlp": 1.04381084, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.655995083326211, + "language_loss": 0.75234401, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77421868, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3800, + "time_per_iteration": 2.510788917541504 + }, + { + "auxiliary_loss_clip": 0.01137174, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.04583156, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.661997570582357, + "language_loss": 0.63433349, + "learning_rate": 3.600599647297484e-06, + "loss": 0.6560958, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3801, + "time_per_iteration": 2.503643035888672 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.02027762, + "balance_loss_mlp": 1.04721296, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7846583359688928, + "language_loss": 0.81602335, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83774745, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3802, + "time_per_iteration": 4.002788782119751 + }, + { + "auxiliary_loss_clip": 0.01138233, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04454207, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7683413549342115, + "language_loss": 0.78830242, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81015933, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3803, + "time_per_iteration": 3.9494168758392334 + }, + { + "auxiliary_loss_clip": 0.01135958, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.04115725, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.6939241338011581, + "language_loss": 0.85561395, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87740004, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3804, + "time_per_iteration": 2.4504544734954834 + }, + { + "auxiliary_loss_clip": 0.01139159, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.02744436, + "balance_loss_mlp": 1.04339862, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.1651494765134736, + "language_loss": 0.76485813, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3805, + "time_per_iteration": 2.4578893184661865 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.02560234, + "balance_loss_mlp": 1.04387915, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.4014048134005628, + "language_loss": 0.79309744, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81492996, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3806, + "time_per_iteration": 2.415726900100708 + }, + { + "auxiliary_loss_clip": 0.01139425, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.02717948, + "balance_loss_mlp": 1.04547703, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.230394288716221, + "language_loss": 0.69194484, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71377647, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3807, + "time_per_iteration": 2.6051764488220215 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.03484392, + "balance_loss_mlp": 1.04811931, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.5207266425605668, + "language_loss": 0.65717816, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67915517, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3808, + "time_per_iteration": 2.463885545730591 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.02564931, + "balance_loss_mlp": 1.04470515, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8002654314964242, + "language_loss": 0.74498177, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76677001, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3809, + "time_per_iteration": 2.4587652683258057 + }, + { + "auxiliary_loss_clip": 0.01138179, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.02695227, + "balance_loss_mlp": 1.04707646, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.6413135962032894, + "language_loss": 0.81699908, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83881009, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3810, + "time_per_iteration": 2.454545736312866 + }, + { + "auxiliary_loss_clip": 0.01135521, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.02448893, + "balance_loss_mlp": 1.04428005, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.1876822434942245, + "language_loss": 0.78671384, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8084712, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9140625, + "step": 3811, + "time_per_iteration": 2.4564197063446045 + }, + { + "auxiliary_loss_clip": 0.01135961, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.0296042, + "balance_loss_mlp": 1.04317403, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.8120535445273127, + "language_loss": 0.82811391, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84994221, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3812, + "time_per_iteration": 2.4357566833496094 + }, + { + "auxiliary_loss_clip": 0.01144518, + "auxiliary_loss_mlp": 0.01051465, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04750013, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.041111828111396, + "language_loss": 0.82337058, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84533036, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.96875, + "step": 3813, + "time_per_iteration": 2.4521987438201904 + }, + { + "auxiliary_loss_clip": 0.01139715, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.04595184, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 3.1740680187078896, + "language_loss": 0.69927102, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72113466, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3814, + "time_per_iteration": 2.5528602600097656 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.02773738, + "balance_loss_mlp": 1.04310775, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.479981906508555, + "language_loss": 0.67106915, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69285899, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3815, + "time_per_iteration": 2.4768760204315186 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01052238, + "balance_loss_clip": 1.03593004, + "balance_loss_mlp": 1.04644942, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8467960453518941, + "language_loss": 0.83103681, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85295475, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3816, + "time_per_iteration": 2.507967710494995 + }, + { + "auxiliary_loss_clip": 0.0113842, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.04643357, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.1171855882825636, + "language_loss": 0.86756372, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8893379, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3817, + "time_per_iteration": 2.4445815086364746 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03278852, + "balance_loss_mlp": 1.04829407, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.403232678237585, + "language_loss": 0.75039381, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77230936, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3818, + "time_per_iteration": 2.508527994155884 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.02426732, + "balance_loss_mlp": 1.04769611, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6537639427714739, + "language_loss": 0.74597251, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76779795, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3819, + "time_per_iteration": 2.5009493827819824 + }, + { + "auxiliary_loss_clip": 0.01138376, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 1.04632187, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.815385500594849, + "language_loss": 0.80775046, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.8295821, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3820, + "time_per_iteration": 2.5374531745910645 + }, + { + "auxiliary_loss_clip": 0.01142613, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.04725921, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.0886359367899763, + "language_loss": 0.69226766, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71411359, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3821, + "time_per_iteration": 2.4539082050323486 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.01912975, + "balance_loss_mlp": 1.0466336, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.623620301878745, + "language_loss": 0.82655883, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.84831905, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3822, + "time_per_iteration": 2.5025360584259033 + }, + { + "auxiliary_loss_clip": 0.01137437, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.02264285, + "balance_loss_mlp": 1.04520607, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.581563173789708, + "language_loss": 0.66093826, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68272227, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.921875, + "step": 3823, + "time_per_iteration": 2.500643253326416 + }, + { + "auxiliary_loss_clip": 0.0104753, + "auxiliary_loss_mlp": 0.01001124, + "balance_loss_clip": 0.99913329, + "balance_loss_mlp": 1.01448655, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8191682875264555, + "language_loss": 0.56770015, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58818674, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33203125, + "step": 3824, + "time_per_iteration": 3.1365485191345215 + }, + { + "auxiliary_loss_clip": 0.01135805, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04575276, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.487273324074565, + "language_loss": 0.72840559, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75015926, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3825, + "time_per_iteration": 2.444730758666992 + }, + { + "auxiliary_loss_clip": 0.01143286, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.02810407, + "balance_loss_mlp": 1.04978526, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8892090994393747, + "language_loss": 0.87760615, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89949936, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3826, + "time_per_iteration": 2.492682456970215 + }, + { + "auxiliary_loss_clip": 0.01142911, + "auxiliary_loss_mlp": 0.01043844, + "balance_loss_clip": 1.0258677, + "balance_loss_mlp": 1.04683542, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.6663888482282623, + "language_loss": 0.81568289, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8375504, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3827, + "time_per_iteration": 2.488593578338623 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.02750623, + "balance_loss_mlp": 1.04553437, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.8456206141648608, + "language_loss": 0.86791205, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88970977, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3828, + "time_per_iteration": 2.4386606216430664 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01049169, + "balance_loss_clip": 1.03147864, + "balance_loss_mlp": 1.04512644, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.106420485404446, + "language_loss": 0.70638877, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72826439, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.93359375, + "step": 3829, + "time_per_iteration": 2.475399971008301 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02241933, + "balance_loss_mlp": 1.05011487, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5719627508253273, + "language_loss": 0.84045994, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86223942, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3830, + "time_per_iteration": 2.4943718910217285 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.03172636, + "balance_loss_mlp": 1.04637957, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.733206127117623, + "language_loss": 0.66863495, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69051576, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3831, + "time_per_iteration": 2.6513662338256836 + }, + { + "auxiliary_loss_clip": 0.01141151, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.04735637, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.238850649877041, + "language_loss": 0.75253022, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77436894, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3832, + "time_per_iteration": 2.4889180660247803 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.02515745, + "balance_loss_mlp": 1.04709673, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.8583815246829203, + "language_loss": 0.87474239, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89656878, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.92578125, + "step": 3833, + "time_per_iteration": 2.46744966506958 + }, + { + "auxiliary_loss_clip": 0.01140821, + "auxiliary_loss_mlp": 0.01047215, + "balance_loss_clip": 1.02950096, + "balance_loss_mlp": 1.0478369, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 3.2120713643012206, + "language_loss": 0.74875945, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77063978, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3834, + "time_per_iteration": 2.485504627227783 + }, + { + "auxiliary_loss_clip": 0.0113943, + "auxiliary_loss_mlp": 0.01051682, + "balance_loss_clip": 1.03408706, + "balance_loss_mlp": 1.0484879, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.820281268490984, + "language_loss": 0.85338157, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87529278, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3835, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.01146651, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_clip": 1.03142083, + "balance_loss_mlp": 1.04814029, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.799799470431086, + "language_loss": 0.81974924, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84170854, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.984375, + "step": 3836, + "time_per_iteration": 2.464657783508301 + }, + { + "auxiliary_loss_clip": 0.0114557, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.02726793, + "balance_loss_mlp": 1.05202293, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.7793450137018207, + "language_loss": 0.79603267, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81792545, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3837, + "time_per_iteration": 2.4715559482574463 + }, + { + "auxiliary_loss_clip": 0.01055276, + "auxiliary_loss_mlp": 0.01017826, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.02046371, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9409846751082755, + "language_loss": 0.65487945, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67561042, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.34765625, + "step": 3838, + "time_per_iteration": 2.9852375984191895 + }, + { + "auxiliary_loss_clip": 0.01139351, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.03131008, + "balance_loss_mlp": 1.04721856, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.6310373190732648, + "language_loss": 0.7527796, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77465028, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3839, + "time_per_iteration": 2.4290778636932373 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.02486694, + "balance_loss_mlp": 1.0510757, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 4.016837458595543, + "language_loss": 0.68691337, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70878816, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3840, + "time_per_iteration": 2.456422805786133 + }, + { + "auxiliary_loss_clip": 0.01143425, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.04936612, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.1574295618121426, + "language_loss": 0.79412574, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81595719, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9375, + "step": 3841, + "time_per_iteration": 2.4762818813323975 + }, + { + "auxiliary_loss_clip": 0.01141641, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02557695, + "balance_loss_mlp": 1.04872346, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 5.070488540070664, + "language_loss": 0.83171731, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85354722, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3842, + "time_per_iteration": 2.4908032417297363 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.0255568, + "balance_loss_mlp": 1.04567111, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.6842769818445011, + "language_loss": 0.66523731, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68706262, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3843, + "time_per_iteration": 2.6503937244415283 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.02642977, + "balance_loss_mlp": 1.04793119, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.8910129932977493, + "language_loss": 0.77445257, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79627681, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 3844, + "time_per_iteration": 5.4645676612854 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.04744804, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6615026518232119, + "language_loss": 0.77974623, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80158317, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3845, + "time_per_iteration": 2.467289686203003 + }, + { + "auxiliary_loss_clip": 0.01133475, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.01687717, + "balance_loss_mlp": 1.04577661, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.1438137502119425, + "language_loss": 0.76064527, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78230006, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 3846, + "time_per_iteration": 2.4985382556915283 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.02638626, + "balance_loss_mlp": 1.04725194, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.4609763976845556, + "language_loss": 0.69493651, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71677887, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3847, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.01048129, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.00074661, + "balance_loss_mlp": 1.01598144, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7927409416341922, + "language_loss": 0.61051595, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63102281, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3203125, + "step": 3848, + "time_per_iteration": 2.981518030166626 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.04593349, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.568019101440284, + "language_loss": 0.7746805, + "learning_rate": 3.589320871234923e-06, + "loss": 0.79651785, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3849, + "time_per_iteration": 2.450693130493164 + }, + { + "auxiliary_loss_clip": 0.01139635, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04533124, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.9223002445017061, + "language_loss": 0.71673942, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73856068, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3850, + "time_per_iteration": 2.589395761489868 + }, + { + "auxiliary_loss_clip": 0.01137166, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.01870215, + "balance_loss_mlp": 1.04362154, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 3.8422038584857665, + "language_loss": 0.75846308, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78018856, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3851, + "time_per_iteration": 2.495729446411133 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.0234046, + "balance_loss_mlp": 1.04747272, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.914141324585442, + "language_loss": 0.69797802, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71977121, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3852, + "time_per_iteration": 2.478408098220825 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.0206399, + "balance_loss_mlp": 1.04643583, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.1861380100726144, + "language_loss": 0.67030561, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69208378, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94140625, + "step": 3853, + "time_per_iteration": 2.4445838928222656 + }, + { + "auxiliary_loss_clip": 0.01141194, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.01815248, + "balance_loss_mlp": 1.04680121, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.6671703506367506, + "language_loss": 0.79851103, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82027847, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3854, + "time_per_iteration": 2.5455782413482666 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04726899, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 3.8560715318244556, + "language_loss": 0.64987147, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67176461, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 3855, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01140829, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.02753139, + "balance_loss_mlp": 1.04570055, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.1096123404526623, + "language_loss": 0.70711654, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.72896564, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3856, + "time_per_iteration": 2.5024092197418213 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02062488, + "balance_loss_mlp": 1.0464257, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 6.089384897844753, + "language_loss": 0.76997125, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79170084, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.91015625, + "step": 3857, + "time_per_iteration": 2.5962576866149902 + }, + { + "auxiliary_loss_clip": 0.01143962, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.03018308, + "balance_loss_mlp": 1.0477798, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 3.478057752262005, + "language_loss": 0.91006696, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93199098, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.96484375, + "step": 3858, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01136894, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.04679012, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 2.1437168922033747, + "language_loss": 0.75995493, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78175128, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 3859, + "time_per_iteration": 2.485426187515259 + }, + { + "auxiliary_loss_clip": 0.01136619, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.01962614, + "balance_loss_mlp": 1.04423487, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.9055462071213993, + "language_loss": 0.84061682, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86234951, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3860, + "time_per_iteration": 2.4607324600219727 + }, + { + "auxiliary_loss_clip": 0.01137991, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.02317619, + "balance_loss_mlp": 1.04656291, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.1337823805291047, + "language_loss": 0.82972974, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85151279, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3861, + "time_per_iteration": 2.451805591583252 + }, + { + "auxiliary_loss_clip": 0.01142125, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.02702415, + "balance_loss_mlp": 1.04800034, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.8456518711772996, + "language_loss": 0.85918242, + "learning_rate": 3.586242265438576e-06, + "loss": 0.8810426, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3862, + "time_per_iteration": 2.4582395553588867 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.0468179, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.3833481647146872, + "language_loss": 0.7492758, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7710824, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8984375, + "step": 3863, + "time_per_iteration": 2.496985912322998 + }, + { + "auxiliary_loss_clip": 0.01139904, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.02723408, + "balance_loss_mlp": 1.05037498, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 2.003739732436234, + "language_loss": 0.74640852, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76823521, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 3864, + "time_per_iteration": 2.440204381942749 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.01952517, + "balance_loss_mlp": 1.0468204, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 3.940820538439298, + "language_loss": 0.70690906, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72865754, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3865, + "time_per_iteration": 2.598194122314453 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01049623, + "balance_loss_clip": 1.03091884, + "balance_loss_mlp": 1.04987264, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.9658537667403149, + "language_loss": 0.94853866, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97052193, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3866, + "time_per_iteration": 2.496276617050171 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.04817796, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.6667540210019123, + "language_loss": 0.72528732, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.74707949, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 3867, + "time_per_iteration": 2.4933414459228516 + }, + { + "auxiliary_loss_clip": 0.01140693, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.02625418, + "balance_loss_mlp": 1.04734945, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8421111702540602, + "language_loss": 0.82411921, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84596509, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.93359375, + "step": 3868, + "time_per_iteration": 2.4994540214538574 + }, + { + "auxiliary_loss_clip": 0.01136829, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.02433491, + "balance_loss_mlp": 1.04700828, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.815886356300666, + "language_loss": 0.73335075, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75512362, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3869, + "time_per_iteration": 2.4486095905303955 + }, + { + "auxiliary_loss_clip": 0.01139645, + "auxiliary_loss_mlp": 0.01047185, + "balance_loss_clip": 1.03108525, + "balance_loss_mlp": 1.04929376, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.6948965109205438, + "language_loss": 0.79564929, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81751764, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3870, + "time_per_iteration": 2.506114959716797 + }, + { + "auxiliary_loss_clip": 0.01143066, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.02574801, + "balance_loss_mlp": 1.04845953, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 3.2368167151878797, + "language_loss": 0.70599115, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72785389, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3871, + "time_per_iteration": 2.455266237258911 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.01055983, + "balance_loss_clip": 1.03674293, + "balance_loss_mlp": 1.05011845, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.2694181422477313, + "language_loss": 0.69087327, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71289003, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.95703125, + "step": 3872, + "time_per_iteration": 2.482089042663574 + }, + { + "auxiliary_loss_clip": 0.01147162, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.0271188, + "balance_loss_mlp": 1.04984593, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.4965805681858408, + "language_loss": 0.78046703, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80239034, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 3873, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.01009657, + "balance_loss_clip": 1.00777328, + "balance_loss_mlp": 1.02347898, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.841863213022928, + "language_loss": 0.60519493, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6258297, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3046875, + "step": 3874, + "time_per_iteration": 2.955524444580078 + }, + { + "auxiliary_loss_clip": 0.01142096, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.02695727, + "balance_loss_mlp": 1.04998708, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.0817330720741287, + "language_loss": 0.8082279, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83009154, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3875, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01141065, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.04931068, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.6586054731564495, + "language_loss": 0.60997009, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63178027, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.91796875, + "step": 3876, + "time_per_iteration": 2.5234174728393555 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.0104992, + "balance_loss_clip": 1.0319072, + "balance_loss_mlp": 1.05151403, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.9912662806979935, + "language_loss": 0.70357525, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72551912, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3877, + "time_per_iteration": 2.5117876529693604 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_clip": 1.02984059, + "balance_loss_mlp": 1.04846656, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.20617127152986, + "language_loss": 0.81169856, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83360565, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3878, + "time_per_iteration": 2.418745517730713 + }, + { + "auxiliary_loss_clip": 0.01145943, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.04905999, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 2.449565501872003, + "language_loss": 0.74765849, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76959032, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3879, + "time_per_iteration": 2.627453088760376 + }, + { + "auxiliary_loss_clip": 0.0113984, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02597582, + "balance_loss_mlp": 1.04611635, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.3281305870509685, + "language_loss": 0.89896512, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92079782, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3880, + "time_per_iteration": 2.529181957244873 + }, + { + "auxiliary_loss_clip": 0.01144521, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05019975, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.7300006336865508, + "language_loss": 0.72026277, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74217331, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3881, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.01138796, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02798867, + "balance_loss_mlp": 1.04610527, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5765664683306326, + "language_loss": 0.67988127, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70171714, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3882, + "time_per_iteration": 2.5134541988372803 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04020119, + "balance_loss_mlp": 1.0481658, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 3.2831975264627116, + "language_loss": 0.76596051, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78797704, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3883, + "time_per_iteration": 2.5556836128234863 + }, + { + "auxiliary_loss_clip": 0.01046918, + "auxiliary_loss_mlp": 0.01002528, + "balance_loss_clip": 1.00059688, + "balance_loss_mlp": 1.01619315, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7953130928556094, + "language_loss": 0.59102494, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6115194, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3884, + "time_per_iteration": 3.210090398788452 + }, + { + "auxiliary_loss_clip": 0.01139917, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.0196687, + "balance_loss_mlp": 1.04723644, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 3.4795297654408617, + "language_loss": 0.80128157, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82303953, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3885, + "time_per_iteration": 4.129857301712036 + }, + { + "auxiliary_loss_clip": 0.01139579, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04763317, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.392049069504846, + "language_loss": 0.88482237, + "learning_rate": 3.580531993380261e-06, + "loss": 0.9066177, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3886, + "time_per_iteration": 4.002579689025879 + }, + { + "auxiliary_loss_clip": 0.01143892, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02452922, + "balance_loss_mlp": 1.04953825, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 2.2740188667520815, + "language_loss": 0.73199034, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75384426, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3887, + "time_per_iteration": 2.5730721950531006 + }, + { + "auxiliary_loss_clip": 0.0114256, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02709508, + "balance_loss_mlp": 1.04827881, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8689872769958875, + "language_loss": 0.84098816, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86285174, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.94140625, + "step": 3888, + "time_per_iteration": 2.526090145111084 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01051088, + "balance_loss_clip": 1.03400528, + "balance_loss_mlp": 1.04775357, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 5.34722340994348, + "language_loss": 0.87174153, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89365447, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3889, + "time_per_iteration": 2.465535879135132 + }, + { + "auxiliary_loss_clip": 0.01143335, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.02659607, + "balance_loss_mlp": 1.04914057, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 4.26980733686294, + "language_loss": 0.7660414, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78790414, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.94140625, + "step": 3890, + "time_per_iteration": 2.4164645671844482 + }, + { + "auxiliary_loss_clip": 0.01144006, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.02940536, + "balance_loss_mlp": 1.05018783, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 3.12388753004446, + "language_loss": 0.73396742, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75587177, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3891, + "time_per_iteration": 2.692251443862915 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04672241, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.6638493558493535, + "language_loss": 0.82791233, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84968084, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8984375, + "step": 3892, + "time_per_iteration": 2.4657654762268066 + }, + { + "auxiliary_loss_clip": 0.01143467, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_clip": 1.02550626, + "balance_loss_mlp": 1.04892194, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 2.124834647136637, + "language_loss": 0.64928782, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67114866, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3893, + "time_per_iteration": 2.6640076637268066 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02056575, + "balance_loss_mlp": 1.04930127, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.3013698222001753, + "language_loss": 0.79011095, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81188488, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 3894, + "time_per_iteration": 2.4596238136291504 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02772284, + "balance_loss_mlp": 1.0473485, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4729608662155413, + "language_loss": 0.81608742, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83793032, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3895, + "time_per_iteration": 2.5229499340057373 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.03210139, + "balance_loss_mlp": 1.04895353, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.370345363223057, + "language_loss": 0.79861861, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82052004, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3896, + "time_per_iteration": 2.4219553470611572 + }, + { + "auxiliary_loss_clip": 0.01142956, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02507555, + "balance_loss_mlp": 1.04863656, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6083647422684384, + "language_loss": 0.83279634, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85465348, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3897, + "time_per_iteration": 2.497347593307495 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.03093636, + "balance_loss_mlp": 1.04880857, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 2.0551194275294784, + "language_loss": 0.79281437, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8147409, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3898, + "time_per_iteration": 2.4275295734405518 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.0316844, + "balance_loss_mlp": 1.05034626, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 3.329769754331659, + "language_loss": 0.73955798, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76142585, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 3899, + "time_per_iteration": 2.5017077922821045 + }, + { + "auxiliary_loss_clip": 0.01141437, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.03412604, + "balance_loss_mlp": 1.04896975, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.8374782290855665, + "language_loss": 0.75695914, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77888358, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3900, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.03224266, + "balance_loss_mlp": 1.04685295, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.9641187800197561, + "language_loss": 0.66949147, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69135845, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3901, + "time_per_iteration": 2.5052907466888428 + }, + { + "auxiliary_loss_clip": 0.01050259, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01646185, + "balance_loss_mlp": 1.01950026, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7670843237762338, + "language_loss": 0.58209252, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6027782, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.30859375, + "step": 3902, + "time_per_iteration": 3.0522701740264893 + }, + { + "auxiliary_loss_clip": 0.01140756, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.02684176, + "balance_loss_mlp": 1.04932666, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.9913375770157136, + "language_loss": 0.80411339, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82596278, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 3903, + "time_per_iteration": 2.515796184539795 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.02274299, + "balance_loss_mlp": 1.04670942, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 3.712536549247666, + "language_loss": 0.82183945, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84362817, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3904, + "time_per_iteration": 2.48119854927063 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.0104346, + "balance_loss_clip": 1.02642536, + "balance_loss_mlp": 1.05013537, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9990680719867946, + "language_loss": 0.7137326, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7355758, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3905, + "time_per_iteration": 2.494558811187744 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.04811251, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.865256832649412, + "language_loss": 0.70834756, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73007655, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3906, + "time_per_iteration": 2.5057764053344727 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.04728532, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.129912307166789, + "language_loss": 0.73542202, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75724012, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3907, + "time_per_iteration": 2.5734074115753174 + }, + { + "auxiliary_loss_clip": 0.01141507, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.03004074, + "balance_loss_mlp": 1.04927719, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.7646530569469054, + "language_loss": 0.72807813, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74996883, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3908, + "time_per_iteration": 2.438422441482544 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01002133, + "balance_loss_clip": 1.00030959, + "balance_loss_mlp": 1.01835775, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0194055540826834, + "language_loss": 0.73271406, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75322783, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.30859375, + "step": 3909, + "time_per_iteration": 2.8451788425445557 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02607155, + "balance_loss_mlp": 1.04842734, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5487453833335116, + "language_loss": 0.87906706, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9008913, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3910, + "time_per_iteration": 2.4648385047912598 + }, + { + "auxiliary_loss_clip": 0.01141916, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02370465, + "balance_loss_mlp": 1.04950166, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.1910966534760297, + "language_loss": 0.75809109, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.7799111, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3911, + "time_per_iteration": 2.4715898036956787 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02729297, + "balance_loss_mlp": 1.04807627, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.9083148186883727, + "language_loss": 0.81775904, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83955097, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 3912, + "time_per_iteration": 2.4627628326416016 + }, + { + "auxiliary_loss_clip": 0.01139949, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04939759, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.7554989092460516, + "language_loss": 0.71664345, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73854995, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 3913, + "time_per_iteration": 2.5080020427703857 + }, + { + "auxiliary_loss_clip": 0.01147528, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.02691996, + "balance_loss_mlp": 1.05220175, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7040704955860875, + "language_loss": 0.75903499, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78096056, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3914, + "time_per_iteration": 2.487429618835449 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02307451, + "balance_loss_mlp": 1.05093837, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.554647654086476, + "language_loss": 0.89353001, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9153496, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 3915, + "time_per_iteration": 2.500753402709961 + }, + { + "auxiliary_loss_clip": 0.01044736, + "auxiliary_loss_mlp": 0.01003661, + "balance_loss_clip": 1.00158656, + "balance_loss_mlp": 1.0141747, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8049654288159457, + "language_loss": 0.5935356, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61401957, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.3046875, + "step": 3916, + "time_per_iteration": 2.9926259517669678 + }, + { + "auxiliary_loss_clip": 0.01042644, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00075865, + "balance_loss_mlp": 1.01226258, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7742950949727582, + "language_loss": 0.49486533, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51532036, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.3046875, + "step": 3917, + "time_per_iteration": 3.085294723510742 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.03615093, + "balance_loss_mlp": 1.04923129, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.000752484300541, + "language_loss": 0.76012552, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78207517, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 3918, + "time_per_iteration": 2.4883201122283936 + }, + { + "auxiliary_loss_clip": 0.01145359, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.02805305, + "balance_loss_mlp": 1.04997587, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.209135495431813, + "language_loss": 0.68728662, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.709185, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 3919, + "time_per_iteration": 2.4489476680755615 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.02216101, + "balance_loss_mlp": 1.04864836, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.8210843900818243, + "language_loss": 0.70324695, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72501087, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 3920, + "time_per_iteration": 2.6011908054351807 + }, + { + "auxiliary_loss_clip": 0.01141332, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.03128195, + "balance_loss_mlp": 1.05122209, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6333300745229378, + "language_loss": 0.77596343, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79784632, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3921, + "time_per_iteration": 2.498924732208252 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.02615058, + "balance_loss_mlp": 1.05108023, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.5438781918161375, + "language_loss": 0.7561245, + "learning_rate": 3.571901895946612e-06, + "loss": 0.7779727, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3922, + "time_per_iteration": 2.467103958129883 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02583599, + "balance_loss_mlp": 1.0489881, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.3317912313524625, + "language_loss": 0.80016744, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82196772, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3923, + "time_per_iteration": 2.5075273513793945 + }, + { + "auxiliary_loss_clip": 0.01138213, + "auxiliary_loss_mlp": 0.01046068, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 1.04845715, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.9692150152538963, + "language_loss": 0.74753797, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76938081, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3924, + "time_per_iteration": 2.442448377609253 + }, + { + "auxiliary_loss_clip": 0.01141205, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.03013766, + "balance_loss_mlp": 1.04995513, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 2.1681544357284093, + "language_loss": 0.82770467, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84957814, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3925, + "time_per_iteration": 2.44718337059021 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01047882, + "balance_loss_clip": 1.03100252, + "balance_loss_mlp": 1.04645014, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.8844556004317345, + "language_loss": 0.59408414, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61594486, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91796875, + "step": 3926, + "time_per_iteration": 2.4840757846832275 + }, + { + "auxiliary_loss_clip": 0.01135063, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.02977526, + "balance_loss_mlp": 1.04721665, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.967091588265342, + "language_loss": 0.71317631, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73498082, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 3927, + "time_per_iteration": 4.117234945297241 + }, + { + "auxiliary_loss_clip": 0.01137568, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_clip": 1.0295651, + "balance_loss_mlp": 1.04787612, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8263460078369782, + "language_loss": 0.75102496, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77284467, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8984375, + "step": 3928, + "time_per_iteration": 3.9637200832366943 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01048389, + "balance_loss_clip": 1.03086567, + "balance_loss_mlp": 1.04693556, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.885999758146942, + "language_loss": 0.81520462, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83706343, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3929, + "time_per_iteration": 2.499310255050659 + }, + { + "auxiliary_loss_clip": 0.01146116, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.02998328, + "balance_loss_mlp": 1.04974854, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 4.669381706210694, + "language_loss": 0.7194528, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74139249, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3930, + "time_per_iteration": 2.4964945316314697 + }, + { + "auxiliary_loss_clip": 0.01137432, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.02245224, + "balance_loss_mlp": 1.046561, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.489267518834959, + "language_loss": 0.73764896, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7594136, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3931, + "time_per_iteration": 2.6283528804779053 + }, + { + "auxiliary_loss_clip": 0.01140852, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.02245522, + "balance_loss_mlp": 1.04971111, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.06419219579993, + "language_loss": 0.8026945, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82450092, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3932, + "time_per_iteration": 2.4901018142700195 + }, + { + "auxiliary_loss_clip": 0.01138855, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02483916, + "balance_loss_mlp": 1.05032694, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5491195596348342, + "language_loss": 0.85760093, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87938541, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8828125, + "step": 3933, + "time_per_iteration": 2.5625483989715576 + }, + { + "auxiliary_loss_clip": 0.01146232, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.02008784, + "balance_loss_mlp": 1.0532943, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0322099534023685, + "language_loss": 0.8277775, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84961879, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3934, + "time_per_iteration": 2.512068748474121 + }, + { + "auxiliary_loss_clip": 0.01141394, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.04977798, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.774494675769988, + "language_loss": 0.7864846, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80827636, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 3935, + "time_per_iteration": 2.4996352195739746 + }, + { + "auxiliary_loss_clip": 0.01138141, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.01913905, + "balance_loss_mlp": 1.04973102, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7164724890649055, + "language_loss": 0.79656923, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81830108, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3936, + "time_per_iteration": 2.4868710041046143 + }, + { + "auxiliary_loss_clip": 0.01138439, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.0192436, + "balance_loss_mlp": 1.04798818, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4334555797897097, + "language_loss": 0.78783411, + "learning_rate": 3.568283198083826e-06, + "loss": 0.80958092, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3937, + "time_per_iteration": 2.499565362930298 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02244997, + "balance_loss_mlp": 1.04970455, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.078138882715826, + "language_loss": 0.85105085, + "learning_rate": 3.568041475462147e-06, + "loss": 0.8727901, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 3938, + "time_per_iteration": 2.449214220046997 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.0285933, + "balance_loss_mlp": 1.04824734, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.4851234695326423, + "language_loss": 0.93872499, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96052349, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3939, + "time_per_iteration": 2.415891647338867 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02598429, + "balance_loss_mlp": 1.04769599, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6764835140151866, + "language_loss": 0.8238095, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84565216, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3940, + "time_per_iteration": 2.47468900680542 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01042824, + "balance_loss_clip": 1.02592003, + "balance_loss_mlp": 1.04990602, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2107440191497054, + "language_loss": 0.88986713, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91174555, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3941, + "time_per_iteration": 2.455631971359253 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.02580976, + "balance_loss_mlp": 1.04538155, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.1526885300024072, + "language_loss": 0.84676927, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86856836, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3942, + "time_per_iteration": 2.43743634223938 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.04840159, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8547641010298248, + "language_loss": 0.80905575, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83091086, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.921875, + "step": 3943, + "time_per_iteration": 2.5058658123016357 + }, + { + "auxiliary_loss_clip": 0.01143585, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.02084267, + "balance_loss_mlp": 1.04731488, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.308079684052438, + "language_loss": 0.67493033, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69675827, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3944, + "time_per_iteration": 2.4276273250579834 + }, + { + "auxiliary_loss_clip": 0.01144217, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.02116549, + "balance_loss_mlp": 1.05084419, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 2.061169456768298, + "language_loss": 0.75421506, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77604151, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3945, + "time_per_iteration": 2.474323272705078 + }, + { + "auxiliary_loss_clip": 0.01137318, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02304149, + "balance_loss_mlp": 1.0469377, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.6081639136691026, + "language_loss": 0.63469779, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65646303, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3946, + "time_per_iteration": 2.5087931156158447 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.02186346, + "balance_loss_mlp": 1.04692435, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.27613511663784, + "language_loss": 0.77508283, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79684764, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 3947, + "time_per_iteration": 2.4716949462890625 + }, + { + "auxiliary_loss_clip": 0.01141281, + "auxiliary_loss_mlp": 0.0104192, + "balance_loss_clip": 1.02496827, + "balance_loss_mlp": 1.05008841, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.6255497375782806, + "language_loss": 0.80575311, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8275851, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3948, + "time_per_iteration": 2.5750784873962402 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.02715611, + "balance_loss_mlp": 1.04736018, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.0638215262656696, + "language_loss": 0.80578661, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82761467, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3949, + "time_per_iteration": 2.512665271759033 + }, + { + "auxiliary_loss_clip": 0.01138607, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.02382135, + "balance_loss_mlp": 1.04584646, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.8976071400358168, + "language_loss": 0.73124689, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75303924, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3950, + "time_per_iteration": 2.4842302799224854 + }, + { + "auxiliary_loss_clip": 0.01135058, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.01944709, + "balance_loss_mlp": 1.04712903, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.0688047231241247, + "language_loss": 0.73064256, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75233537, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3951, + "time_per_iteration": 2.5215439796447754 + }, + { + "auxiliary_loss_clip": 0.01144126, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.0507673, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7591828710207016, + "language_loss": 0.73658371, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75842535, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3952, + "time_per_iteration": 2.550182342529297 + }, + { + "auxiliary_loss_clip": 0.0113686, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.02213275, + "balance_loss_mlp": 1.04537988, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.6791264380286672, + "language_loss": 0.71064484, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73239112, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9140625, + "step": 3953, + "time_per_iteration": 2.530381202697754 + }, + { + "auxiliary_loss_clip": 0.01140701, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.02522552, + "balance_loss_mlp": 1.04806364, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9696108021357461, + "language_loss": 0.81686246, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83869451, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3954, + "time_per_iteration": 2.491629123687744 + }, + { + "auxiliary_loss_clip": 0.01141999, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_clip": 1.02915251, + "balance_loss_mlp": 1.04870319, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.155968963382196, + "language_loss": 0.65756261, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.67945445, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3955, + "time_per_iteration": 2.4659719467163086 + }, + { + "auxiliary_loss_clip": 0.01138242, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.04739583, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.3846492045019327, + "language_loss": 0.83788121, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85979581, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3956, + "time_per_iteration": 2.48734712600708 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.02471578, + "balance_loss_mlp": 1.04647636, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.1805686912335656, + "language_loss": 0.85228634, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.8740322, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3957, + "time_per_iteration": 2.50199294090271 + }, + { + "auxiliary_loss_clip": 0.01139099, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.02686596, + "balance_loss_mlp": 1.04807806, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0218180107915757, + "language_loss": 0.70133704, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72314632, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.91015625, + "step": 3958, + "time_per_iteration": 2.4798173904418945 + }, + { + "auxiliary_loss_clip": 0.01136893, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.02530742, + "balance_loss_mlp": 1.04581285, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 3.373562251556634, + "language_loss": 0.65834582, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68014508, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 3959, + "time_per_iteration": 2.4558637142181396 + }, + { + "auxiliary_loss_clip": 0.01138452, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.02565885, + "balance_loss_mlp": 1.04832602, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7230243338870097, + "language_loss": 0.72128749, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74308968, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3960, + "time_per_iteration": 2.4831748008728027 + }, + { + "auxiliary_loss_clip": 0.01139565, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.0249052, + "balance_loss_mlp": 1.04867244, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.8711627571775973, + "language_loss": 0.74181205, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7636202, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.91015625, + "step": 3961, + "time_per_iteration": 2.5167927742004395 + }, + { + "auxiliary_loss_clip": 0.01138898, + "auxiliary_loss_mlp": 0.0105126, + "balance_loss_clip": 1.03318763, + "balance_loss_mlp": 1.04605162, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.643011810367893, + "language_loss": 0.66067994, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68258154, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 3962, + "time_per_iteration": 2.4900338649749756 + }, + { + "auxiliary_loss_clip": 0.01138484, + "auxiliary_loss_mlp": 0.01050468, + "balance_loss_clip": 1.03387976, + "balance_loss_mlp": 1.04738379, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.7740384877146562, + "language_loss": 0.74581182, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76770139, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3963, + "time_per_iteration": 2.5409018993377686 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.02899039, + "balance_loss_mlp": 1.0498383, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.0190521185084753, + "language_loss": 0.76898873, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79087293, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3964, + "time_per_iteration": 2.492861270904541 + }, + { + "auxiliary_loss_clip": 0.01137102, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.02911341, + "balance_loss_mlp": 1.04792333, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.0459212281672956, + "language_loss": 0.71593058, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73775077, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 3965, + "time_per_iteration": 2.5120911598205566 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.03158259, + "balance_loss_mlp": 1.04674065, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.8902557347099018, + "language_loss": 0.78008091, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80190015, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3966, + "time_per_iteration": 2.4576594829559326 + }, + { + "auxiliary_loss_clip": 0.01135801, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.02656794, + "balance_loss_mlp": 1.04652119, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.8460709531404, + "language_loss": 0.68860286, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71038377, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.89453125, + "step": 3967, + "time_per_iteration": 2.484840154647827 + }, + { + "auxiliary_loss_clip": 0.01137019, + "auxiliary_loss_mlp": 0.01053581, + "balance_loss_clip": 1.03739274, + "balance_loss_mlp": 1.04645443, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 2.11266161128335, + "language_loss": 0.67849773, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70040375, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3968, + "time_per_iteration": 2.441445827484131 + }, + { + "auxiliary_loss_clip": 0.01134651, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.02318573, + "balance_loss_mlp": 1.0451827, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.8948052650888014, + "language_loss": 0.76742399, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78916001, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.89453125, + "step": 3969, + "time_per_iteration": 5.413191318511963 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.04734492, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.7243772241637263, + "language_loss": 0.76300085, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78475308, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3970, + "time_per_iteration": 2.4792258739471436 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02545094, + "balance_loss_mlp": 1.04645324, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 3.3207921386663584, + "language_loss": 0.85399735, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87580258, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3971, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01017218, + "balance_loss_clip": 1.01547742, + "balance_loss_mlp": 1.02590835, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7461637295582213, + "language_loss": 0.62814003, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64887029, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.29882812, + "step": 3972, + "time_per_iteration": 3.173640012741089 + }, + { + "auxiliary_loss_clip": 0.0113938, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.02119696, + "balance_loss_mlp": 1.04922092, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.9456864585596687, + "language_loss": 0.8170895, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.8388539, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90234375, + "step": 3973, + "time_per_iteration": 2.4529452323913574 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.02849591, + "balance_loss_mlp": 1.04869485, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.6994626560625323, + "language_loss": 0.79299271, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81481481, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 3974, + "time_per_iteration": 2.5395772457122803 + }, + { + "auxiliary_loss_clip": 0.01139215, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.04858148, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.8925619228877844, + "language_loss": 0.84428573, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86606121, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 3975, + "time_per_iteration": 2.430361032485962 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.02215612, + "balance_loss_mlp": 1.0471369, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.06825719132721, + "language_loss": 0.8375293, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85925817, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87890625, + "step": 3976, + "time_per_iteration": 2.480534791946411 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01798213, + "balance_loss_mlp": 1.04606938, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.747752931490835, + "language_loss": 0.74532628, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76697731, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8671875, + "step": 3977, + "time_per_iteration": 2.4641239643096924 + }, + { + "auxiliary_loss_clip": 0.01138905, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.02818775, + "balance_loss_mlp": 1.04930067, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6638092474338306, + "language_loss": 0.72395146, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74579227, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 3978, + "time_per_iteration": 2.5007903575897217 + }, + { + "auxiliary_loss_clip": 0.01143288, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.05204654, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.0169903221822683, + "language_loss": 0.78654587, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80840027, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3979, + "time_per_iteration": 2.5006349086761475 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.02769148, + "balance_loss_mlp": 1.04762173, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6987462202935262, + "language_loss": 0.81945407, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84125668, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 3980, + "time_per_iteration": 2.5287020206451416 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.01974702, + "balance_loss_mlp": 1.04967999, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.749461413213386, + "language_loss": 0.8401112, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86183953, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 3981, + "time_per_iteration": 2.466660261154175 + }, + { + "auxiliary_loss_clip": 0.01141626, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04951072, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 3.6241006318049864, + "language_loss": 0.76872683, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79059052, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 3982, + "time_per_iteration": 2.558145046234131 + }, + { + "auxiliary_loss_clip": 0.01135351, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.02811027, + "balance_loss_mlp": 1.04844236, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 6.059829142106342, + "language_loss": 0.77878481, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80057788, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 3983, + "time_per_iteration": 2.4443132877349854 + }, + { + "auxiliary_loss_clip": 0.01136897, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.02446592, + "balance_loss_mlp": 1.04759789, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9981470653963032, + "language_loss": 0.73163629, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75341582, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3984, + "time_per_iteration": 2.491344690322876 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.02860713, + "balance_loss_mlp": 1.04674625, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.064924146489818, + "language_loss": 0.79049474, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81232572, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3985, + "time_per_iteration": 2.4587738513946533 + }, + { + "auxiliary_loss_clip": 0.01139616, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04980683, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.0182764415160563, + "language_loss": 0.73312742, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7549386, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 3986, + "time_per_iteration": 2.5608811378479004 + }, + { + "auxiliary_loss_clip": 0.0114189, + "auxiliary_loss_mlp": 0.01051013, + "balance_loss_clip": 1.03495562, + "balance_loss_mlp": 1.04923773, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.2624046500679333, + "language_loss": 0.87836051, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90028954, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.92578125, + "step": 3987, + "time_per_iteration": 2.461778402328491 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.02161288, + "balance_loss_mlp": 1.04831004, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.3750633167266306, + "language_loss": 0.8308624, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85254467, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 3988, + "time_per_iteration": 2.4527788162231445 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.04686844, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.649806875732991, + "language_loss": 0.85145879, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87320346, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 3989, + "time_per_iteration": 2.43949031829834 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02249837, + "balance_loss_mlp": 1.04763699, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.0784071273800944, + "language_loss": 0.84493041, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86665809, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 3990, + "time_per_iteration": 2.4476051330566406 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.02507186, + "balance_loss_mlp": 1.0463922, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 3.585202907729512, + "language_loss": 0.75312221, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77485824, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 3991, + "time_per_iteration": 2.502324104309082 + }, + { + "auxiliary_loss_clip": 0.01050073, + "auxiliary_loss_mlp": 0.01009423, + "balance_loss_clip": 1.00774217, + "balance_loss_mlp": 1.02049088, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8894590829003932, + "language_loss": 0.63734841, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65794337, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.296875, + "step": 3992, + "time_per_iteration": 3.0017786026000977 + }, + { + "auxiliary_loss_clip": 0.01050397, + "auxiliary_loss_mlp": 0.01010168, + "balance_loss_clip": 1.00857067, + "balance_loss_mlp": 1.02071452, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7530514643625366, + "language_loss": 0.62963343, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65023899, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.296875, + "step": 3993, + "time_per_iteration": 3.176184892654419 + }, + { + "auxiliary_loss_clip": 0.01140668, + "auxiliary_loss_mlp": 0.01047015, + "balance_loss_clip": 1.03085065, + "balance_loss_mlp": 1.05099177, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.6383486345725178, + "language_loss": 0.76938868, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79126549, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3994, + "time_per_iteration": 2.4940826892852783 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.0305258, + "balance_loss_mlp": 1.04680216, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.7751147523393542, + "language_loss": 0.78457522, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80641341, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.890625, + "step": 3995, + "time_per_iteration": 2.5075032711029053 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01003894, + "balance_loss_clip": 1.00232053, + "balance_loss_mlp": 1.01837659, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8913570860108078, + "language_loss": 0.63479292, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65530908, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.29296875, + "step": 3996, + "time_per_iteration": 3.1365764141082764 + }, + { + "auxiliary_loss_clip": 0.01137569, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.02916384, + "balance_loss_mlp": 1.04678392, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.906997418482602, + "language_loss": 0.7009505, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72278345, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3997, + "time_per_iteration": 2.464714765548706 + }, + { + "auxiliary_loss_clip": 0.01134848, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.04642928, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.9831176119326495, + "language_loss": 0.87292743, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89470112, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3998, + "time_per_iteration": 2.4639480113983154 + }, + { + "auxiliary_loss_clip": 0.01134933, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02306032, + "balance_loss_mlp": 1.04208946, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.9745565965944727, + "language_loss": 0.75798607, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77972972, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3999, + "time_per_iteration": 2.4753127098083496 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02317488, + "balance_loss_mlp": 1.04545271, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.9306579449884984, + "language_loss": 0.72642016, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74812865, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.875, + "step": 4000, + "time_per_iteration": 2.5172412395477295 + }, + { + "auxiliary_loss_clip": 0.01140243, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02025795, + "balance_loss_mlp": 1.04728866, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.4587541869300824, + "language_loss": 0.65991902, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68169051, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4001, + "time_per_iteration": 2.511198043823242 + }, + { + "auxiliary_loss_clip": 0.01131233, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.02330589, + "balance_loss_mlp": 1.0427444, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.6796652593661903, + "language_loss": 0.82567388, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84739041, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4002, + "time_per_iteration": 2.5147531032562256 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.02382851, + "balance_loss_mlp": 1.04682446, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.8902513751119636, + "language_loss": 0.82875729, + "learning_rate": 3.552202383898897e-06, + "loss": 0.8505069, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4003, + "time_per_iteration": 2.508004665374756 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0214386, + "balance_loss_mlp": 1.04608846, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0497424292602835, + "language_loss": 0.87504768, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89677334, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4004, + "time_per_iteration": 2.4581985473632812 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.04228568, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9722136456468877, + "language_loss": 0.77630293, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79812533, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4005, + "time_per_iteration": 2.556365728378296 + }, + { + "auxiliary_loss_clip": 0.01130485, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02141535, + "balance_loss_mlp": 1.04398429, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.7295620858093623, + "language_loss": 0.78973985, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81141412, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4006, + "time_per_iteration": 2.460961103439331 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.02072108, + "balance_loss_mlp": 1.04375279, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 2.2017624810959346, + "language_loss": 0.71201313, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73377299, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 4007, + "time_per_iteration": 2.5169517993927 + }, + { + "auxiliary_loss_clip": 0.01131131, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.04453456, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.805972702734942, + "language_loss": 0.75857127, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7802788, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 4008, + "time_per_iteration": 2.4489922523498535 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01940203, + "balance_loss_mlp": 1.04296207, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.356516377050019, + "language_loss": 0.73922294, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76088601, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4009, + "time_per_iteration": 2.4701087474823 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.02664948, + "balance_loss_mlp": 1.04632092, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.636895821506206, + "language_loss": 0.79938453, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82113993, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4010, + "time_per_iteration": 3.9670608043670654 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01047202, + "balance_loss_clip": 1.02923679, + "balance_loss_mlp": 1.04108143, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.295886994366384, + "language_loss": 0.70799017, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72977829, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4011, + "time_per_iteration": 3.9544472694396973 + }, + { + "auxiliary_loss_clip": 0.01131445, + "auxiliary_loss_mlp": 0.01039733, + "balance_loss_clip": 1.02429593, + "balance_loss_mlp": 1.04258561, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6166610897431488, + "language_loss": 0.69062299, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71233475, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4012, + "time_per_iteration": 2.501347303390503 + }, + { + "auxiliary_loss_clip": 0.01133874, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02145839, + "balance_loss_mlp": 1.04454589, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.0861437601678303, + "language_loss": 0.73424822, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75598073, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4013, + "time_per_iteration": 2.6360883712768555 + }, + { + "auxiliary_loss_clip": 0.01133872, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.02110672, + "balance_loss_mlp": 1.04450822, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8416541794010313, + "language_loss": 0.88554955, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9072544, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4014, + "time_per_iteration": 2.4663264751434326 + }, + { + "auxiliary_loss_clip": 0.01137985, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.02643979, + "balance_loss_mlp": 1.04453659, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.755357499792604, + "language_loss": 0.94270647, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96452308, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 4015, + "time_per_iteration": 2.470952033996582 + }, + { + "auxiliary_loss_clip": 0.01133849, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.02174377, + "balance_loss_mlp": 1.04334664, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.8402084517778015, + "language_loss": 0.82513833, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84685838, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4016, + "time_per_iteration": 2.4922966957092285 + }, + { + "auxiliary_loss_clip": 0.01127395, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02423143, + "balance_loss_mlp": 1.04197156, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 3.4212830828584386, + "language_loss": 0.69553781, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71721268, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4017, + "time_per_iteration": 2.596977710723877 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02959788, + "balance_loss_mlp": 1.04421043, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 2.0038503347112084, + "language_loss": 0.85114455, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87296432, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 4018, + "time_per_iteration": 2.440749406814575 + }, + { + "auxiliary_loss_clip": 0.01046553, + "auxiliary_loss_mlp": 0.01012788, + "balance_loss_clip": 1.0109762, + "balance_loss_mlp": 1.01676679, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8182663934779763, + "language_loss": 0.60620981, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62680322, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.296875, + "step": 4019, + "time_per_iteration": 3.112665891647339 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.04433608, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.880182475838635, + "language_loss": 0.73690915, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75863391, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4020, + "time_per_iteration": 2.5049281120300293 + }, + { + "auxiliary_loss_clip": 0.01134711, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.02199471, + "balance_loss_mlp": 1.04660118, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.9671591580269927, + "language_loss": 0.82012737, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84185052, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4021, + "time_per_iteration": 2.464092493057251 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.03036344, + "balance_loss_mlp": 1.04551053, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.9434993168468309, + "language_loss": 0.76464498, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78650689, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.921875, + "step": 4022, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01140564, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.04610109, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.893594506248005, + "language_loss": 0.75172901, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77355558, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 4023, + "time_per_iteration": 2.442469358444214 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.0288136, + "balance_loss_mlp": 1.04636168, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.7406117596406352, + "language_loss": 0.81464303, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.83643848, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4024, + "time_per_iteration": 2.45035719871521 + }, + { + "auxiliary_loss_clip": 0.01134068, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.02951503, + "balance_loss_mlp": 1.0462923, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.8550338864746303, + "language_loss": 0.85851878, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88031757, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4025, + "time_per_iteration": 2.4191699028015137 + }, + { + "auxiliary_loss_clip": 0.01136643, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.03154194, + "balance_loss_mlp": 1.04397535, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.9498897834730646, + "language_loss": 0.71243072, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73428357, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 4026, + "time_per_iteration": 2.476792812347412 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.0259757, + "balance_loss_mlp": 1.04589748, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.8853181761927913, + "language_loss": 0.64215046, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66389644, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4027, + "time_per_iteration": 2.443652868270874 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.04601741, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.5479611354975007, + "language_loss": 0.70294374, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72468793, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.91015625, + "step": 4028, + "time_per_iteration": 2.48252534866333 + }, + { + "auxiliary_loss_clip": 0.01044866, + "auxiliary_loss_mlp": 0.01007457, + "balance_loss_clip": 1.00585961, + "balance_loss_mlp": 1.01464319, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8570499142131055, + "language_loss": 0.55407649, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57459968, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.30078125, + "step": 4029, + "time_per_iteration": 3.094402551651001 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.02567101, + "balance_loss_mlp": 1.04526591, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.019101437715354, + "language_loss": 0.73829788, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76008832, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90234375, + "step": 4030, + "time_per_iteration": 2.5176522731781006 + }, + { + "auxiliary_loss_clip": 0.01135714, + "auxiliary_loss_mlp": 0.01053146, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.04541922, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.062987020241499, + "language_loss": 0.76440287, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 4031, + "time_per_iteration": 2.4774179458618164 + }, + { + "auxiliary_loss_clip": 0.01140068, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_clip": 1.02974856, + "balance_loss_mlp": 1.0464952, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 7.078640241023749, + "language_loss": 0.65947008, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68133402, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 4032, + "time_per_iteration": 2.500488519668579 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.04175007, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 3.1167913511387995, + "language_loss": 0.81353086, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83530146, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4033, + "time_per_iteration": 2.434652805328369 + }, + { + "auxiliary_loss_clip": 0.0113019, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.02310205, + "balance_loss_mlp": 1.04302979, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.0372289343003023, + "language_loss": 0.69200158, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71369547, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4034, + "time_per_iteration": 2.583693027496338 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.01868999, + "balance_loss_mlp": 1.04278564, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.4913709616978554, + "language_loss": 0.95772272, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.97941571, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4035, + "time_per_iteration": 2.4757437705993652 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01047233, + "balance_loss_clip": 1.03220701, + "balance_loss_mlp": 1.04172754, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.0212510419571794, + "language_loss": 0.77875686, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80049908, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4036, + "time_per_iteration": 2.5642547607421875 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.01043471, + "balance_loss_clip": 1.02642441, + "balance_loss_mlp": 1.04447269, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.648393445666421, + "language_loss": 0.74427915, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76606166, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4037, + "time_per_iteration": 2.4529507160186768 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.02497733, + "balance_loss_mlp": 1.04398596, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.7681997598872656, + "language_loss": 0.76223898, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78399336, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4038, + "time_per_iteration": 2.4618003368377686 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_clip": 1.02742147, + "balance_loss_mlp": 1.04415751, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.0050890767905645, + "language_loss": 0.72632921, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74812889, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 4039, + "time_per_iteration": 2.4261560440063477 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.02631593, + "balance_loss_mlp": 1.04608393, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6810247735848671, + "language_loss": 0.78330719, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80509198, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4040, + "time_per_iteration": 2.4808037281036377 + }, + { + "auxiliary_loss_clip": 0.01128006, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.01912999, + "balance_loss_mlp": 1.04237986, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8145876332629047, + "language_loss": 0.80390251, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82552278, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4041, + "time_per_iteration": 2.482576847076416 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.02769041, + "balance_loss_mlp": 1.04653025, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 4.455498217071982, + "language_loss": 0.76670969, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78848314, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4042, + "time_per_iteration": 2.4944398403167725 + }, + { + "auxiliary_loss_clip": 0.01130826, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.01815128, + "balance_loss_mlp": 1.04393744, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7591863299055037, + "language_loss": 0.8139993, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83563864, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 4043, + "time_per_iteration": 2.4965035915374756 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.02053475, + "balance_loss_mlp": 1.04298007, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.448799092011911, + "language_loss": 0.73345625, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75519013, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 4044, + "time_per_iteration": 2.42809796333313 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.0252496, + "balance_loss_mlp": 1.04730773, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.780616714891853, + "language_loss": 0.83562207, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85740674, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4045, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.02944136, + "balance_loss_mlp": 1.04542089, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 2.1598753545738663, + "language_loss": 0.86787856, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88967973, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4046, + "time_per_iteration": 2.5126357078552246 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.02526259, + "balance_loss_mlp": 1.04252553, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.9885516182116696, + "language_loss": 0.7281425, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4047, + "time_per_iteration": 2.4886271953582764 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.02243662, + "balance_loss_mlp": 1.0435816, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.411807088840578, + "language_loss": 0.72845596, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75018883, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4048, + "time_per_iteration": 2.522012710571289 + }, + { + "auxiliary_loss_clip": 0.01132229, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.02720952, + "balance_loss_mlp": 1.04504991, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 4.923738678144707, + "language_loss": 0.72984087, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75158751, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.875, + "step": 4049, + "time_per_iteration": 2.4399380683898926 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01041944, + "balance_loss_clip": 1.02654243, + "balance_loss_mlp": 1.04297137, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 6.058583880667159, + "language_loss": 0.7388249, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.760535, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4050, + "time_per_iteration": 2.4589998722076416 + }, + { + "auxiliary_loss_clip": 0.01128476, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02925062, + "balance_loss_mlp": 1.04373455, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.083460080669968, + "language_loss": 0.74948591, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77121294, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4051, + "time_per_iteration": 2.4284183979034424 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02251768, + "balance_loss_mlp": 1.04273975, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.420510968298769, + "language_loss": 0.70638204, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72805327, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4052, + "time_per_iteration": 5.468756675720215 + }, + { + "auxiliary_loss_clip": 0.01131368, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.03215313, + "balance_loss_mlp": 1.04370522, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.61331134721481, + "language_loss": 0.81265736, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83445215, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.875, + "step": 4053, + "time_per_iteration": 2.5280394554138184 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01895714, + "balance_loss_mlp": 1.04522192, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5461481286352234, + "language_loss": 0.77842951, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80013186, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4054, + "time_per_iteration": 2.424604892730713 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.02990091, + "balance_loss_mlp": 1.04097724, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.616998838355979, + "language_loss": 0.83784473, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85957456, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4055, + "time_per_iteration": 2.4814612865448 + }, + { + "auxiliary_loss_clip": 0.0113426, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02194405, + "balance_loss_mlp": 1.04221749, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.407480313131798, + "language_loss": 0.55291057, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57463974, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 4056, + "time_per_iteration": 2.5356216430664062 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02626896, + "balance_loss_mlp": 1.04361272, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.24663888381965, + "language_loss": 0.79832959, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82009363, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4057, + "time_per_iteration": 2.4915707111358643 + }, + { + "auxiliary_loss_clip": 0.01128391, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.02195764, + "balance_loss_mlp": 1.04218984, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.7432058239394113, + "language_loss": 0.78817719, + "learning_rate": 3.538605738554673e-06, + "loss": 0.80983889, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4058, + "time_per_iteration": 2.426687002182007 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.02366126, + "balance_loss_mlp": 1.04273307, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.688831116872718, + "language_loss": 0.85133582, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 4059, + "time_per_iteration": 2.499464511871338 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.04288411, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6494662829711617, + "language_loss": 0.73770267, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.75933278, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4060, + "time_per_iteration": 2.4955050945281982 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.03262711, + "balance_loss_mlp": 1.04506934, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.8597953216817902, + "language_loss": 0.73587501, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75775993, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.92578125, + "step": 4061, + "time_per_iteration": 2.5002825260162354 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.02248669, + "balance_loss_mlp": 1.04437923, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6775055914479682, + "language_loss": 0.76006806, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78173012, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8515625, + "step": 4062, + "time_per_iteration": 2.478625535964966 + }, + { + "auxiliary_loss_clip": 0.01126984, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.04376316, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.7282475931571, + "language_loss": 0.85710216, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87872803, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4063, + "time_per_iteration": 2.5161943435668945 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.02603722, + "balance_loss_mlp": 1.04589176, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 6.32752237165424, + "language_loss": 0.68127096, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70305437, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4064, + "time_per_iteration": 2.4434523582458496 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02083397, + "balance_loss_mlp": 1.04318714, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5178524812834733, + "language_loss": 0.7003206, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72204536, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4065, + "time_per_iteration": 2.513827085494995 + }, + { + "auxiliary_loss_clip": 0.01136726, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.02960134, + "balance_loss_mlp": 1.04461718, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.0517728790430048, + "language_loss": 0.83912247, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86096847, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4066, + "time_per_iteration": 2.4601314067840576 + }, + { + "auxiliary_loss_clip": 0.01053849, + "auxiliary_loss_mlp": 0.01006665, + "balance_loss_clip": 1.00455475, + "balance_loss_mlp": 1.02389407, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7387464995159381, + "language_loss": 0.52291965, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54352474, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.29882812, + "step": 4067, + "time_per_iteration": 2.9973862171173096 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.0242008, + "balance_loss_mlp": 1.04483843, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.022186633601072, + "language_loss": 0.71927387, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74101913, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4068, + "time_per_iteration": 2.4484708309173584 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.02457666, + "balance_loss_mlp": 1.04505873, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.494083672668599, + "language_loss": 0.77513826, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79687262, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4069, + "time_per_iteration": 2.5724000930786133 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.03101087, + "balance_loss_mlp": 1.04646873, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.9755919994455295, + "language_loss": 0.80163878, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82344782, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4070, + "time_per_iteration": 2.4932186603546143 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.03018379, + "balance_loss_mlp": 1.04351497, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6271146290001441, + "language_loss": 0.8410303, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86279482, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.875, + "step": 4071, + "time_per_iteration": 2.5299296379089355 + }, + { + "auxiliary_loss_clip": 0.0113627, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.02795792, + "balance_loss_mlp": 1.04406631, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.720640728536457, + "language_loss": 0.79751229, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81932867, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4072, + "time_per_iteration": 2.470327854156494 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.02906084, + "balance_loss_mlp": 1.04437995, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.615929332251483, + "language_loss": 0.70322561, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7249524, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4073, + "time_per_iteration": 2.4951980113983154 + }, + { + "auxiliary_loss_clip": 0.01129351, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.02662683, + "balance_loss_mlp": 1.04456043, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.5968867848691133, + "language_loss": 0.67692697, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69863164, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4074, + "time_per_iteration": 2.4697325229644775 + }, + { + "auxiliary_loss_clip": 0.01052266, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00504053, + "balance_loss_mlp": 1.0222578, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.896032421619399, + "language_loss": 0.68665123, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70724261, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.30078125, + "step": 4075, + "time_per_iteration": 3.1993846893310547 + }, + { + "auxiliary_loss_clip": 0.01131428, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.02548659, + "balance_loss_mlp": 1.04603517, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.243483207404797, + "language_loss": 0.79306483, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81478369, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4076, + "time_per_iteration": 2.542245388031006 + }, + { + "auxiliary_loss_clip": 0.01134594, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.02749884, + "balance_loss_mlp": 1.04342794, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.0630196459837618, + "language_loss": 0.82211018, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84390688, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 4077, + "time_per_iteration": 2.5165140628814697 + }, + { + "auxiliary_loss_clip": 0.01132098, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02695799, + "balance_loss_mlp": 1.04380083, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 12.782264679420269, + "language_loss": 0.61930454, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64107114, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4078, + "time_per_iteration": 2.5202372074127197 + }, + { + "auxiliary_loss_clip": 0.01129452, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02035594, + "balance_loss_mlp": 1.04474652, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.7044874550491866, + "language_loss": 0.75514519, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77679932, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4079, + "time_per_iteration": 2.483339309692383 + }, + { + "auxiliary_loss_clip": 0.01129188, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.04370368, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8257477744529516, + "language_loss": 0.74925131, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77097261, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 4080, + "time_per_iteration": 2.4843389987945557 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.04129529, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 2.211780780293779, + "language_loss": 0.82807517, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84972572, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4081, + "time_per_iteration": 2.4753835201263428 + }, + { + "auxiliary_loss_clip": 0.01128982, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.04313576, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 4.1574914526272515, + "language_loss": 0.73153239, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75321424, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4082, + "time_per_iteration": 2.5975396633148193 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02837873, + "balance_loss_mlp": 1.04274178, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.543838453785988, + "language_loss": 0.71628594, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73798621, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84375, + "step": 4083, + "time_per_iteration": 2.471519947052002 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.02285206, + "balance_loss_mlp": 1.04234004, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.1941070650453094, + "language_loss": 0.74700832, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76872808, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4084, + "time_per_iteration": 2.4286506175994873 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01986289, + "balance_loss_mlp": 1.04189909, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.042106499003273, + "language_loss": 0.85206825, + "learning_rate": 3.531866337826471e-06, + "loss": 0.8736847, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4085, + "time_per_iteration": 2.4283318519592285 + }, + { + "auxiliary_loss_clip": 0.01130256, + "auxiliary_loss_mlp": 0.01048422, + "balance_loss_clip": 1.03209007, + "balance_loss_mlp": 1.04266381, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.8090063737063005, + "language_loss": 0.7876097, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.80939639, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4086, + "time_per_iteration": 2.478954792022705 + }, + { + "auxiliary_loss_clip": 0.01126651, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02669752, + "balance_loss_mlp": 1.04330873, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.6669278195562474, + "language_loss": 0.75269985, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77438211, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4087, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.01132319, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.02364135, + "balance_loss_mlp": 1.04574418, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.5789657141026, + "language_loss": 0.79284519, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81457937, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8671875, + "step": 4088, + "time_per_iteration": 2.479841709136963 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.0196631, + "balance_loss_mlp": 1.04091823, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6187757849670203, + "language_loss": 0.7736612, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79523408, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.828125, + "step": 4089, + "time_per_iteration": 2.483436346054077 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02440071, + "balance_loss_mlp": 1.04232669, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 3.8690522662716416, + "language_loss": 0.81463957, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83634108, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4090, + "time_per_iteration": 2.657944917678833 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03265369, + "balance_loss_mlp": 1.04411578, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.9398667366019489, + "language_loss": 0.72874928, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75057453, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.88671875, + "step": 4091, + "time_per_iteration": 2.448307991027832 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.02166772, + "balance_loss_mlp": 1.04811478, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9209724672120978, + "language_loss": 0.76486623, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78656, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4092, + "time_per_iteration": 2.510815143585205 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.04404068, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.544549098738024, + "language_loss": 0.80905128, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83075017, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4093, + "time_per_iteration": 2.4658117294311523 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.04285693, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.9793331271335382, + "language_loss": 0.87355959, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89532292, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4094, + "time_per_iteration": 5.436578035354614 + }, + { + "auxiliary_loss_clip": 0.01055645, + "auxiliary_loss_mlp": 0.01004731, + "balance_loss_clip": 1.00285995, + "balance_loss_mlp": 1.02449679, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7591937233735362, + "language_loss": 0.57501638, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59562016, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.3125, + "step": 4095, + "time_per_iteration": 3.1966967582702637 + }, + { + "auxiliary_loss_clip": 0.01055105, + "auxiliary_loss_mlp": 0.01001708, + "balance_loss_clip": 0.99987203, + "balance_loss_mlp": 1.02336812, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.643968481445629, + "language_loss": 0.56195372, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58252186, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.31640625, + "step": 4096, + "time_per_iteration": 3.187084436416626 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.04697204, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 2.0390556104017907, + "language_loss": 0.77674699, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79844701, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4097, + "time_per_iteration": 2.5585436820983887 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.02608228, + "balance_loss_mlp": 1.04491377, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.135816170269485, + "language_loss": 0.76393569, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78572309, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.890625, + "step": 4098, + "time_per_iteration": 2.478665828704834 + }, + { + "auxiliary_loss_clip": 0.01133268, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.02568507, + "balance_loss_mlp": 1.04479909, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.152719854213413, + "language_loss": 0.68733507, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70907569, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 4099, + "time_per_iteration": 2.515821933746338 + }, + { + "auxiliary_loss_clip": 0.01124761, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02621734, + "balance_loss_mlp": 1.04163074, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.459538616056665, + "language_loss": 0.65975124, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68141258, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4100, + "time_per_iteration": 2.562962532043457 + }, + { + "auxiliary_loss_clip": 0.01051305, + "auxiliary_loss_mlp": 0.01002462, + "balance_loss_clip": 1.00055432, + "balance_loss_mlp": 1.02057505, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7078763540659354, + "language_loss": 0.61549371, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63603139, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.30859375, + "step": 4101, + "time_per_iteration": 3.1617352962493896 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.01893687, + "balance_loss_mlp": 1.04385781, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7154022892986804, + "language_loss": 0.73020113, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75183737, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4102, + "time_per_iteration": 2.5522637367248535 + }, + { + "auxiliary_loss_clip": 0.01132375, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02463281, + "balance_loss_mlp": 1.04294777, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.2979425011191528, + "language_loss": 0.75574934, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.7774744, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4103, + "time_per_iteration": 2.5117204189300537 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.02647424, + "balance_loss_mlp": 1.04096079, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.002646106823912, + "language_loss": 0.78701174, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80874026, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4104, + "time_per_iteration": 2.5791869163513184 + }, + { + "auxiliary_loss_clip": 0.011264, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02333593, + "balance_loss_mlp": 1.0411272, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.7283937272898544, + "language_loss": 0.83567655, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85735631, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.8515625, + "step": 4105, + "time_per_iteration": 2.447399854660034 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02768457, + "balance_loss_mlp": 1.04806173, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7373974977996043, + "language_loss": 0.7646578, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78643101, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4106, + "time_per_iteration": 2.519059658050537 + }, + { + "auxiliary_loss_clip": 0.01127139, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.02539492, + "balance_loss_mlp": 1.04087114, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.1665884513414513, + "language_loss": 0.72764528, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74933887, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4107, + "time_per_iteration": 2.4489266872406006 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.0454886, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.3712774609847274, + "language_loss": 0.65420353, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67600369, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4108, + "time_per_iteration": 2.5401792526245117 + }, + { + "auxiliary_loss_clip": 0.01131766, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.02961504, + "balance_loss_mlp": 1.04324555, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.174268382145969, + "language_loss": 0.72611141, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74788952, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4109, + "time_per_iteration": 2.593358278274536 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02788687, + "balance_loss_mlp": 1.04414606, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.7026194733932167, + "language_loss": 0.79302657, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81480682, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4110, + "time_per_iteration": 2.4776864051818848 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01618171, + "balance_loss_mlp": 1.04541993, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.5002063230568545, + "language_loss": 0.80653715, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.82819968, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4111, + "time_per_iteration": 2.4957237243652344 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.02517819, + "balance_loss_mlp": 1.04273677, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 2.4547784256207663, + "language_loss": 0.75205207, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77375102, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4112, + "time_per_iteration": 2.481778860092163 + }, + { + "auxiliary_loss_clip": 0.01130648, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02868617, + "balance_loss_mlp": 1.04366612, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9927491285660106, + "language_loss": 0.82454932, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.8462984, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4113, + "time_per_iteration": 2.4658617973327637 + }, + { + "auxiliary_loss_clip": 0.01129834, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.0205375, + "balance_loss_mlp": 1.0423646, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.834925175676511, + "language_loss": 0.87073094, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89239764, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4114, + "time_per_iteration": 2.4575555324554443 + }, + { + "auxiliary_loss_clip": 0.01130204, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.02274156, + "balance_loss_mlp": 1.04354906, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.804779626044085, + "language_loss": 0.753479, + "learning_rate": 3.524328457352734e-06, + "loss": 0.7751627, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4115, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01002125, + "balance_loss_clip": 1.00016963, + "balance_loss_mlp": 1.02261877, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6664049604648837, + "language_loss": 0.58203655, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60258663, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30273438, + "step": 4116, + "time_per_iteration": 3.172032117843628 + }, + { + "auxiliary_loss_clip": 0.01130845, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.01992679, + "balance_loss_mlp": 1.04510772, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.6806447251481575, + "language_loss": 0.83616889, + "learning_rate": 3.523824079451235e-06, + "loss": 0.8578285, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.85546875, + "step": 4117, + "time_per_iteration": 2.5228748321533203 + }, + { + "auxiliary_loss_clip": 0.01053619, + "auxiliary_loss_mlp": 0.0100274, + "balance_loss_clip": 1.00073707, + "balance_loss_mlp": 1.02337885, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9069522642789956, + "language_loss": 0.63507527, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65563887, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30078125, + "step": 4118, + "time_per_iteration": 2.9459333419799805 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02470684, + "balance_loss_mlp": 1.04544902, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5050779056214143, + "language_loss": 0.79252797, + "learning_rate": 3.523319470415491e-06, + "loss": 0.8142485, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4119, + "time_per_iteration": 2.438519239425659 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.02359676, + "balance_loss_mlp": 1.04430819, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.9430586352888408, + "language_loss": 0.73955107, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76124215, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4120, + "time_per_iteration": 2.4728164672851562 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.03010893, + "balance_loss_mlp": 1.0446558, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 3.4886461941998563, + "language_loss": 0.88028777, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90208006, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4121, + "time_per_iteration": 2.4117653369903564 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.02102745, + "balance_loss_mlp": 1.04516518, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.7360865086006285, + "language_loss": 0.69088298, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71260709, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4122, + "time_per_iteration": 2.484830617904663 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.02224231, + "balance_loss_mlp": 1.04380226, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.016808492688271, + "language_loss": 0.80196065, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82369387, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.890625, + "step": 4123, + "time_per_iteration": 2.43839955329895 + }, + { + "auxiliary_loss_clip": 0.01130784, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.02620113, + "balance_loss_mlp": 1.04464054, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.3250466211888745, + "language_loss": 0.74919629, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77091914, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 4124, + "time_per_iteration": 2.4909141063690186 + }, + { + "auxiliary_loss_clip": 0.01127616, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.01922846, + "balance_loss_mlp": 1.0432241, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.6909299882519486, + "language_loss": 0.73759794, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75921559, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4125, + "time_per_iteration": 2.6068458557128906 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.02914929, + "balance_loss_mlp": 1.04383993, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.376019449241759, + "language_loss": 0.69416726, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71598125, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4126, + "time_per_iteration": 2.4516806602478027 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.01047803, + "balance_loss_clip": 1.03112614, + "balance_loss_mlp": 1.04299593, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.081795572279456, + "language_loss": 0.81602275, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83780402, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4127, + "time_per_iteration": 2.482492446899414 + }, + { + "auxiliary_loss_clip": 0.01134053, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04527378, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 5.2721581441441465, + "language_loss": 0.84604752, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86784381, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.88671875, + "step": 4128, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02779055, + "balance_loss_mlp": 1.04397762, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 3.598051635390234, + "language_loss": 0.65576231, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67752188, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4129, + "time_per_iteration": 2.498321294784546 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.04308498, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 2.23477186449736, + "language_loss": 0.75251818, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77425677, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4130, + "time_per_iteration": 2.534014940261841 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.03198647, + "balance_loss_mlp": 1.04404271, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.282827015603824, + "language_loss": 0.77323985, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79505157, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4131, + "time_per_iteration": 2.3971383571624756 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02241063, + "balance_loss_mlp": 1.0428257, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5452946340590639, + "language_loss": 0.83932686, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86097032, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.84375, + "step": 4132, + "time_per_iteration": 2.552804470062256 + }, + { + "auxiliary_loss_clip": 0.01129759, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02601552, + "balance_loss_mlp": 1.04280567, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.4707160060639857, + "language_loss": 0.71077073, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73249108, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4133, + "time_per_iteration": 2.40258526802063 + }, + { + "auxiliary_loss_clip": 0.01138495, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.02486503, + "balance_loss_mlp": 1.0454644, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.206352055564895, + "language_loss": 0.61492884, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63675898, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9296875, + "step": 4134, + "time_per_iteration": 2.476027250289917 + }, + { + "auxiliary_loss_clip": 0.01133349, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.02063298, + "balance_loss_mlp": 1.04393268, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.276340033899988, + "language_loss": 0.78899026, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81069505, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4135, + "time_per_iteration": 3.9668710231781006 + }, + { + "auxiliary_loss_clip": 0.01136879, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 1.04908156, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.12923907223803, + "language_loss": 0.82729924, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84898853, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.87890625, + "step": 4136, + "time_per_iteration": 3.8651821613311768 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02919221, + "balance_loss_mlp": 1.04593039, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7063584090687087, + "language_loss": 0.70454097, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72635514, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4137, + "time_per_iteration": 2.581270456314087 + }, + { + "auxiliary_loss_clip": 0.01135031, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.0193553, + "balance_loss_mlp": 1.04428291, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.0340803052703236, + "language_loss": 0.66840076, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69010115, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4138, + "time_per_iteration": 2.438858985900879 + }, + { + "auxiliary_loss_clip": 0.01130089, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01977062, + "balance_loss_mlp": 1.0451256, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.467393625239628, + "language_loss": 0.83937073, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86102176, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4139, + "time_per_iteration": 2.4858012199401855 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.04416132, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.5320149755260415, + "language_loss": 0.7864905, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80825365, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4140, + "time_per_iteration": 2.4608240127563477 + }, + { + "auxiliary_loss_clip": 0.01058216, + "auxiliary_loss_mlp": 0.01013447, + "balance_loss_clip": 1.01150382, + "balance_loss_mlp": 1.02780879, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8230161703115366, + "language_loss": 0.60980695, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63052356, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.3046875, + "step": 4141, + "time_per_iteration": 3.1306700706481934 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02537298, + "balance_loss_mlp": 1.04692519, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.804889663143828, + "language_loss": 0.72997624, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75176597, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 4142, + "time_per_iteration": 2.60341215133667 + }, + { + "auxiliary_loss_clip": 0.011336, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02335465, + "balance_loss_mlp": 1.04601634, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 2.0852522280017873, + "language_loss": 0.80985868, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83158958, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4143, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.04291701, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.8417531415701045, + "language_loss": 0.5884496, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61008459, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4144, + "time_per_iteration": 2.5253236293792725 + }, + { + "auxiliary_loss_clip": 0.0113091, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.02135301, + "balance_loss_mlp": 1.04400194, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.2350400575734146, + "language_loss": 0.78882402, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81050527, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4145, + "time_per_iteration": 2.500868797302246 + }, + { + "auxiliary_loss_clip": 0.01141282, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.02981293, + "balance_loss_mlp": 1.04593182, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.0986803435557415, + "language_loss": 0.65651333, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.678424, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.953125, + "step": 4146, + "time_per_iteration": 2.482405424118042 + }, + { + "auxiliary_loss_clip": 0.01048172, + "auxiliary_loss_mlp": 0.01006681, + "balance_loss_clip": 1.00482178, + "balance_loss_mlp": 1.01849687, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 3.0854856510049458, + "language_loss": 0.67327654, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69382501, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.296875, + "step": 4147, + "time_per_iteration": 3.1769258975982666 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.02930617, + "balance_loss_mlp": 1.04857254, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 2.0368820911017025, + "language_loss": 0.8893261, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91115361, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4148, + "time_per_iteration": 2.5202085971832275 + }, + { + "auxiliary_loss_clip": 0.0113885, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02649307, + "balance_loss_mlp": 1.04754162, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8605307211390085, + "language_loss": 0.68053228, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70237827, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9140625, + "step": 4149, + "time_per_iteration": 2.455733060836792 + }, + { + "auxiliary_loss_clip": 0.01133288, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02291596, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 2.99652773874907, + "language_loss": 0.71235985, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73408163, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4150, + "time_per_iteration": 2.514190196990967 + }, + { + "auxiliary_loss_clip": 0.01134014, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.03256035, + "balance_loss_mlp": 1.04471052, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.862035570914478, + "language_loss": 0.72954226, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75137556, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4151, + "time_per_iteration": 2.4198975563049316 + }, + { + "auxiliary_loss_clip": 0.01141172, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.03213382, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 4.099427504771762, + "language_loss": 0.62436807, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64627266, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94140625, + "step": 4152, + "time_per_iteration": 2.563032865524292 + }, + { + "auxiliary_loss_clip": 0.01131413, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.02188039, + "balance_loss_mlp": 1.04631066, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.3735561607913596, + "language_loss": 0.77219248, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79388708, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4153, + "time_per_iteration": 2.5059967041015625 + }, + { + "auxiliary_loss_clip": 0.01132512, + "auxiliary_loss_mlp": 0.01040076, + "balance_loss_clip": 1.0242573, + "balance_loss_mlp": 1.04642224, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.164577963489155, + "language_loss": 0.76443702, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78616285, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4154, + "time_per_iteration": 2.48317551612854 + }, + { + "auxiliary_loss_clip": 0.01138697, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.02586532, + "balance_loss_mlp": 1.04451203, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.2000943153895722, + "language_loss": 0.70740849, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72924054, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 4155, + "time_per_iteration": 2.498227834701538 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.03379464, + "balance_loss_mlp": 1.04736114, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.8252469259439843, + "language_loss": 0.7499637, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77184427, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4156, + "time_per_iteration": 2.473536729812622 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.0244987, + "balance_loss_mlp": 1.04498601, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.1247768054564333, + "language_loss": 0.76757634, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78929752, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4157, + "time_per_iteration": 2.476402759552002 + }, + { + "auxiliary_loss_clip": 0.01135567, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02821517, + "balance_loss_mlp": 1.04551077, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6430173172536622, + "language_loss": 0.81497854, + "learning_rate": 3.513433506130942e-06, + "loss": 0.8367821, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4158, + "time_per_iteration": 2.4706146717071533 + }, + { + "auxiliary_loss_clip": 0.01134661, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01533163, + "balance_loss_mlp": 1.04511046, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.425058111765743, + "language_loss": 0.75573325, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77739644, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.89453125, + "step": 4159, + "time_per_iteration": 2.447530746459961 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.01928759, + "balance_loss_mlp": 1.04643881, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.3851333770237044, + "language_loss": 0.71434534, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73608989, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 4160, + "time_per_iteration": 2.4909448623657227 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01008394, + "balance_loss_clip": 1.0062604, + "balance_loss_mlp": 1.01615632, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7574731626167057, + "language_loss": 0.56755257, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58809221, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.29492188, + "step": 4161, + "time_per_iteration": 3.1169064044952393 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04854345, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.822598728260487, + "language_loss": 0.8071059, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82899845, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 4162, + "time_per_iteration": 2.4679477214813232 + }, + { + "auxiliary_loss_clip": 0.01136921, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02212739, + "balance_loss_mlp": 1.04364812, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.543272880301035, + "language_loss": 0.87439299, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89615595, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 4163, + "time_per_iteration": 2.411324977874756 + }, + { + "auxiliary_loss_clip": 0.01135069, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.02013874, + "balance_loss_mlp": 1.04609334, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.8835095650007205, + "language_loss": 0.83242726, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85414505, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4164, + "time_per_iteration": 2.4910058975219727 + }, + { + "auxiliary_loss_clip": 0.01130392, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_clip": 1.03235698, + "balance_loss_mlp": 1.04616356, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.7333709529875627, + "language_loss": 0.74548686, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76726139, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 4165, + "time_per_iteration": 2.4566714763641357 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.02819216, + "balance_loss_mlp": 1.04689348, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 5.301488379412456, + "language_loss": 0.74214685, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76400197, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4166, + "time_per_iteration": 2.462092161178589 + }, + { + "auxiliary_loss_clip": 0.01134276, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.02898526, + "balance_loss_mlp": 1.04551435, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.9752225074857819, + "language_loss": 0.82011521, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84191239, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4167, + "time_per_iteration": 2.482534885406494 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.0280689, + "balance_loss_mlp": 1.04616201, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.42679689243218, + "language_loss": 0.79602242, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81781083, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4168, + "time_per_iteration": 2.463700532913208 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.02353752, + "balance_loss_mlp": 1.04523754, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.966293758738445, + "language_loss": 0.70029891, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72211224, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9375, + "step": 4169, + "time_per_iteration": 2.6148693561553955 + }, + { + "auxiliary_loss_clip": 0.01131562, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.02926338, + "balance_loss_mlp": 1.0446701, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8138505316100015, + "language_loss": 0.77564663, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79741603, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4170, + "time_per_iteration": 2.522921562194824 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.02731323, + "balance_loss_mlp": 1.04796529, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.4512078878938404, + "language_loss": 0.76246989, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78427839, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8984375, + "step": 4171, + "time_per_iteration": 2.4322195053100586 + }, + { + "auxiliary_loss_clip": 0.01046694, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.01739454, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8497756598481241, + "language_loss": 0.60047227, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62115091, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29296875, + "step": 4172, + "time_per_iteration": 3.1110994815826416 + }, + { + "auxiliary_loss_clip": 0.01137052, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.04652381, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4442293166181488, + "language_loss": 0.78647727, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80827463, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 4173, + "time_per_iteration": 2.481062889099121 + }, + { + "auxiliary_loss_clip": 0.01140203, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.0300796, + "balance_loss_mlp": 1.05017626, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.4202296115923883, + "language_loss": 0.83543748, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85730493, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4174, + "time_per_iteration": 2.4566147327423096 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02299595, + "balance_loss_mlp": 1.04786515, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.0903096624482624, + "language_loss": 0.71291864, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73470795, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90234375, + "step": 4175, + "time_per_iteration": 2.4616360664367676 + }, + { + "auxiliary_loss_clip": 0.01138348, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.01866269, + "balance_loss_mlp": 1.0460453, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.247188920587568, + "language_loss": 0.80564427, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82739055, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4176, + "time_per_iteration": 2.525686740875244 + }, + { + "auxiliary_loss_clip": 0.01138723, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.02553427, + "balance_loss_mlp": 1.04782593, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.362252442770041, + "language_loss": 0.83099151, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8528198, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.90625, + "step": 4177, + "time_per_iteration": 5.424759387969971 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02675891, + "balance_loss_mlp": 1.04777622, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.9753996759374846, + "language_loss": 0.8209883, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84278357, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87890625, + "step": 4178, + "time_per_iteration": 2.451418161392212 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04445124, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.6671564243834505, + "language_loss": 0.75406277, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77579463, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4179, + "time_per_iteration": 2.4710347652435303 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.03050375, + "balance_loss_mlp": 1.04526711, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.189208999533023, + "language_loss": 0.70452499, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72636557, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.90625, + "step": 4180, + "time_per_iteration": 2.433922290802002 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02485168, + "balance_loss_mlp": 1.04449701, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 2.0603947372587244, + "language_loss": 0.85379761, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.875539, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4181, + "time_per_iteration": 2.4513771533966064 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01042648, + "balance_loss_clip": 1.02608991, + "balance_loss_mlp": 1.0464716, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.9568163341605829, + "language_loss": 0.67662674, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69840884, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4182, + "time_per_iteration": 2.588513135910034 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.01008874, + "balance_loss_clip": 1.00675201, + "balance_loss_mlp": 1.01742792, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8484678873575391, + "language_loss": 0.70098495, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72154456, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.296875, + "step": 4183, + "time_per_iteration": 3.0990090370178223 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02088118, + "balance_loss_mlp": 1.04070854, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.7162399200173233, + "language_loss": 0.7452544, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76694012, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4184, + "time_per_iteration": 2.4367544651031494 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.02888608, + "balance_loss_mlp": 1.04825735, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.9130230292696613, + "language_loss": 0.82872695, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85055834, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4185, + "time_per_iteration": 2.4604692459106445 + }, + { + "auxiliary_loss_clip": 0.01047588, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.01820421, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7885291752286397, + "language_loss": 0.61534387, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63585937, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.29296875, + "step": 4186, + "time_per_iteration": 2.9629924297332764 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02320647, + "balance_loss_mlp": 1.04432559, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.1070381215060308, + "language_loss": 0.79260957, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81435084, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4187, + "time_per_iteration": 2.454988479614258 + }, + { + "auxiliary_loss_clip": 0.01136483, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.03235006, + "balance_loss_mlp": 1.04733062, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5254881034867085, + "language_loss": 0.79854965, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82040906, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4188, + "time_per_iteration": 2.4807493686676025 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_clip": 1.03022218, + "balance_loss_mlp": 1.04635882, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.727912733373243, + "language_loss": 0.74509478, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76691031, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4189, + "time_per_iteration": 2.4887545108795166 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.02914619, + "balance_loss_mlp": 1.04616165, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.957544272457229, + "language_loss": 0.84454727, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86630988, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4190, + "time_per_iteration": 2.4629735946655273 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.02633452, + "balance_loss_mlp": 1.04529381, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.9468541382775664, + "language_loss": 0.75593925, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77772641, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.88671875, + "step": 4191, + "time_per_iteration": 2.451493263244629 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.0101771, + "balance_loss_clip": 1.01577878, + "balance_loss_mlp": 1.01320672, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7165761170014687, + "language_loss": 0.57155997, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59216374, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29492188, + "step": 4192, + "time_per_iteration": 3.1455304622650146 + }, + { + "auxiliary_loss_clip": 0.01132992, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.02593958, + "balance_loss_mlp": 1.04640245, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.0419031963399434, + "language_loss": 0.76306844, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78481936, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4193, + "time_per_iteration": 2.46201491355896 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.03115189, + "balance_loss_mlp": 1.04506373, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.1192679618590007, + "language_loss": 0.84261906, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86446548, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4194, + "time_per_iteration": 2.4525146484375 + }, + { + "auxiliary_loss_clip": 0.01138489, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.03404951, + "balance_loss_mlp": 1.04636192, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.488794247862028, + "language_loss": 0.88176262, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90364158, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.921875, + "step": 4195, + "time_per_iteration": 2.507788896560669 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.02323329, + "balance_loss_mlp": 1.04540074, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.8940350432545787, + "language_loss": 0.85288155, + "learning_rate": 3.503717062883053e-06, + "loss": 0.87466824, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.90625, + "step": 4196, + "time_per_iteration": 2.4843344688415527 + }, + { + "auxiliary_loss_clip": 0.01135455, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02644312, + "balance_loss_mlp": 1.0454607, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6596186150335415, + "language_loss": 0.83368516, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85546911, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4197, + "time_per_iteration": 2.480834484100342 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.02840698, + "balance_loss_mlp": 1.04775643, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.7573342641631093, + "language_loss": 0.72406292, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74593097, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9140625, + "step": 4198, + "time_per_iteration": 2.6081368923187256 + }, + { + "auxiliary_loss_clip": 0.01139571, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.03469038, + "balance_loss_mlp": 1.0462662, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9511850390779815, + "language_loss": 0.76798427, + "learning_rate": 3.50294646148888e-06, + "loss": 0.7899096, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.93359375, + "step": 4199, + "time_per_iteration": 2.463322162628174 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.02334595, + "balance_loss_mlp": 1.04600453, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.6881838085079777, + "language_loss": 0.727651, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74941385, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 4200, + "time_per_iteration": 2.586298942565918 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.03206062, + "balance_loss_mlp": 1.04300654, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.7166145531144803, + "language_loss": 0.82271791, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84454548, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.91796875, + "step": 4201, + "time_per_iteration": 2.6430721282958984 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.02960861, + "balance_loss_mlp": 1.04680324, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8945534984036327, + "language_loss": 0.74844849, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77029681, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4202, + "time_per_iteration": 2.477376699447632 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.02545786, + "balance_loss_mlp": 1.04550529, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.8769942277842264, + "language_loss": 0.73058856, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75234556, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 4203, + "time_per_iteration": 2.4526968002319336 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.02403569, + "balance_loss_mlp": 1.04434335, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.7217444479200419, + "language_loss": 0.77377844, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79553127, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90234375, + "step": 4204, + "time_per_iteration": 2.540573835372925 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04443574, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 3.2226665017353655, + "language_loss": 0.72443974, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74631095, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4205, + "time_per_iteration": 2.405823230743408 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.02304697, + "balance_loss_mlp": 1.04673433, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4419344159614245, + "language_loss": 0.75674903, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77844942, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4206, + "time_per_iteration": 2.7117254734039307 + }, + { + "auxiliary_loss_clip": 0.01134608, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02502584, + "balance_loss_mlp": 1.04381466, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.8459801280493204, + "language_loss": 0.79013956, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81190026, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4207, + "time_per_iteration": 2.4338433742523193 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.02688169, + "balance_loss_mlp": 1.04521704, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5263501886522268, + "language_loss": 0.76010746, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78184819, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4208, + "time_per_iteration": 2.4712774753570557 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.02559781, + "balance_loss_mlp": 1.04407811, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.8494822470113228, + "language_loss": 0.6965062, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.71824062, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87109375, + "step": 4209, + "time_per_iteration": 2.4723262786865234 + }, + { + "auxiliary_loss_clip": 0.01046036, + "auxiliary_loss_mlp": 0.00999993, + "balance_loss_clip": 0.99819291, + "balance_loss_mlp": 1.01643014, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7581785291884388, + "language_loss": 0.55080217, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57126248, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.296875, + "step": 4210, + "time_per_iteration": 3.141958236694336 + }, + { + "auxiliary_loss_clip": 0.0113523, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.04541481, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 2.0581011511690606, + "language_loss": 0.8021341, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82383299, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4211, + "time_per_iteration": 2.4423909187316895 + }, + { + "auxiliary_loss_clip": 0.01128499, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02402079, + "balance_loss_mlp": 1.04284227, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.6375033978461933, + "language_loss": 0.78310406, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80478293, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4212, + "time_per_iteration": 2.535416841506958 + }, + { + "auxiliary_loss_clip": 0.01131331, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.04314673, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.0206536972721088, + "language_loss": 0.53393918, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55565375, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4213, + "time_per_iteration": 2.488844871520996 + }, + { + "auxiliary_loss_clip": 0.01132972, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02566385, + "balance_loss_mlp": 1.04508567, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.6682600080383816, + "language_loss": 0.65329081, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67504859, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4214, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01043385, + "auxiliary_loss_mlp": 0.0100812, + "balance_loss_clip": 1.00630808, + "balance_loss_mlp": 1.0142169, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8479929036578698, + "language_loss": 0.58049941, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60101438, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29296875, + "step": 4215, + "time_per_iteration": 2.824084997177124 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.02636075, + "balance_loss_mlp": 1.04583967, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7693463876532338, + "language_loss": 0.83949232, + "learning_rate": 3.498570039373066e-06, + "loss": 0.86126143, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.875, + "step": 4216, + "time_per_iteration": 2.650329828262329 + }, + { + "auxiliary_loss_clip": 0.01134428, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02504706, + "balance_loss_mlp": 1.04571652, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.7652170119003572, + "language_loss": 0.80028123, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82204342, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4217, + "time_per_iteration": 2.49381160736084 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.02422011, + "balance_loss_mlp": 1.04193234, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.1701414828965464, + "language_loss": 0.75014293, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.7718327, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87890625, + "step": 4218, + "time_per_iteration": 2.4794864654541016 + }, + { + "auxiliary_loss_clip": 0.01135591, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.02462721, + "balance_loss_mlp": 1.04470503, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.8718582993796022, + "language_loss": 0.74483025, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76660055, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4219, + "time_per_iteration": 5.428370952606201 + }, + { + "auxiliary_loss_clip": 0.01137942, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.04695058, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 2.1507448030921057, + "language_loss": 0.81194967, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83385527, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4220, + "time_per_iteration": 2.454045534133911 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.03007603, + "balance_loss_mlp": 1.04596126, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.058400170489012, + "language_loss": 0.70873475, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73056173, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4221, + "time_per_iteration": 2.4728429317474365 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.02309537, + "balance_loss_mlp": 1.0444454, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 2.3290205392002847, + "language_loss": 0.62039649, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64213717, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4222, + "time_per_iteration": 2.4465436935424805 + }, + { + "auxiliary_loss_clip": 0.01137839, + "auxiliary_loss_mlp": 0.01052178, + "balance_loss_clip": 1.0352385, + "balance_loss_mlp": 1.04635429, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.6514367228652884, + "language_loss": 0.74686599, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76876616, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4223, + "time_per_iteration": 2.449887275695801 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.02001095, + "balance_loss_mlp": 1.04763556, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.7274606282993847, + "language_loss": 0.79782087, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81952935, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4224, + "time_per_iteration": 2.4809348583221436 + }, + { + "auxiliary_loss_clip": 0.01129812, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.02668035, + "balance_loss_mlp": 1.04306865, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.7388314634599362, + "language_loss": 0.77813148, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79986417, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4225, + "time_per_iteration": 2.4813735485076904 + }, + { + "auxiliary_loss_clip": 0.01135622, + "auxiliary_loss_mlp": 0.01054065, + "balance_loss_clip": 1.03661263, + "balance_loss_mlp": 1.04603362, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6070040517314534, + "language_loss": 0.84763634, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86953318, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.89453125, + "step": 4226, + "time_per_iteration": 2.4583990573883057 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.0277667, + "balance_loss_mlp": 1.04317141, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 2.4872704745527168, + "language_loss": 0.70759654, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.72934765, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8671875, + "step": 4227, + "time_per_iteration": 2.532057762145996 + }, + { + "auxiliary_loss_clip": 0.01041509, + "auxiliary_loss_mlp": 0.01000975, + "balance_loss_clip": 0.9989962, + "balance_loss_mlp": 1.01186037, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9701035361715339, + "language_loss": 0.61865914, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63908398, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.296875, + "step": 4228, + "time_per_iteration": 2.9040682315826416 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.02026105, + "balance_loss_mlp": 1.04564357, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 4.885618231754604, + "language_loss": 0.86024547, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88198459, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 4229, + "time_per_iteration": 2.404157876968384 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.02696753, + "balance_loss_mlp": 1.0466435, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.8862111092995248, + "language_loss": 0.77280557, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79459918, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4230, + "time_per_iteration": 2.4956207275390625 + }, + { + "auxiliary_loss_clip": 0.01133757, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.02980483, + "balance_loss_mlp": 1.04598594, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.9381647251913205, + "language_loss": 0.75116754, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77297449, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4231, + "time_per_iteration": 2.4570302963256836 + }, + { + "auxiliary_loss_clip": 0.0113225, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.02253127, + "balance_loss_mlp": 1.04484463, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.3236339630790916, + "language_loss": 0.74055511, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76226532, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4232, + "time_per_iteration": 2.4537932872772217 + }, + { + "auxiliary_loss_clip": 0.01134838, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.04658151, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.8521853851823955, + "language_loss": 0.86557174, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88733703, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4233, + "time_per_iteration": 2.4943323135375977 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.02570057, + "balance_loss_mlp": 1.04215169, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.5280608213400515, + "language_loss": 0.74841732, + "learning_rate": 3.493918281539737e-06, + "loss": 0.7700814, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 4234, + "time_per_iteration": 2.541349172592163 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.02661681, + "balance_loss_mlp": 1.04286838, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.542232814469661, + "language_loss": 0.7489568, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77071816, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.90625, + "step": 4235, + "time_per_iteration": 2.5059099197387695 + }, + { + "auxiliary_loss_clip": 0.01141785, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02570069, + "balance_loss_mlp": 1.04655004, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0015253194085645, + "language_loss": 0.64487904, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6667403, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 4236, + "time_per_iteration": 2.512286424636841 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.01757693, + "balance_loss_mlp": 1.04509079, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5430935122242522, + "language_loss": 0.67046815, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69211423, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 4237, + "time_per_iteration": 2.455911636352539 + }, + { + "auxiliary_loss_clip": 0.01134325, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.02916634, + "balance_loss_mlp": 1.04509199, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9754127990153556, + "language_loss": 0.74863333, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77043563, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4238, + "time_per_iteration": 2.4770114421844482 + }, + { + "auxiliary_loss_clip": 0.01136693, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.02848125, + "balance_loss_mlp": 1.04734778, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8397193389954023, + "language_loss": 0.8033936, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82522523, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4239, + "time_per_iteration": 2.5087499618530273 + }, + { + "auxiliary_loss_clip": 0.01131893, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.02457762, + "balance_loss_mlp": 1.04512548, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.749971041952711, + "language_loss": 0.77208781, + "learning_rate": 3.492363614004407e-06, + "loss": 0.7938236, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4240, + "time_per_iteration": 2.4757072925567627 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.02463925, + "balance_loss_mlp": 1.04773092, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 2.0511352101670126, + "language_loss": 0.83254647, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85438156, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.93359375, + "step": 4241, + "time_per_iteration": 2.5062708854675293 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.02761221, + "balance_loss_mlp": 1.0463624, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6663950411566644, + "language_loss": 0.73410285, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75590432, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4242, + "time_per_iteration": 2.5570173263549805 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02249646, + "balance_loss_mlp": 1.04695976, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.4092613771466453, + "language_loss": 0.72371018, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74545956, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4243, + "time_per_iteration": 2.440492868423462 + }, + { + "auxiliary_loss_clip": 0.01136318, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02507675, + "balance_loss_mlp": 1.04668963, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 2.3937572910440847, + "language_loss": 0.81865323, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84043133, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4244, + "time_per_iteration": 2.4728784561157227 + }, + { + "auxiliary_loss_clip": 0.01044231, + "auxiliary_loss_mlp": 0.01002536, + "balance_loss_clip": 1.00084293, + "balance_loss_mlp": 1.01474202, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7400094393930867, + "language_loss": 0.5777986, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5982663, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.29492188, + "step": 4245, + "time_per_iteration": 3.155487537384033 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.03248656, + "balance_loss_mlp": 1.04526567, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 1.9776048921576397, + "language_loss": 0.65246034, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67430878, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90234375, + "step": 4246, + "time_per_iteration": 2.4889461994171143 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04366493, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.748925776992144, + "language_loss": 0.81467927, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83637214, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4247, + "time_per_iteration": 2.4680213928222656 + }, + { + "auxiliary_loss_clip": 0.0114026, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.02718902, + "balance_loss_mlp": 1.04570985, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.9702547035135165, + "language_loss": 0.83062297, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85248411, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9453125, + "step": 4248, + "time_per_iteration": 2.446810245513916 + }, + { + "auxiliary_loss_clip": 0.01136577, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.02793586, + "balance_loss_mlp": 1.04672599, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.21885342952208, + "language_loss": 0.84529531, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86711109, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4249, + "time_per_iteration": 2.4372382164001465 + }, + { + "auxiliary_loss_clip": 0.01044447, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.01503897, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7531523874953217, + "language_loss": 0.56312215, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58360648, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29492188, + "step": 4250, + "time_per_iteration": 3.047654628753662 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.02139914, + "balance_loss_mlp": 1.04434705, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.1374171101673243, + "language_loss": 0.80306417, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82478344, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4251, + "time_per_iteration": 2.4866387844085693 + }, + { + "auxiliary_loss_clip": 0.01042955, + "auxiliary_loss_mlp": 0.01004928, + "balance_loss_clip": 1.00307989, + "balance_loss_mlp": 1.01383376, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7958061962206047, + "language_loss": 0.66077995, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6812588, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.29296875, + "step": 4252, + "time_per_iteration": 3.117496967315674 + }, + { + "auxiliary_loss_clip": 0.011309, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.04373813, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.169743717969613, + "language_loss": 0.73382849, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75550812, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4253, + "time_per_iteration": 2.5709948539733887 + }, + { + "auxiliary_loss_clip": 0.01134729, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.02873516, + "balance_loss_mlp": 1.04698956, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9741012093631007, + "language_loss": 0.72927308, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75106484, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4254, + "time_per_iteration": 2.509932518005371 + }, + { + "auxiliary_loss_clip": 0.01133463, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.02636361, + "balance_loss_mlp": 1.04452896, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7290530974650873, + "language_loss": 0.80863065, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.8304013, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4255, + "time_per_iteration": 2.4473092555999756 + }, + { + "auxiliary_loss_clip": 0.01133499, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.02681875, + "balance_loss_mlp": 1.04673088, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.512169748685899, + "language_loss": 0.85572308, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87749302, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4256, + "time_per_iteration": 2.500788927078247 + }, + { + "auxiliary_loss_clip": 0.01136428, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.04482555, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 4.026866255210063, + "language_loss": 0.74821836, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77006626, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4257, + "time_per_iteration": 2.4511358737945557 + }, + { + "auxiliary_loss_clip": 0.01040508, + "auxiliary_loss_mlp": 0.01009541, + "balance_loss_clip": 1.00763345, + "balance_loss_mlp": 1.01154876, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8061088541165783, + "language_loss": 0.65227318, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67277366, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.2890625, + "step": 4258, + "time_per_iteration": 2.9953789710998535 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.04548264, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.622828615893818, + "language_loss": 0.7647177, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78641111, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.859375, + "step": 4259, + "time_per_iteration": 2.5079360008239746 + }, + { + "auxiliary_loss_clip": 0.01038142, + "auxiliary_loss_mlp": 0.01004188, + "balance_loss_clip": 1.00237584, + "balance_loss_mlp": 1.0093925, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7946947905759578, + "language_loss": 0.58501768, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60544097, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.28710938, + "step": 4260, + "time_per_iteration": 4.636982202529907 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.02759969, + "balance_loss_mlp": 1.04300261, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8728817118968701, + "language_loss": 0.76659095, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.7883479, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4261, + "time_per_iteration": 3.974956750869751 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.02280843, + "balance_loss_mlp": 1.04460573, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.6516780840688012, + "language_loss": 0.8323037, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85399115, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4262, + "time_per_iteration": 2.5251948833465576 + }, + { + "auxiliary_loss_clip": 0.01136997, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.03123951, + "balance_loss_mlp": 1.04404712, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.7380780768968016, + "language_loss": 0.74153852, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76339698, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 4263, + "time_per_iteration": 2.42657208442688 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01051758, + "balance_loss_clip": 1.03533101, + "balance_loss_mlp": 1.04720163, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.7828084139599185, + "language_loss": 0.82793939, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84979165, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4264, + "time_per_iteration": 2.534097194671631 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.02506804, + "balance_loss_mlp": 1.04660988, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.7080317762970965, + "language_loss": 0.7443161, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76608008, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 4265, + "time_per_iteration": 2.51088809967041 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01679027, + "balance_loss_mlp": 1.0425024, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.644190377842657, + "language_loss": 0.8153013, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83692515, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4266, + "time_per_iteration": 2.4706335067749023 + }, + { + "auxiliary_loss_clip": 0.01128054, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.01909137, + "balance_loss_mlp": 1.04252076, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6333370834261398, + "language_loss": 0.79287028, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81450343, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4267, + "time_per_iteration": 2.4819366931915283 + }, + { + "auxiliary_loss_clip": 0.01127366, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.02480555, + "balance_loss_mlp": 1.04406714, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.7559000109968124, + "language_loss": 0.78708017, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.80876482, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4268, + "time_per_iteration": 2.4778378009796143 + }, + { + "auxiliary_loss_clip": 0.0113239, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.02198434, + "balance_loss_mlp": 1.04507172, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 2.2514359992660204, + "language_loss": 0.68120348, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70290613, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4269, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01134604, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.0249877, + "balance_loss_mlp": 1.04593778, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 4.018282830570473, + "language_loss": 0.78496158, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80672824, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4270, + "time_per_iteration": 2.418912172317505 + }, + { + "auxiliary_loss_clip": 0.01139603, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.04711556, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.0502449379686256, + "language_loss": 0.68136632, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70314038, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4271, + "time_per_iteration": 2.5410749912261963 + }, + { + "auxiliary_loss_clip": 0.01137314, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.0325973, + "balance_loss_mlp": 1.04592848, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 4.518410893879739, + "language_loss": 0.8741951, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8960675, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4272, + "time_per_iteration": 2.5022568702697754 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02734506, + "balance_loss_mlp": 1.04770613, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.953603621991432, + "language_loss": 0.81442308, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83624303, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4273, + "time_per_iteration": 2.453834295272827 + }, + { + "auxiliary_loss_clip": 0.01131691, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.02492929, + "balance_loss_mlp": 1.04724693, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.682161023261006, + "language_loss": 0.77215779, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79389334, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4274, + "time_per_iteration": 2.486238956451416 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02061856, + "balance_loss_mlp": 1.04450369, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.8548211040661395, + "language_loss": 0.8401829, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86185247, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4275, + "time_per_iteration": 2.5145719051361084 + }, + { + "auxiliary_loss_clip": 0.01133209, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.02078438, + "balance_loss_mlp": 1.04492021, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 3.0116628321367678, + "language_loss": 0.78124094, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80294812, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4276, + "time_per_iteration": 2.533989906311035 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.02723646, + "balance_loss_mlp": 1.04575086, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.750550841347414, + "language_loss": 0.79439288, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81616199, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4277, + "time_per_iteration": 2.5131442546844482 + }, + { + "auxiliary_loss_clip": 0.01134263, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.01981688, + "balance_loss_mlp": 1.04671657, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.0431628844466543, + "language_loss": 0.78804862, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80975372, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4278, + "time_per_iteration": 2.4813432693481445 + }, + { + "auxiliary_loss_clip": 0.01137794, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.02299643, + "balance_loss_mlp": 1.04657972, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.020871128069371, + "language_loss": 0.74624676, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76802039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4279, + "time_per_iteration": 2.4989213943481445 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.03472984, + "balance_loss_mlp": 1.04528475, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.295268067844067, + "language_loss": 0.85406947, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87595296, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4280, + "time_per_iteration": 2.479163408279419 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.02362585, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.2211313624852447, + "language_loss": 0.78780186, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80957377, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4281, + "time_per_iteration": 2.463003158569336 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02277303, + "balance_loss_mlp": 1.0472312, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.9444978312753, + "language_loss": 0.87356091, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89530122, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4282, + "time_per_iteration": 2.5049889087677 + }, + { + "auxiliary_loss_clip": 0.01137104, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04648709, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5754049466604292, + "language_loss": 0.70172656, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72352946, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 4283, + "time_per_iteration": 2.520315408706665 + }, + { + "auxiliary_loss_clip": 0.01132284, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.02698922, + "balance_loss_mlp": 1.04772711, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 2.712350413324169, + "language_loss": 0.80323613, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82498109, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 4284, + "time_per_iteration": 2.483292579650879 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.02538466, + "balance_loss_mlp": 1.04674387, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.1742402973432893, + "language_loss": 0.70485193, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72659695, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4285, + "time_per_iteration": 2.564211130142212 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.02282071, + "balance_loss_mlp": 1.04953337, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.328286971317511, + "language_loss": 0.58380014, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60555518, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87890625, + "step": 4286, + "time_per_iteration": 2.4425430297851562 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02702272, + "balance_loss_mlp": 1.04858327, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.6452331987585218, + "language_loss": 0.64191288, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66374773, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 4287, + "time_per_iteration": 2.470015287399292 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04739881, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.0830358142366148, + "language_loss": 0.72029591, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74209672, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4288, + "time_per_iteration": 2.4983417987823486 + }, + { + "auxiliary_loss_clip": 0.01135736, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.02263355, + "balance_loss_mlp": 1.04882312, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.9870049696680936, + "language_loss": 0.76965904, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79140055, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4289, + "time_per_iteration": 2.4997475147247314 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02311635, + "balance_loss_mlp": 1.04562807, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.946897603323323, + "language_loss": 0.85123539, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87298238, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4290, + "time_per_iteration": 2.454871416091919 + }, + { + "auxiliary_loss_clip": 0.01140117, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.03159952, + "balance_loss_mlp": 1.04959655, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.195715426849753, + "language_loss": 0.72170424, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74361074, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4291, + "time_per_iteration": 2.4512693881988525 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02436781, + "balance_loss_mlp": 1.05002344, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4805881311796423, + "language_loss": 0.80718195, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82901633, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4292, + "time_per_iteration": 2.469034433364868 + }, + { + "auxiliary_loss_clip": 0.01141659, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02767503, + "balance_loss_mlp": 1.05171072, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 7.501455001056755, + "language_loss": 0.67646754, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69833219, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4293, + "time_per_iteration": 2.5785787105560303 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.02660704, + "balance_loss_mlp": 1.04503, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.9136357435420137, + "language_loss": 0.75409257, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77581787, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4294, + "time_per_iteration": 2.5044636726379395 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.02974749, + "balance_loss_mlp": 1.04808116, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.168244565891273, + "language_loss": 0.81049722, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83233249, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4295, + "time_per_iteration": 2.5875558853149414 + }, + { + "auxiliary_loss_clip": 0.01140472, + "auxiliary_loss_mlp": 0.01046123, + "balance_loss_clip": 1.02797985, + "balance_loss_mlp": 1.04796624, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.1973562505628026, + "language_loss": 0.72515166, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74701762, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.92578125, + "step": 4296, + "time_per_iteration": 2.535693407058716 + }, + { + "auxiliary_loss_clip": 0.01138613, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.02322531, + "balance_loss_mlp": 1.04918242, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.8330269406357795, + "language_loss": 0.86766148, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88944662, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4297, + "time_per_iteration": 2.5001306533813477 + }, + { + "auxiliary_loss_clip": 0.01137068, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.03208232, + "balance_loss_mlp": 1.04755223, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 2.2622150737063955, + "language_loss": 0.84706259, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86891592, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4298, + "time_per_iteration": 2.489917278289795 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02069676, + "balance_loss_mlp": 1.04739285, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.0676974538336266, + "language_loss": 0.83596241, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85770899, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4299, + "time_per_iteration": 2.4274845123291016 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.01985788, + "balance_loss_mlp": 1.04795814, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.477231855960524, + "language_loss": 0.82685435, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84856081, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4300, + "time_per_iteration": 2.4730846881866455 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.02492332, + "balance_loss_mlp": 1.04620934, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.2046546957653077, + "language_loss": 0.67186987, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69365752, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 4301, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.02191997, + "balance_loss_mlp": 1.04805672, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.459016606739088, + "language_loss": 0.80929118, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83110034, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 4302, + "time_per_iteration": 5.438407897949219 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.02535129, + "balance_loss_mlp": 1.04789591, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 2.9925401825996545, + "language_loss": 0.92246419, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94426608, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4303, + "time_per_iteration": 2.514573574066162 + }, + { + "auxiliary_loss_clip": 0.01139372, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.020046, + "balance_loss_mlp": 1.04932761, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8070234866344623, + "language_loss": 0.67034984, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69210964, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4304, + "time_per_iteration": 2.540682315826416 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03355694, + "balance_loss_mlp": 1.04595923, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.73594521825367, + "language_loss": 0.72829735, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75018799, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4305, + "time_per_iteration": 2.580801248550415 + }, + { + "auxiliary_loss_clip": 0.01138565, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 1.04731607, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.196623082948333, + "language_loss": 0.75595653, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77775478, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4306, + "time_per_iteration": 2.44267201423645 + }, + { + "auxiliary_loss_clip": 0.01045399, + "auxiliary_loss_mlp": 0.01003539, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.01567113, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8506593293873899, + "language_loss": 0.5717386, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59222794, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.296875, + "step": 4307, + "time_per_iteration": 3.0457189083099365 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.02389181, + "balance_loss_mlp": 1.04729199, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.7695447826328226, + "language_loss": 0.71543598, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73719311, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4308, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.0113812, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.02439809, + "balance_loss_mlp": 1.04625905, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 2.097007373458932, + "language_loss": 0.84195936, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86375141, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4309, + "time_per_iteration": 2.458937883377075 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02609062, + "balance_loss_mlp": 1.04893243, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.520786669442297, + "language_loss": 0.8451637, + "learning_rate": 3.474075855228966e-06, + "loss": 0.8669641, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4310, + "time_per_iteration": 2.453946828842163 + }, + { + "auxiliary_loss_clip": 0.0113925, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.02706194, + "balance_loss_mlp": 1.04705715, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.3904067628525305, + "language_loss": 0.77478111, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79660702, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 4311, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.0248189, + "balance_loss_mlp": 1.04691362, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 3.1447136536803852, + "language_loss": 0.72220832, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74400491, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 4312, + "time_per_iteration": 2.5275332927703857 + }, + { + "auxiliary_loss_clip": 0.01134993, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.02434921, + "balance_loss_mlp": 1.04480851, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 2.2264539824076683, + "language_loss": 0.69908661, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72084355, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4313, + "time_per_iteration": 2.479011058807373 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02800131, + "balance_loss_mlp": 1.04467726, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.7186396349483555, + "language_loss": 0.80486274, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82663202, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4314, + "time_per_iteration": 2.443934679031372 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.03030992, + "balance_loss_mlp": 1.04506671, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.0498851814527863, + "language_loss": 0.6687156, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69057429, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 4315, + "time_per_iteration": 2.5375983715057373 + }, + { + "auxiliary_loss_clip": 0.01132586, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.04426146, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5303062780919283, + "language_loss": 0.7911852, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81291974, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4316, + "time_per_iteration": 2.448997735977173 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.02333546, + "balance_loss_mlp": 1.0446136, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.687308210321376, + "language_loss": 0.77601087, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79777247, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4317, + "time_per_iteration": 2.5545339584350586 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.03160882, + "balance_loss_mlp": 1.04599953, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.5535432929686883, + "language_loss": 0.77773315, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79958701, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4318, + "time_per_iteration": 2.450573682785034 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02717471, + "balance_loss_mlp": 1.04450393, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.801084946435003, + "language_loss": 0.76197278, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78376144, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4319, + "time_per_iteration": 2.5243709087371826 + }, + { + "auxiliary_loss_clip": 0.01131874, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04500592, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8511829127720039, + "language_loss": 0.76338619, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78507876, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4320, + "time_per_iteration": 2.4792070388793945 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.02205038, + "balance_loss_mlp": 1.04641151, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.7592602092397844, + "language_loss": 0.71143925, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73317981, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4321, + "time_per_iteration": 2.5381112098693848 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.02813125, + "balance_loss_mlp": 1.04517424, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.037757848326605, + "language_loss": 0.74483943, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76666641, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4322, + "time_per_iteration": 2.4379777908325195 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.03059244, + "balance_loss_mlp": 1.04368353, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.9467125010752846, + "language_loss": 0.73674595, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75856531, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4323, + "time_per_iteration": 2.517399549484253 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.0263952, + "balance_loss_mlp": 1.04524922, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.197207179409235, + "language_loss": 0.6710211, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69287789, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 4324, + "time_per_iteration": 2.478419303894043 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.02839708, + "balance_loss_mlp": 1.04456055, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.3342631450552838, + "language_loss": 0.70809424, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72985667, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8828125, + "step": 4325, + "time_per_iteration": 2.5444648265838623 + }, + { + "auxiliary_loss_clip": 0.01133012, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02139568, + "balance_loss_mlp": 1.04295206, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 2.476658211689484, + "language_loss": 0.73041123, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7521174, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4326, + "time_per_iteration": 2.5281147956848145 + }, + { + "auxiliary_loss_clip": 0.01127256, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.02424729, + "balance_loss_mlp": 1.04237306, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.820673081097861, + "language_loss": 0.8661378, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88779688, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 4327, + "time_per_iteration": 2.4929087162017822 + }, + { + "auxiliary_loss_clip": 0.01138344, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.03121042, + "balance_loss_mlp": 1.04679346, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 2.002075266566112, + "language_loss": 0.80111909, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82299662, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 4328, + "time_per_iteration": 2.451131582260132 + }, + { + "auxiliary_loss_clip": 0.0112995, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04219353, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.514483384647774, + "language_loss": 0.87428784, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89598739, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4329, + "time_per_iteration": 2.522368907928467 + }, + { + "auxiliary_loss_clip": 0.01132983, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.04585731, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.096665977126354, + "language_loss": 0.77746803, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79917884, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4330, + "time_per_iteration": 2.4771482944488525 + }, + { + "auxiliary_loss_clip": 0.01134796, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.04525268, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 2.4595446714184654, + "language_loss": 0.75248575, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77430975, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4331, + "time_per_iteration": 2.5284199714660645 + }, + { + "auxiliary_loss_clip": 0.01137533, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02158558, + "balance_loss_mlp": 1.05026567, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3491085383994963, + "language_loss": 0.69003588, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71178281, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4332, + "time_per_iteration": 2.476125478744507 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.02453184, + "balance_loss_mlp": 1.04542089, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.3270567941112854, + "language_loss": 0.79674375, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81851673, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91015625, + "step": 4333, + "time_per_iteration": 2.5234756469726562 + }, + { + "auxiliary_loss_clip": 0.01129676, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.02791548, + "balance_loss_mlp": 1.04336357, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.7608965931322442, + "language_loss": 0.80725265, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82898307, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4334, + "time_per_iteration": 2.4361026287078857 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02128482, + "balance_loss_mlp": 1.04452491, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.8337144126432974, + "language_loss": 0.80039275, + "learning_rate": 3.46747795800024e-06, + "loss": 0.822101, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4335, + "time_per_iteration": 2.5246174335479736 + }, + { + "auxiliary_loss_clip": 0.01043695, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.02024579, + "balance_loss_mlp": 1.01431763, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.849908687169067, + "language_loss": 0.60851145, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62916911, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.29296875, + "step": 4336, + "time_per_iteration": 3.0349080562591553 + }, + { + "auxiliary_loss_clip": 0.01136323, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_clip": 1.03172541, + "balance_loss_mlp": 1.04599738, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 6.860825703537795, + "language_loss": 0.77407634, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79591858, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 4337, + "time_per_iteration": 2.4549763202667236 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02755404, + "balance_loss_mlp": 1.04531193, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1553767319060646, + "language_loss": 0.74116468, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76296723, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4338, + "time_per_iteration": 2.4109654426574707 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.02495456, + "balance_loss_mlp": 1.0451895, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.414973208379154, + "language_loss": 0.80645537, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82825273, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 4339, + "time_per_iteration": 2.4671595096588135 + }, + { + "auxiliary_loss_clip": 0.01133141, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.02287948, + "balance_loss_mlp": 1.04559159, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5844023841754464, + "language_loss": 0.76694596, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78865802, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4340, + "time_per_iteration": 2.4803388118743896 + }, + { + "auxiliary_loss_clip": 0.01137352, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02185678, + "balance_loss_mlp": 1.04666209, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.5290989424491332, + "language_loss": 0.82436979, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84612167, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90625, + "step": 4341, + "time_per_iteration": 2.5263681411743164 + }, + { + "auxiliary_loss_clip": 0.01134552, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.04563117, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.4125290221035773, + "language_loss": 0.76542389, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78716314, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4342, + "time_per_iteration": 2.5043585300445557 + }, + { + "auxiliary_loss_clip": 0.01132446, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.01570523, + "balance_loss_mlp": 1.04324019, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8018778201456855, + "language_loss": 0.66747689, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68912935, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 4343, + "time_per_iteration": 2.6470234394073486 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02467799, + "balance_loss_mlp": 1.04494977, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0339901471708646, + "language_loss": 0.73817015, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75994843, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4344, + "time_per_iteration": 5.431513071060181 + }, + { + "auxiliary_loss_clip": 0.0113578, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.02257776, + "balance_loss_mlp": 1.04692459, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 3.7636245605224072, + "language_loss": 0.86394477, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88568532, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 4345, + "time_per_iteration": 2.4908552169799805 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02422452, + "balance_loss_mlp": 1.04427588, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.88977116996907, + "language_loss": 0.7612443, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78293997, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.85546875, + "step": 4346, + "time_per_iteration": 2.4966983795166016 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02547407, + "balance_loss_mlp": 1.04483962, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5621162347417301, + "language_loss": 0.75868237, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78042835, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4347, + "time_per_iteration": 2.5392181873321533 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04549503, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4125954345922265, + "language_loss": 0.73354399, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75522006, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4348, + "time_per_iteration": 2.5206878185272217 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.02286005, + "balance_loss_mlp": 1.04503882, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.8182616406273437, + "language_loss": 0.91063923, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93238091, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4349, + "time_per_iteration": 2.526134967803955 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.02663279, + "balance_loss_mlp": 1.0461632, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7312169360414529, + "language_loss": 0.79879099, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82054067, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4350, + "time_per_iteration": 2.4420506954193115 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.02388072, + "balance_loss_mlp": 1.04430401, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8647374515536046, + "language_loss": 0.62139511, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64308536, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4351, + "time_per_iteration": 2.4613640308380127 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02241123, + "balance_loss_mlp": 1.04469466, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.09308554357217, + "language_loss": 0.83596927, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85769767, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4352, + "time_per_iteration": 2.4712979793548584 + }, + { + "auxiliary_loss_clip": 0.01045226, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 0.9986006, + "balance_loss_mlp": 1.01526213, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8010954727993301, + "language_loss": 0.70645392, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72690976, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.29882812, + "step": 4353, + "time_per_iteration": 3.026418447494507 + }, + { + "auxiliary_loss_clip": 0.01132608, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.03045464, + "balance_loss_mlp": 1.04494369, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.7700850953213416, + "language_loss": 0.77393121, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79573292, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4354, + "time_per_iteration": 2.535482883453369 + }, + { + "auxiliary_loss_clip": 0.01138552, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.02826262, + "balance_loss_mlp": 1.04513574, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.1625978203859826, + "language_loss": 0.68280292, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70463413, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 4355, + "time_per_iteration": 2.5276527404785156 + }, + { + "auxiliary_loss_clip": 0.01130838, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.02207148, + "balance_loss_mlp": 1.04375613, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9702640724114775, + "language_loss": 0.67509294, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69679523, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4356, + "time_per_iteration": 2.454436779022217 + }, + { + "auxiliary_loss_clip": 0.01043638, + "auxiliary_loss_mlp": 0.01003266, + "balance_loss_clip": 1.00139415, + "balance_loss_mlp": 1.01376009, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6781381277043278, + "language_loss": 0.53156137, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55203032, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.29882812, + "step": 4357, + "time_per_iteration": 2.99239444732666 + }, + { + "auxiliary_loss_clip": 0.01138081, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02955151, + "balance_loss_mlp": 1.04608119, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.843205511563007, + "language_loss": 0.84329486, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86513096, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.921875, + "step": 4358, + "time_per_iteration": 2.511441707611084 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02476251, + "balance_loss_mlp": 1.0450834, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.1805365254718367, + "language_loss": 0.67303276, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69484085, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4359, + "time_per_iteration": 2.5318756103515625 + }, + { + "auxiliary_loss_clip": 0.0113089, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02725601, + "balance_loss_mlp": 1.04242957, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.947910834650985, + "language_loss": 0.78673261, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80846429, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4360, + "time_per_iteration": 2.4551331996917725 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04250073, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.9921513845886445, + "language_loss": 0.68169516, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70338809, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4361, + "time_per_iteration": 2.57106351852417 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.04022598, + "balance_loss_mlp": 1.04400647, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.9312179198305752, + "language_loss": 0.84310883, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86503732, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4362, + "time_per_iteration": 2.430020570755005 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.02282345, + "balance_loss_mlp": 1.04637551, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.792780117353334, + "language_loss": 0.65294504, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67468411, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4363, + "time_per_iteration": 2.546393632888794 + }, + { + "auxiliary_loss_clip": 0.01042076, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.00252998, + "balance_loss_mlp": 1.0123173, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8867533167936222, + "language_loss": 0.61098528, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63144922, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.296875, + "step": 4364, + "time_per_iteration": 3.150812864303589 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.03358722, + "balance_loss_mlp": 1.0468297, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.424942653514092, + "language_loss": 0.71549827, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73739558, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4365, + "time_per_iteration": 2.493540048599243 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01045115, + "balance_loss_clip": 1.02917075, + "balance_loss_mlp": 1.04654169, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.8316261966241354, + "language_loss": 0.76925993, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79106045, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4366, + "time_per_iteration": 2.536853313446045 + }, + { + "auxiliary_loss_clip": 0.01135829, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.02684951, + "balance_loss_mlp": 1.04666197, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 2.2091260788228975, + "language_loss": 0.75838757, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78017008, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.890625, + "step": 4367, + "time_per_iteration": 2.4576163291931152 + }, + { + "auxiliary_loss_clip": 0.01131307, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.02454233, + "balance_loss_mlp": 1.04452682, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.1913456464974392, + "language_loss": 0.69633925, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71805596, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4368, + "time_per_iteration": 2.4301586151123047 + }, + { + "auxiliary_loss_clip": 0.01130278, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.02970243, + "balance_loss_mlp": 1.04319167, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.7035150195415922, + "language_loss": 0.78589904, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80766863, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8671875, + "step": 4369, + "time_per_iteration": 2.489316701889038 + }, + { + "auxiliary_loss_clip": 0.01132105, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.02653408, + "balance_loss_mlp": 1.04431546, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 2.0413446884893047, + "language_loss": 0.83486217, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85661036, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4370, + "time_per_iteration": 2.4422430992126465 + }, + { + "auxiliary_loss_clip": 0.01136913, + "auxiliary_loss_mlp": 0.01050187, + "balance_loss_clip": 1.03060055, + "balance_loss_mlp": 1.04530215, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 2.3340239620956287, + "language_loss": 0.70963454, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73150551, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9140625, + "step": 4371, + "time_per_iteration": 2.5099778175354004 + }, + { + "auxiliary_loss_clip": 0.01043374, + "auxiliary_loss_mlp": 0.00999769, + "balance_loss_clip": 0.99784929, + "balance_loss_mlp": 1.01338005, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7657034729714577, + "language_loss": 0.56477904, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58521044, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.30078125, + "step": 4372, + "time_per_iteration": 3.244558572769165 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.02283084, + "balance_loss_mlp": 1.04335582, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.7597219251079876, + "language_loss": 0.77415234, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79583991, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4373, + "time_per_iteration": 2.517784833908081 + }, + { + "auxiliary_loss_clip": 0.01131814, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.04454422, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 4.0873872332994905, + "language_loss": 0.71538949, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73712265, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4374, + "time_per_iteration": 2.442124605178833 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_clip": 1.02435732, + "balance_loss_mlp": 1.0458709, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 2.271567992891854, + "language_loss": 0.80945283, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83121061, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4375, + "time_per_iteration": 2.4889678955078125 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.0265336, + "balance_loss_mlp": 1.04366982, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.3689389683703, + "language_loss": 0.65721256, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.67893362, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4376, + "time_per_iteration": 2.563701629638672 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.0104592, + "balance_loss_clip": 1.02940989, + "balance_loss_mlp": 1.04445267, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.8646607453842572, + "language_loss": 0.69517326, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71697748, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4377, + "time_per_iteration": 2.486117124557495 + }, + { + "auxiliary_loss_clip": 0.01134243, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.02786613, + "balance_loss_mlp": 1.04500914, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.711844873276418, + "language_loss": 0.7866202, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.80840576, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4378, + "time_per_iteration": 2.7608227729797363 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.03000844, + "balance_loss_mlp": 1.04554546, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.6216377344963004, + "language_loss": 0.76320505, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78498781, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4379, + "time_per_iteration": 2.4329168796539307 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.04633284, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.3003567904549156, + "language_loss": 0.78237861, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.8041752, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.91796875, + "step": 4380, + "time_per_iteration": 2.5423548221588135 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02419698, + "balance_loss_mlp": 1.0444113, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.288842357619654, + "language_loss": 0.63811409, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65987766, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4381, + "time_per_iteration": 2.5096213817596436 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.02148831, + "balance_loss_mlp": 1.04359913, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8729093449566216, + "language_loss": 0.82822418, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84991652, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4382, + "time_per_iteration": 2.4691555500030518 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.02902842, + "balance_loss_mlp": 1.04550982, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.126733729537993, + "language_loss": 0.69686437, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71871686, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 4383, + "time_per_iteration": 2.5923891067504883 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.02732468, + "balance_loss_mlp": 1.04591441, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.929045699346076, + "language_loss": 0.69191134, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71369672, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 4384, + "time_per_iteration": 2.5067081451416016 + }, + { + "auxiliary_loss_clip": 0.01132133, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.02319217, + "balance_loss_mlp": 1.04400492, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.1647401570854075, + "language_loss": 0.6994158, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72113448, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4385, + "time_per_iteration": 4.062510251998901 + }, + { + "auxiliary_loss_clip": 0.01138578, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02551472, + "balance_loss_mlp": 1.04978371, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 2.0926426044309543, + "language_loss": 0.85188037, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87369245, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4386, + "time_per_iteration": 3.9604547023773193 + }, + { + "auxiliary_loss_clip": 0.0113699, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02487707, + "balance_loss_mlp": 1.04755282, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.2248904155103637, + "language_loss": 0.77169371, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79347688, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4387, + "time_per_iteration": 2.472367286682129 + }, + { + "auxiliary_loss_clip": 0.01137279, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0305903, + "balance_loss_mlp": 1.04989982, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 3.996041212149396, + "language_loss": 0.76269597, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78453362, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4388, + "time_per_iteration": 2.4858386516571045 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01924086, + "balance_loss_mlp": 1.04387724, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.9510825560869567, + "language_loss": 0.86210662, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88379163, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4389, + "time_per_iteration": 2.508162260055542 + }, + { + "auxiliary_loss_clip": 0.0104392, + "auxiliary_loss_mlp": 0.01009323, + "balance_loss_clip": 1.00736833, + "balance_loss_mlp": 1.01341343, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8096176904924934, + "language_loss": 0.60333931, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6238718, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3046875, + "step": 4390, + "time_per_iteration": 3.0593924522399902 + }, + { + "auxiliary_loss_clip": 0.01135834, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.02213633, + "balance_loss_mlp": 1.04522729, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.7836890720002585, + "language_loss": 0.77702433, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79876828, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4391, + "time_per_iteration": 2.5331051349639893 + }, + { + "auxiliary_loss_clip": 0.0104332, + "auxiliary_loss_mlp": 0.01003932, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.01322889, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9020745061185262, + "language_loss": 0.58752227, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60799479, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.30078125, + "step": 4392, + "time_per_iteration": 3.047438144683838 + }, + { + "auxiliary_loss_clip": 0.01140884, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.03039694, + "balance_loss_mlp": 1.04925656, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 2.5811541881681697, + "language_loss": 0.68459845, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70647496, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 4393, + "time_per_iteration": 2.5537288188934326 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.04662204, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.8702197697463565, + "language_loss": 0.83116519, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85297221, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.93359375, + "step": 4394, + "time_per_iteration": 2.421211004257202 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.02357125, + "balance_loss_mlp": 1.04951847, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.049654769643576, + "language_loss": 0.70211649, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72397399, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9375, + "step": 4395, + "time_per_iteration": 2.522111654281616 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.04784906, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.822626622734132, + "language_loss": 0.86866504, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89038229, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4396, + "time_per_iteration": 2.4450392723083496 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.02226114, + "balance_loss_mlp": 1.01312816, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7917805441344085, + "language_loss": 0.54999918, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57066846, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4397, + "time_per_iteration": 2.8438708782196045 + }, + { + "auxiliary_loss_clip": 0.01134821, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03567195, + "balance_loss_mlp": 1.04701614, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0493441687219724, + "language_loss": 0.77840483, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80027676, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4398, + "time_per_iteration": 2.562499523162842 + }, + { + "auxiliary_loss_clip": 0.01141073, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.02262306, + "balance_loss_mlp": 1.05005002, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 2.041566803030235, + "language_loss": 0.67037976, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69219166, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4399, + "time_per_iteration": 2.487778663635254 + }, + { + "auxiliary_loss_clip": 0.01128661, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02288818, + "balance_loss_mlp": 1.04565811, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 2.1160884119586303, + "language_loss": 0.86152196, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88318777, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4400, + "time_per_iteration": 2.4837841987609863 + }, + { + "auxiliary_loss_clip": 0.01138875, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.02188635, + "balance_loss_mlp": 1.04813862, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.751022626956878, + "language_loss": 0.75779396, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77957898, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4401, + "time_per_iteration": 2.548297166824341 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.03236771, + "balance_loss_mlp": 1.04606974, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.9215434150559794, + "language_loss": 0.88267732, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90456831, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4402, + "time_per_iteration": 2.4422647953033447 + }, + { + "auxiliary_loss_clip": 0.01135603, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.03151679, + "balance_loss_mlp": 1.04594266, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8196807161845878, + "language_loss": 0.78123331, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80306977, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4403, + "time_per_iteration": 2.587623357772827 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.04440784, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9946669841411302, + "language_loss": 0.87767446, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.89943182, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 4404, + "time_per_iteration": 2.492913246154785 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.04683399, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7395093434050468, + "language_loss": 0.7593658, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78111804, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 4405, + "time_per_iteration": 2.508970260620117 + }, + { + "auxiliary_loss_clip": 0.01138042, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02166891, + "balance_loss_mlp": 1.04870844, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9681610481113616, + "language_loss": 0.69979274, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72156149, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4406, + "time_per_iteration": 2.4548041820526123 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.03255999, + "balance_loss_mlp": 1.04781294, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7455123192469384, + "language_loss": 0.83764267, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85946929, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4407, + "time_per_iteration": 2.5359292030334473 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01566172, + "balance_loss_mlp": 1.04678226, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.7942044569518307, + "language_loss": 0.76068008, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78235412, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4408, + "time_per_iteration": 2.6124041080474854 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.04918611, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8724720588087471, + "language_loss": 0.70920485, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73091388, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4409, + "time_per_iteration": 2.6539366245269775 + }, + { + "auxiliary_loss_clip": 0.01136441, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.04666233, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.7884535623295956, + "language_loss": 0.73085511, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75258988, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 4410, + "time_per_iteration": 2.545083999633789 + }, + { + "auxiliary_loss_clip": 0.01139704, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.03218508, + "balance_loss_mlp": 1.04741001, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.9280641145018393, + "language_loss": 0.73272175, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75461018, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4411, + "time_per_iteration": 2.4818248748779297 + }, + { + "auxiliary_loss_clip": 0.01137094, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.04815316, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.073752901007566, + "language_loss": 0.82294202, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84474051, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.88671875, + "step": 4412, + "time_per_iteration": 2.56634521484375 + }, + { + "auxiliary_loss_clip": 0.01134293, + "auxiliary_loss_mlp": 0.01047936, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.04541004, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.721718037322793, + "language_loss": 0.74245501, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76427728, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4413, + "time_per_iteration": 2.4994029998779297 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.01659799, + "balance_loss_mlp": 1.0160358, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8825812455559224, + "language_loss": 0.56986731, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59051728, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.30664062, + "step": 4414, + "time_per_iteration": 2.9884986877441406 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02605712, + "balance_loss_mlp": 1.04307461, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8727128035200367, + "language_loss": 0.74535894, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76705366, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4415, + "time_per_iteration": 2.5531253814697266 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.04656732, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.3504707987247917, + "language_loss": 0.86662048, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88844568, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4416, + "time_per_iteration": 2.4751384258270264 + }, + { + "auxiliary_loss_clip": 0.0113975, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02439594, + "balance_loss_mlp": 1.0492208, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6281293305848954, + "language_loss": 0.76152384, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78334266, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4417, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01135215, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.02167702, + "balance_loss_mlp": 1.04778051, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.7397383944852411, + "language_loss": 0.79984045, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82159042, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4418, + "time_per_iteration": 2.539454460144043 + }, + { + "auxiliary_loss_clip": 0.01138688, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.02983057, + "balance_loss_mlp": 1.04861307, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.7780034581995965, + "language_loss": 0.67397833, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69583082, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 4419, + "time_per_iteration": 2.461444616317749 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_clip": 1.02739358, + "balance_loss_mlp": 1.04920876, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.097903587873874, + "language_loss": 0.79365611, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81550193, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8984375, + "step": 4420, + "time_per_iteration": 2.5908427238464355 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01049212, + "balance_loss_clip": 1.02990031, + "balance_loss_mlp": 1.0493983, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.1223383047232933, + "language_loss": 0.81612432, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83803296, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.921875, + "step": 4421, + "time_per_iteration": 2.4869320392608643 + }, + { + "auxiliary_loss_clip": 0.01134642, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02027202, + "balance_loss_mlp": 1.04734015, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5724937400793966, + "language_loss": 0.65278006, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67449689, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4422, + "time_per_iteration": 2.7370638847351074 + }, + { + "auxiliary_loss_clip": 0.01138513, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02965808, + "balance_loss_mlp": 1.04750621, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.411979213410041, + "language_loss": 0.73841226, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76025832, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 4423, + "time_per_iteration": 2.5510191917419434 + }, + { + "auxiliary_loss_clip": 0.01136367, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.03017163, + "balance_loss_mlp": 1.04504442, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6317340067044743, + "language_loss": 0.77703154, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79886127, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4424, + "time_per_iteration": 2.809495449066162 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.02675951, + "balance_loss_mlp": 1.04695249, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.064218808714238, + "language_loss": 0.79345673, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 4425, + "time_per_iteration": 2.48149037361145 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_clip": 1.03138816, + "balance_loss_mlp": 1.04685736, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.774406296589384, + "language_loss": 0.80463314, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82643557, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4426, + "time_per_iteration": 2.5968613624572754 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.03180957, + "balance_loss_mlp": 1.04982209, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.8207507571493768, + "language_loss": 0.77337295, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79524601, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4427, + "time_per_iteration": 4.045380353927612 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01760316, + "balance_loss_mlp": 1.04737306, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.8400253790543033, + "language_loss": 0.76800078, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78966737, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4428, + "time_per_iteration": 4.018831491470337 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.01858354, + "balance_loss_mlp": 1.04529297, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.9075878866801723, + "language_loss": 0.83010298, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8517977, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4429, + "time_per_iteration": 2.576535940170288 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.02644563, + "balance_loss_mlp": 1.04664719, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 3.2197583620662082, + "language_loss": 0.72143924, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74320537, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87109375, + "step": 4430, + "time_per_iteration": 2.5262365341186523 + }, + { + "auxiliary_loss_clip": 0.01136153, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.04667306, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.382555523964676, + "language_loss": 0.81635833, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83814788, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4431, + "time_per_iteration": 2.5135624408721924 + }, + { + "auxiliary_loss_clip": 0.01142285, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03360736, + "balance_loss_mlp": 1.04865289, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.34486467491615, + "language_loss": 0.76153386, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78346616, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 4432, + "time_per_iteration": 2.469515562057495 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.02711606, + "balance_loss_mlp": 1.04703665, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.148919041496035, + "language_loss": 0.82521772, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84703225, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4433, + "time_per_iteration": 2.540174961090088 + }, + { + "auxiliary_loss_clip": 0.01138849, + "auxiliary_loss_mlp": 0.01048222, + "balance_loss_clip": 1.03065097, + "balance_loss_mlp": 1.04955435, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.091984027516481, + "language_loss": 0.76638913, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78825986, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4434, + "time_per_iteration": 2.549769878387451 + }, + { + "auxiliary_loss_clip": 0.01133542, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.02770376, + "balance_loss_mlp": 1.04645348, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.251252650424801, + "language_loss": 0.82632279, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84808934, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4435, + "time_per_iteration": 2.5329744815826416 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.0105698, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.04742312, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.2611652281579397, + "language_loss": 0.87278962, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89476645, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9296875, + "step": 4436, + "time_per_iteration": 2.5375254154205322 + }, + { + "auxiliary_loss_clip": 0.01136328, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.02670658, + "balance_loss_mlp": 1.04566383, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4304916595737875, + "language_loss": 0.78941, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81120378, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4437, + "time_per_iteration": 2.591017007827759 + }, + { + "auxiliary_loss_clip": 0.01134502, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.02533066, + "balance_loss_mlp": 1.04595256, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.0731379310987412, + "language_loss": 0.63412011, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65588087, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4438, + "time_per_iteration": 2.6429452896118164 + }, + { + "auxiliary_loss_clip": 0.01137556, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.02125144, + "balance_loss_mlp": 1.04869223, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8133794638407341, + "language_loss": 0.75628942, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77803481, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4439, + "time_per_iteration": 2.5296032428741455 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02052629, + "balance_loss_mlp": 1.04913759, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.7792140134846064, + "language_loss": 0.71444011, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.7362318, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9140625, + "step": 4440, + "time_per_iteration": 2.5714335441589355 + }, + { + "auxiliary_loss_clip": 0.01139576, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.02757502, + "balance_loss_mlp": 1.04816949, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.8363906583736056, + "language_loss": 0.66291904, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 4441, + "time_per_iteration": 2.522589683532715 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.02654862, + "balance_loss_mlp": 1.04803538, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5597318548365904, + "language_loss": 0.76451373, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78633213, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.89453125, + "step": 4442, + "time_per_iteration": 2.5659492015838623 + }, + { + "auxiliary_loss_clip": 0.01060214, + "auxiliary_loss_mlp": 0.0100059, + "balance_loss_clip": 0.99855101, + "balance_loss_mlp": 1.02895594, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.912864167592289, + "language_loss": 0.61270142, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63330936, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.3125, + "step": 4443, + "time_per_iteration": 3.0256776809692383 + }, + { + "auxiliary_loss_clip": 0.01140806, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01968026, + "balance_loss_mlp": 1.0495882, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.5525166591100914, + "language_loss": 0.76200545, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78377306, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91015625, + "step": 4444, + "time_per_iteration": 2.7414674758911133 + }, + { + "auxiliary_loss_clip": 0.0114013, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.04932773, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 3.16165776963455, + "language_loss": 0.80212528, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82393491, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4445, + "time_per_iteration": 2.5349111557006836 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.02232134, + "balance_loss_mlp": 1.04797101, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.3952290716593825, + "language_loss": 0.89144397, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 4446, + "time_per_iteration": 2.5512521266937256 + }, + { + "auxiliary_loss_clip": 0.01140462, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.0311892, + "balance_loss_mlp": 1.04977763, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.831363923725005, + "language_loss": 0.68259656, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70447719, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4447, + "time_per_iteration": 2.5752837657928467 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.04972827, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9862084341014827, + "language_loss": 0.82976532, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85157394, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4448, + "time_per_iteration": 2.6524059772491455 + }, + { + "auxiliary_loss_clip": 0.01137667, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.03110301, + "balance_loss_mlp": 1.04973495, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 2.185461436072074, + "language_loss": 0.84288895, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86475068, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87890625, + "step": 4449, + "time_per_iteration": 2.5167598724365234 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02892506, + "balance_loss_mlp": 1.05114913, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.9936425417360089, + "language_loss": 0.84260273, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86456501, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.97265625, + "step": 4450, + "time_per_iteration": 2.555941343307495 + }, + { + "auxiliary_loss_clip": 0.01133946, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.04674196, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.900524277018137, + "language_loss": 0.81065774, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83240664, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4451, + "time_per_iteration": 2.5289859771728516 + }, + { + "auxiliary_loss_clip": 0.01140947, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.02593148, + "balance_loss_mlp": 1.05186319, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8040621200757803, + "language_loss": 0.86401796, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88584578, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4452, + "time_per_iteration": 2.617918014526367 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.02055311, + "balance_loss_mlp": 1.05132198, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.9731948573099198, + "language_loss": 0.83129871, + "learning_rate": 3.435869031622194e-06, + "loss": 0.8531099, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4453, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.0113897, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.02936745, + "balance_loss_mlp": 1.04995108, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.62656613015929, + "language_loss": 0.79744816, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81930768, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4454, + "time_per_iteration": 2.537853717803955 + }, + { + "auxiliary_loss_clip": 0.01141821, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04989707, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.7640316216704761, + "language_loss": 0.7215519, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74339664, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4455, + "time_per_iteration": 2.5023562908172607 + }, + { + "auxiliary_loss_clip": 0.01137457, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02406991, + "balance_loss_mlp": 1.05066276, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5496021720121687, + "language_loss": 0.74044335, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76221603, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4456, + "time_per_iteration": 2.487581729888916 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.02121687, + "balance_loss_mlp": 1.04937947, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.2089309948453697, + "language_loss": 0.70965469, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73145425, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4457, + "time_per_iteration": 2.4584691524505615 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.05237103, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.29797460876898, + "language_loss": 0.79029202, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81216174, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 4458, + "time_per_iteration": 2.6079578399658203 + }, + { + "auxiliary_loss_clip": 0.01052787, + "auxiliary_loss_mlp": 0.01006207, + "balance_loss_clip": 1.00439513, + "balance_loss_mlp": 1.02259135, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8640508796264214, + "language_loss": 0.58716619, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60775614, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.30078125, + "step": 4459, + "time_per_iteration": 3.0725412368774414 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.02444053, + "balance_loss_mlp": 1.04671741, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.0778557825519055, + "language_loss": 0.85224575, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87398744, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4460, + "time_per_iteration": 2.483299732208252 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.02653205, + "balance_loss_mlp": 1.04752469, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.805871571962145, + "language_loss": 0.68256581, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70435691, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 4461, + "time_per_iteration": 2.439304828643799 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_clip": 1.03198409, + "balance_loss_mlp": 1.0470686, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5557483279788171, + "language_loss": 0.67342007, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69526774, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4462, + "time_per_iteration": 2.5081140995025635 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.02413619, + "balance_loss_mlp": 1.04865909, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.8707784514564991, + "language_loss": 0.6927141, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71449935, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4463, + "time_per_iteration": 2.5280556678771973 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02414584, + "balance_loss_mlp": 1.04812574, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4976114648735304, + "language_loss": 0.77389008, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79570508, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4464, + "time_per_iteration": 2.469650983810425 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.02215123, + "balance_loss_mlp": 1.0458076, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.844577670487785, + "language_loss": 0.70796561, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72970498, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4465, + "time_per_iteration": 2.5685060024261475 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.0100238, + "balance_loss_clip": 1.00067484, + "balance_loss_mlp": 1.01956284, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6800540965400289, + "language_loss": 0.53096056, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55148005, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.30078125, + "step": 4466, + "time_per_iteration": 3.2327654361724854 + }, + { + "auxiliary_loss_clip": 0.01133624, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.02219653, + "balance_loss_mlp": 1.04600596, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0764143418179213, + "language_loss": 0.7343837, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75611472, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4467, + "time_per_iteration": 2.5052013397216797 + }, + { + "auxiliary_loss_clip": 0.01138792, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.0278548, + "balance_loss_mlp": 1.04801464, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.5834152956256555, + "language_loss": 0.80703115, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82887346, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4468, + "time_per_iteration": 2.4547622203826904 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.01005617, + "balance_loss_clip": 1.00407946, + "balance_loss_mlp": 1.01768315, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8449159500606429, + "language_loss": 0.59532088, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61585438, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.30078125, + "step": 4469, + "time_per_iteration": 4.6310715675354 + }, + { + "auxiliary_loss_clip": 0.01137988, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.02687383, + "balance_loss_mlp": 1.04844749, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.3316897890333954, + "language_loss": 0.81785607, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83968771, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4470, + "time_per_iteration": 2.5501935482025146 + }, + { + "auxiliary_loss_clip": 0.01129268, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.02091098, + "balance_loss_mlp": 1.04484963, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6194658793917844, + "language_loss": 0.82648492, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84815365, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 4471, + "time_per_iteration": 2.559220552444458 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02042806, + "balance_loss_mlp": 1.04853129, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 8.458966217412893, + "language_loss": 0.69382554, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71553975, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 4472, + "time_per_iteration": 2.561326742172241 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02045035, + "balance_loss_mlp": 1.04783702, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.596928542569954, + "language_loss": 0.67870784, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70042771, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4473, + "time_per_iteration": 2.5437636375427246 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.02844238, + "balance_loss_mlp": 1.04768729, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.8504576821316179, + "language_loss": 0.82971931, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85149777, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4474, + "time_per_iteration": 2.474095582962036 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01046818, + "balance_loss_clip": 1.03042698, + "balance_loss_mlp": 1.04697323, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.0689967373005977, + "language_loss": 0.70303237, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72482622, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.85546875, + "step": 4475, + "time_per_iteration": 2.4865996837615967 + }, + { + "auxiliary_loss_clip": 0.01135068, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.02335167, + "balance_loss_mlp": 1.04614162, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7721029234489851, + "language_loss": 0.73711979, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75887156, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.890625, + "step": 4476, + "time_per_iteration": 2.477308988571167 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.02456927, + "balance_loss_mlp": 1.04561102, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.720914514753409, + "language_loss": 0.80110955, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8228178, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4477, + "time_per_iteration": 2.497809648513794 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.04442573, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.9038830637231319, + "language_loss": 0.64580482, + "learning_rate": 3.429074332770984e-06, + "loss": 0.66756433, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4478, + "time_per_iteration": 2.6485564708709717 + }, + { + "auxiliary_loss_clip": 0.01130767, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.02876592, + "balance_loss_mlp": 1.04380882, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8571100614964546, + "language_loss": 0.80653036, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82828909, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4479, + "time_per_iteration": 2.4851014614105225 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01043964, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04611528, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.4630797167742458, + "language_loss": 0.80834484, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83014214, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4480, + "time_per_iteration": 2.490147590637207 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.02066684, + "balance_loss_mlp": 1.04153395, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.7677898796301312, + "language_loss": 0.77612787, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79773796, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 4481, + "time_per_iteration": 2.4699158668518066 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01044694, + "balance_loss_clip": 1.02737296, + "balance_loss_mlp": 1.04591584, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 2.5981026313468525, + "language_loss": 0.74701524, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76880491, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4482, + "time_per_iteration": 2.556087017059326 + }, + { + "auxiliary_loss_clip": 0.01135034, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02198792, + "balance_loss_mlp": 1.04693186, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.852738059166697, + "language_loss": 0.72176206, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74350333, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4483, + "time_per_iteration": 2.4762344360351562 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.02635717, + "balance_loss_mlp": 1.04290676, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.626283812761087, + "language_loss": 0.87107188, + "learning_rate": 3.427438559239605e-06, + "loss": 0.8928411, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4484, + "time_per_iteration": 2.486185073852539 + }, + { + "auxiliary_loss_clip": 0.01131969, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02447212, + "balance_loss_mlp": 1.04373026, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.901905407661022, + "language_loss": 0.66389644, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68561947, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4485, + "time_per_iteration": 2.5674586296081543 + }, + { + "auxiliary_loss_clip": 0.01133447, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02371132, + "balance_loss_mlp": 1.0445261, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.8933932068842783, + "language_loss": 0.72378826, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74552536, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4486, + "time_per_iteration": 2.471036434173584 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.02257311, + "balance_loss_mlp": 1.04809284, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.8546648123058087, + "language_loss": 0.83810318, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.85986561, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 4487, + "time_per_iteration": 2.4867916107177734 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02749646, + "balance_loss_mlp": 1.0477773, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.2079504028023598, + "language_loss": 0.71220767, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73403245, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4488, + "time_per_iteration": 2.5174567699432373 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.02851868, + "balance_loss_mlp": 1.04792523, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6338784898376273, + "language_loss": 0.83736706, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85919023, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4489, + "time_per_iteration": 2.5314295291900635 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_clip": 1.03696203, + "balance_loss_mlp": 1.04693484, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.5551945574509176, + "language_loss": 0.89805245, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.91996753, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4490, + "time_per_iteration": 2.4975826740264893 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02741122, + "balance_loss_mlp": 1.04349554, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.8455290723250308, + "language_loss": 0.73354411, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75525427, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4491, + "time_per_iteration": 2.6303470134735107 + }, + { + "auxiliary_loss_clip": 0.01138617, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.02613568, + "balance_loss_mlp": 1.04974079, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 3.089516252272487, + "language_loss": 0.74379975, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7656163, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4492, + "time_per_iteration": 2.5124619007110596 + }, + { + "auxiliary_loss_clip": 0.01133231, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_clip": 1.0241406, + "balance_loss_mlp": 1.04671812, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.896651323252439, + "language_loss": 0.88740528, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.90913987, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4493, + "time_per_iteration": 2.480473756790161 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.02564538, + "balance_loss_mlp": 1.04676843, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.468971775969503, + "language_loss": 0.70976114, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73151839, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4494, + "time_per_iteration": 2.5703446865081787 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01920152, + "balance_loss_mlp": 1.04545951, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.0322990364449325, + "language_loss": 0.86294192, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88457918, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4495, + "time_per_iteration": 2.5428457260131836 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.02968764, + "balance_loss_mlp": 1.04731214, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.8698467905293557, + "language_loss": 0.76562083, + "learning_rate": 3.424161168522959e-06, + "loss": 0.7874167, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4496, + "time_per_iteration": 2.5074446201324463 + }, + { + "auxiliary_loss_clip": 0.01048323, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.01042128, + "balance_loss_mlp": 1.01925802, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7221920911850954, + "language_loss": 0.50221699, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52282125, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2890625, + "step": 4497, + "time_per_iteration": 3.110724687576294 + }, + { + "auxiliary_loss_clip": 0.01137008, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.03011322, + "balance_loss_mlp": 1.05020094, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.6519561002314052, + "language_loss": 0.72420043, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74602675, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4498, + "time_per_iteration": 2.522507429122925 + }, + { + "auxiliary_loss_clip": 0.01047265, + "auxiliary_loss_mlp": 0.0100549, + "balance_loss_clip": 1.0038569, + "balance_loss_mlp": 1.0182879, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7584910907853958, + "language_loss": 0.59222841, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61275595, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2890625, + "step": 4499, + "time_per_iteration": 3.1193060874938965 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02085209, + "balance_loss_mlp": 1.04637063, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.0468109740969576, + "language_loss": 0.7361812, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75787735, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4500, + "time_per_iteration": 2.5073533058166504 + }, + { + "auxiliary_loss_clip": 0.01130893, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04379177, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.2528800155878765, + "language_loss": 0.80392325, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82567519, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4501, + "time_per_iteration": 2.4665989875793457 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0278666, + "balance_loss_mlp": 1.04683352, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.9148884605164396, + "language_loss": 0.72832727, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75011796, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4502, + "time_per_iteration": 2.511070489883423 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0209558, + "balance_loss_mlp": 1.04282784, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.724044037192685, + "language_loss": 0.68474984, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70647895, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 4503, + "time_per_iteration": 2.6554527282714844 + }, + { + "auxiliary_loss_clip": 0.01133759, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.02425468, + "balance_loss_mlp": 1.04659927, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.0245220791315655, + "language_loss": 0.68488902, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.7066294, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4504, + "time_per_iteration": 2.4813036918640137 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.02811766, + "balance_loss_mlp": 1.05043292, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.7616188880043606, + "language_loss": 0.75553012, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77731931, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4505, + "time_per_iteration": 2.482228994369507 + }, + { + "auxiliary_loss_clip": 0.01138199, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.03271127, + "balance_loss_mlp": 1.047171, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 1.8888030992954683, + "language_loss": 0.73508286, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4506, + "time_per_iteration": 2.493534803390503 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.02390218, + "balance_loss_mlp": 1.04818904, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.012438120988393, + "language_loss": 0.80958861, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83136857, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4507, + "time_per_iteration": 2.488477945327759 + }, + { + "auxiliary_loss_clip": 0.01046128, + "auxiliary_loss_mlp": 0.01011944, + "balance_loss_clip": 1.0102514, + "balance_loss_mlp": 1.01738429, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7384209784394716, + "language_loss": 0.50892401, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52950472, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.28710938, + "step": 4508, + "time_per_iteration": 3.005894660949707 + }, + { + "auxiliary_loss_clip": 0.01129132, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.02413416, + "balance_loss_mlp": 1.04509401, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 4.914093534195162, + "language_loss": 0.74373507, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76542306, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4509, + "time_per_iteration": 2.555645227432251 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.02418542, + "balance_loss_mlp": 1.04368544, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.7859895301291084, + "language_loss": 0.71706283, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73872381, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4510, + "time_per_iteration": 2.469756841659546 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01921451, + "balance_loss_mlp": 1.04728365, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 4.171230322312489, + "language_loss": 0.70698422, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72866517, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 4511, + "time_per_iteration": 3.9261832237243652 + }, + { + "auxiliary_loss_clip": 0.01133865, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.02660656, + "balance_loss_mlp": 1.04600286, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.0859148079323564, + "language_loss": 0.80823237, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83000243, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4512, + "time_per_iteration": 2.5112404823303223 + }, + { + "auxiliary_loss_clip": 0.01128916, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02108788, + "balance_loss_mlp": 1.04543233, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.880665339674376, + "language_loss": 0.80508482, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82672697, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8359375, + "step": 4513, + "time_per_iteration": 2.5550525188446045 + }, + { + "auxiliary_loss_clip": 0.01132709, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02668297, + "balance_loss_mlp": 1.04505134, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8883190176483522, + "language_loss": 0.88062817, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90237576, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4514, + "time_per_iteration": 2.4411823749542236 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.02475166, + "balance_loss_mlp": 1.04799736, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 2.468440108941068, + "language_loss": 0.92064375, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94239157, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4515, + "time_per_iteration": 2.507073402404785 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.03202391, + "balance_loss_mlp": 1.04952395, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.5869205534481017, + "language_loss": 0.73691195, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75882661, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9140625, + "step": 4516, + "time_per_iteration": 2.4427852630615234 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.02352417, + "balance_loss_mlp": 1.0466857, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 6.588152355110397, + "language_loss": 0.76239699, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78414017, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4517, + "time_per_iteration": 2.4891836643218994 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02201402, + "balance_loss_mlp": 1.0473218, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2012309941627066, + "language_loss": 0.76785064, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78957808, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4518, + "time_per_iteration": 2.503117561340332 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0241586, + "balance_loss_mlp": 1.04699707, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.6415373198141725, + "language_loss": 0.68314338, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7048738, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4519, + "time_per_iteration": 2.573230028152466 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04631245, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.6734918677628685, + "language_loss": 0.755759, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7774297, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4520, + "time_per_iteration": 2.535546064376831 + }, + { + "auxiliary_loss_clip": 0.01138716, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.02628946, + "balance_loss_mlp": 1.0501039, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.5201661256644523, + "language_loss": 0.76219606, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78402621, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4521, + "time_per_iteration": 2.491654396057129 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_clip": 1.03102481, + "balance_loss_mlp": 1.04803133, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.3970894391693967, + "language_loss": 0.75911158, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78095901, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4522, + "time_per_iteration": 2.471673011779785 + }, + { + "auxiliary_loss_clip": 0.01137707, + "auxiliary_loss_mlp": 0.0103702, + "balance_loss_clip": 1.02042627, + "balance_loss_mlp": 1.04873872, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 5.0666434109354075, + "language_loss": 0.72895801, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75070536, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4523, + "time_per_iteration": 2.5152363777160645 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02208352, + "balance_loss_mlp": 1.04448104, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.5338044020439772, + "language_loss": 0.74324989, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76492846, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 4524, + "time_per_iteration": 2.495253562927246 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02380621, + "balance_loss_mlp": 1.04772878, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 2.881398237919427, + "language_loss": 0.76651889, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78826964, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4525, + "time_per_iteration": 2.511634111404419 + }, + { + "auxiliary_loss_clip": 0.01131035, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_clip": 1.0334518, + "balance_loss_mlp": 1.04626358, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8599028556429251, + "language_loss": 0.81914634, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84094906, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4526, + "time_per_iteration": 2.495011568069458 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02770483, + "balance_loss_mlp": 1.0466783, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 3.313629745591453, + "language_loss": 0.77007318, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79190063, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4527, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.04637635, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1845797146290784, + "language_loss": 0.81825048, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84000921, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4528, + "time_per_iteration": 2.469916582107544 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.0273608, + "balance_loss_mlp": 1.04669189, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.6672454466706952, + "language_loss": 0.77123594, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79297841, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4529, + "time_per_iteration": 2.5379140377044678 + }, + { + "auxiliary_loss_clip": 0.01133862, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_clip": 1.02900243, + "balance_loss_mlp": 1.04580855, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.4153957329893228, + "language_loss": 0.8195889, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84136933, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4530, + "time_per_iteration": 2.5363659858703613 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.02010226, + "balance_loss_mlp": 1.04630172, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.1797176655983432, + "language_loss": 0.91650689, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93820047, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4531, + "time_per_iteration": 2.508429765701294 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.03159511, + "balance_loss_mlp": 1.04611766, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.532443443519077, + "language_loss": 0.76107466, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78290069, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.88671875, + "step": 4532, + "time_per_iteration": 2.499457359313965 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.01861846, + "balance_loss_mlp": 1.04643464, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 3.1928401528407746, + "language_loss": 0.89197671, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91362166, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4533, + "time_per_iteration": 2.508202075958252 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.02118278, + "balance_loss_mlp": 1.04587626, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.096334750916122, + "language_loss": 0.7125262, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73419642, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4534, + "time_per_iteration": 2.5111024379730225 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04651427, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.9613498766130548, + "language_loss": 0.91064882, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93239939, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4535, + "time_per_iteration": 2.5509371757507324 + }, + { + "auxiliary_loss_clip": 0.01138846, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.02262712, + "balance_loss_mlp": 1.05108571, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.5906078149456282, + "language_loss": 0.72618866, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74796963, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4536, + "time_per_iteration": 2.5106241703033447 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.02302337, + "balance_loss_mlp": 1.04617631, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.839444357786457, + "language_loss": 0.7144469, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73617887, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4537, + "time_per_iteration": 2.588439464569092 + }, + { + "auxiliary_loss_clip": 0.01132537, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.02363503, + "balance_loss_mlp": 1.04501796, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.431092364938405, + "language_loss": 0.78177559, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80350113, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4538, + "time_per_iteration": 2.438603639602661 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02609527, + "balance_loss_mlp": 1.04698634, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4794812227008705, + "language_loss": 0.90038705, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92214489, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4539, + "time_per_iteration": 2.5052709579467773 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_clip": 1.02414095, + "balance_loss_mlp": 1.04627967, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.0712338481270884, + "language_loss": 0.88711655, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90885842, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.859375, + "step": 4540, + "time_per_iteration": 2.457939624786377 + }, + { + "auxiliary_loss_clip": 0.01133918, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01771343, + "balance_loss_mlp": 1.04666936, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9363402300433894, + "language_loss": 0.81993663, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84161294, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4541, + "time_per_iteration": 2.461517333984375 + }, + { + "auxiliary_loss_clip": 0.01133224, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.02484596, + "balance_loss_mlp": 1.04623377, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8882731025231656, + "language_loss": 0.7925449, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81429487, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4542, + "time_per_iteration": 2.487905979156494 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.02441418, + "balance_loss_mlp": 1.04965162, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.197105758262293, + "language_loss": 0.89471424, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91648328, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4543, + "time_per_iteration": 2.4903039932250977 + }, + { + "auxiliary_loss_clip": 0.01137887, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.02955735, + "balance_loss_mlp": 1.04841042, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.084938235366164, + "language_loss": 0.63666493, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65851355, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.89453125, + "step": 4544, + "time_per_iteration": 2.4529080390930176 + }, + { + "auxiliary_loss_clip": 0.01137894, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.03043687, + "balance_loss_mlp": 1.05032265, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.5170655618085727, + "language_loss": 0.6996637, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72151983, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4545, + "time_per_iteration": 2.6089117527008057 + }, + { + "auxiliary_loss_clip": 0.01048793, + "auxiliary_loss_mlp": 0.01019944, + "balance_loss_clip": 1.01828671, + "balance_loss_mlp": 1.01938868, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7273987605446792, + "language_loss": 0.61571473, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63640207, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.29296875, + "step": 4546, + "time_per_iteration": 3.1125431060791016 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.03199649, + "balance_loss_mlp": 1.05012798, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.9369682323358774, + "language_loss": 0.64982706, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67167711, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4547, + "time_per_iteration": 2.497563600540161 + }, + { + "auxiliary_loss_clip": 0.01132998, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.0262835, + "balance_loss_mlp": 1.04765081, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.2377196076559183, + "language_loss": 0.77178854, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.7935344, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4548, + "time_per_iteration": 2.536813259124756 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.02854848, + "balance_loss_mlp": 1.04827595, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8894391736419274, + "language_loss": 0.82382214, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84559321, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 4549, + "time_per_iteration": 2.5156633853912354 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.02744722, + "balance_loss_mlp": 1.04482448, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.2209993145005793, + "language_loss": 0.70675868, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.72853404, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4550, + "time_per_iteration": 2.4510462284088135 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.03272784, + "balance_loss_mlp": 1.04789186, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.43111621366583, + "language_loss": 0.78738058, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80917984, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8515625, + "step": 4551, + "time_per_iteration": 2.470520496368408 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01051474, + "balance_loss_clip": 1.03548765, + "balance_loss_mlp": 1.04601097, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.681171335598487, + "language_loss": 0.70585275, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72769368, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4552, + "time_per_iteration": 3.9179859161376953 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02936506, + "balance_loss_mlp": 1.04864776, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.3865688662341005, + "language_loss": 0.71857619, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7403903, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 4553, + "time_per_iteration": 4.032766342163086 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.02853942, + "balance_loss_mlp": 1.04585433, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5870570208244068, + "language_loss": 0.59154749, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61331522, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4554, + "time_per_iteration": 2.549534320831299 + }, + { + "auxiliary_loss_clip": 0.01138763, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.04893517, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7727518382715788, + "language_loss": 0.73820007, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76000404, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4555, + "time_per_iteration": 2.5162432193756104 + }, + { + "auxiliary_loss_clip": 0.01136837, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02348125, + "balance_loss_mlp": 1.04923606, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.956724452661134, + "language_loss": 0.7785511, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80031419, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4556, + "time_per_iteration": 2.5205135345458984 + }, + { + "auxiliary_loss_clip": 0.01145391, + "auxiliary_loss_mlp": 0.0105386, + "balance_loss_clip": 1.03640783, + "balance_loss_mlp": 1.04952264, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.7956202604517526, + "language_loss": 0.82272434, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84471685, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9609375, + "step": 4557, + "time_per_iteration": 2.486485719680786 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.02691972, + "balance_loss_mlp": 1.04657316, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7971714372597054, + "language_loss": 0.72697943, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74873614, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4558, + "time_per_iteration": 2.5272727012634277 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.04504418, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.1318143008079686, + "language_loss": 0.6804775, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70228577, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4559, + "time_per_iteration": 2.4787509441375732 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02750015, + "balance_loss_mlp": 1.04517901, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 3.5500966853689673, + "language_loss": 0.71847737, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74022651, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4560, + "time_per_iteration": 2.490152359008789 + }, + { + "auxiliary_loss_clip": 0.0113572, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.02642488, + "balance_loss_mlp": 1.04779601, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.7948619898284635, + "language_loss": 0.80998009, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83175689, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 4561, + "time_per_iteration": 2.554872512817383 + }, + { + "auxiliary_loss_clip": 0.01136406, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.02600157, + "balance_loss_mlp": 1.04711854, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.7370289005889625, + "language_loss": 0.7531321, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77491164, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.890625, + "step": 4562, + "time_per_iteration": 2.4925429821014404 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02173424, + "balance_loss_mlp": 1.04701662, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.598166418515773, + "language_loss": 0.74503827, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76674795, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4563, + "time_per_iteration": 2.5514259338378906 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.02798915, + "balance_loss_mlp": 1.04708612, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.8271759108968861, + "language_loss": 0.62526429, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64710456, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4564, + "time_per_iteration": 2.479156494140625 + }, + { + "auxiliary_loss_clip": 0.01136574, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.02513587, + "balance_loss_mlp": 1.04808652, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.9245884320117708, + "language_loss": 0.78135669, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80314934, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4565, + "time_per_iteration": 2.714069366455078 + }, + { + "auxiliary_loss_clip": 0.01133378, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04669619, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.3377831889988547, + "language_loss": 0.68350124, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70523381, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4566, + "time_per_iteration": 2.469357967376709 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.03211665, + "balance_loss_mlp": 1.04901338, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.7938914020631171, + "language_loss": 0.60886472, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63066101, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.84375, + "step": 4567, + "time_per_iteration": 2.5856754779815674 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.02597237, + "balance_loss_mlp": 1.04754972, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.7650663548751138, + "language_loss": 0.82787997, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84965092, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.86328125, + "step": 4568, + "time_per_iteration": 2.476353168487549 + }, + { + "auxiliary_loss_clip": 0.0113839, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.01868141, + "balance_loss_mlp": 1.05012584, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.0155686346894415, + "language_loss": 0.68656778, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7082985, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4569, + "time_per_iteration": 2.5027451515197754 + }, + { + "auxiliary_loss_clip": 0.01133852, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.01947594, + "balance_loss_mlp": 1.0464673, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.247407128453888, + "language_loss": 0.71138883, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73308867, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4570, + "time_per_iteration": 2.466845750808716 + }, + { + "auxiliary_loss_clip": 0.0104735, + "auxiliary_loss_mlp": 0.01010434, + "balance_loss_clip": 1.00881279, + "balance_loss_mlp": 1.01781416, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7344992896847644, + "language_loss": 0.55774754, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57832539, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.296875, + "step": 4571, + "time_per_iteration": 3.192523241043091 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.02805328, + "balance_loss_mlp": 1.05039406, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 3.6883594473706482, + "language_loss": 0.77785081, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79969662, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 4572, + "time_per_iteration": 2.4755914211273193 + }, + { + "auxiliary_loss_clip": 0.01129408, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.02200866, + "balance_loss_mlp": 1.04679561, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.7042315716847805, + "language_loss": 0.81357443, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83523262, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4573, + "time_per_iteration": 2.540905237197876 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02038062, + "balance_loss_mlp": 1.04580402, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7927939239771835, + "language_loss": 0.79077196, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81243324, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83984375, + "step": 4574, + "time_per_iteration": 2.451016664505005 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.02889121, + "balance_loss_mlp": 1.04886127, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.232643844604772, + "language_loss": 0.74191976, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76372731, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4575, + "time_per_iteration": 2.5744149684906006 + }, + { + "auxiliary_loss_clip": 0.01131901, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.02353263, + "balance_loss_mlp": 1.04711711, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.8105072672356382, + "language_loss": 0.71877766, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7404812, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4576, + "time_per_iteration": 2.634305715560913 + }, + { + "auxiliary_loss_clip": 0.01134924, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.04823232, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.7690392048384511, + "language_loss": 0.73200434, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75377214, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4577, + "time_per_iteration": 2.5365946292877197 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02182376, + "balance_loss_mlp": 1.04931974, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 5.099060573221768, + "language_loss": 0.75943893, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78119946, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4578, + "time_per_iteration": 2.5121536254882812 + }, + { + "auxiliary_loss_clip": 0.01135832, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 1.0475626, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 2.3614458833507603, + "language_loss": 0.66299897, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68482184, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.8828125, + "step": 4579, + "time_per_iteration": 2.5445947647094727 + }, + { + "auxiliary_loss_clip": 0.01137742, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_clip": 1.03841197, + "balance_loss_mlp": 1.04862928, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.9384727438162337, + "language_loss": 0.8013078, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82324862, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4580, + "time_per_iteration": 2.4895741939544678 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.02581632, + "balance_loss_mlp": 1.05140579, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4702192551629332, + "language_loss": 0.67702103, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.698852, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.87109375, + "step": 4581, + "time_per_iteration": 2.5905539989471436 + }, + { + "auxiliary_loss_clip": 0.01137135, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.03024602, + "balance_loss_mlp": 1.04847145, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.8568978026073784, + "language_loss": 0.78120708, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80303848, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.88671875, + "step": 4582, + "time_per_iteration": 2.467210531234741 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.02537727, + "balance_loss_mlp": 1.04905152, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 2.5358708072067406, + "language_loss": 0.84527528, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86701977, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4583, + "time_per_iteration": 2.511457920074463 + }, + { + "auxiliary_loss_clip": 0.01138165, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.04905808, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 2.037294788318467, + "language_loss": 0.67308438, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69487947, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 4584, + "time_per_iteration": 2.5193254947662354 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.02645802, + "balance_loss_mlp": 1.04761386, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.737999785464117, + "language_loss": 0.77330101, + "learning_rate": 3.399612333050327e-06, + "loss": 0.7950455, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4585, + "time_per_iteration": 2.5393707752227783 + }, + { + "auxiliary_loss_clip": 0.0114213, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.02227354, + "balance_loss_mlp": 1.0530591, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.654604836009794, + "language_loss": 0.71854031, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74035466, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4586, + "time_per_iteration": 2.534979820251465 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.01996541, + "balance_loss_mlp": 1.04988265, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5248017982775213, + "language_loss": 0.80546939, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82719147, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4587, + "time_per_iteration": 2.5424065589904785 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.02356219, + "balance_loss_mlp": 1.04939508, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 2.136921841599078, + "language_loss": 0.82694119, + "learning_rate": 3.398777478523316e-06, + "loss": 0.8486715, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4588, + "time_per_iteration": 2.467923879623413 + }, + { + "auxiliary_loss_clip": 0.01132148, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.0228622, + "balance_loss_mlp": 1.04754925, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3980423175693042, + "language_loss": 0.75352502, + "learning_rate": 3.398499087583342e-06, + "loss": 0.775231, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4589, + "time_per_iteration": 2.535837173461914 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.04686022, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7720046877472317, + "language_loss": 0.88438141, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90612471, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8515625, + "step": 4590, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.01135164, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.02946877, + "balance_loss_mlp": 1.04789972, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6299691755620427, + "language_loss": 0.7129395, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73474467, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4591, + "time_per_iteration": 2.6112425327301025 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.03268862, + "balance_loss_mlp": 1.04847574, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8477043284936983, + "language_loss": 0.80190659, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82375979, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4592, + "time_per_iteration": 2.483894109725952 + }, + { + "auxiliary_loss_clip": 0.01048363, + "auxiliary_loss_mlp": 0.01005872, + "balance_loss_clip": 1.00416684, + "balance_loss_mlp": 1.0189774, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7716758671018623, + "language_loss": 0.61627746, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63681984, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.29296875, + "step": 4593, + "time_per_iteration": 3.0616326332092285 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01045597, + "balance_loss_clip": 1.02965856, + "balance_loss_mlp": 1.04938328, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.8877557773606983, + "language_loss": 0.77589142, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79769808, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4594, + "time_per_iteration": 4.043708086013794 + }, + { + "auxiliary_loss_clip": 0.01134807, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.01769793, + "balance_loss_mlp": 1.04991734, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.7681451067423914, + "language_loss": 0.91645586, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93813777, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4595, + "time_per_iteration": 3.973101854324341 + }, + { + "auxiliary_loss_clip": 0.01138485, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.03034675, + "balance_loss_mlp": 1.05122674, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7288059110569738, + "language_loss": 0.69101036, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71286798, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4596, + "time_per_iteration": 2.509199380874634 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.0249939, + "balance_loss_mlp": 1.04883707, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.01522187594791, + "language_loss": 0.63536406, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65717971, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9140625, + "step": 4597, + "time_per_iteration": 2.5944221019744873 + }, + { + "auxiliary_loss_clip": 0.01133967, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02629232, + "balance_loss_mlp": 1.05002272, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.1842552390134586, + "language_loss": 0.86612505, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88788456, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 4598, + "time_per_iteration": 2.4870996475219727 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02586544, + "balance_loss_mlp": 1.04847229, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.0694668215518996, + "language_loss": 0.79822165, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82000202, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4599, + "time_per_iteration": 2.4923834800720215 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.02965581, + "balance_loss_mlp": 1.04958415, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.9049018096400723, + "language_loss": 0.78357869, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80543864, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 4600, + "time_per_iteration": 2.496173620223999 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01046211, + "balance_loss_clip": 1.03007007, + "balance_loss_mlp": 1.04887986, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.9474431855639402, + "language_loss": 0.73361742, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75546992, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4601, + "time_per_iteration": 2.475919246673584 + }, + { + "auxiliary_loss_clip": 0.01135661, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02883482, + "balance_loss_mlp": 1.04879355, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.9134344988482315, + "language_loss": 0.79341739, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81522876, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4602, + "time_per_iteration": 2.511716842651367 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_clip": 1.03349614, + "balance_loss_mlp": 1.04920423, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.260382216699142, + "language_loss": 0.76887643, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79079276, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4603, + "time_per_iteration": 2.4667811393737793 + }, + { + "auxiliary_loss_clip": 0.0112975, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.0252831, + "balance_loss_mlp": 1.04736543, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.7288101924316703, + "language_loss": 0.81411278, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83581114, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 4604, + "time_per_iteration": 2.4586222171783447 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01516712, + "balance_loss_mlp": 1.04756212, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.7513688477785454, + "language_loss": 0.69912565, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72079831, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4605, + "time_per_iteration": 2.5138533115386963 + }, + { + "auxiliary_loss_clip": 0.01045677, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.00033224, + "balance_loss_mlp": 1.01580858, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7252635192802935, + "language_loss": 0.57151282, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59198874, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.296875, + "step": 4606, + "time_per_iteration": 3.184955596923828 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.03234947, + "balance_loss_mlp": 1.0481658, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.0717297663627825, + "language_loss": 0.69666946, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71853042, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4607, + "time_per_iteration": 2.5373001098632812 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.04721832, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 3.332085537790215, + "language_loss": 0.6982615, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71991682, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4608, + "time_per_iteration": 2.5396809577941895 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.02185202, + "balance_loss_mlp": 1.04715931, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.8242818121189563, + "language_loss": 0.72541273, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74715054, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 4609, + "time_per_iteration": 2.5383543968200684 + }, + { + "auxiliary_loss_clip": 0.01134274, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.03226149, + "balance_loss_mlp": 1.04623055, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.2576811985082967, + "language_loss": 0.84010947, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86194062, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4610, + "time_per_iteration": 2.4456827640533447 + }, + { + "auxiliary_loss_clip": 0.01141086, + "auxiliary_loss_mlp": 0.01051097, + "balance_loss_clip": 1.03344178, + "balance_loss_mlp": 1.04996872, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.570198611472629, + "language_loss": 0.68948054, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71140236, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9140625, + "step": 4611, + "time_per_iteration": 2.5342319011688232 + }, + { + "auxiliary_loss_clip": 0.01130823, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.0212357, + "balance_loss_mlp": 1.04892015, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.354058548299899, + "language_loss": 0.73450744, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75618565, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 4612, + "time_per_iteration": 2.472200632095337 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03378379, + "balance_loss_mlp": 1.04807258, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.166254073057622, + "language_loss": 0.66736221, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68924516, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4613, + "time_per_iteration": 2.5313632488250732 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.02721334, + "balance_loss_mlp": 1.04604864, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.8826548789840187, + "language_loss": 0.79452634, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4614, + "time_per_iteration": 2.4869751930236816 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.02684534, + "balance_loss_mlp": 1.0477469, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.573597172535304, + "language_loss": 0.80251336, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.8243044, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4615, + "time_per_iteration": 2.521615505218506 + }, + { + "auxiliary_loss_clip": 0.01135416, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.04627132, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.403593727320557, + "language_loss": 0.63926548, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66105354, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4616, + "time_per_iteration": 2.439410448074341 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02593398, + "balance_loss_mlp": 1.04661143, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.8467628074440183, + "language_loss": 0.82283223, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84458935, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4617, + "time_per_iteration": 2.49495792388916 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02587175, + "balance_loss_mlp": 1.04613662, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.1015666973838942, + "language_loss": 0.76835418, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79010552, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4618, + "time_per_iteration": 2.4882123470306396 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02501273, + "balance_loss_mlp": 1.0495801, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.6700061931983001, + "language_loss": 0.84698343, + "learning_rate": 3.390122747388459e-06, + "loss": 0.868756, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4619, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.02340662, + "balance_loss_mlp": 1.04523671, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4068177028172657, + "language_loss": 0.76720011, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78886724, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 4620, + "time_per_iteration": 2.4851698875427246 + }, + { + "auxiliary_loss_clip": 0.01130943, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02126586, + "balance_loss_mlp": 1.04728413, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.4956264272783084, + "language_loss": 0.78746819, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80914462, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4621, + "time_per_iteration": 2.543513774871826 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.04871762, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 1.9988562622182164, + "language_loss": 0.87520665, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89702857, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4622, + "time_per_iteration": 2.4818174839019775 + }, + { + "auxiliary_loss_clip": 0.01133366, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 1.04635906, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.9062066208333321, + "language_loss": 0.81094646, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83274019, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4623, + "time_per_iteration": 2.509218692779541 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.02357817, + "balance_loss_mlp": 1.04981863, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 1.93503772017796, + "language_loss": 0.81099498, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83275431, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 4624, + "time_per_iteration": 2.470041513442993 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.05091214, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 3.184384520938543, + "language_loss": 0.76514304, + "learning_rate": 3.388441777121191e-06, + "loss": 0.7869125, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84765625, + "step": 4625, + "time_per_iteration": 2.4965567588806152 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02439606, + "balance_loss_mlp": 1.04835677, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 2.5511238477154095, + "language_loss": 0.70091927, + "learning_rate": 3.388161431073511e-06, + "loss": 0.7226674, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 4626, + "time_per_iteration": 2.462007522583008 + }, + { + "auxiliary_loss_clip": 0.01142353, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.05177855, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.1576082410571704, + "language_loss": 0.92738312, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94917607, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4627, + "time_per_iteration": 2.5731146335601807 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.02640903, + "balance_loss_mlp": 1.04856014, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 4.44086075484182, + "language_loss": 0.85802954, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87982047, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4628, + "time_per_iteration": 2.502816915512085 + }, + { + "auxiliary_loss_clip": 0.01136721, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.02358079, + "balance_loss_mlp": 1.05035257, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.4685731198996637, + "language_loss": 0.79003006, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81178927, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4629, + "time_per_iteration": 2.544255256652832 + }, + { + "auxiliary_loss_clip": 0.01132865, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02419138, + "balance_loss_mlp": 1.05083036, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.4531737557023054, + "language_loss": 0.84322643, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86494124, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4630, + "time_per_iteration": 2.514413833618164 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02343392, + "balance_loss_mlp": 1.04834175, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.1800575167200997, + "language_loss": 0.80845618, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83021843, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4631, + "time_per_iteration": 2.530393123626709 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.05319762, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.154319840219951, + "language_loss": 0.71817827, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74009514, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4632, + "time_per_iteration": 2.504826307296753 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.02571952, + "balance_loss_mlp": 1.05240536, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8401586776799086, + "language_loss": 0.82518554, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84694839, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4633, + "time_per_iteration": 2.484894037246704 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.02622163, + "balance_loss_mlp": 1.05006409, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.740894494158558, + "language_loss": 0.87933433, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90116417, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4634, + "time_per_iteration": 2.465115785598755 + }, + { + "auxiliary_loss_clip": 0.01139508, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02182591, + "balance_loss_mlp": 1.05175185, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5848956099548452, + "language_loss": 0.77060932, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79239166, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4635, + "time_per_iteration": 2.5032925605773926 + }, + { + "auxiliary_loss_clip": 0.01137724, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.02843595, + "balance_loss_mlp": 1.04919934, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.7277393232375848, + "language_loss": 0.65047133, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67230225, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4636, + "time_per_iteration": 4.078390121459961 + }, + { + "auxiliary_loss_clip": 0.01137292, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02249098, + "balance_loss_mlp": 1.04898095, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.3949865449269034, + "language_loss": 0.84131932, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86309206, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8828125, + "step": 4637, + "time_per_iteration": 3.9023706912994385 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.02468669, + "balance_loss_mlp": 1.04683113, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.9572077756422592, + "language_loss": 0.75880706, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78052455, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4638, + "time_per_iteration": 2.5291664600372314 + }, + { + "auxiliary_loss_clip": 0.01137756, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_clip": 1.0281812, + "balance_loss_mlp": 1.04918075, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.663593201704466, + "language_loss": 0.71469444, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73651695, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4639, + "time_per_iteration": 2.4396321773529053 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01814222, + "balance_loss_mlp": 1.0477488, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.020838508390905, + "language_loss": 0.65634811, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67805016, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4640, + "time_per_iteration": 2.524146556854248 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.02278829, + "balance_loss_mlp": 1.04838169, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.8663182251903623, + "language_loss": 0.71682954, + "learning_rate": 3.383949929609804e-06, + "loss": 0.738572, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4641, + "time_per_iteration": 2.45416522026062 + }, + { + "auxiliary_loss_clip": 0.01137426, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.02620697, + "balance_loss_mlp": 1.04805887, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.721157258136314, + "language_loss": 0.74843872, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77024734, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4642, + "time_per_iteration": 2.498901128768921 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.0241071, + "balance_loss_mlp": 1.04755557, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7771181879405247, + "language_loss": 0.85500491, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87677723, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4643, + "time_per_iteration": 2.4678151607513428 + }, + { + "auxiliary_loss_clip": 0.01135774, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04914284, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.8372365182177028, + "language_loss": 0.8320173, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85382092, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4644, + "time_per_iteration": 2.4989511966705322 + }, + { + "auxiliary_loss_clip": 0.01137034, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.02340162, + "balance_loss_mlp": 1.04927874, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.1578284197730246, + "language_loss": 0.7905547, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81232202, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4645, + "time_per_iteration": 2.444539785385132 + }, + { + "auxiliary_loss_clip": 0.01045698, + "auxiliary_loss_mlp": 0.01013694, + "balance_loss_clip": 1.01202476, + "balance_loss_mlp": 1.01603949, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7789852310638867, + "language_loss": 0.62276232, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64335632, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4646, + "time_per_iteration": 3.0487425327301025 + }, + { + "auxiliary_loss_clip": 0.01130687, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.02039671, + "balance_loss_mlp": 1.04760003, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6043045349905556, + "language_loss": 0.89379698, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91545647, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83203125, + "step": 4647, + "time_per_iteration": 2.537818193435669 + }, + { + "auxiliary_loss_clip": 0.01137315, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.02559125, + "balance_loss_mlp": 1.04848313, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6404696751402497, + "language_loss": 0.87119055, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89298457, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4648, + "time_per_iteration": 2.490755081176758 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.01993406, + "balance_loss_mlp": 1.04894495, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 4.859667262510518, + "language_loss": 0.72424746, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74599725, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4649, + "time_per_iteration": 2.551149368286133 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02275741, + "balance_loss_mlp": 1.04667568, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.198213539311656, + "language_loss": 0.80241156, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8241663, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 4650, + "time_per_iteration": 2.495481252670288 + }, + { + "auxiliary_loss_clip": 0.01043234, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.00015628, + "balance_loss_mlp": 1.01336908, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 1.2001935939690993, + "language_loss": 0.58821332, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60866392, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4651, + "time_per_iteration": 3.089278221130371 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.0239383, + "balance_loss_mlp": 1.04576242, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.6305345142383205, + "language_loss": 0.74335963, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76514173, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4652, + "time_per_iteration": 2.5034215450286865 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.02826357, + "balance_loss_mlp": 1.05137777, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.1744902530470527, + "language_loss": 0.79703641, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81889254, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4653, + "time_per_iteration": 2.654989242553711 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.02851391, + "balance_loss_mlp": 1.04782343, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.2131663157599597, + "language_loss": 0.79123974, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81304365, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4654, + "time_per_iteration": 2.4707679748535156 + }, + { + "auxiliary_loss_clip": 0.01139148, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.02370811, + "balance_loss_mlp": 1.04861951, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.2002818233708497, + "language_loss": 0.80829996, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83010256, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4655, + "time_per_iteration": 2.513359546661377 + }, + { + "auxiliary_loss_clip": 0.01135255, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0232811, + "balance_loss_mlp": 1.04709148, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5763016498426998, + "language_loss": 0.8125751, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8343333, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4656, + "time_per_iteration": 2.519552707672119 + }, + { + "auxiliary_loss_clip": 0.01135028, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.02292323, + "balance_loss_mlp": 1.04802632, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6475258015019663, + "language_loss": 0.83235347, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85410285, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4657, + "time_per_iteration": 2.533052444458008 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.0267477, + "balance_loss_mlp": 1.04885554, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.9420207304275756, + "language_loss": 0.63918132, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66097504, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4658, + "time_per_iteration": 2.577223777770996 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.03177238, + "balance_loss_mlp": 1.04906631, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.71469006603513, + "language_loss": 0.78447223, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80633128, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4659, + "time_per_iteration": 2.5102882385253906 + }, + { + "auxiliary_loss_clip": 0.01141159, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.03565836, + "balance_loss_mlp": 1.05118299, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.8275002529569282, + "language_loss": 0.79481149, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81674838, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4660, + "time_per_iteration": 2.478348731994629 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.02289653, + "balance_loss_mlp": 1.04855609, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.7763153734220711, + "language_loss": 0.80286032, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82459545, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4661, + "time_per_iteration": 2.514369249343872 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.01055451, + "balance_loss_clip": 1.03888094, + "balance_loss_mlp": 1.05259752, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5344085017366311, + "language_loss": 0.78856266, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.8105247, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4662, + "time_per_iteration": 2.6068239212036133 + }, + { + "auxiliary_loss_clip": 0.01142079, + "auxiliary_loss_mlp": 0.01052002, + "balance_loss_clip": 1.03345299, + "balance_loss_mlp": 1.04998207, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 2.3559784459233923, + "language_loss": 0.70354843, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72548926, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4663, + "time_per_iteration": 2.530852794647217 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01053723, + "balance_loss_clip": 1.03522193, + "balance_loss_mlp": 1.05016875, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.7070620658846938, + "language_loss": 0.77552772, + "learning_rate": 3.377469372935791e-06, + "loss": 0.7974633, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.8984375, + "step": 4664, + "time_per_iteration": 2.5026586055755615 + }, + { + "auxiliary_loss_clip": 0.01132144, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02688766, + "balance_loss_mlp": 1.04697514, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.9676420802042491, + "language_loss": 0.79575229, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81750983, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8515625, + "step": 4665, + "time_per_iteration": 2.496948003768921 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04934978, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 8.778135585709748, + "language_loss": 0.80523062, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82701844, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4666, + "time_per_iteration": 2.4551992416381836 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.03710806, + "balance_loss_mlp": 1.05058241, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.0519370530418493, + "language_loss": 0.84514672, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86708617, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4667, + "time_per_iteration": 2.503024101257324 + }, + { + "auxiliary_loss_clip": 0.01141868, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.05165899, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.59556786146991, + "language_loss": 0.79110259, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81296772, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90234375, + "step": 4668, + "time_per_iteration": 2.5109217166900635 + }, + { + "auxiliary_loss_clip": 0.01137821, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783513, + "balance_loss_mlp": 1.0472095, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 5.202292388628492, + "language_loss": 0.7594949, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78132337, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4669, + "time_per_iteration": 2.5443029403686523 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.03276944, + "balance_loss_mlp": 1.05060363, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.249572842905479, + "language_loss": 0.78818107, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81007588, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8828125, + "step": 4670, + "time_per_iteration": 2.4583303928375244 + }, + { + "auxiliary_loss_clip": 0.01142576, + "auxiliary_loss_mlp": 0.010505, + "balance_loss_clip": 1.03272545, + "balance_loss_mlp": 1.05169237, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.1344815005037323, + "language_loss": 0.78915119, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81108201, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4671, + "time_per_iteration": 2.576904296875 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02419102, + "balance_loss_mlp": 1.05212355, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.655300005604084, + "language_loss": 0.74891758, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77067947, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4672, + "time_per_iteration": 2.5101001262664795 + }, + { + "auxiliary_loss_clip": 0.01139664, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.0297612, + "balance_loss_mlp": 1.05017138, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.377632390973165, + "language_loss": 0.7485683, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77045226, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.89453125, + "step": 4673, + "time_per_iteration": 2.5559215545654297 + }, + { + "auxiliary_loss_clip": 0.0113758, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_clip": 1.02367294, + "balance_loss_mlp": 1.04911065, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.162495737742732, + "language_loss": 0.72274792, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74453062, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4674, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.01142202, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.02487254, + "balance_loss_mlp": 1.05152214, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.8501022214838438, + "language_loss": 0.77636325, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79821539, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.90625, + "step": 4675, + "time_per_iteration": 2.5076191425323486 + }, + { + "auxiliary_loss_clip": 0.011417, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.03059506, + "balance_loss_mlp": 1.05080581, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 4.743769816525981, + "language_loss": 0.7033428, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72524506, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4676, + "time_per_iteration": 2.4664652347564697 + }, + { + "auxiliary_loss_clip": 0.01136213, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.02637279, + "balance_loss_mlp": 1.05219054, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6504598517134752, + "language_loss": 0.70294476, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7247287, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 4677, + "time_per_iteration": 3.9926962852478027 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.02779067, + "balance_loss_mlp": 1.05172849, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.7155329144241396, + "language_loss": 0.63506716, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65694547, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.890625, + "step": 4678, + "time_per_iteration": 5.452545642852783 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.031039, + "balance_loss_mlp": 1.05193949, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.4644682748892532, + "language_loss": 0.70249045, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7243771, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4679, + "time_per_iteration": 2.557156801223755 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.02276742, + "balance_loss_mlp": 1.05024076, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.8307759218313573, + "language_loss": 0.74600148, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76779038, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4680, + "time_per_iteration": 2.478760004043579 + }, + { + "auxiliary_loss_clip": 0.01140599, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.0268507, + "balance_loss_mlp": 1.0514679, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.8069902018568411, + "language_loss": 0.77090317, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79274386, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4681, + "time_per_iteration": 2.5532946586608887 + }, + { + "auxiliary_loss_clip": 0.01142988, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02353942, + "balance_loss_mlp": 1.05301392, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 4.33574203258507, + "language_loss": 0.74047244, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76231277, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8984375, + "step": 4682, + "time_per_iteration": 2.450707197189331 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 1.04989302, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4103030378304897, + "language_loss": 0.80830532, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.8301093, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4683, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.01142223, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.02984428, + "balance_loss_mlp": 1.05146146, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.6936052100643573, + "language_loss": 0.76107442, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78297454, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4684, + "time_per_iteration": 2.4734883308410645 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.02196348, + "balance_loss_mlp": 1.04849601, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9675146174992446, + "language_loss": 0.7601878, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.7819227, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4685, + "time_per_iteration": 2.521883010864258 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.05083728, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.003036282603561, + "language_loss": 0.7616905, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78348768, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4686, + "time_per_iteration": 2.5261688232421875 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01051506, + "balance_loss_clip": 1.03319538, + "balance_loss_mlp": 1.04916072, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.230965321609006, + "language_loss": 0.63345516, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65537149, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.90625, + "step": 4687, + "time_per_iteration": 2.473508834838867 + }, + { + "auxiliary_loss_clip": 0.0114172, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05180609, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 1.9761865692880811, + "language_loss": 0.76504958, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.7869947, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4688, + "time_per_iteration": 2.4815330505371094 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.02364409, + "balance_loss_mlp": 1.04902148, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.291650314126009, + "language_loss": 0.78333032, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80508631, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4689, + "time_per_iteration": 2.464221239089966 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02566934, + "balance_loss_mlp": 1.04886627, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 2.2251394110426896, + "language_loss": 0.77819848, + "learning_rate": 3.37011026022934e-06, + "loss": 0.79999155, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87890625, + "step": 4690, + "time_per_iteration": 2.4802086353302 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.02809191, + "balance_loss_mlp": 1.04984617, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.762007121853784, + "language_loss": 0.8775022, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89933336, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.890625, + "step": 4691, + "time_per_iteration": 2.5098307132720947 + }, + { + "auxiliary_loss_clip": 0.01144357, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02714467, + "balance_loss_mlp": 1.0519383, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.144178457094415, + "language_loss": 0.81952238, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84140503, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 4692, + "time_per_iteration": 2.501150131225586 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.02284956, + "balance_loss_mlp": 1.04852128, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.7100054669520195, + "language_loss": 0.74535745, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7671268, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4693, + "time_per_iteration": 2.581108808517456 + }, + { + "auxiliary_loss_clip": 0.01139239, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04924035, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6174705324311944, + "language_loss": 0.7761777, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79793274, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4694, + "time_per_iteration": 2.479616403579712 + }, + { + "auxiliary_loss_clip": 0.01136707, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.05057073, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.0658621313481604, + "language_loss": 0.66812259, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68987906, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4695, + "time_per_iteration": 2.560234546661377 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_clip": 1.02859259, + "balance_loss_mlp": 1.05084562, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.206840044366299, + "language_loss": 0.75868189, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78057176, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4696, + "time_per_iteration": 2.484731674194336 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.05234432, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 4.801168729119655, + "language_loss": 0.62373543, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64565253, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4697, + "time_per_iteration": 2.6771903038024902 + }, + { + "auxiliary_loss_clip": 0.01131406, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02003598, + "balance_loss_mlp": 1.0468322, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.6839402690923742, + "language_loss": 0.73317522, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75484592, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4698, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.029091, + "balance_loss_mlp": 1.0463903, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 2.1160143892835275, + "language_loss": 0.74896884, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77072334, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4699, + "time_per_iteration": 2.5613014698028564 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.02369165, + "balance_loss_mlp": 1.05032122, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.187545417707515, + "language_loss": 0.80256712, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8243804, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4700, + "time_per_iteration": 2.4355719089508057 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.03461456, + "balance_loss_mlp": 1.05022645, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.7483881606912919, + "language_loss": 0.81309319, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.8349061, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 4701, + "time_per_iteration": 2.590824842453003 + }, + { + "auxiliary_loss_clip": 0.0113653, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02301347, + "balance_loss_mlp": 1.05007911, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.214271940066586, + "language_loss": 0.73758674, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75934035, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4702, + "time_per_iteration": 2.496689796447754 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.02251232, + "balance_loss_mlp": 1.05127287, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.7981890053968508, + "language_loss": 0.78189409, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.8036449, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4703, + "time_per_iteration": 2.5225300788879395 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.02923465, + "balance_loss_mlp": 1.0484302, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.6026897384097336, + "language_loss": 0.6944623, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71628278, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 4704, + "time_per_iteration": 2.5721168518066406 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.02685118, + "balance_loss_mlp": 1.05374229, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.9868129767490792, + "language_loss": 0.69884789, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.7206769, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.85546875, + "step": 4705, + "time_per_iteration": 2.532034397125244 + }, + { + "auxiliary_loss_clip": 0.01057982, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.02761459, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7396595768854823, + "language_loss": 0.59243953, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61305463, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.3046875, + "step": 4706, + "time_per_iteration": 3.1149942874908447 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.02586842, + "balance_loss_mlp": 1.05135274, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.3972451569930537, + "language_loss": 0.82227451, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84403402, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4707, + "time_per_iteration": 2.5387215614318848 + }, + { + "auxiliary_loss_clip": 0.01137999, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.02199709, + "balance_loss_mlp": 1.04914331, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.4509576382878049, + "language_loss": 0.80561262, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82739007, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4708, + "time_per_iteration": 2.5140204429626465 + }, + { + "auxiliary_loss_clip": 0.0105521, + "auxiliary_loss_mlp": 0.01000508, + "balance_loss_clip": 0.99875605, + "balance_loss_mlp": 1.02517498, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.9117312370003612, + "language_loss": 0.62801576, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64857292, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.30078125, + "step": 4709, + "time_per_iteration": 2.936171054840088 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02320743, + "balance_loss_mlp": 1.04888415, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3738384560226649, + "language_loss": 0.73850632, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76022816, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4710, + "time_per_iteration": 2.4954519271850586 + }, + { + "auxiliary_loss_clip": 0.01137863, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.03191566, + "balance_loss_mlp": 1.04925823, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.9168276099157815, + "language_loss": 0.79272872, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81460476, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.88671875, + "step": 4711, + "time_per_iteration": 2.4867448806762695 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02646244, + "balance_loss_mlp": 1.04965401, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.0504814559042064, + "language_loss": 0.71246219, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73428476, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.88671875, + "step": 4712, + "time_per_iteration": 2.575636863708496 + }, + { + "auxiliary_loss_clip": 0.01138441, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.03291881, + "balance_loss_mlp": 1.05000687, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.8055678270358249, + "language_loss": 0.82008445, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84196651, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4713, + "time_per_iteration": 2.493767499923706 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.02401519, + "balance_loss_mlp": 1.05028057, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7485744544400377, + "language_loss": 0.75356781, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77534491, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4714, + "time_per_iteration": 2.505153179168701 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02967894, + "balance_loss_mlp": 1.04942465, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4087892826571713, + "language_loss": 0.78411347, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80593348, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4715, + "time_per_iteration": 2.554814100265503 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02042472, + "balance_loss_mlp": 1.04960322, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.6801208741854476, + "language_loss": 0.73694074, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.758663, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4716, + "time_per_iteration": 2.5286571979522705 + }, + { + "auxiliary_loss_clip": 0.01139786, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.02437401, + "balance_loss_mlp": 1.04774714, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.328876822443367, + "language_loss": 0.74648547, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76830298, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4717, + "time_per_iteration": 2.46952223777771 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.02846563, + "balance_loss_mlp": 1.04963374, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.4913957575980352, + "language_loss": 0.669999, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69183862, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4718, + "time_per_iteration": 2.4831228256225586 + }, + { + "auxiliary_loss_clip": 0.01137489, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.03000975, + "balance_loss_mlp": 1.04782009, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.8756812569885382, + "language_loss": 0.72633672, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74818015, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4719, + "time_per_iteration": 4.022828102111816 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02554655, + "balance_loss_mlp": 1.04928601, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.5135010931827333, + "language_loss": 0.80621493, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82798427, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4720, + "time_per_iteration": 5.367753505706787 + }, + { + "auxiliary_loss_clip": 0.0113932, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.05115819, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.7029911565101727, + "language_loss": 0.79467577, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81651098, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4721, + "time_per_iteration": 2.50327730178833 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01929736, + "balance_loss_mlp": 1.04810679, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.0644081658079343, + "language_loss": 0.82823032, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84991974, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4722, + "time_per_iteration": 2.4968478679656982 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02364612, + "balance_loss_mlp": 1.05073261, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.6187910677092856, + "language_loss": 0.70086461, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72264171, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4723, + "time_per_iteration": 2.4899258613586426 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.02747679, + "balance_loss_mlp": 1.04938078, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.736224288784384, + "language_loss": 0.78556609, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8073647, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.859375, + "step": 4724, + "time_per_iteration": 2.496594190597534 + }, + { + "auxiliary_loss_clip": 0.01139767, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.02660346, + "balance_loss_mlp": 1.05093193, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6232572980988387, + "language_loss": 0.92404163, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94587529, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4725, + "time_per_iteration": 2.511526584625244 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.05020452, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 2.0539060112221645, + "language_loss": 0.88626051, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90809256, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4726, + "time_per_iteration": 2.5431292057037354 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.03090727, + "balance_loss_mlp": 1.05034256, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.15176079657567, + "language_loss": 0.78793001, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80981243, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.89453125, + "step": 4727, + "time_per_iteration": 2.7037220001220703 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.04985464, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.258515630996078, + "language_loss": 0.66358554, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68529654, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4728, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02941179, + "balance_loss_mlp": 1.04727221, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.756924339447767, + "language_loss": 0.75958216, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78138363, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4729, + "time_per_iteration": 2.4989402294158936 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_clip": 1.03383398, + "balance_loss_mlp": 1.05095756, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.9682162336594704, + "language_loss": 0.66691023, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68882596, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4730, + "time_per_iteration": 2.509514570236206 + }, + { + "auxiliary_loss_clip": 0.01138579, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02232122, + "balance_loss_mlp": 1.05049443, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.7814838549320247, + "language_loss": 0.74382442, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76560116, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4731, + "time_per_iteration": 2.547813653945923 + }, + { + "auxiliary_loss_clip": 0.01136629, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.04890573, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.530013147894791, + "language_loss": 0.83553517, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85723549, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 4732, + "time_per_iteration": 2.5120863914489746 + }, + { + "auxiliary_loss_clip": 0.01136161, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.03301716, + "balance_loss_mlp": 1.04855001, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.9723104549008028, + "language_loss": 0.79331958, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81518835, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4733, + "time_per_iteration": 2.5007243156433105 + }, + { + "auxiliary_loss_clip": 0.01142281, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.02999151, + "balance_loss_mlp": 1.05076027, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.3591023601535834, + "language_loss": 0.71619761, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73809481, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 4734, + "time_per_iteration": 2.482696771621704 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02036047, + "balance_loss_mlp": 1.04631829, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8927344989841068, + "language_loss": 0.73762977, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.75930858, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 4735, + "time_per_iteration": 2.4837005138397217 + }, + { + "auxiliary_loss_clip": 0.011344, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.04755783, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.630230460143418, + "language_loss": 0.79573876, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81754053, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4736, + "time_per_iteration": 2.4434666633605957 + }, + { + "auxiliary_loss_clip": 0.01139538, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.02740479, + "balance_loss_mlp": 1.05133057, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.7860738328288637, + "language_loss": 0.59551513, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61735177, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4737, + "time_per_iteration": 2.580573558807373 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.02334046, + "balance_loss_mlp": 1.04766428, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7923236486738074, + "language_loss": 0.86353856, + "learning_rate": 3.356432075047052e-06, + "loss": 0.8852607, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4738, + "time_per_iteration": 2.483482837677002 + }, + { + "auxiliary_loss_clip": 0.0113957, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_clip": 1.02778435, + "balance_loss_mlp": 1.04864287, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.438418234236932, + "language_loss": 0.89730442, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91915256, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4739, + "time_per_iteration": 2.4746406078338623 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.02450418, + "balance_loss_mlp": 1.05253863, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3849266219761887, + "language_loss": 0.7207197, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74250996, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4740, + "time_per_iteration": 2.49682879447937 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0209707, + "balance_loss_mlp": 1.04970956, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.6055473402712246, + "language_loss": 0.77937335, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80109143, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4741, + "time_per_iteration": 2.51096248626709 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.02828324, + "balance_loss_mlp": 1.04566443, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6279093143019605, + "language_loss": 0.76295173, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78477085, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4742, + "time_per_iteration": 2.462972402572632 + }, + { + "auxiliary_loss_clip": 0.01139125, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.03039074, + "balance_loss_mlp": 1.04792476, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.8587468959738758, + "language_loss": 0.5772593, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59914023, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 4743, + "time_per_iteration": 2.511903762817383 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01054233, + "balance_loss_clip": 1.03724515, + "balance_loss_mlp": 1.05195451, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 2.12515026406258, + "language_loss": 0.74454999, + "learning_rate": 3.354713944700797e-06, + "loss": 0.7665062, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 4744, + "time_per_iteration": 2.48883318901062 + }, + { + "auxiliary_loss_clip": 0.01135189, + "auxiliary_loss_mlp": 0.01043767, + "balance_loss_clip": 1.02801967, + "balance_loss_mlp": 1.04948175, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.362002737479584, + "language_loss": 0.77483714, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79662669, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 4745, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01130558, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02363896, + "balance_loss_mlp": 1.04884791, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.753549870597739, + "language_loss": 0.83101368, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85271305, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 4746, + "time_per_iteration": 2.4236245155334473 + }, + { + "auxiliary_loss_clip": 0.01138419, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.02243769, + "balance_loss_mlp": 1.04718721, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.6977094615171933, + "language_loss": 0.79818654, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81996572, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4747, + "time_per_iteration": 2.47261118888855 + }, + { + "auxiliary_loss_clip": 0.01044617, + "auxiliary_loss_mlp": 0.01004042, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.01364255, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7754058718106229, + "language_loss": 0.60505557, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62554216, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30859375, + "step": 4748, + "time_per_iteration": 3.087096691131592 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02596188, + "balance_loss_mlp": 1.04764485, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.177788697298361, + "language_loss": 0.80300528, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82477033, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4749, + "time_per_iteration": 2.4132721424102783 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.022488, + "balance_loss_mlp": 1.04882109, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.910787577049047, + "language_loss": 0.7067076, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72844481, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.86328125, + "step": 4750, + "time_per_iteration": 2.5576114654541016 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.02121782, + "balance_loss_mlp": 1.04961181, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.569446011166348, + "language_loss": 0.81798106, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.83968079, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.83203125, + "step": 4751, + "time_per_iteration": 2.5805511474609375 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.02847314, + "balance_loss_mlp": 1.04876757, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.8824724995030706, + "language_loss": 0.80753136, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82931828, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4752, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.02258289, + "balance_loss_mlp": 1.04778147, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.8943096426553439, + "language_loss": 0.78827929, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81001288, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4753, + "time_per_iteration": 2.4775567054748535 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.02559114, + "balance_loss_mlp": 1.05078959, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.205371578508451, + "language_loss": 0.89809895, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91994447, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.8984375, + "step": 4754, + "time_per_iteration": 2.486128091812134 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02616322, + "balance_loss_mlp": 1.04897058, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.932227485650823, + "language_loss": 0.8234359, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84519303, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4755, + "time_per_iteration": 2.491184711456299 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.02915466, + "balance_loss_mlp": 1.04667544, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4908389000148254, + "language_loss": 0.83846784, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86025268, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4756, + "time_per_iteration": 2.5934014320373535 + }, + { + "auxiliary_loss_clip": 0.01048134, + "auxiliary_loss_mlp": 0.01008558, + "balance_loss_clip": 1.0067457, + "balance_loss_mlp": 1.01677859, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8659269702666513, + "language_loss": 0.61012161, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63068855, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3125, + "step": 4757, + "time_per_iteration": 3.2122225761413574 + }, + { + "auxiliary_loss_clip": 0.01137202, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.0173862, + "balance_loss_mlp": 1.05204773, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.9457322051707677, + "language_loss": 0.65794766, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67965055, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4758, + "time_per_iteration": 2.60023832321167 + }, + { + "auxiliary_loss_clip": 0.01134399, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.02027392, + "balance_loss_mlp": 1.04756904, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.560843999265526, + "language_loss": 0.62950313, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65121412, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4759, + "time_per_iteration": 2.6352102756500244 + }, + { + "auxiliary_loss_clip": 0.0113658, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.05098844, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.76909488275169, + "language_loss": 0.7385608, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76035368, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4760, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.04949427, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.9401243114633073, + "language_loss": 0.72422945, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74593776, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4761, + "time_per_iteration": 4.029369592666626 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.0303421, + "balance_loss_mlp": 1.04875946, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.026540334724573, + "language_loss": 0.74605787, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76787788, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4762, + "time_per_iteration": 3.9056994915008545 + }, + { + "auxiliary_loss_clip": 0.01134836, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.02963901, + "balance_loss_mlp": 1.05027771, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.79451974437327, + "language_loss": 0.76088154, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78268445, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4763, + "time_per_iteration": 2.521223545074463 + }, + { + "auxiliary_loss_clip": 0.01133105, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.04712808, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.9430054907967222, + "language_loss": 0.76937616, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79106188, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4764, + "time_per_iteration": 2.4924814701080322 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.02354538, + "balance_loss_mlp": 1.04996395, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8973954036904035, + "language_loss": 0.71061826, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73240352, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.87109375, + "step": 4765, + "time_per_iteration": 2.509204387664795 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04705501, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5129940587619137, + "language_loss": 0.75756145, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77925038, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4766, + "time_per_iteration": 2.562422513961792 + }, + { + "auxiliary_loss_clip": 0.01135318, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01960635, + "balance_loss_mlp": 1.05073392, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.5780141248071407, + "language_loss": 0.77556801, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79727697, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 4767, + "time_per_iteration": 2.5476057529449463 + }, + { + "auxiliary_loss_clip": 0.01133832, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.04878676, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.169490874338027, + "language_loss": 0.6494413, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67119616, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4768, + "time_per_iteration": 2.4961044788360596 + }, + { + "auxiliary_loss_clip": 0.01137611, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.02807736, + "balance_loss_mlp": 1.04944301, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5253191671074575, + "language_loss": 0.70345664, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72527587, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4769, + "time_per_iteration": 2.5243568420410156 + }, + { + "auxiliary_loss_clip": 0.01136117, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.01992261, + "balance_loss_mlp": 1.04866219, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.7483868508562144, + "language_loss": 0.75552189, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77723145, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.875, + "step": 4770, + "time_per_iteration": 2.468655586242676 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02745509, + "balance_loss_mlp": 1.0500282, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 3.1666126901900107, + "language_loss": 0.6730839, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69490194, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4771, + "time_per_iteration": 2.5334818363189697 + }, + { + "auxiliary_loss_clip": 0.01046415, + "auxiliary_loss_mlp": 0.01005401, + "balance_loss_clip": 1.00367248, + "balance_loss_mlp": 1.01655006, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7694277286160668, + "language_loss": 0.56883639, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58935452, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.29882812, + "step": 4772, + "time_per_iteration": 3.0373501777648926 + }, + { + "auxiliary_loss_clip": 0.01136901, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.02567768, + "balance_loss_mlp": 1.05014777, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.6517872983988844, + "language_loss": 0.83356023, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85534406, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4773, + "time_per_iteration": 2.477537155151367 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.02159762, + "balance_loss_mlp": 1.04630029, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.6367186533355356, + "language_loss": 0.77910906, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80083102, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4774, + "time_per_iteration": 2.514545440673828 + }, + { + "auxiliary_loss_clip": 0.01136368, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.05010271, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.7122435327393783, + "language_loss": 0.73488462, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75662589, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4775, + "time_per_iteration": 2.4526851177215576 + }, + { + "auxiliary_loss_clip": 0.0113744, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.05033445, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.655187901014976, + "language_loss": 0.88345891, + "learning_rate": 3.34551940668778e-06, + "loss": 0.905213, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4776, + "time_per_iteration": 2.5487112998962402 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02170587, + "balance_loss_mlp": 1.05060029, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7920640817181568, + "language_loss": 0.74046421, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76219237, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4777, + "time_per_iteration": 2.4858744144439697 + }, + { + "auxiliary_loss_clip": 0.01143681, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.02763224, + "balance_loss_mlp": 1.05306673, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9679293284940167, + "language_loss": 0.80052459, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82240558, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4778, + "time_per_iteration": 2.536553382873535 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.03032279, + "balance_loss_mlp": 1.05058503, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.6265242751714746, + "language_loss": 0.73940611, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76121908, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4779, + "time_per_iteration": 2.5068604946136475 + }, + { + "auxiliary_loss_clip": 0.01139025, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.02791739, + "balance_loss_mlp": 1.05089593, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5791887497798731, + "language_loss": 0.76378506, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78561842, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4780, + "time_per_iteration": 2.6357336044311523 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.02789187, + "balance_loss_mlp": 1.04874134, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.8554557560955622, + "language_loss": 0.81367111, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83542168, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 4781, + "time_per_iteration": 2.484217405319214 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.02688909, + "balance_loss_mlp": 1.0511862, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.9124031057386872, + "language_loss": 0.86249948, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88433063, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4782, + "time_per_iteration": 2.4822945594787598 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.02842641, + "balance_loss_mlp": 1.05222881, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.5584901619772236, + "language_loss": 0.71195668, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4783, + "time_per_iteration": 2.4959099292755127 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.05179179, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 3.6731562407195932, + "language_loss": 0.77011871, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79189384, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4784, + "time_per_iteration": 2.55037784576416 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.04896331, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.5223386635016902, + "language_loss": 0.75859249, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.7803328, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4785, + "time_per_iteration": 2.526587724685669 + }, + { + "auxiliary_loss_clip": 0.01135192, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.02724528, + "balance_loss_mlp": 1.04946601, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.9982438427344784, + "language_loss": 0.83033895, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85211748, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4786, + "time_per_iteration": 2.5786821842193604 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01899481, + "balance_loss_mlp": 1.04868317, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.95457297040312, + "language_loss": 0.80007184, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82174993, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 4787, + "time_per_iteration": 2.4734396934509277 + }, + { + "auxiliary_loss_clip": 0.01136278, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.02397585, + "balance_loss_mlp": 1.04906642, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 2.6671828195015044, + "language_loss": 0.83666658, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85842675, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4788, + "time_per_iteration": 2.5388548374176025 + }, + { + "auxiliary_loss_clip": 0.01137234, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.05051816, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.8168797658695668, + "language_loss": 0.73769903, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75953662, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4789, + "time_per_iteration": 2.5259692668914795 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.02819657, + "balance_loss_mlp": 1.0466274, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7572733449240283, + "language_loss": 0.83982229, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86155128, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4790, + "time_per_iteration": 2.5347094535827637 + }, + { + "auxiliary_loss_clip": 0.01136016, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02267063, + "balance_loss_mlp": 1.05011547, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.8137236403798864, + "language_loss": 0.77924603, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80099815, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4791, + "time_per_iteration": 2.475328207015991 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01854002, + "balance_loss_mlp": 1.04824567, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.933659829708973, + "language_loss": 0.70760292, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72931719, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.88671875, + "step": 4792, + "time_per_iteration": 2.4705538749694824 + }, + { + "auxiliary_loss_clip": 0.01135222, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.02065361, + "balance_loss_mlp": 1.04968917, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.08648870526395, + "language_loss": 0.79392564, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81563771, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4793, + "time_per_iteration": 2.509697914123535 + }, + { + "auxiliary_loss_clip": 0.01131221, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04920101, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.6269924793239006, + "language_loss": 0.77731872, + "learning_rate": 3.340324496161797e-06, + "loss": 0.7990309, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 4794, + "time_per_iteration": 2.6943047046661377 + }, + { + "auxiliary_loss_clip": 0.01134923, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.02819395, + "balance_loss_mlp": 1.04913807, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.663854929830155, + "language_loss": 0.8254813, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84727538, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 4795, + "time_per_iteration": 2.4633255004882812 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02387166, + "balance_loss_mlp": 1.04899204, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.661730786650402, + "language_loss": 0.74650323, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76819038, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80859375, + "step": 4796, + "time_per_iteration": 2.5179266929626465 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.02334583, + "balance_loss_mlp": 1.04789257, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.8865626242662115, + "language_loss": 0.72797763, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74977362, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4797, + "time_per_iteration": 2.4910430908203125 + }, + { + "auxiliary_loss_clip": 0.01135339, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02475667, + "balance_loss_mlp": 1.04989898, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.109884297899412, + "language_loss": 0.74219149, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76395118, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4798, + "time_per_iteration": 2.472590923309326 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.02631509, + "balance_loss_mlp": 1.04689598, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7660889265500996, + "language_loss": 0.64920753, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67099464, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.87890625, + "step": 4799, + "time_per_iteration": 2.4816339015960693 + }, + { + "auxiliary_loss_clip": 0.01136164, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.02784538, + "balance_loss_mlp": 1.04912758, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 2.0794132014970272, + "language_loss": 0.82202137, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84382272, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4800, + "time_per_iteration": 2.5249674320220947 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.02312899, + "balance_loss_mlp": 1.04702258, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.639042715490093, + "language_loss": 0.90946537, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93113768, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4801, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.0221796, + "balance_loss_mlp": 1.04792547, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.176318344562637, + "language_loss": 0.73644328, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75816047, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4802, + "time_per_iteration": 4.080524444580078 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.01006047, + "balance_loss_clip": 1.00423479, + "balance_loss_mlp": 1.01114249, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7820100192493779, + "language_loss": 0.63009298, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65055525, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.2890625, + "step": 4803, + "time_per_iteration": 4.464243412017822 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.03008461, + "balance_loss_mlp": 1.04523563, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7581002683255658, + "language_loss": 0.70800668, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72975886, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4804, + "time_per_iteration": 2.4655730724334717 + }, + { + "auxiliary_loss_clip": 0.01134858, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.02464128, + "balance_loss_mlp": 1.04650438, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.8916446417141755, + "language_loss": 0.68253011, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70430195, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 4805, + "time_per_iteration": 2.53932523727417 + }, + { + "auxiliary_loss_clip": 0.01133301, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.0262022, + "balance_loss_mlp": 1.04706144, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.968490446816616, + "language_loss": 0.69469118, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71644211, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4806, + "time_per_iteration": 2.558811902999878 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.031461, + "balance_loss_mlp": 1.04788303, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.428284074184194, + "language_loss": 0.71372461, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73549926, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4807, + "time_per_iteration": 2.5614373683929443 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04677331, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.7487230864068215, + "language_loss": 0.81519878, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83695877, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4808, + "time_per_iteration": 2.4744319915771484 + }, + { + "auxiliary_loss_clip": 0.01128992, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.03418779, + "balance_loss_mlp": 1.04669142, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.636259514454852, + "language_loss": 0.78387201, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80566621, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 4809, + "time_per_iteration": 2.4998364448547363 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.02908349, + "balance_loss_mlp": 1.04490733, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.6563631129995537, + "language_loss": 0.78611737, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80792195, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4810, + "time_per_iteration": 2.4702351093292236 + }, + { + "auxiliary_loss_clip": 0.01129985, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.02966762, + "balance_loss_mlp": 1.04653728, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 2.008599276638055, + "language_loss": 0.77134252, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79309338, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4811, + "time_per_iteration": 2.502671718597412 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.03385544, + "balance_loss_mlp": 1.0460732, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3273574459957262, + "language_loss": 0.76748705, + "learning_rate": 3.335113118275117e-06, + "loss": 0.78930271, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4812, + "time_per_iteration": 2.5386435985565186 + }, + { + "auxiliary_loss_clip": 0.01038211, + "auxiliary_loss_mlp": 0.01023073, + "balance_loss_clip": 1.02121317, + "balance_loss_mlp": 1.00933552, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8452992206378583, + "language_loss": 0.60239071, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62300354, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2890625, + "step": 4813, + "time_per_iteration": 3.227616548538208 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02727079, + "balance_loss_mlp": 1.04549837, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.8826759768804342, + "language_loss": 0.81616402, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.83789915, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4814, + "time_per_iteration": 2.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_clip": 1.0297873, + "balance_loss_mlp": 1.04464495, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6532361717230013, + "language_loss": 0.72615647, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74794197, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4815, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.01129383, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.03274667, + "balance_loss_mlp": 1.04815507, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.520143184033477, + "language_loss": 0.70801306, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72978652, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4816, + "time_per_iteration": 2.5287740230560303 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.02590585, + "balance_loss_mlp": 1.04615664, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 3.3715101323822174, + "language_loss": 0.74736607, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76915157, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 4817, + "time_per_iteration": 2.4828009605407715 + }, + { + "auxiliary_loss_clip": 0.01134031, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.03637469, + "balance_loss_mlp": 1.0465169, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.754631597755812, + "language_loss": 0.76169789, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78357232, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.875, + "step": 4818, + "time_per_iteration": 2.5453133583068848 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.04606366, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8604375380991018, + "language_loss": 0.79827082, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81994408, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4819, + "time_per_iteration": 2.4516472816467285 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.02553141, + "balance_loss_mlp": 1.04452121, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.6026789889191464, + "language_loss": 0.78726941, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80905426, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.90625, + "step": 4820, + "time_per_iteration": 2.512927770614624 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.0202527, + "balance_loss_mlp": 1.04560018, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.352701358428358, + "language_loss": 0.73083222, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75253224, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4821, + "time_per_iteration": 2.4575939178466797 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.01979387, + "balance_loss_mlp": 1.04503322, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.843174914976853, + "language_loss": 0.72629523, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74796605, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.85546875, + "step": 4822, + "time_per_iteration": 2.4981486797332764 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_clip": 1.03044343, + "balance_loss_mlp": 1.04679179, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.7581642571514904, + "language_loss": 0.66571164, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68751729, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.86328125, + "step": 4823, + "time_per_iteration": 2.4363584518432617 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.02018452, + "balance_loss_mlp": 1.04382014, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 3.6840420234688684, + "language_loss": 0.80786806, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82951754, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 4824, + "time_per_iteration": 2.4978654384613037 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.02547669, + "balance_loss_mlp": 1.04512334, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.8817460080316075, + "language_loss": 0.72507697, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74683976, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4825, + "time_per_iteration": 2.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.01136872, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.01760566, + "balance_loss_mlp": 1.04886889, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.3450778905142813, + "language_loss": 0.73504382, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75676298, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4826, + "time_per_iteration": 2.4689221382141113 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.02041411, + "balance_loss_mlp": 1.04524112, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 3.139827505949132, + "language_loss": 0.68472409, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70640838, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4827, + "time_per_iteration": 2.5236809253692627 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.04921937, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8651963869616242, + "language_loss": 0.80072737, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82245356, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.87109375, + "step": 4828, + "time_per_iteration": 2.491584300994873 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.026335, + "balance_loss_mlp": 1.0482254, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 2.2252387209358666, + "language_loss": 0.80475402, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82650864, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4829, + "time_per_iteration": 2.473210334777832 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.02482176, + "balance_loss_mlp": 1.04794419, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.544892870636461, + "language_loss": 0.82288766, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84461534, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4830, + "time_per_iteration": 2.52874755859375 + }, + { + "auxiliary_loss_clip": 0.01136051, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.02992344, + "balance_loss_mlp": 1.04847991, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.366175746199002, + "language_loss": 0.78858435, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81041145, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4831, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.02356744, + "balance_loss_mlp": 1.045138, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.8105888440812088, + "language_loss": 0.74415791, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76582563, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4832, + "time_per_iteration": 2.6398987770080566 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01806784, + "balance_loss_mlp": 1.04516697, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6051950803449415, + "language_loss": 0.75986588, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78149348, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 4833, + "time_per_iteration": 2.4772675037384033 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.02108264, + "balance_loss_mlp": 1.04542434, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.807689816327527, + "language_loss": 0.64523911, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6669057, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 4834, + "time_per_iteration": 2.4944729804992676 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.01674771, + "balance_loss_mlp": 1.04650283, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5516449013863105, + "language_loss": 0.71436119, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73597211, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4835, + "time_per_iteration": 2.5122785568237305 + }, + { + "auxiliary_loss_clip": 0.01129192, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.02430248, + "balance_loss_mlp": 1.04510283, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.123413568873549, + "language_loss": 0.79669547, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81837618, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4836, + "time_per_iteration": 2.533221483230591 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.02338338, + "balance_loss_mlp": 1.04589558, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6671781935549963, + "language_loss": 0.80777872, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82946539, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4837, + "time_per_iteration": 2.4579083919525146 + }, + { + "auxiliary_loss_clip": 0.01131777, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.02170265, + "balance_loss_mlp": 1.04491532, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8624538054458508, + "language_loss": 0.67733121, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69902468, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4838, + "time_per_iteration": 2.613682270050049 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.02485621, + "balance_loss_mlp": 1.04893696, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.6135989987029238, + "language_loss": 0.71288264, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73466504, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4839, + "time_per_iteration": 2.506908416748047 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.02341795, + "balance_loss_mlp": 1.04433274, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.876317037835641, + "language_loss": 0.75619674, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77787805, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4840, + "time_per_iteration": 2.6259472370147705 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.02688372, + "balance_loss_mlp": 1.0469749, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.9955793585576265, + "language_loss": 0.60459495, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62632966, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4841, + "time_per_iteration": 2.5497686862945557 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.0215385, + "balance_loss_mlp": 1.0483892, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.7266193979009703, + "language_loss": 0.71366, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73538262, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 4842, + "time_per_iteration": 2.5817017555236816 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02188897, + "balance_loss_mlp": 1.04632473, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5806493177236067, + "language_loss": 0.72846174, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.7501446, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.85546875, + "step": 4843, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.0223223, + "balance_loss_mlp": 1.04598284, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.0237546438656393, + "language_loss": 0.5840022, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60570586, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4844, + "time_per_iteration": 3.9377825260162354 + }, + { + "auxiliary_loss_clip": 0.01136792, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.02518439, + "balance_loss_mlp": 1.04942751, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.1502970284536493, + "language_loss": 0.86360186, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88539243, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4845, + "time_per_iteration": 5.415091276168823 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.04779911, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7275133095664568, + "language_loss": 0.66684157, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.68870938, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4846, + "time_per_iteration": 2.495901584625244 + }, + { + "auxiliary_loss_clip": 0.01131044, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02700055, + "balance_loss_mlp": 1.04691291, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.7117272730106567, + "language_loss": 0.70501876, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72675455, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4847, + "time_per_iteration": 2.50537109375 + }, + { + "auxiliary_loss_clip": 0.01131589, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.01918232, + "balance_loss_mlp": 1.04682243, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 2.14972579950547, + "language_loss": 0.73494464, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75661629, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 4848, + "time_per_iteration": 2.506683111190796 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.02569222, + "balance_loss_mlp": 1.04670119, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.106691725132959, + "language_loss": 0.76689458, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.78864431, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4849, + "time_per_iteration": 2.475512742996216 + }, + { + "auxiliary_loss_clip": 0.01134647, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.02732718, + "balance_loss_mlp": 1.04683709, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7698868684834754, + "language_loss": 0.78437513, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80615485, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4850, + "time_per_iteration": 2.4774062633514404 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02231026, + "balance_loss_mlp": 1.04620552, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.7416717517415665, + "language_loss": 0.75775445, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77944064, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4851, + "time_per_iteration": 2.4973719120025635 + }, + { + "auxiliary_loss_clip": 0.01130818, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.03071558, + "balance_loss_mlp": 1.04819655, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.378687766604426, + "language_loss": 0.77111661, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79287988, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 4852, + "time_per_iteration": 2.5339767932891846 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.0296402, + "balance_loss_mlp": 1.04735672, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5345579183576068, + "language_loss": 0.78385615, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80563664, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4853, + "time_per_iteration": 2.511125087738037 + }, + { + "auxiliary_loss_clip": 0.0113401, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.04668474, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.984154109703724, + "language_loss": 0.87946999, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90119541, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4854, + "time_per_iteration": 2.4654700756073 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.03352284, + "balance_loss_mlp": 1.04678071, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.8357290509449282, + "language_loss": 0.86585724, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88770819, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.85546875, + "step": 4855, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01043355, + "auxiliary_loss_mlp": 0.01004722, + "balance_loss_clip": 1.00283837, + "balance_loss_mlp": 1.01374364, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8090362112321295, + "language_loss": 0.60199535, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6224761, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4856, + "time_per_iteration": 3.164905309677124 + }, + { + "auxiliary_loss_clip": 0.01130578, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.04626632, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 2.394144218040463, + "language_loss": 0.67995465, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70166028, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4857, + "time_per_iteration": 2.4615678787231445 + }, + { + "auxiliary_loss_clip": 0.01129998, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.0210768, + "balance_loss_mlp": 1.04613733, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 2.1807634638236566, + "language_loss": 0.83958411, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86124158, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4858, + "time_per_iteration": 2.561347723007202 + }, + { + "auxiliary_loss_clip": 0.01131346, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.02478647, + "balance_loss_mlp": 1.04746854, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0714117361066298, + "language_loss": 0.77547097, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79718083, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4859, + "time_per_iteration": 2.4801361560821533 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02764332, + "balance_loss_mlp": 1.04424477, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.0548529873010564, + "language_loss": 0.68948561, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71125209, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4860, + "time_per_iteration": 2.531022071838379 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.0267868, + "balance_loss_mlp": 1.04821134, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.771004145303475, + "language_loss": 0.75952631, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78123146, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.81640625, + "step": 4861, + "time_per_iteration": 2.619257926940918 + }, + { + "auxiliary_loss_clip": 0.01129568, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.03350759, + "balance_loss_mlp": 1.04631817, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.7230129115334698, + "language_loss": 0.91648388, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93826073, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4862, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.02336144, + "balance_loss_mlp": 1.04544663, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.061794510539927, + "language_loss": 0.73736131, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75904131, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 4863, + "time_per_iteration": 2.4478728771209717 + }, + { + "auxiliary_loss_clip": 0.01125934, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.02474487, + "balance_loss_mlp": 1.04584527, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.6779515608592832, + "language_loss": 0.78057373, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80222106, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 4864, + "time_per_iteration": 2.487544059753418 + }, + { + "auxiliary_loss_clip": 0.0113348, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.02927482, + "balance_loss_mlp": 1.04763806, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 2.699456605470703, + "language_loss": 0.81919956, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8409909, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4865, + "time_per_iteration": 2.486553192138672 + }, + { + "auxiliary_loss_clip": 0.01130825, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02523327, + "balance_loss_mlp": 1.04592669, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.8393536761495908, + "language_loss": 0.85281575, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87453377, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4866, + "time_per_iteration": 2.4981276988983154 + }, + { + "auxiliary_loss_clip": 0.01124877, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.02263868, + "balance_loss_mlp": 1.04323506, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.627734535935432, + "language_loss": 0.755858, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77747923, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 4867, + "time_per_iteration": 2.5813703536987305 + }, + { + "auxiliary_loss_clip": 0.01129928, + "auxiliary_loss_mlp": 0.01049325, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.04375887, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 4.179606236398783, + "language_loss": 0.73403615, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75582874, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4868, + "time_per_iteration": 2.48374342918396 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.01857829, + "balance_loss_mlp": 1.04520726, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3015957921166281, + "language_loss": 0.74555755, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76717293, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4869, + "time_per_iteration": 2.458434820175171 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.02390289, + "balance_loss_mlp": 1.04639244, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.465584897312906, + "language_loss": 0.76539874, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78709823, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4870, + "time_per_iteration": 2.5194873809814453 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.03040564, + "balance_loss_mlp": 1.04584765, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.259080578005736, + "language_loss": 0.67315602, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69495422, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4871, + "time_per_iteration": 2.4556169509887695 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.04283524, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.8081222369362746, + "language_loss": 0.76924586, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79089642, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4872, + "time_per_iteration": 2.575421094894409 + }, + { + "auxiliary_loss_clip": 0.01131072, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.0271883, + "balance_loss_mlp": 1.04527128, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.2968152323379347, + "language_loss": 0.72835052, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75009787, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4873, + "time_per_iteration": 2.4370815753936768 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02518392, + "balance_loss_mlp": 1.04519463, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8384173868300016, + "language_loss": 0.77871835, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80046785, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4874, + "time_per_iteration": 2.512613534927368 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.02962041, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.084283832751276, + "language_loss": 0.77047002, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79228717, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4875, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.04909277, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6806867883636405, + "language_loss": 0.69183826, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71354383, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4876, + "time_per_iteration": 2.4764888286590576 + }, + { + "auxiliary_loss_clip": 0.01128897, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.03027201, + "balance_loss_mlp": 1.04482532, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.3621737524413913, + "language_loss": 0.8195532, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84129333, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4877, + "time_per_iteration": 2.4738340377807617 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02765405, + "balance_loss_mlp": 1.04704273, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.8654341954981455, + "language_loss": 0.67843962, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70020854, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 4878, + "time_per_iteration": 2.4606332778930664 + }, + { + "auxiliary_loss_clip": 0.01130502, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02624929, + "balance_loss_mlp": 1.04562759, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8286598598322423, + "language_loss": 0.7351383, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.7568571, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 4879, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.02881706, + "balance_loss_mlp": 1.0484302, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.0641755158914634, + "language_loss": 0.65864384, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68047822, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4880, + "time_per_iteration": 2.5785939693450928 + }, + { + "auxiliary_loss_clip": 0.01130839, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.02962136, + "balance_loss_mlp": 1.04453218, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.157512175932489, + "language_loss": 0.70518327, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72694737, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4881, + "time_per_iteration": 2.4913742542266846 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.02336192, + "balance_loss_mlp": 1.0471015, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 2.112776228996839, + "language_loss": 0.83907056, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86079299, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4882, + "time_per_iteration": 2.4955010414123535 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.02495086, + "balance_loss_mlp": 1.0470233, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.198383771985309, + "language_loss": 0.71811014, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73988116, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4883, + "time_per_iteration": 2.474574089050293 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02225327, + "balance_loss_mlp": 1.04580843, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 3.497082861184858, + "language_loss": 0.92629534, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94800568, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4884, + "time_per_iteration": 2.4947426319122314 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.03032374, + "balance_loss_mlp": 1.05094171, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.2315982417854876, + "language_loss": 0.73729408, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75913155, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4885, + "time_per_iteration": 2.5076494216918945 + }, + { + "auxiliary_loss_clip": 0.01132864, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.03185511, + "balance_loss_mlp": 1.0468272, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.1964333946604135, + "language_loss": 0.85011208, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87192315, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4886, + "time_per_iteration": 3.911407232284546 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.02624702, + "balance_loss_mlp": 1.04678059, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.1393217933297657, + "language_loss": 0.77027792, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79204369, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.88671875, + "step": 4887, + "time_per_iteration": 3.906132936477661 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.04697633, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1952396364021536, + "language_loss": 0.79558414, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.8174094, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 4888, + "time_per_iteration": 2.4338221549987793 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.0173831, + "balance_loss_mlp": 1.04529762, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.4299668586503376, + "language_loss": 0.55301261, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57464457, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4889, + "time_per_iteration": 2.637645959854126 + }, + { + "auxiliary_loss_clip": 0.01134449, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.02314413, + "balance_loss_mlp": 1.0465076, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.9477461279926194, + "language_loss": 0.84309214, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86483455, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4890, + "time_per_iteration": 2.445218801498413 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.04780436, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.9951401673219091, + "language_loss": 0.72357798, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74532759, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4891, + "time_per_iteration": 2.434298515319824 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.02910721, + "balance_loss_mlp": 1.04683042, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.9834299238301316, + "language_loss": 0.77230573, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79410005, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4892, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01130172, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.04514182, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.7053650125053033, + "language_loss": 0.7846024, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80626166, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4893, + "time_per_iteration": 2.505946159362793 + }, + { + "auxiliary_loss_clip": 0.01132333, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.03252435, + "balance_loss_mlp": 1.04651928, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8389301673785101, + "language_loss": 0.85052156, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87232608, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4894, + "time_per_iteration": 2.5221872329711914 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.02120304, + "balance_loss_mlp": 1.04568195, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.595597690193387, + "language_loss": 0.9027828, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92447418, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4895, + "time_per_iteration": 2.4466798305511475 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.02434874, + "balance_loss_mlp": 1.04720199, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 3.001231056574592, + "language_loss": 0.86597103, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88773751, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4896, + "time_per_iteration": 2.459611654281616 + }, + { + "auxiliary_loss_clip": 0.01134294, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.03402412, + "balance_loss_mlp": 1.04802299, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 2.652800133974417, + "language_loss": 0.73196733, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75381136, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4897, + "time_per_iteration": 2.4981348514556885 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02546394, + "balance_loss_mlp": 1.0458895, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.867954953207583, + "language_loss": 0.73798919, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75977707, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4898, + "time_per_iteration": 2.439952850341797 + }, + { + "auxiliary_loss_clip": 0.01142949, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.02471972, + "balance_loss_mlp": 1.05136585, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.6754375338801477, + "language_loss": 0.70309317, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72495157, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9140625, + "step": 4899, + "time_per_iteration": 2.4757347106933594 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.02458405, + "balance_loss_mlp": 1.0484879, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.9063479453414416, + "language_loss": 0.79007781, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.8118515, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4900, + "time_per_iteration": 2.50555419921875 + }, + { + "auxiliary_loss_clip": 0.01131673, + "auxiliary_loss_mlp": 0.01042831, + "balance_loss_clip": 1.02720261, + "balance_loss_mlp": 1.04425764, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.709443882500664, + "language_loss": 0.80718857, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.8289336, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4901, + "time_per_iteration": 2.481768846511841 + }, + { + "auxiliary_loss_clip": 0.01127885, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.02062666, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 1.9567596526300628, + "language_loss": 0.57923675, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60086584, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4902, + "time_per_iteration": 2.491337299346924 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.02416682, + "balance_loss_mlp": 1.045946, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6713771638909152, + "language_loss": 0.75298065, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77468932, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4903, + "time_per_iteration": 2.4884400367736816 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.02560806, + "balance_loss_mlp": 1.04630995, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.8012466742437707, + "language_loss": 0.6254617, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64721614, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4904, + "time_per_iteration": 2.5288941860198975 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.02252424, + "balance_loss_mlp": 1.04603219, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.5173763027357385, + "language_loss": 0.7301079, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75183994, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 4905, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02577305, + "balance_loss_mlp": 1.0456897, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6701950888056076, + "language_loss": 0.81584871, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.8375839, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4906, + "time_per_iteration": 2.473604202270508 + }, + { + "auxiliary_loss_clip": 0.01128251, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.04443395, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9494272179492087, + "language_loss": 0.87158448, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89320892, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4907, + "time_per_iteration": 2.490842819213867 + }, + { + "auxiliary_loss_clip": 0.01135464, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.02623653, + "balance_loss_mlp": 1.04758191, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.3387997458884833, + "language_loss": 0.81563503, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83741152, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4908, + "time_per_iteration": 2.4586410522460938 + }, + { + "auxiliary_loss_clip": 0.01054339, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.03970814, + "balance_loss_mlp": 1.0157342, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7811313355607663, + "language_loss": 0.57214808, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59311211, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.38671875, + "step": 4909, + "time_per_iteration": 2.9739394187927246 + }, + { + "auxiliary_loss_clip": 0.01136234, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.02340245, + "balance_loss_mlp": 1.05156505, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.44395719574742, + "language_loss": 0.86585498, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88760138, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4910, + "time_per_iteration": 2.4779117107391357 + }, + { + "auxiliary_loss_clip": 0.01126914, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02551222, + "balance_loss_mlp": 1.04549575, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8630755123750513, + "language_loss": 0.72632295, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.74799585, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 4911, + "time_per_iteration": 2.4959700107574463 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02239108, + "balance_loss_mlp": 1.04823601, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.774615067737937, + "language_loss": 0.8988539, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92053854, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4912, + "time_per_iteration": 2.4532997608184814 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.03022075, + "balance_loss_mlp": 1.04712319, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.458226475428025, + "language_loss": 0.83448595, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85626793, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 4913, + "time_per_iteration": 2.515580654144287 + }, + { + "auxiliary_loss_clip": 0.01129704, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.02385521, + "balance_loss_mlp": 1.0438993, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.6602062940724112, + "language_loss": 0.77029538, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79198408, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4914, + "time_per_iteration": 2.457158088684082 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.02302349, + "balance_loss_mlp": 1.04553497, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.9027466376674422, + "language_loss": 0.81550008, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83717597, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 4915, + "time_per_iteration": 2.6669511795043945 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.02215445, + "balance_loss_mlp": 1.0477066, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.9544787473030132, + "language_loss": 0.84415555, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8658756, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4916, + "time_per_iteration": 2.473867416381836 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.01874673, + "balance_loss_mlp": 1.04477537, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 3.5737730841451225, + "language_loss": 0.69611692, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71773368, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4917, + "time_per_iteration": 2.5078670978546143 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.02550471, + "balance_loss_mlp": 1.04932523, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1750223310256507, + "language_loss": 0.90840054, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93015605, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 4918, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.02504885, + "balance_loss_mlp": 1.04929781, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9164121886210477, + "language_loss": 0.72399461, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74574864, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4919, + "time_per_iteration": 2.5533134937286377 + }, + { + "auxiliary_loss_clip": 0.01130751, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.02661633, + "balance_loss_mlp": 1.04704165, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7148380002351797, + "language_loss": 0.75758076, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77931356, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4920, + "time_per_iteration": 2.4288933277130127 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_clip": 1.02953875, + "balance_loss_mlp": 1.05214858, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.2591712667141075, + "language_loss": 0.68327153, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.7051155, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4921, + "time_per_iteration": 2.5978074073791504 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02580023, + "balance_loss_mlp": 1.04953861, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8781945072150448, + "language_loss": 0.74265885, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76444781, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4922, + "time_per_iteration": 2.4518954753875732 + }, + { + "auxiliary_loss_clip": 0.0113841, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.01932716, + "balance_loss_mlp": 1.04900336, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 2.178664992776949, + "language_loss": 0.76679426, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78853875, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4923, + "time_per_iteration": 2.5565848350524902 + }, + { + "auxiliary_loss_clip": 0.0113218, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.02359807, + "balance_loss_mlp": 1.04730439, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.616043641477794, + "language_loss": 0.86307567, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88479245, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4924, + "time_per_iteration": 2.5081374645233154 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.04767513, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.3983202546472309, + "language_loss": 0.8180936, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.83976275, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4925, + "time_per_iteration": 2.5473146438598633 + }, + { + "auxiliary_loss_clip": 0.01132696, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02175128, + "balance_loss_mlp": 1.04893184, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5479827750219735, + "language_loss": 0.85168374, + "learning_rate": 3.301729463727452e-06, + "loss": 0.87337816, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4926, + "time_per_iteration": 2.4603803157806396 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.02391791, + "balance_loss_mlp": 1.04658842, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.1014080951069913, + "language_loss": 0.85908806, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88081133, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4927, + "time_per_iteration": 2.4724504947662354 + }, + { + "auxiliary_loss_clip": 0.01129564, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.02434492, + "balance_loss_mlp": 1.04636681, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.8730507383843338, + "language_loss": 0.80967462, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83136487, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4928, + "time_per_iteration": 5.46146297454834 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.023211, + "balance_loss_mlp": 1.04749835, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.002605920988437, + "language_loss": 0.72472513, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7465297, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.91015625, + "step": 4929, + "time_per_iteration": 2.4938502311706543 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02631676, + "balance_loss_mlp": 1.04823208, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.429634231323073, + "language_loss": 0.72424346, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74603939, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8828125, + "step": 4930, + "time_per_iteration": 2.486492156982422 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01002568, + "balance_loss_clip": 1.00047004, + "balance_loss_mlp": 1.0186131, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8134562784526058, + "language_loss": 0.60710716, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.627729, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.41015625, + "step": 4931, + "time_per_iteration": 3.002444267272949 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 0.99926931, + "balance_loss_mlp": 1.01823413, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7413672345708404, + "language_loss": 0.52383232, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54443383, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.40625, + "step": 4932, + "time_per_iteration": 2.974777936935425 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 1.04449248, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 3.155895790893495, + "language_loss": 0.81622797, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83789599, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4933, + "time_per_iteration": 2.518906593322754 + }, + { + "auxiliary_loss_clip": 0.0112788, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.01854897, + "balance_loss_mlp": 1.04651821, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.671865304120784, + "language_loss": 0.75257647, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77419287, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4934, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.02837849, + "balance_loss_mlp": 1.04699588, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.752558919138232, + "language_loss": 0.62510157, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64690268, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4935, + "time_per_iteration": 2.462982654571533 + }, + { + "auxiliary_loss_clip": 0.01129673, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04613912, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4993711353436514, + "language_loss": 0.79789758, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81961262, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 4936, + "time_per_iteration": 2.5267326831817627 + }, + { + "auxiliary_loss_clip": 0.01132719, + "auxiliary_loss_mlp": 0.01045272, + "balance_loss_clip": 1.02854681, + "balance_loss_mlp": 1.04649782, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8807271027259396, + "language_loss": 0.74074632, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76252627, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4937, + "time_per_iteration": 2.607790946960449 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.0224793, + "balance_loss_mlp": 1.04839468, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.629632810423829, + "language_loss": 0.7804476, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80216354, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 4938, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01134705, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.02543736, + "balance_loss_mlp": 1.04814208, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.041677851061636, + "language_loss": 0.77017808, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79193771, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4939, + "time_per_iteration": 2.453615427017212 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.04958081, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5588161926919628, + "language_loss": 0.78206903, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80380619, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4940, + "time_per_iteration": 2.5125393867492676 + }, + { + "auxiliary_loss_clip": 0.01133351, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.0233798, + "balance_loss_mlp": 1.04633832, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 3.9307439231373884, + "language_loss": 0.75487554, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77661633, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4941, + "time_per_iteration": 2.5308516025543213 + }, + { + "auxiliary_loss_clip": 0.0113684, + "auxiliary_loss_mlp": 0.01050296, + "balance_loss_clip": 1.03295147, + "balance_loss_mlp": 1.04803753, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.557458362521145, + "language_loss": 0.73998737, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7618587, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.890625, + "step": 4942, + "time_per_iteration": 2.6214303970336914 + }, + { + "auxiliary_loss_clip": 0.0113696, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.03017855, + "balance_loss_mlp": 1.04778039, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.997792424787015, + "language_loss": 0.70484138, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72668344, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4943, + "time_per_iteration": 2.533313751220703 + }, + { + "auxiliary_loss_clip": 0.01137748, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.02657795, + "balance_loss_mlp": 1.04838014, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 1.9523342898428475, + "language_loss": 0.80111414, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82292169, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 4944, + "time_per_iteration": 2.464364528656006 + }, + { + "auxiliary_loss_clip": 0.01129992, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.02979231, + "balance_loss_mlp": 1.04640603, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.1633352367153105, + "language_loss": 0.83451837, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85626531, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4945, + "time_per_iteration": 2.4981510639190674 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.02353168, + "balance_loss_mlp": 1.04738569, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.2158088930062747, + "language_loss": 0.66624904, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68795776, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4946, + "time_per_iteration": 2.526228666305542 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.01880383, + "balance_loss_mlp": 1.0509392, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.7941079108563611, + "language_loss": 0.73766255, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75938767, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4947, + "time_per_iteration": 2.5380265712738037 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_clip": 1.02774215, + "balance_loss_mlp": 1.04653597, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.40735653244717, + "language_loss": 0.7330308, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75483221, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4948, + "time_per_iteration": 2.5096492767333984 + }, + { + "auxiliary_loss_clip": 0.01129361, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01824629, + "balance_loss_mlp": 1.04442465, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0973131899278825, + "language_loss": 0.84031421, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86194396, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4949, + "time_per_iteration": 2.4650402069091797 + }, + { + "auxiliary_loss_clip": 0.01129505, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02019429, + "balance_loss_mlp": 1.04509461, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.77267818675948, + "language_loss": 0.71322602, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4950, + "time_per_iteration": 2.491163969039917 + }, + { + "auxiliary_loss_clip": 0.01127031, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.02792883, + "balance_loss_mlp": 1.04543924, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 1.7996518465212372, + "language_loss": 0.82192945, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84363329, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4951, + "time_per_iteration": 2.5001299381256104 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01936841, + "balance_loss_mlp": 1.04211378, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.6816702718299763, + "language_loss": 0.73421168, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.75584191, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 4952, + "time_per_iteration": 2.4888715744018555 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.04677546, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.7548041314188605, + "language_loss": 0.83702904, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85876799, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4953, + "time_per_iteration": 2.486267566680908 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02330005, + "balance_loss_mlp": 1.04566419, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.078619348093555, + "language_loss": 0.74560732, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.7673102, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4954, + "time_per_iteration": 2.454066276550293 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.0450201, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9786208165821892, + "language_loss": 0.75643009, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77808911, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4955, + "time_per_iteration": 2.487297773361206 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.01980329, + "balance_loss_mlp": 1.04604173, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 3.347495877937089, + "language_loss": 0.72235912, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74404275, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4956, + "time_per_iteration": 2.453639507293701 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.02737164, + "balance_loss_mlp": 1.04482651, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6786835957024704, + "language_loss": 0.79504669, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81683344, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4957, + "time_per_iteration": 2.4680192470550537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.04692602, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5505958112028584, + "language_loss": 0.70515305, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.7268889, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4958, + "time_per_iteration": 2.463550090789795 + }, + { + "auxiliary_loss_clip": 0.01130665, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.02685153, + "balance_loss_mlp": 1.04660892, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.6483091075690746, + "language_loss": 0.78709656, + "learning_rate": 3.291945317082743e-06, + "loss": 0.8088336, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4959, + "time_per_iteration": 2.4896273612976074 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.03010738, + "balance_loss_mlp": 1.04477429, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.8058675414038505, + "language_loss": 0.79814601, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81990159, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4960, + "time_per_iteration": 2.4524307250976562 + }, + { + "auxiliary_loss_clip": 0.01133353, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04504156, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.8105894923901418, + "language_loss": 0.73709917, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75890362, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4961, + "time_per_iteration": 2.463160991668701 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.0466218, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.946317435202559, + "language_loss": 0.62041843, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64212298, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 4962, + "time_per_iteration": 2.4734280109405518 + }, + { + "auxiliary_loss_clip": 0.0113099, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02143037, + "balance_loss_mlp": 1.04580986, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.708438122809617, + "language_loss": 0.83075964, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85244966, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 4963, + "time_per_iteration": 2.4676647186279297 + }, + { + "auxiliary_loss_clip": 0.01132139, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.02108073, + "balance_loss_mlp": 1.04811728, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.8539744131594924, + "language_loss": 0.66537225, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68706906, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 4964, + "time_per_iteration": 2.425261974334717 + }, + { + "auxiliary_loss_clip": 0.01128116, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.01994288, + "balance_loss_mlp": 1.04498291, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.6142193033036512, + "language_loss": 0.70836121, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.72998774, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83203125, + "step": 4965, + "time_per_iteration": 2.468221664428711 + }, + { + "auxiliary_loss_clip": 0.01137695, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_clip": 1.0309124, + "balance_loss_mlp": 1.05098724, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.501073720290292, + "language_loss": 0.66185117, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68369937, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4966, + "time_per_iteration": 2.479327440261841 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.02595615, + "balance_loss_mlp": 1.04869342, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.7651343279829215, + "language_loss": 0.74186444, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76362395, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4967, + "time_per_iteration": 2.4752163887023926 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02035177, + "balance_loss_mlp": 1.04422212, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9700123684688966, + "language_loss": 0.71222222, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73386747, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8515625, + "step": 4968, + "time_per_iteration": 2.448028564453125 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.01884651, + "balance_loss_mlp": 1.04596853, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.0898000655075752, + "language_loss": 0.77127141, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79292667, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 4969, + "time_per_iteration": 2.5737853050231934 + }, + { + "auxiliary_loss_clip": 0.01131698, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.022578, + "balance_loss_mlp": 1.04641569, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.5683816051841135, + "language_loss": 0.69798505, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.71967924, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4970, + "time_per_iteration": 5.428143501281738 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04582572, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.0403310419369314, + "language_loss": 0.85269564, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.8745082, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4971, + "time_per_iteration": 2.4557158946990967 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.02905178, + "balance_loss_mlp": 1.0487361, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.8300460221108372, + "language_loss": 0.79116535, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81292605, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4972, + "time_per_iteration": 2.492119550704956 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.02457476, + "balance_loss_mlp": 1.0491786, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.9080397703774756, + "language_loss": 0.85019803, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87194014, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4973, + "time_per_iteration": 2.4409923553466797 + }, + { + "auxiliary_loss_clip": 0.01128243, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.04866779, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5302170897903997, + "language_loss": 0.77397263, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79562438, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 4974, + "time_per_iteration": 2.4786176681518555 + }, + { + "auxiliary_loss_clip": 0.01135129, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02723432, + "balance_loss_mlp": 1.04905188, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 2.0911748108299015, + "language_loss": 0.72264957, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74442089, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 4975, + "time_per_iteration": 2.5267655849456787 + }, + { + "auxiliary_loss_clip": 0.01133427, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02204621, + "balance_loss_mlp": 1.0501368, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 4.957635138610608, + "language_loss": 0.76028466, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78199953, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 4976, + "time_per_iteration": 2.46476149559021 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01039669, + "balance_loss_clip": 1.02464914, + "balance_loss_mlp": 1.04786563, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.141179611311424, + "language_loss": 0.86060619, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88231456, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 4977, + "time_per_iteration": 2.4342682361602783 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.02605033, + "balance_loss_mlp": 1.0510987, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6147948075287948, + "language_loss": 0.68286109, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7046386, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4978, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.01138133, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.053123, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.9781984123500023, + "language_loss": 0.7654568, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78720796, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4979, + "time_per_iteration": 2.4865188598632812 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.02265859, + "balance_loss_mlp": 1.04520524, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7578947600277828, + "language_loss": 0.68300819, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70469534, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4980, + "time_per_iteration": 2.6137757301330566 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02396405, + "balance_loss_mlp": 1.05068171, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.9242198828448243, + "language_loss": 0.73239923, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75411171, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 4981, + "time_per_iteration": 2.5342931747436523 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.02227712, + "balance_loss_mlp": 1.04691803, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.470312251429405, + "language_loss": 0.86429024, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8860175, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4982, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.01138194, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04975057, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.2481661066872904, + "language_loss": 0.86378068, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88557541, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4983, + "time_per_iteration": 2.4477322101593018 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.02577138, + "balance_loss_mlp": 1.0483377, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.8474343514891325, + "language_loss": 0.78286207, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80460417, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4984, + "time_per_iteration": 2.490079402923584 + }, + { + "auxiliary_loss_clip": 0.01136807, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.05052662, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.555514289558953, + "language_loss": 0.78418988, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80601943, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4985, + "time_per_iteration": 2.5188379287719727 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.02115583, + "balance_loss_mlp": 1.05010915, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 3.8074401298215905, + "language_loss": 0.72157449, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74333715, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4986, + "time_per_iteration": 2.7730660438537598 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.02523577, + "balance_loss_mlp": 1.04813981, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.7357810931981628, + "language_loss": 0.73332191, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75509989, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4987, + "time_per_iteration": 2.4857406616210938 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02162147, + "balance_loss_mlp": 1.04787469, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 2.6184059112472817, + "language_loss": 0.80173379, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82341629, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4988, + "time_per_iteration": 2.477614641189575 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.02874756, + "balance_loss_mlp": 1.04897678, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.615528223125509, + "language_loss": 0.70302641, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72481132, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4989, + "time_per_iteration": 2.4942874908447266 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02382946, + "balance_loss_mlp": 1.05045295, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 2.0547136882256654, + "language_loss": 0.85636222, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87814367, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87109375, + "step": 4990, + "time_per_iteration": 2.455134391784668 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.04822564, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.6641511475566748, + "language_loss": 0.67125142, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69296378, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4991, + "time_per_iteration": 2.4928019046783447 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.02165437, + "balance_loss_mlp": 1.0479908, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.5868946812173, + "language_loss": 0.78707612, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80883896, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4992, + "time_per_iteration": 2.5030534267425537 + }, + { + "auxiliary_loss_clip": 0.01135049, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04976213, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.8035914694742925, + "language_loss": 0.824085, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84579718, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4993, + "time_per_iteration": 2.475588083267212 + }, + { + "auxiliary_loss_clip": 0.01138101, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.02522802, + "balance_loss_mlp": 1.04808736, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.0505124462232898, + "language_loss": 0.85850489, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88031358, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4994, + "time_per_iteration": 2.47881817817688 + }, + { + "auxiliary_loss_clip": 0.0113641, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.05017769, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5183999234373478, + "language_loss": 0.8111707, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83289921, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4995, + "time_per_iteration": 2.5481183528900146 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.05089867, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7074459415862762, + "language_loss": 0.67098773, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69274354, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 4996, + "time_per_iteration": 2.6810193061828613 + }, + { + "auxiliary_loss_clip": 0.01134671, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.02392912, + "balance_loss_mlp": 1.04883564, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7509046873587113, + "language_loss": 0.75304276, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77479029, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4997, + "time_per_iteration": 2.472226858139038 + }, + { + "auxiliary_loss_clip": 0.01132042, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.02372646, + "balance_loss_mlp": 1.04816282, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.9401125864941864, + "language_loss": 0.77664721, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79835731, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83984375, + "step": 4998, + "time_per_iteration": 2.495087146759033 + }, + { + "auxiliary_loss_clip": 0.01129805, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02469468, + "balance_loss_mlp": 1.04812598, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5996751316274151, + "language_loss": 0.73429006, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75598228, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 4999, + "time_per_iteration": 2.491774082183838 + }, + { + "auxiliary_loss_clip": 0.01134839, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.02250576, + "balance_loss_mlp": 1.0498935, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6017930279588588, + "language_loss": 0.756015, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77774298, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5000, + "time_per_iteration": 2.572003126144409 + }, + { + "auxiliary_loss_clip": 0.01131295, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.02329731, + "balance_loss_mlp": 1.04886353, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.977226227337592, + "language_loss": 0.81681275, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83849311, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.82421875, + "step": 5001, + "time_per_iteration": 2.4240355491638184 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02538288, + "balance_loss_mlp": 1.05103087, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5846802536013025, + "language_loss": 0.8056432, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82741892, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 5002, + "time_per_iteration": 2.5848264694213867 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.02240372, + "balance_loss_mlp": 1.04907179, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.6918091030667293, + "language_loss": 0.71209854, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73384899, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 5003, + "time_per_iteration": 2.4672186374664307 + }, + { + "auxiliary_loss_clip": 0.01136595, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.05050564, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.8725932973877313, + "language_loss": 0.70613277, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72788501, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5004, + "time_per_iteration": 2.579941511154175 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.04977477, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.8634075898885767, + "language_loss": 0.81359464, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83539397, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5005, + "time_per_iteration": 2.4043233394622803 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04792035, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.9232502202927266, + "language_loss": 0.74906754, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77080745, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5006, + "time_per_iteration": 2.5169718265533447 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02712977, + "balance_loss_mlp": 1.04745531, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.157802275476472, + "language_loss": 0.70810544, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72982514, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5007, + "time_per_iteration": 2.500135898590088 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.02453065, + "balance_loss_mlp": 1.04947257, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.301214894203853, + "language_loss": 0.76435697, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78609765, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5008, + "time_per_iteration": 2.5071120262145996 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.02464485, + "balance_loss_mlp": 1.04823518, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.7973688674758703, + "language_loss": 0.84830707, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87002409, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5009, + "time_per_iteration": 2.531024694442749 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.0211432, + "balance_loss_mlp": 1.04830122, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9976209282841157, + "language_loss": 0.83813334, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85986781, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 5010, + "time_per_iteration": 2.4690375328063965 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02003646, + "balance_loss_mlp": 1.04724431, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 1.9597018241269177, + "language_loss": 0.85013181, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87178147, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5011, + "time_per_iteration": 2.501708745956421 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.02335644, + "balance_loss_mlp": 1.04754543, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.0524404295798013, + "language_loss": 0.71966654, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74139971, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5012, + "time_per_iteration": 3.979128360748291 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.0229032, + "balance_loss_mlp": 1.04721081, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 1.9997819947408795, + "language_loss": 0.87396109, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89565563, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 5013, + "time_per_iteration": 2.467177629470825 + }, + { + "auxiliary_loss_clip": 0.01136565, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.0200367, + "balance_loss_mlp": 1.04842985, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 3.4702040063697313, + "language_loss": 0.83367115, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85540557, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 5014, + "time_per_iteration": 2.4654901027679443 + }, + { + "auxiliary_loss_clip": 0.01128425, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.01896727, + "balance_loss_mlp": 1.0471499, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6346146355602116, + "language_loss": 0.68218327, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70380276, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5015, + "time_per_iteration": 2.4994328022003174 + }, + { + "auxiliary_loss_clip": 0.01132371, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.04864407, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.7110353723362635, + "language_loss": 0.74712509, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76883423, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5016, + "time_per_iteration": 2.5168755054473877 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 1.0498333, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.6963436015958502, + "language_loss": 0.65179884, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67351693, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5017, + "time_per_iteration": 2.543577194213867 + }, + { + "auxiliary_loss_clip": 0.01134511, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.02860379, + "balance_loss_mlp": 1.05030179, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.078433105892768, + "language_loss": 0.69045079, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71223348, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5018, + "time_per_iteration": 2.498060464859009 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.02772546, + "balance_loss_mlp": 1.04842138, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9198297669603306, + "language_loss": 0.78841144, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81011814, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5019, + "time_per_iteration": 2.4873573780059814 + }, + { + "auxiliary_loss_clip": 0.01134625, + "auxiliary_loss_mlp": 0.01041878, + "balance_loss_clip": 1.02695298, + "balance_loss_mlp": 1.05073094, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.24109756344656, + "language_loss": 0.69867152, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72043651, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5020, + "time_per_iteration": 2.493370532989502 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.04941368, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9013759847828555, + "language_loss": 0.78134364, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80310869, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8671875, + "step": 5021, + "time_per_iteration": 2.4670474529266357 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.01527357, + "balance_loss_mlp": 1.04964936, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.3821225807179696, + "language_loss": 0.76075405, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78238434, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5022, + "time_per_iteration": 2.4737884998321533 + }, + { + "auxiliary_loss_clip": 0.01133657, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.02791631, + "balance_loss_mlp": 1.04880631, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.7684005868111572, + "language_loss": 0.69896525, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72073108, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5023, + "time_per_iteration": 2.4453155994415283 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02673686, + "balance_loss_mlp": 1.04927671, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.0912728997662127, + "language_loss": 0.71588898, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73766768, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5024, + "time_per_iteration": 2.4998810291290283 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.04858792, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.6483742353836974, + "language_loss": 0.73955721, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76133543, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5025, + "time_per_iteration": 2.5167019367218018 + }, + { + "auxiliary_loss_clip": 0.0113244, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.02800322, + "balance_loss_mlp": 1.0518502, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.4799709397217862, + "language_loss": 0.67022824, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.6919747, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5026, + "time_per_iteration": 2.5326507091522217 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.02799106, + "balance_loss_mlp": 1.05083036, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.6876842646939136, + "language_loss": 0.85252607, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87429863, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5027, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01129327, + "auxiliary_loss_mlp": 0.01045705, + "balance_loss_clip": 1.03187656, + "balance_loss_mlp": 1.04739702, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.665552114762065, + "language_loss": 0.78757018, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80932051, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5028, + "time_per_iteration": 2.5677576065063477 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.0295043, + "balance_loss_mlp": 1.04922223, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 2.0260385179345346, + "language_loss": 0.76721144, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78898472, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.83203125, + "step": 5029, + "time_per_iteration": 2.611917734146118 + }, + { + "auxiliary_loss_clip": 0.01133028, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.02642775, + "balance_loss_mlp": 1.04855132, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.944959289407135, + "language_loss": 0.81868339, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84044701, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.84375, + "step": 5030, + "time_per_iteration": 2.605531930923462 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01045193, + "balance_loss_clip": 1.02982664, + "balance_loss_mlp": 1.04754734, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.748277903644489, + "language_loss": 0.69869608, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72047728, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 5031, + "time_per_iteration": 2.496833086013794 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.02103615, + "balance_loss_mlp": 1.04892659, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.8467264077922103, + "language_loss": 0.82302773, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84471118, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5032, + "time_per_iteration": 2.5062966346740723 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.02991903, + "balance_loss_mlp": 1.05332685, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.10117653020426, + "language_loss": 0.73383862, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75570583, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5033, + "time_per_iteration": 2.561467170715332 + }, + { + "auxiliary_loss_clip": 0.01130223, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03000593, + "balance_loss_mlp": 1.04782772, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.437497934350084, + "language_loss": 0.74057245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76232684, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5034, + "time_per_iteration": 2.511861801147461 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.04825819, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9672144407329994, + "language_loss": 0.71617639, + "learning_rate": 3.269209883493352e-06, + "loss": 0.73783064, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5035, + "time_per_iteration": 2.545917272567749 + }, + { + "auxiliary_loss_clip": 0.0113067, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01835537, + "balance_loss_mlp": 1.04876685, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.774174351542542, + "language_loss": 0.87232339, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89395267, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5036, + "time_per_iteration": 2.5197184085845947 + }, + { + "auxiliary_loss_clip": 0.01131426, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.02679288, + "balance_loss_mlp": 1.04866219, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.2121077897300134, + "language_loss": 0.77760899, + "learning_rate": 3.268607806688536e-06, + "loss": 0.7993536, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5037, + "time_per_iteration": 2.5372917652130127 + }, + { + "auxiliary_loss_clip": 0.01133533, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02603984, + "balance_loss_mlp": 1.04973745, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.4260021818478634, + "language_loss": 0.77920854, + "learning_rate": 3.268306696121816e-06, + "loss": 0.80095863, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5038, + "time_per_iteration": 2.4360761642456055 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.02073669, + "balance_loss_mlp": 1.04859674, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.8428508909689656, + "language_loss": 0.74134624, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76298141, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5039, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01129815, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.02426052, + "balance_loss_mlp": 1.05003977, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.8268154911840482, + "language_loss": 0.80263746, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82431436, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5040, + "time_per_iteration": 2.469822406768799 + }, + { + "auxiliary_loss_clip": 0.01131744, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.02227795, + "balance_loss_mlp": 1.05101466, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.5747579863116856, + "language_loss": 0.81914759, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8408277, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5041, + "time_per_iteration": 2.5240108966827393 + }, + { + "auxiliary_loss_clip": 0.01062494, + "auxiliary_loss_mlp": 0.01003022, + "balance_loss_clip": 1.00106716, + "balance_loss_mlp": 1.02890241, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7678965945904674, + "language_loss": 0.59521127, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61586642, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3359375, + "step": 5042, + "time_per_iteration": 3.169004440307617 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02271986, + "balance_loss_mlp": 1.05006266, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.6113397759888244, + "language_loss": 0.71136838, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73308468, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5043, + "time_per_iteration": 2.5217440128326416 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.04824769, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.6644669890018773, + "language_loss": 0.69351244, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71521056, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5044, + "time_per_iteration": 2.4741897583007812 + }, + { + "auxiliary_loss_clip": 0.01129908, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.02252388, + "balance_loss_mlp": 1.04823565, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3748845619029404, + "language_loss": 0.77210236, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79377484, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5045, + "time_per_iteration": 2.5023043155670166 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.02240646, + "balance_loss_mlp": 1.04892182, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.538768377317596, + "language_loss": 0.72444695, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74615347, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5046, + "time_per_iteration": 2.5163753032684326 + }, + { + "auxiliary_loss_clip": 0.01134062, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.04859519, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 3.2419373644374176, + "language_loss": 0.80737638, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82914352, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5047, + "time_per_iteration": 2.547245979309082 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02569556, + "balance_loss_mlp": 1.04871237, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.9357354539113198, + "language_loss": 0.72334075, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74505508, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5048, + "time_per_iteration": 2.494016170501709 + }, + { + "auxiliary_loss_clip": 0.01129755, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.02093613, + "balance_loss_mlp": 1.04574537, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.7731178616486785, + "language_loss": 0.75098324, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.7726388, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5049, + "time_per_iteration": 2.502979040145874 + }, + { + "auxiliary_loss_clip": 0.01133123, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.0226109, + "balance_loss_mlp": 1.04864645, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6762363098185904, + "language_loss": 0.8194561, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84116459, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5050, + "time_per_iteration": 2.5254666805267334 + }, + { + "auxiliary_loss_clip": 0.01132852, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.02299261, + "balance_loss_mlp": 1.04868484, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.8996577335854625, + "language_loss": 0.73712784, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7588439, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 5051, + "time_per_iteration": 2.511455774307251 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.01967764, + "balance_loss_mlp": 1.04650712, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.5939626777548828, + "language_loss": 0.76463652, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78628969, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5052, + "time_per_iteration": 2.478046417236328 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.04609728, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.8043694132732864, + "language_loss": 0.82780337, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84952009, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5053, + "time_per_iteration": 3.983353614807129 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.02661896, + "balance_loss_mlp": 1.04685903, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5364375285570075, + "language_loss": 0.70702368, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.72875059, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5054, + "time_per_iteration": 2.4379446506500244 + }, + { + "auxiliary_loss_clip": 0.01132155, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01768088, + "balance_loss_mlp": 1.04817367, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.8280069054430388, + "language_loss": 0.69543922, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71709108, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5055, + "time_per_iteration": 2.5247206687927246 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04682207, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.038005952710024, + "language_loss": 0.67502165, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69670427, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5056, + "time_per_iteration": 2.4767425060272217 + }, + { + "auxiliary_loss_clip": 0.01130078, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02434373, + "balance_loss_mlp": 1.04886115, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.5579435169669187, + "language_loss": 0.82500231, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84669387, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5057, + "time_per_iteration": 2.499105453491211 + }, + { + "auxiliary_loss_clip": 0.01129487, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02171588, + "balance_loss_mlp": 1.04686213, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 3.274565054245196, + "language_loss": 0.89040101, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91205966, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5058, + "time_per_iteration": 2.4966368675231934 + }, + { + "auxiliary_loss_clip": 0.01131903, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.04829955, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.2189779437975274, + "language_loss": 0.71709251, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73883629, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5059, + "time_per_iteration": 2.5429141521453857 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.0233928, + "balance_loss_mlp": 1.04720807, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 10.158939103063299, + "language_loss": 0.73069966, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75238669, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5060, + "time_per_iteration": 2.529862403869629 + }, + { + "auxiliary_loss_clip": 0.01132671, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.01979291, + "balance_loss_mlp": 1.04885316, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.8510962431794071, + "language_loss": 0.76926744, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79093957, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5061, + "time_per_iteration": 2.496739149093628 + }, + { + "auxiliary_loss_clip": 0.01138048, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.02312136, + "balance_loss_mlp": 1.0527482, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.264413063412747, + "language_loss": 0.82064837, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84242392, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5062, + "time_per_iteration": 2.476290702819824 + }, + { + "auxiliary_loss_clip": 0.01128914, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.04721808, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.7072945635391377, + "language_loss": 0.74737656, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76899219, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5063, + "time_per_iteration": 2.5384082794189453 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.0214901, + "balance_loss_mlp": 1.04908288, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.8176932093217915, + "language_loss": 0.84120226, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86290407, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83203125, + "step": 5064, + "time_per_iteration": 2.5108115673065186 + }, + { + "auxiliary_loss_clip": 0.01131651, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.01888871, + "balance_loss_mlp": 1.04751444, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.7759562417820063, + "language_loss": 0.75990027, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78156507, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 5065, + "time_per_iteration": 2.5061376094818115 + }, + { + "auxiliary_loss_clip": 0.01133071, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02470005, + "balance_loss_mlp": 1.04716659, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 2.0133457948817406, + "language_loss": 0.62271762, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64445394, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5066, + "time_per_iteration": 2.6000661849975586 + }, + { + "auxiliary_loss_clip": 0.01140413, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_clip": 1.03385913, + "balance_loss_mlp": 1.05344141, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.7828452375691122, + "language_loss": 0.82887459, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85077155, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5067, + "time_per_iteration": 2.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02829516, + "balance_loss_mlp": 1.04839194, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.0779895110277535, + "language_loss": 0.62978256, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65152222, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5068, + "time_per_iteration": 2.4957847595214844 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01633286, + "balance_loss_mlp": 1.04544926, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.6700683770947133, + "language_loss": 0.75058538, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77217996, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5069, + "time_per_iteration": 2.487473964691162 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01043362, + "balance_loss_clip": 1.02798414, + "balance_loss_mlp": 1.04746199, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.839652658151057, + "language_loss": 0.75732648, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7790432, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5070, + "time_per_iteration": 2.500335216522217 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.04640067, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.7318177446844936, + "language_loss": 0.81738281, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83910567, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 5071, + "time_per_iteration": 2.5726318359375 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.02507651, + "balance_loss_mlp": 1.04737437, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.5942809817556516, + "language_loss": 0.76252651, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78428996, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5072, + "time_per_iteration": 2.5147287845611572 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.03059769, + "balance_loss_mlp": 1.04904687, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.176920469303851, + "language_loss": 0.71318722, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73496878, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83203125, + "step": 5073, + "time_per_iteration": 2.4736156463623047 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_clip": 1.02974713, + "balance_loss_mlp": 1.04842663, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.146618897096623, + "language_loss": 0.7663309, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78814638, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5074, + "time_per_iteration": 2.4547433853149414 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.02015638, + "balance_loss_mlp": 1.04879379, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8636036931869358, + "language_loss": 0.73939347, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76106244, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5075, + "time_per_iteration": 2.4922661781311035 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.04769778, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.14961805392919, + "language_loss": 0.75488788, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77666509, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5076, + "time_per_iteration": 2.471381187438965 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02407038, + "balance_loss_mlp": 1.05137944, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.505999917432091, + "language_loss": 0.79183954, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81357688, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5077, + "time_per_iteration": 2.5000534057617188 + }, + { + "auxiliary_loss_clip": 0.01127394, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01739252, + "balance_loss_mlp": 1.0478642, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.9961733055656423, + "language_loss": 0.74662113, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76820433, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5078, + "time_per_iteration": 2.4746944904327393 + }, + { + "auxiliary_loss_clip": 0.01130678, + "auxiliary_loss_mlp": 0.01047379, + "balance_loss_clip": 1.03203678, + "balance_loss_mlp": 1.04787958, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.113994612729099, + "language_loss": 0.67216343, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69394398, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5079, + "time_per_iteration": 2.4575493335723877 + }, + { + "auxiliary_loss_clip": 0.01130366, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.02034521, + "balance_loss_mlp": 1.04758203, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.7438542216491464, + "language_loss": 0.80291754, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82457113, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5080, + "time_per_iteration": 2.490842342376709 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.02475858, + "balance_loss_mlp": 1.04612935, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.2926909410882903, + "language_loss": 0.80971938, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83141345, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5081, + "time_per_iteration": 2.5298712253570557 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.01952672, + "balance_loss_mlp": 1.04690182, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.691443128795128, + "language_loss": 0.71810889, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73975313, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5082, + "time_per_iteration": 2.5567750930786133 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.0468955, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9955003311475592, + "language_loss": 0.73615241, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75787055, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 5083, + "time_per_iteration": 2.5083980560302734 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.02184248, + "balance_loss_mlp": 1.04441404, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 3.7957379738132517, + "language_loss": 0.70895267, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73062611, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 5084, + "time_per_iteration": 2.477665424346924 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.01631355, + "balance_loss_mlp": 1.04818797, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.0055460894973933, + "language_loss": 0.78791595, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80958885, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5085, + "time_per_iteration": 2.475783586502075 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.02187788, + "balance_loss_mlp": 1.04529142, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.5510153728860234, + "language_loss": 0.77846372, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80010617, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5086, + "time_per_iteration": 2.514472007751465 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.04930758, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.7256556540888637, + "language_loss": 0.77121228, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79295856, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 5087, + "time_per_iteration": 2.4817616939544678 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.02346563, + "balance_loss_mlp": 1.04716742, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 2.0600622883478517, + "language_loss": 0.72582048, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74754953, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 5088, + "time_per_iteration": 2.538318395614624 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02300477, + "balance_loss_mlp": 1.04673004, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.417480227404851, + "language_loss": 0.7889666, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81070858, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 5089, + "time_per_iteration": 2.4561989307403564 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.02328289, + "balance_loss_mlp": 1.04813027, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 2.044405318996134, + "language_loss": 0.77061844, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79231811, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5090, + "time_per_iteration": 2.5215258598327637 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.02999353, + "balance_loss_mlp": 1.04693675, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.7474050348479595, + "language_loss": 0.76481628, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78662336, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5091, + "time_per_iteration": 2.535468578338623 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.01955616, + "balance_loss_mlp": 1.04671383, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.638842582319787, + "language_loss": 0.71933579, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.7410261, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 5092, + "time_per_iteration": 2.512096405029297 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.04765177, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.9362192703697652, + "language_loss": 0.8216877, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84335721, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5093, + "time_per_iteration": 2.464477300643921 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02239954, + "balance_loss_mlp": 1.04639721, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6957020618246583, + "language_loss": 0.75365555, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77531368, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5094, + "time_per_iteration": 2.5149855613708496 + }, + { + "auxiliary_loss_clip": 0.01128293, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02464378, + "balance_loss_mlp": 1.04530072, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.922814039194465, + "language_loss": 0.76033115, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78201067, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5095, + "time_per_iteration": 5.438723802566528 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02325845, + "balance_loss_mlp": 1.04581833, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7577098515851188, + "language_loss": 0.8050971, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82675582, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.81640625, + "step": 5096, + "time_per_iteration": 2.4706614017486572 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.02052069, + "balance_loss_mlp": 1.04556763, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.0672553061960586, + "language_loss": 0.8209089, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84256178, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5097, + "time_per_iteration": 2.457242250442505 + }, + { + "auxiliary_loss_clip": 0.0112984, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02227616, + "balance_loss_mlp": 1.04537082, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.9081721986815667, + "language_loss": 0.77858478, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80027401, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5098, + "time_per_iteration": 2.4709839820861816 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02057385, + "balance_loss_mlp": 1.0466584, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.9501450681008343, + "language_loss": 0.83948421, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86113107, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5099, + "time_per_iteration": 2.537771224975586 + }, + { + "auxiliary_loss_clip": 0.01130145, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 1.04364753, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 2.2273819247618376, + "language_loss": 0.85744429, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87916839, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5100, + "time_per_iteration": 2.5103259086608887 + }, + { + "auxiliary_loss_clip": 0.01129277, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.02259541, + "balance_loss_mlp": 1.04542243, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8863659276771934, + "language_loss": 0.79225194, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81392968, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5101, + "time_per_iteration": 2.4733920097351074 + }, + { + "auxiliary_loss_clip": 0.01131914, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.02869534, + "balance_loss_mlp": 1.04708326, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7393564952665503, + "language_loss": 0.79405224, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.81583011, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5102, + "time_per_iteration": 2.4608778953552246 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.02135825, + "balance_loss_mlp": 1.04940438, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7201607461659805, + "language_loss": 0.88999605, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.9117263, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.84765625, + "step": 5103, + "time_per_iteration": 2.5295228958129883 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.02549076, + "balance_loss_mlp": 1.04700959, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.6453097169103326, + "language_loss": 0.74079049, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76251674, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5104, + "time_per_iteration": 2.4923107624053955 + }, + { + "auxiliary_loss_clip": 0.01132054, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.02690291, + "balance_loss_mlp": 1.04555643, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8308515164246026, + "language_loss": 0.73333633, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75508481, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 5105, + "time_per_iteration": 2.542391777038574 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_clip": 1.03058875, + "balance_loss_mlp": 1.04582942, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 5.5167708582846515, + "language_loss": 0.8714695, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89325809, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 5106, + "time_per_iteration": 2.5054032802581787 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04750919, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.142568748510771, + "language_loss": 0.71183497, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73367596, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 5107, + "time_per_iteration": 2.4980053901672363 + }, + { + "auxiliary_loss_clip": 0.01125715, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.03372955, + "balance_loss_mlp": 1.04304433, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 1.7923615416213727, + "language_loss": 0.72302651, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74478543, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 5108, + "time_per_iteration": 2.4588091373443604 + }, + { + "auxiliary_loss_clip": 0.01129796, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.02435362, + "balance_loss_mlp": 1.04538584, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5361542639570684, + "language_loss": 0.85768104, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87937832, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5109, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.02176476, + "balance_loss_mlp": 1.04534364, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.6710196569280569, + "language_loss": 0.67220587, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69386709, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5110, + "time_per_iteration": 2.5019631385803223 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.0220511, + "balance_loss_mlp": 1.04472136, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5071731281437177, + "language_loss": 0.76981276, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79144323, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5111, + "time_per_iteration": 2.544111490249634 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.02486551, + "balance_loss_mlp": 1.04580235, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.9077726149637915, + "language_loss": 0.67174292, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69344485, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5112, + "time_per_iteration": 2.5171637535095215 + }, + { + "auxiliary_loss_clip": 0.01136791, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.04846382, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.8925702151041777, + "language_loss": 0.798181, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81996036, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 5113, + "time_per_iteration": 2.55889892578125 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.04549623, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.951625458848465, + "language_loss": 0.77243912, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79416221, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5114, + "time_per_iteration": 2.4328107833862305 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.02079093, + "balance_loss_mlp": 1.04755759, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.8985095809631356, + "language_loss": 0.62356925, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64527011, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5115, + "time_per_iteration": 2.480536699295044 + }, + { + "auxiliary_loss_clip": 0.01132859, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.02033865, + "balance_loss_mlp": 1.04663444, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 3.0190652682973176, + "language_loss": 0.82743216, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84912288, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5116, + "time_per_iteration": 2.5121662616729736 + }, + { + "auxiliary_loss_clip": 0.01131907, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.02322841, + "balance_loss_mlp": 1.04825926, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.8681947014951163, + "language_loss": 0.75772393, + "learning_rate": 3.244367924446952e-06, + "loss": 0.77942991, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5117, + "time_per_iteration": 2.48750376701355 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.05018401, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.225887232792708, + "language_loss": 0.71873093, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74044484, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5118, + "time_per_iteration": 2.4745492935180664 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.02442479, + "balance_loss_mlp": 1.04630661, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.5789952404099556, + "language_loss": 0.74312431, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76483381, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5119, + "time_per_iteration": 2.5185489654541016 + }, + { + "auxiliary_loss_clip": 0.01136122, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.04891181, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.733023320063412, + "language_loss": 0.80267692, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82455289, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 5120, + "time_per_iteration": 2.5592849254608154 + }, + { + "auxiliary_loss_clip": 0.01127219, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.03299093, + "balance_loss_mlp": 1.04384947, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.564134517039273, + "language_loss": 0.80110037, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82285464, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5121, + "time_per_iteration": 2.440516948699951 + }, + { + "auxiliary_loss_clip": 0.0113076, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01981497, + "balance_loss_mlp": 1.0480212, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5001896125792977, + "language_loss": 0.82594395, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84760171, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5122, + "time_per_iteration": 2.510576009750366 + }, + { + "auxiliary_loss_clip": 0.01050329, + "auxiliary_loss_mlp": 0.01017411, + "balance_loss_clip": 1.01562333, + "balance_loss_mlp": 1.01982307, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7473381596642288, + "language_loss": 0.58639288, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60707027, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.3046875, + "step": 5123, + "time_per_iteration": 3.2167654037475586 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.01917315, + "balance_loss_mlp": 1.04640436, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5767520801619384, + "language_loss": 0.83622873, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85793942, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.87890625, + "step": 5124, + "time_per_iteration": 2.474625587463379 + }, + { + "auxiliary_loss_clip": 0.01135515, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.04945302, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8429802725909379, + "language_loss": 0.78703862, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.80879092, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.859375, + "step": 5125, + "time_per_iteration": 2.5806493759155273 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.02050948, + "balance_loss_mlp": 1.04717779, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8928574451074776, + "language_loss": 0.6450479, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66676342, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5126, + "time_per_iteration": 2.467099666595459 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.02367234, + "balance_loss_mlp": 1.04831636, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5538294270453243, + "language_loss": 0.86619091, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88788408, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.81640625, + "step": 5127, + "time_per_iteration": 2.543095111846924 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02439952, + "balance_loss_mlp": 1.04648781, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 2.186420023793508, + "language_loss": 0.68816996, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70987189, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 5128, + "time_per_iteration": 2.525390863418579 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.02571476, + "balance_loss_mlp": 1.04763198, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.801256837086347, + "language_loss": 0.71226776, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7340306, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5129, + "time_per_iteration": 2.5417068004608154 + }, + { + "auxiliary_loss_clip": 0.01045915, + "auxiliary_loss_mlp": 0.01008464, + "balance_loss_clip": 1.00633001, + "balance_loss_mlp": 1.01580441, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.9000157132793972, + "language_loss": 0.59171313, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61225688, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.30078125, + "step": 5130, + "time_per_iteration": 3.024799108505249 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02233696, + "balance_loss_mlp": 1.0485276, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.1422150520884773, + "language_loss": 0.72951442, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75124997, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 5131, + "time_per_iteration": 2.5145480632781982 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.02222049, + "balance_loss_mlp": 1.04737425, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.759562546324366, + "language_loss": 0.71208251, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73375452, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5132, + "time_per_iteration": 2.4997506141662598 + }, + { + "auxiliary_loss_clip": 0.01128489, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.02580929, + "balance_loss_mlp": 1.04823279, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7072095629792627, + "language_loss": 0.8999784, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92166698, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5133, + "time_per_iteration": 2.4972143173217773 + }, + { + "auxiliary_loss_clip": 0.01136466, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.03186607, + "balance_loss_mlp": 1.04911399, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.8506383958840185, + "language_loss": 0.67226613, + "learning_rate": 3.239177844626102e-06, + "loss": 0.6941101, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5134, + "time_per_iteration": 2.5700669288635254 + }, + { + "auxiliary_loss_clip": 0.0113384, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.0317775, + "balance_loss_mlp": 1.04718161, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.423009332179396, + "language_loss": 0.82865155, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85046244, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5135, + "time_per_iteration": 2.4712367057800293 + }, + { + "auxiliary_loss_clip": 0.0104583, + "auxiliary_loss_mlp": 0.01004049, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.015975, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7120747448350507, + "language_loss": 0.55243868, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57293749, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.29882812, + "step": 5136, + "time_per_iteration": 3.1432137489318848 + }, + { + "auxiliary_loss_clip": 0.01132561, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04724097, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.9824711220984585, + "language_loss": 0.76057774, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78230941, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5137, + "time_per_iteration": 5.764686822891235 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.02310133, + "balance_loss_mlp": 1.04696631, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 2.0179579208290264, + "language_loss": 0.79909992, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8207891, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.84765625, + "step": 5138, + "time_per_iteration": 2.45621657371521 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.02666378, + "balance_loss_mlp": 1.04560494, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4272945699581137, + "language_loss": 0.81220984, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83396351, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 5139, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.01137198, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.1565991279061736, + "language_loss": 0.77528149, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79707557, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 5140, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01044008, + "balance_loss_clip": 1.02920234, + "balance_loss_mlp": 1.04757929, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.2023621297160156, + "language_loss": 0.78595555, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80768663, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5141, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01047349, + "balance_loss_clip": 1.03046894, + "balance_loss_mlp": 1.04716825, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.127714885761315, + "language_loss": 0.87142885, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89324611, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 5142, + "time_per_iteration": 2.4362974166870117 + }, + { + "auxiliary_loss_clip": 0.01131531, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.03220749, + "balance_loss_mlp": 1.04556274, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7972015737501748, + "language_loss": 0.7877624, + "learning_rate": 3.23642465389567e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 5143, + "time_per_iteration": 2.459317445755005 + }, + { + "auxiliary_loss_clip": 0.01130331, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.02742219, + "balance_loss_mlp": 1.04593444, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.9461458902951219, + "language_loss": 0.72098875, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74272639, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5144, + "time_per_iteration": 2.4872243404388428 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.02418947, + "balance_loss_mlp": 1.04587483, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.7305751805857612, + "language_loss": 0.74054307, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76227629, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5145, + "time_per_iteration": 2.524683952331543 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.02951622, + "balance_loss_mlp": 1.04737079, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6607552662218326, + "language_loss": 0.76461762, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78640091, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 5146, + "time_per_iteration": 2.4848198890686035 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.02213407, + "balance_loss_mlp": 1.04672074, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.385312171088194, + "language_loss": 0.66755533, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68922937, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5147, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.01135751, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02517319, + "balance_loss_mlp": 1.04931486, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0402709532397205, + "language_loss": 0.75148058, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77323824, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5148, + "time_per_iteration": 2.505180597305298 + }, + { + "auxiliary_loss_clip": 0.01139245, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02815318, + "balance_loss_mlp": 1.04876494, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.1288750992632677, + "language_loss": 0.72576058, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74759942, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 5149, + "time_per_iteration": 2.4605252742767334 + }, + { + "auxiliary_loss_clip": 0.01133233, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.02442312, + "balance_loss_mlp": 1.0457058, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.112154456836484, + "language_loss": 0.84981489, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87155974, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.875, + "step": 5150, + "time_per_iteration": 2.4866578578948975 + }, + { + "auxiliary_loss_clip": 0.01131574, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.02515531, + "balance_loss_mlp": 1.04593086, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.9529089609254688, + "language_loss": 0.79053164, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81226349, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5151, + "time_per_iteration": 2.4936540126800537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.02580595, + "balance_loss_mlp": 1.0471015, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 3.1311630498810774, + "language_loss": 0.67020154, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69196552, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5152, + "time_per_iteration": 2.429640054702759 + }, + { + "auxiliary_loss_clip": 0.01132623, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.0275166, + "balance_loss_mlp": 1.04688787, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 12.57465651148819, + "language_loss": 0.82058132, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84234464, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5153, + "time_per_iteration": 2.578856945037842 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02788973, + "balance_loss_mlp": 1.04822588, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.7956706783057126, + "language_loss": 0.73902357, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76079118, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5154, + "time_per_iteration": 2.5063655376434326 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02287924, + "balance_loss_mlp": 1.04747653, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.516871287947693, + "language_loss": 0.76051688, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78224009, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5155, + "time_per_iteration": 2.4838123321533203 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.04871869, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7492301646526522, + "language_loss": 0.7883296, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81011862, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 5156, + "time_per_iteration": 2.4420597553253174 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02894902, + "balance_loss_mlp": 1.04688191, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 3.007667649484548, + "language_loss": 0.75094402, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5157, + "time_per_iteration": 2.4922094345092773 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.04701662, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.583276716554569, + "language_loss": 0.69391131, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71560085, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5158, + "time_per_iteration": 2.5119874477386475 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.0260725, + "balance_loss_mlp": 1.04802489, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.8674515495135584, + "language_loss": 0.84731698, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86904848, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5159, + "time_per_iteration": 2.5553479194641113 + }, + { + "auxiliary_loss_clip": 0.01130577, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.0215224, + "balance_loss_mlp": 1.04617286, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.6286624468626467, + "language_loss": 0.85222661, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87390554, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5160, + "time_per_iteration": 2.4521608352661133 + }, + { + "auxiliary_loss_clip": 0.01131067, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.02385354, + "balance_loss_mlp": 1.04720986, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.1323719792042404, + "language_loss": 0.76438844, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78609127, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5161, + "time_per_iteration": 2.4705073833465576 + }, + { + "auxiliary_loss_clip": 0.01133183, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.04661226, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9681741891595628, + "language_loss": 0.81644946, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83818257, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5162, + "time_per_iteration": 2.4359090328216553 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.0231998, + "balance_loss_mlp": 1.04580498, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.6668116654420786, + "language_loss": 0.82879269, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85046029, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8359375, + "step": 5163, + "time_per_iteration": 2.536198854446411 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.02670264, + "balance_loss_mlp": 1.04848182, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.61479678935284, + "language_loss": 0.76103258, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78280413, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5164, + "time_per_iteration": 2.4736320972442627 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02492666, + "balance_loss_mlp": 1.04932189, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.73414256762253, + "language_loss": 0.74515426, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76691169, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 5165, + "time_per_iteration": 2.4788122177124023 + }, + { + "auxiliary_loss_clip": 0.01132367, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02627063, + "balance_loss_mlp": 1.0472759, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 2.461614607097325, + "language_loss": 0.75987816, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78162187, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5166, + "time_per_iteration": 2.4461371898651123 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.02939892, + "balance_loss_mlp": 1.04844868, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.4324780660218557, + "language_loss": 0.73424876, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75604147, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 5167, + "time_per_iteration": 2.4301631450653076 + }, + { + "auxiliary_loss_clip": 0.01047334, + "auxiliary_loss_mlp": 0.01006703, + "balance_loss_clip": 1.00467682, + "balance_loss_mlp": 1.01844001, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.725291341239906, + "language_loss": 0.53031516, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55085552, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.2890625, + "step": 5168, + "time_per_iteration": 3.1146020889282227 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.0269258, + "balance_loss_mlp": 1.0465318, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.782356602828545, + "language_loss": 0.78745592, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80922985, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5169, + "time_per_iteration": 2.4755852222442627 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02337217, + "balance_loss_mlp": 1.04640126, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.536235209485244, + "language_loss": 0.6414057, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66312397, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5170, + "time_per_iteration": 2.5690839290618896 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.02635252, + "balance_loss_mlp": 1.04721069, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.41080559035864, + "language_loss": 0.77698815, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79874456, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 5171, + "time_per_iteration": 2.558258295059204 + }, + { + "auxiliary_loss_clip": 0.01132946, + "auxiliary_loss_mlp": 0.01053954, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.04645526, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.9319520361735263, + "language_loss": 0.83802366, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85989261, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5172, + "time_per_iteration": 2.4601597785949707 + }, + { + "auxiliary_loss_clip": 0.01133186, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.02948654, + "balance_loss_mlp": 1.0467186, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.9586589765002733, + "language_loss": 0.84225619, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86404574, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 5173, + "time_per_iteration": 2.501840591430664 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02038455, + "balance_loss_mlp": 1.04595959, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.622637298809784, + "language_loss": 0.83323705, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85486829, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5174, + "time_per_iteration": 2.507127285003662 + }, + { + "auxiliary_loss_clip": 0.01131648, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.03155434, + "balance_loss_mlp": 1.04670012, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.3340025504670003, + "language_loss": 0.84681082, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.86859798, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5175, + "time_per_iteration": 2.4853246212005615 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_clip": 1.03029919, + "balance_loss_mlp": 1.04996502, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.6466695594130172, + "language_loss": 0.83448446, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85629338, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8359375, + "step": 5176, + "time_per_iteration": 2.4759509563446045 + }, + { + "auxiliary_loss_clip": 0.01128858, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.04442942, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.7899579393784935, + "language_loss": 0.80820966, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8299427, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5177, + "time_per_iteration": 2.5106611251831055 + }, + { + "auxiliary_loss_clip": 0.0113295, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.048877, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.9871899212943351, + "language_loss": 0.80703342, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82878101, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5178, + "time_per_iteration": 4.0482330322265625 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03310347, + "balance_loss_mlp": 1.04518402, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.8347450184704097, + "language_loss": 0.81340981, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83520925, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5179, + "time_per_iteration": 3.82991886138916 + }, + { + "auxiliary_loss_clip": 0.01132507, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0338006, + "balance_loss_mlp": 1.04824936, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.599561013411363, + "language_loss": 0.78199375, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.8038168, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5180, + "time_per_iteration": 2.4656291007995605 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.01915836, + "balance_loss_mlp": 1.04672408, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.6380256774064115, + "language_loss": 0.83046079, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85212088, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5181, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.01128181, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.02903986, + "balance_loss_mlp": 1.0464232, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.700886032828765, + "language_loss": 0.74084079, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76255929, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5182, + "time_per_iteration": 2.5913209915161133 + }, + { + "auxiliary_loss_clip": 0.01136348, + "auxiliary_loss_mlp": 0.01050649, + "balance_loss_clip": 1.03479409, + "balance_loss_mlp": 1.04858768, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8010906920491343, + "language_loss": 0.70658493, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72845489, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 5183, + "time_per_iteration": 2.4991438388824463 + }, + { + "auxiliary_loss_clip": 0.01045533, + "auxiliary_loss_mlp": 0.01014757, + "balance_loss_clip": 1.01301634, + "balance_loss_mlp": 1.01690507, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9414003998762589, + "language_loss": 0.59602594, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61662877, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.28515625, + "step": 5184, + "time_per_iteration": 3.0754520893096924 + }, + { + "auxiliary_loss_clip": 0.01130364, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.03264058, + "balance_loss_mlp": 1.04596519, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.811836993883612, + "language_loss": 0.69750082, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71927822, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5185, + "time_per_iteration": 2.435033082962036 + }, + { + "auxiliary_loss_clip": 0.01136749, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.0323875, + "balance_loss_mlp": 1.05073345, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.346024133586612, + "language_loss": 0.63920057, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66104954, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5186, + "time_per_iteration": 2.463900327682495 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.01049347, + "balance_loss_clip": 1.03219295, + "balance_loss_mlp": 1.04886758, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.108066194391345, + "language_loss": 0.86249322, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88435853, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5187, + "time_per_iteration": 2.4854979515075684 + }, + { + "auxiliary_loss_clip": 0.01129847, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.03048384, + "balance_loss_mlp": 1.0451926, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.7445298378798078, + "language_loss": 0.62983185, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.6515975, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5188, + "time_per_iteration": 2.6161019802093506 + }, + { + "auxiliary_loss_clip": 0.01135744, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_clip": 1.02961564, + "balance_loss_mlp": 1.05116081, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1633857437120256, + "language_loss": 0.8347863, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85659939, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5189, + "time_per_iteration": 2.4360432624816895 + }, + { + "auxiliary_loss_clip": 0.01129905, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.0213753, + "balance_loss_mlp": 1.04657507, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.6712014044776404, + "language_loss": 0.7916308, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81329739, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83203125, + "step": 5190, + "time_per_iteration": 2.472668170928955 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01986194, + "balance_loss_mlp": 1.04946673, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.4545499288259176, + "language_loss": 0.75318813, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77487987, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5191, + "time_per_iteration": 2.486673355102539 + }, + { + "auxiliary_loss_clip": 0.01049091, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.00025892, + "balance_loss_mlp": 1.02067924, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8451593954944295, + "language_loss": 0.63957787, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66009092, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.28515625, + "step": 5192, + "time_per_iteration": 3.1464638710021973 + }, + { + "auxiliary_loss_clip": 0.01134311, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04795599, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.6164756923867671, + "language_loss": 0.80154347, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82329667, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.86328125, + "step": 5193, + "time_per_iteration": 2.5156989097595215 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.02610445, + "balance_loss_mlp": 1.045856, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8140889441731107, + "language_loss": 0.72050476, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74224722, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.85546875, + "step": 5194, + "time_per_iteration": 2.519972801208496 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02471924, + "balance_loss_mlp": 1.04870749, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.3544515008303952, + "language_loss": 0.76475823, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78648859, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5195, + "time_per_iteration": 2.512247323989868 + }, + { + "auxiliary_loss_clip": 0.01131656, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.02718091, + "balance_loss_mlp": 1.0449183, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.3526234536893298, + "language_loss": 0.7817502, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80349314, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5196, + "time_per_iteration": 2.528002977371216 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99860841, + "balance_loss_mlp": 1.01643729, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7752479618797538, + "language_loss": 0.54834789, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56879622, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.27929688, + "step": 5197, + "time_per_iteration": 3.0728254318237305 + }, + { + "auxiliary_loss_clip": 0.01130689, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.02328372, + "balance_loss_mlp": 1.0477525, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6543672060788046, + "language_loss": 0.66300559, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68469381, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5198, + "time_per_iteration": 2.4312028884887695 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.02156413, + "balance_loss_mlp": 1.0472604, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.083859755504136, + "language_loss": 0.69763082, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71935886, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5199, + "time_per_iteration": 2.454464912414551 + }, + { + "auxiliary_loss_clip": 0.01131797, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.0336132, + "balance_loss_mlp": 1.04692471, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8982997112015956, + "language_loss": 0.79004937, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81186306, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 5200, + "time_per_iteration": 2.4382827281951904 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.02187347, + "balance_loss_mlp": 1.04621911, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.042457973745699, + "language_loss": 0.83946276, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86110914, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5201, + "time_per_iteration": 2.475511074066162 + }, + { + "auxiliary_loss_clip": 0.01134303, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.02990484, + "balance_loss_mlp": 1.04985881, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.37604325800411, + "language_loss": 0.69560832, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71741533, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84375, + "step": 5202, + "time_per_iteration": 2.4265501499176025 + }, + { + "auxiliary_loss_clip": 0.01133329, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.02737963, + "balance_loss_mlp": 1.04759419, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.800546738819683, + "language_loss": 0.84001613, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86176282, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.85546875, + "step": 5203, + "time_per_iteration": 2.480233907699585 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.03176749, + "balance_loss_mlp": 1.04697657, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.9129021624211417, + "language_loss": 0.60623944, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62803102, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5204, + "time_per_iteration": 2.50688099861145 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.02672338, + "balance_loss_mlp": 1.04707503, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.6006708998064776, + "language_loss": 0.65964866, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68135834, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5205, + "time_per_iteration": 2.4824163913726807 + }, + { + "auxiliary_loss_clip": 0.01132532, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.0476222, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.9498647702732133, + "language_loss": 0.76618874, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78794622, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84765625, + "step": 5206, + "time_per_iteration": 2.4947307109832764 + }, + { + "auxiliary_loss_clip": 0.0112786, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.02416039, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 3.088705810465425, + "language_loss": 0.83287984, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85455215, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5207, + "time_per_iteration": 2.4767825603485107 + }, + { + "auxiliary_loss_clip": 0.01128039, + "auxiliary_loss_mlp": 0.01041894, + "balance_loss_clip": 1.02784562, + "balance_loss_mlp": 1.04694057, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5219202808663073, + "language_loss": 0.71293664, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73463601, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5208, + "time_per_iteration": 2.4853296279907227 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.02534437, + "balance_loss_mlp": 1.04957032, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.8332946649412374, + "language_loss": 0.74547577, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76721835, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5209, + "time_per_iteration": 2.5162742137908936 + }, + { + "auxiliary_loss_clip": 0.0113008, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02695489, + "balance_loss_mlp": 1.04557538, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.818845882779476, + "language_loss": 0.77656835, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79827774, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84375, + "step": 5210, + "time_per_iteration": 2.4701180458068848 + }, + { + "auxiliary_loss_clip": 0.01125909, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02443743, + "balance_loss_mlp": 1.04593706, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8627745841798442, + "language_loss": 0.79177994, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81343371, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 5211, + "time_per_iteration": 2.482102870941162 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02448201, + "balance_loss_mlp": 1.04849112, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.64859412039223, + "language_loss": 0.79837513, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82005984, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5212, + "time_per_iteration": 2.460986852645874 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.03395939, + "balance_loss_mlp": 1.04740417, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.096287390218497, + "language_loss": 0.71467483, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73650539, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5213, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.01135204, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02660799, + "balance_loss_mlp": 1.05014026, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 5.183832853627301, + "language_loss": 0.77595121, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79771841, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5214, + "time_per_iteration": 2.453228712081909 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.04599309, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6576138068605464, + "language_loss": 0.82562625, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84724051, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5215, + "time_per_iteration": 2.544684886932373 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.02242613, + "balance_loss_mlp": 1.04732776, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.510877303679677, + "language_loss": 0.79557931, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81727695, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5216, + "time_per_iteration": 2.4559943675994873 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.03042984, + "balance_loss_mlp": 1.04632115, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 2.0079960226100293, + "language_loss": 0.68489361, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70668793, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.859375, + "step": 5217, + "time_per_iteration": 2.524624824523926 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02095652, + "balance_loss_mlp": 1.04952598, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.8597778329644077, + "language_loss": 0.80357039, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82527065, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5218, + "time_per_iteration": 2.437819480895996 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.025931, + "balance_loss_mlp": 1.04692423, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.311414379590861, + "language_loss": 0.68608415, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70780772, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5219, + "time_per_iteration": 2.4811697006225586 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.02585125, + "balance_loss_mlp": 1.05002093, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 1.886141735907444, + "language_loss": 0.7973401, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81906897, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.828125, + "step": 5220, + "time_per_iteration": 5.5014426708221436 + }, + { + "auxiliary_loss_clip": 0.01129795, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02036917, + "balance_loss_mlp": 1.0470016, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.696615671785811, + "language_loss": 0.72865409, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75029969, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5221, + "time_per_iteration": 2.4286248683929443 + }, + { + "auxiliary_loss_clip": 0.01129062, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.02370405, + "balance_loss_mlp": 1.0478735, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.5798649053475948, + "language_loss": 0.8195132, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84118003, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8125, + "step": 5222, + "time_per_iteration": 2.453622817993164 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.02744806, + "balance_loss_mlp": 1.04833627, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.948806511089887, + "language_loss": 0.70150459, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.723288, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5223, + "time_per_iteration": 2.442513942718506 + }, + { + "auxiliary_loss_clip": 0.01130042, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.01980042, + "balance_loss_mlp": 1.04643512, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6111281957709347, + "language_loss": 0.80361176, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82525527, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5224, + "time_per_iteration": 2.5533599853515625 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02615976, + "balance_loss_mlp": 1.05134106, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 1.9819108050216143, + "language_loss": 0.58416283, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60598099, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 5225, + "time_per_iteration": 2.493633508682251 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.01826406, + "balance_loss_mlp": 1.04575014, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.9016989590060558, + "language_loss": 0.81870753, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84028322, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5226, + "time_per_iteration": 2.455474376678467 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.0245285, + "balance_loss_mlp": 1.04804921, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 3.2929472014065864, + "language_loss": 0.73947561, + "learning_rate": 3.210546210126141e-06, + "loss": 0.7611953, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5227, + "time_per_iteration": 2.4582889080047607 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02783334, + "balance_loss_mlp": 1.04827404, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.9061545786481, + "language_loss": 0.67636049, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69811898, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5228, + "time_per_iteration": 2.572122573852539 + }, + { + "auxiliary_loss_clip": 0.01130676, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.02567399, + "balance_loss_mlp": 1.04645872, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.857425256773369, + "language_loss": 0.79938543, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82109284, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5229, + "time_per_iteration": 2.4785192012786865 + }, + { + "auxiliary_loss_clip": 0.01129346, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.02474797, + "balance_loss_mlp": 1.04716849, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8246409730399047, + "language_loss": 0.70264775, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72434002, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5230, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.02805161, + "balance_loss_mlp": 1.04486191, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.680902640440715, + "language_loss": 0.79707456, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81880474, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5231, + "time_per_iteration": 2.535352945327759 + }, + { + "auxiliary_loss_clip": 0.01129002, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.02675736, + "balance_loss_mlp": 1.04756021, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0146998384070254, + "language_loss": 0.8507638, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87248111, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5232, + "time_per_iteration": 2.5626280307769775 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01049783, + "balance_loss_clip": 1.03439283, + "balance_loss_mlp": 1.0461762, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5681064196444345, + "language_loss": 0.7984041, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82017469, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5233, + "time_per_iteration": 2.4478254318237305 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.0233047, + "balance_loss_mlp": 1.04861724, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.628646597563271, + "language_loss": 0.70788991, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72960073, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5234, + "time_per_iteration": 2.775871992111206 + }, + { + "auxiliary_loss_clip": 0.01131513, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.0205102, + "balance_loss_mlp": 1.04739237, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8519873535555593, + "language_loss": 0.72068667, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74236101, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5235, + "time_per_iteration": 2.515869617462158 + }, + { + "auxiliary_loss_clip": 0.01126993, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.02204823, + "balance_loss_mlp": 1.04428434, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.06424580772138, + "language_loss": 0.7832365, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80487001, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5236, + "time_per_iteration": 2.5591800212860107 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02609372, + "balance_loss_mlp": 1.04730821, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.44778330648976, + "language_loss": 0.75856584, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78033078, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 5237, + "time_per_iteration": 2.5414791107177734 + }, + { + "auxiliary_loss_clip": 0.01125329, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.02424169, + "balance_loss_mlp": 1.04500508, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.1889759499940813, + "language_loss": 0.79916662, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82079864, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8046875, + "step": 5238, + "time_per_iteration": 2.484102725982666 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.0100711, + "balance_loss_clip": 1.0053103, + "balance_loss_mlp": 1.01739836, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8333107882681854, + "language_loss": 0.67920464, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69972724, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.27734375, + "step": 5239, + "time_per_iteration": 3.0362496376037598 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.02197254, + "balance_loss_mlp": 1.04535258, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.0536997136778847, + "language_loss": 0.82329869, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84499264, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5240, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01039014, + "balance_loss_clip": 1.02451253, + "balance_loss_mlp": 1.04874361, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.2630790499207962, + "language_loss": 0.80981195, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83150375, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5241, + "time_per_iteration": 2.5001909732818604 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04834199, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.5804052674973608, + "language_loss": 0.74575627, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76740676, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5242, + "time_per_iteration": 2.530768871307373 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0189085, + "balance_loss_mlp": 1.04601228, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9335835713568477, + "language_loss": 0.74171245, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.7633546, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 5243, + "time_per_iteration": 2.495138168334961 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.02221215, + "balance_loss_mlp": 1.04677868, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 3.400707627247709, + "language_loss": 0.64608908, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66775823, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83203125, + "step": 5244, + "time_per_iteration": 2.4930343627929688 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.02088022, + "balance_loss_mlp": 1.04716229, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.1590647535644965, + "language_loss": 0.91464043, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93632007, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5245, + "time_per_iteration": 2.4007837772369385 + }, + { + "auxiliary_loss_clip": 0.0113079, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.02636433, + "balance_loss_mlp": 1.04643655, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 9.888646015204756, + "language_loss": 0.75272042, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77444315, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5246, + "time_per_iteration": 2.4886202812194824 + }, + { + "auxiliary_loss_clip": 0.01131208, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.02524352, + "balance_loss_mlp": 1.04602718, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.4670109155165818, + "language_loss": 0.6160199, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63773286, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5247, + "time_per_iteration": 2.567185640335083 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.02283072, + "balance_loss_mlp": 1.04756081, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.2084660310503526, + "language_loss": 0.82410538, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84581077, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5248, + "time_per_iteration": 2.52426815032959 + }, + { + "auxiliary_loss_clip": 0.01129578, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.04662156, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8083364563285407, + "language_loss": 0.85017586, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87197179, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5249, + "time_per_iteration": 2.4549005031585693 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.02245772, + "balance_loss_mlp": 1.04802227, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.8090626711780673, + "language_loss": 0.85569501, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87739837, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5250, + "time_per_iteration": 2.502629041671753 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.04532385, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 4.215523946509053, + "language_loss": 0.68559456, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70730722, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5251, + "time_per_iteration": 2.4467368125915527 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02487266, + "balance_loss_mlp": 1.04848695, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7890606859490685, + "language_loss": 0.78783, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80953479, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5252, + "time_per_iteration": 2.5056369304656982 + }, + { + "auxiliary_loss_clip": 0.01129131, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.02635264, + "balance_loss_mlp": 1.04820085, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.7467438086499925, + "language_loss": 0.74374568, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76544189, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5253, + "time_per_iteration": 2.485865592956543 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.04530692, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.6622002067810395, + "language_loss": 0.73305148, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75473285, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5254, + "time_per_iteration": 2.5044641494750977 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.02293146, + "balance_loss_mlp": 1.04714012, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9319514966089122, + "language_loss": 0.78156364, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80326211, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5255, + "time_per_iteration": 2.4380881786346436 + }, + { + "auxiliary_loss_clip": 0.01130732, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02565229, + "balance_loss_mlp": 1.04770398, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.551434599641695, + "language_loss": 0.78019011, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80192077, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.828125, + "step": 5256, + "time_per_iteration": 2.517211437225342 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.02159786, + "balance_loss_mlp": 1.04710865, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.6136648036258991, + "language_loss": 0.71117795, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73278391, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 5257, + "time_per_iteration": 2.4690449237823486 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.04662931, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.9672329013590102, + "language_loss": 0.77098101, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79265225, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5258, + "time_per_iteration": 2.4586384296417236 + }, + { + "auxiliary_loss_clip": 0.01130533, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.02291536, + "balance_loss_mlp": 1.04706669, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 4.102208009404704, + "language_loss": 0.72829109, + "learning_rate": 3.200602180731467e-06, + "loss": 0.7499727, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5259, + "time_per_iteration": 2.463867425918579 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.03382003, + "balance_loss_mlp": 1.04840684, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.940451679167918, + "language_loss": 0.66212165, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68394214, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.85546875, + "step": 5260, + "time_per_iteration": 2.498173475265503 + }, + { + "auxiliary_loss_clip": 0.01125905, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.01806808, + "balance_loss_mlp": 1.04255199, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.9564366458132632, + "language_loss": 0.72557104, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74715853, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5261, + "time_per_iteration": 4.0577170848846436 + }, + { + "auxiliary_loss_clip": 0.01040968, + "auxiliary_loss_mlp": 0.01005761, + "balance_loss_clip": 1.00365114, + "balance_loss_mlp": 1.01333809, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7495327099187281, + "language_loss": 0.50639355, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52686083, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5262, + "time_per_iteration": 5.9139063358306885 + }, + { + "auxiliary_loss_clip": 0.01133191, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.02408338, + "balance_loss_mlp": 1.04845881, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4936033884005069, + "language_loss": 0.85241222, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87412858, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.84765625, + "step": 5263, + "time_per_iteration": 2.4966084957122803 + }, + { + "auxiliary_loss_clip": 0.01127359, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.03201818, + "balance_loss_mlp": 1.04657304, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4671140059184749, + "language_loss": 0.81675243, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83848464, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5264, + "time_per_iteration": 2.5126495361328125 + }, + { + "auxiliary_loss_clip": 0.01133844, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.02107441, + "balance_loss_mlp": 1.0484283, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.6829803459821215, + "language_loss": 0.79974926, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82145512, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5265, + "time_per_iteration": 2.444263219833374 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.02361572, + "balance_loss_mlp": 1.04815876, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5672890574859826, + "language_loss": 0.74875605, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77048463, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5266, + "time_per_iteration": 2.5323407649993896 + }, + { + "auxiliary_loss_clip": 0.01131974, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02851653, + "balance_loss_mlp": 1.04640543, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.021043754719528, + "language_loss": 0.78872609, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81047654, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 5267, + "time_per_iteration": 2.4591164588928223 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01004279, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.01493907, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7322532755123746, + "language_loss": 0.57800645, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59847558, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5268, + "time_per_iteration": 3.061121702194214 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.02291262, + "balance_loss_mlp": 1.04683709, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 1.8728828385616285, + "language_loss": 0.72881675, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75051844, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5269, + "time_per_iteration": 2.4871747493743896 + }, + { + "auxiliary_loss_clip": 0.0113037, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04689598, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.0592855460289394, + "language_loss": 0.79914796, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82084477, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5270, + "time_per_iteration": 2.502607822418213 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.02582264, + "balance_loss_mlp": 1.04792333, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 1.9728362515560998, + "language_loss": 0.79207718, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.8138411, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5271, + "time_per_iteration": 2.4412505626678467 + }, + { + "auxiliary_loss_clip": 0.0113132, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02440262, + "balance_loss_mlp": 1.04685235, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.769221166791082, + "language_loss": 0.73264146, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75436121, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5272, + "time_per_iteration": 2.4992945194244385 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.02262676, + "balance_loss_mlp": 1.04613161, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 1.9537759660060814, + "language_loss": 0.69159341, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71332633, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 5273, + "time_per_iteration": 2.6510114669799805 + }, + { + "auxiliary_loss_clip": 0.01128979, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02110016, + "balance_loss_mlp": 1.04609132, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.4826309074588198, + "language_loss": 0.67691469, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69856858, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5274, + "time_per_iteration": 2.5467329025268555 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02092862, + "balance_loss_mlp": 1.04432762, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5251182195487059, + "language_loss": 0.80846918, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83006656, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5275, + "time_per_iteration": 2.511544704437256 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.0286448, + "balance_loss_mlp": 1.04539275, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.952892513614063, + "language_loss": 0.72608984, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.7478506, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5276, + "time_per_iteration": 2.5273983478546143 + }, + { + "auxiliary_loss_clip": 0.01124489, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 1.04455817, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.3590988237701342, + "language_loss": 0.77843654, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80003512, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5277, + "time_per_iteration": 2.51247501373291 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.04444003, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.8256288285105424, + "language_loss": 0.78756094, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80919981, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5278, + "time_per_iteration": 2.5376405715942383 + }, + { + "auxiliary_loss_clip": 0.01037546, + "auxiliary_loss_mlp": 0.01002993, + "balance_loss_clip": 1.0011332, + "balance_loss_mlp": 1.00972891, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8755672893463982, + "language_loss": 0.62821174, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64861709, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.27734375, + "step": 5279, + "time_per_iteration": 2.823489189147949 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.0242753, + "balance_loss_mlp": 1.04568505, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.6672726712999033, + "language_loss": 0.8099947, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83173573, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 5280, + "time_per_iteration": 2.490154981613159 + }, + { + "auxiliary_loss_clip": 0.01130309, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.02963543, + "balance_loss_mlp": 1.04713202, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.444928497123541, + "language_loss": 0.77968711, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5281, + "time_per_iteration": 2.590106248855591 + }, + { + "auxiliary_loss_clip": 0.01129621, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.02001119, + "balance_loss_mlp": 1.0464325, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6441690082428626, + "language_loss": 0.78319824, + "learning_rate": 3.193426091467179e-06, + "loss": 0.8048507, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 5282, + "time_per_iteration": 2.4879021644592285 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.02429008, + "balance_loss_mlp": 1.04685783, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.066002014025373, + "language_loss": 0.66989815, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69162953, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 5283, + "time_per_iteration": 2.4914467334747314 + }, + { + "auxiliary_loss_clip": 0.01037416, + "auxiliary_loss_mlp": 0.01002537, + "balance_loss_clip": 1.00047421, + "balance_loss_mlp": 1.00956297, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7287723120729913, + "language_loss": 0.52796859, + "learning_rate": 3.192800950261958e-06, + "loss": 0.5483681, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.27734375, + "step": 5284, + "time_per_iteration": 3.0077779293060303 + }, + { + "auxiliary_loss_clip": 0.01137201, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02314341, + "balance_loss_mlp": 1.04976773, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.732541053937659, + "language_loss": 0.7061168, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72786701, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 5285, + "time_per_iteration": 2.4796152114868164 + }, + { + "auxiliary_loss_clip": 0.0103775, + "auxiliary_loss_mlp": 0.01003604, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00987303, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8184329386673247, + "language_loss": 0.60497808, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.6253916, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27929688, + "step": 5286, + "time_per_iteration": 3.060959815979004 + }, + { + "auxiliary_loss_clip": 0.01131379, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.02701449, + "balance_loss_mlp": 1.04520202, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.8142745455991967, + "language_loss": 0.72112805, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74286544, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 5287, + "time_per_iteration": 2.480926752090454 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03083003, + "balance_loss_mlp": 1.04454064, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.8467549942081902, + "language_loss": 0.75335222, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77514231, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 5288, + "time_per_iteration": 2.4506337642669678 + }, + { + "auxiliary_loss_clip": 0.01123463, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.02344155, + "balance_loss_mlp": 1.04175711, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 2.214262263159222, + "language_loss": 0.87642509, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89802694, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8203125, + "step": 5289, + "time_per_iteration": 2.4887404441833496 + }, + { + "auxiliary_loss_clip": 0.01127988, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.04635859, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.8563377401537928, + "language_loss": 0.67677546, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69844842, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5290, + "time_per_iteration": 2.4699981212615967 + }, + { + "auxiliary_loss_clip": 0.01130209, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.02983999, + "balance_loss_mlp": 1.04348135, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.9889060202243536, + "language_loss": 0.79926544, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82102132, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 5291, + "time_per_iteration": 2.5350663661956787 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.02160883, + "balance_loss_mlp": 1.04684091, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.2851564798864694, + "language_loss": 0.79887748, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82058293, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5292, + "time_per_iteration": 2.4561853408813477 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.02173245, + "balance_loss_mlp": 1.04506028, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6321803022225574, + "language_loss": 0.74406421, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.76565492, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5293, + "time_per_iteration": 2.562264919281006 + }, + { + "auxiliary_loss_clip": 0.01127349, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02864981, + "balance_loss_mlp": 1.04655647, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.669926034583184, + "language_loss": 0.74003655, + "learning_rate": 3.189672532265379e-06, + "loss": 0.7617321, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.80859375, + "step": 5294, + "time_per_iteration": 2.511491537094116 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04616928, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.856323864882145, + "language_loss": 0.76211727, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78377414, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5295, + "time_per_iteration": 2.482302665710449 + }, + { + "auxiliary_loss_clip": 0.01134404, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.02765322, + "balance_loss_mlp": 1.04831004, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6316405915506296, + "language_loss": 0.69476807, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71653676, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5296, + "time_per_iteration": 2.4972259998321533 + }, + { + "auxiliary_loss_clip": 0.01129364, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04513788, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 2.3772504575271367, + "language_loss": 0.77559733, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79728031, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5297, + "time_per_iteration": 2.5681862831115723 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01533866, + "balance_loss_mlp": 1.04480934, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.9869765921291695, + "language_loss": 0.79451257, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81608367, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5298, + "time_per_iteration": 2.4990038871765137 + }, + { + "auxiliary_loss_clip": 0.01132136, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.04609096, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 2.132815699592654, + "language_loss": 0.7431671, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.7648803, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 5299, + "time_per_iteration": 2.4902234077453613 + }, + { + "auxiliary_loss_clip": 0.01130922, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02775824, + "balance_loss_mlp": 1.04395795, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 5.1444082132017925, + "language_loss": 0.7834971, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80523366, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5300, + "time_per_iteration": 2.476113796234131 + }, + { + "auxiliary_loss_clip": 0.01127423, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.02245879, + "balance_loss_mlp": 1.04332328, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 4.220537638442504, + "language_loss": 0.8416568, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86331153, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5301, + "time_per_iteration": 2.4672341346740723 + }, + { + "auxiliary_loss_clip": 0.01132761, + "auxiliary_loss_mlp": 0.01045513, + "balance_loss_clip": 1.0299325, + "balance_loss_mlp": 1.05064154, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4555807672502277, + "language_loss": 0.77689236, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79867512, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5302, + "time_per_iteration": 2.4480254650115967 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02197289, + "balance_loss_mlp": 1.0458461, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.6601771821563076, + "language_loss": 0.79729378, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81892729, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8046875, + "step": 5303, + "time_per_iteration": 5.451193809509277 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.04810047, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.065727829234295, + "language_loss": 0.72734123, + "learning_rate": 3.186539603020047e-06, + "loss": 0.74916923, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 5304, + "time_per_iteration": 3.835230588912964 + }, + { + "auxiliary_loss_clip": 0.01126733, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.02546668, + "balance_loss_mlp": 1.04595399, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.8866410100018438, + "language_loss": 0.71773344, + "learning_rate": 3.186226062434068e-06, + "loss": 0.73939252, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80859375, + "step": 5305, + "time_per_iteration": 2.5330212116241455 + }, + { + "auxiliary_loss_clip": 0.01129402, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.02209806, + "balance_loss_mlp": 1.0472002, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.6861128411196662, + "language_loss": 0.64708328, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66873765, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5306, + "time_per_iteration": 2.4788570404052734 + }, + { + "auxiliary_loss_clip": 0.01135221, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.02714205, + "balance_loss_mlp": 1.05026746, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.161280639112344, + "language_loss": 0.79625881, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81803662, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5307, + "time_per_iteration": 2.5614371299743652 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02412832, + "balance_loss_mlp": 1.04311657, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.727529620646192, + "language_loss": 0.77898794, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.80062222, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 5308, + "time_per_iteration": 2.4443254470825195 + }, + { + "auxiliary_loss_clip": 0.01142678, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_clip": 1.03182518, + "balance_loss_mlp": 1.05046844, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 5.1649453810283426, + "language_loss": 0.74302876, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76494527, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 5309, + "time_per_iteration": 2.494800090789795 + }, + { + "auxiliary_loss_clip": 0.0112957, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.01998436, + "balance_loss_mlp": 1.04589248, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.754429841361115, + "language_loss": 0.82606339, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84770352, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5310, + "time_per_iteration": 2.4630603790283203 + }, + { + "auxiliary_loss_clip": 0.01129012, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.02762246, + "balance_loss_mlp": 1.04536486, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4405475768569584, + "language_loss": 0.78319013, + "learning_rate": 3.184343874716412e-06, + "loss": 0.8048929, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8359375, + "step": 5311, + "time_per_iteration": 2.5892724990844727 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01040389, + "balance_loss_clip": 1.02419996, + "balance_loss_mlp": 1.04695129, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.475613964939968, + "language_loss": 0.84316272, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86487615, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 5312, + "time_per_iteration": 2.4625802040100098 + }, + { + "auxiliary_loss_clip": 0.01137215, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.02808809, + "balance_loss_mlp": 1.0480628, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.3910939905221302, + "language_loss": 0.78584075, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 5313, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.02133918, + "balance_loss_mlp": 1.04814112, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.1643333364087582, + "language_loss": 0.85868084, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88036746, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5314, + "time_per_iteration": 2.4721946716308594 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02502346, + "balance_loss_mlp": 1.04725409, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7188296838329389, + "language_loss": 0.79836512, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82008839, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5315, + "time_per_iteration": 2.512554407119751 + }, + { + "auxiliary_loss_clip": 0.01135172, + "auxiliary_loss_mlp": 0.01049715, + "balance_loss_clip": 1.03331804, + "balance_loss_mlp": 1.0493269, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 6.566744634036759, + "language_loss": 0.67652613, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69837505, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5316, + "time_per_iteration": 2.4364819526672363 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.02105474, + "balance_loss_mlp": 1.04888916, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4751284993654519, + "language_loss": 0.69336772, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71505511, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84765625, + "step": 5317, + "time_per_iteration": 2.6055562496185303 + }, + { + "auxiliary_loss_clip": 0.01043016, + "auxiliary_loss_mlp": 0.0100349, + "balance_loss_clip": 1.00143993, + "balance_loss_mlp": 1.01474404, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7259742625655435, + "language_loss": 0.53048342, + "learning_rate": 3.182145945801628e-06, + "loss": 0.5509485, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.28320312, + "step": 5318, + "time_per_iteration": 3.200087308883667 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.02311563, + "balance_loss_mlp": 1.04900801, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.839211184718713, + "language_loss": 0.83865941, + "learning_rate": 3.181831776553012e-06, + "loss": 0.8603549, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5319, + "time_per_iteration": 2.471498966217041 + }, + { + "auxiliary_loss_clip": 0.01131434, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.0279578, + "balance_loss_mlp": 1.04728413, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.3959306603032393, + "language_loss": 0.63542199, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65716517, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5320, + "time_per_iteration": 2.5526087284088135 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.02528036, + "balance_loss_mlp": 1.04970324, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.9943779690432752, + "language_loss": 0.70519614, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 5321, + "time_per_iteration": 2.5262763500213623 + }, + { + "auxiliary_loss_clip": 0.01141108, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.04030156, + "balance_loss_mlp": 1.05110431, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.2234904552907238, + "language_loss": 0.86543447, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88741434, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 5322, + "time_per_iteration": 2.4432008266448975 + }, + { + "auxiliary_loss_clip": 0.01132235, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02119207, + "balance_loss_mlp": 1.04827893, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7854648356549414, + "language_loss": 0.82820231, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.84988427, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5323, + "time_per_iteration": 2.554539680480957 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02157664, + "balance_loss_mlp": 1.04700553, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8735349940723531, + "language_loss": 0.77858555, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.8002646, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5324, + "time_per_iteration": 2.452894687652588 + }, + { + "auxiliary_loss_clip": 0.0113163, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.04770339, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.8150910160625646, + "language_loss": 0.80162597, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82328951, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5325, + "time_per_iteration": 2.5261802673339844 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02690446, + "balance_loss_mlp": 1.04872847, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.8959189814779316, + "language_loss": 0.75171864, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77346826, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5326, + "time_per_iteration": 2.5300135612487793 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02285552, + "balance_loss_mlp": 1.04836321, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4421847054475023, + "language_loss": 0.80826092, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82993662, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5327, + "time_per_iteration": 2.5393614768981934 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04888535, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5386676468863185, + "language_loss": 0.77926928, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80099857, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5328, + "time_per_iteration": 2.471806287765503 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 1.04632294, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 2.9951100938200765, + "language_loss": 0.73971635, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76145625, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 5329, + "time_per_iteration": 2.52327561378479 + }, + { + "auxiliary_loss_clip": 0.01127399, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02012336, + "balance_loss_mlp": 1.04675198, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 2.060461898980319, + "language_loss": 0.71036464, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73197591, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8046875, + "step": 5330, + "time_per_iteration": 2.4405477046966553 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01049965, + "balance_loss_clip": 1.03343058, + "balance_loss_mlp": 1.0474323, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.7909305839918348, + "language_loss": 0.80022657, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82208663, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 5331, + "time_per_iteration": 2.5934245586395264 + }, + { + "auxiliary_loss_clip": 0.01037799, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.01001608, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8366333048595008, + "language_loss": 0.57806182, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59848487, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.27734375, + "step": 5332, + "time_per_iteration": 2.9984278678894043 + }, + { + "auxiliary_loss_clip": 0.01134361, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02450585, + "balance_loss_mlp": 1.04747975, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7943987990453594, + "language_loss": 0.73309821, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75483477, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.87109375, + "step": 5333, + "time_per_iteration": 2.554401159286499 + }, + { + "auxiliary_loss_clip": 0.01133668, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.02686942, + "balance_loss_mlp": 1.04836345, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.5896288664703238, + "language_loss": 0.71050882, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.73227012, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5334, + "time_per_iteration": 2.468472957611084 + }, + { + "auxiliary_loss_clip": 0.01132404, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.02658951, + "balance_loss_mlp": 1.04644001, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9528247502362917, + "language_loss": 0.77601135, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.797755, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5335, + "time_per_iteration": 2.524211883544922 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02519548, + "balance_loss_mlp": 1.04687452, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.5197552931214375, + "language_loss": 0.68353152, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70525241, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 5336, + "time_per_iteration": 2.5674326419830322 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01045646, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.04688144, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7063748564330914, + "language_loss": 0.7895453, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81131858, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5337, + "time_per_iteration": 2.5010595321655273 + }, + { + "auxiliary_loss_clip": 0.01124535, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.02194548, + "balance_loss_mlp": 1.04505002, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.7193225847880926, + "language_loss": 0.73997593, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76157737, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5338, + "time_per_iteration": 2.4961647987365723 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.02633142, + "balance_loss_mlp": 1.04477298, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.8336519924948942, + "language_loss": 0.63149244, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65323097, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5339, + "time_per_iteration": 2.5218987464904785 + }, + { + "auxiliary_loss_clip": 0.01130495, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.02409506, + "balance_loss_mlp": 1.04546928, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 1.814332726776551, + "language_loss": 0.81917858, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84087962, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5340, + "time_per_iteration": 2.427483558654785 + }, + { + "auxiliary_loss_clip": 0.0113181, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.02847123, + "balance_loss_mlp": 1.04696941, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.7172536004624983, + "language_loss": 0.7620244, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78377569, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 5341, + "time_per_iteration": 2.4785468578338623 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02154231, + "balance_loss_mlp": 1.04897809, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.9213308470980235, + "language_loss": 0.78627086, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.80794168, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5342, + "time_per_iteration": 2.4524106979370117 + }, + { + "auxiliary_loss_clip": 0.01133398, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02290499, + "balance_loss_mlp": 1.04772902, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.762302479650767, + "language_loss": 0.74934483, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77106899, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5343, + "time_per_iteration": 2.4744415283203125 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.02040279, + "balance_loss_mlp": 1.04623377, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.6135516142824962, + "language_loss": 0.82859504, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85026079, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5344, + "time_per_iteration": 2.47578763961792 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.02407503, + "balance_loss_mlp": 1.04474425, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.07297685310976, + "language_loss": 0.79812628, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81983352, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5345, + "time_per_iteration": 5.33889365196228 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.02149296, + "balance_loss_mlp": 1.04473424, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.8810220564208493, + "language_loss": 0.83404821, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85571885, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.859375, + "step": 5346, + "time_per_iteration": 2.500577688217163 + }, + { + "auxiliary_loss_clip": 0.01131977, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02362108, + "balance_loss_mlp": 1.04492784, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4095386913443633, + "language_loss": 0.81571388, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83742809, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 5347, + "time_per_iteration": 2.4491653442382812 + }, + { + "auxiliary_loss_clip": 0.01130206, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.04715562, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.9965712334987884, + "language_loss": 0.79898697, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82067955, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5348, + "time_per_iteration": 2.471261501312256 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.04691792, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 1.9690807455187813, + "language_loss": 0.8506968, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87250197, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5349, + "time_per_iteration": 2.4376416206359863 + }, + { + "auxiliary_loss_clip": 0.01130553, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.02215409, + "balance_loss_mlp": 1.04589188, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.7092259574450879, + "language_loss": 0.80862331, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83030069, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5350, + "time_per_iteration": 2.463998794555664 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.04548049, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.7709203173786705, + "language_loss": 0.79856229, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82025862, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 5351, + "time_per_iteration": 2.5017340183258057 + }, + { + "auxiliary_loss_clip": 0.01129171, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.02396047, + "balance_loss_mlp": 1.04505897, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.701097630272038, + "language_loss": 0.75491166, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77660662, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5352, + "time_per_iteration": 2.4916653633117676 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02179837, + "balance_loss_mlp": 1.0472436, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.8428416092094815, + "language_loss": 0.8174473, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83915108, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5353, + "time_per_iteration": 2.4554946422576904 + }, + { + "auxiliary_loss_clip": 0.01127699, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.02147865, + "balance_loss_mlp": 1.04577875, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.533417142425662, + "language_loss": 0.73054826, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75219929, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5354, + "time_per_iteration": 2.521679639816284 + }, + { + "auxiliary_loss_clip": 0.01129194, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.01830053, + "balance_loss_mlp": 1.04482782, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5056594732405602, + "language_loss": 0.8349731, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.8565954, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5355, + "time_per_iteration": 2.4590871334075928 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.0299834, + "balance_loss_mlp": 1.04840243, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.2450583198173737, + "language_loss": 0.71577442, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73757267, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 5356, + "time_per_iteration": 2.4499382972717285 + }, + { + "auxiliary_loss_clip": 0.01137452, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.0196538, + "balance_loss_mlp": 1.04720378, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5072162620412968, + "language_loss": 0.68480343, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70654052, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 5357, + "time_per_iteration": 2.449125289916992 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01002103, + "balance_loss_clip": 1.00029111, + "balance_loss_mlp": 1.01435876, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7023861387911429, + "language_loss": 0.58256829, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60301042, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.27734375, + "step": 5358, + "time_per_iteration": 3.1561930179595947 + }, + { + "auxiliary_loss_clip": 0.01130123, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.02506542, + "balance_loss_mlp": 1.04423356, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 5.918956850418863, + "language_loss": 0.83524048, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85695517, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5359, + "time_per_iteration": 2.4850337505340576 + }, + { + "auxiliary_loss_clip": 0.01132117, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.019122, + "balance_loss_mlp": 1.04514802, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.5557598040672038, + "language_loss": 0.79817981, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5360, + "time_per_iteration": 2.476698637008667 + }, + { + "auxiliary_loss_clip": 0.01040711, + "auxiliary_loss_mlp": 0.00999439, + "balance_loss_clip": 0.99754351, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.750004294413456, + "language_loss": 0.5697335, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59013498, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27539062, + "step": 5361, + "time_per_iteration": 2.933368444442749 + }, + { + "auxiliary_loss_clip": 0.01129938, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.02452111, + "balance_loss_mlp": 1.04625082, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.730134050345621, + "language_loss": 0.71349204, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73518884, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5362, + "time_per_iteration": 2.508444309234619 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.04685211, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6566995758494631, + "language_loss": 0.74008292, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76178837, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8359375, + "step": 5363, + "time_per_iteration": 2.530428409576416 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.02481735, + "balance_loss_mlp": 1.04535139, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.5112112412179624, + "language_loss": 0.77012563, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79187649, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 5364, + "time_per_iteration": 2.475532054901123 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.02747917, + "balance_loss_mlp": 1.04455853, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.1198351151285992, + "language_loss": 0.77043676, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79215652, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5365, + "time_per_iteration": 2.4466004371643066 + }, + { + "auxiliary_loss_clip": 0.01133051, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03444982, + "balance_loss_mlp": 1.04861832, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5183743876703555, + "language_loss": 0.76853883, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79036558, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5366, + "time_per_iteration": 2.4716286659240723 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04463363, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.6325357922005805, + "language_loss": 0.7200039, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74173188, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5367, + "time_per_iteration": 2.4936037063598633 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.02759588, + "balance_loss_mlp": 1.04335558, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.8801069032327764, + "language_loss": 0.7456941, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76737112, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5368, + "time_per_iteration": 2.436897039413452 + }, + { + "auxiliary_loss_clip": 0.01125271, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.02592432, + "balance_loss_mlp": 1.04390144, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.5502047591083525, + "language_loss": 0.79212499, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81378186, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5369, + "time_per_iteration": 2.516191244125366 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02042747, + "balance_loss_mlp": 1.04432988, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8370527927944635, + "language_loss": 0.83173579, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85333049, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5370, + "time_per_iteration": 2.423494338989258 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.02367377, + "balance_loss_mlp": 1.04524064, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 1.743608915284185, + "language_loss": 0.83372939, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85539752, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5371, + "time_per_iteration": 2.481677532196045 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01048903, + "balance_loss_clip": 1.0323211, + "balance_loss_mlp": 1.04514813, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.043238736788368, + "language_loss": 0.88539696, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90720367, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5372, + "time_per_iteration": 2.434785842895508 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.03011537, + "balance_loss_mlp": 1.04532862, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9701661898720624, + "language_loss": 0.73064935, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75238496, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5373, + "time_per_iteration": 2.509288787841797 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.0217371, + "balance_loss_mlp": 1.04496944, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.118108535598075, + "language_loss": 0.81306481, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83469176, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5374, + "time_per_iteration": 2.43719744682312 + }, + { + "auxiliary_loss_clip": 0.01122361, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.02135515, + "balance_loss_mlp": 1.04158425, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.0253542373007223, + "language_loss": 0.87507123, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89665556, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80859375, + "step": 5375, + "time_per_iteration": 2.5192272663116455 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04312396, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8491566525281582, + "language_loss": 0.75873786, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78040886, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5376, + "time_per_iteration": 2.463103771209717 + }, + { + "auxiliary_loss_clip": 0.01123814, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.04269242, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.5890241026671568, + "language_loss": 0.67173672, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69330645, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5377, + "time_per_iteration": 2.5341343879699707 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02922571, + "balance_loss_mlp": 1.04433763, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.5071806558198568, + "language_loss": 0.7231617, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74489522, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5378, + "time_per_iteration": 2.4838621616363525 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.0174818, + "balance_loss_mlp": 1.04056036, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9527598104570445, + "language_loss": 0.82083338, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84239388, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5379, + "time_per_iteration": 2.5433154106140137 + }, + { + "auxiliary_loss_clip": 0.01127314, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.04230165, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.9705325619840932, + "language_loss": 0.78379917, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80539739, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 5380, + "time_per_iteration": 2.5306878089904785 + }, + { + "auxiliary_loss_clip": 0.0112988, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.02207887, + "balance_loss_mlp": 1.04637241, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.5992937517204726, + "language_loss": 0.76871669, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79037952, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5381, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02228761, + "balance_loss_mlp": 1.04212475, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.912812068704809, + "language_loss": 0.71864545, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74021101, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5382, + "time_per_iteration": 2.488344430923462 + }, + { + "auxiliary_loss_clip": 0.01127382, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.0424943, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.8562908675977754, + "language_loss": 0.70752692, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72914088, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5383, + "time_per_iteration": 2.5236711502075195 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0259378, + "balance_loss_mlp": 1.0442363, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.094388352971362, + "language_loss": 0.78742963, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80905938, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 5384, + "time_per_iteration": 2.4685723781585693 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.0222249, + "balance_loss_mlp": 1.04443073, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.069351852322995, + "language_loss": 0.74553645, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76720881, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 5385, + "time_per_iteration": 2.46968936920166 + }, + { + "auxiliary_loss_clip": 0.01127931, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02101183, + "balance_loss_mlp": 1.04604125, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.8196037573439483, + "language_loss": 0.72068852, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74232352, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5386, + "time_per_iteration": 2.559480667114258 + }, + { + "auxiliary_loss_clip": 0.01128094, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.02119136, + "balance_loss_mlp": 1.04176617, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.8525904099951498, + "language_loss": 0.94343817, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96508765, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 5387, + "time_per_iteration": 5.378048896789551 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.02227962, + "balance_loss_mlp": 1.04373097, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.7647642243142747, + "language_loss": 0.77544433, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79709506, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5388, + "time_per_iteration": 2.4804563522338867 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01554048, + "balance_loss_mlp": 1.04277194, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.092216766577811, + "language_loss": 0.71867704, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74025786, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5389, + "time_per_iteration": 2.5753331184387207 + }, + { + "auxiliary_loss_clip": 0.01128194, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.0233078, + "balance_loss_mlp": 1.04672205, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 2.0374979548818497, + "language_loss": 0.80883735, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83050573, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 5390, + "time_per_iteration": 2.479557991027832 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.02911294, + "balance_loss_mlp": 1.04798484, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.0682587448682384, + "language_loss": 0.72983515, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75158268, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5391, + "time_per_iteration": 2.4689247608184814 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.02728176, + "balance_loss_mlp": 1.04465139, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.6356435132494873, + "language_loss": 0.77357036, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79523861, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5392, + "time_per_iteration": 2.4942643642425537 + }, + { + "auxiliary_loss_clip": 0.01129141, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04454243, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.035025217222515, + "language_loss": 0.62445068, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64614469, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5393, + "time_per_iteration": 2.5294058322906494 + }, + { + "auxiliary_loss_clip": 0.01127178, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.02292883, + "balance_loss_mlp": 1.0455395, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.541011228274946, + "language_loss": 0.8250984, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84674609, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5394, + "time_per_iteration": 2.5204803943634033 + }, + { + "auxiliary_loss_clip": 0.01125244, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.03089094, + "balance_loss_mlp": 1.04596353, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.8431569167236632, + "language_loss": 0.81585443, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83754981, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.79296875, + "step": 5395, + "time_per_iteration": 2.481722116470337 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.02844906, + "balance_loss_mlp": 1.04834461, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.644291671680186, + "language_loss": 0.83163011, + "learning_rate": 3.157507073287417e-06, + "loss": 0.8533138, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5396, + "time_per_iteration": 2.5014734268188477 + }, + { + "auxiliary_loss_clip": 0.01133358, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.02392137, + "balance_loss_mlp": 1.04687238, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.8637158339296453, + "language_loss": 0.75718713, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77891421, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5397, + "time_per_iteration": 2.475958824157715 + }, + { + "auxiliary_loss_clip": 0.01125578, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.01953566, + "balance_loss_mlp": 1.04540443, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.571224523552484, + "language_loss": 0.66835862, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68995398, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5398, + "time_per_iteration": 2.447065830230713 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.0183022, + "balance_loss_mlp": 1.04326463, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4279244162742584, + "language_loss": 0.73232102, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75389397, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8203125, + "step": 5399, + "time_per_iteration": 2.466137409210205 + }, + { + "auxiliary_loss_clip": 0.01129831, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.02016079, + "balance_loss_mlp": 1.04749155, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.110147681467196, + "language_loss": 0.71391356, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73556215, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5400, + "time_per_iteration": 2.484243631362915 + }, + { + "auxiliary_loss_clip": 0.01128373, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.04439175, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 3.048924003265154, + "language_loss": 0.79583031, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81746894, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5401, + "time_per_iteration": 2.5695505142211914 + }, + { + "auxiliary_loss_clip": 0.01130508, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.04700303, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4209306386542333, + "language_loss": 0.87675726, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89848959, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 5402, + "time_per_iteration": 2.4811201095581055 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.04369164, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.934597728175988, + "language_loss": 0.84513289, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86672628, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5403, + "time_per_iteration": 2.418501377105713 + }, + { + "auxiliary_loss_clip": 0.01129275, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.02852631, + "balance_loss_mlp": 1.05024314, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.0001546098828955, + "language_loss": 0.87642342, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89813483, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5404, + "time_per_iteration": 2.5094971656799316 + }, + { + "auxiliary_loss_clip": 0.01125879, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.02413273, + "balance_loss_mlp": 1.04579973, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6336968005079966, + "language_loss": 0.72491479, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74656296, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5405, + "time_per_iteration": 2.4927978515625 + }, + { + "auxiliary_loss_clip": 0.01125757, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.02147698, + "balance_loss_mlp": 1.04514825, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8637721662214948, + "language_loss": 0.83356953, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85518444, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80859375, + "step": 5406, + "time_per_iteration": 2.534508228302002 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.02241969, + "balance_loss_mlp": 1.0469048, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.836635199790601, + "language_loss": 0.8826412, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90428072, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5407, + "time_per_iteration": 2.4199326038360596 + }, + { + "auxiliary_loss_clip": 0.01127405, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.04602861, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.5140887230520799, + "language_loss": 0.69643426, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71806979, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5408, + "time_per_iteration": 2.5646731853485107 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.04438102, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.6429750268405912, + "language_loss": 0.77442145, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79608637, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 5409, + "time_per_iteration": 2.450200080871582 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.03142262, + "balance_loss_mlp": 1.04331136, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.3862040562488716, + "language_loss": 0.83582234, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85758531, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5410, + "time_per_iteration": 2.5161662101745605 + }, + { + "auxiliary_loss_clip": 0.01121858, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02089429, + "balance_loss_mlp": 1.04224813, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.5577179591930796, + "language_loss": 0.71270931, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73427641, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5411, + "time_per_iteration": 2.4465057849884033 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02544606, + "balance_loss_mlp": 1.04381669, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6024997274503978, + "language_loss": 0.83103073, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85267961, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.81640625, + "step": 5412, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.01129762, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.01963782, + "balance_loss_mlp": 1.04417348, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 2.3149031646834577, + "language_loss": 0.80794364, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82959628, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5413, + "time_per_iteration": 2.483309030532837 + }, + { + "auxiliary_loss_clip": 0.01128818, + "auxiliary_loss_mlp": 0.01042957, + "balance_loss_clip": 1.02723312, + "balance_loss_mlp": 1.04606462, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.5892127721025033, + "language_loss": 0.76887989, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79059768, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5414, + "time_per_iteration": 2.4696640968322754 + }, + { + "auxiliary_loss_clip": 0.01039619, + "auxiliary_loss_mlp": 0.01008091, + "balance_loss_clip": 1.00601661, + "balance_loss_mlp": 1.01271892, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9084647328862615, + "language_loss": 0.64009887, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66057593, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.26953125, + "step": 5415, + "time_per_iteration": 2.982389450073242 + }, + { + "auxiliary_loss_clip": 0.01124624, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.02214265, + "balance_loss_mlp": 1.04286838, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 2.942597496869342, + "language_loss": 0.74265057, + "learning_rate": 3.151146171224075e-06, + "loss": 0.764265, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5416, + "time_per_iteration": 2.526956558227539 + }, + { + "auxiliary_loss_clip": 0.01039656, + "auxiliary_loss_mlp": 0.01005548, + "balance_loss_clip": 1.00335431, + "balance_loss_mlp": 1.01254702, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7736939008633222, + "language_loss": 0.57947183, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59992385, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.26953125, + "step": 5417, + "time_per_iteration": 3.1500296592712402 + }, + { + "auxiliary_loss_clip": 0.01038219, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.0002141, + "balance_loss_mlp": 1.01140058, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.9133944403169288, + "language_loss": 0.63476181, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65516579, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.26953125, + "step": 5418, + "time_per_iteration": 3.1724026203155518 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.02739, + "balance_loss_mlp": 1.0441196, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 3.240595355482155, + "language_loss": 0.69061959, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71229619, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5419, + "time_per_iteration": 2.4643847942352295 + }, + { + "auxiliary_loss_clip": 0.01125895, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.01951957, + "balance_loss_mlp": 1.04326844, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 2.1209544014848443, + "language_loss": 0.77064359, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79225302, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5420, + "time_per_iteration": 2.5241270065307617 + }, + { + "auxiliary_loss_clip": 0.01128645, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.04400003, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.4823274263144444, + "language_loss": 0.80134791, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82298517, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5421, + "time_per_iteration": 2.5376439094543457 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02359045, + "balance_loss_mlp": 1.04254711, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5045024534641303, + "language_loss": 0.75446749, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77606434, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5422, + "time_per_iteration": 2.5713820457458496 + }, + { + "auxiliary_loss_clip": 0.01128336, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02324986, + "balance_loss_mlp": 1.04553628, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.780294141224906, + "language_loss": 0.62795889, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64963388, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5423, + "time_per_iteration": 2.4667959213256836 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.04085255, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 4.488088575635961, + "language_loss": 0.74664211, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76814055, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 5424, + "time_per_iteration": 2.488187313079834 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.02231038, + "balance_loss_mlp": 1.04298568, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6359586167011877, + "language_loss": 0.76958472, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79116821, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5425, + "time_per_iteration": 2.5025157928466797 + }, + { + "auxiliary_loss_clip": 0.01127865, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.03051138, + "balance_loss_mlp": 1.04193544, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 4.663874352034687, + "language_loss": 0.78857136, + "learning_rate": 3.147959166423428e-06, + "loss": 0.8103227, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5426, + "time_per_iteration": 2.484064817428589 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.02116871, + "balance_loss_mlp": 1.04324198, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.7688447582142532, + "language_loss": 0.74363142, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76525187, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.81640625, + "step": 5427, + "time_per_iteration": 2.4785962104797363 + }, + { + "auxiliary_loss_clip": 0.0112706, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_clip": 1.02742934, + "balance_loss_mlp": 1.04290414, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.911492416062928, + "language_loss": 0.79305124, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8147524, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83984375, + "step": 5428, + "time_per_iteration": 3.9864413738250732 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.02597678, + "balance_loss_mlp": 1.04084587, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7222830625250152, + "language_loss": 0.71369523, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73534036, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5429, + "time_per_iteration": 3.8856096267700195 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.02448976, + "balance_loss_mlp": 1.04308093, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.889570703315701, + "language_loss": 0.78612322, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80775696, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5430, + "time_per_iteration": 2.4374818801879883 + }, + { + "auxiliary_loss_clip": 0.01128219, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02077675, + "balance_loss_mlp": 1.04359281, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.8594684871120744, + "language_loss": 0.83897448, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86063492, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84765625, + "step": 5431, + "time_per_iteration": 2.4513139724731445 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.02431297, + "balance_loss_mlp": 1.04116321, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.7565110160676718, + "language_loss": 0.70459324, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72619462, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5432, + "time_per_iteration": 2.529365301132202 + }, + { + "auxiliary_loss_clip": 0.01123519, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.02182746, + "balance_loss_mlp": 1.04076195, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4205622330102, + "language_loss": 0.84161848, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86321318, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5433, + "time_per_iteration": 2.4302597045898438 + }, + { + "auxiliary_loss_clip": 0.01123612, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.02132881, + "balance_loss_mlp": 1.0439055, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.4699213962063424, + "language_loss": 0.85906386, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88065541, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 5434, + "time_per_iteration": 2.496676445007324 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.02083361, + "balance_loss_mlp": 1.04468119, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.8331918492971015, + "language_loss": 0.87817061, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89981961, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5435, + "time_per_iteration": 2.51159405708313 + }, + { + "auxiliary_loss_clip": 0.0112533, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.02140474, + "balance_loss_mlp": 1.04326773, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.5496215899058443, + "language_loss": 0.76460963, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78622043, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5436, + "time_per_iteration": 2.43637752532959 + }, + { + "auxiliary_loss_clip": 0.01125315, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02040625, + "balance_loss_mlp": 1.04435849, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.5905557916714361, + "language_loss": 0.72127515, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74287689, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5437, + "time_per_iteration": 2.493673086166382 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.02236819, + "balance_loss_mlp": 1.04143524, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.6336098458574233, + "language_loss": 0.64049256, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66214842, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 5438, + "time_per_iteration": 2.5062596797943115 + }, + { + "auxiliary_loss_clip": 0.01126505, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01943088, + "balance_loss_mlp": 1.04510128, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.5452802319075516, + "language_loss": 0.74544024, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76704717, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5439, + "time_per_iteration": 2.501279830932617 + }, + { + "auxiliary_loss_clip": 0.01126727, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.02985907, + "balance_loss_mlp": 1.04374349, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.6196339079167323, + "language_loss": 0.75183308, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77355272, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5440, + "time_per_iteration": 2.507341146469116 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.03317571, + "balance_loss_mlp": 1.04308057, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9066250681455874, + "language_loss": 0.84613734, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86785924, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5441, + "time_per_iteration": 2.4737346172332764 + }, + { + "auxiliary_loss_clip": 0.01126255, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.02743292, + "balance_loss_mlp": 1.04209113, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.9602585650153952, + "language_loss": 0.8673979, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88908899, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5442, + "time_per_iteration": 2.4779980182647705 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02193677, + "balance_loss_mlp": 1.04526424, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.8849886885636646, + "language_loss": 0.77500421, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79669178, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8515625, + "step": 5443, + "time_per_iteration": 2.5263850688934326 + }, + { + "auxiliary_loss_clip": 0.01126577, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02428412, + "balance_loss_mlp": 1.04207098, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.0180593262473487, + "language_loss": 0.81630802, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83796823, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5444, + "time_per_iteration": 2.447061061859131 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.02335095, + "balance_loss_mlp": 1.04356718, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.9587875585664523, + "language_loss": 0.59421074, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61585242, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5445, + "time_per_iteration": 2.4542667865753174 + }, + { + "auxiliary_loss_clip": 0.01128674, + "auxiliary_loss_mlp": 0.01039464, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.04482532, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.043321690225375, + "language_loss": 0.88286638, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90454781, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8359375, + "step": 5446, + "time_per_iteration": 2.4518625736236572 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.02577102, + "balance_loss_mlp": 1.04609275, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.9059445881205361, + "language_loss": 0.78455317, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80631441, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87109375, + "step": 5447, + "time_per_iteration": 2.488555669784546 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.02621138, + "balance_loss_mlp": 1.04297531, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.7948266966340543, + "language_loss": 0.73349774, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75515163, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.82421875, + "step": 5448, + "time_per_iteration": 2.460759162902832 + }, + { + "auxiliary_loss_clip": 0.01125074, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.02788281, + "balance_loss_mlp": 1.04221821, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.3797343272994427, + "language_loss": 0.66896623, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69065142, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5449, + "time_per_iteration": 2.5101547241210938 + }, + { + "auxiliary_loss_clip": 0.01125182, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.02111173, + "balance_loss_mlp": 1.04373384, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.3889431777217922, + "language_loss": 0.65617704, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67778659, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5450, + "time_per_iteration": 2.4815587997436523 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.02644145, + "balance_loss_mlp": 1.04330397, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5376267502191867, + "language_loss": 0.77276003, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.7944392, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5451, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.0112906, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02348745, + "balance_loss_mlp": 1.04470944, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.4373215337565015, + "language_loss": 0.7011131, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72279859, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5452, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01121729, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01944947, + "balance_loss_mlp": 1.04188132, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.7019757848824575, + "language_loss": 0.78734571, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80890715, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5453, + "time_per_iteration": 2.493701219558716 + }, + { + "auxiliary_loss_clip": 0.01126073, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.01610184, + "balance_loss_mlp": 1.04306984, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.2894918901687333, + "language_loss": 0.75428879, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77585566, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5454, + "time_per_iteration": 2.5295286178588867 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02382326, + "balance_loss_mlp": 1.04198301, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.0725507665811826, + "language_loss": 0.77059573, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79217887, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5455, + "time_per_iteration": 2.426988124847412 + }, + { + "auxiliary_loss_clip": 0.0112837, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02304173, + "balance_loss_mlp": 1.04281068, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.669914346129418, + "language_loss": 0.74029738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76197511, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.85546875, + "step": 5456, + "time_per_iteration": 2.512131929397583 + }, + { + "auxiliary_loss_clip": 0.01126084, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03444123, + "balance_loss_mlp": 1.04250574, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.518027485126158, + "language_loss": 0.78283882, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80459797, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5457, + "time_per_iteration": 2.4819135665893555 + }, + { + "auxiliary_loss_clip": 0.0112739, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.02432334, + "balance_loss_mlp": 1.04155684, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.199350012619834, + "language_loss": 0.79332864, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81499034, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5458, + "time_per_iteration": 2.4749457836151123 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.01988721, + "balance_loss_mlp": 1.04204702, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 4.694290331797846, + "language_loss": 0.72896576, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75055289, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5459, + "time_per_iteration": 2.4506032466888428 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.02303815, + "balance_loss_mlp": 1.04444695, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8402325574836436, + "language_loss": 0.84511495, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86677814, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5460, + "time_per_iteration": 2.521491527557373 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02176023, + "balance_loss_mlp": 1.0420599, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7736363390075318, + "language_loss": 0.76822042, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78982782, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.83203125, + "step": 5461, + "time_per_iteration": 2.4919962882995605 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02015376, + "balance_loss_mlp": 1.04589903, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.6989905310418616, + "language_loss": 0.62835252, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65001822, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 5462, + "time_per_iteration": 2.6128923892974854 + }, + { + "auxiliary_loss_clip": 0.0112585, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.04426169, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.8014296603715538, + "language_loss": 0.78155506, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80315304, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5463, + "time_per_iteration": 2.5255165100097656 + }, + { + "auxiliary_loss_clip": 0.0112647, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.02001238, + "balance_loss_mlp": 1.04409099, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.049558292675733, + "language_loss": 0.7029627, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72457188, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5464, + "time_per_iteration": 2.460951089859009 + }, + { + "auxiliary_loss_clip": 0.01127719, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02505457, + "balance_loss_mlp": 1.04683673, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6142145677103121, + "language_loss": 0.72746348, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74913716, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5465, + "time_per_iteration": 2.4767887592315674 + }, + { + "auxiliary_loss_clip": 0.01128882, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.02208447, + "balance_loss_mlp": 1.04690027, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.6282981827525145, + "language_loss": 0.82756901, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.84922415, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5466, + "time_per_iteration": 2.463127613067627 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.02343404, + "balance_loss_mlp": 1.04421949, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.6977355395672606, + "language_loss": 0.79485095, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81649983, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5467, + "time_per_iteration": 2.482245683670044 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.02011502, + "balance_loss_mlp": 1.0452255, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5356074654715184, + "language_loss": 0.74795353, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76958692, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5468, + "time_per_iteration": 2.4828743934631348 + }, + { + "auxiliary_loss_clip": 0.01136832, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.02467322, + "balance_loss_mlp": 1.04996455, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.8525214053644714, + "language_loss": 0.78469932, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8064791, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5469, + "time_per_iteration": 2.455672264099121 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.02008545, + "balance_loss_mlp": 1.04602098, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.646072726718358, + "language_loss": 0.82014406, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84178579, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5470, + "time_per_iteration": 5.531651020050049 + }, + { + "auxiliary_loss_clip": 0.0112936, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.02315605, + "balance_loss_mlp": 1.04359245, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.806312825179731, + "language_loss": 0.67675972, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69843686, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5471, + "time_per_iteration": 2.7400858402252197 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.04856122, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.6357076803377442, + "language_loss": 0.65059721, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67237478, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5472, + "time_per_iteration": 2.530604124069214 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.0271014, + "balance_loss_mlp": 1.04821706, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6631612231063349, + "language_loss": 0.88497955, + "learning_rate": 3.13292213457912e-06, + "loss": 0.9067443, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 5473, + "time_per_iteration": 2.521026611328125 + }, + { + "auxiliary_loss_clip": 0.01133162, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.02669442, + "balance_loss_mlp": 1.0483191, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 2.3087074790673423, + "language_loss": 0.78349268, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80525613, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 5474, + "time_per_iteration": 2.4769628047943115 + }, + { + "auxiliary_loss_clip": 0.01047146, + "auxiliary_loss_mlp": 0.00999487, + "balance_loss_clip": 0.99740046, + "balance_loss_mlp": 1.02056372, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.888273800575083, + "language_loss": 0.60237771, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62284404, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.265625, + "step": 5475, + "time_per_iteration": 3.039971351623535 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01051514, + "balance_loss_clip": 1.03437138, + "balance_loss_mlp": 1.04512429, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5350164106808766, + "language_loss": 0.76634103, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78818846, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5476, + "time_per_iteration": 2.488698959350586 + }, + { + "auxiliary_loss_clip": 0.01131587, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.02640307, + "balance_loss_mlp": 1.04819024, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.8435246505513339, + "language_loss": 0.74520677, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76693243, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5477, + "time_per_iteration": 2.533641815185547 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.02036786, + "balance_loss_mlp": 1.04507232, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 1.9138938380730264, + "language_loss": 0.75581098, + "learning_rate": 3.131316843357713e-06, + "loss": 0.7773999, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5478, + "time_per_iteration": 2.4541866779327393 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.02218664, + "balance_loss_mlp": 1.04736805, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.6780134795902322, + "language_loss": 0.80241555, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82407916, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5479, + "time_per_iteration": 2.5348050594329834 + }, + { + "auxiliary_loss_clip": 0.01046129, + "auxiliary_loss_mlp": 0.01003977, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.01921439, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7411588561506779, + "language_loss": 0.56543052, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58593154, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.26953125, + "step": 5480, + "time_per_iteration": 3.121812343597412 + }, + { + "auxiliary_loss_clip": 0.01128951, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02871847, + "balance_loss_mlp": 1.04606879, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.656023636160042, + "language_loss": 0.77029848, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79203057, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5481, + "time_per_iteration": 2.4819936752319336 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02028024, + "balance_loss_mlp": 1.04622722, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.8057287203311059, + "language_loss": 0.78732938, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80897224, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5482, + "time_per_iteration": 2.501615285873413 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02601135, + "balance_loss_mlp": 1.04573894, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.6414395423474737, + "language_loss": 0.74055123, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76226085, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5483, + "time_per_iteration": 2.5213518142700195 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0187676, + "balance_loss_mlp": 1.04614615, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8373674608308554, + "language_loss": 0.75627816, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77788723, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5484, + "time_per_iteration": 2.543795108795166 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04699099, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.1329507570753243, + "language_loss": 0.7209897, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74267334, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5485, + "time_per_iteration": 2.4598846435546875 + }, + { + "auxiliary_loss_clip": 0.01124565, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.04448354, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.7963509228415293, + "language_loss": 0.80416954, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8258158, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5486, + "time_per_iteration": 2.5368754863739014 + }, + { + "auxiliary_loss_clip": 0.011236, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.02264309, + "balance_loss_mlp": 1.04300976, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3473245188806056, + "language_loss": 0.84351611, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86512625, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5487, + "time_per_iteration": 2.5140841007232666 + }, + { + "auxiliary_loss_clip": 0.01131842, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.02440929, + "balance_loss_mlp": 1.04636502, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.289610395509379, + "language_loss": 0.74163198, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76335323, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5488, + "time_per_iteration": 2.4159257411956787 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.02519917, + "balance_loss_mlp": 1.04548192, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.3379517114480004, + "language_loss": 0.72564352, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74732298, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5489, + "time_per_iteration": 2.4810056686401367 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.04076719, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.5348585918072235, + "language_loss": 0.88752508, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90908241, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5490, + "time_per_iteration": 2.448437452316284 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.02022719, + "balance_loss_mlp": 1.0403626, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.9493471797358817, + "language_loss": 0.83395195, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85551059, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5491, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.01126063, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.02492499, + "balance_loss_mlp": 1.04421842, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.715750342336911, + "language_loss": 0.77514994, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79680943, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5492, + "time_per_iteration": 2.4870479106903076 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.0269649, + "balance_loss_mlp": 1.04629827, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.2776411561569265, + "language_loss": 0.7450884, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76683223, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5493, + "time_per_iteration": 2.4506607055664062 + }, + { + "auxiliary_loss_clip": 0.01045286, + "auxiliary_loss_mlp": 0.01012729, + "balance_loss_clip": 1.01074982, + "balance_loss_mlp": 1.01881337, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7955029917088393, + "language_loss": 0.53910893, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55968904, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.265625, + "step": 5494, + "time_per_iteration": 3.0042550563812256 + }, + { + "auxiliary_loss_clip": 0.01124159, + "auxiliary_loss_mlp": 0.01037133, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.04378355, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.6073630563578136, + "language_loss": 0.87087989, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89249277, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5495, + "time_per_iteration": 2.4716837406158447 + }, + { + "auxiliary_loss_clip": 0.01128875, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.03133559, + "balance_loss_mlp": 1.04508138, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 3.5655917637781784, + "language_loss": 0.73526418, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75703049, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8359375, + "step": 5496, + "time_per_iteration": 2.531670570373535 + }, + { + "auxiliary_loss_clip": 0.01124295, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.01509058, + "balance_loss_mlp": 1.04384971, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.1703031984353514, + "language_loss": 0.72764325, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74917477, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5497, + "time_per_iteration": 2.5148839950561523 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.04340625, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.5654673530164307, + "language_loss": 0.80193126, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82350206, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5498, + "time_per_iteration": 2.517765522003174 + }, + { + "auxiliary_loss_clip": 0.01123393, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02397776, + "balance_loss_mlp": 1.03977811, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1435474357237405, + "language_loss": 0.76491725, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78653955, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5499, + "time_per_iteration": 2.5006067752838135 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 1.04131985, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.506886865759599, + "language_loss": 0.79332948, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81487471, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5500, + "time_per_iteration": 2.4859495162963867 + }, + { + "auxiliary_loss_clip": 0.01129022, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01607347, + "balance_loss_mlp": 1.04564214, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.164639953437845, + "language_loss": 0.66065335, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68225485, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 5501, + "time_per_iteration": 2.6189892292022705 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.04285216, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.260615362067107, + "language_loss": 0.77580702, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79748642, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5502, + "time_per_iteration": 2.4086782932281494 + }, + { + "auxiliary_loss_clip": 0.01130061, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.04632545, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 2.045089737815956, + "language_loss": 0.72346115, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74515176, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8359375, + "step": 5503, + "time_per_iteration": 2.5176749229431152 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01865053, + "balance_loss_mlp": 1.04248357, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.5402224202893484, + "language_loss": 0.75216055, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77374506, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5504, + "time_per_iteration": 2.530212879180908 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02438283, + "balance_loss_mlp": 1.04382253, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.6148817370045387, + "language_loss": 0.70049053, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72214913, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5505, + "time_per_iteration": 2.5212292671203613 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.02720845, + "balance_loss_mlp": 1.04601455, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.586520967819923, + "language_loss": 0.81541443, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83709103, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5506, + "time_per_iteration": 2.5494561195373535 + }, + { + "auxiliary_loss_clip": 0.01128621, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.02277398, + "balance_loss_mlp": 1.04704857, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.929478423939084, + "language_loss": 0.79097712, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81264055, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5507, + "time_per_iteration": 2.498659610748291 + }, + { + "auxiliary_loss_clip": 0.01123401, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.02594829, + "balance_loss_mlp": 1.04136062, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.6667627205960738, + "language_loss": 0.71733725, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73897743, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5508, + "time_per_iteration": 2.478593111038208 + }, + { + "auxiliary_loss_clip": 0.01124563, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01566064, + "balance_loss_mlp": 1.04539418, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.030813517097255, + "language_loss": 0.72023594, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74177837, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5509, + "time_per_iteration": 2.539806842803955 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.01975, + "balance_loss_mlp": 1.04503942, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.5191607241878, + "language_loss": 0.73049426, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75209701, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5510, + "time_per_iteration": 2.536083698272705 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.0429213, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.1286159820346984, + "language_loss": 0.87371129, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89530391, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5511, + "time_per_iteration": 2.4380695819854736 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.01986468, + "balance_loss_mlp": 1.04396749, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6025966363766477, + "language_loss": 0.72926772, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7507937, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5512, + "time_per_iteration": 5.464786767959595 + }, + { + "auxiliary_loss_clip": 0.01124343, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.02759719, + "balance_loss_mlp": 1.04466701, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.8365879467062751, + "language_loss": 0.72230887, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.7439692, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5513, + "time_per_iteration": 2.6175873279571533 + }, + { + "auxiliary_loss_clip": 0.01128264, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.01972222, + "balance_loss_mlp": 1.04398656, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.8557947519919487, + "language_loss": 0.68629253, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70792234, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5514, + "time_per_iteration": 2.4340810775756836 + }, + { + "auxiliary_loss_clip": 0.01127749, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.0251019, + "balance_loss_mlp": 1.04505849, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.411486097564539, + "language_loss": 0.66439879, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.6860956, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5515, + "time_per_iteration": 2.4983339309692383 + }, + { + "auxiliary_loss_clip": 0.01124572, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01960468, + "balance_loss_mlp": 1.04258537, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.4970111675637168, + "language_loss": 0.69111156, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71270084, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5516, + "time_per_iteration": 2.515367031097412 + }, + { + "auxiliary_loss_clip": 0.0112502, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.02154398, + "balance_loss_mlp": 1.04021645, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.365933570102145, + "language_loss": 0.80287617, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82448685, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 5517, + "time_per_iteration": 2.5149497985839844 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.04258931, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 2.188422581245926, + "language_loss": 0.74551105, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76709294, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5518, + "time_per_iteration": 2.450188159942627 + }, + { + "auxiliary_loss_clip": 0.01048984, + "auxiliary_loss_mlp": 0.01008888, + "balance_loss_clip": 1.00682592, + "balance_loss_mlp": 1.02244139, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6172932492598038, + "language_loss": 0.54346693, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56404567, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.265625, + "step": 5519, + "time_per_iteration": 3.167750358581543 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.0239042, + "balance_loss_mlp": 1.0434345, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 3.8105825888408855, + "language_loss": 0.78854358, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81018245, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5520, + "time_per_iteration": 2.451781988143921 + }, + { + "auxiliary_loss_clip": 0.01121269, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01835227, + "balance_loss_mlp": 1.04244733, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.656623957411012, + "language_loss": 0.76576293, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78729689, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7890625, + "step": 5521, + "time_per_iteration": 2.525865077972412 + }, + { + "auxiliary_loss_clip": 0.01126792, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.02932894, + "balance_loss_mlp": 1.04259682, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 3.3004720611075964, + "language_loss": 0.70353854, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72525376, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5522, + "time_per_iteration": 2.472001791000366 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.01739514, + "balance_loss_mlp": 1.04362595, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7154852702320889, + "language_loss": 0.74052203, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76206541, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5523, + "time_per_iteration": 2.4924776554107666 + }, + { + "auxiliary_loss_clip": 0.01122263, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.0211792, + "balance_loss_mlp": 1.04308188, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.6905303226226114, + "language_loss": 0.82272083, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84430826, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 5524, + "time_per_iteration": 2.439711332321167 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.0251627, + "balance_loss_mlp": 1.04402184, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6540586406432352, + "language_loss": 0.8307848, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85240501, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.79296875, + "step": 5525, + "time_per_iteration": 2.4927310943603516 + }, + { + "auxiliary_loss_clip": 0.01044531, + "auxiliary_loss_mlp": 0.01006175, + "balance_loss_clip": 1.00405347, + "balance_loss_mlp": 1.01804781, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7716933739699889, + "language_loss": 0.5260945, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54660153, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.265625, + "step": 5526, + "time_per_iteration": 3.0598835945129395 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.02724671, + "balance_loss_mlp": 1.04371929, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.1037159361855737, + "language_loss": 0.77490491, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79659784, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 5527, + "time_per_iteration": 2.4878480434417725 + }, + { + "auxiliary_loss_clip": 0.01126946, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.03025246, + "balance_loss_mlp": 1.04651201, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 2.9813221594214494, + "language_loss": 0.72143763, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74314719, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5528, + "time_per_iteration": 2.4562795162200928 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02256346, + "balance_loss_mlp": 1.04463542, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.7054310511699202, + "language_loss": 0.82638806, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5529, + "time_per_iteration": 2.474243640899658 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.02223659, + "balance_loss_mlp": 1.04554248, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.9738718949190572, + "language_loss": 0.69718957, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71884924, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83984375, + "step": 5530, + "time_per_iteration": 2.471686840057373 + }, + { + "auxiliary_loss_clip": 0.01127236, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02996182, + "balance_loss_mlp": 1.04500127, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.4616968900166643, + "language_loss": 0.7616601, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78338665, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5531, + "time_per_iteration": 2.473328113555908 + }, + { + "auxiliary_loss_clip": 0.01128043, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.04481292, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7553607817915955, + "language_loss": 0.73413068, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75578588, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5532, + "time_per_iteration": 2.4864931106567383 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.01321709, + "balance_loss_mlp": 1.04721618, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.2280638741168057, + "language_loss": 0.65813714, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67969465, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8203125, + "step": 5533, + "time_per_iteration": 2.5232229232788086 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.02541876, + "balance_loss_mlp": 1.04451632, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.9248590192503388, + "language_loss": 0.70790148, + "learning_rate": 3.113264663362451e-06, + "loss": 0.72957367, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5534, + "time_per_iteration": 2.418875217437744 + }, + { + "auxiliary_loss_clip": 0.01125629, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01890588, + "balance_loss_mlp": 1.04565191, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.8142926842561948, + "language_loss": 0.6684956, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69008601, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5535, + "time_per_iteration": 2.5031726360321045 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02450383, + "balance_loss_mlp": 1.04416704, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.1308907042960525, + "language_loss": 0.72915065, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75080466, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5536, + "time_per_iteration": 2.494007110595703 + }, + { + "auxiliary_loss_clip": 0.01126771, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02474046, + "balance_loss_mlp": 1.0450089, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.6653416647198893, + "language_loss": 0.81801486, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83966869, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5537, + "time_per_iteration": 2.611788272857666 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.02805638, + "balance_loss_mlp": 1.04771638, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.938500745409862, + "language_loss": 0.71606827, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73780894, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83984375, + "step": 5538, + "time_per_iteration": 2.538574695587158 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01642489, + "balance_loss_mlp": 1.04461074, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.0173985756025417, + "language_loss": 0.7442342, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76578778, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8046875, + "step": 5539, + "time_per_iteration": 2.539393424987793 + }, + { + "auxiliary_loss_clip": 0.01132315, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.03062367, + "balance_loss_mlp": 1.04543138, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.8798801752229715, + "language_loss": 0.70726681, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.72904468, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5540, + "time_per_iteration": 2.460745096206665 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.02156138, + "balance_loss_mlp": 1.04151917, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.212860979219503, + "language_loss": 0.60678709, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62837738, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5541, + "time_per_iteration": 2.643308162689209 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.0256207, + "balance_loss_mlp": 1.04428339, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.7250198470895146, + "language_loss": 0.68636936, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70806885, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 5542, + "time_per_iteration": 2.472029209136963 + }, + { + "auxiliary_loss_clip": 0.0112742, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02362895, + "balance_loss_mlp": 1.04488277, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6472310915335262, + "language_loss": 0.75526464, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77691472, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5543, + "time_per_iteration": 2.453550100326538 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02342415, + "balance_loss_mlp": 1.04834402, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.6694578175563026, + "language_loss": 0.75282717, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77452493, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5544, + "time_per_iteration": 2.486992835998535 + }, + { + "auxiliary_loss_clip": 0.01124934, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01910329, + "balance_loss_mlp": 1.04350412, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.4864809930890506, + "language_loss": 0.70886022, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73044181, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5545, + "time_per_iteration": 2.5813279151916504 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.02333164, + "balance_loss_mlp": 1.04530168, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.7150542013191912, + "language_loss": 0.69300294, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7146256, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5546, + "time_per_iteration": 2.4564788341522217 + }, + { + "auxiliary_loss_clip": 0.01129119, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04343665, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.6632006519185205, + "language_loss": 0.64804697, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66971648, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5547, + "time_per_iteration": 2.554959774017334 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.01782918, + "balance_loss_mlp": 1.0467664, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.454082693277369, + "language_loss": 0.856148, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87773478, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.8125, + "step": 5548, + "time_per_iteration": 2.451032876968384 + }, + { + "auxiliary_loss_clip": 0.01129139, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.02100003, + "balance_loss_mlp": 1.04508662, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.024965729715467, + "language_loss": 0.74754196, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76919919, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 5549, + "time_per_iteration": 2.6875991821289062 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.02362955, + "balance_loss_mlp": 1.04486775, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.8150391856089545, + "language_loss": 0.68361247, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70528769, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83984375, + "step": 5550, + "time_per_iteration": 2.640758752822876 + }, + { + "auxiliary_loss_clip": 0.0112866, + "auxiliary_loss_mlp": 0.01039899, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.04545677, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.742869766825136, + "language_loss": 0.60666394, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62834954, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.83203125, + "step": 5551, + "time_per_iteration": 2.454871654510498 + }, + { + "auxiliary_loss_clip": 0.01127389, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.0459497, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6119589143573256, + "language_loss": 0.70450759, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72618788, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5552, + "time_per_iteration": 2.4226949214935303 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02297902, + "balance_loss_mlp": 1.04462051, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.0022942324560145, + "language_loss": 0.8289907, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85063589, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.828125, + "step": 5553, + "time_per_iteration": 3.8951358795166016 + }, + { + "auxiliary_loss_clip": 0.01128647, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.04528964, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.095475541363027, + "language_loss": 0.81220448, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83385921, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83203125, + "step": 5554, + "time_per_iteration": 3.8097896575927734 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.02811968, + "balance_loss_mlp": 1.0457983, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4459560856203526, + "language_loss": 0.81277251, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83448291, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5555, + "time_per_iteration": 2.51686954498291 + }, + { + "auxiliary_loss_clip": 0.01126865, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02469552, + "balance_loss_mlp": 1.04441357, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.713035899616047, + "language_loss": 0.74563497, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76728898, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.82421875, + "step": 5556, + "time_per_iteration": 2.550630807876587 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.02459431, + "balance_loss_mlp": 1.04586554, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.4096864083862861, + "language_loss": 0.82588691, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84755093, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5557, + "time_per_iteration": 2.498108148574829 + }, + { + "auxiliary_loss_clip": 0.01129625, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.03055513, + "balance_loss_mlp": 1.04486346, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.7414701325609587, + "language_loss": 0.80056083, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82230997, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84765625, + "step": 5558, + "time_per_iteration": 2.5519607067108154 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02159929, + "balance_loss_mlp": 1.04537535, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.595273660638049, + "language_loss": 0.81953323, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84117764, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.84375, + "step": 5559, + "time_per_iteration": 2.5202248096466064 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02384293, + "balance_loss_mlp": 1.04450536, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.784570608011319, + "language_loss": 0.72027284, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74191785, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5560, + "time_per_iteration": 2.453016757965088 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03118193, + "balance_loss_mlp": 1.04679513, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 2.584817000325422, + "language_loss": 0.74888778, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77068788, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5561, + "time_per_iteration": 2.526980400085449 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02410603, + "balance_loss_mlp": 1.04610825, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.2689753945529176, + "language_loss": 0.69638503, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71806127, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5562, + "time_per_iteration": 2.483530282974243 + }, + { + "auxiliary_loss_clip": 0.01127212, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.02821374, + "balance_loss_mlp": 1.04549575, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.5595683236821118, + "language_loss": 0.65407914, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67576528, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8203125, + "step": 5563, + "time_per_iteration": 2.489734649658203 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.027843, + "balance_loss_mlp": 1.0464654, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 3.650208894964183, + "language_loss": 0.74457055, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5564, + "time_per_iteration": 2.7312686443328857 + }, + { + "auxiliary_loss_clip": 0.01049511, + "auxiliary_loss_mlp": 0.00999253, + "balance_loss_clip": 0.99735802, + "balance_loss_mlp": 1.02280784, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7800603717209338, + "language_loss": 0.55489159, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57537925, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.265625, + "step": 5565, + "time_per_iteration": 3.0266246795654297 + }, + { + "auxiliary_loss_clip": 0.01126829, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02271366, + "balance_loss_mlp": 1.04589689, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7346222757402157, + "language_loss": 0.64754677, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66918564, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80859375, + "step": 5566, + "time_per_iteration": 2.5819363594055176 + }, + { + "auxiliary_loss_clip": 0.01128946, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0259037, + "balance_loss_mlp": 1.04706717, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.73011072762743, + "language_loss": 0.77735972, + "learning_rate": 3.102564641030016e-06, + "loss": 0.7990548, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5567, + "time_per_iteration": 2.508108377456665 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.02480745, + "balance_loss_mlp": 1.04583585, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.719738804733239, + "language_loss": 0.76512182, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78683186, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5568, + "time_per_iteration": 2.4344217777252197 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02973104, + "balance_loss_mlp": 1.04528308, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.265483767853782, + "language_loss": 0.71277773, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73452842, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5569, + "time_per_iteration": 2.462592840194702 + }, + { + "auxiliary_loss_clip": 0.0112772, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.04275155, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.859999754882374, + "language_loss": 0.90291858, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9245472, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5570, + "time_per_iteration": 2.432124614715576 + }, + { + "auxiliary_loss_clip": 0.0112712, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01634383, + "balance_loss_mlp": 1.04461455, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.7333982724081918, + "language_loss": 0.80038494, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82196403, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5571, + "time_per_iteration": 2.52752947807312 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.00998336, + "balance_loss_clip": 0.99651235, + "balance_loss_mlp": 1.01880455, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.9063074837999179, + "language_loss": 0.55948162, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5799194, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5572, + "time_per_iteration": 3.0247979164123535 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.03212237, + "balance_loss_mlp": 1.04797339, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.028320341949736, + "language_loss": 0.78112698, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80290151, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5573, + "time_per_iteration": 2.5152878761291504 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_clip": 1.03143215, + "balance_loss_mlp": 1.04525197, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1279768530108503, + "language_loss": 0.72473001, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7465024, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5574, + "time_per_iteration": 2.543531656265259 + }, + { + "auxiliary_loss_clip": 0.01125319, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.017308, + "balance_loss_mlp": 1.04292774, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 2.78085640379241, + "language_loss": 0.87911499, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90068293, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.82421875, + "step": 5575, + "time_per_iteration": 2.546952724456787 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02899647, + "balance_loss_mlp": 1.04479516, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.569353520757799, + "language_loss": 0.82441479, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84619927, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5576, + "time_per_iteration": 2.414294958114624 + }, + { + "auxiliary_loss_clip": 0.01129312, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.0286808, + "balance_loss_mlp": 1.043697, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 3.008815557703919, + "language_loss": 0.73384887, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75559115, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 5577, + "time_per_iteration": 2.50136399269104 + }, + { + "auxiliary_loss_clip": 0.01131921, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.02667177, + "balance_loss_mlp": 1.04811549, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.7225109171896533, + "language_loss": 0.81555498, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8372944, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5578, + "time_per_iteration": 2.431365728378296 + }, + { + "auxiliary_loss_clip": 0.01125183, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.02277184, + "balance_loss_mlp": 1.04578936, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.8947087551065327, + "language_loss": 0.71785814, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73948246, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 5579, + "time_per_iteration": 2.4519495964050293 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.02191353, + "balance_loss_mlp": 1.0456152, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.0306401350469225, + "language_loss": 0.81084043, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83252287, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5580, + "time_per_iteration": 2.427481174468994 + }, + { + "auxiliary_loss_clip": 0.01130056, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.04496789, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.8687829543354073, + "language_loss": 0.77912092, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80078757, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5581, + "time_per_iteration": 2.5320229530334473 + }, + { + "auxiliary_loss_clip": 0.01132086, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.02735782, + "balance_loss_mlp": 1.04367673, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 5.02896087449, + "language_loss": 0.74623251, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76800376, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 5582, + "time_per_iteration": 2.421482801437378 + }, + { + "auxiliary_loss_clip": 0.0113015, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.02528524, + "balance_loss_mlp": 1.04456937, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.790512330860928, + "language_loss": 0.82143587, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84315073, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 5583, + "time_per_iteration": 2.4543566703796387 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01044573, + "balance_loss_clip": 1.02991009, + "balance_loss_mlp": 1.04491317, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9267692381394996, + "language_loss": 0.7779209, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79964256, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5584, + "time_per_iteration": 2.6100947856903076 + }, + { + "auxiliary_loss_clip": 0.01129164, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02343249, + "balance_loss_mlp": 1.04359186, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.4758908421399493, + "language_loss": 0.75978506, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78145868, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.85546875, + "step": 5585, + "time_per_iteration": 2.4898715019226074 + }, + { + "auxiliary_loss_clip": 0.01121936, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02170694, + "balance_loss_mlp": 1.04066801, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.4987207146888684, + "language_loss": 0.77731383, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79890364, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5586, + "time_per_iteration": 2.4825005531311035 + }, + { + "auxiliary_loss_clip": 0.01136236, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.03070199, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6235624689574053, + "language_loss": 0.81027555, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83212399, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8828125, + "step": 5587, + "time_per_iteration": 2.486459493637085 + }, + { + "auxiliary_loss_clip": 0.01125436, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.0270915, + "balance_loss_mlp": 1.04548144, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7952449023594161, + "language_loss": 0.67014575, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69180894, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 5588, + "time_per_iteration": 2.435070753097534 + }, + { + "auxiliary_loss_clip": 0.01130516, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02784824, + "balance_loss_mlp": 1.04568088, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.6839710852868943, + "language_loss": 0.69882601, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72057241, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5589, + "time_per_iteration": 2.548051118850708 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.02709961, + "balance_loss_mlp": 1.04461861, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 2.1328325025080987, + "language_loss": 0.66886735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69060349, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 5590, + "time_per_iteration": 2.4735047817230225 + }, + { + "auxiliary_loss_clip": 0.01126204, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.02993059, + "balance_loss_mlp": 1.04570127, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.8322479695472769, + "language_loss": 0.73409903, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75581712, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 5591, + "time_per_iteration": 2.4736244678497314 + }, + { + "auxiliary_loss_clip": 0.01127166, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.04408562, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.9183925576882788, + "language_loss": 0.69446647, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.71615982, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5592, + "time_per_iteration": 2.4232676029205322 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.02366149, + "balance_loss_mlp": 1.0442183, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.4700576130478367, + "language_loss": 0.76281321, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78444564, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5593, + "time_per_iteration": 2.4856812953948975 + }, + { + "auxiliary_loss_clip": 0.01128845, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.02703261, + "balance_loss_mlp": 1.04333365, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.2267028217655516, + "language_loss": 0.71435678, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73609149, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8515625, + "step": 5594, + "time_per_iteration": 2.437554359436035 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.02985501, + "balance_loss_mlp": 1.04690135, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.637052204404589, + "language_loss": 0.80350173, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82528448, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5595, + "time_per_iteration": 5.51651668548584 + }, + { + "auxiliary_loss_clip": 0.0112868, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.01964831, + "balance_loss_mlp": 1.04542542, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.8244163047079407, + "language_loss": 0.81611145, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83773112, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5596, + "time_per_iteration": 2.4959781169891357 + }, + { + "auxiliary_loss_clip": 0.01128091, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.02508509, + "balance_loss_mlp": 1.04461718, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.7014468319312177, + "language_loss": 0.76001227, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78168839, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5597, + "time_per_iteration": 2.4965333938598633 + }, + { + "auxiliary_loss_clip": 0.01126223, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.0233258, + "balance_loss_mlp": 1.04597533, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.8007239192940239, + "language_loss": 0.78937811, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81101304, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 5598, + "time_per_iteration": 2.587813377380371 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02036011, + "balance_loss_mlp": 1.04606342, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4664560154247552, + "language_loss": 0.64197004, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66366023, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 5599, + "time_per_iteration": 2.647618293762207 + }, + { + "auxiliary_loss_clip": 0.0113527, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.02685726, + "balance_loss_mlp": 1.0468514, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.652004853610392, + "language_loss": 0.8172245, + "learning_rate": 3.091819088459249e-06, + "loss": 0.83900994, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 5600, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01050016, + "balance_loss_clip": 1.03369582, + "balance_loss_mlp": 1.04399288, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 3.359102963412802, + "language_loss": 0.82717538, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84898043, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 5601, + "time_per_iteration": 2.4369428157806396 + }, + { + "auxiliary_loss_clip": 0.01127768, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.01911497, + "balance_loss_mlp": 1.04890418, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.6511579237160083, + "language_loss": 0.82726496, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.84887075, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5602, + "time_per_iteration": 2.463291645050049 + }, + { + "auxiliary_loss_clip": 0.01130933, + "auxiliary_loss_mlp": 0.01055384, + "balance_loss_clip": 1.04039955, + "balance_loss_mlp": 1.04712546, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.700541242008466, + "language_loss": 0.70208776, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72395098, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5603, + "time_per_iteration": 2.4309756755828857 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.02392292, + "balance_loss_mlp": 1.04724145, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.625433979180813, + "language_loss": 0.82925308, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8509745, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.86328125, + "step": 5604, + "time_per_iteration": 2.4980010986328125 + }, + { + "auxiliary_loss_clip": 0.01129789, + "auxiliary_loss_mlp": 0.01042861, + "balance_loss_clip": 1.02782226, + "balance_loss_mlp": 1.0447166, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 3.2518642032613654, + "language_loss": 0.73756403, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75929046, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 5605, + "time_per_iteration": 2.4563212394714355 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02520752, + "balance_loss_mlp": 1.04604197, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.772980532366942, + "language_loss": 0.83487791, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85660958, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 5606, + "time_per_iteration": 2.456441640853882 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02549469, + "balance_loss_mlp": 1.0414753, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.7790448991820722, + "language_loss": 0.67335433, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69499022, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5607, + "time_per_iteration": 2.4964821338653564 + }, + { + "auxiliary_loss_clip": 0.01130916, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.02694631, + "balance_loss_mlp": 1.04507923, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.032375572186737, + "language_loss": 0.71093041, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 5608, + "time_per_iteration": 2.5247933864593506 + }, + { + "auxiliary_loss_clip": 0.01128549, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0236311, + "balance_loss_mlp": 1.0446682, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.8968208773724307, + "language_loss": 0.79062563, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83984375, + "step": 5609, + "time_per_iteration": 2.439502477645874 + }, + { + "auxiliary_loss_clip": 0.01129667, + "auxiliary_loss_mlp": 0.01042877, + "balance_loss_clip": 1.02706969, + "balance_loss_mlp": 1.04544735, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 2.0456898754189354, + "language_loss": 0.82218611, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84391159, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5610, + "time_per_iteration": 2.502028226852417 + }, + { + "auxiliary_loss_clip": 0.01123686, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.04264688, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8264685829582996, + "language_loss": 0.81998217, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84162486, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5611, + "time_per_iteration": 2.4255177974700928 + }, + { + "auxiliary_loss_clip": 0.01130986, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02728975, + "balance_loss_mlp": 1.04550552, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.5753494383615703, + "language_loss": 0.79407716, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81583023, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5612, + "time_per_iteration": 2.537048578262329 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.02212596, + "balance_loss_mlp": 1.04021907, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.519050824799004, + "language_loss": 0.70024467, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72185683, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5613, + "time_per_iteration": 2.570373773574829 + }, + { + "auxiliary_loss_clip": 0.01129945, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.02203548, + "balance_loss_mlp": 1.04490113, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.6646408753448763, + "language_loss": 0.79615057, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81782216, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5614, + "time_per_iteration": 2.4379053115844727 + }, + { + "auxiliary_loss_clip": 0.01126744, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02331161, + "balance_loss_mlp": 1.04260945, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.8534958586083128, + "language_loss": 0.90879035, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93045861, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5615, + "time_per_iteration": 2.4876632690429688 + }, + { + "auxiliary_loss_clip": 0.0112252, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.04105914, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6970154369052728, + "language_loss": 0.80636102, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82798827, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5616, + "time_per_iteration": 2.476569175720215 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04379678, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.5053489219363754, + "language_loss": 0.84079826, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86255258, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 5617, + "time_per_iteration": 2.4204065799713135 + }, + { + "auxiliary_loss_clip": 0.01125211, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.02190411, + "balance_loss_mlp": 1.04171932, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.648273719366553, + "language_loss": 0.80173457, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82335079, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5618, + "time_per_iteration": 2.4789302349090576 + }, + { + "auxiliary_loss_clip": 0.01128326, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.01895535, + "balance_loss_mlp": 1.04367077, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9548255306646998, + "language_loss": 0.70458674, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72621119, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5619, + "time_per_iteration": 2.4674489498138428 + }, + { + "auxiliary_loss_clip": 0.01127452, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_clip": 1.0322814, + "balance_loss_mlp": 1.04403424, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 5.009208052913787, + "language_loss": 0.69223797, + "learning_rate": 3.085284660993821e-06, + "loss": 0.7139833, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5620, + "time_per_iteration": 2.475889205932617 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01046185, + "balance_loss_clip": 1.03159392, + "balance_loss_mlp": 1.04497766, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.0914960236262075, + "language_loss": 0.67498147, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69671446, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5621, + "time_per_iteration": 2.4732306003570557 + }, + { + "auxiliary_loss_clip": 0.01124388, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.02258897, + "balance_loss_mlp": 1.04336381, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.811430245584347, + "language_loss": 0.82714671, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84875631, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 5622, + "time_per_iteration": 2.5028531551361084 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.0279355, + "balance_loss_mlp": 1.04111528, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4271980952069887, + "language_loss": 0.73785996, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75950313, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5623, + "time_per_iteration": 2.483354091644287 + }, + { + "auxiliary_loss_clip": 0.01044412, + "auxiliary_loss_mlp": 0.01001556, + "balance_loss_clip": 0.99976796, + "balance_loss_mlp": 1.01787817, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7308868621653948, + "language_loss": 0.54898107, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56944072, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.265625, + "step": 5624, + "time_per_iteration": 3.2154293060302734 + }, + { + "auxiliary_loss_clip": 0.01128701, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.02536166, + "balance_loss_mlp": 1.04464245, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.114382300094, + "language_loss": 0.73013008, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75182486, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5625, + "time_per_iteration": 2.4632089138031006 + }, + { + "auxiliary_loss_clip": 0.01129587, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02315879, + "balance_loss_mlp": 1.04408085, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.7442247016960708, + "language_loss": 0.70501375, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72669238, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5626, + "time_per_iteration": 2.4782652854919434 + }, + { + "auxiliary_loss_clip": 0.01123049, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.04265583, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.496721640957227, + "language_loss": 0.81184483, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83341312, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5627, + "time_per_iteration": 2.48683762550354 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.02332532, + "balance_loss_mlp": 1.04643917, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.112092075284961, + "language_loss": 0.80725849, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82897604, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5628, + "time_per_iteration": 2.485978841781616 + }, + { + "auxiliary_loss_clip": 0.01125942, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.01716328, + "balance_loss_mlp": 1.04272234, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9378827683544937, + "language_loss": 0.77360773, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79518872, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 5629, + "time_per_iteration": 2.459749937057495 + }, + { + "auxiliary_loss_clip": 0.0112767, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02604353, + "balance_loss_mlp": 1.0426172, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.750727836719773, + "language_loss": 0.84873146, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87043452, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.84765625, + "step": 5630, + "time_per_iteration": 2.502168655395508 + }, + { + "auxiliary_loss_clip": 0.01128287, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02593017, + "balance_loss_mlp": 1.04496086, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.44277401951878, + "language_loss": 0.71778762, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73946661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5631, + "time_per_iteration": 2.4541988372802734 + }, + { + "auxiliary_loss_clip": 0.01044995, + "auxiliary_loss_mlp": 0.01006836, + "balance_loss_clip": 1.0050118, + "balance_loss_mlp": 1.01844144, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.84858361279948, + "language_loss": 0.56171906, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58223736, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5632, + "time_per_iteration": 3.130112409591675 + }, + { + "auxiliary_loss_clip": 0.01126092, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01925933, + "balance_loss_mlp": 1.04301071, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.4746675536042473, + "language_loss": 0.80288029, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82448882, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5633, + "time_per_iteration": 2.4772210121154785 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01671278, + "balance_loss_mlp": 1.04355168, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.3860801146544692, + "language_loss": 0.59222949, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61380345, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5634, + "time_per_iteration": 2.490783214569092 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.01930678, + "balance_loss_mlp": 1.04328096, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.758176339753219, + "language_loss": 0.92591304, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94749641, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5635, + "time_per_iteration": 2.4895272254943848 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.01594758, + "balance_loss_mlp": 1.04428411, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7397877385381144, + "language_loss": 0.74791968, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.76945299, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5636, + "time_per_iteration": 2.4868686199188232 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.02264357, + "balance_loss_mlp": 1.04291928, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.533650755617547, + "language_loss": 0.83216572, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85377115, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5637, + "time_per_iteration": 5.43249249458313 + }, + { + "auxiliary_loss_clip": 0.0112926, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.02837586, + "balance_loss_mlp": 1.04624391, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.6200031021198193, + "language_loss": 0.70037901, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72211778, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5638, + "time_per_iteration": 2.430814504623413 + }, + { + "auxiliary_loss_clip": 0.01128885, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.02993131, + "balance_loss_mlp": 1.0461942, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.644027939558444, + "language_loss": 0.80699074, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82872897, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5639, + "time_per_iteration": 2.5219810009002686 + }, + { + "auxiliary_loss_clip": 0.01129363, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.02714002, + "balance_loss_mlp": 1.044734, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.006873412015597, + "language_loss": 0.67907631, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70079535, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5640, + "time_per_iteration": 2.4252562522888184 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.02229738, + "balance_loss_mlp": 1.0432744, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.561334672972187, + "language_loss": 0.70158339, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72319156, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5641, + "time_per_iteration": 2.4703073501586914 + }, + { + "auxiliary_loss_clip": 0.01129782, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.02881122, + "balance_loss_mlp": 1.04692698, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7323035027878293, + "language_loss": 0.87336594, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89509839, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5642, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01119376, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.04361117, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.899951429632433, + "language_loss": 0.83783317, + "learning_rate": 3.077749724868924e-06, + "loss": 0.85933256, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 5643, + "time_per_iteration": 2.454176902770996 + }, + { + "auxiliary_loss_clip": 0.01122874, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02779329, + "balance_loss_mlp": 1.04303253, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6286036888414737, + "language_loss": 0.76940101, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79104799, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5644, + "time_per_iteration": 2.46893048286438 + }, + { + "auxiliary_loss_clip": 0.01124612, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.02898121, + "balance_loss_mlp": 1.04242706, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.638882451456986, + "language_loss": 0.62893367, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65061837, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5645, + "time_per_iteration": 2.4539859294891357 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02033067, + "balance_loss_mlp": 1.04122853, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.1237754414429637, + "language_loss": 0.76276195, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78429914, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5646, + "time_per_iteration": 2.4913554191589355 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.04360342, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 1.9547585113359744, + "language_loss": 0.79175937, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81348741, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.85546875, + "step": 5647, + "time_per_iteration": 2.521603584289551 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.02541864, + "balance_loss_mlp": 1.04706085, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.87789373580567, + "language_loss": 0.77358377, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79527068, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 5648, + "time_per_iteration": 2.4812231063842773 + }, + { + "auxiliary_loss_clip": 0.0104448, + "auxiliary_loss_mlp": 0.01001624, + "balance_loss_clip": 0.99964541, + "balance_loss_mlp": 1.01817107, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7825270224300925, + "language_loss": 0.56261832, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5830794, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.26171875, + "step": 5649, + "time_per_iteration": 3.1050350666046143 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.02804756, + "balance_loss_mlp": 1.0422622, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.5021179324123226, + "language_loss": 0.85269898, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87436557, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5650, + "time_per_iteration": 2.5013816356658936 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.01540327, + "balance_loss_mlp": 1.04317355, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.6954461839420942, + "language_loss": 0.70868433, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73020875, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5651, + "time_per_iteration": 2.579455852508545 + }, + { + "auxiliary_loss_clip": 0.01123721, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.02354813, + "balance_loss_mlp": 1.04347372, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.7042541017727943, + "language_loss": 0.81267643, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83428693, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5652, + "time_per_iteration": 2.4690871238708496 + }, + { + "auxiliary_loss_clip": 0.01128696, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02670693, + "balance_loss_mlp": 1.04464078, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.8642865553854127, + "language_loss": 0.77315342, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79485226, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5653, + "time_per_iteration": 2.4836156368255615 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.02225959, + "balance_loss_mlp": 1.04310441, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 4.3033812467068895, + "language_loss": 0.85072839, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87232912, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5654, + "time_per_iteration": 2.4139702320098877 + }, + { + "auxiliary_loss_clip": 0.01122836, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.02853489, + "balance_loss_mlp": 1.04074049, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.132089356193866, + "language_loss": 0.65128249, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67293918, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5655, + "time_per_iteration": 2.475292444229126 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.02781832, + "balance_loss_mlp": 1.04365194, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4436453355930483, + "language_loss": 0.76766688, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78933358, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5656, + "time_per_iteration": 2.550999879837036 + }, + { + "auxiliary_loss_clip": 0.01130894, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.04413342, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.5863892165941962, + "language_loss": 0.82438695, + "learning_rate": 3.073152647447525e-06, + "loss": 0.84608912, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5657, + "time_per_iteration": 2.4573473930358887 + }, + { + "auxiliary_loss_clip": 0.01122831, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.02675629, + "balance_loss_mlp": 1.04342616, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.6511746791476316, + "language_loss": 0.85153604, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87317222, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 5658, + "time_per_iteration": 2.505319833755493 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.0001955, + "balance_loss_mlp": 1.01611352, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8147477326465351, + "language_loss": 0.60012162, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62056863, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.26171875, + "step": 5659, + "time_per_iteration": 3.024125814437866 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.02190423, + "balance_loss_mlp": 1.04398155, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.936270792227836, + "language_loss": 0.67855251, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70013559, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 5660, + "time_per_iteration": 2.5009706020355225 + }, + { + "auxiliary_loss_clip": 0.01125634, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.0307138, + "balance_loss_mlp": 1.04558277, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.6106101267942714, + "language_loss": 0.67213613, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69384885, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80078125, + "step": 5661, + "time_per_iteration": 2.501034736633301 + }, + { + "auxiliary_loss_clip": 0.01123137, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.0241766, + "balance_loss_mlp": 1.04442382, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.9145784194305409, + "language_loss": 0.78845918, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81006938, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5662, + "time_per_iteration": 2.4689018726348877 + }, + { + "auxiliary_loss_clip": 0.01123734, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.02395773, + "balance_loss_mlp": 1.04277706, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.9415115692891318, + "language_loss": 0.73675144, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75838, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5663, + "time_per_iteration": 2.4802587032318115 + }, + { + "auxiliary_loss_clip": 0.01121343, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.02101541, + "balance_loss_mlp": 1.04342198, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 2.0753473798431608, + "language_loss": 0.85900557, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88056058, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.77734375, + "step": 5664, + "time_per_iteration": 2.46343731880188 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.0459125, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.782528704092853, + "language_loss": 0.69047546, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71208799, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.81640625, + "step": 5665, + "time_per_iteration": 2.4448721408843994 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.02006817, + "balance_loss_mlp": 1.04218054, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.296518315240935, + "language_loss": 0.72806692, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.74966413, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8203125, + "step": 5666, + "time_per_iteration": 2.4749717712402344 + }, + { + "auxiliary_loss_clip": 0.01126484, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.02236485, + "balance_loss_mlp": 1.04428983, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5083890198292058, + "language_loss": 0.73306108, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75469005, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5667, + "time_per_iteration": 2.467684030532837 + }, + { + "auxiliary_loss_clip": 0.0104148, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01518095, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8424548288565059, + "language_loss": 0.6331358, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65357018, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.26367188, + "step": 5668, + "time_per_iteration": 3.233991861343384 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.02460372, + "balance_loss_mlp": 1.04407477, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.1457172939364892, + "language_loss": 0.72030753, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74194676, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 5669, + "time_per_iteration": 2.4226186275482178 + }, + { + "auxiliary_loss_clip": 0.01127607, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02078128, + "balance_loss_mlp": 1.04468203, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9050671295461388, + "language_loss": 0.80285168, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82448041, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5670, + "time_per_iteration": 2.4354984760284424 + }, + { + "auxiliary_loss_clip": 0.01127772, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02122176, + "balance_loss_mlp": 1.04374027, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.5994061750955757, + "language_loss": 0.76886785, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79050225, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5671, + "time_per_iteration": 2.4775397777557373 + }, + { + "auxiliary_loss_clip": 0.01125342, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.02513266, + "balance_loss_mlp": 1.04437792, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.9602332848552635, + "language_loss": 0.74416959, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7658239, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5672, + "time_per_iteration": 2.5027272701263428 + }, + { + "auxiliary_loss_clip": 0.01127201, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.04523087, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.991076139860355, + "language_loss": 0.73781157, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75940639, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.8203125, + "step": 5673, + "time_per_iteration": 2.424955368041992 + }, + { + "auxiliary_loss_clip": 0.01123926, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.02243853, + "balance_loss_mlp": 1.04432535, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.774655206888726, + "language_loss": 0.79900169, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8206054, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5674, + "time_per_iteration": 2.490407705307007 + }, + { + "auxiliary_loss_clip": 0.01041345, + "auxiliary_loss_mlp": 0.01001058, + "balance_loss_clip": 0.99942493, + "balance_loss_mlp": 1.01517344, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7963469989165133, + "language_loss": 0.56096685, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58139086, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 5675, + "time_per_iteration": 3.223119020462036 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.0270282, + "balance_loss_mlp": 1.04428756, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6179892480447855, + "language_loss": 0.79029286, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81193566, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5676, + "time_per_iteration": 2.4798848628997803 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01836538, + "balance_loss_mlp": 1.0424788, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8072554320592242, + "language_loss": 0.85598934, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87755597, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5677, + "time_per_iteration": 2.4501733779907227 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.009404852791833, + "language_loss": 0.79283166, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81447315, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5678, + "time_per_iteration": 4.054651260375977 + }, + { + "auxiliary_loss_clip": 0.01123013, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.01983547, + "balance_loss_mlp": 1.04135132, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.8818653655236122, + "language_loss": 0.74546856, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76703185, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.81640625, + "step": 5679, + "time_per_iteration": 3.9024462699890137 + }, + { + "auxiliary_loss_clip": 0.01042201, + "auxiliary_loss_mlp": 0.01001255, + "balance_loss_clip": 0.99970549, + "balance_loss_mlp": 1.01624846, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7519133883291979, + "language_loss": 0.59481025, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61524487, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.25976562, + "step": 5680, + "time_per_iteration": 3.152480125427246 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.01953864, + "balance_loss_mlp": 1.04320455, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 2.208026502208574, + "language_loss": 0.7233687, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74491525, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5681, + "time_per_iteration": 2.4450337886810303 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.02798879, + "balance_loss_mlp": 1.04110432, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.0075854608407058, + "language_loss": 0.7144351, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7360431, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5682, + "time_per_iteration": 2.53000807762146 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.02446055, + "balance_loss_mlp": 1.04079127, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.4570201559150766, + "language_loss": 0.8396616, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86125666, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5683, + "time_per_iteration": 2.511646270751953 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.03189898, + "balance_loss_mlp": 1.04384482, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.5567263249521965, + "language_loss": 0.70622635, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72796011, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.82421875, + "step": 5684, + "time_per_iteration": 2.58811616897583 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02119195, + "balance_loss_mlp": 1.0428822, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.480860615854928, + "language_loss": 0.75386423, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77541268, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.78125, + "step": 5685, + "time_per_iteration": 2.485405445098877 + }, + { + "auxiliary_loss_clip": 0.01120925, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.02399325, + "balance_loss_mlp": 1.04268134, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.6707381387615057, + "language_loss": 0.70186603, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72344351, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.78125, + "step": 5686, + "time_per_iteration": 2.5536224842071533 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.02681327, + "balance_loss_mlp": 1.04087019, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.6880234800017844, + "language_loss": 0.77629769, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79793721, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5687, + "time_per_iteration": 2.4526383876800537 + }, + { + "auxiliary_loss_clip": 0.01122013, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.01947296, + "balance_loss_mlp": 1.04425466, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.7522626505921908, + "language_loss": 0.86505169, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88661563, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 5688, + "time_per_iteration": 2.457873821258545 + }, + { + "auxiliary_loss_clip": 0.01129554, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.02274323, + "balance_loss_mlp": 1.04438853, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6277101200549902, + "language_loss": 0.79875666, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82043588, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5689, + "time_per_iteration": 2.4494895935058594 + }, + { + "auxiliary_loss_clip": 0.01124588, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.04300821, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.0745412821804057, + "language_loss": 0.7351048, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75673485, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5690, + "time_per_iteration": 2.448133945465088 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02270865, + "balance_loss_mlp": 1.03998768, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.433761635396741, + "language_loss": 0.7631194, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78468573, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8046875, + "step": 5691, + "time_per_iteration": 2.479569435119629 + }, + { + "auxiliary_loss_clip": 0.01120907, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.02782106, + "balance_loss_mlp": 1.0415988, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5387604656502187, + "language_loss": 0.68159282, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70321631, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 5692, + "time_per_iteration": 2.490466356277466 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01042741, + "balance_loss_clip": 1.02682638, + "balance_loss_mlp": 1.04275179, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.6924087388900606, + "language_loss": 0.72292894, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74460298, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5693, + "time_per_iteration": 2.451026439666748 + }, + { + "auxiliary_loss_clip": 0.01122133, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.02408743, + "balance_loss_mlp": 1.0417974, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7157866574439644, + "language_loss": 0.75877678, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.78037089, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8046875, + "step": 5694, + "time_per_iteration": 2.499997615814209 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.02320051, + "balance_loss_mlp": 1.04253125, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.9697512050835562, + "language_loss": 0.79815507, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81972229, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 5695, + "time_per_iteration": 2.4279983043670654 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.0248661, + "balance_loss_mlp": 1.04168487, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.7314755849975545, + "language_loss": 0.73487073, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75648957, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5696, + "time_per_iteration": 2.507782459259033 + }, + { + "auxiliary_loss_clip": 0.01122963, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.03301835, + "balance_loss_mlp": 1.0419805, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6676891559017708, + "language_loss": 0.70874155, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73044771, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5697, + "time_per_iteration": 2.4868175983428955 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01949954, + "balance_loss_mlp": 1.04456246, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.6712097888676536, + "language_loss": 0.81875223, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84031999, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 5698, + "time_per_iteration": 2.500499725341797 + }, + { + "auxiliary_loss_clip": 0.01121288, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02712393, + "balance_loss_mlp": 1.03982306, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 1.9988541020523172, + "language_loss": 0.69163442, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71328437, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8125, + "step": 5699, + "time_per_iteration": 2.4522063732147217 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.0424068, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.0139701241951196, + "language_loss": 0.72246462, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74404591, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5700, + "time_per_iteration": 2.4942879676818848 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02018046, + "balance_loss_mlp": 1.04403377, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.026861038115517, + "language_loss": 0.81818259, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83976114, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5701, + "time_per_iteration": 2.4650135040283203 + }, + { + "auxiliary_loss_clip": 0.01124816, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.01886129, + "balance_loss_mlp": 1.04328442, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.616013756330385, + "language_loss": 0.71818215, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73975766, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5702, + "time_per_iteration": 2.446018695831299 + }, + { + "auxiliary_loss_clip": 0.01038258, + "auxiliary_loss_mlp": 0.01007974, + "balance_loss_clip": 1.00623345, + "balance_loss_mlp": 1.01261425, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.78067456401119, + "language_loss": 0.57387871, + "learning_rate": 3.057991990435309e-06, + "loss": 0.5943411, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.2578125, + "step": 5703, + "time_per_iteration": 2.9596943855285645 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.04436553, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.8868866692845514, + "language_loss": 0.74849427, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77017069, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5704, + "time_per_iteration": 2.475206136703491 + }, + { + "auxiliary_loss_clip": 0.01122188, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.0218513, + "balance_loss_mlp": 1.0432725, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.4058395538044572, + "language_loss": 0.73303944, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75461364, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5705, + "time_per_iteration": 2.435140609741211 + }, + { + "auxiliary_loss_clip": 0.0112299, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.0204711, + "balance_loss_mlp": 1.04320812, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 3.54760070735666, + "language_loss": 0.79599071, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81757367, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5706, + "time_per_iteration": 2.4922068119049072 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.04497337, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 1.9921713202453553, + "language_loss": 0.83170593, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85330999, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5707, + "time_per_iteration": 2.441812753677368 + }, + { + "auxiliary_loss_clip": 0.01126551, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.01919019, + "balance_loss_mlp": 1.04623604, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5424527465289883, + "language_loss": 0.75429368, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77589571, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5708, + "time_per_iteration": 2.448415756225586 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02046943, + "balance_loss_mlp": 1.04284358, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6552343197625845, + "language_loss": 0.81159383, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83314145, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 5709, + "time_per_iteration": 2.488879919052124 + }, + { + "auxiliary_loss_clip": 0.01125291, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.0235213, + "balance_loss_mlp": 1.04413152, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.1306910299424677, + "language_loss": 0.79152101, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81316978, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5710, + "time_per_iteration": 2.487224817276001 + }, + { + "auxiliary_loss_clip": 0.01124884, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0209142, + "balance_loss_mlp": 1.04181814, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.821164645381994, + "language_loss": 0.69994622, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72155762, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5711, + "time_per_iteration": 2.471989631652832 + }, + { + "auxiliary_loss_clip": 0.01123068, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02455926, + "balance_loss_mlp": 1.04235482, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.7360043656013842, + "language_loss": 0.68002397, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70164913, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 5712, + "time_per_iteration": 2.440960168838501 + }, + { + "auxiliary_loss_clip": 0.01036814, + "auxiliary_loss_mlp": 0.01007067, + "balance_loss_clip": 1.00550556, + "balance_loss_mlp": 1.011006, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8415582534154722, + "language_loss": 0.58101094, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60144973, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.2578125, + "step": 5713, + "time_per_iteration": 3.018573045730591 + }, + { + "auxiliary_loss_clip": 0.01122962, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.04283524, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6636797952259372, + "language_loss": 0.80745685, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82911092, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5714, + "time_per_iteration": 2.4916322231292725 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02519822, + "balance_loss_mlp": 1.04508591, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.759201097406795, + "language_loss": 0.71844554, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7401129, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5715, + "time_per_iteration": 2.468292474746704 + }, + { + "auxiliary_loss_clip": 0.01036063, + "auxiliary_loss_mlp": 0.01006756, + "balance_loss_clip": 1.00499201, + "balance_loss_mlp": 1.01020741, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8941035310387452, + "language_loss": 0.65942305, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67985129, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 5716, + "time_per_iteration": 3.101933717727661 + }, + { + "auxiliary_loss_clip": 0.0112152, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.02705014, + "balance_loss_mlp": 1.04254961, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.0405702698755657, + "language_loss": 0.74612904, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76775646, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5717, + "time_per_iteration": 2.426793098449707 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01042851, + "balance_loss_clip": 1.02894473, + "balance_loss_mlp": 1.0413748, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.6999619338826393, + "language_loss": 0.7507081, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77236706, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5718, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02563679, + "balance_loss_mlp": 1.04245746, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9991347741656986, + "language_loss": 0.63971305, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66137218, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5719, + "time_per_iteration": 2.5236892700195312 + }, + { + "auxiliary_loss_clip": 0.01124826, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.0418756, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.111950804429908, + "language_loss": 0.73612356, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75775748, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 5720, + "time_per_iteration": 5.3536376953125 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02520978, + "balance_loss_mlp": 1.04300022, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.805745396214866, + "language_loss": 0.74198145, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76362252, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5721, + "time_per_iteration": 2.4301607608795166 + }, + { + "auxiliary_loss_clip": 0.01126876, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.0286088, + "balance_loss_mlp": 1.04481733, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 3.5063882769532313, + "language_loss": 0.80132651, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82303661, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5722, + "time_per_iteration": 2.411731243133545 + }, + { + "auxiliary_loss_clip": 0.01122709, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01809728, + "balance_loss_mlp": 1.04312289, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5863267197766868, + "language_loss": 0.8194539, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84100199, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5723, + "time_per_iteration": 2.476672410964966 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.02559686, + "balance_loss_mlp": 1.0428493, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.852885568649272, + "language_loss": 0.8147676, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83640903, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5724, + "time_per_iteration": 2.4115889072418213 + }, + { + "auxiliary_loss_clip": 0.01125316, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.0210526, + "balance_loss_mlp": 1.04397368, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.759268883551978, + "language_loss": 0.6919744, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71358848, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5725, + "time_per_iteration": 2.589571714401245 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.02578139, + "balance_loss_mlp": 1.04464412, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4578739764018875, + "language_loss": 0.69519544, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71692783, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5726, + "time_per_iteration": 2.4600956439971924 + }, + { + "auxiliary_loss_clip": 0.01123936, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.02222002, + "balance_loss_mlp": 1.0427928, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.656148044371735, + "language_loss": 0.73426235, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.7558654, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5727, + "time_per_iteration": 2.5102531909942627 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02636731, + "balance_loss_mlp": 1.04398954, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.8280399137078096, + "language_loss": 0.87897557, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90064341, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5728, + "time_per_iteration": 2.4304542541503906 + }, + { + "auxiliary_loss_clip": 0.01122947, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.03106284, + "balance_loss_mlp": 1.04264569, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.0505664478102426, + "language_loss": 0.70451075, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72619152, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5729, + "time_per_iteration": 2.4979374408721924 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01952362, + "balance_loss_mlp": 1.0427525, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.7284434335955414, + "language_loss": 0.73995942, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7615242, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5730, + "time_per_iteration": 2.4471776485443115 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.02693152, + "balance_loss_mlp": 1.04263377, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.104777326243209, + "language_loss": 0.80005515, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82170659, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5731, + "time_per_iteration": 2.454735279083252 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.02953923, + "balance_loss_mlp": 1.04394484, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.039149215632527, + "language_loss": 0.78837991, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81006193, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 5732, + "time_per_iteration": 2.4177064895629883 + }, + { + "auxiliary_loss_clip": 0.01043649, + "auxiliary_loss_mlp": 0.01003776, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.01788378, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7440231134556253, + "language_loss": 0.53498071, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55545497, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.2578125, + "step": 5733, + "time_per_iteration": 3.0976667404174805 + }, + { + "auxiliary_loss_clip": 0.0112691, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.02806389, + "balance_loss_mlp": 1.04630947, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.6025085195413686, + "language_loss": 0.83345532, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85515279, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5734, + "time_per_iteration": 2.462327718734741 + }, + { + "auxiliary_loss_clip": 0.01125766, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.04382658, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.7233898634254525, + "language_loss": 0.9245038, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94610149, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5735, + "time_per_iteration": 2.600933790206909 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.02607846, + "balance_loss_mlp": 1.04662871, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.628548106881684, + "language_loss": 0.76666284, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78837371, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5736, + "time_per_iteration": 2.4607973098754883 + }, + { + "auxiliary_loss_clip": 0.0113014, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.02948046, + "balance_loss_mlp": 1.04773998, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.59823002014571, + "language_loss": 0.78745639, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80919576, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5737, + "time_per_iteration": 2.5059142112731934 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.04445243, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.0456946138928767, + "language_loss": 0.71714234, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73884267, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5738, + "time_per_iteration": 2.4374310970306396 + }, + { + "auxiliary_loss_clip": 0.01129235, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.02727044, + "balance_loss_mlp": 1.04496205, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.8999072115309161, + "language_loss": 0.81518626, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83690214, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5739, + "time_per_iteration": 2.559990406036377 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.04620492, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.6856273454827275, + "language_loss": 0.8322401, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85397214, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5740, + "time_per_iteration": 2.4684722423553467 + }, + { + "auxiliary_loss_clip": 0.01127563, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04611385, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.03424253553345, + "language_loss": 0.77135098, + "learning_rate": 3.045403886269181e-06, + "loss": 0.7930122, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8125, + "step": 5741, + "time_per_iteration": 2.48624587059021 + }, + { + "auxiliary_loss_clip": 0.01125981, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.02226019, + "balance_loss_mlp": 1.04276562, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.4993687582247586, + "language_loss": 0.77224493, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79387349, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 5742, + "time_per_iteration": 2.5046300888061523 + }, + { + "auxiliary_loss_clip": 0.01123657, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04310095, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.823337430242114, + "language_loss": 0.76346177, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78509557, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5743, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.01124183, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.02278566, + "balance_loss_mlp": 1.04435802, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.5691807126711539, + "language_loss": 0.70255435, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72416371, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5744, + "time_per_iteration": 2.497314929962158 + }, + { + "auxiliary_loss_clip": 0.01121947, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.02205133, + "balance_loss_mlp": 1.04318309, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.629619176768893, + "language_loss": 0.79692256, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81850678, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 5745, + "time_per_iteration": 2.5154099464416504 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.04556072, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.7858540966841563, + "language_loss": 0.88775939, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9094578, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5746, + "time_per_iteration": 2.436028003692627 + }, + { + "auxiliary_loss_clip": 0.01129654, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.02438855, + "balance_loss_mlp": 1.04509354, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8755596522528313, + "language_loss": 0.64010286, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66179693, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 5747, + "time_per_iteration": 2.465817451477051 + }, + { + "auxiliary_loss_clip": 0.0112633, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02193761, + "balance_loss_mlp": 1.04486203, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5413680181151455, + "language_loss": 0.72813559, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74975884, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5748, + "time_per_iteration": 2.566849946975708 + }, + { + "auxiliary_loss_clip": 0.01123147, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.020715, + "balance_loss_mlp": 1.04517043, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6451707518978071, + "language_loss": 0.75697249, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77854693, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.77734375, + "step": 5749, + "time_per_iteration": 2.5068271160125732 + }, + { + "auxiliary_loss_clip": 0.01036655, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 1.00293088, + "balance_loss_mlp": 1.01066136, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8931526891439046, + "language_loss": 0.62754983, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64796478, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.25976562, + "step": 5750, + "time_per_iteration": 2.930236577987671 + }, + { + "auxiliary_loss_clip": 0.01119501, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.01933062, + "balance_loss_mlp": 1.04268134, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.1199041216122314, + "language_loss": 0.80762947, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82915652, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5751, + "time_per_iteration": 2.4710936546325684 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.03268027, + "balance_loss_mlp": 1.04408574, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 3.882107217624466, + "language_loss": 0.83630323, + "learning_rate": 3.041749247409439e-06, + "loss": 0.85798407, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 5752, + "time_per_iteration": 2.421095132827759 + }, + { + "auxiliary_loss_clip": 0.01036836, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_clip": 1.00014234, + "balance_loss_mlp": 1.01131189, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7425573992046552, + "language_loss": 0.63106978, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.6514585, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.25585938, + "step": 5753, + "time_per_iteration": 2.960430383682251 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.0252701, + "balance_loss_mlp": 1.0433172, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7337780765213762, + "language_loss": 0.70964289, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73127007, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5754, + "time_per_iteration": 2.473090171813965 + }, + { + "auxiliary_loss_clip": 0.01126645, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01889825, + "balance_loss_mlp": 1.04436386, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 3.1958037374869357, + "language_loss": 0.72880316, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75040269, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5755, + "time_per_iteration": 2.486187219619751 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.01985335, + "balance_loss_mlp": 1.04448533, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6620890991055186, + "language_loss": 0.72366977, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74523616, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5756, + "time_per_iteration": 2.6883044242858887 + }, + { + "auxiliary_loss_clip": 0.01036738, + "auxiliary_loss_mlp": 0.01004698, + "balance_loss_clip": 1.00295758, + "balance_loss_mlp": 1.01152658, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7127234008063932, + "language_loss": 0.62522227, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64563662, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25195312, + "step": 5757, + "time_per_iteration": 3.0644619464874268 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01006374, + "balance_loss_clip": 1.00465703, + "balance_loss_mlp": 1.01123941, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8513650993905141, + "language_loss": 0.59153563, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61196613, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.25390625, + "step": 5758, + "time_per_iteration": 3.0601916313171387 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02576697, + "balance_loss_mlp": 1.04562724, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8469236817688628, + "language_loss": 0.71498728, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73664641, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5759, + "time_per_iteration": 2.4722588062286377 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.03079295, + "balance_loss_mlp": 1.04248834, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.8727439754442439, + "language_loss": 0.83008277, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85175675, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 5760, + "time_per_iteration": 2.5002012252807617 + }, + { + "auxiliary_loss_clip": 0.01035648, + "auxiliary_loss_mlp": 0.01005512, + "balance_loss_clip": 1.00358045, + "balance_loss_mlp": 1.01033783, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8745886359800412, + "language_loss": 0.5653646, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58577621, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.25390625, + "step": 5761, + "time_per_iteration": 3.0950896739959717 + }, + { + "auxiliary_loss_clip": 0.01120096, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.04127657, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.0018538772922883, + "language_loss": 0.95053494, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97212291, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 5762, + "time_per_iteration": 5.290884256362915 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.01935804, + "balance_loss_mlp": 1.0417943, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 2.194288284173203, + "language_loss": 0.69335818, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71493888, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5763, + "time_per_iteration": 2.5411787033081055 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.02862906, + "balance_loss_mlp": 1.0458554, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.0099592928074497, + "language_loss": 0.83589876, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85765183, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5764, + "time_per_iteration": 2.48040771484375 + }, + { + "auxiliary_loss_clip": 0.01123556, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04343057, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.159805793212971, + "language_loss": 0.67403859, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69560707, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5765, + "time_per_iteration": 2.502297878265381 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.02291703, + "balance_loss_mlp": 1.04937232, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 2.083918060213648, + "language_loss": 0.77861524, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80028939, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5766, + "time_per_iteration": 2.465325355529785 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.0215292, + "balance_loss_mlp": 1.04335451, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5377908130541305, + "language_loss": 0.73529994, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75687665, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5767, + "time_per_iteration": 2.4656143188476562 + }, + { + "auxiliary_loss_clip": 0.01127128, + "auxiliary_loss_mlp": 0.01040956, + "balance_loss_clip": 1.02538764, + "balance_loss_mlp": 1.04720497, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.233359981487989, + "language_loss": 0.77795279, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79963356, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.796875, + "step": 5768, + "time_per_iteration": 2.4951131343841553 + }, + { + "auxiliary_loss_clip": 0.0103542, + "auxiliary_loss_mlp": 0.01008769, + "balance_loss_clip": 1.00693345, + "balance_loss_mlp": 1.01015306, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7739728920865777, + "language_loss": 0.57404095, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59448284, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.25195312, + "step": 5769, + "time_per_iteration": 3.0867085456848145 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_clip": 1.02577174, + "balance_loss_mlp": 1.04723847, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.3808887206764244, + "language_loss": 0.85625517, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87804437, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 5770, + "time_per_iteration": 2.4296391010284424 + }, + { + "auxiliary_loss_clip": 0.0103532, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00152194, + "balance_loss_mlp": 1.01001954, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7779481231658855, + "language_loss": 0.59827816, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61866474, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 5771, + "time_per_iteration": 2.858952522277832 + }, + { + "auxiliary_loss_clip": 0.0112466, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.04478061, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 2.6949016474557475, + "language_loss": 0.71790159, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73961502, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5772, + "time_per_iteration": 2.629441976547241 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04398608, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4939658014033708, + "language_loss": 0.76165307, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78332114, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5773, + "time_per_iteration": 2.5281848907470703 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_clip": 1.02811444, + "balance_loss_mlp": 1.04447389, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.0748415381607717, + "language_loss": 0.70428938, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72599673, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5774, + "time_per_iteration": 2.4930198192596436 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.02630043, + "balance_loss_mlp": 1.04615033, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.6801460468757594, + "language_loss": 0.76410925, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78576738, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5775, + "time_per_iteration": 2.501793622970581 + }, + { + "auxiliary_loss_clip": 0.01129926, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.02778447, + "balance_loss_mlp": 1.04408336, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.2786937073337956, + "language_loss": 0.78098702, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.8027252, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5776, + "time_per_iteration": 2.547508716583252 + }, + { + "auxiliary_loss_clip": 0.01034004, + "auxiliary_loss_mlp": 0.01012403, + "balance_loss_clip": 1.01073408, + "balance_loss_mlp": 1.00864577, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8366551978688649, + "language_loss": 0.63353252, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65399659, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.25390625, + "step": 5777, + "time_per_iteration": 3.118314743041992 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.03253984, + "balance_loss_mlp": 1.04198289, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.1982821508403956, + "language_loss": 0.64399695, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66572136, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5778, + "time_per_iteration": 2.5438621044158936 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.03947175, + "balance_loss_mlp": 1.04425573, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.7264375706792277, + "language_loss": 0.71190178, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73372632, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5779, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.01128331, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04354596, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.874853063849031, + "language_loss": 0.62552947, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64729369, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5780, + "time_per_iteration": 2.5024712085723877 + }, + { + "auxiliary_loss_clip": 0.01124027, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_clip": 1.0278883, + "balance_loss_mlp": 1.04260445, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.604616792806945, + "language_loss": 0.72373253, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74539268, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5781, + "time_per_iteration": 2.471235513687134 + }, + { + "auxiliary_loss_clip": 0.01125801, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.03416181, + "balance_loss_mlp": 1.04316914, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.0942988164582266, + "language_loss": 0.76741016, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78917271, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.828125, + "step": 5782, + "time_per_iteration": 2.4831414222717285 + }, + { + "auxiliary_loss_clip": 0.01123989, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.02140737, + "balance_loss_mlp": 1.04221606, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9917493867858045, + "language_loss": 0.62131268, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64291537, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5783, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.01119293, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.01833832, + "balance_loss_mlp": 1.0410347, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.6546414102961637, + "language_loss": 0.88575971, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90727258, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 5784, + "time_per_iteration": 2.5281262397766113 + }, + { + "auxiliary_loss_clip": 0.01121731, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.02219379, + "balance_loss_mlp": 1.04283547, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.7834042756277195, + "language_loss": 0.81664282, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83822948, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 5785, + "time_per_iteration": 2.444279432296753 + }, + { + "auxiliary_loss_clip": 0.01126224, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.04558039, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.6236713309130966, + "language_loss": 0.80679643, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82843316, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5786, + "time_per_iteration": 2.506639242172241 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.01860058, + "balance_loss_mlp": 1.0443275, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5789553434659291, + "language_loss": 0.74868137, + "learning_rate": 3.030089132216836e-06, + "loss": 0.77025199, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5787, + "time_per_iteration": 2.4305543899536133 + }, + { + "auxiliary_loss_clip": 0.01122978, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.02276862, + "balance_loss_mlp": 1.04133916, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.685205733624188, + "language_loss": 0.81207466, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83367729, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.81640625, + "step": 5788, + "time_per_iteration": 2.58461332321167 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.02333927, + "balance_loss_mlp": 1.04716599, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.7599288417752579, + "language_loss": 0.85399663, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87569183, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5789, + "time_per_iteration": 2.4460527896881104 + }, + { + "auxiliary_loss_clip": 0.01127788, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.03517616, + "balance_loss_mlp": 1.04420161, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9217222904205502, + "language_loss": 0.84973574, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87151778, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5790, + "time_per_iteration": 2.4690423011779785 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.0295074, + "balance_loss_mlp": 1.04403305, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.0218239222922785, + "language_loss": 0.82098949, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8426879, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5791, + "time_per_iteration": 2.4949092864990234 + }, + { + "auxiliary_loss_clip": 0.01124824, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02386248, + "balance_loss_mlp": 1.04235744, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.7691925727921667, + "language_loss": 0.77531552, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79695195, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5792, + "time_per_iteration": 2.5464468002319336 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.04100966, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5041206153246893, + "language_loss": 0.81592953, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83745086, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5793, + "time_per_iteration": 2.454220771789551 + }, + { + "auxiliary_loss_clip": 0.01126572, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.04426205, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7524057524538565, + "language_loss": 0.76222527, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78395712, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5794, + "time_per_iteration": 2.485077142715454 + }, + { + "auxiliary_loss_clip": 0.01121136, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02165866, + "balance_loss_mlp": 1.04168189, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.2347385462744165, + "language_loss": 0.56926, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59083712, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5795, + "time_per_iteration": 2.4378490447998047 + }, + { + "auxiliary_loss_clip": 0.01121205, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02250659, + "balance_loss_mlp": 1.04285967, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.137832792929428, + "language_loss": 0.82437253, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84595084, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 5796, + "time_per_iteration": 2.5187671184539795 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.0153811, + "balance_loss_mlp": 1.043782, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.7817355656860259, + "language_loss": 0.83580989, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85730731, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5797, + "time_per_iteration": 2.518832206726074 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02017224, + "balance_loss_mlp": 1.04206371, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7199370679887815, + "language_loss": 0.73215538, + "learning_rate": 3.026414616539167e-06, + "loss": 0.7537021, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5798, + "time_per_iteration": 2.499967575073242 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.02660251, + "balance_loss_mlp": 1.04203498, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.0872044860332597, + "language_loss": 0.75936413, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78101552, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5799, + "time_per_iteration": 2.4452474117279053 + }, + { + "auxiliary_loss_clip": 0.01121272, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.0248909, + "balance_loss_mlp": 1.04197407, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7461935027983841, + "language_loss": 0.75557071, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7771703, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.79296875, + "step": 5800, + "time_per_iteration": 2.4526796340942383 + }, + { + "auxiliary_loss_clip": 0.01129939, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.02854276, + "balance_loss_mlp": 1.04578733, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.3150001070935127, + "language_loss": 0.67645729, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69820327, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5801, + "time_per_iteration": 2.644601821899414 + }, + { + "auxiliary_loss_clip": 0.01122812, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04446411, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.9000140831486088, + "language_loss": 0.76785576, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78948951, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78515625, + "step": 5802, + "time_per_iteration": 2.46921968460083 + }, + { + "auxiliary_loss_clip": 0.01118956, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.01905692, + "balance_loss_mlp": 1.04294538, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.750768588632487, + "language_loss": 0.78868455, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81021172, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 5803, + "time_per_iteration": 3.979863405227661 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02266085, + "balance_loss_mlp": 1.0410372, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.9657380954946277, + "language_loss": 0.67745399, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69905275, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8203125, + "step": 5804, + "time_per_iteration": 3.8562989234924316 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_clip": 1.03001559, + "balance_loss_mlp": 1.0454638, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 2.669385195944029, + "language_loss": 0.76021814, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78187871, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 5805, + "time_per_iteration": 2.458235263824463 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.02199244, + "balance_loss_mlp": 1.0451802, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 3.0752866237359884, + "language_loss": 0.67804134, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69965458, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5806, + "time_per_iteration": 2.4840877056121826 + }, + { + "auxiliary_loss_clip": 0.01126527, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.02523851, + "balance_loss_mlp": 1.04571056, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.4876164360326454, + "language_loss": 0.71957624, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74124348, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5807, + "time_per_iteration": 2.542815685272217 + }, + { + "auxiliary_loss_clip": 0.01123687, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02295971, + "balance_loss_mlp": 1.04158592, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.7054576034597768, + "language_loss": 0.74218416, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.7638061, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5808, + "time_per_iteration": 2.503438949584961 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 1.04479396, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.5095416937429198, + "language_loss": 0.84245461, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86416149, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5809, + "time_per_iteration": 2.4860358238220215 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.02354026, + "balance_loss_mlp": 1.04322374, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.8434153763939258, + "language_loss": 0.80251479, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82407832, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 5810, + "time_per_iteration": 2.481653928756714 + }, + { + "auxiliary_loss_clip": 0.01124044, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.04406404, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.967526444092296, + "language_loss": 0.75335366, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77499199, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5811, + "time_per_iteration": 2.534524440765381 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02543986, + "balance_loss_mlp": 1.04616523, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4977831051483896, + "language_loss": 0.80070162, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82238293, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5812, + "time_per_iteration": 2.503074884414673 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02746272, + "balance_loss_mlp": 1.04195547, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.9471141693502576, + "language_loss": 0.6923517, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71401167, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5813, + "time_per_iteration": 2.4503591060638428 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.4036318537481334, + "language_loss": 0.77007949, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79169858, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 5814, + "time_per_iteration": 2.4173405170440674 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.02692485, + "balance_loss_mlp": 1.04406822, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5090517849605465, + "language_loss": 0.84283173, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86451852, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5815, + "time_per_iteration": 2.5173141956329346 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.04368711, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.123091285603595, + "language_loss": 0.77423191, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79580915, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80859375, + "step": 5816, + "time_per_iteration": 2.413438558578491 + }, + { + "auxiliary_loss_clip": 0.01128865, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.02761126, + "balance_loss_mlp": 1.0468061, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.144763996717865, + "language_loss": 0.58441401, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60612863, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8203125, + "step": 5817, + "time_per_iteration": 2.5161447525024414 + }, + { + "auxiliary_loss_clip": 0.01042618, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_clip": 1.00957632, + "balance_loss_mlp": 1.01738954, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8658844915790124, + "language_loss": 0.59855008, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61908889, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25195312, + "step": 5818, + "time_per_iteration": 3.105595111846924 + }, + { + "auxiliary_loss_clip": 0.01123632, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02226782, + "balance_loss_mlp": 1.04561055, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 3.0068929936640103, + "language_loss": 0.83458424, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85618806, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5819, + "time_per_iteration": 2.47537899017334 + }, + { + "auxiliary_loss_clip": 0.01123279, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.01655149, + "balance_loss_mlp": 1.04359841, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 3.6330435008795483, + "language_loss": 0.70765841, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.7291975, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5820, + "time_per_iteration": 2.4817428588867188 + }, + { + "auxiliary_loss_clip": 0.01125706, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04544306, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 2.1579309336976547, + "language_loss": 0.70112801, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7227428, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.80078125, + "step": 5821, + "time_per_iteration": 2.578753709793091 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.02328706, + "balance_loss_mlp": 1.04798198, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.9634934958204076, + "language_loss": 0.73591399, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75762403, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 5822, + "time_per_iteration": 2.469041109085083 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.02150989, + "balance_loss_mlp": 1.0447278, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5203539526389718, + "language_loss": 0.78104019, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80268037, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5823, + "time_per_iteration": 2.4932196140289307 + }, + { + "auxiliary_loss_clip": 0.01038228, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 0.99964237, + "balance_loss_mlp": 1.01332808, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 1.4438996436497689, + "language_loss": 0.59237444, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61277008, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.24902344, + "step": 5824, + "time_per_iteration": 3.109966278076172 + }, + { + "auxiliary_loss_clip": 0.01125511, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.0213685, + "balance_loss_mlp": 1.04462993, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.8425293735622459, + "language_loss": 0.84740114, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86902225, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5825, + "time_per_iteration": 2.4780030250549316 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.02292657, + "balance_loss_mlp": 1.04522121, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.24584207136959, + "language_loss": 0.82778502, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84941804, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5826, + "time_per_iteration": 2.4147045612335205 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.02587962, + "balance_loss_mlp": 1.04480314, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5075773428374344, + "language_loss": 0.80714649, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.8288269, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5827, + "time_per_iteration": 2.4650330543518066 + }, + { + "auxiliary_loss_clip": 0.01123347, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.0220902, + "balance_loss_mlp": 1.04475152, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.7582821019631836, + "language_loss": 0.70936024, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73095214, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 5828, + "time_per_iteration": 2.4710564613342285 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.02666616, + "balance_loss_mlp": 1.04788435, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.628373483521701, + "language_loss": 0.79397106, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81571716, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.828125, + "step": 5829, + "time_per_iteration": 2.5081264972686768 + }, + { + "auxiliary_loss_clip": 0.01129997, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.02900994, + "balance_loss_mlp": 1.04607642, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.7135270810407168, + "language_loss": 0.72111332, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74286962, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 5830, + "time_per_iteration": 2.507263422012329 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.04352021, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.0188022258715996, + "language_loss": 0.88740343, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90896189, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5831, + "time_per_iteration": 2.4769816398620605 + }, + { + "auxiliary_loss_clip": 0.01122435, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.02560508, + "balance_loss_mlp": 1.04128802, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9377344606434141, + "language_loss": 0.78478962, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80642164, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8125, + "step": 5832, + "time_per_iteration": 2.458019971847534 + }, + { + "auxiliary_loss_clip": 0.01125835, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.02744734, + "balance_loss_mlp": 1.04360127, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.8976688118149017, + "language_loss": 0.70859557, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73029065, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 5833, + "time_per_iteration": 2.494739055633545 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.04384482, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3472514068868482, + "language_loss": 0.80878949, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83030844, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5834, + "time_per_iteration": 2.521343231201172 + }, + { + "auxiliary_loss_clip": 0.01124914, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.02823853, + "balance_loss_mlp": 1.04525888, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.219662071096021, + "language_loss": 0.83629, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.8579731, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 5835, + "time_per_iteration": 2.53587007522583 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.02932119, + "balance_loss_mlp": 1.04351568, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.120648036265282, + "language_loss": 0.76607329, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78774178, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 5836, + "time_per_iteration": 2.54390549659729 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02382731, + "balance_loss_mlp": 1.04872775, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.2292749531356986, + "language_loss": 0.77354801, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79521459, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5837, + "time_per_iteration": 2.4478273391723633 + }, + { + "auxiliary_loss_clip": 0.01123898, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02998376, + "balance_loss_mlp": 1.04441822, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.6098451794116821, + "language_loss": 0.68129408, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70298064, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5838, + "time_per_iteration": 2.505833864212036 + }, + { + "auxiliary_loss_clip": 0.01122037, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01945019, + "balance_loss_mlp": 1.04240978, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0937603738721173, + "language_loss": 0.83561182, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85717571, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5839, + "time_per_iteration": 2.4378576278686523 + }, + { + "auxiliary_loss_clip": 0.01126069, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.02775335, + "balance_loss_mlp": 1.04351032, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.6277808139419232, + "language_loss": 0.58590645, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60759622, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.828125, + "step": 5840, + "time_per_iteration": 2.4883387088775635 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02015984, + "balance_loss_mlp": 1.04445219, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.7790843018814058, + "language_loss": 0.87061596, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89222413, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5841, + "time_per_iteration": 2.5035836696624756 + }, + { + "auxiliary_loss_clip": 0.01128185, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02551615, + "balance_loss_mlp": 1.0455035, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.6842451001577108, + "language_loss": 0.74924648, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77094764, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.828125, + "step": 5842, + "time_per_iteration": 2.4677891731262207 + }, + { + "auxiliary_loss_clip": 0.01125535, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02552199, + "balance_loss_mlp": 1.04403496, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 3.45436030057014, + "language_loss": 0.68184745, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70351034, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5843, + "time_per_iteration": 2.4356935024261475 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_clip": 1.02734041, + "balance_loss_mlp": 1.04418659, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 3.71115813366519, + "language_loss": 0.65957326, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68123138, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5844, + "time_per_iteration": 2.4961743354797363 + }, + { + "auxiliary_loss_clip": 0.01124887, + "auxiliary_loss_mlp": 0.01040447, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.04466677, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.036064641334285, + "language_loss": 0.75629944, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77795279, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5845, + "time_per_iteration": 5.325402498245239 + }, + { + "auxiliary_loss_clip": 0.01123091, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.04537153, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.494167784966283, + "language_loss": 0.73075795, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75238299, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 5846, + "time_per_iteration": 2.4515323638916016 + }, + { + "auxiliary_loss_clip": 0.01127959, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.02587426, + "balance_loss_mlp": 1.04755926, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6229430725765215, + "language_loss": 0.75876832, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78045619, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5847, + "time_per_iteration": 2.4869656562805176 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02067161, + "balance_loss_mlp": 1.04212832, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.14189752244475, + "language_loss": 0.72070903, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74227905, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5848, + "time_per_iteration": 2.5580503940582275 + }, + { + "auxiliary_loss_clip": 0.01127957, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.02953017, + "balance_loss_mlp": 1.04648554, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.252970750126207, + "language_loss": 0.89321303, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91493851, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5849, + "time_per_iteration": 2.4167070388793945 + }, + { + "auxiliary_loss_clip": 0.01123705, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.04373825, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.8040734708025026, + "language_loss": 0.74810916, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76967371, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5850, + "time_per_iteration": 2.457970142364502 + }, + { + "auxiliary_loss_clip": 0.0112382, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.04618788, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.5003899492593988, + "language_loss": 0.7563765, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77794087, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 5851, + "time_per_iteration": 2.48270845413208 + }, + { + "auxiliary_loss_clip": 0.01126446, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.0219928, + "balance_loss_mlp": 1.04683256, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.074837490144385, + "language_loss": 0.87552518, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89715755, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5852, + "time_per_iteration": 2.4690029621124268 + }, + { + "auxiliary_loss_clip": 0.01122074, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04361391, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.0973347969099048, + "language_loss": 0.67880064, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70038116, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5853, + "time_per_iteration": 2.4953458309173584 + }, + { + "auxiliary_loss_clip": 0.01125495, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.0195092, + "balance_loss_mlp": 1.04545975, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.6680659623481517, + "language_loss": 0.8122859, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83388329, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5854, + "time_per_iteration": 2.4702916145324707 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.04566765, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.6003148952985655, + "language_loss": 0.73131359, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75284624, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 5855, + "time_per_iteration": 2.4895823001861572 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.02549887, + "balance_loss_mlp": 1.04334307, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 3.701560840262617, + "language_loss": 0.70894778, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73054588, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5856, + "time_per_iteration": 2.5133585929870605 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.02881038, + "balance_loss_mlp": 1.0456897, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.8086114170356375, + "language_loss": 0.60915685, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63086259, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80859375, + "step": 5857, + "time_per_iteration": 2.723238468170166 + }, + { + "auxiliary_loss_clip": 0.01123346, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.02434421, + "balance_loss_mlp": 1.04425693, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.754440516271971, + "language_loss": 0.73341751, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75504428, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5858, + "time_per_iteration": 2.509556293487549 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.02720821, + "balance_loss_mlp": 1.04428148, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.4710047028379252, + "language_loss": 0.76090813, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7825768, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5859, + "time_per_iteration": 2.584312677383423 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.0230875, + "balance_loss_mlp": 1.04828274, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.6944630123418771, + "language_loss": 0.71475387, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73646474, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5860, + "time_per_iteration": 2.5120623111724854 + }, + { + "auxiliary_loss_clip": 0.01125655, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.02506578, + "balance_loss_mlp": 1.04208136, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.10777684168558, + "language_loss": 0.6624974, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68416381, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5861, + "time_per_iteration": 2.4927096366882324 + }, + { + "auxiliary_loss_clip": 0.01123555, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.04497313, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.1064993181157843, + "language_loss": 0.66780227, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68938088, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5862, + "time_per_iteration": 2.4275379180908203 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.02132034, + "balance_loss_mlp": 1.04420304, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.0193315360348842, + "language_loss": 0.77049166, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79211187, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5863, + "time_per_iteration": 2.504391670227051 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02027822, + "balance_loss_mlp": 1.04449666, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.7341123556359297, + "language_loss": 0.75018549, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77178371, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5864, + "time_per_iteration": 2.4962751865386963 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.03110838, + "balance_loss_mlp": 1.04376507, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 1.9972182581193567, + "language_loss": 0.79051632, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81220651, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5865, + "time_per_iteration": 2.5369789600372314 + }, + { + "auxiliary_loss_clip": 0.01130515, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.02901387, + "balance_loss_mlp": 1.04835618, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 1.8375125007543296, + "language_loss": 0.81622374, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.8379811, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 5866, + "time_per_iteration": 2.497587203979492 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.02081871, + "balance_loss_mlp": 1.04493296, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.1796505180833696, + "language_loss": 0.84552217, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.867208, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.86328125, + "step": 5867, + "time_per_iteration": 2.5673649311065674 + }, + { + "auxiliary_loss_clip": 0.01126594, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02764452, + "balance_loss_mlp": 1.04441357, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.2018810166756873, + "language_loss": 0.74618357, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76788092, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5868, + "time_per_iteration": 2.4571762084960938 + }, + { + "auxiliary_loss_clip": 0.01127392, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 1.04489541, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.0366485396940615, + "language_loss": 0.61648643, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63815421, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5869, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.04286385, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.290977208251557, + "language_loss": 0.74328029, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76495212, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5870, + "time_per_iteration": 2.4636306762695312 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04412317, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.44010977521146, + "language_loss": 0.71498513, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73659372, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5871, + "time_per_iteration": 2.629002094268799 + }, + { + "auxiliary_loss_clip": 0.01120822, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01999545, + "balance_loss_mlp": 1.04340768, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.215441176085892, + "language_loss": 0.74219513, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76374042, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5872, + "time_per_iteration": 2.4672691822052 + }, + { + "auxiliary_loss_clip": 0.01121667, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02291727, + "balance_loss_mlp": 1.04295182, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.6120105579455812, + "language_loss": 0.82492435, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84651101, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5873, + "time_per_iteration": 2.549706220626831 + }, + { + "auxiliary_loss_clip": 0.01128213, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.02800715, + "balance_loss_mlp": 1.04399252, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.8495868157058504, + "language_loss": 0.6583339, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68006265, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 5874, + "time_per_iteration": 2.4949634075164795 + }, + { + "auxiliary_loss_clip": 0.01044147, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00510025, + "balance_loss_mlp": 1.01915693, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.771003921858337, + "language_loss": 0.61583531, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63634658, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.25, + "step": 5875, + "time_per_iteration": 2.9931485652923584 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.02995443, + "balance_loss_mlp": 1.04544568, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.6836782364007539, + "language_loss": 0.800933, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82261944, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5876, + "time_per_iteration": 2.443178415298462 + }, + { + "auxiliary_loss_clip": 0.01041911, + "auxiliary_loss_mlp": 0.01006634, + "balance_loss_clip": 1.00477409, + "balance_loss_mlp": 1.01663578, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 1.6287450036197537, + "language_loss": 0.5674026, + "learning_rate": 2.999887569990088e-06, + "loss": 0.587888, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.25195312, + "step": 5877, + "time_per_iteration": 3.1782116889953613 + }, + { + "auxiliary_loss_clip": 0.01124291, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.04401922, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.5579095187110108, + "language_loss": 0.71649593, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73804337, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5878, + "time_per_iteration": 2.4984474182128906 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04198527, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9384917614544617, + "language_loss": 0.78492844, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80655217, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5879, + "time_per_iteration": 2.5369913578033447 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.02870536, + "balance_loss_mlp": 1.04373121, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0656781659104917, + "language_loss": 0.63695049, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65867293, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83203125, + "step": 5880, + "time_per_iteration": 2.457787036895752 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02078009, + "balance_loss_mlp": 1.04375386, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 3.125568384757795, + "language_loss": 0.65818816, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67980647, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5881, + "time_per_iteration": 2.5198867321014404 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04197288, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3882047203281038, + "language_loss": 0.75280428, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77431458, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5882, + "time_per_iteration": 2.4526872634887695 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.02266037, + "balance_loss_mlp": 1.04543018, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.123888211837838, + "language_loss": 0.70349854, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72514224, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5883, + "time_per_iteration": 2.538865566253662 + }, + { + "auxiliary_loss_clip": 0.01129519, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0228982, + "balance_loss_mlp": 1.04584253, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.009195754637657, + "language_loss": 0.78500903, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80668598, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5884, + "time_per_iteration": 2.4410510063171387 + }, + { + "auxiliary_loss_clip": 0.0112138, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.01901007, + "balance_loss_mlp": 1.04336667, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.8922441591552446, + "language_loss": 0.75478536, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77632499, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5885, + "time_per_iteration": 2.555816650390625 + }, + { + "auxiliary_loss_clip": 0.01127447, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.04478371, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.2081606315958635, + "language_loss": 0.82679224, + "learning_rate": 2.996850368809606e-06, + "loss": 0.84848893, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.82421875, + "step": 5886, + "time_per_iteration": 2.482151985168457 + }, + { + "auxiliary_loss_clip": 0.01124743, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01717782, + "balance_loss_mlp": 1.04533887, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.4580910750403775, + "language_loss": 0.78723359, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80880398, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 5887, + "time_per_iteration": 5.388309001922607 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.0269978, + "balance_loss_mlp": 1.04226518, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 3.1093010737907867, + "language_loss": 0.65404654, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67568314, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5888, + "time_per_iteration": 2.4438626766204834 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02248812, + "balance_loss_mlp": 1.04373193, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.6702882106954304, + "language_loss": 0.76662588, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.78821993, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5889, + "time_per_iteration": 2.503023624420166 + }, + { + "auxiliary_loss_clip": 0.01125083, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.02329397, + "balance_loss_mlp": 1.0469135, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.7418080185903937, + "language_loss": 0.80142188, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82305038, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5890, + "time_per_iteration": 2.4669902324676514 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02204418, + "balance_loss_mlp": 1.04123974, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.4765808553545194, + "language_loss": 0.79590207, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81743479, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7734375, + "step": 5891, + "time_per_iteration": 2.491048812866211 + }, + { + "auxiliary_loss_clip": 0.01123501, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.02260685, + "balance_loss_mlp": 1.04425383, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.0747162768055616, + "language_loss": 0.73339593, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.7550028, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5892, + "time_per_iteration": 2.497422695159912 + }, + { + "auxiliary_loss_clip": 0.01124613, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.04473233, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 1.9338165898472526, + "language_loss": 0.66916019, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69079423, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5893, + "time_per_iteration": 2.4516420364379883 + }, + { + "auxiliary_loss_clip": 0.01123079, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04405212, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.878049090913109, + "language_loss": 0.69472313, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71633029, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5894, + "time_per_iteration": 2.479174852371216 + }, + { + "auxiliary_loss_clip": 0.01123499, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.04524636, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6954645527360779, + "language_loss": 0.74891931, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77048504, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 5895, + "time_per_iteration": 2.4786908626556396 + }, + { + "auxiliary_loss_clip": 0.01122907, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.04388869, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.0548310630504854, + "language_loss": 0.83688253, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85848963, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5896, + "time_per_iteration": 2.4765214920043945 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.03348279, + "balance_loss_mlp": 1.0444181, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.6634726813042469, + "language_loss": 0.70031154, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7220217, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 5897, + "time_per_iteration": 2.5142548084259033 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.02430916, + "balance_loss_mlp": 1.04337025, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.7331024671064506, + "language_loss": 0.82091749, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84251857, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5898, + "time_per_iteration": 2.4900712966918945 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0234853, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4876974136883365, + "language_loss": 0.73901182, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76058269, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 5899, + "time_per_iteration": 2.498659133911133 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_clip": 1.02825308, + "balance_loss_mlp": 1.04316258, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.69682390123668, + "language_loss": 0.79345262, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81510079, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 5900, + "time_per_iteration": 2.548612594604492 + }, + { + "auxiliary_loss_clip": 0.01123598, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.02556252, + "balance_loss_mlp": 1.04530048, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.7758743329418227, + "language_loss": 0.81637204, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83801091, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 5901, + "time_per_iteration": 2.6031999588012695 + }, + { + "auxiliary_loss_clip": 0.01127681, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02063251, + "balance_loss_mlp": 1.04535294, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.92677562296577, + "language_loss": 0.75667071, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77829683, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5902, + "time_per_iteration": 2.528026819229126 + }, + { + "auxiliary_loss_clip": 0.0112195, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02359962, + "balance_loss_mlp": 1.04320014, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.7304108811682997, + "language_loss": 0.70582771, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72741467, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7890625, + "step": 5903, + "time_per_iteration": 2.423454999923706 + }, + { + "auxiliary_loss_clip": 0.01124223, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.0205555, + "balance_loss_mlp": 1.04234982, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.52210089781831, + "language_loss": 0.74574983, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76734024, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5904, + "time_per_iteration": 2.462024688720703 + }, + { + "auxiliary_loss_clip": 0.0112423, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.04362941, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.0389703534000443, + "language_loss": 0.78855121, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81020248, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8046875, + "step": 5905, + "time_per_iteration": 2.418665885925293 + }, + { + "auxiliary_loss_clip": 0.0111773, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.0190388, + "balance_loss_mlp": 1.04383469, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 2.1398902938273547, + "language_loss": 0.72515827, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74664938, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 5906, + "time_per_iteration": 2.441795825958252 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.02187109, + "balance_loss_mlp": 1.04545534, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.0230910533888107, + "language_loss": 0.74762344, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.7692821, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5907, + "time_per_iteration": 2.4404122829437256 + }, + { + "auxiliary_loss_clip": 0.01123497, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01939988, + "balance_loss_mlp": 1.04492426, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7742327577799557, + "language_loss": 0.75751841, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77909136, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5908, + "time_per_iteration": 2.5631895065307617 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01961696, + "balance_loss_mlp": 1.04734707, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.7057235578436956, + "language_loss": 0.68026733, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70187092, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5909, + "time_per_iteration": 2.480511426925659 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.01937413, + "balance_loss_mlp": 1.04523396, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 3.5777269988287297, + "language_loss": 0.78628188, + "learning_rate": 2.988736221969144e-06, + "loss": 0.8078106, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 5910, + "time_per_iteration": 2.4763131141662598 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.02545595, + "balance_loss_mlp": 1.04625309, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.525011794663279, + "language_loss": 0.70639479, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72808856, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5911, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01119575, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.04294884, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 1.9668748220600272, + "language_loss": 0.87014282, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89169508, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 5912, + "time_per_iteration": 2.461251735687256 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.018556, + "balance_loss_mlp": 1.04507196, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.7619620740638822, + "language_loss": 0.7701745, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79172838, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5913, + "time_per_iteration": 2.4517738819122314 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02001143, + "balance_loss_mlp": 1.04793298, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3300117090522248, + "language_loss": 0.82507938, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84666395, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 5914, + "time_per_iteration": 2.4964141845703125 + }, + { + "auxiliary_loss_clip": 0.01124534, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.01771307, + "balance_loss_mlp": 1.04573739, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.1657623831524604, + "language_loss": 0.70703268, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72859794, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 5915, + "time_per_iteration": 2.5425658226013184 + }, + { + "auxiliary_loss_clip": 0.01120767, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.0249182, + "balance_loss_mlp": 1.04248476, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.7489130528457595, + "language_loss": 0.76365829, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78525031, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 5916, + "time_per_iteration": 2.49629545211792 + }, + { + "auxiliary_loss_clip": 0.01128234, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02360404, + "balance_loss_mlp": 1.04853928, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.96232440030472, + "language_loss": 0.88380635, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90545923, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.796875, + "step": 5917, + "time_per_iteration": 2.4549498558044434 + }, + { + "auxiliary_loss_clip": 0.01123557, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01144493, + "balance_loss_mlp": 1.04562521, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.0473051476373048, + "language_loss": 0.74389327, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76538098, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5918, + "time_per_iteration": 2.448164701461792 + }, + { + "auxiliary_loss_clip": 0.01039303, + "auxiliary_loss_mlp": 0.01015071, + "balance_loss_clip": 1.01344931, + "balance_loss_mlp": 1.01430607, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 1.0267040132589962, + "language_loss": 0.63732457, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65786839, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.25, + "step": 5919, + "time_per_iteration": 2.837815999984741 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01805615, + "balance_loss_mlp": 1.04376245, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.8747663216478503, + "language_loss": 0.73868048, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76025695, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5920, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02015972, + "balance_loss_mlp": 1.04353166, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.659561193633535, + "language_loss": 0.77124226, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79279101, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5921, + "time_per_iteration": 2.461014986038208 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.01968277, + "balance_loss_mlp": 1.04409981, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 3.1644779785561563, + "language_loss": 0.67710596, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69866371, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5922, + "time_per_iteration": 2.495504140853882 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01567268, + "balance_loss_mlp": 1.04373431, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 1.9745978513449503, + "language_loss": 0.79269004, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81421471, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5923, + "time_per_iteration": 2.4515416622161865 + }, + { + "auxiliary_loss_clip": 0.01124058, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02631271, + "balance_loss_mlp": 1.04502511, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7698063934253627, + "language_loss": 0.85475516, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87638795, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7890625, + "step": 5924, + "time_per_iteration": 2.4790685176849365 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.02344394, + "balance_loss_mlp": 1.04368067, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 1.844353158814239, + "language_loss": 0.77513188, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79672253, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 5925, + "time_per_iteration": 2.5064613819122314 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.0301789, + "balance_loss_mlp": 1.04067063, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.7016119178915972, + "language_loss": 0.75874609, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78037679, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5926, + "time_per_iteration": 2.451852798461914 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02194762, + "balance_loss_mlp": 1.04408717, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0486133546267737, + "language_loss": 0.69321811, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71483439, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5927, + "time_per_iteration": 2.4770915508270264 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.0237366, + "balance_loss_mlp": 1.0428226, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.8762651107969224, + "language_loss": 0.79633021, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81789798, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 5928, + "time_per_iteration": 4.019433259963989 + }, + { + "auxiliary_loss_clip": 0.01120965, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.02581263, + "balance_loss_mlp": 1.04338682, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4128421638180557, + "language_loss": 0.81568098, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83728826, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5929, + "time_per_iteration": 3.869184970855713 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02172661, + "balance_loss_mlp": 1.0402571, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7650523310611956, + "language_loss": 0.69981778, + "learning_rate": 2.981957928520201e-06, + "loss": 0.7213279, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5930, + "time_per_iteration": 2.418992519378662 + }, + { + "auxiliary_loss_clip": 0.01123929, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.02858853, + "balance_loss_mlp": 1.04340863, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 1.9164187115059894, + "language_loss": 0.67766178, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69933271, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5931, + "time_per_iteration": 2.4688074588775635 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.0203712, + "balance_loss_mlp": 1.04403675, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.736290109138699, + "language_loss": 0.67451715, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69607264, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5932, + "time_per_iteration": 2.4908299446105957 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.04304647, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.602576254435761, + "language_loss": 0.7878592, + "learning_rate": 2.980939897348969e-06, + "loss": 0.8093667, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 5933, + "time_per_iteration": 2.442464590072632 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.02893806, + "balance_loss_mlp": 1.04176354, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4946029259135472, + "language_loss": 0.69271672, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71436697, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5934, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02636409, + "balance_loss_mlp": 1.04726946, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.881720756405168, + "language_loss": 0.71268845, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73441839, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5935, + "time_per_iteration": 2.460548162460327 + }, + { + "auxiliary_loss_clip": 0.01124043, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.02476776, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.474293421119334, + "language_loss": 0.78293073, + "learning_rate": 2.979921531401692e-06, + "loss": 0.8045634, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5936, + "time_per_iteration": 2.4517645835876465 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.02472031, + "balance_loss_mlp": 1.04367638, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4518862241402966, + "language_loss": 0.64218014, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66379213, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 5937, + "time_per_iteration": 2.5837321281433105 + }, + { + "auxiliary_loss_clip": 0.01124449, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.04442978, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5143509931773553, + "language_loss": 0.77877963, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80041015, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5938, + "time_per_iteration": 2.4190945625305176 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02890944, + "balance_loss_mlp": 1.04582071, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.8770011073758637, + "language_loss": 0.80256367, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82424247, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5939, + "time_per_iteration": 2.5029094219207764 + }, + { + "auxiliary_loss_clip": 0.01126611, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.0213412, + "balance_loss_mlp": 1.04299128, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.6875415435298406, + "language_loss": 0.79203522, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81365997, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5940, + "time_per_iteration": 2.526545524597168 + }, + { + "auxiliary_loss_clip": 0.01124522, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.01704049, + "balance_loss_mlp": 1.0441246, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.480743427796476, + "language_loss": 0.72739166, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74895537, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5941, + "time_per_iteration": 2.4599413871765137 + }, + { + "auxiliary_loss_clip": 0.01123947, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.02546012, + "balance_loss_mlp": 1.04480743, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.979069530543237, + "language_loss": 0.64202702, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66367018, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 5942, + "time_per_iteration": 2.5174636840820312 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.04385567, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.2469009256176053, + "language_loss": 0.74055374, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76215225, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5943, + "time_per_iteration": 2.5392913818359375 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01008275, + "balance_loss_clip": 1.00640345, + "balance_loss_mlp": 1.01455188, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7872915284740177, + "language_loss": 0.60689372, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62737316, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25, + "step": 5944, + "time_per_iteration": 3.17051100730896 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02003157, + "balance_loss_mlp": 1.04313469, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.033108996495456, + "language_loss": 0.72646821, + "learning_rate": 2.976864428379655e-06, + "loss": 0.7480244, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5945, + "time_per_iteration": 2.444373846054077 + }, + { + "auxiliary_loss_clip": 0.01121962, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04313612, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7423109631574678, + "language_loss": 0.81255424, + "learning_rate": 2.976524564880326e-06, + "loss": 0.8341651, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 5946, + "time_per_iteration": 2.470513343811035 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.02808666, + "balance_loss_mlp": 1.04524601, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.9099881709146462, + "language_loss": 0.68893784, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71061212, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5947, + "time_per_iteration": 2.4653477668762207 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04441905, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.655085874443405, + "language_loss": 0.75428057, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77588153, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 5948, + "time_per_iteration": 2.4385483264923096 + }, + { + "auxiliary_loss_clip": 0.01119692, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.02650094, + "balance_loss_mlp": 1.04049134, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.354345427402619, + "language_loss": 0.70556438, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72717237, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5949, + "time_per_iteration": 2.4992663860321045 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02567744, + "balance_loss_mlp": 1.04348552, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.8941983472442732, + "language_loss": 0.77248389, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79408723, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 5950, + "time_per_iteration": 2.4295101165771484 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.02394795, + "balance_loss_mlp": 1.04274225, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5707876816938207, + "language_loss": 0.72766685, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74928057, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5951, + "time_per_iteration": 2.444349765777588 + }, + { + "auxiliary_loss_clip": 0.0112562, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.04390478, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 1.9955959935597258, + "language_loss": 0.69730532, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71895468, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5952, + "time_per_iteration": 2.49656081199646 + }, + { + "auxiliary_loss_clip": 0.01120518, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.02497923, + "balance_loss_mlp": 1.04271066, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.0583657570083416, + "language_loss": 0.69432503, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71591723, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5953, + "time_per_iteration": 2.6221721172332764 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01641417, + "balance_loss_mlp": 1.04322994, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.5429391611916807, + "language_loss": 0.66673422, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68824828, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 5954, + "time_per_iteration": 2.465116262435913 + }, + { + "auxiliary_loss_clip": 0.01123263, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02383566, + "balance_loss_mlp": 1.04475307, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.7040470297828096, + "language_loss": 0.74838006, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76998997, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 5955, + "time_per_iteration": 2.4968783855438232 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.0219382, + "balance_loss_mlp": 1.04289603, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.6820855707774873, + "language_loss": 0.76043999, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78197372, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 5956, + "time_per_iteration": 2.498699903488159 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.0214982, + "balance_loss_mlp": 1.04263568, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.7390523407913014, + "language_loss": 0.73059452, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75211895, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 5957, + "time_per_iteration": 2.4503817558288574 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.02197433, + "balance_loss_mlp": 1.04503369, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.990259024529503, + "language_loss": 0.70640051, + "learning_rate": 2.972443318242726e-06, + "loss": 0.7279774, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5958, + "time_per_iteration": 2.4611945152282715 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.0165484, + "balance_loss_mlp": 1.0413444, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.7206269565580243, + "language_loss": 0.88610697, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90757084, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 5959, + "time_per_iteration": 2.5129401683807373 + }, + { + "auxiliary_loss_clip": 0.01121057, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.02281785, + "balance_loss_mlp": 1.04400599, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.483187088646708, + "language_loss": 0.58103061, + "learning_rate": 2.971762593615679e-06, + "loss": 0.6026091, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 5960, + "time_per_iteration": 2.5110409259796143 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02201176, + "balance_loss_mlp": 1.04267251, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 1.9323395592862886, + "language_loss": 0.76102602, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78260595, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 5961, + "time_per_iteration": 2.46943736076355 + }, + { + "auxiliary_loss_clip": 0.01121367, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.01862621, + "balance_loss_mlp": 1.04458857, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8327349140058107, + "language_loss": 0.69974017, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72128505, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 5962, + "time_per_iteration": 2.5654361248016357 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0210433, + "balance_loss_mlp": 1.04321802, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5613001239774846, + "language_loss": 0.74749398, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76901346, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.75390625, + "step": 5963, + "time_per_iteration": 2.5135319232940674 + }, + { + "auxiliary_loss_clip": 0.01122172, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02149796, + "balance_loss_mlp": 1.04597044, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5825069258384938, + "language_loss": 0.78811383, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80968547, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 5964, + "time_per_iteration": 2.493169069290161 + }, + { + "auxiliary_loss_clip": 0.01124119, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.01870322, + "balance_loss_mlp": 1.04482806, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.8296471859577264, + "language_loss": 0.66694742, + "learning_rate": 2.970060137410626e-06, + "loss": 0.6885215, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5965, + "time_per_iteration": 2.4995884895324707 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.04270399, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 4.210402322068537, + "language_loss": 0.79008359, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81161171, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5966, + "time_per_iteration": 2.485438346862793 + }, + { + "auxiliary_loss_clip": 0.01121545, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02238536, + "balance_loss_mlp": 1.04341781, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 5.107721360348662, + "language_loss": 0.90911728, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93070352, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 5967, + "time_per_iteration": 2.547287702560425 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02648592, + "balance_loss_mlp": 1.04528475, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.7620117516801617, + "language_loss": 0.79739827, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.81907177, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 5968, + "time_per_iteration": 2.4543471336364746 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01052904, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.04604244, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.0044885906540424, + "language_loss": 0.83642054, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85822409, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5969, + "time_per_iteration": 2.502815008163452 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04245603, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.876228198696561, + "language_loss": 0.72377515, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74528718, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5970, + "time_per_iteration": 4.051819086074829 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.02178049, + "balance_loss_mlp": 1.0424037, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6844020581036279, + "language_loss": 0.79522693, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81676805, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5971, + "time_per_iteration": 3.8910434246063232 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.02427924, + "balance_loss_mlp": 1.0402174, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.924864359347905, + "language_loss": 0.78594625, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80753887, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5972, + "time_per_iteration": 2.4272611141204834 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.02378309, + "balance_loss_mlp": 1.04185021, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 3.2741380987368327, + "language_loss": 0.81252539, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83410573, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5973, + "time_per_iteration": 2.469438314437866 + }, + { + "auxiliary_loss_clip": 0.0103695, + "auxiliary_loss_mlp": 0.01001955, + "balance_loss_clip": 0.9999882, + "balance_loss_mlp": 1.01160312, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9181567019376142, + "language_loss": 0.56828684, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58867586, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.25390625, + "step": 5974, + "time_per_iteration": 2.918166399002075 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02548242, + "balance_loss_mlp": 1.04407859, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6252506462115286, + "language_loss": 0.68750453, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7091189, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78515625, + "step": 5975, + "time_per_iteration": 2.4578702449798584 + }, + { + "auxiliary_loss_clip": 0.01119888, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.02132642, + "balance_loss_mlp": 1.04269934, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.7542310571392548, + "language_loss": 0.79961413, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82115752, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 5976, + "time_per_iteration": 2.494723081588745 + }, + { + "auxiliary_loss_clip": 0.01119534, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.04172039, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.7409485188517788, + "language_loss": 0.79081398, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81242788, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 5977, + "time_per_iteration": 2.4949100017547607 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02151847, + "balance_loss_mlp": 1.04029524, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7920092294573908, + "language_loss": 0.80654621, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82805401, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 5978, + "time_per_iteration": 2.445866584777832 + }, + { + "auxiliary_loss_clip": 0.01122409, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.04394007, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5382295990908517, + "language_loss": 0.67741489, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69898772, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5979, + "time_per_iteration": 2.538585662841797 + }, + { + "auxiliary_loss_clip": 0.01119324, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.02478838, + "balance_loss_mlp": 1.04136634, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.3207911240165697, + "language_loss": 0.67176729, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69334549, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5980, + "time_per_iteration": 2.4896938800811768 + }, + { + "auxiliary_loss_clip": 0.01123377, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.02377748, + "balance_loss_mlp": 1.0416832, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8107777091561479, + "language_loss": 0.71148199, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73310816, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.81640625, + "step": 5981, + "time_per_iteration": 2.49064302444458 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.02387476, + "balance_loss_mlp": 1.0432086, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.7933500913622242, + "language_loss": 0.71331298, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73492229, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5982, + "time_per_iteration": 2.5167934894561768 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02361536, + "balance_loss_mlp": 1.0427959, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6761533335073455, + "language_loss": 0.75808942, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77962971, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 5983, + "time_per_iteration": 2.4915101528167725 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.02771819, + "balance_loss_mlp": 1.04474413, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.1804669018597043, + "language_loss": 0.76302433, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78472364, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5984, + "time_per_iteration": 2.436640501022339 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.02356207, + "balance_loss_mlp": 1.0420785, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.837904559260202, + "language_loss": 0.86617446, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88773406, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 5985, + "time_per_iteration": 2.476853609085083 + }, + { + "auxiliary_loss_clip": 0.0111809, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02130079, + "balance_loss_mlp": 1.04078126, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.416236209566339, + "language_loss": 0.72801065, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74955392, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 5986, + "time_per_iteration": 2.443871021270752 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02354908, + "balance_loss_mlp": 1.04230642, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.880079313238184, + "language_loss": 0.73711401, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75873649, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5987, + "time_per_iteration": 2.517045736312866 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.04161966, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 1.8583263097896845, + "language_loss": 0.69824201, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71982217, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5988, + "time_per_iteration": 2.484654426574707 + }, + { + "auxiliary_loss_clip": 0.01125207, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.02675915, + "balance_loss_mlp": 1.04297233, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.851186734533378, + "language_loss": 0.72918314, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75084746, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5989, + "time_per_iteration": 2.464378833770752 + }, + { + "auxiliary_loss_clip": 0.01120868, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.0194943, + "balance_loss_mlp": 1.04283333, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.8425061302669492, + "language_loss": 0.79664916, + "learning_rate": 2.961534094403931e-06, + "loss": 0.81818593, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.78125, + "step": 5990, + "time_per_iteration": 2.4947755336761475 + }, + { + "auxiliary_loss_clip": 0.01121243, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.04281235, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.9352260247419832, + "language_loss": 0.84225297, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86375415, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 5991, + "time_per_iteration": 2.4728991985321045 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.02490079, + "balance_loss_mlp": 1.04197788, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9640325518662143, + "language_loss": 0.75616056, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77778924, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.81640625, + "step": 5992, + "time_per_iteration": 2.4422738552093506 + }, + { + "auxiliary_loss_clip": 0.01119253, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02022302, + "balance_loss_mlp": 1.04177451, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 6.32582004359923, + "language_loss": 0.77500135, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79654288, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 5993, + "time_per_iteration": 2.4513776302337646 + }, + { + "auxiliary_loss_clip": 0.01124951, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.02281737, + "balance_loss_mlp": 1.04405534, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.9096274983436938, + "language_loss": 0.74686468, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7684797, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80859375, + "step": 5994, + "time_per_iteration": 2.4278860092163086 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.02506554, + "balance_loss_mlp": 1.04320991, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8397117218597796, + "language_loss": 0.68890274, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71053243, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5995, + "time_per_iteration": 2.462557554244995 + }, + { + "auxiliary_loss_clip": 0.01124519, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.02548289, + "balance_loss_mlp": 1.04238582, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7352965040741237, + "language_loss": 0.82057822, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84222531, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8203125, + "step": 5996, + "time_per_iteration": 2.4284703731536865 + }, + { + "auxiliary_loss_clip": 0.01119849, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.04242694, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.56212250683249, + "language_loss": 0.73570979, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75725353, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5997, + "time_per_iteration": 2.4418485164642334 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.04307055, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.1655767572067637, + "language_loss": 0.68651283, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.70807832, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5998, + "time_per_iteration": 2.435884475708008 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.01983321, + "balance_loss_mlp": 1.04494119, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.6750874406601914, + "language_loss": 0.77190387, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79348445, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5999, + "time_per_iteration": 2.415649175643921 + }, + { + "auxiliary_loss_clip": 0.01123679, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.02381015, + "balance_loss_mlp": 1.04481769, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 2.719833162653021, + "language_loss": 0.78307509, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80469108, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 6000, + "time_per_iteration": 2.450085401535034 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.02061474, + "balance_loss_mlp": 1.04283905, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6917067376727954, + "language_loss": 0.78621352, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80777717, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6001, + "time_per_iteration": 2.4247405529022217 + }, + { + "auxiliary_loss_clip": 0.01119251, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.01761651, + "balance_loss_mlp": 1.04341698, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.9017223481518102, + "language_loss": 0.83743405, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85893983, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6002, + "time_per_iteration": 2.4587790966033936 + }, + { + "auxiliary_loss_clip": 0.01117677, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.01753855, + "balance_loss_mlp": 1.04298413, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.101850625944426, + "language_loss": 0.90627617, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92775667, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6003, + "time_per_iteration": 2.450408697128296 + }, + { + "auxiliary_loss_clip": 0.01040628, + "auxiliary_loss_mlp": 0.01013073, + "balance_loss_clip": 1.01102221, + "balance_loss_mlp": 1.01496768, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8843653445723816, + "language_loss": 0.53374904, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55428606, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.25585938, + "step": 6004, + "time_per_iteration": 3.005659341812134 + }, + { + "auxiliary_loss_clip": 0.01121195, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02152824, + "balance_loss_mlp": 1.04164577, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7248099575523852, + "language_loss": 0.77609527, + "learning_rate": 2.956407517225883e-06, + "loss": 0.7976777, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.796875, + "step": 6005, + "time_per_iteration": 2.4916067123413086 + }, + { + "auxiliary_loss_clip": 0.01124405, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.04700613, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.24467290311728, + "language_loss": 0.79267776, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81428248, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 6006, + "time_per_iteration": 2.4366166591644287 + }, + { + "auxiliary_loss_clip": 0.01124848, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02309775, + "balance_loss_mlp": 1.04587984, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.7888636143213261, + "language_loss": 0.84360719, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86524487, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.7890625, + "step": 6007, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.04622328, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.0771979180574425, + "language_loss": 0.72564125, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74731576, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 6008, + "time_per_iteration": 2.4473018646240234 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02066362, + "balance_loss_mlp": 1.04255283, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.9836274680059969, + "language_loss": 0.8284781, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85002339, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 6009, + "time_per_iteration": 2.470031261444092 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.02945232, + "balance_loss_mlp": 1.04598057, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.714442270200285, + "language_loss": 0.76139152, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78308332, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6010, + "time_per_iteration": 2.446833848953247 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0236752, + "balance_loss_mlp": 1.04619896, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.905716478313633, + "language_loss": 0.82946253, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85107422, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6011, + "time_per_iteration": 2.508147716522217 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.0264287, + "balance_loss_mlp": 1.0491302, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8484903271380355, + "language_loss": 0.62762833, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64936543, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 6012, + "time_per_iteration": 5.36588454246521 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02112007, + "balance_loss_mlp": 1.04337454, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.8689670235824563, + "language_loss": 0.84111822, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86265635, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6013, + "time_per_iteration": 2.494051933288574 + }, + { + "auxiliary_loss_clip": 0.01124804, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02690601, + "balance_loss_mlp": 1.04570448, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.7351999387675028, + "language_loss": 0.91496456, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93662584, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6014, + "time_per_iteration": 2.4356749057769775 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_clip": 1.03078914, + "balance_loss_mlp": 1.04549718, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.727703603585928, + "language_loss": 0.73830914, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75999045, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6015, + "time_per_iteration": 2.4990644454956055 + }, + { + "auxiliary_loss_clip": 0.01125644, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02394915, + "balance_loss_mlp": 1.04633307, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7277224025907603, + "language_loss": 0.65316677, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67480516, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6016, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.01126131, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02522171, + "balance_loss_mlp": 1.04727304, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.744160138264151, + "language_loss": 0.72101283, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74268931, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6017, + "time_per_iteration": 2.638683795928955 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.02413559, + "balance_loss_mlp": 1.04454577, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.9120538903838002, + "language_loss": 0.73590356, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75755334, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 6018, + "time_per_iteration": 2.4477858543395996 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02013361, + "balance_loss_mlp": 1.04458487, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.754547200149591, + "language_loss": 0.69080901, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71234632, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6019, + "time_per_iteration": 2.519831657409668 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.01980555, + "balance_loss_mlp": 1.0443728, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.5467952079219929, + "language_loss": 0.76299942, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78459549, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6020, + "time_per_iteration": 2.4692177772521973 + }, + { + "auxiliary_loss_clip": 0.01125932, + "auxiliary_loss_mlp": 0.01043324, + "balance_loss_clip": 1.02814841, + "balance_loss_mlp": 1.04721653, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.938447153390643, + "language_loss": 0.73921824, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76091087, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6021, + "time_per_iteration": 2.5069808959960938 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.02059376, + "balance_loss_mlp": 1.04596186, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.8648032073369731, + "language_loss": 0.80978441, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83135605, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 6022, + "time_per_iteration": 2.4620115756988525 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.02212477, + "balance_loss_mlp": 1.04778302, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6799220656127192, + "language_loss": 0.81351119, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83508855, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6023, + "time_per_iteration": 2.4969308376312256 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01975274, + "balance_loss_mlp": 1.04494548, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8793265875700644, + "language_loss": 0.79767907, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81927156, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6024, + "time_per_iteration": 2.468369245529175 + }, + { + "auxiliary_loss_clip": 0.01119855, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.01629043, + "balance_loss_mlp": 1.04456711, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.7897574616215441, + "language_loss": 0.74720407, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7687006, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6025, + "time_per_iteration": 2.4410412311553955 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01849484, + "balance_loss_mlp": 1.04340899, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.5522426900619628, + "language_loss": 0.72055018, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74207234, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6026, + "time_per_iteration": 2.4997596740722656 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.02645707, + "balance_loss_mlp": 1.04604256, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.401846993246305, + "language_loss": 0.79332775, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81502712, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 6027, + "time_per_iteration": 2.5326383113861084 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.02576041, + "balance_loss_mlp": 1.04399586, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7548337209278033, + "language_loss": 0.67809385, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69973445, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6028, + "time_per_iteration": 2.548088788986206 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.01922584, + "balance_loss_mlp": 1.04415894, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.63067637662311, + "language_loss": 0.85700679, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.8785423, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 6029, + "time_per_iteration": 2.429720878601074 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.02412939, + "balance_loss_mlp": 1.04442835, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.6511023563359555, + "language_loss": 0.72693753, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74851942, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6030, + "time_per_iteration": 2.4299302101135254 + }, + { + "auxiliary_loss_clip": 0.01123199, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.04264557, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.02536170930057, + "language_loss": 0.73986644, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76151514, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8046875, + "step": 6031, + "time_per_iteration": 2.4376232624053955 + }, + { + "auxiliary_loss_clip": 0.01120355, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0177424, + "balance_loss_mlp": 1.04309845, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.8475328889194098, + "language_loss": 0.73286617, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75438625, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6032, + "time_per_iteration": 2.4811155796051025 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.02412748, + "balance_loss_mlp": 1.0427382, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.684246043345259, + "language_loss": 0.77953577, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80113035, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 6033, + "time_per_iteration": 2.4283456802368164 + }, + { + "auxiliary_loss_clip": 0.01040416, + "auxiliary_loss_mlp": 0.01019079, + "balance_loss_clip": 1.01733828, + "balance_loss_mlp": 1.01487339, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.786107382559835, + "language_loss": 0.64822888, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66882384, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25585938, + "step": 6034, + "time_per_iteration": 3.1253511905670166 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.02139246, + "balance_loss_mlp": 1.04131126, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.4985312456135769, + "language_loss": 0.90059769, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92213392, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6035, + "time_per_iteration": 2.4888923168182373 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.04239392, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7493433732375512, + "language_loss": 0.73526931, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7568388, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6036, + "time_per_iteration": 2.445058822631836 + }, + { + "auxiliary_loss_clip": 0.01124436, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01500189, + "balance_loss_mlp": 1.04274487, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.901551926176817, + "language_loss": 0.75938255, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78091925, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.81640625, + "step": 6037, + "time_per_iteration": 2.422229766845703 + }, + { + "auxiliary_loss_clip": 0.0111661, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0238812, + "balance_loss_mlp": 1.04227912, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6899683541385933, + "language_loss": 0.78120697, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80275297, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6038, + "time_per_iteration": 2.4582855701446533 + }, + { + "auxiliary_loss_clip": 0.0103994, + "auxiliary_loss_mlp": 0.01006466, + "balance_loss_clip": 1.00467765, + "balance_loss_mlp": 1.01452303, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8286249809211084, + "language_loss": 0.63413143, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65459549, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.25390625, + "step": 6039, + "time_per_iteration": 3.1417860984802246 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04391789, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 1.9215128015710738, + "language_loss": 0.70857447, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73013067, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6040, + "time_per_iteration": 2.505627155303955 + }, + { + "auxiliary_loss_clip": 0.0112497, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.01827383, + "balance_loss_mlp": 1.04445744, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.327350689124367, + "language_loss": 0.81322253, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83479762, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6041, + "time_per_iteration": 2.4475231170654297 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.01526928, + "balance_loss_mlp": 1.04150891, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.252727008735842, + "language_loss": 0.83721769, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85872102, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6042, + "time_per_iteration": 2.461111545562744 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.02583623, + "balance_loss_mlp": 1.04390788, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6644116234057968, + "language_loss": 0.78122932, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80283511, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6043, + "time_per_iteration": 2.477030038833618 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.02017403, + "balance_loss_mlp": 1.04266226, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 3.8032713581650515, + "language_loss": 0.65792918, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67945337, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.74609375, + "step": 6044, + "time_per_iteration": 2.471221446990967 + }, + { + "auxiliary_loss_clip": 0.01118191, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01706135, + "balance_loss_mlp": 1.04186332, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.74112377533005, + "language_loss": 0.80978471, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83127558, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6045, + "time_per_iteration": 2.482147693634033 + }, + { + "auxiliary_loss_clip": 0.01122131, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.04342091, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.7414472049280392, + "language_loss": 0.64214617, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66375309, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6046, + "time_per_iteration": 2.593209743499756 + }, + { + "auxiliary_loss_clip": 0.01119542, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02142787, + "balance_loss_mlp": 1.04214859, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.623453692259123, + "language_loss": 0.77366132, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.7952106, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6047, + "time_per_iteration": 2.4650797843933105 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.02609777, + "balance_loss_mlp": 1.04148006, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.508802610673932, + "language_loss": 0.79679012, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81846434, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8359375, + "step": 6048, + "time_per_iteration": 2.5329999923706055 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99839348, + "balance_loss_mlp": 1.0124383, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7564639677567045, + "language_loss": 0.52584642, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54622656, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25390625, + "step": 6049, + "time_per_iteration": 3.1051762104034424 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.02172136, + "balance_loss_mlp": 1.04254675, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0453292842004833, + "language_loss": 0.86365628, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88522977, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6050, + "time_per_iteration": 2.469092845916748 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04309154, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7649295268136813, + "language_loss": 0.7855531, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80711287, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6051, + "time_per_iteration": 2.425166368484497 + }, + { + "auxiliary_loss_clip": 0.0111821, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02044129, + "balance_loss_mlp": 1.04047346, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 2.0280679706971423, + "language_loss": 0.83024764, + "learning_rate": 2.940291602812822e-06, + "loss": 0.8517735, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6052, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02146947, + "balance_loss_mlp": 1.03992438, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 3.055248278017369, + "language_loss": 0.72156489, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74305683, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6053, + "time_per_iteration": 4.030078887939453 + }, + { + "auxiliary_loss_clip": 0.01038499, + "auxiliary_loss_mlp": 0.01000787, + "balance_loss_clip": 0.99893934, + "balance_loss_mlp": 1.01315093, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7695228081579073, + "language_loss": 0.61234874, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63274157, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.25390625, + "step": 6054, + "time_per_iteration": 4.498634576797485 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.0425837, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.9647165397438333, + "language_loss": 0.75846946, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78007108, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6055, + "time_per_iteration": 2.46478271484375 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.0240891, + "balance_loss_mlp": 1.04369521, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.6567803669377452, + "language_loss": 0.75263339, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7742365, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6056, + "time_per_iteration": 2.4739041328430176 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.04331231, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.0844054878938607, + "language_loss": 0.80676425, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82835501, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6057, + "time_per_iteration": 2.4778594970703125 + }, + { + "auxiliary_loss_clip": 0.01119344, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02457452, + "balance_loss_mlp": 1.04333091, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8744131952209395, + "language_loss": 0.79986346, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82144856, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6058, + "time_per_iteration": 2.5267081260681152 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02142191, + "balance_loss_mlp": 1.04207647, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.8448855765347556, + "language_loss": 0.8485254, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87007678, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6059, + "time_per_iteration": 2.4876210689544678 + }, + { + "auxiliary_loss_clip": 0.01123355, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.02527666, + "balance_loss_mlp": 1.04397857, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.4958849024653313, + "language_loss": 0.8783946, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90002865, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6060, + "time_per_iteration": 2.516439199447632 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02583957, + "balance_loss_mlp": 1.04366183, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.6600271028380824, + "language_loss": 0.67965293, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70130551, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6061, + "time_per_iteration": 2.4436440467834473 + }, + { + "auxiliary_loss_clip": 0.01127668, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.02444267, + "balance_loss_mlp": 1.04622734, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.142951671935031, + "language_loss": 0.75072217, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77239573, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 6062, + "time_per_iteration": 2.4325368404388428 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01910567, + "balance_loss_mlp": 1.04460645, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.6782897381106048, + "language_loss": 0.72632384, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74789596, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6063, + "time_per_iteration": 2.498168468475342 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.0249579, + "balance_loss_mlp": 1.04365671, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8702732296649918, + "language_loss": 0.68128121, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70288265, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6064, + "time_per_iteration": 2.4951584339141846 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01046801, + "balance_loss_clip": 1.03205502, + "balance_loss_mlp": 1.04549003, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.679838788119498, + "language_loss": 0.74604851, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76777375, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6065, + "time_per_iteration": 2.4980344772338867 + }, + { + "auxiliary_loss_clip": 0.01125488, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.02544403, + "balance_loss_mlp": 1.04464209, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.8520658730284223, + "language_loss": 0.75248677, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77415788, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6066, + "time_per_iteration": 2.5525264739990234 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02101445, + "balance_loss_mlp": 1.04115653, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.55479391525507, + "language_loss": 0.76988614, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79139876, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6067, + "time_per_iteration": 2.440595865249634 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02422583, + "balance_loss_mlp": 1.04442596, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.1532465459722574, + "language_loss": 0.70826519, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72984099, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6068, + "time_per_iteration": 2.4555468559265137 + }, + { + "auxiliary_loss_clip": 0.01123082, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.04301953, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8428063971352102, + "language_loss": 0.73987395, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76148373, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 6069, + "time_per_iteration": 2.4380593299865723 + }, + { + "auxiliary_loss_clip": 0.01124432, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.0203104, + "balance_loss_mlp": 1.04434299, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.740540431199334, + "language_loss": 0.66149801, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68309319, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 6070, + "time_per_iteration": 2.4852278232574463 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.02225685, + "balance_loss_mlp": 1.04412127, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.5531027619052142, + "language_loss": 0.74474913, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76631367, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6071, + "time_per_iteration": 2.483961820602417 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.04232538, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 2.0347636440980277, + "language_loss": 0.88132894, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90287089, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6072, + "time_per_iteration": 2.4083876609802246 + }, + { + "auxiliary_loss_clip": 0.01121735, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.02184379, + "balance_loss_mlp": 1.04389739, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.230203116909298, + "language_loss": 0.72432441, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74589849, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6073, + "time_per_iteration": 2.4769015312194824 + }, + { + "auxiliary_loss_clip": 0.01123465, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.02520275, + "balance_loss_mlp": 1.04425395, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.8811318432297164, + "language_loss": 0.66584921, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68747932, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6074, + "time_per_iteration": 2.4474194049835205 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01891208, + "balance_loss_mlp": 1.04079318, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.5068114870819531, + "language_loss": 0.72946787, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75097322, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6075, + "time_per_iteration": 2.5063765048980713 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.04484594, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.7314154698808113, + "language_loss": 0.8938573, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91555977, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 6076, + "time_per_iteration": 2.4518303871154785 + }, + { + "auxiliary_loss_clip": 0.01121617, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02438378, + "balance_loss_mlp": 1.04457617, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.2164690925931976, + "language_loss": 0.69506466, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71667087, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6077, + "time_per_iteration": 2.447659730911255 + }, + { + "auxiliary_loss_clip": 0.01043202, + "auxiliary_loss_mlp": 0.01009421, + "balance_loss_clip": 1.00758541, + "balance_loss_mlp": 1.01693892, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7520139059893192, + "language_loss": 0.61798048, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63850671, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.26171875, + "step": 6078, + "time_per_iteration": 3.1669509410858154 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02238369, + "balance_loss_mlp": 1.04217839, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.8851740765331422, + "language_loss": 0.78088033, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80244297, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6079, + "time_per_iteration": 2.4570510387420654 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02517176, + "balance_loss_mlp": 1.04497504, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.129422570654268, + "language_loss": 0.62885886, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65051121, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6080, + "time_per_iteration": 2.65580415725708 + }, + { + "auxiliary_loss_clip": 0.01122781, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.04280567, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.4061972925673385, + "language_loss": 0.67665905, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69823289, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6081, + "time_per_iteration": 2.4747202396392822 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.02287912, + "balance_loss_mlp": 1.04305673, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.9082106177767983, + "language_loss": 0.74747473, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76910245, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 6082, + "time_per_iteration": 2.5238633155822754 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01576877, + "balance_loss_mlp": 1.04598689, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.8091692998669453, + "language_loss": 0.82823056, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84978318, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8046875, + "step": 6083, + "time_per_iteration": 2.517704963684082 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.00026309, + "balance_loss_mlp": 1.01621974, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8152809684063654, + "language_loss": 0.59372437, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61416495, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25585938, + "step": 6084, + "time_per_iteration": 3.126275062561035 + }, + { + "auxiliary_loss_clip": 0.01121734, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02273536, + "balance_loss_mlp": 1.04410744, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.719357970509058, + "language_loss": 0.73096633, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75255334, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6085, + "time_per_iteration": 2.436722755432129 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01485801, + "balance_loss_mlp": 1.0447793, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 4.360512376704014, + "language_loss": 0.7831111, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80462652, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 6086, + "time_per_iteration": 2.557521104812622 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0150919, + "balance_loss_mlp": 1.0403074, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.7974113126538098, + "language_loss": 0.77105325, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79248881, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6087, + "time_per_iteration": 2.544868230819702 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01587856, + "balance_loss_mlp": 1.04190612, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 5.741725291334025, + "language_loss": 0.70710862, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72863311, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6088, + "time_per_iteration": 2.491933822631836 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 1.04569137, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.6695945607154594, + "language_loss": 0.79878473, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82043338, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 6089, + "time_per_iteration": 2.666814088821411 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01816463, + "balance_loss_mlp": 1.04267049, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.7190941707632215, + "language_loss": 0.71335226, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73486418, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 6090, + "time_per_iteration": 2.5138063430786133 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02757084, + "balance_loss_mlp": 1.04391527, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.9784029627642763, + "language_loss": 0.74276829, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76437145, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6091, + "time_per_iteration": 2.437126636505127 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.04396391, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.8707748404117035, + "language_loss": 0.72492194, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74652249, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6092, + "time_per_iteration": 2.5038540363311768 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03072441, + "balance_loss_mlp": 1.04359424, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9548617375197639, + "language_loss": 0.78251863, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.8041966, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6093, + "time_per_iteration": 2.453854560852051 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.02226686, + "balance_loss_mlp": 1.04095936, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.7535936892187265, + "language_loss": 0.74123377, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76279384, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 6094, + "time_per_iteration": 2.5953075885772705 + }, + { + "auxiliary_loss_clip": 0.01125058, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.0381875, + "balance_loss_mlp": 1.04492939, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.5564182913572622, + "language_loss": 0.79226458, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81404281, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80078125, + "step": 6095, + "time_per_iteration": 5.4338037967681885 + }, + { + "auxiliary_loss_clip": 0.01125087, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.02055264, + "balance_loss_mlp": 1.04422212, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.287741364035224, + "language_loss": 0.73586392, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75747252, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 6096, + "time_per_iteration": 3.923590660095215 + }, + { + "auxiliary_loss_clip": 0.0112257, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.0195781, + "balance_loss_mlp": 1.04206252, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.2038030169597875, + "language_loss": 0.67285162, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69441259, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6097, + "time_per_iteration": 2.4843504428863525 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.0249629, + "balance_loss_mlp": 1.04401898, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.6404590263223953, + "language_loss": 0.77676886, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79839253, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 6098, + "time_per_iteration": 2.5663979053497314 + }, + { + "auxiliary_loss_clip": 0.0111895, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.02467644, + "balance_loss_mlp": 1.04334557, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.7512654587161538, + "language_loss": 0.73807114, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.7596488, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6099, + "time_per_iteration": 2.442549705505371 + }, + { + "auxiliary_loss_clip": 0.01116483, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.03380322, + "balance_loss_mlp": 1.04073739, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.739052204204903, + "language_loss": 0.84383607, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86547315, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6100, + "time_per_iteration": 2.4783878326416016 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02046633, + "balance_loss_mlp": 1.04215789, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.450199870045222, + "language_loss": 0.70504647, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72663701, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 6101, + "time_per_iteration": 2.4591257572174072 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.02629983, + "balance_loss_mlp": 1.04228854, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0513606804107543, + "language_loss": 0.76049435, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78212953, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.79296875, + "step": 6102, + "time_per_iteration": 2.491046190261841 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04445052, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.6383228145690705, + "language_loss": 0.69930172, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72093487, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 6103, + "time_per_iteration": 2.676790952682495 + }, + { + "auxiliary_loss_clip": 0.01121704, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02213061, + "balance_loss_mlp": 1.0423454, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8701272650505458, + "language_loss": 0.71414149, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73572791, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6104, + "time_per_iteration": 2.438197374343872 + }, + { + "auxiliary_loss_clip": 0.01121263, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.04288161, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.0275913231037923, + "language_loss": 0.81653488, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83807302, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6105, + "time_per_iteration": 2.437201976776123 + }, + { + "auxiliary_loss_clip": 0.0112675, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.02255476, + "balance_loss_mlp": 1.0441767, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7477833912391936, + "language_loss": 0.81079835, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83243787, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 6106, + "time_per_iteration": 2.5447771549224854 + }, + { + "auxiliary_loss_clip": 0.01041229, + "auxiliary_loss_mlp": 0.0100622, + "balance_loss_clip": 1.00440836, + "balance_loss_mlp": 1.01511836, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6829750500510474, + "language_loss": 0.59212124, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.6125958, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.26171875, + "step": 6107, + "time_per_iteration": 3.0983083248138428 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.01756859, + "balance_loss_mlp": 1.04195333, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.5524752326282045, + "language_loss": 0.74417794, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7656877, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6108, + "time_per_iteration": 2.5146114826202393 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02218354, + "balance_loss_mlp": 1.04104972, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.0732100862766294, + "language_loss": 0.73141801, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7529856, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 6109, + "time_per_iteration": 2.4597368240356445 + }, + { + "auxiliary_loss_clip": 0.01118669, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02015734, + "balance_loss_mlp": 1.0407654, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.8280489650426288, + "language_loss": 0.53282952, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55435723, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6110, + "time_per_iteration": 2.5454814434051514 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.01949728, + "balance_loss_mlp": 1.04360104, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4515242715586747, + "language_loss": 0.8026799, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82422882, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76953125, + "step": 6111, + "time_per_iteration": 2.4838016033172607 + }, + { + "auxiliary_loss_clip": 0.01119124, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02637434, + "balance_loss_mlp": 1.04195952, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7574831080907656, + "language_loss": 0.72220403, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74380273, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6112, + "time_per_iteration": 2.590109348297119 + }, + { + "auxiliary_loss_clip": 0.01120572, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.04220295, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6166739673118746, + "language_loss": 0.85398543, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87558413, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6113, + "time_per_iteration": 2.4480674266815186 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.02228022, + "balance_loss_mlp": 1.04214144, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.8814317352542869, + "language_loss": 0.78741604, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80901164, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 6114, + "time_per_iteration": 2.4870779514312744 + }, + { + "auxiliary_loss_clip": 0.01125295, + "auxiliary_loss_mlp": 0.01044195, + "balance_loss_clip": 1.0278033, + "balance_loss_mlp": 1.04344988, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.5830307408310422, + "language_loss": 0.66854429, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69023919, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 6115, + "time_per_iteration": 2.4361841678619385 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.01953745, + "balance_loss_mlp": 1.03984118, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.406761648754093, + "language_loss": 0.76663208, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78811574, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6116, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01119646, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.04111099, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.9705222106020779, + "language_loss": 0.62811542, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64971662, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 6117, + "time_per_iteration": 2.443798065185547 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.019261, + "balance_loss_mlp": 1.04137671, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9131647495504847, + "language_loss": 0.72974634, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75126612, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6118, + "time_per_iteration": 2.531804084777832 + }, + { + "auxiliary_loss_clip": 0.01123956, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.02694678, + "balance_loss_mlp": 1.04156733, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.002097677722335, + "language_loss": 0.72413695, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7457996, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 6119, + "time_per_iteration": 2.4641144275665283 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.0179317, + "balance_loss_mlp": 1.04397964, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.84976209385018, + "language_loss": 0.79848421, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82002181, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6120, + "time_per_iteration": 2.487030029296875 + }, + { + "auxiliary_loss_clip": 0.01117761, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.02612031, + "balance_loss_mlp": 1.04084468, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8961465807450149, + "language_loss": 0.63855267, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66013169, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6121, + "time_per_iteration": 2.4573564529418945 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.02755642, + "balance_loss_mlp": 1.0431416, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.8845840511442051, + "language_loss": 0.71209222, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73374552, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6122, + "time_per_iteration": 2.5197854042053223 + }, + { + "auxiliary_loss_clip": 0.01116909, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02091694, + "balance_loss_mlp": 1.04319501, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8566190114316727, + "language_loss": 0.69493115, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71644878, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6123, + "time_per_iteration": 2.5585381984710693 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.04312396, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.995367064863914, + "language_loss": 0.73392212, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.7555719, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6124, + "time_per_iteration": 2.56925368309021 + }, + { + "auxiliary_loss_clip": 0.01121929, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02465916, + "balance_loss_mlp": 1.04337013, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.997016319446362, + "language_loss": 0.74426562, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76589334, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.7890625, + "step": 6125, + "time_per_iteration": 2.493232488632202 + }, + { + "auxiliary_loss_clip": 0.01124729, + "auxiliary_loss_mlp": 0.01046169, + "balance_loss_clip": 1.03009367, + "balance_loss_mlp": 1.04400194, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.8135805598812564, + "language_loss": 0.78254056, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80424947, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6126, + "time_per_iteration": 2.4767327308654785 + }, + { + "auxiliary_loss_clip": 0.01123227, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.02125943, + "balance_loss_mlp": 1.04164457, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.275366104968191, + "language_loss": 0.66100526, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68261528, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.81640625, + "step": 6127, + "time_per_iteration": 2.4442801475524902 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02783585, + "balance_loss_mlp": 1.04527378, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.809419798014635, + "language_loss": 0.70553637, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72722864, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6128, + "time_per_iteration": 2.6163570880889893 + }, + { + "auxiliary_loss_clip": 0.01121361, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.0266788, + "balance_loss_mlp": 1.04374862, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.366686546837111, + "language_loss": 0.75425905, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77588773, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6129, + "time_per_iteration": 2.418318510055542 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.02652466, + "balance_loss_mlp": 1.0419023, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.583632674026135, + "language_loss": 0.84801334, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86962497, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6130, + "time_per_iteration": 2.4933249950408936 + }, + { + "auxiliary_loss_clip": 0.01041681, + "auxiliary_loss_mlp": 0.010081, + "balance_loss_clip": 1.00623989, + "balance_loss_mlp": 1.01602125, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8093683158704721, + "language_loss": 0.60352623, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62402403, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2578125, + "step": 6131, + "time_per_iteration": 3.1686718463897705 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01657534, + "balance_loss_mlp": 1.04083943, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5510352980860918, + "language_loss": 0.72903317, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75052321, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6132, + "time_per_iteration": 2.54154109954834 + }, + { + "auxiliary_loss_clip": 0.01124361, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02221942, + "balance_loss_mlp": 1.04263651, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.665822939326855, + "language_loss": 0.74255228, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76417446, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.81640625, + "step": 6133, + "time_per_iteration": 2.501119375228882 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.02420318, + "balance_loss_mlp": 1.04308438, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.60564703390979, + "language_loss": 0.71415824, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73572183, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6134, + "time_per_iteration": 2.472978353500366 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.0202322, + "balance_loss_mlp": 1.04333591, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.501197032587339, + "language_loss": 0.74985242, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77141684, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.77734375, + "step": 6135, + "time_per_iteration": 2.458523750305176 + }, + { + "auxiliary_loss_clip": 0.01043215, + "auxiliary_loss_mlp": 0.01004045, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.01762199, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8063752733434837, + "language_loss": 0.5878793, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60835183, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.25585938, + "step": 6136, + "time_per_iteration": 2.9917385578155518 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.02095878, + "balance_loss_mlp": 1.04477668, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8816926848284692, + "language_loss": 0.78812146, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80970407, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6137, + "time_per_iteration": 6.900243520736694 + }, + { + "auxiliary_loss_clip": 0.01122666, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.02594304, + "balance_loss_mlp": 1.04392326, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0278297083458345, + "language_loss": 0.74142605, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6138, + "time_per_iteration": 2.5056889057159424 + }, + { + "auxiliary_loss_clip": 0.01127012, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.02365959, + "balance_loss_mlp": 1.04482222, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.957735157830462, + "language_loss": 0.64818108, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66984075, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6139, + "time_per_iteration": 2.5345380306243896 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04279661, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0918485574433734, + "language_loss": 0.71384197, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73543906, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6140, + "time_per_iteration": 2.4318323135375977 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.00999596, + "balance_loss_clip": 0.99771231, + "balance_loss_mlp": 1.01712704, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7479140823872853, + "language_loss": 0.59281325, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61323869, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.2578125, + "step": 6141, + "time_per_iteration": 3.1505937576293945 + }, + { + "auxiliary_loss_clip": 0.01122987, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02499199, + "balance_loss_mlp": 1.04369187, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.768624510630746, + "language_loss": 0.7473368, + "learning_rate": 2.909212678216192e-06, + "loss": 0.76896417, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6142, + "time_per_iteration": 2.4768457412719727 + }, + { + "auxiliary_loss_clip": 0.01119694, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.02291358, + "balance_loss_mlp": 1.04270506, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.5385068391341603, + "language_loss": 0.76985848, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79142308, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6143, + "time_per_iteration": 2.4604313373565674 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02289248, + "balance_loss_mlp": 1.04277074, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4994482416842545, + "language_loss": 0.81616801, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.83771598, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6144, + "time_per_iteration": 2.529298782348633 + }, + { + "auxiliary_loss_clip": 0.0112261, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.02425694, + "balance_loss_mlp": 1.04323006, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 1.9122738225408384, + "language_loss": 0.77019674, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79180729, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.79296875, + "step": 6145, + "time_per_iteration": 2.4642515182495117 + }, + { + "auxiliary_loss_clip": 0.01123051, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.01760387, + "balance_loss_mlp": 1.04384804, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7518336089815172, + "language_loss": 0.76903462, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79058653, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.79296875, + "step": 6146, + "time_per_iteration": 2.49208927154541 + }, + { + "auxiliary_loss_clip": 0.01125412, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02746034, + "balance_loss_mlp": 1.04481673, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7861503855196468, + "language_loss": 0.80794239, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82962638, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6147, + "time_per_iteration": 2.417968988418579 + }, + { + "auxiliary_loss_clip": 0.01120028, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.0238626, + "balance_loss_mlp": 1.04083371, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7356953572419536, + "language_loss": 0.83196342, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85353833, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.79296875, + "step": 6148, + "time_per_iteration": 2.4493086338043213 + }, + { + "auxiliary_loss_clip": 0.01118838, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.02424645, + "balance_loss_mlp": 1.04304922, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.337121678381176, + "language_loss": 0.74373478, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76530743, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6149, + "time_per_iteration": 2.4594686031341553 + }, + { + "auxiliary_loss_clip": 0.01124701, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02210915, + "balance_loss_mlp": 1.04449439, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.7562888589836316, + "language_loss": 0.70538592, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72701365, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6150, + "time_per_iteration": 2.5232975482940674 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04390609, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.6469943204532072, + "language_loss": 0.82023048, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84183264, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6151, + "time_per_iteration": 2.448066473007202 + }, + { + "auxiliary_loss_clip": 0.01036606, + "auxiliary_loss_mlp": 0.01001329, + "balance_loss_clip": 0.99951726, + "balance_loss_mlp": 1.01119328, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.838014312453704, + "language_loss": 0.63083476, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65121406, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 6152, + "time_per_iteration": 3.170707941055298 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.02347398, + "balance_loss_mlp": 1.0429337, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 1.8166659348284784, + "language_loss": 0.70360208, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72515202, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6153, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01123537, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02420688, + "balance_loss_mlp": 1.04319179, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.0600031325492107, + "language_loss": 0.72201782, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74364597, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6154, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0111958, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01624274, + "balance_loss_mlp": 1.04201758, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.8383479148193087, + "language_loss": 0.67877179, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70026708, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6155, + "time_per_iteration": 2.454582929611206 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.01518905, + "balance_loss_mlp": 1.0420723, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.7213710867444976, + "language_loss": 0.67835188, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.6998316, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6156, + "time_per_iteration": 2.456244707107544 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.02461255, + "balance_loss_mlp": 1.04180884, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7871024658649661, + "language_loss": 0.82324016, + "learning_rate": 2.904005448099916e-06, + "loss": 0.8447994, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6157, + "time_per_iteration": 2.467258930206299 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.04224074, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.319348977212497, + "language_loss": 0.76519799, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78679597, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6158, + "time_per_iteration": 2.4462850093841553 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.02276468, + "balance_loss_mlp": 1.04128695, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.3237426114128903, + "language_loss": 0.6888833, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71047246, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 6159, + "time_per_iteration": 2.444615364074707 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.02298164, + "balance_loss_mlp": 1.04054952, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7829911261722147, + "language_loss": 0.7101602, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73170245, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 6160, + "time_per_iteration": 2.4807472229003906 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01808465, + "balance_loss_mlp": 1.04033566, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.5671410195286926, + "language_loss": 0.79049259, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81194532, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6161, + "time_per_iteration": 2.445615768432617 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.02266204, + "balance_loss_mlp": 1.04217172, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.6578530571842398, + "language_loss": 0.7961942, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81776464, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6162, + "time_per_iteration": 2.474179267883301 + }, + { + "auxiliary_loss_clip": 0.01118518, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.02122831, + "balance_loss_mlp": 1.04136944, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.751569507310971, + "language_loss": 0.79592955, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81746811, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6163, + "time_per_iteration": 2.429410696029663 + }, + { + "auxiliary_loss_clip": 0.01121642, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.01815772, + "balance_loss_mlp": 1.04239571, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6995697719291154, + "language_loss": 0.68002689, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70157188, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6164, + "time_per_iteration": 2.4500439167022705 + }, + { + "auxiliary_loss_clip": 0.01125233, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.0206207, + "balance_loss_mlp": 1.04507017, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.4697759057606197, + "language_loss": 0.82807398, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.84968388, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6165, + "time_per_iteration": 2.4863715171813965 + }, + { + "auxiliary_loss_clip": 0.01125688, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02060854, + "balance_loss_mlp": 1.04388845, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.8224972170046692, + "language_loss": 0.69500774, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71663356, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.81640625, + "step": 6166, + "time_per_iteration": 2.560605049133301 + }, + { + "auxiliary_loss_clip": 0.01038031, + "auxiliary_loss_mlp": 0.01006399, + "balance_loss_clip": 1.00471771, + "balance_loss_mlp": 1.01302195, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.8093247029889314, + "language_loss": 0.56892115, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58936548, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6167, + "time_per_iteration": 2.922917127609253 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01992154, + "balance_loss_mlp": 1.04288507, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.945139483069219, + "language_loss": 0.75539452, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77691436, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6168, + "time_per_iteration": 2.4489872455596924 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.02202857, + "balance_loss_mlp": 1.04180634, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.5586684776543853, + "language_loss": 0.7432459, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76480508, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6169, + "time_per_iteration": 2.4537463188171387 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01891184, + "balance_loss_mlp": 1.04480267, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.3706540261028175, + "language_loss": 0.79311681, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81465161, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6170, + "time_per_iteration": 2.4723992347717285 + }, + { + "auxiliary_loss_clip": 0.01122845, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.02390242, + "balance_loss_mlp": 1.04451621, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.6235616399590074, + "language_loss": 0.76385272, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78546989, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6171, + "time_per_iteration": 2.5364768505096436 + }, + { + "auxiliary_loss_clip": 0.01123724, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.01663446, + "balance_loss_mlp": 1.04594254, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9768297571305458, + "language_loss": 0.80696416, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82852054, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6172, + "time_per_iteration": 2.451099395751953 + }, + { + "auxiliary_loss_clip": 0.01124197, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.0219543, + "balance_loss_mlp": 1.04385138, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.2157067962534875, + "language_loss": 0.59447742, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61609542, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 6173, + "time_per_iteration": 2.5750677585601807 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.02122533, + "balance_loss_mlp": 1.04391754, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.9248503394254857, + "language_loss": 0.81157243, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83315188, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6174, + "time_per_iteration": 2.421182155609131 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.02035165, + "balance_loss_mlp": 1.04281855, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.8542839121663495, + "language_loss": 0.79834068, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81985891, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6175, + "time_per_iteration": 2.533447027206421 + }, + { + "auxiliary_loss_clip": 0.01124428, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.03194535, + "balance_loss_mlp": 1.04644537, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.6734071315129293, + "language_loss": 0.88764346, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90935433, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6176, + "time_per_iteration": 2.486224412918091 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02174938, + "balance_loss_mlp": 1.04402244, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5809846817738957, + "language_loss": 0.73293233, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75451624, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6177, + "time_per_iteration": 2.492033004760742 + }, + { + "auxiliary_loss_clip": 0.01119881, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04359818, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8832415058442271, + "language_loss": 0.75425023, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77584344, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6178, + "time_per_iteration": 4.005537748336792 + }, + { + "auxiliary_loss_clip": 0.01123036, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.01896191, + "balance_loss_mlp": 1.04618645, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.761738877644596, + "language_loss": 0.7228415, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74440265, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6179, + "time_per_iteration": 5.333393812179565 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.01987052, + "balance_loss_mlp": 1.04356897, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.1666258639633518, + "language_loss": 0.69705212, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71862751, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6180, + "time_per_iteration": 2.4896974563598633 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.01695561, + "balance_loss_mlp": 1.04157031, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7756296340851163, + "language_loss": 0.77702844, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79851079, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6181, + "time_per_iteration": 2.4324231147766113 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.0202775, + "balance_loss_mlp": 1.04225945, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.8526172549307973, + "language_loss": 0.78767365, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80920726, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6182, + "time_per_iteration": 2.47566819190979 + }, + { + "auxiliary_loss_clip": 0.01036072, + "auxiliary_loss_mlp": 0.01008449, + "balance_loss_clip": 1.00650644, + "balance_loss_mlp": 1.01082778, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7841437663574693, + "language_loss": 0.5748502, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59529543, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25195312, + "step": 6183, + "time_per_iteration": 3.0538721084594727 + }, + { + "auxiliary_loss_clip": 0.01124733, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02772832, + "balance_loss_mlp": 1.04238844, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.1996761862640715, + "language_loss": 0.76940209, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79108441, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.82421875, + "step": 6184, + "time_per_iteration": 2.4653987884521484 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.02268612, + "balance_loss_mlp": 1.04353404, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 13.965274526936179, + "language_loss": 0.72047049, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74203539, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6185, + "time_per_iteration": 2.458340644836426 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.01282895, + "balance_loss_mlp": 1.04169369, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.55661462109525, + "language_loss": 0.7702297, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79167652, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6186, + "time_per_iteration": 2.4665393829345703 + }, + { + "auxiliary_loss_clip": 0.01125099, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.0436089, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8483894715485976, + "language_loss": 0.83475709, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85642433, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8125, + "step": 6187, + "time_per_iteration": 2.520294427871704 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02228546, + "balance_loss_mlp": 1.0421021, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 2.555128723697134, + "language_loss": 0.84544367, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86700106, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6188, + "time_per_iteration": 2.4926793575286865 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01966548, + "balance_loss_mlp": 1.04392672, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6829112555225307, + "language_loss": 0.65646267, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67802715, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7734375, + "step": 6189, + "time_per_iteration": 2.447175979614258 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.02514815, + "balance_loss_mlp": 1.04456878, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.6073714147883162, + "language_loss": 0.83948457, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.8611058, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6190, + "time_per_iteration": 2.4410126209259033 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.01949084, + "balance_loss_mlp": 1.04337156, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.3404623023220643, + "language_loss": 0.88506198, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90665835, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 6191, + "time_per_iteration": 2.452972650527954 + }, + { + "auxiliary_loss_clip": 0.01123549, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.01648057, + "balance_loss_mlp": 1.04218102, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.570395080331924, + "language_loss": 0.74228191, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76384884, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8125, + "step": 6192, + "time_per_iteration": 2.6486353874206543 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02246475, + "balance_loss_mlp": 1.0427109, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.4820365699908944, + "language_loss": 0.79760754, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81916732, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6193, + "time_per_iteration": 2.525973081588745 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.02436423, + "balance_loss_mlp": 1.043504, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7895472081978328, + "language_loss": 0.84495157, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86657262, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6194, + "time_per_iteration": 2.419099807739258 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02080166, + "balance_loss_mlp": 1.04037666, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.9207659578016463, + "language_loss": 0.77555239, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79708451, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 6195, + "time_per_iteration": 2.3995044231414795 + }, + { + "auxiliary_loss_clip": 0.01119279, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01814234, + "balance_loss_mlp": 1.0428412, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.677102671463593, + "language_loss": 0.79111922, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81263697, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 6196, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.02235723, + "balance_loss_mlp": 1.04315817, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.8393036550873767, + "language_loss": 0.8332746, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85483867, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6197, + "time_per_iteration": 2.392005443572998 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0202899, + "balance_loss_mlp": 1.04070568, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 2.267147370646453, + "language_loss": 0.64613056, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66764355, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 6198, + "time_per_iteration": 2.4624876976013184 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.02989507, + "balance_loss_mlp": 1.04129016, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 2.4815957641530084, + "language_loss": 0.7439245, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76552987, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6199, + "time_per_iteration": 2.454932689666748 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.04112601, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.569210214205425, + "language_loss": 0.80711329, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82861221, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 6200, + "time_per_iteration": 2.853854179382324 + }, + { + "auxiliary_loss_clip": 0.01118801, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.02588272, + "balance_loss_mlp": 1.04248428, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 2.046105641958108, + "language_loss": 0.60723466, + "learning_rate": 2.88868657651991e-06, + "loss": 0.6288271, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6201, + "time_per_iteration": 2.58642315864563 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01813745, + "balance_loss_mlp": 1.04334736, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.5967185311646992, + "language_loss": 0.72980845, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75135767, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6202, + "time_per_iteration": 2.461116075515747 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.04372942, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.8761852736669793, + "language_loss": 0.739654, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76120287, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6203, + "time_per_iteration": 2.4199976921081543 + }, + { + "auxiliary_loss_clip": 0.01113815, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.03933048, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6894031212763305, + "language_loss": 0.81359541, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83506644, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 6204, + "time_per_iteration": 2.527442693710327 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.02538753, + "balance_loss_mlp": 1.04287875, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.5818895271767701, + "language_loss": 0.75028086, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77190769, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6205, + "time_per_iteration": 2.515028953552246 + }, + { + "auxiliary_loss_clip": 0.01118084, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02002704, + "balance_loss_mlp": 1.04183412, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8699710225203796, + "language_loss": 0.78044879, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80197906, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.76171875, + "step": 6206, + "time_per_iteration": 2.433136224746704 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.04182768, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.1358392378140487, + "language_loss": 0.93595111, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95747221, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6207, + "time_per_iteration": 2.422592878341675 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01774943, + "balance_loss_mlp": 1.04154027, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.238385364236049, + "language_loss": 0.82666922, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84819084, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6208, + "time_per_iteration": 2.5171287059783936 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01911068, + "balance_loss_mlp": 1.04320371, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 1.7601988102738153, + "language_loss": 0.73197794, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75355148, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6209, + "time_per_iteration": 2.480943202972412 + }, + { + "auxiliary_loss_clip": 0.01120081, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02178252, + "balance_loss_mlp": 1.0430553, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.4781766070975684, + "language_loss": 0.69951272, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72108591, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6210, + "time_per_iteration": 2.5063016414642334 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01533842, + "balance_loss_mlp": 1.04171228, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.960293983782413, + "language_loss": 0.77729124, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79881245, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6211, + "time_per_iteration": 2.4845266342163086 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.0240593, + "balance_loss_mlp": 1.04219186, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.9911666037414828, + "language_loss": 0.73026669, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75187218, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6212, + "time_per_iteration": 2.615323066711426 + }, + { + "auxiliary_loss_clip": 0.01130473, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.03231955, + "balance_loss_mlp": 1.04560018, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 4.00760557025762, + "language_loss": 0.81895888, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84074175, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84765625, + "step": 6213, + "time_per_iteration": 2.4621500968933105 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.04143643, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.2631910468903014, + "language_loss": 0.7890203, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81060612, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6214, + "time_per_iteration": 2.5582997798919678 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.02859902, + "balance_loss_mlp": 1.04069364, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.7789401165216012, + "language_loss": 0.84881294, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87041962, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6215, + "time_per_iteration": 2.6216535568237305 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.02595592, + "balance_loss_mlp": 1.04088581, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 5.614431195109344, + "language_loss": 0.67669535, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69832802, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80859375, + "step": 6216, + "time_per_iteration": 2.4592814445495605 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.02500176, + "balance_loss_mlp": 1.04252148, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.041107256757408, + "language_loss": 0.65695626, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67857617, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6217, + "time_per_iteration": 2.50801420211792 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.02378845, + "balance_loss_mlp": 1.04290843, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 3.2488334570714725, + "language_loss": 0.80776107, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82938731, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80078125, + "step": 6218, + "time_per_iteration": 2.469524383544922 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.01851249, + "balance_loss_mlp": 1.04241216, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.3682227753048604, + "language_loss": 0.78710622, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80860579, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.74609375, + "step": 6219, + "time_per_iteration": 2.595862627029419 + }, + { + "auxiliary_loss_clip": 0.01119648, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.02776265, + "balance_loss_mlp": 1.0430454, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 2.1916352692915217, + "language_loss": 0.76985866, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79148126, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6220, + "time_per_iteration": 6.68864631652832 + }, + { + "auxiliary_loss_clip": 0.01120187, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02635062, + "balance_loss_mlp": 1.04149485, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.921342744454882, + "language_loss": 0.82958305, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85120487, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6221, + "time_per_iteration": 3.9474618434906006 + }, + { + "auxiliary_loss_clip": 0.0111979, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.02852452, + "balance_loss_mlp": 1.04195023, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6461952088047174, + "language_loss": 0.75817096, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.7797966, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6222, + "time_per_iteration": 2.43192720413208 + }, + { + "auxiliary_loss_clip": 0.01121141, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.0191592, + "balance_loss_mlp": 1.04333961, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.6728060456550218, + "language_loss": 0.70215583, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72370636, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.77734375, + "step": 6223, + "time_per_iteration": 2.4719529151916504 + }, + { + "auxiliary_loss_clip": 0.01123096, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.04556298, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.209456781749309, + "language_loss": 0.69100869, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71258163, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6224, + "time_per_iteration": 2.6382336616516113 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.01942348, + "balance_loss_mlp": 1.04488885, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.8205395187863704, + "language_loss": 0.69828689, + "learning_rate": 2.880303258086228e-06, + "loss": 0.71983123, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6225, + "time_per_iteration": 2.501041889190674 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.02376127, + "balance_loss_mlp": 1.04357982, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.305559014636685, + "language_loss": 0.79056358, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81214118, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 6226, + "time_per_iteration": 2.485196113586426 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.02517128, + "balance_loss_mlp": 1.04342556, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.1155280603994546, + "language_loss": 0.68059194, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70221007, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6227, + "time_per_iteration": 2.553396463394165 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02044404, + "balance_loss_mlp": 1.04391932, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.719573737271176, + "language_loss": 0.82955533, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85109973, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6228, + "time_per_iteration": 2.449979305267334 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.0244565, + "balance_loss_mlp": 1.0452075, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.610770216359874, + "language_loss": 0.74802738, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76962447, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6229, + "time_per_iteration": 2.4768621921539307 + }, + { + "auxiliary_loss_clip": 0.01121137, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.02384853, + "balance_loss_mlp": 1.04209936, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.8233250091751425, + "language_loss": 0.83350682, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85510933, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6230, + "time_per_iteration": 2.4503889083862305 + }, + { + "auxiliary_loss_clip": 0.01125186, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.02383518, + "balance_loss_mlp": 1.04665947, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8327028169227884, + "language_loss": 0.73589134, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75753438, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6231, + "time_per_iteration": 2.5793888568878174 + }, + { + "auxiliary_loss_clip": 0.01126351, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.02754807, + "balance_loss_mlp": 1.04669595, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.0748427868287536, + "language_loss": 0.72982037, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75151008, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6232, + "time_per_iteration": 2.5400028228759766 + }, + { + "auxiliary_loss_clip": 0.01120736, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.01927304, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.7557793199484253, + "language_loss": 0.77042818, + "learning_rate": 2.877504536769561e-06, + "loss": 0.791982, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6233, + "time_per_iteration": 2.6110641956329346 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.02521205, + "balance_loss_mlp": 1.04520559, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.733253645903673, + "language_loss": 0.68936831, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71100628, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6234, + "time_per_iteration": 2.4476797580718994 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.02685833, + "balance_loss_mlp": 1.04514599, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.8436539021155727, + "language_loss": 0.82329285, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84491062, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 6235, + "time_per_iteration": 2.4766016006469727 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.01823175, + "balance_loss_mlp": 1.04744995, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8082481713782126, + "language_loss": 0.77776909, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79937214, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6236, + "time_per_iteration": 2.440678596496582 + }, + { + "auxiliary_loss_clip": 0.01124108, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.03139293, + "balance_loss_mlp": 1.04308259, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0063576687211704, + "language_loss": 0.73203218, + "learning_rate": 2.876104377085234e-06, + "loss": 0.7537601, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.80859375, + "step": 6237, + "time_per_iteration": 2.5782086849212646 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02257562, + "balance_loss_mlp": 1.04084682, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.2861902523152935, + "language_loss": 0.93017888, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.9517675, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6238, + "time_per_iteration": 2.514997720718384 + }, + { + "auxiliary_loss_clip": 0.01121834, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.01821709, + "balance_loss_mlp": 1.04316592, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9811721217026943, + "language_loss": 0.71066076, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73221493, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6239, + "time_per_iteration": 2.5054962635040283 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01659262, + "balance_loss_mlp": 1.04635918, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.6550300124553972, + "language_loss": 0.6566934, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67827761, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6240, + "time_per_iteration": 2.5776519775390625 + }, + { + "auxiliary_loss_clip": 0.01124905, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01703799, + "balance_loss_mlp": 1.04560649, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.0148493018475877, + "language_loss": 0.75634778, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77791047, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 6241, + "time_per_iteration": 2.503861904144287 + }, + { + "auxiliary_loss_clip": 0.01123464, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.04321361, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.5579725641576876, + "language_loss": 0.83610159, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85773861, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.80078125, + "step": 6242, + "time_per_iteration": 2.4933042526245117 + }, + { + "auxiliary_loss_clip": 0.01122935, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.02435803, + "balance_loss_mlp": 1.04265308, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.190530656574709, + "language_loss": 0.67888391, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70049673, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6243, + "time_per_iteration": 2.543820381164551 + }, + { + "auxiliary_loss_clip": 0.01121963, + "auxiliary_loss_mlp": 0.01038078, + "balance_loss_clip": 1.02241397, + "balance_loss_mlp": 1.04404676, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7974063962239055, + "language_loss": 0.84275806, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.86435848, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6244, + "time_per_iteration": 2.4710450172424316 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02229667, + "balance_loss_mlp": 1.0436101, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.387588700969948, + "language_loss": 0.83019805, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85175467, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6245, + "time_per_iteration": 2.4594197273254395 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.02207565, + "balance_loss_mlp": 1.04337263, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 1.94802763897559, + "language_loss": 0.64043313, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66203153, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6246, + "time_per_iteration": 2.4522809982299805 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0231837, + "balance_loss_mlp": 1.04382014, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7195896287931138, + "language_loss": 0.75146973, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77310807, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6247, + "time_per_iteration": 2.4527103900909424 + }, + { + "auxiliary_loss_clip": 0.01122539, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02281308, + "balance_loss_mlp": 1.04276609, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.472354315090956, + "language_loss": 0.55157161, + "learning_rate": 2.872251199697598e-06, + "loss": 0.5731746, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6248, + "time_per_iteration": 2.4399521350860596 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.02334976, + "balance_loss_mlp": 1.04241502, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.875026035710993, + "language_loss": 0.84247208, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86404997, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6249, + "time_per_iteration": 2.529763698577881 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.018713, + "balance_loss_mlp": 1.0427655, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.7253468577749267, + "language_loss": 0.68124413, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70278323, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6250, + "time_per_iteration": 2.572439193725586 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.02791047, + "balance_loss_mlp": 1.04538727, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.0419035804756716, + "language_loss": 0.77633286, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79799771, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6251, + "time_per_iteration": 2.58437442779541 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.01910138, + "balance_loss_mlp": 1.04232824, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.137051103462404, + "language_loss": 0.58463252, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60616934, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6252, + "time_per_iteration": 2.6117262840270996 + }, + { + "auxiliary_loss_clip": 0.01124494, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.02730918, + "balance_loss_mlp": 1.04393482, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.9959533965383836, + "language_loss": 0.89689183, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91856694, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 6253, + "time_per_iteration": 2.5241925716400146 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.0227623, + "balance_loss_mlp": 1.04618073, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9568868773694639, + "language_loss": 0.76368916, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78528988, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6254, + "time_per_iteration": 2.44631028175354 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.02906847, + "balance_loss_mlp": 1.04640615, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.019237604940679, + "language_loss": 0.61830014, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6400153, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6255, + "time_per_iteration": 2.474303960800171 + }, + { + "auxiliary_loss_clip": 0.01125813, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.02204537, + "balance_loss_mlp": 1.0434109, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.4357923747979675, + "language_loss": 0.74234015, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76397753, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.82421875, + "step": 6256, + "time_per_iteration": 2.4332830905914307 + }, + { + "auxiliary_loss_clip": 0.01129168, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.03029239, + "balance_loss_mlp": 1.04842019, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.807318668329893, + "language_loss": 0.70297635, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72472662, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80859375, + "step": 6257, + "time_per_iteration": 2.600249767303467 + }, + { + "auxiliary_loss_clip": 0.01123849, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01998889, + "balance_loss_mlp": 1.04582894, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.8628634379537026, + "language_loss": 0.84647095, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86805254, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6258, + "time_per_iteration": 2.443833351135254 + }, + { + "auxiliary_loss_clip": 0.01122949, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.02936888, + "balance_loss_mlp": 1.04430962, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.514941849696829, + "language_loss": 0.81009686, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83176237, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6259, + "time_per_iteration": 2.5727832317352295 + }, + { + "auxiliary_loss_clip": 0.01130377, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02660608, + "balance_loss_mlp": 1.04775453, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.8915772167347047, + "language_loss": 0.71919596, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.74092221, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 6260, + "time_per_iteration": 2.5225539207458496 + }, + { + "auxiliary_loss_clip": 0.0112693, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.02061951, + "balance_loss_mlp": 1.04538989, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.725193491542272, + "language_loss": 0.78423822, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80586827, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 6261, + "time_per_iteration": 2.4926671981811523 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01043226, + "balance_loss_clip": 1.02784848, + "balance_loss_mlp": 1.04861188, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.7544905551461754, + "language_loss": 0.80327791, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82503211, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 6262, + "time_per_iteration": 6.861605167388916 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.01796031, + "balance_loss_mlp": 1.04471791, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.7128267856657793, + "language_loss": 0.80543715, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82698023, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6263, + "time_per_iteration": 2.6574654579162598 + }, + { + "auxiliary_loss_clip": 0.01128017, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.02705324, + "balance_loss_mlp": 1.04757583, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.7435231382382033, + "language_loss": 0.80158919, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82328904, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6264, + "time_per_iteration": 2.4326720237731934 + }, + { + "auxiliary_loss_clip": 0.01122852, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03120613, + "balance_loss_mlp": 1.04323912, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.2579254623504585, + "language_loss": 0.73604524, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75773823, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6265, + "time_per_iteration": 2.481248617172241 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.02525079, + "balance_loss_mlp": 1.04878664, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.6798839148056366, + "language_loss": 0.68685853, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70850861, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6266, + "time_per_iteration": 2.517972946166992 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.02746832, + "balance_loss_mlp": 1.04570127, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 21.71943634627446, + "language_loss": 0.6330213, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65474188, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 6267, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.01049589, + "auxiliary_loss_mlp": 0.01002617, + "balance_loss_clip": 1.00076914, + "balance_loss_mlp": 1.02342653, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7181832227527338, + "language_loss": 0.58946306, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60998511, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.26171875, + "step": 6268, + "time_per_iteration": 3.168419361114502 + }, + { + "auxiliary_loss_clip": 0.011283, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.02268982, + "balance_loss_mlp": 1.04734302, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4797604992869704, + "language_loss": 0.65026355, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67193449, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8125, + "step": 6269, + "time_per_iteration": 2.5472333431243896 + }, + { + "auxiliary_loss_clip": 0.01127949, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.02263296, + "balance_loss_mlp": 1.05022144, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.46875421159053, + "language_loss": 0.70592397, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72758961, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6270, + "time_per_iteration": 2.4763948917388916 + }, + { + "auxiliary_loss_clip": 0.01045864, + "auxiliary_loss_mlp": 0.0100198, + "balance_loss_clip": 1.00021577, + "balance_loss_mlp": 1.02014744, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7024360778923162, + "language_loss": 0.56136239, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58184087, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 6271, + "time_per_iteration": 3.0738816261291504 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.02326441, + "balance_loss_mlp": 1.04638743, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.066611127756055, + "language_loss": 0.79340166, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.81503969, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.78125, + "step": 6272, + "time_per_iteration": 2.4686055183410645 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01802933, + "balance_loss_mlp": 1.04578209, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.4641670728096365, + "language_loss": 0.74172843, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76326972, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6273, + "time_per_iteration": 2.5079009532928467 + }, + { + "auxiliary_loss_clip": 0.01124789, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.03112721, + "balance_loss_mlp": 1.04621577, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4163029825487425, + "language_loss": 0.71801323, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73972082, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6274, + "time_per_iteration": 2.460338592529297 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.02277732, + "balance_loss_mlp": 1.04794264, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.663376044288712, + "language_loss": 0.83692443, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.85857534, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6275, + "time_per_iteration": 2.48319149017334 + }, + { + "auxiliary_loss_clip": 0.01121629, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.01949656, + "balance_loss_mlp": 1.04532933, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4340123311349162, + "language_loss": 0.75342453, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77496612, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6276, + "time_per_iteration": 2.5773236751556396 + }, + { + "auxiliary_loss_clip": 0.01127758, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.02724338, + "balance_loss_mlp": 1.04667568, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.858122502551201, + "language_loss": 0.85519129, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87689614, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6277, + "time_per_iteration": 2.5827369689941406 + }, + { + "auxiliary_loss_clip": 0.01123645, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.01833546, + "balance_loss_mlp": 1.04713118, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.807350675061797, + "language_loss": 0.78055024, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80210936, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6278, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01128448, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_clip": 1.02795196, + "balance_loss_mlp": 1.04698181, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.933979010172509, + "language_loss": 0.82702643, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.84875309, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6279, + "time_per_iteration": 2.538426160812378 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.02310467, + "balance_loss_mlp": 1.04578614, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.0225623598483358, + "language_loss": 0.74985826, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77148765, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 6280, + "time_per_iteration": 2.5161032676696777 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02283478, + "balance_loss_mlp": 1.04662085, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4438938373085308, + "language_loss": 0.76017272, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78177071, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6281, + "time_per_iteration": 2.504711151123047 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.02049732, + "balance_loss_mlp": 1.04368496, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.7476205657776698, + "language_loss": 0.8391279, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86070192, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6282, + "time_per_iteration": 2.4668593406677246 + }, + { + "auxiliary_loss_clip": 0.01120742, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.02337587, + "balance_loss_mlp": 1.04434681, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.8037618077250128, + "language_loss": 0.70150751, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72309422, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6283, + "time_per_iteration": 2.481948137283325 + }, + { + "auxiliary_loss_clip": 0.0112321, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.04516089, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.804590454145544, + "language_loss": 0.76529062, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78697532, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6284, + "time_per_iteration": 2.462968349456787 + }, + { + "auxiliary_loss_clip": 0.01130082, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.0466392, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.0529722445272167, + "language_loss": 0.85851312, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88015962, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 6285, + "time_per_iteration": 2.4435150623321533 + }, + { + "auxiliary_loss_clip": 0.01125611, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.0240438, + "balance_loss_mlp": 1.04457164, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.9682053367320125, + "language_loss": 0.83967972, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86133921, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6286, + "time_per_iteration": 2.4270951747894287 + }, + { + "auxiliary_loss_clip": 0.01123272, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.0268203, + "balance_loss_mlp": 1.04474115, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.20319687907872, + "language_loss": 0.81550682, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83715904, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6287, + "time_per_iteration": 2.4504740238189697 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.02559495, + "balance_loss_mlp": 1.04340911, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.428511311582982, + "language_loss": 0.73038173, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75200516, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6288, + "time_per_iteration": 2.4988601207733154 + }, + { + "auxiliary_loss_clip": 0.01126071, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.02103162, + "balance_loss_mlp": 1.04705048, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.726028925404572, + "language_loss": 0.75453335, + "learning_rate": 2.857854239668352e-06, + "loss": 0.7761566, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6289, + "time_per_iteration": 2.5323870182037354 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.04395676, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.9121243331279245, + "language_loss": 0.7341041, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75570655, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6290, + "time_per_iteration": 2.4703667163848877 + }, + { + "auxiliary_loss_clip": 0.01126076, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02388752, + "balance_loss_mlp": 1.0441103, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.4130424762969502, + "language_loss": 0.79729307, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81895649, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8203125, + "step": 6291, + "time_per_iteration": 2.590517520904541 + }, + { + "auxiliary_loss_clip": 0.01124797, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.02038157, + "balance_loss_mlp": 1.04347014, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7851511943573266, + "language_loss": 0.76090503, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78251249, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8125, + "step": 6292, + "time_per_iteration": 2.486375570297241 + }, + { + "auxiliary_loss_clip": 0.0112214, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_clip": 1.02708387, + "balance_loss_mlp": 1.04380596, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.8744506208430416, + "language_loss": 0.69510674, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71675801, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6293, + "time_per_iteration": 2.477025032043457 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.02140629, + "balance_loss_mlp": 1.04180205, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.812028848861632, + "language_loss": 0.71631789, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73788714, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6294, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.01128463, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.02050054, + "balance_loss_mlp": 1.04522586, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0852903309957815, + "language_loss": 0.8254326, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84707516, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 6295, + "time_per_iteration": 2.4684417247772217 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.01870751, + "balance_loss_mlp": 1.04352689, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.687128097470698, + "language_loss": 0.71806532, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73963046, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6296, + "time_per_iteration": 2.515676975250244 + }, + { + "auxiliary_loss_clip": 0.01119269, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02523708, + "balance_loss_mlp": 1.04370534, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.741193546240543, + "language_loss": 0.77094543, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79253769, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6297, + "time_per_iteration": 2.4617502689361572 + }, + { + "auxiliary_loss_clip": 0.01123428, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.02042699, + "balance_loss_mlp": 1.04360187, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.034703790395703, + "language_loss": 0.79179847, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81338429, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6298, + "time_per_iteration": 2.4516994953155518 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02001977, + "balance_loss_mlp": 1.04453242, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 2.0947541210526466, + "language_loss": 0.84758198, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86914611, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6299, + "time_per_iteration": 2.4814558029174805 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02153504, + "balance_loss_mlp": 1.04462421, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.218392777517032, + "language_loss": 0.7657811, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78737932, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 6300, + "time_per_iteration": 2.4615044593811035 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.02135265, + "balance_loss_mlp": 1.04486537, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.28104869272164, + "language_loss": 0.82490808, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84657955, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.828125, + "step": 6301, + "time_per_iteration": 2.4864752292633057 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.02967012, + "balance_loss_mlp": 1.04097867, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8461206090891127, + "language_loss": 0.67669666, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69833434, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6302, + "time_per_iteration": 2.501873016357422 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02617788, + "balance_loss_mlp": 1.04561174, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9271400579859064, + "language_loss": 0.68487787, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.7064997, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6303, + "time_per_iteration": 4.003960371017456 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02055335, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.8915662489351535, + "language_loss": 0.77611423, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79765135, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6304, + "time_per_iteration": 5.393261432647705 + }, + { + "auxiliary_loss_clip": 0.01127431, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.04611588, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.1278904960845724, + "language_loss": 0.80447114, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82612252, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6305, + "time_per_iteration": 2.471761703491211 + }, + { + "auxiliary_loss_clip": 0.01041012, + "auxiliary_loss_mlp": 0.0101182, + "balance_loss_clip": 1.01011562, + "balance_loss_mlp": 1.01491702, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9794242329238577, + "language_loss": 0.64524716, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66577548, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.26171875, + "step": 6306, + "time_per_iteration": 2.9702882766723633 + }, + { + "auxiliary_loss_clip": 0.01126961, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.03371215, + "balance_loss_mlp": 1.04693508, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.6253037153644523, + "language_loss": 0.73722827, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75898677, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6307, + "time_per_iteration": 2.508127450942993 + }, + { + "auxiliary_loss_clip": 0.01124488, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.02550268, + "balance_loss_mlp": 1.04390907, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.494726737463818, + "language_loss": 0.78469551, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80634576, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6308, + "time_per_iteration": 2.453012466430664 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.04146767, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.8302348181917263, + "language_loss": 0.73083341, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75244319, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6309, + "time_per_iteration": 2.495020866394043 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.02509165, + "balance_loss_mlp": 1.04503894, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4661467923449947, + "language_loss": 0.78449893, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80611867, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6310, + "time_per_iteration": 2.466533899307251 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.02562881, + "balance_loss_mlp": 1.04319441, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.894743489836823, + "language_loss": 0.76103079, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7826463, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6311, + "time_per_iteration": 2.4859142303466797 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.01906657, + "balance_loss_mlp": 1.04379332, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4829862533126659, + "language_loss": 0.71025705, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73180288, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6312, + "time_per_iteration": 2.4632480144500732 + }, + { + "auxiliary_loss_clip": 0.01041554, + "auxiliary_loss_mlp": 0.01005886, + "balance_loss_clip": 1.00425243, + "balance_loss_mlp": 1.01538157, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7762054489660294, + "language_loss": 0.56084001, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58131444, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 6313, + "time_per_iteration": 3.0646302700042725 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.02876949, + "balance_loss_mlp": 1.04362202, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 4.480184070608776, + "language_loss": 0.7158128, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73747301, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6314, + "time_per_iteration": 2.5263309478759766 + }, + { + "auxiliary_loss_clip": 0.01126357, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02331841, + "balance_loss_mlp": 1.04427075, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.7655759267809688, + "language_loss": 0.73132306, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75297308, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6315, + "time_per_iteration": 2.4753336906433105 + }, + { + "auxiliary_loss_clip": 0.0111862, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.02454782, + "balance_loss_mlp": 1.04206967, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0286726324195477, + "language_loss": 0.71049547, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73207021, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6316, + "time_per_iteration": 2.636176824569702 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.04524136, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8086467732489355, + "language_loss": 0.65270519, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67431247, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6317, + "time_per_iteration": 2.595952033996582 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.02174878, + "balance_loss_mlp": 1.04161143, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0501625369641867, + "language_loss": 0.85361171, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87515211, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6318, + "time_per_iteration": 2.4805264472961426 + }, + { + "auxiliary_loss_clip": 0.01124758, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04483223, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.489676718863087, + "language_loss": 0.76274204, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.784392, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6319, + "time_per_iteration": 2.4780025482177734 + }, + { + "auxiliary_loss_clip": 0.01123743, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02961218, + "balance_loss_mlp": 1.04587555, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6998661229427972, + "language_loss": 0.63923568, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66091597, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6320, + "time_per_iteration": 2.4700872898101807 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02375042, + "balance_loss_mlp": 1.04365289, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.883216130529445, + "language_loss": 0.7112022, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73279351, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6321, + "time_per_iteration": 2.5686967372894287 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02372098, + "balance_loss_mlp": 1.04298186, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.720302384597662, + "language_loss": 0.74730933, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76892447, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6322, + "time_per_iteration": 2.5368685722351074 + }, + { + "auxiliary_loss_clip": 0.01121658, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.02155948, + "balance_loss_mlp": 1.04405749, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.6715016816856787, + "language_loss": 0.84910119, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87068772, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 6323, + "time_per_iteration": 2.483771562576294 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.04395103, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.3955157937634586, + "language_loss": 0.73466647, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75625694, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.79296875, + "step": 6324, + "time_per_iteration": 2.4709885120391846 + }, + { + "auxiliary_loss_clip": 0.0112564, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02020979, + "balance_loss_mlp": 1.045573, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.6580896914625747, + "language_loss": 0.84147018, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86308414, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6325, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.01122273, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01833439, + "balance_loss_mlp": 1.04476464, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.7291759572194114, + "language_loss": 0.79642469, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81796801, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6326, + "time_per_iteration": 2.4206631183624268 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.04261708, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.8040593924859922, + "language_loss": 0.72696453, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74854851, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6327, + "time_per_iteration": 2.5964794158935547 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.02165246, + "balance_loss_mlp": 1.04614949, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6287717027141382, + "language_loss": 0.83090091, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85249579, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6328, + "time_per_iteration": 2.4602181911468506 + }, + { + "auxiliary_loss_clip": 0.01120102, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.01746464, + "balance_loss_mlp": 1.04347932, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.31755328246291, + "language_loss": 0.61384171, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63536435, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6329, + "time_per_iteration": 2.5268959999084473 + }, + { + "auxiliary_loss_clip": 0.01124125, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04603863, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.7232754549878644, + "language_loss": 0.5586049, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58026338, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6330, + "time_per_iteration": 2.450221061706543 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.02255917, + "balance_loss_mlp": 1.04540074, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.7778053530951745, + "language_loss": 0.65694439, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67849582, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6331, + "time_per_iteration": 2.544187545776367 + }, + { + "auxiliary_loss_clip": 0.01126283, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.02635133, + "balance_loss_mlp": 1.04744291, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.725296368277029, + "language_loss": 0.75737906, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77905744, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6332, + "time_per_iteration": 2.443654775619507 + }, + { + "auxiliary_loss_clip": 0.01123212, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.02546334, + "balance_loss_mlp": 1.04323936, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.2212054448627425, + "language_loss": 0.81889552, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84053469, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6333, + "time_per_iteration": 2.467007637023926 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.0190227, + "balance_loss_mlp": 1.04437923, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.583221243495577, + "language_loss": 0.86192155, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88346696, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6334, + "time_per_iteration": 2.521341323852539 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.02130485, + "balance_loss_mlp": 1.04498506, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.2115670432842847, + "language_loss": 0.79179001, + "learning_rate": 2.841636505323321e-06, + "loss": 0.8133806, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6335, + "time_per_iteration": 2.4648449420928955 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02027273, + "balance_loss_mlp": 1.04485524, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.872233235491229, + "language_loss": 0.72775364, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74935251, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6336, + "time_per_iteration": 2.443255662918091 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.0180763, + "balance_loss_mlp": 1.0430727, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.9910419737037044, + "language_loss": 0.69146657, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71297657, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6337, + "time_per_iteration": 2.4838876724243164 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02016187, + "balance_loss_mlp": 1.04606009, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9033617326941272, + "language_loss": 0.63247615, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65407151, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6338, + "time_per_iteration": 2.5538294315338135 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.02492189, + "balance_loss_mlp": 1.04498446, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8718033662194862, + "language_loss": 0.69288802, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71452975, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7890625, + "step": 6339, + "time_per_iteration": 2.490813970565796 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.0256902, + "balance_loss_mlp": 1.0461787, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.5980221539464914, + "language_loss": 0.68312418, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70477575, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6340, + "time_per_iteration": 2.4576282501220703 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02026618, + "balance_loss_mlp": 1.04393721, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.141170258916756, + "language_loss": 0.89404309, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91565144, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80859375, + "step": 6341, + "time_per_iteration": 2.4688920974731445 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.02477455, + "balance_loss_mlp": 1.04559851, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.5516456894508346, + "language_loss": 0.74665564, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76832652, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6342, + "time_per_iteration": 2.4610931873321533 + }, + { + "auxiliary_loss_clip": 0.0112203, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.04325199, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.6121504127073445, + "language_loss": 0.83334327, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85490513, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6343, + "time_per_iteration": 2.490952730178833 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.04305577, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.5912858717665679, + "language_loss": 0.76965082, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79125255, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6344, + "time_per_iteration": 2.458669424057007 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.02413464, + "balance_loss_mlp": 1.04601693, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.369132092535199, + "language_loss": 0.72790027, + "learning_rate": 2.838101929752593e-06, + "loss": 0.7495544, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6345, + "time_per_iteration": 5.361874341964722 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.02172494, + "balance_loss_mlp": 1.04348969, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.723509048793367, + "language_loss": 0.69687438, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71844268, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6346, + "time_per_iteration": 3.8780832290649414 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02351856, + "balance_loss_mlp": 1.04639161, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8691929226070287, + "language_loss": 0.75860906, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78024441, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6347, + "time_per_iteration": 2.4724838733673096 + }, + { + "auxiliary_loss_clip": 0.01121549, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.04272556, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.5494744961647557, + "language_loss": 0.74775678, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76933861, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6348, + "time_per_iteration": 2.4360201358795166 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.04346061, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.012782025185047, + "language_loss": 0.86987114, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89142847, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6349, + "time_per_iteration": 2.4653983116149902 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.02576792, + "balance_loss_mlp": 1.04300261, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 3.1419886249283624, + "language_loss": 0.76335979, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78497744, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6350, + "time_per_iteration": 2.4111151695251465 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01693547, + "balance_loss_mlp": 1.04389453, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.0441694615934325, + "language_loss": 0.76182568, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78337657, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.78125, + "step": 6351, + "time_per_iteration": 2.449831485748291 + }, + { + "auxiliary_loss_clip": 0.0112512, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.0235939, + "balance_loss_mlp": 1.04464602, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6974231581634962, + "language_loss": 0.74360836, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76525676, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6352, + "time_per_iteration": 2.5342295169830322 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02103508, + "balance_loss_mlp": 1.04153097, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.834359776939538, + "language_loss": 0.64362574, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66514015, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6353, + "time_per_iteration": 2.434100866317749 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02020061, + "balance_loss_mlp": 1.04363215, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.6268216674771125, + "language_loss": 0.83035302, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85189331, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6354, + "time_per_iteration": 2.4903476238250732 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02154744, + "balance_loss_mlp": 1.04571426, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.7360324347242302, + "language_loss": 0.8071996, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82876635, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6355, + "time_per_iteration": 2.5086817741394043 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.04464841, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.7080815693685156, + "language_loss": 0.75032043, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77187097, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6356, + "time_per_iteration": 2.471919298171997 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.02251887, + "balance_loss_mlp": 1.04420352, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8091380313160346, + "language_loss": 0.81251574, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83409309, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6357, + "time_per_iteration": 2.5302257537841797 + }, + { + "auxiliary_loss_clip": 0.01127375, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02465415, + "balance_loss_mlp": 1.04773057, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 3.08273691075534, + "language_loss": 0.77903318, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80071545, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.796875, + "step": 6358, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02576303, + "balance_loss_mlp": 1.0432725, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.070211767582473, + "language_loss": 0.78700459, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80863374, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6359, + "time_per_iteration": 2.4555094242095947 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02698255, + "balance_loss_mlp": 1.04290545, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.6399902686671113, + "language_loss": 0.69392359, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.7155236, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6360, + "time_per_iteration": 2.736069440841675 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.01632452, + "balance_loss_mlp": 1.04197633, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.9168722583294633, + "language_loss": 0.78836095, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80986238, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6361, + "time_per_iteration": 2.511254072189331 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.02274048, + "balance_loss_mlp": 1.04114652, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4566170801765106, + "language_loss": 0.65315771, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67468172, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6362, + "time_per_iteration": 2.632784128189087 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.04175615, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.8527291741217293, + "language_loss": 0.82063204, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84214544, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 6363, + "time_per_iteration": 2.4478373527526855 + }, + { + "auxiliary_loss_clip": 0.01119064, + "auxiliary_loss_mlp": 0.01042512, + "balance_loss_clip": 1.02837944, + "balance_loss_mlp": 1.0446111, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.811422380776527, + "language_loss": 0.58428323, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60589898, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6364, + "time_per_iteration": 2.655128002166748 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.02496374, + "balance_loss_mlp": 1.04423463, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.1451175401130893, + "language_loss": 0.68881112, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71043533, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6365, + "time_per_iteration": 2.51526141166687 + }, + { + "auxiliary_loss_clip": 0.01121408, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02065301, + "balance_loss_mlp": 1.04057527, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 4.555943608034253, + "language_loss": 0.73442698, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75600111, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8046875, + "step": 6366, + "time_per_iteration": 2.448585033416748 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02026677, + "balance_loss_mlp": 1.04226327, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.0234001922769327, + "language_loss": 0.68829554, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70985115, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6367, + "time_per_iteration": 2.569301128387451 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.04202485, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 4.344593393004367, + "language_loss": 0.6481666, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66967463, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 6368, + "time_per_iteration": 2.4531960487365723 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02053833, + "balance_loss_mlp": 1.04277039, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.315785818077373, + "language_loss": 0.68389189, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70544434, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6369, + "time_per_iteration": 2.5403318405151367 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.02774167, + "balance_loss_mlp": 1.04172897, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7184057003296296, + "language_loss": 0.78214431, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80374157, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 6370, + "time_per_iteration": 2.4397096633911133 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02569818, + "balance_loss_mlp": 1.04368424, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.8055794910549525, + "language_loss": 0.64556968, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66716546, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6371, + "time_per_iteration": 2.5470147132873535 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.0221653, + "balance_loss_mlp": 1.04452634, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.8238449128176952, + "language_loss": 0.72682339, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7484479, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6372, + "time_per_iteration": 2.47695255279541 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.02325058, + "balance_loss_mlp": 1.04308939, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.5970403518130607, + "language_loss": 0.84758627, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.86918551, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6373, + "time_per_iteration": 2.514571189880371 + }, + { + "auxiliary_loss_clip": 0.01124014, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02784181, + "balance_loss_mlp": 1.04392529, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 4.718004058381721, + "language_loss": 0.74721354, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76888537, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6374, + "time_per_iteration": 2.5505032539367676 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.04414058, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.8586580554057472, + "language_loss": 0.75701195, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77867097, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 6375, + "time_per_iteration": 2.467555522918701 + }, + { + "auxiliary_loss_clip": 0.01122331, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02300918, + "balance_loss_mlp": 1.04375613, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.257221103761015, + "language_loss": 0.72827101, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.7498709, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6376, + "time_per_iteration": 2.4082555770874023 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.04245007, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5879949283042905, + "language_loss": 0.67586625, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69745058, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.76171875, + "step": 6377, + "time_per_iteration": 2.54896879196167 + }, + { + "auxiliary_loss_clip": 0.01124961, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.02448511, + "balance_loss_mlp": 1.04608607, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.1973025079181117, + "language_loss": 0.72991705, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75156534, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6378, + "time_per_iteration": 2.4442975521087646 + }, + { + "auxiliary_loss_clip": 0.01121801, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02755964, + "balance_loss_mlp": 1.04327178, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.6808845830991803, + "language_loss": 0.69162869, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71326876, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6379, + "time_per_iteration": 2.529088258743286 + }, + { + "auxiliary_loss_clip": 0.01121458, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02344704, + "balance_loss_mlp": 1.04552865, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.6321901167852362, + "language_loss": 0.82979369, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85139024, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6380, + "time_per_iteration": 2.4336190223693848 + }, + { + "auxiliary_loss_clip": 0.01120843, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02090406, + "balance_loss_mlp": 1.04595208, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4297951270127425, + "language_loss": 0.81347466, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83503115, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6381, + "time_per_iteration": 2.5029306411743164 + }, + { + "auxiliary_loss_clip": 0.0104681, + "auxiliary_loss_mlp": 0.01005882, + "balance_loss_clip": 1.00420141, + "balance_loss_mlp": 1.02098966, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.796129115027233, + "language_loss": 0.60459685, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.6251238, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2578125, + "step": 6382, + "time_per_iteration": 3.0525829792022705 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.02186477, + "balance_loss_mlp": 1.04358447, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.302869327575685, + "language_loss": 0.66052485, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68212986, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6383, + "time_per_iteration": 2.5166289806365967 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.01944149, + "balance_loss_mlp": 1.04657924, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 2.2385812040155932, + "language_loss": 0.74811673, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76970243, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6384, + "time_per_iteration": 2.4451465606689453 + }, + { + "auxiliary_loss_clip": 0.01120418, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02219081, + "balance_loss_mlp": 1.04429483, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4336247312181014, + "language_loss": 0.75883526, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78040409, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6385, + "time_per_iteration": 2.4994513988494873 + }, + { + "auxiliary_loss_clip": 0.01046845, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 1.0002346, + "balance_loss_mlp": 1.02044809, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9433326566144719, + "language_loss": 0.67094183, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69143105, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.265625, + "step": 6386, + "time_per_iteration": 2.938122272491455 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.01933384, + "balance_loss_mlp": 1.0465281, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7796918810721745, + "language_loss": 0.72464442, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74619704, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6387, + "time_per_iteration": 5.465053081512451 + }, + { + "auxiliary_loss_clip": 0.01120429, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.0451014, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.6321565887315352, + "language_loss": 0.81181073, + "learning_rate": 2.822867208702932e-06, + "loss": 0.8334049, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6388, + "time_per_iteration": 3.940337657928467 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.02183485, + "balance_loss_mlp": 1.04249692, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6383752800672902, + "language_loss": 0.76158738, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78311884, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6389, + "time_per_iteration": 2.4720914363861084 + }, + { + "auxiliary_loss_clip": 0.01125023, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.04541564, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5616719605863645, + "language_loss": 0.76284117, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78453434, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6390, + "time_per_iteration": 2.4576520919799805 + }, + { + "auxiliary_loss_clip": 0.01124413, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.03295112, + "balance_loss_mlp": 1.04433882, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6285452565530243, + "language_loss": 0.70119178, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72292501, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6391, + "time_per_iteration": 2.5657877922058105 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.01978421, + "balance_loss_mlp": 1.04267848, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.938766253942268, + "language_loss": 0.84100312, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86256641, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6392, + "time_per_iteration": 2.4366884231567383 + }, + { + "auxiliary_loss_clip": 0.01120524, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.0215621, + "balance_loss_mlp": 1.04348612, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.11211623143903, + "language_loss": 0.61170864, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63326931, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6393, + "time_per_iteration": 2.428238868713379 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.04589796, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 2.3555579295861775, + "language_loss": 0.71295553, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73459029, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 6394, + "time_per_iteration": 2.483506679534912 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01760483, + "balance_loss_mlp": 1.04732203, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.3366242235467047, + "language_loss": 0.81172824, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83336329, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 6395, + "time_per_iteration": 2.471301317214966 + }, + { + "auxiliary_loss_clip": 0.01126851, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.02880275, + "balance_loss_mlp": 1.04770553, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 3.9526859148826707, + "language_loss": 0.70642132, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72812212, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6396, + "time_per_iteration": 2.4121108055114746 + }, + { + "auxiliary_loss_clip": 0.01046507, + "auxiliary_loss_mlp": 0.00999241, + "balance_loss_clip": 0.99745274, + "balance_loss_mlp": 1.01972008, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8889613923167966, + "language_loss": 0.59708536, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61754286, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.26757812, + "step": 6397, + "time_per_iteration": 3.1453351974487305 + }, + { + "auxiliary_loss_clip": 0.01123309, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.0459342, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.8498202803423767, + "language_loss": 0.84868926, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87023783, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6398, + "time_per_iteration": 2.488083839416504 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.0444839, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.942979036208199, + "language_loss": 0.79634017, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81787992, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6399, + "time_per_iteration": 2.4537224769592285 + }, + { + "auxiliary_loss_clip": 0.01124087, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.02149892, + "balance_loss_mlp": 1.04439902, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8928366067789952, + "language_loss": 0.67337728, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69498605, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.796875, + "step": 6400, + "time_per_iteration": 2.434598207473755 + }, + { + "auxiliary_loss_clip": 0.0112665, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_clip": 1.030936, + "balance_loss_mlp": 1.04645705, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.6542190438860391, + "language_loss": 0.73004973, + "learning_rate": 2.81824995589303e-06, + "loss": 0.7517767, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6401, + "time_per_iteration": 2.4963061809539795 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.02329874, + "balance_loss_mlp": 1.045017, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.9430058457885813, + "language_loss": 0.71920168, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74082762, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6402, + "time_per_iteration": 2.426349639892578 + }, + { + "auxiliary_loss_clip": 0.01118079, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.02007246, + "balance_loss_mlp": 1.04232907, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.7846208976590752, + "language_loss": 0.82449806, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84602368, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6403, + "time_per_iteration": 2.4700570106506348 + }, + { + "auxiliary_loss_clip": 0.0112163, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04500651, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.8891944292176732, + "language_loss": 0.82468271, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84628773, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.765625, + "step": 6404, + "time_per_iteration": 2.481968402862549 + }, + { + "auxiliary_loss_clip": 0.01122268, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.02136576, + "balance_loss_mlp": 1.04299283, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6334992055527433, + "language_loss": 0.69588619, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71746749, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6405, + "time_per_iteration": 2.5947635173797607 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.02475476, + "balance_loss_mlp": 1.04411674, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.9268009005119906, + "language_loss": 0.79068285, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81226277, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6406, + "time_per_iteration": 2.4195396900177 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02247298, + "balance_loss_mlp": 1.04682863, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.277779532957622, + "language_loss": 0.8438794, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86551487, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 6407, + "time_per_iteration": 2.4518916606903076 + }, + { + "auxiliary_loss_clip": 0.01043854, + "auxiliary_loss_mlp": 0.01007721, + "balance_loss_clip": 1.00623727, + "balance_loss_mlp": 1.01778841, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8214817017046727, + "language_loss": 0.64868087, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66919661, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.25976562, + "step": 6408, + "time_per_iteration": 3.090940475463867 + }, + { + "auxiliary_loss_clip": 0.01123062, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.02676785, + "balance_loss_mlp": 1.04405272, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.5501960898767924, + "language_loss": 0.73628408, + "learning_rate": 2.8154059613008e-06, + "loss": 0.7579453, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6409, + "time_per_iteration": 2.4831972122192383 + }, + { + "auxiliary_loss_clip": 0.01129844, + "auxiliary_loss_mlp": 0.01049195, + "balance_loss_clip": 1.03255367, + "balance_loss_mlp": 1.04574656, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.0394333066705874, + "language_loss": 0.70208335, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72387373, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 6410, + "time_per_iteration": 2.430617332458496 + }, + { + "auxiliary_loss_clip": 0.01043682, + "auxiliary_loss_mlp": 0.01003736, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.01802111, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6794214350275563, + "language_loss": 0.60311568, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62358987, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.2578125, + "step": 6411, + "time_per_iteration": 3.1681244373321533 + }, + { + "auxiliary_loss_clip": 0.01118542, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.04146707, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9543275921913768, + "language_loss": 0.7770192, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79849613, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6412, + "time_per_iteration": 2.4670822620391846 + }, + { + "auxiliary_loss_clip": 0.01124348, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.02093506, + "balance_loss_mlp": 1.0437274, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7609162802618283, + "language_loss": 0.78148544, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80310041, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6413, + "time_per_iteration": 2.4506192207336426 + }, + { + "auxiliary_loss_clip": 0.01040458, + "auxiliary_loss_mlp": 0.01006495, + "balance_loss_clip": 1.00485027, + "balance_loss_mlp": 1.01477003, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8068957555662655, + "language_loss": 0.61344963, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63391918, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.2578125, + "step": 6414, + "time_per_iteration": 2.897420883178711 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.02712834, + "balance_loss_mlp": 1.04452538, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.3808373048749543, + "language_loss": 0.77121973, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79288626, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6415, + "time_per_iteration": 2.455246686935425 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.01916933, + "balance_loss_mlp": 1.04303658, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6468091717833364, + "language_loss": 0.79597795, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81745458, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6416, + "time_per_iteration": 2.5162863731384277 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02100587, + "balance_loss_mlp": 1.04190922, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.6816352340920986, + "language_loss": 0.7957328, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81726366, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76953125, + "step": 6417, + "time_per_iteration": 2.462679862976074 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02369118, + "balance_loss_mlp": 1.03945839, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 9.924006648688666, + "language_loss": 0.80246758, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82400978, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6418, + "time_per_iteration": 2.4485208988189697 + }, + { + "auxiliary_loss_clip": 0.01114184, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.01992905, + "balance_loss_mlp": 1.03939319, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.9958339666442106, + "language_loss": 0.79694712, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81842011, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6419, + "time_per_iteration": 2.4360008239746094 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.01972449, + "balance_loss_mlp": 1.04120576, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0553625572614678, + "language_loss": 0.67804086, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69954103, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.73828125, + "step": 6420, + "time_per_iteration": 2.489661931991577 + }, + { + "auxiliary_loss_clip": 0.01116038, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.02286029, + "balance_loss_mlp": 1.04163957, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.4512212791744576, + "language_loss": 0.81831443, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83983916, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6421, + "time_per_iteration": 2.4278934001922607 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01888454, + "balance_loss_mlp": 1.04031229, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2431145476637266, + "language_loss": 0.72079587, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74231195, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6422, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02482104, + "balance_loss_mlp": 1.0425638, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6611822537555545, + "language_loss": 0.65814191, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.6796822, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6423, + "time_per_iteration": 2.4211878776550293 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.0439117, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.965242475874499, + "language_loss": 0.68746173, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70906854, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6424, + "time_per_iteration": 2.5804436206817627 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.04261661, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3808875353222407, + "language_loss": 0.72237349, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74393135, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 6425, + "time_per_iteration": 2.4568634033203125 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.02413344, + "balance_loss_mlp": 1.0424571, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.566599175889616, + "language_loss": 0.80062914, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82223159, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6426, + "time_per_iteration": 2.5236575603485107 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.0222559, + "balance_loss_mlp": 1.04582727, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.32293087490025, + "language_loss": 0.74624443, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7678405, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6427, + "time_per_iteration": 2.467555046081543 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02530599, + "balance_loss_mlp": 1.04256904, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 1.6951631816528543, + "language_loss": 0.69630527, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71788281, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6428, + "time_per_iteration": 2.4336817264556885 + }, + { + "auxiliary_loss_clip": 0.01120968, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.02787971, + "balance_loss_mlp": 1.0427897, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.175868568260599, + "language_loss": 0.84272587, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86435586, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6429, + "time_per_iteration": 5.324048757553101 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.02358222, + "balance_loss_mlp": 1.04458523, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.0434704200334726, + "language_loss": 0.808312, + "learning_rate": 2.807931078076015e-06, + "loss": 0.82989526, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6430, + "time_per_iteration": 3.8362674713134766 + }, + { + "auxiliary_loss_clip": 0.01037896, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00019002, + "balance_loss_mlp": 1.01247668, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7147232834997996, + "language_loss": 0.58793551, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60833132, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.25390625, + "step": 6431, + "time_per_iteration": 3.1054275035858154 + }, + { + "auxiliary_loss_clip": 0.01123522, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.0213275, + "balance_loss_mlp": 1.04425848, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8418420222570902, + "language_loss": 0.78914982, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81074637, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6432, + "time_per_iteration": 2.441103458404541 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.04033065, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 3.1335187433073006, + "language_loss": 0.80734611, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82898408, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6433, + "time_per_iteration": 2.4334840774536133 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.02167201, + "balance_loss_mlp": 1.04427695, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.9920607209076013, + "language_loss": 0.70712543, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72871572, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6434, + "time_per_iteration": 2.4485912322998047 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.02259684, + "balance_loss_mlp": 1.04096544, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 3.1146547904297615, + "language_loss": 0.77674437, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79833651, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 6435, + "time_per_iteration": 2.4734902381896973 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.01877558, + "balance_loss_mlp": 1.04157901, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6626735995393465, + "language_loss": 0.79557228, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81706917, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 6436, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01115966, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01937521, + "balance_loss_mlp": 1.04099202, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7213495950653388, + "language_loss": 0.77057981, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79206884, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6437, + "time_per_iteration": 2.506342649459839 + }, + { + "auxiliary_loss_clip": 0.01118581, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.02759838, + "balance_loss_mlp": 1.0425818, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0991099349261013, + "language_loss": 0.8199805, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84157896, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6438, + "time_per_iteration": 2.4236960411071777 + }, + { + "auxiliary_loss_clip": 0.01119447, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.02413225, + "balance_loss_mlp": 1.04198575, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.4416179830694351, + "language_loss": 0.75274503, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77432954, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6439, + "time_per_iteration": 2.4746499061584473 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.01804042, + "balance_loss_mlp": 1.04231787, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4591961315755648, + "language_loss": 0.74029297, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76176178, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6440, + "time_per_iteration": 2.470442056655884 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02755535, + "balance_loss_mlp": 1.04172719, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.882594032026591, + "language_loss": 0.82420492, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84582806, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6441, + "time_per_iteration": 2.4857184886932373 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.03132594, + "balance_loss_mlp": 1.04210794, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.099147848905264, + "language_loss": 0.81835496, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83998901, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6442, + "time_per_iteration": 2.4149296283721924 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04025602, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.5694674536603201, + "language_loss": 0.83847654, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85999727, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6443, + "time_per_iteration": 2.4515957832336426 + }, + { + "auxiliary_loss_clip": 0.01039021, + "auxiliary_loss_mlp": 0.01007024, + "balance_loss_clip": 1.00551593, + "balance_loss_mlp": 1.0140909, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7719544775144753, + "language_loss": 0.50268674, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52314723, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24902344, + "step": 6444, + "time_per_iteration": 3.092834711074829 + }, + { + "auxiliary_loss_clip": 0.01115245, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02502251, + "balance_loss_mlp": 1.04225266, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.537835026490341, + "language_loss": 0.78736365, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80889541, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6445, + "time_per_iteration": 2.435347557067871 + }, + { + "auxiliary_loss_clip": 0.01115913, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02055407, + "balance_loss_mlp": 1.04211605, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.672895701432963, + "language_loss": 0.81121695, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83271456, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6446, + "time_per_iteration": 2.469536781311035 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02434087, + "balance_loss_mlp": 1.03933239, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.877585125713849, + "language_loss": 0.77093089, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79244608, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6447, + "time_per_iteration": 2.428525447845459 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.04256356, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5240627220637166, + "language_loss": 0.75767821, + "learning_rate": 2.801513277056671e-06, + "loss": 0.7791642, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6448, + "time_per_iteration": 2.4325876235961914 + }, + { + "auxiliary_loss_clip": 0.01115196, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.02023029, + "balance_loss_mlp": 1.04179466, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.6442003276819328, + "language_loss": 0.75754648, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.77903593, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6449, + "time_per_iteration": 2.435208320617676 + }, + { + "auxiliary_loss_clip": 0.0111808, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.0194999, + "balance_loss_mlp": 1.03956699, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.5394171504545016, + "language_loss": 0.78183508, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80335045, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6450, + "time_per_iteration": 2.467933177947998 + }, + { + "auxiliary_loss_clip": 0.0112145, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.02190948, + "balance_loss_mlp": 1.04104686, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.1284571270947263, + "language_loss": 0.77706474, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79863995, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6451, + "time_per_iteration": 2.513192892074585 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01337111, + "balance_loss_mlp": 1.03988457, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.5965207120841256, + "language_loss": 0.7642619, + "learning_rate": 2.800085758962812e-06, + "loss": 0.7856546, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6452, + "time_per_iteration": 2.453756809234619 + }, + { + "auxiliary_loss_clip": 0.01118677, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02721739, + "balance_loss_mlp": 1.04313231, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5417712426283914, + "language_loss": 0.79843581, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82002515, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6453, + "time_per_iteration": 2.434788465499878 + }, + { + "auxiliary_loss_clip": 0.01126032, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02257931, + "balance_loss_mlp": 1.0456028, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.779502658436086, + "language_loss": 0.71759796, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73922884, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6454, + "time_per_iteration": 2.456637382507324 + }, + { + "auxiliary_loss_clip": 0.01122488, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.02489531, + "balance_loss_mlp": 1.04253364, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.1246626443539216, + "language_loss": 0.77918947, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80081153, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6455, + "time_per_iteration": 2.4589757919311523 + }, + { + "auxiliary_loss_clip": 0.01118002, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.02040577, + "balance_loss_mlp": 1.04232621, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.6339807395025958, + "language_loss": 0.75865024, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78017759, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6456, + "time_per_iteration": 2.4390318393707275 + }, + { + "auxiliary_loss_clip": 0.01121145, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.01944995, + "balance_loss_mlp": 1.04276633, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.085241252102015, + "language_loss": 0.60518527, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62672919, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 6457, + "time_per_iteration": 2.459535837173462 + }, + { + "auxiliary_loss_clip": 0.01121291, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.02167547, + "balance_loss_mlp": 1.04195237, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.1234505206368475, + "language_loss": 0.80247247, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82405996, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6458, + "time_per_iteration": 2.425049066543579 + }, + { + "auxiliary_loss_clip": 0.01120771, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.02072167, + "balance_loss_mlp": 1.04291797, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8371533851039183, + "language_loss": 0.81683058, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83838403, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6459, + "time_per_iteration": 2.5234129428863525 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.04261899, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 3.3845315312390643, + "language_loss": 0.61609662, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63761353, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6460, + "time_per_iteration": 2.4271440505981445 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04498553, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.999840896697599, + "language_loss": 0.85928953, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88084352, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.76171875, + "step": 6461, + "time_per_iteration": 2.4874932765960693 + }, + { + "auxiliary_loss_clip": 0.01121067, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.02712059, + "balance_loss_mlp": 1.04198229, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 5.6194775515218085, + "language_loss": 0.71397054, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73559368, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6462, + "time_per_iteration": 2.4839894771575928 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02274752, + "balance_loss_mlp": 1.04190457, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.13487298932128, + "language_loss": 0.7582581, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77982807, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6463, + "time_per_iteration": 2.4897215366363525 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.0305022, + "balance_loss_mlp": 1.04482341, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9442764767857983, + "language_loss": 0.70078236, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72249663, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6464, + "time_per_iteration": 2.4519219398498535 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02258134, + "balance_loss_mlp": 1.04280329, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8350923871455525, + "language_loss": 0.69608724, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.717641, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6465, + "time_per_iteration": 2.524698495864868 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02772307, + "balance_loss_mlp": 1.04204226, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.02186972310505, + "language_loss": 0.77957165, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80120802, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6466, + "time_per_iteration": 2.4420318603515625 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.04476476, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.578436157089315, + "language_loss": 0.69438803, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71602929, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6467, + "time_per_iteration": 2.526315212249756 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.04374123, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.7189933074164316, + "language_loss": 0.83444071, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85615414, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 6468, + "time_per_iteration": 2.433612108230591 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.02720666, + "balance_loss_mlp": 1.04250181, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 5.890128393718138, + "language_loss": 0.84300733, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86460519, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6469, + "time_per_iteration": 2.501368284225464 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02871704, + "balance_loss_mlp": 1.0433706, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.6566744770772097, + "language_loss": 0.74790764, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76954335, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6470, + "time_per_iteration": 5.350924015045166 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.02632678, + "balance_loss_mlp": 1.04234362, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5254918915202156, + "language_loss": 0.74916464, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77078122, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6471, + "time_per_iteration": 5.323298215866089 + }, + { + "auxiliary_loss_clip": 0.01121653, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.0310601, + "balance_loss_mlp": 1.04548645, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.9258613787227117, + "language_loss": 0.68053186, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70220202, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6472, + "time_per_iteration": 2.453610420227051 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01046672, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.04305148, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.6233097762345425, + "language_loss": 0.76542008, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.7870928, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6473, + "time_per_iteration": 2.487966775894165 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01046447, + "balance_loss_clip": 1.03157008, + "balance_loss_mlp": 1.04532015, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.8986671727726652, + "language_loss": 0.70897496, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73067403, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6474, + "time_per_iteration": 2.4192309379577637 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.02607441, + "balance_loss_mlp": 1.04441047, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.400231739949646, + "language_loss": 0.68822956, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70984024, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 6475, + "time_per_iteration": 2.508747100830078 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.0104873, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.04747105, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 2.0025883037810055, + "language_loss": 0.76052523, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78231013, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 6476, + "time_per_iteration": 2.4432644844055176 + }, + { + "auxiliary_loss_clip": 0.01040957, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.01581097, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7803986728659921, + "language_loss": 0.58254546, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60299176, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6477, + "time_per_iteration": 3.0704691410064697 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.02279997, + "balance_loss_mlp": 1.04507279, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.75333723767605, + "language_loss": 0.77916539, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80078721, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6478, + "time_per_iteration": 2.488922357559204 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.0212301, + "balance_loss_mlp": 1.04128957, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 1.928920480761015, + "language_loss": 0.82250136, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8440311, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 6479, + "time_per_iteration": 2.4171228408813477 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.02262461, + "balance_loss_mlp": 1.04175949, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7024032073041733, + "language_loss": 0.80111545, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82266629, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6480, + "time_per_iteration": 2.4750797748565674 + }, + { + "auxiliary_loss_clip": 0.01117569, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01428056, + "balance_loss_mlp": 1.04215932, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.550121095479633, + "language_loss": 0.83083898, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85229063, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6481, + "time_per_iteration": 2.4715166091918945 + }, + { + "auxiliary_loss_clip": 0.01117656, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02437592, + "balance_loss_mlp": 1.04459131, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.557560720892756, + "language_loss": 0.75559932, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77715063, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6482, + "time_per_iteration": 2.4623568058013916 + }, + { + "auxiliary_loss_clip": 0.01119557, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01972985, + "balance_loss_mlp": 1.04252028, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 3.29893715214875, + "language_loss": 0.79150903, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81303906, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6483, + "time_per_iteration": 2.4530816078186035 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.0135119, + "balance_loss_mlp": 1.04091668, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4423872752445677, + "language_loss": 0.79842782, + "learning_rate": 2.788648211572067e-06, + "loss": 0.81989002, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6484, + "time_per_iteration": 2.511016845703125 + }, + { + "auxiliary_loss_clip": 0.01121595, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.02905726, + "balance_loss_mlp": 1.04556251, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.7756536915325172, + "language_loss": 0.78321344, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80487472, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6485, + "time_per_iteration": 2.443439245223999 + }, + { + "auxiliary_loss_clip": 0.01121432, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.02298832, + "balance_loss_mlp": 1.0427072, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.7221954850945425, + "language_loss": 0.85305119, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87464917, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6486, + "time_per_iteration": 2.5056657791137695 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.01942706, + "balance_loss_mlp": 1.04115701, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.7551040773297495, + "language_loss": 0.85345674, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87499964, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 6487, + "time_per_iteration": 2.577178478240967 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01737833, + "balance_loss_mlp": 1.04198551, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5246902220393208, + "language_loss": 0.73225224, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75375092, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.75390625, + "step": 6488, + "time_per_iteration": 2.523616075515747 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.04519773, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.5708303691917815, + "language_loss": 0.68585873, + "learning_rate": 2.786858317231779e-06, + "loss": 0.7074241, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6489, + "time_per_iteration": 2.478531837463379 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02680993, + "balance_loss_mlp": 1.04124475, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.801271673710844, + "language_loss": 0.81112868, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83269042, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 6490, + "time_per_iteration": 2.511854887008667 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.0193367, + "balance_loss_mlp": 1.04286718, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.9146492238240407, + "language_loss": 0.89305747, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91461056, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6491, + "time_per_iteration": 2.460026264190674 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02187026, + "balance_loss_mlp": 1.04215312, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8200320241713732, + "language_loss": 0.78811067, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80968064, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 6492, + "time_per_iteration": 2.529750108718872 + }, + { + "auxiliary_loss_clip": 0.01122151, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.02316093, + "balance_loss_mlp": 1.04309416, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.613220074099035, + "language_loss": 0.74635601, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76794928, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6493, + "time_per_iteration": 2.506000280380249 + }, + { + "auxiliary_loss_clip": 0.01123496, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.02160168, + "balance_loss_mlp": 1.04215276, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.9992899078543964, + "language_loss": 0.76100057, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78260159, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 6494, + "time_per_iteration": 2.4696662425994873 + }, + { + "auxiliary_loss_clip": 0.01128232, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03233767, + "balance_loss_mlp": 1.04337156, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.027559897328472, + "language_loss": 0.74284697, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76461446, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 6495, + "time_per_iteration": 2.4156551361083984 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_clip": 1.02683187, + "balance_loss_mlp": 1.04346669, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.725682312794404, + "language_loss": 0.67885542, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70049238, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6496, + "time_per_iteration": 2.530111789703369 + }, + { + "auxiliary_loss_clip": 0.01038749, + "auxiliary_loss_mlp": 0.01000219, + "balance_loss_clip": 0.99871743, + "balance_loss_mlp": 1.01313972, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6624336186281815, + "language_loss": 0.53998011, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56036979, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.25585938, + "step": 6497, + "time_per_iteration": 3.140427589416504 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.02404737, + "balance_loss_mlp": 1.04236674, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.818865741362812, + "language_loss": 0.68966502, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71124697, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6498, + "time_per_iteration": 2.4631001949310303 + }, + { + "auxiliary_loss_clip": 0.01037794, + "auxiliary_loss_mlp": 0.01003613, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.0124712, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 1.032001330091421, + "language_loss": 0.51830518, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5387193, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.25390625, + "step": 6499, + "time_per_iteration": 3.1206116676330566 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.0266552, + "balance_loss_mlp": 1.04158521, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.8695650437594764, + "language_loss": 0.73693466, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75859112, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.80859375, + "step": 6500, + "time_per_iteration": 2.5413036346435547 + }, + { + "auxiliary_loss_clip": 0.01125544, + "auxiliary_loss_mlp": 0.01041341, + "balance_loss_clip": 1.02720869, + "balance_loss_mlp": 1.04501247, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.5451317073491353, + "language_loss": 0.68355215, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70522094, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6501, + "time_per_iteration": 2.4725823402404785 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6766627212042646, + "language_loss": 0.79162323, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81320089, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6502, + "time_per_iteration": 2.4758012294769287 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02378237, + "balance_loss_mlp": 1.0435648, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.24722484247342, + "language_loss": 0.79379106, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81534874, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6503, + "time_per_iteration": 2.510356903076172 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02076924, + "balance_loss_mlp": 1.03882694, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8991979162106922, + "language_loss": 0.71695077, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73842514, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6504, + "time_per_iteration": 2.474257230758667 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02404082, + "balance_loss_mlp": 1.03938556, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4403698273396093, + "language_loss": 0.83054864, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85209668, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6505, + "time_per_iteration": 2.4917776584625244 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.02379465, + "balance_loss_mlp": 1.04268944, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9728617659661118, + "language_loss": 0.71202552, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73360288, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7578125, + "step": 6506, + "time_per_iteration": 2.4846489429473877 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.04129732, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.0442674369719547, + "language_loss": 0.74914789, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77068931, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6507, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01036092, + "auxiliary_loss_mlp": 0.01010532, + "balance_loss_clip": 1.00900638, + "balance_loss_mlp": 1.01097417, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7697412763639314, + "language_loss": 0.56554615, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58601236, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.25195312, + "step": 6508, + "time_per_iteration": 3.222599744796753 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.0256958, + "balance_loss_mlp": 1.04224479, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8903485988869968, + "language_loss": 0.7639432, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78552431, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6509, + "time_per_iteration": 2.4504122734069824 + }, + { + "auxiliary_loss_clip": 0.01119308, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02919126, + "balance_loss_mlp": 1.04120517, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.5320410479027284, + "language_loss": 0.82538676, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84704286, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.78125, + "step": 6510, + "time_per_iteration": 2.4280829429626465 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02202439, + "balance_loss_mlp": 1.04137504, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.9726874536239134, + "language_loss": 0.76478642, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78633761, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6511, + "time_per_iteration": 2.438093662261963 + }, + { + "auxiliary_loss_clip": 0.01035954, + "auxiliary_loss_mlp": 0.01004811, + "balance_loss_clip": 1.0033921, + "balance_loss_mlp": 1.01070499, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7278620231464888, + "language_loss": 0.57780313, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59821081, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.25390625, + "step": 6512, + "time_per_iteration": 6.094903230667114 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.02039289, + "balance_loss_mlp": 1.04215658, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6857291908308145, + "language_loss": 0.69891763, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72048545, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6513, + "time_per_iteration": 3.8939309120178223 + }, + { + "auxiliary_loss_clip": 0.01122702, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.02474439, + "balance_loss_mlp": 1.04184556, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.2930968868818606, + "language_loss": 0.76267236, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7842921, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 6514, + "time_per_iteration": 2.4622693061828613 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02168727, + "balance_loss_mlp": 1.04042864, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.7838082674219136, + "language_loss": 0.77452338, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79606491, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6515, + "time_per_iteration": 2.4336462020874023 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02564025, + "balance_loss_mlp": 1.03940558, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4542421972503212, + "language_loss": 0.79846406, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81998634, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 6516, + "time_per_iteration": 2.500826597213745 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01891923, + "balance_loss_mlp": 1.04082477, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 2.228742695866407, + "language_loss": 0.70205939, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72357762, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6517, + "time_per_iteration": 2.425739288330078 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.0218817, + "balance_loss_mlp": 1.03986263, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.595983335780194, + "language_loss": 0.72092575, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74247015, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6518, + "time_per_iteration": 2.559140205383301 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.0179677, + "balance_loss_mlp": 1.04041731, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.563160017416143, + "language_loss": 0.61668754, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63819885, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6519, + "time_per_iteration": 2.5673322677612305 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.02800775, + "balance_loss_mlp": 1.04341698, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.4564373100444232, + "language_loss": 0.6693083, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6909942, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6520, + "time_per_iteration": 2.487650156021118 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.01958799, + "balance_loss_mlp": 1.03966665, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.7599889377917473, + "language_loss": 0.78522319, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80671263, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6521, + "time_per_iteration": 2.418458938598633 + }, + { + "auxiliary_loss_clip": 0.0112345, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.01853049, + "balance_loss_mlp": 1.04218912, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.4256865138527353, + "language_loss": 0.70340407, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7250011, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8125, + "step": 6522, + "time_per_iteration": 2.435802936553955 + }, + { + "auxiliary_loss_clip": 0.01120666, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.02013338, + "balance_loss_mlp": 1.04137838, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.8374103087918643, + "language_loss": 0.76740485, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78895748, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6523, + "time_per_iteration": 2.4279329776763916 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.04124415, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.339335808739943, + "language_loss": 0.61661494, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63821173, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6524, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.02181363, + "balance_loss_mlp": 1.03898454, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6728206813409823, + "language_loss": 0.73940414, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76095104, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6525, + "time_per_iteration": 2.4897830486297607 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.02224112, + "balance_loss_mlp": 1.03882146, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0942212479104363, + "language_loss": 0.81385779, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83539373, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 6526, + "time_per_iteration": 2.442091226577759 + }, + { + "auxiliary_loss_clip": 0.01115953, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.02265131, + "balance_loss_mlp": 1.03931344, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.56527231709598, + "language_loss": 0.69802964, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.71955633, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6527, + "time_per_iteration": 2.465498924255371 + }, + { + "auxiliary_loss_clip": 0.01116064, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.0154264, + "balance_loss_mlp": 1.04067612, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.4439619967755983, + "language_loss": 0.82215756, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84361446, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6528, + "time_per_iteration": 2.488581418991089 + }, + { + "auxiliary_loss_clip": 0.01114295, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02282465, + "balance_loss_mlp": 1.04024255, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.4897772961790412, + "language_loss": 0.68726033, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70877492, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 6529, + "time_per_iteration": 2.5409562587738037 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.02561271, + "balance_loss_mlp": 1.04070282, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.9003920421281926, + "language_loss": 0.79728955, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.81887889, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6530, + "time_per_iteration": 2.514547109603882 + }, + { + "auxiliary_loss_clip": 0.01112608, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02267241, + "balance_loss_mlp": 1.03750181, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6221630004730245, + "language_loss": 0.75564003, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77713549, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6531, + "time_per_iteration": 2.4572982788085938 + }, + { + "auxiliary_loss_clip": 0.01038893, + "auxiliary_loss_mlp": 0.0100286, + "balance_loss_clip": 1.00127435, + "balance_loss_mlp": 1.01370025, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8170127744653651, + "language_loss": 0.60378772, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62420523, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.25195312, + "step": 6532, + "time_per_iteration": 2.929732084274292 + }, + { + "auxiliary_loss_clip": 0.01036987, + "auxiliary_loss_mlp": 0.01003862, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7837299971611431, + "language_loss": 0.55545104, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57585955, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.25195312, + "step": 6533, + "time_per_iteration": 3.1820483207702637 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.02316761, + "balance_loss_mlp": 1.04170942, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.9313522305780093, + "language_loss": 0.75972468, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78130615, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6534, + "time_per_iteration": 2.5650813579559326 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.02376163, + "balance_loss_mlp": 1.04177046, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.213634574223379, + "language_loss": 0.78067005, + "learning_rate": 2.770356507494851e-06, + "loss": 0.802279, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 6535, + "time_per_iteration": 2.447950839996338 + }, + { + "auxiliary_loss_clip": 0.01113628, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.01950026, + "balance_loss_mlp": 1.03985262, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.091132286884177, + "language_loss": 0.68613565, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70759845, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 6536, + "time_per_iteration": 2.4873242378234863 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.03908086, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.7105256577096235, + "language_loss": 0.69052541, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71199811, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 6537, + "time_per_iteration": 2.5867457389831543 + }, + { + "auxiliary_loss_clip": 0.01117392, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.02420986, + "balance_loss_mlp": 1.04011965, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6628056753547982, + "language_loss": 0.79044384, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81200254, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6538, + "time_per_iteration": 2.437757968902588 + }, + { + "auxiliary_loss_clip": 0.01034351, + "auxiliary_loss_mlp": 0.01009828, + "balance_loss_clip": 1.0084635, + "balance_loss_mlp": 1.00972295, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8042725449961473, + "language_loss": 0.61871827, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63916004, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24609375, + "step": 6539, + "time_per_iteration": 2.9012601375579834 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.02417326, + "balance_loss_mlp": 1.03897023, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.1025744829352306, + "language_loss": 0.68334043, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70487964, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6540, + "time_per_iteration": 2.617544412612915 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.02167249, + "balance_loss_mlp": 1.0387044, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.7155589252050778, + "language_loss": 0.72714561, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74864328, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6541, + "time_per_iteration": 2.5576202869415283 + }, + { + "auxiliary_loss_clip": 0.01034882, + "auxiliary_loss_mlp": 0.01010056, + "balance_loss_clip": 1.00863171, + "balance_loss_mlp": 1.0103662, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8254504926360222, + "language_loss": 0.60302341, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62347269, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24511719, + "step": 6542, + "time_per_iteration": 2.921311378479004 + }, + { + "auxiliary_loss_clip": 0.01115263, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.02204013, + "balance_loss_mlp": 1.03968477, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.9294145782355336, + "language_loss": 0.82255107, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84406084, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6543, + "time_per_iteration": 2.5267767906188965 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.03692436, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.6066266241550669, + "language_loss": 0.69336796, + "learning_rate": 2.767120621015908e-06, + "loss": 0.7148419, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6544, + "time_per_iteration": 2.5192980766296387 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.02729011, + "balance_loss_mlp": 1.03997457, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.880723151689185, + "language_loss": 0.75104976, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77266246, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6545, + "time_per_iteration": 2.5483953952789307 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.04072022, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4191511939867936, + "language_loss": 0.74600172, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76748097, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 6546, + "time_per_iteration": 2.435189962387085 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02256405, + "balance_loss_mlp": 1.03998446, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.8050093889996326, + "language_loss": 0.81520575, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83677876, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 6547, + "time_per_iteration": 2.5359435081481934 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01444387, + "balance_loss_mlp": 1.03795588, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 2.282095961224954, + "language_loss": 0.84300089, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86442673, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6548, + "time_per_iteration": 2.430497407913208 + }, + { + "auxiliary_loss_clip": 0.01110548, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01784039, + "balance_loss_mlp": 1.0382576, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5261467823901598, + "language_loss": 0.72481942, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74623168, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6549, + "time_per_iteration": 2.484938383102417 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02025914, + "balance_loss_mlp": 1.04114747, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.525417369659451, + "language_loss": 0.77678335, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79829538, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6550, + "time_per_iteration": 2.4533822536468506 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01695979, + "balance_loss_mlp": 1.03770638, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6825180459961226, + "language_loss": 0.81065381, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83207965, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6551, + "time_per_iteration": 2.4740419387817383 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.02310574, + "balance_loss_mlp": 1.03833413, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.2350138021364003, + "language_loss": 0.80241704, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82394373, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6552, + "time_per_iteration": 2.4066245555877686 + }, + { + "auxiliary_loss_clip": 0.01118032, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.02449059, + "balance_loss_mlp": 1.04108357, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.2028177738118884, + "language_loss": 0.71154666, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73311305, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 6553, + "time_per_iteration": 2.454035997390747 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.0409205, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.9276274050376605, + "language_loss": 0.63445336, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65595293, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6554, + "time_per_iteration": 5.467530250549316 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.02620983, + "balance_loss_mlp": 1.041237, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.7325305725381703, + "language_loss": 0.79567587, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81722915, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 6555, + "time_per_iteration": 3.9707608222961426 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.0209887, + "balance_loss_mlp": 1.04194546, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.8303237809157376, + "language_loss": 0.71571302, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73726678, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6556, + "time_per_iteration": 2.5013363361358643 + }, + { + "auxiliary_loss_clip": 0.01115996, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03954887, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 2.056709462434603, + "language_loss": 0.83915412, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86063957, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6557, + "time_per_iteration": 2.7162060737609863 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02195859, + "balance_loss_mlp": 1.04014397, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 3.2694171829217953, + "language_loss": 0.80285048, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8243624, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6558, + "time_per_iteration": 2.466904401779175 + }, + { + "auxiliary_loss_clip": 0.01115408, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02037418, + "balance_loss_mlp": 1.04165912, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.7254990423790144, + "language_loss": 0.71022832, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73171461, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6559, + "time_per_iteration": 2.474142551422119 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02111125, + "balance_loss_mlp": 1.04030299, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.8853849407225942, + "language_loss": 0.80391413, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82548964, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6560, + "time_per_iteration": 2.4220218658447266 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.022223, + "balance_loss_mlp": 1.04395843, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 3.2514761912447283, + "language_loss": 0.83440554, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85599601, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 6561, + "time_per_iteration": 2.458305835723877 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.02458477, + "balance_loss_mlp": 1.04098439, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.862241713271481, + "language_loss": 0.79548055, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81703943, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6562, + "time_per_iteration": 2.4390974044799805 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02301359, + "balance_loss_mlp": 1.04043949, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.512260767998718, + "language_loss": 0.81355608, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83506453, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 6563, + "time_per_iteration": 2.518843650817871 + }, + { + "auxiliary_loss_clip": 0.0111742, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.041682, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9438463538262531, + "language_loss": 0.69416577, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71574247, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6564, + "time_per_iteration": 2.446140766143799 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02079892, + "balance_loss_mlp": 1.04157352, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 3.234298893133154, + "language_loss": 0.83141822, + "learning_rate": 2.759561073299676e-06, + "loss": 0.8529489, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6565, + "time_per_iteration": 2.474611520767212 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.02002859, + "balance_loss_mlp": 1.04039359, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.7678460287206497, + "language_loss": 0.82917452, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85065943, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6566, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.02493143, + "balance_loss_mlp": 1.04225016, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.357536272997057, + "language_loss": 0.7778033, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79942119, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6567, + "time_per_iteration": 2.5020110607147217 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.02689242, + "balance_loss_mlp": 1.04026425, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 2.0625384967809546, + "language_loss": 0.80381507, + "learning_rate": 2.758480098067182e-06, + "loss": 0.8253268, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 6568, + "time_per_iteration": 2.464186429977417 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.02282655, + "balance_loss_mlp": 1.04130197, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.6625556258765348, + "language_loss": 0.84206939, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86359489, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 6569, + "time_per_iteration": 2.4947829246520996 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.02418959, + "balance_loss_mlp": 1.04450357, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.920459843417803, + "language_loss": 0.74973899, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77130127, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6570, + "time_per_iteration": 2.50211763381958 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04104555, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.649568183340291, + "language_loss": 0.79813123, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81967843, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6571, + "time_per_iteration": 2.477740526199341 + }, + { + "auxiliary_loss_clip": 0.01116017, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.02846146, + "balance_loss_mlp": 1.04203689, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.628324795196944, + "language_loss": 0.77873337, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80031145, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6572, + "time_per_iteration": 2.4463839530944824 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02636874, + "balance_loss_mlp": 1.0404911, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.6456702645470058, + "language_loss": 0.7506038, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77218664, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6573, + "time_per_iteration": 2.501692295074463 + }, + { + "auxiliary_loss_clip": 0.01114036, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02165031, + "balance_loss_mlp": 1.04046559, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.4003162240803297, + "language_loss": 0.67956495, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70104533, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 6574, + "time_per_iteration": 2.6566920280456543 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01978612, + "balance_loss_mlp": 1.04216623, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.170019312223073, + "language_loss": 0.71719187, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73873657, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6575, + "time_per_iteration": 2.463792085647583 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02859664, + "balance_loss_mlp": 1.04105997, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.0080051897694324, + "language_loss": 0.73535955, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75698036, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6576, + "time_per_iteration": 2.409817934036255 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.03979337, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.36733568983198, + "language_loss": 0.83294857, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.8544715, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7578125, + "step": 6577, + "time_per_iteration": 2.4421181678771973 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.01860428, + "balance_loss_mlp": 1.04138541, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.8530294325048984, + "language_loss": 0.89916354, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92063785, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6578, + "time_per_iteration": 2.470369577407837 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04030561, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.7408596896151103, + "language_loss": 0.77871025, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80027139, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6579, + "time_per_iteration": 2.4619040489196777 + }, + { + "auxiliary_loss_clip": 0.01119633, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01760054, + "balance_loss_mlp": 1.0407021, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.037188254408411, + "language_loss": 0.68324131, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70475388, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6580, + "time_per_iteration": 2.4363577365875244 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01499939, + "balance_loss_mlp": 1.04099488, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.613777567548473, + "language_loss": 0.58620721, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60764229, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6581, + "time_per_iteration": 2.5704734325408936 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.02479148, + "balance_loss_mlp": 1.04165769, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.015576445189345, + "language_loss": 0.698632, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72021002, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6582, + "time_per_iteration": 2.4640939235687256 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.02167404, + "balance_loss_mlp": 1.0415566, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.285451965985758, + "language_loss": 0.76454568, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78608364, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6583, + "time_per_iteration": 2.437396287918091 + }, + { + "auxiliary_loss_clip": 0.01118401, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.01708043, + "balance_loss_mlp": 1.04192805, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5685917359515968, + "language_loss": 0.65989023, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68138266, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6584, + "time_per_iteration": 2.4562485218048096 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.02262115, + "balance_loss_mlp": 1.04122627, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.6735523944320136, + "language_loss": 0.72423065, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 6585, + "time_per_iteration": 2.517333984375 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01811159, + "balance_loss_mlp": 1.04010367, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.986310622320223, + "language_loss": 0.73430967, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75579244, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6586, + "time_per_iteration": 2.513847827911377 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01738322, + "balance_loss_mlp": 1.04139459, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.2420315368265915, + "language_loss": 0.71627617, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73776209, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6587, + "time_per_iteration": 2.498534917831421 + }, + { + "auxiliary_loss_clip": 0.01038457, + "auxiliary_loss_mlp": 0.01003592, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.01416993, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9067384171744824, + "language_loss": 0.61162889, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63204944, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.01531982, + "router_z_loss_mlp": 0.2421875, + "step": 6588, + "time_per_iteration": 2.9129557609558105 + }, + { + "auxiliary_loss_clip": 0.01117429, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.0215075, + "balance_loss_mlp": 1.04087436, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.9745840784771536, + "language_loss": 0.81579673, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83732545, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6589, + "time_per_iteration": 2.487581253051758 + }, + { + "auxiliary_loss_clip": 0.01118186, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01858449, + "balance_loss_mlp": 1.04102254, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.0157149751951606, + "language_loss": 0.70171028, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72322464, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6590, + "time_per_iteration": 2.4837629795074463 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.02517259, + "balance_loss_mlp": 1.04276454, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.6568331410473631, + "language_loss": 0.76061213, + "learning_rate": 2.750184048805956e-06, + "loss": 0.7821902, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6591, + "time_per_iteration": 2.574401617050171 + }, + { + "auxiliary_loss_clip": 0.01119076, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.02803326, + "balance_loss_mlp": 1.04253912, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.7800794685008139, + "language_loss": 0.79121935, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81283081, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6592, + "time_per_iteration": 2.5065057277679443 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.01945305, + "balance_loss_mlp": 1.04020298, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.6584377020479992, + "language_loss": 0.69372392, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71518123, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6593, + "time_per_iteration": 2.691351890563965 + }, + { + "auxiliary_loss_clip": 0.01119923, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.02389932, + "balance_loss_mlp": 1.04100418, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.6545825162449217, + "language_loss": 0.77913815, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80072421, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6594, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.01038921, + "auxiliary_loss_mlp": 0.01002091, + "balance_loss_clip": 1.0005945, + "balance_loss_mlp": 1.0146898, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9454940833877284, + "language_loss": 0.63038307, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65079319, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.2421875, + "step": 6595, + "time_per_iteration": 6.018520355224609 + }, + { + "auxiliary_loss_clip": 0.01121925, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.02908421, + "balance_loss_mlp": 1.04294038, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.072222886004575, + "language_loss": 0.6329869, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65464759, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6596, + "time_per_iteration": 5.302752494812012 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02110457, + "balance_loss_mlp": 1.04157937, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0492451282774273, + "language_loss": 0.78553772, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80704355, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6597, + "time_per_iteration": 2.457028388977051 + }, + { + "auxiliary_loss_clip": 0.01121814, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.02558672, + "balance_loss_mlp": 1.04262114, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.95592503590265, + "language_loss": 0.67559552, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69721651, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6598, + "time_per_iteration": 2.4448981285095215 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02153933, + "balance_loss_mlp": 1.0411458, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.3323846151329235, + "language_loss": 0.78922117, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81074429, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6599, + "time_per_iteration": 2.4799394607543945 + }, + { + "auxiliary_loss_clip": 0.01117884, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.04196167, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.67964508136209, + "language_loss": 0.72716624, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74866593, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7578125, + "step": 6600, + "time_per_iteration": 2.4940543174743652 + }, + { + "auxiliary_loss_clip": 0.01115602, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01864827, + "balance_loss_mlp": 1.03997052, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 1.9442093512958227, + "language_loss": 0.85773253, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87920988, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6601, + "time_per_iteration": 2.4826369285583496 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02191091, + "balance_loss_mlp": 1.04298782, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.3202277168625054, + "language_loss": 0.70015699, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72178292, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6602, + "time_per_iteration": 2.4452199935913086 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02458847, + "balance_loss_mlp": 1.04225206, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.564497124514123, + "language_loss": 0.83408487, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85566461, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6603, + "time_per_iteration": 2.50046968460083 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.04076076, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.4733286794124776, + "language_loss": 0.72804213, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.74952281, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6604, + "time_per_iteration": 2.435645580291748 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.02366602, + "balance_loss_mlp": 1.0427258, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.694386771997249, + "language_loss": 0.82919562, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85070789, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 6605, + "time_per_iteration": 2.538792371749878 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.02053654, + "balance_loss_mlp": 1.04017544, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.515379376113219, + "language_loss": 0.73755872, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75901884, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 6606, + "time_per_iteration": 2.4766290187835693 + }, + { + "auxiliary_loss_clip": 0.0111968, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.0207423, + "balance_loss_mlp": 1.04279184, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.9669838489657716, + "language_loss": 0.73925817, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76079941, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6607, + "time_per_iteration": 2.550140380859375 + }, + { + "auxiliary_loss_clip": 0.01121372, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02045417, + "balance_loss_mlp": 1.04417753, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5241940789626238, + "language_loss": 0.67978024, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70133507, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6608, + "time_per_iteration": 2.70333194732666 + }, + { + "auxiliary_loss_clip": 0.01120221, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.0263803, + "balance_loss_mlp": 1.04247403, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 2.3779993769587486, + "language_loss": 0.74649572, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76812196, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6609, + "time_per_iteration": 2.4810678958892822 + }, + { + "auxiliary_loss_clip": 0.01119236, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.01909387, + "balance_loss_mlp": 1.04284418, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 4.182923272039756, + "language_loss": 0.71530509, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73682511, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6610, + "time_per_iteration": 2.483358860015869 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01881909, + "balance_loss_mlp": 1.03868747, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6591621928280806, + "language_loss": 0.7848928, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80632162, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 6611, + "time_per_iteration": 2.4707412719726562 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.0234127, + "balance_loss_mlp": 1.04496026, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7910222988347433, + "language_loss": 0.78681552, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.80838501, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6612, + "time_per_iteration": 2.552384614944458 + }, + { + "auxiliary_loss_clip": 0.01042423, + "auxiliary_loss_mlp": 0.01023175, + "balance_loss_clip": 1.02180374, + "balance_loss_mlp": 1.01794136, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8703127674216669, + "language_loss": 0.64956641, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6702224, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.24414062, + "step": 6613, + "time_per_iteration": 2.978494882583618 + }, + { + "auxiliary_loss_clip": 0.01116625, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01683927, + "balance_loss_mlp": 1.04148316, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.0550022834902797, + "language_loss": 0.71538055, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73685759, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6614, + "time_per_iteration": 2.4898061752319336 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01644266, + "balance_loss_mlp": 1.04124689, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8540793086422767, + "language_loss": 0.81317735, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83464336, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6615, + "time_per_iteration": 2.4708592891693115 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.02074313, + "balance_loss_mlp": 1.04221725, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.466828000769562, + "language_loss": 0.67015827, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69165838, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 6616, + "time_per_iteration": 2.4453790187835693 + }, + { + "auxiliary_loss_clip": 0.01120268, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02032995, + "balance_loss_mlp": 1.04185963, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.097035382924748, + "language_loss": 0.83857769, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86012185, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6617, + "time_per_iteration": 2.4740309715270996 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04305041, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6139116519566428, + "language_loss": 0.72253633, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74403095, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 6618, + "time_per_iteration": 2.451362371444702 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.04263783, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.9091502235972209, + "language_loss": 0.65847683, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6800065, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.74609375, + "step": 6619, + "time_per_iteration": 2.5479021072387695 + }, + { + "auxiliary_loss_clip": 0.01041684, + "auxiliary_loss_mlp": 0.00999907, + "balance_loss_clip": 0.99843466, + "balance_loss_mlp": 1.0170114, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7720250582246381, + "language_loss": 0.58222711, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60264301, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.24609375, + "step": 6620, + "time_per_iteration": 3.0502688884735107 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.02383971, + "balance_loss_mlp": 1.04254556, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5861085047038441, + "language_loss": 0.79551339, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81703556, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 6621, + "time_per_iteration": 2.4595162868499756 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01709294, + "balance_loss_mlp": 1.04198873, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.9955210259775171, + "language_loss": 0.78070045, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80215347, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 6622, + "time_per_iteration": 2.487805128097534 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.01685607, + "balance_loss_mlp": 1.04132223, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.5290489885204759, + "language_loss": 0.75010175, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77156758, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6623, + "time_per_iteration": 2.464571714401245 + }, + { + "auxiliary_loss_clip": 0.01116211, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.02278805, + "balance_loss_mlp": 1.04220378, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.7278538768787957, + "language_loss": 0.79535556, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81688213, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6624, + "time_per_iteration": 2.4550037384033203 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.02627707, + "balance_loss_mlp": 1.04234707, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.035642441182755, + "language_loss": 0.83558613, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85720372, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6625, + "time_per_iteration": 2.456171989440918 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.02839124, + "balance_loss_mlp": 1.04085207, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.051687002705142, + "language_loss": 0.86593187, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88750064, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6626, + "time_per_iteration": 2.4335460662841797 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.02238643, + "balance_loss_mlp": 1.04094946, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3900066005878386, + "language_loss": 0.83897698, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86049473, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7421875, + "step": 6627, + "time_per_iteration": 2.4269766807556152 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.03955984, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4398151096773946, + "language_loss": 0.82760668, + "learning_rate": 2.736806725217998e-06, + "loss": 0.8491019, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6628, + "time_per_iteration": 2.529315948486328 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.027421, + "balance_loss_mlp": 1.04130399, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.8256672588255014, + "language_loss": 0.70683473, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.72839677, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6629, + "time_per_iteration": 2.5025413036346436 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.02155161, + "balance_loss_mlp": 1.04309297, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 4.278612279497538, + "language_loss": 0.80683714, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82833099, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 6630, + "time_per_iteration": 2.4792280197143555 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01485634, + "balance_loss_mlp": 1.04143131, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.8749880656247468, + "language_loss": 0.75354141, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.7749849, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6631, + "time_per_iteration": 2.417546272277832 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.0218699, + "balance_loss_mlp": 1.04246461, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.3246230169523194, + "language_loss": 0.7156167, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73713982, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 6632, + "time_per_iteration": 2.446089744567871 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.02449358, + "balance_loss_mlp": 1.03939462, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.8450465759001686, + "language_loss": 0.74742806, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76891041, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6633, + "time_per_iteration": 2.431104898452759 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01455402, + "balance_loss_mlp": 1.03961205, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.781985159362602, + "language_loss": 0.808864, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83027852, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 6634, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.01856947, + "balance_loss_mlp": 1.04252565, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.7295196741572958, + "language_loss": 0.74605262, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.7675429, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6635, + "time_per_iteration": 2.4630682468414307 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.02930093, + "balance_loss_mlp": 1.04096711, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.9670463450002986, + "language_loss": 0.66429746, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68594521, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6636, + "time_per_iteration": 2.454789876937866 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02131867, + "balance_loss_mlp": 1.0403626, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.695302941119513, + "language_loss": 0.81410646, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83558261, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6637, + "time_per_iteration": 5.387745380401611 + }, + { + "auxiliary_loss_clip": 0.01040567, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99893045, + "balance_loss_mlp": 1.0159328, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7139106827959352, + "language_loss": 0.53211641, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55252659, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.01513672, + "router_z_loss_mlp": 0.24609375, + "step": 6638, + "time_per_iteration": 4.465191125869751 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.02260959, + "balance_loss_mlp": 1.04064405, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.9403504228046689, + "language_loss": 0.75377512, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77527201, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6639, + "time_per_iteration": 2.4947104454040527 + }, + { + "auxiliary_loss_clip": 0.01112086, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.01788926, + "balance_loss_mlp": 1.04078937, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.632879790681491, + "language_loss": 0.76217377, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78360093, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 6640, + "time_per_iteration": 2.524815320968628 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.03855717, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.5962495804033794, + "language_loss": 0.82264209, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84414506, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6641, + "time_per_iteration": 2.4753921031951904 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01891971, + "balance_loss_mlp": 1.04188418, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.015070946619467, + "language_loss": 0.7685014, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78999245, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6642, + "time_per_iteration": 2.431239604949951 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.019642, + "balance_loss_mlp": 1.03963089, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.2960488262105145, + "language_loss": 0.7247656, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74624097, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6643, + "time_per_iteration": 2.4759740829467773 + }, + { + "auxiliary_loss_clip": 0.01115242, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.0214113, + "balance_loss_mlp": 1.04014993, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.5171926718970592, + "language_loss": 0.65988386, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68139005, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6644, + "time_per_iteration": 2.437404155731201 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.0235281, + "balance_loss_mlp": 1.0386616, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.956427678643188, + "language_loss": 0.78470129, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80620331, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6645, + "time_per_iteration": 2.44826078414917 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02034187, + "balance_loss_mlp": 1.04042077, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.409098570486763, + "language_loss": 0.69889182, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72038329, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6646, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.03869605, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7478077072518943, + "language_loss": 0.72165501, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74314553, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6647, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.03874063, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4937426139380796, + "language_loss": 0.74371958, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76518434, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 6648, + "time_per_iteration": 2.4970345497131348 + }, + { + "auxiliary_loss_clip": 0.01115329, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.02300286, + "balance_loss_mlp": 1.04061389, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.209642859907432, + "language_loss": 0.66124469, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68276298, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6649, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.02280378, + "balance_loss_mlp": 1.0420115, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.57860522688022, + "language_loss": 0.75273359, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77425814, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6650, + "time_per_iteration": 2.5091254711151123 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.03905869, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.4583647344722164, + "language_loss": 0.71954048, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74104279, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 6651, + "time_per_iteration": 2.4820897579193115 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02220726, + "balance_loss_mlp": 1.03815126, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.787132664616244, + "language_loss": 0.72906494, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75055599, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6652, + "time_per_iteration": 2.4568119049072266 + }, + { + "auxiliary_loss_clip": 0.01039541, + "auxiliary_loss_mlp": 0.00999581, + "balance_loss_clip": 0.99819815, + "balance_loss_mlp": 1.01483345, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8299860195083637, + "language_loss": 0.61066198, + "learning_rate": 2.727746297241862e-06, + "loss": 0.63105321, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24707031, + "step": 6653, + "time_per_iteration": 3.0071723461151123 + }, + { + "auxiliary_loss_clip": 0.01113323, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.02607179, + "balance_loss_mlp": 1.04303741, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.127427836980077, + "language_loss": 0.67038172, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.6919046, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 6654, + "time_per_iteration": 2.442049026489258 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.02741051, + "balance_loss_mlp": 1.03887355, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.299433298478917, + "language_loss": 0.89737195, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91889656, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.74609375, + "step": 6655, + "time_per_iteration": 2.4836323261260986 + }, + { + "auxiliary_loss_clip": 0.01110377, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.01865685, + "balance_loss_mlp": 1.04077053, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5855954082229138, + "language_loss": 0.73497427, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75637716, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6953125, + "step": 6656, + "time_per_iteration": 2.5071847438812256 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.03248513, + "balance_loss_mlp": 1.04179835, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.4675228136273628, + "language_loss": 0.7344414, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75607085, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6657, + "time_per_iteration": 2.575587034225464 + }, + { + "auxiliary_loss_clip": 0.01116565, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.02432823, + "balance_loss_mlp": 1.04162562, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4527474123065993, + "language_loss": 0.79588759, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81743878, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6658, + "time_per_iteration": 2.7093567848205566 + }, + { + "auxiliary_loss_clip": 0.01115311, + "auxiliary_loss_mlp": 0.01041911, + "balance_loss_clip": 1.02917993, + "balance_loss_mlp": 1.0406971, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.8904694620172307, + "language_loss": 0.77345288, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79502499, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6659, + "time_per_iteration": 2.5323445796966553 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.01713443, + "balance_loss_mlp": 1.03853416, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.7715585064718242, + "language_loss": 0.72642064, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.7477653, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 6660, + "time_per_iteration": 2.4459004402160645 + }, + { + "auxiliary_loss_clip": 0.01113964, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.02753496, + "balance_loss_mlp": 1.04069686, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.7053131194953803, + "language_loss": 0.70897067, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73050702, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.73046875, + "step": 6661, + "time_per_iteration": 2.5339720249176025 + }, + { + "auxiliary_loss_clip": 0.011183, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02808094, + "balance_loss_mlp": 1.04304504, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.7756888608898216, + "language_loss": 0.75688839, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77848476, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6662, + "time_per_iteration": 2.4546353816986084 + }, + { + "auxiliary_loss_clip": 0.01115994, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.01979184, + "balance_loss_mlp": 1.03956914, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.0032115325237076, + "language_loss": 0.66019243, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68168688, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6663, + "time_per_iteration": 2.4437708854675293 + }, + { + "auxiliary_loss_clip": 0.01115313, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.02409601, + "balance_loss_mlp": 1.0406127, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.5671112933527542, + "language_loss": 0.85808247, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87960517, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6664, + "time_per_iteration": 2.423644781112671 + }, + { + "auxiliary_loss_clip": 0.01116399, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.02062321, + "balance_loss_mlp": 1.04155052, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 1.9940684324093096, + "language_loss": 0.84890211, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87040305, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6665, + "time_per_iteration": 2.4386377334594727 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.04381645, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.7199178144884215, + "language_loss": 0.78264785, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.8041926, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6666, + "time_per_iteration": 2.434093952178955 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02121019, + "balance_loss_mlp": 1.04240537, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.6354204552723763, + "language_loss": 0.73558462, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75712276, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6667, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.01117838, + "auxiliary_loss_mlp": 0.0104414, + "balance_loss_clip": 1.02944148, + "balance_loss_mlp": 1.04147649, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.370510933760038, + "language_loss": 0.75832677, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77994657, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6668, + "time_per_iteration": 2.475261688232422 + }, + { + "auxiliary_loss_clip": 0.0111899, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04511833, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.7348003262037657, + "language_loss": 0.82309943, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84467208, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73828125, + "step": 6669, + "time_per_iteration": 2.530458927154541 + }, + { + "auxiliary_loss_clip": 0.0103961, + "auxiliary_loss_mlp": 0.01001267, + "balance_loss_clip": 0.99993151, + "balance_loss_mlp": 1.01565075, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.698912500879513, + "language_loss": 0.53386176, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55427051, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 6670, + "time_per_iteration": 3.247837781906128 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02476037, + "balance_loss_mlp": 1.0415678, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.8543411810419943, + "language_loss": 0.88405877, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.9055897, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6671, + "time_per_iteration": 2.5657830238342285 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.02088797, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.813982967664466, + "language_loss": 0.78926146, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81076294, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 6672, + "time_per_iteration": 2.444209337234497 + }, + { + "auxiliary_loss_clip": 0.01110996, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01829374, + "balance_loss_mlp": 1.03889108, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 1.9086088279717175, + "language_loss": 0.63218224, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65360266, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 6673, + "time_per_iteration": 2.577171564102173 + }, + { + "auxiliary_loss_clip": 0.01114754, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.02027059, + "balance_loss_mlp": 1.0399313, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4529148407259798, + "language_loss": 0.80390126, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82538271, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6674, + "time_per_iteration": 2.5402464866638184 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04199886, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.6082453610380574, + "language_loss": 0.82641548, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84791422, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6675, + "time_per_iteration": 2.4605085849761963 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.0189873, + "balance_loss_mlp": 1.04002738, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.7135878896985557, + "language_loss": 0.93308246, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95454895, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6676, + "time_per_iteration": 2.496168851852417 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.02516031, + "balance_loss_mlp": 1.04100275, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 4.942241320167032, + "language_loss": 0.79622304, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81783295, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 6677, + "time_per_iteration": 2.4565844535827637 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.01977801, + "balance_loss_mlp": 1.0429368, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 3.7422980142657374, + "language_loss": 0.83766311, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85915917, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 6678, + "time_per_iteration": 4.12173318862915 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.02480578, + "balance_loss_mlp": 1.04150224, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.988298740497095, + "language_loss": 0.63948399, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66100478, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6679, + "time_per_iteration": 5.297976016998291 + }, + { + "auxiliary_loss_clip": 0.01112719, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01653934, + "balance_loss_mlp": 1.04000115, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.456514191681199, + "language_loss": 0.78654617, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80796885, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6680, + "time_per_iteration": 2.467042922973633 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.04160023, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.6886011670643926, + "language_loss": 0.75628668, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77793747, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6681, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01727676, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.058228157074571, + "language_loss": 0.64001781, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66150093, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 6682, + "time_per_iteration": 2.4423694610595703 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01953304, + "balance_loss_mlp": 1.03868985, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.6867457181896433, + "language_loss": 0.73334014, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75481766, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6683, + "time_per_iteration": 2.5543196201324463 + }, + { + "auxiliary_loss_clip": 0.01113172, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.02971554, + "balance_loss_mlp": 1.03814459, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.6157462356379846, + "language_loss": 0.73054385, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75210762, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6684, + "time_per_iteration": 2.584984302520752 + }, + { + "auxiliary_loss_clip": 0.01036703, + "auxiliary_loss_mlp": 0.01002873, + "balance_loss_clip": 1.0016098, + "balance_loss_mlp": 1.01262808, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8051502477983452, + "language_loss": 0.60442972, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62482548, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.24023438, + "step": 6685, + "time_per_iteration": 3.2001583576202393 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.0410161, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 2.1343445795660956, + "language_loss": 0.69979215, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72130144, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6686, + "time_per_iteration": 2.486487627029419 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01784527, + "balance_loss_mlp": 1.03917289, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4076322562781298, + "language_loss": 0.74622524, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76766562, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6687, + "time_per_iteration": 2.4854915142059326 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02219915, + "balance_loss_mlp": 1.04146934, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.852699339351418, + "language_loss": 0.70648831, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72802114, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 6688, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01117224, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.02558923, + "balance_loss_mlp": 1.0390867, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.7360862235805987, + "language_loss": 0.64509618, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.6666646, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6689, + "time_per_iteration": 2.5217337608337402 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.02059698, + "balance_loss_mlp": 1.03956485, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.322807889185569, + "language_loss": 0.7306338, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75214565, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6690, + "time_per_iteration": 2.421478509902954 + }, + { + "auxiliary_loss_clip": 0.01114039, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.02357256, + "balance_loss_mlp": 1.03967643, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.4867559931284213, + "language_loss": 0.74789405, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76940262, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6691, + "time_per_iteration": 2.5322606563568115 + }, + { + "auxiliary_loss_clip": 0.01119421, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.02530634, + "balance_loss_mlp": 1.04281604, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5836527032457117, + "language_loss": 0.72676492, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74834728, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6692, + "time_per_iteration": 2.486466407775879 + }, + { + "auxiliary_loss_clip": 0.01115579, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.02444792, + "balance_loss_mlp": 1.039814, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 1.7516389520719526, + "language_loss": 0.83851349, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86004555, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6693, + "time_per_iteration": 2.5068037509918213 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.02325296, + "balance_loss_mlp": 1.04313457, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.699829604816944, + "language_loss": 0.71295136, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73450321, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6694, + "time_per_iteration": 2.5704145431518555 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.03981924, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.0155422945498223, + "language_loss": 0.67754763, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69907242, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 6695, + "time_per_iteration": 2.4664762020111084 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01736653, + "balance_loss_mlp": 1.03826809, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.459399840574827, + "language_loss": 0.79355788, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81499356, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6696, + "time_per_iteration": 2.883577346801758 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.02291059, + "balance_loss_mlp": 1.04224885, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.6846278858215487, + "language_loss": 0.70899725, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73055387, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6697, + "time_per_iteration": 2.4922237396240234 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.02555108, + "balance_loss_mlp": 1.04018331, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.4926240162149162, + "language_loss": 0.61456931, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63608658, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 6698, + "time_per_iteration": 2.4892961978912354 + }, + { + "auxiliary_loss_clip": 0.01115982, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.01840675, + "balance_loss_mlp": 1.03997493, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.8414423865451628, + "language_loss": 0.76245844, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78393662, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6699, + "time_per_iteration": 2.4576990604400635 + }, + { + "auxiliary_loss_clip": 0.01110513, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.0171113, + "balance_loss_mlp": 1.03855538, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.5844300780087603, + "language_loss": 0.80345184, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82486057, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 6700, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02886689, + "balance_loss_mlp": 1.03814912, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 2.2662820598104227, + "language_loss": 0.74967611, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77129138, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6701, + "time_per_iteration": 2.5474703311920166 + }, + { + "auxiliary_loss_clip": 0.01112492, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.02598631, + "balance_loss_mlp": 1.03800225, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5176135502188826, + "language_loss": 0.65989178, + "learning_rate": 2.709938026276208e-06, + "loss": 0.6814059, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6702, + "time_per_iteration": 2.5158073902130127 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.02409053, + "balance_loss_mlp": 1.03949153, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.577366316976287, + "language_loss": 0.66134161, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68289495, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6703, + "time_per_iteration": 2.4974560737609863 + }, + { + "auxiliary_loss_clip": 0.01119665, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04285431, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.6870156282512245, + "language_loss": 0.82005399, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84160155, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6704, + "time_per_iteration": 2.5040299892425537 + }, + { + "auxiliary_loss_clip": 0.01117271, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.02110291, + "balance_loss_mlp": 1.03974569, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 2.5805971030690578, + "language_loss": 0.73468685, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75620878, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6705, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.02208292, + "balance_loss_mlp": 1.03979278, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.712587367637223, + "language_loss": 0.66288096, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68436766, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 6706, + "time_per_iteration": 2.4254331588745117 + }, + { + "auxiliary_loss_clip": 0.01114724, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.02619088, + "balance_loss_mlp": 1.03957605, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.3675174561755612, + "language_loss": 0.71328777, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73483431, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6707, + "time_per_iteration": 2.5285422801971436 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02169156, + "balance_loss_mlp": 1.03867698, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.4937460074112463, + "language_loss": 0.80080485, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82226288, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 6708, + "time_per_iteration": 2.4664134979248047 + }, + { + "auxiliary_loss_clip": 0.01117266, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.02170968, + "balance_loss_mlp": 1.03778601, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.013607365016592, + "language_loss": 0.82944471, + "learning_rate": 2.70738867321606e-06, + "loss": 0.8509779, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 6709, + "time_per_iteration": 2.461277723312378 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02454066, + "balance_loss_mlp": 1.04260051, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.4165591336273893, + "language_loss": 0.71036613, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73194492, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6710, + "time_per_iteration": 2.5579922199249268 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.02461195, + "balance_loss_mlp": 1.04049003, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 1.9864485278108117, + "language_loss": 0.85366702, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87521464, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6711, + "time_per_iteration": 2.511082410812378 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.02038825, + "balance_loss_mlp": 1.04072142, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.9069456024701996, + "language_loss": 0.76074743, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78225803, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6712, + "time_per_iteration": 2.419672727584839 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.02364349, + "balance_loss_mlp": 1.04200089, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 2.1216019240756765, + "language_loss": 0.78926992, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81082511, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6713, + "time_per_iteration": 2.520109176635742 + }, + { + "auxiliary_loss_clip": 0.01113814, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01912916, + "balance_loss_mlp": 1.03721881, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8945946455640421, + "language_loss": 0.88507473, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6714, + "time_per_iteration": 2.4076859951019287 + }, + { + "auxiliary_loss_clip": 0.01115997, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.02688611, + "balance_loss_mlp": 1.04049468, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 2.116493132238348, + "language_loss": 0.69099832, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71256685, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6715, + "time_per_iteration": 2.4805076122283936 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.01785374, + "balance_loss_mlp": 1.03944981, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 7.495764991407429, + "language_loss": 0.76919901, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79069078, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6716, + "time_per_iteration": 2.4244720935821533 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01037832, + "balance_loss_clip": 1.02485037, + "balance_loss_mlp": 1.03992844, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.8407988101654404, + "language_loss": 0.76272923, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78423738, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6717, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01036903, + "auxiliary_loss_mlp": 0.01007011, + "balance_loss_clip": 1.00571179, + "balance_loss_mlp": 1.01217222, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.940083561343906, + "language_loss": 0.60735488, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62779397, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24707031, + "step": 6718, + "time_per_iteration": 2.9391937255859375 + }, + { + "auxiliary_loss_clip": 0.01120577, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.02140856, + "balance_loss_mlp": 1.04066229, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.1744660134680776, + "language_loss": 0.74794078, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76950943, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6719, + "time_per_iteration": 2.4630534648895264 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01043036, + "balance_loss_clip": 1.02762175, + "balance_loss_mlp": 1.0402683, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.5217598497166422, + "language_loss": 0.81235194, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83395278, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6720, + "time_per_iteration": 6.786137104034424 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.01644325, + "balance_loss_mlp": 1.0376296, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.933287838521713, + "language_loss": 0.7720241, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79346573, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6721, + "time_per_iteration": 3.9910030364990234 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01460266, + "balance_loss_mlp": 1.04090941, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 2.3110658804222566, + "language_loss": 0.7264756, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74787009, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6722, + "time_per_iteration": 2.5377390384674072 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.02270842, + "balance_loss_mlp": 1.03896952, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.7096890061042316, + "language_loss": 0.65681767, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67831796, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6723, + "time_per_iteration": 2.429657220840454 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.02834117, + "balance_loss_mlp": 1.04056454, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.4515559648574707, + "language_loss": 0.74074364, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76235622, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6724, + "time_per_iteration": 2.485166072845459 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03799534, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.783988932028688, + "language_loss": 0.74764013, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76908118, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 6725, + "time_per_iteration": 2.5141966342926025 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.02024531, + "balance_loss_mlp": 1.03874183, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.8781247850607437, + "language_loss": 0.76928914, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79076171, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6726, + "time_per_iteration": 2.685609817504883 + }, + { + "auxiliary_loss_clip": 0.01115432, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.02004611, + "balance_loss_mlp": 1.03858769, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.1780936913008646, + "language_loss": 0.81682861, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83832943, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6727, + "time_per_iteration": 2.4221317768096924 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02088916, + "balance_loss_mlp": 1.0411514, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.0089286405461246, + "language_loss": 0.85300338, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87451458, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6728, + "time_per_iteration": 2.4719340801239014 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02059257, + "balance_loss_mlp": 1.04241705, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.6690883830899332, + "language_loss": 0.81804991, + "learning_rate": 2.700097580951786e-06, + "loss": 0.8395654, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6729, + "time_per_iteration": 2.4482905864715576 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.02092838, + "balance_loss_mlp": 1.04034996, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 1.841339511320202, + "language_loss": 0.72582501, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.74731869, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6730, + "time_per_iteration": 2.537121295928955 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.0216732, + "balance_loss_mlp": 1.04037821, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6090983176176454, + "language_loss": 0.67394918, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69545048, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6731, + "time_per_iteration": 2.645958423614502 + }, + { + "auxiliary_loss_clip": 0.01114135, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.02196193, + "balance_loss_mlp": 1.03986645, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.6078062973222544, + "language_loss": 0.74067897, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76216894, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6732, + "time_per_iteration": 2.5182886123657227 + }, + { + "auxiliary_loss_clip": 0.01114756, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01650739, + "balance_loss_mlp": 1.04178488, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.830865433765548, + "language_loss": 0.7690779, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79051435, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 6733, + "time_per_iteration": 2.430748701095581 + }, + { + "auxiliary_loss_clip": 0.01120623, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.04164028, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.8916182343646197, + "language_loss": 0.7649287, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78652358, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6734, + "time_per_iteration": 2.507070541381836 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.01729572, + "balance_loss_mlp": 1.04258728, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.227264135735927, + "language_loss": 0.65026176, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67178231, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7890625, + "step": 6735, + "time_per_iteration": 2.4677040576934814 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.02154267, + "balance_loss_mlp": 1.04025424, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.9551652085107198, + "language_loss": 0.83177966, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85325354, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 6736, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.01121161, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.02344942, + "balance_loss_mlp": 1.04291666, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.699075737504615, + "language_loss": 0.7520684, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77365613, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6737, + "time_per_iteration": 2.510906457901001 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.02408242, + "balance_loss_mlp": 1.04335642, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.288492776548484, + "language_loss": 0.71790028, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73945308, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6738, + "time_per_iteration": 2.514575481414795 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.01845288, + "balance_loss_mlp": 1.04022241, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.003378473366394, + "language_loss": 0.75169361, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77315164, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6739, + "time_per_iteration": 2.4737000465393066 + }, + { + "auxiliary_loss_clip": 0.01119431, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02107763, + "balance_loss_mlp": 1.04296541, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.7865413260400147, + "language_loss": 0.73943472, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76097751, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6740, + "time_per_iteration": 2.5434296131134033 + }, + { + "auxiliary_loss_clip": 0.0111643, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0190562, + "balance_loss_mlp": 1.04310441, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5350516452213203, + "language_loss": 0.77179801, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79328907, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6741, + "time_per_iteration": 2.473451852798462 + }, + { + "auxiliary_loss_clip": 0.01120883, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.04359269, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8990417013226273, + "language_loss": 0.70827335, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.72983992, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 6742, + "time_per_iteration": 2.4797537326812744 + }, + { + "auxiliary_loss_clip": 0.01121445, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.01517677, + "balance_loss_mlp": 1.04446578, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.180199258846301, + "language_loss": 0.72242743, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74393857, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6743, + "time_per_iteration": 2.409444808959961 + }, + { + "auxiliary_loss_clip": 0.0112179, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02128196, + "balance_loss_mlp": 1.04374886, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 3.287949139408167, + "language_loss": 0.70554733, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72712195, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6744, + "time_per_iteration": 2.475775957107544 + }, + { + "auxiliary_loss_clip": 0.01116341, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.02154207, + "balance_loss_mlp": 1.04163671, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.534678646828984, + "language_loss": 0.79982138, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82133317, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6745, + "time_per_iteration": 2.492379903793335 + }, + { + "auxiliary_loss_clip": 0.01120523, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.0463028, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.8557240822638386, + "language_loss": 0.66450787, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68604791, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6746, + "time_per_iteration": 2.4547531604766846 + }, + { + "auxiliary_loss_clip": 0.01118105, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01573384, + "balance_loss_mlp": 1.04319298, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.5006534813974708, + "language_loss": 0.5713616, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59283465, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6747, + "time_per_iteration": 2.627912998199463 + }, + { + "auxiliary_loss_clip": 0.01119274, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.04399908, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.605109327396707, + "language_loss": 0.8454957, + "learning_rate": 2.693161205655089e-06, + "loss": 0.8671056, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75390625, + "step": 6748, + "time_per_iteration": 2.5783345699310303 + }, + { + "auxiliary_loss_clip": 0.01120452, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02210689, + "balance_loss_mlp": 1.04356313, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.1468645636667705, + "language_loss": 0.81288636, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83444953, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6749, + "time_per_iteration": 2.433042049407959 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.02395105, + "balance_loss_mlp": 1.04512405, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.6093122324869749, + "language_loss": 0.75051296, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77209336, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6750, + "time_per_iteration": 2.500444173812866 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.01919341, + "balance_loss_mlp": 1.04114318, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.1309201825140662, + "language_loss": 0.73826647, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75980842, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6751, + "time_per_iteration": 2.4808826446533203 + }, + { + "auxiliary_loss_clip": 0.01120759, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.04471755, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 5.559089751596236, + "language_loss": 0.6666553, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68818188, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6752, + "time_per_iteration": 2.568223714828491 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.02189183, + "balance_loss_mlp": 1.04458666, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.70284971706228, + "language_loss": 0.70600617, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72761416, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 6753, + "time_per_iteration": 2.696746587753296 + }, + { + "auxiliary_loss_clip": 0.01119466, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.02145791, + "balance_loss_mlp": 1.04105067, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.1929566205477804, + "language_loss": 0.71584499, + "learning_rate": 2.690968795494699e-06, + "loss": 0.73739791, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 6754, + "time_per_iteration": 2.49405837059021 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02568889, + "balance_loss_mlp": 1.04273617, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7112877357577985, + "language_loss": 0.82864529, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85024333, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6755, + "time_per_iteration": 2.4666147232055664 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.02599001, + "balance_loss_mlp": 1.04292035, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.484337354822898, + "language_loss": 0.70812732, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72976315, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6756, + "time_per_iteration": 2.539236307144165 + }, + { + "auxiliary_loss_clip": 0.01120038, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02724361, + "balance_loss_mlp": 1.04106975, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.6617053894159006, + "language_loss": 0.79047221, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81209117, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6757, + "time_per_iteration": 2.4614784717559814 + }, + { + "auxiliary_loss_clip": 0.01121935, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.01865852, + "balance_loss_mlp": 1.04454553, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.5700268222495364, + "language_loss": 0.7851724, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.806723, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6758, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.01692557, + "balance_loss_mlp": 1.04113591, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.1344538838988454, + "language_loss": 0.88668954, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90817189, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 6759, + "time_per_iteration": 2.410628318786621 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.01986527, + "balance_loss_mlp": 1.04366982, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.0728742760332546, + "language_loss": 0.63888443, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66042268, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6760, + "time_per_iteration": 2.553819417953491 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.01796103, + "balance_loss_mlp": 1.0422858, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4242582463540345, + "language_loss": 0.75060493, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77212334, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6761, + "time_per_iteration": 2.520904302597046 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02029395, + "balance_loss_mlp": 1.04054725, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4265975037167853, + "language_loss": 0.70109248, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72254199, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6762, + "time_per_iteration": 6.884980916976929 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.01893568, + "balance_loss_mlp": 1.04316521, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 2.223786523351799, + "language_loss": 0.73175049, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75325227, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6763, + "time_per_iteration": 3.8783130645751953 + }, + { + "auxiliary_loss_clip": 0.01119915, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02200174, + "balance_loss_mlp": 1.04246914, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.725584811158307, + "language_loss": 0.6908524, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71241343, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6764, + "time_per_iteration": 2.4408676624298096 + }, + { + "auxiliary_loss_clip": 0.01123793, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02481747, + "balance_loss_mlp": 1.04485261, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.20566464671706, + "language_loss": 0.91570717, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93734777, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6765, + "time_per_iteration": 2.4904191493988037 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.02672434, + "balance_loss_mlp": 1.04374599, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.279622168201086, + "language_loss": 0.78459442, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80623996, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6766, + "time_per_iteration": 2.4594480991363525 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02457929, + "balance_loss_mlp": 1.04144108, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.9487336600068845, + "language_loss": 0.76438922, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78597391, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6767, + "time_per_iteration": 2.4127700328826904 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.0442543, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.7431301492707811, + "language_loss": 0.77572781, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79728222, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6768, + "time_per_iteration": 2.5312347412109375 + }, + { + "auxiliary_loss_clip": 0.01118014, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.01861525, + "balance_loss_mlp": 1.04248428, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.7094466648077935, + "language_loss": 0.87585759, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89735663, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6769, + "time_per_iteration": 2.434276580810547 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.02028155, + "balance_loss_mlp": 1.04659963, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.8989360481904207, + "language_loss": 0.80883789, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83036822, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 6770, + "time_per_iteration": 2.4768316745758057 + }, + { + "auxiliary_loss_clip": 0.01121746, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01699948, + "balance_loss_mlp": 1.04308331, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6240016049823844, + "language_loss": 0.80161405, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82315195, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78515625, + "step": 6771, + "time_per_iteration": 2.4864251613616943 + }, + { + "auxiliary_loss_clip": 0.01116481, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.02476382, + "balance_loss_mlp": 1.04181063, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.5515756087522081, + "language_loss": 0.76267636, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.7842294, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 6772, + "time_per_iteration": 2.5570874214172363 + }, + { + "auxiliary_loss_clip": 0.01116059, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.0259316, + "balance_loss_mlp": 1.04014397, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.6577007729475706, + "language_loss": 0.81418705, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83575237, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6773, + "time_per_iteration": 2.4311835765838623 + }, + { + "auxiliary_loss_clip": 0.01040526, + "auxiliary_loss_mlp": 0.01005684, + "balance_loss_clip": 1.00416398, + "balance_loss_mlp": 1.01639521, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8363890316728796, + "language_loss": 0.6434871, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66394925, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.24121094, + "step": 6774, + "time_per_iteration": 2.987610340118408 + }, + { + "auxiliary_loss_clip": 0.01119504, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02136981, + "balance_loss_mlp": 1.04115796, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 13.875946104557459, + "language_loss": 0.72097111, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74252421, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6775, + "time_per_iteration": 2.5014185905456543 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.02736115, + "balance_loss_mlp": 1.04123604, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.3741783359801052, + "language_loss": 0.77956975, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80116785, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6776, + "time_per_iteration": 2.484910488128662 + }, + { + "auxiliary_loss_clip": 0.0112306, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.0240761, + "balance_loss_mlp": 1.04408884, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.6337418369090404, + "language_loss": 0.79015827, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81177437, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6777, + "time_per_iteration": 2.4701852798461914 + }, + { + "auxiliary_loss_clip": 0.01039569, + "auxiliary_loss_mlp": 0.01005822, + "balance_loss_clip": 1.00424814, + "balance_loss_mlp": 1.01542926, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6828077953919364, + "language_loss": 0.5320037, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55245763, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.2421875, + "step": 6778, + "time_per_iteration": 3.117647647857666 + }, + { + "auxiliary_loss_clip": 0.01119188, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.02658224, + "balance_loss_mlp": 1.04310179, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.2984205071258272, + "language_loss": 0.82367444, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84527671, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 6779, + "time_per_iteration": 2.4653449058532715 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.02289438, + "balance_loss_mlp": 1.0422008, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.7439910283418456, + "language_loss": 0.7628178, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78437853, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 6780, + "time_per_iteration": 2.5031514167785645 + }, + { + "auxiliary_loss_clip": 0.01114202, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.04146945, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 2.107375049179959, + "language_loss": 0.65990937, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68135262, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 6781, + "time_per_iteration": 2.431759834289551 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.02233076, + "balance_loss_mlp": 1.04050446, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 2.315782733130647, + "language_loss": 0.71046883, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73201013, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6782, + "time_per_iteration": 2.567138433456421 + }, + { + "auxiliary_loss_clip": 0.01117461, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.01789367, + "balance_loss_mlp": 1.04120076, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.7193598407967954, + "language_loss": 0.82066965, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84215903, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 6783, + "time_per_iteration": 2.46891188621521 + }, + { + "auxiliary_loss_clip": 0.01116877, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.02065194, + "balance_loss_mlp": 1.04063141, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6682285001774693, + "language_loss": 0.80728561, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82880187, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6784, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01122913, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.0258038, + "balance_loss_mlp": 1.04271793, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.7628578717327703, + "language_loss": 0.65640736, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67805004, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 6785, + "time_per_iteration": 2.46173357963562 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01650357, + "balance_loss_mlp": 1.0397855, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 1.9756209352263352, + "language_loss": 0.79518569, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81664044, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6786, + "time_per_iteration": 2.430769205093384 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.02349889, + "balance_loss_mlp": 1.04094195, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.5131366331092475, + "language_loss": 0.81249726, + "learning_rate": 2.678893759192982e-06, + "loss": 0.8340168, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6787, + "time_per_iteration": 2.4589040279388428 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.01907516, + "balance_loss_mlp": 1.04059005, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9559544882723985, + "language_loss": 0.67917293, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70066231, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75390625, + "step": 6788, + "time_per_iteration": 2.4450576305389404 + }, + { + "auxiliary_loss_clip": 0.01116018, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02644002, + "balance_loss_mlp": 1.03975677, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 2.2689407766698584, + "language_loss": 0.6605472, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68211812, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6789, + "time_per_iteration": 2.6358134746551514 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02133691, + "balance_loss_mlp": 1.0408318, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.683929923970831, + "language_loss": 0.60006517, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62159079, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6790, + "time_per_iteration": 2.4339373111724854 + }, + { + "auxiliary_loss_clip": 0.01117331, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.04277873, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 3.0836688581186538, + "language_loss": 0.69763649, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71922624, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 6791, + "time_per_iteration": 2.3970839977264404 + }, + { + "auxiliary_loss_clip": 0.01036371, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 0.99973643, + "balance_loss_mlp": 1.01245427, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7479961411193888, + "language_loss": 0.59600538, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61638063, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.23828125, + "step": 6792, + "time_per_iteration": 3.0660579204559326 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.03205419, + "balance_loss_mlp": 1.04478419, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 2.1865523890186975, + "language_loss": 0.8017205, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82341087, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6793, + "time_per_iteration": 2.4930570125579834 + }, + { + "auxiliary_loss_clip": 0.01117695, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.0203917, + "balance_loss_mlp": 1.04145718, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.7948567342085118, + "language_loss": 0.85040581, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87193215, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6794, + "time_per_iteration": 2.500248670578003 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.02316737, + "balance_loss_mlp": 1.04290628, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.6403079662436217, + "language_loss": 0.79991007, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82147229, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6795, + "time_per_iteration": 2.4969587326049805 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.01903319, + "balance_loss_mlp": 1.0415107, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 3.0496031094407767, + "language_loss": 0.69604456, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7176007, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6796, + "time_per_iteration": 2.415790319442749 + }, + { + "auxiliary_loss_clip": 0.01115637, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.02436018, + "balance_loss_mlp": 1.04028058, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6154855191434097, + "language_loss": 0.77814329, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.799676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6797, + "time_per_iteration": 2.4960498809814453 + }, + { + "auxiliary_loss_clip": 0.01114842, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_clip": 1.03020835, + "balance_loss_mlp": 1.03869152, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 2.268592052790042, + "language_loss": 0.85668063, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87826383, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 6798, + "time_per_iteration": 2.4271299839019775 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.02341557, + "balance_loss_mlp": 1.04205322, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.4625848333242037, + "language_loss": 0.8396889, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86120105, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 6799, + "time_per_iteration": 2.5059525966644287 + }, + { + "auxiliary_loss_clip": 0.01118535, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_clip": 1.02861547, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.2336787226224453, + "language_loss": 0.83352369, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85514188, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6800, + "time_per_iteration": 2.441771984100342 + }, + { + "auxiliary_loss_clip": 0.01118367, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.02665734, + "balance_loss_mlp": 1.04080248, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.815509221734431, + "language_loss": 0.74838769, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76998335, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 6801, + "time_per_iteration": 2.4573957920074463 + }, + { + "auxiliary_loss_clip": 0.01118841, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.02418303, + "balance_loss_mlp": 1.04115379, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 3.5876275394170682, + "language_loss": 0.79983771, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8214165, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6802, + "time_per_iteration": 2.4583706855773926 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04269981, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 1.9920926766799116, + "language_loss": 0.75564265, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77726078, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7890625, + "step": 6803, + "time_per_iteration": 3.8293817043304443 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.02296996, + "balance_loss_mlp": 1.04163659, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.8273723177462575, + "language_loss": 0.78676009, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80831397, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6804, + "time_per_iteration": 5.276589393615723 + }, + { + "auxiliary_loss_clip": 0.01121753, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.02486575, + "balance_loss_mlp": 1.04170704, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.824409853433396, + "language_loss": 0.74958569, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77119195, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6805, + "time_per_iteration": 2.4856061935424805 + }, + { + "auxiliary_loss_clip": 0.01121334, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.02701998, + "balance_loss_mlp": 1.04323924, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.6270528279533119, + "language_loss": 0.79471934, + "learning_rate": 2.671928716175804e-06, + "loss": 0.816342, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6806, + "time_per_iteration": 2.4999823570251465 + }, + { + "auxiliary_loss_clip": 0.01120343, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.02002871, + "balance_loss_mlp": 1.04182625, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.8904572172377134, + "language_loss": 0.72131455, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74286067, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6807, + "time_per_iteration": 2.4900894165039062 + }, + { + "auxiliary_loss_clip": 0.01035827, + "auxiliary_loss_mlp": 0.01000695, + "balance_loss_clip": 0.99931204, + "balance_loss_mlp": 1.01169431, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8333385820049739, + "language_loss": 0.58798856, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60835379, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24121094, + "step": 6808, + "time_per_iteration": 3.1670446395874023 + }, + { + "auxiliary_loss_clip": 0.0111783, + "auxiliary_loss_mlp": 0.01047199, + "balance_loss_clip": 1.03397894, + "balance_loss_mlp": 1.04200959, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.6310291749342813, + "language_loss": 0.54454345, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56619376, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6809, + "time_per_iteration": 2.445084571838379 + }, + { + "auxiliary_loss_clip": 0.01117961, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0176785, + "balance_loss_mlp": 1.0413785, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.8964783600080724, + "language_loss": 0.83296275, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85445428, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6810, + "time_per_iteration": 2.507234573364258 + }, + { + "auxiliary_loss_clip": 0.01121577, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.02736187, + "balance_loss_mlp": 1.04350328, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.219108175656967, + "language_loss": 0.77739668, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79903591, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6811, + "time_per_iteration": 2.4652421474456787 + }, + { + "auxiliary_loss_clip": 0.01118877, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.01804352, + "balance_loss_mlp": 1.04151464, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.8555113442690365, + "language_loss": 0.69810557, + "learning_rate": 2.669727313417857e-06, + "loss": 0.7196303, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7734375, + "step": 6812, + "time_per_iteration": 2.4447555541992188 + }, + { + "auxiliary_loss_clip": 0.0111498, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.02644539, + "balance_loss_mlp": 1.03930998, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.4849650877087106, + "language_loss": 0.66131341, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68286985, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6813, + "time_per_iteration": 2.461461067199707 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.02209592, + "balance_loss_mlp": 1.04076779, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.8347983960230858, + "language_loss": 0.73899138, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76051652, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6814, + "time_per_iteration": 2.5444507598876953 + }, + { + "auxiliary_loss_clip": 0.01120309, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02035785, + "balance_loss_mlp": 1.04147315, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.162963447393967, + "language_loss": 0.65966797, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68121737, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6815, + "time_per_iteration": 2.4877898693084717 + }, + { + "auxiliary_loss_clip": 0.01116543, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.02371955, + "balance_loss_mlp": 1.04337275, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.6370882031659308, + "language_loss": 0.76553667, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78707623, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 6816, + "time_per_iteration": 2.5013954639434814 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02404261, + "balance_loss_mlp": 1.04302227, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.8457932880819463, + "language_loss": 0.81718624, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8387655, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6817, + "time_per_iteration": 2.407566785812378 + }, + { + "auxiliary_loss_clip": 0.01121536, + "auxiliary_loss_mlp": 0.01037881, + "balance_loss_clip": 1.02179992, + "balance_loss_mlp": 1.04166436, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7366839484469832, + "language_loss": 0.79938078, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82097495, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6818, + "time_per_iteration": 2.49364972114563 + }, + { + "auxiliary_loss_clip": 0.01114596, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02288651, + "balance_loss_mlp": 1.03982878, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.4683684500872527, + "language_loss": 0.65939564, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68090701, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6819, + "time_per_iteration": 2.5122451782226562 + }, + { + "auxiliary_loss_clip": 0.01123256, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.02191615, + "balance_loss_mlp": 1.04243147, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.4566856211473176, + "language_loss": 0.85411352, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87571383, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 6820, + "time_per_iteration": 2.4924051761627197 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.0189811, + "balance_loss_mlp": 1.04211807, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.9363068637508836, + "language_loss": 0.71033639, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73183382, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6821, + "time_per_iteration": 2.5236756801605225 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.03997672, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 2.2789873913326404, + "language_loss": 0.74732232, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76880419, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6822, + "time_per_iteration": 2.485173225402832 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02226305, + "balance_loss_mlp": 1.04145467, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.8990120981529888, + "language_loss": 0.7503438, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77188146, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6823, + "time_per_iteration": 2.46115779876709 + }, + { + "auxiliary_loss_clip": 0.01124707, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.02359605, + "balance_loss_mlp": 1.04229724, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 2.6227876605231986, + "language_loss": 0.73347652, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75511503, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6824, + "time_per_iteration": 2.504561185836792 + }, + { + "auxiliary_loss_clip": 0.01120752, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.01985359, + "balance_loss_mlp": 1.04105759, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 2.228764168551681, + "language_loss": 0.71601099, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.73756915, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6825, + "time_per_iteration": 2.476551055908203 + }, + { + "auxiliary_loss_clip": 0.01117579, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.02359533, + "balance_loss_mlp": 1.04292464, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9864880407367733, + "language_loss": 0.84743512, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86897874, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 6826, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.01117058, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.02066422, + "balance_loss_mlp": 1.0431006, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.962634793360081, + "language_loss": 0.66582263, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68734574, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 6827, + "time_per_iteration": 2.4629759788513184 + }, + { + "auxiliary_loss_clip": 0.01113749, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.01799607, + "balance_loss_mlp": 1.03989482, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3616881749334155, + "language_loss": 0.72346127, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74491906, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6828, + "time_per_iteration": 2.4807186126708984 + }, + { + "auxiliary_loss_clip": 0.01120586, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02777803, + "balance_loss_mlp": 1.0410856, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.900432401993592, + "language_loss": 0.83422399, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85585773, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6829, + "time_per_iteration": 2.4298055171966553 + }, + { + "auxiliary_loss_clip": 0.01114334, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.01940441, + "balance_loss_mlp": 1.03960419, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.5044787550344432, + "language_loss": 0.9002744, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92174798, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6830, + "time_per_iteration": 2.4607503414154053 + }, + { + "auxiliary_loss_clip": 0.01115903, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01268673, + "balance_loss_mlp": 1.04088628, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 2.455330668305064, + "language_loss": 0.65950698, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68092537, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6831, + "time_per_iteration": 2.4402008056640625 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02233577, + "balance_loss_mlp": 1.04019713, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.7503077174044546, + "language_loss": 0.69414657, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71564817, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6832, + "time_per_iteration": 2.4985976219177246 + }, + { + "auxiliary_loss_clip": 0.0111274, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02265322, + "balance_loss_mlp": 1.03861785, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.137055635154832, + "language_loss": 0.73675501, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75824058, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6833, + "time_per_iteration": 2.423802375793457 + }, + { + "auxiliary_loss_clip": 0.01120262, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.02648401, + "balance_loss_mlp": 1.04171228, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.6404428787043481, + "language_loss": 0.72538, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74699682, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6834, + "time_per_iteration": 2.5415680408477783 + }, + { + "auxiliary_loss_clip": 0.01119029, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.04038835, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.0754355899076717, + "language_loss": 0.71026015, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7318927, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6835, + "time_per_iteration": 2.4709722995758057 + }, + { + "auxiliary_loss_clip": 0.01118649, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.02521181, + "balance_loss_mlp": 1.04203069, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.9290870315127813, + "language_loss": 0.86998641, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89157486, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6836, + "time_per_iteration": 2.4478323459625244 + }, + { + "auxiliary_loss_clip": 0.01114601, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.02464342, + "balance_loss_mlp": 1.040609, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.7219230799083993, + "language_loss": 0.69017011, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71170223, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6837, + "time_per_iteration": 2.4600830078125 + }, + { + "auxiliary_loss_clip": 0.01116898, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.02394605, + "balance_loss_mlp": 1.04047167, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.7295939332860302, + "language_loss": 0.75087547, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77242649, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6838, + "time_per_iteration": 2.460449695587158 + }, + { + "auxiliary_loss_clip": 0.01115474, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02075219, + "balance_loss_mlp": 1.04058707, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.8679563507274572, + "language_loss": 0.82247162, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84398103, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 6839, + "time_per_iteration": 2.4339215755462646 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.01875496, + "balance_loss_mlp": 1.03766727, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.9294791670505813, + "language_loss": 0.80338049, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82482588, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6840, + "time_per_iteration": 2.464096784591675 + }, + { + "auxiliary_loss_clip": 0.01111724, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02119339, + "balance_loss_mlp": 1.03856099, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.7525143939260106, + "language_loss": 0.67515284, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.6966151, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 6841, + "time_per_iteration": 2.412872314453125 + }, + { + "auxiliary_loss_clip": 0.01035921, + "auxiliary_loss_mlp": 0.01010132, + "balance_loss_clip": 1.00880933, + "balance_loss_mlp": 1.01203704, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7700890610990695, + "language_loss": 0.5963515, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61681211, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23828125, + "step": 6842, + "time_per_iteration": 3.167282819747925 + }, + { + "auxiliary_loss_clip": 0.01111896, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.02044773, + "balance_loss_mlp": 1.04057288, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.121884132790859, + "language_loss": 0.69212461, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71357656, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 6843, + "time_per_iteration": 2.4664626121520996 + }, + { + "auxiliary_loss_clip": 0.01035393, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00222576, + "balance_loss_mlp": 1.01154804, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7178401469554447, + "language_loss": 0.53669417, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55708587, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23828125, + "step": 6844, + "time_per_iteration": 3.0998694896698 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.02213013, + "balance_loss_mlp": 1.03937054, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.6545259135728443, + "language_loss": 0.66114587, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68261302, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 6845, + "time_per_iteration": 6.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.0221113, + "balance_loss_mlp": 1.04133987, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.8380761864561301, + "language_loss": 0.70359266, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72507298, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 6846, + "time_per_iteration": 3.941171646118164 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02163363, + "balance_loss_mlp": 1.03892803, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.507800360258476, + "language_loss": 0.64964008, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67111951, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 6847, + "time_per_iteration": 2.5782458782196045 + }, + { + "auxiliary_loss_clip": 0.01112352, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02921724, + "balance_loss_mlp": 1.03790998, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.3239337291849294, + "language_loss": 0.70368952, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72524321, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6848, + "time_per_iteration": 2.552729606628418 + }, + { + "auxiliary_loss_clip": 0.01035603, + "auxiliary_loss_mlp": 0.00998835, + "balance_loss_clip": 0.99738103, + "balance_loss_mlp": 1.01178169, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8862972606407307, + "language_loss": 0.56235039, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58269477, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.23828125, + "step": 6849, + "time_per_iteration": 3.144639730453491 + }, + { + "auxiliary_loss_clip": 0.01112679, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.02255821, + "balance_loss_mlp": 1.04060721, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.58670522574793, + "language_loss": 0.76169646, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78318465, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 6850, + "time_per_iteration": 2.5668234825134277 + }, + { + "auxiliary_loss_clip": 0.01111269, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.02075291, + "balance_loss_mlp": 1.03937149, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.4904377439692653, + "language_loss": 0.67717403, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.69862366, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6851, + "time_per_iteration": 2.588646650314331 + }, + { + "auxiliary_loss_clip": 0.01116771, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.02792835, + "balance_loss_mlp": 1.03957748, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.5339755397297776, + "language_loss": 0.79547226, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81707215, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76953125, + "step": 6852, + "time_per_iteration": 2.4342472553253174 + }, + { + "auxiliary_loss_clip": 0.01120035, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.01818419, + "balance_loss_mlp": 1.04227197, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 3.302073757908878, + "language_loss": 0.78002989, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.80156463, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6853, + "time_per_iteration": 2.536959409713745 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.022156, + "balance_loss_mlp": 1.04021645, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.636675456410819, + "language_loss": 0.65871978, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68027961, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6854, + "time_per_iteration": 2.587641477584839 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.01973081, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.819965675297277, + "language_loss": 0.83530807, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85676759, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 6855, + "time_per_iteration": 2.5173020362854004 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.02333903, + "balance_loss_mlp": 1.03945315, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.701531451547931, + "language_loss": 0.7926302, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81411433, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 6856, + "time_per_iteration": 2.4496660232543945 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.021119, + "balance_loss_mlp": 1.04115629, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 6.346447490864035, + "language_loss": 0.79253089, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81403255, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6857, + "time_per_iteration": 2.454458236694336 + }, + { + "auxiliary_loss_clip": 0.01114343, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02169538, + "balance_loss_mlp": 1.03821683, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.6045712878894351, + "language_loss": 0.70696247, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72846603, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6858, + "time_per_iteration": 2.453808069229126 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.02162266, + "balance_loss_mlp": 1.04016411, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4836752505963042, + "language_loss": 0.59489501, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61638969, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6859, + "time_per_iteration": 2.6645431518554688 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.02283669, + "balance_loss_mlp": 1.03694463, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.017738864380765, + "language_loss": 0.73062313, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75211227, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6860, + "time_per_iteration": 2.4230310916900635 + }, + { + "auxiliary_loss_clip": 0.01111098, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02037311, + "balance_loss_mlp": 1.03779876, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.603033952512427, + "language_loss": 0.74057221, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76201528, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6861, + "time_per_iteration": 2.466261863708496 + }, + { + "auxiliary_loss_clip": 0.01111959, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.0190326, + "balance_loss_mlp": 1.04026282, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.017273954904035, + "language_loss": 0.79431915, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81575066, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6862, + "time_per_iteration": 2.4272851943969727 + }, + { + "auxiliary_loss_clip": 0.01115421, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.01946771, + "balance_loss_mlp": 1.04104841, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7023318630513873, + "language_loss": 0.76025152, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78173077, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6863, + "time_per_iteration": 2.491703987121582 + }, + { + "auxiliary_loss_clip": 0.01034073, + "auxiliary_loss_mlp": 0.01002883, + "balance_loss_clip": 1.00128579, + "balance_loss_mlp": 1.01038253, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.6998724627349664, + "language_loss": 0.52726007, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54762965, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.23632812, + "step": 6864, + "time_per_iteration": 3.05096173286438 + }, + { + "auxiliary_loss_clip": 0.01117449, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04010963, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.8277977271365335, + "language_loss": 0.72328234, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74480128, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6865, + "time_per_iteration": 2.5138418674468994 + }, + { + "auxiliary_loss_clip": 0.0103371, + "auxiliary_loss_mlp": 0.01003681, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00997901, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9175964026476935, + "language_loss": 0.66545808, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68583202, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.23730469, + "step": 6866, + "time_per_iteration": 2.965301513671875 + }, + { + "auxiliary_loss_clip": 0.01111664, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02442443, + "balance_loss_mlp": 1.03779757, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 1.9494269702964535, + "language_loss": 0.80854523, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8300401, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6867, + "time_per_iteration": 2.4153382778167725 + }, + { + "auxiliary_loss_clip": 0.01118424, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.02335191, + "balance_loss_mlp": 1.04141474, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.031901046820099, + "language_loss": 0.77580094, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.7973603, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 6868, + "time_per_iteration": 2.535595178604126 + }, + { + "auxiliary_loss_clip": 0.01033303, + "auxiliary_loss_mlp": 0.00999485, + "balance_loss_clip": 0.99789923, + "balance_loss_mlp": 1.0095768, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8413704541135547, + "language_loss": 0.5779494, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59827721, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23730469, + "step": 6869, + "time_per_iteration": 2.8164174556732178 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02101028, + "balance_loss_mlp": 1.03904057, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.6360017889096097, + "language_loss": 0.74995548, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77143168, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6870, + "time_per_iteration": 2.5370054244995117 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02757072, + "balance_loss_mlp": 1.03925085, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.504144022647526, + "language_loss": 0.83272427, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85428846, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6871, + "time_per_iteration": 2.596686601638794 + }, + { + "auxiliary_loss_clip": 0.01117357, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.02738237, + "balance_loss_mlp": 1.04108167, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 5.838045745285431, + "language_loss": 0.68951505, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71110535, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6872, + "time_per_iteration": 2.6045477390289307 + }, + { + "auxiliary_loss_clip": 0.01115693, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.02197695, + "balance_loss_mlp": 1.04050374, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.864312912622832, + "language_loss": 0.75716275, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.7786743, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6873, + "time_per_iteration": 2.4200570583343506 + }, + { + "auxiliary_loss_clip": 0.01117091, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.01943827, + "balance_loss_mlp": 1.04055679, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.671510122752512, + "language_loss": 0.82721817, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.84872413, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6874, + "time_per_iteration": 2.4689133167266846 + }, + { + "auxiliary_loss_clip": 0.01116401, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.0397824, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 2.003609916019722, + "language_loss": 0.71075761, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73227131, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6875, + "time_per_iteration": 2.4145123958587646 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02686, + "balance_loss_mlp": 1.04001451, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.617534223510663, + "language_loss": 0.82538921, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84689927, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6876, + "time_per_iteration": 2.49533748626709 + }, + { + "auxiliary_loss_clip": 0.01118483, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.03916812, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.858959916779265, + "language_loss": 0.65397477, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.6755476, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.79296875, + "step": 6877, + "time_per_iteration": 2.4231626987457275 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.01795483, + "balance_loss_mlp": 1.04000914, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 2.013643508242888, + "language_loss": 0.76686853, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78831995, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6878, + "time_per_iteration": 2.492220640182495 + }, + { + "auxiliary_loss_clip": 0.01114835, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02589822, + "balance_loss_mlp": 1.040084, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8674435899066546, + "language_loss": 0.80248523, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82403314, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 6879, + "time_per_iteration": 2.458623170852661 + }, + { + "auxiliary_loss_clip": 0.01114903, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02160931, + "balance_loss_mlp": 1.03960526, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.9200458523415633, + "language_loss": 0.84693611, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86844546, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75390625, + "step": 6880, + "time_per_iteration": 2.605189323425293 + }, + { + "auxiliary_loss_clip": 0.01116516, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.01478863, + "balance_loss_mlp": 1.04023683, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.672120688006926, + "language_loss": 0.70195448, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72340417, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6881, + "time_per_iteration": 2.4585211277008057 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.02450848, + "balance_loss_mlp": 1.04145753, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.702675342664879, + "language_loss": 0.81404376, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83554971, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6882, + "time_per_iteration": 2.451544761657715 + }, + { + "auxiliary_loss_clip": 0.01121461, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_clip": 1.0311873, + "balance_loss_mlp": 1.04304028, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 1.9410860498070561, + "language_loss": 0.69296026, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71465033, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.78515625, + "step": 6883, + "time_per_iteration": 2.4320569038391113 + }, + { + "auxiliary_loss_clip": 0.01115479, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04087615, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.7677749997866015, + "language_loss": 0.75449616, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77600354, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6884, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04013455, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.3782226444678463, + "language_loss": 0.75763476, + "learning_rate": 2.642871247413523e-06, + "loss": 0.7791822, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6885, + "time_per_iteration": 2.513087511062622 + }, + { + "auxiliary_loss_clip": 0.01117144, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.0245266, + "balance_loss_mlp": 1.0402348, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.8637223642679819, + "language_loss": 0.69820571, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71976513, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6886, + "time_per_iteration": 2.49245285987854 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.02326965, + "balance_loss_mlp": 1.04143095, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.5567308495418615, + "language_loss": 0.7542249, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77578151, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6887, + "time_per_iteration": 6.723928451538086 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.03900433, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.8847126889252832, + "language_loss": 0.69881892, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72027779, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6888, + "time_per_iteration": 3.9012765884399414 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02070749, + "balance_loss_mlp": 1.03661156, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 1.8064637161795956, + "language_loss": 0.75730169, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.7787562, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6889, + "time_per_iteration": 2.4043526649475098 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.020468, + "balance_loss_mlp": 1.04220176, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.5362774650785178, + "language_loss": 0.80159467, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82309097, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 6890, + "time_per_iteration": 2.515199661254883 + }, + { + "auxiliary_loss_clip": 0.01113118, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.02752495, + "balance_loss_mlp": 1.04047, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.56935265602887, + "language_loss": 0.74256909, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76412225, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6891, + "time_per_iteration": 2.4265213012695312 + }, + { + "auxiliary_loss_clip": 0.01120303, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.02144444, + "balance_loss_mlp": 1.04260397, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.5959140747346865, + "language_loss": 0.84173661, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86331344, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6892, + "time_per_iteration": 2.4921038150787354 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.02204704, + "balance_loss_mlp": 1.04263163, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.6122583846612435, + "language_loss": 0.70197237, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72349209, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6893, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01115822, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01573586, + "balance_loss_mlp": 1.04117119, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.3754181360448814, + "language_loss": 0.72850323, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74995577, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6894, + "time_per_iteration": 2.521559715270996 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.04199624, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.672622146105704, + "language_loss": 0.6200121, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64152598, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6895, + "time_per_iteration": 2.3899357318878174 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.02091241, + "balance_loss_mlp": 1.03973091, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.6922649240649819, + "language_loss": 0.70685059, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72832596, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 6896, + "time_per_iteration": 2.5296781063079834 + }, + { + "auxiliary_loss_clip": 0.01115349, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.02541351, + "balance_loss_mlp": 1.03898549, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.6224007586570597, + "language_loss": 0.72848749, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.7500447, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.765625, + "step": 6897, + "time_per_iteration": 2.481219530105591 + }, + { + "auxiliary_loss_clip": 0.01115287, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.02870619, + "balance_loss_mlp": 1.04093742, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 4.403783878749548, + "language_loss": 0.84646589, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86805081, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6898, + "time_per_iteration": 2.5150201320648193 + }, + { + "auxiliary_loss_clip": 0.01112871, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.01719725, + "balance_loss_mlp": 1.03681874, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.644475487803214, + "language_loss": 0.74555075, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76699305, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6899, + "time_per_iteration": 2.4348104000091553 + }, + { + "auxiliary_loss_clip": 0.0112093, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.717830619902866, + "language_loss": 0.75609112, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77767438, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6900, + "time_per_iteration": 2.5260136127471924 + }, + { + "auxiliary_loss_clip": 0.01116235, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 1.04113388, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.5866137476185087, + "language_loss": 0.80409849, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82566535, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.75, + "step": 6901, + "time_per_iteration": 2.4218883514404297 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.02178299, + "balance_loss_mlp": 1.03989518, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.8085429941764752, + "language_loss": 0.69120753, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71271133, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6902, + "time_per_iteration": 2.525836944580078 + }, + { + "auxiliary_loss_clip": 0.0111323, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04042315, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.1056004636318817, + "language_loss": 0.83287692, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85438645, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6903, + "time_per_iteration": 2.402722120285034 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02232492, + "balance_loss_mlp": 1.0413456, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.8768082111891207, + "language_loss": 0.67704409, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69863188, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.79296875, + "step": 6904, + "time_per_iteration": 2.5442733764648438 + }, + { + "auxiliary_loss_clip": 0.01119512, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.02082872, + "balance_loss_mlp": 1.04166162, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.5140892492412166, + "language_loss": 0.77502626, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79657316, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 6905, + "time_per_iteration": 2.471850633621216 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.0168426, + "balance_loss_mlp": 1.04261923, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 2.8616602480779427, + "language_loss": 0.68461335, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70611238, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6906, + "time_per_iteration": 2.501025676727295 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.02097726, + "balance_loss_mlp": 1.041152, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 3.9013632738704347, + "language_loss": 0.67466414, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69615829, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6907, + "time_per_iteration": 2.467179298400879 + }, + { + "auxiliary_loss_clip": 0.01118262, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.02107513, + "balance_loss_mlp": 1.04266894, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8641722195673653, + "language_loss": 0.77219629, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79371971, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 6908, + "time_per_iteration": 2.5124471187591553 + }, + { + "auxiliary_loss_clip": 0.01043525, + "auxiliary_loss_mlp": 0.01011962, + "balance_loss_clip": 1.01046562, + "balance_loss_mlp": 1.01946032, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7844742119516283, + "language_loss": 0.64862758, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66918248, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.24023438, + "step": 6909, + "time_per_iteration": 3.0118794441223145 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04182351, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.432390678759805, + "language_loss": 0.87292743, + "learning_rate": 2.633643828093996e-06, + "loss": 0.8944639, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6910, + "time_per_iteration": 2.4972214698791504 + }, + { + "auxiliary_loss_clip": 0.01041579, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.00033653, + "balance_loss_mlp": 1.01748466, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.808989444092677, + "language_loss": 0.6214478, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64188129, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24023438, + "step": 6911, + "time_per_iteration": 3.040469169616699 + }, + { + "auxiliary_loss_clip": 0.01126363, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.02386987, + "balance_loss_mlp": 1.04570675, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.7143139070983313, + "language_loss": 0.87920213, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90084887, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6912, + "time_per_iteration": 2.449566602706909 + }, + { + "auxiliary_loss_clip": 0.01119018, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01842034, + "balance_loss_mlp": 1.04461241, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 3.208266477782979, + "language_loss": 0.62984204, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65134311, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 6913, + "time_per_iteration": 2.4690184593200684 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.01793909, + "balance_loss_mlp": 1.04389513, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.933222600231973, + "language_loss": 0.75131822, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77279633, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 6914, + "time_per_iteration": 2.483322858810425 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04198337, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.8027192281548683, + "language_loss": 0.87621439, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89775658, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6915, + "time_per_iteration": 2.448347806930542 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.02461123, + "balance_loss_mlp": 1.0447371, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.7843871284315007, + "language_loss": 0.71427178, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.7358641, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6916, + "time_per_iteration": 2.490709066390991 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.018736, + "balance_loss_mlp": 1.04548466, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.511699121237688, + "language_loss": 0.71604288, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73761451, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6917, + "time_per_iteration": 2.471165895462036 + }, + { + "auxiliary_loss_clip": 0.01118269, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.0205791, + "balance_loss_mlp": 1.04267478, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.6845020116344738, + "language_loss": 0.80811357, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.82963598, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6918, + "time_per_iteration": 2.526092767715454 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.02276754, + "balance_loss_mlp": 1.04565763, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.4136427424617275, + "language_loss": 0.70455492, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72615993, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6919, + "time_per_iteration": 2.6142234802246094 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.02063489, + "balance_loss_mlp": 1.04595828, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 3.306135174045704, + "language_loss": 0.80995989, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83153164, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6920, + "time_per_iteration": 2.4816763401031494 + }, + { + "auxiliary_loss_clip": 0.01123811, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.02265263, + "balance_loss_mlp": 1.04559636, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 1.8850349699187139, + "language_loss": 0.66103178, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.68264937, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6921, + "time_per_iteration": 2.4444103240966797 + }, + { + "auxiliary_loss_clip": 0.01122422, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02613473, + "balance_loss_mlp": 1.04591656, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.004797667242706, + "language_loss": 0.80354667, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82517087, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6922, + "time_per_iteration": 2.4668424129486084 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.01878977, + "balance_loss_mlp": 1.04562068, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.7750243686484017, + "language_loss": 0.67461836, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69618553, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 6923, + "time_per_iteration": 2.500643014907837 + }, + { + "auxiliary_loss_clip": 0.0112335, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04540539, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.7543246434734396, + "language_loss": 0.75878662, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78045189, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6924, + "time_per_iteration": 2.5196292400360107 + }, + { + "auxiliary_loss_clip": 0.01120451, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.02382445, + "balance_loss_mlp": 1.04238617, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.7266126934206025, + "language_loss": 0.72481495, + "learning_rate": 2.62810015415423e-06, + "loss": 0.74639702, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6925, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02069342, + "balance_loss_mlp": 1.0413928, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 1.8465053152696829, + "language_loss": 0.83475816, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85628033, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6926, + "time_per_iteration": 2.5088613033294678 + }, + { + "auxiliary_loss_clip": 0.01115859, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02305889, + "balance_loss_mlp": 1.04325294, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6423809052501923, + "language_loss": 0.86620545, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88771755, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 6927, + "time_per_iteration": 2.534503936767578 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.0272553, + "balance_loss_mlp": 1.04246414, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 1.9802013979545179, + "language_loss": 0.72300684, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74461025, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6928, + "time_per_iteration": 3.88004732131958 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02459407, + "balance_loss_mlp": 1.04092073, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 1.862862690513255, + "language_loss": 0.78142846, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80298579, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6929, + "time_per_iteration": 5.323524713516235 + }, + { + "auxiliary_loss_clip": 0.01119115, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.02201128, + "balance_loss_mlp": 1.0432961, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.7470362991732848, + "language_loss": 0.71024638, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73178667, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6930, + "time_per_iteration": 2.4636495113372803 + }, + { + "auxiliary_loss_clip": 0.01116513, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.04026747, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7271533589437842, + "language_loss": 0.80665648, + "learning_rate": 2.625881181419007e-06, + "loss": 0.82817304, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6931, + "time_per_iteration": 2.4350993633270264 + }, + { + "auxiliary_loss_clip": 0.01115154, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02392721, + "balance_loss_mlp": 1.04003608, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8450466812598405, + "language_loss": 0.79109526, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81262779, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6932, + "time_per_iteration": 2.499152660369873 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01039084, + "balance_loss_clip": 1.02421236, + "balance_loss_mlp": 1.04105997, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 2.265953381144445, + "language_loss": 0.81735384, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83892173, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6933, + "time_per_iteration": 2.5096874237060547 + }, + { + "auxiliary_loss_clip": 0.01119747, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.02184963, + "balance_loss_mlp": 1.04056907, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.1923639109766144, + "language_loss": 0.76769817, + "learning_rate": 2.624771374460121e-06, + "loss": 0.78926861, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 6934, + "time_per_iteration": 2.4590814113616943 + }, + { + "auxiliary_loss_clip": 0.01120428, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02003551, + "balance_loss_mlp": 1.04396558, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.774753965654226, + "language_loss": 0.67036676, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69191271, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6935, + "time_per_iteration": 2.4111990928649902 + }, + { + "auxiliary_loss_clip": 0.01120243, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.0266757, + "balance_loss_mlp": 1.04329324, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.7357101171275504, + "language_loss": 0.73245633, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75406778, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 6936, + "time_per_iteration": 2.452911376953125 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.02102733, + "balance_loss_mlp": 1.0418582, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.8471548990860345, + "language_loss": 0.73746514, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75898361, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6937, + "time_per_iteration": 2.426177978515625 + }, + { + "auxiliary_loss_clip": 0.01116111, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.02194881, + "balance_loss_mlp": 1.04150152, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.512143650526939, + "language_loss": 0.8406328, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.8621484, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6938, + "time_per_iteration": 2.543088436126709 + }, + { + "auxiliary_loss_clip": 0.0112279, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.01959753, + "balance_loss_mlp": 1.04346168, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.0225615339435183, + "language_loss": 0.74319148, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76476645, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6939, + "time_per_iteration": 2.5119175910949707 + }, + { + "auxiliary_loss_clip": 0.01120397, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.01864552, + "balance_loss_mlp": 1.04396725, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.7048101001333908, + "language_loss": 0.7502594, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77179623, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6940, + "time_per_iteration": 2.505476474761963 + }, + { + "auxiliary_loss_clip": 0.01118418, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02621651, + "balance_loss_mlp": 1.04277742, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.6601557953990327, + "language_loss": 0.71575844, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73733509, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6941, + "time_per_iteration": 2.4826831817626953 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.02307487, + "balance_loss_mlp": 1.04215777, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 1.8824806717934597, + "language_loss": 0.73884863, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76043111, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6942, + "time_per_iteration": 2.510179281234741 + }, + { + "auxiliary_loss_clip": 0.01124355, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.02190626, + "balance_loss_mlp": 1.04450595, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.1000096782313644, + "language_loss": 0.72619486, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74780977, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6943, + "time_per_iteration": 2.437713861465454 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.01844811, + "balance_loss_mlp": 1.04391849, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.5914405962225948, + "language_loss": 0.63451827, + "learning_rate": 2.621070480118111e-06, + "loss": 0.6560756, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6944, + "time_per_iteration": 2.5866405963897705 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01747799, + "balance_loss_mlp": 1.04272938, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.6963739292171327, + "language_loss": 0.7014094, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72291017, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6945, + "time_per_iteration": 2.4984183311462402 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.02556372, + "balance_loss_mlp": 1.04024088, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.623733928455925, + "language_loss": 0.80850792, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83008766, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.76171875, + "step": 6946, + "time_per_iteration": 2.5301356315612793 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.02281451, + "balance_loss_mlp": 1.04321134, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.2176705837507784, + "language_loss": 0.77525783, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79680669, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6947, + "time_per_iteration": 2.432767391204834 + }, + { + "auxiliary_loss_clip": 0.01119017, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.0204227, + "balance_loss_mlp": 1.04268038, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 2.207686964264854, + "language_loss": 0.71242738, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73396862, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6948, + "time_per_iteration": 2.565560817718506 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.03894424, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.5189916920378803, + "language_loss": 0.77142775, + "learning_rate": 2.619219148905362e-06, + "loss": 0.7928437, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 6949, + "time_per_iteration": 2.459484338760376 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.02156091, + "balance_loss_mlp": 1.04367769, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.5094834159772865, + "language_loss": 0.81985492, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84143925, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6950, + "time_per_iteration": 2.5348877906799316 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01804328, + "balance_loss_mlp": 1.04157758, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.3221945547908684, + "language_loss": 0.76189649, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78333664, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 6951, + "time_per_iteration": 2.5055410861968994 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.01894784, + "balance_loss_mlp": 1.04247046, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.7645474682355455, + "language_loss": 0.72922826, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75078857, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6952, + "time_per_iteration": 2.499979019165039 + }, + { + "auxiliary_loss_clip": 0.01117763, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.01826596, + "balance_loss_mlp": 1.04266691, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 3.0061867681934795, + "language_loss": 0.7182008, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73970026, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6953, + "time_per_iteration": 2.4045305252075195 + }, + { + "auxiliary_loss_clip": 0.01116286, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02106667, + "balance_loss_mlp": 1.04293513, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.696123367289706, + "language_loss": 0.76163101, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78315222, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.734375, + "step": 6954, + "time_per_iteration": 2.5208778381347656 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.02243114, + "balance_loss_mlp": 1.0407306, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.123626835554744, + "language_loss": 0.84569108, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86724097, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.765625, + "step": 6955, + "time_per_iteration": 2.4470770359039307 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.02206469, + "balance_loss_mlp": 1.04131222, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.7424753883235222, + "language_loss": 0.83219767, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85370708, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6956, + "time_per_iteration": 2.53238582611084 + }, + { + "auxiliary_loss_clip": 0.01120034, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.02661586, + "balance_loss_mlp": 1.04286742, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.117667338273699, + "language_loss": 0.71621263, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73782784, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6957, + "time_per_iteration": 2.4127233028411865 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02450657, + "balance_loss_mlp": 1.0416609, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.020066118448717, + "language_loss": 0.75841641, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77995586, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 6958, + "time_per_iteration": 2.621243476867676 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.01718402, + "balance_loss_mlp": 1.04121447, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.5992923753241641, + "language_loss": 0.76712382, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78860307, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6959, + "time_per_iteration": 2.4936535358428955 + }, + { + "auxiliary_loss_clip": 0.01117896, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04106176, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.629552094504433, + "language_loss": 0.76652783, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78807288, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6960, + "time_per_iteration": 2.513699769973755 + }, + { + "auxiliary_loss_clip": 0.01111464, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01822484, + "balance_loss_mlp": 1.04088879, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.8359587043053753, + "language_loss": 0.75856298, + "learning_rate": 2.614773562290835e-06, + "loss": 0.7799859, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 6961, + "time_per_iteration": 2.4798686504364014 + }, + { + "auxiliary_loss_clip": 0.01040549, + "auxiliary_loss_mlp": 0.010007, + "balance_loss_clip": 0.99909067, + "balance_loss_mlp": 1.01660466, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7788999280449799, + "language_loss": 0.5466665, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56707895, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23925781, + "step": 6962, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.01119412, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02266252, + "balance_loss_mlp": 1.04263735, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8026406871934313, + "language_loss": 0.85487044, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87642694, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6963, + "time_per_iteration": 2.4352054595947266 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02210093, + "balance_loss_mlp": 1.04331315, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.6053381131745172, + "language_loss": 0.70357138, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72510606, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6964, + "time_per_iteration": 2.50482439994812 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.02268004, + "balance_loss_mlp": 1.04087543, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.8351593031507138, + "language_loss": 0.70862091, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73011076, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6965, + "time_per_iteration": 2.6057491302490234 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.02033257, + "balance_loss_mlp": 1.0413456, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.4950689447506187, + "language_loss": 0.7175675, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.73902673, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6966, + "time_per_iteration": 2.4892048835754395 + }, + { + "auxiliary_loss_clip": 0.01120204, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.02016091, + "balance_loss_mlp": 1.0421617, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.333720493500319, + "language_loss": 0.71266413, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73421323, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 6967, + "time_per_iteration": 2.604076862335205 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01005246, + "balance_loss_clip": 1.00366104, + "balance_loss_mlp": 1.01515508, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6722087248044618, + "language_loss": 0.46224236, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48268497, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23828125, + "step": 6968, + "time_per_iteration": 3.0401268005371094 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.02345359, + "balance_loss_mlp": 1.03981948, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.6180807795397785, + "language_loss": 0.74930859, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77086943, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6969, + "time_per_iteration": 2.5126969814300537 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.0382787, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 2.2016737043444903, + "language_loss": 0.80248457, + "learning_rate": 2.611437167992705e-06, + "loss": 0.8239547, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6970, + "time_per_iteration": 5.640556573867798 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01774538, + "balance_loss_mlp": 1.04030848, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.9623449568843938, + "language_loss": 0.82789886, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.84934866, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6971, + "time_per_iteration": 3.8554296493530273 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.02057588, + "balance_loss_mlp": 1.04049933, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.6158786040890867, + "language_loss": 0.7468822, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76836711, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6972, + "time_per_iteration": 2.474414587020874 + }, + { + "auxiliary_loss_clip": 0.01113414, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.02039838, + "balance_loss_mlp": 1.0393647, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4614195470734719, + "language_loss": 0.72808421, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74955231, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6973, + "time_per_iteration": 2.5945606231689453 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.02769673, + "balance_loss_mlp": 1.04242992, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.1718837857164464, + "language_loss": 0.74863386, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77027869, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6974, + "time_per_iteration": 2.4790663719177246 + }, + { + "auxiliary_loss_clip": 0.01112575, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01669776, + "balance_loss_mlp": 1.03879452, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 2.8466202693933265, + "language_loss": 0.72836936, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74979532, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6975, + "time_per_iteration": 2.4560608863830566 + }, + { + "auxiliary_loss_clip": 0.01114785, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.01961374, + "balance_loss_mlp": 1.04139054, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.6070899494887878, + "language_loss": 0.80725533, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82873446, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6976, + "time_per_iteration": 2.5148777961730957 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.01025549, + "balance_loss_clip": 1.0124954, + "balance_loss_mlp": 1.03755522, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.297468657248195, + "language_loss": 0.67767072, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.6990521, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6977, + "time_per_iteration": 2.4294896125793457 + }, + { + "auxiliary_loss_clip": 0.01116519, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.02193213, + "balance_loss_mlp": 1.04046345, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.6461140984259304, + "language_loss": 0.80869353, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83020747, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6978, + "time_per_iteration": 2.4688472747802734 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.02207506, + "balance_loss_mlp": 1.0377202, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.725404980402679, + "language_loss": 0.82583737, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84734344, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6979, + "time_per_iteration": 2.4702186584472656 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.0194428, + "balance_loss_mlp": 1.0388211, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.8637978278873943, + "language_loss": 0.83381826, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85528231, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6980, + "time_per_iteration": 2.5195069313049316 + }, + { + "auxiliary_loss_clip": 0.01116413, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.01975989, + "balance_loss_mlp": 1.03946161, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.9241676519266004, + "language_loss": 0.79068786, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81218135, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6981, + "time_per_iteration": 2.4457991123199463 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.02078593, + "balance_loss_mlp": 1.03806782, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.6203222993930824, + "language_loss": 0.84426481, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86570823, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 6982, + "time_per_iteration": 2.483635425567627 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.02191043, + "balance_loss_mlp": 1.03910255, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 1.9325593989695682, + "language_loss": 0.56615967, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58769286, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6983, + "time_per_iteration": 2.4729864597320557 + }, + { + "auxiliary_loss_clip": 0.01114232, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.02243733, + "balance_loss_mlp": 1.0403446, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.639890794043824, + "language_loss": 0.82404107, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84553468, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6984, + "time_per_iteration": 2.4610702991485596 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01515102, + "balance_loss_mlp": 1.03938794, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.6654879970317658, + "language_loss": 0.78883481, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81025428, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6985, + "time_per_iteration": 2.4739370346069336 + }, + { + "auxiliary_loss_clip": 0.01118591, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.02042401, + "balance_loss_mlp": 1.03950381, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 3.375844113891133, + "language_loss": 0.77833611, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.79986858, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6986, + "time_per_iteration": 2.5488531589508057 + }, + { + "auxiliary_loss_clip": 0.01111943, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01886213, + "balance_loss_mlp": 1.03984082, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.5789932508621725, + "language_loss": 0.72640669, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74783587, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 6987, + "time_per_iteration": 2.522143840789795 + }, + { + "auxiliary_loss_clip": 0.01117787, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02886939, + "balance_loss_mlp": 1.04176915, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4538200585449164, + "language_loss": 0.75399673, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77560198, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6988, + "time_per_iteration": 2.57265305519104 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02287364, + "balance_loss_mlp": 1.04034519, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.6383736622893421, + "language_loss": 0.74155712, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76311487, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6989, + "time_per_iteration": 2.4846689701080322 + }, + { + "auxiliary_loss_clip": 0.01118468, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02213192, + "balance_loss_mlp": 1.041116, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.649933968591077, + "language_loss": 0.70989478, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73144102, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6990, + "time_per_iteration": 2.5092554092407227 + }, + { + "auxiliary_loss_clip": 0.01038945, + "auxiliary_loss_mlp": 0.01004482, + "balance_loss_clip": 1.00287271, + "balance_loss_mlp": 1.01510215, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8281033043630844, + "language_loss": 0.60529578, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62573004, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23828125, + "step": 6991, + "time_per_iteration": 2.921936511993408 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04332614, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5069916983433078, + "language_loss": 0.83222365, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85379601, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6992, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.01039195, + "auxiliary_loss_mlp": 0.01003357, + "balance_loss_clip": 1.00179517, + "balance_loss_mlp": 1.01546574, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8165124973650228, + "language_loss": 0.65523541, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67566097, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.23730469, + "step": 6993, + "time_per_iteration": 3.078948736190796 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.01845777, + "balance_loss_mlp": 1.04213274, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 2.0847143106579806, + "language_loss": 0.83213866, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85370958, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6994, + "time_per_iteration": 2.42958402633667 + }, + { + "auxiliary_loss_clip": 0.01115372, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.02112424, + "balance_loss_mlp": 1.04195786, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.6590785995391892, + "language_loss": 0.78497195, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.8064667, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6995, + "time_per_iteration": 2.4311602115631104 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.01849341, + "balance_loss_mlp": 1.04043221, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.5317093362831764, + "language_loss": 0.79829741, + "learning_rate": 2.60178818232786e-06, + "loss": 0.81974673, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6996, + "time_per_iteration": 2.5032711029052734 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01837945, + "balance_loss_mlp": 1.04208779, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.3208366966184837, + "language_loss": 0.7522642, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77376509, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.765625, + "step": 6997, + "time_per_iteration": 2.4281609058380127 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.02492523, + "balance_loss_mlp": 1.03965962, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.9779533128263025, + "language_loss": 0.76193553, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78349566, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6998, + "time_per_iteration": 2.4484825134277344 + }, + { + "auxiliary_loss_clip": 0.01121567, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.02950823, + "balance_loss_mlp": 1.04302716, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.545568275541188, + "language_loss": 0.76295245, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78460807, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6999, + "time_per_iteration": 2.5371389389038086 + }, + { + "auxiliary_loss_clip": 0.01116809, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.02512717, + "balance_loss_mlp": 1.04221511, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.7925226690493865, + "language_loss": 0.64549243, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66705179, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7000, + "time_per_iteration": 2.492664337158203 + }, + { + "auxiliary_loss_clip": 0.01117436, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02347827, + "balance_loss_mlp": 1.04157186, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.6489015448559594, + "language_loss": 0.76201057, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.7835623, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7001, + "time_per_iteration": 2.4374375343322754 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.02163076, + "balance_loss_mlp": 1.04236293, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.558613926183474, + "language_loss": 0.86427414, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88578713, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7002, + "time_per_iteration": 2.4840235710144043 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.02331328, + "balance_loss_mlp": 1.04153061, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 2.8393435321353713, + "language_loss": 0.67447579, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69599748, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7003, + "time_per_iteration": 2.452779531478882 + }, + { + "auxiliary_loss_clip": 0.01120595, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02727461, + "balance_loss_mlp": 1.04151964, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 2.097012731379119, + "language_loss": 0.76887131, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79049993, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 7004, + "time_per_iteration": 2.4988765716552734 + }, + { + "auxiliary_loss_clip": 0.0111532, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02649117, + "balance_loss_mlp": 1.04101729, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.5948979245136696, + "language_loss": 0.68152726, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70309174, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7421875, + "step": 7005, + "time_per_iteration": 2.4434568881988525 + }, + { + "auxiliary_loss_clip": 0.01118015, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.02340817, + "balance_loss_mlp": 1.04088581, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.9728430752981747, + "language_loss": 0.72047079, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74202257, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7006, + "time_per_iteration": 2.4487879276275635 + }, + { + "auxiliary_loss_clip": 0.01117712, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.01978087, + "balance_loss_mlp": 1.04068065, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.688876483049264, + "language_loss": 0.70708871, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.72860485, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7007, + "time_per_iteration": 2.437270164489746 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.02416158, + "balance_loss_mlp": 1.04059708, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7353334268618703, + "language_loss": 0.82159567, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84313881, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7008, + "time_per_iteration": 2.460923194885254 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.03877473, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 2.1040552452231505, + "language_loss": 0.71574211, + "learning_rate": 2.596957889196831e-06, + "loss": 0.7372905, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7009, + "time_per_iteration": 2.501915693283081 + }, + { + "auxiliary_loss_clip": 0.01116238, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01875222, + "balance_loss_mlp": 1.03954792, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.7512785082136952, + "language_loss": 0.66407478, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68556547, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7010, + "time_per_iteration": 2.5036494731903076 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.02353597, + "balance_loss_mlp": 1.03993797, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.553770179625671, + "language_loss": 0.7243132, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74583495, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7011, + "time_per_iteration": 2.471482276916504 + }, + { + "auxiliary_loss_clip": 0.01036961, + "auxiliary_loss_mlp": 0.01009192, + "balance_loss_clip": 1.00765407, + "balance_loss_mlp": 1.01291788, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.789677431109339, + "language_loss": 0.54321265, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56367421, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.24023438, + "step": 7012, + "time_per_iteration": 7.156486511230469 + }, + { + "auxiliary_loss_clip": 0.01118573, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.01982975, + "balance_loss_mlp": 1.04137254, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3072085820070551, + "language_loss": 0.78510618, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80663049, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7013, + "time_per_iteration": 2.4873650074005127 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01945186, + "balance_loss_mlp": 1.0393039, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.8972197450653994, + "language_loss": 0.8102268, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83173645, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 7014, + "time_per_iteration": 2.4698970317840576 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.01885617, + "balance_loss_mlp": 1.039801, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.584816158328088, + "language_loss": 0.7775718, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79906625, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7015, + "time_per_iteration": 2.48061203956604 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02382123, + "balance_loss_mlp": 1.04211378, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.4014002437510662, + "language_loss": 0.82126868, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84285378, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7016, + "time_per_iteration": 2.4971818923950195 + }, + { + "auxiliary_loss_clip": 0.01119768, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.01964498, + "balance_loss_mlp": 1.04142356, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.18227993050423, + "language_loss": 0.68093193, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70246613, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7017, + "time_per_iteration": 2.4511165618896484 + }, + { + "auxiliary_loss_clip": 0.01038936, + "auxiliary_loss_mlp": 0.00999099, + "balance_loss_clip": 0.9975912, + "balance_loss_mlp": 1.01494193, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6893654540123721, + "language_loss": 0.59420347, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61458385, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24023438, + "step": 7018, + "time_per_iteration": 3.1184492111206055 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03985381, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.7697613946295114, + "language_loss": 0.75391936, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77544749, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7019, + "time_per_iteration": 2.415177822113037 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01955771, + "balance_loss_mlp": 1.04044795, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 2.151945399878188, + "language_loss": 0.69014722, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71166205, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 7020, + "time_per_iteration": 2.502906560897827 + }, + { + "auxiliary_loss_clip": 0.01115881, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.01976776, + "balance_loss_mlp": 1.04312158, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.807686142219978, + "language_loss": 0.80839896, + "learning_rate": 2.592495760867347e-06, + "loss": 0.82988656, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7021, + "time_per_iteration": 2.4480793476104736 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.04118109, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.7624230978889854, + "language_loss": 0.70018518, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.721668, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7022, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.01110409, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01816332, + "balance_loss_mlp": 1.03993058, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.4995673529455043, + "language_loss": 0.66985959, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69126534, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7023, + "time_per_iteration": 2.518887996673584 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.01872325, + "balance_loss_mlp": 1.04102015, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.5242794814383198, + "language_loss": 0.69374228, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71520281, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7024, + "time_per_iteration": 2.47479510307312 + }, + { + "auxiliary_loss_clip": 0.01115853, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.02272165, + "balance_loss_mlp": 1.0406878, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.4987089123245305, + "language_loss": 0.76659822, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78812057, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7025, + "time_per_iteration": 2.459552526473999 + }, + { + "auxiliary_loss_clip": 0.01111611, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01950371, + "balance_loss_mlp": 1.03944087, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.7650754883430373, + "language_loss": 0.79574716, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81719071, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7026, + "time_per_iteration": 2.4876604080200195 + }, + { + "auxiliary_loss_clip": 0.01039298, + "auxiliary_loss_mlp": 0.00998847, + "balance_loss_clip": 0.99741668, + "balance_loss_mlp": 1.01518142, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7186593098349721, + "language_loss": 0.6191169, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63949835, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.24121094, + "step": 7027, + "time_per_iteration": 3.1553335189819336 + }, + { + "auxiliary_loss_clip": 0.01115441, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02524519, + "balance_loss_mlp": 1.04096365, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 4.428318649676281, + "language_loss": 0.70515895, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.72670174, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7028, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.02488303, + "balance_loss_mlp": 1.04104543, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.8463743475085548, + "language_loss": 0.82555425, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84711367, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7029, + "time_per_iteration": 2.5120980739593506 + }, + { + "auxiliary_loss_clip": 0.0112087, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_clip": 1.02790523, + "balance_loss_mlp": 1.04274035, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 2.3903311172404, + "language_loss": 0.75230241, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77393407, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7030, + "time_per_iteration": 2.5118141174316406 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.03835046, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 1.9474535697331137, + "language_loss": 0.86421049, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88573444, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7031, + "time_per_iteration": 2.500140905380249 + }, + { + "auxiliary_loss_clip": 0.01115501, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02328062, + "balance_loss_mlp": 1.04060841, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.1339679402128717, + "language_loss": 0.72855937, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75009298, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 7032, + "time_per_iteration": 2.477363109588623 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.02711725, + "balance_loss_mlp": 1.0390861, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.7148750065903648, + "language_loss": 0.699175, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72074443, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7033, + "time_per_iteration": 2.5661494731903076 + }, + { + "auxiliary_loss_clip": 0.01115751, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.01895976, + "balance_loss_mlp": 1.03992891, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.8649473631938416, + "language_loss": 0.90448046, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92596424, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7034, + "time_per_iteration": 2.4802892208099365 + }, + { + "auxiliary_loss_clip": 0.01112625, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.02369416, + "balance_loss_mlp": 1.03800857, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.6052176008605175, + "language_loss": 0.77130729, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79280239, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7035, + "time_per_iteration": 2.5044498443603516 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.02682912, + "balance_loss_mlp": 1.0414331, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.9123378440021823, + "language_loss": 0.82216996, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84375703, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7036, + "time_per_iteration": 2.4178695678710938 + }, + { + "auxiliary_loss_clip": 0.01112842, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.0403924, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.6417488866700152, + "language_loss": 0.70871484, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73021322, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7037, + "time_per_iteration": 2.485499858856201 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.02303314, + "balance_loss_mlp": 1.03976059, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.5138937767155718, + "language_loss": 0.77942061, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80094922, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7038, + "time_per_iteration": 2.4569690227508545 + }, + { + "auxiliary_loss_clip": 0.01120787, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02919412, + "balance_loss_mlp": 1.04072356, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.366884859254005, + "language_loss": 0.66797423, + "learning_rate": 2.585796509770259e-06, + "loss": 0.6896261, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 7039, + "time_per_iteration": 2.441373825073242 + }, + { + "auxiliary_loss_clip": 0.01119114, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.02372193, + "balance_loss_mlp": 1.04042578, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.6082175120791662, + "language_loss": 0.75897467, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78054452, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 7040, + "time_per_iteration": 2.471653938293457 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.02067101, + "balance_loss_mlp": 1.03962982, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.477939672492119, + "language_loss": 0.65098798, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67250896, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7041, + "time_per_iteration": 2.502443313598633 + }, + { + "auxiliary_loss_clip": 0.01118281, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.02233624, + "balance_loss_mlp": 1.04045236, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.7627160436135367, + "language_loss": 0.73621082, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75775892, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7042, + "time_per_iteration": 2.6498820781707764 + }, + { + "auxiliary_loss_clip": 0.01112749, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.02020609, + "balance_loss_mlp": 1.03977966, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3177903064215164, + "language_loss": 0.82185107, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84331036, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7043, + "time_per_iteration": 2.528604745864868 + }, + { + "auxiliary_loss_clip": 0.0111836, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.02608395, + "balance_loss_mlp": 1.04329216, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.3747778329738742, + "language_loss": 0.65231359, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67390943, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 7044, + "time_per_iteration": 2.4399802684783936 + }, + { + "auxiliary_loss_clip": 0.01121384, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.03126323, + "balance_loss_mlp": 1.04322433, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.7497316034691441, + "language_loss": 0.7502315, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77191073, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 7045, + "time_per_iteration": 2.612898588180542 + }, + { + "auxiliary_loss_clip": 0.01116302, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.02141845, + "balance_loss_mlp": 1.04219389, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.1011396794876385, + "language_loss": 0.80564952, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82716572, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7046, + "time_per_iteration": 2.4105727672576904 + }, + { + "auxiliary_loss_clip": 0.01119082, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.021685, + "balance_loss_mlp": 1.04078197, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.59844067944401, + "language_loss": 0.76846749, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.7900188, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7047, + "time_per_iteration": 2.486297130584717 + }, + { + "auxiliary_loss_clip": 0.01116569, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.02102375, + "balance_loss_mlp": 1.04264975, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.8697996227798281, + "language_loss": 0.67980373, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70131224, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7048, + "time_per_iteration": 2.5031991004943848 + }, + { + "auxiliary_loss_clip": 0.01119136, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.02371609, + "balance_loss_mlp": 1.04227185, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.7311423758965327, + "language_loss": 0.7829181, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80449331, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 7049, + "time_per_iteration": 2.549767255783081 + }, + { + "auxiliary_loss_clip": 0.01121261, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02570868, + "balance_loss_mlp": 1.0433383, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.7774881318176563, + "language_loss": 0.82656097, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84816945, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 7050, + "time_per_iteration": 2.498494863510132 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.02233815, + "balance_loss_mlp": 1.0382762, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.0169322630318844, + "language_loss": 0.73429018, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75579983, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7051, + "time_per_iteration": 2.441920042037964 + }, + { + "auxiliary_loss_clip": 0.01118227, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.01983249, + "balance_loss_mlp": 1.04219055, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.4713561275118965, + "language_loss": 0.86205333, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.8835662, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7052, + "time_per_iteration": 2.511756658554077 + }, + { + "auxiliary_loss_clip": 0.01116616, + "auxiliary_loss_mlp": 0.01041035, + "balance_loss_clip": 1.02657533, + "balance_loss_mlp": 1.03951788, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4100722391624452, + "language_loss": 0.7240659, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74564236, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7053, + "time_per_iteration": 3.9099857807159424 + }, + { + "auxiliary_loss_clip": 0.01116742, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.0190872, + "balance_loss_mlp": 1.04233611, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.5741365926511655, + "language_loss": 0.82153803, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84303784, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 7054, + "time_per_iteration": 5.327679634094238 + }, + { + "auxiliary_loss_clip": 0.01040448, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01700425, + "balance_loss_mlp": 1.01674867, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7840713570529064, + "language_loss": 0.60388172, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62446928, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23632812, + "step": 7055, + "time_per_iteration": 3.0450727939605713 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.02651238, + "balance_loss_mlp": 1.04204714, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.951771931203088, + "language_loss": 0.76762712, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.78923267, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7056, + "time_per_iteration": 2.442148447036743 + }, + { + "auxiliary_loss_clip": 0.01121258, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.02375674, + "balance_loss_mlp": 1.04127979, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 2.7846662247260388, + "language_loss": 0.84346795, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86507463, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 7057, + "time_per_iteration": 2.474519968032837 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.02272737, + "balance_loss_mlp": 1.04053187, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 3.1820547358610605, + "language_loss": 0.82999814, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85157061, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 7058, + "time_per_iteration": 2.473520517349243 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01533902, + "balance_loss_mlp": 1.04417813, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.7435131696457398, + "language_loss": 0.80453449, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82600558, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7059, + "time_per_iteration": 2.4719533920288086 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01860404, + "balance_loss_mlp": 1.041839, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9429107045123646, + "language_loss": 0.70341688, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72495657, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 7060, + "time_per_iteration": 2.4377660751342773 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.01914454, + "balance_loss_mlp": 1.04378521, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.5364996273974925, + "language_loss": 0.76182258, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78335667, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7061, + "time_per_iteration": 2.486786365509033 + }, + { + "auxiliary_loss_clip": 0.01122599, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.02319098, + "balance_loss_mlp": 1.04407752, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 3.328289037638814, + "language_loss": 0.729635, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75124645, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 7062, + "time_per_iteration": 2.474193572998047 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.03964305, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.701854582957673, + "language_loss": 0.66343361, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68500221, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7063, + "time_per_iteration": 2.458003520965576 + }, + { + "auxiliary_loss_clip": 0.01115284, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.0181458, + "balance_loss_mlp": 1.04179168, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4878317325171677, + "language_loss": 0.78371775, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80518377, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7064, + "time_per_iteration": 2.5735623836517334 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02636456, + "balance_loss_mlp": 1.04172683, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.8409826195637737, + "language_loss": 0.74893892, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.7705363, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 7065, + "time_per_iteration": 2.4962844848632812 + }, + { + "auxiliary_loss_clip": 0.01119456, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.0206933, + "balance_loss_mlp": 1.04322076, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.415711347923808, + "language_loss": 0.72713453, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74868619, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7066, + "time_per_iteration": 2.551297426223755 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.02307224, + "balance_loss_mlp": 1.04031396, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.9392042625935109, + "language_loss": 0.79517603, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81675112, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 7067, + "time_per_iteration": 2.4871444702148438 + }, + { + "auxiliary_loss_clip": 0.010394, + "auxiliary_loss_mlp": 0.01005215, + "balance_loss_clip": 1.00373113, + "balance_loss_mlp": 1.01538539, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.919528911316311, + "language_loss": 0.63477993, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65522605, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.24023438, + "step": 7068, + "time_per_iteration": 3.0116004943847656 + }, + { + "auxiliary_loss_clip": 0.01119716, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02073288, + "balance_loss_mlp": 1.04235375, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.681037886347605, + "language_loss": 0.72381866, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74537772, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7734375, + "step": 7069, + "time_per_iteration": 2.5046679973602295 + }, + { + "auxiliary_loss_clip": 0.01122307, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.04424644, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 3.2712432047864852, + "language_loss": 0.79297352, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81454653, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.78125, + "step": 7070, + "time_per_iteration": 2.43115496635437 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.01863861, + "balance_loss_mlp": 1.04104066, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.8101520547589562, + "language_loss": 0.70179212, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7233097, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7071, + "time_per_iteration": 2.5141680240631104 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.01942205, + "balance_loss_mlp": 1.04123151, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.3450864635540825, + "language_loss": 0.71075511, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73227149, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7072, + "time_per_iteration": 2.489187002182007 + }, + { + "auxiliary_loss_clip": 0.01122118, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.04270983, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5399076436438217, + "language_loss": 0.81655496, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83810043, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.79296875, + "step": 7073, + "time_per_iteration": 2.5192041397094727 + }, + { + "auxiliary_loss_clip": 0.0111768, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.02097535, + "balance_loss_mlp": 1.04180706, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.1264240253054227, + "language_loss": 0.90777069, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92929167, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7074, + "time_per_iteration": 2.418611526489258 + }, + { + "auxiliary_loss_clip": 0.01122809, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.0234437, + "balance_loss_mlp": 1.04282892, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.5751331844442036, + "language_loss": 0.63971686, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66133678, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.80078125, + "step": 7075, + "time_per_iteration": 2.5064475536346436 + }, + { + "auxiliary_loss_clip": 0.01114521, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01708984, + "balance_loss_mlp": 1.04121399, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.5599863464934922, + "language_loss": 0.73547149, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75691831, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7076, + "time_per_iteration": 2.487424850463867 + }, + { + "auxiliary_loss_clip": 0.01122674, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.02565181, + "balance_loss_mlp": 1.04370356, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 1.8221025125090708, + "language_loss": 0.78215933, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80378938, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7077, + "time_per_iteration": 2.4964394569396973 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.0221417, + "balance_loss_mlp": 1.04220366, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6016827264272244, + "language_loss": 0.73013902, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75163293, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7078, + "time_per_iteration": 2.47660756111145 + }, + { + "auxiliary_loss_clip": 0.01120871, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02880406, + "balance_loss_mlp": 1.0461756, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.731645410920913, + "language_loss": 0.79469633, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81632668, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7079, + "time_per_iteration": 2.499232769012451 + }, + { + "auxiliary_loss_clip": 0.0112172, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.0181613, + "balance_loss_mlp": 1.04761243, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4705007316204746, + "language_loss": 0.72263241, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74416137, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7080, + "time_per_iteration": 2.732074499130249 + }, + { + "auxiliary_loss_clip": 0.0111869, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.02206242, + "balance_loss_mlp": 1.04246545, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.328741773172896, + "language_loss": 0.80405676, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82559955, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7081, + "time_per_iteration": 2.6035380363464355 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753235, + "balance_loss_mlp": 1.04303384, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.7894721227922463, + "language_loss": 0.81618208, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.8376382, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 7082, + "time_per_iteration": 2.444728374481201 + }, + { + "auxiliary_loss_clip": 0.01119852, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02286816, + "balance_loss_mlp": 1.04368842, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.6988843094625508, + "language_loss": 0.69388473, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71545386, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7083, + "time_per_iteration": 2.5369133949279785 + }, + { + "auxiliary_loss_clip": 0.01040302, + "auxiliary_loss_mlp": 0.00999977, + "balance_loss_clip": 0.99864787, + "balance_loss_mlp": 1.01655924, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8706759407802692, + "language_loss": 0.67112887, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69153166, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.23828125, + "step": 7084, + "time_per_iteration": 3.1631839275360107 + }, + { + "auxiliary_loss_clip": 0.01118847, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02887428, + "balance_loss_mlp": 1.04295874, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.6244995349856595, + "language_loss": 0.78095287, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80258334, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7578125, + "step": 7085, + "time_per_iteration": 2.493157148361206 + }, + { + "auxiliary_loss_clip": 0.01129017, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.03102934, + "balance_loss_mlp": 1.04819477, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.071277468695464, + "language_loss": 0.75757217, + "learning_rate": 2.568270298414995e-06, + "loss": 0.77932662, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 7086, + "time_per_iteration": 2.426295280456543 + }, + { + "auxiliary_loss_clip": 0.01119794, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.0433557, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 2.1734108107028147, + "language_loss": 0.8001647, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82173336, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7087, + "time_per_iteration": 2.46087384223938 + }, + { + "auxiliary_loss_clip": 0.01123365, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04632342, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.8444426655441133, + "language_loss": 0.6603114, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68187302, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7088, + "time_per_iteration": 2.481919527053833 + }, + { + "auxiliary_loss_clip": 0.01123249, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.02472591, + "balance_loss_mlp": 1.0449152, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 1.8812259313043718, + "language_loss": 0.68482029, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70644343, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7089, + "time_per_iteration": 2.523918628692627 + }, + { + "auxiliary_loss_clip": 0.01121302, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.02473295, + "balance_loss_mlp": 1.0418849, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.8669230196035027, + "language_loss": 0.72897398, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75057483, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7090, + "time_per_iteration": 2.4340648651123047 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.03208125, + "balance_loss_mlp": 1.04372311, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.7953532910276222, + "language_loss": 0.75347531, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77517974, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7091, + "time_per_iteration": 2.5973541736602783 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.02188134, + "balance_loss_mlp": 1.04312468, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.6821401092021848, + "language_loss": 0.82308388, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84458697, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7092, + "time_per_iteration": 2.453181266784668 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.03138983, + "balance_loss_mlp": 1.0454514, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6505279256890275, + "language_loss": 0.73916072, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76087701, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 7093, + "time_per_iteration": 2.5176479816436768 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.02482259, + "balance_loss_mlp": 1.04376769, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.5315083588078555, + "language_loss": 0.69390249, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71550524, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 7094, + "time_per_iteration": 2.489561080932617 + }, + { + "auxiliary_loss_clip": 0.01122789, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.02379799, + "balance_loss_mlp": 1.04475617, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.6055215896501054, + "language_loss": 0.81466055, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83627033, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7095, + "time_per_iteration": 6.829655647277832 + }, + { + "auxiliary_loss_clip": 0.01122192, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02972341, + "balance_loss_mlp": 1.04453826, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.7098780852895776, + "language_loss": 0.80283463, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82450223, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7096, + "time_per_iteration": 3.894577980041504 + }, + { + "auxiliary_loss_clip": 0.01125109, + "auxiliary_loss_mlp": 0.01041794, + "balance_loss_clip": 1.02754259, + "balance_loss_mlp": 1.04520798, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.947200367016257, + "language_loss": 0.65628326, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67795235, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7097, + "time_per_iteration": 2.5192034244537354 + }, + { + "auxiliary_loss_clip": 0.01117089, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.04297018, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.8194330831870058, + "language_loss": 0.74512994, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76660931, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7098, + "time_per_iteration": 2.498380661010742 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.01969302, + "balance_loss_mlp": 1.04259086, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7218259388529535, + "language_loss": 0.75169343, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77320623, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7099, + "time_per_iteration": 2.4900684356689453 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.02458942, + "balance_loss_mlp": 1.0437479, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 1.9952935228943551, + "language_loss": 0.83543229, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85704881, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7100, + "time_per_iteration": 2.467902183532715 + }, + { + "auxiliary_loss_clip": 0.01121229, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.04333866, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3501788659102136, + "language_loss": 0.82243335, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84399146, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7101, + "time_per_iteration": 2.5363035202026367 + }, + { + "auxiliary_loss_clip": 0.01124462, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.01972795, + "balance_loss_mlp": 1.04426765, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8760573998828747, + "language_loss": 0.7243284, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74591374, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7102, + "time_per_iteration": 2.443894624710083 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02012062, + "balance_loss_mlp": 1.04262853, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.799822548331586, + "language_loss": 0.82910782, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85061657, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7103, + "time_per_iteration": 2.4751625061035156 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.02205122, + "balance_loss_mlp": 1.04319, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.0452515416159227, + "language_loss": 0.73823762, + "learning_rate": 2.561545446271294e-06, + "loss": 0.759835, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 7104, + "time_per_iteration": 2.433727264404297 + }, + { + "auxiliary_loss_clip": 0.01120598, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01842821, + "balance_loss_mlp": 1.04307532, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.0713947006575713, + "language_loss": 0.75097072, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77249593, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 7105, + "time_per_iteration": 2.553220748901367 + }, + { + "auxiliary_loss_clip": 0.01123627, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02499223, + "balance_loss_mlp": 1.04497468, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.944135826622959, + "language_loss": 0.7652669, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78688908, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7106, + "time_per_iteration": 2.4320499897003174 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02062345, + "balance_loss_mlp": 1.04073668, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7002032775641, + "language_loss": 0.79748225, + "learning_rate": 2.560423964592229e-06, + "loss": 0.81899506, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7107, + "time_per_iteration": 2.5138087272644043 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.04365969, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.5777370161888564, + "language_loss": 0.67986816, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70138133, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7108, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01118179, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02417231, + "balance_loss_mlp": 1.04141963, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.697941372596268, + "language_loss": 0.71379381, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73535079, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7109, + "time_per_iteration": 2.514293909072876 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.01668775, + "balance_loss_mlp": 1.04248762, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 1.808555345827523, + "language_loss": 0.64390564, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66543221, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7734375, + "step": 7110, + "time_per_iteration": 2.507896661758423 + }, + { + "auxiliary_loss_clip": 0.01121216, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01876426, + "balance_loss_mlp": 1.04310989, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6911252843933642, + "language_loss": 0.76596475, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78751141, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7111, + "time_per_iteration": 2.5065102577209473 + }, + { + "auxiliary_loss_clip": 0.01122655, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.02051497, + "balance_loss_mlp": 1.04446638, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.6101339491766522, + "language_loss": 0.73021042, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75178432, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7112, + "time_per_iteration": 2.462275266647339 + }, + { + "auxiliary_loss_clip": 0.0111568, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02527392, + "balance_loss_mlp": 1.04112434, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.5100904202471843, + "language_loss": 0.71723974, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.7387839, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7113, + "time_per_iteration": 2.517184019088745 + }, + { + "auxiliary_loss_clip": 0.01122905, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04463625, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 4.019227207544938, + "language_loss": 0.62055492, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64222896, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7114, + "time_per_iteration": 2.4808969497680664 + }, + { + "auxiliary_loss_clip": 0.01127351, + "auxiliary_loss_mlp": 0.01044357, + "balance_loss_clip": 1.02779305, + "balance_loss_mlp": 1.045439, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.7285817614937915, + "language_loss": 0.64558339, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66730046, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8203125, + "step": 7115, + "time_per_iteration": 2.4979755878448486 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02321672, + "balance_loss_mlp": 1.04225278, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.5459011503250888, + "language_loss": 0.7331425, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75469118, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7116, + "time_per_iteration": 2.4514083862304688 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02976263, + "balance_loss_mlp": 1.04102111, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.5398002166428786, + "language_loss": 0.69214165, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.7137208, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7117, + "time_per_iteration": 2.522881269454956 + }, + { + "auxiliary_loss_clip": 0.01122059, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.02657676, + "balance_loss_mlp": 1.04530859, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.268053258549222, + "language_loss": 0.69909632, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72072423, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7118, + "time_per_iteration": 2.3870341777801514 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.02948511, + "balance_loss_mlp": 1.04353809, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.1225989928468803, + "language_loss": 0.74740356, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76902699, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7119, + "time_per_iteration": 2.5487277507781982 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.02040291, + "balance_loss_mlp": 1.04196107, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.8104905013477006, + "language_loss": 0.74987411, + "learning_rate": 2.555562005426573e-06, + "loss": 0.7713939, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7120, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01120406, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.02321029, + "balance_loss_mlp": 1.04422045, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.6187265972443616, + "language_loss": 0.77002251, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.7915923, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7121, + "time_per_iteration": 2.4686522483825684 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.02299464, + "balance_loss_mlp": 1.04225755, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 1.8413618192799084, + "language_loss": 0.85525274, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87678635, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7122, + "time_per_iteration": 2.4149863719940186 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02088189, + "balance_loss_mlp": 1.04111362, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.6495062264118223, + "language_loss": 0.81354666, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83504236, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7123, + "time_per_iteration": 2.4846510887145996 + }, + { + "auxiliary_loss_clip": 0.01116497, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.04286349, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.6842679543274752, + "language_loss": 0.81069416, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83221602, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 7124, + "time_per_iteration": 2.477781057357788 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02485132, + "balance_loss_mlp": 1.04072952, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 7.024350858631177, + "language_loss": 0.80178392, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82334554, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 7125, + "time_per_iteration": 2.466099262237549 + }, + { + "auxiliary_loss_clip": 0.01117521, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.0230993, + "balance_loss_mlp": 1.04386544, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.7536027507395449, + "language_loss": 0.74772543, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76925719, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7126, + "time_per_iteration": 2.4476282596588135 + }, + { + "auxiliary_loss_clip": 0.01118141, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.02313387, + "balance_loss_mlp": 1.04261374, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 2.2527301233175496, + "language_loss": 0.81376731, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83532357, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7127, + "time_per_iteration": 2.50627064704895 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.02554011, + "balance_loss_mlp": 1.04140556, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.7148593982179101, + "language_loss": 0.76451397, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78608435, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7128, + "time_per_iteration": 2.4261910915374756 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.02018988, + "balance_loss_mlp": 1.04154372, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.979642374109765, + "language_loss": 0.74111116, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76265121, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7129, + "time_per_iteration": 2.4977691173553467 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02102661, + "balance_loss_mlp": 1.04335773, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.7995906720856931, + "language_loss": 0.77753568, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79907238, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7130, + "time_per_iteration": 2.4983179569244385 + }, + { + "auxiliary_loss_clip": 0.01123055, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04523921, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 1.8571755273934152, + "language_loss": 0.7349695, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75659359, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7131, + "time_per_iteration": 2.5469563007354736 + }, + { + "auxiliary_loss_clip": 0.01120536, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04343748, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.0596069487020268, + "language_loss": 0.76299751, + "learning_rate": 2.551070882366973e-06, + "loss": 0.78456992, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7132, + "time_per_iteration": 2.432889223098755 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.02821565, + "balance_loss_mlp": 1.04352558, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.5221162096651724, + "language_loss": 0.78525162, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80687612, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 7133, + "time_per_iteration": 2.544379472732544 + }, + { + "auxiliary_loss_clip": 0.01120837, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02484238, + "balance_loss_mlp": 1.04305482, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.8479371259746051, + "language_loss": 0.75017452, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77177012, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7134, + "time_per_iteration": 2.416792154312134 + }, + { + "auxiliary_loss_clip": 0.01114501, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01937902, + "balance_loss_mlp": 1.04046178, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 2.2902258120670975, + "language_loss": 0.84066433, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86212909, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7135, + "time_per_iteration": 2.4513847827911377 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.02250218, + "balance_loss_mlp": 1.04050052, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.9123929145525593, + "language_loss": 0.74716437, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76866877, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7136, + "time_per_iteration": 2.5260956287384033 + }, + { + "auxiliary_loss_clip": 0.01117454, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.0243752, + "balance_loss_mlp": 1.04027987, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.9374198184766858, + "language_loss": 0.78982937, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81138408, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7137, + "time_per_iteration": 6.664285898208618 + }, + { + "auxiliary_loss_clip": 0.01123569, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.02053773, + "balance_loss_mlp": 1.04498768, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.8145904182691066, + "language_loss": 0.76599205, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78757715, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7138, + "time_per_iteration": 2.4640390872955322 + }, + { + "auxiliary_loss_clip": 0.01043511, + "auxiliary_loss_mlp": 0.01001663, + "balance_loss_clip": 1.00025678, + "balance_loss_mlp": 1.02006102, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7743592729173089, + "language_loss": 0.56193811, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58238983, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.234375, + "step": 7139, + "time_per_iteration": 2.938645362854004 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.02957499, + "balance_loss_mlp": 1.04185057, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6343660010586272, + "language_loss": 0.81107223, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83262885, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7140, + "time_per_iteration": 2.4621551036834717 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01819944, + "balance_loss_mlp": 1.04155135, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.7453668118354997, + "language_loss": 0.81973499, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84123254, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7141, + "time_per_iteration": 2.4552011489868164 + }, + { + "auxiliary_loss_clip": 0.011238, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.04469872, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.6365702711839187, + "language_loss": 0.86302745, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88466609, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7142, + "time_per_iteration": 2.466599464416504 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.0205493, + "balance_loss_mlp": 1.04692888, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.8779834210446977, + "language_loss": 0.78367496, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80520082, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7143, + "time_per_iteration": 2.528383493423462 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.02731538, + "balance_loss_mlp": 1.04566526, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.185103050312315, + "language_loss": 0.76671416, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78832245, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7144, + "time_per_iteration": 2.4433047771453857 + }, + { + "auxiliary_loss_clip": 0.01119183, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.01801622, + "balance_loss_mlp": 1.043118, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 2.969999234773645, + "language_loss": 0.73481476, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75632453, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7145, + "time_per_iteration": 2.5330073833465576 + }, + { + "auxiliary_loss_clip": 0.01120569, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.02447844, + "balance_loss_mlp": 1.04405165, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.854643653820381, + "language_loss": 0.78928959, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81088066, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7146, + "time_per_iteration": 2.4481821060180664 + }, + { + "auxiliary_loss_clip": 0.01116396, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.01590514, + "balance_loss_mlp": 1.04295409, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 1.9767254736067894, + "language_loss": 0.83134973, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85279846, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7147, + "time_per_iteration": 2.500633478164673 + }, + { + "auxiliary_loss_clip": 0.01124897, + "auxiliary_loss_mlp": 0.01039853, + "balance_loss_clip": 1.02473724, + "balance_loss_mlp": 1.04802537, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.8398177405042841, + "language_loss": 0.86894512, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89059258, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 7148, + "time_per_iteration": 2.481743097305298 + }, + { + "auxiliary_loss_clip": 0.01120854, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02405918, + "balance_loss_mlp": 1.04469061, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5258683369520107, + "language_loss": 0.77855921, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80013508, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7149, + "time_per_iteration": 2.6060431003570557 + }, + { + "auxiliary_loss_clip": 0.01118454, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02247977, + "balance_loss_mlp": 1.04456902, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.7047076849986806, + "language_loss": 0.79828095, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81983018, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7150, + "time_per_iteration": 2.4652955532073975 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02763176, + "balance_loss_mlp": 1.0467031, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.7972230644563891, + "language_loss": 0.74738395, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.76907349, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 7151, + "time_per_iteration": 2.5019421577453613 + }, + { + "auxiliary_loss_clip": 0.0112419, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02284098, + "balance_loss_mlp": 1.0458225, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.924911798883302, + "language_loss": 0.70084447, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72245419, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 7152, + "time_per_iteration": 2.456465482711792 + }, + { + "auxiliary_loss_clip": 0.01117938, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.02268243, + "balance_loss_mlp": 1.04186821, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.5367633238023177, + "language_loss": 0.71064591, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73218524, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7153, + "time_per_iteration": 2.6120920181274414 + }, + { + "auxiliary_loss_clip": 0.01120146, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01941729, + "balance_loss_mlp": 1.04342091, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.8794751780958798, + "language_loss": 0.79155993, + "learning_rate": 2.542829359113276e-06, + "loss": 0.81309307, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7154, + "time_per_iteration": 2.4222962856292725 + }, + { + "auxiliary_loss_clip": 0.0111738, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01818347, + "balance_loss_mlp": 1.04361236, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.4801057977091479, + "language_loss": 0.78793395, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80941343, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 7155, + "time_per_iteration": 2.4554193019866943 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01582694, + "balance_loss_mlp": 1.04335082, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.7176839192841982, + "language_loss": 0.88779187, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90925157, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7156, + "time_per_iteration": 2.446831464767456 + }, + { + "auxiliary_loss_clip": 0.01120931, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.01953602, + "balance_loss_mlp": 1.04361558, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.9517774058288286, + "language_loss": 0.82738447, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84893334, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7157, + "time_per_iteration": 2.5298144817352295 + }, + { + "auxiliary_loss_clip": 0.01124397, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04532623, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.8458285726729726, + "language_loss": 0.72177351, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74337494, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7158, + "time_per_iteration": 2.4691712856292725 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01796317, + "balance_loss_mlp": 1.04215837, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.077812294320108, + "language_loss": 0.82865965, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85015261, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7159, + "time_per_iteration": 2.4462857246398926 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.01887655, + "balance_loss_mlp": 1.04236865, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.094804075931644, + "language_loss": 0.83043528, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85193908, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7160, + "time_per_iteration": 2.587928533554077 + }, + { + "auxiliary_loss_clip": 0.01123066, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.02249885, + "balance_loss_mlp": 1.04402685, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 3.027641474238522, + "language_loss": 0.77379316, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79540545, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.79296875, + "step": 7161, + "time_per_iteration": 2.502628803253174 + }, + { + "auxiliary_loss_clip": 0.01119327, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.04304039, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.05136398687674, + "language_loss": 0.73137891, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75291681, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7162, + "time_per_iteration": 2.439053773880005 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01005692, + "balance_loss_clip": 1.00411832, + "balance_loss_mlp": 1.01966858, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7926335078551056, + "language_loss": 0.59016478, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61064959, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.23046875, + "step": 7163, + "time_per_iteration": 2.9588072299957275 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.02299142, + "balance_loss_mlp": 1.04035139, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6277980092745115, + "language_loss": 0.79140532, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81291205, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7164, + "time_per_iteration": 2.484001398086548 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04078794, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.8180486110770353, + "language_loss": 0.67282438, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69442934, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 7165, + "time_per_iteration": 2.533599376678467 + }, + { + "auxiliary_loss_clip": 0.01119036, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.03069651, + "balance_loss_mlp": 1.04327762, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.850302447549428, + "language_loss": 0.75248688, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77411151, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7578125, + "step": 7166, + "time_per_iteration": 2.439861536026001 + }, + { + "auxiliary_loss_clip": 0.01117069, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.0239172, + "balance_loss_mlp": 1.04362941, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.612504951400803, + "language_loss": 0.71537554, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73690969, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7167, + "time_per_iteration": 2.499190092086792 + }, + { + "auxiliary_loss_clip": 0.01111616, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.02232647, + "balance_loss_mlp": 1.03984129, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.6022700342177734, + "language_loss": 0.78459173, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80605787, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7168, + "time_per_iteration": 2.4372310638427734 + }, + { + "auxiliary_loss_clip": 0.01118326, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.02733326, + "balance_loss_mlp": 1.04224193, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.3121674941994383, + "language_loss": 0.82260263, + "learning_rate": 2.537204417416387e-06, + "loss": 0.8441996, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7169, + "time_per_iteration": 2.4545183181762695 + }, + { + "auxiliary_loss_clip": 0.01038578, + "auxiliary_loss_mlp": 0.01010207, + "balance_loss_clip": 1.00865698, + "balance_loss_mlp": 1.0153358, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6800543146405372, + "language_loss": 0.60812157, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62860942, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23242188, + "step": 7170, + "time_per_iteration": 3.2204582691192627 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02039874, + "balance_loss_mlp": 1.04148889, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 2.0659828341911615, + "language_loss": 0.76225841, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78375715, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7171, + "time_per_iteration": 2.465665817260742 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.02172232, + "balance_loss_mlp": 1.04197574, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.6834410044967325, + "language_loss": 0.77283418, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.7943542, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7172, + "time_per_iteration": 2.4916739463806152 + }, + { + "auxiliary_loss_clip": 0.0111787, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.02544653, + "balance_loss_mlp": 1.04015696, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.7953579135961333, + "language_loss": 0.76852405, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79010069, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7173, + "time_per_iteration": 2.4764745235443115 + }, + { + "auxiliary_loss_clip": 0.01115542, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01571512, + "balance_loss_mlp": 1.04070854, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.4568106417702447, + "language_loss": 0.77103329, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79247946, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7174, + "time_per_iteration": 2.4860222339630127 + }, + { + "auxiliary_loss_clip": 0.01119703, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.01815498, + "balance_loss_mlp": 1.04199743, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.4198827217143106, + "language_loss": 0.82505399, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84658062, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7175, + "time_per_iteration": 2.462977647781372 + }, + { + "auxiliary_loss_clip": 0.01121086, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.0223192, + "balance_loss_mlp": 1.04153752, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.338333143716513, + "language_loss": 0.74985862, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77143705, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 7176, + "time_per_iteration": 2.4185218811035156 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.020643, + "balance_loss_mlp": 1.03969014, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.6024853029290826, + "language_loss": 0.73364419, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75516164, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7177, + "time_per_iteration": 2.487114667892456 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.0264492, + "balance_loss_mlp": 1.04060507, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.878519248272382, + "language_loss": 0.81681836, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83848649, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 7178, + "time_per_iteration": 2.443887948989868 + }, + { + "auxiliary_loss_clip": 0.01113093, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.01988733, + "balance_loss_mlp": 1.04052329, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4541906286028654, + "language_loss": 0.83824348, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.8597073, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7179, + "time_per_iteration": 5.329441547393799 + }, + { + "auxiliary_loss_clip": 0.01114931, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.02040303, + "balance_loss_mlp": 1.03945267, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 2.045990303945265, + "language_loss": 0.75710779, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77859473, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7180, + "time_per_iteration": 2.5520315170288086 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.0240128, + "balance_loss_mlp": 1.03862667, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.7639080321754919, + "language_loss": 0.81907403, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84062529, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7181, + "time_per_iteration": 2.4059271812438965 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.02357888, + "balance_loss_mlp": 1.04020417, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.5777864051255721, + "language_loss": 0.88434547, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90591776, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7182, + "time_per_iteration": 2.463463306427002 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.01873803, + "balance_loss_mlp": 1.03918862, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.70694658333996, + "language_loss": 0.75826657, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77972138, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7183, + "time_per_iteration": 2.4562740325927734 + }, + { + "auxiliary_loss_clip": 0.01116225, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.01819921, + "balance_loss_mlp": 1.03917336, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 2.311500131527462, + "language_loss": 0.77666485, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79814982, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7184, + "time_per_iteration": 2.5283145904541016 + }, + { + "auxiliary_loss_clip": 0.01110208, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02065074, + "balance_loss_mlp": 1.03938413, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.5490664406704935, + "language_loss": 0.73325193, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75469285, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 7185, + "time_per_iteration": 2.520885467529297 + }, + { + "auxiliary_loss_clip": 0.01118704, + "auxiliary_loss_mlp": 0.01037072, + "balance_loss_clip": 1.02283251, + "balance_loss_mlp": 1.03961062, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.5540588454326, + "language_loss": 0.75974178, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78129953, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 7186, + "time_per_iteration": 2.5005605220794678 + }, + { + "auxiliary_loss_clip": 0.01116031, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.02164185, + "balance_loss_mlp": 1.03987479, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 5.067701176656461, + "language_loss": 0.76043296, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78194571, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7187, + "time_per_iteration": 2.4769227504730225 + }, + { + "auxiliary_loss_clip": 0.0103801, + "auxiliary_loss_mlp": 0.00999247, + "balance_loss_clip": 0.99792367, + "balance_loss_mlp": 1.0145607, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8526585096921939, + "language_loss": 0.68180382, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70217645, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.234375, + "step": 7188, + "time_per_iteration": 3.095301389694214 + }, + { + "auxiliary_loss_clip": 0.01112959, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03992498, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.742468102969242, + "language_loss": 0.7809816, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80246753, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7189, + "time_per_iteration": 2.4332470893859863 + }, + { + "auxiliary_loss_clip": 0.01118752, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.0263027, + "balance_loss_mlp": 1.03817415, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.8713383629003246, + "language_loss": 0.7119785, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73357898, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 7190, + "time_per_iteration": 2.494537115097046 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02275729, + "balance_loss_mlp": 1.0380528, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.5245278530879214, + "language_loss": 0.79833174, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81982064, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7191, + "time_per_iteration": 2.478376865386963 + }, + { + "auxiliary_loss_clip": 0.01113503, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.020944, + "balance_loss_mlp": 1.03872573, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.7647822638177795, + "language_loss": 0.74647141, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76794595, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7192, + "time_per_iteration": 2.4613609313964844 + }, + { + "auxiliary_loss_clip": 0.011176, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.02696347, + "balance_loss_mlp": 1.04183233, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 2.014554632256561, + "language_loss": 0.78898597, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81057584, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7193, + "time_per_iteration": 2.4220309257507324 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.02609015, + "balance_loss_mlp": 1.0394038, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.7200377707292065, + "language_loss": 0.75406849, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77562475, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7194, + "time_per_iteration": 2.466512441635132 + }, + { + "auxiliary_loss_clip": 0.01117198, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02435398, + "balance_loss_mlp": 1.04108119, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 5.005212308773382, + "language_loss": 0.60044503, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62199533, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7195, + "time_per_iteration": 2.4522454738616943 + }, + { + "auxiliary_loss_clip": 0.0111962, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02713561, + "balance_loss_mlp": 1.04041934, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.2806268233026628, + "language_loss": 0.64930809, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67092311, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 7196, + "time_per_iteration": 2.4453718662261963 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.02024436, + "balance_loss_mlp": 1.04024911, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 4.696072713783665, + "language_loss": 0.72759318, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74914396, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 7197, + "time_per_iteration": 2.500256061553955 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.03106129, + "balance_loss_mlp": 1.04246271, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.598666024351184, + "language_loss": 0.72644413, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.7480582, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 7198, + "time_per_iteration": 2.567762613296509 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03913903, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3766106050597056, + "language_loss": 0.81292808, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83442813, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7199, + "time_per_iteration": 2.4782636165618896 + }, + { + "auxiliary_loss_clip": 0.01118715, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02829099, + "balance_loss_mlp": 1.04219055, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.2182298419994346, + "language_loss": 0.68883061, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71043384, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 7200, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.02809453, + "balance_loss_mlp": 1.04055738, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 2.134839210265846, + "language_loss": 0.87135142, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89296097, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7201, + "time_per_iteration": 2.47463321685791 + }, + { + "auxiliary_loss_clip": 0.01125345, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_clip": 1.02696979, + "balance_loss_mlp": 1.04488945, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 2.16649852661544, + "language_loss": 0.64551014, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66718936, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 7202, + "time_per_iteration": 2.520963668823242 + }, + { + "auxiliary_loss_clip": 0.0111734, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01931047, + "balance_loss_mlp": 1.04092193, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.7838197935762699, + "language_loss": 0.81707418, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83856463, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.765625, + "step": 7203, + "time_per_iteration": 2.474724531173706 + }, + { + "auxiliary_loss_clip": 0.01121178, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.03284955, + "balance_loss_mlp": 1.04118741, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.864866510083204, + "language_loss": 0.81476939, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83645213, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 7204, + "time_per_iteration": 2.527064323425293 + }, + { + "auxiliary_loss_clip": 0.01117221, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02226007, + "balance_loss_mlp": 1.04050207, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.78968083236078, + "language_loss": 0.73432428, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75585294, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7205, + "time_per_iteration": 2.406350612640381 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.02414668, + "balance_loss_mlp": 1.04308569, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.6284714357196102, + "language_loss": 0.75110108, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77264041, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7206, + "time_per_iteration": 2.527343511581421 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02247548, + "balance_loss_mlp": 1.03899562, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 2.1762520186821854, + "language_loss": 0.78700626, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80852419, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7207, + "time_per_iteration": 2.4470536708831787 + }, + { + "auxiliary_loss_clip": 0.01121794, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02026772, + "balance_loss_mlp": 1.04215813, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.6893238531796995, + "language_loss": 0.81100202, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83257544, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 7208, + "time_per_iteration": 2.4634876251220703 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.04337454, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.072374936090108, + "language_loss": 0.70074689, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72228324, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7209, + "time_per_iteration": 2.4699575901031494 + }, + { + "auxiliary_loss_clip": 0.01113916, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.02102125, + "balance_loss_mlp": 1.0392952, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.533200118487429, + "language_loss": 0.81202382, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83351159, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7210, + "time_per_iteration": 2.5462334156036377 + }, + { + "auxiliary_loss_clip": 0.01114494, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.02280319, + "balance_loss_mlp": 1.03895545, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7483210767520514, + "language_loss": 0.81570554, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.83721387, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7211, + "time_per_iteration": 2.4835634231567383 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.02150583, + "balance_loss_mlp": 1.03778863, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 2.083548110229539, + "language_loss": 0.74785221, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76932836, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7212, + "time_per_iteration": 2.492600917816162 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.01990747, + "balance_loss_mlp": 1.04162407, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.6640529640233686, + "language_loss": 0.76755834, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78905809, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7213, + "time_per_iteration": 2.4060752391815186 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02070832, + "balance_loss_mlp": 1.03933454, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.5718517519296942, + "language_loss": 0.64949977, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67098659, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7214, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.01110495, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.02173781, + "balance_loss_mlp": 1.03869057, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.5916808794412316, + "language_loss": 0.71483207, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73628008, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7215, + "time_per_iteration": 2.5099971294403076 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02224684, + "balance_loss_mlp": 1.04080701, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 2.1029551712935692, + "language_loss": 0.7531544, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77464819, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7216, + "time_per_iteration": 2.496631383895874 + }, + { + "auxiliary_loss_clip": 0.01124083, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.02873421, + "balance_loss_mlp": 1.04232287, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.4885665438006086, + "language_loss": 0.75943911, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78112465, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 7217, + "time_per_iteration": 2.4563424587249756 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.02048075, + "balance_loss_mlp": 1.04149795, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 2.0012841708103677, + "language_loss": 0.73723286, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.7587418, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7218, + "time_per_iteration": 2.5055034160614014 + }, + { + "auxiliary_loss_clip": 0.01119586, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.01706386, + "balance_loss_mlp": 1.0420804, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.7121326309499156, + "language_loss": 0.68759704, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.7090981, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7219, + "time_per_iteration": 2.4480419158935547 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.02088046, + "balance_loss_mlp": 1.0424881, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.5876624694807844, + "language_loss": 0.77227521, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79378843, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7220, + "time_per_iteration": 6.918288230895996 + }, + { + "auxiliary_loss_clip": 0.01116062, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02162933, + "balance_loss_mlp": 1.04022503, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.9118836764348202, + "language_loss": 0.69684327, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71835566, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7221, + "time_per_iteration": 2.470270872116089 + }, + { + "auxiliary_loss_clip": 0.0111827, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04102325, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 2.3043912227088206, + "language_loss": 0.64915985, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67071712, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7222, + "time_per_iteration": 2.553450584411621 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.01851892, + "balance_loss_mlp": 1.0400629, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 1.98015103861908, + "language_loss": 0.73039752, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75188196, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 7223, + "time_per_iteration": 2.4311954975128174 + }, + { + "auxiliary_loss_clip": 0.01117336, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.01639247, + "balance_loss_mlp": 1.03914881, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.7516175042559776, + "language_loss": 0.93677819, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95825702, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 7224, + "time_per_iteration": 2.5507140159606934 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.02230883, + "balance_loss_mlp": 1.04143298, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.4456333860398556, + "language_loss": 0.61234355, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63386333, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7225, + "time_per_iteration": 2.4982893466949463 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02594388, + "balance_loss_mlp": 1.04326594, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.8262630970377216, + "language_loss": 0.77771807, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79930449, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75390625, + "step": 7226, + "time_per_iteration": 2.5427355766296387 + }, + { + "auxiliary_loss_clip": 0.0111488, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.01968753, + "balance_loss_mlp": 1.04169869, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.6421213218207402, + "language_loss": 0.84485722, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8663345, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7227, + "time_per_iteration": 2.450390577316284 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04135513, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.0443971193166735, + "language_loss": 0.76866895, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79018396, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7228, + "time_per_iteration": 2.5690906047821045 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02229989, + "balance_loss_mlp": 1.04278994, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.4832672479414948, + "language_loss": 0.80732882, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82887214, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7229, + "time_per_iteration": 2.552069902420044 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.02682161, + "balance_loss_mlp": 1.04213512, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 2.091517296377785, + "language_loss": 0.81964421, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84123534, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7230, + "time_per_iteration": 2.5944671630859375 + }, + { + "auxiliary_loss_clip": 0.01123399, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.03034186, + "balance_loss_mlp": 1.0445168, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.146338977702966, + "language_loss": 0.77091062, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79258955, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 7231, + "time_per_iteration": 2.460886001586914 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.02217722, + "balance_loss_mlp": 1.0421958, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.391615561962781, + "language_loss": 0.6858201, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70733297, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7232, + "time_per_iteration": 2.614415407180786 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02036333, + "balance_loss_mlp": 1.04160166, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6911603415584286, + "language_loss": 0.7200706, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74161285, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7233, + "time_per_iteration": 2.5665411949157715 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.02490747, + "balance_loss_mlp": 1.04198027, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.536262058034198, + "language_loss": 0.746382, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.7679894, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 7234, + "time_per_iteration": 2.577014207839966 + }, + { + "auxiliary_loss_clip": 0.01123093, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.02523136, + "balance_loss_mlp": 1.04223037, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 1.829117772001415, + "language_loss": 0.58860987, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61023784, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 7235, + "time_per_iteration": 2.4759557247161865 + }, + { + "auxiliary_loss_clip": 0.01116416, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01931453, + "balance_loss_mlp": 1.04053211, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.4942606531447196, + "language_loss": 0.7751596, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79664838, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7236, + "time_per_iteration": 2.6113193035125732 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.01958489, + "balance_loss_mlp": 1.04001045, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.713978383195529, + "language_loss": 0.8155449, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83701491, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7237, + "time_per_iteration": 2.4341909885406494 + }, + { + "auxiliary_loss_clip": 0.01116801, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02109957, + "balance_loss_mlp": 1.04103971, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 3.0219595130639156, + "language_loss": 0.62897265, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65049648, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7238, + "time_per_iteration": 2.5014469623565674 + }, + { + "auxiliary_loss_clip": 0.01111642, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.02848005, + "balance_loss_mlp": 1.03874493, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.5839613956475427, + "language_loss": 0.85889554, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88042951, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7239, + "time_per_iteration": 2.4976143836975098 + }, + { + "auxiliary_loss_clip": 0.01118679, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.01896167, + "balance_loss_mlp": 1.0434041, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.4715329043565741, + "language_loss": 0.7269268, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74843925, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7240, + "time_per_iteration": 2.5350124835968018 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.01897597, + "balance_loss_mlp": 1.04185855, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.6878068305061695, + "language_loss": 0.81562793, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83716333, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7241, + "time_per_iteration": 2.4924368858337402 + }, + { + "auxiliary_loss_clip": 0.01125084, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02636945, + "balance_loss_mlp": 1.04387474, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 3.067853888150903, + "language_loss": 0.79639387, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81806338, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 7242, + "time_per_iteration": 2.4884228706359863 + }, + { + "auxiliary_loss_clip": 0.01118288, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.02146733, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.2924190339180135, + "language_loss": 0.6872946, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70883644, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 7243, + "time_per_iteration": 2.428065538406372 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.01814318, + "balance_loss_mlp": 1.04141152, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6975937608840317, + "language_loss": 0.8125546, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83400726, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7244, + "time_per_iteration": 2.4931905269622803 + }, + { + "auxiliary_loss_clip": 0.01118248, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01785159, + "balance_loss_mlp": 1.0428431, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.7229772693729426, + "language_loss": 0.74017537, + "learning_rate": 2.508635271753234e-06, + "loss": 0.7616663, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7245, + "time_per_iteration": 2.4678800106048584 + }, + { + "auxiliary_loss_clip": 0.01116663, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.0248003, + "balance_loss_mlp": 1.041008, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.577710817669204, + "language_loss": 0.7671771, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78872424, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 7246, + "time_per_iteration": 2.5109541416168213 + }, + { + "auxiliary_loss_clip": 0.01118541, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02348995, + "balance_loss_mlp": 1.04209638, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.7904357433283469, + "language_loss": 0.85364228, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87520564, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7247, + "time_per_iteration": 2.4546074867248535 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.02600694, + "balance_loss_mlp": 1.0420599, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.5214849587217785, + "language_loss": 0.72576565, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74732977, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7248, + "time_per_iteration": 2.5288567543029785 + }, + { + "auxiliary_loss_clip": 0.01117005, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.02280688, + "balance_loss_mlp": 1.04225719, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.6049303411594007, + "language_loss": 0.87276042, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.8942951, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7249, + "time_per_iteration": 2.497281312942505 + }, + { + "auxiliary_loss_clip": 0.0111866, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.03053117, + "balance_loss_mlp": 1.04112244, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.835450546624213, + "language_loss": 0.81989753, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84152383, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7250, + "time_per_iteration": 2.5563321113586426 + }, + { + "auxiliary_loss_clip": 0.01124846, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.02484369, + "balance_loss_mlp": 1.04729581, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.737362510880261, + "language_loss": 0.84760177, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86923766, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7251, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01113729, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02469254, + "balance_loss_mlp": 1.03979266, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.5112002334274994, + "language_loss": 0.69018251, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71170568, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7252, + "time_per_iteration": 2.5041210651397705 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.02233779, + "balance_loss_mlp": 1.04257536, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.7846888638519947, + "language_loss": 0.83733922, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85886061, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7253, + "time_per_iteration": 2.434375047683716 + }, + { + "auxiliary_loss_clip": 0.01116361, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.02386165, + "balance_loss_mlp": 1.04254532, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4489781171091827, + "language_loss": 0.70361209, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72515762, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.73828125, + "step": 7254, + "time_per_iteration": 2.5304319858551025 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02503693, + "balance_loss_mlp": 1.04295266, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.261189856456705, + "language_loss": 0.80833256, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.82988203, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7255, + "time_per_iteration": 2.4619336128234863 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.0402987, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.6623402785544918, + "language_loss": 0.77301329, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79454327, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7256, + "time_per_iteration": 2.502201557159424 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02568853, + "balance_loss_mlp": 1.04400241, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.8521029690454978, + "language_loss": 0.76273203, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78429782, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7257, + "time_per_iteration": 2.4721548557281494 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02553642, + "balance_loss_mlp": 1.04027009, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.675034420512285, + "language_loss": 0.73065001, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75219941, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7258, + "time_per_iteration": 2.5251166820526123 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.02083004, + "balance_loss_mlp": 1.04163384, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 2.491243867162561, + "language_loss": 0.76496607, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78648162, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7259, + "time_per_iteration": 2.4948387145996094 + }, + { + "auxiliary_loss_clip": 0.01038123, + "auxiliary_loss_mlp": 0.01006149, + "balance_loss_clip": 1.00483215, + "balance_loss_mlp": 1.01505399, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7446610885032177, + "language_loss": 0.570382, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59082472, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.23144531, + "step": 7260, + "time_per_iteration": 3.023712396621704 + }, + { + "auxiliary_loss_clip": 0.01119405, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.03061068, + "balance_loss_mlp": 1.0423255, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.013500079504657, + "language_loss": 0.71356845, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.7352109, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 7261, + "time_per_iteration": 2.559830665588379 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.04076374, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 1.767533570577482, + "language_loss": 0.69423878, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71586561, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7262, + "time_per_iteration": 5.4921791553497314 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01699996, + "balance_loss_mlp": 1.04062569, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.7128833789230435, + "language_loss": 0.80033064, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82173395, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7263, + "time_per_iteration": 2.5026779174804688 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.03210425, + "balance_loss_mlp": 1.043383, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.8087965620474522, + "language_loss": 0.75092399, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77256304, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7264, + "time_per_iteration": 2.487065553665161 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02143478, + "balance_loss_mlp": 1.04089546, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 1.8571442110240568, + "language_loss": 0.61855227, + "learning_rate": 2.501098303852298e-06, + "loss": 0.6400522, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 7265, + "time_per_iteration": 2.5982677936553955 + }, + { + "auxiliary_loss_clip": 0.01112809, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01859391, + "balance_loss_mlp": 1.04026711, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.1628188735926845, + "language_loss": 0.72982574, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75126845, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7266, + "time_per_iteration": 2.4690847396850586 + }, + { + "auxiliary_loss_clip": 0.0111929, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.0240345, + "balance_loss_mlp": 1.04332638, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.2896909207829954, + "language_loss": 0.81570059, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.83727205, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7267, + "time_per_iteration": 2.463283061981201 + }, + { + "auxiliary_loss_clip": 0.01113248, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.01803577, + "balance_loss_mlp": 1.04085267, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.9116109849221483, + "language_loss": 0.74723095, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76866794, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 7268, + "time_per_iteration": 2.516263723373413 + }, + { + "auxiliary_loss_clip": 0.01121105, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04315591, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 1.9119374296408282, + "language_loss": 0.7954827, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81711417, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7269, + "time_per_iteration": 2.4647111892700195 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.02404499, + "balance_loss_mlp": 1.04315174, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 2.072373926876921, + "language_loss": 0.75031221, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77185863, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7270, + "time_per_iteration": 2.4963974952697754 + }, + { + "auxiliary_loss_clip": 0.01116927, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.01639485, + "balance_loss_mlp": 1.04269087, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.906091328168401, + "language_loss": 0.79437554, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81584334, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7271, + "time_per_iteration": 2.487238645553589 + }, + { + "auxiliary_loss_clip": 0.01039832, + "auxiliary_loss_mlp": 0.01001038, + "balance_loss_clip": 0.99965489, + "balance_loss_mlp": 1.01678514, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6948313241096988, + "language_loss": 0.54902828, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56943697, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.23046875, + "step": 7272, + "time_per_iteration": 3.1392502784729004 + }, + { + "auxiliary_loss_clip": 0.011197, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0286535, + "balance_loss_mlp": 1.04332781, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 2.967819772960297, + "language_loss": 0.70136559, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72299063, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7273, + "time_per_iteration": 2.468592643737793 + }, + { + "auxiliary_loss_clip": 0.01119234, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02515411, + "balance_loss_mlp": 1.04280722, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.832145479464728, + "language_loss": 0.75091398, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77249801, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7274, + "time_per_iteration": 2.669516086578369 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01741123, + "balance_loss_mlp": 1.04002881, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.8126381729021082, + "language_loss": 0.80507416, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82649636, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7265625, + "step": 7275, + "time_per_iteration": 2.455235481262207 + }, + { + "auxiliary_loss_clip": 0.01118348, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.02357626, + "balance_loss_mlp": 1.04496706, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.065941875742038, + "language_loss": 0.80955482, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83110607, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7276, + "time_per_iteration": 2.543306827545166 + }, + { + "auxiliary_loss_clip": 0.01122471, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 1.04409111, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.794283698167311, + "language_loss": 0.73373604, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75530994, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7277, + "time_per_iteration": 2.5931403636932373 + }, + { + "auxiliary_loss_clip": 0.01117806, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.0222764, + "balance_loss_mlp": 1.04351854, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.8969119275678887, + "language_loss": 0.72953606, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75107086, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7278, + "time_per_iteration": 2.576266288757324 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.04212785, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.6273415021791042, + "language_loss": 0.65815622, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.6796481, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7279, + "time_per_iteration": 2.4717864990234375 + }, + { + "auxiliary_loss_clip": 0.01122391, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02098393, + "balance_loss_mlp": 1.04393768, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.838486718423984, + "language_loss": 0.82088757, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84245551, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7280, + "time_per_iteration": 2.5370771884918213 + }, + { + "auxiliary_loss_clip": 0.01114089, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.0209589, + "balance_loss_mlp": 1.04176164, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.430381072646336, + "language_loss": 0.76786566, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.78934562, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 7281, + "time_per_iteration": 2.5260467529296875 + }, + { + "auxiliary_loss_clip": 0.01116043, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02582431, + "balance_loss_mlp": 1.04211211, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.8435972134321474, + "language_loss": 0.7572853, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77883214, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 7282, + "time_per_iteration": 2.5332953929901123 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01039188, + "balance_loss_clip": 1.02581239, + "balance_loss_mlp": 1.04421043, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.8874106414487752, + "language_loss": 0.8494271, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87100983, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7283, + "time_per_iteration": 2.458500623703003 + }, + { + "auxiliary_loss_clip": 0.01119709, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.02060771, + "balance_loss_mlp": 1.04216719, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.9095323636494845, + "language_loss": 0.8005324, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82207501, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7284, + "time_per_iteration": 2.5258796215057373 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01040827, + "balance_loss_clip": 1.02851903, + "balance_loss_mlp": 1.04236269, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.535068058496724, + "language_loss": 0.8028115, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82437444, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7285, + "time_per_iteration": 2.4441394805908203 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.01576853, + "balance_loss_mlp": 1.04086363, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.9937836479025883, + "language_loss": 0.75031531, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77174133, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7286, + "time_per_iteration": 2.539954423904419 + }, + { + "auxiliary_loss_clip": 0.01118753, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01642346, + "balance_loss_mlp": 1.04179096, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.7090844157721894, + "language_loss": 0.73834682, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75983447, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7287, + "time_per_iteration": 2.5056257247924805 + }, + { + "auxiliary_loss_clip": 0.01117808, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.02887869, + "balance_loss_mlp": 1.04187727, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.8325493621162303, + "language_loss": 0.82288051, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84448457, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7288, + "time_per_iteration": 2.4812850952148438 + }, + { + "auxiliary_loss_clip": 0.01117047, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02008474, + "balance_loss_mlp": 1.03895211, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.476666560822241, + "language_loss": 0.84346598, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86497366, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7289, + "time_per_iteration": 2.482379674911499 + }, + { + "auxiliary_loss_clip": 0.01119976, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.0235244, + "balance_loss_mlp": 1.04139173, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.4352131560569001, + "language_loss": 0.78107727, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80264366, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.78515625, + "step": 7290, + "time_per_iteration": 2.5521459579467773 + }, + { + "auxiliary_loss_clip": 0.01115969, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.02761197, + "balance_loss_mlp": 1.04235792, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 3.384239132873348, + "language_loss": 0.77987993, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80144495, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7291, + "time_per_iteration": 2.512519121170044 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.01792359, + "balance_loss_mlp": 1.04297888, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.5428221976657872, + "language_loss": 0.65224636, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67373765, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7292, + "time_per_iteration": 2.597714424133301 + }, + { + "auxiliary_loss_clip": 0.0111598, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.03967905, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.884679810356821, + "language_loss": 0.74216962, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76363981, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.76171875, + "step": 7293, + "time_per_iteration": 2.4943923950195312 + }, + { + "auxiliary_loss_clip": 0.01119197, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.02732337, + "balance_loss_mlp": 1.04433274, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.4110491255972684, + "language_loss": 0.78757977, + "learning_rate": 2.490156230192516e-06, + "loss": 0.8091805, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7294, + "time_per_iteration": 2.495358467102051 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.01041568, + "balance_loss_clip": 1.02864015, + "balance_loss_mlp": 1.04313052, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.7229696907351246, + "language_loss": 0.73184276, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.7534548, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7295, + "time_per_iteration": 2.4645302295684814 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.02573109, + "balance_loss_mlp": 1.042575, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.059865438640582, + "language_loss": 0.75337231, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77495956, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 7296, + "time_per_iteration": 2.46444034576416 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.01976418, + "balance_loss_mlp": 1.04255402, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.6034841999072227, + "language_loss": 0.69515687, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71666169, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7297, + "time_per_iteration": 2.4995949268341064 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.01913857, + "balance_loss_mlp": 1.04173827, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.494373898338378, + "language_loss": 0.70457232, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72604382, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7298, + "time_per_iteration": 2.574982166290283 + }, + { + "auxiliary_loss_clip": 0.01117164, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.01672888, + "balance_loss_mlp": 1.04384279, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.5912334767066174, + "language_loss": 0.7241621, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74562919, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7299, + "time_per_iteration": 2.539013385772705 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.03278041, + "balance_loss_mlp": 1.043944, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.8082969607549542, + "language_loss": 0.77112591, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79280752, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76953125, + "step": 7300, + "time_per_iteration": 2.567291259765625 + }, + { + "auxiliary_loss_clip": 0.0111673, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.04064155, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6241879676388415, + "language_loss": 0.70685148, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72839439, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7301, + "time_per_iteration": 2.497025489807129 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04512143, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.911748384222125, + "language_loss": 0.70491576, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72647995, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 7302, + "time_per_iteration": 2.5212793350219727 + }, + { + "auxiliary_loss_clip": 0.011184, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.02676439, + "balance_loss_mlp": 1.04383337, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.741042450815644, + "language_loss": 0.82304549, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84462643, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7303, + "time_per_iteration": 2.5407814979553223 + }, + { + "auxiliary_loss_clip": 0.01123737, + "auxiliary_loss_mlp": 0.01042372, + "balance_loss_clip": 1.02753651, + "balance_loss_mlp": 1.04429436, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.4492152950747412, + "language_loss": 0.68408841, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70574951, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 7304, + "time_per_iteration": 4.099287509918213 + }, + { + "auxiliary_loss_clip": 0.01116014, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.02631354, + "balance_loss_mlp": 1.04335666, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.4059546174528585, + "language_loss": 0.78115439, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80270815, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7305, + "time_per_iteration": 2.6079509258270264 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02376556, + "balance_loss_mlp": 1.04186165, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.688110038500655, + "language_loss": 0.68754542, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70908302, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7306, + "time_per_iteration": 2.4539954662323 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.0214076, + "balance_loss_mlp": 1.04369712, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.4603628541776523, + "language_loss": 0.6270709, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64862138, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 7307, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02001536, + "balance_loss_mlp": 1.04338455, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.9032558944481925, + "language_loss": 0.72409779, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74563944, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 7308, + "time_per_iteration": 2.4319982528686523 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02347863, + "balance_loss_mlp": 1.04077995, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.6404677903158766, + "language_loss": 0.76631165, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78788805, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7309, + "time_per_iteration": 2.5045857429504395 + }, + { + "auxiliary_loss_clip": 0.01115088, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.01943445, + "balance_loss_mlp": 1.04314303, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.788496009330223, + "language_loss": 0.70666951, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72814304, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7310, + "time_per_iteration": 2.4732789993286133 + }, + { + "auxiliary_loss_clip": 0.01120896, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.02293789, + "balance_loss_mlp": 1.04397106, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.1134854859852505, + "language_loss": 0.75800377, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77957869, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7311, + "time_per_iteration": 2.5372462272644043 + }, + { + "auxiliary_loss_clip": 0.01119727, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02171779, + "balance_loss_mlp": 1.04376173, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.9313159099964634, + "language_loss": 0.8127231, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83426595, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7312, + "time_per_iteration": 2.4858858585357666 + }, + { + "auxiliary_loss_clip": 0.01115776, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.02404332, + "balance_loss_mlp": 1.04030704, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.2005104401689177, + "language_loss": 0.85444236, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87597632, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7313, + "time_per_iteration": 2.493032932281494 + }, + { + "auxiliary_loss_clip": 0.01119815, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.02176809, + "balance_loss_mlp": 1.04182911, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 3.8587100296686145, + "language_loss": 0.67464912, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69619775, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7314, + "time_per_iteration": 2.4542195796966553 + }, + { + "auxiliary_loss_clip": 0.01121265, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02063417, + "balance_loss_mlp": 1.04389846, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.8025616803524547, + "language_loss": 0.76954508, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79110146, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7315, + "time_per_iteration": 2.4988253116607666 + }, + { + "auxiliary_loss_clip": 0.01117641, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01938725, + "balance_loss_mlp": 1.04280567, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.4575060004131895, + "language_loss": 0.74807358, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76957744, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7316, + "time_per_iteration": 2.530104398727417 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02928746, + "balance_loss_mlp": 1.04640257, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.8405076524150568, + "language_loss": 0.65180635, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67343318, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7317, + "time_per_iteration": 2.5233771800994873 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02082634, + "balance_loss_mlp": 1.04455566, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.7710834755986071, + "language_loss": 0.7968365, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.8183977, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7318, + "time_per_iteration": 2.4618961811065674 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.04423118, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.4932738321413537, + "language_loss": 0.79472506, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81632113, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7319, + "time_per_iteration": 2.5342819690704346 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02637124, + "balance_loss_mlp": 1.04102063, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.641668171652613, + "language_loss": 0.80221331, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82378966, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7320, + "time_per_iteration": 2.520888566970825 + }, + { + "auxiliary_loss_clip": 0.01116164, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02323556, + "balance_loss_mlp": 1.04136741, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.6986497736973376, + "language_loss": 0.69795078, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71947479, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7321, + "time_per_iteration": 2.5457892417907715 + }, + { + "auxiliary_loss_clip": 0.01039878, + "auxiliary_loss_mlp": 0.01008287, + "balance_loss_clip": 1.00702953, + "balance_loss_mlp": 1.01681685, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8741267032944617, + "language_loss": 0.56908953, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58957124, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.23046875, + "step": 7322, + "time_per_iteration": 3.164207935333252 + }, + { + "auxiliary_loss_clip": 0.01117179, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01813388, + "balance_loss_mlp": 1.04277694, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.4567737767029483, + "language_loss": 0.76075542, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78222406, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.74609375, + "step": 7323, + "time_per_iteration": 2.5279085636138916 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.02369034, + "balance_loss_mlp": 1.0409224, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.5548582319563429, + "language_loss": 0.8034448, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82499135, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7324, + "time_per_iteration": 2.4854304790496826 + }, + { + "auxiliary_loss_clip": 0.01039688, + "auxiliary_loss_mlp": 0.01006776, + "balance_loss_clip": 1.00549471, + "balance_loss_mlp": 1.01659369, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.66599266679982, + "language_loss": 0.54557002, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56603467, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.23144531, + "step": 7325, + "time_per_iteration": 3.081268787384033 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01792467, + "balance_loss_mlp": 1.04348588, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.5427042359768692, + "language_loss": 0.69823551, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71969926, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7326, + "time_per_iteration": 2.489088535308838 + }, + { + "auxiliary_loss_clip": 0.01112531, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.01634765, + "balance_loss_mlp": 1.03926969, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.4106900729498488, + "language_loss": 0.76410896, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78552604, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7327, + "time_per_iteration": 2.5099427700042725 + }, + { + "auxiliary_loss_clip": 0.01112963, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.04029953, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.92290278058118, + "language_loss": 0.83856362, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86001813, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7328, + "time_per_iteration": 2.453078269958496 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.01505983, + "balance_loss_mlp": 1.04100752, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.489103584507488, + "language_loss": 0.77842677, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79985875, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7329, + "time_per_iteration": 2.4908933639526367 + }, + { + "auxiliary_loss_clip": 0.01118689, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02069247, + "balance_loss_mlp": 1.04125428, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.7085588184823939, + "language_loss": 0.73343551, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75496078, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7734375, + "step": 7330, + "time_per_iteration": 2.463330030441284 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.01974368, + "balance_loss_mlp": 1.04176283, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.7732063146110093, + "language_loss": 0.74867487, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77015924, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7331, + "time_per_iteration": 2.5421340465545654 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797271, + "balance_loss_mlp": 1.03957462, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4577784912363292, + "language_loss": 0.76381409, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78523266, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.72265625, + "step": 7332, + "time_per_iteration": 2.5218567848205566 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02724767, + "balance_loss_mlp": 1.03985786, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.6787739774558346, + "language_loss": 0.7317301, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75326777, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.74609375, + "step": 7333, + "time_per_iteration": 2.4611384868621826 + }, + { + "auxiliary_loss_clip": 0.01113948, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.0218792, + "balance_loss_mlp": 1.04222834, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.7946296457229314, + "language_loss": 0.79795265, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81943679, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7334, + "time_per_iteration": 2.4846577644348145 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.04168534, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.170087212124324, + "language_loss": 0.7549156, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77655965, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 7335, + "time_per_iteration": 2.5086324214935303 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.02661777, + "balance_loss_mlp": 1.04259086, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.91450979477167, + "language_loss": 0.72583538, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74741697, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7336, + "time_per_iteration": 2.436680555343628 + }, + { + "auxiliary_loss_clip": 0.01118765, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.0251267, + "balance_loss_mlp": 1.04040349, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 9.267090991138677, + "language_loss": 0.62665188, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64823085, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7337, + "time_per_iteration": 2.484269618988037 + }, + { + "auxiliary_loss_clip": 0.01039049, + "auxiliary_loss_mlp": 0.0100578, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01618195, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7410103266773326, + "language_loss": 0.52670205, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54715037, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7338, + "time_per_iteration": 3.104921340942383 + }, + { + "auxiliary_loss_clip": 0.01120745, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.03303015, + "balance_loss_mlp": 1.04076958, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.7777015345810536, + "language_loss": 0.70687723, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7285586, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7339, + "time_per_iteration": 2.5172934532165527 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.02603626, + "balance_loss_mlp": 1.04237795, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.6032661325040427, + "language_loss": 0.69992614, + "learning_rate": 2.472767915429105e-06, + "loss": 0.7214449, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 7340, + "time_per_iteration": 2.4677066802978516 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01002043, + "balance_loss_clip": 1.00078511, + "balance_loss_mlp": 1.01463652, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8913600985584349, + "language_loss": 0.64017105, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66056681, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7341, + "time_per_iteration": 2.87821888923645 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.02473783, + "balance_loss_mlp": 1.04029536, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.415120536593597, + "language_loss": 0.73162079, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75314075, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7342, + "time_per_iteration": 2.6009373664855957 + }, + { + "auxiliary_loss_clip": 0.01114735, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.02390742, + "balance_loss_mlp": 1.03866804, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.834134484008718, + "language_loss": 0.7961756, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81770158, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7343, + "time_per_iteration": 2.5102362632751465 + }, + { + "auxiliary_loss_clip": 0.01113089, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.03901291, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.5507634652992637, + "language_loss": 0.76845753, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.789895, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7344, + "time_per_iteration": 2.517014741897583 + }, + { + "auxiliary_loss_clip": 0.01036094, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99887604, + "balance_loss_mlp": 1.01319945, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7920555871551813, + "language_loss": 0.63752162, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65788519, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.22949219, + "step": 7345, + "time_per_iteration": 7.267446517944336 + }, + { + "auxiliary_loss_clip": 0.01121083, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.02610314, + "balance_loss_mlp": 1.04385495, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 2.1109182100548596, + "language_loss": 0.86316586, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88477224, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7346, + "time_per_iteration": 2.5508806705474854 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.02691066, + "balance_loss_mlp": 1.04238844, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.947149735733886, + "language_loss": 0.8050105, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82659858, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7347, + "time_per_iteration": 2.474933624267578 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02367926, + "balance_loss_mlp": 1.04158723, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.6941368254206504, + "language_loss": 0.82639945, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84794509, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7348, + "time_per_iteration": 2.4525363445281982 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.02335548, + "balance_loss_mlp": 1.04179621, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.5736626646923677, + "language_loss": 0.7025882, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72415352, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7349, + "time_per_iteration": 2.511890172958374 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.02426577, + "balance_loss_mlp": 1.03973794, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6465526230005572, + "language_loss": 0.74427998, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76583976, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7350, + "time_per_iteration": 2.496570110321045 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02167273, + "balance_loss_mlp": 1.0410589, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 1.9521663807923895, + "language_loss": 0.80709779, + "learning_rate": 2.468604167463827e-06, + "loss": 0.8286112, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 7351, + "time_per_iteration": 2.432551860809326 + }, + { + "auxiliary_loss_clip": 0.01111348, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.02401161, + "balance_loss_mlp": 1.03947091, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.5082806208548023, + "language_loss": 0.73055673, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75202954, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 7352, + "time_per_iteration": 2.515235424041748 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.0184238, + "balance_loss_mlp": 1.04159904, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.8470037483547026, + "language_loss": 0.87457407, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89606094, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7353, + "time_per_iteration": 2.4880294799804688 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.02288198, + "balance_loss_mlp": 1.04091954, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0344010928875567, + "language_loss": 0.75522006, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.77674222, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7578125, + "step": 7354, + "time_per_iteration": 2.454554319381714 + }, + { + "auxiliary_loss_clip": 0.01112104, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.0235672, + "balance_loss_mlp": 1.03940272, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.7346650465528282, + "language_loss": 0.64754039, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66901928, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7355, + "time_per_iteration": 2.711973190307617 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.04187799, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.914030541413853, + "language_loss": 0.78126168, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80283082, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7356, + "time_per_iteration": 2.470214366912842 + }, + { + "auxiliary_loss_clip": 0.01118926, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.02317214, + "balance_loss_mlp": 1.0414896, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5192892311950144, + "language_loss": 0.7712661, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79282331, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7357, + "time_per_iteration": 2.461174249649048 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.02454567, + "balance_loss_mlp": 1.0424664, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.4937655647898813, + "language_loss": 0.73591524, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75747615, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7358, + "time_per_iteration": 2.556330919265747 + }, + { + "auxiliary_loss_clip": 0.01117067, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01871514, + "balance_loss_mlp": 1.0415349, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.6567493539100802, + "language_loss": 0.75616974, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77764809, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.75390625, + "step": 7359, + "time_per_iteration": 2.50827693939209 + }, + { + "auxiliary_loss_clip": 0.01116785, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01920068, + "balance_loss_mlp": 1.04107249, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.743382279224751, + "language_loss": 0.7001307, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72162896, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7360, + "time_per_iteration": 2.4941389560699463 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.0205518, + "balance_loss_mlp": 1.04113221, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.0593935576965996, + "language_loss": 0.69252694, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71403772, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7361, + "time_per_iteration": 2.4985222816467285 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.01840568, + "balance_loss_mlp": 1.04025078, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 3.464971296188532, + "language_loss": 0.82380062, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84528339, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7362, + "time_per_iteration": 2.5396664142608643 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.0216198, + "balance_loss_mlp": 1.0414443, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.6248096382426125, + "language_loss": 0.74421227, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76578033, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 7363, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01036428, + "auxiliary_loss_mlp": 0.01011165, + "balance_loss_clip": 1.00969243, + "balance_loss_mlp": 1.0129478, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6750552451063064, + "language_loss": 0.55668789, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57716382, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.234375, + "step": 7364, + "time_per_iteration": 3.1631510257720947 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01869917, + "balance_loss_mlp": 1.0388242, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.5647849634077904, + "language_loss": 0.74008644, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76153356, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7365, + "time_per_iteration": 2.5025317668914795 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.02544355, + "balance_loss_mlp": 1.0385282, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5168930353966135, + "language_loss": 0.74242592, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76394439, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7366, + "time_per_iteration": 2.4882071018218994 + }, + { + "auxiliary_loss_clip": 0.01117127, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.01911306, + "balance_loss_mlp": 1.04244351, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 1.7268166919008578, + "language_loss": 0.73934573, + "learning_rate": 2.46254397374245e-06, + "loss": 0.7608456, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 7367, + "time_per_iteration": 2.494215250015259 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.02484, + "balance_loss_mlp": 1.04093957, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.708386000191459, + "language_loss": 0.7409333, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76247275, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7368, + "time_per_iteration": 2.5647008419036865 + }, + { + "auxiliary_loss_clip": 0.01114523, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.01905274, + "balance_loss_mlp": 1.04091215, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.8689444780395545, + "language_loss": 0.79986328, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82132554, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7369, + "time_per_iteration": 2.4666872024536133 + }, + { + "auxiliary_loss_clip": 0.01112296, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01627517, + "balance_loss_mlp": 1.04060125, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.7167890006148945, + "language_loss": 0.72231519, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74372596, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7370, + "time_per_iteration": 2.5508570671081543 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.02021682, + "balance_loss_mlp": 1.03883541, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.7515847136682843, + "language_loss": 0.70318949, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72465694, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7371, + "time_per_iteration": 2.4617960453033447 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03891456, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.199744355071377, + "language_loss": 0.68163198, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70304221, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.72265625, + "step": 7372, + "time_per_iteration": 2.4743239879608154 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01702499, + "balance_loss_mlp": 1.03971767, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.696523180994532, + "language_loss": 0.83959508, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.86105639, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7373, + "time_per_iteration": 2.44077467918396 + }, + { + "auxiliary_loss_clip": 0.01038641, + "auxiliary_loss_mlp": 0.01004854, + "balance_loss_clip": 1.00331616, + "balance_loss_mlp": 1.01527071, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.8140024563186875, + "language_loss": 0.55299437, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57342935, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.234375, + "step": 7374, + "time_per_iteration": 3.1360692977905273 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.02858198, + "balance_loss_mlp": 1.04092741, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.2551701608050636, + "language_loss": 0.82651508, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84807646, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 7375, + "time_per_iteration": 2.4277329444885254 + }, + { + "auxiliary_loss_clip": 0.01116501, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.04118764, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.7856786314152562, + "language_loss": 0.83470213, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.85615796, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7376, + "time_per_iteration": 2.481781482696533 + }, + { + "auxiliary_loss_clip": 0.01114604, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.02043331, + "balance_loss_mlp": 1.04121447, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7657537697851593, + "language_loss": 0.77321744, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79469293, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7377, + "time_per_iteration": 2.4599812030792236 + }, + { + "auxiliary_loss_clip": 0.01112621, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.01692927, + "balance_loss_mlp": 1.04132032, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.8620341755948002, + "language_loss": 0.75641978, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77784032, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7378, + "time_per_iteration": 2.5178849697113037 + }, + { + "auxiliary_loss_clip": 0.01114317, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.02302647, + "balance_loss_mlp": 1.04010391, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.670150777415059, + "language_loss": 0.69005907, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71155864, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7379, + "time_per_iteration": 2.460470199584961 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.04134107, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.5614200394729, + "language_loss": 0.73110741, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75256622, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7380, + "time_per_iteration": 2.5134148597717285 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.0408597, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.5217984285789272, + "language_loss": 0.6470772, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66853309, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7381, + "time_per_iteration": 2.5547850131988525 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.02020693, + "balance_loss_mlp": 1.04110599, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.3862697145357394, + "language_loss": 0.8018291, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82332134, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 7382, + "time_per_iteration": 2.575241804122925 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.02631903, + "balance_loss_mlp": 1.04359293, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.657830016653087, + "language_loss": 0.65369737, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67527372, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7383, + "time_per_iteration": 2.530205726623535 + }, + { + "auxiliary_loss_clip": 0.01118822, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.01928902, + "balance_loss_mlp": 1.04226518, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 3.0329093562680023, + "language_loss": 0.75660288, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77811974, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7384, + "time_per_iteration": 2.5266385078430176 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02092242, + "balance_loss_mlp": 1.04284334, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.5666997146068944, + "language_loss": 0.81029254, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83182013, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7385, + "time_per_iteration": 2.4479992389678955 + }, + { + "auxiliary_loss_clip": 0.01116566, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02111292, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.6468061831775258, + "language_loss": 0.82127023, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84278667, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7386, + "time_per_iteration": 2.48417067527771 + }, + { + "auxiliary_loss_clip": 0.01120079, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.02320611, + "balance_loss_mlp": 1.04189587, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.953099317045194, + "language_loss": 0.69732893, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71890771, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7387, + "time_per_iteration": 5.494876146316528 + }, + { + "auxiliary_loss_clip": 0.01114673, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.02546382, + "balance_loss_mlp": 1.03957582, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.035383956259629, + "language_loss": 0.7170803, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73861378, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7388, + "time_per_iteration": 2.4271323680877686 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.01776195, + "balance_loss_mlp": 1.04137266, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.4848855642281624, + "language_loss": 0.6881609, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.70965117, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7389, + "time_per_iteration": 2.4847142696380615 + }, + { + "auxiliary_loss_clip": 0.01115516, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.04167664, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.0051609497188587, + "language_loss": 0.74621141, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76768672, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7390, + "time_per_iteration": 2.594834804534912 + }, + { + "auxiliary_loss_clip": 0.01116041, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.04296565, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.702415761973985, + "language_loss": 0.811364, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83285546, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7391, + "time_per_iteration": 2.4757862091064453 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02070808, + "balance_loss_mlp": 1.04341137, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.6224407429556025, + "language_loss": 0.73400211, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75551033, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7392, + "time_per_iteration": 2.423929214477539 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01705766, + "balance_loss_mlp": 1.03988051, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.5529830220947678, + "language_loss": 0.79523122, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81666124, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7393, + "time_per_iteration": 2.5162272453308105 + }, + { + "auxiliary_loss_clip": 0.01119885, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.02125716, + "balance_loss_mlp": 1.04248941, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.9165659224437794, + "language_loss": 0.8090415, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83058566, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7394, + "time_per_iteration": 2.5386714935302734 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.02772927, + "balance_loss_mlp": 1.04228508, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 3.6807348725160502, + "language_loss": 0.79471326, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81626076, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7395, + "time_per_iteration": 2.4668092727661133 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02162027, + "balance_loss_mlp": 1.04132056, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.800276006342892, + "language_loss": 0.68493867, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70642376, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7396, + "time_per_iteration": 2.463660717010498 + }, + { + "auxiliary_loss_clip": 0.01117407, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.01831245, + "balance_loss_mlp": 1.0412426, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.8246827609425533, + "language_loss": 0.81007254, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83155811, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.76171875, + "step": 7397, + "time_per_iteration": 2.4812188148498535 + }, + { + "auxiliary_loss_clip": 0.01116158, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.02078366, + "balance_loss_mlp": 1.04323518, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.7006854584246183, + "language_loss": 0.67145807, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69295466, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7398, + "time_per_iteration": 2.5075526237487793 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.02443874, + "balance_loss_mlp": 1.04204428, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.9000444103330927, + "language_loss": 0.69551516, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71702719, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 7399, + "time_per_iteration": 2.522737741470337 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.02465105, + "balance_loss_mlp": 1.0408442, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.713461165054691, + "language_loss": 0.7287724, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.75027299, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7400, + "time_per_iteration": 2.4633662700653076 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.01755965, + "balance_loss_mlp": 1.04038024, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.5216060200654076, + "language_loss": 0.85054708, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87198627, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7401, + "time_per_iteration": 2.5034339427948 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02213049, + "balance_loss_mlp": 1.04065824, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.696028331559664, + "language_loss": 0.83296156, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85441685, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7109375, + "step": 7402, + "time_per_iteration": 2.501981258392334 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01979768, + "balance_loss_mlp": 1.0420711, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.736524647333069, + "language_loss": 0.76953578, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.7910167, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7403, + "time_per_iteration": 2.4778058528900146 + }, + { + "auxiliary_loss_clip": 0.01038113, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00046086, + "balance_loss_mlp": 1.014925, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7475420058163609, + "language_loss": 0.60081208, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62121159, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.23242188, + "step": 7404, + "time_per_iteration": 3.0548532009124756 + }, + { + "auxiliary_loss_clip": 0.01118666, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.02225208, + "balance_loss_mlp": 1.04285121, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.6312624429793499, + "language_loss": 0.81696916, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.83850771, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7405, + "time_per_iteration": 2.474632978439331 + }, + { + "auxiliary_loss_clip": 0.0111153, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.01864958, + "balance_loss_mlp": 1.03843176, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.4258557139975254, + "language_loss": 0.74869186, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77011788, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 7406, + "time_per_iteration": 2.4767563343048096 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01837921, + "balance_loss_mlp": 1.03819203, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.5627122296340765, + "language_loss": 0.65510803, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67650282, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 7407, + "time_per_iteration": 2.5395827293395996 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.01691461, + "balance_loss_mlp": 1.04306138, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5061477696527659, + "language_loss": 0.67724633, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69872296, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75390625, + "step": 7408, + "time_per_iteration": 2.462306261062622 + }, + { + "auxiliary_loss_clip": 0.0111265, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01891828, + "balance_loss_mlp": 1.0386107, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4978343447976226, + "language_loss": 0.71923941, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74068785, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7409, + "time_per_iteration": 2.674224615097046 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.02100968, + "balance_loss_mlp": 1.03980279, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.031581575195052, + "language_loss": 0.64823419, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.66972494, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 7410, + "time_per_iteration": 2.524874687194824 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.0200448, + "balance_loss_mlp": 1.04309118, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.015615502497161, + "language_loss": 0.74042189, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76196671, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7411, + "time_per_iteration": 2.512510061264038 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01900446, + "balance_loss_mlp": 1.04189968, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.869475782048451, + "language_loss": 0.79242551, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81386662, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7412, + "time_per_iteration": 2.472858190536499 + }, + { + "auxiliary_loss_clip": 0.01114909, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02299762, + "balance_loss_mlp": 1.03920937, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 3.400478569187806, + "language_loss": 0.798675, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82017869, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7413, + "time_per_iteration": 2.4117238521575928 + }, + { + "auxiliary_loss_clip": 0.01112114, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.01601171, + "balance_loss_mlp": 1.04039168, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.7210919700182319, + "language_loss": 0.76510686, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7865088, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 7414, + "time_per_iteration": 2.460224151611328 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02064216, + "balance_loss_mlp": 1.04047227, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.4395051245379855, + "language_loss": 0.83344847, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85491699, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7415, + "time_per_iteration": 2.487433910369873 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02675223, + "balance_loss_mlp": 1.03786182, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.5295363489819147, + "language_loss": 0.84025514, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86175931, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7416, + "time_per_iteration": 2.4827380180358887 + }, + { + "auxiliary_loss_clip": 0.0111513, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01872349, + "balance_loss_mlp": 1.03937066, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.5840815969934987, + "language_loss": 0.8099134, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83138216, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7417, + "time_per_iteration": 2.48150897026062 + }, + { + "auxiliary_loss_clip": 0.01115498, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.02679276, + "balance_loss_mlp": 1.04055572, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.9543176040955477, + "language_loss": 0.81078619, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83233768, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7418, + "time_per_iteration": 2.489847421646118 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01647544, + "balance_loss_mlp": 1.04015303, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.586204851514133, + "language_loss": 0.77404898, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79548573, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7419, + "time_per_iteration": 2.497434377670288 + }, + { + "auxiliary_loss_clip": 0.01112333, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.02190208, + "balance_loss_mlp": 1.03983605, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.7862585645473121, + "language_loss": 0.72408056, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74554545, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 7420, + "time_per_iteration": 2.459458351135254 + }, + { + "auxiliary_loss_clip": 0.01113499, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.0166688, + "balance_loss_mlp": 1.0416131, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.6779849239209732, + "language_loss": 0.75009704, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77153254, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 7421, + "time_per_iteration": 2.51987624168396 + }, + { + "auxiliary_loss_clip": 0.01110345, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04095602, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.9054244397804427, + "language_loss": 0.76410532, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78553158, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 7422, + "time_per_iteration": 2.4755024909973145 + }, + { + "auxiliary_loss_clip": 0.0111206, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.01975894, + "balance_loss_mlp": 1.03931499, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.4448000656244153, + "language_loss": 0.65126681, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6727066, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7423, + "time_per_iteration": 2.4828243255615234 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.01719534, + "balance_loss_mlp": 1.04027271, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.510185037273786, + "language_loss": 0.78842837, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.80981761, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 7424, + "time_per_iteration": 2.4399938583374023 + }, + { + "auxiliary_loss_clip": 0.01111318, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.02010214, + "balance_loss_mlp": 1.04070699, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.3563203456934205, + "language_loss": 0.80225039, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82368374, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 7425, + "time_per_iteration": 2.5406088829040527 + }, + { + "auxiliary_loss_clip": 0.01111697, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.0190568, + "balance_loss_mlp": 1.04027843, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 2.6114514678489895, + "language_loss": 0.77294517, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79437709, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7426, + "time_per_iteration": 2.4845876693725586 + }, + { + "auxiliary_loss_clip": 0.01112123, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.01636636, + "balance_loss_mlp": 1.03881311, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.552934875151276, + "language_loss": 0.64668226, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66808361, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.734375, + "step": 7427, + "time_per_iteration": 2.540630340576172 + }, + { + "auxiliary_loss_clip": 0.01116984, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.02009046, + "balance_loss_mlp": 1.04497719, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.583763048167789, + "language_loss": 0.75103819, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77252889, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7428, + "time_per_iteration": 3.8718421459198 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.03955674, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.8476152433667956, + "language_loss": 0.77595931, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79740107, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7429, + "time_per_iteration": 5.381062984466553 + }, + { + "auxiliary_loss_clip": 0.01114674, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.01935029, + "balance_loss_mlp": 1.04038692, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.5332211966047418, + "language_loss": 0.91229695, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93376398, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7430, + "time_per_iteration": 2.4677700996398926 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02439737, + "balance_loss_mlp": 1.04052413, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.5443417480404311, + "language_loss": 0.79630744, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81785798, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7431, + "time_per_iteration": 2.567082405090332 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02438116, + "balance_loss_mlp": 1.04187393, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 2.0676923701008807, + "language_loss": 0.80376756, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82531446, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7432, + "time_per_iteration": 2.4359145164489746 + }, + { + "auxiliary_loss_clip": 0.01115042, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.02394009, + "balance_loss_mlp": 1.03957176, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.022128912320156, + "language_loss": 0.76601076, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78752482, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.75390625, + "step": 7433, + "time_per_iteration": 2.48732852935791 + }, + { + "auxiliary_loss_clip": 0.0110862, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.0233326, + "balance_loss_mlp": 1.03873658, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.6660023236153727, + "language_loss": 0.7773807, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79880381, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.69921875, + "step": 7434, + "time_per_iteration": 2.501410961151123 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.02160966, + "balance_loss_mlp": 1.04261708, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6324454169441744, + "language_loss": 0.64255738, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66406941, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7435, + "time_per_iteration": 2.506918430328369 + }, + { + "auxiliary_loss_clip": 0.01116438, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.01706398, + "balance_loss_mlp": 1.04181314, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.4705490989927619, + "language_loss": 0.83558768, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.8570472, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7436, + "time_per_iteration": 2.482273817062378 + }, + { + "auxiliary_loss_clip": 0.01114793, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02067423, + "balance_loss_mlp": 1.0400939, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.6782401052542175, + "language_loss": 0.79564971, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81713653, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7437, + "time_per_iteration": 2.519118309020996 + }, + { + "auxiliary_loss_clip": 0.01114275, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.03965664, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.5877629147247494, + "language_loss": 0.71921134, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74067998, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7438, + "time_per_iteration": 2.4918689727783203 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02114832, + "balance_loss_mlp": 1.03908634, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.558408845854645, + "language_loss": 0.67469549, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.6961813, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7439, + "time_per_iteration": 2.549445390701294 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04164815, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.6525243551580215, + "language_loss": 0.73600596, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.7575227, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7440, + "time_per_iteration": 2.487545967102051 + }, + { + "auxiliary_loss_clip": 0.01112285, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02215195, + "balance_loss_mlp": 1.03937638, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.5916362290459067, + "language_loss": 0.74376386, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76522732, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73046875, + "step": 7441, + "time_per_iteration": 2.537848472595215 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04112506, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 2.062950208020596, + "language_loss": 0.74780977, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.769364, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7442, + "time_per_iteration": 2.45829701423645 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.03977489, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.7358505546612006, + "language_loss": 0.7456758, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76718801, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7443, + "time_per_iteration": 2.604759931564331 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.01500916, + "balance_loss_mlp": 1.0379262, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.8898561004653542, + "language_loss": 0.77591091, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79730821, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7444, + "time_per_iteration": 2.5373945236206055 + }, + { + "auxiliary_loss_clip": 0.01110179, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.01813924, + "balance_loss_mlp": 1.03841698, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.3020631966175893, + "language_loss": 0.85495317, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87636125, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7445, + "time_per_iteration": 2.4707260131835938 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01741982, + "balance_loss_mlp": 1.04191256, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 3.672789877680737, + "language_loss": 0.64349431, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66496813, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7446, + "time_per_iteration": 2.4802255630493164 + }, + { + "auxiliary_loss_clip": 0.0103814, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.00128329, + "balance_loss_mlp": 1.01421368, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7477421339074387, + "language_loss": 0.50242257, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52283025, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.24023438, + "step": 7447, + "time_per_iteration": 2.9262073040008545 + }, + { + "auxiliary_loss_clip": 0.01037975, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 1.00088537, + "balance_loss_mlp": 1.01407075, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7583700928831021, + "language_loss": 0.59290731, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61330867, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.23925781, + "step": 7448, + "time_per_iteration": 3.2298059463500977 + }, + { + "auxiliary_loss_clip": 0.01112419, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.02082074, + "balance_loss_mlp": 1.03913987, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.4697324100578784, + "language_loss": 0.59226847, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61372101, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7449, + "time_per_iteration": 2.667651891708374 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01638436, + "balance_loss_mlp": 1.04082561, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.6912833904949394, + "language_loss": 0.79799938, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8194316, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 7450, + "time_per_iteration": 2.488041400909424 + }, + { + "auxiliary_loss_clip": 0.01112446, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.01900911, + "balance_loss_mlp": 1.03948057, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.443005371711525, + "language_loss": 0.79474008, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81618094, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 7451, + "time_per_iteration": 2.4184346199035645 + }, + { + "auxiliary_loss_clip": 0.01037194, + "auxiliary_loss_mlp": 0.01000693, + "balance_loss_clip": 0.99944174, + "balance_loss_mlp": 1.01323009, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 2.1611139577707608, + "language_loss": 0.62848771, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64886659, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24023438, + "step": 7452, + "time_per_iteration": 3.1637966632843018 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04087877, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.7752989444397396, + "language_loss": 0.62657529, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64809442, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7453, + "time_per_iteration": 2.4473493099212646 + }, + { + "auxiliary_loss_clip": 0.01036714, + "auxiliary_loss_mlp": 0.01000711, + "balance_loss_clip": 0.99935836, + "balance_loss_mlp": 1.01265335, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7532005340797263, + "language_loss": 0.57028639, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59066069, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.0135498, + "router_z_loss_mlp": 0.24023438, + "step": 7454, + "time_per_iteration": 2.9524526596069336 + }, + { + "auxiliary_loss_clip": 0.01111502, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.01563811, + "balance_loss_mlp": 1.03850055, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.2509965352428334, + "language_loss": 0.75078607, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7721771, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7455, + "time_per_iteration": 2.4103891849517822 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.01761508, + "balance_loss_mlp": 1.03976846, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.6579032105665654, + "language_loss": 0.76428723, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78571379, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.734375, + "step": 7456, + "time_per_iteration": 2.5631935596466064 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.02398849, + "balance_loss_mlp": 1.04312015, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.9831255862845865, + "language_loss": 0.76475745, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78626615, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.71875, + "step": 7457, + "time_per_iteration": 2.464808702468872 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01611805, + "balance_loss_mlp": 1.03910387, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9767465188311044, + "language_loss": 0.67705971, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.69848609, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7458, + "time_per_iteration": 2.4457101821899414 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.04051626, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.5619796593676711, + "language_loss": 0.72202468, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74350572, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7459, + "time_per_iteration": 2.433029890060425 + }, + { + "auxiliary_loss_clip": 0.0110945, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01676071, + "balance_loss_mlp": 1.03716815, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.8000530949283695, + "language_loss": 0.69520539, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71659082, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7460, + "time_per_iteration": 2.4872210025787354 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02144051, + "balance_loss_mlp": 1.03848231, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9864484577730697, + "language_loss": 0.77204525, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79350454, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7461, + "time_per_iteration": 2.455543279647827 + }, + { + "auxiliary_loss_clip": 0.01111999, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0180943, + "balance_loss_mlp": 1.03780031, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7106561387980361, + "language_loss": 0.67983574, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70125341, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7421875, + "step": 7462, + "time_per_iteration": 2.5366299152374268 + }, + { + "auxiliary_loss_clip": 0.01034999, + "auxiliary_loss_mlp": 0.01002003, + "balance_loss_clip": 1.00071561, + "balance_loss_mlp": 1.01134682, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7463947253576576, + "language_loss": 0.54503644, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56540644, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.23632812, + "step": 7463, + "time_per_iteration": 3.0639255046844482 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.01699638, + "balance_loss_mlp": 1.03847826, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.9527582175804243, + "language_loss": 0.75866246, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78006899, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7464, + "time_per_iteration": 2.5135347843170166 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02225554, + "balance_loss_mlp": 1.03903246, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.8117694427226085, + "language_loss": 0.73671377, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75814927, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 7465, + "time_per_iteration": 2.433394432067871 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04127038, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 1.824586312100338, + "language_loss": 0.7996276, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82117152, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7466, + "time_per_iteration": 2.5013458728790283 + }, + { + "auxiliary_loss_clip": 0.01114545, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.01967633, + "balance_loss_mlp": 1.04118383, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.612382799524426, + "language_loss": 0.80522013, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82668447, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7467, + "time_per_iteration": 2.4517929553985596 + }, + { + "auxiliary_loss_clip": 0.01109457, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03988719, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.7208509955346651, + "language_loss": 0.75153285, + "learning_rate": 2.424187775642129e-06, + "loss": 0.7729429, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7468, + "time_per_iteration": 2.4585771560668945 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01025298, + "balance_loss_clip": 1.01422918, + "balance_loss_mlp": 1.04034877, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.8721286685005696, + "language_loss": 0.7099303, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73130596, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.71875, + "step": 7469, + "time_per_iteration": 2.420208692550659 + }, + { + "auxiliary_loss_clip": 0.01114048, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.02298415, + "balance_loss_mlp": 1.04046845, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7828692415308351, + "language_loss": 0.71891844, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74041635, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7470, + "time_per_iteration": 5.381145477294922 + }, + { + "auxiliary_loss_clip": 0.01112344, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.01940536, + "balance_loss_mlp": 1.03871441, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.1026178485463274, + "language_loss": 0.76912111, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79056853, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7471, + "time_per_iteration": 3.925541400909424 + }, + { + "auxiliary_loss_clip": 0.01113353, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.02128363, + "balance_loss_mlp": 1.04100883, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.8719894830330126, + "language_loss": 0.70339048, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72485489, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7472, + "time_per_iteration": 2.5138602256774902 + }, + { + "auxiliary_loss_clip": 0.01038244, + "auxiliary_loss_mlp": 0.01015151, + "balance_loss_clip": 1.01388156, + "balance_loss_mlp": 1.01404762, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7429949026472541, + "language_loss": 0.61734539, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63787931, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2421875, + "step": 7473, + "time_per_iteration": 3.0049262046813965 + }, + { + "auxiliary_loss_clip": 0.01114767, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.02495253, + "balance_loss_mlp": 1.04087818, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.4001000632965828, + "language_loss": 0.78185022, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80337679, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7474, + "time_per_iteration": 2.4396324157714844 + }, + { + "auxiliary_loss_clip": 0.01110455, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.0225265, + "balance_loss_mlp": 1.04009926, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.704620828516005, + "language_loss": 0.72103465, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74248827, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7475, + "time_per_iteration": 2.464167356491089 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01847768, + "balance_loss_mlp": 1.03917694, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.7869016250475191, + "language_loss": 0.76343799, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.7848621, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.73046875, + "step": 7476, + "time_per_iteration": 2.529374837875366 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.02194357, + "balance_loss_mlp": 1.04036331, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.3312494175836034, + "language_loss": 0.71774453, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73927242, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7477, + "time_per_iteration": 2.4914534091949463 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01757061, + "balance_loss_mlp": 1.04089749, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.2338487326584073, + "language_loss": 0.68136394, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70283794, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7478, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.01112091, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04130244, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8288012816153718, + "language_loss": 0.89528286, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91673213, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 7479, + "time_per_iteration": 2.4738242626190186 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01970804, + "balance_loss_mlp": 1.0423162, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.1133613410879155, + "language_loss": 0.75824946, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77972436, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7480, + "time_per_iteration": 2.536190986633301 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.01906347, + "balance_loss_mlp": 1.04211199, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.1813635775429794, + "language_loss": 0.80066407, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82214987, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7481, + "time_per_iteration": 2.4618031978607178 + }, + { + "auxiliary_loss_clip": 0.01110042, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01809597, + "balance_loss_mlp": 1.04028749, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.5995355023246276, + "language_loss": 0.68636084, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70776993, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 7482, + "time_per_iteration": 2.5711851119995117 + }, + { + "auxiliary_loss_clip": 0.0111451, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.02260911, + "balance_loss_mlp": 1.04059076, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0339843826279504, + "language_loss": 0.84802616, + "learning_rate": 2.418476956872571e-06, + "loss": 0.86952293, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7483, + "time_per_iteration": 2.4510746002197266 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02177286, + "balance_loss_mlp": 1.04386485, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8187080510096723, + "language_loss": 0.80409968, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82564819, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.765625, + "step": 7484, + "time_per_iteration": 2.539834976196289 + }, + { + "auxiliary_loss_clip": 0.01118118, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01529098, + "balance_loss_mlp": 1.03992271, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.310143901315373, + "language_loss": 0.75594473, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77741385, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 7485, + "time_per_iteration": 2.408979892730713 + }, + { + "auxiliary_loss_clip": 0.01041505, + "auxiliary_loss_mlp": 0.01002218, + "balance_loss_clip": 1.00065601, + "balance_loss_mlp": 1.0170331, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7895891566174408, + "language_loss": 0.5867179, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60715508, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.24511719, + "step": 7486, + "time_per_iteration": 3.09049654006958 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.0184797, + "balance_loss_mlp": 1.04104531, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.266854053846726, + "language_loss": 0.83153397, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85298264, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7487, + "time_per_iteration": 2.431209087371826 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.01626313, + "balance_loss_mlp": 1.04103804, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5035728003068896, + "language_loss": 0.77055335, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79197478, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7488, + "time_per_iteration": 2.5085837841033936 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02446008, + "balance_loss_mlp": 1.04378915, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.6401168824150574, + "language_loss": 0.71564645, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73724437, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7489, + "time_per_iteration": 2.5106120109558105 + }, + { + "auxiliary_loss_clip": 0.01119744, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.01856422, + "balance_loss_mlp": 1.04424906, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.1685657644370853, + "language_loss": 0.6962117, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71773469, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 7490, + "time_per_iteration": 2.4383597373962402 + }, + { + "auxiliary_loss_clip": 0.01038961, + "auxiliary_loss_mlp": 0.01000463, + "balance_loss_clip": 0.99907476, + "balance_loss_mlp": 1.01472032, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 1.805652104877531, + "language_loss": 0.56691748, + "learning_rate": 2.415429723843495e-06, + "loss": 0.5873118, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.2421875, + "step": 7491, + "time_per_iteration": 3.0662994384765625 + }, + { + "auxiliary_loss_clip": 0.01111025, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.01719177, + "balance_loss_mlp": 1.03987265, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.5869212574214921, + "language_loss": 0.79462028, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81602901, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7492, + "time_per_iteration": 2.497849464416504 + }, + { + "auxiliary_loss_clip": 0.01119638, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.02474022, + "balance_loss_mlp": 1.04271042, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.074371460837293, + "language_loss": 0.92560953, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.9471873, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7493, + "time_per_iteration": 2.4717981815338135 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01000993, + "balance_loss_clip": 0.99946707, + "balance_loss_mlp": 1.01443267, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.8118074327791402, + "language_loss": 0.62908041, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64948046, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.24609375, + "step": 7494, + "time_per_iteration": 3.1021509170532227 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02063334, + "balance_loss_mlp": 1.04122376, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4599772474200656, + "language_loss": 0.81980979, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.8412739, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 7495, + "time_per_iteration": 2.528707981109619 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.01793659, + "balance_loss_mlp": 1.04069221, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.6718702145442927, + "language_loss": 0.85639864, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87785244, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7496, + "time_per_iteration": 2.5862984657287598 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.01913798, + "balance_loss_mlp": 1.04234052, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.117680053603533, + "language_loss": 0.76342994, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78490651, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7497, + "time_per_iteration": 2.4831669330596924 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01798773, + "balance_loss_mlp": 1.03939152, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.971687057549937, + "language_loss": 0.75124824, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77270365, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7498, + "time_per_iteration": 2.4243438243865967 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02853799, + "balance_loss_mlp": 1.04190993, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.8265166276024245, + "language_loss": 0.70487583, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72645926, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7499, + "time_per_iteration": 2.496595859527588 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.02090549, + "balance_loss_mlp": 1.04258835, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.819855114084185, + "language_loss": 0.76870257, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79022616, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7500, + "time_per_iteration": 2.4659407138824463 + }, + { + "auxiliary_loss_clip": 0.01114886, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.01943755, + "balance_loss_mlp": 1.04146719, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.7705256698152247, + "language_loss": 0.62966442, + "learning_rate": 2.411619265641992e-06, + "loss": 0.6511355, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7501, + "time_per_iteration": 2.474149703979492 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.02093208, + "balance_loss_mlp": 1.04161, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9049764473951474, + "language_loss": 0.84758866, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86910677, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7502, + "time_per_iteration": 2.419093370437622 + }, + { + "auxiliary_loss_clip": 0.01111337, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.02135682, + "balance_loss_mlp": 1.04026246, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.4187712379612754, + "language_loss": 0.79906255, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8205111, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 7503, + "time_per_iteration": 2.536954164505005 + }, + { + "auxiliary_loss_clip": 0.01112743, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02458, + "balance_loss_mlp": 1.04287815, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 3.706114905397956, + "language_loss": 0.80931562, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83081251, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 7504, + "time_per_iteration": 2.4356000423431396 + }, + { + "auxiliary_loss_clip": 0.01112245, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.02284479, + "balance_loss_mlp": 1.04033744, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 5.269565558405545, + "language_loss": 0.63377774, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.6552459, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71875, + "step": 7505, + "time_per_iteration": 2.4934160709381104 + }, + { + "auxiliary_loss_clip": 0.01036723, + "auxiliary_loss_mlp": 0.0101133, + "balance_loss_clip": 1.00969648, + "balance_loss_mlp": 1.01246166, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8504866778221882, + "language_loss": 0.5887711, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60925162, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2421875, + "step": 7506, + "time_per_iteration": 3.1150898933410645 + }, + { + "auxiliary_loss_clip": 0.01112738, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.02087879, + "balance_loss_mlp": 1.04194486, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.6347442617822043, + "language_loss": 0.79238498, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81385183, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7507, + "time_per_iteration": 2.484036684036255 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.019871, + "balance_loss_mlp": 1.04084098, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5108356171854629, + "language_loss": 0.7397756, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76126289, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7508, + "time_per_iteration": 2.4958505630493164 + }, + { + "auxiliary_loss_clip": 0.01112961, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.02423549, + "balance_loss_mlp": 1.04263186, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.9053667394121476, + "language_loss": 0.78955048, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81104517, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 7509, + "time_per_iteration": 2.4640209674835205 + }, + { + "auxiliary_loss_clip": 0.01114289, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01759398, + "balance_loss_mlp": 1.0420239, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.8944319049742213, + "language_loss": 0.73495883, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75640076, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7510, + "time_per_iteration": 2.462289810180664 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01856017, + "balance_loss_mlp": 1.04091644, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.9974195471898801, + "language_loss": 0.77053016, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79200888, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7511, + "time_per_iteration": 2.5831305980682373 + }, + { + "auxiliary_loss_clip": 0.01114808, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.01937711, + "balance_loss_mlp": 1.04086745, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.734048899080759, + "language_loss": 0.79124206, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.81271791, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7512, + "time_per_iteration": 6.862476587295532 + }, + { + "auxiliary_loss_clip": 0.01118735, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.02435863, + "balance_loss_mlp": 1.04064548, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 1.9681233127218394, + "language_loss": 0.87461096, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89617801, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7513, + "time_per_iteration": 2.5551092624664307 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01893246, + "balance_loss_mlp": 1.0379355, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.6638824980939535, + "language_loss": 0.67135286, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69271272, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 7514, + "time_per_iteration": 2.4804775714874268 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.01448536, + "balance_loss_mlp": 1.04221404, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.644844833078513, + "language_loss": 0.69455916, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71601617, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.734375, + "step": 7515, + "time_per_iteration": 2.530089855194092 + }, + { + "auxiliary_loss_clip": 0.01117096, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.02060795, + "balance_loss_mlp": 1.04084945, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.154684023631233, + "language_loss": 0.81658673, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83810514, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7516, + "time_per_iteration": 2.405810832977295 + }, + { + "auxiliary_loss_clip": 0.01111826, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.01940227, + "balance_loss_mlp": 1.04099917, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.5513632113186169, + "language_loss": 0.65810448, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.6795482, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 7517, + "time_per_iteration": 2.487539768218994 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.04066491, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.7604175245242084, + "language_loss": 0.63401121, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65539253, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 7518, + "time_per_iteration": 2.4280178546905518 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.02124858, + "balance_loss_mlp": 1.04022479, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4125127095428567, + "language_loss": 0.59552354, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61698353, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7519, + "time_per_iteration": 2.706774950027466 + }, + { + "auxiliary_loss_clip": 0.01114162, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.02404702, + "balance_loss_mlp": 1.04053855, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.3128892020538214, + "language_loss": 0.72288704, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74439663, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7520, + "time_per_iteration": 2.4802541732788086 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.0250659, + "balance_loss_mlp": 1.04033482, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.8726393810843218, + "language_loss": 0.75520414, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77671039, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7521, + "time_per_iteration": 2.4384777545928955 + }, + { + "auxiliary_loss_clip": 0.0111833, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.022416, + "balance_loss_mlp": 1.04222465, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.6736116772601735, + "language_loss": 0.67521721, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69675779, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7522, + "time_per_iteration": 2.4317188262939453 + }, + { + "auxiliary_loss_clip": 0.01109922, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.03857231, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.5002177443666298, + "language_loss": 0.60627949, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62771761, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 7523, + "time_per_iteration": 2.5312907695770264 + }, + { + "auxiliary_loss_clip": 0.01116524, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02556825, + "balance_loss_mlp": 1.0399549, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.0404967948828796, + "language_loss": 0.78325248, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80481124, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7524, + "time_per_iteration": 2.4078996181488037 + }, + { + "auxiliary_loss_clip": 0.01111336, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02216387, + "balance_loss_mlp": 1.03912878, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.9789251534337415, + "language_loss": 0.63518596, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65664744, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 7525, + "time_per_iteration": 2.503176212310791 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02262783, + "balance_loss_mlp": 1.04040241, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5288172547930599, + "language_loss": 0.79163349, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8131057, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7526, + "time_per_iteration": 2.4558780193328857 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.6413449131819307, + "language_loss": 0.80729342, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82871962, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 7527, + "time_per_iteration": 2.470186948776245 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.01566291, + "balance_loss_mlp": 1.039428, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.450835161887395, + "language_loss": 0.65505683, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67645425, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7528, + "time_per_iteration": 2.541700601577759 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02031136, + "balance_loss_mlp": 1.03976476, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.6649436204324595, + "language_loss": 0.7542727, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.7757026, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7529, + "time_per_iteration": 2.5726876258850098 + }, + { + "auxiliary_loss_clip": 0.01112607, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.01853299, + "balance_loss_mlp": 1.03971684, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.7825780716691442, + "language_loss": 0.73193467, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75336862, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73046875, + "step": 7530, + "time_per_iteration": 2.4584052562713623 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02193975, + "balance_loss_mlp": 1.04003453, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6012488985464985, + "language_loss": 0.75947326, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78094089, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.734375, + "step": 7531, + "time_per_iteration": 2.484959363937378 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01034859, + "balance_loss_clip": 1.02182305, + "balance_loss_mlp": 1.0382148, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4359815558452909, + "language_loss": 0.66874713, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69017947, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7532, + "time_per_iteration": 2.486598253250122 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.02322936, + "balance_loss_mlp": 1.04091084, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.0450394734969874, + "language_loss": 0.78902352, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81049943, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 7533, + "time_per_iteration": 2.4407958984375 + }, + { + "auxiliary_loss_clip": 0.01115719, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.02032459, + "balance_loss_mlp": 1.03807485, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.646532255034537, + "language_loss": 0.83279264, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85429263, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7534, + "time_per_iteration": 2.430670976638794 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.02148068, + "balance_loss_mlp": 1.03927064, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.4654832124358697, + "language_loss": 0.76578003, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78726631, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7535, + "time_per_iteration": 2.4744579792022705 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01957679, + "balance_loss_mlp": 1.03883696, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5977579258117844, + "language_loss": 0.80234635, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82375443, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 7536, + "time_per_iteration": 2.4481444358825684 + }, + { + "auxiliary_loss_clip": 0.01111518, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.0173198, + "balance_loss_mlp": 1.03711987, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.0610118763249536, + "language_loss": 0.75895774, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78037184, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7537, + "time_per_iteration": 2.430119276046753 + }, + { + "auxiliary_loss_clip": 0.01115071, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.02058339, + "balance_loss_mlp": 1.04172075, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.095176663386117, + "language_loss": 0.76420474, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78567952, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.734375, + "step": 7538, + "time_per_iteration": 2.4675159454345703 + }, + { + "auxiliary_loss_clip": 0.01041439, + "auxiliary_loss_mlp": 0.0100041, + "balance_loss_clip": 0.99908096, + "balance_loss_mlp": 1.01700771, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7965700347609973, + "language_loss": 0.62345123, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64386964, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24414062, + "step": 7539, + "time_per_iteration": 3.0961101055145264 + }, + { + "auxiliary_loss_clip": 0.01112571, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.02466285, + "balance_loss_mlp": 1.04064226, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.8102149318529874, + "language_loss": 0.65997463, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68146718, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7540, + "time_per_iteration": 2.418170928955078 + }, + { + "auxiliary_loss_clip": 0.01118532, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.02721667, + "balance_loss_mlp": 1.04177594, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.6922846601909878, + "language_loss": 0.84666622, + "learning_rate": 2.396361968778424e-06, + "loss": 0.86825818, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7541, + "time_per_iteration": 2.4960954189300537 + }, + { + "auxiliary_loss_clip": 0.01113117, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01888943, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7180151747286094, + "language_loss": 0.76435781, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78580016, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7542, + "time_per_iteration": 2.574286937713623 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.01687872, + "balance_loss_mlp": 1.04101157, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.4680148354813627, + "language_loss": 0.80267954, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82412398, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7543, + "time_per_iteration": 2.5228359699249268 + }, + { + "auxiliary_loss_clip": 0.01115681, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.02517343, + "balance_loss_mlp": 1.04107285, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.6471991367559184, + "language_loss": 0.75933033, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78086591, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7544, + "time_per_iteration": 2.4976110458374023 + }, + { + "auxiliary_loss_clip": 0.01117877, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02154779, + "balance_loss_mlp": 1.04304671, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.8438932042246456, + "language_loss": 0.75447458, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77599108, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75, + "step": 7545, + "time_per_iteration": 2.5022737979888916 + }, + { + "auxiliary_loss_clip": 0.01114305, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.01697659, + "balance_loss_mlp": 1.04100811, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5497429650402368, + "language_loss": 0.7210325, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74247307, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7546, + "time_per_iteration": 2.5246150493621826 + }, + { + "auxiliary_loss_clip": 0.01118375, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.01857507, + "balance_loss_mlp": 1.04212511, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.558937793954525, + "language_loss": 0.7557559, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77726084, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7547, + "time_per_iteration": 2.4949920177459717 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02041912, + "balance_loss_mlp": 1.04200041, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.0285954992459865, + "language_loss": 0.69878972, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72029251, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7548, + "time_per_iteration": 2.4486818313598633 + }, + { + "auxiliary_loss_clip": 0.01114191, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.04018688, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.0627316040888117, + "language_loss": 0.72691673, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74846196, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7549, + "time_per_iteration": 2.509470224380493 + }, + { + "auxiliary_loss_clip": 0.01112378, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01698172, + "balance_loss_mlp": 1.04035378, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.7136809619022837, + "language_loss": 0.65253317, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67394793, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7550, + "time_per_iteration": 2.5133440494537354 + }, + { + "auxiliary_loss_clip": 0.01113494, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.0250591, + "balance_loss_mlp": 1.04179323, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6025854653449239, + "language_loss": 0.68823695, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70974535, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 7551, + "time_per_iteration": 2.5188024044036865 + }, + { + "auxiliary_loss_clip": 0.01113711, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.02022541, + "balance_loss_mlp": 1.03923821, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.6542843637965088, + "language_loss": 0.79214859, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81361675, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7552, + "time_per_iteration": 2.4087183475494385 + }, + { + "auxiliary_loss_clip": 0.01039804, + "auxiliary_loss_mlp": 0.01010172, + "balance_loss_clip": 1.00893259, + "balance_loss_mlp": 1.01586497, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8232859688183145, + "language_loss": 0.57765305, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59815282, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.24023438, + "step": 7553, + "time_per_iteration": 4.437517881393433 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02388608, + "balance_loss_mlp": 1.03907371, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3573100009257986, + "language_loss": 0.76541936, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78688413, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71875, + "step": 7554, + "time_per_iteration": 5.404860258102417 + }, + { + "auxiliary_loss_clip": 0.01116899, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.01859498, + "balance_loss_mlp": 1.04073453, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.6663912268828156, + "language_loss": 0.77148789, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79297936, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 7555, + "time_per_iteration": 2.5254242420196533 + }, + { + "auxiliary_loss_clip": 0.01111282, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.268885764239303, + "language_loss": 0.72658741, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74803221, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7556, + "time_per_iteration": 2.5096001625061035 + }, + { + "auxiliary_loss_clip": 0.01117527, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01987422, + "balance_loss_mlp": 1.0412432, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.9256457801142723, + "language_loss": 0.63244998, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65395546, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 7557, + "time_per_iteration": 2.489269495010376 + }, + { + "auxiliary_loss_clip": 0.010384, + "auxiliary_loss_mlp": 0.01000398, + "balance_loss_clip": 0.99909872, + "balance_loss_mlp": 1.01432419, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6891763329400619, + "language_loss": 0.57655525, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59694326, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24023438, + "step": 7558, + "time_per_iteration": 2.9631850719451904 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.02106977, + "balance_loss_mlp": 1.04180217, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.9054431891281847, + "language_loss": 0.56152129, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58304584, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7559, + "time_per_iteration": 2.4718172550201416 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.02320707, + "balance_loss_mlp": 1.04311991, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 2.1225715432080863, + "language_loss": 0.72038132, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74190605, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7560, + "time_per_iteration": 2.4289052486419678 + }, + { + "auxiliary_loss_clip": 0.01118313, + "auxiliary_loss_mlp": 0.01032424, + "balance_loss_clip": 1.01870942, + "balance_loss_mlp": 1.04184937, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 1.8567895139214563, + "language_loss": 0.68786752, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70937485, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7561, + "time_per_iteration": 2.483013153076172 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.01933646, + "balance_loss_mlp": 1.04098606, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.6472040447099916, + "language_loss": 0.84813452, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.86956006, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7562, + "time_per_iteration": 2.435842752456665 + }, + { + "auxiliary_loss_clip": 0.0111239, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02132261, + "balance_loss_mlp": 1.0416292, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.8588056575997567, + "language_loss": 0.89808047, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91954148, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7563, + "time_per_iteration": 2.4962618350982666 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.02425742, + "balance_loss_mlp": 1.03999305, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.9222778596605532, + "language_loss": 0.71644425, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73795712, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7564, + "time_per_iteration": 2.4343371391296387 + }, + { + "auxiliary_loss_clip": 0.01115348, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.01898563, + "balance_loss_mlp": 1.04060352, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.0985180699884496, + "language_loss": 0.67973971, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70120943, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7565, + "time_per_iteration": 2.5114333629608154 + }, + { + "auxiliary_loss_clip": 0.0111081, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01651037, + "balance_loss_mlp": 1.03948641, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.555148092913002, + "language_loss": 0.80112624, + "learning_rate": 2.386813887534922e-06, + "loss": 0.8225264, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7566, + "time_per_iteration": 2.4678473472595215 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.01451695, + "balance_loss_mlp": 1.04058981, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.5438575571986708, + "language_loss": 0.73526263, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75669444, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7567, + "time_per_iteration": 2.4749765396118164 + }, + { + "auxiliary_loss_clip": 0.01117694, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.01924706, + "balance_loss_mlp": 1.04315984, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.4420173241258303, + "language_loss": 0.80870211, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83019841, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7568, + "time_per_iteration": 2.5098068714141846 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.02927494, + "balance_loss_mlp": 1.04110444, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.9046518074434846, + "language_loss": 0.79472029, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81635177, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7569, + "time_per_iteration": 2.5105931758880615 + }, + { + "auxiliary_loss_clip": 0.0111814, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01811135, + "balance_loss_mlp": 1.04233003, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3375300297611126, + "language_loss": 0.74826288, + "learning_rate": 2.385285337909412e-06, + "loss": 0.76976812, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7570, + "time_per_iteration": 2.5360968112945557 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01037907, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04281187, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.5540611030471656, + "language_loss": 0.74696088, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76847816, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7571, + "time_per_iteration": 2.5796499252319336 + }, + { + "auxiliary_loss_clip": 0.01110782, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.01708317, + "balance_loss_mlp": 1.04096079, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.522963408290285, + "language_loss": 0.81392241, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83532542, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 7572, + "time_per_iteration": 2.452230215072632 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02052081, + "balance_loss_mlp": 1.04266822, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.158291075293226, + "language_loss": 0.72932756, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75086331, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7573, + "time_per_iteration": 2.547351598739624 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02106202, + "balance_loss_mlp": 1.04362583, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.8799787689923733, + "language_loss": 0.74544156, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76700127, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7574, + "time_per_iteration": 2.512343406677246 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.01870358, + "balance_loss_mlp": 1.0413028, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.8832109226527793, + "language_loss": 0.7161721, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73765397, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7575, + "time_per_iteration": 2.516036033630371 + }, + { + "auxiliary_loss_clip": 0.01114571, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0174526, + "balance_loss_mlp": 1.04138458, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.7001526143902996, + "language_loss": 0.73163939, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75308996, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7576, + "time_per_iteration": 2.446596145629883 + }, + { + "auxiliary_loss_clip": 0.01114194, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.04252386, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.8829162969496007, + "language_loss": 0.66556787, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68706656, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 7577, + "time_per_iteration": 2.496425151824951 + }, + { + "auxiliary_loss_clip": 0.01119433, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.02655983, + "balance_loss_mlp": 1.04481244, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.905892479596231, + "language_loss": 0.74408162, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76568818, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.74609375, + "step": 7578, + "time_per_iteration": 2.4517569541931152 + }, + { + "auxiliary_loss_clip": 0.01117156, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.0432775, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.9332037742405612, + "language_loss": 0.70189863, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72338867, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7579, + "time_per_iteration": 2.5487825870513916 + }, + { + "auxiliary_loss_clip": 0.0111145, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02031469, + "balance_loss_mlp": 1.03969145, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.6152122780510265, + "language_loss": 0.78727221, + "learning_rate": 2.381462943170627e-06, + "loss": 0.8087157, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7580, + "time_per_iteration": 2.465355157852173 + }, + { + "auxiliary_loss_clip": 0.01115593, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.01463163, + "balance_loss_mlp": 1.04341292, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.4438503581091628, + "language_loss": 0.68864352, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71007979, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 7581, + "time_per_iteration": 2.6738851070404053 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01742125, + "balance_loss_mlp": 1.03975797, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.5604567804249607, + "language_loss": 0.73416924, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75558978, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7582, + "time_per_iteration": 2.5402657985687256 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01045735, + "balance_loss_clip": 1.03065467, + "balance_loss_mlp": 1.04215884, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.7600515256353326, + "language_loss": 0.72337949, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74501801, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7578125, + "step": 7583, + "time_per_iteration": 2.51399564743042 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.02300692, + "balance_loss_mlp": 1.04282498, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.711799016610791, + "language_loss": 0.72402817, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74558389, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 7584, + "time_per_iteration": 2.4907238483428955 + }, + { + "auxiliary_loss_clip": 0.01116974, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01922798, + "balance_loss_mlp": 1.04356861, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.4921764730017937, + "language_loss": 0.68272889, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70422149, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7585, + "time_per_iteration": 2.5741868019104004 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01928306, + "balance_loss_mlp": 1.04099321, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.3206982799231843, + "language_loss": 0.76102924, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78248823, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7586, + "time_per_iteration": 2.466991662979126 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01651812, + "balance_loss_mlp": 1.0406158, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.9114474136682882, + "language_loss": 0.77912259, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80052596, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71484375, + "step": 7587, + "time_per_iteration": 2.534231185913086 + }, + { + "auxiliary_loss_clip": 0.01118125, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.02616787, + "balance_loss_mlp": 1.03976679, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.2451216970422068, + "language_loss": 0.69211191, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71368635, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.78125, + "step": 7588, + "time_per_iteration": 2.4104104042053223 + }, + { + "auxiliary_loss_clip": 0.011124, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.02178395, + "balance_loss_mlp": 1.0401839, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.610626761932897, + "language_loss": 0.79335272, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81481898, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 7589, + "time_per_iteration": 2.465728998184204 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.01955092, + "balance_loss_mlp": 1.04108429, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.193606067712595, + "language_loss": 0.6227479, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64421678, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7590, + "time_per_iteration": 2.509962558746338 + }, + { + "auxiliary_loss_clip": 0.0111218, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02830625, + "balance_loss_mlp": 1.03874183, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 5.263909382371274, + "language_loss": 0.72727275, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74880284, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7591, + "time_per_iteration": 2.529491424560547 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.02413523, + "balance_loss_mlp": 1.04252648, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.0725698163141058, + "language_loss": 0.76985544, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79140294, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7592, + "time_per_iteration": 2.4446723461151123 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01604521, + "balance_loss_mlp": 1.04070461, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 1.9266503814961675, + "language_loss": 0.69611561, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71753979, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7593, + "time_per_iteration": 2.4879302978515625 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03803527, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.17790627040614, + "language_loss": 0.84199911, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86338425, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 7594, + "time_per_iteration": 2.464733362197876 + }, + { + "auxiliary_loss_clip": 0.01035796, + "auxiliary_loss_mlp": 0.00998737, + "balance_loss_clip": 0.99745506, + "balance_loss_mlp": 1.01167154, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7964417819777524, + "language_loss": 0.52721512, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54756045, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.2421875, + "step": 7595, + "time_per_iteration": 6.0974061489105225 + }, + { + "auxiliary_loss_clip": 0.01117501, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01843739, + "balance_loss_mlp": 1.04165292, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.1595430840247714, + "language_loss": 0.87448329, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89597577, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7596, + "time_per_iteration": 3.862628936767578 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.02698088, + "balance_loss_mlp": 1.03993344, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 2.2425847761174196, + "language_loss": 0.77131474, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79284477, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7597, + "time_per_iteration": 2.4821672439575195 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02122104, + "balance_loss_mlp": 1.04004443, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.7340388440754042, + "language_loss": 0.78560513, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80708742, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7598, + "time_per_iteration": 2.4350392818450928 + }, + { + "auxiliary_loss_clip": 0.01113148, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01735115, + "balance_loss_mlp": 1.04057133, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.435026889485133, + "language_loss": 0.71715307, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73857641, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7599, + "time_per_iteration": 2.5838844776153564 + }, + { + "auxiliary_loss_clip": 0.01108114, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.02238345, + "balance_loss_mlp": 1.03702497, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.734840239500452, + "language_loss": 0.69377261, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71520597, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7600, + "time_per_iteration": 2.4499921798706055 + }, + { + "auxiliary_loss_clip": 0.01112216, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.02584386, + "balance_loss_mlp": 1.03979039, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.9688741418230387, + "language_loss": 0.78654951, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80805302, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7601, + "time_per_iteration": 2.555522918701172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02500272, + "balance_loss_mlp": 1.04013097, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.706657696767707, + "language_loss": 0.71609282, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73760259, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73828125, + "step": 7602, + "time_per_iteration": 2.6383092403411865 + }, + { + "auxiliary_loss_clip": 0.01112609, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.01735842, + "balance_loss_mlp": 1.03901231, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.778856324344474, + "language_loss": 0.72776276, + "learning_rate": 2.372665969608729e-06, + "loss": 0.7492069, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7603, + "time_per_iteration": 2.566542387008667 + }, + { + "auxiliary_loss_clip": 0.01113258, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.02284837, + "balance_loss_mlp": 1.03945732, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.783042546573846, + "language_loss": 0.83495164, + "learning_rate": 2.372283321642383e-06, + "loss": 0.8564586, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 7604, + "time_per_iteration": 2.4322941303253174 + }, + { + "auxiliary_loss_clip": 0.0112315, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04472041, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.742561007105776, + "language_loss": 0.85827744, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87986767, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 7605, + "time_per_iteration": 2.495654582977295 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.04045463, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.9150435252301277, + "language_loss": 0.73814523, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75966263, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7606, + "time_per_iteration": 2.472698926925659 + }, + { + "auxiliary_loss_clip": 0.01115234, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02525389, + "balance_loss_mlp": 1.03985333, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 4.395321075422478, + "language_loss": 0.7975688, + "learning_rate": 2.371135293099262e-06, + "loss": 0.81911278, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7607, + "time_per_iteration": 2.500666618347168 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.0436604, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 2.5876510188713437, + "language_loss": 0.80827034, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.82987565, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7608, + "time_per_iteration": 2.454738140106201 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.02624631, + "balance_loss_mlp": 1.03830588, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.6879461416077837, + "language_loss": 0.68500757, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70654094, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7609, + "time_per_iteration": 2.567387580871582 + }, + { + "auxiliary_loss_clip": 0.01113281, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02609158, + "balance_loss_mlp": 1.03981042, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.861126687806453, + "language_loss": 0.80749559, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82902324, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7610, + "time_per_iteration": 2.5181450843811035 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.02122259, + "balance_loss_mlp": 1.04017019, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 1.991436967054915, + "language_loss": 0.82063943, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84214383, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7611, + "time_per_iteration": 2.5181667804718018 + }, + { + "auxiliary_loss_clip": 0.01117824, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.01684475, + "balance_loss_mlp": 1.04256463, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.7999257820591783, + "language_loss": 0.74032104, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76180184, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7612, + "time_per_iteration": 2.573192596435547 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.01775634, + "balance_loss_mlp": 1.03739977, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.4998899682115554, + "language_loss": 0.84958243, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87100732, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7613, + "time_per_iteration": 2.519374132156372 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01548696, + "balance_loss_mlp": 1.04007339, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.119092433129462, + "language_loss": 0.75686407, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77829111, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7614, + "time_per_iteration": 2.435258388519287 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01870763, + "balance_loss_mlp": 1.03973377, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.4729553038511707, + "language_loss": 0.74797261, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76940382, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7615, + "time_per_iteration": 2.4776275157928467 + }, + { + "auxiliary_loss_clip": 0.01037994, + "auxiliary_loss_mlp": 0.00999141, + "balance_loss_clip": 0.99766314, + "balance_loss_mlp": 1.01355577, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7822572530544061, + "language_loss": 0.57660586, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59697717, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.24414062, + "step": 7616, + "time_per_iteration": 2.9986298084259033 + }, + { + "auxiliary_loss_clip": 0.01111756, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0251503, + "balance_loss_mlp": 1.03939307, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.5412759634284317, + "language_loss": 0.70953274, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73103696, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 7617, + "time_per_iteration": 2.514575958251953 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.04211044, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.1003257335678245, + "language_loss": 0.76458549, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78607446, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7618, + "time_per_iteration": 2.431196689605713 + }, + { + "auxiliary_loss_clip": 0.01118549, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.0323689, + "balance_loss_mlp": 1.0429455, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.7069120237831286, + "language_loss": 0.76705682, + "learning_rate": 2.366541916231585e-06, + "loss": 0.788692, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7619, + "time_per_iteration": 2.491133213043213 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02378964, + "balance_loss_mlp": 1.04174709, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.9887034550999254, + "language_loss": 0.7175532, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73904121, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7620, + "time_per_iteration": 2.429659366607666 + }, + { + "auxiliary_loss_clip": 0.0111009, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.01643384, + "balance_loss_mlp": 1.03828478, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 2.3637648648526035, + "language_loss": 0.78374821, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80513632, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 7621, + "time_per_iteration": 2.69990611076355 + }, + { + "auxiliary_loss_clip": 0.01037733, + "auxiliary_loss_mlp": 0.01001998, + "balance_loss_clip": 1.00071096, + "balance_loss_mlp": 1.01315987, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7958411378428579, + "language_loss": 0.6499809, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67037821, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.24609375, + "step": 7622, + "time_per_iteration": 3.0476205348968506 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01667762, + "balance_loss_mlp": 1.04142582, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.9256202714320767, + "language_loss": 0.79611146, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81755722, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7623, + "time_per_iteration": 2.547234535217285 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.02382421, + "balance_loss_mlp": 1.04050457, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.996922752989922, + "language_loss": 0.70809233, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72962081, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75390625, + "step": 7624, + "time_per_iteration": 2.442575693130493 + }, + { + "auxiliary_loss_clip": 0.01113872, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.01944637, + "balance_loss_mlp": 1.0383656, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.876738245253823, + "language_loss": 0.7299192, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75138104, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7625, + "time_per_iteration": 2.53002667427063 + }, + { + "auxiliary_loss_clip": 0.01116016, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.02058113, + "balance_loss_mlp": 1.04226136, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 3.1470354950748716, + "language_loss": 0.78132713, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80281818, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7626, + "time_per_iteration": 2.4544708728790283 + }, + { + "auxiliary_loss_clip": 0.01117004, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02270126, + "balance_loss_mlp": 1.04142714, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.604401840334718, + "language_loss": 0.85191864, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87344688, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7627, + "time_per_iteration": 2.478867769241333 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.02293992, + "balance_loss_mlp": 1.04074025, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.506714204397822, + "language_loss": 0.69413865, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71568, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7628, + "time_per_iteration": 2.5127782821655273 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01654255, + "balance_loss_mlp": 1.04060626, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.5379008002675938, + "language_loss": 0.78294545, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.8043794, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7629, + "time_per_iteration": 2.4944000244140625 + }, + { + "auxiliary_loss_clip": 0.0111907, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.04031289, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.0009780664883223, + "language_loss": 0.79405141, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81563896, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 7630, + "time_per_iteration": 2.443598747253418 + }, + { + "auxiliary_loss_clip": 0.0111732, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.02108812, + "balance_loss_mlp": 1.03952336, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.67887072973593, + "language_loss": 0.71819407, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.73971653, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7631, + "time_per_iteration": 2.613935708999634 + }, + { + "auxiliary_loss_clip": 0.01118321, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02248299, + "balance_loss_mlp": 1.04306722, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.655938907200588, + "language_loss": 0.71337265, + "learning_rate": 2.361563500108531e-06, + "loss": 0.7349205, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7632, + "time_per_iteration": 2.4854414463043213 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.0190748, + "balance_loss_mlp": 1.04055059, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 15.51679170955813, + "language_loss": 0.69212449, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71364582, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7633, + "time_per_iteration": 2.488741874694824 + }, + { + "auxiliary_loss_clip": 0.01115341, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02366996, + "balance_loss_mlp": 1.04068875, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.4724338826500494, + "language_loss": 0.80777454, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82929468, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.74609375, + "step": 7634, + "time_per_iteration": 2.4676551818847656 + }, + { + "auxiliary_loss_clip": 0.01118954, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.0188632, + "balance_loss_mlp": 1.04032791, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.9575518559569576, + "language_loss": 0.81853092, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84005594, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 7635, + "time_per_iteration": 2.513383150100708 + }, + { + "auxiliary_loss_clip": 0.01112964, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.02696204, + "balance_loss_mlp": 1.04045606, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.4265799385965707, + "language_loss": 0.64948833, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67101824, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7636, + "time_per_iteration": 4.062237501144409 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01690328, + "balance_loss_mlp": 1.04186797, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4568741521374282, + "language_loss": 0.80726147, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82869971, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7637, + "time_per_iteration": 4.017204999923706 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.02297974, + "balance_loss_mlp": 1.0438447, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.56098785708404, + "language_loss": 0.75311542, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77469212, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7638, + "time_per_iteration": 2.4801623821258545 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.02054262, + "balance_loss_mlp": 1.04093051, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.6757486640396035, + "language_loss": 0.74225289, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76372278, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7639, + "time_per_iteration": 2.457977294921875 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.02073193, + "balance_loss_mlp": 1.0410862, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 2.7996676169839856, + "language_loss": 0.68441081, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70591819, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7640, + "time_per_iteration": 2.4815306663513184 + }, + { + "auxiliary_loss_clip": 0.01116242, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.02532363, + "balance_loss_mlp": 1.03950286, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 4.694339799219563, + "language_loss": 0.75290608, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77446091, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7641, + "time_per_iteration": 2.4738545417785645 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.0180217, + "balance_loss_mlp": 1.041008, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7266679695779108, + "language_loss": 0.74649787, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76798791, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7642, + "time_per_iteration": 2.474160671234131 + }, + { + "auxiliary_loss_clip": 0.01036998, + "auxiliary_loss_mlp": 0.00999788, + "balance_loss_clip": 0.99855977, + "balance_loss_mlp": 1.01273584, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8383581259748949, + "language_loss": 0.58191991, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60228777, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.2421875, + "step": 7643, + "time_per_iteration": 2.810622453689575 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.02267814, + "balance_loss_mlp": 1.03810704, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5583198955297553, + "language_loss": 0.92945647, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95100462, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 7644, + "time_per_iteration": 2.4740004539489746 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.02458835, + "balance_loss_mlp": 1.04016256, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 1.923875093759249, + "language_loss": 0.8283661, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.8499139, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7645, + "time_per_iteration": 2.459575891494751 + }, + { + "auxiliary_loss_clip": 0.01035246, + "auxiliary_loss_mlp": 0.00999372, + "balance_loss_clip": 0.99805516, + "balance_loss_mlp": 1.0108279, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7553504929083139, + "language_loss": 0.59931064, + "learning_rate": 2.356199538526593e-06, + "loss": 0.6196568, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.24414062, + "step": 7646, + "time_per_iteration": 3.0040318965911865 + }, + { + "auxiliary_loss_clip": 0.01116678, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.01953018, + "balance_loss_mlp": 1.04043436, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.6094604606837348, + "language_loss": 0.72804034, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74953508, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7647, + "time_per_iteration": 2.539550304412842 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02135134, + "balance_loss_mlp": 1.03845108, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.5906503149252664, + "language_loss": 0.66864169, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69013917, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7648, + "time_per_iteration": 2.538694143295288 + }, + { + "auxiliary_loss_clip": 0.01112764, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.01969171, + "balance_loss_mlp": 1.03751159, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4797855079557312, + "language_loss": 0.78785735, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80931914, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 7649, + "time_per_iteration": 2.5164248943328857 + }, + { + "auxiliary_loss_clip": 0.01113076, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.02412558, + "balance_loss_mlp": 1.03840113, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 3.1550947466117303, + "language_loss": 0.69324255, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.7147451, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7650, + "time_per_iteration": 2.5182442665100098 + }, + { + "auxiliary_loss_clip": 0.01118739, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.0245893, + "balance_loss_mlp": 1.03925538, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 1.968615763904363, + "language_loss": 0.83896518, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86054754, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 7651, + "time_per_iteration": 2.4545249938964844 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.01685548, + "balance_loss_mlp": 1.04122114, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.1703456469435944, + "language_loss": 0.75375223, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77519977, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7652, + "time_per_iteration": 2.4435648918151855 + }, + { + "auxiliary_loss_clip": 0.01113746, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01611495, + "balance_loss_mlp": 1.03735042, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.8091521205399639, + "language_loss": 0.75805604, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.77949333, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7653, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.02338028, + "balance_loss_mlp": 1.04202819, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.3598469293633584, + "language_loss": 0.6584686, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68007028, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 7654, + "time_per_iteration": 2.3942883014678955 + }, + { + "auxiliary_loss_clip": 0.01112793, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.02026534, + "balance_loss_mlp": 1.0375098, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.647085409720671, + "language_loss": 0.79088843, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81235307, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7655, + "time_per_iteration": 2.5213396549224854 + }, + { + "auxiliary_loss_clip": 0.01110004, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01755643, + "balance_loss_mlp": 1.03802609, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 2.0582079675710134, + "language_loss": 0.67502171, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69642866, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7656, + "time_per_iteration": 2.4714531898498535 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01965153, + "balance_loss_mlp": 1.03784871, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.7896797448491664, + "language_loss": 0.81050038, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83195299, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7657, + "time_per_iteration": 2.549114227294922 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.01767325, + "balance_loss_mlp": 1.03843951, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.212167065380131, + "language_loss": 0.70071685, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72216856, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7658, + "time_per_iteration": 2.4548964500427246 + }, + { + "auxiliary_loss_clip": 0.0103337, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00924027, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9542906494873047, + "language_loss": 0.62159562, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64195925, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.2421875, + "step": 7659, + "time_per_iteration": 3.194460153579712 + }, + { + "auxiliary_loss_clip": 0.01114248, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.01774633, + "balance_loss_mlp": 1.04089022, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 2.0710979138047123, + "language_loss": 0.68395913, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70541239, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 7660, + "time_per_iteration": 2.5212934017181396 + }, + { + "auxiliary_loss_clip": 0.01112449, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.02767086, + "balance_loss_mlp": 1.03826356, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7599753910943126, + "language_loss": 0.76785183, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.78939056, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7661, + "time_per_iteration": 2.504199981689453 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02109385, + "balance_loss_mlp": 1.03997183, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.743819837097498, + "language_loss": 0.74565995, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.76712227, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7662, + "time_per_iteration": 2.479710817337036 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.01835489, + "balance_loss_mlp": 1.03899062, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 2.744789888238294, + "language_loss": 0.78880358, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81031454, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7663, + "time_per_iteration": 2.433105230331421 + }, + { + "auxiliary_loss_clip": 0.01110139, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.02286935, + "balance_loss_mlp": 1.03860092, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.8568277173945746, + "language_loss": 0.73164225, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75310248, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 7664, + "time_per_iteration": 2.4182069301605225 + }, + { + "auxiliary_loss_clip": 0.01114696, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02064347, + "balance_loss_mlp": 1.040645, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.6231584574242337, + "language_loss": 0.72039741, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74187809, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7665, + "time_per_iteration": 2.4458460807800293 + }, + { + "auxiliary_loss_clip": 0.01115054, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01965141, + "balance_loss_mlp": 1.03982568, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.8683756247621939, + "language_loss": 0.78134775, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80282086, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7666, + "time_per_iteration": 2.4217963218688965 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02005553, + "balance_loss_mlp": 1.03926802, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.2927592404362929, + "language_loss": 0.73972279, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76118922, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73046875, + "step": 7667, + "time_per_iteration": 2.586657762527466 + }, + { + "auxiliary_loss_clip": 0.0111122, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01803541, + "balance_loss_mlp": 1.03743756, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.3923437909363505, + "language_loss": 0.75857067, + "learning_rate": 2.347765122572676e-06, + "loss": 0.77998888, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7668, + "time_per_iteration": 2.456688642501831 + }, + { + "auxiliary_loss_clip": 0.01112338, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01699305, + "balance_loss_mlp": 1.04143405, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 2.015120719246451, + "language_loss": 0.77794099, + "learning_rate": 2.347381587204975e-06, + "loss": 0.79935884, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 7669, + "time_per_iteration": 2.503912925720215 + }, + { + "auxiliary_loss_clip": 0.01112792, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01688588, + "balance_loss_mlp": 1.03798747, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.8162494299938103, + "language_loss": 0.82330608, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84473014, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7670, + "time_per_iteration": 2.481456995010376 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01781416, + "balance_loss_mlp": 1.03845906, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6076372414606255, + "language_loss": 0.63204038, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6534636, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7671, + "time_per_iteration": 2.4743082523345947 + }, + { + "auxiliary_loss_clip": 0.01034608, + "auxiliary_loss_mlp": 0.01007042, + "balance_loss_clip": 1.00571287, + "balance_loss_mlp": 1.01008546, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6877278401983052, + "language_loss": 0.55879581, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57921231, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24609375, + "step": 7672, + "time_per_iteration": 3.15800142288208 + }, + { + "auxiliary_loss_clip": 0.0111558, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.02255249, + "balance_loss_mlp": 1.04003441, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8329231831015789, + "language_loss": 0.70920408, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73071891, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7673, + "time_per_iteration": 2.4639430046081543 + }, + { + "auxiliary_loss_clip": 0.01112366, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.02145457, + "balance_loss_mlp": 1.04083312, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.6780898708072003, + "language_loss": 0.70402145, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72548711, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7674, + "time_per_iteration": 2.5660369396209717 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02384138, + "balance_loss_mlp": 1.03684926, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.5790047103218752, + "language_loss": 0.65408182, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67557311, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7675, + "time_per_iteration": 2.616771697998047 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.010075, + "balance_loss_clip": 1.00611675, + "balance_loss_mlp": 1.01053035, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7425701763607123, + "language_loss": 0.58600932, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60643393, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24511719, + "step": 7676, + "time_per_iteration": 3.09281325340271 + }, + { + "auxiliary_loss_clip": 0.01034023, + "auxiliary_loss_mlp": 0.01002968, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00993788, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7891273111868267, + "language_loss": 0.62684548, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64721537, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.24121094, + "step": 7677, + "time_per_iteration": 2.9087297916412354 + }, + { + "auxiliary_loss_clip": 0.01112185, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.018502, + "balance_loss_mlp": 1.03929043, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.8566258545012464, + "language_loss": 0.76442772, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78586149, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7678, + "time_per_iteration": 3.80979061126709 + }, + { + "auxiliary_loss_clip": 0.01115647, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.02148438, + "balance_loss_mlp": 1.04122365, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.9875640695173902, + "language_loss": 0.66738796, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68889523, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 7679, + "time_per_iteration": 5.473088502883911 + }, + { + "auxiliary_loss_clip": 0.01112323, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.03913581, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.9247599304086902, + "language_loss": 0.69658661, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71809065, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 7680, + "time_per_iteration": 2.435971736907959 + }, + { + "auxiliary_loss_clip": 0.01121586, + "auxiliary_loss_mlp": 0.01041647, + "balance_loss_clip": 1.02805138, + "balance_loss_mlp": 1.04467559, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 3.979685754880411, + "language_loss": 0.63813865, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65977097, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7681, + "time_per_iteration": 2.486614942550659 + }, + { + "auxiliary_loss_clip": 0.01111536, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.01790738, + "balance_loss_mlp": 1.03925776, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.518283771877835, + "language_loss": 0.66871607, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69013125, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7682, + "time_per_iteration": 2.434720516204834 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02353811, + "balance_loss_mlp": 1.03967464, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.2113144827233397, + "language_loss": 0.74337292, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76488769, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7683, + "time_per_iteration": 2.532867908477783 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02046251, + "balance_loss_mlp": 1.04082799, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7237723920320163, + "language_loss": 0.76637614, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78784502, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7684, + "time_per_iteration": 2.4763615131378174 + }, + { + "auxiliary_loss_clip": 0.01121747, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.02557588, + "balance_loss_mlp": 1.04270399, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 2.012138726469413, + "language_loss": 0.80012244, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82173628, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7685, + "time_per_iteration": 2.467780113220215 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.02434742, + "balance_loss_mlp": 1.04206526, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 2.0493507584177424, + "language_loss": 0.66546774, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68698829, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 7686, + "time_per_iteration": 2.5675110816955566 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.01730859, + "balance_loss_mlp": 1.03924084, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.0396518023333243, + "language_loss": 0.73831183, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75978148, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7687, + "time_per_iteration": 2.5077569484710693 + }, + { + "auxiliary_loss_clip": 0.01113947, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01686668, + "balance_loss_mlp": 1.04119587, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1950912061668784, + "language_loss": 0.74758142, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76902628, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 7688, + "time_per_iteration": 2.4487764835357666 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.0214963, + "balance_loss_mlp": 1.03912246, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.6667608580722473, + "language_loss": 0.78718561, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80867392, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 7689, + "time_per_iteration": 2.504210948944092 + }, + { + "auxiliary_loss_clip": 0.01118414, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.02561891, + "balance_loss_mlp": 1.04086494, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 3.5840156670541448, + "language_loss": 0.56649667, + "learning_rate": 2.339324323980964e-06, + "loss": 0.58808374, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7690, + "time_per_iteration": 2.4970550537109375 + }, + { + "auxiliary_loss_clip": 0.01113577, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02076888, + "balance_loss_mlp": 1.03844917, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.2671044925643202, + "language_loss": 0.82513797, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84662223, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7691, + "time_per_iteration": 2.4712584018707275 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01502824, + "balance_loss_mlp": 1.04124403, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.513473472081282, + "language_loss": 0.75326777, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77470076, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7692, + "time_per_iteration": 2.462574005126953 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.0225668, + "balance_loss_mlp": 1.04110909, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 4.10345040195295, + "language_loss": 0.74055338, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76209086, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7693, + "time_per_iteration": 2.578394889831543 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02170396, + "balance_loss_mlp": 1.04132485, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5049695528407014, + "language_loss": 0.85576218, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87726343, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7694, + "time_per_iteration": 2.447938919067383 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02379656, + "balance_loss_mlp": 1.04131126, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.103971064334481, + "language_loss": 0.78631961, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80785489, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7695, + "time_per_iteration": 2.510712146759033 + }, + { + "auxiliary_loss_clip": 0.01110008, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.021294, + "balance_loss_mlp": 1.0382899, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7164209999926379, + "language_loss": 0.72215033, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74359202, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7696, + "time_per_iteration": 2.427879571914673 + }, + { + "auxiliary_loss_clip": 0.01116967, + "auxiliary_loss_mlp": 0.01044874, + "balance_loss_clip": 1.03040195, + "balance_loss_mlp": 1.04200339, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.7618442658513396, + "language_loss": 0.69068033, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71229875, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75, + "step": 7697, + "time_per_iteration": 2.4759252071380615 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.0421176, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.7059169761391482, + "language_loss": 0.84603721, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.8674916, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7698, + "time_per_iteration": 2.4416439533233643 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02094316, + "balance_loss_mlp": 1.04008198, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 2.2131790671554894, + "language_loss": 0.71495068, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73643124, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7699, + "time_per_iteration": 2.477674722671509 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.02183843, + "balance_loss_mlp": 1.03854418, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.667240614809052, + "language_loss": 0.7189334, + "learning_rate": 2.335485529281996e-06, + "loss": 0.7404505, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7700, + "time_per_iteration": 2.4664909839630127 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.0229491, + "balance_loss_mlp": 1.04012191, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 1.9820544405348388, + "language_loss": 0.7245025, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74600095, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7701, + "time_per_iteration": 2.4769680500030518 + }, + { + "auxiliary_loss_clip": 0.01117689, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.02129054, + "balance_loss_mlp": 1.04037929, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 1.837243395087381, + "language_loss": 0.64583158, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.66735995, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7702, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753259, + "balance_loss_mlp": 1.03832746, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.912512853345874, + "language_loss": 0.73265111, + "learning_rate": 2.33433364213785e-06, + "loss": 0.7540592, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7703, + "time_per_iteration": 2.482374429702759 + }, + { + "auxiliary_loss_clip": 0.01119217, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01882708, + "balance_loss_mlp": 1.04163849, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.555397834218836, + "language_loss": 0.68780202, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70932484, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7704, + "time_per_iteration": 2.4661428928375244 + }, + { + "auxiliary_loss_clip": 0.01118717, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.01313281, + "balance_loss_mlp": 1.04138649, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 4.360671756910266, + "language_loss": 0.80963224, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83109009, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7705, + "time_per_iteration": 2.5129587650299072 + }, + { + "auxiliary_loss_clip": 0.01116357, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.01863885, + "balance_loss_mlp": 1.03983259, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.6860050062378817, + "language_loss": 0.77783883, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79932249, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7706, + "time_per_iteration": 2.4212512969970703 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01676846, + "balance_loss_mlp": 1.03858304, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.9896841653009631, + "language_loss": 0.69805431, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.71944684, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7707, + "time_per_iteration": 2.452716112136841 + }, + { + "auxiliary_loss_clip": 0.0111828, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.02268386, + "balance_loss_mlp": 1.03958869, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.9384057680294333, + "language_loss": 0.61103344, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63259125, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 7708, + "time_per_iteration": 2.567363739013672 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 1.0407182, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 1.9580912850569934, + "language_loss": 0.77165091, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.7931354, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7709, + "time_per_iteration": 2.532893657684326 + }, + { + "auxiliary_loss_clip": 0.01120131, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.02199614, + "balance_loss_mlp": 1.04260027, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.8889269845152723, + "language_loss": 0.76972783, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79129058, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7710, + "time_per_iteration": 2.4608266353607178 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.01812005, + "balance_loss_mlp": 1.04201198, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 8.865430766980356, + "language_loss": 0.73548961, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75701332, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 7711, + "time_per_iteration": 2.4964261054992676 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02818859, + "balance_loss_mlp": 1.04039836, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.6554647385393604, + "language_loss": 0.71667624, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73825449, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.74609375, + "step": 7712, + "time_per_iteration": 2.46760630607605 + }, + { + "auxiliary_loss_clip": 0.01121722, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02325535, + "balance_loss_mlp": 1.04231286, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 3.3767356374822053, + "language_loss": 0.72924775, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.7508505, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 7713, + "time_per_iteration": 2.501405954360962 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.0192256, + "balance_loss_mlp": 1.0397234, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.980318346106041, + "language_loss": 0.58787149, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60938716, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7714, + "time_per_iteration": 2.495403528213501 + }, + { + "auxiliary_loss_clip": 0.01113059, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.01873016, + "balance_loss_mlp": 1.03932118, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.071541116221401, + "language_loss": 0.70241058, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72386181, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7715, + "time_per_iteration": 2.4438905715942383 + }, + { + "auxiliary_loss_clip": 0.01120226, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.0211767, + "balance_loss_mlp": 1.04094183, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 2.6792778299233775, + "language_loss": 0.67974752, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70129347, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 7716, + "time_per_iteration": 2.4544179439544678 + }, + { + "auxiliary_loss_clip": 0.01119502, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.01913667, + "balance_loss_mlp": 1.04161263, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7705358267642153, + "language_loss": 0.81100738, + "learning_rate": 2.328956666474691e-06, + "loss": 0.8325364, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7717, + "time_per_iteration": 2.491530179977417 + }, + { + "auxiliary_loss_clip": 0.0111535, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.01868117, + "balance_loss_mlp": 1.04001844, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.8289041555667496, + "language_loss": 0.73165905, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75313652, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7718, + "time_per_iteration": 2.4480137825012207 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.02355695, + "balance_loss_mlp": 1.03966463, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.5484606356008148, + "language_loss": 0.70390046, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72542012, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7719, + "time_per_iteration": 2.565831422805786 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.02523875, + "balance_loss_mlp": 1.0433172, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.6620583446293502, + "language_loss": 0.86685133, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88845801, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7720, + "time_per_iteration": 5.243311166763306 + }, + { + "auxiliary_loss_clip": 0.01036993, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 0.99992698, + "balance_loss_mlp": 1.01241243, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7219170830729655, + "language_loss": 0.55086505, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57124853, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24609375, + "step": 7721, + "time_per_iteration": 4.553914785385132 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.02322233, + "balance_loss_mlp": 1.041767, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.566766868002949, + "language_loss": 0.79665279, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81818902, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7722, + "time_per_iteration": 2.445401430130005 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.01957417, + "balance_loss_mlp": 1.04163325, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5891837623192666, + "language_loss": 0.77772748, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79924428, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7723, + "time_per_iteration": 2.4992403984069824 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01748788, + "balance_loss_mlp": 1.03973961, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.5026814907271808, + "language_loss": 0.68433344, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70576787, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7724, + "time_per_iteration": 2.496286630630493 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.02032912, + "balance_loss_mlp": 1.03761983, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.246547977212262, + "language_loss": 0.67335129, + "learning_rate": 2.325883008671415e-06, + "loss": 0.6948117, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7725, + "time_per_iteration": 2.471104621887207 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.02523649, + "balance_loss_mlp": 1.03763461, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.6153664866621378, + "language_loss": 0.64700842, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.66846681, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7726, + "time_per_iteration": 2.5408668518066406 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.01916456, + "balance_loss_mlp": 1.04313767, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.8244750339479887, + "language_loss": 0.74908936, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77058876, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7727, + "time_per_iteration": 2.4853005409240723 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.03968906, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.0019169498028657, + "language_loss": 0.78683269, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80834055, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7728, + "time_per_iteration": 2.5397188663482666 + }, + { + "auxiliary_loss_clip": 0.0111559, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.02303171, + "balance_loss_mlp": 1.0405283, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 2.3286376832796343, + "language_loss": 0.76053888, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78206384, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7729, + "time_per_iteration": 2.4818129539489746 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.02543473, + "balance_loss_mlp": 1.04205704, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.578112141950269, + "language_loss": 0.79568058, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81722724, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7730, + "time_per_iteration": 2.5124597549438477 + }, + { + "auxiliary_loss_clip": 0.01113512, + "auxiliary_loss_mlp": 0.01037643, + "balance_loss_clip": 1.0245595, + "balance_loss_mlp": 1.03948402, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.5075999703309564, + "language_loss": 0.76621842, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.78772998, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7731, + "time_per_iteration": 2.4976460933685303 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.01798737, + "balance_loss_mlp": 1.0393635, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.7163179847514425, + "language_loss": 0.65824252, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67968166, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 7732, + "time_per_iteration": 2.5720393657684326 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02186668, + "balance_loss_mlp": 1.03906608, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.6101927282287454, + "language_loss": 0.72711408, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74864757, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7733, + "time_per_iteration": 2.4926271438598633 + }, + { + "auxiliary_loss_clip": 0.01036248, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.00131154, + "balance_loss_mlp": 1.01211238, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2459739814545432, + "language_loss": 0.51962316, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54001307, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2421875, + "step": 7734, + "time_per_iteration": 3.0107176303863525 + }, + { + "auxiliary_loss_clip": 0.01113986, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02194381, + "balance_loss_mlp": 1.04043412, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.036607770310226, + "language_loss": 0.75633866, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77783275, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7735, + "time_per_iteration": 2.487781286239624 + }, + { + "auxiliary_loss_clip": 0.01111506, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.02682567, + "balance_loss_mlp": 1.03985715, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 2.402877095125316, + "language_loss": 0.70207214, + "learning_rate": 2.321655439354519e-06, + "loss": 0.7235899, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7736, + "time_per_iteration": 2.4449374675750732 + }, + { + "auxiliary_loss_clip": 0.0111302, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.0199604, + "balance_loss_mlp": 1.04052627, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6375102922586726, + "language_loss": 0.72185129, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74330497, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7737, + "time_per_iteration": 2.494582176208496 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02450085, + "balance_loss_mlp": 1.04341006, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.6166748549663605, + "language_loss": 0.83362406, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85520089, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7738, + "time_per_iteration": 2.427828550338745 + }, + { + "auxiliary_loss_clip": 0.01037214, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.0132978, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7680630195464891, + "language_loss": 0.57788324, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59828281, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.24023438, + "step": 7739, + "time_per_iteration": 3.133042335510254 + }, + { + "auxiliary_loss_clip": 0.01113786, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.02543104, + "balance_loss_mlp": 1.03974605, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.823827375035505, + "language_loss": 0.8481009, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.86962008, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7740, + "time_per_iteration": 2.4921228885650635 + }, + { + "auxiliary_loss_clip": 0.0111501, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.04139423, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.5033977780241194, + "language_loss": 0.76110768, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.7826345, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 7741, + "time_per_iteration": 2.4922451972961426 + }, + { + "auxiliary_loss_clip": 0.01117905, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01915359, + "balance_loss_mlp": 1.0404247, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.7276921705055903, + "language_loss": 0.80555934, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82706094, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7742, + "time_per_iteration": 2.4906904697418213 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.02518523, + "balance_loss_mlp": 1.04049921, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.9912151117228205, + "language_loss": 0.72541988, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74698091, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7743, + "time_per_iteration": 2.4746901988983154 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01745892, + "balance_loss_mlp": 1.0409807, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.076205829431248, + "language_loss": 0.71137214, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73282433, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7744, + "time_per_iteration": 2.4928057193756104 + }, + { + "auxiliary_loss_clip": 0.01112536, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.02108455, + "balance_loss_mlp": 1.04053736, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5849641227794893, + "language_loss": 0.85084593, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87230361, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7745, + "time_per_iteration": 2.574612617492676 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02561605, + "balance_loss_mlp": 1.04127502, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.35434162506916, + "language_loss": 0.73171556, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75323439, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71484375, + "step": 7746, + "time_per_iteration": 2.5375149250030518 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02271867, + "balance_loss_mlp": 1.04081106, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.5285629366651527, + "language_loss": 0.6993416, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72082222, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7747, + "time_per_iteration": 2.792043685913086 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.01872873, + "balance_loss_mlp": 1.03958046, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.4175797777041124, + "language_loss": 0.67509431, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69653738, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7748, + "time_per_iteration": 2.625060796737671 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.02341771, + "balance_loss_mlp": 1.04018533, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.247229042591788, + "language_loss": 0.63667625, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.65823585, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 7749, + "time_per_iteration": 2.4132370948791504 + }, + { + "auxiliary_loss_clip": 0.01117494, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01702619, + "balance_loss_mlp": 1.04231274, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.928439488128299, + "language_loss": 0.74594498, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76742983, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7750, + "time_per_iteration": 2.494771718978882 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01701522, + "balance_loss_mlp": 1.0404911, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.044073047720548, + "language_loss": 0.7496438, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.77110994, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7751, + "time_per_iteration": 2.5510993003845215 + }, + { + "auxiliary_loss_clip": 0.01118875, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.01816297, + "balance_loss_mlp": 1.04188776, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.8775850665267624, + "language_loss": 0.73678327, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.7582916, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7752, + "time_per_iteration": 2.5834901332855225 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02401483, + "balance_loss_mlp": 1.041453, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.485236836866318, + "language_loss": 0.69320381, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71476793, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 7753, + "time_per_iteration": 2.522881507873535 + }, + { + "auxiliary_loss_clip": 0.01111836, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.01862359, + "balance_loss_mlp": 1.04056942, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8174540980864333, + "language_loss": 0.72607052, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74750698, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7754, + "time_per_iteration": 2.5403332710266113 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01410365, + "balance_loss_mlp": 1.04032147, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.501284890447191, + "language_loss": 0.78961611, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81104231, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7755, + "time_per_iteration": 2.4917664527893066 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.01675534, + "balance_loss_mlp": 1.03968203, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.6390600579035761, + "language_loss": 0.72281897, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74421859, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7756, + "time_per_iteration": 2.549678325653076 + }, + { + "auxiliary_loss_clip": 0.01111703, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01770794, + "balance_loss_mlp": 1.03845477, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.8004000990726714, + "language_loss": 0.78193069, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80335552, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7757, + "time_per_iteration": 2.483161687850952 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01855707, + "balance_loss_mlp": 1.04131472, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 2.024129481036371, + "language_loss": 0.66473371, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68618673, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.734375, + "step": 7758, + "time_per_iteration": 2.5083394050598145 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.0243423, + "balance_loss_mlp": 1.04062152, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.603488256474455, + "language_loss": 0.74207008, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76358092, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7759, + "time_per_iteration": 2.424461841583252 + }, + { + "auxiliary_loss_clip": 0.01113311, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.04054224, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.4805046968385447, + "language_loss": 0.77701056, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79848123, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7760, + "time_per_iteration": 2.5147666931152344 + }, + { + "auxiliary_loss_clip": 0.01109461, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.01549125, + "balance_loss_mlp": 1.03895068, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6623756387577715, + "language_loss": 0.74081796, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76219893, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7761, + "time_per_iteration": 3.816096305847168 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.01743007, + "balance_loss_mlp": 1.040905, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.9521312394592187, + "language_loss": 0.78150368, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80299413, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 7762, + "time_per_iteration": 5.593664169311523 + }, + { + "auxiliary_loss_clip": 0.01036542, + "auxiliary_loss_mlp": 0.01002344, + "balance_loss_clip": 1.00103235, + "balance_loss_mlp": 1.0128268, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7996147947039336, + "language_loss": 0.59759605, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61798495, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.23828125, + "step": 7763, + "time_per_iteration": 4.692638874053955 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.02139115, + "balance_loss_mlp": 1.03950739, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.0939196550691075, + "language_loss": 0.78502893, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80654544, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 7764, + "time_per_iteration": 2.437487840652466 + }, + { + "auxiliary_loss_clip": 0.01113145, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.02385855, + "balance_loss_mlp": 1.04100394, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.8134732296760265, + "language_loss": 0.72272134, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74421084, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7765, + "time_per_iteration": 2.4413938522338867 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.02123809, + "balance_loss_mlp": 1.03898025, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.045608669049209, + "language_loss": 0.77604026, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.79752916, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7766, + "time_per_iteration": 2.4388277530670166 + }, + { + "auxiliary_loss_clip": 0.01112932, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.02094162, + "balance_loss_mlp": 1.03921056, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 1.9270773145684021, + "language_loss": 0.65106744, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67253554, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7767, + "time_per_iteration": 2.4259531497955322 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01036202, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04137385, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.832674622819915, + "language_loss": 0.74584204, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76735973, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7768, + "time_per_iteration": 2.5001304149627686 + }, + { + "auxiliary_loss_clip": 0.01114611, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01907098, + "balance_loss_mlp": 1.04069757, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.7275432453698176, + "language_loss": 0.70713127, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72859579, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7769, + "time_per_iteration": 2.466909408569336 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02159858, + "balance_loss_mlp": 1.03928077, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.9729575937492385, + "language_loss": 0.8121224, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83360064, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.74609375, + "step": 7770, + "time_per_iteration": 2.458648204803467 + }, + { + "auxiliary_loss_clip": 0.01036054, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00070572, + "balance_loss_mlp": 1.01253605, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7993613034211892, + "language_loss": 0.5567323, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57711124, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7771, + "time_per_iteration": 3.0888803005218506 + }, + { + "auxiliary_loss_clip": 0.01111082, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.02332425, + "balance_loss_mlp": 1.03920853, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.068311261086289, + "language_loss": 0.65702665, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67849845, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7772, + "time_per_iteration": 2.5242044925689697 + }, + { + "auxiliary_loss_clip": 0.01112309, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.0201087, + "balance_loss_mlp": 1.04012156, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8148576314480773, + "language_loss": 0.63699466, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65844226, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7773, + "time_per_iteration": 2.5828921794891357 + }, + { + "auxiliary_loss_clip": 0.01114763, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02112508, + "balance_loss_mlp": 1.04050922, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 1.942265734861076, + "language_loss": 0.79793948, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.81943017, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7774, + "time_per_iteration": 2.448124647140503 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.01732183, + "balance_loss_mlp": 1.04113531, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.627446474145158, + "language_loss": 0.77884328, + "learning_rate": 2.306655024915726e-06, + "loss": 0.80031127, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7578125, + "step": 7775, + "time_per_iteration": 2.527324676513672 + }, + { + "auxiliary_loss_clip": 0.01111153, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.01650286, + "balance_loss_mlp": 1.03931999, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.8679682194131426, + "language_loss": 0.69634461, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71774852, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7776, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01113537, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02123928, + "balance_loss_mlp": 1.04122162, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.3721760360464321, + "language_loss": 0.73558104, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75704277, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.72265625, + "step": 7777, + "time_per_iteration": 2.520732879638672 + }, + { + "auxiliary_loss_clip": 0.01113463, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.0170207, + "balance_loss_mlp": 1.04067683, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.1302386072463717, + "language_loss": 0.69626892, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71770251, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7778, + "time_per_iteration": 2.514420509338379 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.04059839, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.560538067350171, + "language_loss": 0.73252767, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75406492, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7779, + "time_per_iteration": 2.5243053436279297 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02767682, + "balance_loss_mlp": 1.04009414, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5361358548392845, + "language_loss": 0.72206026, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74357915, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7780, + "time_per_iteration": 2.462562322616577 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.02259159, + "balance_loss_mlp": 1.03972697, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.377229275085917, + "language_loss": 0.73864317, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76017153, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7781, + "time_per_iteration": 2.502406358718872 + }, + { + "auxiliary_loss_clip": 0.01117462, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.04165602, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.718665338253189, + "language_loss": 0.62727809, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.64880699, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7782, + "time_per_iteration": 2.5425686836242676 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.02660906, + "balance_loss_mlp": 1.0408988, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.7203724678454408, + "language_loss": 0.62933487, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65089834, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7783, + "time_per_iteration": 2.5380141735076904 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.02271223, + "balance_loss_mlp": 1.04462993, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.164400906730855, + "language_loss": 0.67745304, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69904399, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 7784, + "time_per_iteration": 2.4520463943481445 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03855705, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.8603472350259396, + "language_loss": 0.84720063, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.8686232, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.72265625, + "step": 7785, + "time_per_iteration": 2.459446907043457 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01639259, + "balance_loss_mlp": 1.04066038, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.0359259581468154, + "language_loss": 0.77018952, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79163527, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 7786, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01110671, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01558685, + "balance_loss_mlp": 1.0400672, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.023612965965443, + "language_loss": 0.73795342, + "learning_rate": 2.302035914315856e-06, + "loss": 0.75933665, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7787, + "time_per_iteration": 2.5224268436431885 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04109263, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.7002718084162438, + "language_loss": 0.65639925, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67792457, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7788, + "time_per_iteration": 2.534850835800171 + }, + { + "auxiliary_loss_clip": 0.01110419, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.03911507, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.9511727744147118, + "language_loss": 0.63813901, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.65954381, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.71484375, + "step": 7789, + "time_per_iteration": 2.5479812622070312 + }, + { + "auxiliary_loss_clip": 0.01036451, + "auxiliary_loss_mlp": 0.01005013, + "balance_loss_clip": 1.00388098, + "balance_loss_mlp": 1.01292431, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7071467356489777, + "language_loss": 0.61922455, + "learning_rate": 2.300880877982825e-06, + "loss": 0.6396392, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7790, + "time_per_iteration": 3.1510462760925293 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01836109, + "balance_loss_mlp": 1.04223442, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5995715197713376, + "language_loss": 0.79338831, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81482148, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7791, + "time_per_iteration": 2.5008740425109863 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.02297759, + "balance_loss_mlp": 1.04113936, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.651557239680421, + "language_loss": 0.7484895, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.76998532, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7792, + "time_per_iteration": 2.4964823722839355 + }, + { + "auxiliary_loss_clip": 0.01108357, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01953983, + "balance_loss_mlp": 1.03747678, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.7412725365893262, + "language_loss": 0.6822598, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70365626, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 7793, + "time_per_iteration": 2.5480096340179443 + }, + { + "auxiliary_loss_clip": 0.01112468, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01638055, + "balance_loss_mlp": 1.04102671, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.577590367357015, + "language_loss": 0.73983628, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76124084, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.71484375, + "step": 7794, + "time_per_iteration": 2.453190803527832 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.0227052, + "balance_loss_mlp": 1.04182243, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5518603627769951, + "language_loss": 0.63617218, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65767258, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7795, + "time_per_iteration": 2.5087008476257324 + }, + { + "auxiliary_loss_clip": 0.01108593, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.01140058, + "balance_loss_mlp": 1.03883195, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.6379638897021238, + "language_loss": 0.68002474, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70134962, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 7796, + "time_per_iteration": 2.6073970794677734 + }, + { + "auxiliary_loss_clip": 0.01110063, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01876903, + "balance_loss_mlp": 1.03811777, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.6469110962479863, + "language_loss": 0.70039898, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72181356, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7797, + "time_per_iteration": 2.5202813148498535 + }, + { + "auxiliary_loss_clip": 0.01114247, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.01626134, + "balance_loss_mlp": 1.04066193, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 5.424608495577661, + "language_loss": 0.67517138, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69661522, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7798, + "time_per_iteration": 2.425443649291992 + }, + { + "auxiliary_loss_clip": 0.01033599, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99840373, + "balance_loss_mlp": 1.00991392, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9386412030406017, + "language_loss": 0.64531696, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66565025, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23730469, + "step": 7799, + "time_per_iteration": 3.2528939247131348 + }, + { + "auxiliary_loss_clip": 0.01108747, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.01308465, + "balance_loss_mlp": 1.03731787, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4163336480228355, + "language_loss": 0.72242683, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74376553, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71484375, + "step": 7800, + "time_per_iteration": 2.481309175491333 + }, + { + "auxiliary_loss_clip": 0.01109702, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.01937377, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 2.26920520557406, + "language_loss": 0.72428536, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74568903, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6953125, + "step": 7801, + "time_per_iteration": 2.491105079650879 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.02352786, + "balance_loss_mlp": 1.04097068, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.06336431229611, + "language_loss": 0.62303418, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64457649, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7802, + "time_per_iteration": 2.419229030609131 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.02101874, + "balance_loss_mlp": 1.03946614, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.7578029510137774, + "language_loss": 0.73409998, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75556695, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73828125, + "step": 7803, + "time_per_iteration": 3.984971523284912 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.02289057, + "balance_loss_mlp": 1.0363642, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 2.1225810300999384, + "language_loss": 0.77638352, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79780972, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 7804, + "time_per_iteration": 5.432345390319824 + }, + { + "auxiliary_loss_clip": 0.01108405, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01417041, + "balance_loss_mlp": 1.03702545, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8629622532391696, + "language_loss": 0.77384996, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79520065, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7805, + "time_per_iteration": 3.873565196990967 + }, + { + "auxiliary_loss_clip": 0.01119773, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02423811, + "balance_loss_mlp": 1.04193878, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.5711850680288217, + "language_loss": 0.82902926, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85060221, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7806, + "time_per_iteration": 2.554081439971924 + }, + { + "auxiliary_loss_clip": 0.0111231, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02150404, + "balance_loss_mlp": 1.03812897, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.7011762555096541, + "language_loss": 0.77454185, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79601264, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7807, + "time_per_iteration": 2.5786170959472656 + }, + { + "auxiliary_loss_clip": 0.01112504, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.01985693, + "balance_loss_mlp": 1.03987944, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.9089254292763438, + "language_loss": 0.51788038, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53933609, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7808, + "time_per_iteration": 2.4730944633483887 + }, + { + "auxiliary_loss_clip": 0.01034297, + "auxiliary_loss_mlp": 0.01010423, + "balance_loss_clip": 1.00899839, + "balance_loss_mlp": 1.01039815, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.782722095319277, + "language_loss": 0.57725239, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59769958, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.23925781, + "step": 7809, + "time_per_iteration": 2.9356954097747803 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.0237031, + "balance_loss_mlp": 1.04176784, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.074581573353579, + "language_loss": 0.72116458, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.74269235, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.74609375, + "step": 7810, + "time_per_iteration": 2.493408679962158 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.02402329, + "balance_loss_mlp": 1.040115, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 2.1541938985336992, + "language_loss": 0.8075912, + "learning_rate": 2.29279277055369e-06, + "loss": 0.82910025, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7811, + "time_per_iteration": 2.4555575847625732 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02146435, + "balance_loss_mlp": 1.04074228, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.576643907851126, + "language_loss": 0.8039701, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82546234, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7812, + "time_per_iteration": 2.4640350341796875 + }, + { + "auxiliary_loss_clip": 0.01109494, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.0166117, + "balance_loss_mlp": 1.03902435, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.5853543039664872, + "language_loss": 0.73764664, + "learning_rate": 2.292022217117477e-06, + "loss": 0.75903195, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7813, + "time_per_iteration": 2.4320507049560547 + }, + { + "auxiliary_loss_clip": 0.01110282, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01483905, + "balance_loss_mlp": 1.03869295, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.2861298905980756, + "language_loss": 0.84540617, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86679196, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7814, + "time_per_iteration": 2.4274749755859375 + }, + { + "auxiliary_loss_clip": 0.01107762, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.02381229, + "balance_loss_mlp": 1.03796697, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.8672463737050276, + "language_loss": 0.81747186, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83891666, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 7815, + "time_per_iteration": 2.4163284301757812 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.02026868, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.4869249923010917, + "language_loss": 0.77289331, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79436171, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7816, + "time_per_iteration": 2.4678542613983154 + }, + { + "auxiliary_loss_clip": 0.01033373, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 0.9998135, + "balance_loss_mlp": 1.00933015, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8340649958424211, + "language_loss": 0.5901494, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61049724, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.24023438, + "step": 7817, + "time_per_iteration": 3.0594780445098877 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01726496, + "balance_loss_mlp": 1.03904927, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7036287613919965, + "language_loss": 0.79255462, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81393164, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7818, + "time_per_iteration": 2.5072269439697266 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.02011776, + "balance_loss_mlp": 1.03705192, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.8212678437549825, + "language_loss": 0.83521211, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85663581, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7819, + "time_per_iteration": 2.4294557571411133 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.02144313, + "balance_loss_mlp": 1.0395112, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.0332467146742457, + "language_loss": 0.75860727, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78010511, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7820, + "time_per_iteration": 2.446664333343506 + }, + { + "auxiliary_loss_clip": 0.0111083, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02034664, + "balance_loss_mlp": 1.04058981, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.889014789758207, + "language_loss": 0.73767376, + "learning_rate": 2.288939561601039e-06, + "loss": 0.75911528, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 7821, + "time_per_iteration": 2.4138526916503906 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.02792668, + "balance_loss_mlp": 1.04042852, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.6752111617055698, + "language_loss": 0.88782346, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.9093343, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 7822, + "time_per_iteration": 2.5215280055999756 + }, + { + "auxiliary_loss_clip": 0.01110261, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01693165, + "balance_loss_mlp": 1.03927922, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.5082152139738452, + "language_loss": 0.79467583, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.8160727, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7823, + "time_per_iteration": 2.4513280391693115 + }, + { + "auxiliary_loss_clip": 0.01034267, + "auxiliary_loss_mlp": 0.01003747, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.01028728, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6886986665104876, + "language_loss": 0.56664526, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.5870254, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.24023438, + "step": 7824, + "time_per_iteration": 3.1640188694000244 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.02209568, + "balance_loss_mlp": 1.03935504, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.7687808389256934, + "language_loss": 0.81284839, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83433783, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7825, + "time_per_iteration": 2.4225590229034424 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01788807, + "balance_loss_mlp": 1.04160166, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.5897626143629002, + "language_loss": 0.66397595, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68542683, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7826, + "time_per_iteration": 2.512421131134033 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02366889, + "balance_loss_mlp": 1.03788161, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 2.2414984964582354, + "language_loss": 0.83768737, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.85917771, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7827, + "time_per_iteration": 2.449002504348755 + }, + { + "auxiliary_loss_clip": 0.01034449, + "auxiliary_loss_mlp": 0.01000576, + "balance_loss_clip": 0.99914598, + "balance_loss_mlp": 1.01066613, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.821565097847141, + "language_loss": 0.55694902, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57729936, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.23828125, + "step": 7828, + "time_per_iteration": 3.0819802284240723 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01605594, + "balance_loss_mlp": 1.03884375, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.9071991460911069, + "language_loss": 0.81054831, + "learning_rate": 2.285856204861245e-06, + "loss": 0.8319242, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7829, + "time_per_iteration": 2.415055513381958 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02058768, + "balance_loss_mlp": 1.04020715, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3327561380149306, + "language_loss": 0.7576915, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.77912241, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7830, + "time_per_iteration": 2.5643560886383057 + }, + { + "auxiliary_loss_clip": 0.0111195, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.01860535, + "balance_loss_mlp": 1.04144919, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 1.972485160119179, + "language_loss": 0.78818381, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80962437, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 7831, + "time_per_iteration": 2.4193694591522217 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.02204037, + "balance_loss_mlp": 1.03843021, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.7552368254682797, + "language_loss": 0.76044565, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78195733, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 7832, + "time_per_iteration": 2.5059313774108887 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.0164752, + "balance_loss_mlp": 1.03971505, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.221217846393107, + "language_loss": 0.74499595, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76638055, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7833, + "time_per_iteration": 2.473198652267456 + }, + { + "auxiliary_loss_clip": 0.01111984, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.0222286, + "balance_loss_mlp": 1.04079628, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.540147977988576, + "language_loss": 0.7563647, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77783847, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 7834, + "time_per_iteration": 2.4742865562438965 + }, + { + "auxiliary_loss_clip": 0.01110721, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.02601528, + "balance_loss_mlp": 1.04030991, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.3686611384111311, + "language_loss": 0.66174978, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68324244, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7835, + "time_per_iteration": 2.631727933883667 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01005422, + "balance_loss_clip": 1.00418234, + "balance_loss_mlp": 1.01069164, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8728088219103824, + "language_loss": 0.62162638, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64202893, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.2421875, + "step": 7836, + "time_per_iteration": 3.0448570251464844 + }, + { + "auxiliary_loss_clip": 0.01113991, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.02254474, + "balance_loss_mlp": 1.03829992, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.5467691894783375, + "language_loss": 0.69550622, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71700549, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7837, + "time_per_iteration": 2.480307102203369 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.02703786, + "balance_loss_mlp": 1.03986847, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.8364060529940534, + "language_loss": 0.66015977, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68170524, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 7838, + "time_per_iteration": 2.461975336074829 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.01755917, + "balance_loss_mlp": 1.03790629, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.9120341376079564, + "language_loss": 0.77139461, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79282629, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7839, + "time_per_iteration": 2.4788944721221924 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.0168395, + "balance_loss_mlp": 1.03794789, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 1.9130481219619113, + "language_loss": 0.72918046, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75054491, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 7840, + "time_per_iteration": 2.495239019393921 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01623714, + "balance_loss_mlp": 1.03712356, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.5808172060169028, + "language_loss": 0.74886942, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77024251, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7841, + "time_per_iteration": 2.454484224319458 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02002275, + "balance_loss_mlp": 1.03838921, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.602853925212418, + "language_loss": 0.70333457, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72475922, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 7842, + "time_per_iteration": 2.4781782627105713 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01659727, + "balance_loss_mlp": 1.04060411, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.29732654226483, + "language_loss": 0.78893888, + "learning_rate": 2.280458665756177e-06, + "loss": 0.81034797, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7843, + "time_per_iteration": 2.4125685691833496 + }, + { + "auxiliary_loss_clip": 0.01110204, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01920795, + "balance_loss_mlp": 1.03860044, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6968163407172614, + "language_loss": 0.74375969, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76517189, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 7844, + "time_per_iteration": 3.915900230407715 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.0281688, + "balance_loss_mlp": 1.03888059, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.5835392600478553, + "language_loss": 0.78286111, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80438167, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7845, + "time_per_iteration": 3.8502118587493896 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.02098632, + "balance_loss_mlp": 1.03725934, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.4155938367608039, + "language_loss": 0.7311433, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75253546, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 7846, + "time_per_iteration": 5.374008655548096 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.03715074, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.2885600176299252, + "language_loss": 0.74075842, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76212096, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7847, + "time_per_iteration": 2.5333058834075928 + }, + { + "auxiliary_loss_clip": 0.01110234, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02229548, + "balance_loss_mlp": 1.03908157, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.6263943719256755, + "language_loss": 0.80717957, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82862496, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 7848, + "time_per_iteration": 2.408688545227051 + }, + { + "auxiliary_loss_clip": 0.01115584, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01844501, + "balance_loss_mlp": 1.04345632, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.7499376956487047, + "language_loss": 0.70086265, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72232985, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7849, + "time_per_iteration": 2.453542709350586 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02144289, + "balance_loss_mlp": 1.03961349, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.1591296324254095, + "language_loss": 0.69831544, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71981823, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 7850, + "time_per_iteration": 2.421095371246338 + }, + { + "auxiliary_loss_clip": 0.01115823, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.0175221, + "balance_loss_mlp": 1.04188704, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.815710496912415, + "language_loss": 0.75220203, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.7736643, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7851, + "time_per_iteration": 2.4666483402252197 + }, + { + "auxiliary_loss_clip": 0.01112485, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02262962, + "balance_loss_mlp": 1.03831601, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.7847776856215107, + "language_loss": 0.76165771, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78314561, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7852, + "time_per_iteration": 2.415109395980835 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.01553345, + "balance_loss_mlp": 1.04077876, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.4478461916623044, + "language_loss": 0.68933171, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71073586, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 7853, + "time_per_iteration": 2.4654150009155273 + }, + { + "auxiliary_loss_clip": 0.01033922, + "auxiliary_loss_mlp": 0.00998653, + "balance_loss_clip": 0.99743122, + "balance_loss_mlp": 1.01008511, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.6983660788322832, + "language_loss": 0.50161922, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52194494, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 7854, + "time_per_iteration": 3.190991163253784 + }, + { + "auxiliary_loss_clip": 0.0111395, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.0189656, + "balance_loss_mlp": 1.04039025, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.7794050652620443, + "language_loss": 0.63844812, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.65991443, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 7855, + "time_per_iteration": 2.503614664077759 + }, + { + "auxiliary_loss_clip": 0.01111503, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.02274394, + "balance_loss_mlp": 1.0393486, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.8062233622492851, + "language_loss": 0.75802517, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7794944, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7856, + "time_per_iteration": 2.499197244644165 + }, + { + "auxiliary_loss_clip": 0.01108332, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02270377, + "balance_loss_mlp": 1.03774405, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.914023874649731, + "language_loss": 0.7484442, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76987731, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 7857, + "time_per_iteration": 2.5192370414733887 + }, + { + "auxiliary_loss_clip": 0.01109783, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.03118157, + "balance_loss_mlp": 1.03967714, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4716352183066603, + "language_loss": 0.6482265, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66975653, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7858, + "time_per_iteration": 2.5169341564178467 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02124774, + "balance_loss_mlp": 1.03680444, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.569061056560701, + "language_loss": 0.70402861, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72544539, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 7859, + "time_per_iteration": 2.4850962162017822 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.0245533, + "balance_loss_mlp": 1.03993118, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.957216681544069, + "language_loss": 0.62261212, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64414442, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7860, + "time_per_iteration": 2.435559034347534 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01039582, + "balance_loss_clip": 1.02632594, + "balance_loss_mlp": 1.03998029, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.1159962326169097, + "language_loss": 0.71988773, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.7414242, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7861, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.02440643, + "balance_loss_mlp": 1.03970647, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8695032169355525, + "language_loss": 0.85058391, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87208509, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7862, + "time_per_iteration": 2.439347505569458 + }, + { + "auxiliary_loss_clip": 0.01111085, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.03786755, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.736958967740828, + "language_loss": 0.8456251, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86704469, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7863, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01112215, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.02422917, + "balance_loss_mlp": 1.04029155, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.8450896018132297, + "language_loss": 0.65939879, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68089092, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7864, + "time_per_iteration": 2.430302381515503 + }, + { + "auxiliary_loss_clip": 0.01111041, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03911948, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.898956112201793, + "language_loss": 0.65435767, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67577726, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 7865, + "time_per_iteration": 2.4585866928100586 + }, + { + "auxiliary_loss_clip": 0.01107492, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.02195215, + "balance_loss_mlp": 1.0378449, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 2.8918998215840244, + "language_loss": 0.74357843, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76499236, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 7866, + "time_per_iteration": 2.4264490604400635 + }, + { + "auxiliary_loss_clip": 0.01111501, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01718307, + "balance_loss_mlp": 1.03777552, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 3.2754467592530476, + "language_loss": 0.8285951, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85000992, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7867, + "time_per_iteration": 2.4925811290740967 + }, + { + "auxiliary_loss_clip": 0.011073, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.01655674, + "balance_loss_mlp": 1.03702307, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.5927913973026295, + "language_loss": 0.79137915, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81274265, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7868, + "time_per_iteration": 2.454094171524048 + }, + { + "auxiliary_loss_clip": 0.01114352, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01608682, + "balance_loss_mlp": 1.03858244, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 2.558281214251347, + "language_loss": 0.74588537, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76732659, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7869, + "time_per_iteration": 2.4809184074401855 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.02767277, + "balance_loss_mlp": 1.04122782, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.571794234452096, + "language_loss": 0.73950672, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76107442, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7870, + "time_per_iteration": 2.4553706645965576 + }, + { + "auxiliary_loss_clip": 0.01117025, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04082036, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.9039581815830153, + "language_loss": 0.81513011, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83668333, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7871, + "time_per_iteration": 2.5156424045562744 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.02067888, + "balance_loss_mlp": 1.03990555, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.6438263319482285, + "language_loss": 0.75679815, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77825779, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7872, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.01110565, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02088451, + "balance_loss_mlp": 1.03784847, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.7923349992019921, + "language_loss": 0.67857021, + "learning_rate": 2.268885542903428e-06, + "loss": 0.700019, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7873, + "time_per_iteration": 2.6532957553863525 + }, + { + "auxiliary_loss_clip": 0.01113022, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.01881886, + "balance_loss_mlp": 1.04162037, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6289748569468698, + "language_loss": 0.72085869, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74230838, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 7874, + "time_per_iteration": 2.474073648452759 + }, + { + "auxiliary_loss_clip": 0.01113429, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.02190506, + "balance_loss_mlp": 1.03987253, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.446593699000123, + "language_loss": 0.65108937, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67257631, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7875, + "time_per_iteration": 2.4433648586273193 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.01718342, + "balance_loss_mlp": 1.04219341, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.56524610984038, + "language_loss": 0.81091076, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83237696, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7876, + "time_per_iteration": 2.540485143661499 + }, + { + "auxiliary_loss_clip": 0.01108757, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02232385, + "balance_loss_mlp": 1.0358628, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7859307736041579, + "language_loss": 0.7925123, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81395495, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7877, + "time_per_iteration": 2.627589225769043 + }, + { + "auxiliary_loss_clip": 0.01110689, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01751554, + "balance_loss_mlp": 1.03852785, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.8692095295200843, + "language_loss": 0.70723194, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72864318, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 7878, + "time_per_iteration": 2.535684108734131 + }, + { + "auxiliary_loss_clip": 0.01108668, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.01792121, + "balance_loss_mlp": 1.03918552, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.811278524460759, + "language_loss": 0.75030494, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77169836, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7879, + "time_per_iteration": 2.518188953399658 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.01000904, + "balance_loss_clip": 0.99943775, + "balance_loss_mlp": 1.01098931, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7286317750961989, + "language_loss": 0.6135056, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63386428, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.01464844, + "router_z_loss_mlp": 0.24023438, + "step": 7880, + "time_per_iteration": 3.0518951416015625 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.01922059, + "balance_loss_mlp": 1.03901792, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5146846775966347, + "language_loss": 0.6795128, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70095479, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 7881, + "time_per_iteration": 2.5058367252349854 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01614857, + "balance_loss_mlp": 1.03997886, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.916106799054198, + "language_loss": 0.77455914, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79595923, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7882, + "time_per_iteration": 2.475503921508789 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01675582, + "balance_loss_mlp": 1.03993428, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.505527482540033, + "language_loss": 0.7617712, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78320408, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7883, + "time_per_iteration": 2.5051398277282715 + }, + { + "auxiliary_loss_clip": 0.01111273, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01724708, + "balance_loss_mlp": 1.03893495, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7576670192685107, + "language_loss": 0.71994746, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74135715, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 7884, + "time_per_iteration": 2.4406635761260986 + }, + { + "auxiliary_loss_clip": 0.01117273, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02024651, + "balance_loss_mlp": 1.04002821, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.026641651540024, + "language_loss": 0.82025737, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84177154, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7885, + "time_per_iteration": 2.463895797729492 + }, + { + "auxiliary_loss_clip": 0.01115601, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04353762, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.728500395905687, + "language_loss": 0.73431885, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75582302, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7886, + "time_per_iteration": 3.8351001739501953 + }, + { + "auxiliary_loss_clip": 0.01116571, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03938007, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.1265145819393667, + "language_loss": 0.73465097, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75616348, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7887, + "time_per_iteration": 5.437266111373901 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01562774, + "balance_loss_mlp": 1.03885245, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.895223723891788, + "language_loss": 0.77138984, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79275852, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7888, + "time_per_iteration": 3.8908259868621826 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02004313, + "balance_loss_mlp": 1.04045427, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.663584432705133, + "language_loss": 0.72822642, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74968517, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7889, + "time_per_iteration": 2.5004560947418213 + }, + { + "auxiliary_loss_clip": 0.01036118, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.00301266, + "balance_loss_mlp": 1.0120219, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 1.138520548555467, + "language_loss": 0.5608511, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58125609, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24121094, + "step": 7890, + "time_per_iteration": 3.116922378540039 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02009606, + "balance_loss_mlp": 1.04115105, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.185015538438359, + "language_loss": 0.6552254, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67672396, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.734375, + "step": 7891, + "time_per_iteration": 2.475003242492676 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04182184, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.136023484028619, + "language_loss": 0.70221758, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72375906, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7892, + "time_per_iteration": 2.45662260055542 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01003564, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.01211762, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8097608885887184, + "language_loss": 0.5861572, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60655481, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24121094, + "step": 7893, + "time_per_iteration": 3.1652448177337646 + }, + { + "auxiliary_loss_clip": 0.01114001, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04149461, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.8991850536849317, + "language_loss": 0.77645361, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79793239, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7894, + "time_per_iteration": 2.4849085807800293 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.01912403, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.6188047164673534, + "language_loss": 0.74456996, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76601076, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7895, + "time_per_iteration": 2.456735372543335 + }, + { + "auxiliary_loss_clip": 0.01110765, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01553416, + "balance_loss_mlp": 1.03990245, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.9073077974003343, + "language_loss": 0.82539713, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84679627, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 7896, + "time_per_iteration": 2.5201456546783447 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.0161047, + "balance_loss_mlp": 1.03953171, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.922550471395919, + "language_loss": 0.75487721, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77628207, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 7897, + "time_per_iteration": 2.42526912689209 + }, + { + "auxiliary_loss_clip": 0.01116598, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.02316415, + "balance_loss_mlp": 1.04003334, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.1696415620255145, + "language_loss": 0.63682836, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65835488, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7898, + "time_per_iteration": 2.443390369415283 + }, + { + "auxiliary_loss_clip": 0.01111767, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02263284, + "balance_loss_mlp": 1.03901982, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.0733269605967997, + "language_loss": 0.6999402, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72141939, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7899, + "time_per_iteration": 2.5906245708465576 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02314603, + "balance_loss_mlp": 1.04054523, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.8534573860401393, + "language_loss": 0.68523431, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70675093, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7900, + "time_per_iteration": 2.5417144298553467 + }, + { + "auxiliary_loss_clip": 0.01112761, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02096641, + "balance_loss_mlp": 1.03979492, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9751823447072345, + "language_loss": 0.70783907, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72930533, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7901, + "time_per_iteration": 2.5215682983398438 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02833235, + "balance_loss_mlp": 1.0420568, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.7245601487210742, + "language_loss": 0.73674953, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75831395, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7902, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.02169394, + "balance_loss_mlp": 1.03990698, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.6802974507725348, + "language_loss": 0.68601072, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70743585, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 7903, + "time_per_iteration": 2.44101619720459 + }, + { + "auxiliary_loss_clip": 0.01108361, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03901863, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4630263980427167, + "language_loss": 0.7225582, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74396306, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7904, + "time_per_iteration": 2.469230890274048 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.01894665, + "balance_loss_mlp": 1.03912354, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.669936371268517, + "language_loss": 0.86257637, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88397133, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7905, + "time_per_iteration": 2.442215919494629 + }, + { + "auxiliary_loss_clip": 0.0110692, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01750422, + "balance_loss_mlp": 1.03796053, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.6116801799731275, + "language_loss": 0.82223809, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84360093, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 7906, + "time_per_iteration": 2.503708600997925 + }, + { + "auxiliary_loss_clip": 0.01033043, + "auxiliary_loss_mlp": 0.01004824, + "balance_loss_clip": 1.00345886, + "balance_loss_mlp": 1.00910616, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6702574149317626, + "language_loss": 0.59028685, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61066544, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.23925781, + "step": 7907, + "time_per_iteration": 3.156270980834961 + }, + { + "auxiliary_loss_clip": 0.01112242, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.02349377, + "balance_loss_mlp": 1.04145598, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.9115330257313565, + "language_loss": 0.81044137, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83192551, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7908, + "time_per_iteration": 2.4719884395599365 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.01944923, + "balance_loss_mlp": 1.04349983, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.7275790068018955, + "language_loss": 0.73515987, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75665224, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 7909, + "time_per_iteration": 2.4672436714172363 + }, + { + "auxiliary_loss_clip": 0.01110088, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.01887441, + "balance_loss_mlp": 1.03941047, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.618978075546398, + "language_loss": 0.75284743, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77426249, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 7910, + "time_per_iteration": 2.498745918273926 + }, + { + "auxiliary_loss_clip": 0.0110873, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03872323, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.8146975429148502, + "language_loss": 0.78950047, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81085479, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7911, + "time_per_iteration": 2.4530739784240723 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01709199, + "balance_loss_mlp": 1.03904319, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.5788116451196046, + "language_loss": 0.75611186, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77754539, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7912, + "time_per_iteration": 2.468348741531372 + }, + { + "auxiliary_loss_clip": 0.01107815, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.02094245, + "balance_loss_mlp": 1.03746927, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.4305595105203048, + "language_loss": 0.74305665, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76446521, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7913, + "time_per_iteration": 2.4857094287872314 + }, + { + "auxiliary_loss_clip": 0.01111637, + "auxiliary_loss_mlp": 0.01026142, + "balance_loss_clip": 1.01336265, + "balance_loss_mlp": 1.04057527, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 1.9652679728787295, + "language_loss": 0.72320372, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74458152, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7914, + "time_per_iteration": 2.4559848308563232 + }, + { + "auxiliary_loss_clip": 0.01114052, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.0191946, + "balance_loss_mlp": 1.04203475, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 1.960460869956429, + "language_loss": 0.64513958, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66659272, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7915, + "time_per_iteration": 2.4528729915618896 + }, + { + "auxiliary_loss_clip": 0.01106319, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.020257, + "balance_loss_mlp": 1.03847694, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.6765568872542898, + "language_loss": 0.76760435, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.7889936, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 7916, + "time_per_iteration": 2.4544637203216553 + }, + { + "auxiliary_loss_clip": 0.01109831, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01720083, + "balance_loss_mlp": 1.038872, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.7964770898598468, + "language_loss": 0.64513361, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66652668, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7917, + "time_per_iteration": 2.4966535568237305 + }, + { + "auxiliary_loss_clip": 0.01033431, + "auxiliary_loss_mlp": 0.01003778, + "balance_loss_clip": 1.00258541, + "balance_loss_mlp": 1.00975943, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8336021747517385, + "language_loss": 0.6568867, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67725885, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.23632812, + "step": 7918, + "time_per_iteration": 3.0902352333068848 + }, + { + "auxiliary_loss_clip": 0.01111138, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.01937342, + "balance_loss_mlp": 1.03909731, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.7210259476746916, + "language_loss": 0.6884234, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70985305, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7919, + "time_per_iteration": 2.451730728149414 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02188087, + "balance_loss_mlp": 1.03897047, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.5380536315740185, + "language_loss": 0.74750632, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7689606, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7920, + "time_per_iteration": 2.5365359783172607 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.01854539, + "balance_loss_mlp": 1.04174948, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4190261222987137, + "language_loss": 0.77478063, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79626137, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7921, + "time_per_iteration": 2.4841856956481934 + }, + { + "auxiliary_loss_clip": 0.01112061, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02113843, + "balance_loss_mlp": 1.03917885, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.531083685843196, + "language_loss": 0.78213, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80360126, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7922, + "time_per_iteration": 2.537930965423584 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02360809, + "balance_loss_mlp": 1.04113102, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.7101716924021442, + "language_loss": 0.72932559, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75085688, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7923, + "time_per_iteration": 2.4527640342712402 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03808331, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 2.125534979901623, + "language_loss": 0.81915551, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84058034, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 7924, + "time_per_iteration": 2.480109930038452 + }, + { + "auxiliary_loss_clip": 0.01120558, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.0246644, + "balance_loss_mlp": 1.04359889, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.7710398873833821, + "language_loss": 0.80079067, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82237971, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7925, + "time_per_iteration": 2.4877142906188965 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.02343702, + "balance_loss_mlp": 1.03800642, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 2.066985409764694, + "language_loss": 0.72263825, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74410343, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7926, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.01623797, + "balance_loss_mlp": 1.04205072, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 8.404578303652414, + "language_loss": 0.68589562, + "learning_rate": 2.248031062546432e-06, + "loss": 0.7073611, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7927, + "time_per_iteration": 2.4860117435455322 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.0138253, + "balance_loss_mlp": 1.04121518, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.5906069345122125, + "language_loss": 0.68003678, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70140767, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 7928, + "time_per_iteration": 3.917212724685669 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.0160315, + "balance_loss_mlp": 1.04099739, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.0359036820122762, + "language_loss": 0.79055941, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81197274, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7929, + "time_per_iteration": 5.38159441947937 + }, + { + "auxiliary_loss_clip": 0.01109888, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02098787, + "balance_loss_mlp": 1.04033756, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.8427147864954625, + "language_loss": 0.6634798, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68490613, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 7930, + "time_per_iteration": 4.1562559604644775 + }, + { + "auxiliary_loss_clip": 0.01110022, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03929853, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.7695493738399266, + "language_loss": 0.80279613, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82420039, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7931, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.01112785, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01667237, + "balance_loss_mlp": 1.04009867, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.714860616709588, + "language_loss": 0.75956833, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78099489, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7932, + "time_per_iteration": 2.4789490699768066 + }, + { + "auxiliary_loss_clip": 0.0111028, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02110386, + "balance_loss_mlp": 1.04108882, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 2.3368480026304748, + "language_loss": 0.79639196, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81783438, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7933, + "time_per_iteration": 2.4574432373046875 + }, + { + "auxiliary_loss_clip": 0.01116858, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02131057, + "balance_loss_mlp": 1.04114437, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.7879612820388389, + "language_loss": 0.73776019, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.759287, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 7934, + "time_per_iteration": 2.4703593254089355 + }, + { + "auxiliary_loss_clip": 0.0111259, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.01611567, + "balance_loss_mlp": 1.03858674, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.719427707895152, + "language_loss": 0.7973842, + "learning_rate": 2.244939121664211e-06, + "loss": 0.81879967, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 7935, + "time_per_iteration": 2.459326982498169 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02566767, + "balance_loss_mlp": 1.04244995, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.7712234775739364, + "language_loss": 0.71105671, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73264545, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 7936, + "time_per_iteration": 2.599914312362671 + }, + { + "auxiliary_loss_clip": 0.01112402, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.01529551, + "balance_loss_mlp": 1.03864932, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.8731818732430927, + "language_loss": 0.68026948, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.7016744, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7937, + "time_per_iteration": 2.4884297847747803 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.00999711, + "balance_loss_clip": 0.99838793, + "balance_loss_mlp": 1.01120472, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7133873095384958, + "language_loss": 0.56401992, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58437109, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2421875, + "step": 7938, + "time_per_iteration": 3.27707576751709 + }, + { + "auxiliary_loss_clip": 0.01113753, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.01889467, + "balance_loss_mlp": 1.04162848, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.6305385471502185, + "language_loss": 0.88721037, + "learning_rate": 2.243392927839317e-06, + "loss": 0.9086687, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7939, + "time_per_iteration": 2.503838300704956 + }, + { + "auxiliary_loss_clip": 0.01110311, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02037096, + "balance_loss_mlp": 1.03832293, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 2.146362570276984, + "language_loss": 0.76661658, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.78804338, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7940, + "time_per_iteration": 2.4230127334594727 + }, + { + "auxiliary_loss_clip": 0.01109098, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.0194304, + "balance_loss_mlp": 1.03975916, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.568994035010224, + "language_loss": 0.84892023, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87032247, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 7941, + "time_per_iteration": 2.4640510082244873 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.02263689, + "balance_loss_mlp": 1.04307771, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 2.0154740266117104, + "language_loss": 0.75996536, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78149283, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7942, + "time_per_iteration": 2.4304351806640625 + }, + { + "auxiliary_loss_clip": 0.01113984, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.02646661, + "balance_loss_mlp": 1.0415473, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.8198127192389717, + "language_loss": 0.64578187, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66730648, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7943, + "time_per_iteration": 2.469884157180786 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.02318239, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.6437441778624493, + "language_loss": 0.73638076, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75789517, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7944, + "time_per_iteration": 2.462620258331299 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02092481, + "balance_loss_mlp": 1.04105759, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.2015870606275785, + "language_loss": 0.67936689, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70085418, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7945, + "time_per_iteration": 2.498506784439087 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02483487, + "balance_loss_mlp": 1.03805077, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.8282867356700874, + "language_loss": 0.75330615, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77477872, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7946, + "time_per_iteration": 2.5168514251708984 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.02759135, + "balance_loss_mlp": 1.04283607, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7491504350819331, + "language_loss": 0.79312646, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81470287, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 7947, + "time_per_iteration": 2.5980498790740967 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02433419, + "balance_loss_mlp": 1.0381552, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.7633094448758173, + "language_loss": 0.73717982, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75862265, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 7948, + "time_per_iteration": 2.446190357208252 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02050555, + "balance_loss_mlp": 1.04240656, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.5048270934573464, + "language_loss": 0.77945703, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80094588, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7949, + "time_per_iteration": 2.4999916553497314 + }, + { + "auxiliary_loss_clip": 0.01107805, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.02077556, + "balance_loss_mlp": 1.0387454, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.112378262987889, + "language_loss": 0.74019569, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.7616021, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 7950, + "time_per_iteration": 2.4387645721435547 + }, + { + "auxiliary_loss_clip": 0.01110159, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02609253, + "balance_loss_mlp": 1.03978574, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.7104198942075015, + "language_loss": 0.74135828, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76285648, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.703125, + "step": 7951, + "time_per_iteration": 2.579258680343628 + }, + { + "auxiliary_loss_clip": 0.01112662, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03915167, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 1.8112920130665326, + "language_loss": 0.79960251, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82105488, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7952, + "time_per_iteration": 2.5007214546203613 + }, + { + "auxiliary_loss_clip": 0.01114258, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.02687836, + "balance_loss_mlp": 1.04040217, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.7026148138194093, + "language_loss": 0.78196061, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80350602, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7953, + "time_per_iteration": 2.4699995517730713 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.02362204, + "balance_loss_mlp": 1.0405128, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.2363441879819224, + "language_loss": 0.84142399, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86293399, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7954, + "time_per_iteration": 2.41294527053833 + }, + { + "auxiliary_loss_clip": 0.01109876, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02254462, + "balance_loss_mlp": 1.03839588, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.442835840236476, + "language_loss": 0.70588672, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72734004, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 7955, + "time_per_iteration": 2.4867892265319824 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.02507281, + "balance_loss_mlp": 1.03925073, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5835230785797205, + "language_loss": 0.817267, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83875084, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7956, + "time_per_iteration": 2.4756619930267334 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.04097366, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.8961411498697718, + "language_loss": 0.84901869, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87047327, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 7957, + "time_per_iteration": 2.4848859310150146 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.02065289, + "balance_loss_mlp": 1.0396328, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.5799276625975138, + "language_loss": 0.79682672, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81826073, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7958, + "time_per_iteration": 2.439040422439575 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02074742, + "balance_loss_mlp": 1.03806448, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 2.0401185124291406, + "language_loss": 0.82728368, + "learning_rate": 2.235659762404047e-06, + "loss": 0.8487246, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 7959, + "time_per_iteration": 2.500182867050171 + }, + { + "auxiliary_loss_clip": 0.01108176, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.0219152, + "balance_loss_mlp": 1.04054058, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.3853858164000292, + "language_loss": 0.7333414, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75475383, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 7960, + "time_per_iteration": 2.4852850437164307 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03937268, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.8739024393884087, + "language_loss": 0.77067018, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79211915, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 7961, + "time_per_iteration": 2.482361316680908 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01817513, + "balance_loss_mlp": 1.03838158, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.629700477315198, + "language_loss": 0.77528512, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.7966699, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7962, + "time_per_iteration": 2.427537679672241 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01034413, + "balance_loss_clip": 1.02196801, + "balance_loss_mlp": 1.04174328, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.5913499246445781, + "language_loss": 0.64895082, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67042321, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7963, + "time_per_iteration": 2.51082181930542 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.01748848, + "balance_loss_mlp": 1.03972077, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.658229101322456, + "language_loss": 0.77974397, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80115253, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7964, + "time_per_iteration": 2.6512372493743896 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01617479, + "balance_loss_mlp": 1.03944111, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.7558149312417117, + "language_loss": 0.76227248, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78371561, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7965, + "time_per_iteration": 2.4919536113739014 + }, + { + "auxiliary_loss_clip": 0.01108501, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.01957512, + "balance_loss_mlp": 1.0382036, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 2.251400870531799, + "language_loss": 0.74590349, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76730978, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 7966, + "time_per_iteration": 2.4254770278930664 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.0169003, + "balance_loss_mlp": 1.03785586, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.521959054408531, + "language_loss": 0.72728515, + "learning_rate": 2.232565488801655e-06, + "loss": 0.74866927, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7967, + "time_per_iteration": 2.522883892059326 + }, + { + "auxiliary_loss_clip": 0.01103831, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01433849, + "balance_loss_mlp": 1.0371958, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 2.344774601020355, + "language_loss": 0.79174602, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81304824, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 7968, + "time_per_iteration": 2.4777579307556152 + }, + { + "auxiliary_loss_clip": 0.01035385, + "auxiliary_loss_mlp": 0.01007575, + "balance_loss_clip": 1.00622833, + "balance_loss_mlp": 1.0118711, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7636022901302345, + "language_loss": 0.62258303, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64301264, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.23535156, + "step": 7969, + "time_per_iteration": 4.618057012557983 + }, + { + "auxiliary_loss_clip": 0.01107101, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01555026, + "balance_loss_mlp": 1.04000521, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.5307915717866403, + "language_loss": 0.77086926, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79221207, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 7970, + "time_per_iteration": 2.469363212585449 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.01962733, + "balance_loss_mlp": 1.03676999, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.595425961628827, + "language_loss": 0.70320344, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72459716, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 7971, + "time_per_iteration": 5.436426401138306 + }, + { + "auxiliary_loss_clip": 0.01107204, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01228452, + "balance_loss_mlp": 1.03725302, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.2757793979028687, + "language_loss": 0.79909688, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82042515, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 7972, + "time_per_iteration": 2.4788928031921387 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01220274, + "balance_loss_mlp": 1.03801394, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.154896563362021, + "language_loss": 0.69762838, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71897495, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 7973, + "time_per_iteration": 2.462674140930176 + }, + { + "auxiliary_loss_clip": 0.01108438, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.01943266, + "balance_loss_mlp": 1.0401777, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.7300676969557445, + "language_loss": 0.78652924, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80792892, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 7974, + "time_per_iteration": 2.523935079574585 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.01004075, + "balance_loss_clip": 1.00275135, + "balance_loss_mlp": 1.01174331, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7575595850509929, + "language_loss": 0.54076326, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56115806, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23632812, + "step": 7975, + "time_per_iteration": 3.120290756225586 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.01946688, + "balance_loss_mlp": 1.03872228, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.0952625936259226, + "language_loss": 0.90246761, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92393565, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7976, + "time_per_iteration": 2.4177215099334717 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.02070153, + "balance_loss_mlp": 1.03989267, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.1692733838107148, + "language_loss": 0.73631197, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75779295, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7977, + "time_per_iteration": 2.478994846343994 + }, + { + "auxiliary_loss_clip": 0.01105095, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.02095962, + "balance_loss_mlp": 1.03737617, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5189317692466735, + "language_loss": 0.78386033, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80524224, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 7978, + "time_per_iteration": 2.441770315170288 + }, + { + "auxiliary_loss_clip": 0.01110092, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03895688, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.9080949377976553, + "language_loss": 0.89561266, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91704339, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7979, + "time_per_iteration": 2.5005874633789062 + }, + { + "auxiliary_loss_clip": 0.01111373, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03977728, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.512941625260848, + "language_loss": 0.77104276, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79248011, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 7980, + "time_per_iteration": 2.514702320098877 + }, + { + "auxiliary_loss_clip": 0.01112304, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.0177083, + "balance_loss_mlp": 1.03812611, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.6709892763913308, + "language_loss": 0.71718562, + "learning_rate": 2.227149156404295e-06, + "loss": 0.738626, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7981, + "time_per_iteration": 2.606919050216675 + }, + { + "auxiliary_loss_clip": 0.01107255, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01743317, + "balance_loss_mlp": 1.03878653, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.7550369517172573, + "language_loss": 0.70141387, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72278404, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 7982, + "time_per_iteration": 2.4303736686706543 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.01558483, + "balance_loss_mlp": 1.03694749, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 2.256566494766253, + "language_loss": 0.70977259, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73106241, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 7983, + "time_per_iteration": 2.520749092102051 + }, + { + "auxiliary_loss_clip": 0.01032541, + "auxiliary_loss_mlp": 0.01011047, + "balance_loss_clip": 1.00992036, + "balance_loss_mlp": 1.00916195, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8049867321392653, + "language_loss": 0.59458363, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.6150195, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.234375, + "step": 7984, + "time_per_iteration": 3.0019614696502686 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.01912713, + "balance_loss_mlp": 1.0376364, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.5803111762139084, + "language_loss": 0.66603255, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68742514, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7985, + "time_per_iteration": 2.459381341934204 + }, + { + "auxiliary_loss_clip": 0.01108889, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02279973, + "balance_loss_mlp": 1.03655791, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.8105960725352928, + "language_loss": 0.70750952, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72895944, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7986, + "time_per_iteration": 2.412890911102295 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.02515244, + "balance_loss_mlp": 1.03964305, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.571002176109277, + "language_loss": 0.78704774, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.80857182, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7987, + "time_per_iteration": 2.464531898498535 + }, + { + "auxiliary_loss_clip": 0.01107017, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02410507, + "balance_loss_mlp": 1.03615475, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.8312114483143844, + "language_loss": 0.75309592, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77453303, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7988, + "time_per_iteration": 2.4185469150543213 + }, + { + "auxiliary_loss_clip": 0.01113071, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01667249, + "balance_loss_mlp": 1.04115009, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.9770525324174564, + "language_loss": 0.78992975, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81135416, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7989, + "time_per_iteration": 2.4614450931549072 + }, + { + "auxiliary_loss_clip": 0.01113161, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.02273488, + "balance_loss_mlp": 1.03810394, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.6525338075260034, + "language_loss": 0.73414218, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75564027, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7990, + "time_per_iteration": 2.562366008758545 + }, + { + "auxiliary_loss_clip": 0.01032695, + "auxiliary_loss_mlp": 0.0100018, + "balance_loss_clip": 0.99901813, + "balance_loss_mlp": 1.00915992, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 1.0595345338831614, + "language_loss": 0.59085703, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61118573, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 7991, + "time_per_iteration": 3.1877033710479736 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.01626837, + "balance_loss_mlp": 1.03751063, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.8662124275999659, + "language_loss": 0.67495418, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69632453, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7992, + "time_per_iteration": 2.5135016441345215 + }, + { + "auxiliary_loss_clip": 0.01109706, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01969361, + "balance_loss_mlp": 1.03664112, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6211148746347477, + "language_loss": 0.76493919, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78636301, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7993, + "time_per_iteration": 2.5075619220733643 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.02267301, + "balance_loss_mlp": 1.03899574, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.5028541481112037, + "language_loss": 0.78277898, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80421537, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7994, + "time_per_iteration": 2.4792723655700684 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.02010691, + "balance_loss_mlp": 1.03752637, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.8792905950371066, + "language_loss": 0.79627287, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81768769, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7995, + "time_per_iteration": 2.4605226516723633 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.01676297, + "balance_loss_mlp": 1.03693795, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.8681673839648991, + "language_loss": 0.8255161, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84688872, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7996, + "time_per_iteration": 2.4627599716186523 + }, + { + "auxiliary_loss_clip": 0.01108595, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01562989, + "balance_loss_mlp": 1.03879523, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.619215200240117, + "language_loss": 0.80642337, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82779169, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7997, + "time_per_iteration": 2.450486660003662 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02174699, + "balance_loss_mlp": 1.03695917, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.8605056175819474, + "language_loss": 0.72481054, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74624306, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7998, + "time_per_iteration": 2.484501361846924 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.01893795, + "balance_loss_mlp": 1.03890526, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.7021894106986095, + "language_loss": 0.71182632, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73325378, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7999, + "time_per_iteration": 2.5011837482452393 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02758801, + "balance_loss_mlp": 1.03862715, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 2.087936802810397, + "language_loss": 0.71136171, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73289621, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 8000, + "time_per_iteration": 2.473083019256592 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02291203, + "balance_loss_mlp": 1.03987443, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.2945806687832948, + "language_loss": 0.75104553, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77252746, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8001, + "time_per_iteration": 2.6078953742980957 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.02701581, + "balance_loss_mlp": 1.03889596, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.8640621993165467, + "language_loss": 0.81407833, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83560812, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8002, + "time_per_iteration": 2.4381091594696045 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.02415216, + "balance_loss_mlp": 1.04037309, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7407260367663493, + "language_loss": 0.71673185, + "learning_rate": 2.218634381467819e-06, + "loss": 0.7382561, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8003, + "time_per_iteration": 2.5028979778289795 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.04041362, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.9713418243952783, + "language_loss": 0.82751715, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84901035, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8004, + "time_per_iteration": 2.4438235759735107 + }, + { + "auxiliary_loss_clip": 0.0112055, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.02951062, + "balance_loss_mlp": 1.04235947, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.0081127141146964, + "language_loss": 0.77780354, + "learning_rate": 2.217860109695239e-06, + "loss": 0.7994566, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 8005, + "time_per_iteration": 2.4440789222717285 + }, + { + "auxiliary_loss_clip": 0.01109918, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0218395, + "balance_loss_mlp": 1.03705537, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 3.988142696329101, + "language_loss": 0.70656502, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72801799, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 8006, + "time_per_iteration": 2.4627490043640137 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.02357328, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.9148811651735764, + "language_loss": 0.70463514, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72611892, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8007, + "time_per_iteration": 2.4923551082611084 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02225244, + "balance_loss_mlp": 1.03924334, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.0099977087556202, + "language_loss": 0.71720552, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7386902, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8008, + "time_per_iteration": 2.443068742752075 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01046807, + "balance_loss_clip": 1.0317508, + "balance_loss_mlp": 1.03984571, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.7155117192574523, + "language_loss": 0.60448718, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62610233, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.74609375, + "step": 8009, + "time_per_iteration": 2.4860730171203613 + }, + { + "auxiliary_loss_clip": 0.01041953, + "auxiliary_loss_mlp": 0.01003034, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.01788867, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.861211973736155, + "language_loss": 0.61329502, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.6337449, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.24121094, + "step": 8010, + "time_per_iteration": 3.073617935180664 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01045892, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.04191947, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 2.200850795507016, + "language_loss": 0.73003197, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75164282, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.734375, + "step": 8011, + "time_per_iteration": 3.875464677810669 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.02398205, + "balance_loss_mlp": 1.03922546, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.7669872730797296, + "language_loss": 0.79906964, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82052571, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8012, + "time_per_iteration": 5.410374164581299 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01047469, + "balance_loss_clip": 1.03336632, + "balance_loss_mlp": 1.04086518, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.5982967759080098, + "language_loss": 0.73816693, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75978434, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 8013, + "time_per_iteration": 4.00807785987854 + }, + { + "auxiliary_loss_clip": 0.01112131, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.02043676, + "balance_loss_mlp": 1.04102039, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 4.768803838152643, + "language_loss": 0.90554619, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92699754, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8014, + "time_per_iteration": 2.4615042209625244 + }, + { + "auxiliary_loss_clip": 0.01116604, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.02456379, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 3.0531094865391073, + "language_loss": 0.74407947, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76562929, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 8015, + "time_per_iteration": 2.434838056564331 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.02588332, + "balance_loss_mlp": 1.04072225, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.017951331310383, + "language_loss": 0.8059243, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82750583, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 8016, + "time_per_iteration": 2.513319492340088 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.02206254, + "balance_loss_mlp": 1.04101717, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 2.4127244097624847, + "language_loss": 0.76781118, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.78925556, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 8017, + "time_per_iteration": 2.4602606296539307 + }, + { + "auxiliary_loss_clip": 0.011107, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.04151559, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.9887798442379552, + "language_loss": 0.80156118, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82299387, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 8018, + "time_per_iteration": 2.5529282093048096 + }, + { + "auxiliary_loss_clip": 0.01113443, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.01667559, + "balance_loss_mlp": 1.04109669, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.7653706812529009, + "language_loss": 0.75843483, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.77986348, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8019, + "time_per_iteration": 2.4978489875793457 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.02286255, + "balance_loss_mlp": 1.03955722, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.7828460534537498, + "language_loss": 0.78554976, + "learning_rate": 2.212052026199701e-06, + "loss": 0.80704254, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 8020, + "time_per_iteration": 2.503870725631714 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.02043533, + "balance_loss_mlp": 1.04134321, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.4275685595470207, + "language_loss": 0.69718045, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71865243, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8021, + "time_per_iteration": 2.4298038482666016 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.01797438, + "balance_loss_mlp": 1.0407902, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.6547112313669838, + "language_loss": 0.62773043, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.64921963, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8022, + "time_per_iteration": 2.4862682819366455 + }, + { + "auxiliary_loss_clip": 0.01109497, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01683092, + "balance_loss_mlp": 1.03976464, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.257171661165274, + "language_loss": 0.66345549, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68484527, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8023, + "time_per_iteration": 2.4498074054718018 + }, + { + "auxiliary_loss_clip": 0.01109691, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.02181077, + "balance_loss_mlp": 1.0379076, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 2.6609441563285485, + "language_loss": 0.76680458, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78825533, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8024, + "time_per_iteration": 2.5641326904296875 + }, + { + "auxiliary_loss_clip": 0.01111982, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.01926339, + "balance_loss_mlp": 1.03856826, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.4456982310337658, + "language_loss": 0.75299227, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77445179, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8025, + "time_per_iteration": 2.4700748920440674 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.02159774, + "balance_loss_mlp": 1.04015994, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.85740453148256, + "language_loss": 0.71010149, + "learning_rate": 2.209728283441112e-06, + "loss": 0.7315712, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8026, + "time_per_iteration": 2.451942205429077 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02739012, + "balance_loss_mlp": 1.04088664, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.002376238963681, + "language_loss": 0.74738306, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76897156, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.75, + "step": 8027, + "time_per_iteration": 2.511625051498413 + }, + { + "auxiliary_loss_clip": 0.01116324, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.02260458, + "balance_loss_mlp": 1.0418303, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.8015680699639052, + "language_loss": 0.6744982, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69602323, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 8028, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01114464, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.02490783, + "balance_loss_mlp": 1.04192257, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.8869203156454395, + "language_loss": 0.73063505, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75217235, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7265625, + "step": 8029, + "time_per_iteration": 2.4256598949432373 + }, + { + "auxiliary_loss_clip": 0.01114009, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.01691651, + "balance_loss_mlp": 1.03949094, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 1.9568889088417416, + "language_loss": 0.85374999, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87520409, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8030, + "time_per_iteration": 2.4838480949401855 + }, + { + "auxiliary_loss_clip": 0.01111314, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.01917291, + "balance_loss_mlp": 1.03858352, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.946134860300181, + "language_loss": 0.74173188, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76316977, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8031, + "time_per_iteration": 2.475564956665039 + }, + { + "auxiliary_loss_clip": 0.01118074, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.03023958, + "balance_loss_mlp": 1.04181576, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.8194651882134072, + "language_loss": 0.71833324, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73996472, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 8032, + "time_per_iteration": 2.5389230251312256 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.02326274, + "balance_loss_mlp": 1.03896618, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.5190699612157064, + "language_loss": 0.74008, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76156777, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8033, + "time_per_iteration": 2.497344493865967 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.02273428, + "balance_loss_mlp": 1.04200494, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.7070178882470917, + "language_loss": 0.82929307, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85084462, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 8034, + "time_per_iteration": 2.504986524581909 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.04048431, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 2.2841237596844493, + "language_loss": 0.79519325, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81662393, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 8035, + "time_per_iteration": 2.497851610183716 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.02656746, + "balance_loss_mlp": 1.04139149, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.7925521800027493, + "language_loss": 0.69359076, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71516669, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7421875, + "step": 8036, + "time_per_iteration": 2.6260759830474854 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.0211308, + "balance_loss_mlp": 1.03983057, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 2.034912964838748, + "language_loss": 0.72518653, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74665534, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8037, + "time_per_iteration": 2.4452965259552 + }, + { + "auxiliary_loss_clip": 0.01120439, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.02053404, + "balance_loss_mlp": 1.04226792, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.369475157435804, + "language_loss": 0.69122416, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71278501, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 8038, + "time_per_iteration": 2.4694747924804688 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.02201188, + "balance_loss_mlp": 1.04133189, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.4952565926757524, + "language_loss": 0.78972542, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8112368, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 8039, + "time_per_iteration": 2.5778839588165283 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.01874638, + "balance_loss_mlp": 1.04335415, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.6799663014860025, + "language_loss": 0.76981616, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79131073, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8040, + "time_per_iteration": 2.4846322536468506 + }, + { + "auxiliary_loss_clip": 0.01116146, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.02283335, + "balance_loss_mlp": 1.04120946, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.5584368035119462, + "language_loss": 0.75443131, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77597177, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 8041, + "time_per_iteration": 2.5853140354156494 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.02178383, + "balance_loss_mlp": 1.04486728, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.8135207231669344, + "language_loss": 0.66745925, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68897855, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 8042, + "time_per_iteration": 2.5322182178497314 + }, + { + "auxiliary_loss_clip": 0.01040325, + "auxiliary_loss_mlp": 0.00998367, + "balance_loss_clip": 0.99714488, + "balance_loss_mlp": 1.0165081, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.687656922942032, + "language_loss": 0.58557642, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60596335, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 8043, + "time_per_iteration": 3.1435444355010986 + }, + { + "auxiliary_loss_clip": 0.01115264, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.01982713, + "balance_loss_mlp": 1.04060805, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 1.8614249809437893, + "language_loss": 0.71973354, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.7412324, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8044, + "time_per_iteration": 2.4688329696655273 + }, + { + "auxiliary_loss_clip": 0.01113296, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.01702118, + "balance_loss_mlp": 1.04181921, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.358705165779184, + "language_loss": 0.75938857, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78084195, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.71484375, + "step": 8045, + "time_per_iteration": 2.455991506576538 + }, + { + "auxiliary_loss_clip": 0.01117445, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.04251719, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.8505124624812508, + "language_loss": 0.69661564, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71819568, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 8046, + "time_per_iteration": 2.480437994003296 + }, + { + "auxiliary_loss_clip": 0.01113741, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.04073739, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 3.209923694390607, + "language_loss": 0.819103, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84060085, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8047, + "time_per_iteration": 2.4875996112823486 + }, + { + "auxiliary_loss_clip": 0.01111465, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.01802719, + "balance_loss_mlp": 1.04047942, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.602624612336977, + "language_loss": 0.80215144, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82358307, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7109375, + "step": 8048, + "time_per_iteration": 2.5097532272338867 + }, + { + "auxiliary_loss_clip": 0.0111735, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.02204585, + "balance_loss_mlp": 1.0415504, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.5504815305200743, + "language_loss": 0.81360143, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83514082, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8049, + "time_per_iteration": 2.5025296211242676 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.02170801, + "balance_loss_mlp": 1.04200411, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.7765572151997517, + "language_loss": 0.72636938, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74782485, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8050, + "time_per_iteration": 2.4983279705047607 + }, + { + "auxiliary_loss_clip": 0.01039152, + "auxiliary_loss_mlp": 0.01005399, + "balance_loss_clip": 1.00414741, + "balance_loss_mlp": 1.01505625, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7015070380534334, + "language_loss": 0.56459856, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58504415, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24121094, + "step": 8051, + "time_per_iteration": 3.1124837398529053 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.0198456, + "balance_loss_mlp": 1.04258502, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.416646260203107, + "language_loss": 0.7510823, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77258313, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8052, + "time_per_iteration": 3.970653772354126 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.01953709, + "balance_loss_mlp": 1.04124272, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 3.0848333967382855, + "language_loss": 0.65859687, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68007052, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8053, + "time_per_iteration": 2.489314079284668 + }, + { + "auxiliary_loss_clip": 0.01108306, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.01981306, + "balance_loss_mlp": 1.03776336, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.8753990029707186, + "language_loss": 0.6933912, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71480489, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8054, + "time_per_iteration": 4.118170976638794 + }, + { + "auxiliary_loss_clip": 0.01110556, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.02187181, + "balance_loss_mlp": 1.03860784, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.7081803235265158, + "language_loss": 0.69577026, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.7172299, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8055, + "time_per_iteration": 3.932403326034546 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.0229013, + "balance_loss_mlp": 1.04260492, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.9345474086324397, + "language_loss": 0.631603, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65312105, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8056, + "time_per_iteration": 2.4628608226776123 + }, + { + "auxiliary_loss_clip": 0.01110953, + "auxiliary_loss_mlp": 0.0103397, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03856075, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.6727278675155979, + "language_loss": 0.67380416, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69525343, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 8057, + "time_per_iteration": 2.5488758087158203 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.02286661, + "balance_loss_mlp": 1.03944063, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.62294394814829, + "language_loss": 0.81633735, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83780485, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8058, + "time_per_iteration": 2.4864389896392822 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.02458692, + "balance_loss_mlp": 1.04142284, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.5675258134335472, + "language_loss": 0.79917222, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82072222, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8059, + "time_per_iteration": 2.4964730739593506 + }, + { + "auxiliary_loss_clip": 0.01117834, + "auxiliary_loss_mlp": 0.0104156, + "balance_loss_clip": 1.02709424, + "balance_loss_mlp": 1.04217446, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.4233986338774347, + "language_loss": 0.66882968, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69042355, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8060, + "time_per_iteration": 2.6209259033203125 + }, + { + "auxiliary_loss_clip": 0.01116591, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.04357326, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.8494683744964096, + "language_loss": 0.67328548, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69485319, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8061, + "time_per_iteration": 2.460986614227295 + }, + { + "auxiliary_loss_clip": 0.01116735, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.0274322, + "balance_loss_mlp": 1.04356933, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 2.133282380017761, + "language_loss": 0.82559311, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84717953, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8062, + "time_per_iteration": 2.453993320465088 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.022277, + "balance_loss_mlp": 1.04087675, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.7643008090816974, + "language_loss": 0.7443378, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76581317, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8063, + "time_per_iteration": 2.4603588581085205 + }, + { + "auxiliary_loss_clip": 0.01113086, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.01978183, + "balance_loss_mlp": 1.04069591, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.6491790763512546, + "language_loss": 0.78826106, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80972517, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8064, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.02426863, + "balance_loss_mlp": 1.04178667, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.866783501124255, + "language_loss": 0.79383814, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81530446, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8065, + "time_per_iteration": 2.445235013961792 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03714252, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 2.505071872189949, + "language_loss": 0.76120496, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78258789, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 8066, + "time_per_iteration": 2.484790325164795 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.02022457, + "balance_loss_mlp": 1.04121971, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.8377201756800503, + "language_loss": 0.7205655, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74201524, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8067, + "time_per_iteration": 2.4876203536987305 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02231634, + "balance_loss_mlp": 1.04024172, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.0010459311949393, + "language_loss": 0.79434109, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81582052, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8068, + "time_per_iteration": 2.4537808895111084 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02171147, + "balance_loss_mlp": 1.0385673, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4177927500996443, + "language_loss": 0.8413924, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86282146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8069, + "time_per_iteration": 2.4553275108337402 + }, + { + "auxiliary_loss_clip": 0.0110935, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02042496, + "balance_loss_mlp": 1.03913558, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.6522403411207847, + "language_loss": 0.77863526, + "learning_rate": 2.192678959687493e-06, + "loss": 0.8000586, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8070, + "time_per_iteration": 2.5032036304473877 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01985812, + "balance_loss_mlp": 1.0400399, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.1929202067055993, + "language_loss": 0.78031409, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80175334, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8071, + "time_per_iteration": 2.4315407276153564 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.01822925, + "balance_loss_mlp": 1.03733289, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.7778798626181176, + "language_loss": 0.72204757, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74345779, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71875, + "step": 8072, + "time_per_iteration": 2.510474920272827 + }, + { + "auxiliary_loss_clip": 0.01116993, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.02580357, + "balance_loss_mlp": 1.04254019, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.999761551965867, + "language_loss": 0.8779549, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89952314, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8073, + "time_per_iteration": 2.4295654296875 + }, + { + "auxiliary_loss_clip": 0.01106811, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.01805508, + "balance_loss_mlp": 1.03857493, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.702758380167849, + "language_loss": 0.60793108, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62931222, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8074, + "time_per_iteration": 2.641831636428833 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02540481, + "balance_loss_mlp": 1.03871894, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.6649133015556126, + "language_loss": 0.73151296, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75302958, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8075, + "time_per_iteration": 2.4624290466308594 + }, + { + "auxiliary_loss_clip": 0.01108632, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.02328563, + "balance_loss_mlp": 1.04028702, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.6285965401893183, + "language_loss": 0.82012558, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84156799, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 8076, + "time_per_iteration": 2.902468681335449 + }, + { + "auxiliary_loss_clip": 0.01112144, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.01899099, + "balance_loss_mlp": 1.0407958, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.793912725367087, + "language_loss": 0.86204815, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88350475, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7109375, + "step": 8077, + "time_per_iteration": 2.4470572471618652 + }, + { + "auxiliary_loss_clip": 0.01035955, + "auxiliary_loss_mlp": 0.01003512, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.01168394, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9017192941717106, + "language_loss": 0.58489066, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60528529, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.24316406, + "step": 8078, + "time_per_iteration": 3.061302661895752 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.01750946, + "balance_loss_mlp": 1.04146993, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.8290534457206422, + "language_loss": 0.72197151, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.7434293, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8079, + "time_per_iteration": 2.545018434524536 + }, + { + "auxiliary_loss_clip": 0.0111477, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01643038, + "balance_loss_mlp": 1.04235518, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.180592453343409, + "language_loss": 0.79515052, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81659681, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8080, + "time_per_iteration": 2.4793026447296143 + }, + { + "auxiliary_loss_clip": 0.01111199, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.01659858, + "balance_loss_mlp": 1.03938115, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.102088815710231, + "language_loss": 0.83866465, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86007756, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8081, + "time_per_iteration": 2.4615542888641357 + }, + { + "auxiliary_loss_clip": 0.0110941, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01766098, + "balance_loss_mlp": 1.03858256, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.4514708090647532, + "language_loss": 0.83281112, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85422719, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.70703125, + "step": 8082, + "time_per_iteration": 2.506359100341797 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01930749, + "balance_loss_mlp": 1.04239488, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.0513098734750153, + "language_loss": 0.87210095, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89353603, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8083, + "time_per_iteration": 2.4269142150878906 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.0264957, + "balance_loss_mlp": 1.03958535, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6599209376706838, + "language_loss": 0.8107174, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83220273, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 8084, + "time_per_iteration": 2.451949119567871 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.02148795, + "balance_loss_mlp": 1.04034543, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 2.346430029405153, + "language_loss": 0.68347323, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70495236, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8085, + "time_per_iteration": 2.499215841293335 + }, + { + "auxiliary_loss_clip": 0.0111142, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.0276444, + "balance_loss_mlp": 1.04064536, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.46412171762657, + "language_loss": 0.77375883, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79528093, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8086, + "time_per_iteration": 2.541616678237915 + }, + { + "auxiliary_loss_clip": 0.01111956, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.01655173, + "balance_loss_mlp": 1.04059958, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 1.9494281519542558, + "language_loss": 0.69733107, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71874988, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8087, + "time_per_iteration": 2.5694613456726074 + }, + { + "auxiliary_loss_clip": 0.01115057, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.02278614, + "balance_loss_mlp": 1.03913963, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.610275852133116, + "language_loss": 0.72411895, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.7456407, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8088, + "time_per_iteration": 2.5770511627197266 + }, + { + "auxiliary_loss_clip": 0.01111259, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04033983, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.6468852838011347, + "language_loss": 0.7557345, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77722251, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8089, + "time_per_iteration": 2.4625489711761475 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.0165205, + "balance_loss_mlp": 1.04078937, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.5811587339913937, + "language_loss": 0.83939755, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86083972, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8090, + "time_per_iteration": 2.500293731689453 + }, + { + "auxiliary_loss_clip": 0.01107626, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.02016521, + "balance_loss_mlp": 1.03945088, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.6075799019512609, + "language_loss": 0.76256877, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78398097, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 8091, + "time_per_iteration": 2.465998411178589 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.01714182, + "balance_loss_mlp": 1.03904068, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.4690121920213544, + "language_loss": 0.80391169, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82532316, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8092, + "time_per_iteration": 2.509016513824463 + }, + { + "auxiliary_loss_clip": 0.01113066, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.01679361, + "balance_loss_mlp": 1.040061, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4222056252501818, + "language_loss": 0.71696734, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73839879, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8093, + "time_per_iteration": 2.47951078414917 + }, + { + "auxiliary_loss_clip": 0.01109125, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03917289, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.5524869827771763, + "language_loss": 0.67529863, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69671166, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8094, + "time_per_iteration": 3.9874253273010254 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.01933527, + "balance_loss_mlp": 1.04218793, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 1.8480915023468016, + "language_loss": 0.66936231, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.69086242, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8095, + "time_per_iteration": 2.477593183517456 + }, + { + "auxiliary_loss_clip": 0.01112855, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.01847899, + "balance_loss_mlp": 1.04048705, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.265808316415622, + "language_loss": 0.78996563, + "learning_rate": 2.182597630229345e-06, + "loss": 0.8114239, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.72265625, + "step": 8096, + "time_per_iteration": 5.404834985733032 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01872253, + "balance_loss_mlp": 1.03737998, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.7396987354687747, + "language_loss": 0.67313123, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69453126, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8097, + "time_per_iteration": 2.450967788696289 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.02071154, + "balance_loss_mlp": 1.03922939, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.4534902730904964, + "language_loss": 0.71347374, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73490155, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8098, + "time_per_iteration": 2.4994144439697266 + }, + { + "auxiliary_loss_clip": 0.01116904, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02274871, + "balance_loss_mlp": 1.04109979, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.7962943745015671, + "language_loss": 0.66037756, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68191803, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8099, + "time_per_iteration": 2.624321222305298 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03698707, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.6079322443898665, + "language_loss": 0.66464651, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68605012, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8100, + "time_per_iteration": 2.52364182472229 + }, + { + "auxiliary_loss_clip": 0.01108299, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.01802635, + "balance_loss_mlp": 1.03990841, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3375285332360751, + "language_loss": 0.76606798, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78745818, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 8101, + "time_per_iteration": 2.5515174865722656 + }, + { + "auxiliary_loss_clip": 0.01037344, + "auxiliary_loss_mlp": 0.01004126, + "balance_loss_clip": 1.00279069, + "balance_loss_mlp": 1.01343942, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6857117323737989, + "language_loss": 0.52317238, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54358709, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 8102, + "time_per_iteration": 3.2370035648345947 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02098215, + "balance_loss_mlp": 1.03864419, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.066543814817077, + "language_loss": 0.73703957, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75847828, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8103, + "time_per_iteration": 2.401146650314331 + }, + { + "auxiliary_loss_clip": 0.01113681, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02845609, + "balance_loss_mlp": 1.04083562, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 2.0729106414348686, + "language_loss": 0.62816393, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.64972341, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8104, + "time_per_iteration": 2.489887237548828 + }, + { + "auxiliary_loss_clip": 0.01111014, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.01661348, + "balance_loss_mlp": 1.04093325, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 2.098514623938467, + "language_loss": 0.68962336, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71102965, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8105, + "time_per_iteration": 2.521994113922119 + }, + { + "auxiliary_loss_clip": 0.01106075, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.0371716, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.8440715600711883, + "language_loss": 0.73333305, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75468934, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8106, + "time_per_iteration": 2.471409797668457 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.01994157, + "balance_loss_mlp": 1.04300117, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.861183691551934, + "language_loss": 0.77122629, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.79273301, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 8107, + "time_per_iteration": 2.4802913665771484 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.01563621, + "balance_loss_mlp": 1.04061639, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.543990493512169, + "language_loss": 0.75148052, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77284884, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8108, + "time_per_iteration": 2.4680538177490234 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01803327, + "balance_loss_mlp": 1.04023099, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.75674444511609, + "language_loss": 0.73340857, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75479364, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8109, + "time_per_iteration": 2.4528889656066895 + }, + { + "auxiliary_loss_clip": 0.01108152, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0175966, + "balance_loss_mlp": 1.0391928, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.7970671112238439, + "language_loss": 0.78590822, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80729276, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8110, + "time_per_iteration": 2.4653971195220947 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.02730024, + "balance_loss_mlp": 1.04083896, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.8027530171186463, + "language_loss": 0.72216076, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74368215, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8111, + "time_per_iteration": 2.4242806434631348 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.02137125, + "balance_loss_mlp": 1.04143023, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5451794032223725, + "language_loss": 0.75719351, + "learning_rate": 2.17639139220597e-06, + "loss": 0.77864289, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 8112, + "time_per_iteration": 2.4681711196899414 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.04125154, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.5422638957013077, + "language_loss": 0.75012642, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77164471, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8113, + "time_per_iteration": 2.458070993423462 + }, + { + "auxiliary_loss_clip": 0.0103493, + "auxiliary_loss_mlp": 0.00999333, + "balance_loss_clip": 0.99799174, + "balance_loss_mlp": 1.01145339, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.779968435998717, + "language_loss": 0.48876739, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50911003, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.23535156, + "step": 8114, + "time_per_iteration": 2.964735507965088 + }, + { + "auxiliary_loss_clip": 0.01112827, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.02769804, + "balance_loss_mlp": 1.04015875, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.346675786458265, + "language_loss": 0.76713175, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78867507, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8115, + "time_per_iteration": 2.5008208751678467 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.02220368, + "balance_loss_mlp": 1.0430454, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.9741706409780697, + "language_loss": 0.72150338, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8116, + "time_per_iteration": 2.471170425415039 + }, + { + "auxiliary_loss_clip": 0.01108955, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.02506459, + "balance_loss_mlp": 1.03951752, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.626628974836948, + "language_loss": 0.63457322, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65604323, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 8117, + "time_per_iteration": 2.4408295154571533 + }, + { + "auxiliary_loss_clip": 0.01106242, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.02082098, + "balance_loss_mlp": 1.03648984, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.7937040821955612, + "language_loss": 0.79223609, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81363392, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8118, + "time_per_iteration": 2.4724843502044678 + }, + { + "auxiliary_loss_clip": 0.01111434, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.02359247, + "balance_loss_mlp": 1.03926289, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 2.8027989615224427, + "language_loss": 0.63472134, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65620571, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8119, + "time_per_iteration": 2.478968381881714 + }, + { + "auxiliary_loss_clip": 0.01111182, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.02006578, + "balance_loss_mlp": 1.04054463, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.9034604660173908, + "language_loss": 0.72397757, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74541688, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8120, + "time_per_iteration": 2.5204596519470215 + }, + { + "auxiliary_loss_clip": 0.01109957, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02122211, + "balance_loss_mlp": 1.03855026, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.5930525886491658, + "language_loss": 0.63636339, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65780938, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 8121, + "time_per_iteration": 2.5647690296173096 + }, + { + "auxiliary_loss_clip": 0.01113983, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02521062, + "balance_loss_mlp": 1.04131413, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.870740841609923, + "language_loss": 0.82433021, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84585893, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8122, + "time_per_iteration": 2.4753966331481934 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.02019167, + "balance_loss_mlp": 1.04063094, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 2.206764356510625, + "language_loss": 0.85308874, + "learning_rate": 2.172123606640866e-06, + "loss": 0.8745693, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 8123, + "time_per_iteration": 2.5124545097351074 + }, + { + "auxiliary_loss_clip": 0.01111875, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.02075016, + "balance_loss_mlp": 1.03892267, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 2.940858316224804, + "language_loss": 0.85766631, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87911713, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 8124, + "time_per_iteration": 2.5632708072662354 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02157593, + "balance_loss_mlp": 1.04022837, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.663608167377633, + "language_loss": 0.79223049, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81370318, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8125, + "time_per_iteration": 2.4487855434417725 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.03887916, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.7973571608225063, + "language_loss": 0.72273839, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74416542, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8126, + "time_per_iteration": 2.437833309173584 + }, + { + "auxiliary_loss_clip": 0.01111271, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.0383321, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6636646152839605, + "language_loss": 0.68598747, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70743197, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8127, + "time_per_iteration": 2.593252420425415 + }, + { + "auxiliary_loss_clip": 0.01111716, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.02271378, + "balance_loss_mlp": 1.03772545, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 2.237259843406747, + "language_loss": 0.76160932, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78308904, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 8128, + "time_per_iteration": 2.4540648460388184 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.02170467, + "balance_loss_mlp": 1.03979826, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.8007841393953645, + "language_loss": 0.75974828, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78120208, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 8129, + "time_per_iteration": 2.4460771083831787 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01829541, + "balance_loss_mlp": 1.03739452, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.2474332482435684, + "language_loss": 0.64869368, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67009449, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8130, + "time_per_iteration": 2.4403305053710938 + }, + { + "auxiliary_loss_clip": 0.01104742, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.03528643, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 2.48357292354413, + "language_loss": 0.71885133, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74023575, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8131, + "time_per_iteration": 2.4774324893951416 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.02447748, + "balance_loss_mlp": 1.04011512, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.6326145167913504, + "language_loss": 0.69524658, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.7167576, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8132, + "time_per_iteration": 2.5888383388519287 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.03793633, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.374551885233197, + "language_loss": 0.70177239, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72313869, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8133, + "time_per_iteration": 2.5105628967285156 + }, + { + "auxiliary_loss_clip": 0.01108745, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086735, + "balance_loss_mlp": 1.03843439, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.701581568458854, + "language_loss": 0.70707083, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72849363, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8134, + "time_per_iteration": 2.4894602298736572 + }, + { + "auxiliary_loss_clip": 0.01114154, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.02214789, + "balance_loss_mlp": 1.04088461, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.0967568848691105, + "language_loss": 0.80384946, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82534719, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8135, + "time_per_iteration": 2.453099489212036 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.01895332, + "balance_loss_mlp": 1.03636014, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.7196560423786724, + "language_loss": 0.74302435, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.7643888, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8136, + "time_per_iteration": 3.877336025238037 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01817274, + "balance_loss_mlp": 1.03903699, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 2.212302237726986, + "language_loss": 0.73165262, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8137, + "time_per_iteration": 5.387110471725464 + }, + { + "auxiliary_loss_clip": 0.01109302, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01846206, + "balance_loss_mlp": 1.03721762, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 1.8416541749331667, + "language_loss": 0.74448442, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.76589316, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8138, + "time_per_iteration": 3.9045798778533936 + }, + { + "auxiliary_loss_clip": 0.01108399, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.02114367, + "balance_loss_mlp": 1.039101, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.5284975125240874, + "language_loss": 0.74403191, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76545048, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8139, + "time_per_iteration": 2.4808132648468018 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.03792441, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.7092479760411836, + "language_loss": 0.61867124, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64010978, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8140, + "time_per_iteration": 2.4676973819732666 + }, + { + "auxiliary_loss_clip": 0.01110437, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.02696478, + "balance_loss_mlp": 1.03864169, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 11.553990271771063, + "language_loss": 0.82090259, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84241331, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8141, + "time_per_iteration": 2.4469456672668457 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.02006459, + "balance_loss_mlp": 1.04014516, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.575169950356119, + "language_loss": 0.72470534, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74617255, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8142, + "time_per_iteration": 2.5793039798736572 + }, + { + "auxiliary_loss_clip": 0.01105636, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.02019358, + "balance_loss_mlp": 1.03645492, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.7422772510583273, + "language_loss": 0.66720849, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.68858832, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 8143, + "time_per_iteration": 2.529869556427002 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01746714, + "balance_loss_mlp": 1.03620982, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.6744857165672533, + "language_loss": 0.75076014, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77209973, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 8144, + "time_per_iteration": 2.5917482376098633 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.0158155, + "balance_loss_mlp": 1.0373745, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.7401505251342857, + "language_loss": 0.75606745, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77742517, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8145, + "time_per_iteration": 2.4766342639923096 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.0224849, + "balance_loss_mlp": 1.03849018, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.7624340526507305, + "language_loss": 0.79901314, + "learning_rate": 2.163197525984761e-06, + "loss": 0.820476, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8146, + "time_per_iteration": 2.461480140686035 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.01866233, + "balance_loss_mlp": 1.03510666, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.6218674355963285, + "language_loss": 0.74327677, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76462203, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8147, + "time_per_iteration": 2.4981865882873535 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01614654, + "balance_loss_mlp": 1.0397613, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.4473724892456126, + "language_loss": 0.83147472, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8528533, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8148, + "time_per_iteration": 2.4251036643981934 + }, + { + "auxiliary_loss_clip": 0.01104505, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.01701021, + "balance_loss_mlp": 1.03808641, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.6244569398372493, + "language_loss": 0.73749536, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75881934, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 8149, + "time_per_iteration": 2.4356369972229004 + }, + { + "auxiliary_loss_clip": 0.01112401, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01667118, + "balance_loss_mlp": 1.03944612, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 2.7362049095417516, + "language_loss": 0.75515091, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.77657855, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8150, + "time_per_iteration": 2.4834423065185547 + }, + { + "auxiliary_loss_clip": 0.01111432, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.0169735, + "balance_loss_mlp": 1.04018414, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.027803048960678, + "language_loss": 0.72891176, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75032675, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8151, + "time_per_iteration": 2.448648691177368 + }, + { + "auxiliary_loss_clip": 0.01033992, + "auxiliary_loss_mlp": 0.01002772, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.01003349, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8338756787223442, + "language_loss": 0.54366148, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.5640291, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.24023438, + "step": 8152, + "time_per_iteration": 3.0414862632751465 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.03726649, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8071588573161568, + "language_loss": 0.61403525, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.6354419, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8153, + "time_per_iteration": 2.6923155784606934 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02371526, + "balance_loss_mlp": 1.03589535, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.4691031789751592, + "language_loss": 0.76673591, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78815919, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 8154, + "time_per_iteration": 2.490353584289551 + }, + { + "auxiliary_loss_clip": 0.01034079, + "auxiliary_loss_mlp": 0.01008709, + "balance_loss_clip": 1.00767767, + "balance_loss_mlp": 1.01043367, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9669855284605297, + "language_loss": 0.67019808, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69062597, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.23632812, + "step": 8155, + "time_per_iteration": 3.1443841457366943 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01594031, + "balance_loss_mlp": 1.03842843, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 2.3165784732113965, + "language_loss": 0.76883155, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79019058, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 8156, + "time_per_iteration": 2.4431064128875732 + }, + { + "auxiliary_loss_clip": 0.01107345, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01737309, + "balance_loss_mlp": 1.03692055, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.1340841853754084, + "language_loss": 0.83395588, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85532445, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 8157, + "time_per_iteration": 2.478027582168579 + }, + { + "auxiliary_loss_clip": 0.01108499, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.01971316, + "balance_loss_mlp": 1.03797531, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.799550006100146, + "language_loss": 0.79893947, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8203451, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8158, + "time_per_iteration": 2.453590154647827 + }, + { + "auxiliary_loss_clip": 0.0111001, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.01731563, + "balance_loss_mlp": 1.03768444, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 2.6065217447562015, + "language_loss": 0.69529265, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71669614, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 8159, + "time_per_iteration": 2.531371593475342 + }, + { + "auxiliary_loss_clip": 0.01106025, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.01975548, + "balance_loss_mlp": 1.03706563, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 1.8340548446534848, + "language_loss": 0.73084885, + "learning_rate": 2.157762645250854e-06, + "loss": 0.7522344, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8160, + "time_per_iteration": 2.4504506587982178 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.02510881, + "balance_loss_mlp": 1.03650105, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.9580885379656197, + "language_loss": 0.71372044, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73520112, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8161, + "time_per_iteration": 2.4428305625915527 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03813958, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.8633116916333885, + "language_loss": 0.67950338, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70090652, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8162, + "time_per_iteration": 2.478804349899292 + }, + { + "auxiliary_loss_clip": 0.01110496, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.01964319, + "balance_loss_mlp": 1.03701675, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.7117590070355053, + "language_loss": 0.63264233, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65408272, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8163, + "time_per_iteration": 2.474439859390259 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.03680897, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 5.481003364843308, + "language_loss": 0.76853907, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.78988826, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 8164, + "time_per_iteration": 2.4202303886413574 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.01487494, + "balance_loss_mlp": 1.03511751, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.943812351193686, + "language_loss": 0.76509839, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78644335, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8165, + "time_per_iteration": 2.4495608806610107 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.01949036, + "balance_loss_mlp": 1.03724587, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5511500992998777, + "language_loss": 0.77538848, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79677534, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8166, + "time_per_iteration": 2.431838274002075 + }, + { + "auxiliary_loss_clip": 0.01035489, + "auxiliary_loss_mlp": 0.00999269, + "balance_loss_clip": 0.99796408, + "balance_loss_mlp": 1.01166928, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7997768420675069, + "language_loss": 0.54261303, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56296062, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23828125, + "step": 8167, + "time_per_iteration": 3.1150460243225098 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.0184176, + "balance_loss_mlp": 1.03619838, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.5337625100343173, + "language_loss": 0.85566431, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.8770228, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8168, + "time_per_iteration": 2.4139063358306885 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01706409, + "balance_loss_mlp": 1.03805184, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6015963996367162, + "language_loss": 0.73052484, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75186759, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8169, + "time_per_iteration": 2.45638370513916 + }, + { + "auxiliary_loss_clip": 0.01104357, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01673138, + "balance_loss_mlp": 1.03472865, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.6971136818289634, + "language_loss": 0.78070778, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80203593, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8170, + "time_per_iteration": 2.4314279556274414 + }, + { + "auxiliary_loss_clip": 0.01108102, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.02121162, + "balance_loss_mlp": 1.03809822, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 3.6606474387116363, + "language_loss": 0.75769788, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.77911079, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8171, + "time_per_iteration": 2.4608027935028076 + }, + { + "auxiliary_loss_clip": 0.01109941, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.03800821, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 2.121204048765929, + "language_loss": 0.81676465, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83820748, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 8172, + "time_per_iteration": 2.44052791595459 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.00996712, + "balance_loss_clip": 0.99551356, + "balance_loss_mlp": 1.0111028, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6914312886696967, + "language_loss": 0.53323382, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55354571, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.234375, + "step": 8173, + "time_per_iteration": 3.0708565711975098 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.01985621, + "balance_loss_mlp": 1.0374558, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.811286975884668, + "language_loss": 0.62879664, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65021324, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8174, + "time_per_iteration": 2.4336249828338623 + }, + { + "auxiliary_loss_clip": 0.01106845, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.02208483, + "balance_loss_mlp": 1.03750002, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.779537870111139, + "language_loss": 0.69111979, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71253598, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8175, + "time_per_iteration": 2.4554460048675537 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.03808653, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.5246237839161791, + "language_loss": 0.74398279, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76537168, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8176, + "time_per_iteration": 2.4888904094696045 + }, + { + "auxiliary_loss_clip": 0.01107276, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.0297358, + "balance_loss_mlp": 1.03694725, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.7568126082203932, + "language_loss": 0.69846892, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.71996421, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8177, + "time_per_iteration": 3.8634564876556396 + }, + { + "auxiliary_loss_clip": 0.01035127, + "auxiliary_loss_mlp": 0.00999453, + "balance_loss_clip": 0.99834442, + "balance_loss_mlp": 1.01137829, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6749706589091774, + "language_loss": 0.46188164, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48222741, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23828125, + "step": 8178, + "time_per_iteration": 3.0891001224517822 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02311015, + "balance_loss_mlp": 1.03835034, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.813634772504209, + "language_loss": 0.66008747, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68155658, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8179, + "time_per_iteration": 5.296982049942017 + }, + { + "auxiliary_loss_clip": 0.01111217, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.021873, + "balance_loss_mlp": 1.03712761, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8426949121819989, + "language_loss": 0.70288503, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72435522, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 8180, + "time_per_iteration": 3.9257376194000244 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.02467656, + "balance_loss_mlp": 1.03577447, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.68068912028803, + "language_loss": 0.83982801, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86125004, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8181, + "time_per_iteration": 2.464665174484253 + }, + { + "auxiliary_loss_clip": 0.01104535, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.03746653, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.0240623883749724, + "language_loss": 0.72286201, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74421656, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8182, + "time_per_iteration": 2.5358242988586426 + }, + { + "auxiliary_loss_clip": 0.01108049, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.01904118, + "balance_loss_mlp": 1.03814411, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.2040850478726357, + "language_loss": 0.72828728, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74968582, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8183, + "time_per_iteration": 2.484051465988159 + }, + { + "auxiliary_loss_clip": 0.01110545, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.0178628, + "balance_loss_mlp": 1.03733599, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 1.6157316160481727, + "language_loss": 0.77338606, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79480493, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8184, + "time_per_iteration": 2.4630794525146484 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.03868532, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.7266312313882144, + "language_loss": 0.71020061, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73163593, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8185, + "time_per_iteration": 2.4472904205322266 + }, + { + "auxiliary_loss_clip": 0.01109756, + "auxiliary_loss_mlp": 0.01037838, + "balance_loss_clip": 1.02348495, + "balance_loss_mlp": 1.03818357, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 2.357724154899576, + "language_loss": 0.75007719, + "learning_rate": 2.147666215108831e-06, + "loss": 0.7715531, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 8186, + "time_per_iteration": 2.497887372970581 + }, + { + "auxiliary_loss_clip": 0.01108113, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.0240649, + "balance_loss_mlp": 1.03769946, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.2731376810200947, + "language_loss": 0.67426246, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69571328, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8187, + "time_per_iteration": 2.4402377605438232 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.02205503, + "balance_loss_mlp": 1.03659558, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.3838016666023416, + "language_loss": 0.66984355, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69125152, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8188, + "time_per_iteration": 2.4889986515045166 + }, + { + "auxiliary_loss_clip": 0.01108628, + "auxiliary_loss_mlp": 0.01027775, + "balance_loss_clip": 1.01627779, + "balance_loss_mlp": 1.03854966, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.5428848144341532, + "language_loss": 0.7457763, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76714027, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8189, + "time_per_iteration": 2.4837827682495117 + }, + { + "auxiliary_loss_clip": 0.011062, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.01975584, + "balance_loss_mlp": 1.03744173, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5888967888129601, + "language_loss": 0.64360684, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66499019, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8190, + "time_per_iteration": 2.606388807296753 + }, + { + "auxiliary_loss_clip": 0.01107034, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.01528418, + "balance_loss_mlp": 1.0383538, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.9368790872615624, + "language_loss": 0.71231604, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73366261, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8191, + "time_per_iteration": 2.4383578300476074 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.02162957, + "balance_loss_mlp": 1.03718042, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5667911589112589, + "language_loss": 0.71698356, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.7383846, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8192, + "time_per_iteration": 2.6127231121063232 + }, + { + "auxiliary_loss_clip": 0.01033253, + "auxiliary_loss_mlp": 0.01011533, + "balance_loss_clip": 1.01047826, + "balance_loss_mlp": 1.00980878, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7610920789142134, + "language_loss": 0.52138889, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54183674, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.234375, + "step": 8193, + "time_per_iteration": 3.1151235103607178 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.02470672, + "balance_loss_mlp": 1.03862, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.5012892842908303, + "language_loss": 0.77071059, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79214686, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 8194, + "time_per_iteration": 2.4766407012939453 + }, + { + "auxiliary_loss_clip": 0.01104661, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.01783228, + "balance_loss_mlp": 1.03554666, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.9786600447906189, + "language_loss": 0.70556259, + "learning_rate": 2.144170401915341e-06, + "loss": 0.7269032, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 8195, + "time_per_iteration": 2.489412784576416 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01537156, + "balance_loss_mlp": 1.0380609, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.8494849345903903, + "language_loss": 0.81095743, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83231419, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8196, + "time_per_iteration": 2.5489988327026367 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.02033019, + "balance_loss_mlp": 1.03709757, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.848981865854384, + "language_loss": 0.7100687, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.73149174, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8197, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03815627, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.7362069513061655, + "language_loss": 0.84122622, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86259645, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8198, + "time_per_iteration": 2.4596786499023438 + }, + { + "auxiliary_loss_clip": 0.01110423, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.02139831, + "balance_loss_mlp": 1.03913713, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.767623263247313, + "language_loss": 0.76214266, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78359395, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8199, + "time_per_iteration": 2.413482189178467 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.02215028, + "balance_loss_mlp": 1.03712904, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.555242231339172, + "language_loss": 0.59918249, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62063873, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8200, + "time_per_iteration": 2.515371561050415 + }, + { + "auxiliary_loss_clip": 0.01101467, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02128601, + "balance_loss_mlp": 1.03560054, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4972351372180894, + "language_loss": 0.78781515, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.80916464, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 8201, + "time_per_iteration": 2.4688665866851807 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.01858091, + "balance_loss_mlp": 1.03761029, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.1515546014570766, + "language_loss": 0.67352241, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69496673, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8202, + "time_per_iteration": 2.6021947860717773 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01781058, + "balance_loss_mlp": 1.03682148, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 3.4273755266911845, + "language_loss": 0.75192142, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77328843, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 8203, + "time_per_iteration": 2.501173496246338 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.0214237, + "balance_loss_mlp": 1.03780818, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.0656815740777152, + "language_loss": 0.80908394, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.83049649, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8204, + "time_per_iteration": 2.481666088104248 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.02180493, + "balance_loss_mlp": 1.03788805, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 2.2280647806743183, + "language_loss": 0.65550953, + "learning_rate": 2.140285646139455e-06, + "loss": 0.67689598, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 8205, + "time_per_iteration": 2.439408302307129 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.02083468, + "balance_loss_mlp": 1.03837705, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 1.7727903919462147, + "language_loss": 0.67009246, + "learning_rate": 2.139897141060744e-06, + "loss": 0.69156778, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 8206, + "time_per_iteration": 2.4607954025268555 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.01473176, + "balance_loss_mlp": 1.03630567, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.822649710507408, + "language_loss": 0.76363301, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78496289, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 8207, + "time_per_iteration": 2.508553981781006 + }, + { + "auxiliary_loss_clip": 0.01109244, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.03869963, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.308112072386131, + "language_loss": 0.59984541, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62126362, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8208, + "time_per_iteration": 2.505990982055664 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.01622033, + "balance_loss_mlp": 1.03816974, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 2.3772506823576407, + "language_loss": 0.7851491, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80653256, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8209, + "time_per_iteration": 2.4622652530670166 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.03630066, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.7984719462813816, + "language_loss": 0.78806269, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80942488, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8210, + "time_per_iteration": 2.4884698390960693 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.01888382, + "balance_loss_mlp": 1.0381912, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.2650712316686903, + "language_loss": 0.81229484, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83373135, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8211, + "time_per_iteration": 2.4839043617248535 + }, + { + "auxiliary_loss_clip": 0.01109974, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.03911519, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.6136684102444665, + "language_loss": 0.91496241, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93642217, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8212, + "time_per_iteration": 2.5103862285614014 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.01925647, + "balance_loss_mlp": 1.036484, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.7787072133843917, + "language_loss": 0.64901662, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.670403, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8213, + "time_per_iteration": 2.460123300552368 + }, + { + "auxiliary_loss_clip": 0.01106125, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.01356125, + "balance_loss_mlp": 1.03668904, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.9389339120527038, + "language_loss": 0.75199962, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77333331, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69140625, + "step": 8214, + "time_per_iteration": 2.5719900131225586 + }, + { + "auxiliary_loss_clip": 0.01109359, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.02212512, + "balance_loss_mlp": 1.03959298, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.828808325177945, + "language_loss": 0.84395385, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86540014, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8215, + "time_per_iteration": 2.468804121017456 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.01656199, + "balance_loss_mlp": 1.03478694, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.6051587100805058, + "language_loss": 0.82859147, + "learning_rate": 2.136011800934292e-06, + "loss": 0.84988439, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 8216, + "time_per_iteration": 2.5819287300109863 + }, + { + "auxiliary_loss_clip": 0.01107134, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.01918006, + "balance_loss_mlp": 1.03821325, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.4383830441547378, + "language_loss": 0.74774921, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76913321, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8217, + "time_per_iteration": 2.4628379344940186 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.01928544, + "balance_loss_mlp": 1.03777707, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.733886360732455, + "language_loss": 0.78829861, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80966723, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 8218, + "time_per_iteration": 2.4809412956237793 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.02269292, + "balance_loss_mlp": 1.03510332, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.0240627965271187, + "language_loss": 0.76301086, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78438151, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 8219, + "time_per_iteration": 3.8202009201049805 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.03764367, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6690505128843895, + "language_loss": 0.6190055, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64042592, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8220, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01106287, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.01656425, + "balance_loss_mlp": 1.03672814, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.7319378421104112, + "language_loss": 0.72381485, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74517179, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8221, + "time_per_iteration": 5.506774187088013 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.02521193, + "balance_loss_mlp": 1.04006767, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.681203667545881, + "language_loss": 0.79131603, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81275266, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 8222, + "time_per_iteration": 2.491175889968872 + }, + { + "auxiliary_loss_clip": 0.01108448, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.02147555, + "balance_loss_mlp": 1.03941715, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.3506903054927015, + "language_loss": 0.73205507, + "learning_rate": 2.133291755093088e-06, + "loss": 0.75348878, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69140625, + "step": 8223, + "time_per_iteration": 2.4359662532806396 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.02264762, + "balance_loss_mlp": 1.03850269, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.7533498543998463, + "language_loss": 0.75144434, + "learning_rate": 2.132903156780144e-06, + "loss": 0.7729032, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 8224, + "time_per_iteration": 2.5716288089752197 + }, + { + "auxiliary_loss_clip": 0.01111376, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01807868, + "balance_loss_mlp": 1.04080439, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.086998261136206, + "language_loss": 0.63982892, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66124696, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8225, + "time_per_iteration": 2.524048089981079 + }, + { + "auxiliary_loss_clip": 0.01107484, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.0206579, + "balance_loss_mlp": 1.03766608, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 1.839126557537864, + "language_loss": 0.76359057, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78499651, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8226, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02578914, + "balance_loss_mlp": 1.03735518, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.6377261486682646, + "language_loss": 0.71156305, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73305476, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8227, + "time_per_iteration": 2.4763920307159424 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.03914213, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.614424212368193, + "language_loss": 0.71484196, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73631173, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8228, + "time_per_iteration": 2.550083637237549 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.01829386, + "balance_loss_mlp": 1.03837276, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.6200219454444607, + "language_loss": 0.83788311, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85925281, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8229, + "time_per_iteration": 2.474684238433838 + }, + { + "auxiliary_loss_clip": 0.01108289, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.02103567, + "balance_loss_mlp": 1.03685689, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.055489394198818, + "language_loss": 0.75105131, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.77248526, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8230, + "time_per_iteration": 2.506950616836548 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.01497638, + "balance_loss_mlp": 1.03868175, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.703005059233118, + "language_loss": 0.79713035, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.8184967, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8231, + "time_per_iteration": 2.4176137447357178 + }, + { + "auxiliary_loss_clip": 0.01035427, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.01191425, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7419788553124401, + "language_loss": 0.60237485, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62275773, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 8232, + "time_per_iteration": 3.183783531188965 + }, + { + "auxiliary_loss_clip": 0.0111307, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.02119923, + "balance_loss_mlp": 1.03889871, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.7147216218758814, + "language_loss": 0.69257128, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71405244, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 8233, + "time_per_iteration": 2.477755546569824 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.02111769, + "balance_loss_mlp": 1.03714275, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 3.246275947254348, + "language_loss": 0.6678468, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68926585, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.6953125, + "step": 8234, + "time_per_iteration": 2.5594117641448975 + }, + { + "auxiliary_loss_clip": 0.0103478, + "auxiliary_loss_mlp": 0.01003988, + "balance_loss_clip": 1.00288522, + "balance_loss_mlp": 1.01140106, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8288840425421409, + "language_loss": 0.57987183, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60025948, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.234375, + "step": 8235, + "time_per_iteration": 3.0041370391845703 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02345991, + "balance_loss_mlp": 1.03770208, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.4917768542550827, + "language_loss": 0.76824737, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.78971112, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8236, + "time_per_iteration": 2.498105525970459 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.01948881, + "balance_loss_mlp": 1.03860247, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.8006519774313887, + "language_loss": 0.72554326, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74694312, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8237, + "time_per_iteration": 2.487849473953247 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.02024627, + "balance_loss_mlp": 1.03722131, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.8061825502363815, + "language_loss": 0.75687563, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77825987, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8238, + "time_per_iteration": 2.4926116466522217 + }, + { + "auxiliary_loss_clip": 0.01110283, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02057362, + "balance_loss_mlp": 1.03765702, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.197202607879525, + "language_loss": 0.73434591, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.75579149, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8239, + "time_per_iteration": 2.4181203842163086 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.02266932, + "balance_loss_mlp": 1.03704619, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.4131176994917936, + "language_loss": 0.78344893, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80492562, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.73046875, + "step": 8240, + "time_per_iteration": 2.479642391204834 + }, + { + "auxiliary_loss_clip": 0.01104608, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.02865601, + "balance_loss_mlp": 1.03746533, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.0234307188816993, + "language_loss": 0.85579056, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87724495, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8241, + "time_per_iteration": 2.4081263542175293 + }, + { + "auxiliary_loss_clip": 0.01106442, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02337933, + "balance_loss_mlp": 1.03813624, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.761079127200854, + "language_loss": 0.77041149, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79183173, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8242, + "time_per_iteration": 2.4439215660095215 + }, + { + "auxiliary_loss_clip": 0.01106589, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821673, + "balance_loss_mlp": 1.03676701, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.7216813067847012, + "language_loss": 0.67493725, + "learning_rate": 2.125518848090833e-06, + "loss": 0.6963132, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8243, + "time_per_iteration": 2.4888081550598145 + }, + { + "auxiliary_loss_clip": 0.01107757, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01805878, + "balance_loss_mlp": 1.03910422, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.8355775234908949, + "language_loss": 0.68218768, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70357001, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8244, + "time_per_iteration": 2.481220245361328 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02201915, + "balance_loss_mlp": 1.03828287, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8414695050792438, + "language_loss": 0.74998277, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77143466, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8245, + "time_per_iteration": 2.459244728088379 + }, + { + "auxiliary_loss_clip": 0.01105994, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01628923, + "balance_loss_mlp": 1.03797877, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 3.047248940663427, + "language_loss": 0.81496358, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83631527, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 8246, + "time_per_iteration": 2.54664945602417 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03858495, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.7095262667552558, + "language_loss": 0.83750397, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85899985, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8247, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.01897812, + "balance_loss_mlp": 1.04011726, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.0177325188605018, + "language_loss": 0.83758432, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85900903, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 8248, + "time_per_iteration": 2.490619659423828 + }, + { + "auxiliary_loss_clip": 0.01109734, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.01518941, + "balance_loss_mlp": 1.03800774, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.055191909263014, + "language_loss": 0.73715985, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75853992, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8249, + "time_per_iteration": 2.5232534408569336 + }, + { + "auxiliary_loss_clip": 0.0111234, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.02504992, + "balance_loss_mlp": 1.04018188, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.9063816639589337, + "language_loss": 0.76176995, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78327698, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8250, + "time_per_iteration": 2.5368192195892334 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.02170718, + "balance_loss_mlp": 1.03792036, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.615677709430237, + "language_loss": 0.69986647, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72129565, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8251, + "time_per_iteration": 2.4543070793151855 + }, + { + "auxiliary_loss_clip": 0.01108023, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.01586699, + "balance_loss_mlp": 1.03890181, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.8749041446582064, + "language_loss": 0.79864365, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82000297, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8252, + "time_per_iteration": 2.4386792182922363 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01384854, + "balance_loss_mlp": 1.03821409, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7863838823967775, + "language_loss": 0.80688357, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.82825357, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.70703125, + "step": 8253, + "time_per_iteration": 2.440727710723877 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01863384, + "balance_loss_mlp": 1.03654194, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.548882190492268, + "language_loss": 0.67088544, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69224173, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 8254, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.011067, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.02257323, + "balance_loss_mlp": 1.03522658, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.5646536445016186, + "language_loss": 0.73859739, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76002055, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8255, + "time_per_iteration": 2.478703498840332 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.02208281, + "balance_loss_mlp": 1.0362165, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.8563521426834817, + "language_loss": 0.81378329, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.8351903, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8256, + "time_per_iteration": 2.4312291145324707 + }, + { + "auxiliary_loss_clip": 0.01105024, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.01246178, + "balance_loss_mlp": 1.03679466, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.8572652078491616, + "language_loss": 0.80710369, + "learning_rate": 2.120076673368901e-06, + "loss": 0.82840347, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8257, + "time_per_iteration": 2.4589884281158447 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.02173841, + "balance_loss_mlp": 1.03759003, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 2.788575980623821, + "language_loss": 0.66533971, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68681228, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 8258, + "time_per_iteration": 2.477653741836548 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01674283, + "balance_loss_mlp": 1.03566313, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 2.207120440649978, + "language_loss": 0.77672231, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79804647, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8259, + "time_per_iteration": 2.482516050338745 + }, + { + "auxiliary_loss_clip": 0.01107983, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.01802468, + "balance_loss_mlp": 1.03903294, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.536511866358609, + "language_loss": 0.78612608, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80751413, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8260, + "time_per_iteration": 4.0255560874938965 + }, + { + "auxiliary_loss_clip": 0.0110786, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01785684, + "balance_loss_mlp": 1.03662324, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 4.674193904345997, + "language_loss": 0.76227403, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78365964, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8261, + "time_per_iteration": 2.537996530532837 + }, + { + "auxiliary_loss_clip": 0.01104467, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01622844, + "balance_loss_mlp": 1.03667367, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.9998040798137362, + "language_loss": 0.89328134, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91460943, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8262, + "time_per_iteration": 5.405071020126343 + }, + { + "auxiliary_loss_clip": 0.01104636, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.0172143, + "balance_loss_mlp": 1.03765512, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4087924984120455, + "language_loss": 0.73918653, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76052761, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8263, + "time_per_iteration": 3.9610228538513184 + }, + { + "auxiliary_loss_clip": 0.01112691, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.0196991, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.641620630884259, + "language_loss": 0.69445115, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71591461, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71875, + "step": 8264, + "time_per_iteration": 2.4799907207489014 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01585722, + "balance_loss_mlp": 1.03470981, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.3808235907294704, + "language_loss": 0.64915001, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67049909, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8265, + "time_per_iteration": 2.491708517074585 + }, + { + "auxiliary_loss_clip": 0.01034788, + "auxiliary_loss_mlp": 0.01001781, + "balance_loss_clip": 1.00064886, + "balance_loss_mlp": 1.01169205, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8684712318419048, + "language_loss": 0.53446817, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55483389, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23144531, + "step": 8266, + "time_per_iteration": 3.1343002319335938 + }, + { + "auxiliary_loss_clip": 0.01104137, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01567531, + "balance_loss_mlp": 1.03706813, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 3.469499482915289, + "language_loss": 0.79616332, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81748462, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8267, + "time_per_iteration": 2.5316126346588135 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.01856148, + "balance_loss_mlp": 1.03869104, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.5132671844419434, + "language_loss": 0.74805677, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76947474, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8268, + "time_per_iteration": 2.5102896690368652 + }, + { + "auxiliary_loss_clip": 0.0110689, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.01998329, + "balance_loss_mlp": 1.0366255, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.9572065929893177, + "language_loss": 0.67818397, + "learning_rate": 2.115411240328073e-06, + "loss": 0.6995914, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8269, + "time_per_iteration": 2.7194817066192627 + }, + { + "auxiliary_loss_clip": 0.0110431, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01968217, + "balance_loss_mlp": 1.03744197, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.6139896668987463, + "language_loss": 0.85450721, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87587237, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 8270, + "time_per_iteration": 2.4423561096191406 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01834702, + "balance_loss_mlp": 1.03857064, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.6811398863814482, + "language_loss": 0.71087623, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73225504, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.69921875, + "step": 8271, + "time_per_iteration": 2.54892635345459 + }, + { + "auxiliary_loss_clip": 0.01109407, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.03880143, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.4557340389451365, + "language_loss": 0.7848624, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80625331, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8272, + "time_per_iteration": 2.462470054626465 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.02275074, + "balance_loss_mlp": 1.03950167, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.5057831430835686, + "language_loss": 0.66278791, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68423879, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8273, + "time_per_iteration": 2.6735026836395264 + }, + { + "auxiliary_loss_clip": 0.01109746, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.01891851, + "balance_loss_mlp": 1.03968048, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.871691944459235, + "language_loss": 0.77977264, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80118477, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8274, + "time_per_iteration": 2.462465763092041 + }, + { + "auxiliary_loss_clip": 0.01110748, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.01864374, + "balance_loss_mlp": 1.03865933, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 2.0388244744713724, + "language_loss": 0.75829184, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77971983, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 8275, + "time_per_iteration": 2.6034398078918457 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01974416, + "balance_loss_mlp": 1.03761268, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9341151140441402, + "language_loss": 0.8392635, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.86071479, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8276, + "time_per_iteration": 2.435999870300293 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01664937, + "balance_loss_mlp": 1.03633988, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3535075156355831, + "language_loss": 0.70188868, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72319949, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 8277, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.01952052, + "balance_loss_mlp": 1.03669858, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.773647946812319, + "language_loss": 0.82609779, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84747648, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8278, + "time_per_iteration": 2.4459898471832275 + }, + { + "auxiliary_loss_clip": 0.01108155, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.01992559, + "balance_loss_mlp": 1.03671384, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 1.8017237706358624, + "language_loss": 0.6784246, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69983023, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8279, + "time_per_iteration": 2.4793283939361572 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.0204277, + "balance_loss_mlp": 1.03561902, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 1.9740212049853438, + "language_loss": 0.70469928, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72610998, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8280, + "time_per_iteration": 2.427778482437134 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.02028, + "balance_loss_mlp": 1.03475237, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.6232736941666084, + "language_loss": 0.64461923, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66599762, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8281, + "time_per_iteration": 2.511054515838623 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.02175605, + "balance_loss_mlp": 1.03830338, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 1.82873470978674, + "language_loss": 0.72714734, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.74859279, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8282, + "time_per_iteration": 2.417059898376465 + }, + { + "auxiliary_loss_clip": 0.01103243, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.01804423, + "balance_loss_mlp": 1.03591275, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.6753255120783885, + "language_loss": 0.73373008, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75505757, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 8283, + "time_per_iteration": 2.531747341156006 + }, + { + "auxiliary_loss_clip": 0.01106082, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.02446926, + "balance_loss_mlp": 1.03696167, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.6101503544989328, + "language_loss": 0.78866243, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81009555, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8284, + "time_per_iteration": 2.4609432220458984 + }, + { + "auxiliary_loss_clip": 0.01113439, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.02398884, + "balance_loss_mlp": 1.0390476, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.8191212695174297, + "language_loss": 0.73705399, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75856948, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 8285, + "time_per_iteration": 2.5364696979522705 + }, + { + "auxiliary_loss_clip": 0.01112037, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.01872683, + "balance_loss_mlp": 1.0420599, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.6445235471758528, + "language_loss": 0.74477649, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76621962, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 8286, + "time_per_iteration": 2.4888620376586914 + }, + { + "auxiliary_loss_clip": 0.01112849, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02713549, + "balance_loss_mlp": 1.04156506, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7365216069979077, + "language_loss": 0.85467643, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87620533, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8287, + "time_per_iteration": 2.5058188438415527 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.01267338, + "balance_loss_mlp": 1.03729916, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.6348463305948138, + "language_loss": 0.72363204, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74496502, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8288, + "time_per_iteration": 2.528475046157837 + }, + { + "auxiliary_loss_clip": 0.0111456, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.02319193, + "balance_loss_mlp": 1.04041409, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.900373689725773, + "language_loss": 0.80002087, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82154852, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7421875, + "step": 8289, + "time_per_iteration": 2.4667603969573975 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0199995, + "balance_loss_mlp": 1.03680038, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.15669041751919, + "language_loss": 0.73524791, + "learning_rate": 2.107245231409784e-06, + "loss": 0.7566489, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8290, + "time_per_iteration": 2.4318900108337402 + }, + { + "auxiliary_loss_clip": 0.01112096, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.02232039, + "balance_loss_mlp": 1.04070783, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.4681011524205945, + "language_loss": 0.84016359, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86165774, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7109375, + "step": 8291, + "time_per_iteration": 2.502545118331909 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02307272, + "balance_loss_mlp": 1.04216146, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.910804847598398, + "language_loss": 0.67084122, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69238442, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 8292, + "time_per_iteration": 2.4527781009674072 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.01933742, + "balance_loss_mlp": 1.03864646, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.7642237687107358, + "language_loss": 0.67300534, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69440567, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8293, + "time_per_iteration": 2.4598476886749268 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.01615214, + "balance_loss_mlp": 1.03958893, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.987515516196069, + "language_loss": 0.8202461, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84163427, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 8294, + "time_per_iteration": 2.4827442169189453 + }, + { + "auxiliary_loss_clip": 0.01110277, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02032816, + "balance_loss_mlp": 1.03937042, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.7471179574646651, + "language_loss": 0.73073918, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.7521857, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8295, + "time_per_iteration": 2.4712820053100586 + }, + { + "auxiliary_loss_clip": 0.01108254, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.02435029, + "balance_loss_mlp": 1.03895998, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.9200384732673381, + "language_loss": 0.673262, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69471127, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 8296, + "time_per_iteration": 2.45139479637146 + }, + { + "auxiliary_loss_clip": 0.01111689, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.03996015, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.713618634115876, + "language_loss": 0.64634776, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66780269, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8297, + "time_per_iteration": 2.5514614582061768 + }, + { + "auxiliary_loss_clip": 0.0110753, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.02121472, + "balance_loss_mlp": 1.03931689, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.9440676372274848, + "language_loss": 0.69621831, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71762383, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 8298, + "time_per_iteration": 2.4699370861053467 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.02421331, + "balance_loss_mlp": 1.03804398, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.087380746796303, + "language_loss": 0.84278095, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86422026, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8299, + "time_per_iteration": 2.4820563793182373 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02327859, + "balance_loss_mlp": 1.03978848, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 5.591354549929027, + "language_loss": 0.69272447, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71423382, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8300, + "time_per_iteration": 2.473634719848633 + }, + { + "auxiliary_loss_clip": 0.01037164, + "auxiliary_loss_mlp": 0.01003582, + "balance_loss_clip": 1.00239551, + "balance_loss_mlp": 1.01397431, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7592353305728455, + "language_loss": 0.51136976, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.5317772, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23242188, + "step": 8301, + "time_per_iteration": 3.1719589233398438 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.02670741, + "balance_loss_mlp": 1.03841138, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.9297901828770159, + "language_loss": 0.84423494, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86569905, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6796875, + "step": 8302, + "time_per_iteration": 3.8624472618103027 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.0157299, + "balance_loss_mlp": 1.03963566, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.7245012471823244, + "language_loss": 0.68831706, + "learning_rate": 2.102189175590024e-06, + "loss": 0.70967424, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8303, + "time_per_iteration": 2.4496121406555176 + }, + { + "auxiliary_loss_clip": 0.01111721, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01871395, + "balance_loss_mlp": 1.0395093, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.8500063703376581, + "language_loss": 0.72523201, + "learning_rate": 2.101800220681144e-06, + "loss": 0.7466675, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8304, + "time_per_iteration": 5.351519346237183 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.02633858, + "balance_loss_mlp": 1.03971672, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.113610055263332, + "language_loss": 0.81011766, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83160275, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8305, + "time_per_iteration": 3.9764394760131836 + }, + { + "auxiliary_loss_clip": 0.0103618, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.00061762, + "balance_loss_mlp": 1.01301277, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7225706425993785, + "language_loss": 0.56916559, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58954537, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23144531, + "step": 8306, + "time_per_iteration": 3.1952388286590576 + }, + { + "auxiliary_loss_clip": 0.01114208, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.02323711, + "balance_loss_mlp": 1.04268515, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.791967653711514, + "language_loss": 0.82407033, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84558392, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8307, + "time_per_iteration": 2.4501423835754395 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01987445, + "balance_loss_mlp": 1.03845966, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 2.0869484891217973, + "language_loss": 0.60544026, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.62686026, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8308, + "time_per_iteration": 2.5023903846740723 + }, + { + "auxiliary_loss_clip": 0.01106463, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0179193, + "balance_loss_mlp": 1.03760242, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5917355796130328, + "language_loss": 0.74632615, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76769423, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8309, + "time_per_iteration": 2.473018169403076 + }, + { + "auxiliary_loss_clip": 0.01109782, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02136922, + "balance_loss_mlp": 1.03926158, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.147167346860859, + "language_loss": 0.80117911, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82262021, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8310, + "time_per_iteration": 2.4172844886779785 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.02339089, + "balance_loss_mlp": 1.04019213, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.6036366291386785, + "language_loss": 0.70938641, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73086882, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 8311, + "time_per_iteration": 2.4804234504699707 + }, + { + "auxiliary_loss_clip": 0.01111462, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02229297, + "balance_loss_mlp": 1.04154408, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.923283457940722, + "language_loss": 0.77138013, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79283684, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8312, + "time_per_iteration": 2.4233593940734863 + }, + { + "auxiliary_loss_clip": 0.01111451, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.04093099, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7466795572602452, + "language_loss": 0.84205925, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86349666, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8313, + "time_per_iteration": 2.509953260421753 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.01633728, + "balance_loss_mlp": 1.03987491, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 2.119225345296983, + "language_loss": 0.80887723, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83028746, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8314, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01041835, + "balance_loss_clip": 1.02763736, + "balance_loss_mlp": 1.0418222, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 2.4967995028767778, + "language_loss": 0.79017889, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81173497, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8315, + "time_per_iteration": 2.4926230907440186 + }, + { + "auxiliary_loss_clip": 0.01110205, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.02123618, + "balance_loss_mlp": 1.04051793, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 2.5792388666411274, + "language_loss": 0.73983908, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76128173, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8316, + "time_per_iteration": 2.692228317260742 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.02058125, + "balance_loss_mlp": 1.04118443, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.4190232020266644, + "language_loss": 0.81204319, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83346593, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 8317, + "time_per_iteration": 2.4997825622558594 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04001343, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.649167878849496, + "language_loss": 0.83189869, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85339868, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8318, + "time_per_iteration": 2.516118049621582 + }, + { + "auxiliary_loss_clip": 0.01111509, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.04068375, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.8062739344487506, + "language_loss": 0.81684446, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83826375, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 8319, + "time_per_iteration": 2.4977705478668213 + }, + { + "auxiliary_loss_clip": 0.01112348, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.01469707, + "balance_loss_mlp": 1.04046464, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.7611824883833367, + "language_loss": 0.71951354, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74090493, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8320, + "time_per_iteration": 2.5664663314819336 + }, + { + "auxiliary_loss_clip": 0.01116964, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.02903366, + "balance_loss_mlp": 1.03925049, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 3.538267489088781, + "language_loss": 0.76840645, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79001242, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 8321, + "time_per_iteration": 2.5154004096984863 + }, + { + "auxiliary_loss_clip": 0.01113289, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.0255599, + "balance_loss_mlp": 1.04125774, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.9154758393965534, + "language_loss": 0.82959068, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85111117, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8322, + "time_per_iteration": 2.4235384464263916 + }, + { + "auxiliary_loss_clip": 0.01114951, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.02180934, + "balance_loss_mlp": 1.04190695, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.1453827228353166, + "language_loss": 0.73670769, + "learning_rate": 2.094409360775228e-06, + "loss": 0.7582072, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8323, + "time_per_iteration": 2.495490312576294 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.02152205, + "balance_loss_mlp": 1.04043198, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.517177144462768, + "language_loss": 0.69255745, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71402115, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8324, + "time_per_iteration": 2.534043550491333 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03958941, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 1.9198571129878061, + "language_loss": 0.72153628, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.7429831, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8325, + "time_per_iteration": 2.4783544540405273 + }, + { + "auxiliary_loss_clip": 0.01114311, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.0237087, + "balance_loss_mlp": 1.04212904, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.5620326365302057, + "language_loss": 0.73494631, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7564733, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.72265625, + "step": 8326, + "time_per_iteration": 2.4836461544036865 + }, + { + "auxiliary_loss_clip": 0.01110122, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.0189389, + "balance_loss_mlp": 1.03965449, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5385455876451686, + "language_loss": 0.78168696, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80310273, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8327, + "time_per_iteration": 2.477095127105713 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02261126, + "balance_loss_mlp": 1.04402947, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.31963767631444, + "language_loss": 0.88008773, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90161747, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8328, + "time_per_iteration": 2.479931116104126 + }, + { + "auxiliary_loss_clip": 0.01116123, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.02290463, + "balance_loss_mlp": 1.0408715, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.0106246059801482, + "language_loss": 0.74407351, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76559395, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 8329, + "time_per_iteration": 2.480037212371826 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.04276633, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.2897047741072063, + "language_loss": 0.79602063, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81747818, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 8330, + "time_per_iteration": 2.529446601867676 + }, + { + "auxiliary_loss_clip": 0.0103803, + "auxiliary_loss_mlp": 0.01000333, + "balance_loss_clip": 0.99922389, + "balance_loss_mlp": 1.01505685, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7317803530986337, + "language_loss": 0.56073356, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58111727, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23046875, + "step": 8331, + "time_per_iteration": 2.89511775970459 + }, + { + "auxiliary_loss_clip": 0.01110931, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01504326, + "balance_loss_mlp": 1.041206, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.865515028785386, + "language_loss": 0.65518546, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67656446, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8332, + "time_per_iteration": 2.497129201889038 + }, + { + "auxiliary_loss_clip": 0.01109356, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.02261496, + "balance_loss_mlp": 1.0400846, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.477043934406584, + "language_loss": 0.74687374, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.76831466, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8333, + "time_per_iteration": 2.506769895553589 + }, + { + "auxiliary_loss_clip": 0.01114084, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.02242804, + "balance_loss_mlp": 1.04128885, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 3.419508092200526, + "language_loss": 0.80619013, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82768065, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 8334, + "time_per_iteration": 2.4492759704589844 + }, + { + "auxiliary_loss_clip": 0.01038411, + "auxiliary_loss_mlp": 0.00996695, + "balance_loss_clip": 0.99557459, + "balance_loss_mlp": 1.01541471, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8938151962133672, + "language_loss": 0.62658346, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64693451, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.23046875, + "step": 8335, + "time_per_iteration": 3.044527530670166 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.01548398, + "balance_loss_mlp": 1.03883338, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.39366543335018, + "language_loss": 0.79443586, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81579578, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8336, + "time_per_iteration": 2.5133562088012695 + }, + { + "auxiliary_loss_clip": 0.01111717, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01791, + "balance_loss_mlp": 1.0402261, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.7464580749308463, + "language_loss": 0.80139911, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82282722, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8337, + "time_per_iteration": 2.4671413898468018 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.0201329, + "balance_loss_mlp": 1.03992128, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.859552309481282, + "language_loss": 0.79314995, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.8146314, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8338, + "time_per_iteration": 2.4763965606689453 + }, + { + "auxiliary_loss_clip": 0.01112164, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.0178982, + "balance_loss_mlp": 1.0390203, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6104717001039177, + "language_loss": 0.85006964, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87150526, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8339, + "time_per_iteration": 2.507951259613037 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.02476954, + "balance_loss_mlp": 1.03943646, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.484784321746097, + "language_loss": 0.70492387, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72641325, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 8340, + "time_per_iteration": 2.5271620750427246 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02178025, + "balance_loss_mlp": 1.04153883, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.9114275861555547, + "language_loss": 0.77793235, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79945439, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 8341, + "time_per_iteration": 2.467557430267334 + }, + { + "auxiliary_loss_clip": 0.01116354, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.02543771, + "balance_loss_mlp": 1.04048502, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.478803711535475, + "language_loss": 0.8961392, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91769934, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 8342, + "time_per_iteration": 2.454822063446045 + }, + { + "auxiliary_loss_clip": 0.01110124, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02272439, + "balance_loss_mlp": 1.03894877, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 3.1772216639919906, + "language_loss": 0.76625615, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.7877177, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8343, + "time_per_iteration": 2.485499143600464 + }, + { + "auxiliary_loss_clip": 0.0110844, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.03967083, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 2.1220779506727574, + "language_loss": 0.67086864, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69223046, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8344, + "time_per_iteration": 3.88729190826416 + }, + { + "auxiliary_loss_clip": 0.01111927, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.02143502, + "balance_loss_mlp": 1.03998613, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.9395231632627998, + "language_loss": 0.75212955, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77359062, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8345, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.01769578, + "balance_loss_mlp": 1.04121828, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 1.95370753247372, + "language_loss": 0.78477418, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80621803, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71484375, + "step": 8346, + "time_per_iteration": 5.420297861099243 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02408957, + "balance_loss_mlp": 1.03860831, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.6533044146295508, + "language_loss": 0.69167304, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71313995, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8347, + "time_per_iteration": 2.5022430419921875 + }, + { + "auxiliary_loss_clip": 0.01112834, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.02136123, + "balance_loss_mlp": 1.03990984, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.8545802319259819, + "language_loss": 0.71527761, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73674989, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8348, + "time_per_iteration": 2.491255760192871 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02089548, + "balance_loss_mlp": 1.04003596, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.6664488621380107, + "language_loss": 0.73957872, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76099503, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8349, + "time_per_iteration": 2.478173017501831 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.01726353, + "balance_loss_mlp": 1.03897953, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.0979883436616915, + "language_loss": 0.63680947, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65822613, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8350, + "time_per_iteration": 2.407949686050415 + }, + { + "auxiliary_loss_clip": 0.01035777, + "auxiliary_loss_mlp": 0.01011664, + "balance_loss_clip": 1.01064515, + "balance_loss_mlp": 1.01269341, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 1.0786206787107346, + "language_loss": 0.59814817, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6186226, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.23144531, + "step": 8351, + "time_per_iteration": 3.199061393737793 + }, + { + "auxiliary_loss_clip": 0.01111613, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.0395788, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 2.3062568387149365, + "language_loss": 0.75367033, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77513033, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8352, + "time_per_iteration": 2.506408214569092 + }, + { + "auxiliary_loss_clip": 0.01113074, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.04205072, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6126052392954302, + "language_loss": 0.71743786, + "learning_rate": 2.082736990429464e-06, + "loss": 0.73889434, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8353, + "time_per_iteration": 2.469383478164673 + }, + { + "auxiliary_loss_clip": 0.01115894, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.02279735, + "balance_loss_mlp": 1.04492378, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 3.986170886248432, + "language_loss": 0.73818904, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75971609, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8354, + "time_per_iteration": 2.510967254638672 + }, + { + "auxiliary_loss_clip": 0.01111051, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.02324271, + "balance_loss_mlp": 1.04122615, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.6375075569861386, + "language_loss": 0.72198367, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74346024, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 8355, + "time_per_iteration": 2.5355918407440186 + }, + { + "auxiliary_loss_clip": 0.01113009, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.0234164, + "balance_loss_mlp": 1.04037476, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.5634548911110102, + "language_loss": 0.81171584, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83321553, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8356, + "time_per_iteration": 2.5366694927215576 + }, + { + "auxiliary_loss_clip": 0.01115056, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0234828, + "balance_loss_mlp": 1.03943825, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.216032444638608, + "language_loss": 0.76043326, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78196621, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7578125, + "step": 8357, + "time_per_iteration": 2.4454803466796875 + }, + { + "auxiliary_loss_clip": 0.01112875, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.0196929, + "balance_loss_mlp": 1.04054666, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.6874014883711121, + "language_loss": 0.75969183, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78116012, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 8358, + "time_per_iteration": 2.4932358264923096 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02163708, + "balance_loss_mlp": 1.04097748, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.322067399050787, + "language_loss": 0.72372258, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74518377, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8359, + "time_per_iteration": 2.500152826309204 + }, + { + "auxiliary_loss_clip": 0.01111655, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.02361679, + "balance_loss_mlp": 1.04144287, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6242275025336705, + "language_loss": 0.77095789, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79243731, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8360, + "time_per_iteration": 2.5194928646087646 + }, + { + "auxiliary_loss_clip": 0.01111322, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.04179871, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.6325944972725464, + "language_loss": 0.76545495, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78689528, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8361, + "time_per_iteration": 2.4667415618896484 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.01973319, + "balance_loss_mlp": 1.03841019, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.6123805658340187, + "language_loss": 0.84681976, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.86826181, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8362, + "time_per_iteration": 2.5463051795959473 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02232695, + "balance_loss_mlp": 1.03756952, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.4590070504225026, + "language_loss": 0.78211838, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80355728, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8363, + "time_per_iteration": 2.5163207054138184 + }, + { + "auxiliary_loss_clip": 0.0110737, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.04016399, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 3.0044110074814627, + "language_loss": 0.75747573, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77885795, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 8364, + "time_per_iteration": 2.490145444869995 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01749849, + "balance_loss_mlp": 1.03816295, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5639014752994398, + "language_loss": 0.69354087, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.7149018, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8365, + "time_per_iteration": 2.473787307739258 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.03982782, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.442330503817835, + "language_loss": 0.73213601, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75362265, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 8366, + "time_per_iteration": 2.549877405166626 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.02064812, + "balance_loss_mlp": 1.04103982, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4509464249778803, + "language_loss": 0.78301162, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80443466, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 8367, + "time_per_iteration": 2.495147705078125 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.0180459, + "balance_loss_mlp": 1.03853226, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.842981496070619, + "language_loss": 0.69923592, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72062624, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8368, + "time_per_iteration": 2.4830057621002197 + }, + { + "auxiliary_loss_clip": 0.01035945, + "auxiliary_loss_mlp": 0.01007176, + "balance_loss_clip": 1.00621665, + "balance_loss_mlp": 1.01321661, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8570502115037558, + "language_loss": 0.63344997, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65388119, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.22851562, + "step": 8369, + "time_per_iteration": 3.0224173069000244 + }, + { + "auxiliary_loss_clip": 0.0110829, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01973677, + "balance_loss_mlp": 1.03877878, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.153532760870157, + "language_loss": 0.60134995, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62274879, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8370, + "time_per_iteration": 2.570244073867798 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02094817, + "balance_loss_mlp": 1.03846478, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.5686803599666441, + "language_loss": 0.68485558, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.7063123, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8371, + "time_per_iteration": 2.5606741905212402 + }, + { + "auxiliary_loss_clip": 0.01110798, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.01710284, + "balance_loss_mlp": 1.04021561, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 2.6972353884187776, + "language_loss": 0.67238319, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.6937995, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8372, + "time_per_iteration": 2.5703678131103516 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 2.7198935997293683, + "language_loss": 0.66590893, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68735898, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8373, + "time_per_iteration": 2.526221513748169 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01558208, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.6286907446961802, + "language_loss": 0.74674404, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76809293, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8374, + "time_per_iteration": 2.488349199295044 + }, + { + "auxiliary_loss_clip": 0.01111709, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.5485355079726564, + "language_loss": 0.67947745, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70096987, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8375, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.02008343, + "balance_loss_mlp": 1.04047072, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.8481066708574578, + "language_loss": 0.78526819, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.8067522, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 8376, + "time_per_iteration": 2.468104124069214 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01704502, + "balance_loss_mlp": 1.03864694, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.8611372201727234, + "language_loss": 0.59723544, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61867571, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8377, + "time_per_iteration": 2.5277962684631348 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.02114892, + "balance_loss_mlp": 1.03836918, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9462161897860946, + "language_loss": 0.76360452, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78503865, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8378, + "time_per_iteration": 2.448323965072632 + }, + { + "auxiliary_loss_clip": 0.01109358, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.02211046, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.6531450393233522, + "language_loss": 0.74565625, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.7670989, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8379, + "time_per_iteration": 2.5036356449127197 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.01952767, + "balance_loss_mlp": 1.04144955, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 5.059413081923233, + "language_loss": 0.6692574, + "learning_rate": 2.072229431544548e-06, + "loss": 0.6906693, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8380, + "time_per_iteration": 2.524144411087036 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01879573, + "balance_loss_mlp": 1.03999329, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.7991215942112995, + "language_loss": 0.63869506, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66009307, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8381, + "time_per_iteration": 2.5605592727661133 + }, + { + "auxiliary_loss_clip": 0.01108854, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.02296555, + "balance_loss_mlp": 1.04009557, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.6170974847944384, + "language_loss": 0.67252153, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69396263, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8382, + "time_per_iteration": 2.5227982997894287 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.02614903, + "balance_loss_mlp": 1.04075313, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 2.0398701191748, + "language_loss": 0.62190729, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64346862, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8383, + "time_per_iteration": 2.43418288230896 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02198625, + "balance_loss_mlp": 1.03885436, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 3.355380782185913, + "language_loss": 0.67041314, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69182235, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 8384, + "time_per_iteration": 2.450605630874634 + }, + { + "auxiliary_loss_clip": 0.01112035, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.0412066, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.6534299501213623, + "language_loss": 0.70829523, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.72977579, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 8385, + "time_per_iteration": 3.9600095748901367 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01027134, + "balance_loss_clip": 1.0147717, + "balance_loss_mlp": 1.03961098, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 2.2280411323646687, + "language_loss": 0.83021009, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85154909, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8386, + "time_per_iteration": 2.5137035846710205 + }, + { + "auxiliary_loss_clip": 0.01109584, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03921139, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.4630184477724049, + "language_loss": 0.66776884, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.6892125, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8387, + "time_per_iteration": 5.38523268699646 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01029754, + "balance_loss_clip": 1.01780963, + "balance_loss_mlp": 1.04077113, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.3874005116173278, + "language_loss": 0.80059648, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82199681, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8388, + "time_per_iteration": 3.938295364379883 + }, + { + "auxiliary_loss_clip": 0.01109371, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.0181793, + "balance_loss_mlp": 1.03903794, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.6549702991910453, + "language_loss": 0.69832838, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.71972561, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8389, + "time_per_iteration": 2.514204978942871 + }, + { + "auxiliary_loss_clip": 0.01110176, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02127957, + "balance_loss_mlp": 1.03844476, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.5923484046165255, + "language_loss": 0.69297862, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71441251, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8390, + "time_per_iteration": 2.517423152923584 + }, + { + "auxiliary_loss_clip": 0.01034589, + "auxiliary_loss_mlp": 0.01005008, + "balance_loss_clip": 1.00389957, + "balance_loss_mlp": 1.0117954, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8182221752596884, + "language_loss": 0.52977288, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55016881, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22851562, + "step": 8391, + "time_per_iteration": 2.8990061283111572 + }, + { + "auxiliary_loss_clip": 0.01034773, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 0.99989092, + "balance_loss_mlp": 1.01217151, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8813101083301623, + "language_loss": 0.60678625, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62714356, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.2265625, + "step": 8392, + "time_per_iteration": 2.91495680809021 + }, + { + "auxiliary_loss_clip": 0.01106534, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.03893185, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.5806327501196855, + "language_loss": 0.84691715, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86831182, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8393, + "time_per_iteration": 2.5033814907073975 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.01680708, + "balance_loss_mlp": 1.04046786, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.96195836984414, + "language_loss": 0.50628948, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.52768016, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8394, + "time_per_iteration": 2.492766857147217 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01906633, + "balance_loss_mlp": 1.03773594, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.6061893361767445, + "language_loss": 0.75181741, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7732237, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8395, + "time_per_iteration": 2.4661927223205566 + }, + { + "auxiliary_loss_clip": 0.01107947, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.01859236, + "balance_loss_mlp": 1.03834832, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.243385214175979, + "language_loss": 0.67677552, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69816345, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8396, + "time_per_iteration": 2.416499376296997 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.01623356, + "balance_loss_mlp": 1.0404129, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.7915756184866887, + "language_loss": 0.79064161, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81201625, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 8397, + "time_per_iteration": 2.5530309677124023 + }, + { + "auxiliary_loss_clip": 0.01107401, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.03848135, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.652903699623706, + "language_loss": 0.66017222, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68154037, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8398, + "time_per_iteration": 2.4544124603271484 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.017313, + "balance_loss_mlp": 1.0395267, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.639047703672107, + "language_loss": 0.71633506, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73772013, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8399, + "time_per_iteration": 2.5301358699798584 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02533388, + "balance_loss_mlp": 1.03947675, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.6970917460172408, + "language_loss": 0.81506133, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83655393, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8400, + "time_per_iteration": 2.4705498218536377 + }, + { + "auxiliary_loss_clip": 0.01109099, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01716256, + "balance_loss_mlp": 1.03942847, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.8569234799708698, + "language_loss": 0.79040837, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81179667, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8401, + "time_per_iteration": 2.4791224002838135 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03751659, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.5775455049866824, + "language_loss": 0.69999743, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72139227, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8402, + "time_per_iteration": 2.5591325759887695 + }, + { + "auxiliary_loss_clip": 0.01105942, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.01515996, + "balance_loss_mlp": 1.03572834, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.822367858534602, + "language_loss": 0.68917859, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71050715, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 8403, + "time_per_iteration": 2.5292510986328125 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.02275133, + "balance_loss_mlp": 1.03929162, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.4593040849849852, + "language_loss": 0.85396838, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87537992, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8404, + "time_per_iteration": 2.4852187633514404 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.01565218, + "balance_loss_mlp": 1.03806567, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.5717367434630007, + "language_loss": 0.75364089, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77499014, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8405, + "time_per_iteration": 2.4850387573242188 + }, + { + "auxiliary_loss_clip": 0.01109835, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.01514542, + "balance_loss_mlp": 1.0388459, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.5541955318463554, + "language_loss": 0.72983336, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75121522, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8406, + "time_per_iteration": 2.59979510307312 + }, + { + "auxiliary_loss_clip": 0.01102813, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03577971, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.7094740961502104, + "language_loss": 0.76863986, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.7899577, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 8407, + "time_per_iteration": 2.527543067932129 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.01535106, + "balance_loss_mlp": 1.03706717, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.6525886874932982, + "language_loss": 0.63115776, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65251827, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8408, + "time_per_iteration": 2.53218150138855 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01871967, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.695436010833495, + "language_loss": 0.63705122, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65843707, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8409, + "time_per_iteration": 2.4916255474090576 + }, + { + "auxiliary_loss_clip": 0.01105638, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01410186, + "balance_loss_mlp": 1.03845859, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3247049855298083, + "language_loss": 0.70876539, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73007584, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 8410, + "time_per_iteration": 2.527935266494751 + }, + { + "auxiliary_loss_clip": 0.01107655, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02100372, + "balance_loss_mlp": 1.03812361, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5323244298402565, + "language_loss": 0.79243749, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81385016, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8411, + "time_per_iteration": 2.4926035404205322 + }, + { + "auxiliary_loss_clip": 0.01107995, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02357578, + "balance_loss_mlp": 1.03764153, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.7118743762511017, + "language_loss": 0.81584603, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83729643, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8412, + "time_per_iteration": 2.4696593284606934 + }, + { + "auxiliary_loss_clip": 0.0110966, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.02057767, + "balance_loss_mlp": 1.04071307, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 2.1036912411500555, + "language_loss": 0.80586725, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82728952, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8413, + "time_per_iteration": 2.4840738773345947 + }, + { + "auxiliary_loss_clip": 0.01111974, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.01959252, + "balance_loss_mlp": 1.04003644, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.7598991939758672, + "language_loss": 0.80167186, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82311857, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8414, + "time_per_iteration": 2.4437410831451416 + }, + { + "auxiliary_loss_clip": 0.01106268, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.0174123, + "balance_loss_mlp": 1.03536403, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.1880801569958486, + "language_loss": 0.62188816, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64324927, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8415, + "time_per_iteration": 2.617699384689331 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03840709, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.5996951654726725, + "language_loss": 0.81836188, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.8397311, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8416, + "time_per_iteration": 2.484717607498169 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.02253819, + "balance_loss_mlp": 1.04098511, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.7782267995500585, + "language_loss": 0.79110944, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81252885, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 8417, + "time_per_iteration": 2.544739246368408 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01789641, + "balance_loss_mlp": 1.03713858, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.8205649281423022, + "language_loss": 0.62930262, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65063727, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 8418, + "time_per_iteration": 2.4795963764190674 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02232397, + "balance_loss_mlp": 1.03859878, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.1933090002480182, + "language_loss": 0.77840686, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79984379, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8419, + "time_per_iteration": 2.491931915283203 + }, + { + "auxiliary_loss_clip": 0.0110836, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.01950645, + "balance_loss_mlp": 1.0373354, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.7154546366730201, + "language_loss": 0.77258635, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79399723, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8420, + "time_per_iteration": 2.5963363647460938 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.03782094, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.679092087125118, + "language_loss": 0.77511621, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79658306, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8421, + "time_per_iteration": 2.4954135417938232 + }, + { + "auxiliary_loss_clip": 0.01105449, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03668654, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4641430762434493, + "language_loss": 0.66987717, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69122434, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8422, + "time_per_iteration": 2.4802937507629395 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.01998544, + "balance_loss_mlp": 1.04081178, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.8050040320885787, + "language_loss": 0.81599188, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83741009, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8423, + "time_per_iteration": 2.591792345046997 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.01859319, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.8632464802837558, + "language_loss": 0.74227667, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76368636, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8424, + "time_per_iteration": 2.5076076984405518 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.02120495, + "balance_loss_mlp": 1.03742146, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.6339612294396895, + "language_loss": 0.71546394, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73685586, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8425, + "time_per_iteration": 2.570103406906128 + }, + { + "auxiliary_loss_clip": 0.01108568, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.02325118, + "balance_loss_mlp": 1.0379858, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.6987499343502257, + "language_loss": 0.78614688, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80758357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8426, + "time_per_iteration": 2.4616403579711914 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.02312577, + "balance_loss_mlp": 1.03994358, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.818748758654822, + "language_loss": 0.77855921, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80002636, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8427, + "time_per_iteration": 3.908625364303589 + }, + { + "auxiliary_loss_clip": 0.0110433, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.01382565, + "balance_loss_mlp": 1.03709817, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.8142719003609429, + "language_loss": 0.71444368, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73574793, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8428, + "time_per_iteration": 2.4540021419525146 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.01983786, + "balance_loss_mlp": 1.03622389, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6344761677930288, + "language_loss": 0.82693905, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84830469, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 8429, + "time_per_iteration": 3.977104902267456 + }, + { + "auxiliary_loss_clip": 0.01113682, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02267253, + "balance_loss_mlp": 1.04074979, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 2.1730745276419485, + "language_loss": 0.73167485, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75317407, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8430, + "time_per_iteration": 4.066487073898315 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 1.03904748, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.7614160050819483, + "language_loss": 0.76304209, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78445041, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8431, + "time_per_iteration": 2.459061861038208 + }, + { + "auxiliary_loss_clip": 0.01107362, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.0388869, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4179396940955034, + "language_loss": 0.72168291, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74307233, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8432, + "time_per_iteration": 2.4937191009521484 + }, + { + "auxiliary_loss_clip": 0.01040308, + "auxiliary_loss_mlp": 0.0100546, + "balance_loss_clip": 1.00428617, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7612043046384747, + "language_loss": 0.63704848, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65750623, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22753906, + "step": 8433, + "time_per_iteration": 3.10312819480896 + }, + { + "auxiliary_loss_clip": 0.01109071, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02276051, + "balance_loss_mlp": 1.0391171, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 1.7667352609332163, + "language_loss": 0.77104461, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79249096, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8434, + "time_per_iteration": 2.4761765003204346 + }, + { + "auxiliary_loss_clip": 0.01110101, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01997447, + "balance_loss_mlp": 1.03937244, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.7167508969307774, + "language_loss": 0.71062863, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73205119, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8435, + "time_per_iteration": 2.476259231567383 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.04086459, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 2.1519666669040407, + "language_loss": 0.71635526, + "learning_rate": 2.050429942372112e-06, + "loss": 0.73781812, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8436, + "time_per_iteration": 2.4717278480529785 + }, + { + "auxiliary_loss_clip": 0.0111073, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01621652, + "balance_loss_mlp": 1.04132712, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5051036444651287, + "language_loss": 0.8370682, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85846984, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8437, + "time_per_iteration": 2.51187801361084 + }, + { + "auxiliary_loss_clip": 0.01107572, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.01340485, + "balance_loss_mlp": 1.03941774, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.8339895444539178, + "language_loss": 0.80925703, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83058763, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8438, + "time_per_iteration": 2.5101053714752197 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.023947, + "balance_loss_mlp": 1.04053128, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.458277190934999, + "language_loss": 0.79797888, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81948024, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8439, + "time_per_iteration": 2.5196681022644043 + }, + { + "auxiliary_loss_clip": 0.01107511, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02176344, + "balance_loss_mlp": 1.03948164, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5054968059802218, + "language_loss": 0.7129699, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73437822, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8440, + "time_per_iteration": 2.482475757598877 + }, + { + "auxiliary_loss_clip": 0.01110635, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.01877761, + "balance_loss_mlp": 1.03933895, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6937518353915977, + "language_loss": 0.70555139, + "learning_rate": 2.048483229511158e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8441, + "time_per_iteration": 2.5299065113067627 + }, + { + "auxiliary_loss_clip": 0.01113885, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.0219456, + "balance_loss_mlp": 1.04142308, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.8980066327338418, + "language_loss": 0.63670987, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65819889, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8442, + "time_per_iteration": 2.4623775482177734 + }, + { + "auxiliary_loss_clip": 0.01108296, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.016011, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.5153774279484464, + "language_loss": 0.7150898, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73644972, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 8443, + "time_per_iteration": 2.586273670196533 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02046299, + "balance_loss_mlp": 1.03887248, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.223488951652841, + "language_loss": 0.61766541, + "learning_rate": 2.047315179614607e-06, + "loss": 0.63911152, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8444, + "time_per_iteration": 2.5941321849823 + }, + { + "auxiliary_loss_clip": 0.01107921, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02056909, + "balance_loss_mlp": 1.0380075, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.7476957798256931, + "language_loss": 0.6370405, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65844774, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 8445, + "time_per_iteration": 2.622295379638672 + }, + { + "auxiliary_loss_clip": 0.01042597, + "auxiliary_loss_mlp": 0.01005213, + "balance_loss_clip": 1.00411069, + "balance_loss_mlp": 1.019732, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8272934825203048, + "language_loss": 0.61873507, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.6392132, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.22851562, + "step": 8446, + "time_per_iteration": 3.106067180633545 + }, + { + "auxiliary_loss_clip": 0.01107421, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01656127, + "balance_loss_mlp": 1.03849411, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.6783761303243148, + "language_loss": 0.80458808, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82595056, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8447, + "time_per_iteration": 2.483449935913086 + }, + { + "auxiliary_loss_clip": 0.01109683, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.01955903, + "balance_loss_mlp": 1.04166472, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.6097524760484219, + "language_loss": 0.70526159, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72667593, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 8448, + "time_per_iteration": 2.5377211570739746 + }, + { + "auxiliary_loss_clip": 0.01108561, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.01906157, + "balance_loss_mlp": 1.04054332, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.775058362169557, + "language_loss": 0.72186208, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74325454, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8449, + "time_per_iteration": 2.6247637271881104 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01862371, + "balance_loss_mlp": 1.0373019, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4717194557779922, + "language_loss": 0.72751403, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74887294, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8450, + "time_per_iteration": 2.5097148418426514 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02104533, + "balance_loss_mlp": 1.04217696, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.631370100986613, + "language_loss": 0.7704621, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.7919184, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8451, + "time_per_iteration": 2.5109496116638184 + }, + { + "auxiliary_loss_clip": 0.01109885, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.02502477, + "balance_loss_mlp": 1.03928411, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.7784899256909827, + "language_loss": 0.8518312, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8732987, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8452, + "time_per_iteration": 2.4603476524353027 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.04209125, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.2856093940760274, + "language_loss": 0.78046912, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80198884, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8453, + "time_per_iteration": 2.450873613357544 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02200532, + "balance_loss_mlp": 1.03973246, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.6556718901191125, + "language_loss": 0.7626555, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78406799, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8454, + "time_per_iteration": 2.4831783771514893 + }, + { + "auxiliary_loss_clip": 0.01110162, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03985167, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.7440679508015728, + "language_loss": 0.89345592, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91488367, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8455, + "time_per_iteration": 2.48486590385437 + }, + { + "auxiliary_loss_clip": 0.01116133, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.02230144, + "balance_loss_mlp": 1.04198599, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 2.029385394187206, + "language_loss": 0.62613618, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64765751, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8456, + "time_per_iteration": 2.476060390472412 + }, + { + "auxiliary_loss_clip": 0.01038842, + "auxiliary_loss_mlp": 0.00998694, + "balance_loss_clip": 0.99766272, + "balance_loss_mlp": 1.01592362, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.816065361839575, + "language_loss": 0.62538505, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64576042, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.22949219, + "step": 8457, + "time_per_iteration": 2.9627416133880615 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.02069306, + "balance_loss_mlp": 1.04062462, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.5574868486202833, + "language_loss": 0.67412502, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69556904, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8458, + "time_per_iteration": 2.4851465225219727 + }, + { + "auxiliary_loss_clip": 0.01109854, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01840341, + "balance_loss_mlp": 1.03811622, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.6253676139168076, + "language_loss": 0.77861875, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80003208, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8459, + "time_per_iteration": 2.5043020248413086 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.04386926, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.213093169353168, + "language_loss": 0.81109118, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83262426, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8460, + "time_per_iteration": 2.4239838123321533 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.02565289, + "balance_loss_mlp": 1.03999329, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.5640945155523684, + "language_loss": 0.6866132, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70810497, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8461, + "time_per_iteration": 2.469954490661621 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03908265, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.5611830538381608, + "language_loss": 0.76059598, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.7819975, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8462, + "time_per_iteration": 2.4907591342926025 + }, + { + "auxiliary_loss_clip": 0.01111001, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.02376187, + "balance_loss_mlp": 1.04031515, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 1.977849325123916, + "language_loss": 0.8121528, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83362508, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 8463, + "time_per_iteration": 2.460604190826416 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.02527571, + "balance_loss_mlp": 1.03999758, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7045720874408852, + "language_loss": 0.7630803, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78454363, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8464, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01037487, + "auxiliary_loss_mlp": 0.01005228, + "balance_loss_clip": 1.00426793, + "balance_loss_mlp": 1.01476121, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.687733273493157, + "language_loss": 0.59352195, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61394918, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.2265625, + "step": 8465, + "time_per_iteration": 3.1989307403564453 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.02263045, + "balance_loss_mlp": 1.03822207, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.7579634525926484, + "language_loss": 0.79857922, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81999815, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8466, + "time_per_iteration": 2.472186326980591 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.01950181, + "balance_loss_mlp": 1.03679371, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.5999387152583837, + "language_loss": 0.78222281, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80359334, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8467, + "time_per_iteration": 2.4692180156707764 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.02041364, + "balance_loss_mlp": 1.03994191, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.7540939283261232, + "language_loss": 0.7467652, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76815927, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8468, + "time_per_iteration": 3.8722333908081055 + }, + { + "auxiliary_loss_clip": 0.01107691, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01877546, + "balance_loss_mlp": 1.03856027, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.7320149470681812, + "language_loss": 0.77835757, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79974556, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8469, + "time_per_iteration": 2.4514496326446533 + }, + { + "auxiliary_loss_clip": 0.01112445, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.0221895, + "balance_loss_mlp": 1.04265046, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5013208791161945, + "language_loss": 0.69422746, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71570665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8470, + "time_per_iteration": 2.5658817291259766 + }, + { + "auxiliary_loss_clip": 0.01112957, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01997817, + "balance_loss_mlp": 1.04058552, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.018231732442679, + "language_loss": 0.73409355, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75555384, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8471, + "time_per_iteration": 5.355906009674072 + }, + { + "auxiliary_loss_clip": 0.01036047, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00242805, + "balance_loss_mlp": 1.01322865, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7572542385247485, + "language_loss": 0.58153868, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60193354, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22851562, + "step": 8472, + "time_per_iteration": 3.0752861499786377 + }, + { + "auxiliary_loss_clip": 0.01111139, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.02100456, + "balance_loss_mlp": 1.04138827, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.783541878810952, + "language_loss": 0.69200397, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71344012, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 8473, + "time_per_iteration": 2.4832053184509277 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.02144074, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 2.2073606957030143, + "language_loss": 0.85564739, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87707734, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8474, + "time_per_iteration": 2.5068845748901367 + }, + { + "auxiliary_loss_clip": 0.01110669, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01944494, + "balance_loss_mlp": 1.03983307, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.014074019348489, + "language_loss": 0.64659619, + "learning_rate": 2.035244457765222e-06, + "loss": 0.66802263, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8475, + "time_per_iteration": 2.4363739490509033 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.02557325, + "balance_loss_mlp": 1.04094887, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 4.024838672705198, + "language_loss": 0.81962836, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84116852, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 8476, + "time_per_iteration": 2.448249578475952 + }, + { + "auxiliary_loss_clip": 0.01111186, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.02019382, + "balance_loss_mlp": 1.03794646, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.9611523426566915, + "language_loss": 0.81148994, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83295757, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.734375, + "step": 8477, + "time_per_iteration": 2.470248222351074 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.01584899, + "balance_loss_mlp": 1.03962493, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.8342280591951767, + "language_loss": 0.61682522, + "learning_rate": 2.034076248204082e-06, + "loss": 0.6382364, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8478, + "time_per_iteration": 2.4439172744750977 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01037275, + "balance_loss_clip": 1.02540779, + "balance_loss_mlp": 1.03930426, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.4883331760724325, + "language_loss": 0.65860271, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.6800639, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 8479, + "time_per_iteration": 2.4965710639953613 + }, + { + "auxiliary_loss_clip": 0.01107177, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01974487, + "balance_loss_mlp": 1.0389936, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.620468938265791, + "language_loss": 0.69455707, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71594626, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 8480, + "time_per_iteration": 2.4500057697296143 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.01871157, + "balance_loss_mlp": 1.03733814, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.6808533459383284, + "language_loss": 0.79027826, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81168693, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8481, + "time_per_iteration": 2.507157564163208 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.02170324, + "balance_loss_mlp": 1.03702283, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.5080021873745288, + "language_loss": 0.83429766, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85568231, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 8482, + "time_per_iteration": 2.4544076919555664 + }, + { + "auxiliary_loss_clip": 0.0111291, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.01925349, + "balance_loss_mlp": 1.03990221, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.7853243252822575, + "language_loss": 0.85625446, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87771249, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 8483, + "time_per_iteration": 2.519747734069824 + }, + { + "auxiliary_loss_clip": 0.01107969, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03712344, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.7164607290812173, + "language_loss": 0.83208412, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85348231, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8484, + "time_per_iteration": 2.4549949169158936 + }, + { + "auxiliary_loss_clip": 0.01109177, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0154798, + "balance_loss_mlp": 1.03849459, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.0216137506651983, + "language_loss": 0.81388122, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83525884, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8485, + "time_per_iteration": 2.4612390995025635 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02036929, + "balance_loss_mlp": 1.03675199, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.1191716083834025, + "language_loss": 0.73653662, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.7578969, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 8486, + "time_per_iteration": 2.426042318344116 + }, + { + "auxiliary_loss_clip": 0.01112031, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.01824152, + "balance_loss_mlp": 1.03990436, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4808929350883289, + "language_loss": 0.69956315, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72099566, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 8487, + "time_per_iteration": 2.5032570362091064 + }, + { + "auxiliary_loss_clip": 0.01108669, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.01987231, + "balance_loss_mlp": 1.04012084, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 1.9552461936614123, + "language_loss": 0.72984374, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75126404, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 8488, + "time_per_iteration": 2.454589605331421 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02002835, + "balance_loss_mlp": 1.03795087, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.814097723080907, + "language_loss": 0.69584548, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71725714, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8489, + "time_per_iteration": 2.4295358657836914 + }, + { + "auxiliary_loss_clip": 0.01108544, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01800227, + "balance_loss_mlp": 1.03788161, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.8877500438207433, + "language_loss": 0.72447532, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.7458632, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8490, + "time_per_iteration": 2.484398603439331 + }, + { + "auxiliary_loss_clip": 0.01105533, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.01649261, + "balance_loss_mlp": 1.03803921, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.594832362291185, + "language_loss": 0.80287743, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82421523, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 8491, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.0155549, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.311833139697555, + "language_loss": 0.79033649, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81164801, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 8492, + "time_per_iteration": 2.4697651863098145 + }, + { + "auxiliary_loss_clip": 0.01114847, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.02560329, + "balance_loss_mlp": 1.04234147, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 2.1680982451379607, + "language_loss": 0.77821648, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79974937, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 8493, + "time_per_iteration": 2.490349054336548 + }, + { + "auxiliary_loss_clip": 0.01109447, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.01608634, + "balance_loss_mlp": 1.03989387, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.213061013784994, + "language_loss": 0.83690828, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85829687, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8494, + "time_per_iteration": 2.4604976177215576 + }, + { + "auxiliary_loss_clip": 0.01112511, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.02189648, + "balance_loss_mlp": 1.04180336, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.8678450133518327, + "language_loss": 0.79117751, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81263626, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.70703125, + "step": 8495, + "time_per_iteration": 2.5202648639678955 + }, + { + "auxiliary_loss_clip": 0.01109453, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02276945, + "balance_loss_mlp": 1.04033172, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.5685043948688704, + "language_loss": 0.78221929, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80366194, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8496, + "time_per_iteration": 2.499793767929077 + }, + { + "auxiliary_loss_clip": 0.01105005, + "auxiliary_loss_mlp": 0.01026512, + "balance_loss_clip": 1.01508582, + "balance_loss_mlp": 1.03803635, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.9336450862291243, + "language_loss": 0.7876817, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8089968, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 8497, + "time_per_iteration": 2.450246572494507 + }, + { + "auxiliary_loss_clip": 0.01106851, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.0203619, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.6296784083005205, + "language_loss": 0.8186121, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84000313, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8498, + "time_per_iteration": 2.4860284328460693 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01559711, + "balance_loss_mlp": 1.03989053, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.9511970266493632, + "language_loss": 0.71084464, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73219806, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 8499, + "time_per_iteration": 2.488870859146118 + }, + { + "auxiliary_loss_clip": 0.01108699, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01899862, + "balance_loss_mlp": 1.03962827, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.470448999091522, + "language_loss": 0.72600758, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74740595, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8500, + "time_per_iteration": 2.554612874984741 + }, + { + "auxiliary_loss_clip": 0.01113166, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.01870334, + "balance_loss_mlp": 1.03988254, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.631045408977224, + "language_loss": 0.63011086, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65156412, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8501, + "time_per_iteration": 2.4470977783203125 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.03708565, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7479031643347964, + "language_loss": 0.8759163, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89734155, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8502, + "time_per_iteration": 2.4252443313598633 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.0349071, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.3950925269756227, + "language_loss": 0.82526219, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84663093, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8503, + "time_per_iteration": 2.5170319080352783 + }, + { + "auxiliary_loss_clip": 0.01038121, + "auxiliary_loss_mlp": 0.01001996, + "balance_loss_clip": 1.00103021, + "balance_loss_mlp": 1.01512361, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8658208518316733, + "language_loss": 0.63857049, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65897167, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.23046875, + "step": 8504, + "time_per_iteration": 3.098529577255249 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.01815391, + "balance_loss_mlp": 1.03960776, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 3.195489539056655, + "language_loss": 0.84326482, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86465514, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 8505, + "time_per_iteration": 2.5145134925842285 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0399797, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.9725783043316722, + "language_loss": 0.75117159, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77251446, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 8506, + "time_per_iteration": 2.529463052749634 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.01951551, + "balance_loss_mlp": 1.03808045, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.6477689192158658, + "language_loss": 0.58288801, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60429621, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8507, + "time_per_iteration": 2.515449047088623 + }, + { + "auxiliary_loss_clip": 0.01111035, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02690697, + "balance_loss_mlp": 1.04132211, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.6046089096743523, + "language_loss": 0.85276306, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87427151, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8508, + "time_per_iteration": 2.4760663509368896 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.02051985, + "balance_loss_mlp": 1.03969765, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.6014168180464263, + "language_loss": 0.72123772, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74267876, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8509, + "time_per_iteration": 2.5354809761047363 + }, + { + "auxiliary_loss_clip": 0.01107381, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.02060962, + "balance_loss_mlp": 1.03980041, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.6675565589278303, + "language_loss": 0.75862014, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78001392, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8510, + "time_per_iteration": 3.945136785507202 + }, + { + "auxiliary_loss_clip": 0.01108162, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.02163482, + "balance_loss_mlp": 1.04065561, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.6646040073598372, + "language_loss": 0.71192694, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73334503, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 8511, + "time_per_iteration": 2.541703701019287 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01763797, + "balance_loss_mlp": 1.03958058, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.060947746528677, + "language_loss": 0.66430634, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68565977, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 8512, + "time_per_iteration": 5.427145481109619 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.02326632, + "balance_loss_mlp": 1.03883505, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 2.433145093070313, + "language_loss": 0.66578728, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.6872499, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8513, + "time_per_iteration": 3.935227870941162 + }, + { + "auxiliary_loss_clip": 0.01106032, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02099788, + "balance_loss_mlp": 1.03927946, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.0509279474405115, + "language_loss": 0.69136906, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71276104, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 8514, + "time_per_iteration": 2.5390119552612305 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.01599109, + "balance_loss_mlp": 1.03685427, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.6362442678403473, + "language_loss": 0.66014814, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68144739, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 8515, + "time_per_iteration": 2.492664098739624 + }, + { + "auxiliary_loss_clip": 0.01103893, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.02031612, + "balance_loss_mlp": 1.03691602, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.89314496105325, + "language_loss": 0.74966168, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77101815, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8516, + "time_per_iteration": 2.5428519248962402 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02181602, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.7790403014833382, + "language_loss": 0.77862155, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80007005, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8517, + "time_per_iteration": 2.4259724617004395 + }, + { + "auxiliary_loss_clip": 0.01110887, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01897407, + "balance_loss_mlp": 1.03983212, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7905284866787141, + "language_loss": 0.73672384, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75814688, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8518, + "time_per_iteration": 2.5707037448883057 + }, + { + "auxiliary_loss_clip": 0.01107458, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.02557039, + "balance_loss_mlp": 1.03892565, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.6752140453085944, + "language_loss": 0.78055197, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80200136, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8519, + "time_per_iteration": 2.417372226715088 + }, + { + "auxiliary_loss_clip": 0.01109296, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02049518, + "balance_loss_mlp": 1.04082775, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.573776111474748, + "language_loss": 0.79204106, + "learning_rate": 2.017720274652497e-06, + "loss": 0.8134582, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8520, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.01112541, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.02623105, + "balance_loss_mlp": 1.03924751, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.6319482307550086, + "language_loss": 0.81403995, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83556241, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8521, + "time_per_iteration": 2.4723713397979736 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03599286, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.90297827684807, + "language_loss": 0.68368387, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70504206, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8522, + "time_per_iteration": 2.516411066055298 + }, + { + "auxiliary_loss_clip": 0.01115928, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02506292, + "balance_loss_mlp": 1.04201221, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 2.718510344621862, + "language_loss": 0.6155864, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63715655, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.73828125, + "step": 8523, + "time_per_iteration": 2.524775266647339 + }, + { + "auxiliary_loss_clip": 0.01110788, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.0238173, + "balance_loss_mlp": 1.04113579, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.0609816781673884, + "language_loss": 0.78066456, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80212736, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 8524, + "time_per_iteration": 2.526226043701172 + }, + { + "auxiliary_loss_clip": 0.01109029, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02350545, + "balance_loss_mlp": 1.0413003, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.8496964430325211, + "language_loss": 0.75055063, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77199042, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 8525, + "time_per_iteration": 2.432555913925171 + }, + { + "auxiliary_loss_clip": 0.01112941, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.02385902, + "balance_loss_mlp": 1.04111516, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.559913373859493, + "language_loss": 0.74452645, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76602304, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8526, + "time_per_iteration": 2.6282670497894287 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02799845, + "balance_loss_mlp": 1.04028583, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.490779495017149, + "language_loss": 0.65322489, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67473614, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8527, + "time_per_iteration": 2.467350482940674 + }, + { + "auxiliary_loss_clip": 0.01108518, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02852428, + "balance_loss_mlp": 1.04277444, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.5603597457219889, + "language_loss": 0.74514449, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76662612, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 8528, + "time_per_iteration": 2.513795852661133 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02236843, + "balance_loss_mlp": 1.03608227, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.756255656529514, + "language_loss": 0.83061087, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85200721, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8529, + "time_per_iteration": 2.4574379920959473 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.02045822, + "balance_loss_mlp": 1.03895748, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.6787234743344808, + "language_loss": 0.73559862, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75699604, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8530, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01039899, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.00278807, + "balance_loss_mlp": 1.01703906, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7465649329198393, + "language_loss": 0.60806251, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.6285013, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.22851562, + "step": 8531, + "time_per_iteration": 3.1615967750549316 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.01732779, + "balance_loss_mlp": 1.04014051, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6561974446519532, + "language_loss": 0.76540768, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78680408, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8532, + "time_per_iteration": 2.4836883544921875 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.01894033, + "balance_loss_mlp": 1.03866601, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.847315245703251, + "language_loss": 0.67183244, + "learning_rate": 2.012657420152597e-06, + "loss": 0.6932264, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8533, + "time_per_iteration": 2.6025052070617676 + }, + { + "auxiliary_loss_clip": 0.01112515, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.02333999, + "balance_loss_mlp": 1.04080868, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.8363553974693196, + "language_loss": 0.81724054, + "learning_rate": 2.01226796603315e-06, + "loss": 0.83873212, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8534, + "time_per_iteration": 2.465374231338501 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02167177, + "balance_loss_mlp": 1.0399549, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.5787063577136407, + "language_loss": 0.63588178, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65734923, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 8535, + "time_per_iteration": 2.50287127494812 + }, + { + "auxiliary_loss_clip": 0.01111823, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01434922, + "balance_loss_mlp": 1.04166365, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.5428442042942097, + "language_loss": 0.69746888, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71885574, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8536, + "time_per_iteration": 2.459897041320801 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.04082823, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.3299626101952784, + "language_loss": 0.71215963, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73363328, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8537, + "time_per_iteration": 2.4840991497039795 + }, + { + "auxiliary_loss_clip": 0.01111456, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.02013016, + "balance_loss_mlp": 1.03927016, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 6.302946358508802, + "language_loss": 0.80441952, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82586539, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8538, + "time_per_iteration": 2.4378812313079834 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.01880276, + "balance_loss_mlp": 1.03764546, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.8808034234185624, + "language_loss": 0.78517324, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80656898, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8539, + "time_per_iteration": 2.5144600868225098 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02025044, + "balance_loss_mlp": 1.04009342, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.5130664168284647, + "language_loss": 0.75880563, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78025699, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8540, + "time_per_iteration": 2.55734920501709 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.04176068, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.8219986700547555, + "language_loss": 0.74552548, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76700193, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73046875, + "step": 8541, + "time_per_iteration": 2.432055711746216 + }, + { + "auxiliary_loss_clip": 0.01110326, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02005482, + "balance_loss_mlp": 1.03941679, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.945278300015613, + "language_loss": 0.70215029, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72358692, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8542, + "time_per_iteration": 2.5227723121643066 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.01403403, + "balance_loss_mlp": 1.04146171, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.83289507202946, + "language_loss": 0.78898811, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.8103835, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8543, + "time_per_iteration": 2.4559075832366943 + }, + { + "auxiliary_loss_clip": 0.0111214, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02426672, + "balance_loss_mlp": 1.04161441, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.9171309591761885, + "language_loss": 0.68051696, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70201409, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8544, + "time_per_iteration": 2.5344274044036865 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02671301, + "balance_loss_mlp": 1.04096842, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.2205990317105395, + "language_loss": 0.7225253, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74405491, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8545, + "time_per_iteration": 2.4303176403045654 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02431881, + "balance_loss_mlp": 1.03957486, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.967971348268394, + "language_loss": 0.81898367, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84048629, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8546, + "time_per_iteration": 2.4504597187042236 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02099776, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.6545588723955058, + "language_loss": 0.73301136, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75446492, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8547, + "time_per_iteration": 2.4682819843292236 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_clip": 1.03010488, + "balance_loss_mlp": 1.03783822, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.620202866362127, + "language_loss": 0.73577881, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75729811, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8548, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.01110019, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02117133, + "balance_loss_mlp": 1.03852081, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.506476906057379, + "language_loss": 0.82239324, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84383494, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8549, + "time_per_iteration": 2.433605194091797 + }, + { + "auxiliary_loss_clip": 0.01110043, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.02292621, + "balance_loss_mlp": 1.04096317, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.8131541317091766, + "language_loss": 0.72331119, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.7447629, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8550, + "time_per_iteration": 2.4659972190856934 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02173352, + "balance_loss_mlp": 1.0404501, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.6035097357113468, + "language_loss": 0.75497758, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77646863, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 8551, + "time_per_iteration": 2.453734874725342 + }, + { + "auxiliary_loss_clip": 0.01108366, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.01805425, + "balance_loss_mlp": 1.04017091, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6015349884444547, + "language_loss": 0.69001007, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71140003, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8552, + "time_per_iteration": 3.9047505855560303 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01927972, + "balance_loss_mlp": 1.03868091, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.7916575293353634, + "language_loss": 0.74736363, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76878798, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8553, + "time_per_iteration": 2.5039455890655518 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.0397613, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.707634664835445, + "language_loss": 0.68126231, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70271206, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8554, + "time_per_iteration": 5.488779544830322 + }, + { + "auxiliary_loss_clip": 0.01112685, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.02173042, + "balance_loss_mlp": 1.03879559, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 2.3217393931515846, + "language_loss": 0.73303884, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75452876, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.73828125, + "step": 8555, + "time_per_iteration": 3.866107940673828 + }, + { + "auxiliary_loss_clip": 0.01111396, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.02278817, + "balance_loss_mlp": 1.04023397, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.3509367679077124, + "language_loss": 0.74724478, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76871467, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 8556, + "time_per_iteration": 2.423941135406494 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.03695798, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7510489074761373, + "language_loss": 0.86147487, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88286483, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8557, + "time_per_iteration": 2.4232289791107178 + }, + { + "auxiliary_loss_clip": 0.01105513, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03741109, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.4648111070630687, + "language_loss": 0.89026904, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91165608, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8558, + "time_per_iteration": 2.4937002658843994 + }, + { + "auxiliary_loss_clip": 0.01106843, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.03844643, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.959206520418211, + "language_loss": 0.65027267, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67166239, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8559, + "time_per_iteration": 2.4625425338745117 + }, + { + "auxiliary_loss_clip": 0.01109462, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02255082, + "balance_loss_mlp": 1.04041696, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.5416961138531182, + "language_loss": 0.62973124, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65117842, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8560, + "time_per_iteration": 2.509413719177246 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.01798463, + "balance_loss_mlp": 1.03850913, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.5387222778191898, + "language_loss": 0.69879884, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72017759, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 8561, + "time_per_iteration": 2.4802825450897217 + }, + { + "auxiliary_loss_clip": 0.01108154, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01894569, + "balance_loss_mlp": 1.03752971, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5731273846161422, + "language_loss": 0.66646934, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68785918, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.70703125, + "step": 8562, + "time_per_iteration": 2.505180835723877 + }, + { + "auxiliary_loss_clip": 0.01110444, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.01824713, + "balance_loss_mlp": 1.03924227, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.6680045222139546, + "language_loss": 0.77707577, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79848886, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8563, + "time_per_iteration": 2.4935452938079834 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.01932585, + "balance_loss_mlp": 1.03827047, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.1629374301288284, + "language_loss": 0.82324845, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84471083, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 8564, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01112048, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.0223794, + "balance_loss_mlp": 1.03859615, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.7207643570499924, + "language_loss": 0.73255235, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75402838, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8565, + "time_per_iteration": 2.471970558166504 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.03977931, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.8782058792026062, + "language_loss": 0.683079, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70455092, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 8566, + "time_per_iteration": 2.4981720447540283 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03583431, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 2.0482874573832177, + "language_loss": 0.78111541, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80249971, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 8567, + "time_per_iteration": 2.490272045135498 + }, + { + "auxiliary_loss_clip": 0.01113521, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.02054214, + "balance_loss_mlp": 1.04046249, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.0737995601061274, + "language_loss": 0.790721, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81219578, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8568, + "time_per_iteration": 2.602315902709961 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01885664, + "balance_loss_mlp": 1.03637588, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.0499636702484945, + "language_loss": 0.90935498, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93073106, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8569, + "time_per_iteration": 2.430600643157959 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.03865302, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.6639049645433037, + "language_loss": 0.76229095, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78369409, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8570, + "time_per_iteration": 2.48988676071167 + }, + { + "auxiliary_loss_clip": 0.01108277, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.02357769, + "balance_loss_mlp": 1.03741157, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.5896565556148876, + "language_loss": 0.7375021, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75895989, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8571, + "time_per_iteration": 2.618849754333496 + }, + { + "auxiliary_loss_clip": 0.01035305, + "auxiliary_loss_mlp": 0.00998776, + "balance_loss_clip": 0.99780464, + "balance_loss_mlp": 1.0127461, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7780004501915253, + "language_loss": 0.52940249, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54974329, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.22558594, + "step": 8572, + "time_per_iteration": 3.1418654918670654 + }, + { + "auxiliary_loss_clip": 0.01108043, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.02087331, + "balance_loss_mlp": 1.04004169, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.7275406058075027, + "language_loss": 0.76217729, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78358561, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8573, + "time_per_iteration": 2.4757239818573 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01627111, + "balance_loss_mlp": 1.03679562, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.9279490614808483, + "language_loss": 0.77039665, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79174697, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8574, + "time_per_iteration": 2.478935718536377 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.01714277, + "balance_loss_mlp": 1.03757906, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.6824577114627284, + "language_loss": 0.85421538, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87558043, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 8575, + "time_per_iteration": 2.4811151027679443 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01971185, + "balance_loss_mlp": 1.03703451, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.6692718685381052, + "language_loss": 0.76704675, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78844833, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8576, + "time_per_iteration": 2.490389108657837 + }, + { + "auxiliary_loss_clip": 0.011134, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01836872, + "balance_loss_mlp": 1.03960061, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 3.052053268886893, + "language_loss": 0.75463682, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77608645, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8577, + "time_per_iteration": 2.416757583618164 + }, + { + "auxiliary_loss_clip": 0.0111005, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02441728, + "balance_loss_mlp": 1.0376997, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.834882992604573, + "language_loss": 0.80803275, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.82950842, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8578, + "time_per_iteration": 2.517292022705078 + }, + { + "auxiliary_loss_clip": 0.01104508, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02046442, + "balance_loss_mlp": 1.0357188, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.7011032882300805, + "language_loss": 0.76299787, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78436846, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8579, + "time_per_iteration": 2.4907805919647217 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.01890254, + "balance_loss_mlp": 1.03864014, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.5884760036798964, + "language_loss": 0.79018867, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81159854, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8580, + "time_per_iteration": 2.490298271179199 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.0237354, + "balance_loss_mlp": 1.03874159, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.2420547036898277, + "language_loss": 0.72657341, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74805963, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8581, + "time_per_iteration": 2.419196367263794 + }, + { + "auxiliary_loss_clip": 0.01107618, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01844954, + "balance_loss_mlp": 1.03848028, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.260602789840083, + "language_loss": 0.74468267, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76606196, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8582, + "time_per_iteration": 2.4235429763793945 + }, + { + "auxiliary_loss_clip": 0.01107491, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.02201486, + "balance_loss_mlp": 1.03820109, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 3.661326019284234, + "language_loss": 0.66308093, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68449032, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 8583, + "time_per_iteration": 2.483489990234375 + }, + { + "auxiliary_loss_clip": 0.0111088, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.02315259, + "balance_loss_mlp": 1.04015112, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4772972874821377, + "language_loss": 0.75878769, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78025782, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8584, + "time_per_iteration": 2.469770908355713 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.026057, + "balance_loss_mlp": 1.03763115, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 1.908038470800245, + "language_loss": 0.78773153, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80921382, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 8585, + "time_per_iteration": 2.4765405654907227 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01976502, + "balance_loss_mlp": 1.03624129, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.394419079152278, + "language_loss": 0.81022364, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83157325, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 8586, + "time_per_iteration": 2.45131254196167 + }, + { + "auxiliary_loss_clip": 0.01107797, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.02106369, + "balance_loss_mlp": 1.03754663, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 2.0375667228771572, + "language_loss": 0.71716821, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73858047, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.703125, + "step": 8587, + "time_per_iteration": 2.464603900909424 + }, + { + "auxiliary_loss_clip": 0.0103385, + "auxiliary_loss_mlp": 0.01011507, + "balance_loss_clip": 1.01052976, + "balance_loss_mlp": 1.01128352, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7317367951541988, + "language_loss": 0.57798368, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59843719, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.22558594, + "step": 8588, + "time_per_iteration": 3.0708353519439697 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.02560759, + "balance_loss_mlp": 1.03631115, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.9433685436573729, + "language_loss": 0.7553345, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77678907, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8589, + "time_per_iteration": 2.4392945766448975 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02110088, + "balance_loss_mlp": 1.03822279, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 2.018268520776434, + "language_loss": 0.67597556, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69738752, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8590, + "time_per_iteration": 2.480978012084961 + }, + { + "auxiliary_loss_clip": 0.01034536, + "auxiliary_loss_mlp": 0.01003309, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.01181984, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.7844517010344912, + "language_loss": 0.5593977, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57977605, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.2265625, + "step": 8591, + "time_per_iteration": 3.0380799770355225 + }, + { + "auxiliary_loss_clip": 0.01101472, + "auxiliary_loss_mlp": 0.01023222, + "balance_loss_clip": 1.01192665, + "balance_loss_mlp": 1.03659964, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5513724058155185, + "language_loss": 0.81425416, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83550113, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 8592, + "time_per_iteration": 2.4280107021331787 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01023209, + "balance_loss_clip": 1.01141334, + "balance_loss_mlp": 1.04046106, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.8100942034895195, + "language_loss": 0.83394146, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85524404, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 8593, + "time_per_iteration": 3.9351704120635986 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.02004552, + "balance_loss_mlp": 1.04028952, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.576203753972958, + "language_loss": 0.68724298, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.70866162, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8594, + "time_per_iteration": 2.547206163406372 + }, + { + "auxiliary_loss_clip": 0.01105211, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.03660214, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.9981153431236998, + "language_loss": 0.77706152, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79838419, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8595, + "time_per_iteration": 2.5214362144470215 + }, + { + "auxiliary_loss_clip": 0.01107198, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.02278233, + "balance_loss_mlp": 1.03896379, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.5236872991766963, + "language_loss": 0.64860648, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67003053, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8596, + "time_per_iteration": 5.460975885391235 + }, + { + "auxiliary_loss_clip": 0.01109553, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.01603329, + "balance_loss_mlp": 1.04030609, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.6129264208414336, + "language_loss": 0.75417203, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77556598, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.6953125, + "step": 8597, + "time_per_iteration": 2.477386236190796 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01356828, + "balance_loss_mlp": 1.03728151, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.684107970499364, + "language_loss": 0.80853873, + "learning_rate": 1.987342579847403e-06, + "loss": 0.82987666, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8598, + "time_per_iteration": 2.5056118965148926 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.02550411, + "balance_loss_mlp": 1.03853858, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.5161151475530301, + "language_loss": 0.75315893, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77462423, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8599, + "time_per_iteration": 2.4907233715057373 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03874612, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 5.031269669902368, + "language_loss": 0.72193408, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74333239, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8600, + "time_per_iteration": 2.4958672523498535 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01924503, + "balance_loss_mlp": 1.03902841, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.5543027238719596, + "language_loss": 0.74527812, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76667523, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8601, + "time_per_iteration": 2.4545562267303467 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.02540207, + "balance_loss_mlp": 1.03855383, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 1.930843678841908, + "language_loss": 0.83770829, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85918051, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 8602, + "time_per_iteration": 2.478315591812134 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.01727891, + "balance_loss_mlp": 1.03919971, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.739467426965746, + "language_loss": 0.74487793, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76627421, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8603, + "time_per_iteration": 2.541987180709839 + }, + { + "auxiliary_loss_clip": 0.01110457, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02172458, + "balance_loss_mlp": 1.04043818, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.0493295845447435, + "language_loss": 0.72732627, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74876976, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8604, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01113997, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03878832, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.890584135418456, + "language_loss": 0.85098851, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87245226, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8605, + "time_per_iteration": 2.469414472579956 + }, + { + "auxiliary_loss_clip": 0.01107307, + "auxiliary_loss_mlp": 0.01024655, + "balance_loss_clip": 1.01271009, + "balance_loss_mlp": 1.03827572, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4962077074735805, + "language_loss": 0.64887142, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67019105, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8606, + "time_per_iteration": 2.5391039848327637 + }, + { + "auxiliary_loss_clip": 0.01108829, + "auxiliary_loss_mlp": 0.01027754, + "balance_loss_clip": 1.0153147, + "balance_loss_mlp": 1.04041243, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.6359731326945595, + "language_loss": 0.77811146, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79947728, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8607, + "time_per_iteration": 2.4382975101470947 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02053022, + "balance_loss_mlp": 1.0399344, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 3.5447610791610638, + "language_loss": 0.72232366, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74377209, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8608, + "time_per_iteration": 2.511740207672119 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.01927149, + "balance_loss_mlp": 1.04073501, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8799970026389359, + "language_loss": 0.86513162, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88661158, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 8609, + "time_per_iteration": 2.453684091567993 + }, + { + "auxiliary_loss_clip": 0.01108892, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888371, + "balance_loss_mlp": 1.03858495, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.565375500859336, + "language_loss": 0.73396695, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75536072, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 8610, + "time_per_iteration": 2.5529308319091797 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01813269, + "balance_loss_mlp": 1.04202247, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.8297114771569651, + "language_loss": 0.67358816, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69506592, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 8611, + "time_per_iteration": 2.4198501110076904 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.01808488, + "balance_loss_mlp": 1.0382731, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.316941620789411, + "language_loss": 0.77502143, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79641283, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 8612, + "time_per_iteration": 2.4943206310272217 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.02186632, + "balance_loss_mlp": 1.03938198, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.9039649692993772, + "language_loss": 0.8192755, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84072244, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 8613, + "time_per_iteration": 2.434479236602783 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02059531, + "balance_loss_mlp": 1.04346251, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.1674567731422987, + "language_loss": 0.66747862, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68896699, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8614, + "time_per_iteration": 2.4598941802978516 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.01736188, + "balance_loss_mlp": 1.04048586, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.9388641649707037, + "language_loss": 0.86660814, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88803345, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8615, + "time_per_iteration": 2.434614419937134 + }, + { + "auxiliary_loss_clip": 0.01110692, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02877903, + "balance_loss_mlp": 1.04087663, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.572223272426788, + "language_loss": 0.80601507, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82753074, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8616, + "time_per_iteration": 2.489898920059204 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.02928019, + "balance_loss_mlp": 1.04558134, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.6322050900799092, + "language_loss": 0.7524333, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77405852, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8617, + "time_per_iteration": 2.4741597175598145 + }, + { + "auxiliary_loss_clip": 0.0111036, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.01977718, + "balance_loss_mlp": 1.03946304, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.8314484463575909, + "language_loss": 0.70137858, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72280991, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8618, + "time_per_iteration": 2.4596426486968994 + }, + { + "auxiliary_loss_clip": 0.01036764, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01408625, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9556911586994957, + "language_loss": 0.67222798, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69260818, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8619, + "time_per_iteration": 3.0123016834259033 + }, + { + "auxiliary_loss_clip": 0.01107081, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.02055597, + "balance_loss_mlp": 1.03924203, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.197431442121674, + "language_loss": 0.79314506, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81454414, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 8620, + "time_per_iteration": 2.445173740386963 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.02625203, + "balance_loss_mlp": 1.03989077, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.0514402600561765, + "language_loss": 0.81893396, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84040135, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8621, + "time_per_iteration": 2.4382779598236084 + }, + { + "auxiliary_loss_clip": 0.01109273, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.02073121, + "balance_loss_mlp": 1.0391438, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.9740999547408657, + "language_loss": 0.65540636, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67682284, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 8622, + "time_per_iteration": 2.494173288345337 + }, + { + "auxiliary_loss_clip": 0.01114132, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.02528644, + "balance_loss_mlp": 1.04077148, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.975231537474399, + "language_loss": 0.60350323, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62503201, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8623, + "time_per_iteration": 2.427819013595581 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.01995301, + "balance_loss_mlp": 1.03832614, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7021073046505133, + "language_loss": 0.76074666, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78215921, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8624, + "time_per_iteration": 2.4636356830596924 + }, + { + "auxiliary_loss_clip": 0.01109665, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.02441311, + "balance_loss_mlp": 1.03890038, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.7326139645058456, + "language_loss": 0.71175325, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73321491, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8625, + "time_per_iteration": 2.4977569580078125 + }, + { + "auxiliary_loss_clip": 0.01110816, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.02223408, + "balance_loss_mlp": 1.03980732, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.8950159086376122, + "language_loss": 0.67929721, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70074677, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 8626, + "time_per_iteration": 2.4934957027435303 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01889873, + "balance_loss_mlp": 1.03984976, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.322377605069906, + "language_loss": 0.70487207, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72627008, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8627, + "time_per_iteration": 2.445827007293701 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02539158, + "balance_loss_mlp": 1.04147446, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 1.9255563847501656, + "language_loss": 0.73209083, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75361323, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 8628, + "time_per_iteration": 2.500955581665039 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.02039731, + "balance_loss_mlp": 1.04147768, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 3.3927220028721994, + "language_loss": 0.77245331, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79388249, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8629, + "time_per_iteration": 2.4560301303863525 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.0179081, + "balance_loss_mlp": 1.04206562, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.1928775386787187, + "language_loss": 0.74820137, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76964092, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8630, + "time_per_iteration": 2.496370792388916 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02210796, + "balance_loss_mlp": 1.03882229, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.6137116253106134, + "language_loss": 0.80663669, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82809031, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8631, + "time_per_iteration": 2.4534530639648438 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.01893413, + "balance_loss_mlp": 1.04085588, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5022963557810187, + "language_loss": 0.74575752, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76720965, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8632, + "time_per_iteration": 2.5295352935791016 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01810372, + "balance_loss_mlp": 1.03738809, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.784064079335437, + "language_loss": 0.78812337, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80948019, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8633, + "time_per_iteration": 2.4241905212402344 + }, + { + "auxiliary_loss_clip": 0.01109914, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.01873302, + "balance_loss_mlp": 1.03893745, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.7026702061892323, + "language_loss": 0.80149853, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82290852, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8634, + "time_per_iteration": 2.4851884841918945 + }, + { + "auxiliary_loss_clip": 0.01108415, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.02305627, + "balance_loss_mlp": 1.04024315, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.4600796720036056, + "language_loss": 0.68628252, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70771807, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8635, + "time_per_iteration": 3.921346426010132 + }, + { + "auxiliary_loss_clip": 0.01113121, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.02083683, + "balance_loss_mlp": 1.04083443, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.6781612563386181, + "language_loss": 0.7704699, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79193652, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8636, + "time_per_iteration": 2.45908260345459 + }, + { + "auxiliary_loss_clip": 0.01112314, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.01861811, + "balance_loss_mlp": 1.04090476, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 1.9891179602637588, + "language_loss": 0.71459377, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73602873, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8637, + "time_per_iteration": 5.353722810745239 + }, + { + "auxiliary_loss_clip": 0.01108688, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.01964426, + "balance_loss_mlp": 1.0394423, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 3.7284266214304576, + "language_loss": 0.75943041, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78084332, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8638, + "time_per_iteration": 3.902477741241455 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.03863966, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 2.006346025426826, + "language_loss": 0.74846971, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76985711, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8639, + "time_per_iteration": 2.453634738922119 + }, + { + "auxiliary_loss_clip": 0.01109964, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01604497, + "balance_loss_mlp": 1.04051375, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6163455561126134, + "language_loss": 0.77538067, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79676771, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8640, + "time_per_iteration": 2.482334613800049 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01883161, + "balance_loss_mlp": 1.04175985, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.623082815057782, + "language_loss": 0.65734208, + "learning_rate": 1.97059670234927e-06, + "loss": 0.67874962, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 8641, + "time_per_iteration": 2.4567995071411133 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.02142978, + "balance_loss_mlp": 1.04105425, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.8491224599980307, + "language_loss": 0.76197445, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78340614, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8642, + "time_per_iteration": 2.5128276348114014 + }, + { + "auxiliary_loss_clip": 0.01109094, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02001774, + "balance_loss_mlp": 1.04037452, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.4733024685255247, + "language_loss": 0.83179498, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85320538, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8643, + "time_per_iteration": 2.5094587802886963 + }, + { + "auxiliary_loss_clip": 0.0111188, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.02856052, + "balance_loss_mlp": 1.03983521, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.5341454697133152, + "language_loss": 0.70307451, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72461337, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8644, + "time_per_iteration": 2.5111963748931885 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.01967788, + "balance_loss_mlp": 1.03966331, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.8635414079348847, + "language_loss": 0.80144334, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82286364, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 8645, + "time_per_iteration": 2.529616117477417 + }, + { + "auxiliary_loss_clip": 0.01109035, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01647151, + "balance_loss_mlp": 1.03836131, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.899493861617854, + "language_loss": 0.78147799, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80286086, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8646, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01112803, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02101934, + "balance_loss_mlp": 1.04184628, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.8109153766187511, + "language_loss": 0.66239858, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68386012, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8647, + "time_per_iteration": 2.4503657817840576 + }, + { + "auxiliary_loss_clip": 0.01113411, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01834917, + "balance_loss_mlp": 1.04010677, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 4.112424605735972, + "language_loss": 0.71817285, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73963439, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 8648, + "time_per_iteration": 2.49595308303833 + }, + { + "auxiliary_loss_clip": 0.01112873, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.01714706, + "balance_loss_mlp": 1.0411458, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.6438613988660609, + "language_loss": 0.64412069, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66555232, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8649, + "time_per_iteration": 2.4781436920166016 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.02025771, + "balance_loss_mlp": 1.04224229, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.8268985026448872, + "language_loss": 0.70691884, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72843516, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8650, + "time_per_iteration": 2.4350762367248535 + }, + { + "auxiliary_loss_clip": 0.01108729, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.01559973, + "balance_loss_mlp": 1.03854239, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.6557672224542628, + "language_loss": 0.7709741, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79234493, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8651, + "time_per_iteration": 2.4439852237701416 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.02171111, + "balance_loss_mlp": 1.04384518, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.7772284952150523, + "language_loss": 0.78304142, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80455399, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8652, + "time_per_iteration": 2.4581267833709717 + }, + { + "auxiliary_loss_clip": 0.01114617, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01683807, + "balance_loss_mlp": 1.04281044, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.0186078989624017, + "language_loss": 0.7027083, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72416592, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8653, + "time_per_iteration": 2.4945242404937744 + }, + { + "auxiliary_loss_clip": 0.01114383, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02703571, + "balance_loss_mlp": 1.04092932, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.6276924489714153, + "language_loss": 0.78420818, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80575949, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8654, + "time_per_iteration": 2.4857122898101807 + }, + { + "auxiliary_loss_clip": 0.01117815, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.02450645, + "balance_loss_mlp": 1.04275405, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 2.316843494652732, + "language_loss": 0.8424964, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86405897, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 8655, + "time_per_iteration": 2.48307728767395 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01860702, + "balance_loss_mlp": 1.04225183, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 3.712191764961765, + "language_loss": 0.65503991, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67645752, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8656, + "time_per_iteration": 2.442760705947876 + }, + { + "auxiliary_loss_clip": 0.01114044, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.04263127, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 2.4919467158509385, + "language_loss": 0.73240453, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.753842, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 8657, + "time_per_iteration": 2.5198535919189453 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.02064037, + "balance_loss_mlp": 1.042382, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.757060291742625, + "language_loss": 0.71675289, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73821175, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6953125, + "step": 8658, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.0111093, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.01868176, + "balance_loss_mlp": 1.0400281, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.6795003925123537, + "language_loss": 0.83473611, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85616386, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8659, + "time_per_iteration": 2.462956428527832 + }, + { + "auxiliary_loss_clip": 0.01119845, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04351366, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9135176980647008, + "language_loss": 0.75763941, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77923, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 8660, + "time_per_iteration": 2.4544646739959717 + }, + { + "auxiliary_loss_clip": 0.01111893, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.02199721, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.7715737398241405, + "language_loss": 0.78001404, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80147564, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8661, + "time_per_iteration": 2.4456324577331543 + }, + { + "auxiliary_loss_clip": 0.01113873, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01818848, + "balance_loss_mlp": 1.0404228, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.805356331270093, + "language_loss": 0.70643514, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72788274, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8662, + "time_per_iteration": 2.5272181034088135 + }, + { + "auxiliary_loss_clip": 0.01110335, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.01835203, + "balance_loss_mlp": 1.04033709, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.669754729528693, + "language_loss": 0.6935755, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71500456, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.69921875, + "step": 8663, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.01113011, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.04173064, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 2.618720199838109, + "language_loss": 0.76771712, + "learning_rate": 1.961640376626072e-06, + "loss": 0.7891587, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8664, + "time_per_iteration": 2.519645929336548 + }, + { + "auxiliary_loss_clip": 0.01111987, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.02387905, + "balance_loss_mlp": 1.04057467, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 1.987870026093088, + "language_loss": 0.76193488, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78342199, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8665, + "time_per_iteration": 2.4501259326934814 + }, + { + "auxiliary_loss_clip": 0.01111359, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02113414, + "balance_loss_mlp": 1.04135728, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.609030555811117, + "language_loss": 0.71689177, + "learning_rate": 1.960861599474586e-06, + "loss": 0.73833793, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8666, + "time_per_iteration": 2.4961183071136475 + }, + { + "auxiliary_loss_clip": 0.01119663, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.04257357, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.081998488723945, + "language_loss": 0.68599117, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.7075423, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 8667, + "time_per_iteration": 2.4216842651367188 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02247858, + "balance_loss_mlp": 1.03913903, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.3811752682570164, + "language_loss": 0.81006289, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83148932, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8668, + "time_per_iteration": 2.5712640285491943 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.0184648, + "balance_loss_mlp": 1.0413909, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.7130530435254507, + "language_loss": 0.63821161, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65964901, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8669, + "time_per_iteration": 2.485560894012451 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.02779722, + "balance_loss_mlp": 1.0434041, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5472632399176471, + "language_loss": 0.66420943, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68575811, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8670, + "time_per_iteration": 2.5161590576171875 + }, + { + "auxiliary_loss_clip": 0.01107902, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02204931, + "balance_loss_mlp": 1.04005504, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.0274420083477436, + "language_loss": 0.7666502, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78807229, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 8671, + "time_per_iteration": 2.4505884647369385 + }, + { + "auxiliary_loss_clip": 0.01117202, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.022246, + "balance_loss_mlp": 1.0442729, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 6.168212064153821, + "language_loss": 0.78184325, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80337209, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8672, + "time_per_iteration": 2.501171350479126 + }, + { + "auxiliary_loss_clip": 0.0110814, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.01958418, + "balance_loss_mlp": 1.03945541, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8428028532242804, + "language_loss": 0.72013724, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74153554, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8673, + "time_per_iteration": 2.4188430309295654 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01467764, + "balance_loss_mlp": 1.04007983, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.5425888836045836, + "language_loss": 0.75258517, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77397001, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8674, + "time_per_iteration": 2.4615721702575684 + }, + { + "auxiliary_loss_clip": 0.01112251, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02111149, + "balance_loss_mlp": 1.03926849, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.4005630002003198, + "language_loss": 0.86177206, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88324457, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8675, + "time_per_iteration": 2.4192757606506348 + }, + { + "auxiliary_loss_clip": 0.01036097, + "auxiliary_loss_mlp": 0.00999914, + "balance_loss_clip": 0.99874002, + "balance_loss_mlp": 1.01361609, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8810836824461878, + "language_loss": 0.6315189, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65187901, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22460938, + "step": 8676, + "time_per_iteration": 4.428101062774658 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.0180341, + "balance_loss_mlp": 1.04064405, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.671918865817182, + "language_loss": 0.68830431, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70970994, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8677, + "time_per_iteration": 2.54658579826355 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.01739907, + "balance_loss_mlp": 1.03994, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.5408434392952677, + "language_loss": 0.65516353, + "learning_rate": 1.956189065367086e-06, + "loss": 0.6765672, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8678, + "time_per_iteration": 2.4848899841308594 + }, + { + "auxiliary_loss_clip": 0.01115921, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02607715, + "balance_loss_mlp": 1.04188991, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.860112109233836, + "language_loss": 0.69020754, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71176565, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8679, + "time_per_iteration": 5.267160654067993 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02467108, + "balance_loss_mlp": 1.04272938, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.7057222009225053, + "language_loss": 0.66956079, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69107741, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8680, + "time_per_iteration": 3.938239574432373 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02248955, + "balance_loss_mlp": 1.04123902, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.8837479968625288, + "language_loss": 0.83069575, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85217923, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8681, + "time_per_iteration": 2.475834369659424 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.03964293, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.7236617199536146, + "language_loss": 0.77448237, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79592931, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8682, + "time_per_iteration": 2.484111785888672 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.03124917, + "balance_loss_mlp": 1.041852, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.4820765209382558, + "language_loss": 0.68982363, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71137834, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8683, + "time_per_iteration": 2.579467535018921 + }, + { + "auxiliary_loss_clip": 0.01112188, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.02706265, + "balance_loss_mlp": 1.04016137, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.598693343235541, + "language_loss": 0.7622329, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78375584, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8684, + "time_per_iteration": 2.4642298221588135 + }, + { + "auxiliary_loss_clip": 0.01107617, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.01886606, + "balance_loss_mlp": 1.03845632, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.6077803987399797, + "language_loss": 0.75887376, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.7802639, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8685, + "time_per_iteration": 2.4533908367156982 + }, + { + "auxiliary_loss_clip": 0.01113803, + "auxiliary_loss_mlp": 0.01038269, + "balance_loss_clip": 1.02562094, + "balance_loss_mlp": 1.0427258, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.88354393014551, + "language_loss": 0.80851054, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83003128, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8686, + "time_per_iteration": 2.430154323577881 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02474344, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.664143868034185, + "language_loss": 0.70208037, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72351515, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8687, + "time_per_iteration": 2.510512590408325 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02171767, + "balance_loss_mlp": 1.03840709, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0206883326938407, + "language_loss": 0.82963884, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85104954, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 8688, + "time_per_iteration": 2.4092836380004883 + }, + { + "auxiliary_loss_clip": 0.0110979, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.02107966, + "balance_loss_mlp": 1.04007506, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.711188417076446, + "language_loss": 0.73736638, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75880128, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8689, + "time_per_iteration": 2.4741477966308594 + }, + { + "auxiliary_loss_clip": 0.01109408, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.02147067, + "balance_loss_mlp": 1.04056704, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8604688899774438, + "language_loss": 0.82882619, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85025889, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8690, + "time_per_iteration": 2.4194648265838623 + }, + { + "auxiliary_loss_clip": 0.01110389, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02476192, + "balance_loss_mlp": 1.03937626, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.3332187959772246, + "language_loss": 0.79397631, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81546217, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8691, + "time_per_iteration": 2.52500319480896 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.02794003, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 1.8556717943569576, + "language_loss": 0.7679857, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78953838, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8692, + "time_per_iteration": 2.4420764446258545 + }, + { + "auxiliary_loss_clip": 0.0110865, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.02139628, + "balance_loss_mlp": 1.04145277, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.6990103355094375, + "language_loss": 0.72441196, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74582422, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 8693, + "time_per_iteration": 2.551316261291504 + }, + { + "auxiliary_loss_clip": 0.01114591, + "auxiliary_loss_mlp": 0.01042549, + "balance_loss_clip": 1.02841115, + "balance_loss_mlp": 1.04073966, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.780524663497215, + "language_loss": 0.81990045, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84147185, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 8694, + "time_per_iteration": 2.4666013717651367 + }, + { + "auxiliary_loss_clip": 0.01036217, + "auxiliary_loss_mlp": 0.01006918, + "balance_loss_clip": 1.00584531, + "balance_loss_mlp": 1.01379716, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.771665075265138, + "language_loss": 0.55743444, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57786584, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8695, + "time_per_iteration": 3.116420269012451 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02121711, + "balance_loss_mlp": 1.04176521, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.693403101851131, + "language_loss": 0.7333045, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75476253, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8696, + "time_per_iteration": 2.437974452972412 + }, + { + "auxiliary_loss_clip": 0.01112043, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.01840782, + "balance_loss_mlp": 1.04123831, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.6647399718358808, + "language_loss": 0.7097398, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73116946, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8697, + "time_per_iteration": 2.5316948890686035 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.0171392, + "balance_loss_mlp": 1.04016519, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.6518576838111187, + "language_loss": 0.80392116, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82528424, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8698, + "time_per_iteration": 2.4515864849090576 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.0218327, + "balance_loss_mlp": 1.04055512, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.702568194733703, + "language_loss": 0.74550211, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76695091, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8699, + "time_per_iteration": 2.508180856704712 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.01700819, + "balance_loss_mlp": 1.04079318, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.4994824070372519, + "language_loss": 0.73465139, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75609958, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 8700, + "time_per_iteration": 2.455620765686035 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.02276719, + "balance_loss_mlp": 1.0418222, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.82733314477648, + "language_loss": 0.66863132, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69013548, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8701, + "time_per_iteration": 2.5278706550598145 + }, + { + "auxiliary_loss_clip": 0.01107483, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03844106, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.0326391886622686, + "language_loss": 0.66616488, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68758386, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8702, + "time_per_iteration": 2.474238872528076 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.02040434, + "balance_loss_mlp": 1.04128182, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.9248840397651374, + "language_loss": 0.7671175, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78856003, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8703, + "time_per_iteration": 2.466836929321289 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.02648616, + "balance_loss_mlp": 1.04065156, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.7352924521395576, + "language_loss": 0.76380461, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78537536, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75390625, + "step": 8704, + "time_per_iteration": 2.566021680831909 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.0272727, + "balance_loss_mlp": 1.04157901, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.6527680542100833, + "language_loss": 0.7804389, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80193096, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8705, + "time_per_iteration": 2.4414021968841553 + }, + { + "auxiliary_loss_clip": 0.01113477, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.01545918, + "balance_loss_mlp": 1.04121351, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 1.9173845394592544, + "language_loss": 0.69808084, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.7195006, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8706, + "time_per_iteration": 2.4252305030822754 + }, + { + "auxiliary_loss_clip": 0.01033927, + "auxiliary_loss_mlp": 0.00999849, + "balance_loss_clip": 0.99876386, + "balance_loss_mlp": 1.01179016, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6804801593959132, + "language_loss": 0.52532774, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.5456655, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.22167969, + "step": 8707, + "time_per_iteration": 3.142758369445801 + }, + { + "auxiliary_loss_clip": 0.01109991, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02262449, + "balance_loss_mlp": 1.03904724, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.7383881327323734, + "language_loss": 0.74716955, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76862097, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8708, + "time_per_iteration": 2.4591562747955322 + }, + { + "auxiliary_loss_clip": 0.01109127, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.01402545, + "balance_loss_mlp": 1.04014444, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.691977522935515, + "language_loss": 0.77432841, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79568058, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8709, + "time_per_iteration": 2.480982780456543 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.01927257, + "balance_loss_mlp": 1.03814077, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.9878514646446084, + "language_loss": 0.8357569, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85724527, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 8710, + "time_per_iteration": 2.4901626110076904 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.01440704, + "balance_loss_mlp": 1.03936791, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.6699101384293633, + "language_loss": 0.69427162, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71561891, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8711, + "time_per_iteration": 2.476573944091797 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01879799, + "balance_loss_mlp": 1.03732038, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.8448951706521464, + "language_loss": 0.83195686, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85335994, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8712, + "time_per_iteration": 2.4485836029052734 + }, + { + "auxiliary_loss_clip": 0.01111097, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.02286506, + "balance_loss_mlp": 1.03859973, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.7709353735200277, + "language_loss": 0.69517416, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71665198, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8713, + "time_per_iteration": 2.496649980545044 + }, + { + "auxiliary_loss_clip": 0.01112233, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.01934421, + "balance_loss_mlp": 1.03752589, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.61615049353435, + "language_loss": 0.76978022, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79123831, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 8714, + "time_per_iteration": 2.42134428024292 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.01703143, + "balance_loss_mlp": 1.04200637, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.883747352805191, + "language_loss": 0.75953126, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78097725, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8715, + "time_per_iteration": 2.453313112258911 + }, + { + "auxiliary_loss_clip": 0.01106451, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01807356, + "balance_loss_mlp": 1.0377413, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.4951701207047352, + "language_loss": 0.7078892, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.72926366, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8716, + "time_per_iteration": 2.536285638809204 + }, + { + "auxiliary_loss_clip": 0.01107976, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.02394176, + "balance_loss_mlp": 1.03838778, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.055978260271784, + "language_loss": 0.86706465, + "learning_rate": 1.941005113841926e-06, + "loss": 0.88849956, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 8717, + "time_per_iteration": 2.5015134811401367 + }, + { + "auxiliary_loss_clip": 0.01108796, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.01737654, + "balance_loss_mlp": 1.03882921, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.8178940063432978, + "language_loss": 0.60516441, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.6265465, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 8718, + "time_per_iteration": 4.028836488723755 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.01830447, + "balance_loss_mlp": 1.04012215, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.7437517815053911, + "language_loss": 0.71897364, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74041677, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8719, + "time_per_iteration": 2.455796003341675 + }, + { + "auxiliary_loss_clip": 0.01106409, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0163976, + "balance_loss_mlp": 1.03797865, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.705660803101178, + "language_loss": 0.72716737, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.74851096, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 8720, + "time_per_iteration": 2.445131301879883 + }, + { + "auxiliary_loss_clip": 0.01110289, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.01948094, + "balance_loss_mlp": 1.04000795, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6022030744217663, + "language_loss": 0.70251679, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72394347, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8721, + "time_per_iteration": 5.4637322425842285 + }, + { + "auxiliary_loss_clip": 0.01106478, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.01883805, + "balance_loss_mlp": 1.03700781, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.710812698690052, + "language_loss": 0.86623824, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88761353, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8722, + "time_per_iteration": 2.4582130908966064 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.01850319, + "balance_loss_mlp": 1.03929901, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.6752601944842513, + "language_loss": 0.79654807, + "learning_rate": 1.938669401384247e-06, + "loss": 0.8179481, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.68359375, + "step": 8723, + "time_per_iteration": 2.4436798095703125 + }, + { + "auxiliary_loss_clip": 0.0111223, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02165055, + "balance_loss_mlp": 1.04074168, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.2643940307400054, + "language_loss": 0.74980783, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77128434, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8724, + "time_per_iteration": 2.4523351192474365 + }, + { + "auxiliary_loss_clip": 0.01114812, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03920281, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.7907307804166401, + "language_loss": 0.70031178, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72179961, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8725, + "time_per_iteration": 2.548102617263794 + }, + { + "auxiliary_loss_clip": 0.01033499, + "auxiliary_loss_mlp": 0.00998708, + "balance_loss_clip": 0.99755734, + "balance_loss_mlp": 1.01092362, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7538969042021075, + "language_loss": 0.55637997, + "learning_rate": 1.937501576352568e-06, + "loss": 0.576702, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2265625, + "step": 8726, + "time_per_iteration": 3.055438995361328 + }, + { + "auxiliary_loss_clip": 0.01033831, + "auxiliary_loss_mlp": 0.00998072, + "balance_loss_clip": 0.99698144, + "balance_loss_mlp": 1.01147294, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.8042859023243575, + "language_loss": 0.58400142, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60432053, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.22460938, + "step": 8727, + "time_per_iteration": 3.071913719177246 + }, + { + "auxiliary_loss_clip": 0.0111222, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.01701272, + "balance_loss_mlp": 1.03976107, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3114988788354258, + "language_loss": 0.70559728, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72702408, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8728, + "time_per_iteration": -0.15050816535949707 + }, + { + "auxiliary_loss_clip": 0.01108011, + "auxiliary_loss_mlp": 0.01026221, + "balance_loss_clip": 1.01421666, + "balance_loss_mlp": 1.03783965, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.5256282262341387, + "language_loss": 0.6966821, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71802437, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8729, + "time_per_iteration": 2.470921039581299 + }, + { + "auxiliary_loss_clip": 0.0111289, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.04002178, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.7430499295764175, + "language_loss": 0.83498538, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85642672, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8730, + "time_per_iteration": 2.447209358215332 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.02034974, + "balance_loss_mlp": 1.03944659, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.372255604306618, + "language_loss": 0.79440451, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81583822, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8731, + "time_per_iteration": 2.4764487743377686 + }, + { + "auxiliary_loss_clip": 0.01104468, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02373886, + "balance_loss_mlp": 1.03691411, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.577877427677953, + "language_loss": 0.83057785, + "learning_rate": 1.935165990676312e-06, + "loss": 0.8519851, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8732, + "time_per_iteration": 2.4856929779052734 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.02020669, + "balance_loss_mlp": 1.03737712, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.6308728168221684, + "language_loss": 0.77874607, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80013925, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8733, + "time_per_iteration": 2.440887212753296 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.02521539, + "balance_loss_mlp": 1.04069221, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.8154235824744323, + "language_loss": 0.81740808, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83892411, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 8734, + "time_per_iteration": 2.4394965171813965 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01828647, + "balance_loss_mlp": 1.03909111, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3786944232239873, + "language_loss": 0.76792759, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78930354, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8735, + "time_per_iteration": 2.5392351150512695 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.03907919, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.5767625018953106, + "language_loss": 0.80153042, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.8229425, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 8736, + "time_per_iteration": 2.470860242843628 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.01979208, + "balance_loss_mlp": 1.04068267, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.2098484474485716, + "language_loss": 0.69838667, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71982265, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8737, + "time_per_iteration": 2.5947840213775635 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.01885569, + "balance_loss_mlp": 1.0369395, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4975240773091183, + "language_loss": 0.77464664, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79602897, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8738, + "time_per_iteration": 2.4910526275634766 + }, + { + "auxiliary_loss_clip": 0.01034294, + "auxiliary_loss_mlp": 0.01014673, + "balance_loss_clip": 1.01349294, + "balance_loss_mlp": 1.01161027, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7501251002484244, + "language_loss": 0.54472572, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56521541, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8739, + "time_per_iteration": 3.0936102867126465 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.02058792, + "balance_loss_mlp": 1.03920436, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 4.076584700627864, + "language_loss": 0.847902, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86931044, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8740, + "time_per_iteration": 2.5510640144348145 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.02204442, + "balance_loss_mlp": 1.0391773, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.9479054855450806, + "language_loss": 0.69464219, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71606612, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8741, + "time_per_iteration": 2.4474291801452637 + }, + { + "auxiliary_loss_clip": 0.01112521, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01657915, + "balance_loss_mlp": 1.04100168, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.7696604002482594, + "language_loss": 0.6591152, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68053448, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 8742, + "time_per_iteration": 2.4151360988616943 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.02191377, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.337521906395912, + "language_loss": 0.63094312, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65242094, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 8743, + "time_per_iteration": 2.4369444847106934 + }, + { + "auxiliary_loss_clip": 0.01033192, + "auxiliary_loss_mlp": 0.01006558, + "balance_loss_clip": 1.00549126, + "balance_loss_mlp": 1.01085198, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7737212884291378, + "language_loss": 0.54199207, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56238955, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.22363281, + "step": 8744, + "time_per_iteration": 3.1759095191955566 + }, + { + "auxiliary_loss_clip": 0.01114357, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.01773655, + "balance_loss_mlp": 1.04095125, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.20739797588364, + "language_loss": 0.75574982, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77720833, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8745, + "time_per_iteration": 2.447798728942871 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02291942, + "balance_loss_mlp": 1.03964972, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.9635902719056224, + "language_loss": 0.80408484, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.82552993, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8746, + "time_per_iteration": 2.4415667057037354 + }, + { + "auxiliary_loss_clip": 0.01107231, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.02002132, + "balance_loss_mlp": 1.03842771, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.8094795225841998, + "language_loss": 0.75289273, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77429175, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8747, + "time_per_iteration": 2.4909451007843018 + }, + { + "auxiliary_loss_clip": 0.01103122, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03701103, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.3964471896172554, + "language_loss": 0.82515085, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84647602, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 8748, + "time_per_iteration": 2.4266607761383057 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.01948202, + "balance_loss_mlp": 1.03713202, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9711847853488498, + "language_loss": 0.80562335, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82703364, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8749, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01108885, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.02076626, + "balance_loss_mlp": 1.04021406, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.712765899743528, + "language_loss": 0.72119522, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74262118, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8750, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.01105706, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.0166955, + "balance_loss_mlp": 1.03688407, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3484208983844765, + "language_loss": 0.76440692, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78575456, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8751, + "time_per_iteration": 2.49141788482666 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.01973987, + "balance_loss_mlp": 1.03969383, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.3930828226372818, + "language_loss": 0.75950229, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78088653, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 8752, + "time_per_iteration": 2.4891488552093506 + }, + { + "auxiliary_loss_clip": 0.01110452, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.01418078, + "balance_loss_mlp": 1.03927755, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.4497375157025647, + "language_loss": 0.6776315, + "learning_rate": 1.926992158720058e-06, + "loss": 0.69901145, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8753, + "time_per_iteration": 2.5364086627960205 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.02072024, + "balance_loss_mlp": 1.04052699, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4822261150811287, + "language_loss": 0.83834231, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.85975981, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 8754, + "time_per_iteration": 2.4782354831695557 + }, + { + "auxiliary_loss_clip": 0.01108303, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821733, + "balance_loss_mlp": 1.03804278, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.116384687985529, + "language_loss": 0.8708753, + "learning_rate": 1.926213760058522e-06, + "loss": 0.8922683, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8755, + "time_per_iteration": 2.426422357559204 + }, + { + "auxiliary_loss_clip": 0.01031717, + "auxiliary_loss_mlp": 0.01000414, + "balance_loss_clip": 0.99934119, + "balance_loss_mlp": 1.0092082, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7185760813251492, + "language_loss": 0.58853483, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60885608, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8756, + "time_per_iteration": 3.1429710388183594 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02052546, + "balance_loss_mlp": 1.03787899, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 4.297833550953773, + "language_loss": 0.70166421, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72309285, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8757, + "time_per_iteration": 2.4352152347564697 + }, + { + "auxiliary_loss_clip": 0.01108207, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01828289, + "balance_loss_mlp": 1.03741014, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.637312529409449, + "language_loss": 0.8773526, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89874113, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8758, + "time_per_iteration": 2.4447832107543945 + }, + { + "auxiliary_loss_clip": 0.0110992, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02048147, + "balance_loss_mlp": 1.03790975, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.3883962898678874, + "language_loss": 0.76014191, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78157705, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8759, + "time_per_iteration": 2.4818501472473145 + }, + { + "auxiliary_loss_clip": 0.01102801, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.01460838, + "balance_loss_mlp": 1.0357269, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.9978294175433113, + "language_loss": 0.71896535, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.74025965, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8760, + "time_per_iteration": 3.8544509410858154 + }, + { + "auxiliary_loss_clip": 0.01113013, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.02161074, + "balance_loss_mlp": 1.03947306, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 1.9164441807727424, + "language_loss": 0.76221085, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78368914, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8761, + "time_per_iteration": 2.43031907081604 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.01879597, + "balance_loss_mlp": 1.03958154, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.712095639698782, + "language_loss": 0.70643085, + "learning_rate": 1.923489453654373e-06, + "loss": 0.7278201, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8762, + "time_per_iteration": 5.321688652038574 + }, + { + "auxiliary_loss_clip": 0.01031212, + "auxiliary_loss_mlp": 0.0100382, + "balance_loss_clip": 1.00266957, + "balance_loss_mlp": 1.00896931, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9468786857883086, + "language_loss": 0.65414345, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67449379, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22265625, + "step": 8763, + "time_per_iteration": 4.360533237457275 + }, + { + "auxiliary_loss_clip": 0.0110798, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03798556, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.6073395480000416, + "language_loss": 0.70771408, + "learning_rate": 1.922711106286265e-06, + "loss": 0.72911114, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 8764, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.0110759, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.01640153, + "balance_loss_mlp": 1.03704798, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.6766716538329436, + "language_loss": 0.74135405, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76272404, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8765, + "time_per_iteration": 2.4344265460968018 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.0194571, + "balance_loss_mlp": 1.03650451, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4935943977467754, + "language_loss": 0.85193348, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87336564, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8766, + "time_per_iteration": 2.52951979637146 + }, + { + "auxiliary_loss_clip": 0.0111099, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0204674, + "balance_loss_mlp": 1.03980124, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7377061989269131, + "language_loss": 0.79036993, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8118161, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8767, + "time_per_iteration": 2.4478976726531982 + }, + { + "auxiliary_loss_clip": 0.0110965, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.01897943, + "balance_loss_mlp": 1.03842282, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.871676480421452, + "language_loss": 0.73691523, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75833523, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8768, + "time_per_iteration": 2.464952230453491 + }, + { + "auxiliary_loss_clip": 0.01106727, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.02222896, + "balance_loss_mlp": 1.03777611, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 3.4895191769574354, + "language_loss": 0.74093413, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76233703, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8769, + "time_per_iteration": 2.4464261531829834 + }, + { + "auxiliary_loss_clip": 0.01108124, + "auxiliary_loss_mlp": 0.01036366, + "balance_loss_clip": 1.02372384, + "balance_loss_mlp": 1.03890908, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.6831893733690892, + "language_loss": 0.7382611, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75970602, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8770, + "time_per_iteration": 2.4870028495788574 + }, + { + "auxiliary_loss_clip": 0.01110065, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01798415, + "balance_loss_mlp": 1.03966439, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.642757388746556, + "language_loss": 0.68108106, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70248735, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8771, + "time_per_iteration": 2.5180561542510986 + }, + { + "auxiliary_loss_clip": 0.01106371, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.02130556, + "balance_loss_mlp": 1.03755426, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.8518077177131755, + "language_loss": 0.76476532, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78617108, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8772, + "time_per_iteration": 2.491196870803833 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.0271337, + "balance_loss_mlp": 1.03862512, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.8756798124264933, + "language_loss": 0.65986812, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68137372, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8773, + "time_per_iteration": 2.464393138885498 + }, + { + "auxiliary_loss_clip": 0.01109322, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.02802014, + "balance_loss_mlp": 1.03791332, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.5758079694219151, + "language_loss": 0.86029238, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88178039, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.71484375, + "step": 8774, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.01105827, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01860261, + "balance_loss_mlp": 1.03663182, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.5254562165137588, + "language_loss": 0.79877412, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82013589, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8775, + "time_per_iteration": 2.454387664794922 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.02507758, + "balance_loss_mlp": 1.03681672, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7390352493983339, + "language_loss": 0.83807105, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85949761, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8776, + "time_per_iteration": 2.5026144981384277 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03759074, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.6658876230443522, + "language_loss": 0.68375832, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8777, + "time_per_iteration": 2.417186975479126 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.0258069, + "balance_loss_mlp": 1.04009652, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 2.132165937202497, + "language_loss": 0.82494706, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84640491, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8778, + "time_per_iteration": 2.487772226333618 + }, + { + "auxiliary_loss_clip": 0.01110776, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02037513, + "balance_loss_mlp": 1.04014647, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 2.126071455139116, + "language_loss": 0.79359961, + "learning_rate": 1.916873882856013e-06, + "loss": 0.8150422, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8779, + "time_per_iteration": 2.4676833152770996 + }, + { + "auxiliary_loss_clip": 0.01102313, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01942825, + "balance_loss_mlp": 1.03535295, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.916693496001438, + "language_loss": 0.7667526, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78808951, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 8780, + "time_per_iteration": 2.489880323410034 + }, + { + "auxiliary_loss_clip": 0.01113237, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01724982, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.5814481661794648, + "language_loss": 0.69506466, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71650016, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8781, + "time_per_iteration": 2.570308208465576 + }, + { + "auxiliary_loss_clip": 0.01105161, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02130246, + "balance_loss_mlp": 1.03748012, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.5392288400315197, + "language_loss": 0.72434068, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74571753, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 8782, + "time_per_iteration": 2.4902799129486084 + }, + { + "auxiliary_loss_clip": 0.01104346, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01557565, + "balance_loss_mlp": 1.03629112, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.9147695733655095, + "language_loss": 0.68684381, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70816237, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8783, + "time_per_iteration": 2.4489378929138184 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.02257824, + "balance_loss_mlp": 1.04052663, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.8253305439767769, + "language_loss": 0.69502926, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71655798, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 8784, + "time_per_iteration": 2.55877947807312 + }, + { + "auxiliary_loss_clip": 0.0111041, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01936626, + "balance_loss_mlp": 1.03718495, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.137542562274274, + "language_loss": 0.75317723, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77460963, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8785, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01858115, + "balance_loss_mlp": 1.03923512, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.5356836172740989, + "language_loss": 0.8301636, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85157377, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8786, + "time_per_iteration": 2.530207872390747 + }, + { + "auxiliary_loss_clip": 0.01102608, + "auxiliary_loss_mlp": 0.01023798, + "balance_loss_clip": 1.01268828, + "balance_loss_mlp": 1.03662145, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 6.419117505425037, + "language_loss": 0.8292653, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85052931, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 8787, + "time_per_iteration": 2.450303792953491 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.01601219, + "balance_loss_mlp": 1.03739762, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.657610649379585, + "language_loss": 0.83385652, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85517776, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8788, + "time_per_iteration": 2.4752538204193115 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.02353776, + "balance_loss_mlp": 1.04022217, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.6616469699693164, + "language_loss": 0.7467941, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.76823682, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.671875, + "step": 8789, + "time_per_iteration": 2.5324580669403076 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.01999021, + "balance_loss_mlp": 1.03898668, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.4692396487834778, + "language_loss": 0.69505095, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71647108, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8790, + "time_per_iteration": 2.51625919342041 + }, + { + "auxiliary_loss_clip": 0.01104373, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01602292, + "balance_loss_mlp": 1.03740895, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5973748463846205, + "language_loss": 0.78992987, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81125033, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 8791, + "time_per_iteration": 2.4552273750305176 + }, + { + "auxiliary_loss_clip": 0.01108186, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01800871, + "balance_loss_mlp": 1.04050541, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.8738977568036352, + "language_loss": 0.66256213, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68394351, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8792, + "time_per_iteration": 2.485501527786255 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.02362621, + "balance_loss_mlp": 1.03610563, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 2.0158719758485226, + "language_loss": 0.79919344, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82057893, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8793, + "time_per_iteration": 2.4918789863586426 + }, + { + "auxiliary_loss_clip": 0.01108596, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02393866, + "balance_loss_mlp": 1.03883982, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 1.8030848585204593, + "language_loss": 0.84791529, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86936802, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8794, + "time_per_iteration": 2.451828718185425 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01894772, + "balance_loss_mlp": 1.03798628, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 1.927550813134725, + "language_loss": 0.67570889, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69714004, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8795, + "time_per_iteration": 2.4460599422454834 + }, + { + "auxiliary_loss_clip": 0.01107843, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03754616, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.883468232968509, + "language_loss": 0.80662012, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82799256, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8796, + "time_per_iteration": 2.4592626094818115 + }, + { + "auxiliary_loss_clip": 0.01112299, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.01978111, + "balance_loss_mlp": 1.04186153, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.9732503530858911, + "language_loss": 0.69071984, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71216959, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8797, + "time_per_iteration": 2.4451231956481934 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02128911, + "balance_loss_mlp": 1.03739119, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.7017381786261847, + "language_loss": 0.82339096, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84474969, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 8798, + "time_per_iteration": 2.4694111347198486 + }, + { + "auxiliary_loss_clip": 0.01111092, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.02102065, + "balance_loss_mlp": 1.03840899, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 2.0619187329461575, + "language_loss": 0.70591879, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72737336, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8799, + "time_per_iteration": 2.456692695617676 + }, + { + "auxiliary_loss_clip": 0.01104599, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.02132988, + "balance_loss_mlp": 1.03975451, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.8240531153484045, + "language_loss": 0.69601536, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71738708, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 8800, + "time_per_iteration": 2.490417242050171 + }, + { + "auxiliary_loss_clip": 0.01036269, + "auxiliary_loss_mlp": 0.01012691, + "balance_loss_clip": 1.01148117, + "balance_loss_mlp": 1.01404071, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.998441198923784, + "language_loss": 0.57013941, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59062898, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.22265625, + "step": 8801, + "time_per_iteration": 4.385375022888184 + }, + { + "auxiliary_loss_clip": 0.01109021, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.02168214, + "balance_loss_mlp": 1.03874719, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5128121202389628, + "language_loss": 0.63942313, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66085106, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 8802, + "time_per_iteration": 2.5486578941345215 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01684964, + "balance_loss_mlp": 1.03677487, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.7172902320691381, + "language_loss": 0.68250531, + "learning_rate": 1.907535821289003e-06, + "loss": 0.70384604, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8803, + "time_per_iteration": 2.576460361480713 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02233613, + "balance_loss_mlp": 1.03654003, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6769492859989101, + "language_loss": 0.76551962, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78689635, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 8804, + "time_per_iteration": 4.018502473831177 + }, + { + "auxiliary_loss_clip": 0.0103564, + "auxiliary_loss_mlp": 0.01005394, + "balance_loss_clip": 1.00417256, + "balance_loss_mlp": 1.01327515, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.749734320345171, + "language_loss": 0.53018034, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55059063, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.22363281, + "step": 8805, + "time_per_iteration": 4.599541902542114 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01001215, + "balance_loss_clip": 0.99995738, + "balance_loss_mlp": 1.0124402, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7381494507925852, + "language_loss": 0.63778675, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65814722, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22460938, + "step": 8806, + "time_per_iteration": 3.067852735519409 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03770947, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.9894097123133165, + "language_loss": 0.72397399, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74542046, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 8807, + "time_per_iteration": 2.4303808212280273 + }, + { + "auxiliary_loss_clip": 0.0110442, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03735805, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 4.619049711580288, + "language_loss": 0.69640231, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71773779, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8808, + "time_per_iteration": 2.418649435043335 + }, + { + "auxiliary_loss_clip": 0.01105928, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.01817942, + "balance_loss_mlp": 1.03796387, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.7756221154666856, + "language_loss": 0.8668943, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88825089, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8809, + "time_per_iteration": 2.413883686065674 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.0209322, + "balance_loss_mlp": 1.03908372, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.608353260814621, + "language_loss": 0.64362073, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66509026, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8810, + "time_per_iteration": 2.6121585369110107 + }, + { + "auxiliary_loss_clip": 0.0110573, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.02372456, + "balance_loss_mlp": 1.03820479, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.5055977388002117, + "language_loss": 0.68083066, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70224369, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8811, + "time_per_iteration": 2.4806406497955322 + }, + { + "auxiliary_loss_clip": 0.010328, + "auxiliary_loss_mlp": 0.00998698, + "balance_loss_clip": 0.99739295, + "balance_loss_mlp": 1.01059103, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6652461754552681, + "language_loss": 0.53400505, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.5543201, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22265625, + "step": 8812, + "time_per_iteration": 3.175478458404541 + }, + { + "auxiliary_loss_clip": 0.01032825, + "auxiliary_loss_mlp": 0.01000267, + "balance_loss_clip": 0.99906272, + "balance_loss_mlp": 1.01074851, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7207460213448722, + "language_loss": 0.56372511, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58405602, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.22070312, + "step": 8813, + "time_per_iteration": 3.1315269470214844 + }, + { + "auxiliary_loss_clip": 0.01102589, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.01751852, + "balance_loss_mlp": 1.03824615, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.5478508872520975, + "language_loss": 0.81618506, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.8375001, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 8814, + "time_per_iteration": 2.431269884109497 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04241931, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.5843849623618003, + "language_loss": 0.84997016, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.8713944, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8815, + "time_per_iteration": 2.531074285507202 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01968288, + "balance_loss_mlp": 1.03940964, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.126267576495584, + "language_loss": 0.66768968, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68905437, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8816, + "time_per_iteration": 2.525468111038208 + }, + { + "auxiliary_loss_clip": 0.01107527, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.0210259, + "balance_loss_mlp": 1.03860188, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.7854125043951103, + "language_loss": 0.72206688, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74347246, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8817, + "time_per_iteration": 2.6937406063079834 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.01921499, + "balance_loss_mlp": 1.03620088, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6863401200151742, + "language_loss": 0.6522249, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67360961, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 8818, + "time_per_iteration": 2.509539842605591 + }, + { + "auxiliary_loss_clip": 0.0110849, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.01462412, + "balance_loss_mlp": 1.0393914, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.999877555758676, + "language_loss": 0.75154972, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77290833, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8819, + "time_per_iteration": 2.473130702972412 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.0214107, + "balance_loss_mlp": 1.03858495, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.27674417450437, + "language_loss": 0.82333302, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84477413, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 8820, + "time_per_iteration": 2.4328434467315674 + }, + { + "auxiliary_loss_clip": 0.01106236, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.02030122, + "balance_loss_mlp": 1.03725612, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 2.049749716635941, + "language_loss": 0.72593045, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74730772, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 8821, + "time_per_iteration": 2.508608102798462 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.0363605, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.3923419148404492, + "language_loss": 0.73939008, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76070547, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8822, + "time_per_iteration": 2.4427592754364014 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.02312553, + "balance_loss_mlp": 1.03773904, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6902308577802683, + "language_loss": 0.67477053, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69620097, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8823, + "time_per_iteration": 2.5047175884246826 + }, + { + "auxiliary_loss_clip": 0.0110955, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.02113247, + "balance_loss_mlp": 1.03756142, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.5189625554392572, + "language_loss": 0.69347805, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71491873, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8824, + "time_per_iteration": 2.4358925819396973 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01879227, + "balance_loss_mlp": 1.03755724, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 2.2315847136946956, + "language_loss": 0.75412273, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77547044, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 8825, + "time_per_iteration": 2.480656385421753 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.02433622, + "balance_loss_mlp": 1.03730893, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.0577399670241125, + "language_loss": 0.85668242, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87810326, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8826, + "time_per_iteration": 2.422227621078491 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01760268, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.3501660325975628, + "language_loss": 0.64042354, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66176176, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 8827, + "time_per_iteration": 2.461434841156006 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.03835428, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.5699076783392119, + "language_loss": 0.60028976, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62176144, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8828, + "time_per_iteration": 2.621673107147217 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03909802, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.7449235888895405, + "language_loss": 0.81386358, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83527148, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8829, + "time_per_iteration": 2.472055673599243 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.02196574, + "balance_loss_mlp": 1.03871477, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.483207387046285, + "language_loss": 0.78292549, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80433053, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 8830, + "time_per_iteration": 2.4544272422790527 + }, + { + "auxiliary_loss_clip": 0.01106311, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01877975, + "balance_loss_mlp": 1.03778768, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.0257257472461525, + "language_loss": 0.80643964, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82781464, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8831, + "time_per_iteration": 2.4307594299316406 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.03561974, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 2.026603228036347, + "language_loss": 0.73146117, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75278628, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8832, + "time_per_iteration": 2.429567813873291 + }, + { + "auxiliary_loss_clip": 0.01111675, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02162194, + "balance_loss_mlp": 1.04065752, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.9229428073701915, + "language_loss": 0.75382435, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77528179, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8833, + "time_per_iteration": 2.4731011390686035 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.01733804, + "balance_loss_mlp": 1.03697777, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.9718581367947616, + "language_loss": 0.73314357, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75452387, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8834, + "time_per_iteration": 2.476289987564087 + }, + { + "auxiliary_loss_clip": 0.01113252, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.01980758, + "balance_loss_mlp": 1.03958392, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 2.0084943443028975, + "language_loss": 0.77603996, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79750997, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 8835, + "time_per_iteration": 2.512998104095459 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.02318025, + "balance_loss_mlp": 1.03647518, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.8374817013403106, + "language_loss": 0.72753531, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74896735, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8836, + "time_per_iteration": 2.4509310722351074 + }, + { + "auxiliary_loss_clip": 0.01108843, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03784788, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 2.66525227198108, + "language_loss": 0.80936503, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83078802, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8837, + "time_per_iteration": 2.471662759780884 + }, + { + "auxiliary_loss_clip": 0.0110708, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01615477, + "balance_loss_mlp": 1.03874159, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.8452061032611426, + "language_loss": 0.85926068, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88061881, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8838, + "time_per_iteration": 2.4360713958740234 + }, + { + "auxiliary_loss_clip": 0.011058, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.01779366, + "balance_loss_mlp": 1.03785229, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.8224224127823847, + "language_loss": 0.7208544, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74220788, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8839, + "time_per_iteration": 2.4806606769561768 + }, + { + "auxiliary_loss_clip": 0.01106476, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02167034, + "balance_loss_mlp": 1.03606987, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.889571361745381, + "language_loss": 0.76674354, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78814822, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8840, + "time_per_iteration": 2.47389817237854 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.01798964, + "balance_loss_mlp": 1.03678751, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 1.9758748106511805, + "language_loss": 0.77377498, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79517406, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8841, + "time_per_iteration": 2.4235799312591553 + }, + { + "auxiliary_loss_clip": 0.0103176, + "auxiliary_loss_mlp": 0.01011801, + "balance_loss_clip": 1.01060319, + "balance_loss_mlp": 1.00937963, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6971901974616477, + "language_loss": 0.56793272, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.5883683, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.22363281, + "step": 8842, + "time_per_iteration": 3.1749658584594727 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.03839254, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7048374639197847, + "language_loss": 0.73877072, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76025677, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8843, + "time_per_iteration": 3.7764668464660645 + }, + { + "auxiliary_loss_clip": 0.01031369, + "auxiliary_loss_mlp": 0.01005783, + "balance_loss_clip": 1.00454903, + "balance_loss_mlp": 1.0092088, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8754586803272454, + "language_loss": 0.61063367, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63100517, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.22265625, + "step": 8844, + "time_per_iteration": 3.1397178173065186 + }, + { + "auxiliary_loss_clip": 0.01031644, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 0.9997676, + "balance_loss_mlp": 1.00950778, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.9433503667086528, + "language_loss": 0.62195891, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64228451, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22167969, + "step": 8845, + "time_per_iteration": 3.0431036949157715 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.01908851, + "balance_loss_mlp": 1.0369339, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.021195915673457, + "language_loss": 0.7583214, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77972758, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 8846, + "time_per_iteration": 5.309458017349243 + }, + { + "auxiliary_loss_clip": 0.01106825, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01994586, + "balance_loss_mlp": 1.03744686, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5634287795910362, + "language_loss": 0.75384724, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.775231, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 8847, + "time_per_iteration": 2.4939441680908203 + }, + { + "auxiliary_loss_clip": 0.01104626, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.01720405, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.798053797011527, + "language_loss": 0.87663037, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89797276, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8848, + "time_per_iteration": 2.417572498321533 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03765666, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.6565378723095834, + "language_loss": 0.74641025, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76784182, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.703125, + "step": 8849, + "time_per_iteration": 2.4509243965148926 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01655602, + "balance_loss_mlp": 1.03593016, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 2.164126567755358, + "language_loss": 0.79812169, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81949031, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8850, + "time_per_iteration": 2.45766544342041 + }, + { + "auxiliary_loss_clip": 0.01104904, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.01615, + "balance_loss_mlp": 1.03538489, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4483393548737078, + "language_loss": 0.54913849, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57047582, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8851, + "time_per_iteration": 2.607548713684082 + }, + { + "auxiliary_loss_clip": 0.01110841, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.01821876, + "balance_loss_mlp": 1.03916895, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.7052679387317837, + "language_loss": 0.68385565, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70526993, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 8852, + "time_per_iteration": 2.4444568157196045 + }, + { + "auxiliary_loss_clip": 0.01031832, + "auxiliary_loss_mlp": 0.00999979, + "balance_loss_clip": 0.9987337, + "balance_loss_mlp": 1.00941014, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8061011864926959, + "language_loss": 0.62881088, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64912903, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.22460938, + "step": 8853, + "time_per_iteration": 3.0409493446350098 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01691902, + "balance_loss_mlp": 1.03633368, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.2642894326377196, + "language_loss": 0.79002404, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81140411, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8854, + "time_per_iteration": 2.4175822734832764 + }, + { + "auxiliary_loss_clip": 0.01103338, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01717186, + "balance_loss_mlp": 1.03635907, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.6616394070358602, + "language_loss": 0.73815715, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75947511, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 8855, + "time_per_iteration": 2.5298781394958496 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.01806259, + "balance_loss_mlp": 1.03597307, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.9409120124024815, + "language_loss": 0.64495003, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66629064, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8856, + "time_per_iteration": 2.483076333999634 + }, + { + "auxiliary_loss_clip": 0.01111855, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02166665, + "balance_loss_mlp": 1.03986931, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.6437419686120303, + "language_loss": 0.77630389, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79777324, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8857, + "time_per_iteration": 2.534383773803711 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.03602767, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.590488147317335, + "language_loss": 0.71136224, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73275089, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8858, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.02001405, + "balance_loss_mlp": 1.03818965, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.5574852735183802, + "language_loss": 0.69423437, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71565467, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8859, + "time_per_iteration": 2.5293610095977783 + }, + { + "auxiliary_loss_clip": 0.01104952, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.01622272, + "balance_loss_mlp": 1.03947163, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.5500879507245162, + "language_loss": 0.69682205, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71814591, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 8860, + "time_per_iteration": 2.453315019607544 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.02027583, + "balance_loss_mlp": 1.03789401, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.830505462704671, + "language_loss": 0.78035998, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80173862, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8861, + "time_per_iteration": 2.4910025596618652 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.02470744, + "balance_loss_mlp": 1.03778684, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 3.045684614472066, + "language_loss": 0.85532111, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87677932, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8862, + "time_per_iteration": 2.4594204425811768 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02181435, + "balance_loss_mlp": 1.03708994, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 2.155580167277434, + "language_loss": 0.61776686, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63920593, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 8863, + "time_per_iteration": 2.431844472885132 + }, + { + "auxiliary_loss_clip": 0.01107834, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.02161896, + "balance_loss_mlp": 1.03979647, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.808986842092349, + "language_loss": 0.73174077, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7531504, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8864, + "time_per_iteration": 2.481052875518799 + }, + { + "auxiliary_loss_clip": 0.01106149, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.02406275, + "balance_loss_mlp": 1.03704095, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.770075213018519, + "language_loss": 0.64782691, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66924965, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8865, + "time_per_iteration": 2.5422523021698 + }, + { + "auxiliary_loss_clip": 0.01106424, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.0374155, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.6788966461131323, + "language_loss": 0.78194928, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80331147, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8866, + "time_per_iteration": 2.4783847332000732 + }, + { + "auxiliary_loss_clip": 0.01106298, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02127266, + "balance_loss_mlp": 1.03756702, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 2.4645319902700136, + "language_loss": 0.73618174, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75758052, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8867, + "time_per_iteration": 2.4607431888580322 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.03957081, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.616723113347984, + "language_loss": 0.72235525, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7438435, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8868, + "time_per_iteration": 2.6005828380584717 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02044368, + "balance_loss_mlp": 1.03801441, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.8848687711222403, + "language_loss": 0.78688312, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80828476, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8869, + "time_per_iteration": 2.527679681777954 + }, + { + "auxiliary_loss_clip": 0.01112421, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.02396262, + "balance_loss_mlp": 1.03942657, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.8336580730917733, + "language_loss": 0.75656843, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.7780599, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8870, + "time_per_iteration": 2.408651113510132 + }, + { + "auxiliary_loss_clip": 0.01112864, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.024266, + "balance_loss_mlp": 1.04069293, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.8439379115111716, + "language_loss": 0.75255805, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77406549, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8871, + "time_per_iteration": 2.501173257827759 + }, + { + "auxiliary_loss_clip": 0.01109454, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02014637, + "balance_loss_mlp": 1.03973055, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.7881983016452072, + "language_loss": 0.72249746, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74391973, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8872, + "time_per_iteration": 2.4058215618133545 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04132104, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.7441588702127815, + "language_loss": 0.65051317, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67199636, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8873, + "time_per_iteration": 2.4598374366760254 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.02549779, + "balance_loss_mlp": 1.03951979, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 7.037025883542546, + "language_loss": 0.80012232, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82158732, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8874, + "time_per_iteration": 2.43198299407959 + }, + { + "auxiliary_loss_clip": 0.0110808, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.01874542, + "balance_loss_mlp": 1.03897262, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 2.558835697133273, + "language_loss": 0.70077014, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72216594, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8875, + "time_per_iteration": 2.4746885299682617 + }, + { + "auxiliary_loss_clip": 0.01033299, + "auxiliary_loss_mlp": 0.01014121, + "balance_loss_clip": 1.01290536, + "balance_loss_mlp": 1.01076412, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7274620052615154, + "language_loss": 0.59653223, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61700642, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.22460938, + "step": 8876, + "time_per_iteration": 3.1654725074768066 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.01971292, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.894052458703423, + "language_loss": 0.74833322, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76972401, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8877, + "time_per_iteration": 2.4836068153381348 + }, + { + "auxiliary_loss_clip": 0.01032923, + "auxiliary_loss_mlp": 0.0100501, + "balance_loss_clip": 1.00374663, + "balance_loss_mlp": 1.01051378, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7537185456157387, + "language_loss": 0.57229304, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59267235, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22460938, + "step": 8878, + "time_per_iteration": 2.9712772369384766 + }, + { + "auxiliary_loss_clip": 0.01111898, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02161908, + "balance_loss_mlp": 1.04023981, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.4246995459674998, + "language_loss": 0.72007561, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74154353, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8879, + "time_per_iteration": 2.5073280334472656 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.01706791, + "balance_loss_mlp": 1.03980017, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.331544880776984, + "language_loss": 0.8328526, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85425603, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8880, + "time_per_iteration": 2.4154322147369385 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.02200413, + "balance_loss_mlp": 1.03857374, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.3819058164028981, + "language_loss": 0.79567689, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81707799, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8881, + "time_per_iteration": 2.4748446941375732 + }, + { + "auxiliary_loss_clip": 0.01032611, + "auxiliary_loss_mlp": 0.00999583, + "balance_loss_clip": 0.99825948, + "balance_loss_mlp": 1.01026177, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7951386121617492, + "language_loss": 0.59243226, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61275423, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.22363281, + "step": 8882, + "time_per_iteration": 3.0554563999176025 + }, + { + "auxiliary_loss_clip": 0.01032284, + "auxiliary_loss_mlp": 0.01002778, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.00965989, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8657705918333868, + "language_loss": 0.63714904, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65749967, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2265625, + "step": 8883, + "time_per_iteration": 2.8666210174560547 + }, + { + "auxiliary_loss_clip": 0.01112111, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01528406, + "balance_loss_mlp": 1.04020667, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.5638154038033334, + "language_loss": 0.82000816, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84141463, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8884, + "time_per_iteration": 3.910738706588745 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0165143, + "balance_loss_mlp": 1.03923178, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.647799538914853, + "language_loss": 0.7224586, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74379575, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 8885, + "time_per_iteration": 2.4330668449401855 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01816654, + "balance_loss_mlp": 1.03816104, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 1.9571098005847307, + "language_loss": 0.78834218, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80977666, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8886, + "time_per_iteration": 2.4285924434661865 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01453424, + "balance_loss_mlp": 1.03859282, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.4869737557636773, + "language_loss": 0.74745071, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76882267, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 8887, + "time_per_iteration": 5.458622932434082 + }, + { + "auxiliary_loss_clip": 0.01106415, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01351762, + "balance_loss_mlp": 1.03839684, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 1.9580001729257437, + "language_loss": 0.68680072, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.70812452, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8888, + "time_per_iteration": 3.871016263961792 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.02426004, + "balance_loss_mlp": 1.03957748, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.039365083298093, + "language_loss": 0.77427757, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79580867, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 8889, + "time_per_iteration": 2.4321072101593018 + }, + { + "auxiliary_loss_clip": 0.01107574, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.02382183, + "balance_loss_mlp": 1.03896809, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.7896399215033527, + "language_loss": 0.68882942, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71026921, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 8890, + "time_per_iteration": 2.4512557983398438 + }, + { + "auxiliary_loss_clip": 0.01112757, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.02532864, + "balance_loss_mlp": 1.03882933, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 3.075420511300943, + "language_loss": 0.77339637, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79491955, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73828125, + "step": 8891, + "time_per_iteration": 2.4134135246276855 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01904237, + "balance_loss_mlp": 1.03873932, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5298342127178157, + "language_loss": 0.73841035, + "learning_rate": 1.872926414425699e-06, + "loss": 0.75977939, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 8892, + "time_per_iteration": 2.4843709468841553 + }, + { + "auxiliary_loss_clip": 0.0110608, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01874948, + "balance_loss_mlp": 1.03663301, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.5614617741562322, + "language_loss": 0.88069522, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90206861, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8893, + "time_per_iteration": 2.445389747619629 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.01968718, + "balance_loss_mlp": 1.03617978, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.5898186397759002, + "language_loss": 0.72623652, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74758679, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8894, + "time_per_iteration": 2.475914239883423 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01774812, + "balance_loss_mlp": 1.03794241, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 2.053516557339631, + "language_loss": 0.74730217, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.7686727, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 8895, + "time_per_iteration": 2.4524707794189453 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01558769, + "balance_loss_mlp": 1.03688455, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.7004701648033584, + "language_loss": 0.76999986, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79134524, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8896, + "time_per_iteration": 2.4727749824523926 + }, + { + "auxiliary_loss_clip": 0.01105321, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.01200807, + "balance_loss_mlp": 1.03771544, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.674513516034323, + "language_loss": 0.78698516, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80828726, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8897, + "time_per_iteration": 2.437924861907959 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.015589, + "balance_loss_mlp": 1.04029751, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.8516386867396797, + "language_loss": 0.75758165, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77897102, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8898, + "time_per_iteration": 2.4490232467651367 + }, + { + "auxiliary_loss_clip": 0.0103315, + "auxiliary_loss_mlp": 0.00997269, + "balance_loss_clip": 0.99616033, + "balance_loss_mlp": 1.01073837, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8534656988697606, + "language_loss": 0.58027738, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60058159, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22460938, + "step": 8899, + "time_per_iteration": 3.2222988605499268 + }, + { + "auxiliary_loss_clip": 0.01105996, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.01625824, + "balance_loss_mlp": 1.03779793, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.754025350675293, + "language_loss": 0.69734174, + "learning_rate": 1.869817171696868e-06, + "loss": 0.7186892, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8900, + "time_per_iteration": 2.5348854064941406 + }, + { + "auxiliary_loss_clip": 0.01109931, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.01857448, + "balance_loss_mlp": 1.03874683, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.712056344952118, + "language_loss": 0.71436262, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73577476, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8901, + "time_per_iteration": 2.486694097518921 + }, + { + "auxiliary_loss_clip": 0.01108252, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01632655, + "balance_loss_mlp": 1.03779531, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.0243685582186477, + "language_loss": 0.77403963, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79541337, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8902, + "time_per_iteration": 2.4521291255950928 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.02103007, + "balance_loss_mlp": 1.03727639, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.5596437382067054, + "language_loss": 0.69763452, + "learning_rate": 1.868651286721281e-06, + "loss": 0.71899128, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 8903, + "time_per_iteration": 2.4639296531677246 + }, + { + "auxiliary_loss_clip": 0.01111291, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.02613187, + "balance_loss_mlp": 1.03885889, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.4813880450748405, + "language_loss": 0.71867597, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74017799, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 8904, + "time_per_iteration": 2.518556833267212 + }, + { + "auxiliary_loss_clip": 0.01109721, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.0223856, + "balance_loss_mlp": 1.03955388, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.7385404274740348, + "language_loss": 0.73125184, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75270438, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8905, + "time_per_iteration": 2.481398582458496 + }, + { + "auxiliary_loss_clip": 0.01103053, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02244139, + "balance_loss_mlp": 1.03704035, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4036286343955833, + "language_loss": 0.83569062, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85705423, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 8906, + "time_per_iteration": 2.4822022914886475 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.02053666, + "balance_loss_mlp": 1.03906655, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 3.1110381495397688, + "language_loss": 0.74120319, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76264668, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8907, + "time_per_iteration": 2.4488067626953125 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.01639736, + "balance_loss_mlp": 1.03933167, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.8326240405987804, + "language_loss": 0.77272546, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79410505, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8908, + "time_per_iteration": 2.5009818077087402 + }, + { + "auxiliary_loss_clip": 0.01111027, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.02252579, + "balance_loss_mlp": 1.039222, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 9.969716540759343, + "language_loss": 0.7407465, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.7622152, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8909, + "time_per_iteration": 2.4272916316986084 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02267265, + "balance_loss_mlp": 1.04071856, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 1.9518435489791055, + "language_loss": 0.841941, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86339062, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8910, + "time_per_iteration": 2.4678404331207275 + }, + { + "auxiliary_loss_clip": 0.01109272, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.0175302, + "balance_loss_mlp": 1.03802073, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.5065365564315203, + "language_loss": 0.81728303, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83868158, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8911, + "time_per_iteration": 2.482515335083008 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02217102, + "balance_loss_mlp": 1.03894281, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.8795354415042287, + "language_loss": 0.6902765, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71169335, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 8912, + "time_per_iteration": 2.489625930786133 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02269232, + "balance_loss_mlp": 1.04099894, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.778457710383864, + "language_loss": 0.71355128, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73501396, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8913, + "time_per_iteration": 2.4120781421661377 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.02322149, + "balance_loss_mlp": 1.04114628, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.8082872891744106, + "language_loss": 0.72335684, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7448622, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8914, + "time_per_iteration": 2.466946840286255 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.02528632, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.2402764225711915, + "language_loss": 0.70448041, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72602755, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8915, + "time_per_iteration": 2.5281713008880615 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.02335119, + "balance_loss_mlp": 1.03934813, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 4.884439280571106, + "language_loss": 0.75188339, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77333617, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 8916, + "time_per_iteration": 2.4901540279388428 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.01685333, + "balance_loss_mlp": 1.03908181, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.001008974250462, + "language_loss": 0.72230595, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74370885, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8917, + "time_per_iteration": 2.5355899333953857 + }, + { + "auxiliary_loss_clip": 0.01109638, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02460611, + "balance_loss_mlp": 1.04033589, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.185479233449534, + "language_loss": 0.71158117, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73305333, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8918, + "time_per_iteration": 2.497854709625244 + }, + { + "auxiliary_loss_clip": 0.011106, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.02387154, + "balance_loss_mlp": 1.04111099, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.4281907235735687, + "language_loss": 0.75156265, + "learning_rate": 1.862434000299067e-06, + "loss": 0.7730338, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8919, + "time_per_iteration": 2.4522061347961426 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.0192163, + "balance_loss_mlp": 1.0374527, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.9146697385716565, + "language_loss": 0.71194351, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73334807, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8920, + "time_per_iteration": 2.4363694190979004 + }, + { + "auxiliary_loss_clip": 0.01106889, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01886892, + "balance_loss_mlp": 1.03738046, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.417495166440162, + "language_loss": 0.68572164, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.7071088, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8921, + "time_per_iteration": 2.659815788269043 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.02357066, + "balance_loss_mlp": 1.04096341, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.806007791508249, + "language_loss": 0.81778204, + "learning_rate": 1.86126840594594e-06, + "loss": 0.83925164, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8922, + "time_per_iteration": 2.4896881580352783 + }, + { + "auxiliary_loss_clip": 0.01109712, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01601934, + "balance_loss_mlp": 1.03847456, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9048762186543056, + "language_loss": 0.76640022, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78778023, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 8923, + "time_per_iteration": 2.46250319480896 + }, + { + "auxiliary_loss_clip": 0.01112498, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02061963, + "balance_loss_mlp": 1.04007745, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.372230243923659, + "language_loss": 0.70459902, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72606242, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8924, + "time_per_iteration": 2.5744879245758057 + }, + { + "auxiliary_loss_clip": 0.0111402, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.02251387, + "balance_loss_mlp": 1.04109585, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.82023886715655, + "language_loss": 0.86756319, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.88906515, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8925, + "time_per_iteration": 2.4910149574279785 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.01847553, + "balance_loss_mlp": 1.03855276, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.7557992545857284, + "language_loss": 0.77842706, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.79984611, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8926, + "time_per_iteration": 3.935426950454712 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.02207375, + "balance_loss_mlp": 1.04045248, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.9312965019913735, + "language_loss": 0.66655087, + "learning_rate": 1.85932585410148e-06, + "loss": 0.68797243, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8927, + "time_per_iteration": 2.547527313232422 + }, + { + "auxiliary_loss_clip": 0.01109886, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.01575708, + "balance_loss_mlp": 1.03839135, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.6954569855299475, + "language_loss": 0.73241496, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75379729, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8928, + "time_per_iteration": 2.432772636413574 + }, + { + "auxiliary_loss_clip": 0.01109785, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01908278, + "balance_loss_mlp": 1.03883481, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.7056756537874223, + "language_loss": 0.62998128, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65139198, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 8929, + "time_per_iteration": 5.517207145690918 + }, + { + "auxiliary_loss_clip": 0.01109689, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.03864491, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7096435666181475, + "language_loss": 0.65986609, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68127799, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8930, + "time_per_iteration": 4.042668581008911 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01673484, + "balance_loss_mlp": 1.03648782, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4058068619041801, + "language_loss": 0.66875708, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69010699, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8931, + "time_per_iteration": 2.4965057373046875 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04157209, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.7390938861026815, + "language_loss": 0.75847304, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77991474, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 8932, + "time_per_iteration": 2.4885287284851074 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.01999855, + "balance_loss_mlp": 1.04103768, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.8276755120836934, + "language_loss": 0.66255939, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68400073, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8933, + "time_per_iteration": 2.545335292816162 + }, + { + "auxiliary_loss_clip": 0.01106255, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.02077079, + "balance_loss_mlp": 1.03900647, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.6337429593741761, + "language_loss": 0.82865143, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85004205, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8934, + "time_per_iteration": 2.503974437713623 + }, + { + "auxiliary_loss_clip": 0.0110502, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02293336, + "balance_loss_mlp": 1.03738618, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.7935675007471827, + "language_loss": 0.79473621, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81614518, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 8935, + "time_per_iteration": 2.4432904720306396 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.02450609, + "balance_loss_mlp": 1.03854251, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.6092738011459846, + "language_loss": 0.83558774, + "learning_rate": 1.855829598084659e-06, + "loss": 0.857054, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8936, + "time_per_iteration": 2.5320403575897217 + }, + { + "auxiliary_loss_clip": 0.01106939, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.0173173, + "balance_loss_mlp": 1.03860474, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2642552304862777, + "language_loss": 0.72749949, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74886072, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 8937, + "time_per_iteration": 2.6381869316101074 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.01589561, + "balance_loss_mlp": 1.03737688, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.79948851304012, + "language_loss": 0.81773913, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83913368, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 8938, + "time_per_iteration": 2.4865500926971436 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.0231539, + "balance_loss_mlp": 1.04058433, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.3721010649860403, + "language_loss": 0.80348092, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82498878, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 8939, + "time_per_iteration": 2.4440550804138184 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.01002274, + "balance_loss_clip": 1.00117719, + "balance_loss_mlp": 1.01246023, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7105496368182959, + "language_loss": 0.52484262, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54521012, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.22070312, + "step": 8940, + "time_per_iteration": 3.091242790222168 + }, + { + "auxiliary_loss_clip": 0.01107473, + "auxiliary_loss_mlp": 0.01029266, + "balance_loss_clip": 1.01732159, + "balance_loss_mlp": 1.03880298, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 1.7538523818266185, + "language_loss": 0.71252179, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73388922, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8941, + "time_per_iteration": 2.497748613357544 + }, + { + "auxiliary_loss_clip": 0.01106467, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01807404, + "balance_loss_mlp": 1.03906739, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.7257322220940274, + "language_loss": 0.7928313, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81420016, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8942, + "time_per_iteration": 2.5012340545654297 + }, + { + "auxiliary_loss_clip": 0.01113441, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.02305436, + "balance_loss_mlp": 1.04004788, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.6646036710876846, + "language_loss": 0.69918364, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72068322, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8943, + "time_per_iteration": 2.5815587043762207 + }, + { + "auxiliary_loss_clip": 0.01032313, + "auxiliary_loss_mlp": 0.0099905, + "balance_loss_clip": 0.99804258, + "balance_loss_mlp": 1.01022053, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8193486791235207, + "language_loss": 0.59579939, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61611301, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22070312, + "step": 8944, + "time_per_iteration": 3.0560412406921387 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.02264094, + "balance_loss_mlp": 1.0415678, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.048508714437824, + "language_loss": 0.77503264, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79658085, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 8945, + "time_per_iteration": 2.4893672466278076 + }, + { + "auxiliary_loss_clip": 0.01109506, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03820658, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.7269314210534699, + "language_loss": 0.68465722, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70609617, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8946, + "time_per_iteration": 2.4605491161346436 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.03090715, + "balance_loss_mlp": 1.03953493, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.7416668567009066, + "language_loss": 0.76750016, + "learning_rate": 1.851556998731498e-06, + "loss": 0.78902936, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8947, + "time_per_iteration": 2.547470808029175 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01731312, + "balance_loss_mlp": 1.03834343, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.559080956726188, + "language_loss": 0.60268521, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62406987, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8948, + "time_per_iteration": 2.486721992492676 + }, + { + "auxiliary_loss_clip": 0.01112593, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0221529, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.6883046071040144, + "language_loss": 0.7951721, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.816643, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8949, + "time_per_iteration": 2.504025936126709 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.01955473, + "balance_loss_mlp": 1.03890014, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.5394027339965872, + "language_loss": 0.77871096, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80011374, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8950, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.04001009, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.7709921726317892, + "language_loss": 0.72630781, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74770463, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8951, + "time_per_iteration": 2.5027382373809814 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.01503229, + "balance_loss_mlp": 1.03817379, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.739294207658579, + "language_loss": 0.75148916, + "learning_rate": 1.849615132097085e-06, + "loss": 0.7728591, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8952, + "time_per_iteration": 2.423635244369507 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01504064, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.5972619646266322, + "language_loss": 0.79724902, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81862247, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8953, + "time_per_iteration": 2.532107353210449 + }, + { + "auxiliary_loss_clip": 0.01106301, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01993775, + "balance_loss_mlp": 1.03857923, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 2.0280242140271336, + "language_loss": 0.80724108, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82863653, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.67578125, + "step": 8954, + "time_per_iteration": 2.404942512512207 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.04119825, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.327007095214437, + "language_loss": 0.76461661, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78604227, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8955, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02414417, + "balance_loss_mlp": 1.04121125, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.5710344626373696, + "language_loss": 0.7823422, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80381584, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 8956, + "time_per_iteration": 2.484722375869751 + }, + { + "auxiliary_loss_clip": 0.0103322, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.01120663, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8559223539778376, + "language_loss": 0.63550651, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65586865, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.22070312, + "step": 8957, + "time_per_iteration": 3.065546751022339 + }, + { + "auxiliary_loss_clip": 0.01032349, + "auxiliary_loss_mlp": 0.01008296, + "balance_loss_clip": 1.00706863, + "balance_loss_mlp": 1.01029825, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7038941855074313, + "language_loss": 0.5158186, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53622508, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8958, + "time_per_iteration": 3.0705761909484863 + }, + { + "auxiliary_loss_clip": 0.01115886, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01678383, + "balance_loss_mlp": 1.04319501, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.5948521762422991, + "language_loss": 0.77216792, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79363346, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8959, + "time_per_iteration": 2.4907429218292236 + }, + { + "auxiliary_loss_clip": 0.01109786, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0190165, + "balance_loss_mlp": 1.03810203, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.0946376118717493, + "language_loss": 0.83630693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85772204, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8960, + "time_per_iteration": 2.4251809120178223 + }, + { + "auxiliary_loss_clip": 0.01112347, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01780403, + "balance_loss_mlp": 1.0417726, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.575363596920687, + "language_loss": 0.78489578, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80632377, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8961, + "time_per_iteration": 2.5358235836029053 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01921666, + "balance_loss_mlp": 1.04004741, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.7764783659945997, + "language_loss": 0.84602159, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86745036, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8962, + "time_per_iteration": 2.462369918823242 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.00998189, + "balance_loss_clip": 0.99696141, + "balance_loss_mlp": 1.01020229, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7323858189394533, + "language_loss": 0.54189092, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56219494, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8963, + "time_per_iteration": 3.000844717025757 + }, + { + "auxiliary_loss_clip": 0.01031141, + "auxiliary_loss_mlp": 0.00998281, + "balance_loss_clip": 0.99717277, + "balance_loss_mlp": 1.00911307, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8055122078658323, + "language_loss": 0.63433194, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65462613, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22070312, + "step": 8964, + "time_per_iteration": 3.241182565689087 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01782298, + "balance_loss_mlp": 1.03918004, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.532843563745025, + "language_loss": 0.69958258, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72101814, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 8965, + "time_per_iteration": 2.524223804473877 + }, + { + "auxiliary_loss_clip": 0.01114315, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.04133582, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.362623955664157, + "language_loss": 0.81848061, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83997512, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 8966, + "time_per_iteration": 2.477625608444214 + }, + { + "auxiliary_loss_clip": 0.01110928, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.01913619, + "balance_loss_mlp": 1.04063606, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.8348280049509287, + "language_loss": 0.72713602, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74856687, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8967, + "time_per_iteration": 2.419088125228882 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.01884913, + "balance_loss_mlp": 1.03676677, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.8042691798262989, + "language_loss": 0.81596529, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83735478, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8968, + "time_per_iteration": 3.8650004863739014 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0168612, + "balance_loss_mlp": 1.0391978, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.5993373110169542, + "language_loss": 0.73938435, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76080179, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8969, + "time_per_iteration": 2.485146999359131 + }, + { + "auxiliary_loss_clip": 0.01111919, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.01886833, + "balance_loss_mlp": 1.03785658, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.3553854013154907, + "language_loss": 0.82165599, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84310412, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8970, + "time_per_iteration": 2.4504613876342773 + }, + { + "auxiliary_loss_clip": 0.01109668, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02005112, + "balance_loss_mlp": 1.03989851, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.5328161731771237, + "language_loss": 0.75619417, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77762067, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8971, + "time_per_iteration": 5.434189558029175 + }, + { + "auxiliary_loss_clip": 0.01030677, + "auxiliary_loss_mlp": 0.00999826, + "balance_loss_clip": 0.99856228, + "balance_loss_mlp": 1.00854254, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8757990223887638, + "language_loss": 0.60310632, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62341136, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22167969, + "step": 8972, + "time_per_iteration": 3.070239782333374 + }, + { + "auxiliary_loss_clip": 0.01109336, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.03085351, + "balance_loss_mlp": 1.0389235, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.4916710753135305, + "language_loss": 0.78427428, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80580956, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8973, + "time_per_iteration": 2.4841833114624023 + }, + { + "auxiliary_loss_clip": 0.01116334, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02265668, + "balance_loss_mlp": 1.03959453, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.2712479958365304, + "language_loss": 0.73893452, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76046824, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 8974, + "time_per_iteration": 2.5056395530700684 + }, + { + "auxiliary_loss_clip": 0.01029707, + "auxiliary_loss_mlp": 0.01005081, + "balance_loss_clip": 1.00388896, + "balance_loss_mlp": 1.00777423, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7339193766969773, + "language_loss": 0.51197326, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53232116, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21972656, + "step": 8975, + "time_per_iteration": 3.0552287101745605 + }, + { + "auxiliary_loss_clip": 0.01110098, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.02723336, + "balance_loss_mlp": 1.03983927, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.5397959415241314, + "language_loss": 0.71919322, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74069834, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8976, + "time_per_iteration": 2.5368118286132812 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.02300107, + "balance_loss_mlp": 1.03994215, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 2.148603673983975, + "language_loss": 0.70274073, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.72421718, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 8977, + "time_per_iteration": 2.4685816764831543 + }, + { + "auxiliary_loss_clip": 0.01113255, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.02102757, + "balance_loss_mlp": 1.04169548, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.656094242871676, + "language_loss": 0.7241326, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.7456063, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8978, + "time_per_iteration": 2.4495601654052734 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.01778078, + "balance_loss_mlp": 1.04137743, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.582100330429111, + "language_loss": 0.73947239, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76098353, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 8979, + "time_per_iteration": 2.467693328857422 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.03538978, + "balance_loss_mlp": 1.04216337, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.0456901795615656, + "language_loss": 0.76959479, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79127216, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 8980, + "time_per_iteration": 2.5299665927886963 + }, + { + "auxiliary_loss_clip": 0.01111255, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.02332902, + "balance_loss_mlp": 1.0388093, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.6658662418671077, + "language_loss": 0.81773221, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.83920264, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 8981, + "time_per_iteration": 2.593594789505005 + }, + { + "auxiliary_loss_clip": 0.01113866, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03922904, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.7978808319720327, + "language_loss": 0.66842318, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68988544, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.74609375, + "step": 8982, + "time_per_iteration": 2.5118813514709473 + }, + { + "auxiliary_loss_clip": 0.01110986, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.02715898, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.4560866330096367, + "language_loss": 0.82442951, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84593606, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8983, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02204108, + "balance_loss_mlp": 1.03799057, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7289170608138429, + "language_loss": 0.7078771, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72933447, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8984, + "time_per_iteration": 2.4523980617523193 + }, + { + "auxiliary_loss_clip": 0.01115801, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.04127955, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.7555929792269789, + "language_loss": 0.80110276, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82260621, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7421875, + "step": 8985, + "time_per_iteration": 2.446753740310669 + }, + { + "auxiliary_loss_clip": 0.01104654, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.01560616, + "balance_loss_mlp": 1.03796721, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 2.3719765019392844, + "language_loss": 0.78840292, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80973768, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.66796875, + "step": 8986, + "time_per_iteration": 2.5318102836608887 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01879263, + "balance_loss_mlp": 1.03847885, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.6989773263518806, + "language_loss": 0.77060419, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79201555, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8987, + "time_per_iteration": 2.524240732192993 + }, + { + "auxiliary_loss_clip": 0.01109666, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.0178858, + "balance_loss_mlp": 1.03889561, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 2.580263640738581, + "language_loss": 0.71292162, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73433048, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8988, + "time_per_iteration": 2.4638671875 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.0228405, + "balance_loss_mlp": 1.03822088, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.2630612952232827, + "language_loss": 0.67666376, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69813585, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.71875, + "step": 8989, + "time_per_iteration": 2.508855104446411 + }, + { + "auxiliary_loss_clip": 0.01111455, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02386594, + "balance_loss_mlp": 1.03881633, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5798861838358007, + "language_loss": 0.77628905, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79777759, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8990, + "time_per_iteration": 2.489483118057251 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.01594377, + "balance_loss_mlp": 1.03673029, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.5931818725193578, + "language_loss": 0.69039345, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71173859, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8991, + "time_per_iteration": 2.4418294429779053 + }, + { + "auxiliary_loss_clip": 0.01109673, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.02171683, + "balance_loss_mlp": 1.03739381, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.7624988623501092, + "language_loss": 0.7614572, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78290933, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8992, + "time_per_iteration": 2.4845540523529053 + }, + { + "auxiliary_loss_clip": 0.01110684, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.03731656, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.6314606707027304, + "language_loss": 0.76393229, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78536171, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8993, + "time_per_iteration": 2.4074175357818604 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.02129519, + "balance_loss_mlp": 1.03785443, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6731423627794038, + "language_loss": 0.70444834, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72585857, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8994, + "time_per_iteration": 2.5207760334014893 + }, + { + "auxiliary_loss_clip": 0.01110631, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.01729715, + "balance_loss_mlp": 1.03817177, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.7966588085871025, + "language_loss": 0.74846065, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.76987815, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8995, + "time_per_iteration": 2.468820095062256 + }, + { + "auxiliary_loss_clip": 0.01107091, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02315211, + "balance_loss_mlp": 1.0381844, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.845320286189123, + "language_loss": 0.73867524, + "learning_rate": 1.832533059471282e-06, + "loss": 0.7600975, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8996, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.01105028, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02183414, + "balance_loss_mlp": 1.03760076, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.7779086932858201, + "language_loss": 0.73281908, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75420916, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8997, + "time_per_iteration": 2.433438301086426 + }, + { + "auxiliary_loss_clip": 0.01109644, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02052116, + "balance_loss_mlp": 1.03904319, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.01233035965423, + "language_loss": 0.71775877, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73919159, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8998, + "time_per_iteration": 2.4791901111602783 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.03724909, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.596226887866337, + "language_loss": 0.70601052, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72746068, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 8999, + "time_per_iteration": 2.6774816513061523 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.0222373, + "balance_loss_mlp": 1.03789854, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.5727427903087716, + "language_loss": 0.80433559, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.8257547, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 9000, + "time_per_iteration": 2.4608795642852783 + }, + { + "auxiliary_loss_clip": 0.0110639, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.01646805, + "balance_loss_mlp": 1.03770971, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.4688376580267075, + "language_loss": 0.72885478, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75021398, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 9001, + "time_per_iteration": 2.469433069229126 + }, + { + "auxiliary_loss_clip": 0.01112566, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.0213902, + "balance_loss_mlp": 1.03844023, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.257759724972284, + "language_loss": 0.85127461, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87275422, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 9002, + "time_per_iteration": 2.4405739307403564 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.02020574, + "balance_loss_mlp": 1.0384078, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7125809204353786, + "language_loss": 0.77755821, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.79894257, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 9003, + "time_per_iteration": 2.451507806777954 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.01557827, + "balance_loss_mlp": 1.03640234, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 2.168361582224207, + "language_loss": 0.69784325, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71918762, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 9004, + "time_per_iteration": 2.613961935043335 + }, + { + "auxiliary_loss_clip": 0.01028073, + "auxiliary_loss_mlp": 0.01010119, + "balance_loss_clip": 1.00899816, + "balance_loss_mlp": 1.00624812, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9677352946959291, + "language_loss": 0.59124619, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61162812, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21875, + "step": 9005, + "time_per_iteration": 3.175964832305908 + }, + { + "auxiliary_loss_clip": 0.01110665, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02183771, + "balance_loss_mlp": 1.03938627, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 1.6968329328942213, + "language_loss": 0.77685302, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79829788, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 9006, + "time_per_iteration": 2.455742359161377 + }, + { + "auxiliary_loss_clip": 0.01104494, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.02205062, + "balance_loss_mlp": 1.03625751, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.624690870596759, + "language_loss": 0.82998371, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.8513571, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.68359375, + "step": 9007, + "time_per_iteration": 2.4356093406677246 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03761423, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.1377427178959434, + "language_loss": 0.67209023, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69347185, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9008, + "time_per_iteration": 2.5489509105682373 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.01810145, + "balance_loss_mlp": 1.03802204, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.189253604566193, + "language_loss": 0.74129766, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76273382, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 9009, + "time_per_iteration": 3.8252077102661133 + }, + { + "auxiliary_loss_clip": 0.01110449, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.02109861, + "balance_loss_mlp": 1.03791738, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.141173328238238, + "language_loss": 0.87482637, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89627492, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 9010, + "time_per_iteration": 2.4628190994262695 + }, + { + "auxiliary_loss_clip": 0.01106778, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02007151, + "balance_loss_mlp": 1.03684556, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.9800903494769417, + "language_loss": 0.64830345, + "learning_rate": 1.826712372694122e-06, + "loss": 0.66969872, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9011, + "time_per_iteration": 2.530303955078125 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02463341, + "balance_loss_mlp": 1.03945065, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 3.61342010762258, + "language_loss": 0.79000378, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81146884, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9012, + "time_per_iteration": 5.477705240249634 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01720369, + "balance_loss_mlp": 1.0364089, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.7419259634167055, + "language_loss": 0.74031919, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76168299, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9013, + "time_per_iteration": 3.8720171451568604 + }, + { + "auxiliary_loss_clip": 0.01109547, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.01657844, + "balance_loss_mlp": 1.0367403, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.040050456437719, + "language_loss": 0.72289932, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74429148, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 9014, + "time_per_iteration": 2.436251640319824 + }, + { + "auxiliary_loss_clip": 0.01108382, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01889062, + "balance_loss_mlp": 1.03802454, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.601636110073364, + "language_loss": 0.80585766, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82725561, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9015, + "time_per_iteration": 2.4523091316223145 + }, + { + "auxiliary_loss_clip": 0.01112438, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.02298188, + "balance_loss_mlp": 1.03929543, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 3.6814275573944717, + "language_loss": 0.81413746, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83562374, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9016, + "time_per_iteration": 2.4310686588287354 + }, + { + "auxiliary_loss_clip": 0.01107219, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.03753281, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 2.1017981350927646, + "language_loss": 0.81103092, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83240461, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9017, + "time_per_iteration": 2.427536725997925 + }, + { + "auxiliary_loss_clip": 0.01104389, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.01719928, + "balance_loss_mlp": 1.03666961, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.7397815948262747, + "language_loss": 0.77372575, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79506552, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9018, + "time_per_iteration": 2.4533066749572754 + }, + { + "auxiliary_loss_clip": 0.01107196, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.02436996, + "balance_loss_mlp": 1.03481603, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.448924926163926, + "language_loss": 0.66352963, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68497658, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9019, + "time_per_iteration": 2.6830832958221436 + }, + { + "auxiliary_loss_clip": 0.01103655, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.0157038, + "balance_loss_mlp": 1.03604794, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5485094933207573, + "language_loss": 0.69635725, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71766162, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9020, + "time_per_iteration": 2.5516250133514404 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.02415812, + "balance_loss_mlp": 1.03544152, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.4647880942088878, + "language_loss": 0.80443847, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.825822, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 9021, + "time_per_iteration": 2.52411150932312 + }, + { + "auxiliary_loss_clip": 0.01107355, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.02107835, + "balance_loss_mlp": 1.03812504, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.483970922248673, + "language_loss": 0.78272343, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80413187, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9022, + "time_per_iteration": 2.4745841026306152 + }, + { + "auxiliary_loss_clip": 0.01104936, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02170706, + "balance_loss_mlp": 1.03559494, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.6624827413591161, + "language_loss": 0.82107073, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84245884, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9023, + "time_per_iteration": 2.4953298568725586 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01670718, + "balance_loss_mlp": 1.03815961, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.8210142178846183, + "language_loss": 0.71515894, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73651719, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9024, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01107389, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01878023, + "balance_loss_mlp": 1.03640127, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.659326462636006, + "language_loss": 0.64976329, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67114621, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9025, + "time_per_iteration": 2.512734889984131 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01776159, + "balance_loss_mlp": 1.0378685, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 6.402510966233504, + "language_loss": 0.74099922, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76238489, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9026, + "time_per_iteration": 2.42434024810791 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01040251, + "balance_loss_clip": 1.02587438, + "balance_loss_mlp": 1.03585124, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.637995325273745, + "language_loss": 0.78638506, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80786121, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 9027, + "time_per_iteration": 2.488490104675293 + }, + { + "auxiliary_loss_clip": 0.01029187, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.00479341, + "balance_loss_mlp": 1.00745916, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7366554152868067, + "language_loss": 0.56548405, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58583641, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21679688, + "step": 9028, + "time_per_iteration": 3.0799479484558105 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03760409, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.289578054979344, + "language_loss": 0.7793408, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80073547, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 9029, + "time_per_iteration": 2.454566478729248 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03734791, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.5369423730734595, + "language_loss": 0.83306921, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85440123, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 9030, + "time_per_iteration": 2.4675095081329346 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.01676893, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5422544284751551, + "language_loss": 0.74720484, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.76854396, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9031, + "time_per_iteration": 2.4871413707733154 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01883435, + "balance_loss_mlp": 1.03710687, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.9031998711979703, + "language_loss": 0.85544586, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87678427, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9032, + "time_per_iteration": 2.492750406265259 + }, + { + "auxiliary_loss_clip": 0.01110136, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.01670289, + "balance_loss_mlp": 1.03757548, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.71218946587007, + "language_loss": 0.73568988, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.75708508, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9033, + "time_per_iteration": 2.458281993865967 + }, + { + "auxiliary_loss_clip": 0.01106249, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02057564, + "balance_loss_mlp": 1.03709424, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6976408638259588, + "language_loss": 0.75797909, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77937472, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9034, + "time_per_iteration": 2.491690158843994 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.01932836, + "balance_loss_mlp": 1.03710067, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.7098309272106547, + "language_loss": 0.84488094, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86624634, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9035, + "time_per_iteration": 2.4352262020111084 + }, + { + "auxiliary_loss_clip": 0.01028064, + "auxiliary_loss_mlp": 0.01007827, + "balance_loss_clip": 1.00669503, + "balance_loss_mlp": 1.00628209, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7231810753813949, + "language_loss": 0.55908412, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57944304, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.21777344, + "step": 9036, + "time_per_iteration": 3.041694402694702 + }, + { + "auxiliary_loss_clip": 0.01108199, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02147961, + "balance_loss_mlp": 1.03686309, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.5099374695532384, + "language_loss": 0.75264686, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77407253, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9037, + "time_per_iteration": 2.4950051307678223 + }, + { + "auxiliary_loss_clip": 0.01106194, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.02301288, + "balance_loss_mlp": 1.03557479, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.5216693219084618, + "language_loss": 0.66438931, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68580532, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9038, + "time_per_iteration": 2.559807777404785 + }, + { + "auxiliary_loss_clip": 0.01103453, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.0184598, + "balance_loss_mlp": 1.03513312, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.8787316560909988, + "language_loss": 0.78100199, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80233729, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9039, + "time_per_iteration": 2.4654388427734375 + }, + { + "auxiliary_loss_clip": 0.01106931, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02066386, + "balance_loss_mlp": 1.03744531, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.8309305249268624, + "language_loss": 0.76449573, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78589433, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9040, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.0102829, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.00074422, + "balance_loss_mlp": 1.0065496, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6649082596858222, + "language_loss": 0.52501261, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54531443, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.21777344, + "step": 9041, + "time_per_iteration": 3.0513055324554443 + }, + { + "auxiliary_loss_clip": 0.01105303, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.02383089, + "balance_loss_mlp": 1.03610432, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.5670483715805776, + "language_loss": 0.76206207, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78348053, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9042, + "time_per_iteration": 2.4679293632507324 + }, + { + "auxiliary_loss_clip": 0.01102475, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.02001429, + "balance_loss_mlp": 1.03483939, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.637929025007711, + "language_loss": 0.67479855, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69613945, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 9043, + "time_per_iteration": 2.469393730163574 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.019876, + "balance_loss_mlp": 1.03556848, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.6229792564391676, + "language_loss": 0.8417449, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86307919, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9044, + "time_per_iteration": 2.4827311038970947 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.01375592, + "balance_loss_mlp": 1.03744245, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 4.385221285903045, + "language_loss": 0.6211096, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.6424917, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 9045, + "time_per_iteration": 2.5340473651885986 + }, + { + "auxiliary_loss_clip": 0.01108322, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01814771, + "balance_loss_mlp": 1.03780746, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.4286240482824728, + "language_loss": 0.69942701, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72081935, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9046, + "time_per_iteration": 2.4620296955108643 + }, + { + "auxiliary_loss_clip": 0.01104565, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01592338, + "balance_loss_mlp": 1.03681147, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 2.1944623143587667, + "language_loss": 0.77171725, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79304034, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9047, + "time_per_iteration": 2.4618160724639893 + }, + { + "auxiliary_loss_clip": 0.01107988, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0232358, + "balance_loss_mlp": 1.03817999, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7709524835714412, + "language_loss": 0.72530591, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74673903, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9048, + "time_per_iteration": 2.43306827545166 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.01999831, + "balance_loss_mlp": 1.03979266, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.1212679973875805, + "language_loss": 0.93380594, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95521486, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 9049, + "time_per_iteration": 2.4344465732574463 + }, + { + "auxiliary_loss_clip": 0.01102747, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01810968, + "balance_loss_mlp": 1.0347991, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.8375314287256255, + "language_loss": 0.73678643, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.75810736, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9050, + "time_per_iteration": 2.491992473602295 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.01630008, + "balance_loss_mlp": 1.0390985, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 1.7129729573051025, + "language_loss": 0.67238903, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69376987, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9051, + "time_per_iteration": 3.862109661102295 + }, + { + "auxiliary_loss_clip": 0.01104183, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.0174253, + "balance_loss_mlp": 1.03553367, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.6461015999412698, + "language_loss": 0.67748392, + "learning_rate": 1.810810185460011e-06, + "loss": 0.6988188, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9052, + "time_per_iteration": 2.5398967266082764 + }, + { + "auxiliary_loss_clip": 0.01108274, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.01914227, + "balance_loss_mlp": 1.03725493, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.7506645402052365, + "language_loss": 0.92625535, + "learning_rate": 1.810422473773436e-06, + "loss": 0.94765407, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9053, + "time_per_iteration": 2.4675142765045166 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02233112, + "balance_loss_mlp": 1.03685415, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.7890591975918206, + "language_loss": 0.83447516, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85590339, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9054, + "time_per_iteration": 5.314599275588989 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02034652, + "balance_loss_mlp": 1.04010189, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.3459133888285564, + "language_loss": 0.68981498, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.71124029, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9055, + "time_per_iteration": 3.926511287689209 + }, + { + "auxiliary_loss_clip": 0.01028465, + "auxiliary_loss_mlp": 0.00999723, + "balance_loss_clip": 0.99868602, + "balance_loss_mlp": 1.00688159, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7309752042107527, + "language_loss": 0.57659, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59687185, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.21582031, + "step": 9056, + "time_per_iteration": 3.0622963905334473 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01665783, + "balance_loss_mlp": 1.03565168, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.7313106745452744, + "language_loss": 0.69337952, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71474266, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9057, + "time_per_iteration": 2.4510855674743652 + }, + { + "auxiliary_loss_clip": 0.01106022, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.02320337, + "balance_loss_mlp": 1.03730392, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.1714933584662615, + "language_loss": 0.7508406, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77226055, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9058, + "time_per_iteration": 2.526362419128418 + }, + { + "auxiliary_loss_clip": 0.01028725, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00100732, + "balance_loss_mlp": 1.00713301, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7971345769694276, + "language_loss": 0.62662959, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64693761, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.21582031, + "step": 9059, + "time_per_iteration": 3.1505026817321777 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.01862383, + "balance_loss_mlp": 1.03710485, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.9373576881408119, + "language_loss": 0.791785, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81314969, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9060, + "time_per_iteration": 2.4754552841186523 + }, + { + "auxiliary_loss_clip": 0.01106659, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.02058554, + "balance_loss_mlp": 1.03625464, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.604299719110434, + "language_loss": 0.7939564, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81535506, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9061, + "time_per_iteration": 2.556467056274414 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.0152173, + "balance_loss_mlp": 1.03701198, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.7809339372629867, + "language_loss": 0.87091219, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89222574, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9062, + "time_per_iteration": 2.4758143424987793 + }, + { + "auxiliary_loss_clip": 0.01111266, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.01959074, + "balance_loss_mlp": 1.03804517, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.9589069040824287, + "language_loss": 0.82366961, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84511185, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 9063, + "time_per_iteration": 2.4351277351379395 + }, + { + "auxiliary_loss_clip": 0.01106592, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01580811, + "balance_loss_mlp": 1.0372479, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.809751627458355, + "language_loss": 0.63477433, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65612566, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9064, + "time_per_iteration": 2.5002574920654297 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01844406, + "balance_loss_mlp": 1.0378474, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.5950372697964212, + "language_loss": 0.79787326, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81927347, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9065, + "time_per_iteration": 2.485886335372925 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.01893747, + "balance_loss_mlp": 1.03695667, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 1.9866274876050938, + "language_loss": 0.78143919, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80277526, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9066, + "time_per_iteration": 2.4608097076416016 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.0161345, + "balance_loss_mlp": 1.03510523, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.7709941680506742, + "language_loss": 0.75842655, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.7797848, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9067, + "time_per_iteration": 2.4940598011016846 + }, + { + "auxiliary_loss_clip": 0.01114286, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02152777, + "balance_loss_mlp": 1.0393995, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.2574843156274, + "language_loss": 0.63637972, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65788054, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 9068, + "time_per_iteration": 2.570791244506836 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.03860283, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.608624941379858, + "language_loss": 0.7232843, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74469984, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9069, + "time_per_iteration": 2.49194073677063 + }, + { + "auxiliary_loss_clip": 0.01105915, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.01882815, + "balance_loss_mlp": 1.03988457, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.7038570560603954, + "language_loss": 0.74060583, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76196355, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9070, + "time_per_iteration": 2.4085381031036377 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02016675, + "balance_loss_mlp": 1.035869, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.9518916968876514, + "language_loss": 0.60487843, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62623858, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9071, + "time_per_iteration": 2.4736368656158447 + }, + { + "auxiliary_loss_clip": 0.01029891, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.00851762, + "balance_loss_mlp": 1.00855255, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.702361481728272, + "language_loss": 0.57095647, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59135079, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21386719, + "step": 9072, + "time_per_iteration": 3.1778738498687744 + }, + { + "auxiliary_loss_clip": 0.01104044, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.01834023, + "balance_loss_mlp": 1.03754437, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.6497532443668452, + "language_loss": 0.69947577, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72081387, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 9073, + "time_per_iteration": 2.414483070373535 + }, + { + "auxiliary_loss_clip": 0.01102116, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.02133226, + "balance_loss_mlp": 1.03575385, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.7860657423568516, + "language_loss": 0.71207851, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73342335, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 9074, + "time_per_iteration": 2.5126519203186035 + }, + { + "auxiliary_loss_clip": 0.01105462, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02541733, + "balance_loss_mlp": 1.03713095, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.7043380827263428, + "language_loss": 0.68845975, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70988691, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 9075, + "time_per_iteration": 2.4271233081817627 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.03828716, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 2.0277857780736155, + "language_loss": 0.804497, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.82584435, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66796875, + "step": 9076, + "time_per_iteration": 2.5117785930633545 + }, + { + "auxiliary_loss_clip": 0.01105415, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01892447, + "balance_loss_mlp": 1.03663969, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.583996751680831, + "language_loss": 0.80426413, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82562208, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9077, + "time_per_iteration": 2.4544837474823 + }, + { + "auxiliary_loss_clip": 0.0110649, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01880729, + "balance_loss_mlp": 1.03688538, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.9788210228225505, + "language_loss": 0.67737269, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69873917, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9078, + "time_per_iteration": 2.5323657989501953 + }, + { + "auxiliary_loss_clip": 0.01107395, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02072752, + "balance_loss_mlp": 1.03703523, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.8696943679753917, + "language_loss": 0.80740905, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82881159, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9079, + "time_per_iteration": 2.458158493041992 + }, + { + "auxiliary_loss_clip": 0.01110307, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01966298, + "balance_loss_mlp": 1.03775454, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.6840905516778153, + "language_loss": 0.75812018, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77954829, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9080, + "time_per_iteration": 2.4955971240997314 + }, + { + "auxiliary_loss_clip": 0.01108419, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.02007222, + "balance_loss_mlp": 1.03805685, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.4851521305720627, + "language_loss": 0.83080792, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85222232, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9081, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01722193, + "balance_loss_mlp": 1.03842843, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.5408403844848193, + "language_loss": 0.69658768, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.71798551, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9082, + "time_per_iteration": 2.472858428955078 + }, + { + "auxiliary_loss_clip": 0.01102277, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.01434886, + "balance_loss_mlp": 1.03546321, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.7415454834760362, + "language_loss": 0.66599333, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68727982, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 9083, + "time_per_iteration": 2.5756945610046387 + }, + { + "auxiliary_loss_clip": 0.01104147, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.01772594, + "balance_loss_mlp": 1.03678334, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.6516896910486423, + "language_loss": 0.78909004, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81042337, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 9084, + "time_per_iteration": 2.5361523628234863 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02093244, + "balance_loss_mlp": 1.03781819, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 2.0163372032767826, + "language_loss": 0.74970639, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77111256, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9085, + "time_per_iteration": 2.461916208267212 + }, + { + "auxiliary_loss_clip": 0.01107723, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01702785, + "balance_loss_mlp": 1.03705621, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 1.6682732441654566, + "language_loss": 0.74792248, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76929021, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9086, + "time_per_iteration": 2.530505657196045 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01707602, + "balance_loss_mlp": 1.03592753, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.5861549378759865, + "language_loss": 0.76987553, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79121786, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9087, + "time_per_iteration": 2.4786858558654785 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.03895903, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.736831801992395, + "language_loss": 0.77471095, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79616833, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9088, + "time_per_iteration": 2.450409173965454 + }, + { + "auxiliary_loss_clip": 0.01030156, + "auxiliary_loss_mlp": 0.01001999, + "balance_loss_clip": 1.0009743, + "balance_loss_mlp": 1.0086112, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7273835392783513, + "language_loss": 0.57771385, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59803545, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21484375, + "step": 9089, + "time_per_iteration": 3.1002800464630127 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.01875257, + "balance_loss_mlp": 1.03710759, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.6935215277859987, + "language_loss": 0.76448178, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78586286, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9090, + "time_per_iteration": 2.5178091526031494 + }, + { + "auxiliary_loss_clip": 0.0110913, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.0362854, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 2.128546091443876, + "language_loss": 0.73422724, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75567162, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9091, + "time_per_iteration": 2.4523463249206543 + }, + { + "auxiliary_loss_clip": 0.0110893, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02313828, + "balance_loss_mlp": 1.03835773, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.850730557544026, + "language_loss": 0.77855682, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.79999787, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9092, + "time_per_iteration": 2.463998556137085 + }, + { + "auxiliary_loss_clip": 0.01110185, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.01975393, + "balance_loss_mlp": 1.03879404, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 1.992080116269468, + "language_loss": 0.74526983, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76669919, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 9093, + "time_per_iteration": 3.8121659755706787 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.03643596, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 1.8684331289519012, + "language_loss": 0.69012475, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71159303, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9094, + "time_per_iteration": 2.406708240509033 + }, + { + "auxiliary_loss_clip": 0.0110964, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.02463508, + "balance_loss_mlp": 1.0408746, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.1943674750228426, + "language_loss": 0.68355155, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70500696, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6875, + "step": 9095, + "time_per_iteration": 2.4663615226745605 + }, + { + "auxiliary_loss_clip": 0.0110876, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.0244838, + "balance_loss_mlp": 1.04013026, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.544968347193232, + "language_loss": 0.66645032, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6878978, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9096, + "time_per_iteration": 5.378362417221069 + }, + { + "auxiliary_loss_clip": 0.01030132, + "auxiliary_loss_mlp": 0.00998409, + "balance_loss_clip": 0.99731266, + "balance_loss_mlp": 1.00865221, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7389922300516351, + "language_loss": 0.57573926, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59602463, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.21484375, + "step": 9097, + "time_per_iteration": 3.168614387512207 + }, + { + "auxiliary_loss_clip": 0.01030189, + "auxiliary_loss_mlp": 0.01002061, + "balance_loss_clip": 1.00105369, + "balance_loss_mlp": 1.00863671, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9052213801384115, + "language_loss": 0.64790761, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66823018, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.21484375, + "step": 9098, + "time_per_iteration": 3.01711106300354 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02399004, + "balance_loss_mlp": 1.03762555, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.9907442514686344, + "language_loss": 0.73179287, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75324905, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9099, + "time_per_iteration": 2.50752592086792 + }, + { + "auxiliary_loss_clip": 0.01105594, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03749669, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9036037415187144, + "language_loss": 0.72414565, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74548817, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 9100, + "time_per_iteration": 2.5455925464630127 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01381898, + "balance_loss_mlp": 1.03679228, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.608228209483335, + "language_loss": 0.67675304, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69807637, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9101, + "time_per_iteration": 2.638460397720337 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01515532, + "balance_loss_mlp": 1.03816807, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.6461027740418694, + "language_loss": 0.78004694, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80140156, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9102, + "time_per_iteration": 2.515669822692871 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.02434635, + "balance_loss_mlp": 1.04091179, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.641023318874669, + "language_loss": 0.72358656, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74505031, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9103, + "time_per_iteration": 2.516160249710083 + }, + { + "auxiliary_loss_clip": 0.0110583, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01571035, + "balance_loss_mlp": 1.03704, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3192542299458547, + "language_loss": 0.65333968, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.674676, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9104, + "time_per_iteration": 2.8076846599578857 + }, + { + "auxiliary_loss_clip": 0.01110613, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.0188477, + "balance_loss_mlp": 1.03879666, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.7582225342351636, + "language_loss": 0.81346989, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83489728, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9105, + "time_per_iteration": 2.4436333179473877 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.01836777, + "balance_loss_mlp": 1.03727031, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.5498107295674015, + "language_loss": 0.80534816, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82670921, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9106, + "time_per_iteration": 2.5293564796447754 + }, + { + "auxiliary_loss_clip": 0.01108965, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.02120996, + "balance_loss_mlp": 1.03986609, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7454593746340303, + "language_loss": 0.69378364, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71519959, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9107, + "time_per_iteration": 2.423023223876953 + }, + { + "auxiliary_loss_clip": 0.01110146, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.01680255, + "balance_loss_mlp": 1.03831339, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.6483473327352183, + "language_loss": 0.63088882, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65228057, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 9108, + "time_per_iteration": 2.4629247188568115 + }, + { + "auxiliary_loss_clip": 0.01106827, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.01526904, + "balance_loss_mlp": 1.03832912, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6809972098624877, + "language_loss": 0.74894333, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77027702, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 9109, + "time_per_iteration": 2.445711851119995 + }, + { + "auxiliary_loss_clip": 0.01105646, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.02130747, + "balance_loss_mlp": 1.03783536, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.9460400321268034, + "language_loss": 0.77668434, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79807919, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 9110, + "time_per_iteration": 2.4724810123443604 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.02193475, + "balance_loss_mlp": 1.03772378, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.7745449116751173, + "language_loss": 0.71189445, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73327577, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9111, + "time_per_iteration": 2.5220110416412354 + }, + { + "auxiliary_loss_clip": 0.01108238, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.02525544, + "balance_loss_mlp": 1.03890049, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5754245119869974, + "language_loss": 0.71029758, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73174989, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 9112, + "time_per_iteration": 2.4876022338867188 + }, + { + "auxiliary_loss_clip": 0.01108992, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.0176518, + "balance_loss_mlp": 1.03795052, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.4321144529101946, + "language_loss": 0.88027447, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90165925, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 9113, + "time_per_iteration": 2.4495129585266113 + }, + { + "auxiliary_loss_clip": 0.01110892, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.01863575, + "balance_loss_mlp": 1.04015231, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.4380357531145453, + "language_loss": 0.73040199, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75182521, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9114, + "time_per_iteration": 2.49124813079834 + }, + { + "auxiliary_loss_clip": 0.0110468, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03658402, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.7175878836105734, + "language_loss": 0.72105908, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74242127, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9115, + "time_per_iteration": 2.4818665981292725 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.01915491, + "balance_loss_mlp": 1.03801298, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.8153830213846445, + "language_loss": 0.7222048, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74362183, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9116, + "time_per_iteration": 2.4857382774353027 + }, + { + "auxiliary_loss_clip": 0.01108168, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.02634406, + "balance_loss_mlp": 1.03931904, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 2.1442712779415025, + "language_loss": 0.76391387, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78538126, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9117, + "time_per_iteration": 2.481539726257324 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.02069592, + "balance_loss_mlp": 1.03559899, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.6184993035700161, + "language_loss": 0.62667149, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64801455, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9118, + "time_per_iteration": 2.582087516784668 + }, + { + "auxiliary_loss_clip": 0.01112715, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01705241, + "balance_loss_mlp": 1.04148602, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 2.080656601028848, + "language_loss": 0.79054701, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81197661, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9119, + "time_per_iteration": 2.431641101837158 + }, + { + "auxiliary_loss_clip": 0.01106769, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.02143443, + "balance_loss_mlp": 1.0393101, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.6818671426073972, + "language_loss": 0.82585561, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84724402, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9120, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02538443, + "balance_loss_mlp": 1.03979588, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.7397757233914666, + "language_loss": 0.80841327, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82989895, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9121, + "time_per_iteration": 2.449951171875 + }, + { + "auxiliary_loss_clip": 0.01108531, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02164376, + "balance_loss_mlp": 1.03663361, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 2.0253856212842662, + "language_loss": 0.61077833, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63220894, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9122, + "time_per_iteration": 2.4943363666534424 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.02135706, + "balance_loss_mlp": 1.03908038, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7986157880414966, + "language_loss": 0.71862841, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74002087, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.68359375, + "step": 9123, + "time_per_iteration": 2.4815285205841064 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.01702476, + "balance_loss_mlp": 1.03875828, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 1.9471016807647592, + "language_loss": 0.83393133, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.8552959, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9124, + "time_per_iteration": 2.442490816116333 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02254486, + "balance_loss_mlp": 1.04040182, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.9388864941150135, + "language_loss": 0.79954362, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82098156, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9125, + "time_per_iteration": 2.5117273330688477 + }, + { + "auxiliary_loss_clip": 0.01109363, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.01844716, + "balance_loss_mlp": 1.03870225, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.35248102892353, + "language_loss": 0.74499249, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76639402, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9126, + "time_per_iteration": 2.481576442718506 + }, + { + "auxiliary_loss_clip": 0.01110687, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.01794803, + "balance_loss_mlp": 1.03789783, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.4816786154583212, + "language_loss": 0.66715956, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68857968, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9127, + "time_per_iteration": 2.462186574935913 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02183664, + "balance_loss_mlp": 1.03737557, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.7392555793748137, + "language_loss": 0.83598024, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85740006, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9128, + "time_per_iteration": 2.4559218883514404 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01948333, + "balance_loss_mlp": 1.03735828, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.8252742071628254, + "language_loss": 0.74370325, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76509559, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 9129, + "time_per_iteration": 2.443394422531128 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.0391345, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.8843985474075557, + "language_loss": 0.6325981, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65401739, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9130, + "time_per_iteration": 2.424933433532715 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02319455, + "balance_loss_mlp": 1.03782725, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 2.1259011139704804, + "language_loss": 0.62936115, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65081537, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9131, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.01109907, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01925564, + "balance_loss_mlp": 1.03880227, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.7299030045344002, + "language_loss": 0.74452615, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76594955, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9132, + "time_per_iteration": 2.456127166748047 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.01656091, + "balance_loss_mlp": 1.03589082, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.6111198761107228, + "language_loss": 0.8129831, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83432209, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.703125, + "step": 9133, + "time_per_iteration": 2.490236759185791 + }, + { + "auxiliary_loss_clip": 0.01106997, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02131128, + "balance_loss_mlp": 1.03802598, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.7268592344479874, + "language_loss": 0.70094633, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72235036, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9134, + "time_per_iteration": 3.827064275741577 + }, + { + "auxiliary_loss_clip": 0.01110087, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02014494, + "balance_loss_mlp": 1.03806603, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 1.744868024388231, + "language_loss": 0.61109304, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63251662, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 9135, + "time_per_iteration": 2.730273723602295 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02089787, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.7368953039767876, + "language_loss": 0.72582811, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74728173, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9136, + "time_per_iteration": 2.483704090118408 + }, + { + "auxiliary_loss_clip": 0.01111013, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.02256799, + "balance_loss_mlp": 1.03636873, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 3.852349726597511, + "language_loss": 0.68771708, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70918733, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 9137, + "time_per_iteration": 5.456461191177368 + }, + { + "auxiliary_loss_clip": 0.01031834, + "auxiliary_loss_mlp": 0.01007044, + "balance_loss_clip": 1.00602436, + "balance_loss_mlp": 1.01015878, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.9040496486989937, + "language_loss": 0.6527245, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.21679688, + "step": 9138, + "time_per_iteration": 4.559895753860474 + }, + { + "auxiliary_loss_clip": 0.01109871, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.01828778, + "balance_loss_mlp": 1.03911173, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.6793798945838962, + "language_loss": 0.74981934, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7712279, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9139, + "time_per_iteration": 2.4897236824035645 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03827238, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.7331605634368676, + "language_loss": 0.71274745, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73412126, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9140, + "time_per_iteration": 2.416760206222534 + }, + { + "auxiliary_loss_clip": 0.01105846, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.02035391, + "balance_loss_mlp": 1.03625703, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.6373657351429003, + "language_loss": 0.76304853, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78444046, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9141, + "time_per_iteration": 2.495957612991333 + }, + { + "auxiliary_loss_clip": 0.01104653, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01651192, + "balance_loss_mlp": 1.03816998, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.8000642859490852, + "language_loss": 0.74711812, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.76845098, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 9142, + "time_per_iteration": 2.476701259613037 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.01920366, + "balance_loss_mlp": 1.04044414, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 3.087747357168804, + "language_loss": 0.76516807, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78662473, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 9143, + "time_per_iteration": 2.4777820110321045 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.0188787, + "balance_loss_mlp": 1.03639066, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 4.124964872446098, + "language_loss": 0.79934669, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82070994, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 9144, + "time_per_iteration": 2.470946788787842 + }, + { + "auxiliary_loss_clip": 0.01109215, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.0163275, + "balance_loss_mlp": 1.03886819, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 2.259125962742438, + "language_loss": 0.71273595, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73411608, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9145, + "time_per_iteration": 2.5155293941497803 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03797007, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.760392083970442, + "language_loss": 0.70398986, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72534567, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 9146, + "time_per_iteration": 2.5837745666503906 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.01897275, + "balance_loss_mlp": 1.03747129, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.7328002119898687, + "language_loss": 0.6403445, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66168791, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 9147, + "time_per_iteration": 2.5004754066467285 + }, + { + "auxiliary_loss_clip": 0.01110115, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01714349, + "balance_loss_mlp": 1.04033351, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.3129813772985854, + "language_loss": 0.80632472, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82771873, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9148, + "time_per_iteration": 2.4941914081573486 + }, + { + "auxiliary_loss_clip": 0.01109987, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.0224669, + "balance_loss_mlp": 1.04013515, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.5952381042001647, + "language_loss": 0.78739786, + "learning_rate": 1.773237789559453e-06, + "loss": 0.80884099, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69921875, + "step": 9149, + "time_per_iteration": 2.5276949405670166 + }, + { + "auxiliary_loss_clip": 0.01108964, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.01695323, + "balance_loss_mlp": 1.03880644, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.0296810240639847, + "language_loss": 0.72119236, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74257326, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9150, + "time_per_iteration": 2.4646284580230713 + }, + { + "auxiliary_loss_clip": 0.01110946, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01450694, + "balance_loss_mlp": 1.03812099, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.6901514106805953, + "language_loss": 0.74800563, + "learning_rate": 1.772463906245477e-06, + "loss": 0.76939499, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9151, + "time_per_iteration": 2.4528467655181885 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.01572907, + "balance_loss_mlp": 1.03945291, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.835684303690663, + "language_loss": 0.76049578, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78186262, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9152, + "time_per_iteration": 2.4587628841400146 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.01691318, + "balance_loss_mlp": 1.03700173, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.7890824738540096, + "language_loss": 0.82162666, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84296966, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 9153, + "time_per_iteration": 2.490391492843628 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.02004111, + "balance_loss_mlp": 1.03787208, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7732052023343188, + "language_loss": 0.74143934, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76283687, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9154, + "time_per_iteration": 2.5304152965545654 + }, + { + "auxiliary_loss_clip": 0.01113689, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02184761, + "balance_loss_mlp": 1.04016376, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.4983591953206352, + "language_loss": 0.7257731, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74726045, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9155, + "time_per_iteration": 2.4642586708068848 + }, + { + "auxiliary_loss_clip": 0.01033812, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 1.00080609, + "balance_loss_mlp": 1.01202416, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7480439065154532, + "language_loss": 0.55414248, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57449913, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.21777344, + "step": 9156, + "time_per_iteration": 3.184554100036621 + }, + { + "auxiliary_loss_clip": 0.0110658, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01741338, + "balance_loss_mlp": 1.0373919, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.690497670143624, + "language_loss": 0.82608092, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84744143, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9157, + "time_per_iteration": 2.4718377590179443 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01764512, + "balance_loss_mlp": 1.04140961, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.5846917450647138, + "language_loss": 0.75262648, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77409017, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9158, + "time_per_iteration": 2.483400583267212 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01863742, + "balance_loss_mlp": 1.0392096, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.6248211907364027, + "language_loss": 0.69624805, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71761608, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 9159, + "time_per_iteration": 2.5159049034118652 + }, + { + "auxiliary_loss_clip": 0.01110817, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01913416, + "balance_loss_mlp": 1.03923249, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.7392637683079002, + "language_loss": 0.67766821, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.69909644, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9160, + "time_per_iteration": 2.5915122032165527 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.02106166, + "balance_loss_mlp": 1.03855252, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.9414097965551829, + "language_loss": 0.71404171, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7354309, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9161, + "time_per_iteration": 2.4698691368103027 + }, + { + "auxiliary_loss_clip": 0.0110819, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.02365494, + "balance_loss_mlp": 1.03864145, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 2.0077015754602985, + "language_loss": 0.69346386, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71490568, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9162, + "time_per_iteration": 2.514615297317505 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.02538323, + "balance_loss_mlp": 1.03850245, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6272332912595904, + "language_loss": 0.8531208, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87457901, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 9163, + "time_per_iteration": 2.55450439453125 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02065635, + "balance_loss_mlp": 1.0394969, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.5452929110279412, + "language_loss": 0.8063103, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.8277117, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9164, + "time_per_iteration": 2.477283239364624 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0152092, + "balance_loss_mlp": 1.04160368, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.8276675469309818, + "language_loss": 0.73409986, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75550359, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9165, + "time_per_iteration": 2.4870002269744873 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01700497, + "balance_loss_mlp": 1.03732443, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.8849650051461906, + "language_loss": 0.79019225, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81153595, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9166, + "time_per_iteration": 2.435049295425415 + }, + { + "auxiliary_loss_clip": 0.01108748, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.01596665, + "balance_loss_mlp": 1.03822398, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.033929506473001, + "language_loss": 0.76165509, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78302646, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9167, + "time_per_iteration": 2.474677562713623 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01631832, + "balance_loss_mlp": 1.03744709, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.261050601267758, + "language_loss": 0.79845661, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.81980425, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9168, + "time_per_iteration": 2.484435796737671 + }, + { + "auxiliary_loss_clip": 0.01110227, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.03901529, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.641322965099804, + "language_loss": 0.68934894, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71080542, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9169, + "time_per_iteration": 2.5206069946289062 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.014714, + "balance_loss_mlp": 1.03545678, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.0185216192280553, + "language_loss": 0.85350084, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87478477, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 9170, + "time_per_iteration": 2.4762823581695557 + }, + { + "auxiliary_loss_clip": 0.01031617, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.00143194, + "balance_loss_mlp": 1.00984073, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7807167648980764, + "language_loss": 0.5990442, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61938488, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21777344, + "step": 9171, + "time_per_iteration": 3.0934739112854004 + }, + { + "auxiliary_loss_clip": 0.01106302, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.02024603, + "balance_loss_mlp": 1.03768301, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.4242208217777272, + "language_loss": 0.701002, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72239029, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9172, + "time_per_iteration": 2.482672929763794 + }, + { + "auxiliary_loss_clip": 0.01104259, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.0176115, + "balance_loss_mlp": 1.03602123, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.708440744181033, + "language_loss": 0.75790203, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77924281, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9173, + "time_per_iteration": 2.476710557937622 + }, + { + "auxiliary_loss_clip": 0.01104019, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.02019644, + "balance_loss_mlp": 1.0371182, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.5740431144983165, + "language_loss": 0.74457419, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.76594019, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 9174, + "time_per_iteration": 2.4599406719207764 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.0173409, + "balance_loss_mlp": 1.03827941, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.784111045924148, + "language_loss": 0.72615731, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74753261, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9175, + "time_per_iteration": 2.5028982162475586 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02403331, + "balance_loss_mlp": 1.0378927, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8209397746213287, + "language_loss": 0.69452918, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71596849, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9176, + "time_per_iteration": 3.852022171020508 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01612973, + "balance_loss_mlp": 1.03734601, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.7630507090786165, + "language_loss": 0.70797551, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.7293011, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9177, + "time_per_iteration": 2.507990837097168 + }, + { + "auxiliary_loss_clip": 0.01109377, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01644588, + "balance_loss_mlp": 1.03980064, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.556329351454275, + "language_loss": 0.80197215, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82334423, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 9178, + "time_per_iteration": 2.4645802974700928 + }, + { + "auxiliary_loss_clip": 0.01110368, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.02744687, + "balance_loss_mlp": 1.03942454, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.5358645892565401, + "language_loss": 0.74621391, + "learning_rate": 1.761633217089826e-06, + "loss": 0.7677213, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9179, + "time_per_iteration": 4.023995399475098 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02213681, + "balance_loss_mlp": 1.0385108, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.8924336027697886, + "language_loss": 0.70433038, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72574437, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 9180, + "time_per_iteration": 4.060170650482178 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.02197158, + "balance_loss_mlp": 1.03808069, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.9150410275355574, + "language_loss": 0.66870642, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69012666, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9181, + "time_per_iteration": 2.4741644859313965 + }, + { + "auxiliary_loss_clip": 0.01109873, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01514149, + "balance_loss_mlp": 1.03774214, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9118124234638791, + "language_loss": 0.79398257, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81536245, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9182, + "time_per_iteration": 2.4744672775268555 + }, + { + "auxiliary_loss_clip": 0.01107607, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.0145787, + "balance_loss_mlp": 1.03817368, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.7815316362256517, + "language_loss": 0.82710314, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.84845054, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9183, + "time_per_iteration": 2.4999542236328125 + }, + { + "auxiliary_loss_clip": 0.01106614, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01474106, + "balance_loss_mlp": 1.03841662, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.3300741669264389, + "language_loss": 0.67200708, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69333941, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9184, + "time_per_iteration": 2.4747231006622314 + }, + { + "auxiliary_loss_clip": 0.01107758, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.01336932, + "balance_loss_mlp": 1.03818047, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.521307728440283, + "language_loss": 0.76197934, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78331435, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9185, + "time_per_iteration": 2.534573793411255 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.02194285, + "balance_loss_mlp": 1.0396924, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6519250451143856, + "language_loss": 0.7376985, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75913298, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9186, + "time_per_iteration": 2.5148305892944336 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.02137351, + "balance_loss_mlp": 1.04041481, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.3297788732806275, + "language_loss": 0.6611231, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68256783, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.7109375, + "step": 9187, + "time_per_iteration": 2.4953529834747314 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01550388, + "balance_loss_mlp": 1.0389905, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.699111440652827, + "language_loss": 0.77629888, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79766524, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9188, + "time_per_iteration": 2.4593770503997803 + }, + { + "auxiliary_loss_clip": 0.01105648, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.01829576, + "balance_loss_mlp": 1.03729725, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.837373875573988, + "language_loss": 0.81666493, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83803099, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 9189, + "time_per_iteration": 2.514223098754883 + }, + { + "auxiliary_loss_clip": 0.01105635, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.0153625, + "balance_loss_mlp": 1.03796136, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.3687672594772107, + "language_loss": 0.76419669, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78553367, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 9190, + "time_per_iteration": 2.4991939067840576 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.01837981, + "balance_loss_mlp": 1.03823757, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 3.1168017297152484, + "language_loss": 0.78959441, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81102753, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9191, + "time_per_iteration": 2.447239875793457 + }, + { + "auxiliary_loss_clip": 0.01106392, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01799178, + "balance_loss_mlp": 1.03781414, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.1697062429363427, + "language_loss": 0.68734175, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70870626, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 9192, + "time_per_iteration": 2.424194812774658 + }, + { + "auxiliary_loss_clip": 0.01104657, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.0210079, + "balance_loss_mlp": 1.03741503, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.580245881596358, + "language_loss": 0.77429307, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79565763, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 9193, + "time_per_iteration": 2.486544370651245 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02199149, + "balance_loss_mlp": 1.03775311, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.6936547327162281, + "language_loss": 0.78554469, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80694956, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9194, + "time_per_iteration": 2.446010112762451 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01032697, + "balance_loss_clip": 1.01982856, + "balance_loss_mlp": 1.03737998, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.6547854303314034, + "language_loss": 0.69580936, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71725023, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 9195, + "time_per_iteration": 2.633622407913208 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.01669717, + "balance_loss_mlp": 1.0401336, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.085899367605988, + "language_loss": 0.73877811, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76022422, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 9196, + "time_per_iteration": 2.4477953910827637 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.0202986, + "balance_loss_mlp": 1.03845131, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5760086547957552, + "language_loss": 0.76767844, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78907609, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9197, + "time_per_iteration": 2.4946064949035645 + }, + { + "auxiliary_loss_clip": 0.01104392, + "auxiliary_loss_mlp": 0.0102516, + "balance_loss_clip": 1.01429963, + "balance_loss_mlp": 1.03566051, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.6045583807501234, + "language_loss": 0.76419538, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78549087, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6875, + "step": 9198, + "time_per_iteration": 2.7027511596679688 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.0140028, + "balance_loss_mlp": 1.03652, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.7911524754161214, + "language_loss": 0.79089695, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81220573, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 9199, + "time_per_iteration": 2.5071682929992676 + }, + { + "auxiliary_loss_clip": 0.01106031, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03667951, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.789754163992573, + "language_loss": 0.64116317, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66252816, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9200, + "time_per_iteration": 2.453810214996338 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.01444292, + "balance_loss_mlp": 1.03949916, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.54627322023295, + "language_loss": 0.66172588, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.6831286, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9201, + "time_per_iteration": 2.5050048828125 + }, + { + "auxiliary_loss_clip": 0.01110041, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.02000964, + "balance_loss_mlp": 1.04039264, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.1300156031813624, + "language_loss": 0.60931027, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63073778, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9202, + "time_per_iteration": 2.454374074935913 + }, + { + "auxiliary_loss_clip": 0.01105546, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.01677179, + "balance_loss_mlp": 1.0374378, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.6333926311503897, + "language_loss": 0.64007318, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66141224, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9203, + "time_per_iteration": 2.520813226699829 + }, + { + "auxiliary_loss_clip": 0.01106796, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03710103, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5876710884236471, + "language_loss": 0.63839149, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65975416, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9204, + "time_per_iteration": 2.519796371459961 + }, + { + "auxiliary_loss_clip": 0.01103569, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01617515, + "balance_loss_mlp": 1.0357914, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.7042490030554438, + "language_loss": 0.77431834, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79562324, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 9205, + "time_per_iteration": 2.5149800777435303 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01799703, + "balance_loss_mlp": 1.03753543, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.5447277527142993, + "language_loss": 0.72338134, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74473095, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9206, + "time_per_iteration": 2.6088132858276367 + }, + { + "auxiliary_loss_clip": 0.01107088, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01891243, + "balance_loss_mlp": 1.03847539, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.9679878300179545, + "language_loss": 0.75601065, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77738333, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 9207, + "time_per_iteration": 2.4550647735595703 + }, + { + "auxiliary_loss_clip": 0.01112139, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.01861894, + "balance_loss_mlp": 1.03909707, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.4900859433120055, + "language_loss": 0.61790574, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6393466, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 9208, + "time_per_iteration": 2.4474070072174072 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01908827, + "balance_loss_mlp": 1.03917742, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.138498398763569, + "language_loss": 0.64059991, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66200066, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9209, + "time_per_iteration": 2.49118709564209 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02030945, + "balance_loss_mlp": 1.03779769, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.9091325066097349, + "language_loss": 0.8244276, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84582424, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9210, + "time_per_iteration": 2.479508876800537 + }, + { + "auxiliary_loss_clip": 0.01104462, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.01554608, + "balance_loss_mlp": 1.03640354, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.9903415105614328, + "language_loss": 0.72810864, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74942476, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9211, + "time_per_iteration": 2.56174635887146 + }, + { + "auxiliary_loss_clip": 0.01109862, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.02101886, + "balance_loss_mlp": 1.03795481, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.687820261734967, + "language_loss": 0.66492426, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68636549, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9212, + "time_per_iteration": 2.4493961334228516 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.01846039, + "balance_loss_mlp": 1.03564453, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.478127311181698, + "language_loss": 0.51676697, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53816813, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 9213, + "time_per_iteration": 2.5872037410736084 + }, + { + "auxiliary_loss_clip": 0.01111386, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.01868105, + "balance_loss_mlp": 1.03979373, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.9151587743929102, + "language_loss": 0.8548407, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87626791, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9214, + "time_per_iteration": 2.4696502685546875 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.01570582, + "balance_loss_mlp": 1.03970075, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.700191688942819, + "language_loss": 0.70016778, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72152174, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9215, + "time_per_iteration": 2.50022029876709 + }, + { + "auxiliary_loss_clip": 0.01109258, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.01724386, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.5266679061001223, + "language_loss": 0.73124695, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75263906, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9216, + "time_per_iteration": 2.4683403968811035 + }, + { + "auxiliary_loss_clip": 0.01105693, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.01916385, + "balance_loss_mlp": 1.03830385, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.9596921442179602, + "language_loss": 0.71501839, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73638952, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 9217, + "time_per_iteration": 2.542431592941284 + }, + { + "auxiliary_loss_clip": 0.01103432, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.01331282, + "balance_loss_mlp": 1.03553486, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.8113809838055568, + "language_loss": 0.7838676, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80515093, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9218, + "time_per_iteration": 3.8476054668426514 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.02095163, + "balance_loss_mlp": 1.03540277, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 2.0355993872839675, + "language_loss": 0.72591358, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74734467, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9219, + "time_per_iteration": 2.4924545288085938 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01039554, + "balance_loss_clip": 1.02701962, + "balance_loss_mlp": 1.03986812, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.546677051774663, + "language_loss": 0.71403503, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73554587, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 9220, + "time_per_iteration": 2.4362480640411377 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.01424217, + "balance_loss_mlp": 1.03777957, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6357699921116782, + "language_loss": 0.79294407, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81426674, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9221, + "time_per_iteration": 5.3692920207977295 + }, + { + "auxiliary_loss_clip": 0.01107012, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.01745725, + "balance_loss_mlp": 1.03750253, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.7434924477802918, + "language_loss": 0.83865321, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86002505, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9222, + "time_per_iteration": 2.5054023265838623 + }, + { + "auxiliary_loss_clip": 0.0111308, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.02092493, + "balance_loss_mlp": 1.04003119, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.7723513069494143, + "language_loss": 0.75498754, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77646095, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9223, + "time_per_iteration": 2.5140554904937744 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.0155921, + "balance_loss_mlp": 1.03917074, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.798104527740367, + "language_loss": 0.81975842, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84112704, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9224, + "time_per_iteration": 2.5273303985595703 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01040562, + "balance_loss_clip": 1.02769315, + "balance_loss_mlp": 1.0393647, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.764116317399656, + "language_loss": 0.5700891, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59160185, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9225, + "time_per_iteration": 2.4379100799560547 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.02342129, + "balance_loss_mlp": 1.03836024, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5085866030732613, + "language_loss": 0.67495418, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69641924, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9226, + "time_per_iteration": 2.4891088008880615 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.01961827, + "balance_loss_mlp": 1.03644681, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4051697234065024, + "language_loss": 0.74315172, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76454705, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9227, + "time_per_iteration": 2.4678173065185547 + }, + { + "auxiliary_loss_clip": 0.01114145, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01826084, + "balance_loss_mlp": 1.04228091, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 2.5448731753452405, + "language_loss": 0.73452151, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75597215, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9228, + "time_per_iteration": 2.4851813316345215 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01803231, + "balance_loss_mlp": 1.03902888, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 2.153919283771507, + "language_loss": 0.76069826, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.7821005, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9229, + "time_per_iteration": 2.4682509899139404 + }, + { + "auxiliary_loss_clip": 0.01110192, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.02623343, + "balance_loss_mlp": 1.03956127, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.3529022003633056, + "language_loss": 0.68695533, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70845366, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 9230, + "time_per_iteration": 2.4558916091918945 + }, + { + "auxiliary_loss_clip": 0.01110086, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.02054107, + "balance_loss_mlp": 1.03759503, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.0513203800368327, + "language_loss": 0.67574155, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69717568, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9231, + "time_per_iteration": 2.4816246032714844 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.02160072, + "balance_loss_mlp": 1.03941798, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.5458592279354035, + "language_loss": 0.77953124, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80101693, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9232, + "time_per_iteration": 2.5161256790161133 + }, + { + "auxiliary_loss_clip": 0.01106102, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02622199, + "balance_loss_mlp": 1.03777027, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.5305081634070101, + "language_loss": 0.82585824, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84729433, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 9233, + "time_per_iteration": 2.513498306274414 + }, + { + "auxiliary_loss_clip": 0.01112184, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.02717805, + "balance_loss_mlp": 1.03902006, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.1768956460608053, + "language_loss": 0.75171268, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77322543, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 9234, + "time_per_iteration": 2.4618585109710693 + }, + { + "auxiliary_loss_clip": 0.01105123, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.01663136, + "balance_loss_mlp": 1.03685272, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.1362991517660146, + "language_loss": 0.64992738, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.6712625, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9235, + "time_per_iteration": 2.4449851512908936 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.01977587, + "balance_loss_mlp": 1.03794515, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.8479272776295672, + "language_loss": 0.67863953, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70005023, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9236, + "time_per_iteration": 2.4798662662506104 + }, + { + "auxiliary_loss_clip": 0.01104311, + "auxiliary_loss_mlp": 0.01024908, + "balance_loss_clip": 1.0127244, + "balance_loss_mlp": 1.03731084, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 3.129052058582791, + "language_loss": 0.86174095, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88303316, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9237, + "time_per_iteration": 2.4789483547210693 + }, + { + "auxiliary_loss_clip": 0.01104495, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.02062178, + "balance_loss_mlp": 1.03669763, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.712591160520522, + "language_loss": 0.73281908, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75419307, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9238, + "time_per_iteration": 2.4812166690826416 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.01997221, + "balance_loss_mlp": 1.03750467, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.5735650405734192, + "language_loss": 0.78268331, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80410492, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9239, + "time_per_iteration": 2.6846883296966553 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.01580429, + "balance_loss_mlp": 1.03730011, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.4802036052022307, + "language_loss": 0.79760826, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.81896698, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9240, + "time_per_iteration": 2.4733242988586426 + }, + { + "auxiliary_loss_clip": 0.01107185, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02035236, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.5810234034759716, + "language_loss": 0.6520583, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67345387, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9241, + "time_per_iteration": 2.4733994007110596 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.03843307, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.9354963557050642, + "language_loss": 0.72742647, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74883944, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9242, + "time_per_iteration": 2.439195394515991 + }, + { + "auxiliary_loss_clip": 0.01109113, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02429366, + "balance_loss_mlp": 1.03737354, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.6615305539564786, + "language_loss": 0.63989079, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66135651, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9243, + "time_per_iteration": 2.5009653568267822 + }, + { + "auxiliary_loss_clip": 0.01109943, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01894689, + "balance_loss_mlp": 1.03998828, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.8112849174534187, + "language_loss": 0.75149089, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77290273, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9244, + "time_per_iteration": 2.475520610809326 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.01693511, + "balance_loss_mlp": 1.03605533, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.1432873648263473, + "language_loss": 0.74578094, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76708734, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9245, + "time_per_iteration": 2.45875883102417 + }, + { + "auxiliary_loss_clip": 0.01111156, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.01832068, + "balance_loss_mlp": 1.03885865, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.0585608296199, + "language_loss": 0.79468071, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81610441, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 9246, + "time_per_iteration": 2.5065393447875977 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03894639, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.99088564820557, + "language_loss": 0.73864704, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76005793, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9247, + "time_per_iteration": 2.535578489303589 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.02047944, + "balance_loss_mlp": 1.03822637, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.9448346084731214, + "language_loss": 0.76161623, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78303373, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9248, + "time_per_iteration": 2.4247324466705322 + }, + { + "auxiliary_loss_clip": 0.01031453, + "auxiliary_loss_mlp": 0.01002871, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00995636, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8418132845618771, + "language_loss": 0.59482312, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61516631, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21484375, + "step": 9249, + "time_per_iteration": 3.1760778427124023 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.01514411, + "balance_loss_mlp": 1.03505003, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8510226601540976, + "language_loss": 0.79942709, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82074124, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9250, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01107715, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.01952708, + "balance_loss_mlp": 1.03654897, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.627943398678235, + "language_loss": 0.68456143, + "learning_rate": 1.733816187358836e-06, + "loss": 0.70596004, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9251, + "time_per_iteration": 2.4627792835235596 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01753998, + "balance_loss_mlp": 1.03680301, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.9270315036455492, + "language_loss": 0.75472188, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77608371, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9252, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02020955, + "balance_loss_mlp": 1.0379473, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.5243167641625328, + "language_loss": 0.72841972, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74984354, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9253, + "time_per_iteration": 2.545469045639038 + }, + { + "auxiliary_loss_clip": 0.01108615, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01835763, + "balance_loss_mlp": 1.03873754, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.7630844010149394, + "language_loss": 0.8319999, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85338461, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 9254, + "time_per_iteration": 2.4762439727783203 + }, + { + "auxiliary_loss_clip": 0.01028463, + "auxiliary_loss_mlp": 0.00998119, + "balance_loss_clip": 0.99699229, + "balance_loss_mlp": 1.00661826, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.880020971367601, + "language_loss": 0.64831799, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66858381, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21875, + "step": 9255, + "time_per_iteration": 2.894592523574829 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.02330816, + "balance_loss_mlp": 1.04103208, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.9305562864951415, + "language_loss": 0.69224131, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71367919, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9256, + "time_per_iteration": 2.489379644393921 + }, + { + "auxiliary_loss_clip": 0.01102517, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01805019, + "balance_loss_mlp": 1.03555584, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.531147439374393, + "language_loss": 0.75793779, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77925408, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 9257, + "time_per_iteration": 2.484574794769287 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02477169, + "balance_loss_mlp": 1.03559875, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 4.5210433992726635, + "language_loss": 0.61403644, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63546175, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 9258, + "time_per_iteration": 2.4358863830566406 + }, + { + "auxiliary_loss_clip": 0.0110731, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01988161, + "balance_loss_mlp": 1.0372082, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.630618195357818, + "language_loss": 0.79231477, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81371492, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9259, + "time_per_iteration": 3.931269884109497 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.01859045, + "balance_loss_mlp": 1.03665948, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.9981692343252953, + "language_loss": 0.81332636, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83469915, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9260, + "time_per_iteration": 2.5092766284942627 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02450144, + "balance_loss_mlp": 1.03862071, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4782542821591422, + "language_loss": 0.68771613, + "learning_rate": 1.729956725348256e-06, + "loss": 0.70917082, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 9261, + "time_per_iteration": 2.5739381313323975 + }, + { + "auxiliary_loss_clip": 0.01027391, + "auxiliary_loss_mlp": 0.01004087, + "balance_loss_clip": 1.00296021, + "balance_loss_mlp": 1.00587916, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7282105219345391, + "language_loss": 0.61132908, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63164389, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21484375, + "step": 9262, + "time_per_iteration": 5.870652675628662 + }, + { + "auxiliary_loss_clip": 0.01108355, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.02145159, + "balance_loss_mlp": 1.0379622, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.6754840031905727, + "language_loss": 0.64504874, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66646421, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9263, + "time_per_iteration": 3.9533426761627197 + }, + { + "auxiliary_loss_clip": 0.01107431, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02047563, + "balance_loss_mlp": 1.03795195, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.058460487271679, + "language_loss": 0.73137188, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75277007, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9264, + "time_per_iteration": 2.493511199951172 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.02008092, + "balance_loss_mlp": 1.04015422, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.9025948017547305, + "language_loss": 0.75953865, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78095955, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9265, + "time_per_iteration": 2.4533309936523438 + }, + { + "auxiliary_loss_clip": 0.01103692, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.01909113, + "balance_loss_mlp": 1.03774786, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.366142740242795, + "language_loss": 0.7096293, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73096645, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 9266, + "time_per_iteration": 2.5045597553253174 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.02204871, + "balance_loss_mlp": 1.03720617, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7291111077620351, + "language_loss": 0.681355, + "learning_rate": 1.727641538728533e-06, + "loss": 0.7027576, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9267, + "time_per_iteration": 2.5197811126708984 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02367473, + "balance_loss_mlp": 1.03763127, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.9159467095237732, + "language_loss": 0.74278724, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76417124, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 9268, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01105844, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.0188365, + "balance_loss_mlp": 1.03773642, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 2.490438410193009, + "language_loss": 0.7539283, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77528816, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 9269, + "time_per_iteration": 2.5165016651153564 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01847041, + "balance_loss_mlp": 1.0366416, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.5593232015543566, + "language_loss": 0.82527506, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84663379, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 9270, + "time_per_iteration": 2.495546579360962 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.01927948, + "balance_loss_mlp": 1.03695226, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.4402155421947485, + "language_loss": 0.79217434, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81356287, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9271, + "time_per_iteration": 2.5050055980682373 + }, + { + "auxiliary_loss_clip": 0.01107417, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.01938963, + "balance_loss_mlp": 1.03778744, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.994384891359262, + "language_loss": 0.90424085, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92562819, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9272, + "time_per_iteration": 2.455949068069458 + }, + { + "auxiliary_loss_clip": 0.0110516, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01864076, + "balance_loss_mlp": 1.03754234, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.979276269767202, + "language_loss": 0.83862162, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85997909, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9273, + "time_per_iteration": 2.4802021980285645 + }, + { + "auxiliary_loss_clip": 0.01108902, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02752197, + "balance_loss_mlp": 1.03908944, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.0454885443684905, + "language_loss": 0.73996758, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76146781, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 9274, + "time_per_iteration": 2.4761173725128174 + }, + { + "auxiliary_loss_clip": 0.01116526, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.02121544, + "balance_loss_mlp": 1.04015088, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.9773966002159824, + "language_loss": 0.78126067, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.8027705, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 9275, + "time_per_iteration": 2.4496877193450928 + }, + { + "auxiliary_loss_clip": 0.01106389, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.03767419, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.6885485925360224, + "language_loss": 0.74829316, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76965177, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9276, + "time_per_iteration": 2.413726806640625 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.03508329, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.7672131346084554, + "language_loss": 0.75013113, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77147532, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9277, + "time_per_iteration": 2.4982142448425293 + }, + { + "auxiliary_loss_clip": 0.01102538, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02155328, + "balance_loss_mlp": 1.03504467, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.8714980055762023, + "language_loss": 0.71817064, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73952222, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9278, + "time_per_iteration": 2.4389007091522217 + }, + { + "auxiliary_loss_clip": 0.01109043, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.02185118, + "balance_loss_mlp": 1.0372287, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.6538282955120047, + "language_loss": 0.75750679, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77894545, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 9279, + "time_per_iteration": 2.5255484580993652 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01767397, + "balance_loss_mlp": 1.03544426, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 2.2545627368714034, + "language_loss": 0.67431748, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69566512, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9280, + "time_per_iteration": 2.5258350372314453 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02595139, + "balance_loss_mlp": 1.03626418, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.676674952402485, + "language_loss": 0.72964156, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75109941, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 9281, + "time_per_iteration": 2.505610466003418 + }, + { + "auxiliary_loss_clip": 0.01106676, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.01922011, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 2.9649443100281627, + "language_loss": 0.75254506, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77392066, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9282, + "time_per_iteration": 2.444455623626709 + }, + { + "auxiliary_loss_clip": 0.01104903, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.01507115, + "balance_loss_mlp": 1.03695285, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.6849195839549764, + "language_loss": 0.66588777, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68720585, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9283, + "time_per_iteration": 2.4668378829956055 + }, + { + "auxiliary_loss_clip": 0.01105958, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.01500154, + "balance_loss_mlp": 1.03703356, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 2.7565054625366305, + "language_loss": 0.8290503, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85036725, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9284, + "time_per_iteration": 2.430774688720703 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.01653099, + "balance_loss_mlp": 1.03554368, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.3933521300057836, + "language_loss": 0.85047686, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87182522, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9285, + "time_per_iteration": 2.4788479804992676 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03531575, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 3.198131799092361, + "language_loss": 0.73653531, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75790572, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9286, + "time_per_iteration": 2.439715623855591 + }, + { + "auxiliary_loss_clip": 0.01107925, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01946068, + "balance_loss_mlp": 1.03781044, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.684452503968906, + "language_loss": 0.74169838, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76308966, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 9287, + "time_per_iteration": 2.534813642501831 + }, + { + "auxiliary_loss_clip": 0.01112227, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.01780486, + "balance_loss_mlp": 1.03982437, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 2.339953652318452, + "language_loss": 0.75018406, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77161086, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9288, + "time_per_iteration": 2.470242977142334 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.0241785, + "balance_loss_mlp": 1.03922033, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 1.8804248151935914, + "language_loss": 0.77241838, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79388785, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9289, + "time_per_iteration": 2.5357422828674316 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01918483, + "balance_loss_mlp": 1.03802335, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.7341259817318901, + "language_loss": 0.61310709, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63455033, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.74609375, + "step": 9290, + "time_per_iteration": 2.479149580001831 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01678467, + "balance_loss_mlp": 1.03602409, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 1.9512495779204855, + "language_loss": 0.67988908, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70124876, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9291, + "time_per_iteration": 2.4684019088745117 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.0363071, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 2.2522167745355524, + "language_loss": 0.83802187, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.85943532, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9292, + "time_per_iteration": 2.550994873046875 + }, + { + "auxiliary_loss_clip": 0.01104675, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.02180171, + "balance_loss_mlp": 1.03674221, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 1.8368239448999808, + "language_loss": 0.73363894, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75502205, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9293, + "time_per_iteration": 2.5334718227386475 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.02023864, + "balance_loss_mlp": 1.03715324, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.6770372644425844, + "language_loss": 0.7251429, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.7465046, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9294, + "time_per_iteration": 2.4782567024230957 + }, + { + "auxiliary_loss_clip": 0.01107679, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01867914, + "balance_loss_mlp": 1.03769052, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.2895769976939437, + "language_loss": 0.68138099, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70276403, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 9295, + "time_per_iteration": 2.433671474456787 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.01949131, + "balance_loss_mlp": 1.03873825, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.7275865639530346, + "language_loss": 0.80619705, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82760113, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 9296, + "time_per_iteration": 2.4831361770629883 + }, + { + "auxiliary_loss_clip": 0.01106832, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.0200243, + "balance_loss_mlp": 1.03788233, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.8948732644892212, + "language_loss": 0.65465128, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67603648, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9297, + "time_per_iteration": 2.4711036682128906 + }, + { + "auxiliary_loss_clip": 0.01109853, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02202153, + "balance_loss_mlp": 1.03785491, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.6800872146948855, + "language_loss": 0.7513994, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77284867, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9298, + "time_per_iteration": 2.421066999435425 + }, + { + "auxiliary_loss_clip": 0.01031879, + "auxiliary_loss_mlp": 0.01018081, + "balance_loss_clip": 1.01695406, + "balance_loss_mlp": 1.01014686, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6830476030131911, + "language_loss": 0.52463478, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54513437, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21777344, + "step": 9299, + "time_per_iteration": 3.096731424331665 + }, + { + "auxiliary_loss_clip": 0.0110307, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01799822, + "balance_loss_mlp": 1.03608131, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.8758260689947703, + "language_loss": 0.68378884, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70511478, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9300, + "time_per_iteration": 2.5355281829833984 + }, + { + "auxiliary_loss_clip": 0.01107824, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.02798903, + "balance_loss_mlp": 1.0372839, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 1.868740801794004, + "language_loss": 0.81233132, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.83381754, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9301, + "time_per_iteration": 3.9131312370300293 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01026194, + "balance_loss_clip": 1.01370668, + "balance_loss_mlp": 1.03488898, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.564037719481304, + "language_loss": 0.67297423, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69427967, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9302, + "time_per_iteration": 2.484609365463257 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.01249897, + "balance_loss_mlp": 1.03721702, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 2.803806869845176, + "language_loss": 0.70999819, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73134387, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9303, + "time_per_iteration": 2.442859649658203 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.01791978, + "balance_loss_mlp": 1.03930676, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.540239070281283, + "language_loss": 0.72772652, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74906886, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 9304, + "time_per_iteration": 5.429321765899658 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01613426, + "balance_loss_mlp": 1.03523278, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.8535856395803625, + "language_loss": 0.77888674, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80021197, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9305, + "time_per_iteration": 3.8705790042877197 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.01390815, + "balance_loss_mlp": 1.03657615, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.7045399129758072, + "language_loss": 0.69334519, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7146163, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 9306, + "time_per_iteration": 2.4669442176818848 + }, + { + "auxiliary_loss_clip": 0.01030152, + "auxiliary_loss_mlp": 0.01003605, + "balance_loss_clip": 1.0025028, + "balance_loss_mlp": 1.00838459, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9104128938879268, + "language_loss": 0.60324359, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62358117, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9307, + "time_per_iteration": 3.167161703109741 + }, + { + "auxiliary_loss_clip": 0.01105033, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01972127, + "balance_loss_mlp": 1.03697395, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.9188877301503315, + "language_loss": 0.73981357, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76117194, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 9308, + "time_per_iteration": 2.544931650161743 + }, + { + "auxiliary_loss_clip": 0.01107282, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01833069, + "balance_loss_mlp": 1.03571653, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.8987333438245737, + "language_loss": 0.69393057, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71531588, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9309, + "time_per_iteration": 2.5008022785186768 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.01787376, + "balance_loss_mlp": 1.03872681, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 2.0715816525821458, + "language_loss": 0.75254035, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77394807, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.703125, + "step": 9310, + "time_per_iteration": 2.5096590518951416 + }, + { + "auxiliary_loss_clip": 0.01111521, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01953197, + "balance_loss_mlp": 1.03922331, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 4.006602699764322, + "language_loss": 0.69449794, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71593851, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9311, + "time_per_iteration": 2.5238418579101562 + }, + { + "auxiliary_loss_clip": 0.01103209, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01616526, + "balance_loss_mlp": 1.03474474, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8631623558730779, + "language_loss": 0.72497612, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74628901, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9312, + "time_per_iteration": 2.4980969429016113 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01871908, + "balance_loss_mlp": 1.03834271, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.9916809517025356, + "language_loss": 0.89106059, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91243219, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9313, + "time_per_iteration": 2.43849515914917 + }, + { + "auxiliary_loss_clip": 0.01107396, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.03886163, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.848557040479868, + "language_loss": 0.77809632, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79951894, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 9314, + "time_per_iteration": 2.4745004177093506 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01027591, + "balance_loss_clip": 1.0153954, + "balance_loss_mlp": 1.03497362, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.6135281246099127, + "language_loss": 0.7005592, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72187185, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9315, + "time_per_iteration": 2.523815631866455 + }, + { + "auxiliary_loss_clip": 0.0110827, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.02225423, + "balance_loss_mlp": 1.03666615, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 2.163442884097896, + "language_loss": 0.66467899, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68611002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9316, + "time_per_iteration": 2.530667304992676 + }, + { + "auxiliary_loss_clip": 0.01106878, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.01796496, + "balance_loss_mlp": 1.03770351, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.3805446029838624, + "language_loss": 0.86762506, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88900781, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69140625, + "step": 9317, + "time_per_iteration": 2.469134569168091 + }, + { + "auxiliary_loss_clip": 0.01108894, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03657329, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7151693589962669, + "language_loss": 0.77363193, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79507434, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9318, + "time_per_iteration": 2.4952752590179443 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02351773, + "balance_loss_mlp": 1.03302336, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.698102214619228, + "language_loss": 0.75956237, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.7809301, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9319, + "time_per_iteration": 2.479919910430908 + }, + { + "auxiliary_loss_clip": 0.01104648, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03689611, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.554434910389292, + "language_loss": 0.85508537, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9320, + "time_per_iteration": 2.511880874633789 + }, + { + "auxiliary_loss_clip": 0.01030962, + "auxiliary_loss_mlp": 0.00999706, + "balance_loss_clip": 0.99860352, + "balance_loss_mlp": 1.00918674, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7458732992694707, + "language_loss": 0.52630556, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54661226, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9321, + "time_per_iteration": 2.8576598167419434 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.01938033, + "balance_loss_mlp": 1.03744757, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.4865751697326912, + "language_loss": 0.74422431, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76557928, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9322, + "time_per_iteration": 2.480198383331299 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01982379, + "balance_loss_mlp": 1.03641856, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.8343710411867171, + "language_loss": 0.73661906, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.75800848, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9323, + "time_per_iteration": 2.5517938137054443 + }, + { + "auxiliary_loss_clip": 0.01109096, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.01386333, + "balance_loss_mlp": 1.03797293, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.5108510359489868, + "language_loss": 0.61287946, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63423753, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9324, + "time_per_iteration": 2.4675137996673584 + }, + { + "auxiliary_loss_clip": 0.01106981, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.01650345, + "balance_loss_mlp": 1.03693414, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 2.2169286979326768, + "language_loss": 0.87785721, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89921808, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9325, + "time_per_iteration": 2.4160819053649902 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01805806, + "balance_loss_mlp": 1.03765607, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6383695475184654, + "language_loss": 0.74048722, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76188105, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9326, + "time_per_iteration": 2.463094711303711 + }, + { + "auxiliary_loss_clip": 0.01112046, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.01620328, + "balance_loss_mlp": 1.0386548, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 3.3443611641012674, + "language_loss": 0.78365433, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80506855, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9327, + "time_per_iteration": 2.445756673812866 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.01842213, + "balance_loss_mlp": 1.03914046, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 2.5559440694427478, + "language_loss": 0.78508025, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80648255, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 9328, + "time_per_iteration": 2.5156970024108887 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01594675, + "balance_loss_mlp": 1.03623605, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.528557811702872, + "language_loss": 0.73765361, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.7589978, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9329, + "time_per_iteration": 2.4843335151672363 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.02325511, + "balance_loss_mlp": 1.03798938, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.6466003553704387, + "language_loss": 0.83545572, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85692906, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9330, + "time_per_iteration": 2.482752561569214 + }, + { + "auxiliary_loss_clip": 0.01031116, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.00147378, + "balance_loss_mlp": 1.0092634, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7161961657295335, + "language_loss": 0.57873559, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59907156, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21875, + "step": 9331, + "time_per_iteration": 3.063901662826538 + }, + { + "auxiliary_loss_clip": 0.01108686, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03850377, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.694841283599879, + "language_loss": 0.82141155, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84282017, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9332, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01113328, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02233052, + "balance_loss_mlp": 1.03915834, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7394490434662164, + "language_loss": 0.8172127, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83871055, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9333, + "time_per_iteration": 2.4251558780670166 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.01541877, + "balance_loss_mlp": 1.03641915, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.5456564302164297, + "language_loss": 0.73111224, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.7524507, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9334, + "time_per_iteration": 2.5241355895996094 + }, + { + "auxiliary_loss_clip": 0.01108924, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.02525675, + "balance_loss_mlp": 1.03886223, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7664531017043277, + "language_loss": 0.71317977, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73464751, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9335, + "time_per_iteration": 2.4215545654296875 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01659274, + "balance_loss_mlp": 1.0381881, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7059405915097856, + "language_loss": 0.76673937, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78812212, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9336, + "time_per_iteration": 2.456911087036133 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.0203793, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.253598480453168, + "language_loss": 0.644315, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66570294, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9337, + "time_per_iteration": 2.4435572624206543 + }, + { + "auxiliary_loss_clip": 0.01029918, + "auxiliary_loss_mlp": 0.01004848, + "balance_loss_clip": 1.00367343, + "balance_loss_mlp": 1.00804543, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9905116764848269, + "language_loss": 0.62572861, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64607626, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21875, + "step": 9338, + "time_per_iteration": 3.039401054382324 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01862049, + "balance_loss_mlp": 1.03832674, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.7660421922814409, + "language_loss": 0.65246809, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67388076, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9339, + "time_per_iteration": 2.5356857776641846 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.0203191, + "balance_loss_mlp": 1.03761101, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 3.5768294087083317, + "language_loss": 0.69863123, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72002614, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9340, + "time_per_iteration": 2.4699902534484863 + }, + { + "auxiliary_loss_clip": 0.01105107, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01596177, + "balance_loss_mlp": 1.03900409, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.8075752300654697, + "language_loss": 0.77621818, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.7975471, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9341, + "time_per_iteration": 2.456268072128296 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01759195, + "balance_loss_mlp": 1.03572893, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.9728763199974049, + "language_loss": 0.79315615, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81452906, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9342, + "time_per_iteration": 2.4534597396850586 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01735401, + "balance_loss_mlp": 1.03851485, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.593835689079262, + "language_loss": 0.76322573, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78464609, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9343, + "time_per_iteration": 3.8814024925231934 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.0245204, + "balance_loss_mlp": 1.03978682, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.5945215839270617, + "language_loss": 0.68185151, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70332778, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9344, + "time_per_iteration": 2.4659440517425537 + }, + { + "auxiliary_loss_clip": 0.01109593, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.02364254, + "balance_loss_mlp": 1.0381155, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.2863999357797202, + "language_loss": 0.66754413, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68900704, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9345, + "time_per_iteration": 2.5232093334198 + }, + { + "auxiliary_loss_clip": 0.01109525, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01317143, + "balance_loss_mlp": 1.03883803, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.8616054032141576, + "language_loss": 0.87347126, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89481902, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.70703125, + "step": 9346, + "time_per_iteration": 3.9651877880096436 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.02200019, + "balance_loss_mlp": 1.03657687, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.36966351637476, + "language_loss": 0.59370089, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61511469, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9347, + "time_per_iteration": 3.9802420139312744 + }, + { + "auxiliary_loss_clip": 0.01108812, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.02319539, + "balance_loss_mlp": 1.03742838, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.4273405009541107, + "language_loss": 0.68972194, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71118426, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7109375, + "step": 9348, + "time_per_iteration": 2.4413368701934814 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.01292634, + "balance_loss_mlp": 1.03695107, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9093659081457641, + "language_loss": 0.79040921, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81179428, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9349, + "time_per_iteration": 2.4354894161224365 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01527977, + "balance_loss_mlp": 1.03902698, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.4504118343525207, + "language_loss": 0.67282045, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69420421, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9350, + "time_per_iteration": 2.548351287841797 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02408016, + "balance_loss_mlp": 1.03830576, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.1113714103165884, + "language_loss": 0.78716242, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80863774, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9351, + "time_per_iteration": 2.4350974559783936 + }, + { + "auxiliary_loss_clip": 0.01112089, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01989186, + "balance_loss_mlp": 1.03818786, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.498970106789848, + "language_loss": 0.58875829, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.6102035, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 9352, + "time_per_iteration": 2.4637343883514404 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01988828, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2149782460758531, + "language_loss": 0.71828997, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73964, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 9353, + "time_per_iteration": 2.4747259616851807 + }, + { + "auxiliary_loss_clip": 0.01110024, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01584542, + "balance_loss_mlp": 1.03763878, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 5.092816610198626, + "language_loss": 0.75717902, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77856535, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9354, + "time_per_iteration": 2.412938356399536 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.02066851, + "balance_loss_mlp": 1.03783214, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 2.4650169046981434, + "language_loss": 0.72549778, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74694556, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 9355, + "time_per_iteration": 2.484099864959717 + }, + { + "auxiliary_loss_clip": 0.01108801, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.01700521, + "balance_loss_mlp": 1.03818929, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.8617046290731056, + "language_loss": 0.73371327, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75510186, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9356, + "time_per_iteration": 2.465129852294922 + }, + { + "auxiliary_loss_clip": 0.0110695, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.02280545, + "balance_loss_mlp": 1.03822494, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.9991704999969526, + "language_loss": 0.82985485, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85127592, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9357, + "time_per_iteration": 2.41115665435791 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797318, + "balance_loss_mlp": 1.03697777, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 1.9946457873090748, + "language_loss": 0.720213, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9358, + "time_per_iteration": 2.4276978969573975 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.03677905, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.1174896987661755, + "language_loss": 0.77650487, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79795527, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9359, + "time_per_iteration": 2.5595555305480957 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.0177722, + "balance_loss_mlp": 1.03723145, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.6788321894876823, + "language_loss": 0.70193481, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.7233184, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9360, + "time_per_iteration": 2.485053062438965 + }, + { + "auxiliary_loss_clip": 0.01030911, + "auxiliary_loss_mlp": 0.01001933, + "balance_loss_clip": 1.0007472, + "balance_loss_mlp": 1.00916827, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7762895856423075, + "language_loss": 0.55579072, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57611912, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21679688, + "step": 9361, + "time_per_iteration": 3.025913953781128 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.02153039, + "balance_loss_mlp": 1.03833425, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.3888397041491727, + "language_loss": 0.8183462, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83973688, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9362, + "time_per_iteration": 2.5037269592285156 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02248108, + "balance_loss_mlp": 1.03707612, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5280416781125297, + "language_loss": 0.74536633, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.7667737, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9363, + "time_per_iteration": 2.617192268371582 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.01944757, + "balance_loss_mlp": 1.03815794, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.6569550766143035, + "language_loss": 0.83350259, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85492432, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9364, + "time_per_iteration": 2.5304059982299805 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.0222224, + "balance_loss_mlp": 1.03869832, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.211298310091642, + "language_loss": 0.64659059, + "learning_rate": 1.689881739637642e-06, + "loss": 0.66800475, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9365, + "time_per_iteration": 2.4514007568359375 + }, + { + "auxiliary_loss_clip": 0.01114055, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.03817499, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 3.047674915648226, + "language_loss": 0.81461316, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83611768, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 9366, + "time_per_iteration": 2.4486207962036133 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.03850698, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4263654905382444, + "language_loss": 0.73047578, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75187254, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 9367, + "time_per_iteration": 2.4800310134887695 + }, + { + "auxiliary_loss_clip": 0.01030227, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.00840044, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6249011108272925, + "language_loss": 0.5348472, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55517572, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21875, + "step": 9368, + "time_per_iteration": 3.1797282695770264 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02317202, + "balance_loss_mlp": 1.03974152, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.7643271699947485, + "language_loss": 0.69015235, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71159542, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9369, + "time_per_iteration": 2.4736390113830566 + }, + { + "auxiliary_loss_clip": 0.01105862, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.03527367, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.7859826045223857, + "language_loss": 0.7540313, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77542865, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9370, + "time_per_iteration": 2.5553858280181885 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.01930332, + "balance_loss_mlp": 1.0373863, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 3.078957924920332, + "language_loss": 0.75699127, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77842218, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9371, + "time_per_iteration": 2.4327011108398438 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02386189, + "balance_loss_mlp": 1.03729022, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 2.3308389897051702, + "language_loss": 0.76292467, + "learning_rate": 1.687188770067285e-06, + "loss": 0.7843473, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9372, + "time_per_iteration": 2.447720766067505 + }, + { + "auxiliary_loss_clip": 0.01106021, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.02006888, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.0572116747420224, + "language_loss": 0.72010261, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.74148726, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9373, + "time_per_iteration": 2.4268109798431396 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01839924, + "balance_loss_mlp": 1.03994441, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3770492627250617, + "language_loss": 0.82499874, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84642255, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9374, + "time_per_iteration": 2.49582576751709 + }, + { + "auxiliary_loss_clip": 0.0110343, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.0145762, + "balance_loss_mlp": 1.03463507, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.5156995265370945, + "language_loss": 0.66020733, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68151033, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9375, + "time_per_iteration": 2.516523599624634 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.02434063, + "balance_loss_mlp": 1.03792977, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 5.168267369431286, + "language_loss": 0.80860347, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83006191, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 9376, + "time_per_iteration": 2.4961087703704834 + }, + { + "auxiliary_loss_clip": 0.01110113, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03650188, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.331404975713729, + "language_loss": 0.69354665, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71498632, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9377, + "time_per_iteration": 2.6732125282287598 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01828349, + "balance_loss_mlp": 1.03818166, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3430474289029712, + "language_loss": 0.74622703, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76757109, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9378, + "time_per_iteration": 2.4836812019348145 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03538918, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.4002466182561366, + "language_loss": 0.81976169, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84122968, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9379, + "time_per_iteration": 2.4185829162597656 + }, + { + "auxiliary_loss_clip": 0.01106862, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.01691699, + "balance_loss_mlp": 1.03549135, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.697413775835763, + "language_loss": 0.71534967, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73671436, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9380, + "time_per_iteration": 2.5077950954437256 + }, + { + "auxiliary_loss_clip": 0.01110271, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.02068686, + "balance_loss_mlp": 1.03794408, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 3.2105212283898905, + "language_loss": 0.74216485, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.7636112, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9381, + "time_per_iteration": 2.4029319286346436 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01963091, + "balance_loss_mlp": 1.03806376, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 3.316310717009383, + "language_loss": 0.72300208, + "learning_rate": 1.683342680176499e-06, + "loss": 0.7444247, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 9382, + "time_per_iteration": 2.501958131790161 + }, + { + "auxiliary_loss_clip": 0.01028829, + "auxiliary_loss_mlp": 0.00999503, + "balance_loss_clip": 0.99848998, + "balance_loss_mlp": 1.00756264, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7363360341332579, + "language_loss": 0.54461426, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5648976, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21289062, + "step": 9383, + "time_per_iteration": 3.2148938179016113 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01627028, + "balance_loss_mlp": 1.03699017, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.8140556963544339, + "language_loss": 0.71018171, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73159087, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 9384, + "time_per_iteration": 2.442484140396118 + }, + { + "auxiliary_loss_clip": 0.0110745, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.01751578, + "balance_loss_mlp": 1.03652072, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 7.95557819766849, + "language_loss": 0.76225626, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78363794, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9385, + "time_per_iteration": 3.928744316101074 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02080739, + "balance_loss_mlp": 1.0359602, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.157193633028955, + "language_loss": 0.82184142, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84322798, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9386, + "time_per_iteration": 2.397623062133789 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.0220114, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.006582014999343, + "language_loss": 0.6989364, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72041589, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 9387, + "time_per_iteration": 5.281404733657837 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.02143192, + "balance_loss_mlp": 1.03790522, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.551891117692425, + "language_loss": 0.74553275, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76697552, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9388, + "time_per_iteration": 4.091272830963135 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.01970327, + "balance_loss_mlp": 1.03551602, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.6063296237871756, + "language_loss": 0.82072294, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.8420645, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 9389, + "time_per_iteration": 2.4588046073913574 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.01995528, + "balance_loss_mlp": 1.03775918, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 1.8781979731175902, + "language_loss": 0.64145517, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66289902, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9390, + "time_per_iteration": 2.4152185916900635 + }, + { + "auxiliary_loss_clip": 0.01108689, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.01928711, + "balance_loss_mlp": 1.0396266, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.6485981004433565, + "language_loss": 0.91899133, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94038832, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9391, + "time_per_iteration": 2.4316937923431396 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.03941607, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 1.8545056387285421, + "language_loss": 0.60528994, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62679285, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9392, + "time_per_iteration": 2.524616003036499 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.01412547, + "balance_loss_mlp": 1.03683674, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 1.8891326454378248, + "language_loss": 0.81002814, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83136976, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 9393, + "time_per_iteration": 2.5394442081451416 + }, + { + "auxiliary_loss_clip": 0.01109875, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.0175252, + "balance_loss_mlp": 1.03945863, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6361233529041357, + "language_loss": 0.87129962, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89269751, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9394, + "time_per_iteration": 2.4735207557678223 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.04019666, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 2.1407868955990232, + "language_loss": 0.84850395, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.8699013, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 9395, + "time_per_iteration": 2.457840919494629 + }, + { + "auxiliary_loss_clip": 0.01029319, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 0.99876004, + "balance_loss_mlp": 1.00789344, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.857023745969297, + "language_loss": 0.58308172, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60337436, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21484375, + "step": 9396, + "time_per_iteration": 3.073537588119507 + }, + { + "auxiliary_loss_clip": 0.01110535, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.248812637940723, + "language_loss": 0.70105237, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72246206, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.72265625, + "step": 9397, + "time_per_iteration": 2.4962973594665527 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02008653, + "balance_loss_mlp": 1.03723562, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.751232513493423, + "language_loss": 0.66376907, + "learning_rate": 1.67719144001275e-06, + "loss": 0.68519312, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9398, + "time_per_iteration": 2.4747612476348877 + }, + { + "auxiliary_loss_clip": 0.01027927, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 0.99962217, + "balance_loss_mlp": 1.00642622, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.8050196413226386, + "language_loss": 0.58135325, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60164046, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9399, + "time_per_iteration": 3.043860912322998 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.01959336, + "balance_loss_mlp": 1.03663015, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8022721102148394, + "language_loss": 0.72654182, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.74797827, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 9400, + "time_per_iteration": 2.46345853805542 + }, + { + "auxiliary_loss_clip": 0.01112209, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02223074, + "balance_loss_mlp": 1.03858781, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.2275961694321254, + "language_loss": 0.61034292, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63182896, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 9401, + "time_per_iteration": 2.4518327713012695 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.01590967, + "balance_loss_mlp": 1.03578329, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8211208041554372, + "language_loss": 0.81334603, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.8346827, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9402, + "time_per_iteration": 2.4201457500457764 + }, + { + "auxiliary_loss_clip": 0.0110456, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.02154684, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.4814077209882908, + "language_loss": 0.77969164, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80106944, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9403, + "time_per_iteration": 2.5353829860687256 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01558244, + "balance_loss_mlp": 1.03666544, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.6092170779922605, + "language_loss": 0.68699729, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70834613, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9404, + "time_per_iteration": 2.4321181774139404 + }, + { + "auxiliary_loss_clip": 0.01102774, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01951897, + "balance_loss_mlp": 1.03503776, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.484491546437136, + "language_loss": 0.66842878, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.68976498, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 9405, + "time_per_iteration": 2.440232992172241 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.02001476, + "balance_loss_mlp": 1.03823268, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.9824391842040467, + "language_loss": 0.74238181, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76374286, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9406, + "time_per_iteration": 2.4748172760009766 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02006197, + "balance_loss_mlp": 1.03640151, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.7875183280919196, + "language_loss": 0.79345733, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81487745, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9407, + "time_per_iteration": 2.507815361022949 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.0192194, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.520930632215419, + "language_loss": 0.70626116, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.7276209, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 9408, + "time_per_iteration": 2.62674880027771 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.02018738, + "balance_loss_mlp": 1.03758848, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.0177540820880377, + "language_loss": 0.81701803, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83840877, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9409, + "time_per_iteration": 2.4532053470611572 + }, + { + "auxiliary_loss_clip": 0.01105936, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01446199, + "balance_loss_mlp": 1.03632855, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.7583452820695855, + "language_loss": 0.77886415, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80018914, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9410, + "time_per_iteration": 2.441938877105713 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.02188444, + "balance_loss_mlp": 1.0371294, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.4716186369957405, + "language_loss": 0.83512276, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85653877, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9411, + "time_per_iteration": 2.4718945026397705 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01870787, + "balance_loss_mlp": 1.03809881, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.235812012909735, + "language_loss": 0.67052126, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69195151, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9412, + "time_per_iteration": 2.4114651679992676 + }, + { + "auxiliary_loss_clip": 0.01102875, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03637409, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.4642683426161254, + "language_loss": 0.58723432, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60854244, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6640625, + "step": 9413, + "time_per_iteration": 2.5274460315704346 + }, + { + "auxiliary_loss_clip": 0.01102994, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03515315, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.4689493119012975, + "language_loss": 0.69065028, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71196759, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9414, + "time_per_iteration": 2.4249722957611084 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.0162462, + "balance_loss_mlp": 1.03464198, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 2.330719071721026, + "language_loss": 0.78351963, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80479658, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 9415, + "time_per_iteration": 2.4853508472442627 + }, + { + "auxiliary_loss_clip": 0.01027693, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.00167274, + "balance_loss_mlp": 1.00642896, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.686572948711127, + "language_loss": 0.49232727, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51263154, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.21289062, + "step": 9416, + "time_per_iteration": 3.1817550659179688 + }, + { + "auxiliary_loss_clip": 0.01106414, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.02036452, + "balance_loss_mlp": 1.03713977, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.6874553076405654, + "language_loss": 0.62577593, + "learning_rate": 1.6698909172706e-06, + "loss": 0.6471678, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9417, + "time_per_iteration": 2.5856666564941406 + }, + { + "auxiliary_loss_clip": 0.01107822, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03606224, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.797784660701456, + "language_loss": 0.68931323, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71070051, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9418, + "time_per_iteration": 2.4920060634613037 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.01873779, + "balance_loss_mlp": 1.035465, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.9782803688051387, + "language_loss": 0.64613676, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66751719, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9419, + "time_per_iteration": 2.5130629539489746 + }, + { + "auxiliary_loss_clip": 0.01028877, + "auxiliary_loss_mlp": 0.01005663, + "balance_loss_clip": 1.00455463, + "balance_loss_mlp": 1.00721812, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7373486000439856, + "language_loss": 0.59778821, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61813354, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.21679688, + "step": 9420, + "time_per_iteration": 3.1712303161621094 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.01874661, + "balance_loss_mlp": 1.03477347, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.7745364781392496, + "language_loss": 0.74103463, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76235008, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 9421, + "time_per_iteration": 2.4926223754882812 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.02292371, + "balance_loss_mlp": 1.03705812, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.8540803425049197, + "language_loss": 0.72345394, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74490201, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9422, + "time_per_iteration": 2.4081509113311768 + }, + { + "auxiliary_loss_clip": 0.01103997, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.02378821, + "balance_loss_mlp": 1.03694618, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.7305682094853587, + "language_loss": 0.81321973, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83460754, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 9423, + "time_per_iteration": 2.4871041774749756 + }, + { + "auxiliary_loss_clip": 0.01102932, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.0210824, + "balance_loss_mlp": 1.0354147, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.656660590859511, + "language_loss": 0.8069616, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82832569, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 9424, + "time_per_iteration": 2.4634275436401367 + }, + { + "auxiliary_loss_clip": 0.01111676, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.01844072, + "balance_loss_mlp": 1.03887486, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.8161233698436283, + "language_loss": 0.78745866, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80889738, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9425, + "time_per_iteration": 2.5064780712127686 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.01658988, + "balance_loss_mlp": 1.03674626, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.8642193992685885, + "language_loss": 0.5897873, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61113673, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 9426, + "time_per_iteration": 2.4720263481140137 + }, + { + "auxiliary_loss_clip": 0.01110856, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.01804113, + "balance_loss_mlp": 1.03823078, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 2.0557394177022768, + "language_loss": 0.81685758, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83826721, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 9427, + "time_per_iteration": 3.872758388519287 + }, + { + "auxiliary_loss_clip": 0.01104828, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02394485, + "balance_loss_mlp": 1.03744185, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.8776390907485432, + "language_loss": 0.86198628, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88339949, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9428, + "time_per_iteration": 2.4911303520202637 + }, + { + "auxiliary_loss_clip": 0.01112998, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.02427602, + "balance_loss_mlp": 1.04080331, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.1518083513194552, + "language_loss": 0.74125421, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.7627511, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9429, + "time_per_iteration": 3.9635231494903564 + }, + { + "auxiliary_loss_clip": 0.01109434, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.02127612, + "balance_loss_mlp": 1.03756118, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.7976574461964, + "language_loss": 0.7496838, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77112365, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9430, + "time_per_iteration": 3.8817877769470215 + }, + { + "auxiliary_loss_clip": 0.01106735, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02152157, + "balance_loss_mlp": 1.03621042, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 2.3751678803775285, + "language_loss": 0.7272107, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74861568, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9431, + "time_per_iteration": 2.51401948928833 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01810944, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.9291254540879526, + "language_loss": 0.73248518, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75378448, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 9432, + "time_per_iteration": 2.4319839477539062 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.02087343, + "balance_loss_mlp": 1.03681755, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.5888571716641233, + "language_loss": 0.77957594, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80097634, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 9433, + "time_per_iteration": 2.5169765949249268 + }, + { + "auxiliary_loss_clip": 0.01111851, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01645875, + "balance_loss_mlp": 1.03870261, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.7704673621088174, + "language_loss": 0.63839334, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65982234, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 9434, + "time_per_iteration": 2.4372098445892334 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.01495695, + "balance_loss_mlp": 1.03529072, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.879777953851778, + "language_loss": 0.66724491, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68854052, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9435, + "time_per_iteration": 2.5156021118164062 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.02028716, + "balance_loss_mlp": 1.03599691, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.3893571871291595, + "language_loss": 0.71398699, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73535293, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9436, + "time_per_iteration": 2.4815714359283447 + }, + { + "auxiliary_loss_clip": 0.01109121, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.01723647, + "balance_loss_mlp": 1.03756368, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.6654091498260946, + "language_loss": 0.73988926, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76127845, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9437, + "time_per_iteration": 2.5965943336486816 + }, + { + "auxiliary_loss_clip": 0.01112439, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04159832, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 2.439390833366172, + "language_loss": 0.60905057, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63049889, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9438, + "time_per_iteration": 2.512578010559082 + }, + { + "auxiliary_loss_clip": 0.01105416, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.01714146, + "balance_loss_mlp": 1.03543329, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.6600048607148805, + "language_loss": 0.75087392, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77221704, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9439, + "time_per_iteration": 2.531489133834839 + }, + { + "auxiliary_loss_clip": 0.01107772, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0159471, + "balance_loss_mlp": 1.03828883, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8930047517001285, + "language_loss": 0.8361944, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.857566, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 9440, + "time_per_iteration": 2.4386231899261475 + }, + { + "auxiliary_loss_clip": 0.01111147, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.02123022, + "balance_loss_mlp": 1.03704751, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.0023123091206467, + "language_loss": 0.7550447, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77650005, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9441, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.01108262, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02022874, + "balance_loss_mlp": 1.0381217, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 2.003106565766755, + "language_loss": 0.83199525, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85340512, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9442, + "time_per_iteration": 2.4066359996795654 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.0388906, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 2.099488848818881, + "language_loss": 0.74606907, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76741344, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 9443, + "time_per_iteration": 2.4699020385742188 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.02184737, + "balance_loss_mlp": 1.03892851, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 1.9353911334921245, + "language_loss": 0.77443373, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79587436, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9444, + "time_per_iteration": 2.418164014816284 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.02498603, + "balance_loss_mlp": 1.03886068, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6369546772732781, + "language_loss": 0.80673003, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82821453, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9445, + "time_per_iteration": 2.4474682807922363 + }, + { + "auxiliary_loss_clip": 0.01105393, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.0146122, + "balance_loss_mlp": 1.03579414, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.310891415120181, + "language_loss": 0.70843911, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72976023, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9446, + "time_per_iteration": 2.5338428020477295 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.018489, + "balance_loss_mlp": 1.03815663, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.8305308972685952, + "language_loss": 0.7354359, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75686181, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9447, + "time_per_iteration": 2.5152740478515625 + }, + { + "auxiliary_loss_clip": 0.01110587, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.0382061, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 2.262443693729548, + "language_loss": 0.74931812, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77074468, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9448, + "time_per_iteration": 2.468688488006592 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.0227201, + "balance_loss_mlp": 1.04175234, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.1518179799978356, + "language_loss": 0.76137841, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78289551, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9449, + "time_per_iteration": 2.510693311691284 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02046347, + "balance_loss_mlp": 1.03867984, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.6592475910366993, + "language_loss": 0.74742198, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76886022, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9450, + "time_per_iteration": 2.5034866333007812 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02404845, + "balance_loss_mlp": 1.04081213, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 3.8340234675809017, + "language_loss": 0.67216206, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69364059, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.71875, + "step": 9451, + "time_per_iteration": 2.503805637359619 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.01743114, + "balance_loss_mlp": 1.03788531, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 1.8009184427821863, + "language_loss": 0.71697223, + "learning_rate": 1.656454488573026e-06, + "loss": 0.7384392, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 9452, + "time_per_iteration": 2.4519643783569336 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01947021, + "balance_loss_mlp": 1.03679395, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.6525298490216664, + "language_loss": 0.70272237, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72409141, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9453, + "time_per_iteration": 2.5260796546936035 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.02190745, + "balance_loss_mlp": 1.03889799, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 2.2860746429720833, + "language_loss": 0.69546616, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71690989, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9454, + "time_per_iteration": 2.457736015319824 + }, + { + "auxiliary_loss_clip": 0.01103936, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.01837158, + "balance_loss_mlp": 1.03616297, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.8998375571155763, + "language_loss": 0.60430771, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.6256448, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9455, + "time_per_iteration": 2.506091594696045 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01808953, + "balance_loss_mlp": 1.0424788, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 2.102932497256003, + "language_loss": 0.72914851, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75062263, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9456, + "time_per_iteration": 2.439221143722534 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.02125049, + "balance_loss_mlp": 1.03915823, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.5692423529190727, + "language_loss": 0.76402628, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78543633, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.69140625, + "step": 9457, + "time_per_iteration": 2.475327491760254 + }, + { + "auxiliary_loss_clip": 0.01110625, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02338171, + "balance_loss_mlp": 1.03828931, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.8808926225586853, + "language_loss": 0.66305089, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68452305, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9458, + "time_per_iteration": 2.5271642208099365 + }, + { + "auxiliary_loss_clip": 0.01111416, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01424456, + "balance_loss_mlp": 1.03845215, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.21557799175144, + "language_loss": 0.67912495, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70051199, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 9459, + "time_per_iteration": 2.534374237060547 + }, + { + "auxiliary_loss_clip": 0.0111268, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.01942194, + "balance_loss_mlp": 1.04046702, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 3.4353012744759335, + "language_loss": 0.77999187, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.8014406, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9460, + "time_per_iteration": 2.434570789337158 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.02221131, + "balance_loss_mlp": 1.03767824, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7026913094631195, + "language_loss": 0.71950358, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74095166, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9461, + "time_per_iteration": 2.5527231693267822 + }, + { + "auxiliary_loss_clip": 0.01109281, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.03814745, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.8717094069028617, + "language_loss": 0.72976351, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75116074, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9462, + "time_per_iteration": 2.422624111175537 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01590848, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8933127595424433, + "language_loss": 0.7326529, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75395983, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 9463, + "time_per_iteration": 2.466491460800171 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.01810765, + "balance_loss_mlp": 1.03583968, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.7491308846328846, + "language_loss": 0.74368691, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76505989, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9464, + "time_per_iteration": 2.406031370162964 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.02382255, + "balance_loss_mlp": 1.03892159, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.714079864723851, + "language_loss": 0.84333247, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86480176, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9465, + "time_per_iteration": 2.514777183532715 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.01757169, + "balance_loss_mlp": 1.03546405, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.8589721720108319, + "language_loss": 0.7226572, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74398845, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9466, + "time_per_iteration": 2.475188732147217 + }, + { + "auxiliary_loss_clip": 0.01029497, + "auxiliary_loss_mlp": 0.01005385, + "balance_loss_clip": 1.00427043, + "balance_loss_mlp": 1.0077517, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7081654133828948, + "language_loss": 0.55354679, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57389557, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21777344, + "step": 9467, + "time_per_iteration": 3.185729742050171 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01861966, + "balance_loss_mlp": 1.03861189, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.2495356407271854, + "language_loss": 0.63680357, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65824717, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9468, + "time_per_iteration": 2.4373323917388916 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01587772, + "balance_loss_mlp": 1.03801632, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.8525378978069993, + "language_loss": 0.79367, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81503832, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9469, + "time_per_iteration": 3.8166728019714355 + }, + { + "auxiliary_loss_clip": 0.0111246, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.02810884, + "balance_loss_mlp": 1.03860152, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 1.9923541987272968, + "language_loss": 0.69606256, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71760333, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9470, + "time_per_iteration": 2.4572556018829346 + }, + { + "auxiliary_loss_clip": 0.01107845, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01882184, + "balance_loss_mlp": 1.03729832, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.5518202279497855, + "language_loss": 0.74791551, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76931024, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9471, + "time_per_iteration": 3.926091432571411 + }, + { + "auxiliary_loss_clip": 0.01108882, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02006447, + "balance_loss_mlp": 1.03928542, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9616270612820847, + "language_loss": 0.57270539, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59412026, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9472, + "time_per_iteration": 3.8452813625335693 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01886702, + "balance_loss_mlp": 1.03722382, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.803122156723958, + "language_loss": 0.73615265, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75750041, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 9473, + "time_per_iteration": 2.4637346267700195 + }, + { + "auxiliary_loss_clip": 0.01028797, + "auxiliary_loss_mlp": 0.01004803, + "balance_loss_clip": 1.00359905, + "balance_loss_mlp": 1.00722575, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6476817486149063, + "language_loss": 0.57596511, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59630114, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21679688, + "step": 9474, + "time_per_iteration": 3.09342622756958 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02095199, + "balance_loss_mlp": 1.03955841, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.7127367690076127, + "language_loss": 0.53624213, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55767071, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6875, + "step": 9475, + "time_per_iteration": 2.6103556156158447 + }, + { + "auxiliary_loss_clip": 0.01110194, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.01969719, + "balance_loss_mlp": 1.03914022, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5220537573313933, + "language_loss": 0.79891974, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82034773, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9476, + "time_per_iteration": 2.5519871711730957 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.02248454, + "balance_loss_mlp": 1.04071283, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 2.93922823935367, + "language_loss": 0.66361278, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68509227, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7265625, + "step": 9477, + "time_per_iteration": 2.556461811065674 + }, + { + "auxiliary_loss_clip": 0.01107946, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.01737881, + "balance_loss_mlp": 1.03697014, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.8188873629652118, + "language_loss": 0.70921832, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73060012, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9478, + "time_per_iteration": 2.5022385120391846 + }, + { + "auxiliary_loss_clip": 0.01104521, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.01615286, + "balance_loss_mlp": 1.03824937, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.5933810632151244, + "language_loss": 0.69647413, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71779716, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9479, + "time_per_iteration": 2.544422149658203 + }, + { + "auxiliary_loss_clip": 0.01103959, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01729965, + "balance_loss_mlp": 1.03753138, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4338626650619826, + "language_loss": 0.71364439, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.7349726, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9480, + "time_per_iteration": 2.5680878162384033 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.01615977, + "balance_loss_mlp": 1.03689599, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.894404055389402, + "language_loss": 0.71927261, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74063098, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9481, + "time_per_iteration": 2.4576737880706787 + }, + { + "auxiliary_loss_clip": 0.01108109, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.03819919, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6819252466037764, + "language_loss": 0.78134334, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80273211, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9482, + "time_per_iteration": 2.4244532585144043 + }, + { + "auxiliary_loss_clip": 0.01107032, + "auxiliary_loss_mlp": 0.01026772, + "balance_loss_clip": 1.01462436, + "balance_loss_mlp": 1.0372206, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.1918431398286686, + "language_loss": 0.77641654, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79775453, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9483, + "time_per_iteration": 2.4840755462646484 + }, + { + "auxiliary_loss_clip": 0.01106594, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.0217644, + "balance_loss_mlp": 1.037377, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 2.4281256207615702, + "language_loss": 0.8098467, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.8312493, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9484, + "time_per_iteration": 2.4726784229278564 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01798165, + "balance_loss_mlp": 1.03656316, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 9.175896769478262, + "language_loss": 0.60516417, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62655002, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9485, + "time_per_iteration": 2.5423014163970947 + }, + { + "auxiliary_loss_clip": 0.01107039, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03619039, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.6367482229195742, + "language_loss": 0.65350515, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67491084, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9486, + "time_per_iteration": 2.4597506523132324 + }, + { + "auxiliary_loss_clip": 0.01028731, + "auxiliary_loss_mlp": 0.01001408, + "balance_loss_clip": 1.00001299, + "balance_loss_mlp": 1.0072422, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6639559744347447, + "language_loss": 0.48005819, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50035954, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21484375, + "step": 9487, + "time_per_iteration": 3.139495849609375 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.02199435, + "balance_loss_mlp": 1.03726935, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 3.049670437576873, + "language_loss": 0.86058694, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88200867, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9488, + "time_per_iteration": 2.474616289138794 + }, + { + "auxiliary_loss_clip": 0.01111409, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.02065694, + "balance_loss_mlp": 1.03814459, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.4447763000600118, + "language_loss": 0.79057854, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81202483, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 9489, + "time_per_iteration": 2.5065059661865234 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01978409, + "balance_loss_mlp": 1.03869939, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7186115243718623, + "language_loss": 0.69906354, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72046351, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.703125, + "step": 9490, + "time_per_iteration": 2.431102752685547 + }, + { + "auxiliary_loss_clip": 0.01107746, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03836775, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.5472180668734579, + "language_loss": 0.76222062, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78358686, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9491, + "time_per_iteration": 2.4962759017944336 + }, + { + "auxiliary_loss_clip": 0.01028502, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.00276494, + "balance_loss_mlp": 1.00699997, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7944597612251223, + "language_loss": 0.57379556, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59412122, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.21484375, + "step": 9492, + "time_per_iteration": 3.0417838096618652 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02009118, + "balance_loss_mlp": 1.03813028, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.7217254573804663, + "language_loss": 0.71475661, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73617041, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 9493, + "time_per_iteration": 2.4304161071777344 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01807487, + "balance_loss_mlp": 1.0384568, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5364295350921338, + "language_loss": 0.77778745, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.7992059, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 9494, + "time_per_iteration": 2.495940923690796 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.02015769, + "balance_loss_mlp": 1.03685784, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.275602748234112, + "language_loss": 0.80153453, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82298625, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 9495, + "time_per_iteration": 2.464423418045044 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.02393782, + "balance_loss_mlp": 1.04061937, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 3.463558707959815, + "language_loss": 0.66745138, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68901181, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 9496, + "time_per_iteration": 2.460413694381714 + }, + { + "auxiliary_loss_clip": 0.01113845, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.02361047, + "balance_loss_mlp": 1.03911281, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.3847499053839067, + "language_loss": 0.6960094, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71752012, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 9497, + "time_per_iteration": 2.4051928520202637 + }, + { + "auxiliary_loss_clip": 0.01111626, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01712704, + "balance_loss_mlp": 1.03815341, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.8796088723274103, + "language_loss": 0.81200778, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83343083, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 9498, + "time_per_iteration": 2.4764246940612793 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02085817, + "balance_loss_mlp": 1.03874803, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.7968018947144153, + "language_loss": 0.66237068, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68383479, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9499, + "time_per_iteration": 2.4842209815979004 + }, + { + "auxiliary_loss_clip": 0.01109681, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01994455, + "balance_loss_mlp": 1.03881264, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 2.341189176641991, + "language_loss": 0.71659786, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73802078, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9500, + "time_per_iteration": 2.474968671798706 + }, + { + "auxiliary_loss_clip": 0.01113264, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.01911259, + "balance_loss_mlp": 1.03896177, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.7510176581013566, + "language_loss": 0.76148939, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78294659, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 9501, + "time_per_iteration": 2.481982707977295 + }, + { + "auxiliary_loss_clip": 0.01112022, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.03827071, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.6683693962706503, + "language_loss": 0.75252867, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.7739566, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.73828125, + "step": 9502, + "time_per_iteration": 2.4645891189575195 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.01413548, + "balance_loss_mlp": 1.03776038, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 3.8399261830524076, + "language_loss": 0.82397389, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84531981, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9503, + "time_per_iteration": 2.4945871829986572 + }, + { + "auxiliary_loss_clip": 0.01108893, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.02267456, + "balance_loss_mlp": 1.03824139, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.09557851646671, + "language_loss": 0.85872537, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.8801657, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9504, + "time_per_iteration": 2.3861567974090576 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01232708, + "balance_loss_mlp": 1.03753018, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.9315555303189194, + "language_loss": 0.75182885, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.7731415, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9505, + "time_per_iteration": 2.462536573410034 + }, + { + "auxiliary_loss_clip": 0.01108197, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.01970994, + "balance_loss_mlp": 1.03717351, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.6115496885789637, + "language_loss": 0.81918782, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84058261, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.7109375, + "step": 9506, + "time_per_iteration": 2.467022180557251 + }, + { + "auxiliary_loss_clip": 0.01106598, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01546264, + "balance_loss_mlp": 1.03684521, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6660041805363315, + "language_loss": 0.77144134, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79278708, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9507, + "time_per_iteration": 2.4672694206237793 + }, + { + "auxiliary_loss_clip": 0.01110344, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.02186108, + "balance_loss_mlp": 1.03726792, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 2.45367934924197, + "language_loss": 0.68435538, + "learning_rate": 1.63498965540751e-06, + "loss": 0.7058183, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9508, + "time_per_iteration": 2.464097261428833 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01629877, + "balance_loss_mlp": 1.03722239, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.0052906721639836, + "language_loss": 0.79419613, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81559134, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 9509, + "time_per_iteration": 2.504023551940918 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.03664279, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.839099502620817, + "language_loss": 0.7265448, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74793911, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9510, + "time_per_iteration": 3.815577507019043 + }, + { + "auxiliary_loss_clip": 0.01107423, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01810002, + "balance_loss_mlp": 1.03668678, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.3819155223826083, + "language_loss": 0.69395494, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71533018, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9511, + "time_per_iteration": 2.5445902347564697 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02426028, + "balance_loss_mlp": 1.03781009, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.8672218842214499, + "language_loss": 0.61565816, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63710779, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9512, + "time_per_iteration": 3.8341665267944336 + }, + { + "auxiliary_loss_clip": 0.01106641, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.0164628, + "balance_loss_mlp": 1.03667331, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 4.170405845803043, + "language_loss": 0.7586627, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78001529, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9513, + "time_per_iteration": 5.299789667129517 + }, + { + "auxiliary_loss_clip": 0.01028017, + "auxiliary_loss_mlp": 0.00999308, + "balance_loss_clip": 0.99809855, + "balance_loss_mlp": 1.00645494, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8876641821203675, + "language_loss": 0.6684342, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68870747, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21582031, + "step": 9514, + "time_per_iteration": 3.0201942920684814 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.0104014, + "balance_loss_clip": 1.02696776, + "balance_loss_mlp": 1.04034257, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.046774799271973, + "language_loss": 0.81059563, + "learning_rate": 1.63230955093099e-06, + "loss": 0.8321448, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 9515, + "time_per_iteration": 2.440838575363159 + }, + { + "auxiliary_loss_clip": 0.01104804, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.01469994, + "balance_loss_mlp": 1.03602076, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.8601231206296425, + "language_loss": 0.86125237, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88257068, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9516, + "time_per_iteration": 2.477764368057251 + }, + { + "auxiliary_loss_clip": 0.01104974, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01654696, + "balance_loss_mlp": 1.03561044, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.8026555789133811, + "language_loss": 0.87531322, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89665627, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9517, + "time_per_iteration": 2.425889253616333 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01804841, + "balance_loss_mlp": 1.03662252, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.765867586501473, + "language_loss": 0.8479656, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.86934435, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9518, + "time_per_iteration": 2.515908718109131 + }, + { + "auxiliary_loss_clip": 0.01102718, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01909649, + "balance_loss_mlp": 1.03518391, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.8620909672026127, + "language_loss": 0.7880826, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80942279, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 9519, + "time_per_iteration": 2.400693893432617 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.03658307, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.5438950427184228, + "language_loss": 0.82970679, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85106778, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9520, + "time_per_iteration": 2.5011074542999268 + }, + { + "auxiliary_loss_clip": 0.01110791, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.0268625, + "balance_loss_mlp": 1.03927732, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.123220131944119, + "language_loss": 0.71853209, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74003959, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9521, + "time_per_iteration": 2.4156429767608643 + }, + { + "auxiliary_loss_clip": 0.01106899, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.0362848, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.6921576366095024, + "language_loss": 0.77830148, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.79970586, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9522, + "time_per_iteration": 2.5682153701782227 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.02081728, + "balance_loss_mlp": 1.03628266, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.565759699688635, + "language_loss": 0.71671265, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73805845, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9523, + "time_per_iteration": 2.402622938156128 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01511574, + "balance_loss_mlp": 1.03639328, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.6537237547017787, + "language_loss": 0.70046443, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72177982, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9524, + "time_per_iteration": 2.478745698928833 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.01944458, + "balance_loss_mlp": 1.03783476, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.431879051430598, + "language_loss": 0.65079439, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67217362, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9525, + "time_per_iteration": 2.5722320079803467 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.01825702, + "balance_loss_mlp": 1.03385937, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.7621674355193322, + "language_loss": 0.72353703, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74485326, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 9526, + "time_per_iteration": 2.5182504653930664 + }, + { + "auxiliary_loss_clip": 0.01102827, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03617597, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6243804380597333, + "language_loss": 0.80131519, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.8226589, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9527, + "time_per_iteration": 2.556168556213379 + }, + { + "auxiliary_loss_clip": 0.01104789, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.02399302, + "balance_loss_mlp": 1.03633451, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.8731920412295517, + "language_loss": 0.71818352, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.7395997, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 9528, + "time_per_iteration": 2.502045154571533 + }, + { + "auxiliary_loss_clip": 0.01107269, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02113199, + "balance_loss_mlp": 1.03742957, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.9532280974694858, + "language_loss": 0.853854, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.87526155, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9529, + "time_per_iteration": 2.468146324157715 + }, + { + "auxiliary_loss_clip": 0.01028852, + "auxiliary_loss_mlp": 0.0100185, + "balance_loss_clip": 1.0006398, + "balance_loss_mlp": 1.00712085, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7632636876236247, + "language_loss": 0.56091511, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58122212, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21777344, + "step": 9530, + "time_per_iteration": 2.955796003341675 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.01241684, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.605800582107851, + "language_loss": 0.66667211, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68801141, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.71875, + "step": 9531, + "time_per_iteration": 2.4874041080474854 + }, + { + "auxiliary_loss_clip": 0.01107074, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.0189929, + "balance_loss_mlp": 1.0362972, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.577990064326961, + "language_loss": 0.75677073, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77815616, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9532, + "time_per_iteration": 2.653745651245117 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01779997, + "balance_loss_mlp": 1.03636467, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 3.4857041080787696, + "language_loss": 0.78726482, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80862474, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9533, + "time_per_iteration": 2.5444183349609375 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02156746, + "balance_loss_mlp": 1.03771889, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 2.5155449858561036, + "language_loss": 0.8564285, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87781423, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9534, + "time_per_iteration": 2.611769199371338 + }, + { + "auxiliary_loss_clip": 0.01108602, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02241731, + "balance_loss_mlp": 1.03833961, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.7913378128419626, + "language_loss": 0.74880809, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.7702536, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9535, + "time_per_iteration": 2.5294063091278076 + }, + { + "auxiliary_loss_clip": 0.01113223, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.01927602, + "balance_loss_mlp": 1.04021287, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.60935564318513, + "language_loss": 0.70712042, + "learning_rate": 1.624273356614346e-06, + "loss": 0.72857165, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73046875, + "step": 9536, + "time_per_iteration": 2.5115044116973877 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.01741457, + "balance_loss_mlp": 1.03604972, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.9605571924010112, + "language_loss": 0.69843078, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71977001, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 9537, + "time_per_iteration": 2.485203266143799 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.02179384, + "balance_loss_mlp": 1.03693986, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.9885156073739136, + "language_loss": 0.6257112, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64711761, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9538, + "time_per_iteration": 2.5242531299591064 + }, + { + "auxiliary_loss_clip": 0.01106895, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02131939, + "balance_loss_mlp": 1.03750122, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.847251631174476, + "language_loss": 0.83067656, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85208571, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9539, + "time_per_iteration": 2.4557297229766846 + }, + { + "auxiliary_loss_clip": 0.01108422, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02155805, + "balance_loss_mlp": 1.03672779, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 1.9303873756935568, + "language_loss": 0.73266071, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75408518, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9540, + "time_per_iteration": 2.449195384979248 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.01486361, + "balance_loss_mlp": 1.03386962, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.7719156274309316, + "language_loss": 0.80036277, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82164454, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 9541, + "time_per_iteration": 2.4807605743408203 + }, + { + "auxiliary_loss_clip": 0.01109647, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02247286, + "balance_loss_mlp": 1.03748846, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.3537030152809817, + "language_loss": 0.64358872, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66503674, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9542, + "time_per_iteration": 2.417178153991699 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01681268, + "balance_loss_mlp": 1.03586972, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.222303069950764, + "language_loss": 0.82983625, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.85118151, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9543, + "time_per_iteration": 2.4162886142730713 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01729405, + "balance_loss_mlp": 1.0378089, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.297441344794182, + "language_loss": 0.73850191, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75992632, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9544, + "time_per_iteration": 2.4531123638153076 + }, + { + "auxiliary_loss_clip": 0.01110237, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.01687646, + "balance_loss_mlp": 1.03741252, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.106910148542404, + "language_loss": 0.75869375, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78009301, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 9545, + "time_per_iteration": 2.446340799331665 + }, + { + "auxiliary_loss_clip": 0.01109663, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02086437, + "balance_loss_mlp": 1.03903508, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.6841481616941998, + "language_loss": 0.56267381, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58410275, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9546, + "time_per_iteration": 2.5431458950042725 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02027345, + "balance_loss_mlp": 1.0375458, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.2354008467729236, + "language_loss": 0.76396316, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78538299, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9547, + "time_per_iteration": 2.399355173110962 + }, + { + "auxiliary_loss_clip": 0.01108464, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01847458, + "balance_loss_mlp": 1.03692102, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 3.5736288481687457, + "language_loss": 0.74030554, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76169997, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9548, + "time_per_iteration": 2.438188314437866 + }, + { + "auxiliary_loss_clip": 0.01107619, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.02162778, + "balance_loss_mlp": 1.03630018, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.070673757769185, + "language_loss": 0.6898725, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71128839, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 9549, + "time_per_iteration": 2.4443182945251465 + }, + { + "auxiliary_loss_clip": 0.01109324, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.01756024, + "balance_loss_mlp": 1.0398941, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.5143454441571018, + "language_loss": 0.79360747, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81499219, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 9550, + "time_per_iteration": 2.570117473602295 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01844966, + "balance_loss_mlp": 1.03862011, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 1.8121895379081407, + "language_loss": 0.67906272, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70047116, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71484375, + "step": 9551, + "time_per_iteration": 2.423403024673462 + }, + { + "auxiliary_loss_clip": 0.01109924, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.02370107, + "balance_loss_mlp": 1.03743887, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 1.628701607162486, + "language_loss": 0.71362531, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.73508722, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9552, + "time_per_iteration": 3.886622428894043 + }, + { + "auxiliary_loss_clip": 0.01109635, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.01972914, + "balance_loss_mlp": 1.03975332, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.7228318188262413, + "language_loss": 0.79922652, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82063985, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9553, + "time_per_iteration": 2.431051731109619 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01900589, + "balance_loss_mlp": 1.03611398, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.015136287210995, + "language_loss": 0.83396381, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85536349, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.73046875, + "step": 9554, + "time_per_iteration": 3.823064088821411 + }, + { + "auxiliary_loss_clip": 0.0111382, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.02347982, + "balance_loss_mlp": 1.04021072, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4846822756962552, + "language_loss": 0.70777845, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72928381, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9555, + "time_per_iteration": 5.333508491516113 + }, + { + "auxiliary_loss_clip": 0.01109263, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03861225, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.115239569910986, + "language_loss": 0.72206348, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7434299, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9556, + "time_per_iteration": 2.4479689598083496 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02174449, + "balance_loss_mlp": 1.03852749, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.5580789907924004, + "language_loss": 0.73779786, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.75923818, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9557, + "time_per_iteration": 2.53330397605896 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.02114749, + "balance_loss_mlp": 1.03805625, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.551535187819687, + "language_loss": 0.67825913, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69967735, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9558, + "time_per_iteration": 2.4345078468322754 + }, + { + "auxiliary_loss_clip": 0.01115654, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.02509618, + "balance_loss_mlp": 1.03993464, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.018077791857229, + "language_loss": 0.71494532, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73649883, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 9559, + "time_per_iteration": 2.4112660884857178 + }, + { + "auxiliary_loss_clip": 0.01109449, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01856709, + "balance_loss_mlp": 1.03951454, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.8277860809166269, + "language_loss": 0.79002881, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81141782, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.69921875, + "step": 9560, + "time_per_iteration": 2.461737871170044 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.01649547, + "balance_loss_mlp": 1.03796887, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.312922307701609, + "language_loss": 0.64114952, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66254199, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9561, + "time_per_iteration": 2.4589121341705322 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.02403021, + "balance_loss_mlp": 1.04126084, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.619271715020599, + "language_loss": 0.71404445, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73558629, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 9562, + "time_per_iteration": 2.4472360610961914 + }, + { + "auxiliary_loss_clip": 0.01108014, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.0256269, + "balance_loss_mlp": 1.03870499, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.47664891140277, + "language_loss": 0.84212148, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86357129, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9563, + "time_per_iteration": 2.448540449142456 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.02251637, + "balance_loss_mlp": 1.03915787, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 2.1518785584706266, + "language_loss": 0.57469738, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59618628, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9564, + "time_per_iteration": 2.455137252807617 + }, + { + "auxiliary_loss_clip": 0.01104038, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.01645529, + "balance_loss_mlp": 1.03663075, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.7205024550895016, + "language_loss": 0.75828826, + "learning_rate": 1.613186112465078e-06, + "loss": 0.7796101, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9565, + "time_per_iteration": 2.4293572902679443 + }, + { + "auxiliary_loss_clip": 0.01030195, + "auxiliary_loss_mlp": 0.01000571, + "balance_loss_clip": 0.9991762, + "balance_loss_mlp": 1.00864065, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7426631899706556, + "language_loss": 0.60724127, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62754893, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21582031, + "step": 9566, + "time_per_iteration": 3.156651496887207 + }, + { + "auxiliary_loss_clip": 0.01109259, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.02127385, + "balance_loss_mlp": 1.03952003, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.8230299531471923, + "language_loss": 0.7537874, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77521175, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9567, + "time_per_iteration": 2.414881706237793 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01692748, + "balance_loss_mlp": 1.03808224, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.5717614086198337, + "language_loss": 0.74559051, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76697284, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9568, + "time_per_iteration": 2.458827495574951 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.0394876, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.7630953099139652, + "language_loss": 0.70951653, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73092568, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 9569, + "time_per_iteration": 2.4545505046844482 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.02746797, + "balance_loss_mlp": 1.04058015, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 1.9393871177420576, + "language_loss": 0.55699342, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57852268, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9570, + "time_per_iteration": 2.478793144226074 + }, + { + "auxiliary_loss_clip": 0.01106131, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.02648425, + "balance_loss_mlp": 1.03744042, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6217673569741213, + "language_loss": 0.64154774, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.6629895, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9571, + "time_per_iteration": 2.4446957111358643 + }, + { + "auxiliary_loss_clip": 0.01108191, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.02213967, + "balance_loss_mlp": 1.03693449, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5404037339802243, + "language_loss": 0.67144608, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69287848, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9572, + "time_per_iteration": 2.739871025085449 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.01920414, + "balance_loss_mlp": 1.03968024, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 2.3042557910685897, + "language_loss": 0.72336781, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74477673, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9573, + "time_per_iteration": 2.446484088897705 + }, + { + "auxiliary_loss_clip": 0.01105342, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01808691, + "balance_loss_mlp": 1.03999066, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.9447567655956284, + "language_loss": 0.76657987, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78792316, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 9574, + "time_per_iteration": 2.620338201522827 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01712155, + "balance_loss_mlp": 1.03815711, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.450005891087765, + "language_loss": 0.66523874, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.6866771, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 9575, + "time_per_iteration": 2.4487204551696777 + }, + { + "auxiliary_loss_clip": 0.01106224, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02086747, + "balance_loss_mlp": 1.03883016, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5135571903226765, + "language_loss": 0.79637057, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81775701, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 9576, + "time_per_iteration": 2.499525547027588 + }, + { + "auxiliary_loss_clip": 0.01106499, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.0181545, + "balance_loss_mlp": 1.038414, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.624550594516776, + "language_loss": 0.69612324, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71748459, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 9577, + "time_per_iteration": 2.4342739582061768 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.02076983, + "balance_loss_mlp": 1.03889465, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.7262479676640925, + "language_loss": 0.66394711, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68538755, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7265625, + "step": 9578, + "time_per_iteration": 2.452836513519287 + }, + { + "auxiliary_loss_clip": 0.01104785, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.01713598, + "balance_loss_mlp": 1.03683639, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.5955641210398863, + "language_loss": 0.72130096, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74263626, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9579, + "time_per_iteration": 2.4709668159484863 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01880956, + "balance_loss_mlp": 1.03966045, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.099656741464949, + "language_loss": 0.64655066, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66800898, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9580, + "time_per_iteration": 2.5071680545806885 + }, + { + "auxiliary_loss_clip": 0.01108728, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02024293, + "balance_loss_mlp": 1.03776896, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.9172914104456789, + "language_loss": 0.8563143, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87773246, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9581, + "time_per_iteration": 2.459761142730713 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.02792597, + "balance_loss_mlp": 1.04308629, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.0860755056974627, + "language_loss": 0.67691463, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69849521, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9582, + "time_per_iteration": 2.461245536804199 + }, + { + "auxiliary_loss_clip": 0.01030428, + "auxiliary_loss_mlp": 0.01000716, + "balance_loss_clip": 0.99950552, + "balance_loss_mlp": 1.00893497, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6389163922736963, + "language_loss": 0.57233906, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59265041, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21484375, + "step": 9583, + "time_per_iteration": 3.212454080581665 + }, + { + "auxiliary_loss_clip": 0.01108245, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01740384, + "balance_loss_mlp": 1.0381434, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.8641226876424317, + "language_loss": 0.82294947, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84432399, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 9584, + "time_per_iteration": 2.445197582244873 + }, + { + "auxiliary_loss_clip": 0.0102928, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 0.99972469, + "balance_loss_mlp": 1.00788319, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6211358186522926, + "language_loss": 0.49536344, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51566589, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9585, + "time_per_iteration": 3.1135380268096924 + }, + { + "auxiliary_loss_clip": 0.01103387, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.01523662, + "balance_loss_mlp": 1.0356468, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 2.0469276219055037, + "language_loss": 0.84745687, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86875856, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9586, + "time_per_iteration": 2.4322049617767334 + }, + { + "auxiliary_loss_clip": 0.01107042, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.02099502, + "balance_loss_mlp": 1.0371176, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.708349469848261, + "language_loss": 0.79935288, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82076108, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9587, + "time_per_iteration": 2.420388698577881 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.01772523, + "balance_loss_mlp": 1.03791797, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.476870264659234, + "language_loss": 0.65978181, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68115664, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9588, + "time_per_iteration": 2.470181941986084 + }, + { + "auxiliary_loss_clip": 0.0110785, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.02095485, + "balance_loss_mlp": 1.03747165, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.7939970430826904, + "language_loss": 0.78344554, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80486423, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9589, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01023642, + "balance_loss_clip": 1.01255536, + "balance_loss_mlp": 1.03513849, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.899286870644745, + "language_loss": 0.79484087, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81609809, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 9590, + "time_per_iteration": 2.4738223552703857 + }, + { + "auxiliary_loss_clip": 0.01107337, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.01693606, + "balance_loss_mlp": 1.03926742, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.6468651932641252, + "language_loss": 0.63016611, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65151715, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9591, + "time_per_iteration": 2.4630722999572754 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02349293, + "balance_loss_mlp": 1.03998208, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6611744555405081, + "language_loss": 0.77684325, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.7982983, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9592, + "time_per_iteration": 2.4990251064300537 + }, + { + "auxiliary_loss_clip": 0.01029258, + "auxiliary_loss_mlp": 0.01002299, + "balance_loss_clip": 1.00114298, + "balance_loss_mlp": 1.00790858, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7302836874791289, + "language_loss": 0.59611464, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61643022, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21386719, + "step": 9593, + "time_per_iteration": 3.1885087490081787 + }, + { + "auxiliary_loss_clip": 0.01110729, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.0265801, + "balance_loss_mlp": 1.03883052, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 2.3535875138052806, + "language_loss": 0.7131753, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73468006, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9594, + "time_per_iteration": 3.89677357673645 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.02087677, + "balance_loss_mlp": 1.03755784, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.9084853230861274, + "language_loss": 0.71146429, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73285961, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.69921875, + "step": 9595, + "time_per_iteration": 2.438798666000366 + }, + { + "auxiliary_loss_clip": 0.01106901, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.01726389, + "balance_loss_mlp": 1.03756046, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.7843520689138646, + "language_loss": 0.69750065, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71886092, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9596, + "time_per_iteration": 3.8589518070220947 + }, + { + "auxiliary_loss_clip": 0.01111865, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.02235997, + "balance_loss_mlp": 1.03845882, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.3208716765708974, + "language_loss": 0.67437601, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69585705, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 9597, + "time_per_iteration": 4.025861501693726 + }, + { + "auxiliary_loss_clip": 0.01107063, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.02220011, + "balance_loss_mlp": 1.03775668, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 2.263151487781109, + "language_loss": 0.81492549, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83633393, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 9598, + "time_per_iteration": 2.4457364082336426 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01787877, + "balance_loss_mlp": 1.03758776, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.482456402920166, + "language_loss": 0.72767603, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74906087, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9599, + "time_per_iteration": 2.440633773803711 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.0178082, + "balance_loss_mlp": 1.03569376, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 1.8193310631715605, + "language_loss": 0.77990794, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80122316, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9600, + "time_per_iteration": 2.4627256393432617 + }, + { + "auxiliary_loss_clip": 0.01108817, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.0210135, + "balance_loss_mlp": 1.03849137, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5552976085447456, + "language_loss": 0.72505343, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74647534, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9601, + "time_per_iteration": 2.5040857791900635 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.02177262, + "balance_loss_mlp": 1.03861833, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.6061208919603027, + "language_loss": 0.68449026, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7058996, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 9602, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03666043, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4427131087039327, + "language_loss": 0.72969544, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75102556, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9603, + "time_per_iteration": 2.4821383953094482 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01787269, + "balance_loss_mlp": 1.03815305, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.760798848795816, + "language_loss": 0.76811421, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78947246, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9604, + "time_per_iteration": 2.4963274002075195 + }, + { + "auxiliary_loss_clip": 0.01108714, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.01986611, + "balance_loss_mlp": 1.03805828, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.8255502953236893, + "language_loss": 0.83589303, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85731399, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9605, + "time_per_iteration": 2.420722484588623 + }, + { + "auxiliary_loss_clip": 0.01115788, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.01826096, + "balance_loss_mlp": 1.041394, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.6448412923605056, + "language_loss": 0.78043878, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80192173, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 9606, + "time_per_iteration": 2.47755765914917 + }, + { + "auxiliary_loss_clip": 0.01105815, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.02192283, + "balance_loss_mlp": 1.03780627, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.6466821062116115, + "language_loss": 0.74067813, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76207221, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9607, + "time_per_iteration": 2.473158597946167 + }, + { + "auxiliary_loss_clip": 0.01107935, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01862538, + "balance_loss_mlp": 1.03809416, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.6703318324983303, + "language_loss": 0.69666326, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71804941, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9608, + "time_per_iteration": 2.457597255706787 + }, + { + "auxiliary_loss_clip": 0.01109603, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01841307, + "balance_loss_mlp": 1.03859639, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.7239529426914375, + "language_loss": 0.76340568, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78481352, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9609, + "time_per_iteration": 2.478379964828491 + }, + { + "auxiliary_loss_clip": 0.01106636, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01672637, + "balance_loss_mlp": 1.03600001, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.8185868001057917, + "language_loss": 0.77262604, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79397655, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.70703125, + "step": 9610, + "time_per_iteration": 2.4817564487457275 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01526141, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.0354514470011327, + "language_loss": 0.68514067, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70650387, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9611, + "time_per_iteration": 2.401411771774292 + }, + { + "auxiliary_loss_clip": 0.01104847, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01431727, + "balance_loss_mlp": 1.03594267, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.8201815228945446, + "language_loss": 0.82796168, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.84927702, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9612, + "time_per_iteration": 2.4473085403442383 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01915455, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.6350469107350603, + "language_loss": 0.79244345, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81382918, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9613, + "time_per_iteration": 2.427710771560669 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03584552, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.8237036529741348, + "language_loss": 0.77103758, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79242271, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 9614, + "time_per_iteration": 2.44856595993042 + }, + { + "auxiliary_loss_clip": 0.01108473, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01954651, + "balance_loss_mlp": 1.03704453, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.4290592896418093, + "language_loss": 0.8083241, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.829723, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71484375, + "step": 9615, + "time_per_iteration": 2.387230396270752 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01979423, + "balance_loss_mlp": 1.0372864, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.467111790124014, + "language_loss": 0.67172909, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69313097, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9616, + "time_per_iteration": 2.5091681480407715 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.01983774, + "balance_loss_mlp": 1.03701568, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.7373937933185963, + "language_loss": 0.77820861, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79956603, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9617, + "time_per_iteration": 2.434692144393921 + }, + { + "auxiliary_loss_clip": 0.01105528, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01911426, + "balance_loss_mlp": 1.03798401, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.4913926039582375, + "language_loss": 0.75064909, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77201837, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9618, + "time_per_iteration": 2.5143377780914307 + }, + { + "auxiliary_loss_clip": 0.01103572, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01855421, + "balance_loss_mlp": 1.03614712, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5244275331123438, + "language_loss": 0.81895173, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84028757, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9619, + "time_per_iteration": 2.436741828918457 + }, + { + "auxiliary_loss_clip": 0.01106581, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.02319741, + "balance_loss_mlp": 1.03689742, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 2.8855702259785874, + "language_loss": 0.7266885, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.7481066, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9620, + "time_per_iteration": 2.508638858795166 + }, + { + "auxiliary_loss_clip": 0.01106937, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.01994872, + "balance_loss_mlp": 1.0379591, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.4901469929607327, + "language_loss": 0.77143538, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79282016, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9621, + "time_per_iteration": 2.4620673656463623 + }, + { + "auxiliary_loss_clip": 0.01106096, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01862931, + "balance_loss_mlp": 1.03550279, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.5105026325174375, + "language_loss": 0.70597667, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72735131, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9622, + "time_per_iteration": 2.509505033493042 + }, + { + "auxiliary_loss_clip": 0.01028849, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.00028539, + "balance_loss_mlp": 1.007653, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7726155153830789, + "language_loss": 0.55941814, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57972187, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21191406, + "step": 9623, + "time_per_iteration": 3.0823814868927 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.02344918, + "balance_loss_mlp": 1.03692317, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.2221143081246373, + "language_loss": 0.71056175, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73201978, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9624, + "time_per_iteration": 2.5265705585479736 + }, + { + "auxiliary_loss_clip": 0.01107503, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.02552414, + "balance_loss_mlp": 1.03862953, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.222167937534436, + "language_loss": 0.82642812, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84788311, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9625, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.02198672, + "balance_loss_mlp": 1.03701115, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.455235974234194, + "language_loss": 0.69956779, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72096288, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9626, + "time_per_iteration": 2.4975287914276123 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02480352, + "balance_loss_mlp": 1.03568482, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.93553238886208, + "language_loss": 0.71862161, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.7400226, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9627, + "time_per_iteration": 2.5138702392578125 + }, + { + "auxiliary_loss_clip": 0.01104177, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.01737726, + "balance_loss_mlp": 1.03599048, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.727007676436273, + "language_loss": 0.8414377, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86276901, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9628, + "time_per_iteration": 2.4851796627044678 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.01394033, + "balance_loss_mlp": 1.03516388, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.6873428245402236, + "language_loss": 0.71942705, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74071914, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9629, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01108734, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02181387, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 1.9628574132847711, + "language_loss": 0.74576336, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76719439, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9630, + "time_per_iteration": 2.454810380935669 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.01682508, + "balance_loss_mlp": 1.03553247, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.6371763310429226, + "language_loss": 0.79325604, + "learning_rate": 1.587999618060523e-06, + "loss": 0.814556, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9631, + "time_per_iteration": 2.440864324569702 + }, + { + "auxiliary_loss_clip": 0.01104911, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01800966, + "balance_loss_mlp": 1.03596497, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.6037309933130668, + "language_loss": 0.75137591, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77272546, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9632, + "time_per_iteration": 2.4771668910980225 + }, + { + "auxiliary_loss_clip": 0.01106006, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.01517081, + "balance_loss_mlp": 1.03731871, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 2.4626986888140716, + "language_loss": 0.79077435, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81211185, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9633, + "time_per_iteration": 2.448436737060547 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.02447283, + "balance_loss_mlp": 1.04036343, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7086543878642706, + "language_loss": 0.77430606, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79583752, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75390625, + "step": 9634, + "time_per_iteration": 2.4811017513275146 + }, + { + "auxiliary_loss_clip": 0.01108474, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.1301414361920843, + "language_loss": 0.63183784, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65330267, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9635, + "time_per_iteration": 3.8360743522644043 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.02431154, + "balance_loss_mlp": 1.03854156, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.5921207664968484, + "language_loss": 0.76923883, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79065627, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 9636, + "time_per_iteration": 2.4524970054626465 + }, + { + "auxiliary_loss_clip": 0.01101976, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.03643167, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.6428369167222547, + "language_loss": 0.68367255, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70501596, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 9637, + "time_per_iteration": 3.9001073837280273 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.0181725, + "balance_loss_mlp": 1.03622174, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.3860817889930326, + "language_loss": 0.72291076, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74429405, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9638, + "time_per_iteration": 3.8099658489227295 + }, + { + "auxiliary_loss_clip": 0.01106068, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.01981568, + "balance_loss_mlp": 1.03809261, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 2.0300843650533387, + "language_loss": 0.72111142, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.7424823, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9639, + "time_per_iteration": 3.9071426391601562 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02291572, + "balance_loss_mlp": 1.03860509, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.0103274032155163, + "language_loss": 0.69715077, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71857667, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9640, + "time_per_iteration": 2.433104991912842 + }, + { + "auxiliary_loss_clip": 0.01113005, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.02583635, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.7872404958031884, + "language_loss": 0.77623034, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79774475, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 9641, + "time_per_iteration": 2.4301722049713135 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01930058, + "balance_loss_mlp": 1.0378499, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8500908876117999, + "language_loss": 0.73673463, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75810528, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 9642, + "time_per_iteration": 2.49660325050354 + }, + { + "auxiliary_loss_clip": 0.01104964, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.01718903, + "balance_loss_mlp": 1.03625488, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.696347443177098, + "language_loss": 0.73574042, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75708383, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9643, + "time_per_iteration": 2.485637903213501 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.03768921, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.9990943096580656, + "language_loss": 0.67527819, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69667518, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9644, + "time_per_iteration": 2.487901449203491 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01664853, + "balance_loss_mlp": 1.03988528, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.232135453826953, + "language_loss": 0.85353506, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87495703, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9645, + "time_per_iteration": 2.4591071605682373 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.01782739, + "balance_loss_mlp": 1.03902066, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.87513340954769, + "language_loss": 0.7528075, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77419043, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9646, + "time_per_iteration": 2.5096170902252197 + }, + { + "auxiliary_loss_clip": 0.01111341, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02068663, + "balance_loss_mlp": 1.03949249, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.666102030467492, + "language_loss": 0.5938943, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61534685, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9647, + "time_per_iteration": 2.5928401947021484 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.0388217, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.5329184941218248, + "language_loss": 0.84261942, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86405849, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9648, + "time_per_iteration": 2.460245132446289 + }, + { + "auxiliary_loss_clip": 0.01027507, + "auxiliary_loss_mlp": 0.00998956, + "balance_loss_clip": 0.99765694, + "balance_loss_mlp": 1.00610447, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8404119708733213, + "language_loss": 0.62959844, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64986312, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9649, + "time_per_iteration": 3.1300153732299805 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.02015436, + "balance_loss_mlp": 1.03649998, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 2.3310983541006434, + "language_loss": 0.82039601, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84174502, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9650, + "time_per_iteration": 2.4216153621673584 + }, + { + "auxiliary_loss_clip": 0.0111056, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.02018833, + "balance_loss_mlp": 1.0376749, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.3176650701334442, + "language_loss": 0.77372313, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79515636, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9651, + "time_per_iteration": 2.4731314182281494 + }, + { + "auxiliary_loss_clip": 0.01110796, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.02309239, + "balance_loss_mlp": 1.03978133, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 2.0034024707617575, + "language_loss": 0.74143803, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76290905, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9652, + "time_per_iteration": 2.426095485687256 + }, + { + "auxiliary_loss_clip": 0.01106661, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.01887703, + "balance_loss_mlp": 1.03536129, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 1.9100146686462136, + "language_loss": 0.76669693, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78807956, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9653, + "time_per_iteration": 2.479843854904175 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01702428, + "balance_loss_mlp": 1.03746295, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.3557465918911578, + "language_loss": 0.74466497, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76604843, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.69921875, + "step": 9654, + "time_per_iteration": 2.4389872550964355 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01033574, + "balance_loss_clip": 1.02185535, + "balance_loss_mlp": 1.0386194, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.67229579578488, + "language_loss": 0.70335853, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72474813, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9655, + "time_per_iteration": 2.4667346477508545 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.025087, + "balance_loss_mlp": 1.03787553, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 3.1924669760277666, + "language_loss": 0.69441068, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71591568, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9656, + "time_per_iteration": 2.47267746925354 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.02040219, + "balance_loss_mlp": 1.0376507, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.8802574367017126, + "language_loss": 0.71315479, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73451304, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9657, + "time_per_iteration": 2.411862850189209 + }, + { + "auxiliary_loss_clip": 0.01110384, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.03748548, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.139189937245848, + "language_loss": 0.70763719, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72907501, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9658, + "time_per_iteration": 2.4618098735809326 + }, + { + "auxiliary_loss_clip": 0.01029117, + "auxiliary_loss_mlp": 0.01007613, + "balance_loss_clip": 1.00649261, + "balance_loss_mlp": 1.00762427, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6568503671216013, + "language_loss": 0.53557444, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.5559417, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21484375, + "step": 9659, + "time_per_iteration": 3.081292152404785 + }, + { + "auxiliary_loss_clip": 0.01109597, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02564979, + "balance_loss_mlp": 1.0386076, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.325531986819307, + "language_loss": 0.62134814, + "learning_rate": 1.576954100136366e-06, + "loss": 0.6428259, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 9660, + "time_per_iteration": 2.5101215839385986 + }, + { + "auxiliary_loss_clip": 0.01107552, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02121592, + "balance_loss_mlp": 1.03510964, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.644077336412447, + "language_loss": 0.65339613, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67481142, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9661, + "time_per_iteration": 2.495326042175293 + }, + { + "auxiliary_loss_clip": 0.01101624, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.01473665, + "balance_loss_mlp": 1.03630924, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.4453410326473544, + "language_loss": 0.74667752, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76795, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 9662, + "time_per_iteration": 2.4072024822235107 + }, + { + "auxiliary_loss_clip": 0.0102818, + "auxiliary_loss_mlp": 0.01003249, + "balance_loss_clip": 1.0020808, + "balance_loss_mlp": 1.00680053, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8844058515803096, + "language_loss": 0.58421201, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60452628, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9663, + "time_per_iteration": 3.128176689147949 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.02058566, + "balance_loss_mlp": 1.03855336, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2307426037080558, + "language_loss": 0.82198572, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84339249, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 9664, + "time_per_iteration": 2.4268438816070557 + }, + { + "auxiliary_loss_clip": 0.01106716, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.0119977, + "balance_loss_mlp": 1.03471017, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.6499573770914204, + "language_loss": 0.81283242, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.8341471, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9665, + "time_per_iteration": 2.539750337600708 + }, + { + "auxiliary_loss_clip": 0.01113083, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.0180558, + "balance_loss_mlp": 1.03968716, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.6493862237198238, + "language_loss": 0.81106472, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83251882, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 9666, + "time_per_iteration": 2.4637341499328613 + }, + { + "auxiliary_loss_clip": 0.01105376, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.02047861, + "balance_loss_mlp": 1.03734851, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.772076851837157, + "language_loss": 0.79902422, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82039976, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9667, + "time_per_iteration": 2.4630167484283447 + }, + { + "auxiliary_loss_clip": 0.01113135, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01935887, + "balance_loss_mlp": 1.03786182, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.5126376316707284, + "language_loss": 0.78524494, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80671084, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 9668, + "time_per_iteration": 2.4933431148529053 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.01976347, + "balance_loss_mlp": 1.03882718, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.2917193824708395, + "language_loss": 0.6405921, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66200924, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9669, + "time_per_iteration": 2.711413860321045 + }, + { + "auxiliary_loss_clip": 0.01109059, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.02185786, + "balance_loss_mlp": 1.03847837, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.7201818199144705, + "language_loss": 0.73401237, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75544822, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9670, + "time_per_iteration": 2.481351375579834 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02327895, + "balance_loss_mlp": 1.03665125, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.1547601144280693, + "language_loss": 0.79159272, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81303054, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 9671, + "time_per_iteration": 2.481765031814575 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02253985, + "balance_loss_mlp": 1.04052281, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.8667318330129747, + "language_loss": 0.60387075, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62536901, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9672, + "time_per_iteration": 2.4585747718811035 + }, + { + "auxiliary_loss_clip": 0.01106042, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.01597953, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.9986212138203583, + "language_loss": 0.81078732, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83212423, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9673, + "time_per_iteration": 2.4950785636901855 + }, + { + "auxiliary_loss_clip": 0.01113708, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.02148843, + "balance_loss_mlp": 1.03956604, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.7057299891387632, + "language_loss": 0.87750065, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.89898866, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 9674, + "time_per_iteration": 2.440136432647705 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02156651, + "balance_loss_mlp": 1.03789747, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.5021502044615473, + "language_loss": 0.78512001, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80653995, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.7109375, + "step": 9675, + "time_per_iteration": 2.474719285964966 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.0162822, + "balance_loss_mlp": 1.03912115, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.1292944862371486, + "language_loss": 0.70189106, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72326887, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9676, + "time_per_iteration": 2.435563325881958 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.01728702, + "balance_loss_mlp": 1.03810883, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.2453262518267216, + "language_loss": 0.63408953, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65548283, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9677, + "time_per_iteration": 3.852684736251831 + }, + { + "auxiliary_loss_clip": 0.01029913, + "auxiliary_loss_mlp": 0.01005476, + "balance_loss_clip": 1.00418842, + "balance_loss_mlp": 1.00840485, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8082693819649737, + "language_loss": 0.54284507, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56319892, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21484375, + "step": 9678, + "time_per_iteration": 3.1727702617645264 + }, + { + "auxiliary_loss_clip": 0.01029364, + "auxiliary_loss_mlp": 0.01, + "balance_loss_clip": 0.99879593, + "balance_loss_mlp": 1.00781882, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7323225743115229, + "language_loss": 0.56212348, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58241719, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21484375, + "step": 9679, + "time_per_iteration": 4.407592296600342 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01719165, + "balance_loss_mlp": 1.03636777, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.5677269140843855, + "language_loss": 0.65393043, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67527747, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9680, + "time_per_iteration": 3.854875087738037 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01755846, + "balance_loss_mlp": 1.0371331, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.7974099210270778, + "language_loss": 0.83398807, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85535139, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9681, + "time_per_iteration": 3.906952381134033 + }, + { + "auxiliary_loss_clip": 0.01107734, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01704884, + "balance_loss_mlp": 1.03765953, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.7009206287297167, + "language_loss": 0.75691867, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77829218, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9682, + "time_per_iteration": 2.4177029132843018 + }, + { + "auxiliary_loss_clip": 0.01109999, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.01696062, + "balance_loss_mlp": 1.03816795, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.1225270667604, + "language_loss": 0.75228214, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77367556, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9683, + "time_per_iteration": 2.483633279800415 + }, + { + "auxiliary_loss_clip": 0.01110877, + "auxiliary_loss_mlp": 0.01029498, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03809631, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7411447986789845, + "language_loss": 0.74026191, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76166564, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9684, + "time_per_iteration": 2.445389986038208 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.02327847, + "balance_loss_mlp": 1.03914332, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.480778861643935, + "language_loss": 0.77930081, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80076301, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9685, + "time_per_iteration": 2.4822564125061035 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.02525425, + "balance_loss_mlp": 1.0376507, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.6531366373498986, + "language_loss": 0.75214118, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77360046, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9686, + "time_per_iteration": 2.441162109375 + }, + { + "auxiliary_loss_clip": 0.0102947, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_clip": 1.00720644, + "balance_loss_mlp": 1.00800455, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8335448804232356, + "language_loss": 0.57427585, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59465551, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9687, + "time_per_iteration": 2.887495279312134 + }, + { + "auxiliary_loss_clip": 0.01106071, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01534319, + "balance_loss_mlp": 1.03597438, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.808127013520305, + "language_loss": 0.69851446, + "learning_rate": 1.566302259738727e-06, + "loss": 0.7198627, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 9688, + "time_per_iteration": 2.475397825241089 + }, + { + "auxiliary_loss_clip": 0.01108083, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.03770781, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.8185672100752224, + "language_loss": 0.65197223, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67338014, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9689, + "time_per_iteration": 2.461808204650879 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0186801, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.5648827403998262, + "language_loss": 0.73213816, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75352979, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 9690, + "time_per_iteration": 2.459392786026001 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01889586, + "balance_loss_mlp": 1.03849721, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.9110650477929338, + "language_loss": 0.76118016, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78261399, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9691, + "time_per_iteration": 2.454533338546753 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02115119, + "balance_loss_mlp": 1.03619492, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.7126808977143095, + "language_loss": 0.80746913, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82888305, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 9692, + "time_per_iteration": 2.501497268676758 + }, + { + "auxiliary_loss_clip": 0.01027994, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00103843, + "balance_loss_mlp": 1.0067246, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7602984909294345, + "language_loss": 0.56910902, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.5894115, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.21289062, + "step": 9693, + "time_per_iteration": 3.0237975120544434 + }, + { + "auxiliary_loss_clip": 0.01106474, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.02250707, + "balance_loss_mlp": 1.03660345, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 2.266427213008104, + "language_loss": 0.79537672, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81677842, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69921875, + "step": 9694, + "time_per_iteration": 2.4761908054351807 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.02250743, + "balance_loss_mlp": 1.03815889, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.3946621855299897, + "language_loss": 0.75905991, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.7804361, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9695, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.01028568, + "auxiliary_loss_mlp": 0.01000024, + "balance_loss_clip": 0.99865955, + "balance_loss_mlp": 1.00692177, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7688369043614423, + "language_loss": 0.54971713, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57000303, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.21679688, + "step": 9696, + "time_per_iteration": 3.1397409439086914 + }, + { + "auxiliary_loss_clip": 0.01108342, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.01588464, + "balance_loss_mlp": 1.03907263, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.461981122956424, + "language_loss": 0.7641257, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78548938, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9697, + "time_per_iteration": 2.4391984939575195 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.01940477, + "balance_loss_mlp": 1.03790259, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.5880971870479619, + "language_loss": 0.77744102, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79886687, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9698, + "time_per_iteration": 2.5576770305633545 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.02412117, + "balance_loss_mlp": 1.03847361, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.8122014087406897, + "language_loss": 0.83381891, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85527027, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9699, + "time_per_iteration": 2.5637032985687256 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.01663673, + "balance_loss_mlp": 1.03769052, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.315377539273772, + "language_loss": 0.66859722, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68997276, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 9700, + "time_per_iteration": 2.471012592315674 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.0199858, + "balance_loss_mlp": 1.03591251, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5256356872175616, + "language_loss": 0.713889, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73527479, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9701, + "time_per_iteration": 2.4697649478912354 + }, + { + "auxiliary_loss_clip": 0.01104917, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01984477, + "balance_loss_mlp": 1.03625238, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.810379708827147, + "language_loss": 0.85387969, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87524706, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9702, + "time_per_iteration": 2.481027841567993 + }, + { + "auxiliary_loss_clip": 0.01100783, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.02206278, + "balance_loss_mlp": 1.0346241, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4628982512923412, + "language_loss": 0.77776694, + "learning_rate": 1.560601200301392e-06, + "loss": 0.79911131, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9703, + "time_per_iteration": 2.435124397277832 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01736951, + "balance_loss_mlp": 1.03907001, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.7159930715569567, + "language_loss": 0.71405482, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73546076, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9704, + "time_per_iteration": 2.4737584590911865 + }, + { + "auxiliary_loss_clip": 0.01107118, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02001369, + "balance_loss_mlp": 1.03844225, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 2.155391395554278, + "language_loss": 0.814731, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83611786, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9705, + "time_per_iteration": 2.456681966781616 + }, + { + "auxiliary_loss_clip": 0.01105829, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.03706515, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 2.7067870421451805, + "language_loss": 0.80659604, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82794762, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9706, + "time_per_iteration": 2.497509717941284 + }, + { + "auxiliary_loss_clip": 0.01104424, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.01860809, + "balance_loss_mlp": 1.03667164, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.0481497339382084, + "language_loss": 0.74599034, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.7673524, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6796875, + "step": 9707, + "time_per_iteration": 2.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.01103427, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.01876628, + "balance_loss_mlp": 1.03624749, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.608372812838098, + "language_loss": 0.81249726, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83383656, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9708, + "time_per_iteration": 2.492741584777832 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01569307, + "balance_loss_mlp": 1.03903604, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.7521527331614153, + "language_loss": 0.78249604, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80383801, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9709, + "time_per_iteration": 2.476956844329834 + }, + { + "auxiliary_loss_clip": 0.01027997, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00083506, + "balance_loss_mlp": 1.0065155, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7691792257321526, + "language_loss": 0.56582153, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58612299, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.21484375, + "step": 9710, + "time_per_iteration": 3.0814101696014404 + }, + { + "auxiliary_loss_clip": 0.0110345, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.0211035, + "balance_loss_mlp": 1.03715682, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.5515305439757483, + "language_loss": 0.65762496, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67898679, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9711, + "time_per_iteration": 2.4872825145721436 + }, + { + "auxiliary_loss_clip": 0.01112071, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02147532, + "balance_loss_mlp": 1.03822017, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.6429842517443687, + "language_loss": 0.78599298, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80746061, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9712, + "time_per_iteration": 2.442077398300171 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01025498, + "balance_loss_clip": 1.01343966, + "balance_loss_mlp": 1.03646183, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.7240347174541215, + "language_loss": 0.73268932, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.7539975, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9713, + "time_per_iteration": 2.459120750427246 + }, + { + "auxiliary_loss_clip": 0.01110691, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01647365, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.8470967199163717, + "language_loss": 0.69391453, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71531737, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9714, + "time_per_iteration": 2.4558205604553223 + }, + { + "auxiliary_loss_clip": 0.01106219, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01737761, + "balance_loss_mlp": 1.03574395, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.7342681115417722, + "language_loss": 0.79977894, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82114303, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9715, + "time_per_iteration": 2.426506757736206 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01666617, + "balance_loss_mlp": 1.037099, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 4.9488403812071535, + "language_loss": 0.72778314, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74913943, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9716, + "time_per_iteration": 2.44687819480896 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.01595879, + "balance_loss_mlp": 1.03716838, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.6127648254863816, + "language_loss": 0.74810076, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76941431, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9717, + "time_per_iteration": 2.460857629776001 + }, + { + "auxiliary_loss_clip": 0.01105902, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02414012, + "balance_loss_mlp": 1.03733993, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.202005488151785, + "language_loss": 0.7997486, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82117224, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9718, + "time_per_iteration": 2.4178881645202637 + }, + { + "auxiliary_loss_clip": 0.01106549, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.01891065, + "balance_loss_mlp": 1.03846669, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.4800218219438264, + "language_loss": 0.67422116, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69560701, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 9719, + "time_per_iteration": 3.8449153900146484 + }, + { + "auxiliary_loss_clip": 0.01107677, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.01631081, + "balance_loss_mlp": 1.03717732, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.1638863024999484, + "language_loss": 0.75937355, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78073382, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9720, + "time_per_iteration": 2.521005630493164 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02579594, + "balance_loss_mlp": 1.03623533, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.5774446570210707, + "language_loss": 0.83079016, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85223192, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9721, + "time_per_iteration": 3.9998085498809814 + }, + { + "auxiliary_loss_clip": 0.01030301, + "auxiliary_loss_mlp": 0.01007637, + "balance_loss_clip": 1.00636697, + "balance_loss_mlp": 1.00867438, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9369686939257119, + "language_loss": 0.71297473, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73335409, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21679688, + "step": 9722, + "time_per_iteration": 4.55988335609436 + }, + { + "auxiliary_loss_clip": 0.01104254, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.0219183, + "balance_loss_mlp": 1.03621197, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.3592007880272097, + "language_loss": 0.89236099, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91374058, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9723, + "time_per_iteration": 3.8671655654907227 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.02042723, + "balance_loss_mlp": 1.03859067, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4227647539631216, + "language_loss": 0.68610382, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70749187, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 9724, + "time_per_iteration": 2.428325653076172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.0221715, + "balance_loss_mlp": 1.04199743, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.8750713541003288, + "language_loss": 0.86348903, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88497603, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9725, + "time_per_iteration": 2.4113223552703857 + }, + { + "auxiliary_loss_clip": 0.01112675, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.02615535, + "balance_loss_mlp": 1.04008734, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.9888550356442254, + "language_loss": 0.82856494, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85007912, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9726, + "time_per_iteration": 2.4277760982513428 + }, + { + "auxiliary_loss_clip": 0.01108752, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02550149, + "balance_loss_mlp": 1.03925705, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8720162128796731, + "language_loss": 0.66911906, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69057649, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 9727, + "time_per_iteration": 2.4941296577453613 + }, + { + "auxiliary_loss_clip": 0.011109, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.02927577, + "balance_loss_mlp": 1.04078412, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.755089310778911, + "language_loss": 0.81880605, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84033632, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9728, + "time_per_iteration": 2.504457950592041 + }, + { + "auxiliary_loss_clip": 0.01105423, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.03857303, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.9458365932895556, + "language_loss": 0.78459418, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80601627, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 9729, + "time_per_iteration": 2.4906978607177734 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.01629853, + "balance_loss_mlp": 1.03705525, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 2.2265789157985205, + "language_loss": 0.70611644, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72748184, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9730, + "time_per_iteration": 2.5273194313049316 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.02182508, + "balance_loss_mlp": 1.04057014, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 2.222037907468424, + "language_loss": 0.78473902, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80622888, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9731, + "time_per_iteration": 2.4710583686828613 + }, + { + "auxiliary_loss_clip": 0.0110815, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03908777, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 1.7845208257427057, + "language_loss": 0.69966131, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72106874, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9732, + "time_per_iteration": 2.4975006580352783 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.01913929, + "balance_loss_mlp": 1.04045916, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.73190032828597, + "language_loss": 0.52698147, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54844654, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 9733, + "time_per_iteration": 2.485399007797241 + }, + { + "auxiliary_loss_clip": 0.01111159, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.04071164, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 6.263677136925273, + "language_loss": 0.87694037, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89835489, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9734, + "time_per_iteration": 2.472288131713867 + }, + { + "auxiliary_loss_clip": 0.01103403, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02119839, + "balance_loss_mlp": 1.03833449, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.513447931139509, + "language_loss": 0.72063559, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.7419939, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 9735, + "time_per_iteration": 2.4491236209869385 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.02809381, + "balance_loss_mlp": 1.04022026, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.443961120173282, + "language_loss": 0.74189854, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76345086, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9736, + "time_per_iteration": 2.419142961502075 + }, + { + "auxiliary_loss_clip": 0.01106138, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.0379591, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 2.2236691167379083, + "language_loss": 0.70181298, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72319508, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9737, + "time_per_iteration": 2.6583194732666016 + }, + { + "auxiliary_loss_clip": 0.01106196, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.02428091, + "balance_loss_mlp": 1.03835154, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7203982017599655, + "language_loss": 0.82579291, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84721613, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 9738, + "time_per_iteration": 2.4531257152557373 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02323246, + "balance_loss_mlp": 1.04034615, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.7945048569600959, + "language_loss": 0.68588519, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70738328, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9739, + "time_per_iteration": 2.456914186477661 + }, + { + "auxiliary_loss_clip": 0.01108939, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.01416099, + "balance_loss_mlp": 1.03718436, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 3.661868392990544, + "language_loss": 0.58782631, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60918605, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9740, + "time_per_iteration": 2.4507863521575928 + }, + { + "auxiliary_loss_clip": 0.01106066, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01462674, + "balance_loss_mlp": 1.03621328, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 2.5503677599504138, + "language_loss": 0.74937272, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77070647, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9741, + "time_per_iteration": 2.4589905738830566 + }, + { + "auxiliary_loss_clip": 0.01110252, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01856065, + "balance_loss_mlp": 1.04028082, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 5.17192355324585, + "language_loss": 0.75760782, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77901655, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9742, + "time_per_iteration": 2.4604122638702393 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03765917, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.843175426453247, + "language_loss": 0.74955082, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77090788, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9743, + "time_per_iteration": 2.4604763984680176 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.0171833, + "balance_loss_mlp": 1.03878045, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.7092789137699793, + "language_loss": 0.81049299, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83184063, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9744, + "time_per_iteration": 2.516517162322998 + }, + { + "auxiliary_loss_clip": 0.0111328, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01756728, + "balance_loss_mlp": 1.04009771, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.7947324983718902, + "language_loss": 0.71260583, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73403156, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.73046875, + "step": 9745, + "time_per_iteration": 2.5095736980438232 + }, + { + "auxiliary_loss_clip": 0.01029472, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 0.99980211, + "balance_loss_mlp": 1.00798225, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7288291603374486, + "language_loss": 0.5328598, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55316496, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9746, + "time_per_iteration": 3.1588006019592285 + }, + { + "auxiliary_loss_clip": 0.01111789, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.01581621, + "balance_loss_mlp": 1.04034877, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.1076565833563743, + "language_loss": 0.73041242, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75181913, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9747, + "time_per_iteration": 2.529571533203125 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.02153921, + "balance_loss_mlp": 1.03954244, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 2.1114805581962934, + "language_loss": 0.81232262, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83376622, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9748, + "time_per_iteration": 2.4205257892608643 + }, + { + "auxiliary_loss_clip": 0.01108981, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.01904488, + "balance_loss_mlp": 1.03803837, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.6594717662282998, + "language_loss": 0.71928638, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74069047, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9749, + "time_per_iteration": 2.4881033897399902 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01869583, + "balance_loss_mlp": 1.04076529, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.0326510096801056, + "language_loss": 0.7436285, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.76502097, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9750, + "time_per_iteration": 2.414621353149414 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02279603, + "balance_loss_mlp": 1.03921914, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.743949260258008, + "language_loss": 0.71048808, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73192453, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9751, + "time_per_iteration": 2.4829182624816895 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.02050161, + "balance_loss_mlp": 1.04000425, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.8642101544605258, + "language_loss": 0.74632239, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76778823, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9752, + "time_per_iteration": 2.4715142250061035 + }, + { + "auxiliary_loss_clip": 0.01107296, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.01895666, + "balance_loss_mlp": 1.0386945, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7856678678755609, + "language_loss": 0.77179754, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79318273, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9753, + "time_per_iteration": 2.443422794342041 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01026457, + "balance_loss_clip": 1.01546574, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.9587413882718219, + "language_loss": 0.70530736, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.72662538, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9754, + "time_per_iteration": 2.409973621368408 + }, + { + "auxiliary_loss_clip": 0.01107928, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03827429, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.747136336565704, + "language_loss": 0.72055626, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74194646, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9755, + "time_per_iteration": 2.4600584506988525 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00899053, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7394274912640315, + "language_loss": 0.5697751, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59011161, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21484375, + "step": 9756, + "time_per_iteration": 3.0282156467437744 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114117, + "balance_loss_mlp": 1.03774321, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.7702895540430315, + "language_loss": 0.76155764, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78293204, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 9757, + "time_per_iteration": 2.5391111373901367 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01004494, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00899124, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8655305518018972, + "language_loss": 0.60531819, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62566704, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21484375, + "step": 9758, + "time_per_iteration": 3.0623366832733154 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01773787, + "balance_loss_mlp": 1.03982747, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.3357598656034897, + "language_loss": 0.71766979, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73910952, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 9759, + "time_per_iteration": 2.474400043487549 + }, + { + "auxiliary_loss_clip": 0.0111074, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.0208931, + "balance_loss_mlp": 1.04039979, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5007272591007914, + "language_loss": 0.73244017, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.7538712, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9760, + "time_per_iteration": 4.081261396408081 + }, + { + "auxiliary_loss_clip": 0.01106401, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01556969, + "balance_loss_mlp": 1.03715563, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.8805423527385174, + "language_loss": 0.72491598, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74625897, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9761, + "time_per_iteration": 2.42621111869812 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.0167706, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.837534804487864, + "language_loss": 0.74821299, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.76959074, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9762, + "time_per_iteration": 3.899322032928467 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01706839, + "balance_loss_mlp": 1.03918064, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.367882310541282, + "language_loss": 0.72223246, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74359524, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9763, + "time_per_iteration": 4.356280326843262 + }, + { + "auxiliary_loss_clip": 0.01105096, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01809907, + "balance_loss_mlp": 1.03675938, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.4976833867772195, + "language_loss": 0.79729784, + "learning_rate": 1.53745602625755e-06, + "loss": 0.81864572, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9764, + "time_per_iteration": 3.9194676876068115 + }, + { + "auxiliary_loss_clip": 0.01108839, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.0202508, + "balance_loss_mlp": 1.03856993, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 2.0111563944475908, + "language_loss": 0.78612924, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.80754542, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9765, + "time_per_iteration": 2.53273344039917 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03891206, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8843759319265088, + "language_loss": 0.83718032, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.8585422, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 9766, + "time_per_iteration": 2.467556953430176 + }, + { + "auxiliary_loss_clip": 0.01110103, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.02024257, + "balance_loss_mlp": 1.03847504, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 2.6418409503909674, + "language_loss": 0.69825381, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71967709, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.71875, + "step": 9767, + "time_per_iteration": 2.514695405960083 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02162123, + "balance_loss_mlp": 1.03721809, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.7100990150928812, + "language_loss": 0.6345011, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65592575, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9768, + "time_per_iteration": 2.510586738586426 + }, + { + "auxiliary_loss_clip": 0.01029111, + "auxiliary_loss_mlp": 0.00999867, + "balance_loss_clip": 0.9985916, + "balance_loss_mlp": 1.00760961, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7128870586180143, + "language_loss": 0.53924322, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.559533, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21484375, + "step": 9769, + "time_per_iteration": 3.0710904598236084 + }, + { + "auxiliary_loss_clip": 0.01104834, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.02148712, + "balance_loss_mlp": 1.03672135, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4641633186547043, + "language_loss": 0.70532131, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.7267043, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9770, + "time_per_iteration": 2.516707420349121 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.0193336, + "balance_loss_mlp": 1.03782773, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 3.691664094278214, + "language_loss": 0.67488074, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69626534, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9771, + "time_per_iteration": 2.4816172122955322 + }, + { + "auxiliary_loss_clip": 0.01107891, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02041984, + "balance_loss_mlp": 1.03628254, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.6051808895674682, + "language_loss": 0.65752995, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.67894471, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9772, + "time_per_iteration": 2.5371270179748535 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.02478576, + "balance_loss_mlp": 1.03915095, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.7393863773768459, + "language_loss": 0.74272907, + "learning_rate": 1.534046611017519e-06, + "loss": 0.7642293, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9773, + "time_per_iteration": 2.4879984855651855 + }, + { + "auxiliary_loss_clip": 0.01108784, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.02513528, + "balance_loss_mlp": 1.03829455, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.707979121748391, + "language_loss": 0.53293657, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55440396, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9774, + "time_per_iteration": 2.5072500705718994 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.02257991, + "balance_loss_mlp": 1.03880942, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.48971225310605, + "language_loss": 0.65312964, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.6745823, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9775, + "time_per_iteration": 2.5655953884124756 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.02168775, + "balance_loss_mlp": 1.03676975, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.785458151895031, + "language_loss": 0.73554152, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.7569468, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9776, + "time_per_iteration": 2.54707932472229 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02136469, + "balance_loss_mlp": 1.0373795, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.328878154900185, + "language_loss": 0.74400878, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76541013, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9777, + "time_per_iteration": 2.5013017654418945 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03745651, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.542678345734907, + "language_loss": 0.74238187, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76371026, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 9778, + "time_per_iteration": 2.548445224761963 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.01765513, + "balance_loss_mlp": 1.03588045, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8670942886874708, + "language_loss": 0.70107329, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72240877, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9779, + "time_per_iteration": 2.440385341644287 + }, + { + "auxiliary_loss_clip": 0.01109422, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.02029324, + "balance_loss_mlp": 1.03690886, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8860885981569304, + "language_loss": 0.67181754, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69323874, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 9780, + "time_per_iteration": 2.5105738639831543 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02308023, + "balance_loss_mlp": 1.03872418, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 3.148071574180809, + "language_loss": 0.72608495, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74752629, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9781, + "time_per_iteration": 2.4174652099609375 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.01946235, + "balance_loss_mlp": 1.03562713, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.4505377017032317, + "language_loss": 0.70405555, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72542155, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9782, + "time_per_iteration": 2.4488813877105713 + }, + { + "auxiliary_loss_clip": 0.01108141, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.02064204, + "balance_loss_mlp": 1.03547001, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 3.528130932430564, + "language_loss": 0.70414114, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72555161, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 9783, + "time_per_iteration": 2.411940813064575 + }, + { + "auxiliary_loss_clip": 0.01109132, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.0186553, + "balance_loss_mlp": 1.03764033, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 2.8122189742296952, + "language_loss": 0.6903708, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71178293, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 9784, + "time_per_iteration": 2.4809060096740723 + }, + { + "auxiliary_loss_clip": 0.01107726, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.02227104, + "balance_loss_mlp": 1.03585327, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.976987554101205, + "language_loss": 0.69485259, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71627975, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9785, + "time_per_iteration": 2.5458383560180664 + }, + { + "auxiliary_loss_clip": 0.0110444, + "auxiliary_loss_mlp": 0.01026297, + "balance_loss_clip": 1.01477504, + "balance_loss_mlp": 1.03624511, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.0068567513814375, + "language_loss": 0.77542102, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79672837, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 9786, + "time_per_iteration": 2.4269275665283203 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02173829, + "balance_loss_mlp": 1.03681958, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.4388452349288328, + "language_loss": 0.79175329, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81317246, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 9787, + "time_per_iteration": 2.441265344619751 + }, + { + "auxiliary_loss_clip": 0.01106621, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.03677058, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.596428038291934, + "language_loss": 0.66514194, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68651974, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9788, + "time_per_iteration": 2.4632344245910645 + }, + { + "auxiliary_loss_clip": 0.01106001, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01877332, + "balance_loss_mlp": 1.03787911, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.066265402471891, + "language_loss": 0.79951847, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82088816, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9789, + "time_per_iteration": 2.4486775398254395 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.01579237, + "balance_loss_mlp": 1.03722358, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.510117689081276, + "language_loss": 0.70817208, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72950107, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9790, + "time_per_iteration": 2.474634885787964 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02120566, + "balance_loss_mlp": 1.0374043, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.9043586619327855, + "language_loss": 0.83184004, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85322857, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9791, + "time_per_iteration": 2.4930591583251953 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.02189827, + "balance_loss_mlp": 1.04020619, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5964011084944127, + "language_loss": 0.76287472, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78432798, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9792, + "time_per_iteration": 2.572164297103882 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.0383172, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 1.954465265842666, + "language_loss": 0.69085598, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71227252, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.72265625, + "step": 9793, + "time_per_iteration": 2.440532684326172 + }, + { + "auxiliary_loss_clip": 0.01105715, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02001381, + "balance_loss_mlp": 1.03754866, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 2.2945820531528547, + "language_loss": 0.60200524, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.6233902, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9794, + "time_per_iteration": 2.4281349182128906 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02265191, + "balance_loss_mlp": 1.03800488, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.8105141483242522, + "language_loss": 0.65209466, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67352962, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9795, + "time_per_iteration": 2.4471983909606934 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02589679, + "balance_loss_mlp": 1.03751063, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.6472816848345888, + "language_loss": 0.74171197, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76313925, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9796, + "time_per_iteration": 2.4404211044311523 + }, + { + "auxiliary_loss_clip": 0.0110878, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01761484, + "balance_loss_mlp": 1.04002237, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.4898681844876358, + "language_loss": 0.83064574, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85202533, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9797, + "time_per_iteration": 2.487971544265747 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03718829, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.804693100831568, + "language_loss": 0.78741366, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.80876774, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 9798, + "time_per_iteration": 2.4391119480133057 + }, + { + "auxiliary_loss_clip": 0.01104678, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.01994312, + "balance_loss_mlp": 1.03718722, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.097614269824193, + "language_loss": 0.74100447, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76236397, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9799, + "time_per_iteration": 2.444185972213745 + }, + { + "auxiliary_loss_clip": 0.01110656, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01852775, + "balance_loss_mlp": 1.03889656, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.9705578864506654, + "language_loss": 0.76078779, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78221321, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 9800, + "time_per_iteration": 2.4564571380615234 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.02804899, + "balance_loss_mlp": 1.03771484, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.9698106702703237, + "language_loss": 0.78824806, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8097297, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9801, + "time_per_iteration": 2.439195156097412 + }, + { + "auxiliary_loss_clip": 0.01109337, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.01721966, + "balance_loss_mlp": 1.03768528, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.596016426383407, + "language_loss": 0.65912932, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68051648, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 9802, + "time_per_iteration": 3.8562896251678467 + }, + { + "auxiliary_loss_clip": 0.01106914, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.01552522, + "balance_loss_mlp": 1.03888416, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.5756682227023782, + "language_loss": 0.78167737, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.8030206, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9803, + "time_per_iteration": 2.4531607627868652 + }, + { + "auxiliary_loss_clip": 0.01108754, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.02300191, + "balance_loss_mlp": 1.03849792, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.5070835087317231, + "language_loss": 0.7292577, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75070107, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9804, + "time_per_iteration": 3.909280776977539 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03996158, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.9252543926260512, + "language_loss": 0.7480545, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76943576, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9805, + "time_per_iteration": 3.92484712600708 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01807201, + "balance_loss_mlp": 1.03880858, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 2.2161041024358736, + "language_loss": 0.7798723, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8013162, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9806, + "time_per_iteration": 3.958747625350952 + }, + { + "auxiliary_loss_clip": 0.01106773, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.03678048, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 2.028844636014754, + "language_loss": 0.77013928, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79151416, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9807, + "time_per_iteration": 2.437091827392578 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01834261, + "balance_loss_mlp": 1.040411, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 2.123691808114849, + "language_loss": 0.74406278, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76549083, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9808, + "time_per_iteration": 2.4456939697265625 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.01794887, + "balance_loss_mlp": 1.03927052, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 1.9040797268830973, + "language_loss": 0.71715617, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.73858464, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 9809, + "time_per_iteration": 2.4555907249450684 + }, + { + "auxiliary_loss_clip": 0.01111034, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.02066374, + "balance_loss_mlp": 1.03881156, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 2.6575599068105262, + "language_loss": 0.81872356, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84017026, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9810, + "time_per_iteration": 2.546018600463867 + }, + { + "auxiliary_loss_clip": 0.01105843, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.01815283, + "balance_loss_mlp": 1.03850091, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.679981614097192, + "language_loss": 0.8076582, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8290174, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 9811, + "time_per_iteration": 2.432685613632202 + }, + { + "auxiliary_loss_clip": 0.01110453, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01449776, + "balance_loss_mlp": 1.03924918, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.903117615206719, + "language_loss": 0.76666933, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78804982, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9812, + "time_per_iteration": 2.45906138420105 + }, + { + "auxiliary_loss_clip": 0.01107232, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01794147, + "balance_loss_mlp": 1.03932881, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 3.543593991514859, + "language_loss": 0.70407474, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72543478, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9813, + "time_per_iteration": 2.417073965072632 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01969576, + "balance_loss_mlp": 1.03846037, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.496524946754694, + "language_loss": 0.72230315, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74370211, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9814, + "time_per_iteration": 2.527130365371704 + }, + { + "auxiliary_loss_clip": 0.0110797, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.01768374, + "balance_loss_mlp": 1.03837872, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.8734717265521494, + "language_loss": 0.78583348, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80721277, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9815, + "time_per_iteration": 2.4397730827331543 + }, + { + "auxiliary_loss_clip": 0.01114156, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02197468, + "balance_loss_mlp": 1.03963876, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 2.0868241481245415, + "language_loss": 0.7557171, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.7772131, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9816, + "time_per_iteration": 2.443861484527588 + }, + { + "auxiliary_loss_clip": 0.01110119, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.02063835, + "balance_loss_mlp": 1.04108596, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.234392841889587, + "language_loss": 0.81303239, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83446503, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9817, + "time_per_iteration": 2.4248719215393066 + }, + { + "auxiliary_loss_clip": 0.01109425, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.02112818, + "balance_loss_mlp": 1.03941548, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.5738429375950187, + "language_loss": 0.76401961, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78544545, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9818, + "time_per_iteration": 2.445507526397705 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.01861358, + "balance_loss_mlp": 1.03634679, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.8418500679377416, + "language_loss": 0.66351467, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68486011, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9819, + "time_per_iteration": 2.4585890769958496 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01972878, + "balance_loss_mlp": 1.03955185, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.5583203498776486, + "language_loss": 0.77830237, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.79972136, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9820, + "time_per_iteration": 2.452444314956665 + }, + { + "auxiliary_loss_clip": 0.01033068, + "auxiliary_loss_mlp": 0.01003995, + "balance_loss_clip": 1.00268924, + "balance_loss_mlp": 1.01099396, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9230258023741272, + "language_loss": 0.65167463, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67204523, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22070312, + "step": 9821, + "time_per_iteration": 3.0410289764404297 + }, + { + "auxiliary_loss_clip": 0.01106857, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.0159936, + "balance_loss_mlp": 1.03887093, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.8405567429237777, + "language_loss": 0.61040848, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63175792, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9822, + "time_per_iteration": 2.4597485065460205 + }, + { + "auxiliary_loss_clip": 0.01110158, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.02598214, + "balance_loss_mlp": 1.03878164, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.8163106241475082, + "language_loss": 0.82910824, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.850595, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9823, + "time_per_iteration": 2.4342074394226074 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.02025676, + "balance_loss_mlp": 1.03823602, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 1.9061097186750977, + "language_loss": 0.73051912, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75191379, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9824, + "time_per_iteration": 2.474583387374878 + }, + { + "auxiliary_loss_clip": 0.01115754, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.02118862, + "balance_loss_mlp": 1.03907609, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.117093757339989, + "language_loss": 0.82486725, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84637952, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 9825, + "time_per_iteration": 2.4499030113220215 + }, + { + "auxiliary_loss_clip": 0.01104731, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01781034, + "balance_loss_mlp": 1.03812122, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7674632389005596, + "language_loss": 0.77194965, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79328513, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9826, + "time_per_iteration": 2.490628480911255 + }, + { + "auxiliary_loss_clip": 0.01106346, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.01523519, + "balance_loss_mlp": 1.03757071, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.8211120400501501, + "language_loss": 0.72350824, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74483871, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9827, + "time_per_iteration": 2.496574640274048 + }, + { + "auxiliary_loss_clip": 0.01107742, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01782858, + "balance_loss_mlp": 1.0374589, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.7932913826709562, + "language_loss": 0.79741728, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81878424, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.703125, + "step": 9828, + "time_per_iteration": 2.51045298576355 + }, + { + "auxiliary_loss_clip": 0.01111624, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02125204, + "balance_loss_mlp": 1.03958178, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.271428998540672, + "language_loss": 0.88056707, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90202534, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9829, + "time_per_iteration": 2.4169514179229736 + }, + { + "auxiliary_loss_clip": 0.01031439, + "auxiliary_loss_mlp": 0.00999905, + "balance_loss_clip": 0.99870729, + "balance_loss_mlp": 1.00956726, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7537251091943264, + "language_loss": 0.57855141, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59886479, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.21875, + "step": 9830, + "time_per_iteration": 2.996295928955078 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.01852536, + "balance_loss_mlp": 1.04140687, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.0665850759749813, + "language_loss": 0.76163888, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78313708, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9831, + "time_per_iteration": 2.461068868637085 + }, + { + "auxiliary_loss_clip": 0.01105452, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01565278, + "balance_loss_mlp": 1.03923118, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.602158251769988, + "language_loss": 0.7790612, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80039072, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 9832, + "time_per_iteration": 2.4806432723999023 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.01554728, + "balance_loss_mlp": 1.03533232, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.7748958571682212, + "language_loss": 0.83552635, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85685176, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9833, + "time_per_iteration": 2.436558961868286 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03818607, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.512608687160236, + "language_loss": 0.74505258, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76645797, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9834, + "time_per_iteration": 2.497488260269165 + }, + { + "auxiliary_loss_clip": 0.01107604, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.01680338, + "balance_loss_mlp": 1.03707302, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.15246332260658, + "language_loss": 0.78111219, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.8024776, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9835, + "time_per_iteration": 2.428570032119751 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.02080131, + "balance_loss_mlp": 1.03782153, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.790579015547894, + "language_loss": 0.74016017, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76157123, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9836, + "time_per_iteration": 2.4571895599365234 + }, + { + "auxiliary_loss_clip": 0.01108454, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01332974, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.0887710674316335, + "language_loss": 0.81834614, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.83968431, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9837, + "time_per_iteration": 2.425869941711426 + }, + { + "auxiliary_loss_clip": 0.01109463, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.01760364, + "balance_loss_mlp": 1.03828216, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.6633412669476784, + "language_loss": 0.79169023, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81309044, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9838, + "time_per_iteration": 2.480945348739624 + }, + { + "auxiliary_loss_clip": 0.01109443, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03814876, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.9639883281700399, + "language_loss": 0.6955409, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.7169646, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9839, + "time_per_iteration": 2.445032835006714 + }, + { + "auxiliary_loss_clip": 0.01109116, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02804112, + "balance_loss_mlp": 1.03763521, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.156057098485451, + "language_loss": 0.65970773, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68120515, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9840, + "time_per_iteration": 2.4208333492279053 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.01920867, + "balance_loss_mlp": 1.03765261, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.6889823147578333, + "language_loss": 0.81775278, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83916378, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9841, + "time_per_iteration": 2.485783576965332 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.0175122, + "balance_loss_mlp": 1.03609967, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5545668932192243, + "language_loss": 0.68891448, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71024531, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9842, + "time_per_iteration": 2.4090652465820312 + }, + { + "auxiliary_loss_clip": 0.01108304, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.03697038, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.8995177421561278, + "language_loss": 0.8258518, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84725767, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9843, + "time_per_iteration": 2.456085443496704 + }, + { + "auxiliary_loss_clip": 0.01107968, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01810205, + "balance_loss_mlp": 1.03701758, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.3414678440212953, + "language_loss": 0.81883448, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84022528, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9844, + "time_per_iteration": 3.834216833114624 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01842773, + "balance_loss_mlp": 1.04004455, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.8185302816606077, + "language_loss": 0.74449736, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76591957, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9845, + "time_per_iteration": 2.409029960632324 + }, + { + "auxiliary_loss_clip": 0.01108139, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01324248, + "balance_loss_mlp": 1.03682494, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 2.2228008907542027, + "language_loss": 0.63848257, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65982717, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9846, + "time_per_iteration": 3.994704246520996 + }, + { + "auxiliary_loss_clip": 0.01100388, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.01529002, + "balance_loss_mlp": 1.03501678, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.7549171077463366, + "language_loss": 0.76315683, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78442299, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9847, + "time_per_iteration": 3.815723419189453 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01892579, + "balance_loss_mlp": 1.03723776, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.7391013556086516, + "language_loss": 0.6229955, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.6443814, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9848, + "time_per_iteration": 3.9868550300598145 + }, + { + "auxiliary_loss_clip": 0.01108795, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02532125, + "balance_loss_mlp": 1.03819513, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.784596822173483, + "language_loss": 0.75762534, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77908659, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9849, + "time_per_iteration": 2.4613027572631836 + }, + { + "auxiliary_loss_clip": 0.01108412, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02006471, + "balance_loss_mlp": 1.0370928, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.6562680086624124, + "language_loss": 0.75594199, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77735424, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9850, + "time_per_iteration": 2.5371382236480713 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01676035, + "balance_loss_mlp": 1.03672051, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7347218503083297, + "language_loss": 0.7573396, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.7786963, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9851, + "time_per_iteration": 2.4500503540039062 + }, + { + "auxiliary_loss_clip": 0.01106705, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.0179739, + "balance_loss_mlp": 1.03609896, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.818113501506117, + "language_loss": 0.70232719, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72369695, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 9852, + "time_per_iteration": 2.50327205657959 + }, + { + "auxiliary_loss_clip": 0.01112321, + "auxiliary_loss_mlp": 0.01040222, + "balance_loss_clip": 1.02710271, + "balance_loss_mlp": 1.03861785, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 38.24844963287624, + "language_loss": 0.8025564, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82408178, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 9853, + "time_per_iteration": 2.443661689758301 + }, + { + "auxiliary_loss_clip": 0.01103448, + "auxiliary_loss_mlp": 0.01025904, + "balance_loss_clip": 1.01480556, + "balance_loss_mlp": 1.03603673, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.502563314800498, + "language_loss": 0.67641807, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69771153, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9854, + "time_per_iteration": 2.5323755741119385 + }, + { + "auxiliary_loss_clip": 0.01105063, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01371408, + "balance_loss_mlp": 1.03610444, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.6522001385368033, + "language_loss": 0.88777542, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90908301, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9855, + "time_per_iteration": 2.4309167861938477 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01936722, + "balance_loss_mlp": 1.03590918, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.7115668008760792, + "language_loss": 0.86635554, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88768005, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 9856, + "time_per_iteration": 2.464066743850708 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02256656, + "balance_loss_mlp": 1.03562045, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 2.1473398743532153, + "language_loss": 0.77584958, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79724526, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 9857, + "time_per_iteration": 2.4102070331573486 + }, + { + "auxiliary_loss_clip": 0.01109396, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03954232, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 1.9751188115052367, + "language_loss": 0.64351666, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66494453, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9858, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01884377, + "balance_loss_mlp": 1.03700852, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.9049315760209506, + "language_loss": 0.77045393, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79180634, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 9859, + "time_per_iteration": 2.478782892227173 + }, + { + "auxiliary_loss_clip": 0.01110235, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.02082372, + "balance_loss_mlp": 1.03882456, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.1565186381803194, + "language_loss": 0.75153667, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77297652, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9860, + "time_per_iteration": 2.4513912200927734 + }, + { + "auxiliary_loss_clip": 0.01106266, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02277529, + "balance_loss_mlp": 1.03840578, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6305970530500205, + "language_loss": 0.76227921, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78367937, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 9861, + "time_per_iteration": 2.474095344543457 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.0131923, + "balance_loss_mlp": 1.03595328, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.8413108938997076, + "language_loss": 0.70368218, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72496319, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 9862, + "time_per_iteration": 2.539903402328491 + }, + { + "auxiliary_loss_clip": 0.0110657, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01868796, + "balance_loss_mlp": 1.03706694, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.8355876983877193, + "language_loss": 0.77771485, + "learning_rate": 1.500032899685832e-06, + "loss": 0.7990849, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9863, + "time_per_iteration": 2.4712796211242676 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.02583861, + "balance_loss_mlp": 1.03730559, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.8648903136261632, + "language_loss": 0.70763469, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72908574, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9864, + "time_per_iteration": 2.52478289604187 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.02201343, + "balance_loss_mlp": 1.03711009, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 2.2141122969684655, + "language_loss": 0.67234761, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69376296, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 9865, + "time_per_iteration": 2.4957449436187744 + }, + { + "auxiliary_loss_clip": 0.0110929, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01892328, + "balance_loss_mlp": 1.03758049, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 1.8936144812420768, + "language_loss": 0.78334385, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.8047536, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 9866, + "time_per_iteration": 2.4394681453704834 + }, + { + "auxiliary_loss_clip": 0.01105609, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.01867127, + "balance_loss_mlp": 1.03786838, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 1.98454003485575, + "language_loss": 0.72037029, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.7417264, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9867, + "time_per_iteration": 2.5107383728027344 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01795018, + "balance_loss_mlp": 1.03910947, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.538584883762445, + "language_loss": 0.66726553, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68864822, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9868, + "time_per_iteration": 2.5143752098083496 + }, + { + "auxiliary_loss_clip": 0.01107645, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03726101, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5720110660148519, + "language_loss": 0.75083476, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77224427, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9869, + "time_per_iteration": 2.4784231185913086 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.0103956, + "balance_loss_clip": 1.02690041, + "balance_loss_mlp": 1.0391326, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.6442009630814416, + "language_loss": 0.74131197, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76281238, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9870, + "time_per_iteration": 2.8396053314208984 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.04010868, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 1.9765481299651093, + "language_loss": 0.71421361, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7355839, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9871, + "time_per_iteration": 2.460695505142212 + }, + { + "auxiliary_loss_clip": 0.01110046, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.03879905, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 1.9723601672672642, + "language_loss": 0.74131697, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76274526, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9872, + "time_per_iteration": 2.4877848625183105 + }, + { + "auxiliary_loss_clip": 0.01111497, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02359688, + "balance_loss_mlp": 1.03926826, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.142318153174813, + "language_loss": 0.78675568, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.80824012, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 9873, + "time_per_iteration": 2.4480934143066406 + }, + { + "auxiliary_loss_clip": 0.01109102, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02197838, + "balance_loss_mlp": 1.03843832, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5306423792742176, + "language_loss": 0.85011673, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87155473, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9874, + "time_per_iteration": 2.5098774433135986 + }, + { + "auxiliary_loss_clip": 0.01030749, + "auxiliary_loss_mlp": 0.01001619, + "balance_loss_clip": 1.00044489, + "balance_loss_mlp": 1.00908446, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.6973173617166174, + "language_loss": 0.60004687, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62037057, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21679688, + "step": 9875, + "time_per_iteration": 3.1099135875701904 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.01980412, + "balance_loss_mlp": 1.0373013, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.0699471238582943, + "language_loss": 0.77501059, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.7964499, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9876, + "time_per_iteration": 2.466031551361084 + }, + { + "auxiliary_loss_clip": 0.01103172, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01652348, + "balance_loss_mlp": 1.03654408, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.5589386174362272, + "language_loss": 0.75830436, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77961862, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9877, + "time_per_iteration": 2.4772722721099854 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.01892304, + "balance_loss_mlp": 1.03673589, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 4.77912842405454, + "language_loss": 0.81212896, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83353043, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9878, + "time_per_iteration": 2.511408805847168 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.02462077, + "balance_loss_mlp": 1.0378468, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7027842827521733, + "language_loss": 0.71123505, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73268974, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9879, + "time_per_iteration": 2.6537530422210693 + }, + { + "auxiliary_loss_clip": 0.01107077, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01973653, + "balance_loss_mlp": 1.03814936, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.4837097454893722, + "language_loss": 0.5739696, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59536058, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9880, + "time_per_iteration": 2.487082004547119 + }, + { + "auxiliary_loss_clip": 0.01107055, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.02435601, + "balance_loss_mlp": 1.03724837, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.7845732450958962, + "language_loss": 0.76980609, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79124796, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9881, + "time_per_iteration": 2.5019240379333496 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01437569, + "balance_loss_mlp": 1.03524506, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.214394269583833, + "language_loss": 0.82820934, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84953332, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9882, + "time_per_iteration": 2.4258036613464355 + }, + { + "auxiliary_loss_clip": 0.01106542, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.02321863, + "balance_loss_mlp": 1.03781402, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.5324902309588855, + "language_loss": 0.79348171, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81489801, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9883, + "time_per_iteration": 2.4191815853118896 + }, + { + "auxiliary_loss_clip": 0.01112982, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0214777, + "balance_loss_mlp": 1.03999424, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.7967272432241739, + "language_loss": 0.74134135, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.7628206, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9884, + "time_per_iteration": 2.4599032402038574 + }, + { + "auxiliary_loss_clip": 0.01112156, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.0181067, + "balance_loss_mlp": 1.04232001, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 3.4474311080183964, + "language_loss": 0.6639331, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68535531, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 9885, + "time_per_iteration": 3.940159797668457 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02392602, + "balance_loss_mlp": 1.03891098, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.562196250157405, + "language_loss": 0.77456462, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79600191, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9886, + "time_per_iteration": 2.4958837032318115 + }, + { + "auxiliary_loss_clip": 0.01029578, + "auxiliary_loss_mlp": 0.009997, + "balance_loss_clip": 0.99845427, + "balance_loss_mlp": 1.00789237, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8479500751523403, + "language_loss": 0.64580774, + "learning_rate": 1.490988081420423e-06, + "loss": 0.6661005, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21679688, + "step": 9887, + "time_per_iteration": 4.312393426895142 + }, + { + "auxiliary_loss_clip": 0.01106228, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01911473, + "balance_loss_mlp": 1.03743696, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.9767325567336362, + "language_loss": 0.69172513, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71310121, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9888, + "time_per_iteration": 3.8631362915039062 + }, + { + "auxiliary_loss_clip": 0.01108213, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5956528037649322, + "language_loss": 0.79466522, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81605208, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9889, + "time_per_iteration": 4.0321431159973145 + }, + { + "auxiliary_loss_clip": 0.01105021, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03607225, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.529319229595301, + "language_loss": 0.70732993, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72869068, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9890, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01110328, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.01994216, + "balance_loss_mlp": 1.03921902, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 2.2570879506032933, + "language_loss": 0.69334114, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71477234, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9891, + "time_per_iteration": 2.4280505180358887 + }, + { + "auxiliary_loss_clip": 0.01106776, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02376795, + "balance_loss_mlp": 1.03809762, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.752140694177181, + "language_loss": 0.53531826, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55674696, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9892, + "time_per_iteration": 2.4815757274627686 + }, + { + "auxiliary_loss_clip": 0.01030384, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00095928, + "balance_loss_mlp": 1.00874603, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6588951163028871, + "language_loss": 0.54535234, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56567693, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21679688, + "step": 9893, + "time_per_iteration": 3.1101529598236084 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.01892543, + "balance_loss_mlp": 1.03811431, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.588107459430707, + "language_loss": 0.74231315, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76368201, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9894, + "time_per_iteration": 2.4519400596618652 + }, + { + "auxiliary_loss_clip": 0.01106074, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.02005649, + "balance_loss_mlp": 1.03685939, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.6911288792838162, + "language_loss": 0.77848423, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79986584, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9895, + "time_per_iteration": 2.524150848388672 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.01923883, + "balance_loss_mlp": 1.0391717, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6627914614590094, + "language_loss": 0.79355633, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.814978, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9896, + "time_per_iteration": 2.450514078140259 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.0379312, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.56691412182982, + "language_loss": 0.83697438, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.8583793, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9897, + "time_per_iteration": 2.499427556991577 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02043986, + "balance_loss_mlp": 1.04021525, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.7628400615055348, + "language_loss": 0.70908117, + "learning_rate": 1.486846243389939e-06, + "loss": 0.7305249, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9898, + "time_per_iteration": 2.450711488723755 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.02481782, + "balance_loss_mlp": 1.03905582, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.840239375448059, + "language_loss": 0.64112437, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66265255, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 9899, + "time_per_iteration": 2.5394744873046875 + }, + { + "auxiliary_loss_clip": 0.01109128, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01592183, + "balance_loss_mlp": 1.04008675, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.879978941191363, + "language_loss": 0.71715653, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.73851436, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9900, + "time_per_iteration": 2.4623067378997803 + }, + { + "auxiliary_loss_clip": 0.01107194, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.01654577, + "balance_loss_mlp": 1.03926349, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.9859766918367532, + "language_loss": 0.84489024, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86625552, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9901, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.01028301, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.00249529, + "balance_loss_mlp": 1.00672269, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8098587011957621, + "language_loss": 0.58273184, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60305208, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.21582031, + "step": 9902, + "time_per_iteration": 2.9000015258789062 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01876402, + "balance_loss_mlp": 1.03859127, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 3.08671627053405, + "language_loss": 0.77136552, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79277885, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9903, + "time_per_iteration": 2.5076375007629395 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.04097402, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.7111155417857251, + "language_loss": 0.77616894, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79764313, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9904, + "time_per_iteration": 2.5716845989227295 + }, + { + "auxiliary_loss_clip": 0.01110151, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02405953, + "balance_loss_mlp": 1.03790653, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 2.2036474032145192, + "language_loss": 0.72382712, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74529308, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9905, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01789069, + "balance_loss_mlp": 1.03853083, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.6203597758298474, + "language_loss": 0.69817066, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71957242, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9906, + "time_per_iteration": 2.4373247623443604 + }, + { + "auxiliary_loss_clip": 0.01110789, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.02259731, + "balance_loss_mlp": 1.03987217, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.782354761153575, + "language_loss": 0.7491982, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77065903, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9907, + "time_per_iteration": 2.5548195838928223 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.02353668, + "balance_loss_mlp": 1.0388813, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.601142913290667, + "language_loss": 0.67155874, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69300842, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9908, + "time_per_iteration": 2.5727956295013428 + }, + { + "auxiliary_loss_clip": 0.01108392, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.01753211, + "balance_loss_mlp": 1.03904438, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 5.1100613292928365, + "language_loss": 0.76492268, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78630114, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9909, + "time_per_iteration": 2.459608554840088 + }, + { + "auxiliary_loss_clip": 0.01028544, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_clip": 1.00416493, + "balance_loss_mlp": 1.00715542, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9275868367088792, + "language_loss": 0.73427647, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75461495, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21386719, + "step": 9910, + "time_per_iteration": 3.1051745414733887 + }, + { + "auxiliary_loss_clip": 0.01108818, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.01981187, + "balance_loss_mlp": 1.03741884, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.6458105124951614, + "language_loss": 0.69844317, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71986043, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9911, + "time_per_iteration": 2.4647021293640137 + }, + { + "auxiliary_loss_clip": 0.01116428, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.02031708, + "balance_loss_mlp": 1.04145718, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.8630263408862686, + "language_loss": 0.65476716, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.6762681, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 9912, + "time_per_iteration": 2.4077272415161133 + }, + { + "auxiliary_loss_clip": 0.01108551, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02385449, + "balance_loss_mlp": 1.03806984, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.0476871057930772, + "language_loss": 0.73610109, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75755352, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9913, + "time_per_iteration": 2.5155045986175537 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.0178144, + "balance_loss_mlp": 1.03791463, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.0765652786465885, + "language_loss": 0.79696703, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81841141, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 9914, + "time_per_iteration": 2.4950027465820312 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.0189786, + "balance_loss_mlp": 1.03856075, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.9745402695948293, + "language_loss": 0.67218065, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69356596, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9915, + "time_per_iteration": 2.43723726272583 + }, + { + "auxiliary_loss_clip": 0.01107786, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.02319074, + "balance_loss_mlp": 1.03634763, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.613453800947639, + "language_loss": 0.78928566, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81071782, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9916, + "time_per_iteration": 2.456350088119507 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.01933253, + "balance_loss_mlp": 1.03744936, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.7690461818627004, + "language_loss": 0.82394695, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84535682, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9917, + "time_per_iteration": 2.469238758087158 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.02383804, + "balance_loss_mlp": 1.03837276, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.817824058021054, + "language_loss": 0.77982944, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.8012656, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9918, + "time_per_iteration": 2.4436004161834717 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02182257, + "balance_loss_mlp": 1.0390811, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.422582146168897, + "language_loss": 0.78566158, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80710077, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9919, + "time_per_iteration": 2.5787289142608643 + }, + { + "auxiliary_loss_clip": 0.01107781, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.02094173, + "balance_loss_mlp": 1.0381664, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.9239790966111896, + "language_loss": 0.77425951, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79567927, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 9920, + "time_per_iteration": 2.4440083503723145 + }, + { + "auxiliary_loss_clip": 0.01113744, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.01843953, + "balance_loss_mlp": 1.04212332, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.2435260632361733, + "language_loss": 0.82452321, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84596634, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9921, + "time_per_iteration": 2.456138849258423 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.02021408, + "balance_loss_mlp": 1.0373764, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.9967408520895134, + "language_loss": 0.80682462, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82823324, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9922, + "time_per_iteration": 2.4144599437713623 + }, + { + "auxiliary_loss_clip": 0.0110795, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01693821, + "balance_loss_mlp": 1.03790641, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.7485306495183626, + "language_loss": 0.77080536, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79218084, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9923, + "time_per_iteration": 2.489145517349243 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02174497, + "balance_loss_mlp": 1.04084301, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.7680593419575392, + "language_loss": 0.75725371, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.77876449, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9924, + "time_per_iteration": 2.4216740131378174 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.0190742, + "balance_loss_mlp": 1.03832626, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 3.198852886281723, + "language_loss": 0.6646719, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68604732, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 9925, + "time_per_iteration": 2.4475882053375244 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.01986837, + "balance_loss_mlp": 1.04157531, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.375187864026988, + "language_loss": 0.71979719, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74121475, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9926, + "time_per_iteration": 2.4132394790649414 + }, + { + "auxiliary_loss_clip": 0.01111749, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.02014971, + "balance_loss_mlp": 1.03978753, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.812838696961727, + "language_loss": 0.70522958, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.7266798, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9927, + "time_per_iteration": 4.071920156478882 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.01779175, + "balance_loss_mlp": 1.03788543, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.756068652476383, + "language_loss": 0.63428164, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65571564, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 9928, + "time_per_iteration": 2.616556406021118 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.01774395, + "balance_loss_mlp": 1.0362494, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.5985801618436777, + "language_loss": 0.69484866, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71619892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9929, + "time_per_iteration": 3.929401397705078 + }, + { + "auxiliary_loss_clip": 0.01106506, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.02314126, + "balance_loss_mlp": 1.0390749, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.8723634053132125, + "language_loss": 0.7651577, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78656977, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9930, + "time_per_iteration": 3.9201314449310303 + }, + { + "auxiliary_loss_clip": 0.01114181, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01845503, + "balance_loss_mlp": 1.04086351, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.7493285690141849, + "language_loss": 0.69032001, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71178329, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9931, + "time_per_iteration": 3.879492998123169 + }, + { + "auxiliary_loss_clip": 0.01027027, + "auxiliary_loss_mlp": 0.00997139, + "balance_loss_clip": 0.99597675, + "balance_loss_mlp": 1.00581264, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8633082810339764, + "language_loss": 0.64247859, + "learning_rate": 1.474059168257065e-06, + "loss": 0.6627202, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21289062, + "step": 9932, + "time_per_iteration": 2.985929489135742 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01604247, + "balance_loss_mlp": 1.03876853, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.8784919283093424, + "language_loss": 0.74257267, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76396132, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9933, + "time_per_iteration": 2.4789366722106934 + }, + { + "auxiliary_loss_clip": 0.01027236, + "auxiliary_loss_mlp": 0.00998624, + "balance_loss_clip": 0.997509, + "balance_loss_mlp": 1.00592136, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6667374312128803, + "language_loss": 0.51967168, + "learning_rate": 1.473307699867203e-06, + "loss": 0.53993034, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21289062, + "step": 9934, + "time_per_iteration": 3.181849956512451 + }, + { + "auxiliary_loss_clip": 0.01027661, + "auxiliary_loss_mlp": 0.00997349, + "balance_loss_clip": 0.99616891, + "balance_loss_mlp": 1.00641167, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8444164965298677, + "language_loss": 0.54164159, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56189167, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.21289062, + "step": 9935, + "time_per_iteration": 2.997821807861328 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.02102828, + "balance_loss_mlp": 1.03731823, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5699606989571269, + "language_loss": 0.65541828, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67684245, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9936, + "time_per_iteration": 2.533317804336548 + }, + { + "auxiliary_loss_clip": 0.01110253, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.02026367, + "balance_loss_mlp": 1.03937888, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.0123537966767797, + "language_loss": 0.67731905, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69874215, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9937, + "time_per_iteration": 2.4379465579986572 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.02101064, + "balance_loss_mlp": 1.03899479, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 3.133342754143776, + "language_loss": 0.77174151, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79320574, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9938, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.01110044, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01708603, + "balance_loss_mlp": 1.03813004, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.6192850653818303, + "language_loss": 0.75987661, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78127742, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9939, + "time_per_iteration": 2.477731227874756 + }, + { + "auxiliary_loss_clip": 0.01113496, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01565337, + "balance_loss_mlp": 1.03811717, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.2637964874634124, + "language_loss": 0.6840167, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70545495, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 9940, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.02150035, + "balance_loss_mlp": 1.03630126, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3031499437689418, + "language_loss": 0.70227146, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72364092, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9941, + "time_per_iteration": 2.644956111907959 + }, + { + "auxiliary_loss_clip": 0.01103617, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02042711, + "balance_loss_mlp": 1.0345757, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 2.0310172288776456, + "language_loss": 0.77255404, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79392433, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 9942, + "time_per_iteration": 2.4575772285461426 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.02815676, + "balance_loss_mlp": 1.03664815, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.8744137632140625, + "language_loss": 0.7585178, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78000808, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9943, + "time_per_iteration": 2.4413061141967773 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.03699136, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.7396443017276344, + "language_loss": 0.61821425, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.63956803, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 9944, + "time_per_iteration": 2.569403886795044 + }, + { + "auxiliary_loss_clip": 0.01110079, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0391618, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6935047887113677, + "language_loss": 0.72621685, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74766988, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9945, + "time_per_iteration": 2.5811283588409424 + }, + { + "auxiliary_loss_clip": 0.0110514, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02017426, + "balance_loss_mlp": 1.03536916, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 2.0883326121528443, + "language_loss": 0.67156124, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69294119, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9946, + "time_per_iteration": 2.513643503189087 + }, + { + "auxiliary_loss_clip": 0.01111839, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.02280676, + "balance_loss_mlp": 1.03886974, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.0799446912413386, + "language_loss": 0.88996196, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91144222, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9947, + "time_per_iteration": 2.4069466590881348 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.02501893, + "balance_loss_mlp": 1.03634834, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.664735448435926, + "language_loss": 0.72050726, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74189186, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 9948, + "time_per_iteration": 2.474961280822754 + }, + { + "auxiliary_loss_clip": 0.01107668, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01749587, + "balance_loss_mlp": 1.03676891, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8018456141940389, + "language_loss": 0.89439249, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91578257, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 9949, + "time_per_iteration": 2.455151319503784 + }, + { + "auxiliary_loss_clip": 0.0110613, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.01458669, + "balance_loss_mlp": 1.03746963, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 1.9594093526491967, + "language_loss": 0.70425475, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72557819, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9950, + "time_per_iteration": 2.479177474975586 + }, + { + "auxiliary_loss_clip": 0.01106992, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.01816094, + "balance_loss_mlp": 1.03653646, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.7839667170115563, + "language_loss": 0.78153586, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.8029145, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9951, + "time_per_iteration": 2.4318583011627197 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.02885103, + "balance_loss_mlp": 1.03666139, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 6.7296631151691235, + "language_loss": 0.73816681, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.75967014, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9952, + "time_per_iteration": 2.4669008255004883 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01705003, + "balance_loss_mlp": 1.03699803, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.1100044837404264, + "language_loss": 0.78595901, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8073597, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9953, + "time_per_iteration": 2.432607650756836 + }, + { + "auxiliary_loss_clip": 0.01106295, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.01883411, + "balance_loss_mlp": 1.03698087, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.6558066102502929, + "language_loss": 0.69747621, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71884394, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 9954, + "time_per_iteration": 2.5316383838653564 + }, + { + "auxiliary_loss_clip": 0.01106341, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.01923835, + "balance_loss_mlp": 1.03664923, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.7741106098423227, + "language_loss": 0.73212743, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75350201, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9955, + "time_per_iteration": 2.457697629928589 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.01816237, + "balance_loss_mlp": 1.03694773, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.8276717412391432, + "language_loss": 0.68681955, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70819867, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9956, + "time_per_iteration": 2.5265135765075684 + }, + { + "auxiliary_loss_clip": 0.01109542, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0191431, + "balance_loss_mlp": 1.03873038, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.224432093074028, + "language_loss": 0.73662853, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75803757, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9957, + "time_per_iteration": 2.4384164810180664 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.01433289, + "balance_loss_mlp": 1.03838789, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.875022862600817, + "language_loss": 0.84732842, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86864293, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9958, + "time_per_iteration": 2.501417636871338 + }, + { + "auxiliary_loss_clip": 0.01109112, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02396715, + "balance_loss_mlp": 1.03740525, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 2.024494152709453, + "language_loss": 0.66685295, + "learning_rate": 1.463921122471864e-06, + "loss": 0.6883148, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9959, + "time_per_iteration": 2.471848726272583 + }, + { + "auxiliary_loss_clip": 0.01108718, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.0389334, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6260957561310903, + "language_loss": 0.83360457, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85498953, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9960, + "time_per_iteration": 2.4651761054992676 + }, + { + "auxiliary_loss_clip": 0.01106018, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01716197, + "balance_loss_mlp": 1.03686321, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.466008615140069, + "language_loss": 0.79505813, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81641018, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9961, + "time_per_iteration": 2.475454568862915 + }, + { + "auxiliary_loss_clip": 0.01106184, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.01849759, + "balance_loss_mlp": 1.03730237, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.756927001005791, + "language_loss": 0.67329001, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69465899, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9962, + "time_per_iteration": 2.489084005355835 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02303684, + "balance_loss_mlp": 1.03722596, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.365980621399165, + "language_loss": 0.74311382, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76453781, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9963, + "time_per_iteration": 2.4947874546051025 + }, + { + "auxiliary_loss_clip": 0.01105091, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.02042198, + "balance_loss_mlp": 1.03652799, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 2.111691032145124, + "language_loss": 0.68214118, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70352018, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 9964, + "time_per_iteration": 2.595745086669922 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01635242, + "balance_loss_mlp": 1.03745115, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.9069133835925212, + "language_loss": 0.77044344, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79177749, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9965, + "time_per_iteration": 2.447580337524414 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.01833928, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.8284726106569544, + "language_loss": 0.77189291, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79326117, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9966, + "time_per_iteration": 2.450202226638794 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.01775706, + "balance_loss_mlp": 1.03827262, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.4816211966309663, + "language_loss": 0.73338163, + "learning_rate": 1.460920090376422e-06, + "loss": 0.7547425, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9967, + "time_per_iteration": 2.5361080169677734 + }, + { + "auxiliary_loss_clip": 0.01113043, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.03907526, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 1.98552880835617, + "language_loss": 0.68667233, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70816314, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9968, + "time_per_iteration": 2.4201669692993164 + }, + { + "auxiliary_loss_clip": 0.01107383, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03702521, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.5069000727815525, + "language_loss": 0.79169899, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.8131187, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9969, + "time_per_iteration": 3.9278953075408936 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.01999974, + "balance_loss_mlp": 1.03598189, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 2.0663897132059588, + "language_loss": 0.81023246, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83162344, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9970, + "time_per_iteration": 2.4416465759277344 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.02511787, + "balance_loss_mlp": 1.0377593, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.8664927797599988, + "language_loss": 0.62176776, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64326209, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.71875, + "step": 9971, + "time_per_iteration": 3.8846518993377686 + }, + { + "auxiliary_loss_clip": 0.01102408, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01776624, + "balance_loss_mlp": 1.03571367, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.8563043542024344, + "language_loss": 0.79314888, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81446773, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9972, + "time_per_iteration": 3.901256561279297 + }, + { + "auxiliary_loss_clip": 0.01112588, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.03817391, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.1896098539024176, + "language_loss": 0.76205128, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78352362, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9973, + "time_per_iteration": 3.9424259662628174 + }, + { + "auxiliary_loss_clip": 0.01106987, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.02021337, + "balance_loss_mlp": 1.0362227, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.3034108647788933, + "language_loss": 0.64969486, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67109704, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9974, + "time_per_iteration": 2.4875805377960205 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.02270579, + "balance_loss_mlp": 1.03728855, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4500461001521425, + "language_loss": 0.74434048, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76576418, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9975, + "time_per_iteration": 2.4895670413970947 + }, + { + "auxiliary_loss_clip": 0.01107892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01572204, + "balance_loss_mlp": 1.03760493, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.7933529759094704, + "language_loss": 0.76735765, + "learning_rate": 1.457545493441611e-06, + "loss": 0.78872299, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9976, + "time_per_iteration": 2.5056304931640625 + }, + { + "auxiliary_loss_clip": 0.01107614, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02620029, + "balance_loss_mlp": 1.03780508, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.4460752586196857, + "language_loss": 0.74817264, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76964188, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9977, + "time_per_iteration": 2.496149778366089 + }, + { + "auxiliary_loss_clip": 0.01107436, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01962721, + "balance_loss_mlp": 1.03684258, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.6882301956293941, + "language_loss": 0.68553925, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70693398, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9978, + "time_per_iteration": 2.483567714691162 + }, + { + "auxiliary_loss_clip": 0.01113427, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.02087975, + "balance_loss_mlp": 1.04072738, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.78777966355448, + "language_loss": 0.81153774, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83300972, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9979, + "time_per_iteration": 2.413935899734497 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.02026439, + "balance_loss_mlp": 1.03630424, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.7401896529481804, + "language_loss": 0.6957618, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71711338, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 9980, + "time_per_iteration": 2.4312682151794434 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.01947856, + "balance_loss_mlp": 1.03764093, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 3.8237519537086238, + "language_loss": 0.68642873, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70786041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9981, + "time_per_iteration": 2.4180452823638916 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.02439916, + "balance_loss_mlp": 1.03752363, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 3.017374403618408, + "language_loss": 0.78579712, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80722106, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 9982, + "time_per_iteration": 2.5378241539001465 + }, + { + "auxiliary_loss_clip": 0.01107415, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02182055, + "balance_loss_mlp": 1.03862381, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.4959053225865697, + "language_loss": 0.72973263, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7511524, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9983, + "time_per_iteration": 2.4516336917877197 + }, + { + "auxiliary_loss_clip": 0.01105736, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01893497, + "balance_loss_mlp": 1.03546536, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.0437339372279775, + "language_loss": 0.77803969, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79941273, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9984, + "time_per_iteration": 2.4639358520507812 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02200651, + "balance_loss_mlp": 1.03833842, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.564540000062254, + "language_loss": 0.83254963, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85397083, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9985, + "time_per_iteration": 2.584782361984253 + }, + { + "auxiliary_loss_clip": 0.01107675, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.02535367, + "balance_loss_mlp": 1.03886271, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.8232812965365295, + "language_loss": 0.71257466, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73402393, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9986, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.01824546, + "balance_loss_mlp": 1.04022861, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 2.2582190453585653, + "language_loss": 0.71791571, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73933005, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9987, + "time_per_iteration": 2.4961001873016357 + }, + { + "auxiliary_loss_clip": 0.01106291, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02245712, + "balance_loss_mlp": 1.03697586, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.6101111043143586, + "language_loss": 0.84407473, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86548263, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9988, + "time_per_iteration": 2.435049533843994 + }, + { + "auxiliary_loss_clip": 0.01105215, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.02113748, + "balance_loss_mlp": 1.03617096, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6559701651537184, + "language_loss": 0.65416402, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67555285, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9989, + "time_per_iteration": 2.4359869956970215 + }, + { + "auxiliary_loss_clip": 0.01105185, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01960373, + "balance_loss_mlp": 1.03680921, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.811706113820645, + "language_loss": 0.80521321, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82657802, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9990, + "time_per_iteration": 2.44775128364563 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02215016, + "balance_loss_mlp": 1.03804195, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.6786296180827829, + "language_loss": 0.82789129, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84932715, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9991, + "time_per_iteration": 2.464409112930298 + }, + { + "auxiliary_loss_clip": 0.01103829, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02240098, + "balance_loss_mlp": 1.03611255, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 2.5638990933503587, + "language_loss": 0.82719564, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84857893, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9992, + "time_per_iteration": 2.4012389183044434 + }, + { + "auxiliary_loss_clip": 0.01105302, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02181458, + "balance_loss_mlp": 1.03721142, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 2.724325433103902, + "language_loss": 0.6668725, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.6882664, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9993, + "time_per_iteration": 2.431534767150879 + }, + { + "auxiliary_loss_clip": 0.01105757, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01941192, + "balance_loss_mlp": 1.03631759, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.313639381360734, + "language_loss": 0.81478924, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83616555, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9994, + "time_per_iteration": 2.410637140274048 + }, + { + "auxiliary_loss_clip": 0.01102128, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01506472, + "balance_loss_mlp": 1.03755724, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.8133737963871297, + "language_loss": 0.72619045, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74747109, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 9995, + "time_per_iteration": 2.462024450302124 + }, + { + "auxiliary_loss_clip": 0.01106573, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02083576, + "balance_loss_mlp": 1.03584194, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 2.19390066880666, + "language_loss": 0.80974549, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83114165, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9996, + "time_per_iteration": 2.4826295375823975 + }, + { + "auxiliary_loss_clip": 0.01106517, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.02458513, + "balance_loss_mlp": 1.03807008, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 3.1537087962017814, + "language_loss": 0.78669906, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80813521, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9997, + "time_per_iteration": 2.4731595516204834 + }, + { + "auxiliary_loss_clip": 0.01107621, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.01708388, + "balance_loss_mlp": 1.03646445, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 3.7238695953955263, + "language_loss": 0.73005414, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75143456, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 9998, + "time_per_iteration": 2.4839541912078857 + }, + { + "auxiliary_loss_clip": 0.01103199, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.0148679, + "balance_loss_mlp": 1.03565955, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.5405076955909784, + "language_loss": 0.721259, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74255872, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9999, + "time_per_iteration": 2.5177793502807617 + }, + { + "auxiliary_loss_clip": 0.01108153, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.02320766, + "balance_loss_mlp": 1.03617668, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 14.582740501304201, + "language_loss": 0.78332782, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80476433, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 10000, + "time_per_iteration": 2.5176899433135986 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.02002978, + "balance_loss_mlp": 1.03898025, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.9333747533908545, + "language_loss": 0.77681154, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79825819, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10001, + "time_per_iteration": 2.4608781337738037 + }, + { + "auxiliary_loss_clip": 0.01109986, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.01821423, + "balance_loss_mlp": 1.03775978, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 2.0376201380828642, + "language_loss": 0.58534205, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60675359, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 10002, + "time_per_iteration": 2.573974847793579 + }, + { + "auxiliary_loss_clip": 0.01110624, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02163339, + "balance_loss_mlp": 1.0399766, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.4763500532767482, + "language_loss": 0.77651924, + "learning_rate": 1.447431741055314e-06, + "loss": 0.7979821, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.70703125, + "step": 10003, + "time_per_iteration": 2.507904291152954 + }, + { + "auxiliary_loss_clip": 0.01109401, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.01595616, + "balance_loss_mlp": 1.03869998, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.341725474955548, + "language_loss": 0.77185351, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79323137, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 10004, + "time_per_iteration": 2.4672906398773193 + }, + { + "auxiliary_loss_clip": 0.01106632, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.01697397, + "balance_loss_mlp": 1.03765237, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.6533707293679005, + "language_loss": 0.72357887, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74493855, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10005, + "time_per_iteration": 2.481327533721924 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01724029, + "balance_loss_mlp": 1.03719342, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 1.9903847661444378, + "language_loss": 0.74641156, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76774085, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 10006, + "time_per_iteration": 2.4176204204559326 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.02387798, + "balance_loss_mlp": 1.03541553, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.3154709076008726, + "language_loss": 0.73940712, + "learning_rate": 1.445934699732685e-06, + "loss": 0.76082402, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 10007, + "time_per_iteration": 2.4568898677825928 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01026535, + "balance_loss_clip": 1.01488209, + "balance_loss_mlp": 1.03767657, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 2.0163179080147065, + "language_loss": 0.70129442, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72261548, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10008, + "time_per_iteration": 2.4591152667999268 + }, + { + "auxiliary_loss_clip": 0.011063, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.01420045, + "balance_loss_mlp": 1.0375886, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.5735106118568272, + "language_loss": 0.76055562, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78187764, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 10009, + "time_per_iteration": 2.5413200855255127 + }, + { + "auxiliary_loss_clip": 0.01106971, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.02157235, + "balance_loss_mlp": 1.03784704, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.2862690220983257, + "language_loss": 0.74194181, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76334661, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10010, + "time_per_iteration": 3.888418436050415 + }, + { + "auxiliary_loss_clip": 0.01029006, + "auxiliary_loss_mlp": 0.00995965, + "balance_loss_clip": 0.99489832, + "balance_loss_mlp": 1.00768209, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.7964241921308365, + "language_loss": 0.55079472, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57104445, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.21289062, + "step": 10011, + "time_per_iteration": 3.125993251800537 + }, + { + "auxiliary_loss_clip": 0.01106744, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.02382302, + "balance_loss_mlp": 1.03751755, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.3952150015846279, + "language_loss": 0.62033314, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64175516, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10012, + "time_per_iteration": 3.9947257041931152 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.01210856, + "balance_loss_mlp": 1.03870738, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.6026031648611754, + "language_loss": 0.74765098, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.76895565, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 10013, + "time_per_iteration": 3.9350314140319824 + }, + { + "auxiliary_loss_clip": 0.01100697, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.01671076, + "balance_loss_mlp": 1.03607368, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.7871112945652055, + "language_loss": 0.81346315, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83474994, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10014, + "time_per_iteration": 3.929865837097168 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.01355481, + "balance_loss_mlp": 1.03631175, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.3916523900358202, + "language_loss": 0.72577333, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74703777, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10015, + "time_per_iteration": 2.487917184829712 + }, + { + "auxiliary_loss_clip": 0.0102817, + "auxiliary_loss_mlp": 0.0100004, + "balance_loss_clip": 0.99888366, + "balance_loss_mlp": 1.00701785, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8145782570930438, + "language_loss": 0.54800987, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.5682919, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.2109375, + "step": 10016, + "time_per_iteration": 2.952225923538208 + }, + { + "auxiliary_loss_clip": 0.01105304, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01721644, + "balance_loss_mlp": 1.03722167, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.4974922822650143, + "language_loss": 0.82952374, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85086936, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10017, + "time_per_iteration": 2.4482316970825195 + }, + { + "auxiliary_loss_clip": 0.01105754, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.01691723, + "balance_loss_mlp": 1.03885603, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7894712759587756, + "language_loss": 0.83787656, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85921562, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10018, + "time_per_iteration": 2.570969820022583 + }, + { + "auxiliary_loss_clip": 0.01109615, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.02348518, + "balance_loss_mlp": 1.03740263, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.651779624626633, + "language_loss": 0.78134441, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80280334, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 10019, + "time_per_iteration": 2.4765312671661377 + }, + { + "auxiliary_loss_clip": 0.01103799, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01586819, + "balance_loss_mlp": 1.03516555, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.523816764872001, + "language_loss": 0.73855495, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75986886, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 10020, + "time_per_iteration": 2.530351161956787 + }, + { + "auxiliary_loss_clip": 0.01106179, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.02080584, + "balance_loss_mlp": 1.03670871, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.5471183793037282, + "language_loss": 0.64036959, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66176355, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10021, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01103767, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.01892638, + "balance_loss_mlp": 1.03523266, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.4551090911481597, + "language_loss": 0.80527318, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.8266294, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 10022, + "time_per_iteration": 2.504343032836914 + }, + { + "auxiliary_loss_clip": 0.01110275, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.01720786, + "balance_loss_mlp": 1.03836441, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.6380547321516945, + "language_loss": 0.66718352, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68858099, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 10023, + "time_per_iteration": 2.550156593322754 + }, + { + "auxiliary_loss_clip": 0.01106872, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01878142, + "balance_loss_mlp": 1.03709006, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 3.9256623345472397, + "language_loss": 0.74829918, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76967561, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10024, + "time_per_iteration": 2.5556838512420654 + }, + { + "auxiliary_loss_clip": 0.01106267, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.02027631, + "balance_loss_mlp": 1.03824794, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.6728401649111677, + "language_loss": 0.7330395, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75442922, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 10025, + "time_per_iteration": 2.513984441757202 + }, + { + "auxiliary_loss_clip": 0.01110825, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02113652, + "balance_loss_mlp": 1.03738081, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.650368099581338, + "language_loss": 0.67278063, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69423193, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 10026, + "time_per_iteration": 2.542365550994873 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.0200423, + "balance_loss_mlp": 1.03320062, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 2.2752496975382908, + "language_loss": 0.80318093, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82447666, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10027, + "time_per_iteration": 2.444352626800537 + }, + { + "auxiliary_loss_clip": 0.01109574, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02199113, + "balance_loss_mlp": 1.03832877, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 2.211735765604233, + "language_loss": 0.71043503, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73187542, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 10028, + "time_per_iteration": 2.479518413543701 + }, + { + "auxiliary_loss_clip": 0.01108344, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02431321, + "balance_loss_mlp": 1.03661895, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.6910926571719251, + "language_loss": 0.8391934, + "learning_rate": 1.437707005721669e-06, + "loss": 0.8606472, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 10029, + "time_per_iteration": 2.4701409339904785 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.03613794, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.6909986386379736, + "language_loss": 0.7958231, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81720573, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10030, + "time_per_iteration": 2.4561784267425537 + }, + { + "auxiliary_loss_clip": 0.01105406, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.0201565, + "balance_loss_mlp": 1.03732789, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.5628951432606517, + "language_loss": 0.71363872, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73501384, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10031, + "time_per_iteration": 2.512300729751587 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.01967287, + "balance_loss_mlp": 1.03754997, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.6597240808951284, + "language_loss": 0.73467577, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75611174, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 10032, + "time_per_iteration": 2.566749334335327 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.02004528, + "balance_loss_mlp": 1.04087365, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.6790483076068066, + "language_loss": 0.68394065, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70537835, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10033, + "time_per_iteration": 2.4334018230438232 + }, + { + "auxiliary_loss_clip": 0.01108457, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.02034903, + "balance_loss_mlp": 1.03930712, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 1.9672909213981986, + "language_loss": 0.76032668, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78174067, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10034, + "time_per_iteration": 2.430638074874878 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03813863, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 2.463845452157716, + "language_loss": 0.74406719, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76545924, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10035, + "time_per_iteration": 2.4784040451049805 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.01751912, + "balance_loss_mlp": 1.03727365, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.5741870761115437, + "language_loss": 0.86713034, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88846624, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10036, + "time_per_iteration": 2.4385178089141846 + }, + { + "auxiliary_loss_clip": 0.01107298, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.0369339, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.0234995174732067, + "language_loss": 0.69894731, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72035396, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10037, + "time_per_iteration": 2.4603824615478516 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.01800978, + "balance_loss_mlp": 1.03922844, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.7516523293698103, + "language_loss": 0.85487103, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87623459, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 10038, + "time_per_iteration": 2.478269100189209 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02303505, + "balance_loss_mlp": 1.03736269, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 1.859562825285256, + "language_loss": 0.76468384, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78612161, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10039, + "time_per_iteration": 2.4567699432373047 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.02047873, + "balance_loss_mlp": 1.03606224, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.5744012931929299, + "language_loss": 0.70843172, + "learning_rate": 1.433597019260301e-06, + "loss": 0.72979593, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10040, + "time_per_iteration": 2.491757392883301 + }, + { + "auxiliary_loss_clip": 0.01112027, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.01729393, + "balance_loss_mlp": 1.03952897, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 2.4316928211832045, + "language_loss": 0.78400159, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80543524, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7265625, + "step": 10041, + "time_per_iteration": 2.452766180038452 + }, + { + "auxiliary_loss_clip": 0.0110643, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01892138, + "balance_loss_mlp": 1.03821898, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.6317318935059701, + "language_loss": 0.75574881, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77711999, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 10042, + "time_per_iteration": 2.421757459640503 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.01626205, + "balance_loss_mlp": 1.03584445, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 2.3271703550981138, + "language_loss": 0.84446549, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86578321, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10043, + "time_per_iteration": 2.5310654640197754 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.02648067, + "balance_loss_mlp": 1.03705609, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.9621351051557316, + "language_loss": 0.69924289, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72073138, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 10044, + "time_per_iteration": 2.4903266429901123 + }, + { + "auxiliary_loss_clip": 0.01110997, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0184021, + "balance_loss_mlp": 1.03954315, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 2.0335535035690557, + "language_loss": 0.77986026, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80128312, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 10045, + "time_per_iteration": 2.4881081581115723 + }, + { + "auxiliary_loss_clip": 0.0110549, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.01878381, + "balance_loss_mlp": 1.03781128, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.5706793221026767, + "language_loss": 0.76730686, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.7886765, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 10046, + "time_per_iteration": 2.4508702754974365 + }, + { + "auxiliary_loss_clip": 0.01104935, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.01866424, + "balance_loss_mlp": 1.03633487, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.5559732700373865, + "language_loss": 0.86937988, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89072925, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10047, + "time_per_iteration": 2.465775489807129 + }, + { + "auxiliary_loss_clip": 0.01105881, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.01808965, + "balance_loss_mlp": 1.03915882, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.5346026168560238, + "language_loss": 0.75463951, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77599597, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 10048, + "time_per_iteration": 2.5098941326141357 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.01039349, + "balance_loss_clip": 1.0247221, + "balance_loss_mlp": 1.03979802, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.285441895193273, + "language_loss": 0.66271615, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68426633, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 10049, + "time_per_iteration": 2.537810802459717 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.02244711, + "balance_loss_mlp": 1.03769147, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.5404607265151984, + "language_loss": 0.66999722, + "learning_rate": 1.429862922631336e-06, + "loss": 0.69139338, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10050, + "time_per_iteration": 2.5025947093963623 + }, + { + "auxiliary_loss_clip": 0.01108275, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01961958, + "balance_loss_mlp": 1.03837466, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.5982455651349325, + "language_loss": 0.69730866, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.718714, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10051, + "time_per_iteration": 2.5584428310394287 + }, + { + "auxiliary_loss_clip": 0.01103115, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.02025664, + "balance_loss_mlp": 1.03470123, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.883115508781388, + "language_loss": 0.64664817, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66800326, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10052, + "time_per_iteration": 3.8776209354400635 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.02243757, + "balance_loss_mlp": 1.03725076, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.6187947947661157, + "language_loss": 0.68885666, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71027684, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 10053, + "time_per_iteration": 3.8864493370056152 + }, + { + "auxiliary_loss_clip": 0.0102793, + "auxiliary_loss_mlp": 0.01006986, + "balance_loss_clip": 1.00581133, + "balance_loss_mlp": 1.00684035, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7454166517190239, + "language_loss": 0.6043961, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62474525, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.2109375, + "step": 10054, + "time_per_iteration": 4.507344961166382 + }, + { + "auxiliary_loss_clip": 0.01104586, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01997089, + "balance_loss_mlp": 1.03684747, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.6844086395494355, + "language_loss": 0.85636723, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87774247, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 10055, + "time_per_iteration": 3.930811643600464 + }, + { + "auxiliary_loss_clip": 0.01110141, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.02585101, + "balance_loss_mlp": 1.04008687, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.2914523857580353, + "language_loss": 0.73531651, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75681424, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.69921875, + "step": 10056, + "time_per_iteration": 2.424492835998535 + }, + { + "auxiliary_loss_clip": 0.01105735, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.02206218, + "balance_loss_mlp": 1.03815937, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.6647683047258863, + "language_loss": 0.80205089, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82344675, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 10057, + "time_per_iteration": 2.4988396167755127 + }, + { + "auxiliary_loss_clip": 0.01104511, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01725817, + "balance_loss_mlp": 1.0369792, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.656202002056598, + "language_loss": 0.75172931, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.7730782, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 10058, + "time_per_iteration": 2.4108166694641113 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.01627767, + "balance_loss_mlp": 1.03582406, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 1.75224691919055, + "language_loss": 0.71103948, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73235136, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10059, + "time_per_iteration": 2.4859349727630615 + }, + { + "auxiliary_loss_clip": 0.01105606, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01686525, + "balance_loss_mlp": 1.03741932, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.587856969701262, + "language_loss": 0.76134253, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78269112, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10060, + "time_per_iteration": 2.473043918609619 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.03757381, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.1588277388437276, + "language_loss": 0.73414183, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75552368, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10061, + "time_per_iteration": 2.440943956375122 + }, + { + "auxiliary_loss_clip": 0.01105712, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.01750207, + "balance_loss_mlp": 1.03634655, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 2.0041380833930145, + "language_loss": 0.67225152, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69359946, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 10062, + "time_per_iteration": 2.4789950847625732 + }, + { + "auxiliary_loss_clip": 0.01105607, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.02592254, + "balance_loss_mlp": 1.03717685, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.163401547344872, + "language_loss": 0.71361917, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73505676, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10063, + "time_per_iteration": 2.43302321434021 + }, + { + "auxiliary_loss_clip": 0.01102028, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.01968336, + "balance_loss_mlp": 1.03561401, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.6164006934985269, + "language_loss": 0.84802878, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86936641, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10064, + "time_per_iteration": 2.447003126144409 + }, + { + "auxiliary_loss_clip": 0.0110348, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02206945, + "balance_loss_mlp": 1.0354557, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.2435880628396587, + "language_loss": 0.79335666, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81472868, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10065, + "time_per_iteration": 2.437286376953125 + }, + { + "auxiliary_loss_clip": 0.01111102, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.01874626, + "balance_loss_mlp": 1.03979814, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 1.9931239622384858, + "language_loss": 0.78788042, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80931091, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 10066, + "time_per_iteration": 2.4346959590911865 + }, + { + "auxiliary_loss_clip": 0.01104198, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01523578, + "balance_loss_mlp": 1.03645194, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.5823653049215993, + "language_loss": 0.73320723, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75452518, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 10067, + "time_per_iteration": 2.5625689029693604 + }, + { + "auxiliary_loss_clip": 0.01107587, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.01783991, + "balance_loss_mlp": 1.03971481, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.6116431503881068, + "language_loss": 0.68952775, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.7109015, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10068, + "time_per_iteration": 2.5137228965759277 + }, + { + "auxiliary_loss_clip": 0.01103779, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.01877117, + "balance_loss_mlp": 1.03444147, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 2.4473752710004586, + "language_loss": 0.86667287, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.8880173, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10069, + "time_per_iteration": 2.4172072410583496 + }, + { + "auxiliary_loss_clip": 0.01103834, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01405478, + "balance_loss_mlp": 1.03583956, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.4672457121748899, + "language_loss": 0.83270586, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85400122, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10070, + "time_per_iteration": 2.464062452316284 + }, + { + "auxiliary_loss_clip": 0.01106279, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.02120876, + "balance_loss_mlp": 1.03630137, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.5142081514734282, + "language_loss": 0.86056209, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88196522, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10071, + "time_per_iteration": 2.435492515563965 + }, + { + "auxiliary_loss_clip": 0.01108912, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.02138042, + "balance_loss_mlp": 1.03817403, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.7615317101181058, + "language_loss": 0.7703979, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79183424, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 10072, + "time_per_iteration": 2.5326199531555176 + }, + { + "auxiliary_loss_clip": 0.01104713, + "auxiliary_loss_mlp": 0.01026829, + "balance_loss_clip": 1.01362085, + "balance_loss_mlp": 1.03505397, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.81020475903248, + "language_loss": 0.74383593, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76515132, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 10073, + "time_per_iteration": 2.4809958934783936 + }, + { + "auxiliary_loss_clip": 0.01026997, + "auxiliary_loss_mlp": 0.01002422, + "balance_loss_clip": 1.00124216, + "balance_loss_mlp": 1.005988, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7588463064410728, + "language_loss": 0.55220222, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57249641, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2109375, + "step": 10074, + "time_per_iteration": 3.101125717163086 + }, + { + "auxiliary_loss_clip": 0.01105722, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01752663, + "balance_loss_mlp": 1.03776407, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.8033827655021575, + "language_loss": 0.81893396, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.84028631, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10075, + "time_per_iteration": 2.468269109725952 + }, + { + "auxiliary_loss_clip": 0.01105409, + "auxiliary_loss_mlp": 0.01024158, + "balance_loss_clip": 1.01174855, + "balance_loss_mlp": 1.03608966, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 2.0602815760014392, + "language_loss": 0.78272569, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80402136, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10076, + "time_per_iteration": 2.4932310581207275 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02113056, + "balance_loss_mlp": 1.0365119, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.7482408044671829, + "language_loss": 0.72032154, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74172914, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10077, + "time_per_iteration": 2.4521970748901367 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01777112, + "balance_loss_mlp": 1.03759694, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 2.2939968580618215, + "language_loss": 0.55467492, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57604587, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10078, + "time_per_iteration": 2.4789669513702393 + }, + { + "auxiliary_loss_clip": 0.01106991, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.02026904, + "balance_loss_mlp": 1.0364964, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 2.206511673730914, + "language_loss": 0.70283198, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72423112, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10079, + "time_per_iteration": 2.494340181350708 + }, + { + "auxiliary_loss_clip": 0.01105474, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.02605653, + "balance_loss_mlp": 1.03662014, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.7147155998392456, + "language_loss": 0.62479711, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64623356, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10080, + "time_per_iteration": 2.4511730670928955 + }, + { + "auxiliary_loss_clip": 0.01107796, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.01679373, + "balance_loss_mlp": 1.03799117, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.8405271272242842, + "language_loss": 0.71136117, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73273432, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10081, + "time_per_iteration": 2.455698251724243 + }, + { + "auxiliary_loss_clip": 0.01105313, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01634336, + "balance_loss_mlp": 1.03703296, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.6262436392400634, + "language_loss": 0.69449544, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71583533, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10082, + "time_per_iteration": 2.52297043800354 + }, + { + "auxiliary_loss_clip": 0.01106177, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01712012, + "balance_loss_mlp": 1.03799009, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.4063428250351147, + "language_loss": 0.65709507, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67844832, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10083, + "time_per_iteration": 2.464259147644043 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.01852512, + "balance_loss_mlp": 1.03483891, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 2.2500443264419423, + "language_loss": 0.74058753, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76193094, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 10084, + "time_per_iteration": 2.4634742736816406 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.02070093, + "balance_loss_mlp": 1.03634107, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.43129197416672, + "language_loss": 0.72011673, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74150407, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 10085, + "time_per_iteration": 2.4218525886535645 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.02473903, + "balance_loss_mlp": 1.03681493, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.1595465216971834, + "language_loss": 0.76514173, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78655005, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10086, + "time_per_iteration": 2.466749429702759 + }, + { + "auxiliary_loss_clip": 0.01103719, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01887345, + "balance_loss_mlp": 1.03720832, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.355452455492161, + "language_loss": 0.72577417, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74712074, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10087, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01101232, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.01915646, + "balance_loss_mlp": 1.03517973, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.707331111485516, + "language_loss": 0.83679116, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.85810244, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 10088, + "time_per_iteration": 2.490476369857788 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.01454818, + "balance_loss_mlp": 1.03563654, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.1730607876548924, + "language_loss": 0.7139647, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73526812, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 10089, + "time_per_iteration": 2.4656596183776855 + }, + { + "auxiliary_loss_clip": 0.0110663, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.0252049, + "balance_loss_mlp": 1.0383575, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 1.8527545733498374, + "language_loss": 0.82743609, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84886503, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 10090, + "time_per_iteration": 2.4523448944091797 + }, + { + "auxiliary_loss_clip": 0.01112391, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.0230329, + "balance_loss_mlp": 1.03768897, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.0611786286574514, + "language_loss": 0.75486428, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77635133, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 10091, + "time_per_iteration": 2.412745475769043 + }, + { + "auxiliary_loss_clip": 0.01103456, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02224016, + "balance_loss_mlp": 1.03655899, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 2.008159335053083, + "language_loss": 0.79580414, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81718373, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 10092, + "time_per_iteration": 2.4787280559539795 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.02313423, + "balance_loss_mlp": 1.03692424, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.8882550633479742, + "language_loss": 0.76085305, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78229213, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10093, + "time_per_iteration": 3.8885409832000732 + }, + { + "auxiliary_loss_clip": 0.01103337, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.02215874, + "balance_loss_mlp": 1.03691947, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 2.3186576779301387, + "language_loss": 0.87448221, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89585286, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 10094, + "time_per_iteration": 2.4714174270629883 + }, + { + "auxiliary_loss_clip": 0.01105151, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01992464, + "balance_loss_mlp": 1.03669322, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.5923423583427312, + "language_loss": 0.71694756, + "learning_rate": 1.413086446353919e-06, + "loss": 0.73832405, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10095, + "time_per_iteration": 3.852285861968994 + }, + { + "auxiliary_loss_clip": 0.01105359, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01727819, + "balance_loss_mlp": 1.036134, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.6817389817846544, + "language_loss": 0.76919025, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.7905336, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 10096, + "time_per_iteration": 3.9708244800567627 + }, + { + "auxiliary_loss_clip": 0.01107233, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.0291853, + "balance_loss_mlp": 1.03734136, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 1.7249712415107992, + "language_loss": 0.79864824, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82012963, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69921875, + "step": 10097, + "time_per_iteration": 2.4229838848114014 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01920676, + "balance_loss_mlp": 1.03555632, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.4260099040951442, + "language_loss": 0.67338455, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69472301, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10098, + "time_per_iteration": 3.9603915214538574 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.03617251, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.8973033677095168, + "language_loss": 0.80694121, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82829416, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 10099, + "time_per_iteration": 2.409189462661743 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.02195358, + "balance_loss_mlp": 1.03531229, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 2.230451803545553, + "language_loss": 0.70439708, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72582722, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10100, + "time_per_iteration": 2.484339952468872 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02344251, + "balance_loss_mlp": 1.03890038, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.5791187964785582, + "language_loss": 0.70447475, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72591841, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10101, + "time_per_iteration": 2.4309775829315186 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.02154672, + "balance_loss_mlp": 1.03490043, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.6995748618566444, + "language_loss": 0.69606161, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71741861, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10102, + "time_per_iteration": 2.524376630783081 + }, + { + "auxiliary_loss_clip": 0.01107251, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.02220285, + "balance_loss_mlp": 1.0382359, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.7952265928760782, + "language_loss": 0.73694891, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75835967, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10103, + "time_per_iteration": 2.4625236988067627 + }, + { + "auxiliary_loss_clip": 0.01110432, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.02295542, + "balance_loss_mlp": 1.03862011, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.6961753672547197, + "language_loss": 0.76819229, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.7896592, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 10104, + "time_per_iteration": 2.467879056930542 + }, + { + "auxiliary_loss_clip": 0.01028848, + "auxiliary_loss_mlp": 0.01014471, + "balance_loss_clip": 1.01331425, + "balance_loss_mlp": 1.00746071, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7111703190831327, + "language_loss": 0.56059039, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58102357, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21484375, + "step": 10105, + "time_per_iteration": 3.066772222518921 + }, + { + "auxiliary_loss_clip": 0.01028964, + "auxiliary_loss_mlp": 0.01012366, + "balance_loss_clip": 1.01119196, + "balance_loss_mlp": 1.00750494, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7555703523663572, + "language_loss": 0.56791615, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58832943, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21484375, + "step": 10106, + "time_per_iteration": 3.0346710681915283 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.02136517, + "balance_loss_mlp": 1.03558111, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.556971911912289, + "language_loss": 0.68647003, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70783293, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10107, + "time_per_iteration": 2.5070221424102783 + }, + { + "auxiliary_loss_clip": 0.0110868, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.01801395, + "balance_loss_mlp": 1.03806663, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 2.0591355858624594, + "language_loss": 0.81006205, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83145273, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 10108, + "time_per_iteration": 2.449876070022583 + }, + { + "auxiliary_loss_clip": 0.01109814, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.01798475, + "balance_loss_mlp": 1.03772831, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.6885620074685026, + "language_loss": 0.70979893, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10109, + "time_per_iteration": 2.569441318511963 + }, + { + "auxiliary_loss_clip": 0.01101619, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01598346, + "balance_loss_mlp": 1.0354414, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.5138210455097567, + "language_loss": 0.80043399, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82172269, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10110, + "time_per_iteration": 2.5667614936828613 + }, + { + "auxiliary_loss_clip": 0.01107667, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01691723, + "balance_loss_mlp": 1.03725386, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.0834448443085463, + "language_loss": 0.7047748, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72614574, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 10111, + "time_per_iteration": 2.449047565460205 + }, + { + "auxiliary_loss_clip": 0.01107266, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.01765776, + "balance_loss_mlp": 1.03687668, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.8731958384235612, + "language_loss": 0.65437806, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67575473, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10112, + "time_per_iteration": 2.5956103801727295 + }, + { + "auxiliary_loss_clip": 0.01028267, + "auxiliary_loss_mlp": 0.00997544, + "balance_loss_clip": 0.99637622, + "balance_loss_mlp": 1.00686228, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6257418493150695, + "language_loss": 0.49600247, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51626056, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 10113, + "time_per_iteration": 3.0929043292999268 + }, + { + "auxiliary_loss_clip": 0.01027496, + "auxiliary_loss_mlp": 0.01000577, + "balance_loss_clip": 0.99939102, + "balance_loss_mlp": 1.0062747, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8371205862323671, + "language_loss": 0.56964719, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58992791, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21289062, + "step": 10114, + "time_per_iteration": 2.9712812900543213 + }, + { + "auxiliary_loss_clip": 0.01109587, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01581991, + "balance_loss_mlp": 1.03810143, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.7367632173905274, + "language_loss": 0.69756359, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.71895409, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 10115, + "time_per_iteration": 2.4941470623016357 + }, + { + "auxiliary_loss_clip": 0.01106631, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.01550055, + "balance_loss_mlp": 1.03715134, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 18.577098805589706, + "language_loss": 0.72356099, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74490488, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10116, + "time_per_iteration": 2.448673725128174 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.02089953, + "balance_loss_mlp": 1.03622699, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.8751462040451332, + "language_loss": 0.53553987, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55695611, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10117, + "time_per_iteration": 2.592958927154541 + }, + { + "auxiliary_loss_clip": 0.01106561, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.01340544, + "balance_loss_mlp": 1.03709269, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.965088318697828, + "language_loss": 0.69835466, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.71967459, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10118, + "time_per_iteration": 2.4184305667877197 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.01545143, + "balance_loss_mlp": 1.03855991, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.4929706938116498, + "language_loss": 0.74641609, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.7677654, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10119, + "time_per_iteration": 2.4534857273101807 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.02189887, + "balance_loss_mlp": 1.03641152, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.5799518634527623, + "language_loss": 0.67427665, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69565779, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10120, + "time_per_iteration": 2.439384937286377 + }, + { + "auxiliary_loss_clip": 0.01109214, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.01977515, + "balance_loss_mlp": 1.03793478, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.74219428879995, + "language_loss": 0.74141055, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76282924, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10121, + "time_per_iteration": 2.506490707397461 + }, + { + "auxiliary_loss_clip": 0.01103145, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.01530576, + "balance_loss_mlp": 1.03512359, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.7909457152882267, + "language_loss": 0.80599827, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82729572, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10122, + "time_per_iteration": 2.422988176345825 + }, + { + "auxiliary_loss_clip": 0.01107244, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01846766, + "balance_loss_mlp": 1.03843355, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.4671658127927028, + "language_loss": 0.55411458, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57549489, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10123, + "time_per_iteration": 2.6203012466430664 + }, + { + "auxiliary_loss_clip": 0.01104564, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.01975298, + "balance_loss_mlp": 1.03711987, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.7982570079112092, + "language_loss": 0.73612612, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.75749022, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10124, + "time_per_iteration": 2.465306282043457 + }, + { + "auxiliary_loss_clip": 0.01105892, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01791847, + "balance_loss_mlp": 1.03691709, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 3.6424543705255648, + "language_loss": 0.66014802, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.681508, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10125, + "time_per_iteration": 2.4767675399780273 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.01943719, + "balance_loss_mlp": 1.03736734, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 2.3623427434848066, + "language_loss": 0.76202977, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78339827, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10126, + "time_per_iteration": 2.469557046890259 + }, + { + "auxiliary_loss_clip": 0.01109286, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01583672, + "balance_loss_mlp": 1.03710127, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.6026801218546036, + "language_loss": 0.71315622, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73453724, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 10127, + "time_per_iteration": 2.463219404220581 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02028704, + "balance_loss_mlp": 1.03909373, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.879650268865683, + "language_loss": 0.72776711, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74921077, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 10128, + "time_per_iteration": 2.4591028690338135 + }, + { + "auxiliary_loss_clip": 0.01105059, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.03707957, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 4.4407298106903585, + "language_loss": 0.73322678, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75457883, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10129, + "time_per_iteration": 2.463595151901245 + }, + { + "auxiliary_loss_clip": 0.01104701, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.02273047, + "balance_loss_mlp": 1.03612173, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.3648179669909797, + "language_loss": 0.65579844, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67719507, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10130, + "time_per_iteration": 2.638197183609009 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.01360381, + "balance_loss_mlp": 1.03529549, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.5719208851669182, + "language_loss": 0.77160382, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79286647, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10131, + "time_per_iteration": 2.4989805221557617 + }, + { + "auxiliary_loss_clip": 0.01104899, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01976359, + "balance_loss_mlp": 1.03746176, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.8809693968510182, + "language_loss": 0.76772207, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.78907526, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.67578125, + "step": 10132, + "time_per_iteration": 2.4471144676208496 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.02116609, + "balance_loss_mlp": 1.035465, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.8031513435586903, + "language_loss": 0.75461888, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77594435, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 10133, + "time_per_iteration": 2.4543044567108154 + }, + { + "auxiliary_loss_clip": 0.01104667, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01921129, + "balance_loss_mlp": 1.03661132, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.617219095446177, + "language_loss": 0.63404942, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.65541649, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10134, + "time_per_iteration": 2.4968786239624023 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01722288, + "balance_loss_mlp": 1.03555775, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7042888689612277, + "language_loss": 0.78689611, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80820185, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10135, + "time_per_iteration": 3.8730435371398926 + }, + { + "auxiliary_loss_clip": 0.01105216, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01840401, + "balance_loss_mlp": 1.03660417, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 2.4676554523748115, + "language_loss": 0.72265971, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.7440083, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10136, + "time_per_iteration": 2.721339464187622 + }, + { + "auxiliary_loss_clip": 0.01108039, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.01704586, + "balance_loss_mlp": 1.0386939, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 2.10435735907629, + "language_loss": 0.74540055, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76677233, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10137, + "time_per_iteration": 3.969383716583252 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.027542, + "balance_loss_mlp": 1.03668833, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.7200645743924223, + "language_loss": 0.80628771, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82775462, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10138, + "time_per_iteration": 3.9027063846588135 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01837766, + "balance_loss_mlp": 1.03564954, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5498019522052684, + "language_loss": 0.80843186, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.82972997, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 10139, + "time_per_iteration": 3.9400634765625 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.02206182, + "balance_loss_mlp": 1.03840351, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.0925165633907254, + "language_loss": 0.8375181, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85895312, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10140, + "time_per_iteration": 2.4656758308410645 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.02315855, + "balance_loss_mlp": 1.03600419, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 2.1792852747623557, + "language_loss": 0.75585604, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77726358, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10141, + "time_per_iteration": 2.574366331100464 + }, + { + "auxiliary_loss_clip": 0.01104603, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.01502669, + "balance_loss_mlp": 1.03598619, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.9409433083757806, + "language_loss": 0.76637286, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78768879, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10142, + "time_per_iteration": 2.4868385791778564 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.02058291, + "balance_loss_mlp": 1.03676569, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.745652179186059, + "language_loss": 0.76381373, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.7851907, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 10143, + "time_per_iteration": 2.5635735988616943 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.01801276, + "balance_loss_mlp": 1.03715992, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.7435526117723426, + "language_loss": 0.74993449, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77130264, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10144, + "time_per_iteration": 2.4298861026763916 + }, + { + "auxiliary_loss_clip": 0.01107837, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01527548, + "balance_loss_mlp": 1.03741479, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.307147766408813, + "language_loss": 0.72727025, + "learning_rate": 1.394498830235383e-06, + "loss": 0.74862915, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10145, + "time_per_iteration": 2.694578170776367 + }, + { + "auxiliary_loss_clip": 0.01104204, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01882768, + "balance_loss_mlp": 1.036484, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 7.584582797643419, + "language_loss": 0.69428813, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71563041, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6796875, + "step": 10146, + "time_per_iteration": 2.4656052589416504 + }, + { + "auxiliary_loss_clip": 0.01102864, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.01665735, + "balance_loss_mlp": 1.03688705, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.601297479877826, + "language_loss": 0.76745832, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78876168, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 10147, + "time_per_iteration": 2.5520474910736084 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.01436126, + "balance_loss_mlp": 1.03525686, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.740411663388647, + "language_loss": 0.78028274, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80158353, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10148, + "time_per_iteration": 2.4648149013519287 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.03736377, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.1220511331050758, + "language_loss": 0.53903639, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56048727, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 10149, + "time_per_iteration": 2.566124200820923 + }, + { + "auxiliary_loss_clip": 0.01101762, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01953661, + "balance_loss_mlp": 1.03660202, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.8365676346298867, + "language_loss": 0.80172944, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82305747, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 10150, + "time_per_iteration": 2.5030646324157715 + }, + { + "auxiliary_loss_clip": 0.01108008, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.02272308, + "balance_loss_mlp": 1.0384438, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.5453703107618904, + "language_loss": 0.69006532, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.7114979, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10151, + "time_per_iteration": 2.5013327598571777 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.01780438, + "balance_loss_mlp": 1.0351758, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.64819750933, + "language_loss": 0.70659781, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.7279191, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 10152, + "time_per_iteration": 2.519719362258911 + }, + { + "auxiliary_loss_clip": 0.01107575, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.02031481, + "balance_loss_mlp": 1.03778815, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 2.061001889975494, + "language_loss": 0.77937526, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80077732, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10153, + "time_per_iteration": 2.4679317474365234 + }, + { + "auxiliary_loss_clip": 0.01106601, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.01775157, + "balance_loss_mlp": 1.03693819, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.6071348715593325, + "language_loss": 0.79040915, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8117736, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10154, + "time_per_iteration": 2.4811360836029053 + }, + { + "auxiliary_loss_clip": 0.01102999, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 1.03598225, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.696167937827746, + "language_loss": 0.70110655, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72242928, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 10155, + "time_per_iteration": 2.4926087856292725 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01622927, + "balance_loss_mlp": 1.03898025, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5701440613704458, + "language_loss": 0.7118175, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73317862, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 10156, + "time_per_iteration": 2.529212236404419 + }, + { + "auxiliary_loss_clip": 0.01103012, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.01777911, + "balance_loss_mlp": 1.0372014, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.5875405214127527, + "language_loss": 0.67776453, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69909376, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 10157, + "time_per_iteration": 2.4632043838500977 + }, + { + "auxiliary_loss_clip": 0.01103689, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.01682067, + "balance_loss_mlp": 1.03470659, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 1.8568219075391552, + "language_loss": 0.72478032, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74610317, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10158, + "time_per_iteration": 2.419174909591675 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.02036619, + "balance_loss_mlp": 1.03673482, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.8610687942781703, + "language_loss": 0.69770175, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71908361, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 10159, + "time_per_iteration": 2.5595028400421143 + }, + { + "auxiliary_loss_clip": 0.01106993, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.01866579, + "balance_loss_mlp": 1.03715146, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.8623845683480673, + "language_loss": 0.79084963, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81223011, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10160, + "time_per_iteration": 2.4194223880767822 + }, + { + "auxiliary_loss_clip": 0.01029586, + "auxiliary_loss_mlp": 0.0100036, + "balance_loss_clip": 0.99904329, + "balance_loss_mlp": 1.00828457, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8176802836469281, + "language_loss": 0.61464268, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63494217, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.21289062, + "step": 10161, + "time_per_iteration": 3.204864501953125 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.02142394, + "balance_loss_mlp": 1.03706193, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.7743481380342319, + "language_loss": 0.76395631, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78536499, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 10162, + "time_per_iteration": 2.4414381980895996 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.01866198, + "balance_loss_mlp": 1.0372498, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 2.0274139033268077, + "language_loss": 0.71609962, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73745424, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10163, + "time_per_iteration": 2.541321039199829 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.01797926, + "balance_loss_mlp": 1.03580725, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 14.54042933705356, + "language_loss": 0.59390211, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61521178, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 10164, + "time_per_iteration": 2.4755120277404785 + }, + { + "auxiliary_loss_clip": 0.01103552, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.01823997, + "balance_loss_mlp": 1.03602457, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.7214680551340567, + "language_loss": 0.75950801, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.7808392, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10165, + "time_per_iteration": 2.491528034210205 + }, + { + "auxiliary_loss_clip": 0.01105154, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01582241, + "balance_loss_mlp": 1.03982759, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.4553973070214548, + "language_loss": 0.78996694, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81129807, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 10166, + "time_per_iteration": 2.4699227809906006 + }, + { + "auxiliary_loss_clip": 0.01105985, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.01734531, + "balance_loss_mlp": 1.03734827, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 3.097252625024806, + "language_loss": 0.67920876, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.70056236, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10167, + "time_per_iteration": 2.5190818309783936 + }, + { + "auxiliary_loss_clip": 0.01104165, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01872551, + "balance_loss_mlp": 1.03759277, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.7954202202348515, + "language_loss": 0.78805661, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.80939388, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 10168, + "time_per_iteration": 2.4622983932495117 + }, + { + "auxiliary_loss_clip": 0.01110572, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02078724, + "balance_loss_mlp": 1.03739679, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 4.090256272371363, + "language_loss": 0.85369581, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87514555, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 10169, + "time_per_iteration": 2.4625487327575684 + }, + { + "auxiliary_loss_clip": 0.01101901, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.01722717, + "balance_loss_mlp": 1.03553629, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 2.5520669740881727, + "language_loss": 0.78887564, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81017315, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10170, + "time_per_iteration": 2.6308984756469727 + }, + { + "auxiliary_loss_clip": 0.01108241, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.02221966, + "balance_loss_mlp": 1.03567076, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 1.8675504682209607, + "language_loss": 0.69072127, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71215916, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 10171, + "time_per_iteration": 2.4951138496398926 + }, + { + "auxiliary_loss_clip": 0.01106531, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.02225816, + "balance_loss_mlp": 1.03609705, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.5834424948906107, + "language_loss": 0.78990817, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81132996, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 10172, + "time_per_iteration": 2.512971878051758 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.02060866, + "balance_loss_mlp": 1.03796673, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.895061103662262, + "language_loss": 0.66887462, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69031352, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 10173, + "time_per_iteration": 2.450739860534668 + }, + { + "auxiliary_loss_clip": 0.01107875, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.02124667, + "balance_loss_mlp": 1.03863525, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.9715957300151092, + "language_loss": 0.5560292, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57744104, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 10174, + "time_per_iteration": 2.4200756549835205 + }, + { + "auxiliary_loss_clip": 0.01111305, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.01991677, + "balance_loss_mlp": 1.04081392, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.8852329096028353, + "language_loss": 0.66003776, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68147486, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10175, + "time_per_iteration": 2.4889590740203857 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01023113, + "balance_loss_clip": 1.01236653, + "balance_loss_mlp": 1.03501439, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.092985999457116, + "language_loss": 0.82515383, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84641147, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.67578125, + "step": 10176, + "time_per_iteration": 2.506054639816284 + }, + { + "auxiliary_loss_clip": 0.01107676, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.02346945, + "balance_loss_mlp": 1.03832841, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 4.162493341668284, + "language_loss": 0.76968575, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79112923, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 10177, + "time_per_iteration": 3.941509962081909 + }, + { + "auxiliary_loss_clip": 0.01104435, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.02179456, + "balance_loss_mlp": 1.03604686, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.082789690638706, + "language_loss": 0.75353473, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77491367, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 10178, + "time_per_iteration": 3.827141523361206 + }, + { + "auxiliary_loss_clip": 0.01107456, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01814103, + "balance_loss_mlp": 1.03866005, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.6048823215389816, + "language_loss": 0.6671313, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.68850946, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 10179, + "time_per_iteration": 2.467815637588501 + }, + { + "auxiliary_loss_clip": 0.01107829, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.02061653, + "balance_loss_mlp": 1.03923988, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.8410866190884951, + "language_loss": 0.84216881, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86356938, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 10180, + "time_per_iteration": 5.375430583953857 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01756501, + "balance_loss_mlp": 1.03854799, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.5429296840981428, + "language_loss": 0.77451497, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79587466, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 10181, + "time_per_iteration": 2.493150234222412 + }, + { + "auxiliary_loss_clip": 0.01106153, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.01597941, + "balance_loss_mlp": 1.03749657, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.8534348182131113, + "language_loss": 0.80704159, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82837868, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 10182, + "time_per_iteration": 2.5022473335266113 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01557016, + "balance_loss_mlp": 1.03581071, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.6380700202040888, + "language_loss": 0.83158624, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85283822, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.640625, + "step": 10183, + "time_per_iteration": 2.489943265914917 + }, + { + "auxiliary_loss_clip": 0.01029447, + "auxiliary_loss_mlp": 0.01003231, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00785327, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7013408754852208, + "language_loss": 0.62862837, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64895517, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21582031, + "step": 10184, + "time_per_iteration": 3.1942267417907715 + }, + { + "auxiliary_loss_clip": 0.01105776, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.01558959, + "balance_loss_mlp": 1.03836298, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 2.39384281866501, + "language_loss": 0.82134175, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84266812, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 10185, + "time_per_iteration": 2.441663980484009 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.0174942, + "balance_loss_mlp": 1.03989947, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 5.230764283030459, + "language_loss": 0.74637246, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76777852, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 10186, + "time_per_iteration": 2.494351387023926 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01786828, + "balance_loss_mlp": 1.0364244, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5640192087821545, + "language_loss": 0.78181458, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80312312, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 10187, + "time_per_iteration": 2.4529902935028076 + }, + { + "auxiliary_loss_clip": 0.01101994, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.01668978, + "balance_loss_mlp": 1.03424489, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.5585408172838955, + "language_loss": 0.82932627, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85063195, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10188, + "time_per_iteration": 2.4779062271118164 + }, + { + "auxiliary_loss_clip": 0.0110417, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.0162673, + "balance_loss_mlp": 1.03603601, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 2.027411293701354, + "language_loss": 0.75284189, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77416623, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10189, + "time_per_iteration": 2.4187629222869873 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03746915, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.8604795542963726, + "language_loss": 0.74147457, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76284146, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10190, + "time_per_iteration": 2.4838945865631104 + }, + { + "auxiliary_loss_clip": 0.01105194, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.01819181, + "balance_loss_mlp": 1.03696406, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.6214214182316076, + "language_loss": 0.68505728, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70641267, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10191, + "time_per_iteration": 2.4871902465820312 + }, + { + "auxiliary_loss_clip": 0.01105112, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01925695, + "balance_loss_mlp": 1.03581357, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.21006786046543, + "language_loss": 0.73561746, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75698042, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10192, + "time_per_iteration": 2.491898536682129 + }, + { + "auxiliary_loss_clip": 0.0110379, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01606202, + "balance_loss_mlp": 1.03735423, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.8299896919962644, + "language_loss": 0.83299625, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85430956, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 10193, + "time_per_iteration": 2.459218740463257 + }, + { + "auxiliary_loss_clip": 0.0110509, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01823401, + "balance_loss_mlp": 1.03667831, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.3362331987729554, + "language_loss": 0.69596869, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.71732187, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 10194, + "time_per_iteration": 2.49104642868042 + }, + { + "auxiliary_loss_clip": 0.01026973, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00231493, + "balance_loss_mlp": 1.0056808, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.9308202048927251, + "language_loss": 0.58683991, + "learning_rate": 1.375968615326149e-06, + "loss": 0.607144, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21289062, + "step": 10195, + "time_per_iteration": 2.8671669960021973 + }, + { + "auxiliary_loss_clip": 0.01105637, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02292037, + "balance_loss_mlp": 1.03803897, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 1.927442212334356, + "language_loss": 0.69738579, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71879274, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10196, + "time_per_iteration": 2.4702036380767822 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.02748811, + "balance_loss_mlp": 1.0374887, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 2.920952429136396, + "language_loss": 0.71311784, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73454869, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10197, + "time_per_iteration": 2.5032567977905273 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.02622151, + "balance_loss_mlp": 1.03691006, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.885953700600687, + "language_loss": 0.78852749, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.80997241, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10198, + "time_per_iteration": 2.460963010787964 + }, + { + "auxiliary_loss_clip": 0.01107653, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.02179098, + "balance_loss_mlp": 1.037503, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.406384953747787, + "language_loss": 0.7426461, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76405835, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10199, + "time_per_iteration": 2.4598445892333984 + }, + { + "auxiliary_loss_clip": 0.01108284, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03748035, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.5460485143525171, + "language_loss": 0.62069702, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64216447, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 10200, + "time_per_iteration": 2.509960651397705 + }, + { + "auxiliary_loss_clip": 0.01104748, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01977301, + "balance_loss_mlp": 1.03736472, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.743695857232765, + "language_loss": 0.68367881, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70503902, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 10201, + "time_per_iteration": 2.451493740081787 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01627123, + "balance_loss_mlp": 1.03434098, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 2.0127297199841747, + "language_loss": 0.83613813, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.8574273, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10202, + "time_per_iteration": 2.469893217086792 + }, + { + "auxiliary_loss_clip": 0.01027559, + "auxiliary_loss_mlp": 0.01002547, + "balance_loss_clip": 1.00130165, + "balance_loss_mlp": 1.00619066, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 1.0897383842290518, + "language_loss": 0.67103815, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69133925, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21386719, + "step": 10203, + "time_per_iteration": 3.1407535076141357 + }, + { + "auxiliary_loss_clip": 0.01104451, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.01798368, + "balance_loss_mlp": 1.03650403, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 2.800089510822399, + "language_loss": 0.61266363, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63400525, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10204, + "time_per_iteration": 2.683048963546753 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.0147351, + "balance_loss_mlp": 1.03410578, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.8212112064426345, + "language_loss": 0.72582424, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74708724, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10205, + "time_per_iteration": 2.5465259552001953 + }, + { + "auxiliary_loss_clip": 0.01100873, + "auxiliary_loss_mlp": 0.01023206, + "balance_loss_clip": 1.01134467, + "balance_loss_mlp": 1.03527784, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.5866781109951742, + "language_loss": 0.75862819, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.77986902, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 10206, + "time_per_iteration": 2.484109401702881 + }, + { + "auxiliary_loss_clip": 0.01105453, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.01753414, + "balance_loss_mlp": 1.03659487, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 1.9470428402611015, + "language_loss": 0.75471091, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77606046, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10207, + "time_per_iteration": 2.4940414428710938 + }, + { + "auxiliary_loss_clip": 0.01105401, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.02424574, + "balance_loss_mlp": 1.03734899, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.0213582004112336, + "language_loss": 0.82293832, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84434605, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10208, + "time_per_iteration": 2.4401795864105225 + }, + { + "auxiliary_loss_clip": 0.01109978, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01718831, + "balance_loss_mlp": 1.03974009, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 2.3284175830302365, + "language_loss": 0.72680509, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74820334, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10209, + "time_per_iteration": 2.5886876583099365 + }, + { + "auxiliary_loss_clip": 0.01104268, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02388787, + "balance_loss_mlp": 1.0384059, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6658761229718997, + "language_loss": 0.74108303, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76248324, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 10210, + "time_per_iteration": 2.521304130554199 + }, + { + "auxiliary_loss_clip": 0.01027276, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00248551, + "balance_loss_mlp": 1.00609028, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8595111756805056, + "language_loss": 0.65022087, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67053032, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21191406, + "step": 10211, + "time_per_iteration": 3.2215003967285156 + }, + { + "auxiliary_loss_clip": 0.01104002, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02478802, + "balance_loss_mlp": 1.03655624, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.6436955201310604, + "language_loss": 0.75708187, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77849603, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 10212, + "time_per_iteration": 2.4642996788024902 + }, + { + "auxiliary_loss_clip": 0.01106038, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.0206902, + "balance_loss_mlp": 1.03837156, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.5692336608665938, + "language_loss": 0.74044585, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76183337, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10213, + "time_per_iteration": 2.5178582668304443 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.01943398, + "balance_loss_mlp": 1.03825283, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.5485308182437552, + "language_loss": 0.73049855, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75191492, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10214, + "time_per_iteration": 2.4716460704803467 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01949024, + "balance_loss_mlp": 1.03701019, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.7742338479763222, + "language_loss": 0.74487185, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76626021, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10215, + "time_per_iteration": 2.507734537124634 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.02031636, + "balance_loss_mlp": 1.03673744, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.8655230442391189, + "language_loss": 0.78393024, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80528927, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 10216, + "time_per_iteration": 2.479534864425659 + }, + { + "auxiliary_loss_clip": 0.01104623, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01762557, + "balance_loss_mlp": 1.0373491, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.5637363675830254, + "language_loss": 0.80079889, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82214725, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 10217, + "time_per_iteration": 2.4395620822906494 + }, + { + "auxiliary_loss_clip": 0.01105204, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.01397753, + "balance_loss_mlp": 1.0361073, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.424574581231863, + "language_loss": 0.78246987, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80378485, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 10218, + "time_per_iteration": 3.851706027984619 + }, + { + "auxiliary_loss_clip": 0.01105535, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01911902, + "balance_loss_mlp": 1.038481, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.517262895370751, + "language_loss": 0.81908238, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84044778, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10219, + "time_per_iteration": 2.5016467571258545 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.01553547, + "balance_loss_mlp": 1.03468263, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.8306132213683777, + "language_loss": 0.66681564, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.6881398, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10220, + "time_per_iteration": 4.022945404052734 + }, + { + "auxiliary_loss_clip": 0.01101764, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.01547968, + "balance_loss_mlp": 1.03572094, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.9547432893761034, + "language_loss": 0.71545637, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73674214, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 10221, + "time_per_iteration": 3.984619617462158 + }, + { + "auxiliary_loss_clip": 0.01102691, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01677299, + "balance_loss_mlp": 1.03571272, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.6401613716258656, + "language_loss": 0.79416037, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81546843, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10222, + "time_per_iteration": 2.482626438140869 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03836894, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 4.215986026899438, + "language_loss": 0.76034737, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78172994, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 10223, + "time_per_iteration": 3.9831442832946777 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01629829, + "balance_loss_mlp": 1.03779078, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.127618755426409, + "language_loss": 0.78459811, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80590385, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10224, + "time_per_iteration": 2.43497896194458 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01025021, + "balance_loss_clip": 1.01433933, + "balance_loss_mlp": 1.03477085, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.3024527007974456, + "language_loss": 0.66392958, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68517995, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 10225, + "time_per_iteration": 2.845883369445801 + }, + { + "auxiliary_loss_clip": 0.01107388, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01713014, + "balance_loss_mlp": 1.03713298, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.1585029045138415, + "language_loss": 0.63199341, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65335715, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 10226, + "time_per_iteration": 2.555772304534912 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.01831996, + "balance_loss_mlp": 1.03651762, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.704417913895937, + "language_loss": 0.75513506, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77648973, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 10227, + "time_per_iteration": 2.536123514175415 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.022928, + "balance_loss_mlp": 1.03667367, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 1.8551652476106548, + "language_loss": 0.61097801, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.63240612, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 10228, + "time_per_iteration": 2.419962167739868 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.01778531, + "balance_loss_mlp": 1.03672791, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3329712414655954, + "language_loss": 0.74049234, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76182348, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10229, + "time_per_iteration": 2.506852626800537 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.0202601, + "balance_loss_mlp": 1.03880942, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.7175132302354088, + "language_loss": 0.77996862, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80135846, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 10230, + "time_per_iteration": 2.477675199508667 + }, + { + "auxiliary_loss_clip": 0.0110355, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.01371837, + "balance_loss_mlp": 1.03570461, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.6621971423553226, + "language_loss": 0.72935748, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75064254, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 10231, + "time_per_iteration": 2.561504602432251 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03801215, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.5773460676573843, + "language_loss": 0.6960876, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71744496, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 10232, + "time_per_iteration": 2.5435595512390137 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.0169704, + "balance_loss_mlp": 1.03564286, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9335513310183938, + "language_loss": 0.91684914, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.9381339, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 10233, + "time_per_iteration": 2.464128017425537 + }, + { + "auxiliary_loss_clip": 0.01104077, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.02258682, + "balance_loss_mlp": 1.03727007, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.693429608694219, + "language_loss": 0.71381217, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73518384, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.66796875, + "step": 10234, + "time_per_iteration": 2.484847068786621 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01926732, + "balance_loss_mlp": 1.03558159, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 1.9863568991468559, + "language_loss": 0.66966361, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69102716, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10235, + "time_per_iteration": 2.499189853668213 + }, + { + "auxiliary_loss_clip": 0.01106455, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.01746273, + "balance_loss_mlp": 1.0364213, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 2.269392311324668, + "language_loss": 0.81321824, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83457547, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10236, + "time_per_iteration": 2.467374086380005 + }, + { + "auxiliary_loss_clip": 0.01106752, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.03654408, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.4348801753525875, + "language_loss": 0.80595863, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82732141, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.703125, + "step": 10237, + "time_per_iteration": 2.4921953678131104 + }, + { + "auxiliary_loss_clip": 0.01103597, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.02533984, + "balance_loss_mlp": 1.03659725, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.8067050747817437, + "language_loss": 0.7606861, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78209013, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10238, + "time_per_iteration": 2.5077149868011475 + }, + { + "auxiliary_loss_clip": 0.01027367, + "auxiliary_loss_mlp": 0.01004239, + "balance_loss_clip": 1.00305295, + "balance_loss_mlp": 1.00621736, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7636645723592903, + "language_loss": 0.57658124, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.5968973, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21191406, + "step": 10239, + "time_per_iteration": 3.0781197547912598 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.02284706, + "balance_loss_mlp": 1.03747571, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 2.10217205787335, + "language_loss": 0.77644312, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79785573, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10240, + "time_per_iteration": 2.4440581798553467 + }, + { + "auxiliary_loss_clip": 0.01107517, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02069855, + "balance_loss_mlp": 1.03847337, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.3418662553679495, + "language_loss": 0.72875106, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75015438, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10241, + "time_per_iteration": 2.440458059310913 + }, + { + "auxiliary_loss_clip": 0.01102041, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.01599121, + "balance_loss_mlp": 1.03596628, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.629664240741642, + "language_loss": 0.71536696, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.73665738, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 10242, + "time_per_iteration": 2.465280771255493 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.01760554, + "balance_loss_mlp": 1.03812838, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 1.7806476568458218, + "language_loss": 0.72179866, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74313986, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10243, + "time_per_iteration": 2.4706227779388428 + }, + { + "auxiliary_loss_clip": 0.01026424, + "auxiliary_loss_mlp": 0.01003264, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00526905, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7683330535495017, + "language_loss": 0.5684256, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58872247, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.2109375, + "step": 10244, + "time_per_iteration": 3.108367919921875 + }, + { + "auxiliary_loss_clip": 0.01104886, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.01563978, + "balance_loss_mlp": 1.03710341, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.7714653532708287, + "language_loss": 0.63837689, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65970469, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 10245, + "time_per_iteration": 2.5604476928710938 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.01664448, + "balance_loss_mlp": 1.03589809, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.7050556240908794, + "language_loss": 0.78958333, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81088758, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10246, + "time_per_iteration": 2.6499507427215576 + }, + { + "auxiliary_loss_clip": 0.01109766, + "auxiliary_loss_mlp": 0.01044472, + "balance_loss_clip": 1.03114414, + "balance_loss_mlp": 1.03871059, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.2268806206076586, + "language_loss": 0.87346923, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89501166, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10247, + "time_per_iteration": 2.53155517578125 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.01996708, + "balance_loss_mlp": 1.03957379, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 2.019293099257412, + "language_loss": 0.80015755, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82155472, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 10248, + "time_per_iteration": 2.565202236175537 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.01873195, + "balance_loss_mlp": 1.03529978, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.9553906460889976, + "language_loss": 0.8661859, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.88749832, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 10249, + "time_per_iteration": 2.5155153274536133 + }, + { + "auxiliary_loss_clip": 0.01104366, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.01438522, + "balance_loss_mlp": 1.03663516, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 1.9896565394121724, + "language_loss": 0.6859656, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70727801, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 10250, + "time_per_iteration": 2.6529786586761475 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01025601, + "balance_loss_clip": 1.01488972, + "balance_loss_mlp": 1.03547108, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.9258007321652242, + "language_loss": 0.74149621, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76273632, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 10251, + "time_per_iteration": 2.5420565605163574 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.01518464, + "balance_loss_mlp": 1.03652811, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.78922632869985, + "language_loss": 0.68291706, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70423007, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 10252, + "time_per_iteration": 2.5236093997955322 + }, + { + "auxiliary_loss_clip": 0.01026564, + "auxiliary_loss_mlp": 0.01001879, + "balance_loss_clip": 1.0006336, + "balance_loss_mlp": 1.00562644, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8837133823521999, + "language_loss": 0.57868779, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.5989722, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.20898438, + "step": 10253, + "time_per_iteration": 3.103968858718872 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01609635, + "balance_loss_mlp": 1.03543723, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.4349605702857906, + "language_loss": 0.79628026, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81759632, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10254, + "time_per_iteration": 2.4770078659057617 + }, + { + "auxiliary_loss_clip": 0.01107997, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0167551, + "balance_loss_mlp": 1.03706634, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.765232531729237, + "language_loss": 0.80340689, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82477272, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7109375, + "step": 10255, + "time_per_iteration": 2.4666266441345215 + }, + { + "auxiliary_loss_clip": 0.01108694, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.0197928, + "balance_loss_mlp": 1.03867257, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 1.7468186030679946, + "language_loss": 0.65269709, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.6741125, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 10256, + "time_per_iteration": 2.514446973800659 + }, + { + "auxiliary_loss_clip": 0.01103556, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.0371418, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.5636561949397187, + "language_loss": 0.71758097, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73889816, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10257, + "time_per_iteration": 2.4575183391571045 + }, + { + "auxiliary_loss_clip": 0.01108721, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01625657, + "balance_loss_mlp": 1.03909421, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 2.0856421908029192, + "language_loss": 0.72058862, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74196231, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10258, + "time_per_iteration": 2.4590466022491455 + }, + { + "auxiliary_loss_clip": 0.01105581, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01888824, + "balance_loss_mlp": 1.03705239, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.864696001888572, + "language_loss": 0.63946176, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66082585, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10259, + "time_per_iteration": 2.472621202468872 + }, + { + "auxiliary_loss_clip": 0.01103568, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.02020216, + "balance_loss_mlp": 1.03760934, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.8983508996193146, + "language_loss": 0.71194589, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73330671, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 10260, + "time_per_iteration": 3.8351244926452637 + }, + { + "auxiliary_loss_clip": 0.01112265, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.0169692, + "balance_loss_mlp": 1.04087448, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 1.8640894588611543, + "language_loss": 0.68213212, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70356077, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 10261, + "time_per_iteration": 2.4846863746643066 + }, + { + "auxiliary_loss_clip": 0.01103737, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_clip": 1.02671063, + "balance_loss_mlp": 1.03602839, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.7606752411550333, + "language_loss": 0.71393299, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73534954, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 10262, + "time_per_iteration": 3.8463478088378906 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.03685451, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.9300485767677382, + "language_loss": 0.70171946, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72308946, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10263, + "time_per_iteration": 3.8719136714935303 + }, + { + "auxiliary_loss_clip": 0.01106014, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.01676631, + "balance_loss_mlp": 1.03678763, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 8.265893448617778, + "language_loss": 0.75888687, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78023094, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10264, + "time_per_iteration": 3.9576797485351562 + }, + { + "auxiliary_loss_clip": 0.01105756, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01682043, + "balance_loss_mlp": 1.03773212, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.621461269637815, + "language_loss": 0.85138124, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87273085, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10265, + "time_per_iteration": 2.4204261302948 + }, + { + "auxiliary_loss_clip": 0.01103728, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.02098346, + "balance_loss_mlp": 1.03761029, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 3.6073790517357995, + "language_loss": 0.642869, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66423583, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10266, + "time_per_iteration": 2.5135982036590576 + }, + { + "auxiliary_loss_clip": 0.0110251, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.03433692, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.7504973624629372, + "language_loss": 0.75734687, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77869165, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10267, + "time_per_iteration": 2.4403936862945557 + }, + { + "auxiliary_loss_clip": 0.01107183, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.0146544, + "balance_loss_mlp": 1.0368762, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.5812909664018504, + "language_loss": 0.74722588, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.7685672, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10268, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.01108432, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.01477456, + "balance_loss_mlp": 1.03709388, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.6692354192517487, + "language_loss": 0.75483018, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77618486, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71484375, + "step": 10269, + "time_per_iteration": 2.427558660507202 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01025701, + "balance_loss_clip": 1.01379776, + "balance_loss_mlp": 1.03455544, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.2351967956552987, + "language_loss": 0.76565802, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78693628, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 10270, + "time_per_iteration": 2.441521644592285 + }, + { + "auxiliary_loss_clip": 0.01106104, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.01432252, + "balance_loss_mlp": 1.03741896, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.7948450339640445, + "language_loss": 0.82511967, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84644157, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10271, + "time_per_iteration": 2.427300453186035 + }, + { + "auxiliary_loss_clip": 0.01105866, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01937735, + "balance_loss_mlp": 1.03691125, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 2.1955459228647687, + "language_loss": 0.76878774, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79016083, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10272, + "time_per_iteration": 2.4983582496643066 + }, + { + "auxiliary_loss_clip": 0.01028751, + "auxiliary_loss_mlp": 0.0099819, + "balance_loss_clip": 0.99684906, + "balance_loss_mlp": 1.00760865, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8101209602428692, + "language_loss": 0.59128773, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61155713, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.21191406, + "step": 10273, + "time_per_iteration": 2.9302847385406494 + }, + { + "auxiliary_loss_clip": 0.01103173, + "auxiliary_loss_mlp": 0.01026931, + "balance_loss_clip": 1.01496243, + "balance_loss_mlp": 1.03603625, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.336605024454028, + "language_loss": 0.72963846, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75093955, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 10274, + "time_per_iteration": 2.4481325149536133 + }, + { + "auxiliary_loss_clip": 0.01104274, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.01643395, + "balance_loss_mlp": 1.03684974, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 3.0133252214936372, + "language_loss": 0.77358514, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79490566, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10275, + "time_per_iteration": 2.4196648597717285 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.01589358, + "balance_loss_mlp": 1.03650546, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.6767450105474386, + "language_loss": 0.79291052, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81423116, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10276, + "time_per_iteration": 2.5229239463806152 + }, + { + "auxiliary_loss_clip": 0.01107984, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02068734, + "balance_loss_mlp": 1.0383606, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 2.1695107159415525, + "language_loss": 0.8092519, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83066452, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10277, + "time_per_iteration": 2.419820547103882 + }, + { + "auxiliary_loss_clip": 0.01107683, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03688812, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.6341046500403578, + "language_loss": 0.81401992, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83541107, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10278, + "time_per_iteration": 2.451904058456421 + }, + { + "auxiliary_loss_clip": 0.01104247, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.02002394, + "balance_loss_mlp": 1.03586221, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.4680885836846245, + "language_loss": 0.73827434, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75963408, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10279, + "time_per_iteration": 2.4702413082122803 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01637602, + "balance_loss_mlp": 1.03415704, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.5792662413822172, + "language_loss": 0.7052443, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72653878, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10280, + "time_per_iteration": 2.439377784729004 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.02039266, + "balance_loss_mlp": 1.03673506, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.433650263791477, + "language_loss": 0.72634661, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74772483, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10281, + "time_per_iteration": 2.4201571941375732 + }, + { + "auxiliary_loss_clip": 0.01100944, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.0367198, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.5669625672401193, + "language_loss": 0.76539791, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78671277, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 10282, + "time_per_iteration": 2.4729509353637695 + }, + { + "auxiliary_loss_clip": 0.01105858, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01784086, + "balance_loss_mlp": 1.03611851, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.557918367732971, + "language_loss": 0.69140053, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71277922, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.69921875, + "step": 10283, + "time_per_iteration": 2.4644439220428467 + }, + { + "auxiliary_loss_clip": 0.01110819, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.01680923, + "balance_loss_mlp": 1.03751874, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.5819420485757947, + "language_loss": 0.74983263, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77124047, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 10284, + "time_per_iteration": 2.4563488960266113 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02219248, + "balance_loss_mlp": 1.03690124, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4660610214457293, + "language_loss": 0.75491369, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77626395, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 10285, + "time_per_iteration": 2.4554288387298584 + }, + { + "auxiliary_loss_clip": 0.01103991, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.0199194, + "balance_loss_mlp": 1.03520298, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.5161367182822474, + "language_loss": 0.7299751, + "learning_rate": 1.342396663517503e-06, + "loss": 0.751333, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10286, + "time_per_iteration": 2.487755060195923 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01025604, + "balance_loss_clip": 1.01424325, + "balance_loss_mlp": 1.03537941, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 2.03959974890174, + "language_loss": 0.75874734, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78003013, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10287, + "time_per_iteration": 2.4449198246002197 + }, + { + "auxiliary_loss_clip": 0.0110312, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.01973581, + "balance_loss_mlp": 1.03618407, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.6506833358218813, + "language_loss": 0.72823429, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.74957871, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10288, + "time_per_iteration": 2.469217538833618 + }, + { + "auxiliary_loss_clip": 0.01099107, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.03515327, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4866118467097145, + "language_loss": 0.72703552, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74835199, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 10289, + "time_per_iteration": 2.6342008113861084 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03557706, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.5657368356700847, + "language_loss": 0.79090887, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81226832, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10290, + "time_per_iteration": 2.4762990474700928 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.01883626, + "balance_loss_mlp": 1.03720856, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6315677183830801, + "language_loss": 0.81586653, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83724689, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10291, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01103179, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01962161, + "balance_loss_mlp": 1.0360167, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.907685541449211, + "language_loss": 0.77654225, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.7978884, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10292, + "time_per_iteration": 2.4810614585876465 + }, + { + "auxiliary_loss_clip": 0.01111234, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.02459502, + "balance_loss_mlp": 1.03891051, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.9028504578301217, + "language_loss": 0.737167, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75866383, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 10293, + "time_per_iteration": 2.516528844833374 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02110207, + "balance_loss_mlp": 1.03902757, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 2.0122354574742602, + "language_loss": 0.83089775, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.85229266, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 10294, + "time_per_iteration": 2.499441623687744 + }, + { + "auxiliary_loss_clip": 0.01106207, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.01833987, + "balance_loss_mlp": 1.03719449, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.183209160789612, + "language_loss": 0.70951724, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.73088086, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10295, + "time_per_iteration": 2.4442856311798096 + }, + { + "auxiliary_loss_clip": 0.01105622, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.02100849, + "balance_loss_mlp": 1.03787184, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.6245756110977043, + "language_loss": 0.70113528, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72252154, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10296, + "time_per_iteration": 2.636453866958618 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.03864932, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 2.076478179664887, + "language_loss": 0.71677291, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73819137, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 10297, + "time_per_iteration": 2.487703800201416 + }, + { + "auxiliary_loss_clip": 0.01028294, + "auxiliary_loss_mlp": 0.00997518, + "balance_loss_clip": 0.9962309, + "balance_loss_mlp": 1.00701296, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8802858185205813, + "language_loss": 0.64150029, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66175842, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21289062, + "step": 10298, + "time_per_iteration": 2.959296226501465 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.03661466, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.6984885948159927, + "language_loss": 0.74105954, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76244044, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10299, + "time_per_iteration": 2.450899124145508 + }, + { + "auxiliary_loss_clip": 0.01109628, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.02075887, + "balance_loss_mlp": 1.0376761, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.8344519767478165, + "language_loss": 0.68278986, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70421433, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 10300, + "time_per_iteration": 2.4547624588012695 + }, + { + "auxiliary_loss_clip": 0.01108413, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.0215776, + "balance_loss_mlp": 1.03889441, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.8244494071351975, + "language_loss": 0.66936946, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69078887, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10301, + "time_per_iteration": 2.467451810836792 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.018013, + "balance_loss_mlp": 1.03542924, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 2.0193419698977317, + "language_loss": 0.73042768, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.75176305, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10302, + "time_per_iteration": 4.012500762939453 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01923048, + "balance_loss_mlp": 1.03690219, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.8664060987198585, + "language_loss": 0.80371857, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82508844, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10303, + "time_per_iteration": 2.437244176864624 + }, + { + "auxiliary_loss_clip": 0.01107499, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.01827395, + "balance_loss_mlp": 1.03684223, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.5617333087985545, + "language_loss": 0.76300073, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78439015, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10304, + "time_per_iteration": 3.8231778144836426 + }, + { + "auxiliary_loss_clip": 0.01110648, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.02169371, + "balance_loss_mlp": 1.03864741, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 2.062841626901626, + "language_loss": 0.77207863, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79352599, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 10305, + "time_per_iteration": 5.318151473999023 + }, + { + "auxiliary_loss_clip": 0.01111243, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.01918018, + "balance_loss_mlp": 1.03931832, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6773478766205938, + "language_loss": 0.78826416, + "learning_rate": 1.335045524968045e-06, + "loss": 0.80970484, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 10306, + "time_per_iteration": 2.4717702865600586 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01026237, + "balance_loss_clip": 1.01576495, + "balance_loss_mlp": 1.03520381, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.579957954838489, + "language_loss": 0.79917157, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82043117, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 10307, + "time_per_iteration": 2.51257586479187 + }, + { + "auxiliary_loss_clip": 0.01027759, + "auxiliary_loss_mlp": 0.00997846, + "balance_loss_clip": 0.99666041, + "balance_loss_mlp": 1.00661421, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8254095728079679, + "language_loss": 0.59419918, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61445522, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.2109375, + "step": 10308, + "time_per_iteration": 3.087841510772705 + }, + { + "auxiliary_loss_clip": 0.01102523, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01634467, + "balance_loss_mlp": 1.0360744, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.8503774737615284, + "language_loss": 0.67855436, + "learning_rate": 1.333943721384037e-06, + "loss": 0.69984901, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 10309, + "time_per_iteration": 2.516601800918579 + }, + { + "auxiliary_loss_clip": 0.01105412, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.01924789, + "balance_loss_mlp": 1.03811872, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.5770368221477629, + "language_loss": 0.71985435, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74122059, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 10310, + "time_per_iteration": 2.4543659687042236 + }, + { + "auxiliary_loss_clip": 0.01109202, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01870525, + "balance_loss_mlp": 1.03908801, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 1.8624693813193853, + "language_loss": 0.78939658, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81080884, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10311, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.01957762, + "balance_loss_mlp": 1.03495574, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.9506851073512315, + "language_loss": 0.72994781, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75132203, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10312, + "time_per_iteration": 2.4388468265533447 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01035173, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.0381484, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 2.1707252036738502, + "language_loss": 0.71927798, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.7407068, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10313, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.01108842, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01780641, + "balance_loss_mlp": 1.03789592, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8177190018334353, + "language_loss": 0.78071815, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80211347, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10314, + "time_per_iteration": 2.4607138633728027 + }, + { + "auxiliary_loss_clip": 0.01105035, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.02113914, + "balance_loss_mlp": 1.03498077, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.7685018834569248, + "language_loss": 0.78155088, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80293512, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 10315, + "time_per_iteration": 2.428445816040039 + }, + { + "auxiliary_loss_clip": 0.01109232, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02145982, + "balance_loss_mlp": 1.03922391, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 2.596321726125175, + "language_loss": 0.76265639, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78408277, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 10316, + "time_per_iteration": 2.463766098022461 + }, + { + "auxiliary_loss_clip": 0.01105873, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.01823497, + "balance_loss_mlp": 1.0344758, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 1.9163692596467958, + "language_loss": 0.77438551, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79575109, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 10317, + "time_per_iteration": 2.468884229660034 + }, + { + "auxiliary_loss_clip": 0.01027239, + "auxiliary_loss_mlp": 0.00999035, + "balance_loss_clip": 0.99786037, + "balance_loss_mlp": 1.00593257, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6919425802260456, + "language_loss": 0.59057474, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61083746, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21289062, + "step": 10318, + "time_per_iteration": 3.090552568435669 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.02163601, + "balance_loss_mlp": 1.03937101, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.6841357417658411, + "language_loss": 0.77685571, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79827732, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10319, + "time_per_iteration": 2.4693212509155273 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01963234, + "balance_loss_mlp": 1.03732896, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 2.3469109769721377, + "language_loss": 0.66256416, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68392456, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 10320, + "time_per_iteration": 2.447006940841675 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.01918244, + "balance_loss_mlp": 1.03645897, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.6640363363170714, + "language_loss": 0.76396954, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78529894, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10321, + "time_per_iteration": 2.439819574356079 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.01653743, + "balance_loss_mlp": 1.03596795, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.7446721342838176, + "language_loss": 0.73165452, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75296265, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10322, + "time_per_iteration": 2.4455277919769287 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.01024456, + "balance_loss_clip": 1.01309574, + "balance_loss_mlp": 1.03739095, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 2.5506684456453157, + "language_loss": 0.73217744, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.75347304, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10323, + "time_per_iteration": 2.4893054962158203 + }, + { + "auxiliary_loss_clip": 0.01114414, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086651, + "balance_loss_mlp": 1.04062796, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.3064550645164354, + "language_loss": 0.58989835, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.61137784, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 10324, + "time_per_iteration": 2.4318976402282715 + }, + { + "auxiliary_loss_clip": 0.01108806, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.01746607, + "balance_loss_mlp": 1.03886914, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 2.054520538169497, + "language_loss": 0.76530892, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78669918, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 10325, + "time_per_iteration": 2.4457478523254395 + }, + { + "auxiliary_loss_clip": 0.01107557, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01502085, + "balance_loss_mlp": 1.03696799, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.7674606629656198, + "language_loss": 0.72749656, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74884826, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 10326, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02126646, + "balance_loss_mlp": 1.03798246, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 3.158836515239834, + "language_loss": 0.73515177, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.75657719, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10327, + "time_per_iteration": 2.4524545669555664 + }, + { + "auxiliary_loss_clip": 0.01109109, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01832175, + "balance_loss_mlp": 1.03808546, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.016240551650266, + "language_loss": 0.7945962, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81599987, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 10328, + "time_per_iteration": 2.4385621547698975 + }, + { + "auxiliary_loss_clip": 0.01108206, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02380335, + "balance_loss_mlp": 1.03790045, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.9358397907066565, + "language_loss": 0.77753472, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.79898405, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10329, + "time_per_iteration": 2.462999105453491 + }, + { + "auxiliary_loss_clip": 0.01028614, + "auxiliary_loss_mlp": 0.01005403, + "balance_loss_clip": 1.00426447, + "balance_loss_mlp": 1.00714183, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8271913018767197, + "language_loss": 0.62140441, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64174461, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21484375, + "step": 10330, + "time_per_iteration": 3.0160677433013916 + }, + { + "auxiliary_loss_clip": 0.01111605, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.02014053, + "balance_loss_mlp": 1.03902602, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.119882521955809, + "language_loss": 0.77734917, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79879665, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10331, + "time_per_iteration": 2.489560842514038 + }, + { + "auxiliary_loss_clip": 0.01110147, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.0218091, + "balance_loss_mlp": 1.0385623, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 2.1826239313183486, + "language_loss": 0.67408252, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69552743, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 10332, + "time_per_iteration": 2.425645112991333 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.01532817, + "balance_loss_mlp": 1.03766382, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.6359189592805878, + "language_loss": 0.76677281, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78811944, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10333, + "time_per_iteration": 2.4364230632781982 + }, + { + "auxiliary_loss_clip": 0.01105905, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.03827369, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.0485781293514793, + "language_loss": 0.69575661, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71710348, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 10334, + "time_per_iteration": 2.4257168769836426 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01766801, + "balance_loss_mlp": 1.03944373, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 2.0078352045306507, + "language_loss": 0.70201457, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72337818, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 10335, + "time_per_iteration": 2.47383451461792 + }, + { + "auxiliary_loss_clip": 0.01102603, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.01889277, + "balance_loss_mlp": 1.03563762, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.47099412595651, + "language_loss": 0.80045199, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82178366, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 10336, + "time_per_iteration": 2.476863145828247 + }, + { + "auxiliary_loss_clip": 0.01103545, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.01593423, + "balance_loss_mlp": 1.03639817, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.8768203229000895, + "language_loss": 0.73504305, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75635779, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10337, + "time_per_iteration": 2.4732797145843506 + }, + { + "auxiliary_loss_clip": 0.01109544, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01764417, + "balance_loss_mlp": 1.03801644, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 1.8614452301224431, + "language_loss": 0.63164204, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65304667, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 10338, + "time_per_iteration": 2.4973182678222656 + }, + { + "auxiliary_loss_clip": 0.01106095, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.01755667, + "balance_loss_mlp": 1.03789639, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 2.390170977530988, + "language_loss": 0.71337169, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73472571, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 10339, + "time_per_iteration": 2.47871994972229 + }, + { + "auxiliary_loss_clip": 0.01103361, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.01571906, + "balance_loss_mlp": 1.03734398, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.5831202152699189, + "language_loss": 0.69323343, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71454197, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 10340, + "time_per_iteration": 2.445570707321167 + }, + { + "auxiliary_loss_clip": 0.0110187, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.0154407, + "balance_loss_mlp": 1.03529525, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 3.3727615102843513, + "language_loss": 0.68661916, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70790917, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 10341, + "time_per_iteration": 2.439035415649414 + }, + { + "auxiliary_loss_clip": 0.01107972, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01605105, + "balance_loss_mlp": 1.03783154, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 2.06494623621423, + "language_loss": 0.81278366, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83415759, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10342, + "time_per_iteration": 2.445607900619507 + }, + { + "auxiliary_loss_clip": 0.01027883, + "auxiliary_loss_mlp": 0.01003385, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00662279, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7785995287469357, + "language_loss": 0.57325292, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59356558, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21289062, + "step": 10343, + "time_per_iteration": 4.364051342010498 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01023841, + "balance_loss_clip": 1.01308846, + "balance_loss_mlp": 1.03629875, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.780281281905301, + "language_loss": 0.72907692, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75033712, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 10344, + "time_per_iteration": 2.4766275882720947 + }, + { + "auxiliary_loss_clip": 0.01107045, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02209353, + "balance_loss_mlp": 1.03881705, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.738872083076136, + "language_loss": 0.59990644, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62131059, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 10345, + "time_per_iteration": 2.541123390197754 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.01948929, + "balance_loss_mlp": 1.0357126, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.9219019260210024, + "language_loss": 0.78273392, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80409932, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10346, + "time_per_iteration": 5.315351724624634 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01602221, + "balance_loss_mlp": 1.03580999, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.6418210301478282, + "language_loss": 0.71802652, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73937929, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 10347, + "time_per_iteration": 2.497929334640503 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.01028399, + "balance_loss_clip": 1.01597118, + "balance_loss_mlp": 1.03503013, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.625266135857152, + "language_loss": 0.71975756, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74106789, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 10348, + "time_per_iteration": 3.9235568046569824 + }, + { + "auxiliary_loss_clip": 0.01026634, + "auxiliary_loss_mlp": 0.01006199, + "balance_loss_clip": 1.00494766, + "balance_loss_mlp": 1.00541496, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8371335682612564, + "language_loss": 0.54224485, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56257325, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21289062, + "step": 10349, + "time_per_iteration": 3.0496747493743896 + }, + { + "auxiliary_loss_clip": 0.0110532, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.01496863, + "balance_loss_mlp": 1.03663087, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.1582584328539594, + "language_loss": 0.69793445, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71925557, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10350, + "time_per_iteration": 2.470149278640747 + }, + { + "auxiliary_loss_clip": 0.0110629, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02183747, + "balance_loss_mlp": 1.03684473, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.9147448057982832, + "language_loss": 0.56816912, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.58957094, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10351, + "time_per_iteration": 2.505211114883423 + }, + { + "auxiliary_loss_clip": 0.01026374, + "auxiliary_loss_mlp": 0.00993206, + "balance_loss_clip": 0.99188894, + "balance_loss_mlp": 1.00534272, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8115156894720258, + "language_loss": 0.61159444, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63179016, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.2109375, + "step": 10352, + "time_per_iteration": 3.021286725997925 + }, + { + "auxiliary_loss_clip": 0.01101568, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.017488, + "balance_loss_mlp": 1.0351944, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.081556495777929, + "language_loss": 0.81940329, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84071267, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 10353, + "time_per_iteration": 2.496713638305664 + }, + { + "auxiliary_loss_clip": 0.010991, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.01638615, + "balance_loss_mlp": 1.03455448, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.5710771766627751, + "language_loss": 0.7576375, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77890158, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 10354, + "time_per_iteration": 2.4855527877807617 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.01658213, + "balance_loss_mlp": 1.03609419, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.4655004554762274, + "language_loss": 0.78727663, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80859846, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10355, + "time_per_iteration": 2.445819616317749 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.03856397, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.5925757296601037, + "language_loss": 0.78048426, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80185014, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 10356, + "time_per_iteration": 2.4893651008605957 + }, + { + "auxiliary_loss_clip": 0.01110459, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.01882744, + "balance_loss_mlp": 1.0377419, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.1577973787104923, + "language_loss": 0.67252231, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69394588, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 10357, + "time_per_iteration": 2.4467334747314453 + }, + { + "auxiliary_loss_clip": 0.01111299, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.01744306, + "balance_loss_mlp": 1.03907299, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.813144519953157, + "language_loss": 0.75561357, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.77703738, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 10358, + "time_per_iteration": 2.516791343688965 + }, + { + "auxiliary_loss_clip": 0.01104161, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.01916623, + "balance_loss_mlp": 1.03473985, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.219435804709828, + "language_loss": 0.82639635, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84775025, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10359, + "time_per_iteration": 2.4310834407806396 + }, + { + "auxiliary_loss_clip": 0.01102353, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.02763474, + "balance_loss_mlp": 1.03537011, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.303439975038256, + "language_loss": 0.73551476, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75693059, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 10360, + "time_per_iteration": 2.4032440185546875 + }, + { + "auxiliary_loss_clip": 0.01104376, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.02152276, + "balance_loss_mlp": 1.03514135, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.1872491258589877, + "language_loss": 0.78007793, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.8014614, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 10361, + "time_per_iteration": 2.432612419128418 + }, + { + "auxiliary_loss_clip": 0.01105247, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.01694417, + "balance_loss_mlp": 1.03777361, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 2.0406207393391322, + "language_loss": 0.67669165, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69802934, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10362, + "time_per_iteration": 2.4279119968414307 + }, + { + "auxiliary_loss_clip": 0.01105655, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01772738, + "balance_loss_mlp": 1.03628147, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 1.866995951195316, + "language_loss": 0.67914844, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.70050412, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10363, + "time_per_iteration": 2.5570461750030518 + }, + { + "auxiliary_loss_clip": 0.01107735, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.02156806, + "balance_loss_mlp": 1.03598118, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.8313003501061587, + "language_loss": 0.86500871, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88643348, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 10364, + "time_per_iteration": 2.4293837547302246 + }, + { + "auxiliary_loss_clip": 0.01025186, + "auxiliary_loss_mlp": 0.01006976, + "balance_loss_clip": 1.00571883, + "balance_loss_mlp": 1.00405002, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.884662336082659, + "language_loss": 0.60777593, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62809759, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.2109375, + "step": 10365, + "time_per_iteration": 3.0822458267211914 + }, + { + "auxiliary_loss_clip": 0.01111747, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.02530479, + "balance_loss_mlp": 1.03808904, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 2.2798464083102576, + "language_loss": 0.75205708, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77356946, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73828125, + "step": 10366, + "time_per_iteration": 2.441955804824829 + }, + { + "auxiliary_loss_clip": 0.01108704, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02833033, + "balance_loss_mlp": 1.03776455, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 2.0199414320321725, + "language_loss": 0.76469356, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78618896, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 10367, + "time_per_iteration": 2.477055072784424 + }, + { + "auxiliary_loss_clip": 0.01105026, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.02271938, + "balance_loss_mlp": 1.03831315, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.4367646696128493, + "language_loss": 0.78561807, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80701321, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 10368, + "time_per_iteration": 2.4565787315368652 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.01959956, + "balance_loss_mlp": 1.03783059, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.854629496919494, + "language_loss": 0.68463397, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70603514, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 10369, + "time_per_iteration": 2.495943069458008 + }, + { + "auxiliary_loss_clip": 0.01107955, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.02470601, + "balance_loss_mlp": 1.03846693, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.0672458586121922, + "language_loss": 0.87758917, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.89904487, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 10370, + "time_per_iteration": 2.4028708934783936 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.01450515, + "balance_loss_mlp": 1.03551197, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.4687473894600929, + "language_loss": 0.65925562, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68054819, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 10371, + "time_per_iteration": 2.4908487796783447 + }, + { + "auxiliary_loss_clip": 0.01099208, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01929259, + "balance_loss_mlp": 1.03462815, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.308988821713543, + "language_loss": 0.77547729, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79676664, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6484375, + "step": 10372, + "time_per_iteration": 2.5180232524871826 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01769924, + "balance_loss_mlp": 1.03540146, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.565588018128666, + "language_loss": 0.77423698, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79559469, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10373, + "time_per_iteration": 2.4661612510681152 + }, + { + "auxiliary_loss_clip": 0.01101212, + "auxiliary_loss_mlp": 0.01025569, + "balance_loss_clip": 1.01429188, + "balance_loss_mlp": 1.03523397, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.4815417355827754, + "language_loss": 0.69228935, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71355724, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10374, + "time_per_iteration": 2.473937511444092 + }, + { + "auxiliary_loss_clip": 0.0110711, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.01916742, + "balance_loss_mlp": 1.03731394, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.723426878177341, + "language_loss": 0.77033317, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79171526, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 10375, + "time_per_iteration": 2.437490463256836 + }, + { + "auxiliary_loss_clip": 0.01104528, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.0170275, + "balance_loss_mlp": 1.0379982, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.4731613062232216, + "language_loss": 0.70344281, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72477418, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10376, + "time_per_iteration": 2.6377809047698975 + }, + { + "auxiliary_loss_clip": 0.01108576, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03811753, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 2.3241172647924837, + "language_loss": 0.76568282, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78706658, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10377, + "time_per_iteration": 2.479133367538452 + }, + { + "auxiliary_loss_clip": 0.01106151, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.01860189, + "balance_loss_mlp": 1.03780174, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 1.547563238627933, + "language_loss": 0.67949808, + "learning_rate": 1.308665737227052e-06, + "loss": 0.7008518, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.68359375, + "step": 10378, + "time_per_iteration": 2.4531919956207275 + }, + { + "auxiliary_loss_clip": 0.01104298, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01901162, + "balance_loss_mlp": 1.03584397, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.7868825896573544, + "language_loss": 0.76539075, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78674352, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10379, + "time_per_iteration": 2.489495277404785 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01567876, + "balance_loss_mlp": 1.0352664, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.3567066837187596, + "language_loss": 0.79495847, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81626451, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10380, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.01105137, + "auxiliary_loss_mlp": 0.01028452, + "balance_loss_clip": 1.01738906, + "balance_loss_mlp": 1.03878844, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.5683522336983957, + "language_loss": 0.79919797, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82053387, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10381, + "time_per_iteration": 2.4719154834747314 + }, + { + "auxiliary_loss_clip": 0.01104983, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.02026606, + "balance_loss_mlp": 1.03598738, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.2093057050572606, + "language_loss": 0.74530953, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76668167, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10382, + "time_per_iteration": 2.4555060863494873 + }, + { + "auxiliary_loss_clip": 0.01102662, + "auxiliary_loss_mlp": 0.01025503, + "balance_loss_clip": 1.01423788, + "balance_loss_mlp": 1.03613257, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.3920284041280475, + "language_loss": 0.78429455, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80557621, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10383, + "time_per_iteration": 2.5131173133850098 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.01732409, + "balance_loss_mlp": 1.03612638, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 2.28937629159475, + "language_loss": 0.7478832, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.76920247, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10384, + "time_per_iteration": 2.441364049911499 + }, + { + "auxiliary_loss_clip": 0.01107606, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.01817775, + "balance_loss_mlp": 1.03742898, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 2.8855056380065993, + "language_loss": 0.66313016, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68451071, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 10385, + "time_per_iteration": 3.859321117401123 + }, + { + "auxiliary_loss_clip": 0.01027145, + "auxiliary_loss_mlp": 0.01001461, + "balance_loss_clip": 1.00013185, + "balance_loss_mlp": 1.0058732, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7546932463540804, + "language_loss": 0.62028766, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64057362, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.21289062, + "step": 10386, + "time_per_iteration": 3.106778860092163 + }, + { + "auxiliary_loss_clip": 0.01105241, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.01933956, + "balance_loss_mlp": 1.03560019, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.5221123793522247, + "language_loss": 0.7170524, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.73842406, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10387, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.02493882, + "balance_loss_mlp": 1.03753424, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.0526196711418345, + "language_loss": 0.65366501, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67515868, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 10388, + "time_per_iteration": 5.378544330596924 + }, + { + "auxiliary_loss_clip": 0.01104574, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.01566386, + "balance_loss_mlp": 1.03606319, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.6446610432064326, + "language_loss": 0.79204857, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81336558, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10389, + "time_per_iteration": 3.85504150390625 + }, + { + "auxiliary_loss_clip": 0.01103741, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.02053928, + "balance_loss_mlp": 1.03604019, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.9237323307273804, + "language_loss": 0.60423774, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62559879, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 10390, + "time_per_iteration": 2.4648008346557617 + }, + { + "auxiliary_loss_clip": 0.01107504, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.02080107, + "balance_loss_mlp": 1.03688002, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.88087186985586, + "language_loss": 0.77647173, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79787791, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 10391, + "time_per_iteration": 2.4204020500183105 + }, + { + "auxiliary_loss_clip": 0.01107712, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.01849914, + "balance_loss_mlp": 1.03854263, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.9064599500175736, + "language_loss": 0.64700288, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.6683929, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 10392, + "time_per_iteration": 2.6868064403533936 + }, + { + "auxiliary_loss_clip": 0.01108711, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.0189693, + "balance_loss_mlp": 1.03795576, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.715075150061653, + "language_loss": 0.76449108, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78589016, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 10393, + "time_per_iteration": 2.5002684593200684 + }, + { + "auxiliary_loss_clip": 0.01109321, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.02361488, + "balance_loss_mlp": 1.03849423, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7032519811811655, + "language_loss": 0.82738161, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84883797, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 10394, + "time_per_iteration": 2.5074119567871094 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01968956, + "balance_loss_mlp": 1.03777504, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.7635560366961225, + "language_loss": 0.75053072, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77194268, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10395, + "time_per_iteration": 2.4207139015197754 + }, + { + "auxiliary_loss_clip": 0.01106696, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.01774108, + "balance_loss_mlp": 1.03590536, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.116778231139036, + "language_loss": 0.72623551, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74760246, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 10396, + "time_per_iteration": 2.4098753929138184 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.02295291, + "balance_loss_mlp": 1.03761959, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.639382305953213, + "language_loss": 0.75850725, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.7799207, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10397, + "time_per_iteration": 2.437908887863159 + }, + { + "auxiliary_loss_clip": 0.01104633, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.0192287, + "balance_loss_mlp": 1.03570378, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.1037822697926667, + "language_loss": 0.74630761, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.76766837, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10398, + "time_per_iteration": 2.5268969535827637 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.0180074, + "balance_loss_mlp": 1.03535593, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.7918693005970583, + "language_loss": 0.74092543, + "learning_rate": 1.300997001489483e-06, + "loss": 0.7623167, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 10399, + "time_per_iteration": 2.4791572093963623 + }, + { + "auxiliary_loss_clip": 0.01107905, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03819537, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.731383234573371, + "language_loss": 0.74527764, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76667941, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 10400, + "time_per_iteration": 2.473951816558838 + }, + { + "auxiliary_loss_clip": 0.01026565, + "auxiliary_loss_mlp": 0.01000492, + "balance_loss_clip": 0.99915105, + "balance_loss_mlp": 1.00554299, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8444247043206139, + "language_loss": 0.5648914, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58516198, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.20996094, + "step": 10401, + "time_per_iteration": 3.129333019256592 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.0204252, + "balance_loss_mlp": 1.03666401, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.0092602513975977, + "language_loss": 0.82945538, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85085875, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10402, + "time_per_iteration": 2.460231304168701 + }, + { + "auxiliary_loss_clip": 0.01104333, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01907122, + "balance_loss_mlp": 1.03590369, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 1.9961648351421997, + "language_loss": 0.69392562, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71528035, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10403, + "time_per_iteration": 2.512580156326294 + }, + { + "auxiliary_loss_clip": 0.01107476, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01649678, + "balance_loss_mlp": 1.03631687, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.605243006168547, + "language_loss": 0.71813661, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73950982, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 10404, + "time_per_iteration": 2.5337743759155273 + }, + { + "auxiliary_loss_clip": 0.0110666, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.02410626, + "balance_loss_mlp": 1.03739667, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 2.1209903153707637, + "language_loss": 0.69724202, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71867102, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10405, + "time_per_iteration": 2.429565191268921 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.01888895, + "balance_loss_mlp": 1.03722537, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.5758155671533136, + "language_loss": 0.79004002, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81141788, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 10406, + "time_per_iteration": 2.468031167984009 + }, + { + "auxiliary_loss_clip": 0.01107697, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.0206759, + "balance_loss_mlp": 1.03848672, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 2.3254582384945546, + "language_loss": 0.68920648, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71061373, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 10407, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.01103441, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.0164783, + "balance_loss_mlp": 1.03711939, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.6489273629254082, + "language_loss": 0.85259062, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87390488, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10408, + "time_per_iteration": 2.5326271057128906 + }, + { + "auxiliary_loss_clip": 0.01103218, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.02121651, + "balance_loss_mlp": 1.03541374, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.6409440677958231, + "language_loss": 0.79910547, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.82046419, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10409, + "time_per_iteration": 2.442823886871338 + }, + { + "auxiliary_loss_clip": 0.01102769, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.01786542, + "balance_loss_mlp": 1.03510618, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.1270589511309, + "language_loss": 0.69238424, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71370828, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10410, + "time_per_iteration": 2.5218586921691895 + }, + { + "auxiliary_loss_clip": 0.01102703, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.01665211, + "balance_loss_mlp": 1.03720927, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.7979777871745755, + "language_loss": 0.67414671, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69545317, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10411, + "time_per_iteration": 2.4738645553588867 + }, + { + "auxiliary_loss_clip": 0.0110494, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.02416134, + "balance_loss_mlp": 1.03532887, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.6084905019023508, + "language_loss": 0.69372767, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71513689, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10412, + "time_per_iteration": 2.5545077323913574 + }, + { + "auxiliary_loss_clip": 0.01102021, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.02027464, + "balance_loss_mlp": 1.03490543, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.551813331878434, + "language_loss": 0.69730282, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.718638, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10413, + "time_per_iteration": 2.4613993167877197 + }, + { + "auxiliary_loss_clip": 0.01107528, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01831901, + "balance_loss_mlp": 1.03475976, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.3187128472961347, + "language_loss": 0.80297446, + "learning_rate": 1.295526482316796e-06, + "loss": 0.82436854, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 10414, + "time_per_iteration": 2.4762308597564697 + }, + { + "auxiliary_loss_clip": 0.01106139, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.02244806, + "balance_loss_mlp": 1.03826272, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.6885486405610761, + "language_loss": 0.74565107, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76705372, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10415, + "time_per_iteration": 2.469125270843506 + }, + { + "auxiliary_loss_clip": 0.01103919, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.01612878, + "balance_loss_mlp": 1.03637624, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.6561914595998568, + "language_loss": 0.74751735, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.7688328, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 10416, + "time_per_iteration": 2.5993549823760986 + }, + { + "auxiliary_loss_clip": 0.0110123, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01813984, + "balance_loss_mlp": 1.03624392, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.5562931530598996, + "language_loss": 0.84521848, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86652553, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 10417, + "time_per_iteration": 2.555704355239868 + }, + { + "auxiliary_loss_clip": 0.01105248, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.01628423, + "balance_loss_mlp": 1.03636765, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.453683898924351, + "language_loss": 0.56929493, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59063208, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10418, + "time_per_iteration": 2.443615198135376 + }, + { + "auxiliary_loss_clip": 0.01108601, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.01904798, + "balance_loss_mlp": 1.03636181, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.7897891411455675, + "language_loss": 0.84952247, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.8709265, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 10419, + "time_per_iteration": 2.432539224624634 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.02055252, + "balance_loss_mlp": 1.03868783, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.768912237267882, + "language_loss": 0.64837831, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66978431, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10420, + "time_per_iteration": 2.5192198753356934 + }, + { + "auxiliary_loss_clip": 0.01105751, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.01782894, + "balance_loss_mlp": 1.03548038, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.9559815455742504, + "language_loss": 0.86093545, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88229704, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10421, + "time_per_iteration": 2.454472303390503 + }, + { + "auxiliary_loss_clip": 0.01106789, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.02009797, + "balance_loss_mlp": 1.03760505, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.5701422758472687, + "language_loss": 0.79219615, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81358123, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10422, + "time_per_iteration": 2.4565389156341553 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01024803, + "balance_loss_clip": 1.01235723, + "balance_loss_mlp": 1.03458548, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.6493252664986784, + "language_loss": 0.74391955, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76519012, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 10423, + "time_per_iteration": 2.4744317531585693 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.0154779, + "balance_loss_mlp": 1.03445518, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.8573410403622042, + "language_loss": 0.77685475, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79814792, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10424, + "time_per_iteration": 2.459156036376953 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.01498699, + "balance_loss_mlp": 1.03587162, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.722847581119462, + "language_loss": 0.6881507, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.70946336, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.671875, + "step": 10425, + "time_per_iteration": 2.4837486743927 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.0183543, + "balance_loss_mlp": 1.0359807, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.5803855338986545, + "language_loss": 0.7465167, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76780665, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 10426, + "time_per_iteration": 2.514317274093628 + }, + { + "auxiliary_loss_clip": 0.01105959, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.01989794, + "balance_loss_mlp": 1.03667617, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.372305042134179, + "language_loss": 0.80499035, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82637042, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10427, + "time_per_iteration": 3.906360149383545 + }, + { + "auxiliary_loss_clip": 0.01106724, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.01931798, + "balance_loss_mlp": 1.03726578, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 1.7157059050483638, + "language_loss": 0.68742979, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70881307, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 10428, + "time_per_iteration": 2.4357380867004395 + }, + { + "auxiliary_loss_clip": 0.01105018, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.02134943, + "balance_loss_mlp": 1.03779614, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.9089213874225204, + "language_loss": 0.71640742, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73778033, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 10429, + "time_per_iteration": 3.8758704662323 + }, + { + "auxiliary_loss_clip": 0.01108797, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02144098, + "balance_loss_mlp": 1.03881693, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.5765983769123613, + "language_loss": 0.79904956, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82047486, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10430, + "time_per_iteration": 5.480989217758179 + }, + { + "auxiliary_loss_clip": 0.01027432, + "auxiliary_loss_mlp": 0.01010431, + "balance_loss_clip": 1.00904214, + "balance_loss_mlp": 1.00618088, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7689165216290166, + "language_loss": 0.59162331, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.6120019, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.21289062, + "step": 10431, + "time_per_iteration": 3.1698784828186035 + }, + { + "auxiliary_loss_clip": 0.0102736, + "auxiliary_loss_mlp": 0.01007095, + "balance_loss_clip": 1.00575376, + "balance_loss_mlp": 1.00630832, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8815125854573025, + "language_loss": 0.63825411, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.6585986, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.2109375, + "step": 10432, + "time_per_iteration": 3.1316046714782715 + }, + { + "auxiliary_loss_clip": 0.01101622, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.02075207, + "balance_loss_mlp": 1.03523922, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.6684665767860385, + "language_loss": 0.6480633, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.66939294, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 10433, + "time_per_iteration": 2.530367851257324 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02095246, + "balance_loss_mlp": 1.03838599, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 1.999112171650009, + "language_loss": 0.61930764, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64073694, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 10434, + "time_per_iteration": 2.4613072872161865 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01025848, + "balance_loss_clip": 1.01420164, + "balance_loss_mlp": 1.03523064, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.7052209762713233, + "language_loss": 0.84669697, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86799175, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 10435, + "time_per_iteration": 2.516956090927124 + }, + { + "auxiliary_loss_clip": 0.01027112, + "auxiliary_loss_mlp": 0.01006345, + "balance_loss_clip": 1.00503409, + "balance_loss_mlp": 1.00594997, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7299742143913254, + "language_loss": 0.61572838, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63606298, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.2109375, + "step": 10436, + "time_per_iteration": 3.1023128032684326 + }, + { + "auxiliary_loss_clip": 0.01107216, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02201223, + "balance_loss_mlp": 1.03899169, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.5188433692104768, + "language_loss": 0.77361041, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79503125, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 10437, + "time_per_iteration": 2.5252764225006104 + }, + { + "auxiliary_loss_clip": 0.01026138, + "auxiliary_loss_mlp": 0.01003989, + "balance_loss_clip": 1.0027318, + "balance_loss_mlp": 1.00493383, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7219307652334395, + "language_loss": 0.5436241, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56392533, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.21289062, + "step": 10438, + "time_per_iteration": 3.043013572692871 + }, + { + "auxiliary_loss_clip": 0.01102529, + "auxiliary_loss_mlp": 0.01041098, + "balance_loss_clip": 1.02895069, + "balance_loss_mlp": 1.03441381, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 2.0343389960160163, + "language_loss": 0.84072959, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86216581, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10439, + "time_per_iteration": 2.5371646881103516 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.0274682, + "balance_loss_mlp": 1.03541088, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.0182472461440057, + "language_loss": 0.8041876, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.8256427, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10440, + "time_per_iteration": 2.4601192474365234 + }, + { + "auxiliary_loss_clip": 0.01099453, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.01833498, + "balance_loss_mlp": 1.03509974, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.4716906489338055, + "language_loss": 0.74504089, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 10441, + "time_per_iteration": 2.5412392616271973 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0175643, + "balance_loss_mlp": 1.03450918, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.707965956451768, + "language_loss": 0.72134054, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74265343, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 10442, + "time_per_iteration": 2.537446975708008 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01785576, + "balance_loss_mlp": 1.03555417, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.5665674956365474, + "language_loss": 0.71364504, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73497498, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10443, + "time_per_iteration": 2.49980092048645 + }, + { + "auxiliary_loss_clip": 0.01103341, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.01374125, + "balance_loss_mlp": 1.03619695, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.9169292083366938, + "language_loss": 0.72973317, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75102174, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 10444, + "time_per_iteration": 2.474400520324707 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.0155499, + "balance_loss_mlp": 1.03607392, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.8659138317245392, + "language_loss": 0.72426593, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74556732, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 10445, + "time_per_iteration": 2.4486618041992188 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.03417051, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.6334831062955149, + "language_loss": 0.69040692, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71172386, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10446, + "time_per_iteration": 2.4619648456573486 + }, + { + "auxiliary_loss_clip": 0.01108513, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.01822352, + "balance_loss_mlp": 1.03651023, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 1.946229669067864, + "language_loss": 0.74025476, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.76164913, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 10447, + "time_per_iteration": 2.449399948120117 + }, + { + "auxiliary_loss_clip": 0.01025063, + "auxiliary_loss_mlp": 0.00998572, + "balance_loss_clip": 0.99728459, + "balance_loss_mlp": 1.00378299, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6772794879542157, + "language_loss": 0.52363139, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54386771, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21289062, + "step": 10448, + "time_per_iteration": 2.9426791667938232 + }, + { + "auxiliary_loss_clip": 0.01106244, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.02860117, + "balance_loss_mlp": 1.03656423, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.1057349931562275, + "language_loss": 0.91307616, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93455029, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10449, + "time_per_iteration": 2.4679763317108154 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.03486931, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.5272379639764508, + "language_loss": 0.60454214, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62585294, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10450, + "time_per_iteration": 2.665226459503174 + }, + { + "auxiliary_loss_clip": 0.01101695, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.01926398, + "balance_loss_mlp": 1.03620005, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.5565304478998412, + "language_loss": 0.76683152, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.78815556, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 10451, + "time_per_iteration": 2.4581120014190674 + }, + { + "auxiliary_loss_clip": 0.01105178, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.0352962, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.577387753245048, + "language_loss": 0.77243423, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79380023, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10452, + "time_per_iteration": 2.4569571018218994 + }, + { + "auxiliary_loss_clip": 0.01105275, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.01977849, + "balance_loss_mlp": 1.03737903, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.9445051684642027, + "language_loss": 0.72382963, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74520093, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 10453, + "time_per_iteration": 2.4979004859924316 + }, + { + "auxiliary_loss_clip": 0.01102123, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.01643896, + "balance_loss_mlp": 1.03324366, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.6809278534400005, + "language_loss": 0.80429286, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82560074, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10454, + "time_per_iteration": 2.409714937210083 + }, + { + "auxiliary_loss_clip": 0.01102175, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.01829922, + "balance_loss_mlp": 1.03586721, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.824800115863982, + "language_loss": 0.82303673, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84435654, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10455, + "time_per_iteration": 2.4712390899658203 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.01754522, + "balance_loss_mlp": 1.03569484, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 3.44693783643537, + "language_loss": 0.81578875, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83711159, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10456, + "time_per_iteration": 2.48745059967041 + }, + { + "auxiliary_loss_clip": 0.01107634, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.01755691, + "balance_loss_mlp": 1.03793502, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5367705166393795, + "language_loss": 0.72127652, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74265301, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10457, + "time_per_iteration": 2.451204776763916 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01816964, + "balance_loss_mlp": 1.03497529, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 2.138119380312756, + "language_loss": 0.79647571, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81785357, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 10458, + "time_per_iteration": 2.4522323608398438 + }, + { + "auxiliary_loss_clip": 0.01107535, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01934147, + "balance_loss_mlp": 1.03738856, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.5072940054720605, + "language_loss": 0.60961497, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63100201, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10459, + "time_per_iteration": 2.5262553691864014 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.01796818, + "balance_loss_mlp": 1.03684652, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.7541268062536184, + "language_loss": 0.7885046, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80985153, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 10460, + "time_per_iteration": 2.4601290225982666 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.01438367, + "balance_loss_mlp": 1.03575253, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.7189888907813877, + "language_loss": 0.73800498, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.75929219, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10461, + "time_per_iteration": 2.4365780353546143 + }, + { + "auxiliary_loss_clip": 0.01100652, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.02188754, + "balance_loss_mlp": 1.03492045, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.7518850371883825, + "language_loss": 0.70340359, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72474349, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 10462, + "time_per_iteration": 2.4497246742248535 + }, + { + "auxiliary_loss_clip": 0.01098069, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01851106, + "balance_loss_mlp": 1.03555751, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.8426896444846728, + "language_loss": 0.71998221, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74125123, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 10463, + "time_per_iteration": 2.519118070602417 + }, + { + "auxiliary_loss_clip": 0.01104354, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.02407098, + "balance_loss_mlp": 1.03894711, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 2.0251276075815507, + "language_loss": 0.72917801, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.75057971, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10464, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01101197, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.01696074, + "balance_loss_mlp": 1.03590441, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.8688314099913752, + "language_loss": 0.69353777, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71483117, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 10465, + "time_per_iteration": 2.420706033706665 + }, + { + "auxiliary_loss_clip": 0.01025681, + "auxiliary_loss_mlp": 0.01003212, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00449264, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6783887533402703, + "language_loss": 0.59743875, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.6177277, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21191406, + "step": 10466, + "time_per_iteration": 3.1529486179351807 + }, + { + "auxiliary_loss_clip": 0.01098875, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.01821828, + "balance_loss_mlp": 1.03199136, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 1.895578491152679, + "language_loss": 0.64383173, + "learning_rate": 1.276245767820154e-06, + "loss": 0.66511035, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 10467, + "time_per_iteration": 2.596909761428833 + }, + { + "auxiliary_loss_clip": 0.01025676, + "auxiliary_loss_mlp": 0.00999758, + "balance_loss_clip": 0.9984706, + "balance_loss_mlp": 1.00462031, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7946860251086647, + "language_loss": 0.569076, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58933038, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.2109375, + "step": 10468, + "time_per_iteration": 4.298036336898804 + }, + { + "auxiliary_loss_clip": 0.01025761, + "auxiliary_loss_mlp": 0.01000379, + "balance_loss_clip": 0.99905533, + "balance_loss_mlp": 1.00460362, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7346247861969195, + "language_loss": 0.580616, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.6008774, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2109375, + "step": 10469, + "time_per_iteration": 3.013350009918213 + }, + { + "auxiliary_loss_clip": 0.01026242, + "auxiliary_loss_mlp": 0.0100094, + "balance_loss_clip": 0.99966449, + "balance_loss_mlp": 1.00510228, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6786572594163077, + "language_loss": 0.5214479, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54171979, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21191406, + "step": 10470, + "time_per_iteration": 3.1025776863098145 + }, + { + "auxiliary_loss_clip": 0.01101792, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01853275, + "balance_loss_mlp": 1.03531003, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.7821374773378207, + "language_loss": 0.7444669, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76578748, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10471, + "time_per_iteration": 5.750757455825806 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.0161432, + "balance_loss_mlp": 1.03594935, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.928248423372208, + "language_loss": 0.62892604, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65023524, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 10472, + "time_per_iteration": 2.4507625102996826 + }, + { + "auxiliary_loss_clip": 0.01108224, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.02030277, + "balance_loss_mlp": 1.03887987, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.6696696732656569, + "language_loss": 0.69374871, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71515167, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 10473, + "time_per_iteration": 3.954071283340454 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.01492906, + "balance_loss_mlp": 1.03493738, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.5555016558834316, + "language_loss": 0.74785316, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76912427, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10474, + "time_per_iteration": 2.4985709190368652 + }, + { + "auxiliary_loss_clip": 0.0110251, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01681423, + "balance_loss_mlp": 1.03494573, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.565073448719141, + "language_loss": 0.66372955, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68503714, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 10475, + "time_per_iteration": 2.511357307434082 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.03441048, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 2.080975026928719, + "language_loss": 0.9029789, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92426246, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 10476, + "time_per_iteration": 2.4218876361846924 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.02379751, + "balance_loss_mlp": 1.03476787, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.6697359788083987, + "language_loss": 0.75050914, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.771873, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10477, + "time_per_iteration": 2.455409049987793 + }, + { + "auxiliary_loss_clip": 0.01101367, + "auxiliary_loss_mlp": 0.01026543, + "balance_loss_clip": 1.01468766, + "balance_loss_mlp": 1.0337708, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.9554844868820769, + "language_loss": 0.70427382, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72555292, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10478, + "time_per_iteration": 2.448185443878174 + }, + { + "auxiliary_loss_clip": 0.01107518, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03713453, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.8380864968685287, + "language_loss": 0.67054832, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69193918, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10479, + "time_per_iteration": 2.4200356006622314 + }, + { + "auxiliary_loss_clip": 0.01103494, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.0211798, + "balance_loss_mlp": 1.03659678, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 3.6551699512461067, + "language_loss": 0.73471272, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75608122, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 10480, + "time_per_iteration": 2.4555039405822754 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03487301, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 2.336956908643113, + "language_loss": 0.78874803, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81010389, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 10481, + "time_per_iteration": 2.4346415996551514 + }, + { + "auxiliary_loss_clip": 0.01026096, + "auxiliary_loss_mlp": 0.01005078, + "balance_loss_clip": 1.00391531, + "balance_loss_mlp": 1.0049262, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.9177955201810194, + "language_loss": 0.61818945, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63850117, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21191406, + "step": 10482, + "time_per_iteration": 2.812809705734253 + }, + { + "auxiliary_loss_clip": 0.01108769, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.01766491, + "balance_loss_mlp": 1.03617549, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.913772314034849, + "language_loss": 0.83037972, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85177374, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10483, + "time_per_iteration": 2.401224374771118 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.02013469, + "balance_loss_mlp": 1.03428078, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.5146236246766749, + "language_loss": 0.72939026, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75068009, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 10484, + "time_per_iteration": 2.5125913619995117 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.01511419, + "balance_loss_mlp": 1.03390932, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.8710303376286184, + "language_loss": 0.74698818, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.7682761, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10485, + "time_per_iteration": 2.4874563217163086 + }, + { + "auxiliary_loss_clip": 0.01107856, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.01911783, + "balance_loss_mlp": 1.03676295, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.9819800910053105, + "language_loss": 0.81547624, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83686674, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 10486, + "time_per_iteration": 2.4926888942718506 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.0249896, + "balance_loss_mlp": 1.03641152, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 2.1821850164901675, + "language_loss": 0.63638449, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65777874, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10487, + "time_per_iteration": 2.408770799636841 + }, + { + "auxiliary_loss_clip": 0.01103897, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02434158, + "balance_loss_mlp": 1.03714716, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.4517629521514586, + "language_loss": 0.67256761, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69396502, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 10488, + "time_per_iteration": 2.484377861022949 + }, + { + "auxiliary_loss_clip": 0.01105074, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01726294, + "balance_loss_mlp": 1.03574753, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.9513019958263491, + "language_loss": 0.67263639, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69397372, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.69140625, + "step": 10489, + "time_per_iteration": 2.4636588096618652 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.0174365, + "balance_loss_mlp": 1.03723645, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.966397981441809, + "language_loss": 0.69455999, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71595961, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 10490, + "time_per_iteration": 2.4483461380004883 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03623903, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 2.505536440046342, + "language_loss": 0.78477776, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80616874, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10491, + "time_per_iteration": 2.4928994178771973 + }, + { + "auxiliary_loss_clip": 0.01104065, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02038789, + "balance_loss_mlp": 1.03679323, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.9616523750971206, + "language_loss": 0.55806887, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.57942659, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10492, + "time_per_iteration": 2.5040977001190186 + }, + { + "auxiliary_loss_clip": 0.01105591, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.023283, + "balance_loss_mlp": 1.03620148, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 2.2691030779407693, + "language_loss": 0.63968873, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66110241, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10493, + "time_per_iteration": 2.501648187637329 + }, + { + "auxiliary_loss_clip": 0.01103602, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.03536439, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.6404154470274028, + "language_loss": 0.82711017, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84844351, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10494, + "time_per_iteration": 2.488478183746338 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.01908565, + "balance_loss_mlp": 1.03702521, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.6064300635789628, + "language_loss": 0.792678, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81405473, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 10495, + "time_per_iteration": 2.607936143875122 + }, + { + "auxiliary_loss_clip": 0.01104478, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.01999319, + "balance_loss_mlp": 1.0356319, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.9868473037750025, + "language_loss": 0.69977289, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72114241, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10496, + "time_per_iteration": 2.4172658920288086 + }, + { + "auxiliary_loss_clip": 0.01106703, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.03729558, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.5454831155818307, + "language_loss": 0.80091369, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82230574, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10497, + "time_per_iteration": 2.417558193206787 + }, + { + "auxiliary_loss_clip": 0.01101019, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01958823, + "balance_loss_mlp": 1.03501368, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.8690299301257927, + "language_loss": 0.74428982, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76560622, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 10498, + "time_per_iteration": 2.452404737472534 + }, + { + "auxiliary_loss_clip": 0.01104382, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.01755679, + "balance_loss_mlp": 1.03578484, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 1.8689991492998164, + "language_loss": 0.69558024, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71691775, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10499, + "time_per_iteration": 2.4273722171783447 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01036719, + "balance_loss_clip": 1.02513218, + "balance_loss_mlp": 1.03703976, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 2.6400614385639294, + "language_loss": 0.70014846, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72153533, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 10500, + "time_per_iteration": 2.4538466930389404 + }, + { + "auxiliary_loss_clip": 0.01106013, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02079344, + "balance_loss_mlp": 1.03783047, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 3.0415450485464937, + "language_loss": 0.74062467, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76200593, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10501, + "time_per_iteration": 2.4303436279296875 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01901984, + "balance_loss_mlp": 1.03665447, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.6794546939174708, + "language_loss": 0.75353241, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77488828, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10502, + "time_per_iteration": 2.4563441276550293 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.03212154, + "balance_loss_mlp": 1.03856277, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 2.067886001099209, + "language_loss": 0.85457253, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87611616, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 10503, + "time_per_iteration": 2.454007148742676 + }, + { + "auxiliary_loss_clip": 0.01104787, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.01844788, + "balance_loss_mlp": 1.0356003, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 1.7756005126280807, + "language_loss": 0.86549926, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88685179, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 10504, + "time_per_iteration": 2.452439546585083 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.02137482, + "balance_loss_mlp": 1.03827763, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.5631411561519288, + "language_loss": 0.76411223, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78556228, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 10505, + "time_per_iteration": 2.4167821407318115 + }, + { + "auxiliary_loss_clip": 0.01107106, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.01840425, + "balance_loss_mlp": 1.03718579, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.80507675782724, + "language_loss": 0.81566548, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.83704925, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10506, + "time_per_iteration": 2.475015163421631 + }, + { + "auxiliary_loss_clip": 0.01108071, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.0174973, + "balance_loss_mlp": 1.03848529, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.7792905066974667, + "language_loss": 0.74235427, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76372921, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10507, + "time_per_iteration": 2.4495646953582764 + }, + { + "auxiliary_loss_clip": 0.01109877, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02270663, + "balance_loss_mlp": 1.03861022, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.7094804545962832, + "language_loss": 0.6781255, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69957411, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10508, + "time_per_iteration": 2.4817588329315186 + }, + { + "auxiliary_loss_clip": 0.01105487, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.01696706, + "balance_loss_mlp": 1.03691339, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.6822434485138316, + "language_loss": 0.70602268, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72736001, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 10509, + "time_per_iteration": 2.511807680130005 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.01614881, + "balance_loss_mlp": 1.03634882, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.6779333049559604, + "language_loss": 0.79419941, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81551743, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10510, + "time_per_iteration": 3.890570640563965 + }, + { + "auxiliary_loss_clip": 0.01107002, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.01917839, + "balance_loss_mlp": 1.03689122, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.4507580648571856, + "language_loss": 0.70762742, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72901082, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 10511, + "time_per_iteration": 2.502631902694702 + }, + { + "auxiliary_loss_clip": 0.01102983, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.02093077, + "balance_loss_mlp": 1.03624094, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.7129276808255165, + "language_loss": 0.80193913, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82328945, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 10512, + "time_per_iteration": 2.4500255584716797 + }, + { + "auxiliary_loss_clip": 0.01108015, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01859283, + "balance_loss_mlp": 1.03887498, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.9936938479118853, + "language_loss": 0.70610952, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72750223, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 10513, + "time_per_iteration": 5.2415876388549805 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.01782441, + "balance_loss_mlp": 1.03923917, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.9330841856618928, + "language_loss": 0.66179729, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68320632, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 10514, + "time_per_iteration": 3.9086010456085205 + }, + { + "auxiliary_loss_clip": 0.01102729, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.01901603, + "balance_loss_mlp": 1.0355525, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.657544375063904, + "language_loss": 0.74582148, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76715326, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10515, + "time_per_iteration": 2.4754388332366943 + }, + { + "auxiliary_loss_clip": 0.01103002, + "auxiliary_loss_mlp": 0.01026215, + "balance_loss_clip": 1.01485467, + "balance_loss_mlp": 1.03710318, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.8087331085143223, + "language_loss": 0.89853811, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91983026, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 10516, + "time_per_iteration": 2.431255578994751 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.01684928, + "balance_loss_mlp": 1.04077578, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.8110008690321133, + "language_loss": 0.81904936, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84049344, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 10517, + "time_per_iteration": 2.418457508087158 + }, + { + "auxiliary_loss_clip": 0.01105413, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02295017, + "balance_loss_mlp": 1.03746212, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.9810559885321721, + "language_loss": 0.77525067, + "learning_rate": 1.257765386189541e-06, + "loss": 0.7966513, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10518, + "time_per_iteration": 2.480358839035034 + }, + { + "auxiliary_loss_clip": 0.01102761, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.0210377, + "balance_loss_mlp": 1.03653479, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.4836154875686243, + "language_loss": 0.85232532, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87367767, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10519, + "time_per_iteration": 2.539891242980957 + }, + { + "auxiliary_loss_clip": 0.01101558, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114785, + "balance_loss_mlp": 1.03623748, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.6381683069265482, + "language_loss": 0.71834314, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.73968256, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10520, + "time_per_iteration": 2.4911139011383057 + }, + { + "auxiliary_loss_clip": 0.0110337, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01956213, + "balance_loss_mlp": 1.03599596, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.7138425612253112, + "language_loss": 0.7110256, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73236692, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10521, + "time_per_iteration": 2.42466402053833 + }, + { + "auxiliary_loss_clip": 0.01107506, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.01895845, + "balance_loss_mlp": 1.03792214, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.6701833516110784, + "language_loss": 0.71829087, + "learning_rate": 1.256319016853377e-06, + "loss": 0.7396822, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10522, + "time_per_iteration": 2.456470012664795 + }, + { + "auxiliary_loss_clip": 0.01105444, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01849365, + "balance_loss_mlp": 1.03691065, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.752428604035476, + "language_loss": 0.81730425, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83866215, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 10523, + "time_per_iteration": 2.4390153884887695 + }, + { + "auxiliary_loss_clip": 0.01104755, + "auxiliary_loss_mlp": 0.01025919, + "balance_loss_clip": 1.01428986, + "balance_loss_mlp": 1.03734088, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.695654876532073, + "language_loss": 0.73930323, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76060998, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10524, + "time_per_iteration": 2.4376304149627686 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.02231896, + "balance_loss_mlp": 1.03718793, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 4.405789883496385, + "language_loss": 0.84463608, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86610419, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 10525, + "time_per_iteration": 2.4973292350769043 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.01023105, + "balance_loss_clip": 1.01178622, + "balance_loss_mlp": 1.03544807, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 3.1585625796827212, + "language_loss": 0.66817802, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68943405, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10526, + "time_per_iteration": 2.431757688522339 + }, + { + "auxiliary_loss_clip": 0.01111651, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.02308118, + "balance_loss_mlp": 1.03971434, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 2.135799005467542, + "language_loss": 0.7367599, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75824016, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 10527, + "time_per_iteration": 2.473468065261841 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01803195, + "balance_loss_mlp": 1.03822732, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 1.98632215188849, + "language_loss": 0.71867841, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74001735, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 10528, + "time_per_iteration": 2.428516387939453 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.01593244, + "balance_loss_mlp": 1.03575611, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.649115399957431, + "language_loss": 0.66042399, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68174052, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 10529, + "time_per_iteration": 2.4110963344573975 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01853514, + "balance_loss_mlp": 1.03828883, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.3567719196586134, + "language_loss": 0.75553149, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.7769407, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10530, + "time_per_iteration": 2.47843074798584 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.01745164, + "balance_loss_mlp": 1.04016328, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 2.740073625004777, + "language_loss": 0.73872888, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.76011956, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10531, + "time_per_iteration": 2.4678969383239746 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01666689, + "balance_loss_mlp": 1.03636086, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.9880072875831147, + "language_loss": 0.79408121, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81539547, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 10532, + "time_per_iteration": 2.481036901473999 + }, + { + "auxiliary_loss_clip": 0.01102051, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02170849, + "balance_loss_mlp": 1.03580236, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 2.7000401748576817, + "language_loss": 0.74374038, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76508451, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10533, + "time_per_iteration": 2.4607644081115723 + }, + { + "auxiliary_loss_clip": 0.01110909, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.02163792, + "balance_loss_mlp": 1.03844595, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.750255656428334, + "language_loss": 0.76894259, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79039878, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 10534, + "time_per_iteration": 2.4279823303222656 + }, + { + "auxiliary_loss_clip": 0.01106846, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.02312016, + "balance_loss_mlp": 1.03899598, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.5411023230298349, + "language_loss": 0.85583681, + "learning_rate": 1.251621437204777e-06, + "loss": 0.8772521, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 10535, + "time_per_iteration": 2.4824087619781494 + }, + { + "auxiliary_loss_clip": 0.01106839, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01961398, + "balance_loss_mlp": 1.03782022, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 2.0534992057606285, + "language_loss": 0.76360321, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78498983, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10536, + "time_per_iteration": 2.530451774597168 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.01741064, + "balance_loss_mlp": 1.03990674, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.9627877064999752, + "language_loss": 0.60015184, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62151325, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10537, + "time_per_iteration": 2.5151615142822266 + }, + { + "auxiliary_loss_clip": 0.01026622, + "auxiliary_loss_mlp": 0.01001054, + "balance_loss_clip": 0.999695, + "balance_loss_mlp": 1.00554442, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7708037183825521, + "language_loss": 0.52472723, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54500401, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.2109375, + "step": 10538, + "time_per_iteration": 3.165985584259033 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.0205456, + "balance_loss_mlp": 1.0376327, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.8519204835949576, + "language_loss": 0.83039713, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85181737, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10539, + "time_per_iteration": 2.4390335083007812 + }, + { + "auxiliary_loss_clip": 0.01107427, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.01580071, + "balance_loss_mlp": 1.03738523, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.637138612539208, + "language_loss": 0.86837506, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88973361, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10540, + "time_per_iteration": 2.4831221103668213 + }, + { + "auxiliary_loss_clip": 0.01100728, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.01740217, + "balance_loss_mlp": 1.03550363, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.5901447763785947, + "language_loss": 0.7268725, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74815792, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65234375, + "step": 10541, + "time_per_iteration": 2.479461908340454 + }, + { + "auxiliary_loss_clip": 0.01109283, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01819396, + "balance_loss_mlp": 1.03717303, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.6143323692331166, + "language_loss": 0.84712064, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86852765, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 10542, + "time_per_iteration": 2.54823899269104 + }, + { + "auxiliary_loss_clip": 0.01105497, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.01760697, + "balance_loss_mlp": 1.03709495, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.6553786281241991, + "language_loss": 0.77977955, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80114251, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.68359375, + "step": 10543, + "time_per_iteration": 2.3880414962768555 + }, + { + "auxiliary_loss_clip": 0.0110064, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.02324414, + "balance_loss_mlp": 1.03599632, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.6753324610621851, + "language_loss": 0.73382592, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75517762, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 10544, + "time_per_iteration": 2.4576821327209473 + }, + { + "auxiliary_loss_clip": 0.01108095, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.02458596, + "balance_loss_mlp": 1.0366528, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 2.0297826320587844, + "language_loss": 0.68563735, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70708686, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 10545, + "time_per_iteration": 2.4281883239746094 + }, + { + "auxiliary_loss_clip": 0.01102093, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01794934, + "balance_loss_mlp": 1.03507733, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.000384025401953, + "language_loss": 0.71141988, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73274392, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 10546, + "time_per_iteration": 2.4097115993499756 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.02085662, + "balance_loss_mlp": 1.03665507, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.3382755401261122, + "language_loss": 0.77992189, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80125231, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 10547, + "time_per_iteration": 2.4647274017333984 + }, + { + "auxiliary_loss_clip": 0.01107664, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.02003956, + "balance_loss_mlp": 1.03658104, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7405007308500737, + "language_loss": 0.63246721, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.6538651, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 10548, + "time_per_iteration": 2.4153497219085693 + }, + { + "auxiliary_loss_clip": 0.01103941, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03657913, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.5666269720045418, + "language_loss": 0.61767489, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.63898623, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 10549, + "time_per_iteration": 2.4682185649871826 + }, + { + "auxiliary_loss_clip": 0.01102967, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01777053, + "balance_loss_mlp": 1.03553009, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.7174833177104423, + "language_loss": 0.73910511, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.76041675, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.671875, + "step": 10550, + "time_per_iteration": 2.484928607940674 + }, + { + "auxiliary_loss_clip": 0.01026139, + "auxiliary_loss_mlp": 0.00996982, + "balance_loss_clip": 0.99562275, + "balance_loss_mlp": 1.00515223, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6918927993882659, + "language_loss": 0.57716167, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59739286, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.20996094, + "step": 10551, + "time_per_iteration": 3.0650179386138916 + }, + { + "auxiliary_loss_clip": 0.01103158, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.01508236, + "balance_loss_mlp": 1.03589559, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.7335763595284734, + "language_loss": 0.67098165, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69227403, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10552, + "time_per_iteration": 3.8182289600372314 + }, + { + "auxiliary_loss_clip": 0.01105164, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.01642919, + "balance_loss_mlp": 1.03475296, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.910004275661171, + "language_loss": 0.8218025, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84313941, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10553, + "time_per_iteration": 2.449106216430664 + }, + { + "auxiliary_loss_clip": 0.0110533, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02142692, + "balance_loss_mlp": 1.03581154, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 1.776645744912539, + "language_loss": 0.5519408, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.5733304, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 10554, + "time_per_iteration": 4.085347652435303 + }, + { + "auxiliary_loss_clip": 0.01105981, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.01777518, + "balance_loss_mlp": 1.03750849, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1.7092991458226663, + "language_loss": 0.70511019, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72646892, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10555, + "time_per_iteration": 3.8290207386016846 + }, + { + "auxiliary_loss_clip": 0.01026207, + "auxiliary_loss_mlp": 0.01000287, + "balance_loss_clip": 0.99898165, + "balance_loss_mlp": 1.00531995, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.8961338606309752, + "language_loss": 0.55477089, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57503581, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.20898438, + "step": 10556, + "time_per_iteration": 4.450624227523804 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.03497851, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.9064112571580962, + "language_loss": 0.68177021, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70314467, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 10557, + "time_per_iteration": 2.486895799636841 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.01968968, + "balance_loss_mlp": 1.03706682, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.9232562930576766, + "language_loss": 0.70448172, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72583079, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10558, + "time_per_iteration": 2.40922212600708 + }, + { + "auxiliary_loss_clip": 0.0110235, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.01924789, + "balance_loss_mlp": 1.03492951, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.4566517765841722, + "language_loss": 0.78202355, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80335712, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10559, + "time_per_iteration": 2.44706130027771 + }, + { + "auxiliary_loss_clip": 0.01108267, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.02512479, + "balance_loss_mlp": 1.03806639, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 2.1761339392195467, + "language_loss": 0.68320858, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70466453, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 10560, + "time_per_iteration": 2.4409596920013428 + }, + { + "auxiliary_loss_clip": 0.01104015, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.02302957, + "balance_loss_mlp": 1.03592563, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.67836467156634, + "language_loss": 0.7699995, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.7913872, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10561, + "time_per_iteration": 2.5039145946502686 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.01994574, + "balance_loss_mlp": 1.0352478, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 4.009586317175133, + "language_loss": 0.72008455, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74144948, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 10562, + "time_per_iteration": 2.472137212753296 + }, + { + "auxiliary_loss_clip": 0.01107214, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.01749265, + "balance_loss_mlp": 1.03718257, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.5105421382487267, + "language_loss": 0.80683196, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.82820606, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10563, + "time_per_iteration": 2.4413557052612305 + }, + { + "auxiliary_loss_clip": 0.01108821, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.02480066, + "balance_loss_mlp": 1.03883505, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.110536240738381, + "language_loss": 0.80818796, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.82964349, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 10564, + "time_per_iteration": 2.4266111850738525 + }, + { + "auxiliary_loss_clip": 0.01107128, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.02003133, + "balance_loss_mlp": 1.03857136, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.6172553063068438, + "language_loss": 0.72285914, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74424613, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10565, + "time_per_iteration": 2.534834623336792 + }, + { + "auxiliary_loss_clip": 0.01106685, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01762271, + "balance_loss_mlp": 1.03696799, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.0506297866150467, + "language_loss": 0.69144678, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71281761, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 10566, + "time_per_iteration": 2.411491632461548 + }, + { + "auxiliary_loss_clip": 0.01102305, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03648448, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.5548948412040506, + "language_loss": 0.69706547, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71838397, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 10567, + "time_per_iteration": 2.48917293548584 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.02129579, + "balance_loss_mlp": 1.03807187, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 2.2646303551803753, + "language_loss": 0.84620178, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86756414, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10568, + "time_per_iteration": 2.4403724670410156 + }, + { + "auxiliary_loss_clip": 0.0110714, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02603149, + "balance_loss_mlp": 1.03773642, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.746273347856982, + "language_loss": 0.83601934, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.8574751, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10569, + "time_per_iteration": 2.5299484729766846 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.01576304, + "balance_loss_mlp": 1.03676128, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.7093099643488843, + "language_loss": 0.69269961, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71400905, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 10570, + "time_per_iteration": 2.4609997272491455 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03430879, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.6697776111718718, + "language_loss": 0.65798032, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67937338, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10571, + "time_per_iteration": 2.5261099338531494 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.01701999, + "balance_loss_mlp": 1.03919911, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.7838042943408632, + "language_loss": 0.71219468, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73359203, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 10572, + "time_per_iteration": 2.4292333126068115 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.02092671, + "balance_loss_mlp": 1.03459537, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 2.8044296408111657, + "language_loss": 0.81269503, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83402801, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 10573, + "time_per_iteration": 2.5012412071228027 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.01897848, + "balance_loss_mlp": 1.0366838, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.5014218063812952, + "language_loss": 0.68932259, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71069181, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69921875, + "step": 10574, + "time_per_iteration": 2.668290853500366 + }, + { + "auxiliary_loss_clip": 0.01104073, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.01580226, + "balance_loss_mlp": 1.03717065, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.2372840416556476, + "language_loss": 0.86855853, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88987547, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10575, + "time_per_iteration": 2.4198617935180664 + }, + { + "auxiliary_loss_clip": 0.01104492, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.02124405, + "balance_loss_mlp": 1.03752255, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.7217722573852687, + "language_loss": 0.72000861, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74137974, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66796875, + "step": 10576, + "time_per_iteration": 2.4883639812469482 + }, + { + "auxiliary_loss_clip": 0.01106159, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.01569581, + "balance_loss_mlp": 1.03626978, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.7446831979165325, + "language_loss": 0.69537437, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71671677, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10577, + "time_per_iteration": 2.4888103008270264 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.01802576, + "balance_loss_mlp": 1.03562689, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.9869814787183224, + "language_loss": 0.72090602, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74223644, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10578, + "time_per_iteration": 2.5746970176696777 + }, + { + "auxiliary_loss_clip": 0.01025564, + "auxiliary_loss_mlp": 0.0100215, + "balance_loss_clip": 1.00076127, + "balance_loss_mlp": 1.00480723, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7033646347458022, + "language_loss": 0.54444003, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56471717, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.20800781, + "step": 10579, + "time_per_iteration": 3.1232736110687256 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.01612449, + "balance_loss_mlp": 1.0368464, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.7171447811267215, + "language_loss": 0.77475232, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79608917, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10580, + "time_per_iteration": 2.461869239807129 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.0138669, + "balance_loss_mlp": 1.03540814, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.098056730123376, + "language_loss": 0.67005563, + "learning_rate": 1.235037946268301e-06, + "loss": 0.69134021, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10581, + "time_per_iteration": 2.4425008296966553 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01949584, + "balance_loss_mlp": 1.03480268, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.3074505079001684, + "language_loss": 0.68299043, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70432162, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 10582, + "time_per_iteration": 2.4763622283935547 + }, + { + "auxiliary_loss_clip": 0.01106848, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.02355933, + "balance_loss_mlp": 1.03695726, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 2.267802402035549, + "language_loss": 0.84247005, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.8638941, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 10583, + "time_per_iteration": 2.4797277450561523 + }, + { + "auxiliary_loss_clip": 0.0110538, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03860188, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.5650473008286672, + "language_loss": 0.7515592, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77290452, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 10584, + "time_per_iteration": 2.430316209793091 + }, + { + "auxiliary_loss_clip": 0.01106996, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.02420747, + "balance_loss_mlp": 1.03688443, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.9066305180241776, + "language_loss": 0.72856915, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.75001895, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 10585, + "time_per_iteration": 2.4419803619384766 + }, + { + "auxiliary_loss_clip": 0.01105577, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.01756358, + "balance_loss_mlp": 1.03718138, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.8332276657421747, + "language_loss": 0.82785809, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.8492018, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 10586, + "time_per_iteration": 2.421600341796875 + }, + { + "auxiliary_loss_clip": 0.01103874, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.01523161, + "balance_loss_mlp": 1.03603029, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 4.704421092048837, + "language_loss": 0.72570878, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74701393, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10587, + "time_per_iteration": 2.472022533416748 + }, + { + "auxiliary_loss_clip": 0.0110564, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.01379192, + "balance_loss_mlp": 1.03764784, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 1.7915085469286844, + "language_loss": 0.76668859, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.7879954, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 10588, + "time_per_iteration": 2.4190168380737305 + }, + { + "auxiliary_loss_clip": 0.01102746, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.01552689, + "balance_loss_mlp": 1.03755879, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.3970993827847034, + "language_loss": 0.79966116, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82096231, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 10589, + "time_per_iteration": 2.4743268489837646 + }, + { + "auxiliary_loss_clip": 0.01104028, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.01623118, + "balance_loss_mlp": 1.03771806, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.311775126826065, + "language_loss": 0.67541653, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69673812, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 10590, + "time_per_iteration": 2.450011730194092 + }, + { + "auxiliary_loss_clip": 0.0111127, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.01871789, + "balance_loss_mlp": 1.03779423, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6364871188688683, + "language_loss": 0.79574269, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.8171702, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 10591, + "time_per_iteration": 2.4351706504821777 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.0155071, + "balance_loss_mlp": 1.03807092, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.6582489812189014, + "language_loss": 0.8898353, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91114426, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 10592, + "time_per_iteration": 2.4826667308807373 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.01484966, + "balance_loss_mlp": 1.03570986, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.2873763192716858, + "language_loss": 0.68307251, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70434421, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 10593, + "time_per_iteration": 3.870232105255127 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.01887894, + "balance_loss_mlp": 1.03454375, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.9223941478023494, + "language_loss": 0.63311636, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.6544379, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10594, + "time_per_iteration": 2.5456788539886475 + }, + { + "auxiliary_loss_clip": 0.01026012, + "auxiliary_loss_mlp": 0.00997701, + "balance_loss_clip": 0.99638408, + "balance_loss_mlp": 1.0052495, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7601242064241133, + "language_loss": 0.54636633, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56660342, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20703125, + "step": 10595, + "time_per_iteration": 3.1794607639312744 + }, + { + "auxiliary_loss_clip": 0.01107322, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.03732097, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.701168717319966, + "language_loss": 0.6690321, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69044465, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 10596, + "time_per_iteration": 5.259617328643799 + }, + { + "auxiliary_loss_clip": 0.01105102, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.03692877, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.3148419368361686, + "language_loss": 0.78864521, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80998278, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10597, + "time_per_iteration": 3.8967549800872803 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.01746917, + "balance_loss_mlp": 1.03798401, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.7226875807463897, + "language_loss": 0.7490381, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77037644, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 10598, + "time_per_iteration": 2.426950693130493 + }, + { + "auxiliary_loss_clip": 0.01105339, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.02002072, + "balance_loss_mlp": 1.03712225, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.827624008719727, + "language_loss": 0.68324673, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70461518, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 10599, + "time_per_iteration": 2.3905580043792725 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.0198344, + "balance_loss_mlp": 1.03601742, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.01568733519361, + "language_loss": 0.80380464, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82518673, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10600, + "time_per_iteration": 2.390493631362915 + }, + { + "auxiliary_loss_clip": 0.01102518, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.01701963, + "balance_loss_mlp": 1.03515601, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.447681041520347, + "language_loss": 0.79922855, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82053661, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 10601, + "time_per_iteration": 2.4929754734039307 + }, + { + "auxiliary_loss_clip": 0.01105771, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.01460528, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 1.837610857942547, + "language_loss": 0.66878605, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69010651, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 10602, + "time_per_iteration": 2.467132091522217 + }, + { + "auxiliary_loss_clip": 0.01101843, + "auxiliary_loss_mlp": 0.01022562, + "balance_loss_clip": 1.01127887, + "balance_loss_mlp": 1.03530014, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.9668253771039714, + "language_loss": 0.79456556, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81580961, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 10603, + "time_per_iteration": 2.439401149749756 + }, + { + "auxiliary_loss_clip": 0.01103337, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01447129, + "balance_loss_mlp": 1.03550994, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.0023670291582034, + "language_loss": 0.76751029, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78881085, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 10604, + "time_per_iteration": 2.4105138778686523 + }, + { + "auxiliary_loss_clip": 0.01108604, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.02013552, + "balance_loss_mlp": 1.03714681, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.780209303316883, + "language_loss": 0.77448142, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79589069, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 10605, + "time_per_iteration": 2.4292843341827393 + }, + { + "auxiliary_loss_clip": 0.01105536, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01408625, + "balance_loss_mlp": 1.03642416, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.9363320912621251, + "language_loss": 0.65341508, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67473698, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 10606, + "time_per_iteration": 2.427497625350952 + }, + { + "auxiliary_loss_clip": 0.01101905, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.02189994, + "balance_loss_mlp": 1.03686523, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.731945960339434, + "language_loss": 0.75044298, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77179325, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10607, + "time_per_iteration": 2.446707248687744 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.03733909, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.7077896003554156, + "language_loss": 0.65732801, + "learning_rate": 1.225332659627278e-06, + "loss": 0.67875481, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10608, + "time_per_iteration": 2.7172274589538574 + }, + { + "auxiliary_loss_clip": 0.01026098, + "auxiliary_loss_mlp": 0.01010909, + "balance_loss_clip": 1.00953197, + "balance_loss_mlp": 1.00546312, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7342720172803939, + "language_loss": 0.51933324, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53970337, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.20703125, + "step": 10609, + "time_per_iteration": 3.038902759552002 + }, + { + "auxiliary_loss_clip": 0.01099294, + "auxiliary_loss_mlp": 0.01023726, + "balance_loss_clip": 1.01322937, + "balance_loss_mlp": 1.03415811, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.5171992119631734, + "language_loss": 0.74632645, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76755667, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65234375, + "step": 10610, + "time_per_iteration": 2.472832202911377 + }, + { + "auxiliary_loss_clip": 0.0102568, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00496507, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8614298288544585, + "language_loss": 0.63198531, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65227467, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.20703125, + "step": 10611, + "time_per_iteration": 3.118346691131592 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.02351391, + "balance_loss_mlp": 1.03604293, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.12180371585039, + "language_loss": 0.72335958, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74475813, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 10612, + "time_per_iteration": 2.5017549991607666 + }, + { + "auxiliary_loss_clip": 0.01025775, + "auxiliary_loss_mlp": 0.01000915, + "balance_loss_clip": 0.99957991, + "balance_loss_mlp": 1.00507379, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7184948556551517, + "language_loss": 0.57873541, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.5990023, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.20703125, + "step": 10613, + "time_per_iteration": 2.9799587726593018 + }, + { + "auxiliary_loss_clip": 0.01107464, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.01890481, + "balance_loss_mlp": 1.03777075, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7105248760789145, + "language_loss": 0.75128651, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77267975, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10614, + "time_per_iteration": 2.491565465927124 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.02311897, + "balance_loss_mlp": 1.03886855, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.84751826433944, + "language_loss": 0.79666638, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81808209, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10615, + "time_per_iteration": 2.492230176925659 + }, + { + "auxiliary_loss_clip": 0.0102549, + "auxiliary_loss_mlp": 0.01004342, + "balance_loss_clip": 1.00296533, + "balance_loss_mlp": 1.00477338, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6538614969335592, + "language_loss": 0.55591351, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57621187, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.20703125, + "step": 10616, + "time_per_iteration": 3.1426796913146973 + }, + { + "auxiliary_loss_clip": 0.01103937, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.01841819, + "balance_loss_mlp": 1.03616679, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.9069966042725246, + "language_loss": 0.83733106, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.85867131, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10617, + "time_per_iteration": 2.4153995513916016 + }, + { + "auxiliary_loss_clip": 0.01104997, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.0218631, + "balance_loss_mlp": 1.03582323, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.8815450583884574, + "language_loss": 0.87111914, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89251137, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 10618, + "time_per_iteration": 2.4547295570373535 + }, + { + "auxiliary_loss_clip": 0.01108351, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.02284503, + "balance_loss_mlp": 1.03887093, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.8631596079726758, + "language_loss": 0.73287827, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75429678, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6953125, + "step": 10619, + "time_per_iteration": 2.4028847217559814 + }, + { + "auxiliary_loss_clip": 0.01110376, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02090454, + "balance_loss_mlp": 1.03807545, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 1.9227827130097541, + "language_loss": 0.76158774, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78302789, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 10620, + "time_per_iteration": 2.4420766830444336 + }, + { + "auxiliary_loss_clip": 0.01104115, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.01349616, + "balance_loss_mlp": 1.03697598, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 2.4243704084161806, + "language_loss": 0.70476806, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.7260617, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 10621, + "time_per_iteration": 2.474518299102783 + }, + { + "auxiliary_loss_clip": 0.01098545, + "auxiliary_loss_mlp": 0.0102422, + "balance_loss_clip": 1.01336575, + "balance_loss_mlp": 1.03505826, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.540938795838808, + "language_loss": 0.77551067, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79673827, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 10622, + "time_per_iteration": 2.4603724479675293 + }, + { + "auxiliary_loss_clip": 0.01102358, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.0161562, + "balance_loss_mlp": 1.0359875, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.7317763854255814, + "language_loss": 0.7494216, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77071846, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10623, + "time_per_iteration": 2.4120795726776123 + }, + { + "auxiliary_loss_clip": 0.01098287, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.01879287, + "balance_loss_mlp": 1.03354859, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.6666297183957082, + "language_loss": 0.76487082, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78614771, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 10624, + "time_per_iteration": 2.4929676055908203 + }, + { + "auxiliary_loss_clip": 0.0110372, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01597917, + "balance_loss_mlp": 1.03606153, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.639287347980187, + "language_loss": 0.80685896, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82816517, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 10625, + "time_per_iteration": 2.4569015502929688 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.03810406, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.744297621070212, + "language_loss": 0.72630143, + "learning_rate": 1.218874349031654e-06, + "loss": 0.74767131, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10626, + "time_per_iteration": 2.441058397293091 + }, + { + "auxiliary_loss_clip": 0.01104529, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.01703739, + "balance_loss_mlp": 1.03571403, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.7246902612727075, + "language_loss": 0.72518885, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74652737, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10627, + "time_per_iteration": 2.40901780128479 + }, + { + "auxiliary_loss_clip": 0.01108886, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.0163188, + "balance_loss_mlp": 1.03729248, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 2.244776770999307, + "language_loss": 0.67281765, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69420648, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 10628, + "time_per_iteration": 2.5263736248016357 + }, + { + "auxiliary_loss_clip": 0.01100861, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01536465, + "balance_loss_mlp": 1.03674936, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.8036287880835562, + "language_loss": 0.67833781, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.69961035, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 10629, + "time_per_iteration": 2.477262258529663 + }, + { + "auxiliary_loss_clip": 0.01110269, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02683949, + "balance_loss_mlp": 1.03733897, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.4737896174832923, + "language_loss": 0.75127286, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77277935, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 10630, + "time_per_iteration": 2.4760096073150635 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.01984, + "balance_loss_mlp": 1.03617334, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.5423208876827523, + "language_loss": 0.70398533, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.7253077, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 10631, + "time_per_iteration": 2.452275514602661 + }, + { + "auxiliary_loss_clip": 0.01023775, + "auxiliary_loss_mlp": 0.00996899, + "balance_loss_clip": 0.99556983, + "balance_loss_mlp": 1.00307584, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7719101864922713, + "language_loss": 0.63005149, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.6502583, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.20703125, + "step": 10632, + "time_per_iteration": 3.1005401611328125 + }, + { + "auxiliary_loss_clip": 0.01101477, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.01642942, + "balance_loss_mlp": 1.03553295, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 2.062081508069593, + "language_loss": 0.66411757, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68541509, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 10633, + "time_per_iteration": 2.4561798572540283 + }, + { + "auxiliary_loss_clip": 0.0110405, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.03670645, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 2.980251338642478, + "language_loss": 0.81779587, + "learning_rate": 1.216007064569225e-06, + "loss": 0.8391099, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 10634, + "time_per_iteration": 2.4740054607391357 + }, + { + "auxiliary_loss_clip": 0.01104597, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.01801491, + "balance_loss_mlp": 1.03732753, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.7668249879195463, + "language_loss": 0.75268984, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77403939, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 10635, + "time_per_iteration": 3.8579487800598145 + }, + { + "auxiliary_loss_clip": 0.01103838, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.03555012, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.8856871240472837, + "language_loss": 0.71665233, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73799634, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 10636, + "time_per_iteration": 2.4976108074188232 + }, + { + "auxiliary_loss_clip": 0.01106058, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01808596, + "balance_loss_mlp": 1.03683591, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 4.067899624402538, + "language_loss": 0.7341159, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75547898, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 10637, + "time_per_iteration": 2.4985272884368896 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.0159924, + "balance_loss_mlp": 1.03592014, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.8415469934331217, + "language_loss": 0.77680337, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79814142, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 10638, + "time_per_iteration": 5.310981035232544 + }, + { + "auxiliary_loss_clip": 0.01102761, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.016675, + "balance_loss_mlp": 1.0358299, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 3.6995147498561636, + "language_loss": 0.81817627, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83948827, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 10639, + "time_per_iteration": 3.956713914871216 + }, + { + "auxiliary_loss_clip": 0.01024264, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.00012457, + "balance_loss_mlp": 1.00365281, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8122323276395823, + "language_loss": 0.59012806, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61038566, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.20605469, + "step": 10640, + "time_per_iteration": 3.01208758354187 + }, + { + "auxiliary_loss_clip": 0.01100429, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.03550696, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.7939599084799007, + "language_loss": 0.78193939, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80319822, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 10641, + "time_per_iteration": 2.399609327316284 + }, + { + "auxiliary_loss_clip": 0.01108702, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.03676474, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 22.013815914762134, + "language_loss": 0.63092768, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65236264, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 10642, + "time_per_iteration": 2.4959514141082764 + }, + { + "auxiliary_loss_clip": 0.01024704, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.00239646, + "balance_loss_mlp": 1.00392115, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.944530378795617, + "language_loss": 0.55960983, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57989401, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20800781, + "step": 10643, + "time_per_iteration": 2.9914019107818604 + }, + { + "auxiliary_loss_clip": 0.01108117, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01332903, + "balance_loss_mlp": 1.03745127, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 2.5171801924474764, + "language_loss": 0.77069736, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79202974, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 10644, + "time_per_iteration": 2.437391996383667 + }, + { + "auxiliary_loss_clip": 0.01104463, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.01893854, + "balance_loss_mlp": 1.03780031, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.4086380930188218, + "language_loss": 0.82438183, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84574032, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 10645, + "time_per_iteration": 2.4806745052337646 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.0219593, + "balance_loss_mlp": 1.03747869, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.670748165032705, + "language_loss": 0.73261863, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75405383, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10646, + "time_per_iteration": 2.515089988708496 + }, + { + "auxiliary_loss_clip": 0.01105459, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.01905167, + "balance_loss_mlp": 1.03657353, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.6918825179848747, + "language_loss": 0.79892278, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82028854, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 10647, + "time_per_iteration": 2.423576593399048 + }, + { + "auxiliary_loss_clip": 0.01102623, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.03732038, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 2.4485135437848724, + "language_loss": 0.75737441, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77869105, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 10648, + "time_per_iteration": 2.528726100921631 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.01779962, + "balance_loss_mlp": 1.03479123, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 1.7767786509202286, + "language_loss": 0.78653902, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80786711, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10649, + "time_per_iteration": 2.4528145790100098 + }, + { + "auxiliary_loss_clip": 0.01103744, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.02100444, + "balance_loss_mlp": 1.03651512, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.6464773742271148, + "language_loss": 0.75819784, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77956951, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 10650, + "time_per_iteration": 2.4333670139312744 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01803422, + "balance_loss_mlp": 1.0351758, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.4678123102603653, + "language_loss": 0.70750296, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72883749, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 10651, + "time_per_iteration": 2.4399240016937256 + }, + { + "auxiliary_loss_clip": 0.01104316, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02509403, + "balance_loss_mlp": 1.03600538, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.264038346674132, + "language_loss": 0.63932753, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.66074908, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 10652, + "time_per_iteration": 2.4656026363372803 + }, + { + "auxiliary_loss_clip": 0.01104729, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01769614, + "balance_loss_mlp": 1.03726971, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.2063618593971586, + "language_loss": 0.79597425, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81731927, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10653, + "time_per_iteration": 2.4099206924438477 + }, + { + "auxiliary_loss_clip": 0.01113277, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.03203726, + "balance_loss_mlp": 1.03744364, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.172692455677744, + "language_loss": 0.69950652, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72109628, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 10654, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01108717, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.02347147, + "balance_loss_mlp": 1.03717566, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.7648347923503578, + "language_loss": 0.72763705, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74908626, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 10655, + "time_per_iteration": 2.4311604499816895 + }, + { + "auxiliary_loss_clip": 0.01106611, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.0368948, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.5980795641640981, + "language_loss": 0.83070755, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.85210717, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 10656, + "time_per_iteration": 2.5178308486938477 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.02264667, + "balance_loss_mlp": 1.03502929, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.258129795094631, + "language_loss": 0.72108161, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74245739, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 10657, + "time_per_iteration": 2.422863483428955 + }, + { + "auxiliary_loss_clip": 0.01103006, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.02086604, + "balance_loss_mlp": 1.03499269, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.427174353089587, + "language_loss": 0.7728945, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79424977, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10658, + "time_per_iteration": 2.449277877807617 + }, + { + "auxiliary_loss_clip": 0.01108084, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.02629066, + "balance_loss_mlp": 1.03781724, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 1.5608188078670746, + "language_loss": 0.7607885, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78226012, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10659, + "time_per_iteration": 2.4464104175567627 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.02172303, + "balance_loss_mlp": 1.03568363, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.6810966877518245, + "language_loss": 0.78276753, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10660, + "time_per_iteration": 2.463932752609253 + }, + { + "auxiliary_loss_clip": 0.01110744, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.023785, + "balance_loss_mlp": 1.03830671, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 2.1049933789165727, + "language_loss": 0.68227595, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70375443, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 10661, + "time_per_iteration": 2.4437673091888428 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02566016, + "balance_loss_mlp": 1.0374167, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.4944389143541703, + "language_loss": 0.75839317, + "learning_rate": 1.205986598033362e-06, + "loss": 0.77979672, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10662, + "time_per_iteration": 2.4985625743865967 + }, + { + "auxiliary_loss_clip": 0.01102338, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.01905084, + "balance_loss_mlp": 1.03421175, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.8768391350540305, + "language_loss": 0.69502836, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.71635342, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10663, + "time_per_iteration": 2.479556083679199 + }, + { + "auxiliary_loss_clip": 0.01106696, + "auxiliary_loss_mlp": 0.01040197, + "balance_loss_clip": 1.02654743, + "balance_loss_mlp": 1.03724718, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.235560561918587, + "language_loss": 0.68056524, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70203424, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 10664, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01101883, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01797128, + "balance_loss_mlp": 1.03587985, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.8443375686405623, + "language_loss": 0.66447258, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68578362, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10665, + "time_per_iteration": 2.4581611156463623 + }, + { + "auxiliary_loss_clip": 0.01102013, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.01801181, + "balance_loss_mlp": 1.03565812, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.9911859706917303, + "language_loss": 0.64523447, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66655302, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10666, + "time_per_iteration": 2.4770736694335938 + }, + { + "auxiliary_loss_clip": 0.01105742, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.01933265, + "balance_loss_mlp": 1.03609776, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.666384333420834, + "language_loss": 0.7067616, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.72813338, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10667, + "time_per_iteration": 2.407938003540039 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.02077127, + "balance_loss_mlp": 1.03901672, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.2700946721922874, + "language_loss": 0.77413416, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79562223, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 10668, + "time_per_iteration": 2.421332836151123 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01034123, + "balance_loss_clip": 1.02199399, + "balance_loss_mlp": 1.0376507, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.6100109548180268, + "language_loss": 0.67520595, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69660217, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 10669, + "time_per_iteration": 2.426586866378784 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.02510333, + "balance_loss_mlp": 1.03997803, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.7319389151723867, + "language_loss": 0.78258085, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80408102, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 10670, + "time_per_iteration": 2.469668388366699 + }, + { + "auxiliary_loss_clip": 0.01108443, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.02200222, + "balance_loss_mlp": 1.03697228, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.2181025019747445, + "language_loss": 0.88322049, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90465117, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 10671, + "time_per_iteration": 2.391927480697632 + }, + { + "auxiliary_loss_clip": 0.01103513, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.01708126, + "balance_loss_mlp": 1.03752613, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.4861712883005815, + "language_loss": 0.69451904, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71583843, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 10672, + "time_per_iteration": 2.4214959144592285 + }, + { + "auxiliary_loss_clip": 0.01109224, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.01554513, + "balance_loss_mlp": 1.03705025, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.748656888764651, + "language_loss": 0.7392627, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76064527, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10673, + "time_per_iteration": 2.4611282348632812 + }, + { + "auxiliary_loss_clip": 0.01104131, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.0213933, + "balance_loss_mlp": 1.03559685, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 1.56139787015984, + "language_loss": 0.69352114, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71490324, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 10674, + "time_per_iteration": 2.5161702632904053 + }, + { + "auxiliary_loss_clip": 0.01109387, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01650524, + "balance_loss_mlp": 1.03618658, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 1.8510668186633226, + "language_loss": 0.66126549, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68265229, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 10675, + "time_per_iteration": 2.4155290126800537 + }, + { + "auxiliary_loss_clip": 0.01106276, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.03823316, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 2.2027244466364486, + "language_loss": 0.66607732, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68748927, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10676, + "time_per_iteration": 2.490659713745117 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.01685786, + "balance_loss_mlp": 1.03876162, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 2.097581634404412, + "language_loss": 0.75956476, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.7809552, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 10677, + "time_per_iteration": 3.9567973613739014 + }, + { + "auxiliary_loss_clip": 0.01026179, + "auxiliary_loss_mlp": 0.00997901, + "balance_loss_clip": 0.99666101, + "balance_loss_mlp": 1.00533533, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.8065212839738138, + "language_loss": 0.60730147, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62754226, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.20898438, + "step": 10678, + "time_per_iteration": 3.13420033454895 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02209604, + "balance_loss_mlp": 1.03742027, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.6963549464247227, + "language_loss": 0.67299467, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69436979, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 10679, + "time_per_iteration": 3.808528423309326 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.01666307, + "balance_loss_mlp": 1.03855729, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.6996500318605585, + "language_loss": 0.72910142, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75048327, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10680, + "time_per_iteration": 3.8477213382720947 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.01612723, + "balance_loss_mlp": 1.03545952, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.7403495519820134, + "language_loss": 0.67876667, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70006758, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 10681, + "time_per_iteration": 3.919956922531128 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01748252, + "balance_loss_mlp": 1.03434682, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.5976000618825759, + "language_loss": 0.74644732, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76775151, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10682, + "time_per_iteration": 2.4222958087921143 + }, + { + "auxiliary_loss_clip": 0.01099045, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.01960802, + "balance_loss_mlp": 1.0343014, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.48329541818395, + "language_loss": 0.79282379, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81412017, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 10683, + "time_per_iteration": 2.4635698795318604 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.0240351, + "balance_loss_mlp": 1.03679943, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.6292181520500175, + "language_loss": 0.67376101, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69518769, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 10684, + "time_per_iteration": 2.41907000541687 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01595724, + "balance_loss_mlp": 1.03616428, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.2028301911766976, + "language_loss": 0.71436971, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73569536, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 10685, + "time_per_iteration": 2.471905469894409 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.0360837, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.599317002960078, + "language_loss": 0.75343961, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77478087, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 10686, + "time_per_iteration": 2.4540653228759766 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.02034068, + "balance_loss_mlp": 1.03813434, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.1016215045747684, + "language_loss": 0.6875909, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70900756, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 10687, + "time_per_iteration": 2.4797768592834473 + }, + { + "auxiliary_loss_clip": 0.01107085, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.01875103, + "balance_loss_mlp": 1.03763437, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 2.024359307432863, + "language_loss": 0.66338682, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68476355, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 10688, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.01103677, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.01751018, + "balance_loss_mlp": 1.03563595, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.8678327137671962, + "language_loss": 0.73044169, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75177526, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10689, + "time_per_iteration": 2.442413806915283 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.01890218, + "balance_loss_mlp": 1.03588271, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.3454318131191485, + "language_loss": 0.72232103, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74363381, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.66015625, + "step": 10690, + "time_per_iteration": 2.471299648284912 + }, + { + "auxiliary_loss_clip": 0.01102076, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01679587, + "balance_loss_mlp": 1.03589702, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.751175955717072, + "language_loss": 0.77973688, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80104017, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 10691, + "time_per_iteration": 2.405625581741333 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02516222, + "balance_loss_mlp": 1.03734851, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.7365524827328973, + "language_loss": 0.74180853, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76325125, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 10692, + "time_per_iteration": 2.4545161724090576 + }, + { + "auxiliary_loss_clip": 0.01105895, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.02066517, + "balance_loss_mlp": 1.03752697, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.783950417735838, + "language_loss": 0.61135745, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63273877, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 10693, + "time_per_iteration": 2.443671464920044 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.01348996, + "balance_loss_mlp": 1.03660131, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.580411610275865, + "language_loss": 0.59667271, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61800897, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10694, + "time_per_iteration": 2.539658308029175 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.0150162, + "balance_loss_mlp": 1.03852546, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.3391279253609552, + "language_loss": 0.79716361, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.81850976, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 10695, + "time_per_iteration": 2.4294402599334717 + }, + { + "auxiliary_loss_clip": 0.01105962, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.02474165, + "balance_loss_mlp": 1.03641272, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 2.3258756947072112, + "language_loss": 0.73518264, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75661093, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10696, + "time_per_iteration": 2.502713441848755 + }, + { + "auxiliary_loss_clip": 0.01102941, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.01575983, + "balance_loss_mlp": 1.03764093, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 2.7414253907465636, + "language_loss": 0.7579782, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77927744, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10697, + "time_per_iteration": 2.4447250366210938 + }, + { + "auxiliary_loss_clip": 0.01102432, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01867306, + "balance_loss_mlp": 1.03607345, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.4042502284177218, + "language_loss": 0.6627214, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68404424, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10698, + "time_per_iteration": 2.585150718688965 + }, + { + "auxiliary_loss_clip": 0.01025803, + "auxiliary_loss_mlp": 0.01004446, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00514603, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8344250970478979, + "language_loss": 0.63460743, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65490991, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.20703125, + "step": 10699, + "time_per_iteration": 3.024700403213501 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.01535618, + "balance_loss_mlp": 1.03705359, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.912981795070525, + "language_loss": 0.6912387, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71253234, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 10700, + "time_per_iteration": 2.4683825969696045 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01447868, + "balance_loss_mlp": 1.03590679, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.7070737124865907, + "language_loss": 0.73354918, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75484824, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 10701, + "time_per_iteration": 2.4831302165985107 + }, + { + "auxiliary_loss_clip": 0.01105062, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.01542449, + "balance_loss_mlp": 1.03474069, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 1.878097796503538, + "language_loss": 0.81941777, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84075147, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10702, + "time_per_iteration": 2.468240261077881 + }, + { + "auxiliary_loss_clip": 0.01101591, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03552771, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 1.8640854274416083, + "language_loss": 0.74179298, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76314807, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 10703, + "time_per_iteration": 2.413569688796997 + }, + { + "auxiliary_loss_clip": 0.01025343, + "auxiliary_loss_mlp": 0.00998028, + "balance_loss_clip": 0.99697268, + "balance_loss_mlp": 1.0046978, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6508795205779913, + "language_loss": 0.54642779, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56666148, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20703125, + "step": 10704, + "time_per_iteration": 3.0236172676086426 + }, + { + "auxiliary_loss_clip": 0.01102168, + "auxiliary_loss_mlp": 0.01022828, + "balance_loss_clip": 1.01258826, + "balance_loss_mlp": 1.03476024, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.5730519252717787, + "language_loss": 0.76976264, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79101259, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.67578125, + "step": 10705, + "time_per_iteration": 2.455488443374634 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.02475905, + "balance_loss_mlp": 1.0358727, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.7440813911831818, + "language_loss": 0.7908684, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81227219, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 10706, + "time_per_iteration": 2.449542760848999 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01926923, + "balance_loss_mlp": 1.0355916, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.1755935090023164, + "language_loss": 0.80497181, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82632756, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10707, + "time_per_iteration": 2.416238784790039 + }, + { + "auxiliary_loss_clip": 0.01103614, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.01691961, + "balance_loss_mlp": 1.03542554, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.7933979371525552, + "language_loss": 0.85400867, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87533092, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10708, + "time_per_iteration": 2.4596221446990967 + }, + { + "auxiliary_loss_clip": 0.01113539, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.02577186, + "balance_loss_mlp": 1.03982544, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.314624765830387, + "language_loss": 0.65632617, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67785281, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 10709, + "time_per_iteration": 2.414792776107788 + }, + { + "auxiliary_loss_clip": 0.01101587, + "auxiliary_loss_mlp": 0.01027315, + "balance_loss_clip": 1.01617527, + "balance_loss_mlp": 1.03474462, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 2.1321707309255196, + "language_loss": 0.80428755, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.8255766, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 10710, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01101022, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.01573479, + "balance_loss_mlp": 1.03475547, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.6745994206662376, + "language_loss": 0.66166174, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68294716, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10711, + "time_per_iteration": 2.502237319946289 + }, + { + "auxiliary_loss_clip": 0.0110763, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.0150764, + "balance_loss_mlp": 1.03871155, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.56251052314253, + "language_loss": 0.78744113, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80879122, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 10712, + "time_per_iteration": 2.4865529537200928 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.0215044, + "balance_loss_mlp": 1.03622448, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.5577972768959576, + "language_loss": 0.82686722, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84827155, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 10713, + "time_per_iteration": 2.4358584880828857 + }, + { + "auxiliary_loss_clip": 0.01101375, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.02368593, + "balance_loss_mlp": 1.03669071, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.4453495865190145, + "language_loss": 0.78343773, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80480266, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10714, + "time_per_iteration": 2.4839279651641846 + }, + { + "auxiliary_loss_clip": 0.01102157, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.02021146, + "balance_loss_mlp": 1.03580499, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.5121330908882218, + "language_loss": 0.81442875, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83576298, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10715, + "time_per_iteration": 2.4751946926116943 + }, + { + "auxiliary_loss_clip": 0.01103061, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01841354, + "balance_loss_mlp": 1.03416896, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 1.858940461069926, + "language_loss": 0.81107575, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83240604, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10716, + "time_per_iteration": 2.506404161453247 + }, + { + "auxiliary_loss_clip": 0.01106307, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02118576, + "balance_loss_mlp": 1.03650761, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.7227597977263103, + "language_loss": 0.77839047, + "learning_rate": 1.186372540666424e-06, + "loss": 0.79979855, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 10717, + "time_per_iteration": 2.4654810428619385 + }, + { + "auxiliary_loss_clip": 0.01102271, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.01759315, + "balance_loss_mlp": 1.03718793, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.6109335148111539, + "language_loss": 0.68141425, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70272195, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 10718, + "time_per_iteration": 3.9740819931030273 + }, + { + "auxiliary_loss_clip": 0.0102484, + "auxiliary_loss_mlp": 0.01004792, + "balance_loss_clip": 1.00373709, + "balance_loss_mlp": 1.00415778, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7588040526175028, + "language_loss": 0.49665093, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51694727, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20703125, + "step": 10719, + "time_per_iteration": 3.2171850204467773 + }, + { + "auxiliary_loss_clip": 0.01108486, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03927541, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 2.0805005783182415, + "language_loss": 0.78263915, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80406547, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10720, + "time_per_iteration": 2.434246301651001 + }, + { + "auxiliary_loss_clip": 0.01102308, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.02056015, + "balance_loss_mlp": 1.03513026, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.6971626147342385, + "language_loss": 0.76729137, + "learning_rate": 1.18494967730604e-06, + "loss": 0.78864217, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 10721, + "time_per_iteration": 5.301208734512329 + }, + { + "auxiliary_loss_clip": 0.01102301, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.01885331, + "balance_loss_mlp": 1.03417397, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.4666147768058, + "language_loss": 0.73236001, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75369453, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 10722, + "time_per_iteration": 3.918328046798706 + }, + { + "auxiliary_loss_clip": 0.01101304, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01602662, + "balance_loss_mlp": 1.03587341, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 2.1714179391362074, + "language_loss": 0.78181046, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80309272, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 10723, + "time_per_iteration": 2.485879421234131 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.01842475, + "balance_loss_mlp": 1.03774381, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.8069876028647023, + "language_loss": 0.58755672, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60894638, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 10724, + "time_per_iteration": 2.478766679763794 + }, + { + "auxiliary_loss_clip": 0.0110091, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.02043772, + "balance_loss_mlp": 1.03629243, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.6726755912827203, + "language_loss": 0.83442616, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85574543, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 10725, + "time_per_iteration": 2.4473166465759277 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02134895, + "balance_loss_mlp": 1.03365588, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.7120227863307491, + "language_loss": 0.82104886, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84239936, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10726, + "time_per_iteration": 2.4571003913879395 + }, + { + "auxiliary_loss_clip": 0.01106369, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.01877189, + "balance_loss_mlp": 1.03662455, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 3.203326603634113, + "language_loss": 0.80919254, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83056766, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10727, + "time_per_iteration": 2.4684529304504395 + }, + { + "auxiliary_loss_clip": 0.01109129, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.01939309, + "balance_loss_mlp": 1.03661084, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.8311253143889514, + "language_loss": 0.7950902, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81649995, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 10728, + "time_per_iteration": 2.420926094055176 + }, + { + "auxiliary_loss_clip": 0.0110447, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02040493, + "balance_loss_mlp": 1.03509378, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.688837212564324, + "language_loss": 0.74242163, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76379651, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10729, + "time_per_iteration": 2.5284883975982666 + }, + { + "auxiliary_loss_clip": 0.01105519, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.02004814, + "balance_loss_mlp": 1.03606546, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.7461235371462989, + "language_loss": 0.66486406, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68624759, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10730, + "time_per_iteration": 2.472608804702759 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.02041864, + "balance_loss_mlp": 1.03703654, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.5067900334591022, + "language_loss": 0.63581085, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65719867, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 10731, + "time_per_iteration": 2.446270704269409 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.0236578, + "balance_loss_mlp": 1.03637123, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.5914748736963724, + "language_loss": 0.67864686, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70003414, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10732, + "time_per_iteration": 2.4132513999938965 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02102399, + "balance_loss_mlp": 1.03576565, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.9402611085528685, + "language_loss": 0.75528163, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77662778, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 10733, + "time_per_iteration": 2.481633186340332 + }, + { + "auxiliary_loss_clip": 0.01109224, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.02552605, + "balance_loss_mlp": 1.03813672, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.733255021176503, + "language_loss": 0.65421891, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.67569232, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 10734, + "time_per_iteration": 2.458852529525757 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02868783, + "balance_loss_mlp": 1.03828883, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.35360500847906, + "language_loss": 0.7390331, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76045489, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 10735, + "time_per_iteration": 2.4310169219970703 + }, + { + "auxiliary_loss_clip": 0.01106342, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.02140272, + "balance_loss_mlp": 1.0381664, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.7357542776809323, + "language_loss": 0.74936789, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.77076226, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10736, + "time_per_iteration": 2.4535531997680664 + }, + { + "auxiliary_loss_clip": 0.01108598, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.01834321, + "balance_loss_mlp": 1.03880417, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 7.331374953548985, + "language_loss": 0.70983565, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.73123091, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10737, + "time_per_iteration": 2.455932855606079 + }, + { + "auxiliary_loss_clip": 0.01024539, + "auxiliary_loss_mlp": 0.0100647, + "balance_loss_clip": 1.00536776, + "balance_loss_mlp": 1.0038693, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7756134851395411, + "language_loss": 0.58466899, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.6049791, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20703125, + "step": 10738, + "time_per_iteration": 3.11362624168396 + }, + { + "auxiliary_loss_clip": 0.01101864, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01832366, + "balance_loss_mlp": 1.03569365, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.6796977264879835, + "language_loss": 0.7432248, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76454461, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 10739, + "time_per_iteration": 2.575263261795044 + }, + { + "auxiliary_loss_clip": 0.01106876, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.01638103, + "balance_loss_mlp": 1.03678012, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.7287512893442607, + "language_loss": 0.71253389, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.7338922, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10740, + "time_per_iteration": 2.4456567764282227 + }, + { + "auxiliary_loss_clip": 0.01024391, + "auxiliary_loss_mlp": 0.01001433, + "balance_loss_clip": 1.00019932, + "balance_loss_mlp": 1.00384283, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6634055191842134, + "language_loss": 0.55304271, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57330096, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20507812, + "step": 10741, + "time_per_iteration": 3.084655284881592 + }, + { + "auxiliary_loss_clip": 0.01100994, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.01909518, + "balance_loss_mlp": 1.03560328, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 4.469668504909254, + "language_loss": 0.80574667, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82705534, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 10742, + "time_per_iteration": 2.4683938026428223 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.01656711, + "balance_loss_mlp": 1.03513217, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.5091720275231448, + "language_loss": 0.81898236, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84026313, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10743, + "time_per_iteration": 2.4860422611236572 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01716936, + "balance_loss_mlp": 1.03643143, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.8283751590876323, + "language_loss": 0.72072589, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74204403, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10744, + "time_per_iteration": 2.401154041290283 + }, + { + "auxiliary_loss_clip": 0.01103143, + "auxiliary_loss_mlp": 0.01024823, + "balance_loss_clip": 1.01339674, + "balance_loss_mlp": 1.0362134, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.6832996887385467, + "language_loss": 0.66680956, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68808925, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10745, + "time_per_iteration": 2.648923635482788 + }, + { + "auxiliary_loss_clip": 0.01104749, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01873684, + "balance_loss_mlp": 1.03738117, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.3663753891536206, + "language_loss": 0.7367624, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75811714, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10746, + "time_per_iteration": 2.414886713027954 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.02040744, + "balance_loss_mlp": 1.03723145, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4238954510434034, + "language_loss": 0.66682059, + "learning_rate": 1.175713157660413e-06, + "loss": 0.6881963, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6875, + "step": 10747, + "time_per_iteration": 2.5016472339630127 + }, + { + "auxiliary_loss_clip": 0.01103964, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02568781, + "balance_loss_mlp": 1.03684711, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.577013961139599, + "language_loss": 0.66913009, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69053674, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10748, + "time_per_iteration": 2.442237615585327 + }, + { + "auxiliary_loss_clip": 0.0110688, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.0270108, + "balance_loss_mlp": 1.03662395, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.8120464443443396, + "language_loss": 0.76339692, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78486234, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10749, + "time_per_iteration": 2.4924192428588867 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.01845694, + "balance_loss_mlp": 1.03520691, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.7469795758698337, + "language_loss": 0.77112448, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79247028, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 10750, + "time_per_iteration": 2.4771273136138916 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.01831079, + "balance_loss_mlp": 1.0382905, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.0857387723701817, + "language_loss": 0.68225217, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70365262, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 10751, + "time_per_iteration": 2.5023088455200195 + }, + { + "auxiliary_loss_clip": 0.01103858, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.01730776, + "balance_loss_mlp": 1.03535843, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.6570772228110922, + "language_loss": 0.70823848, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.72957194, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10752, + "time_per_iteration": 2.4542946815490723 + }, + { + "auxiliary_loss_clip": 0.01106954, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.02100945, + "balance_loss_mlp": 1.03708041, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.7443402746921521, + "language_loss": 0.7799257, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80133951, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 10753, + "time_per_iteration": 2.414531707763672 + }, + { + "auxiliary_loss_clip": 0.01102943, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02636194, + "balance_loss_mlp": 1.03694177, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.5945794385803833, + "language_loss": 0.85284775, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87425733, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 10754, + "time_per_iteration": 2.4596917629241943 + }, + { + "auxiliary_loss_clip": 0.01102766, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.02160048, + "balance_loss_mlp": 1.03613544, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 1.9678569539088453, + "language_loss": 0.59384984, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61520755, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10755, + "time_per_iteration": 2.439668893814087 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.01808381, + "balance_loss_mlp": 1.03532171, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 5.126423165663523, + "language_loss": 0.67684507, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.69817215, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 10756, + "time_per_iteration": 2.405700206756592 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.019063, + "balance_loss_mlp": 1.03889596, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 3.0387860554111574, + "language_loss": 0.74348402, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76490277, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 10757, + "time_per_iteration": 2.4515702724456787 + }, + { + "auxiliary_loss_clip": 0.01102078, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01911271, + "balance_loss_mlp": 1.03616095, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.6276488646407918, + "language_loss": 0.74483991, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76616573, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10758, + "time_per_iteration": 2.4118669033050537 + }, + { + "auxiliary_loss_clip": 0.01104769, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.01735008, + "balance_loss_mlp": 1.03648281, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.54772879655888, + "language_loss": 0.67891282, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70025849, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 10759, + "time_per_iteration": 2.487632989883423 + }, + { + "auxiliary_loss_clip": 0.01106799, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.01868701, + "balance_loss_mlp": 1.03644943, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.881795853492405, + "language_loss": 0.75285017, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77422583, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10760, + "time_per_iteration": 3.9272162914276123 + }, + { + "auxiliary_loss_clip": 0.01100222, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.01540494, + "balance_loss_mlp": 1.03383064, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.5052354500877283, + "language_loss": 0.65392292, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67519075, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10761, + "time_per_iteration": 2.6993539333343506 + }, + { + "auxiliary_loss_clip": 0.01106456, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.03747368, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.1055667385281316, + "language_loss": 0.69732755, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71869099, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10762, + "time_per_iteration": 2.4523587226867676 + }, + { + "auxiliary_loss_clip": 0.0110606, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.02048469, + "balance_loss_mlp": 1.03625226, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.1633807412884343, + "language_loss": 0.82723743, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.8486222, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 10763, + "time_per_iteration": 5.290219306945801 + }, + { + "auxiliary_loss_clip": 0.01024866, + "auxiliary_loss_mlp": 0.01000313, + "balance_loss_clip": 0.99911511, + "balance_loss_mlp": 1.00423336, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7101546065504528, + "language_loss": 0.57767004, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59792185, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 10764, + "time_per_iteration": 4.603821277618408 + }, + { + "auxiliary_loss_clip": 0.01102286, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02277756, + "balance_loss_mlp": 1.03570485, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.8570193979841765, + "language_loss": 0.60458118, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62595057, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 10765, + "time_per_iteration": 2.545964002609253 + }, + { + "auxiliary_loss_clip": 0.01102593, + "auxiliary_loss_mlp": 0.01026242, + "balance_loss_clip": 1.01510835, + "balance_loss_mlp": 1.03606391, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.9785388674295172, + "language_loss": 0.63237435, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65366268, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10766, + "time_per_iteration": 2.4889070987701416 + }, + { + "auxiliary_loss_clip": 0.01103393, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0176481, + "balance_loss_mlp": 1.03667796, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.6243256535427835, + "language_loss": 0.75656283, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77789199, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10767, + "time_per_iteration": 2.506972551345825 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01970649, + "balance_loss_mlp": 1.03562641, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 2.0284924931052406, + "language_loss": 0.77826148, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79959053, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 10768, + "time_per_iteration": 2.4127895832061768 + }, + { + "auxiliary_loss_clip": 0.01102155, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.01648641, + "balance_loss_mlp": 1.03510022, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.6952390655728202, + "language_loss": 0.71920127, + "learning_rate": 1.167914135250663e-06, + "loss": 0.74050355, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10769, + "time_per_iteration": 2.4743292331695557 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01034947, + "balance_loss_clip": 1.02368212, + "balance_loss_mlp": 1.03668594, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9257555417687353, + "language_loss": 0.71907532, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74043512, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 10770, + "time_per_iteration": 2.423251152038574 + }, + { + "auxiliary_loss_clip": 0.01107379, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.01506996, + "balance_loss_mlp": 1.03676248, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.7207965836379309, + "language_loss": 0.73562384, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75697601, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10771, + "time_per_iteration": 2.498911142349243 + }, + { + "auxiliary_loss_clip": 0.01104798, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.01807988, + "balance_loss_mlp": 1.03799939, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.8994664849870517, + "language_loss": 0.7373805, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.75872564, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 10772, + "time_per_iteration": 2.4090960025787354 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.0196557, + "balance_loss_mlp": 1.03508711, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.4911839819427335, + "language_loss": 0.83115339, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85245723, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65625, + "step": 10773, + "time_per_iteration": 2.4857256412506104 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.01634526, + "balance_loss_mlp": 1.03509998, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.4644145421555252, + "language_loss": 0.78116065, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80243969, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 10774, + "time_per_iteration": 2.4285647869110107 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.02481055, + "balance_loss_mlp": 1.0360589, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.0390391270124986, + "language_loss": 0.68541199, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.70683241, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10775, + "time_per_iteration": 2.451076030731201 + }, + { + "auxiliary_loss_clip": 0.01106496, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.01700842, + "balance_loss_mlp": 1.03669178, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.740664481421832, + "language_loss": 0.65512002, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67647052, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 10776, + "time_per_iteration": 2.457409381866455 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.02126646, + "balance_loss_mlp": 1.03623533, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.790324273409248, + "language_loss": 0.78897285, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81036025, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10777, + "time_per_iteration": 2.425045967102051 + }, + { + "auxiliary_loss_clip": 0.01105443, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01792097, + "balance_loss_mlp": 1.03808999, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 2.190301315300799, + "language_loss": 0.73786491, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75921857, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10778, + "time_per_iteration": 2.459921360015869 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.01919854, + "balance_loss_mlp": 1.033885, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4021781865585379, + "language_loss": 0.77758849, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.79889071, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10779, + "time_per_iteration": 2.4944956302642822 + }, + { + "auxiliary_loss_clip": 0.0102552, + "auxiliary_loss_mlp": 0.0100081, + "balance_loss_clip": 0.99959451, + "balance_loss_mlp": 1.00497544, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7236484274239682, + "language_loss": 0.59404081, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61430413, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20507812, + "step": 10780, + "time_per_iteration": 3.0612237453460693 + }, + { + "auxiliary_loss_clip": 0.01101259, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01804209, + "balance_loss_mlp": 1.03493273, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 1.958027941262836, + "language_loss": 0.79607379, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81737804, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10781, + "time_per_iteration": 2.5239641666412354 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.01997757, + "balance_loss_mlp": 1.03791904, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 1.9679764489100238, + "language_loss": 0.78864902, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81005824, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 10782, + "time_per_iteration": 2.4253900051116943 + }, + { + "auxiliary_loss_clip": 0.01106515, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.01932609, + "balance_loss_mlp": 1.03778386, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.985749633483, + "language_loss": 0.63785768, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.65923923, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10783, + "time_per_iteration": 2.5159454345703125 + }, + { + "auxiliary_loss_clip": 0.01108311, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.01933455, + "balance_loss_mlp": 1.03791237, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 2.3442009274857387, + "language_loss": 0.88642716, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90783715, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10784, + "time_per_iteration": 2.4753408432006836 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03565788, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 12.15646159907571, + "language_loss": 0.73281801, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75415385, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 10785, + "time_per_iteration": 2.4413681030273438 + }, + { + "auxiliary_loss_clip": 0.01101717, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.01844823, + "balance_loss_mlp": 1.03633511, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.367601959382758, + "language_loss": 0.69167411, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71298921, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 10786, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01922917, + "balance_loss_mlp": 1.03379738, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.7579718485158407, + "language_loss": 0.71124583, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73256522, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10787, + "time_per_iteration": 2.5158114433288574 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.017735, + "balance_loss_mlp": 1.03644013, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.051362245275849, + "language_loss": 0.84114212, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86250919, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10788, + "time_per_iteration": 2.4534499645233154 + }, + { + "auxiliary_loss_clip": 0.01105049, + "auxiliary_loss_mlp": 0.01028079, + "balance_loss_clip": 1.01589584, + "balance_loss_mlp": 1.03690362, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.7919339269161743, + "language_loss": 0.76950663, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79083782, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10789, + "time_per_iteration": 2.483477830886841 + }, + { + "auxiliary_loss_clip": 0.01100294, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01661515, + "balance_loss_mlp": 1.03485107, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.6011584419095646, + "language_loss": 0.76170266, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78298742, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 10790, + "time_per_iteration": 2.589041233062744 + }, + { + "auxiliary_loss_clip": 0.01106166, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03979826, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.041315075509779, + "language_loss": 0.59891582, + "learning_rate": 1.160130384362823e-06, + "loss": 0.6202994, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 10791, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.03552938, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.6472225462276555, + "language_loss": 0.86154032, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88286591, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10792, + "time_per_iteration": 2.446188449859619 + }, + { + "auxiliary_loss_clip": 0.01108514, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02510905, + "balance_loss_mlp": 1.03797722, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.3897847361162396, + "language_loss": 0.78055567, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80201161, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 10793, + "time_per_iteration": 2.5302352905273438 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.01562476, + "balance_loss_mlp": 1.03950644, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.9288429134844602, + "language_loss": 0.75000489, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.77136773, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10794, + "time_per_iteration": 2.487550735473633 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.03439832, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.7036979096858527, + "language_loss": 0.70159793, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72292763, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10795, + "time_per_iteration": 2.5075082778930664 + }, + { + "auxiliary_loss_clip": 0.01107904, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.0199964, + "balance_loss_mlp": 1.03854239, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.7755045878876892, + "language_loss": 0.54152012, + "learning_rate": 1.158363494676679e-06, + "loss": 0.56292963, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 10796, + "time_per_iteration": 2.4778566360473633 + }, + { + "auxiliary_loss_clip": 0.01104118, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.01583314, + "balance_loss_mlp": 1.03535151, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 2.2433372918176917, + "language_loss": 0.77806747, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79938054, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 10797, + "time_per_iteration": 2.4779365062713623 + }, + { + "auxiliary_loss_clip": 0.01101065, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.0163343, + "balance_loss_mlp": 1.03683209, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.205335755673093, + "language_loss": 0.70565605, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72694176, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 10798, + "time_per_iteration": 2.4684252738952637 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01698995, + "balance_loss_mlp": 1.03464842, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.6813115922747512, + "language_loss": 0.76955473, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79084826, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10799, + "time_per_iteration": 2.5210940837860107 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.01997542, + "balance_loss_mlp": 1.03435063, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.8153395402518349, + "language_loss": 0.7160871, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.7374649, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 10800, + "time_per_iteration": 2.482504367828369 + }, + { + "auxiliary_loss_clip": 0.01026126, + "auxiliary_loss_mlp": 0.01002417, + "balance_loss_clip": 1.00117147, + "balance_loss_mlp": 1.00504756, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7657069555877785, + "language_loss": 0.60286164, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62314713, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.2109375, + "step": 10801, + "time_per_iteration": 3.226260185241699 + }, + { + "auxiliary_loss_clip": 0.01110608, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.02356291, + "balance_loss_mlp": 1.04023898, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.8073883235159445, + "language_loss": 0.78302824, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80449581, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10802, + "time_per_iteration": 3.919212579727173 + }, + { + "auxiliary_loss_clip": 0.01103206, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.03446245, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6305174461496863, + "language_loss": 0.74483562, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76622605, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10803, + "time_per_iteration": 2.4627156257629395 + }, + { + "auxiliary_loss_clip": 0.01104558, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.01895285, + "balance_loss_mlp": 1.03586698, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 2.1376614082682104, + "language_loss": 0.70056975, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72192574, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10804, + "time_per_iteration": 3.848759889602661 + }, + { + "auxiliary_loss_clip": 0.01103321, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.01596284, + "balance_loss_mlp": 1.03562534, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.6605919162215552, + "language_loss": 0.72852522, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74984628, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10805, + "time_per_iteration": 3.8869080543518066 + }, + { + "auxiliary_loss_clip": 0.01103949, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.0191083, + "balance_loss_mlp": 1.03534186, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.4377517316486816, + "language_loss": 0.66010499, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68145156, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 10806, + "time_per_iteration": 3.955326557159424 + }, + { + "auxiliary_loss_clip": 0.01104962, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01891875, + "balance_loss_mlp": 1.0347352, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.0043448276690743, + "language_loss": 0.79282916, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.81419313, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10807, + "time_per_iteration": 2.39217209815979 + }, + { + "auxiliary_loss_clip": 0.01025408, + "auxiliary_loss_mlp": 0.0099987, + "balance_loss_clip": 0.99852294, + "balance_loss_mlp": 1.0043627, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.8116161107359111, + "language_loss": 0.58930409, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60955691, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.2109375, + "step": 10808, + "time_per_iteration": 3.230355739593506 + }, + { + "auxiliary_loss_clip": 0.0110383, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.01633191, + "balance_loss_mlp": 1.03880036, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.7314499567585588, + "language_loss": 0.63442683, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65574473, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 10809, + "time_per_iteration": 2.5621047019958496 + }, + { + "auxiliary_loss_clip": 0.01103232, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01940036, + "balance_loss_mlp": 1.03731823, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.7915412750630062, + "language_loss": 0.81444794, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83578873, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 10810, + "time_per_iteration": 2.504213571548462 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.02178955, + "balance_loss_mlp": 1.03596044, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6107612285139954, + "language_loss": 0.71639317, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73773706, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 10811, + "time_per_iteration": 2.4460504055023193 + }, + { + "auxiliary_loss_clip": 0.01102886, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01604342, + "balance_loss_mlp": 1.03864026, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.696628622759694, + "language_loss": 0.78028226, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.80158031, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 10812, + "time_per_iteration": 2.4838054180145264 + }, + { + "auxiliary_loss_clip": 0.01103233, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.03522503, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.7227870996833219, + "language_loss": 0.85212648, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87352425, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10813, + "time_per_iteration": 2.4507973194122314 + }, + { + "auxiliary_loss_clip": 0.01103984, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.03622413, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.630969137195917, + "language_loss": 0.80210257, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82343483, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10814, + "time_per_iteration": 2.4843356609344482 + }, + { + "auxiliary_loss_clip": 0.0110736, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.03762007, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.5728804424803877, + "language_loss": 0.65147841, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67289424, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 10815, + "time_per_iteration": 2.6453187465667725 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02031219, + "balance_loss_mlp": 1.0368166, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.1717658748812925, + "language_loss": 0.75344497, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77486801, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 10816, + "time_per_iteration": 2.4386065006256104 + }, + { + "auxiliary_loss_clip": 0.01102422, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.0204587, + "balance_loss_mlp": 1.0362556, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.7229503928288044, + "language_loss": 0.7330451, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75439066, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10817, + "time_per_iteration": 2.4583981037139893 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.02363098, + "balance_loss_mlp": 1.03521729, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.4667825090725979, + "language_loss": 0.71944672, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74084473, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 10818, + "time_per_iteration": 2.862744092941284 + }, + { + "auxiliary_loss_clip": 0.01107713, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01682127, + "balance_loss_mlp": 1.03837013, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 1.8855888512315708, + "language_loss": 0.65002698, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67139268, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10819, + "time_per_iteration": 2.500066041946411 + }, + { + "auxiliary_loss_clip": 0.01105945, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.01954389, + "balance_loss_mlp": 1.03696644, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.9280601319833375, + "language_loss": 0.83383453, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85521388, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 10820, + "time_per_iteration": 2.5053653717041016 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01781201, + "balance_loss_mlp": 1.03561532, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.4467285300705166, + "language_loss": 0.78197402, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80333835, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10821, + "time_per_iteration": 2.431373357772827 + }, + { + "auxiliary_loss_clip": 0.01101047, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01875234, + "balance_loss_mlp": 1.03584325, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.457845041613161, + "language_loss": 0.80133367, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82263708, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65234375, + "step": 10822, + "time_per_iteration": 2.460176706314087 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01815748, + "balance_loss_mlp": 1.0353092, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.750587835143927, + "language_loss": 0.87001264, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89133477, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10823, + "time_per_iteration": 2.4293131828308105 + }, + { + "auxiliary_loss_clip": 0.01105612, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01709723, + "balance_loss_mlp": 1.03624296, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.6365898296789787, + "language_loss": 0.66641533, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68776393, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10824, + "time_per_iteration": 2.4835896492004395 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01628542, + "balance_loss_mlp": 1.03520238, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 4.8089783891514974, + "language_loss": 0.87194103, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89328843, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 10825, + "time_per_iteration": 2.4161195755004883 + }, + { + "auxiliary_loss_clip": 0.01106101, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.01528192, + "balance_loss_mlp": 1.03501797, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.113023109439822, + "language_loss": 0.72701895, + "learning_rate": 1.147778970474885e-06, + "loss": 0.74836403, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 10826, + "time_per_iteration": 2.4384891986846924 + }, + { + "auxiliary_loss_clip": 0.01103778, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01812768, + "balance_loss_mlp": 1.03663278, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.8815234967356322, + "language_loss": 0.69047898, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71180868, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10827, + "time_per_iteration": 2.4236016273498535 + }, + { + "auxiliary_loss_clip": 0.01103468, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.01466322, + "balance_loss_mlp": 1.03472352, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.912124303976498, + "language_loss": 0.76917899, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79047537, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10828, + "time_per_iteration": 2.501492500305176 + }, + { + "auxiliary_loss_clip": 0.01103546, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.01449549, + "balance_loss_mlp": 1.03721857, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.7405898865071652, + "language_loss": 0.89106113, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91235244, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 10829, + "time_per_iteration": 2.4867043495178223 + }, + { + "auxiliary_loss_clip": 0.01025679, + "auxiliary_loss_mlp": 0.01006089, + "balance_loss_clip": 1.004879, + "balance_loss_mlp": 1.00477648, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.638409366999194, + "language_loss": 0.5535605, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57387817, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20898438, + "step": 10830, + "time_per_iteration": 3.2332394123077393 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.01880741, + "balance_loss_mlp": 1.0351963, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.8294925765604486, + "language_loss": 0.74714524, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76852524, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10831, + "time_per_iteration": 2.4678196907043457 + }, + { + "auxiliary_loss_clip": 0.01026675, + "auxiliary_loss_mlp": 0.01000885, + "balance_loss_clip": 0.99961585, + "balance_loss_mlp": 1.0056181, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6414585196656494, + "language_loss": 0.51052123, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53079689, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2109375, + "step": 10832, + "time_per_iteration": 3.188751697540283 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.03423023, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.076228287586058, + "language_loss": 0.83391213, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85527885, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10833, + "time_per_iteration": 2.462529182434082 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.01900601, + "balance_loss_mlp": 1.03788352, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0595405323959817, + "language_loss": 0.83691829, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85829365, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10834, + "time_per_iteration": 2.4130232334136963 + }, + { + "auxiliary_loss_clip": 0.01105953, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02089036, + "balance_loss_mlp": 1.03688574, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.593058398275777, + "language_loss": 0.76863015, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79001933, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10835, + "time_per_iteration": 2.562690019607544 + }, + { + "auxiliary_loss_clip": 0.01107145, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02025032, + "balance_loss_mlp": 1.03849971, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.513196810995274, + "language_loss": 0.7734859, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79488003, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 10836, + "time_per_iteration": 2.4830451011657715 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01907551, + "balance_loss_mlp": 1.03746104, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 3.377184093609282, + "language_loss": 0.82293916, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84430802, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 10837, + "time_per_iteration": 2.449313163757324 + }, + { + "auxiliary_loss_clip": 0.01101636, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.01881361, + "balance_loss_mlp": 1.0366106, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.879635988028464, + "language_loss": 0.59214962, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61348593, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.65234375, + "step": 10838, + "time_per_iteration": 2.7190330028533936 + }, + { + "auxiliary_loss_clip": 0.01025807, + "auxiliary_loss_mlp": 0.01001457, + "balance_loss_clip": 1.00031853, + "balance_loss_mlp": 1.00479698, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7299756161535264, + "language_loss": 0.60843396, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62870657, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.2109375, + "step": 10839, + "time_per_iteration": 3.0971086025238037 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.01444113, + "balance_loss_mlp": 1.03583503, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.8264384192259977, + "language_loss": 0.68170393, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70297927, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10840, + "time_per_iteration": 2.5938761234283447 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.01783288, + "balance_loss_mlp": 1.03470433, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.0940212881125433, + "language_loss": 0.73375624, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75506657, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 10841, + "time_per_iteration": 2.5096652507781982 + }, + { + "auxiliary_loss_clip": 0.01104442, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.02016115, + "balance_loss_mlp": 1.03598571, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.444320911302732, + "language_loss": 0.6237874, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64515036, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10842, + "time_per_iteration": 2.4958693981170654 + }, + { + "auxiliary_loss_clip": 0.01026129, + "auxiliary_loss_mlp": 0.00998688, + "balance_loss_clip": 0.99753761, + "balance_loss_mlp": 1.00511324, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8080147467318853, + "language_loss": 0.56082183, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58107001, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2109375, + "step": 10843, + "time_per_iteration": 4.309800863265991 + }, + { + "auxiliary_loss_clip": 0.01107299, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02050614, + "balance_loss_mlp": 1.03615665, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.64501007109248, + "language_loss": 0.82562542, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84702992, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 10844, + "time_per_iteration": 2.4669365882873535 + }, + { + "auxiliary_loss_clip": 0.01103507, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.0143199, + "balance_loss_mlp": 1.03550506, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.063344534700721, + "language_loss": 0.60069621, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62200063, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 10845, + "time_per_iteration": 2.5032777786254883 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01482248, + "balance_loss_mlp": 1.03591549, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 2.814601439051778, + "language_loss": 0.79261941, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81391656, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 10846, + "time_per_iteration": 5.227022171020508 + }, + { + "auxiliary_loss_clip": 0.01025994, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00005949, + "balance_loss_mlp": 1.00516367, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7222516480670771, + "language_loss": 0.60183281, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.6221053, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20898438, + "step": 10847, + "time_per_iteration": 3.1712331771850586 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.0252527, + "balance_loss_mlp": 1.03767812, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.5760338552649935, + "language_loss": 0.81001323, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83146203, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 10848, + "time_per_iteration": 3.9554522037506104 + }, + { + "auxiliary_loss_clip": 0.01102504, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02334166, + "balance_loss_mlp": 1.03565013, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.095194559726116, + "language_loss": 0.75025082, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.77162468, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10849, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.01685405, + "balance_loss_mlp": 1.03644204, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.5413673094352514, + "language_loss": 0.68062961, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70193124, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 10850, + "time_per_iteration": 2.5665318965911865 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.01025486, + "balance_loss_clip": 1.01378012, + "balance_loss_mlp": 1.03723216, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 3.429236792588671, + "language_loss": 0.66494656, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68622386, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10851, + "time_per_iteration": 2.4702751636505127 + }, + { + "auxiliary_loss_clip": 0.01106489, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01643169, + "balance_loss_mlp": 1.03662848, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.15849365590988, + "language_loss": 0.74028027, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76162481, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 10852, + "time_per_iteration": 2.509229898452759 + }, + { + "auxiliary_loss_clip": 0.01108111, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.01606905, + "balance_loss_mlp": 1.03742135, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 7.224145946580318, + "language_loss": 0.66702747, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68839788, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 10853, + "time_per_iteration": 2.4382283687591553 + }, + { + "auxiliary_loss_clip": 0.01026122, + "auxiliary_loss_mlp": 0.00996827, + "balance_loss_clip": 0.99562865, + "balance_loss_mlp": 1.00521636, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7308751423910714, + "language_loss": 0.62970364, + "learning_rate": 1.137926314758634e-06, + "loss": 0.64993316, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20898438, + "step": 10854, + "time_per_iteration": 3.1691970825195312 + }, + { + "auxiliary_loss_clip": 0.01104802, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.01792085, + "balance_loss_mlp": 1.03625202, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.8459663187588897, + "language_loss": 0.77826589, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79962003, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 10855, + "time_per_iteration": 2.5133306980133057 + }, + { + "auxiliary_loss_clip": 0.01099784, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.01462901, + "balance_loss_mlp": 1.03466463, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.7863182329630984, + "language_loss": 0.79166549, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81292474, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 10856, + "time_per_iteration": 2.521003007888794 + }, + { + "auxiliary_loss_clip": 0.01103089, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01608038, + "balance_loss_mlp": 1.03572774, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.7280049220035325, + "language_loss": 0.73561788, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75694042, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 10857, + "time_per_iteration": 2.524388074874878 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01941204, + "balance_loss_mlp": 1.03619289, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.287513574647506, + "language_loss": 0.62553668, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.64687705, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 10858, + "time_per_iteration": 2.484292507171631 + }, + { + "auxiliary_loss_clip": 0.01100147, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.0192945, + "balance_loss_mlp": 1.03388333, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.70957243248878, + "language_loss": 0.78181291, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80311966, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10859, + "time_per_iteration": 2.4208006858825684 + }, + { + "auxiliary_loss_clip": 0.011057, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.01820755, + "balance_loss_mlp": 1.03562379, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.5618141301411743, + "language_loss": 0.67899007, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70035207, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10860, + "time_per_iteration": 2.516052722930908 + }, + { + "auxiliary_loss_clip": 0.01106777, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03654599, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 2.1862353937135732, + "language_loss": 0.66182673, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68316036, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.703125, + "step": 10861, + "time_per_iteration": 2.4207851886749268 + }, + { + "auxiliary_loss_clip": 0.01105314, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.01948178, + "balance_loss_mlp": 1.03619254, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.7782678366068123, + "language_loss": 0.6507051, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67208546, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 10862, + "time_per_iteration": 2.5804362297058105 + }, + { + "auxiliary_loss_clip": 0.0110242, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.01891708, + "balance_loss_mlp": 1.03588247, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.5241362686221158, + "language_loss": 0.77193171, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79326159, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 10863, + "time_per_iteration": 2.438044309616089 + }, + { + "auxiliary_loss_clip": 0.01103508, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.0157485, + "balance_loss_mlp": 1.03619623, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.8164803813000403, + "language_loss": 0.7466498, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76795435, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10864, + "time_per_iteration": 2.4771134853363037 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.03580821, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.7514895317957062, + "language_loss": 0.8600319, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88139296, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 10865, + "time_per_iteration": 2.5002095699310303 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.02430654, + "balance_loss_mlp": 1.03760266, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 3.5499069425062832, + "language_loss": 0.81403613, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83548248, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 10866, + "time_per_iteration": 2.546633005142212 + }, + { + "auxiliary_loss_clip": 0.01102409, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01524687, + "balance_loss_mlp": 1.03626192, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.4960309400225926, + "language_loss": 0.82321596, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.8445099, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 10867, + "time_per_iteration": 2.4704959392547607 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.01578307, + "balance_loss_mlp": 1.03344285, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 1.873401062188488, + "language_loss": 0.81152415, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.8328166, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10868, + "time_per_iteration": 2.462496519088745 + }, + { + "auxiliary_loss_clip": 0.01106253, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.0159198, + "balance_loss_mlp": 1.03690481, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.9591239016591335, + "language_loss": 0.79279351, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81413788, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10869, + "time_per_iteration": 2.4351487159729004 + }, + { + "auxiliary_loss_clip": 0.0110718, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.03880501, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 2.040320648065678, + "language_loss": 0.71729898, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.73871845, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 10870, + "time_per_iteration": 2.5223138332366943 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.02319491, + "balance_loss_mlp": 1.0377295, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.147023101303994, + "language_loss": 0.74992102, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.77132088, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 10871, + "time_per_iteration": 2.469367265701294 + }, + { + "auxiliary_loss_clip": 0.01099729, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01870823, + "balance_loss_mlp": 1.03503919, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.7849990892484822, + "language_loss": 0.55615103, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.5774473, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 10872, + "time_per_iteration": 2.4673538208007812 + }, + { + "auxiliary_loss_clip": 0.0110205, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.0230695, + "balance_loss_mlp": 1.03632164, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.8219619398900448, + "language_loss": 0.75073338, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77209741, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10873, + "time_per_iteration": 2.4655163288116455 + }, + { + "auxiliary_loss_clip": 0.01104694, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.0166738, + "balance_loss_mlp": 1.03662491, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.5897958047644043, + "language_loss": 0.75623226, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77756387, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10874, + "time_per_iteration": 2.5224883556365967 + }, + { + "auxiliary_loss_clip": 0.01103714, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.0198009, + "balance_loss_mlp": 1.03682685, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 2.7439070637520064, + "language_loss": 0.81423092, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83558643, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 10875, + "time_per_iteration": 2.4869980812072754 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.02635086, + "balance_loss_mlp": 1.03504491, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.6810546720157804, + "language_loss": 0.70045686, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72186041, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10876, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.02289736, + "balance_loss_mlp": 1.03575683, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 1.8217122109555075, + "language_loss": 0.7932229, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81460166, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10877, + "time_per_iteration": 2.402308940887451 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01635098, + "balance_loss_mlp": 1.0359726, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 2.189241924086369, + "language_loss": 0.7987535, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82006603, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10878, + "time_per_iteration": 2.4780471324920654 + }, + { + "auxiliary_loss_clip": 0.01100458, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.0154438, + "balance_loss_mlp": 1.03370023, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 1.8666542226247762, + "language_loss": 0.84453034, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86581039, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 10879, + "time_per_iteration": 2.4143741130828857 + }, + { + "auxiliary_loss_clip": 0.01104945, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01845217, + "balance_loss_mlp": 1.03493488, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.8543762869506004, + "language_loss": 0.71946406, + "learning_rate": 1.128800362199601e-06, + "loss": 0.74081963, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 10880, + "time_per_iteration": 2.430192708969116 + }, + { + "auxiliary_loss_clip": 0.01100358, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01899612, + "balance_loss_mlp": 1.03472471, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.7514865003733433, + "language_loss": 0.84385759, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86516607, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 10881, + "time_per_iteration": 2.4801900386810303 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.01909828, + "balance_loss_mlp": 1.03612447, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.8305344772437837, + "language_loss": 0.77706677, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79844439, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 10882, + "time_per_iteration": 2.4523637294769287 + }, + { + "auxiliary_loss_clip": 0.01106717, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.01632619, + "balance_loss_mlp": 1.03733766, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.6779149142362604, + "language_loss": 0.82394373, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84530222, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10883, + "time_per_iteration": 2.4265058040618896 + }, + { + "auxiliary_loss_clip": 0.01108268, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.02163935, + "balance_loss_mlp": 1.0390712, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.382020741579914, + "language_loss": 0.85506725, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87649274, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10884, + "time_per_iteration": 2.4697301387786865 + }, + { + "auxiliary_loss_clip": 0.0110574, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.02454472, + "balance_loss_mlp": 1.03698301, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 10.527351582586146, + "language_loss": 0.80486369, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82628304, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 10885, + "time_per_iteration": 3.9415979385375977 + }, + { + "auxiliary_loss_clip": 0.01103256, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.01527846, + "balance_loss_mlp": 1.03751159, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.8132591830137343, + "language_loss": 0.72155404, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74285644, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10886, + "time_per_iteration": 2.474519729614258 + }, + { + "auxiliary_loss_clip": 0.01101162, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.01850116, + "balance_loss_mlp": 1.03500915, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.738538225206424, + "language_loss": 0.78089505, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80220145, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 10887, + "time_per_iteration": 2.4567511081695557 + }, + { + "auxiliary_loss_clip": 0.01102786, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.01913893, + "balance_loss_mlp": 1.03559566, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 4.496679975000023, + "language_loss": 0.78967035, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81100464, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10888, + "time_per_iteration": 5.19985818862915 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.03421295, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5919708412571818, + "language_loss": 0.66247272, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68375087, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 10889, + "time_per_iteration": 2.5679409503936768 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.01675916, + "balance_loss_mlp": 1.03473425, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4966214179852624, + "language_loss": 0.79874986, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82006663, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10890, + "time_per_iteration": 3.9007346630096436 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01683879, + "balance_loss_mlp": 1.03386474, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.4806412573813494, + "language_loss": 0.65136874, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67268395, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 10891, + "time_per_iteration": 2.4762353897094727 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.01963139, + "balance_loss_mlp": 1.03584743, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.713176232540202, + "language_loss": 0.79329646, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81463599, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.67578125, + "step": 10892, + "time_per_iteration": 2.483430862426758 + }, + { + "auxiliary_loss_clip": 0.01108627, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.02300262, + "balance_loss_mlp": 1.03826213, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.927118370280093, + "language_loss": 0.77688205, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.79831409, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 10893, + "time_per_iteration": 2.468653440475464 + }, + { + "auxiliary_loss_clip": 0.01107027, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.01425672, + "balance_loss_mlp": 1.03698456, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6133414191995223, + "language_loss": 0.7036956, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72503132, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10894, + "time_per_iteration": 2.454615592956543 + }, + { + "auxiliary_loss_clip": 0.01106124, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.02128386, + "balance_loss_mlp": 1.03594112, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 4.213583210390945, + "language_loss": 0.63007772, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65147251, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 10895, + "time_per_iteration": 2.4314959049224854 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01881742, + "balance_loss_mlp": 1.03532076, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 3.6304048273042717, + "language_loss": 0.7897135, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81104541, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 10896, + "time_per_iteration": 2.4550769329071045 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03664804, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.4344785444999102, + "language_loss": 0.70384824, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72518563, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 10897, + "time_per_iteration": 2.478304624557495 + }, + { + "auxiliary_loss_clip": 0.01103619, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.0175761, + "balance_loss_mlp": 1.03478158, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.7387642279992266, + "language_loss": 0.75401318, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77533734, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10898, + "time_per_iteration": 2.4487948417663574 + }, + { + "auxiliary_loss_clip": 0.01105468, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02299678, + "balance_loss_mlp": 1.03773856, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.1990983943767555, + "language_loss": 0.73518318, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75658637, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10899, + "time_per_iteration": 2.4304370880126953 + }, + { + "auxiliary_loss_clip": 0.0110359, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.01528871, + "balance_loss_mlp": 1.0364536, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.1275272720256293, + "language_loss": 0.55958188, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58088267, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10900, + "time_per_iteration": 2.484473943710327 + }, + { + "auxiliary_loss_clip": 0.01104316, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.01939833, + "balance_loss_mlp": 1.03653932, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.8846923286778847, + "language_loss": 0.76933706, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79070109, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 10901, + "time_per_iteration": 2.4382822513580322 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01434255, + "balance_loss_mlp": 1.03516734, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.753856944987035, + "language_loss": 0.73216426, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75344282, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 10902, + "time_per_iteration": 2.49745774269104 + }, + { + "auxiliary_loss_clip": 0.01102831, + "auxiliary_loss_mlp": 0.01026395, + "balance_loss_clip": 1.01536822, + "balance_loss_mlp": 1.03652823, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.6638199342391367, + "language_loss": 0.67729247, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69858468, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10903, + "time_per_iteration": 2.457672595977783 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.02201295, + "balance_loss_mlp": 1.03555727, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.8258125512154932, + "language_loss": 0.66961503, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.6910224, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10904, + "time_per_iteration": 2.544079065322876 + }, + { + "auxiliary_loss_clip": 0.01103937, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.02261209, + "balance_loss_mlp": 1.03435302, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.9965123681804708, + "language_loss": 0.90475762, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92615068, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10905, + "time_per_iteration": 2.4607133865356445 + }, + { + "auxiliary_loss_clip": 0.01100631, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01922941, + "balance_loss_mlp": 1.03564942, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 2.060465882995779, + "language_loss": 0.75227022, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77358085, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10906, + "time_per_iteration": 2.489344358444214 + }, + { + "auxiliary_loss_clip": 0.01108555, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02571845, + "balance_loss_mlp": 1.03796065, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.673517900647209, + "language_loss": 0.74337453, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76483834, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 10907, + "time_per_iteration": 2.4216673374176025 + }, + { + "auxiliary_loss_clip": 0.01102218, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.01732743, + "balance_loss_mlp": 1.03423953, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4101718899089066, + "language_loss": 0.72367519, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74500179, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 10908, + "time_per_iteration": 2.531003952026367 + }, + { + "auxiliary_loss_clip": 0.01105598, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.01962876, + "balance_loss_mlp": 1.03744864, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.1513013799426868, + "language_loss": 0.81017995, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83156013, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10909, + "time_per_iteration": 2.4130208492279053 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.02167928, + "balance_loss_mlp": 1.0363667, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.0430689174515098, + "language_loss": 0.63840532, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.65980697, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 10910, + "time_per_iteration": 2.4513769149780273 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.0173471, + "balance_loss_mlp": 1.03761017, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 3.983049569871041, + "language_loss": 0.76120275, + "learning_rate": 1.117948625548313e-06, + "loss": 0.78261906, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 10911, + "time_per_iteration": 2.421567440032959 + }, + { + "auxiliary_loss_clip": 0.01098552, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.01696563, + "balance_loss_mlp": 1.03389096, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 2.6100669832011048, + "language_loss": 0.75670731, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77797198, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 10912, + "time_per_iteration": 2.4657318592071533 + }, + { + "auxiliary_loss_clip": 0.01111745, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02153468, + "balance_loss_mlp": 1.03876007, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.5787420401710588, + "language_loss": 0.77322382, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79468495, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 10913, + "time_per_iteration": 2.4153146743774414 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.01626611, + "balance_loss_mlp": 1.03441024, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.9167212276506074, + "language_loss": 0.70828009, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.72954357, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65234375, + "step": 10914, + "time_per_iteration": 2.4597549438476562 + }, + { + "auxiliary_loss_clip": 0.01103262, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.01875639, + "balance_loss_mlp": 1.03628445, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.851270541448462, + "language_loss": 0.73936331, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76070333, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 10915, + "time_per_iteration": 2.4307053089141846 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.02053595, + "balance_loss_mlp": 1.034675, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.6584707758046542, + "language_loss": 0.79572797, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.8170594, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 10916, + "time_per_iteration": 2.4956743717193604 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.02357495, + "balance_loss_mlp": 1.03500533, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 1.9383516308380546, + "language_loss": 0.76153994, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78291869, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10917, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.01101411, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.01656687, + "balance_loss_mlp": 1.03484607, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.8282774447422543, + "language_loss": 0.69401765, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71531153, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10918, + "time_per_iteration": 2.500551462173462 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.03552103, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.7922194863374643, + "language_loss": 0.76487136, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 10919, + "time_per_iteration": 2.4698684215545654 + }, + { + "auxiliary_loss_clip": 0.01026665, + "auxiliary_loss_mlp": 0.01002269, + "balance_loss_clip": 1.00099361, + "balance_loss_mlp": 1.00584173, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7207598722602275, + "language_loss": 0.5307852, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55107456, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.20898438, + "step": 10920, + "time_per_iteration": 3.0821664333343506 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01682281, + "balance_loss_mlp": 1.03579378, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.482616976222016, + "language_loss": 0.65204817, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.6733548, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10921, + "time_per_iteration": 2.515620231628418 + }, + { + "auxiliary_loss_clip": 0.01100913, + "auxiliary_loss_mlp": 0.0103275, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03486526, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.8313420178351358, + "language_loss": 0.81071579, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83205247, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66015625, + "step": 10922, + "time_per_iteration": 2.454880714416504 + }, + { + "auxiliary_loss_clip": 0.01105049, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.02074158, + "balance_loss_mlp": 1.03742886, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.376840972990548, + "language_loss": 0.71632755, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73770583, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10923, + "time_per_iteration": 2.5216050148010254 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.01902556, + "balance_loss_mlp": 1.03784943, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.9379255151150183, + "language_loss": 0.80668283, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82805216, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10924, + "time_per_iteration": 2.420976400375366 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01838779, + "balance_loss_mlp": 1.03694773, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.6223500631493692, + "language_loss": 0.72360754, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74493784, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 10925, + "time_per_iteration": 2.48442006111145 + }, + { + "auxiliary_loss_clip": 0.011034, + "auxiliary_loss_mlp": 0.01026622, + "balance_loss_clip": 1.01515365, + "balance_loss_mlp": 1.03553128, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.3888033375770266, + "language_loss": 0.72365135, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74495161, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 10926, + "time_per_iteration": 2.426408052444458 + }, + { + "auxiliary_loss_clip": 0.01104746, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.01850533, + "balance_loss_mlp": 1.03482258, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.681586343154767, + "language_loss": 0.72273743, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74409401, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10927, + "time_per_iteration": 3.80648136138916 + }, + { + "auxiliary_loss_clip": 0.01026322, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.00027585, + "balance_loss_mlp": 1.00539577, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7330380682962492, + "language_loss": 0.64455849, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66483754, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.20898438, + "step": 10928, + "time_per_iteration": 3.092785120010376 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01669884, + "balance_loss_mlp": 1.03520453, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.7549487521997071, + "language_loss": 0.77955842, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80085671, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 10929, + "time_per_iteration": 4.023591041564941 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.0353775, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.6371374390238511, + "language_loss": 0.65487254, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67621183, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 10930, + "time_per_iteration": 3.8790106773376465 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.01679564, + "balance_loss_mlp": 1.03432441, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.5199914554797245, + "language_loss": 0.70439506, + "learning_rate": 1.110964538515258e-06, + "loss": 0.72570413, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10931, + "time_per_iteration": 3.8428475856781006 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02384758, + "balance_loss_mlp": 1.03632128, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.0176400266990147, + "language_loss": 0.68914682, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.71055984, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 10932, + "time_per_iteration": 2.427386999130249 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.01668537, + "balance_loss_mlp": 1.03434443, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 2.41406977097007, + "language_loss": 0.80051857, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82180607, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10933, + "time_per_iteration": 2.5989818572998047 + }, + { + "auxiliary_loss_clip": 0.01106278, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.01818609, + "balance_loss_mlp": 1.03827631, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.7962352646576603, + "language_loss": 0.73653376, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75789738, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10934, + "time_per_iteration": 2.513033390045166 + }, + { + "auxiliary_loss_clip": 0.01101364, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.01991725, + "balance_loss_mlp": 1.03564167, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.5095272560756583, + "language_loss": 0.7590912, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78041971, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 10935, + "time_per_iteration": 2.7678496837615967 + }, + { + "auxiliary_loss_clip": 0.0110481, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02380824, + "balance_loss_mlp": 1.03610992, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.8441545252151383, + "language_loss": 0.78123999, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.8026585, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 10936, + "time_per_iteration": 2.5077192783355713 + }, + { + "auxiliary_loss_clip": 0.01099758, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.01789331, + "balance_loss_mlp": 1.03462768, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 2.0488788051519777, + "language_loss": 0.68872631, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71001554, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10937, + "time_per_iteration": 2.5001776218414307 + }, + { + "auxiliary_loss_clip": 0.01102833, + "auxiliary_loss_mlp": 0.01027511, + "balance_loss_clip": 1.01570368, + "balance_loss_mlp": 1.03619266, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.29220645619057, + "language_loss": 0.68323117, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70453459, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10938, + "time_per_iteration": 2.4366493225097656 + }, + { + "auxiliary_loss_clip": 0.01105738, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01890206, + "balance_loss_mlp": 1.03749824, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 2.075102589417424, + "language_loss": 0.71458369, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73595071, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 10939, + "time_per_iteration": 2.4688596725463867 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.01679969, + "balance_loss_mlp": 1.03710163, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.217107584857945, + "language_loss": 0.77532256, + "learning_rate": 1.107826092473037e-06, + "loss": 0.7966767, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10940, + "time_per_iteration": 2.425093412399292 + }, + { + "auxiliary_loss_clip": 0.0110778, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01946735, + "balance_loss_mlp": 1.03589988, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 2.046264853980575, + "language_loss": 0.68482137, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70621532, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 10941, + "time_per_iteration": 2.5489418506622314 + }, + { + "auxiliary_loss_clip": 0.01100409, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01428187, + "balance_loss_mlp": 1.03322697, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.8711951914026155, + "language_loss": 0.68390548, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70516968, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10942, + "time_per_iteration": 2.486746072769165 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02216208, + "balance_loss_mlp": 1.03639185, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.0678514729005544, + "language_loss": 0.71317995, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73462105, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 10943, + "time_per_iteration": 2.4520316123962402 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.01820219, + "balance_loss_mlp": 1.03616333, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.7679689812851298, + "language_loss": 0.59513438, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61645675, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10944, + "time_per_iteration": 2.5190436840057373 + }, + { + "auxiliary_loss_clip": 0.01108265, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.02253008, + "balance_loss_mlp": 1.03664446, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.558618410146096, + "language_loss": 0.72308242, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74451864, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 10945, + "time_per_iteration": 2.463829755783081 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.01497078, + "balance_loss_mlp": 1.03548717, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.6917792730430523, + "language_loss": 0.70766807, + "learning_rate": 1.105735316926046e-06, + "loss": 0.7289511, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 10946, + "time_per_iteration": 2.6370081901550293 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.0187701, + "balance_loss_mlp": 1.03649974, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.9998217553522297, + "language_loss": 0.81970888, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84106112, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10947, + "time_per_iteration": 2.44291090965271 + }, + { + "auxiliary_loss_clip": 0.01105119, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03552985, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5893547671126769, + "language_loss": 0.77298671, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79432225, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 10948, + "time_per_iteration": 2.52156400680542 + }, + { + "auxiliary_loss_clip": 0.01103491, + "auxiliary_loss_mlp": 0.01024697, + "balance_loss_clip": 1.01318693, + "balance_loss_mlp": 1.03675056, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.5781773720774923, + "language_loss": 0.79309839, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81438029, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 10949, + "time_per_iteration": 2.4466731548309326 + }, + { + "auxiliary_loss_clip": 0.01025722, + "auxiliary_loss_mlp": 0.00999404, + "balance_loss_clip": 0.99824774, + "balance_loss_mlp": 1.0049113, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7326202101084998, + "language_loss": 0.61823738, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63848865, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.20800781, + "step": 10950, + "time_per_iteration": 3.121711015701294 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.0340389, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.039519263453104, + "language_loss": 0.67086935, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69214934, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 10951, + "time_per_iteration": 2.4204366207122803 + }, + { + "auxiliary_loss_clip": 0.01103981, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01967382, + "balance_loss_mlp": 1.03702927, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.3948057696634335, + "language_loss": 0.76445824, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.7858094, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10952, + "time_per_iteration": 2.5405352115631104 + }, + { + "auxiliary_loss_clip": 0.01101736, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.0193491, + "balance_loss_mlp": 1.03628421, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.8480440869895376, + "language_loss": 0.73304069, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75436854, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10953, + "time_per_iteration": 2.4275546073913574 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.02360368, + "balance_loss_mlp": 1.0364027, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.01659222308535, + "language_loss": 0.78839052, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80978262, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 10954, + "time_per_iteration": 2.515486478805542 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.01928055, + "balance_loss_mlp": 1.0370729, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 4.542628698192554, + "language_loss": 0.69261253, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71397316, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10955, + "time_per_iteration": 2.4162137508392334 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.02065945, + "balance_loss_mlp": 1.03447628, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 1.9435823457200367, + "language_loss": 0.8063699, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82767057, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 10956, + "time_per_iteration": 2.501207113265991 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.02636909, + "balance_loss_mlp": 1.03677917, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.2587354412030365, + "language_loss": 0.8126533, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83407611, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 10957, + "time_per_iteration": 2.4624950885772705 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01656473, + "balance_loss_mlp": 1.03620899, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.8981628531368988, + "language_loss": 0.76096463, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78225374, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 10958, + "time_per_iteration": 2.6494197845458984 + }, + { + "auxiliary_loss_clip": 0.01101191, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01818335, + "balance_loss_mlp": 1.03651094, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.5449360693578584, + "language_loss": 0.7480197, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.76932859, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 10959, + "time_per_iteration": 2.427396535873413 + }, + { + "auxiliary_loss_clip": 0.01102895, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.03627992, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.5048251142631304, + "language_loss": 0.64632499, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66762793, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10960, + "time_per_iteration": 2.4602410793304443 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01920843, + "balance_loss_mlp": 1.03767896, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.0928832268916064, + "language_loss": 0.81810492, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83950472, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 10961, + "time_per_iteration": 2.409662961959839 + }, + { + "auxiliary_loss_clip": 0.01105671, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.01721096, + "balance_loss_mlp": 1.03837204, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.6316286919602636, + "language_loss": 0.73185778, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.7532025, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10962, + "time_per_iteration": 2.5012168884277344 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.016325, + "balance_loss_mlp": 1.03553998, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 2.292666509682468, + "language_loss": 0.7991221, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.8204354, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 10963, + "time_per_iteration": 2.4411072731018066 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.01545739, + "balance_loss_mlp": 1.03585351, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.6740266575713383, + "language_loss": 0.78245199, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.8037318, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10964, + "time_per_iteration": 2.5599732398986816 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.02130592, + "balance_loss_mlp": 1.03435874, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.7118472944354244, + "language_loss": 0.74207413, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76342809, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6875, + "step": 10965, + "time_per_iteration": 2.471712112426758 + }, + { + "auxiliary_loss_clip": 0.01107005, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.01877689, + "balance_loss_mlp": 1.03634071, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 1.7378396373661993, + "language_loss": 0.73264408, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75402158, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 10966, + "time_per_iteration": 2.436239004135132 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.01798916, + "balance_loss_mlp": 1.03512931, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.8788551125386406, + "language_loss": 0.77065092, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79197645, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10967, + "time_per_iteration": 2.4717586040496826 + }, + { + "auxiliary_loss_clip": 0.01027072, + "auxiliary_loss_mlp": 0.01001789, + "balance_loss_clip": 1.00063896, + "balance_loss_mlp": 1.00624704, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6916872612313274, + "language_loss": 0.48437804, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50466669, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20898438, + "step": 10968, + "time_per_iteration": 4.5336384773254395 + }, + { + "auxiliary_loss_clip": 0.01103459, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.01853621, + "balance_loss_mlp": 1.03579104, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.9909395686766433, + "language_loss": 0.79144681, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10969, + "time_per_iteration": 2.4394266605377197 + }, + { + "auxiliary_loss_clip": 0.01101468, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.0165081, + "balance_loss_mlp": 1.03489542, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 1.9980021115439661, + "language_loss": 0.65425408, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.6755445, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 10970, + "time_per_iteration": 2.421241521835327 + }, + { + "auxiliary_loss_clip": 0.01103326, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0361867, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.9709453771316594, + "language_loss": 0.76396167, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78525639, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10971, + "time_per_iteration": 5.2941343784332275 + }, + { + "auxiliary_loss_clip": 0.01105265, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.02074528, + "balance_loss_mlp": 1.03658032, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.436761152631742, + "language_loss": 0.70031983, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72169238, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10972, + "time_per_iteration": 2.434751033782959 + }, + { + "auxiliary_loss_clip": 0.01104063, + "auxiliary_loss_mlp": 0.01026316, + "balance_loss_clip": 1.01401901, + "balance_loss_mlp": 1.03706002, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 2.0552877724786347, + "language_loss": 0.55426097, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.5755648, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 10973, + "time_per_iteration": 3.9870107173919678 + }, + { + "auxiliary_loss_clip": 0.01108369, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0224663, + "balance_loss_mlp": 1.0379895, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 1.9173473771897223, + "language_loss": 0.78754056, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.80896568, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.703125, + "step": 10974, + "time_per_iteration": 2.413245916366577 + }, + { + "auxiliary_loss_clip": 0.01104385, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01999879, + "balance_loss_mlp": 1.03666687, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.1994599169674016, + "language_loss": 0.69061923, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71197647, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10975, + "time_per_iteration": 2.488288164138794 + }, + { + "auxiliary_loss_clip": 0.01101915, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.01519537, + "balance_loss_mlp": 1.03476441, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.699075414788055, + "language_loss": 0.7082206, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72950304, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 10976, + "time_per_iteration": 2.4436802864074707 + }, + { + "auxiliary_loss_clip": 0.01099428, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01890898, + "balance_loss_mlp": 1.03462744, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.7471383506629494, + "language_loss": 0.6767379, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69804019, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 10977, + "time_per_iteration": 2.4598448276519775 + }, + { + "auxiliary_loss_clip": 0.01108053, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.01457834, + "balance_loss_mlp": 1.03748345, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.0162776681697476, + "language_loss": 0.81473112, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83608478, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 10978, + "time_per_iteration": 2.4228336811065674 + }, + { + "auxiliary_loss_clip": 0.01107046, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.02570164, + "balance_loss_mlp": 1.03726959, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 3.1339976235635527, + "language_loss": 0.6725859, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69403446, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10979, + "time_per_iteration": 2.450756549835205 + }, + { + "auxiliary_loss_clip": 0.01102975, + "auxiliary_loss_mlp": 0.01023928, + "balance_loss_clip": 1.01215005, + "balance_loss_mlp": 1.0349319, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.827162971921963, + "language_loss": 0.72720212, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.74847114, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10980, + "time_per_iteration": 2.406029462814331 + }, + { + "auxiliary_loss_clip": 0.01098591, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.02074146, + "balance_loss_mlp": 1.03450036, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.5521957632844796, + "language_loss": 0.72807193, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74937057, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 10981, + "time_per_iteration": 2.5201127529144287 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.02119243, + "balance_loss_mlp": 1.03583837, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 1.966625481577904, + "language_loss": 0.69085824, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71223581, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10982, + "time_per_iteration": 2.5098371505737305 + }, + { + "auxiliary_loss_clip": 0.01101832, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.01473927, + "balance_loss_mlp": 1.03688765, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.50117340695368, + "language_loss": 0.69566637, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71694636, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10983, + "time_per_iteration": 2.4642090797424316 + }, + { + "auxiliary_loss_clip": 0.01103785, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.01571345, + "balance_loss_mlp": 1.03580856, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.6650782937776725, + "language_loss": 0.70871687, + "learning_rate": 1.092522205413239e-06, + "loss": 0.73002636, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 10984, + "time_per_iteration": 2.545948028564453 + }, + { + "auxiliary_loss_clip": 0.01099312, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02120149, + "balance_loss_mlp": 1.03464043, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.583849922965693, + "language_loss": 0.83839536, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85971612, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 10985, + "time_per_iteration": 2.5026867389678955 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.02026308, + "balance_loss_mlp": 1.03746915, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.805092368411813, + "language_loss": 0.73806614, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.75944197, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10986, + "time_per_iteration": 2.4697890281677246 + }, + { + "auxiliary_loss_clip": 0.01100417, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01450825, + "balance_loss_mlp": 1.03609347, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.6327019217005077, + "language_loss": 0.78796637, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.80923104, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 10987, + "time_per_iteration": 2.417971611022949 + }, + { + "auxiliary_loss_clip": 0.01026194, + "auxiliary_loss_mlp": 0.01004542, + "balance_loss_clip": 1.0033257, + "balance_loss_mlp": 1.00560772, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8165641821952351, + "language_loss": 0.54130733, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56161469, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20605469, + "step": 10988, + "time_per_iteration": 3.158214807510376 + }, + { + "auxiliary_loss_clip": 0.01103971, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.01902199, + "balance_loss_mlp": 1.03813577, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.5008723881290433, + "language_loss": 0.77463698, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79597014, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 10989, + "time_per_iteration": 2.531778573989868 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.02086616, + "balance_loss_mlp": 1.0393579, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.9100821463598359, + "language_loss": 0.77224958, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7936244, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 10990, + "time_per_iteration": 2.393866539001465 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.0154345, + "balance_loss_mlp": 1.03490543, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 1.959228938394804, + "language_loss": 0.60573477, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62704802, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 10991, + "time_per_iteration": 2.421860933303833 + }, + { + "auxiliary_loss_clip": 0.01106108, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.01960719, + "balance_loss_mlp": 1.03634095, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.508745269820261, + "language_loss": 0.68313217, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70451266, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10992, + "time_per_iteration": 2.438251495361328 + }, + { + "auxiliary_loss_clip": 0.01105003, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.0152688, + "balance_loss_mlp": 1.03565395, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 2.0506508317322036, + "language_loss": 0.87773001, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89905262, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 10993, + "time_per_iteration": 2.4813613891601562 + }, + { + "auxiliary_loss_clip": 0.01109842, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01642656, + "balance_loss_mlp": 1.03765821, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.679596565938276, + "language_loss": 0.66940713, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69080102, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 10994, + "time_per_iteration": 2.464946985244751 + }, + { + "auxiliary_loss_clip": 0.0110627, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.01818299, + "balance_loss_mlp": 1.03735578, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.7600806197216516, + "language_loss": 0.76505876, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78642476, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 10995, + "time_per_iteration": 2.443978786468506 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.01477861, + "balance_loss_mlp": 1.03649068, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.907480349708707, + "language_loss": 0.74543679, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76673216, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 10996, + "time_per_iteration": 2.437030076980591 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.01963055, + "balance_loss_mlp": 1.03681195, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.9556097783969382, + "language_loss": 0.68673009, + "learning_rate": 1.088013301487126e-06, + "loss": 0.70807999, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10997, + "time_per_iteration": 2.4747731685638428 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.01583838, + "balance_loss_mlp": 1.03762627, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 1.9530622490500587, + "language_loss": 0.68974924, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.71109343, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.69921875, + "step": 10998, + "time_per_iteration": 2.407527208328247 + }, + { + "auxiliary_loss_clip": 0.01026246, + "auxiliary_loss_mlp": 0.0100257, + "balance_loss_clip": 1.00145519, + "balance_loss_mlp": 1.00553703, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6545620134591473, + "language_loss": 0.5117774, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53206557, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.20703125, + "step": 10999, + "time_per_iteration": 3.0084383487701416 + }, + { + "auxiliary_loss_clip": 0.01105663, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.01696038, + "balance_loss_mlp": 1.03627193, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.367912839089916, + "language_loss": 0.71249658, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73383313, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 11000, + "time_per_iteration": 2.426126480102539 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.02214289, + "balance_loss_mlp": 1.03528404, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.5935854519622277, + "language_loss": 0.65334332, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67467409, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11001, + "time_per_iteration": 2.586193323135376 + }, + { + "auxiliary_loss_clip": 0.01103282, + "auxiliary_loss_mlp": 0.01027047, + "balance_loss_clip": 1.01568055, + "balance_loss_mlp": 1.03593278, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.922146655127119, + "language_loss": 0.73242342, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75372672, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 11002, + "time_per_iteration": 2.4588327407836914 + }, + { + "auxiliary_loss_clip": 0.01101069, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.02064466, + "balance_loss_mlp": 1.03539062, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 2.0738499312562215, + "language_loss": 0.78606766, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.80740356, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11003, + "time_per_iteration": 2.470768928527832 + }, + { + "auxiliary_loss_clip": 0.01105808, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.02114952, + "balance_loss_mlp": 1.03701353, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.8055156139018678, + "language_loss": 0.68872547, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71012425, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11004, + "time_per_iteration": 2.4174275398254395 + }, + { + "auxiliary_loss_clip": 0.01105956, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02177572, + "balance_loss_mlp": 1.0356009, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.2557237333346687, + "language_loss": 0.69553763, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71694571, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 11005, + "time_per_iteration": 2.482177495956421 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01488411, + "balance_loss_mlp": 1.03609776, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.5704694842406037, + "language_loss": 0.78232396, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80360937, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11006, + "time_per_iteration": 2.4723048210144043 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.03596103, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5007948972384493, + "language_loss": 0.75993907, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78127748, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 11007, + "time_per_iteration": 2.4790470600128174 + }, + { + "auxiliary_loss_clip": 0.01105175, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01685643, + "balance_loss_mlp": 1.0384593, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.9253644062666073, + "language_loss": 0.78290129, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80423415, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11008, + "time_per_iteration": 2.4340806007385254 + }, + { + "auxiliary_loss_clip": 0.0110631, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.01736474, + "balance_loss_mlp": 1.03573239, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.8127446377472742, + "language_loss": 0.81780791, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83917022, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 11009, + "time_per_iteration": 2.4623091220855713 + }, + { + "auxiliary_loss_clip": 0.01026257, + "auxiliary_loss_mlp": 0.01005514, + "balance_loss_clip": 1.0043757, + "balance_loss_mlp": 1.00541437, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9788733414804485, + "language_loss": 0.67425871, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69457638, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20898438, + "step": 11010, + "time_per_iteration": 4.397435188293457 + }, + { + "auxiliary_loss_clip": 0.01104702, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01645815, + "balance_loss_mlp": 1.03598547, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.7882832526705355, + "language_loss": 0.71199936, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73333406, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11011, + "time_per_iteration": 2.4273481369018555 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.02204967, + "balance_loss_mlp": 1.03780639, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.7492667107704147, + "language_loss": 0.72528613, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74665654, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11012, + "time_per_iteration": 2.467482566833496 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.02084911, + "balance_loss_mlp": 1.03615665, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.7384195449369746, + "language_loss": 0.795021, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8163144, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 11013, + "time_per_iteration": 3.923494577407837 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01767898, + "balance_loss_mlp": 1.03644931, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 1.886371512022625, + "language_loss": 0.7088536, + "learning_rate": 1.082125865538971e-06, + "loss": 0.73017514, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11014, + "time_per_iteration": 2.439049482345581 + }, + { + "auxiliary_loss_clip": 0.01100918, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.02039468, + "balance_loss_mlp": 1.03656077, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 2.1131368988088504, + "language_loss": 0.76709092, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78840733, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 11015, + "time_per_iteration": 3.900524616241455 + }, + { + "auxiliary_loss_clip": 0.01101265, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.01506257, + "balance_loss_mlp": 1.03580058, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.7610046970273479, + "language_loss": 0.82307482, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.8443557, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 11016, + "time_per_iteration": 2.4373061656951904 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.01933837, + "balance_loss_mlp": 1.03373432, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 1.888497767792011, + "language_loss": 0.6969018, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71824282, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 11017, + "time_per_iteration": 2.4477572441101074 + }, + { + "auxiliary_loss_clip": 0.0110184, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02280068, + "balance_loss_mlp": 1.03520179, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.7526472003474178, + "language_loss": 0.77214134, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79350269, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11018, + "time_per_iteration": 2.6970436573028564 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.0244143, + "balance_loss_mlp": 1.03411186, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 1.9966965859861308, + "language_loss": 0.83007133, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85143745, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 11019, + "time_per_iteration": 2.429482936859131 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01789916, + "balance_loss_mlp": 1.0355196, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.956066495989637, + "language_loss": 0.71813512, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73942614, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11020, + "time_per_iteration": 2.4736745357513428 + }, + { + "auxiliary_loss_clip": 0.01107397, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.01639438, + "balance_loss_mlp": 1.03652906, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.7164682336590185, + "language_loss": 0.72276735, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74413311, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 11021, + "time_per_iteration": 2.477529525756836 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.02115703, + "balance_loss_mlp": 1.03667212, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.3400531031028873, + "language_loss": 0.83128953, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85265589, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11022, + "time_per_iteration": 2.507936716079712 + }, + { + "auxiliary_loss_clip": 0.0110951, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01896095, + "balance_loss_mlp": 1.03662038, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.599884159549939, + "language_loss": 0.73365414, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.75506973, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 11023, + "time_per_iteration": 2.4137043952941895 + }, + { + "auxiliary_loss_clip": 0.0110089, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.0178678, + "balance_loss_mlp": 1.03488147, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.7959862106394333, + "language_loss": 0.74551922, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.76681882, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11024, + "time_per_iteration": 2.475996255874634 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01788878, + "balance_loss_mlp": 1.0374223, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.1748664614868214, + "language_loss": 0.69700897, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71836132, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 11025, + "time_per_iteration": 2.4363040924072266 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.02345753, + "balance_loss_mlp": 1.03844023, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.7368551034909252, + "language_loss": 0.78647238, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.8078779, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 11026, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01807094, + "balance_loss_mlp": 1.03734887, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.6695781674460857, + "language_loss": 0.7642892, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78561032, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11027, + "time_per_iteration": 2.4259533882141113 + }, + { + "auxiliary_loss_clip": 0.01104358, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.03656745, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.1060132685452335, + "language_loss": 0.69903147, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72043025, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 11028, + "time_per_iteration": 2.4627115726470947 + }, + { + "auxiliary_loss_clip": 0.01102349, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.02125263, + "balance_loss_mlp": 1.03578711, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 1.8773152280466259, + "language_loss": 0.7926842, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.8140226, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.66796875, + "step": 11029, + "time_per_iteration": 2.4524929523468018 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.01583755, + "balance_loss_mlp": 1.03504181, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.11014761642944, + "language_loss": 0.76041275, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78173411, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11030, + "time_per_iteration": 2.4383111000061035 + }, + { + "auxiliary_loss_clip": 0.01109452, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.01958811, + "balance_loss_mlp": 1.03813887, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.37714698139957, + "language_loss": 0.74753916, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.76894963, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 11031, + "time_per_iteration": 2.4041976928710938 + }, + { + "auxiliary_loss_clip": 0.01103516, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.01942098, + "balance_loss_mlp": 1.03516042, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.9220695320455494, + "language_loss": 0.74872231, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77007186, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11032, + "time_per_iteration": 2.3847768306732178 + }, + { + "auxiliary_loss_clip": 0.01102597, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01723075, + "balance_loss_mlp": 1.03578007, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.77863211463492, + "language_loss": 0.80295861, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82426751, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 11033, + "time_per_iteration": 2.4669265747070312 + }, + { + "auxiliary_loss_clip": 0.01101844, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.01801026, + "balance_loss_mlp": 1.03441966, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 2.0583190629929957, + "language_loss": 0.80057156, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82188958, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11034, + "time_per_iteration": 2.4563634395599365 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.01526368, + "balance_loss_mlp": 1.03502083, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.5719715577747368, + "language_loss": 0.75545985, + "learning_rate": 1.074867045054166e-06, + "loss": 0.7767145, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 11035, + "time_per_iteration": 2.513399600982666 + }, + { + "auxiliary_loss_clip": 0.01103249, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.0156064, + "balance_loss_mlp": 1.0342617, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.7970498153302146, + "language_loss": 0.83235633, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85366178, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 11036, + "time_per_iteration": 2.519704580307007 + }, + { + "auxiliary_loss_clip": 0.01027101, + "auxiliary_loss_mlp": 0.01001243, + "balance_loss_clip": 0.99994338, + "balance_loss_mlp": 1.00646234, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7769560833184769, + "language_loss": 0.52306348, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54334688, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.20703125, + "step": 11037, + "time_per_iteration": 3.0515010356903076 + }, + { + "auxiliary_loss_clip": 0.01103588, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02405727, + "balance_loss_mlp": 1.03591716, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.842185877925078, + "language_loss": 0.79099, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81238985, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 11038, + "time_per_iteration": 2.5139565467834473 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02515566, + "balance_loss_mlp": 1.03648806, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.8255445121908285, + "language_loss": 0.64082795, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66223598, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 11039, + "time_per_iteration": 2.623331308364868 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.03601968, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.7210825984121325, + "language_loss": 0.63687986, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.65823758, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 11040, + "time_per_iteration": 2.472255229949951 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01893258, + "balance_loss_mlp": 1.03372359, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 1.9713653362611905, + "language_loss": 0.71843195, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73973382, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11041, + "time_per_iteration": 2.4769115447998047 + }, + { + "auxiliary_loss_clip": 0.01102253, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02647161, + "balance_loss_mlp": 1.03540432, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 8.010243162338005, + "language_loss": 0.61716807, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63857865, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11042, + "time_per_iteration": 2.50669264793396 + }, + { + "auxiliary_loss_clip": 0.01105298, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.01413536, + "balance_loss_mlp": 1.03500068, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.00393235647331, + "language_loss": 0.68282115, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70414734, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 11043, + "time_per_iteration": 2.518275499343872 + }, + { + "auxiliary_loss_clip": 0.01099626, + "auxiliary_loss_mlp": 0.01024503, + "balance_loss_clip": 1.01448393, + "balance_loss_mlp": 1.03639984, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.6123860278714182, + "language_loss": 0.83758092, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.85882223, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6328125, + "step": 11044, + "time_per_iteration": 2.505173444747925 + }, + { + "auxiliary_loss_clip": 0.01102203, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.01769567, + "balance_loss_mlp": 1.03553414, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 1.9292668184213282, + "language_loss": 0.69679981, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71812069, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 11045, + "time_per_iteration": 2.4917290210723877 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.01372421, + "balance_loss_mlp": 1.03785038, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.4259906887756533, + "language_loss": 0.6473543, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66867244, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11046, + "time_per_iteration": 2.4937326908111572 + }, + { + "auxiliary_loss_clip": 0.01101037, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.01489711, + "balance_loss_mlp": 1.03506637, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.4622045705244888, + "language_loss": 0.71289897, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73417372, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11047, + "time_per_iteration": 2.6626944541931152 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01909184, + "balance_loss_mlp": 1.03837025, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.803867578656826, + "language_loss": 0.77093923, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79230267, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 11048, + "time_per_iteration": 2.3982088565826416 + }, + { + "auxiliary_loss_clip": 0.01026262, + "auxiliary_loss_mlp": 0.00999443, + "balance_loss_clip": 0.99813193, + "balance_loss_mlp": 1.00559723, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.8140473421231088, + "language_loss": 0.55041039, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57066745, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.20703125, + "step": 11049, + "time_per_iteration": 3.0340354442596436 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01654625, + "balance_loss_mlp": 1.03627372, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.7091488805060655, + "language_loss": 0.64489448, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66618788, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 11050, + "time_per_iteration": 2.5083260536193848 + }, + { + "auxiliary_loss_clip": 0.01099461, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.01615942, + "balance_loss_mlp": 1.03481436, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.10593076125299, + "language_loss": 0.78783518, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80909896, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11051, + "time_per_iteration": 2.462937116622925 + }, + { + "auxiliary_loss_clip": 0.01103355, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.01975441, + "balance_loss_mlp": 1.03752089, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.6490502352967844, + "language_loss": 0.85132825, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87267327, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11052, + "time_per_iteration": 3.808241128921509 + }, + { + "auxiliary_loss_clip": 0.01106566, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.02213919, + "balance_loss_mlp": 1.03723979, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.202728029810485, + "language_loss": 0.75382364, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.77524137, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11053, + "time_per_iteration": 2.4659061431884766 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01024974, + "balance_loss_clip": 1.0138042, + "balance_loss_mlp": 1.03446698, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.6434507479308733, + "language_loss": 0.79397607, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81521785, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11054, + "time_per_iteration": 2.4667155742645264 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.0186491, + "balance_loss_mlp": 1.03520536, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.4981555869580738, + "language_loss": 0.74050117, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76180458, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11055, + "time_per_iteration": 3.8726584911346436 + }, + { + "auxiliary_loss_clip": 0.01104209, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.02240944, + "balance_loss_mlp": 1.0363059, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.7483359396792508, + "language_loss": 0.72639185, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.74778068, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 11056, + "time_per_iteration": 3.913365364074707 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.01610804, + "balance_loss_mlp": 1.03553987, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 2.080468005748717, + "language_loss": 0.69644797, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71773851, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11057, + "time_per_iteration": 2.4554696083068848 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01761794, + "balance_loss_mlp": 1.0374651, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 2.7208836045736753, + "language_loss": 0.80084372, + "learning_rate": 1.066934663776291e-06, + "loss": 0.8221786, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11058, + "time_per_iteration": 2.4723973274230957 + }, + { + "auxiliary_loss_clip": 0.01026201, + "auxiliary_loss_mlp": 0.00999951, + "balance_loss_clip": 0.99850267, + "balance_loss_mlp": 1.00571644, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.8197408377002003, + "language_loss": 0.62637091, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64663243, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.01446533, + "router_z_loss_mlp": 0.20507812, + "step": 11059, + "time_per_iteration": 2.9666504859924316 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.0232594, + "balance_loss_mlp": 1.03411603, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.4258342030978963, + "language_loss": 0.78922415, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81056285, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 11060, + "time_per_iteration": 2.453782796859741 + }, + { + "auxiliary_loss_clip": 0.01105175, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.02221072, + "balance_loss_mlp": 1.03826928, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.8106435880803493, + "language_loss": 0.78883487, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81023228, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 11061, + "time_per_iteration": 2.4411821365356445 + }, + { + "auxiliary_loss_clip": 0.01103137, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.01606226, + "balance_loss_mlp": 1.03756928, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.176969604984505, + "language_loss": 0.57041669, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59171724, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 11062, + "time_per_iteration": 2.389374256134033 + }, + { + "auxiliary_loss_clip": 0.0110523, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.01472592, + "balance_loss_mlp": 1.03483319, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.8021007966116196, + "language_loss": 0.75658429, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.77791005, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11063, + "time_per_iteration": 2.4186158180236816 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.02248394, + "balance_loss_mlp": 1.0370208, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.3058140700355754, + "language_loss": 0.7048496, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72622377, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11064, + "time_per_iteration": 2.5101113319396973 + }, + { + "auxiliary_loss_clip": 0.01024924, + "auxiliary_loss_mlp": 0.00997873, + "balance_loss_clip": 0.9965679, + "balance_loss_mlp": 1.00459087, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8487322656758325, + "language_loss": 0.63019937, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65042734, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.203125, + "step": 11065, + "time_per_iteration": 3.006619691848755 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.01580477, + "balance_loss_mlp": 1.03731883, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.6667915109143088, + "language_loss": 0.62019926, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64150941, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11066, + "time_per_iteration": 2.468318223953247 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.01430988, + "balance_loss_mlp": 1.03479779, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.7106058760764156, + "language_loss": 0.70056629, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72186363, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11067, + "time_per_iteration": 2.524820566177368 + }, + { + "auxiliary_loss_clip": 0.0102549, + "auxiliary_loss_mlp": 0.00996129, + "balance_loss_clip": 0.99466848, + "balance_loss_mlp": 1.00504017, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9061893644507588, + "language_loss": 0.72102368, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74123991, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.20507812, + "step": 11068, + "time_per_iteration": 3.0193986892700195 + }, + { + "auxiliary_loss_clip": 0.01025049, + "auxiliary_loss_mlp": 0.00996802, + "balance_loss_clip": 0.99540693, + "balance_loss_mlp": 1.00446737, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7087489248971819, + "language_loss": 0.57800353, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59822208, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.20605469, + "step": 11069, + "time_per_iteration": 3.2124764919281006 + }, + { + "auxiliary_loss_clip": 0.0102455, + "auxiliary_loss_mlp": 0.01000321, + "balance_loss_clip": 0.99886698, + "balance_loss_mlp": 1.0041275, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7763166900295557, + "language_loss": 0.63506204, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65531075, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.20410156, + "step": 11070, + "time_per_iteration": 3.1373214721679688 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.01531315, + "balance_loss_mlp": 1.03435302, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.7781228106405071, + "language_loss": 0.58826381, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60954237, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 11071, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.01105196, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01662362, + "balance_loss_mlp": 1.03853655, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 2.462730868248946, + "language_loss": 0.72873962, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75006455, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6640625, + "step": 11072, + "time_per_iteration": 2.457197427749634 + }, + { + "auxiliary_loss_clip": 0.01103868, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01775575, + "balance_loss_mlp": 1.03849804, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 2.0960284851890183, + "language_loss": 0.70686483, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11073, + "time_per_iteration": 2.621063709259033 + }, + { + "auxiliary_loss_clip": 0.01105664, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.01962924, + "balance_loss_mlp": 1.03680611, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 21.254891604302284, + "language_loss": 0.56184697, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58321697, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11074, + "time_per_iteration": 2.417592763900757 + }, + { + "auxiliary_loss_clip": 0.01103413, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.01795745, + "balance_loss_mlp": 1.03713965, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.4493539029409879, + "language_loss": 0.72269762, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74402475, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11075, + "time_per_iteration": 2.5621016025543213 + }, + { + "auxiliary_loss_clip": 0.01099577, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.02400899, + "balance_loss_mlp": 1.03572047, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.47592254117705, + "language_loss": 0.6616652, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.6830132, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11076, + "time_per_iteration": 2.618560552597046 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.0359937, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.613817606590062, + "language_loss": 0.75271714, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77408653, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11077, + "time_per_iteration": 2.487748146057129 + }, + { + "auxiliary_loss_clip": 0.01102302, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.01956344, + "balance_loss_mlp": 1.03536868, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.863663819937869, + "language_loss": 0.66703588, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68836671, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11078, + "time_per_iteration": 2.4835610389709473 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01786661, + "balance_loss_mlp": 1.03568316, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 1.962622549544945, + "language_loss": 0.69805777, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71939325, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 11079, + "time_per_iteration": 2.4517362117767334 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01027325, + "balance_loss_clip": 1.01656055, + "balance_loss_mlp": 1.03579783, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.4504303029583365, + "language_loss": 0.80272287, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82400304, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11080, + "time_per_iteration": 2.496086835861206 + }, + { + "auxiliary_loss_clip": 0.01098572, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.0184257, + "balance_loss_mlp": 1.03518367, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.7747670262807855, + "language_loss": 0.78175783, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80303317, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11081, + "time_per_iteration": 2.4947092533111572 + }, + { + "auxiliary_loss_clip": 0.01104079, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.03641224, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.7096575045073308, + "language_loss": 0.79757982, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.81893063, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11082, + "time_per_iteration": 2.496314287185669 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02490342, + "balance_loss_mlp": 1.03489673, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.4408084093775566, + "language_loss": 0.83964407, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86100918, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11083, + "time_per_iteration": 2.441714286804199 + }, + { + "auxiliary_loss_clip": 0.0110885, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02278709, + "balance_loss_mlp": 1.03879905, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.210335279184582, + "language_loss": 0.85422742, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87566352, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 11084, + "time_per_iteration": 2.4179892539978027 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.01672292, + "balance_loss_mlp": 1.03759933, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.2800746471584135, + "language_loss": 0.73236918, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75372517, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 11085, + "time_per_iteration": 2.4865758419036865 + }, + { + "auxiliary_loss_clip": 0.01102626, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.03617859, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.9088871569878003, + "language_loss": 0.80301607, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82433486, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 11086, + "time_per_iteration": 2.444645643234253 + }, + { + "auxiliary_loss_clip": 0.01101849, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.03593099, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 2.0449845091934753, + "language_loss": 0.74311554, + "learning_rate": 1.056959663258702e-06, + "loss": 0.7644341, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11087, + "time_per_iteration": 2.483962059020996 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01872563, + "balance_loss_mlp": 1.03587329, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.5673899455217954, + "language_loss": 0.64753473, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.66886115, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11088, + "time_per_iteration": 2.4562034606933594 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.01734924, + "balance_loss_mlp": 1.03735042, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 1.8332928045753645, + "language_loss": 0.64570332, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66704261, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11089, + "time_per_iteration": 2.4386065006256104 + }, + { + "auxiliary_loss_clip": 0.01099875, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.02059364, + "balance_loss_mlp": 1.03447926, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.1527148838753236, + "language_loss": 0.80835247, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.82966793, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 11090, + "time_per_iteration": 2.394827365875244 + }, + { + "auxiliary_loss_clip": 0.01105547, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02039266, + "balance_loss_mlp": 1.03684211, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 3.4302717941928806, + "language_loss": 0.7762655, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79763907, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 11091, + "time_per_iteration": 2.4357736110687256 + }, + { + "auxiliary_loss_clip": 0.01101701, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01943851, + "balance_loss_mlp": 1.03544581, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.7415157953091596, + "language_loss": 0.79347867, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81480247, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 11092, + "time_per_iteration": 2.4493799209594727 + }, + { + "auxiliary_loss_clip": 0.01024657, + "auxiliary_loss_mlp": 0.01010054, + "balance_loss_clip": 1.00873661, + "balance_loss_mlp": 1.00410509, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7618033983707613, + "language_loss": 0.57674438, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.5970915, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20507812, + "step": 11093, + "time_per_iteration": 3.060945510864258 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01859891, + "balance_loss_mlp": 1.03614676, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 3.0734338086465733, + "language_loss": 0.76404822, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78536654, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 11094, + "time_per_iteration": 3.8702232837677 + }, + { + "auxiliary_loss_clip": 0.01102539, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.03533387, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.7962253370500996, + "language_loss": 0.73604453, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75740582, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11095, + "time_per_iteration": 2.5393593311309814 + }, + { + "auxiliary_loss_clip": 0.01104214, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03839517, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.498006768699264, + "language_loss": 0.73841417, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75976729, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11096, + "time_per_iteration": 5.295018672943115 + }, + { + "auxiliary_loss_clip": 0.01101592, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.02000785, + "balance_loss_mlp": 1.03659046, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 2.1197138558836652, + "language_loss": 0.64377868, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66510427, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 11097, + "time_per_iteration": 2.4755849838256836 + }, + { + "auxiliary_loss_clip": 0.01105023, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.01810169, + "balance_loss_mlp": 1.03657043, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.8367279267646714, + "language_loss": 0.75293523, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77428448, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11098, + "time_per_iteration": 3.8889780044555664 + }, + { + "auxiliary_loss_clip": 0.01105898, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03809619, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.6239497270406267, + "language_loss": 0.74629354, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76766318, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 11099, + "time_per_iteration": 2.499155282974243 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03467488, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.8612331810201734, + "language_loss": 0.78086853, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80220115, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 11100, + "time_per_iteration": 2.4822754859924316 + }, + { + "auxiliary_loss_clip": 0.01101826, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.02265465, + "balance_loss_mlp": 1.03608942, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 2.199541930312583, + "language_loss": 0.60234034, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62369883, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11101, + "time_per_iteration": 2.470005750656128 + }, + { + "auxiliary_loss_clip": 0.0110769, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02202892, + "balance_loss_mlp": 1.03702366, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.6927482018220132, + "language_loss": 0.711254, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73267794, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11102, + "time_per_iteration": 2.5034313201904297 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.01774669, + "balance_loss_mlp": 1.03638661, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.4777736440637246, + "language_loss": 0.84276104, + "learning_rate": 1.051469068021034e-06, + "loss": 0.8640939, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 11103, + "time_per_iteration": 2.430427074432373 + }, + { + "auxiliary_loss_clip": 0.01104082, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.01338315, + "balance_loss_mlp": 1.03620505, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.187100835254228, + "language_loss": 0.77906835, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80035502, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 11104, + "time_per_iteration": 2.431415557861328 + }, + { + "auxiliary_loss_clip": 0.01105832, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.01554644, + "balance_loss_mlp": 1.03741312, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.7588653188886298, + "language_loss": 0.58123207, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60256052, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 11105, + "time_per_iteration": 2.5778300762176514 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.01864648, + "balance_loss_mlp": 1.0369904, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.5584285162619382, + "language_loss": 0.73263156, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75401342, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 11106, + "time_per_iteration": 2.502669334411621 + }, + { + "auxiliary_loss_clip": 0.01102707, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.01568878, + "balance_loss_mlp": 1.03582263, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.612792210414072, + "language_loss": 0.77103424, + "learning_rate": 1.0500978558659e-06, + "loss": 0.7923367, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11107, + "time_per_iteration": 2.4632906913757324 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01636124, + "balance_loss_mlp": 1.03531408, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.2458320549685267, + "language_loss": 0.89908957, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92035359, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 11108, + "time_per_iteration": 2.4730093479156494 + }, + { + "auxiliary_loss_clip": 0.01099015, + "auxiliary_loss_mlp": 0.01022867, + "balance_loss_clip": 1.01253176, + "balance_loss_mlp": 1.03418517, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.3985533807105044, + "language_loss": 0.82679069, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84800953, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 11109, + "time_per_iteration": 2.580944061279297 + }, + { + "auxiliary_loss_clip": 0.01102598, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.01666141, + "balance_loss_mlp": 1.03557515, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 1.8119039289749856, + "language_loss": 0.69528979, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71660185, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11110, + "time_per_iteration": 2.5149457454681396 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.01944864, + "balance_loss_mlp": 1.03632832, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.7594532626452621, + "language_loss": 0.7338779, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75525975, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11111, + "time_per_iteration": 2.481405258178711 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.01508117, + "balance_loss_mlp": 1.03355026, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 3.2736780286979488, + "language_loss": 0.64989609, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.6711359, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 11112, + "time_per_iteration": 2.452441930770874 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.01734865, + "balance_loss_mlp": 1.03562021, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 1.7892928589109056, + "language_loss": 0.63786232, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65917462, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11113, + "time_per_iteration": 2.4086506366729736 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.02059317, + "balance_loss_mlp": 1.03552222, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8981901836856618, + "language_loss": 0.66016996, + "learning_rate": 1.047699621879422e-06, + "loss": 0.6814909, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 11114, + "time_per_iteration": 2.4347803592681885 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.02107906, + "balance_loss_mlp": 1.03480756, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.6451209195544332, + "language_loss": 0.78455061, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80589175, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11115, + "time_per_iteration": 2.478957414627075 + }, + { + "auxiliary_loss_clip": 0.01101464, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.01758313, + "balance_loss_mlp": 1.03418374, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.607299826888502, + "language_loss": 0.79468185, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81598711, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11116, + "time_per_iteration": 2.5263917446136475 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01905489, + "balance_loss_mlp": 1.03642249, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.793058561798458, + "language_loss": 0.79410267, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81546414, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 11117, + "time_per_iteration": 2.4854443073272705 + }, + { + "auxiliary_loss_clip": 0.01105696, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.01769769, + "balance_loss_mlp": 1.03675961, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 1.507325638356248, + "language_loss": 0.65411663, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67548382, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 11118, + "time_per_iteration": 2.472377300262451 + }, + { + "auxiliary_loss_clip": 0.01100857, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03583932, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 2.967647334244501, + "language_loss": 0.68711627, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70841289, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11119, + "time_per_iteration": 2.4728288650512695 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.01497746, + "balance_loss_mlp": 1.03634501, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.5996077334078893, + "language_loss": 0.66828573, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.68957436, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11120, + "time_per_iteration": 2.546515941619873 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01742911, + "balance_loss_mlp": 1.03602421, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.762800604873663, + "language_loss": 0.72149706, + "learning_rate": 1.045303157347638e-06, + "loss": 0.7428214, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 11121, + "time_per_iteration": 2.477660894393921 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.02209687, + "balance_loss_mlp": 1.0351814, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.849050741943763, + "language_loss": 0.70147824, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72285533, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11122, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.02377343, + "balance_loss_mlp": 1.03655457, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.6701786551201399, + "language_loss": 0.71671915, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73810941, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11123, + "time_per_iteration": 2.4819095134735107 + }, + { + "auxiliary_loss_clip": 0.01108577, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.02273631, + "balance_loss_mlp": 1.0392499, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.6154595834142065, + "language_loss": 0.79180294, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81324089, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 11124, + "time_per_iteration": 2.4734344482421875 + }, + { + "auxiliary_loss_clip": 0.0110496, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.02444232, + "balance_loss_mlp": 1.03757286, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.7495803882819345, + "language_loss": 0.74282473, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76423579, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11125, + "time_per_iteration": 2.444687843322754 + }, + { + "auxiliary_loss_clip": 0.01105662, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.0227133, + "balance_loss_mlp": 1.03771114, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.3220485163353035, + "language_loss": 0.66047573, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68187803, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 11126, + "time_per_iteration": 2.508352756500244 + }, + { + "auxiliary_loss_clip": 0.01100528, + "auxiliary_loss_mlp": 0.01023616, + "balance_loss_clip": 1.01235676, + "balance_loss_mlp": 1.03333449, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 3.2519975932516094, + "language_loss": 0.71248001, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73372149, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11127, + "time_per_iteration": 2.4746527671813965 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.01952958, + "balance_loss_mlp": 1.03555894, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 2.0140192417842235, + "language_loss": 0.80290639, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82429767, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 11128, + "time_per_iteration": 2.476914644241333 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.01713467, + "balance_loss_mlp": 1.03555393, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 2.0449515592271967, + "language_loss": 0.81091756, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83224577, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11129, + "time_per_iteration": 2.457526922225952 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.02288556, + "balance_loss_mlp": 1.0350548, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.6204282208074086, + "language_loss": 0.70266747, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72398651, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11130, + "time_per_iteration": 2.5508627891540527 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02409458, + "balance_loss_mlp": 1.03609157, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.5850862701658837, + "language_loss": 0.70004213, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72139168, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11131, + "time_per_iteration": 2.442675828933716 + }, + { + "auxiliary_loss_clip": 0.01103504, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.01562405, + "balance_loss_mlp": 1.03573704, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.456945083607925, + "language_loss": 0.65068108, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67200017, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 11132, + "time_per_iteration": 2.4112234115600586 + }, + { + "auxiliary_loss_clip": 0.01105597, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03693044, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.6075137482523445, + "language_loss": 0.74700105, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76840317, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11133, + "time_per_iteration": 2.4416236877441406 + }, + { + "auxiliary_loss_clip": 0.01109475, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.01720238, + "balance_loss_mlp": 1.03926897, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.3633346892670266, + "language_loss": 0.66337103, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68477046, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 11134, + "time_per_iteration": 2.4672107696533203 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.02098703, + "balance_loss_mlp": 1.03965247, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.8392889149756566, + "language_loss": 0.77132189, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79278213, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 11135, + "time_per_iteration": 2.4986488819122314 + }, + { + "auxiliary_loss_clip": 0.01101077, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01987875, + "balance_loss_mlp": 1.03573108, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.766175864119674, + "language_loss": 0.74168599, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76301408, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11136, + "time_per_iteration": 3.892975091934204 + }, + { + "auxiliary_loss_clip": 0.01108465, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0182538, + "balance_loss_mlp": 1.03819919, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.743373004526595, + "language_loss": 0.62210536, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.643498, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11137, + "time_per_iteration": 2.4584341049194336 + }, + { + "auxiliary_loss_clip": 0.01102957, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.03640008, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 2.2042949503897837, + "language_loss": 0.65724766, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.67858124, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11138, + "time_per_iteration": 4.031554460525513 + }, + { + "auxiliary_loss_clip": 0.01099165, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.02175677, + "balance_loss_mlp": 1.03467035, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.5685975938909107, + "language_loss": 0.73056483, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75188804, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 11139, + "time_per_iteration": 2.490262746810913 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.01842916, + "balance_loss_mlp": 1.0357821, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 3.192057111781844, + "language_loss": 0.70166105, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72294366, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 11140, + "time_per_iteration": 3.9318604469299316 + }, + { + "auxiliary_loss_clip": 0.01104563, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.01471996, + "balance_loss_mlp": 1.03500891, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 3.669311669995305, + "language_loss": 0.75779974, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77911294, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11141, + "time_per_iteration": 2.516190767288208 + }, + { + "auxiliary_loss_clip": 0.01103882, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.0225563, + "balance_loss_mlp": 1.03589845, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7275630939402262, + "language_loss": 0.82025433, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84164113, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11142, + "time_per_iteration": 2.477917432785034 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.0102729, + "balance_loss_clip": 1.01569653, + "balance_loss_mlp": 1.0331465, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.5656493432889642, + "language_loss": 0.70054591, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72180939, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 11143, + "time_per_iteration": 2.496873617172241 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.02056444, + "balance_loss_mlp": 1.03546476, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.5042984772488615, + "language_loss": 0.69867527, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71998119, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 11144, + "time_per_iteration": 2.498004674911499 + }, + { + "auxiliary_loss_clip": 0.01101313, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.01553071, + "balance_loss_mlp": 1.03556204, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.7755943554148508, + "language_loss": 0.74376822, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76505524, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11145, + "time_per_iteration": 2.482536554336548 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.01503921, + "balance_loss_mlp": 1.03620577, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.7672711788536422, + "language_loss": 0.70669931, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72802681, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 11146, + "time_per_iteration": 2.480379819869995 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.01786661, + "balance_loss_mlp": 1.03490484, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.8854886897083816, + "language_loss": 0.7791847, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80045938, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 11147, + "time_per_iteration": 2.4453067779541016 + }, + { + "auxiliary_loss_clip": 0.01104074, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.02177358, + "balance_loss_mlp": 1.03823161, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.9489637728749547, + "language_loss": 0.70395339, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72533029, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11148, + "time_per_iteration": 2.4539880752563477 + }, + { + "auxiliary_loss_clip": 0.0110278, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.0209502, + "balance_loss_mlp": 1.03479636, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.6874385150714277, + "language_loss": 0.70091569, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72227037, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11149, + "time_per_iteration": 2.5368881225585938 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01025738, + "balance_loss_clip": 1.01511049, + "balance_loss_mlp": 1.03425717, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.0651183620740405, + "language_loss": 0.7356782, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75695598, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6796875, + "step": 11150, + "time_per_iteration": 2.489635944366455 + }, + { + "auxiliary_loss_clip": 0.01104117, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02237308, + "balance_loss_mlp": 1.03698301, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.8066986470751747, + "language_loss": 0.7880882, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80947053, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11151, + "time_per_iteration": 2.470423698425293 + }, + { + "auxiliary_loss_clip": 0.01028384, + "auxiliary_loss_mlp": 0.01010518, + "balance_loss_clip": 1.00938594, + "balance_loss_mlp": 1.00781679, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.8172638110433008, + "language_loss": 0.55524588, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57563496, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.20507812, + "step": 11152, + "time_per_iteration": 3.123234510421753 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.03580236, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.6208942555378636, + "language_loss": 0.80739468, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82873851, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11153, + "time_per_iteration": 2.511383533477783 + }, + { + "auxiliary_loss_clip": 0.01102109, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.0203166, + "balance_loss_mlp": 1.03519535, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.5741743783633508, + "language_loss": 0.76160783, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78294122, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 11154, + "time_per_iteration": 2.453047513961792 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.02030683, + "balance_loss_mlp": 1.03847241, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.4962510781515113, + "language_loss": 0.75975895, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78116906, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 11155, + "time_per_iteration": 2.507368564605713 + }, + { + "auxiliary_loss_clip": 0.01105615, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.03825569, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.76266123008703, + "language_loss": 0.81881839, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84021568, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 11156, + "time_per_iteration": 2.467165946960449 + }, + { + "auxiliary_loss_clip": 0.0110068, + "auxiliary_loss_mlp": 0.01025682, + "balance_loss_clip": 1.01425576, + "balance_loss_mlp": 1.03438997, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.4473397037337237, + "language_loss": 0.74570251, + "learning_rate": 1.033006600114165e-06, + "loss": 0.7669661, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11157, + "time_per_iteration": 2.4674718379974365 + }, + { + "auxiliary_loss_clip": 0.01105952, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.02370262, + "balance_loss_mlp": 1.03829253, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.9350697498335474, + "language_loss": 0.7444576, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76587129, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11158, + "time_per_iteration": 2.4784538745880127 + }, + { + "auxiliary_loss_clip": 0.01108128, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.0180732, + "balance_loss_mlp": 1.0385921, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 2.077178366394848, + "language_loss": 0.81668246, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83806634, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 11159, + "time_per_iteration": 2.476008653640747 + }, + { + "auxiliary_loss_clip": 0.01102735, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.01749814, + "balance_loss_mlp": 1.0353272, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.6091286648822523, + "language_loss": 0.7708782, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79219836, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 11160, + "time_per_iteration": 2.4390769004821777 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.01520884, + "balance_loss_mlp": 1.03558326, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.9005005299223583, + "language_loss": 0.73766249, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.7589463, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11161, + "time_per_iteration": 2.5078043937683105 + }, + { + "auxiliary_loss_clip": 0.01105932, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.0230794, + "balance_loss_mlp": 1.03523338, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.6945637244101817, + "language_loss": 0.67987847, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70129251, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 11162, + "time_per_iteration": 2.5096116065979004 + }, + { + "auxiliary_loss_clip": 0.01102024, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.02136803, + "balance_loss_mlp": 1.03582597, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.8281474305298504, + "language_loss": 0.70357502, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72492194, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11163, + "time_per_iteration": 2.4419682025909424 + }, + { + "auxiliary_loss_clip": 0.01101063, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.01849425, + "balance_loss_mlp": 1.03680897, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.5206709527115365, + "language_loss": 0.75686288, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.7781713, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 11164, + "time_per_iteration": 2.540302276611328 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.01537251, + "balance_loss_mlp": 1.03613234, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 2.0013900075408424, + "language_loss": 0.64903474, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67032778, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 11165, + "time_per_iteration": 2.50164532661438 + }, + { + "auxiliary_loss_clip": 0.01101735, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.0188911, + "balance_loss_mlp": 1.03648162, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.824490258261556, + "language_loss": 0.71357495, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73489726, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 11166, + "time_per_iteration": 2.4522786140441895 + }, + { + "auxiliary_loss_clip": 0.01102027, + "auxiliary_loss_mlp": 0.0102352, + "balance_loss_clip": 1.01324987, + "balance_loss_mlp": 1.0378294, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.8136989987191092, + "language_loss": 0.77263552, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79389095, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.640625, + "step": 11167, + "time_per_iteration": 2.5255751609802246 + }, + { + "auxiliary_loss_clip": 0.01101953, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02347469, + "balance_loss_mlp": 1.03458977, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 5.373120607190098, + "language_loss": 0.69078279, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71215004, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 11168, + "time_per_iteration": 2.5593607425689697 + }, + { + "auxiliary_loss_clip": 0.01105965, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02552032, + "balance_loss_mlp": 1.036659, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 1.891897557253962, + "language_loss": 0.73191148, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.7533567, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 11169, + "time_per_iteration": 2.4835712909698486 + }, + { + "auxiliary_loss_clip": 0.01104514, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.01917934, + "balance_loss_mlp": 1.03605962, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.050492769021052, + "language_loss": 0.76193798, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78330112, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11170, + "time_per_iteration": 2.41772723197937 + }, + { + "auxiliary_loss_clip": 0.01106509, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01618147, + "balance_loss_mlp": 1.03668404, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 4.365942833040682, + "language_loss": 0.74738538, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.768731, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 11171, + "time_per_iteration": 2.443998336791992 + }, + { + "auxiliary_loss_clip": 0.01104887, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.02605891, + "balance_loss_mlp": 1.03686571, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 6.401963753530839, + "language_loss": 0.86554527, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88697314, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 11172, + "time_per_iteration": 2.449519395828247 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02077198, + "balance_loss_mlp": 1.03432322, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.5214923385952612, + "language_loss": 0.63705564, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65839112, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11173, + "time_per_iteration": 2.4728994369506836 + }, + { + "auxiliary_loss_clip": 0.01108562, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.02201891, + "balance_loss_mlp": 1.03632855, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.8011577390317584, + "language_loss": 0.71934807, + "learning_rate": 1.02721637475002e-06, + "loss": 0.74078608, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 11174, + "time_per_iteration": 2.4150753021240234 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.01791573, + "balance_loss_mlp": 1.03507197, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 1.9034241424773972, + "language_loss": 0.68639195, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70769107, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11175, + "time_per_iteration": 2.4914746284484863 + }, + { + "auxiliary_loss_clip": 0.01101682, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02143312, + "balance_loss_mlp": 1.0366466, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.444826411678876, + "language_loss": 0.73786706, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.7592091, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 11176, + "time_per_iteration": 2.4306447505950928 + }, + { + "auxiliary_loss_clip": 0.01104157, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.01596642, + "balance_loss_mlp": 1.035748, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.6959341450848686, + "language_loss": 0.72810507, + "learning_rate": 1.026195675108182e-06, + "loss": 0.74942982, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 11177, + "time_per_iteration": 2.498624086380005 + }, + { + "auxiliary_loss_clip": 0.01103405, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.01617265, + "balance_loss_mlp": 1.0354104, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.080774174197305, + "language_loss": 0.76790631, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78922629, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11178, + "time_per_iteration": 3.880969524383545 + }, + { + "auxiliary_loss_clip": 0.01105896, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.01926351, + "balance_loss_mlp": 1.0375278, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.9975121194491492, + "language_loss": 0.69893503, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72029757, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 11179, + "time_per_iteration": 2.4223077297210693 + }, + { + "auxiliary_loss_clip": 0.01102153, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03676152, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.6665783443252085, + "language_loss": 0.74105644, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76236838, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11180, + "time_per_iteration": 3.958832263946533 + }, + { + "auxiliary_loss_clip": 0.01102807, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.01544547, + "balance_loss_mlp": 1.03720415, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.5017770160927022, + "language_loss": 0.75209451, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77339292, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 11181, + "time_per_iteration": 3.970757484436035 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01025138, + "balance_loss_clip": 1.01424241, + "balance_loss_mlp": 1.03628325, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 1.9826713327422718, + "language_loss": 0.74716818, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76845884, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.67578125, + "step": 11182, + "time_per_iteration": 2.4164199829101562 + }, + { + "auxiliary_loss_clip": 0.01098753, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.0213666, + "balance_loss_mlp": 1.03483748, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.6492155923055305, + "language_loss": 0.69678056, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71808994, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11183, + "time_per_iteration": 2.4825363159179688 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.02102327, + "balance_loss_mlp": 1.0350728, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6819294722428546, + "language_loss": 0.77619171, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79753804, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 11184, + "time_per_iteration": 2.4742484092712402 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01960111, + "balance_loss_mlp": 1.03978956, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 1.9600702886656058, + "language_loss": 0.65830189, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.67975819, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 11185, + "time_per_iteration": 2.6265766620635986 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01555133, + "balance_loss_mlp": 1.03508019, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.6086008561996032, + "language_loss": 0.8077392, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82903898, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11186, + "time_per_iteration": 2.5254018306732178 + }, + { + "auxiliary_loss_clip": 0.0110242, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.02216411, + "balance_loss_mlp": 1.03798425, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.4050560740555764, + "language_loss": 0.8022958, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82364446, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.64453125, + "step": 11187, + "time_per_iteration": 2.492206335067749 + }, + { + "auxiliary_loss_clip": 0.01108961, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.02235556, + "balance_loss_mlp": 1.03917003, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.0955662178616663, + "language_loss": 0.70936477, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73080474, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11188, + "time_per_iteration": 2.4696547985076904 + }, + { + "auxiliary_loss_clip": 0.01100609, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01867819, + "balance_loss_mlp": 1.03614163, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.690433236478768, + "language_loss": 0.7567057, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77800977, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11189, + "time_per_iteration": 2.502394676208496 + }, + { + "auxiliary_loss_clip": 0.01105784, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.01342869, + "balance_loss_mlp": 1.03580916, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.0624308015957666, + "language_loss": 0.75735819, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.7786814, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 11190, + "time_per_iteration": 2.4117863178253174 + }, + { + "auxiliary_loss_clip": 0.01101643, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.01830935, + "balance_loss_mlp": 1.03503203, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.479637189299754, + "language_loss": 0.77305663, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79437912, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11191, + "time_per_iteration": 2.471383571624756 + }, + { + "auxiliary_loss_clip": 0.01101045, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.01542521, + "balance_loss_mlp": 1.03620696, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.9484073900919987, + "language_loss": 0.86244619, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88372666, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 11192, + "time_per_iteration": 2.541471481323242 + }, + { + "auxiliary_loss_clip": 0.01105869, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.02166843, + "balance_loss_mlp": 1.03793001, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 1.7778605034576032, + "language_loss": 0.76010567, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78150332, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11193, + "time_per_iteration": 2.4631118774414062 + }, + { + "auxiliary_loss_clip": 0.01104222, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03698504, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.7482449519435526, + "language_loss": 0.78450751, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80587071, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11194, + "time_per_iteration": 2.4163994789123535 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01602221, + "balance_loss_mlp": 1.03523183, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.439402985037115, + "language_loss": 0.89769554, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.91899562, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 11195, + "time_per_iteration": 2.4890894889831543 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.01964474, + "balance_loss_mlp": 1.03698754, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 3.8315256645626468, + "language_loss": 0.7259835, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74732834, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 11196, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01026268, + "auxiliary_loss_mlp": 0.00997949, + "balance_loss_clip": 0.99669737, + "balance_loss_mlp": 1.00585961, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7827982838834083, + "language_loss": 0.56530619, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58554828, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.20410156, + "step": 11197, + "time_per_iteration": 2.9888203144073486 + }, + { + "auxiliary_loss_clip": 0.01103429, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.01289546, + "balance_loss_mlp": 1.03899539, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.0080706986846635, + "language_loss": 0.75471473, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77598602, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 11198, + "time_per_iteration": 2.4266445636749268 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.01493251, + "balance_loss_mlp": 1.03564501, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.2277183364076674, + "language_loss": 0.8092168, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83052027, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 11199, + "time_per_iteration": 2.4250686168670654 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.01972258, + "balance_loss_mlp": 1.03520989, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.7815929608142598, + "language_loss": 0.71828485, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73964423, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 11200, + "time_per_iteration": 2.555952787399292 + }, + { + "auxiliary_loss_clip": 0.01106738, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.0200175, + "balance_loss_mlp": 1.03832173, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.625800733182769, + "language_loss": 0.6466803, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66806769, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11201, + "time_per_iteration": 2.8149640560150146 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.01998901, + "balance_loss_mlp": 1.03641796, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.7955061796431357, + "language_loss": 0.63162857, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65301323, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 11202, + "time_per_iteration": 2.437077045440674 + }, + { + "auxiliary_loss_clip": 0.01103951, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.01568675, + "balance_loss_mlp": 1.03587484, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.8620640282713015, + "language_loss": 0.74766082, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76896715, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 11203, + "time_per_iteration": 2.457798719406128 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01710498, + "balance_loss_mlp": 1.037606, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.7246428938805878, + "language_loss": 0.67498362, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69637865, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 11204, + "time_per_iteration": 2.4272255897521973 + }, + { + "auxiliary_loss_clip": 0.01110127, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.01991844, + "balance_loss_mlp": 1.03929329, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.5939578102801788, + "language_loss": 0.7447291, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76615399, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 11205, + "time_per_iteration": 2.4560165405273438 + }, + { + "auxiliary_loss_clip": 0.0109994, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.03492117, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.6470910861724577, + "language_loss": 0.71854442, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73985064, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 11206, + "time_per_iteration": 2.5040676593780518 + }, + { + "auxiliary_loss_clip": 0.01111631, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.02033639, + "balance_loss_mlp": 1.03923798, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 6.529945029855453, + "language_loss": 0.67127562, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69271767, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 11207, + "time_per_iteration": 2.5161397457122803 + }, + { + "auxiliary_loss_clip": 0.01102629, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.0182395, + "balance_loss_mlp": 1.03757155, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.4663080715904675, + "language_loss": 0.73317289, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.75449866, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 11208, + "time_per_iteration": 2.4350569248199463 + }, + { + "auxiliary_loss_clip": 0.01104929, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.019835, + "balance_loss_mlp": 1.03649032, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 1.8859944640341983, + "language_loss": 0.75882745, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78020674, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6875, + "step": 11209, + "time_per_iteration": 2.4393579959869385 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.01881683, + "balance_loss_mlp": 1.03629994, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.6804143170759391, + "language_loss": 0.66519487, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68649894, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 11210, + "time_per_iteration": 2.4730069637298584 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.01682603, + "balance_loss_mlp": 1.03569078, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.1720353274754154, + "language_loss": 0.79894733, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82022631, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11211, + "time_per_iteration": 2.468639850616455 + }, + { + "auxiliary_loss_clip": 0.01101219, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.01841807, + "balance_loss_mlp": 1.03608012, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.549232637169743, + "language_loss": 0.76512897, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78644192, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 11212, + "time_per_iteration": 2.478450059890747 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01728141, + "balance_loss_mlp": 1.03573346, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.6801358890975542, + "language_loss": 0.77888572, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.80022377, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 11213, + "time_per_iteration": 2.4666621685028076 + }, + { + "auxiliary_loss_clip": 0.01107053, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.0228132, + "balance_loss_mlp": 1.03760529, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 2.4257892231901765, + "language_loss": 0.67633986, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69776428, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11214, + "time_per_iteration": 2.452108860015869 + }, + { + "auxiliary_loss_clip": 0.01103571, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.02189624, + "balance_loss_mlp": 1.03610945, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.6441501997597023, + "language_loss": 0.72691011, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74828005, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11215, + "time_per_iteration": 2.615023374557495 + }, + { + "auxiliary_loss_clip": 0.01105661, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.02168989, + "balance_loss_mlp": 1.03667545, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 3.085424201902257, + "language_loss": 0.67325628, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69464171, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 11216, + "time_per_iteration": 2.567662477493286 + }, + { + "auxiliary_loss_clip": 0.01026395, + "auxiliary_loss_mlp": 0.01001456, + "balance_loss_clip": 1.00016236, + "balance_loss_mlp": 1.00580978, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6744353438462242, + "language_loss": 0.56309336, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58337194, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.20605469, + "step": 11217, + "time_per_iteration": 3.1818552017211914 + }, + { + "auxiliary_loss_clip": 0.01102202, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.0162183, + "balance_loss_mlp": 1.0352273, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.9712085707776, + "language_loss": 0.74490952, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76621616, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11218, + "time_per_iteration": 2.4742777347564697 + }, + { + "auxiliary_loss_clip": 0.01105482, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.03671169, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6873790300129339, + "language_loss": 0.66097057, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68245506, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 11219, + "time_per_iteration": 3.9712955951690674 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.01901901, + "balance_loss_mlp": 1.03550935, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.9354673669636624, + "language_loss": 0.74431932, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.76568097, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 11220, + "time_per_iteration": 2.4782400131225586 + }, + { + "auxiliary_loss_clip": 0.01104541, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01871443, + "balance_loss_mlp": 1.03673649, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.753572378422806, + "language_loss": 0.70208532, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72344136, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 11221, + "time_per_iteration": 3.8499643802642822 + }, + { + "auxiliary_loss_clip": 0.01104329, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.01634061, + "balance_loss_mlp": 1.03750563, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 2.083478811055199, + "language_loss": 0.58038485, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60170209, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 11222, + "time_per_iteration": 3.796449661254883 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02047682, + "balance_loss_mlp": 1.03731191, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 2.9048479494136266, + "language_loss": 0.76680332, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.7881813, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 11223, + "time_per_iteration": 3.932152271270752 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.02009189, + "balance_loss_mlp": 1.03799176, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.7350565617477662, + "language_loss": 0.75261784, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77401286, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11224, + "time_per_iteration": 2.4370362758636475 + }, + { + "auxiliary_loss_clip": 0.0109934, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.01388621, + "balance_loss_mlp": 1.03474987, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.6433655631752735, + "language_loss": 0.63031125, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.6515485, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 11225, + "time_per_iteration": 2.472139835357666 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.02051783, + "balance_loss_mlp": 1.03548217, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.7438523279987848, + "language_loss": 0.64443898, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66574085, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 11226, + "time_per_iteration": 2.3997251987457275 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.02009439, + "balance_loss_mlp": 1.03833103, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.0520128582030406, + "language_loss": 0.71177256, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73316324, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11227, + "time_per_iteration": 2.4354188442230225 + }, + { + "auxiliary_loss_clip": 0.01102719, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01566148, + "balance_loss_mlp": 1.03702497, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 1.9773279438432965, + "language_loss": 0.7113992, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73270661, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 11228, + "time_per_iteration": 2.4065871238708496 + }, + { + "auxiliary_loss_clip": 0.01026271, + "auxiliary_loss_mlp": 0.01000743, + "balance_loss_clip": 0.99950963, + "balance_loss_mlp": 1.00561559, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7600669046292114, + "language_loss": 0.53283465, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55310482, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20703125, + "step": 11229, + "time_per_iteration": 3.113936424255371 + }, + { + "auxiliary_loss_clip": 0.01102392, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01778507, + "balance_loss_mlp": 1.03599358, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.668368696112623, + "language_loss": 0.80301458, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82433373, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11230, + "time_per_iteration": 2.481586456298828 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01025242, + "balance_loss_clip": 1.01433396, + "balance_loss_mlp": 1.03644145, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.5254643295267571, + "language_loss": 0.66080362, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.68207115, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11231, + "time_per_iteration": 2.4348020553588867 + }, + { + "auxiliary_loss_clip": 0.01109126, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.02254665, + "balance_loss_mlp": 1.03815401, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 1.8895861738799862, + "language_loss": 0.66976327, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.69121504, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 11232, + "time_per_iteration": 2.565011501312256 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01529849, + "balance_loss_mlp": 1.03667426, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.7997945281360064, + "language_loss": 0.72617656, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74746865, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 11233, + "time_per_iteration": 2.451127767562866 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.01986313, + "balance_loss_mlp": 1.035604, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.728016441920487, + "language_loss": 0.76981372, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79116529, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 11234, + "time_per_iteration": 2.560873031616211 + }, + { + "auxiliary_loss_clip": 0.01103068, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.01688838, + "balance_loss_mlp": 1.03655529, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5233618386668848, + "language_loss": 0.7516101, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77292997, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11235, + "time_per_iteration": 2.473658323287964 + }, + { + "auxiliary_loss_clip": 0.01025939, + "auxiliary_loss_mlp": 0.00999916, + "balance_loss_clip": 0.99873585, + "balance_loss_mlp": 1.00544596, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7849137447209698, + "language_loss": 0.51408035, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53433889, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.20507812, + "step": 11236, + "time_per_iteration": 2.993544340133667 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01443923, + "balance_loss_mlp": 1.03596473, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.2221952993281335, + "language_loss": 0.75521564, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77652001, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6640625, + "step": 11237, + "time_per_iteration": 2.4348740577697754 + }, + { + "auxiliary_loss_clip": 0.01105842, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.02153206, + "balance_loss_mlp": 1.03944969, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 2.1736628297595466, + "language_loss": 0.77503932, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79642648, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11238, + "time_per_iteration": 2.526988983154297 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.0188483, + "balance_loss_mlp": 1.03556848, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.9142971498049255, + "language_loss": 0.66731274, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68868375, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 11239, + "time_per_iteration": 2.4696223735809326 + }, + { + "auxiliary_loss_clip": 0.0110246, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01447082, + "balance_loss_mlp": 1.03745294, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.923743651844225, + "language_loss": 0.82995439, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85124326, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 11240, + "time_per_iteration": 2.446572780609131 + }, + { + "auxiliary_loss_clip": 0.01111011, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.01959419, + "balance_loss_mlp": 1.03869963, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 3.6442496224808933, + "language_loss": 0.74812031, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76956552, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 11241, + "time_per_iteration": 2.423372268676758 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.02011645, + "balance_loss_mlp": 1.03697479, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 2.14563763168323, + "language_loss": 0.80052149, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82188863, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11242, + "time_per_iteration": 2.434990167617798 + }, + { + "auxiliary_loss_clip": 0.01101563, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.01693797, + "balance_loss_mlp": 1.034922, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.9802154508142344, + "language_loss": 0.72626722, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74756432, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11243, + "time_per_iteration": 2.453474283218384 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02346909, + "balance_loss_mlp": 1.03676426, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.7073695655292809, + "language_loss": 0.72612441, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74750745, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11244, + "time_per_iteration": 2.5105230808258057 + }, + { + "auxiliary_loss_clip": 0.01106398, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02668309, + "balance_loss_mlp": 1.03746104, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.8087707557146027, + "language_loss": 0.85335118, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87480211, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 11245, + "time_per_iteration": 2.445233106613159 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02013278, + "balance_loss_mlp": 1.03733289, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 2.3183444790940766, + "language_loss": 0.73646373, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75787258, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 11246, + "time_per_iteration": 2.4863364696502686 + }, + { + "auxiliary_loss_clip": 0.01102215, + "auxiliary_loss_mlp": 0.01025917, + "balance_loss_clip": 1.01430011, + "balance_loss_mlp": 1.0346514, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 2.2448543978250437, + "language_loss": 0.88085318, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90213448, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11247, + "time_per_iteration": 2.4308738708496094 + }, + { + "auxiliary_loss_clip": 0.01025674, + "auxiliary_loss_mlp": 0.01003402, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.0053699, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8266217559963673, + "language_loss": 0.54048848, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56077927, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.203125, + "step": 11248, + "time_per_iteration": 3.076478958129883 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.01805067, + "balance_loss_mlp": 1.03580928, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.69579760819699, + "language_loss": 0.73396099, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75523973, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 11249, + "time_per_iteration": 2.47476863861084 + }, + { + "auxiliary_loss_clip": 0.01103589, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.01811373, + "balance_loss_mlp": 1.03449488, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.3334311767034035, + "language_loss": 0.73674285, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.75808907, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 11250, + "time_per_iteration": 2.427795171737671 + }, + { + "auxiliary_loss_clip": 0.01104705, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.01378322, + "balance_loss_mlp": 1.03724456, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.7440150220700932, + "language_loss": 0.75326031, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77455616, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 11251, + "time_per_iteration": 2.453015089035034 + }, + { + "auxiliary_loss_clip": 0.01103045, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01890135, + "balance_loss_mlp": 1.03677213, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 4.794996819995717, + "language_loss": 0.7030319, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.7243697, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 11252, + "time_per_iteration": 2.507655382156372 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.01872027, + "balance_loss_mlp": 1.03790915, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.7295296864842329, + "language_loss": 0.66713816, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68848813, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 11253, + "time_per_iteration": 2.495661735534668 + }, + { + "auxiliary_loss_clip": 0.01106169, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.03755689, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.5712995070705533, + "language_loss": 0.77059627, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79200101, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11254, + "time_per_iteration": 2.5303773880004883 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.01923668, + "balance_loss_mlp": 1.0353651, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.008694221276799, + "language_loss": 0.72041488, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74175906, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11255, + "time_per_iteration": 2.4310834407806396 + }, + { + "auxiliary_loss_clip": 0.01103491, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.02294099, + "balance_loss_mlp": 1.03527474, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.4309429012787533, + "language_loss": 0.75107753, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77246231, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 11256, + "time_per_iteration": 2.5040993690490723 + }, + { + "auxiliary_loss_clip": 0.01105082, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.01898217, + "balance_loss_mlp": 1.03802788, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.256626523492283, + "language_loss": 0.64639592, + "learning_rate": 9.991007116408965e-07, + "loss": 0.6677562, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11257, + "time_per_iteration": 2.4259557723999023 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.03422582, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.4043820681784667, + "language_loss": 0.75555968, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77681983, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 11258, + "time_per_iteration": 2.4665939807891846 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.01757717, + "balance_loss_mlp": 1.03654146, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.6503685315767886, + "language_loss": 0.66716135, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68845475, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 11259, + "time_per_iteration": 2.527073383331299 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01027196, + "balance_loss_clip": 1.0155077, + "balance_loss_mlp": 1.03676665, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.9058137848386902, + "language_loss": 0.85316312, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87447935, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11260, + "time_per_iteration": 2.457331418991089 + }, + { + "auxiliary_loss_clip": 0.01104097, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.02011061, + "balance_loss_mlp": 1.03481388, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 1.992894296567027, + "language_loss": 0.77366221, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79502773, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 11261, + "time_per_iteration": 3.8098442554473877 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.0210495, + "balance_loss_mlp": 1.03392744, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.8771294723417649, + "language_loss": 0.87785065, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89920282, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11262, + "time_per_iteration": 2.4098682403564453 + }, + { + "auxiliary_loss_clip": 0.0110654, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.01738644, + "balance_loss_mlp": 1.03773284, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.144843606745404, + "language_loss": 0.73935968, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76072079, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11263, + "time_per_iteration": 3.836534261703491 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.020401, + "balance_loss_mlp": 1.03613746, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 3.0336865802259716, + "language_loss": 0.67681348, + "learning_rate": 9.967413644401016e-07, + "loss": 0.6982075, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11264, + "time_per_iteration": 3.8063998222351074 + }, + { + "auxiliary_loss_clip": 0.01104469, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02429914, + "balance_loss_mlp": 1.03774631, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 1.9309030757319006, + "language_loss": 0.72956276, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75097328, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11265, + "time_per_iteration": 3.905475616455078 + }, + { + "auxiliary_loss_clip": 0.01101535, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.02002645, + "balance_loss_mlp": 1.03592122, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.5488311970116568, + "language_loss": 0.61298478, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63431406, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11266, + "time_per_iteration": 2.4533629417419434 + }, + { + "auxiliary_loss_clip": 0.01105454, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02384639, + "balance_loss_mlp": 1.03653467, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 2.0018325327863455, + "language_loss": 0.70975608, + "learning_rate": 9.957307860391976e-07, + "loss": 0.73116899, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11267, + "time_per_iteration": 2.4130048751831055 + }, + { + "auxiliary_loss_clip": 0.01102815, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01627374, + "balance_loss_mlp": 1.03553224, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.995940802920633, + "language_loss": 0.71196496, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73327303, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11268, + "time_per_iteration": 2.5001561641693115 + }, + { + "auxiliary_loss_clip": 0.01106446, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01621413, + "balance_loss_mlp": 1.03911674, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.4505290648176117, + "language_loss": 0.76658797, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78793591, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11269, + "time_per_iteration": 2.450594902038574 + }, + { + "auxiliary_loss_clip": 0.0110441, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.02046824, + "balance_loss_mlp": 1.03552103, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 1.9037033189032353, + "language_loss": 0.74434447, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76572257, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11270, + "time_per_iteration": 2.4480292797088623 + }, + { + "auxiliary_loss_clip": 0.01105285, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.02421904, + "balance_loss_mlp": 1.03799176, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.765961836580733, + "language_loss": 0.72747099, + "learning_rate": 9.94383881378756e-07, + "loss": 0.74889576, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 11271, + "time_per_iteration": 2.466099739074707 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.02158785, + "balance_loss_mlp": 1.0367682, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.5327741783409103, + "language_loss": 0.67725623, + "learning_rate": 9.94047250514387e-07, + "loss": 0.69863486, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11272, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.01107233, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.02115774, + "balance_loss_mlp": 1.03756714, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 2.19334323210367, + "language_loss": 0.73699766, + "learning_rate": 9.937106577958481e-07, + "loss": 0.75841612, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 11273, + "time_per_iteration": 2.40608286857605 + }, + { + "auxiliary_loss_clip": 0.01101569, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.02462888, + "balance_loss_mlp": 1.03617656, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 2.20814425061036, + "language_loss": 0.70081609, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72219741, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 11274, + "time_per_iteration": 2.476304769515991 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.01921475, + "balance_loss_mlp": 1.03662062, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.6447665363620352, + "language_loss": 0.65597254, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67733622, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 11275, + "time_per_iteration": 2.4458420276641846 + }, + { + "auxiliary_loss_clip": 0.01103666, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.02177751, + "balance_loss_mlp": 1.03688347, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 2.26567322463042, + "language_loss": 0.72724402, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74860573, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 11276, + "time_per_iteration": 2.506394624710083 + }, + { + "auxiliary_loss_clip": 0.01103474, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01724589, + "balance_loss_mlp": 1.03681684, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.7387203972635623, + "language_loss": 0.76835978, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78968847, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 11277, + "time_per_iteration": 2.4156947135925293 + }, + { + "auxiliary_loss_clip": 0.01106329, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.01580894, + "balance_loss_mlp": 1.03709924, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 3.843343867942956, + "language_loss": 0.83494425, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85628355, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 11278, + "time_per_iteration": 2.4242331981658936 + }, + { + "auxiliary_loss_clip": 0.01100898, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.01966131, + "balance_loss_mlp": 1.03655803, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.546828654628467, + "language_loss": 0.70229775, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72361231, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 11279, + "time_per_iteration": 2.4774818420410156 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.01785898, + "balance_loss_mlp": 1.03606427, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.8996542277217034, + "language_loss": 0.74191052, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76324993, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 11280, + "time_per_iteration": 2.4954020977020264 + }, + { + "auxiliary_loss_clip": 0.01106782, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.01768732, + "balance_loss_mlp": 1.03710222, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.8728658209175957, + "language_loss": 0.70118409, + "learning_rate": 9.910192908287104e-07, + "loss": 0.7225517, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 11281, + "time_per_iteration": 2.4171640872955322 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.01519203, + "balance_loss_mlp": 1.03611064, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.563642265820809, + "language_loss": 0.63874096, + "learning_rate": 9.906830419968217e-07, + "loss": 0.66000628, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11282, + "time_per_iteration": 2.5364012718200684 + }, + { + "auxiliary_loss_clip": 0.0110743, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02427554, + "balance_loss_mlp": 1.03683639, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5622929992593626, + "language_loss": 0.74648255, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76792598, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11283, + "time_per_iteration": 2.5009424686431885 + }, + { + "auxiliary_loss_clip": 0.01101134, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.01622117, + "balance_loss_mlp": 1.03523421, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.6182405596102953, + "language_loss": 0.5701533, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59144115, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11284, + "time_per_iteration": 2.5896449089050293 + }, + { + "auxiliary_loss_clip": 0.01101588, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01796126, + "balance_loss_mlp": 1.03485477, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.4677100655448485, + "language_loss": 0.75404185, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77535391, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 11285, + "time_per_iteration": 2.53873872756958 + }, + { + "auxiliary_loss_clip": 0.01102067, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01661038, + "balance_loss_mlp": 1.03747129, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.8021221276720163, + "language_loss": 0.66290027, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68420148, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 11286, + "time_per_iteration": 2.498288631439209 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03434348, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.2344222526167083, + "language_loss": 0.52489305, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54620832, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 11287, + "time_per_iteration": 2.470860719680786 + }, + { + "auxiliary_loss_clip": 0.01102428, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.01842999, + "balance_loss_mlp": 1.0358603, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 2.2748309661133086, + "language_loss": 0.77437216, + "learning_rate": 9.886663531930655e-07, + "loss": 0.7956934, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11288, + "time_per_iteration": 2.507276773452759 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.02353239, + "balance_loss_mlp": 1.03752971, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.9600358072539563, + "language_loss": 0.73192465, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75333238, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 11289, + "time_per_iteration": 2.466587781906128 + }, + { + "auxiliary_loss_clip": 0.01103364, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.02300215, + "balance_loss_mlp": 1.0357126, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.567844133932764, + "language_loss": 0.80266666, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82405412, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 11290, + "time_per_iteration": 2.5057084560394287 + }, + { + "auxiliary_loss_clip": 0.01100237, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.01677918, + "balance_loss_mlp": 1.03600717, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 2.2351562454410034, + "language_loss": 0.75014412, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77142644, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11291, + "time_per_iteration": 2.4530417919158936 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.02219784, + "balance_loss_mlp": 1.03691578, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.8299710869537638, + "language_loss": 0.75613016, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77753186, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11292, + "time_per_iteration": 2.560930013656616 + }, + { + "auxiliary_loss_clip": 0.01103978, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01666081, + "balance_loss_mlp": 1.03636706, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 1.9135755383501691, + "language_loss": 0.83619392, + "learning_rate": 9.869868336945556e-07, + "loss": 0.85752094, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11293, + "time_per_iteration": 2.442145824432373 + }, + { + "auxiliary_loss_clip": 0.01111617, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02419984, + "balance_loss_mlp": 1.03933525, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.319599838777995, + "language_loss": 0.79377204, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81526375, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 11294, + "time_per_iteration": 2.487916946411133 + }, + { + "auxiliary_loss_clip": 0.0110334, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.0165689, + "balance_loss_mlp": 1.0358336, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.670516322497649, + "language_loss": 0.79154253, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81285346, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 11295, + "time_per_iteration": 2.466892957687378 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.03505814, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.698673678539366, + "language_loss": 0.71407616, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73535442, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.640625, + "step": 11296, + "time_per_iteration": 2.482555866241455 + }, + { + "auxiliary_loss_clip": 0.01101606, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.01736212, + "balance_loss_mlp": 1.03510296, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.753920624789111, + "language_loss": 0.70683616, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72814304, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11297, + "time_per_iteration": 2.466238021850586 + }, + { + "auxiliary_loss_clip": 0.01106999, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.01821899, + "balance_loss_mlp": 1.03667176, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.1069890127028974, + "language_loss": 0.66267467, + "learning_rate": 9.853082745349918e-07, + "loss": 0.6840536, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11298, + "time_per_iteration": 2.424710273742676 + }, + { + "auxiliary_loss_clip": 0.01103908, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01871479, + "balance_loss_mlp": 1.03633463, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.7026224144439064, + "language_loss": 0.71526003, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73659307, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.67578125, + "step": 11299, + "time_per_iteration": 2.4778668880462646 + }, + { + "auxiliary_loss_clip": 0.01105656, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.02016139, + "balance_loss_mlp": 1.03812611, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.4081485921140142, + "language_loss": 0.77155232, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79293001, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11300, + "time_per_iteration": 2.492253541946411 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.01748598, + "balance_loss_mlp": 1.03599048, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.7968797031135182, + "language_loss": 0.62885916, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65018791, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11301, + "time_per_iteration": 2.397135019302368 + }, + { + "auxiliary_loss_clip": 0.01102494, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.01690459, + "balance_loss_mlp": 1.03594089, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.724284284453245, + "language_loss": 0.82755935, + "learning_rate": 9.839661197207525e-07, + "loss": 0.84886515, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11302, + "time_per_iteration": 2.472766399383545 + }, + { + "auxiliary_loss_clip": 0.01106208, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.01926029, + "balance_loss_mlp": 1.03716099, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 2.1762222349963176, + "language_loss": 0.69784915, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71922374, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11303, + "time_per_iteration": 3.805736780166626 + }, + { + "auxiliary_loss_clip": 0.0110718, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01801562, + "balance_loss_mlp": 1.03717601, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.3868097803445383, + "language_loss": 0.69926792, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72063893, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 11304, + "time_per_iteration": 2.4878110885620117 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.0188539, + "balance_loss_mlp": 1.03924417, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.7487345535411407, + "language_loss": 0.72523355, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74662066, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11305, + "time_per_iteration": 3.969510316848755 + }, + { + "auxiliary_loss_clip": 0.01103346, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.01541042, + "balance_loss_mlp": 1.03585541, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.9643394158396053, + "language_loss": 0.65558803, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67689657, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11306, + "time_per_iteration": 5.400679111480713 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.01505661, + "balance_loss_mlp": 1.03540945, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.6667606428941142, + "language_loss": 0.79942191, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82072073, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 11307, + "time_per_iteration": 2.430248260498047 + }, + { + "auxiliary_loss_clip": 0.01103369, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01866865, + "balance_loss_mlp": 1.03694439, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.5435492505708737, + "language_loss": 0.88790625, + "learning_rate": 9.819540435969066e-07, + "loss": 0.90924048, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 11308, + "time_per_iteration": 2.456007242202759 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02374792, + "balance_loss_mlp": 1.03597665, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.037595188669874, + "language_loss": 0.71198809, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73340213, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 11309, + "time_per_iteration": 2.444063901901245 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.02223217, + "balance_loss_mlp": 1.03636754, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.0611426595675915, + "language_loss": 0.84300488, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86438966, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 11310, + "time_per_iteration": 2.4817349910736084 + }, + { + "auxiliary_loss_clip": 0.01102101, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.01696813, + "balance_loss_mlp": 1.03708959, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.1934331981692963, + "language_loss": 0.82783055, + "learning_rate": 9.80948526522792e-07, + "loss": 0.84913009, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11311, + "time_per_iteration": 2.4103691577911377 + }, + { + "auxiliary_loss_clip": 0.01107302, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.01871729, + "balance_loss_mlp": 1.03547812, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 2.5662813310714268, + "language_loss": 0.76297283, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78436768, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 11312, + "time_per_iteration": 2.5150935649871826 + }, + { + "auxiliary_loss_clip": 0.01027323, + "auxiliary_loss_mlp": 0.01002804, + "balance_loss_clip": 1.00166547, + "balance_loss_mlp": 1.00670671, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6868398662733849, + "language_loss": 0.57254708, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59284842, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20605469, + "step": 11313, + "time_per_iteration": 3.1505696773529053 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01572418, + "balance_loss_mlp": 1.03516006, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.7918563854588148, + "language_loss": 0.68882596, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71013784, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 11314, + "time_per_iteration": 2.5254998207092285 + }, + { + "auxiliary_loss_clip": 0.01099909, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 1.03417087, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7481645051595534, + "language_loss": 0.81398594, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83526987, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 11315, + "time_per_iteration": 2.453127861022949 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.03766704, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6730986060802988, + "language_loss": 0.69740957, + "learning_rate": 9.792734377526718e-07, + "loss": 0.7187236, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11316, + "time_per_iteration": 2.483550548553467 + }, + { + "auxiliary_loss_clip": 0.01103992, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.0155412, + "balance_loss_mlp": 1.03765678, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.178074033436339, + "language_loss": 0.66859937, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68990576, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11317, + "time_per_iteration": 2.4059898853302 + }, + { + "auxiliary_loss_clip": 0.01106005, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.02807629, + "balance_loss_mlp": 1.0385282, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.4508017405477542, + "language_loss": 0.75009024, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77154613, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11318, + "time_per_iteration": 2.499570608139038 + }, + { + "auxiliary_loss_clip": 0.01097899, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.01697898, + "balance_loss_mlp": 1.03418541, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 2.7236911079158985, + "language_loss": 0.6802513, + "learning_rate": 9.782688488616143e-07, + "loss": 0.7015121, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 11319, + "time_per_iteration": 2.4078075885772705 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.01037234, + "balance_loss_clip": 1.02501535, + "balance_loss_mlp": 1.03571796, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.8193525574873417, + "language_loss": 0.76578677, + "learning_rate": 9.779340633692945e-07, + "loss": 0.7871753, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 11320, + "time_per_iteration": 2.4763078689575195 + }, + { + "auxiliary_loss_clip": 0.011026, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01627135, + "balance_loss_mlp": 1.0357213, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 2.0578108779297732, + "language_loss": 0.74360389, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76491284, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11321, + "time_per_iteration": 2.4495351314544678 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01034068, + "balance_loss_clip": 1.02243876, + "balance_loss_mlp": 1.03807235, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8874116924899373, + "language_loss": 0.72533345, + "learning_rate": 9.772646086678758e-07, + "loss": 0.74673104, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11322, + "time_per_iteration": 2.4374794960021973 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.01677608, + "balance_loss_mlp": 1.03495407, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.6803003695181602, + "language_loss": 0.78470093, + "learning_rate": 9.769299394841638e-07, + "loss": 0.8060168, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 11323, + "time_per_iteration": 2.4333457946777344 + }, + { + "auxiliary_loss_clip": 0.01027457, + "auxiliary_loss_mlp": 0.01001857, + "balance_loss_clip": 1.00065899, + "balance_loss_mlp": 1.00677872, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7788248321760284, + "language_loss": 0.57097274, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59126586, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 11324, + "time_per_iteration": 2.87032413482666 + }, + { + "auxiliary_loss_clip": 0.01104753, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.02192771, + "balance_loss_mlp": 1.03705823, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.797689988899455, + "language_loss": 0.68072367, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70211285, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11325, + "time_per_iteration": 2.4791805744171143 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.01901543, + "balance_loss_mlp": 1.03593659, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 9.902559035776392, + "language_loss": 0.7025001, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72388709, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 11326, + "time_per_iteration": 2.411768913269043 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.01868427, + "balance_loss_mlp": 1.03564632, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.7689960274485943, + "language_loss": 0.72761798, + "learning_rate": 9.75591650825392e-07, + "loss": 0.7489562, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11327, + "time_per_iteration": 2.4436709880828857 + }, + { + "auxiliary_loss_clip": 0.01101261, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.01918912, + "balance_loss_mlp": 1.03561234, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 2.3861554573552533, + "language_loss": 0.77319372, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79451698, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11328, + "time_per_iteration": 2.427549123764038 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01029367, + "balance_loss_clip": 1.01756525, + "balance_loss_mlp": 1.03677118, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 3.828786564380187, + "language_loss": 0.64639735, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66773969, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 11329, + "time_per_iteration": 2.4063379764556885 + }, + { + "auxiliary_loss_clip": 0.01106328, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.01713562, + "balance_loss_mlp": 1.03745294, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 1.9960449149160304, + "language_loss": 0.79504317, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81639957, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 11330, + "time_per_iteration": 2.4729740619659424 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01765263, + "balance_loss_mlp": 1.03767729, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 3.982985267736798, + "language_loss": 0.63851273, + "learning_rate": 9.742539836972665e-07, + "loss": 0.6598652, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 11331, + "time_per_iteration": 2.4589385986328125 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02241278, + "balance_loss_mlp": 1.03761506, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.9198633310725437, + "language_loss": 0.7197634, + "learning_rate": 9.739196641245148e-07, + "loss": 0.7411564, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 11332, + "time_per_iteration": 2.48699951171875 + }, + { + "auxiliary_loss_clip": 0.01105323, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.0197432, + "balance_loss_mlp": 1.03659022, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 1.8849624776188914, + "language_loss": 0.75043106, + "learning_rate": 9.735853834608326e-07, + "loss": 0.77180523, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 11333, + "time_per_iteration": 2.4035282135009766 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.01664138, + "balance_loss_mlp": 1.03870749, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.3964934580500172, + "language_loss": 0.71910471, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74048996, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 11334, + "time_per_iteration": 2.514709234237671 + }, + { + "auxiliary_loss_clip": 0.01102183, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01966393, + "balance_loss_mlp": 1.03584528, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.6647407719870675, + "language_loss": 0.85981625, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88114882, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11335, + "time_per_iteration": 2.566171169281006 + }, + { + "auxiliary_loss_clip": 0.0109703, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01659656, + "balance_loss_mlp": 1.03387475, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 2.956835270100481, + "language_loss": 0.81945407, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84069812, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 11336, + "time_per_iteration": 2.50917911529541 + }, + { + "auxiliary_loss_clip": 0.01100635, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.01822007, + "balance_loss_mlp": 1.03596747, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.8358807203128344, + "language_loss": 0.81945646, + "learning_rate": 9.72248650150294e-07, + "loss": 0.84075427, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 11337, + "time_per_iteration": 2.448796510696411 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.0160563, + "balance_loss_mlp": 1.03479064, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.722806796595651, + "language_loss": 0.72469616, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74596059, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11338, + "time_per_iteration": 2.517240047454834 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.03771722, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4508555916130568, + "language_loss": 0.77669561, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79806578, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11339, + "time_per_iteration": 2.436663866043091 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.02076244, + "balance_loss_mlp": 1.03804171, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.0295293442554483, + "language_loss": 0.70622659, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72760439, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11340, + "time_per_iteration": 2.5092625617980957 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.02090895, + "balance_loss_mlp": 1.03815854, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.203520540229157, + "language_loss": 0.82961929, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85103399, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 11341, + "time_per_iteration": 2.470651626586914 + }, + { + "auxiliary_loss_clip": 0.01106072, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.02358067, + "balance_loss_mlp": 1.03685653, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.764627541247337, + "language_loss": 0.68348753, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70491731, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 11342, + "time_per_iteration": 2.5127713680267334 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.01706433, + "balance_loss_mlp": 1.03569162, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.7075903323008321, + "language_loss": 0.74946058, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77077055, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 11343, + "time_per_iteration": 2.5146141052246094 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.02227187, + "balance_loss_mlp": 1.0377264, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6017732799578648, + "language_loss": 0.79690164, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81828856, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11344, + "time_per_iteration": 3.9397521018981934 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.0200969, + "balance_loss_mlp": 1.03575659, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.3863241768064416, + "language_loss": 0.66377771, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68512809, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11345, + "time_per_iteration": 2.5208473205566406 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01854682, + "balance_loss_mlp": 1.03741777, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.4472974915932637, + "language_loss": 0.64573473, + "learning_rate": 9.692432813238054e-07, + "loss": 0.66710401, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 11346, + "time_per_iteration": 3.8512396812438965 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.03745544, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.5968577060390179, + "language_loss": 0.7844069, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80576706, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11347, + "time_per_iteration": 3.8028361797332764 + }, + { + "auxiliary_loss_clip": 0.01026659, + "auxiliary_loss_mlp": 0.01001661, + "balance_loss_clip": 1.00046301, + "balance_loss_mlp": 1.0059818, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7216727103538496, + "language_loss": 0.5250113, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54529452, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 11348, + "time_per_iteration": 4.506226539611816 + }, + { + "auxiliary_loss_clip": 0.01101236, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02144003, + "balance_loss_mlp": 1.03572845, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.7906697802801645, + "language_loss": 0.79596829, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81730622, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11349, + "time_per_iteration": 2.4514377117156982 + }, + { + "auxiliary_loss_clip": 0.01111621, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01438189, + "balance_loss_mlp": 1.03865266, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 9.523032245657118, + "language_loss": 0.74000543, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76140821, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 11350, + "time_per_iteration": 2.6128787994384766 + }, + { + "auxiliary_loss_clip": 0.01102473, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.01656938, + "balance_loss_mlp": 1.03694868, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.583319505093848, + "language_loss": 0.79434985, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81566036, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 11351, + "time_per_iteration": 2.4813127517700195 + }, + { + "auxiliary_loss_clip": 0.01102481, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.02167511, + "balance_loss_mlp": 1.03581142, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.5951575368956712, + "language_loss": 0.73410577, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75546265, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 11352, + "time_per_iteration": 2.471541166305542 + }, + { + "auxiliary_loss_clip": 0.01105327, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.03617918, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.5725908722190713, + "language_loss": 0.80191058, + "learning_rate": 9.669079606018814e-07, + "loss": 0.8233099, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11353, + "time_per_iteration": 2.5034008026123047 + }, + { + "auxiliary_loss_clip": 0.01103178, + "auxiliary_loss_mlp": 0.01024386, + "balance_loss_clip": 1.01242352, + "balance_loss_mlp": 1.03601313, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.984510532707265, + "language_loss": 0.78228319, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80355877, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11354, + "time_per_iteration": 2.4608607292175293 + }, + { + "auxiliary_loss_clip": 0.01102222, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.01630878, + "balance_loss_mlp": 1.03619695, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.0028339846466445, + "language_loss": 0.61692381, + "learning_rate": 9.662410784947599e-07, + "loss": 0.63822126, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 11355, + "time_per_iteration": 2.40678071975708 + }, + { + "auxiliary_loss_clip": 0.01101274, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.0152607, + "balance_loss_mlp": 1.03438973, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 1.9183183626079316, + "language_loss": 0.81905627, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84033597, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11356, + "time_per_iteration": 2.4604368209838867 + }, + { + "auxiliary_loss_clip": 0.01106625, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.01654649, + "balance_loss_mlp": 1.03872633, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 2.562660672921637, + "language_loss": 0.78667843, + "learning_rate": 9.655743531886052e-07, + "loss": 0.8080312, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11357, + "time_per_iteration": 2.4570956230163574 + }, + { + "auxiliary_loss_clip": 0.01027055, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.00311232, + "balance_loss_mlp": 1.00636482, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8170905749226814, + "language_loss": 0.59669131, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61700559, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.20703125, + "step": 11358, + "time_per_iteration": 3.1206090450286865 + }, + { + "auxiliary_loss_clip": 0.01108785, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.0243305, + "balance_loss_mlp": 1.03812075, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.7308298657289736, + "language_loss": 0.78347307, + "learning_rate": 9.64907784784544e-07, + "loss": 0.804928, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 11359, + "time_per_iteration": 2.4206995964050293 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.01964045, + "balance_loss_mlp": 1.03594446, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9738432775453243, + "language_loss": 0.81637627, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83771473, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 11360, + "time_per_iteration": 2.476433038711548 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.02083063, + "balance_loss_mlp": 1.03856695, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.86444446180785, + "language_loss": 0.75634044, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77774101, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 11361, + "time_per_iteration": 2.4659223556518555 + }, + { + "auxiliary_loss_clip": 0.010268, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00227582, + "balance_loss_mlp": 1.00611186, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8682819030103981, + "language_loss": 0.59711051, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61741436, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.20703125, + "step": 11362, + "time_per_iteration": 3.127232074737549 + }, + { + "auxiliary_loss_clip": 0.01104869, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.0179255, + "balance_loss_mlp": 1.03573108, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.8459010350172913, + "language_loss": 0.74898708, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77034211, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 11363, + "time_per_iteration": 2.4112236499786377 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.02511919, + "balance_loss_mlp": 1.03508842, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.6445368972435976, + "language_loss": 0.89400429, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91540277, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 11364, + "time_per_iteration": 2.4431772232055664 + }, + { + "auxiliary_loss_clip": 0.01101882, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.01950884, + "balance_loss_mlp": 1.03680646, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 3.2328498112003503, + "language_loss": 0.88372034, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90504611, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 11365, + "time_per_iteration": 2.4429502487182617 + }, + { + "auxiliary_loss_clip": 0.01110566, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.02396965, + "balance_loss_mlp": 1.03944576, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.0793788072414734, + "language_loss": 0.81185693, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83332664, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 11366, + "time_per_iteration": 2.472283363342285 + }, + { + "auxiliary_loss_clip": 0.01103514, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.0166235, + "balance_loss_mlp": 1.03517795, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.7001262791469558, + "language_loss": 0.76775587, + "learning_rate": 9.622430822110062e-07, + "loss": 0.789078, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 11367, + "time_per_iteration": 2.4591305255889893 + }, + { + "auxiliary_loss_clip": 0.01105081, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.0234282, + "balance_loss_mlp": 1.03755784, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.4398909959276744, + "language_loss": 0.68965262, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71106088, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11368, + "time_per_iteration": 2.477160692214966 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.01824105, + "balance_loss_mlp": 1.03536785, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.9865162675168815, + "language_loss": 0.73352474, + "learning_rate": 9.615772998335261e-07, + "loss": 0.7548461, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11369, + "time_per_iteration": 2.4527742862701416 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.03507197, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.995405258990165, + "language_loss": 0.78393018, + "learning_rate": 9.612444677041138e-07, + "loss": 0.80523407, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11370, + "time_per_iteration": 2.443544864654541 + }, + { + "auxiliary_loss_clip": 0.01026342, + "auxiliary_loss_mlp": 0.01001936, + "balance_loss_clip": 1.00067234, + "balance_loss_mlp": 1.00567722, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7476131007411569, + "language_loss": 0.59831941, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61860228, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.20703125, + "step": 11371, + "time_per_iteration": 2.9889161586761475 + }, + { + "auxiliary_loss_clip": 0.01099697, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01723075, + "balance_loss_mlp": 1.03550124, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.4552904214885107, + "language_loss": 0.63685644, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65813392, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11372, + "time_per_iteration": 2.424954891204834 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01025041, + "balance_loss_clip": 1.01319766, + "balance_loss_mlp": 1.03525615, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.4781124923613422, + "language_loss": 0.71735704, + "learning_rate": 9.602462077046375e-07, + "loss": 0.73862189, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11373, + "time_per_iteration": 2.474728584289551 + }, + { + "auxiliary_loss_clip": 0.01026667, + "auxiliary_loss_mlp": 0.01000459, + "balance_loss_clip": 0.99917108, + "balance_loss_mlp": 1.00602746, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2229800972978824, + "language_loss": 0.56697685, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58724803, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.20703125, + "step": 11374, + "time_per_iteration": 3.22890567779541 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.01422763, + "balance_loss_mlp": 1.03807116, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.6218199942773524, + "language_loss": 0.73614061, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75747472, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11375, + "time_per_iteration": 2.461625814437866 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01968646, + "balance_loss_mlp": 1.036448, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.6159856732267652, + "language_loss": 0.70548576, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72682095, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11376, + "time_per_iteration": 2.4842541217803955 + }, + { + "auxiliary_loss_clip": 0.01104932, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.03640866, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.0252780909145756, + "language_loss": 0.7449975, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76642299, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11377, + "time_per_iteration": 2.5335726737976074 + }, + { + "auxiliary_loss_clip": 0.01026236, + "auxiliary_loss_mlp": 0.01001308, + "balance_loss_clip": 1.00019324, + "balance_loss_mlp": 1.00557923, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7232069780958926, + "language_loss": 0.56829667, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58857214, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.20703125, + "step": 11378, + "time_per_iteration": 3.137204885482788 + }, + { + "auxiliary_loss_clip": 0.01103234, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.01959991, + "balance_loss_mlp": 1.03537726, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.644108790952547, + "language_loss": 0.78129804, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80265266, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11379, + "time_per_iteration": 2.496009349822998 + }, + { + "auxiliary_loss_clip": 0.01098608, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.01664054, + "balance_loss_mlp": 1.03623796, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 2.007866180272703, + "language_loss": 0.68494868, + "learning_rate": 9.57918314925988e-07, + "loss": 0.70620382, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.625, + "step": 11380, + "time_per_iteration": 2.406384229660034 + }, + { + "auxiliary_loss_clip": 0.01101488, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.01939452, + "balance_loss_mlp": 1.03453815, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.132022624853322, + "language_loss": 0.78171045, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80304098, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 11381, + "time_per_iteration": 2.4570810794830322 + }, + { + "auxiliary_loss_clip": 0.01025143, + "auxiliary_loss_mlp": 0.01003104, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00454473, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8747752326004012, + "language_loss": 0.67185926, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69214177, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20605469, + "step": 11382, + "time_per_iteration": 2.90439510345459 + }, + { + "auxiliary_loss_clip": 0.0102608, + "auxiliary_loss_mlp": 0.01001227, + "balance_loss_clip": 1.00005233, + "balance_loss_mlp": 1.00557017, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8179080284964599, + "language_loss": 0.58123773, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60151082, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.20507812, + "step": 11383, + "time_per_iteration": 3.0904266834259033 + }, + { + "auxiliary_loss_clip": 0.01100892, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.0144496, + "balance_loss_mlp": 1.03393197, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 6.398458171268355, + "language_loss": 0.7963292, + "learning_rate": 9.565889595521517e-07, + "loss": 0.81759197, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 11384, + "time_per_iteration": 2.56005859375 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02203345, + "balance_loss_mlp": 1.03471613, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.1545219517049135, + "language_loss": 0.7672773, + "learning_rate": 9.562567195928187e-07, + "loss": 0.7886613, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 11385, + "time_per_iteration": 2.442094326019287 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.02137756, + "balance_loss_mlp": 1.03792572, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.0113901870570534, + "language_loss": 0.84306657, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86452568, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 11386, + "time_per_iteration": 3.9225666522979736 + }, + { + "auxiliary_loss_clip": 0.01105442, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.02474415, + "balance_loss_mlp": 1.0376749, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.7843660394813035, + "language_loss": 0.83315331, + "learning_rate": 9.555923584232984e-07, + "loss": 0.854568, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 11387, + "time_per_iteration": 2.4256067276000977 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.01747251, + "balance_loss_mlp": 1.03419471, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.6307550098034056, + "language_loss": 0.72258627, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74388194, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11388, + "time_per_iteration": 3.991851806640625 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01023785, + "balance_loss_clip": 1.01292491, + "balance_loss_mlp": 1.03534198, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.8327013595289872, + "language_loss": 0.62769783, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64894605, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 11389, + "time_per_iteration": 4.031615495681763 + }, + { + "auxiliary_loss_clip": 0.0102484, + "auxiliary_loss_mlp": 0.00998817, + "balance_loss_clip": 0.99756575, + "balance_loss_mlp": 1.00428033, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7254408078879129, + "language_loss": 0.56007105, + "learning_rate": 9.54596113730818e-07, + "loss": 0.5803076, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.20605469, + "step": 11390, + "time_per_iteration": 4.692908048629761 + }, + { + "auxiliary_loss_clip": 0.01103708, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.02001452, + "balance_loss_mlp": 1.03709829, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 2.011305237937575, + "language_loss": 0.8772974, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89865273, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 11391, + "time_per_iteration": 2.4319207668304443 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.02271295, + "balance_loss_mlp": 1.03695166, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.6650143278886758, + "language_loss": 0.79346359, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81487215, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 11392, + "time_per_iteration": 2.501056671142578 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01641881, + "balance_loss_mlp": 1.03576994, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.2005866152358977, + "language_loss": 0.70957869, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73086905, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 11393, + "time_per_iteration": 2.404430627822876 + }, + { + "auxiliary_loss_clip": 0.0110549, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.01724112, + "balance_loss_mlp": 1.03636444, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.6151771222215205, + "language_loss": 0.64394313, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66529727, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 11394, + "time_per_iteration": 2.4956462383270264 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.02175093, + "balance_loss_mlp": 1.03593922, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 2.3582380826263303, + "language_loss": 0.80521697, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82661504, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11395, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01105245, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01755834, + "balance_loss_mlp": 1.03777242, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.764971527643648, + "language_loss": 0.73285419, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75421178, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 11396, + "time_per_iteration": 2.568514823913574 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01480818, + "balance_loss_mlp": 1.03660202, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 5.148870058421947, + "language_loss": 0.79048425, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81182146, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11397, + "time_per_iteration": 2.4331774711608887 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01026103, + "balance_loss_clip": 1.01383626, + "balance_loss_mlp": 1.03412771, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 2.4689910585067616, + "language_loss": 0.71553206, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73682612, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 11398, + "time_per_iteration": 2.5442934036254883 + }, + { + "auxiliary_loss_clip": 0.0110016, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.02194667, + "balance_loss_mlp": 1.03415036, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.6631285015848603, + "language_loss": 0.70751739, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72885031, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 11399, + "time_per_iteration": 2.4914610385894775 + }, + { + "auxiliary_loss_clip": 0.01104852, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01515758, + "balance_loss_mlp": 1.03707409, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5329347602462005, + "language_loss": 0.7047379, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72605371, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 11400, + "time_per_iteration": 2.5048537254333496 + }, + { + "auxiliary_loss_clip": 0.01113165, + "auxiliary_loss_mlp": 0.01035214, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.0384146, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.826720269595169, + "language_loss": 0.78065717, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80214089, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 11401, + "time_per_iteration": 2.441246747970581 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01028091, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.03616953, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 2.0377910237961925, + "language_loss": 0.75284612, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77414942, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 11402, + "time_per_iteration": 2.4716646671295166 + }, + { + "auxiliary_loss_clip": 0.01105094, + "auxiliary_loss_mlp": 0.01037038, + "balance_loss_clip": 1.0245446, + "balance_loss_mlp": 1.03575242, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.633024747176301, + "language_loss": 0.7278834, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74930477, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11403, + "time_per_iteration": 2.4483251571655273 + }, + { + "auxiliary_loss_clip": 0.01101831, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.01826835, + "balance_loss_mlp": 1.03608656, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.2661790169676284, + "language_loss": 0.81050408, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83181787, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 11404, + "time_per_iteration": 2.4682669639587402 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.02251863, + "balance_loss_mlp": 1.03651369, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3732837819876964, + "language_loss": 0.77531087, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79668367, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11405, + "time_per_iteration": 2.4672837257385254 + }, + { + "auxiliary_loss_clip": 0.01024197, + "auxiliary_loss_mlp": 0.01001171, + "balance_loss_clip": 1.00008011, + "balance_loss_mlp": 1.00375617, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7944486320456374, + "language_loss": 0.60998279, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63023651, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11406, + "time_per_iteration": 3.146902084350586 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 1.03420663, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.8632160242742672, + "language_loss": 0.76916838, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79052973, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 11407, + "time_per_iteration": 2.4350507259368896 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02232862, + "balance_loss_mlp": 1.03708422, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.5660412243788153, + "language_loss": 0.71399796, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73542058, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 11408, + "time_per_iteration": 2.5331506729125977 + }, + { + "auxiliary_loss_clip": 0.01104047, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.01318479, + "balance_loss_mlp": 1.03469181, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.9585659451981918, + "language_loss": 0.69841951, + "learning_rate": 9.482948631780087e-07, + "loss": 0.7197156, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 11409, + "time_per_iteration": 2.406949520111084 + }, + { + "auxiliary_loss_clip": 0.01098382, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01965857, + "balance_loss_mlp": 1.03563976, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.5737480053745323, + "language_loss": 0.78358257, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80487025, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 11410, + "time_per_iteration": 2.5127828121185303 + }, + { + "auxiliary_loss_clip": 0.0110556, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.02162552, + "balance_loss_mlp": 1.03487253, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 2.0456589939951852, + "language_loss": 0.71620971, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73761249, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 11411, + "time_per_iteration": 2.456273317337036 + }, + { + "auxiliary_loss_clip": 0.01105032, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.01804924, + "balance_loss_mlp": 1.03671002, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 1.870472752363858, + "language_loss": 0.696311, + "learning_rate": 9.473012427332654e-07, + "loss": 0.7176733, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.68359375, + "step": 11412, + "time_per_iteration": 2.4815471172332764 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.01843774, + "balance_loss_mlp": 1.03616846, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 2.8759639216310364, + "language_loss": 0.72033083, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74167705, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 11413, + "time_per_iteration": 2.3763904571533203 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.01989388, + "balance_loss_mlp": 1.03653979, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.7019599587889749, + "language_loss": 0.73731822, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75868088, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11414, + "time_per_iteration": 2.4849958419799805 + }, + { + "auxiliary_loss_clip": 0.0110805, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.01831794, + "balance_loss_mlp": 1.03832841, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.1354792795106396, + "language_loss": 0.86471385, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88610065, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 11415, + "time_per_iteration": 2.419379711151123 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.02393508, + "balance_loss_mlp": 1.03672004, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.5997351133047528, + "language_loss": 0.67188251, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69331551, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 11416, + "time_per_iteration": 2.544360876083374 + }, + { + "auxiliary_loss_clip": 0.01101411, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02020359, + "balance_loss_mlp": 1.03366458, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.3938350999013296, + "language_loss": 0.75928599, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78062129, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 11417, + "time_per_iteration": 2.420132637023926 + }, + { + "auxiliary_loss_clip": 0.01103442, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03592944, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.7730588079343717, + "language_loss": 0.77459234, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79593164, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11418, + "time_per_iteration": 2.4872171878814697 + }, + { + "auxiliary_loss_clip": 0.01103813, + "auxiliary_loss_mlp": 0.01026249, + "balance_loss_clip": 1.01471543, + "balance_loss_mlp": 1.03681958, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 2.2244412162236924, + "language_loss": 0.76546735, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78676796, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11419, + "time_per_iteration": 2.5004422664642334 + }, + { + "auxiliary_loss_clip": 0.01101876, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.02173305, + "balance_loss_mlp": 1.03602588, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.9820381057917913, + "language_loss": 0.71707082, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73841834, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11420, + "time_per_iteration": 2.480562925338745 + }, + { + "auxiliary_loss_clip": 0.01101218, + "auxiliary_loss_mlp": 0.01023861, + "balance_loss_clip": 1.01320374, + "balance_loss_mlp": 1.03455591, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.3356950077180587, + "language_loss": 0.7420696, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76332039, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 11421, + "time_per_iteration": 2.532064199447632 + }, + { + "auxiliary_loss_clip": 0.01100357, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.01827979, + "balance_loss_mlp": 1.03495026, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.7634473864986122, + "language_loss": 0.77061129, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79191291, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11422, + "time_per_iteration": 2.494222402572632 + }, + { + "auxiliary_loss_clip": 0.01105572, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.02336359, + "balance_loss_mlp": 1.03689635, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 2.896334528061073, + "language_loss": 0.77752495, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79894149, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11423, + "time_per_iteration": 2.4580142498016357 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01655424, + "balance_loss_mlp": 1.03794348, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.4732024211582577, + "language_loss": 0.72956997, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75091726, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 11424, + "time_per_iteration": 2.5055267810821533 + }, + { + "auxiliary_loss_clip": 0.01105305, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.01724076, + "balance_loss_mlp": 1.03695333, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.7308743196557235, + "language_loss": 0.65175045, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67309034, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 11425, + "time_per_iteration": 2.470486640930176 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01023971, + "balance_loss_clip": 1.01246786, + "balance_loss_mlp": 1.03693807, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.3720059089078416, + "language_loss": 0.71447921, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73574442, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11426, + "time_per_iteration": 2.5032618045806885 + }, + { + "auxiliary_loss_clip": 0.01102828, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.01463187, + "balance_loss_mlp": 1.03570724, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.2242612174106737, + "language_loss": 0.85695207, + "learning_rate": 9.423385362769136e-07, + "loss": 0.8782419, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11427, + "time_per_iteration": 2.4124362468719482 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.01630831, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.5166850198696897, + "language_loss": 0.75723726, + "learning_rate": 9.420080095646909e-07, + "loss": 0.77854395, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11428, + "time_per_iteration": 3.971212387084961 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.02289069, + "balance_loss_mlp": 1.03649604, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 2.165798768763756, + "language_loss": 0.73242265, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75384891, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 11429, + "time_per_iteration": 2.4732346534729004 + }, + { + "auxiliary_loss_clip": 0.01107151, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01595795, + "balance_loss_mlp": 1.03874505, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 2.494094152353352, + "language_loss": 0.83109355, + "learning_rate": 9.413470765102643e-07, + "loss": 0.8524434, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 11430, + "time_per_iteration": 3.9374120235443115 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.02065516, + "balance_loss_mlp": 1.03498435, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.0537474499977746, + "language_loss": 0.700809, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72215664, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11431, + "time_per_iteration": 5.39936375617981 + }, + { + "auxiliary_loss_clip": 0.0110521, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.02002013, + "balance_loss_mlp": 1.03624368, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.9154257852528767, + "language_loss": 0.79996437, + "learning_rate": 9.406863040327355e-07, + "loss": 0.82133788, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11432, + "time_per_iteration": 2.5091586112976074 + }, + { + "auxiliary_loss_clip": 0.0110135, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.01545095, + "balance_loss_mlp": 1.03639221, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.5073442194689934, + "language_loss": 0.67916226, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70044488, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 11433, + "time_per_iteration": 2.4911651611328125 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02483046, + "balance_loss_mlp": 1.03957868, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.9834703858650884, + "language_loss": 0.72955799, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75100172, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11434, + "time_per_iteration": 2.601761817932129 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01488328, + "balance_loss_mlp": 1.03820884, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.6345537065528275, + "language_loss": 0.80520904, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82652032, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11435, + "time_per_iteration": 2.4691109657287598 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01895833, + "balance_loss_mlp": 1.03661776, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.919181748670558, + "language_loss": 0.8081519, + "learning_rate": 9.393652412092538e-07, + "loss": 0.82952142, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11436, + "time_per_iteration": 2.4831182956695557 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.01780939, + "balance_loss_mlp": 1.03531957, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 2.0171807255350056, + "language_loss": 0.82209235, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84335649, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 11437, + "time_per_iteration": 2.476003646850586 + }, + { + "auxiliary_loss_clip": 0.01111133, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03871989, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 2.5574373753550894, + "language_loss": 0.77940321, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80086446, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 11438, + "time_per_iteration": 2.502321720123291 + }, + { + "auxiliary_loss_clip": 0.01098247, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.02097225, + "balance_loss_mlp": 1.03480375, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.5853093369472568, + "language_loss": 0.72395837, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74526674, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 11439, + "time_per_iteration": 2.4871983528137207 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.0148648, + "balance_loss_mlp": 1.0368948, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.9510407430553642, + "language_loss": 0.75392562, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77523124, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 11440, + "time_per_iteration": 2.444061040878296 + }, + { + "auxiliary_loss_clip": 0.01100078, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.02096558, + "balance_loss_mlp": 1.03482723, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.5583446762430218, + "language_loss": 0.71741056, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73873532, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 11441, + "time_per_iteration": 2.42561936378479 + }, + { + "auxiliary_loss_clip": 0.01107766, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.01703668, + "balance_loss_mlp": 1.03738022, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6223718684669892, + "language_loss": 0.66661596, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68800044, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 11442, + "time_per_iteration": 2.527100086212158 + }, + { + "auxiliary_loss_clip": 0.01104807, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.01938581, + "balance_loss_mlp": 1.03825164, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 1.9334719769408109, + "language_loss": 0.69233751, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71369326, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 11443, + "time_per_iteration": 2.4346165657043457 + }, + { + "auxiliary_loss_clip": 0.01109303, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.02084899, + "balance_loss_mlp": 1.04012263, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.4614285926013768, + "language_loss": 0.76507717, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78650534, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 11444, + "time_per_iteration": 2.508368968963623 + }, + { + "auxiliary_loss_clip": 0.01101207, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03577399, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 1.8080804951596867, + "language_loss": 0.76652426, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78781474, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11445, + "time_per_iteration": 2.4379546642303467 + }, + { + "auxiliary_loss_clip": 0.01025524, + "auxiliary_loss_mlp": 0.01005058, + "balance_loss_clip": 1.00386608, + "balance_loss_mlp": 1.00513721, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8196174893111461, + "language_loss": 0.58379793, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60410374, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.20410156, + "step": 11446, + "time_per_iteration": 3.09559965133667 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.01536548, + "balance_loss_mlp": 1.03552115, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.5108741045715646, + "language_loss": 0.75743663, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77877045, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 11447, + "time_per_iteration": 2.4388415813446045 + }, + { + "auxiliary_loss_clip": 0.01105525, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02256274, + "balance_loss_mlp": 1.035833, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.3058905142845, + "language_loss": 0.73110414, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75250638, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 11448, + "time_per_iteration": 2.490492820739746 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01841772, + "balance_loss_mlp": 1.03607249, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.6148138238236993, + "language_loss": 0.74589622, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76727676, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11449, + "time_per_iteration": 2.5052759647369385 + }, + { + "auxiliary_loss_clip": 0.01102717, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02249944, + "balance_loss_mlp": 1.03643203, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.8257091472853513, + "language_loss": 0.69832647, + "learning_rate": 9.34746594224679e-07, + "loss": 0.71969366, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11450, + "time_per_iteration": 2.4648208618164062 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02027011, + "balance_loss_mlp": 1.03613949, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 2.0456347390181366, + "language_loss": 0.76224291, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78366196, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 11451, + "time_per_iteration": 2.430615186691284 + }, + { + "auxiliary_loss_clip": 0.01106472, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.01746345, + "balance_loss_mlp": 1.03748226, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.5920883527953233, + "language_loss": 0.69262952, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71398771, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 11452, + "time_per_iteration": 2.5010976791381836 + }, + { + "auxiliary_loss_clip": 0.01103078, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.0362519, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.7710041973258575, + "language_loss": 0.72149074, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74287325, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.66796875, + "step": 11453, + "time_per_iteration": 2.450064182281494 + }, + { + "auxiliary_loss_clip": 0.01025423, + "auxiliary_loss_mlp": 0.00997723, + "balance_loss_clip": 0.99648923, + "balance_loss_mlp": 1.0050149, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7858760559038386, + "language_loss": 0.50753725, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52776867, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20410156, + "step": 11454, + "time_per_iteration": 2.9117000102996826 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.01732993, + "balance_loss_mlp": 1.03662014, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.7206646308115936, + "language_loss": 0.75241423, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77371156, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 11455, + "time_per_iteration": 2.485668897628784 + }, + { + "auxiliary_loss_clip": 0.01106397, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.02456009, + "balance_loss_mlp": 1.03585863, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.149117194105129, + "language_loss": 0.72609061, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74753392, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 11456, + "time_per_iteration": 2.393894672393799 + }, + { + "auxiliary_loss_clip": 0.01102522, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.01892924, + "balance_loss_mlp": 1.03732562, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.623007735916198, + "language_loss": 0.80938387, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83071315, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 11457, + "time_per_iteration": 2.495333194732666 + }, + { + "auxiliary_loss_clip": 0.01108692, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01552272, + "balance_loss_mlp": 1.03859973, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5000729460202227, + "language_loss": 0.76153016, + "learning_rate": 9.321109198922301e-07, + "loss": 0.7828989, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11458, + "time_per_iteration": 2.4778497219085693 + }, + { + "auxiliary_loss_clip": 0.01104516, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.01787341, + "balance_loss_mlp": 1.03653932, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.658523232455535, + "language_loss": 0.68647993, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70782083, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11459, + "time_per_iteration": 2.418846845626831 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.01640558, + "balance_loss_mlp": 1.03759336, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.5707154761187223, + "language_loss": 0.68636, + "learning_rate": 9.314524060039221e-07, + "loss": 0.7076816, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.671875, + "step": 11460, + "time_per_iteration": 2.5109915733337402 + }, + { + "auxiliary_loss_clip": 0.01108621, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 1.03564703, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.7818403559528928, + "language_loss": 0.76981837, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79124033, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 11461, + "time_per_iteration": 2.467684268951416 + }, + { + "auxiliary_loss_clip": 0.01103615, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.02204204, + "balance_loss_mlp": 1.03618026, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.6409609736690487, + "language_loss": 0.6973418, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71872014, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11462, + "time_per_iteration": 2.411785125732422 + }, + { + "auxiliary_loss_clip": 0.01106527, + "auxiliary_loss_mlp": 0.01025599, + "balance_loss_clip": 1.0134095, + "balance_loss_mlp": 1.03735316, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4661487687088357, + "language_loss": 0.87139171, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89271295, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11463, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.010984, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.03459322, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.8830832637485666, + "language_loss": 0.68394661, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70520842, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 11464, + "time_per_iteration": 2.4330556392669678 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.0248543, + "balance_loss_mlp": 1.0371387, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.8974270807015088, + "language_loss": 0.65594816, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67736936, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 11465, + "time_per_iteration": 2.495144844055176 + }, + { + "auxiliary_loss_clip": 0.01106695, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.02248406, + "balance_loss_mlp": 1.03674364, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.5240764354372476, + "language_loss": 0.72628653, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74769986, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 11466, + "time_per_iteration": 2.4766881465911865 + }, + { + "auxiliary_loss_clip": 0.0110566, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01822066, + "balance_loss_mlp": 1.03696775, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.714596281960388, + "language_loss": 0.71770334, + "learning_rate": 9.291488844121995e-07, + "loss": 0.73905998, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11467, + "time_per_iteration": 2.4112367630004883 + }, + { + "auxiliary_loss_clip": 0.0110697, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02008295, + "balance_loss_mlp": 1.0355289, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 2.163503550286246, + "language_loss": 0.81232512, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83372813, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 11468, + "time_per_iteration": 2.466501474380493 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02028728, + "balance_loss_mlp": 1.03816724, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.7452296141639345, + "language_loss": 0.65893084, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68033552, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11469, + "time_per_iteration": 3.9587156772613525 + }, + { + "auxiliary_loss_clip": 0.01024995, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.00072718, + "balance_loss_mlp": 1.0044626, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.79151835418889, + "language_loss": 0.55171818, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57198697, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20507812, + "step": 11470, + "time_per_iteration": 2.9345321655273438 + }, + { + "auxiliary_loss_clip": 0.01101343, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.0206759, + "balance_loss_mlp": 1.03692126, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 2.531274937243883, + "language_loss": 0.77590048, + "learning_rate": 9.278334794344715e-07, + "loss": 0.79722488, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 11471, + "time_per_iteration": 3.9249086380004883 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.02015519, + "balance_loss_mlp": 1.03743219, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.771316633109537, + "language_loss": 0.78440964, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80578208, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 11472, + "time_per_iteration": 3.877894401550293 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.01826715, + "balance_loss_mlp": 1.03419447, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5889671799486909, + "language_loss": 0.76273483, + "learning_rate": 9.271760208357024e-07, + "loss": 0.7840333, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11473, + "time_per_iteration": 3.895129680633545 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.03657973, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 3.23937327376226, + "language_loss": 0.75285846, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77425253, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 11474, + "time_per_iteration": 2.4117770195007324 + }, + { + "auxiliary_loss_clip": 0.0110508, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.02095246, + "balance_loss_mlp": 1.03775465, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.4990231187907213, + "language_loss": 0.74082041, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76220077, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11475, + "time_per_iteration": 2.5168709754943848 + }, + { + "auxiliary_loss_clip": 0.01105263, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.01619959, + "balance_loss_mlp": 1.03732133, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.0309056655134587, + "language_loss": 0.88638115, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90771919, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 11476, + "time_per_iteration": 2.4443247318267822 + }, + { + "auxiliary_loss_clip": 0.01102042, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.02020097, + "balance_loss_mlp": 1.03498316, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.3153464082970854, + "language_loss": 0.70150822, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72284913, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11477, + "time_per_iteration": 2.5622828006744385 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.02615404, + "balance_loss_mlp": 1.03800416, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.5064065757946925, + "language_loss": 0.68533587, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70681655, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 11478, + "time_per_iteration": 2.46543288230896 + }, + { + "auxiliary_loss_clip": 0.01105606, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.02038169, + "balance_loss_mlp": 1.03681922, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.887360413937171, + "language_loss": 0.7609849, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78236568, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11479, + "time_per_iteration": 2.568270683288574 + }, + { + "auxiliary_loss_clip": 0.01107631, + "auxiliary_loss_mlp": 0.01027498, + "balance_loss_clip": 1.01501036, + "balance_loss_mlp": 1.03848529, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.594697323523918, + "language_loss": 0.78643298, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80778426, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 11480, + "time_per_iteration": 2.4369962215423584 + }, + { + "auxiliary_loss_clip": 0.01104582, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01478815, + "balance_loss_mlp": 1.03820038, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.618219832411148, + "language_loss": 0.75485682, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77617109, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 11481, + "time_per_iteration": 2.5970773696899414 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01031414, + "balance_loss_clip": 1.01932073, + "balance_loss_mlp": 1.03630292, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.6722041595175992, + "language_loss": 0.6924783, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71382856, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11482, + "time_per_iteration": 2.4690396785736084 + }, + { + "auxiliary_loss_clip": 0.01104337, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.01601243, + "balance_loss_mlp": 1.0365622, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.9391931338657746, + "language_loss": 0.82797402, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84929538, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 11483, + "time_per_iteration": 2.479827880859375 + }, + { + "auxiliary_loss_clip": 0.01105727, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03651834, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.252246315768351, + "language_loss": 0.65228778, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67362666, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 11484, + "time_per_iteration": 2.4820756912231445 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.0234673, + "balance_loss_mlp": 1.03430891, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.5009595972061287, + "language_loss": 0.73750043, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75885451, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11485, + "time_per_iteration": 2.5609304904937744 + }, + { + "auxiliary_loss_clip": 0.0110609, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.02159464, + "balance_loss_mlp": 1.03691673, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.703754025392432, + "language_loss": 0.85226732, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87366807, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11486, + "time_per_iteration": 2.422380208969116 + }, + { + "auxiliary_loss_clip": 0.01105383, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.02200925, + "balance_loss_mlp": 1.03721333, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.4802712098189896, + "language_loss": 0.72739094, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74878728, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 11487, + "time_per_iteration": 2.4903013706207275 + }, + { + "auxiliary_loss_clip": 0.01025937, + "auxiliary_loss_mlp": 0.01011443, + "balance_loss_clip": 1.01028049, + "balance_loss_mlp": 1.00546408, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8945179331036194, + "language_loss": 0.66639161, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68676543, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20507812, + "step": 11488, + "time_per_iteration": 3.0653343200683594 + }, + { + "auxiliary_loss_clip": 0.01107886, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03628397, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.7595875611490563, + "language_loss": 0.7471655, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76857275, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 11489, + "time_per_iteration": 2.5286636352539062 + }, + { + "auxiliary_loss_clip": 0.01107539, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02164459, + "balance_loss_mlp": 1.03755021, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.9530912954904702, + "language_loss": 0.62219006, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64361048, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 11490, + "time_per_iteration": 2.420513868331909 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.01746964, + "balance_loss_mlp": 1.03648567, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.749287596246761, + "language_loss": 0.72922885, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75057352, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 11491, + "time_per_iteration": 2.481513738632202 + }, + { + "auxiliary_loss_clip": 0.01101839, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02441788, + "balance_loss_mlp": 1.03524041, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.8381710188845477, + "language_loss": 0.7008509, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72224045, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 11492, + "time_per_iteration": 2.506946325302124 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02075076, + "balance_loss_mlp": 1.03607428, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.9892003988580658, + "language_loss": 0.74623132, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76763535, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 11493, + "time_per_iteration": 2.485933780670166 + }, + { + "auxiliary_loss_clip": 0.011046, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.01704955, + "balance_loss_mlp": 1.03709757, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.750158272708012, + "language_loss": 0.74326122, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76459777, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 11494, + "time_per_iteration": 2.4338111877441406 + }, + { + "auxiliary_loss_clip": 0.01102928, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.01822948, + "balance_loss_mlp": 1.03593969, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.7715754861715476, + "language_loss": 0.68369365, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70502561, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11495, + "time_per_iteration": 2.5815460681915283 + }, + { + "auxiliary_loss_clip": 0.01103437, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.01754475, + "balance_loss_mlp": 1.03441787, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.7834368050418072, + "language_loss": 0.73899794, + "learning_rate": 9.196269679734119e-07, + "loss": 0.7603299, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11496, + "time_per_iteration": 2.4315319061279297 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03553581, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6258579372444952, + "language_loss": 0.79742873, + "learning_rate": 9.19299238803515e-07, + "loss": 0.81874031, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 11497, + "time_per_iteration": 2.4571430683135986 + }, + { + "auxiliary_loss_clip": 0.01107463, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.0240463, + "balance_loss_mlp": 1.03682327, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5194582434001807, + "language_loss": 0.80841976, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82985806, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 11498, + "time_per_iteration": 2.4500298500061035 + }, + { + "auxiliary_loss_clip": 0.01100372, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.0208571, + "balance_loss_mlp": 1.03421736, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.486917569830455, + "language_loss": 0.86061001, + "learning_rate": 9.186439034169915e-07, + "loss": 0.8819443, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 11499, + "time_per_iteration": 2.5612852573394775 + }, + { + "auxiliary_loss_clip": 0.01101921, + "auxiliary_loss_mlp": 0.01027697, + "balance_loss_clip": 1.01606178, + "balance_loss_mlp": 1.03633177, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.5487466201601385, + "language_loss": 0.75228941, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77358556, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11500, + "time_per_iteration": 2.443873405456543 + }, + { + "auxiliary_loss_clip": 0.01106604, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.0196985, + "balance_loss_mlp": 1.03778219, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.800321839469313, + "language_loss": 0.76985884, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79124504, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11501, + "time_per_iteration": 2.5296645164489746 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.0208025, + "balance_loss_mlp": 1.03735363, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.9287376377715924, + "language_loss": 0.73522556, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75662971, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 11502, + "time_per_iteration": 2.476379632949829 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.02137995, + "balance_loss_mlp": 1.03706694, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 1.914547972677127, + "language_loss": 0.73439324, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75582325, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 11503, + "time_per_iteration": 2.477112293243408 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.01900172, + "balance_loss_mlp": 1.03558373, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 2.2572840313297067, + "language_loss": 0.77144331, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79278374, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11504, + "time_per_iteration": 2.434324026107788 + }, + { + "auxiliary_loss_clip": 0.01103184, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.0178746, + "balance_loss_mlp": 1.03509164, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.6879501017402825, + "language_loss": 0.73243099, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75375593, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 11505, + "time_per_iteration": 2.4869065284729004 + }, + { + "auxiliary_loss_clip": 0.01102379, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.02147281, + "balance_loss_mlp": 1.03443623, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.792057287093782, + "language_loss": 0.87782943, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89919269, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11506, + "time_per_iteration": 2.4522695541381836 + }, + { + "auxiliary_loss_clip": 0.01102604, + "auxiliary_loss_mlp": 0.01026179, + "balance_loss_clip": 1.01465774, + "balance_loss_mlp": 1.03585625, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.803306813867866, + "language_loss": 0.69775116, + "learning_rate": 9.160242030697856e-07, + "loss": 0.71903902, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11507, + "time_per_iteration": 2.5447754859924316 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02132535, + "balance_loss_mlp": 1.03596449, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 2.005563924492128, + "language_loss": 0.76869601, + "learning_rate": 9.156969253661538e-07, + "loss": 0.7900908, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 11508, + "time_per_iteration": 2.4350826740264893 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01029985, + "balance_loss_clip": 1.01885688, + "balance_loss_mlp": 1.03575826, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 3.1306614754136217, + "language_loss": 0.75215411, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77345216, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 11509, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01104564, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.02108383, + "balance_loss_mlp": 1.03770804, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.4724116040863566, + "language_loss": 0.64134341, + "learning_rate": 9.150424933219425e-07, + "loss": 0.6627177, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11510, + "time_per_iteration": 2.45000958442688 + }, + { + "auxiliary_loss_clip": 0.01109479, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.01910424, + "balance_loss_mlp": 1.03804469, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 4.327241358216876, + "language_loss": 0.75543642, + "learning_rate": 9.147153390061788e-07, + "loss": 0.7768575, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 11511, + "time_per_iteration": 3.932948350906372 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02240431, + "balance_loss_mlp": 1.03698862, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 2.3102277566791614, + "language_loss": 0.62639916, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64777517, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 11512, + "time_per_iteration": 2.50154185295105 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01848567, + "balance_loss_mlp": 1.03483152, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.6663402692023492, + "language_loss": 0.8328855, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85422838, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 11513, + "time_per_iteration": 3.906061887741089 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01814294, + "balance_loss_mlp": 1.03563786, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4134932862806329, + "language_loss": 0.77965999, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80097437, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11514, + "time_per_iteration": 3.914891481399536 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.02256036, + "balance_loss_mlp": 1.03478587, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.8450575688706539, + "language_loss": 0.74720532, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76859605, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 11515, + "time_per_iteration": 3.975337505340576 + }, + { + "auxiliary_loss_clip": 0.01101876, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.02331054, + "balance_loss_mlp": 1.03631759, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.249358111886257, + "language_loss": 0.53926551, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56063116, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11516, + "time_per_iteration": 2.4912428855895996 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.01989245, + "balance_loss_mlp": 1.03666639, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.6422617041097631, + "language_loss": 0.72871542, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75004637, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 11517, + "time_per_iteration": 2.478013277053833 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.02440846, + "balance_loss_mlp": 1.03657305, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.7574015499880917, + "language_loss": 0.76101017, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78243387, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 11518, + "time_per_iteration": 2.4453186988830566 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02049708, + "balance_loss_mlp": 1.03722334, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.3039874531903892, + "language_loss": 0.64442092, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66583401, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 11519, + "time_per_iteration": 2.6372623443603516 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.02304852, + "balance_loss_mlp": 1.03542209, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.9115708642987976, + "language_loss": 0.6239593, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64534283, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11520, + "time_per_iteration": 2.4893410205841064 + }, + { + "auxiliary_loss_clip": 0.01111126, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.02313781, + "balance_loss_mlp": 1.03751791, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.8081030789169619, + "language_loss": 0.77767199, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79915196, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 11521, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01107789, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.01773214, + "balance_loss_mlp": 1.03593922, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.8501694912434254, + "language_loss": 0.81979275, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84117287, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 11522, + "time_per_iteration": 2.423020124435425 + }, + { + "auxiliary_loss_clip": 0.01105276, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.01828539, + "balance_loss_mlp": 1.03786206, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.3469897873257555, + "language_loss": 0.76728314, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78863752, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11523, + "time_per_iteration": 2.549304246902466 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.02063847, + "balance_loss_mlp": 1.03536248, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 2.1482280608330355, + "language_loss": 0.68315476, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70449388, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11524, + "time_per_iteration": 2.443089723587036 + }, + { + "auxiliary_loss_clip": 0.0110548, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.02197385, + "balance_loss_mlp": 1.03614259, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.8981913764440181, + "language_loss": 0.64524782, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66664684, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 11525, + "time_per_iteration": 2.504351854324341 + }, + { + "auxiliary_loss_clip": 0.01106067, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.02695775, + "balance_loss_mlp": 1.0376687, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 1.9735788084293737, + "language_loss": 0.70338595, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72483742, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11526, + "time_per_iteration": 2.4542391300201416 + }, + { + "auxiliary_loss_clip": 0.01100987, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01756251, + "balance_loss_mlp": 1.03445363, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.4787934463099037, + "language_loss": 0.76685685, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78815675, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11527, + "time_per_iteration": 2.510793685913086 + }, + { + "auxiliary_loss_clip": 0.0110112, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01780045, + "balance_loss_mlp": 1.03496742, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.562329187830164, + "language_loss": 0.79614961, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81745368, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 11528, + "time_per_iteration": 2.465226173400879 + }, + { + "auxiliary_loss_clip": 0.01099854, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.01909447, + "balance_loss_mlp": 1.03573501, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.4331100909898178, + "language_loss": 0.76051259, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78181458, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11529, + "time_per_iteration": 2.5549967288970947 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.02312326, + "balance_loss_mlp": 1.0351932, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.7201137592726918, + "language_loss": 0.72201979, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74337578, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 11530, + "time_per_iteration": 2.531743049621582 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0381999, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.645987038290128, + "language_loss": 0.7850855, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80656147, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 11531, + "time_per_iteration": 2.500711679458618 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.01911616, + "balance_loss_mlp": 1.0353266, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.5275750432937483, + "language_loss": 0.69725084, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71856636, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 11532, + "time_per_iteration": 2.527376174926758 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.01601446, + "balance_loss_mlp": 1.0371834, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.8600077248097753, + "language_loss": 0.6705901, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69193786, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11533, + "time_per_iteration": 2.518920421600342 + }, + { + "auxiliary_loss_clip": 0.01105686, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.01975226, + "balance_loss_mlp": 1.03683567, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 3.0139531095823893, + "language_loss": 0.58712631, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60850418, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11534, + "time_per_iteration": 2.4710326194763184 + }, + { + "auxiliary_loss_clip": 0.01101215, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.01689124, + "balance_loss_mlp": 1.03428173, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.05674594042133, + "language_loss": 0.71339464, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73469722, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 11535, + "time_per_iteration": 2.4800782203674316 + }, + { + "auxiliary_loss_clip": 0.01024678, + "auxiliary_loss_mlp": 0.01001067, + "balance_loss_clip": 0.99988097, + "balance_loss_mlp": 1.00423908, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7160519901112068, + "language_loss": 0.59069979, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61095721, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.20507812, + "step": 11536, + "time_per_iteration": 3.175150156021118 + }, + { + "auxiliary_loss_clip": 0.01107914, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.01796234, + "balance_loss_mlp": 1.03721535, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.8374101085934587, + "language_loss": 0.72543836, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74682426, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11537, + "time_per_iteration": 2.4590697288513184 + }, + { + "auxiliary_loss_clip": 0.01024524, + "auxiliary_loss_mlp": 0.00999962, + "balance_loss_clip": 0.99879992, + "balance_loss_mlp": 1.0041244, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7454400182413451, + "language_loss": 0.55605686, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57630169, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20410156, + "step": 11538, + "time_per_iteration": 3.05582332611084 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.02029145, + "balance_loss_mlp": 1.03483129, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.5310037982769402, + "language_loss": 0.77299392, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79428679, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 11539, + "time_per_iteration": 2.478339433670044 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.01524878, + "balance_loss_mlp": 1.03527951, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.7857614206632793, + "language_loss": 0.64559513, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66688484, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11540, + "time_per_iteration": 2.5308845043182373 + }, + { + "auxiliary_loss_clip": 0.01102212, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.01658714, + "balance_loss_mlp": 1.03576088, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.5373758397394544, + "language_loss": 0.8667385, + "learning_rate": 9.049199018987437e-07, + "loss": 0.88804066, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11541, + "time_per_iteration": 2.5364692211151123 + }, + { + "auxiliary_loss_clip": 0.01103258, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.02015162, + "balance_loss_mlp": 1.03593302, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.7924323447912938, + "language_loss": 0.84049714, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86184859, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11542, + "time_per_iteration": 2.4829962253570557 + }, + { + "auxiliary_loss_clip": 0.01104055, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.01595759, + "balance_loss_mlp": 1.0352869, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.8414334065280868, + "language_loss": 0.75269711, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77403086, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 11543, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01102342, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.01960182, + "balance_loss_mlp": 1.03582442, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.6661945850864863, + "language_loss": 0.76122248, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78255928, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11544, + "time_per_iteration": 2.461024761199951 + }, + { + "auxiliary_loss_clip": 0.01105964, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.03684866, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.7008976535157667, + "language_loss": 0.71218264, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73358029, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 11545, + "time_per_iteration": 2.4178249835968018 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01646137, + "balance_loss_mlp": 1.03581667, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.933857108829042, + "language_loss": 0.79382741, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81509542, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11546, + "time_per_iteration": 2.467369794845581 + }, + { + "auxiliary_loss_clip": 0.01105153, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01755857, + "balance_loss_mlp": 1.03803396, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 2.1784420231587562, + "language_loss": 0.78471816, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80606019, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11547, + "time_per_iteration": 2.5005674362182617 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.02177894, + "balance_loss_mlp": 1.03765762, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.1600607182563323, + "language_loss": 0.81004536, + "learning_rate": 9.026396651834834e-07, + "loss": 0.83142352, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11548, + "time_per_iteration": 2.467039108276367 + }, + { + "auxiliary_loss_clip": 0.01024313, + "auxiliary_loss_mlp": 0.01001552, + "balance_loss_clip": 1.00046158, + "balance_loss_mlp": 1.003824, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6998312619688671, + "language_loss": 0.53725159, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55751026, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11549, + "time_per_iteration": 3.049893617630005 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01926637, + "balance_loss_mlp": 1.03490329, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.4134834791230244, + "language_loss": 0.7344752, + "learning_rate": 9.01988543302e-07, + "loss": 0.75581068, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11550, + "time_per_iteration": 2.5287935733795166 + }, + { + "auxiliary_loss_clip": 0.01105894, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.02367878, + "balance_loss_mlp": 1.03701949, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.8969044968976483, + "language_loss": 0.73992145, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76133573, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 11551, + "time_per_iteration": 2.4404563903808594 + }, + { + "auxiliary_loss_clip": 0.01104938, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.02617919, + "balance_loss_mlp": 1.03671432, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.6277950876042102, + "language_loss": 0.84549385, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86692244, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11552, + "time_per_iteration": 2.498476028442383 + }, + { + "auxiliary_loss_clip": 0.01102767, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.02255046, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.5310970869840324, + "language_loss": 0.67400169, + "learning_rate": 9.010121727859117e-07, + "loss": 0.6953721, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11553, + "time_per_iteration": 3.92946720123291 + }, + { + "auxiliary_loss_clip": 0.01107649, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.01543725, + "balance_loss_mlp": 1.03727949, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.5363855656738201, + "language_loss": 0.79580885, + "learning_rate": 9.006867992782195e-07, + "loss": 0.8171674, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 11554, + "time_per_iteration": 2.469681978225708 + }, + { + "auxiliary_loss_clip": 0.01103857, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.0172801, + "balance_loss_mlp": 1.03479338, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.7519879066783155, + "language_loss": 0.72581065, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74713933, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 11555, + "time_per_iteration": 3.862004280090332 + }, + { + "auxiliary_loss_clip": 0.01100586, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.01587296, + "balance_loss_mlp": 1.0338273, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.9852142507231525, + "language_loss": 0.78025049, + "learning_rate": 9.000361773333705e-07, + "loss": 0.8015281, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 11556, + "time_per_iteration": 5.454412937164307 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.0232873, + "balance_loss_mlp": 1.03403139, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.79871624128239, + "language_loss": 0.60282063, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62418664, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11557, + "time_per_iteration": 2.5056674480438232 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02463782, + "balance_loss_mlp": 1.03539312, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.6476789256185396, + "language_loss": 0.8537513, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87512511, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11558, + "time_per_iteration": 2.456141948699951 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01679111, + "balance_loss_mlp": 1.03618479, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.6025671858040744, + "language_loss": 0.70371419, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72505903, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11559, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01674509, + "balance_loss_mlp": 1.03588152, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.5297645646514304, + "language_loss": 0.78975582, + "learning_rate": 8.987354340711921e-07, + "loss": 0.8110559, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11560, + "time_per_iteration": 2.504146099090576 + }, + { + "auxiliary_loss_clip": 0.01101416, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.01942587, + "balance_loss_mlp": 1.03616834, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.666384056444463, + "language_loss": 0.76987702, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79119992, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 11561, + "time_per_iteration": 2.480802536010742 + }, + { + "auxiliary_loss_clip": 0.0109923, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.01806235, + "balance_loss_mlp": 1.03331923, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.7391531347439242, + "language_loss": 0.78634578, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80764008, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 11562, + "time_per_iteration": 2.438997268676758 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.02100253, + "balance_loss_mlp": 1.03525412, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 1.9230268961820236, + "language_loss": 0.69259918, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71396333, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11563, + "time_per_iteration": 2.4467828273773193 + }, + { + "auxiliary_loss_clip": 0.01097161, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01491845, + "balance_loss_mlp": 1.03383183, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.153945918609724, + "language_loss": 0.73383999, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75506866, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 11564, + "time_per_iteration": 2.4219517707824707 + }, + { + "auxiliary_loss_clip": 0.01117667, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.02188849, + "balance_loss_mlp": 1.04055667, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.987939257518994, + "language_loss": 0.71758306, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73912156, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 11565, + "time_per_iteration": 2.5249881744384766 + }, + { + "auxiliary_loss_clip": 0.01024476, + "auxiliary_loss_mlp": 0.01001909, + "balance_loss_clip": 1.00083661, + "balance_loss_mlp": 1.00426412, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9231095353674287, + "language_loss": 0.58470231, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60496616, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20214844, + "step": 11566, + "time_per_iteration": 2.9420695304870605 + }, + { + "auxiliary_loss_clip": 0.0110462, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.01639366, + "balance_loss_mlp": 1.03440809, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.9048250540658576, + "language_loss": 0.74568522, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76701856, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 11567, + "time_per_iteration": 2.4744651317596436 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.03473878, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.342733224210211, + "language_loss": 0.76978123, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79112065, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 11568, + "time_per_iteration": 2.5342469215393066 + }, + { + "auxiliary_loss_clip": 0.01103163, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.02255452, + "balance_loss_mlp": 1.03756905, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 4.390531062594107, + "language_loss": 0.72720057, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74857014, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11569, + "time_per_iteration": 2.44547438621521 + }, + { + "auxiliary_loss_clip": 0.01104961, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.03668261, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 2.456023744681467, + "language_loss": 0.77213609, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79346788, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 11570, + "time_per_iteration": 2.539635419845581 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.02089262, + "balance_loss_mlp": 1.03544307, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.0491810853886125, + "language_loss": 0.74309134, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76446825, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11571, + "time_per_iteration": 2.5310707092285156 + }, + { + "auxiliary_loss_clip": 0.0109878, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.01863599, + "balance_loss_mlp": 1.03522277, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 2.2800160570301395, + "language_loss": 0.74539, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76667869, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 11572, + "time_per_iteration": 2.454315423965454 + }, + { + "auxiliary_loss_clip": 0.01101105, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.01692498, + "balance_loss_mlp": 1.03309405, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.919471935586269, + "language_loss": 0.7033447, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72464669, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 11573, + "time_per_iteration": 2.6062417030334473 + }, + { + "auxiliary_loss_clip": 0.01108794, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.02028072, + "balance_loss_mlp": 1.03887129, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.9750506885077386, + "language_loss": 0.74985647, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77127224, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 11574, + "time_per_iteration": 2.4739365577697754 + }, + { + "auxiliary_loss_clip": 0.01105022, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.01920092, + "balance_loss_mlp": 1.03574729, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.6163956776113584, + "language_loss": 0.74427664, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76563859, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 11575, + "time_per_iteration": 2.4526143074035645 + }, + { + "auxiliary_loss_clip": 0.01103541, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.01851606, + "balance_loss_mlp": 1.03498685, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 2.202817220265, + "language_loss": 0.78680444, + "learning_rate": 8.935391505179966e-07, + "loss": 0.80815148, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11576, + "time_per_iteration": 2.489030122756958 + }, + { + "auxiliary_loss_clip": 0.01104629, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.0206064, + "balance_loss_mlp": 1.03426623, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.167216169901492, + "language_loss": 0.56448716, + "learning_rate": 8.932147389081985e-07, + "loss": 0.5858531, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.703125, + "step": 11577, + "time_per_iteration": 2.402588367462158 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01020474, + "balance_loss_clip": 1.01061571, + "balance_loss_mlp": 1.0344727, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3300447055766056, + "language_loss": 0.76633966, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78752244, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.6328125, + "step": 11578, + "time_per_iteration": 2.5856926441192627 + }, + { + "auxiliary_loss_clip": 0.01103837, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02003193, + "balance_loss_mlp": 1.03707981, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.9898977429274547, + "language_loss": 0.7948364, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81619179, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 11579, + "time_per_iteration": 2.4593424797058105 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01026526, + "balance_loss_clip": 1.0148437, + "balance_loss_mlp": 1.03269458, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.7711043261793566, + "language_loss": 0.72253591, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74377942, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 11580, + "time_per_iteration": 2.5214614868164062 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01617932, + "balance_loss_mlp": 1.03608978, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 1.861307366576084, + "language_loss": 0.65531254, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67666024, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11581, + "time_per_iteration": 2.519068479537964 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01883006, + "balance_loss_mlp": 1.03555655, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.390157722365771, + "language_loss": 0.76223433, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78356332, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11582, + "time_per_iteration": 2.444866418838501 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.03478706, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.876033269945707, + "language_loss": 0.69968796, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72099912, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 11583, + "time_per_iteration": 2.430619239807129 + }, + { + "auxiliary_loss_clip": 0.01105097, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.01950979, + "balance_loss_mlp": 1.03693569, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 2.37757967168826, + "language_loss": 0.82697153, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84833741, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 11584, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01105057, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02336848, + "balance_loss_mlp": 1.03613901, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.430393804317416, + "language_loss": 0.79577053, + "learning_rate": 8.906209579615107e-07, + "loss": 0.8171798, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11585, + "time_per_iteration": 2.4488959312438965 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.02046049, + "balance_loss_mlp": 1.03464603, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.5234092919525861, + "language_loss": 0.77759147, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79889989, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 11586, + "time_per_iteration": 2.4705069065093994 + }, + { + "auxiliary_loss_clip": 0.01096075, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.02000952, + "balance_loss_mlp": 1.03367376, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.7766488711687052, + "language_loss": 0.78765887, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80892575, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 11587, + "time_per_iteration": 2.4538965225219727 + }, + { + "auxiliary_loss_clip": 0.01100978, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.01916003, + "balance_loss_mlp": 1.03608429, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.0434006837874885, + "language_loss": 0.72847271, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74979115, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 11588, + "time_per_iteration": 2.452421188354492 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.0102536, + "balance_loss_clip": 1.01462507, + "balance_loss_mlp": 1.03454709, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.6358395354491653, + "language_loss": 0.75110734, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77237165, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 11589, + "time_per_iteration": 2.466801643371582 + }, + { + "auxiliary_loss_clip": 0.01102838, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.01815021, + "balance_loss_mlp": 1.03571272, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.8223612278895884, + "language_loss": 0.63479555, + "learning_rate": 8.890012116726012e-07, + "loss": 0.6561197, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11590, + "time_per_iteration": 2.547621011734009 + }, + { + "auxiliary_loss_clip": 0.0102506, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99851686, + "balance_loss_mlp": 1.00460005, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7464434837595778, + "language_loss": 0.61278826, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63303614, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20507812, + "step": 11591, + "time_per_iteration": 3.138062000274658 + }, + { + "auxiliary_loss_clip": 0.01106658, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.02039623, + "balance_loss_mlp": 1.03897679, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 2.149264324135608, + "language_loss": 0.69040775, + "learning_rate": 8.883536079753582e-07, + "loss": 0.7118023, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11592, + "time_per_iteration": 2.4973015785217285 + }, + { + "auxiliary_loss_clip": 0.01102777, + "auxiliary_loss_mlp": 0.01027086, + "balance_loss_clip": 1.01633334, + "balance_loss_mlp": 1.03731585, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.7113840138583603, + "language_loss": 0.62385631, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64515489, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 11593, + "time_per_iteration": 2.5094406604766846 + }, + { + "auxiliary_loss_clip": 0.01098813, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.01652873, + "balance_loss_mlp": 1.03533387, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.6455172692601516, + "language_loss": 0.54323792, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56449699, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11594, + "time_per_iteration": 3.896481513977051 + }, + { + "auxiliary_loss_clip": 0.01100941, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.01524472, + "balance_loss_mlp": 1.03542423, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.382773789064297, + "language_loss": 0.77469057, + "learning_rate": 8.87382518613248e-07, + "loss": 0.79595929, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 11595, + "time_per_iteration": 2.4667396545410156 + }, + { + "auxiliary_loss_clip": 0.01107354, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01850617, + "balance_loss_mlp": 1.03804874, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.2493761025640957, + "language_loss": 0.71796727, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73934615, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11596, + "time_per_iteration": 3.921229839324951 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.0197432, + "balance_loss_mlp": 1.03878427, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.6145547078757287, + "language_loss": 0.76072466, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78209841, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 11597, + "time_per_iteration": 3.8901522159576416 + }, + { + "auxiliary_loss_clip": 0.01101534, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02247071, + "balance_loss_mlp": 1.03553581, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.8362035763244782, + "language_loss": 0.74662215, + "learning_rate": 8.864118089662267e-07, + "loss": 0.76797849, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 11598, + "time_per_iteration": 3.8907439708709717 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.02021837, + "balance_loss_mlp": 1.03667629, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.7078147721602885, + "language_loss": 0.89751863, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91890037, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11599, + "time_per_iteration": 2.508460760116577 + }, + { + "auxiliary_loss_clip": 0.01107859, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.02237415, + "balance_loss_mlp": 1.03705978, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.217668834863667, + "language_loss": 0.69431078, + "learning_rate": 8.85764880317974e-07, + "loss": 0.7157408, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 11600, + "time_per_iteration": 2.4692399501800537 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.01923847, + "balance_loss_mlp": 1.03319108, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 2.0745134651859853, + "language_loss": 0.76886988, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79018807, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 11601, + "time_per_iteration": 2.5153214931488037 + }, + { + "auxiliary_loss_clip": 0.0109772, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.02020907, + "balance_loss_mlp": 1.03365159, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.7793101834620162, + "language_loss": 0.72061765, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74190778, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 11602, + "time_per_iteration": 2.4385433197021484 + }, + { + "auxiliary_loss_clip": 0.01102254, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.0355022, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.295891013382411, + "language_loss": 0.76406467, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78542626, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 11603, + "time_per_iteration": 2.451995611190796 + }, + { + "auxiliary_loss_clip": 0.01102122, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.01861763, + "balance_loss_mlp": 1.03604972, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 3.2492511864977476, + "language_loss": 0.62036002, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64168406, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 11604, + "time_per_iteration": 2.4743845462799072 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.02031875, + "balance_loss_mlp": 1.03493071, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.371593906069345, + "language_loss": 0.81601393, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83737808, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 11605, + "time_per_iteration": 2.4963574409484863 + }, + { + "auxiliary_loss_clip": 0.01102471, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01794219, + "balance_loss_mlp": 1.03550363, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.5505350039714891, + "language_loss": 0.70039761, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72171599, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11606, + "time_per_iteration": 2.464792490005493 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.0165329, + "balance_loss_mlp": 1.03639364, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 2.811539216798812, + "language_loss": 0.8241694, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84550416, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11607, + "time_per_iteration": 2.4532179832458496 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01842856, + "balance_loss_mlp": 1.03726959, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.8001657478638917, + "language_loss": 0.7874788, + "learning_rate": 8.831788567821265e-07, + "loss": 0.80886829, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 11608, + "time_per_iteration": 2.47961688041687 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.02052379, + "balance_loss_mlp": 1.03606093, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.8111202994770392, + "language_loss": 0.89970839, + "learning_rate": 8.828557942863357e-07, + "loss": 0.9210583, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11609, + "time_per_iteration": 2.408423900604248 + }, + { + "auxiliary_loss_clip": 0.01104617, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.01522803, + "balance_loss_mlp": 1.03529525, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 2.1159011349331607, + "language_loss": 0.63904428, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66036618, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 11610, + "time_per_iteration": 2.4653687477111816 + }, + { + "auxiliary_loss_clip": 0.01100567, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.01883924, + "balance_loss_mlp": 1.03393793, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.6195278662998478, + "language_loss": 0.84689248, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86819756, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11611, + "time_per_iteration": 2.5322601795196533 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.01864767, + "balance_loss_mlp": 1.03619266, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.902997346306539, + "language_loss": 0.71074033, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73209023, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 11612, + "time_per_iteration": 2.432530641555786 + }, + { + "auxiliary_loss_clip": 0.01100621, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.01988053, + "balance_loss_mlp": 1.03486013, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.5615931118386375, + "language_loss": 0.80995202, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83126897, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 11613, + "time_per_iteration": 2.429049253463745 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.01840007, + "balance_loss_mlp": 1.03550696, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 1.8186173474764362, + "language_loss": 0.75323808, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77453518, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11614, + "time_per_iteration": 2.469871997833252 + }, + { + "auxiliary_loss_clip": 0.01102382, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.01817775, + "balance_loss_mlp": 1.03613019, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.4207105527318125, + "language_loss": 0.77124798, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79257029, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11615, + "time_per_iteration": 2.4482977390289307 + }, + { + "auxiliary_loss_clip": 0.01098585, + "auxiliary_loss_mlp": 0.01025272, + "balance_loss_clip": 1.01378596, + "balance_loss_mlp": 1.03474522, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.152403248821291, + "language_loss": 0.73121244, + "learning_rate": 8.80595543643797e-07, + "loss": 0.752451, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 11616, + "time_per_iteration": 2.4637510776519775 + }, + { + "auxiliary_loss_clip": 0.01102545, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02162004, + "balance_loss_mlp": 1.03698003, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.620744160430393, + "language_loss": 0.84509301, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86644858, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11617, + "time_per_iteration": 2.4850711822509766 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.02358341, + "balance_loss_mlp": 1.03734601, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.8091395621454884, + "language_loss": 0.59596443, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61737734, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11618, + "time_per_iteration": 2.4457621574401855 + }, + { + "auxiliary_loss_clip": 0.0110188, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.02075553, + "balance_loss_mlp": 1.03564835, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.7259844025825606, + "language_loss": 0.82820493, + "learning_rate": 8.796275012710903e-07, + "loss": 0.84954393, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11619, + "time_per_iteration": 2.4546103477478027 + }, + { + "auxiliary_loss_clip": 0.01097255, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.01878548, + "balance_loss_mlp": 1.0334444, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 1.7065049310483924, + "language_loss": 0.67252052, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69378352, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 11620, + "time_per_iteration": 2.6086742877960205 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.0171926, + "balance_loss_mlp": 1.03403723, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.0144848908668607, + "language_loss": 0.72543484, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74674302, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 11621, + "time_per_iteration": 2.4109437465667725 + }, + { + "auxiliary_loss_clip": 0.01104286, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.02206206, + "balance_loss_mlp": 1.03532565, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.8967396853715839, + "language_loss": 0.68434918, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70572865, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 11622, + "time_per_iteration": 2.4823949337005615 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.01771569, + "balance_loss_mlp": 1.03294408, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.7255143974519898, + "language_loss": 0.62549627, + "learning_rate": 8.783373729494721e-07, + "loss": 0.6467514, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 11623, + "time_per_iteration": 2.4188036918640137 + }, + { + "auxiliary_loss_clip": 0.01104383, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.0135262, + "balance_loss_mlp": 1.03467298, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.7388598441341108, + "language_loss": 0.60939074, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63069075, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 11624, + "time_per_iteration": 2.5913877487182617 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02349353, + "balance_loss_mlp": 1.03341901, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6753967170861992, + "language_loss": 0.78502715, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80640858, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11625, + "time_per_iteration": 2.4710693359375 + }, + { + "auxiliary_loss_clip": 0.01098526, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.01916766, + "balance_loss_mlp": 1.03475714, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.9082516770255042, + "language_loss": 0.66193223, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68321669, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 11626, + "time_per_iteration": 2.4523563385009766 + }, + { + "auxiliary_loss_clip": 0.01102348, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.01857281, + "balance_loss_mlp": 1.03522182, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.7406688014675167, + "language_loss": 0.7007491, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72207904, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 11627, + "time_per_iteration": 2.474536895751953 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01025817, + "balance_loss_clip": 1.01572561, + "balance_loss_mlp": 1.03557801, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.7260870632652867, + "language_loss": 0.62484425, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64608836, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62890625, + "step": 11628, + "time_per_iteration": 2.466815710067749 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.01934004, + "balance_loss_mlp": 1.03518367, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.3991163930052757, + "language_loss": 0.68365383, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70498693, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11629, + "time_per_iteration": 2.5539638996124268 + }, + { + "auxiliary_loss_clip": 0.01102664, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.02066183, + "balance_loss_mlp": 1.03708851, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.6822972614586869, + "language_loss": 0.73017991, + "learning_rate": 8.760812863992337e-07, + "loss": 0.75153255, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 11630, + "time_per_iteration": 2.4794862270355225 + }, + { + "auxiliary_loss_clip": 0.01100869, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.0236398, + "balance_loss_mlp": 1.03656542, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.6007473169297173, + "language_loss": 0.7410804, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76243627, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 11631, + "time_per_iteration": 2.4957640171051025 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.01692176, + "balance_loss_mlp": 1.03722155, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.1507086916172153, + "language_loss": 0.8977077, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91906154, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11632, + "time_per_iteration": 2.44950008392334 + }, + { + "auxiliary_loss_clip": 0.01105644, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02223396, + "balance_loss_mlp": 1.03784966, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.6745752563732321, + "language_loss": 0.79724801, + "learning_rate": 8.751150312056792e-07, + "loss": 0.81863928, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 11633, + "time_per_iteration": 2.4414355754852295 + }, + { + "auxiliary_loss_clip": 0.0110496, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01837361, + "balance_loss_mlp": 1.03629565, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.8513742632089842, + "language_loss": 0.6695196, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69087964, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11634, + "time_per_iteration": 2.512799024581909 + }, + { + "auxiliary_loss_clip": 0.01025073, + "auxiliary_loss_mlp": 0.01006178, + "balance_loss_clip": 1.0051055, + "balance_loss_mlp": 1.00443375, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.7055663228963396, + "language_loss": 0.53125268, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55156517, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20703125, + "step": 11635, + "time_per_iteration": 3.1653506755828857 + }, + { + "auxiliary_loss_clip": 0.01100006, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.01810038, + "balance_loss_mlp": 1.03436577, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.634854939073058, + "language_loss": 0.82167876, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84297502, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11636, + "time_per_iteration": 3.8652594089508057 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.01429462, + "balance_loss_mlp": 1.03523159, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 2.0826416356932764, + "language_loss": 0.83018386, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85146558, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11637, + "time_per_iteration": 2.471907615661621 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.02207518, + "balance_loss_mlp": 1.03530073, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 2.0103377322341807, + "language_loss": 0.67541957, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69675255, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 11638, + "time_per_iteration": 3.8712992668151855 + }, + { + "auxiliary_loss_clip": 0.01105589, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.03554988, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 3.4273717366145293, + "language_loss": 0.78027046, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80163682, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 11639, + "time_per_iteration": 4.004430532455444 + }, + { + "auxiliary_loss_clip": 0.01104922, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02339602, + "balance_loss_mlp": 1.03788579, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.1417598387130807, + "language_loss": 0.82320189, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84460831, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 11640, + "time_per_iteration": 3.938671588897705 + }, + { + "auxiliary_loss_clip": 0.01099361, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01750684, + "balance_loss_mlp": 1.03294611, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.651631828879974, + "language_loss": 0.7513082, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77258819, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11641, + "time_per_iteration": 2.5288925170898438 + }, + { + "auxiliary_loss_clip": 0.01103165, + "auxiliary_loss_mlp": 0.01021586, + "balance_loss_clip": 1.00946224, + "balance_loss_mlp": 1.03693998, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9866198731885556, + "language_loss": 0.78112102, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80236852, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11642, + "time_per_iteration": 2.4836714267730713 + }, + { + "auxiliary_loss_clip": 0.01106745, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01997852, + "balance_loss_mlp": 1.03653657, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 3.5463939994986524, + "language_loss": 0.75054216, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77194417, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 11643, + "time_per_iteration": 2.5334367752075195 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.01483929, + "balance_loss_mlp": 1.03516173, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.4977944271718722, + "language_loss": 0.60428506, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62557411, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11644, + "time_per_iteration": 2.549466609954834 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.03423524, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.654773912405309, + "language_loss": 0.8168875, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83814859, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 11645, + "time_per_iteration": 2.5374014377593994 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01025535, + "balance_loss_clip": 1.01418638, + "balance_loss_mlp": 1.03470361, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.9559227219413697, + "language_loss": 0.6827392, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70399988, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11646, + "time_per_iteration": 2.4721925258636475 + }, + { + "auxiliary_loss_clip": 0.01101074, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.02236462, + "balance_loss_mlp": 1.03534269, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.4678938287912224, + "language_loss": 0.71031594, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73166132, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 11647, + "time_per_iteration": 2.5134873390197754 + }, + { + "auxiliary_loss_clip": 0.01105174, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.02156842, + "balance_loss_mlp": 1.03716385, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.703178589128687, + "language_loss": 0.71102858, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73241514, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 11648, + "time_per_iteration": 2.5937957763671875 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.01577532, + "balance_loss_mlp": 1.03368604, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.6329252584498772, + "language_loss": 0.77452666, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79579538, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 11649, + "time_per_iteration": 2.517803907394409 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.02211761, + "balance_loss_mlp": 1.03584242, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.77714876620496, + "language_loss": 0.78475487, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80609971, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11650, + "time_per_iteration": 2.40120530128479 + }, + { + "auxiliary_loss_clip": 0.0109906, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.01452041, + "balance_loss_mlp": 1.03317046, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.032619640135715, + "language_loss": 0.78585541, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80710149, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 11651, + "time_per_iteration": 2.4667370319366455 + }, + { + "auxiliary_loss_clip": 0.01104452, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.02046514, + "balance_loss_mlp": 1.03640985, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.5029723936879913, + "language_loss": 0.69227219, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71363091, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6796875, + "step": 11652, + "time_per_iteration": 2.4276230335235596 + }, + { + "auxiliary_loss_clip": 0.01102036, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.01670551, + "balance_loss_mlp": 1.03644037, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3694191346433118, + "language_loss": 0.74200094, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76330066, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11653, + "time_per_iteration": 2.515753984451294 + }, + { + "auxiliary_loss_clip": 0.01102535, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.01641536, + "balance_loss_mlp": 1.03555512, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.227987433936512, + "language_loss": 0.70499587, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72631419, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 11654, + "time_per_iteration": 2.4286937713623047 + }, + { + "auxiliary_loss_clip": 0.01104582, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01751626, + "balance_loss_mlp": 1.03610516, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.4203729950028063, + "language_loss": 0.73474562, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75609636, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 11655, + "time_per_iteration": 2.4598588943481445 + }, + { + "auxiliary_loss_clip": 0.01110167, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.0236547, + "balance_loss_mlp": 1.03842175, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.832010728467088, + "language_loss": 0.69950438, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72097301, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 11656, + "time_per_iteration": 2.4319212436676025 + }, + { + "auxiliary_loss_clip": 0.01100001, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01765096, + "balance_loss_mlp": 1.03448498, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.5232296652544484, + "language_loss": 0.77772856, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79901063, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65625, + "step": 11657, + "time_per_iteration": 2.5232887268066406 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.01827931, + "balance_loss_mlp": 1.03665113, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 2.426278289678233, + "language_loss": 0.77859247, + "learning_rate": 8.670778654208797e-07, + "loss": 0.79998142, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 11658, + "time_per_iteration": 2.5308613777160645 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.01329541, + "balance_loss_mlp": 1.03391457, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 2.3274246978175803, + "language_loss": 0.82637346, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84760237, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11659, + "time_per_iteration": 2.4660232067108154 + }, + { + "auxiliary_loss_clip": 0.01100216, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.01545429, + "balance_loss_mlp": 1.0353384, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.9444226757743717, + "language_loss": 0.69085199, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71211863, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 11660, + "time_per_iteration": 2.425694227218628 + }, + { + "auxiliary_loss_clip": 0.01102737, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.03413391, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 1.7015787806945502, + "language_loss": 0.80871427, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83006787, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11661, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01967359, + "balance_loss_mlp": 1.033602, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 3.059809361896724, + "language_loss": 0.78862965, + "learning_rate": 8.657944056600579e-07, + "loss": 0.80994064, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 11662, + "time_per_iteration": 2.5052289962768555 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.01188052, + "balance_loss_mlp": 1.03489375, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 1.922970255639485, + "language_loss": 0.8358953, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85715592, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11663, + "time_per_iteration": 2.4594573974609375 + }, + { + "auxiliary_loss_clip": 0.0102523, + "auxiliary_loss_mlp": 0.01007606, + "balance_loss_clip": 1.00651574, + "balance_loss_mlp": 1.00453377, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8204387591217913, + "language_loss": 0.53774929, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55807763, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20703125, + "step": 11664, + "time_per_iteration": 3.0331904888153076 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01900291, + "balance_loss_mlp": 1.03650737, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.4272507893526143, + "language_loss": 0.78843081, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80978715, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 11665, + "time_per_iteration": 2.4934439659118652 + }, + { + "auxiliary_loss_clip": 0.01099902, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01650083, + "balance_loss_mlp": 1.03361416, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.269849765653923, + "language_loss": 0.77034938, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79162872, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11666, + "time_per_iteration": 2.4027786254882812 + }, + { + "auxiliary_loss_clip": 0.01101042, + "auxiliary_loss_mlp": 0.01026786, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.0356462, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.878568521742783, + "language_loss": 0.81238604, + "learning_rate": 8.641910487569695e-07, + "loss": 0.8336643, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 11667, + "time_per_iteration": 2.4780242443084717 + }, + { + "auxiliary_loss_clip": 0.0110046, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.0237546, + "balance_loss_mlp": 1.03487873, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.0547760249868685, + "language_loss": 0.65335631, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67471707, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11668, + "time_per_iteration": 2.447939395904541 + }, + { + "auxiliary_loss_clip": 0.01103124, + "auxiliary_loss_mlp": 0.01022731, + "balance_loss_clip": 1.01117384, + "balance_loss_mlp": 1.03469038, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.272329624033439, + "language_loss": 0.76275986, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78401846, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 11669, + "time_per_iteration": 2.4600765705108643 + }, + { + "auxiliary_loss_clip": 0.01025535, + "auxiliary_loss_mlp": 0.01006318, + "balance_loss_clip": 1.00525713, + "balance_loss_mlp": 1.00477183, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6922095034682588, + "language_loss": 0.54468822, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56500673, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20703125, + "step": 11670, + "time_per_iteration": 3.1504855155944824 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.0203141, + "balance_loss_mlp": 1.03460622, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.9909569240580678, + "language_loss": 0.81605625, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83737886, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11671, + "time_per_iteration": 2.475792169570923 + }, + { + "auxiliary_loss_clip": 0.011038, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.01734638, + "balance_loss_mlp": 1.03691864, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.023044603900928, + "language_loss": 0.75000024, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77132618, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11672, + "time_per_iteration": 2.4228410720825195 + }, + { + "auxiliary_loss_clip": 0.0110057, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01794934, + "balance_loss_mlp": 1.0343281, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.708219397381251, + "language_loss": 0.87053084, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89183801, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 11673, + "time_per_iteration": 2.4504873752593994 + }, + { + "auxiliary_loss_clip": 0.01098005, + "auxiliary_loss_mlp": 0.01024449, + "balance_loss_clip": 1.01308239, + "balance_loss_mlp": 1.0342052, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 2.1494737009789935, + "language_loss": 0.72768337, + "learning_rate": 8.619481583723399e-07, + "loss": 0.74890792, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 11674, + "time_per_iteration": 2.397975444793701 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.01708126, + "balance_loss_mlp": 1.03694451, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.5674244409742963, + "language_loss": 0.72100163, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74228311, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 11675, + "time_per_iteration": 2.4895689487457275 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.01024344, + "balance_loss_clip": 1.01256597, + "balance_loss_mlp": 1.03593993, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.517132712975458, + "language_loss": 0.50993675, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53120697, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11676, + "time_per_iteration": 2.4392223358154297 + }, + { + "auxiliary_loss_clip": 0.01024806, + "auxiliary_loss_mlp": 0.01002084, + "balance_loss_clip": 1.00087988, + "balance_loss_mlp": 1.00406504, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7321379163768023, + "language_loss": 0.59195387, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61222279, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.20703125, + "step": 11677, + "time_per_iteration": 3.125434398651123 + }, + { + "auxiliary_loss_clip": 0.01103207, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.01498294, + "balance_loss_mlp": 1.03543353, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.2320710813331304, + "language_loss": 0.62693989, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64823759, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 11678, + "time_per_iteration": 4.006704330444336 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.02100134, + "balance_loss_mlp": 1.0344584, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.8460467241007361, + "language_loss": 0.79242504, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81375194, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11679, + "time_per_iteration": 2.4555304050445557 + }, + { + "auxiliary_loss_clip": 0.01101096, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.02535808, + "balance_loss_mlp": 1.03703773, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.331847817004221, + "language_loss": 0.70253718, + "learning_rate": 8.600273637882567e-07, + "loss": 0.7239061, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 11680, + "time_per_iteration": 3.8396050930023193 + }, + { + "auxiliary_loss_clip": 0.01105234, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01941895, + "balance_loss_mlp": 1.03682303, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.6980564631311013, + "language_loss": 0.74690676, + "learning_rate": 8.597073825843446e-07, + "loss": 0.76827282, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 11681, + "time_per_iteration": 5.31316614151001 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.01963735, + "balance_loss_mlp": 1.03458714, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.4988427000417734, + "language_loss": 0.76605582, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78736782, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 11682, + "time_per_iteration": 2.4792110919952393 + }, + { + "auxiliary_loss_clip": 0.01103891, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03589272, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 1.8311880743600102, + "language_loss": 0.73361951, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75499648, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 11683, + "time_per_iteration": 2.434879779815674 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01577854, + "balance_loss_mlp": 1.03725612, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.7668169003154093, + "language_loss": 0.71169794, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73300993, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 11684, + "time_per_iteration": 2.486572742462158 + }, + { + "auxiliary_loss_clip": 0.01101245, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.01969957, + "balance_loss_mlp": 1.03529143, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8432235400728463, + "language_loss": 0.72046304, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74179095, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11685, + "time_per_iteration": 2.5009102821350098 + }, + { + "auxiliary_loss_clip": 0.01101202, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.01923084, + "balance_loss_mlp": 1.03449953, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.7057605239318525, + "language_loss": 0.84865069, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86996263, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 11686, + "time_per_iteration": 2.447744846343994 + }, + { + "auxiliary_loss_clip": 0.01025709, + "auxiliary_loss_mlp": 0.01003132, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00512934, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.988856355007654, + "language_loss": 0.69923353, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71952194, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20605469, + "step": 11687, + "time_per_iteration": 3.1910674571990967 + }, + { + "auxiliary_loss_clip": 0.01103018, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.0161128, + "balance_loss_mlp": 1.03627849, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.17247822122661, + "language_loss": 0.77656871, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79787552, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11688, + "time_per_iteration": 2.472559928894043 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01951802, + "balance_loss_mlp": 1.0350461, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.0685575537033207, + "language_loss": 0.68521178, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70653796, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11689, + "time_per_iteration": 2.4660775661468506 + }, + { + "auxiliary_loss_clip": 0.01103667, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02360475, + "balance_loss_mlp": 1.03517842, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.110320581130951, + "language_loss": 0.79499185, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81638682, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11690, + "time_per_iteration": 2.510883331298828 + }, + { + "auxiliary_loss_clip": 0.01102324, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.01994324, + "balance_loss_mlp": 1.03600478, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.5848883111705174, + "language_loss": 0.76091731, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78225756, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 11691, + "time_per_iteration": 2.4371836185455322 + }, + { + "auxiliary_loss_clip": 0.01102138, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.01367295, + "balance_loss_mlp": 1.03720069, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.7363845404220049, + "language_loss": 0.81481391, + "learning_rate": 8.561904458502429e-07, + "loss": 0.8360818, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 11692, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01099945, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.01414728, + "balance_loss_mlp": 1.03468466, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.5395445178386533, + "language_loss": 0.76162529, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78288329, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 11693, + "time_per_iteration": 2.452014923095703 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.01904464, + "balance_loss_mlp": 1.03672361, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.51123653242133, + "language_loss": 0.68433905, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70565528, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 11694, + "time_per_iteration": 2.6905438899993896 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.01921415, + "balance_loss_mlp": 1.03518784, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.685426816457134, + "language_loss": 0.75926757, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78059149, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11695, + "time_per_iteration": 2.4197287559509277 + }, + { + "auxiliary_loss_clip": 0.01104949, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.02692449, + "balance_loss_mlp": 1.03704011, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.1895380825721595, + "language_loss": 0.73749006, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75892979, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11696, + "time_per_iteration": 2.438162088394165 + }, + { + "auxiliary_loss_clip": 0.01102914, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01507568, + "balance_loss_mlp": 1.03577912, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.6020001034841755, + "language_loss": 0.75352108, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77482289, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11697, + "time_per_iteration": 2.50844669342041 + }, + { + "auxiliary_loss_clip": 0.01107405, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02019286, + "balance_loss_mlp": 1.03933907, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.763301186005729, + "language_loss": 0.8075971, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82900751, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6796875, + "step": 11698, + "time_per_iteration": 2.4794504642486572 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01978219, + "balance_loss_mlp": 1.03481781, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.4318828234621686, + "language_loss": 0.84606147, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86740685, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11699, + "time_per_iteration": 2.466271162033081 + }, + { + "auxiliary_loss_clip": 0.01102469, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03670907, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 2.1706968176821326, + "language_loss": 0.79156339, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81287259, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 11700, + "time_per_iteration": 2.4769561290740967 + }, + { + "auxiliary_loss_clip": 0.01102749, + "auxiliary_loss_mlp": 0.01023696, + "balance_loss_clip": 1.01164412, + "balance_loss_mlp": 1.0352428, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.698709640635861, + "language_loss": 0.74290204, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76416653, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 11701, + "time_per_iteration": 2.5410683155059814 + }, + { + "auxiliary_loss_clip": 0.01105173, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.01449096, + "balance_loss_mlp": 1.03693128, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.241875664618386, + "language_loss": 0.83804989, + "learning_rate": 8.529977844159769e-07, + "loss": 0.8593747, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 11702, + "time_per_iteration": 2.4136838912963867 + }, + { + "auxiliary_loss_clip": 0.01102777, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01974845, + "balance_loss_mlp": 1.03585792, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 17.73315944125735, + "language_loss": 0.60806382, + "learning_rate": 8.526787572922738e-07, + "loss": 0.62940544, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 11703, + "time_per_iteration": 2.4728925228118896 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.01538706, + "balance_loss_mlp": 1.03344357, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.86622111466138, + "language_loss": 0.60721993, + "learning_rate": 8.523597736751067e-07, + "loss": 0.62849051, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11704, + "time_per_iteration": 2.5538487434387207 + }, + { + "auxiliary_loss_clip": 0.01097343, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.02000296, + "balance_loss_mlp": 1.03398025, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.6367819423893837, + "language_loss": 0.70355535, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72483432, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11705, + "time_per_iteration": 2.5196011066436768 + }, + { + "auxiliary_loss_clip": 0.01100052, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.01636624, + "balance_loss_mlp": 1.03497076, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.637724615159266, + "language_loss": 0.61509889, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63637763, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 11706, + "time_per_iteration": 2.4852991104125977 + }, + { + "auxiliary_loss_clip": 0.01102393, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01521683, + "balance_loss_mlp": 1.03553593, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.2484984676875563, + "language_loss": 0.68121183, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70250034, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11707, + "time_per_iteration": 2.4560024738311768 + }, + { + "auxiliary_loss_clip": 0.01097433, + "auxiliary_loss_mlp": 0.01027441, + "balance_loss_clip": 1.0162648, + "balance_loss_mlp": 1.03335011, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.7446259905587083, + "language_loss": 0.76487923, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78612804, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 11708, + "time_per_iteration": 2.515327215194702 + }, + { + "auxiliary_loss_clip": 0.01099228, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.01421952, + "balance_loss_mlp": 1.03512418, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.893368388386225, + "language_loss": 0.72055292, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74179482, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 11709, + "time_per_iteration": 2.431182384490967 + }, + { + "auxiliary_loss_clip": 0.01099189, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.01399827, + "balance_loss_mlp": 1.03375983, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.079430411231168, + "language_loss": 0.79054451, + "learning_rate": 8.504467862866267e-07, + "loss": 0.81178856, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 11710, + "time_per_iteration": 2.3997299671173096 + }, + { + "auxiliary_loss_clip": 0.01104493, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.01852989, + "balance_loss_mlp": 1.03760147, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.6049139638931622, + "language_loss": 0.77447236, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79582191, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 11711, + "time_per_iteration": 2.4934744834899902 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.01442647, + "balance_loss_mlp": 1.03375506, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.37459605810246, + "language_loss": 0.73933756, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76056558, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.64453125, + "step": 11712, + "time_per_iteration": 2.4414384365081787 + }, + { + "auxiliary_loss_clip": 0.01025006, + "auxiliary_loss_mlp": 0.01006413, + "balance_loss_clip": 1.00531662, + "balance_loss_mlp": 1.00448298, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8819337057085826, + "language_loss": 0.64707136, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66738558, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20507812, + "step": 11713, + "time_per_iteration": 3.1559205055236816 + }, + { + "auxiliary_loss_clip": 0.0109808, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.01370883, + "balance_loss_mlp": 1.03258777, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.337022160062714, + "language_loss": 0.72537225, + "learning_rate": 8.49172333023225e-07, + "loss": 0.74659657, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11714, + "time_per_iteration": 2.5274534225463867 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.01889992, + "balance_loss_mlp": 1.03500628, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.5791768588768047, + "language_loss": 0.79251838, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81382746, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11715, + "time_per_iteration": 2.423422336578369 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.02010405, + "balance_loss_mlp": 1.03536105, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.2156697071751204, + "language_loss": 0.71082246, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73216307, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11716, + "time_per_iteration": 2.407350540161133 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.0173167, + "balance_loss_mlp": 1.03666377, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 1.9148933155218295, + "language_loss": 0.66782308, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68916631, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 11717, + "time_per_iteration": 2.525740146636963 + }, + { + "auxiliary_loss_clip": 0.01102186, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.01697445, + "balance_loss_mlp": 1.03591442, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4782257349417278, + "language_loss": 0.7415244, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76283002, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11718, + "time_per_iteration": 2.5084335803985596 + }, + { + "auxiliary_loss_clip": 0.01100672, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.02055478, + "balance_loss_mlp": 1.03563166, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.542276447013311, + "language_loss": 0.79529881, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81661767, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11719, + "time_per_iteration": 3.982532024383545 + }, + { + "auxiliary_loss_clip": 0.011017, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02105546, + "balance_loss_mlp": 1.03649902, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.7315117799773545, + "language_loss": 0.65495813, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67630363, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11720, + "time_per_iteration": 2.613939046859741 + }, + { + "auxiliary_loss_clip": 0.01103943, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.01631165, + "balance_loss_mlp": 1.03645182, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.050842345880835, + "language_loss": 0.79890549, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82022321, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11721, + "time_per_iteration": 3.8472952842712402 + }, + { + "auxiliary_loss_clip": 0.01025354, + "auxiliary_loss_mlp": 0.0100049, + "balance_loss_clip": 0.99931604, + "balance_loss_mlp": 1.00471067, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7603746797437617, + "language_loss": 0.64777911, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66803753, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.20703125, + "step": 11722, + "time_per_iteration": 4.5988264083862305 + }, + { + "auxiliary_loss_clip": 0.01101223, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.02053356, + "balance_loss_mlp": 1.03678107, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.538856016334547, + "language_loss": 0.65742815, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67875266, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.64453125, + "step": 11723, + "time_per_iteration": 3.925845146179199 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01027901, + "balance_loss_clip": 1.01623046, + "balance_loss_mlp": 1.03655779, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.8916483795909507, + "language_loss": 0.81127882, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83259565, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11724, + "time_per_iteration": 2.4720969200134277 + }, + { + "auxiliary_loss_clip": 0.01100772, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.03349257, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 2.093101088286717, + "language_loss": 0.72902447, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75037366, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11725, + "time_per_iteration": 2.4452946186065674 + }, + { + "auxiliary_loss_clip": 0.01103396, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.01771188, + "balance_loss_mlp": 1.03560305, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.2997258543703847, + "language_loss": 0.78231096, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80363971, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11726, + "time_per_iteration": 2.4585561752319336 + }, + { + "auxiliary_loss_clip": 0.0110172, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.01683593, + "balance_loss_mlp": 1.0345757, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.8306322081887336, + "language_loss": 0.70494819, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72624636, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11727, + "time_per_iteration": 2.4121358394622803 + }, + { + "auxiliary_loss_clip": 0.01093352, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.01340246, + "balance_loss_mlp": 1.03211212, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.6678218850336868, + "language_loss": 0.69096273, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7121315, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.61328125, + "step": 11728, + "time_per_iteration": 2.5468525886535645 + }, + { + "auxiliary_loss_clip": 0.0110237, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.01771235, + "balance_loss_mlp": 1.03713453, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 3.1019246116397774, + "language_loss": 0.73087037, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75218427, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11729, + "time_per_iteration": 2.4827933311462402 + }, + { + "auxiliary_loss_clip": 0.01106229, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.02045989, + "balance_loss_mlp": 1.03594112, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.545567199994104, + "language_loss": 0.77897024, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80036438, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11730, + "time_per_iteration": 2.493704080581665 + }, + { + "auxiliary_loss_clip": 0.0110104, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01871312, + "balance_loss_mlp": 1.03360641, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.123896450725626, + "language_loss": 0.62706244, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64837468, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11731, + "time_per_iteration": 2.459735631942749 + }, + { + "auxiliary_loss_clip": 0.01103723, + "auxiliary_loss_mlp": 0.01025352, + "balance_loss_clip": 1.01358604, + "balance_loss_mlp": 1.03539586, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.3898643418724888, + "language_loss": 0.74733448, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76862514, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 11732, + "time_per_iteration": 2.4383316040039062 + }, + { + "auxiliary_loss_clip": 0.01100804, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.0146327, + "balance_loss_mlp": 1.03631091, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.6140204941030658, + "language_loss": 0.70913476, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73040134, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 11733, + "time_per_iteration": 2.463106632232666 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.01916051, + "balance_loss_mlp": 1.0332557, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.8693202683913837, + "language_loss": 0.73223364, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75351965, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 11734, + "time_per_iteration": 2.44874906539917 + }, + { + "auxiliary_loss_clip": 0.01105433, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02211308, + "balance_loss_mlp": 1.03561354, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.1460182030345423, + "language_loss": 0.69040471, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71180052, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 11735, + "time_per_iteration": 2.3848354816436768 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.02173638, + "balance_loss_mlp": 1.03526986, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.0775841009488105, + "language_loss": 0.72464728, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74602348, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 11736, + "time_per_iteration": 2.4738998413085938 + }, + { + "auxiliary_loss_clip": 0.01101906, + "auxiliary_loss_mlp": 0.01022502, + "balance_loss_clip": 1.01198161, + "balance_loss_mlp": 1.03716493, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.8965770447194195, + "language_loss": 0.69242585, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71366996, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11737, + "time_per_iteration": 2.486891031265259 + }, + { + "auxiliary_loss_clip": 0.01104553, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.02426147, + "balance_loss_mlp": 1.03642035, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 3.675344969023003, + "language_loss": 0.6783061, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69971591, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11738, + "time_per_iteration": 2.553422212600708 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.01684439, + "balance_loss_mlp": 1.03525221, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.822626738464323, + "language_loss": 0.75158858, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77288795, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11739, + "time_per_iteration": 2.7234206199645996 + }, + { + "auxiliary_loss_clip": 0.01096979, + "auxiliary_loss_mlp": 0.01024687, + "balance_loss_clip": 1.01327837, + "balance_loss_mlp": 1.03416896, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.6386606118434162, + "language_loss": 0.71622884, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73744547, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 11740, + "time_per_iteration": 2.5212409496307373 + }, + { + "auxiliary_loss_clip": 0.01101026, + "auxiliary_loss_mlp": 0.01025615, + "balance_loss_clip": 1.01523209, + "balance_loss_mlp": 1.03545165, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6725006196923968, + "language_loss": 0.81998235, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84124875, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 11741, + "time_per_iteration": 2.492769241333008 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.01662874, + "balance_loss_mlp": 1.03408957, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 3.596961466154263, + "language_loss": 0.78171599, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80300617, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11742, + "time_per_iteration": 2.4635274410247803 + }, + { + "auxiliary_loss_clip": 0.01105195, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.01668274, + "balance_loss_mlp": 1.03600883, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.573979132261771, + "language_loss": 0.64315516, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66449654, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 11743, + "time_per_iteration": 2.5026400089263916 + }, + { + "auxiliary_loss_clip": 0.01102792, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01475716, + "balance_loss_mlp": 1.03465271, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 2.3718798915613846, + "language_loss": 0.65446359, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67576003, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 11744, + "time_per_iteration": 2.458536386489868 + }, + { + "auxiliary_loss_clip": 0.01099117, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.0185678, + "balance_loss_mlp": 1.03479218, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.9180320114034342, + "language_loss": 0.6355719, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65686405, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 11745, + "time_per_iteration": 2.462301254272461 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.01734483, + "balance_loss_mlp": 1.03486073, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.843467279794647, + "language_loss": 0.71770209, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73900437, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 11746, + "time_per_iteration": 2.528543472290039 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.01882029, + "balance_loss_mlp": 1.03566575, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.4097258428408725, + "language_loss": 0.79373205, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81506121, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11747, + "time_per_iteration": 2.479778528213501 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.01698387, + "balance_loss_mlp": 1.03513098, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.7869226712906443, + "language_loss": 0.65377176, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67504573, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 11748, + "time_per_iteration": 2.4946584701538086 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.02267504, + "balance_loss_mlp": 1.03426147, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.990623957456742, + "language_loss": 0.79503167, + "learning_rate": 8.380507360077003e-07, + "loss": 0.8163904, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11749, + "time_per_iteration": 2.4612464904785156 + }, + { + "auxiliary_loss_clip": 0.01024671, + "auxiliary_loss_mlp": 0.010023, + "balance_loss_clip": 1.0011971, + "balance_loss_mlp": 1.00396466, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.788003911856545, + "language_loss": 0.54088426, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56115395, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20703125, + "step": 11750, + "time_per_iteration": 2.998089075088501 + }, + { + "auxiliary_loss_clip": 0.01103221, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01806545, + "balance_loss_mlp": 1.03667092, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 2.4248797762244725, + "language_loss": 0.7843067, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80563688, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11751, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01096512, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01685357, + "balance_loss_mlp": 1.03329563, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 1.7553518859924266, + "language_loss": 0.67958248, + "learning_rate": 8.370999604364634e-07, + "loss": 0.70082432, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 11752, + "time_per_iteration": 2.4724245071411133 + }, + { + "auxiliary_loss_clip": 0.01100964, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.02317405, + "balance_loss_mlp": 1.03582311, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.8550758527521567, + "language_loss": 0.76533222, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78668916, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 11753, + "time_per_iteration": 2.5033509731292725 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01026978, + "balance_loss_clip": 1.01595759, + "balance_loss_mlp": 1.03566098, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.8063663453491996, + "language_loss": 0.710163, + "learning_rate": 8.364663305220405e-07, + "loss": 0.73142445, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 11754, + "time_per_iteration": 2.47737979888916 + }, + { + "auxiliary_loss_clip": 0.01101217, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.02061772, + "balance_loss_mlp": 1.03515744, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.555791916243094, + "language_loss": 0.89167392, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91301078, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 11755, + "time_per_iteration": 2.4300765991210938 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.0210979, + "balance_loss_mlp": 1.03451729, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.6305430191953068, + "language_loss": 0.79877228, + "learning_rate": 8.358328770928678e-07, + "loss": 0.82009578, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 11756, + "time_per_iteration": 2.455738067626953 + }, + { + "auxiliary_loss_clip": 0.01025525, + "auxiliary_loss_mlp": 0.00998571, + "balance_loss_clip": 0.99742049, + "balance_loss_mlp": 1.00500059, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8167477619249136, + "language_loss": 0.60323715, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62347817, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 11757, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.0194478, + "balance_loss_mlp": 1.03670585, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 2.9383193028665335, + "language_loss": 0.80605227, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82738924, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66796875, + "step": 11758, + "time_per_iteration": 2.438985824584961 + }, + { + "auxiliary_loss_clip": 0.01097896, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02111876, + "balance_loss_mlp": 1.03326845, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 2.302594291056757, + "language_loss": 0.77111626, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79242271, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 11759, + "time_per_iteration": 2.6082146167755127 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.01839471, + "balance_loss_mlp": 1.03407705, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.8203560783968598, + "language_loss": 0.67900372, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70030731, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 11760, + "time_per_iteration": 2.4875950813293457 + }, + { + "auxiliary_loss_clip": 0.01103064, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.02223873, + "balance_loss_mlp": 1.03625393, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.8036620557159548, + "language_loss": 0.80104721, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82241285, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11761, + "time_per_iteration": 3.7999839782714844 + }, + { + "auxiliary_loss_clip": 0.01101999, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.03520203, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.4050467781095697, + "language_loss": 0.74975789, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77108127, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 11762, + "time_per_iteration": 2.408281087875366 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.01743984, + "balance_loss_mlp": 1.03630018, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.750455145965042, + "language_loss": 0.76771009, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78900343, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 11763, + "time_per_iteration": 3.8708484172821045 + }, + { + "auxiliary_loss_clip": 0.01101144, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.03537869, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.2813098001672527, + "language_loss": 0.78606045, + "learning_rate": 8.333008301499453e-07, + "loss": 0.8074019, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 11764, + "time_per_iteration": 3.926267623901367 + }, + { + "auxiliary_loss_clip": 0.0110389, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.02440739, + "balance_loss_mlp": 1.03585315, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.4902481922059967, + "language_loss": 0.79271352, + "learning_rate": 8.32984523242167e-07, + "loss": 0.8141185, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 11765, + "time_per_iteration": 3.9003517627716064 + }, + { + "auxiliary_loss_clip": 0.01097952, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.01503086, + "balance_loss_mlp": 1.03383851, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.6100965300159724, + "language_loss": 0.68550825, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70674151, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 11766, + "time_per_iteration": 2.4833571910858154 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.02058399, + "balance_loss_mlp": 1.03390872, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.8537677939151296, + "language_loss": 0.63282174, + "learning_rate": 8.323520421986352e-07, + "loss": 0.65415275, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11767, + "time_per_iteration": 2.4963812828063965 + }, + { + "auxiliary_loss_clip": 0.01100427, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.0175842, + "balance_loss_mlp": 1.03403151, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.4756633405104822, + "language_loss": 0.52592945, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54722404, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11768, + "time_per_iteration": 2.5584144592285156 + }, + { + "auxiliary_loss_clip": 0.01098381, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.01779723, + "balance_loss_mlp": 1.03422117, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 2.0331888903396296, + "language_loss": 0.75885397, + "learning_rate": 8.317197382644119e-07, + "loss": 0.78012145, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 11769, + "time_per_iteration": 2.474039077758789 + }, + { + "auxiliary_loss_clip": 0.01025061, + "auxiliary_loss_mlp": 0.01004429, + "balance_loss_clip": 1.00333822, + "balance_loss_mlp": 1.00454879, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8547700200374695, + "language_loss": 0.6197865, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64008141, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11770, + "time_per_iteration": 2.9852561950683594 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.02135992, + "balance_loss_mlp": 1.03515804, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.6682974029871904, + "language_loss": 0.76099932, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78236079, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 11771, + "time_per_iteration": 2.4772582054138184 + }, + { + "auxiliary_loss_clip": 0.01096997, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.01532149, + "balance_loss_mlp": 1.03349578, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.5504616161071019, + "language_loss": 0.71518672, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73641628, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 11772, + "time_per_iteration": 2.443416118621826 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.01823425, + "balance_loss_mlp": 1.03437555, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 2.392750359759926, + "language_loss": 0.69805288, + "learning_rate": 8.30455662107496e-07, + "loss": 0.7193799, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 11773, + "time_per_iteration": 2.4619219303131104 + }, + { + "auxiliary_loss_clip": 0.01101421, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02130961, + "balance_loss_mlp": 1.03520298, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.496714779410967, + "language_loss": 0.70210946, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72344756, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 11774, + "time_per_iteration": 2.446824073791504 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.03544569, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.5148638748080412, + "language_loss": 0.74460763, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76586962, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62890625, + "step": 11775, + "time_per_iteration": 2.482792854309082 + }, + { + "auxiliary_loss_clip": 0.01102892, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.01527333, + "balance_loss_mlp": 1.03621781, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.8403672382430083, + "language_loss": 0.86566663, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88696229, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11776, + "time_per_iteration": 2.425718069076538 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.01670027, + "balance_loss_mlp": 1.03438497, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.5328522694355011, + "language_loss": 0.74733853, + "learning_rate": 8.291922955383641e-07, + "loss": 0.76860321, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 11777, + "time_per_iteration": 2.4531426429748535 + }, + { + "auxiliary_loss_clip": 0.01106707, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.01738548, + "balance_loss_mlp": 1.0374651, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.558875929872249, + "language_loss": 0.82017881, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84153724, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 11778, + "time_per_iteration": 2.4829678535461426 + }, + { + "auxiliary_loss_clip": 0.01097091, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.0185461, + "balance_loss_mlp": 1.03495932, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.514152254548671, + "language_loss": 0.84892875, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87018347, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62109375, + "step": 11779, + "time_per_iteration": 2.484011173248291 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.03680944, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.0385221770512474, + "language_loss": 0.71657723, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73797828, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11780, + "time_per_iteration": 2.5964436531066895 + }, + { + "auxiliary_loss_clip": 0.0110027, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.01806879, + "balance_loss_mlp": 1.03559303, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4808752741388003, + "language_loss": 0.72866988, + "learning_rate": 8.279296393235256e-07, + "loss": 0.74996495, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 11781, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.01100497, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.02093256, + "balance_loss_mlp": 1.03541338, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.5571808268796947, + "language_loss": 0.77223784, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79355758, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11782, + "time_per_iteration": 2.4219703674316406 + }, + { + "auxiliary_loss_clip": 0.01098336, + "auxiliary_loss_mlp": 0.01025106, + "balance_loss_clip": 1.0149796, + "balance_loss_mlp": 1.03362107, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 3.8090510781636273, + "language_loss": 0.69602305, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71725744, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 11783, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.01103454, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.01744008, + "balance_loss_mlp": 1.03593731, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.6689397610891612, + "language_loss": 0.79052562, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81184721, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 11784, + "time_per_iteration": 2.443634271621704 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.01634157, + "balance_loss_mlp": 1.03534245, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 2.217987534439464, + "language_loss": 0.77291393, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79419351, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11785, + "time_per_iteration": 2.4818367958068848 + }, + { + "auxiliary_loss_clip": 0.01100759, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.01869774, + "balance_loss_mlp": 1.03610969, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.6474825992078, + "language_loss": 0.77668089, + "learning_rate": 8.26352319157738e-07, + "loss": 0.7979871, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 11786, + "time_per_iteration": 2.4843997955322266 + }, + { + "auxiliary_loss_clip": 0.01103028, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.01389718, + "balance_loss_mlp": 1.03586793, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.2462918540494865, + "language_loss": 0.78872836, + "learning_rate": 8.260369885912526e-07, + "loss": 0.8100096, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11787, + "time_per_iteration": 2.5082507133483887 + }, + { + "auxiliary_loss_clip": 0.01100945, + "auxiliary_loss_mlp": 0.01027499, + "balance_loss_clip": 1.01635325, + "balance_loss_mlp": 1.03544235, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.6974940078994716, + "language_loss": 0.76277357, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78405803, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11788, + "time_per_iteration": 2.4395945072174072 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.01596761, + "balance_loss_mlp": 1.03661728, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.1698748278708644, + "language_loss": 0.67896038, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70029634, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 11789, + "time_per_iteration": 2.4851551055908203 + }, + { + "auxiliary_loss_clip": 0.01102295, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.01540208, + "balance_loss_mlp": 1.0347805, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.6812027162903995, + "language_loss": 0.77360779, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79490614, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11790, + "time_per_iteration": 2.42874813079834 + }, + { + "auxiliary_loss_clip": 0.01105386, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01652074, + "balance_loss_mlp": 1.03555274, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 2.1572989383917864, + "language_loss": 0.70921314, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73055279, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 11791, + "time_per_iteration": 2.5331575870513916 + }, + { + "auxiliary_loss_clip": 0.01101819, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.01735473, + "balance_loss_mlp": 1.03576159, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 2.1052262710476968, + "language_loss": 0.81886566, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84017277, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11792, + "time_per_iteration": 2.4568569660186768 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.01563632, + "balance_loss_mlp": 1.03356898, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 1.890918416074432, + "language_loss": 0.64758253, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66888559, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 11793, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.011012, + "auxiliary_loss_mlp": 0.01027317, + "balance_loss_clip": 1.01664209, + "balance_loss_mlp": 1.03602946, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.7723759797175505, + "language_loss": 0.70710409, + "learning_rate": 8.238309217655133e-07, + "loss": 0.7283892, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11794, + "time_per_iteration": 2.4677059650421143 + }, + { + "auxiliary_loss_clip": 0.01102435, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01953435, + "balance_loss_mlp": 1.03833604, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.7023757586214014, + "language_loss": 0.75844228, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77976608, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 11795, + "time_per_iteration": 2.451152801513672 + }, + { + "auxiliary_loss_clip": 0.01100363, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01907492, + "balance_loss_mlp": 1.03473902, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.342459713927466, + "language_loss": 0.74982113, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77112198, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11796, + "time_per_iteration": 2.402853012084961 + }, + { + "auxiliary_loss_clip": 0.01106679, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.0188055, + "balance_loss_mlp": 1.03671682, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.6427166102656843, + "language_loss": 0.74295354, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76433539, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 11797, + "time_per_iteration": 2.4772911071777344 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01778328, + "balance_loss_mlp": 1.03599036, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.5744211833149133, + "language_loss": 0.79336572, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81467617, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11798, + "time_per_iteration": 2.470794677734375 + }, + { + "auxiliary_loss_clip": 0.0110133, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.02051234, + "balance_loss_mlp": 1.03513288, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.8971965021381223, + "language_loss": 0.66774857, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68908381, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11799, + "time_per_iteration": 2.4620981216430664 + }, + { + "auxiliary_loss_clip": 0.01102381, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.01489425, + "balance_loss_mlp": 1.03516233, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.8304913592304672, + "language_loss": 0.81343234, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83472508, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11800, + "time_per_iteration": 2.5046193599700928 + }, + { + "auxiliary_loss_clip": 0.01097772, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01481259, + "balance_loss_mlp": 1.03336954, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.8277069049900614, + "language_loss": 0.8660984, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88733006, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.64453125, + "step": 11801, + "time_per_iteration": 2.4158272743225098 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01996124, + "balance_loss_mlp": 1.03366089, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.7026819201034897, + "language_loss": 0.76157814, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78289014, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11802, + "time_per_iteration": 2.4612386226654053 + }, + { + "auxiliary_loss_clip": 0.01102987, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.02519536, + "balance_loss_mlp": 1.03526998, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 3.23871820936019, + "language_loss": 0.81726915, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83866572, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 11803, + "time_per_iteration": 3.975581407546997 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.01518524, + "balance_loss_mlp": 1.03458929, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.0140842961231047, + "language_loss": 0.67451382, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69580579, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11804, + "time_per_iteration": 2.4457967281341553 + }, + { + "auxiliary_loss_clip": 0.01096545, + "auxiliary_loss_mlp": 0.01024221, + "balance_loss_clip": 1.01398039, + "balance_loss_mlp": 1.03440809, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.740193690303794, + "language_loss": 0.78362393, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80483156, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 11805, + "time_per_iteration": 3.905280590057373 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.01693034, + "balance_loss_mlp": 1.03327656, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 3.0979433949045125, + "language_loss": 0.78634393, + "learning_rate": 8.200541796403667e-07, + "loss": 0.8076216, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 11806, + "time_per_iteration": 5.279039144515991 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.02109098, + "balance_loss_mlp": 1.03536928, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.519109679125039, + "language_loss": 0.56458282, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58590662, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 11807, + "time_per_iteration": 2.4814159870147705 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02743721, + "balance_loss_mlp": 1.03456068, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.0844100679096407, + "language_loss": 0.68413723, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70556688, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 11808, + "time_per_iteration": 2.425276279449463 + }, + { + "auxiliary_loss_clip": 0.01102073, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.01882625, + "balance_loss_mlp": 1.03456879, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.9066636961835672, + "language_loss": 0.71175826, + "learning_rate": 8.191110000362513e-07, + "loss": 0.733078, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.67578125, + "step": 11809, + "time_per_iteration": 2.4811971187591553 + }, + { + "auxiliary_loss_clip": 0.01025844, + "auxiliary_loss_mlp": 0.00998682, + "balance_loss_clip": 0.99747771, + "balance_loss_mlp": 1.00508428, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7498079844660932, + "language_loss": 0.59492218, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61516744, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.20703125, + "step": 11810, + "time_per_iteration": 3.1407463550567627 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.02154016, + "balance_loss_mlp": 1.03702438, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.5762923305466447, + "language_loss": 0.73988348, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76122749, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 11811, + "time_per_iteration": 2.4921576976776123 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.01024065, + "balance_loss_clip": 1.0132947, + "balance_loss_mlp": 1.03462029, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.6755141879364293, + "language_loss": 0.83260751, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85382551, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 11812, + "time_per_iteration": 2.4486024379730225 + }, + { + "auxiliary_loss_clip": 0.0110213, + "auxiliary_loss_mlp": 0.01025057, + "balance_loss_clip": 1.01317763, + "balance_loss_mlp": 1.0353713, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.6424398905568702, + "language_loss": 0.69810915, + "learning_rate": 8.178540541983716e-07, + "loss": 0.71938103, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 11813, + "time_per_iteration": 2.4982481002807617 + }, + { + "auxiliary_loss_clip": 0.01096572, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01479709, + "balance_loss_mlp": 1.03272831, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.8324166675871492, + "language_loss": 0.81685358, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83807397, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 11814, + "time_per_iteration": 2.4432296752929688 + }, + { + "auxiliary_loss_clip": 0.01100828, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01498675, + "balance_loss_mlp": 1.03533602, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 2.0936967568296594, + "language_loss": 0.75861955, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77989352, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 11815, + "time_per_iteration": 2.49507474899292 + }, + { + "auxiliary_loss_clip": 0.01098556, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.0192579, + "balance_loss_mlp": 1.03366482, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.6038639171669453, + "language_loss": 0.78608739, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80737698, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11816, + "time_per_iteration": 2.415172815322876 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02450991, + "balance_loss_mlp": 1.03593814, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.8614231241085628, + "language_loss": 0.8662678, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88764292, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 11817, + "time_per_iteration": 2.4507339000701904 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01768219, + "balance_loss_mlp": 1.03457141, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 3.9427784620989437, + "language_loss": 0.84360695, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86488771, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11818, + "time_per_iteration": 2.451037883758545 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.01552582, + "balance_loss_mlp": 1.03239679, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.103555241678178, + "language_loss": 0.75971746, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78098345, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 11819, + "time_per_iteration": 2.4669997692108154 + }, + { + "auxiliary_loss_clip": 0.01101813, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.01483393, + "balance_loss_mlp": 1.03531337, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.7430720805927078, + "language_loss": 0.70564902, + "learning_rate": 8.156561252835883e-07, + "loss": 0.7269485, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 11820, + "time_per_iteration": 2.454805612564087 + }, + { + "auxiliary_loss_clip": 0.01100228, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.01536262, + "balance_loss_mlp": 1.03519297, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.9533750259905485, + "language_loss": 0.75224185, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77350932, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 11821, + "time_per_iteration": 2.4534716606140137 + }, + { + "auxiliary_loss_clip": 0.01025147, + "auxiliary_loss_mlp": 0.01000031, + "balance_loss_clip": 0.99898189, + "balance_loss_mlp": 1.00464201, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7907699295335275, + "language_loss": 0.55060166, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57085341, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20507812, + "step": 11822, + "time_per_iteration": 3.0831096172332764 + }, + { + "auxiliary_loss_clip": 0.01095485, + "auxiliary_loss_mlp": 0.01025121, + "balance_loss_clip": 1.01383758, + "balance_loss_mlp": 1.03307807, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 1.9650661666731581, + "language_loss": 0.60139519, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62260121, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.625, + "step": 11823, + "time_per_iteration": 2.5066399574279785 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01848853, + "balance_loss_mlp": 1.03523636, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.216168272824083, + "language_loss": 0.71333873, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73463774, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11824, + "time_per_iteration": 2.4382858276367188 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01859653, + "balance_loss_mlp": 1.03347003, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.655325791752312, + "language_loss": 0.7270785, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74832082, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.62109375, + "step": 11825, + "time_per_iteration": 2.46207332611084 + }, + { + "auxiliary_loss_clip": 0.01102509, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.01849043, + "balance_loss_mlp": 1.0350585, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.7037190958225141, + "language_loss": 0.79228491, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81361139, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11826, + "time_per_iteration": 2.4977200031280518 + }, + { + "auxiliary_loss_clip": 0.01097466, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.02055109, + "balance_loss_mlp": 1.03329957, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.8095370005527254, + "language_loss": 0.83191311, + "learning_rate": 8.134603969799527e-07, + "loss": 0.8531996, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11827, + "time_per_iteration": 2.5329458713531494 + }, + { + "auxiliary_loss_clip": 0.01100333, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01757264, + "balance_loss_mlp": 1.03426528, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 27.265917209893804, + "language_loss": 0.62289751, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64419734, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11828, + "time_per_iteration": 2.601370096206665 + }, + { + "auxiliary_loss_clip": 0.01099233, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01993763, + "balance_loss_mlp": 1.03395164, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.4399488675180274, + "language_loss": 0.72070241, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74200517, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 11829, + "time_per_iteration": 2.534470796585083 + }, + { + "auxiliary_loss_clip": 0.01098293, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.01942253, + "balance_loss_mlp": 1.0337075, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.7046572375419429, + "language_loss": 0.80539268, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82667816, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11830, + "time_per_iteration": 2.4941787719726562 + }, + { + "auxiliary_loss_clip": 0.01100972, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03516912, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.6897013308211777, + "language_loss": 0.84117299, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86251217, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11831, + "time_per_iteration": 2.4908971786499023 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.01750183, + "balance_loss_mlp": 1.03430891, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.068922809184691, + "language_loss": 0.76956964, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79086405, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11832, + "time_per_iteration": 2.4407291412353516 + }, + { + "auxiliary_loss_clip": 0.01024653, + "auxiliary_loss_mlp": 0.0100495, + "balance_loss_clip": 1.00390673, + "balance_loss_mlp": 1.00420582, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7451484693360029, + "language_loss": 0.56659162, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58688766, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 11833, + "time_per_iteration": 2.9816091060638428 + }, + { + "auxiliary_loss_clip": 0.01097454, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.02226698, + "balance_loss_mlp": 1.0331707, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.6073221071178434, + "language_loss": 0.70877647, + "learning_rate": 8.11266873367315e-07, + "loss": 0.7300818, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11834, + "time_per_iteration": 2.478980541229248 + }, + { + "auxiliary_loss_clip": 0.01103011, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.0181601, + "balance_loss_mlp": 1.03596425, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.9914740179798254, + "language_loss": 0.79722375, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81855053, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11835, + "time_per_iteration": 2.479388952255249 + }, + { + "auxiliary_loss_clip": 0.01096967, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.0149014, + "balance_loss_mlp": 1.03320408, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.6571407536951757, + "language_loss": 0.7602039, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78142941, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 11836, + "time_per_iteration": 2.4998624324798584 + }, + { + "auxiliary_loss_clip": 0.01098563, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01905715, + "balance_loss_mlp": 1.03251767, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.7551754985161803, + "language_loss": 0.70438159, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72567105, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11837, + "time_per_iteration": 2.4985547065734863 + }, + { + "auxiliary_loss_clip": 0.01103208, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.02022099, + "balance_loss_mlp": 1.03518689, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.8053810542915782, + "language_loss": 0.61668026, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63803786, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 11838, + "time_per_iteration": 2.4703662395477295 + }, + { + "auxiliary_loss_clip": 0.01101169, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01982808, + "balance_loss_mlp": 1.03559279, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.6637536928847556, + "language_loss": 0.67472559, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69604766, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11839, + "time_per_iteration": 2.535466194152832 + }, + { + "auxiliary_loss_clip": 0.01101981, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01871097, + "balance_loss_mlp": 1.03578651, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 1.8263370197913231, + "language_loss": 0.84035689, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86167389, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 11840, + "time_per_iteration": 2.4370462894439697 + }, + { + "auxiliary_loss_clip": 0.01104281, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.01743066, + "balance_loss_mlp": 1.03657627, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 2.0089671894900243, + "language_loss": 0.76980072, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79113054, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 11841, + "time_per_iteration": 2.4408881664276123 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.01616335, + "balance_loss_mlp": 1.03546906, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.1117001145117595, + "language_loss": 0.74941587, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77071977, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11842, + "time_per_iteration": 2.4360697269439697 + }, + { + "auxiliary_loss_clip": 0.01024411, + "auxiliary_loss_mlp": 0.01002483, + "balance_loss_clip": 1.00127351, + "balance_loss_mlp": 1.00402236, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.784591330387751, + "language_loss": 0.61587965, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63614863, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20410156, + "step": 11843, + "time_per_iteration": 3.0296053886413574 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.01671815, + "balance_loss_mlp": 1.03480315, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.734640870802516, + "language_loss": 0.80089492, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82216763, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 11844, + "time_per_iteration": 3.887108325958252 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.01913476, + "balance_loss_mlp": 1.03288889, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.1905334361731326, + "language_loss": 0.78714418, + "learning_rate": 8.078243718677873e-07, + "loss": 0.80843902, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11845, + "time_per_iteration": 2.410975456237793 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.02026939, + "balance_loss_mlp": 1.03620291, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 1.893878343442594, + "language_loss": 0.76888061, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79019481, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 11846, + "time_per_iteration": 3.931493043899536 + }, + { + "auxiliary_loss_clip": 0.01102634, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.0170877, + "balance_loss_mlp": 1.03620863, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.9372499520787854, + "language_loss": 0.58303821, + "learning_rate": 8.071990497380421e-07, + "loss": 0.6043539, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11847, + "time_per_iteration": 3.8361809253692627 + }, + { + "auxiliary_loss_clip": 0.01097288, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01974046, + "balance_loss_mlp": 1.03439856, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.4312853577961298, + "language_loss": 0.71475565, + "learning_rate": 8.068864565139395e-07, + "loss": 0.7360397, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.62890625, + "step": 11848, + "time_per_iteration": 3.985182523727417 + }, + { + "auxiliary_loss_clip": 0.01025097, + "auxiliary_loss_mlp": 0.01002394, + "balance_loss_clip": 1.00141037, + "balance_loss_mlp": 1.00462532, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8575731984951991, + "language_loss": 0.63123107, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65150595, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 11849, + "time_per_iteration": 3.0350046157836914 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.02040517, + "balance_loss_mlp": 1.03554058, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.4965357236983527, + "language_loss": 0.63742816, + "learning_rate": 8.0626140580654e-07, + "loss": 0.65877146, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 11850, + "time_per_iteration": 2.6502671241760254 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.0185765, + "balance_loss_mlp": 1.03538823, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.4672764564322482, + "language_loss": 0.69679284, + "learning_rate": 8.05948948346946e-07, + "loss": 0.71810615, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11851, + "time_per_iteration": 2.495501756668091 + }, + { + "auxiliary_loss_clip": 0.01100247, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.02083778, + "balance_loss_mlp": 1.03549206, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.4895655159302474, + "language_loss": 0.83113164, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85244817, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11852, + "time_per_iteration": 2.510340929031372 + }, + { + "auxiliary_loss_clip": 0.01103516, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.0193572, + "balance_loss_mlp": 1.03595805, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.258616053920704, + "language_loss": 0.73188543, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75323689, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11853, + "time_per_iteration": 2.4003355503082275 + }, + { + "auxiliary_loss_clip": 0.01096006, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.0173521, + "balance_loss_mlp": 1.03375578, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.9420602082674068, + "language_loss": 0.92091542, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94214988, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 11854, + "time_per_iteration": 2.4623403549194336 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.01910615, + "balance_loss_mlp": 1.0353142, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 2.0934387752470403, + "language_loss": 0.79594553, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81724572, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11855, + "time_per_iteration": 2.442281484603882 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02083373, + "balance_loss_mlp": 1.0347116, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.6650252891937876, + "language_loss": 0.72577047, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74711072, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66015625, + "step": 11856, + "time_per_iteration": 2.47229266166687 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01880825, + "balance_loss_mlp": 1.03564286, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.6411446267606922, + "language_loss": 0.70082289, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72215885, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 11857, + "time_per_iteration": 2.4524147510528564 + }, + { + "auxiliary_loss_clip": 0.01098237, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.01744556, + "balance_loss_mlp": 1.03391576, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.116428788246258, + "language_loss": 0.85496008, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87623537, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 11858, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01795506, + "balance_loss_mlp": 1.03623009, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.608889007430339, + "language_loss": 0.80293894, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82428539, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11859, + "time_per_iteration": 2.4199166297912598 + }, + { + "auxiliary_loss_clip": 0.010985, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02047169, + "balance_loss_mlp": 1.03418899, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.1635938409015476, + "language_loss": 0.68921995, + "learning_rate": 8.031388701659456e-07, + "loss": 0.710522, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 11860, + "time_per_iteration": 2.779348373413086 + }, + { + "auxiliary_loss_clip": 0.01101605, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.01661134, + "balance_loss_mlp": 1.03528762, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.9453238784757083, + "language_loss": 0.64468431, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66598678, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11861, + "time_per_iteration": 2.4438693523406982 + }, + { + "auxiliary_loss_clip": 0.01106949, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01536191, + "balance_loss_mlp": 1.03813672, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.665544522358975, + "language_loss": 0.67246974, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69381201, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 11862, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.01099005, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.02357876, + "balance_loss_mlp": 1.03554285, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 2.1581150638153117, + "language_loss": 0.66787547, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68920541, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 11863, + "time_per_iteration": 2.508451223373413 + }, + { + "auxiliary_loss_clip": 0.01107413, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.02361131, + "balance_loss_mlp": 1.03781486, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 9.155363012323315, + "language_loss": 0.65499818, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67643076, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 11864, + "time_per_iteration": 2.4946515560150146 + }, + { + "auxiliary_loss_clip": 0.0110368, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.01631117, + "balance_loss_mlp": 1.03640735, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.87343338578939, + "language_loss": 0.85730636, + "learning_rate": 8.015793035467697e-07, + "loss": 0.87862539, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11865, + "time_per_iteration": 2.42283296585083 + }, + { + "auxiliary_loss_clip": 0.01100738, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.01554251, + "balance_loss_mlp": 1.03419256, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 1.8472790526640706, + "language_loss": 0.74752319, + "learning_rate": 8.012675265083304e-07, + "loss": 0.76880735, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11866, + "time_per_iteration": 2.4545392990112305 + }, + { + "auxiliary_loss_clip": 0.01104452, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.02232265, + "balance_loss_mlp": 1.03757143, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 2.6643205457919477, + "language_loss": 0.70109868, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72248805, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11867, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.01518393, + "balance_loss_mlp": 1.03465641, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 6.705448377548921, + "language_loss": 0.71701014, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73825878, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11868, + "time_per_iteration": 2.4669320583343506 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.01509547, + "balance_loss_mlp": 1.03705835, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.2157289852805278, + "language_loss": 0.65810573, + "learning_rate": 8.003324681766286e-07, + "loss": 0.67943501, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 11869, + "time_per_iteration": 2.452075242996216 + }, + { + "auxiliary_loss_clip": 0.01100077, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.01321864, + "balance_loss_mlp": 1.03367877, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5172430207890026, + "language_loss": 0.77797884, + "learning_rate": 8.000208730333298e-07, + "loss": 0.79922271, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 11870, + "time_per_iteration": 2.497041940689087 + }, + { + "auxiliary_loss_clip": 0.01101931, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.0176903, + "balance_loss_mlp": 1.03650808, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.6309688506128002, + "language_loss": 0.80767673, + "learning_rate": 7.997093233933597e-07, + "loss": 0.82899404, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 11871, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.01102602, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02396536, + "balance_loss_mlp": 1.03430688, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.5882335500802451, + "language_loss": 0.78899664, + "learning_rate": 7.993978192685331e-07, + "loss": 0.8103888, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 11872, + "time_per_iteration": 2.4607558250427246 + }, + { + "auxiliary_loss_clip": 0.01102685, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.035676, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.27961967349627, + "language_loss": 0.84102201, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86231267, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11873, + "time_per_iteration": 2.4343557357788086 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.01584864, + "balance_loss_mlp": 1.03362751, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.9049541609511427, + "language_loss": 0.86355829, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88479608, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 11874, + "time_per_iteration": 2.4541850090026855 + }, + { + "auxiliary_loss_clip": 0.01102173, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01449037, + "balance_loss_mlp": 1.0344789, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.8939539946065194, + "language_loss": 0.82938111, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85066295, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 11875, + "time_per_iteration": 2.4051244258880615 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01805401, + "balance_loss_mlp": 1.03582454, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.8810853083413022, + "language_loss": 0.69459707, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71597898, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 11876, + "time_per_iteration": 2.461815595626831 + }, + { + "auxiliary_loss_clip": 0.0110347, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01702094, + "balance_loss_mlp": 1.0361371, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 1.9368833564249184, + "language_loss": 0.78070778, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80203062, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 11877, + "time_per_iteration": 2.420319080352783 + }, + { + "auxiliary_loss_clip": 0.01100487, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.02102709, + "balance_loss_mlp": 1.0355581, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 6.763182431425842, + "language_loss": 0.69534928, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71667153, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 11878, + "time_per_iteration": 2.544290781021118 + }, + { + "auxiliary_loss_clip": 0.01100118, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.02254677, + "balance_loss_mlp": 1.03579926, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 1.7269032775367334, + "language_loss": 0.679344, + "learning_rate": 7.972185658107535e-07, + "loss": 0.70067525, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 11879, + "time_per_iteration": 2.4966022968292236 + }, + { + "auxiliary_loss_clip": 0.0110079, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01867926, + "balance_loss_mlp": 1.03534412, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.8974430539108489, + "language_loss": 0.68789601, + "learning_rate": 7.969074262321646e-07, + "loss": 0.70921516, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 11880, + "time_per_iteration": 2.502960205078125 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.02314401, + "balance_loss_mlp": 1.03362322, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.4282585669500105, + "language_loss": 0.80370951, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82507718, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 11881, + "time_per_iteration": 2.470723867416382 + }, + { + "auxiliary_loss_clip": 0.01100316, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01772344, + "balance_loss_mlp": 1.03443766, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.561021120261205, + "language_loss": 0.63214886, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65343523, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.66015625, + "step": 11882, + "time_per_iteration": 2.509657859802246 + }, + { + "auxiliary_loss_clip": 0.01104591, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01627707, + "balance_loss_mlp": 1.03739905, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 2.019106640227393, + "language_loss": 0.68898022, + "learning_rate": 7.959742812719304e-07, + "loss": 0.71029788, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 11883, + "time_per_iteration": 2.443070650100708 + }, + { + "auxiliary_loss_clip": 0.01101954, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.02256155, + "balance_loss_mlp": 1.03674269, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.8254173167373133, + "language_loss": 0.77734333, + "learning_rate": 7.956633242496788e-07, + "loss": 0.7987051, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 11884, + "time_per_iteration": 2.498660087585449 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01801157, + "balance_loss_mlp": 1.03647792, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.2601581794211456, + "language_loss": 0.73881954, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76020128, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 11885, + "time_per_iteration": 2.4516425132751465 + }, + { + "auxiliary_loss_clip": 0.01024577, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00262451, + "balance_loss_mlp": 1.00405157, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8858821646270937, + "language_loss": 0.66354322, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68382668, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 11886, + "time_per_iteration": 4.428006649017334 + }, + { + "auxiliary_loss_clip": 0.01101529, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.01769543, + "balance_loss_mlp": 1.03508115, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 2.6640943514117006, + "language_loss": 0.75138283, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77269423, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11887, + "time_per_iteration": 2.449885129928589 + }, + { + "auxiliary_loss_clip": 0.01102615, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.01408505, + "balance_loss_mlp": 1.03539872, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.6754616856197402, + "language_loss": 0.71326733, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73454678, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11888, + "time_per_iteration": 3.880155086517334 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01867044, + "balance_loss_mlp": 1.03440201, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 1.7956471800089868, + "language_loss": 0.84206235, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86340851, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 11889, + "time_per_iteration": 3.8910415172576904 + }, + { + "auxiliary_loss_clip": 0.01102924, + "auxiliary_loss_mlp": 0.01024297, + "balance_loss_clip": 1.0128237, + "balance_loss_mlp": 1.0358007, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 2.5861869043572994, + "language_loss": 0.75895607, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78022826, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11890, + "time_per_iteration": 3.942615270614624 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.01834106, + "balance_loss_mlp": 1.03469455, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.4697874617816058, + "language_loss": 0.74033976, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76163059, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 11891, + "time_per_iteration": 2.5003371238708496 + }, + { + "auxiliary_loss_clip": 0.01101426, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.02201867, + "balance_loss_mlp": 1.03495193, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 2.2548483440838676, + "language_loss": 0.68382698, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70517445, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11892, + "time_per_iteration": 2.431938409805298 + }, + { + "auxiliary_loss_clip": 0.0110488, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02009463, + "balance_loss_mlp": 1.03543699, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.391594593507675, + "language_loss": 0.73810261, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75948846, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 11893, + "time_per_iteration": 2.487308979034424 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.01721692, + "balance_loss_mlp": 1.03571689, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.3568611580959016, + "language_loss": 0.65677148, + "learning_rate": 7.925562677431185e-07, + "loss": 0.6781069, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11894, + "time_per_iteration": 2.4283459186553955 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01771545, + "balance_loss_mlp": 1.03522325, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6791953890758138, + "language_loss": 0.77629852, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79762185, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 11895, + "time_per_iteration": 2.478421926498413 + }, + { + "auxiliary_loss_clip": 0.01103559, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01749945, + "balance_loss_mlp": 1.03565729, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 2.101834953638121, + "language_loss": 0.69718951, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71852922, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 11896, + "time_per_iteration": 2.4343297481536865 + }, + { + "auxiliary_loss_clip": 0.01102918, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.03019083, + "balance_loss_mlp": 1.03482461, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.363966655291517, + "language_loss": 0.86399305, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88544941, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11897, + "time_per_iteration": 2.4417433738708496 + }, + { + "auxiliary_loss_clip": 0.01100281, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.02097273, + "balance_loss_mlp": 1.0358789, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.1825882164427015, + "language_loss": 0.77925879, + "learning_rate": 7.913147264997304e-07, + "loss": 0.8005845, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 11898, + "time_per_iteration": 2.4770331382751465 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01695776, + "balance_loss_mlp": 1.03606868, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.8319920355445916, + "language_loss": 0.73037088, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75171709, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11899, + "time_per_iteration": 2.4661285877227783 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.02177763, + "balance_loss_mlp": 1.03431213, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 3.247812809543318, + "language_loss": 0.76076663, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78210765, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11900, + "time_per_iteration": 2.4811995029449463 + }, + { + "auxiliary_loss_clip": 0.01103689, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.01656795, + "balance_loss_mlp": 1.03645658, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.955266248567226, + "language_loss": 0.80275625, + "learning_rate": 7.903840517773886e-07, + "loss": 0.82407176, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 11901, + "time_per_iteration": 2.423145294189453 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.01728368, + "balance_loss_mlp": 1.0356729, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 2.026904555565968, + "language_loss": 0.81071323, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83205605, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 11902, + "time_per_iteration": 2.459885835647583 + }, + { + "auxiliary_loss_clip": 0.01101351, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01437926, + "balance_loss_mlp": 1.03461826, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 1.7500024281838862, + "language_loss": 0.68114519, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70241332, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 11903, + "time_per_iteration": 2.407540798187256 + }, + { + "auxiliary_loss_clip": 0.01098245, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01664829, + "balance_loss_mlp": 1.03362346, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.6395674800408413, + "language_loss": 0.76098162, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78224206, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11904, + "time_per_iteration": 2.4763503074645996 + }, + { + "auxiliary_loss_clip": 0.01102193, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02438211, + "balance_loss_mlp": 1.03558111, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.193780720610546, + "language_loss": 0.72085339, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74224472, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 11905, + "time_per_iteration": 2.42999267578125 + }, + { + "auxiliary_loss_clip": 0.01099839, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.01815557, + "balance_loss_mlp": 1.03396761, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.8001319449198983, + "language_loss": 0.78033888, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80163181, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 11906, + "time_per_iteration": 2.483344078063965 + }, + { + "auxiliary_loss_clip": 0.01024215, + "auxiliary_loss_mlp": 0.01002687, + "balance_loss_clip": 1.0016793, + "balance_loss_mlp": 1.00366879, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.733638122069069, + "language_loss": 0.55290663, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57317567, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20507812, + "step": 11907, + "time_per_iteration": 2.9801692962646484 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.01846945, + "balance_loss_mlp": 1.03456974, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.7110812642484816, + "language_loss": 0.69928622, + "learning_rate": 7.882140833804593e-07, + "loss": 0.72059584, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11908, + "time_per_iteration": 2.4816782474517822 + }, + { + "auxiliary_loss_clip": 0.01102562, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.01918399, + "balance_loss_mlp": 1.03589427, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.7432604153438784, + "language_loss": 0.71158898, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73293138, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66796875, + "step": 11909, + "time_per_iteration": 2.463728189468384 + }, + { + "auxiliary_loss_clip": 0.01102467, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01932335, + "balance_loss_mlp": 1.0351603, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 2.4467362846605014, + "language_loss": 0.75301147, + "learning_rate": 7.875945057930144e-07, + "loss": 0.7743445, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11910, + "time_per_iteration": 2.552417755126953 + }, + { + "auxiliary_loss_clip": 0.01101078, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.02067399, + "balance_loss_mlp": 1.03550065, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.495993401769944, + "language_loss": 0.7667104, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78802884, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.65625, + "step": 11911, + "time_per_iteration": 2.441070079803467 + }, + { + "auxiliary_loss_clip": 0.01101901, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.01610255, + "balance_loss_mlp": 1.03523242, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.748429659384578, + "language_loss": 0.58908474, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61038435, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 11912, + "time_per_iteration": 2.857440948486328 + }, + { + "auxiliary_loss_clip": 0.01101647, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02072561, + "balance_loss_mlp": 1.03633833, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 2.5901065267477907, + "language_loss": 0.77851343, + "learning_rate": 7.866654842502376e-07, + "loss": 0.79985595, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11913, + "time_per_iteration": 2.4704270362854004 + }, + { + "auxiliary_loss_clip": 0.01097344, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.01518047, + "balance_loss_mlp": 1.03362048, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.6674872832299297, + "language_loss": 0.7374261, + "learning_rate": 7.863559024065234e-07, + "loss": 0.75865406, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 11914, + "time_per_iteration": 2.4930355548858643 + }, + { + "auxiliary_loss_clip": 0.01097032, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03384876, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.6897507669283607, + "language_loss": 0.74089867, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76215488, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 11915, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01101198, + "auxiliary_loss_mlp": 0.01026687, + "balance_loss_clip": 1.01569629, + "balance_loss_mlp": 1.03444886, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.8754792377471143, + "language_loss": 0.81102198, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83230084, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11916, + "time_per_iteration": 2.459618330001831 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.01945496, + "balance_loss_mlp": 1.03565669, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.9464707558133532, + "language_loss": 0.68163168, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70295465, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11917, + "time_per_iteration": 2.4127745628356934 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01660061, + "balance_loss_mlp": 1.0357312, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.6865560164096236, + "language_loss": 0.75851363, + "learning_rate": 7.851180353640896e-07, + "loss": 0.77981341, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11918, + "time_per_iteration": 2.4734885692596436 + }, + { + "auxiliary_loss_clip": 0.01024332, + "auxiliary_loss_mlp": 0.00998276, + "balance_loss_clip": 0.99721545, + "balance_loss_mlp": 1.00387406, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6281271868389183, + "language_loss": 0.53900385, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55922985, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20507812, + "step": 11919, + "time_per_iteration": 3.0739991664886475 + }, + { + "auxiliary_loss_clip": 0.01103551, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.0166924, + "balance_loss_mlp": 1.03664875, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 1.814886397013554, + "language_loss": 0.69109583, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71240735, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11920, + "time_per_iteration": 2.544965982437134 + }, + { + "auxiliary_loss_clip": 0.01101615, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.02296519, + "balance_loss_mlp": 1.03518677, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 2.316743559144869, + "language_loss": 0.74621791, + "learning_rate": 7.841901187598678e-07, + "loss": 0.76758158, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11921, + "time_per_iteration": 2.526437282562256 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01924133, + "balance_loss_mlp": 1.0359118, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.001999520631163, + "language_loss": 0.75461966, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77600539, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 11922, + "time_per_iteration": 2.4796934127807617 + }, + { + "auxiliary_loss_clip": 0.0102356, + "auxiliary_loss_mlp": 0.01004637, + "balance_loss_clip": 1.0036118, + "balance_loss_mlp": 1.00325036, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7529363745673505, + "language_loss": 0.55118704, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57146901, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.203125, + "step": 11923, + "time_per_iteration": 2.8653676509857178 + }, + { + "auxiliary_loss_clip": 0.01101474, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.02191389, + "balance_loss_mlp": 1.03463423, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.5657552163313224, + "language_loss": 0.7707153, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79207051, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 11924, + "time_per_iteration": 2.4798498153686523 + }, + { + "auxiliary_loss_clip": 0.01099287, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.01563597, + "balance_loss_mlp": 1.03447676, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.8554693193395075, + "language_loss": 0.68279767, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70405436, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11925, + "time_per_iteration": 2.456970453262329 + }, + { + "auxiliary_loss_clip": 0.01094381, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.0189774, + "balance_loss_mlp": 1.03209913, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.4556850136555692, + "language_loss": 0.77406371, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79530406, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62109375, + "step": 11926, + "time_per_iteration": 2.47904109954834 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01756239, + "balance_loss_mlp": 1.03482664, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 1.9978148890029475, + "language_loss": 0.77397847, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79532105, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 11927, + "time_per_iteration": 2.4695799350738525 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.01752985, + "balance_loss_mlp": 1.03606367, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.633304495033459, + "language_loss": 0.69208646, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71338403, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.640625, + "step": 11928, + "time_per_iteration": 3.8939363956451416 + }, + { + "auxiliary_loss_clip": 0.01097285, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.01755297, + "balance_loss_mlp": 1.03416717, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.8722089290497335, + "language_loss": 0.65309197, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67434746, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 11929, + "time_per_iteration": 2.6483962535858154 + }, + { + "auxiliary_loss_clip": 0.01101349, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.01543295, + "balance_loss_mlp": 1.03426468, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 1.9043937603193066, + "language_loss": 0.69810534, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71938944, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11930, + "time_per_iteration": 3.861077308654785 + }, + { + "auxiliary_loss_clip": 0.01099761, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.01666808, + "balance_loss_mlp": 1.0326252, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.6949604037705792, + "language_loss": 0.80755305, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82882911, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 11931, + "time_per_iteration": 5.3606438636779785 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.02060318, + "balance_loss_mlp": 1.03377175, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.0368865181542843, + "language_loss": 0.78136313, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8026641, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11932, + "time_per_iteration": 2.4471938610076904 + }, + { + "auxiliary_loss_clip": 0.01098455, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.01697493, + "balance_loss_mlp": 1.03404713, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 2.4237059069381446, + "language_loss": 0.75071502, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77197808, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 11933, + "time_per_iteration": 2.5671815872192383 + }, + { + "auxiliary_loss_clip": 0.01107402, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.03681624, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.920076286079433, + "language_loss": 0.69595957, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71737969, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 11934, + "time_per_iteration": 2.4200711250305176 + }, + { + "auxiliary_loss_clip": 0.0109937, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.02228904, + "balance_loss_mlp": 1.03382134, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.1353095308292858, + "language_loss": 0.86605275, + "learning_rate": 7.798653327195237e-07, + "loss": 0.8873809, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 11935, + "time_per_iteration": 2.4989066123962402 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01663518, + "balance_loss_mlp": 1.03355277, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.5482941622525788, + "language_loss": 0.73668665, + "learning_rate": 7.795567660576388e-07, + "loss": 0.75797164, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11936, + "time_per_iteration": 2.5941200256347656 + }, + { + "auxiliary_loss_clip": 0.01023485, + "auxiliary_loss_mlp": 0.00998978, + "balance_loss_clip": 0.99795878, + "balance_loss_mlp": 1.00320697, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7612162175951352, + "language_loss": 0.5594666, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57969117, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.203125, + "step": 11937, + "time_per_iteration": 3.0358333587646484 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.0194416, + "balance_loss_mlp": 1.03557646, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.9834308333096748, + "language_loss": 0.54777831, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56913126, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11938, + "time_per_iteration": 2.498337984085083 + }, + { + "auxiliary_loss_clip": 0.01096235, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01791584, + "balance_loss_mlp": 1.03201962, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.6763116198702877, + "language_loss": 0.76891506, + "learning_rate": 7.786313437947527e-07, + "loss": 0.79016298, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11939, + "time_per_iteration": 2.4648613929748535 + }, + { + "auxiliary_loss_clip": 0.0102339, + "auxiliary_loss_mlp": 0.01004556, + "balance_loss_clip": 1.00347769, + "balance_loss_mlp": 1.003003, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7581492176008457, + "language_loss": 0.61391574, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63419521, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.20410156, + "step": 11940, + "time_per_iteration": 3.0383803844451904 + }, + { + "auxiliary_loss_clip": 0.0109722, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.01903307, + "balance_loss_mlp": 1.03327632, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.5272164711726817, + "language_loss": 0.58784437, + "learning_rate": 7.780146271721097e-07, + "loss": 0.60911918, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11941, + "time_per_iteration": 2.5290164947509766 + }, + { + "auxiliary_loss_clip": 0.01100557, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.0178616, + "balance_loss_mlp": 1.03522658, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 1.9189885732421792, + "language_loss": 0.79849315, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81979108, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 11942, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02638674, + "balance_loss_mlp": 1.03522158, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.0597149659122636, + "language_loss": 0.66328835, + "learning_rate": 7.773980959006968e-07, + "loss": 0.6846866, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 11943, + "time_per_iteration": 2.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.01097892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01696157, + "balance_loss_mlp": 1.03440082, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.9764370465475432, + "language_loss": 0.79013598, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81140125, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6328125, + "step": 11944, + "time_per_iteration": 2.445718765258789 + }, + { + "auxiliary_loss_clip": 0.01102899, + "auxiliary_loss_mlp": 0.01036625, + "balance_loss_clip": 1.02368522, + "balance_loss_mlp": 1.03503132, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.260846776642364, + "language_loss": 0.62923992, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65063506, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 11945, + "time_per_iteration": 2.4455084800720215 + }, + { + "auxiliary_loss_clip": 0.01023274, + "auxiliary_loss_mlp": 0.01000772, + "balance_loss_clip": 0.99966967, + "balance_loss_mlp": 1.00287986, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7012511616617018, + "language_loss": 0.51091176, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53115225, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20410156, + "step": 11946, + "time_per_iteration": 2.993520498275757 + }, + { + "auxiliary_loss_clip": 0.0110474, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.0228188, + "balance_loss_mlp": 1.03633344, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.5511387132101104, + "language_loss": 0.74426639, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76567119, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 11947, + "time_per_iteration": 2.5280697345733643 + }, + { + "auxiliary_loss_clip": 0.01098111, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.01682568, + "balance_loss_mlp": 1.03252912, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.7377460165223417, + "language_loss": 0.72264934, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74391532, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 11948, + "time_per_iteration": 2.404911994934082 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.0260042, + "balance_loss_mlp": 1.0358839, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.5225277290119825, + "language_loss": 0.71613109, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73755664, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11949, + "time_per_iteration": 2.4918761253356934 + }, + { + "auxiliary_loss_clip": 0.01099737, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01983905, + "balance_loss_mlp": 1.03520155, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.917874476636917, + "language_loss": 0.75913876, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78044307, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11950, + "time_per_iteration": 2.4783732891082764 + }, + { + "auxiliary_loss_clip": 0.01104047, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01741719, + "balance_loss_mlp": 1.03651667, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.3664494047814872, + "language_loss": 0.67457062, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69590974, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 11951, + "time_per_iteration": 2.4524526596069336 + }, + { + "auxiliary_loss_clip": 0.01106378, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.03777874, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.7288194945229958, + "language_loss": 0.78023463, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80159694, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 11952, + "time_per_iteration": 2.49094295501709 + }, + { + "auxiliary_loss_clip": 0.01103687, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.02052677, + "balance_loss_mlp": 1.03563547, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 1.7793096783925773, + "language_loss": 0.74963003, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77099729, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11953, + "time_per_iteration": 2.500009298324585 + }, + { + "auxiliary_loss_clip": 0.01102038, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.0169332, + "balance_loss_mlp": 1.03495383, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.7832624252992626, + "language_loss": 0.72971594, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75102234, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11954, + "time_per_iteration": 2.4608652591705322 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02251804, + "balance_loss_mlp": 1.03668714, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.601255234350909, + "language_loss": 0.74186033, + "learning_rate": 7.737028058829425e-07, + "loss": 0.7632345, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11955, + "time_per_iteration": 2.474217176437378 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.01877582, + "balance_loss_mlp": 1.0353359, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.6751832358498482, + "language_loss": 0.73376679, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75508881, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11956, + "time_per_iteration": 2.5315232276916504 + }, + { + "auxiliary_loss_clip": 0.01101581, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01688766, + "balance_loss_mlp": 1.03342509, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 2.7995163806109407, + "language_loss": 0.7065621, + "learning_rate": 7.730875746869987e-07, + "loss": 0.72786307, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 11957, + "time_per_iteration": 2.479146957397461 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.02427661, + "balance_loss_mlp": 1.03408146, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.9581401117139001, + "language_loss": 0.73586559, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75725639, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 11958, + "time_per_iteration": 2.50201416015625 + }, + { + "auxiliary_loss_clip": 0.01099164, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.0223763, + "balance_loss_mlp": 1.03451216, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.602205422840009, + "language_loss": 0.84252381, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86385846, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 11959, + "time_per_iteration": 2.4619383811950684 + }, + { + "auxiliary_loss_clip": 0.01107021, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.0378958, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.9033832243828488, + "language_loss": 0.81933033, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84070033, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 11960, + "time_per_iteration": 2.4611432552337646 + }, + { + "auxiliary_loss_clip": 0.01100369, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.01926565, + "balance_loss_mlp": 1.03601289, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.6005750914484573, + "language_loss": 0.77382779, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79514658, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.64453125, + "step": 11961, + "time_per_iteration": 2.490257978439331 + }, + { + "auxiliary_loss_clip": 0.01096696, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01970601, + "balance_loss_mlp": 1.03359604, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.493885754938081, + "language_loss": 0.75197971, + "learning_rate": 7.715503110824326e-07, + "loss": 0.7732504, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 11962, + "time_per_iteration": 2.444990873336792 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.01888692, + "balance_loss_mlp": 1.03441834, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.6695078722173347, + "language_loss": 0.75041807, + "learning_rate": 7.712429980637001e-07, + "loss": 0.7717514, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 11963, + "time_per_iteration": 2.4661693572998047 + }, + { + "auxiliary_loss_clip": 0.01105424, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02045298, + "balance_loss_mlp": 1.03614235, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 8.488875605489067, + "language_loss": 0.80680382, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82819521, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 11964, + "time_per_iteration": 2.400843620300293 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.01819539, + "balance_loss_mlp": 1.03335524, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.6851421500357613, + "language_loss": 0.74987501, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77116108, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11965, + "time_per_iteration": 2.455549955368042 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.01950407, + "balance_loss_mlp": 1.03589249, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.6225024257282918, + "language_loss": 0.77548587, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79684699, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11966, + "time_per_iteration": 2.4651193618774414 + }, + { + "auxiliary_loss_clip": 0.01101346, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.01745796, + "balance_loss_mlp": 1.03470814, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.9626871533411263, + "language_loss": 0.72638512, + "learning_rate": 7.700142120511619e-07, + "loss": 0.74769109, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11967, + "time_per_iteration": 2.4732322692871094 + }, + { + "auxiliary_loss_clip": 0.01098168, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01812136, + "balance_loss_mlp": 1.03623199, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.8100027522509434, + "language_loss": 0.81220973, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83347309, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 11968, + "time_per_iteration": 2.4276745319366455 + }, + { + "auxiliary_loss_clip": 0.01098632, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.01651216, + "balance_loss_mlp": 1.03366137, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 2.0102886054893268, + "language_loss": 0.76459819, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78586376, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 11969, + "time_per_iteration": 3.861771821975708 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01781702, + "balance_loss_mlp": 1.03329253, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.7792544853917616, + "language_loss": 0.70936543, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73067832, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 11970, + "time_per_iteration": 2.421149253845215 + }, + { + "auxiliary_loss_clip": 0.01023909, + "auxiliary_loss_mlp": 0.00997715, + "balance_loss_clip": 0.99661201, + "balance_loss_mlp": 1.00355303, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9239284754087862, + "language_loss": 0.60847962, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62869585, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.203125, + "step": 11971, + "time_per_iteration": 4.394974231719971 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.02012718, + "balance_loss_mlp": 1.03647828, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.2193219685375647, + "language_loss": 0.79842031, + "learning_rate": 7.684792790494105e-07, + "loss": 0.8198278, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 11972, + "time_per_iteration": 3.8465628623962402 + }, + { + "auxiliary_loss_clip": 0.01104665, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.02426565, + "balance_loss_mlp": 1.0365268, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.5934226335424646, + "language_loss": 0.75385857, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77527189, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11973, + "time_per_iteration": 3.967134475708008 + }, + { + "auxiliary_loss_clip": 0.0102351, + "auxiliary_loss_mlp": 0.00997992, + "balance_loss_clip": 0.9969967, + "balance_loss_mlp": 1.00313878, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8568599946371717, + "language_loss": 0.57251143, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59272635, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20410156, + "step": 11974, + "time_per_iteration": 2.9041314125061035 + }, + { + "auxiliary_loss_clip": 0.0110113, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02024031, + "balance_loss_mlp": 1.03228343, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.0540036125086623, + "language_loss": 0.61555636, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63689601, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11975, + "time_per_iteration": 2.5565595626831055 + }, + { + "auxiliary_loss_clip": 0.0110015, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.01964951, + "balance_loss_mlp": 1.03378308, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.7485061825333017, + "language_loss": 0.67644596, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69775921, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11976, + "time_per_iteration": 2.4791998863220215 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.0175333, + "balance_loss_mlp": 1.03462696, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.9984197913928563, + "language_loss": 0.67032665, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69162977, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 11977, + "time_per_iteration": 2.4562158584594727 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01912272, + "balance_loss_mlp": 1.03690076, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.7897602101317545, + "language_loss": 0.75156534, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77293086, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 11978, + "time_per_iteration": 2.6318418979644775 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01953471, + "balance_loss_mlp": 1.03316569, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 2.125023403243126, + "language_loss": 0.78794968, + "learning_rate": 7.663323345468908e-07, + "loss": 0.80925471, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 11979, + "time_per_iteration": 2.4805469512939453 + }, + { + "auxiliary_loss_clip": 0.01103342, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_clip": 1.0148797, + "balance_loss_mlp": 1.03659976, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.7429736369489133, + "language_loss": 0.65073323, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67203552, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11980, + "time_per_iteration": 2.530036211013794 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.02152729, + "balance_loss_mlp": 1.03610325, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.8302790091648973, + "language_loss": 0.67421222, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69560248, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6796875, + "step": 11981, + "time_per_iteration": 2.5009641647338867 + }, + { + "auxiliary_loss_clip": 0.01103608, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02081633, + "balance_loss_mlp": 1.03605318, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.9266732225953629, + "language_loss": 0.73759854, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75896388, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11982, + "time_per_iteration": 2.4776506423950195 + }, + { + "auxiliary_loss_clip": 0.01101459, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.02592814, + "balance_loss_mlp": 1.03360009, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.8065417430122819, + "language_loss": 0.66113031, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68251604, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 11983, + "time_per_iteration": 2.441363573074341 + }, + { + "auxiliary_loss_clip": 0.01103087, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.0197134, + "balance_loss_mlp": 1.03643811, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.5519388922028943, + "language_loss": 0.66470373, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68604994, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 11984, + "time_per_iteration": 2.4713308811187744 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.02291024, + "balance_loss_mlp": 1.03795314, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.7856287402136095, + "language_loss": 0.73836136, + "learning_rate": 7.644939207017771e-07, + "loss": 0.75978738, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 11985, + "time_per_iteration": 2.4582014083862305 + }, + { + "auxiliary_loss_clip": 0.01101196, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.01717734, + "balance_loss_mlp": 1.03589368, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.7243225094685473, + "language_loss": 0.62891448, + "learning_rate": 7.641876823032977e-07, + "loss": 0.65020913, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 11986, + "time_per_iteration": 2.525557279586792 + }, + { + "auxiliary_loss_clip": 0.01104045, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.020702, + "balance_loss_mlp": 1.03693676, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.5922220046222206, + "language_loss": 0.72103626, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74241722, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.671875, + "step": 11987, + "time_per_iteration": 2.4586973190307617 + }, + { + "auxiliary_loss_clip": 0.01104181, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01796532, + "balance_loss_mlp": 1.03563333, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.7226788638874178, + "language_loss": 0.78616083, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80750442, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11988, + "time_per_iteration": 2.425905227661133 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01888466, + "balance_loss_mlp": 1.03406453, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 3.553932459688601, + "language_loss": 0.78784275, + "learning_rate": 7.632692483270618e-07, + "loss": 0.80915058, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11989, + "time_per_iteration": 2.58890700340271 + }, + { + "auxiliary_loss_clip": 0.01100086, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.01997149, + "balance_loss_mlp": 1.03511739, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 6.030003130937093, + "language_loss": 0.82572663, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84704268, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 11990, + "time_per_iteration": 2.422929048538208 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.02463794, + "balance_loss_mlp": 1.03581941, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 2.2646719383287746, + "language_loss": 0.76148689, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78286314, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 11991, + "time_per_iteration": 2.439966917037964 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.0163815, + "balance_loss_mlp": 1.03555298, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 2.0383069832544263, + "language_loss": 0.72644949, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74772066, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 11992, + "time_per_iteration": 2.508730173110962 + }, + { + "auxiliary_loss_clip": 0.01103062, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01668572, + "balance_loss_mlp": 1.0353663, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.8344706583489365, + "language_loss": 0.66479945, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68611324, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11993, + "time_per_iteration": 2.496220350265503 + }, + { + "auxiliary_loss_clip": 0.01101133, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01959491, + "balance_loss_mlp": 1.03491402, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.3726873705189786, + "language_loss": 0.65635949, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67768013, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11994, + "time_per_iteration": 2.481267213821411 + }, + { + "auxiliary_loss_clip": 0.01102846, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01626778, + "balance_loss_mlp": 1.03604794, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.7186394121352693, + "language_loss": 0.66596985, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68728906, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 11995, + "time_per_iteration": 2.4427177906036377 + }, + { + "auxiliary_loss_clip": 0.01099622, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.01687467, + "balance_loss_mlp": 1.0355916, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.85436447909986, + "language_loss": 0.79713655, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81842726, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.640625, + "step": 11996, + "time_per_iteration": 2.459115505218506 + }, + { + "auxiliary_loss_clip": 0.01103225, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0208993, + "balance_loss_mlp": 1.03676772, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 5.051284745258933, + "language_loss": 0.81384039, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83519638, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11997, + "time_per_iteration": 2.5205626487731934 + }, + { + "auxiliary_loss_clip": 0.01103756, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.02231431, + "balance_loss_mlp": 1.03483105, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 1.8313827039335897, + "language_loss": 0.67091608, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69230151, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11998, + "time_per_iteration": 2.431267023086548 + }, + { + "auxiliary_loss_clip": 0.01102391, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.02113891, + "balance_loss_mlp": 1.03637111, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.8599790081910679, + "language_loss": 0.72658986, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74793291, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66015625, + "step": 11999, + "time_per_iteration": 2.434900999069214 + }, + { + "auxiliary_loss_clip": 0.01103894, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.01531434, + "balance_loss_mlp": 1.03644443, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.3549521640831843, + "language_loss": 0.83203346, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85334623, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12000, + "time_per_iteration": 2.4203250408172607 + }, + { + "auxiliary_loss_clip": 0.011045, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.02320933, + "balance_loss_mlp": 1.03663993, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.6620327129342116, + "language_loss": 0.77455056, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79594404, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12001, + "time_per_iteration": 2.446817636489868 + }, + { + "auxiliary_loss_clip": 0.01103076, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02331209, + "balance_loss_mlp": 1.0377841, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.7518200734535594, + "language_loss": 0.81436306, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83574152, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12002, + "time_per_iteration": 2.4679903984069824 + }, + { + "auxiliary_loss_clip": 0.01104088, + "auxiliary_loss_mlp": 0.01027156, + "balance_loss_clip": 1.01510406, + "balance_loss_mlp": 1.03641772, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 2.283155803599373, + "language_loss": 0.62498772, + "learning_rate": 7.589888089035462e-07, + "loss": 0.6463002, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12003, + "time_per_iteration": 2.58776593208313 + }, + { + "auxiliary_loss_clip": 0.01102937, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.03539622, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.560985107334089, + "language_loss": 0.68500596, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70635808, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12004, + "time_per_iteration": 2.3969027996063232 + }, + { + "auxiliary_loss_clip": 0.01025027, + "auxiliary_loss_mlp": 0.0099804, + "balance_loss_clip": 0.99700272, + "balance_loss_mlp": 1.00448203, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8643975392958543, + "language_loss": 0.54278243, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56301308, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20507812, + "step": 12005, + "time_per_iteration": 2.9869492053985596 + }, + { + "auxiliary_loss_clip": 0.01102163, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.02052271, + "balance_loss_mlp": 1.03582788, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.4997790369746062, + "language_loss": 0.62904799, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65039825, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 12006, + "time_per_iteration": 2.6116576194763184 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.02048635, + "balance_loss_mlp": 1.0356214, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.702113645244825, + "language_loss": 0.92155731, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94289511, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12007, + "time_per_iteration": 2.4609286785125732 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.0176847, + "balance_loss_mlp": 1.03450811, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 2.0030110165156088, + "language_loss": 0.64172041, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66304755, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12008, + "time_per_iteration": 2.4176084995269775 + }, + { + "auxiliary_loss_clip": 0.0110518, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.01998281, + "balance_loss_mlp": 1.03712559, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 1.9142767312180562, + "language_loss": 0.78281379, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80419028, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12009, + "time_per_iteration": 2.486860752105713 + }, + { + "auxiliary_loss_clip": 0.01105579, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.0216608, + "balance_loss_mlp": 1.03696656, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.8228551130543398, + "language_loss": 0.63638747, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65778881, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 12010, + "time_per_iteration": 2.4727206230163574 + }, + { + "auxiliary_loss_clip": 0.01102553, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.0161804, + "balance_loss_mlp": 1.03521693, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.6555622208181195, + "language_loss": 0.77546549, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79676855, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 12011, + "time_per_iteration": 3.9832870960235596 + }, + { + "auxiliary_loss_clip": 0.01101603, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.02180326, + "balance_loss_mlp": 1.03652728, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.6788129204959028, + "language_loss": 0.79040414, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81174862, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12012, + "time_per_iteration": 2.5008320808410645 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01893425, + "balance_loss_mlp": 1.03754234, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7808047508810358, + "language_loss": 0.75740772, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77876568, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 12013, + "time_per_iteration": 3.9566152095794678 + }, + { + "auxiliary_loss_clip": 0.01102634, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.01445651, + "balance_loss_mlp": 1.03621209, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.471281729007001, + "language_loss": 0.75965142, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78093076, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 12014, + "time_per_iteration": 3.9748001098632812 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03614628, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.614960921439624, + "language_loss": 0.86782753, + "learning_rate": 7.553272008637346e-07, + "loss": 0.8891927, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 12015, + "time_per_iteration": 3.9988059997558594 + }, + { + "auxiliary_loss_clip": 0.01100793, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02386725, + "balance_loss_mlp": 1.03534532, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.879880951075302, + "language_loss": 0.77969182, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80105108, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12016, + "time_per_iteration": 2.45281982421875 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.03527737, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.444882690983208, + "language_loss": 0.77545393, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79679, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12017, + "time_per_iteration": 2.4577410221099854 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.01503491, + "balance_loss_mlp": 1.03520453, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 2.637627355867151, + "language_loss": 0.73314553, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75438797, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12018, + "time_per_iteration": 2.4559662342071533 + }, + { + "auxiliary_loss_clip": 0.01099343, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.02060056, + "balance_loss_mlp": 1.03665912, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 2.5981107828035026, + "language_loss": 0.77910566, + "learning_rate": 7.541081742032347e-07, + "loss": 0.80041099, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 12019, + "time_per_iteration": 2.4371070861816406 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01027432, + "balance_loss_clip": 1.01560664, + "balance_loss_mlp": 1.0350244, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.6489444745204735, + "language_loss": 0.73905075, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76032901, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12020, + "time_per_iteration": 2.5431694984436035 + }, + { + "auxiliary_loss_clip": 0.01103343, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.02115512, + "balance_loss_mlp": 1.03456461, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.6675263064788628, + "language_loss": 0.77169615, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79305232, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6875, + "step": 12021, + "time_per_iteration": 2.483078718185425 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0206815, + "balance_loss_mlp": 1.03491306, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 2.1826099193920374, + "language_loss": 0.68331528, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70464146, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12022, + "time_per_iteration": 2.454972982406616 + }, + { + "auxiliary_loss_clip": 0.01101398, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.01967645, + "balance_loss_mlp": 1.03452194, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7453912487460392, + "language_loss": 0.69111204, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71244639, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 12023, + "time_per_iteration": 2.4790570735931396 + }, + { + "auxiliary_loss_clip": 0.01098672, + "auxiliary_loss_mlp": 0.01028619, + "balance_loss_clip": 1.0168705, + "balance_loss_mlp": 1.03245616, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.6879551293116275, + "language_loss": 0.71159554, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73286849, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12024, + "time_per_iteration": 2.5031228065490723 + }, + { + "auxiliary_loss_clip": 0.0110197, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.02403879, + "balance_loss_mlp": 1.03651297, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.7827113324832673, + "language_loss": 0.75502241, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77639341, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 12025, + "time_per_iteration": 2.540117025375366 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.01867485, + "balance_loss_mlp": 1.03641152, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.1646639083011, + "language_loss": 0.7686342, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78994411, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12026, + "time_per_iteration": 2.619121551513672 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.02681398, + "balance_loss_mlp": 1.0340333, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 1.96713718815872, + "language_loss": 0.67575908, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69715375, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 12027, + "time_per_iteration": 2.5705184936523438 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.03727841, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.266596078102469, + "language_loss": 0.78860784, + "learning_rate": 7.513681291370469e-07, + "loss": 0.8099677, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12028, + "time_per_iteration": 2.521543502807617 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.0140934, + "balance_loss_mlp": 1.03353393, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.7215623884299298, + "language_loss": 0.81997663, + "learning_rate": 7.510639162726e-07, + "loss": 0.84123576, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12029, + "time_per_iteration": 2.518493890762329 + }, + { + "auxiliary_loss_clip": 0.01024828, + "auxiliary_loss_mlp": 0.01005824, + "balance_loss_clip": 1.00497139, + "balance_loss_mlp": 1.00435281, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8108297905714709, + "language_loss": 0.61798579, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63829231, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.20507812, + "step": 12030, + "time_per_iteration": 3.3008005619049072 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.01557982, + "balance_loss_mlp": 1.0335412, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.9017157177210717, + "language_loss": 0.78060263, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80185157, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12031, + "time_per_iteration": 2.410015106201172 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01835394, + "balance_loss_mlp": 1.03571391, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.6817131275089614, + "language_loss": 0.81817293, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83951116, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12032, + "time_per_iteration": 2.4944539070129395 + }, + { + "auxiliary_loss_clip": 0.01105541, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.0208137, + "balance_loss_mlp": 1.03620064, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.8666102600772807, + "language_loss": 0.74966335, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77104622, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 12033, + "time_per_iteration": 2.4195306301116943 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.01661193, + "balance_loss_mlp": 1.03378749, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.53895157270623, + "language_loss": 0.74960071, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77086604, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65234375, + "step": 12034, + "time_per_iteration": 2.4611551761627197 + }, + { + "auxiliary_loss_clip": 0.01099874, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.01580346, + "balance_loss_mlp": 1.03402519, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.7101429729597608, + "language_loss": 0.80541229, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82667649, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 12035, + "time_per_iteration": 2.4735255241394043 + }, + { + "auxiliary_loss_clip": 0.01102988, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.01914811, + "balance_loss_mlp": 1.03593981, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 1.6708890033016828, + "language_loss": 0.60718334, + "learning_rate": 7.489357529411326e-07, + "loss": 0.6285218, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12036, + "time_per_iteration": 2.4652183055877686 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.01916969, + "balance_loss_mlp": 1.03397477, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.8946488922724685, + "language_loss": 0.67484653, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69612211, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 12037, + "time_per_iteration": 2.439401388168335 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.03511119, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 1.8161270520180812, + "language_loss": 0.72444439, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74580336, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12038, + "time_per_iteration": 2.498206853866577 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01683688, + "balance_loss_mlp": 1.0365181, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.9105264736762722, + "language_loss": 0.72119117, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74251521, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 12039, + "time_per_iteration": 2.427929401397705 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02339756, + "balance_loss_mlp": 1.03659403, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 2.0387115182112567, + "language_loss": 0.75838852, + "learning_rate": 7.477207030458513e-07, + "loss": 0.77978736, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 12040, + "time_per_iteration": 2.4932591915130615 + }, + { + "auxiliary_loss_clip": 0.01100807, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.02060628, + "balance_loss_mlp": 1.03361833, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.913740912847533, + "language_loss": 0.76230586, + "learning_rate": 7.474170592596301e-07, + "loss": 0.7836442, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12041, + "time_per_iteration": 2.393092393875122 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.01658726, + "balance_loss_mlp": 1.03313875, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.0516689414632348, + "language_loss": 0.63410985, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65540266, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 12042, + "time_per_iteration": 2.4641988277435303 + }, + { + "auxiliary_loss_clip": 0.01105282, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.02106047, + "balance_loss_mlp": 1.03651488, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 2.5235443155533486, + "language_loss": 0.83237529, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85376412, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12043, + "time_per_iteration": 2.433598041534424 + }, + { + "auxiliary_loss_clip": 0.0110258, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.01696599, + "balance_loss_mlp": 1.03478646, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.7620410881767092, + "language_loss": 0.64035821, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66168237, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12044, + "time_per_iteration": 2.4627864360809326 + }, + { + "auxiliary_loss_clip": 0.0110401, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.01967335, + "balance_loss_mlp": 1.03717875, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.4978020202204398, + "language_loss": 0.81621009, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83756578, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12045, + "time_per_iteration": 2.4192216396331787 + }, + { + "auxiliary_loss_clip": 0.01098967, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0214541, + "balance_loss_mlp": 1.0345459, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.5204011665835366, + "language_loss": 0.71989012, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74121284, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 12046, + "time_per_iteration": 2.4425227642059326 + }, + { + "auxiliary_loss_clip": 0.01101516, + "auxiliary_loss_mlp": 0.01028832, + "balance_loss_clip": 1.01617825, + "balance_loss_mlp": 1.03457832, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.7863177787262001, + "language_loss": 0.71125013, + "learning_rate": 7.455961944046553e-07, + "loss": 0.7325536, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12047, + "time_per_iteration": 2.4461426734924316 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.02217817, + "balance_loss_mlp": 1.03864622, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6964330206566038, + "language_loss": 0.69839394, + "learning_rate": 7.45292883346627e-07, + "loss": 0.71981764, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 12048, + "time_per_iteration": 2.500828981399536 + }, + { + "auxiliary_loss_clip": 0.01024144, + "auxiliary_loss_mlp": 0.01003374, + "balance_loss_clip": 1.00239074, + "balance_loss_mlp": 1.00373721, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8243567714089579, + "language_loss": 0.5377422, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55801743, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20410156, + "step": 12049, + "time_per_iteration": 3.04441499710083 + }, + { + "auxiliary_loss_clip": 0.01107642, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01611245, + "balance_loss_mlp": 1.0363996, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 3.707915690527614, + "language_loss": 0.59357387, + "learning_rate": 7.446864039779258e-07, + "loss": 0.61495221, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12050, + "time_per_iteration": 2.4253971576690674 + }, + { + "auxiliary_loss_clip": 0.01024067, + "auxiliary_loss_mlp": 0.01001921, + "balance_loss_clip": 1.00082481, + "balance_loss_mlp": 1.00360942, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7294493469822053, + "language_loss": 0.53312981, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55338979, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20507812, + "step": 12051, + "time_per_iteration": 3.049221992492676 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.01975131, + "balance_loss_mlp": 1.03494263, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.5472193855827432, + "language_loss": 0.72156775, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74287981, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 12052, + "time_per_iteration": 2.46797776222229 + }, + { + "auxiliary_loss_clip": 0.01102918, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.01620138, + "balance_loss_mlp": 1.03667867, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 2.0462685374624088, + "language_loss": 0.74402982, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76534927, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 12053, + "time_per_iteration": 3.902531862258911 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.01761603, + "balance_loss_mlp": 1.03548145, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 2.1030984660426792, + "language_loss": 0.78042889, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80174804, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 12054, + "time_per_iteration": 2.4352877140045166 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.02155614, + "balance_loss_mlp": 1.03527296, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.826077293282499, + "language_loss": 0.68607175, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70742142, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12055, + "time_per_iteration": 3.8767430782318115 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.0167743, + "balance_loss_mlp": 1.03378785, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.7289479887024157, + "language_loss": 0.73999792, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76127023, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12056, + "time_per_iteration": 5.455943822860718 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.01225948, + "balance_loss_mlp": 1.03432655, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.6012339855962578, + "language_loss": 0.70800096, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72922009, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 12057, + "time_per_iteration": 2.5277090072631836 + }, + { + "auxiliary_loss_clip": 0.01104249, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.02146614, + "balance_loss_mlp": 1.03651786, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.9271030457531089, + "language_loss": 0.6256361, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64701855, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12058, + "time_per_iteration": 2.4183826446533203 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01710391, + "balance_loss_mlp": 1.03729975, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.9655611905409667, + "language_loss": 0.74991, + "learning_rate": 7.419596044262535e-07, + "loss": 0.7712611, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 12059, + "time_per_iteration": 2.4240307807922363 + }, + { + "auxiliary_loss_clip": 0.01098542, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.01989508, + "balance_loss_mlp": 1.03418756, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.73148336462866, + "language_loss": 0.79305416, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81434691, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12060, + "time_per_iteration": 2.493459463119507 + }, + { + "auxiliary_loss_clip": 0.0110292, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.01606321, + "balance_loss_mlp": 1.03522062, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 2.354515481339918, + "language_loss": 0.76317465, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78448856, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12061, + "time_per_iteration": 2.4897234439849854 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.01707315, + "balance_loss_mlp": 1.03607178, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.707041727604455, + "language_loss": 0.81039721, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83168906, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12062, + "time_per_iteration": 2.4312822818756104 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.01713991, + "balance_loss_mlp": 1.03735328, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 1.9940387151474506, + "language_loss": 0.68844217, + "learning_rate": 7.407489333471262e-07, + "loss": 0.70981008, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 12063, + "time_per_iteration": 2.5078516006469727 + }, + { + "auxiliary_loss_clip": 0.01099308, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.01788342, + "balance_loss_mlp": 1.03523588, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.3500136523009691, + "language_loss": 0.69967401, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72096425, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 12064, + "time_per_iteration": 2.4525294303894043 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0188607, + "balance_loss_mlp": 1.03744543, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 2.2336703023596716, + "language_loss": 0.90039599, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92173982, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12065, + "time_per_iteration": 2.4503257274627686 + }, + { + "auxiliary_loss_clip": 0.01023945, + "auxiliary_loss_mlp": 0.00999171, + "balance_loss_clip": 0.99806815, + "balance_loss_mlp": 1.00351691, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6543765045930707, + "language_loss": 0.56138921, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58162034, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20507812, + "step": 12066, + "time_per_iteration": 3.203951120376587 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01663494, + "balance_loss_mlp": 1.03434396, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.9431934533116317, + "language_loss": 0.76573753, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78701746, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12067, + "time_per_iteration": 2.5001325607299805 + }, + { + "auxiliary_loss_clip": 0.01024325, + "auxiliary_loss_mlp": 0.01000445, + "balance_loss_clip": 0.99928838, + "balance_loss_mlp": 1.00393391, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7268496108336204, + "language_loss": 0.57092577, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59117347, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.20410156, + "step": 12068, + "time_per_iteration": 2.961564779281616 + }, + { + "auxiliary_loss_clip": 0.01023519, + "auxiliary_loss_mlp": 0.00997832, + "balance_loss_clip": 0.99668139, + "balance_loss_mlp": 1.00306845, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6592626191043454, + "language_loss": 0.55426753, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57448101, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 12069, + "time_per_iteration": 3.111906051635742 + }, + { + "auxiliary_loss_clip": 0.01098503, + "auxiliary_loss_mlp": 0.01026099, + "balance_loss_clip": 1.015275, + "balance_loss_mlp": 1.03479362, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.8188254561357684, + "language_loss": 0.79876685, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82001287, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 12070, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.0109711, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.01860952, + "balance_loss_mlp": 1.03523922, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 2.135024516193614, + "language_loss": 0.72267014, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74394208, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62109375, + "step": 12071, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01099686, + "auxiliary_loss_mlp": 0.01034521, + "balance_loss_clip": 1.02348769, + "balance_loss_mlp": 1.03501189, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.7654284044796786, + "language_loss": 0.6994983, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72084033, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 12072, + "time_per_iteration": 2.430056571960449 + }, + { + "auxiliary_loss_clip": 0.01103966, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01702607, + "balance_loss_mlp": 1.03472924, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.7824187520349677, + "language_loss": 0.78317153, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80450368, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12073, + "time_per_iteration": 2.479287624359131 + }, + { + "auxiliary_loss_clip": 0.01100141, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01472855, + "balance_loss_mlp": 1.03557312, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.6619094478292162, + "language_loss": 0.70389605, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72516435, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 12074, + "time_per_iteration": 2.674909830093384 + }, + { + "auxiliary_loss_clip": 0.01103212, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.01514673, + "balance_loss_mlp": 1.03562987, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.6427266790682502, + "language_loss": 0.7405411, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76184535, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12075, + "time_per_iteration": 2.4879863262176514 + }, + { + "auxiliary_loss_clip": 0.01102234, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01842904, + "balance_loss_mlp": 1.03551388, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.5060189576698704, + "language_loss": 0.635382, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65671116, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 12076, + "time_per_iteration": 2.817375659942627 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01025872, + "balance_loss_clip": 1.0139389, + "balance_loss_mlp": 1.03412342, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 2.5366204857105332, + "language_loss": 0.79249585, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81376213, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 12077, + "time_per_iteration": 2.513556480407715 + }, + { + "auxiliary_loss_clip": 0.01023637, + "auxiliary_loss_mlp": 0.01002866, + "balance_loss_clip": 1.00172806, + "balance_loss_mlp": 1.00323439, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8858624910390671, + "language_loss": 0.64977288, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67003787, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20507812, + "step": 12078, + "time_per_iteration": 3.0679736137390137 + }, + { + "auxiliary_loss_clip": 0.01023707, + "auxiliary_loss_mlp": 0.01000415, + "balance_loss_clip": 0.99934798, + "balance_loss_mlp": 1.0032717, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7121161567572437, + "language_loss": 0.59267461, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61291581, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20507812, + "step": 12079, + "time_per_iteration": 3.201369524002075 + }, + { + "auxiliary_loss_clip": 0.01099969, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.01752567, + "balance_loss_mlp": 1.03409278, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.8820513707228834, + "language_loss": 0.65003538, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67133677, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 12080, + "time_per_iteration": 2.4735429286956787 + }, + { + "auxiliary_loss_clip": 0.01101349, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.01415968, + "balance_loss_mlp": 1.0338223, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 5.946673694238332, + "language_loss": 0.699211, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72049093, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12081, + "time_per_iteration": 2.4283978939056396 + }, + { + "auxiliary_loss_clip": 0.01104797, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01758754, + "balance_loss_mlp": 1.03677154, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 4.042667093911173, + "language_loss": 0.81073087, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83206874, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12082, + "time_per_iteration": 2.4518401622772217 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.02072203, + "balance_loss_mlp": 1.03555846, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.773588814829077, + "language_loss": 0.76834166, + "learning_rate": 7.347070528479158e-07, + "loss": 0.78975332, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 12083, + "time_per_iteration": 2.4874460697174072 + }, + { + "auxiliary_loss_clip": 0.01106226, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.01915908, + "balance_loss_mlp": 1.03815079, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.6288025457613526, + "language_loss": 0.72911334, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75049186, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12084, + "time_per_iteration": 2.4936935901641846 + }, + { + "auxiliary_loss_clip": 0.01104738, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.01855981, + "balance_loss_mlp": 1.03661275, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.683298254553577, + "language_loss": 0.77603686, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79739684, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 12085, + "time_per_iteration": 2.461860418319702 + }, + { + "auxiliary_loss_clip": 0.01101384, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02104557, + "balance_loss_mlp": 1.03391504, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.8314526850775286, + "language_loss": 0.72461057, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74595284, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12086, + "time_per_iteration": 2.4804890155792236 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02163863, + "balance_loss_mlp": 1.03661227, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 2.0882270871339492, + "language_loss": 0.69382304, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71518683, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12087, + "time_per_iteration": 2.472632884979248 + }, + { + "auxiliary_loss_clip": 0.01105347, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.02104521, + "balance_loss_mlp": 1.03732419, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 2.250412175179094, + "language_loss": 0.79011619, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81150979, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12088, + "time_per_iteration": 2.455793857574463 + }, + { + "auxiliary_loss_clip": 0.01103631, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.02433753, + "balance_loss_mlp": 1.03484094, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.695956180050093, + "language_loss": 0.73965418, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76105028, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 12089, + "time_per_iteration": 2.4252777099609375 + }, + { + "auxiliary_loss_clip": 0.01103186, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.0215764, + "balance_loss_mlp": 1.03553808, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.809103044869338, + "language_loss": 0.70920813, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73057657, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12090, + "time_per_iteration": 2.500497817993164 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.01645172, + "balance_loss_mlp": 1.03472519, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.7365025485289893, + "language_loss": 0.7741468, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79547042, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 12091, + "time_per_iteration": 2.5417003631591797 + }, + { + "auxiliary_loss_clip": 0.01100865, + "auxiliary_loss_mlp": 0.01028025, + "balance_loss_clip": 1.01566255, + "balance_loss_mlp": 1.03411698, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 3.1465600327537304, + "language_loss": 0.71302813, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73431706, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 12092, + "time_per_iteration": 2.4790890216827393 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.01770473, + "balance_loss_mlp": 1.03515983, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.8134968044947444, + "language_loss": 0.6129632, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63427377, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 12093, + "time_per_iteration": 2.531416654586792 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01923263, + "balance_loss_mlp": 1.03584278, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.5414395566200807, + "language_loss": 0.75677824, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77811199, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12094, + "time_per_iteration": 3.885373592376709 + }, + { + "auxiliary_loss_clip": 0.01099162, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.01925766, + "balance_loss_mlp": 1.03378463, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.9126635522388606, + "language_loss": 0.84773397, + "learning_rate": 7.310911308504808e-07, + "loss": 0.8690294, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12095, + "time_per_iteration": 2.429746150970459 + }, + { + "auxiliary_loss_clip": 0.01101056, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02127481, + "balance_loss_mlp": 1.03374481, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.7505444036152586, + "language_loss": 0.78038371, + "learning_rate": 7.307901165066479e-07, + "loss": 0.80173397, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12096, + "time_per_iteration": 3.8615665435791016 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.0200038, + "balance_loss_mlp": 1.03728688, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 2.3221692333246655, + "language_loss": 0.7232452, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74459803, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12097, + "time_per_iteration": 3.8505306243896484 + }, + { + "auxiliary_loss_clip": 0.01104342, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02190495, + "balance_loss_mlp": 1.03669655, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 2.177278264782312, + "language_loss": 0.7672922, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78868425, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 12098, + "time_per_iteration": 4.021664142608643 + }, + { + "auxiliary_loss_clip": 0.01102665, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.01946902, + "balance_loss_mlp": 1.03345513, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.8124975199898956, + "language_loss": 0.6742186, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69556803, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 12099, + "time_per_iteration": 2.8312809467315674 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.02005613, + "balance_loss_mlp": 1.0350759, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 4.666251767932542, + "language_loss": 0.72614902, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74754786, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 12100, + "time_per_iteration": 2.48777437210083 + }, + { + "auxiliary_loss_clip": 0.01103782, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.02218103, + "balance_loss_mlp": 1.03623843, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.4296037667662786, + "language_loss": 0.74749982, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76887369, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12101, + "time_per_iteration": 2.460813045501709 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.01962399, + "balance_loss_mlp": 1.03687561, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.6471267556293203, + "language_loss": 0.82180774, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84314322, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 12102, + "time_per_iteration": 2.486891031265259 + }, + { + "auxiliary_loss_clip": 0.01101993, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02160573, + "balance_loss_mlp": 1.03577983, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.238789262926412, + "language_loss": 0.81434906, + "learning_rate": 7.286843643386495e-07, + "loss": 0.8356986, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12103, + "time_per_iteration": 2.414008855819702 + }, + { + "auxiliary_loss_clip": 0.01102157, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.01403213, + "balance_loss_mlp": 1.03556037, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 2.300581534767291, + "language_loss": 0.66380107, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68508548, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 12104, + "time_per_iteration": 2.4741268157958984 + }, + { + "auxiliary_loss_clip": 0.01099619, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.01674151, + "balance_loss_mlp": 1.03588009, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.8448719986078481, + "language_loss": 0.65691745, + "learning_rate": 7.280831545667611e-07, + "loss": 0.67819774, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 12105, + "time_per_iteration": 2.5147173404693604 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.02052665, + "balance_loss_mlp": 1.03698063, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.269554332821791, + "language_loss": 0.75712693, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77848709, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12106, + "time_per_iteration": 2.435525417327881 + }, + { + "auxiliary_loss_clip": 0.01105516, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.01580417, + "balance_loss_mlp": 1.03651524, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.027947954090959, + "language_loss": 0.70116639, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72250462, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12107, + "time_per_iteration": 2.5302398204803467 + }, + { + "auxiliary_loss_clip": 0.01101241, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.02056551, + "balance_loss_mlp": 1.03459477, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.520569075146339, + "language_loss": 0.75155759, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77289176, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12108, + "time_per_iteration": 2.7630767822265625 + }, + { + "auxiliary_loss_clip": 0.01102209, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.03495109, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.5886355104574046, + "language_loss": 0.66785181, + "learning_rate": 7.268813138887124e-07, + "loss": 0.68916261, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 12109, + "time_per_iteration": 2.5576727390289307 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.01853728, + "balance_loss_mlp": 1.03609085, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 1.9794357831275327, + "language_loss": 0.62950575, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65083742, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 12110, + "time_per_iteration": 2.44002366065979 + }, + { + "auxiliary_loss_clip": 0.01102169, + "auxiliary_loss_mlp": 0.01026996, + "balance_loss_clip": 1.01408529, + "balance_loss_mlp": 1.03304601, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.7658774771753212, + "language_loss": 0.58043802, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60172975, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 12111, + "time_per_iteration": 2.6210787296295166 + }, + { + "auxiliary_loss_clip": 0.01106335, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01827931, + "balance_loss_mlp": 1.03801906, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 1.9352527589955661, + "language_loss": 0.73992717, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76129776, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 12112, + "time_per_iteration": 2.4524636268615723 + }, + { + "auxiliary_loss_clip": 0.01099679, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.01777458, + "balance_loss_mlp": 1.03403258, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.0053906619330006, + "language_loss": 0.67298758, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69427931, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12113, + "time_per_iteration": 2.4597878456115723 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03391302, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.6626035833227917, + "language_loss": 0.73243928, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75376785, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 12114, + "time_per_iteration": 2.4250495433807373 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03370285, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.0029156408767714, + "language_loss": 0.68175685, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70304716, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12115, + "time_per_iteration": 2.4954171180725098 + }, + { + "auxiliary_loss_clip": 0.01103561, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.01567912, + "balance_loss_mlp": 1.03449523, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.5749182133229294, + "language_loss": 0.59722745, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61854202, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12116, + "time_per_iteration": 2.5029101371765137 + }, + { + "auxiliary_loss_clip": 0.01100013, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.01579905, + "balance_loss_mlp": 1.03508806, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.7186518000931694, + "language_loss": 0.72523415, + "learning_rate": 7.2447995054705e-07, + "loss": 0.74651456, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 12117, + "time_per_iteration": 2.4426584243774414 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01024568, + "balance_loss_clip": 1.01234937, + "balance_loss_mlp": 1.03475642, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 2.143264936763247, + "language_loss": 0.69296616, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71423018, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12118, + "time_per_iteration": 2.5339369773864746 + }, + { + "auxiliary_loss_clip": 0.01097686, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.02279413, + "balance_loss_mlp": 1.03442514, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.6909309126085614, + "language_loss": 0.84203392, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86335295, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 12119, + "time_per_iteration": 2.3954200744628906 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.01645637, + "balance_loss_mlp": 1.03579891, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.2822251390786312, + "language_loss": 0.82164419, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84294862, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12120, + "time_per_iteration": 2.4175772666931152 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02358377, + "balance_loss_mlp": 1.03648496, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 1.8056895427232635, + "language_loss": 0.78642154, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80782175, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 12121, + "time_per_iteration": 2.406684160232544 + }, + { + "auxiliary_loss_clip": 0.01100839, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01706123, + "balance_loss_mlp": 1.03222573, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.5367306608153926, + "language_loss": 0.6915673, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71286988, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 12122, + "time_per_iteration": 2.533647060394287 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.01871347, + "balance_loss_mlp": 1.03240955, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.8487795313278665, + "language_loss": 0.8722074, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89347732, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12123, + "time_per_iteration": 2.4654133319854736 + }, + { + "auxiliary_loss_clip": 0.01099535, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.01967263, + "balance_loss_mlp": 1.03390992, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 2.1267005511199604, + "language_loss": 0.8275702, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84887826, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12124, + "time_per_iteration": 2.5298664569854736 + }, + { + "auxiliary_loss_clip": 0.01100083, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01730633, + "balance_loss_mlp": 1.0351851, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.8446613007140906, + "language_loss": 0.67240703, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69369495, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12125, + "time_per_iteration": 2.4683637619018555 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01760745, + "balance_loss_mlp": 1.03575897, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.8041889285235344, + "language_loss": 0.74976206, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77111757, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12126, + "time_per_iteration": 2.4857234954833984 + }, + { + "auxiliary_loss_clip": 0.01023798, + "auxiliary_loss_mlp": 0.01004495, + "balance_loss_clip": 1.00342834, + "balance_loss_mlp": 1.0033108, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8154544542721714, + "language_loss": 0.58675981, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60704273, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20507812, + "step": 12127, + "time_per_iteration": 2.9716975688934326 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.0168165, + "balance_loss_mlp": 1.03571177, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 1.9593385701209045, + "language_loss": 0.69048452, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71176225, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 12128, + "time_per_iteration": 2.5162582397460938 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.02043474, + "balance_loss_mlp": 1.03561521, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.0485847355558953, + "language_loss": 0.65249133, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67384678, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12129, + "time_per_iteration": 2.487868547439575 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01426673, + "balance_loss_mlp": 1.03446507, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.5377483717802485, + "language_loss": 0.74676943, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76801908, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 12130, + "time_per_iteration": 2.5030577182769775 + }, + { + "auxiliary_loss_clip": 0.01100647, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.0347085, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.617355369468953, + "language_loss": 0.6962043, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71750402, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12131, + "time_per_iteration": 2.4352428913116455 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03647351, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.4743863900351697, + "language_loss": 0.77282, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79412544, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 12132, + "time_per_iteration": 2.495375156402588 + }, + { + "auxiliary_loss_clip": 0.0110199, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.02124524, + "balance_loss_mlp": 1.03552151, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.217572112042413, + "language_loss": 0.79134017, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81268471, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 12133, + "time_per_iteration": 2.403266668319702 + }, + { + "auxiliary_loss_clip": 0.01100314, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01922512, + "balance_loss_mlp": 1.03376698, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.8655920091949136, + "language_loss": 0.7224102, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74372262, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12134, + "time_per_iteration": 2.510369300842285 + }, + { + "auxiliary_loss_clip": 0.01103467, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02342796, + "balance_loss_mlp": 1.03683078, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.8815102601218348, + "language_loss": 0.71485353, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73624468, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12135, + "time_per_iteration": 2.4513211250305176 + }, + { + "auxiliary_loss_clip": 0.01102275, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.02063513, + "balance_loss_mlp": 1.03478527, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.3479540644405645, + "language_loss": 0.62245309, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64379901, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 12136, + "time_per_iteration": 3.9409608840942383 + }, + { + "auxiliary_loss_clip": 0.01099061, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02353823, + "balance_loss_mlp": 1.03336811, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.8075029483736118, + "language_loss": 0.74606574, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76740515, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12137, + "time_per_iteration": 2.536616086959839 + }, + { + "auxiliary_loss_clip": 0.01104966, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.02242422, + "balance_loss_mlp": 1.03774345, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.6283862626280647, + "language_loss": 0.74377739, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76516545, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12138, + "time_per_iteration": 3.9735019207000732 + }, + { + "auxiliary_loss_clip": 0.01097337, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01875067, + "balance_loss_mlp": 1.03234982, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.062700649659985, + "language_loss": 0.71971607, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74098563, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12139, + "time_per_iteration": 4.020869731903076 + }, + { + "auxiliary_loss_clip": 0.01097707, + "auxiliary_loss_mlp": 0.01025679, + "balance_loss_clip": 1.01508093, + "balance_loss_mlp": 1.03471375, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.3852703912405009, + "language_loss": 0.73432374, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75555754, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62890625, + "step": 12140, + "time_per_iteration": 4.02800989151001 + }, + { + "auxiliary_loss_clip": 0.01101201, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01883268, + "balance_loss_mlp": 1.03433836, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.6478138849846053, + "language_loss": 0.55289412, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57421893, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12141, + "time_per_iteration": 2.7540974617004395 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01024438, + "balance_loss_clip": 1.01330972, + "balance_loss_mlp": 1.0348109, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.4560422495968448, + "language_loss": 0.72527927, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74651062, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 12142, + "time_per_iteration": 2.5032155513763428 + }, + { + "auxiliary_loss_clip": 0.01100592, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.0208838, + "balance_loss_mlp": 1.03534031, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.7431177644397007, + "language_loss": 0.73784506, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75917029, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12143, + "time_per_iteration": 2.4508650302886963 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.01959443, + "balance_loss_mlp": 1.03553247, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.3241470315915786, + "language_loss": 0.66688013, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68821418, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12144, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.01101867, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.01570094, + "balance_loss_mlp": 1.03569078, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.6911946286278683, + "language_loss": 0.79302132, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81431764, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12145, + "time_per_iteration": 2.4418389797210693 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.0172739, + "balance_loss_mlp": 1.03604698, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.6533125281544103, + "language_loss": 0.91145337, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93274218, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12146, + "time_per_iteration": 2.4392800331115723 + }, + { + "auxiliary_loss_clip": 0.01098845, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.01512456, + "balance_loss_mlp": 1.03589582, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 3.9977008079887275, + "language_loss": 0.61903286, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64027882, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 12147, + "time_per_iteration": 2.4647200107574463 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.02376306, + "balance_loss_mlp": 1.03584671, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.8253831896260186, + "language_loss": 0.75063682, + "learning_rate": 7.152039586086693e-07, + "loss": 0.7720145, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12148, + "time_per_iteration": 2.4266207218170166 + }, + { + "auxiliary_loss_clip": 0.01024253, + "auxiliary_loss_mlp": 0.01006124, + "balance_loss_clip": 1.00514054, + "balance_loss_mlp": 1.0036819, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.6830351523119454, + "language_loss": 0.56657213, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58687592, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 12149, + "time_per_iteration": 3.027615785598755 + }, + { + "auxiliary_loss_clip": 0.01101256, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.03406572, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.6835156550315518, + "language_loss": 0.73653138, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75784624, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12150, + "time_per_iteration": 2.4099485874176025 + }, + { + "auxiliary_loss_clip": 0.01103316, + "auxiliary_loss_mlp": 0.01027257, + "balance_loss_clip": 1.01468682, + "balance_loss_mlp": 1.03478301, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 1.944560081629452, + "language_loss": 0.84078568, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86209142, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12151, + "time_per_iteration": 2.4708986282348633 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.0218451, + "balance_loss_mlp": 1.0358156, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.670544008969589, + "language_loss": 0.77620661, + "learning_rate": 7.14010459655127e-07, + "loss": 0.79757774, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12152, + "time_per_iteration": 2.4695539474487305 + }, + { + "auxiliary_loss_clip": 0.01103894, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.01692247, + "balance_loss_mlp": 1.03786087, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.5663619490166691, + "language_loss": 0.79568756, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81701493, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12153, + "time_per_iteration": 2.533879280090332 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.0184778, + "balance_loss_mlp": 1.03624892, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.621227897072943, + "language_loss": 0.67485428, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69620812, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 12154, + "time_per_iteration": 2.418184995651245 + }, + { + "auxiliary_loss_clip": 0.01102596, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.01756167, + "balance_loss_mlp": 1.03488839, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.9151300415152432, + "language_loss": 0.65747088, + "learning_rate": 7.131158474313128e-07, + "loss": 0.67879438, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 12155, + "time_per_iteration": 2.4923956394195557 + }, + { + "auxiliary_loss_clip": 0.01096922, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.01884151, + "balance_loss_mlp": 1.03208816, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.6880646162483905, + "language_loss": 0.81661636, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83788967, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 12156, + "time_per_iteration": 2.4129483699798584 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0193615, + "balance_loss_mlp": 1.03432953, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.405459413664416, + "language_loss": 0.75240982, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77369863, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12157, + "time_per_iteration": 2.4383459091186523 + }, + { + "auxiliary_loss_clip": 0.0109587, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01831448, + "balance_loss_mlp": 1.03320694, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.0421552799554457, + "language_loss": 0.72894901, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75019395, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62890625, + "step": 12158, + "time_per_iteration": 2.409529209136963 + }, + { + "auxiliary_loss_clip": 0.01103494, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.01861429, + "balance_loss_mlp": 1.03654337, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.5794059929341078, + "language_loss": 0.85767531, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87901425, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 12159, + "time_per_iteration": 2.5144267082214355 + }, + { + "auxiliary_loss_clip": 0.01104084, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01700473, + "balance_loss_mlp": 1.03464055, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.076806919622798, + "language_loss": 0.73464298, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75597978, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12160, + "time_per_iteration": 2.405029535293579 + }, + { + "auxiliary_loss_clip": 0.0110368, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.01743793, + "balance_loss_mlp": 1.0356549, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 1.9196235781681743, + "language_loss": 0.72528148, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74661607, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12161, + "time_per_iteration": 2.4075698852539062 + }, + { + "auxiliary_loss_clip": 0.01107154, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.01682591, + "balance_loss_mlp": 1.03725171, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 5.707259461998225, + "language_loss": 0.69178545, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71315575, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 12162, + "time_per_iteration": 2.5137577056884766 + }, + { + "auxiliary_loss_clip": 0.01103934, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.01783824, + "balance_loss_mlp": 1.03625202, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.8703565147701806, + "language_loss": 0.66851526, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68985772, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12163, + "time_per_iteration": 2.4703001976013184 + }, + { + "auxiliary_loss_clip": 0.0110019, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.01822889, + "balance_loss_mlp": 1.03375793, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.4832431428317139, + "language_loss": 0.68488622, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70619065, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12164, + "time_per_iteration": 2.4578616619110107 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.0206039, + "balance_loss_mlp": 1.0372684, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.4717257929564707, + "language_loss": 0.72854477, + "learning_rate": 7.101369803195391e-07, + "loss": 0.74985963, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 12165, + "time_per_iteration": 2.451599359512329 + }, + { + "auxiliary_loss_clip": 0.01102834, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.03535652, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.8716087020467311, + "language_loss": 0.76773065, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78909522, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12166, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.02040911, + "balance_loss_mlp": 1.03687727, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 2.0545527072080945, + "language_loss": 0.79531485, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81665695, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 12167, + "time_per_iteration": 2.46749210357666 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.02647865, + "balance_loss_mlp": 1.03602624, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.668118675469295, + "language_loss": 0.76923746, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79063153, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12168, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.02010727, + "balance_loss_mlp": 1.03531849, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.6642312861866588, + "language_loss": 0.81803644, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83939904, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12169, + "time_per_iteration": 2.4575917720794678 + }, + { + "auxiliary_loss_clip": 0.0110358, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.03600168, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.6489053706369026, + "language_loss": 0.69867074, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72006512, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12170, + "time_per_iteration": 2.5548336505889893 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.01778316, + "balance_loss_mlp": 1.03440404, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.8957173811976022, + "language_loss": 0.69379872, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71510202, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 12171, + "time_per_iteration": 2.4312360286712646 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02222395, + "balance_loss_mlp": 1.03613734, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 3.1521599416176582, + "language_loss": 0.65645874, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67781472, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12172, + "time_per_iteration": 2.5476059913635254 + }, + { + "auxiliary_loss_clip": 0.01103925, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01507545, + "balance_loss_mlp": 1.03686643, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.237216797653005, + "language_loss": 0.6100843, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63139474, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12173, + "time_per_iteration": 2.4594876766204834 + }, + { + "auxiliary_loss_clip": 0.0110106, + "auxiliary_loss_mlp": 0.01025966, + "balance_loss_clip": 1.0147481, + "balance_loss_mlp": 1.03543413, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 1.8253545093146943, + "language_loss": 0.73704946, + "learning_rate": 7.074601815494243e-07, + "loss": 0.75831974, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12174, + "time_per_iteration": 2.515566349029541 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01025641, + "balance_loss_clip": 1.01454306, + "balance_loss_mlp": 1.03585482, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.5591268998445824, + "language_loss": 0.80786538, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82911384, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 12175, + "time_per_iteration": 2.5457139015197754 + }, + { + "auxiliary_loss_clip": 0.01101358, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.01556993, + "balance_loss_mlp": 1.03506994, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 1.8633750273009067, + "language_loss": 0.76524568, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78653067, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 12176, + "time_per_iteration": 2.4949843883514404 + }, + { + "auxiliary_loss_clip": 0.01101151, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01911664, + "balance_loss_mlp": 1.03668857, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 2.0429703759451074, + "language_loss": 0.76661092, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78792465, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 12177, + "time_per_iteration": 2.5137908458709717 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.02391255, + "balance_loss_mlp": 1.03224051, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.015813751432838, + "language_loss": 0.74164724, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76296735, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 12178, + "time_per_iteration": 3.7930397987365723 + }, + { + "auxiliary_loss_clip": 0.01102574, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01727104, + "balance_loss_mlp": 1.03461027, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 3.902615906398373, + "language_loss": 0.82204944, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84336722, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12179, + "time_per_iteration": 2.4926083087921143 + }, + { + "auxiliary_loss_clip": 0.01096766, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.0208199, + "balance_loss_mlp": 1.03491974, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.7358162194967635, + "language_loss": 0.74350899, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76479512, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 12180, + "time_per_iteration": 3.9542806148529053 + }, + { + "auxiliary_loss_clip": 0.01102785, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.01565659, + "balance_loss_mlp": 1.03372073, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.8090406286045437, + "language_loss": 0.78966725, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81097823, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12181, + "time_per_iteration": 5.370461940765381 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.01984382, + "balance_loss_mlp": 1.03627169, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.6926303414905466, + "language_loss": 0.71438134, + "learning_rate": 7.050841375089506e-07, + "loss": 0.7357372, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 12182, + "time_per_iteration": 2.395366668701172 + }, + { + "auxiliary_loss_clip": 0.01104144, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.02144599, + "balance_loss_mlp": 1.03678739, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.516043869338468, + "language_loss": 0.71126986, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73264194, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12183, + "time_per_iteration": 2.5406055450439453 + }, + { + "auxiliary_loss_clip": 0.0110482, + "auxiliary_loss_mlp": 0.01036116, + "balance_loss_clip": 1.02371871, + "balance_loss_mlp": 1.03739989, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.782462638135082, + "language_loss": 0.72453171, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74594104, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12184, + "time_per_iteration": 2.4203481674194336 + }, + { + "auxiliary_loss_clip": 0.01023657, + "auxiliary_loss_mlp": 0.01014002, + "balance_loss_clip": 1.01300097, + "balance_loss_mlp": 1.00301158, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.763876847553094, + "language_loss": 0.65218687, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67256343, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20703125, + "step": 12185, + "time_per_iteration": 3.0270133018493652 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.01434445, + "balance_loss_mlp": 1.03290069, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.938744837028, + "language_loss": 0.807504, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82877648, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12186, + "time_per_iteration": 2.4389822483062744 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.01886177, + "balance_loss_mlp": 1.03473353, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.9074219827171814, + "language_loss": 0.73762989, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75896305, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 12187, + "time_per_iteration": 2.4973368644714355 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.02201378, + "balance_loss_mlp": 1.03718829, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.717563808128471, + "language_loss": 0.88947159, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91085368, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12188, + "time_per_iteration": 2.4411849975585938 + }, + { + "auxiliary_loss_clip": 0.01103922, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01875281, + "balance_loss_mlp": 1.03507185, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 1.8794202002209792, + "language_loss": 0.7421574, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76350546, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12189, + "time_per_iteration": 2.4856882095336914 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.01568341, + "balance_loss_mlp": 1.03474796, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.5825056379011793, + "language_loss": 0.82314098, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84444714, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12190, + "time_per_iteration": 2.456019878387451 + }, + { + "auxiliary_loss_clip": 0.01102905, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.02160442, + "balance_loss_mlp": 1.03589582, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.732792680094222, + "language_loss": 0.71868473, + "learning_rate": 7.024148446550204e-07, + "loss": 0.74005568, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 12191, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01103846, + "auxiliary_loss_mlp": 0.01033545, + "balance_loss_clip": 1.02112985, + "balance_loss_mlp": 1.03651261, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5577440951602006, + "language_loss": 0.69461203, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71598595, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12192, + "time_per_iteration": 2.509345531463623 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01683998, + "balance_loss_mlp": 1.03492808, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.567853507336265, + "language_loss": 0.73125577, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75254017, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12193, + "time_per_iteration": 2.5061562061309814 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01551533, + "balance_loss_mlp": 1.03417039, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 4.194654550291271, + "language_loss": 0.76709831, + "learning_rate": 7.015259656476911e-07, + "loss": 0.78839254, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 12194, + "time_per_iteration": 2.429858446121216 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01026816, + "balance_loss_clip": 1.01485932, + "balance_loss_mlp": 1.03564095, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.8657268793695219, + "language_loss": 0.70426142, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72554034, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12195, + "time_per_iteration": 2.47605299949646 + }, + { + "auxiliary_loss_clip": 0.01103283, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02542019, + "balance_loss_mlp": 1.0363059, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.8748815414700573, + "language_loss": 0.72009385, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74149585, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12196, + "time_per_iteration": 2.4170355796813965 + }, + { + "auxiliary_loss_clip": 0.01100598, + "auxiliary_loss_mlp": 0.0102618, + "balance_loss_clip": 1.01400244, + "balance_loss_mlp": 1.03541434, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.6057850533210987, + "language_loss": 0.71647477, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73774254, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 12197, + "time_per_iteration": 2.5049266815185547 + }, + { + "auxiliary_loss_clip": 0.01103625, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0253787, + "balance_loss_mlp": 1.03410459, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 1.8231283851018831, + "language_loss": 0.78448522, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80590379, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 12198, + "time_per_iteration": 2.4223878383636475 + }, + { + "auxiliary_loss_clip": 0.0110209, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.01910758, + "balance_loss_mlp": 1.03584075, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.9413444885935378, + "language_loss": 0.74405611, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76537967, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12199, + "time_per_iteration": 2.503514528274536 + }, + { + "auxiliary_loss_clip": 0.01106436, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02240086, + "balance_loss_mlp": 1.03749204, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.410343838162529, + "language_loss": 0.76916027, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79057044, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 12200, + "time_per_iteration": 2.385646104812622 + }, + { + "auxiliary_loss_clip": 0.0110137, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.02033889, + "balance_loss_mlp": 1.03535485, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 1.9712263454849892, + "language_loss": 0.61337197, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63470274, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 12201, + "time_per_iteration": 2.494711399078369 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01022943, + "balance_loss_clip": 1.01186204, + "balance_loss_mlp": 1.03445053, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.770832212268843, + "language_loss": 0.52208602, + "learning_rate": 6.991577889352264e-07, + "loss": 0.54330868, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12202, + "time_per_iteration": 2.5508878231048584 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01555753, + "balance_loss_mlp": 1.03535819, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.8712183341846977, + "language_loss": 0.68450284, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70577991, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 12203, + "time_per_iteration": 2.455225944519043 + }, + { + "auxiliary_loss_clip": 0.01104999, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.02551746, + "balance_loss_mlp": 1.03558648, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.0115937343101176, + "language_loss": 0.66122192, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68264639, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 12204, + "time_per_iteration": 2.4275307655334473 + }, + { + "auxiliary_loss_clip": 0.01100701, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.01770449, + "balance_loss_mlp": 1.036098, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.1044017909422434, + "language_loss": 0.77165949, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79295337, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 12205, + "time_per_iteration": 2.465723752975464 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01719475, + "balance_loss_mlp": 1.03765106, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.633398371679339, + "language_loss": 0.79663754, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81794107, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12206, + "time_per_iteration": 2.4295356273651123 + }, + { + "auxiliary_loss_clip": 0.01101572, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01490951, + "balance_loss_mlp": 1.03436399, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.938197948270063, + "language_loss": 0.71248126, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73377299, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12207, + "time_per_iteration": 2.533963918685913 + }, + { + "auxiliary_loss_clip": 0.01023391, + "auxiliary_loss_mlp": 0.01000694, + "balance_loss_clip": 0.99954408, + "balance_loss_mlp": 1.0029676, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7861729617868648, + "language_loss": 0.54826534, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56850618, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 12208, + "time_per_iteration": 3.1204357147216797 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.01652431, + "balance_loss_mlp": 1.03539574, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.435103992793476, + "language_loss": 0.80251199, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82378662, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65234375, + "step": 12209, + "time_per_iteration": 2.4724159240722656 + }, + { + "auxiliary_loss_clip": 0.01097718, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.01657128, + "balance_loss_mlp": 1.03298545, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.8625549043469913, + "language_loss": 0.78958344, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81083614, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12210, + "time_per_iteration": 2.455946445465088 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.01407206, + "balance_loss_mlp": 1.03534794, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.6735159974751206, + "language_loss": 0.7608707, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78213215, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12211, + "time_per_iteration": 2.4627277851104736 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.01444197, + "balance_loss_mlp": 1.03521109, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.9025360413194936, + "language_loss": 0.71490365, + "learning_rate": 6.962020082425748e-07, + "loss": 0.73618519, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12212, + "time_per_iteration": 2.446685552597046 + }, + { + "auxiliary_loss_clip": 0.01103728, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01991129, + "balance_loss_mlp": 1.03784096, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.9034635106886582, + "language_loss": 0.68719161, + "learning_rate": 6.959067019092766e-07, + "loss": 0.70854366, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12213, + "time_per_iteration": 2.4991095066070557 + }, + { + "auxiliary_loss_clip": 0.01023626, + "auxiliary_loss_mlp": 0.01002854, + "balance_loss_clip": 1.00172174, + "balance_loss_mlp": 1.00317287, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7248810226626392, + "language_loss": 0.54344672, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56371152, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.20507812, + "step": 12214, + "time_per_iteration": 2.920579433441162 + }, + { + "auxiliary_loss_clip": 0.01103211, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01514435, + "balance_loss_mlp": 1.03471541, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 1.9617721107193735, + "language_loss": 0.70233238, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72362781, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.68359375, + "step": 12215, + "time_per_iteration": 2.4825196266174316 + }, + { + "auxiliary_loss_clip": 0.01098919, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.01648164, + "balance_loss_mlp": 1.03576207, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.7008791597621735, + "language_loss": 0.72984588, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75110614, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 12216, + "time_per_iteration": 2.419165849685669 + }, + { + "auxiliary_loss_clip": 0.01106239, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03503752, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.6841898563593931, + "language_loss": 0.7813915, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80280441, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 12217, + "time_per_iteration": 2.519476890563965 + }, + { + "auxiliary_loss_clip": 0.01097824, + "auxiliary_loss_mlp": 0.01025415, + "balance_loss_clip": 1.01500201, + "balance_loss_mlp": 1.03322065, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 2.0430723318586814, + "language_loss": 0.77478087, + "learning_rate": 6.94430912236911e-07, + "loss": 0.7960133, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.64453125, + "step": 12218, + "time_per_iteration": 2.4323973655700684 + }, + { + "auxiliary_loss_clip": 0.0109922, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.0175488, + "balance_loss_mlp": 1.03478718, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 2.4653490702635223, + "language_loss": 0.72245163, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74373412, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12219, + "time_per_iteration": 3.851811408996582 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01675463, + "balance_loss_mlp": 1.03373814, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.7840681188410097, + "language_loss": 0.7480529, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76930463, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 12220, + "time_per_iteration": 2.450587511062622 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01934493, + "balance_loss_mlp": 1.03515816, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.5828657801363317, + "language_loss": 0.65927309, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68060255, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12221, + "time_per_iteration": 3.9862098693847656 + }, + { + "auxiliary_loss_clip": 0.01100484, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01736093, + "balance_loss_mlp": 1.03518033, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.690484446007973, + "language_loss": 0.69146597, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71275526, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 12222, + "time_per_iteration": 3.9009041786193848 + }, + { + "auxiliary_loss_clip": 0.01099444, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.01703954, + "balance_loss_mlp": 1.03489995, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.6021663525354104, + "language_loss": 0.65751356, + "learning_rate": 6.92956360247217e-07, + "loss": 0.67878354, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 12223, + "time_per_iteration": 3.9320757389068604 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01641989, + "balance_loss_mlp": 1.03491271, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.7805598542267875, + "language_loss": 0.72150576, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74280441, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 12224, + "time_per_iteration": 2.424764394760132 + }, + { + "auxiliary_loss_clip": 0.01102425, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.0356946, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.667305857067153, + "language_loss": 0.72422898, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74555409, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 12225, + "time_per_iteration": 2.555699110031128 + }, + { + "auxiliary_loss_clip": 0.0110455, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.01911092, + "balance_loss_mlp": 1.03528094, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.6574802149125882, + "language_loss": 0.76740652, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78877175, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 12226, + "time_per_iteration": 2.417281150817871 + }, + { + "auxiliary_loss_clip": 0.0110041, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.01671648, + "balance_loss_mlp": 1.03516448, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.6412947887343436, + "language_loss": 0.66742253, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68870974, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12227, + "time_per_iteration": 2.493746280670166 + }, + { + "auxiliary_loss_clip": 0.01101958, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.02254581, + "balance_loss_mlp": 1.03482342, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.3969319271399194, + "language_loss": 0.63719964, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65855956, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12228, + "time_per_iteration": 2.4691944122314453 + }, + { + "auxiliary_loss_clip": 0.01099398, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.02261627, + "balance_loss_mlp": 1.03371692, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 2.005632249261944, + "language_loss": 0.63364494, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65497524, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 12229, + "time_per_iteration": 2.44689679145813 + }, + { + "auxiliary_loss_clip": 0.01104076, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.02215624, + "balance_loss_mlp": 1.03622568, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.6720920493620766, + "language_loss": 0.73554301, + "learning_rate": 6.908940694298726e-07, + "loss": 0.7569316, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12230, + "time_per_iteration": 2.471467971801758 + }, + { + "auxiliary_loss_clip": 0.01102648, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.01664519, + "balance_loss_mlp": 1.03582287, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 1.9806878096831662, + "language_loss": 0.71668804, + "learning_rate": 6.90599654932332e-07, + "loss": 0.73800141, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 12231, + "time_per_iteration": 2.473133087158203 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02288222, + "balance_loss_mlp": 1.03647971, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 3.941316401522165, + "language_loss": 0.64094537, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66234899, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 12232, + "time_per_iteration": 2.4203951358795166 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01851058, + "balance_loss_mlp": 1.03487468, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.8455770572081356, + "language_loss": 0.75458562, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77591407, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12233, + "time_per_iteration": 2.4624409675598145 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.01549006, + "balance_loss_mlp": 1.03507233, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.614964377536134, + "language_loss": 0.73402774, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75531423, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12234, + "time_per_iteration": 2.4193742275238037 + }, + { + "auxiliary_loss_clip": 0.01103947, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.01924026, + "balance_loss_mlp": 1.03720987, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 4.5713288626894455, + "language_loss": 0.59835577, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61970031, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 12235, + "time_per_iteration": 2.5044472217559814 + }, + { + "auxiliary_loss_clip": 0.01101342, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.01521957, + "balance_loss_mlp": 1.03657699, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.0497651121742115, + "language_loss": 0.8565346, + "learning_rate": 6.891283274567259e-07, + "loss": 0.87781453, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12236, + "time_per_iteration": 2.3936641216278076 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.01669908, + "balance_loss_mlp": 1.03538775, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.8090519272371215, + "language_loss": 0.69331872, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71462798, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12237, + "time_per_iteration": 2.473252296447754 + }, + { + "auxiliary_loss_clip": 0.01101452, + "auxiliary_loss_mlp": 0.01025644, + "balance_loss_clip": 1.01471233, + "balance_loss_mlp": 1.03477573, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.6472611180309946, + "language_loss": 0.72134531, + "learning_rate": 6.885401443470839e-07, + "loss": 0.7426163, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 12238, + "time_per_iteration": 2.423517942428589 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.01703119, + "balance_loss_mlp": 1.03515995, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.7391094576956916, + "language_loss": 0.72675085, + "learning_rate": 6.882461273827205e-07, + "loss": 0.7481029, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 12239, + "time_per_iteration": 2.523238182067871 + }, + { + "auxiliary_loss_clip": 0.01098843, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01849365, + "balance_loss_mlp": 1.03532851, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.5041553602452318, + "language_loss": 0.78892875, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81021476, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 12240, + "time_per_iteration": 2.4987194538116455 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.02145731, + "balance_loss_mlp": 1.03596234, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.7320565425934242, + "language_loss": 0.83208013, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85342342, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 12241, + "time_per_iteration": 2.499547004699707 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.01507461, + "balance_loss_mlp": 1.03403616, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.8298064214189858, + "language_loss": 0.78645867, + "learning_rate": 6.873643749852484e-07, + "loss": 0.8077209, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12242, + "time_per_iteration": 2.4207592010498047 + }, + { + "auxiliary_loss_clip": 0.01102156, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.0180552, + "balance_loss_mlp": 1.0359714, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.9546604159013963, + "language_loss": 0.79385024, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81516558, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12243, + "time_per_iteration": 2.51019024848938 + }, + { + "auxiliary_loss_clip": 0.01102378, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.02083445, + "balance_loss_mlp": 1.03466713, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 1.9125543259943414, + "language_loss": 0.74100977, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76236194, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12244, + "time_per_iteration": 2.4030749797821045 + }, + { + "auxiliary_loss_clip": 0.01101314, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01773405, + "balance_loss_mlp": 1.03416705, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.7798055097675247, + "language_loss": 0.6942178, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71552444, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12245, + "time_per_iteration": 2.4875071048736572 + }, + { + "auxiliary_loss_clip": 0.01098192, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.01933169, + "balance_loss_mlp": 1.03475428, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.5087221257099204, + "language_loss": 0.73185629, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75314939, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.63671875, + "step": 12246, + "time_per_iteration": 2.4394288063049316 + }, + { + "auxiliary_loss_clip": 0.01096401, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.01489758, + "balance_loss_mlp": 1.0334698, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.1784937379902787, + "language_loss": 0.73557955, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75680184, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 12247, + "time_per_iteration": 2.4587297439575195 + }, + { + "auxiliary_loss_clip": 0.01101638, + "auxiliary_loss_mlp": 0.01027969, + "balance_loss_clip": 1.01730013, + "balance_loss_mlp": 1.03827024, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.48643381660021, + "language_loss": 0.7409212, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76221728, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 12248, + "time_per_iteration": 2.4140796661376953 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.02237701, + "balance_loss_mlp": 1.03480268, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 4.381127457761843, + "language_loss": 0.72677851, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74813205, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12249, + "time_per_iteration": 2.4724795818328857 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.01859331, + "balance_loss_mlp": 1.03676438, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.708422030858321, + "language_loss": 0.77026933, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79160416, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12250, + "time_per_iteration": 2.4324309825897217 + }, + { + "auxiliary_loss_clip": 0.01102594, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.01680052, + "balance_loss_mlp": 1.0353688, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 2.1441444373175687, + "language_loss": 0.71412712, + "learning_rate": 6.8472180686052e-07, + "loss": 0.7354399, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12251, + "time_per_iteration": 2.4759652614593506 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01828933, + "balance_loss_mlp": 1.03470254, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.4418314268019194, + "language_loss": 0.65489835, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67619503, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12252, + "time_per_iteration": 2.8028664588928223 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01736534, + "balance_loss_mlp": 1.03739333, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.657771200645772, + "language_loss": 0.79182792, + "learning_rate": 6.841351178440884e-07, + "loss": 0.8131668, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12253, + "time_per_iteration": 2.472512722015381 + }, + { + "auxiliary_loss_clip": 0.01096622, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 1.03384531, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.145672565702914, + "language_loss": 0.75874883, + "learning_rate": 6.83841848176905e-07, + "loss": 0.77999103, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 12254, + "time_per_iteration": 2.419156074523926 + }, + { + "auxiliary_loss_clip": 0.01101466, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.02074361, + "balance_loss_mlp": 1.03581631, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.333279522964119, + "language_loss": 0.68892902, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71026909, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12255, + "time_per_iteration": 2.456407308578491 + }, + { + "auxiliary_loss_clip": 0.01101847, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01738834, + "balance_loss_mlp": 1.03577256, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 2.0115502306535404, + "language_loss": 0.7508868, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77219987, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12256, + "time_per_iteration": 2.4806578159332275 + }, + { + "auxiliary_loss_clip": 0.01103736, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01691759, + "balance_loss_mlp": 1.0363915, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.5936534045043864, + "language_loss": 0.73533136, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75665981, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12257, + "time_per_iteration": 2.5967447757720947 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02328479, + "balance_loss_mlp": 1.0344913, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.4666060569830273, + "language_loss": 0.78067857, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80202311, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 12258, + "time_per_iteration": 2.454329252243042 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.0180105, + "balance_loss_mlp": 1.03685117, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.3867663760940814, + "language_loss": 0.66167754, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68302274, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12259, + "time_per_iteration": 2.517813205718994 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.02026582, + "balance_loss_mlp": 1.03576601, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.584231595020614, + "language_loss": 0.73625088, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75759482, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 12260, + "time_per_iteration": 2.5023396015167236 + }, + { + "auxiliary_loss_clip": 0.0110407, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.01717019, + "balance_loss_mlp": 1.03662717, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.604192195943769, + "language_loss": 0.73533583, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75666034, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 12261, + "time_per_iteration": 3.906297445297241 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.02218294, + "balance_loss_mlp": 1.03563118, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 2.303167962152087, + "language_loss": 0.66901404, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69041032, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12262, + "time_per_iteration": 2.4535868167877197 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01726305, + "balance_loss_mlp": 1.03487778, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.8236008971372257, + "language_loss": 0.88766813, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90899056, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12263, + "time_per_iteration": 4.029206037521362 + }, + { + "auxiliary_loss_clip": 0.01096266, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.01636577, + "balance_loss_mlp": 1.03449428, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 2.309256872894793, + "language_loss": 0.67259324, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69382036, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6171875, + "step": 12264, + "time_per_iteration": 3.8689637184143066 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.01562762, + "balance_loss_mlp": 1.03491688, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 2.4971579087814066, + "language_loss": 0.80039012, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82164693, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12265, + "time_per_iteration": 4.065499782562256 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.01788533, + "balance_loss_mlp": 1.03421259, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.6219065104687562, + "language_loss": 0.74228191, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76361895, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 12266, + "time_per_iteration": 2.469236373901367 + }, + { + "auxiliary_loss_clip": 0.01103845, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02281022, + "balance_loss_mlp": 1.03720498, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.5661834210732133, + "language_loss": 0.73517638, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75656438, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12267, + "time_per_iteration": 2.504617214202881 + }, + { + "auxiliary_loss_clip": 0.01100734, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.03550386, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 1.9413990473639766, + "language_loss": 0.82913959, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85046864, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 12268, + "time_per_iteration": 2.4835684299468994 + }, + { + "auxiliary_loss_clip": 0.01098968, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.02494073, + "balance_loss_mlp": 1.034657, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.7133544019503224, + "language_loss": 0.7298789, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75123322, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 12269, + "time_per_iteration": 2.467454195022583 + }, + { + "auxiliary_loss_clip": 0.01104784, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.01989651, + "balance_loss_mlp": 1.03631639, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.1055066962392095, + "language_loss": 0.69917566, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72054565, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 12270, + "time_per_iteration": 2.5774502754211426 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.03567266, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.7971813672192163, + "language_loss": 0.69534814, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71663284, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 12271, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.01102484, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.01770949, + "balance_loss_mlp": 1.03490114, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 1.9993430148747984, + "language_loss": 0.68443513, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70575643, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12272, + "time_per_iteration": 2.514380693435669 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01809931, + "balance_loss_mlp": 1.03528929, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.683058960031114, + "language_loss": 0.77877617, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80006814, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12273, + "time_per_iteration": 2.4802489280700684 + }, + { + "auxiliary_loss_clip": 0.01098973, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01958835, + "balance_loss_mlp": 1.03365088, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.8227934716103082, + "language_loss": 0.83283198, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85413539, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 12274, + "time_per_iteration": 2.4196221828460693 + }, + { + "auxiliary_loss_clip": 0.01106787, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.01393938, + "balance_loss_mlp": 1.03611016, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.788699432283416, + "language_loss": 0.7346586, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75599259, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 12275, + "time_per_iteration": 2.4947471618652344 + }, + { + "auxiliary_loss_clip": 0.01106269, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.02698684, + "balance_loss_mlp": 1.03702235, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.805676108210034, + "language_loss": 0.73670596, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75816184, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12276, + "time_per_iteration": 2.460041046142578 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01706874, + "balance_loss_mlp": 1.03511322, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.2438661310985544, + "language_loss": 0.77184784, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79316336, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 12277, + "time_per_iteration": 2.5230605602264404 + }, + { + "auxiliary_loss_clip": 0.01101926, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.02326107, + "balance_loss_mlp": 1.03710866, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.8274458620386211, + "language_loss": 0.78436172, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80572963, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 12278, + "time_per_iteration": 2.4361507892608643 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.01922441, + "balance_loss_mlp": 1.03554451, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.0940191805387722, + "language_loss": 0.72178644, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74312687, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 12279, + "time_per_iteration": 2.454338312149048 + }, + { + "auxiliary_loss_clip": 0.01102728, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02146792, + "balance_loss_mlp": 1.03468275, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.207094607312378, + "language_loss": 0.85757834, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87894535, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12280, + "time_per_iteration": 2.4340832233428955 + }, + { + "auxiliary_loss_clip": 0.01103222, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.016675, + "balance_loss_mlp": 1.03586102, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 2.186067036515089, + "language_loss": 0.72367251, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74499011, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12281, + "time_per_iteration": 2.4844117164611816 + }, + { + "auxiliary_loss_clip": 0.01102088, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.02173972, + "balance_loss_mlp": 1.03446507, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.9159466937607454, + "language_loss": 0.6074115, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62876809, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 12282, + "time_per_iteration": 2.4337880611419678 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.01902103, + "balance_loss_mlp": 1.03598022, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 2.224847577186844, + "language_loss": 0.67914271, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70049471, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12283, + "time_per_iteration": 2.5443530082702637 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02461326, + "balance_loss_mlp": 1.03717303, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.587417277679554, + "language_loss": 0.76002008, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78141761, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12284, + "time_per_iteration": 2.561150312423706 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.03535485, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.8705632629894415, + "language_loss": 0.69351077, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71484059, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 12285, + "time_per_iteration": 2.5044779777526855 + }, + { + "auxiliary_loss_clip": 0.01105253, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.01810813, + "balance_loss_mlp": 1.03483808, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 2.818148859522571, + "language_loss": 0.79595774, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81731659, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 12286, + "time_per_iteration": 2.462742328643799 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.01436138, + "balance_loss_mlp": 1.03468239, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 2.0998689615756616, + "language_loss": 0.65484864, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67611259, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 12287, + "time_per_iteration": 2.5399184226989746 + }, + { + "auxiliary_loss_clip": 0.01097159, + "auxiliary_loss_mlp": 0.01026905, + "balance_loss_clip": 1.01564598, + "balance_loss_mlp": 1.03362429, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.862112231817168, + "language_loss": 0.76542664, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78666735, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 12288, + "time_per_iteration": 2.483729362487793 + }, + { + "auxiliary_loss_clip": 0.01107844, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.0192709, + "balance_loss_mlp": 1.03781092, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.6167864576536901, + "language_loss": 0.58242345, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60381913, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 12289, + "time_per_iteration": 2.653754472732544 + }, + { + "auxiliary_loss_clip": 0.01027818, + "auxiliary_loss_mlp": 0.010066, + "balance_loss_clip": 1.00555122, + "balance_loss_mlp": 1.00722313, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.657884434351233, + "language_loss": 0.49320006, + "learning_rate": 6.733174657205287e-07, + "loss": 0.5135442, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20605469, + "step": 12290, + "time_per_iteration": 3.161417007446289 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.03600287, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 1.8618109210971494, + "language_loss": 0.66936404, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69070697, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12291, + "time_per_iteration": 2.504171371459961 + }, + { + "auxiliary_loss_clip": 0.01026631, + "auxiliary_loss_mlp": 0.01008045, + "balance_loss_clip": 1.00700212, + "balance_loss_mlp": 1.00617576, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9921278078436683, + "language_loss": 0.60870874, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6290555, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12292, + "time_per_iteration": 2.740140676498413 + }, + { + "auxiliary_loss_clip": 0.0110263, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.0190208, + "balance_loss_mlp": 1.03666413, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 2.0283775750990447, + "language_loss": 0.67287552, + "learning_rate": 6.724433697406191e-07, + "loss": 0.6942023, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 12293, + "time_per_iteration": 2.5637433528900146 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01897979, + "balance_loss_mlp": 1.03533363, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.7680717845070275, + "language_loss": 0.83443105, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85575891, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12294, + "time_per_iteration": 2.452796697616577 + }, + { + "auxiliary_loss_clip": 0.0109896, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01865005, + "balance_loss_mlp": 1.03435683, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.704234892939925, + "language_loss": 0.72765625, + "learning_rate": 6.718608907743337e-07, + "loss": 0.74895406, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 12295, + "time_per_iteration": 2.532444953918457 + }, + { + "auxiliary_loss_clip": 0.01099527, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.0250864, + "balance_loss_mlp": 1.03585625, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.6789172360591735, + "language_loss": 0.78772449, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8090868, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 12296, + "time_per_iteration": 2.5699706077575684 + }, + { + "auxiliary_loss_clip": 0.01102686, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.01921093, + "balance_loss_mlp": 1.03617287, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.8636543361899776, + "language_loss": 0.66520232, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68654692, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 12297, + "time_per_iteration": 2.5840320587158203 + }, + { + "auxiliary_loss_clip": 0.01103197, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.0230639, + "balance_loss_mlp": 1.03605783, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 2.2038505631105054, + "language_loss": 0.68769479, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70908344, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 12298, + "time_per_iteration": 2.4649643898010254 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.01767373, + "balance_loss_mlp": 1.0349468, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.7869505814548332, + "language_loss": 0.74577737, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76709521, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12299, + "time_per_iteration": 2.4275574684143066 + }, + { + "auxiliary_loss_clip": 0.01025983, + "auxiliary_loss_mlp": 0.01002146, + "balance_loss_clip": 1.00116849, + "balance_loss_mlp": 1.00569797, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7180686194551699, + "language_loss": 0.60861343, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62889469, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 12300, + "time_per_iteration": 3.1263675689697266 + }, + { + "auxiliary_loss_clip": 0.01102982, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.01645398, + "balance_loss_mlp": 1.03720665, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.4253075505979764, + "language_loss": 0.80278659, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82410145, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 12301, + "time_per_iteration": 2.46708345413208 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.01725399, + "balance_loss_mlp": 1.0343194, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.5951843205733178, + "language_loss": 0.73313689, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75442266, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 12302, + "time_per_iteration": 2.532886028289795 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.02475905, + "balance_loss_mlp": 1.03542805, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 1.7925873497266347, + "language_loss": 0.7409184, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76231015, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 12303, + "time_per_iteration": 3.842045545578003 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01026474, + "balance_loss_clip": 1.01557863, + "balance_loss_mlp": 1.03380299, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.7395112572263238, + "language_loss": 0.54232901, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56358361, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12304, + "time_per_iteration": 2.5310745239257812 + }, + { + "auxiliary_loss_clip": 0.01102065, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01740789, + "balance_loss_mlp": 1.03500128, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.9555871557250795, + "language_loss": 0.841694, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86300987, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12305, + "time_per_iteration": 3.97141170501709 + }, + { + "auxiliary_loss_clip": 0.01025514, + "auxiliary_loss_mlp": 0.00998213, + "balance_loss_clip": 0.99718779, + "balance_loss_mlp": 1.0049659, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8695449825144963, + "language_loss": 0.57674229, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59697956, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20507812, + "step": 12306, + "time_per_iteration": 4.480564117431641 + }, + { + "auxiliary_loss_clip": 0.01105578, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.01955891, + "balance_loss_mlp": 1.03752124, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 1.94634660943293, + "language_loss": 0.81800246, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83937716, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12307, + "time_per_iteration": 3.964345932006836 + }, + { + "auxiliary_loss_clip": 0.01099571, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.01743591, + "balance_loss_mlp": 1.03615248, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7625756479889783, + "language_loss": 0.69852555, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71980846, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 12308, + "time_per_iteration": 2.426374912261963 + }, + { + "auxiliary_loss_clip": 0.01097458, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01394367, + "balance_loss_mlp": 1.03327668, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.8311869299558743, + "language_loss": 0.81359291, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83482039, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12309, + "time_per_iteration": 2.47933292388916 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.02118754, + "balance_loss_mlp": 1.03444481, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.7272186323130432, + "language_loss": 0.72933966, + "learning_rate": 6.674987259277692e-07, + "loss": 0.7507059, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 12310, + "time_per_iteration": 2.458360195159912 + }, + { + "auxiliary_loss_clip": 0.01105362, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.02455902, + "balance_loss_mlp": 1.03706884, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.8138497569314165, + "language_loss": 0.8816393, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90306449, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 12311, + "time_per_iteration": 2.4193923473358154 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.01557398, + "balance_loss_mlp": 1.0338285, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.5281974655269193, + "language_loss": 0.80203426, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82330477, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 12312, + "time_per_iteration": 2.5062949657440186 + }, + { + "auxiliary_loss_clip": 0.01100667, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.0153985, + "balance_loss_mlp": 1.03547597, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 2.0496860461073676, + "language_loss": 0.7839551, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80522901, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 12313, + "time_per_iteration": 2.4662246704101562 + }, + { + "auxiliary_loss_clip": 0.01102693, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.03396571, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.00903442682859, + "language_loss": 0.78872943, + "learning_rate": 6.663374005191937e-07, + "loss": 0.81011879, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12314, + "time_per_iteration": 2.446385622024536 + }, + { + "auxiliary_loss_clip": 0.01024604, + "auxiliary_loss_mlp": 0.01003964, + "balance_loss_clip": 1.00296831, + "balance_loss_mlp": 1.00410616, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8412651667201435, + "language_loss": 0.55169189, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57197762, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20507812, + "step": 12315, + "time_per_iteration": 3.0314457416534424 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.02031732, + "balance_loss_mlp": 1.03454709, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.5280075701489741, + "language_loss": 0.79192966, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81322497, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 12316, + "time_per_iteration": 2.5997025966644287 + }, + { + "auxiliary_loss_clip": 0.0109893, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.01758349, + "balance_loss_mlp": 1.03257847, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.6312870183183517, + "language_loss": 0.74777615, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76906157, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12317, + "time_per_iteration": 2.409041404724121 + }, + { + "auxiliary_loss_clip": 0.01097259, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.02258456, + "balance_loss_mlp": 1.03415799, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.5381739579945533, + "language_loss": 0.81140697, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83272064, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.62890625, + "step": 12318, + "time_per_iteration": 2.483341932296753 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01801968, + "balance_loss_mlp": 1.03532875, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 2.10976565284071, + "language_loss": 0.76717627, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78850329, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12319, + "time_per_iteration": 2.4090797901153564 + }, + { + "auxiliary_loss_clip": 0.01100157, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.02052474, + "balance_loss_mlp": 1.03510928, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.728045021553726, + "language_loss": 0.64247096, + "learning_rate": 6.64596929804897e-07, + "loss": 0.6637848, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 12320, + "time_per_iteration": 2.4777369499206543 + }, + { + "auxiliary_loss_clip": 0.01104796, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.02257681, + "balance_loss_mlp": 1.03554249, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.5603662317591307, + "language_loss": 0.83399361, + "learning_rate": 6.643070285235288e-07, + "loss": 0.8553896, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12321, + "time_per_iteration": 2.5069942474365234 + }, + { + "auxiliary_loss_clip": 0.01106734, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.03052354, + "balance_loss_mlp": 1.03583789, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 1.897257666550991, + "language_loss": 0.71964365, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74114925, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 12322, + "time_per_iteration": 2.4930129051208496 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.02137196, + "balance_loss_mlp": 1.03622496, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.870315243792337, + "language_loss": 0.64078039, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66212809, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12323, + "time_per_iteration": 2.4777188301086426 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.0141499, + "balance_loss_mlp": 1.03480208, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.4950637015537451, + "language_loss": 0.75935167, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78064305, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12324, + "time_per_iteration": 2.5902748107910156 + }, + { + "auxiliary_loss_clip": 0.01099826, + "auxiliary_loss_mlp": 0.01023896, + "balance_loss_clip": 1.01248217, + "balance_loss_mlp": 1.0326978, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.7779845069008868, + "language_loss": 0.74595994, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76719713, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12325, + "time_per_iteration": 2.428908586502075 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.01949763, + "balance_loss_mlp": 1.03505337, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.8169030049946526, + "language_loss": 0.68363488, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70501333, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 12326, + "time_per_iteration": 2.4834694862365723 + }, + { + "auxiliary_loss_clip": 0.01101938, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.0159893, + "balance_loss_mlp": 1.03513253, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 2.058459084269704, + "language_loss": 0.89730138, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91859686, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 12327, + "time_per_iteration": 2.4705865383148193 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.01925874, + "balance_loss_mlp": 1.03504896, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.6496511439188377, + "language_loss": 0.85582221, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87715065, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12328, + "time_per_iteration": 2.440108060836792 + }, + { + "auxiliary_loss_clip": 0.0109826, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01708829, + "balance_loss_mlp": 1.03355885, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.676741332984265, + "language_loss": 0.66687691, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68815577, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 12329, + "time_per_iteration": 2.433601140975952 + }, + { + "auxiliary_loss_clip": 0.01105654, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.03583872, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.8984380479185268, + "language_loss": 0.66488492, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68623304, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 12330, + "time_per_iteration": 2.5116231441497803 + }, + { + "auxiliary_loss_clip": 0.01108565, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.01889706, + "balance_loss_mlp": 1.03731847, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 1.9345159720147296, + "language_loss": 0.85613048, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87754583, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12331, + "time_per_iteration": 2.4270429611206055 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.01889229, + "balance_loss_mlp": 1.03435421, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 1.9091499126857316, + "language_loss": 0.69466591, + "learning_rate": 6.611214597199364e-07, + "loss": 0.7160027, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12332, + "time_per_iteration": 2.422391176223755 + }, + { + "auxiliary_loss_clip": 0.01102435, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.02290845, + "balance_loss_mlp": 1.03556943, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 2.2157206056702097, + "language_loss": 0.63370979, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65509146, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 12333, + "time_per_iteration": 2.505436420440674 + }, + { + "auxiliary_loss_clip": 0.01099765, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.02141094, + "balance_loss_mlp": 1.03644109, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.6374577716994534, + "language_loss": 0.71271133, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73404145, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 12334, + "time_per_iteration": 2.5002856254577637 + }, + { + "auxiliary_loss_clip": 0.01101856, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.01598334, + "balance_loss_mlp": 1.03537202, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.9057001714532567, + "language_loss": 0.82662481, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84791493, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 12335, + "time_per_iteration": 2.420285701751709 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.01862848, + "balance_loss_mlp": 1.03514791, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.6318734852412082, + "language_loss": 0.74709713, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76842761, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12336, + "time_per_iteration": 2.395914077758789 + }, + { + "auxiliary_loss_clip": 0.01105209, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.01698947, + "balance_loss_mlp": 1.03675711, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 2.0074082890204803, + "language_loss": 0.73073846, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75207937, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 12337, + "time_per_iteration": 2.4017410278320312 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02384233, + "balance_loss_mlp": 1.0375526, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.582069295944861, + "language_loss": 0.76476055, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78612792, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 12338, + "time_per_iteration": 2.469158172607422 + }, + { + "auxiliary_loss_clip": 0.01098771, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.01728797, + "balance_loss_mlp": 1.03412902, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7521644726075343, + "language_loss": 0.73067641, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75194031, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 12339, + "time_per_iteration": 2.4999265670776367 + }, + { + "auxiliary_loss_clip": 0.01103048, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01544917, + "balance_loss_mlp": 1.03630698, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.733265242117768, + "language_loss": 0.79821277, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81951618, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 12340, + "time_per_iteration": 2.5067059993743896 + }, + { + "auxiliary_loss_clip": 0.0110211, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02174389, + "balance_loss_mlp": 1.0347414, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.853046258672694, + "language_loss": 0.75634474, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77770519, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12341, + "time_per_iteration": 2.419905662536621 + }, + { + "auxiliary_loss_clip": 0.01095271, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.01980829, + "balance_loss_mlp": 1.03472114, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.6930413865654552, + "language_loss": 0.80139267, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82265526, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.60546875, + "step": 12342, + "time_per_iteration": 2.5155606269836426 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.01028992, + "balance_loss_clip": 1.01748824, + "balance_loss_mlp": 1.03512716, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.6721865826322508, + "language_loss": 0.77694213, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79823846, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12343, + "time_per_iteration": 2.8234310150146484 + }, + { + "auxiliary_loss_clip": 0.01097938, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02027309, + "balance_loss_mlp": 1.03249693, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.7204142149055508, + "language_loss": 0.67798221, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69927979, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12344, + "time_per_iteration": 3.9860341548919678 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0204283, + "balance_loss_mlp": 1.03553951, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.3379030417701423, + "language_loss": 0.81033051, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83169097, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 12345, + "time_per_iteration": 2.457531213760376 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02450442, + "balance_loss_mlp": 1.0354228, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9598348009853668, + "language_loss": 0.71018803, + "learning_rate": 6.570759861612988e-07, + "loss": 0.73157895, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12346, + "time_per_iteration": 3.8033220767974854 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01851344, + "balance_loss_mlp": 1.03597689, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.5893772785658562, + "language_loss": 0.73678845, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75811887, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12347, + "time_per_iteration": 3.922349691390991 + }, + { + "auxiliary_loss_clip": 0.0110556, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.01888108, + "balance_loss_mlp": 1.03616238, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7507272785973695, + "language_loss": 0.80773383, + "learning_rate": 6.564988754473642e-07, + "loss": 0.82909453, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 12348, + "time_per_iteration": 3.8946139812469482 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.01724422, + "balance_loss_mlp": 1.03434706, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.9451806865791765, + "language_loss": 0.72609961, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74738705, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 12349, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.01105402, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01891899, + "balance_loss_mlp": 1.03512514, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 1.884291217596135, + "language_loss": 0.78724527, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80861974, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 12350, + "time_per_iteration": 2.500523567199707 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.02290213, + "balance_loss_mlp": 1.03446043, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 1.7431994876148182, + "language_loss": 0.74992573, + "learning_rate": 6.556335914965343e-07, + "loss": 0.7712701, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 12351, + "time_per_iteration": 2.570344924926758 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01024897, + "balance_loss_clip": 1.01363814, + "balance_loss_mlp": 1.03487992, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 1.8775764813546454, + "language_loss": 0.81292212, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83417821, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12352, + "time_per_iteration": 2.4442734718322754 + }, + { + "auxiliary_loss_clip": 0.01103269, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02641034, + "balance_loss_mlp": 1.0369432, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.9024946732776964, + "language_loss": 0.71716195, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73857349, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12353, + "time_per_iteration": 2.4757235050201416 + }, + { + "auxiliary_loss_clip": 0.01102245, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.02075243, + "balance_loss_mlp": 1.03749537, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.5592881493961996, + "language_loss": 0.72042692, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74176657, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12354, + "time_per_iteration": 2.4892525672912598 + }, + { + "auxiliary_loss_clip": 0.01024379, + "auxiliary_loss_mlp": 0.01002171, + "balance_loss_clip": 1.00115824, + "balance_loss_mlp": 1.0041914, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.7195367720859078, + "language_loss": 0.595505, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61577046, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 12355, + "time_per_iteration": 3.1565847396850586 + }, + { + "auxiliary_loss_clip": 0.01101716, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.01746547, + "balance_loss_mlp": 1.03509939, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.5856742175038152, + "language_loss": 0.67546952, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69677925, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12356, + "time_per_iteration": 2.4489800930023193 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.02305126, + "balance_loss_mlp": 1.03400218, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 2.760673613642481, + "language_loss": 0.72485077, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74622905, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 12357, + "time_per_iteration": 2.721644401550293 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.0175041, + "balance_loss_mlp": 1.03629148, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 2.0039134107579395, + "language_loss": 0.65105826, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67233098, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 12358, + "time_per_iteration": 2.4294605255126953 + }, + { + "auxiliary_loss_clip": 0.01104584, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.02243936, + "balance_loss_mlp": 1.03652191, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.8081229014020102, + "language_loss": 0.80658948, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82799089, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 12359, + "time_per_iteration": 2.4662840366363525 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01539159, + "balance_loss_mlp": 1.03399527, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.9929370638459747, + "language_loss": 0.68443716, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7057122, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12360, + "time_per_iteration": 2.39972186088562 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.01742589, + "balance_loss_mlp": 1.03408909, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6105929709695739, + "language_loss": 0.72354007, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74482894, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12361, + "time_per_iteration": 2.53438663482666 + }, + { + "auxiliary_loss_clip": 0.01103295, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.03592443, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.9313349058571254, + "language_loss": 0.55937529, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58072412, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12362, + "time_per_iteration": 2.44446063041687 + }, + { + "auxiliary_loss_clip": 0.01103216, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.01801753, + "balance_loss_mlp": 1.03700173, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.6965020963152944, + "language_loss": 0.77103531, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79236591, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12363, + "time_per_iteration": 2.4665377140045166 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01916444, + "balance_loss_mlp": 1.03546381, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.5848696782031413, + "language_loss": 0.781322, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80262709, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12364, + "time_per_iteration": 2.4513514041900635 + }, + { + "auxiliary_loss_clip": 0.01099072, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.02075863, + "balance_loss_mlp": 1.03311908, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.566466537213553, + "language_loss": 0.78534245, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80665576, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12365, + "time_per_iteration": 2.5116143226623535 + }, + { + "auxiliary_loss_clip": 0.01103544, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01939702, + "balance_loss_mlp": 1.03469706, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.5212918722481565, + "language_loss": 0.76719224, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78854513, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 12366, + "time_per_iteration": 2.714674711227417 + }, + { + "auxiliary_loss_clip": 0.0109921, + "auxiliary_loss_mlp": 0.01030031, + "balance_loss_clip": 1.01957047, + "balance_loss_mlp": 1.03603196, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.8098497154463502, + "language_loss": 0.7116037, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73289615, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 12367, + "time_per_iteration": 2.4605956077575684 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.02303672, + "balance_loss_mlp": 1.03570354, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5537878615409826, + "language_loss": 0.74737108, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76876128, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 12368, + "time_per_iteration": 2.4532225131988525 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02200878, + "balance_loss_mlp": 1.03582263, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 2.6437968867522397, + "language_loss": 0.69177192, + "learning_rate": 6.50451533054207e-07, + "loss": 0.7130875, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.63671875, + "step": 12369, + "time_per_iteration": 2.6095521450042725 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01569033, + "balance_loss_mlp": 1.03491139, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.8225441721973505, + "language_loss": 0.75607926, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77735746, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12370, + "time_per_iteration": 2.3974015712738037 + }, + { + "auxiliary_loss_clip": 0.01101812, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03682232, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.5003725500414622, + "language_loss": 0.78235525, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80370772, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 12371, + "time_per_iteration": 2.587583303451538 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.01351357, + "balance_loss_mlp": 1.03449976, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.5904858963552928, + "language_loss": 0.69456738, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71582228, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12372, + "time_per_iteration": 2.46589732170105 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.01876593, + "balance_loss_mlp": 1.03523791, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.0303622627769, + "language_loss": 0.74881828, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77012408, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12373, + "time_per_iteration": 2.429455518722534 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.01918495, + "balance_loss_mlp": 1.03517175, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.8905423318011396, + "language_loss": 0.77127612, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79263097, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 12374, + "time_per_iteration": 2.467027425765991 + }, + { + "auxiliary_loss_clip": 0.01101807, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.02173197, + "balance_loss_mlp": 1.03450108, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.0275286605601903, + "language_loss": 0.76452887, + "learning_rate": 6.487278616990774e-07, + "loss": 0.7858817, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12375, + "time_per_iteration": 2.4504282474517822 + }, + { + "auxiliary_loss_clip": 0.01098205, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.01930046, + "balance_loss_mlp": 1.03446364, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.8957308287031664, + "language_loss": 0.77052188, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79180074, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.63671875, + "step": 12376, + "time_per_iteration": 2.426997423171997 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.01435566, + "balance_loss_mlp": 1.03490746, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.648771332644217, + "language_loss": 0.79147625, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81276488, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 12377, + "time_per_iteration": 2.5062367916107178 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03595889, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 1.8728399382870544, + "language_loss": 0.67017269, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69150138, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 12378, + "time_per_iteration": 2.81579327583313 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.02170622, + "balance_loss_mlp": 1.0356729, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.6381441755645296, + "language_loss": 0.71693718, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73832107, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 12379, + "time_per_iteration": 2.5361573696136475 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.02015245, + "balance_loss_mlp": 1.0342983, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.779117116222904, + "language_loss": 0.6545527, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67588407, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12380, + "time_per_iteration": 2.4233927726745605 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.0180037, + "balance_loss_mlp": 1.0361979, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 1.8649656788405269, + "language_loss": 0.78407371, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80541134, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 12381, + "time_per_iteration": 2.4858570098876953 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.01844049, + "balance_loss_mlp": 1.03706956, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.020102032989411, + "language_loss": 0.726803, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74816334, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12382, + "time_per_iteration": 2.4412596225738525 + }, + { + "auxiliary_loss_clip": 0.01025583, + "auxiliary_loss_mlp": 0.0100093, + "balance_loss_clip": 0.99986947, + "balance_loss_mlp": 1.00515175, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6497921539673587, + "language_loss": 0.5464738, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56673896, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20507812, + "step": 12383, + "time_per_iteration": 3.222402811050415 + }, + { + "auxiliary_loss_clip": 0.01100878, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.0151608, + "balance_loss_mlp": 1.03370833, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 1.9786543947489503, + "language_loss": 0.76230276, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78357792, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 12384, + "time_per_iteration": 2.455353021621704 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.01782155, + "balance_loss_mlp": 1.03452992, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 2.0782969884363816, + "language_loss": 0.79298764, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81428415, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 12385, + "time_per_iteration": 2.4987757205963135 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01921868, + "balance_loss_mlp": 1.03626704, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.6771558108044815, + "language_loss": 0.8143934, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83578682, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 12386, + "time_per_iteration": 3.998560667037964 + }, + { + "auxiliary_loss_clip": 0.01099591, + "auxiliary_loss_mlp": 0.01026498, + "balance_loss_clip": 1.0154351, + "balance_loss_mlp": 1.03524506, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.7576352250203031, + "language_loss": 0.71226764, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73352849, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 12387, + "time_per_iteration": 2.4593608379364014 + }, + { + "auxiliary_loss_clip": 0.01101935, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01612639, + "balance_loss_mlp": 1.03501618, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.162095578178006, + "language_loss": 0.7053076, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72660351, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12388, + "time_per_iteration": 3.8914287090301514 + }, + { + "auxiliary_loss_clip": 0.01102008, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01879215, + "balance_loss_mlp": 1.03539407, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5805660577109513, + "language_loss": 0.84949243, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87081456, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12389, + "time_per_iteration": 4.039583683013916 + }, + { + "auxiliary_loss_clip": 0.01100859, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.01907206, + "balance_loss_mlp": 1.03446209, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.992620566185106, + "language_loss": 0.79385233, + "learning_rate": 6.444267588104526e-07, + "loss": 0.8151679, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12390, + "time_per_iteration": 3.9466896057128906 + }, + { + "auxiliary_loss_clip": 0.01104503, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01669955, + "balance_loss_mlp": 1.03669739, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.730347550558291, + "language_loss": 0.84698212, + "learning_rate": 6.441404294400014e-07, + "loss": 0.86831707, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12391, + "time_per_iteration": 2.493415117263794 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01861715, + "balance_loss_mlp": 1.03483033, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.8306369594039993, + "language_loss": 0.73786843, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75916982, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 12392, + "time_per_iteration": 2.431533098220825 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.02171087, + "balance_loss_mlp": 1.03509498, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.6456666698641875, + "language_loss": 0.76718521, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78850538, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 12393, + "time_per_iteration": 2.473604917526245 + }, + { + "auxiliary_loss_clip": 0.01102478, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.02133441, + "balance_loss_mlp": 1.03579187, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.8111060695117658, + "language_loss": 0.72828883, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74965185, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 12394, + "time_per_iteration": 2.4453284740448 + }, + { + "auxiliary_loss_clip": 0.01103102, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.02332675, + "balance_loss_mlp": 1.03558517, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.668528755901744, + "language_loss": 0.81820607, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83958995, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12395, + "time_per_iteration": 2.4907712936401367 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.02062798, + "balance_loss_mlp": 1.03574276, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 5.4481505993838475, + "language_loss": 0.70923871, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73061752, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12396, + "time_per_iteration": 2.5307369232177734 + }, + { + "auxiliary_loss_clip": 0.01103961, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.0171988, + "balance_loss_mlp": 1.03555429, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.799312565551718, + "language_loss": 0.6829254, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70425701, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 12397, + "time_per_iteration": 2.5126614570617676 + }, + { + "auxiliary_loss_clip": 0.01101329, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.02871311, + "balance_loss_mlp": 1.03490043, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 2.004729126431997, + "language_loss": 0.76321107, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7846328, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12398, + "time_per_iteration": 2.4119622707366943 + }, + { + "auxiliary_loss_clip": 0.01098608, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.01461804, + "balance_loss_mlp": 1.03464854, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.6814125292484552, + "language_loss": 0.77809334, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79934478, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 12399, + "time_per_iteration": 2.4987549781799316 + }, + { + "auxiliary_loss_clip": 0.01097189, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.02345836, + "balance_loss_mlp": 1.03396916, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 1.9741218645460363, + "language_loss": 0.73963678, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76095283, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 12400, + "time_per_iteration": 2.4242513179779053 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.01808882, + "balance_loss_mlp": 1.03357267, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.8638807707826066, + "language_loss": 0.81975746, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84103811, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12401, + "time_per_iteration": 2.5451955795288086 + }, + { + "auxiliary_loss_clip": 0.01100279, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02252901, + "balance_loss_mlp": 1.03501511, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.845084112452823, + "language_loss": 0.65197337, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67331183, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12402, + "time_per_iteration": 2.4577367305755615 + }, + { + "auxiliary_loss_clip": 0.01099262, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.01831102, + "balance_loss_mlp": 1.0342567, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.6576964620220311, + "language_loss": 0.73214388, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75342572, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 12403, + "time_per_iteration": 2.620654821395874 + }, + { + "auxiliary_loss_clip": 0.01024907, + "auxiliary_loss_mlp": 0.01005223, + "balance_loss_clip": 1.00420368, + "balance_loss_mlp": 1.00436723, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8255474672184773, + "language_loss": 0.58760434, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60790563, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20507812, + "step": 12404, + "time_per_iteration": 2.8954858779907227 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02357709, + "balance_loss_mlp": 1.03306055, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.6032592804273305, + "language_loss": 0.77657819, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79792619, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 12405, + "time_per_iteration": 2.470407247543335 + }, + { + "auxiliary_loss_clip": 0.01100531, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.02069402, + "balance_loss_mlp": 1.03543913, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.5461856417653022, + "language_loss": 0.69148755, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71281415, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12406, + "time_per_iteration": 2.4450039863586426 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.02102149, + "balance_loss_mlp": 1.03617549, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.505466725953553, + "language_loss": 0.64742386, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66883421, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 12407, + "time_per_iteration": 2.4332051277160645 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01944911, + "balance_loss_mlp": 1.03494692, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.652287891377431, + "language_loss": 0.72460616, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74595749, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 12408, + "time_per_iteration": 2.4836978912353516 + }, + { + "auxiliary_loss_clip": 0.01106452, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.02128339, + "balance_loss_mlp": 1.03751123, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.7143768507331778, + "language_loss": 0.72858518, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74999118, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 12409, + "time_per_iteration": 2.515709400177002 + }, + { + "auxiliary_loss_clip": 0.01098264, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.02187228, + "balance_loss_mlp": 1.03468859, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.710421587761424, + "language_loss": 0.6618892, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68319571, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 12410, + "time_per_iteration": 2.449406147003174 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.01910925, + "balance_loss_mlp": 1.034163, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.213506116293379, + "language_loss": 0.84104359, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86234152, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 12411, + "time_per_iteration": 2.4625163078308105 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01740742, + "balance_loss_mlp": 1.03515697, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.6692936053556306, + "language_loss": 0.7766965, + "learning_rate": 6.381394060744339e-07, + "loss": 0.79802704, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12412, + "time_per_iteration": 2.4557554721832275 + }, + { + "auxiliary_loss_clip": 0.01102723, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.02409029, + "balance_loss_mlp": 1.03520751, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.834679176534713, + "language_loss": 0.6225034, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64388311, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 12413, + "time_per_iteration": 2.559657573699951 + }, + { + "auxiliary_loss_clip": 0.01024964, + "auxiliary_loss_mlp": 0.01002262, + "balance_loss_clip": 1.00121295, + "balance_loss_mlp": 1.00439072, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7203793484361629, + "language_loss": 0.54924321, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56951547, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20605469, + "step": 12414, + "time_per_iteration": 3.0637338161468506 + }, + { + "auxiliary_loss_clip": 0.01101199, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01846027, + "balance_loss_mlp": 1.03334022, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.860182659182016, + "language_loss": 0.54804456, + "learning_rate": 6.372839737918154e-07, + "loss": 0.56936157, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 12415, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.02022743, + "balance_loss_mlp": 1.0359658, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.6660939393048266, + "language_loss": 0.74985796, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77120394, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 12416, + "time_per_iteration": 2.514845132827759 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02302361, + "balance_loss_mlp": 1.03556323, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.4814223642956346, + "language_loss": 0.69489551, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71625924, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 12417, + "time_per_iteration": 2.6574227809906006 + }, + { + "auxiliary_loss_clip": 0.01104674, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01790345, + "balance_loss_mlp": 1.03659248, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.767590849665872, + "language_loss": 0.73728597, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75863612, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 12418, + "time_per_iteration": 2.462244987487793 + }, + { + "auxiliary_loss_clip": 0.01103226, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01586497, + "balance_loss_mlp": 1.03675175, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.574966460677448, + "language_loss": 0.69369054, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71499598, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12419, + "time_per_iteration": 2.4568960666656494 + }, + { + "auxiliary_loss_clip": 0.01095857, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.02151561, + "balance_loss_mlp": 1.03342533, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.6640874245133943, + "language_loss": 0.74578714, + "learning_rate": 6.358592869514216e-07, + "loss": 0.76707137, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 12420, + "time_per_iteration": 2.5238821506500244 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01943922, + "balance_loss_mlp": 1.03683901, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.6177707150337377, + "language_loss": 0.67195189, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69331217, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12421, + "time_per_iteration": 2.4293341636657715 + }, + { + "auxiliary_loss_clip": 0.01104487, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.01912296, + "balance_loss_mlp": 1.03556955, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.5639142011030407, + "language_loss": 0.72440511, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74577618, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 12422, + "time_per_iteration": 2.509237766265869 + }, + { + "auxiliary_loss_clip": 0.01100612, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.01877022, + "balance_loss_mlp": 1.03514779, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 9.98591332499941, + "language_loss": 0.74842906, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76974201, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12423, + "time_per_iteration": 2.5110628604888916 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.0102642, + "balance_loss_clip": 1.01536298, + "balance_loss_mlp": 1.03383863, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.296938244989713, + "language_loss": 0.67754054, + "learning_rate": 6.347204685245929e-07, + "loss": 0.6987977, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 12424, + "time_per_iteration": 2.4905362129211426 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.02293932, + "balance_loss_mlp": 1.03707027, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.7754548837213033, + "language_loss": 0.74119371, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76259774, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 12425, + "time_per_iteration": 2.5686028003692627 + }, + { + "auxiliary_loss_clip": 0.01101237, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.0353744, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 2.326605643233434, + "language_loss": 0.69533008, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71663666, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12426, + "time_per_iteration": 2.4671969413757324 + }, + { + "auxiliary_loss_clip": 0.01097868, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.024755, + "balance_loss_mlp": 1.03396261, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.6460733379816328, + "language_loss": 0.65486181, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67620206, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 12427, + "time_per_iteration": 2.459057092666626 + }, + { + "auxiliary_loss_clip": 0.01102337, + "auxiliary_loss_mlp": 0.01027971, + "balance_loss_clip": 1.01555538, + "balance_loss_mlp": 1.03570294, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.7506429909383225, + "language_loss": 0.74639595, + "learning_rate": 6.335824784423118e-07, + "loss": 0.767699, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12428, + "time_per_iteration": 3.809513807296753 + }, + { + "auxiliary_loss_clip": 0.0110597, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01710534, + "balance_loss_mlp": 1.0359993, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 2.159964503285926, + "language_loss": 0.58328772, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60465252, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 12429, + "time_per_iteration": 2.420081377029419 + }, + { + "auxiliary_loss_clip": 0.01105592, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.02002382, + "balance_loss_mlp": 1.03655839, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.8822181590488856, + "language_loss": 0.60539925, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62677801, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 12430, + "time_per_iteration": 3.917961359024048 + }, + { + "auxiliary_loss_clip": 0.01102089, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.0366466, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.4375442916697652, + "language_loss": 0.75408334, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77540565, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12431, + "time_per_iteration": 3.8775863647460938 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01829195, + "balance_loss_mlp": 1.03413606, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.7750987800998057, + "language_loss": 0.75931549, + "learning_rate": 6.32445317395021e-07, + "loss": 0.78063631, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12432, + "time_per_iteration": 2.4008095264434814 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.02375436, + "balance_loss_mlp": 1.03559935, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 4.600278612020183, + "language_loss": 0.69874978, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72017932, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 12433, + "time_per_iteration": 3.879322052001953 + }, + { + "auxiliary_loss_clip": 0.01103347, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.01847744, + "balance_loss_mlp": 1.03535938, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.4972431185118094, + "language_loss": 0.67169416, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69303912, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 12434, + "time_per_iteration": 2.45617938041687 + }, + { + "auxiliary_loss_clip": 0.01096539, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03466129, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.5115766265302155, + "language_loss": 0.7984153, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81966752, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 12435, + "time_per_iteration": 2.4689295291900635 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.0165174, + "balance_loss_mlp": 1.03551531, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.9192190166141703, + "language_loss": 0.685781, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70710182, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 12436, + "time_per_iteration": 2.5397560596466064 + }, + { + "auxiliary_loss_clip": 0.01105286, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02078748, + "balance_loss_mlp": 1.0353477, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 2.523256251254823, + "language_loss": 0.70543289, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72681236, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 12437, + "time_per_iteration": 2.5217578411102295 + }, + { + "auxiliary_loss_clip": 0.01096987, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.0336585, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.7258668993948156, + "language_loss": 0.6710937, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69234937, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 12438, + "time_per_iteration": 2.4754526615142822 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.01032026, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.03440166, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6057176452605648, + "language_loss": 0.80471182, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82604539, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12439, + "time_per_iteration": 2.4217841625213623 + }, + { + "auxiliary_loss_clip": 0.01100863, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02369952, + "balance_loss_mlp": 1.034688, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 4.3324890021257065, + "language_loss": 0.70790303, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72926295, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12440, + "time_per_iteration": 2.4390249252319336 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.01672912, + "balance_loss_mlp": 1.03678477, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.6196156366406493, + "language_loss": 0.74209476, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76338577, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12441, + "time_per_iteration": 2.452240467071533 + }, + { + "auxiliary_loss_clip": 0.01104655, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.02294469, + "balance_loss_mlp": 1.0354284, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.0647572412884223, + "language_loss": 0.82613641, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84753811, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 12442, + "time_per_iteration": 2.4386143684387207 + }, + { + "auxiliary_loss_clip": 0.01104883, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01779294, + "balance_loss_mlp": 1.03697157, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.8278548482074275, + "language_loss": 0.62552464, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64687717, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12443, + "time_per_iteration": 2.501383066177368 + }, + { + "auxiliary_loss_clip": 0.01099555, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.01601326, + "balance_loss_mlp": 1.03445029, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.2374686087677365, + "language_loss": 0.71498984, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73625755, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 12444, + "time_per_iteration": 2.4533753395080566 + }, + { + "auxiliary_loss_clip": 0.01101788, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01813221, + "balance_loss_mlp": 1.03564715, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.421192180726323, + "language_loss": 0.68887877, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71019721, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12445, + "time_per_iteration": 2.4437148571014404 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.01025898, + "balance_loss_clip": 1.01420927, + "balance_loss_mlp": 1.0338124, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.6018226461169682, + "language_loss": 0.73926389, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76051313, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 12446, + "time_per_iteration": 2.4290761947631836 + }, + { + "auxiliary_loss_clip": 0.01107586, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01841235, + "balance_loss_mlp": 1.03757131, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 1.8678016899713992, + "language_loss": 0.73009384, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75148046, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 12447, + "time_per_iteration": 2.4282591342926025 + }, + { + "auxiliary_loss_clip": 0.01100481, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01952291, + "balance_loss_mlp": 1.03436816, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.453147122317507, + "language_loss": 0.71330941, + "learning_rate": 6.279049773470109e-07, + "loss": 0.73462141, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 12448, + "time_per_iteration": 2.492389678955078 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02396417, + "balance_loss_mlp": 1.03592634, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.681801443430281, + "language_loss": 0.73694456, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75834262, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12449, + "time_per_iteration": 2.458009958267212 + }, + { + "auxiliary_loss_clip": 0.01107992, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.02537513, + "balance_loss_mlp": 1.03796268, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 4.253717763971936, + "language_loss": 0.6114825, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63294089, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 12450, + "time_per_iteration": 2.50168776512146 + }, + { + "auxiliary_loss_clip": 0.01095887, + "auxiliary_loss_mlp": 0.01024791, + "balance_loss_clip": 1.01391912, + "balance_loss_mlp": 1.03296888, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 2.2078562652579445, + "language_loss": 0.70491904, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72612584, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 12451, + "time_per_iteration": 2.4641804695129395 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.01757646, + "balance_loss_mlp": 1.03587961, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 1.9404174586148812, + "language_loss": 0.80036032, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82171035, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 12452, + "time_per_iteration": 2.472050189971924 + }, + { + "auxiliary_loss_clip": 0.0111029, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.0199945, + "balance_loss_mlp": 1.03968, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 1.9353512881851993, + "language_loss": 0.71305573, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73448426, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 12453, + "time_per_iteration": 2.4257922172546387 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.01754212, + "balance_loss_mlp": 1.03703308, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.4891462542899447, + "language_loss": 0.74149597, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76283646, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12454, + "time_per_iteration": 2.468405246734619 + }, + { + "auxiliary_loss_clip": 0.01025662, + "auxiliary_loss_mlp": 0.01006028, + "balance_loss_clip": 1.00498486, + "balance_loss_mlp": 1.00516868, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7310384566009501, + "language_loss": 0.59401155, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61432838, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12455, + "time_per_iteration": 3.1971945762634277 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.03604209, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 4.934936184310134, + "language_loss": 0.79615253, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81745934, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 12456, + "time_per_iteration": 2.4296135902404785 + }, + { + "auxiliary_loss_clip": 0.0102509, + "auxiliary_loss_mlp": 0.01006564, + "balance_loss_clip": 1.00547349, + "balance_loss_mlp": 1.00471401, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.83989134398578, + "language_loss": 0.61468804, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63500464, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20410156, + "step": 12457, + "time_per_iteration": 2.974139928817749 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888442, + "balance_loss_mlp": 1.03782308, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 2.8899809171397686, + "language_loss": 0.6718834, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69325089, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 12458, + "time_per_iteration": 2.439760684967041 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03471386, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 4.815239058798898, + "language_loss": 0.79521596, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81650984, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 12459, + "time_per_iteration": 2.4311044216156006 + }, + { + "auxiliary_loss_clip": 0.01099542, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01964533, + "balance_loss_mlp": 1.03522098, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.5166660138964414, + "language_loss": 0.80542082, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82673144, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 12460, + "time_per_iteration": 2.467636823654175 + }, + { + "auxiliary_loss_clip": 0.01102889, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.01772666, + "balance_loss_mlp": 1.03595591, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.8187946605999095, + "language_loss": 0.8621248, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88345432, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 12461, + "time_per_iteration": 2.445946216583252 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.01919913, + "balance_loss_mlp": 1.03504705, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 4.4069049168427235, + "language_loss": 0.69474328, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71604145, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12462, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01101104, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.01925659, + "balance_loss_mlp": 1.03515553, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.161569960012567, + "language_loss": 0.70565915, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72698486, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 12463, + "time_per_iteration": 2.4890224933624268 + }, + { + "auxiliary_loss_clip": 0.01103139, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.01886845, + "balance_loss_mlp": 1.03757596, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.815437502169393, + "language_loss": 0.77414626, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79547787, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12464, + "time_per_iteration": 2.5218935012817383 + }, + { + "auxiliary_loss_clip": 0.01097602, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.01478601, + "balance_loss_mlp": 1.03183138, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6191901183341268, + "language_loss": 0.78242761, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80366582, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12465, + "time_per_iteration": 2.438852071762085 + }, + { + "auxiliary_loss_clip": 0.0111071, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.02102232, + "balance_loss_mlp": 1.03788424, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.367319558994289, + "language_loss": 0.73263687, + "learning_rate": 6.22813018144422e-07, + "loss": 0.75408894, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 12466, + "time_per_iteration": 2.4159023761749268 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.02088344, + "balance_loss_mlp": 1.03596592, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.1301146092024004, + "language_loss": 0.66439664, + "learning_rate": 6.22530627064209e-07, + "loss": 0.6857549, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12467, + "time_per_iteration": 2.476149320602417 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01941586, + "balance_loss_mlp": 1.03570294, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.3152910875520982, + "language_loss": 0.76111352, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78245205, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12468, + "time_per_iteration": 2.536062717437744 + }, + { + "auxiliary_loss_clip": 0.01101389, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01648641, + "balance_loss_mlp": 1.03586531, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.6980590171523238, + "language_loss": 0.69451874, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71582359, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 12469, + "time_per_iteration": 3.8304295539855957 + }, + { + "auxiliary_loss_clip": 0.01102636, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.01960647, + "balance_loss_mlp": 1.03584695, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.8066582872371235, + "language_loss": 0.68950933, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71085578, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 12470, + "time_per_iteration": 2.478144645690918 + }, + { + "auxiliary_loss_clip": 0.01105048, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.0178256, + "balance_loss_mlp": 1.03487074, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 2.8963816737460606, + "language_loss": 0.74823713, + "learning_rate": 6.214015851881793e-07, + "loss": 0.76960123, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 12471, + "time_per_iteration": 3.9513978958129883 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.01577377, + "balance_loss_mlp": 1.03611624, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 1.9482854997068855, + "language_loss": 0.76652914, + "learning_rate": 6.211194553838929e-07, + "loss": 0.78784305, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12472, + "time_per_iteration": 3.9247841835021973 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01829672, + "balance_loss_mlp": 1.03378856, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.4581749540086286, + "language_loss": 0.84420872, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86550772, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 12473, + "time_per_iteration": 2.460721492767334 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.0219121, + "balance_loss_mlp": 1.03714895, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 1.9225859728755545, + "language_loss": 0.73670536, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75813174, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 12474, + "time_per_iteration": 3.8605411052703857 + }, + { + "auxiliary_loss_clip": 0.01106384, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.02101982, + "balance_loss_mlp": 1.03537238, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 1.6563775017925497, + "language_loss": 0.74591839, + "learning_rate": 6.202733797375492e-07, + "loss": 0.7673192, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 12475, + "time_per_iteration": 2.42132830619812 + }, + { + "auxiliary_loss_clip": 0.01108313, + "auxiliary_loss_mlp": 0.01039073, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.0368228, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 3.53790302868858, + "language_loss": 0.80186552, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82333934, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 12476, + "time_per_iteration": 2.4238805770874023 + }, + { + "auxiliary_loss_clip": 0.01101438, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.01843047, + "balance_loss_mlp": 1.03465772, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.8885808312532115, + "language_loss": 0.77860969, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79992652, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12477, + "time_per_iteration": 2.4582700729370117 + }, + { + "auxiliary_loss_clip": 0.01025103, + "auxiliary_loss_mlp": 0.01003277, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00471592, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8084596961185327, + "language_loss": 0.54396832, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56425214, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20410156, + "step": 12478, + "time_per_iteration": 3.0614583492279053 + }, + { + "auxiliary_loss_clip": 0.01100592, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.02021098, + "balance_loss_mlp": 1.0357542, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.732296797104268, + "language_loss": 0.80400872, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82533485, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12479, + "time_per_iteration": 2.4517574310302734 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.0231421, + "balance_loss_mlp": 1.03617644, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.2068384473951386, + "language_loss": 0.62537003, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64678824, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 12480, + "time_per_iteration": 2.464008092880249 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.02187634, + "balance_loss_mlp": 1.03546023, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.8758461375908144, + "language_loss": 0.77756959, + "learning_rate": 6.185826413564512e-07, + "loss": 0.79889536, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12481, + "time_per_iteration": 2.457960367202759 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.03513408, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.6027939437318084, + "language_loss": 0.70975888, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73110282, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12482, + "time_per_iteration": 2.505486011505127 + }, + { + "auxiliary_loss_clip": 0.01103914, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.02004409, + "balance_loss_mlp": 1.03608012, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.6593432935882615, + "language_loss": 0.70126545, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72262096, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12483, + "time_per_iteration": 2.5372493267059326 + }, + { + "auxiliary_loss_clip": 0.01101463, + "auxiliary_loss_mlp": 0.01025502, + "balance_loss_clip": 1.01488626, + "balance_loss_mlp": 1.03527784, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 1.8314217473162897, + "language_loss": 0.74355495, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76482463, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 12484, + "time_per_iteration": 2.482421636581421 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.01453543, + "balance_loss_mlp": 1.03498316, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.0535325266367153, + "language_loss": 0.84864926, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86992133, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 12485, + "time_per_iteration": 2.446956157684326 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03467631, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.4660860594284646, + "language_loss": 0.77995837, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80127156, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12486, + "time_per_iteration": 2.517058849334717 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01999187, + "balance_loss_mlp": 1.03714168, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 1.8190391114760833, + "language_loss": 0.72836137, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74975049, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 12487, + "time_per_iteration": 2.5011062622070312 + }, + { + "auxiliary_loss_clip": 0.01102568, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.01789975, + "balance_loss_mlp": 1.03470707, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.9101645594404746, + "language_loss": 0.67258334, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69390637, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12488, + "time_per_iteration": 2.4733595848083496 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.01859736, + "balance_loss_mlp": 1.03543329, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.5954829957007908, + "language_loss": 0.77207714, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79340684, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 12489, + "time_per_iteration": 2.41869854927063 + }, + { + "auxiliary_loss_clip": 0.01100051, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.01907802, + "balance_loss_mlp": 1.03533888, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.8920646114871729, + "language_loss": 0.75356829, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77486563, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 12490, + "time_per_iteration": 2.530346155166626 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01762867, + "balance_loss_mlp": 1.03632128, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.6040694673861557, + "language_loss": 0.78232539, + "learning_rate": 6.157689358715527e-07, + "loss": 0.8036443, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12491, + "time_per_iteration": 2.445436954498291 + }, + { + "auxiliary_loss_clip": 0.01097554, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01876545, + "balance_loss_mlp": 1.03334594, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 2.0707908886127813, + "language_loss": 0.76477361, + "learning_rate": 6.154878538430899e-07, + "loss": 0.7860415, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 12492, + "time_per_iteration": 2.4592933654785156 + }, + { + "auxiliary_loss_clip": 0.01098246, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01735926, + "balance_loss_mlp": 1.03225935, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 2.019943812075004, + "language_loss": 0.71320605, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73447198, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 12493, + "time_per_iteration": 2.420647621154785 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01421213, + "balance_loss_mlp": 1.03696609, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.620130382276632, + "language_loss": 0.80576169, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82705963, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12494, + "time_per_iteration": 2.4511101245880127 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03543544, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 2.1793596151447208, + "language_loss": 0.78629243, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80761278, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12495, + "time_per_iteration": 2.4220409393310547 + }, + { + "auxiliary_loss_clip": 0.01101733, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.02359903, + "balance_loss_mlp": 1.0354476, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.0360130649256183, + "language_loss": 0.70592833, + "learning_rate": 6.143640508441898e-07, + "loss": 0.72730023, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12496, + "time_per_iteration": 2.4752755165100098 + }, + { + "auxiliary_loss_clip": 0.01102064, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01929259, + "balance_loss_mlp": 1.03579581, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.644722371980129, + "language_loss": 0.77970195, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80102611, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 12497, + "time_per_iteration": 2.4557857513427734 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02162886, + "balance_loss_mlp": 1.03516352, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.5625953994029207, + "language_loss": 0.7667886, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78815353, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12498, + "time_per_iteration": 2.4923367500305176 + }, + { + "auxiliary_loss_clip": 0.01100471, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.03681421, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.712775881225065, + "language_loss": 0.74015152, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76146924, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 12499, + "time_per_iteration": 2.42573618888855 + }, + { + "auxiliary_loss_clip": 0.0109767, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.01657331, + "balance_loss_mlp": 1.03243327, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.6175707927072787, + "language_loss": 0.7927863, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81403816, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 12500, + "time_per_iteration": 2.4984662532806396 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.01993728, + "balance_loss_mlp": 1.03732872, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 2.410232320418393, + "language_loss": 0.73039198, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75182259, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 12501, + "time_per_iteration": 2.4204771518707275 + }, + { + "auxiliary_loss_clip": 0.01098599, + "auxiliary_loss_mlp": 0.01025182, + "balance_loss_clip": 1.01324964, + "balance_loss_mlp": 1.0327723, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.6630444702124707, + "language_loss": 0.7825129, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80375075, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12502, + "time_per_iteration": 2.4997878074645996 + }, + { + "auxiliary_loss_clip": 0.01102781, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.01995397, + "balance_loss_mlp": 1.03561115, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.7088747693103663, + "language_loss": 0.70608878, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72743809, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12503, + "time_per_iteration": 2.407897472381592 + }, + { + "auxiliary_loss_clip": 0.01024599, + "auxiliary_loss_mlp": 0.01000364, + "balance_loss_clip": 0.99929094, + "balance_loss_mlp": 1.00410652, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9807627668089319, + "language_loss": 0.63942432, + "learning_rate": 6.121189676133903e-07, + "loss": 0.65967393, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20507812, + "step": 12504, + "time_per_iteration": 2.995584726333618 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.01842678, + "balance_loss_mlp": 1.03316665, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 2.135704139669575, + "language_loss": 0.68474889, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70601666, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 12505, + "time_per_iteration": 2.5871872901916504 + }, + { + "auxiliary_loss_clip": 0.01024908, + "auxiliary_loss_mlp": 0.00998595, + "balance_loss_clip": 0.99765915, + "balance_loss_mlp": 1.00445008, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6625472273588794, + "language_loss": 0.5508914, + "learning_rate": 6.11558222878809e-07, + "loss": 0.57112646, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20507812, + "step": 12506, + "time_per_iteration": 3.1377921104431152 + }, + { + "auxiliary_loss_clip": 0.01105218, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.03739369, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.061903152831647, + "language_loss": 0.78302479, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80440837, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12507, + "time_per_iteration": 2.4135823249816895 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.01779842, + "balance_loss_mlp": 1.03669238, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.6731769986850884, + "language_loss": 0.71181047, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73311001, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12508, + "time_per_iteration": 2.4572551250457764 + }, + { + "auxiliary_loss_clip": 0.01099119, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.01687193, + "balance_loss_mlp": 1.03434396, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.7139417588852437, + "language_loss": 0.71999872, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74127567, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 12509, + "time_per_iteration": 2.432441473007202 + }, + { + "auxiliary_loss_clip": 0.01107542, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.02306163, + "balance_loss_mlp": 1.03668177, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.5641902395179517, + "language_loss": 0.61837184, + "learning_rate": 6.104373652928785e-07, + "loss": 0.63981068, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 12510, + "time_per_iteration": 2.483800172805786 + }, + { + "auxiliary_loss_clip": 0.01098004, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.01876235, + "balance_loss_mlp": 1.03506911, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.6552475399559823, + "language_loss": 0.81871247, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83999723, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 12511, + "time_per_iteration": 3.882760524749756 + }, + { + "auxiliary_loss_clip": 0.01105136, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.02165818, + "balance_loss_mlp": 1.03586888, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8295208531594718, + "language_loss": 0.7603333, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78172445, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12512, + "time_per_iteration": 2.38800048828125 + }, + { + "auxiliary_loss_clip": 0.01094203, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.01646113, + "balance_loss_mlp": 1.03219318, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.6286622984961852, + "language_loss": 0.82186234, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84307897, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 12513, + "time_per_iteration": 3.94989013671875 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.02108955, + "balance_loss_mlp": 1.03550029, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 1.990000011048308, + "language_loss": 0.75192893, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77329987, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 12514, + "time_per_iteration": 3.8526079654693604 + }, + { + "auxiliary_loss_clip": 0.01096596, + "auxiliary_loss_mlp": 0.01029324, + "balance_loss_clip": 1.01869035, + "balance_loss_mlp": 1.03373909, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.7973618299480842, + "language_loss": 0.68311769, + "learning_rate": 6.090374789680271e-07, + "loss": 0.70437688, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 12515, + "time_per_iteration": 2.394958257675171 + }, + { + "auxiliary_loss_clip": 0.01101823, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.0206188, + "balance_loss_mlp": 1.03523326, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 2.066116424023424, + "language_loss": 0.70559716, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72693491, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12516, + "time_per_iteration": 3.9556925296783447 + }, + { + "auxiliary_loss_clip": 0.01099405, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.01887429, + "balance_loss_mlp": 1.0354656, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 2.2302621688638764, + "language_loss": 0.8934896, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91479456, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.640625, + "step": 12517, + "time_per_iteration": 2.47792387008667 + }, + { + "auxiliary_loss_clip": 0.01103304, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.02259493, + "balance_loss_mlp": 1.03562522, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.6178525628265004, + "language_loss": 0.74129748, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76267111, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 12518, + "time_per_iteration": 2.462576150894165 + }, + { + "auxiliary_loss_clip": 0.01024303, + "auxiliary_loss_mlp": 0.00999013, + "balance_loss_clip": 0.99803591, + "balance_loss_mlp": 1.00396061, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7063379492670796, + "language_loss": 0.55728912, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57752228, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 12519, + "time_per_iteration": 3.1375198364257812 + }, + { + "auxiliary_loss_clip": 0.01099253, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.02270663, + "balance_loss_mlp": 1.03384554, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.4310986441379439, + "language_loss": 0.7804352, + "learning_rate": 6.07638911279029e-07, + "loss": 0.80175972, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65625, + "step": 12520, + "time_per_iteration": 2.456511974334717 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.02335954, + "balance_loss_mlp": 1.03329098, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 4.550524012485904, + "language_loss": 0.74427485, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76560634, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 12521, + "time_per_iteration": 2.492000102996826 + }, + { + "auxiliary_loss_clip": 0.01106943, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01836157, + "balance_loss_mlp": 1.03684282, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.9335055849585505, + "language_loss": 0.67128062, + "learning_rate": 6.070798537185016e-07, + "loss": 0.6926614, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 12522, + "time_per_iteration": 2.4961695671081543 + }, + { + "auxiliary_loss_clip": 0.01105031, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.02371216, + "balance_loss_mlp": 1.03653431, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 2.7941692603753565, + "language_loss": 0.78211427, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80351675, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 12523, + "time_per_iteration": 2.507122039794922 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01675415, + "balance_loss_mlp": 1.03509927, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 2.0548195739736603, + "language_loss": 0.80642009, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82770348, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 12524, + "time_per_iteration": 2.470827579498291 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.0362587, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.5342669106186173, + "language_loss": 0.7387985, + "learning_rate": 6.062416635517326e-07, + "loss": 0.76010329, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12525, + "time_per_iteration": 2.506251335144043 + }, + { + "auxiliary_loss_clip": 0.01100462, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01732588, + "balance_loss_mlp": 1.03503311, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.881783434485301, + "language_loss": 0.71693766, + "learning_rate": 6.059623725182641e-07, + "loss": 0.73822856, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12526, + "time_per_iteration": 2.4697048664093018 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01025174, + "balance_loss_clip": 1.01402175, + "balance_loss_mlp": 1.0336082, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 2.5161959473083675, + "language_loss": 0.71867061, + "learning_rate": 6.056831343468414e-07, + "loss": 0.73990887, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12527, + "time_per_iteration": 2.544797658920288 + }, + { + "auxiliary_loss_clip": 0.01099923, + "auxiliary_loss_mlp": 0.01025133, + "balance_loss_clip": 1.01430297, + "balance_loss_mlp": 1.03523958, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.8815008802332143, + "language_loss": 0.80829144, + "learning_rate": 6.054039490480539e-07, + "loss": 0.82954198, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12528, + "time_per_iteration": 2.4095561504364014 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.0207355, + "balance_loss_mlp": 1.03425789, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 1.941676529480235, + "language_loss": 0.84620762, + "learning_rate": 6.051248166324892e-07, + "loss": 0.86754632, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12529, + "time_per_iteration": 2.4949631690979004 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0194391, + "balance_loss_mlp": 1.03682232, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.741456594396521, + "language_loss": 0.73868054, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76005387, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 12530, + "time_per_iteration": 2.414186954498291 + }, + { + "auxiliary_loss_clip": 0.01024012, + "auxiliary_loss_mlp": 0.01001757, + "balance_loss_clip": 1.00077367, + "balance_loss_mlp": 1.00382376, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8225360852867398, + "language_loss": 0.63598192, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65623963, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 12531, + "time_per_iteration": 2.9014906883239746 + }, + { + "auxiliary_loss_clip": 0.0110411, + "auxiliary_loss_mlp": 0.0102764, + "balance_loss_clip": 1.0154748, + "balance_loss_mlp": 1.03552723, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 2.4431425943596876, + "language_loss": 0.69780314, + "learning_rate": 6.042877367909633e-07, + "loss": 0.71912062, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12532, + "time_per_iteration": 2.4260380268096924 + }, + { + "auxiliary_loss_clip": 0.01097275, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.01846051, + "balance_loss_mlp": 1.03496122, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.5569948577505761, + "language_loss": 0.77583849, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79709774, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62109375, + "step": 12533, + "time_per_iteration": 2.454207181930542 + }, + { + "auxiliary_loss_clip": 0.01024523, + "auxiliary_loss_mlp": 0.01002703, + "balance_loss_clip": 1.00167179, + "balance_loss_mlp": 1.00402224, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7822513714763298, + "language_loss": 0.57376039, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59403265, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20507812, + "step": 12534, + "time_per_iteration": 3.077544927597046 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.0102723, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03252482, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.4171340268037091, + "language_loss": 0.71380311, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73505425, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12535, + "time_per_iteration": 2.526242971420288 + }, + { + "auxiliary_loss_clip": 0.01100548, + "auxiliary_loss_mlp": 0.01028567, + "balance_loss_clip": 1.01628292, + "balance_loss_mlp": 1.03258336, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.6321998046367074, + "language_loss": 0.80901384, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83030498, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12536, + "time_per_iteration": 2.472864866256714 + }, + { + "auxiliary_loss_clip": 0.01096541, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.01334548, + "balance_loss_mlp": 1.03342223, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 1.9374714714672148, + "language_loss": 0.74261057, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76382011, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 12537, + "time_per_iteration": 2.5162243843078613 + }, + { + "auxiliary_loss_clip": 0.01101972, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.01921487, + "balance_loss_mlp": 1.03531504, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.6037731039814345, + "language_loss": 0.74178267, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76311255, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12538, + "time_per_iteration": 2.3771462440490723 + }, + { + "auxiliary_loss_clip": 0.01102251, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01944685, + "balance_loss_mlp": 1.03522778, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.599430575608072, + "language_loss": 0.6738885, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69522071, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12539, + "time_per_iteration": 2.4771296977996826 + }, + { + "auxiliary_loss_clip": 0.01099836, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.03530288, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.910954039527726, + "language_loss": 0.74824083, + "learning_rate": 6.020578533797229e-07, + "loss": 0.7695052, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 12540, + "time_per_iteration": 2.4341037273406982 + }, + { + "auxiliary_loss_clip": 0.01102106, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.01606107, + "balance_loss_mlp": 1.03418863, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 1.9945629348385325, + "language_loss": 0.72719324, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74849451, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 12541, + "time_per_iteration": 2.393623113632202 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01907074, + "balance_loss_mlp": 1.03469777, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 2.0115318030709277, + "language_loss": 0.72047889, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74178648, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 12542, + "time_per_iteration": 2.635145902633667 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.01399565, + "balance_loss_mlp": 1.03344584, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.9065173152707051, + "language_loss": 0.84603345, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86726964, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12543, + "time_per_iteration": 2.428612232208252 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01840305, + "balance_loss_mlp": 1.03965712, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.1119731634282766, + "language_loss": 0.73896754, + "learning_rate": 6.009441835784927e-07, + "loss": 0.76030856, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 12544, + "time_per_iteration": 2.4670307636260986 + }, + { + "auxiliary_loss_clip": 0.01102346, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0182302, + "balance_loss_mlp": 1.03597724, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 2.101942602107972, + "language_loss": 0.6828922, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70420957, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12545, + "time_per_iteration": 2.459852933883667 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01913524, + "balance_loss_mlp": 1.0337708, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.8429570719628683, + "language_loss": 0.68578523, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70709527, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12546, + "time_per_iteration": 2.420004367828369 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.0202651, + "balance_loss_mlp": 1.03451025, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.2369205909253917, + "language_loss": 0.73266494, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75400406, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 12547, + "time_per_iteration": 2.4736859798431396 + }, + { + "auxiliary_loss_clip": 0.01101024, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.01246762, + "balance_loss_mlp": 1.03465009, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.7870453962384887, + "language_loss": 0.67817152, + "learning_rate": 5.998313626146099e-07, + "loss": 0.699431, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 12548, + "time_per_iteration": 2.443042755126953 + }, + { + "auxiliary_loss_clip": 0.01103041, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02093458, + "balance_loss_mlp": 1.03505886, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.7833036384787766, + "language_loss": 0.87229598, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89365441, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12549, + "time_per_iteration": 2.4908969402313232 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.0338217, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.9918391310756007, + "language_loss": 0.76892895, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79025269, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 12550, + "time_per_iteration": 2.5220580101013184 + }, + { + "auxiliary_loss_clip": 0.01101116, + "auxiliary_loss_mlp": 0.01025163, + "balance_loss_clip": 1.01411855, + "balance_loss_mlp": 1.0339551, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.4369467492375085, + "language_loss": 0.69346207, + "learning_rate": 5.98997304347386e-07, + "loss": 0.7147249, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 12551, + "time_per_iteration": 2.517190933227539 + }, + { + "auxiliary_loss_clip": 0.0110311, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.01450872, + "balance_loss_mlp": 1.03722537, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 1.8744654131641019, + "language_loss": 0.86030054, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88159549, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 12552, + "time_per_iteration": 2.402366876602173 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01788902, + "balance_loss_mlp": 1.03502691, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 2.4665346108502533, + "language_loss": 0.78498495, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80629647, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12553, + "time_per_iteration": 3.900495767593384 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.02062142, + "balance_loss_mlp": 1.03664863, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.2168137149261518, + "language_loss": 0.62832999, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64969027, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12554, + "time_per_iteration": 2.517960786819458 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.01661491, + "balance_loss_mlp": 1.03377116, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.582375661492136, + "language_loss": 0.73297715, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75425136, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 12555, + "time_per_iteration": 3.861729145050049 + }, + { + "auxiliary_loss_clip": 0.01105045, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.01683104, + "balance_loss_mlp": 1.03707051, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 2.3601676718523956, + "language_loss": 0.78618932, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80752885, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12556, + "time_per_iteration": 3.837012529373169 + }, + { + "auxiliary_loss_clip": 0.01024032, + "auxiliary_loss_mlp": 0.0100246, + "balance_loss_clip": 1.00142884, + "balance_loss_mlp": 1.00368142, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.708139285400587, + "language_loss": 0.50455654, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52482152, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 12557, + "time_per_iteration": 4.464947462081909 + }, + { + "auxiliary_loss_clip": 0.01105013, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01530528, + "balance_loss_mlp": 1.03735423, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.6820502805276656, + "language_loss": 0.71426684, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73559499, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12558, + "time_per_iteration": 2.4628171920776367 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.01590014, + "balance_loss_mlp": 1.0340848, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.7073621929136382, + "language_loss": 0.80198216, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82324797, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12559, + "time_per_iteration": 2.5296967029571533 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.01577234, + "balance_loss_mlp": 1.03483188, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.6276492932782158, + "language_loss": 0.78893793, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81024003, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12560, + "time_per_iteration": 2.5170834064483643 + }, + { + "auxiliary_loss_clip": 0.01098646, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02261329, + "balance_loss_mlp": 1.03458691, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.7060183642703433, + "language_loss": 0.70997584, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73130047, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12561, + "time_per_iteration": 2.491224765777588 + }, + { + "auxiliary_loss_clip": 0.01099644, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03315794, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.5670310978935318, + "language_loss": 0.75664687, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77792597, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12562, + "time_per_iteration": 2.5043649673461914 + }, + { + "auxiliary_loss_clip": 0.0110067, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.01738989, + "balance_loss_mlp": 1.03472567, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 2.5989481487272426, + "language_loss": 0.75632036, + "learning_rate": 5.956658554770371e-07, + "loss": 0.77762067, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12563, + "time_per_iteration": 2.461552143096924 + }, + { + "auxiliary_loss_clip": 0.0110889, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.02043235, + "balance_loss_mlp": 1.03629291, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.463791742652493, + "language_loss": 0.67465413, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69608808, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7265625, + "step": 12564, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.01104188, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.02057433, + "balance_loss_mlp": 1.03584766, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.2259446193296943, + "language_loss": 0.68585801, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70722699, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12565, + "time_per_iteration": 2.473606586456299 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.01519513, + "balance_loss_mlp": 1.0340724, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.6339568808166163, + "language_loss": 0.7538799, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77519131, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 12566, + "time_per_iteration": 2.4602677822113037 + }, + { + "auxiliary_loss_clip": 0.01105793, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.01922655, + "balance_loss_mlp": 1.03676701, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 2.446271815399535, + "language_loss": 0.73930967, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76069355, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 12567, + "time_per_iteration": 2.482639789581299 + }, + { + "auxiliary_loss_clip": 0.01100485, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.01764655, + "balance_loss_mlp": 1.03495455, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.8407945721596504, + "language_loss": 0.62615836, + "learning_rate": 5.942800139684073e-07, + "loss": 0.6474514, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 12568, + "time_per_iteration": 2.5483205318450928 + }, + { + "auxiliary_loss_clip": 0.01101205, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.01934648, + "balance_loss_mlp": 1.03582668, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 1.9963818018777864, + "language_loss": 0.66748881, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68880689, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12569, + "time_per_iteration": 2.659467935562134 + }, + { + "auxiliary_loss_clip": 0.01105651, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.01936173, + "balance_loss_mlp": 1.03600824, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.6607243680943589, + "language_loss": 0.67248321, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69386601, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 12570, + "time_per_iteration": 2.4708566665649414 + }, + { + "auxiliary_loss_clip": 0.01102793, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02071762, + "balance_loss_mlp": 1.0357399, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.8220604458329166, + "language_loss": 0.7152952, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73665303, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12571, + "time_per_iteration": 2.7677295207977295 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.01681685, + "balance_loss_mlp": 1.03468394, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.6120967066403376, + "language_loss": 0.73383725, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75515598, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 12572, + "time_per_iteration": 2.457766056060791 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.02069271, + "balance_loss_mlp": 1.03641462, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 1.9236315061860603, + "language_loss": 0.76293039, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78429818, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12573, + "time_per_iteration": 2.419971466064453 + }, + { + "auxiliary_loss_clip": 0.01105728, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.02172422, + "balance_loss_mlp": 1.03609872, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.703385006523425, + "language_loss": 0.69107687, + "learning_rate": 5.926187633398527e-07, + "loss": 0.7124694, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 12574, + "time_per_iteration": 2.4180386066436768 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.02082789, + "balance_loss_mlp": 1.03441286, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.2423644939518423, + "language_loss": 0.7207917, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74212122, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 12575, + "time_per_iteration": 2.455258846282959 + }, + { + "auxiliary_loss_clip": 0.01098947, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02219176, + "balance_loss_mlp": 1.03365374, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.02730026321769, + "language_loss": 0.72025073, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74157435, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 12576, + "time_per_iteration": 2.4121248722076416 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.0213263, + "balance_loss_mlp": 1.03535473, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 1.8488663808999763, + "language_loss": 0.67365032, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69499528, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 12577, + "time_per_iteration": 2.4330592155456543 + }, + { + "auxiliary_loss_clip": 0.0110105, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02031672, + "balance_loss_mlp": 1.03520989, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.669663040088463, + "language_loss": 0.78626776, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80759561, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12578, + "time_per_iteration": 2.4133589267730713 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01851618, + "balance_loss_mlp": 1.03413773, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.4105288225039079, + "language_loss": 0.75553155, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77685523, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12579, + "time_per_iteration": 2.465855836868286 + }, + { + "auxiliary_loss_clip": 0.01107073, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.01840019, + "balance_loss_mlp": 1.03599763, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 1.9246022226121349, + "language_loss": 0.62678003, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64816135, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 12580, + "time_per_iteration": 2.5613341331481934 + }, + { + "auxiliary_loss_clip": 0.01102863, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02075255, + "balance_loss_mlp": 1.03644252, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.7113026290728908, + "language_loss": 0.74942124, + "learning_rate": 5.906830660110691e-07, + "loss": 0.7707727, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12581, + "time_per_iteration": 2.4502360820770264 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.01828623, + "balance_loss_mlp": 1.03389621, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 2.005641504780856, + "language_loss": 0.6295954, + "learning_rate": 5.904067515031412e-07, + "loss": 0.6509093, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12582, + "time_per_iteration": 2.4572439193725586 + }, + { + "auxiliary_loss_clip": 0.01023883, + "auxiliary_loss_mlp": 0.01000227, + "balance_loss_clip": 0.99921417, + "balance_loss_mlp": 1.00362778, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9810901823792554, + "language_loss": 0.60704458, + "learning_rate": 5.901304904471307e-07, + "loss": 0.6272856, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.203125, + "step": 12583, + "time_per_iteration": 2.7996931076049805 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.02372694, + "balance_loss_mlp": 1.03601849, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.0250696621760413, + "language_loss": 0.78582263, + "learning_rate": 5.898542828535125e-07, + "loss": 0.80721629, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12584, + "time_per_iteration": 2.400280475616455 + }, + { + "auxiliary_loss_clip": 0.01099872, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01651096, + "balance_loss_mlp": 1.03562188, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.69321954136788, + "language_loss": 0.77584487, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79711974, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 12585, + "time_per_iteration": 2.4472086429595947 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.02196097, + "balance_loss_mlp": 1.03907382, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.6081546851080741, + "language_loss": 0.82765162, + "learning_rate": 5.893020280953493e-07, + "loss": 0.84908152, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 12586, + "time_per_iteration": 2.4276626110076904 + }, + { + "auxiliary_loss_clip": 0.01106519, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01965213, + "balance_loss_mlp": 1.03753841, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 1.873036053279186, + "language_loss": 0.83275306, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85412443, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.69140625, + "step": 12587, + "time_per_iteration": 2.4600062370300293 + }, + { + "auxiliary_loss_clip": 0.01100482, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.01739788, + "balance_loss_mlp": 1.03461528, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.6080398539976855, + "language_loss": 0.71293926, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73423636, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12588, + "time_per_iteration": 2.460441827774048 + }, + { + "auxiliary_loss_clip": 0.01105135, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.01644826, + "balance_loss_mlp": 1.03631103, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.7772907750031848, + "language_loss": 0.68223751, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70357823, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12589, + "time_per_iteration": 2.4796125888824463 + }, + { + "auxiliary_loss_clip": 0.0110204, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.01805186, + "balance_loss_mlp": 1.0352689, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 2.5553015061472326, + "language_loss": 0.91916406, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94048315, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12590, + "time_per_iteration": 2.4198997020721436 + }, + { + "auxiliary_loss_clip": 0.01098826, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01576114, + "balance_loss_mlp": 1.03452909, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.7917701509519888, + "language_loss": 0.65428317, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67554283, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 12591, + "time_per_iteration": 2.56341814994812 + }, + { + "auxiliary_loss_clip": 0.01102228, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.01525116, + "balance_loss_mlp": 1.03707504, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.511094647527582, + "language_loss": 0.73406184, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75534725, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12592, + "time_per_iteration": 2.474759340286255 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.02331686, + "balance_loss_mlp": 1.03412235, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.1319710484730074, + "language_loss": 0.7111423, + "learning_rate": 5.873708220461522e-07, + "loss": 0.7325182, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12593, + "time_per_iteration": 2.4273533821105957 + }, + { + "auxiliary_loss_clip": 0.01104658, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.01837587, + "balance_loss_mlp": 1.03637433, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 1.845375608838855, + "language_loss": 0.66037387, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68172151, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 12594, + "time_per_iteration": 3.838972568511963 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.02056789, + "balance_loss_mlp": 1.03537512, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.5603399133411295, + "language_loss": 0.80766582, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82903558, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 12595, + "time_per_iteration": 2.4764091968536377 + }, + { + "auxiliary_loss_clip": 0.01102369, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03633952, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.8048420186435026, + "language_loss": 0.71071315, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73204786, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12596, + "time_per_iteration": 3.9183623790740967 + }, + { + "auxiliary_loss_clip": 0.01100386, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01918721, + "balance_loss_mlp": 1.0357343, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.5321566367759303, + "language_loss": 0.80469054, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82599676, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12597, + "time_per_iteration": 3.962346076965332 + }, + { + "auxiliary_loss_clip": 0.01108273, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.01784658, + "balance_loss_mlp": 1.03885663, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.9840297783183698, + "language_loss": 0.83408284, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85547137, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 12598, + "time_per_iteration": 2.4496231079101562 + }, + { + "auxiliary_loss_clip": 0.01101103, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.01322365, + "balance_loss_mlp": 1.03596723, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.806795486884082, + "language_loss": 0.62383306, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64508563, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12599, + "time_per_iteration": 3.9129881858825684 + }, + { + "auxiliary_loss_clip": 0.01106489, + "auxiliary_loss_mlp": 0.01033726, + "balance_loss_clip": 1.02027953, + "balance_loss_mlp": 1.03697991, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.7350879991531523, + "language_loss": 0.62593752, + "learning_rate": 5.854422407815161e-07, + "loss": 0.6473397, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 12600, + "time_per_iteration": 2.3905975818634033 + }, + { + "auxiliary_loss_clip": 0.01100395, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01535034, + "balance_loss_mlp": 1.03529775, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.9463870297593193, + "language_loss": 0.66116518, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68244064, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12601, + "time_per_iteration": 2.4491307735443115 + }, + { + "auxiliary_loss_clip": 0.01099051, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.02029419, + "balance_loss_mlp": 1.03474712, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.5554220634885219, + "language_loss": 0.67926621, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70056915, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 12602, + "time_per_iteration": 2.421680450439453 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.0201081, + "balance_loss_mlp": 1.03649664, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.7612852584963323, + "language_loss": 0.67052841, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69189405, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12603, + "time_per_iteration": 2.4140625 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03334355, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 1.9150574683213546, + "language_loss": 0.61476982, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63605225, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 12604, + "time_per_iteration": 2.4143993854522705 + }, + { + "auxiliary_loss_clip": 0.0110333, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.02098989, + "balance_loss_mlp": 1.03802633, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.9881999977626783, + "language_loss": 0.80013704, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82149595, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12605, + "time_per_iteration": 2.4703023433685303 + }, + { + "auxiliary_loss_clip": 0.01105019, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.01616216, + "balance_loss_mlp": 1.03563833, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.509488145051598, + "language_loss": 0.78940737, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81074387, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 12606, + "time_per_iteration": 2.4461817741394043 + }, + { + "auxiliary_loss_clip": 0.01098445, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.01545978, + "balance_loss_mlp": 1.03513336, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.3687592276329898, + "language_loss": 0.73185945, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75309968, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6328125, + "step": 12607, + "time_per_iteration": 2.4908721446990967 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.01622605, + "balance_loss_mlp": 1.03433669, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 2.4968443331698635, + "language_loss": 0.75006789, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7713939, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12608, + "time_per_iteration": 2.408450126647949 + }, + { + "auxiliary_loss_clip": 0.01100229, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.02053344, + "balance_loss_mlp": 1.03477442, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.5859201905014537, + "language_loss": 0.71409112, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73541617, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12609, + "time_per_iteration": 2.498211622238159 + }, + { + "auxiliary_loss_clip": 0.01102343, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.01692224, + "balance_loss_mlp": 1.03438187, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 2.844859157672467, + "language_loss": 0.81682944, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83815098, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12610, + "time_per_iteration": 2.432453155517578 + }, + { + "auxiliary_loss_clip": 0.01105711, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01812613, + "balance_loss_mlp": 1.03694248, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.6924171050782333, + "language_loss": 0.70433235, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72569001, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 12611, + "time_per_iteration": 2.500880479812622 + }, + { + "auxiliary_loss_clip": 0.01102293, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01576972, + "balance_loss_mlp": 1.03632438, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.4523660094894448, + "language_loss": 0.70939386, + "learning_rate": 5.821422184318893e-07, + "loss": 0.7306906, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 12612, + "time_per_iteration": 2.4539196491241455 + }, + { + "auxiliary_loss_clip": 0.01104666, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02454031, + "balance_loss_mlp": 1.03628385, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.557484420274363, + "language_loss": 0.59628952, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61769485, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 12613, + "time_per_iteration": 2.5192790031433105 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.02141631, + "balance_loss_mlp": 1.03547192, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5699815827869172, + "language_loss": 0.59917688, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62053764, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12614, + "time_per_iteration": 2.5326051712036133 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01423693, + "balance_loss_mlp": 1.03418064, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.965283793201321, + "language_loss": 0.73299825, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75428724, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 12615, + "time_per_iteration": 2.4653162956237793 + }, + { + "auxiliary_loss_clip": 0.01023549, + "auxiliary_loss_mlp": 0.01002988, + "balance_loss_clip": 1.00207007, + "balance_loss_mlp": 1.00344896, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8444154589468232, + "language_loss": 0.67707115, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69733649, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.20117188, + "step": 12616, + "time_per_iteration": 3.0754714012145996 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03608048, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.7978643606873037, + "language_loss": 0.84971976, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87112409, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12617, + "time_per_iteration": 2.448288917541504 + }, + { + "auxiliary_loss_clip": 0.01103847, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.02366078, + "balance_loss_mlp": 1.03641772, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.68696985331022, + "language_loss": 0.75189435, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77328306, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 12618, + "time_per_iteration": 2.3945305347442627 + }, + { + "auxiliary_loss_clip": 0.01106053, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.02109587, + "balance_loss_mlp": 1.03541541, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.6724320695855646, + "language_loss": 0.77528578, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79668283, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 12619, + "time_per_iteration": 2.5116357803344727 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.0217483, + "balance_loss_mlp": 1.03520966, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.9430951948294126, + "language_loss": 0.8248623, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84621245, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12620, + "time_per_iteration": 2.4870126247406006 + }, + { + "auxiliary_loss_clip": 0.0110498, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02316654, + "balance_loss_mlp": 1.03550339, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.128247483649562, + "language_loss": 0.82510465, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84651691, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 12621, + "time_per_iteration": 2.407888412475586 + }, + { + "auxiliary_loss_clip": 0.01103126, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.01944077, + "balance_loss_mlp": 1.03546882, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 1.905238128524311, + "language_loss": 0.73415148, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75549692, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12622, + "time_per_iteration": 2.392918348312378 + }, + { + "auxiliary_loss_clip": 0.01023365, + "auxiliary_loss_mlp": 0.01001846, + "balance_loss_clip": 1.00088048, + "balance_loss_mlp": 1.00314832, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8354315652196721, + "language_loss": 0.60838234, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62863445, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20214844, + "step": 12623, + "time_per_iteration": 3.0560390949249268 + }, + { + "auxiliary_loss_clip": 0.0110073, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.02278328, + "balance_loss_mlp": 1.03662705, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 3.5930861717067653, + "language_loss": 0.66990733, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69125187, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 12624, + "time_per_iteration": 2.41662335395813 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01397121, + "balance_loss_mlp": 1.03495693, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.9214697173160005, + "language_loss": 0.75980389, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78105658, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 12625, + "time_per_iteration": 2.430710792541504 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.02228665, + "balance_loss_mlp": 1.0369575, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.779846333652066, + "language_loss": 0.6279074, + "learning_rate": 5.783019789020977e-07, + "loss": 0.64926815, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 12626, + "time_per_iteration": 2.504363775253296 + }, + { + "auxiliary_loss_clip": 0.01104327, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02497673, + "balance_loss_mlp": 1.03691292, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.3505107376172782, + "language_loss": 0.73657954, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75800049, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 12627, + "time_per_iteration": 2.4584648609161377 + }, + { + "auxiliary_loss_clip": 0.01104059, + "auxiliary_loss_mlp": 0.01025855, + "balance_loss_clip": 1.01399338, + "balance_loss_mlp": 1.03618145, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 1.9976061083215328, + "language_loss": 0.68754119, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70884025, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12628, + "time_per_iteration": 2.430168390274048 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038199, + "balance_loss_clip": 1.02543187, + "balance_loss_mlp": 1.03944075, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 1.8845310767470707, + "language_loss": 0.63146746, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65296274, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 12629, + "time_per_iteration": 2.4647164344787598 + }, + { + "auxiliary_loss_clip": 0.01096357, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.01772952, + "balance_loss_mlp": 1.03295267, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.5320581360916075, + "language_loss": 0.77814519, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79940444, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 12630, + "time_per_iteration": 2.4695019721984863 + }, + { + "auxiliary_loss_clip": 0.01023993, + "auxiliary_loss_mlp": 0.01002903, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00382364, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8096499014530706, + "language_loss": 0.61483628, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63510519, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 12631, + "time_per_iteration": 3.0936625003814697 + }, + { + "auxiliary_loss_clip": 0.01109676, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.02038503, + "balance_loss_mlp": 1.0382787, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.648732197394605, + "language_loss": 0.73976278, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76120412, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12632, + "time_per_iteration": 2.480149745941162 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01905131, + "balance_loss_mlp": 1.03713107, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.9673738745547358, + "language_loss": 0.74681813, + "learning_rate": 5.763858198074154e-07, + "loss": 0.76816922, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12633, + "time_per_iteration": 2.4051129817962646 + }, + { + "auxiliary_loss_clip": 0.01102602, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.0182507, + "balance_loss_mlp": 1.03637874, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 1.9622807663436381, + "language_loss": 0.73751974, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75883526, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 12634, + "time_per_iteration": 2.4096055030822754 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03842199, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.6454406828041275, + "language_loss": 0.64365327, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66502792, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 12635, + "time_per_iteration": 2.50323224067688 + }, + { + "auxiliary_loss_clip": 0.01105903, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.01841593, + "balance_loss_mlp": 1.03627133, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.7052959170016264, + "language_loss": 0.68446481, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70583028, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 12636, + "time_per_iteration": 3.86566424369812 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.02063966, + "balance_loss_mlp": 1.03834045, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.977358934255135, + "language_loss": 0.81089514, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83228207, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 12637, + "time_per_iteration": 2.4373815059661865 + }, + { + "auxiliary_loss_clip": 0.01100493, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.02041268, + "balance_loss_mlp": 1.0336833, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.8305503551265345, + "language_loss": 0.66367668, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68500262, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 12638, + "time_per_iteration": 3.9780218601226807 + }, + { + "auxiliary_loss_clip": 0.01104273, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.03667748, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.213704137729046, + "language_loss": 0.65462083, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67600346, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 12639, + "time_per_iteration": 3.9062952995300293 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01027427, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03570402, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.267632288408512, + "language_loss": 0.6999557, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72124958, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12640, + "time_per_iteration": 2.4175524711608887 + }, + { + "auxiliary_loss_clip": 0.01105941, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03658712, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.786840701662577, + "language_loss": 0.6698308, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69122016, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12641, + "time_per_iteration": 3.921182632446289 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.02081525, + "balance_loss_mlp": 1.03358555, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.0392329057559433, + "language_loss": 0.66791224, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68927062, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 12642, + "time_per_iteration": 2.437264919281006 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.01733255, + "balance_loss_mlp": 1.0378716, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.817654182769989, + "language_loss": 0.75470227, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77600896, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12643, + "time_per_iteration": 2.454752206802368 + }, + { + "auxiliary_loss_clip": 0.01104004, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.03661776, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.71435715776806, + "language_loss": 0.78663039, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80799764, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12644, + "time_per_iteration": 2.464467763900757 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.01553774, + "balance_loss_mlp": 1.03507113, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.487007417540331, + "language_loss": 0.80469275, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82598233, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 12645, + "time_per_iteration": 2.46242094039917 + }, + { + "auxiliary_loss_clip": 0.01106779, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01639724, + "balance_loss_mlp": 1.03778565, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.4295948404829946, + "language_loss": 0.72978055, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75113386, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 12646, + "time_per_iteration": 2.471769332885742 + }, + { + "auxiliary_loss_clip": 0.01100614, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.02084398, + "balance_loss_mlp": 1.0357635, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 2.0171184972904426, + "language_loss": 0.67350507, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69483244, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 12647, + "time_per_iteration": 2.4212889671325684 + }, + { + "auxiliary_loss_clip": 0.01023895, + "auxiliary_loss_mlp": 0.01003551, + "balance_loss_clip": 1.00256717, + "balance_loss_mlp": 1.00363588, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6700081607219286, + "language_loss": 0.48957998, + "learning_rate": 5.722886764566415e-07, + "loss": 0.50985444, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.203125, + "step": 12648, + "time_per_iteration": 2.992032766342163 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.02124202, + "balance_loss_mlp": 1.03481627, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.457089881735221, + "language_loss": 0.76486385, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78617918, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 12649, + "time_per_iteration": 2.4250268936157227 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.0177089, + "balance_loss_mlp": 1.03462923, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.4982970493787315, + "language_loss": 0.68732083, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70861167, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12650, + "time_per_iteration": 2.488598585128784 + }, + { + "auxiliary_loss_clip": 0.0102378, + "auxiliary_loss_mlp": 0.00999701, + "balance_loss_clip": 0.99867612, + "balance_loss_mlp": 1.00336099, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7616307552749029, + "language_loss": 0.62742424, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64765906, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20410156, + "step": 12651, + "time_per_iteration": 3.0423130989074707 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02096677, + "balance_loss_mlp": 1.03473544, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.3759590164717375, + "language_loss": 0.71249425, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73381495, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12652, + "time_per_iteration": 2.4702324867248535 + }, + { + "auxiliary_loss_clip": 0.01103881, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.0212419, + "balance_loss_mlp": 1.03609121, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.2736870535871354, + "language_loss": 0.80135083, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82272291, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12653, + "time_per_iteration": 2.452436685562134 + }, + { + "auxiliary_loss_clip": 0.01106986, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.01928127, + "balance_loss_mlp": 1.03704405, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.5498044874704002, + "language_loss": 0.80112356, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82249987, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.69921875, + "step": 12654, + "time_per_iteration": 2.48616886138916 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.02255547, + "balance_loss_mlp": 1.03588712, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.061909970432495, + "language_loss": 0.79397112, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81536764, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12655, + "time_per_iteration": 2.421402931213379 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.01572394, + "balance_loss_mlp": 1.03430891, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.6410708258422424, + "language_loss": 0.68456256, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70580149, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.640625, + "step": 12656, + "time_per_iteration": 2.5130324363708496 + }, + { + "auxiliary_loss_clip": 0.01103079, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.01629603, + "balance_loss_mlp": 1.03264689, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 1.9462034213744268, + "language_loss": 0.73116565, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75248253, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 12657, + "time_per_iteration": 2.400148391723633 + }, + { + "auxiliary_loss_clip": 0.01024109, + "auxiliary_loss_mlp": 0.01000104, + "balance_loss_clip": 0.99908441, + "balance_loss_mlp": 1.0036025, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8561186291133048, + "language_loss": 0.64938498, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66962707, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20507812, + "step": 12658, + "time_per_iteration": 3.001168727874756 + }, + { + "auxiliary_loss_clip": 0.01098421, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.01878452, + "balance_loss_mlp": 1.03460932, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.8302909281614124, + "language_loss": 0.79259527, + "learning_rate": 5.692918445605293e-07, + "loss": 0.8138839, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12659, + "time_per_iteration": 2.4172587394714355 + }, + { + "auxiliary_loss_clip": 0.01099076, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.01360416, + "balance_loss_mlp": 1.03339934, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.5401617612635332, + "language_loss": 0.68613267, + "learning_rate": 5.690197306063209e-07, + "loss": 0.70737445, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12660, + "time_per_iteration": 2.486931085586548 + }, + { + "auxiliary_loss_clip": 0.01102403, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01855946, + "balance_loss_mlp": 1.03502679, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.63464824040793, + "language_loss": 0.70508969, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72641468, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12661, + "time_per_iteration": 2.5559232234954834 + }, + { + "auxiliary_loss_clip": 0.01099871, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01578677, + "balance_loss_mlp": 1.03281772, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.568031869440725, + "language_loss": 0.8346833, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85595322, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 12662, + "time_per_iteration": 2.5182721614837646 + }, + { + "auxiliary_loss_clip": 0.01101806, + "auxiliary_loss_mlp": 0.01040729, + "balance_loss_clip": 1.02870047, + "balance_loss_mlp": 1.03437781, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7160561629790159, + "language_loss": 0.68380648, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70523185, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12663, + "time_per_iteration": 2.415670871734619 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.01368248, + "balance_loss_mlp": 1.03619945, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.8370977086816516, + "language_loss": 0.70325685, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72451836, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12664, + "time_per_iteration": 2.4670281410217285 + }, + { + "auxiliary_loss_clip": 0.01107046, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.02551007, + "balance_loss_mlp": 1.03717303, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 2.4295435457248575, + "language_loss": 0.79482126, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81627178, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 12665, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01101745, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.02022529, + "balance_loss_mlp": 1.03754544, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.6635534140237522, + "language_loss": 0.88047594, + "learning_rate": 5.673881867632959e-07, + "loss": 0.90180439, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 12666, + "time_per_iteration": 2.51179575920105 + }, + { + "auxiliary_loss_clip": 0.0110239, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.0205127, + "balance_loss_mlp": 1.03515267, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 1.9417407111979526, + "language_loss": 0.8323909, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85374105, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12667, + "time_per_iteration": 2.4148190021514893 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02489078, + "balance_loss_mlp": 1.03628147, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.6916218117768351, + "language_loss": 0.78259969, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80396825, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 12668, + "time_per_iteration": 2.4754624366760254 + }, + { + "auxiliary_loss_clip": 0.01100404, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.0345788, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.7878935447004587, + "language_loss": 0.63670552, + "learning_rate": 5.6657314808718e-07, + "loss": 0.65801817, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12669, + "time_per_iteration": 2.406334638595581 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.023, + "balance_loss_mlp": 1.03439915, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.8779652791388421, + "language_loss": 0.66191423, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68330294, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12670, + "time_per_iteration": 2.479275703430176 + }, + { + "auxiliary_loss_clip": 0.01103769, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.02112961, + "balance_loss_mlp": 1.0352459, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.5352589226081985, + "language_loss": 0.73205262, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75341749, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 12671, + "time_per_iteration": 2.43534517288208 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01777697, + "balance_loss_mlp": 1.03416443, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 2.4136368104172607, + "language_loss": 0.73309898, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75437379, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12672, + "time_per_iteration": 2.4863340854644775 + }, + { + "auxiliary_loss_clip": 0.01023134, + "auxiliary_loss_mlp": 0.01002705, + "balance_loss_clip": 1.00170374, + "balance_loss_mlp": 1.00292134, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7636907167661546, + "language_loss": 0.56764495, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58790326, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20214844, + "step": 12673, + "time_per_iteration": 3.0046093463897705 + }, + { + "auxiliary_loss_clip": 0.01103698, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.01493824, + "balance_loss_mlp": 1.03571641, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.9214444027294126, + "language_loss": 0.74586606, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76717991, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 12674, + "time_per_iteration": 2.4860613346099854 + }, + { + "auxiliary_loss_clip": 0.01099933, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.01634872, + "balance_loss_mlp": 1.03547001, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 1.9445116324740603, + "language_loss": 0.72109187, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74236214, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 12675, + "time_per_iteration": 2.4733452796936035 + }, + { + "auxiliary_loss_clip": 0.01099705, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01858139, + "balance_loss_mlp": 1.03579307, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.3582627114091417, + "language_loss": 0.72836524, + "learning_rate": 5.646732941057936e-07, + "loss": 0.74966055, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12676, + "time_per_iteration": 2.464700698852539 + }, + { + "auxiliary_loss_clip": 0.01108509, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01853633, + "balance_loss_mlp": 1.0366993, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.4246183918605055, + "language_loss": 0.54263771, + "learning_rate": 5.644021040227927e-07, + "loss": 0.56403106, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 12677, + "time_per_iteration": 2.3858957290649414 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.0166893, + "balance_loss_mlp": 1.03496563, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.7484196878623104, + "language_loss": 0.78978539, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81108367, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12678, + "time_per_iteration": 3.8235199451446533 + }, + { + "auxiliary_loss_clip": 0.01103703, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01842141, + "balance_loss_mlp": 1.0358417, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 1.880562321588857, + "language_loss": 0.7751689, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79651058, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 12679, + "time_per_iteration": 2.406036615371704 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.01288462, + "balance_loss_mlp": 1.03522229, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.3855129030202036, + "language_loss": 0.79996926, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82124078, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12680, + "time_per_iteration": 3.8292644023895264 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01448607, + "balance_loss_mlp": 1.03598523, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.8104724953691376, + "language_loss": 0.62750268, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64880306, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 12681, + "time_per_iteration": 3.884755849838257 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01962304, + "balance_loss_mlp": 1.03471422, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.0185008739532946, + "language_loss": 0.76076877, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78207386, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12682, + "time_per_iteration": 3.9090828895568848 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.01027158, + "balance_loss_clip": 1.01630437, + "balance_loss_mlp": 1.0342561, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.71259737430395, + "language_loss": 0.68134248, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70260167, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12683, + "time_per_iteration": 2.4623308181762695 + }, + { + "auxiliary_loss_clip": 0.01103084, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.01777172, + "balance_loss_mlp": 1.03596735, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.1249879118259285, + "language_loss": 0.83107448, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85240012, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12684, + "time_per_iteration": 2.4951984882354736 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.02493775, + "balance_loss_mlp": 1.03559566, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 1.7953521206718834, + "language_loss": 0.82664561, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84805232, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 12685, + "time_per_iteration": 2.397047519683838 + }, + { + "auxiliary_loss_clip": 0.01104402, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01402688, + "balance_loss_mlp": 1.03638494, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.8540410766605766, + "language_loss": 0.77068198, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79198045, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12686, + "time_per_iteration": 2.484248399734497 + }, + { + "auxiliary_loss_clip": 0.01104273, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.02545214, + "balance_loss_mlp": 1.03546059, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.6147280683220673, + "language_loss": 0.71894288, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74037153, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 12687, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01101986, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.02494013, + "balance_loss_mlp": 1.03586364, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 1.7893122270206685, + "language_loss": 0.64678234, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66817671, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 12688, + "time_per_iteration": 2.473334550857544 + }, + { + "auxiliary_loss_clip": 0.01100145, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.01613188, + "balance_loss_mlp": 1.03599501, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.824140660658097, + "language_loss": 0.70988876, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73116207, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12689, + "time_per_iteration": 2.389702320098877 + }, + { + "auxiliary_loss_clip": 0.01106966, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.02357674, + "balance_loss_mlp": 1.03706014, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.6778934175859046, + "language_loss": 0.69599509, + "learning_rate": 5.608815905436238e-07, + "loss": 0.7174232, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 12690, + "time_per_iteration": 2.4964652061462402 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03643334, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.627452026729889, + "language_loss": 0.69135779, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71268374, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12691, + "time_per_iteration": 2.577179431915283 + }, + { + "auxiliary_loss_clip": 0.0109925, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02197647, + "balance_loss_mlp": 1.03412747, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.5885842386967668, + "language_loss": 0.81694877, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83826768, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 12692, + "time_per_iteration": 2.4633901119232178 + }, + { + "auxiliary_loss_clip": 0.01107736, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.02030492, + "balance_loss_mlp": 1.03841257, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.217828968535983, + "language_loss": 0.76950878, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79089856, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6953125, + "step": 12693, + "time_per_iteration": 2.4009978771209717 + }, + { + "auxiliary_loss_clip": 0.01103157, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01835942, + "balance_loss_mlp": 1.03572786, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.173871462173048, + "language_loss": 0.73079503, + "learning_rate": 5.598002100115933e-07, + "loss": 0.75213093, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12694, + "time_per_iteration": 2.462535858154297 + }, + { + "auxiliary_loss_clip": 0.01098607, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.01585007, + "balance_loss_mlp": 1.03326893, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.6266641771767514, + "language_loss": 0.70343757, + "learning_rate": 5.595300013842625e-07, + "loss": 0.7246989, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12695, + "time_per_iteration": 2.436309576034546 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.03454077, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.4700012541303298, + "language_loss": 0.72275102, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74406242, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 12696, + "time_per_iteration": 2.473132371902466 + }, + { + "auxiliary_loss_clip": 0.01101016, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.02113307, + "balance_loss_mlp": 1.03426933, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.376546359844648, + "language_loss": 0.71416759, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73551357, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12697, + "time_per_iteration": 2.397484064102173 + }, + { + "auxiliary_loss_clip": 0.01103465, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.0194068, + "balance_loss_mlp": 1.03697562, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.8987571133249672, + "language_loss": 0.66587389, + "learning_rate": 5.587197032798461e-07, + "loss": 0.6872173, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12698, + "time_per_iteration": 2.4368910789489746 + }, + { + "auxiliary_loss_clip": 0.01099721, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01665354, + "balance_loss_mlp": 1.03326559, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.574933939339682, + "language_loss": 0.72529495, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74657655, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12699, + "time_per_iteration": 2.449216842651367 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.02182543, + "balance_loss_mlp": 1.03466082, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.9214661095744658, + "language_loss": 0.73283732, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75416017, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12700, + "time_per_iteration": 2.614281177520752 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.01834226, + "balance_loss_mlp": 1.03391504, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 3.347573177390183, + "language_loss": 0.68935323, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71066546, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12701, + "time_per_iteration": 2.407780170440674 + }, + { + "auxiliary_loss_clip": 0.01102757, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01846802, + "balance_loss_mlp": 1.0361433, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 2.4924220366961145, + "language_loss": 0.64379907, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66512668, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12702, + "time_per_iteration": 2.444377899169922 + }, + { + "auxiliary_loss_clip": 0.01101798, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.02000129, + "balance_loss_mlp": 1.0348711, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.9476964019276684, + "language_loss": 0.65595478, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67728704, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 12703, + "time_per_iteration": 2.4628920555114746 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01023498, + "balance_loss_clip": 1.01204157, + "balance_loss_mlp": 1.03720379, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.7669844217588608, + "language_loss": 0.83665591, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85792065, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 12704, + "time_per_iteration": 2.448728561401367 + }, + { + "auxiliary_loss_clip": 0.01104257, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.01903248, + "balance_loss_mlp": 1.03712642, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.3938959354870066, + "language_loss": 0.67689544, + "learning_rate": 5.568309210527469e-07, + "loss": 0.69824535, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12705, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01100722, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.01429725, + "balance_loss_mlp": 1.03554821, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.691823975978675, + "language_loss": 0.74275041, + "learning_rate": 5.565613138389427e-07, + "loss": 0.7640174, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12706, + "time_per_iteration": 2.4732961654663086 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.01650715, + "balance_loss_mlp": 1.03575993, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.8728916449529083, + "language_loss": 0.7829448, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80425096, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 12707, + "time_per_iteration": 2.44852352142334 + }, + { + "auxiliary_loss_clip": 0.01100823, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.01379025, + "balance_loss_mlp": 1.03446913, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.8180584415058063, + "language_loss": 0.79873604, + "learning_rate": 5.560222636275751e-07, + "loss": 0.81999826, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12708, + "time_per_iteration": 2.427623987197876 + }, + { + "auxiliary_loss_clip": 0.01024337, + "auxiliary_loss_mlp": 0.01003138, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00414193, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8188309305581064, + "language_loss": 0.56423205, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58450681, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20214844, + "step": 12709, + "time_per_iteration": 3.0471227169036865 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.01926708, + "balance_loss_mlp": 1.03640699, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.7746105056549126, + "language_loss": 0.63412935, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65551722, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 12710, + "time_per_iteration": 2.436523199081421 + }, + { + "auxiliary_loss_clip": 0.01104937, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.01748586, + "balance_loss_mlp": 1.03611827, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.4696813182375994, + "language_loss": 0.64710927, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66846383, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12711, + "time_per_iteration": 2.413130760192871 + }, + { + "auxiliary_loss_clip": 0.01100872, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.02089453, + "balance_loss_mlp": 1.03438199, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.5961757403151435, + "language_loss": 0.72854543, + "learning_rate": 5.549448203559293e-07, + "loss": 0.74988031, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12712, + "time_per_iteration": 2.4923083782196045 + }, + { + "auxiliary_loss_clip": 0.01100743, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.01644421, + "balance_loss_mlp": 1.03588057, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.512862256571613, + "language_loss": 0.8010205, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82230103, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12713, + "time_per_iteration": 2.4570553302764893 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.01756477, + "balance_loss_mlp": 1.03663445, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.127063992718731, + "language_loss": 0.83558553, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85692835, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 12714, + "time_per_iteration": 2.4317142963409424 + }, + { + "auxiliary_loss_clip": 0.01105545, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.0259099, + "balance_loss_mlp": 1.03701067, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.592380808570538, + "language_loss": 0.72868395, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75012302, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 12715, + "time_per_iteration": 2.43247389793396 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.01739025, + "balance_loss_mlp": 1.03394234, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.7023765879093384, + "language_loss": 0.63293636, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65423584, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 12716, + "time_per_iteration": 2.519078016281128 + }, + { + "auxiliary_loss_clip": 0.0110555, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.02227485, + "balance_loss_mlp": 1.03597593, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.4875164699453862, + "language_loss": 0.79791009, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81931472, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 12717, + "time_per_iteration": 2.429151773452759 + }, + { + "auxiliary_loss_clip": 0.01102712, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.0218451, + "balance_loss_mlp": 1.03612757, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.2673772679539486, + "language_loss": 0.66456509, + "learning_rate": 5.53330299551638e-07, + "loss": 0.6859256, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12718, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.02124047, + "balance_loss_mlp": 1.03456593, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.8849716661729419, + "language_loss": 0.77913976, + "learning_rate": 5.530614046939286e-07, + "loss": 0.8004452, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12719, + "time_per_iteration": 3.9749484062194824 + }, + { + "auxiliary_loss_clip": 0.01102309, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01369143, + "balance_loss_mlp": 1.03523517, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.683095995258743, + "language_loss": 0.69655412, + "learning_rate": 5.527925647042754e-07, + "loss": 0.71783549, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12720, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.01102352, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01990581, + "balance_loss_mlp": 1.03623235, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.6712048084567594, + "language_loss": 0.73724437, + "learning_rate": 5.52523779592875e-07, + "loss": 0.75858283, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12721, + "time_per_iteration": 3.8811776638031006 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01676273, + "balance_loss_mlp": 1.03572047, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.8878016684824361, + "language_loss": 0.73512298, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75642979, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12722, + "time_per_iteration": 3.989180564880371 + }, + { + "auxiliary_loss_clip": 0.01101721, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02355647, + "balance_loss_mlp": 1.03481197, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.7865929213753133, + "language_loss": 0.7357918, + "learning_rate": 5.519863740455912e-07, + "loss": 0.75715715, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 12723, + "time_per_iteration": 2.5361814498901367 + }, + { + "auxiliary_loss_clip": 0.01101913, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.01642966, + "balance_loss_mlp": 1.03334272, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.812040578255397, + "language_loss": 0.73211122, + "learning_rate": 5.517177536300881e-07, + "loss": 0.7534129, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 12724, + "time_per_iteration": 3.9343338012695312 + }, + { + "auxiliary_loss_clip": 0.01099657, + "auxiliary_loss_mlp": 0.01024123, + "balance_loss_clip": 1.01270843, + "balance_loss_mlp": 1.03587949, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.9420758894203383, + "language_loss": 0.8370254, + "learning_rate": 5.514491881335935e-07, + "loss": 0.85826313, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 12725, + "time_per_iteration": 2.378312587738037 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02064514, + "balance_loss_mlp": 1.03584003, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.7077077280444313, + "language_loss": 0.77513289, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79648137, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 12726, + "time_per_iteration": 2.489109992980957 + }, + { + "auxiliary_loss_clip": 0.01103068, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.02161503, + "balance_loss_mlp": 1.03531957, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.5743856699934278, + "language_loss": 0.7073437, + "learning_rate": 5.509122219383615e-07, + "loss": 0.7287063, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 12727, + "time_per_iteration": 2.4679818153381348 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01024446, + "balance_loss_clip": 1.01324618, + "balance_loss_mlp": 1.03295493, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.683634898596646, + "language_loss": 0.79648662, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81770217, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 12728, + "time_per_iteration": 2.5594372749328613 + }, + { + "auxiliary_loss_clip": 0.01104269, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.01803839, + "balance_loss_mlp": 1.03638935, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.9251474152175339, + "language_loss": 0.55158925, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57293093, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12729, + "time_per_iteration": 2.4821853637695312 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.01739013, + "balance_loss_mlp": 1.03482783, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 2.177670439939341, + "language_loss": 0.77752316, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79883277, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12730, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.01107568, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02540636, + "balance_loss_mlp": 1.03940296, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 2.6215650166042854, + "language_loss": 0.68980086, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71125555, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12731, + "time_per_iteration": 2.4075534343719482 + }, + { + "auxiliary_loss_clip": 0.01103331, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.01775599, + "balance_loss_mlp": 1.03526866, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.0647561779987598, + "language_loss": 0.69921666, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72054529, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12732, + "time_per_iteration": 2.4208905696868896 + }, + { + "auxiliary_loss_clip": 0.01103869, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.01700664, + "balance_loss_mlp": 1.03542268, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.4500082329987547, + "language_loss": 0.78334171, + "learning_rate": 5.493026424675653e-07, + "loss": 0.8046757, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 12733, + "time_per_iteration": 2.4912784099578857 + }, + { + "auxiliary_loss_clip": 0.01101688, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.02012134, + "balance_loss_mlp": 1.03670192, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7158100423573102, + "language_loss": 0.77660191, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79793251, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12734, + "time_per_iteration": 2.500473737716675 + }, + { + "auxiliary_loss_clip": 0.01105167, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01641846, + "balance_loss_mlp": 1.03554702, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.8249591988641765, + "language_loss": 0.72925597, + "learning_rate": 5.48766555953535e-07, + "loss": 0.7505976, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12735, + "time_per_iteration": 2.477151870727539 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01950526, + "balance_loss_mlp": 1.03448582, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.762755938447221, + "language_loss": 0.72515297, + "learning_rate": 5.484985952378145e-07, + "loss": 0.7464757, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12736, + "time_per_iteration": 2.5486631393432617 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.02288127, + "balance_loss_mlp": 1.03783011, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 1.7485103277952745, + "language_loss": 0.77891874, + "learning_rate": 5.482306895631728e-07, + "loss": 0.80035985, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69921875, + "step": 12737, + "time_per_iteration": 2.4112277030944824 + }, + { + "auxiliary_loss_clip": 0.01101521, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.01596594, + "balance_loss_mlp": 1.0340569, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.6956859838498979, + "language_loss": 0.76673079, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78802776, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 12738, + "time_per_iteration": 2.4841501712799072 + }, + { + "auxiliary_loss_clip": 0.01104744, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.01677346, + "balance_loss_mlp": 1.03617144, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.8494809749417094, + "language_loss": 0.62757778, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64891523, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 12739, + "time_per_iteration": 2.5342459678649902 + }, + { + "auxiliary_loss_clip": 0.01104187, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02121282, + "balance_loss_mlp": 1.03702021, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 1.9457756189181725, + "language_loss": 0.79532218, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81670547, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12740, + "time_per_iteration": 2.464242458343506 + }, + { + "auxiliary_loss_clip": 0.01101878, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.01806879, + "balance_loss_mlp": 1.03497076, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.8538704286256995, + "language_loss": 0.65541816, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67673558, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12741, + "time_per_iteration": 2.6027071475982666 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.0102683, + "balance_loss_clip": 1.01482606, + "balance_loss_mlp": 1.03617609, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.883849175475749, + "language_loss": 0.75741291, + "learning_rate": 5.468919871616386e-07, + "loss": 0.77869809, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 12742, + "time_per_iteration": 2.483158588409424 + }, + { + "auxiliary_loss_clip": 0.0109981, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.01749492, + "balance_loss_mlp": 1.03572869, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.3603011041168136, + "language_loss": 0.76397032, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78525507, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 12743, + "time_per_iteration": 2.4432547092437744 + }, + { + "auxiliary_loss_clip": 0.01100504, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.0345211, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.9126072304780652, + "language_loss": 0.749053, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77033567, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12744, + "time_per_iteration": 2.4553682804107666 + }, + { + "auxiliary_loss_clip": 0.01103322, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.01541495, + "balance_loss_mlp": 1.03609204, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.243657219575693, + "language_loss": 0.70895386, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73026311, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12745, + "time_per_iteration": 2.4222021102905273 + }, + { + "auxiliary_loss_clip": 0.01102421, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.02016246, + "balance_loss_mlp": 1.03536963, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.2580014777322264, + "language_loss": 0.7671814, + "learning_rate": 5.458220170154896e-07, + "loss": 0.78853154, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 12746, + "time_per_iteration": 2.4328715801239014 + }, + { + "auxiliary_loss_clip": 0.01024805, + "auxiliary_loss_mlp": 0.01002921, + "balance_loss_clip": 1.00194991, + "balance_loss_mlp": 1.00455523, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6617058093404249, + "language_loss": 0.56800187, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58827913, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20214844, + "step": 12747, + "time_per_iteration": 3.0698306560516357 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.01804721, + "balance_loss_mlp": 1.03344798, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.7111315539475358, + "language_loss": 0.72324377, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74450713, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 12748, + "time_per_iteration": 2.506683588027954 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01763535, + "balance_loss_mlp": 1.03435397, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 3.145698976514515, + "language_loss": 0.6893121, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71061194, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 12749, + "time_per_iteration": 2.389932155609131 + }, + { + "auxiliary_loss_clip": 0.01101373, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.01423216, + "balance_loss_mlp": 1.034747, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.5718921115117155, + "language_loss": 0.73633575, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75761336, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12750, + "time_per_iteration": 2.5167572498321533 + }, + { + "auxiliary_loss_clip": 0.01098567, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.01658773, + "balance_loss_mlp": 1.0340786, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 3.4547507974534937, + "language_loss": 0.75537312, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77663291, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12751, + "time_per_iteration": 2.476710081100464 + }, + { + "auxiliary_loss_clip": 0.01103164, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.0242238, + "balance_loss_mlp": 1.03732014, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 2.104179028478291, + "language_loss": 0.61111033, + "learning_rate": 5.442187162761537e-07, + "loss": 0.6325025, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 12752, + "time_per_iteration": 2.483185291290283 + }, + { + "auxiliary_loss_clip": 0.01103162, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.0219605, + "balance_loss_mlp": 1.03612447, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.7425308356363913, + "language_loss": 0.69364887, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71501917, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12753, + "time_per_iteration": 2.462432384490967 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.02463651, + "balance_loss_mlp": 1.03598034, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.935870889400166, + "language_loss": 0.62185645, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64324296, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12754, + "time_per_iteration": 2.518746852874756 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.03773046, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.3055221195996065, + "language_loss": 0.79792452, + "learning_rate": 5.434178110152401e-07, + "loss": 0.81923139, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12755, + "time_per_iteration": 2.4429805278778076 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.01758885, + "balance_loss_mlp": 1.03660679, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.7360812888518318, + "language_loss": 0.70129168, + "learning_rate": 5.431509530489242e-07, + "loss": 0.7225951, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12756, + "time_per_iteration": 2.4959518909454346 + }, + { + "auxiliary_loss_clip": 0.01101968, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.02353144, + "balance_loss_mlp": 1.03610778, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 2.2706673014793766, + "language_loss": 0.70277941, + "learning_rate": 5.428841503264706e-07, + "loss": 0.7241478, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12757, + "time_per_iteration": 2.503958225250244 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.02101266, + "balance_loss_mlp": 1.03675115, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.9695287063261235, + "language_loss": 0.75929737, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78066456, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12758, + "time_per_iteration": 2.490203857421875 + }, + { + "auxiliary_loss_clip": 0.01098808, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.02119648, + "balance_loss_mlp": 1.03469872, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.6224114327929111, + "language_loss": 0.76120728, + "learning_rate": 5.423507106536156e-07, + "loss": 0.7825197, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12759, + "time_per_iteration": 2.462779998779297 + }, + { + "auxiliary_loss_clip": 0.0109933, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.01611102, + "balance_loss_mlp": 1.03285909, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.0831597822945738, + "language_loss": 0.68447405, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70573699, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 12760, + "time_per_iteration": 2.66218900680542 + }, + { + "auxiliary_loss_clip": 0.01102506, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.01719248, + "balance_loss_mlp": 1.03628325, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.530930371771359, + "language_loss": 0.79041481, + "learning_rate": 5.418174920775871e-07, + "loss": 0.8117305, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12761, + "time_per_iteration": 3.9318642616271973 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.0160147, + "balance_loss_mlp": 1.03551531, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.7398225752644456, + "language_loss": 0.66273689, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68400806, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12762, + "time_per_iteration": 2.423274040222168 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01513004, + "balance_loss_mlp": 1.03505349, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.6795407868504282, + "language_loss": 0.73981798, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76111412, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12763, + "time_per_iteration": 3.845613718032837 + }, + { + "auxiliary_loss_clip": 0.01102131, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.02071738, + "balance_loss_mlp": 1.0367074, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.585918390915768, + "language_loss": 0.70586705, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72721243, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12764, + "time_per_iteration": 3.981903314590454 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.01529598, + "balance_loss_mlp": 1.03549826, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.533836649562743, + "language_loss": 0.69619727, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71747363, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12765, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01097446, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.03249931, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.7341921361954618, + "language_loss": 0.60877311, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63005078, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65234375, + "step": 12766, + "time_per_iteration": 3.856095790863037 + }, + { + "auxiliary_loss_clip": 0.01024204, + "auxiliary_loss_mlp": 0.01001208, + "balance_loss_clip": 1.00024879, + "balance_loss_mlp": 1.00405478, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7388978362538794, + "language_loss": 0.60806286, + "learning_rate": 5.402191637390803e-07, + "loss": 0.628317, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 12767, + "time_per_iteration": 3.1863934993743896 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.01363397, + "balance_loss_mlp": 1.03486204, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.9841724465329964, + "language_loss": 0.69505453, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71628356, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 12768, + "time_per_iteration": 2.423121452331543 + }, + { + "auxiliary_loss_clip": 0.01107565, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.02206123, + "balance_loss_mlp": 1.03830612, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.9774662095092985, + "language_loss": 0.70799577, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7294122, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 12769, + "time_per_iteration": 2.4947516918182373 + }, + { + "auxiliary_loss_clip": 0.01103148, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.0148201, + "balance_loss_mlp": 1.03540611, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 1.823298760542139, + "language_loss": 0.80289495, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82419586, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 12770, + "time_per_iteration": 2.4479711055755615 + }, + { + "auxiliary_loss_clip": 0.01098048, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01986027, + "balance_loss_mlp": 1.0342977, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.520087647586923, + "language_loss": 0.78579485, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80707848, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12771, + "time_per_iteration": 2.5589637756347656 + }, + { + "auxiliary_loss_clip": 0.01100406, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.01595068, + "balance_loss_mlp": 1.03527427, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.305591039584481, + "language_loss": 0.68094563, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70222068, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 12772, + "time_per_iteration": 2.443350076675415 + }, + { + "auxiliary_loss_clip": 0.01097286, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.01294541, + "balance_loss_mlp": 1.03465271, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.6667227683698287, + "language_loss": 0.73345917, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75466973, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 12773, + "time_per_iteration": 2.5177359580993652 + }, + { + "auxiliary_loss_clip": 0.01097604, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01624548, + "balance_loss_mlp": 1.03416824, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.8287819749313907, + "language_loss": 0.8077029, + "learning_rate": 5.383569661510512e-07, + "loss": 0.82894701, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 12774, + "time_per_iteration": 2.4638662338256836 + }, + { + "auxiliary_loss_clip": 0.01102122, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03650451, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.5141235793881351, + "language_loss": 0.6951592, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71648353, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12775, + "time_per_iteration": 2.529325246810913 + }, + { + "auxiliary_loss_clip": 0.01024296, + "auxiliary_loss_mlp": 0.0099915, + "balance_loss_clip": 0.9981491, + "balance_loss_mlp": 1.00394726, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.6956565563059588, + "language_loss": 0.56836295, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58859742, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.203125, + "step": 12776, + "time_per_iteration": 3.10646915435791 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02058029, + "balance_loss_mlp": 1.03510964, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.821518021735027, + "language_loss": 0.74034452, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76166189, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12777, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.01100992, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02257133, + "balance_loss_mlp": 1.03636885, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.340152185552387, + "language_loss": 0.70033187, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72167766, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 12778, + "time_per_iteration": 2.4316253662109375 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.01741004, + "balance_loss_mlp": 1.03619504, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.7229591710828633, + "language_loss": 0.70021391, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72150636, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12779, + "time_per_iteration": 2.4962258338928223 + }, + { + "auxiliary_loss_clip": 0.01102633, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.01549852, + "balance_loss_mlp": 1.03702402, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.5025489085425099, + "language_loss": 0.58335769, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60465509, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 12780, + "time_per_iteration": 2.827277898788452 + }, + { + "auxiliary_loss_clip": 0.0110525, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.02303374, + "balance_loss_mlp": 1.03533888, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.7175154048047394, + "language_loss": 0.68096447, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70238441, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69921875, + "step": 12781, + "time_per_iteration": 2.450493574142456 + }, + { + "auxiliary_loss_clip": 0.01099247, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.0181365, + "balance_loss_mlp": 1.03411829, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.4930277529018858, + "language_loss": 0.79351133, + "learning_rate": 5.362320660762016e-07, + "loss": 0.814798, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12782, + "time_per_iteration": 2.473785638809204 + }, + { + "auxiliary_loss_clip": 0.01101943, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.01648557, + "balance_loss_mlp": 1.03457451, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 3.89329070185187, + "language_loss": 0.6701203, + "learning_rate": 5.35966703239153e-07, + "loss": 0.6914283, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12783, + "time_per_iteration": 2.496005058288574 + }, + { + "auxiliary_loss_clip": 0.0110336, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.02068949, + "balance_loss_mlp": 1.0368228, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 2.317173566412315, + "language_loss": 0.68567002, + "learning_rate": 5.357013959183938e-07, + "loss": 0.70703208, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12784, + "time_per_iteration": 2.4193952083587646 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.01677442, + "balance_loss_mlp": 1.03561044, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 2.4397788203349546, + "language_loss": 0.80600178, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82728577, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 12785, + "time_per_iteration": 2.4642157554626465 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.01812065, + "balance_loss_mlp": 1.03647351, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.5675219455195206, + "language_loss": 0.77255261, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79388458, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12786, + "time_per_iteration": 2.6608307361602783 + }, + { + "auxiliary_loss_clip": 0.0109997, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.01797938, + "balance_loss_mlp": 1.03441632, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 2.029037446974208, + "language_loss": 0.58857298, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60986358, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12787, + "time_per_iteration": 2.5195324420928955 + }, + { + "auxiliary_loss_clip": 0.01098338, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01488662, + "balance_loss_mlp": 1.03323674, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.5842728148921028, + "language_loss": 0.75863254, + "learning_rate": 5.346407219994292e-07, + "loss": 0.77988023, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 12788, + "time_per_iteration": 2.427560567855835 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02540421, + "balance_loss_mlp": 1.03627038, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.6525125671595142, + "language_loss": 0.66358525, + "learning_rate": 5.343756924109821e-07, + "loss": 0.6849938, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12789, + "time_per_iteration": 2.484055280685425 + }, + { + "auxiliary_loss_clip": 0.01103699, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02053142, + "balance_loss_mlp": 1.03660512, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.730155675117843, + "language_loss": 0.68648386, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70785522, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12790, + "time_per_iteration": 2.5284645557403564 + }, + { + "auxiliary_loss_clip": 0.01101213, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02015567, + "balance_loss_mlp": 1.03384793, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.6904473195565226, + "language_loss": 0.68665707, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70798862, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12791, + "time_per_iteration": 2.4484951496124268 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01032034, + "balance_loss_clip": 1.02082801, + "balance_loss_mlp": 1.03512239, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.7979814428541672, + "language_loss": 0.79704869, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81836575, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 12792, + "time_per_iteration": 2.397611141204834 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.01751399, + "balance_loss_mlp": 1.04006386, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.8065104235700298, + "language_loss": 0.72902393, + "learning_rate": 5.333161299238673e-07, + "loss": 0.7504257, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 12793, + "time_per_iteration": 2.445250988006592 + }, + { + "auxiliary_loss_clip": 0.01102182, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.02147722, + "balance_loss_mlp": 1.0359565, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.7477925933074476, + "language_loss": 0.63753021, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65888512, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12794, + "time_per_iteration": 2.609574317932129 + }, + { + "auxiliary_loss_clip": 0.01107559, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01955318, + "balance_loss_mlp": 1.03873158, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.4386826522149643, + "language_loss": 0.76442081, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78581011, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 12795, + "time_per_iteration": 2.491729736328125 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.01503158, + "balance_loss_mlp": 1.03450465, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.5564929317372034, + "language_loss": 0.71727788, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73856628, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12796, + "time_per_iteration": 2.4555909633636475 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.01027607, + "balance_loss_clip": 1.01607311, + "balance_loss_mlp": 1.03496242, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 2.1139443574880574, + "language_loss": 0.65011704, + "learning_rate": 5.32257457305499e-07, + "loss": 0.671413, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12797, + "time_per_iteration": 2.4375650882720947 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.01997042, + "balance_loss_mlp": 1.03497744, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.7406268375676737, + "language_loss": 0.91516721, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93651593, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 12798, + "time_per_iteration": 2.4546101093292236 + }, + { + "auxiliary_loss_clip": 0.0110163, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01569748, + "balance_loss_mlp": 1.03515077, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.9252292535695115, + "language_loss": 0.82239765, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84368521, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12799, + "time_per_iteration": 2.44386625289917 + }, + { + "auxiliary_loss_clip": 0.01103323, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.01621604, + "balance_loss_mlp": 1.03646576, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.0094364967525262, + "language_loss": 0.77591789, + "learning_rate": 5.314640372045045e-07, + "loss": 0.79723239, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12800, + "time_per_iteration": 2.397705316543579 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01594234, + "balance_loss_mlp": 1.03569245, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.660947128359647, + "language_loss": 0.83736777, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85871899, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 12801, + "time_per_iteration": 2.4850664138793945 + }, + { + "auxiliary_loss_clip": 0.01101531, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.01908183, + "balance_loss_mlp": 1.03610682, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.968794932529363, + "language_loss": 0.72192085, + "learning_rate": 5.30935368888947e-07, + "loss": 0.7432459, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12802, + "time_per_iteration": 2.423994779586792 + }, + { + "auxiliary_loss_clip": 0.01101064, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01767504, + "balance_loss_mlp": 1.03590822, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7968472672418645, + "language_loss": 0.75812244, + "learning_rate": 5.306711182867747e-07, + "loss": 0.77942592, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12803, + "time_per_iteration": 3.8298709392547607 + }, + { + "auxiliary_loss_clip": 0.0102415, + "auxiliary_loss_mlp": 0.00999256, + "balance_loss_clip": 0.99821299, + "balance_loss_mlp": 1.00390053, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7302044850934681, + "language_loss": 0.55831051, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57854456, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 12804, + "time_per_iteration": 3.058547258377075 + }, + { + "auxiliary_loss_clip": 0.01024727, + "auxiliary_loss_mlp": 0.01002741, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.00439858, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.9747386199890918, + "language_loss": 0.54020375, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56047845, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.203125, + "step": 12805, + "time_per_iteration": 4.5421671867370605 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.02145565, + "balance_loss_mlp": 1.03835249, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 2.1701782975166, + "language_loss": 0.72961175, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75100303, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12806, + "time_per_iteration": 3.833503246307373 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02383482, + "balance_loss_mlp": 1.03555238, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 3.0939147131077878, + "language_loss": 0.75202084, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77340138, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12807, + "time_per_iteration": 2.453640937805176 + }, + { + "auxiliary_loss_clip": 0.01107207, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03789043, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.25264240922501, + "language_loss": 0.79834819, + "learning_rate": 5.293507012327218e-07, + "loss": 0.81975937, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 12808, + "time_per_iteration": 3.863776206970215 + }, + { + "auxiliary_loss_clip": 0.01106296, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.02278161, + "balance_loss_mlp": 1.03690052, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 1.7718685431414871, + "language_loss": 0.79037017, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81178522, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 12809, + "time_per_iteration": 2.50119948387146 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.01414251, + "balance_loss_mlp": 1.03431511, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.5273739274998572, + "language_loss": 0.70192695, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72315288, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 12810, + "time_per_iteration": 2.4800918102264404 + }, + { + "auxiliary_loss_clip": 0.01104583, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01865101, + "balance_loss_mlp": 1.03556144, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.2614131210478465, + "language_loss": 0.78612316, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80748516, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12811, + "time_per_iteration": 2.404200792312622 + }, + { + "auxiliary_loss_clip": 0.01024644, + "auxiliary_loss_mlp": 0.01001291, + "balance_loss_clip": 1.00025964, + "balance_loss_mlp": 1.00433743, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8119263300614926, + "language_loss": 0.56688583, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58714521, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 12812, + "time_per_iteration": 3.1152541637420654 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.0352596, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.6865104586503614, + "language_loss": 0.7190448, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74042261, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12813, + "time_per_iteration": 2.4738786220550537 + }, + { + "auxiliary_loss_clip": 0.01102625, + "auxiliary_loss_mlp": 0.01029531, + "balance_loss_clip": 1.01718116, + "balance_loss_mlp": 1.03553629, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.808315927971449, + "language_loss": 0.66342986, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68475139, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 12814, + "time_per_iteration": 2.454023599624634 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.02005756, + "balance_loss_mlp": 1.03535891, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.9935067754667941, + "language_loss": 0.65677094, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67810559, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12815, + "time_per_iteration": 2.453657627105713 + }, + { + "auxiliary_loss_clip": 0.01101554, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01935291, + "balance_loss_mlp": 1.03516507, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.1548232448255566, + "language_loss": 0.6524539, + "learning_rate": 5.272409343590322e-07, + "loss": 0.6737839, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12816, + "time_per_iteration": 2.407606840133667 + }, + { + "auxiliary_loss_clip": 0.01104205, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.03735924, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.105850100227776, + "language_loss": 0.71998227, + "learning_rate": 5.26977464707133e-07, + "loss": 0.74136674, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 12817, + "time_per_iteration": 2.4196791648864746 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.02102351, + "balance_loss_mlp": 1.03574193, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.9485299894899226, + "language_loss": 0.61153173, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63287747, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12818, + "time_per_iteration": 2.422590970993042 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.01726627, + "balance_loss_mlp": 1.036057, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.7189181201095014, + "language_loss": 0.67140901, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69269538, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6484375, + "step": 12819, + "time_per_iteration": 2.445463180541992 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01026789, + "balance_loss_clip": 1.0150826, + "balance_loss_mlp": 1.03642428, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.8084191100945337, + "language_loss": 0.57428622, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59558845, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12820, + "time_per_iteration": 2.4313409328460693 + }, + { + "auxiliary_loss_clip": 0.01101387, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01476407, + "balance_loss_mlp": 1.03471613, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.656188868997019, + "language_loss": 0.80691266, + "learning_rate": 5.259241447710343e-07, + "loss": 0.82819176, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12821, + "time_per_iteration": 2.495997190475464 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01740658, + "balance_loss_mlp": 1.0356462, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.1643932163706388, + "language_loss": 0.68480009, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70611471, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12822, + "time_per_iteration": 2.390167236328125 + }, + { + "auxiliary_loss_clip": 0.01101193, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.02059174, + "balance_loss_mlp": 1.03602922, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.6982430970073337, + "language_loss": 0.72335845, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74469054, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12823, + "time_per_iteration": 2.492733955383301 + }, + { + "auxiliary_loss_clip": 0.01108942, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02433515, + "balance_loss_mlp": 1.03756452, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.8295063286437603, + "language_loss": 0.76613212, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78759968, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 12824, + "time_per_iteration": 2.4176483154296875 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.01542997, + "balance_loss_mlp": 1.03651464, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 3.0696602507520603, + "language_loss": 0.72657233, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74786729, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12825, + "time_per_iteration": 2.451836109161377 + }, + { + "auxiliary_loss_clip": 0.0109918, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.02191544, + "balance_loss_mlp": 1.03549349, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.391969266660785, + "language_loss": 0.73613906, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75745583, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 12826, + "time_per_iteration": 2.4155168533325195 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.03364134, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.6262733051040712, + "language_loss": 0.81322646, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83455837, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 12827, + "time_per_iteration": 2.459195852279663 + }, + { + "auxiliary_loss_clip": 0.01024065, + "auxiliary_loss_mlp": 0.00999839, + "balance_loss_clip": 0.99881953, + "balance_loss_mlp": 1.00367689, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8804510230026851, + "language_loss": 0.55191517, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57215428, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20410156, + "step": 12828, + "time_per_iteration": 3.203558921813965 + }, + { + "auxiliary_loss_clip": 0.01099176, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.0158478, + "balance_loss_mlp": 1.03485942, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.7353204568908176, + "language_loss": 0.69503725, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71629542, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 12829, + "time_per_iteration": 2.447021722793579 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.01863086, + "balance_loss_mlp": 1.036901, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 4.262950048849265, + "language_loss": 0.79446471, + "learning_rate": 5.235574458679579e-07, + "loss": 0.8158378, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12830, + "time_per_iteration": 2.3964903354644775 + }, + { + "auxiliary_loss_clip": 0.01106244, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.0183301, + "balance_loss_mlp": 1.03630996, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.6021673475847413, + "language_loss": 0.78127801, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80265611, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 12831, + "time_per_iteration": 2.5234055519104004 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.01434898, + "balance_loss_mlp": 1.03424239, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.5450896985633467, + "language_loss": 0.60894483, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63020408, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 12832, + "time_per_iteration": 2.492701530456543 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.02072477, + "balance_loss_mlp": 1.03454924, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.7425232320118673, + "language_loss": 0.79137206, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81273079, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 12833, + "time_per_iteration": 2.497288942337036 + }, + { + "auxiliary_loss_clip": 0.01024056, + "auxiliary_loss_mlp": 0.01003026, + "balance_loss_clip": 1.00204265, + "balance_loss_mlp": 1.00360727, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8686662344719275, + "language_loss": 0.55410403, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57437485, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 12834, + "time_per_iteration": 3.03043532371521 + }, + { + "auxiliary_loss_clip": 0.01103044, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.01647997, + "balance_loss_mlp": 1.03643119, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.117345370793711, + "language_loss": 0.72845638, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74977142, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 12835, + "time_per_iteration": 2.446268320083618 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.02434242, + "balance_loss_mlp": 1.03385723, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.6490070086393855, + "language_loss": 0.70007384, + "learning_rate": 5.219821655586814e-07, + "loss": 0.7214548, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 12836, + "time_per_iteration": 2.4494271278381348 + }, + { + "auxiliary_loss_clip": 0.01100539, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03515959, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.6293860419166157, + "language_loss": 0.59337658, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61469114, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12837, + "time_per_iteration": 2.5418989658355713 + }, + { + "auxiliary_loss_clip": 0.01023613, + "auxiliary_loss_mlp": 0.00999355, + "balance_loss_clip": 0.99835348, + "balance_loss_mlp": 1.00311017, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8631972633412854, + "language_loss": 0.5581463, + "learning_rate": 5.214575203887666e-07, + "loss": 0.578376, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20507812, + "step": 12838, + "time_per_iteration": 3.0269720554351807 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.01461673, + "balance_loss_mlp": 1.03597295, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.4471669974150347, + "language_loss": 0.69294447, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71420795, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 12839, + "time_per_iteration": 2.4177730083465576 + }, + { + "auxiliary_loss_clip": 0.01099889, + "auxiliary_loss_mlp": 0.01025095, + "balance_loss_clip": 1.01409793, + "balance_loss_mlp": 1.03574765, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.7653669822284475, + "language_loss": 0.79856348, + "learning_rate": 5.209330994847647e-07, + "loss": 0.81981325, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 12840, + "time_per_iteration": 2.5179991722106934 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01843691, + "balance_loss_mlp": 1.0361371, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.7784222568456114, + "language_loss": 0.79938293, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82070708, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 12841, + "time_per_iteration": 2.5245449542999268 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03537869, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.4922109541948092, + "language_loss": 0.76314819, + "learning_rate": 5.204089029262208e-07, + "loss": 0.7844606, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 12842, + "time_per_iteration": 2.5023560523986816 + }, + { + "auxiliary_loss_clip": 0.01104825, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02228308, + "balance_loss_mlp": 1.03711128, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 2.1043616353717525, + "language_loss": 0.68631554, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70770752, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12843, + "time_per_iteration": 2.493771553039551 + }, + { + "auxiliary_loss_clip": 0.01103415, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.01814365, + "balance_loss_mlp": 1.03442502, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 2.427096295958664, + "language_loss": 0.73946643, + "learning_rate": 5.198849307926465e-07, + "loss": 0.76079392, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 12844, + "time_per_iteration": 3.8521201610565186 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01715553, + "balance_loss_mlp": 1.03452098, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.3727417180259405, + "language_loss": 0.7147876, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73607367, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 12845, + "time_per_iteration": 2.480782985687256 + }, + { + "auxiliary_loss_clip": 0.01098431, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01647639, + "balance_loss_mlp": 1.03456306, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.8692274529253097, + "language_loss": 0.64329362, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66455245, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 12846, + "time_per_iteration": 3.939861297607422 + }, + { + "auxiliary_loss_clip": 0.01024017, + "auxiliary_loss_mlp": 0.00999429, + "balance_loss_clip": 0.99847573, + "balance_loss_mlp": 1.00368702, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7797260608055787, + "language_loss": 0.61791992, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63815439, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 12847, + "time_per_iteration": 4.373151063919067 + }, + { + "auxiliary_loss_clip": 0.01099082, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.01329207, + "balance_loss_mlp": 1.03237033, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.8104305553743092, + "language_loss": 0.78874886, + "learning_rate": 5.188376601182732e-07, + "loss": 0.80998737, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 12848, + "time_per_iteration": 2.4621658325195312 + }, + { + "auxiliary_loss_clip": 0.01104725, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.02015185, + "balance_loss_mlp": 1.03665447, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.8870380118998122, + "language_loss": 0.73187292, + "learning_rate": 5.185759828394261e-07, + "loss": 0.75323451, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 12849, + "time_per_iteration": 2.423586368560791 + }, + { + "auxiliary_loss_clip": 0.01099584, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.01813483, + "balance_loss_mlp": 1.03409362, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.7816955634054865, + "language_loss": 0.78761244, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80890489, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12850, + "time_per_iteration": 3.8340566158294678 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.02208018, + "balance_loss_mlp": 1.03316355, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5411131818733386, + "language_loss": 0.79572296, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81707186, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12851, + "time_per_iteration": 2.4925901889801025 + }, + { + "auxiliary_loss_clip": 0.01100454, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.01792979, + "balance_loss_mlp": 1.03538489, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.50632412923142, + "language_loss": 0.73631006, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75761741, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6484375, + "step": 12852, + "time_per_iteration": 2.4682977199554443 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02231097, + "balance_loss_mlp": 1.03296447, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.8447801118424108, + "language_loss": 0.8239882, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84530675, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 12853, + "time_per_iteration": 2.4569756984710693 + }, + { + "auxiliary_loss_clip": 0.01023792, + "auxiliary_loss_mlp": 0.0099718, + "balance_loss_clip": 0.99619693, + "balance_loss_mlp": 1.00358176, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.9862475584721329, + "language_loss": 0.54506302, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56527275, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 12854, + "time_per_iteration": 3.091365098953247 + }, + { + "auxiliary_loss_clip": 0.01103537, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.0352025, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.475002899268902, + "language_loss": 0.71589357, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73721445, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 12855, + "time_per_iteration": 2.563339948654175 + }, + { + "auxiliary_loss_clip": 0.01100584, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.01721025, + "balance_loss_mlp": 1.03491831, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.6853102907434419, + "language_loss": 0.67508936, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69639283, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 12856, + "time_per_iteration": 2.4246950149536133 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.0153966, + "balance_loss_mlp": 1.03492117, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.6664497759881594, + "language_loss": 0.78636038, + "learning_rate": 5.164845877686162e-07, + "loss": 0.8076548, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12857, + "time_per_iteration": 2.4259722232818604 + }, + { + "auxiliary_loss_clip": 0.01099797, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.0170691, + "balance_loss_mlp": 1.03505707, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 2.4693745762825627, + "language_loss": 0.78503597, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80632401, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 12858, + "time_per_iteration": 2.414808988571167 + }, + { + "auxiliary_loss_clip": 0.0110013, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01887894, + "balance_loss_mlp": 1.0332911, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.1506807950165716, + "language_loss": 0.76832533, + "learning_rate": 5.159623013532591e-07, + "loss": 0.78963083, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12859, + "time_per_iteration": 2.4226794242858887 + }, + { + "auxiliary_loss_clip": 0.01098676, + "auxiliary_loss_mlp": 0.01027748, + "balance_loss_clip": 1.01765668, + "balance_loss_mlp": 1.03636694, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.3976193464700644, + "language_loss": 0.67598879, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69725305, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62109375, + "step": 12860, + "time_per_iteration": 2.4838390350341797 + }, + { + "auxiliary_loss_clip": 0.01102762, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02416456, + "balance_loss_mlp": 1.03352654, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.447865183826217, + "language_loss": 0.7403549, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76174939, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 12861, + "time_per_iteration": 2.4177722930908203 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01744413, + "balance_loss_mlp": 1.03674173, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.5943042288451297, + "language_loss": 0.74818659, + "learning_rate": 5.15179293816405e-07, + "loss": 0.76953417, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 12862, + "time_per_iteration": 2.502509832382202 + }, + { + "auxiliary_loss_clip": 0.01098685, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01839459, + "balance_loss_mlp": 1.03460789, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.605143243310102, + "language_loss": 0.82941031, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85068727, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12863, + "time_per_iteration": 2.435492753982544 + }, + { + "auxiliary_loss_clip": 0.01099256, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.01841044, + "balance_loss_mlp": 1.03421164, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.686286227621035, + "language_loss": 0.73311162, + "learning_rate": 5.146575702980898e-07, + "loss": 0.7544024, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12864, + "time_per_iteration": 2.4345412254333496 + }, + { + "auxiliary_loss_clip": 0.01100211, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.02182722, + "balance_loss_mlp": 1.03336382, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.7236073313381683, + "language_loss": 0.82668412, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84801233, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 12865, + "time_per_iteration": 2.489175796508789 + }, + { + "auxiliary_loss_clip": 0.01106204, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.01882756, + "balance_loss_mlp": 1.03688681, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 1.9919400131202358, + "language_loss": 0.71579105, + "learning_rate": 5.141360720771077e-07, + "loss": 0.73717141, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 12866, + "time_per_iteration": 2.4729628562927246 + }, + { + "auxiliary_loss_clip": 0.01103336, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.01476479, + "balance_loss_mlp": 1.03699803, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.21518020983948, + "language_loss": 0.64429164, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66559094, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12867, + "time_per_iteration": 2.3936469554901123 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02214408, + "balance_loss_mlp": 1.03422713, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.4977465030205475, + "language_loss": 0.70845938, + "learning_rate": 5.136147992325595e-07, + "loss": 0.72978157, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 12868, + "time_per_iteration": 2.5017075538635254 + }, + { + "auxiliary_loss_clip": 0.01103278, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.0157187, + "balance_loss_mlp": 1.03648961, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.3483431493436506, + "language_loss": 0.78185302, + "learning_rate": 5.133542473511578e-07, + "loss": 0.803159, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 12869, + "time_per_iteration": 2.4156572818756104 + }, + { + "auxiliary_loss_clip": 0.01095592, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.01597238, + "balance_loss_mlp": 1.0325917, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 2.073469705859901, + "language_loss": 0.73596758, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75719839, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 12870, + "time_per_iteration": 2.517237663269043 + }, + { + "auxiliary_loss_clip": 0.01101602, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.01848328, + "balance_loss_mlp": 1.03500986, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.174151142441679, + "language_loss": 0.75611806, + "learning_rate": 5.12833312719501e-07, + "loss": 0.77743572, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 12871, + "time_per_iteration": 2.400402069091797 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0195806, + "balance_loss_mlp": 1.03400016, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.515902079714309, + "language_loss": 0.69396317, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71526158, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 12872, + "time_per_iteration": 2.454831838607788 + }, + { + "auxiliary_loss_clip": 0.01101254, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01894081, + "balance_loss_mlp": 1.03436494, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.1128263848604303, + "language_loss": 0.85076445, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87209249, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 12873, + "time_per_iteration": 2.413208484649658 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.02108884, + "balance_loss_mlp": 1.03663659, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.3833664106096357, + "language_loss": 0.65228915, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67364746, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12874, + "time_per_iteration": 2.511897563934326 + }, + { + "auxiliary_loss_clip": 0.01101804, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01711988, + "balance_loss_mlp": 1.03627491, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.672939756784885, + "language_loss": 0.62344849, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64475727, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12875, + "time_per_iteration": 2.4547970294952393 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.01926494, + "balance_loss_mlp": 1.03329086, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.7114118176893034, + "language_loss": 0.65592134, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67723036, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12876, + "time_per_iteration": 2.507066011428833 + }, + { + "auxiliary_loss_clip": 0.01097976, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.01795566, + "balance_loss_mlp": 1.03334641, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.056913252626623, + "language_loss": 0.71540773, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73668182, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 12877, + "time_per_iteration": 2.4201643466949463 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.0190177, + "balance_loss_mlp": 1.03517962, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.8044293280530723, + "language_loss": 0.82859612, + "learning_rate": 5.110118184224736e-07, + "loss": 0.84995025, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 12878, + "time_per_iteration": 2.4779839515686035 + }, + { + "auxiliary_loss_clip": 0.01101355, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01875401, + "balance_loss_mlp": 1.03469586, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.7446777293969558, + "language_loss": 0.73307019, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75439632, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 12879, + "time_per_iteration": 2.4160289764404297 + }, + { + "auxiliary_loss_clip": 0.01095247, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01504445, + "balance_loss_mlp": 1.03218174, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 2.0530344125877824, + "language_loss": 0.79587936, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81709713, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 12880, + "time_per_iteration": 2.5343987941741943 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.02097631, + "balance_loss_mlp": 1.03499806, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.5022230028348473, + "language_loss": 0.69992185, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72123551, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 12881, + "time_per_iteration": 2.4520153999328613 + }, + { + "auxiliary_loss_clip": 0.01105007, + "auxiliary_loss_mlp": 0.01035783, + "balance_loss_clip": 1.02330136, + "balance_loss_mlp": 1.0357368, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.221505140298077, + "language_loss": 0.84215307, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86356097, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 12882, + "time_per_iteration": 2.4450690746307373 + }, + { + "auxiliary_loss_clip": 0.01023891, + "auxiliary_loss_mlp": 0.01012882, + "balance_loss_clip": 1.01180887, + "balance_loss_mlp": 1.00356591, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.8021199290846766, + "language_loss": 0.6040681, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62443578, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.203125, + "step": 12883, + "time_per_iteration": 3.0097620487213135 + }, + { + "auxiliary_loss_clip": 0.01103604, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.02008712, + "balance_loss_mlp": 1.03575671, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.7162492869747636, + "language_loss": 0.72789645, + "learning_rate": 5.094527395086416e-07, + "loss": 0.7492559, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 12884, + "time_per_iteration": 2.4377074241638184 + }, + { + "auxiliary_loss_clip": 0.01100524, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.02003515, + "balance_loss_mlp": 1.0354799, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 3.230363758289503, + "language_loss": 0.80970025, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83101392, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 12885, + "time_per_iteration": 2.4225785732269287 + }, + { + "auxiliary_loss_clip": 0.01098684, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02272451, + "balance_loss_mlp": 1.03376412, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.8035422481179095, + "language_loss": 0.64108509, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66240609, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 12886, + "time_per_iteration": 3.857712507247925 + }, + { + "auxiliary_loss_clip": 0.01099608, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.0177393, + "balance_loss_mlp": 1.03219748, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 2.0473331213234327, + "language_loss": 0.69581932, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71709955, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 12887, + "time_per_iteration": 2.423344373703003 + }, + { + "auxiliary_loss_clip": 0.01097443, + "auxiliary_loss_mlp": 0.01028368, + "balance_loss_clip": 1.01763892, + "balance_loss_mlp": 1.03330386, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.7264815005579048, + "language_loss": 0.70614457, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72740269, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 12888, + "time_per_iteration": 3.8539748191833496 + }, + { + "auxiliary_loss_clip": 0.01101208, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.01708579, + "balance_loss_mlp": 1.03361416, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 2.628922406260807, + "language_loss": 0.81764227, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83894438, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12889, + "time_per_iteration": 3.9081172943115234 + }, + { + "auxiliary_loss_clip": 0.01100926, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01818752, + "balance_loss_mlp": 1.0352304, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 1.7934757747385575, + "language_loss": 0.79690224, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81820571, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12890, + "time_per_iteration": 2.4259889125823975 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.02038956, + "balance_loss_mlp": 1.03786349, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 1.8078298047405903, + "language_loss": 0.6619277, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68330312, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12891, + "time_per_iteration": 2.421792984008789 + }, + { + "auxiliary_loss_clip": 0.01100105, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.01623988, + "balance_loss_mlp": 1.03472996, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.4457356185681014, + "language_loss": 0.78705311, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80833197, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 12892, + "time_per_iteration": 4.022496223449707 + }, + { + "auxiliary_loss_clip": 0.0110464, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01451278, + "balance_loss_mlp": 1.03709579, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 1.95553815104522, + "language_loss": 0.6747188, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69602484, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12893, + "time_per_iteration": 2.4064764976501465 + }, + { + "auxiliary_loss_clip": 0.010241, + "auxiliary_loss_mlp": 0.01003293, + "balance_loss_clip": 1.0022974, + "balance_loss_mlp": 1.00361943, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8057156528399092, + "language_loss": 0.58470869, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60498261, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20507812, + "step": 12894, + "time_per_iteration": 3.0993287563323975 + }, + { + "auxiliary_loss_clip": 0.01103557, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01871347, + "balance_loss_mlp": 1.03571117, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 1.980811218300561, + "language_loss": 0.78687382, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80821562, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12895, + "time_per_iteration": 2.4280591011047363 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.01484489, + "balance_loss_mlp": 1.03593993, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.9018795725509905, + "language_loss": 0.67731452, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69859904, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12896, + "time_per_iteration": 2.461527109146118 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.0224123, + "balance_loss_mlp": 1.0342598, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.7046546419810793, + "language_loss": 0.69181269, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71314216, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12897, + "time_per_iteration": 2.4287121295928955 + }, + { + "auxiliary_loss_clip": 0.01103573, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02193165, + "balance_loss_mlp": 1.03643906, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.6934570873388384, + "language_loss": 0.75021553, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77159327, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12898, + "time_per_iteration": 2.476008415222168 + }, + { + "auxiliary_loss_clip": 0.01101597, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01574945, + "balance_loss_mlp": 1.03492308, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.107133651932301, + "language_loss": 0.70084441, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72214341, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66796875, + "step": 12899, + "time_per_iteration": 2.4085845947265625 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01829982, + "balance_loss_mlp": 1.03514957, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 1.8299634820170116, + "language_loss": 0.74540645, + "learning_rate": 5.053051493286453e-07, + "loss": 0.76671344, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12900, + "time_per_iteration": 2.463158369064331 + }, + { + "auxiliary_loss_clip": 0.01097147, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.02339911, + "balance_loss_mlp": 1.03308296, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.7671979453264242, + "language_loss": 0.77766836, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79898179, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12901, + "time_per_iteration": 2.487149715423584 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03825235, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.6889669978659576, + "language_loss": 0.77270627, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79401928, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 12902, + "time_per_iteration": 2.522047758102417 + }, + { + "auxiliary_loss_clip": 0.01099422, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.01794279, + "balance_loss_mlp": 1.03434253, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.6966870042115003, + "language_loss": 0.73324692, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75452751, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 12903, + "time_per_iteration": 2.4301648139953613 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.01404119, + "balance_loss_mlp": 1.03656173, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.1229192794074025, + "language_loss": 0.76073396, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78201139, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 12904, + "time_per_iteration": 2.4397873878479004 + }, + { + "auxiliary_loss_clip": 0.01096338, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.01313651, + "balance_loss_mlp": 1.03401458, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.282889081568611, + "language_loss": 0.68131924, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70252246, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 12905, + "time_per_iteration": 2.444009780883789 + }, + { + "auxiliary_loss_clip": 0.01098458, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.01636112, + "balance_loss_mlp": 1.03590798, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.6520534873626833, + "language_loss": 0.67321658, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69447428, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 12906, + "time_per_iteration": 2.5024046897888184 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.02074146, + "balance_loss_mlp": 1.03560805, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 3.183876280395432, + "language_loss": 0.81314665, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83447266, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 12907, + "time_per_iteration": 2.3983490467071533 + }, + { + "auxiliary_loss_clip": 0.01098064, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.02001524, + "balance_loss_mlp": 1.0352093, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.1955762882014604, + "language_loss": 0.67891413, + "learning_rate": 5.032367929052685e-07, + "loss": 0.70020467, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 12908, + "time_per_iteration": 2.4205586910247803 + }, + { + "auxiliary_loss_clip": 0.0110402, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.02561998, + "balance_loss_mlp": 1.0367105, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.5072254351199776, + "language_loss": 0.70509684, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72650868, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12909, + "time_per_iteration": 2.411200523376465 + }, + { + "auxiliary_loss_clip": 0.01098463, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.02122903, + "balance_loss_mlp": 1.03443766, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.8009791603999328, + "language_loss": 0.677131, + "learning_rate": 5.027202711775324e-07, + "loss": 0.69843423, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12910, + "time_per_iteration": 2.4990389347076416 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.01995671, + "balance_loss_mlp": 1.03720117, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.6715228881797681, + "language_loss": 0.71815217, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73948646, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 12911, + "time_per_iteration": 2.4534413814544678 + }, + { + "auxiliary_loss_clip": 0.01105044, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01966429, + "balance_loss_mlp": 1.03769255, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.415333717110697, + "language_loss": 0.63629675, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65766907, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12912, + "time_per_iteration": 2.485800266265869 + }, + { + "auxiliary_loss_clip": 0.01024108, + "auxiliary_loss_mlp": 0.00998178, + "balance_loss_clip": 0.99713534, + "balance_loss_mlp": 1.00357115, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 1.0865465621016743, + "language_loss": 0.53211093, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55233377, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12913, + "time_per_iteration": 3.1158273220062256 + }, + { + "auxiliary_loss_clip": 0.01102849, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.02007604, + "balance_loss_mlp": 1.03618884, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 2.955130949159741, + "language_loss": 0.62075317, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64210051, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12914, + "time_per_iteration": 2.4749767780303955 + }, + { + "auxiliary_loss_clip": 0.01099375, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03413486, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.8057060992355358, + "language_loss": 0.82471168, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84603214, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65234375, + "step": 12915, + "time_per_iteration": 2.439039468765259 + }, + { + "auxiliary_loss_clip": 0.01103501, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.01918375, + "balance_loss_mlp": 1.03486073, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.6623901678084019, + "language_loss": 0.7471149, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76846689, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12916, + "time_per_iteration": 2.494717836380005 + }, + { + "auxiliary_loss_clip": 0.01099429, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.01588905, + "balance_loss_mlp": 1.03332853, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.7727217475878263, + "language_loss": 0.65696949, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67823803, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 12917, + "time_per_iteration": 2.718024969100952 + }, + { + "auxiliary_loss_clip": 0.01100019, + "auxiliary_loss_mlp": 0.01034508, + "balance_loss_clip": 1.02317166, + "balance_loss_mlp": 1.03343606, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.709981739113561, + "language_loss": 0.64356208, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66490734, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12918, + "time_per_iteration": 2.5265743732452393 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.02063048, + "balance_loss_mlp": 1.03485835, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.1453981037999386, + "language_loss": 0.73354542, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75485885, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12919, + "time_per_iteration": 2.436053514480591 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01979434, + "balance_loss_mlp": 1.03540444, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.524282476401475, + "language_loss": 0.79217321, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81351054, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12920, + "time_per_iteration": 2.4638402462005615 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.01851654, + "balance_loss_mlp": 1.03587747, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.5839883144130948, + "language_loss": 0.70594597, + "learning_rate": 4.998834633291829e-07, + "loss": 0.72726625, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12921, + "time_per_iteration": 2.4318997859954834 + }, + { + "auxiliary_loss_clip": 0.01103624, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01492715, + "balance_loss_mlp": 1.03501809, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.7058717810568553, + "language_loss": 0.76330459, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78461355, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 12922, + "time_per_iteration": 2.470374345779419 + }, + { + "auxiliary_loss_clip": 0.01102145, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01991987, + "balance_loss_mlp": 1.03619885, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.882909865169764, + "language_loss": 0.80363363, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82497096, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 12923, + "time_per_iteration": 2.488701343536377 + }, + { + "auxiliary_loss_clip": 0.01102496, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.02036023, + "balance_loss_mlp": 1.03716397, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 1.9867390382218033, + "language_loss": 0.92483282, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94617379, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12924, + "time_per_iteration": 2.452601194381714 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.01750469, + "balance_loss_mlp": 1.03356338, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 1.980221846763212, + "language_loss": 0.65940827, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68069565, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 12925, + "time_per_iteration": 2.4850525856018066 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.02330625, + "balance_loss_mlp": 1.03621173, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.7141356167818045, + "language_loss": 0.71911299, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74049789, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 12926, + "time_per_iteration": 2.4577598571777344 + }, + { + "auxiliary_loss_clip": 0.01103729, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.01420105, + "balance_loss_mlp": 1.03604841, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.8312057216887105, + "language_loss": 0.65467525, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67598033, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 12927, + "time_per_iteration": 2.4614973068237305 + }, + { + "auxiliary_loss_clip": 0.01101116, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03512836, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.745612038393379, + "language_loss": 0.72182518, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74320054, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 12928, + "time_per_iteration": 3.8557302951812744 + }, + { + "auxiliary_loss_clip": 0.01097726, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01759386, + "balance_loss_mlp": 1.03366995, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6060667874854504, + "language_loss": 0.73954302, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76081246, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12929, + "time_per_iteration": 2.4402310848236084 + }, + { + "auxiliary_loss_clip": 0.01101677, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.03536963, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.8904453576029416, + "language_loss": 0.77982825, + "learning_rate": 4.975675577495377e-07, + "loss": 0.80113542, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12930, + "time_per_iteration": 3.86580491065979 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.01922631, + "balance_loss_mlp": 1.03665566, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.7459832422973112, + "language_loss": 0.79347777, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81480014, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 12931, + "time_per_iteration": 3.8444814682006836 + }, + { + "auxiliary_loss_clip": 0.01023847, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99943125, + "balance_loss_mlp": 1.00351429, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8066088266331831, + "language_loss": 0.59735709, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61760002, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.203125, + "step": 12932, + "time_per_iteration": 3.025099039077759 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.01822686, + "balance_loss_mlp": 1.03569841, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.4815322595088087, + "language_loss": 0.76235545, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78367525, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12933, + "time_per_iteration": 4.019074440002441 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02023768, + "balance_loss_mlp": 1.03603268, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.0481953339666026, + "language_loss": 0.73607898, + "learning_rate": 4.965397472402215e-07, + "loss": 0.7574268, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 12934, + "time_per_iteration": 2.4444801807403564 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.01344395, + "balance_loss_mlp": 1.03648293, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.8918830226491183, + "language_loss": 0.70461309, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72590244, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12935, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.02545476, + "balance_loss_mlp": 1.03640771, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.5340308714380857, + "language_loss": 0.83742738, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85883445, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 12936, + "time_per_iteration": 2.4495856761932373 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.01662064, + "balance_loss_mlp": 1.03480935, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 2.0135584494243255, + "language_loss": 0.67168462, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69298995, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6875, + "step": 12937, + "time_per_iteration": 2.4478330612182617 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01624966, + "balance_loss_mlp": 1.03470469, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.648957424958238, + "language_loss": 0.868128, + "learning_rate": 4.955128489126777e-07, + "loss": 0.88942349, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12938, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.01101697, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.01571488, + "balance_loss_mlp": 1.03527653, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 1.8176002406557528, + "language_loss": 0.85162985, + "learning_rate": 4.95256266932218e-07, + "loss": 0.8729248, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12939, + "time_per_iteration": 2.465057611465454 + }, + { + "auxiliary_loss_clip": 0.01097955, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.0204885, + "balance_loss_mlp": 1.03464723, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.9198356417092663, + "language_loss": 0.68793273, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70922846, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 12940, + "time_per_iteration": 2.4191107749938965 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01024954, + "balance_loss_clip": 1.01418972, + "balance_loss_mlp": 1.03387284, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.6186124498470607, + "language_loss": 0.77783638, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79909098, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 12941, + "time_per_iteration": 2.5182301998138428 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02010357, + "balance_loss_mlp": 1.03493428, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.252237972252455, + "language_loss": 0.73223758, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75360417, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12942, + "time_per_iteration": 2.5156443119049072 + }, + { + "auxiliary_loss_clip": 0.01097922, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.02004635, + "balance_loss_mlp": 1.03366685, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 3.1295555400179653, + "language_loss": 0.6771059, + "learning_rate": 4.942305097079751e-07, + "loss": 0.69839656, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 12943, + "time_per_iteration": 2.4742066860198975 + }, + { + "auxiliary_loss_clip": 0.01023917, + "auxiliary_loss_mlp": 0.00999519, + "balance_loss_clip": 0.99852365, + "balance_loss_mlp": 1.00365448, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7816270653723761, + "language_loss": 0.5855267, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60576105, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 12944, + "time_per_iteration": 3.1933257579803467 + }, + { + "auxiliary_loss_clip": 0.01103658, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.0226059, + "balance_loss_mlp": 1.03550398, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.222805879365814, + "language_loss": 0.6770618, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69845027, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12945, + "time_per_iteration": 2.4619064331054688 + }, + { + "auxiliary_loss_clip": 0.01101979, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.01970923, + "balance_loss_mlp": 1.03608465, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.9340302005475807, + "language_loss": 0.69121152, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71254241, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 12946, + "time_per_iteration": 2.459763526916504 + }, + { + "auxiliary_loss_clip": 0.0110194, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01551938, + "balance_loss_mlp": 1.03543854, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 2.351828874315234, + "language_loss": 0.65289766, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67419076, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12947, + "time_per_iteration": 2.4477789402008057 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.0360136, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.161531176276814, + "language_loss": 0.65099561, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67233521, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 12948, + "time_per_iteration": 2.4290242195129395 + }, + { + "auxiliary_loss_clip": 0.01100958, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.01845825, + "balance_loss_mlp": 1.03465629, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.6859142998281702, + "language_loss": 0.74930477, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77061522, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 12949, + "time_per_iteration": 2.4495837688446045 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.03724563, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.519597637019559, + "language_loss": 0.68952882, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71091413, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12950, + "time_per_iteration": 2.4255573749542236 + }, + { + "auxiliary_loss_clip": 0.01099665, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.01623845, + "balance_loss_mlp": 1.0328927, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.6317845562293505, + "language_loss": 0.71912777, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74040663, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 12951, + "time_per_iteration": 2.481668710708618 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.017977, + "balance_loss_mlp": 1.03646922, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.6634043770166038, + "language_loss": 0.65471166, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67600083, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 12952, + "time_per_iteration": 2.4531540870666504 + }, + { + "auxiliary_loss_clip": 0.01093756, + "auxiliary_loss_mlp": 0.01026755, + "balance_loss_clip": 1.01577616, + "balance_loss_mlp": 1.03269386, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.5845487757509182, + "language_loss": 0.81134123, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83254635, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.609375, + "step": 12953, + "time_per_iteration": 2.463089942932129 + }, + { + "auxiliary_loss_clip": 0.01106842, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01938844, + "balance_loss_mlp": 1.03845859, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 3.927672519957359, + "language_loss": 0.77081442, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79219466, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 12954, + "time_per_iteration": 2.407898187637329 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.0102569, + "balance_loss_clip": 1.01457942, + "balance_loss_mlp": 1.03563237, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.509444537933962, + "language_loss": 0.72937489, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7506628, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 12955, + "time_per_iteration": 2.4522764682769775 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.0172863, + "balance_loss_mlp": 1.03384817, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.439262912645897, + "language_loss": 0.68722045, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70851612, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12956, + "time_per_iteration": 2.4333713054656982 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.01781666, + "balance_loss_mlp": 1.03511453, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.517896090927025, + "language_loss": 0.76230508, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78359848, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 12957, + "time_per_iteration": 2.503735065460205 + }, + { + "auxiliary_loss_clip": 0.01100381, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.01647925, + "balance_loss_mlp": 1.03468633, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.5979731248356082, + "language_loss": 0.77661026, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79789662, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 12958, + "time_per_iteration": 2.470494270324707 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.02521682, + "balance_loss_mlp": 1.03665078, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.8919094933835359, + "language_loss": 0.71729428, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73870701, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12959, + "time_per_iteration": 2.4404170513153076 + }, + { + "auxiliary_loss_clip": 0.01099647, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.02000022, + "balance_loss_mlp": 1.03415179, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 2.165413341103088, + "language_loss": 0.7770282, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79834014, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12960, + "time_per_iteration": 2.4568068981170654 + }, + { + "auxiliary_loss_clip": 0.01103395, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02233458, + "balance_loss_mlp": 1.03674865, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.9020069613466535, + "language_loss": 0.75351453, + "learning_rate": 4.896259167586385e-07, + "loss": 0.7748946, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12961, + "time_per_iteration": 2.4992313385009766 + }, + { + "auxiliary_loss_clip": 0.0109806, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02412987, + "balance_loss_mlp": 1.03634429, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.602325654578752, + "language_loss": 0.73415077, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75548315, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 12962, + "time_per_iteration": 2.4623515605926514 + }, + { + "auxiliary_loss_clip": 0.01100833, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.01642597, + "balance_loss_mlp": 1.03580284, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 4.864590611193701, + "language_loss": 0.6971066, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71839404, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 12963, + "time_per_iteration": 2.4501214027404785 + }, + { + "auxiliary_loss_clip": 0.01102284, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.01936793, + "balance_loss_mlp": 1.0372932, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.8027321276281432, + "language_loss": 0.63654995, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65788043, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 12964, + "time_per_iteration": 2.491323709487915 + }, + { + "auxiliary_loss_clip": 0.01102129, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.02017188, + "balance_loss_mlp": 1.03510189, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.518939457679847, + "language_loss": 0.7682904, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78962815, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 12965, + "time_per_iteration": 2.528763771057129 + }, + { + "auxiliary_loss_clip": 0.01094543, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.01648712, + "balance_loss_mlp": 1.03374982, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 2.0628769649637273, + "language_loss": 0.73018187, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75139767, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.609375, + "step": 12966, + "time_per_iteration": 2.484900951385498 + }, + { + "auxiliary_loss_clip": 0.01098331, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01554668, + "balance_loss_mlp": 1.03585887, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.5008219463106178, + "language_loss": 0.73900837, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76024604, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.625, + "step": 12967, + "time_per_iteration": 2.478590250015259 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.01025264, + "balance_loss_clip": 1.01353419, + "balance_loss_mlp": 1.03645122, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.127947897129968, + "language_loss": 0.72439355, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74567437, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12968, + "time_per_iteration": 2.4800057411193848 + }, + { + "auxiliary_loss_clip": 0.01099689, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02313745, + "balance_loss_mlp": 1.0356847, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 2.4882382801625114, + "language_loss": 0.6027385, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62408233, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12969, + "time_per_iteration": 3.8256025314331055 + }, + { + "auxiliary_loss_clip": 0.01095828, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.01716805, + "balance_loss_mlp": 1.03350222, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.7218656768380223, + "language_loss": 0.70345086, + "learning_rate": 4.873305754846811e-07, + "loss": 0.7246843, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62109375, + "step": 12970, + "time_per_iteration": 2.4424326419830322 + }, + { + "auxiliary_loss_clip": 0.01102147, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01964164, + "balance_loss_mlp": 1.03676975, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.5981872425492996, + "language_loss": 0.72214878, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74348849, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 12971, + "time_per_iteration": 4.000694990158081 + }, + { + "auxiliary_loss_clip": 0.01105251, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.01978493, + "balance_loss_mlp": 1.03616154, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.9065262783110748, + "language_loss": 0.74722421, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76859379, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 12972, + "time_per_iteration": 3.843189001083374 + }, + { + "auxiliary_loss_clip": 0.01099808, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.01216161, + "balance_loss_mlp": 1.03417563, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 2.146033088576411, + "language_loss": 0.71397805, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73521698, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12973, + "time_per_iteration": 2.4355766773223877 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.01677775, + "balance_loss_mlp": 1.03420782, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 2.289500877533027, + "language_loss": 0.77711248, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79836202, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12974, + "time_per_iteration": 2.4445388317108154 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.01833785, + "balance_loss_mlp": 1.03401935, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.507070733985586, + "language_loss": 0.69106656, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71234584, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12975, + "time_per_iteration": 4.026258230209351 + }, + { + "auxiliary_loss_clip": 0.01098461, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01754749, + "balance_loss_mlp": 1.03443432, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 3.483605083933044, + "language_loss": 0.81612706, + "learning_rate": 4.858029287593739e-07, + "loss": 0.83739734, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12976, + "time_per_iteration": 2.39786696434021 + }, + { + "auxiliary_loss_clip": 0.01102312, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.01467419, + "balance_loss_mlp": 1.03479075, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.6747970494866666, + "language_loss": 0.6597501, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68103826, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 12977, + "time_per_iteration": 2.509279489517212 + }, + { + "auxiliary_loss_clip": 0.01097395, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01837647, + "balance_loss_mlp": 1.03446436, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 2.0224689564916236, + "language_loss": 0.74458158, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76584208, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 12978, + "time_per_iteration": 2.5191776752471924 + }, + { + "auxiliary_loss_clip": 0.01103093, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.01786351, + "balance_loss_mlp": 1.03624713, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 2.0922083765089523, + "language_loss": 0.62049854, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64182818, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 12979, + "time_per_iteration": 2.5099925994873047 + }, + { + "auxiliary_loss_clip": 0.01099974, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.01567745, + "balance_loss_mlp": 1.03531623, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.9372520913604323, + "language_loss": 0.77348953, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79476345, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 12980, + "time_per_iteration": 2.4801688194274902 + }, + { + "auxiliary_loss_clip": 0.01102229, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.02032888, + "balance_loss_mlp": 1.03540671, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 3.6673789740050484, + "language_loss": 0.78181487, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80315006, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 12981, + "time_per_iteration": 2.4743919372558594 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01932621, + "balance_loss_mlp": 1.03766203, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.7572805466494936, + "language_loss": 0.7283631, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74971128, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 12982, + "time_per_iteration": 2.464043140411377 + }, + { + "auxiliary_loss_clip": 0.0109892, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.01735628, + "balance_loss_mlp": 1.03321373, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.447832651307714, + "language_loss": 0.73497742, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75624776, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66015625, + "step": 12983, + "time_per_iteration": 2.5320849418640137 + }, + { + "auxiliary_loss_clip": 0.01098957, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01825762, + "balance_loss_mlp": 1.03431869, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.994731335047155, + "language_loss": 0.7493751, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77065802, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12984, + "time_per_iteration": 2.4252982139587402 + }, + { + "auxiliary_loss_clip": 0.01098022, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02072072, + "balance_loss_mlp": 1.03346229, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 1.7760899084313728, + "language_loss": 0.81298089, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83427656, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12985, + "time_per_iteration": 2.442458391189575 + }, + { + "auxiliary_loss_clip": 0.01100867, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.01956177, + "balance_loss_mlp": 1.03591645, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.639777449127703, + "language_loss": 0.77087915, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79219496, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12986, + "time_per_iteration": 2.4363291263580322 + }, + { + "auxiliary_loss_clip": 0.01100757, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.02032864, + "balance_loss_mlp": 1.03434944, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 2.42025665629093, + "language_loss": 0.73686159, + "learning_rate": 4.830076132284859e-07, + "loss": 0.75818527, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 12987, + "time_per_iteration": 2.542191505432129 + }, + { + "auxiliary_loss_clip": 0.01023759, + "auxiliary_loss_mlp": 0.01001114, + "balance_loss_clip": 1.00014293, + "balance_loss_mlp": 1.0034368, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7329422119144833, + "language_loss": 0.55088633, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57113504, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.203125, + "step": 12988, + "time_per_iteration": 3.1061744689941406 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.02383065, + "balance_loss_mlp": 1.03418314, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.47954830996045, + "language_loss": 0.80945504, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83076429, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 12989, + "time_per_iteration": 2.4808456897735596 + }, + { + "auxiliary_loss_clip": 0.01096337, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.03300154, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.5469006395559106, + "language_loss": 0.70564306, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72688901, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 12990, + "time_per_iteration": 2.443657636642456 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.01631021, + "balance_loss_mlp": 1.03604221, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 1.8688564219914294, + "language_loss": 0.77437395, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79567397, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 12991, + "time_per_iteration": 2.4350147247314453 + }, + { + "auxiliary_loss_clip": 0.01098523, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.0333643, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.6335805671408214, + "language_loss": 0.66026002, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68157601, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 12992, + "time_per_iteration": 2.689131259918213 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.01868546, + "balance_loss_mlp": 1.03544474, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.671791427999923, + "language_loss": 0.6139763, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63529098, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 12993, + "time_per_iteration": 2.4541869163513184 + }, + { + "auxiliary_loss_clip": 0.01098832, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01672089, + "balance_loss_mlp": 1.03474307, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.5259935915170835, + "language_loss": 0.68686914, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70814335, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 12994, + "time_per_iteration": 2.4765658378601074 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.01025809, + "balance_loss_clip": 1.01448953, + "balance_loss_mlp": 1.03380036, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.852574283053805, + "language_loss": 0.68799579, + "learning_rate": 4.809790276082335e-07, + "loss": 0.70923519, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 12995, + "time_per_iteration": 2.4536421298980713 + }, + { + "auxiliary_loss_clip": 0.01095783, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.01633954, + "balance_loss_mlp": 1.03263307, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.6880507432835572, + "language_loss": 0.74965352, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77087927, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 12996, + "time_per_iteration": 2.5054454803466797 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.01969695, + "balance_loss_mlp": 1.03568673, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.5743234501120424, + "language_loss": 0.6912725, + "learning_rate": 4.804724570252167e-07, + "loss": 0.71263158, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 12997, + "time_per_iteration": 2.4369044303894043 + }, + { + "auxiliary_loss_clip": 0.01104469, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.01893187, + "balance_loss_mlp": 1.03557801, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.8652008126435036, + "language_loss": 0.82176995, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84312725, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12998, + "time_per_iteration": 2.486489772796631 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.01885128, + "balance_loss_mlp": 1.03346038, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 1.8319036090536944, + "language_loss": 0.74508494, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76638746, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12999, + "time_per_iteration": 2.4737162590026855 + }, + { + "auxiliary_loss_clip": 0.01103401, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.02182698, + "balance_loss_mlp": 1.03589118, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.563923642471339, + "language_loss": 0.84530002, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86667389, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 13000, + "time_per_iteration": 2.4414126873016357 + }, + { + "auxiliary_loss_clip": 0.01101696, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.01921082, + "balance_loss_mlp": 1.03525925, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.979765622408292, + "language_loss": 0.65926194, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68058491, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 13001, + "time_per_iteration": 2.459602117538452 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.02068496, + "balance_loss_mlp": 1.0349071, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.7956850599557053, + "language_loss": 0.6699869, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69131166, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13002, + "time_per_iteration": 2.476304769515991 + }, + { + "auxiliary_loss_clip": 0.01104712, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02302575, + "balance_loss_mlp": 1.0376792, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.585481392916345, + "language_loss": 0.7332117, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75461578, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 13003, + "time_per_iteration": 2.4720077514648438 + }, + { + "auxiliary_loss_clip": 0.01103208, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.0201664, + "balance_loss_mlp": 1.03717935, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.569897666611527, + "language_loss": 0.62077022, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64211631, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 13004, + "time_per_iteration": 2.441561222076416 + }, + { + "auxiliary_loss_clip": 0.01094018, + "auxiliary_loss_mlp": 0.01029156, + "balance_loss_clip": 1.01888061, + "balance_loss_mlp": 1.03251433, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 2.201816457690377, + "language_loss": 0.82857859, + "learning_rate": 4.784484802864403e-07, + "loss": 0.84981036, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6171875, + "step": 13005, + "time_per_iteration": 2.463477373123169 + }, + { + "auxiliary_loss_clip": 0.01098144, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.01770329, + "balance_loss_mlp": 1.033494, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.897683871126404, + "language_loss": 0.72580653, + "learning_rate": 4.781957427316432e-07, + "loss": 0.7470839, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 13006, + "time_per_iteration": 2.465083122253418 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.01830435, + "balance_loss_mlp": 1.03508401, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.6366399269872012, + "language_loss": 0.7201829, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74150085, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13007, + "time_per_iteration": 2.459080934524536 + }, + { + "auxiliary_loss_clip": 0.01100835, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.0149343, + "balance_loss_mlp": 1.03300202, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 2.036752007618824, + "language_loss": 0.68872929, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71001077, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13008, + "time_per_iteration": 2.4224483966827393 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.01692426, + "balance_loss_mlp": 1.03345668, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.640160857289297, + "language_loss": 0.69686973, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71814674, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6484375, + "step": 13009, + "time_per_iteration": 2.5025076866149902 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01588941, + "balance_loss_mlp": 1.03301144, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 1.5960610923342113, + "language_loss": 0.81570321, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83694947, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 13010, + "time_per_iteration": 2.4285366535186768 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02153432, + "balance_loss_mlp": 1.03357911, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.6142519346757356, + "language_loss": 0.62225044, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64354062, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 13011, + "time_per_iteration": 3.926089286804199 + }, + { + "auxiliary_loss_clip": 0.01099415, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.01533031, + "balance_loss_mlp": 1.03547144, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.6368138696323526, + "language_loss": 0.6998511, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72109526, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.640625, + "step": 13012, + "time_per_iteration": 2.4826955795288086 + }, + { + "auxiliary_loss_clip": 0.01023537, + "auxiliary_loss_mlp": 0.01008113, + "balance_loss_clip": 1.00711727, + "balance_loss_mlp": 1.00321245, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7065375253302547, + "language_loss": 0.55039519, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57071167, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 13013, + "time_per_iteration": 4.645312786102295 + }, + { + "auxiliary_loss_clip": 0.01104842, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.02101803, + "balance_loss_mlp": 1.03746831, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.6705985916443649, + "language_loss": 0.65102112, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67239481, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 13014, + "time_per_iteration": 3.8477301597595215 + }, + { + "auxiliary_loss_clip": 0.01023801, + "auxiliary_loss_mlp": 0.01001816, + "balance_loss_clip": 1.00088012, + "balance_loss_mlp": 1.00337434, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.727505311889394, + "language_loss": 0.58472216, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60497832, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20507812, + "step": 13015, + "time_per_iteration": 3.1371023654937744 + }, + { + "auxiliary_loss_clip": 0.01098459, + "auxiliary_loss_mlp": 0.01028458, + "balance_loss_clip": 1.01759779, + "balance_loss_mlp": 1.03401864, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.8961534099857338, + "language_loss": 0.7447719, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76604104, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 13016, + "time_per_iteration": 2.434140682220459 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01715088, + "balance_loss_mlp": 1.03527248, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.7784650318460415, + "language_loss": 0.75034481, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77166569, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6640625, + "step": 13017, + "time_per_iteration": 3.9943692684173584 + }, + { + "auxiliary_loss_clip": 0.01099632, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01680803, + "balance_loss_mlp": 1.03296256, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 1.8349392879241557, + "language_loss": 0.75123864, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77252591, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13018, + "time_per_iteration": 2.4067063331604004 + }, + { + "auxiliary_loss_clip": 0.01098611, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.01444817, + "balance_loss_mlp": 1.03329933, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4416632846342243, + "language_loss": 0.77156466, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79281342, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13019, + "time_per_iteration": 2.463075876235962 + }, + { + "auxiliary_loss_clip": 0.01097045, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.01522064, + "balance_loss_mlp": 1.03263474, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.5659008205546523, + "language_loss": 0.67608422, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69731897, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13020, + "time_per_iteration": 2.4952075481414795 + }, + { + "auxiliary_loss_clip": 0.01100425, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01770771, + "balance_loss_mlp": 1.03600883, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.0993447559615905, + "language_loss": 0.6252991, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64659011, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 13021, + "time_per_iteration": 2.4579381942749023 + }, + { + "auxiliary_loss_clip": 0.01097567, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01984227, + "balance_loss_mlp": 1.03425419, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.6887151004822496, + "language_loss": 0.69123161, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71251345, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 13022, + "time_per_iteration": 2.4774861335754395 + }, + { + "auxiliary_loss_clip": 0.01023146, + "auxiliary_loss_mlp": 0.01003513, + "balance_loss_clip": 1.00248182, + "balance_loss_mlp": 1.00289679, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6410994514398879, + "language_loss": 0.56181228, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58207887, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 13023, + "time_per_iteration": 3.216150999069214 + }, + { + "auxiliary_loss_clip": 0.01092363, + "auxiliary_loss_mlp": 0.01026003, + "balance_loss_clip": 1.01569128, + "balance_loss_mlp": 1.03068089, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.5988888518402644, + "language_loss": 0.67096663, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69215035, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6171875, + "step": 13024, + "time_per_iteration": 2.4942939281463623 + }, + { + "auxiliary_loss_clip": 0.01101952, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.01470125, + "balance_loss_mlp": 1.0346812, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.8279349963305433, + "language_loss": 0.77768403, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79896855, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 13025, + "time_per_iteration": 2.4907360076904297 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.01925731, + "balance_loss_mlp": 1.0364809, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.6346779993689489, + "language_loss": 0.78158247, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80289435, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13026, + "time_per_iteration": 2.510455846786499 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01344812, + "balance_loss_mlp": 1.03539133, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 1.8424561314636239, + "language_loss": 0.75538385, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77662009, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13027, + "time_per_iteration": 2.4143946170806885 + }, + { + "auxiliary_loss_clip": 0.01100205, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.01554847, + "balance_loss_mlp": 1.03557467, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.6598203189142682, + "language_loss": 0.70306528, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72433376, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13028, + "time_per_iteration": 2.4640142917633057 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.0228107, + "balance_loss_mlp": 1.03549385, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 1.953273334391897, + "language_loss": 0.69041282, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71179456, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13029, + "time_per_iteration": 2.4038736820220947 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01026447, + "balance_loss_clip": 1.01434183, + "balance_loss_mlp": 1.03425694, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.7164794542717685, + "language_loss": 0.81022191, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83150411, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13030, + "time_per_iteration": 2.5112462043762207 + }, + { + "auxiliary_loss_clip": 0.01102526, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01789546, + "balance_loss_mlp": 1.03459156, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.6569423927401024, + "language_loss": 0.70443982, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72575903, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 13031, + "time_per_iteration": 2.5177314281463623 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.03390551, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 2.080929287511114, + "language_loss": 0.78692496, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80826724, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 13032, + "time_per_iteration": 2.438286066055298 + }, + { + "auxiliary_loss_clip": 0.01104134, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.02238297, + "balance_loss_mlp": 1.03637064, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.498040083098191, + "language_loss": 0.62467206, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.64605498, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13033, + "time_per_iteration": 2.4075143337249756 + }, + { + "auxiliary_loss_clip": 0.01099306, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01454878, + "balance_loss_mlp": 1.03466129, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.5229312558567987, + "language_loss": 0.71800756, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.7392652, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 13034, + "time_per_iteration": 2.426010847091675 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01777768, + "balance_loss_mlp": 1.03581548, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.6809698816895169, + "language_loss": 0.72046518, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.74179089, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13035, + "time_per_iteration": 2.417221784591675 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.01936555, + "balance_loss_mlp": 1.0355823, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.9215035774038787, + "language_loss": 0.66247499, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.6838097, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13036, + "time_per_iteration": 2.4644551277160645 + }, + { + "auxiliary_loss_clip": 0.01105291, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.01995683, + "balance_loss_mlp": 1.03585243, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.2777930341142945, + "language_loss": 0.72937357, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75074923, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 13037, + "time_per_iteration": 2.449385404586792 + }, + { + "auxiliary_loss_clip": 0.01097375, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.0175482, + "balance_loss_mlp": 1.03236222, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.4737781125187808, + "language_loss": 0.60029399, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62156355, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 13038, + "time_per_iteration": 2.467207193374634 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01027526, + "balance_loss_clip": 1.01651084, + "balance_loss_mlp": 1.03378868, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.8286159549617163, + "language_loss": 0.68401051, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70526278, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13039, + "time_per_iteration": 2.576012372970581 + }, + { + "auxiliary_loss_clip": 0.01095371, + "auxiliary_loss_mlp": 0.01021071, + "balance_loss_clip": 1.01055706, + "balance_loss_mlp": 1.03193581, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.8627494716028734, + "language_loss": 0.68923277, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71039724, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13040, + "time_per_iteration": 2.5061099529266357 + }, + { + "auxiliary_loss_clip": 0.01100843, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.01966667, + "balance_loss_mlp": 1.03439748, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.5445420563280179, + "language_loss": 0.67223978, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.6935609, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13041, + "time_per_iteration": 2.4612159729003906 + }, + { + "auxiliary_loss_clip": 0.01023594, + "auxiliary_loss_mlp": 0.01001116, + "balance_loss_clip": 1.00019228, + "balance_loss_mlp": 1.00344205, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6599910887916006, + "language_loss": 0.57391232, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59415942, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.20117188, + "step": 13042, + "time_per_iteration": 3.0452370643615723 + }, + { + "auxiliary_loss_clip": 0.0110195, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.01545572, + "balance_loss_mlp": 1.03551662, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.3220034153225235, + "language_loss": 0.83760583, + "learning_rate": 4.688851018730369e-07, + "loss": 0.85889781, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13043, + "time_per_iteration": 2.4752867221832275 + }, + { + "auxiliary_loss_clip": 0.01097987, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.01391542, + "balance_loss_mlp": 1.03412688, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3727755929331091, + "language_loss": 0.88437784, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.905608, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13044, + "time_per_iteration": 2.4991369247436523 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01857805, + "balance_loss_mlp": 1.03586638, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 2.298673788206572, + "language_loss": 0.79098254, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81233072, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 13045, + "time_per_iteration": 2.4472832679748535 + }, + { + "auxiliary_loss_clip": 0.01098057, + "auxiliary_loss_mlp": 0.01027036, + "balance_loss_clip": 1.0161643, + "balance_loss_mlp": 1.03325605, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.3934452663009353, + "language_loss": 0.72286654, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.7441175, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13046, + "time_per_iteration": 2.487778425216675 + }, + { + "auxiliary_loss_clip": 0.01098961, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.0218997, + "balance_loss_mlp": 1.03507853, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.566633263646869, + "language_loss": 0.63192189, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65325058, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 13047, + "time_per_iteration": 2.5349674224853516 + }, + { + "auxiliary_loss_clip": 0.01098768, + "auxiliary_loss_mlp": 0.01024033, + "balance_loss_clip": 1.01313078, + "balance_loss_mlp": 1.03501678, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.5581126874211093, + "language_loss": 0.73077911, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75200713, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 13048, + "time_per_iteration": 2.4880495071411133 + }, + { + "auxiliary_loss_clip": 0.01105114, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01921093, + "balance_loss_mlp": 1.03758121, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.6312152451554587, + "language_loss": 0.74826312, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76962638, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13049, + "time_per_iteration": 2.477346658706665 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01565409, + "balance_loss_mlp": 1.0343411, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.741709533193149, + "language_loss": 0.72563767, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.746952, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 13050, + "time_per_iteration": 2.4637062549591064 + }, + { + "auxiliary_loss_clip": 0.01099539, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.0217284, + "balance_loss_mlp": 1.03509378, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.325466593852248, + "language_loss": 0.73197848, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75330985, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13051, + "time_per_iteration": 2.4410598278045654 + }, + { + "auxiliary_loss_clip": 0.01102687, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.03567302, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.1693731979967965, + "language_loss": 0.72507489, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74643779, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 13052, + "time_per_iteration": 2.580509901046753 + }, + { + "auxiliary_loss_clip": 0.01096936, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.0346005, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 1.7569531144927393, + "language_loss": 0.69126081, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71252745, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.625, + "step": 13053, + "time_per_iteration": 3.805539131164551 + }, + { + "auxiliary_loss_clip": 0.01098051, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.01637769, + "balance_loss_mlp": 1.03426385, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.052215222925797, + "language_loss": 0.70214486, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72339875, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13054, + "time_per_iteration": 2.4813599586486816 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.02268767, + "balance_loss_mlp": 1.03524971, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.891443504325583, + "language_loss": 0.75708246, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77845711, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 13055, + "time_per_iteration": 3.9307680130004883 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.01664162, + "balance_loss_mlp": 1.03725183, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 6.321454082407856, + "language_loss": 0.74865484, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77000654, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 13056, + "time_per_iteration": 4.0152342319488525 + }, + { + "auxiliary_loss_clip": 0.01101822, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.01566076, + "balance_loss_mlp": 1.03655851, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.5631013098906712, + "language_loss": 0.70461977, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72591043, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13057, + "time_per_iteration": 2.5022852420806885 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.01983142, + "balance_loss_mlp": 1.03427744, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 2.087059911869826, + "language_loss": 0.7686438, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78996599, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 13058, + "time_per_iteration": 3.913203716278076 + }, + { + "auxiliary_loss_clip": 0.01101711, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01822972, + "balance_loss_mlp": 1.03652596, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.620822282702505, + "language_loss": 0.70728242, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72859639, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 13059, + "time_per_iteration": 2.4571406841278076 + }, + { + "auxiliary_loss_clip": 0.01104562, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.02604127, + "balance_loss_mlp": 1.03527403, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.7516949433985336, + "language_loss": 0.76551163, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78694499, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 13060, + "time_per_iteration": 2.452622413635254 + }, + { + "auxiliary_loss_clip": 0.0110109, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.01882577, + "balance_loss_mlp": 1.03516376, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.1122245234180923, + "language_loss": 0.77249229, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79381275, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 13061, + "time_per_iteration": 2.4392404556274414 + }, + { + "auxiliary_loss_clip": 0.01100348, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.01744044, + "balance_loss_mlp": 1.03367698, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 2.075148531111265, + "language_loss": 0.73844373, + "learning_rate": 4.641348194799164e-07, + "loss": 0.75974035, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13062, + "time_per_iteration": 2.542872428894043 + }, + { + "auxiliary_loss_clip": 0.01097942, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.01824331, + "balance_loss_mlp": 1.03418064, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.4437360757682784, + "language_loss": 0.68408203, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70535302, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13063, + "time_per_iteration": 2.468757390975952 + }, + { + "auxiliary_loss_clip": 0.01102772, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.02163374, + "balance_loss_mlp": 1.03934288, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 2.216322061173653, + "language_loss": 0.7278775, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74923611, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13064, + "time_per_iteration": 2.629014730453491 + }, + { + "auxiliary_loss_clip": 0.01101508, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.01990271, + "balance_loss_mlp": 1.03406608, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.7428353830367498, + "language_loss": 0.67990673, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70123595, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13065, + "time_per_iteration": 2.440537691116333 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.01917148, + "balance_loss_mlp": 1.03686762, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.9043114354962565, + "language_loss": 0.76035756, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78167951, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13066, + "time_per_iteration": 2.4779815673828125 + }, + { + "auxiliary_loss_clip": 0.01023361, + "auxiliary_loss_mlp": 0.0100262, + "balance_loss_clip": 1.00158274, + "balance_loss_mlp": 1.00318313, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7064057313548338, + "language_loss": 0.53389549, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55415535, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20214844, + "step": 13067, + "time_per_iteration": 3.158377170562744 + }, + { + "auxiliary_loss_clip": 0.01099339, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01784921, + "balance_loss_mlp": 1.03333259, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.7780609677400445, + "language_loss": 0.67590213, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69719583, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66015625, + "step": 13068, + "time_per_iteration": 2.4604732990264893 + }, + { + "auxiliary_loss_clip": 0.01099845, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.01966739, + "balance_loss_mlp": 1.03644729, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.9961392096486945, + "language_loss": 0.67999709, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70130128, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 13069, + "time_per_iteration": 2.470776319503784 + }, + { + "auxiliary_loss_clip": 0.01102413, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.03625858, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.6342789712373722, + "language_loss": 0.76993471, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79126477, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 13070, + "time_per_iteration": 2.4821813106536865 + }, + { + "auxiliary_loss_clip": 0.01097348, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01948428, + "balance_loss_mlp": 1.0329771, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.5497406441787502, + "language_loss": 0.65501463, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67629051, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 13071, + "time_per_iteration": 2.4392311573028564 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.02051842, + "balance_loss_mlp": 1.03337324, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.7465471589650208, + "language_loss": 0.74096799, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76227987, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13072, + "time_per_iteration": 2.4858996868133545 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01730585, + "balance_loss_mlp": 1.03666794, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.0042152052909206, + "language_loss": 0.71074873, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73207319, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13073, + "time_per_iteration": 2.454535961151123 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.0218128, + "balance_loss_mlp": 1.03427434, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.677206170034674, + "language_loss": 0.76719201, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78852749, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 13074, + "time_per_iteration": 2.4688336849212646 + }, + { + "auxiliary_loss_clip": 0.01097672, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01888216, + "balance_loss_mlp": 1.03478217, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.626029190410932, + "language_loss": 0.74981356, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77109224, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 13075, + "time_per_iteration": 2.4762327671051025 + }, + { + "auxiliary_loss_clip": 0.01100533, + "auxiliary_loss_mlp": 0.01024172, + "balance_loss_clip": 1.01356792, + "balance_loss_mlp": 1.03545177, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.7567110977428382, + "language_loss": 0.6898433, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71109033, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65234375, + "step": 13076, + "time_per_iteration": 2.5244879722595215 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.01502383, + "balance_loss_mlp": 1.0347321, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.2025280596790395, + "language_loss": 0.80192757, + "learning_rate": 4.603994445488282e-07, + "loss": 0.8231774, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 13077, + "time_per_iteration": 2.491744041442871 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.01986599, + "balance_loss_mlp": 1.03536844, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.490748661053691, + "language_loss": 0.70515674, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.72648406, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 13078, + "time_per_iteration": 2.555865526199341 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.01984668, + "balance_loss_mlp": 1.0353632, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.565975595125152, + "language_loss": 0.81306797, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83437216, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13079, + "time_per_iteration": 2.49438738822937 + }, + { + "auxiliary_loss_clip": 0.01096305, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01586151, + "balance_loss_mlp": 1.03352332, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.6630658201399222, + "language_loss": 0.68445063, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70568061, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 13080, + "time_per_iteration": 2.5388312339782715 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.01981521, + "balance_loss_mlp": 1.03463578, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.6317908200800284, + "language_loss": 0.69513613, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71644235, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13081, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.01100243, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.02367032, + "balance_loss_mlp": 1.03415251, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.6215934459039671, + "language_loss": 0.68073553, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70208842, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 13082, + "time_per_iteration": 2.47454833984375 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.02276242, + "balance_loss_mlp": 1.03449476, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.8334733344878817, + "language_loss": 0.66071731, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68205309, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 13083, + "time_per_iteration": 2.4937517642974854 + }, + { + "auxiliary_loss_clip": 0.01103443, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01840496, + "balance_loss_mlp": 1.03549075, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.042540926509675, + "language_loss": 0.74778521, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76912796, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 13084, + "time_per_iteration": 2.4672179222106934 + }, + { + "auxiliary_loss_clip": 0.01098876, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.01814008, + "balance_loss_mlp": 1.03493166, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 2.928531982319309, + "language_loss": 0.70411515, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72539198, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13085, + "time_per_iteration": 2.427304267883301 + }, + { + "auxiliary_loss_clip": 0.01103417, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.01565087, + "balance_loss_mlp": 1.03474259, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.8136957772733184, + "language_loss": 0.72376126, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74507606, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 13086, + "time_per_iteration": 2.480523109436035 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.0102774, + "balance_loss_clip": 1.01654005, + "balance_loss_mlp": 1.03270912, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.9014411249537477, + "language_loss": 0.74928933, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77054405, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13087, + "time_per_iteration": 2.469919204711914 + }, + { + "auxiliary_loss_clip": 0.01097848, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.02146316, + "balance_loss_mlp": 1.03391075, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 3.8678035141678913, + "language_loss": 0.71336555, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73466659, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13088, + "time_per_iteration": 2.5597689151763916 + }, + { + "auxiliary_loss_clip": 0.01023649, + "auxiliary_loss_mlp": 0.01006009, + "balance_loss_clip": 1.00502574, + "balance_loss_mlp": 1.00346375, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6844618253743016, + "language_loss": 0.55505019, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57534683, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 13089, + "time_per_iteration": 3.174372673034668 + }, + { + "auxiliary_loss_clip": 0.01023353, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 1.00036299, + "balance_loss_mlp": 1.00316393, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7253731939477448, + "language_loss": 0.49957851, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51982558, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20214844, + "step": 13090, + "time_per_iteration": 3.1464152336120605 + }, + { + "auxiliary_loss_clip": 0.01097486, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.03442216, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 2.0009020702147624, + "language_loss": 0.83693981, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.8581785, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 13091, + "time_per_iteration": 2.5320253372192383 + }, + { + "auxiliary_loss_clip": 0.01023736, + "auxiliary_loss_mlp": 0.01003239, + "balance_loss_clip": 1.00224388, + "balance_loss_mlp": 1.00338745, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7117957030218485, + "language_loss": 0.63994247, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66021222, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 13092, + "time_per_iteration": 3.083390474319458 + }, + { + "auxiliary_loss_clip": 0.01102492, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.01996171, + "balance_loss_mlp": 1.03640008, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 3.478229156670452, + "language_loss": 0.79910231, + "learning_rate": 4.564295240788285e-07, + "loss": 0.82044232, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13093, + "time_per_iteration": 2.4508519172668457 + }, + { + "auxiliary_loss_clip": 0.01097319, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01696348, + "balance_loss_mlp": 1.03387761, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 2.289206273735693, + "language_loss": 0.7536335, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77488482, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13094, + "time_per_iteration": 2.423264980316162 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02510548, + "balance_loss_mlp": 1.03562438, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6408632577371567, + "language_loss": 0.79475707, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81613529, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 13095, + "time_per_iteration": 3.9224746227264404 + }, + { + "auxiliary_loss_clip": 0.01099901, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.01915216, + "balance_loss_mlp": 1.03335738, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.020167585783757, + "language_loss": 0.67747319, + "learning_rate": 4.556868310016715e-07, + "loss": 0.69878036, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13096, + "time_per_iteration": 4.006121635437012 + }, + { + "auxiliary_loss_clip": 0.01093799, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.01535416, + "balance_loss_mlp": 1.03172147, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.5298468077201632, + "language_loss": 0.70352769, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72471642, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.09716797, + "router_z_loss_mlp": 0.625, + "step": 13097, + "time_per_iteration": 4.101962327957153 + }, + { + "auxiliary_loss_clip": 0.01102049, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02018285, + "balance_loss_mlp": 1.0361073, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.6655151068519558, + "language_loss": 0.80427504, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82560897, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13098, + "time_per_iteration": 2.46547532081604 + }, + { + "auxiliary_loss_clip": 0.01098922, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.01834953, + "balance_loss_mlp": 1.03521609, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.645167890556634, + "language_loss": 0.74275064, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76402998, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13099, + "time_per_iteration": 2.485710620880127 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.01486731, + "balance_loss_mlp": 1.03532815, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.60096052488611, + "language_loss": 0.78410721, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80536783, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13100, + "time_per_iteration": 4.035876750946045 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01028363, + "balance_loss_clip": 1.01570272, + "balance_loss_mlp": 1.03587461, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.2959557681189895, + "language_loss": 0.66067588, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68201947, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 13101, + "time_per_iteration": 2.4304044246673584 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.01638031, + "balance_loss_mlp": 1.03366089, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.576742328174997, + "language_loss": 0.7767005, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79796594, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13102, + "time_per_iteration": 2.587104320526123 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02300668, + "balance_loss_mlp": 1.03498983, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 2.03801984289661, + "language_loss": 0.82200575, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84333879, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 13103, + "time_per_iteration": 2.4504380226135254 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02108872, + "balance_loss_mlp": 1.03538537, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 1.9382935639553287, + "language_loss": 0.80800354, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82936251, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13104, + "time_per_iteration": 2.4761226177215576 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03486192, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.5580110951181336, + "language_loss": 0.74400711, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76534271, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13105, + "time_per_iteration": 2.458893060684204 + }, + { + "auxiliary_loss_clip": 0.01102329, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.0220865, + "balance_loss_mlp": 1.03494358, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.6914912151610795, + "language_loss": 0.75718057, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77854228, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 13106, + "time_per_iteration": 2.4740750789642334 + }, + { + "auxiliary_loss_clip": 0.01101506, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.02129924, + "balance_loss_mlp": 1.03471053, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.2970900789620767, + "language_loss": 0.73269242, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75403154, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 13107, + "time_per_iteration": 2.465049982070923 + }, + { + "auxiliary_loss_clip": 0.01098914, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.02140641, + "balance_loss_mlp": 1.03424203, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.8872299288056482, + "language_loss": 0.73182052, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75314188, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13108, + "time_per_iteration": 2.4711079597473145 + }, + { + "auxiliary_loss_clip": 0.01023267, + "auxiliary_loss_mlp": 0.01002041, + "balance_loss_clip": 1.00098598, + "balance_loss_mlp": 1.00313234, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 1.661709536618796, + "language_loss": 0.60381085, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62406397, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20117188, + "step": 13109, + "time_per_iteration": 3.0089924335479736 + }, + { + "auxiliary_loss_clip": 0.01097142, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.01739979, + "balance_loss_mlp": 1.03491497, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.5824275736461375, + "language_loss": 0.71883583, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.7400893, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.62109375, + "step": 13110, + "time_per_iteration": 2.465576171875 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.01631093, + "balance_loss_mlp": 1.03485966, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3860317758339384, + "language_loss": 0.75074577, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77199543, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 13111, + "time_per_iteration": 2.4993157386779785 + }, + { + "auxiliary_loss_clip": 0.01098161, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.02001154, + "balance_loss_mlp": 1.03351355, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 1.94564104551391, + "language_loss": 0.61333418, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63463187, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13112, + "time_per_iteration": 2.43581485748291 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01515722, + "balance_loss_mlp": 1.0338614, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.7382192958818077, + "language_loss": 0.67246455, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69373184, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13113, + "time_per_iteration": 2.4511425495147705 + }, + { + "auxiliary_loss_clip": 0.01098431, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.0168004, + "balance_loss_mlp": 1.03448272, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 12.027787303417453, + "language_loss": 0.58199584, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60325825, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13114, + "time_per_iteration": 2.3941895961761475 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.01636708, + "balance_loss_mlp": 1.03487444, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.6461122480026786, + "language_loss": 0.66887224, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69017321, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 13115, + "time_per_iteration": 2.4768731594085693 + }, + { + "auxiliary_loss_clip": 0.01099861, + "auxiliary_loss_mlp": 0.01025915, + "balance_loss_clip": 1.01442361, + "balance_loss_mlp": 1.03510892, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.909649629635062, + "language_loss": 0.8859247, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90718246, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13116, + "time_per_iteration": 2.4047675132751465 + }, + { + "auxiliary_loss_clip": 0.01106955, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.01909757, + "balance_loss_mlp": 1.0356214, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 1.7003920490690876, + "language_loss": 0.72708535, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74847412, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 13117, + "time_per_iteration": 2.4341704845428467 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_clip": 1.01247823, + "balance_loss_mlp": 1.03483558, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.4826682639516906, + "language_loss": 0.79875678, + "learning_rate": 4.502565355654926e-07, + "loss": 0.81996524, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 13118, + "time_per_iteration": 2.394805431365967 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.01643777, + "balance_loss_mlp": 1.03507447, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.7945164673922278, + "language_loss": 0.73091543, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75218379, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13119, + "time_per_iteration": 2.460057258605957 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01023623, + "balance_loss_clip": 1.01194072, + "balance_loss_mlp": 1.03509176, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.261657596478895, + "language_loss": 0.71529341, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.73654413, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13120, + "time_per_iteration": 2.455064058303833 + }, + { + "auxiliary_loss_clip": 0.01100545, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01978469, + "balance_loss_mlp": 1.03517127, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.4332103532117941, + "language_loss": 0.78814548, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.8094641, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13121, + "time_per_iteration": 2.599400281906128 + }, + { + "auxiliary_loss_clip": 0.01098409, + "auxiliary_loss_mlp": 0.01026067, + "balance_loss_clip": 1.01478994, + "balance_loss_mlp": 1.03450656, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3967660183368626, + "language_loss": 0.80094564, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.8221904, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13122, + "time_per_iteration": 2.4992713928222656 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01022634, + "balance_loss_clip": 1.01157165, + "balance_loss_mlp": 1.03481627, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 2.145985677381676, + "language_loss": 0.77920961, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80044621, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13123, + "time_per_iteration": 2.4735960960388184 + }, + { + "auxiliary_loss_clip": 0.01104198, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.01790643, + "balance_loss_mlp": 1.0362196, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.856299947344871, + "language_loss": 0.6726073, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69394577, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13124, + "time_per_iteration": 2.4079813957214355 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01025655, + "balance_loss_clip": 1.01391327, + "balance_loss_mlp": 1.03546476, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 2.605711353354914, + "language_loss": 0.72957736, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75086713, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13125, + "time_per_iteration": 2.5052480697631836 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.0155673, + "balance_loss_mlp": 1.03235054, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 2.154516730399549, + "language_loss": 0.72528452, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74655998, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13126, + "time_per_iteration": 2.4527993202209473 + }, + { + "auxiliary_loss_clip": 0.01102896, + "auxiliary_loss_mlp": 0.01026431, + "balance_loss_clip": 1.01474881, + "balance_loss_mlp": 1.03575099, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 2.0791406277804567, + "language_loss": 0.76886559, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79015887, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 13127, + "time_per_iteration": 2.4405977725982666 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.0155077, + "balance_loss_mlp": 1.03493667, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.7461753139665992, + "language_loss": 0.85763645, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87888473, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 13128, + "time_per_iteration": 2.474844455718994 + }, + { + "auxiliary_loss_clip": 0.01098818, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.02045906, + "balance_loss_mlp": 1.03425086, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.0767433694769175, + "language_loss": 0.68800604, + "learning_rate": 4.475520477290904e-07, + "loss": 0.70930469, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 13129, + "time_per_iteration": 2.5359485149383545 + }, + { + "auxiliary_loss_clip": 0.01022991, + "auxiliary_loss_mlp": 0.01001965, + "balance_loss_clip": 1.00090396, + "balance_loss_mlp": 1.00285482, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7130558400515205, + "language_loss": 0.61589611, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63614571, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 13130, + "time_per_iteration": 3.0489916801452637 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01027204, + "balance_loss_clip": 1.01583779, + "balance_loss_mlp": 1.03717756, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.6182422451860332, + "language_loss": 0.73774695, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.7590515, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13131, + "time_per_iteration": 2.475581169128418 + }, + { + "auxiliary_loss_clip": 0.01108004, + "auxiliary_loss_mlp": 0.01025009, + "balance_loss_clip": 1.01216388, + "balance_loss_mlp": 1.034796, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.199372765286003, + "language_loss": 0.68987596, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71120608, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 13132, + "time_per_iteration": 2.4295406341552734 + }, + { + "auxiliary_loss_clip": 0.01102436, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02129269, + "balance_loss_mlp": 1.03545117, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.1121460507768406, + "language_loss": 0.62110436, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64246017, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13133, + "time_per_iteration": 2.483172655105591 + }, + { + "auxiliary_loss_clip": 0.01105396, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.02418268, + "balance_loss_mlp": 1.03652048, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.3671306381438817, + "language_loss": 0.79635763, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81778735, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 13134, + "time_per_iteration": 2.4047813415527344 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01651824, + "balance_loss_mlp": 1.03489256, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.729726812184161, + "language_loss": 0.79917061, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82045782, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 13135, + "time_per_iteration": 2.4462645053863525 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.01702976, + "balance_loss_mlp": 1.03459549, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.7066786377957706, + "language_loss": 0.72467506, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74595881, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13136, + "time_per_iteration": 3.8541600704193115 + }, + { + "auxiliary_loss_clip": 0.01107278, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.02079642, + "balance_loss_mlp": 1.03614569, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 1.8157038606560463, + "language_loss": 0.70418733, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72559059, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 13137, + "time_per_iteration": 2.451396942138672 + }, + { + "auxiliary_loss_clip": 0.01099987, + "auxiliary_loss_mlp": 0.01033834, + "balance_loss_clip": 1.02095389, + "balance_loss_mlp": 1.03527665, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.748688408488967, + "language_loss": 0.74126804, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.7626062, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6484375, + "step": 13138, + "time_per_iteration": 3.8486387729644775 + }, + { + "auxiliary_loss_clip": 0.01099719, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01726758, + "balance_loss_mlp": 1.03461611, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.0347678051570046, + "language_loss": 0.68777812, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70906031, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13139, + "time_per_iteration": 3.8628947734832764 + }, + { + "auxiliary_loss_clip": 0.01022998, + "auxiliary_loss_mlp": 0.01001993, + "balance_loss_clip": 1.00090218, + "balance_loss_mlp": 1.00276661, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8639772352746394, + "language_loss": 0.60299456, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62324452, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20214844, + "step": 13140, + "time_per_iteration": 3.12382435798645 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.01973701, + "balance_loss_mlp": 1.03487992, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.6042755472834633, + "language_loss": 0.7596916, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78102267, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13141, + "time_per_iteration": 2.5595388412475586 + }, + { + "auxiliary_loss_clip": 0.0110272, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.02109766, + "balance_loss_mlp": 1.0354681, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 2.061867815111243, + "language_loss": 0.68504715, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70640367, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13142, + "time_per_iteration": 3.9543938636779785 + }, + { + "auxiliary_loss_clip": 0.0102319, + "auxiliary_loss_mlp": 0.01000022, + "balance_loss_clip": 0.99900836, + "balance_loss_mlp": 1.00304079, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8198553177673825, + "language_loss": 0.60004789, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62028003, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20117188, + "step": 13143, + "time_per_iteration": 2.863976001739502 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.01715136, + "balance_loss_mlp": 1.03638124, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.589507938557268, + "language_loss": 0.74556917, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76690018, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13144, + "time_per_iteration": 2.5839059352874756 + }, + { + "auxiliary_loss_clip": 0.01103839, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.03596044, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.7274125688221094, + "language_loss": 0.83230376, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85366857, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13145, + "time_per_iteration": 2.468961000442505 + }, + { + "auxiliary_loss_clip": 0.01094904, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.01889074, + "balance_loss_mlp": 1.03252196, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.7706663213688858, + "language_loss": 0.72783786, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.74908125, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 13146, + "time_per_iteration": 2.483905076980591 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03308654, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.9329251437189798, + "language_loss": 0.75868392, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77997577, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6796875, + "step": 13147, + "time_per_iteration": 2.444445848464966 + }, + { + "auxiliary_loss_clip": 0.01098948, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.02420568, + "balance_loss_mlp": 1.03389215, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.8432803916429288, + "language_loss": 0.71830833, + "learning_rate": 4.428974443697087e-07, + "loss": 0.7396633, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 13148, + "time_per_iteration": 2.4763596057891846 + }, + { + "auxiliary_loss_clip": 0.01099876, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.0174942, + "balance_loss_mlp": 1.03280914, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 2.2200316445748536, + "language_loss": 0.71857107, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73986256, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 13149, + "time_per_iteration": 2.5340046882629395 + }, + { + "auxiliary_loss_clip": 0.01103652, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01735091, + "balance_loss_mlp": 1.03590095, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.727710817995862, + "language_loss": 0.65459621, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67593414, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 13150, + "time_per_iteration": 2.4871621131896973 + }, + { + "auxiliary_loss_clip": 0.01098617, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01808667, + "balance_loss_mlp": 1.03340101, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 2.2316729864145035, + "language_loss": 0.69869459, + "learning_rate": 4.421644538650231e-07, + "loss": 0.71997708, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13151, + "time_per_iteration": 2.429283857345581 + }, + { + "auxiliary_loss_clip": 0.01102592, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02360559, + "balance_loss_mlp": 1.03463364, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.3770531341531196, + "language_loss": 0.70089221, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72227693, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13152, + "time_per_iteration": 2.6216795444488525 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.01488543, + "balance_loss_mlp": 1.03391027, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.7997753431488441, + "language_loss": 0.72821844, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74947822, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13153, + "time_per_iteration": 2.432175636291504 + }, + { + "auxiliary_loss_clip": 0.01098332, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.01356864, + "balance_loss_mlp": 1.03256023, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.8282420637025174, + "language_loss": 0.78883809, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81006831, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13154, + "time_per_iteration": 2.466029167175293 + }, + { + "auxiliary_loss_clip": 0.01105447, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01705313, + "balance_loss_mlp": 1.03479743, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 1.8238344908904138, + "language_loss": 0.70285016, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72420764, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 13155, + "time_per_iteration": 2.446547746658325 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01582754, + "balance_loss_mlp": 1.03381193, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 2.6081718094801003, + "language_loss": 0.7679953, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78926998, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 13156, + "time_per_iteration": 2.475921154022217 + }, + { + "auxiliary_loss_clip": 0.01099115, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01664567, + "balance_loss_mlp": 1.03353715, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.636282955731654, + "language_loss": 0.65013611, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67141205, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13157, + "time_per_iteration": 2.504150867462158 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.01937079, + "balance_loss_mlp": 1.03502417, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.6940743634270539, + "language_loss": 0.73872387, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76006109, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 13158, + "time_per_iteration": 2.4976253509521484 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.018332, + "balance_loss_mlp": 1.03334785, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 1.9043976356667784, + "language_loss": 0.6686908, + "learning_rate": 4.40212412422309e-07, + "loss": 0.68994868, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 13159, + "time_per_iteration": 2.4071156978607178 + }, + { + "auxiliary_loss_clip": 0.01098959, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.01733327, + "balance_loss_mlp": 1.03454971, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.8560150384461531, + "language_loss": 0.67281532, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69408834, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13160, + "time_per_iteration": 2.4779374599456787 + }, + { + "auxiliary_loss_clip": 0.01092943, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.01608515, + "balance_loss_mlp": 1.03147316, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 1.960219382824367, + "language_loss": 0.72932816, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75051731, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.61328125, + "step": 13161, + "time_per_iteration": 2.393747091293335 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01512957, + "balance_loss_mlp": 1.03505635, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 2.030740934223021, + "language_loss": 0.73477876, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75603908, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13162, + "time_per_iteration": 2.476824998855591 + }, + { + "auxiliary_loss_clip": 0.01102454, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01868761, + "balance_loss_mlp": 1.03506005, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.6298606745864626, + "language_loss": 0.72000325, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74132919, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13163, + "time_per_iteration": 2.4319183826446533 + }, + { + "auxiliary_loss_clip": 0.01101866, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.01945353, + "balance_loss_mlp": 1.03639102, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.9265161616003688, + "language_loss": 0.69604623, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.71738136, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13164, + "time_per_iteration": 2.4604907035827637 + }, + { + "auxiliary_loss_clip": 0.01098403, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.01869857, + "balance_loss_mlp": 1.03331554, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.9521377640863393, + "language_loss": 0.66389132, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68517423, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13165, + "time_per_iteration": 2.4393765926361084 + }, + { + "auxiliary_loss_clip": 0.01093623, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.01379871, + "balance_loss_mlp": 1.03140879, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.870206675725358, + "language_loss": 0.72397065, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74515086, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62109375, + "step": 13166, + "time_per_iteration": 2.42858624458313 + }, + { + "auxiliary_loss_clip": 0.01098429, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 1.03284669, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.6243880882538562, + "language_loss": 0.77239472, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79367137, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 13167, + "time_per_iteration": 2.4857194423675537 + }, + { + "auxiliary_loss_clip": 0.0109987, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0192678, + "balance_loss_mlp": 1.03484404, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6932683776062956, + "language_loss": 0.84575874, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86705655, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13168, + "time_per_iteration": 2.5257365703582764 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.01633573, + "balance_loss_mlp": 1.03501356, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.7075722391650643, + "language_loss": 0.72710097, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74838775, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 13169, + "time_per_iteration": 2.4436428546905518 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.03484845, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.8243232954035, + "language_loss": 0.67037463, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69172621, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13170, + "time_per_iteration": 2.624098777770996 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.01022631, + "balance_loss_clip": 1.01188445, + "balance_loss_mlp": 1.03370655, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 2.145643776900154, + "language_loss": 0.70821196, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72943711, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 13171, + "time_per_iteration": 2.4759225845336914 + }, + { + "auxiliary_loss_clip": 0.01097813, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.01555896, + "balance_loss_mlp": 1.03287041, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.7808114898510692, + "language_loss": 0.66749847, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68874633, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13172, + "time_per_iteration": 2.5700619220733643 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.01951575, + "balance_loss_mlp": 1.03532124, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.881471397827846, + "language_loss": 0.79114199, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81245905, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13173, + "time_per_iteration": 2.4757769107818604 + }, + { + "auxiliary_loss_clip": 0.01098601, + "auxiliary_loss_mlp": 0.01026298, + "balance_loss_clip": 1.01604629, + "balance_loss_mlp": 1.03356767, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8257169099577297, + "language_loss": 0.7678805, + "learning_rate": 4.365625413419365e-07, + "loss": 0.7891295, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 13174, + "time_per_iteration": 2.478116989135742 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.02046227, + "balance_loss_mlp": 1.03321493, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.6179511988960908, + "language_loss": 0.71719491, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73848224, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13175, + "time_per_iteration": 2.528700590133667 + }, + { + "auxiliary_loss_clip": 0.01099648, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.01570737, + "balance_loss_mlp": 1.03435004, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.9378539521552467, + "language_loss": 0.59763598, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61890721, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 13176, + "time_per_iteration": 2.4653594493865967 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01922417, + "balance_loss_mlp": 1.03675985, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.8690026492537037, + "language_loss": 0.73695058, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75826943, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 13177, + "time_per_iteration": 2.439019203186035 + }, + { + "auxiliary_loss_clip": 0.01098632, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.02159739, + "balance_loss_mlp": 1.0353229, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 2.5597015980871656, + "language_loss": 0.63997006, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66128141, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13178, + "time_per_iteration": 3.8768224716186523 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.0197531, + "balance_loss_mlp": 1.03276765, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.4086658766608762, + "language_loss": 0.68400067, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70528185, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65625, + "step": 13179, + "time_per_iteration": 2.5326123237609863 + }, + { + "auxiliary_loss_clip": 0.01099366, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.03448081, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 3.8313461513968408, + "language_loss": 0.74134624, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76263011, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 13180, + "time_per_iteration": 3.892685651779175 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.02059281, + "balance_loss_mlp": 1.03637862, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.6414763504058936, + "language_loss": 0.81435031, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.8357203, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13181, + "time_per_iteration": 3.8623433113098145 + }, + { + "auxiliary_loss_clip": 0.01098541, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.03413761, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.7875723421609098, + "language_loss": 0.77434945, + "learning_rate": 4.346213957372895e-07, + "loss": 0.7956599, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 13182, + "time_per_iteration": 2.4663844108581543 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.01979494, + "balance_loss_mlp": 1.03470898, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 2.7996855635820013, + "language_loss": 0.74354494, + "learning_rate": 4.34379019557056e-07, + "loss": 0.7649107, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 13183, + "time_per_iteration": 2.490994930267334 + }, + { + "auxiliary_loss_clip": 0.0109888, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.01273239, + "balance_loss_mlp": 1.03439891, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.6595627925509142, + "language_loss": 0.68164527, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70288026, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 13184, + "time_per_iteration": 4.066596508026123 + }, + { + "auxiliary_loss_clip": 0.01102689, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.01719308, + "balance_loss_mlp": 1.03515947, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.6953007662822652, + "language_loss": 0.70649928, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72781253, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 13185, + "time_per_iteration": 2.5168697834014893 + }, + { + "auxiliary_loss_clip": 0.01101927, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.01530719, + "balance_loss_mlp": 1.03461063, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.0010064491427335, + "language_loss": 0.65568876, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67698145, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13186, + "time_per_iteration": 2.4313526153564453 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.01786041, + "balance_loss_mlp": 1.03488398, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 2.458790452958655, + "language_loss": 0.76782525, + "learning_rate": 4.334101086130408e-07, + "loss": 0.78911316, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 13187, + "time_per_iteration": 2.4705545902252197 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.01525056, + "balance_loss_mlp": 1.03388309, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.052216881515836, + "language_loss": 0.72776371, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74900717, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13188, + "time_per_iteration": 2.473217010498047 + }, + { + "auxiliary_loss_clip": 0.01102244, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.02387083, + "balance_loss_mlp": 1.03462553, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.027455817824797, + "language_loss": 0.62665582, + "learning_rate": 4.329260095357725e-07, + "loss": 0.64804399, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13189, + "time_per_iteration": 2.442365884780884 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01028169, + "balance_loss_clip": 1.01705313, + "balance_loss_mlp": 1.03361034, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.5304062276018793, + "language_loss": 0.72505867, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74632961, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13190, + "time_per_iteration": 2.5228397846221924 + }, + { + "auxiliary_loss_clip": 0.01096381, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.03499353, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.8037952214110713, + "language_loss": 0.73300159, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75423628, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6171875, + "step": 13191, + "time_per_iteration": 2.5402090549468994 + }, + { + "auxiliary_loss_clip": 0.01099659, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.02452767, + "balance_loss_mlp": 1.03368807, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.9478523410400206, + "language_loss": 0.69033474, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71169555, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 13192, + "time_per_iteration": 2.488039970397949 + }, + { + "auxiliary_loss_clip": 0.01100062, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.02173603, + "balance_loss_mlp": 1.03413558, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.5635403333357274, + "language_loss": 0.75213289, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77346253, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13193, + "time_per_iteration": 2.464966297149658 + }, + { + "auxiliary_loss_clip": 0.01100043, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.02102232, + "balance_loss_mlp": 1.03474998, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.8781856147923044, + "language_loss": 0.72225535, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74359465, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.65234375, + "step": 13194, + "time_per_iteration": 2.55106520652771 + }, + { + "auxiliary_loss_clip": 0.01104878, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01911819, + "balance_loss_mlp": 1.03578103, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 1.9952958516123638, + "language_loss": 0.69781977, + "learning_rate": 4.314751387639517e-07, + "loss": 0.71917635, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69140625, + "step": 13195, + "time_per_iteration": 2.4327144622802734 + }, + { + "auxiliary_loss_clip": 0.01100264, + "auxiliary_loss_mlp": 0.01025694, + "balance_loss_clip": 1.0142858, + "balance_loss_mlp": 1.03533435, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.5235615459382654, + "language_loss": 0.77706164, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79832125, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13196, + "time_per_iteration": 2.4901678562164307 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.02258193, + "balance_loss_mlp": 1.03656614, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.803068943631605, + "language_loss": 0.68970078, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71107984, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 13197, + "time_per_iteration": 2.5378594398498535 + }, + { + "auxiliary_loss_clip": 0.01098819, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.01860309, + "balance_loss_mlp": 1.03417861, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.7643596771229297, + "language_loss": 0.64804506, + "learning_rate": 4.30750506215646e-07, + "loss": 0.66933215, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13198, + "time_per_iteration": 2.534534215927124 + }, + { + "auxiliary_loss_clip": 0.01103865, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01787245, + "balance_loss_mlp": 1.03533065, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.0561177660493453, + "language_loss": 0.72203559, + "learning_rate": 4.30509081032864e-07, + "loss": 0.743379, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 13199, + "time_per_iteration": 2.409954071044922 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01794064, + "balance_loss_mlp": 1.03514385, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 2.5680157152450933, + "language_loss": 0.80811197, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82940185, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 13200, + "time_per_iteration": 2.4604108333587646 + }, + { + "auxiliary_loss_clip": 0.01098579, + "auxiliary_loss_mlp": 0.01028539, + "balance_loss_clip": 1.01745248, + "balance_loss_mlp": 1.0353868, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.627584700503655, + "language_loss": 0.77191329, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.7931844, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 13201, + "time_per_iteration": 2.428744077682495 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.01721644, + "balance_loss_mlp": 1.03374922, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.4615967760668465, + "language_loss": 0.67071187, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69197702, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13202, + "time_per_iteration": 2.4896771907806396 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.02063727, + "balance_loss_mlp": 1.03468037, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.816192931663621, + "language_loss": 0.74804997, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.7693783, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13203, + "time_per_iteration": 2.451380729675293 + }, + { + "auxiliary_loss_clip": 0.01100879, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01872849, + "balance_loss_mlp": 1.03399134, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 2.0709813366174807, + "language_loss": 0.6622262, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68353653, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13204, + "time_per_iteration": 2.4800636768341064 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.01835084, + "balance_loss_mlp": 1.03296351, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3281882721679232, + "language_loss": 0.7925297, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81380415, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 13205, + "time_per_iteration": 2.4787778854370117 + }, + { + "auxiliary_loss_clip": 0.01096536, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.01493573, + "balance_loss_mlp": 1.03291297, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.7942191439670012, + "language_loss": 0.77874231, + "learning_rate": 4.28820771692858e-07, + "loss": 0.7999717, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13206, + "time_per_iteration": 2.499706983566284 + }, + { + "auxiliary_loss_clip": 0.01104173, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.0172863, + "balance_loss_mlp": 1.03587031, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 1.8397672987802902, + "language_loss": 0.79237318, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81370986, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 13207, + "time_per_iteration": 2.4636006355285645 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.02048755, + "balance_loss_mlp": 1.03468966, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.9530235791320048, + "language_loss": 0.84002006, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86133885, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13208, + "time_per_iteration": 2.5083847045898438 + }, + { + "auxiliary_loss_clip": 0.01022967, + "auxiliary_loss_mlp": 0.00999733, + "balance_loss_clip": 0.99883288, + "balance_loss_mlp": 1.00283718, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7192767006915639, + "language_loss": 0.58359563, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60382259, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20117188, + "step": 13209, + "time_per_iteration": 3.1166725158691406 + }, + { + "auxiliary_loss_clip": 0.01105651, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01602447, + "balance_loss_mlp": 1.03716731, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 6.276461119543849, + "language_loss": 0.62636811, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.64770591, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 13210, + "time_per_iteration": 2.5011911392211914 + }, + { + "auxiliary_loss_clip": 0.01100308, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01959693, + "balance_loss_mlp": 1.03402996, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5861692556571285, + "language_loss": 0.68948948, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71079856, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 13211, + "time_per_iteration": 2.5030434131622314 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.02349782, + "balance_loss_mlp": 1.03480554, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.5459525414339919, + "language_loss": 0.72359824, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.7449888, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 13212, + "time_per_iteration": 2.513190984725952 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.01388764, + "balance_loss_mlp": 1.03500962, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.663831013986619, + "language_loss": 0.80758727, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82881892, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13213, + "time_per_iteration": 2.4620864391326904 + }, + { + "auxiliary_loss_clip": 0.01106094, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01679885, + "balance_loss_mlp": 1.03663135, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.2825802582203476, + "language_loss": 0.68319535, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70455045, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 13214, + "time_per_iteration": 2.4502992630004883 + }, + { + "auxiliary_loss_clip": 0.0109771, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01793051, + "balance_loss_mlp": 1.03391325, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.8169772357963099, + "language_loss": 0.72712231, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74838698, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 13215, + "time_per_iteration": 2.4472222328186035 + }, + { + "auxiliary_loss_clip": 0.01101234, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.01751041, + "balance_loss_mlp": 1.03695011, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5004674854133762, + "language_loss": 0.78918624, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.81048369, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13216, + "time_per_iteration": 2.5075128078460693 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.01874018, + "balance_loss_mlp": 1.03463197, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.6163941804337032, + "language_loss": 0.73908085, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76038563, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 13217, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01097689, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.02317882, + "balance_loss_mlp": 1.03437877, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.7536489091121308, + "language_loss": 0.74128562, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76260298, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 13218, + "time_per_iteration": 2.4378395080566406 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02162278, + "balance_loss_mlp": 1.03341603, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.8944799290168057, + "language_loss": 0.83180892, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85316575, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13219, + "time_per_iteration": 2.4046013355255127 + }, + { + "auxiliary_loss_clip": 0.01104407, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.02093291, + "balance_loss_mlp": 1.03578758, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.8955600034556859, + "language_loss": 0.7588414, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.78022164, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 13220, + "time_per_iteration": 3.8154995441436768 + }, + { + "auxiliary_loss_clip": 0.01104021, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.02123189, + "balance_loss_mlp": 1.03558075, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.997206331366737, + "language_loss": 0.72682828, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74820095, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 13221, + "time_per_iteration": 4.05722188949585 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.01026356, + "balance_loss_clip": 1.01540709, + "balance_loss_mlp": 1.03402424, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 1.8234441442382394, + "language_loss": 0.7454437, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76667982, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13222, + "time_per_iteration": 3.82381534576416 + }, + { + "auxiliary_loss_clip": 0.01023305, + "auxiliary_loss_mlp": 0.01001588, + "balance_loss_clip": 1.00058103, + "balance_loss_mlp": 1.00324297, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7682356775746639, + "language_loss": 0.67054129, + "learning_rate": 4.247327522443993e-07, + "loss": 0.6907903, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 13223, + "time_per_iteration": 2.910489797592163 + }, + { + "auxiliary_loss_clip": 0.01098555, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01635742, + "balance_loss_mlp": 1.03264594, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.726476210042691, + "language_loss": 0.7146225, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73589438, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13224, + "time_per_iteration": 2.4960734844207764 + }, + { + "auxiliary_loss_clip": 0.01023049, + "auxiliary_loss_mlp": 0.00999614, + "balance_loss_clip": 0.9985711, + "balance_loss_mlp": 1.00314784, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6952095607048513, + "language_loss": 0.55011863, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57034522, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.19921875, + "step": 13225, + "time_per_iteration": 4.49747109413147 + }, + { + "auxiliary_loss_clip": 0.01097582, + "auxiliary_loss_mlp": 0.01023614, + "balance_loss_clip": 1.01296818, + "balance_loss_mlp": 1.03373742, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 1.932116603626369, + "language_loss": 0.64920199, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67041391, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13226, + "time_per_iteration": 2.492919921875 + }, + { + "auxiliary_loss_clip": 0.01102867, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02517581, + "balance_loss_mlp": 1.03584349, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 3.097889886505811, + "language_loss": 0.70084739, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72224164, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 13227, + "time_per_iteration": 2.55519700050354 + }, + { + "auxiliary_loss_clip": 0.01096905, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.01701736, + "balance_loss_mlp": 1.03388405, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.6685312506793168, + "language_loss": 0.69431317, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71555269, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62890625, + "step": 13228, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.02095747, + "balance_loss_mlp": 1.03423619, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.445985556067254, + "language_loss": 0.70922631, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73056132, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 13229, + "time_per_iteration": 2.6479508876800537 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.01903689, + "balance_loss_mlp": 1.03793633, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.7589665184565293, + "language_loss": 0.71889889, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74027318, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 13230, + "time_per_iteration": 2.5062637329101562 + }, + { + "auxiliary_loss_clip": 0.01022715, + "auxiliary_loss_mlp": 0.01000194, + "balance_loss_clip": 0.9992041, + "balance_loss_mlp": 1.00273073, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8959170552781407, + "language_loss": 0.63557678, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65580589, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 13231, + "time_per_iteration": 3.082951784133911 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.01489472, + "balance_loss_mlp": 1.03479195, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.513210283199707, + "language_loss": 0.69656473, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.71782291, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 13232, + "time_per_iteration": 2.440912961959839 + }, + { + "auxiliary_loss_clip": 0.01099299, + "auxiliary_loss_mlp": 0.01027268, + "balance_loss_clip": 1.01572859, + "balance_loss_mlp": 1.03366399, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.5695652916232832, + "language_loss": 0.77775937, + "learning_rate": 4.223360961792952e-07, + "loss": 0.79902506, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13233, + "time_per_iteration": 2.5125248432159424 + }, + { + "auxiliary_loss_clip": 0.01100048, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.01780808, + "balance_loss_mlp": 1.03384972, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 2.4242376989153183, + "language_loss": 0.78652054, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80781317, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 13234, + "time_per_iteration": 2.468038558959961 + }, + { + "auxiliary_loss_clip": 0.0109831, + "auxiliary_loss_mlp": 0.01023983, + "balance_loss_clip": 1.01374316, + "balance_loss_mlp": 1.03441608, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.5905892668664205, + "language_loss": 0.70050478, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72172773, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 13235, + "time_per_iteration": 2.493274450302124 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.01598716, + "balance_loss_mlp": 1.03456783, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.4327288828899616, + "language_loss": 0.6766414, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.69792509, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13236, + "time_per_iteration": 2.484380006790161 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01026353, + "balance_loss_clip": 1.01505828, + "balance_loss_mlp": 1.03377748, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.7411950179861415, + "language_loss": 0.75172085, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77296317, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13237, + "time_per_iteration": 2.4766552448272705 + }, + { + "auxiliary_loss_clip": 0.01101102, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01983571, + "balance_loss_mlp": 1.03548527, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 1.9189361259680966, + "language_loss": 0.71440208, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73573601, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 13238, + "time_per_iteration": 2.456925392150879 + }, + { + "auxiliary_loss_clip": 0.01099911, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01485622, + "balance_loss_mlp": 1.0334146, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.8980651664510928, + "language_loss": 0.73918056, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.76044405, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13239, + "time_per_iteration": 2.443584680557251 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.02093184, + "balance_loss_mlp": 1.03495479, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.6947466268706028, + "language_loss": 0.69046456, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71182698, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 13240, + "time_per_iteration": 2.4764912128448486 + }, + { + "auxiliary_loss_clip": 0.01022946, + "auxiliary_loss_mlp": 0.00999029, + "balance_loss_clip": 0.99800378, + "balance_loss_mlp": 1.00302553, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8878157964624488, + "language_loss": 0.58645731, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60667706, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.19921875, + "step": 13241, + "time_per_iteration": 2.843022108078003 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.01764321, + "balance_loss_mlp": 1.03436446, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.8253771110110306, + "language_loss": 0.64276886, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66404617, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13242, + "time_per_iteration": 2.689807653427124 + }, + { + "auxiliary_loss_clip": 0.01100947, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01986313, + "balance_loss_mlp": 1.03429365, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 2.1932509816632235, + "language_loss": 0.75971556, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78104436, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 13243, + "time_per_iteration": 2.48710298538208 + }, + { + "auxiliary_loss_clip": 0.01101282, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.01662683, + "balance_loss_mlp": 1.03448629, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.748658102628615, + "language_loss": 0.7998516, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.8211475, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 13244, + "time_per_iteration": 2.475694179534912 + }, + { + "auxiliary_loss_clip": 0.01103079, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.01651323, + "balance_loss_mlp": 1.03422666, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 1.9995558497633756, + "language_loss": 0.67953658, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70085227, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 13245, + "time_per_iteration": 2.4532089233398438 + }, + { + "auxiliary_loss_clip": 0.01101276, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.02030444, + "balance_loss_mlp": 1.03515017, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.3612442472486292, + "language_loss": 0.78971922, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81105065, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13246, + "time_per_iteration": 2.440587282180786 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.0149411, + "balance_loss_mlp": 1.0336647, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.6847390016039745, + "language_loss": 0.66190958, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68319428, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13247, + "time_per_iteration": 2.487718343734741 + }, + { + "auxiliary_loss_clip": 0.01097373, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.01829112, + "balance_loss_mlp": 1.03314734, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.8639636552336234, + "language_loss": 0.71457285, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73583645, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 13248, + "time_per_iteration": 2.474893093109131 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01695275, + "balance_loss_mlp": 1.03439724, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.0200427415060416, + "language_loss": 0.7616542, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78296602, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13249, + "time_per_iteration": 2.48595929145813 + }, + { + "auxiliary_loss_clip": 0.01099446, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01975679, + "balance_loss_mlp": 1.03507221, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.126182284443467, + "language_loss": 0.61335742, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63466233, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13250, + "time_per_iteration": 2.4277217388153076 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01375568, + "balance_loss_mlp": 1.03543615, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.322541545659239, + "language_loss": 0.72526091, + "learning_rate": 4.180371972938206e-07, + "loss": 0.7465288, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 13251, + "time_per_iteration": 2.4575724601745605 + }, + { + "auxiliary_loss_clip": 0.01103859, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.0152247, + "balance_loss_mlp": 1.03521776, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.8469414013396577, + "language_loss": 0.72915018, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75046682, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 13252, + "time_per_iteration": 2.4559550285339355 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.01549911, + "balance_loss_mlp": 1.03332281, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.6873706589511155, + "language_loss": 0.66239917, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68363321, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13253, + "time_per_iteration": 2.538630962371826 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.02353644, + "balance_loss_mlp": 1.03758895, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.5649254820848235, + "language_loss": 0.67826599, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69967413, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 13254, + "time_per_iteration": 2.4423506259918213 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.02485597, + "balance_loss_mlp": 1.03396869, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.8439634807377834, + "language_loss": 0.69335532, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71470052, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13255, + "time_per_iteration": 2.4770302772521973 + }, + { + "auxiliary_loss_clip": 0.01098301, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.01932132, + "balance_loss_mlp": 1.03357673, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.0839299199597576, + "language_loss": 0.79384631, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81513351, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13256, + "time_per_iteration": 2.435065507888794 + }, + { + "auxiliary_loss_clip": 0.01100559, + "auxiliary_loss_mlp": 0.01025356, + "balance_loss_clip": 1.01379228, + "balance_loss_mlp": 1.03476715, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.7632548016359857, + "language_loss": 0.65341133, + "learning_rate": 4.166085475424315e-07, + "loss": 0.67467046, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13257, + "time_per_iteration": 2.4952993392944336 + }, + { + "auxiliary_loss_clip": 0.01106098, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02211094, + "balance_loss_mlp": 1.03727162, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 4.269740157114163, + "language_loss": 0.72265047, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74405068, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 13258, + "time_per_iteration": 2.422609806060791 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.03580558, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.7787889345265135, + "language_loss": 0.68876815, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.7101053, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 13259, + "time_per_iteration": 2.454787015914917 + }, + { + "auxiliary_loss_clip": 0.01098869, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01853442, + "balance_loss_mlp": 1.03478527, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.6665251005798685, + "language_loss": 0.73773205, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75901318, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13260, + "time_per_iteration": 2.491205930709839 + }, + { + "auxiliary_loss_clip": 0.01097155, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.01763427, + "balance_loss_mlp": 1.03306603, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.7740121958206554, + "language_loss": 0.78436148, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80561745, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13261, + "time_per_iteration": 3.8501453399658203 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.02008629, + "balance_loss_mlp": 1.035748, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.4361813599632072, + "language_loss": 0.75999635, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78126872, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.61328125, + "step": 13262, + "time_per_iteration": 2.4577090740203857 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.01496387, + "balance_loss_mlp": 1.03712356, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.1431092546500103, + "language_loss": 0.7052893, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7266413, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 13263, + "time_per_iteration": 3.8635799884796143 + }, + { + "auxiliary_loss_clip": 0.01106881, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.021523, + "balance_loss_mlp": 1.03610015, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.9663243641140786, + "language_loss": 0.71254778, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73395979, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 13264, + "time_per_iteration": 3.8191962242126465 + }, + { + "auxiliary_loss_clip": 0.01098223, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.0220921, + "balance_loss_mlp": 1.03471494, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.6219090858782177, + "language_loss": 0.76819849, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.78951454, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 13265, + "time_per_iteration": 2.4498677253723145 + }, + { + "auxiliary_loss_clip": 0.0110135, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03420353, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.8502756325316978, + "language_loss": 0.75627744, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77757037, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 13266, + "time_per_iteration": 2.4424939155578613 + }, + { + "auxiliary_loss_clip": 0.01097761, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.01613104, + "balance_loss_mlp": 1.03291893, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 1.5381451690373484, + "language_loss": 0.83917278, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86042428, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13267, + "time_per_iteration": 4.030280113220215 + }, + { + "auxiliary_loss_clip": 0.01099973, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.02045047, + "balance_loss_mlp": 1.03457189, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.7026811780981197, + "language_loss": 0.75749743, + "learning_rate": 4.139949716968223e-07, + "loss": 0.77881831, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13268, + "time_per_iteration": 2.4395506381988525 + }, + { + "auxiliary_loss_clip": 0.01101025, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.01612282, + "balance_loss_mlp": 1.0355351, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.5399567563780987, + "language_loss": 0.77794158, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.7992276, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13269, + "time_per_iteration": 2.4894964694976807 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.01972592, + "balance_loss_mlp": 1.03359246, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.7083868858848195, + "language_loss": 0.82055652, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84182805, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13270, + "time_per_iteration": 2.4561750888824463 + }, + { + "auxiliary_loss_clip": 0.01101524, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.0165925, + "balance_loss_mlp": 1.03613377, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.6705229084811413, + "language_loss": 0.595366, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61666214, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13271, + "time_per_iteration": 2.4954357147216797 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.01961303, + "balance_loss_mlp": 1.03635263, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.5850933882113063, + "language_loss": 0.73206866, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75342935, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 13272, + "time_per_iteration": 2.509640693664551 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01630759, + "balance_loss_mlp": 1.03711224, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 2.009910797942707, + "language_loss": 0.71586084, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73716193, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13273, + "time_per_iteration": 2.488239049911499 + }, + { + "auxiliary_loss_clip": 0.01103696, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01880777, + "balance_loss_mlp": 1.03583157, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.887561029731992, + "language_loss": 0.7577731, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77911627, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 13274, + "time_per_iteration": 2.462188482284546 + }, + { + "auxiliary_loss_clip": 0.01095507, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01827025, + "balance_loss_mlp": 1.03438878, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3212411504254853, + "language_loss": 0.77607358, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79731625, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.61328125, + "step": 13275, + "time_per_iteration": 2.514090061187744 + }, + { + "auxiliary_loss_clip": 0.0110285, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01921892, + "balance_loss_mlp": 1.0345515, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.829043802525264, + "language_loss": 0.64052433, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66187114, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 13276, + "time_per_iteration": 2.5371670722961426 + }, + { + "auxiliary_loss_clip": 0.01097788, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.01740062, + "balance_loss_mlp": 1.03425479, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.5295363534394828, + "language_loss": 0.60448158, + "learning_rate": 4.118620036501945e-07, + "loss": 0.62574387, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 13277, + "time_per_iteration": 2.4880197048187256 + }, + { + "auxiliary_loss_clip": 0.01105128, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01911783, + "balance_loss_mlp": 1.0375464, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.008631814369184, + "language_loss": 0.79715037, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81850541, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 13278, + "time_per_iteration": 2.4780664443969727 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02485633, + "balance_loss_mlp": 1.03559947, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.9965492403610876, + "language_loss": 0.6323722, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65378356, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13279, + "time_per_iteration": 2.4683034420013428 + }, + { + "auxiliary_loss_clip": 0.01095285, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.01601911, + "balance_loss_mlp": 1.03389192, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.6504787755208947, + "language_loss": 0.70773625, + "learning_rate": 4.111520979802825e-07, + "loss": 0.72895384, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.61328125, + "step": 13280, + "time_per_iteration": 2.4923903942108154 + }, + { + "auxiliary_loss_clip": 0.01103118, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01919234, + "balance_loss_mlp": 1.03547907, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.6234618647236767, + "language_loss": 0.62751859, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64886189, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13281, + "time_per_iteration": 2.5414252281188965 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.02107787, + "balance_loss_mlp": 1.03481054, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.7029379552600752, + "language_loss": 0.80491292, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82629329, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 13282, + "time_per_iteration": 2.4520959854125977 + }, + { + "auxiliary_loss_clip": 0.01101884, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01801062, + "balance_loss_mlp": 1.03421319, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.8947522031030082, + "language_loss": 0.7154727, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73679316, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 13283, + "time_per_iteration": 2.4246160984039307 + }, + { + "auxiliary_loss_clip": 0.01098743, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01833189, + "balance_loss_mlp": 1.03302252, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8968441964994822, + "language_loss": 0.7347362, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75601751, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13284, + "time_per_iteration": 2.402165174484253 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.02016854, + "balance_loss_mlp": 1.03526652, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.5742258488227296, + "language_loss": 0.70226932, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72357547, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 13285, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.01627612, + "balance_loss_mlp": 1.03425968, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.5373042942121937, + "language_loss": 0.73492497, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75618953, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 13286, + "time_per_iteration": 2.435335874557495 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.03493309, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 2.2146294120164876, + "language_loss": 0.74433863, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.76566875, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13287, + "time_per_iteration": 2.4583966732025146 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.0171752, + "balance_loss_mlp": 1.03461981, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 2.4603095156491457, + "language_loss": 0.61630833, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63758349, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 13288, + "time_per_iteration": 2.5024874210357666 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.0102498, + "balance_loss_clip": 1.0142808, + "balance_loss_mlp": 1.03618968, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.398551145532932, + "language_loss": 0.70419228, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72544491, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 13289, + "time_per_iteration": 2.5227341651916504 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.01975262, + "balance_loss_mlp": 1.03570962, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 3.476010785150094, + "language_loss": 0.62750173, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64881819, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.63671875, + "step": 13290, + "time_per_iteration": 2.413945436477661 + }, + { + "auxiliary_loss_clip": 0.01103234, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.0152936, + "balance_loss_mlp": 1.03642523, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 3.9151132007409513, + "language_loss": 0.71637499, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73768604, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 13291, + "time_per_iteration": 2.4885568618774414 + }, + { + "auxiliary_loss_clip": 0.01097167, + "auxiliary_loss_mlp": 0.01025771, + "balance_loss_clip": 1.01506627, + "balance_loss_mlp": 1.03270483, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.5485118073746154, + "language_loss": 0.6335237, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65475303, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13292, + "time_per_iteration": 2.4616239070892334 + }, + { + "auxiliary_loss_clip": 0.0109979, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.0185678, + "balance_loss_mlp": 1.0349102, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.555240655733236, + "language_loss": 0.56249213, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58378512, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13293, + "time_per_iteration": 2.5668938159942627 + }, + { + "auxiliary_loss_clip": 0.01101281, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02145505, + "balance_loss_mlp": 1.03606391, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.987312394872763, + "language_loss": 0.71444452, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73578554, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13294, + "time_per_iteration": 2.693946361541748 + }, + { + "auxiliary_loss_clip": 0.01101257, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.0196135, + "balance_loss_mlp": 1.03554058, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.7329593206167035, + "language_loss": 0.72202832, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74334961, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13295, + "time_per_iteration": 2.440544605255127 + }, + { + "auxiliary_loss_clip": 0.01096658, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01995802, + "balance_loss_mlp": 1.03398633, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 2.1702200839393395, + "language_loss": 0.76480281, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78607565, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 13296, + "time_per_iteration": 2.4405605792999268 + }, + { + "auxiliary_loss_clip": 0.01023152, + "auxiliary_loss_mlp": 0.01002637, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00325012, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6861737124330846, + "language_loss": 0.60802543, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62828332, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.19921875, + "step": 13297, + "time_per_iteration": 3.11775541305542 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.01829767, + "balance_loss_mlp": 1.03441751, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 1.8885665209520346, + "language_loss": 0.70239675, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72368801, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 13298, + "time_per_iteration": 2.4225876331329346 + }, + { + "auxiliary_loss_clip": 0.01103672, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02003491, + "balance_loss_mlp": 1.0351696, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 1.9550250872317747, + "language_loss": 0.75762308, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77898747, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 13299, + "time_per_iteration": 2.4788718223571777 + }, + { + "auxiliary_loss_clip": 0.01098072, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.01662207, + "balance_loss_mlp": 1.03498912, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.7719100438283584, + "language_loss": 0.77760887, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79886186, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 13300, + "time_per_iteration": 2.4796881675720215 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.01584315, + "balance_loss_mlp": 1.03424072, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 2.002040406516657, + "language_loss": 0.63432777, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65562272, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 13301, + "time_per_iteration": 2.4858558177948 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02241445, + "balance_loss_mlp": 1.0356431, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.6279257021355094, + "language_loss": 0.71502745, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73635173, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 13302, + "time_per_iteration": 2.4388864040374756 + }, + { + "auxiliary_loss_clip": 0.01104001, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.01885068, + "balance_loss_mlp": 1.03659701, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 1.833344875316907, + "language_loss": 0.83622801, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85757518, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13303, + "time_per_iteration": 3.86017107963562 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.02020907, + "balance_loss_mlp": 1.03435075, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.6100512541022713, + "language_loss": 0.5873881, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60867614, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 13304, + "time_per_iteration": 2.5343167781829834 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.01525831, + "balance_loss_mlp": 1.03274465, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.7664004546927765, + "language_loss": 0.69075799, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71204102, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 13305, + "time_per_iteration": 3.846991777420044 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.01026107, + "balance_loss_clip": 1.0154916, + "balance_loss_mlp": 1.03421581, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5410766724401597, + "language_loss": 0.6923117, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71355259, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13306, + "time_per_iteration": 3.824300527572632 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02248406, + "balance_loss_mlp": 1.03599465, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.5349326427116308, + "language_loss": 0.69361722, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71497267, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13307, + "time_per_iteration": 2.641338348388672 + }, + { + "auxiliary_loss_clip": 0.01100663, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01857102, + "balance_loss_mlp": 1.03444958, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 1.9021997746458712, + "language_loss": 0.76933712, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79063845, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 13308, + "time_per_iteration": 2.449411630630493 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03450274, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.9120896372435958, + "language_loss": 0.78702182, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80834526, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 13309, + "time_per_iteration": 4.006925106048584 + }, + { + "auxiliary_loss_clip": 0.01023336, + "auxiliary_loss_mlp": 0.0100397, + "balance_loss_clip": 1.00295115, + "balance_loss_mlp": 1.00320923, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9338382930636256, + "language_loss": 0.64702326, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66729629, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 13310, + "time_per_iteration": 2.975738525390625 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.01034343, + "balance_loss_clip": 1.02319705, + "balance_loss_mlp": 1.03539252, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 4.652395781854749, + "language_loss": 0.82905459, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85040295, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13311, + "time_per_iteration": 2.5010745525360107 + }, + { + "auxiliary_loss_clip": 0.01100391, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.0205307, + "balance_loss_mlp": 1.03499472, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.136696844503174, + "language_loss": 0.6653198, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68664443, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13312, + "time_per_iteration": 2.413475275039673 + }, + { + "auxiliary_loss_clip": 0.01104828, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01855493, + "balance_loss_mlp": 1.03699255, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.6185384671425953, + "language_loss": 0.75226915, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77362603, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13313, + "time_per_iteration": 2.4858338832855225 + }, + { + "auxiliary_loss_clip": 0.01099877, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.0179745, + "balance_loss_mlp": 1.03416753, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3383617423886183, + "language_loss": 0.75627804, + "learning_rate": 4.031444553532575e-07, + "loss": 0.77756763, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13314, + "time_per_iteration": 2.544503927230835 + }, + { + "auxiliary_loss_clip": 0.01023163, + "auxiliary_loss_mlp": 0.00998551, + "balance_loss_clip": 0.99763316, + "balance_loss_mlp": 1.00305307, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8171555714712136, + "language_loss": 0.53831571, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55853283, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.20117188, + "step": 13315, + "time_per_iteration": 2.9481279850006104 + }, + { + "auxiliary_loss_clip": 0.01099698, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.0170964, + "balance_loss_mlp": 1.03518033, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 2.5476348031673157, + "language_loss": 0.71353263, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73481071, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13316, + "time_per_iteration": 2.573376417160034 + }, + { + "auxiliary_loss_clip": 0.011043, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.01980805, + "balance_loss_mlp": 1.0358212, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 1.707434135100754, + "language_loss": 0.64464766, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66600287, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 13317, + "time_per_iteration": 2.5424327850341797 + }, + { + "auxiliary_loss_clip": 0.01023338, + "auxiliary_loss_mlp": 0.00999, + "balance_loss_clip": 0.99798673, + "balance_loss_mlp": 1.00323308, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7645299039687239, + "language_loss": 0.59047085, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61069423, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20117188, + "step": 13318, + "time_per_iteration": 3.1595919132232666 + }, + { + "auxiliary_loss_clip": 0.01097271, + "auxiliary_loss_mlp": 0.01022623, + "balance_loss_clip": 1.0112803, + "balance_loss_mlp": 1.03338647, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.5349381284748576, + "language_loss": 0.66329014, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68448913, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13319, + "time_per_iteration": 2.4969546794891357 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.0337888, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.7719196822913061, + "language_loss": 0.74056709, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76193058, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 13320, + "time_per_iteration": 2.4454445838928223 + }, + { + "auxiliary_loss_clip": 0.01101301, + "auxiliary_loss_mlp": 0.01025658, + "balance_loss_clip": 1.01393962, + "balance_loss_mlp": 1.03455615, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 2.1051785916089485, + "language_loss": 0.80298382, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82425332, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 13321, + "time_per_iteration": 2.4167563915252686 + }, + { + "auxiliary_loss_clip": 0.01094949, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.01849723, + "balance_loss_mlp": 1.03163719, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.9091600224392815, + "language_loss": 0.65907997, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68032598, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 13322, + "time_per_iteration": 2.4598209857940674 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.01025985, + "balance_loss_clip": 1.01448131, + "balance_loss_mlp": 1.03424549, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.7799462276417908, + "language_loss": 0.78038085, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80163848, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13323, + "time_per_iteration": 2.4523849487304688 + }, + { + "auxiliary_loss_clip": 0.01100974, + "auxiliary_loss_mlp": 0.01026925, + "balance_loss_clip": 1.01565957, + "balance_loss_mlp": 1.03384399, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.287441188670043, + "language_loss": 0.7207495, + "learning_rate": 4.00802572299932e-07, + "loss": 0.74202847, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 13324, + "time_per_iteration": 2.4756648540496826 + }, + { + "auxiliary_loss_clip": 0.01101897, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.01729989, + "balance_loss_mlp": 1.03378117, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.7124158785666046, + "language_loss": 0.76591057, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78722167, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13325, + "time_per_iteration": 2.4179391860961914 + }, + { + "auxiliary_loss_clip": 0.01093003, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.01423728, + "balance_loss_mlp": 1.03209794, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.4883986752450347, + "language_loss": 0.79431766, + "learning_rate": 4.003349231059898e-07, + "loss": 0.81549358, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.609375, + "step": 13326, + "time_per_iteration": 2.5107691287994385 + }, + { + "auxiliary_loss_clip": 0.01098238, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.02084196, + "balance_loss_mlp": 1.03452349, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.8420056555036817, + "language_loss": 0.66117686, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68247426, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 13327, + "time_per_iteration": 2.4520139694213867 + }, + { + "auxiliary_loss_clip": 0.01095786, + "auxiliary_loss_mlp": 0.01026094, + "balance_loss_clip": 1.01527607, + "balance_loss_mlp": 1.03323984, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 2.067352275023529, + "language_loss": 0.73374075, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75495958, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 13328, + "time_per_iteration": 2.4723997116088867 + }, + { + "auxiliary_loss_clip": 0.01101687, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01789129, + "balance_loss_mlp": 1.03389966, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 2.2657788983381573, + "language_loss": 0.73454827, + "learning_rate": 3.996339042831798e-07, + "loss": 0.75586998, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 13329, + "time_per_iteration": 2.3919262886047363 + }, + { + "auxiliary_loss_clip": 0.01023092, + "auxiliary_loss_mlp": 0.01000506, + "balance_loss_clip": 0.99962944, + "balance_loss_mlp": 1.00292683, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.7030773083035402, + "language_loss": 0.52944195, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54967791, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.20117188, + "step": 13330, + "time_per_iteration": 3.1469671726226807 + }, + { + "auxiliary_loss_clip": 0.0110212, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.02126801, + "balance_loss_mlp": 1.03404713, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.7749874535506867, + "language_loss": 0.72585219, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74721324, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 13331, + "time_per_iteration": 2.4748101234436035 + }, + { + "auxiliary_loss_clip": 0.01097861, + "auxiliary_loss_mlp": 0.01025208, + "balance_loss_clip": 1.01499796, + "balance_loss_mlp": 1.03351557, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.8459945911210676, + "language_loss": 0.77300894, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79423964, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.64453125, + "step": 13332, + "time_per_iteration": 2.531803846359253 + }, + { + "auxiliary_loss_clip": 0.01100359, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01504815, + "balance_loss_mlp": 1.03493059, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 2.098330097489523, + "language_loss": 0.83255255, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85382015, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13333, + "time_per_iteration": 2.463542938232422 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.01358485, + "balance_loss_mlp": 1.03295958, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.517795063726895, + "language_loss": 0.73388004, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75513119, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 13334, + "time_per_iteration": 2.433274984359741 + }, + { + "auxiliary_loss_clip": 0.01098378, + "auxiliary_loss_mlp": 0.01026638, + "balance_loss_clip": 1.01558757, + "balance_loss_mlp": 1.03512001, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 1.9072349752607467, + "language_loss": 0.7468729, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76812309, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 13335, + "time_per_iteration": 2.42297625541687 + }, + { + "auxiliary_loss_clip": 0.01096886, + "auxiliary_loss_mlp": 0.01025738, + "balance_loss_clip": 1.01403213, + "balance_loss_mlp": 1.03252053, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 3.6392795716902553, + "language_loss": 0.75419021, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77541637, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13336, + "time_per_iteration": 2.520874261856079 + }, + { + "auxiliary_loss_clip": 0.01106538, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.02240658, + "balance_loss_mlp": 1.03661346, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.0097115847090556, + "language_loss": 0.74682361, + "learning_rate": 3.977671915907068e-07, + "loss": 0.76823884, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 13337, + "time_per_iteration": 2.4630508422851562 + }, + { + "auxiliary_loss_clip": 0.0110342, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.01966929, + "balance_loss_mlp": 1.03606987, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.6741859292029853, + "language_loss": 0.80250359, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82385421, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 13338, + "time_per_iteration": 2.527026414871216 + }, + { + "auxiliary_loss_clip": 0.01102243, + "auxiliary_loss_mlp": 0.01029555, + "balance_loss_clip": 1.01716948, + "balance_loss_mlp": 1.03478169, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 1.7639189487021163, + "language_loss": 0.74894798, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.77026594, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13339, + "time_per_iteration": 2.4820516109466553 + }, + { + "auxiliary_loss_clip": 0.01096664, + "auxiliary_loss_mlp": 0.0102455, + "balance_loss_clip": 1.01394606, + "balance_loss_mlp": 1.03399611, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.6460943176335554, + "language_loss": 0.7905581, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81177026, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 13340, + "time_per_iteration": 2.500736951828003 + }, + { + "auxiliary_loss_clip": 0.01098703, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.01727295, + "balance_loss_mlp": 1.0331881, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 2.2084467799286736, + "language_loss": 0.68105626, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70232147, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 13341, + "time_per_iteration": 2.5179948806762695 + }, + { + "auxiliary_loss_clip": 0.01023459, + "auxiliary_loss_mlp": 0.00999082, + "balance_loss_clip": 0.99812281, + "balance_loss_mlp": 1.00333941, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8369751293594621, + "language_loss": 0.6160937, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.6363191, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 13342, + "time_per_iteration": 3.000945806503296 + }, + { + "auxiliary_loss_clip": 0.01101839, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.01944876, + "balance_loss_mlp": 1.03650236, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 1.7496140842245578, + "language_loss": 0.63761848, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65895188, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 13343, + "time_per_iteration": 2.4751808643341064 + }, + { + "auxiliary_loss_clip": 0.01095544, + "auxiliary_loss_mlp": 0.01025293, + "balance_loss_clip": 1.01477909, + "balance_loss_mlp": 1.03363252, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.8534538865060244, + "language_loss": 0.68717116, + "learning_rate": 3.96137007563051e-07, + "loss": 0.70837951, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6171875, + "step": 13344, + "time_per_iteration": 3.879085063934326 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.01026246, + "balance_loss_clip": 1.01449227, + "balance_loss_mlp": 1.03534698, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.4750491175243907, + "language_loss": 0.70234525, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72361684, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 13345, + "time_per_iteration": 2.526529550552368 + }, + { + "auxiliary_loss_clip": 0.01023092, + "auxiliary_loss_mlp": 0.00996047, + "balance_loss_clip": 0.99513531, + "balance_loss_mlp": 1.00295401, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8675779351996153, + "language_loss": 0.62968004, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64987135, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.20117188, + "step": 13346, + "time_per_iteration": 4.479830980300903 + }, + { + "auxiliary_loss_clip": 0.01099962, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.01858425, + "balance_loss_mlp": 1.03673768, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.5938834644000792, + "language_loss": 0.72352123, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74481773, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13347, + "time_per_iteration": 3.90059232711792 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.01647007, + "balance_loss_mlp": 1.03545678, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 1.9020244610460446, + "language_loss": 0.72658664, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74788725, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 13348, + "time_per_iteration": 2.4068453311920166 + }, + { + "auxiliary_loss_clip": 0.0109995, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.01632524, + "balance_loss_mlp": 1.03549361, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 1.8226385974634305, + "language_loss": 0.75890076, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.78018188, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13349, + "time_per_iteration": 2.4651827812194824 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.02177215, + "balance_loss_mlp": 1.0367322, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 1.9845904046749108, + "language_loss": 0.83774155, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85908747, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 13350, + "time_per_iteration": 3.9758567810058594 + }, + { + "auxiliary_loss_clip": 0.01102317, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.02151191, + "balance_loss_mlp": 1.03663123, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.6434071857139758, + "language_loss": 0.71458006, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73592794, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 13351, + "time_per_iteration": 2.456387519836426 + }, + { + "auxiliary_loss_clip": 0.01100958, + "auxiliary_loss_mlp": 0.01025905, + "balance_loss_clip": 1.0146575, + "balance_loss_mlp": 1.03703523, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.569231510448569, + "language_loss": 0.6175856, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63885427, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13352, + "time_per_iteration": 2.5027875900268555 + }, + { + "auxiliary_loss_clip": 0.01099513, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.02271748, + "balance_loss_mlp": 1.03542566, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.9674575511392012, + "language_loss": 0.76736349, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78869814, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13353, + "time_per_iteration": 2.4299416542053223 + }, + { + "auxiliary_loss_clip": 0.01107387, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01592684, + "balance_loss_mlp": 1.03595626, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 4.591297164367042, + "language_loss": 0.73969984, + "learning_rate": 3.938133435713582e-07, + "loss": 0.76106119, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 13354, + "time_per_iteration": 2.4425058364868164 + }, + { + "auxiliary_loss_clip": 0.01100936, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.02040386, + "balance_loss_mlp": 1.03437519, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 2.057342776688467, + "language_loss": 0.65729123, + "learning_rate": 3.935813120140714e-07, + "loss": 0.6786164, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 13355, + "time_per_iteration": 2.4422569274902344 + }, + { + "auxiliary_loss_clip": 0.01103198, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01824093, + "balance_loss_mlp": 1.03504288, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 1.8283444100924588, + "language_loss": 0.68699443, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70833278, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 13356, + "time_per_iteration": 2.7776293754577637 + }, + { + "auxiliary_loss_clip": 0.01100058, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.01561069, + "balance_loss_mlp": 1.03510857, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.6343755338343116, + "language_loss": 0.77451766, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79578984, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 13357, + "time_per_iteration": 2.4800233840942383 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.01638436, + "balance_loss_mlp": 1.03227997, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.418419476215859, + "language_loss": 0.76987123, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79116321, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 13358, + "time_per_iteration": 2.500126361846924 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01845884, + "balance_loss_mlp": 1.03335798, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.4766521740684297, + "language_loss": 0.84945107, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87071896, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13359, + "time_per_iteration": 2.467322826385498 + }, + { + "auxiliary_loss_clip": 0.01098755, + "auxiliary_loss_mlp": 0.01025457, + "balance_loss_clip": 1.0148114, + "balance_loss_mlp": 1.0347209, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 5.478885485071422, + "language_loss": 0.73389184, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75513393, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13360, + "time_per_iteration": 2.4644036293029785 + }, + { + "auxiliary_loss_clip": 0.011011, + "auxiliary_loss_mlp": 0.01025712, + "balance_loss_clip": 1.01467943, + "balance_loss_mlp": 1.03494728, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 1.658575264566963, + "language_loss": 0.69541776, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71668589, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13361, + "time_per_iteration": 2.4854345321655273 + }, + { + "auxiliary_loss_clip": 0.01102908, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.02245271, + "balance_loss_mlp": 1.03511453, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.5325871265997801, + "language_loss": 0.6999588, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72133017, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13362, + "time_per_iteration": 2.488231658935547 + }, + { + "auxiliary_loss_clip": 0.01107256, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02425015, + "balance_loss_mlp": 1.03642321, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 3.0174413399707363, + "language_loss": 0.78229916, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80374312, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 13363, + "time_per_iteration": 2.418231964111328 + }, + { + "auxiliary_loss_clip": 0.01099372, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01533544, + "balance_loss_mlp": 1.03315711, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.2119044430692725, + "language_loss": 0.7432642, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.76453155, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 13364, + "time_per_iteration": 2.5197720527648926 + }, + { + "auxiliary_loss_clip": 0.01103376, + "auxiliary_loss_mlp": 0.01028234, + "balance_loss_clip": 1.01715374, + "balance_loss_mlp": 1.0384289, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 1.8638691080959722, + "language_loss": 0.60079575, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62211186, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13365, + "time_per_iteration": 2.5367517471313477 + }, + { + "auxiliary_loss_clip": 0.01102557, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.02088296, + "balance_loss_mlp": 1.03595102, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.770848682899336, + "language_loss": 0.66261953, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68397009, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13366, + "time_per_iteration": 2.4602773189544678 + }, + { + "auxiliary_loss_clip": 0.01097083, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01548815, + "balance_loss_mlp": 1.03327024, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.111763944733339, + "language_loss": 0.75102258, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77226043, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 13367, + "time_per_iteration": 2.41711163520813 + }, + { + "auxiliary_loss_clip": 0.0109804, + "auxiliary_loss_mlp": 0.01024389, + "balance_loss_clip": 1.01354659, + "balance_loss_mlp": 1.0338217, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.54219979673165, + "language_loss": 0.73962986, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76085418, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 13368, + "time_per_iteration": 2.501863956451416 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.02176404, + "balance_loss_mlp": 1.03486526, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 1.9813333443993375, + "language_loss": 0.69734561, + "learning_rate": 3.90339270344789e-07, + "loss": 0.71870649, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 13369, + "time_per_iteration": 2.456852674484253 + }, + { + "auxiliary_loss_clip": 0.01096676, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01927543, + "balance_loss_mlp": 1.03303206, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.6987132641471832, + "language_loss": 0.74007034, + "learning_rate": 3.901081534434312e-07, + "loss": 0.76133567, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 13370, + "time_per_iteration": 2.447308301925659 + }, + { + "auxiliary_loss_clip": 0.0110293, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.02192795, + "balance_loss_mlp": 1.03425717, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.381680078158343, + "language_loss": 0.87296432, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89433664, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 13371, + "time_per_iteration": 2.4829249382019043 + }, + { + "auxiliary_loss_clip": 0.01103599, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.02035713, + "balance_loss_mlp": 1.03440571, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.7658004045692555, + "language_loss": 0.74599552, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76736039, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 13372, + "time_per_iteration": 2.442962169647217 + }, + { + "auxiliary_loss_clip": 0.01098823, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.01571703, + "balance_loss_mlp": 1.03405392, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.4174801447044807, + "language_loss": 0.79491466, + "learning_rate": 3.894151690579646e-07, + "loss": 0.816185, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 13373, + "time_per_iteration": 2.453648090362549 + }, + { + "auxiliary_loss_clip": 0.01097395, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.01992464, + "balance_loss_mlp": 1.03413832, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 2.1161878691178244, + "language_loss": 0.74199659, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76327753, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 13374, + "time_per_iteration": 2.4652347564697266 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.01642847, + "balance_loss_mlp": 1.03416824, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1.8766085876406744, + "language_loss": 0.68541491, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70671284, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13375, + "time_per_iteration": 2.4260308742523193 + }, + { + "auxiliary_loss_clip": 0.0102349, + "auxiliary_loss_mlp": 0.01009026, + "balance_loss_clip": 1.00806642, + "balance_loss_mlp": 1.00329185, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.726849315788399, + "language_loss": 0.55648947, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57681465, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 13376, + "time_per_iteration": 3.1642260551452637 + }, + { + "auxiliary_loss_clip": 0.0110284, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01679826, + "balance_loss_mlp": 1.03498244, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.6135536468641871, + "language_loss": 0.72961086, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75092268, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 13377, + "time_per_iteration": 2.4822981357574463 + }, + { + "auxiliary_loss_clip": 0.01098741, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01541018, + "balance_loss_mlp": 1.03249967, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.981270631697856, + "language_loss": 0.69881338, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72006881, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13378, + "time_per_iteration": 2.5707526206970215 + }, + { + "auxiliary_loss_clip": 0.01100528, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.01306105, + "balance_loss_mlp": 1.03421903, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.411246611707562, + "language_loss": 0.69285733, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71410847, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 13379, + "time_per_iteration": 2.5638318061828613 + }, + { + "auxiliary_loss_clip": 0.01104566, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01733994, + "balance_loss_mlp": 1.03576803, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.6928321520732015, + "language_loss": 0.76132649, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.7826739, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 13380, + "time_per_iteration": 2.4436728954315186 + }, + { + "auxiliary_loss_clip": 0.0109777, + "auxiliary_loss_mlp": 0.01024309, + "balance_loss_clip": 1.01334739, + "balance_loss_mlp": 1.03278756, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 1.762890691541343, + "language_loss": 0.68871969, + "learning_rate": 3.875698985740887e-07, + "loss": 0.70994055, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13381, + "time_per_iteration": 2.4540696144104004 + }, + { + "auxiliary_loss_clip": 0.01102189, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.02096522, + "balance_loss_mlp": 1.03659916, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 2.070272344972077, + "language_loss": 0.63770294, + "learning_rate": 3.873395148176135e-07, + "loss": 0.65905166, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13382, + "time_per_iteration": 2.476844549179077 + }, + { + "auxiliary_loss_clip": 0.01099779, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.02138495, + "balance_loss_mlp": 1.03501844, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 1.9560619657883067, + "language_loss": 0.76228422, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78360152, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 13383, + "time_per_iteration": 2.5023083686828613 + }, + { + "auxiliary_loss_clip": 0.0110036, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.0221895, + "balance_loss_mlp": 1.03507757, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.704803693538242, + "language_loss": 0.69562024, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71695471, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 13384, + "time_per_iteration": 2.4720067977905273 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.02033544, + "balance_loss_mlp": 1.0335815, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.241572450545315, + "language_loss": 0.79350901, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81485635, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13385, + "time_per_iteration": 2.439087152481079 + }, + { + "auxiliary_loss_clip": 0.01099604, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.01754057, + "balance_loss_mlp": 1.03369808, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.6118027909043755, + "language_loss": 0.72186625, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74316132, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 13386, + "time_per_iteration": 3.8713440895080566 + }, + { + "auxiliary_loss_clip": 0.01024358, + "auxiliary_loss_mlp": 0.01001397, + "balance_loss_clip": 1.00044346, + "balance_loss_mlp": 1.00412393, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6667783047012105, + "language_loss": 0.5129301, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53318763, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 13387, + "time_per_iteration": 4.501964330673218 + }, + { + "auxiliary_loss_clip": 0.01099839, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.01966643, + "balance_loss_mlp": 1.03382778, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.6706301828643437, + "language_loss": 0.73789018, + "learning_rate": 3.859584967815559e-07, + "loss": 0.75920987, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66015625, + "step": 13388, + "time_per_iteration": 3.843517780303955 + }, + { + "auxiliary_loss_clip": 0.0109843, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01682854, + "balance_loss_mlp": 1.03459811, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.8701026926914783, + "language_loss": 0.71383917, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73510081, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 13389, + "time_per_iteration": 2.5054638385772705 + }, + { + "auxiliary_loss_clip": 0.0110175, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.02133179, + "balance_loss_mlp": 1.03612518, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 3.088475766365905, + "language_loss": 0.82746458, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.84881252, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13390, + "time_per_iteration": 2.423768997192383 + }, + { + "auxiliary_loss_clip": 0.01023418, + "auxiliary_loss_mlp": 0.01000717, + "balance_loss_clip": 0.99975187, + "balance_loss_mlp": 1.00324726, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7770155709179203, + "language_loss": 0.55552375, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57576513, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20214844, + "step": 13391, + "time_per_iteration": 3.0361075401306152 + }, + { + "auxiliary_loss_clip": 0.01096866, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.01784241, + "balance_loss_mlp": 1.03412104, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.5537305479409738, + "language_loss": 0.84568977, + "learning_rate": 3.850390420667762e-07, + "loss": 0.8669492, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.625, + "step": 13392, + "time_per_iteration": 3.88423490524292 + }, + { + "auxiliary_loss_clip": 0.01099202, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01805949, + "balance_loss_mlp": 1.03294063, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.4560269094808058, + "language_loss": 0.70109689, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72238064, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13393, + "time_per_iteration": 2.503424644470215 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.01657152, + "balance_loss_mlp": 1.03488159, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 3.6975183577727937, + "language_loss": 0.76589131, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78719074, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 13394, + "time_per_iteration": 2.50583553314209 + }, + { + "auxiliary_loss_clip": 0.01103543, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.02136636, + "balance_loss_mlp": 1.03756762, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.5906129807299372, + "language_loss": 0.64856386, + "learning_rate": 3.843500940147304e-07, + "loss": 0.66992623, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 13395, + "time_per_iteration": 2.482172966003418 + }, + { + "auxiliary_loss_clip": 0.01023951, + "auxiliary_loss_mlp": 0.01003964, + "balance_loss_clip": 1.00302815, + "balance_loss_mlp": 1.00378847, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7505786157423635, + "language_loss": 0.57311893, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59339797, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 13396, + "time_per_iteration": 3.1897172927856445 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.02165508, + "balance_loss_mlp": 1.03515673, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.5955257474289526, + "language_loss": 0.77324402, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79459095, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 13397, + "time_per_iteration": 2.457589626312256 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.01415157, + "balance_loss_mlp": 1.03589237, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.4619231404406883, + "language_loss": 0.70318341, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72444201, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13398, + "time_per_iteration": 2.434720754623413 + }, + { + "auxiliary_loss_clip": 0.01098688, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.01820254, + "balance_loss_mlp": 1.0339874, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.5249609811736993, + "language_loss": 0.68945122, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71072781, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 13399, + "time_per_iteration": 2.4369473457336426 + }, + { + "auxiliary_loss_clip": 0.01100707, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02185178, + "balance_loss_mlp": 1.03535557, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.741407095084056, + "language_loss": 0.72130084, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74263531, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 13400, + "time_per_iteration": 2.45465350151062 + }, + { + "auxiliary_loss_clip": 0.01095975, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.03169787, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.762424898680502, + "language_loss": 0.6360321, + "learning_rate": 3.829738523169037e-07, + "loss": 0.6572842, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13401, + "time_per_iteration": 2.46634840965271 + }, + { + "auxiliary_loss_clip": 0.01101558, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.01541567, + "balance_loss_mlp": 1.03495288, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.0776999843399215, + "language_loss": 0.83539009, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.85667193, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 13402, + "time_per_iteration": 2.4731264114379883 + }, + { + "auxiliary_loss_clip": 0.01104077, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.01843047, + "balance_loss_mlp": 1.03683734, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.7791761836453988, + "language_loss": 0.67880774, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70014745, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 13403, + "time_per_iteration": 2.4434680938720703 + }, + { + "auxiliary_loss_clip": 0.01100505, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.02343667, + "balance_loss_mlp": 1.03693151, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.6082788038338753, + "language_loss": 0.84920156, + "learning_rate": 3.822865591408084e-07, + "loss": 0.87054729, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13404, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01094833, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.01671946, + "balance_loss_mlp": 1.03220367, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.3891013109645525, + "language_loss": 0.70190167, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72312385, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 13405, + "time_per_iteration": 2.6014316082000732 + }, + { + "auxiliary_loss_clip": 0.01098677, + "auxiliary_loss_mlp": 0.01023373, + "balance_loss_clip": 1.01291847, + "balance_loss_mlp": 1.0349884, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 4.539530104716245, + "language_loss": 0.75637108, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77759159, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 13406, + "time_per_iteration": 2.5324466228485107 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.02026033, + "balance_loss_mlp": 1.03502393, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.4950577353974586, + "language_loss": 0.76435769, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78569061, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 13407, + "time_per_iteration": 2.477576971054077 + }, + { + "auxiliary_loss_clip": 0.01099093, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.01961851, + "balance_loss_mlp": 1.03387368, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.6921113118450146, + "language_loss": 0.73480356, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75610703, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13408, + "time_per_iteration": 2.4257562160491943 + }, + { + "auxiliary_loss_clip": 0.01099181, + "auxiliary_loss_mlp": 0.0102687, + "balance_loss_clip": 1.01519299, + "balance_loss_mlp": 1.03426397, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.9196860165505316, + "language_loss": 0.70829517, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72955573, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 13409, + "time_per_iteration": 2.463796854019165 + }, + { + "auxiliary_loss_clip": 0.01099235, + "auxiliary_loss_mlp": 0.01025918, + "balance_loss_clip": 1.01424718, + "balance_loss_mlp": 1.03369415, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 1.9740457439669723, + "language_loss": 0.76695901, + "learning_rate": 3.809136293070545e-07, + "loss": 0.78821057, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13410, + "time_per_iteration": 2.4198813438415527 + }, + { + "auxiliary_loss_clip": 0.01100494, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.02152979, + "balance_loss_mlp": 1.03608918, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 1.7996163634950662, + "language_loss": 0.68654764, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70788395, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13411, + "time_per_iteration": 2.4347574710845947 + }, + { + "auxiliary_loss_clip": 0.01097282, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01528811, + "balance_loss_mlp": 1.03363693, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.6674107139859142, + "language_loss": 0.68204069, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70327783, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13412, + "time_per_iteration": 2.4598028659820557 + }, + { + "auxiliary_loss_clip": 0.01105517, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.01803827, + "balance_loss_mlp": 1.03777528, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.6536300325901656, + "language_loss": 0.81038213, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83174896, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6796875, + "step": 13413, + "time_per_iteration": 2.4321577548980713 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.01991224, + "balance_loss_mlp": 1.0338769, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.984977186812749, + "language_loss": 0.84957677, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87086928, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13414, + "time_per_iteration": 2.4548234939575195 + }, + { + "auxiliary_loss_clip": 0.01096994, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.02094603, + "balance_loss_mlp": 1.03234887, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 2.244506083720548, + "language_loss": 0.67268044, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69396722, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 13415, + "time_per_iteration": 2.425309181213379 + }, + { + "auxiliary_loss_clip": 0.01096685, + "auxiliary_loss_mlp": 0.01022432, + "balance_loss_clip": 1.01168513, + "balance_loss_mlp": 1.03403616, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.573239663974263, + "language_loss": 0.76294547, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78413665, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 13416, + "time_per_iteration": 2.4303643703460693 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.01951385, + "balance_loss_mlp": 1.03429639, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.567703379933568, + "language_loss": 0.65159631, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67292631, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6796875, + "step": 13417, + "time_per_iteration": 2.4838106632232666 + }, + { + "auxiliary_loss_clip": 0.0110189, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.02290344, + "balance_loss_mlp": 1.03483796, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.7972598852590256, + "language_loss": 0.80653781, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82789946, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13418, + "time_per_iteration": 2.4769530296325684 + }, + { + "auxiliary_loss_clip": 0.01102946, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.01751471, + "balance_loss_mlp": 1.03614044, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.5261451461665583, + "language_loss": 0.845676, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.86699843, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13419, + "time_per_iteration": 2.4280943870544434 + }, + { + "auxiliary_loss_clip": 0.01101257, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.01786304, + "balance_loss_mlp": 1.03399837, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.6210776482059308, + "language_loss": 0.75624955, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77755666, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 13420, + "time_per_iteration": 2.489564895629883 + }, + { + "auxiliary_loss_clip": 0.01096685, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.01652443, + "balance_loss_mlp": 1.03300762, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.6825094643098477, + "language_loss": 0.78326774, + "learning_rate": 3.784023331462207e-07, + "loss": 0.8045001, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.63671875, + "step": 13421, + "time_per_iteration": 2.4398117065429688 + }, + { + "auxiliary_loss_clip": 0.01102139, + "auxiliary_loss_mlp": 0.0102348, + "balance_loss_clip": 1.01245332, + "balance_loss_mlp": 1.03592634, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.6379524455499936, + "language_loss": 0.79461509, + "learning_rate": 3.78174402269098e-07, + "loss": 0.81587136, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 13422, + "time_per_iteration": 2.415855646133423 + }, + { + "auxiliary_loss_clip": 0.01098682, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.01785183, + "balance_loss_mlp": 1.03383088, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.5228776054135154, + "language_loss": 0.67973536, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70100921, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13423, + "time_per_iteration": 2.4429931640625 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.02315295, + "balance_loss_mlp": 1.0344708, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.6344709162264897, + "language_loss": 0.80310905, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82447577, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13424, + "time_per_iteration": 2.46093487739563 + }, + { + "auxiliary_loss_clip": 0.01100202, + "auxiliary_loss_mlp": 0.01025321, + "balance_loss_clip": 1.01418114, + "balance_loss_mlp": 1.03266358, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.4436773740307707, + "language_loss": 0.79038882, + "learning_rate": 3.774909786710232e-07, + "loss": 0.81164408, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 13425, + "time_per_iteration": 2.4803316593170166 + }, + { + "auxiliary_loss_clip": 0.01097262, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.01813531, + "balance_loss_mlp": 1.03308177, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.7525970280950185, + "language_loss": 0.75375247, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77501363, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13426, + "time_per_iteration": 2.450507164001465 + }, + { + "auxiliary_loss_clip": 0.01099759, + "auxiliary_loss_mlp": 0.01023019, + "balance_loss_clip": 1.01242161, + "balance_loss_mlp": 1.0346477, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.7578530732787132, + "language_loss": 0.72718084, + "learning_rate": 3.770356705530997e-07, + "loss": 0.74840856, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65234375, + "step": 13427, + "time_per_iteration": 2.503213405609131 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.02561057, + "balance_loss_mlp": 1.03508282, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.5806827862127526, + "language_loss": 0.69905955, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72044694, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 13428, + "time_per_iteration": 3.9154212474823 + }, + { + "auxiliary_loss_clip": 0.01100554, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01791382, + "balance_loss_mlp": 1.03464985, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 1.7256897581307475, + "language_loss": 0.74537814, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76666689, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 13429, + "time_per_iteration": 3.8353562355041504 + }, + { + "auxiliary_loss_clip": 0.01096625, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01703608, + "balance_loss_mlp": 1.03373289, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 1.6508081444527534, + "language_loss": 0.66780758, + "learning_rate": 3.763531699700568e-07, + "loss": 0.68905354, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 13430, + "time_per_iteration": 3.8374648094177246 + }, + { + "auxiliary_loss_clip": 0.01099608, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.01557398, + "balance_loss_mlp": 1.03463328, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.7401371599211086, + "language_loss": 0.80040669, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82166731, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 13431, + "time_per_iteration": 2.4570858478546143 + }, + { + "auxiliary_loss_clip": 0.0109803, + "auxiliary_loss_mlp": 0.01026665, + "balance_loss_clip": 1.01483989, + "balance_loss_mlp": 1.03383279, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 1.7148074756954637, + "language_loss": 0.80367452, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82492149, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 13432, + "time_per_iteration": 2.436596155166626 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.01817179, + "balance_loss_mlp": 1.03757977, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 2.2592964465029524, + "language_loss": 0.70442599, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72579002, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 13433, + "time_per_iteration": 3.920189142227173 + }, + { + "auxiliary_loss_clip": 0.01099536, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.01615393, + "balance_loss_mlp": 1.03418195, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.542675300330219, + "language_loss": 0.72662854, + "learning_rate": 3.754440311967828e-07, + "loss": 0.7478978, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13434, + "time_per_iteration": 2.579868793487549 + }, + { + "auxiliary_loss_clip": 0.01102649, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.0162847, + "balance_loss_mlp": 1.03770304, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 2.005086491420573, + "language_loss": 0.68262374, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70392191, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13435, + "time_per_iteration": 2.4222404956817627 + }, + { + "auxiliary_loss_clip": 0.01103995, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.0194422, + "balance_loss_mlp": 1.03674603, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 3.3656711098835967, + "language_loss": 0.75132048, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77268833, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.671875, + "step": 13436, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.0109426, + "auxiliary_loss_mlp": 0.01025813, + "balance_loss_clip": 1.0144937, + "balance_loss_mlp": 1.03109729, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.6647863446224558, + "language_loss": 0.70325077, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72445142, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 13437, + "time_per_iteration": 2.486090660095215 + }, + { + "auxiliary_loss_clip": 0.01100581, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.01785624, + "balance_loss_mlp": 1.03698409, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.6509450108109354, + "language_loss": 0.73176312, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75305283, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 13438, + "time_per_iteration": 2.50380277633667 + }, + { + "auxiliary_loss_clip": 0.01097642, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01776934, + "balance_loss_mlp": 1.03364897, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 2.024260239106251, + "language_loss": 0.77098519, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79224879, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13439, + "time_per_iteration": 2.4301347732543945 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01032366, + "balance_loss_clip": 1.02110648, + "balance_loss_mlp": 1.03528643, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.5099931355487166, + "language_loss": 0.78758991, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80891412, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 13440, + "time_per_iteration": 2.5148396492004395 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.0156852, + "balance_loss_mlp": 1.03371847, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 2.4238426069690138, + "language_loss": 0.59080982, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.61209911, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 13441, + "time_per_iteration": 2.4081990718841553 + }, + { + "auxiliary_loss_clip": 0.01099503, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.01740527, + "balance_loss_mlp": 1.0348506, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 2.0479440790186696, + "language_loss": 0.76248497, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.7837764, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 13442, + "time_per_iteration": 2.4318795204162598 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.01610458, + "balance_loss_mlp": 1.03397799, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.4112567306342216, + "language_loss": 0.7047745, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72603905, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13443, + "time_per_iteration": 2.59014892578125 + }, + { + "auxiliary_loss_clip": 0.01098748, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.03480315, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 1.822282396232332, + "language_loss": 0.82413107, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84547687, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13444, + "time_per_iteration": 2.4474878311157227 + }, + { + "auxiliary_loss_clip": 0.01023449, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00072086, + "balance_loss_mlp": 1.00341463, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8424351195501338, + "language_loss": 0.53699923, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55725062, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 13445, + "time_per_iteration": 2.893291473388672 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.0172683, + "balance_loss_mlp": 1.03552246, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 3.4905942687321514, + "language_loss": 0.72271657, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.7440058, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 13446, + "time_per_iteration": 2.444681167602539 + }, + { + "auxiliary_loss_clip": 0.01104016, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.01638484, + "balance_loss_mlp": 1.03635025, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 1.848843080578613, + "language_loss": 0.71273375, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73406231, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 13447, + "time_per_iteration": 2.4645798206329346 + }, + { + "auxiliary_loss_clip": 0.01101801, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.01762831, + "balance_loss_mlp": 1.03445625, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.247658895940983, + "language_loss": 0.75123751, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77255571, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13448, + "time_per_iteration": 2.393450975418091 + }, + { + "auxiliary_loss_clip": 0.01023124, + "auxiliary_loss_mlp": 0.01004466, + "balance_loss_clip": 1.00351226, + "balance_loss_mlp": 1.00315809, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7387933827172105, + "language_loss": 0.63826883, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65854478, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 13449, + "time_per_iteration": 3.089714288711548 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.01415896, + "balance_loss_mlp": 1.03577614, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.6804570803632504, + "language_loss": 0.73693436, + "learning_rate": 3.718173381422105e-07, + "loss": 0.75820303, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13450, + "time_per_iteration": 2.4564168453216553 + }, + { + "auxiliary_loss_clip": 0.01098501, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.03300965, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.9099167962984258, + "language_loss": 0.73742312, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75869507, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13451, + "time_per_iteration": 2.4158482551574707 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.01489806, + "balance_loss_mlp": 1.03483427, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.7227484700125357, + "language_loss": 0.80100703, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82232833, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 13452, + "time_per_iteration": 2.4718620777130127 + }, + { + "auxiliary_loss_clip": 0.01101927, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02291918, + "balance_loss_mlp": 1.03578424, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 3.0126577683381246, + "language_loss": 0.78564459, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80700684, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13453, + "time_per_iteration": 2.4844601154327393 + }, + { + "auxiliary_loss_clip": 0.01098121, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.01561522, + "balance_loss_mlp": 1.03227544, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 3.8296608208980762, + "language_loss": 0.77353287, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79479295, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13454, + "time_per_iteration": 2.4381799697875977 + }, + { + "auxiliary_loss_clip": 0.01097801, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.01824236, + "balance_loss_mlp": 1.03329492, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.6657620401651272, + "language_loss": 0.7656436, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78692257, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13455, + "time_per_iteration": 2.489542007446289 + }, + { + "auxiliary_loss_clip": 0.01099517, + "auxiliary_loss_mlp": 0.01026999, + "balance_loss_clip": 1.01534009, + "balance_loss_mlp": 1.03383231, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.6998136101737356, + "language_loss": 0.78545928, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80672443, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13456, + "time_per_iteration": 2.493177890777588 + }, + { + "auxiliary_loss_clip": 0.0110144, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02302623, + "balance_loss_mlp": 1.03721011, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 2.0933677582295265, + "language_loss": 0.71244174, + "learning_rate": 3.702356279949801e-07, + "loss": 0.7337954, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13457, + "time_per_iteration": 2.50264835357666 + }, + { + "auxiliary_loss_clip": 0.01099652, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.01955891, + "balance_loss_mlp": 1.03506947, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.8174801771969786, + "language_loss": 0.72725999, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74855614, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.64453125, + "step": 13458, + "time_per_iteration": 2.4687604904174805 + }, + { + "auxiliary_loss_clip": 0.01100692, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.0188396, + "balance_loss_mlp": 1.03538537, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 9.489100593414795, + "language_loss": 0.78715897, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.80847281, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13459, + "time_per_iteration": 2.459733724594116 + }, + { + "auxiliary_loss_clip": 0.01101626, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.0167737, + "balance_loss_mlp": 1.03463078, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 2.336644313106872, + "language_loss": 0.80171156, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82301795, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 13460, + "time_per_iteration": 2.4446218013763428 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.01998675, + "balance_loss_mlp": 1.03367543, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.8186095835503757, + "language_loss": 0.84812057, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86945391, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 13461, + "time_per_iteration": 2.400993824005127 + }, + { + "auxiliary_loss_clip": 0.0110315, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02418709, + "balance_loss_mlp": 1.03562999, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.848257188475226, + "language_loss": 0.76413333, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78552705, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13462, + "time_per_iteration": 2.476879358291626 + }, + { + "auxiliary_loss_clip": 0.01102421, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.02077329, + "balance_loss_mlp": 1.03630662, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 2.0825422363355948, + "language_loss": 0.82803857, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.84939778, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 13463, + "time_per_iteration": 2.4521071910858154 + }, + { + "auxiliary_loss_clip": 0.01099095, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.0219866, + "balance_loss_mlp": 1.03470421, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 5.844094604109171, + "language_loss": 0.62201041, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64332885, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 13464, + "time_per_iteration": 2.424069404602051 + }, + { + "auxiliary_loss_clip": 0.01097475, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01769936, + "balance_loss_mlp": 1.03367341, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.5079882222781815, + "language_loss": 0.61727977, + "learning_rate": 3.684316674755341e-07, + "loss": 0.63853657, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 13465, + "time_per_iteration": 2.4566633701324463 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02232695, + "balance_loss_mlp": 1.03666687, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.6661852596704285, + "language_loss": 0.81980264, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84115314, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 13466, + "time_per_iteration": 2.4338531494140625 + }, + { + "auxiliary_loss_clip": 0.01103687, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.0364902, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.9379402602159286, + "language_loss": 0.76123488, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.78260958, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 13467, + "time_per_iteration": 2.5050792694091797 + }, + { + "auxiliary_loss_clip": 0.01096837, + "auxiliary_loss_mlp": 0.01024677, + "balance_loss_clip": 1.01335227, + "balance_loss_mlp": 1.03137767, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 1.827082362684537, + "language_loss": 0.79509449, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81630957, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13468, + "time_per_iteration": 2.4531302452087402 + }, + { + "auxiliary_loss_clip": 0.01095055, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01831162, + "balance_loss_mlp": 1.03250098, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 2.16796452248889, + "language_loss": 0.67542112, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69666153, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 13469, + "time_per_iteration": 2.4108262062072754 + }, + { + "auxiliary_loss_clip": 0.01022711, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.00256538, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6937103683167268, + "language_loss": 0.54675603, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56701303, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20117188, + "step": 13470, + "time_per_iteration": 4.479866027832031 + }, + { + "auxiliary_loss_clip": 0.01098507, + "auxiliary_loss_mlp": 0.01029422, + "balance_loss_clip": 1.01866305, + "balance_loss_mlp": 1.03306627, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.8860877389353394, + "language_loss": 0.69289327, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71417254, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 13471, + "time_per_iteration": 3.869608163833618 + }, + { + "auxiliary_loss_clip": 0.01099092, + "auxiliary_loss_mlp": 0.01024651, + "balance_loss_clip": 1.01318324, + "balance_loss_mlp": 1.03399885, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.6760618774214828, + "language_loss": 0.79667246, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.8179099, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13472, + "time_per_iteration": 3.874138355255127 + }, + { + "auxiliary_loss_clip": 0.01022918, + "auxiliary_loss_mlp": 0.01002051, + "balance_loss_clip": 1.00103176, + "balance_loss_mlp": 1.00270104, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7480344788925887, + "language_loss": 0.57732165, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59757125, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20214844, + "step": 13473, + "time_per_iteration": 2.9958584308624268 + }, + { + "auxiliary_loss_clip": 0.01101746, + "auxiliary_loss_mlp": 0.01028502, + "balance_loss_clip": 1.01672435, + "balance_loss_mlp": 1.03484774, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 2.1302900400638727, + "language_loss": 0.73930925, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76061177, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 13474, + "time_per_iteration": 2.4078030586242676 + }, + { + "auxiliary_loss_clip": 0.01103776, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.03630209, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.7301806591727757, + "language_loss": 0.79092455, + "learning_rate": 3.661822855683723e-07, + "loss": 0.81228995, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13475, + "time_per_iteration": 3.9835152626037598 + }, + { + "auxiliary_loss_clip": 0.01098463, + "auxiliary_loss_mlp": 0.01034903, + "balance_loss_clip": 1.02390599, + "balance_loss_mlp": 1.0341773, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.536909800209771, + "language_loss": 0.75346851, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77480221, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13476, + "time_per_iteration": 2.462615728378296 + }, + { + "auxiliary_loss_clip": 0.01102891, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.02225113, + "balance_loss_mlp": 1.03499579, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.160073073181854, + "language_loss": 0.73751932, + "learning_rate": 3.657331523685485e-07, + "loss": 0.75889993, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 13477, + "time_per_iteration": 2.396301031112671 + }, + { + "auxiliary_loss_clip": 0.01098807, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.02095175, + "balance_loss_mlp": 1.03388894, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 2.052818200341471, + "language_loss": 0.69685113, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.71815884, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13478, + "time_per_iteration": 2.429624557495117 + }, + { + "auxiliary_loss_clip": 0.01022393, + "auxiliary_loss_mlp": 0.00998048, + "balance_loss_clip": 0.99711275, + "balance_loss_mlp": 1.00228572, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6817621273337255, + "language_loss": 0.52143216, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54163659, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 13479, + "time_per_iteration": 2.9901397228240967 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.0192523, + "balance_loss_mlp": 1.03441501, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.7169368988258746, + "language_loss": 0.71180439, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73309743, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 13480, + "time_per_iteration": 2.455625295639038 + }, + { + "auxiliary_loss_clip": 0.01101048, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.01992917, + "balance_loss_mlp": 1.03478885, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.8294691640440002, + "language_loss": 0.79820704, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81952655, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13481, + "time_per_iteration": 2.4734697341918945 + }, + { + "auxiliary_loss_clip": 0.01098939, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01901519, + "balance_loss_mlp": 1.03369451, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.7697725439272614, + "language_loss": 0.72478992, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74608201, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13482, + "time_per_iteration": 2.439736843109131 + }, + { + "auxiliary_loss_clip": 0.01100486, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.01400197, + "balance_loss_mlp": 1.03284776, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.090719044205904, + "language_loss": 0.65953445, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.68079859, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13483, + "time_per_iteration": 2.444854259490967 + }, + { + "auxiliary_loss_clip": 0.01099291, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.01493549, + "balance_loss_mlp": 1.0334847, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.6031206001682317, + "language_loss": 0.76335526, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78461868, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 13484, + "time_per_iteration": 2.464406728744507 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.02137613, + "balance_loss_mlp": 1.03535843, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.6053200724727246, + "language_loss": 0.72207975, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74346977, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 13485, + "time_per_iteration": 2.4401731491088867 + }, + { + "auxiliary_loss_clip": 0.01096529, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.01576591, + "balance_loss_mlp": 1.03370321, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.8011758886581477, + "language_loss": 0.75758684, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77881885, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 13486, + "time_per_iteration": 2.4450883865356445 + }, + { + "auxiliary_loss_clip": 0.01102508, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.01787281, + "balance_loss_mlp": 1.03437519, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 2.036003416638632, + "language_loss": 0.72445893, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74577975, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13487, + "time_per_iteration": 2.4479668140411377 + }, + { + "auxiliary_loss_clip": 0.01097974, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.01929998, + "balance_loss_mlp": 1.03596091, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.6462408552196026, + "language_loss": 0.84215033, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86343014, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 13488, + "time_per_iteration": 2.4955086708068848 + }, + { + "auxiliary_loss_clip": 0.01102332, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.01954138, + "balance_loss_mlp": 1.03592181, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.790341978719953, + "language_loss": 0.73587167, + "learning_rate": 3.630435611625502e-07, + "loss": 0.75720966, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13489, + "time_per_iteration": 2.4345548152923584 + }, + { + "auxiliary_loss_clip": 0.0109709, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.01749909, + "balance_loss_mlp": 1.03397191, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 2.056957564654556, + "language_loss": 0.71371531, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73497528, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 13490, + "time_per_iteration": 2.458850622177124 + }, + { + "auxiliary_loss_clip": 0.01103063, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.02602684, + "balance_loss_mlp": 1.03627634, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 2.627624655824894, + "language_loss": 0.72095811, + "learning_rate": 3.625961645949762e-07, + "loss": 0.74237174, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13491, + "time_per_iteration": 2.435976266860962 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03369725, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.475540339254428, + "language_loss": 0.67907929, + "learning_rate": 3.623725594427245e-07, + "loss": 0.70036656, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13492, + "time_per_iteration": 2.4556221961975098 + }, + { + "auxiliary_loss_clip": 0.0110221, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.01752472, + "balance_loss_mlp": 1.03487253, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 2.049238778723241, + "language_loss": 0.72220272, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.7435168, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 13493, + "time_per_iteration": 2.447108030319214 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.02388942, + "balance_loss_mlp": 1.03342462, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.6962255282126324, + "language_loss": 0.70346606, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72482872, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 13494, + "time_per_iteration": 2.503356456756592 + }, + { + "auxiliary_loss_clip": 0.01104239, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.02064085, + "balance_loss_mlp": 1.03645778, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.1915991847762966, + "language_loss": 0.76373303, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78511035, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 13495, + "time_per_iteration": 2.4522132873535156 + }, + { + "auxiliary_loss_clip": 0.01100729, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.0229249, + "balance_loss_mlp": 1.03444338, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.9013360274745676, + "language_loss": 0.80117953, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82253182, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13496, + "time_per_iteration": 2.4913132190704346 + }, + { + "auxiliary_loss_clip": 0.01100021, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.02057636, + "balance_loss_mlp": 1.03446186, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.6264176986100232, + "language_loss": 0.70963192, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73096704, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.65625, + "step": 13497, + "time_per_iteration": 2.4535224437713623 + }, + { + "auxiliary_loss_clip": 0.01101999, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.01754749, + "balance_loss_mlp": 1.03544033, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.6440600929050224, + "language_loss": 0.76892304, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79022652, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 13498, + "time_per_iteration": 2.4672887325286865 + }, + { + "auxiliary_loss_clip": 0.01099408, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.02421904, + "balance_loss_mlp": 1.03345525, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 1.8169104035593646, + "language_loss": 0.83573735, + "learning_rate": 3.608090626234055e-07, + "loss": 0.85709327, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 13499, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.01098883, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.01583743, + "balance_loss_mlp": 1.0345273, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.6399516291980092, + "language_loss": 0.7623418, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78361559, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.640625, + "step": 13500, + "time_per_iteration": 2.4722161293029785 + }, + { + "auxiliary_loss_clip": 0.01022943, + "auxiliary_loss_mlp": 0.01002875, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.0028348, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8054655192024942, + "language_loss": 0.59980321, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62006134, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 13501, + "time_per_iteration": 3.075920820236206 + }, + { + "auxiliary_loss_clip": 0.01096394, + "auxiliary_loss_mlp": 0.01025554, + "balance_loss_clip": 1.01445556, + "balance_loss_mlp": 1.03376746, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.494419100022629, + "language_loss": 0.7909618, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81218129, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.625, + "step": 13502, + "time_per_iteration": 2.4842851161956787 + }, + { + "auxiliary_loss_clip": 0.01099167, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.01933587, + "balance_loss_mlp": 1.03465641, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8524419382640553, + "language_loss": 0.71067178, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73196828, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 13503, + "time_per_iteration": 2.4225172996520996 + }, + { + "auxiliary_loss_clip": 0.01100085, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.01456833, + "balance_loss_mlp": 1.0341773, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.453799987643089, + "language_loss": 0.67700541, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69828039, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66015625, + "step": 13504, + "time_per_iteration": 2.6496918201446533 + }, + { + "auxiliary_loss_clip": 0.01098923, + "auxiliary_loss_mlp": 0.0102621, + "balance_loss_clip": 1.01401496, + "balance_loss_mlp": 1.03196406, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 1.9644950813990756, + "language_loss": 0.7421549, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76340622, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 13505, + "time_per_iteration": 2.7024779319763184 + }, + { + "auxiliary_loss_clip": 0.01101045, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.01418757, + "balance_loss_mlp": 1.03452218, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.0008882636590863, + "language_loss": 0.72537345, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.7466535, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 13506, + "time_per_iteration": 2.6178457736968994 + }, + { + "auxiliary_loss_clip": 0.01103788, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01809824, + "balance_loss_mlp": 1.03516591, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.171112313487914, + "language_loss": 0.76039851, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78173417, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 13507, + "time_per_iteration": 2.4627292156219482 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.01539683, + "balance_loss_mlp": 1.03240228, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.60963129447103, + "language_loss": 0.70528185, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72655416, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13508, + "time_per_iteration": 2.5088045597076416 + }, + { + "auxiliary_loss_clip": 0.01098821, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01806569, + "balance_loss_mlp": 1.03429413, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.6081819995650735, + "language_loss": 0.75921357, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78049272, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13509, + "time_per_iteration": 2.5896267890930176 + }, + { + "auxiliary_loss_clip": 0.0110263, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.01973832, + "balance_loss_mlp": 1.03542197, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 2.5116531958585377, + "language_loss": 0.76849926, + "learning_rate": 3.58358293835491e-07, + "loss": 0.78984267, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 13510, + "time_per_iteration": 2.5139570236206055 + }, + { + "auxiliary_loss_clip": 0.01102069, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.03460789, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.7922346850114963, + "language_loss": 0.69833112, + "learning_rate": 3.581358700114212e-07, + "loss": 0.71966922, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 13511, + "time_per_iteration": 3.824244737625122 + }, + { + "auxiliary_loss_clip": 0.01102581, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.02370417, + "balance_loss_mlp": 1.03556788, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.6910823817880791, + "language_loss": 0.79742736, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81880891, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13512, + "time_per_iteration": 3.9224977493286133 + }, + { + "auxiliary_loss_clip": 0.01096955, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.01904023, + "balance_loss_mlp": 1.03273702, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.6257879810595937, + "language_loss": 0.63466936, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65594023, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 13513, + "time_per_iteration": 4.236290454864502 + }, + { + "auxiliary_loss_clip": 0.01102479, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.02036476, + "balance_loss_mlp": 1.03472722, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.7631319357597248, + "language_loss": 0.71392423, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73526937, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 13514, + "time_per_iteration": 2.4483985900878906 + }, + { + "auxiliary_loss_clip": 0.01096174, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.016523, + "balance_loss_mlp": 1.03252888, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.5280686394731957, + "language_loss": 0.62873226, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.64997381, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13515, + "time_per_iteration": 2.4698777198791504 + }, + { + "auxiliary_loss_clip": 0.01093097, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.01553404, + "balance_loss_mlp": 1.03191626, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 2.3397384138654482, + "language_loss": 0.7533434, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77454519, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.609375, + "step": 13516, + "time_per_iteration": 2.437889337539673 + }, + { + "auxiliary_loss_clip": 0.01101307, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.02078295, + "balance_loss_mlp": 1.03491974, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.450755820656369, + "language_loss": 0.91134322, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.9326756, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 13517, + "time_per_iteration": 3.8920905590057373 + }, + { + "auxiliary_loss_clip": 0.01103043, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.01977992, + "balance_loss_mlp": 1.03670573, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.4138276648329293, + "language_loss": 0.78618169, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80752319, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 13518, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.01901174, + "balance_loss_mlp": 1.03644419, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.5355148444526316, + "language_loss": 0.7910862, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81238753, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.64453125, + "step": 13519, + "time_per_iteration": 2.4910452365875244 + }, + { + "auxiliary_loss_clip": 0.0109989, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02146196, + "balance_loss_mlp": 1.03325295, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.5645727915672079, + "language_loss": 0.70513344, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72646499, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13520, + "time_per_iteration": 2.485353708267212 + }, + { + "auxiliary_loss_clip": 0.01101276, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01775169, + "balance_loss_mlp": 1.03453207, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.408947951983829, + "language_loss": 0.72724366, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74854898, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 13521, + "time_per_iteration": 2.450223684310913 + }, + { + "auxiliary_loss_clip": 0.01100366, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.01657629, + "balance_loss_mlp": 1.03304505, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.5863649174489216, + "language_loss": 0.70147657, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.7227633, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 13522, + "time_per_iteration": 2.5076496601104736 + }, + { + "auxiliary_loss_clip": 0.01096847, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.01915908, + "balance_loss_mlp": 1.03422368, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.5124690207001534, + "language_loss": 0.70565176, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72691941, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 13523, + "time_per_iteration": 2.44834566116333 + }, + { + "auxiliary_loss_clip": 0.01098014, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01813328, + "balance_loss_mlp": 1.03282428, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.3974878608066192, + "language_loss": 0.71435654, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.73563313, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13524, + "time_per_iteration": 2.3961422443389893 + }, + { + "auxiliary_loss_clip": 0.01099359, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.01874423, + "balance_loss_mlp": 1.03370309, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.672589680001, + "language_loss": 0.62591136, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64720273, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13525, + "time_per_iteration": 2.5271544456481934 + }, + { + "auxiliary_loss_clip": 0.01098837, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.02077389, + "balance_loss_mlp": 1.03405976, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.6477512621572448, + "language_loss": 0.65588397, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67719084, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13526, + "time_per_iteration": 2.540858745574951 + }, + { + "auxiliary_loss_clip": 0.0109667, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03244853, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.5159907981039755, + "language_loss": 0.74966121, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77090245, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 13527, + "time_per_iteration": 2.5109777450561523 + }, + { + "auxiliary_loss_clip": 0.01097482, + "auxiliary_loss_mlp": 0.01024208, + "balance_loss_clip": 1.01289546, + "balance_loss_mlp": 1.0329324, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 2.97186527291202, + "language_loss": 0.7050457, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.72626257, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 13528, + "time_per_iteration": 2.485001564025879 + }, + { + "auxiliary_loss_clip": 0.01101089, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01688933, + "balance_loss_mlp": 1.03395605, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 1.7819953771263581, + "language_loss": 0.68812644, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.70941776, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 13529, + "time_per_iteration": 2.422464370727539 + }, + { + "auxiliary_loss_clip": 0.01095559, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.01693559, + "balance_loss_mlp": 1.03227735, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.3700358938115667, + "language_loss": 0.77336764, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79460108, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13530, + "time_per_iteration": 2.4621317386627197 + }, + { + "auxiliary_loss_clip": 0.01100258, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01672745, + "balance_loss_mlp": 1.0348835, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 1.7581333634439877, + "language_loss": 0.82189894, + "learning_rate": 3.537004792574052e-07, + "loss": 0.8431893, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 13531, + "time_per_iteration": 2.427777051925659 + }, + { + "auxiliary_loss_clip": 0.01100801, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.01733541, + "balance_loss_mlp": 1.03366113, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 3.204591794551847, + "language_loss": 0.71781331, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73911971, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13532, + "time_per_iteration": 2.3813064098358154 + }, + { + "auxiliary_loss_clip": 0.01098279, + "auxiliary_loss_mlp": 0.01025257, + "balance_loss_clip": 1.0142777, + "balance_loss_mlp": 1.03366661, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 1.7506531009723905, + "language_loss": 0.76182723, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78306258, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13533, + "time_per_iteration": 2.438998222351074 + }, + { + "auxiliary_loss_clip": 0.0110237, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02322412, + "balance_loss_mlp": 1.03353691, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 2.51446757453604, + "language_loss": 0.7628231, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78420353, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 13534, + "time_per_iteration": 2.441894292831421 + }, + { + "auxiliary_loss_clip": 0.01099028, + "auxiliary_loss_mlp": 0.01024057, + "balance_loss_clip": 1.01414454, + "balance_loss_mlp": 1.03443563, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.0308122816810448, + "language_loss": 0.92820883, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.94943964, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.6484375, + "step": 13535, + "time_per_iteration": 2.4160375595092773 + }, + { + "auxiliary_loss_clip": 0.01097801, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01700234, + "balance_loss_mlp": 1.03462958, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.8424678375947172, + "language_loss": 0.70300984, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72426724, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13536, + "time_per_iteration": 2.456167459487915 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.03421807, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.5997718183114498, + "language_loss": 0.7515735, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77286726, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 13537, + "time_per_iteration": 2.465872287750244 + }, + { + "auxiliary_loss_clip": 0.01098101, + "auxiliary_loss_mlp": 0.01025651, + "balance_loss_clip": 1.01411164, + "balance_loss_mlp": 1.03465509, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.4929419897063716, + "language_loss": 0.76306385, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78430134, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 13538, + "time_per_iteration": 2.436523914337158 + }, + { + "auxiliary_loss_clip": 0.0109732, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.01836777, + "balance_loss_mlp": 1.03199041, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.552319087544461, + "language_loss": 0.77843738, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.79970831, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 13539, + "time_per_iteration": 2.45881986618042 + }, + { + "auxiliary_loss_clip": 0.01098918, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.02183998, + "balance_loss_mlp": 1.03521299, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.7567444964119603, + "language_loss": 0.66205287, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68337071, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 13540, + "time_per_iteration": 2.566528797149658 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.02160239, + "balance_loss_mlp": 1.03553224, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.5718398133314953, + "language_loss": 0.67359984, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69493288, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 13541, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01098261, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.01922774, + "balance_loss_mlp": 1.03353024, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 2.2070467934510534, + "language_loss": 0.6900422, + "learning_rate": 3.512716539904355e-07, + "loss": 0.7113353, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13542, + "time_per_iteration": 2.4056601524353027 + }, + { + "auxiliary_loss_clip": 0.01103316, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01861072, + "balance_loss_mlp": 1.03395188, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.5090271200774317, + "language_loss": 0.79490924, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.81625485, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 13543, + "time_per_iteration": 2.40470814704895 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.02544892, + "balance_loss_mlp": 1.03483176, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 4.984235141468566, + "language_loss": 0.77592224, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79733926, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 13544, + "time_per_iteration": 2.400831699371338 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01703811, + "balance_loss_mlp": 1.03668332, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 2.959736733098292, + "language_loss": 0.73320651, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75459218, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 13545, + "time_per_iteration": 2.4131081104278564 + }, + { + "auxiliary_loss_clip": 0.01097302, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01790035, + "balance_loss_mlp": 1.03349757, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 3.066983178080017, + "language_loss": 0.76798058, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78924346, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 13546, + "time_per_iteration": 2.468132495880127 + }, + { + "auxiliary_loss_clip": 0.01103793, + "auxiliary_loss_mlp": 0.01027092, + "balance_loss_clip": 1.01630902, + "balance_loss_mlp": 1.03670955, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 3.13218822549319, + "language_loss": 0.70365715, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72496605, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 13547, + "time_per_iteration": 2.4655685424804688 + }, + { + "auxiliary_loss_clip": 0.01103958, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.02147591, + "balance_loss_mlp": 1.03629994, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 2.7390285588234913, + "language_loss": 0.70459747, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72597909, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13548, + "time_per_iteration": 2.470452070236206 + }, + { + "auxiliary_loss_clip": 0.01104253, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.02054405, + "balance_loss_mlp": 1.03680122, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.3353911072651794, + "language_loss": 0.76804066, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.7894153, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13549, + "time_per_iteration": 2.4691059589385986 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.01863968, + "balance_loss_mlp": 1.03536999, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 1.9549625457918085, + "language_loss": 0.71548051, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73679399, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13550, + "time_per_iteration": 2.422795057296753 + }, + { + "auxiliary_loss_clip": 0.01096222, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.01402998, + "balance_loss_mlp": 1.03409493, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.9969484148682712, + "language_loss": 0.71753186, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.73874509, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62109375, + "step": 13551, + "time_per_iteration": 2.409451723098755 + }, + { + "auxiliary_loss_clip": 0.01106922, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.01781511, + "balance_loss_mlp": 1.03745246, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.9327416746380717, + "language_loss": 0.68366426, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.7050426, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 13552, + "time_per_iteration": 2.406599760055542 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.02616453, + "balance_loss_mlp": 1.03343797, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 2.3058908997285377, + "language_loss": 0.82212341, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84348869, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 13553, + "time_per_iteration": 3.8188424110412598 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.01406431, + "balance_loss_mlp": 1.03445435, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 1.7247482274823256, + "language_loss": 0.68057621, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70183313, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13554, + "time_per_iteration": 3.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01726937, + "balance_loss_mlp": 1.03757811, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.6822328630436465, + "language_loss": 0.66322923, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68456268, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 13555, + "time_per_iteration": 3.930266857147217 + }, + { + "auxiliary_loss_clip": 0.0110298, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02141881, + "balance_loss_mlp": 1.03490579, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 1.8062720841760551, + "language_loss": 0.73134083, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75270575, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 13556, + "time_per_iteration": 2.4044859409332275 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.01595473, + "balance_loss_mlp": 1.03617573, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.547127180086827, + "language_loss": 0.80460906, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.8258785, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 13557, + "time_per_iteration": 2.415175199508667 + }, + { + "auxiliary_loss_clip": 0.01104379, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.01966846, + "balance_loss_mlp": 1.03623772, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 3.4496613594864227, + "language_loss": 0.65522265, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.67657804, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 13558, + "time_per_iteration": 3.919435977935791 + }, + { + "auxiliary_loss_clip": 0.01022856, + "auxiliary_loss_mlp": 0.01007035, + "balance_loss_clip": 1.00606906, + "balance_loss_mlp": 1.00276268, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.9540585535167397, + "language_loss": 0.568519, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58881789, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 13559, + "time_per_iteration": 2.9688994884490967 + }, + { + "auxiliary_loss_clip": 0.01023096, + "auxiliary_loss_mlp": 0.0100422, + "balance_loss_clip": 1.00320113, + "balance_loss_mlp": 1.0028497, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6777740106717581, + "language_loss": 0.5530026, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57327569, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.203125, + "step": 13560, + "time_per_iteration": 2.901609182357788 + }, + { + "auxiliary_loss_clip": 0.01098355, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.01828361, + "balance_loss_mlp": 1.03364336, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.715897445640704, + "language_loss": 0.67507559, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69635296, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13561, + "time_per_iteration": 2.423055648803711 + }, + { + "auxiliary_loss_clip": 0.01104045, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.01900613, + "balance_loss_mlp": 1.03551221, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.570606711113296, + "language_loss": 0.81580901, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83715606, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 13562, + "time_per_iteration": 2.458662271499634 + }, + { + "auxiliary_loss_clip": 0.0109998, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.0185647, + "balance_loss_mlp": 1.03420377, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.823972235032081, + "language_loss": 0.72110701, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74240875, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13563, + "time_per_iteration": 2.4745733737945557 + }, + { + "auxiliary_loss_clip": 0.01099418, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.01654577, + "balance_loss_mlp": 1.03244758, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.563191815862559, + "language_loss": 0.70054388, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72182882, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13564, + "time_per_iteration": 2.4845049381256104 + }, + { + "auxiliary_loss_clip": 0.01101322, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01533651, + "balance_loss_mlp": 1.0358603, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.9946584729028405, + "language_loss": 0.70459116, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72587204, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13565, + "time_per_iteration": 2.4145777225494385 + }, + { + "auxiliary_loss_clip": 0.0109904, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.02238798, + "balance_loss_mlp": 1.03453624, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.6936331976057795, + "language_loss": 0.78862619, + "learning_rate": 3.459986724180188e-07, + "loss": 0.80995858, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 13566, + "time_per_iteration": 2.4679157733917236 + }, + { + "auxiliary_loss_clip": 0.01099231, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.01951647, + "balance_loss_mlp": 1.03541529, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.7251861145582532, + "language_loss": 0.82568282, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84697807, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13567, + "time_per_iteration": 2.4739766120910645 + }, + { + "auxiliary_loss_clip": 0.01097184, + "auxiliary_loss_mlp": 0.01025183, + "balance_loss_clip": 1.01482391, + "balance_loss_mlp": 1.03362584, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.02909207934991, + "language_loss": 0.7959435, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81716716, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.63671875, + "step": 13568, + "time_per_iteration": 2.456554889678955 + }, + { + "auxiliary_loss_clip": 0.01095954, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01694787, + "balance_loss_mlp": 1.03310943, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.8416659352028353, + "language_loss": 0.77024674, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79148549, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 13569, + "time_per_iteration": 2.476374626159668 + }, + { + "auxiliary_loss_clip": 0.01098074, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02225435, + "balance_loss_mlp": 1.03394771, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.224542693134122, + "language_loss": 0.58551776, + "learning_rate": 3.451233513649199e-07, + "loss": 0.60682887, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13570, + "time_per_iteration": 2.5146484375 + }, + { + "auxiliary_loss_clip": 0.01103281, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.02286851, + "balance_loss_mlp": 1.03557253, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 2.075350535427022, + "language_loss": 0.82674634, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84812641, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 13571, + "time_per_iteration": 2.4438490867614746 + }, + { + "auxiliary_loss_clip": 0.01100306, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.02435327, + "balance_loss_mlp": 1.03504193, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.3390171300430223, + "language_loss": 0.78043985, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80180776, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 13572, + "time_per_iteration": 2.3682100772857666 + }, + { + "auxiliary_loss_clip": 0.01101131, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.02277076, + "balance_loss_mlp": 1.03415191, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.653683230852661, + "language_loss": 0.64836442, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.66971791, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13573, + "time_per_iteration": 2.4918434619903564 + }, + { + "auxiliary_loss_clip": 0.01099065, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.01802051, + "balance_loss_mlp": 1.03432262, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.5818326048732438, + "language_loss": 0.75434422, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77562207, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13574, + "time_per_iteration": 2.4578306674957275 + }, + { + "auxiliary_loss_clip": 0.01101338, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.0202359, + "balance_loss_mlp": 1.03471351, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 2.0897739522455345, + "language_loss": 0.59801751, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61935258, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 13575, + "time_per_iteration": 2.5205721855163574 + }, + { + "auxiliary_loss_clip": 0.01100012, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01865935, + "balance_loss_mlp": 1.03423405, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.797663124908432, + "language_loss": 0.7433396, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76464796, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 13576, + "time_per_iteration": 2.407588243484497 + }, + { + "auxiliary_loss_clip": 0.01022867, + "auxiliary_loss_mlp": 0.01000366, + "balance_loss_clip": 0.99935305, + "balance_loss_mlp": 1.00269318, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8261597243794896, + "language_loss": 0.58621252, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60644484, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 13577, + "time_per_iteration": 3.03220534324646 + }, + { + "auxiliary_loss_clip": 0.01096665, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.01725233, + "balance_loss_mlp": 1.03460181, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.6864707181189702, + "language_loss": 0.71403098, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73527777, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62109375, + "step": 13578, + "time_per_iteration": 2.424729585647583 + }, + { + "auxiliary_loss_clip": 0.01099052, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.01565957, + "balance_loss_mlp": 1.03452241, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.7521763142513538, + "language_loss": 0.73644769, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75770712, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 13579, + "time_per_iteration": 2.449126720428467 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01587224, + "balance_loss_mlp": 1.03481781, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 1.8185977038329606, + "language_loss": 0.78892076, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81021571, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13580, + "time_per_iteration": 2.425795555114746 + }, + { + "auxiliary_loss_clip": 0.0109879, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.02177167, + "balance_loss_mlp": 1.0351367, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.6552984035314777, + "language_loss": 0.68889928, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71021968, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13581, + "time_per_iteration": 2.4383456707000732 + }, + { + "auxiliary_loss_clip": 0.01099114, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.01510167, + "balance_loss_mlp": 1.03388476, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.7746948932772684, + "language_loss": 0.59413254, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61538696, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13582, + "time_per_iteration": 2.446333169937134 + }, + { + "auxiliary_loss_clip": 0.01095885, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.01781857, + "balance_loss_mlp": 1.0341419, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.3768473846138, + "language_loss": 0.82010365, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84135062, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6171875, + "step": 13583, + "time_per_iteration": 2.5130319595336914 + }, + { + "auxiliary_loss_clip": 0.01100945, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.01556253, + "balance_loss_mlp": 1.03554404, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.826376141415004, + "language_loss": 0.7425015, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76377439, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65625, + "step": 13584, + "time_per_iteration": 2.409642457962036 + }, + { + "auxiliary_loss_clip": 0.01102581, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.01652598, + "balance_loss_mlp": 1.03693473, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 4.717780389008525, + "language_loss": 0.74340463, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76471376, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13585, + "time_per_iteration": 2.4532628059387207 + }, + { + "auxiliary_loss_clip": 0.01101272, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.02410972, + "balance_loss_mlp": 1.03578067, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.6545337025021891, + "language_loss": 0.69145906, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71282941, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13586, + "time_per_iteration": 2.4178428649902344 + }, + { + "auxiliary_loss_clip": 0.01100971, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.01897573, + "balance_loss_mlp": 1.03632236, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.5123353842035532, + "language_loss": 0.60895872, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63026226, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6484375, + "step": 13587, + "time_per_iteration": 2.4606432914733887 + }, + { + "auxiliary_loss_clip": 0.01102914, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.02086794, + "balance_loss_mlp": 1.03446078, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.2917834660377534, + "language_loss": 0.6959576, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71731567, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 13588, + "time_per_iteration": 2.4705071449279785 + }, + { + "auxiliary_loss_clip": 0.011024, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01979363, + "balance_loss_mlp": 1.03581333, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.5447018635848075, + "language_loss": 0.73065209, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75199795, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 13589, + "time_per_iteration": 2.45959210395813 + }, + { + "auxiliary_loss_clip": 0.01097376, + "auxiliary_loss_mlp": 0.01031015, + "balance_loss_clip": 1.01933265, + "balance_loss_mlp": 1.03389359, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.7225863483804695, + "language_loss": 0.72977889, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75106275, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 13590, + "time_per_iteration": 2.4645774364471436 + }, + { + "auxiliary_loss_clip": 0.01104182, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.01877403, + "balance_loss_mlp": 1.03568554, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.9829017797155066, + "language_loss": 0.65020001, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67155194, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 13591, + "time_per_iteration": 2.536670207977295 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.02190661, + "balance_loss_mlp": 1.03379011, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 1.7852703265171805, + "language_loss": 0.68164837, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70300251, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 13592, + "time_per_iteration": 2.4266905784606934 + }, + { + "auxiliary_loss_clip": 0.0110127, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.01322818, + "balance_loss_mlp": 1.03527188, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.9676579409127355, + "language_loss": 0.66395956, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68522245, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 13593, + "time_per_iteration": 2.4653220176696777 + }, + { + "auxiliary_loss_clip": 0.01098248, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.0330466, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 2.1589991907260826, + "language_loss": 0.69275898, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71404564, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 13594, + "time_per_iteration": 2.441347122192383 + }, + { + "auxiliary_loss_clip": 0.01101079, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.01838267, + "balance_loss_mlp": 1.03391123, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.7250973589133012, + "language_loss": 0.65802509, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.67933369, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13595, + "time_per_iteration": 3.834423542022705 + }, + { + "auxiliary_loss_clip": 0.01103171, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.01491535, + "balance_loss_mlp": 1.03471351, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.6805151919740065, + "language_loss": 0.78552544, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80682206, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 13596, + "time_per_iteration": 3.870290517807007 + }, + { + "auxiliary_loss_clip": 0.01096898, + "auxiliary_loss_mlp": 0.0102693, + "balance_loss_clip": 1.01530743, + "balance_loss_mlp": 1.03267086, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 3.056945403146244, + "language_loss": 0.58674574, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60798407, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 13597, + "time_per_iteration": 3.83126163482666 + }, + { + "auxiliary_loss_clip": 0.01096843, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.0189966, + "balance_loss_mlp": 1.03306866, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 2.0152722987790117, + "language_loss": 0.82239521, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84366918, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.63671875, + "step": 13598, + "time_per_iteration": 2.461413621902466 + }, + { + "auxiliary_loss_clip": 0.01101172, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01804388, + "balance_loss_mlp": 1.03485656, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.6794406280078336, + "language_loss": 0.8206194, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84192085, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13599, + "time_per_iteration": 2.4389727115631104 + }, + { + "auxiliary_loss_clip": 0.01098228, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.01945949, + "balance_loss_mlp": 1.03399026, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.8119321548666836, + "language_loss": 0.83470106, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.85599005, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 13600, + "time_per_iteration": 3.8930134773254395 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.01807261, + "balance_loss_mlp": 1.0340941, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.8552295525617326, + "language_loss": 0.74228668, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76359135, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 13601, + "time_per_iteration": 2.4926183223724365 + }, + { + "auxiliary_loss_clip": 0.01103435, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.01916885, + "balance_loss_mlp": 1.03450692, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.233954620070709, + "language_loss": 0.67695427, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.69830108, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 13602, + "time_per_iteration": 2.4101457595825195 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01025934, + "balance_loss_clip": 1.01494884, + "balance_loss_mlp": 1.03410912, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.2019513074937596, + "language_loss": 0.83764672, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85888481, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.63671875, + "step": 13603, + "time_per_iteration": 2.4125587940216064 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.01626372, + "balance_loss_mlp": 1.03461123, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.6014751924938777, + "language_loss": 0.69117272, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71245372, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 13604, + "time_per_iteration": 2.5184106826782227 + }, + { + "auxiliary_loss_clip": 0.01102038, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.02132845, + "balance_loss_mlp": 1.03665352, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 1.681681436468054, + "language_loss": 0.742558, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76390678, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13605, + "time_per_iteration": 2.676684856414795 + }, + { + "auxiliary_loss_clip": 0.01099847, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.02215791, + "balance_loss_mlp": 1.03632402, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.8459986304630236, + "language_loss": 0.74292308, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76426041, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 13606, + "time_per_iteration": 2.4154725074768066 + }, + { + "auxiliary_loss_clip": 0.01099557, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.01939964, + "balance_loss_mlp": 1.03485346, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.9141153013042538, + "language_loss": 0.65221226, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67351413, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 13607, + "time_per_iteration": 2.547203302383423 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.0187819, + "balance_loss_mlp": 1.0351789, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.6236111572449494, + "language_loss": 0.70368075, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72499502, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13608, + "time_per_iteration": 2.498760461807251 + }, + { + "auxiliary_loss_clip": 0.01097872, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.02187049, + "balance_loss_mlp": 1.03291011, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 1.7153740974469267, + "language_loss": 0.79468846, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81599921, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13609, + "time_per_iteration": 2.5317742824554443 + }, + { + "auxiliary_loss_clip": 0.01023454, + "auxiliary_loss_mlp": 0.01002374, + "balance_loss_clip": 1.00132453, + "balance_loss_mlp": 1.00311923, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.749863599431258, + "language_loss": 0.5588702, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.5791285, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.203125, + "step": 13610, + "time_per_iteration": 3.150242567062378 + }, + { + "auxiliary_loss_clip": 0.01096143, + "auxiliary_loss_mlp": 0.01025663, + "balance_loss_clip": 1.01499379, + "balance_loss_mlp": 1.03376782, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.6981510303220106, + "language_loss": 0.77559108, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79680908, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 13611, + "time_per_iteration": 2.4540553092956543 + }, + { + "auxiliary_loss_clip": 0.01104443, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.02056813, + "balance_loss_mlp": 1.0341984, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 1.7496159183254283, + "language_loss": 0.770311, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79169405, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 13612, + "time_per_iteration": 2.481358766555786 + }, + { + "auxiliary_loss_clip": 0.01096746, + "auxiliary_loss_mlp": 0.01027079, + "balance_loss_clip": 1.01580763, + "balance_loss_mlp": 1.03323674, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 1.8750318483309736, + "language_loss": 0.86217213, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88341039, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 13613, + "time_per_iteration": 2.4214251041412354 + }, + { + "auxiliary_loss_clip": 0.01102179, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.03140044, + "balance_loss_mlp": 1.0373559, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.775880767921388, + "language_loss": 0.72751027, + "learning_rate": 3.355612034397746e-07, + "loss": 0.74895704, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13614, + "time_per_iteration": 2.480942487716675 + }, + { + "auxiliary_loss_clip": 0.01100997, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.02239406, + "balance_loss_mlp": 1.03468037, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.580717147390374, + "language_loss": 0.81211054, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83346641, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13615, + "time_per_iteration": 2.512587308883667 + }, + { + "auxiliary_loss_clip": 0.01098725, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.0171864, + "balance_loss_mlp": 1.03305793, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.9650123259608059, + "language_loss": 0.75749093, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77877545, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 13616, + "time_per_iteration": 2.4874508380889893 + }, + { + "auxiliary_loss_clip": 0.01096039, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.01892185, + "balance_loss_mlp": 1.03211665, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.6631144614054594, + "language_loss": 0.75075936, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77203214, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.640625, + "step": 13617, + "time_per_iteration": 2.4808571338653564 + }, + { + "auxiliary_loss_clip": 0.01094749, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.0177412, + "balance_loss_mlp": 1.03146482, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 2.3105129174320362, + "language_loss": 0.68536007, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70659614, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13618, + "time_per_iteration": 2.44240665435791 + }, + { + "auxiliary_loss_clip": 0.01103541, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.01982939, + "balance_loss_mlp": 1.03484082, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.820443995166382, + "language_loss": 0.70164716, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72300136, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 13619, + "time_per_iteration": 2.519563913345337 + }, + { + "auxiliary_loss_clip": 0.01105113, + "auxiliary_loss_mlp": 0.01026554, + "balance_loss_clip": 1.01481771, + "balance_loss_mlp": 1.03768373, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.7568931118240076, + "language_loss": 0.73624021, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.75755692, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 13620, + "time_per_iteration": 2.4257287979125977 + }, + { + "auxiliary_loss_clip": 0.01097016, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.01578975, + "balance_loss_mlp": 1.03350806, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.561934577342782, + "language_loss": 0.760234, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78147829, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 13621, + "time_per_iteration": 2.474597454071045 + }, + { + "auxiliary_loss_clip": 0.01098691, + "auxiliary_loss_mlp": 0.01025725, + "balance_loss_clip": 1.01428652, + "balance_loss_mlp": 1.03371286, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.221894221463605, + "language_loss": 0.65659404, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67783821, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13622, + "time_per_iteration": 2.4912831783294678 + }, + { + "auxiliary_loss_clip": 0.01103867, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.01354301, + "balance_loss_mlp": 1.03722382, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.9628662622267186, + "language_loss": 0.74750388, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.76880676, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 13623, + "time_per_iteration": 2.464510440826416 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02379024, + "balance_loss_mlp": 1.03451788, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 2.130613473950277, + "language_loss": 0.63448161, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.6558584, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13624, + "time_per_iteration": 2.597849130630493 + }, + { + "auxiliary_loss_clip": 0.01097755, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.0206188, + "balance_loss_mlp": 1.03409612, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.5840449954467926, + "language_loss": 0.78271246, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80401015, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 13625, + "time_per_iteration": 2.5138373374938965 + }, + { + "auxiliary_loss_clip": 0.01104522, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.01713765, + "balance_loss_mlp": 1.034271, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 1.8933767850684242, + "language_loss": 0.7606883, + "learning_rate": 3.329745223345244e-07, + "loss": 0.78202951, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 13626, + "time_per_iteration": 2.4585258960723877 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.02102554, + "balance_loss_mlp": 1.03533244, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.454259930167211, + "language_loss": 0.73434258, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.7556566, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 13627, + "time_per_iteration": 2.5106747150421143 + }, + { + "auxiliary_loss_clip": 0.01099718, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.02029932, + "balance_loss_mlp": 1.03390169, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 2.370107674869554, + "language_loss": 0.6889739, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71029305, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 13628, + "time_per_iteration": 2.472726345062256 + }, + { + "auxiliary_loss_clip": 0.01105355, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.01854777, + "balance_loss_mlp": 1.03616786, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.5708816521142615, + "language_loss": 0.85466886, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87602592, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 13629, + "time_per_iteration": 2.4741504192352295 + }, + { + "auxiliary_loss_clip": 0.01099126, + "auxiliary_loss_mlp": 0.01024154, + "balance_loss_clip": 1.01312077, + "balance_loss_mlp": 1.03430605, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.023051880199768, + "language_loss": 0.73787737, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.75911021, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13630, + "time_per_iteration": 2.440484046936035 + }, + { + "auxiliary_loss_clip": 0.01102576, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.01646876, + "balance_loss_mlp": 1.03574312, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 2.5690554004253507, + "language_loss": 0.71959084, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74089527, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13631, + "time_per_iteration": 2.459568738937378 + }, + { + "auxiliary_loss_clip": 0.01099537, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.02199757, + "balance_loss_mlp": 1.03329933, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5024755031479913, + "language_loss": 0.76703703, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78836352, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13632, + "time_per_iteration": 2.4642910957336426 + }, + { + "auxiliary_loss_clip": 0.01098389, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.01664722, + "balance_loss_mlp": 1.03358042, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.6696759158330585, + "language_loss": 0.6536504, + "learning_rate": 3.314698278332588e-07, + "loss": 0.67490202, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 13633, + "time_per_iteration": 2.4936697483062744 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02256322, + "balance_loss_mlp": 1.0340724, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.6333258290671406, + "language_loss": 0.75608504, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77739, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13634, + "time_per_iteration": 2.490642547607422 + }, + { + "auxiliary_loss_clip": 0.01096629, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.01635802, + "balance_loss_mlp": 1.03337777, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 1.9576833355326961, + "language_loss": 0.81758225, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83881891, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13635, + "time_per_iteration": 2.4764068126678467 + }, + { + "auxiliary_loss_clip": 0.01099631, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.01730359, + "balance_loss_mlp": 1.03370953, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.8160936687392844, + "language_loss": 0.75971925, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78101134, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13636, + "time_per_iteration": 3.8742868900299072 + }, + { + "auxiliary_loss_clip": 0.0109741, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.01852703, + "balance_loss_mlp": 1.03244948, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.523932105098669, + "language_loss": 0.8138752, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83515203, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 13637, + "time_per_iteration": 2.444077491760254 + }, + { + "auxiliary_loss_clip": 0.01100022, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.01866293, + "balance_loss_mlp": 1.03494465, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.1971126807385617, + "language_loss": 0.71151501, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73281747, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 13638, + "time_per_iteration": 5.341911554336548 + }, + { + "auxiliary_loss_clip": 0.0110428, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.0195235, + "balance_loss_mlp": 1.03576005, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 1.87843772598682, + "language_loss": 0.79670238, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81807411, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 13639, + "time_per_iteration": 2.508781909942627 + }, + { + "auxiliary_loss_clip": 0.01099222, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.0162971, + "balance_loss_mlp": 1.03431201, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.6458792475114847, + "language_loss": 0.7922225, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81348717, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13640, + "time_per_iteration": 2.4696903228759766 + }, + { + "auxiliary_loss_clip": 0.01104141, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.01967573, + "balance_loss_mlp": 1.03490579, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 2.5540262458401086, + "language_loss": 0.63221669, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65357423, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 13641, + "time_per_iteration": 2.6128504276275635 + }, + { + "auxiliary_loss_clip": 0.01099233, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.01587081, + "balance_loss_mlp": 1.03426635, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.7946426536258016, + "language_loss": 0.73509145, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75636101, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13642, + "time_per_iteration": 3.9330053329467773 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.0196774, + "balance_loss_mlp": 1.03578949, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.7449860338382779, + "language_loss": 0.703394, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72471195, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 13643, + "time_per_iteration": 2.5429534912109375 + }, + { + "auxiliary_loss_clip": 0.01099353, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.02020276, + "balance_loss_mlp": 1.03523421, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.648171035996549, + "language_loss": 0.65431941, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67563128, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 13644, + "time_per_iteration": 2.469975709915161 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.02192736, + "balance_loss_mlp": 1.03459549, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.4857161465071853, + "language_loss": 0.70731783, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72867638, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 13645, + "time_per_iteration": 2.507760763168335 + }, + { + "auxiliary_loss_clip": 0.01097751, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.01763785, + "balance_loss_mlp": 1.03391588, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.7507779511305261, + "language_loss": 0.71368539, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73495179, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 13646, + "time_per_iteration": 2.4842605590820312 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01968813, + "balance_loss_mlp": 1.0362978, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.2469307057913124, + "language_loss": 0.78236741, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80371881, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 13647, + "time_per_iteration": 2.460960626602173 + }, + { + "auxiliary_loss_clip": 0.01097333, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.01906538, + "balance_loss_mlp": 1.03564429, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 2.4969141122611855, + "language_loss": 0.67900592, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.7002781, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 13648, + "time_per_iteration": 2.4923458099365234 + }, + { + "auxiliary_loss_clip": 0.01099146, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01675045, + "balance_loss_mlp": 1.03283572, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.752469920851942, + "language_loss": 0.80176151, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82303882, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13649, + "time_per_iteration": 2.499342441558838 + }, + { + "auxiliary_loss_clip": 0.01104146, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.02167666, + "balance_loss_mlp": 1.03822005, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.5497313587872559, + "language_loss": 0.68704414, + "learning_rate": 3.278284825365396e-07, + "loss": 0.70842898, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 13650, + "time_per_iteration": 2.542250871658325 + }, + { + "auxiliary_loss_clip": 0.0110322, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.01717925, + "balance_loss_mlp": 1.03656495, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 1.992917875581952, + "language_loss": 0.60637325, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62769902, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 13651, + "time_per_iteration": 2.427485227584839 + }, + { + "auxiliary_loss_clip": 0.01104379, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.01947021, + "balance_loss_mlp": 1.03686225, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 1.8723669979316186, + "language_loss": 0.72488928, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74624991, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 13652, + "time_per_iteration": 2.4548513889312744 + }, + { + "auxiliary_loss_clip": 0.0109596, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01649594, + "balance_loss_mlp": 1.0346185, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 2.0134123084835185, + "language_loss": 0.72847176, + "learning_rate": 3.271877933216558e-07, + "loss": 0.7496984, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.61328125, + "step": 13653, + "time_per_iteration": 2.496058702468872 + }, + { + "auxiliary_loss_clip": 0.01106157, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.02056861, + "balance_loss_mlp": 1.03688347, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 1.9633450823694507, + "language_loss": 0.62664771, + "learning_rate": 3.269743571056451e-07, + "loss": 0.64804584, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69140625, + "step": 13654, + "time_per_iteration": 2.659797430038452 + }, + { + "auxiliary_loss_clip": 0.01099475, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.01414406, + "balance_loss_mlp": 1.0323689, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.437038379365976, + "language_loss": 0.70098144, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72223151, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 13655, + "time_per_iteration": 2.528794527053833 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.0198307, + "balance_loss_mlp": 1.03530574, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.2263788010004673, + "language_loss": 0.82174385, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84305441, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 13656, + "time_per_iteration": 2.4395313262939453 + }, + { + "auxiliary_loss_clip": 0.01097914, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.01891446, + "balance_loss_mlp": 1.03546, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.174318286029315, + "language_loss": 0.74104166, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76232576, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.625, + "step": 13657, + "time_per_iteration": 2.421473979949951 + }, + { + "auxiliary_loss_clip": 0.01099474, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.01778913, + "balance_loss_mlp": 1.03430414, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.6389881975455622, + "language_loss": 0.55704254, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57832676, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13658, + "time_per_iteration": 2.515174150466919 + }, + { + "auxiliary_loss_clip": 0.01099693, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.01874256, + "balance_loss_mlp": 1.03353977, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.0858801212680804, + "language_loss": 0.7889123, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81021535, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 13659, + "time_per_iteration": 2.4070687294006348 + }, + { + "auxiliary_loss_clip": 0.01092982, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.01550722, + "balance_loss_mlp": 1.03138757, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.4937211500780294, + "language_loss": 0.59556949, + "learning_rate": 3.256950723599887e-07, + "loss": 0.61675525, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.6171875, + "step": 13660, + "time_per_iteration": 2.5837912559509277 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.02019513, + "balance_loss_mlp": 1.03470123, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 2.5721121812428285, + "language_loss": 0.72652888, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74787605, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13661, + "time_per_iteration": 2.4341530799865723 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.01971793, + "balance_loss_mlp": 1.03330231, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 1.950667378612405, + "language_loss": 0.74900603, + "learning_rate": 3.252691519437143e-07, + "loss": 0.77032924, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13662, + "time_per_iteration": 2.4195828437805176 + }, + { + "auxiliary_loss_clip": 0.01023814, + "auxiliary_loss_mlp": 0.01002606, + "balance_loss_clip": 1.00160432, + "balance_loss_mlp": 1.00386024, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.745371582589168, + "language_loss": 0.54059064, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56085479, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.19921875, + "step": 13663, + "time_per_iteration": 3.190706729888916 + }, + { + "auxiliary_loss_clip": 0.01099277, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01952863, + "balance_loss_mlp": 1.0335598, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.126901212447461, + "language_loss": 0.65428329, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67558861, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13664, + "time_per_iteration": 2.437980890274048 + }, + { + "auxiliary_loss_clip": 0.01098267, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.03531146, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.764912039303068, + "language_loss": 0.75243938, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77370375, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 13665, + "time_per_iteration": 2.487011194229126 + }, + { + "auxiliary_loss_clip": 0.01100929, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.01756454, + "balance_loss_mlp": 1.03510714, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 4.066044981617709, + "language_loss": 0.65191346, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67321193, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13666, + "time_per_iteration": 2.570254325866699 + }, + { + "auxiliary_loss_clip": 0.01099902, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.0189724, + "balance_loss_mlp": 1.03564048, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.6810733916735099, + "language_loss": 0.76897776, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79027867, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13667, + "time_per_iteration": 2.493004322052002 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01841545, + "balance_loss_mlp": 1.03575373, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.809243684883085, + "language_loss": 0.77085578, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79218972, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13668, + "time_per_iteration": 2.4280850887298584 + }, + { + "auxiliary_loss_clip": 0.01098761, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.01847649, + "balance_loss_mlp": 1.03395486, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.0672708102339894, + "language_loss": 0.73729622, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75858533, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 13669, + "time_per_iteration": 2.4670791625976562 + }, + { + "auxiliary_loss_clip": 0.01098476, + "auxiliary_loss_mlp": 0.01026323, + "balance_loss_clip": 1.01525986, + "balance_loss_mlp": 1.03419805, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.6347371555380708, + "language_loss": 0.78685886, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80810678, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 13670, + "time_per_iteration": 2.4091546535491943 + }, + { + "auxiliary_loss_clip": 0.01105531, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0243504, + "balance_loss_mlp": 1.03711927, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 2.4869445787160616, + "language_loss": 0.74846464, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.76988751, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 13671, + "time_per_iteration": 2.4942846298217773 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.01770282, + "balance_loss_mlp": 1.0356338, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 7.475778377778618, + "language_loss": 0.76535976, + "learning_rate": 3.23143361510728e-07, + "loss": 0.7867167, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 13672, + "time_per_iteration": 2.4521193504333496 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02047908, + "balance_loss_mlp": 1.03531826, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.6550856342382088, + "language_loss": 0.74858975, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76993799, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.66015625, + "step": 13673, + "time_per_iteration": 2.4321844577789307 + }, + { + "auxiliary_loss_clip": 0.01101553, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01893675, + "balance_loss_mlp": 1.03496301, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.7367122005490172, + "language_loss": 0.79398859, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81531632, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 13674, + "time_per_iteration": 2.452885150909424 + }, + { + "auxiliary_loss_clip": 0.01099597, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.01925182, + "balance_loss_mlp": 1.03428209, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 2.1133835706017403, + "language_loss": 0.70256555, + "learning_rate": 3.225068639524484e-07, + "loss": 0.7238673, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13675, + "time_per_iteration": 2.456704616546631 + }, + { + "auxiliary_loss_clip": 0.01098649, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.03533232, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.9196064294997741, + "language_loss": 0.74139565, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76272058, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 13676, + "time_per_iteration": 2.4562814235687256 + }, + { + "auxiliary_loss_clip": 0.01099141, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.01866674, + "balance_loss_mlp": 1.0337007, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.9554879616181895, + "language_loss": 0.80535352, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.8266418, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 13677, + "time_per_iteration": 2.4471378326416016 + }, + { + "auxiliary_loss_clip": 0.01102612, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.01983237, + "balance_loss_mlp": 1.03584671, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.7803528403739162, + "language_loss": 0.6982736, + "learning_rate": 3.218709388905245e-07, + "loss": 0.71961451, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 13678, + "time_per_iteration": 3.784363269805908 + }, + { + "auxiliary_loss_clip": 0.01097789, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.02262068, + "balance_loss_mlp": 1.03327274, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.7357616205251198, + "language_loss": 0.71496773, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73629081, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 13679, + "time_per_iteration": 2.5227231979370117 + }, + { + "auxiliary_loss_clip": 0.01097414, + "auxiliary_loss_mlp": 0.01025772, + "balance_loss_clip": 1.01444733, + "balance_loss_mlp": 1.03310823, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9577304134380913, + "language_loss": 0.70049226, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72172415, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13680, + "time_per_iteration": 5.352876901626587 + }, + { + "auxiliary_loss_clip": 0.01101662, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01710844, + "balance_loss_mlp": 1.03647804, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.9159882812306386, + "language_loss": 0.59767008, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61896878, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 13681, + "time_per_iteration": 2.490726947784424 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.0217855, + "balance_loss_mlp": 1.03521645, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.6390397647807602, + "language_loss": 0.69242489, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.7138043, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 13682, + "time_per_iteration": 2.602713108062744 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.01805353, + "balance_loss_mlp": 1.03474033, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.8183335657077608, + "language_loss": 0.79319465, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81452876, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 13683, + "time_per_iteration": 2.473590850830078 + }, + { + "auxiliary_loss_clip": 0.01094969, + "auxiliary_loss_mlp": 0.01025176, + "balance_loss_clip": 1.01506758, + "balance_loss_mlp": 1.0332613, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.9785950303413915, + "language_loss": 0.86425269, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88545412, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6171875, + "step": 13684, + "time_per_iteration": 3.9735918045043945 + }, + { + "auxiliary_loss_clip": 0.01096791, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.01606178, + "balance_loss_mlp": 1.03397489, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.4566743169469651, + "language_loss": 0.7976135, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.81885219, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62890625, + "step": 13685, + "time_per_iteration": 2.5093915462493896 + }, + { + "auxiliary_loss_clip": 0.01099427, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.01740456, + "balance_loss_mlp": 1.03454077, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.5897457656815852, + "language_loss": 0.68847555, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70976239, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13686, + "time_per_iteration": 2.485407829284668 + }, + { + "auxiliary_loss_clip": 0.01100256, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.01950085, + "balance_loss_mlp": 1.03301668, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 2.097008209143573, + "language_loss": 0.77891821, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80023664, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13687, + "time_per_iteration": 2.436474323272705 + }, + { + "auxiliary_loss_clip": 0.0109971, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03359497, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.8388949681321325, + "language_loss": 0.72154832, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.7428304, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 13688, + "time_per_iteration": 2.492417097091675 + }, + { + "auxiliary_loss_clip": 0.01100514, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02074492, + "balance_loss_mlp": 1.03449452, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.8251885380821353, + "language_loss": 0.73366064, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75499004, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 13689, + "time_per_iteration": 2.5088613033294678 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.02097631, + "balance_loss_mlp": 1.03380239, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 1.8248664958327294, + "language_loss": 0.689372, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71070421, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13690, + "time_per_iteration": 2.492891550064087 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.02075148, + "balance_loss_mlp": 1.03438592, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.9670817251189323, + "language_loss": 0.85403329, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87535292, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13691, + "time_per_iteration": 2.5006744861602783 + }, + { + "auxiliary_loss_clip": 0.01101672, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.0202024, + "balance_loss_mlp": 1.03484964, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.942309655074723, + "language_loss": 0.76985818, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79119122, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13692, + "time_per_iteration": 2.5435726642608643 + }, + { + "auxiliary_loss_clip": 0.01098631, + "auxiliary_loss_mlp": 0.01027484, + "balance_loss_clip": 1.01581907, + "balance_loss_mlp": 1.03404903, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.5338263277775153, + "language_loss": 0.71625656, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73751771, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.64453125, + "step": 13693, + "time_per_iteration": 2.462322473526001 + }, + { + "auxiliary_loss_clip": 0.01095198, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.02157629, + "balance_loss_mlp": 1.03228104, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.484575932799216, + "language_loss": 0.83818102, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85945427, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13694, + "time_per_iteration": 2.530102491378784 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.01793063, + "balance_loss_mlp": 1.03319895, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.5670764630001808, + "language_loss": 0.76820183, + "learning_rate": 3.182781878250118e-07, + "loss": 0.78948903, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13695, + "time_per_iteration": 2.4689533710479736 + }, + { + "auxiliary_loss_clip": 0.01101143, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.02139306, + "balance_loss_mlp": 1.03628111, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.716943951342175, + "language_loss": 0.80500603, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.82634246, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13696, + "time_per_iteration": 2.4298195838928223 + }, + { + "auxiliary_loss_clip": 0.01023101, + "auxiliary_loss_mlp": 0.01002114, + "balance_loss_clip": 1.00116658, + "balance_loss_mlp": 1.00284874, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.735830017685578, + "language_loss": 0.63844752, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65869963, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.203125, + "step": 13697, + "time_per_iteration": 3.1228291988372803 + }, + { + "auxiliary_loss_clip": 0.01094179, + "auxiliary_loss_mlp": 0.01024098, + "balance_loss_clip": 1.01381028, + "balance_loss_mlp": 1.03191829, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.8408059577999478, + "language_loss": 0.73020118, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.7513839, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62109375, + "step": 13698, + "time_per_iteration": 2.4486424922943115 + }, + { + "auxiliary_loss_clip": 0.01100262, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.01482511, + "balance_loss_mlp": 1.03423512, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 1.8648644268579129, + "language_loss": 0.7192139, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74048668, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 13699, + "time_per_iteration": 2.4568445682525635 + }, + { + "auxiliary_loss_clip": 0.0109784, + "auxiliary_loss_mlp": 0.0102711, + "balance_loss_clip": 1.01595223, + "balance_loss_mlp": 1.03365541, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 3.252304717758055, + "language_loss": 0.8196072, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84085667, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13700, + "time_per_iteration": 2.417517900466919 + }, + { + "auxiliary_loss_clip": 0.01100609, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02116919, + "balance_loss_mlp": 1.03394866, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.5761573110612463, + "language_loss": 0.72924078, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75056958, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 13701, + "time_per_iteration": 2.467587947845459 + }, + { + "auxiliary_loss_clip": 0.01099008, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01987123, + "balance_loss_mlp": 1.03201449, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.687494382676569, + "language_loss": 0.69314957, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71445858, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13702, + "time_per_iteration": 2.455983877182007 + }, + { + "auxiliary_loss_clip": 0.01099997, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.01816654, + "balance_loss_mlp": 1.03495264, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.7567821119392977, + "language_loss": 0.74843061, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.76972723, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13703, + "time_per_iteration": 2.46303129196167 + }, + { + "auxiliary_loss_clip": 0.0110475, + "auxiliary_loss_mlp": 0.01037304, + "balance_loss_clip": 1.02411962, + "balance_loss_mlp": 1.03483748, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.6727554816785857, + "language_loss": 0.697137, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.71855754, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 13704, + "time_per_iteration": 2.4747629165649414 + }, + { + "auxiliary_loss_clip": 0.01097133, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.01810384, + "balance_loss_mlp": 1.03255594, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.5661062329236886, + "language_loss": 0.63864183, + "learning_rate": 3.161734114144916e-07, + "loss": 0.65990615, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13705, + "time_per_iteration": 2.487370014190674 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.01513791, + "balance_loss_mlp": 1.03334999, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 5.668073340397448, + "language_loss": 0.69304025, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71431744, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 13706, + "time_per_iteration": 2.452761173248291 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.0199219, + "balance_loss_mlp": 1.0368948, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.7444306759307577, + "language_loss": 0.69689429, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71824229, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 13707, + "time_per_iteration": 2.442892551422119 + }, + { + "auxiliary_loss_clip": 0.0110125, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.01842141, + "balance_loss_mlp": 1.03448296, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 1.6784140276510164, + "language_loss": 0.79208684, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81340349, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 13708, + "time_per_iteration": 2.520395517349243 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.01653779, + "balance_loss_mlp": 1.03384626, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 2.2364535014485996, + "language_loss": 0.68625695, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70754164, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 13709, + "time_per_iteration": 2.433765172958374 + }, + { + "auxiliary_loss_clip": 0.01099313, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.02135551, + "balance_loss_mlp": 1.03391647, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 1.7389997143099616, + "language_loss": 0.82326722, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84458363, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 13710, + "time_per_iteration": 2.448054075241089 + }, + { + "auxiliary_loss_clip": 0.01100024, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.01690805, + "balance_loss_mlp": 1.0341599, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 2.108497711538903, + "language_loss": 0.78206408, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80335355, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 13711, + "time_per_iteration": 2.4674675464630127 + }, + { + "auxiliary_loss_clip": 0.01098795, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01678646, + "balance_loss_mlp": 1.03360128, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 1.7882059263039318, + "language_loss": 0.65825897, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67953104, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13712, + "time_per_iteration": 2.436894655227661 + }, + { + "auxiliary_loss_clip": 0.01099114, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.01524234, + "balance_loss_mlp": 1.03398395, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.522082623848733, + "language_loss": 0.73938203, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76063514, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13713, + "time_per_iteration": 2.4973256587982178 + }, + { + "auxiliary_loss_clip": 0.01098726, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.01357198, + "balance_loss_mlp": 1.03296065, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 1.9600831331302564, + "language_loss": 0.81260616, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83384484, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13714, + "time_per_iteration": 2.3998029232025146 + }, + { + "auxiliary_loss_clip": 0.01101569, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.01970863, + "balance_loss_mlp": 1.03630209, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.8246610478563798, + "language_loss": 0.65964639, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68098134, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 13715, + "time_per_iteration": 2.508072853088379 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.01794744, + "balance_loss_mlp": 1.03492641, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 1.7553847724499971, + "language_loss": 0.75059605, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77192277, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13716, + "time_per_iteration": 2.443873405456543 + }, + { + "auxiliary_loss_clip": 0.01022766, + "auxiliary_loss_mlp": 0.01002387, + "balance_loss_clip": 1.00139761, + "balance_loss_mlp": 1.00258684, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7319645588496716, + "language_loss": 0.58983648, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61008805, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 13717, + "time_per_iteration": 3.165395975112915 + }, + { + "auxiliary_loss_clip": 0.01100792, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.01752055, + "balance_loss_mlp": 1.03656578, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 1.9648404876687129, + "language_loss": 0.79825944, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81954575, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.640625, + "step": 13718, + "time_per_iteration": 2.4321165084838867 + }, + { + "auxiliary_loss_clip": 0.01096221, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.01941967, + "balance_loss_mlp": 1.03296947, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.5600124582727455, + "language_loss": 0.69004935, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71131909, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 13719, + "time_per_iteration": 2.4147088527679443 + }, + { + "auxiliary_loss_clip": 0.01099942, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03254807, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.151484715276455, + "language_loss": 0.69623858, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71752286, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 13720, + "time_per_iteration": 3.8741157054901123 + }, + { + "auxiliary_loss_clip": 0.01101452, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.01701367, + "balance_loss_mlp": 1.03437507, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 2.0074943971532013, + "language_loss": 0.75765574, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.77895862, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 13721, + "time_per_iteration": 2.4852468967437744 + }, + { + "auxiliary_loss_clip": 0.01097082, + "auxiliary_loss_mlp": 0.01024921, + "balance_loss_clip": 1.01376283, + "balance_loss_mlp": 1.03250015, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.8887788155393048, + "language_loss": 0.77601635, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.79723638, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 13722, + "time_per_iteration": 3.9625513553619385 + }, + { + "auxiliary_loss_clip": 0.01098838, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.01853311, + "balance_loss_mlp": 1.03513193, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.6348425441853751, + "language_loss": 0.63200963, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.65329552, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 13723, + "time_per_iteration": 2.5051403045654297 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.02004635, + "balance_loss_mlp": 1.03461981, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.4987844407336721, + "language_loss": 0.73996544, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76128417, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13724, + "time_per_iteration": 2.446829319000244 + }, + { + "auxiliary_loss_clip": 0.01102165, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.0198431, + "balance_loss_mlp": 1.03561938, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.943405215857899, + "language_loss": 0.64098012, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.6623233, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13725, + "time_per_iteration": 3.9392764568328857 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01672626, + "balance_loss_mlp": 1.03262889, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.4837980675924767, + "language_loss": 0.81744307, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.83869004, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13726, + "time_per_iteration": 2.468735456466675 + }, + { + "auxiliary_loss_clip": 0.01093251, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.01873279, + "balance_loss_mlp": 1.03116345, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.6663240339178054, + "language_loss": 0.70314664, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72437346, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 13727, + "time_per_iteration": 2.5540802478790283 + }, + { + "auxiliary_loss_clip": 0.01101407, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.01923275, + "balance_loss_mlp": 1.03472996, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 1.6751326547454333, + "language_loss": 0.62385333, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64518249, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13728, + "time_per_iteration": 2.4579248428344727 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.03654742, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.9717282075375915, + "language_loss": 0.71042085, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73177588, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 13729, + "time_per_iteration": 2.5197970867156982 + }, + { + "auxiliary_loss_clip": 0.01022956, + "auxiliary_loss_mlp": 0.01004312, + "balance_loss_clip": 1.00337625, + "balance_loss_mlp": 1.0028019, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8572305037082905, + "language_loss": 0.6273154, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64758813, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 13730, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.01100543, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.03533745, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.0536756687084847, + "language_loss": 0.63441122, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65571773, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 13731, + "time_per_iteration": 2.6466689109802246 + }, + { + "auxiliary_loss_clip": 0.0110256, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.01711965, + "balance_loss_mlp": 1.03353024, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 4.369732122043469, + "language_loss": 0.69833827, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71965206, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 13732, + "time_per_iteration": 2.431699514389038 + }, + { + "auxiliary_loss_clip": 0.01102382, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02082729, + "balance_loss_mlp": 1.03338408, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 9.416973865724984, + "language_loss": 0.70556611, + "learning_rate": 3.103140315024817e-07, + "loss": 0.72691655, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 13733, + "time_per_iteration": 2.4896795749664307 + }, + { + "auxiliary_loss_clip": 0.01095159, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.01482248, + "balance_loss_mlp": 1.03152645, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.4905280980303643, + "language_loss": 0.82499802, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84621727, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.63671875, + "step": 13734, + "time_per_iteration": 2.4829580783843994 + }, + { + "auxiliary_loss_clip": 0.01096383, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.01891303, + "balance_loss_mlp": 1.03280544, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.7089910014133873, + "language_loss": 0.82727551, + "learning_rate": 3.098974244989676e-07, + "loss": 0.84854656, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.63671875, + "step": 13735, + "time_per_iteration": 2.447176456451416 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01627731, + "balance_loss_mlp": 1.03607154, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.741963108109938, + "language_loss": 0.70721442, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72849238, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13736, + "time_per_iteration": 2.410585403442383 + }, + { + "auxiliary_loss_clip": 0.01022898, + "auxiliary_loss_mlp": 0.00996896, + "balance_loss_clip": 0.99593621, + "balance_loss_mlp": 1.00281882, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8456109831997218, + "language_loss": 0.67959881, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.6997968, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 13737, + "time_per_iteration": 3.0272867679595947 + }, + { + "auxiliary_loss_clip": 0.01101345, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.0216099, + "balance_loss_mlp": 1.03449416, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 2.073993143521232, + "language_loss": 0.6973623, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71870303, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 13738, + "time_per_iteration": 2.4943878650665283 + }, + { + "auxiliary_loss_clip": 0.01103797, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.0177238, + "balance_loss_mlp": 1.03640378, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.1714689759020263, + "language_loss": 0.63835168, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65970206, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.671875, + "step": 13739, + "time_per_iteration": 2.5875649452209473 + }, + { + "auxiliary_loss_clip": 0.01022875, + "auxiliary_loss_mlp": 0.01001493, + "balance_loss_clip": 1.00052786, + "balance_loss_mlp": 1.00276709, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8181807720389914, + "language_loss": 0.59289646, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61314023, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 13740, + "time_per_iteration": 3.1312007904052734 + }, + { + "auxiliary_loss_clip": 0.01104538, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.01754618, + "balance_loss_mlp": 1.03531182, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.7686192119989161, + "language_loss": 0.74968207, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77102888, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 13741, + "time_per_iteration": 2.4407901763916016 + }, + { + "auxiliary_loss_clip": 0.01098345, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.01872921, + "balance_loss_mlp": 1.03380883, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.9409815336260887, + "language_loss": 0.62311375, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64439917, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 13742, + "time_per_iteration": 2.5456202030181885 + }, + { + "auxiliary_loss_clip": 0.01105188, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.02133405, + "balance_loss_mlp": 1.03567028, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.5756916838702324, + "language_loss": 0.65460289, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.67600346, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 13743, + "time_per_iteration": 2.4249460697174072 + }, + { + "auxiliary_loss_clip": 0.01102194, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02113438, + "balance_loss_mlp": 1.03607357, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.9730878260340954, + "language_loss": 0.6655553, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.68690503, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13744, + "time_per_iteration": 2.441387414932251 + }, + { + "auxiliary_loss_clip": 0.01100364, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01736081, + "balance_loss_mlp": 1.03470361, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.755399157939641, + "language_loss": 0.75241995, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77371031, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 13745, + "time_per_iteration": 2.4459309577941895 + }, + { + "auxiliary_loss_clip": 0.01095928, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.02005458, + "balance_loss_mlp": 1.03274918, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.8118192863065001, + "language_loss": 0.79148436, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81274581, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 13746, + "time_per_iteration": 2.4544129371643066 + }, + { + "auxiliary_loss_clip": 0.01104486, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.02051806, + "balance_loss_mlp": 1.03626895, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 1.948876431747442, + "language_loss": 0.68665206, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70802593, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 13747, + "time_per_iteration": 2.4420480728149414 + }, + { + "auxiliary_loss_clip": 0.01101378, + "auxiliary_loss_mlp": 0.01031327, + "balance_loss_clip": 1.01921535, + "balance_loss_mlp": 1.03463233, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 2.2816524530159756, + "language_loss": 0.75179929, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.7731263, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 13748, + "time_per_iteration": 2.4481050968170166 + }, + { + "auxiliary_loss_clip": 0.01096027, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.02046204, + "balance_loss_mlp": 1.03393388, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 5.161325368268591, + "language_loss": 0.63943124, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66069806, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62109375, + "step": 13749, + "time_per_iteration": 2.44142746925354 + }, + { + "auxiliary_loss_clip": 0.01095615, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.01466656, + "balance_loss_mlp": 1.03140473, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.8454134649317644, + "language_loss": 0.73651314, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75772631, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13750, + "time_per_iteration": 2.472975969314575 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.02129698, + "balance_loss_mlp": 1.03503251, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.831070973907418, + "language_loss": 0.65703225, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67837399, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 13751, + "time_per_iteration": 2.460341215133667 + }, + { + "auxiliary_loss_clip": 0.01096828, + "auxiliary_loss_mlp": 0.01026688, + "balance_loss_clip": 1.01563787, + "balance_loss_mlp": 1.03284085, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.4782058761360306, + "language_loss": 0.60665822, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62789339, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13752, + "time_per_iteration": 2.6207900047302246 + }, + { + "auxiliary_loss_clip": 0.01022684, + "auxiliary_loss_mlp": 0.01002368, + "balance_loss_clip": 1.00148606, + "balance_loss_mlp": 1.00261712, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7696499438638109, + "language_loss": 0.57472384, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59497434, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.20117188, + "step": 13753, + "time_per_iteration": 3.1323916912078857 + }, + { + "auxiliary_loss_clip": 0.01022837, + "auxiliary_loss_mlp": 0.00999424, + "balance_loss_clip": 0.99852353, + "balance_loss_mlp": 1.0027312, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6991010254330118, + "language_loss": 0.54898673, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56920928, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20117188, + "step": 13754, + "time_per_iteration": 3.19143009185791 + }, + { + "auxiliary_loss_clip": 0.01095849, + "auxiliary_loss_mlp": 0.01028538, + "balance_loss_clip": 1.01835752, + "balance_loss_mlp": 1.03289616, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.901395420853525, + "language_loss": 0.68808734, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.70933127, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.62890625, + "step": 13755, + "time_per_iteration": 2.563145160675049 + }, + { + "auxiliary_loss_clip": 0.01097596, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.01480806, + "balance_loss_mlp": 1.03408217, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 2.2342864185465454, + "language_loss": 0.69950449, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.7207337, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13756, + "time_per_iteration": 2.5362203121185303 + }, + { + "auxiliary_loss_clip": 0.01102655, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.01820481, + "balance_loss_mlp": 1.03683186, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.935436215095936, + "language_loss": 0.71919167, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74051744, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13757, + "time_per_iteration": 2.4298806190490723 + }, + { + "auxiliary_loss_clip": 0.01102829, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.02090025, + "balance_loss_mlp": 1.03461754, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.0681881017111734, + "language_loss": 0.6859889, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.70736271, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6796875, + "step": 13758, + "time_per_iteration": 2.450861692428589 + }, + { + "auxiliary_loss_clip": 0.01094703, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.01730585, + "balance_loss_mlp": 1.03196728, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.58450668225913, + "language_loss": 0.69761419, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.71884549, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.625, + "step": 13759, + "time_per_iteration": 2.4828851222991943 + }, + { + "auxiliary_loss_clip": 0.01097875, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.01631212, + "balance_loss_mlp": 1.03378034, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.7473581768937994, + "language_loss": 0.70969361, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73095572, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 13760, + "time_per_iteration": 2.4519858360290527 + }, + { + "auxiliary_loss_clip": 0.01098548, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.01542258, + "balance_loss_mlp": 1.03550053, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.664740297618068, + "language_loss": 0.77527195, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79652023, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13761, + "time_per_iteration": 3.890357255935669 + }, + { + "auxiliary_loss_clip": 0.01095154, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01645768, + "balance_loss_mlp": 1.03343439, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.6562788022934103, + "language_loss": 0.69847965, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71970069, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6171875, + "step": 13762, + "time_per_iteration": 2.4446358680725098 + }, + { + "auxiliary_loss_clip": 0.01097413, + "auxiliary_loss_mlp": 0.01021229, + "balance_loss_clip": 1.01057184, + "balance_loss_mlp": 1.03246927, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.8662073459015955, + "language_loss": 0.70074844, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72193485, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13763, + "time_per_iteration": 5.2924604415893555 + }, + { + "auxiliary_loss_clip": 0.01022864, + "auxiliary_loss_mlp": 0.01000148, + "balance_loss_clip": 0.99919397, + "balance_loss_mlp": 1.0026294, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8373654937197863, + "language_loss": 0.65168589, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67191601, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 13764, + "time_per_iteration": 3.095142126083374 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.02101338, + "balance_loss_mlp": 1.03624713, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 2.1372557336076032, + "language_loss": 0.77729869, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79865754, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13765, + "time_per_iteration": 2.415915012359619 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.01768899, + "balance_loss_mlp": 1.03454924, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.6221712528738066, + "language_loss": 0.62191451, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64323616, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 13766, + "time_per_iteration": 2.5222461223602295 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01703894, + "balance_loss_mlp": 1.03382564, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 2.733674200734292, + "language_loss": 0.82816303, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84944153, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13767, + "time_per_iteration": 3.9097790718078613 + }, + { + "auxiliary_loss_clip": 0.01102172, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.01461267, + "balance_loss_mlp": 1.03601408, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.8123727599294597, + "language_loss": 0.69225526, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71353996, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 13768, + "time_per_iteration": 2.5282175540924072 + }, + { + "auxiliary_loss_clip": 0.0109497, + "auxiliary_loss_mlp": 0.01025867, + "balance_loss_clip": 1.01526916, + "balance_loss_mlp": 1.03135371, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 1.7692402287789066, + "language_loss": 0.74371201, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76492047, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 13769, + "time_per_iteration": 2.501898765563965 + }, + { + "auxiliary_loss_clip": 0.01098728, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.01539564, + "balance_loss_mlp": 1.03283179, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 2.1253743254502977, + "language_loss": 0.74551117, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76676941, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13770, + "time_per_iteration": 2.597078323364258 + }, + { + "auxiliary_loss_clip": 0.01099272, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01858819, + "balance_loss_mlp": 1.03407049, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.6790850310045173, + "language_loss": 0.75939202, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.78069258, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 13771, + "time_per_iteration": 2.4581122398376465 + }, + { + "auxiliary_loss_clip": 0.01099759, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01814413, + "balance_loss_mlp": 1.03393221, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.6138100145294163, + "language_loss": 0.72420895, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.7455008, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 13772, + "time_per_iteration": 2.573108673095703 + }, + { + "auxiliary_loss_clip": 0.01098054, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.03461206, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.2283903024413547, + "language_loss": 0.74291146, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76418936, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6328125, + "step": 13773, + "time_per_iteration": 2.429135322570801 + }, + { + "auxiliary_loss_clip": 0.01098833, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.02056837, + "balance_loss_mlp": 1.03527474, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.72037457005478, + "language_loss": 0.75935221, + "learning_rate": 3.01824904601915e-07, + "loss": 0.78065884, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 13774, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01104314, + "auxiliary_loss_mlp": 0.01025788, + "balance_loss_clip": 1.01416469, + "balance_loss_mlp": 1.0359422, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.6086264049463133, + "language_loss": 0.75185502, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77315605, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 13775, + "time_per_iteration": 2.43330717086792 + }, + { + "auxiliary_loss_clip": 0.01103717, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.01517248, + "balance_loss_mlp": 1.03436899, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 1.9945621965975238, + "language_loss": 0.73318064, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75449395, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 13776, + "time_per_iteration": 2.5265419483184814 + }, + { + "auxiliary_loss_clip": 0.01099687, + "auxiliary_loss_mlp": 0.01023413, + "balance_loss_clip": 1.01215982, + "balance_loss_mlp": 1.03306937, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.0087174485094943, + "language_loss": 0.77922744, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.80045843, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 13777, + "time_per_iteration": 2.438504934310913 + }, + { + "auxiliary_loss_clip": 0.01097696, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.01874244, + "balance_loss_mlp": 1.03520977, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.4952468649101947, + "language_loss": 0.82470471, + "learning_rate": 3.010024839590604e-07, + "loss": 0.8459729, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.625, + "step": 13778, + "time_per_iteration": 2.4861180782318115 + }, + { + "auxiliary_loss_clip": 0.0109427, + "auxiliary_loss_mlp": 0.0102475, + "balance_loss_clip": 1.01334167, + "balance_loss_mlp": 1.03251445, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8580201526843971, + "language_loss": 0.74507427, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.7662645, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6171875, + "step": 13779, + "time_per_iteration": 2.413422107696533 + }, + { + "auxiliary_loss_clip": 0.01022695, + "auxiliary_loss_mlp": 0.01001477, + "balance_loss_clip": 1.00053501, + "balance_loss_mlp": 1.00272822, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.8243655994188706, + "language_loss": 0.56794745, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58818918, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 13780, + "time_per_iteration": 3.070969343185425 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.01467896, + "balance_loss_mlp": 1.03370905, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.663013046822846, + "language_loss": 0.80247319, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82372504, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13781, + "time_per_iteration": 2.4518861770629883 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.01744401, + "balance_loss_mlp": 1.03644729, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 3.0303524050285557, + "language_loss": 0.75560725, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77693129, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13782, + "time_per_iteration": 2.504301071166992 + }, + { + "auxiliary_loss_clip": 0.01097103, + "auxiliary_loss_mlp": 0.01026716, + "balance_loss_clip": 1.01530743, + "balance_loss_mlp": 1.03192592, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.6551546330725684, + "language_loss": 0.75982195, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78106016, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 13783, + "time_per_iteration": 2.520042657852173 + }, + { + "auxiliary_loss_clip": 0.01098829, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.0163579, + "balance_loss_mlp": 1.03363252, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.6853866319611193, + "language_loss": 0.73697698, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75824231, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13784, + "time_per_iteration": 2.462013006210327 + }, + { + "auxiliary_loss_clip": 0.01102405, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.02154541, + "balance_loss_mlp": 1.03321636, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.4282761232574668, + "language_loss": 0.70307374, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72443986, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 13785, + "time_per_iteration": 2.577817678451538 + }, + { + "auxiliary_loss_clip": 0.01101293, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.01994348, + "balance_loss_mlp": 1.03573847, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.578728182977374, + "language_loss": 0.68448269, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70581114, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13786, + "time_per_iteration": 2.4532599449157715 + }, + { + "auxiliary_loss_clip": 0.01102631, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.0212723, + "balance_loss_mlp": 1.03444433, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.9713140427276798, + "language_loss": 0.7668817, + "learning_rate": 2.991558072017426e-07, + "loss": 0.78824121, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 13787, + "time_per_iteration": 2.4399027824401855 + }, + { + "auxiliary_loss_clip": 0.01098907, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.02052522, + "balance_loss_mlp": 1.03484738, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.81084531479829, + "language_loss": 0.80682862, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82813168, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 13788, + "time_per_iteration": 2.4278197288513184 + }, + { + "auxiliary_loss_clip": 0.0109796, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01862526, + "balance_loss_mlp": 1.03381896, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.9710434276893554, + "language_loss": 0.71272284, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73399603, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13789, + "time_per_iteration": 2.5227370262145996 + }, + { + "auxiliary_loss_clip": 0.01100536, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.01702929, + "balance_loss_mlp": 1.03298402, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 2.3533279169078614, + "language_loss": 0.68549865, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7067908, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 13790, + "time_per_iteration": 2.585700511932373 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.01763999, + "balance_loss_mlp": 1.0343529, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 4.149168157668411, + "language_loss": 0.77716172, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.7984798, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 13791, + "time_per_iteration": 2.507520914077759 + }, + { + "auxiliary_loss_clip": 0.0109744, + "auxiliary_loss_mlp": 0.0102597, + "balance_loss_clip": 1.01394224, + "balance_loss_mlp": 1.03434205, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.3726823147791687, + "language_loss": 0.69920421, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72043836, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6328125, + "step": 13792, + "time_per_iteration": 2.4654951095581055 + }, + { + "auxiliary_loss_clip": 0.01099831, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01934695, + "balance_loss_mlp": 1.03244185, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.877850305327316, + "language_loss": 0.65054023, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67185235, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13793, + "time_per_iteration": 2.544015884399414 + }, + { + "auxiliary_loss_clip": 0.01102087, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01646078, + "balance_loss_mlp": 1.03449655, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.673378626627257, + "language_loss": 0.66431141, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68561947, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 13794, + "time_per_iteration": 2.431690216064453 + }, + { + "auxiliary_loss_clip": 0.0109811, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.01981437, + "balance_loss_mlp": 1.03217447, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.7723681327467673, + "language_loss": 0.65998554, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68128735, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 13795, + "time_per_iteration": 2.4870991706848145 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.01775384, + "balance_loss_mlp": 1.03428745, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.9001575198437086, + "language_loss": 0.66477525, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68601817, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.61328125, + "step": 13796, + "time_per_iteration": 2.4674506187438965 + }, + { + "auxiliary_loss_clip": 0.01097029, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.01781702, + "balance_loss_mlp": 1.03385639, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.5598295545744347, + "language_loss": 0.71532977, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73659086, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 13797, + "time_per_iteration": 2.4169154167175293 + }, + { + "auxiliary_loss_clip": 0.01101059, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.01906002, + "balance_loss_mlp": 1.03453267, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 1.750517911189691, + "language_loss": 0.72251916, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74383044, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 13798, + "time_per_iteration": 2.4415347576141357 + }, + { + "auxiliary_loss_clip": 0.01094228, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03278971, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.5603064729869331, + "language_loss": 0.76201189, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78324461, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.61328125, + "step": 13799, + "time_per_iteration": 2.463636636734009 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.0355351, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 2.1935741637351174, + "language_loss": 0.67862946, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69993031, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 13800, + "time_per_iteration": 2.405726909637451 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02096081, + "balance_loss_mlp": 1.03694606, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.7039230196700386, + "language_loss": 0.74584657, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76724076, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 13801, + "time_per_iteration": 2.4256935119628906 + }, + { + "auxiliary_loss_clip": 0.01100874, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.01740479, + "balance_loss_mlp": 1.03498697, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.4930307504184834, + "language_loss": 0.73669171, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75798416, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 13802, + "time_per_iteration": 2.432267427444458 + }, + { + "auxiliary_loss_clip": 0.01100091, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.01868546, + "balance_loss_mlp": 1.03392529, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.6288424480381258, + "language_loss": 0.74650079, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76779985, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13803, + "time_per_iteration": 3.8569204807281494 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.01874709, + "balance_loss_mlp": 1.03565145, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.579848035372401, + "language_loss": 0.79086143, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81216919, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13804, + "time_per_iteration": 2.4629805088043213 + }, + { + "auxiliary_loss_clip": 0.01098393, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.02012861, + "balance_loss_mlp": 1.03375435, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.7198477765468532, + "language_loss": 0.73292375, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75421888, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13805, + "time_per_iteration": 5.304149389266968 + }, + { + "auxiliary_loss_clip": 0.01100424, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.017735, + "balance_loss_mlp": 1.03412557, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 1.9936443323476183, + "language_loss": 0.7744779, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79577601, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13806, + "time_per_iteration": 2.467191219329834 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.01940179, + "balance_loss_mlp": 1.03535724, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.69387753242653, + "language_loss": 0.63385892, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65521049, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13807, + "time_per_iteration": 2.4374656677246094 + }, + { + "auxiliary_loss_clip": 0.0110126, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.01403785, + "balance_loss_mlp": 1.03610516, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 1.711706373511074, + "language_loss": 0.73087573, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75214028, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13808, + "time_per_iteration": 2.4465322494506836 + }, + { + "auxiliary_loss_clip": 0.0110707, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02331913, + "balance_loss_mlp": 1.03772509, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.786221226003615, + "language_loss": 0.66342396, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68485272, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 13809, + "time_per_iteration": 3.953455686569214 + }, + { + "auxiliary_loss_clip": 0.01100579, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.0124923, + "balance_loss_mlp": 1.03412163, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 1.9422535896346522, + "language_loss": 0.73977947, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76101977, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 13810, + "time_per_iteration": 2.4113035202026367 + }, + { + "auxiliary_loss_clip": 0.01098267, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.02415669, + "balance_loss_mlp": 1.03506088, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5397227407858767, + "language_loss": 0.81322253, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83454967, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6328125, + "step": 13811, + "time_per_iteration": 2.5003349781036377 + }, + { + "auxiliary_loss_clip": 0.01102763, + "auxiliary_loss_mlp": 0.01035668, + "balance_loss_clip": 1.02403903, + "balance_loss_mlp": 1.03681374, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.83757541907444, + "language_loss": 0.73298693, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75437129, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 13812, + "time_per_iteration": 2.40651798248291 + }, + { + "auxiliary_loss_clip": 0.01097578, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.01416874, + "balance_loss_mlp": 1.03434229, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.6568210885633572, + "language_loss": 0.78265715, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80389333, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6328125, + "step": 13813, + "time_per_iteration": 2.4902334213256836 + }, + { + "auxiliary_loss_clip": 0.01099825, + "auxiliary_loss_mlp": 0.01025617, + "balance_loss_clip": 1.01365399, + "balance_loss_mlp": 1.03272295, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.334398224953377, + "language_loss": 0.71084231, + "learning_rate": 2.93647144674658e-07, + "loss": 0.73209673, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 13814, + "time_per_iteration": 2.4226105213165283 + }, + { + "auxiliary_loss_clip": 0.01106932, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.03508019, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.038226017442531, + "language_loss": 0.68133175, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70280063, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 13815, + "time_per_iteration": 2.434314250946045 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.01693177, + "balance_loss_mlp": 1.03570247, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.8066947340112232, + "language_loss": 0.75933707, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.78063387, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 13816, + "time_per_iteration": 2.442898750305176 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.01845741, + "balance_loss_mlp": 1.03341258, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.7055249793285534, + "language_loss": 0.81462383, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83590031, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13817, + "time_per_iteration": 2.481618881225586 + }, + { + "auxiliary_loss_clip": 0.01102205, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.02211809, + "balance_loss_mlp": 1.03496206, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.6134967348632454, + "language_loss": 0.78043187, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80179971, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 13818, + "time_per_iteration": 2.456963539123535 + }, + { + "auxiliary_loss_clip": 0.01102673, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.02229691, + "balance_loss_mlp": 1.03631234, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.7992893057901929, + "language_loss": 0.82007933, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84144515, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13819, + "time_per_iteration": 2.4531350135803223 + }, + { + "auxiliary_loss_clip": 0.01023094, + "auxiliary_loss_mlp": 0.01001492, + "balance_loss_clip": 1.00050247, + "balance_loss_mlp": 1.00293744, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7540383320597264, + "language_loss": 0.56269968, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58294547, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 13820, + "time_per_iteration": 3.07944917678833 + }, + { + "auxiliary_loss_clip": 0.01096009, + "auxiliary_loss_mlp": 0.01021975, + "balance_loss_clip": 1.01050711, + "balance_loss_mlp": 1.03318071, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.8012504635990183, + "language_loss": 0.68316829, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70434809, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62890625, + "step": 13821, + "time_per_iteration": 2.4352285861968994 + }, + { + "auxiliary_loss_clip": 0.01101835, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.01752543, + "balance_loss_mlp": 1.03386974, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 2.554596650493425, + "language_loss": 0.68782902, + "learning_rate": 2.920240002333625e-07, + "loss": 0.70914102, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 13822, + "time_per_iteration": 2.4268665313720703 + }, + { + "auxiliary_loss_clip": 0.01097449, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.02247906, + "balance_loss_mlp": 1.0342207, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.735748965364619, + "language_loss": 0.62030697, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64161813, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 13823, + "time_per_iteration": 2.5616307258605957 + }, + { + "auxiliary_loss_clip": 0.01022918, + "auxiliary_loss_mlp": 0.01006024, + "balance_loss_clip": 1.00502288, + "balance_loss_mlp": 1.00274229, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 1.0144184118719066, + "language_loss": 0.61859858, + "learning_rate": 2.916188616354669e-07, + "loss": 0.638888, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20214844, + "step": 13824, + "time_per_iteration": 3.1341047286987305 + }, + { + "auxiliary_loss_clip": 0.01099745, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.01800573, + "balance_loss_mlp": 1.03508544, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.9203244405099236, + "language_loss": 0.7410804, + "learning_rate": 2.914163895056552e-07, + "loss": 0.7623716, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13825, + "time_per_iteration": 2.4680562019348145 + }, + { + "auxiliary_loss_clip": 0.01100678, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.01961005, + "balance_loss_mlp": 1.03404522, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 2.6831981740546804, + "language_loss": 0.80436289, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82568467, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 13826, + "time_per_iteration": 2.3986868858337402 + }, + { + "auxiliary_loss_clip": 0.01099812, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.01911521, + "balance_loss_mlp": 1.03373289, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.6899698204069646, + "language_loss": 0.67370605, + "learning_rate": 2.910116396226914e-07, + "loss": 0.6950143, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13827, + "time_per_iteration": 2.556210517883301 + }, + { + "auxiliary_loss_clip": 0.01097618, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.01631558, + "balance_loss_mlp": 1.03311372, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.8718519853122935, + "language_loss": 0.73761111, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.75885636, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.64453125, + "step": 13828, + "time_per_iteration": 2.402517557144165 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02159595, + "balance_loss_mlp": 1.03244913, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 5.897206901344671, + "language_loss": 0.67066121, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69198495, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13829, + "time_per_iteration": 2.6429443359375 + }, + { + "auxiliary_loss_clip": 0.01102016, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01768827, + "balance_loss_mlp": 1.03440702, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 3.810275572521135, + "language_loss": 0.82567447, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84699351, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 13830, + "time_per_iteration": 2.4443130493164062 + }, + { + "auxiliary_loss_clip": 0.01098309, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.0173049, + "balance_loss_mlp": 1.03295636, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 1.9438123973766057, + "language_loss": 0.74598849, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76726484, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 13831, + "time_per_iteration": 2.4210164546966553 + }, + { + "auxiliary_loss_clip": 0.01100189, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.01938343, + "balance_loss_mlp": 1.03486538, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.6326296110145166, + "language_loss": 0.71145892, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73277545, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 13832, + "time_per_iteration": 2.394432306289673 + }, + { + "auxiliary_loss_clip": 0.01097955, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.01818514, + "balance_loss_mlp": 1.03306818, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.671736140785639, + "language_loss": 0.84483445, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86611187, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 13833, + "time_per_iteration": 2.504239797592163 + }, + { + "auxiliary_loss_clip": 0.0110191, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.02147341, + "balance_loss_mlp": 1.03484035, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.54787905183348, + "language_loss": 0.7613343, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.7826916, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13834, + "time_per_iteration": 2.4704878330230713 + }, + { + "auxiliary_loss_clip": 0.01095699, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01751471, + "balance_loss_mlp": 1.0322547, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.046199004722401, + "language_loss": 0.79697442, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81822026, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 13835, + "time_per_iteration": 2.40838623046875 + }, + { + "auxiliary_loss_clip": 0.01106083, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.02129853, + "balance_loss_mlp": 1.03730321, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.7818971205631189, + "language_loss": 0.80744654, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82885015, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 13836, + "time_per_iteration": 2.4172093868255615 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.01750398, + "balance_loss_mlp": 1.03422546, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 1.9244544867437152, + "language_loss": 0.77690089, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79817367, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13837, + "time_per_iteration": 2.4101178646087646 + }, + { + "auxiliary_loss_clip": 0.01103421, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.01641822, + "balance_loss_mlp": 1.03414893, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.6930309583903163, + "language_loss": 0.8365382, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85786849, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69140625, + "step": 13838, + "time_per_iteration": 2.4097585678100586 + }, + { + "auxiliary_loss_clip": 0.01098918, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.01960075, + "balance_loss_mlp": 1.03409493, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.1542389806886266, + "language_loss": 0.74221098, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76351881, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 13839, + "time_per_iteration": 2.4400813579559326 + }, + { + "auxiliary_loss_clip": 0.01101691, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.01941919, + "balance_loss_mlp": 1.03564334, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.4115307011587832, + "language_loss": 0.67430389, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69563287, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 13840, + "time_per_iteration": 2.5372142791748047 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.01557088, + "balance_loss_mlp": 1.0327549, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 1.8532994012873067, + "language_loss": 0.79538697, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81665289, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13841, + "time_per_iteration": 2.406419277191162 + }, + { + "auxiliary_loss_clip": 0.01098521, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.01594353, + "balance_loss_mlp": 1.03416336, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.9703155224282078, + "language_loss": 0.68665957, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70792133, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13842, + "time_per_iteration": 2.4088361263275146 + }, + { + "auxiliary_loss_clip": 0.01099099, + "auxiliary_loss_mlp": 0.01025711, + "balance_loss_clip": 1.01326537, + "balance_loss_mlp": 1.03484809, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 2.252583895579188, + "language_loss": 0.73118508, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75243318, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.64453125, + "step": 13843, + "time_per_iteration": 2.4760208129882812 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.03649437, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.8153804396647903, + "language_loss": 0.77374804, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79505873, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 13844, + "time_per_iteration": 2.3891868591308594 + }, + { + "auxiliary_loss_clip": 0.01022573, + "auxiliary_loss_mlp": 0.00999494, + "balance_loss_clip": 0.99842119, + "balance_loss_mlp": 1.00265634, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.8581285826544858, + "language_loss": 0.55275869, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57297933, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.19921875, + "step": 13845, + "time_per_iteration": 4.351477861404419 + }, + { + "auxiliary_loss_clip": 0.011024, + "auxiliary_loss_mlp": 0.01043293, + "balance_loss_clip": 1.03099656, + "balance_loss_mlp": 1.03529, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 2.3644606259383782, + "language_loss": 0.75436401, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77582097, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13846, + "time_per_iteration": 2.4772729873657227 + }, + { + "auxiliary_loss_clip": 0.0110107, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.01367104, + "balance_loss_mlp": 1.03235412, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.56529249468272, + "language_loss": 0.78832293, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80959988, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 13847, + "time_per_iteration": 3.8999733924865723 + }, + { + "auxiliary_loss_clip": 0.01099196, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.01703548, + "balance_loss_mlp": 1.03460264, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.5773716692897488, + "language_loss": 0.74506044, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76633298, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13848, + "time_per_iteration": 2.4543795585632324 + }, + { + "auxiliary_loss_clip": 0.01099371, + "auxiliary_loss_mlp": 0.01028794, + "balance_loss_clip": 1.01737964, + "balance_loss_mlp": 1.03426051, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 2.274626323524008, + "language_loss": 0.63361812, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65489972, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 13849, + "time_per_iteration": 2.4305379390716553 + }, + { + "auxiliary_loss_clip": 0.01101578, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.01899529, + "balance_loss_mlp": 1.03342891, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.028058790500836, + "language_loss": 0.79350019, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81482327, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13850, + "time_per_iteration": 3.875143527984619 + }, + { + "auxiliary_loss_clip": 0.01095589, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.02058375, + "balance_loss_mlp": 1.03317165, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.619138170366384, + "language_loss": 0.7828756, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80414331, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 13851, + "time_per_iteration": 2.435199499130249 + }, + { + "auxiliary_loss_clip": 0.01022819, + "auxiliary_loss_mlp": 0.00999775, + "balance_loss_clip": 0.99877405, + "balance_loss_mlp": 1.00280309, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7623097192798404, + "language_loss": 0.55791199, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57813787, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 13852, + "time_per_iteration": 3.025131940841675 + }, + { + "auxiliary_loss_clip": 0.01098525, + "auxiliary_loss_mlp": 0.01027782, + "balance_loss_clip": 1.01639128, + "balance_loss_mlp": 1.034675, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 2.229501971998781, + "language_loss": 0.67093384, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69219691, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 13853, + "time_per_iteration": 2.560558319091797 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.01892853, + "balance_loss_mlp": 1.03612375, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.5146603164888313, + "language_loss": 0.78381944, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80514002, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13854, + "time_per_iteration": 2.4560956954956055 + }, + { + "auxiliary_loss_clip": 0.01022713, + "auxiliary_loss_mlp": 0.01004861, + "balance_loss_clip": 1.0038538, + "balance_loss_mlp": 1.00278306, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7920815382427507, + "language_loss": 0.58700705, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60728288, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 13855, + "time_per_iteration": 2.948824644088745 + }, + { + "auxiliary_loss_clip": 0.01099595, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01669788, + "balance_loss_mlp": 1.03500164, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.6173618311844826, + "language_loss": 0.71731192, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73858917, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 13856, + "time_per_iteration": 2.4495739936828613 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01028183, + "balance_loss_clip": 1.01712024, + "balance_loss_mlp": 1.03420722, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.5263372259770802, + "language_loss": 0.7549566, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77622676, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13857, + "time_per_iteration": 2.48952317237854 + }, + { + "auxiliary_loss_clip": 0.01096823, + "auxiliary_loss_mlp": 0.01024887, + "balance_loss_clip": 1.01476622, + "balance_loss_mlp": 1.03515077, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.5444412658086444, + "language_loss": 0.7369523, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75816941, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6171875, + "step": 13858, + "time_per_iteration": 2.415557622909546 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.02144766, + "balance_loss_mlp": 1.03377628, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 2.882415759044888, + "language_loss": 0.73106527, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75243539, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 13859, + "time_per_iteration": 2.4257359504699707 + }, + { + "auxiliary_loss_clip": 0.01095625, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.01519704, + "balance_loss_mlp": 1.03375316, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.5687647819077657, + "language_loss": 0.79128706, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81250489, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6171875, + "step": 13860, + "time_per_iteration": 2.4974732398986816 + }, + { + "auxiliary_loss_clip": 0.01098794, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.0195545, + "balance_loss_mlp": 1.03446364, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.3446987096188727, + "language_loss": 0.82059264, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84189403, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 13861, + "time_per_iteration": 2.5322492122650146 + }, + { + "auxiliary_loss_clip": 0.01103178, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.02115631, + "balance_loss_mlp": 1.03720117, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.926026515251472, + "language_loss": 0.78863573, + "learning_rate": 2.839705324021806e-07, + "loss": 0.80999571, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 13862, + "time_per_iteration": 2.4110610485076904 + }, + { + "auxiliary_loss_clip": 0.01099524, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.02301443, + "balance_loss_mlp": 1.03292191, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 1.865354968291114, + "language_loss": 0.75375336, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77509862, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 13863, + "time_per_iteration": 2.45611572265625 + }, + { + "auxiliary_loss_clip": 0.01098316, + "auxiliary_loss_mlp": 0.01028979, + "balance_loss_clip": 1.01801133, + "balance_loss_mlp": 1.03540921, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.920899657352717, + "language_loss": 0.74782169, + "learning_rate": 2.835705879864232e-07, + "loss": 0.76909465, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13864, + "time_per_iteration": 2.531675100326538 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.02045846, + "balance_loss_mlp": 1.03386116, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.9906253642830378, + "language_loss": 0.69348955, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71481282, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 13865, + "time_per_iteration": 2.471926689147949 + }, + { + "auxiliary_loss_clip": 0.01100902, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01927531, + "balance_loss_mlp": 1.03523529, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.8969671678573263, + "language_loss": 0.7543878, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77570498, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13866, + "time_per_iteration": 2.574395179748535 + }, + { + "auxiliary_loss_clip": 0.01022788, + "auxiliary_loss_mlp": 0.01000732, + "balance_loss_clip": 0.99971908, + "balance_loss_mlp": 1.00291204, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8706720724584954, + "language_loss": 0.63136578, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65160096, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 13867, + "time_per_iteration": 2.934981107711792 + }, + { + "auxiliary_loss_clip": 0.01096579, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.02190161, + "balance_loss_mlp": 1.03341174, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.5625976784768958, + "language_loss": 0.71867061, + "learning_rate": 2.827714802616301e-07, + "loss": 0.73996377, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13868, + "time_per_iteration": 2.4451518058776855 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.01893783, + "balance_loss_mlp": 1.03663313, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.355395480855395, + "language_loss": 0.80121469, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82253754, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 13869, + "time_per_iteration": 2.543779134750366 + }, + { + "auxiliary_loss_clip": 0.01100171, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.01947522, + "balance_loss_mlp": 1.03504825, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 2.3494726430887423, + "language_loss": 0.82560599, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84692031, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 13870, + "time_per_iteration": 2.461456775665283 + }, + { + "auxiliary_loss_clip": 0.01100457, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.01389122, + "balance_loss_mlp": 1.03311634, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.2704467550194503, + "language_loss": 0.70611966, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72738612, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13871, + "time_per_iteration": 2.422774076461792 + }, + { + "auxiliary_loss_clip": 0.01099073, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.02315414, + "balance_loss_mlp": 1.03536725, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 2.1800938394857257, + "language_loss": 0.68849045, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.70982766, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13872, + "time_per_iteration": 2.4503636360168457 + }, + { + "auxiliary_loss_clip": 0.01099674, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.01330113, + "balance_loss_mlp": 1.03414083, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 2.0378209067910906, + "language_loss": 0.73376065, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75500453, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13873, + "time_per_iteration": 2.446756601333618 + }, + { + "auxiliary_loss_clip": 0.01101733, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.03445745, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.1330772201747354, + "language_loss": 0.75205374, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77339536, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.671875, + "step": 13874, + "time_per_iteration": 2.462092638015747 + }, + { + "auxiliary_loss_clip": 0.01097984, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.01381576, + "balance_loss_mlp": 1.03387237, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.868204921667949, + "language_loss": 0.65978831, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68101668, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13875, + "time_per_iteration": 2.5084481239318848 + }, + { + "auxiliary_loss_clip": 0.01100848, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.02417028, + "balance_loss_mlp": 1.0363009, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.702367531378977, + "language_loss": 0.79506415, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.8164289, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13876, + "time_per_iteration": 2.468416213989258 + }, + { + "auxiliary_loss_clip": 0.01098276, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.01830101, + "balance_loss_mlp": 1.03345704, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 2.017532835470735, + "language_loss": 0.87241477, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89370072, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 13877, + "time_per_iteration": 2.4826865196228027 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01026775, + "balance_loss_clip": 1.01605773, + "balance_loss_mlp": 1.03529167, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.8556989537670767, + "language_loss": 0.6919421, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71321428, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 13878, + "time_per_iteration": 2.4149510860443115 + }, + { + "auxiliary_loss_clip": 0.01097301, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.01760721, + "balance_loss_mlp": 1.03321266, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 2.151087200806259, + "language_loss": 0.79375225, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81501174, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13879, + "time_per_iteration": 2.4393885135650635 + }, + { + "auxiliary_loss_clip": 0.01098676, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.03434122, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 1.9366754289118486, + "language_loss": 0.83347481, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85476983, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 13880, + "time_per_iteration": 2.4007508754730225 + }, + { + "auxiliary_loss_clip": 0.01102102, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.01709139, + "balance_loss_mlp": 1.03468037, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.5189652772607405, + "language_loss": 0.78158617, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80289149, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 13881, + "time_per_iteration": 2.486856698989868 + }, + { + "auxiliary_loss_clip": 0.01094738, + "auxiliary_loss_mlp": 0.01023668, + "balance_loss_clip": 1.01344538, + "balance_loss_mlp": 1.03388488, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.7414546532232285, + "language_loss": 0.78763664, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80882066, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.609375, + "step": 13882, + "time_per_iteration": 2.383542060852051 + }, + { + "auxiliary_loss_clip": 0.01102782, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.02697921, + "balance_loss_mlp": 1.03675485, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.7318453310688504, + "language_loss": 0.80458236, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82599366, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 13883, + "time_per_iteration": 2.4578230381011963 + }, + { + "auxiliary_loss_clip": 0.01100881, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.01879406, + "balance_loss_mlp": 1.03334713, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 2.0429837151334795, + "language_loss": 0.74506301, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76637912, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 13884, + "time_per_iteration": 2.4351799488067627 + }, + { + "auxiliary_loss_clip": 0.01107845, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.02128029, + "balance_loss_mlp": 1.03687263, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.6804118695495678, + "language_loss": 0.70060503, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72203082, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 13885, + "time_per_iteration": 2.4579498767852783 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01899576, + "balance_loss_mlp": 1.03498375, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.6036472675967848, + "language_loss": 0.69851661, + "learning_rate": 2.791883957449912e-07, + "loss": 0.7198388, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13886, + "time_per_iteration": 2.5490212440490723 + }, + { + "auxiliary_loss_clip": 0.01099122, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.01703906, + "balance_loss_mlp": 1.03454471, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.8702911188252411, + "language_loss": 0.79043454, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81171501, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 13887, + "time_per_iteration": 3.8734936714172363 + }, + { + "auxiliary_loss_clip": 0.01105837, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.01753235, + "balance_loss_mlp": 1.03689051, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.4281394961126277, + "language_loss": 0.64525139, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66661584, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69140625, + "step": 13888, + "time_per_iteration": 5.29352068901062 + }, + { + "auxiliary_loss_clip": 0.01102274, + "auxiliary_loss_mlp": 0.01026091, + "balance_loss_clip": 1.01442051, + "balance_loss_mlp": 1.03387511, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.253647584717518, + "language_loss": 0.6737141, + "learning_rate": 2.785932692855244e-07, + "loss": 0.69499779, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 13889, + "time_per_iteration": 2.4108006954193115 + }, + { + "auxiliary_loss_clip": 0.01096996, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01737046, + "balance_loss_mlp": 1.03261387, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 2.354736247882719, + "language_loss": 0.68670756, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70796412, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13890, + "time_per_iteration": 2.4558017253875732 + }, + { + "auxiliary_loss_clip": 0.01100731, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01932621, + "balance_loss_mlp": 1.03489208, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.3411484544759187, + "language_loss": 0.58889383, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61021388, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 13891, + "time_per_iteration": 2.4097304344177246 + }, + { + "auxiliary_loss_clip": 0.01100517, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.02039945, + "balance_loss_mlp": 1.03453255, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.7015467014644545, + "language_loss": 0.71564895, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73696935, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13892, + "time_per_iteration": 3.8961503505706787 + }, + { + "auxiliary_loss_clip": 0.01096459, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01702511, + "balance_loss_mlp": 1.03270864, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.5839366576450844, + "language_loss": 0.66044503, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68169999, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 13893, + "time_per_iteration": 2.5053062438964844 + }, + { + "auxiliary_loss_clip": 0.01097033, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01406002, + "balance_loss_mlp": 1.03166842, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 1.9915669403341283, + "language_loss": 0.78155309, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80277747, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13894, + "time_per_iteration": 2.472388505935669 + }, + { + "auxiliary_loss_clip": 0.01096943, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.01453519, + "balance_loss_mlp": 1.03513694, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.6322720137686266, + "language_loss": 0.72857749, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74980593, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6171875, + "step": 13895, + "time_per_iteration": 2.440232515335083 + }, + { + "auxiliary_loss_clip": 0.011002, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.02728426, + "balance_loss_mlp": 1.03335416, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 2.155535332480292, + "language_loss": 0.71964091, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74104589, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.66796875, + "step": 13896, + "time_per_iteration": 2.4436943531036377 + }, + { + "auxiliary_loss_clip": 0.01097879, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01523876, + "balance_loss_mlp": 1.03357804, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.3454754721795763, + "language_loss": 0.58714581, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6083886, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13897, + "time_per_iteration": 2.700792074203491 + }, + { + "auxiliary_loss_clip": 0.01022191, + "auxiliary_loss_mlp": 0.01000308, + "balance_loss_clip": 0.9993543, + "balance_loss_mlp": 1.00201261, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6979475154433069, + "language_loss": 0.57681328, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59703827, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 13898, + "time_per_iteration": 3.0732853412628174 + }, + { + "auxiliary_loss_clip": 0.01101202, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.01861334, + "balance_loss_mlp": 1.03426445, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.7459873042181069, + "language_loss": 0.79868174, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82000399, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 13899, + "time_per_iteration": 2.432739734649658 + }, + { + "auxiliary_loss_clip": 0.01100718, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.01889777, + "balance_loss_mlp": 1.03398609, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.8371741608705614, + "language_loss": 0.68867636, + "learning_rate": 2.764161667219749e-07, + "loss": 0.70998323, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 13900, + "time_per_iteration": 2.623135805130005 + }, + { + "auxiliary_loss_clip": 0.01099818, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.02103853, + "balance_loss_mlp": 1.03531981, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.5620121464910832, + "language_loss": 0.71323341, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73455364, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13901, + "time_per_iteration": 2.4815425872802734 + }, + { + "auxiliary_loss_clip": 0.01104099, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02242041, + "balance_loss_mlp": 1.03513288, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.012713035482953, + "language_loss": 0.80224025, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82363057, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 13902, + "time_per_iteration": 2.397468328475952 + }, + { + "auxiliary_loss_clip": 0.01097387, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.01762605, + "balance_loss_mlp": 1.03423131, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.5609736285597, + "language_loss": 0.62570262, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64696753, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13903, + "time_per_iteration": 2.4553894996643066 + }, + { + "auxiliary_loss_clip": 0.0110081, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.02035093, + "balance_loss_mlp": 1.03525472, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.9013104570600536, + "language_loss": 0.74193108, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76325393, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13904, + "time_per_iteration": 2.442950487136841 + }, + { + "auxiliary_loss_clip": 0.01095719, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.01960409, + "balance_loss_mlp": 1.03244185, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.7117779805733213, + "language_loss": 0.72669482, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74796671, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6328125, + "step": 13905, + "time_per_iteration": 2.4159255027770996 + }, + { + "auxiliary_loss_clip": 0.01100321, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02577984, + "balance_loss_mlp": 1.0364629, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.860066119718017, + "language_loss": 0.66428232, + "learning_rate": 2.752319888771e-07, + "loss": 0.68564951, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13906, + "time_per_iteration": 2.5206921100616455 + }, + { + "auxiliary_loss_clip": 0.01099727, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01541066, + "balance_loss_mlp": 1.03409457, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.450963295791905, + "language_loss": 0.74274147, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76400983, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13907, + "time_per_iteration": 2.4509541988372803 + }, + { + "auxiliary_loss_clip": 0.0110021, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.023036, + "balance_loss_mlp": 1.03234959, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 1.7096278514940075, + "language_loss": 0.75336194, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77471387, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13908, + "time_per_iteration": 2.473710060119629 + }, + { + "auxiliary_loss_clip": 0.01101414, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.01834011, + "balance_loss_mlp": 1.03433633, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.0866374581819676, + "language_loss": 0.70907331, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73040199, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 13909, + "time_per_iteration": 2.4959874153137207 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.02625394, + "balance_loss_mlp": 1.0341723, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 1.9337905516009115, + "language_loss": 0.73345798, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75486064, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 13910, + "time_per_iteration": 2.4417479038238525 + }, + { + "auxiliary_loss_clip": 0.01101132, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.02002239, + "balance_loss_mlp": 1.03492677, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.739738236911797, + "language_loss": 0.73179841, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75312144, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13911, + "time_per_iteration": 2.4407854080200195 + }, + { + "auxiliary_loss_clip": 0.01103906, + "auxiliary_loss_mlp": 0.01039441, + "balance_loss_clip": 1.02751374, + "balance_loss_mlp": 1.03597605, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 1.935636032044244, + "language_loss": 0.7883411, + "learning_rate": 2.740501655534946e-07, + "loss": 0.80977458, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13912, + "time_per_iteration": 2.4071462154388428 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.0182991, + "balance_loss_mlp": 1.0349431, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.6944232118607583, + "language_loss": 0.78812778, + "learning_rate": 2.738534240246797e-07, + "loss": 0.80942535, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 13913, + "time_per_iteration": 2.4921114444732666 + }, + { + "auxiliary_loss_clip": 0.0109927, + "auxiliary_loss_mlp": 0.01028996, + "balance_loss_clip": 1.01644349, + "balance_loss_mlp": 1.03274608, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 3.2282140586828243, + "language_loss": 0.73658252, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75786519, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 13914, + "time_per_iteration": 2.4990034103393555 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.02429831, + "balance_loss_mlp": 1.03516841, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.639775835358494, + "language_loss": 0.7142942, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73566371, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 13915, + "time_per_iteration": 2.5020627975463867 + }, + { + "auxiliary_loss_clip": 0.01100305, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.01923013, + "balance_loss_mlp": 1.03396535, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.83818917702025, + "language_loss": 0.72230256, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74360901, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13916, + "time_per_iteration": 2.4200260639190674 + }, + { + "auxiliary_loss_clip": 0.01101017, + "auxiliary_loss_mlp": 0.01027576, + "balance_loss_clip": 1.01569629, + "balance_loss_mlp": 1.03474307, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 2.1106769436504336, + "language_loss": 0.74262899, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76391494, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 13917, + "time_per_iteration": 2.4369187355041504 + }, + { + "auxiliary_loss_clip": 0.01097995, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.02054524, + "balance_loss_mlp": 1.03640127, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.8755136087020403, + "language_loss": 0.79014456, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81144106, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.61328125, + "step": 13918, + "time_per_iteration": 2.4455313682556152 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.02224147, + "balance_loss_mlp": 1.03523874, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.477025515229524, + "language_loss": 0.67901552, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70037591, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 13919, + "time_per_iteration": 2.4661288261413574 + }, + { + "auxiliary_loss_clip": 0.01098166, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.01685655, + "balance_loss_mlp": 1.03343344, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 3.3051256361077685, + "language_loss": 0.73841083, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75967425, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13920, + "time_per_iteration": 2.4106199741363525 + }, + { + "auxiliary_loss_clip": 0.01100689, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01916862, + "balance_loss_mlp": 1.03469241, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.6224503256845468, + "language_loss": 0.68769908, + "learning_rate": 2.722818488237566e-07, + "loss": 0.70901674, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 13921, + "time_per_iteration": 2.442763090133667 + }, + { + "auxiliary_loss_clip": 0.01103103, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.02022922, + "balance_loss_mlp": 1.03511822, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.934901742851694, + "language_loss": 0.85668844, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87803936, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13922, + "time_per_iteration": 2.4856491088867188 + }, + { + "auxiliary_loss_clip": 0.0109526, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.01823044, + "balance_loss_mlp": 1.03202581, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.6289815235404943, + "language_loss": 0.71753758, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73877978, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 13923, + "time_per_iteration": 2.433978319168091 + }, + { + "auxiliary_loss_clip": 0.01098083, + "auxiliary_loss_mlp": 0.01025149, + "balance_loss_clip": 1.01288259, + "balance_loss_mlp": 1.03416324, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 1.756856954459112, + "language_loss": 0.76217532, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78340769, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.640625, + "step": 13924, + "time_per_iteration": 2.4699859619140625 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01027408, + "balance_loss_clip": 1.01606488, + "balance_loss_mlp": 1.03462529, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.572870754789481, + "language_loss": 0.64186335, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66312575, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13925, + "time_per_iteration": 2.501033067703247 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01933801, + "balance_loss_mlp": 1.03535914, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 2.4566320285291625, + "language_loss": 0.74334025, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76467204, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 13926, + "time_per_iteration": 2.47856068611145 + }, + { + "auxiliary_loss_clip": 0.01102753, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02346361, + "balance_loss_mlp": 1.03702879, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.6715120452559071, + "language_loss": 0.71465194, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73603582, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13927, + "time_per_iteration": 2.411735773086548 + }, + { + "auxiliary_loss_clip": 0.01022785, + "auxiliary_loss_mlp": 0.01002585, + "balance_loss_clip": 1.00157166, + "balance_loss_mlp": 1.00261497, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.8374200555730595, + "language_loss": 0.58843565, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60868931, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 13928, + "time_per_iteration": 4.569923639297485 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01919854, + "balance_loss_mlp": 1.0377028, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.6670643605711446, + "language_loss": 0.69916427, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72051352, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 13929, + "time_per_iteration": 2.450941801071167 + }, + { + "auxiliary_loss_clip": 0.01103074, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.01532209, + "balance_loss_mlp": 1.0351336, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.4407137482124839, + "language_loss": 0.6694839, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69079208, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13930, + "time_per_iteration": 4.168683052062988 + }, + { + "auxiliary_loss_clip": 0.01100625, + "auxiliary_loss_mlp": 0.01028404, + "balance_loss_clip": 1.01687002, + "balance_loss_mlp": 1.03649974, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.9250443938302013, + "language_loss": 0.71341848, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73470879, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 13931, + "time_per_iteration": 2.4318478107452393 + }, + { + "auxiliary_loss_clip": 0.0109844, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.01900148, + "balance_loss_mlp": 1.03423381, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.6115485766829456, + "language_loss": 0.71996433, + "learning_rate": 2.701277800409705e-07, + "loss": 0.7412523, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 13932, + "time_per_iteration": 2.4718666076660156 + }, + { + "auxiliary_loss_clip": 0.01097692, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01924706, + "balance_loss_mlp": 1.03308678, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.572463429726218, + "language_loss": 0.66981155, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69108832, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 13933, + "time_per_iteration": 2.462989091873169 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.02439857, + "balance_loss_mlp": 1.03577971, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.9819338387703926, + "language_loss": 0.76037461, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78172362, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13934, + "time_per_iteration": 3.8440794944763184 + }, + { + "auxiliary_loss_clip": 0.01101761, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.01658523, + "balance_loss_mlp": 1.03747773, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 2.4023414494461206, + "language_loss": 0.77042425, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79172337, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 13935, + "time_per_iteration": 2.4580769538879395 + }, + { + "auxiliary_loss_clip": 0.01099502, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.01969159, + "balance_loss_mlp": 1.0334816, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 2.54000512074222, + "language_loss": 0.55758452, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.57889438, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 13936, + "time_per_iteration": 2.3995320796966553 + }, + { + "auxiliary_loss_clip": 0.01097268, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01752198, + "balance_loss_mlp": 1.03285074, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.7997670475804433, + "language_loss": 0.89385533, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91511238, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 13937, + "time_per_iteration": 2.3957390785217285 + }, + { + "auxiliary_loss_clip": 0.0110113, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.01721907, + "balance_loss_mlp": 1.03569484, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.9085284321860068, + "language_loss": 0.81626403, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83755875, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13938, + "time_per_iteration": 2.462914228439331 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.01986718, + "balance_loss_mlp": 1.03539038, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.9830063454594962, + "language_loss": 0.70170665, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72304463, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 13939, + "time_per_iteration": 2.419306516647339 + }, + { + "auxiliary_loss_clip": 0.01103831, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.02409744, + "balance_loss_mlp": 1.03695917, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 2.042221419683719, + "language_loss": 0.76166761, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78306901, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 13940, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01098815, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.01821637, + "balance_loss_mlp": 1.03480721, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.701308985819195, + "language_loss": 0.76258647, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78387022, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13941, + "time_per_iteration": 2.471020221710205 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01569009, + "balance_loss_mlp": 1.03386378, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.8349668095914025, + "language_loss": 0.73475212, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75605369, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 13942, + "time_per_iteration": 2.468085527420044 + }, + { + "auxiliary_loss_clip": 0.01107356, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.02313805, + "balance_loss_mlp": 1.03645301, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.5830946628416007, + "language_loss": 0.7929855, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81441456, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 13943, + "time_per_iteration": 2.46358323097229 + }, + { + "auxiliary_loss_clip": 0.01098177, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.01794708, + "balance_loss_mlp": 1.03353262, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.9364854402368852, + "language_loss": 0.85158527, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87285936, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13944, + "time_per_iteration": 2.442012310028076 + }, + { + "auxiliary_loss_clip": 0.01022937, + "auxiliary_loss_mlp": 0.01002153, + "balance_loss_clip": 1.00108051, + "balance_loss_mlp": 1.00304651, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6210896800170687, + "language_loss": 0.50280273, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52305365, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.19921875, + "step": 13945, + "time_per_iteration": 3.166820526123047 + }, + { + "auxiliary_loss_clip": 0.01098094, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.0179286, + "balance_loss_mlp": 1.03397751, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 1.8419185707683658, + "language_loss": 0.6506319, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67190349, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13946, + "time_per_iteration": 2.4629406929016113 + }, + { + "auxiliary_loss_clip": 0.0109974, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.01890647, + "balance_loss_mlp": 1.03315461, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.5727213172282053, + "language_loss": 0.67289019, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69419944, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 13947, + "time_per_iteration": 2.4871394634246826 + }, + { + "auxiliary_loss_clip": 0.01101642, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.01655436, + "balance_loss_mlp": 1.03533816, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.4208449303436252, + "language_loss": 0.69888943, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72019202, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13948, + "time_per_iteration": 2.5325706005096436 + }, + { + "auxiliary_loss_clip": 0.01096897, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.01995289, + "balance_loss_mlp": 1.03378117, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 2.1940873483336927, + "language_loss": 0.84753001, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.86880571, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13949, + "time_per_iteration": 2.4774601459503174 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01026375, + "balance_loss_clip": 1.01491952, + "balance_loss_mlp": 1.03441536, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 1.849770284110971, + "language_loss": 0.70397264, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72521639, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 13950, + "time_per_iteration": 2.4515769481658936 + }, + { + "auxiliary_loss_clip": 0.01098204, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.014979, + "balance_loss_mlp": 1.03368354, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.91426323205629, + "language_loss": 0.64385873, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66510665, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13951, + "time_per_iteration": 2.413670539855957 + }, + { + "auxiliary_loss_clip": 0.01100218, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.0197382, + "balance_loss_mlp": 1.03516793, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.4096665039754765, + "language_loss": 0.69953537, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72084689, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 13952, + "time_per_iteration": 2.4785561561584473 + }, + { + "auxiliary_loss_clip": 0.01098001, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.01420259, + "balance_loss_mlp": 1.03371167, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 2.562220143199556, + "language_loss": 0.72693485, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.7481699, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13953, + "time_per_iteration": 2.4456255435943604 + }, + { + "auxiliary_loss_clip": 0.01097183, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01448941, + "balance_loss_mlp": 1.03371382, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 2.571526992442188, + "language_loss": 0.68295968, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70418859, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 13954, + "time_per_iteration": 2.4475159645080566 + }, + { + "auxiliary_loss_clip": 0.01102027, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.02216661, + "balance_loss_mlp": 1.03607535, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.8685810637104039, + "language_loss": 0.72950685, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75085771, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 13955, + "time_per_iteration": 2.4151742458343506 + }, + { + "auxiliary_loss_clip": 0.01100167, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.01495552, + "balance_loss_mlp": 1.03408492, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.2257720048014145, + "language_loss": 0.66271257, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68398464, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 13956, + "time_per_iteration": 2.472264528274536 + }, + { + "auxiliary_loss_clip": 0.0110128, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.01721215, + "balance_loss_mlp": 1.03407979, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.6130326255768752, + "language_loss": 0.79156423, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.8128742, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13957, + "time_per_iteration": 2.5556459426879883 + }, + { + "auxiliary_loss_clip": 0.01022919, + "auxiliary_loss_mlp": 0.00998362, + "balance_loss_clip": 0.99737281, + "balance_loss_mlp": 1.00284195, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7468906710364033, + "language_loss": 0.53393608, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55414885, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 13958, + "time_per_iteration": 3.1345131397247314 + }, + { + "auxiliary_loss_clip": 0.01098889, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01889539, + "balance_loss_mlp": 1.03390813, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.8191948509907279, + "language_loss": 0.73426306, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75556076, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 13959, + "time_per_iteration": 2.422290802001953 + }, + { + "auxiliary_loss_clip": 0.01096696, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.01730168, + "balance_loss_mlp": 1.03430462, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.7739336308149691, + "language_loss": 0.55481756, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57606781, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 13960, + "time_per_iteration": 2.48284649848938 + }, + { + "auxiliary_loss_clip": 0.01023374, + "auxiliary_loss_mlp": 0.00998479, + "balance_loss_clip": 0.99740618, + "balance_loss_mlp": 1.00327396, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7837966547782983, + "language_loss": 0.60692465, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.6271432, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20117188, + "step": 13961, + "time_per_iteration": 3.1125965118408203 + }, + { + "auxiliary_loss_clip": 0.01096869, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01716423, + "balance_loss_mlp": 1.03198576, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.2318920850341626, + "language_loss": 0.68340284, + "learning_rate": 2.642934178894405e-07, + "loss": 0.7046544, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13962, + "time_per_iteration": 2.3924074172973633 + }, + { + "auxiliary_loss_clip": 0.01099112, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01776314, + "balance_loss_mlp": 1.03186655, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.8739474188933585, + "language_loss": 0.73263037, + "learning_rate": 2.640999582304841e-07, + "loss": 0.7539143, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 13963, + "time_per_iteration": 2.411219835281372 + }, + { + "auxiliary_loss_clip": 0.01100071, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.02403665, + "balance_loss_mlp": 1.03410983, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.7340708805723295, + "language_loss": 0.76229376, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78364658, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13964, + "time_per_iteration": 2.478710412979126 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.02302241, + "balance_loss_mlp": 1.03555202, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 1.9577585475000066, + "language_loss": 0.78094041, + "learning_rate": 2.637132363964161e-07, + "loss": 0.8023355, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 13965, + "time_per_iteration": 2.4036173820495605 + }, + { + "auxiliary_loss_clip": 0.0109763, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01681042, + "balance_loss_mlp": 1.03346133, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 2.0786855194651714, + "language_loss": 0.66062534, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68188184, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13966, + "time_per_iteration": 2.558805465698242 + }, + { + "auxiliary_loss_clip": 0.0109852, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.02015817, + "balance_loss_mlp": 1.03405714, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.664869225278249, + "language_loss": 0.74680585, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76810688, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 13967, + "time_per_iteration": 2.4877943992614746 + }, + { + "auxiliary_loss_clip": 0.01099282, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.03487408, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 2.2246871986534464, + "language_loss": 0.83141935, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85271138, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13968, + "time_per_iteration": 2.4127590656280518 + }, + { + "auxiliary_loss_clip": 0.01101548, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.02051806, + "balance_loss_mlp": 1.03508413, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 2.3055427477830177, + "language_loss": 0.77584493, + "learning_rate": 2.629405828689075e-07, + "loss": 0.7971831, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13969, + "time_per_iteration": 2.531064033508301 + }, + { + "auxiliary_loss_clip": 0.01101785, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01506102, + "balance_loss_mlp": 1.03373933, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.0172098119566026, + "language_loss": 0.77522105, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79651541, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 13970, + "time_per_iteration": 3.822666645050049 + }, + { + "auxiliary_loss_clip": 0.01099108, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.03340948, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 2.67779469297833, + "language_loss": 0.72165084, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74299651, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13971, + "time_per_iteration": 2.4577131271362305 + }, + { + "auxiliary_loss_clip": 0.01098585, + "auxiliary_loss_mlp": 0.01027489, + "balance_loss_clip": 1.01532924, + "balance_loss_mlp": 1.03278506, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.789878655946985, + "language_loss": 0.77530694, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79656768, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 13972, + "time_per_iteration": 3.873415231704712 + }, + { + "auxiliary_loss_clip": 0.01097007, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.02161694, + "balance_loss_mlp": 1.03390861, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.3193607521155475, + "language_loss": 0.68169355, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70298827, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13973, + "time_per_iteration": 2.4711923599243164 + }, + { + "auxiliary_loss_clip": 0.01101565, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.01435125, + "balance_loss_mlp": 1.03448188, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 1.7672136732051997, + "language_loss": 0.78160721, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80288553, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 13974, + "time_per_iteration": 2.471079111099243 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.01623559, + "balance_loss_mlp": 1.03378248, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5307501789030493, + "language_loss": 0.72512347, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74639237, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13975, + "time_per_iteration": 3.9028189182281494 + }, + { + "auxiliary_loss_clip": 0.01097964, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01351976, + "balance_loss_mlp": 1.03358281, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.6696381164550365, + "language_loss": 0.72594655, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74717832, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 13976, + "time_per_iteration": 2.5141208171844482 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01650858, + "balance_loss_mlp": 1.0321219, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.6819185940654011, + "language_loss": 0.72135288, + "learning_rate": 2.61398438016311e-07, + "loss": 0.7425878, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13977, + "time_per_iteration": 2.50201153755188 + }, + { + "auxiliary_loss_clip": 0.0109805, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.01634645, + "balance_loss_mlp": 1.03184259, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.3910921422626445, + "language_loss": 0.68459249, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70585227, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13978, + "time_per_iteration": 2.5799074172973633 + }, + { + "auxiliary_loss_clip": 0.01094581, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.0161258, + "balance_loss_mlp": 1.03286028, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.6895591936246208, + "language_loss": 0.77976441, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80098057, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6171875, + "step": 13979, + "time_per_iteration": 2.4045827388763428 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.01674402, + "balance_loss_mlp": 1.03574336, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 1.8475240517602953, + "language_loss": 0.77947694, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80076307, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13980, + "time_per_iteration": 2.4739646911621094 + }, + { + "auxiliary_loss_clip": 0.01098895, + "auxiliary_loss_mlp": 0.01026959, + "balance_loss_clip": 1.01647449, + "balance_loss_mlp": 1.0354929, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.5275513384227286, + "language_loss": 0.86409223, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88535082, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 13981, + "time_per_iteration": 2.4817118644714355 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01028176, + "balance_loss_clip": 1.01689255, + "balance_loss_mlp": 1.03534365, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 2.1699458590209955, + "language_loss": 0.67915559, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.70043814, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13982, + "time_per_iteration": 2.4667885303497314 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.02090764, + "balance_loss_mlp": 1.03581178, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.6697203722458216, + "language_loss": 0.68169171, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70303488, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65234375, + "step": 13983, + "time_per_iteration": 2.5006182193756104 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01026021, + "balance_loss_clip": 1.01460695, + "balance_loss_mlp": 1.03199232, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.7762423730618389, + "language_loss": 0.78527683, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.8065238, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13984, + "time_per_iteration": 2.489741086959839 + }, + { + "auxiliary_loss_clip": 0.01095303, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02112675, + "balance_loss_mlp": 1.03048182, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 1.933267070972905, + "language_loss": 0.60296601, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62424028, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13985, + "time_per_iteration": 2.4298417568206787 + }, + { + "auxiliary_loss_clip": 0.01101526, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.01508272, + "balance_loss_mlp": 1.03452444, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.5581250113254055, + "language_loss": 0.8171947, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83848357, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13986, + "time_per_iteration": 2.475343942642212 + }, + { + "auxiliary_loss_clip": 0.01099857, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.01718867, + "balance_loss_mlp": 1.03571963, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.6939805128572716, + "language_loss": 0.65535557, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67663825, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13987, + "time_per_iteration": 2.5180106163024902 + }, + { + "auxiliary_loss_clip": 0.01099856, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02157354, + "balance_loss_mlp": 1.03494871, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 1.8313027359728804, + "language_loss": 0.67391479, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69524264, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13988, + "time_per_iteration": 2.536297559738159 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.022843, + "balance_loss_mlp": 1.03692877, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.1164994758777573, + "language_loss": 0.80786854, + "learning_rate": 2.590931332560622e-07, + "loss": 0.82927155, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13989, + "time_per_iteration": 2.415370225906372 + }, + { + "auxiliary_loss_clip": 0.01100037, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01601958, + "balance_loss_mlp": 1.03373408, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.624389596887663, + "language_loss": 0.75334507, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.7746222, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13990, + "time_per_iteration": 2.53814959526062 + }, + { + "auxiliary_loss_clip": 0.01092936, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.02082646, + "balance_loss_mlp": 1.03118992, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.6301250053203777, + "language_loss": 0.80746663, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82871455, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 13991, + "time_per_iteration": 2.450252056121826 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.02135789, + "balance_loss_mlp": 1.03337455, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 1.8798484826886184, + "language_loss": 0.70560163, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72689867, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 13992, + "time_per_iteration": 2.4831645488739014 + }, + { + "auxiliary_loss_clip": 0.01099899, + "auxiliary_loss_mlp": 0.01023689, + "balance_loss_clip": 1.01287127, + "balance_loss_mlp": 1.03403449, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.6703212978167075, + "language_loss": 0.76615024, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78738606, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 13993, + "time_per_iteration": 2.4382317066192627 + }, + { + "auxiliary_loss_clip": 0.01104675, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02032471, + "balance_loss_mlp": 1.03401446, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 1.8832197605446068, + "language_loss": 0.74138421, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76276791, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 13994, + "time_per_iteration": 2.5059263706207275 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.01743138, + "balance_loss_mlp": 1.03507328, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.4974162052212234, + "language_loss": 0.59372008, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61498266, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13995, + "time_per_iteration": 2.43625545501709 + }, + { + "auxiliary_loss_clip": 0.01098487, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.01815403, + "balance_loss_mlp": 1.03378701, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.8212710126456297, + "language_loss": 0.72060537, + "learning_rate": 2.577527613603163e-07, + "loss": 0.74189138, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 13996, + "time_per_iteration": 2.503129482269287 + }, + { + "auxiliary_loss_clip": 0.01099556, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01885498, + "balance_loss_mlp": 1.03361964, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.64892494823158, + "language_loss": 0.64126182, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66255158, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 13997, + "time_per_iteration": 2.450742244720459 + }, + { + "auxiliary_loss_clip": 0.01102161, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.02135468, + "balance_loss_mlp": 1.0345906, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.9556557520305535, + "language_loss": 0.82418084, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84553838, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 13998, + "time_per_iteration": 2.4852468967437744 + }, + { + "auxiliary_loss_clip": 0.01100506, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.02012753, + "balance_loss_mlp": 1.03476977, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.6783013419756503, + "language_loss": 0.80256122, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82388961, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13999, + "time_per_iteration": 2.4772074222564697 + }, + { + "auxiliary_loss_clip": 0.01102624, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.03548217, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 1.8918548509901008, + "language_loss": 0.6631999, + "learning_rate": 2.569882878592096e-07, + "loss": 0.68455726, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 14000, + "time_per_iteration": 2.4916574954986572 + }, + { + "auxiliary_loss_clip": 0.0110369, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.0162369, + "balance_loss_mlp": 1.0360285, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.439269404890624, + "language_loss": 0.79670191, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81802464, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 14001, + "time_per_iteration": 2.506103754043579 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01026653, + "balance_loss_clip": 1.01570415, + "balance_loss_mlp": 1.03333926, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.8424460389803186, + "language_loss": 0.78693283, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80818045, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 14002, + "time_per_iteration": 2.518280267715454 + }, + { + "auxiliary_loss_clip": 0.01098226, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.01548314, + "balance_loss_mlp": 1.03327775, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.4332439734479316, + "language_loss": 0.77908051, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80033976, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 14003, + "time_per_iteration": 2.537221908569336 + }, + { + "auxiliary_loss_clip": 0.01098857, + "auxiliary_loss_mlp": 0.01025752, + "balance_loss_clip": 1.01390839, + "balance_loss_mlp": 1.03347921, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.6966804421866966, + "language_loss": 0.65271151, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67395759, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 14004, + "time_per_iteration": 2.5128061771392822 + }, + { + "auxiliary_loss_clip": 0.01102661, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.01708663, + "balance_loss_mlp": 1.03558087, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 1.9855294576572216, + "language_loss": 0.75816196, + "learning_rate": 2.560341831785724e-07, + "loss": 0.77948421, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14005, + "time_per_iteration": 2.52797794342041 + }, + { + "auxiliary_loss_clip": 0.01099832, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.01821995, + "balance_loss_mlp": 1.03331256, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.6128094911025277, + "language_loss": 0.77796531, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79926783, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 14006, + "time_per_iteration": 2.4496371746063232 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.02158761, + "balance_loss_mlp": 1.03485298, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.7960476944276447, + "language_loss": 0.76950121, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79083204, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14007, + "time_per_iteration": 2.422987222671509 + }, + { + "auxiliary_loss_clip": 0.01100017, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.01662517, + "balance_loss_mlp": 1.03380418, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.9383988075415828, + "language_loss": 0.65885502, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68014228, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 14008, + "time_per_iteration": 2.5793111324310303 + }, + { + "auxiliary_loss_clip": 0.01022812, + "auxiliary_loss_mlp": 0.01000595, + "balance_loss_clip": 0.99958724, + "balance_loss_mlp": 1.00263965, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7113984001609904, + "language_loss": 0.56948996, + "learning_rate": 2.552720897550631e-07, + "loss": 0.589724, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20214844, + "step": 14009, + "time_per_iteration": 3.0907700061798096 + }, + { + "auxiliary_loss_clip": 0.01095048, + "auxiliary_loss_mlp": 0.01029218, + "balance_loss_clip": 1.01879275, + "balance_loss_mlp": 1.03142929, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.8875851862493795, + "language_loss": 0.77928913, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80053174, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 14010, + "time_per_iteration": 2.529472589492798 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.02301264, + "balance_loss_mlp": 1.03620028, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.6401509883189613, + "language_loss": 0.72421598, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74560767, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 14011, + "time_per_iteration": 2.447643280029297 + }, + { + "auxiliary_loss_clip": 0.01098963, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.0230099, + "balance_loss_mlp": 1.03326452, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.9531405231766128, + "language_loss": 0.84154844, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86288095, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 14012, + "time_per_iteration": 3.9144818782806396 + }, + { + "auxiliary_loss_clip": 0.01091927, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.0187788, + "balance_loss_mlp": 1.03218663, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.723607660424782, + "language_loss": 0.6789465, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70015401, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.59765625, + "step": 14013, + "time_per_iteration": 5.320711612701416 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01710737, + "balance_loss_mlp": 1.03486574, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 2.2453340608922003, + "language_loss": 0.78587079, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.80720234, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 14014, + "time_per_iteration": 2.436648368835449 + }, + { + "auxiliary_loss_clip": 0.01098868, + "auxiliary_loss_mlp": 0.0102707, + "balance_loss_clip": 1.0159297, + "balance_loss_mlp": 1.03430867, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.6871681799127176, + "language_loss": 0.67591381, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69717318, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 14015, + "time_per_iteration": 2.471731185913086 + }, + { + "auxiliary_loss_clip": 0.01098465, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.01645255, + "balance_loss_mlp": 1.0342679, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.6823702306015687, + "language_loss": 0.75894105, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78021133, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 14016, + "time_per_iteration": 2.5013680458068848 + }, + { + "auxiliary_loss_clip": 0.0109948, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01901984, + "balance_loss_mlp": 1.03484464, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 2.2337330694664264, + "language_loss": 0.79515624, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81645346, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14017, + "time_per_iteration": 3.9724068641662598 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.01912832, + "balance_loss_mlp": 1.03382218, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.091031083104406, + "language_loss": 0.62672061, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64800781, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14018, + "time_per_iteration": 2.411105155944824 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.02257061, + "balance_loss_mlp": 1.03304076, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.7561486170770395, + "language_loss": 0.79493165, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81625891, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14019, + "time_per_iteration": 2.4422781467437744 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01616716, + "balance_loss_mlp": 1.03396749, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7295630345102972, + "language_loss": 0.78420174, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80548632, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 14020, + "time_per_iteration": 2.504492998123169 + }, + { + "auxiliary_loss_clip": 0.01101741, + "auxiliary_loss_mlp": 0.01026853, + "balance_loss_clip": 1.0155102, + "balance_loss_mlp": 1.03599119, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.740406918389935, + "language_loss": 0.71201503, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73330098, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14021, + "time_per_iteration": 2.4488563537597656 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.02362275, + "balance_loss_mlp": 1.03446221, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.60894104728434, + "language_loss": 0.69625163, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71761978, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14022, + "time_per_iteration": 2.4879534244537354 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.02093339, + "balance_loss_mlp": 1.03596544, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 3.722986578619458, + "language_loss": 0.72199565, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74336749, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14023, + "time_per_iteration": 2.440702199935913 + }, + { + "auxiliary_loss_clip": 0.01099861, + "auxiliary_loss_mlp": 0.01033807, + "balance_loss_clip": 1.02184463, + "balance_loss_mlp": 1.03477669, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.3509908984580676, + "language_loss": 0.66908002, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69041669, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 14024, + "time_per_iteration": 2.5194430351257324 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.01882803, + "balance_loss_mlp": 1.03412294, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.8914070683276865, + "language_loss": 0.80512542, + "learning_rate": 2.522343063158261e-07, + "loss": 0.82641816, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 14025, + "time_per_iteration": 2.418902635574341 + }, + { + "auxiliary_loss_clip": 0.01096552, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.01918721, + "balance_loss_mlp": 1.03351688, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.4865758896664674, + "language_loss": 0.77659529, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79785132, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62890625, + "step": 14026, + "time_per_iteration": 2.4354312419891357 + }, + { + "auxiliary_loss_clip": 0.01099258, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01970434, + "balance_loss_mlp": 1.03534245, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.3796664446051232, + "language_loss": 0.82750577, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84880948, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 14027, + "time_per_iteration": 2.5538077354431152 + }, + { + "auxiliary_loss_clip": 0.01098136, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.02059913, + "balance_loss_mlp": 1.0331111, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.5166229721837947, + "language_loss": 0.56329119, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58458668, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 14028, + "time_per_iteration": 2.616518020629883 + }, + { + "auxiliary_loss_clip": 0.01098938, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01574659, + "balance_loss_mlp": 1.03393281, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 2.148426968087737, + "language_loss": 0.6371001, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65835404, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 14029, + "time_per_iteration": 2.473363161087036 + }, + { + "auxiliary_loss_clip": 0.01096698, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.01784623, + "balance_loss_mlp": 1.03464794, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5285969366660268, + "language_loss": 0.75408536, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77533334, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 14030, + "time_per_iteration": 2.454810380935669 + }, + { + "auxiliary_loss_clip": 0.01098147, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02121234, + "balance_loss_mlp": 1.03436208, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.7934754806619189, + "language_loss": 0.82908231, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85039353, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 14031, + "time_per_iteration": 2.4392600059509277 + }, + { + "auxiliary_loss_clip": 0.01096568, + "auxiliary_loss_mlp": 0.01027224, + "balance_loss_clip": 1.01611948, + "balance_loss_mlp": 1.03294599, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 4.6225832312305135, + "language_loss": 0.79887378, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82011175, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 14032, + "time_per_iteration": 2.4791696071624756 + }, + { + "auxiliary_loss_clip": 0.01099257, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.019454, + "balance_loss_mlp": 1.03245103, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.3665627856205167, + "language_loss": 0.75488985, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77620721, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.66796875, + "step": 14033, + "time_per_iteration": 2.4676787853240967 + }, + { + "auxiliary_loss_clip": 0.01098, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.02204669, + "balance_loss_mlp": 1.03290701, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.9639820469891438, + "language_loss": 0.83208501, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85339236, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 14034, + "time_per_iteration": 2.469835042953491 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01023747, + "balance_loss_clip": 1.01229727, + "balance_loss_mlp": 1.03501594, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.4504915159037657, + "language_loss": 0.7833904, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80463088, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14035, + "time_per_iteration": 2.50130558013916 + }, + { + "auxiliary_loss_clip": 0.0109866, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.01701045, + "balance_loss_mlp": 1.03443027, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.3198614132572242, + "language_loss": 0.72175288, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74302304, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14036, + "time_per_iteration": 2.5673165321350098 + }, + { + "auxiliary_loss_clip": 0.01094598, + "auxiliary_loss_mlp": 0.01025043, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03373456, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 2.270504860744628, + "language_loss": 0.69560575, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71680212, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.609375, + "step": 14037, + "time_per_iteration": 2.462125062942505 + }, + { + "auxiliary_loss_clip": 0.0109787, + "auxiliary_loss_mlp": 0.01023197, + "balance_loss_clip": 1.01211691, + "balance_loss_mlp": 1.03348804, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.5784558103110167, + "language_loss": 0.68976426, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71097493, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 14038, + "time_per_iteration": 2.5513498783111572 + }, + { + "auxiliary_loss_clip": 0.01099747, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.02248418, + "balance_loss_mlp": 1.03365922, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.6561315096706188, + "language_loss": 0.76345998, + "learning_rate": 2.49590162635938e-07, + "loss": 0.7848022, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 14039, + "time_per_iteration": 2.481635093688965 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01025998, + "balance_loss_clip": 1.01443481, + "balance_loss_mlp": 1.03612375, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 2.016141716862511, + "language_loss": 0.79202807, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81333232, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 14040, + "time_per_iteration": 2.501422166824341 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.01974607, + "balance_loss_mlp": 1.03618717, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.0550149763476093, + "language_loss": 0.69268221, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71401882, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 14041, + "time_per_iteration": 2.4294750690460205 + }, + { + "auxiliary_loss_clip": 0.01102612, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.0185411, + "balance_loss_mlp": 1.03519189, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.7597415592284222, + "language_loss": 0.69147003, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71279472, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 14042, + "time_per_iteration": 2.442840814590454 + }, + { + "auxiliary_loss_clip": 0.01097842, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.02081347, + "balance_loss_mlp": 1.03461182, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.6402432311205208, + "language_loss": 0.74725193, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76854777, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14043, + "time_per_iteration": 2.5701467990875244 + }, + { + "auxiliary_loss_clip": 0.01098174, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.01558638, + "balance_loss_mlp": 1.03430986, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.0792624601455127, + "language_loss": 0.71829164, + "learning_rate": 2.486489774343865e-07, + "loss": 0.73953938, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 14044, + "time_per_iteration": 2.4172301292419434 + }, + { + "auxiliary_loss_clip": 0.01095955, + "auxiliary_loss_mlp": 0.01028701, + "balance_loss_clip": 1.01734626, + "balance_loss_mlp": 1.03243351, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.5189186579429734, + "language_loss": 0.74687707, + "learning_rate": 2.484609395997559e-07, + "loss": 0.76812357, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 14045, + "time_per_iteration": 2.427867889404297 + }, + { + "auxiliary_loss_clip": 0.01098274, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.01913869, + "balance_loss_mlp": 1.03317916, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.6309917453055534, + "language_loss": 0.78394771, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80523521, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14046, + "time_per_iteration": 2.4088222980499268 + }, + { + "auxiliary_loss_clip": 0.01101869, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.01861823, + "balance_loss_mlp": 1.03469396, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 1.9427954061525838, + "language_loss": 0.7794674, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80079138, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 14047, + "time_per_iteration": 2.4330637454986572 + }, + { + "auxiliary_loss_clip": 0.01100445, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.01727068, + "balance_loss_mlp": 1.03626013, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 1.713262783377482, + "language_loss": 0.71858978, + "learning_rate": 2.478972246355935e-07, + "loss": 0.73988628, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 14048, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01099733, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01973367, + "balance_loss_mlp": 1.03443313, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.9354286067009534, + "language_loss": 0.73582602, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75713634, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 14049, + "time_per_iteration": 2.4543259143829346 + }, + { + "auxiliary_loss_clip": 0.01022781, + "auxiliary_loss_mlp": 0.00999339, + "balance_loss_clip": 0.99834388, + "balance_loss_mlp": 1.00275576, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.8078250011122586, + "language_loss": 0.60653841, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62675965, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20019531, + "step": 14050, + "time_per_iteration": 2.9804251194000244 + }, + { + "auxiliary_loss_clip": 0.01097821, + "auxiliary_loss_mlp": 0.01029422, + "balance_loss_clip": 1.01737618, + "balance_loss_mlp": 1.03271341, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 3.49479551702144, + "language_loss": 0.72012359, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74139607, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 14051, + "time_per_iteration": 2.484506368637085 + }, + { + "auxiliary_loss_clip": 0.01097541, + "auxiliary_loss_mlp": 0.01024468, + "balance_loss_clip": 1.01300573, + "balance_loss_mlp": 1.03342485, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.9860085309394724, + "language_loss": 0.74646604, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76768613, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 14052, + "time_per_iteration": 2.480921506881714 + }, + { + "auxiliary_loss_clip": 0.01092244, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.03236473, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.7502333612228071, + "language_loss": 0.7411198, + "learning_rate": 2.469590285884575e-07, + "loss": 0.76230407, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.59765625, + "step": 14053, + "time_per_iteration": 3.8867318630218506 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.01445961, + "balance_loss_mlp": 1.03528714, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.6433817636003443, + "language_loss": 0.74101913, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76226318, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14054, + "time_per_iteration": 2.4648666381835938 + }, + { + "auxiliary_loss_clip": 0.01103393, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01794338, + "balance_loss_mlp": 1.03570211, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.4033979981207616, + "language_loss": 0.78469646, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80602658, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 14055, + "time_per_iteration": 5.422392845153809 + }, + { + "auxiliary_loss_clip": 0.01097429, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.01544404, + "balance_loss_mlp": 1.0341053, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.6864328278526126, + "language_loss": 0.72890306, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75013983, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 14056, + "time_per_iteration": 2.477959394454956 + }, + { + "auxiliary_loss_clip": 0.01105764, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.02083945, + "balance_loss_mlp": 1.03714287, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.258325426074488, + "language_loss": 0.67587829, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69725907, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 14057, + "time_per_iteration": 2.414092779159546 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.01612806, + "balance_loss_mlp": 1.03506553, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.7994228407078163, + "language_loss": 0.77547145, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79676348, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 14058, + "time_per_iteration": 2.4836339950561523 + }, + { + "auxiliary_loss_clip": 0.01099526, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.0203855, + "balance_loss_mlp": 1.03322566, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.4871834521741678, + "language_loss": 0.69746482, + "learning_rate": 2.45835387101033e-07, + "loss": 0.71877742, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14059, + "time_per_iteration": 3.935227155685425 + }, + { + "auxiliary_loss_clip": 0.01103214, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.02245557, + "balance_loss_mlp": 1.03492641, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 1.763536728638446, + "language_loss": 0.57535338, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59673512, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 14060, + "time_per_iteration": 2.472986936569214 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.02237284, + "balance_loss_mlp": 1.03304076, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.5294645823993187, + "language_loss": 0.75755733, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77892435, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 14061, + "time_per_iteration": 2.459678888320923 + }, + { + "auxiliary_loss_clip": 0.01101612, + "auxiliary_loss_mlp": 0.01027088, + "balance_loss_clip": 1.01516151, + "balance_loss_mlp": 1.03408587, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.0519400058397066, + "language_loss": 0.7084868, + "learning_rate": 2.452744642558013e-07, + "loss": 0.72977388, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 14062, + "time_per_iteration": 2.4579243659973145 + }, + { + "auxiliary_loss_clip": 0.01022787, + "auxiliary_loss_mlp": 0.01001721, + "balance_loss_clip": 1.00071383, + "balance_loss_mlp": 1.00271332, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6326686900336163, + "language_loss": 0.52631342, + "learning_rate": 2.450876230433432e-07, + "loss": 0.5465585, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 14063, + "time_per_iteration": 3.0987555980682373 + }, + { + "auxiliary_loss_clip": 0.01096616, + "auxiliary_loss_mlp": 0.01023156, + "balance_loss_clip": 1.01282668, + "balance_loss_mlp": 1.03490078, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.7407026281004632, + "language_loss": 0.81590897, + "learning_rate": 2.449008483773378e-07, + "loss": 0.8371067, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6171875, + "step": 14064, + "time_per_iteration": 2.430516481399536 + }, + { + "auxiliary_loss_clip": 0.01103129, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.01860666, + "balance_loss_mlp": 1.0363518, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 1.8631632297123397, + "language_loss": 0.72349954, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74483991, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 14065, + "time_per_iteration": 2.4640002250671387 + }, + { + "auxiliary_loss_clip": 0.01098256, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.0159936, + "balance_loss_mlp": 1.03512609, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.498676898240102, + "language_loss": 0.77308834, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79434228, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 14066, + "time_per_iteration": 2.506878614425659 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.01635194, + "balance_loss_mlp": 1.03654039, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.821364194037934, + "language_loss": 0.70122147, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72251445, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 14067, + "time_per_iteration": 2.476407051086426 + }, + { + "auxiliary_loss_clip": 0.01096849, + "auxiliary_loss_mlp": 0.01025297, + "balance_loss_clip": 1.01409197, + "balance_loss_mlp": 1.03197587, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 6.126876803000313, + "language_loss": 0.7123543, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.7335757, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 14068, + "time_per_iteration": 2.541780948638916 + }, + { + "auxiliary_loss_clip": 0.01022903, + "auxiliary_loss_mlp": 0.00998547, + "balance_loss_clip": 0.997522, + "balance_loss_mlp": 1.00272095, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6989192180305637, + "language_loss": 0.60597819, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62619269, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20214844, + "step": 14069, + "time_per_iteration": 3.1510071754455566 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.01657152, + "balance_loss_mlp": 1.03431201, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.677018602985038, + "language_loss": 0.74419677, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76547223, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 14070, + "time_per_iteration": 2.4841506481170654 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.0215106, + "balance_loss_mlp": 1.03530526, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.6241169837210538, + "language_loss": 0.66860032, + "learning_rate": 2.435952896106039e-07, + "loss": 0.68992949, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 14071, + "time_per_iteration": 2.594825267791748 + }, + { + "auxiliary_loss_clip": 0.01022635, + "auxiliary_loss_mlp": 0.0099954, + "balance_loss_clip": 0.99856204, + "balance_loss_mlp": 1.00254095, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7322891811634097, + "language_loss": 0.60995638, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63017821, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.20117188, + "step": 14072, + "time_per_iteration": 2.906951427459717 + }, + { + "auxiliary_loss_clip": 0.01101338, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.01863873, + "balance_loss_mlp": 1.03404236, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.7449520639436589, + "language_loss": 0.72158128, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74291253, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 14073, + "time_per_iteration": 2.491899013519287 + }, + { + "auxiliary_loss_clip": 0.011067, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.02137196, + "balance_loss_mlp": 1.03658199, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.211703240876086, + "language_loss": 0.78310221, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80451322, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 14074, + "time_per_iteration": 2.5973291397094727 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.01937723, + "balance_loss_mlp": 1.03654218, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 1.9799370549513835, + "language_loss": 0.75153923, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77286798, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 14075, + "time_per_iteration": 2.5073180198669434 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01677096, + "balance_loss_mlp": 1.03368163, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 3.0035376812832966, + "language_loss": 0.73357224, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.7548393, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14076, + "time_per_iteration": 2.493821859359741 + }, + { + "auxiliary_loss_clip": 0.0110366, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.02139723, + "balance_loss_mlp": 1.03518665, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 1.8749402802311503, + "language_loss": 0.77490556, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79627538, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 14077, + "time_per_iteration": 2.4984679222106934 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.02190399, + "balance_loss_mlp": 1.03648067, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 2.450375908672133, + "language_loss": 0.75225329, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77363622, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 14078, + "time_per_iteration": 2.474865436553955 + }, + { + "auxiliary_loss_clip": 0.01097121, + "auxiliary_loss_mlp": 0.01024557, + "balance_loss_clip": 1.01263642, + "balance_loss_mlp": 1.03324652, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.129652796655016, + "language_loss": 0.85099643, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87221324, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 14079, + "time_per_iteration": 2.448030471801758 + }, + { + "auxiliary_loss_clip": 0.0110791, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.02166057, + "balance_loss_mlp": 1.03643155, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.083587711277689, + "language_loss": 0.58946401, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61088645, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 14080, + "time_per_iteration": 2.477292776107788 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.01849711, + "balance_loss_mlp": 1.03358364, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.2918439013364615, + "language_loss": 0.66583252, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68715435, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 14081, + "time_per_iteration": 2.496119976043701 + }, + { + "auxiliary_loss_clip": 0.011022, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.01699603, + "balance_loss_mlp": 1.03520298, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.6481433281292062, + "language_loss": 0.73019934, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75149918, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 14082, + "time_per_iteration": 2.516955614089966 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01518404, + "balance_loss_mlp": 1.0332799, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 1.8611445286872557, + "language_loss": 0.75691915, + "learning_rate": 2.413647829539809e-07, + "loss": 0.77818871, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 14083, + "time_per_iteration": 2.443368673324585 + }, + { + "auxiliary_loss_clip": 0.01103347, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.01958406, + "balance_loss_mlp": 1.03421068, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.9018584016547608, + "language_loss": 0.66331363, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68467045, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 14084, + "time_per_iteration": 2.5169622898101807 + }, + { + "auxiliary_loss_clip": 0.01103562, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01891565, + "balance_loss_mlp": 1.03716993, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 1.8997830806116998, + "language_loss": 0.69932806, + "learning_rate": 2.409939651426938e-07, + "loss": 0.72066617, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14085, + "time_per_iteration": 2.4265129566192627 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01743662, + "balance_loss_mlp": 1.03297186, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.5697280005670382, + "language_loss": 0.71030748, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73158979, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 14086, + "time_per_iteration": 2.5099709033966064 + }, + { + "auxiliary_loss_clip": 0.01099375, + "auxiliary_loss_mlp": 0.0102745, + "balance_loss_clip": 1.01607704, + "balance_loss_mlp": 1.034688, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.65755050150048, + "language_loss": 0.75040638, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77167463, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 14087, + "time_per_iteration": 2.508406639099121 + }, + { + "auxiliary_loss_clip": 0.01099258, + "auxiliary_loss_mlp": 0.01023821, + "balance_loss_clip": 1.01230574, + "balance_loss_mlp": 1.03506553, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.3368514111731342, + "language_loss": 0.7384972, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.75972795, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14088, + "time_per_iteration": 2.532632827758789 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.02231431, + "balance_loss_mlp": 1.03481674, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.8296611825235667, + "language_loss": 0.7228626, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74422771, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 14089, + "time_per_iteration": 2.451185703277588 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.0147543, + "balance_loss_mlp": 1.03443742, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.409397008066916, + "language_loss": 0.79183906, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81306958, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 14090, + "time_per_iteration": 2.4930338859558105 + }, + { + "auxiliary_loss_clip": 0.01101917, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.02512336, + "balance_loss_mlp": 1.03417242, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 1.8243760305256211, + "language_loss": 0.7671752, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.78857589, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 14091, + "time_per_iteration": 2.4504411220550537 + }, + { + "auxiliary_loss_clip": 0.01022805, + "auxiliary_loss_mlp": 0.01001176, + "balance_loss_clip": 1.00027013, + "balance_loss_mlp": 1.0028131, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8882896911860697, + "language_loss": 0.5941655, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61440521, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20019531, + "step": 14092, + "time_per_iteration": 3.120185613632202 + }, + { + "auxiliary_loss_clip": 0.0109951, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.02101243, + "balance_loss_mlp": 1.03303409, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 1.9515587052463406, + "language_loss": 0.70222908, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72355801, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 14093, + "time_per_iteration": 2.437931537628174 + }, + { + "auxiliary_loss_clip": 0.01095773, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.01558602, + "balance_loss_mlp": 1.0323596, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 2.0380428061341176, + "language_loss": 0.83106399, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.8522861, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 14094, + "time_per_iteration": 2.4864912033081055 + }, + { + "auxiliary_loss_clip": 0.01096593, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.01932764, + "balance_loss_mlp": 1.0341723, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.819291360487763, + "language_loss": 0.71216273, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73343396, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.625, + "step": 14095, + "time_per_iteration": 3.993910074234009 + }, + { + "auxiliary_loss_clip": 0.01097533, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02124119, + "balance_loss_mlp": 1.03403068, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.825062729979651, + "language_loss": 0.81036246, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.83166021, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14096, + "time_per_iteration": 2.4629123210906982 + }, + { + "auxiliary_loss_clip": 0.01102093, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.018502, + "balance_loss_mlp": 1.03426218, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.7173304527210933, + "language_loss": 0.77041292, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79174817, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 14097, + "time_per_iteration": 3.9817750453948975 + }, + { + "auxiliary_loss_clip": 0.01099142, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.01546907, + "balance_loss_mlp": 1.03521109, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.704153380132331, + "language_loss": 0.8026402, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82389653, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14098, + "time_per_iteration": 2.429412603378296 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01887143, + "balance_loss_mlp": 1.03510809, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 1.8292147668757888, + "language_loss": 0.71778166, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73908365, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.64453125, + "step": 14099, + "time_per_iteration": 2.4910991191864014 + }, + { + "auxiliary_loss_clip": 0.01099079, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.01675868, + "balance_loss_mlp": 1.03380799, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 1.9479832989077739, + "language_loss": 0.63951457, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.6608004, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.65234375, + "step": 14100, + "time_per_iteration": 3.935777425765991 + }, + { + "auxiliary_loss_clip": 0.01102751, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.0190171, + "balance_loss_mlp": 1.03465199, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 1.8887642813622785, + "language_loss": 0.73411292, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75544924, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 14101, + "time_per_iteration": 2.4780516624450684 + }, + { + "auxiliary_loss_clip": 0.01099179, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.01664448, + "balance_loss_mlp": 1.03323436, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.5929891026522867, + "language_loss": 0.71019483, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73146498, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14102, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.01104118, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.01804352, + "balance_loss_mlp": 1.03573275, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.358108614100406, + "language_loss": 0.81502283, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83636951, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 14103, + "time_per_iteration": 2.421996831893921 + }, + { + "auxiliary_loss_clip": 0.01100518, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01750135, + "balance_loss_mlp": 1.03581166, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 5.145058817930484, + "language_loss": 0.78646743, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80775893, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 14104, + "time_per_iteration": 2.439422607421875 + }, + { + "auxiliary_loss_clip": 0.01103668, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.0222832, + "balance_loss_mlp": 1.03649271, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 2.160459351440593, + "language_loss": 0.78862703, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.81001097, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14105, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.01601195, + "balance_loss_mlp": 1.03611135, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 2.266475836383515, + "language_loss": 0.50339055, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52473295, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 14106, + "time_per_iteration": 2.4719579219818115 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.03461707, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 3.1041228706875006, + "language_loss": 0.75183088, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77313209, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14107, + "time_per_iteration": 2.4628798961639404 + }, + { + "auxiliary_loss_clip": 0.01099676, + "auxiliary_loss_mlp": 0.01025265, + "balance_loss_clip": 1.01383924, + "balance_loss_mlp": 1.03401649, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5739400840020021, + "language_loss": 0.73535973, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75660914, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14108, + "time_per_iteration": 2.5669243335723877 + }, + { + "auxiliary_loss_clip": 0.01096623, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.01170897, + "balance_loss_mlp": 1.03377414, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.5922504607484824, + "language_loss": 0.72592628, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74713612, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.62890625, + "step": 14109, + "time_per_iteration": 2.5060198307037354 + }, + { + "auxiliary_loss_clip": 0.01098271, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.01955819, + "balance_loss_mlp": 1.03390074, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 3.767796644059149, + "language_loss": 0.73706329, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.75836062, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 14110, + "time_per_iteration": 2.4130804538726807 + }, + { + "auxiliary_loss_clip": 0.01100273, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.01855469, + "balance_loss_mlp": 1.0350523, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.6650304644780962, + "language_loss": 0.76256633, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78386492, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 14111, + "time_per_iteration": 2.5073182582855225 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.0163821, + "balance_loss_mlp": 1.03553009, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 2.146588473142915, + "language_loss": 0.67528129, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69654226, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 14112, + "time_per_iteration": 2.54217267036438 + }, + { + "auxiliary_loss_clip": 0.01099478, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.01754332, + "balance_loss_mlp": 1.03301573, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.4581760668083692, + "language_loss": 0.73855281, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75983667, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14113, + "time_per_iteration": 2.5123813152313232 + }, + { + "auxiliary_loss_clip": 0.01099678, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.02015352, + "balance_loss_mlp": 1.03446484, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 2.114125636315057, + "language_loss": 0.66483456, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68614984, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14114, + "time_per_iteration": 2.4642868041992188 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.01924419, + "balance_loss_mlp": 1.03529406, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.7472449430374317, + "language_loss": 0.78489804, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80623364, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 14115, + "time_per_iteration": 2.4579203128814697 + }, + { + "auxiliary_loss_clip": 0.01101492, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.02001727, + "balance_loss_mlp": 1.03533506, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 1.999613584500072, + "language_loss": 0.7920562, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81338149, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 14116, + "time_per_iteration": 2.429086446762085 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01029409, + "balance_loss_clip": 1.01742232, + "balance_loss_mlp": 1.03365159, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.7431190713062676, + "language_loss": 0.6832031, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70451689, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 14117, + "time_per_iteration": 2.450892448425293 + }, + { + "auxiliary_loss_clip": 0.01101464, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.01512623, + "balance_loss_mlp": 1.03329229, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 2.202306499476065, + "language_loss": 0.64843965, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.66972697, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 14118, + "time_per_iteration": 2.4827311038970947 + }, + { + "auxiliary_loss_clip": 0.01098527, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01851785, + "balance_loss_mlp": 1.03415918, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.5391821500879839, + "language_loss": 0.73291403, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75419414, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14119, + "time_per_iteration": 2.4333455562591553 + }, + { + "auxiliary_loss_clip": 0.01101713, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.01544058, + "balance_loss_mlp": 1.03465796, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.8345474405737905, + "language_loss": 0.7795918, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80088687, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 14120, + "time_per_iteration": 2.438777446746826 + }, + { + "auxiliary_loss_clip": 0.01101394, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.01621592, + "balance_loss_mlp": 1.03509915, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 5.405751180834322, + "language_loss": 0.75322181, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77452111, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 14121, + "time_per_iteration": 2.4752485752105713 + }, + { + "auxiliary_loss_clip": 0.01022863, + "auxiliary_loss_mlp": 0.01000803, + "balance_loss_clip": 0.9997595, + "balance_loss_mlp": 1.00264168, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8037296020367628, + "language_loss": 0.60112953, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.6213662, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 14122, + "time_per_iteration": 3.0530099868774414 + }, + { + "auxiliary_loss_clip": 0.01101962, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01896024, + "balance_loss_mlp": 1.03606272, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 1.8836683151356197, + "language_loss": 0.79854351, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81986666, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 14123, + "time_per_iteration": 2.481076240539551 + }, + { + "auxiliary_loss_clip": 0.01097323, + "auxiliary_loss_mlp": 0.010265, + "balance_loss_clip": 1.01523471, + "balance_loss_mlp": 1.03398323, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 2.239097832743251, + "language_loss": 0.83009315, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85133135, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 14124, + "time_per_iteration": 2.4361042976379395 + }, + { + "auxiliary_loss_clip": 0.01102837, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.01883113, + "balance_loss_mlp": 1.03712273, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.9193916566663176, + "language_loss": 0.7154206, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.7367574, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 14125, + "time_per_iteration": 2.4565751552581787 + }, + { + "auxiliary_loss_clip": 0.01104988, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02196431, + "balance_loss_mlp": 1.03545964, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.5814217786789184, + "language_loss": 0.73540419, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75680137, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 14126, + "time_per_iteration": 2.4653918743133545 + }, + { + "auxiliary_loss_clip": 0.01099143, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.01827955, + "balance_loss_mlp": 1.03460646, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.447182138612943, + "language_loss": 0.67323148, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69452107, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14127, + "time_per_iteration": 2.4421885013580322 + }, + { + "auxiliary_loss_clip": 0.01102144, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.01496696, + "balance_loss_mlp": 1.03487849, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.7486949630521547, + "language_loss": 0.69433224, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71562243, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 14128, + "time_per_iteration": 2.447857618331909 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01758027, + "balance_loss_mlp": 1.03517187, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 4.278782473141161, + "language_loss": 0.77867216, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.79995894, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 14129, + "time_per_iteration": 2.444333076477051 + }, + { + "auxiliary_loss_clip": 0.01101533, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.01731718, + "balance_loss_mlp": 1.03556049, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 2.0594648435116234, + "language_loss": 0.68019104, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70149243, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 14130, + "time_per_iteration": 2.4651596546173096 + }, + { + "auxiliary_loss_clip": 0.01100363, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.01677465, + "balance_loss_mlp": 1.03450692, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 2.677757108055573, + "language_loss": 0.70964313, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73092985, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 14131, + "time_per_iteration": 2.4959685802459717 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.0147171, + "balance_loss_mlp": 1.03148651, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.9992354327212007, + "language_loss": 0.68285507, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70409632, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 14132, + "time_per_iteration": 2.4957363605499268 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02295697, + "balance_loss_mlp": 1.0323298, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.648812223395765, + "language_loss": 0.70260388, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72390628, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 14133, + "time_per_iteration": 2.5129992961883545 + }, + { + "auxiliary_loss_clip": 0.01022484, + "auxiliary_loss_mlp": 0.01003624, + "balance_loss_clip": 1.00258112, + "balance_loss_mlp": 1.00221777, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7241616741405683, + "language_loss": 0.57608092, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59634197, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 14134, + "time_per_iteration": 3.173593282699585 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01674151, + "balance_loss_mlp": 1.03537357, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 1.97452600255058, + "language_loss": 0.78879797, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81011605, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 14135, + "time_per_iteration": 2.465132236480713 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.01925802, + "balance_loss_mlp": 1.03432846, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 1.8051952520241694, + "language_loss": 0.63200223, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65333283, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 14136, + "time_per_iteration": 2.4003732204437256 + }, + { + "auxiliary_loss_clip": 0.01104726, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01959145, + "balance_loss_mlp": 1.03692389, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 1.8110537198368084, + "language_loss": 0.83839071, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.85976005, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 14137, + "time_per_iteration": 3.834845542907715 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.016343, + "balance_loss_mlp": 1.03378332, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.01797350925578, + "language_loss": 0.78820533, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80944908, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 14138, + "time_per_iteration": 5.281948566436768 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.01498008, + "balance_loss_mlp": 1.0361867, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.7959695492088958, + "language_loss": 0.64545155, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66673458, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14139, + "time_per_iteration": 2.4514920711517334 + }, + { + "auxiliary_loss_clip": 0.01098312, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.02062881, + "balance_loss_mlp": 1.03338087, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.4663932183968211, + "language_loss": 0.70549941, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72679853, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 14140, + "time_per_iteration": 2.456413745880127 + }, + { + "auxiliary_loss_clip": 0.01101673, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.0160358, + "balance_loss_mlp": 1.03334641, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 3.118994023074249, + "language_loss": 0.64317191, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66446924, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 14141, + "time_per_iteration": 2.4837236404418945 + }, + { + "auxiliary_loss_clip": 0.01102087, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.01862574, + "balance_loss_mlp": 1.03575301, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.485418549582861, + "language_loss": 0.7077021, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72902429, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 14142, + "time_per_iteration": 4.026219129562378 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.0102499, + "balance_loss_clip": 1.01399326, + "balance_loss_mlp": 1.03333116, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 1.5471529006166378, + "language_loss": 0.65363872, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67488462, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 14143, + "time_per_iteration": 2.522714138031006 + }, + { + "auxiliary_loss_clip": 0.01103022, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.02001226, + "balance_loss_mlp": 1.03437936, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 2.272347066205258, + "language_loss": 0.67796141, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69931078, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 14144, + "time_per_iteration": 2.464179277420044 + }, + { + "auxiliary_loss_clip": 0.01096846, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.01843166, + "balance_loss_mlp": 1.03374457, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.0058736259336913, + "language_loss": 0.65126836, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67253637, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14145, + "time_per_iteration": 2.4138495922088623 + }, + { + "auxiliary_loss_clip": 0.01099661, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.01923287, + "balance_loss_mlp": 1.03460836, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9741854800625371, + "language_loss": 0.85892701, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.88023674, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 14146, + "time_per_iteration": 2.4394941329956055 + }, + { + "auxiliary_loss_clip": 0.01097854, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.01284277, + "balance_loss_mlp": 1.03269684, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 4.7433845551584115, + "language_loss": 0.83587158, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85709453, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14147, + "time_per_iteration": 2.4312021732330322 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.01654828, + "balance_loss_mlp": 1.03492069, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.524544245155263, + "language_loss": 0.85632455, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87764609, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 14148, + "time_per_iteration": 2.408759355545044 + }, + { + "auxiliary_loss_clip": 0.01099695, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.01789546, + "balance_loss_mlp": 1.03512621, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.583969514237289, + "language_loss": 0.72040755, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74170214, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 14149, + "time_per_iteration": 2.4647998809814453 + }, + { + "auxiliary_loss_clip": 0.01101741, + "auxiliary_loss_mlp": 0.01025822, + "balance_loss_clip": 1.01436639, + "balance_loss_mlp": 1.03563142, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.7435264384090372, + "language_loss": 0.76055348, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78182906, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14150, + "time_per_iteration": 2.4704911708831787 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.01903915, + "balance_loss_mlp": 1.03694618, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.6165351455884314, + "language_loss": 0.72317696, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74448776, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 14151, + "time_per_iteration": 2.4117584228515625 + }, + { + "auxiliary_loss_clip": 0.01022519, + "auxiliary_loss_mlp": 0.01005156, + "balance_loss_clip": 1.00400531, + "balance_loss_mlp": 1.00236225, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8888130234597027, + "language_loss": 0.59599686, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61627358, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20117188, + "step": 14152, + "time_per_iteration": 2.8257334232330322 + }, + { + "auxiliary_loss_clip": 0.01022311, + "auxiliary_loss_mlp": 0.01002793, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00222039, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.694800012848997, + "language_loss": 0.61128682, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63153785, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 14153, + "time_per_iteration": 3.0687737464904785 + }, + { + "auxiliary_loss_clip": 0.01101332, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.01596856, + "balance_loss_mlp": 1.03580403, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.6380875980431746, + "language_loss": 0.80774456, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82904065, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 14154, + "time_per_iteration": 2.4598207473754883 + }, + { + "auxiliary_loss_clip": 0.01092813, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03123856, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.6273243802969442, + "language_loss": 0.79549897, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81672347, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 14155, + "time_per_iteration": 2.472735643386841 + }, + { + "auxiliary_loss_clip": 0.01100526, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.01641667, + "balance_loss_mlp": 1.03228521, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.6278681013298135, + "language_loss": 0.70760596, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72889221, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 14156, + "time_per_iteration": 2.4720516204833984 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01863134, + "balance_loss_mlp": 1.03398883, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.945584806630902, + "language_loss": 0.73951316, + "learning_rate": 2.278226512621386e-07, + "loss": 0.76078814, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 14157, + "time_per_iteration": 2.445727586746216 + }, + { + "auxiliary_loss_clip": 0.01096578, + "auxiliary_loss_mlp": 0.01023751, + "balance_loss_clip": 1.01308143, + "balance_loss_mlp": 1.03393173, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 2.049881321854855, + "language_loss": 0.79299182, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.8141951, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 14158, + "time_per_iteration": 2.4651825428009033 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.01841724, + "balance_loss_mlp": 1.03510904, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 2.0779394687943, + "language_loss": 0.7930764, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81438398, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 14159, + "time_per_iteration": 2.430978775024414 + }, + { + "auxiliary_loss_clip": 0.01101719, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.02200091, + "balance_loss_mlp": 1.03555334, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 2.5681034640558433, + "language_loss": 0.71410954, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73546207, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 14160, + "time_per_iteration": 2.4289729595184326 + }, + { + "auxiliary_loss_clip": 0.01106194, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.0195365, + "balance_loss_mlp": 1.03638792, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 1.9043333827354807, + "language_loss": 0.70242059, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72380352, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 14161, + "time_per_iteration": 2.5416407585144043 + }, + { + "auxiliary_loss_clip": 0.01099976, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01886177, + "balance_loss_mlp": 1.03187084, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.9260794515175017, + "language_loss": 0.78138113, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80268127, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 14162, + "time_per_iteration": 2.5057663917541504 + }, + { + "auxiliary_loss_clip": 0.01100522, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.02059305, + "balance_loss_mlp": 1.03527033, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 2.7045213694102292, + "language_loss": 0.76977819, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.79110706, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 14163, + "time_per_iteration": 2.576014280319214 + }, + { + "auxiliary_loss_clip": 0.01022488, + "auxiliary_loss_mlp": 0.01004361, + "balance_loss_clip": 1.00332379, + "balance_loss_mlp": 1.00239372, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6917600034361476, + "language_loss": 0.55013472, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57040328, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20117188, + "step": 14164, + "time_per_iteration": 3.0798745155334473 + }, + { + "auxiliary_loss_clip": 0.0109938, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.02049756, + "balance_loss_mlp": 1.03448367, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.8565116591235626, + "language_loss": 0.72916138, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.75047463, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14165, + "time_per_iteration": 2.4583141803741455 + }, + { + "auxiliary_loss_clip": 0.01096948, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.01683545, + "balance_loss_mlp": 1.03286302, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.53357605773611, + "language_loss": 0.67339641, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69464964, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14166, + "time_per_iteration": 2.4731404781341553 + }, + { + "auxiliary_loss_clip": 0.01098945, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01718283, + "balance_loss_mlp": 1.03358221, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 2.1743229619980284, + "language_loss": 0.73408175, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75535411, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14167, + "time_per_iteration": 2.424105167388916 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.01569331, + "balance_loss_mlp": 1.03490484, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.5567552814604415, + "language_loss": 0.80538321, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82665563, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 14168, + "time_per_iteration": 2.501282215118408 + }, + { + "auxiliary_loss_clip": 0.01098651, + "auxiliary_loss_mlp": 0.01025085, + "balance_loss_clip": 1.01371276, + "balance_loss_mlp": 1.03329349, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 1.8118683841885685, + "language_loss": 0.76072329, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78196067, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14169, + "time_per_iteration": 2.466012716293335 + }, + { + "auxiliary_loss_clip": 0.01103905, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01977158, + "balance_loss_mlp": 1.03580987, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 2.198130292070983, + "language_loss": 0.63613892, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65749532, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 14170, + "time_per_iteration": 2.4390439987182617 + }, + { + "auxiliary_loss_clip": 0.01096568, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.01424527, + "balance_loss_mlp": 1.03146672, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.336302348660875, + "language_loss": 0.86398733, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88520932, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 14171, + "time_per_iteration": 2.40663480758667 + }, + { + "auxiliary_loss_clip": 0.01101227, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02092147, + "balance_loss_mlp": 1.03520513, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.8512649886443278, + "language_loss": 0.5462482, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.56758368, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14172, + "time_per_iteration": 2.4998886585235596 + }, + { + "auxiliary_loss_clip": 0.01095976, + "auxiliary_loss_mlp": 0.01025638, + "balance_loss_clip": 1.01607168, + "balance_loss_mlp": 1.03305733, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.086050566493409, + "language_loss": 0.69540936, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71662551, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 0.09570312, + "router_z_loss_mlp": 0.62890625, + "step": 14173, + "time_per_iteration": 2.4107959270477295 + }, + { + "auxiliary_loss_clip": 0.01100817, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.01621222, + "balance_loss_mlp": 1.0341866, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.2445717873488027, + "language_loss": 0.77038109, + "learning_rate": 2.247634997500205e-07, + "loss": 0.79167712, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14174, + "time_per_iteration": 2.4528019428253174 + }, + { + "auxiliary_loss_clip": 0.0110205, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.01738858, + "balance_loss_mlp": 1.03537321, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.531960767298018, + "language_loss": 0.81722677, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83853537, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 14175, + "time_per_iteration": 2.4613003730773926 + }, + { + "auxiliary_loss_clip": 0.01103945, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.01919007, + "balance_loss_mlp": 1.03601801, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.6143135080090913, + "language_loss": 0.65842164, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67977381, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 14176, + "time_per_iteration": 2.4280595779418945 + }, + { + "auxiliary_loss_clip": 0.01098874, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.01610303, + "balance_loss_mlp": 1.03441358, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.5794340083453389, + "language_loss": 0.78320289, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80447292, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 14177, + "time_per_iteration": 2.4813055992126465 + }, + { + "auxiliary_loss_clip": 0.01100784, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.01731539, + "balance_loss_mlp": 1.03469777, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.6547287833476916, + "language_loss": 0.73443151, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75573605, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66015625, + "step": 14178, + "time_per_iteration": 3.9108073711395264 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.0220679, + "balance_loss_mlp": 1.03550065, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.8377191267924193, + "language_loss": 0.74717975, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76852548, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14179, + "time_per_iteration": 2.4225523471832275 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.01433325, + "balance_loss_mlp": 1.03464127, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.460422297621149, + "language_loss": 0.81496072, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83620566, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14180, + "time_per_iteration": 5.2744059562683105 + }, + { + "auxiliary_loss_clip": 0.01100192, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.02003634, + "balance_loss_mlp": 1.03413081, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.369494147996583, + "language_loss": 0.61639541, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63770676, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 14181, + "time_per_iteration": 2.4872968196868896 + }, + { + "auxiliary_loss_clip": 0.01098397, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01989412, + "balance_loss_mlp": 1.03492475, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.119628818212838, + "language_loss": 0.72303843, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74433005, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 14182, + "time_per_iteration": 2.405911922454834 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.01984537, + "balance_loss_mlp": 1.03547144, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.877933371145743, + "language_loss": 0.70888335, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.73018968, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 14183, + "time_per_iteration": 2.465843439102173 + }, + { + "auxiliary_loss_clip": 0.01099389, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01894093, + "balance_loss_mlp": 1.03608322, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 2.003256962721328, + "language_loss": 0.72409725, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74539095, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14184, + "time_per_iteration": 3.8862593173980713 + }, + { + "auxiliary_loss_clip": 0.01101013, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.020082, + "balance_loss_mlp": 1.03550458, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.6403093384552982, + "language_loss": 0.76668632, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78801394, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14185, + "time_per_iteration": 2.43679141998291 + }, + { + "auxiliary_loss_clip": 0.01100786, + "auxiliary_loss_mlp": 0.01025879, + "balance_loss_clip": 1.01407111, + "balance_loss_mlp": 1.0342108, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 1.7633313244076745, + "language_loss": 0.79761022, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81887686, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14186, + "time_per_iteration": 2.425837755203247 + }, + { + "auxiliary_loss_clip": 0.01098762, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.01503086, + "balance_loss_mlp": 1.03246689, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.6649429350978724, + "language_loss": 0.62752771, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64878899, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 14187, + "time_per_iteration": 2.4106929302215576 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.01542985, + "balance_loss_mlp": 1.03296733, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.9637586597140755, + "language_loss": 0.7628786, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.7841754, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 14188, + "time_per_iteration": 2.51119327545166 + }, + { + "auxiliary_loss_clip": 0.01101265, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.015764, + "balance_loss_mlp": 1.03399968, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.568613881178684, + "language_loss": 0.78370714, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.805004, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 14189, + "time_per_iteration": 2.467890739440918 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.03368163, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 1.8423482276123486, + "language_loss": 0.79671139, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81802857, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 14190, + "time_per_iteration": 2.426774263381958 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.02108765, + "balance_loss_mlp": 1.03475523, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 2.2393724567806537, + "language_loss": 0.76187646, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78320825, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 14191, + "time_per_iteration": 2.493112802505493 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.01559925, + "balance_loss_mlp": 1.03472078, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.9447286262265506, + "language_loss": 0.6872884, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.70855892, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 14192, + "time_per_iteration": 2.460580825805664 + }, + { + "auxiliary_loss_clip": 0.01105512, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.0211103, + "balance_loss_mlp": 1.03487968, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.13238273698423, + "language_loss": 0.62750936, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.64891225, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.70703125, + "step": 14193, + "time_per_iteration": 2.422989845275879 + }, + { + "auxiliary_loss_clip": 0.01099517, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.0180825, + "balance_loss_mlp": 1.03407574, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 2.376253563391597, + "language_loss": 0.7662459, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78753626, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14194, + "time_per_iteration": 2.4911811351776123 + }, + { + "auxiliary_loss_clip": 0.01099442, + "auxiliary_loss_mlp": 0.01027223, + "balance_loss_clip": 1.01629698, + "balance_loss_mlp": 1.03374958, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 2.870797059633662, + "language_loss": 0.69526476, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.7165314, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14195, + "time_per_iteration": 2.43084454536438 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.01892877, + "balance_loss_mlp": 1.03276181, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 1.951192155992327, + "language_loss": 0.8569665, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.87827611, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 14196, + "time_per_iteration": 2.450251340866089 + }, + { + "auxiliary_loss_clip": 0.01022416, + "auxiliary_loss_mlp": 0.01004449, + "balance_loss_clip": 1.00341821, + "balance_loss_mlp": 1.00223291, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7740707034591511, + "language_loss": 0.55144757, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57171625, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20214844, + "step": 14197, + "time_per_iteration": 3.005002975463867 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.02065611, + "balance_loss_mlp": 1.03312826, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.515492209214701, + "language_loss": 0.81483853, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83612299, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14198, + "time_per_iteration": 2.4649815559387207 + }, + { + "auxiliary_loss_clip": 0.01099001, + "auxiliary_loss_mlp": 0.01026718, + "balance_loss_clip": 1.01638854, + "balance_loss_mlp": 1.03425956, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.4080909116344482, + "language_loss": 0.68194431, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70320153, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 14199, + "time_per_iteration": 2.75403094291687 + }, + { + "auxiliary_loss_clip": 0.01094508, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.0175674, + "balance_loss_mlp": 1.03173518, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.6037725404598313, + "language_loss": 0.86364204, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88487208, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 14200, + "time_per_iteration": 2.4806206226348877 + }, + { + "auxiliary_loss_clip": 0.01099065, + "auxiliary_loss_mlp": 0.01026575, + "balance_loss_clip": 1.0155009, + "balance_loss_mlp": 1.03298926, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.7777053516959667, + "language_loss": 0.77743292, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79868931, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 14201, + "time_per_iteration": 2.4462931156158447 + }, + { + "auxiliary_loss_clip": 0.01097721, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.01518703, + "balance_loss_mlp": 1.0337944, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 1.7109350279161786, + "language_loss": 0.68886614, + "learning_rate": 2.19767322694256e-07, + "loss": 0.71010685, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 14202, + "time_per_iteration": 2.463541030883789 + }, + { + "auxiliary_loss_clip": 0.01099825, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.02116919, + "balance_loss_mlp": 1.03426242, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.9535951934436555, + "language_loss": 0.80181468, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82313836, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 14203, + "time_per_iteration": 2.5205531120300293 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.01982856, + "balance_loss_mlp": 1.03427434, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 1.984669795518607, + "language_loss": 0.65570819, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.67704731, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 14204, + "time_per_iteration": 2.4955055713653564 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.02073479, + "balance_loss_mlp": 1.03411329, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.2503266351181885, + "language_loss": 0.59924453, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.62058318, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14205, + "time_per_iteration": 2.451064348220825 + }, + { + "auxiliary_loss_clip": 0.0109883, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.01386976, + "balance_loss_mlp": 1.03404224, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.00271599179622, + "language_loss": 0.72058553, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74183416, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6484375, + "step": 14206, + "time_per_iteration": 2.691939115524292 + }, + { + "auxiliary_loss_clip": 0.01102004, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.01611567, + "balance_loss_mlp": 1.03535128, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 3.093713921060051, + "language_loss": 0.76876032, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.79005724, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 14207, + "time_per_iteration": 2.4235453605651855 + }, + { + "auxiliary_loss_clip": 0.01101035, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.03433371, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.6417850294728733, + "language_loss": 0.85100585, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87231922, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 14208, + "time_per_iteration": 2.4407296180725098 + }, + { + "auxiliary_loss_clip": 0.01101124, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02127552, + "balance_loss_mlp": 1.03569424, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.4610514285871214, + "language_loss": 0.65849692, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.67983097, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 14209, + "time_per_iteration": 2.418771266937256 + }, + { + "auxiliary_loss_clip": 0.01099443, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.01455092, + "balance_loss_mlp": 1.03474426, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 2.120429439865478, + "language_loss": 0.70436859, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72562349, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14210, + "time_per_iteration": 2.474961996078491 + }, + { + "auxiliary_loss_clip": 0.01098268, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.01849508, + "balance_loss_mlp": 1.03316927, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.342038230302634, + "language_loss": 0.70265722, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72394198, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14211, + "time_per_iteration": 2.4852607250213623 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01696706, + "balance_loss_mlp": 1.03383327, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 6.036072710383437, + "language_loss": 0.811239, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83253694, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14212, + "time_per_iteration": 2.428615093231201 + }, + { + "auxiliary_loss_clip": 0.01100834, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02034807, + "balance_loss_mlp": 1.03400826, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 2.153668770335833, + "language_loss": 0.66985464, + "learning_rate": 2.178190108088105e-07, + "loss": 0.69119686, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6640625, + "step": 14213, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.0102624, + "balance_loss_clip": 1.01437306, + "balance_loss_mlp": 1.0335108, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.5860576713559527, + "language_loss": 0.78203142, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80327499, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 14214, + "time_per_iteration": 2.437434196472168 + }, + { + "auxiliary_loss_clip": 0.01103195, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.01534665, + "balance_loss_mlp": 1.0336858, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 2.3601072733051764, + "language_loss": 0.66634488, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68765759, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 14215, + "time_per_iteration": 2.4206948280334473 + }, + { + "auxiliary_loss_clip": 0.01100459, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.01484776, + "balance_loss_mlp": 1.03546536, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.6911842377529425, + "language_loss": 0.62753046, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64879763, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14216, + "time_per_iteration": 2.573880434036255 + }, + { + "auxiliary_loss_clip": 0.01100323, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01847267, + "balance_loss_mlp": 1.0334928, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.9929110810003072, + "language_loss": 0.65539861, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67669904, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 14217, + "time_per_iteration": 2.4209792613983154 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.01740897, + "balance_loss_mlp": 1.03479838, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 2.1082100037131184, + "language_loss": 0.64531755, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.66658163, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14218, + "time_per_iteration": 2.4655544757843018 + }, + { + "auxiliary_loss_clip": 0.0110141, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.01771629, + "balance_loss_mlp": 1.03406596, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.7906180605197195, + "language_loss": 0.6969347, + "learning_rate": 2.167597412688238e-07, + "loss": 0.7182408, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 14219, + "time_per_iteration": 2.4618418216705322 + }, + { + "auxiliary_loss_clip": 0.01101273, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.02023816, + "balance_loss_mlp": 1.03277564, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.2477262309948722, + "language_loss": 0.67420268, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69553864, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 14220, + "time_per_iteration": 3.827411651611328 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.01792645, + "balance_loss_mlp": 1.034024, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 1.977915778436477, + "language_loss": 0.71490705, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73616409, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 14221, + "time_per_iteration": 2.4499704837799072 + }, + { + "auxiliary_loss_clip": 0.01100961, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.02373493, + "balance_loss_mlp": 1.0342536, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.8036420515199072, + "language_loss": 0.59936148, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62073076, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 14222, + "time_per_iteration": 3.9379451274871826 + }, + { + "auxiliary_loss_clip": 0.01098725, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.01986158, + "balance_loss_mlp": 1.03435004, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.8788661721369253, + "language_loss": 0.8384949, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.85979849, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 14223, + "time_per_iteration": 2.4997923374176025 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.0179739, + "balance_loss_mlp": 1.03510475, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.9589548379338593, + "language_loss": 0.74226081, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76354808, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14224, + "time_per_iteration": 2.4575846195220947 + }, + { + "auxiliary_loss_clip": 0.01099212, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.01395071, + "balance_loss_mlp": 1.03435862, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 2.0140938675703404, + "language_loss": 0.75260413, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77385372, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 14225, + "time_per_iteration": 3.9161388874053955 + }, + { + "auxiliary_loss_clip": 0.01098731, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.02398777, + "balance_loss_mlp": 1.03516436, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.522872889346961, + "language_loss": 0.76993561, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79127258, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14226, + "time_per_iteration": 2.506988286972046 + }, + { + "auxiliary_loss_clip": 0.01102328, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.0207684, + "balance_loss_mlp": 1.03414547, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 2.177222664744404, + "language_loss": 0.54483128, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56618559, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 14227, + "time_per_iteration": 2.4459900856018066 + }, + { + "auxiliary_loss_clip": 0.01099961, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.01631856, + "balance_loss_mlp": 1.03569543, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 2.2247440152306632, + "language_loss": 0.6510337, + "learning_rate": 2.151754018031442e-07, + "loss": 0.6723085, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14228, + "time_per_iteration": 2.4324324131011963 + }, + { + "auxiliary_loss_clip": 0.01101168, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01824021, + "balance_loss_mlp": 1.03467417, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 7.206926402956923, + "language_loss": 0.74007577, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76138902, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 14229, + "time_per_iteration": 2.4285945892333984 + }, + { + "auxiliary_loss_clip": 0.01097864, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.01422048, + "balance_loss_mlp": 1.0336206, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 1.6659159348805417, + "language_loss": 0.72572798, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74695945, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14230, + "time_per_iteration": 2.4370429515838623 + }, + { + "auxiliary_loss_clip": 0.01099354, + "auxiliary_loss_mlp": 0.01025787, + "balance_loss_clip": 1.01428294, + "balance_loss_mlp": 1.03463602, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 1.858724204103366, + "language_loss": 0.82625288, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84750426, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14231, + "time_per_iteration": 2.416818618774414 + }, + { + "auxiliary_loss_clip": 0.01102308, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.02078056, + "balance_loss_mlp": 1.03642631, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.8022250670244886, + "language_loss": 0.67731422, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.69867074, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 14232, + "time_per_iteration": 2.4504547119140625 + }, + { + "auxiliary_loss_clip": 0.01102277, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.0181942, + "balance_loss_mlp": 1.03550363, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.4699321187791279, + "language_loss": 0.66779065, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.68911678, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 14233, + "time_per_iteration": 2.478637933731079 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01955795, + "balance_loss_mlp": 1.032691, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.6274699918849787, + "language_loss": 0.76340926, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78469753, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 14234, + "time_per_iteration": 2.421801805496216 + }, + { + "auxiliary_loss_clip": 0.01022342, + "auxiliary_loss_mlp": 0.01001271, + "balance_loss_clip": 1.00026405, + "balance_loss_mlp": 1.00228214, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7533011724258024, + "language_loss": 0.58039862, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60063475, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 14235, + "time_per_iteration": 3.016435146331787 + }, + { + "auxiliary_loss_clip": 0.01022713, + "auxiliary_loss_mlp": 0.0100235, + "balance_loss_clip": 1.00124168, + "balance_loss_mlp": 1.00244868, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7884525554635639, + "language_loss": 0.56691235, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58716297, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.203125, + "step": 14236, + "time_per_iteration": 2.9483742713928223 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02263188, + "balance_loss_mlp": 1.03376889, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.880299009711037, + "language_loss": 0.70168215, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72303009, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 14237, + "time_per_iteration": 2.4361391067504883 + }, + { + "auxiliary_loss_clip": 0.01097899, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.01747763, + "balance_loss_mlp": 1.03274059, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.07166971946217, + "language_loss": 0.63688266, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65815091, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14238, + "time_per_iteration": 2.4701902866363525 + }, + { + "auxiliary_loss_clip": 0.01095366, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.01998234, + "balance_loss_mlp": 1.03308296, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.599452795131822, + "language_loss": 0.69295937, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71421313, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.625, + "step": 14239, + "time_per_iteration": 2.4972500801086426 + }, + { + "auxiliary_loss_clip": 0.01102946, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.01977563, + "balance_loss_mlp": 1.0346055, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 1.9185715345631495, + "language_loss": 0.66292799, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68427372, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 14240, + "time_per_iteration": 2.552086114883423 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01031078, + "balance_loss_clip": 1.01841772, + "balance_loss_mlp": 1.03389883, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.5899388107662171, + "language_loss": 0.62232125, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64364094, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14241, + "time_per_iteration": 2.50657057762146 + }, + { + "auxiliary_loss_clip": 0.01104391, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02027607, + "balance_loss_mlp": 1.03487253, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.708079074256712, + "language_loss": 0.74306595, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76444036, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 14242, + "time_per_iteration": 2.566230297088623 + }, + { + "auxiliary_loss_clip": 0.011037, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.02588248, + "balance_loss_mlp": 1.0358814, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 1.936352707521622, + "language_loss": 0.7619487, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78335792, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 14243, + "time_per_iteration": 2.4888031482696533 + }, + { + "auxiliary_loss_clip": 0.01100916, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.02029276, + "balance_loss_mlp": 1.03415143, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.9431842479847303, + "language_loss": 0.68101519, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70234704, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14244, + "time_per_iteration": 2.508938789367676 + }, + { + "auxiliary_loss_clip": 0.01022635, + "auxiliary_loss_mlp": 0.01003162, + "balance_loss_clip": 1.00209546, + "balance_loss_mlp": 1.00242233, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7550815823287989, + "language_loss": 0.584894, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60515195, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.203125, + "step": 14245, + "time_per_iteration": 2.9439289569854736 + }, + { + "auxiliary_loss_clip": 0.01104025, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.01670074, + "balance_loss_mlp": 1.03502369, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.6717513194149345, + "language_loss": 0.77544534, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79677689, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 14246, + "time_per_iteration": 2.499782085418701 + }, + { + "auxiliary_loss_clip": 0.01097341, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.01326489, + "balance_loss_mlp": 1.03187299, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 1.8912838995768235, + "language_loss": 0.81099033, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83221304, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14247, + "time_per_iteration": 2.416072130203247 + }, + { + "auxiliary_loss_clip": 0.01100135, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.01893449, + "balance_loss_mlp": 1.03441513, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 2.3574465797588506, + "language_loss": 0.77318221, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79449296, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14248, + "time_per_iteration": 2.427107334136963 + }, + { + "auxiliary_loss_clip": 0.01099982, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.01661038, + "balance_loss_mlp": 1.03290796, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.848881659370633, + "language_loss": 0.77508557, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.79637265, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14249, + "time_per_iteration": 2.4711170196533203 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.02206969, + "balance_loss_mlp": 1.03354955, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 2.07712303854438, + "language_loss": 0.78380144, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80510795, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 14250, + "time_per_iteration": 2.4842865467071533 + }, + { + "auxiliary_loss_clip": 0.01096261, + "auxiliary_loss_mlp": 0.01029426, + "balance_loss_clip": 1.01872694, + "balance_loss_mlp": 1.03293228, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.6921215208893117, + "language_loss": 0.79659212, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81784904, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 14251, + "time_per_iteration": 2.4252700805664062 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.0159874, + "balance_loss_mlp": 1.03339255, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.9109310863518794, + "language_loss": 0.61741138, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.6386627, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 14252, + "time_per_iteration": 2.4443397521972656 + }, + { + "auxiliary_loss_clip": 0.01104246, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.01910281, + "balance_loss_mlp": 1.0367614, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.805516150747159, + "language_loss": 0.69350702, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71486604, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 14253, + "time_per_iteration": 2.4410698413848877 + }, + { + "auxiliary_loss_clip": 0.01022365, + "auxiliary_loss_mlp": 0.01004805, + "balance_loss_clip": 1.00375557, + "balance_loss_mlp": 1.00232601, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.8052451552240734, + "language_loss": 0.59255153, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61282325, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20117188, + "step": 14254, + "time_per_iteration": 3.0850088596343994 + }, + { + "auxiliary_loss_clip": 0.01097856, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.02067935, + "balance_loss_mlp": 1.03278446, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.6979396756616612, + "language_loss": 0.80917549, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83048582, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65234375, + "step": 14255, + "time_per_iteration": 2.4804954528808594 + }, + { + "auxiliary_loss_clip": 0.01098269, + "auxiliary_loss_mlp": 0.01025981, + "balance_loss_clip": 1.01472759, + "balance_loss_mlp": 1.03411806, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.1938924508731823, + "language_loss": 0.67312753, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69437003, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14256, + "time_per_iteration": 2.4801876544952393 + }, + { + "auxiliary_loss_clip": 0.01101957, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.02076387, + "balance_loss_mlp": 1.03524375, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.7102679529995346, + "language_loss": 0.69775069, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.71908844, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 14257, + "time_per_iteration": 2.431058406829834 + }, + { + "auxiliary_loss_clip": 0.01098652, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.01653636, + "balance_loss_mlp": 1.03418648, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 2.0065019145348204, + "language_loss": 0.77076191, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79203385, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 14258, + "time_per_iteration": 2.534079074859619 + }, + { + "auxiliary_loss_clip": 0.01098475, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.01732588, + "balance_loss_mlp": 1.03451884, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.5055834428121542, + "language_loss": 0.67819071, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69946301, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14259, + "time_per_iteration": 2.485220432281494 + }, + { + "auxiliary_loss_clip": 0.01098298, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.01689315, + "balance_loss_mlp": 1.03273714, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.6998154571909854, + "language_loss": 0.77415329, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79541999, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 14260, + "time_per_iteration": 2.5281169414520264 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.01651108, + "balance_loss_mlp": 1.03417957, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 2.4235985456915867, + "language_loss": 0.74327439, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76457351, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 14261, + "time_per_iteration": 2.4572179317474365 + }, + { + "auxiliary_loss_clip": 0.01102872, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01750875, + "balance_loss_mlp": 1.03488421, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.7840842625945281, + "language_loss": 0.7859261, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.80725712, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14262, + "time_per_iteration": 3.904160499572754 + }, + { + "auxiliary_loss_clip": 0.01099834, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.02024531, + "balance_loss_mlp": 1.03637409, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.756622359750573, + "language_loss": 0.67971861, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70102894, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14263, + "time_per_iteration": 5.447778224945068 + }, + { + "auxiliary_loss_clip": 0.01099256, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01801479, + "balance_loss_mlp": 1.03373396, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.4777257983100802, + "language_loss": 0.79465747, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81594616, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14264, + "time_per_iteration": 2.500290870666504 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.02100539, + "balance_loss_mlp": 1.03500402, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.2906730752175566, + "language_loss": 0.69431353, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71563625, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 14265, + "time_per_iteration": 2.5555310249328613 + }, + { + "auxiliary_loss_clip": 0.0109498, + "auxiliary_loss_mlp": 0.01024602, + "balance_loss_clip": 1.01414716, + "balance_loss_mlp": 1.03292894, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 2.230270985718821, + "language_loss": 0.66134441, + "learning_rate": 2.085464646918027e-07, + "loss": 0.6825403, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62109375, + "step": 14266, + "time_per_iteration": 2.497089147567749 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01898146, + "balance_loss_mlp": 1.03485322, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 2.2520575670699468, + "language_loss": 0.75218296, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77347636, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14267, + "time_per_iteration": 4.048743963241577 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.01561344, + "balance_loss_mlp": 1.0337677, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.7118804169147943, + "language_loss": 0.87602067, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89726847, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14268, + "time_per_iteration": 2.439980983734131 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.02029121, + "balance_loss_mlp": 1.0352304, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.6901457258620303, + "language_loss": 0.73087263, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.7522046, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14269, + "time_per_iteration": 2.4319350719451904 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.01835752, + "balance_loss_mlp": 1.03426695, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.4429206578235492, + "language_loss": 0.66260904, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68391621, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 14270, + "time_per_iteration": 2.5631306171417236 + }, + { + "auxiliary_loss_clip": 0.01097057, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.01599455, + "balance_loss_mlp": 1.03327835, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.5248509444255016, + "language_loss": 0.73964077, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76088321, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 14271, + "time_per_iteration": 2.4392032623291016 + }, + { + "auxiliary_loss_clip": 0.01022391, + "auxiliary_loss_mlp": 0.01000539, + "balance_loss_clip": 0.99947166, + "balance_loss_mlp": 1.00218976, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.8025264948079197, + "language_loss": 0.59533787, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61556721, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20214844, + "step": 14272, + "time_per_iteration": 3.0923521518707275 + }, + { + "auxiliary_loss_clip": 0.01103984, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.01855707, + "balance_loss_mlp": 1.03503764, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.8233636053410176, + "language_loss": 0.7532993, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77464467, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 14273, + "time_per_iteration": 2.4198429584503174 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01861334, + "balance_loss_mlp": 1.03461802, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.8234845594487459, + "language_loss": 0.82452077, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84582376, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14274, + "time_per_iteration": 2.423983573913574 + }, + { + "auxiliary_loss_clip": 0.01022729, + "auxiliary_loss_mlp": 0.01001298, + "balance_loss_clip": 1.00023675, + "balance_loss_mlp": 1.00258183, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.800704100166407, + "language_loss": 0.60889721, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62913746, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14275, + "time_per_iteration": 3.0931267738342285 + }, + { + "auxiliary_loss_clip": 0.01100126, + "auxiliary_loss_mlp": 0.01023241, + "balance_loss_clip": 1.01086688, + "balance_loss_mlp": 1.03334188, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.027592784857776, + "language_loss": 0.5901401, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61137378, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 14276, + "time_per_iteration": 2.4789164066314697 + }, + { + "auxiliary_loss_clip": 0.0109966, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.01880693, + "balance_loss_mlp": 1.03338301, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.9301473140646082, + "language_loss": 0.76305163, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78435278, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14277, + "time_per_iteration": 2.4284141063690186 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.01927865, + "balance_loss_mlp": 1.03406823, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.7203854489642774, + "language_loss": 0.83213818, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.8534463, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 14278, + "time_per_iteration": 2.4328911304473877 + }, + { + "auxiliary_loss_clip": 0.01103618, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.01918292, + "balance_loss_mlp": 1.03535473, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 1.904094363683198, + "language_loss": 0.74556804, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76692164, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 14279, + "time_per_iteration": 2.4168436527252197 + }, + { + "auxiliary_loss_clip": 0.01098473, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.02028763, + "balance_loss_mlp": 1.03394675, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.550357980437511, + "language_loss": 0.66932499, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.69062865, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 14280, + "time_per_iteration": 2.4760711193084717 + }, + { + "auxiliary_loss_clip": 0.01097121, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.0157609, + "balance_loss_mlp": 1.03308606, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.980183019426352, + "language_loss": 0.62603807, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64727819, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 14281, + "time_per_iteration": 2.4344727993011475 + }, + { + "auxiliary_loss_clip": 0.01097793, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01754475, + "balance_loss_mlp": 1.03264332, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.8443011587610691, + "language_loss": 0.73088598, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75215626, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14282, + "time_per_iteration": 2.43040132522583 + }, + { + "auxiliary_loss_clip": 0.01096079, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.01301026, + "balance_loss_mlp": 1.03145468, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.8514176123230044, + "language_loss": 0.75841701, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77961594, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 14283, + "time_per_iteration": 2.4622461795806885 + }, + { + "auxiliary_loss_clip": 0.01099212, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01597571, + "balance_loss_mlp": 1.03318167, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.9075857879793239, + "language_loss": 0.59797478, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.61924422, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 14284, + "time_per_iteration": 2.572291135787964 + }, + { + "auxiliary_loss_clip": 0.01098357, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.01714206, + "balance_loss_mlp": 1.03492391, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.7674555640674585, + "language_loss": 0.75632811, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77759862, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6328125, + "step": 14285, + "time_per_iteration": 2.5170655250549316 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.02010632, + "balance_loss_mlp": 1.03761566, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 2.139576760488393, + "language_loss": 0.74618649, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76754409, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 14286, + "time_per_iteration": 2.470148801803589 + }, + { + "auxiliary_loss_clip": 0.01022355, + "auxiliary_loss_mlp": 0.01002252, + "balance_loss_clip": 1.00119138, + "balance_loss_mlp": 1.00219035, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7838160557380515, + "language_loss": 0.49406371, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51430982, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14287, + "time_per_iteration": 3.0114223957061768 + }, + { + "auxiliary_loss_clip": 0.01101532, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.01747108, + "balance_loss_mlp": 1.03548408, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.771130529708156, + "language_loss": 0.79141223, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81271744, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 14288, + "time_per_iteration": 2.515892505645752 + }, + { + "auxiliary_loss_clip": 0.01101196, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01805007, + "balance_loss_mlp": 1.03466153, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 1.977299160967245, + "language_loss": 0.80659628, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82791466, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14289, + "time_per_iteration": 2.479477643966675 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01024939, + "balance_loss_clip": 1.01370382, + "balance_loss_mlp": 1.03834021, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.963202375321621, + "language_loss": 0.65073603, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.6720252, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14290, + "time_per_iteration": 2.423489809036255 + }, + { + "auxiliary_loss_clip": 0.0110313, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.01552534, + "balance_loss_mlp": 1.03614926, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.8450218839619243, + "language_loss": 0.55117351, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57248104, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14291, + "time_per_iteration": 2.5542116165161133 + }, + { + "auxiliary_loss_clip": 0.011028, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.01830578, + "balance_loss_mlp": 1.03624988, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 1.854464069187037, + "language_loss": 0.70960593, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.7309373, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 14292, + "time_per_iteration": 2.443986177444458 + }, + { + "auxiliary_loss_clip": 0.0109989, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.01981997, + "balance_loss_mlp": 1.0340476, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.5234570799491314, + "language_loss": 0.71305615, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73437369, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 14293, + "time_per_iteration": 2.4947001934051514 + }, + { + "auxiliary_loss_clip": 0.01097643, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.01639175, + "balance_loss_mlp": 1.03337383, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.4740238785457052, + "language_loss": 0.68373334, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70498788, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14294, + "time_per_iteration": 2.457850217819214 + }, + { + "auxiliary_loss_clip": 0.01096656, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.01627481, + "balance_loss_mlp": 1.03254747, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 2.085676354981098, + "language_loss": 0.77626079, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.79750097, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 14295, + "time_per_iteration": 2.495396614074707 + }, + { + "auxiliary_loss_clip": 0.01104942, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02426088, + "balance_loss_mlp": 1.03482556, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 3.9782673928281707, + "language_loss": 0.68883216, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71025753, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 14296, + "time_per_iteration": 2.450352907180786 + }, + { + "auxiliary_loss_clip": 0.01100285, + "auxiliary_loss_mlp": 0.01027668, + "balance_loss_clip": 1.0159142, + "balance_loss_mlp": 1.03523743, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 3.015762114877603, + "language_loss": 0.79009968, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81137919, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 14297, + "time_per_iteration": 2.496976375579834 + }, + { + "auxiliary_loss_clip": 0.01095598, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.01620579, + "balance_loss_mlp": 1.03275037, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.5403409682157543, + "language_loss": 0.67909223, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70031512, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.62890625, + "step": 14298, + "time_per_iteration": 2.538550615310669 + }, + { + "auxiliary_loss_clip": 0.01098875, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.02002037, + "balance_loss_mlp": 1.03526485, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 1.9417237901120834, + "language_loss": 0.68884093, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.71014321, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 14299, + "time_per_iteration": 2.474458932876587 + }, + { + "auxiliary_loss_clip": 0.01104456, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.02301526, + "balance_loss_mlp": 1.03598309, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.8413419910831603, + "language_loss": 0.714288, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73568177, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 14300, + "time_per_iteration": 2.557711362838745 + }, + { + "auxiliary_loss_clip": 0.01098234, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.01701558, + "balance_loss_mlp": 1.03311145, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.9517620753263947, + "language_loss": 0.68880975, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71007574, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14301, + "time_per_iteration": 2.508291482925415 + }, + { + "auxiliary_loss_clip": 0.0110057, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.01737559, + "balance_loss_mlp": 1.03456163, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 1.7412735046511287, + "language_loss": 0.74149466, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76278991, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 14302, + "time_per_iteration": 2.4727671146392822 + }, + { + "auxiliary_loss_clip": 0.01094464, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01959372, + "balance_loss_mlp": 1.03196108, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.936490583350926, + "language_loss": 0.83610648, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85735059, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 14303, + "time_per_iteration": 3.903238534927368 + }, + { + "auxiliary_loss_clip": 0.01100544, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.02172899, + "balance_loss_mlp": 1.0343709, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.4591826021535392, + "language_loss": 0.77123845, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79258698, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66015625, + "step": 14304, + "time_per_iteration": 2.416091203689575 + }, + { + "auxiliary_loss_clip": 0.0110075, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.02202225, + "balance_loss_mlp": 1.03498769, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 2.157857805274316, + "language_loss": 0.53831017, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.5596599, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14305, + "time_per_iteration": 3.9741735458374023 + }, + { + "auxiliary_loss_clip": 0.01100123, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01480484, + "balance_loss_mlp": 1.0354023, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 1.7690199733302432, + "language_loss": 0.83873999, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86001092, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6484375, + "step": 14306, + "time_per_iteration": 2.4614713191986084 + }, + { + "auxiliary_loss_clip": 0.01095821, + "auxiliary_loss_mlp": 0.01024556, + "balance_loss_clip": 1.01425052, + "balance_loss_mlp": 1.03247511, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.3609792949742232, + "language_loss": 0.71544546, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73664916, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6328125, + "step": 14307, + "time_per_iteration": 2.544206380844116 + }, + { + "auxiliary_loss_clip": 0.01101911, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.02579904, + "balance_loss_mlp": 1.03417909, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 2.1884393758848018, + "language_loss": 0.63564229, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65704566, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 14308, + "time_per_iteration": 2.4838173389434814 + }, + { + "auxiliary_loss_clip": 0.01022727, + "auxiliary_loss_mlp": 0.01000151, + "balance_loss_clip": 0.99907821, + "balance_loss_mlp": 1.00265324, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6241531755089905, + "language_loss": 0.48517621, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50540501, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20117188, + "step": 14309, + "time_per_iteration": 4.55596661567688 + }, + { + "auxiliary_loss_clip": 0.01101397, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.01948833, + "balance_loss_mlp": 1.03436673, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.614329583296113, + "language_loss": 0.67071158, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69204712, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 14310, + "time_per_iteration": 2.468801498413086 + }, + { + "auxiliary_loss_clip": 0.01101001, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01839685, + "balance_loss_mlp": 1.03405249, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.9370711593736145, + "language_loss": 0.77883255, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80013907, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 14311, + "time_per_iteration": 2.4640908241271973 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.01667011, + "balance_loss_mlp": 1.03325152, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.1891429021237627, + "language_loss": 0.71380526, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73506045, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14312, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01096908, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.01734078, + "balance_loss_mlp": 1.03194928, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 3.7836745296831364, + "language_loss": 0.77748859, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.79874492, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14313, + "time_per_iteration": 2.419283866882324 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.01744211, + "balance_loss_mlp": 1.0360148, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.4787616110035209, + "language_loss": 0.72963393, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75093555, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.640625, + "step": 14314, + "time_per_iteration": 2.5891568660736084 + }, + { + "auxiliary_loss_clip": 0.01097226, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.01601493, + "balance_loss_mlp": 1.03222215, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.725480228897019, + "language_loss": 0.69030631, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71155983, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 14315, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01099964, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.02407146, + "balance_loss_mlp": 1.03532481, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.720640234403056, + "language_loss": 0.72141165, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74276459, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 14316, + "time_per_iteration": 2.5051429271698 + }, + { + "auxiliary_loss_clip": 0.01103739, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.02011001, + "balance_loss_mlp": 1.03745365, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 5.919015136820617, + "language_loss": 0.82782209, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84917772, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 14317, + "time_per_iteration": 2.430666923522949 + }, + { + "auxiliary_loss_clip": 0.01099259, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.0189358, + "balance_loss_mlp": 1.03569496, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.677931985843384, + "language_loss": 0.67345351, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69474971, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 14318, + "time_per_iteration": 2.736192226409912 + }, + { + "auxiliary_loss_clip": 0.01096419, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.01545799, + "balance_loss_mlp": 1.03341532, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.8345368571584644, + "language_loss": 0.71489882, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73612595, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 14319, + "time_per_iteration": 2.6309432983398438 + }, + { + "auxiliary_loss_clip": 0.01106052, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.01912153, + "balance_loss_mlp": 1.03829098, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 3.6611413698943016, + "language_loss": 0.67307162, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69444174, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 14320, + "time_per_iteration": 2.4838664531707764 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01036541, + "balance_loss_clip": 1.02404201, + "balance_loss_mlp": 1.03398824, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 3.6200178923733457, + "language_loss": 0.80436695, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82575703, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 14321, + "time_per_iteration": 2.4537878036499023 + }, + { + "auxiliary_loss_clip": 0.01097469, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01696324, + "balance_loss_mlp": 1.03430629, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 2.244134105201333, + "language_loss": 0.71181291, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.73308206, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6328125, + "step": 14322, + "time_per_iteration": 2.4386720657348633 + }, + { + "auxiliary_loss_clip": 0.01105065, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.03611112, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 1.9495478111154043, + "language_loss": 0.55936325, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58077037, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 14323, + "time_per_iteration": 2.459597587585449 + }, + { + "auxiliary_loss_clip": 0.01098786, + "auxiliary_loss_mlp": 0.01028404, + "balance_loss_clip": 1.016716, + "balance_loss_mlp": 1.03410077, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.5772528129595897, + "language_loss": 0.75499862, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77627051, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14324, + "time_per_iteration": 2.4845757484436035 + }, + { + "auxiliary_loss_clip": 0.01101451, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.02004886, + "balance_loss_mlp": 1.03591871, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.344168191200211, + "language_loss": 0.66626883, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68761015, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 14325, + "time_per_iteration": 2.426171064376831 + }, + { + "auxiliary_loss_clip": 0.01101988, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01929832, + "balance_loss_mlp": 1.03482699, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.5550716058490144, + "language_loss": 0.64468634, + "learning_rate": 1.982795820716472e-07, + "loss": 0.6660192, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 14326, + "time_per_iteration": 2.4898109436035156 + }, + { + "auxiliary_loss_clip": 0.01099753, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01547134, + "balance_loss_mlp": 1.03399992, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 1.9452722163445866, + "language_loss": 0.83793277, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.85920697, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14327, + "time_per_iteration": 2.468287229537964 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01027725, + "balance_loss_clip": 1.01605403, + "balance_loss_mlp": 1.03375804, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.187677830204378, + "language_loss": 0.74751425, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.76878637, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14328, + "time_per_iteration": 2.4822041988372803 + }, + { + "auxiliary_loss_clip": 0.01098484, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.01648343, + "balance_loss_mlp": 1.03398705, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 2.543685258961499, + "language_loss": 0.80284798, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.8241111, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 14329, + "time_per_iteration": 2.494361400604248 + }, + { + "auxiliary_loss_clip": 0.01098414, + "auxiliary_loss_mlp": 0.01026208, + "balance_loss_clip": 1.0146507, + "balance_loss_mlp": 1.03334963, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.041563581055635, + "language_loss": 0.7741102, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79535639, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 14330, + "time_per_iteration": 2.4986650943756104 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.01725626, + "balance_loss_mlp": 1.03433192, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 1.8440018429661427, + "language_loss": 0.64671254, + "learning_rate": 1.974350915342702e-07, + "loss": 0.66800165, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 14331, + "time_per_iteration": 2.4618399143218994 + }, + { + "auxiliary_loss_clip": 0.01097847, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.01903462, + "balance_loss_mlp": 1.03375757, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.617939309613219, + "language_loss": 0.7562784, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.77755475, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 14332, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01102042, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0153228, + "balance_loss_mlp": 1.03375912, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 2.5076238331768623, + "language_loss": 0.67116582, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69246262, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 14333, + "time_per_iteration": 2.4831488132476807 + }, + { + "auxiliary_loss_clip": 0.01105944, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.0235616, + "balance_loss_mlp": 1.03613853, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.640443227867146, + "language_loss": 0.62265992, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64408994, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 14334, + "time_per_iteration": 2.6413328647613525 + }, + { + "auxiliary_loss_clip": 0.01104093, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.02375257, + "balance_loss_mlp": 1.03698754, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 2.0607406603383387, + "language_loss": 0.69330931, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71470803, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 14335, + "time_per_iteration": 2.488579273223877 + }, + { + "auxiliary_loss_clip": 0.01102454, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.01848936, + "balance_loss_mlp": 1.03566599, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.4915762191068862, + "language_loss": 0.82732737, + "learning_rate": 1.965923098328135e-07, + "loss": 0.84865171, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 14336, + "time_per_iteration": 2.465843677520752 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0215981, + "balance_loss_mlp": 1.03505278, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.9872260251064142, + "language_loss": 0.67640537, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69777656, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 14337, + "time_per_iteration": 2.4723286628723145 + }, + { + "auxiliary_loss_clip": 0.01098134, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.01767564, + "balance_loss_mlp": 1.03330922, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.6812153581439713, + "language_loss": 0.66831827, + "learning_rate": 1.962556758053089e-07, + "loss": 0.68959028, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14338, + "time_per_iteration": 2.7375404834747314 + }, + { + "auxiliary_loss_clip": 0.01101827, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01874149, + "balance_loss_mlp": 1.03572762, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 1.9113561041240386, + "language_loss": 0.61766338, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.63898253, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 14339, + "time_per_iteration": 2.4824411869049072 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.02059722, + "balance_loss_mlp": 1.0335691, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 2.224022175139818, + "language_loss": 0.62476075, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64607006, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 14340, + "time_per_iteration": 2.4229180812835693 + }, + { + "auxiliary_loss_clip": 0.01094892, + "auxiliary_loss_mlp": 0.01022284, + "balance_loss_clip": 1.01154339, + "balance_loss_mlp": 1.034199, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 6.918273965774928, + "language_loss": 0.80039394, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82156569, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.60546875, + "step": 14341, + "time_per_iteration": 2.458031177520752 + }, + { + "auxiliary_loss_clip": 0.01097965, + "auxiliary_loss_mlp": 0.01029866, + "balance_loss_clip": 1.01902366, + "balance_loss_mlp": 1.03404796, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 1.6209236423079696, + "language_loss": 0.74565721, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76693547, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 14342, + "time_per_iteration": 2.4915459156036377 + }, + { + "auxiliary_loss_clip": 0.01101781, + "auxiliary_loss_mlp": 0.01026411, + "balance_loss_clip": 1.01432872, + "balance_loss_mlp": 1.03483212, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.9153066937981833, + "language_loss": 0.68352622, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70480812, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 14343, + "time_per_iteration": 2.4379770755767822 + }, + { + "auxiliary_loss_clip": 0.01099898, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.0267911, + "balance_loss_mlp": 1.03367257, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 2.013946395887745, + "language_loss": 0.67857057, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.69996256, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 14344, + "time_per_iteration": 2.4314322471618652 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.02183521, + "balance_loss_mlp": 1.03399456, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.3859624698188922, + "language_loss": 0.81348252, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83482969, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 14345, + "time_per_iteration": 4.005348205566406 + }, + { + "auxiliary_loss_clip": 0.01102538, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.01834607, + "balance_loss_mlp": 1.03569663, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.749407337350185, + "language_loss": 0.50631642, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52764773, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 14346, + "time_per_iteration": 2.592672824859619 + }, + { + "auxiliary_loss_clip": 0.01098677, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.01834738, + "balance_loss_mlp": 1.0334121, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.492868040910136, + "language_loss": 0.75041229, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77169997, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14347, + "time_per_iteration": 3.972740888595581 + }, + { + "auxiliary_loss_clip": 0.01101781, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.01587152, + "balance_loss_mlp": 1.03619015, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 1.9072390574317508, + "language_loss": 0.80890203, + "learning_rate": 1.945766105774449e-07, + "loss": 0.83019972, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 14348, + "time_per_iteration": 2.496711015701294 + }, + { + "auxiliary_loss_clip": 0.01095235, + "auxiliary_loss_mlp": 0.01026238, + "balance_loss_clip": 1.01494908, + "balance_loss_mlp": 1.03236437, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.6607225101041632, + "language_loss": 0.66021013, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68142486, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 14349, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.02595711, + "balance_loss_mlp": 1.03370428, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.2216960687680865, + "language_loss": 0.7004388, + "learning_rate": 1.942416188703573e-07, + "loss": 0.72180569, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 14350, + "time_per_iteration": 2.4430317878723145 + }, + { + "auxiliary_loss_clip": 0.01098606, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.01812983, + "balance_loss_mlp": 1.033795, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.7403260075665494, + "language_loss": 0.77165568, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79293925, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 14351, + "time_per_iteration": 3.8987040519714355 + }, + { + "auxiliary_loss_clip": 0.01099378, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.02070606, + "balance_loss_mlp": 1.0340271, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.7490870392415556, + "language_loss": 0.8492626, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87057269, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14352, + "time_per_iteration": 2.4467623233795166 + }, + { + "auxiliary_loss_clip": 0.01022274, + "auxiliary_loss_mlp": 0.00997547, + "balance_loss_clip": 0.99648613, + "balance_loss_mlp": 1.00222087, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.8026714354601334, + "language_loss": 0.61920941, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63940763, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14353, + "time_per_iteration": 3.0518198013305664 + }, + { + "auxiliary_loss_clip": 0.01100741, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.01759958, + "balance_loss_mlp": 1.03660202, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.6257849908762414, + "language_loss": 0.82292426, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84421557, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14354, + "time_per_iteration": 2.454332113265991 + }, + { + "auxiliary_loss_clip": 0.01098964, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01815522, + "balance_loss_mlp": 1.03391898, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 2.066989456168094, + "language_loss": 0.85694742, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87823325, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14355, + "time_per_iteration": 2.4151952266693115 + }, + { + "auxiliary_loss_clip": 0.01100727, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.01879287, + "balance_loss_mlp": 1.03383946, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 3.5101420065502404, + "language_loss": 0.58819818, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.6095221, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 14356, + "time_per_iteration": 2.435149908065796 + }, + { + "auxiliary_loss_clip": 0.0110249, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01964545, + "balance_loss_mlp": 1.0345161, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.6248412634448182, + "language_loss": 0.76978958, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79113722, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14357, + "time_per_iteration": 2.3934738636016846 + }, + { + "auxiliary_loss_clip": 0.01101033, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01962256, + "balance_loss_mlp": 1.03499317, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.3756415536738897, + "language_loss": 0.77549875, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79682523, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 14358, + "time_per_iteration": 2.4436261653900146 + }, + { + "auxiliary_loss_clip": 0.01100136, + "auxiliary_loss_mlp": 0.01027101, + "balance_loss_clip": 1.01543641, + "balance_loss_mlp": 1.03392935, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.7651797926248376, + "language_loss": 0.7522471, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77351952, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14359, + "time_per_iteration": 2.4768805503845215 + }, + { + "auxiliary_loss_clip": 0.01094416, + "auxiliary_loss_mlp": 0.01025369, + "balance_loss_clip": 1.0139606, + "balance_loss_mlp": 1.0316956, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.7556957611737163, + "language_loss": 0.70558703, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72678494, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.625, + "step": 14360, + "time_per_iteration": 2.4633822441101074 + }, + { + "auxiliary_loss_clip": 0.01104009, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.01929903, + "balance_loss_mlp": 1.0365603, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.7204964341930526, + "language_loss": 0.76154602, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78290761, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 14361, + "time_per_iteration": 2.457219123840332 + }, + { + "auxiliary_loss_clip": 0.01022616, + "auxiliary_loss_mlp": 0.00998161, + "balance_loss_clip": 0.99707615, + "balance_loss_mlp": 1.00261354, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9659533407712392, + "language_loss": 0.58873498, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60894275, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.20019531, + "step": 14362, + "time_per_iteration": 3.0631728172302246 + }, + { + "auxiliary_loss_clip": 0.01101996, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.01902294, + "balance_loss_mlp": 1.03466797, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 2.5080162128467394, + "language_loss": 0.8062591, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82759786, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 14363, + "time_per_iteration": 2.5122597217559814 + }, + { + "auxiliary_loss_clip": 0.01101414, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.02290881, + "balance_loss_mlp": 1.03377891, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.5438276077077013, + "language_loss": 0.72507155, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74644232, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14364, + "time_per_iteration": 2.4990742206573486 + }, + { + "auxiliary_loss_clip": 0.01101322, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02173114, + "balance_loss_mlp": 1.03382576, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.5907362446785047, + "language_loss": 0.71736836, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73870963, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 14365, + "time_per_iteration": 2.5119576454162598 + }, + { + "auxiliary_loss_clip": 0.01104136, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.0247215, + "balance_loss_mlp": 1.03606164, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.5037691791086902, + "language_loss": 0.70827854, + "learning_rate": 1.915715498065993e-07, + "loss": 0.72969174, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 14366, + "time_per_iteration": 2.442091464996338 + }, + { + "auxiliary_loss_clip": 0.01098479, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.01578689, + "balance_loss_mlp": 1.03511524, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.5905343541325248, + "language_loss": 0.81950366, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.84075105, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 14367, + "time_per_iteration": 2.4821760654449463 + }, + { + "auxiliary_loss_clip": 0.01102272, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01515222, + "balance_loss_mlp": 1.03513527, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 1.9267157021688266, + "language_loss": 0.61380374, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63510376, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14368, + "time_per_iteration": 2.4707953929901123 + }, + { + "auxiliary_loss_clip": 0.01100887, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.01909471, + "balance_loss_mlp": 1.03561449, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 1.945590600384619, + "language_loss": 0.76329541, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78461134, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14369, + "time_per_iteration": 2.5276546478271484 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.01921809, + "balance_loss_mlp": 1.03340197, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 1.737706503573944, + "language_loss": 0.6419906, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66331857, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 14370, + "time_per_iteration": 2.452861785888672 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.03612161, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.5489158214730672, + "language_loss": 0.6619693, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68330884, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14371, + "time_per_iteration": 2.454864263534546 + }, + { + "auxiliary_loss_clip": 0.01021913, + "auxiliary_loss_mlp": 0.00998207, + "balance_loss_clip": 0.99720526, + "balance_loss_mlp": 1.00191593, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8716085073446712, + "language_loss": 0.56875324, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58895445, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 14372, + "time_per_iteration": 2.9307870864868164 + }, + { + "auxiliary_loss_clip": 0.01098629, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.03571773, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.7171736097417043, + "language_loss": 0.79384911, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81513715, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.62890625, + "step": 14373, + "time_per_iteration": 2.493739604949951 + }, + { + "auxiliary_loss_clip": 0.01100672, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01599669, + "balance_loss_mlp": 1.03470039, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.7213637290522288, + "language_loss": 0.63829684, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65957886, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 14374, + "time_per_iteration": 2.42464017868042 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01809502, + "balance_loss_mlp": 1.03603303, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.9472850722016972, + "language_loss": 0.77497673, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79627156, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14375, + "time_per_iteration": 2.4481828212738037 + }, + { + "auxiliary_loss_clip": 0.0109996, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.02089977, + "balance_loss_mlp": 1.03409755, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.5689694423673801, + "language_loss": 0.60686284, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62819022, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 14376, + "time_per_iteration": 2.776718854904175 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.0215137, + "balance_loss_mlp": 1.03338146, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.6233773898325343, + "language_loss": 0.66225243, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68355405, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 14377, + "time_per_iteration": 2.4549734592437744 + }, + { + "auxiliary_loss_clip": 0.01100162, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.02024806, + "balance_loss_mlp": 1.03251028, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.716134367829843, + "language_loss": 0.70389247, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72521555, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 14378, + "time_per_iteration": 2.4362454414367676 + }, + { + "auxiliary_loss_clip": 0.01022402, + "auxiliary_loss_mlp": 0.00998397, + "balance_loss_clip": 0.99738961, + "balance_loss_mlp": 1.00238144, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8057206784790626, + "language_loss": 0.60312063, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62332863, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20019531, + "step": 14379, + "time_per_iteration": 3.062391757965088 + }, + { + "auxiliary_loss_clip": 0.01097844, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.0178901, + "balance_loss_mlp": 1.03340411, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.4504200122603432, + "language_loss": 0.74308336, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76435512, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14380, + "time_per_iteration": 2.4468491077423096 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01766789, + "balance_loss_mlp": 1.03408456, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.1859120044364206, + "language_loss": 0.74855471, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.76986253, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 14381, + "time_per_iteration": 2.4468581676483154 + }, + { + "auxiliary_loss_clip": 0.01098259, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02303529, + "balance_loss_mlp": 1.03415072, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.3474051146654538, + "language_loss": 0.84216976, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86349607, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14382, + "time_per_iteration": 2.4011387825012207 + }, + { + "auxiliary_loss_clip": 0.01101348, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01876235, + "balance_loss_mlp": 1.03522801, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 1.8738363294425433, + "language_loss": 0.75711656, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77843881, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 14383, + "time_per_iteration": 2.45684814453125 + }, + { + "auxiliary_loss_clip": 0.0110184, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.01837349, + "balance_loss_mlp": 1.03689098, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 2.9063267883434047, + "language_loss": 0.84982598, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87114561, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 14384, + "time_per_iteration": 2.4381446838378906 + }, + { + "auxiliary_loss_clip": 0.01096025, + "auxiliary_loss_mlp": 0.01028139, + "balance_loss_clip": 1.01730251, + "balance_loss_mlp": 1.03212559, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.678692131551695, + "language_loss": 0.8082968, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82953846, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 14385, + "time_per_iteration": 2.4595248699188232 + }, + { + "auxiliary_loss_clip": 0.01104363, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01813602, + "balance_loss_mlp": 1.03674197, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.080462579763744, + "language_loss": 0.7260626, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.74740726, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 14386, + "time_per_iteration": 2.464761972427368 + }, + { + "auxiliary_loss_clip": 0.01100332, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02241206, + "balance_loss_mlp": 1.03550792, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 2.2876758841906026, + "language_loss": 0.82462382, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84595942, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 14387, + "time_per_iteration": 3.922278642654419 + }, + { + "auxiliary_loss_clip": 0.01097667, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.01570868, + "balance_loss_mlp": 1.03452444, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.9528284009807142, + "language_loss": 0.68743157, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70867598, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 14388, + "time_per_iteration": 5.270868301391602 + }, + { + "auxiliary_loss_clip": 0.01097314, + "auxiliary_loss_mlp": 0.01033896, + "balance_loss_clip": 1.02323222, + "balance_loss_mlp": 1.03490067, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.6968271505710826, + "language_loss": 0.90530205, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92661411, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 14389, + "time_per_iteration": 2.5115513801574707 + }, + { + "auxiliary_loss_clip": 0.01097489, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.01771283, + "balance_loss_mlp": 1.03391635, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.4619178104484627, + "language_loss": 0.70866364, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72992396, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 14390, + "time_per_iteration": 2.5057373046875 + }, + { + "auxiliary_loss_clip": 0.01102072, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.02169168, + "balance_loss_mlp": 1.03451252, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.7005812844165624, + "language_loss": 0.81973195, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84109104, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 14391, + "time_per_iteration": 2.4508116245269775 + }, + { + "auxiliary_loss_clip": 0.01022254, + "auxiliary_loss_mlp": 0.01000548, + "balance_loss_clip": 0.99952883, + "balance_loss_mlp": 1.00220108, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.793527060984129, + "language_loss": 0.68029255, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70052058, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20019531, + "step": 14392, + "time_per_iteration": 4.455903053283691 + }, + { + "auxiliary_loss_clip": 0.01104699, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.01706886, + "balance_loss_mlp": 1.03512073, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 2.799808785659983, + "language_loss": 0.75864339, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77998543, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 14393, + "time_per_iteration": 2.4210777282714844 + }, + { + "auxiliary_loss_clip": 0.0110015, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.01872993, + "balance_loss_mlp": 1.03299379, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 2.253323172073062, + "language_loss": 0.73935288, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.76066148, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 14394, + "time_per_iteration": 2.4389302730560303 + }, + { + "auxiliary_loss_clip": 0.01101599, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.01563203, + "balance_loss_mlp": 1.03460002, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 1.9628002470844093, + "language_loss": 0.65009511, + "learning_rate": 1.867768130747036e-07, + "loss": 0.6713891, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 14395, + "time_per_iteration": 2.7287087440490723 + }, + { + "auxiliary_loss_clip": 0.01100411, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.02282023, + "balance_loss_mlp": 1.03583765, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.6931560876966212, + "language_loss": 0.67837584, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.69972277, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 14396, + "time_per_iteration": 2.4718122482299805 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02227736, + "balance_loss_mlp": 1.03622818, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.2875243864017256, + "language_loss": 0.69540256, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71678698, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 14397, + "time_per_iteration": 2.487607479095459 + }, + { + "auxiliary_loss_clip": 0.01098278, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.01615524, + "balance_loss_mlp": 1.03279233, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 2.9815102788946666, + "language_loss": 0.63472062, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65597594, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 14398, + "time_per_iteration": 2.527153491973877 + }, + { + "auxiliary_loss_clip": 0.01100558, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.02000332, + "balance_loss_mlp": 1.03505087, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 1.8815607100510332, + "language_loss": 0.76581329, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78713018, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14399, + "time_per_iteration": 2.460524559020996 + }, + { + "auxiliary_loss_clip": 0.01098284, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.01721668, + "balance_loss_mlp": 1.03396773, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 1.9369254428150626, + "language_loss": 0.9345935, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95585632, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14400, + "time_per_iteration": 2.4347453117370605 + }, + { + "auxiliary_loss_clip": 0.01101332, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.01947284, + "balance_loss_mlp": 1.03524673, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.8338281378598142, + "language_loss": 0.67362767, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69494748, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14401, + "time_per_iteration": 2.535815715789795 + }, + { + "auxiliary_loss_clip": 0.01100666, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.01766849, + "balance_loss_mlp": 1.03385806, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 1.9734469645233848, + "language_loss": 0.73550159, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75680286, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 14402, + "time_per_iteration": 2.464496612548828 + }, + { + "auxiliary_loss_clip": 0.01098479, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.01988101, + "balance_loss_mlp": 1.0342344, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.6389653471656214, + "language_loss": 0.74639928, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.76769209, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14403, + "time_per_iteration": 2.481019973754883 + }, + { + "auxiliary_loss_clip": 0.01101184, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.01884234, + "balance_loss_mlp": 1.03445876, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 2.071645757031755, + "language_loss": 0.72956061, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75088489, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 14404, + "time_per_iteration": 2.5011138916015625 + }, + { + "auxiliary_loss_clip": 0.01099247, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.01935267, + "balance_loss_mlp": 1.03516841, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.7013566949514276, + "language_loss": 0.7065661, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72786927, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 14405, + "time_per_iteration": 2.4559895992279053 + }, + { + "auxiliary_loss_clip": 0.01101166, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.01983237, + "balance_loss_mlp": 1.0338521, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.7861200215864865, + "language_loss": 0.66381979, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.685148, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 14406, + "time_per_iteration": 2.409515142440796 + }, + { + "auxiliary_loss_clip": 0.0109867, + "auxiliary_loss_mlp": 0.01025585, + "balance_loss_clip": 1.01523805, + "balance_loss_mlp": 1.03394818, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.5358857043199197, + "language_loss": 0.83051056, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85175312, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 14407, + "time_per_iteration": 2.4647037982940674 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.02496433, + "balance_loss_mlp": 1.03503644, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.6444953555030566, + "language_loss": 0.69656581, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.71792799, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 14408, + "time_per_iteration": 2.472282648086548 + }, + { + "auxiliary_loss_clip": 0.01095121, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01912928, + "balance_loss_mlp": 1.0326798, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.783525972516364, + "language_loss": 0.77200353, + "learning_rate": 1.844827992025304e-07, + "loss": 0.7932533, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 14409, + "time_per_iteration": 2.4336869716644287 + }, + { + "auxiliary_loss_clip": 0.01104531, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.0170958, + "balance_loss_mlp": 1.03682649, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 2.112427414775802, + "language_loss": 0.77122021, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.7925632, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14410, + "time_per_iteration": 2.4947590827941895 + }, + { + "auxiliary_loss_clip": 0.01101131, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01899338, + "balance_loss_mlp": 1.0351249, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.8818128477280998, + "language_loss": 0.7770704, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79838735, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 14411, + "time_per_iteration": 2.4171528816223145 + }, + { + "auxiliary_loss_clip": 0.0109563, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.02179205, + "balance_loss_mlp": 1.03105211, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.9843804705658286, + "language_loss": 0.73648727, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75777173, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14412, + "time_per_iteration": 2.409283399581909 + }, + { + "auxiliary_loss_clip": 0.01097266, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.01570809, + "balance_loss_mlp": 1.03446996, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.9946534304785197, + "language_loss": 0.69103146, + "learning_rate": 1.83829844328371e-07, + "loss": 0.7122649, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 14413, + "time_per_iteration": 2.45535945892334 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.01704669, + "balance_loss_mlp": 1.03485107, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.124342813127657, + "language_loss": 0.63079798, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65208614, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 14414, + "time_per_iteration": 2.426166534423828 + }, + { + "auxiliary_loss_clip": 0.01100431, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.01786518, + "balance_loss_mlp": 1.03504372, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 5.094417557754872, + "language_loss": 0.64098227, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.66228133, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14415, + "time_per_iteration": 2.505326271057129 + }, + { + "auxiliary_loss_clip": 0.01022563, + "auxiliary_loss_mlp": 0.00999471, + "balance_loss_clip": 0.99847585, + "balance_loss_mlp": 1.00243139, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.994856403197853, + "language_loss": 0.6039449, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.6241653, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20117188, + "step": 14416, + "time_per_iteration": 3.1169750690460205 + }, + { + "auxiliary_loss_clip": 0.0110175, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.01969552, + "balance_loss_mlp": 1.03320909, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.709121823786769, + "language_loss": 0.74786484, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76920074, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 14417, + "time_per_iteration": 2.4594523906707764 + }, + { + "auxiliary_loss_clip": 0.01098679, + "auxiliary_loss_mlp": 0.0103671, + "balance_loss_clip": 1.02536714, + "balance_loss_mlp": 1.03374553, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.4667054132693154, + "language_loss": 0.74927706, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77063096, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14418, + "time_per_iteration": 2.4421162605285645 + }, + { + "auxiliary_loss_clip": 0.01099218, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.02282906, + "balance_loss_mlp": 1.03383338, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.7373317907861416, + "language_loss": 0.68308914, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70442522, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 14419, + "time_per_iteration": 2.4572219848632812 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01743746, + "balance_loss_mlp": 1.03343856, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.8427566816345506, + "language_loss": 0.78783178, + "learning_rate": 1.826898250065465e-07, + "loss": 0.80911195, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 14420, + "time_per_iteration": 2.431793212890625 + }, + { + "auxiliary_loss_clip": 0.01099106, + "auxiliary_loss_mlp": 0.01026008, + "balance_loss_clip": 1.01507688, + "balance_loss_mlp": 1.03495288, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5228427156861324, + "language_loss": 0.83455002, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85580111, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14421, + "time_per_iteration": 2.4668033123016357 + }, + { + "auxiliary_loss_clip": 0.01022879, + "auxiliary_loss_mlp": 0.01001113, + "balance_loss_clip": 1.00009346, + "balance_loss_mlp": 1.00281167, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.6988406736665802, + "language_loss": 0.49181524, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51205516, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 14422, + "time_per_iteration": 3.0614736080169678 + }, + { + "auxiliary_loss_clip": 0.01098788, + "auxiliary_loss_mlp": 0.0102549, + "balance_loss_clip": 1.01436234, + "balance_loss_mlp": 1.03404534, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.6515883006582035, + "language_loss": 0.73396868, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75521147, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 14423, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.01024294, + "balance_loss_clip": 1.01376247, + "balance_loss_mlp": 1.03325748, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.6484964407004838, + "language_loss": 0.76470268, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.785896, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6171875, + "step": 14424, + "time_per_iteration": 2.424851179122925 + }, + { + "auxiliary_loss_clip": 0.01095461, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.02093172, + "balance_loss_mlp": 1.03368878, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.5813311269546795, + "language_loss": 0.71488333, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73615384, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6171875, + "step": 14425, + "time_per_iteration": 2.511129379272461 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.0186975, + "balance_loss_mlp": 1.03369915, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.7136647691025457, + "language_loss": 0.68201184, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70332867, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 14426, + "time_per_iteration": 2.45344614982605 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.01479053, + "balance_loss_mlp": 1.03466105, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 2.1655107539362732, + "language_loss": 0.71177137, + "learning_rate": 1.815531824008234e-07, + "loss": 0.73303688, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14427, + "time_per_iteration": 2.4853262901306152 + }, + { + "auxiliary_loss_clip": 0.01100692, + "auxiliary_loss_mlp": 0.01027171, + "balance_loss_clip": 1.01605487, + "balance_loss_mlp": 1.03558111, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 2.4919308270407967, + "language_loss": 0.67974901, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70102763, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 14428, + "time_per_iteration": 3.881380558013916 + }, + { + "auxiliary_loss_clip": 0.01099393, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.01625252, + "balance_loss_mlp": 1.03412676, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 1.9995749889958705, + "language_loss": 0.70442253, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72569358, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14429, + "time_per_iteration": 2.485006093978882 + }, + { + "auxiliary_loss_clip": 0.01099339, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.01582026, + "balance_loss_mlp": 1.03418374, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 1.994306556139827, + "language_loss": 0.66704834, + "learning_rate": 1.810670840677151e-07, + "loss": 0.68831575, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14430, + "time_per_iteration": 5.315447092056274 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.02674353, + "balance_loss_mlp": 1.03518546, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 2.1027224790368373, + "language_loss": 0.6922996, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71371627, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14431, + "time_per_iteration": 2.529296636581421 + }, + { + "auxiliary_loss_clip": 0.0110053, + "auxiliary_loss_mlp": 0.01037421, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.03467131, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.1202825244865617, + "language_loss": 0.63412476, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65550429, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 14432, + "time_per_iteration": 2.4086527824401855 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.02346504, + "balance_loss_mlp": 1.03396499, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.8814819068968875, + "language_loss": 0.78167719, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.8030107, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 14433, + "time_per_iteration": 2.4273760318756104 + }, + { + "auxiliary_loss_clip": 0.01022423, + "auxiliary_loss_mlp": 0.00999558, + "balance_loss_clip": 0.99843693, + "balance_loss_mlp": 1.00253153, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7045847422968128, + "language_loss": 0.58498955, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60520935, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.19921875, + "step": 14434, + "time_per_iteration": 4.538690090179443 + }, + { + "auxiliary_loss_clip": 0.01095692, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.02078414, + "balance_loss_mlp": 1.0331434, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.9536898864428005, + "language_loss": 0.80034566, + "learning_rate": 1.802582997433628e-07, + "loss": 0.8216154, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 14435, + "time_per_iteration": 2.5227813720703125 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.0133307, + "balance_loss_mlp": 1.03240716, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 1.931795708002661, + "language_loss": 0.62170672, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.64295053, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 14436, + "time_per_iteration": 2.5728566646575928 + }, + { + "auxiliary_loss_clip": 0.01100272, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.01709533, + "balance_loss_mlp": 1.03472376, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.343547123058283, + "language_loss": 0.70253652, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72383535, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 14437, + "time_per_iteration": 2.4431545734405518 + }, + { + "auxiliary_loss_clip": 0.01099398, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01700628, + "balance_loss_mlp": 1.03465986, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 2.0341554887187527, + "language_loss": 0.80222631, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82350898, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 14438, + "time_per_iteration": 2.494382858276367 + }, + { + "auxiliary_loss_clip": 0.01095122, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.01420712, + "balance_loss_mlp": 1.03247368, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.989777139729131, + "language_loss": 0.67343026, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69463527, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 14439, + "time_per_iteration": 2.446855306625366 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.02216566, + "balance_loss_mlp": 1.03362823, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.5866507009228823, + "language_loss": 0.63566774, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.65697688, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14440, + "time_per_iteration": 2.598418951034546 + }, + { + "auxiliary_loss_clip": 0.01097534, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.01960874, + "balance_loss_mlp": 1.03475738, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.520964551160635, + "language_loss": 0.65901554, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68030441, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 14441, + "time_per_iteration": 2.4468817710876465 + }, + { + "auxiliary_loss_clip": 0.01098077, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.01465619, + "balance_loss_mlp": 1.03496826, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.540549688981214, + "language_loss": 0.66407061, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68530446, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 14442, + "time_per_iteration": 2.474977493286133 + }, + { + "auxiliary_loss_clip": 0.01101636, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.01771832, + "balance_loss_mlp": 1.0342052, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.8064516397377273, + "language_loss": 0.72410548, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74542421, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14443, + "time_per_iteration": 2.411228895187378 + }, + { + "auxiliary_loss_clip": 0.01100605, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.01658189, + "balance_loss_mlp": 1.0347054, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.608459779685937, + "language_loss": 0.83502007, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85631013, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 14444, + "time_per_iteration": 2.4930927753448486 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01894629, + "balance_loss_mlp": 1.03413963, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 2.448486590537531, + "language_loss": 0.77183151, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79313886, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 14445, + "time_per_iteration": 2.426669120788574 + }, + { + "auxiliary_loss_clip": 0.01103385, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.01798332, + "balance_loss_mlp": 1.03616834, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.7750211361688581, + "language_loss": 0.67744529, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.69877577, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 14446, + "time_per_iteration": 2.4654579162597656 + }, + { + "auxiliary_loss_clip": 0.01099868, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.02041531, + "balance_loss_mlp": 1.03514719, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.7454626226513459, + "language_loss": 0.82879949, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85012007, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 14447, + "time_per_iteration": 2.493257522583008 + }, + { + "auxiliary_loss_clip": 0.01097872, + "auxiliary_loss_mlp": 0.01026699, + "balance_loss_clip": 1.01551676, + "balance_loss_mlp": 1.03253877, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.6479414858635801, + "language_loss": 0.73994362, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76118934, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 14448, + "time_per_iteration": 2.517547607421875 + }, + { + "auxiliary_loss_clip": 0.01099877, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.01732528, + "balance_loss_mlp": 1.03410125, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.0155069578364815, + "language_loss": 0.80403781, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82532716, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14449, + "time_per_iteration": 2.4734561443328857 + }, + { + "auxiliary_loss_clip": 0.01022701, + "auxiliary_loss_mlp": 0.01001927, + "balance_loss_clip": 1.00090826, + "balance_loss_mlp": 1.00270224, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8078709150290084, + "language_loss": 0.60570407, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62595034, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20019531, + "step": 14450, + "time_per_iteration": 3.00457501411438 + }, + { + "auxiliary_loss_clip": 0.01101764, + "auxiliary_loss_mlp": 0.01025128, + "balance_loss_clip": 1.01391602, + "balance_loss_mlp": 1.03496504, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6214229272889056, + "language_loss": 0.75951099, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78077996, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 14451, + "time_per_iteration": 2.466799736022949 + }, + { + "auxiliary_loss_clip": 0.01097261, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.01535177, + "balance_loss_mlp": 1.03310919, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.7255099911966045, + "language_loss": 0.72161841, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74286067, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 14452, + "time_per_iteration": 2.4162471294403076 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.0188818, + "balance_loss_mlp": 1.03695846, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.4737303556350767, + "language_loss": 0.71964049, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74099505, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 14453, + "time_per_iteration": 2.4244184494018555 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.0210638, + "balance_loss_mlp": 1.03651702, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 1.9633097563791408, + "language_loss": 0.7370978, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75843245, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14454, + "time_per_iteration": 2.4440579414367676 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.01540554, + "balance_loss_mlp": 1.03604889, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 1.857806171796081, + "language_loss": 0.59278631, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61405551, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14455, + "time_per_iteration": 2.5671346187591553 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.01522756, + "balance_loss_mlp": 1.03404677, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.0707431382841746, + "language_loss": 0.79973984, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.8210187, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 14456, + "time_per_iteration": 2.418536424636841 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.02168369, + "balance_loss_mlp": 1.03677177, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 3.2598349085995713, + "language_loss": 0.74551702, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76691103, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 14457, + "time_per_iteration": 2.4583218097686768 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.01771641, + "balance_loss_mlp": 1.0333178, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.5024493639781449, + "language_loss": 0.78523105, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80647486, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 14458, + "time_per_iteration": 2.4820399284362793 + }, + { + "auxiliary_loss_clip": 0.01099419, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.01935577, + "balance_loss_mlp": 1.03463364, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.644405471391384, + "language_loss": 0.70893437, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73024642, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6484375, + "step": 14459, + "time_per_iteration": 2.4393928050994873 + }, + { + "auxiliary_loss_clip": 0.01093983, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.03369355, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.6107626843517802, + "language_loss": 0.73736501, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75860602, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 14460, + "time_per_iteration": 2.503788948059082 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.01904643, + "balance_loss_mlp": 1.03427231, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 2.041721670778758, + "language_loss": 0.6509198, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67224729, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 14461, + "time_per_iteration": 2.5010743141174316 + }, + { + "auxiliary_loss_clip": 0.01097505, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.0317719, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.2325103269251474, + "language_loss": 0.83019292, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.8514995, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 14462, + "time_per_iteration": 2.433208703994751 + }, + { + "auxiliary_loss_clip": 0.01099153, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.01810336, + "balance_loss_mlp": 1.0335896, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.7962828010788623, + "language_loss": 0.65557456, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67686737, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14463, + "time_per_iteration": 2.4401493072509766 + }, + { + "auxiliary_loss_clip": 0.01105422, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02233291, + "balance_loss_mlp": 1.03725314, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 2.9623729778046357, + "language_loss": 0.66444242, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68584383, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14464, + "time_per_iteration": 2.436553716659546 + }, + { + "auxiliary_loss_clip": 0.01103351, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.01875448, + "balance_loss_mlp": 1.03480899, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.4735109448365376, + "language_loss": 0.63112307, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.65246278, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 14465, + "time_per_iteration": 2.4660658836364746 + }, + { + "auxiliary_loss_clip": 0.01094609, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02676392, + "balance_loss_mlp": 1.03304648, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.685225165628944, + "language_loss": 0.84644353, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86776471, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 14466, + "time_per_iteration": 2.480961322784424 + }, + { + "auxiliary_loss_clip": 0.01105582, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.03688443, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.282426713545837, + "language_loss": 0.62034947, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64178836, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 14467, + "time_per_iteration": 2.538341999053955 + }, + { + "auxiliary_loss_clip": 0.01094205, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.01785159, + "balance_loss_mlp": 1.03268588, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.3777636597290568, + "language_loss": 0.68952703, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71075314, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6171875, + "step": 14468, + "time_per_iteration": 2.5434041023254395 + }, + { + "auxiliary_loss_clip": 0.0109501, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01774478, + "balance_loss_mlp": 1.03186822, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.4228431843567073, + "language_loss": 0.70863521, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.72987258, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 14469, + "time_per_iteration": 2.4869115352630615 + }, + { + "auxiliary_loss_clip": 0.01094893, + "auxiliary_loss_mlp": 0.01026479, + "balance_loss_clip": 1.01596522, + "balance_loss_mlp": 1.03371549, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 2.363369045660802, + "language_loss": 0.83497709, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.8561908, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.61328125, + "step": 14470, + "time_per_iteration": 3.7966389656066895 + }, + { + "auxiliary_loss_clip": 0.01100686, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.01659179, + "balance_loss_mlp": 1.0359199, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.7851623388231517, + "language_loss": 0.729653, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75094146, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14471, + "time_per_iteration": 2.4429454803466797 + }, + { + "auxiliary_loss_clip": 0.01098438, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.01722205, + "balance_loss_mlp": 1.03454566, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.5357760114727317, + "language_loss": 0.79059005, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81185514, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 14472, + "time_per_iteration": 3.921783208847046 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01025833, + "balance_loss_clip": 1.01443648, + "balance_loss_mlp": 1.03344178, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 3.2533585330072574, + "language_loss": 0.72799373, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74923319, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14473, + "time_per_iteration": 2.4259376525878906 + }, + { + "auxiliary_loss_clip": 0.01102046, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.01699805, + "balance_loss_mlp": 1.03496337, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.846236216417208, + "language_loss": 0.72311044, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.7444191, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 14474, + "time_per_iteration": 2.473069190979004 + }, + { + "auxiliary_loss_clip": 0.01097682, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02192092, + "balance_loss_mlp": 1.03249419, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 1.7934338685222735, + "language_loss": 0.67214453, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69346434, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65234375, + "step": 14475, + "time_per_iteration": 2.4029717445373535 + }, + { + "auxiliary_loss_clip": 0.01098351, + "auxiliary_loss_mlp": 0.0102784, + "balance_loss_clip": 1.0158658, + "balance_loss_mlp": 1.03214025, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.5313783538807326, + "language_loss": 0.7782256, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79948747, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 14476, + "time_per_iteration": 3.8718321323394775 + }, + { + "auxiliary_loss_clip": 0.01097278, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.01576388, + "balance_loss_mlp": 1.03464425, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.4589876491690197, + "language_loss": 0.72287905, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74411464, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 14477, + "time_per_iteration": 2.467496156692505 + }, + { + "auxiliary_loss_clip": 0.01099819, + "auxiliary_loss_mlp": 0.01026827, + "balance_loss_clip": 1.01542509, + "balance_loss_mlp": 1.03454363, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 2.2041125468373424, + "language_loss": 0.59643745, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61770391, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14478, + "time_per_iteration": 2.452988862991333 + }, + { + "auxiliary_loss_clip": 0.01099937, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.01758146, + "balance_loss_mlp": 1.03778374, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.4601612361115293, + "language_loss": 0.71350467, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73478055, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 14479, + "time_per_iteration": 2.4820868968963623 + }, + { + "auxiliary_loss_clip": 0.01099029, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01922119, + "balance_loss_mlp": 1.034477, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.4445497699948229, + "language_loss": 0.70891637, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73021901, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 14480, + "time_per_iteration": 2.513767957687378 + }, + { + "auxiliary_loss_clip": 0.01100037, + "auxiliary_loss_mlp": 0.01027419, + "balance_loss_clip": 1.01664293, + "balance_loss_mlp": 1.03382504, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.6114944508469022, + "language_loss": 0.70245749, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72373205, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 14481, + "time_per_iteration": 2.554398775100708 + }, + { + "auxiliary_loss_clip": 0.01096608, + "auxiliary_loss_mlp": 0.01027738, + "balance_loss_clip": 1.01684833, + "balance_loss_mlp": 1.03254092, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.964144913348514, + "language_loss": 0.77078795, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79203141, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 14482, + "time_per_iteration": 2.4815468788146973 + }, + { + "auxiliary_loss_clip": 0.01098791, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01902604, + "balance_loss_mlp": 1.03534698, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.9700716147504154, + "language_loss": 0.76582003, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78711307, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14483, + "time_per_iteration": 2.436213254928589 + }, + { + "auxiliary_loss_clip": 0.01104536, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.02293277, + "balance_loss_mlp": 1.0356437, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 1.9594179999364503, + "language_loss": 0.61840808, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.63981229, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 14484, + "time_per_iteration": 2.442678213119507 + }, + { + "auxiliary_loss_clip": 0.01099606, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01751947, + "balance_loss_mlp": 1.03592515, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.8902385859017612, + "language_loss": 0.67741799, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.69870055, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 14485, + "time_per_iteration": 2.4474589824676514 + }, + { + "auxiliary_loss_clip": 0.01098241, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.01782358, + "balance_loss_mlp": 1.03395045, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.8241363232690688, + "language_loss": 0.62720448, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64848542, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 14486, + "time_per_iteration": 2.5260205268859863 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.02160406, + "balance_loss_mlp": 1.0344007, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.140129151217502, + "language_loss": 0.61595458, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63732278, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 14487, + "time_per_iteration": 2.4534120559692383 + }, + { + "auxiliary_loss_clip": 0.01095929, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01547635, + "balance_loss_mlp": 1.03313756, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.935883465680325, + "language_loss": 0.67719936, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69841325, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.62890625, + "step": 14488, + "time_per_iteration": 2.4133338928222656 + }, + { + "auxiliary_loss_clip": 0.01101696, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01882124, + "balance_loss_mlp": 1.03593814, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 2.0140303277845635, + "language_loss": 0.85666835, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87799209, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 14489, + "time_per_iteration": 2.447730779647827 + }, + { + "auxiliary_loss_clip": 0.01105324, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.01909935, + "balance_loss_mlp": 1.03548503, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.3883647349321833, + "language_loss": 0.7595576, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.78092867, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 14490, + "time_per_iteration": 2.455131769180298 + }, + { + "auxiliary_loss_clip": 0.01103297, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.01726079, + "balance_loss_mlp": 1.0360409, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.077474010199246, + "language_loss": 0.76046753, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78179932, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14491, + "time_per_iteration": 2.444725513458252 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.01026867, + "balance_loss_clip": 1.01542914, + "balance_loss_mlp": 1.03789806, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.5462104355983195, + "language_loss": 0.67157114, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69285911, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14492, + "time_per_iteration": 2.456963539123535 + }, + { + "auxiliary_loss_clip": 0.01097894, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01746666, + "balance_loss_mlp": 1.0347116, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.9471643685037383, + "language_loss": 0.69513756, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71639848, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 14493, + "time_per_iteration": 2.486240863800049 + }, + { + "auxiliary_loss_clip": 0.01103732, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02045047, + "balance_loss_mlp": 1.03747892, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.373229138100018, + "language_loss": 0.89281845, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91418135, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 14494, + "time_per_iteration": 2.426887273788452 + }, + { + "auxiliary_loss_clip": 0.01100458, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.01902366, + "balance_loss_mlp": 1.03658473, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 2.4859036528024387, + "language_loss": 0.59329295, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61459565, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14495, + "time_per_iteration": 2.609177350997925 + }, + { + "auxiliary_loss_clip": 0.01099752, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.02050412, + "balance_loss_mlp": 1.03364801, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.8624158816524548, + "language_loss": 0.80186629, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82318485, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 14496, + "time_per_iteration": 2.48067569732666 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02149367, + "balance_loss_mlp": 1.03474569, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 1.9430726479329608, + "language_loss": 0.7876395, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80898833, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65625, + "step": 14497, + "time_per_iteration": 2.4636995792388916 + }, + { + "auxiliary_loss_clip": 0.01101538, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.01755071, + "balance_loss_mlp": 1.03472698, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.083369518158611, + "language_loss": 0.66958046, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69089335, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 14498, + "time_per_iteration": 2.4272525310516357 + }, + { + "auxiliary_loss_clip": 0.01097343, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.01882744, + "balance_loss_mlp": 1.03298473, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.650352141003142, + "language_loss": 0.57090127, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59217668, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 14499, + "time_per_iteration": 2.5417370796203613 + }, + { + "auxiliary_loss_clip": 0.01100865, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.03407812, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 1.8287657436259406, + "language_loss": 0.79665649, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81796622, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14500, + "time_per_iteration": 2.4384779930114746 + }, + { + "auxiliary_loss_clip": 0.01099214, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.01524222, + "balance_loss_mlp": 1.03458488, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 3.927270356481981, + "language_loss": 0.72778672, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74904418, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14501, + "time_per_iteration": 2.4204492568969727 + }, + { + "auxiliary_loss_clip": 0.01105269, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01549292, + "balance_loss_mlp": 1.03746784, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.8814783525974537, + "language_loss": 0.64179307, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66312253, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 14502, + "time_per_iteration": 2.434103488922119 + }, + { + "auxiliary_loss_clip": 0.01101671, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.02017736, + "balance_loss_mlp": 1.0345037, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.8726397651262905, + "language_loss": 0.68590897, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.70724928, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 14503, + "time_per_iteration": 2.5533761978149414 + }, + { + "auxiliary_loss_clip": 0.01100258, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.01556742, + "balance_loss_mlp": 1.03493106, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.557034340222278, + "language_loss": 0.69279027, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71406382, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 14504, + "time_per_iteration": 2.3973300457000732 + }, + { + "auxiliary_loss_clip": 0.01100515, + "auxiliary_loss_mlp": 0.01023828, + "balance_loss_clip": 1.01218152, + "balance_loss_mlp": 1.03470516, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.7855236968382922, + "language_loss": 0.70064294, + "learning_rate": 1.691168026385552e-07, + "loss": 0.7218864, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14505, + "time_per_iteration": 2.4677655696868896 + }, + { + "auxiliary_loss_clip": 0.01099166, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.01750898, + "balance_loss_mlp": 1.03490877, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5833201060926712, + "language_loss": 0.78157365, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80284977, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 14506, + "time_per_iteration": 2.4351515769958496 + }, + { + "auxiliary_loss_clip": 0.01099602, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.016541, + "balance_loss_mlp": 1.03255463, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.103988804675608, + "language_loss": 0.73979723, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76107335, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 14507, + "time_per_iteration": 2.455190420150757 + }, + { + "auxiliary_loss_clip": 0.0110337, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.01671457, + "balance_loss_mlp": 1.03505719, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.4004535186438694, + "language_loss": 0.72314352, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74447292, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 14508, + "time_per_iteration": 2.429776191711426 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.01998162, + "balance_loss_mlp": 1.03545594, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.6736592907716532, + "language_loss": 0.68370092, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70505971, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14509, + "time_per_iteration": 2.512343168258667 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.02042294, + "balance_loss_mlp": 1.03505254, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.602744668687492, + "language_loss": 0.58099592, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60231388, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 14510, + "time_per_iteration": 2.5066168308258057 + }, + { + "auxiliary_loss_clip": 0.01106, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01718426, + "balance_loss_mlp": 1.03560722, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 3.01668414840158, + "language_loss": 0.67472696, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69608998, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 14511, + "time_per_iteration": 2.451087236404419 + }, + { + "auxiliary_loss_clip": 0.01103515, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.02203035, + "balance_loss_mlp": 1.03443432, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 3.2501383047133405, + "language_loss": 0.81799793, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83937812, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 14512, + "time_per_iteration": 3.9438862800598145 + }, + { + "auxiliary_loss_clip": 0.01022617, + "auxiliary_loss_mlp": 0.01002239, + "balance_loss_clip": 1.00121391, + "balance_loss_mlp": 1.00251293, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.821541974320149, + "language_loss": 0.58620477, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.6064533, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20117188, + "step": 14513, + "time_per_iteration": 5.803184747695923 + }, + { + "auxiliary_loss_clip": 0.01100291, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.01511812, + "balance_loss_mlp": 1.035128, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.7723681458845877, + "language_loss": 0.76434934, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78562373, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 14514, + "time_per_iteration": 2.43147611618042 + }, + { + "auxiliary_loss_clip": 0.01104793, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.01603079, + "balance_loss_mlp": 1.0365963, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.8948516392444266, + "language_loss": 0.6500389, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67136031, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 14515, + "time_per_iteration": 2.4740371704101562 + }, + { + "auxiliary_loss_clip": 0.01102787, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02158499, + "balance_loss_mlp": 1.03575802, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.2682848027061837, + "language_loss": 0.79204381, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81341171, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14516, + "time_per_iteration": 2.415546417236328 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.01828611, + "balance_loss_mlp": 1.03575611, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 1.7654736819824852, + "language_loss": 0.71866733, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74001116, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 14517, + "time_per_iteration": 3.8708460330963135 + }, + { + "auxiliary_loss_clip": 0.01096366, + "auxiliary_loss_mlp": 0.01021966, + "balance_loss_clip": 1.01117766, + "balance_loss_mlp": 1.03266263, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 1.986670256969538, + "language_loss": 0.72410166, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74528503, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 14518, + "time_per_iteration": 2.4295384883880615 + }, + { + "auxiliary_loss_clip": 0.01097506, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.02022767, + "balance_loss_mlp": 1.03408229, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.5057549302495625, + "language_loss": 0.74070251, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76199031, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14519, + "time_per_iteration": 2.4812824726104736 + }, + { + "auxiliary_loss_clip": 0.01102566, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.0154469, + "balance_loss_mlp": 1.03454578, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.392734857107026, + "language_loss": 0.76474625, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78605127, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 14520, + "time_per_iteration": 2.395700693130493 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.02091551, + "balance_loss_mlp": 1.03470957, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.7557471092827008, + "language_loss": 0.81959832, + "learning_rate": 1.666178664801816e-07, + "loss": 0.8409518, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 14521, + "time_per_iteration": 2.4852864742279053 + }, + { + "auxiliary_loss_clip": 0.01103057, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.02129066, + "balance_loss_mlp": 1.03620303, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.742686839679241, + "language_loss": 0.76673812, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78810602, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14522, + "time_per_iteration": 2.400723457336426 + }, + { + "auxiliary_loss_clip": 0.01096241, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.01647091, + "balance_loss_mlp": 1.03311884, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 1.7966365150802115, + "language_loss": 0.75488186, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77611631, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 14523, + "time_per_iteration": 2.4636495113372803 + }, + { + "auxiliary_loss_clip": 0.01096575, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01697135, + "balance_loss_mlp": 1.03261673, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 4.0015763337701715, + "language_loss": 0.78712022, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80836773, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 14524, + "time_per_iteration": 2.4102303981781006 + }, + { + "auxiliary_loss_clip": 0.01096233, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.01968098, + "balance_loss_mlp": 1.03349209, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 1.906291591567106, + "language_loss": 0.77577364, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79704088, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 14525, + "time_per_iteration": 2.4395744800567627 + }, + { + "auxiliary_loss_clip": 0.01100789, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.01979661, + "balance_loss_mlp": 1.03422463, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.583399004883713, + "language_loss": 0.68971789, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71103716, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14526, + "time_per_iteration": 2.460601329803467 + }, + { + "auxiliary_loss_clip": 0.01104095, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.02277374, + "balance_loss_mlp": 1.0354166, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.8364570696793545, + "language_loss": 0.61118007, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.6325714, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 14527, + "time_per_iteration": 2.451878547668457 + }, + { + "auxiliary_loss_clip": 0.01107658, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.01725018, + "balance_loss_mlp": 1.03724599, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.900288046481709, + "language_loss": 0.65428543, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.6756655, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 14528, + "time_per_iteration": 2.4043383598327637 + }, + { + "auxiliary_loss_clip": 0.01099208, + "auxiliary_loss_mlp": 0.01027467, + "balance_loss_clip": 1.01605284, + "balance_loss_mlp": 1.03587151, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 1.7400708711936286, + "language_loss": 0.89542592, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91669267, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 14529, + "time_per_iteration": 2.4536657333374023 + }, + { + "auxiliary_loss_clip": 0.01098594, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.01464105, + "balance_loss_mlp": 1.03477812, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.8948501189750897, + "language_loss": 0.85129809, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.8725487, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.63671875, + "step": 14530, + "time_per_iteration": 2.4669382572174072 + }, + { + "auxiliary_loss_clip": 0.01099866, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.01923275, + "balance_loss_mlp": 1.03396988, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.576558377957066, + "language_loss": 0.74252665, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76382756, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 14531, + "time_per_iteration": 2.4694976806640625 + }, + { + "auxiliary_loss_clip": 0.0109496, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.01751268, + "balance_loss_mlp": 1.03228939, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 1.9295326178711187, + "language_loss": 0.61642307, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63765734, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 14532, + "time_per_iteration": 2.467203140258789 + }, + { + "auxiliary_loss_clip": 0.01022366, + "auxiliary_loss_mlp": 0.01002055, + "balance_loss_clip": 1.00104749, + "balance_loss_mlp": 1.00228763, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 1.0435222234743866, + "language_loss": 0.58747792, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60772216, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 14533, + "time_per_iteration": 3.1370933055877686 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.0177381, + "balance_loss_mlp": 1.03478742, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.6991996292719136, + "language_loss": 0.770051, + "learning_rate": 1.646005846335954e-07, + "loss": 0.7913208, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14534, + "time_per_iteration": 2.4806315898895264 + }, + { + "auxiliary_loss_clip": 0.01099042, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.01656461, + "balance_loss_mlp": 1.03348231, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 5.211875046106134, + "language_loss": 0.7515831, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77285141, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14535, + "time_per_iteration": 2.4810731410980225 + }, + { + "auxiliary_loss_clip": 0.01098515, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01817584, + "balance_loss_mlp": 1.03306127, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.6829802403654797, + "language_loss": 0.74085766, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76214409, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 14536, + "time_per_iteration": 2.523463010787964 + }, + { + "auxiliary_loss_clip": 0.01097805, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.02120638, + "balance_loss_mlp": 1.03282738, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.6182033888035987, + "language_loss": 0.6362291, + "learning_rate": 1.641367279482304e-07, + "loss": 0.6575284, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 14537, + "time_per_iteration": 2.4442684650421143 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.01776958, + "balance_loss_mlp": 1.03407526, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.9755012548468744, + "language_loss": 0.58271295, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60399592, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.640625, + "step": 14538, + "time_per_iteration": 2.4764134883880615 + }, + { + "auxiliary_loss_clip": 0.01098568, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.01509345, + "balance_loss_mlp": 1.03636038, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 1.8688727440620683, + "language_loss": 0.68641996, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70766973, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.62109375, + "step": 14539, + "time_per_iteration": 2.5020625591278076 + }, + { + "auxiliary_loss_clip": 0.01101347, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01667643, + "balance_loss_mlp": 1.03300726, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.6060863933182126, + "language_loss": 0.74274981, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76405495, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 14540, + "time_per_iteration": 2.404411792755127 + }, + { + "auxiliary_loss_clip": 0.01100315, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.02098525, + "balance_loss_mlp": 1.03433895, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.6726829694378176, + "language_loss": 0.78856957, + "learning_rate": 1.635192270207193e-07, + "loss": 0.80989909, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 14541, + "time_per_iteration": 2.503199338912964 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01660323, + "balance_loss_mlp": 1.03575325, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.184797101770986, + "language_loss": 0.66509086, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68643373, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 14542, + "time_per_iteration": 2.4233803749084473 + }, + { + "auxiliary_loss_clip": 0.0102268, + "auxiliary_loss_mlp": 0.01004544, + "balance_loss_clip": 1.00348306, + "balance_loss_mlp": 1.00264943, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.78036669727378, + "language_loss": 0.54485124, + "learning_rate": 1.632108943707642e-07, + "loss": 0.5651235, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14543, + "time_per_iteration": 2.86068058013916 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.01938748, + "balance_loss_mlp": 1.03536141, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.471051898904442, + "language_loss": 0.69747186, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71880996, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14544, + "time_per_iteration": 2.5185115337371826 + }, + { + "auxiliary_loss_clip": 0.0109533, + "auxiliary_loss_mlp": 0.01024688, + "balance_loss_clip": 1.01388133, + "balance_loss_mlp": 1.03356862, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.6364457025901016, + "language_loss": 0.75830984, + "learning_rate": 1.62902840325714e-07, + "loss": 0.77951002, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 14545, + "time_per_iteration": 2.4716804027557373 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.01906097, + "balance_loss_mlp": 1.03366208, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.6291355552891738, + "language_loss": 0.65811241, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.67943794, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6640625, + "step": 14546, + "time_per_iteration": 2.5942723751068115 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.01728725, + "balance_loss_mlp": 1.03421474, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.6089408815054664, + "language_loss": 0.72915637, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75044084, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14547, + "time_per_iteration": 2.476132869720459 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02195573, + "balance_loss_mlp": 1.03549254, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.091980214733164, + "language_loss": 0.69212079, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71353185, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 14548, + "time_per_iteration": 2.5924787521362305 + }, + { + "auxiliary_loss_clip": 0.01104345, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.0189178, + "balance_loss_mlp": 1.03624892, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 1.7846159993944952, + "language_loss": 0.71013767, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.73149431, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14549, + "time_per_iteration": 2.662411689758301 + }, + { + "auxiliary_loss_clip": 0.01103895, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.02070725, + "balance_loss_mlp": 1.03421688, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.2861390343765375, + "language_loss": 0.83157504, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85295701, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 14550, + "time_per_iteration": 2.5321483612060547 + }, + { + "auxiliary_loss_clip": 0.01102064, + "auxiliary_loss_mlp": 0.01034898, + "balance_loss_clip": 1.02321005, + "balance_loss_mlp": 1.03460526, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.7632417241957978, + "language_loss": 0.71897519, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.74034476, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 14551, + "time_per_iteration": 2.4257118701934814 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01725554, + "balance_loss_mlp": 1.03376746, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 1.8511874506751833, + "language_loss": 0.63747656, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.65874648, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 14552, + "time_per_iteration": 2.507741689682007 + }, + { + "auxiliary_loss_clip": 0.01103006, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.01513553, + "balance_loss_mlp": 1.03501391, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 2.446923250150636, + "language_loss": 0.79332548, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81464052, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6796875, + "step": 14553, + "time_per_iteration": 3.8591794967651367 + }, + { + "auxiliary_loss_clip": 0.01101263, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.01679969, + "balance_loss_mlp": 1.03322935, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 1.9284579962234305, + "language_loss": 0.70292234, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72422087, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 14554, + "time_per_iteration": 2.425405740737915 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01024857, + "balance_loss_clip": 1.01338291, + "balance_loss_mlp": 1.03636527, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.403685789536988, + "language_loss": 0.83570188, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85697597, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 14555, + "time_per_iteration": 5.303139686584473 + }, + { + "auxiliary_loss_clip": 0.0109872, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.0174067, + "balance_loss_mlp": 1.03311133, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.5544926685041807, + "language_loss": 0.71064872, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73193276, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 14556, + "time_per_iteration": 2.479682207107544 + }, + { + "auxiliary_loss_clip": 0.01102158, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02427554, + "balance_loss_mlp": 1.03392434, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 2.090988488565758, + "language_loss": 0.76491272, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78630841, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 14557, + "time_per_iteration": 2.4440789222717285 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01036641, + "balance_loss_clip": 1.02429736, + "balance_loss_mlp": 1.03754401, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 2.745271516916585, + "language_loss": 0.82856929, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.84997767, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 14558, + "time_per_iteration": 2.4604156017303467 + }, + { + "auxiliary_loss_clip": 0.01022061, + "auxiliary_loss_mlp": 0.0099888, + "balance_loss_clip": 0.99787825, + "balance_loss_mlp": 1.00195646, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8098972335456046, + "language_loss": 0.56113648, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58134592, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20117188, + "step": 14559, + "time_per_iteration": 4.470167636871338 + }, + { + "auxiliary_loss_clip": 0.01097721, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.02115154, + "balance_loss_mlp": 1.03361118, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.753910973149056, + "language_loss": 0.65810168, + "learning_rate": 1.606013202286407e-07, + "loss": 0.67940271, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 14560, + "time_per_iteration": 2.4606189727783203 + }, + { + "auxiliary_loss_clip": 0.01098497, + "auxiliary_loss_mlp": 0.01026014, + "balance_loss_clip": 1.0150826, + "balance_loss_mlp": 1.03399324, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.7971412952154644, + "language_loss": 0.78488302, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80612814, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 14561, + "time_per_iteration": 2.5178775787353516 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.01950347, + "balance_loss_mlp": 1.03332877, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 2.052648485503804, + "language_loss": 0.7722398, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.7935816, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 14562, + "time_per_iteration": 2.4635396003723145 + }, + { + "auxiliary_loss_clip": 0.01093542, + "auxiliary_loss_mlp": 0.01027074, + "balance_loss_clip": 1.01629758, + "balance_loss_mlp": 1.03205824, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.5193476217088446, + "language_loss": 0.72028875, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74149489, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.61328125, + "step": 14563, + "time_per_iteration": 2.5659685134887695 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.01930261, + "balance_loss_mlp": 1.03699827, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.447309960034295, + "language_loss": 0.65054131, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67190331, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 14564, + "time_per_iteration": 2.4639861583709717 + }, + { + "auxiliary_loss_clip": 0.01098108, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.02411819, + "balance_loss_mlp": 1.03273273, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.7263402838064887, + "language_loss": 0.70455498, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72588944, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14565, + "time_per_iteration": 2.4354052543640137 + }, + { + "auxiliary_loss_clip": 0.01105002, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.01703906, + "balance_loss_mlp": 1.03534722, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.653802479617828, + "language_loss": 0.77780795, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79915977, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 14566, + "time_per_iteration": 2.545552968978882 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.01816058, + "balance_loss_mlp": 1.03673196, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.9682068715517365, + "language_loss": 0.71192074, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73323214, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14567, + "time_per_iteration": 2.4351353645324707 + }, + { + "auxiliary_loss_clip": 0.01098933, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.03442371, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.8356377960329546, + "language_loss": 0.74325889, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76456112, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 14568, + "time_per_iteration": 2.5124824047088623 + }, + { + "auxiliary_loss_clip": 0.01097935, + "auxiliary_loss_mlp": 0.01024092, + "balance_loss_clip": 1.0131247, + "balance_loss_mlp": 1.03408551, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.0142792797007067, + "language_loss": 0.86751103, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.88873136, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14569, + "time_per_iteration": 2.4586262702941895 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.01596665, + "balance_loss_mlp": 1.03311825, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.7415448650731975, + "language_loss": 0.73872113, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.75998533, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 14570, + "time_per_iteration": 2.483851194381714 + }, + { + "auxiliary_loss_clip": 0.01101763, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.01600981, + "balance_loss_mlp": 1.03438187, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 2.5331748701208454, + "language_loss": 0.67766106, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69895947, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 14571, + "time_per_iteration": 2.465486764907837 + }, + { + "auxiliary_loss_clip": 0.01096204, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.01672387, + "balance_loss_mlp": 1.03270459, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 1.8497496461068688, + "language_loss": 0.62435377, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64559394, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 14572, + "time_per_iteration": 2.489522933959961 + }, + { + "auxiliary_loss_clip": 0.01097579, + "auxiliary_loss_mlp": 0.01026518, + "balance_loss_clip": 1.01595557, + "balance_loss_mlp": 1.03467846, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.6601578113112918, + "language_loss": 0.73479891, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75603998, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 14573, + "time_per_iteration": 2.5072882175445557 + }, + { + "auxiliary_loss_clip": 0.01095801, + "auxiliary_loss_mlp": 0.0102742, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03304029, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 2.044894217816748, + "language_loss": 0.731619, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.75285125, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 14574, + "time_per_iteration": 2.4775447845458984 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.02203143, + "balance_loss_mlp": 1.03358328, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.6149610801903476, + "language_loss": 0.75919485, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.7805115, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14575, + "time_per_iteration": 2.4268925189971924 + }, + { + "auxiliary_loss_clip": 0.01099511, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02400935, + "balance_loss_mlp": 1.03539479, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.7878452146905504, + "language_loss": 0.66882926, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69017559, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 14576, + "time_per_iteration": 2.5595736503601074 + }, + { + "auxiliary_loss_clip": 0.01096684, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.01937437, + "balance_loss_mlp": 1.03251886, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 1.7208202540038426, + "language_loss": 0.66684705, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68811697, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14577, + "time_per_iteration": 2.4102935791015625 + }, + { + "auxiliary_loss_clip": 0.01100979, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.01999068, + "balance_loss_mlp": 1.03496742, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.108641527018096, + "language_loss": 0.70767337, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.72900826, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 14578, + "time_per_iteration": 2.5292482376098633 + }, + { + "auxiliary_loss_clip": 0.01100899, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.01962769, + "balance_loss_mlp": 1.03398395, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 1.9999789208400311, + "language_loss": 0.71355838, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73488152, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 14579, + "time_per_iteration": 2.4042766094207764 + }, + { + "auxiliary_loss_clip": 0.01096428, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.01671004, + "balance_loss_mlp": 1.03457344, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.902275188035939, + "language_loss": 0.7026614, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72390223, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6171875, + "step": 14580, + "time_per_iteration": 2.4405770301818848 + }, + { + "auxiliary_loss_clip": 0.01098077, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.01745093, + "balance_loss_mlp": 1.03479218, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.5728356070217824, + "language_loss": 0.65423614, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67550373, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 14581, + "time_per_iteration": 2.469336986541748 + }, + { + "auxiliary_loss_clip": 0.01097037, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.01942396, + "balance_loss_mlp": 1.03387427, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.70098505015339, + "language_loss": 0.73786414, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75913477, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 14582, + "time_per_iteration": 2.543093204498291 + }, + { + "auxiliary_loss_clip": 0.01098192, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.03288174, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.590977059318644, + "language_loss": 0.67103446, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.692312, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 14583, + "time_per_iteration": 2.432237148284912 + }, + { + "auxiliary_loss_clip": 0.01100486, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01563215, + "balance_loss_mlp": 1.033885, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.8140039658285496, + "language_loss": 0.79140723, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81268525, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 14584, + "time_per_iteration": 2.4755969047546387 + }, + { + "auxiliary_loss_clip": 0.01100277, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.02005851, + "balance_loss_mlp": 1.03356349, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5843541811660464, + "language_loss": 0.72366554, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74498236, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 14585, + "time_per_iteration": 2.4504952430725098 + }, + { + "auxiliary_loss_clip": 0.01100354, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01835251, + "balance_loss_mlp": 1.03485799, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.8197564931552062, + "language_loss": 0.74027938, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76158589, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 14586, + "time_per_iteration": 2.476065158843994 + }, + { + "auxiliary_loss_clip": 0.01097699, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.01619434, + "balance_loss_mlp": 1.03294468, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.8004743742414036, + "language_loss": 0.78392655, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80518472, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14587, + "time_per_iteration": 2.4667489528656006 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.01515913, + "balance_loss_mlp": 1.03499091, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.5237767345911253, + "language_loss": 0.73971182, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76099527, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 14588, + "time_per_iteration": 2.492217779159546 + }, + { + "auxiliary_loss_clip": 0.01098609, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.0172143, + "balance_loss_mlp": 1.03383243, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.9182743142543304, + "language_loss": 0.66461021, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68587714, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 14589, + "time_per_iteration": 2.4805967807769775 + }, + { + "auxiliary_loss_clip": 0.010995, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.02103901, + "balance_loss_mlp": 1.03442669, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.249132588118827, + "language_loss": 0.70547277, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.7267909, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 14590, + "time_per_iteration": 2.457648992538452 + }, + { + "auxiliary_loss_clip": 0.01105657, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.02332294, + "balance_loss_mlp": 1.03528619, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 1.974082203683765, + "language_loss": 0.75044048, + "learning_rate": 1.558945991776086e-07, + "loss": 0.77185655, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 14591, + "time_per_iteration": 2.405331611633301 + }, + { + "auxiliary_loss_clip": 0.01094641, + "auxiliary_loss_mlp": 0.01024689, + "balance_loss_clip": 1.01393628, + "balance_loss_mlp": 1.03357577, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 2.031313863319318, + "language_loss": 0.79909766, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82029092, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.609375, + "step": 14592, + "time_per_iteration": 2.454929828643799 + }, + { + "auxiliary_loss_clip": 0.01095316, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.01629364, + "balance_loss_mlp": 1.03316784, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.614306440417284, + "language_loss": 0.82640499, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84762686, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 14593, + "time_per_iteration": 2.4428117275238037 + }, + { + "auxiliary_loss_clip": 0.01098816, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_clip": 1.01165867, + "balance_loss_mlp": 1.03518867, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.553725648674736, + "language_loss": 0.7587297, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.77994418, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14594, + "time_per_iteration": 2.4924726486206055 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01548719, + "balance_loss_mlp": 1.03252506, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.2507371264600082, + "language_loss": 0.77722549, + "learning_rate": 1.552921717241651e-07, + "loss": 0.79848695, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14595, + "time_per_iteration": 3.8839778900146484 + }, + { + "auxiliary_loss_clip": 0.01100028, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.02081716, + "balance_loss_mlp": 1.03495049, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.6010458814684418, + "language_loss": 0.70719904, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72852451, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14596, + "time_per_iteration": 2.485382080078125 + }, + { + "auxiliary_loss_clip": 0.01099029, + "auxiliary_loss_mlp": 0.01025253, + "balance_loss_clip": 1.01380861, + "balance_loss_mlp": 1.03495514, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.7430220706670174, + "language_loss": 0.86074364, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88198644, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14597, + "time_per_iteration": 3.946387529373169 + }, + { + "auxiliary_loss_clip": 0.01098851, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.0176214, + "balance_loss_mlp": 1.03434682, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 3.272225714206706, + "language_loss": 0.72833431, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.74961019, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 14598, + "time_per_iteration": 2.549870491027832 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.01646245, + "balance_loss_mlp": 1.03658414, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.3090049722541095, + "language_loss": 0.77496958, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79627085, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14599, + "time_per_iteration": 2.409959077835083 + }, + { + "auxiliary_loss_clip": 0.01099573, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.01803648, + "balance_loss_mlp": 1.03392327, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.428745416701903, + "language_loss": 0.67349386, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69478238, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14600, + "time_per_iteration": 2.474609613418579 + }, + { + "auxiliary_loss_clip": 0.01100255, + "auxiliary_loss_mlp": 0.01035078, + "balance_loss_clip": 1.02363992, + "balance_loss_mlp": 1.03416967, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 2.092434255676991, + "language_loss": 0.69479287, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71614623, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14601, + "time_per_iteration": 4.140269994735718 + }, + { + "auxiliary_loss_clip": 0.01103745, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.01555753, + "balance_loss_mlp": 1.03566706, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 1.9032016370859126, + "language_loss": 0.73216182, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75347078, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 14602, + "time_per_iteration": 2.48760986328125 + }, + { + "auxiliary_loss_clip": 0.01098268, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.02021384, + "balance_loss_mlp": 1.03420591, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 1.8928349541350598, + "language_loss": 0.71194154, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73323286, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 14603, + "time_per_iteration": 2.4426493644714355 + }, + { + "auxiliary_loss_clip": 0.01022978, + "auxiliary_loss_mlp": 0.01002674, + "balance_loss_clip": 1.00168419, + "balance_loss_mlp": 1.00285864, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7364725724275261, + "language_loss": 0.54201496, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56227148, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 14604, + "time_per_iteration": 3.0395615100860596 + }, + { + "auxiliary_loss_clip": 0.01022902, + "auxiliary_loss_mlp": 0.01002151, + "balance_loss_clip": 1.0011977, + "balance_loss_mlp": 1.00290179, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7228284416194825, + "language_loss": 0.59237391, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61262447, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 14605, + "time_per_iteration": 3.035781145095825 + }, + { + "auxiliary_loss_clip": 0.01101512, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02125812, + "balance_loss_mlp": 1.03448224, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.5739361881333696, + "language_loss": 0.85203683, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87338436, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 14606, + "time_per_iteration": 2.4679386615753174 + }, + { + "auxiliary_loss_clip": 0.01102154, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.01512599, + "balance_loss_mlp": 1.03583789, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 2.14242469072768, + "language_loss": 0.70639741, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72769076, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 14607, + "time_per_iteration": 2.429849147796631 + }, + { + "auxiliary_loss_clip": 0.0109832, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.02337539, + "balance_loss_mlp": 1.03447664, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.7322326558038397, + "language_loss": 0.71684766, + "learning_rate": 1.533420140300785e-07, + "loss": 0.7381717, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 14608, + "time_per_iteration": 2.4912965297698975 + }, + { + "auxiliary_loss_clip": 0.01103002, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.03509843, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 2.337485246966266, + "language_loss": 0.87112725, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89250016, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 14609, + "time_per_iteration": 2.458808660507202 + }, + { + "auxiliary_loss_clip": 0.01099988, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.01741576, + "balance_loss_mlp": 1.03460443, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.603979145796894, + "language_loss": 0.7021966, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72348499, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14610, + "time_per_iteration": 2.4685328006744385 + }, + { + "auxiliary_loss_clip": 0.01099125, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.01932585, + "balance_loss_mlp": 1.03569698, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 4.627089606840685, + "language_loss": 0.80114305, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82243866, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14611, + "time_per_iteration": 2.442768096923828 + }, + { + "auxiliary_loss_clip": 0.01100873, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.02121592, + "balance_loss_mlp": 1.03429496, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.6363638945337065, + "language_loss": 0.76340765, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78474426, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 14612, + "time_per_iteration": 2.4471793174743652 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.02276969, + "balance_loss_mlp": 1.03435552, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.4298080902663715, + "language_loss": 0.72504056, + "learning_rate": 1.525951038422002e-07, + "loss": 0.746369, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 14613, + "time_per_iteration": 2.4921391010284424 + }, + { + "auxiliary_loss_clip": 0.01022277, + "auxiliary_loss_mlp": 0.01002009, + "balance_loss_clip": 1.00101399, + "balance_loss_mlp": 1.0023061, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.04250108997431, + "language_loss": 0.64641011, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66665304, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 14614, + "time_per_iteration": 2.8672502040863037 + }, + { + "auxiliary_loss_clip": 0.01022982, + "auxiliary_loss_mlp": 0.01001073, + "balance_loss_clip": 1.0000422, + "balance_loss_mlp": 1.00285435, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6615602022386656, + "language_loss": 0.58617866, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60641921, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 14615, + "time_per_iteration": 3.120760917663574 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.03304863, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 2.277379368567329, + "language_loss": 0.7279399, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74924493, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 14616, + "time_per_iteration": 2.4196126461029053 + }, + { + "auxiliary_loss_clip": 0.01022719, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.00076103, + "balance_loss_mlp": 1.0026381, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8027403534431957, + "language_loss": 0.57973462, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.59997988, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20117188, + "step": 14617, + "time_per_iteration": 3.1586780548095703 + }, + { + "auxiliary_loss_clip": 0.01096253, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.03347445, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.9129594461835326, + "language_loss": 0.83026248, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.8515228, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62890625, + "step": 14618, + "time_per_iteration": 2.5385875701904297 + }, + { + "auxiliary_loss_clip": 0.01094322, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.01411796, + "balance_loss_mlp": 1.03331971, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.5097664240829207, + "language_loss": 0.69104743, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71223986, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.609375, + "step": 14619, + "time_per_iteration": 2.459087610244751 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.02081394, + "balance_loss_mlp": 1.03381336, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 2.822522810502768, + "language_loss": 0.77135247, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.7926864, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 14620, + "time_per_iteration": 2.4514245986938477 + }, + { + "auxiliary_loss_clip": 0.01101357, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01795721, + "balance_loss_mlp": 1.03538573, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.663844262033778, + "language_loss": 0.79417694, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81549543, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 14621, + "time_per_iteration": 2.4423892498016357 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.02238619, + "balance_loss_mlp": 1.03448009, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 3.159156225679449, + "language_loss": 0.66855097, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.68991637, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 14622, + "time_per_iteration": 2.476047992706299 + }, + { + "auxiliary_loss_clip": 0.01099562, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.02223372, + "balance_loss_mlp": 1.03490078, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.928437907767961, + "language_loss": 0.7306127, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75194383, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14623, + "time_per_iteration": 2.4311418533325195 + }, + { + "auxiliary_loss_clip": 0.01094016, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.01631272, + "balance_loss_mlp": 1.03009653, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 2.38983757002019, + "language_loss": 0.7877636, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80898196, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14624, + "time_per_iteration": 2.454042911529541 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.02183962, + "balance_loss_mlp": 1.03443372, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.8092429403664327, + "language_loss": 0.79949045, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.82084924, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 14625, + "time_per_iteration": 2.468273162841797 + }, + { + "auxiliary_loss_clip": 0.0109769, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.02404141, + "balance_loss_mlp": 1.03421533, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.5248039405133302, + "language_loss": 0.74116158, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76249278, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 14626, + "time_per_iteration": 2.5007894039154053 + }, + { + "auxiliary_loss_clip": 0.01100657, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.01841187, + "balance_loss_mlp": 1.03324091, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.5472909001044985, + "language_loss": 0.7117843, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73309094, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 14627, + "time_per_iteration": 2.572488784790039 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.01455545, + "balance_loss_mlp": 1.0333581, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.7161031145560457, + "language_loss": 0.72222739, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74348092, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 14628, + "time_per_iteration": 2.5836756229400635 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.02062297, + "balance_loss_mlp": 1.0343008, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.974240277215887, + "language_loss": 0.69140917, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71273565, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 14629, + "time_per_iteration": 2.5673904418945312 + }, + { + "auxiliary_loss_clip": 0.01095341, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.01881027, + "balance_loss_mlp": 1.03196287, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.611077120019427, + "language_loss": 0.68476737, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70601434, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14630, + "time_per_iteration": 2.6186506748199463 + }, + { + "auxiliary_loss_clip": 0.0109541, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02080238, + "balance_loss_mlp": 1.03272772, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.5868483817753165, + "language_loss": 0.74161929, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76289958, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.62890625, + "step": 14631, + "time_per_iteration": 2.5261404514312744 + }, + { + "auxiliary_loss_clip": 0.01096064, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.01977563, + "balance_loss_mlp": 1.03400874, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 3.2568719611534367, + "language_loss": 0.69245052, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71372306, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62109375, + "step": 14632, + "time_per_iteration": 2.5176477432250977 + }, + { + "auxiliary_loss_clip": 0.01100067, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.01921034, + "balance_loss_mlp": 1.03524411, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.850853820165369, + "language_loss": 0.64914048, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67043972, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 14633, + "time_per_iteration": 2.4824862480163574 + }, + { + "auxiliary_loss_clip": 0.01099036, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.01750481, + "balance_loss_mlp": 1.03487611, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.9427253459793778, + "language_loss": 0.84233886, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.863617, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14634, + "time_per_iteration": 2.507662296295166 + }, + { + "auxiliary_loss_clip": 0.01098176, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02172065, + "balance_loss_mlp": 1.03326917, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.624999412109894, + "language_loss": 0.79993856, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.82125807, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 14635, + "time_per_iteration": 2.518354654312134 + }, + { + "auxiliary_loss_clip": 0.01099052, + "auxiliary_loss_mlp": 0.01024128, + "balance_loss_clip": 1.01243329, + "balance_loss_mlp": 1.03359151, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 1.8321760419089794, + "language_loss": 0.65398335, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67521518, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14636, + "time_per_iteration": 2.479426860809326 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.02273285, + "balance_loss_mlp": 1.03467786, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.6657689764280696, + "language_loss": 0.7029084, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72426283, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14637, + "time_per_iteration": 3.868614912033081 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.01857972, + "balance_loss_mlp": 1.03736019, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 2.8684329646632407, + "language_loss": 0.66271627, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68403208, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14638, + "time_per_iteration": 5.274388551712036 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01731563, + "balance_loss_mlp": 1.03512883, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.846776062468507, + "language_loss": 0.58106345, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60236669, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 14639, + "time_per_iteration": 2.5819764137268066 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01037199, + "balance_loss_clip": 1.02529562, + "balance_loss_mlp": 1.03383088, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.3914179808577423, + "language_loss": 0.7458142, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76718146, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 14640, + "time_per_iteration": 2.521860361099243 + }, + { + "auxiliary_loss_clip": 0.01099653, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.0246067, + "balance_loss_mlp": 1.03402758, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.9721181093875695, + "language_loss": 0.6971339, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71849298, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14641, + "time_per_iteration": 2.465099811553955 + }, + { + "auxiliary_loss_clip": 0.01101581, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.01319361, + "balance_loss_mlp": 1.03489089, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.100901635166447, + "language_loss": 0.84755206, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.86882389, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 14642, + "time_per_iteration": 2.4675049781799316 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.02078056, + "balance_loss_mlp": 1.0357337, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.6823727601276586, + "language_loss": 0.78799748, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.80934626, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6484375, + "step": 14643, + "time_per_iteration": 3.872709274291992 + }, + { + "auxiliary_loss_clip": 0.01094296, + "auxiliary_loss_mlp": 0.01023704, + "balance_loss_clip": 1.01277804, + "balance_loss_mlp": 1.03229833, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.6378114000618107, + "language_loss": 0.73273623, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75391626, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62109375, + "step": 14644, + "time_per_iteration": 2.4105916023254395 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.0351516, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 1.7663467808717348, + "language_loss": 0.79154408, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81290519, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 14645, + "time_per_iteration": 2.4135661125183105 + }, + { + "auxiliary_loss_clip": 0.01097489, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.02140081, + "balance_loss_mlp": 1.03396201, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 1.8153463800706586, + "language_loss": 0.64348304, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66478598, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 14646, + "time_per_iteration": 2.510627031326294 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.02099562, + "balance_loss_mlp": 1.0349977, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.0987215811216617, + "language_loss": 0.77177233, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79314315, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 14647, + "time_per_iteration": 2.455548048019409 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.01538038, + "balance_loss_mlp": 1.03470957, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 2.016380726471692, + "language_loss": 0.75440037, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77563667, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 14648, + "time_per_iteration": 2.4378833770751953 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01756096, + "balance_loss_mlp": 1.03212929, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.800540393972122, + "language_loss": 0.65671074, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67798209, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 14649, + "time_per_iteration": 2.475167751312256 + }, + { + "auxiliary_loss_clip": 0.01099588, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.01583624, + "balance_loss_mlp": 1.0346787, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.3656118793180194, + "language_loss": 0.62488627, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64615977, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14650, + "time_per_iteration": 2.5512006282806396 + }, + { + "auxiliary_loss_clip": 0.01097299, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.0164454, + "balance_loss_mlp": 1.03367639, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.4406035911572534, + "language_loss": 0.72946811, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.75071305, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 14651, + "time_per_iteration": 2.4996044635772705 + }, + { + "auxiliary_loss_clip": 0.01100922, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.01687539, + "balance_loss_mlp": 1.03476334, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.8309456518372134, + "language_loss": 0.72026336, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74156499, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66015625, + "step": 14652, + "time_per_iteration": 2.432687282562256 + }, + { + "auxiliary_loss_clip": 0.01097085, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.01948547, + "balance_loss_mlp": 1.0323956, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 1.9095636573568913, + "language_loss": 0.74354553, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76482618, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14653, + "time_per_iteration": 2.4471185207366943 + }, + { + "auxiliary_loss_clip": 0.01101564, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.0141021, + "balance_loss_mlp": 1.03399158, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.6655759568557502, + "language_loss": 0.71326327, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73453987, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 14654, + "time_per_iteration": 2.463416576385498 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.03539014, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 2.5564410851677284, + "language_loss": 0.71378338, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73513222, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6484375, + "step": 14655, + "time_per_iteration": 2.506082057952881 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.01999068, + "balance_loss_mlp": 1.03464365, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.5670673465427962, + "language_loss": 0.8118304, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83313543, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 14656, + "time_per_iteration": 2.4655163288116455 + }, + { + "auxiliary_loss_clip": 0.01100032, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.01789308, + "balance_loss_mlp": 1.03457642, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.7915054881037722, + "language_loss": 0.68660492, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70789516, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14657, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01101157, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.0200932, + "balance_loss_mlp": 1.03532481, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 1.9296351990440808, + "language_loss": 0.83915722, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86048234, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 14658, + "time_per_iteration": 2.4841041564941406 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02234912, + "balance_loss_mlp": 1.03650165, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 2.487290321183497, + "language_loss": 0.77357286, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79498112, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 14659, + "time_per_iteration": 2.461486577987671 + }, + { + "auxiliary_loss_clip": 0.01099162, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.02099383, + "balance_loss_mlp": 1.03384209, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 1.8826679554051244, + "language_loss": 0.60173553, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62305564, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 14660, + "time_per_iteration": 2.464398145675659 + }, + { + "auxiliary_loss_clip": 0.01100447, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.02622342, + "balance_loss_mlp": 1.0339849, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.7496119170804572, + "language_loss": 0.78005695, + "learning_rate": 1.455139770123972e-07, + "loss": 0.80145085, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 14661, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.01102652, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02973175, + "balance_loss_mlp": 1.03629279, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.6809136256022188, + "language_loss": 0.76650071, + "learning_rate": 1.45368174298081e-07, + "loss": 0.78794813, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 14662, + "time_per_iteration": 2.4708175659179688 + }, + { + "auxiliary_loss_clip": 0.01097442, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.01413739, + "balance_loss_mlp": 1.03349352, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.798644895415272, + "language_loss": 0.74030846, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.7615267, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.640625, + "step": 14663, + "time_per_iteration": 2.4278945922851562 + }, + { + "auxiliary_loss_clip": 0.01099102, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.02133894, + "balance_loss_mlp": 1.03406262, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.5255495118497213, + "language_loss": 0.69844538, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71976459, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14664, + "time_per_iteration": 2.5401558876037598 + }, + { + "auxiliary_loss_clip": 0.01095808, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.02217281, + "balance_loss_mlp": 1.0326488, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.6618340799820441, + "language_loss": 0.81018615, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83146906, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6328125, + "step": 14665, + "time_per_iteration": 2.433636426925659 + }, + { + "auxiliary_loss_clip": 0.01101369, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.01962006, + "balance_loss_mlp": 1.03590393, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 1.9840035014600133, + "language_loss": 0.58445227, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60576975, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 14666, + "time_per_iteration": 2.4269118309020996 + }, + { + "auxiliary_loss_clip": 0.01102004, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.0194447, + "balance_loss_mlp": 1.03639972, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 1.9152368357070615, + "language_loss": 0.8380903, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85942888, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 14667, + "time_per_iteration": 2.4528279304504395 + }, + { + "auxiliary_loss_clip": 0.01098974, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.01812816, + "balance_loss_mlp": 1.03405619, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.7252025562955478, + "language_loss": 0.62386823, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64516038, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 14668, + "time_per_iteration": 2.4459190368652344 + }, + { + "auxiliary_loss_clip": 0.01097923, + "auxiliary_loss_mlp": 0.01025692, + "balance_loss_clip": 1.0153687, + "balance_loss_mlp": 1.03508496, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.1655760698377238, + "language_loss": 0.56931686, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59055305, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62890625, + "step": 14669, + "time_per_iteration": 2.421549081802368 + }, + { + "auxiliary_loss_clip": 0.0109805, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.02056646, + "balance_loss_mlp": 1.03313446, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 2.8416626474645454, + "language_loss": 0.70905107, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73034966, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 14670, + "time_per_iteration": 2.4560811519622803 + }, + { + "auxiliary_loss_clip": 0.0109736, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.01844513, + "balance_loss_mlp": 1.03294659, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.2004131158768034, + "language_loss": 0.73885584, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76012611, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14671, + "time_per_iteration": 2.4802374839782715 + }, + { + "auxiliary_loss_clip": 0.01101545, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.01405191, + "balance_loss_mlp": 1.03317046, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 2.5737245171058007, + "language_loss": 0.847103, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86838329, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 14672, + "time_per_iteration": 2.4168829917907715 + }, + { + "auxiliary_loss_clip": 0.01097209, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.01794291, + "balance_loss_mlp": 1.0340246, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5306407957172687, + "language_loss": 0.72456914, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74583215, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14673, + "time_per_iteration": 2.460655689239502 + }, + { + "auxiliary_loss_clip": 0.01021936, + "auxiliary_loss_mlp": 0.01000476, + "balance_loss_clip": 0.99948043, + "balance_loss_mlp": 1.00207949, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8084320527661446, + "language_loss": 0.49390993, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51413405, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 14674, + "time_per_iteration": 3.162792682647705 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.01710272, + "balance_loss_mlp": 1.0344094, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 1.9780221467506172, + "language_loss": 0.76291561, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78421265, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 14675, + "time_per_iteration": 2.4549062252044678 + }, + { + "auxiliary_loss_clip": 0.010958, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.0201273, + "balance_loss_mlp": 1.03307641, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 1.806810226504955, + "language_loss": 0.79589498, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.8171677, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.62890625, + "step": 14676, + "time_per_iteration": 2.4810338020324707 + }, + { + "auxiliary_loss_clip": 0.01022536, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00122857, + "balance_loss_mlp": 1.00247622, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6887169643192462, + "language_loss": 0.54792887, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56817579, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20117188, + "step": 14677, + "time_per_iteration": 3.141437530517578 + }, + { + "auxiliary_loss_clip": 0.01095907, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.01610887, + "balance_loss_mlp": 1.0322262, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.313090025905276, + "language_loss": 0.65397072, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67520267, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 14678, + "time_per_iteration": 3.9388959407806396 + }, + { + "auxiliary_loss_clip": 0.01102187, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.02029228, + "balance_loss_mlp": 1.03451753, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 1.7618183642532588, + "language_loss": 0.71121728, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73255599, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 14679, + "time_per_iteration": 2.5044422149658203 + }, + { + "auxiliary_loss_clip": 0.01097187, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.01925075, + "balance_loss_mlp": 1.03367972, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.7004762448338653, + "language_loss": 0.6368348, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.65810347, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 14680, + "time_per_iteration": 3.960117816925049 + }, + { + "auxiliary_loss_clip": 0.01099928, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.02081347, + "balance_loss_mlp": 1.03534234, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.3286079869069423, + "language_loss": 0.77274716, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79406941, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14681, + "time_per_iteration": 2.396320343017578 + }, + { + "auxiliary_loss_clip": 0.01101169, + "auxiliary_loss_mlp": 0.01028851, + "balance_loss_clip": 1.01660836, + "balance_loss_mlp": 1.03466046, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.7616668495516699, + "language_loss": 0.72610635, + "learning_rate": 1.424668961888047e-07, + "loss": 0.7474066, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 14682, + "time_per_iteration": 2.455319404602051 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.0181849, + "balance_loss_mlp": 1.0359025, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.8948306470758551, + "language_loss": 0.74149251, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76284921, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.68359375, + "step": 14683, + "time_per_iteration": 2.4281208515167236 + }, + { + "auxiliary_loss_clip": 0.01100505, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.01920223, + "balance_loss_mlp": 1.03486133, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.7117849118733992, + "language_loss": 0.65447652, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67579257, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 14684, + "time_per_iteration": 3.894663095474243 + }, + { + "auxiliary_loss_clip": 0.0109682, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.01400256, + "balance_loss_mlp": 1.03307378, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.6699013852482991, + "language_loss": 0.69357675, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71479678, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 14685, + "time_per_iteration": 2.3994603157043457 + }, + { + "auxiliary_loss_clip": 0.01102745, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01808035, + "balance_loss_mlp": 1.03550434, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.7745914045293507, + "language_loss": 0.74189049, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76322436, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14686, + "time_per_iteration": 2.4151484966278076 + }, + { + "auxiliary_loss_clip": 0.01096349, + "auxiliary_loss_mlp": 0.01024315, + "balance_loss_clip": 1.01291823, + "balance_loss_mlp": 1.03275704, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 2.0272125765642732, + "language_loss": 0.63428628, + "learning_rate": 1.417459773114007e-07, + "loss": 0.6554929, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 14687, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01100854, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.0205617, + "balance_loss_mlp": 1.03395879, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 1.6968934046619368, + "language_loss": 0.68904001, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71036971, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 14688, + "time_per_iteration": 2.500330686569214 + }, + { + "auxiliary_loss_clip": 0.01096963, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.01433623, + "balance_loss_mlp": 1.033746, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.493679343363792, + "language_loss": 0.67016995, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69139874, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6328125, + "step": 14689, + "time_per_iteration": 2.4815356731414795 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.01803041, + "balance_loss_mlp": 1.03819656, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.5915741944618107, + "language_loss": 0.74574995, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76705527, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14690, + "time_per_iteration": 2.514997959136963 + }, + { + "auxiliary_loss_clip": 0.01099856, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.03427589, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.438089789703588, + "language_loss": 0.72641426, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74774671, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 14691, + "time_per_iteration": 2.4770781993865967 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.01801181, + "balance_loss_mlp": 1.03464651, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 1.838759205957813, + "language_loss": 0.51184076, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.53318036, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 14692, + "time_per_iteration": 2.426839828491211 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01842189, + "balance_loss_mlp": 1.03562319, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.5343795474068576, + "language_loss": 0.60240692, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62371796, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14693, + "time_per_iteration": 2.427978992462158 + }, + { + "auxiliary_loss_clip": 0.01097522, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.01653326, + "balance_loss_mlp": 1.03563237, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.4622609213581108, + "language_loss": 0.75340641, + "learning_rate": 1.407396505730898e-07, + "loss": 0.7746495, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 14694, + "time_per_iteration": 2.4537601470947266 + }, + { + "auxiliary_loss_clip": 0.01099823, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.01875305, + "balance_loss_mlp": 1.03203654, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.6951093668851203, + "language_loss": 0.72519171, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74648589, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 14695, + "time_per_iteration": 2.54256010055542 + }, + { + "auxiliary_loss_clip": 0.01094268, + "auxiliary_loss_mlp": 0.01026407, + "balance_loss_clip": 1.01576185, + "balance_loss_mlp": 1.03320861, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.6044220057517486, + "language_loss": 0.80077511, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82198191, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 14696, + "time_per_iteration": 2.476656913757324 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.01835179, + "balance_loss_mlp": 1.034863, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.4070028272927375, + "language_loss": 0.74347401, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76477087, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 14697, + "time_per_iteration": 2.5067691802978516 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.02203012, + "balance_loss_mlp": 1.03380871, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 1.9574693884985603, + "language_loss": 0.72150856, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74282926, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14698, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01022142, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.00058353, + "balance_loss_mlp": 1.00201714, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8043510502151429, + "language_loss": 0.5371002, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55733687, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20117188, + "step": 14699, + "time_per_iteration": 3.0387063026428223 + }, + { + "auxiliary_loss_clip": 0.01102957, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.01674378, + "balance_loss_mlp": 1.03511429, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.8010601059746882, + "language_loss": 0.76841766, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78973258, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 14700, + "time_per_iteration": 2.4849624633789062 + }, + { + "auxiliary_loss_clip": 0.01098124, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.0178647, + "balance_loss_mlp": 1.03467011, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.9788606423374575, + "language_loss": 0.72744364, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.74871373, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14701, + "time_per_iteration": 2.439821243286133 + }, + { + "auxiliary_loss_clip": 0.01101947, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.01981974, + "balance_loss_mlp": 1.03463852, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 14.71295630109059, + "language_loss": 0.70860976, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.72994447, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 14702, + "time_per_iteration": 2.483827590942383 + }, + { + "auxiliary_loss_clip": 0.01102205, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.01999676, + "balance_loss_mlp": 1.03664851, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 2.6898618160604, + "language_loss": 0.71423376, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73557591, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14703, + "time_per_iteration": 2.644871950149536 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01025831, + "balance_loss_clip": 1.01494169, + "balance_loss_mlp": 1.03430629, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.7004164598471103, + "language_loss": 0.6647324, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68596381, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14704, + "time_per_iteration": 2.4443137645721436 + }, + { + "auxiliary_loss_clip": 0.01094574, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.01381683, + "balance_loss_mlp": 1.03165603, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.6723604475362273, + "language_loss": 0.70644706, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72763973, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 14705, + "time_per_iteration": 2.469675302505493 + }, + { + "auxiliary_loss_clip": 0.01098911, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.02092755, + "balance_loss_mlp": 1.0351615, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.5306826086983725, + "language_loss": 0.70983511, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73113579, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 14706, + "time_per_iteration": 2.54547381401062 + }, + { + "auxiliary_loss_clip": 0.01098056, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.01607299, + "balance_loss_mlp": 1.0337317, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.6132809989349575, + "language_loss": 0.7450251, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76627964, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 14707, + "time_per_iteration": 2.448845624923706 + }, + { + "auxiliary_loss_clip": 0.01021776, + "auxiliary_loss_mlp": 0.01004857, + "balance_loss_clip": 1.00377238, + "balance_loss_mlp": 1.00174427, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.8117028150723945, + "language_loss": 0.60430789, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62457418, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.20019531, + "step": 14708, + "time_per_iteration": 2.9150478839874268 + }, + { + "auxiliary_loss_clip": 0.01093498, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01817834, + "balance_loss_mlp": 1.03259778, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 1.827497004044899, + "language_loss": 0.67355728, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69477868, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.609375, + "step": 14709, + "time_per_iteration": 2.643474817276001 + }, + { + "auxiliary_loss_clip": 0.01104027, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.03464079, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.5868272680422912, + "language_loss": 0.62517226, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64656574, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 14710, + "time_per_iteration": 2.694308042526245 + }, + { + "auxiliary_loss_clip": 0.01097265, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01702642, + "balance_loss_mlp": 1.03435802, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.254524973273371, + "language_loss": 0.63405102, + "learning_rate": 1.38310100580431e-07, + "loss": 0.65530241, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 14711, + "time_per_iteration": 2.454507350921631 + }, + { + "auxiliary_loss_clip": 0.01102557, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.02133811, + "balance_loss_mlp": 1.03427267, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.2842740849754115, + "language_loss": 0.75539434, + "learning_rate": 1.38167820974606e-07, + "loss": 0.77675307, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 14712, + "time_per_iteration": 2.4946510791778564 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.01024835, + "balance_loss_clip": 1.01309907, + "balance_loss_mlp": 1.03245926, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 3.708605595590302, + "language_loss": 0.81021023, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83144236, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 14713, + "time_per_iteration": 2.479050397872925 + }, + { + "auxiliary_loss_clip": 0.01097877, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.01575828, + "balance_loss_mlp": 1.03250861, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.4231578752957819, + "language_loss": 0.55540788, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57666099, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14714, + "time_per_iteration": 2.521453857421875 + }, + { + "auxiliary_loss_clip": 0.01097743, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.0212388, + "balance_loss_mlp": 1.03320169, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.743794814906259, + "language_loss": 0.73726749, + "learning_rate": 1.377414057838755e-07, + "loss": 0.75857568, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 14715, + "time_per_iteration": 2.519960641860962 + }, + { + "auxiliary_loss_clip": 0.0109907, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.03362608, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 2.799687006211767, + "language_loss": 0.75298744, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77427602, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14716, + "time_per_iteration": 2.5302252769470215 + }, + { + "auxiliary_loss_clip": 0.01099052, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02258122, + "balance_loss_mlp": 1.03498149, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 1.9463895585575124, + "language_loss": 0.71236145, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73369265, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14717, + "time_per_iteration": 2.4483509063720703 + }, + { + "auxiliary_loss_clip": 0.01095292, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.01952374, + "balance_loss_mlp": 1.03427327, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 2.2448423833048667, + "language_loss": 0.74712592, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76838231, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.609375, + "step": 14718, + "time_per_iteration": 2.528916597366333 + }, + { + "auxiliary_loss_clip": 0.0109995, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.01569605, + "balance_loss_mlp": 1.03310704, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.5894409010966428, + "language_loss": 0.7822836, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80356085, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 14719, + "time_per_iteration": 2.4806060791015625 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01418066, + "balance_loss_mlp": 1.03466296, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.5765510176535809, + "language_loss": 0.71778101, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.73904806, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14720, + "time_per_iteration": 3.864971160888672 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.01578975, + "balance_loss_mlp": 1.03353, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 2.5959463277738974, + "language_loss": 0.82530278, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84659731, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 14721, + "time_per_iteration": 2.4602034091949463 + }, + { + "auxiliary_loss_clip": 0.01098248, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.01759672, + "balance_loss_mlp": 1.03253555, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.6462057536303287, + "language_loss": 0.6220575, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64333898, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 14722, + "time_per_iteration": 4.181842565536499 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01504993, + "balance_loss_mlp": 1.03351831, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.8405460130697608, + "language_loss": 0.68605506, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70732802, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 14723, + "time_per_iteration": 2.5939276218414307 + }, + { + "auxiliary_loss_clip": 0.01098926, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.01789427, + "balance_loss_mlp": 1.03490961, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.5982216956650597, + "language_loss": 0.77820933, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.79949278, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14724, + "time_per_iteration": 2.5080296993255615 + }, + { + "auxiliary_loss_clip": 0.0102205, + "auxiliary_loss_mlp": 0.00996579, + "balance_loss_clip": 0.995673, + "balance_loss_mlp": 1.00202656, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.7976703792304201, + "language_loss": 0.58909416, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60928047, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20019531, + "step": 14725, + "time_per_iteration": 2.942244529724121 + }, + { + "auxiliary_loss_clip": 0.01103081, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.0250175, + "balance_loss_mlp": 1.03376329, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 3.7159601069719743, + "language_loss": 0.6908325, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.71223497, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 14726, + "time_per_iteration": 3.910275459289551 + }, + { + "auxiliary_loss_clip": 0.01098863, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.0189023, + "balance_loss_mlp": 1.03549707, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.4685006147064286, + "language_loss": 0.69542432, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71672177, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6328125, + "step": 14727, + "time_per_iteration": 2.6747992038726807 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.01936054, + "balance_loss_mlp": 1.03817511, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.802344998111036, + "language_loss": 0.70243108, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72377884, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65234375, + "step": 14728, + "time_per_iteration": 2.4881274700164795 + }, + { + "auxiliary_loss_clip": 0.0109924, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.01676655, + "balance_loss_mlp": 1.0337584, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.1419875680940637, + "language_loss": 0.66392922, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68519825, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 14729, + "time_per_iteration": 2.422187566757202 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.01968694, + "balance_loss_mlp": 1.03564954, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.7117430310189854, + "language_loss": 0.62781358, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.64911354, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 14730, + "time_per_iteration": 2.5803756713867188 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.02272558, + "balance_loss_mlp": 1.03250694, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.4792332459345614, + "language_loss": 0.79300416, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81430167, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14731, + "time_per_iteration": 2.4924111366271973 + }, + { + "auxiliary_loss_clip": 0.01099374, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.0186559, + "balance_loss_mlp": 1.03311884, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.6116048761510777, + "language_loss": 0.83205569, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85335195, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 14732, + "time_per_iteration": 2.4561238288879395 + }, + { + "auxiliary_loss_clip": 0.01022084, + "auxiliary_loss_mlp": 0.01000626, + "balance_loss_clip": 0.99961245, + "balance_loss_mlp": 1.00209713, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.913679809419791, + "language_loss": 0.59908044, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61930752, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20019531, + "step": 14733, + "time_per_iteration": 3.068289041519165 + }, + { + "auxiliary_loss_clip": 0.01100673, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.02050126, + "balance_loss_mlp": 1.03517413, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.8626138238723922, + "language_loss": 0.66439319, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68572199, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14734, + "time_per_iteration": 2.4276156425476074 + }, + { + "auxiliary_loss_clip": 0.01099506, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.02576971, + "balance_loss_mlp": 1.03578985, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 2.0317348064354213, + "language_loss": 0.75379711, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77515882, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 14735, + "time_per_iteration": 2.4584195613861084 + }, + { + "auxiliary_loss_clip": 0.01100195, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.01778936, + "balance_loss_mlp": 1.03425932, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.6977065723830995, + "language_loss": 0.7023108, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72360444, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 14736, + "time_per_iteration": 2.460245132446289 + }, + { + "auxiliary_loss_clip": 0.01102419, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01685667, + "balance_loss_mlp": 1.03579187, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.8186891549833935, + "language_loss": 0.84355164, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86486316, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 14737, + "time_per_iteration": 2.451251745223999 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.01867151, + "balance_loss_mlp": 1.03683579, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 3.2801540845038777, + "language_loss": 0.68354762, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70493269, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 14738, + "time_per_iteration": 2.5500543117523193 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01440811, + "balance_loss_mlp": 1.03358066, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.7831357907048164, + "language_loss": 0.75100833, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77230668, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 14739, + "time_per_iteration": 2.4511783123016357 + }, + { + "auxiliary_loss_clip": 0.01096933, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.01746702, + "balance_loss_mlp": 1.03344214, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.7409264572632928, + "language_loss": 0.86878449, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.8900364, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 14740, + "time_per_iteration": 2.4006471633911133 + }, + { + "auxiliary_loss_clip": 0.0110013, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.02006936, + "balance_loss_mlp": 1.03563619, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.7595112393939192, + "language_loss": 0.63362885, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65494668, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 14741, + "time_per_iteration": 2.4974660873413086 + }, + { + "auxiliary_loss_clip": 0.01099837, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02044952, + "balance_loss_mlp": 1.03457093, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 2.724117158165582, + "language_loss": 0.72620136, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.74752122, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14742, + "time_per_iteration": 2.4552924633026123 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.02099192, + "balance_loss_mlp": 1.03406441, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.890478101266105, + "language_loss": 0.59076136, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61206806, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 14743, + "time_per_iteration": 2.4963841438293457 + }, + { + "auxiliary_loss_clip": 0.01102411, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.01763701, + "balance_loss_mlp": 1.03440762, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.8739935931766052, + "language_loss": 0.60211849, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62345171, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 14744, + "time_per_iteration": 2.488271474838257 + }, + { + "auxiliary_loss_clip": 0.01099725, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.02141476, + "balance_loss_mlp": 1.0342828, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.5337573847338424, + "language_loss": 0.76551473, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78684986, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 14745, + "time_per_iteration": 2.4542477130889893 + }, + { + "auxiliary_loss_clip": 0.01098813, + "auxiliary_loss_mlp": 0.01034127, + "balance_loss_clip": 1.02296925, + "balance_loss_mlp": 1.0353713, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.7959141313080134, + "language_loss": 0.77085936, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79218876, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 14746, + "time_per_iteration": 2.4547431468963623 + }, + { + "auxiliary_loss_clip": 0.01104158, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.01819599, + "balance_loss_mlp": 1.03627443, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 1.8871990542262298, + "language_loss": 0.76628375, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78762996, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 14747, + "time_per_iteration": 2.4549856185913086 + }, + { + "auxiliary_loss_clip": 0.01094661, + "auxiliary_loss_mlp": 0.01026241, + "balance_loss_clip": 1.01498127, + "balance_loss_mlp": 1.03204846, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.7887126918220513, + "language_loss": 0.82725775, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84846675, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.625, + "step": 14748, + "time_per_iteration": 2.4627127647399902 + }, + { + "auxiliary_loss_clip": 0.01100636, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.01723945, + "balance_loss_mlp": 1.03471351, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 2.197486200203094, + "language_loss": 0.77274418, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.7940439, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 14749, + "time_per_iteration": 2.6969592571258545 + }, + { + "auxiliary_loss_clip": 0.01100997, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.02116513, + "balance_loss_mlp": 1.03323364, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 1.8706744703437488, + "language_loss": 0.69848335, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71982694, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 14750, + "time_per_iteration": 2.4876596927642822 + }, + { + "auxiliary_loss_clip": 0.0109901, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.01825023, + "balance_loss_mlp": 1.03434694, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 2.01913942737069, + "language_loss": 0.59346163, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61474878, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14751, + "time_per_iteration": 2.4688756465911865 + }, + { + "auxiliary_loss_clip": 0.01100041, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.01586664, + "balance_loss_mlp": 1.03530931, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.1999031985254436, + "language_loss": 0.81069493, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83196926, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14752, + "time_per_iteration": 2.4317078590393066 + }, + { + "auxiliary_loss_clip": 0.01105544, + "auxiliary_loss_mlp": 0.01030661, + "balance_loss_clip": 1.01819205, + "balance_loss_mlp": 1.03595507, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 1.8695812916150454, + "language_loss": 0.80406618, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82542825, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 14753, + "time_per_iteration": 2.4720840454101562 + }, + { + "auxiliary_loss_clip": 0.01097069, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.01882041, + "balance_loss_mlp": 1.03314829, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.9768901088990127, + "language_loss": 0.65004474, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.6713165, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14754, + "time_per_iteration": 2.4193625450134277 + }, + { + "auxiliary_loss_clip": 0.01102106, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.01968324, + "balance_loss_mlp": 1.03563762, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 1.8164304969475906, + "language_loss": 0.7455616, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76689613, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14755, + "time_per_iteration": 2.493603229522705 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.03300142, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.634586619423876, + "language_loss": 0.77746713, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79876363, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66015625, + "step": 14756, + "time_per_iteration": 2.4780538082122803 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.02048993, + "balance_loss_mlp": 1.03467703, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 1.8588799430656529, + "language_loss": 0.76319844, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78452736, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 14757, + "time_per_iteration": 2.4160494804382324 + }, + { + "auxiliary_loss_clip": 0.01096707, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.02180624, + "balance_loss_mlp": 1.03396797, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.8526342000102967, + "language_loss": 0.67985821, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70115507, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.625, + "step": 14758, + "time_per_iteration": 2.512890100479126 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.01833534, + "balance_loss_mlp": 1.03502667, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.5896545360448344, + "language_loss": 0.68797654, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.70929444, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.65625, + "step": 14759, + "time_per_iteration": 2.694464683532715 + }, + { + "auxiliary_loss_clip": 0.01096524, + "auxiliary_loss_mlp": 0.01028283, + "balance_loss_clip": 1.01685667, + "balance_loss_mlp": 1.0320487, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 1.9328359040343546, + "language_loss": 0.74210972, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76335776, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14760, + "time_per_iteration": 2.450502634048462 + }, + { + "auxiliary_loss_clip": 0.01102656, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01763535, + "balance_loss_mlp": 1.0351845, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.2540100924587434, + "language_loss": 0.75508064, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.77640146, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 14761, + "time_per_iteration": 2.450575351715088 + }, + { + "auxiliary_loss_clip": 0.01100474, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.02637935, + "balance_loss_mlp": 1.03368759, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 2.70563758191793, + "language_loss": 0.61649144, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63787794, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 14762, + "time_per_iteration": 3.9083144664764404 + }, + { + "auxiliary_loss_clip": 0.01099715, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01740968, + "balance_loss_mlp": 1.03380537, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.8083411551744764, + "language_loss": 0.64272511, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66401851, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14763, + "time_per_iteration": 5.301745176315308 + }, + { + "auxiliary_loss_clip": 0.01100472, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01755691, + "balance_loss_mlp": 1.03406501, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.1398532909429635, + "language_loss": 0.71166742, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.7329731, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14764, + "time_per_iteration": 2.4461166858673096 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.020015, + "balance_loss_mlp": 1.0344727, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.7429750969240043, + "language_loss": 0.66583252, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68718123, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 14765, + "time_per_iteration": 2.46746563911438 + }, + { + "auxiliary_loss_clip": 0.01096438, + "auxiliary_loss_mlp": 0.01024105, + "balance_loss_clip": 1.0139488, + "balance_loss_mlp": 1.0340445, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.6144347277628304, + "language_loss": 0.76532453, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78652996, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.625, + "step": 14766, + "time_per_iteration": 2.511964797973633 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.01800895, + "balance_loss_mlp": 1.0334599, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 2.2390564456183863, + "language_loss": 0.73575568, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75700963, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62890625, + "step": 14767, + "time_per_iteration": 2.454369306564331 + }, + { + "auxiliary_loss_clip": 0.01095656, + "auxiliary_loss_mlp": 0.01023366, + "balance_loss_clip": 1.01239324, + "balance_loss_mlp": 1.03359067, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.829789485746044, + "language_loss": 0.71202058, + "learning_rate": 1.303129987538778e-07, + "loss": 0.7332108, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 14768, + "time_per_iteration": 2.5051445960998535 + }, + { + "auxiliary_loss_clip": 0.01097532, + "auxiliary_loss_mlp": 0.01027923, + "balance_loss_clip": 1.016711, + "balance_loss_mlp": 1.03297067, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.8019675582043564, + "language_loss": 0.70299733, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72425187, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14769, + "time_per_iteration": 4.037384271621704 + }, + { + "auxiliary_loss_clip": 0.01098828, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.02146339, + "balance_loss_mlp": 1.0355804, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.2333804344383847, + "language_loss": 0.67153198, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69284022, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14770, + "time_per_iteration": 2.4550540447235107 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01742697, + "balance_loss_mlp": 1.03437459, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.686571312433287, + "language_loss": 0.65049809, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.6717459, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6171875, + "step": 14771, + "time_per_iteration": 2.506824016571045 + }, + { + "auxiliary_loss_clip": 0.01097555, + "auxiliary_loss_mlp": 0.01026042, + "balance_loss_clip": 1.01502669, + "balance_loss_mlp": 1.0331111, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.4703989654139515, + "language_loss": 0.82365024, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84488624, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14772, + "time_per_iteration": 2.5196051597595215 + }, + { + "auxiliary_loss_clip": 0.01093264, + "auxiliary_loss_mlp": 0.01022956, + "balance_loss_clip": 1.01254296, + "balance_loss_mlp": 1.03136611, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.5905781508550767, + "language_loss": 0.76286173, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78402388, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6171875, + "step": 14773, + "time_per_iteration": 2.5159168243408203 + }, + { + "auxiliary_loss_clip": 0.01096414, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.01539564, + "balance_loss_mlp": 1.03436065, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.9010559133370122, + "language_loss": 0.74874908, + "learning_rate": 1.294845814469907e-07, + "loss": 0.76997173, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62109375, + "step": 14774, + "time_per_iteration": 2.5161659717559814 + }, + { + "auxiliary_loss_clip": 0.01100538, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.01881158, + "balance_loss_mlp": 1.03431296, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.3755667319162383, + "language_loss": 0.72226775, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.74358368, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 14775, + "time_per_iteration": 2.4795637130737305 + }, + { + "auxiliary_loss_clip": 0.01097248, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.01710868, + "balance_loss_mlp": 1.03339446, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.6483138491279807, + "language_loss": 0.80294418, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82419682, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14776, + "time_per_iteration": 2.453660726547241 + }, + { + "auxiliary_loss_clip": 0.01101713, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.01928067, + "balance_loss_mlp": 1.03330636, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 3.6584424512501976, + "language_loss": 0.69919568, + "learning_rate": 1.290713302796802e-07, + "loss": 0.72052813, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 14777, + "time_per_iteration": 2.537234306335449 + }, + { + "auxiliary_loss_clip": 0.01096023, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.02306223, + "balance_loss_mlp": 1.03184962, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.735112860349567, + "language_loss": 0.70467377, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72597522, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14778, + "time_per_iteration": 2.467770576477051 + }, + { + "auxiliary_loss_clip": 0.01098895, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.0175935, + "balance_loss_mlp": 1.03429198, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.5537401295663211, + "language_loss": 0.77455193, + "learning_rate": 1.287961842217804e-07, + "loss": 0.7958231, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 14779, + "time_per_iteration": 2.4519615173339844 + }, + { + "auxiliary_loss_clip": 0.01021951, + "auxiliary_loss_mlp": 0.01002391, + "balance_loss_clip": 1.00141323, + "balance_loss_mlp": 1.00185418, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.9099787300950598, + "language_loss": 0.56692004, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58716345, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.20117188, + "step": 14780, + "time_per_iteration": 2.8395655155181885 + }, + { + "auxiliary_loss_clip": 0.01021748, + "auxiliary_loss_mlp": 0.01001636, + "balance_loss_clip": 1.00069416, + "balance_loss_mlp": 1.00195396, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7880016601364539, + "language_loss": 0.6246208, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64485466, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19824219, + "step": 14781, + "time_per_iteration": 3.128025770187378 + }, + { + "auxiliary_loss_clip": 0.01022011, + "auxiliary_loss_mlp": 0.01002356, + "balance_loss_clip": 1.00142026, + "balance_loss_mlp": 1.00189745, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7913802698138945, + "language_loss": 0.58146596, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.6017096, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 14782, + "time_per_iteration": 2.9118587970733643 + }, + { + "auxiliary_loss_clip": 0.01097314, + "auxiliary_loss_mlp": 0.0102938, + "balance_loss_clip": 1.01874638, + "balance_loss_mlp": 1.03459406, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 2.667047910128226, + "language_loss": 0.65728068, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.67854762, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 14783, + "time_per_iteration": 2.531919002532959 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.01958156, + "balance_loss_mlp": 1.03455818, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5498583522187301, + "language_loss": 0.77504814, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79638696, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 14784, + "time_per_iteration": 2.468636989593506 + }, + { + "auxiliary_loss_clip": 0.01101877, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.01757181, + "balance_loss_mlp": 1.03568482, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.219338510928114, + "language_loss": 0.60414922, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62546206, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14785, + "time_per_iteration": 2.525151252746582 + }, + { + "auxiliary_loss_clip": 0.01100607, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.0186789, + "balance_loss_mlp": 1.03575349, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.7836289092233713, + "language_loss": 0.64846861, + "learning_rate": 1.278354084140445e-07, + "loss": 0.66977763, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 14786, + "time_per_iteration": 2.395446300506592 + }, + { + "auxiliary_loss_clip": 0.01103855, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.01772904, + "balance_loss_mlp": 1.03465486, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.227669183130591, + "language_loss": 0.85661733, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87795901, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 14787, + "time_per_iteration": 2.432615041732788 + }, + { + "auxiliary_loss_clip": 0.01099197, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.01820874, + "balance_loss_mlp": 1.03418851, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 1.834557891315271, + "language_loss": 0.71064335, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73193407, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14788, + "time_per_iteration": 2.4537904262542725 + }, + { + "auxiliary_loss_clip": 0.01095263, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.01619184, + "balance_loss_mlp": 1.03322816, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.5923121621741885, + "language_loss": 0.70096779, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72219074, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 14789, + "time_per_iteration": 2.476649284362793 + }, + { + "auxiliary_loss_clip": 0.01099815, + "auxiliary_loss_mlp": 0.01023209, + "balance_loss_clip": 1.01159167, + "balance_loss_mlp": 1.03555179, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 5.015147056087475, + "language_loss": 0.70436954, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72559977, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 14790, + "time_per_iteration": 2.4777982234954834 + }, + { + "auxiliary_loss_clip": 0.0109958, + "auxiliary_loss_mlp": 0.01025006, + "balance_loss_clip": 1.01421189, + "balance_loss_mlp": 1.03437293, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 3.521302837326341, + "language_loss": 0.73018265, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75142848, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 14791, + "time_per_iteration": 2.489640951156616 + }, + { + "auxiliary_loss_clip": 0.01097004, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.02203906, + "balance_loss_mlp": 1.03449202, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 2.988212412187788, + "language_loss": 0.74027723, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76157808, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 14792, + "time_per_iteration": 2.4790284633636475 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.01840937, + "balance_loss_mlp": 1.03461504, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 1.8885669477718985, + "language_loss": 0.65883052, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68016326, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 14793, + "time_per_iteration": 2.4892311096191406 + }, + { + "auxiliary_loss_clip": 0.01101873, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.01855588, + "balance_loss_mlp": 1.03431058, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.6206939112924994, + "language_loss": 0.71852094, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73984659, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 14794, + "time_per_iteration": 2.5023927688598633 + }, + { + "auxiliary_loss_clip": 0.01104706, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.01692796, + "balance_loss_mlp": 1.03636956, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 2.0233145744012853, + "language_loss": 0.75055683, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77189648, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 14795, + "time_per_iteration": 2.44732403755188 + }, + { + "auxiliary_loss_clip": 0.0102198, + "auxiliary_loss_mlp": 0.01003025, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00205803, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7697530463267467, + "language_loss": 0.56135261, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58160269, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 14796, + "time_per_iteration": 2.9481120109558105 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.01900983, + "balance_loss_mlp": 1.03571939, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.8360219306966759, + "language_loss": 0.70659775, + "learning_rate": 1.263326468169843e-07, + "loss": 0.7279442, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 14797, + "time_per_iteration": 2.4758121967315674 + }, + { + "auxiliary_loss_clip": 0.01021915, + "auxiliary_loss_mlp": 0.01001904, + "balance_loss_clip": 1.00082493, + "balance_loss_mlp": 1.00191402, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7562531422498101, + "language_loss": 0.58068562, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60092378, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.19921875, + "step": 14798, + "time_per_iteration": 3.100543260574341 + }, + { + "auxiliary_loss_clip": 0.01099245, + "auxiliary_loss_mlp": 0.01024813, + "balance_loss_clip": 1.01277268, + "balance_loss_mlp": 1.03444302, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.5369346664635186, + "language_loss": 0.79333103, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81457162, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 14799, + "time_per_iteration": 2.4709973335266113 + }, + { + "auxiliary_loss_clip": 0.01021995, + "auxiliary_loss_mlp": 0.00998421, + "balance_loss_clip": 0.99741381, + "balance_loss_mlp": 1.00197566, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8801583470464978, + "language_loss": 0.58083129, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60103536, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20019531, + "step": 14800, + "time_per_iteration": 3.0016472339630127 + }, + { + "auxiliary_loss_clip": 0.01100463, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.03615224, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.9195223698734736, + "language_loss": 0.65940589, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68070734, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 14801, + "time_per_iteration": 2.4564545154571533 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02052987, + "balance_loss_mlp": 1.03634536, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.3831690088780797, + "language_loss": 0.75702822, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77840638, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.67578125, + "step": 14802, + "time_per_iteration": 2.4362783432006836 + }, + { + "auxiliary_loss_clip": 0.01098284, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.01845789, + "balance_loss_mlp": 1.03538465, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.7447945915193968, + "language_loss": 0.73556334, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.7568391, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 14803, + "time_per_iteration": 3.843716859817505 + }, + { + "auxiliary_loss_clip": 0.01097556, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.02025902, + "balance_loss_mlp": 1.0338726, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 2.2342057214139244, + "language_loss": 0.71535265, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73664618, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.63671875, + "step": 14804, + "time_per_iteration": 2.4600677490234375 + }, + { + "auxiliary_loss_clip": 0.01099154, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01805639, + "balance_loss_mlp": 1.03340125, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.679518955949807, + "language_loss": 0.81240398, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83369577, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 14805, + "time_per_iteration": 5.3233935832977295 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.01857388, + "balance_loss_mlp": 1.03456831, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.9168109162120714, + "language_loss": 0.67573619, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69706142, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 14806, + "time_per_iteration": 2.516892671585083 + }, + { + "auxiliary_loss_clip": 0.01097771, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.01762652, + "balance_loss_mlp": 1.03334141, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.8087810947787646, + "language_loss": 0.66934985, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69061911, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14807, + "time_per_iteration": 2.4431300163269043 + }, + { + "auxiliary_loss_clip": 0.0109679, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01656318, + "balance_loss_mlp": 1.03291702, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.7392302676531743, + "language_loss": 0.75443882, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77567983, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 14808, + "time_per_iteration": 2.4573440551757812 + }, + { + "auxiliary_loss_clip": 0.01099351, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.03359127, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 1.8850733161628792, + "language_loss": 0.81599617, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83731276, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14809, + "time_per_iteration": 2.49157452583313 + }, + { + "auxiliary_loss_clip": 0.01099477, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.01664197, + "balance_loss_mlp": 1.03431225, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.7329886476679317, + "language_loss": 0.68297541, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70424849, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 14810, + "time_per_iteration": 4.0772764682769775 + }, + { + "auxiliary_loss_clip": 0.01099319, + "auxiliary_loss_mlp": 0.01026645, + "balance_loss_clip": 1.01500988, + "balance_loss_mlp": 1.03268421, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 2.030728566700246, + "language_loss": 0.69870633, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.71996593, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 14811, + "time_per_iteration": 2.458737850189209 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.01853955, + "balance_loss_mlp": 1.03482771, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 6.429512242388682, + "language_loss": 0.6537776, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67509687, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 14812, + "time_per_iteration": 2.740006685256958 + }, + { + "auxiliary_loss_clip": 0.01097646, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.01789427, + "balance_loss_mlp": 1.03355992, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.6667342349025365, + "language_loss": 0.68745792, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70872366, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14813, + "time_per_iteration": 2.4327640533447266 + }, + { + "auxiliary_loss_clip": 0.01105069, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02008224, + "balance_loss_mlp": 1.03459501, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 3.7354447140562157, + "language_loss": 0.75532061, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77670521, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 14814, + "time_per_iteration": 2.484541893005371 + }, + { + "auxiliary_loss_clip": 0.01100943, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.0147295, + "balance_loss_mlp": 1.03317893, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.065309630726402, + "language_loss": 0.74279094, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76406848, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 14815, + "time_per_iteration": 2.465571165084839 + }, + { + "auxiliary_loss_clip": 0.01097426, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.01690221, + "balance_loss_mlp": 1.03356385, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.9753473376305755, + "language_loss": 0.75420868, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77546787, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 14816, + "time_per_iteration": 2.4814677238464355 + }, + { + "auxiliary_loss_clip": 0.01098854, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01619875, + "balance_loss_mlp": 1.03312755, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 2.0793302281151655, + "language_loss": 0.77708268, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79834437, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14817, + "time_per_iteration": 2.486025333404541 + }, + { + "auxiliary_loss_clip": 0.01021999, + "auxiliary_loss_mlp": 0.00998991, + "balance_loss_clip": 0.99791193, + "balance_loss_mlp": 1.00199425, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.754133836270162, + "language_loss": 0.56543994, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58564985, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.20019531, + "step": 14818, + "time_per_iteration": 3.1222634315490723 + }, + { + "auxiliary_loss_clip": 0.01099653, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.01856565, + "balance_loss_mlp": 1.03452563, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.7280404234864395, + "language_loss": 0.64667571, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66797471, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14819, + "time_per_iteration": 2.5655226707458496 + }, + { + "auxiliary_loss_clip": 0.01101351, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.01726794, + "balance_loss_mlp": 1.03483844, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.673521506671084, + "language_loss": 0.78504813, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80636024, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14820, + "time_per_iteration": 2.4987428188323975 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.02140188, + "balance_loss_mlp": 1.03441, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.8625788775928358, + "language_loss": 0.76595819, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78728414, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 14821, + "time_per_iteration": 2.47625470161438 + }, + { + "auxiliary_loss_clip": 0.01022043, + "auxiliary_loss_mlp": 0.01000344, + "balance_loss_clip": 0.99936658, + "balance_loss_mlp": 1.00216877, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7883315331563723, + "language_loss": 0.59294641, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61317027, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.19921875, + "step": 14822, + "time_per_iteration": 2.965127468109131 + }, + { + "auxiliary_loss_clip": 0.01099976, + "auxiliary_loss_mlp": 0.01028436, + "balance_loss_clip": 1.01686049, + "balance_loss_mlp": 1.03401423, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.9811529785013153, + "language_loss": 0.68799651, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.70928061, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 14823, + "time_per_iteration": 2.503521203994751 + }, + { + "auxiliary_loss_clip": 0.01096068, + "auxiliary_loss_mlp": 0.01028171, + "balance_loss_clip": 1.01657248, + "balance_loss_mlp": 1.03248489, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.493967977658863, + "language_loss": 0.69340491, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71464735, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 14824, + "time_per_iteration": 2.4937334060668945 + }, + { + "auxiliary_loss_clip": 0.01100645, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01948369, + "balance_loss_mlp": 1.03348267, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.7886667473291846, + "language_loss": 0.70545679, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72678244, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14825, + "time_per_iteration": 2.4795172214508057 + }, + { + "auxiliary_loss_clip": 0.01097621, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.01749253, + "balance_loss_mlp": 1.03321981, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.8753873126161282, + "language_loss": 0.71137297, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73264253, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 14826, + "time_per_iteration": 2.459636688232422 + }, + { + "auxiliary_loss_clip": 0.01097916, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.01506472, + "balance_loss_mlp": 1.03418994, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 1.9750957296989986, + "language_loss": 0.74912608, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77036595, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 14827, + "time_per_iteration": 2.429797410964966 + }, + { + "auxiliary_loss_clip": 0.01099273, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.01571679, + "balance_loss_mlp": 1.03379297, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.6925399195324096, + "language_loss": 0.78210777, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80337775, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 14828, + "time_per_iteration": 2.4611334800720215 + }, + { + "auxiliary_loss_clip": 0.01098983, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.0185442, + "balance_loss_mlp": 1.03426635, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.608358893281869, + "language_loss": 0.75332123, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77461332, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14829, + "time_per_iteration": 2.450355291366577 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.0239166, + "balance_loss_mlp": 1.03289604, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.535946632179645, + "language_loss": 0.84532714, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86666012, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65625, + "step": 14830, + "time_per_iteration": 2.5008816719055176 + }, + { + "auxiliary_loss_clip": 0.01096274, + "auxiliary_loss_mlp": 0.01024693, + "balance_loss_clip": 1.01413131, + "balance_loss_mlp": 1.03340077, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.4505342083014159, + "language_loss": 0.74674547, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.76795518, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 14831, + "time_per_iteration": 2.52681827545166 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.01556492, + "balance_loss_mlp": 1.03387928, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.6745380328604238, + "language_loss": 0.72861183, + "learning_rate": 1.216083607088847e-07, + "loss": 0.74988717, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 14832, + "time_per_iteration": 2.4757449626922607 + }, + { + "auxiliary_loss_clip": 0.01100323, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.01790977, + "balance_loss_mlp": 1.03276098, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.7806350383888931, + "language_loss": 0.66921455, + "learning_rate": 1.214746621848355e-07, + "loss": 0.6905123, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 14833, + "time_per_iteration": 2.486619234085083 + }, + { + "auxiliary_loss_clip": 0.01103899, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.01791346, + "balance_loss_mlp": 1.03564548, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 1.6026570762407482, + "language_loss": 0.73980582, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76114845, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 14834, + "time_per_iteration": 2.5816140174865723 + }, + { + "auxiliary_loss_clip": 0.01099178, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01897621, + "balance_loss_mlp": 1.03406143, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.0716513864685107, + "language_loss": 0.78957003, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81086469, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14835, + "time_per_iteration": 2.4538369178771973 + }, + { + "auxiliary_loss_clip": 0.01095585, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.01586795, + "balance_loss_mlp": 1.03217602, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.3453069661779542, + "language_loss": 0.73707056, + "learning_rate": 1.210739940361689e-07, + "loss": 0.75829661, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14836, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.0109794, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.02038467, + "balance_loss_mlp": 1.03253198, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.043235250771678, + "language_loss": 0.68709385, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.7083931, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14837, + "time_per_iteration": 2.414586305618286 + }, + { + "auxiliary_loss_clip": 0.01103306, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.01693094, + "balance_loss_mlp": 1.03462231, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.9615265471061178, + "language_loss": 0.6747911, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69612092, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 14838, + "time_per_iteration": 2.4969213008880615 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.01435709, + "balance_loss_mlp": 1.03337884, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 1.977366243331741, + "language_loss": 0.76072603, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78198999, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 14839, + "time_per_iteration": 2.4536030292510986 + }, + { + "auxiliary_loss_clip": 0.01022037, + "auxiliary_loss_mlp": 0.01000199, + "balance_loss_clip": 0.99922198, + "balance_loss_mlp": 1.0020833, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6822740725500295, + "language_loss": 0.49385339, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51407576, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.19921875, + "step": 14840, + "time_per_iteration": 3.0283010005950928 + }, + { + "auxiliary_loss_clip": 0.01103846, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01541281, + "balance_loss_mlp": 1.03384066, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.2683869685699505, + "language_loss": 0.64067227, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66199327, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 14841, + "time_per_iteration": 2.4450442790985107 + }, + { + "auxiliary_loss_clip": 0.01095053, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.02289844, + "balance_loss_mlp": 1.03297675, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.605514543360149, + "language_loss": 0.686297, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70758176, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.62109375, + "step": 14842, + "time_per_iteration": 2.5407049655914307 + }, + { + "auxiliary_loss_clip": 0.01097557, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.0342983, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 1.8531743729129386, + "language_loss": 0.79840702, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.81968051, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14843, + "time_per_iteration": 2.482599973678589 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.01853776, + "balance_loss_mlp": 1.03437209, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 1.9570611228190977, + "language_loss": 0.68831146, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70964074, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 14844, + "time_per_iteration": 2.475032091140747 + }, + { + "auxiliary_loss_clip": 0.01101274, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.02129424, + "balance_loss_mlp": 1.03532469, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 1.8946118282729945, + "language_loss": 0.91013724, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93147469, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14845, + "time_per_iteration": 3.7998101711273193 + }, + { + "auxiliary_loss_clip": 0.01098517, + "auxiliary_loss_mlp": 0.01025686, + "balance_loss_clip": 1.01481438, + "balance_loss_mlp": 1.03497481, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 2.0813113286417555, + "language_loss": 0.72576404, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74700606, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14846, + "time_per_iteration": 2.474081039428711 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.02219105, + "balance_loss_mlp": 1.03516674, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.825993599740926, + "language_loss": 0.57318634, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59454143, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14847, + "time_per_iteration": 4.101036071777344 + }, + { + "auxiliary_loss_clip": 0.01099025, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.02008104, + "balance_loss_mlp": 1.03349578, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.8027724109005723, + "language_loss": 0.76794285, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78924131, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 14848, + "time_per_iteration": 2.5357553958892822 + }, + { + "auxiliary_loss_clip": 0.01098164, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.01653409, + "balance_loss_mlp": 1.03388548, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 2.852551904806777, + "language_loss": 0.69231212, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71357226, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14849, + "time_per_iteration": 2.5289762020111084 + }, + { + "auxiliary_loss_clip": 0.01102332, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.02210355, + "balance_loss_mlp": 1.03686213, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.5363403291321316, + "language_loss": 0.80896437, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83032203, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14850, + "time_per_iteration": 2.539560556411743 + }, + { + "auxiliary_loss_clip": 0.01097951, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.02183247, + "balance_loss_mlp": 1.03397167, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.4972669302776855, + "language_loss": 0.75046718, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77178133, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 14851, + "time_per_iteration": 3.946723699569702 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.01645279, + "balance_loss_mlp": 1.03404009, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.6306137064929098, + "language_loss": 0.78424543, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80550581, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 14852, + "time_per_iteration": 2.588900089263916 + }, + { + "auxiliary_loss_clip": 0.01097941, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01950192, + "balance_loss_mlp": 1.03595543, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.4048830284686333, + "language_loss": 0.69412851, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71541065, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62109375, + "step": 14853, + "time_per_iteration": 2.492919921875 + }, + { + "auxiliary_loss_clip": 0.01100668, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01944065, + "balance_loss_mlp": 1.03522491, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.7034933673051655, + "language_loss": 0.67261219, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69393027, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14854, + "time_per_iteration": 2.6161773204803467 + }, + { + "auxiliary_loss_clip": 0.01096124, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.01752734, + "balance_loss_mlp": 1.03351092, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.452507573496769, + "language_loss": 0.74611282, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76735425, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.625, + "step": 14855, + "time_per_iteration": 2.473764657974243 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.01865709, + "balance_loss_mlp": 1.0337348, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.9027124813935195, + "language_loss": 0.64368689, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66496962, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 14856, + "time_per_iteration": 2.532707691192627 + }, + { + "auxiliary_loss_clip": 0.01098751, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.01631689, + "balance_loss_mlp": 1.03341556, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.8696512418627556, + "language_loss": 0.66240281, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68366313, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14857, + "time_per_iteration": 2.4812355041503906 + }, + { + "auxiliary_loss_clip": 0.01102247, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.02011704, + "balance_loss_mlp": 1.0357101, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.3462499304119415, + "language_loss": 0.75313234, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77447033, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 14858, + "time_per_iteration": 2.4967639446258545 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.01716173, + "balance_loss_mlp": 1.03381801, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.749677970064563, + "language_loss": 0.69162208, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71289968, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 14859, + "time_per_iteration": 2.504290819168091 + }, + { + "auxiliary_loss_clip": 0.01093256, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.01584625, + "balance_loss_mlp": 1.03289175, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.7863861979655313, + "language_loss": 0.75433087, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77552414, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 14860, + "time_per_iteration": 2.479966402053833 + }, + { + "auxiliary_loss_clip": 0.01102206, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.0180645, + "balance_loss_mlp": 1.03548205, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 2.336029575980188, + "language_loss": 0.57421482, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59554136, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 14861, + "time_per_iteration": 2.4453883171081543 + }, + { + "auxiliary_loss_clip": 0.01097311, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.0188086, + "balance_loss_mlp": 1.03289747, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.9703362787241279, + "language_loss": 0.63988757, + "learning_rate": 1.176284122190685e-07, + "loss": 0.66116345, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14862, + "time_per_iteration": 2.4691827297210693 + }, + { + "auxiliary_loss_clip": 0.0109601, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.01505589, + "balance_loss_mlp": 1.03218484, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.5823454170060147, + "language_loss": 0.77867645, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.7999019, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 14863, + "time_per_iteration": 2.540869951248169 + }, + { + "auxiliary_loss_clip": 0.01094615, + "auxiliary_loss_mlp": 0.01024455, + "balance_loss_clip": 1.01424432, + "balance_loss_mlp": 1.0320065, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 1.9517641145653177, + "language_loss": 0.70929408, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73048472, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62890625, + "step": 14864, + "time_per_iteration": 2.5036158561706543 + }, + { + "auxiliary_loss_clip": 0.01107034, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.021366, + "balance_loss_mlp": 1.03713703, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 1.960140962390111, + "language_loss": 0.75742739, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77883035, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69921875, + "step": 14865, + "time_per_iteration": 2.442366123199463 + }, + { + "auxiliary_loss_clip": 0.01095846, + "auxiliary_loss_mlp": 0.01027973, + "balance_loss_clip": 1.017483, + "balance_loss_mlp": 1.03252757, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.6471559543699055, + "language_loss": 0.71687293, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73811114, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 14866, + "time_per_iteration": 2.5246763229370117 + }, + { + "auxiliary_loss_clip": 0.01104023, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.01588607, + "balance_loss_mlp": 1.03644776, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.8105973277463203, + "language_loss": 0.83971083, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86103749, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 14867, + "time_per_iteration": 2.4837403297424316 + }, + { + "auxiliary_loss_clip": 0.01098392, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.01751125, + "balance_loss_mlp": 1.03336859, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.5257308716937024, + "language_loss": 0.80485952, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82612157, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6484375, + "step": 14868, + "time_per_iteration": 2.539396047592163 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01893783, + "balance_loss_mlp": 1.03468442, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.73703480181996, + "language_loss": 0.77222109, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79353189, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14869, + "time_per_iteration": 2.503376007080078 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.01022264, + "balance_loss_clip": 1.01077819, + "balance_loss_mlp": 1.03436267, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.4728974184814814, + "language_loss": 0.6547929, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67600584, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14870, + "time_per_iteration": 2.4696993827819824 + }, + { + "auxiliary_loss_clip": 0.01021955, + "auxiliary_loss_mlp": 0.01003959, + "balance_loss_clip": 1.00302875, + "balance_loss_mlp": 1.00201225, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.8434452713885856, + "language_loss": 0.55948913, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57974827, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.19921875, + "step": 14871, + "time_per_iteration": 3.098759412765503 + }, + { + "auxiliary_loss_clip": 0.01098394, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02132666, + "balance_loss_mlp": 1.03488946, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 1.8033147229452833, + "language_loss": 0.76229548, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78360265, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14872, + "time_per_iteration": 2.4757678508758545 + }, + { + "auxiliary_loss_clip": 0.01095042, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.0163548, + "balance_loss_mlp": 1.03337288, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.490700807753622, + "language_loss": 0.66794723, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.68916368, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6171875, + "step": 14873, + "time_per_iteration": 2.5868990421295166 + }, + { + "auxiliary_loss_clip": 0.01096304, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.03331888, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.55215288470212, + "language_loss": 0.59791553, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61920649, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 14874, + "time_per_iteration": 2.4845948219299316 + }, + { + "auxiliary_loss_clip": 0.01104539, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.01850021, + "balance_loss_mlp": 1.03692889, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 1.8608507472011937, + "language_loss": 0.75573874, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77709341, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 14875, + "time_per_iteration": 2.552445650100708 + }, + { + "auxiliary_loss_clip": 0.01106238, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.01806641, + "balance_loss_mlp": 1.03644109, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 1.7553065513486439, + "language_loss": 0.77431512, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79569167, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 14876, + "time_per_iteration": 2.479843854904175 + }, + { + "auxiliary_loss_clip": 0.01097857, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01707911, + "balance_loss_mlp": 1.03393304, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.7034619965415823, + "language_loss": 0.78767753, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80893332, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 14877, + "time_per_iteration": 2.468996524810791 + }, + { + "auxiliary_loss_clip": 0.01099186, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.01641452, + "balance_loss_mlp": 1.03482389, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.091963059351132, + "language_loss": 0.7505362, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77181232, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 14878, + "time_per_iteration": 2.4732110500335693 + }, + { + "auxiliary_loss_clip": 0.0109821, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.01513195, + "balance_loss_mlp": 1.03284216, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.5818118960503171, + "language_loss": 0.76242149, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78367388, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 14879, + "time_per_iteration": 2.5296013355255127 + }, + { + "auxiliary_loss_clip": 0.0110191, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01895523, + "balance_loss_mlp": 1.03676414, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.8977007401222414, + "language_loss": 0.7420851, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76340598, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 14880, + "time_per_iteration": 2.501164197921753 + }, + { + "auxiliary_loss_clip": 0.01099433, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.01408911, + "balance_loss_mlp": 1.03427339, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.5190919090163466, + "language_loss": 0.82769126, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.8489455, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14881, + "time_per_iteration": 2.5418641567230225 + }, + { + "auxiliary_loss_clip": 0.01095788, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.01472986, + "balance_loss_mlp": 1.03236985, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.649013399005573, + "language_loss": 0.67482835, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69604212, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14882, + "time_per_iteration": 2.564490795135498 + }, + { + "auxiliary_loss_clip": 0.01104448, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.01529944, + "balance_loss_mlp": 1.03529155, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.429036760271906, + "language_loss": 0.75044572, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77177274, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 14883, + "time_per_iteration": 2.4705231189727783 + }, + { + "auxiliary_loss_clip": 0.01094799, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01948595, + "balance_loss_mlp": 1.0331018, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5479121548537522, + "language_loss": 0.72337794, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74463081, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 14884, + "time_per_iteration": 2.5401506423950195 + }, + { + "auxiliary_loss_clip": 0.01094217, + "auxiliary_loss_mlp": 0.01025329, + "balance_loss_clip": 1.01457047, + "balance_loss_mlp": 1.03127992, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 1.5118040441368576, + "language_loss": 0.75339627, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77459168, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62890625, + "step": 14885, + "time_per_iteration": 2.4566526412963867 + }, + { + "auxiliary_loss_clip": 0.01099303, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01721168, + "balance_loss_mlp": 1.03246248, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 2.3103597790279053, + "language_loss": 0.81585598, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83713722, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 14886, + "time_per_iteration": 2.4688005447387695 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.01639688, + "balance_loss_mlp": 1.03421152, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.478515652092319, + "language_loss": 0.63619804, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65746379, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 14887, + "time_per_iteration": 4.133228302001953 + }, + { + "auxiliary_loss_clip": 0.01102604, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.02122581, + "balance_loss_mlp": 1.03541362, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.7844722249086462, + "language_loss": 0.61070365, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63205838, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 14888, + "time_per_iteration": 3.9138073921203613 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.01431835, + "balance_loss_mlp": 1.03365338, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 1.7551662985764278, + "language_loss": 0.69682604, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.71808153, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 14889, + "time_per_iteration": 3.9413297176361084 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.01619697, + "balance_loss_mlp": 1.03535187, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.3512086286063614, + "language_loss": 0.70814884, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.72945112, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 14890, + "time_per_iteration": 2.435168743133545 + }, + { + "auxiliary_loss_clip": 0.01098203, + "auxiliary_loss_mlp": 0.01025849, + "balance_loss_clip": 1.01420212, + "balance_loss_mlp": 1.0328474, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.5590643217837603, + "language_loss": 0.75952852, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.78076905, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14891, + "time_per_iteration": 2.477548122406006 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.0199697, + "balance_loss_mlp": 1.03322709, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.8140090200899526, + "language_loss": 0.76758611, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78891343, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 14892, + "time_per_iteration": 2.452353000640869 + }, + { + "auxiliary_loss_clip": 0.01098634, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01936281, + "balance_loss_mlp": 1.03434777, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.3207000845430072, + "language_loss": 0.81841969, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83971059, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 14893, + "time_per_iteration": 2.5077221393585205 + }, + { + "auxiliary_loss_clip": 0.0109668, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01751983, + "balance_loss_mlp": 1.03423703, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 11.57553130306276, + "language_loss": 0.74789113, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76914358, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 14894, + "time_per_iteration": 3.945729970932007 + }, + { + "auxiliary_loss_clip": 0.01102545, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.02041912, + "balance_loss_mlp": 1.03604054, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.6244269958664943, + "language_loss": 0.66519237, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68654454, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 14895, + "time_per_iteration": 2.4295578002929688 + }, + { + "auxiliary_loss_clip": 0.01102129, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.01665497, + "balance_loss_mlp": 1.03526545, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.8850172467985669, + "language_loss": 0.67215335, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69346553, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14896, + "time_per_iteration": 2.441373586654663 + }, + { + "auxiliary_loss_clip": 0.01099805, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.02071393, + "balance_loss_mlp": 1.03478193, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.7592330291905882, + "language_loss": 0.75651777, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77783716, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14897, + "time_per_iteration": 2.45491361618042 + }, + { + "auxiliary_loss_clip": 0.01021895, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00235045, + "balance_loss_mlp": 1.00189352, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7479901222096683, + "language_loss": 0.55332673, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57357907, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20019531, + "step": 14898, + "time_per_iteration": 3.0941059589385986 + }, + { + "auxiliary_loss_clip": 0.01100232, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01750898, + "balance_loss_mlp": 1.0343554, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.407731502520021, + "language_loss": 0.7033121, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72460437, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 14899, + "time_per_iteration": 2.500845432281494 + }, + { + "auxiliary_loss_clip": 0.01102543, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02335942, + "balance_loss_mlp": 1.03636515, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 2.007406117160179, + "language_loss": 0.73626882, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.7576558, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66015625, + "step": 14900, + "time_per_iteration": 2.462517738342285 + }, + { + "auxiliary_loss_clip": 0.01022163, + "auxiliary_loss_mlp": 0.0100183, + "balance_loss_clip": 1.00092971, + "balance_loss_mlp": 1.00212479, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7858702190089509, + "language_loss": 0.61846119, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63870108, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20019531, + "step": 14901, + "time_per_iteration": 3.0669608116149902 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.01636648, + "balance_loss_mlp": 1.03309727, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.7072266384182382, + "language_loss": 0.70579618, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72707248, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14902, + "time_per_iteration": 2.4966113567352295 + }, + { + "auxiliary_loss_clip": 0.01095333, + "auxiliary_loss_mlp": 0.0102621, + "balance_loss_clip": 1.01517129, + "balance_loss_mlp": 1.03252649, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.666989732507148, + "language_loss": 0.78098989, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80220532, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62890625, + "step": 14903, + "time_per_iteration": 2.4643096923828125 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.01646781, + "balance_loss_mlp": 1.03564441, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.6349395372995028, + "language_loss": 0.72710371, + "learning_rate": 1.121644401702877e-07, + "loss": 0.74842012, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 14904, + "time_per_iteration": 2.476510524749756 + }, + { + "auxiliary_loss_clip": 0.01100675, + "auxiliary_loss_mlp": 0.01025875, + "balance_loss_clip": 1.01292312, + "balance_loss_mlp": 1.03407562, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 2.003019000801922, + "language_loss": 0.74558008, + "learning_rate": 1.12035883275166e-07, + "loss": 0.76684558, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.66796875, + "step": 14905, + "time_per_iteration": 2.5374937057495117 + }, + { + "auxiliary_loss_clip": 0.01097255, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.01764691, + "balance_loss_mlp": 1.03354824, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 1.5757434113327204, + "language_loss": 0.76621282, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78747779, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 14906, + "time_per_iteration": 2.4911582469940186 + }, + { + "auxiliary_loss_clip": 0.0109995, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.01864624, + "balance_loss_mlp": 1.03458023, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.9866469364584363, + "language_loss": 0.74468869, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76598948, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 14907, + "time_per_iteration": 2.4224627017974854 + }, + { + "auxiliary_loss_clip": 0.0109858, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.02196395, + "balance_loss_mlp": 1.03504455, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 2.42177438179363, + "language_loss": 0.82961619, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85093141, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.63671875, + "step": 14908, + "time_per_iteration": 2.4348533153533936 + }, + { + "auxiliary_loss_clip": 0.01101575, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.01572299, + "balance_loss_mlp": 1.03415501, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 1.7129159855777194, + "language_loss": 0.70255554, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72385275, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 14909, + "time_per_iteration": 2.458195924758911 + }, + { + "auxiliary_loss_clip": 0.01102257, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.03573656, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 1.709642155357814, + "language_loss": 0.72406387, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74541485, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 14910, + "time_per_iteration": 2.4810500144958496 + }, + { + "auxiliary_loss_clip": 0.0109713, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.01558399, + "balance_loss_mlp": 1.03219759, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.502367907177224, + "language_loss": 0.63351315, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65475088, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 14911, + "time_per_iteration": 2.48689866065979 + }, + { + "auxiliary_loss_clip": 0.01102037, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.01584291, + "balance_loss_mlp": 1.03563142, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.9938612971873675, + "language_loss": 0.74839032, + "learning_rate": 1.111379898520437e-07, + "loss": 0.76968622, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 14912, + "time_per_iteration": 2.4474406242370605 + }, + { + "auxiliary_loss_clip": 0.01099856, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.01930642, + "balance_loss_mlp": 1.03326905, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.7165060291362908, + "language_loss": 0.81594461, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.83725297, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14913, + "time_per_iteration": 2.4972469806671143 + }, + { + "auxiliary_loss_clip": 0.01103057, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.0206089, + "balance_loss_mlp": 1.03490567, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.7571386064915555, + "language_loss": 0.61551863, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63688123, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 14914, + "time_per_iteration": 2.4439823627471924 + }, + { + "auxiliary_loss_clip": 0.01021938, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.00294387, + "balance_loss_mlp": 1.00187731, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.74229947958434, + "language_loss": 0.55134475, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57160312, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 14915, + "time_per_iteration": 3.0520334243774414 + }, + { + "auxiliary_loss_clip": 0.0109578, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.0185411, + "balance_loss_mlp": 1.03272772, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.557041844193811, + "language_loss": 0.71559089, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73683989, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62890625, + "step": 14916, + "time_per_iteration": 2.5423061847686768 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.01840854, + "balance_loss_mlp": 1.03454828, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.5928608101669224, + "language_loss": 0.77743876, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79873246, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14917, + "time_per_iteration": 2.4978179931640625 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.02280986, + "balance_loss_mlp": 1.03608799, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 1.9323719767216765, + "language_loss": 0.68000007, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70139873, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 14918, + "time_per_iteration": 2.5807528495788574 + }, + { + "auxiliary_loss_clip": 0.01099957, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.01676011, + "balance_loss_mlp": 1.03353751, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.7530397429371827, + "language_loss": 0.83640873, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85768479, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 14919, + "time_per_iteration": 2.4691712856292725 + }, + { + "auxiliary_loss_clip": 0.01100826, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.01639748, + "balance_loss_mlp": 1.03384709, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.1041581456995018, + "language_loss": 0.71935117, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74064684, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 14920, + "time_per_iteration": 2.450545072555542 + }, + { + "auxiliary_loss_clip": 0.0109998, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.01801395, + "balance_loss_mlp": 1.03451681, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.364040496753844, + "language_loss": 0.90711236, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.92840934, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14921, + "time_per_iteration": 2.416222095489502 + }, + { + "auxiliary_loss_clip": 0.01103175, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.01680899, + "balance_loss_mlp": 1.03518486, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.7934445633449958, + "language_loss": 0.73719668, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.7585125, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 14922, + "time_per_iteration": 2.4588661193847656 + }, + { + "auxiliary_loss_clip": 0.01097034, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.01772904, + "balance_loss_mlp": 1.03221571, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 2.0889474548738995, + "language_loss": 0.70069325, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72196275, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6484375, + "step": 14923, + "time_per_iteration": 2.453202724456787 + }, + { + "auxiliary_loss_clip": 0.01100903, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.01448882, + "balance_loss_mlp": 1.03415108, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.37761241543336, + "language_loss": 0.69968379, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72096676, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 14924, + "time_per_iteration": 2.4540958404541016 + }, + { + "auxiliary_loss_clip": 0.01097287, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.01961923, + "balance_loss_mlp": 1.03243768, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.797667243382315, + "language_loss": 0.72386622, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74513781, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 14925, + "time_per_iteration": 2.484833240509033 + }, + { + "auxiliary_loss_clip": 0.01103822, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.01577675, + "balance_loss_mlp": 1.03691697, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.7702759423239505, + "language_loss": 0.82245016, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84376639, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 14926, + "time_per_iteration": 2.492662191390991 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01665902, + "balance_loss_mlp": 1.03260446, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.6352336400713978, + "language_loss": 0.79072952, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81198144, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 14927, + "time_per_iteration": 2.4794511795043945 + }, + { + "auxiliary_loss_clip": 0.01097797, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.01563644, + "balance_loss_mlp": 1.03343737, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5303190462919, + "language_loss": 0.66319346, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68443227, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 14928, + "time_per_iteration": 3.9788811206817627 + }, + { + "auxiliary_loss_clip": 0.01102325, + "auxiliary_loss_mlp": 0.01039454, + "balance_loss_clip": 1.02505386, + "balance_loss_mlp": 1.03454542, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 1.753595417229725, + "language_loss": 0.70549512, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72691292, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.67578125, + "step": 14929, + "time_per_iteration": 2.484863519668579 + }, + { + "auxiliary_loss_clip": 0.01101124, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.01841164, + "balance_loss_mlp": 1.03583241, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.6065357298131178, + "language_loss": 0.67851043, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.69981223, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 14930, + "time_per_iteration": 3.8712880611419678 + }, + { + "auxiliary_loss_clip": 0.01096092, + "auxiliary_loss_mlp": 0.0103007, + "balance_loss_clip": 1.01850104, + "balance_loss_mlp": 1.03163958, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.6896064423054011, + "language_loss": 0.7473526, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.76861417, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 14931, + "time_per_iteration": 3.825070858001709 + }, + { + "auxiliary_loss_clip": 0.01099993, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.01441276, + "balance_loss_mlp": 1.035815, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 2.2133845278517925, + "language_loss": 0.63313723, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65439111, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 14932, + "time_per_iteration": 2.4451212882995605 + }, + { + "auxiliary_loss_clip": 0.01095296, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.01614976, + "balance_loss_mlp": 1.03391647, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.5636902244201198, + "language_loss": 0.71594745, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73716497, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.609375, + "step": 14933, + "time_per_iteration": 2.44564151763916 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.01600361, + "balance_loss_mlp": 1.03398991, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.4921679700135908, + "language_loss": 0.74557078, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76685655, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 14934, + "time_per_iteration": 2.472975015640259 + }, + { + "auxiliary_loss_clip": 0.01098138, + "auxiliary_loss_mlp": 0.01026091, + "balance_loss_clip": 1.01478958, + "balance_loss_mlp": 1.03404582, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.7760053421270396, + "language_loss": 0.60305613, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62429839, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14935, + "time_per_iteration": 3.8992903232574463 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01025165, + "balance_loss_clip": 1.01354182, + "balance_loss_mlp": 1.0332725, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 2.01940463607824, + "language_loss": 0.76901841, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.79023731, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 14936, + "time_per_iteration": 2.510512351989746 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.01873279, + "balance_loss_mlp": 1.03417063, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.7165054154452661, + "language_loss": 0.7398392, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76112556, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 14937, + "time_per_iteration": 2.4757297039031982 + }, + { + "auxiliary_loss_clip": 0.0102209, + "auxiliary_loss_mlp": 0.01004521, + "balance_loss_clip": 1.00356126, + "balance_loss_mlp": 1.00203967, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8567673559760905, + "language_loss": 0.63504851, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65531462, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 14938, + "time_per_iteration": 2.9755895137786865 + }, + { + "auxiliary_loss_clip": 0.01098564, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01804495, + "balance_loss_mlp": 1.03520513, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.285609419822284, + "language_loss": 0.80244672, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82372797, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14939, + "time_per_iteration": 2.4215035438537598 + }, + { + "auxiliary_loss_clip": 0.0102229, + "auxiliary_loss_mlp": 0.01001638, + "balance_loss_clip": 1.00065446, + "balance_loss_mlp": 1.0022645, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.720038785949184, + "language_loss": 0.52935207, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.5495913, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20019531, + "step": 14940, + "time_per_iteration": 3.20149302482605 + }, + { + "auxiliary_loss_clip": 0.01098494, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.0192368, + "balance_loss_mlp": 1.03297186, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.693993393171052, + "language_loss": 0.77516085, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.79646027, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14941, + "time_per_iteration": 2.434145212173462 + }, + { + "auxiliary_loss_clip": 0.01100191, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01858878, + "balance_loss_mlp": 1.03449476, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 1.878231548634812, + "language_loss": 0.73163295, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75294316, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 14942, + "time_per_iteration": 2.5305984020233154 + }, + { + "auxiliary_loss_clip": 0.01101917, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.02054954, + "balance_loss_mlp": 1.03539038, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.259546174056382, + "language_loss": 0.79731816, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.81865978, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14943, + "time_per_iteration": 2.404646396636963 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.0103413, + "balance_loss_clip": 1.02152395, + "balance_loss_mlp": 1.03429723, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.4369536003517842, + "language_loss": 0.70990932, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73126322, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14944, + "time_per_iteration": 2.499791383743286 + }, + { + "auxiliary_loss_clip": 0.01105021, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01886237, + "balance_loss_mlp": 1.03674543, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 1.8945410483938205, + "language_loss": 0.75043732, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.77180016, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 14945, + "time_per_iteration": 2.4378676414489746 + }, + { + "auxiliary_loss_clip": 0.0110561, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.02051139, + "balance_loss_mlp": 1.03481817, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 1.87755765658424, + "language_loss": 0.73487711, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75626534, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 14946, + "time_per_iteration": 2.4829940795898438 + }, + { + "auxiliary_loss_clip": 0.01098299, + "auxiliary_loss_mlp": 0.01023691, + "balance_loss_clip": 1.0120796, + "balance_loss_mlp": 1.03329158, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 2.0014358905556855, + "language_loss": 0.64285457, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66407442, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14947, + "time_per_iteration": 2.438631296157837 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.02115035, + "balance_loss_mlp": 1.03433037, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 3.1927043706660894, + "language_loss": 0.69610405, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.71742362, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 14948, + "time_per_iteration": 2.4895057678222656 + }, + { + "auxiliary_loss_clip": 0.01100214, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.01798058, + "balance_loss_mlp": 1.03443384, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.7633000851401956, + "language_loss": 0.74272358, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76402116, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 14949, + "time_per_iteration": 2.608767509460449 + }, + { + "auxiliary_loss_clip": 0.01101359, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.01912796, + "balance_loss_mlp": 1.03402793, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.6829290861590958, + "language_loss": 0.75664008, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77797437, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 14950, + "time_per_iteration": 2.529982328414917 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.01849723, + "balance_loss_mlp": 1.03643644, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.5718302114690097, + "language_loss": 0.66472352, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.6860255, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 14951, + "time_per_iteration": 2.4156932830810547 + }, + { + "auxiliary_loss_clip": 0.01102118, + "auxiliary_loss_mlp": 0.01025294, + "balance_loss_clip": 1.0141542, + "balance_loss_mlp": 1.03302848, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 5.045918966257551, + "language_loss": 0.73914707, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76042116, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.69140625, + "step": 14952, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01099151, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.0206635, + "balance_loss_mlp": 1.03456223, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.152704477585915, + "language_loss": 0.56480038, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58611166, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14953, + "time_per_iteration": 2.399141788482666 + }, + { + "auxiliary_loss_clip": 0.01099987, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.02200556, + "balance_loss_mlp": 1.03480887, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 2.012480548312481, + "language_loss": 0.81600904, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.83734691, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 14954, + "time_per_iteration": 2.4930219650268555 + }, + { + "auxiliary_loss_clip": 0.01098756, + "auxiliary_loss_mlp": 0.01030553, + "balance_loss_clip": 1.01884711, + "balance_loss_mlp": 1.03465247, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 3.2641482155182318, + "language_loss": 0.60263079, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.6239239, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 14955, + "time_per_iteration": 2.4798853397369385 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.01803517, + "balance_loss_mlp": 1.03440011, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.1065178857932385, + "language_loss": 0.54606581, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.56733239, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 14956, + "time_per_iteration": 2.498508930206299 + }, + { + "auxiliary_loss_clip": 0.01099303, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.03451133, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.7585876876511954, + "language_loss": 0.7994734, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82077473, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14957, + "time_per_iteration": 2.4910168647766113 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.02023137, + "balance_loss_mlp": 1.0356164, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.8978719607927441, + "language_loss": 0.78686506, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80822951, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 14958, + "time_per_iteration": 2.5033602714538574 + }, + { + "auxiliary_loss_clip": 0.01099017, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.01628995, + "balance_loss_mlp": 1.03592014, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.5628926324551675, + "language_loss": 0.749843, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.77109987, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 14959, + "time_per_iteration": 2.421219825744629 + }, + { + "auxiliary_loss_clip": 0.01094435, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01707387, + "balance_loss_mlp": 1.03165054, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.9131887389317577, + "language_loss": 0.68210769, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.7033338, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 14960, + "time_per_iteration": 2.4903640747070312 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01026859, + "balance_loss_clip": 1.01579618, + "balance_loss_mlp": 1.03471529, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.5401076588305127, + "language_loss": 0.6614114, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68267035, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 14961, + "time_per_iteration": 2.4783365726470947 + }, + { + "auxiliary_loss_clip": 0.01095925, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01650167, + "balance_loss_mlp": 1.03261304, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.6811371424318786, + "language_loss": 0.82988048, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85111082, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 14962, + "time_per_iteration": 2.471440553665161 + }, + { + "auxiliary_loss_clip": 0.0110458, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.0148747, + "balance_loss_mlp": 1.03580785, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 2.1190623548906156, + "language_loss": 0.76490587, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78622532, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 14963, + "time_per_iteration": 2.4583775997161865 + }, + { + "auxiliary_loss_clip": 0.01022033, + "auxiliary_loss_mlp": 0.01001036, + "balance_loss_clip": 1.00004029, + "balance_loss_mlp": 1.00195932, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.8094632900613583, + "language_loss": 0.57510412, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59533477, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20117188, + "step": 14964, + "time_per_iteration": 2.8906238079071045 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.02000546, + "balance_loss_mlp": 1.03737593, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.3957838042157134, + "language_loss": 0.67410362, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69550008, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 14965, + "time_per_iteration": 2.432751178741455 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.02107596, + "balance_loss_mlp": 1.03458083, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 1.7764506802390192, + "language_loss": 0.71361762, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.73496038, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 14966, + "time_per_iteration": 2.4522528648376465 + }, + { + "auxiliary_loss_clip": 0.01102649, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.0215342, + "balance_loss_mlp": 1.0366888, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 1.6727777678710354, + "language_loss": 0.73497134, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75633389, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 14967, + "time_per_iteration": 2.5017104148864746 + }, + { + "auxiliary_loss_clip": 0.01099365, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.01702619, + "balance_loss_mlp": 1.03394258, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 1.768165327175364, + "language_loss": 0.719221, + "learning_rate": 1.040813291960323e-07, + "loss": 0.74049813, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14968, + "time_per_iteration": 2.4704649448394775 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.01798606, + "balance_loss_mlp": 1.03511333, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 2.1535153440352324, + "language_loss": 0.71085668, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73215359, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14969, + "time_per_iteration": 2.4344112873077393 + }, + { + "auxiliary_loss_clip": 0.01103451, + "auxiliary_loss_mlp": 0.01028419, + "balance_loss_clip": 1.01689792, + "balance_loss_mlp": 1.03636861, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 1.7775305510277348, + "language_loss": 0.76438725, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78570598, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 14970, + "time_per_iteration": 3.8684945106506348 + }, + { + "auxiliary_loss_clip": 0.01101777, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.01909924, + "balance_loss_mlp": 1.03452706, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.6290735872590396, + "language_loss": 0.73082769, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.7521466, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 14971, + "time_per_iteration": 2.413736343383789 + }, + { + "auxiliary_loss_clip": 0.01099182, + "auxiliary_loss_mlp": 0.01026185, + "balance_loss_clip": 1.01400757, + "balance_loss_mlp": 1.03376043, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 1.967175811246173, + "language_loss": 0.81928706, + "learning_rate": 1.035858993572476e-07, + "loss": 0.84054077, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14972, + "time_per_iteration": 3.8709967136383057 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01551533, + "balance_loss_mlp": 1.03478956, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 2.346023909533121, + "language_loss": 0.81425643, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83555251, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 14973, + "time_per_iteration": 3.868040084838867 + }, + { + "auxiliary_loss_clip": 0.01098554, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02212131, + "balance_loss_mlp": 1.03365421, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.7893132294646954, + "language_loss": 0.57785386, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.59917992, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14974, + "time_per_iteration": 2.4945571422576904 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.02042198, + "balance_loss_mlp": 1.03682697, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.6956938729099027, + "language_loss": 0.63379133, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65513563, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14975, + "time_per_iteration": 2.496723175048828 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.01756501, + "balance_loss_mlp": 1.03466403, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.809232420196502, + "language_loss": 0.7320652, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75336432, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14976, + "time_per_iteration": 3.913522243499756 + }, + { + "auxiliary_loss_clip": 0.01101866, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.01757395, + "balance_loss_mlp": 1.03592968, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.8460911703880327, + "language_loss": 0.69739205, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71870112, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 14977, + "time_per_iteration": 2.4934232234954834 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02094841, + "balance_loss_mlp": 1.03397775, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.535072369203501, + "language_loss": 0.65230364, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67365754, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 14978, + "time_per_iteration": 2.4551024436950684 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.01883125, + "balance_loss_mlp": 1.03654051, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.7787659821570612, + "language_loss": 0.78990376, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81126642, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 14979, + "time_per_iteration": 2.487516403198242 + }, + { + "auxiliary_loss_clip": 0.01021612, + "auxiliary_loss_mlp": 0.01001783, + "balance_loss_clip": 1.0008111, + "balance_loss_mlp": 1.00152564, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7257419902012564, + "language_loss": 0.53625673, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.5564906, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20117188, + "step": 14980, + "time_per_iteration": 3.094318389892578 + }, + { + "auxiliary_loss_clip": 0.011041, + "auxiliary_loss_mlp": 0.01036054, + "balance_loss_clip": 1.02391291, + "balance_loss_mlp": 1.03616858, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.8547665659485781, + "language_loss": 0.82101512, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84241676, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 14981, + "time_per_iteration": 2.4936017990112305 + }, + { + "auxiliary_loss_clip": 0.01097983, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.01992702, + "balance_loss_mlp": 1.03415704, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 2.7203563856425985, + "language_loss": 0.81460178, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83589315, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 14982, + "time_per_iteration": 2.4565653800964355 + }, + { + "auxiliary_loss_clip": 0.01095335, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.0194211, + "balance_loss_mlp": 1.03379464, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.8323465775845416, + "language_loss": 0.71544576, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73669791, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6171875, + "step": 14983, + "time_per_iteration": 2.4885804653167725 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.0102521, + "balance_loss_clip": 1.0146662, + "balance_loss_mlp": 1.03483844, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.3467294069488691, + "language_loss": 0.75163013, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77286458, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14984, + "time_per_iteration": 2.49717116355896 + }, + { + "auxiliary_loss_clip": 0.01095786, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.02003372, + "balance_loss_mlp": 1.03259993, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.3973917605872446, + "language_loss": 0.70561159, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72688001, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14985, + "time_per_iteration": 2.4311397075653076 + }, + { + "auxiliary_loss_clip": 0.01102011, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.01862133, + "balance_loss_mlp": 1.03354049, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 1.8726650085306151, + "language_loss": 0.70216691, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.7234928, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 14986, + "time_per_iteration": 2.4868855476379395 + }, + { + "auxiliary_loss_clip": 0.0110024, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.01909316, + "balance_loss_mlp": 1.03285623, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.6031253163932484, + "language_loss": 0.76845485, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.7897625, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 14987, + "time_per_iteration": 2.440084457397461 + }, + { + "auxiliary_loss_clip": 0.01098211, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.0194118, + "balance_loss_mlp": 1.03421617, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 2.1006548743325673, + "language_loss": 0.74064976, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.76193821, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 14988, + "time_per_iteration": 2.4963526725769043 + }, + { + "auxiliary_loss_clip": 0.01104597, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01829827, + "balance_loss_mlp": 1.0362258, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.8626335922164043, + "language_loss": 0.69308305, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71443975, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 14989, + "time_per_iteration": 2.4539735317230225 + }, + { + "auxiliary_loss_clip": 0.01100876, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.01488054, + "balance_loss_mlp": 1.03533888, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.8473333523557436, + "language_loss": 0.79848897, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.8197583, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 14990, + "time_per_iteration": 2.4267704486846924 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.01531839, + "balance_loss_mlp": 1.03590596, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 2.32674019018607, + "language_loss": 0.77580243, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.79711276, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 14991, + "time_per_iteration": 2.4224207401275635 + }, + { + "auxiliary_loss_clip": 0.01021773, + "auxiliary_loss_mlp": 0.01002102, + "balance_loss_clip": 1.00102293, + "balance_loss_mlp": 1.00179267, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7778537052322216, + "language_loss": 0.60237074, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.6226095, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.19921875, + "step": 14992, + "time_per_iteration": 2.954866886138916 + }, + { + "auxiliary_loss_clip": 0.01098898, + "auxiliary_loss_mlp": 0.01024572, + "balance_loss_clip": 1.01278269, + "balance_loss_mlp": 1.03394318, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 1.962840452399395, + "language_loss": 0.82822621, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.8494609, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 14993, + "time_per_iteration": 2.443319797515869 + }, + { + "auxiliary_loss_clip": 0.01099362, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01925957, + "balance_loss_mlp": 1.03348362, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 1.9906429893811004, + "language_loss": 0.73098803, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75229084, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 14994, + "time_per_iteration": 2.410883665084839 + }, + { + "auxiliary_loss_clip": 0.01096276, + "auxiliary_loss_mlp": 0.01027611, + "balance_loss_clip": 1.01672149, + "balance_loss_mlp": 1.03312874, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.724933842876544, + "language_loss": 0.64662391, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66786277, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14995, + "time_per_iteration": 2.489705801010132 + }, + { + "auxiliary_loss_clip": 0.01099565, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.01805377, + "balance_loss_mlp": 1.03272486, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 1.5339343417289035, + "language_loss": 0.66576183, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68705738, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 14996, + "time_per_iteration": 2.5253312587738037 + }, + { + "auxiliary_loss_clip": 0.01097858, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.01918435, + "balance_loss_mlp": 1.03384423, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.737794019911755, + "language_loss": 0.6594162, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.6806947, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14997, + "time_per_iteration": 2.457235097885132 + }, + { + "auxiliary_loss_clip": 0.01099415, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02148402, + "balance_loss_mlp": 1.03402448, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.924410935543244, + "language_loss": 0.77711892, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79844344, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 14998, + "time_per_iteration": 2.4296000003814697 + }, + { + "auxiliary_loss_clip": 0.01099155, + "auxiliary_loss_mlp": 0.01027859, + "balance_loss_clip": 1.01644444, + "balance_loss_mlp": 1.03295517, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.7165767621933876, + "language_loss": 0.74958098, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77085114, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 14999, + "time_per_iteration": 2.4602272510528564 + }, + { + "auxiliary_loss_clip": 0.01098364, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01653385, + "balance_loss_mlp": 1.033705, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.1701142203140855, + "language_loss": 0.7583977, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77966702, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 15000, + "time_per_iteration": 2.4680068492889404 + }, + { + "auxiliary_loss_clip": 0.01096199, + "auxiliary_loss_mlp": 0.01023147, + "balance_loss_clip": 1.01187539, + "balance_loss_mlp": 1.03231204, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.156487419883931, + "language_loss": 0.8029359, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.82412934, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 15001, + "time_per_iteration": 2.746706485748291 + }, + { + "auxiliary_loss_clip": 0.01099004, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01602638, + "balance_loss_mlp": 1.03517938, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.5066901043573788, + "language_loss": 0.78355694, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80481625, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 15002, + "time_per_iteration": 2.4581403732299805 + }, + { + "auxiliary_loss_clip": 0.01101993, + "auxiliary_loss_mlp": 0.01031757, + "balance_loss_clip": 1.0189364, + "balance_loss_mlp": 1.03616595, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 1.9922520479802717, + "language_loss": 0.68118757, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70252508, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65625, + "step": 15003, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.01101641, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02349198, + "balance_loss_mlp": 1.03430641, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 2.2792574133264916, + "language_loss": 0.8624227, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88379425, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15004, + "time_per_iteration": 2.415255308151245 + }, + { + "auxiliary_loss_clip": 0.01100861, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03495574, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.707915413244076, + "language_loss": 0.72780323, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74911094, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 15005, + "time_per_iteration": 2.4494197368621826 + }, + { + "auxiliary_loss_clip": 0.01100686, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.01526415, + "balance_loss_mlp": 1.03250015, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.9208568535467845, + "language_loss": 0.71333838, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73462105, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 15006, + "time_per_iteration": 2.4571921825408936 + }, + { + "auxiliary_loss_clip": 0.01102333, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.01600361, + "balance_loss_mlp": 1.03493929, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.8905972356215264, + "language_loss": 0.84425151, + "learning_rate": 9.930000126732618e-08, + "loss": 0.8655476, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 15007, + "time_per_iteration": 2.40256404876709 + }, + { + "auxiliary_loss_clip": 0.01098363, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.01581562, + "balance_loss_mlp": 1.03405857, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.665209400281313, + "language_loss": 0.78456664, + "learning_rate": 9.917884343900928e-08, + "loss": 0.8058309, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 15008, + "time_per_iteration": 2.5052990913391113 + }, + { + "auxiliary_loss_clip": 0.01094747, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.02086759, + "balance_loss_mlp": 1.03389013, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.7711717743848543, + "language_loss": 0.73629749, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75756449, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.609375, + "step": 15009, + "time_per_iteration": 2.429013252258301 + }, + { + "auxiliary_loss_clip": 0.01097775, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.01856875, + "balance_loss_mlp": 1.03367591, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.8285057036627976, + "language_loss": 0.73694813, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75822866, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 15010, + "time_per_iteration": 2.474562644958496 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.01529527, + "balance_loss_mlp": 1.03394616, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.9793507655734852, + "language_loss": 0.74131656, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76259828, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15011, + "time_per_iteration": 2.4341142177581787 + }, + { + "auxiliary_loss_clip": 0.01102832, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.01797962, + "balance_loss_mlp": 1.03446949, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 1.9745596076843646, + "language_loss": 0.73315668, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75448334, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 15012, + "time_per_iteration": 3.8315765857696533 + }, + { + "auxiliary_loss_clip": 0.01099189, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.02320898, + "balance_loss_mlp": 1.03451538, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.7128250561733314, + "language_loss": 0.69050443, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71183991, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15013, + "time_per_iteration": 3.9151742458343506 + }, + { + "auxiliary_loss_clip": 0.01095444, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.01933491, + "balance_loss_mlp": 1.03227758, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.3635728352284588, + "language_loss": 0.73009402, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75134999, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15014, + "time_per_iteration": 4.017148971557617 + }, + { + "auxiliary_loss_clip": 0.01098941, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01563144, + "balance_loss_mlp": 1.03362596, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.7828669233535654, + "language_loss": 0.72010767, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74137437, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 15015, + "time_per_iteration": 2.4381561279296875 + }, + { + "auxiliary_loss_clip": 0.01100433, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.03419542, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 1.9749885749535356, + "language_loss": 0.68876898, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71007097, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 15016, + "time_per_iteration": 2.4668843746185303 + }, + { + "auxiliary_loss_clip": 0.0109769, + "auxiliary_loss_mlp": 0.01024064, + "balance_loss_clip": 1.01355553, + "balance_loss_mlp": 1.03386807, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 2.00411116225002, + "language_loss": 0.70883679, + "learning_rate": 9.809166710436855e-08, + "loss": 0.73005426, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 15017, + "time_per_iteration": 2.482185125350952 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.0102941, + "balance_loss_clip": 1.0185684, + "balance_loss_mlp": 1.0386523, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.6650542688248349, + "language_loss": 0.69549167, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71682107, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 15018, + "time_per_iteration": 2.4584224224090576 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.01672328, + "balance_loss_mlp": 1.03524184, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.6802563263084365, + "language_loss": 0.68777132, + "learning_rate": 9.785086557201782e-08, + "loss": 0.70907283, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 15019, + "time_per_iteration": 3.9201221466064453 + }, + { + "auxiliary_loss_clip": 0.01096083, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02395487, + "balance_loss_mlp": 1.03281188, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.798205753990298, + "language_loss": 0.71837938, + "learning_rate": 9.773057299808951e-08, + "loss": 0.73969001, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 15020, + "time_per_iteration": 2.4389073848724365 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02038217, + "balance_loss_mlp": 1.03332424, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5503421486801767, + "language_loss": 0.74545062, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76676923, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15021, + "time_per_iteration": 2.4600391387939453 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.01647258, + "balance_loss_mlp": 1.03510642, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 1.9984076842455942, + "language_loss": 0.72428268, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74560601, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15022, + "time_per_iteration": 2.459587335586548 + }, + { + "auxiliary_loss_clip": 0.01094092, + "auxiliary_loss_mlp": 0.01026663, + "balance_loss_clip": 1.01590419, + "balance_loss_mlp": 1.03282905, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.75835560133373, + "language_loss": 0.72548139, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74668896, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.61328125, + "step": 15023, + "time_per_iteration": 2.482862710952759 + }, + { + "auxiliary_loss_clip": 0.01099722, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.01691282, + "balance_loss_mlp": 1.03500438, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.6389407013469093, + "language_loss": 0.82752883, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84880662, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15024, + "time_per_iteration": 2.4502294063568115 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.01547694, + "balance_loss_mlp": 1.03365493, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.4809591037049192, + "language_loss": 0.69610882, + "learning_rate": 9.713019223328966e-08, + "loss": 0.71737969, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 15025, + "time_per_iteration": 2.4894895553588867 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.01903272, + "balance_loss_mlp": 1.03266263, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 2.0415991613165314, + "language_loss": 0.76887953, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79015279, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 15026, + "time_per_iteration": 2.4938175678253174 + }, + { + "auxiliary_loss_clip": 0.01099584, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01609278, + "balance_loss_mlp": 1.03508067, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.603136682334736, + "language_loss": 0.68016422, + "learning_rate": 9.68905449946129e-08, + "loss": 0.7014299, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 15027, + "time_per_iteration": 2.4476981163024902 + }, + { + "auxiliary_loss_clip": 0.01095957, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.01779437, + "balance_loss_mlp": 1.03428614, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.6502009028380256, + "language_loss": 0.76070625, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78195375, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6171875, + "step": 15028, + "time_per_iteration": 2.4905800819396973 + }, + { + "auxiliary_loss_clip": 0.01099349, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.02226925, + "balance_loss_mlp": 1.03465712, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.6442905394106826, + "language_loss": 0.69341254, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71474588, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 15029, + "time_per_iteration": 2.5010592937469482 + }, + { + "auxiliary_loss_clip": 0.01101158, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.01723909, + "balance_loss_mlp": 1.03424096, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 1.8286388517310772, + "language_loss": 0.73346627, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75477159, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 15030, + "time_per_iteration": 2.459181308746338 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.01754785, + "balance_loss_mlp": 1.03421438, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 2.1675373254855548, + "language_loss": 0.68079257, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70209956, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 15031, + "time_per_iteration": 2.5919394493103027 + }, + { + "auxiliary_loss_clip": 0.01097122, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.01340365, + "balance_loss_mlp": 1.03348684, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.5159455888593432, + "language_loss": 0.76419586, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78540957, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15032, + "time_per_iteration": 2.489576578140259 + }, + { + "auxiliary_loss_clip": 0.01101137, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01882803, + "balance_loss_mlp": 1.03507411, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 2.423158447270867, + "language_loss": 0.74984133, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77115536, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 15033, + "time_per_iteration": 2.4186246395111084 + }, + { + "auxiliary_loss_clip": 0.01101634, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.03524292, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.9290223402014928, + "language_loss": 0.74161011, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76295769, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 15034, + "time_per_iteration": 2.5180463790893555 + }, + { + "auxiliary_loss_clip": 0.0110122, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.02005112, + "balance_loss_mlp": 1.0361948, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.5379813145844734, + "language_loss": 0.63320333, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65452856, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15035, + "time_per_iteration": 2.409738540649414 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.01796794, + "balance_loss_mlp": 1.03505278, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 1.9696969037347303, + "language_loss": 0.61885166, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64015502, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6484375, + "step": 15036, + "time_per_iteration": 2.470014810562134 + }, + { + "auxiliary_loss_clip": 0.01095707, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01708138, + "balance_loss_mlp": 1.03364372, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.9028952034418158, + "language_loss": 0.82219112, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84343117, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62109375, + "step": 15037, + "time_per_iteration": 2.4479734897613525 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.01457763, + "balance_loss_mlp": 1.03461885, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.906996671867115, + "language_loss": 0.67425549, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69552565, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15038, + "time_per_iteration": 2.451862335205078 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.03372645, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 1.957839871915482, + "language_loss": 0.75246155, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77380288, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15039, + "time_per_iteration": 2.4133412837982178 + }, + { + "auxiliary_loss_clip": 0.0109794, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.03453422, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 2.283373021673528, + "language_loss": 0.70320803, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72448456, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 15040, + "time_per_iteration": 2.4517173767089844 + }, + { + "auxiliary_loss_clip": 0.01096888, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.01703691, + "balance_loss_mlp": 1.03318739, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.9449826459631894, + "language_loss": 0.68166679, + "learning_rate": 9.522109895720709e-08, + "loss": 0.70291698, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 15041, + "time_per_iteration": 2.4308114051818848 + }, + { + "auxiliary_loss_clip": 0.01098669, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01715946, + "balance_loss_mlp": 1.03303921, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 4.567380169873624, + "language_loss": 0.57341689, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59469104, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 15042, + "time_per_iteration": 2.55881404876709 + }, + { + "auxiliary_loss_clip": 0.01021889, + "auxiliary_loss_mlp": 0.00999686, + "balance_loss_clip": 0.99871486, + "balance_loss_mlp": 1.00187063, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7814772334169297, + "language_loss": 0.56925297, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58946878, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20117188, + "step": 15043, + "time_per_iteration": 3.017444610595703 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.01484108, + "balance_loss_mlp": 1.03475761, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 1.7111729102686908, + "language_loss": 0.69845128, + "learning_rate": 9.486520194855274e-08, + "loss": 0.71973717, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 15044, + "time_per_iteration": 2.434213876724243 + }, + { + "auxiliary_loss_clip": 0.01101302, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.02353251, + "balance_loss_mlp": 1.03485394, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 3.2925488860468506, + "language_loss": 0.69370788, + "learning_rate": 9.474671409214407e-08, + "loss": 0.71508062, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 15045, + "time_per_iteration": 2.3861148357391357 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01035073, + "balance_loss_clip": 1.02292609, + "balance_loss_mlp": 1.03618574, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 2.269963118730515, + "language_loss": 0.65502143, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67640865, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 15046, + "time_per_iteration": 2.4733235836029053 + }, + { + "auxiliary_loss_clip": 0.01101419, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.02004504, + "balance_loss_mlp": 1.03384709, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.0115064560474045, + "language_loss": 0.62080795, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64213848, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 15047, + "time_per_iteration": 2.4283289909362793 + }, + { + "auxiliary_loss_clip": 0.01099821, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.03540373, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.454198093610582, + "language_loss": 0.71481526, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73612273, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15048, + "time_per_iteration": 2.5019192695617676 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.01958215, + "balance_loss_mlp": 1.0330019, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.3424128392967405, + "language_loss": 0.75003755, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77137184, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 15049, + "time_per_iteration": 2.400944948196411 + }, + { + "auxiliary_loss_clip": 0.01099611, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.01615953, + "balance_loss_mlp": 1.03538918, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.6894904207673944, + "language_loss": 0.75737369, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77864587, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 15050, + "time_per_iteration": 2.477255344390869 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02103782, + "balance_loss_mlp": 1.03422129, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.6844009499475698, + "language_loss": 0.81676704, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83809221, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 15051, + "time_per_iteration": 2.4604718685150146 + }, + { + "auxiliary_loss_clip": 0.01099469, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01816452, + "balance_loss_mlp": 1.0342989, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.137215474768832, + "language_loss": 0.88935357, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91063577, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 15052, + "time_per_iteration": 2.4719793796539307 + }, + { + "auxiliary_loss_clip": 0.0110445, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.0199585, + "balance_loss_mlp": 1.0360204, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 2.0056674164312835, + "language_loss": 0.76978087, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79113925, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 15053, + "time_per_iteration": 3.8475534915924072 + }, + { + "auxiliary_loss_clip": 0.01098067, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.0216974, + "balance_loss_mlp": 1.03388309, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.68822128465627, + "language_loss": 0.73156083, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75287408, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15054, + "time_per_iteration": 2.504624128341675 + }, + { + "auxiliary_loss_clip": 0.01098343, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.01721489, + "balance_loss_mlp": 1.03398204, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.5885770670663444, + "language_loss": 0.82941592, + "learning_rate": 9.356580987481333e-08, + "loss": 0.8506813, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 15055, + "time_per_iteration": 3.941993474960327 + }, + { + "auxiliary_loss_clip": 0.01098634, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.02112579, + "balance_loss_mlp": 1.03452206, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.6252305322329523, + "language_loss": 0.84889591, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87021255, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 15056, + "time_per_iteration": 3.8730804920196533 + }, + { + "auxiliary_loss_clip": 0.01098086, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.01563239, + "balance_loss_mlp": 1.03365731, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.901364143448049, + "language_loss": 0.71921766, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74046123, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15057, + "time_per_iteration": 2.5385053157806396 + }, + { + "auxiliary_loss_clip": 0.01097658, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.01602972, + "balance_loss_mlp": 1.03402162, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.8215014735352213, + "language_loss": 0.80796474, + "learning_rate": 9.321294810356418e-08, + "loss": 0.82921088, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 15058, + "time_per_iteration": 2.4862685203552246 + }, + { + "auxiliary_loss_clip": 0.01021601, + "auxiliary_loss_mlp": 0.00999772, + "balance_loss_clip": 0.99881822, + "balance_loss_mlp": 1.00167274, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6796033479937826, + "language_loss": 0.51406193, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53427565, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 15059, + "time_per_iteration": 3.1576147079467773 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.01625299, + "balance_loss_mlp": 1.03367758, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.6940162245344546, + "language_loss": 0.6707449, + "learning_rate": 9.297806844307831e-08, + "loss": 0.6920073, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 15060, + "time_per_iteration": 3.8765251636505127 + }, + { + "auxiliary_loss_clip": 0.01102869, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01882291, + "balance_loss_mlp": 1.03586102, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.552329280643452, + "language_loss": 0.64600372, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66733658, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15061, + "time_per_iteration": 2.414705514907837 + }, + { + "auxiliary_loss_clip": 0.01101953, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.02084422, + "balance_loss_mlp": 1.03586221, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.8780376792714681, + "language_loss": 0.71583116, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73717052, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 15062, + "time_per_iteration": 2.453206777572632 + }, + { + "auxiliary_loss_clip": 0.01097235, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.01993728, + "balance_loss_mlp": 1.03298724, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.594142455041583, + "language_loss": 0.70841157, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72969377, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15063, + "time_per_iteration": 2.4209649562835693 + }, + { + "auxiliary_loss_clip": 0.01095403, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.01830244, + "balance_loss_mlp": 1.03247714, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 2.2069816413887695, + "language_loss": 0.71933818, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74058586, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62890625, + "step": 15064, + "time_per_iteration": 2.529771566390991 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.02177143, + "balance_loss_mlp": 1.03191257, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 1.7008106699079477, + "language_loss": 0.69489098, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71622044, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 15065, + "time_per_iteration": 2.4695396423339844 + }, + { + "auxiliary_loss_clip": 0.01100826, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.0242219, + "balance_loss_mlp": 1.036026, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.654206312026751, + "language_loss": 0.63057613, + "learning_rate": 9.227516515099743e-08, + "loss": 0.6519382, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15066, + "time_per_iteration": 2.5075199604034424 + }, + { + "auxiliary_loss_clip": 0.01102024, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01550758, + "balance_loss_mlp": 1.03313625, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 1.7954074160752378, + "language_loss": 0.80386341, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82516658, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 15067, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01101013, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.02201724, + "balance_loss_mlp": 1.03438091, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.6316864228138155, + "language_loss": 0.70004576, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72139847, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 15068, + "time_per_iteration": 2.454280376434326 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.01653373, + "balance_loss_mlp": 1.03272903, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.9832765312866232, + "language_loss": 0.85433835, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87558019, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15069, + "time_per_iteration": 2.446370840072632 + }, + { + "auxiliary_loss_clip": 0.0110437, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.01588929, + "balance_loss_mlp": 1.03456664, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 1.757997353290004, + "language_loss": 0.58988464, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61121076, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 15070, + "time_per_iteration": 2.4755237102508545 + }, + { + "auxiliary_loss_clip": 0.01102192, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.01301742, + "balance_loss_mlp": 1.03411961, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 3.1017762628694516, + "language_loss": 0.81624448, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83751857, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 15071, + "time_per_iteration": 2.4169604778289795 + }, + { + "auxiliary_loss_clip": 0.01102965, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.02284586, + "balance_loss_mlp": 1.03447402, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.8293370138207419, + "language_loss": 0.62059128, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64197552, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 15072, + "time_per_iteration": 2.4581499099731445 + }, + { + "auxiliary_loss_clip": 0.01098912, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.02073944, + "balance_loss_mlp": 1.03379536, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.8672804369539069, + "language_loss": 0.73105884, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75237453, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 15073, + "time_per_iteration": 2.641338586807251 + }, + { + "auxiliary_loss_clip": 0.01097199, + "auxiliary_loss_mlp": 0.01024734, + "balance_loss_clip": 1.01392126, + "balance_loss_mlp": 1.03387177, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.788469817713978, + "language_loss": 0.80764318, + "learning_rate": 9.134201202899161e-08, + "loss": 0.82886249, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 15074, + "time_per_iteration": 2.460603952407837 + }, + { + "auxiliary_loss_clip": 0.01022009, + "auxiliary_loss_mlp": 0.00998758, + "balance_loss_clip": 0.99781018, + "balance_loss_mlp": 1.00188327, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7466398026663508, + "language_loss": 0.52349371, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54370141, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20117188, + "step": 15075, + "time_per_iteration": 3.1416776180267334 + }, + { + "auxiliary_loss_clip": 0.01022161, + "auxiliary_loss_mlp": 0.01000705, + "balance_loss_clip": 0.99977523, + "balance_loss_mlp": 1.00210333, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7362778409223477, + "language_loss": 0.62075734, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64098597, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20117188, + "step": 15076, + "time_per_iteration": 2.9436872005462646 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.0230608, + "balance_loss_mlp": 1.03310323, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.8036310285355786, + "language_loss": 0.82249612, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84382766, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15077, + "time_per_iteration": 2.446316719055176 + }, + { + "auxiliary_loss_clip": 0.01095172, + "auxiliary_loss_mlp": 0.01023194, + "balance_loss_clip": 1.01250672, + "balance_loss_mlp": 1.03184319, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.8387781390293605, + "language_loss": 0.83909905, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86028278, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 15078, + "time_per_iteration": 2.4532206058502197 + }, + { + "auxiliary_loss_clip": 0.01095731, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.01802921, + "balance_loss_mlp": 1.03388798, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.4797980658889718, + "language_loss": 0.65172887, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67297965, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6171875, + "step": 15079, + "time_per_iteration": 2.595116376876831 + }, + { + "auxiliary_loss_clip": 0.01096827, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.01537549, + "balance_loss_mlp": 1.03209925, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.599991637278185, + "language_loss": 0.70963979, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73087597, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15080, + "time_per_iteration": 2.634389877319336 + }, + { + "auxiliary_loss_clip": 0.01102802, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.01576686, + "balance_loss_mlp": 1.0354439, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.1816197679044773, + "language_loss": 0.7070353, + "learning_rate": 9.052930273571547e-08, + "loss": 0.72834378, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15081, + "time_per_iteration": 2.4671077728271484 + }, + { + "auxiliary_loss_clip": 0.01097302, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02084613, + "balance_loss_mlp": 1.03395283, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 3.2112042935174565, + "language_loss": 0.74457014, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76586604, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 15082, + "time_per_iteration": 2.4647719860076904 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02134085, + "balance_loss_mlp": 1.03468823, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 2.1491807058491905, + "language_loss": 0.78196669, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80326951, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 15083, + "time_per_iteration": 2.504772424697876 + }, + { + "auxiliary_loss_clip": 0.01096671, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.01927173, + "balance_loss_mlp": 1.03399706, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.6316555525453107, + "language_loss": 0.69215107, + "learning_rate": 9.01820847747028e-08, + "loss": 0.7134164, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 15084, + "time_per_iteration": 2.4626307487487793 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.0205797, + "balance_loss_mlp": 1.03536558, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 1.5369468428614528, + "language_loss": 0.66654259, + "learning_rate": 9.006649028948965e-08, + "loss": 0.68786484, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15085, + "time_per_iteration": 2.526599645614624 + }, + { + "auxiliary_loss_clip": 0.01021996, + "auxiliary_loss_mlp": 0.00999372, + "balance_loss_clip": 0.99840033, + "balance_loss_mlp": 1.00198984, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7678247485693439, + "language_loss": 0.61296463, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63317835, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20019531, + "step": 15086, + "time_per_iteration": 3.064495325088501 + }, + { + "auxiliary_loss_clip": 0.01097744, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.02425778, + "balance_loss_mlp": 1.03301144, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.5347716312449224, + "language_loss": 0.72131354, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74265659, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 15087, + "time_per_iteration": 2.4933295249938965 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01024891, + "balance_loss_clip": 1.01387572, + "balance_loss_mlp": 1.03501546, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 3.778989757753675, + "language_loss": 0.76889527, + "learning_rate": 8.972014140059058e-08, + "loss": 0.79014421, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 15088, + "time_per_iteration": 2.420175552368164 + }, + { + "auxiliary_loss_clip": 0.01095841, + "auxiliary_loss_mlp": 0.01028108, + "balance_loss_clip": 1.01755214, + "balance_loss_mlp": 1.03368163, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.9194225212557636, + "language_loss": 0.73643494, + "learning_rate": 8.960483664113038e-08, + "loss": 0.7576744, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 15089, + "time_per_iteration": 2.472822904586792 + }, + { + "auxiliary_loss_clip": 0.01093778, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.02005196, + "balance_loss_mlp": 1.03267169, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.8055084062755367, + "language_loss": 0.75715423, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77839744, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.609375, + "step": 15090, + "time_per_iteration": 2.4550294876098633 + }, + { + "auxiliary_loss_clip": 0.01100863, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.01507401, + "balance_loss_mlp": 1.03407693, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.1593176290056766, + "language_loss": 0.77432215, + "learning_rate": 8.93744444537079e-08, + "loss": 0.79560483, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 15091, + "time_per_iteration": 2.4498257637023926 + }, + { + "auxiliary_loss_clip": 0.01092067, + "auxiliary_loss_mlp": 0.0102371, + "balance_loss_clip": 1.01344657, + "balance_loss_mlp": 1.03182781, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.6802751346458031, + "language_loss": 0.86002195, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88117981, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 15092, + "time_per_iteration": 2.455930233001709 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.01952982, + "balance_loss_mlp": 1.03627157, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.9030334099331545, + "language_loss": 0.78655577, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80787158, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15093, + "time_per_iteration": 2.4959311485290527 + }, + { + "auxiliary_loss_clip": 0.01021984, + "auxiliary_loss_mlp": 0.00998909, + "balance_loss_clip": 0.9979021, + "balance_loss_mlp": 1.00177145, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7368950598550581, + "language_loss": 0.57025433, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59046328, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.203125, + "step": 15094, + "time_per_iteration": 2.9761135578155518 + }, + { + "auxiliary_loss_clip": 0.01102196, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.01930702, + "balance_loss_mlp": 1.03587985, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 2.2862090497485945, + "language_loss": 0.71629637, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73763192, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 15095, + "time_per_iteration": 3.8516845703125 + }, + { + "auxiliary_loss_clip": 0.01099489, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.03436017, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.6691319696305897, + "language_loss": 0.74130392, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76259142, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 15096, + "time_per_iteration": 2.451765537261963 + }, + { + "auxiliary_loss_clip": 0.01101574, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02484095, + "balance_loss_mlp": 1.03521657, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 1.859736829180508, + "language_loss": 0.57428157, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59567344, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 15097, + "time_per_iteration": 3.881094217300415 + }, + { + "auxiliary_loss_clip": 0.01093901, + "auxiliary_loss_mlp": 0.01022633, + "balance_loss_clip": 1.01194012, + "balance_loss_mlp": 1.03041399, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.7073383021734174, + "language_loss": 0.80004597, + "learning_rate": 8.857035423668935e-08, + "loss": 0.82121134, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 15098, + "time_per_iteration": 3.9118173122406006 + }, + { + "auxiliary_loss_clip": 0.01102439, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.01512241, + "balance_loss_mlp": 1.03452134, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 1.7264575763342218, + "language_loss": 0.66292477, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68422097, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 15099, + "time_per_iteration": 2.48301100730896 + }, + { + "auxiliary_loss_clip": 0.01101737, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.02177739, + "balance_loss_mlp": 1.03520679, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.2265413383772255, + "language_loss": 0.70710111, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72845727, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15100, + "time_per_iteration": 2.43037486076355 + }, + { + "auxiliary_loss_clip": 0.0102224, + "auxiliary_loss_mlp": 0.01001474, + "balance_loss_clip": 1.00043106, + "balance_loss_mlp": 1.00208926, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.62151668648413, + "language_loss": 0.53409314, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55433023, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20117188, + "step": 15101, + "time_per_iteration": 3.1229333877563477 + }, + { + "auxiliary_loss_clip": 0.01099032, + "auxiliary_loss_mlp": 0.01026926, + "balance_loss_clip": 1.01553512, + "balance_loss_mlp": 1.03384113, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.6634584368490581, + "language_loss": 0.6806314, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70189095, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 15102, + "time_per_iteration": 3.915992498397827 + }, + { + "auxiliary_loss_clip": 0.01099332, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01760292, + "balance_loss_mlp": 1.03539491, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.866647802447474, + "language_loss": 0.79140002, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81267983, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15103, + "time_per_iteration": 2.4541261196136475 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.0102728, + "balance_loss_clip": 1.01546621, + "balance_loss_mlp": 1.03330398, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.6943626894395103, + "language_loss": 0.71684384, + "learning_rate": 8.78839607763413e-08, + "loss": 0.7381103, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 15104, + "time_per_iteration": 2.482779026031494 + }, + { + "auxiliary_loss_clip": 0.01096986, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.01615047, + "balance_loss_mlp": 1.03385842, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.8695561249612997, + "language_loss": 0.77266347, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79390025, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 15105, + "time_per_iteration": 2.4875688552856445 + }, + { + "auxiliary_loss_clip": 0.01098253, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.02169788, + "balance_loss_mlp": 1.03278279, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.8708472286705515, + "language_loss": 0.73539734, + "learning_rate": 8.765574297104628e-08, + "loss": 0.75671178, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 15106, + "time_per_iteration": 2.4679880142211914 + }, + { + "auxiliary_loss_clip": 0.01099508, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.01973677, + "balance_loss_mlp": 1.03357434, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.6236610377303244, + "language_loss": 0.80442846, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82573986, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15107, + "time_per_iteration": 2.4709742069244385 + }, + { + "auxiliary_loss_clip": 0.01021838, + "auxiliary_loss_mlp": 0.01001997, + "balance_loss_clip": 1.00099587, + "balance_loss_mlp": 1.00181663, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8133672060912172, + "language_loss": 0.59727746, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61751574, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 15108, + "time_per_iteration": 3.0274457931518555 + }, + { + "auxiliary_loss_clip": 0.01098636, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01596022, + "balance_loss_mlp": 1.03307295, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.8322800202774943, + "language_loss": 0.73455155, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75581038, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15109, + "time_per_iteration": 2.544933557510376 + }, + { + "auxiliary_loss_clip": 0.01096619, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01830721, + "balance_loss_mlp": 1.0329206, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 4.344593502193457, + "language_loss": 0.71237719, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73363197, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15110, + "time_per_iteration": 2.497110605239868 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_clip": 1.01717973, + "balance_loss_mlp": 1.03286374, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.8914312332237635, + "language_loss": 0.68927699, + "learning_rate": 8.708646756841421e-08, + "loss": 0.7105211, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 15111, + "time_per_iteration": 2.505744457244873 + }, + { + "auxiliary_loss_clip": 0.01022286, + "auxiliary_loss_mlp": 0.01003787, + "balance_loss_clip": 1.0028336, + "balance_loss_mlp": 1.00213766, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6884098110857299, + "language_loss": 0.51761699, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53787768, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15112, + "time_per_iteration": 3.103571891784668 + }, + { + "auxiliary_loss_clip": 0.01099285, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.02190137, + "balance_loss_mlp": 1.03310108, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 2.8732583943790106, + "language_loss": 0.69663835, + "learning_rate": 8.685926514226837e-08, + "loss": 0.717969, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 15113, + "time_per_iteration": 2.4426491260528564 + }, + { + "auxiliary_loss_clip": 0.01100751, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.01948047, + "balance_loss_mlp": 1.03551614, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.070851389766841, + "language_loss": 0.79043949, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81175292, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 15114, + "time_per_iteration": 2.5486881732940674 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.01682091, + "balance_loss_mlp": 1.03576267, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 1.912898599360711, + "language_loss": 0.70125389, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72259498, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 15115, + "time_per_iteration": 2.4841859340667725 + }, + { + "auxiliary_loss_clip": 0.01107149, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.01529849, + "balance_loss_mlp": 1.03841996, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.5429378275773247, + "language_loss": 0.6537832, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67513043, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 15116, + "time_per_iteration": 2.473940849304199 + }, + { + "auxiliary_loss_clip": 0.01098134, + "auxiliary_loss_mlp": 0.01031, + "balance_loss_clip": 1.01934111, + "balance_loss_mlp": 1.03502166, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 2.042140958343974, + "language_loss": 0.69371068, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71500206, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6328125, + "step": 15117, + "time_per_iteration": 2.446790933609009 + }, + { + "auxiliary_loss_clip": 0.01097454, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.01577044, + "balance_loss_mlp": 1.03358901, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.3695155958965473, + "language_loss": 0.74376065, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76499993, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15118, + "time_per_iteration": 2.5014708042144775 + }, + { + "auxiliary_loss_clip": 0.0110317, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.02320337, + "balance_loss_mlp": 1.03314781, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 1.9496085860270096, + "language_loss": 0.72797048, + "learning_rate": 8.617939911716554e-08, + "loss": 0.74936414, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 15119, + "time_per_iteration": 2.438727617263794 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01900196, + "balance_loss_mlp": 1.03510284, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.433261916174762, + "language_loss": 0.71455759, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73591691, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 15120, + "time_per_iteration": 2.4505975246429443 + }, + { + "auxiliary_loss_clip": 0.01100898, + "auxiliary_loss_mlp": 0.01024815, + "balance_loss_clip": 1.01320434, + "balance_loss_mlp": 1.03470683, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.262414929698902, + "language_loss": 0.65746844, + "learning_rate": 8.595335764115596e-08, + "loss": 0.6787256, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15121, + "time_per_iteration": 2.460604667663574 + }, + { + "auxiliary_loss_clip": 0.0110018, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.02520478, + "balance_loss_mlp": 1.03480613, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.7042236575270435, + "language_loss": 0.70428181, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72565359, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15122, + "time_per_iteration": 2.750230550765991 + }, + { + "auxiliary_loss_clip": 0.01095884, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.01950181, + "balance_loss_mlp": 1.03195405, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.642128872010493, + "language_loss": 0.74480474, + "learning_rate": 8.572760648850575e-08, + "loss": 0.7660687, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15123, + "time_per_iteration": 2.4654879570007324 + }, + { + "auxiliary_loss_clip": 0.01096662, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.01699638, + "balance_loss_mlp": 1.03369832, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 2.1206798390190262, + "language_loss": 0.75936723, + "learning_rate": 8.561483979414253e-08, + "loss": 0.78061569, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62890625, + "step": 15124, + "time_per_iteration": 2.52099871635437 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01027911, + "balance_loss_clip": 1.0162288, + "balance_loss_mlp": 1.03414643, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 1.8194159456969368, + "language_loss": 0.71981823, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74108458, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.64453125, + "step": 15125, + "time_per_iteration": 2.4560205936431885 + }, + { + "auxiliary_loss_clip": 0.01100093, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02221918, + "balance_loss_mlp": 1.03570986, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.7563994353439563, + "language_loss": 0.79251933, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81385386, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 15126, + "time_per_iteration": 2.453873634338379 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.01834726, + "balance_loss_mlp": 1.03453374, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.6858763674197714, + "language_loss": 0.75407279, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77535784, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6328125, + "step": 15127, + "time_per_iteration": 2.4735212326049805 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.03267837, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.896028987047219, + "language_loss": 0.6233058, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64461499, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15128, + "time_per_iteration": 2.4959044456481934 + }, + { + "auxiliary_loss_clip": 0.01096673, + "auxiliary_loss_mlp": 0.01023024, + "balance_loss_clip": 1.01165116, + "balance_loss_mlp": 1.03295422, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.5525985783311769, + "language_loss": 0.76395273, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78514969, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 15129, + "time_per_iteration": 2.4258792400360107 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.0159955, + "balance_loss_mlp": 1.03430605, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 2.1640381281313377, + "language_loss": 0.83347154, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85474968, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15130, + "time_per_iteration": 2.474724054336548 + }, + { + "auxiliary_loss_clip": 0.01102705, + "auxiliary_loss_mlp": 0.01029426, + "balance_loss_clip": 1.01755881, + "balance_loss_mlp": 1.03607357, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.6552952857551617, + "language_loss": 0.7494061, + "learning_rate": 8.482750579567644e-08, + "loss": 0.7707274, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 15131, + "time_per_iteration": 2.592521905899048 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.0159862, + "balance_loss_mlp": 1.03632045, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 2.576621383349304, + "language_loss": 0.5961653, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61745852, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15132, + "time_per_iteration": 2.5617480278015137 + }, + { + "auxiliary_loss_clip": 0.01101277, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.01764417, + "balance_loss_mlp": 1.03624594, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.5851128312655174, + "language_loss": 0.82403994, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84533644, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 15133, + "time_per_iteration": 2.457850456237793 + }, + { + "auxiliary_loss_clip": 0.01099344, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01784801, + "balance_loss_mlp": 1.03306341, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.8998682827883817, + "language_loss": 0.7366299, + "learning_rate": 8.449116620695118e-08, + "loss": 0.75791919, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 15134, + "time_per_iteration": 2.5147948265075684 + }, + { + "auxiliary_loss_clip": 0.01104628, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.02057791, + "balance_loss_mlp": 1.03571355, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.4223380746982386, + "language_loss": 0.72740394, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74877417, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 15135, + "time_per_iteration": 2.4703848361968994 + }, + { + "auxiliary_loss_clip": 0.01098783, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.01942205, + "balance_loss_mlp": 1.03540444, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.7021694614162164, + "language_loss": 0.70180988, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72310776, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 15136, + "time_per_iteration": 2.487475872039795 + }, + { + "auxiliary_loss_clip": 0.01021711, + "auxiliary_loss_mlp": 0.01001642, + "balance_loss_clip": 1.00064075, + "balance_loss_mlp": 1.00169659, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8227515384333137, + "language_loss": 0.59297395, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61320746, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 15137, + "time_per_iteration": 4.198936700820923 + }, + { + "auxiliary_loss_clip": 0.01100339, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.02104831, + "balance_loss_mlp": 1.03473079, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.6400889092436695, + "language_loss": 0.82225323, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84357858, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15138, + "time_per_iteration": 3.895996570587158 + }, + { + "auxiliary_loss_clip": 0.01095947, + "auxiliary_loss_mlp": 0.01024499, + "balance_loss_clip": 1.01367462, + "balance_loss_mlp": 1.03384078, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.430309258083403, + "language_loss": 0.81232274, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83352715, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62109375, + "step": 15139, + "time_per_iteration": 3.915422201156616 + }, + { + "auxiliary_loss_clip": 0.01099972, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.01819634, + "balance_loss_mlp": 1.0353173, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.715308850913459, + "language_loss": 0.77420986, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79550105, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 15140, + "time_per_iteration": 2.482774019241333 + }, + { + "auxiliary_loss_clip": 0.01098266, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.01683044, + "balance_loss_mlp": 1.03335404, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.5748509366300032, + "language_loss": 0.66406751, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68533105, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 15141, + "time_per_iteration": 2.627438545227051 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02277482, + "balance_loss_mlp": 1.03375959, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.8835074175782365, + "language_loss": 0.74966937, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77101701, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 15142, + "time_per_iteration": 2.4914710521698 + }, + { + "auxiliary_loss_clip": 0.01097437, + "auxiliary_loss_mlp": 0.01031506, + "balance_loss_clip": 1.02074134, + "balance_loss_mlp": 1.03283298, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.5812700495772496, + "language_loss": 0.64177603, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66306543, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 15143, + "time_per_iteration": 3.8821182250976562 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.02014375, + "balance_loss_mlp": 1.03280878, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 2.535789613284141, + "language_loss": 0.61168027, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63299894, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 15144, + "time_per_iteration": 2.569805145263672 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01023873, + "balance_loss_clip": 1.0129174, + "balance_loss_mlp": 1.03423166, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.5502656726328978, + "language_loss": 0.71112603, + "learning_rate": 8.326351491278382e-08, + "loss": 0.7323277, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62109375, + "step": 15145, + "time_per_iteration": 2.4523370265960693 + }, + { + "auxiliary_loss_clip": 0.01095165, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.01897335, + "balance_loss_mlp": 1.03254509, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 2.6249529159615514, + "language_loss": 0.70575893, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72701019, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 15146, + "time_per_iteration": 2.532625436782837 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.01795065, + "balance_loss_mlp": 1.03296185, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 2.0360232738172788, + "language_loss": 0.72551036, + "learning_rate": 8.304125029872233e-08, + "loss": 0.74677646, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 15147, + "time_per_iteration": 2.4464521408081055 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.01392555, + "balance_loss_mlp": 1.03255963, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 2.0097623075783235, + "language_loss": 0.80071878, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82199109, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 15148, + "time_per_iteration": 2.459246873855591 + }, + { + "auxiliary_loss_clip": 0.01099453, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.02068663, + "balance_loss_mlp": 1.03310466, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 2.144518085707252, + "language_loss": 0.68096125, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70227742, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15149, + "time_per_iteration": 2.4559459686279297 + }, + { + "auxiliary_loss_clip": 0.01100315, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.0136447, + "balance_loss_mlp": 1.03507054, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.4816254135406823, + "language_loss": 0.63344759, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65470982, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65234375, + "step": 15150, + "time_per_iteration": 2.5041751861572266 + }, + { + "auxiliary_loss_clip": 0.01097831, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.02024317, + "balance_loss_mlp": 1.03323102, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 2.0033802871305166, + "language_loss": 0.72777343, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74907029, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15151, + "time_per_iteration": 2.4370014667510986 + }, + { + "auxiliary_loss_clip": 0.01099185, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.01581335, + "balance_loss_mlp": 1.03446126, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.71458072737329, + "language_loss": 0.64443874, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66570008, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15152, + "time_per_iteration": 2.502570867538452 + }, + { + "auxiliary_loss_clip": 0.01100177, + "auxiliary_loss_mlp": 0.01025715, + "balance_loss_clip": 1.01365113, + "balance_loss_mlp": 1.03537905, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 2.146338606112044, + "language_loss": 0.73740828, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75866711, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 15153, + "time_per_iteration": 2.4459383487701416 + }, + { + "auxiliary_loss_clip": 0.01101084, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01843357, + "balance_loss_mlp": 1.03521991, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 2.0173388878771843, + "language_loss": 0.72387171, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74517953, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 15154, + "time_per_iteration": 2.4318478107452393 + }, + { + "auxiliary_loss_clip": 0.01099774, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.01911378, + "balance_loss_mlp": 1.03551435, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 1.8402378708668206, + "language_loss": 0.81793249, + "learning_rate": 8.215509982963564e-08, + "loss": 0.83922905, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 15155, + "time_per_iteration": 2.4388551712036133 + }, + { + "auxiliary_loss_clip": 0.01100349, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.01544142, + "balance_loss_mlp": 1.03586698, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.9446654713902813, + "language_loss": 0.5985598, + "learning_rate": 8.204465823887252e-08, + "loss": 0.61983585, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 15156, + "time_per_iteration": 2.4715144634246826 + }, + { + "auxiliary_loss_clip": 0.01101793, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01614261, + "balance_loss_mlp": 1.03311276, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 1.8459673274861486, + "language_loss": 0.73944056, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76074564, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 15157, + "time_per_iteration": 2.477900505065918 + }, + { + "auxiliary_loss_clip": 0.01098535, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01842475, + "balance_loss_mlp": 1.03296149, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.9741032786689436, + "language_loss": 0.59582591, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61710095, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 15158, + "time_per_iteration": 2.5578408241271973 + }, + { + "auxiliary_loss_clip": 0.01099182, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.02256203, + "balance_loss_mlp": 1.0345186, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.5097525180597062, + "language_loss": 0.67755049, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69887447, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 15159, + "time_per_iteration": 2.452134132385254 + }, + { + "auxiliary_loss_clip": 0.01100265, + "auxiliary_loss_mlp": 0.01027178, + "balance_loss_clip": 1.01585364, + "balance_loss_mlp": 1.03467369, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 1.8061769242302645, + "language_loss": 0.7828775, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80415201, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15160, + "time_per_iteration": 2.4899487495422363 + }, + { + "auxiliary_loss_clip": 0.01101577, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.01589894, + "balance_loss_mlp": 1.03610229, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6956442783060977, + "language_loss": 0.69036943, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71166778, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 15161, + "time_per_iteration": 2.3957245349884033 + }, + { + "auxiliary_loss_clip": 0.01101547, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01952457, + "balance_loss_mlp": 1.03551626, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.6456282079841216, + "language_loss": 0.76603878, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78737247, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 15162, + "time_per_iteration": 2.513727903366089 + }, + { + "auxiliary_loss_clip": 0.01099317, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01908267, + "balance_loss_mlp": 1.03456116, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.8294416135843556, + "language_loss": 0.66720057, + "learning_rate": 8.127360375135395e-08, + "loss": 0.6884985, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15163, + "time_per_iteration": 2.4560275077819824 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.02094579, + "balance_loss_mlp": 1.03538442, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.184740599613921, + "language_loss": 0.70217323, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72354388, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 15164, + "time_per_iteration": 2.4589922428131104 + }, + { + "auxiliary_loss_clip": 0.01098923, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.0209229, + "balance_loss_mlp": 1.03687394, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.4332025157922594, + "language_loss": 0.75946969, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78077805, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 15165, + "time_per_iteration": 2.459932565689087 + }, + { + "auxiliary_loss_clip": 0.01101128, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.01934457, + "balance_loss_mlp": 1.03483164, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.577228898134307, + "language_loss": 0.72376269, + "learning_rate": 8.094424311912074e-08, + "loss": 0.7450819, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15166, + "time_per_iteration": 2.5008912086486816 + }, + { + "auxiliary_loss_clip": 0.01100156, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02166939, + "balance_loss_mlp": 1.03380466, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.790469764052118, + "language_loss": 0.72797149, + "learning_rate": 8.083460177773482e-08, + "loss": 0.74931157, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 15167, + "time_per_iteration": 2.426950216293335 + }, + { + "auxiliary_loss_clip": 0.01022043, + "auxiliary_loss_mlp": 0.01000197, + "balance_loss_clip": 0.9992674, + "balance_loss_mlp": 1.00187171, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7700712875072107, + "language_loss": 0.65548205, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67570436, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20214844, + "step": 15168, + "time_per_iteration": 3.0059800148010254 + }, + { + "auxiliary_loss_clip": 0.01097014, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.0165627, + "balance_loss_mlp": 1.03288174, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 1.934423390877551, + "language_loss": 0.7840631, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80530739, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 15169, + "time_per_iteration": 2.3854548931121826 + }, + { + "auxiliary_loss_clip": 0.01098796, + "auxiliary_loss_mlp": 0.01029198, + "balance_loss_clip": 1.01809335, + "balance_loss_mlp": 1.03389323, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.8822447318945712, + "language_loss": 0.8215884, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84286833, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15170, + "time_per_iteration": 2.4406228065490723 + }, + { + "auxiliary_loss_clip": 0.01101631, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01971602, + "balance_loss_mlp": 1.03597105, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 2.184092599163872, + "language_loss": 0.77514195, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79647714, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 15171, + "time_per_iteration": 2.3917133808135986 + }, + { + "auxiliary_loss_clip": 0.01096383, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.0195179, + "balance_loss_mlp": 1.03134727, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.5433702960401063, + "language_loss": 0.66869926, + "learning_rate": 8.02874867780241e-08, + "loss": 0.68997288, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15172, + "time_per_iteration": 2.472642421722412 + }, + { + "auxiliary_loss_clip": 0.01101924, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01863933, + "balance_loss_mlp": 1.0358417, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.6574136999814857, + "language_loss": 0.75031823, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77163935, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15173, + "time_per_iteration": 2.4400486946105957 + }, + { + "auxiliary_loss_clip": 0.01106005, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.01966953, + "balance_loss_mlp": 1.03647041, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.201313503616394, + "language_loss": 0.65935463, + "learning_rate": 8.00691503189499e-08, + "loss": 0.680749, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.6953125, + "step": 15174, + "time_per_iteration": 2.4405062198638916 + }, + { + "auxiliary_loss_clip": 0.01101949, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.019593, + "balance_loss_mlp": 1.03539705, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.786987037941784, + "language_loss": 0.74865186, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76999688, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 15175, + "time_per_iteration": 2.5217480659484863 + }, + { + "auxiliary_loss_clip": 0.01021805, + "auxiliary_loss_mlp": 0.01000925, + "balance_loss_clip": 0.99997121, + "balance_loss_mlp": 1.00181603, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9628700747874241, + "language_loss": 0.58435005, + "learning_rate": 7.985110507575421e-08, + "loss": 0.6045773, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 15176, + "time_per_iteration": 3.0988872051239014 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.02064109, + "balance_loss_mlp": 1.03451729, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.8294542789280668, + "language_loss": 0.65551788, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67683637, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15177, + "time_per_iteration": 2.4310319423675537 + }, + { + "auxiliary_loss_clip": 0.01098513, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.01526892, + "balance_loss_mlp": 1.0335021, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 1.8886919295946252, + "language_loss": 0.81066203, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83191073, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15178, + "time_per_iteration": 3.8186910152435303 + }, + { + "auxiliary_loss_clip": 0.01099546, + "auxiliary_loss_mlp": 0.01027725, + "balance_loss_clip": 1.01648307, + "balance_loss_mlp": 1.03465581, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 2.0463085825275034, + "language_loss": 0.78655928, + "learning_rate": 7.952458331306711e-08, + "loss": 0.807832, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15179, + "time_per_iteration": 2.4364078044891357 + }, + { + "auxiliary_loss_clip": 0.01097377, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.0196358, + "balance_loss_mlp": 1.03346872, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 2.3074557537975626, + "language_loss": 0.68364185, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70492381, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15180, + "time_per_iteration": 3.9669907093048096 + }, + { + "auxiliary_loss_clip": 0.01096034, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.01565289, + "balance_loss_mlp": 1.03221154, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.6615442827017741, + "language_loss": 0.75214398, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77336842, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15181, + "time_per_iteration": 3.870901584625244 + }, + { + "auxiliary_loss_clip": 0.01103971, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.01850748, + "balance_loss_mlp": 1.03598988, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 2.297286307607851, + "language_loss": 0.74843061, + "learning_rate": 7.919871697194614e-08, + "loss": 0.7697711, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 15182, + "time_per_iteration": 2.4776079654693604 + }, + { + "auxiliary_loss_clip": 0.01100627, + "auxiliary_loss_mlp": 0.01029232, + "balance_loss_clip": 1.01718605, + "balance_loss_mlp": 1.03372836, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 3.985980312543223, + "language_loss": 0.76413208, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78543067, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 15183, + "time_per_iteration": 2.468374729156494 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.01908207, + "balance_loss_mlp": 1.03510523, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.281455397208263, + "language_loss": 0.76592457, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78725952, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 15184, + "time_per_iteration": 2.421410322189331 + }, + { + "auxiliary_loss_clip": 0.01102745, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.02141702, + "balance_loss_mlp": 1.03666544, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.638011852931889, + "language_loss": 0.74281073, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76416671, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15185, + "time_per_iteration": 3.940502166748047 + }, + { + "auxiliary_loss_clip": 0.01098799, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.01797581, + "balance_loss_mlp": 1.03400433, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.0531657489206188, + "language_loss": 0.68440223, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70568401, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15186, + "time_per_iteration": 2.4343931674957275 + }, + { + "auxiliary_loss_clip": 0.01108195, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.02074695, + "balance_loss_mlp": 1.03696275, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 2.6512076231674806, + "language_loss": 0.77220356, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79362077, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 15187, + "time_per_iteration": 2.438368558883667 + }, + { + "auxiliary_loss_clip": 0.01098925, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.01903725, + "balance_loss_mlp": 1.03280544, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 2.0471267389391024, + "language_loss": 0.66164011, + "learning_rate": 7.854895099902515e-08, + "loss": 0.68292546, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 15188, + "time_per_iteration": 2.484286069869995 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01933336, + "balance_loss_mlp": 1.03323078, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 2.3139671403974824, + "language_loss": 0.76142931, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78270352, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 15189, + "time_per_iteration": 2.4128341674804688 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.01762676, + "balance_loss_mlp": 1.03456092, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.6090293232992543, + "language_loss": 0.75407052, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77533972, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15190, + "time_per_iteration": 2.435072660446167 + }, + { + "auxiliary_loss_clip": 0.01021871, + "auxiliary_loss_mlp": 0.01001637, + "balance_loss_clip": 1.00067699, + "balance_loss_mlp": 1.00190675, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.7094888016430416, + "language_loss": 0.57359248, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59382761, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.19921875, + "step": 15191, + "time_per_iteration": 3.087395429611206 + }, + { + "auxiliary_loss_clip": 0.01102347, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.02263618, + "balance_loss_mlp": 1.03504705, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.6777146532024645, + "language_loss": 0.73936659, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76073611, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15192, + "time_per_iteration": 2.4690206050872803 + }, + { + "auxiliary_loss_clip": 0.01099289, + "auxiliary_loss_mlp": 0.01024504, + "balance_loss_clip": 1.0133816, + "balance_loss_mlp": 1.03559685, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.6421376517617297, + "language_loss": 0.69312721, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71436512, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 15193, + "time_per_iteration": 2.6223676204681396 + }, + { + "auxiliary_loss_clip": 0.01096991, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02556682, + "balance_loss_mlp": 1.03420997, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.485940174271116, + "language_loss": 0.73231626, + "learning_rate": 7.790180804400215e-08, + "loss": 0.7536515, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 15194, + "time_per_iteration": 2.463771343231201 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.01946473, + "balance_loss_mlp": 1.03339386, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 2.0304511645120455, + "language_loss": 0.61398089, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63532686, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15195, + "time_per_iteration": 2.453814744949341 + }, + { + "auxiliary_loss_clip": 0.01099363, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.01902771, + "balance_loss_mlp": 1.0335803, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.5172036842114138, + "language_loss": 0.7131865, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73448968, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15196, + "time_per_iteration": 2.4799444675445557 + }, + { + "auxiliary_loss_clip": 0.01100065, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01855159, + "balance_loss_mlp": 1.03487432, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.4441541156938638, + "language_loss": 0.71125305, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73255193, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 15197, + "time_per_iteration": 2.5215611457824707 + }, + { + "auxiliary_loss_clip": 0.01100431, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.01447809, + "balance_loss_mlp": 1.03330481, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.889161439888729, + "language_loss": 0.77785528, + "learning_rate": 7.747183707589489e-08, + "loss": 0.79912317, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15198, + "time_per_iteration": 2.442277431488037 + }, + { + "auxiliary_loss_clip": 0.01096101, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01853919, + "balance_loss_mlp": 1.03312182, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.3365328248407828, + "language_loss": 0.6804055, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70166075, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 15199, + "time_per_iteration": 2.4880144596099854 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.02183509, + "balance_loss_mlp": 1.03505015, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.6447593727576186, + "language_loss": 0.67633069, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69767076, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15200, + "time_per_iteration": 2.5086967945098877 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.01860261, + "balance_loss_mlp": 1.03541744, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.5101416705919046, + "language_loss": 0.71488059, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73615378, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 15201, + "time_per_iteration": 2.4748075008392334 + }, + { + "auxiliary_loss_clip": 0.01098308, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.01603663, + "balance_loss_mlp": 1.03388548, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.6982723466196472, + "language_loss": 0.70671141, + "learning_rate": 7.704303254710165e-08, + "loss": 0.72795826, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 15202, + "time_per_iteration": 2.438340902328491 + }, + { + "auxiliary_loss_clip": 0.01099499, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.01694536, + "balance_loss_mlp": 1.033728, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.8389143289614247, + "language_loss": 0.66278571, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68406761, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15203, + "time_per_iteration": 2.479570150375366 + }, + { + "auxiliary_loss_clip": 0.01101513, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.03622997, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.6116143834320078, + "language_loss": 0.68694603, + "learning_rate": 7.682906777877751e-08, + "loss": 0.70827323, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65234375, + "step": 15204, + "time_per_iteration": 2.4571127891540527 + }, + { + "auxiliary_loss_clip": 0.01097969, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.01541436, + "balance_loss_mlp": 1.03155589, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 3.922644674867668, + "language_loss": 0.59307611, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61432767, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 15205, + "time_per_iteration": 2.4621520042419434 + }, + { + "auxiliary_loss_clip": 0.0109613, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.02129984, + "balance_loss_mlp": 1.03344274, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.8519620978555191, + "language_loss": 0.81337631, + "learning_rate": 7.661539471778811e-08, + "loss": 0.8346653, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.625, + "step": 15206, + "time_per_iteration": 2.4877803325653076 + }, + { + "auxiliary_loss_clip": 0.01098654, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.01667953, + "balance_loss_mlp": 1.03213692, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.0513534201976173, + "language_loss": 0.73153603, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75280422, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 15207, + "time_per_iteration": 2.4546096324920654 + }, + { + "auxiliary_loss_clip": 0.01099504, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0210495, + "balance_loss_mlp": 1.03416693, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.7864801619442867, + "language_loss": 0.72906077, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75038886, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 15208, + "time_per_iteration": 2.4418113231658936 + }, + { + "auxiliary_loss_clip": 0.01098905, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.01648736, + "balance_loss_mlp": 1.03465152, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.1695224454551423, + "language_loss": 0.86409903, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88536251, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 15209, + "time_per_iteration": 2.413148880004883 + }, + { + "auxiliary_loss_clip": 0.01098935, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.0244627, + "balance_loss_mlp": 1.03434813, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.735664955022414, + "language_loss": 0.75140452, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77274966, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15210, + "time_per_iteration": 2.493086099624634 + }, + { + "auxiliary_loss_clip": 0.01097401, + "auxiliary_loss_mlp": 0.01030994, + "balance_loss_clip": 1.01927555, + "balance_loss_mlp": 1.0315125, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.7521937388781827, + "language_loss": 0.77584058, + "learning_rate": 7.6082488497488e-08, + "loss": 0.79712451, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15211, + "time_per_iteration": 2.4678258895874023 + }, + { + "auxiliary_loss_clip": 0.01100975, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01571679, + "balance_loss_mlp": 1.03529775, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.715606657088832, + "language_loss": 0.82844532, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84972233, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15212, + "time_per_iteration": 2.4359114170074463 + }, + { + "auxiliary_loss_clip": 0.01096761, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01680315, + "balance_loss_mlp": 1.03358889, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.7611506527071346, + "language_loss": 0.83891743, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86015809, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 15213, + "time_per_iteration": 2.4092206954956055 + }, + { + "auxiliary_loss_clip": 0.01099693, + "auxiliary_loss_mlp": 0.01025121, + "balance_loss_clip": 1.01386786, + "balance_loss_mlp": 1.03436995, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 2.085343861067481, + "language_loss": 0.70816439, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72941256, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15214, + "time_per_iteration": 2.447380781173706 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.02051485, + "balance_loss_mlp": 1.03580141, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.5390932185045392, + "language_loss": 0.62629873, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64766777, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 15215, + "time_per_iteration": 2.461411476135254 + }, + { + "auxiliary_loss_clip": 0.01104348, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.01738763, + "balance_loss_mlp": 1.03553486, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.721534331397324, + "language_loss": 0.75732076, + "learning_rate": 7.555140615567058e-08, + "loss": 0.77865899, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 15216, + "time_per_iteration": 2.523115873336792 + }, + { + "auxiliary_loss_clip": 0.01100136, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.02305031, + "balance_loss_mlp": 1.03539312, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.4268000375661773, + "language_loss": 0.68142593, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70278037, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 15217, + "time_per_iteration": 2.4448487758636475 + }, + { + "auxiliary_loss_clip": 0.01098239, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.01877725, + "balance_loss_mlp": 1.03458583, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.8653431496405544, + "language_loss": 0.79877293, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82005119, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15218, + "time_per_iteration": 2.434863567352295 + }, + { + "auxiliary_loss_clip": 0.01021776, + "auxiliary_loss_mlp": 0.01000225, + "balance_loss_clip": 0.99925387, + "balance_loss_mlp": 1.00182867, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8896822050183594, + "language_loss": 0.59232152, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61254156, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 15219, + "time_per_iteration": 2.9880809783935547 + }, + { + "auxiliary_loss_clip": 0.01097663, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.01906729, + "balance_loss_mlp": 1.03414941, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 1.812367414833016, + "language_loss": 0.78320539, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80448759, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 15220, + "time_per_iteration": 3.818652391433716 + }, + { + "auxiliary_loss_clip": 0.01102017, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.01826453, + "balance_loss_mlp": 1.03302217, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 1.8962710431659022, + "language_loss": 0.65642536, + "learning_rate": 7.50221481958031e-08, + "loss": 0.67775571, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 15221, + "time_per_iteration": 2.4236178398132324 + }, + { + "auxiliary_loss_clip": 0.01098425, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.01696539, + "balance_loss_mlp": 1.03305852, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.6737011453646373, + "language_loss": 0.8425433, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86380607, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15222, + "time_per_iteration": 3.9798471927642822 + }, + { + "auxiliary_loss_clip": 0.01022041, + "auxiliary_loss_mlp": 0.01001485, + "balance_loss_clip": 1.00054312, + "balance_loss_mlp": 1.00207162, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7306316562738401, + "language_loss": 0.49642789, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51666313, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20019531, + "step": 15223, + "time_per_iteration": 4.616261720657349 + }, + { + "auxiliary_loss_clip": 0.01102367, + "auxiliary_loss_mlp": 0.01037801, + "balance_loss_clip": 1.02548087, + "balance_loss_mlp": 1.03582227, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 1.708315664523414, + "language_loss": 0.7237857, + "learning_rate": 7.470546933201349e-08, + "loss": 0.7451874, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 15224, + "time_per_iteration": 2.4585115909576416 + }, + { + "auxiliary_loss_clip": 0.01097737, + "auxiliary_loss_mlp": 0.01025257, + "balance_loss_clip": 1.0136168, + "balance_loss_mlp": 1.03346014, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.7724637972030735, + "language_loss": 0.81216443, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83339441, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15225, + "time_per_iteration": 2.4403467178344727 + }, + { + "auxiliary_loss_clip": 0.01097606, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.01470423, + "balance_loss_mlp": 1.03278244, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.266236399456709, + "language_loss": 0.71322179, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73445308, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15226, + "time_per_iteration": 2.541335105895996 + }, + { + "auxiliary_loss_clip": 0.01100546, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01837564, + "balance_loss_mlp": 1.03350449, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.9966844593099904, + "language_loss": 0.74624139, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76754689, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15227, + "time_per_iteration": 3.8971879482269287 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.01703024, + "balance_loss_mlp": 1.03465962, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4148019926474746, + "language_loss": 0.73699552, + "learning_rate": 7.428425296864404e-08, + "loss": 0.75826436, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 15228, + "time_per_iteration": 2.48069429397583 + }, + { + "auxiliary_loss_clip": 0.01095291, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.02024984, + "balance_loss_mlp": 1.03221262, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.5305780770473314, + "language_loss": 0.71960795, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74087429, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 15229, + "time_per_iteration": 2.444805145263672 + }, + { + "auxiliary_loss_clip": 0.011021, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.02018356, + "balance_loss_mlp": 1.03612995, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 2.0470204935534984, + "language_loss": 0.83144408, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85278797, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 15230, + "time_per_iteration": 2.432034730911255 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01935065, + "balance_loss_mlp": 1.03477907, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.6269282697769034, + "language_loss": 0.83418006, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85546893, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 15231, + "time_per_iteration": 2.515538215637207 + }, + { + "auxiliary_loss_clip": 0.01095817, + "auxiliary_loss_mlp": 0.01024393, + "balance_loss_clip": 1.01322293, + "balance_loss_mlp": 1.03172684, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.829982250303586, + "language_loss": 0.72207046, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74327254, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15232, + "time_per_iteration": 2.5557878017425537 + }, + { + "auxiliary_loss_clip": 0.01100605, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.01812696, + "balance_loss_mlp": 1.03404856, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 4.73970403036583, + "language_loss": 0.67340308, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69470394, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 15233, + "time_per_iteration": 2.4151556491851807 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.01797664, + "balance_loss_mlp": 1.03488588, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 2.2571803490205564, + "language_loss": 0.6969521, + "learning_rate": 7.365461920317861e-08, + "loss": 0.7182653, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 15234, + "time_per_iteration": 2.444852828979492 + }, + { + "auxiliary_loss_clip": 0.01100736, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.01809597, + "balance_loss_mlp": 1.03512609, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.7294981323630823, + "language_loss": 0.87835944, + "learning_rate": 7.354993588431391e-08, + "loss": 0.89966768, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15235, + "time_per_iteration": 2.4612205028533936 + }, + { + "auxiliary_loss_clip": 0.01102081, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.01839912, + "balance_loss_mlp": 1.03525317, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.5527464257030497, + "language_loss": 0.76839787, + "learning_rate": 7.344532561662853e-08, + "loss": 0.78972089, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 15236, + "time_per_iteration": 2.474457263946533 + }, + { + "auxiliary_loss_clip": 0.01021315, + "auxiliary_loss_mlp": 0.01003153, + "balance_loss_clip": 1.00213361, + "balance_loss_mlp": 1.0013386, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6788076551857354, + "language_loss": 0.62263203, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64287663, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.19921875, + "step": 15237, + "time_per_iteration": 3.0201759338378906 + }, + { + "auxiliary_loss_clip": 0.01101293, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.01718867, + "balance_loss_mlp": 1.03470826, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 3.0659105416988552, + "language_loss": 0.7453984, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76670301, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 15238, + "time_per_iteration": 2.404824733734131 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01528561, + "balance_loss_mlp": 1.0344367, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.6853760696818214, + "language_loss": 0.74746668, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76874375, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 15239, + "time_per_iteration": 2.4390740394592285 + }, + { + "auxiliary_loss_clip": 0.01100596, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.02131677, + "balance_loss_mlp": 1.034127, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.2129519581764496, + "language_loss": 0.63188612, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65322334, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15240, + "time_per_iteration": 2.424992322921753 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01023378, + "balance_loss_clip": 1.01264906, + "balance_loss_mlp": 1.03647375, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.824123817472358, + "language_loss": 0.76293588, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78417283, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 15241, + "time_per_iteration": 2.453367233276367 + }, + { + "auxiliary_loss_clip": 0.01107929, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.0154779, + "balance_loss_mlp": 1.03601336, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.7073047066041385, + "language_loss": 0.6746605, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69602692, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 15242, + "time_per_iteration": 2.4336512088775635 + }, + { + "auxiliary_loss_clip": 0.01099212, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.01624286, + "balance_loss_mlp": 1.03331637, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.754571362997044, + "language_loss": 0.80896854, + "learning_rate": 7.271509950872334e-08, + "loss": 0.83024061, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 15243, + "time_per_iteration": 2.426079750061035 + }, + { + "auxiliary_loss_clip": 0.01099771, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.01748323, + "balance_loss_mlp": 1.03221726, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.8762223959588424, + "language_loss": 0.8205328, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84182346, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 15244, + "time_per_iteration": 2.506777286529541 + }, + { + "auxiliary_loss_clip": 0.01104451, + "auxiliary_loss_mlp": 0.01034704, + "balance_loss_clip": 1.02243781, + "balance_loss_mlp": 1.03492165, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.648614204029591, + "language_loss": 0.72213554, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74352717, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 15245, + "time_per_iteration": 2.4680283069610596 + }, + { + "auxiliary_loss_clip": 0.01098467, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01930237, + "balance_loss_mlp": 1.03360033, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6235640253578716, + "language_loss": 0.74646342, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76775181, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15246, + "time_per_iteration": 2.398216485977173 + }, + { + "auxiliary_loss_clip": 0.01099557, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.01808476, + "balance_loss_mlp": 1.03437448, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 2.0593596154006355, + "language_loss": 0.75462282, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77592111, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 15247, + "time_per_iteration": 2.4263362884521484 + }, + { + "auxiliary_loss_clip": 0.01103714, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.01507568, + "balance_loss_mlp": 1.03711426, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.6801453014221095, + "language_loss": 0.75884688, + "learning_rate": 7.219570183756052e-08, + "loss": 0.78015435, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15248, + "time_per_iteration": 2.4508020877838135 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.02130914, + "balance_loss_mlp": 1.03448176, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.1653803876672733, + "language_loss": 0.72892481, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75027955, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 15249, + "time_per_iteration": 2.5009031295776367 + }, + { + "auxiliary_loss_clip": 0.01101285, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.01516008, + "balance_loss_mlp": 1.03550017, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 1.997505884872102, + "language_loss": 0.76246959, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78375459, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 15250, + "time_per_iteration": 2.407268762588501 + }, + { + "auxiliary_loss_clip": 0.01097645, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.0139761, + "balance_loss_mlp": 1.03325534, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.9158953461140582, + "language_loss": 0.75737274, + "learning_rate": 7.188494043374138e-08, + "loss": 0.77860475, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15251, + "time_per_iteration": 2.449289083480835 + }, + { + "auxiliary_loss_clip": 0.01103639, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01801276, + "balance_loss_mlp": 1.03617382, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.01686517651722, + "language_loss": 0.79905111, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82039672, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 15252, + "time_per_iteration": 2.4550137519836426 + }, + { + "auxiliary_loss_clip": 0.01099547, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.03405333, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.525633221993305, + "language_loss": 0.7715137, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79285222, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 15253, + "time_per_iteration": 2.4307291507720947 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.01446629, + "balance_loss_mlp": 1.03608871, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.6220223812959684, + "language_loss": 0.73055267, + "learning_rate": 7.157483705875256e-08, + "loss": 0.7518326, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15254, + "time_per_iteration": 2.4426708221435547 + }, + { + "auxiliary_loss_clip": 0.01095915, + "auxiliary_loss_mlp": 0.01024577, + "balance_loss_clip": 1.01363969, + "balance_loss_mlp": 1.03274751, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.4975724788553886, + "language_loss": 0.79085529, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81206024, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 15255, + "time_per_iteration": 2.489227771759033 + }, + { + "auxiliary_loss_clip": 0.01101815, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.02150059, + "balance_loss_mlp": 1.03471398, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 2.0932584318696197, + "language_loss": 0.68286502, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70421427, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15256, + "time_per_iteration": 2.5796866416931152 + }, + { + "auxiliary_loss_clip": 0.01097785, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.0215044, + "balance_loss_mlp": 1.03404987, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.7133190759079449, + "language_loss": 0.83820814, + "learning_rate": 7.126539181842561e-08, + "loss": 0.85950905, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15257, + "time_per_iteration": 2.4700734615325928 + }, + { + "auxiliary_loss_clip": 0.0109807, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.02141452, + "balance_loss_mlp": 1.03438568, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.6589909857452685, + "language_loss": 0.77511317, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79641378, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15258, + "time_per_iteration": 2.4660658836364746 + }, + { + "auxiliary_loss_clip": 0.0110158, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.02131248, + "balance_loss_mlp": 1.0362736, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 1.8473842478714684, + "language_loss": 0.78595388, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80729687, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15259, + "time_per_iteration": 2.42170786857605 + }, + { + "auxiliary_loss_clip": 0.01096695, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.0201838, + "balance_loss_mlp": 1.03308225, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.5141201420761963, + "language_loss": 0.75849646, + "learning_rate": 7.095660481836895e-08, + "loss": 0.77977407, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15260, + "time_per_iteration": 2.4748823642730713 + }, + { + "auxiliary_loss_clip": 0.01096998, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.01732063, + "balance_loss_mlp": 1.0325402, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.439145182657997, + "language_loss": 0.61105782, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63231397, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 15261, + "time_per_iteration": 2.466932535171509 + }, + { + "auxiliary_loss_clip": 0.01097067, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.02064824, + "balance_loss_mlp": 1.03346276, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.6885035135611821, + "language_loss": 0.7386173, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75990927, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15262, + "time_per_iteration": 3.797211170196533 + }, + { + "auxiliary_loss_clip": 0.01101112, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.02048969, + "balance_loss_mlp": 1.03242636, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.9369196881857367, + "language_loss": 0.7737118, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79504776, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 15263, + "time_per_iteration": 3.865194797515869 + }, + { + "auxiliary_loss_clip": 0.01102159, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.02153111, + "balance_loss_mlp": 1.0338912, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.7551595930253303, + "language_loss": 0.75445127, + "learning_rate": 7.054591292971324e-08, + "loss": 0.77580231, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 15264, + "time_per_iteration": 3.919630527496338 + }, + { + "auxiliary_loss_clip": 0.0109944, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.02476895, + "balance_loss_mlp": 1.03444493, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.7698435079437604, + "language_loss": 0.8347168, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85606873, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 15265, + "time_per_iteration": 2.481045961380005 + }, + { + "auxiliary_loss_clip": 0.01105219, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02560663, + "balance_loss_mlp": 1.03556991, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.4754568923998763, + "language_loss": 0.73383772, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75527322, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 15266, + "time_per_iteration": 2.4675867557525635 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.02005005, + "balance_loss_mlp": 1.03352594, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.626905867062865, + "language_loss": 0.7739476, + "learning_rate": 7.023866223305486e-08, + "loss": 0.79525506, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 15267, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01021972, + "auxiliary_loss_mlp": 0.01000807, + "balance_loss_clip": 0.99981195, + "balance_loss_mlp": 1.00176847, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7378350855044539, + "language_loss": 0.56234527, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58257306, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20214844, + "step": 15268, + "time_per_iteration": 4.508407115936279 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.01027411, + "balance_loss_clip": 1.01537657, + "balance_loss_mlp": 1.03522551, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 4.464341061130245, + "language_loss": 0.76722169, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78852016, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15269, + "time_per_iteration": 2.4662179946899414 + }, + { + "auxiliary_loss_clip": 0.01099948, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.01819777, + "balance_loss_mlp": 1.03414619, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 2.0734442027372633, + "language_loss": 0.7271992, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74850571, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 15270, + "time_per_iteration": 2.628192186355591 + }, + { + "auxiliary_loss_clip": 0.01096334, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.01874113, + "balance_loss_mlp": 1.03209162, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.5736026721435314, + "language_loss": 0.79696983, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81823957, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 15271, + "time_per_iteration": 2.4931766986846924 + }, + { + "auxiliary_loss_clip": 0.01099187, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.01856041, + "balance_loss_mlp": 1.03308785, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 22.73674764658324, + "language_loss": 0.72910154, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75039482, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15272, + "time_per_iteration": 2.5102016925811768 + }, + { + "auxiliary_loss_clip": 0.01098094, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.02105904, + "balance_loss_mlp": 1.03331065, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 2.0644570408475404, + "language_loss": 0.72772151, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74902087, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15273, + "time_per_iteration": 2.465676784515381 + }, + { + "auxiliary_loss_clip": 0.01093024, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.01514888, + "balance_loss_mlp": 1.03164101, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.4208540999933033, + "language_loss": 0.74430341, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76548761, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.61328125, + "step": 15274, + "time_per_iteration": 2.4480597972869873 + }, + { + "auxiliary_loss_clip": 0.01099117, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02451599, + "balance_loss_mlp": 1.03252149, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.5689583484539182, + "language_loss": 0.6853776, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70672476, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 15275, + "time_per_iteration": 2.424501895904541 + }, + { + "auxiliary_loss_clip": 0.01096749, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.0169158, + "balance_loss_mlp": 1.03240776, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.7958542567594675, + "language_loss": 0.72359389, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74484551, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 15276, + "time_per_iteration": 2.4307353496551514 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.01707268, + "balance_loss_mlp": 1.03567207, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.745555104125903, + "language_loss": 0.73787761, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75916559, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 15277, + "time_per_iteration": 2.485466718673706 + }, + { + "auxiliary_loss_clip": 0.01021683, + "auxiliary_loss_mlp": 0.01000132, + "balance_loss_clip": 0.99918407, + "balance_loss_mlp": 1.00166464, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7154549156944336, + "language_loss": 0.59214282, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61236095, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20019531, + "step": 15278, + "time_per_iteration": 3.1485769748687744 + }, + { + "auxiliary_loss_clip": 0.01093924, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_clip": 1.01504803, + "balance_loss_mlp": 1.03094137, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.8937597400336486, + "language_loss": 0.64184052, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66303289, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 15279, + "time_per_iteration": 2.446822166442871 + }, + { + "auxiliary_loss_clip": 0.01021727, + "auxiliary_loss_mlp": 0.0100203, + "balance_loss_clip": 1.0009985, + "balance_loss_mlp": 1.00168133, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 1.2569457019920138, + "language_loss": 0.60211283, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62235039, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20019531, + "step": 15280, + "time_per_iteration": 3.04021954536438 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.01787281, + "balance_loss_mlp": 1.03354609, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 3.7758873171427108, + "language_loss": 0.69328892, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71458817, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 15281, + "time_per_iteration": 2.468998432159424 + }, + { + "auxiliary_loss_clip": 0.01101585, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.01645327, + "balance_loss_mlp": 1.03454149, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 2.9746368961886867, + "language_loss": 0.84552884, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86683255, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15282, + "time_per_iteration": 2.525195360183716 + }, + { + "auxiliary_loss_clip": 0.01097551, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.02076828, + "balance_loss_mlp": 1.03387153, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.7713920844445745, + "language_loss": 0.59634107, + "learning_rate": 6.861111726356194e-08, + "loss": 0.61763906, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 15283, + "time_per_iteration": 2.451240062713623 + }, + { + "auxiliary_loss_clip": 0.01103442, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.01844573, + "balance_loss_mlp": 1.03460884, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.5989024200960449, + "language_loss": 0.65525234, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67658782, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 15284, + "time_per_iteration": 2.4734537601470947 + }, + { + "auxiliary_loss_clip": 0.01098451, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01953125, + "balance_loss_mlp": 1.03279424, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.8968992519509786, + "language_loss": 0.7340166, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75531411, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15285, + "time_per_iteration": 2.475170612335205 + }, + { + "auxiliary_loss_clip": 0.01097989, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.01809406, + "balance_loss_mlp": 1.03306448, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 1.9961314364578988, + "language_loss": 0.71681088, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73808849, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 15286, + "time_per_iteration": 2.4291200637817383 + }, + { + "auxiliary_loss_clip": 0.01100256, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.0195086, + "balance_loss_mlp": 1.03459299, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 2.070932444160172, + "language_loss": 0.7353276, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75664449, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15287, + "time_per_iteration": 2.4521946907043457 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01641178, + "balance_loss_mlp": 1.03728819, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 1.955752098372023, + "language_loss": 0.65609306, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67741948, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 15288, + "time_per_iteration": 2.41819167137146 + }, + { + "auxiliary_loss_clip": 0.01104589, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.02197957, + "balance_loss_mlp": 1.0382545, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 1.9658810334985228, + "language_loss": 0.7114042, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73278284, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 15289, + "time_per_iteration": 2.5379581451416016 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.01940393, + "balance_loss_mlp": 1.03530157, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 1.7860154245672653, + "language_loss": 0.74310684, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76443219, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15290, + "time_per_iteration": 2.424091339111328 + }, + { + "auxiliary_loss_clip": 0.01098296, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01781464, + "balance_loss_mlp": 1.03389239, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 1.8214465731068186, + "language_loss": 0.72021568, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74148518, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 15291, + "time_per_iteration": 2.524446725845337 + }, + { + "auxiliary_loss_clip": 0.01098547, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.01701772, + "balance_loss_mlp": 1.03402042, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.7029706448967405, + "language_loss": 0.71118617, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73245227, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 15292, + "time_per_iteration": 2.452636957168579 + }, + { + "auxiliary_loss_clip": 0.01099113, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.01633954, + "balance_loss_mlp": 1.03386974, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 2.1375406938776416, + "language_loss": 0.73241705, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75368983, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15293, + "time_per_iteration": 2.475736141204834 + }, + { + "auxiliary_loss_clip": 0.01098791, + "auxiliary_loss_mlp": 0.01026354, + "balance_loss_clip": 1.01479709, + "balance_loss_mlp": 1.03441787, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 2.0643296988885456, + "language_loss": 0.7831043, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80435574, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 15294, + "time_per_iteration": 2.47523832321167 + }, + { + "auxiliary_loss_clip": 0.01103169, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.0187782, + "balance_loss_mlp": 1.03504705, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.4970432039566248, + "language_loss": 0.77283525, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79418302, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 15295, + "time_per_iteration": 2.4907798767089844 + }, + { + "auxiliary_loss_clip": 0.01097049, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01649904, + "balance_loss_mlp": 1.034621, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.4431811448351106, + "language_loss": 0.71476376, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73600036, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.625, + "step": 15296, + "time_per_iteration": 2.4911844730377197 + }, + { + "auxiliary_loss_clip": 0.0109984, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.0236125, + "balance_loss_mlp": 1.03452754, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.1011046418165704, + "language_loss": 0.75250423, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77384841, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 15297, + "time_per_iteration": 2.4321935176849365 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.01621604, + "balance_loss_mlp": 1.0333879, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.7401856236866056, + "language_loss": 0.73939699, + "learning_rate": 6.710232148647676e-08, + "loss": 0.76065761, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15298, + "time_per_iteration": 2.515803098678589 + }, + { + "auxiliary_loss_clip": 0.01101475, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.02229095, + "balance_loss_mlp": 1.03466356, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.0840712343344823, + "language_loss": 0.79339898, + "learning_rate": 6.70023213247175e-08, + "loss": 0.8147524, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 15299, + "time_per_iteration": 2.425823450088501 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01026097, + "balance_loss_clip": 1.01545727, + "balance_loss_mlp": 1.03452611, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 2.140408614905867, + "language_loss": 0.63948607, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66073537, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 15300, + "time_per_iteration": 2.441720724105835 + }, + { + "auxiliary_loss_clip": 0.01094075, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01806712, + "balance_loss_mlp": 1.03322458, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 2.56598231172926, + "language_loss": 0.69634527, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71756434, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.609375, + "step": 15301, + "time_per_iteration": 2.467337131500244 + }, + { + "auxiliary_loss_clip": 0.01103435, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.02025485, + "balance_loss_mlp": 1.03593671, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 1.8550315285188883, + "language_loss": 0.71411103, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73547888, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 15302, + "time_per_iteration": 2.43485426902771 + }, + { + "auxiliary_loss_clip": 0.01100893, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.01797462, + "balance_loss_mlp": 1.03467202, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.824853642117339, + "language_loss": 0.76358056, + "learning_rate": 6.660305371021579e-08, + "loss": 0.7848829, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 15303, + "time_per_iteration": 3.9412145614624023 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.0102885, + "balance_loss_clip": 1.01765084, + "balance_loss_mlp": 1.03600037, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.172207536480081, + "language_loss": 0.8759762, + "learning_rate": 6.650342008365006e-08, + "loss": 0.8972708, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 15304, + "time_per_iteration": 2.4575695991516113 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.01945925, + "balance_loss_mlp": 1.0359385, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 1.9620748105275532, + "language_loss": 0.7723875, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79375267, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.67578125, + "step": 15305, + "time_per_iteration": 3.863945484161377 + }, + { + "auxiliary_loss_clip": 0.01098868, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02194321, + "balance_loss_mlp": 1.03316355, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 2.2266234622398002, + "language_loss": 0.81643492, + "learning_rate": 6.630437278944501e-08, + "loss": 0.837762, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15306, + "time_per_iteration": 3.9599132537841797 + }, + { + "auxiliary_loss_clip": 0.01095421, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.01949441, + "balance_loss_mlp": 1.03234839, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 1.8130468972893734, + "language_loss": 0.71801835, + "learning_rate": 6.62049591293541e-08, + "loss": 0.73927486, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15307, + "time_per_iteration": 2.4384212493896484 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.01840997, + "balance_loss_mlp": 1.03425837, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.8060477218017867, + "language_loss": 0.78445113, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80576694, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 15308, + "time_per_iteration": 2.4439730644226074 + }, + { + "auxiliary_loss_clip": 0.01098129, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.01717603, + "balance_loss_mlp": 1.0328846, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 2.0932008233219968, + "language_loss": 0.77898622, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80025649, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 15309, + "time_per_iteration": 2.4441962242126465 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01026816, + "balance_loss_clip": 1.01493728, + "balance_loss_mlp": 1.03330636, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 2.4269802461242977, + "language_loss": 0.66559213, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68685448, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15310, + "time_per_iteration": 3.8965320587158203 + }, + { + "auxiliary_loss_clip": 0.01099035, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.0200541, + "balance_loss_mlp": 1.03259516, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6733324476894091, + "language_loss": 0.66091675, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68222356, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 15311, + "time_per_iteration": 2.444620132446289 + }, + { + "auxiliary_loss_clip": 0.01099034, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.02293885, + "balance_loss_mlp": 1.03247344, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.5846245764827986, + "language_loss": 0.75952655, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78085929, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 15312, + "time_per_iteration": 2.5009913444519043 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.02051091, + "balance_loss_mlp": 1.03418756, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.935220768084578, + "language_loss": 0.7918942, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81317198, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 15313, + "time_per_iteration": 2.477346420288086 + }, + { + "auxiliary_loss_clip": 0.01101793, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.01775026, + "balance_loss_mlp": 1.03461695, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.6811896715223988, + "language_loss": 0.78183317, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80314177, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 15314, + "time_per_iteration": 2.543661594390869 + }, + { + "auxiliary_loss_clip": 0.01103944, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01982164, + "balance_loss_mlp": 1.03507841, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.775196131409486, + "language_loss": 0.79086602, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81223094, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 15315, + "time_per_iteration": 2.4820241928100586 + }, + { + "auxiliary_loss_clip": 0.01103595, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.01689386, + "balance_loss_mlp": 1.03408074, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.7912978645182498, + "language_loss": 0.75935954, + "learning_rate": 6.531353647657156e-08, + "loss": 0.7806837, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 15316, + "time_per_iteration": 2.4458367824554443 + }, + { + "auxiliary_loss_clip": 0.01099953, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.02208638, + "balance_loss_mlp": 1.03305912, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.7344603926154347, + "language_loss": 0.6935131, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71485275, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15317, + "time_per_iteration": 2.461711883544922 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.01680803, + "balance_loss_mlp": 1.03546286, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.7038666370863202, + "language_loss": 0.83504558, + "learning_rate": 6.511624945603378e-08, + "loss": 0.8563422, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 15318, + "time_per_iteration": 2.5033764839172363 + }, + { + "auxiliary_loss_clip": 0.01100705, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01856208, + "balance_loss_mlp": 1.03522885, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 2.0250345502149774, + "language_loss": 0.85513151, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87643838, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15319, + "time_per_iteration": 2.434962511062622 + }, + { + "auxiliary_loss_clip": 0.01021706, + "auxiliary_loss_mlp": 0.01001621, + "balance_loss_clip": 1.00066721, + "balance_loss_mlp": 1.00159764, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7706053364589017, + "language_loss": 0.56186169, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58209497, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15320, + "time_per_iteration": 3.1476900577545166 + }, + { + "auxiliary_loss_clip": 0.01103341, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.02536726, + "balance_loss_mlp": 1.03501773, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.0044478271622053, + "language_loss": 0.63775176, + "learning_rate": 6.482086921695384e-08, + "loss": 0.65916359, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 15321, + "time_per_iteration": 2.4137425422668457 + }, + { + "auxiliary_loss_clip": 0.01095255, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.01625538, + "balance_loss_mlp": 1.03391385, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.5355415864744049, + "language_loss": 0.71481681, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73603833, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.61328125, + "step": 15322, + "time_per_iteration": 2.5061845779418945 + }, + { + "auxiliary_loss_clip": 0.01098655, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.02018511, + "balance_loss_mlp": 1.03329921, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.6914722744606074, + "language_loss": 0.70055711, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72185469, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 15323, + "time_per_iteration": 2.4158103466033936 + }, + { + "auxiliary_loss_clip": 0.01102274, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.01932168, + "balance_loss_mlp": 1.03389645, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 1.7459918912498436, + "language_loss": 0.74719346, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76853722, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 15324, + "time_per_iteration": 2.424887180328369 + }, + { + "auxiliary_loss_clip": 0.01101043, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.03482819, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 1.9858313128784937, + "language_loss": 0.71462083, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73602492, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15325, + "time_per_iteration": 2.4648244380950928 + }, + { + "auxiliary_loss_clip": 0.01098648, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.020401, + "balance_loss_mlp": 1.03404129, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.439709253059829, + "language_loss": 0.78404367, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80534875, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15326, + "time_per_iteration": 2.5329742431640625 + }, + { + "auxiliary_loss_clip": 0.01103679, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.02008581, + "balance_loss_mlp": 1.03579211, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 3.8016467363656514, + "language_loss": 0.71438289, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73574162, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 15327, + "time_per_iteration": 2.4995455741882324 + }, + { + "auxiliary_loss_clip": 0.01105492, + "auxiliary_loss_mlp": 0.01040397, + "balance_loss_clip": 1.02733731, + "balance_loss_mlp": 1.03710175, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.7374024208588212, + "language_loss": 0.78006065, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80151951, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15328, + "time_per_iteration": 2.4673023223876953 + }, + { + "auxiliary_loss_clip": 0.01098437, + "auxiliary_loss_mlp": 0.01027792, + "balance_loss_clip": 1.01674747, + "balance_loss_mlp": 1.03321588, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.2530455333427994, + "language_loss": 0.71567261, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73693484, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 15329, + "time_per_iteration": 2.495542526245117 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.01024122, + "balance_loss_clip": 1.01332712, + "balance_loss_mlp": 1.03311157, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 1.6555034439482308, + "language_loss": 0.86653769, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88775921, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15330, + "time_per_iteration": 2.410320281982422 + }, + { + "auxiliary_loss_clip": 0.01100084, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.01671255, + "balance_loss_mlp": 1.03359747, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.1488192808619555, + "language_loss": 0.75690323, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77818441, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 15331, + "time_per_iteration": 2.4592649936676025 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01027028, + "balance_loss_clip": 1.01572061, + "balance_loss_mlp": 1.0333581, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.671974459244748, + "language_loss": 0.75502098, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77627707, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 15332, + "time_per_iteration": 2.4462203979492188 + }, + { + "auxiliary_loss_clip": 0.01098277, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.01846027, + "balance_loss_mlp": 1.03282976, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.766508202244264, + "language_loss": 0.75169802, + "learning_rate": 6.364595366195358e-08, + "loss": 0.7729758, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15333, + "time_per_iteration": 2.4391255378723145 + }, + { + "auxiliary_loss_clip": 0.01021523, + "auxiliary_loss_mlp": 0.01002464, + "balance_loss_clip": 1.00151014, + "balance_loss_mlp": 1.0014323, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8201974860223076, + "language_loss": 0.52913523, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54937506, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15334, + "time_per_iteration": 3.0368025302886963 + }, + { + "auxiliary_loss_clip": 0.01096931, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.02174962, + "balance_loss_mlp": 1.03388023, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 2.0174871878969425, + "language_loss": 0.62107778, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64237422, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 15335, + "time_per_iteration": 2.4043402671813965 + }, + { + "auxiliary_loss_clip": 0.0109812, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01859975, + "balance_loss_mlp": 1.03406358, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 2.1329898068794906, + "language_loss": 0.71450561, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73579109, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 15336, + "time_per_iteration": 2.475250244140625 + }, + { + "auxiliary_loss_clip": 0.01094756, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.01813853, + "balance_loss_mlp": 1.03188348, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.8206372240538196, + "language_loss": 0.7180149, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73924649, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62890625, + "step": 15337, + "time_per_iteration": 2.403857707977295 + }, + { + "auxiliary_loss_clip": 0.01021361, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 1.00117433, + "balance_loss_mlp": 1.00144243, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8770991549438161, + "language_loss": 0.65320015, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67343497, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 15338, + "time_per_iteration": 3.0122439861297607 + }, + { + "auxiliary_loss_clip": 0.01101934, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.01954699, + "balance_loss_mlp": 1.03547251, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 1.611756335253548, + "language_loss": 0.67253053, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69386256, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 15339, + "time_per_iteration": 2.51116681098938 + }, + { + "auxiliary_loss_clip": 0.01099814, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.01939344, + "balance_loss_mlp": 1.0349164, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.823316200451707, + "language_loss": 0.71776712, + "learning_rate": 6.296546872173513e-08, + "loss": 0.73907328, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15340, + "time_per_iteration": 2.497661828994751 + }, + { + "auxiliary_loss_clip": 0.01098023, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.01765251, + "balance_loss_mlp": 1.03384233, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.494128096822042, + "language_loss": 0.70278209, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72404981, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 15341, + "time_per_iteration": 2.558868169784546 + }, + { + "auxiliary_loss_clip": 0.0109525, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.01931548, + "balance_loss_mlp": 1.03327823, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.548602535002695, + "language_loss": 0.67397153, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69522405, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 15342, + "time_per_iteration": 2.5003254413604736 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.02080154, + "balance_loss_mlp": 1.0339365, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 6.095870438208894, + "language_loss": 0.69328499, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71459371, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 15343, + "time_per_iteration": 2.6598129272460938 + }, + { + "auxiliary_loss_clip": 0.01022024, + "auxiliary_loss_mlp": 0.0099989, + "balance_loss_clip": 0.99891895, + "balance_loss_mlp": 1.0018754, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7264917660011667, + "language_loss": 0.51998997, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54020911, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20117188, + "step": 15344, + "time_per_iteration": 3.241743803024292 + }, + { + "auxiliary_loss_clip": 0.01094735, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.01856375, + "balance_loss_mlp": 1.03361905, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.6429535121798804, + "language_loss": 0.70311445, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72435379, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 15345, + "time_per_iteration": 3.881594181060791 + }, + { + "auxiliary_loss_clip": 0.01099254, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.0229665, + "balance_loss_mlp": 1.03364944, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.7542089435944361, + "language_loss": 0.77480382, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79614556, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15346, + "time_per_iteration": 2.4377188682556152 + }, + { + "auxiliary_loss_clip": 0.01105129, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02243412, + "balance_loss_mlp": 1.03608787, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 2.4232440557125776, + "language_loss": 0.75999713, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78139222, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 15347, + "time_per_iteration": 3.9060075283050537 + }, + { + "auxiliary_loss_clip": 0.01095819, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.01887941, + "balance_loss_mlp": 1.03427565, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.4881916639419828, + "language_loss": 0.76720476, + "learning_rate": 6.219217887256367e-08, + "loss": 0.7884568, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6171875, + "step": 15348, + "time_per_iteration": 3.9879612922668457 + }, + { + "auxiliary_loss_clip": 0.01099795, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01900303, + "balance_loss_mlp": 1.03291154, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 1.9357360383703182, + "language_loss": 0.67522502, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69653267, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15349, + "time_per_iteration": 2.4881527423858643 + }, + { + "auxiliary_loss_clip": 0.01098952, + "auxiliary_loss_mlp": 0.01026916, + "balance_loss_clip": 1.01580596, + "balance_loss_mlp": 1.03286695, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.1804574418190135, + "language_loss": 0.86920041, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89045906, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 15350, + "time_per_iteration": 2.4354965686798096 + }, + { + "auxiliary_loss_clip": 0.01022095, + "auxiliary_loss_mlp": 0.00999272, + "balance_loss_clip": 0.9983182, + "balance_loss_mlp": 1.00192451, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7681060822622773, + "language_loss": 0.60345185, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62366551, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20214844, + "step": 15351, + "time_per_iteration": 2.9938981533050537 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.01364326, + "balance_loss_mlp": 1.03202951, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.0623223330512737, + "language_loss": 0.78037506, + "learning_rate": 6.180729739558233e-08, + "loss": 0.80161405, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 15352, + "time_per_iteration": 3.9149723052978516 + }, + { + "auxiliary_loss_clip": 0.01103603, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02197564, + "balance_loss_mlp": 1.03482258, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 2.1804131199600714, + "language_loss": 0.59960139, + "learning_rate": 6.171126075837585e-08, + "loss": 0.62097919, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 15353, + "time_per_iteration": 2.496880531311035 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.01727974, + "balance_loss_mlp": 1.03385043, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.7905758764270645, + "language_loss": 0.74425894, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76551175, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15354, + "time_per_iteration": 2.425142526626587 + }, + { + "auxiliary_loss_clip": 0.0110371, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.01714277, + "balance_loss_mlp": 1.03467274, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 1.9630173318730952, + "language_loss": 0.64785397, + "learning_rate": 6.1519407987912e-08, + "loss": 0.66919398, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69140625, + "step": 15355, + "time_per_iteration": 2.4966373443603516 + }, + { + "auxiliary_loss_clip": 0.01096376, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.02042854, + "balance_loss_mlp": 1.03359032, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.9020455750436218, + "language_loss": 0.74108565, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76236618, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.625, + "step": 15356, + "time_per_iteration": 2.495620012283325 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.02093816, + "balance_loss_mlp": 1.03475368, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 1.7464532837963378, + "language_loss": 0.60978168, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63112092, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 15357, + "time_per_iteration": 2.3971152305603027 + }, + { + "auxiliary_loss_clip": 0.0110148, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.01805329, + "balance_loss_mlp": 1.03382421, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.834426423623626, + "language_loss": 0.69739604, + "learning_rate": 6.123218014662956e-08, + "loss": 0.71870929, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 15358, + "time_per_iteration": 2.5024566650390625 + }, + { + "auxiliary_loss_clip": 0.01099424, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.0182445, + "balance_loss_mlp": 1.03358769, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 2.364215132336142, + "language_loss": 0.73011422, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75140369, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 15359, + "time_per_iteration": 2.4889423847198486 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.01995301, + "balance_loss_mlp": 1.03558564, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 2.101856244679429, + "language_loss": 0.64447194, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66579807, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 15360, + "time_per_iteration": 2.489089250564575 + }, + { + "auxiliary_loss_clip": 0.01021548, + "auxiliary_loss_mlp": 0.00995886, + "balance_loss_clip": 0.99485475, + "balance_loss_mlp": 1.00148213, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7586585804998485, + "language_loss": 0.55154079, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57171512, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 15361, + "time_per_iteration": 3.001129150390625 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.01519871, + "balance_loss_mlp": 1.03305101, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.6389059955723686, + "language_loss": 0.69725895, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71853483, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15362, + "time_per_iteration": 2.4049232006073 + }, + { + "auxiliary_loss_clip": 0.0110233, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.01850414, + "balance_loss_mlp": 1.03439915, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.5321644685395488, + "language_loss": 0.7569198, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77826107, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 15363, + "time_per_iteration": 2.5406601428985596 + }, + { + "auxiliary_loss_clip": 0.01099534, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.01585019, + "balance_loss_mlp": 1.03406906, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 2.197001335564612, + "language_loss": 0.83133066, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85259789, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15364, + "time_per_iteration": 2.4756619930267334 + }, + { + "auxiliary_loss_clip": 0.01097664, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.01561093, + "balance_loss_mlp": 1.03372884, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.5348322828842351, + "language_loss": 0.67962128, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70086157, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15365, + "time_per_iteration": 2.497309684753418 + }, + { + "auxiliary_loss_clip": 0.01100931, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.01817071, + "balance_loss_mlp": 1.03535795, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.194169448976208, + "language_loss": 0.62673676, + "learning_rate": 6.046947430586913e-08, + "loss": 0.64804745, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15366, + "time_per_iteration": 2.5063443183898926 + }, + { + "auxiliary_loss_clip": 0.01099789, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.01484501, + "balance_loss_mlp": 1.03536332, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.4208418043509794, + "language_loss": 0.74381047, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76507783, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 15367, + "time_per_iteration": 2.4719345569610596 + }, + { + "auxiliary_loss_clip": 0.01094974, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.02102005, + "balance_loss_mlp": 1.03345788, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 2.0793504009917445, + "language_loss": 0.64489555, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66615844, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.61328125, + "step": 15368, + "time_per_iteration": 2.4641342163085938 + }, + { + "auxiliary_loss_clip": 0.01102929, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01631057, + "balance_loss_mlp": 1.03453827, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 2.4636100553277895, + "language_loss": 0.74815971, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76947117, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 15369, + "time_per_iteration": 2.4689323902130127 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.02204156, + "balance_loss_mlp": 1.03590214, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 1.8685080548034616, + "language_loss": 0.76351935, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78491282, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15370, + "time_per_iteration": 2.422639846801758 + }, + { + "auxiliary_loss_clip": 0.01099737, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.02084386, + "balance_loss_mlp": 1.03397942, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.4404674499916803, + "language_loss": 0.67358434, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69490314, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15371, + "time_per_iteration": 2.4798216819763184 + }, + { + "auxiliary_loss_clip": 0.01021681, + "auxiliary_loss_mlp": 0.01000874, + "balance_loss_clip": 0.99990863, + "balance_loss_mlp": 1.00162053, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7221153992887761, + "language_loss": 0.57649028, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59671581, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 15372, + "time_per_iteration": 3.0151007175445557 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.01962721, + "balance_loss_mlp": 1.03601837, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 2.471421891520512, + "language_loss": 0.69785196, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71914893, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15373, + "time_per_iteration": 2.436583995819092 + }, + { + "auxiliary_loss_clip": 0.01099684, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02204728, + "balance_loss_mlp": 1.03429437, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.1623197384255404, + "language_loss": 0.75304061, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77437317, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 15374, + "time_per_iteration": 2.415738582611084 + }, + { + "auxiliary_loss_clip": 0.01101561, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.01978934, + "balance_loss_mlp": 1.03546357, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 2.3489424736028974, + "language_loss": 0.64875305, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67007864, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 15375, + "time_per_iteration": 2.4479691982269287 + }, + { + "auxiliary_loss_clip": 0.01099359, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.02046835, + "balance_loss_mlp": 1.03515816, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.7783255869670582, + "language_loss": 0.66906196, + "learning_rate": 5.952271146669829e-08, + "loss": 0.69037414, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 15376, + "time_per_iteration": 2.4910011291503906 + }, + { + "auxiliary_loss_clip": 0.010219, + "auxiliary_loss_mlp": 0.01001278, + "balance_loss_clip": 1.00028849, + "balance_loss_mlp": 1.00179458, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6519597025269294, + "language_loss": 0.61160791, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63183969, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 15377, + "time_per_iteration": 3.057742118835449 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.02223337, + "balance_loss_mlp": 1.03419769, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.8725219959605253, + "language_loss": 0.73735809, + "learning_rate": 5.933424178131341e-08, + "loss": 0.75868452, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15378, + "time_per_iteration": 2.425985097885132 + }, + { + "auxiliary_loss_clip": 0.011013, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01888347, + "balance_loss_mlp": 1.03506637, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 2.9917383599465364, + "language_loss": 0.62278056, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64410132, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 15379, + "time_per_iteration": 2.557879686355591 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.0102624, + "balance_loss_clip": 1.01468313, + "balance_loss_mlp": 1.03472924, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.134540215346882, + "language_loss": 0.83972025, + "learning_rate": 5.914606645688591e-08, + "loss": 0.86096483, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 15380, + "time_per_iteration": 2.4178035259246826 + }, + { + "auxiliary_loss_clip": 0.01101277, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.02084041, + "balance_loss_mlp": 1.03352189, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.4769589190868633, + "language_loss": 0.73472691, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75607318, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 15381, + "time_per_iteration": 2.455674171447754 + }, + { + "auxiliary_loss_clip": 0.01100221, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.01783454, + "balance_loss_mlp": 1.03460169, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.6990719448021085, + "language_loss": 0.78354275, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80483425, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15382, + "time_per_iteration": 2.4381918907165527 + }, + { + "auxiliary_loss_clip": 0.01099044, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.01986253, + "balance_loss_mlp": 1.03293002, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.7957184237375154, + "language_loss": 0.74939609, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77070516, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 15383, + "time_per_iteration": 2.4508137702941895 + }, + { + "auxiliary_loss_clip": 0.01095389, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.01564097, + "balance_loss_mlp": 1.0316422, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.7969002247576855, + "language_loss": 0.75541508, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77663815, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 15384, + "time_per_iteration": 2.506045341491699 + }, + { + "auxiliary_loss_clip": 0.01095577, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01920581, + "balance_loss_mlp": 1.03235722, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.8321722340960027, + "language_loss": 0.66197598, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.6832391, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 15385, + "time_per_iteration": 2.413760185241699 + }, + { + "auxiliary_loss_clip": 0.01097285, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.01847816, + "balance_loss_mlp": 1.03289402, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 1.8037603173155325, + "language_loss": 0.80537152, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82663649, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 15386, + "time_per_iteration": 2.4500980377197266 + }, + { + "auxiliary_loss_clip": 0.01098949, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.02584577, + "balance_loss_mlp": 1.03501868, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.5966888283815128, + "language_loss": 0.75251609, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.7738837, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 15387, + "time_per_iteration": 3.821263551712036 + }, + { + "auxiliary_loss_clip": 0.01096172, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.02039409, + "balance_loss_mlp": 1.03313661, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.5110299147108328, + "language_loss": 0.70130134, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72257483, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 15388, + "time_per_iteration": 3.967625141143799 + }, + { + "auxiliary_loss_clip": 0.01100941, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.01554775, + "balance_loss_mlp": 1.03401148, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.703661518442703, + "language_loss": 0.818995, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84027576, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 15389, + "time_per_iteration": 3.870365858078003 + }, + { + "auxiliary_loss_clip": 0.01107938, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.01763237, + "balance_loss_mlp": 1.03723955, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.8133156840491251, + "language_loss": 0.7921918, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81357348, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 15390, + "time_per_iteration": 2.43095064163208 + }, + { + "auxiliary_loss_clip": 0.01101708, + "auxiliary_loss_mlp": 0.01039397, + "balance_loss_clip": 1.02741694, + "balance_loss_mlp": 1.03465974, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.7286772201518952, + "language_loss": 0.75258297, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77399403, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15391, + "time_per_iteration": 2.41867995262146 + }, + { + "auxiliary_loss_clip": 0.01098225, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.01740921, + "balance_loss_mlp": 1.03291667, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.128188979527296, + "language_loss": 0.52005279, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54133677, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65234375, + "step": 15392, + "time_per_iteration": 2.5652174949645996 + }, + { + "auxiliary_loss_clip": 0.01097761, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.01597524, + "balance_loss_mlp": 1.03198981, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.964965624188704, + "language_loss": 0.77008653, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79133701, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15393, + "time_per_iteration": 3.954613447189331 + }, + { + "auxiliary_loss_clip": 0.01096999, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01977193, + "balance_loss_mlp": 1.03259718, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.7627839503493286, + "language_loss": 0.69385219, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71513402, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 15394, + "time_per_iteration": 2.4000730514526367 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.01555324, + "balance_loss_mlp": 1.03456926, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.7473116250665182, + "language_loss": 0.72601545, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.74728626, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 15395, + "time_per_iteration": 2.468665361404419 + }, + { + "auxiliary_loss_clip": 0.01094627, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.01909065, + "balance_loss_mlp": 1.03195882, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 2.211482629648121, + "language_loss": 0.71316254, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73440462, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 15396, + "time_per_iteration": 2.437075614929199 + }, + { + "auxiliary_loss_clip": 0.01097691, + "auxiliary_loss_mlp": 0.01025515, + "balance_loss_clip": 1.0139879, + "balance_loss_mlp": 1.03351045, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.5907781681745499, + "language_loss": 0.8724966, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89372873, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15397, + "time_per_iteration": 2.497161865234375 + }, + { + "auxiliary_loss_clip": 0.01021793, + "auxiliary_loss_mlp": 0.01004483, + "balance_loss_clip": 1.00352311, + "balance_loss_mlp": 1.00168765, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.821738619907706, + "language_loss": 0.55149096, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57175368, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 15398, + "time_per_iteration": 2.942495107650757 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01027248, + "balance_loss_clip": 1.0141114, + "balance_loss_mlp": 1.0344249, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.9427050016588183, + "language_loss": 0.75920027, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78051281, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 15399, + "time_per_iteration": 2.491990804672241 + }, + { + "auxiliary_loss_clip": 0.01093745, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.01742709, + "balance_loss_mlp": 1.0315423, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.5984703912789273, + "language_loss": 0.78239942, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80361474, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 15400, + "time_per_iteration": 2.4736785888671875 + }, + { + "auxiliary_loss_clip": 0.01021709, + "auxiliary_loss_mlp": 0.00998028, + "balance_loss_clip": 0.99707454, + "balance_loss_mlp": 1.00160527, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.731962220549327, + "language_loss": 0.51344585, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53364325, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15401, + "time_per_iteration": 3.0090858936309814 + }, + { + "auxiliary_loss_clip": 0.01096088, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.02058387, + "balance_loss_mlp": 1.03388858, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.6172522790152049, + "language_loss": 0.8218559, + "learning_rate": 5.709557384259378e-08, + "loss": 0.8431313, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62109375, + "step": 15402, + "time_per_iteration": 2.4477603435516357 + }, + { + "auxiliary_loss_clip": 0.01021801, + "auxiliary_loss_mlp": 0.01002843, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.0017004, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7718721766171598, + "language_loss": 0.5109669, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53121334, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 15403, + "time_per_iteration": 3.1244301795959473 + }, + { + "auxiliary_loss_clip": 0.01021692, + "auxiliary_loss_mlp": 0.00998434, + "balance_loss_clip": 0.99748039, + "balance_loss_mlp": 1.00152445, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6837569550075934, + "language_loss": 0.58685899, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60706019, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15404, + "time_per_iteration": 3.094059705734253 + }, + { + "auxiliary_loss_clip": 0.01100562, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01923847, + "balance_loss_mlp": 1.03395927, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 1.9953628404049057, + "language_loss": 0.71774006, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73905957, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15405, + "time_per_iteration": 2.4240143299102783 + }, + { + "auxiliary_loss_clip": 0.01102412, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02343011, + "balance_loss_mlp": 1.03474975, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.7576607341023662, + "language_loss": 0.68750131, + "learning_rate": 5.672658701232458e-08, + "loss": 0.7088837, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 15406, + "time_per_iteration": 2.466527223587036 + }, + { + "auxiliary_loss_clip": 0.01101722, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.0222826, + "balance_loss_mlp": 1.03555775, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.7752973629401856, + "language_loss": 0.76403785, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78540385, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66015625, + "step": 15407, + "time_per_iteration": 2.4367871284484863 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.02655041, + "balance_loss_mlp": 1.03313541, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 3.383099092597422, + "language_loss": 0.72512782, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74654853, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 15408, + "time_per_iteration": 2.430126667022705 + }, + { + "auxiliary_loss_clip": 0.01098119, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.01819086, + "balance_loss_mlp": 1.03438425, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.697930797649794, + "language_loss": 0.68514466, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70640695, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.63671875, + "step": 15409, + "time_per_iteration": 2.662263870239258 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.01756573, + "balance_loss_mlp": 1.03554535, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 2.2641730930101724, + "language_loss": 0.75665075, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77796578, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 15410, + "time_per_iteration": 2.495643138885498 + }, + { + "auxiliary_loss_clip": 0.01099727, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.01559091, + "balance_loss_mlp": 1.0341475, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.541705061344522, + "language_loss": 0.82224798, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84351254, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15411, + "time_per_iteration": 2.437490940093994 + }, + { + "auxiliary_loss_clip": 0.01104354, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.02319741, + "balance_loss_mlp": 1.03806257, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 2.409965861262806, + "language_loss": 0.75620615, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77759552, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 15412, + "time_per_iteration": 2.4860680103302 + }, + { + "auxiliary_loss_clip": 0.01096944, + "auxiliary_loss_mlp": 0.01028884, + "balance_loss_clip": 1.01736248, + "balance_loss_mlp": 1.03194141, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 2.743723110858746, + "language_loss": 0.66987592, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69113421, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 15413, + "time_per_iteration": 2.5412187576293945 + }, + { + "auxiliary_loss_clip": 0.01100923, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.01838839, + "balance_loss_mlp": 1.03374481, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.8837967229167019, + "language_loss": 0.76128107, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78259474, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 15414, + "time_per_iteration": 2.435417413711548 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.01819217, + "balance_loss_mlp": 1.03404093, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 1.974785668209935, + "language_loss": 0.8150264, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83630508, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 15415, + "time_per_iteration": 2.436947822570801 + }, + { + "auxiliary_loss_clip": 0.01100241, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.02014947, + "balance_loss_mlp": 1.03393376, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.4430461922371247, + "language_loss": 0.54157484, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56289732, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 15416, + "time_per_iteration": 2.482398509979248 + }, + { + "auxiliary_loss_clip": 0.0109764, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.01903081, + "balance_loss_mlp": 1.03377366, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.575334838653751, + "language_loss": 0.72061193, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74188906, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15417, + "time_per_iteration": 2.421722173690796 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.01774013, + "balance_loss_mlp": 1.03482032, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 2.1097210915525206, + "language_loss": 0.75657284, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77786595, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 15418, + "time_per_iteration": 2.4550986289978027 + }, + { + "auxiliary_loss_clip": 0.01097568, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.0163269, + "balance_loss_mlp": 1.03324318, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.4450402960819761, + "language_loss": 0.76005769, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78131491, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 15419, + "time_per_iteration": 2.5159225463867188 + }, + { + "auxiliary_loss_clip": 0.01094814, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.02189064, + "balance_loss_mlp": 1.03209281, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 2.0477489170526586, + "language_loss": 0.75719529, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.77846634, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62890625, + "step": 15420, + "time_per_iteration": 2.5263941287994385 + }, + { + "auxiliary_loss_clip": 0.01102072, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.01880121, + "balance_loss_mlp": 1.03423715, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.459013416866959, + "language_loss": 0.76789546, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78922474, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 15421, + "time_per_iteration": 2.4908981323242188 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.01896989, + "balance_loss_mlp": 1.0345453, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 1.9925189154946077, + "language_loss": 0.7272985, + "learning_rate": 5.526243217829041e-08, + "loss": 0.7485981, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 15422, + "time_per_iteration": 2.504379987716675 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02169275, + "balance_loss_mlp": 1.03490949, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.894268448401904, + "language_loss": 0.77302563, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79439163, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 15423, + "time_per_iteration": 2.4350104331970215 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.01604676, + "balance_loss_mlp": 1.03420961, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 1.646588555304309, + "language_loss": 0.75237334, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77366608, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15424, + "time_per_iteration": 2.527062177658081 + }, + { + "auxiliary_loss_clip": 0.01021636, + "auxiliary_loss_mlp": 0.01000835, + "balance_loss_clip": 0.99980974, + "balance_loss_mlp": 1.00167572, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7788753343598831, + "language_loss": 0.60629737, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62652206, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.19921875, + "step": 15425, + "time_per_iteration": 2.8316895961761475 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.02046478, + "balance_loss_mlp": 1.03510499, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 4.846350561223134, + "language_loss": 0.70709521, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72842896, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 15426, + "time_per_iteration": 2.533698797225952 + }, + { + "auxiliary_loss_clip": 0.01099514, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.01384544, + "balance_loss_mlp": 1.0349983, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.7589746620636957, + "language_loss": 0.82876408, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85000449, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.64453125, + "step": 15427, + "time_per_iteration": 2.4352564811706543 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.03641236, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.5057993286553948, + "language_loss": 0.76877588, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79014242, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15428, + "time_per_iteration": 3.8693413734436035 + }, + { + "auxiliary_loss_clip": 0.01096742, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.02022672, + "balance_loss_mlp": 1.03172433, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 2.0403868846760447, + "language_loss": 0.74666828, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.7679562, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15429, + "time_per_iteration": 2.4468398094177246 + }, + { + "auxiliary_loss_clip": 0.01097637, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.02094138, + "balance_loss_mlp": 1.03397834, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 1.7045835703544736, + "language_loss": 0.74889922, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77019835, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 15430, + "time_per_iteration": 3.9206631183624268 + }, + { + "auxiliary_loss_clip": 0.01098985, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.01789057, + "balance_loss_mlp": 1.03349423, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.3411362668102724, + "language_loss": 0.76195765, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78325427, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65625, + "step": 15431, + "time_per_iteration": 3.985266923904419 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01024488, + "balance_loss_clip": 1.01342511, + "balance_loss_mlp": 1.03424621, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.6017729259542908, + "language_loss": 0.70828962, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72951329, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 15432, + "time_per_iteration": 2.4481770992279053 + }, + { + "auxiliary_loss_clip": 0.01101221, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.01845121, + "balance_loss_mlp": 1.03413773, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.1590523070587095, + "language_loss": 0.82312065, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84443253, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 15433, + "time_per_iteration": 2.3907063007354736 + }, + { + "auxiliary_loss_clip": 0.01096167, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.01522768, + "balance_loss_mlp": 1.03443766, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.7403608716892394, + "language_loss": 0.66221195, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68342805, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.6171875, + "step": 15434, + "time_per_iteration": 2.459033250808716 + }, + { + "auxiliary_loss_clip": 0.01093673, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.01683831, + "balance_loss_mlp": 1.03230667, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.8005736913066748, + "language_loss": 0.6873616, + "learning_rate": 5.40867065815529e-08, + "loss": 0.70857459, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.61328125, + "step": 15435, + "time_per_iteration": 3.9342024326324463 + }, + { + "auxiliary_loss_clip": 0.01099245, + "auxiliary_loss_mlp": 0.01027343, + "balance_loss_clip": 1.01565433, + "balance_loss_mlp": 1.03373933, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 2.020400510529268, + "language_loss": 0.72055352, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74181938, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 15436, + "time_per_iteration": 2.405715227127075 + }, + { + "auxiliary_loss_clip": 0.01098664, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.0170064, + "balance_loss_mlp": 1.03391147, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.091605034726952, + "language_loss": 0.67294556, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69421792, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15437, + "time_per_iteration": 2.4648404121398926 + }, + { + "auxiliary_loss_clip": 0.01102898, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01868415, + "balance_loss_mlp": 1.03452563, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.0966698400336896, + "language_loss": 0.71116936, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73250937, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15438, + "time_per_iteration": 2.413299798965454 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01855445, + "balance_loss_mlp": 1.03494883, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.7797926756037903, + "language_loss": 0.64633286, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66763484, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 15439, + "time_per_iteration": 2.461437225341797 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.01593935, + "balance_loss_mlp": 1.03433597, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 2.3253204491475885, + "language_loss": 0.7027396, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7240051, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15440, + "time_per_iteration": 2.4703500270843506 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02361774, + "balance_loss_mlp": 1.03502834, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.612587753570518, + "language_loss": 0.76855183, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78993171, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 15441, + "time_per_iteration": 2.527451515197754 + }, + { + "auxiliary_loss_clip": 0.01094431, + "auxiliary_loss_mlp": 0.01026501, + "balance_loss_clip": 1.01657677, + "balance_loss_mlp": 1.03307748, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.90495745755495, + "language_loss": 0.64267159, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66388088, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.61328125, + "step": 15442, + "time_per_iteration": 2.4639720916748047 + }, + { + "auxiliary_loss_clip": 0.01102164, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.02279592, + "balance_loss_mlp": 1.03435051, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.7797907692602184, + "language_loss": 0.80536753, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.8267343, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 15443, + "time_per_iteration": 2.427996873855591 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.03426063, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.8653829625255551, + "language_loss": 0.65230483, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67359507, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15444, + "time_per_iteration": 2.46578049659729 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.01768517, + "balance_loss_mlp": 1.03293288, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 1.8271492259739264, + "language_loss": 0.73367989, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75495601, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15445, + "time_per_iteration": 2.4109835624694824 + }, + { + "auxiliary_loss_clip": 0.0110117, + "auxiliary_loss_mlp": 0.01033091, + "balance_loss_clip": 1.02127099, + "balance_loss_mlp": 1.03590298, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.5979727585178083, + "language_loss": 0.71089745, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73224002, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15446, + "time_per_iteration": 2.462587594985962 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.01028508, + "balance_loss_clip": 1.01671267, + "balance_loss_mlp": 1.03485143, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.9850834781721192, + "language_loss": 0.69447434, + "learning_rate": 5.301248962337523e-08, + "loss": 0.7157889, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 15447, + "time_per_iteration": 2.427091598510742 + }, + { + "auxiliary_loss_clip": 0.01093107, + "auxiliary_loss_mlp": 0.0102558, + "balance_loss_clip": 1.01545882, + "balance_loss_mlp": 1.03282893, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.6048988598173843, + "language_loss": 0.72284281, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74402964, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6015625, + "step": 15448, + "time_per_iteration": 2.4651074409484863 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.01506054, + "balance_loss_mlp": 1.03431988, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.5859141437991187, + "language_loss": 0.73905832, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76033032, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 15449, + "time_per_iteration": 2.4365196228027344 + }, + { + "auxiliary_loss_clip": 0.0109955, + "auxiliary_loss_mlp": 0.01025285, + "balance_loss_clip": 1.01344728, + "balance_loss_mlp": 1.03404522, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 2.156577692440534, + "language_loss": 0.67555118, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69679958, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 15450, + "time_per_iteration": 2.4997141361236572 + }, + { + "auxiliary_loss_clip": 0.01099302, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.01307964, + "balance_loss_mlp": 1.03357685, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.1026570979427026, + "language_loss": 0.72319663, + "learning_rate": 5.265677957368875e-08, + "loss": 0.7444374, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15451, + "time_per_iteration": 2.4257543087005615 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.02285671, + "balance_loss_mlp": 1.03431022, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 1.9644951555843875, + "language_loss": 0.73315656, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75450063, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 15452, + "time_per_iteration": 2.404226064682007 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01583314, + "balance_loss_mlp": 1.0372479, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.8050090440464128, + "language_loss": 0.74203956, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76333141, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15453, + "time_per_iteration": 2.4306046962738037 + }, + { + "auxiliary_loss_clip": 0.01021773, + "auxiliary_loss_mlp": 0.01002626, + "balance_loss_clip": 1.00164855, + "balance_loss_mlp": 1.00152075, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8231858561820261, + "language_loss": 0.60632885, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62657285, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 15454, + "time_per_iteration": 2.995863437652588 + }, + { + "auxiliary_loss_clip": 0.01099994, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.01989484, + "balance_loss_mlp": 1.03345144, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.4950460872620022, + "language_loss": 0.68971264, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71102631, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15455, + "time_per_iteration": 2.4303104877471924 + }, + { + "auxiliary_loss_clip": 0.01101049, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.01793849, + "balance_loss_mlp": 1.03573239, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.7032776336080993, + "language_loss": 0.64673263, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66804117, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 15456, + "time_per_iteration": 2.4663398265838623 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.01669145, + "balance_loss_mlp": 1.03616011, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 1.6544080428744494, + "language_loss": 0.68180311, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70309204, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.640625, + "step": 15457, + "time_per_iteration": 2.387383460998535 + }, + { + "auxiliary_loss_clip": 0.0109956, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.01436925, + "balance_loss_mlp": 1.03264999, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 1.9914999600759236, + "language_loss": 0.80684668, + "learning_rate": 5.203713008885291e-08, + "loss": 0.82809794, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 15458, + "time_per_iteration": 2.411698341369629 + }, + { + "auxiliary_loss_clip": 0.01100132, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.02089047, + "balance_loss_mlp": 1.03419471, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.567399109434874, + "language_loss": 0.72272772, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74405348, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15459, + "time_per_iteration": 2.450777769088745 + }, + { + "auxiliary_loss_clip": 0.01101616, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01775837, + "balance_loss_mlp": 1.03589296, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 3.1276525868113665, + "language_loss": 0.58476692, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60607052, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 15460, + "time_per_iteration": 2.406024932861328 + }, + { + "auxiliary_loss_clip": 0.01102163, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.01987743, + "balance_loss_mlp": 1.03494763, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 2.5998175218554778, + "language_loss": 0.8040331, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82537508, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15461, + "time_per_iteration": 2.478937864303589 + }, + { + "auxiliary_loss_clip": 0.01095702, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.01769996, + "balance_loss_mlp": 1.03278279, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 2.0121095429582807, + "language_loss": 0.78226018, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80350912, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62890625, + "step": 15462, + "time_per_iteration": 2.471994400024414 + }, + { + "auxiliary_loss_clip": 0.01098138, + "auxiliary_loss_mlp": 0.01023728, + "balance_loss_clip": 1.01270103, + "balance_loss_mlp": 1.03229225, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.846715465327114, + "language_loss": 0.62358242, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64480114, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15463, + "time_per_iteration": 2.543200969696045 + }, + { + "auxiliary_loss_clip": 0.01096034, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01537442, + "balance_loss_mlp": 1.03193223, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.4319692146419465, + "language_loss": 0.70946103, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73068112, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.640625, + "step": 15464, + "time_per_iteration": 2.557159185409546 + }, + { + "auxiliary_loss_clip": 0.01098841, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.01610518, + "balance_loss_mlp": 1.0332737, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 2.308666209262228, + "language_loss": 0.77049506, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79175085, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 15465, + "time_per_iteration": 2.412461280822754 + }, + { + "auxiliary_loss_clip": 0.01021493, + "auxiliary_loss_mlp": 0.00998557, + "balance_loss_clip": 0.99757355, + "balance_loss_mlp": 1.00142288, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6915312931850184, + "language_loss": 0.56440043, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58460093, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 15466, + "time_per_iteration": 3.1553313732147217 + }, + { + "auxiliary_loss_clip": 0.0109893, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.03270507, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.430282713051718, + "language_loss": 0.72837657, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.74974477, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 15467, + "time_per_iteration": 2.465402603149414 + }, + { + "auxiliary_loss_clip": 0.01101047, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.02004814, + "balance_loss_mlp": 1.034567, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.6525069967751043, + "language_loss": 0.7171756, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73850441, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15468, + "time_per_iteration": 2.467027187347412 + }, + { + "auxiliary_loss_clip": 0.01100943, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.0181818, + "balance_loss_mlp": 1.03316689, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.6614618348928094, + "language_loss": 0.7563262, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77763259, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 15469, + "time_per_iteration": 2.4551570415496826 + }, + { + "auxiliary_loss_clip": 0.0109919, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.01893151, + "balance_loss_mlp": 1.03328323, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 3.117567963702495, + "language_loss": 0.75602072, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77731776, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15470, + "time_per_iteration": 3.9293041229248047 + }, + { + "auxiliary_loss_clip": 0.01098686, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.0168792, + "balance_loss_mlp": 1.03350711, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.4761653609508787, + "language_loss": 0.7473954, + "learning_rate": 5.089595604367902e-08, + "loss": 0.7686609, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65234375, + "step": 15471, + "time_per_iteration": 2.435100793838501 + }, + { + "auxiliary_loss_clip": 0.01098709, + "auxiliary_loss_mlp": 0.01027854, + "balance_loss_clip": 1.01604009, + "balance_loss_mlp": 1.03407836, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.1879069429390006, + "language_loss": 0.69004017, + "learning_rate": 5.080869070341487e-08, + "loss": 0.7113058, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15472, + "time_per_iteration": 3.8225207328796387 + }, + { + "auxiliary_loss_clip": 0.01094581, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.01865005, + "balance_loss_mlp": 1.03333116, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.609251941802182, + "language_loss": 0.88353068, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.9047699, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.61328125, + "step": 15473, + "time_per_iteration": 3.829770088195801 + }, + { + "auxiliary_loss_clip": 0.01104013, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.02066016, + "balance_loss_mlp": 1.03545713, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.8613424502001032, + "language_loss": 0.64229345, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66367269, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 15474, + "time_per_iteration": 2.430070161819458 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.0180459, + "balance_loss_mlp": 1.03443456, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.634183098682429, + "language_loss": 0.7463553, + "learning_rate": 5.054733817702339e-08, + "loss": 0.76764882, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 15475, + "time_per_iteration": 2.433985948562622 + }, + { + "auxiliary_loss_clip": 0.01097957, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.0143497, + "balance_loss_mlp": 1.03309751, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.010592371284976, + "language_loss": 0.66876173, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68999112, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 15476, + "time_per_iteration": 2.505357503890991 + }, + { + "auxiliary_loss_clip": 0.01102035, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.02085757, + "balance_loss_mlp": 1.03647363, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.173665813572123, + "language_loss": 0.68965471, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71100122, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15477, + "time_per_iteration": 3.904513120651245 + }, + { + "auxiliary_loss_clip": 0.01098178, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.01687384, + "balance_loss_mlp": 1.03475642, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.8851394448643317, + "language_loss": 0.58472347, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60598183, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 15478, + "time_per_iteration": 2.4862332344055176 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01574802, + "balance_loss_mlp": 1.03474784, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 4.322822362251628, + "language_loss": 0.79143488, + "learning_rate": 5.01999030853566e-08, + "loss": 0.81276453, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 15479, + "time_per_iteration": 2.3997929096221924 + }, + { + "auxiliary_loss_clip": 0.01099209, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.02073741, + "balance_loss_mlp": 1.03325725, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 2.4723899654075554, + "language_loss": 0.68572581, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70703846, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 15480, + "time_per_iteration": 2.566641092300415 + }, + { + "auxiliary_loss_clip": 0.01099075, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.01816869, + "balance_loss_mlp": 1.03372073, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.69653427681413, + "language_loss": 0.67943531, + "learning_rate": 5.002662914604583e-08, + "loss": 0.7007221, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15481, + "time_per_iteration": 2.4178357124328613 + }, + { + "auxiliary_loss_clip": 0.01096176, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.01472914, + "balance_loss_mlp": 1.03221035, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 2.0018171339857744, + "language_loss": 0.74707091, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76828778, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 15482, + "time_per_iteration": 2.46037220954895 + }, + { + "auxiliary_loss_clip": 0.01094997, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.0167743, + "balance_loss_mlp": 1.03203559, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.7837659675322086, + "language_loss": 0.79909325, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82032025, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 15483, + "time_per_iteration": 2.424943208694458 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.01651073, + "balance_loss_mlp": 1.03387845, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 2.0304206547366777, + "language_loss": 0.74465203, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76591992, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 15484, + "time_per_iteration": 2.427067518234253 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01819587, + "balance_loss_mlp": 1.03494906, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.305608491408132, + "language_loss": 0.76315653, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78448498, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 15485, + "time_per_iteration": 2.3986244201660156 + }, + { + "auxiliary_loss_clip": 0.01100485, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.01677799, + "balance_loss_mlp": 1.03375554, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.7799612714984643, + "language_loss": 0.7810412, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80233711, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 15486, + "time_per_iteration": 2.4667983055114746 + }, + { + "auxiliary_loss_clip": 0.01104748, + "auxiliary_loss_mlp": 0.01029377, + "balance_loss_clip": 1.01735497, + "balance_loss_mlp": 1.03717089, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 2.004478258444932, + "language_loss": 0.77159125, + "learning_rate": 4.950858206945674e-08, + "loss": 0.79293251, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 15487, + "time_per_iteration": 2.446272134780884 + }, + { + "auxiliary_loss_clip": 0.01099239, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01362669, + "balance_loss_mlp": 1.03425145, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.746205052123672, + "language_loss": 0.66514063, + "learning_rate": 4.942249974085633e-08, + "loss": 0.68639356, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6484375, + "step": 15488, + "time_per_iteration": 2.6267404556274414 + }, + { + "auxiliary_loss_clip": 0.01095561, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.01797533, + "balance_loss_mlp": 1.03325832, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.8384174962011377, + "language_loss": 0.74991691, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77116984, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.625, + "step": 15489, + "time_per_iteration": 2.465559959411621 + }, + { + "auxiliary_loss_clip": 0.01100415, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01684833, + "balance_loss_mlp": 1.0337944, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 2.563733081982058, + "language_loss": 0.80619878, + "learning_rate": 4.925055698519931e-08, + "loss": 0.82749051, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 15490, + "time_per_iteration": 2.4163243770599365 + }, + { + "auxiliary_loss_clip": 0.01101263, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.02013838, + "balance_loss_mlp": 1.03481793, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 1.5785562831132516, + "language_loss": 0.72108269, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.7424202, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 15491, + "time_per_iteration": 2.4315860271453857 + }, + { + "auxiliary_loss_clip": 0.01095636, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.03252959, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.7853730669825214, + "language_loss": 0.74627632, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76750547, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15492, + "time_per_iteration": 2.512267827987671 + }, + { + "auxiliary_loss_clip": 0.01021891, + "auxiliary_loss_mlp": 0.00998959, + "balance_loss_clip": 0.99805337, + "balance_loss_mlp": 1.0018034, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.712039300089528, + "language_loss": 0.53438187, + "learning_rate": 4.899319765445442e-08, + "loss": 0.5545904, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20117188, + "step": 15493, + "time_per_iteration": 2.916949510574341 + }, + { + "auxiliary_loss_clip": 0.01098383, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.01680434, + "balance_loss_mlp": 1.03383327, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.6838927054928123, + "language_loss": 0.71094936, + "learning_rate": 4.890755917128531e-08, + "loss": 0.73220974, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 15494, + "time_per_iteration": 2.4050588607788086 + }, + { + "auxiliary_loss_clip": 0.01100667, + "auxiliary_loss_mlp": 0.01024887, + "balance_loss_clip": 1.01355577, + "balance_loss_mlp": 1.03352082, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.881542723418203, + "language_loss": 0.68522328, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70647883, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 15495, + "time_per_iteration": 2.4895150661468506 + }, + { + "auxiliary_loss_clip": 0.01095117, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.02116537, + "balance_loss_mlp": 1.03182006, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.762424325452625, + "language_loss": 0.61511773, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63638532, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 15496, + "time_per_iteration": 2.494763135910034 + }, + { + "auxiliary_loss_clip": 0.01099639, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.03399265, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.5515145682874567, + "language_loss": 0.77042872, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79173243, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15497, + "time_per_iteration": 2.5357086658477783 + }, + { + "auxiliary_loss_clip": 0.01102796, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.02090013, + "balance_loss_mlp": 1.03550088, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.9526034329415265, + "language_loss": 0.66362846, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68498641, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15498, + "time_per_iteration": 2.467374563217163 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.03501487, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.7610743502537445, + "language_loss": 0.79906923, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82044148, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 15499, + "time_per_iteration": 2.436098337173462 + }, + { + "auxiliary_loss_clip": 0.01098432, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.0203414, + "balance_loss_mlp": 1.03484273, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.812161869986891, + "language_loss": 0.76557505, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.78688157, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.63671875, + "step": 15500, + "time_per_iteration": 2.46466064453125 + }, + { + "auxiliary_loss_clip": 0.01095242, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01396239, + "balance_loss_mlp": 1.03133726, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.6727103835965809, + "language_loss": 0.72225916, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74346447, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 15501, + "time_per_iteration": 2.4670472145080566 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.0345726, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.9501321316828164, + "language_loss": 0.6632303, + "learning_rate": 4.822511506047666e-08, + "loss": 0.6845485, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 15502, + "time_per_iteration": 2.4331064224243164 + }, + { + "auxiliary_loss_clip": 0.011008, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.02094793, + "balance_loss_mlp": 1.03379011, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.4373867863425007, + "language_loss": 0.65522575, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67655671, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 15503, + "time_per_iteration": 2.4875681400299072 + }, + { + "auxiliary_loss_clip": 0.01100687, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01885748, + "balance_loss_mlp": 1.03377271, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.4699248091106074, + "language_loss": 0.74906504, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77037811, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 15504, + "time_per_iteration": 2.4550881385803223 + }, + { + "auxiliary_loss_clip": 0.01101391, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.0146023, + "balance_loss_mlp": 1.03532541, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 1.9858740519405689, + "language_loss": 0.71027422, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73156059, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 15505, + "time_per_iteration": 2.471879482269287 + }, + { + "auxiliary_loss_clip": 0.01100408, + "auxiliary_loss_mlp": 0.01029155, + "balance_loss_clip": 1.0168885, + "balance_loss_mlp": 1.03499889, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 1.9200486869690463, + "language_loss": 0.75246066, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77375627, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 15506, + "time_per_iteration": 2.412144660949707 + }, + { + "auxiliary_loss_clip": 0.01097297, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.01296115, + "balance_loss_mlp": 1.03512335, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.9532236160910172, + "language_loss": 0.83267069, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85388005, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 15507, + "time_per_iteration": 2.4542086124420166 + }, + { + "auxiliary_loss_clip": 0.01099933, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.01634693, + "balance_loss_mlp": 1.03375268, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.4480770625048591, + "language_loss": 0.67718458, + "learning_rate": 4.771639036957742e-08, + "loss": 0.69846487, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 15508, + "time_per_iteration": 2.483059883117676 + }, + { + "auxiliary_loss_clip": 0.01097823, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.01831305, + "balance_loss_mlp": 1.03426003, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.6151722837664564, + "language_loss": 0.71979308, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.7410723, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.63671875, + "step": 15509, + "time_per_iteration": 2.444472551345825 + }, + { + "auxiliary_loss_clip": 0.01099809, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.01989579, + "balance_loss_mlp": 1.03414321, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 1.7453769402729238, + "language_loss": 0.74520022, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76651001, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15510, + "time_per_iteration": 2.4823153018951416 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01026225, + "balance_loss_clip": 1.01489401, + "balance_loss_mlp": 1.03300691, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.518393059285664, + "language_loss": 0.70252025, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72378927, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 15511, + "time_per_iteration": 2.5080928802490234 + }, + { + "auxiliary_loss_clip": 0.01100016, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.02306938, + "balance_loss_mlp": 1.03380418, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.7360403763937744, + "language_loss": 0.78284937, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80419517, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15512, + "time_per_iteration": 3.827505111694336 + }, + { + "auxiliary_loss_clip": 0.01097608, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.01638508, + "balance_loss_mlp": 1.03290629, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.4383075090832378, + "language_loss": 0.80445802, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.825719, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 15513, + "time_per_iteration": 4.101969003677368 + }, + { + "auxiliary_loss_clip": 0.01105336, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.01902199, + "balance_loss_mlp": 1.03549969, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 1.8828331415899686, + "language_loss": 0.80006057, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82143408, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 15514, + "time_per_iteration": 3.816762685775757 + }, + { + "auxiliary_loss_clip": 0.0109669, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.02678835, + "balance_loss_mlp": 1.03406465, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.811888338938053, + "language_loss": 0.71603918, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.7373842, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 15515, + "time_per_iteration": 2.6306512355804443 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01835024, + "balance_loss_mlp": 1.03492391, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.3044810478032054, + "language_loss": 0.81098676, + "learning_rate": 4.704223662500806e-08, + "loss": 0.83232123, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 15516, + "time_per_iteration": 2.4116766452789307 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02131283, + "balance_loss_mlp": 1.03447771, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.8676294206901967, + "language_loss": 0.8110435, + "learning_rate": 4.695830062703643e-08, + "loss": 0.8323828, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 15517, + "time_per_iteration": 2.431884288787842 + }, + { + "auxiliary_loss_clip": 0.01099406, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.01764393, + "balance_loss_mlp": 1.03308225, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 4.35442936800558, + "language_loss": 0.74301833, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76431304, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 15518, + "time_per_iteration": 3.8434197902679443 + }, + { + "auxiliary_loss_clip": 0.01099221, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.01914942, + "balance_loss_mlp": 1.0343585, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 1.9069245404025545, + "language_loss": 0.75698578, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77828634, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 15519, + "time_per_iteration": 2.4393157958984375 + }, + { + "auxiliary_loss_clip": 0.0109868, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.02047777, + "balance_loss_mlp": 1.03313446, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.2060561744279785, + "language_loss": 0.83241522, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85372788, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 15520, + "time_per_iteration": 2.4281809329986572 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01652932, + "balance_loss_mlp": 1.03303838, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.737833160408125, + "language_loss": 0.762685, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78393459, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 15521, + "time_per_iteration": 2.45866060256958 + }, + { + "auxiliary_loss_clip": 0.01100752, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.01811028, + "balance_loss_mlp": 1.03619182, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.8418545146351015, + "language_loss": 0.77474684, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79604805, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 15522, + "time_per_iteration": 2.429081678390503 + }, + { + "auxiliary_loss_clip": 0.01099774, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01516044, + "balance_loss_mlp": 1.0343529, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 1.8415574051505679, + "language_loss": 0.63218462, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.65345347, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15523, + "time_per_iteration": 2.4532203674316406 + }, + { + "auxiliary_loss_clip": 0.01099046, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01813912, + "balance_loss_mlp": 1.03468919, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.705594176735913, + "language_loss": 0.68374217, + "learning_rate": 4.63728224861577e-08, + "loss": 0.7050252, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 15524, + "time_per_iteration": 2.5182032585144043 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.01965547, + "balance_loss_mlp": 1.03399134, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 2.2007198193448105, + "language_loss": 0.74041969, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76173198, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15525, + "time_per_iteration": 2.4798765182495117 + }, + { + "auxiliary_loss_clip": 0.01096428, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.03289247, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.793849639760779, + "language_loss": 0.83958673, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.86090219, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6328125, + "step": 15526, + "time_per_iteration": 2.4932663440704346 + }, + { + "auxiliary_loss_clip": 0.01102195, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.01432729, + "balance_loss_mlp": 1.03541946, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.9882587404334744, + "language_loss": 0.68634391, + "learning_rate": 4.61230144456366e-08, + "loss": 0.70762885, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15527, + "time_per_iteration": 2.437804937362671 + }, + { + "auxiliary_loss_clip": 0.01101792, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.01533103, + "balance_loss_mlp": 1.03472137, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.7450138693644768, + "language_loss": 0.64867574, + "learning_rate": 4.603989327701141e-08, + "loss": 0.66997665, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.66796875, + "step": 15528, + "time_per_iteration": 2.4500892162323 + }, + { + "auxiliary_loss_clip": 0.01100501, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.01744926, + "balance_loss_mlp": 1.03338695, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 1.7859602907094914, + "language_loss": 0.75145864, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77276123, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15529, + "time_per_iteration": 2.4640285968780518 + }, + { + "auxiliary_loss_clip": 0.01096769, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01828122, + "balance_loss_mlp": 1.03274751, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.785310773164946, + "language_loss": 0.62776995, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.64903188, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 15530, + "time_per_iteration": 2.455909252166748 + }, + { + "auxiliary_loss_clip": 0.0109778, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.0340414, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.8382088228922817, + "language_loss": 0.72503978, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74630278, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 15531, + "time_per_iteration": 2.4793055057525635 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.01028628, + "balance_loss_clip": 1.01671898, + "balance_loss_mlp": 1.03480935, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.7387644106584443, + "language_loss": 0.70876235, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.7300415, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 15532, + "time_per_iteration": 2.5110278129577637 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.01876354, + "balance_loss_mlp": 1.03404105, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.6017505405314953, + "language_loss": 0.73168802, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75300056, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 15533, + "time_per_iteration": 2.456326961517334 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.01614547, + "balance_loss_mlp": 1.03383231, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.8538496144624586, + "language_loss": 0.79222482, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81347942, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15534, + "time_per_iteration": 2.462285280227661 + }, + { + "auxiliary_loss_clip": 0.01094139, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.01709914, + "balance_loss_mlp": 1.03442264, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.6479187829635167, + "language_loss": 0.74347138, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76468462, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.59375, + "step": 15535, + "time_per_iteration": 2.4844884872436523 + }, + { + "auxiliary_loss_clip": 0.01101269, + "auxiliary_loss_mlp": 0.01027396, + "balance_loss_clip": 1.01573169, + "balance_loss_mlp": 1.0350976, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.9207427228488974, + "language_loss": 0.77459687, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79588354, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 15536, + "time_per_iteration": 2.521846294403076 + }, + { + "auxiliary_loss_clip": 0.01097297, + "auxiliary_loss_mlp": 0.01026696, + "balance_loss_clip": 1.01527619, + "balance_loss_mlp": 1.03239119, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.444547733533229, + "language_loss": 0.80330276, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.8245427, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15537, + "time_per_iteration": 2.473996639251709 + }, + { + "auxiliary_loss_clip": 0.01101334, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.02027667, + "balance_loss_mlp": 1.0340277, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.8807920821154451, + "language_loss": 0.77858669, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.79991663, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 15538, + "time_per_iteration": 2.518378973007202 + }, + { + "auxiliary_loss_clip": 0.01097238, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.01576018, + "balance_loss_mlp": 1.03289402, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.6297482884228507, + "language_loss": 0.73147398, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75271827, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 15539, + "time_per_iteration": 2.4873478412628174 + }, + { + "auxiliary_loss_clip": 0.01096595, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.03259134, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.6257546240564933, + "language_loss": 0.64682591, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66807657, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15540, + "time_per_iteration": 2.5722250938415527 + }, + { + "auxiliary_loss_clip": 0.01096636, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.03265882, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.6183457842403597, + "language_loss": 0.76627016, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78751922, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 15541, + "time_per_iteration": 2.436232089996338 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03635538, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.965621595224706, + "language_loss": 0.67185199, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.69318235, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 15542, + "time_per_iteration": 2.573796272277832 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.01739001, + "balance_loss_mlp": 1.03383517, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.8093294927002697, + "language_loss": 0.6968419, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71812713, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 15543, + "time_per_iteration": 2.44272518157959 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.01826096, + "balance_loss_mlp": 1.0330987, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 1.657022098990054, + "language_loss": 0.69621456, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71753359, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 15544, + "time_per_iteration": 2.5107553005218506 + }, + { + "auxiliary_loss_clip": 0.01102161, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.02158332, + "balance_loss_mlp": 1.03555083, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.6017319238780592, + "language_loss": 0.77028668, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79163849, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 15545, + "time_per_iteration": 2.4662442207336426 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01026866, + "balance_loss_clip": 1.01607156, + "balance_loss_mlp": 1.0343653, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.5473427515527929, + "language_loss": 0.68910575, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71038377, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 15546, + "time_per_iteration": 2.4668684005737305 + }, + { + "auxiliary_loss_clip": 0.0109524, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.01644647, + "balance_loss_mlp": 1.0332005, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 1.682178435100884, + "language_loss": 0.827672, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84889573, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 15547, + "time_per_iteration": 2.5151429176330566 + }, + { + "auxiliary_loss_clip": 0.01095669, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.02056944, + "balance_loss_mlp": 1.03179169, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.8674494270918909, + "language_loss": 0.83416784, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85543656, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 15548, + "time_per_iteration": 2.4793310165405273 + }, + { + "auxiliary_loss_clip": 0.01101921, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.03354442, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.5491704453799409, + "language_loss": 0.6522944, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.6736412, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 15549, + "time_per_iteration": 2.5649001598358154 + }, + { + "auxiliary_loss_clip": 0.01102455, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.02144527, + "balance_loss_mlp": 1.03610802, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 2.6870852732968324, + "language_loss": 0.80190766, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82326579, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 15550, + "time_per_iteration": 2.447848081588745 + }, + { + "auxiliary_loss_clip": 0.01100445, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.01714873, + "balance_loss_mlp": 1.0355283, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.7715830672341057, + "language_loss": 0.75721681, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77850437, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15551, + "time_per_iteration": 2.431541681289673 + }, + { + "auxiliary_loss_clip": 0.01095285, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.01916671, + "balance_loss_mlp": 1.03277612, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.5972871846705574, + "language_loss": 0.73139381, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75263715, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.625, + "step": 15552, + "time_per_iteration": 2.4882681369781494 + }, + { + "auxiliary_loss_clip": 0.01098605, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.02037311, + "balance_loss_mlp": 1.03382468, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 2.1019402577622315, + "language_loss": 0.77461952, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79592001, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15553, + "time_per_iteration": 3.8587379455566406 + }, + { + "auxiliary_loss_clip": 0.01101745, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.02267838, + "balance_loss_mlp": 1.03456163, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.5007379284122981, + "language_loss": 0.78357017, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80493736, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15554, + "time_per_iteration": 2.419851303100586 + }, + { + "auxiliary_loss_clip": 0.01093625, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.01656055, + "balance_loss_mlp": 1.03161645, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.7958483110944459, + "language_loss": 0.69293928, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71414196, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.6171875, + "step": 15555, + "time_per_iteration": 3.920722007751465 + }, + { + "auxiliary_loss_clip": 0.01098789, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.02724099, + "balance_loss_mlp": 1.03382492, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.8860733218695758, + "language_loss": 0.7554931, + "learning_rate": 4.374259430715965e-08, + "loss": 0.776869, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15556, + "time_per_iteration": 3.848532199859619 + }, + { + "auxiliary_loss_clip": 0.01098399, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01864231, + "balance_loss_mlp": 1.0332365, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.6008830857055418, + "language_loss": 0.72704911, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74832916, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 15557, + "time_per_iteration": 2.486417770385742 + }, + { + "auxiliary_loss_clip": 0.01099803, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.03413987, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.5664177118870293, + "language_loss": 0.63356799, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65487558, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 15558, + "time_per_iteration": 2.415761709213257 + }, + { + "auxiliary_loss_clip": 0.01099528, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.0146395, + "balance_loss_mlp": 1.03434324, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 1.8958255236053232, + "language_loss": 0.73185015, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75311458, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 15559, + "time_per_iteration": 2.4690446853637695 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.01587224, + "balance_loss_mlp": 1.03340673, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.6349606783173563, + "language_loss": 0.63386834, + "learning_rate": 4.341915477147062e-08, + "loss": 0.6550889, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62890625, + "step": 15560, + "time_per_iteration": 4.014149188995361 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.01869702, + "balance_loss_mlp": 1.03587627, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.2699289713088557, + "language_loss": 0.6402877, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66168469, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 15561, + "time_per_iteration": 2.4404451847076416 + }, + { + "auxiliary_loss_clip": 0.0109953, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.02250171, + "balance_loss_mlp": 1.03558934, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.676704048305052, + "language_loss": 0.7533828, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77472275, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 15562, + "time_per_iteration": 2.477750062942505 + }, + { + "auxiliary_loss_clip": 0.01021581, + "auxiliary_loss_mlp": 0.01002822, + "balance_loss_clip": 1.00183833, + "balance_loss_mlp": 1.00135922, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9485732180196381, + "language_loss": 0.62341046, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64365447, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 15563, + "time_per_iteration": 2.8820064067840576 + }, + { + "auxiliary_loss_clip": 0.01096826, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.02134371, + "balance_loss_mlp": 1.03330636, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.6269402183292514, + "language_loss": 0.78099597, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80229235, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 15564, + "time_per_iteration": 2.534823179244995 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.03237152, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 1.7253317771488292, + "language_loss": 0.77913517, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80043793, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 15565, + "time_per_iteration": 2.5161406993865967 + }, + { + "auxiliary_loss_clip": 0.01096793, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.01708746, + "balance_loss_mlp": 1.03307271, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 2.201571134933277, + "language_loss": 0.72346658, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74471718, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 15566, + "time_per_iteration": 2.4636313915252686 + }, + { + "auxiliary_loss_clip": 0.01098024, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.0176307, + "balance_loss_mlp": 1.03258288, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.7833005232055914, + "language_loss": 0.67558104, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69684815, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15567, + "time_per_iteration": 2.476928234100342 + }, + { + "auxiliary_loss_clip": 0.01100526, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.02041149, + "balance_loss_mlp": 1.03555894, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 1.8979470567476942, + "language_loss": 0.62194836, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64327252, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15568, + "time_per_iteration": 2.5609068870544434 + }, + { + "auxiliary_loss_clip": 0.01097511, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.02045119, + "balance_loss_mlp": 1.03235245, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.8186215129656738, + "language_loss": 0.78508359, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80637741, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15569, + "time_per_iteration": 2.534830093383789 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01937127, + "balance_loss_mlp": 1.03493094, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.2083929656127816, + "language_loss": 0.69096726, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71229541, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 15570, + "time_per_iteration": 2.446850299835205 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.02060008, + "balance_loss_mlp": 1.032758, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.7500529924071564, + "language_loss": 0.78419554, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.80550903, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15571, + "time_per_iteration": 2.527392864227295 + }, + { + "auxiliary_loss_clip": 0.01098413, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01873899, + "balance_loss_mlp": 1.03342748, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 1.8544768580864697, + "language_loss": 0.77347147, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79475832, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 15572, + "time_per_iteration": 2.437767505645752 + }, + { + "auxiliary_loss_clip": 0.0109615, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02450323, + "balance_loss_mlp": 1.03296947, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.755875004895135, + "language_loss": 0.77844107, + "learning_rate": 4.237617570010688e-08, + "loss": 0.79975855, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 15573, + "time_per_iteration": 2.4989819526672363 + }, + { + "auxiliary_loss_clip": 0.01095269, + "auxiliary_loss_mlp": 0.01026582, + "balance_loss_clip": 1.01550794, + "balance_loss_mlp": 1.03273368, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.629671028600837, + "language_loss": 0.74591202, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76713055, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.625, + "step": 15574, + "time_per_iteration": 2.4767251014709473 + }, + { + "auxiliary_loss_clip": 0.01095997, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01983523, + "balance_loss_mlp": 1.03279662, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.8734211277341717, + "language_loss": 0.67999518, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70126772, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 15575, + "time_per_iteration": 2.490079164505005 + }, + { + "auxiliary_loss_clip": 0.01096514, + "auxiliary_loss_mlp": 0.01029364, + "balance_loss_clip": 1.01773548, + "balance_loss_mlp": 1.03346181, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 2.0740128119343484, + "language_loss": 0.65354764, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67480642, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 15576, + "time_per_iteration": 2.4461007118225098 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.01396561, + "balance_loss_mlp": 1.03255725, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.366361816458, + "language_loss": 0.75638366, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.77763987, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15577, + "time_per_iteration": 2.4425899982452393 + }, + { + "auxiliary_loss_clip": 0.01098342, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01796925, + "balance_loss_mlp": 1.03240597, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 2.7524540234782857, + "language_loss": 0.52199161, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54327154, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 15578, + "time_per_iteration": 2.5211164951324463 + }, + { + "auxiliary_loss_clip": 0.01098227, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.02001023, + "balance_loss_mlp": 1.03396976, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.560613386304835, + "language_loss": 0.70552897, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72682047, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 15579, + "time_per_iteration": 2.52437424659729 + }, + { + "auxiliary_loss_clip": 0.01097221, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01600599, + "balance_loss_mlp": 1.03381634, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.8413182351344008, + "language_loss": 0.76279169, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78403246, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15580, + "time_per_iteration": 2.4710912704467773 + }, + { + "auxiliary_loss_clip": 0.01104329, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.0158999, + "balance_loss_mlp": 1.03657007, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 2.694112745852956, + "language_loss": 0.66185987, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68318188, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 15581, + "time_per_iteration": 2.470471143722534 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.01533818, + "balance_loss_mlp": 1.0348171, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.5158338005661471, + "language_loss": 0.76600075, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78727901, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 15582, + "time_per_iteration": 2.488633632659912 + }, + { + "auxiliary_loss_clip": 0.01098016, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.02078104, + "balance_loss_mlp": 1.03378606, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.7421816786299127, + "language_loss": 0.73751408, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.7588228, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.640625, + "step": 15583, + "time_per_iteration": 2.494215965270996 + }, + { + "auxiliary_loss_clip": 0.0110384, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01834249, + "balance_loss_mlp": 1.03549671, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.4708375978600583, + "language_loss": 0.84226978, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.8636151, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15584, + "time_per_iteration": 2.5331246852874756 + }, + { + "auxiliary_loss_clip": 0.01103426, + "auxiliary_loss_mlp": 0.01034145, + "balance_loss_clip": 1.02168131, + "balance_loss_mlp": 1.03460801, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.582838620482487, + "language_loss": 0.72438812, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.7457639, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 15585, + "time_per_iteration": 2.6119656562805176 + }, + { + "auxiliary_loss_clip": 0.01094739, + "auxiliary_loss_mlp": 0.01025263, + "balance_loss_clip": 1.01492119, + "balance_loss_mlp": 1.0317564, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.8473682533271836, + "language_loss": 0.80436736, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82556736, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 15586, + "time_per_iteration": 2.484668493270874 + }, + { + "auxiliary_loss_clip": 0.01098095, + "auxiliary_loss_mlp": 0.01030395, + "balance_loss_clip": 1.01892114, + "balance_loss_mlp": 1.03355002, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.689751969140814, + "language_loss": 0.76728654, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78857148, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15587, + "time_per_iteration": 2.463247299194336 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.01624346, + "balance_loss_mlp": 1.03459477, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.8161578272340377, + "language_loss": 0.87579244, + "learning_rate": 4.118832771491387e-08, + "loss": 0.8970964, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 15588, + "time_per_iteration": 2.412489891052246 + }, + { + "auxiliary_loss_clip": 0.01095862, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.0171299, + "balance_loss_mlp": 1.03373146, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.564245116069396, + "language_loss": 0.78160763, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80284393, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 15589, + "time_per_iteration": 2.445446014404297 + }, + { + "auxiliary_loss_clip": 0.01097837, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.01799083, + "balance_loss_mlp": 1.03447211, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.7350020828956296, + "language_loss": 0.77957153, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80084348, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 15590, + "time_per_iteration": 2.428891897201538 + }, + { + "auxiliary_loss_clip": 0.01102134, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.0234977, + "balance_loss_mlp": 1.0337714, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.8230621785176295, + "language_loss": 0.71332479, + "learning_rate": 4.095276330969577e-08, + "loss": 0.734707, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 15591, + "time_per_iteration": 2.4772777557373047 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.02017736, + "balance_loss_mlp": 1.03551292, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 2.4539812821025895, + "language_loss": 0.54102397, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.56240666, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69140625, + "step": 15592, + "time_per_iteration": 2.541588068008423 + }, + { + "auxiliary_loss_clip": 0.01098357, + "auxiliary_loss_mlp": 0.01026018, + "balance_loss_clip": 1.01555693, + "balance_loss_mlp": 1.03418398, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 3.3346969261937245, + "language_loss": 0.67238343, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69362718, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15593, + "time_per_iteration": 2.4413111209869385 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01530337, + "balance_loss_mlp": 1.03389668, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.4835867748670866, + "language_loss": 0.74052262, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76178527, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15594, + "time_per_iteration": 2.497950315475464 + }, + { + "auxiliary_loss_clip": 0.0109474, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.01693356, + "balance_loss_mlp": 1.03231061, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.7070302081902384, + "language_loss": 0.73724419, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75846702, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 15595, + "time_per_iteration": 3.9548685550689697 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01025968, + "balance_loss_clip": 1.01471496, + "balance_loss_mlp": 1.03330636, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.9094131028649322, + "language_loss": 0.76069069, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78194779, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15596, + "time_per_iteration": 2.472580671310425 + }, + { + "auxiliary_loss_clip": 0.0109939, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.01964438, + "balance_loss_mlp": 1.033849, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.6688470570241747, + "language_loss": 0.78528333, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80658782, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 15597, + "time_per_iteration": 3.8680100440979004 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01869369, + "balance_loss_mlp": 1.03471541, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.8551083723100676, + "language_loss": 0.81072772, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83206093, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 15598, + "time_per_iteration": 3.8231723308563232 + }, + { + "auxiliary_loss_clip": 0.01104728, + "auxiliary_loss_mlp": 0.01031174, + "balance_loss_clip": 1.01905012, + "balance_loss_mlp": 1.03351772, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 1.7860005158481418, + "language_loss": 0.63344586, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65480494, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 15599, + "time_per_iteration": 2.4853222370147705 + }, + { + "auxiliary_loss_clip": 0.01100601, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01851487, + "balance_loss_mlp": 1.03380525, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.6427203157979469, + "language_loss": 0.73457086, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75587785, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 15600, + "time_per_iteration": 2.4055838584899902 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01668882, + "balance_loss_mlp": 1.03379583, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.82264927435843, + "language_loss": 0.69327891, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71452916, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.640625, + "step": 15601, + "time_per_iteration": 2.4290764331817627 + }, + { + "auxiliary_loss_clip": 0.01021583, + "auxiliary_loss_mlp": 0.01004526, + "balance_loss_clip": 1.0034945, + "balance_loss_mlp": 1.00159645, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.756115695258228, + "language_loss": 0.58134079, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60160184, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.19921875, + "step": 15602, + "time_per_iteration": 4.649659156799316 + }, + { + "auxiliary_loss_clip": 0.01100223, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02322853, + "balance_loss_mlp": 1.03378415, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.0216671165230022, + "language_loss": 0.71774584, + "learning_rate": 4.001719234324663e-08, + "loss": 0.73909038, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 15603, + "time_per_iteration": 2.439192295074463 + }, + { + "auxiliary_loss_clip": 0.01091613, + "auxiliary_loss_mlp": 0.01025078, + "balance_loss_clip": 1.01448584, + "balance_loss_mlp": 1.03135061, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.5724913358336257, + "language_loss": 0.7588923, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78005922, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6015625, + "step": 15604, + "time_per_iteration": 2.430049419403076 + }, + { + "auxiliary_loss_clip": 0.01101631, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.01753354, + "balance_loss_mlp": 1.03493166, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 1.820480005637361, + "language_loss": 0.65220332, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67351258, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 15605, + "time_per_iteration": 2.454102039337158 + }, + { + "auxiliary_loss_clip": 0.01103599, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.02008247, + "balance_loss_mlp": 1.03473902, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.693925168028821, + "language_loss": 0.67501086, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69637167, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 15606, + "time_per_iteration": 2.6222922801971436 + }, + { + "auxiliary_loss_clip": 0.01093903, + "auxiliary_loss_mlp": 0.01026117, + "balance_loss_clip": 1.01541805, + "balance_loss_mlp": 1.03187966, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.7830574782726436, + "language_loss": 0.77636516, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79756534, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 15607, + "time_per_iteration": 2.437866449356079 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.01631784, + "balance_loss_mlp": 1.03343678, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 1.7371260163704862, + "language_loss": 0.82830989, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84958094, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 15608, + "time_per_iteration": 2.4567601680755615 + }, + { + "auxiliary_loss_clip": 0.01104286, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.02112722, + "balance_loss_mlp": 1.03803909, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.768980472763552, + "language_loss": 0.68811715, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.70949221, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15609, + "time_per_iteration": 2.4339687824249268 + }, + { + "auxiliary_loss_clip": 0.01101203, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01521575, + "balance_loss_mlp": 1.0328238, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 1.968005818386474, + "language_loss": 0.75119251, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77247989, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 15610, + "time_per_iteration": 2.455913543701172 + }, + { + "auxiliary_loss_clip": 0.01101386, + "auxiliary_loss_mlp": 0.01026005, + "balance_loss_clip": 1.01504338, + "balance_loss_mlp": 1.0345664, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 1.7573633557024793, + "language_loss": 0.74986607, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77113998, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 15611, + "time_per_iteration": 2.4424426555633545 + }, + { + "auxiliary_loss_clip": 0.01099404, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.01703119, + "balance_loss_mlp": 1.03506732, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 2.1565540073741447, + "language_loss": 0.65710843, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.67838764, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15612, + "time_per_iteration": 2.460383892059326 + }, + { + "auxiliary_loss_clip": 0.01096532, + "auxiliary_loss_mlp": 0.01026706, + "balance_loss_clip": 1.01542926, + "balance_loss_mlp": 1.03311181, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.7800423246546628, + "language_loss": 0.57413054, + "learning_rate": 3.924572515435742e-08, + "loss": 0.5953629, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 15613, + "time_per_iteration": 2.4363303184509277 + }, + { + "auxiliary_loss_clip": 0.01098477, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.02088487, + "balance_loss_mlp": 1.03223801, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 3.022339916598047, + "language_loss": 0.70700508, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72831357, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15614, + "time_per_iteration": 2.5284812450408936 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.01882124, + "balance_loss_mlp": 1.03525591, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.7543973322908877, + "language_loss": 0.81266332, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83400273, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 15615, + "time_per_iteration": 2.423703193664551 + }, + { + "auxiliary_loss_clip": 0.01097442, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.01468527, + "balance_loss_mlp": 1.03362358, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.5887485146879645, + "language_loss": 0.71745086, + "learning_rate": 3.901573472884134e-08, + "loss": 0.7386812, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 15616, + "time_per_iteration": 2.493049144744873 + }, + { + "auxiliary_loss_clip": 0.01100207, + "auxiliary_loss_mlp": 0.01027907, + "balance_loss_clip": 1.01633799, + "balance_loss_mlp": 1.03520691, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 1.8635102246300295, + "language_loss": 0.66588014, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68716127, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15617, + "time_per_iteration": 2.433169364929199 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02229548, + "balance_loss_mlp": 1.03535593, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.1232753256513264, + "language_loss": 0.73530006, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75670093, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 15618, + "time_per_iteration": 2.4792399406433105 + }, + { + "auxiliary_loss_clip": 0.01103237, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.01622224, + "balance_loss_mlp": 1.03472626, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.9955954128383109, + "language_loss": 0.70013475, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72145975, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 15619, + "time_per_iteration": 2.449866533279419 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01731527, + "balance_loss_mlp": 1.03438771, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.6070276908213748, + "language_loss": 0.77566183, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79694998, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15620, + "time_per_iteration": 2.4847350120544434 + }, + { + "auxiliary_loss_clip": 0.01095352, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.01688337, + "balance_loss_mlp": 1.03205025, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 2.459938458959684, + "language_loss": 0.73743159, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75866973, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 15621, + "time_per_iteration": 2.4089574813842773 + }, + { + "auxiliary_loss_clip": 0.01104801, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02192533, + "balance_loss_mlp": 1.03624845, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 1.931241274628396, + "language_loss": 0.66069001, + "learning_rate": 3.855776169545688e-08, + "loss": 0.6820786, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 15622, + "time_per_iteration": 2.4193296432495117 + }, + { + "auxiliary_loss_clip": 0.01096904, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01664114, + "balance_loss_mlp": 1.03303981, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.5757601790448577, + "language_loss": 0.71780264, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73904526, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 15623, + "time_per_iteration": 2.4576759338378906 + }, + { + "auxiliary_loss_clip": 0.01103573, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.01704717, + "balance_loss_mlp": 1.03622472, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 1.8246277533972777, + "language_loss": 0.72611034, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74743092, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 15624, + "time_per_iteration": 2.574350357055664 + }, + { + "auxiliary_loss_clip": 0.01097672, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01789486, + "balance_loss_mlp": 1.03316939, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 1.781484108648701, + "language_loss": 0.89487529, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91614664, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 15625, + "time_per_iteration": 2.441397190093994 + }, + { + "auxiliary_loss_clip": 0.01099705, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.01952124, + "balance_loss_mlp": 1.03396618, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 2.0992089201785076, + "language_loss": 0.83631927, + "learning_rate": 3.825393386298592e-08, + "loss": 0.8576293, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15626, + "time_per_iteration": 2.4755821228027344 + }, + { + "auxiliary_loss_clip": 0.01021753, + "auxiliary_loss_mlp": 0.00999666, + "balance_loss_clip": 0.9987244, + "balance_loss_mlp": 1.00174892, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.77489009158345, + "language_loss": 0.56156707, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58178127, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20019531, + "step": 15627, + "time_per_iteration": 2.99603271484375 + }, + { + "auxiliary_loss_clip": 0.01099008, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.018857, + "balance_loss_mlp": 1.03538132, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 2.0619040922796605, + "language_loss": 0.69850802, + "learning_rate": 3.810246627288105e-08, + "loss": 0.71979451, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 15628, + "time_per_iteration": 2.4771571159362793 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01757264, + "balance_loss_mlp": 1.03369975, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.5748669806960962, + "language_loss": 0.75526696, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77654099, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15629, + "time_per_iteration": 2.484584331512451 + }, + { + "auxiliary_loss_clip": 0.01095519, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.01682913, + "balance_loss_mlp": 1.03342628, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.7575351495605849, + "language_loss": 0.74100959, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76224142, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62109375, + "step": 15630, + "time_per_iteration": 2.438732862472534 + }, + { + "auxiliary_loss_clip": 0.01095471, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01762676, + "balance_loss_mlp": 1.0336659, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 1.9628728384394338, + "language_loss": 0.69608629, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71732402, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6171875, + "step": 15631, + "time_per_iteration": 2.442594289779663 + }, + { + "auxiliary_loss_clip": 0.01098903, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.02210879, + "balance_loss_mlp": 1.03457558, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 1.6711410965804296, + "language_loss": 0.7501359, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77145523, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 15632, + "time_per_iteration": 2.468679428100586 + }, + { + "auxiliary_loss_clip": 0.01105333, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.01908159, + "balance_loss_mlp": 1.03677893, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.544123558907395, + "language_loss": 0.7436294, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76500344, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 15633, + "time_per_iteration": 2.484532594680786 + }, + { + "auxiliary_loss_clip": 0.01101475, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.0230515, + "balance_loss_mlp": 1.03425372, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 1.9390816781204983, + "language_loss": 0.72402227, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74539268, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 15634, + "time_per_iteration": 2.5017216205596924 + }, + { + "auxiliary_loss_clip": 0.01101172, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.017694, + "balance_loss_mlp": 1.03352332, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.8378932656538167, + "language_loss": 0.689273, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71058238, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 15635, + "time_per_iteration": 2.422240972518921 + }, + { + "auxiliary_loss_clip": 0.01096959, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01565313, + "balance_loss_mlp": 1.0343622, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.6117184105576439, + "language_loss": 0.74286044, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76409698, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 15636, + "time_per_iteration": 2.4935927391052246 + }, + { + "auxiliary_loss_clip": 0.01101255, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.01831651, + "balance_loss_mlp": 1.03481007, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 1.952777040568534, + "language_loss": 0.82884896, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.85016298, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15637, + "time_per_iteration": 3.8426685333251953 + }, + { + "auxiliary_loss_clip": 0.0109996, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.01624513, + "balance_loss_mlp": 1.03416872, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.38782195804008, + "language_loss": 0.68863559, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.70991588, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 15638, + "time_per_iteration": 2.4199607372283936 + }, + { + "auxiliary_loss_clip": 0.01094752, + "auxiliary_loss_mlp": 0.01035807, + "balance_loss_clip": 1.02552509, + "balance_loss_mlp": 1.0335573, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.5928992567887847, + "language_loss": 0.84922618, + "learning_rate": 3.727471440859498e-08, + "loss": 0.8705318, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.609375, + "step": 15639, + "time_per_iteration": 5.2344276905059814 + }, + { + "auxiliary_loss_clip": 0.01097979, + "auxiliary_loss_mlp": 0.01025309, + "balance_loss_clip": 1.01428199, + "balance_loss_mlp": 1.03253627, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.5027378140640861, + "language_loss": 0.78141928, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80265212, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15640, + "time_per_iteration": 2.494884729385376 + }, + { + "auxiliary_loss_clip": 0.01100943, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.01984107, + "balance_loss_mlp": 1.0335753, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.5522382410230178, + "language_loss": 0.74184501, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76316881, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15641, + "time_per_iteration": 2.5522215366363525 + }, + { + "auxiliary_loss_clip": 0.01103462, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.01514721, + "balance_loss_mlp": 1.03437018, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 1.8973680252603045, + "language_loss": 0.82064319, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84195864, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 15642, + "time_per_iteration": 2.452345132827759 + }, + { + "auxiliary_loss_clip": 0.01095842, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.01798356, + "balance_loss_mlp": 1.0321306, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 2.054671844986166, + "language_loss": 0.6789223, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70016909, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 15643, + "time_per_iteration": 3.902398109436035 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.02403283, + "balance_loss_mlp": 1.03681779, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 2.174179012657807, + "language_loss": 0.76626414, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78767455, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 15644, + "time_per_iteration": 2.440704107284546 + }, + { + "auxiliary_loss_clip": 0.0109653, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.01788664, + "balance_loss_mlp": 1.03375196, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.502080892073022, + "language_loss": 0.67556715, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69681478, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 15645, + "time_per_iteration": 2.4514553546905518 + }, + { + "auxiliary_loss_clip": 0.01094594, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01893699, + "balance_loss_mlp": 1.03271198, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 2.1236359589025944, + "language_loss": 0.702784, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72403121, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62109375, + "step": 15646, + "time_per_iteration": 2.5117604732513428 + }, + { + "auxiliary_loss_clip": 0.01095576, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.01755929, + "balance_loss_mlp": 1.03207064, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.6591127603989193, + "language_loss": 0.74060643, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76184905, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 15647, + "time_per_iteration": 2.4559590816497803 + }, + { + "auxiliary_loss_clip": 0.0102153, + "auxiliary_loss_mlp": 0.00997604, + "balance_loss_clip": 0.99657249, + "balance_loss_mlp": 1.00147247, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8841622693124102, + "language_loss": 0.63519818, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65538949, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 15648, + "time_per_iteration": 3.1404430866241455 + }, + { + "auxiliary_loss_clip": 0.01096098, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.01900291, + "balance_loss_mlp": 1.03372252, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.3226371584068994, + "language_loss": 0.66610408, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68736088, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 15649, + "time_per_iteration": 2.475015878677368 + }, + { + "auxiliary_loss_clip": 0.01097478, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.01572871, + "balance_loss_mlp": 1.03270769, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 1.8009089263007458, + "language_loss": 0.77365673, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79489779, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 15650, + "time_per_iteration": 2.4524431228637695 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03667188, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.6843054637423838, + "language_loss": 0.74430692, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76559329, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 15651, + "time_per_iteration": 2.463660478591919 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.02029419, + "balance_loss_mlp": 1.03302097, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.7829551002680968, + "language_loss": 0.7249018, + "learning_rate": 3.630807306650507e-08, + "loss": 0.7462002, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 15652, + "time_per_iteration": 2.4381537437438965 + }, + { + "auxiliary_loss_clip": 0.01104805, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.02049518, + "balance_loss_mlp": 1.0356704, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.691105612906213, + "language_loss": 0.66318548, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68456018, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 15653, + "time_per_iteration": 2.4872758388519287 + }, + { + "auxiliary_loss_clip": 0.01100084, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.02277052, + "balance_loss_mlp": 1.03475976, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 1.8957291513192398, + "language_loss": 0.7746827, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79602987, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 15654, + "time_per_iteration": 2.446638822555542 + }, + { + "auxiliary_loss_clip": 0.01103532, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.01635361, + "balance_loss_mlp": 1.03482795, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.5569925179074333, + "language_loss": 0.70128828, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.7226038, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 15655, + "time_per_iteration": 2.6205286979675293 + }, + { + "auxiliary_loss_clip": 0.01099387, + "auxiliary_loss_mlp": 0.01029822, + "balance_loss_clip": 1.01775813, + "balance_loss_mlp": 1.03365254, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 2.4560073984117587, + "language_loss": 0.71858692, + "learning_rate": 3.601317642987944e-08, + "loss": 0.73987901, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 15656, + "time_per_iteration": 2.4531502723693848 + }, + { + "auxiliary_loss_clip": 0.01098082, + "auxiliary_loss_mlp": 0.01023486, + "balance_loss_clip": 1.01273918, + "balance_loss_mlp": 1.03367221, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1.7752477061266863, + "language_loss": 0.77574635, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79696202, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 15657, + "time_per_iteration": 2.463580369949341 + }, + { + "auxiliary_loss_clip": 0.01097093, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.0169487, + "balance_loss_mlp": 1.03154922, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.647654113468961, + "language_loss": 0.84199923, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86325324, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 15658, + "time_per_iteration": 2.4232261180877686 + }, + { + "auxiliary_loss_clip": 0.01105471, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.01844144, + "balance_loss_mlp": 1.03557217, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.771509700042808, + "language_loss": 0.70189822, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72327328, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 15659, + "time_per_iteration": 2.417872667312622 + }, + { + "auxiliary_loss_clip": 0.01097087, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02753043, + "balance_loss_mlp": 1.03271377, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.6741789301448684, + "language_loss": 0.79718721, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81853777, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 15660, + "time_per_iteration": 2.473811626434326 + }, + { + "auxiliary_loss_clip": 0.01095424, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01637769, + "balance_loss_mlp": 1.03255498, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4876002398882395, + "language_loss": 0.67924452, + "learning_rate": 3.564623133290201e-08, + "loss": 0.700472, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 15661, + "time_per_iteration": 2.494828224182129 + }, + { + "auxiliary_loss_clip": 0.01098031, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01697206, + "balance_loss_mlp": 1.03291059, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.0808441328825977, + "language_loss": 0.65976989, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68103254, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 15662, + "time_per_iteration": 2.4719059467315674 + }, + { + "auxiliary_loss_clip": 0.0102173, + "auxiliary_loss_mlp": 0.0099693, + "balance_loss_clip": 0.9958874, + "balance_loss_mlp": 1.00163436, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7998608385286157, + "language_loss": 0.59257972, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61276639, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20117188, + "step": 15663, + "time_per_iteration": 3.091102361679077 + }, + { + "auxiliary_loss_clip": 0.01104755, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02186322, + "balance_loss_mlp": 1.03546786, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.7691136273672572, + "language_loss": 0.66977489, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69116765, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 15664, + "time_per_iteration": 2.551748514175415 + }, + { + "auxiliary_loss_clip": 0.01098686, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01798368, + "balance_loss_mlp": 1.03435826, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 1.9356485075218302, + "language_loss": 0.73331189, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75458938, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15665, + "time_per_iteration": 2.406175136566162 + }, + { + "auxiliary_loss_clip": 0.01096646, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.01976252, + "balance_loss_mlp": 1.03395486, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 1.8537640215449973, + "language_loss": 0.6403262, + "learning_rate": 3.528114844807773e-08, + "loss": 0.6616019, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.625, + "step": 15666, + "time_per_iteration": 2.4373819828033447 + }, + { + "auxiliary_loss_clip": 0.0109862, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.01766348, + "balance_loss_mlp": 1.03337002, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.6324582019369962, + "language_loss": 0.78879476, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81007606, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15667, + "time_per_iteration": 2.475374937057495 + }, + { + "auxiliary_loss_clip": 0.01096246, + "auxiliary_loss_mlp": 0.01025503, + "balance_loss_clip": 1.01497126, + "balance_loss_mlp": 1.03265738, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.624394565290511, + "language_loss": 0.75196528, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77318275, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15668, + "time_per_iteration": 2.4471614360809326 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01816332, + "balance_loss_mlp": 1.03369188, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 1.9068150139055333, + "language_loss": 0.58626127, + "learning_rate": 3.506299272306723e-08, + "loss": 0.6075663, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 15669, + "time_per_iteration": 2.4526097774505615 + }, + { + "auxiliary_loss_clip": 0.01094433, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.01398444, + "balance_loss_mlp": 1.03261268, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.5947911043474419, + "language_loss": 0.76924133, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79042977, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6171875, + "step": 15670, + "time_per_iteration": 2.4160544872283936 + }, + { + "auxiliary_loss_clip": 0.01100773, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.03572822, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.7512755345783233, + "language_loss": 0.65079868, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67216229, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 15671, + "time_per_iteration": 2.55161714553833 + }, + { + "auxiliary_loss_clip": 0.01098368, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.018682, + "balance_loss_mlp": 1.03393149, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.6476234457343555, + "language_loss": 0.79277271, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81405926, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15672, + "time_per_iteration": 2.447295904159546 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.02151322, + "balance_loss_mlp": 1.03375137, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.0043909265560376, + "language_loss": 0.73136175, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75272328, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15673, + "time_per_iteration": 2.4218337535858154 + }, + { + "auxiliary_loss_clip": 0.01097219, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.01602328, + "balance_loss_mlp": 1.03284574, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.8096983236306534, + "language_loss": 0.70210505, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72335875, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 15674, + "time_per_iteration": 2.529244899749756 + }, + { + "auxiliary_loss_clip": 0.01098708, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.01358771, + "balance_loss_mlp": 1.03351128, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.6393358696114226, + "language_loss": 0.81179047, + "learning_rate": 3.462869313364125e-08, + "loss": 0.8330223, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15675, + "time_per_iteration": 2.4658918380737305 + }, + { + "auxiliary_loss_clip": 0.01099127, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01464629, + "balance_loss_mlp": 1.03426433, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.5876960969874918, + "language_loss": 0.62726951, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64851522, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15676, + "time_per_iteration": 2.4417946338653564 + }, + { + "auxiliary_loss_clip": 0.01098357, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.02487063, + "balance_loss_mlp": 1.03403687, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.8124440468935443, + "language_loss": 0.67221808, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69356304, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 15677, + "time_per_iteration": 2.468874454498291 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.02086103, + "balance_loss_mlp": 1.03233802, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.9350758720269774, + "language_loss": 0.64217019, + "learning_rate": 3.441254941744387e-08, + "loss": 0.6635049, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 15678, + "time_per_iteration": 4.009870290756226 + }, + { + "auxiliary_loss_clip": 0.01098418, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.01448464, + "balance_loss_mlp": 1.03428507, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.6704050189510526, + "language_loss": 0.74096805, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76221395, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 15679, + "time_per_iteration": 2.480060338973999 + }, + { + "auxiliary_loss_clip": 0.01102722, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02322173, + "balance_loss_mlp": 1.03508186, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.2968062400181757, + "language_loss": 0.7742976, + "learning_rate": 3.426882627845762e-08, + "loss": 0.7956742, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 15680, + "time_per_iteration": 3.8283774852752686 + }, + { + "auxiliary_loss_clip": 0.01098292, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.02092791, + "balance_loss_mlp": 1.03348768, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 2.224608877845115, + "language_loss": 0.75309384, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77440059, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15681, + "time_per_iteration": 3.931302547454834 + }, + { + "auxiliary_loss_clip": 0.01101593, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.0193882, + "balance_loss_mlp": 1.035344, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 2.642338039071416, + "language_loss": 0.65794468, + "learning_rate": 3.412540130236086e-08, + "loss": 0.67927414, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 15682, + "time_per_iteration": 2.472961664199829 + }, + { + "auxiliary_loss_clip": 0.01096718, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.01485944, + "balance_loss_mlp": 1.03221107, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 3.7877883909728833, + "language_loss": 0.76713276, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78836143, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 15683, + "time_per_iteration": 2.4841740131378174 + }, + { + "auxiliary_loss_clip": 0.01103642, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.02421999, + "balance_loss_mlp": 1.03587162, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 2.6308434304413066, + "language_loss": 0.75243759, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77383941, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 15684, + "time_per_iteration": 2.407205104827881 + }, + { + "auxiliary_loss_clip": 0.01096124, + "auxiliary_loss_mlp": 0.01024816, + "balance_loss_clip": 1.01373529, + "balance_loss_mlp": 1.033481, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.5634156526409637, + "language_loss": 0.77202857, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79323792, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.625, + "step": 15685, + "time_per_iteration": 3.919050455093384 + }, + { + "auxiliary_loss_clip": 0.01095885, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01716995, + "balance_loss_mlp": 1.03258085, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.8298649374723515, + "language_loss": 0.75466609, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77590549, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 15686, + "time_per_iteration": 2.4832725524902344 + }, + { + "auxiliary_loss_clip": 0.01101014, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.01487172, + "balance_loss_mlp": 1.03364956, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.8707164561983298, + "language_loss": 0.80791461, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82919466, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15687, + "time_per_iteration": 2.447073221206665 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.01928234, + "balance_loss_mlp": 1.03497446, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 4.150535052398094, + "language_loss": 0.75942636, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78078878, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 15688, + "time_per_iteration": 2.449028730392456 + }, + { + "auxiliary_loss_clip": 0.01095339, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.01338911, + "balance_loss_mlp": 1.03330553, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.647047447068589, + "language_loss": 0.68151128, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70271206, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.62109375, + "step": 15689, + "time_per_iteration": 2.535231828689575 + }, + { + "auxiliary_loss_clip": 0.01097551, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.02347493, + "balance_loss_mlp": 1.03406012, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.9633082824839947, + "language_loss": 0.80533433, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82664549, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.6328125, + "step": 15690, + "time_per_iteration": 2.475369691848755 + }, + { + "auxiliary_loss_clip": 0.01098715, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.01812971, + "balance_loss_mlp": 1.03422964, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.82445533234153, + "language_loss": 0.60167646, + "learning_rate": 3.348367925792317e-08, + "loss": 0.6229558, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15691, + "time_per_iteration": 2.6009466648101807 + }, + { + "auxiliary_loss_clip": 0.0110339, + "auxiliary_loss_mlp": 0.01027182, + "balance_loss_clip": 1.01595259, + "balance_loss_mlp": 1.03676319, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.6101520183489826, + "language_loss": 0.66512716, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68643284, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15692, + "time_per_iteration": 2.4502696990966797 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.0168364, + "balance_loss_mlp": 1.03405428, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.6572224401023212, + "language_loss": 0.75021255, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77149868, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15693, + "time_per_iteration": 2.433387517929077 + }, + { + "auxiliary_loss_clip": 0.01100061, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01798141, + "balance_loss_mlp": 1.03463674, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.6938865356157475, + "language_loss": 0.72807014, + "learning_rate": 3.327111408156291e-08, + "loss": 0.74936283, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15694, + "time_per_iteration": 2.5123260021209717 + }, + { + "auxiliary_loss_clip": 0.01021837, + "auxiliary_loss_mlp": 0.01003079, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.00187385, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.7028121553364509, + "language_loss": 0.5058524, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52610159, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.19921875, + "step": 15695, + "time_per_iteration": 3.097665786743164 + }, + { + "auxiliary_loss_clip": 0.01094346, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01795948, + "balance_loss_mlp": 1.03261745, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.6978442865454357, + "language_loss": 0.64904177, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67027402, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6171875, + "step": 15696, + "time_per_iteration": 2.499131679534912 + }, + { + "auxiliary_loss_clip": 0.01100812, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.01725507, + "balance_loss_mlp": 1.03525257, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.6440087246701751, + "language_loss": 0.66226554, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68355763, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15697, + "time_per_iteration": 2.471853733062744 + }, + { + "auxiliary_loss_clip": 0.0102159, + "auxiliary_loss_mlp": 0.01000945, + "balance_loss_clip": 0.99992609, + "balance_loss_mlp": 1.00164413, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8437845938587906, + "language_loss": 0.63223118, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65245652, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.19921875, + "step": 15698, + "time_per_iteration": 2.9581832885742188 + }, + { + "auxiliary_loss_clip": 0.01102672, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.03445995, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.6461006250652415, + "language_loss": 0.69387424, + "learning_rate": 3.291833039444092e-08, + "loss": 0.71525228, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 15699, + "time_per_iteration": 2.4698126316070557 + }, + { + "auxiliary_loss_clip": 0.01094807, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.01686084, + "balance_loss_mlp": 1.03219104, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 3.490240775423036, + "language_loss": 0.74452382, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76575059, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 15700, + "time_per_iteration": 2.4451019763946533 + }, + { + "auxiliary_loss_clip": 0.0109842, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.01925063, + "balance_loss_mlp": 1.03485513, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.502855371588381, + "language_loss": 0.69993806, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72122228, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 15701, + "time_per_iteration": 2.447377920150757 + }, + { + "auxiliary_loss_clip": 0.01102808, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.01436985, + "balance_loss_mlp": 1.03356802, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 2.0452971801099764, + "language_loss": 0.77940154, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80068743, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 15702, + "time_per_iteration": 2.4038772583007812 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.01999187, + "balance_loss_mlp": 1.03408504, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.7859029402689504, + "language_loss": 0.66538978, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68670201, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 15703, + "time_per_iteration": 2.489464044570923 + }, + { + "auxiliary_loss_clip": 0.01102883, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01824689, + "balance_loss_mlp": 1.03595018, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 3.744799778583578, + "language_loss": 0.72917163, + "learning_rate": 3.256741150552833e-08, + "loss": 0.7505061, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15704, + "time_per_iteration": 2.5033814907073975 + }, + { + "auxiliary_loss_clip": 0.01098437, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.01902747, + "balance_loss_mlp": 1.03447068, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.907105078977413, + "language_loss": 0.7433669, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76465392, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 15705, + "time_per_iteration": 2.4515786170959473 + }, + { + "auxiliary_loss_clip": 0.01100917, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.02148199, + "balance_loss_mlp": 1.03535068, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 2.196711266527949, + "language_loss": 0.76928145, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79061323, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 15706, + "time_per_iteration": 2.4352962970733643 + }, + { + "auxiliary_loss_clip": 0.01095703, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.01473355, + "balance_loss_mlp": 1.03289199, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.43451723106199, + "language_loss": 0.693344, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71455884, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62890625, + "step": 15707, + "time_per_iteration": 2.453019380569458 + }, + { + "auxiliary_loss_clip": 0.01091425, + "auxiliary_loss_mlp": 0.01024987, + "balance_loss_clip": 1.01502132, + "balance_loss_mlp": 1.02994144, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.7631234566340965, + "language_loss": 0.69443661, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71560073, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.6171875, + "step": 15708, + "time_per_iteration": 2.4164559841156006 + }, + { + "auxiliary_loss_clip": 0.01098199, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01954079, + "balance_loss_mlp": 1.03435826, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.6914619241923896, + "language_loss": 0.70139289, + "learning_rate": 3.221835774749748e-08, + "loss": 0.72268212, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15709, + "time_per_iteration": 2.4482839107513428 + }, + { + "auxiliary_loss_clip": 0.01096914, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.02171016, + "balance_loss_mlp": 1.03418076, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 9.952214688927834, + "language_loss": 0.84433717, + "learning_rate": 3.214877084074774e-08, + "loss": 0.8656342, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62890625, + "step": 15710, + "time_per_iteration": 2.4583065509796143 + }, + { + "auxiliary_loss_clip": 0.01103261, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.01973879, + "balance_loss_mlp": 1.03534627, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.6435224047891799, + "language_loss": 0.71200496, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73335809, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 15711, + "time_per_iteration": 2.4461560249328613 + }, + { + "auxiliary_loss_clip": 0.01101943, + "auxiliary_loss_mlp": 0.01025338, + "balance_loss_clip": 1.01372731, + "balance_loss_mlp": 1.03681183, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.6856508103929682, + "language_loss": 0.69301665, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71428949, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 15712, + "time_per_iteration": 2.557600736618042 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02303076, + "balance_loss_mlp": 1.03638661, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.9247099808601393, + "language_loss": 0.71096003, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73236346, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 15713, + "time_per_iteration": 2.4392521381378174 + }, + { + "auxiliary_loss_clip": 0.01096369, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02093422, + "balance_loss_mlp": 1.03330159, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.5251847757385297, + "language_loss": 0.76915956, + "learning_rate": 3.187116945125212e-08, + "loss": 0.79045224, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6328125, + "step": 15714, + "time_per_iteration": 2.5028111934661865 + }, + { + "auxiliary_loss_clip": 0.01099452, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.02040792, + "balance_loss_mlp": 1.03315997, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.7713922514994944, + "language_loss": 0.67678571, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69810236, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15715, + "time_per_iteration": 2.4388058185577393 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.01737273, + "balance_loss_mlp": 1.03375924, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 3.553477247442109, + "language_loss": 0.7459079, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76720244, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 15716, + "time_per_iteration": 2.463731288909912 + }, + { + "auxiliary_loss_clip": 0.01103368, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.01924956, + "balance_loss_mlp": 1.03722072, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.6549087243556793, + "language_loss": 0.62538469, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64672738, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15717, + "time_per_iteration": 2.4671406745910645 + }, + { + "auxiliary_loss_clip": 0.01099863, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.02062726, + "balance_loss_mlp": 1.03491199, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.6376281628513882, + "language_loss": 0.79284263, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.8141607, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15718, + "time_per_iteration": 2.462629556655884 + }, + { + "auxiliary_loss_clip": 0.01021525, + "auxiliary_loss_mlp": 0.01003439, + "balance_loss_clip": 1.002545, + "balance_loss_mlp": 1.0015378, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.6948733429962052, + "language_loss": 0.578394, + "learning_rate": 3.152584694592719e-08, + "loss": 0.5986436, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.19921875, + "step": 15719, + "time_per_iteration": 3.0780253410339355 + }, + { + "auxiliary_loss_clip": 0.01100233, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.015764, + "balance_loss_mlp": 1.03423667, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.5397760146484176, + "language_loss": 0.75893283, + "learning_rate": 3.145700636861193e-08, + "loss": 0.78020811, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 15720, + "time_per_iteration": 3.864163875579834 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.01787829, + "balance_loss_mlp": 1.03208733, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.6263485050916464, + "language_loss": 0.72628319, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74752611, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 15721, + "time_per_iteration": 2.5096383094787598 + }, + { + "auxiliary_loss_clip": 0.01100377, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.0211221, + "balance_loss_mlp": 1.0353353, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.7722462509073895, + "language_loss": 0.85373968, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87507731, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 15722, + "time_per_iteration": 3.9022328853607178 + }, + { + "auxiliary_loss_clip": 0.01021641, + "auxiliary_loss_mlp": 0.01002369, + "balance_loss_clip": 1.00133801, + "balance_loss_mlp": 1.0015502, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8900631949326635, + "language_loss": 0.64461863, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66485882, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 15723, + "time_per_iteration": 4.41249418258667 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.03444242, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.04231745236359, + "language_loss": 0.73194891, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75326478, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15724, + "time_per_iteration": 2.4040088653564453 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01025226, + "balance_loss_clip": 1.01462817, + "balance_loss_mlp": 1.03418875, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.5881922825982615, + "language_loss": 0.84684968, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86809349, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 15725, + "time_per_iteration": 2.469430446624756 + }, + { + "auxiliary_loss_clip": 0.01098906, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01581264, + "balance_loss_mlp": 1.03359258, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 1.7062685853482866, + "language_loss": 0.71106911, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73232925, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 15726, + "time_per_iteration": 2.4455809593200684 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.0157423, + "balance_loss_mlp": 1.03294992, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.6702126364997434, + "language_loss": 0.60863376, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62989283, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6484375, + "step": 15727, + "time_per_iteration": 3.9425292015075684 + }, + { + "auxiliary_loss_clip": 0.01095273, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.023265, + "balance_loss_mlp": 1.03242397, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.8004377076099485, + "language_loss": 0.81886947, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.8401618, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 15728, + "time_per_iteration": 2.4058585166931152 + }, + { + "auxiliary_loss_clip": 0.01021632, + "auxiliary_loss_mlp": 0.00997147, + "balance_loss_clip": 0.99612808, + "balance_loss_mlp": 1.00159681, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.736051651837185, + "language_loss": 0.59150136, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61168915, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 15729, + "time_per_iteration": 2.9688003063201904 + }, + { + "auxiliary_loss_clip": 0.0109789, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.01618958, + "balance_loss_mlp": 1.03341413, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.8776807928087538, + "language_loss": 0.762703, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78396392, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 15730, + "time_per_iteration": 2.4220995903015137 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.01887226, + "balance_loss_mlp": 1.03324008, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.4182548688654766, + "language_loss": 0.62411594, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64541996, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 15731, + "time_per_iteration": 2.5083041191101074 + }, + { + "auxiliary_loss_clip": 0.01100342, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.01327908, + "balance_loss_mlp": 1.03371453, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 1.9378943529039063, + "language_loss": 0.63918054, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66044056, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 15732, + "time_per_iteration": 2.4719395637512207 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.01500165, + "balance_loss_mlp": 1.03460908, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.7756445337768159, + "language_loss": 0.83968151, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86099535, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 15733, + "time_per_iteration": 2.4443578720092773 + }, + { + "auxiliary_loss_clip": 0.01095213, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.01640725, + "balance_loss_mlp": 1.03294325, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.4695884497585416, + "language_loss": 0.72089154, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74211133, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 15734, + "time_per_iteration": 2.4488422870635986 + }, + { + "auxiliary_loss_clip": 0.01092681, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.0181973, + "balance_loss_mlp": 1.0311662, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.423173253742331, + "language_loss": 0.86974919, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89095521, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.6171875, + "step": 15735, + "time_per_iteration": 2.5103213787078857 + }, + { + "auxiliary_loss_clip": 0.0109908, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.01499617, + "balance_loss_mlp": 1.03345919, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 2.0160825623367975, + "language_loss": 0.67346275, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69471663, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15736, + "time_per_iteration": 2.4414546489715576 + }, + { + "auxiliary_loss_clip": 0.01021593, + "auxiliary_loss_mlp": 0.01002149, + "balance_loss_clip": 1.00120163, + "balance_loss_mlp": 1.0015769, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.9159483735058672, + "language_loss": 0.65298235, + "learning_rate": 3.029813971758499e-08, + "loss": 0.6732198, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20019531, + "step": 15737, + "time_per_iteration": 3.1042568683624268 + }, + { + "auxiliary_loss_clip": 0.01021626, + "auxiliary_loss_mlp": 0.00999988, + "balance_loss_clip": 0.99901086, + "balance_loss_mlp": 1.00169897, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8017129104896167, + "language_loss": 0.58838046, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60859656, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.19921875, + "step": 15738, + "time_per_iteration": 3.052255153656006 + }, + { + "auxiliary_loss_clip": 0.01095699, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.01794255, + "balance_loss_mlp": 1.0324806, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.740585721975819, + "language_loss": 0.71850687, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73974693, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6328125, + "step": 15739, + "time_per_iteration": 2.453784704208374 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.0200305, + "balance_loss_mlp": 1.0342207, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.103403899839581, + "language_loss": 0.64150524, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66281885, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 15740, + "time_per_iteration": 2.516693115234375 + }, + { + "auxiliary_loss_clip": 0.01096961, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01670778, + "balance_loss_mlp": 1.03294837, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.9948304801785786, + "language_loss": 0.66507947, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68632573, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 15741, + "time_per_iteration": 2.483225107192993 + }, + { + "auxiliary_loss_clip": 0.01099414, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.01472712, + "balance_loss_mlp": 1.03407657, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.182631737231146, + "language_loss": 0.75745535, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.7787109, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15742, + "time_per_iteration": 2.426438093185425 + }, + { + "auxiliary_loss_clip": 0.01097162, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.01714623, + "balance_loss_mlp": 1.0337882, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 2.136193371359759, + "language_loss": 0.72182894, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74308008, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15743, + "time_per_iteration": 2.426293134689331 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.0209254, + "balance_loss_mlp": 1.03408003, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 4.529960691980935, + "language_loss": 0.79481554, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81615651, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 15744, + "time_per_iteration": 2.4723949432373047 + }, + { + "auxiliary_loss_clip": 0.01101299, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02148438, + "balance_loss_mlp": 1.0341022, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 3.3674745996062225, + "language_loss": 0.77996051, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80130696, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15745, + "time_per_iteration": 2.47007155418396 + }, + { + "auxiliary_loss_clip": 0.01100036, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.01782942, + "balance_loss_mlp": 1.03299022, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.808023282855586, + "language_loss": 0.69985926, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.7211566, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 15746, + "time_per_iteration": 2.4556610584259033 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01516747, + "balance_loss_mlp": 1.03419673, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 4.04673875503708, + "language_loss": 0.56715882, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58841503, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 15747, + "time_per_iteration": 2.4188010692596436 + }, + { + "auxiliary_loss_clip": 0.01021638, + "auxiliary_loss_mlp": 0.00999103, + "balance_loss_clip": 0.99809551, + "balance_loss_mlp": 1.0015198, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6607046285663145, + "language_loss": 0.53250241, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55270982, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 15748, + "time_per_iteration": 3.210047960281372 + }, + { + "auxiliary_loss_clip": 0.01100624, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.02208531, + "balance_loss_mlp": 1.03541768, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.696117214299738, + "language_loss": 0.66129446, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68263412, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15749, + "time_per_iteration": 2.4717953205108643 + }, + { + "auxiliary_loss_clip": 0.01099175, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.03256774, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 1.870451209534139, + "language_loss": 0.75719225, + "learning_rate": 2.942651169791621e-08, + "loss": 0.77847207, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 15750, + "time_per_iteration": 2.4470083713531494 + }, + { + "auxiliary_loss_clip": 0.01099991, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.01614845, + "balance_loss_mlp": 1.03496587, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 3.2567403535496373, + "language_loss": 0.67666459, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.6979388, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 15751, + "time_per_iteration": 2.4272170066833496 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.01553106, + "balance_loss_mlp": 1.03459108, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 2.233176837277438, + "language_loss": 0.65536374, + "learning_rate": 2.929353580532723e-08, + "loss": 0.6766333, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 15752, + "time_per_iteration": 2.4499189853668213 + }, + { + "auxiliary_loss_clip": 0.01098995, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.0164566, + "balance_loss_mlp": 1.03381038, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.595970237118896, + "language_loss": 0.71663833, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73790824, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 15753, + "time_per_iteration": 2.4530341625213623 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.01540065, + "balance_loss_mlp": 1.03327668, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.7043436636476592, + "language_loss": 0.70336282, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72466588, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.67578125, + "step": 15754, + "time_per_iteration": 2.470735549926758 + }, + { + "auxiliary_loss_clip": 0.01101539, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.01744604, + "balance_loss_mlp": 1.03313601, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.2470655637804695, + "language_loss": 0.78706431, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.80836809, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 15755, + "time_per_iteration": 2.3971338272094727 + }, + { + "auxiliary_loss_clip": 0.01104859, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.01995945, + "balance_loss_mlp": 1.03432608, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.5046918538507037, + "language_loss": 0.75961721, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.78100568, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.703125, + "step": 15756, + "time_per_iteration": 2.426345109939575 + }, + { + "auxiliary_loss_clip": 0.01099898, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01795268, + "balance_loss_mlp": 1.03368378, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 1.9870799305981186, + "language_loss": 0.74695963, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76824564, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 15757, + "time_per_iteration": 2.4045164585113525 + }, + { + "auxiliary_loss_clip": 0.0110339, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.03469872, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.378229571033702, + "language_loss": 0.79555655, + "learning_rate": 2.889640171327512e-08, + "loss": 0.8169049, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 15758, + "time_per_iteration": 2.4659128189086914 + }, + { + "auxiliary_loss_clip": 0.01098428, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.01864612, + "balance_loss_mlp": 1.03468299, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.5067261773590948, + "language_loss": 0.72213107, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74341202, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 15759, + "time_per_iteration": 2.5268497467041016 + }, + { + "auxiliary_loss_clip": 0.01095275, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.01843882, + "balance_loss_mlp": 1.03411698, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.4321356635021014, + "language_loss": 0.75588179, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77711654, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.61328125, + "step": 15760, + "time_per_iteration": 2.4870219230651855 + }, + { + "auxiliary_loss_clip": 0.01099685, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.01837659, + "balance_loss_mlp": 1.03509808, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.7659943637470257, + "language_loss": 0.72967952, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.75097269, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15761, + "time_per_iteration": 2.4621644020080566 + }, + { + "auxiliary_loss_clip": 0.01101443, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02264786, + "balance_loss_mlp": 1.03763127, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.0880931998012926, + "language_loss": 0.71599525, + "learning_rate": 2.863314050734722e-08, + "loss": 0.73734236, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.640625, + "step": 15762, + "time_per_iteration": 3.807072877883911 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02279127, + "balance_loss_mlp": 1.03368092, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 1.8761919798911448, + "language_loss": 0.66871512, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69009507, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 15763, + "time_per_iteration": 2.4327456951141357 + }, + { + "auxiliary_loss_clip": 0.01098893, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.02008581, + "balance_loss_mlp": 1.03249335, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.6535306150383073, + "language_loss": 0.69588113, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.71718317, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15764, + "time_per_iteration": 5.365139722824097 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01689982, + "balance_loss_mlp": 1.03639555, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6364617025382917, + "language_loss": 0.70810807, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.72935009, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.609375, + "step": 15765, + "time_per_iteration": 2.445587158203125 + }, + { + "auxiliary_loss_clip": 0.01021739, + "auxiliary_loss_mlp": 0.00999534, + "balance_loss_clip": 0.99852639, + "balance_loss_mlp": 1.00167453, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8043033372730916, + "language_loss": 0.59102297, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61123562, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 15766, + "time_per_iteration": 2.8118863105773926 + }, + { + "auxiliary_loss_clip": 0.01099537, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.02343154, + "balance_loss_mlp": 1.03409505, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.8210488332704236, + "language_loss": 0.74425805, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76559752, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 15767, + "time_per_iteration": 2.433103561401367 + }, + { + "auxiliary_loss_clip": 0.01103755, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.02080297, + "balance_loss_mlp": 1.03559554, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.1086495442960587, + "language_loss": 0.73338264, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.7547518, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 15768, + "time_per_iteration": 2.4372289180755615 + }, + { + "auxiliary_loss_clip": 0.0102187, + "auxiliary_loss_mlp": 0.01000121, + "balance_loss_clip": 0.9991194, + "balance_loss_mlp": 1.00185442, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7343542147395368, + "language_loss": 0.55284411, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57306397, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 15769, + "time_per_iteration": 4.592373609542847 + }, + { + "auxiliary_loss_clip": 0.01096657, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.01644039, + "balance_loss_mlp": 1.03143764, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.342521680668915, + "language_loss": 0.77534431, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79658937, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15770, + "time_per_iteration": 2.509974956512451 + }, + { + "auxiliary_loss_clip": 0.01103995, + "auxiliary_loss_mlp": 0.01034529, + "balance_loss_clip": 1.02161837, + "balance_loss_mlp": 1.0377177, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 1.7923861457089987, + "language_loss": 0.79980707, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82119232, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 15771, + "time_per_iteration": 2.4819459915161133 + }, + { + "auxiliary_loss_clip": 0.01097776, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.017066, + "balance_loss_mlp": 1.03398848, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 2.453520523449039, + "language_loss": 0.69694543, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71820688, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 15772, + "time_per_iteration": 2.425267219543457 + }, + { + "auxiliary_loss_clip": 0.01099953, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01907134, + "balance_loss_mlp": 1.03535521, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.5802215490409397, + "language_loss": 0.73707336, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75838113, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 15773, + "time_per_iteration": 2.444343328475952 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.02117205, + "balance_loss_mlp": 1.03330648, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.131794605985836, + "language_loss": 0.62298661, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64432591, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 15774, + "time_per_iteration": 2.4462358951568604 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02020681, + "balance_loss_mlp": 1.03321493, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 2.09594864312592, + "language_loss": 0.58812392, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.60944426, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 15775, + "time_per_iteration": 2.4414901733398438 + }, + { + "auxiliary_loss_clip": 0.01102102, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.01669836, + "balance_loss_mlp": 1.03493381, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.8455531404536583, + "language_loss": 0.61595821, + "learning_rate": 2.772114638584555e-08, + "loss": 0.63726979, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 15776, + "time_per_iteration": 2.5827388763427734 + }, + { + "auxiliary_loss_clip": 0.01098673, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.01866937, + "balance_loss_mlp": 1.03275156, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.7272218804811466, + "language_loss": 0.73529625, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75658965, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 15777, + "time_per_iteration": 2.46150279045105 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.02140033, + "balance_loss_mlp": 1.03862464, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.8233320899962435, + "language_loss": 0.72577089, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74722815, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 15778, + "time_per_iteration": 2.467472553253174 + }, + { + "auxiliary_loss_clip": 0.01094771, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.01670969, + "balance_loss_mlp": 1.0343349, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.8271409648319303, + "language_loss": 0.69787717, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71909094, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.6015625, + "step": 15779, + "time_per_iteration": 2.4266300201416016 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01880431, + "balance_loss_mlp": 1.03467202, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 2.8607336238305794, + "language_loss": 0.78267539, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80398858, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 15780, + "time_per_iteration": 2.446392774581909 + }, + { + "auxiliary_loss_clip": 0.01101438, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.01809514, + "balance_loss_mlp": 1.03621566, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.7816825422070612, + "language_loss": 0.66119897, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68250966, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 15781, + "time_per_iteration": 2.4368371963500977 + }, + { + "auxiliary_loss_clip": 0.01097989, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01941085, + "balance_loss_mlp": 1.03366685, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.0866286325402306, + "language_loss": 0.7938571, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81514817, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 15782, + "time_per_iteration": 2.4351727962493896 + }, + { + "auxiliary_loss_clip": 0.01021458, + "auxiliary_loss_mlp": 0.00998812, + "balance_loss_clip": 0.99779856, + "balance_loss_mlp": 1.00149429, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.725186072749968, + "language_loss": 0.59841406, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61861676, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 15783, + "time_per_iteration": 3.145355224609375 + }, + { + "auxiliary_loss_clip": 0.01099804, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.0206039, + "balance_loss_mlp": 1.03343678, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.6893787149575912, + "language_loss": 0.74055898, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76188403, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15784, + "time_per_iteration": 2.4882519245147705 + }, + { + "auxiliary_loss_clip": 0.01098838, + "auxiliary_loss_mlp": 0.01027468, + "balance_loss_clip": 1.01474261, + "balance_loss_mlp": 1.03261077, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 2.40873132613553, + "language_loss": 0.69824833, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71951145, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 15785, + "time_per_iteration": 2.4379212856292725 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.01652157, + "balance_loss_mlp": 1.03367662, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.5884098203702628, + "language_loss": 0.75856775, + "learning_rate": 2.707869629830495e-08, + "loss": 0.77985299, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 15786, + "time_per_iteration": 2.4647655487060547 + }, + { + "auxiliary_loss_clip": 0.01099885, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.01817656, + "balance_loss_mlp": 1.03462839, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.6402496308438652, + "language_loss": 0.78891599, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81020397, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 15787, + "time_per_iteration": 2.463150978088379 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01666558, + "balance_loss_mlp": 1.03643632, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.6263586462249067, + "language_loss": 0.76067448, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78193933, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 15788, + "time_per_iteration": 2.4509265422821045 + }, + { + "auxiliary_loss_clip": 0.01100348, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.01915097, + "balance_loss_mlp": 1.03344178, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 2.0839053015801476, + "language_loss": 0.71524441, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73656362, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 15789, + "time_per_iteration": 2.432985544204712 + }, + { + "auxiliary_loss_clip": 0.0109756, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.01589549, + "balance_loss_mlp": 1.03416276, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 2.063727201959523, + "language_loss": 0.73046041, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75171626, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6328125, + "step": 15790, + "time_per_iteration": 2.4657516479492188 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.01891565, + "balance_loss_mlp": 1.03377855, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.7938510674280357, + "language_loss": 0.77490807, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79623151, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 15791, + "time_per_iteration": 2.436913013458252 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.0160315, + "balance_loss_mlp": 1.03452277, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 1.8010482249748228, + "language_loss": 0.73511958, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75643158, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 15792, + "time_per_iteration": 2.5013904571533203 + }, + { + "auxiliary_loss_clip": 0.0109955, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.02072704, + "balance_loss_mlp": 1.03471923, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.858360072617374, + "language_loss": 0.78069293, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80200887, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15793, + "time_per_iteration": 2.4309065341949463 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01028746, + "balance_loss_clip": 1.01784468, + "balance_loss_mlp": 1.03409159, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.6667153863215733, + "language_loss": 0.77353388, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79481339, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15794, + "time_per_iteration": 2.4753899574279785 + }, + { + "auxiliary_loss_clip": 0.01102667, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.01900196, + "balance_loss_mlp": 1.03453398, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.9706914699233502, + "language_loss": 0.60769325, + "learning_rate": 2.650688769211107e-08, + "loss": 0.62903881, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 15795, + "time_per_iteration": 2.4098758697509766 + }, + { + "auxiliary_loss_clip": 0.01097281, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.02076006, + "balance_loss_mlp": 1.03450537, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6244448011780146, + "language_loss": 0.79229355, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81359136, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 15796, + "time_per_iteration": 2.4744715690612793 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.01453948, + "balance_loss_mlp": 1.03425527, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 2.1269036660223186, + "language_loss": 0.75475836, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77602082, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 15797, + "time_per_iteration": 2.4281978607177734 + }, + { + "auxiliary_loss_clip": 0.01100771, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.0194509, + "balance_loss_mlp": 1.03495431, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 5.214544570088492, + "language_loss": 0.6590659, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.68038285, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 15798, + "time_per_iteration": 2.4303300380706787 + }, + { + "auxiliary_loss_clip": 0.01102492, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.02094197, + "balance_loss_mlp": 1.0353173, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.7260335815330186, + "language_loss": 0.7747848, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79613221, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 15799, + "time_per_iteration": 2.4504878520965576 + }, + { + "auxiliary_loss_clip": 0.01098618, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01981449, + "balance_loss_mlp": 1.03534567, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 1.758779888402255, + "language_loss": 0.70793021, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.72922921, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15800, + "time_per_iteration": 2.4800631999969482 + }, + { + "auxiliary_loss_clip": 0.01095391, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01717317, + "balance_loss_mlp": 1.03154349, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.741889328340764, + "language_loss": 0.71796048, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73920196, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.63671875, + "step": 15801, + "time_per_iteration": 2.4760396480560303 + }, + { + "auxiliary_loss_clip": 0.01100868, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.02095401, + "balance_loss_mlp": 1.03574038, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.525596008392139, + "language_loss": 0.8088901, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83022046, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15802, + "time_per_iteration": 2.5023224353790283 + }, + { + "auxiliary_loss_clip": 0.01102144, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01719987, + "balance_loss_mlp": 1.03577518, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.5961562305678088, + "language_loss": 0.67818773, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69949591, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15803, + "time_per_iteration": 3.909879684448242 + }, + { + "auxiliary_loss_clip": 0.01098126, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.0200932, + "balance_loss_mlp": 1.03395629, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.7025824496065405, + "language_loss": 0.76297027, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.784266, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 15804, + "time_per_iteration": 2.43643856048584 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.020015, + "balance_loss_mlp": 1.03606462, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.6211668044626601, + "language_loss": 0.73356307, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75490659, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15805, + "time_per_iteration": 3.8646578788757324 + }, + { + "auxiliary_loss_clip": 0.01101239, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02232194, + "balance_loss_mlp": 1.03600073, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 2.0152125986211598, + "language_loss": 0.80254024, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82389206, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 15806, + "time_per_iteration": 4.032423257827759 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01026855, + "balance_loss_clip": 1.01560807, + "balance_loss_mlp": 1.03544569, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.193286707527827, + "language_loss": 0.82814157, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.8494401, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 15807, + "time_per_iteration": 2.414118528366089 + }, + { + "auxiliary_loss_clip": 0.01097126, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.01739097, + "balance_loss_mlp": 1.03243184, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.7349067366850919, + "language_loss": 0.71784639, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73910964, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 15808, + "time_per_iteration": 2.5013911724090576 + }, + { + "auxiliary_loss_clip": 0.01098372, + "auxiliary_loss_mlp": 0.01026538, + "balance_loss_clip": 1.01529086, + "balance_loss_mlp": 1.03383148, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.4451667081622699, + "language_loss": 0.69974124, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72099042, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15809, + "time_per_iteration": 2.460293769836426 + }, + { + "auxiliary_loss_clip": 0.01097419, + "auxiliary_loss_mlp": 0.01025961, + "balance_loss_clip": 1.01474309, + "balance_loss_mlp": 1.03248215, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.651959109851631, + "language_loss": 0.75416887, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77540267, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15810, + "time_per_iteration": 2.5347001552581787 + }, + { + "auxiliary_loss_clip": 0.01097724, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.02580118, + "balance_loss_mlp": 1.03223252, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.397807601359488, + "language_loss": 0.79862857, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.81998384, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15811, + "time_per_iteration": 3.979959487915039 + }, + { + "auxiliary_loss_clip": 0.01099426, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.01915371, + "balance_loss_mlp": 1.03354287, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 1.9697509553597836, + "language_loss": 0.70062947, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72193033, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 15812, + "time_per_iteration": 2.521512508392334 + }, + { + "auxiliary_loss_clip": 0.0110006, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01730156, + "balance_loss_mlp": 1.03445292, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.4891615410905001, + "language_loss": 0.65331221, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67459756, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 15813, + "time_per_iteration": 2.4236130714416504 + }, + { + "auxiliary_loss_clip": 0.01101884, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.02317691, + "balance_loss_mlp": 1.03430367, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.3402778954569576, + "language_loss": 0.7040152, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72538567, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 15814, + "time_per_iteration": 2.5455849170684814 + }, + { + "auxiliary_loss_clip": 0.01096469, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01623535, + "balance_loss_mlp": 1.03357434, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 2.0743353797115094, + "language_loss": 0.62986439, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65110284, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 15815, + "time_per_iteration": 2.460432529449463 + }, + { + "auxiliary_loss_clip": 0.01097239, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.01574445, + "balance_loss_mlp": 1.03236914, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.9387627349978607, + "language_loss": 0.5886873, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60992765, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 15816, + "time_per_iteration": 2.5585241317749023 + }, + { + "auxiliary_loss_clip": 0.01098859, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.02797484, + "balance_loss_mlp": 1.03374326, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.3857841520956902, + "language_loss": 0.73455548, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75593543, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 15817, + "time_per_iteration": 2.4631595611572266 + }, + { + "auxiliary_loss_clip": 0.01101933, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.02061391, + "balance_loss_mlp": 1.03445303, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.6087638355681797, + "language_loss": 0.59922737, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62058777, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.67578125, + "step": 15818, + "time_per_iteration": 2.4381842613220215 + }, + { + "auxiliary_loss_clip": 0.01100649, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.03539026, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.7432779059279067, + "language_loss": 0.69244868, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71375006, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 15819, + "time_per_iteration": 2.462388038635254 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.01690078, + "balance_loss_mlp": 1.03555536, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.8294910897534251, + "language_loss": 0.74143231, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76274723, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 15820, + "time_per_iteration": 2.4434814453125 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.02099824, + "balance_loss_mlp": 1.0352037, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.7026634144017363, + "language_loss": 0.78670204, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80805308, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15821, + "time_per_iteration": 2.4224560260772705 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.01455593, + "balance_loss_mlp": 1.03292096, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.5339289826116789, + "language_loss": 0.71220911, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73344707, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 15822, + "time_per_iteration": 2.5735907554626465 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.02049828, + "balance_loss_mlp": 1.03538275, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.8073194694188124, + "language_loss": 0.66159809, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68291861, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15823, + "time_per_iteration": 2.465412139892578 + }, + { + "auxiliary_loss_clip": 0.01097407, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.01931834, + "balance_loss_mlp": 1.03426194, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.5566121914996327, + "language_loss": 0.76921892, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79049319, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15824, + "time_per_iteration": 2.4632158279418945 + }, + { + "auxiliary_loss_clip": 0.01103042, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.01674688, + "balance_loss_mlp": 1.03397322, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 1.8863003514793029, + "language_loss": 0.73595691, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75727856, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 15825, + "time_per_iteration": 2.474846363067627 + }, + { + "auxiliary_loss_clip": 0.01021578, + "auxiliary_loss_mlp": 0.0100021, + "balance_loss_clip": 0.99922067, + "balance_loss_mlp": 1.00160623, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8541641407387539, + "language_loss": 0.53453624, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55475414, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20019531, + "step": 15826, + "time_per_iteration": 2.9626259803771973 + }, + { + "auxiliary_loss_clip": 0.01100498, + "auxiliary_loss_mlp": 0.01035518, + "balance_loss_clip": 1.02391326, + "balance_loss_mlp": 1.03486824, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 2.681001653375095, + "language_loss": 0.72440886, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74576902, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 15827, + "time_per_iteration": 2.4679887294769287 + }, + { + "auxiliary_loss_clip": 0.01099346, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01561654, + "balance_loss_mlp": 1.03423381, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.9123918048218376, + "language_loss": 0.74679339, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76806182, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 15828, + "time_per_iteration": 2.444719076156616 + }, + { + "auxiliary_loss_clip": 0.01098432, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.02086651, + "balance_loss_mlp": 1.03533959, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.4979200700178503, + "language_loss": 0.7287569, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75005829, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15829, + "time_per_iteration": 2.4894516468048096 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01026804, + "balance_loss_clip": 1.01606297, + "balance_loss_mlp": 1.034266, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.765032292908151, + "language_loss": 0.6078254, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.62908912, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 15830, + "time_per_iteration": 2.4441115856170654 + }, + { + "auxiliary_loss_clip": 0.01102008, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.0181613, + "balance_loss_mlp": 1.03501642, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 2.121238764010395, + "language_loss": 0.73090142, + "learning_rate": 2.428028693179729e-08, + "loss": 0.75222731, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 15831, + "time_per_iteration": 2.4257888793945312 + }, + { + "auxiliary_loss_clip": 0.01095715, + "auxiliary_loss_mlp": 0.01023011, + "balance_loss_clip": 1.01229966, + "balance_loss_mlp": 1.03229094, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.8239612191411185, + "language_loss": 0.65346098, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67464817, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15832, + "time_per_iteration": 2.4187471866607666 + }, + { + "auxiliary_loss_clip": 0.01099237, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.01906586, + "balance_loss_mlp": 1.03727329, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 2.0408885296052803, + "language_loss": 0.77953559, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80082679, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 15833, + "time_per_iteration": 2.434434413909912 + }, + { + "auxiliary_loss_clip": 0.01097433, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.0198164, + "balance_loss_mlp": 1.03424346, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.0874053061362146, + "language_loss": 0.74132979, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.7626133, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 15834, + "time_per_iteration": 2.4627585411071777 + }, + { + "auxiliary_loss_clip": 0.01104375, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.02034688, + "balance_loss_mlp": 1.0350616, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.998991881803621, + "language_loss": 0.76126343, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78263634, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 15835, + "time_per_iteration": 2.4405393600463867 + }, + { + "auxiliary_loss_clip": 0.01100091, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02003992, + "balance_loss_mlp": 1.03354049, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.8271796682870614, + "language_loss": 0.65903687, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68036139, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 15836, + "time_per_iteration": 2.449270248413086 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01024877, + "balance_loss_clip": 1.01343238, + "balance_loss_mlp": 1.03359866, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.6042939226932975, + "language_loss": 0.70522273, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72644454, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 15837, + "time_per_iteration": 2.485703468322754 + }, + { + "auxiliary_loss_clip": 0.01100856, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.03319621, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 1.769143781079073, + "language_loss": 0.73489517, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75620466, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 15838, + "time_per_iteration": 2.5037167072296143 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01803803, + "balance_loss_mlp": 1.03289866, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.9893727922388904, + "language_loss": 0.78339815, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80468726, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 15839, + "time_per_iteration": 2.4681766033172607 + }, + { + "auxiliary_loss_clip": 0.01101243, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.0213666, + "balance_loss_mlp": 1.03504133, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.4720319477602668, + "language_loss": 0.80227256, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82361627, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 15840, + "time_per_iteration": 2.4581902027130127 + }, + { + "auxiliary_loss_clip": 0.01093038, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.01571536, + "balance_loss_mlp": 1.03195643, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 1.9542207241379934, + "language_loss": 0.72871137, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.74989724, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.609375, + "step": 15841, + "time_per_iteration": 2.465167760848999 + }, + { + "auxiliary_loss_clip": 0.01093774, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.01342833, + "balance_loss_mlp": 1.03309047, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.8427976481059472, + "language_loss": 0.78926313, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81044149, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 15842, + "time_per_iteration": 2.4842257499694824 + }, + { + "auxiliary_loss_clip": 0.01101716, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.02011764, + "balance_loss_mlp": 1.03695965, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 1.8219900566748215, + "language_loss": 0.72275579, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74409097, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 15843, + "time_per_iteration": 2.4471869468688965 + }, + { + "auxiliary_loss_clip": 0.01101194, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.01852155, + "balance_loss_mlp": 1.03493142, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.6296759970994528, + "language_loss": 0.78324318, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80456167, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 15844, + "time_per_iteration": 2.4472808837890625 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.01777697, + "balance_loss_mlp": 1.03366756, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 1.8471796733890804, + "language_loss": 0.69943261, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72077781, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 15845, + "time_per_iteration": 3.8085498809814453 + }, + { + "auxiliary_loss_clip": 0.01101573, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.02024341, + "balance_loss_mlp": 1.03391755, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.6011167896613763, + "language_loss": 0.75642556, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77775645, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 15846, + "time_per_iteration": 2.473083019256592 + }, + { + "auxiliary_loss_clip": 0.01100923, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.01582253, + "balance_loss_mlp": 1.03413308, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.7664774219702817, + "language_loss": 0.78162938, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80291218, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 15847, + "time_per_iteration": 3.8917007446289062 + }, + { + "auxiliary_loss_clip": 0.01096033, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.02181149, + "balance_loss_mlp": 1.03224957, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.7206750490584977, + "language_loss": 0.77701223, + "learning_rate": 2.326258115328672e-08, + "loss": 0.7982986, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 15848, + "time_per_iteration": 3.823594093322754 + }, + { + "auxiliary_loss_clip": 0.01104682, + "auxiliary_loss_mlp": 0.01039029, + "balance_loss_clip": 1.02638733, + "balance_loss_mlp": 1.03632021, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.6696974182974789, + "language_loss": 0.72178817, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74322522, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 15849, + "time_per_iteration": 2.4846818447113037 + }, + { + "auxiliary_loss_clip": 0.01107242, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.03735495, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.6245014301779637, + "language_loss": 0.75090873, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77232051, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 15850, + "time_per_iteration": 2.4306790828704834 + }, + { + "auxiliary_loss_clip": 0.01099677, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01789474, + "balance_loss_mlp": 1.03360927, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.6338381558530766, + "language_loss": 0.72366798, + "learning_rate": 2.308523444215482e-08, + "loss": 0.7449522, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 15851, + "time_per_iteration": 2.4408111572265625 + }, + { + "auxiliary_loss_clip": 0.01097199, + "auxiliary_loss_mlp": 0.01025055, + "balance_loss_clip": 1.01375353, + "balance_loss_mlp": 1.03315783, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 2.0504097549488027, + "language_loss": 0.7981447, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81936717, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 15852, + "time_per_iteration": 2.4637861251831055 + }, + { + "auxiliary_loss_clip": 0.01099896, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.02134967, + "balance_loss_mlp": 1.03417897, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.7073175594849501, + "language_loss": 0.59777415, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61910605, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15853, + "time_per_iteration": 4.21125054359436 + }, + { + "auxiliary_loss_clip": 0.01095024, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.01688933, + "balance_loss_mlp": 1.03267741, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.8692728037781963, + "language_loss": 0.7304824, + "learning_rate": 2.290856241425998e-08, + "loss": 0.75170606, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.625, + "step": 15854, + "time_per_iteration": 2.432305097579956 + }, + { + "auxiliary_loss_clip": 0.01098391, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.01780701, + "balance_loss_mlp": 1.03201079, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.4146000582030047, + "language_loss": 0.67618144, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69745457, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 15855, + "time_per_iteration": 2.4794416427612305 + }, + { + "auxiliary_loss_clip": 0.01098939, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.0162859, + "balance_loss_mlp": 1.03353429, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.5011674711012832, + "language_loss": 0.76639926, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78766036, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15856, + "time_per_iteration": 2.4852230548858643 + }, + { + "auxiliary_loss_clip": 0.01097507, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.01927257, + "balance_loss_mlp": 1.03294313, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.6032019491566774, + "language_loss": 0.77757066, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.79884511, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15857, + "time_per_iteration": 2.4635865688323975 + }, + { + "auxiliary_loss_clip": 0.01021553, + "auxiliary_loss_mlp": 0.01002209, + "balance_loss_clip": 1.00120187, + "balance_loss_mlp": 1.00156283, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.704565405960459, + "language_loss": 0.62570769, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64594531, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 15858, + "time_per_iteration": 3.001497268676758 + }, + { + "auxiliary_loss_clip": 0.01097781, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.01493001, + "balance_loss_mlp": 1.03351498, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.454205662994463, + "language_loss": 0.56674993, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58798563, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 15859, + "time_per_iteration": 2.518068313598633 + }, + { + "auxiliary_loss_clip": 0.01094952, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.01677203, + "balance_loss_mlp": 1.03268635, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.021746638389019, + "language_loss": 0.81863093, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.83985353, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 15860, + "time_per_iteration": 2.493851900100708 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01024861, + "balance_loss_clip": 1.0143348, + "balance_loss_mlp": 1.03253686, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.7693188133463755, + "language_loss": 0.66683793, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68806648, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 15861, + "time_per_iteration": 2.4849302768707275 + }, + { + "auxiliary_loss_clip": 0.01100217, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.02243447, + "balance_loss_mlp": 1.03482676, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 2.2855998410592417, + "language_loss": 0.65861797, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67996109, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 15862, + "time_per_iteration": 2.5085601806640625 + }, + { + "auxiliary_loss_clip": 0.01098858, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.01651764, + "balance_loss_mlp": 1.03623557, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.4470053207480973, + "language_loss": 0.6742301, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69549167, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.625, + "step": 15863, + "time_per_iteration": 2.4261577129364014 + }, + { + "auxiliary_loss_clip": 0.01099259, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.01673734, + "balance_loss_mlp": 1.03436029, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 1.814028155072979, + "language_loss": 0.7815752, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80284977, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15864, + "time_per_iteration": 2.530269145965576 + }, + { + "auxiliary_loss_clip": 0.01097838, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.01526916, + "balance_loss_mlp": 1.03393769, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 1.8811083442124992, + "language_loss": 0.5989036, + "learning_rate": 2.226653824047586e-08, + "loss": 0.62014639, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15865, + "time_per_iteration": 2.4256134033203125 + }, + { + "auxiliary_loss_clip": 0.01098516, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.0161171, + "balance_loss_mlp": 1.0329715, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.8653100688509543, + "language_loss": 0.69772661, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.71898365, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 15866, + "time_per_iteration": 2.43904447555542 + }, + { + "auxiliary_loss_clip": 0.01099512, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.01943624, + "balance_loss_mlp": 1.0335021, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.271920105664109, + "language_loss": 0.84857428, + "learning_rate": 2.215078143255855e-08, + "loss": 0.86988258, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 15867, + "time_per_iteration": 2.460845708847046 + }, + { + "auxiliary_loss_clip": 0.01021587, + "auxiliary_loss_mlp": 0.00999883, + "balance_loss_clip": 0.9989357, + "balance_loss_mlp": 1.00168824, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7526563387722108, + "language_loss": 0.61838603, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63860077, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.19921875, + "step": 15868, + "time_per_iteration": 3.0988385677337646 + }, + { + "auxiliary_loss_clip": 0.01099704, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.01490152, + "balance_loss_mlp": 1.03455853, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.7170259212083214, + "language_loss": 0.60134614, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62261331, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 15869, + "time_per_iteration": 2.4509079456329346 + }, + { + "auxiliary_loss_clip": 0.01099414, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.02160573, + "balance_loss_mlp": 1.03469789, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.7269587427502697, + "language_loss": 0.70540398, + "learning_rate": 2.197770872795579e-08, + "loss": 0.72671461, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6484375, + "step": 15870, + "time_per_iteration": 2.494284152984619 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.01579773, + "balance_loss_mlp": 1.03193331, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 2.5278290831513313, + "language_loss": 0.76707828, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78831249, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 15871, + "time_per_iteration": 2.495964527130127 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.01648521, + "balance_loss_mlp": 1.03446722, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 1.7535486260785396, + "language_loss": 0.58022785, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60152018, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 15872, + "time_per_iteration": 2.5454163551330566 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.01681566, + "balance_loss_mlp": 1.03452253, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.4591829449069909, + "language_loss": 0.74832845, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76965249, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 15873, + "time_per_iteration": 2.4488136768341064 + }, + { + "auxiliary_loss_clip": 0.01101317, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.01900995, + "balance_loss_mlp": 1.03453755, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.727737631306832, + "language_loss": 0.62304831, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64437991, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 15874, + "time_per_iteration": 2.4801042079925537 + }, + { + "auxiliary_loss_clip": 0.01098124, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.02129853, + "balance_loss_mlp": 1.03405643, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 2.237267296362062, + "language_loss": 0.89501953, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91632992, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 15875, + "time_per_iteration": 2.391258716583252 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.03519917, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.930514194430758, + "language_loss": 0.67863441, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69999236, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 15876, + "time_per_iteration": 2.6414058208465576 + }, + { + "auxiliary_loss_clip": 0.01101042, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.01681304, + "balance_loss_mlp": 1.0339992, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.7708559487688424, + "language_loss": 0.6911338, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.7124334, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 15877, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.01102036, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.01631999, + "balance_loss_mlp": 1.03537035, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.672159994427175, + "language_loss": 0.70852697, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.72983038, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 15878, + "time_per_iteration": 2.472141742706299 + }, + { + "auxiliary_loss_clip": 0.01096131, + "auxiliary_loss_mlp": 0.01026374, + "balance_loss_clip": 1.01498389, + "balance_loss_mlp": 1.03232455, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.3589761302170789, + "language_loss": 0.68371421, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70493931, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 15879, + "time_per_iteration": 2.4805357456207275 + }, + { + "auxiliary_loss_clip": 0.01097447, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.01687622, + "balance_loss_mlp": 1.0332495, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 1.9054124900066427, + "language_loss": 0.84860075, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.86985421, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15880, + "time_per_iteration": 2.5021228790283203 + }, + { + "auxiliary_loss_clip": 0.01099262, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.0179683, + "balance_loss_mlp": 1.0333271, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.7287846320179276, + "language_loss": 0.71575916, + "learning_rate": 2.134888478151753e-08, + "loss": 0.73705256, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 15881, + "time_per_iteration": 2.5729317665100098 + }, + { + "auxiliary_loss_clip": 0.01099118, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.02368426, + "balance_loss_mlp": 1.03515661, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.8203154025696195, + "language_loss": 0.71242815, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.7337721, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 15882, + "time_per_iteration": 2.4185373783111572 + }, + { + "auxiliary_loss_clip": 0.01100013, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.01927757, + "balance_loss_mlp": 1.03457165, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 2.0067236869483644, + "language_loss": 0.66055608, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68186224, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15883, + "time_per_iteration": 2.812549114227295 + }, + { + "auxiliary_loss_clip": 0.0110374, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.01572978, + "balance_loss_mlp": 1.03677058, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.8838417421243645, + "language_loss": 0.7817893, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80310816, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 15884, + "time_per_iteration": 2.4394302368164062 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01025185, + "balance_loss_clip": 1.01315713, + "balance_loss_mlp": 1.03389931, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 5.22444846040725, + "language_loss": 0.776416, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.79767764, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15885, + "time_per_iteration": 2.4130163192749023 + }, + { + "auxiliary_loss_clip": 0.01099455, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.01905656, + "balance_loss_mlp": 1.03313184, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.971891405607794, + "language_loss": 0.69846129, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.71975875, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15886, + "time_per_iteration": 2.4465065002441406 + }, + { + "auxiliary_loss_clip": 0.01103927, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.01940477, + "balance_loss_mlp": 1.03575993, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.7290024537631783, + "language_loss": 0.72445035, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.7458111, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 15887, + "time_per_iteration": 3.847572088241577 + }, + { + "auxiliary_loss_clip": 0.01095349, + "auxiliary_loss_mlp": 0.01026638, + "balance_loss_clip": 1.01572418, + "balance_loss_mlp": 1.0323261, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 2.2429397817873085, + "language_loss": 0.56737578, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.58859569, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 15888, + "time_per_iteration": 2.4127004146575928 + }, + { + "auxiliary_loss_clip": 0.0102134, + "auxiliary_loss_mlp": 0.01002702, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00141358, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7089096744564684, + "language_loss": 0.57814407, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.5983845, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 15889, + "time_per_iteration": 5.906970977783203 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.01538062, + "balance_loss_mlp": 1.03259969, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.3242300073138324, + "language_loss": 0.66891074, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69018579, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 15890, + "time_per_iteration": 2.4627249240875244 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01028335, + "balance_loss_clip": 1.01665831, + "balance_loss_mlp": 1.03478599, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.4510635562982561, + "language_loss": 0.74006915, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76135355, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 15891, + "time_per_iteration": 2.484015941619873 + }, + { + "auxiliary_loss_clip": 0.01095435, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.01869869, + "balance_loss_mlp": 1.03344524, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 2.2414416282964824, + "language_loss": 0.77894902, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80019307, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6171875, + "step": 15892, + "time_per_iteration": 2.4298534393310547 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.01835918, + "balance_loss_mlp": 1.03325903, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.4859618601332218, + "language_loss": 0.69746578, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.71874869, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 15893, + "time_per_iteration": 2.4958953857421875 + }, + { + "auxiliary_loss_clip": 0.01100591, + "auxiliary_loss_mlp": 0.01030848, + "balance_loss_clip": 1.01775336, + "balance_loss_mlp": 1.0359931, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 1.9274290043089441, + "language_loss": 0.65745211, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67876649, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6484375, + "step": 15894, + "time_per_iteration": 4.018404960632324 + }, + { + "auxiliary_loss_clip": 0.01101163, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.01734471, + "balance_loss_mlp": 1.03446078, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.9622102153443857, + "language_loss": 0.81861794, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83992541, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15895, + "time_per_iteration": 2.4529037475585938 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.01922894, + "balance_loss_mlp": 1.0347774, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 3.12144499892649, + "language_loss": 0.72422135, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74553216, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 15896, + "time_per_iteration": 2.531371831893921 + }, + { + "auxiliary_loss_clip": 0.01097836, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.01624894, + "balance_loss_mlp": 1.03318739, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.891257637063241, + "language_loss": 0.79660171, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81785691, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15897, + "time_per_iteration": 2.423757791519165 + }, + { + "auxiliary_loss_clip": 0.01098206, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.02033651, + "balance_loss_mlp": 1.03225029, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.6030681434016965, + "language_loss": 0.72389764, + "learning_rate": 2.03949242614303e-08, + "loss": 0.7452035, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 15898, + "time_per_iteration": 2.4686954021453857 + }, + { + "auxiliary_loss_clip": 0.01021576, + "auxiliary_loss_mlp": 0.0099842, + "balance_loss_clip": 0.9974727, + "balance_loss_mlp": 1.00169182, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8420099231602695, + "language_loss": 0.52358627, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54378629, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.19921875, + "step": 15899, + "time_per_iteration": 3.0036983489990234 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.0179863, + "balance_loss_mlp": 1.03525329, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.249257849936427, + "language_loss": 0.68539107, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70674157, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 15900, + "time_per_iteration": 2.4041638374328613 + }, + { + "auxiliary_loss_clip": 0.01100192, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01519287, + "balance_loss_mlp": 1.03403616, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.560017279471282, + "language_loss": 0.82855231, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.849823, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 15901, + "time_per_iteration": 2.488877534866333 + }, + { + "auxiliary_loss_clip": 0.01021779, + "auxiliary_loss_mlp": 0.0100249, + "balance_loss_clip": 1.00149441, + "balance_loss_mlp": 1.00174415, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7081018961368435, + "language_loss": 0.54319799, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56344068, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20117188, + "step": 15902, + "time_per_iteration": 3.1055378913879395 + }, + { + "auxiliary_loss_clip": 0.01093723, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01904833, + "balance_loss_mlp": 1.03346229, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.5555768909196643, + "language_loss": 0.85443425, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87565827, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 0.09619141, + "router_z_loss_mlp": 0.60546875, + "step": 15903, + "time_per_iteration": 2.4546353816986084 + }, + { + "auxiliary_loss_clip": 0.01097244, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.01644611, + "balance_loss_mlp": 1.03315914, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 1.9837154441799645, + "language_loss": 0.80416489, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82540631, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15904, + "time_per_iteration": 2.410019874572754 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01825547, + "balance_loss_mlp": 1.03522384, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.367778122852727, + "language_loss": 0.6043731, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62569761, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15905, + "time_per_iteration": 2.484440803527832 + }, + { + "auxiliary_loss_clip": 0.01097638, + "auxiliary_loss_mlp": 0.0102839, + "balance_loss_clip": 1.01712513, + "balance_loss_mlp": 1.0328846, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.9836519503290855, + "language_loss": 0.69943386, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72069418, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 15906, + "time_per_iteration": 2.436502456665039 + }, + { + "auxiliary_loss_clip": 0.01103642, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.01620412, + "balance_loss_mlp": 1.03666723, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.5331239973913557, + "language_loss": 0.7067498, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.72806728, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15907, + "time_per_iteration": 2.43973445892334 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.03331208, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.8625996431981158, + "language_loss": 0.7003063, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72156978, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 15908, + "time_per_iteration": 2.477755308151245 + }, + { + "auxiliary_loss_clip": 0.01100131, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.01866722, + "balance_loss_mlp": 1.03570485, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 4.607070324323302, + "language_loss": 0.83111703, + "learning_rate": 1.978921532427802e-08, + "loss": 0.85241747, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 15909, + "time_per_iteration": 2.4829578399658203 + }, + { + "auxiliary_loss_clip": 0.01098212, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.0328548, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 2.2246333809539465, + "language_loss": 0.6721313, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69342375, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 15910, + "time_per_iteration": 2.4749066829681396 + }, + { + "auxiliary_loss_clip": 0.01102469, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.02111912, + "balance_loss_mlp": 1.03529978, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.6112632328082404, + "language_loss": 0.74234146, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76369315, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 15911, + "time_per_iteration": 2.4732449054718018 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.010274, + "balance_loss_clip": 1.01601529, + "balance_loss_mlp": 1.03259337, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.080099232375226, + "language_loss": 0.69968218, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.72094059, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 15912, + "time_per_iteration": 2.4121153354644775 + }, + { + "auxiliary_loss_clip": 0.01100916, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02324605, + "balance_loss_mlp": 1.034904, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 4.337452858375998, + "language_loss": 0.7253468, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74670494, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 15913, + "time_per_iteration": 2.4348838329315186 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01477885, + "balance_loss_mlp": 1.03494883, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 1.9142325578386978, + "language_loss": 0.73507404, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75632489, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15914, + "time_per_iteration": 2.4489924907684326 + }, + { + "auxiliary_loss_clip": 0.010974, + "auxiliary_loss_mlp": 0.01027808, + "balance_loss_clip": 1.01634026, + "balance_loss_mlp": 1.03322947, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.5054378463207643, + "language_loss": 0.67459226, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69584435, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 15915, + "time_per_iteration": 2.4519832134246826 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.01603842, + "balance_loss_mlp": 1.03358841, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.7543278741085384, + "language_loss": 0.64166009, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66290665, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 15916, + "time_per_iteration": 2.4360384941101074 + }, + { + "auxiliary_loss_clip": 0.0109159, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.01882899, + "balance_loss_mlp": 1.03175974, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 1.904935652985477, + "language_loss": 0.80659258, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82780313, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.59765625, + "step": 15917, + "time_per_iteration": 2.480591297149658 + }, + { + "auxiliary_loss_clip": 0.01098198, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.01965833, + "balance_loss_mlp": 1.03460228, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.758517813594143, + "language_loss": 0.72947186, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.7507652, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15918, + "time_per_iteration": 2.423100471496582 + }, + { + "auxiliary_loss_clip": 0.01021645, + "auxiliary_loss_mlp": 0.01001396, + "balance_loss_clip": 1.0004425, + "balance_loss_mlp": 1.00167096, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6313369610735284, + "language_loss": 0.53130996, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55154037, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 15919, + "time_per_iteration": 3.1794583797454834 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02320194, + "balance_loss_mlp": 1.03811026, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.571633844940979, + "language_loss": 0.75538218, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77680451, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 15920, + "time_per_iteration": 2.4159767627716064 + }, + { + "auxiliary_loss_clip": 0.01102735, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.02107811, + "balance_loss_mlp": 1.0350672, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.9063096196660445, + "language_loss": 0.7912389, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.8126021, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 15921, + "time_per_iteration": 2.45986008644104 + }, + { + "auxiliary_loss_clip": 0.01103353, + "auxiliary_loss_mlp": 0.01027887, + "balance_loss_clip": 1.01543534, + "balance_loss_mlp": 1.03351963, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.0395877371527718, + "language_loss": 0.50749934, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.52881169, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 15922, + "time_per_iteration": 2.555110454559326 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.0203439, + "balance_loss_mlp": 1.03463745, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.0324325155300844, + "language_loss": 0.83707559, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85841203, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15923, + "time_per_iteration": 2.4552669525146484 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01791394, + "balance_loss_mlp": 1.0326041, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.5161968575353546, + "language_loss": 0.74902648, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77027792, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15924, + "time_per_iteration": 2.5490124225616455 + }, + { + "auxiliary_loss_clip": 0.01098896, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.01905894, + "balance_loss_mlp": 1.03276801, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.078581919020162, + "language_loss": 0.85878658, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88008475, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15925, + "time_per_iteration": 2.476585865020752 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.02014744, + "balance_loss_mlp": 1.03549838, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 2.8194945253474297, + "language_loss": 0.75799584, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77937198, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 15926, + "time_per_iteration": 2.4524552822113037 + }, + { + "auxiliary_loss_clip": 0.01101507, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01854181, + "balance_loss_mlp": 1.03607941, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.622745351817711, + "language_loss": 0.77535486, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79666108, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65625, + "step": 15927, + "time_per_iteration": 2.5115678310394287 + }, + { + "auxiliary_loss_clip": 0.01102754, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.0196532, + "balance_loss_mlp": 1.03499341, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 5.979240549227758, + "language_loss": 0.68711758, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70847225, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 15928, + "time_per_iteration": 4.005671739578247 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.01900387, + "balance_loss_mlp": 1.03447127, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.5556334234006137, + "language_loss": 0.81790125, + "learning_rate": 1.871120608822485e-08, + "loss": 0.83920628, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 15929, + "time_per_iteration": 2.439286470413208 + }, + { + "auxiliary_loss_clip": 0.01103823, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02678704, + "balance_loss_mlp": 1.03518653, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.4266689760878288, + "language_loss": 0.72288859, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74431318, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 15930, + "time_per_iteration": 3.916687250137329 + }, + { + "auxiliary_loss_clip": 0.01098084, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.01688528, + "balance_loss_mlp": 1.03421974, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.3860575900403753, + "language_loss": 0.61940473, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64066511, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15931, + "time_per_iteration": 3.9892077445983887 + }, + { + "auxiliary_loss_clip": 0.01097363, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.0176214, + "balance_loss_mlp": 1.03452408, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.6941605420085752, + "language_loss": 0.68982953, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71108609, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 15932, + "time_per_iteration": 2.398474931716919 + }, + { + "auxiliary_loss_clip": 0.01103128, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02058697, + "balance_loss_mlp": 1.03450584, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 2.9202077613156447, + "language_loss": 0.75383151, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77519727, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 15933, + "time_per_iteration": 2.4255053997039795 + }, + { + "auxiliary_loss_clip": 0.01021552, + "auxiliary_loss_mlp": 0.01001482, + "balance_loss_clip": 1.0005517, + "balance_loss_mlp": 1.00157118, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7217589216028398, + "language_loss": 0.57281023, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59304059, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20019531, + "step": 15934, + "time_per_iteration": 3.1378121376037598 + }, + { + "auxiliary_loss_clip": 0.01021591, + "auxiliary_loss_mlp": 0.01003298, + "balance_loss_clip": 1.00234401, + "balance_loss_mlp": 1.00169897, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.913712526229365, + "language_loss": 0.65969813, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67994696, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 15935, + "time_per_iteration": 3.0164973735809326 + }, + { + "auxiliary_loss_clip": 0.01021566, + "auxiliary_loss_mlp": 0.00997813, + "balance_loss_clip": 0.99687093, + "balance_loss_mlp": 1.00159645, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7963829283211799, + "language_loss": 0.57069677, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59089053, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 15936, + "time_per_iteration": 4.516096115112305 + }, + { + "auxiliary_loss_clip": 0.0110158, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01734924, + "balance_loss_mlp": 1.03508747, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.7549841016566206, + "language_loss": 0.78426778, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80557573, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 15937, + "time_per_iteration": 2.54921555519104 + }, + { + "auxiliary_loss_clip": 0.01100478, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01807022, + "balance_loss_mlp": 1.03413606, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 2.664120819072444, + "language_loss": 0.68353987, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70484626, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15938, + "time_per_iteration": 2.568049907684326 + }, + { + "auxiliary_loss_clip": 0.01099803, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.01622176, + "balance_loss_mlp": 1.03337634, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 2.516589048128792, + "language_loss": 0.65331376, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67459035, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15939, + "time_per_iteration": 2.4567224979400635 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.02127695, + "balance_loss_mlp": 1.03251374, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 3.037055398213982, + "language_loss": 0.73640996, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.75770706, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 15940, + "time_per_iteration": 2.5022995471954346 + }, + { + "auxiliary_loss_clip": 0.0110072, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03483605, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 1.6010591427701613, + "language_loss": 0.73068857, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75199318, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 15941, + "time_per_iteration": 2.4811065196990967 + }, + { + "auxiliary_loss_clip": 0.01099466, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02316487, + "balance_loss_mlp": 1.03476393, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.4889591298763103, + "language_loss": 0.71140969, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73274601, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 15942, + "time_per_iteration": 2.488670587539673 + }, + { + "auxiliary_loss_clip": 0.01100639, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.01804519, + "balance_loss_mlp": 1.03398347, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.6261005015311867, + "language_loss": 0.71908909, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74040031, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 15943, + "time_per_iteration": 2.564479112625122 + }, + { + "auxiliary_loss_clip": 0.01102364, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.02263975, + "balance_loss_mlp": 1.03473973, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.7040542813640263, + "language_loss": 0.6800124, + "learning_rate": 1.792242006001965e-08, + "loss": 0.7013855, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 15944, + "time_per_iteration": 2.5587704181671143 + }, + { + "auxiliary_loss_clip": 0.01100009, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.02160716, + "balance_loss_mlp": 1.0336163, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 2.087826009089437, + "language_loss": 0.65862542, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.67996073, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 15945, + "time_per_iteration": 2.445241689682007 + }, + { + "auxiliary_loss_clip": 0.01021444, + "auxiliary_loss_mlp": 0.01001575, + "balance_loss_clip": 1.00056767, + "balance_loss_mlp": 1.00148821, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7415668424690911, + "language_loss": 0.61897564, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63920581, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 15946, + "time_per_iteration": 3.135841131210327 + }, + { + "auxiliary_loss_clip": 0.01096719, + "auxiliary_loss_mlp": 0.01027382, + "balance_loss_clip": 1.01627207, + "balance_loss_mlp": 1.03327739, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.7761369112332144, + "language_loss": 0.75568569, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.7769267, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 15947, + "time_per_iteration": 2.4884331226348877 + }, + { + "auxiliary_loss_clip": 0.01097515, + "auxiliary_loss_mlp": 0.01025857, + "balance_loss_clip": 1.01469898, + "balance_loss_mlp": 1.03267527, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.398089295673863, + "language_loss": 0.70082307, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72205675, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15948, + "time_per_iteration": 2.450761556625366 + }, + { + "auxiliary_loss_clip": 0.01096726, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.01652312, + "balance_loss_mlp": 1.03256166, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 1.8775052153716447, + "language_loss": 0.78941453, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.81065995, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 15949, + "time_per_iteration": 2.453991413116455 + }, + { + "auxiliary_loss_clip": 0.01101471, + "auxiliary_loss_mlp": 0.01033531, + "balance_loss_clip": 1.02134788, + "balance_loss_mlp": 1.03560996, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.732596967369498, + "language_loss": 0.68670601, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70805597, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 15950, + "time_per_iteration": 2.4904561042785645 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.0342288, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.820609185891462, + "language_loss": 0.86225641, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88355601, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 15951, + "time_per_iteration": 2.4510254859924316 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.02368557, + "balance_loss_mlp": 1.03520882, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.084854322647747, + "language_loss": 0.7963227, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.81771755, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 15952, + "time_per_iteration": 2.5137405395507812 + }, + { + "auxiliary_loss_clip": 0.01100763, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.0173949, + "balance_loss_mlp": 1.03468966, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.719452431467091, + "language_loss": 0.69882435, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.72012538, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15953, + "time_per_iteration": 2.469686985015869 + }, + { + "auxiliary_loss_clip": 0.01102054, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.01587033, + "balance_loss_mlp": 1.03441012, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 2.317829736689624, + "language_loss": 0.57854062, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.59983945, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 15954, + "time_per_iteration": 2.4689619541168213 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01876557, + "balance_loss_mlp": 1.03452051, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 1.965848373822375, + "language_loss": 0.74012095, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.76145911, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 15955, + "time_per_iteration": 2.561746120452881 + }, + { + "auxiliary_loss_clip": 0.01099826, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02136004, + "balance_loss_mlp": 1.03310394, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 2.1926585969680796, + "language_loss": 0.62872529, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.65005678, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15956, + "time_per_iteration": 2.5066936016082764 + }, + { + "auxiliary_loss_clip": 0.0110062, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.01954842, + "balance_loss_mlp": 1.03487992, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.8584287929461432, + "language_loss": 0.59779477, + "learning_rate": 1.725248447997507e-08, + "loss": 0.61912203, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 15957, + "time_per_iteration": 2.4974136352539062 + }, + { + "auxiliary_loss_clip": 0.01099795, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02395439, + "balance_loss_mlp": 1.03408015, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.931337896250092, + "language_loss": 0.74394608, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76530516, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 15958, + "time_per_iteration": 2.7763772010803223 + }, + { + "auxiliary_loss_clip": 0.01098821, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.01513004, + "balance_loss_mlp": 1.03337789, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.470209010768804, + "language_loss": 0.74736482, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76861811, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 15959, + "time_per_iteration": 2.5138134956359863 + }, + { + "auxiliary_loss_clip": 0.01101982, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.01552415, + "balance_loss_mlp": 1.03473663, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 1.910701656384312, + "language_loss": 0.64995688, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67125207, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15960, + "time_per_iteration": 2.5808286666870117 + }, + { + "auxiliary_loss_clip": 0.01096302, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.03344536, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.850697413966108, + "language_loss": 0.77640712, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.79768229, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.62890625, + "step": 15961, + "time_per_iteration": 2.6080851554870605 + }, + { + "auxiliary_loss_clip": 0.01097563, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.01583743, + "balance_loss_mlp": 1.03365684, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 1.7282197835839996, + "language_loss": 0.76134586, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78259289, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 15962, + "time_per_iteration": 2.4804084300994873 + }, + { + "auxiliary_loss_clip": 0.01101283, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.01823974, + "balance_loss_mlp": 1.03411567, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 2.084352656854831, + "language_loss": 0.72044748, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.74176401, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15963, + "time_per_iteration": 2.614706516265869 + }, + { + "auxiliary_loss_clip": 0.01097072, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.03555036, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.5422937274732758, + "language_loss": 0.74315596, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76443183, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 15964, + "time_per_iteration": 2.5884809494018555 + }, + { + "auxiliary_loss_clip": 0.01021677, + "auxiliary_loss_mlp": 0.01000233, + "balance_loss_clip": 0.99927884, + "balance_loss_mlp": 1.00162196, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.87740764751359, + "language_loss": 0.57558799, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59580708, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 15965, + "time_per_iteration": 3.032865047454834 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.01854491, + "balance_loss_mlp": 1.03277349, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 4.347845014421521, + "language_loss": 0.78900796, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81029272, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 15966, + "time_per_iteration": 2.4715421199798584 + }, + { + "auxiliary_loss_clip": 0.01095286, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.01574206, + "balance_loss_mlp": 1.0313921, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 2.0769488159919423, + "language_loss": 0.79388767, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81511444, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 15967, + "time_per_iteration": 2.4780547618865967 + }, + { + "auxiliary_loss_clip": 0.01104282, + "auxiliary_loss_mlp": 0.01026202, + "balance_loss_clip": 1.01377392, + "balance_loss_mlp": 1.0364691, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 2.049759384254396, + "language_loss": 0.8052963, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82660115, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 15968, + "time_per_iteration": 2.48009991645813 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.02361894, + "balance_loss_mlp": 1.03506923, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.7147848897637794, + "language_loss": 0.67841053, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69981217, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15969, + "time_per_iteration": 2.48237681388855 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.02568293, + "balance_loss_mlp": 1.03579783, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 2.6281665249354553, + "language_loss": 0.79528141, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81664926, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 15970, + "time_per_iteration": 3.8477537631988525 + }, + { + "auxiliary_loss_clip": 0.01097507, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.02026701, + "balance_loss_mlp": 1.03499389, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.5295204507537015, + "language_loss": 0.77275121, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79404688, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.625, + "step": 15971, + "time_per_iteration": 2.501603841781616 + }, + { + "auxiliary_loss_clip": 0.01101775, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.01433444, + "balance_loss_mlp": 1.03340471, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 2.0074410078313987, + "language_loss": 0.67119515, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69247651, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 15972, + "time_per_iteration": 3.8172669410705566 + }, + { + "auxiliary_loss_clip": 0.01098207, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.01264238, + "balance_loss_mlp": 1.03300917, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.5640425645458174, + "language_loss": 0.76354134, + "learning_rate": 1.644539196701844e-08, + "loss": 0.7847656, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 15973, + "time_per_iteration": 3.7488813400268555 + }, + { + "auxiliary_loss_clip": 0.01101414, + "auxiliary_loss_mlp": 0.01037122, + "balance_loss_clip": 1.02525496, + "balance_loss_mlp": 1.03684473, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.5915230941554284, + "language_loss": 0.69382858, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71521389, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 15974, + "time_per_iteration": 2.459822177886963 + }, + { + "auxiliary_loss_clip": 0.01099172, + "auxiliary_loss_mlp": 0.01026699, + "balance_loss_clip": 1.01501679, + "balance_loss_mlp": 1.03258681, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.6930712798967245, + "language_loss": 0.67391104, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.69516981, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 15975, + "time_per_iteration": 2.4301302433013916 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01684737, + "balance_loss_mlp": 1.03415847, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.1951271711643554, + "language_loss": 0.55330515, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.5745486, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62109375, + "step": 15976, + "time_per_iteration": 2.530332326889038 + }, + { + "auxiliary_loss_clip": 0.01094425, + "auxiliary_loss_mlp": 0.01023604, + "balance_loss_clip": 1.01263642, + "balance_loss_mlp": 1.03126621, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 2.1902101509492633, + "language_loss": 0.68605191, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70723224, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 15977, + "time_per_iteration": 2.5073699951171875 + }, + { + "auxiliary_loss_clip": 0.01098235, + "auxiliary_loss_mlp": 0.01035581, + "balance_loss_clip": 1.0238564, + "balance_loss_mlp": 1.03291917, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 2.8583973827450397, + "language_loss": 0.82103157, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84236974, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 15978, + "time_per_iteration": 3.9228808879852295 + }, + { + "auxiliary_loss_clip": 0.01103256, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.0179671, + "balance_loss_mlp": 1.03507805, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.21939382847535, + "language_loss": 0.83099633, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85232687, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 15979, + "time_per_iteration": 2.4425342082977295 + }, + { + "auxiliary_loss_clip": 0.01098986, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.0265305, + "balance_loss_mlp": 1.03295469, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.5535455683117823, + "language_loss": 0.80101836, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82238382, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 15980, + "time_per_iteration": 2.488734006881714 + }, + { + "auxiliary_loss_clip": 0.01100084, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.01745772, + "balance_loss_mlp": 1.03294373, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.929420179350021, + "language_loss": 0.68303668, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70433116, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15981, + "time_per_iteration": 2.5347800254821777 + }, + { + "auxiliary_loss_clip": 0.01097655, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.01319432, + "balance_loss_mlp": 1.03369415, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.3984693202200493, + "language_loss": 0.69509637, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71631587, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 15982, + "time_per_iteration": 2.553140163421631 + }, + { + "auxiliary_loss_clip": 0.01021661, + "auxiliary_loss_mlp": 0.00998004, + "balance_loss_clip": 0.99696726, + "balance_loss_mlp": 1.00166357, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6681934068781947, + "language_loss": 0.53323615, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55343282, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20019531, + "step": 15983, + "time_per_iteration": 3.163548231124878 + }, + { + "auxiliary_loss_clip": 0.01098972, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.02478802, + "balance_loss_mlp": 1.03438175, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.0538452317245204, + "language_loss": 0.6784721, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.69982827, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15984, + "time_per_iteration": 2.4459614753723145 + }, + { + "auxiliary_loss_clip": 0.01095462, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02069306, + "balance_loss_mlp": 1.03341627, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.6237233896189485, + "language_loss": 0.66909266, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.6903643, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 15985, + "time_per_iteration": 2.4718329906463623 + }, + { + "auxiliary_loss_clip": 0.01100771, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.0163815, + "balance_loss_mlp": 1.03461182, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.9043539282595943, + "language_loss": 0.78663325, + "learning_rate": 1.580380726142283e-08, + "loss": 0.80791926, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 15986, + "time_per_iteration": 2.5057144165039062 + }, + { + "auxiliary_loss_clip": 0.01100246, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.01730478, + "balance_loss_mlp": 1.0349791, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 2.0139764735454984, + "language_loss": 0.63585907, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.65716004, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65234375, + "step": 15987, + "time_per_iteration": 2.484804630279541 + }, + { + "auxiliary_loss_clip": 0.01096451, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.01677477, + "balance_loss_mlp": 1.03409028, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.6635144622564184, + "language_loss": 0.67184675, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.69308412, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.62109375, + "step": 15988, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01098191, + "auxiliary_loss_mlp": 0.01039743, + "balance_loss_clip": 1.02879977, + "balance_loss_mlp": 1.033499, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 2.0386399724410773, + "language_loss": 0.7444011, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76578045, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 15989, + "time_per_iteration": 2.4653844833374023 + }, + { + "auxiliary_loss_clip": 0.0102176, + "auxiliary_loss_mlp": 0.00998361, + "balance_loss_clip": 0.99734801, + "balance_loss_mlp": 1.00191987, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8244627726356378, + "language_loss": 0.63139147, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65159267, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19824219, + "step": 15990, + "time_per_iteration": 2.9341416358947754 + }, + { + "auxiliary_loss_clip": 0.01098606, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.01836634, + "balance_loss_mlp": 1.03330886, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 1.7929580248033004, + "language_loss": 0.77747667, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79875869, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15991, + "time_per_iteration": 2.5114362239837646 + }, + { + "auxiliary_loss_clip": 0.01104631, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.03413033, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.3248342677519145, + "language_loss": 0.84501588, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86637282, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 15992, + "time_per_iteration": 2.4357917308807373 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.01779771, + "balance_loss_mlp": 1.03286505, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 2.0221041730312583, + "language_loss": 0.72067487, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.7419644, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 15993, + "time_per_iteration": 2.458228349685669 + }, + { + "auxiliary_loss_clip": 0.01098416, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.0197798, + "balance_loss_mlp": 1.03211713, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.425573612625333, + "language_loss": 0.68134975, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70264989, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15994, + "time_per_iteration": 2.574979543685913 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.01776767, + "balance_loss_mlp": 1.03487062, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.8311870454132768, + "language_loss": 0.84529275, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86658323, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 15995, + "time_per_iteration": 2.508324384689331 + }, + { + "auxiliary_loss_clip": 0.01101034, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.01981795, + "balance_loss_mlp": 1.03394556, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.7535990840626554, + "language_loss": 0.75937271, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78070021, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15996, + "time_per_iteration": 2.412166118621826 + }, + { + "auxiliary_loss_clip": 0.01097246, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.01758718, + "balance_loss_mlp": 1.03245521, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 1.8443743753530013, + "language_loss": 0.76869327, + "learning_rate": 1.52708595287494e-08, + "loss": 0.78996044, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 15997, + "time_per_iteration": 2.427321434020996 + }, + { + "auxiliary_loss_clip": 0.0109466, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.01854956, + "balance_loss_mlp": 1.03262687, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.5037392359064448, + "language_loss": 0.67111742, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69236064, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62109375, + "step": 15998, + "time_per_iteration": 2.440931797027588 + }, + { + "auxiliary_loss_clip": 0.01096743, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.01669025, + "balance_loss_mlp": 1.03206193, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.6944994780105895, + "language_loss": 0.72642672, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.74767953, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 15999, + "time_per_iteration": 2.439814329147339 + }, + { + "auxiliary_loss_clip": 0.0109533, + "auxiliary_loss_mlp": 0.01025115, + "balance_loss_clip": 1.01451135, + "balance_loss_mlp": 1.03301597, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 1.856091141124966, + "language_loss": 0.65324283, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67444718, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 16000, + "time_per_iteration": 2.460580587387085 + }, + { + "auxiliary_loss_clip": 0.01098363, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.01385164, + "balance_loss_mlp": 1.03337932, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 2.318305441538345, + "language_loss": 0.75454199, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.775787, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 16001, + "time_per_iteration": 2.437589645385742 + }, + { + "auxiliary_loss_clip": 0.01097856, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.0196507, + "balance_loss_mlp": 1.0329746, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.501970694433986, + "language_loss": 0.68156397, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70285976, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 16002, + "time_per_iteration": 2.433119773864746 + }, + { + "auxiliary_loss_clip": 0.0109988, + "auxiliary_loss_mlp": 0.01027372, + "balance_loss_clip": 1.01573122, + "balance_loss_mlp": 1.03520513, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.2603335889169271, + "language_loss": 0.64553183, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66680431, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 16003, + "time_per_iteration": 2.5247597694396973 + }, + { + "auxiliary_loss_clip": 0.01101995, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.02489662, + "balance_loss_mlp": 1.03565001, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.7694321709596037, + "language_loss": 0.75896275, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78034103, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 16004, + "time_per_iteration": 2.433055877685547 + }, + { + "auxiliary_loss_clip": 0.01098006, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.01662803, + "balance_loss_mlp": 1.03370786, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.192137725260173, + "language_loss": 0.79381818, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81508344, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 16005, + "time_per_iteration": 2.4529757499694824 + }, + { + "auxiliary_loss_clip": 0.01095875, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.01653981, + "balance_loss_mlp": 1.03297102, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 1.9248059922293024, + "language_loss": 0.67267632, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69390965, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 16006, + "time_per_iteration": 2.776263952255249 + }, + { + "auxiliary_loss_clip": 0.01094469, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.02053165, + "balance_loss_mlp": 1.03415585, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5940516998351955, + "language_loss": 0.78056121, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80181879, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6015625, + "step": 16007, + "time_per_iteration": 2.47029972076416 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02034807, + "balance_loss_mlp": 1.03482246, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 2.0291702556230438, + "language_loss": 0.68004704, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70137483, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 16008, + "time_per_iteration": 2.529724597930908 + }, + { + "auxiliary_loss_clip": 0.01102821, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.01991701, + "balance_loss_mlp": 1.03540087, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.1805744652126657, + "language_loss": 0.72793615, + "learning_rate": 1.469984811730529e-08, + "loss": 0.74928856, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 16009, + "time_per_iteration": 2.4500856399536133 + }, + { + "auxiliary_loss_clip": 0.01097324, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.01997614, + "balance_loss_mlp": 1.03245699, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 2.132459467969933, + "language_loss": 0.75247002, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77375853, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 16010, + "time_per_iteration": 2.4420454502105713 + }, + { + "auxiliary_loss_clip": 0.01105906, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.01913512, + "balance_loss_mlp": 1.03655696, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.8880874815114188, + "language_loss": 0.69513392, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71652675, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.6953125, + "step": 16011, + "time_per_iteration": 2.466012954711914 + }, + { + "auxiliary_loss_clip": 0.01097648, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.01957953, + "balance_loss_mlp": 1.03402066, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.6867727595710786, + "language_loss": 0.68486851, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70614755, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 16012, + "time_per_iteration": 4.115834474563599 + }, + { + "auxiliary_loss_clip": 0.01105856, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.02205706, + "balance_loss_mlp": 1.03603888, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.7968742145929515, + "language_loss": 0.7239725, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74538302, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 16013, + "time_per_iteration": 2.5248327255249023 + }, + { + "auxiliary_loss_clip": 0.01098665, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.01866698, + "balance_loss_mlp": 1.03396904, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.303643784633636, + "language_loss": 0.63361096, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65490472, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 16014, + "time_per_iteration": 5.387859582901001 + }, + { + "auxiliary_loss_clip": 0.01094961, + "auxiliary_loss_mlp": 0.01026344, + "balance_loss_clip": 1.0163964, + "balance_loss_mlp": 1.03386617, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.9022819757041962, + "language_loss": 0.71860576, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73981875, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.609375, + "step": 16015, + "time_per_iteration": 2.74052095413208 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.01629472, + "balance_loss_mlp": 1.03302288, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 1.7223276387291737, + "language_loss": 0.77100927, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79226089, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 16016, + "time_per_iteration": 2.5562946796417236 + }, + { + "auxiliary_loss_clip": 0.01021809, + "auxiliary_loss_mlp": 0.00998645, + "balance_loss_clip": 0.99765599, + "balance_loss_mlp": 1.00184894, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8230091006411403, + "language_loss": 0.6317451, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65194964, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 16017, + "time_per_iteration": 2.9330668449401855 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.01551044, + "balance_loss_mlp": 1.03531194, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 2.7407998859576432, + "language_loss": 0.6571548, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.67843896, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 16018, + "time_per_iteration": 2.4978015422821045 + }, + { + "auxiliary_loss_clip": 0.0109954, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.0205797, + "balance_loss_mlp": 1.03453994, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.9221480896799248, + "language_loss": 0.79585052, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81716537, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16019, + "time_per_iteration": 3.83243465423584 + }, + { + "auxiliary_loss_clip": 0.01094234, + "auxiliary_loss_mlp": 0.0102779, + "balance_loss_clip": 1.01744246, + "balance_loss_mlp": 1.03089976, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.4518833945438399, + "language_loss": 0.71567214, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.73689234, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6328125, + "step": 16020, + "time_per_iteration": 2.4867608547210693 + }, + { + "auxiliary_loss_clip": 0.01099297, + "auxiliary_loss_mlp": 0.0102639, + "balance_loss_clip": 1.01586401, + "balance_loss_mlp": 1.03405941, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.654544667826034, + "language_loss": 0.77078342, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79204035, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65234375, + "step": 16021, + "time_per_iteration": 2.45025372505188 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.01649964, + "balance_loss_mlp": 1.03461719, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 1.9848746389791796, + "language_loss": 0.64672452, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.66805798, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 16022, + "time_per_iteration": 2.5040066242218018 + }, + { + "auxiliary_loss_clip": 0.01096934, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.01966798, + "balance_loss_mlp": 1.03360546, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 2.0658365642461525, + "language_loss": 0.73443997, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.7557137, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 16023, + "time_per_iteration": 2.4626638889312744 + }, + { + "auxiliary_loss_clip": 0.0109734, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.01538599, + "balance_loss_mlp": 1.03294301, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.449705583303519, + "language_loss": 0.81280053, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83403552, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 16024, + "time_per_iteration": 2.4662721157073975 + }, + { + "auxiliary_loss_clip": 0.01103249, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.01964998, + "balance_loss_mlp": 1.03470325, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.3425470745031889, + "language_loss": 0.81449908, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83585107, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 16025, + "time_per_iteration": 2.571988105773926 + }, + { + "auxiliary_loss_clip": 0.01101207, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.01897025, + "balance_loss_mlp": 1.03413701, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.8159073378750998, + "language_loss": 0.76695681, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78827643, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 16026, + "time_per_iteration": 2.5653977394104004 + }, + { + "auxiliary_loss_clip": 0.01100402, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.01804018, + "balance_loss_mlp": 1.03367102, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.6861028464709051, + "language_loss": 0.63083422, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65214193, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 16027, + "time_per_iteration": 2.544005870819092 + }, + { + "auxiliary_loss_clip": 0.01103438, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.0156163, + "balance_loss_mlp": 1.03664851, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 2.0164451231663882, + "language_loss": 0.87208748, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89339876, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 16028, + "time_per_iteration": 2.5071027278900146 + }, + { + "auxiliary_loss_clip": 0.010217, + "auxiliary_loss_mlp": 0.00999046, + "balance_loss_clip": 0.99811667, + "balance_loss_mlp": 1.00170708, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.7421749318599844, + "language_loss": 0.53201663, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.5522241, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.19921875, + "step": 16029, + "time_per_iteration": 3.038540840148926 + }, + { + "auxiliary_loss_clip": 0.01101, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.01630187, + "balance_loss_mlp": 1.03434622, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.587969426883562, + "language_loss": 0.73793781, + "learning_rate": 1.372666546129797e-08, + "loss": 0.75922477, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 16030, + "time_per_iteration": 2.512209892272949 + }, + { + "auxiliary_loss_clip": 0.01096772, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.0165571, + "balance_loss_mlp": 1.03376997, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 2.091542472961613, + "language_loss": 0.66038525, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68162721, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 16031, + "time_per_iteration": 2.6668994426727295 + }, + { + "auxiliary_loss_clip": 0.01021545, + "auxiliary_loss_mlp": 0.01000717, + "balance_loss_clip": 0.99978131, + "balance_loss_mlp": 1.001513, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8415977039530823, + "language_loss": 0.60769111, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62791371, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 16032, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.01092096, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.01961017, + "balance_loss_mlp": 1.03172016, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 2.336742509809211, + "language_loss": 0.66448474, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68570554, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.60546875, + "step": 16033, + "time_per_iteration": 2.5393478870391846 + }, + { + "auxiliary_loss_clip": 0.01099204, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.01792789, + "balance_loss_mlp": 1.0350368, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.8687233450707268, + "language_loss": 0.6541754, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.6754601, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 16034, + "time_per_iteration": 2.5119800567626953 + }, + { + "auxiliary_loss_clip": 0.01100794, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.01654732, + "balance_loss_mlp": 1.03560579, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.2819152294755765, + "language_loss": 0.7378726, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.75915802, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16035, + "time_per_iteration": 2.514049530029297 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01587522, + "balance_loss_mlp": 1.03745866, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 1.9666432901090276, + "language_loss": 0.82240516, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84370238, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 16036, + "time_per_iteration": 2.470296859741211 + }, + { + "auxiliary_loss_clip": 0.01097949, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.01781738, + "balance_loss_mlp": 1.03251529, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.8384642674416498, + "language_loss": 0.69920629, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72048092, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 16037, + "time_per_iteration": 2.5022101402282715 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.01337099, + "balance_loss_mlp": 1.03303576, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.8647442017039988, + "language_loss": 0.63255847, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65379345, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 16038, + "time_per_iteration": 2.4083030223846436 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.02006602, + "balance_loss_mlp": 1.03394938, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 4.3592959082323715, + "language_loss": 0.70973301, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73106587, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 16039, + "time_per_iteration": 2.4552149772644043 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.01546872, + "balance_loss_mlp": 1.03366244, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 1.9258933079611011, + "language_loss": 0.72986352, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75113374, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 16040, + "time_per_iteration": 2.485323190689087 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.01842415, + "balance_loss_mlp": 1.03421116, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 1.8131495617267763, + "language_loss": 0.73091221, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75222296, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 16041, + "time_per_iteration": 2.388808488845825 + }, + { + "auxiliary_loss_clip": 0.01094729, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01594496, + "balance_loss_mlp": 1.03319907, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.7600846480855517, + "language_loss": 0.71910304, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74031723, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 16042, + "time_per_iteration": 2.4807844161987305 + }, + { + "auxiliary_loss_clip": 0.01101997, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.01721215, + "balance_loss_mlp": 1.03423679, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.86868776275858, + "language_loss": 0.80611408, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.82741684, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 16043, + "time_per_iteration": 2.4247870445251465 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01601148, + "balance_loss_mlp": 1.03540707, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.4757233148834483, + "language_loss": 0.71590781, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73716873, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16044, + "time_per_iteration": 2.4749693870544434 + }, + { + "auxiliary_loss_clip": 0.01096636, + "auxiliary_loss_mlp": 0.01027099, + "balance_loss_clip": 1.01526165, + "balance_loss_mlp": 1.0332067, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 2.0142953791074323, + "language_loss": 0.69947273, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.7207101, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 16045, + "time_per_iteration": 2.4325809478759766 + }, + { + "auxiliary_loss_clip": 0.01101043, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.01903403, + "balance_loss_mlp": 1.03409147, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 2.254760365933983, + "language_loss": 0.74806952, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.76939189, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 16046, + "time_per_iteration": 2.427147626876831 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.0209322, + "balance_loss_mlp": 1.03498721, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.7329644028293205, + "language_loss": 0.62384462, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64520335, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 16047, + "time_per_iteration": 2.492799997329712 + }, + { + "auxiliary_loss_clip": 0.01100779, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.02144754, + "balance_loss_mlp": 1.03558803, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.8850176887881036, + "language_loss": 0.6955775, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71692121, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 16048, + "time_per_iteration": 2.4344987869262695 + }, + { + "auxiliary_loss_clip": 0.01101251, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.01668882, + "balance_loss_mlp": 1.03366709, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 1.7863475924187646, + "language_loss": 0.63913882, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66044074, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 16049, + "time_per_iteration": 2.4989817142486572 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.01546216, + "balance_loss_mlp": 1.03558612, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.6424917186742727, + "language_loss": 0.71067202, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73195171, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 16050, + "time_per_iteration": 2.5452215671539307 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.01621413, + "balance_loss_mlp": 1.0329442, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 1.8388027945859817, + "language_loss": 0.69875538, + "learning_rate": 1.278669873970606e-08, + "loss": 0.72006953, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 16051, + "time_per_iteration": 2.636740207672119 + }, + { + "auxiliary_loss_clip": 0.01021624, + "auxiliary_loss_mlp": 0.01001844, + "balance_loss_clip": 1.00084877, + "balance_loss_mlp": 1.00160849, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8334148755985689, + "language_loss": 0.59121096, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61144561, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20019531, + "step": 16052, + "time_per_iteration": 3.075615882873535 + }, + { + "auxiliary_loss_clip": 0.01095214, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.01512265, + "balance_loss_mlp": 1.03191876, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.64808937771042, + "language_loss": 0.74442101, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76563787, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 16053, + "time_per_iteration": 3.8834474086761475 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.01878238, + "balance_loss_mlp": 1.03472745, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.3993755573637743, + "language_loss": 0.68056464, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70188296, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 16054, + "time_per_iteration": 2.4599058628082275 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.0204761, + "balance_loss_mlp": 1.03408504, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.5511100121301231, + "language_loss": 0.61763877, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63893896, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 16055, + "time_per_iteration": 4.0541017055511475 + }, + { + "auxiliary_loss_clip": 0.01096153, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.01644897, + "balance_loss_mlp": 1.03382039, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.899688570193355, + "language_loss": 0.76835245, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78959155, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.625, + "step": 16056, + "time_per_iteration": 3.974794626235962 + }, + { + "auxiliary_loss_clip": 0.01098862, + "auxiliary_loss_mlp": 0.01027586, + "balance_loss_clip": 1.01672029, + "balance_loss_mlp": 1.03289866, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5630374431073517, + "language_loss": 0.71658134, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73784578, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 16057, + "time_per_iteration": 2.4625744819641113 + }, + { + "auxiliary_loss_clip": 0.01096064, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.02070796, + "balance_loss_mlp": 1.03262568, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.7750175555369185, + "language_loss": 0.72013068, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74140859, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 16058, + "time_per_iteration": 2.4661831855773926 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.0217855, + "balance_loss_mlp": 1.03324616, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.3805922722599118, + "language_loss": 0.74052727, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76183391, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 16059, + "time_per_iteration": 2.4987194538116455 + }, + { + "auxiliary_loss_clip": 0.01102874, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.0205586, + "balance_loss_mlp": 1.0346117, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 1.9693634939713338, + "language_loss": 0.73338103, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75472993, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 16060, + "time_per_iteration": 2.691296100616455 + }, + { + "auxiliary_loss_clip": 0.01094521, + "auxiliary_loss_mlp": 0.01028413, + "balance_loss_clip": 1.0182569, + "balance_loss_mlp": 1.03331041, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.5438406345380868, + "language_loss": 0.76715529, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78838468, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.609375, + "step": 16061, + "time_per_iteration": 4.037534952163696 + }, + { + "auxiliary_loss_clip": 0.01021687, + "auxiliary_loss_mlp": 0.00999978, + "balance_loss_clip": 0.99901831, + "balance_loss_mlp": 1.0016849, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7240380472657248, + "language_loss": 0.64163613, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66185272, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20019531, + "step": 16062, + "time_per_iteration": 3.075866937637329 + }, + { + "auxiliary_loss_clip": 0.01093621, + "auxiliary_loss_mlp": 0.01025081, + "balance_loss_clip": 1.01454329, + "balance_loss_mlp": 1.03160632, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.199981825340732, + "language_loss": 0.92818987, + "learning_rate": 1.226449424760867e-08, + "loss": 0.94937694, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 16063, + "time_per_iteration": 2.4555232524871826 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.02054524, + "balance_loss_mlp": 1.0358125, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.9157708937264109, + "language_loss": 0.81976312, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84110343, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 16064, + "time_per_iteration": 2.4142022132873535 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.01711571, + "balance_loss_mlp": 1.03634119, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 2.24347865862691, + "language_loss": 0.843117, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.8643983, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 16065, + "time_per_iteration": 2.4605534076690674 + }, + { + "auxiliary_loss_clip": 0.01096746, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.01629925, + "balance_loss_mlp": 1.03209305, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.7134975276082676, + "language_loss": 0.67760193, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69884634, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 16066, + "time_per_iteration": 2.4299395084381104 + }, + { + "auxiliary_loss_clip": 0.01097275, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.0133121, + "balance_loss_mlp": 1.03257763, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 2.103530429938663, + "language_loss": 0.82447511, + "learning_rate": 1.209283794752558e-08, + "loss": 0.8456912, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16067, + "time_per_iteration": 2.462406873703003 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01026441, + "balance_loss_clip": 1.01503885, + "balance_loss_mlp": 1.03394961, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.8120779839614523, + "language_loss": 0.68879712, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71003956, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 16068, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01093562, + "auxiliary_loss_mlp": 0.01025202, + "balance_loss_clip": 1.01563549, + "balance_loss_mlp": 1.03256798, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.7691682427953708, + "language_loss": 0.67960203, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70078963, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 0.09570312, + "router_z_loss_mlp": 0.609375, + "step": 16069, + "time_per_iteration": 2.4708714485168457 + }, + { + "auxiliary_loss_clip": 0.01101825, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01733899, + "balance_loss_mlp": 1.03626013, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.7773485796509647, + "language_loss": 0.88872612, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91002887, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 16070, + "time_per_iteration": 2.436710834503174 + }, + { + "auxiliary_loss_clip": 0.01102442, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.02048659, + "balance_loss_mlp": 1.03738046, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 1.7610963303021612, + "language_loss": 0.77342236, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.7947728, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 16071, + "time_per_iteration": 2.4870574474334717 + }, + { + "auxiliary_loss_clip": 0.01097326, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.01827133, + "balance_loss_mlp": 1.03247118, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.6878615394905503, + "language_loss": 0.66351175, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68479288, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 16072, + "time_per_iteration": 2.4131906032562256 + }, + { + "auxiliary_loss_clip": 0.01101893, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.01826262, + "balance_loss_mlp": 1.03541517, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.7378428887724273, + "language_loss": 0.77110088, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79241765, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16073, + "time_per_iteration": 2.4881250858306885 + }, + { + "auxiliary_loss_clip": 0.01103054, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.02058411, + "balance_loss_mlp": 1.03565359, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.637485372515987, + "language_loss": 0.75828785, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.77964711, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 16074, + "time_per_iteration": 2.4300355911254883 + }, + { + "auxiliary_loss_clip": 0.01101171, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.01741314, + "balance_loss_mlp": 1.03474593, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.48623421551312, + "language_loss": 0.75616717, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77747166, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 16075, + "time_per_iteration": 2.5188755989074707 + }, + { + "auxiliary_loss_clip": 0.01100287, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.01937151, + "balance_loss_mlp": 1.03515041, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 3.3539989887691215, + "language_loss": 0.78949571, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81080413, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 16076, + "time_per_iteration": 2.3969225883483887 + }, + { + "auxiliary_loss_clip": 0.01101671, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.02472758, + "balance_loss_mlp": 1.035146, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.5504558718428159, + "language_loss": 0.71859056, + "learning_rate": 1.166897413780532e-08, + "loss": 0.73997551, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 16077, + "time_per_iteration": 2.439351797103882 + }, + { + "auxiliary_loss_clip": 0.01098273, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.02069139, + "balance_loss_mlp": 1.03297472, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.7498332359336022, + "language_loss": 0.5911901, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61249584, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 16078, + "time_per_iteration": 2.4835944175720215 + }, + { + "auxiliary_loss_clip": 0.01102377, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.02033949, + "balance_loss_mlp": 1.03437603, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.7663356554518799, + "language_loss": 0.72015703, + "learning_rate": 1.158510609718899e-08, + "loss": 0.7415027, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 16079, + "time_per_iteration": 2.454651355743408 + }, + { + "auxiliary_loss_clip": 0.01095773, + "auxiliary_loss_mlp": 0.01027357, + "balance_loss_clip": 1.01631236, + "balance_loss_mlp": 1.03369761, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.765653495348509, + "language_loss": 0.7217977, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.743029, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62109375, + "step": 16080, + "time_per_iteration": 2.4750964641571045 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.0102682, + "balance_loss_clip": 1.01583505, + "balance_loss_mlp": 1.03450656, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 1.9225357739509432, + "language_loss": 0.73896688, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76022321, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 16081, + "time_per_iteration": 2.444805145263672 + }, + { + "auxiliary_loss_clip": 0.01097756, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.01485133, + "balance_loss_mlp": 1.03251266, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 2.1860138544574417, + "language_loss": 0.67122877, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69247544, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 16082, + "time_per_iteration": 2.478701591491699 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01560545, + "balance_loss_mlp": 1.03359115, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.476092866160406, + "language_loss": 0.76806712, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78930962, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 16083, + "time_per_iteration": 2.501891613006592 + }, + { + "auxiliary_loss_clip": 0.01099638, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.01670313, + "balance_loss_mlp": 1.03356194, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.7992097460517922, + "language_loss": 0.79434943, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81562805, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16084, + "time_per_iteration": 2.4433937072753906 + }, + { + "auxiliary_loss_clip": 0.01101573, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.01498938, + "balance_loss_mlp": 1.03329217, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.5576364134525105, + "language_loss": 0.67727828, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69856399, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 16085, + "time_per_iteration": 2.439408779144287 + }, + { + "auxiliary_loss_clip": 0.01104066, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01778781, + "balance_loss_mlp": 1.03539014, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 1.9103794202550979, + "language_loss": 0.68926775, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71061325, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16086, + "time_per_iteration": 2.543067693710327 + }, + { + "auxiliary_loss_clip": 0.01100289, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02028179, + "balance_loss_mlp": 1.03610826, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.4406826333266698, + "language_loss": 0.78265107, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80397993, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.640625, + "step": 16087, + "time_per_iteration": 2.479124069213867 + }, + { + "auxiliary_loss_clip": 0.01097717, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.014148, + "balance_loss_mlp": 1.03304863, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 2.902915851013034, + "language_loss": 0.71206176, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73328972, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 16088, + "time_per_iteration": 2.453108310699463 + }, + { + "auxiliary_loss_clip": 0.01097715, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01747155, + "balance_loss_mlp": 1.03437591, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.6805016946049898, + "language_loss": 0.70649052, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72774947, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 16089, + "time_per_iteration": 2.539914846420288 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01866364, + "balance_loss_mlp": 1.03477347, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.2645704786578604, + "language_loss": 0.74865729, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.76998407, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 16090, + "time_per_iteration": 2.593626022338867 + }, + { + "auxiliary_loss_clip": 0.01103105, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.01780951, + "balance_loss_mlp": 1.03354084, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.7972690157643232, + "language_loss": 0.69049466, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71182114, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 16091, + "time_per_iteration": 2.530670642852783 + }, + { + "auxiliary_loss_clip": 0.0109713, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01994467, + "balance_loss_mlp": 1.0330565, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.727007301138269, + "language_loss": 0.76661873, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.78791022, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.640625, + "step": 16092, + "time_per_iteration": 2.6132729053497314 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.01423764, + "balance_loss_mlp": 1.03612089, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.7194933349495616, + "language_loss": 0.76217842, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78343654, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 16093, + "time_per_iteration": 2.6711509227752686 + }, + { + "auxiliary_loss_clip": 0.01097824, + "auxiliary_loss_mlp": 0.01024564, + "balance_loss_clip": 1.01199961, + "balance_loss_mlp": 1.03344226, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.4864798894423341, + "language_loss": 0.68974423, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71096814, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.64453125, + "step": 16094, + "time_per_iteration": 2.576261043548584 + }, + { + "auxiliary_loss_clip": 0.01101022, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.01573682, + "balance_loss_mlp": 1.0338856, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.4390045199014274, + "language_loss": 0.75913978, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78041601, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.671875, + "step": 16095, + "time_per_iteration": 3.963588237762451 + }, + { + "auxiliary_loss_clip": 0.01104114, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.02028298, + "balance_loss_mlp": 1.035339, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 2.016309233770184, + "language_loss": 0.70449293, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72585666, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 16096, + "time_per_iteration": 2.521918773651123 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.01357687, + "balance_loss_mlp": 1.03581285, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 2.9772334686732624, + "language_loss": 0.71572793, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73699778, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16097, + "time_per_iteration": 4.2033936977386475 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01911056, + "balance_loss_mlp": 1.03425193, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.6763904743398519, + "language_loss": 0.77971011, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80100262, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16098, + "time_per_iteration": 3.855729341506958 + }, + { + "auxiliary_loss_clip": 0.01096588, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.01722848, + "balance_loss_mlp": 1.03283536, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 1.7301712267219669, + "language_loss": 0.90408123, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92532516, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 16099, + "time_per_iteration": 2.468384265899658 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.02081418, + "balance_loss_mlp": 1.03329253, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.6709265367884942, + "language_loss": 0.65798569, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.67931938, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 16100, + "time_per_iteration": 2.6282451152801514 + }, + { + "auxiliary_loss_clip": 0.01099773, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01882386, + "balance_loss_mlp": 1.03416276, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.474374726903324, + "language_loss": 0.73381197, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.7551142, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16101, + "time_per_iteration": 2.5105645656585693 + }, + { + "auxiliary_loss_clip": 0.01098487, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.01647615, + "balance_loss_mlp": 1.03371549, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.4963336382837327, + "language_loss": 0.73430026, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75556922, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 16102, + "time_per_iteration": 2.5259511470794678 + }, + { + "auxiliary_loss_clip": 0.01101802, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01956034, + "balance_loss_mlp": 1.03549552, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 2.31512304657473, + "language_loss": 0.77183741, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79317927, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 16103, + "time_per_iteration": 4.039667129516602 + }, + { + "auxiliary_loss_clip": 0.01099986, + "auxiliary_loss_mlp": 0.01024799, + "balance_loss_clip": 1.01388574, + "balance_loss_mlp": 1.03492332, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.5693653808008938, + "language_loss": 0.8058641, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82711196, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 16104, + "time_per_iteration": 2.581583261489868 + }, + { + "auxiliary_loss_clip": 0.010947, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.02028215, + "balance_loss_mlp": 1.03151107, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.888198110026545, + "language_loss": 0.77700287, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79825616, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6328125, + "step": 16105, + "time_per_iteration": 2.5465734004974365 + }, + { + "auxiliary_loss_clip": 0.01021561, + "auxiliary_loss_mlp": 0.01000898, + "balance_loss_clip": 0.99988431, + "balance_loss_mlp": 1.0015564, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8173661990945631, + "language_loss": 0.56672597, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58695054, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20019531, + "step": 16106, + "time_per_iteration": 3.134302854537964 + }, + { + "auxiliary_loss_clip": 0.01021505, + "auxiliary_loss_mlp": 0.00999876, + "balance_loss_clip": 0.99889243, + "balance_loss_mlp": 1.00162327, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8868946741274136, + "language_loss": 0.61609983, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63631362, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.19921875, + "step": 16107, + "time_per_iteration": 2.977184295654297 + }, + { + "auxiliary_loss_clip": 0.01101515, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.02146411, + "balance_loss_mlp": 1.03632665, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.080411016997331, + "language_loss": 0.73906231, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76042247, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.65234375, + "step": 16108, + "time_per_iteration": 2.506273031234741 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.01524949, + "balance_loss_mlp": 1.03544784, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.6207813838672194, + "language_loss": 0.56951755, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59080446, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16109, + "time_per_iteration": 2.483224868774414 + }, + { + "auxiliary_loss_clip": 0.01021716, + "auxiliary_loss_mlp": 0.01003704, + "balance_loss_clip": 1.00268459, + "balance_loss_mlp": 1.00173068, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6709491279205547, + "language_loss": 0.54244637, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56270063, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20019531, + "step": 16110, + "time_per_iteration": 3.065276861190796 + }, + { + "auxiliary_loss_clip": 0.0110392, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.0250361, + "balance_loss_mlp": 1.035496, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.3847067332829404, + "language_loss": 0.62662238, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64804053, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 16111, + "time_per_iteration": 2.6130480766296387 + }, + { + "auxiliary_loss_clip": 0.01096411, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.01571369, + "balance_loss_mlp": 1.03207159, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 3.5311052248737096, + "language_loss": 0.74400336, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76523018, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 16112, + "time_per_iteration": 2.4801442623138428 + }, + { + "auxiliary_loss_clip": 0.01095788, + "auxiliary_loss_mlp": 0.01026327, + "balance_loss_clip": 1.01544356, + "balance_loss_mlp": 1.03244877, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.6238571521164777, + "language_loss": 0.66553986, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68676102, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 16113, + "time_per_iteration": 2.4916794300079346 + }, + { + "auxiliary_loss_clip": 0.01021806, + "auxiliary_loss_mlp": 0.01000111, + "balance_loss_clip": 0.99910325, + "balance_loss_mlp": 1.00189781, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.6957044667296043, + "language_loss": 0.56507289, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.5852921, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 16114, + "time_per_iteration": 3.085864305496216 + }, + { + "auxiliary_loss_clip": 0.0109922, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.02025533, + "balance_loss_mlp": 1.03437066, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.941913365229189, + "language_loss": 0.82679498, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84810317, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16115, + "time_per_iteration": 2.423555374145508 + }, + { + "auxiliary_loss_clip": 0.01093975, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.01466632, + "balance_loss_mlp": 1.03341866, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.8462013242923505, + "language_loss": 0.72099042, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74218202, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.60546875, + "step": 16116, + "time_per_iteration": 2.4753947257995605 + }, + { + "auxiliary_loss_clip": 0.01101391, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.01981306, + "balance_loss_mlp": 1.03498697, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 1.9755515538352788, + "language_loss": 0.75565988, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.77699178, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 16117, + "time_per_iteration": 2.491583824157715 + }, + { + "auxiliary_loss_clip": 0.01100278, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.01930988, + "balance_loss_mlp": 1.0331347, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 1.8834984101771413, + "language_loss": 0.77285224, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79416931, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 16118, + "time_per_iteration": 2.5223984718322754 + }, + { + "auxiliary_loss_clip": 0.01102479, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01985121, + "balance_loss_mlp": 1.03636765, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.8607520753959634, + "language_loss": 0.78167307, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80300522, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 16119, + "time_per_iteration": 2.5595972537994385 + }, + { + "auxiliary_loss_clip": 0.01095474, + "auxiliary_loss_mlp": 0.01024928, + "balance_loss_clip": 1.01487851, + "balance_loss_mlp": 1.03318226, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.6605084045495015, + "language_loss": 0.75243753, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77364153, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.625, + "step": 16120, + "time_per_iteration": 2.6170387268066406 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.01028252, + "balance_loss_clip": 1.01705265, + "balance_loss_mlp": 1.03436828, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.8284557302359925, + "language_loss": 0.6938538, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71512389, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16121, + "time_per_iteration": 2.5049474239349365 + }, + { + "auxiliary_loss_clip": 0.01098484, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0194633, + "balance_loss_mlp": 1.03341174, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.8406321763279332, + "language_loss": 0.69080842, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71209669, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 16122, + "time_per_iteration": 2.5901401042938232 + }, + { + "auxiliary_loss_clip": 0.01098492, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.01409388, + "balance_loss_mlp": 1.03305304, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 2.0360794405813296, + "language_loss": 0.75851989, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77975118, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 16123, + "time_per_iteration": 2.5255179405212402 + }, + { + "auxiliary_loss_clip": 0.0109954, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0194335, + "balance_loss_mlp": 1.03278112, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.7443605692530082, + "language_loss": 0.74463332, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76594156, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 16124, + "time_per_iteration": 2.6022872924804688 + }, + { + "auxiliary_loss_clip": 0.01100209, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02369344, + "balance_loss_mlp": 1.03369021, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 2.339359667542991, + "language_loss": 0.73955059, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76090634, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 16125, + "time_per_iteration": 2.5527896881103516 + }, + { + "auxiliary_loss_clip": 0.01021717, + "auxiliary_loss_mlp": 0.00999829, + "balance_loss_clip": 0.99881619, + "balance_loss_mlp": 1.00180256, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.874354260024892, + "language_loss": 0.61542535, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63564086, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 16126, + "time_per_iteration": 3.06150484085083 + }, + { + "auxiliary_loss_clip": 0.01097857, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.02084327, + "balance_loss_mlp": 1.03425908, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 2.1736850591790065, + "language_loss": 0.74991131, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77120936, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16127, + "time_per_iteration": 2.525707721710205 + }, + { + "auxiliary_loss_clip": 0.01099234, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.0201714, + "balance_loss_mlp": 1.03222942, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.554917519282315, + "language_loss": 0.68819076, + "learning_rate": 9.62458290188839e-09, + "loss": 0.70950353, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16128, + "time_per_iteration": 2.5169262886047363 + }, + { + "auxiliary_loss_clip": 0.01099961, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.01845229, + "balance_loss_mlp": 1.03504729, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.5275283180861672, + "language_loss": 0.65348375, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67478549, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 16129, + "time_per_iteration": 2.6302707195281982 + }, + { + "auxiliary_loss_clip": 0.01103145, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.01782882, + "balance_loss_mlp": 1.03422213, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 3.3085134277813424, + "language_loss": 0.63307977, + "learning_rate": 9.548409599691166e-09, + "loss": 0.6544109, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 16130, + "time_per_iteration": 2.547057628631592 + }, + { + "auxiliary_loss_clip": 0.01103028, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.01538217, + "balance_loss_mlp": 1.03523135, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.5471865072726056, + "language_loss": 0.69608688, + "learning_rate": 9.510436165056867e-09, + "loss": 0.71738738, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 16131, + "time_per_iteration": 2.4412412643432617 + }, + { + "auxiliary_loss_clip": 0.01101007, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.0200696, + "balance_loss_mlp": 1.03424954, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 1.86472716215598, + "language_loss": 0.76548707, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78681588, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 16132, + "time_per_iteration": 2.5090508460998535 + }, + { + "auxiliary_loss_clip": 0.01102566, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.02364969, + "balance_loss_mlp": 1.03595448, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 3.9786212871576443, + "language_loss": 0.78581774, + "learning_rate": 9.434715735916477e-09, + "loss": 0.80719894, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16133, + "time_per_iteration": 2.4141860008239746 + }, + { + "auxiliary_loss_clip": 0.01095458, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.01899862, + "balance_loss_mlp": 1.03250551, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.6269378800137178, + "language_loss": 0.64601958, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66727304, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 16134, + "time_per_iteration": 2.532543182373047 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01761711, + "balance_loss_mlp": 1.03281355, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 1.9357465351704168, + "language_loss": 0.80777168, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82904708, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 16135, + "time_per_iteration": 2.4684133529663086 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.01883101, + "balance_loss_mlp": 1.03501081, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 2.1122625470948577, + "language_loss": 0.72945958, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75080359, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16136, + "time_per_iteration": 2.550011157989502 + }, + { + "auxiliary_loss_clip": 0.0109795, + "auxiliary_loss_mlp": 0.01026, + "balance_loss_clip": 1.01561141, + "balance_loss_mlp": 1.03390837, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.5366970636246593, + "language_loss": 0.76298726, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78422666, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 16137, + "time_per_iteration": 3.8292956352233887 + }, + { + "auxiliary_loss_clip": 0.01021726, + "auxiliary_loss_mlp": 0.01002432, + "balance_loss_clip": 1.00152612, + "balance_loss_mlp": 1.00163674, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 1.5028472325404159, + "language_loss": 0.54901278, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56925428, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20117188, + "step": 16138, + "time_per_iteration": 3.1473779678344727 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.02032351, + "balance_loss_mlp": 1.03277564, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.6714662487676892, + "language_loss": 0.70472324, + "learning_rate": 9.209366072632007e-09, + "loss": 0.72601748, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 16139, + "time_per_iteration": 5.382527828216553 + }, + { + "auxiliary_loss_clip": 0.01101757, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.01703477, + "balance_loss_mlp": 1.03570795, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.5984229449176695, + "language_loss": 0.72570795, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74701405, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 16140, + "time_per_iteration": 2.500535249710083 + }, + { + "auxiliary_loss_clip": 0.01104371, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.02412963, + "balance_loss_mlp": 1.03588057, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.360721566613716, + "language_loss": 0.67877102, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70017684, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 16141, + "time_per_iteration": 2.4640913009643555 + }, + { + "auxiliary_loss_clip": 0.01095646, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.01804554, + "balance_loss_mlp": 1.03266811, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 1.8701728936765305, + "language_loss": 0.68670142, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70795, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 16142, + "time_per_iteration": 2.477365493774414 + }, + { + "auxiliary_loss_clip": 0.0109703, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.03252649, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.7508245521385353, + "language_loss": 0.55955529, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58080494, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 16143, + "time_per_iteration": 2.5553791522979736 + }, + { + "auxiliary_loss_clip": 0.01100849, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01848042, + "balance_loss_mlp": 1.034536, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 1.9577876836122245, + "language_loss": 0.67899948, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70030308, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 16144, + "time_per_iteration": 4.970671892166138 + }, + { + "auxiliary_loss_clip": 0.01099463, + "auxiliary_loss_mlp": 0.01041949, + "balance_loss_clip": 1.03105295, + "balance_loss_mlp": 1.03441381, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.7212412330787912, + "language_loss": 0.71903557, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74044967, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 16145, + "time_per_iteration": 2.5942580699920654 + }, + { + "auxiliary_loss_clip": 0.01100216, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01734352, + "balance_loss_mlp": 1.03212011, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 2.8276003686197018, + "language_loss": 0.79872471, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82002687, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16146, + "time_per_iteration": 2.401989221572876 + }, + { + "auxiliary_loss_clip": 0.01021551, + "auxiliary_loss_mlp": 0.01001342, + "balance_loss_clip": 1.00041199, + "balance_loss_mlp": 1.00153255, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7626452621026454, + "language_loss": 0.54555905, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56578797, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20019531, + "step": 16147, + "time_per_iteration": 3.0902152061462402 + }, + { + "auxiliary_loss_clip": 0.01102002, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.01913333, + "balance_loss_mlp": 1.0341568, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 3.8974321244687964, + "language_loss": 0.61855692, + "learning_rate": 8.876437313434682e-09, + "loss": 0.63989353, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 16148, + "time_per_iteration": 2.464473247528076 + }, + { + "auxiliary_loss_clip": 0.01096857, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02500391, + "balance_loss_mlp": 1.03314471, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.6335067139956454, + "language_loss": 0.73529303, + "learning_rate": 8.839822728487155e-09, + "loss": 0.75662261, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16149, + "time_per_iteration": 2.4322702884674072 + }, + { + "auxiliary_loss_clip": 0.01097685, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.02248096, + "balance_loss_mlp": 1.03168344, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 3.032692446472873, + "language_loss": 0.75145626, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77277255, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 16150, + "time_per_iteration": 2.597921848297119 + }, + { + "auxiliary_loss_clip": 0.01107309, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.01678681, + "balance_loss_mlp": 1.03694558, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.1049306282297358, + "language_loss": 0.73670769, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75808269, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 16151, + "time_per_iteration": 2.422347068786621 + }, + { + "auxiliary_loss_clip": 0.01098403, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.01606655, + "balance_loss_mlp": 1.03450835, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.7013232135695202, + "language_loss": 0.74849296, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76975429, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 16152, + "time_per_iteration": 2.4549672603607178 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.02126741, + "balance_loss_mlp": 1.03373194, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.6070864987804534, + "language_loss": 0.66789192, + "learning_rate": 8.694119452473448e-09, + "loss": 0.68920273, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 16153, + "time_per_iteration": 2.4515061378479004 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.01902413, + "balance_loss_mlp": 1.03360021, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.5238206763450304, + "language_loss": 0.703457, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72474778, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 16154, + "time_per_iteration": 2.478994131088257 + }, + { + "auxiliary_loss_clip": 0.01103679, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01762736, + "balance_loss_mlp": 1.03658104, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 2.2051583809409507, + "language_loss": 0.8076309, + "learning_rate": 8.621720872059812e-09, + "loss": 0.82897151, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 16155, + "time_per_iteration": 2.4331750869750977 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.02448285, + "balance_loss_mlp": 1.0339818, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 2.1467214162660357, + "language_loss": 0.67530596, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69670147, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 16156, + "time_per_iteration": 2.41339111328125 + }, + { + "auxiliary_loss_clip": 0.01100256, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.02119589, + "balance_loss_mlp": 1.03306246, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.5197090759415994, + "language_loss": 0.90636677, + "learning_rate": 8.54962434469919e-09, + "loss": 0.92768943, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.671875, + "step": 16157, + "time_per_iteration": 2.470351219177246 + }, + { + "auxiliary_loss_clip": 0.01101295, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.0168134, + "balance_loss_mlp": 1.03504801, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.914722039633016, + "language_loss": 0.72404706, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74533689, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 16158, + "time_per_iteration": 2.4336278438568115 + }, + { + "auxiliary_loss_clip": 0.01098334, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.02518129, + "balance_loss_mlp": 1.03437686, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.239444291406118, + "language_loss": 0.60365427, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62500173, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 16159, + "time_per_iteration": 2.4120450019836426 + }, + { + "auxiliary_loss_clip": 0.01094573, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.01615119, + "balance_loss_mlp": 1.03301239, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.7275999406739457, + "language_loss": 0.78775787, + "learning_rate": 8.44204592704112e-09, + "loss": 0.80896652, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.61328125, + "step": 16160, + "time_per_iteration": 2.476292133331299 + }, + { + "auxiliary_loss_clip": 0.01021802, + "auxiliary_loss_mlp": 0.00997801, + "balance_loss_clip": 0.99682945, + "balance_loss_mlp": 1.00181723, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7688722643219095, + "language_loss": 0.54272866, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56292468, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16161, + "time_per_iteration": 3.047849655151367 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02122283, + "balance_loss_mlp": 1.0354929, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.7365034375945647, + "language_loss": 0.71583688, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73714256, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.625, + "step": 16162, + "time_per_iteration": 2.3995373249053955 + }, + { + "auxiliary_loss_clip": 0.0109594, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.01500595, + "balance_loss_mlp": 1.03175616, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.5073534732463694, + "language_loss": 0.7864207, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80765128, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.640625, + "step": 16163, + "time_per_iteration": 2.498755931854248 + }, + { + "auxiliary_loss_clip": 0.01097248, + "auxiliary_loss_mlp": 0.01023869, + "balance_loss_clip": 1.01309824, + "balance_loss_mlp": 1.03388119, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 2.0532833626708324, + "language_loss": 0.72809923, + "learning_rate": 8.299665324196903e-09, + "loss": 0.74931037, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 16164, + "time_per_iteration": 2.435837507247925 + }, + { + "auxiliary_loss_clip": 0.01100258, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.02304852, + "balance_loss_mlp": 1.03418469, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.27361632758078, + "language_loss": 0.84098649, + "learning_rate": 8.264258983809114e-09, + "loss": 0.86233294, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 16165, + "time_per_iteration": 2.416750907897949 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.01023556, + "balance_loss_clip": 1.01333344, + "balance_loss_mlp": 1.03371215, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.5641110975288823, + "language_loss": 0.79189312, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81310713, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 16166, + "time_per_iteration": 2.4303882122039795 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01025286, + "balance_loss_clip": 1.01418757, + "balance_loss_mlp": 1.03471613, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.6876740333466311, + "language_loss": 0.70820624, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72946215, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 16167, + "time_per_iteration": 2.394996166229248 + }, + { + "auxiliary_loss_clip": 0.01099049, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02233577, + "balance_loss_mlp": 1.03586316, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.4769510374268846, + "language_loss": 0.75561023, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77693301, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 16168, + "time_per_iteration": 2.4698002338409424 + }, + { + "auxiliary_loss_clip": 0.01101935, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.02172387, + "balance_loss_mlp": 1.03548265, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.5452954441624596, + "language_loss": 0.72678661, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74814385, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 16169, + "time_per_iteration": 2.4278223514556885 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.02016115, + "balance_loss_mlp": 1.03232074, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 1.704456285146014, + "language_loss": 0.57650185, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59783065, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 16170, + "time_per_iteration": 2.481048583984375 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.01628804, + "balance_loss_mlp": 1.03412509, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.7511216977437811, + "language_loss": 0.71781224, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73910493, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 16171, + "time_per_iteration": 2.434035301208496 + }, + { + "auxiliary_loss_clip": 0.01098692, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.02387321, + "balance_loss_mlp": 1.03350592, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.5945737465594831, + "language_loss": 0.684237, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70557481, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16172, + "time_per_iteration": 2.4013216495513916 + }, + { + "auxiliary_loss_clip": 0.01097294, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01489615, + "balance_loss_mlp": 1.03379488, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 1.7859971927682219, + "language_loss": 0.86250716, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88374066, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 16173, + "time_per_iteration": 2.462507486343384 + }, + { + "auxiliary_loss_clip": 0.01105205, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01646948, + "balance_loss_mlp": 1.03571761, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.8689935114845415, + "language_loss": 0.64200556, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66335481, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 16174, + "time_per_iteration": 2.4644393920898438 + }, + { + "auxiliary_loss_clip": 0.01098429, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01502836, + "balance_loss_mlp": 1.03431141, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.3755488882571432, + "language_loss": 0.77686203, + "learning_rate": 7.914349775085538e-09, + "loss": 0.79810601, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 16175, + "time_per_iteration": 2.4805030822753906 + }, + { + "auxiliary_loss_clip": 0.01098893, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01919389, + "balance_loss_mlp": 1.03381467, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 2.3866480046960525, + "language_loss": 0.56767201, + "learning_rate": 7.879774302919307e-09, + "loss": 0.58897483, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 16176, + "time_per_iteration": 2.4352569580078125 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01862288, + "balance_loss_mlp": 1.0360986, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.0671972006538066, + "language_loss": 0.72051632, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74181688, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 16177, + "time_per_iteration": 2.479685068130493 + }, + { + "auxiliary_loss_clip": 0.0109915, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01790774, + "balance_loss_mlp": 1.0325197, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.6206813566846388, + "language_loss": 0.68881011, + "learning_rate": 7.810849984090984e-09, + "loss": 0.71009654, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 16178, + "time_per_iteration": 3.8625214099884033 + }, + { + "auxiliary_loss_clip": 0.01100578, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02159894, + "balance_loss_mlp": 1.03372669, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.8858437543885507, + "language_loss": 0.67199779, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69333607, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 16179, + "time_per_iteration": 2.4749538898468018 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01961064, + "balance_loss_mlp": 1.0341022, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 2.0869429380245843, + "language_loss": 0.77196532, + "learning_rate": 7.742227841308624e-09, + "loss": 0.7932564, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 16180, + "time_per_iteration": 3.8608553409576416 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.0184679, + "balance_loss_mlp": 1.03368819, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.5558124846538366, + "language_loss": 0.76269901, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78402507, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 16181, + "time_per_iteration": 3.876532793045044 + }, + { + "auxiliary_loss_clip": 0.01097363, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.01892495, + "balance_loss_mlp": 1.0327394, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.486326372174707, + "language_loss": 0.63157636, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65285045, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 16182, + "time_per_iteration": 2.413458824157715 + }, + { + "auxiliary_loss_clip": 0.01101279, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.0226686, + "balance_loss_mlp": 1.03501475, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 1.6223683298394753, + "language_loss": 0.62082142, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64217269, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 16183, + "time_per_iteration": 2.4679312705993652 + }, + { + "auxiliary_loss_clip": 0.01097049, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.02033019, + "balance_loss_mlp": 1.0336585, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.61421361964316, + "language_loss": 0.77789152, + "learning_rate": 7.605890125470527e-09, + "loss": 0.79918599, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6328125, + "step": 16184, + "time_per_iteration": 2.443528652191162 + }, + { + "auxiliary_loss_clip": 0.01096093, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01678014, + "balance_loss_mlp": 1.03245926, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.093845903397055, + "language_loss": 0.79169863, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81294221, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 16185, + "time_per_iteration": 2.3952207565307617 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.01785386, + "balance_loss_mlp": 1.03438795, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 1.7127735335933876, + "language_loss": 0.77540267, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79669875, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 16186, + "time_per_iteration": 3.935059070587158 + }, + { + "auxiliary_loss_clip": 0.01096754, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.01746225, + "balance_loss_mlp": 1.03295803, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.5472928038095195, + "language_loss": 0.65344584, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67470491, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 16187, + "time_per_iteration": 2.4246881008148193 + }, + { + "auxiliary_loss_clip": 0.01096472, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.01973534, + "balance_loss_mlp": 1.03258657, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.6847924527129516, + "language_loss": 0.80288476, + "learning_rate": 7.47076123812418e-09, + "loss": 0.8241564, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 16188, + "time_per_iteration": 2.5142602920532227 + }, + { + "auxiliary_loss_clip": 0.01095375, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.01564157, + "balance_loss_mlp": 1.03211975, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.8691744941970168, + "language_loss": 0.78207177, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80328584, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 16189, + "time_per_iteration": 2.514826536178589 + }, + { + "auxiliary_loss_clip": 0.01096117, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.01595116, + "balance_loss_mlp": 1.03183913, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.7703842346307654, + "language_loss": 0.5137412, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53497809, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 16190, + "time_per_iteration": 2.5810770988464355 + }, + { + "auxiliary_loss_clip": 0.01099538, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01520967, + "balance_loss_mlp": 1.03426313, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.6597335248752023, + "language_loss": 0.80833918, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.82959783, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 16191, + "time_per_iteration": 2.429949998855591 + }, + { + "auxiliary_loss_clip": 0.01096727, + "auxiliary_loss_mlp": 0.01027303, + "balance_loss_clip": 1.01659191, + "balance_loss_mlp": 1.03181481, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.762042207505243, + "language_loss": 0.82737201, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84861231, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 16192, + "time_per_iteration": 2.4036996364593506 + }, + { + "auxiliary_loss_clip": 0.0110223, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.0156033, + "balance_loss_mlp": 1.0369916, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.7584855932220518, + "language_loss": 0.75289583, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77418661, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16193, + "time_per_iteration": 2.4855713844299316 + }, + { + "auxiliary_loss_clip": 0.01096028, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02176595, + "balance_loss_mlp": 1.03300667, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.7149640266068487, + "language_loss": 0.85318899, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87447321, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 16194, + "time_per_iteration": 2.447998523712158 + }, + { + "auxiliary_loss_clip": 0.01098878, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.020082, + "balance_loss_mlp": 1.03576303, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.502245025606998, + "language_loss": 0.75605994, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77736747, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 16195, + "time_per_iteration": 2.4004433155059814 + }, + { + "auxiliary_loss_clip": 0.010217, + "auxiliary_loss_mlp": 0.00999257, + "balance_loss_clip": 0.99829692, + "balance_loss_mlp": 1.00176632, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7095880579147238, + "language_loss": 0.52472728, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54493684, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.19921875, + "step": 16196, + "time_per_iteration": 2.985597610473633 + }, + { + "auxiliary_loss_clip": 0.01098243, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.0176909, + "balance_loss_mlp": 1.03375459, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 2.152959147231462, + "language_loss": 0.76202309, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78329599, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16197, + "time_per_iteration": 2.5381948947906494 + }, + { + "auxiliary_loss_clip": 0.01101638, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.01759267, + "balance_loss_mlp": 1.03324556, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.689607432579003, + "language_loss": 0.67603827, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69734848, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 16198, + "time_per_iteration": 2.473280191421509 + }, + { + "auxiliary_loss_clip": 0.01098612, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.0147835, + "balance_loss_mlp": 1.03299856, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.7654996276877126, + "language_loss": 0.7798543, + "learning_rate": 7.10539048654768e-09, + "loss": 0.80110532, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 16199, + "time_per_iteration": 2.4409027099609375 + }, + { + "auxiliary_loss_clip": 0.0109881, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.01776588, + "balance_loss_mlp": 1.03432035, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.6572282578499644, + "language_loss": 0.79276037, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81404197, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 16200, + "time_per_iteration": 2.4363887310028076 + }, + { + "auxiliary_loss_clip": 0.01105141, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.02201521, + "balance_loss_mlp": 1.0352422, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.133720131745559, + "language_loss": 0.68253577, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70392972, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 16201, + "time_per_iteration": 2.4559359550476074 + }, + { + "auxiliary_loss_clip": 0.01098547, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.01724362, + "balance_loss_mlp": 1.03254795, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.4435582632035373, + "language_loss": 0.7252574, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74653184, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16202, + "time_per_iteration": 2.464578866958618 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.0178498, + "balance_loss_mlp": 1.03444588, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.6014538346880083, + "language_loss": 0.72974175, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75105143, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 16203, + "time_per_iteration": 2.439260959625244 + }, + { + "auxiliary_loss_clip": 0.01100295, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01812792, + "balance_loss_mlp": 1.03454626, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.6487633714089436, + "language_loss": 0.77325201, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79455173, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 16204, + "time_per_iteration": 2.4581048488616943 + }, + { + "auxiliary_loss_clip": 0.0110164, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.0203265, + "balance_loss_mlp": 1.03502774, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 2.45174500530448, + "language_loss": 0.79808879, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81943041, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 16205, + "time_per_iteration": 2.419422149658203 + }, + { + "auxiliary_loss_clip": 0.01098434, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01800013, + "balance_loss_mlp": 1.034006, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.5139645709473997, + "language_loss": 0.74249279, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76376915, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16206, + "time_per_iteration": 2.495774030685425 + }, + { + "auxiliary_loss_clip": 0.0109835, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.0172745, + "balance_loss_mlp": 1.03183436, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.9366709312982087, + "language_loss": 0.84325778, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86453605, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 16207, + "time_per_iteration": 2.4933202266693115 + }, + { + "auxiliary_loss_clip": 0.0109755, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.0196625, + "balance_loss_mlp": 1.0328232, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.5653170726435226, + "language_loss": 0.70784497, + "learning_rate": 6.813252072591425e-09, + "loss": 0.7291308, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 16208, + "time_per_iteration": 2.504995822906494 + }, + { + "auxiliary_loss_clip": 0.0109343, + "auxiliary_loss_mlp": 0.01022533, + "balance_loss_clip": 1.01287138, + "balance_loss_mlp": 1.03384209, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.828450111947416, + "language_loss": 0.77404773, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79520738, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.59765625, + "step": 16209, + "time_per_iteration": 2.4571237564086914 + }, + { + "auxiliary_loss_clip": 0.01100923, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.01828933, + "balance_loss_mlp": 1.03378117, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.6228002539298978, + "language_loss": 0.78707743, + "learning_rate": 6.749163793864144e-09, + "loss": 0.80838501, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 16210, + "time_per_iteration": 2.4974353313446045 + }, + { + "auxiliary_loss_clip": 0.01099743, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02377987, + "balance_loss_mlp": 1.03362608, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.5001536656047536, + "language_loss": 0.78155959, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80290616, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 16211, + "time_per_iteration": 2.7016804218292236 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.01664877, + "balance_loss_mlp": 1.03550124, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.0555509454208583, + "language_loss": 0.78118324, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80252516, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 16212, + "time_per_iteration": 2.567605495452881 + }, + { + "auxiliary_loss_clip": 0.01097708, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.02040458, + "balance_loss_mlp": 1.03407741, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.6196122257396004, + "language_loss": 0.80419701, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82548684, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 16213, + "time_per_iteration": 2.589813709259033 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01024364, + "balance_loss_clip": 1.01280618, + "balance_loss_mlp": 1.03220224, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 2.0508933879729083, + "language_loss": 0.6626724, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68389475, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 16214, + "time_per_iteration": 2.497565746307373 + }, + { + "auxiliary_loss_clip": 0.01101576, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.01662934, + "balance_loss_mlp": 1.03467321, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.967600227233748, + "language_loss": 0.74463314, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76593733, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 16215, + "time_per_iteration": 2.4592413902282715 + }, + { + "auxiliary_loss_clip": 0.01097336, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.01774275, + "balance_loss_mlp": 1.0326885, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.7764212034166489, + "language_loss": 0.67058563, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69185102, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 16216, + "time_per_iteration": 2.5700504779815674 + }, + { + "auxiliary_loss_clip": 0.01101316, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01882648, + "balance_loss_mlp": 1.03412616, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.9686853012013303, + "language_loss": 0.71478593, + "learning_rate": 6.527235786226937e-09, + "loss": 0.7361089, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 16217, + "time_per_iteration": 2.426276922225952 + }, + { + "auxiliary_loss_clip": 0.01098896, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.01612353, + "balance_loss_mlp": 1.03396559, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6447941805042985, + "language_loss": 0.78255022, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80381596, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 16218, + "time_per_iteration": 2.473839282989502 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.0179069, + "balance_loss_mlp": 1.03256178, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.5361788769162237, + "language_loss": 0.7754612, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79672432, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 16219, + "time_per_iteration": 2.432041645050049 + }, + { + "auxiliary_loss_clip": 0.01101469, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.0201565, + "balance_loss_mlp": 1.03447962, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.630736434232842, + "language_loss": 0.81259847, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83392882, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 16220, + "time_per_iteration": 3.9300906658172607 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.01427376, + "balance_loss_mlp": 1.03236842, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.8176068692721052, + "language_loss": 0.74883264, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77004945, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 16221, + "time_per_iteration": 2.437368869781494 + }, + { + "auxiliary_loss_clip": 0.01095184, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.01846254, + "balance_loss_mlp": 1.03186214, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.7853777713397307, + "language_loss": 0.66434538, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68560052, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6328125, + "step": 16222, + "time_per_iteration": 3.9962925910949707 + }, + { + "auxiliary_loss_clip": 0.01097085, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01823103, + "balance_loss_mlp": 1.03272462, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.7075208088690872, + "language_loss": 0.87882102, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90008616, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16223, + "time_per_iteration": 3.828974723815918 + }, + { + "auxiliary_loss_clip": 0.01096799, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01694274, + "balance_loss_mlp": 1.03273821, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.585066530051905, + "language_loss": 0.74491924, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76616585, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 16224, + "time_per_iteration": 2.4636449813842773 + }, + { + "auxiliary_loss_clip": 0.01021478, + "auxiliary_loss_mlp": 0.01000107, + "balance_loss_clip": 0.99912339, + "balance_loss_mlp": 1.00153255, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8110453726438787, + "language_loss": 0.59165817, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61187404, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.19921875, + "step": 16225, + "time_per_iteration": 3.0413310527801514 + }, + { + "auxiliary_loss_clip": 0.0109838, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.02271152, + "balance_loss_mlp": 1.03436458, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.9025953600544858, + "language_loss": 0.68856502, + "learning_rate": 6.247342505960818e-09, + "loss": 0.7098856, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 16226, + "time_per_iteration": 2.4803082942962646 + }, + { + "auxiliary_loss_clip": 0.01099306, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.02476442, + "balance_loss_mlp": 1.0345875, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.6327416633061216, + "language_loss": 0.82874024, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85009849, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 16227, + "time_per_iteration": 2.4154109954833984 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01023816, + "balance_loss_clip": 1.01299191, + "balance_loss_mlp": 1.03345346, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.7854759830371416, + "language_loss": 0.78148073, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80269676, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 16228, + "time_per_iteration": 3.987762451171875 + }, + { + "auxiliary_loss_clip": 0.01021739, + "auxiliary_loss_mlp": 0.01001601, + "balance_loss_clip": 1.00065923, + "balance_loss_mlp": 1.00168419, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8430180611412167, + "language_loss": 0.55817699, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57841039, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20117188, + "step": 16229, + "time_per_iteration": 2.996128797531128 + }, + { + "auxiliary_loss_clip": 0.0109885, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03362441, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.8129769312332171, + "language_loss": 0.74995404, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77122796, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 16230, + "time_per_iteration": 2.4800798892974854 + }, + { + "auxiliary_loss_clip": 0.01095174, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.02002525, + "balance_loss_mlp": 1.03384256, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.922364806166091, + "language_loss": 0.71574152, + "learning_rate": 6.094492299733245e-09, + "loss": 0.73699963, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.61328125, + "step": 16231, + "time_per_iteration": 2.4648971557617188 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.01759779, + "balance_loss_mlp": 1.03584027, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 1.9642211900856055, + "language_loss": 0.76472759, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78605187, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 16232, + "time_per_iteration": 2.526571035385132 + }, + { + "auxiliary_loss_clip": 0.01021867, + "auxiliary_loss_mlp": 0.00998904, + "balance_loss_clip": 0.99791414, + "balance_loss_mlp": 1.00185037, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7369935606950053, + "language_loss": 0.5375663, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55777407, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20019531, + "step": 16233, + "time_per_iteration": 2.921182632446289 + }, + { + "auxiliary_loss_clip": 0.01097578, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.01871395, + "balance_loss_mlp": 1.03226352, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.641864888997356, + "language_loss": 0.71351594, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73479629, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 16234, + "time_per_iteration": 2.4427335262298584 + }, + { + "auxiliary_loss_clip": 0.01101418, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.01523733, + "balance_loss_mlp": 1.03317451, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.664507778412489, + "language_loss": 0.79045486, + "learning_rate": 5.973573091493156e-09, + "loss": 0.8117435, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 16235, + "time_per_iteration": 2.3964405059814453 + }, + { + "auxiliary_loss_clip": 0.01098094, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.01980996, + "balance_loss_mlp": 1.03295994, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.9655441572234078, + "language_loss": 0.76884139, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79014766, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.65234375, + "step": 16236, + "time_per_iteration": 2.4456753730773926 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.01026086, + "balance_loss_clip": 1.0151124, + "balance_loss_mlp": 1.03347445, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.6942809100848069, + "language_loss": 0.75669736, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77793556, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 16237, + "time_per_iteration": 2.4447457790374756 + }, + { + "auxiliary_loss_clip": 0.01098982, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.01958013, + "balance_loss_mlp": 1.0322814, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.7060238297486066, + "language_loss": 0.72860193, + "learning_rate": 5.8836776249509e-09, + "loss": 0.74991488, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 16238, + "time_per_iteration": 2.4894490242004395 + }, + { + "auxiliary_loss_clip": 0.01099347, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.01638234, + "balance_loss_mlp": 1.03404987, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.001677162599297, + "language_loss": 0.83721536, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85849392, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 16239, + "time_per_iteration": 2.416748285293579 + }, + { + "auxiliary_loss_clip": 0.01100769, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.017277, + "balance_loss_mlp": 1.03516018, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 5.101623514128856, + "language_loss": 0.59312123, + "learning_rate": 5.824125397483115e-09, + "loss": 0.61442178, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 16240, + "time_per_iteration": 2.459441661834717 + }, + { + "auxiliary_loss_clip": 0.01099723, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.01747513, + "balance_loss_mlp": 1.03588104, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 1.7088989821507206, + "language_loss": 0.82588184, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84716713, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 16241, + "time_per_iteration": 2.4329097270965576 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.01948154, + "balance_loss_mlp": 1.03410602, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.704721207895292, + "language_loss": 0.83693302, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85822928, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 16242, + "time_per_iteration": 2.4511871337890625 + }, + { + "auxiliary_loss_clip": 0.01100525, + "auxiliary_loss_mlp": 0.01025666, + "balance_loss_clip": 1.01453757, + "balance_loss_mlp": 1.03459525, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.7182851933408332, + "language_loss": 0.7538594, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77512127, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 16243, + "time_per_iteration": 2.4299659729003906 + }, + { + "auxiliary_loss_clip": 0.01098903, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.02515912, + "balance_loss_mlp": 1.03315616, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.7738669375659515, + "language_loss": 0.69590539, + "learning_rate": 5.705928383713754e-09, + "loss": 0.71726656, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 16244, + "time_per_iteration": 2.440574884414673 + }, + { + "auxiliary_loss_clip": 0.01102847, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01634228, + "balance_loss_mlp": 1.0365603, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.8209615931858283, + "language_loss": 0.83484882, + "learning_rate": 5.676568187055197e-09, + "loss": 0.8561638, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 16245, + "time_per_iteration": 2.491964340209961 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.01445746, + "balance_loss_mlp": 1.03261781, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.3301173974373028, + "language_loss": 0.78354228, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80475634, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 16246, + "time_per_iteration": 2.439201831817627 + }, + { + "auxiliary_loss_clip": 0.01092456, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.01759207, + "balance_loss_mlp": 1.03315675, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4733048814539074, + "language_loss": 0.73865449, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.75985444, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.59375, + "step": 16247, + "time_per_iteration": 2.414113759994507 + }, + { + "auxiliary_loss_clip": 0.01099436, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.03369868, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.5425114729506917, + "language_loss": 0.79912806, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82044744, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 16248, + "time_per_iteration": 2.5048165321350098 + }, + { + "auxiliary_loss_clip": 0.01106324, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.02789831, + "balance_loss_mlp": 1.03783882, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 2.093419696491283, + "language_loss": 0.79174924, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81321901, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 16249, + "time_per_iteration": 2.5967299938201904 + }, + { + "auxiliary_loss_clip": 0.01096074, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.01956034, + "balance_loss_mlp": 1.0337956, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 2.0929836799102626, + "language_loss": 0.66912627, + "learning_rate": 5.530901600093507e-09, + "loss": 0.69039845, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.625, + "step": 16250, + "time_per_iteration": 2.4212594032287598 + }, + { + "auxiliary_loss_clip": 0.01021381, + "auxiliary_loss_mlp": 0.01003741, + "balance_loss_clip": 1.00278687, + "balance_loss_mlp": 1.00140762, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.775802092014466, + "language_loss": 0.59881055, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61906171, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 16251, + "time_per_iteration": 3.131605863571167 + }, + { + "auxiliary_loss_clip": 0.010985, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.01706934, + "balance_loss_mlp": 1.03328323, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.8145393283670994, + "language_loss": 0.78657669, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80784655, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16252, + "time_per_iteration": 2.491278886795044 + }, + { + "auxiliary_loss_clip": 0.0109682, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.01803493, + "balance_loss_mlp": 1.03293729, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.687078327620969, + "language_loss": 0.64509666, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66635859, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 16253, + "time_per_iteration": 2.489243507385254 + }, + { + "auxiliary_loss_clip": 0.01102295, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.0163399, + "balance_loss_mlp": 1.03592074, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 1.9709127512699236, + "language_loss": 0.76572144, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78703684, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66015625, + "step": 16254, + "time_per_iteration": 2.4342849254608154 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.019485, + "balance_loss_mlp": 1.03458083, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.9349490825165467, + "language_loss": 0.64068961, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66202497, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 16255, + "time_per_iteration": 2.597590446472168 + }, + { + "auxiliary_loss_clip": 0.01103968, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.03523302, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.5707961835740387, + "language_loss": 0.75804067, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77940643, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16256, + "time_per_iteration": 2.464634895324707 + }, + { + "auxiliary_loss_clip": 0.01098529, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01894903, + "balance_loss_mlp": 1.03593969, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 3.747088169064556, + "language_loss": 0.77749127, + "learning_rate": 5.330144888357369e-09, + "loss": 0.7987873, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.625, + "step": 16257, + "time_per_iteration": 2.530625820159912 + }, + { + "auxiliary_loss_clip": 0.0109999, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.0184536, + "balance_loss_mlp": 1.03522408, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.717206349978081, + "language_loss": 0.75214601, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77344555, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 16258, + "time_per_iteration": 2.5173375606536865 + }, + { + "auxiliary_loss_clip": 0.01021907, + "auxiliary_loss_mlp": 0.0100158, + "balance_loss_clip": 1.00060833, + "balance_loss_mlp": 1.00197577, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6857941213607871, + "language_loss": 0.59782362, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61805856, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16259, + "time_per_iteration": 3.1181235313415527 + }, + { + "auxiliary_loss_clip": 0.01103425, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.01912701, + "balance_loss_mlp": 1.03543591, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.7500488558402083, + "language_loss": 0.7345553, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75590253, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 16260, + "time_per_iteration": 2.5194666385650635 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.02000213, + "balance_loss_mlp": 1.0345335, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.9560228584534347, + "language_loss": 0.79390025, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81522614, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 16261, + "time_per_iteration": 2.494131088256836 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.0102413, + "balance_loss_clip": 1.01245975, + "balance_loss_mlp": 1.03458381, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.38180088162508, + "language_loss": 0.74037927, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76162702, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16262, + "time_per_iteration": 4.020437240600586 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01671231, + "balance_loss_mlp": 1.03477502, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.209424338913731, + "language_loss": 0.700001, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72131789, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 16263, + "time_per_iteration": 2.6097006797790527 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01026256, + "balance_loss_clip": 1.01556909, + "balance_loss_mlp": 1.03478503, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.690080180410736, + "language_loss": 0.66416574, + "learning_rate": 5.133094442018038e-09, + "loss": 0.6854369, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 16264, + "time_per_iteration": 5.414909362792969 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.018942, + "balance_loss_mlp": 1.03560305, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 2.017252489595847, + "language_loss": 0.72986895, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75123918, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69140625, + "step": 16265, + "time_per_iteration": 2.4533677101135254 + }, + { + "auxiliary_loss_clip": 0.01095698, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.01807976, + "balance_loss_mlp": 1.03230691, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.8239082705051328, + "language_loss": 0.68785274, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70910293, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 16266, + "time_per_iteration": 2.5144505500793457 + }, + { + "auxiliary_loss_clip": 0.01096607, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.01792407, + "balance_loss_mlp": 1.03445101, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.6346367496457415, + "language_loss": 0.86829478, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88954347, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62109375, + "step": 16267, + "time_per_iteration": 2.467357635498047 + }, + { + "auxiliary_loss_clip": 0.0110114, + "auxiliary_loss_mlp": 0.01027299, + "balance_loss_clip": 1.01483595, + "balance_loss_mlp": 1.03381491, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.7912126481892603, + "language_loss": 0.70019847, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72148287, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 16268, + "time_per_iteration": 2.611461639404297 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01431131, + "balance_loss_mlp": 1.03440547, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.8029387675380926, + "language_loss": 0.73841709, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75967419, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 16269, + "time_per_iteration": 2.492509365081787 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.0153085, + "balance_loss_mlp": 1.03405917, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.7343347922609937, + "language_loss": 0.70707202, + "learning_rate": 4.967144221869501e-09, + "loss": 0.72835386, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 16270, + "time_per_iteration": 4.16719651222229 + }, + { + "auxiliary_loss_clip": 0.01100199, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0187391, + "balance_loss_mlp": 1.0348208, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.6302831298633103, + "language_loss": 0.63994282, + "learning_rate": 4.939750627212191e-09, + "loss": 0.6612463, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16271, + "time_per_iteration": 2.630716562271118 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.01719773, + "balance_loss_mlp": 1.03479195, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.4434562656033578, + "language_loss": 0.70372558, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72498953, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 16272, + "time_per_iteration": 2.5594773292541504 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.02101922, + "balance_loss_mlp": 1.03539801, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 2.7207036655043733, + "language_loss": 0.66597646, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68733323, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 16273, + "time_per_iteration": 2.5560595989227295 + }, + { + "auxiliary_loss_clip": 0.01099094, + "auxiliary_loss_mlp": 0.01023905, + "balance_loss_clip": 1.01173985, + "balance_loss_mlp": 1.03361404, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.8374122302756553, + "language_loss": 0.74000204, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76123202, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 16274, + "time_per_iteration": 2.5694611072540283 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01815104, + "balance_loss_mlp": 1.03633726, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.7449518961905144, + "language_loss": 0.7771135, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79841614, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 16275, + "time_per_iteration": 2.5744869709014893 + }, + { + "auxiliary_loss_clip": 0.0110169, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.01628375, + "balance_loss_mlp": 1.03521442, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.4726802158178436, + "language_loss": 0.70957249, + "learning_rate": 4.803917467869567e-09, + "loss": 0.73087335, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 16276, + "time_per_iteration": 2.72546648979187 + }, + { + "auxiliary_loss_clip": 0.01095364, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.01873851, + "balance_loss_mlp": 1.03249002, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.8218394979557164, + "language_loss": 0.859927, + "learning_rate": 4.776977806000726e-09, + "loss": 0.88117933, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 16277, + "time_per_iteration": 2.542083740234375 + }, + { + "auxiliary_loss_clip": 0.01098208, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01536143, + "balance_loss_mlp": 1.0346185, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.852013929689249, + "language_loss": 0.70972097, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73097163, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 16278, + "time_per_iteration": 2.493483066558838 + }, + { + "auxiliary_loss_clip": 0.01096145, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.01613855, + "balance_loss_mlp": 1.03165531, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.8162844777370935, + "language_loss": 0.84460557, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86584687, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 16279, + "time_per_iteration": 2.6132097244262695 + }, + { + "auxiliary_loss_clip": 0.01098514, + "auxiliary_loss_mlp": 0.01029625, + "balance_loss_clip": 1.01753092, + "balance_loss_mlp": 1.03278434, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 1.8612686985344382, + "language_loss": 0.78869414, + "learning_rate": 4.696612778808395e-09, + "loss": 0.8099755, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 16280, + "time_per_iteration": 2.5324976444244385 + }, + { + "auxiliary_loss_clip": 0.01096797, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.02069163, + "balance_loss_mlp": 1.03460717, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.7034724956942773, + "language_loss": 0.7950545, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81633806, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62109375, + "step": 16281, + "time_per_iteration": 2.543025493621826 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.01827884, + "balance_loss_mlp": 1.03329802, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.6099254109579124, + "language_loss": 0.80462193, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82591969, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 16282, + "time_per_iteration": 2.525231122970581 + }, + { + "auxiliary_loss_clip": 0.01100012, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02403867, + "balance_loss_mlp": 1.03417039, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 2.3091950927100813, + "language_loss": 0.83399373, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85535228, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 16283, + "time_per_iteration": 2.5172736644744873 + }, + { + "auxiliary_loss_clip": 0.01098196, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.01948416, + "balance_loss_mlp": 1.03322864, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.8868446346869174, + "language_loss": 0.7178874, + "learning_rate": 4.590518683360134e-09, + "loss": 0.73917985, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 16284, + "time_per_iteration": 2.4635121822357178 + }, + { + "auxiliary_loss_clip": 0.01098539, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.02436984, + "balance_loss_mlp": 1.03568172, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.8562252978598333, + "language_loss": 0.64642346, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66775858, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 16285, + "time_per_iteration": 2.4629716873168945 + }, + { + "auxiliary_loss_clip": 0.0109894, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.01941144, + "balance_loss_mlp": 1.03462458, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.979309410905623, + "language_loss": 0.70627666, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72757423, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 16286, + "time_per_iteration": 2.519150972366333 + }, + { + "auxiliary_loss_clip": 0.01095816, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01952446, + "balance_loss_mlp": 1.03219485, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.38347830602051, + "language_loss": 0.58299065, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60425282, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 16287, + "time_per_iteration": 2.5372016429901123 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02208781, + "balance_loss_mlp": 1.03487289, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.7701406115909017, + "language_loss": 0.81316799, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83450794, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 16288, + "time_per_iteration": 2.513692617416382 + }, + { + "auxiliary_loss_clip": 0.01098614, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01289308, + "balance_loss_mlp": 1.03259087, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.490724806273456, + "language_loss": 0.71809161, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73932338, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16289, + "time_per_iteration": 2.5497584342956543 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.02502751, + "balance_loss_mlp": 1.03522253, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.421114759913103, + "language_loss": 0.7523073, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77366757, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 16290, + "time_per_iteration": 2.478057861328125 + }, + { + "auxiliary_loss_clip": 0.01100685, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.03389645, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.6419877933765765, + "language_loss": 0.67298269, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69430792, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 16291, + "time_per_iteration": 2.508269786834717 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.01628256, + "balance_loss_mlp": 1.03450608, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.7353074100910213, + "language_loss": 0.62683344, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64814121, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 16292, + "time_per_iteration": 2.581599235534668 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.02097225, + "balance_loss_mlp": 1.03556764, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 2.0682160456993226, + "language_loss": 0.73132885, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75265968, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 16293, + "time_per_iteration": 2.483751058578491 + }, + { + "auxiliary_loss_clip": 0.01100266, + "auxiliary_loss_mlp": 0.01027342, + "balance_loss_clip": 1.01558185, + "balance_loss_mlp": 1.03350306, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 2.0559306956335948, + "language_loss": 0.83788204, + "learning_rate": 4.330580212414503e-09, + "loss": 0.85915816, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 16294, + "time_per_iteration": 2.550323009490967 + }, + { + "auxiliary_loss_clip": 0.01095885, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.01821804, + "balance_loss_mlp": 1.03391075, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.290419249779841, + "language_loss": 0.71717238, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73842019, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 16295, + "time_per_iteration": 2.4508378505706787 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.02547288, + "balance_loss_mlp": 1.03634858, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.6649015681944959, + "language_loss": 0.80663395, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82805789, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 16296, + "time_per_iteration": 2.5006003379821777 + }, + { + "auxiliary_loss_clip": 0.01096989, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.02125096, + "balance_loss_mlp": 1.03309369, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.7126573036341362, + "language_loss": 0.75474179, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77603638, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 16297, + "time_per_iteration": 2.536893844604492 + }, + { + "auxiliary_loss_clip": 0.01102165, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.02261257, + "balance_loss_mlp": 1.03400278, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.6213586947116383, + "language_loss": 0.78397214, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80533516, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 16298, + "time_per_iteration": 2.525702953338623 + }, + { + "auxiliary_loss_clip": 0.01094464, + "auxiliary_loss_mlp": 0.01024465, + "balance_loss_clip": 1.01362884, + "balance_loss_mlp": 1.03258538, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.4638188813410706, + "language_loss": 0.72470737, + "learning_rate": 4.203448764984019e-09, + "loss": 0.7458967, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 16299, + "time_per_iteration": 2.480396032333374 + }, + { + "auxiliary_loss_clip": 0.01100875, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.03388453, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 1.9941262161166102, + "language_loss": 0.89518666, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91651738, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 16300, + "time_per_iteration": 2.4887144565582275 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01779962, + "balance_loss_mlp": 1.03375816, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.0314895800326758, + "language_loss": 0.77960867, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80091715, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 16301, + "time_per_iteration": 2.5138540267944336 + }, + { + "auxiliary_loss_clip": 0.01100158, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02306259, + "balance_loss_mlp": 1.03404641, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.000398501552176, + "language_loss": 0.75482309, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77617979, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 16302, + "time_per_iteration": 2.4900062084198 + }, + { + "auxiliary_loss_clip": 0.01098806, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.01688099, + "balance_loss_mlp": 1.03401518, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.6848878091694153, + "language_loss": 0.79394841, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81522572, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 16303, + "time_per_iteration": 4.0379838943481445 + }, + { + "auxiliary_loss_clip": 0.01102546, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.01812756, + "balance_loss_mlp": 1.03442729, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.015058455965645, + "language_loss": 0.82887793, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85020542, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 16304, + "time_per_iteration": 2.5293853282928467 + }, + { + "auxiliary_loss_clip": 0.01095069, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.0165664, + "balance_loss_mlp": 1.03351498, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.8806572396287222, + "language_loss": 0.70294923, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72416955, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6171875, + "step": 16305, + "time_per_iteration": 3.9920012950897217 + }, + { + "auxiliary_loss_clip": 0.01101609, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.01850629, + "balance_loss_mlp": 1.03499484, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 2.542876871636166, + "language_loss": 0.71830386, + "learning_rate": 4.028643358815032e-09, + "loss": 0.73962104, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 16306, + "time_per_iteration": 3.8759777545928955 + }, + { + "auxiliary_loss_clip": 0.01094312, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01834977, + "balance_loss_mlp": 1.03180003, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.6763796973864105, + "language_loss": 0.73249525, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75372672, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 16307, + "time_per_iteration": 2.4559872150421143 + }, + { + "auxiliary_loss_clip": 0.01094645, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.02071619, + "balance_loss_mlp": 1.03398371, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.4089016879713172, + "language_loss": 0.74952251, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77077764, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.609375, + "step": 16308, + "time_per_iteration": 2.4483864307403564 + }, + { + "auxiliary_loss_clip": 0.01021734, + "auxiliary_loss_mlp": 0.01000005, + "balance_loss_clip": 0.99902195, + "balance_loss_mlp": 1.00172949, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.75357779305897, + "language_loss": 0.5785439, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59876132, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20019531, + "step": 16309, + "time_per_iteration": 2.9689579010009766 + }, + { + "auxiliary_loss_clip": 0.01098952, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.02141714, + "balance_loss_mlp": 1.0326581, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.6693234656111071, + "language_loss": 0.6591835, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68050683, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 16310, + "time_per_iteration": 2.468170642852783 + }, + { + "auxiliary_loss_clip": 0.01021706, + "auxiliary_loss_mlp": 0.0100009, + "balance_loss_clip": 0.99913657, + "balance_loss_mlp": 1.00166667, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.8200525059067886, + "language_loss": 0.54590946, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56612742, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 16311, + "time_per_iteration": 4.64594030380249 + }, + { + "auxiliary_loss_clip": 0.01098424, + "auxiliary_loss_mlp": 0.01023662, + "balance_loss_clip": 1.01312995, + "balance_loss_mlp": 1.03379786, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.5926022897704035, + "language_loss": 0.7984302, + "learning_rate": 3.881761950876638e-09, + "loss": 0.81965107, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 16312, + "time_per_iteration": 2.4821081161499023 + }, + { + "auxiliary_loss_clip": 0.01097906, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01600158, + "balance_loss_mlp": 1.03469324, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.9862258679310378, + "language_loss": 0.62852752, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.64977586, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 16313, + "time_per_iteration": 2.4287753105163574 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01709437, + "balance_loss_mlp": 1.0344764, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 1.8027464664706034, + "language_loss": 0.72543561, + "learning_rate": 3.833407015731316e-09, + "loss": 0.7467109, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 16314, + "time_per_iteration": 2.450726270675659 + }, + { + "auxiliary_loss_clip": 0.01021599, + "auxiliary_loss_mlp": 0.01000108, + "balance_loss_clip": 0.99912471, + "balance_loss_mlp": 1.0017128, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6974943747069026, + "language_loss": 0.51689386, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53711092, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.19921875, + "step": 16315, + "time_per_iteration": 3.039893388748169 + }, + { + "auxiliary_loss_clip": 0.01097985, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.01716948, + "balance_loss_mlp": 1.03282857, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.702080149406472, + "language_loss": 0.69737405, + "learning_rate": 3.785354859932033e-09, + "loss": 0.71863449, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 16316, + "time_per_iteration": 2.518357038497925 + }, + { + "auxiliary_loss_clip": 0.01100266, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01516604, + "balance_loss_mlp": 1.03365529, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.87109155525106, + "language_loss": 0.55548424, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57675356, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16317, + "time_per_iteration": 2.6049306392669678 + }, + { + "auxiliary_loss_clip": 0.01098549, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.01721954, + "balance_loss_mlp": 1.03464651, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.5355444284157869, + "language_loss": 0.73103517, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75230014, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 16318, + "time_per_iteration": 2.5125892162323 + }, + { + "auxiliary_loss_clip": 0.01096692, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01424098, + "balance_loss_mlp": 1.03356123, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.1192338472210173, + "language_loss": 0.82084936, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84206748, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 16319, + "time_per_iteration": 2.447026014328003 + }, + { + "auxiliary_loss_clip": 0.01021624, + "auxiliary_loss_mlp": 0.01003034, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00163507, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7434937814270395, + "language_loss": 0.53610063, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55634713, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20019531, + "step": 16320, + "time_per_iteration": 2.943744659423828 + }, + { + "auxiliary_loss_clip": 0.01099346, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01992095, + "balance_loss_mlp": 1.03373194, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 6.500748558217768, + "language_loss": 0.73322588, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75453323, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 16321, + "time_per_iteration": 2.4934024810791016 + }, + { + "auxiliary_loss_clip": 0.01100443, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.02043045, + "balance_loss_mlp": 1.03733802, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 2.271671638374391, + "language_loss": 0.78664875, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80797231, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 16322, + "time_per_iteration": 2.46575927734375 + }, + { + "auxiliary_loss_clip": 0.01097688, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.0227654, + "balance_loss_mlp": 1.03237772, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.5064148787884066, + "language_loss": 0.80583704, + "learning_rate": 3.619556806799595e-09, + "loss": 0.82715797, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16323, + "time_per_iteration": 2.514381170272827 + }, + { + "auxiliary_loss_clip": 0.01101495, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.0194416, + "balance_loss_mlp": 1.03495967, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.143238321382065, + "language_loss": 0.8492884, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87060773, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 16324, + "time_per_iteration": 2.4203484058380127 + }, + { + "auxiliary_loss_clip": 0.01099194, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.01754546, + "balance_loss_mlp": 1.03411317, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.4738035008515573, + "language_loss": 0.74333966, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76462775, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 16325, + "time_per_iteration": 2.67201828956604 + }, + { + "auxiliary_loss_clip": 0.01094665, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.02152395, + "balance_loss_mlp": 1.03282595, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.6682468721270072, + "language_loss": 0.76755691, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78882343, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6171875, + "step": 16326, + "time_per_iteration": 2.4386472702026367 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01325238, + "balance_loss_mlp": 1.0354383, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.7666613399101891, + "language_loss": 0.67005306, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69131851, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 16327, + "time_per_iteration": 2.5084118843078613 + }, + { + "auxiliary_loss_clip": 0.01104489, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.02107108, + "balance_loss_mlp": 1.03541327, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.5963313544140023, + "language_loss": 0.73459053, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75597197, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 16328, + "time_per_iteration": 2.513953685760498 + }, + { + "auxiliary_loss_clip": 0.01105032, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.02234316, + "balance_loss_mlp": 1.03475928, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.6386198679453556, + "language_loss": 0.80848616, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.82988524, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 16329, + "time_per_iteration": 2.4811136722564697 + }, + { + "auxiliary_loss_clip": 0.0110134, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.01591635, + "balance_loss_mlp": 1.03310025, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 2.098740991949754, + "language_loss": 0.76318562, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78448194, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 16330, + "time_per_iteration": 2.515571117401123 + }, + { + "auxiliary_loss_clip": 0.01107157, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.01529956, + "balance_loss_mlp": 1.0347935, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.273811022859368, + "language_loss": 0.66393799, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68530059, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 16331, + "time_per_iteration": 2.5182230472564697 + }, + { + "auxiliary_loss_clip": 0.01097163, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.01743126, + "balance_loss_mlp": 1.0327661, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.6306236809447248, + "language_loss": 0.73071647, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75197458, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 16332, + "time_per_iteration": 2.554363965988159 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01025465, + "balance_loss_clip": 1.01507628, + "balance_loss_mlp": 1.03489494, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.5999608499133222, + "language_loss": 0.76807606, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78931957, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 16333, + "time_per_iteration": 2.406327724456787 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.01864195, + "balance_loss_mlp": 1.03352439, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.3386346748180293, + "language_loss": 0.73073453, + "learning_rate": 3.366511715771958e-09, + "loss": 0.75202054, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 16334, + "time_per_iteration": 2.4535202980041504 + }, + { + "auxiliary_loss_clip": 0.01100827, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.02373648, + "balance_loss_mlp": 1.0337584, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.889228221782078, + "language_loss": 0.78478283, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80614549, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 16335, + "time_per_iteration": 2.455636739730835 + }, + { + "auxiliary_loss_clip": 0.01102519, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02213502, + "balance_loss_mlp": 1.03396535, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.3440495087057447, + "language_loss": 0.64146876, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66284317, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 16336, + "time_per_iteration": 2.595341444015503 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02200055, + "balance_loss_mlp": 1.03517807, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.9355169649892972, + "language_loss": 0.73395228, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75535965, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 16337, + "time_per_iteration": 2.4344332218170166 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.01541948, + "balance_loss_mlp": 1.03356791, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.6801667863354321, + "language_loss": 0.72507012, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.74635381, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 16338, + "time_per_iteration": 2.4975478649139404 + }, + { + "auxiliary_loss_clip": 0.01096358, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.01844525, + "balance_loss_mlp": 1.03190184, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.8281230536728026, + "language_loss": 0.81268263, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83394849, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 16339, + "time_per_iteration": 2.4743876457214355 + }, + { + "auxiliary_loss_clip": 0.01094696, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.02068281, + "balance_loss_mlp": 1.03254604, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 1.7405720603242414, + "language_loss": 0.62341028, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64466929, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 16340, + "time_per_iteration": 2.478207588195801 + }, + { + "auxiliary_loss_clip": 0.01102859, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.02079773, + "balance_loss_mlp": 1.03538668, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.6974854416597287, + "language_loss": 0.85674942, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.87810206, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 16341, + "time_per_iteration": 2.4368467330932617 + }, + { + "auxiliary_loss_clip": 0.01096331, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.01563168, + "balance_loss_mlp": 1.03398645, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.426488361267362, + "language_loss": 0.66898513, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69021565, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 16342, + "time_per_iteration": 2.5108721256256104 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.01293254, + "balance_loss_mlp": 1.03381848, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.5238109255321661, + "language_loss": 0.77271879, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79396409, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16343, + "time_per_iteration": 2.487384557723999 + }, + { + "auxiliary_loss_clip": 0.01097522, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.02021468, + "balance_loss_mlp": 1.03368938, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.5085133090122882, + "language_loss": 0.7517612, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77304137, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 16344, + "time_per_iteration": 2.538987159729004 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01962423, + "balance_loss_mlp": 1.03374767, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 2.501007535333455, + "language_loss": 0.66638464, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68768132, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 16345, + "time_per_iteration": 3.9340567588806152 + }, + { + "auxiliary_loss_clip": 0.01092782, + "auxiliary_loss_mlp": 0.01026635, + "balance_loss_clip": 1.01665115, + "balance_loss_mlp": 1.03144765, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.3402147417907175, + "language_loss": 0.79547799, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81667221, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.61328125, + "step": 16346, + "time_per_iteration": 2.4895803928375244 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.0188067, + "balance_loss_mlp": 1.03385854, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 1.8224916412255767, + "language_loss": 0.74978042, + "learning_rate": 3.079269666552031e-09, + "loss": 0.7711305, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 16347, + "time_per_iteration": 3.956122398376465 + }, + { + "auxiliary_loss_clip": 0.01095315, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.02184844, + "balance_loss_mlp": 1.03214502, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.9399061780009854, + "language_loss": 0.66402197, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68530005, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 16348, + "time_per_iteration": 3.9791600704193115 + }, + { + "auxiliary_loss_clip": 0.01099713, + "auxiliary_loss_mlp": 0.01026653, + "balance_loss_clip": 1.01496458, + "balance_loss_mlp": 1.03385162, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.8906340007069518, + "language_loss": 0.69143182, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71269548, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 16349, + "time_per_iteration": 2.517441987991333 + }, + { + "auxiliary_loss_clip": 0.0109294, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.01758015, + "balance_loss_mlp": 1.03203154, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 1.9423722053932548, + "language_loss": 0.76204872, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78326035, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 16350, + "time_per_iteration": 2.423643112182617 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.0178715, + "balance_loss_mlp": 1.03397298, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 2.1876266892296283, + "language_loss": 0.84113282, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86242843, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 16351, + "time_per_iteration": 2.446887969970703 + }, + { + "auxiliary_loss_clip": 0.01098309, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.01421404, + "balance_loss_mlp": 1.03314865, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.6852483245981495, + "language_loss": 0.68510699, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70634645, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16352, + "time_per_iteration": 2.5545663833618164 + }, + { + "auxiliary_loss_clip": 0.01096103, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.03311467, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.4338701194753172, + "language_loss": 0.66359127, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68484455, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 16353, + "time_per_iteration": 3.974562883377075 + }, + { + "auxiliary_loss_clip": 0.01095175, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.01926327, + "balance_loss_mlp": 1.03227568, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.470289829422996, + "language_loss": 0.74282354, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76407468, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 16354, + "time_per_iteration": 2.4706337451934814 + }, + { + "auxiliary_loss_clip": 0.01096804, + "auxiliary_loss_mlp": 0.0102677, + "balance_loss_clip": 1.01560569, + "balance_loss_mlp": 1.03276682, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 1.9586241896566348, + "language_loss": 0.77517724, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.796413, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 16355, + "time_per_iteration": 2.4410529136657715 + }, + { + "auxiliary_loss_clip": 0.0109692, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.01734829, + "balance_loss_mlp": 1.03296006, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 2.282707470444189, + "language_loss": 0.73298937, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75424653, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 16356, + "time_per_iteration": 2.435739517211914 + }, + { + "auxiliary_loss_clip": 0.0109747, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.0338732, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.6040337726439833, + "language_loss": 0.75952339, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.7807976, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 16357, + "time_per_iteration": 2.448345899581909 + }, + { + "auxiliary_loss_clip": 0.01097463, + "auxiliary_loss_mlp": 0.01022689, + "balance_loss_clip": 1.01070881, + "balance_loss_mlp": 1.03399682, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 9.487408286348185, + "language_loss": 0.80191135, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82311285, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6328125, + "step": 16358, + "time_per_iteration": 2.4538917541503906 + }, + { + "auxiliary_loss_clip": 0.01098016, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01573038, + "balance_loss_mlp": 1.03340781, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 3.160438132632366, + "language_loss": 0.67664564, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69789338, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16359, + "time_per_iteration": 2.5928866863250732 + }, + { + "auxiliary_loss_clip": 0.01096367, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.01712346, + "balance_loss_mlp": 1.03287399, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 2.767710883229253, + "language_loss": 0.6986711, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71991116, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 16360, + "time_per_iteration": 2.546980381011963 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01731467, + "balance_loss_mlp": 1.03346205, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.6812441062486845, + "language_loss": 0.84103167, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86231565, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 16361, + "time_per_iteration": 2.464859962463379 + }, + { + "auxiliary_loss_clip": 0.01098239, + "auxiliary_loss_mlp": 0.0102529, + "balance_loss_clip": 1.01420975, + "balance_loss_mlp": 1.03314137, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.7057424485209642, + "language_loss": 0.7577697, + "learning_rate": 2.76373855876022e-09, + "loss": 0.77900505, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 16362, + "time_per_iteration": 2.5125908851623535 + }, + { + "auxiliary_loss_clip": 0.01099486, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.02058172, + "balance_loss_mlp": 1.03428173, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.6643095210607834, + "language_loss": 0.71448255, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73579824, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 16363, + "time_per_iteration": 2.432748556137085 + }, + { + "auxiliary_loss_clip": 0.01094357, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.0168941, + "balance_loss_mlp": 1.03287041, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.7356464514395182, + "language_loss": 0.63440335, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65562296, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.61328125, + "step": 16364, + "time_per_iteration": 2.4572789669036865 + }, + { + "auxiliary_loss_clip": 0.01099675, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.02130258, + "balance_loss_mlp": 1.03466845, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.4850865495256305, + "language_loss": 0.74915314, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77046472, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 16365, + "time_per_iteration": 2.4729011058807373 + }, + { + "auxiliary_loss_clip": 0.01096935, + "auxiliary_loss_mlp": 0.01026662, + "balance_loss_clip": 1.01504469, + "balance_loss_mlp": 1.03303897, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 7.143933962867107, + "language_loss": 0.76209521, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78333127, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 16366, + "time_per_iteration": 2.5000736713409424 + }, + { + "auxiliary_loss_clip": 0.01095723, + "auxiliary_loss_mlp": 0.01025155, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.03223205, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.9814409544348766, + "language_loss": 0.77052504, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79173386, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 16367, + "time_per_iteration": 2.542595624923706 + }, + { + "auxiliary_loss_clip": 0.01099313, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.0192219, + "balance_loss_mlp": 1.03569198, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.745180491052293, + "language_loss": 0.61363411, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63494116, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.63671875, + "step": 16368, + "time_per_iteration": 2.4790031909942627 + }, + { + "auxiliary_loss_clip": 0.01094785, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.01990819, + "balance_loss_mlp": 1.03288722, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.5270024677807728, + "language_loss": 0.65519226, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67644334, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62109375, + "step": 16369, + "time_per_iteration": 2.454857110977173 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.02021563, + "balance_loss_mlp": 1.03482819, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.4214427672990262, + "language_loss": 0.68732488, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70865911, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 16370, + "time_per_iteration": 2.517896890640259 + }, + { + "auxiliary_loss_clip": 0.01099591, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02143872, + "balance_loss_mlp": 1.03292727, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.8084581245849027, + "language_loss": 0.73778242, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75911605, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 16371, + "time_per_iteration": 2.464282274246216 + }, + { + "auxiliary_loss_clip": 0.01021782, + "auxiliary_loss_mlp": 0.01000386, + "balance_loss_clip": 0.99944443, + "balance_loss_mlp": 1.00176942, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7761667176847223, + "language_loss": 0.65162444, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67184615, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20117188, + "step": 16372, + "time_per_iteration": 3.0799262523651123 + }, + { + "auxiliary_loss_clip": 0.01097301, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.01412201, + "balance_loss_mlp": 1.03277588, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.9497041934294832, + "language_loss": 0.70436323, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72558868, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 16373, + "time_per_iteration": 2.434091091156006 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01660144, + "balance_loss_mlp": 1.0344733, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.5923709110598652, + "language_loss": 0.81572837, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83698261, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 16374, + "time_per_iteration": 2.488692045211792 + }, + { + "auxiliary_loss_clip": 0.01101403, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.02025104, + "balance_loss_mlp": 1.0355829, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.6741401712819997, + "language_loss": 0.69374293, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71507013, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 16375, + "time_per_iteration": 2.4871280193328857 + }, + { + "auxiliary_loss_clip": 0.01101374, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.02090538, + "balance_loss_mlp": 1.03388441, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.7060471688472025, + "language_loss": 0.8095867, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83093083, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 16376, + "time_per_iteration": 2.512218713760376 + }, + { + "auxiliary_loss_clip": 0.01096059, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.0195905, + "balance_loss_mlp": 1.03252149, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.469956165284788, + "language_loss": 0.62223607, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64350533, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 16377, + "time_per_iteration": 2.5331015586853027 + }, + { + "auxiliary_loss_clip": 0.01100529, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.02324438, + "balance_loss_mlp": 1.03450775, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 1.6192423407924923, + "language_loss": 0.728405, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74976194, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 16378, + "time_per_iteration": 2.4707744121551514 + }, + { + "auxiliary_loss_clip": 0.01099505, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.01964295, + "balance_loss_mlp": 1.0338167, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.6532810502944137, + "language_loss": 0.71028608, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73159021, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 16379, + "time_per_iteration": 2.561509132385254 + }, + { + "auxiliary_loss_clip": 0.01101135, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.03502083, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 1.9173817203854187, + "language_loss": 0.68630135, + "learning_rate": 2.407594853716999e-09, + "loss": 0.7076205, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 16380, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.01102739, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.0218395, + "balance_loss_mlp": 1.03448987, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 1.9554023143101786, + "language_loss": 0.7881375, + "learning_rate": 2.38852866722139e-09, + "loss": 0.80949849, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 16381, + "time_per_iteration": 2.4630978107452393 + }, + { + "auxiliary_loss_clip": 0.01098406, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.015733, + "balance_loss_mlp": 1.03296387, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.4047755211177806, + "language_loss": 0.82333148, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84458339, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 16382, + "time_per_iteration": 2.5435211658477783 + }, + { + "auxiliary_loss_clip": 0.01102482, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01768613, + "balance_loss_mlp": 1.03403974, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.7630013745134487, + "language_loss": 0.74086952, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76219374, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 16383, + "time_per_iteration": 2.461627244949341 + }, + { + "auxiliary_loss_clip": 0.0110084, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01839042, + "balance_loss_mlp": 1.03497994, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.4221795490292306, + "language_loss": 0.65806353, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.67936826, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 16384, + "time_per_iteration": 2.594108819961548 + }, + { + "auxiliary_loss_clip": 0.01104674, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.01976156, + "balance_loss_mlp": 1.03614712, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.6568999819680295, + "language_loss": 0.69966209, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72104275, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6875, + "step": 16385, + "time_per_iteration": 2.6449928283691406 + }, + { + "auxiliary_loss_clip": 0.01102637, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.02161956, + "balance_loss_mlp": 1.03761828, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 1.929214822007236, + "language_loss": 0.81081849, + "learning_rate": 2.294333993509978e-09, + "loss": 0.83217108, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16386, + "time_per_iteration": 2.4518470764160156 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.01721561, + "balance_loss_mlp": 1.03449285, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.7433612430328327, + "language_loss": 0.67459857, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.6958915, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 16387, + "time_per_iteration": 3.853261947631836 + }, + { + "auxiliary_loss_clip": 0.01092752, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.01898646, + "balance_loss_mlp": 1.03189099, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.6794156400657199, + "language_loss": 0.73679399, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75801992, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.609375, + "step": 16388, + "time_per_iteration": 2.4796459674835205 + }, + { + "auxiliary_loss_clip": 0.01096542, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.01828194, + "balance_loss_mlp": 1.03178144, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.854112159676643, + "language_loss": 0.8199439, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84120238, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16389, + "time_per_iteration": 3.912445068359375 + }, + { + "auxiliary_loss_clip": 0.01097312, + "auxiliary_loss_mlp": 0.01025459, + "balance_loss_clip": 1.01399732, + "balance_loss_mlp": 1.03325129, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.001006106345854, + "language_loss": 0.67084408, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.6920718, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 16390, + "time_per_iteration": 2.4856414794921875 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.03467011, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.7503280437691784, + "language_loss": 0.77223754, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79356205, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 16391, + "time_per_iteration": 2.5048904418945312 + }, + { + "auxiliary_loss_clip": 0.01093654, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01943874, + "balance_loss_mlp": 1.03313243, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 1.8244273189308011, + "language_loss": 0.68202817, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70326281, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.60546875, + "step": 16392, + "time_per_iteration": 2.4745800495147705 + }, + { + "auxiliary_loss_clip": 0.01102623, + "auxiliary_loss_mlp": 0.01026369, + "balance_loss_clip": 1.01361322, + "balance_loss_mlp": 1.03380561, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.00746487685818, + "language_loss": 0.55832624, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.57961619, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16393, + "time_per_iteration": 2.4202845096588135 + }, + { + "auxiliary_loss_clip": 0.01105775, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01867962, + "balance_loss_mlp": 1.03597665, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 5.137455131585941, + "language_loss": 0.79335487, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81472552, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 16394, + "time_per_iteration": 2.3870041370391846 + }, + { + "auxiliary_loss_clip": 0.01102304, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.01954222, + "balance_loss_mlp": 1.03348887, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.4204577915939423, + "language_loss": 0.76103747, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78237855, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 16395, + "time_per_iteration": 4.123412370681763 + }, + { + "auxiliary_loss_clip": 0.01097875, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.01574266, + "balance_loss_mlp": 1.0336237, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 1.8472320376349611, + "language_loss": 0.75438356, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77563113, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 16396, + "time_per_iteration": 2.518141508102417 + }, + { + "auxiliary_loss_clip": 0.01097784, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01606131, + "balance_loss_mlp": 1.03416276, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.5636907431654377, + "language_loss": 0.70736861, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.72861964, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 16397, + "time_per_iteration": 2.497194528579712 + }, + { + "auxiliary_loss_clip": 0.01096257, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.02008724, + "balance_loss_mlp": 1.03538656, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.9966865874098016, + "language_loss": 0.71433568, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73560631, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.609375, + "step": 16398, + "time_per_iteration": 2.4541091918945312 + }, + { + "auxiliary_loss_clip": 0.01097831, + "auxiliary_loss_mlp": 0.01026569, + "balance_loss_clip": 1.0156436, + "balance_loss_mlp": 1.03418803, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.8623253931763133, + "language_loss": 0.73714447, + "learning_rate": 2.058291183208771e-09, + "loss": 0.7583884, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 16399, + "time_per_iteration": 2.503669261932373 + }, + { + "auxiliary_loss_clip": 0.01098469, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.01344514, + "balance_loss_mlp": 1.03257656, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.080462229184556, + "language_loss": 0.58062029, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.60185581, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 16400, + "time_per_iteration": 2.4521939754486084 + }, + { + "auxiliary_loss_clip": 0.01105515, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.01613116, + "balance_loss_mlp": 1.03492236, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 1.567548227092974, + "language_loss": 0.80283344, + "learning_rate": 2.023113299582491e-09, + "loss": 0.8241756, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 16401, + "time_per_iteration": 2.4489645957946777 + }, + { + "auxiliary_loss_clip": 0.01097463, + "auxiliary_loss_mlp": 0.01034925, + "balance_loss_clip": 1.02253318, + "balance_loss_mlp": 1.03393412, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.8340920908807528, + "language_loss": 0.77850628, + "learning_rate": 2.005638002662069e-09, + "loss": 0.79983014, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6328125, + "step": 16402, + "time_per_iteration": 2.440742015838623 + }, + { + "auxiliary_loss_clip": 0.01101709, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.01902652, + "balance_loss_mlp": 1.0353204, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.6254994621551133, + "language_loss": 0.69982457, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72114778, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 16403, + "time_per_iteration": 2.531202554702759 + }, + { + "auxiliary_loss_clip": 0.01095747, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01799786, + "balance_loss_mlp": 1.03178513, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 3.811739920354137, + "language_loss": 0.74388409, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76512915, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 16404, + "time_per_iteration": 2.4980599880218506 + }, + { + "auxiliary_loss_clip": 0.01099419, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.01446664, + "balance_loss_mlp": 1.03424644, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 2.2063331228177026, + "language_loss": 0.7017042, + "learning_rate": 1.953666699415768e-09, + "loss": 0.72295356, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 16405, + "time_per_iteration": 2.5640861988067627 + }, + { + "auxiliary_loss_clip": 0.01098905, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.02158928, + "balance_loss_mlp": 1.03562474, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.9063114595152784, + "language_loss": 0.69724238, + "learning_rate": 1.93649446302846e-09, + "loss": 0.71855342, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 16406, + "time_per_iteration": 2.495396375656128 + }, + { + "auxiliary_loss_clip": 0.0109845, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.0219388, + "balance_loss_mlp": 1.03573644, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.422415674377729, + "language_loss": 0.74666607, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76798415, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 16407, + "time_per_iteration": 2.4423317909240723 + }, + { + "auxiliary_loss_clip": 0.01096271, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.01847541, + "balance_loss_mlp": 1.03227949, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 2.0280682887317667, + "language_loss": 0.77168655, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79294705, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 16408, + "time_per_iteration": 2.516061782836914 + }, + { + "auxiliary_loss_clip": 0.01101714, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.01701331, + "balance_loss_mlp": 1.03434014, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.6301786211339495, + "language_loss": 0.67791158, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.69922221, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 16409, + "time_per_iteration": 2.4589552879333496 + }, + { + "auxiliary_loss_clip": 0.0102164, + "auxiliary_loss_mlp": 0.00999411, + "balance_loss_clip": 0.99843997, + "balance_loss_mlp": 1.00167465, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.8043087350098772, + "language_loss": 0.61067098, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63088149, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16410, + "time_per_iteration": 3.1711127758026123 + }, + { + "auxiliary_loss_clip": 0.01099821, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.0180335, + "balance_loss_mlp": 1.03321028, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.895263925191816, + "language_loss": 0.66438043, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68567365, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 16411, + "time_per_iteration": 2.529005765914917 + }, + { + "auxiliary_loss_clip": 0.01021481, + "auxiliary_loss_mlp": 0.01002904, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00161529, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7221177127578288, + "language_loss": 0.56282055, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58306438, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 16412, + "time_per_iteration": 3.1342015266418457 + }, + { + "auxiliary_loss_clip": 0.01102714, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.01949, + "balance_loss_mlp": 1.0351032, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.5575339262302221, + "language_loss": 0.73079598, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75214005, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 16413, + "time_per_iteration": 2.504788637161255 + }, + { + "auxiliary_loss_clip": 0.01098204, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_clip": 1.01468778, + "balance_loss_mlp": 1.03174376, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.7588665592045418, + "language_loss": 0.71731371, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.7385565, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 16414, + "time_per_iteration": 2.459226369857788 + }, + { + "auxiliary_loss_clip": 0.01097317, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.01959157, + "balance_loss_mlp": 1.03502083, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.8732802081959814, + "language_loss": 0.70089632, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72217298, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 16415, + "time_per_iteration": 2.4389734268188477 + }, + { + "auxiliary_loss_clip": 0.01093108, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.01405334, + "balance_loss_mlp": 1.03190827, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.436678023164937, + "language_loss": 0.75416452, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77533853, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.61328125, + "step": 16416, + "time_per_iteration": 2.4589333534240723 + }, + { + "auxiliary_loss_clip": 0.01098366, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.01936018, + "balance_loss_mlp": 1.03430641, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.5382717093176907, + "language_loss": 0.70592904, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72722185, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 16417, + "time_per_iteration": 2.411501169204712 + }, + { + "auxiliary_loss_clip": 0.01104486, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.01942897, + "balance_loss_mlp": 1.036502, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.3664906698719754, + "language_loss": 0.70402956, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72538829, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 16418, + "time_per_iteration": 2.460721731185913 + }, + { + "auxiliary_loss_clip": 0.0102156, + "auxiliary_loss_mlp": 0.0099861, + "balance_loss_clip": 0.99766254, + "balance_loss_mlp": 1.00155318, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6547409033160193, + "language_loss": 0.53709066, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55729234, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20019531, + "step": 16419, + "time_per_iteration": 3.1816153526306152 + }, + { + "auxiliary_loss_clip": 0.01102162, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0203383, + "balance_loss_mlp": 1.03322339, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.8013568120326042, + "language_loss": 0.78115129, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80250394, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 16420, + "time_per_iteration": 2.513073682785034 + }, + { + "auxiliary_loss_clip": 0.01100847, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.01531947, + "balance_loss_mlp": 1.03717494, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.6534906629377784, + "language_loss": 0.70953268, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73080778, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 16421, + "time_per_iteration": 2.478576898574829 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.02187395, + "balance_loss_mlp": 1.03412378, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 1.9879851730587292, + "language_loss": 0.82305312, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.8444199, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 16422, + "time_per_iteration": 2.5021886825561523 + }, + { + "auxiliary_loss_clip": 0.01098518, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.01423693, + "balance_loss_mlp": 1.03484821, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.7189782586075049, + "language_loss": 0.86038244, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88162035, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 16423, + "time_per_iteration": 2.485748052597046 + }, + { + "auxiliary_loss_clip": 0.01102025, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.016186, + "balance_loss_mlp": 1.03556752, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 2.8325177104829575, + "language_loss": 0.70638502, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72768188, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 16424, + "time_per_iteration": 2.4713919162750244 + }, + { + "auxiliary_loss_clip": 0.01099037, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.0172801, + "balance_loss_mlp": 1.03294778, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 1.9805103537761688, + "language_loss": 0.80257469, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.8238554, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 16425, + "time_per_iteration": 2.477077007293701 + }, + { + "auxiliary_loss_clip": 0.01100313, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01812065, + "balance_loss_mlp": 1.03379631, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 1.9649003604605135, + "language_loss": 0.79694617, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.81825078, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 16426, + "time_per_iteration": 2.47365403175354 + }, + { + "auxiliary_loss_clip": 0.01100099, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.02130342, + "balance_loss_mlp": 1.03583455, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.919968888964341, + "language_loss": 0.84918183, + "learning_rate": 1.593380599750338e-09, + "loss": 0.87050593, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 16427, + "time_per_iteration": 2.4481728076934814 + }, + { + "auxiliary_loss_clip": 0.01097771, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.02047956, + "balance_loss_mlp": 1.03433597, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.808835451302429, + "language_loss": 0.70217133, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72346556, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 16428, + "time_per_iteration": 3.8493616580963135 + }, + { + "auxiliary_loss_clip": 0.01096844, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.03368545, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 1.9142096733438485, + "language_loss": 0.79910493, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82039022, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 16429, + "time_per_iteration": 2.4763388633728027 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.01552033, + "balance_loss_mlp": 1.03279054, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.512921455158019, + "language_loss": 0.61957049, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64080858, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16430, + "time_per_iteration": 2.6241238117218018 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.02070665, + "balance_loss_mlp": 1.03415561, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.4649628696556245, + "language_loss": 0.72812045, + "learning_rate": 1.531814395687725e-09, + "loss": 0.74943221, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16431, + "time_per_iteration": 5.483947038650513 + }, + { + "auxiliary_loss_clip": 0.01099159, + "auxiliary_loss_mlp": 0.01031109, + "balance_loss_clip": 1.01914072, + "balance_loss_mlp": 1.03423476, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.237292179691481, + "language_loss": 0.81017017, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.83147275, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 16432, + "time_per_iteration": 2.508455753326416 + }, + { + "auxiliary_loss_clip": 0.0109516, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.02015758, + "balance_loss_mlp": 1.0320065, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.573439627141087, + "language_loss": 0.80520278, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82645994, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 16433, + "time_per_iteration": 2.4864025115966797 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.0212785, + "balance_loss_mlp": 1.03420711, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.5851273192510784, + "language_loss": 0.64777255, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.66906863, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.625, + "step": 16434, + "time_per_iteration": 2.5199642181396484 + }, + { + "auxiliary_loss_clip": 0.010991, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.017802, + "balance_loss_mlp": 1.03237224, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.5409302130170526, + "language_loss": 0.69133604, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71262544, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 16435, + "time_per_iteration": 2.568521499633789 + }, + { + "auxiliary_loss_clip": 0.01100032, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02277112, + "balance_loss_mlp": 1.03586268, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.8027754000031349, + "language_loss": 0.75371569, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77506101, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 16436, + "time_per_iteration": 3.9403867721557617 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.01765811, + "balance_loss_mlp": 1.03544521, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.2611545636080606, + "language_loss": 0.74154097, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76284397, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 16437, + "time_per_iteration": 2.4243505001068115 + }, + { + "auxiliary_loss_clip": 0.01094253, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.01777816, + "balance_loss_mlp": 1.03196597, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.57503861235398, + "language_loss": 0.60063571, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62186807, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62109375, + "step": 16438, + "time_per_iteration": 2.519336223602295 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01675236, + "balance_loss_mlp": 1.03432131, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.9343189571400579, + "language_loss": 0.71689999, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.73817527, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 16439, + "time_per_iteration": 2.429018974304199 + }, + { + "auxiliary_loss_clip": 0.01097636, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01943684, + "balance_loss_mlp": 1.03360188, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.524025495504474, + "language_loss": 0.60003507, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.6213243, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 16440, + "time_per_iteration": 2.5483033657073975 + }, + { + "auxiliary_loss_clip": 0.0110006, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.01797223, + "balance_loss_mlp": 1.03258252, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.3344058610035954, + "language_loss": 0.75737202, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.77866423, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 16441, + "time_per_iteration": 2.4178669452667236 + }, + { + "auxiliary_loss_clip": 0.01098798, + "auxiliary_loss_mlp": 0.01026943, + "balance_loss_clip": 1.01528406, + "balance_loss_mlp": 1.03330851, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 2.8695336185475675, + "language_loss": 0.68061352, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.70187092, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16442, + "time_per_iteration": 2.6177468299865723 + }, + { + "auxiliary_loss_clip": 0.01098647, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.01696944, + "balance_loss_mlp": 1.03386927, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.091114681520994, + "language_loss": 0.74713242, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76840127, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 16443, + "time_per_iteration": 2.386375904083252 + }, + { + "auxiliary_loss_clip": 0.01100478, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.01426518, + "balance_loss_mlp": 1.03359616, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 1.669549874757317, + "language_loss": 0.73382336, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75509363, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 16444, + "time_per_iteration": 2.4635965824127197 + }, + { + "auxiliary_loss_clip": 0.01099254, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.01845896, + "balance_loss_mlp": 1.03630447, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 1.8001985272034744, + "language_loss": 0.69300127, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71428657, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 16445, + "time_per_iteration": 2.4620094299316406 + }, + { + "auxiliary_loss_clip": 0.01100718, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.01272345, + "balance_loss_mlp": 1.03484011, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 3.924942005630186, + "language_loss": 0.60178292, + "learning_rate": 1.311740377491155e-09, + "loss": 0.62303621, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 16446, + "time_per_iteration": 2.4125113487243652 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.02105784, + "balance_loss_mlp": 1.03262711, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.0039707237348914, + "language_loss": 0.7062999, + "learning_rate": 1.297675079582783e-09, + "loss": 0.72759056, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 16447, + "time_per_iteration": 2.428260564804077 + }, + { + "auxiliary_loss_clip": 0.01097888, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.0161624, + "balance_loss_mlp": 1.03387737, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.8113206322833593, + "language_loss": 0.83943892, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.8606869, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 16448, + "time_per_iteration": 2.495060443878174 + }, + { + "auxiliary_loss_clip": 0.01095164, + "auxiliary_loss_mlp": 0.01024451, + "balance_loss_clip": 1.0142343, + "balance_loss_mlp": 1.03284883, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.6079355233530224, + "language_loss": 0.7015419, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72273797, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.625, + "step": 16449, + "time_per_iteration": 2.5300204753875732 + }, + { + "auxiliary_loss_clip": 0.01100835, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.01573348, + "balance_loss_mlp": 1.03388381, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.23472735213177, + "language_loss": 0.74104172, + "learning_rate": 1.25593393393153e-09, + "loss": 0.76232046, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 16450, + "time_per_iteration": 2.5563437938690186 + }, + { + "auxiliary_loss_clip": 0.0110011, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.01878238, + "balance_loss_mlp": 1.03238416, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 2.0590555415637493, + "language_loss": 0.79410666, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81541693, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 16451, + "time_per_iteration": 2.4256367683410645 + }, + { + "auxiliary_loss_clip": 0.01099003, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.02487206, + "balance_loss_mlp": 1.0322752, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 1.9104470417388077, + "language_loss": 0.6977967, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.71915483, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16452, + "time_per_iteration": 2.4743566513061523 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01023142, + "balance_loss_clip": 1.01255608, + "balance_loss_mlp": 1.03474927, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.8258741386751924, + "language_loss": 0.73913336, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.7603296, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 16453, + "time_per_iteration": 2.4414727687835693 + }, + { + "auxiliary_loss_clip": 0.01098548, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.0235076, + "balance_loss_mlp": 1.0325352, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.157949205702443, + "language_loss": 0.69432741, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71566206, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 16454, + "time_per_iteration": 2.470310926437378 + }, + { + "auxiliary_loss_clip": 0.01095091, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.01800466, + "balance_loss_mlp": 1.03340435, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 1.9739379495934455, + "language_loss": 0.75967795, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.78091908, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 16455, + "time_per_iteration": 2.481872797012329 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01024963, + "balance_loss_clip": 1.01354313, + "balance_loss_mlp": 1.03333139, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.6907179188654564, + "language_loss": 0.65590852, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67712402, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 16456, + "time_per_iteration": 2.4589202404022217 + }, + { + "auxiliary_loss_clip": 0.01101842, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.01516438, + "balance_loss_mlp": 1.03582501, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.6581165178501178, + "language_loss": 0.7385301, + "learning_rate": 1.161190691666203e-09, + "loss": 0.75981599, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 16457, + "time_per_iteration": 2.409771203994751 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.0151962, + "balance_loss_mlp": 1.03518713, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.0309095473748253, + "language_loss": 0.68817085, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.70943564, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 16458, + "time_per_iteration": 2.502516269683838 + }, + { + "auxiliary_loss_clip": 0.01095123, + "auxiliary_loss_mlp": 0.0102721, + "balance_loss_clip": 1.01597428, + "balance_loss_mlp": 1.03228736, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.6352367515725288, + "language_loss": 0.79176056, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81298381, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62890625, + "step": 16459, + "time_per_iteration": 2.4237966537475586 + }, + { + "auxiliary_loss_clip": 0.01098841, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.02002609, + "balance_loss_mlp": 1.03269553, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 1.972457674640829, + "language_loss": 0.71052337, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73182547, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 16460, + "time_per_iteration": 2.437721014022827 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.0102517, + "balance_loss_clip": 1.01348758, + "balance_loss_mlp": 1.03307641, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.5309126749149615, + "language_loss": 0.87348777, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89474398, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 16461, + "time_per_iteration": 2.491044759750366 + }, + { + "auxiliary_loss_clip": 0.01098778, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.01894927, + "balance_loss_mlp": 1.03391469, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 2.5517214726118924, + "language_loss": 0.63009971, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65139437, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 16462, + "time_per_iteration": 2.462428569793701 + }, + { + "auxiliary_loss_clip": 0.01101282, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.01751733, + "balance_loss_mlp": 1.03612256, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.5403701602068196, + "language_loss": 0.72850609, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.74980283, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 16463, + "time_per_iteration": 2.419002056121826 + }, + { + "auxiliary_loss_clip": 0.01097709, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.01629817, + "balance_loss_mlp": 1.03312826, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.7490706399263698, + "language_loss": 0.70085156, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72211272, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 16464, + "time_per_iteration": 2.4515798091888428 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.01814127, + "balance_loss_mlp": 1.03332138, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.0677528811720993, + "language_loss": 0.73172307, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75302052, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16465, + "time_per_iteration": 2.5083959102630615 + }, + { + "auxiliary_loss_clip": 0.01095532, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.02051592, + "balance_loss_mlp": 1.03158927, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.8661862242505183, + "language_loss": 0.86434472, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88560927, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 16466, + "time_per_iteration": 2.516256809234619 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01023106, + "balance_loss_clip": 1.01206732, + "balance_loss_mlp": 1.03457332, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.786963509372796, + "language_loss": 0.71397775, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73520446, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16467, + "time_per_iteration": 2.479933738708496 + }, + { + "auxiliary_loss_clip": 0.0109761, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.0166316, + "balance_loss_mlp": 1.03384113, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.2832078969195513, + "language_loss": 0.6496833, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67093444, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 16468, + "time_per_iteration": 2.5504212379455566 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.01737785, + "balance_loss_mlp": 1.03365922, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.6640074455423066, + "language_loss": 0.61527658, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.63658667, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 16469, + "time_per_iteration": 2.4935665130615234 + }, + { + "auxiliary_loss_clip": 0.01101977, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.01494646, + "balance_loss_mlp": 1.0341469, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.656192366065704, + "language_loss": 0.70006144, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72135079, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 16470, + "time_per_iteration": 3.8489348888397217 + }, + { + "auxiliary_loss_clip": 0.01021289, + "auxiliary_loss_mlp": 0.0100051, + "balance_loss_clip": 0.99951476, + "balance_loss_mlp": 1.00131559, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6697157586994648, + "language_loss": 0.55488944, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57510746, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 16471, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0110021, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.01922309, + "balance_loss_mlp": 1.03532815, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 7.34429735890774, + "language_loss": 0.83630276, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85761344, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 16472, + "time_per_iteration": 3.8357088565826416 + }, + { + "auxiliary_loss_clip": 0.01098877, + "auxiliary_loss_mlp": 0.01026717, + "balance_loss_clip": 1.01551747, + "balance_loss_mlp": 1.03430748, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.9915784555897358, + "language_loss": 0.8572318, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87848771, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16473, + "time_per_iteration": 3.857786178588867 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.01799798, + "balance_loss_mlp": 1.03188705, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.6814272861208508, + "language_loss": 0.84478509, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86603308, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 16474, + "time_per_iteration": 2.5232393741607666 + }, + { + "auxiliary_loss_clip": 0.01094689, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.02355504, + "balance_loss_mlp": 1.02992404, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.7352826066562033, + "language_loss": 0.76060629, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78189915, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16475, + "time_per_iteration": 2.490631341934204 + }, + { + "auxiliary_loss_clip": 0.01096946, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.01866627, + "balance_loss_mlp": 1.03057003, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.4602935837993765, + "language_loss": 0.7602495, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78152329, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 16476, + "time_per_iteration": 2.4616878032684326 + }, + { + "auxiliary_loss_clip": 0.01102543, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.01298785, + "balance_loss_mlp": 1.03349328, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.177621847035773, + "language_loss": 0.67150068, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69278705, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 16477, + "time_per_iteration": 2.4365179538726807 + }, + { + "auxiliary_loss_clip": 0.01104342, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.01949191, + "balance_loss_mlp": 1.03697991, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 2.8661825575863564, + "language_loss": 0.71520579, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73656601, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 16478, + "time_per_iteration": 4.016811847686768 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.02429771, + "balance_loss_mlp": 1.03379405, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.5601168693900895, + "language_loss": 0.81092632, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83229345, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66015625, + "step": 16479, + "time_per_iteration": 2.383894920349121 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01847529, + "balance_loss_mlp": 1.03430843, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.5250465793653611, + "language_loss": 0.6613217, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68261993, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 16480, + "time_per_iteration": 2.484576463699341 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01623344, + "balance_loss_mlp": 1.03243065, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.6952711839759178, + "language_loss": 0.72282261, + "learning_rate": 8.645539127374313e-10, + "loss": 0.7440697, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 16481, + "time_per_iteration": 2.59816837310791 + }, + { + "auxiliary_loss_clip": 0.01097429, + "auxiliary_loss_mlp": 0.01024732, + "balance_loss_clip": 1.0137589, + "balance_loss_mlp": 1.03415012, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.6583039054588096, + "language_loss": 0.77450025, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79572183, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 16482, + "time_per_iteration": 2.4760994911193848 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.01027222, + "balance_loss_clip": 1.01600456, + "balance_loss_mlp": 1.03429222, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.9891098013725752, + "language_loss": 0.75464189, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77591825, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 16483, + "time_per_iteration": 2.438983917236328 + }, + { + "auxiliary_loss_clip": 0.01021514, + "auxiliary_loss_mlp": 0.01001794, + "balance_loss_clip": 1.00084007, + "balance_loss_mlp": 1.00166059, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6785624181259259, + "language_loss": 0.5365091, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55674213, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 16484, + "time_per_iteration": 3.1859169006347656 + }, + { + "auxiliary_loss_clip": 0.01095081, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.03326261, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 2.132839577495112, + "language_loss": 0.81778204, + "learning_rate": 8.19359496165184e-10, + "loss": 0.83900762, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6171875, + "step": 16485, + "time_per_iteration": 2.5153956413269043 + }, + { + "auxiliary_loss_clip": 0.01098037, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.01986718, + "balance_loss_mlp": 1.0349462, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.9487999720953917, + "language_loss": 0.81256086, + "learning_rate": 8.082504137836288e-10, + "loss": 0.833857, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6328125, + "step": 16486, + "time_per_iteration": 2.467226982116699 + }, + { + "auxiliary_loss_clip": 0.01099412, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.01433814, + "balance_loss_mlp": 1.03397942, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.3991972197643134, + "language_loss": 0.65814865, + "learning_rate": 7.972171409538209e-10, + "loss": 0.6793983, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 16487, + "time_per_iteration": 2.687784433364868 + }, + { + "auxiliary_loss_clip": 0.01095741, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.01798725, + "balance_loss_mlp": 1.03274322, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.9951977920658592, + "language_loss": 0.7668978, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78814131, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 16488, + "time_per_iteration": 2.4734110832214355 + }, + { + "auxiliary_loss_clip": 0.01102628, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.01442766, + "balance_loss_mlp": 1.03430152, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 3.1534533227338946, + "language_loss": 0.68729866, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70859075, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 16489, + "time_per_iteration": 2.467500686645508 + }, + { + "auxiliary_loss_clip": 0.01021406, + "auxiliary_loss_mlp": 0.01001161, + "balance_loss_clip": 1.00017166, + "balance_loss_mlp": 1.0014714, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6086143053932209, + "language_loss": 0.5259285, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54615414, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 16490, + "time_per_iteration": 3.135390520095825 + }, + { + "auxiliary_loss_clip": 0.0110438, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.02026534, + "balance_loss_mlp": 1.03562975, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 2.374861870755257, + "language_loss": 0.75565469, + "learning_rate": 7.538421534734052e-10, + "loss": 0.7770263, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 16491, + "time_per_iteration": 2.4634666442871094 + }, + { + "auxiliary_loss_clip": 0.01105664, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.01532578, + "balance_loss_mlp": 1.03756356, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.19239694282831, + "language_loss": 0.69975454, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72108841, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 16492, + "time_per_iteration": 2.4108922481536865 + }, + { + "auxiliary_loss_clip": 0.01098681, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01737428, + "balance_loss_mlp": 1.03337646, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 1.8422382567938989, + "language_loss": 0.68127316, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70255595, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 16493, + "time_per_iteration": 2.4719016551971436 + }, + { + "auxiliary_loss_clip": 0.0110235, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.01974964, + "balance_loss_mlp": 1.03438842, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.6813995805543638, + "language_loss": 0.71178663, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73312747, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 16494, + "time_per_iteration": 2.480767250061035 + }, + { + "auxiliary_loss_clip": 0.01100167, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.03380418, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.011490694936815, + "language_loss": 0.67974186, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70105028, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 16495, + "time_per_iteration": 2.40712308883667 + }, + { + "auxiliary_loss_clip": 0.01021311, + "auxiliary_loss_mlp": 0.01000436, + "balance_loss_clip": 0.99946463, + "balance_loss_mlp": 1.00145388, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7222922542161719, + "language_loss": 0.53426856, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55448598, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16496, + "time_per_iteration": 3.1515696048736572 + }, + { + "auxiliary_loss_clip": 0.01100625, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.01697552, + "balance_loss_mlp": 1.03456199, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.5120941987850633, + "language_loss": 0.71478045, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73607767, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 16497, + "time_per_iteration": 2.478241205215454 + }, + { + "auxiliary_loss_clip": 0.01102575, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.01924014, + "balance_loss_mlp": 1.03378558, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 3.1482345724475196, + "language_loss": 0.82058042, + "learning_rate": 6.808546878249721e-10, + "loss": 0.8419255, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16498, + "time_per_iteration": 2.4786031246185303 + }, + { + "auxiliary_loss_clip": 0.01101575, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.02315426, + "balance_loss_mlp": 1.03460526, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.560350707366415, + "language_loss": 0.68127578, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70264125, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 16499, + "time_per_iteration": 2.5106289386749268 + }, + { + "auxiliary_loss_clip": 0.01102115, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.015625, + "balance_loss_mlp": 1.03552794, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 2.0283619276595632, + "language_loss": 0.82292485, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84421718, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 16500, + "time_per_iteration": 2.448037624359131 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.01717544, + "balance_loss_mlp": 1.03475332, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.8118228269565941, + "language_loss": 0.81654167, + "learning_rate": 6.507115533036511e-10, + "loss": 0.83785439, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 16501, + "time_per_iteration": 2.4804978370666504 + }, + { + "auxiliary_loss_clip": 0.0109966, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.01602578, + "balance_loss_mlp": 1.03350425, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 1.8447298881035472, + "language_loss": 0.77077162, + "learning_rate": 6.408154723420711e-10, + "loss": 0.79205161, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 16502, + "time_per_iteration": 2.4736790657043457 + }, + { + "auxiliary_loss_clip": 0.01100851, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01821661, + "balance_loss_mlp": 1.03371501, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 2.5520061200092914, + "language_loss": 0.71432996, + "learning_rate": 6.309952072811597e-10, + "loss": 0.73564786, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 16503, + "time_per_iteration": 2.425827980041504 + }, + { + "auxiliary_loss_clip": 0.01021585, + "auxiliary_loss_mlp": 0.01000758, + "balance_loss_clip": 0.99979258, + "balance_loss_mlp": 1.00154912, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6296146740963268, + "language_loss": 0.55068082, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57090425, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 16504, + "time_per_iteration": 3.140615940093994 + }, + { + "auxiliary_loss_clip": 0.01096978, + "auxiliary_loss_mlp": 0.01022605, + "balance_loss_clip": 1.01209641, + "balance_loss_mlp": 1.03265607, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.8235699601231674, + "language_loss": 0.69573104, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71692687, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 16505, + "time_per_iteration": 2.4154937267303467 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01768327, + "balance_loss_mlp": 1.03386188, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.1541396755304576, + "language_loss": 0.65518022, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67650688, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 16506, + "time_per_iteration": 2.4460935592651367 + }, + { + "auxiliary_loss_clip": 0.01097659, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.01382756, + "balance_loss_mlp": 1.03309059, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 1.8368921898892858, + "language_loss": 0.62782621, + "learning_rate": 5.924723134487219e-10, + "loss": 0.64906067, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.64453125, + "step": 16507, + "time_per_iteration": 2.442676544189453 + }, + { + "auxiliary_loss_clip": 0.01100781, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.02104521, + "balance_loss_mlp": 1.03471947, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.6591700136294723, + "language_loss": 0.72890103, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75023758, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 16508, + "time_per_iteration": 2.443885564804077 + }, + { + "auxiliary_loss_clip": 0.01099478, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01685596, + "balance_loss_mlp": 1.03313184, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.56405746018773, + "language_loss": 0.70219529, + "learning_rate": 5.736657714818793e-10, + "loss": 0.7234813, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 16509, + "time_per_iteration": 2.4716854095458984 + }, + { + "auxiliary_loss_clip": 0.01099172, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.01902127, + "balance_loss_mlp": 1.03302801, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.5891444400263024, + "language_loss": 0.68136442, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70266795, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 16510, + "time_per_iteration": 2.8066964149475098 + }, + { + "auxiliary_loss_clip": 0.01101235, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.02104878, + "balance_loss_mlp": 1.03479171, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.3909482824040054, + "language_loss": 0.81199002, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83332956, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 16511, + "time_per_iteration": 2.4441933631896973 + }, + { + "auxiliary_loss_clip": 0.01096436, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.01612711, + "balance_loss_mlp": 1.03253233, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.7454834669895913, + "language_loss": 0.91386062, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93509638, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 16512, + "time_per_iteration": 2.5027287006378174 + }, + { + "auxiliary_loss_clip": 0.01021781, + "auxiliary_loss_mlp": 0.00998781, + "balance_loss_clip": 0.99777997, + "balance_loss_mlp": 1.00178456, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.6952001936871817, + "language_loss": 0.55215639, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57236201, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 16513, + "time_per_iteration": 4.509139776229858 + }, + { + "auxiliary_loss_clip": 0.01098001, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.01759601, + "balance_loss_mlp": 1.03360546, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.6679269237242005, + "language_loss": 0.65108931, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67235851, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 16514, + "time_per_iteration": 4.345771789550781 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01589918, + "balance_loss_mlp": 1.03492641, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 1.9155425038175011, + "language_loss": 0.73504049, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75633776, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 16515, + "time_per_iteration": 2.4671685695648193 + }, + { + "auxiliary_loss_clip": 0.01100338, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.01605916, + "balance_loss_mlp": 1.03494489, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.5036430835752834, + "language_loss": 0.77072322, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79200089, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 16516, + "time_per_iteration": 2.5214271545410156 + }, + { + "auxiliary_loss_clip": 0.01096027, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.01744592, + "balance_loss_mlp": 1.03210688, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.5405677122582522, + "language_loss": 0.78079957, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80204338, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 16517, + "time_per_iteration": 2.4683997631073 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.01037293, + "balance_loss_clip": 1.02406716, + "balance_loss_mlp": 1.03569436, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.467869272528166, + "language_loss": 0.67826927, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69968623, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 16518, + "time_per_iteration": 2.5322189331054688 + }, + { + "auxiliary_loss_clip": 0.01021417, + "auxiliary_loss_mlp": 0.01004028, + "balance_loss_clip": 1.00303864, + "balance_loss_mlp": 1.00145912, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7302871663277747, + "language_loss": 0.5342353, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55448973, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 16519, + "time_per_iteration": 2.9954869747161865 + }, + { + "auxiliary_loss_clip": 0.01098347, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.01674438, + "balance_loss_mlp": 1.03470135, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.7167044262737383, + "language_loss": 0.59850049, + "learning_rate": 4.756508837426842e-10, + "loss": 0.61976135, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 16520, + "time_per_iteration": 4.069928884506226 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.01937616, + "balance_loss_mlp": 1.03505707, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.6631510235112372, + "language_loss": 0.61730212, + "learning_rate": 4.671953657853223e-10, + "loss": 0.63861334, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 16521, + "time_per_iteration": 2.5684220790863037 + }, + { + "auxiliary_loss_clip": 0.01103581, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02342343, + "balance_loss_mlp": 1.03605843, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.6476989131279343, + "language_loss": 0.74009991, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76149142, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 16522, + "time_per_iteration": 2.436829090118408 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.01750588, + "balance_loss_mlp": 1.03452504, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.4777823452126528, + "language_loss": 0.7283901, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.74966204, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 16523, + "time_per_iteration": 2.479238748550415 + }, + { + "auxiliary_loss_clip": 0.01097479, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01968598, + "balance_loss_mlp": 1.0324173, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.53854867890714, + "language_loss": 0.70717901, + "learning_rate": 4.422837480875241e-10, + "loss": 0.72846133, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 16524, + "time_per_iteration": 2.444234609603882 + }, + { + "auxiliary_loss_clip": 0.01099088, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01931524, + "balance_loss_mlp": 1.03416014, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.790482995534708, + "language_loss": 0.79615587, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81745458, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 16525, + "time_per_iteration": 2.4063704013824463 + }, + { + "auxiliary_loss_clip": 0.01098221, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.01411915, + "balance_loss_mlp": 1.03453732, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 1.733872474173661, + "language_loss": 0.74672413, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.76796424, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 16526, + "time_per_iteration": 2.438570499420166 + }, + { + "auxiliary_loss_clip": 0.01095722, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.01614881, + "balance_loss_mlp": 1.03288567, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 2.7286855441513405, + "language_loss": 0.72363502, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74486423, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 16527, + "time_per_iteration": 2.5294551849365234 + }, + { + "auxiliary_loss_clip": 0.01099494, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.01776278, + "balance_loss_mlp": 1.03302014, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.11286094081821, + "language_loss": 0.76350486, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78479469, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16528, + "time_per_iteration": 2.3937737941741943 + }, + { + "auxiliary_loss_clip": 0.01100352, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.01672101, + "balance_loss_mlp": 1.03344357, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.2799255140164227, + "language_loss": 0.66841036, + "learning_rate": 4.022808578922898e-10, + "loss": 0.68970561, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 16529, + "time_per_iteration": 2.4863250255584717 + }, + { + "auxiliary_loss_clip": 0.01104753, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01808, + "balance_loss_mlp": 1.03608668, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.9215652266283447, + "language_loss": 0.65546691, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.6768297, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 16530, + "time_per_iteration": 2.4260799884796143 + }, + { + "auxiliary_loss_clip": 0.01101104, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.01732826, + "balance_loss_mlp": 1.03536391, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.3806943394415585, + "language_loss": 0.71338522, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73467672, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 16531, + "time_per_iteration": 2.4470114707946777 + }, + { + "auxiliary_loss_clip": 0.01102468, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.01645386, + "balance_loss_mlp": 1.03573895, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.6847640127704915, + "language_loss": 0.74276376, + "learning_rate": 3.791890207045512e-10, + "loss": 0.7640723, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16532, + "time_per_iteration": 2.4952268600463867 + }, + { + "auxiliary_loss_clip": 0.01093194, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.01831806, + "balance_loss_mlp": 1.03290677, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.6183140191849457, + "language_loss": 0.70227963, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72349823, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6015625, + "step": 16533, + "time_per_iteration": 2.4223129749298096 + }, + { + "auxiliary_loss_clip": 0.01100959, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01636457, + "balance_loss_mlp": 1.03366101, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 2.495911763822692, + "language_loss": 0.84326804, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86456203, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 16534, + "time_per_iteration": 2.421475410461426 + }, + { + "auxiliary_loss_clip": 0.01093977, + "auxiliary_loss_mlp": 0.01026367, + "balance_loss_clip": 1.01544189, + "balance_loss_mlp": 1.03271604, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.3811361665058717, + "language_loss": 0.65835977, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67956328, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.61328125, + "step": 16535, + "time_per_iteration": 2.4900805950164795 + }, + { + "auxiliary_loss_clip": 0.01098474, + "auxiliary_loss_mlp": 0.01026792, + "balance_loss_clip": 1.01655209, + "balance_loss_mlp": 1.03532779, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.5464040694380152, + "language_loss": 0.64858508, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66983771, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 16536, + "time_per_iteration": 2.4714372158050537 + }, + { + "auxiliary_loss_clip": 0.01097217, + "auxiliary_loss_mlp": 0.01027643, + "balance_loss_clip": 1.01602042, + "balance_loss_mlp": 1.03327465, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 2.6471187803341554, + "language_loss": 0.78560811, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80685669, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 16537, + "time_per_iteration": 2.4144296646118164 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.0181433, + "balance_loss_mlp": 1.03436911, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.5433486041621642, + "language_loss": 0.68369782, + "learning_rate": 3.35052651107004e-10, + "loss": 0.70504093, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 16538, + "time_per_iteration": 2.524678945541382 + }, + { + "auxiliary_loss_clip": 0.01094358, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.01909792, + "balance_loss_mlp": 1.03162956, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.8615400061160121, + "language_loss": 0.75088692, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.7721284, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 16539, + "time_per_iteration": 2.646847724914551 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.01966715, + "balance_loss_mlp": 1.03417706, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.7147810890236146, + "language_loss": 0.70484149, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72615063, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 16540, + "time_per_iteration": 2.4886109828948975 + }, + { + "auxiliary_loss_clip": 0.0109585, + "auxiliary_loss_mlp": 0.0102303, + "balance_loss_clip": 1.01286149, + "balance_loss_mlp": 1.03206122, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 2.939680166237685, + "language_loss": 0.75353402, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77472281, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.63671875, + "step": 16541, + "time_per_iteration": 2.484328508377075 + }, + { + "auxiliary_loss_clip": 0.01098166, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.02246594, + "balance_loss_mlp": 1.03233027, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.7114515319062655, + "language_loss": 0.76576352, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78708696, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 16542, + "time_per_iteration": 2.5461788177490234 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.01534152, + "balance_loss_mlp": 1.03482652, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.0429647911980595, + "language_loss": 0.74317372, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76447541, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 16543, + "time_per_iteration": 2.449420690536499 + }, + { + "auxiliary_loss_clip": 0.01101837, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.01610529, + "balance_loss_mlp": 1.03407598, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.3037711031230663, + "language_loss": 0.81437778, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.83567894, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 16544, + "time_per_iteration": 2.399273157119751 + }, + { + "auxiliary_loss_clip": 0.01097832, + "auxiliary_loss_mlp": 0.01028072, + "balance_loss_clip": 1.0168128, + "balance_loss_mlp": 1.03306675, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.8738786872300168, + "language_loss": 0.78694546, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80820447, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 16545, + "time_per_iteration": 2.5227584838867188 + }, + { + "auxiliary_loss_clip": 0.01102736, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01636696, + "balance_loss_mlp": 1.03555512, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.6947418455255971, + "language_loss": 0.72397494, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74528217, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 16546, + "time_per_iteration": 2.496741533279419 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.02099562, + "balance_loss_mlp": 1.03320479, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 1.8069969669289252, + "language_loss": 0.77381766, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79510397, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 16547, + "time_per_iteration": 2.4704644680023193 + }, + { + "auxiliary_loss_clip": 0.01099595, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.01754653, + "balance_loss_mlp": 1.0343287, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.307915611679892, + "language_loss": 0.69766366, + "learning_rate": 2.67558262122769e-10, + "loss": 0.71894336, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 16548, + "time_per_iteration": 2.475226879119873 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.01779294, + "balance_loss_mlp": 1.03472638, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.796397865727554, + "language_loss": 0.75069898, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77198833, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 16549, + "time_per_iteration": 2.4197475910186768 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.0167048, + "balance_loss_mlp": 1.03556645, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.530797233853168, + "language_loss": 0.74324614, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76456344, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 16550, + "time_per_iteration": 2.498002529144287 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.01025093, + "balance_loss_clip": 1.01437664, + "balance_loss_mlp": 1.0334301, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.8793121441998941, + "language_loss": 0.77961928, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80085254, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 16551, + "time_per_iteration": 2.436533212661743 + }, + { + "auxiliary_loss_clip": 0.01093554, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.01789141, + "balance_loss_mlp": 1.03328931, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.3682339429908787, + "language_loss": 0.6663608, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68757761, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 16552, + "time_per_iteration": 2.432891368865967 + }, + { + "auxiliary_loss_clip": 0.01101166, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.0161804, + "balance_loss_mlp": 1.0346899, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.3823867523664939, + "language_loss": 0.81442159, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.835711, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 16553, + "time_per_iteration": 2.5027167797088623 + }, + { + "auxiliary_loss_clip": 0.0102153, + "auxiliary_loss_mlp": 0.01000995, + "balance_loss_clip": 1.0, + "balance_loss_mlp": 1.00160635, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.72272292860588, + "language_loss": 0.57358015, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59380531, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 16554, + "time_per_iteration": 4.518311500549316 + }, + { + "auxiliary_loss_clip": 0.01099816, + "auxiliary_loss_mlp": 0.01027983, + "balance_loss_clip": 1.01674795, + "balance_loss_mlp": 1.0338006, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.9148999032298457, + "language_loss": 0.76987743, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79115546, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 16555, + "time_per_iteration": 2.476464033126831 + }, + { + "auxiliary_loss_clip": 0.01098218, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.0180037, + "balance_loss_mlp": 1.0328294, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 1.7456669794456254, + "language_loss": 0.85952592, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88080448, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16556, + "time_per_iteration": 5.243689060211182 + }, + { + "auxiliary_loss_clip": 0.01097284, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.01753592, + "balance_loss_mlp": 1.03397655, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.9763730454405837, + "language_loss": 0.73122305, + "learning_rate": 2.132967729762125e-10, + "loss": 0.7524879, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6328125, + "step": 16557, + "time_per_iteration": 2.430027484893799 + }, + { + "auxiliary_loss_clip": 0.01098502, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.01929998, + "balance_loss_mlp": 1.03515077, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.817461567879454, + "language_loss": 0.76426727, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78555375, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 16558, + "time_per_iteration": 2.5219368934631348 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 1.03307915, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 2.2312910836598854, + "language_loss": 0.63569021, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.6569941, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 16559, + "time_per_iteration": 2.5117738246917725 + }, + { + "auxiliary_loss_clip": 0.01097276, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.01383388, + "balance_loss_mlp": 1.03360701, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 1.7426870102822973, + "language_loss": 0.73885131, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76007164, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 16560, + "time_per_iteration": 2.4488108158111572 + }, + { + "auxiliary_loss_clip": 0.0109778, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.01738095, + "balance_loss_mlp": 1.03397381, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.7983304898046564, + "language_loss": 0.78763914, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.80890214, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16561, + "time_per_iteration": 3.9634294509887695 + }, + { + "auxiliary_loss_clip": 0.01096518, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.01694107, + "balance_loss_mlp": 1.03466511, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 3.643161069547379, + "language_loss": 0.65290403, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67414606, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 16562, + "time_per_iteration": 2.406337022781372 + }, + { + "auxiliary_loss_clip": 0.01102342, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02503276, + "balance_loss_mlp": 1.03487086, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.7113343441863529, + "language_loss": 0.64638877, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66779101, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 16563, + "time_per_iteration": 2.5361878871917725 + }, + { + "auxiliary_loss_clip": 0.01096492, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01859283, + "balance_loss_mlp": 1.03311706, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.5892081199071135, + "language_loss": 0.64616358, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66743422, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6328125, + "step": 16564, + "time_per_iteration": 2.5013222694396973 + }, + { + "auxiliary_loss_clip": 0.01097598, + "auxiliary_loss_mlp": 0.0102463, + "balance_loss_clip": 1.01393127, + "balance_loss_mlp": 1.03449404, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.8072037893855308, + "language_loss": 0.74071467, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76193696, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 16565, + "time_per_iteration": 2.428020715713501 + }, + { + "auxiliary_loss_clip": 0.01098477, + "auxiliary_loss_mlp": 0.01027986, + "balance_loss_clip": 1.01667905, + "balance_loss_mlp": 1.03254855, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.7652422017737324, + "language_loss": 0.79023802, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.8115027, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 16566, + "time_per_iteration": 2.4422249794006348 + }, + { + "auxiliary_loss_clip": 0.01097067, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.01725245, + "balance_loss_mlp": 1.0328474, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.6111958794194645, + "language_loss": 0.70903325, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.73028171, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 16567, + "time_per_iteration": 2.4520092010498047 + }, + { + "auxiliary_loss_clip": 0.01102031, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.01803339, + "balance_loss_mlp": 1.03490436, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.148548690092107, + "language_loss": 0.78551197, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.80683523, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 16568, + "time_per_iteration": 2.478513717651367 + }, + { + "auxiliary_loss_clip": 0.01095234, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.01617098, + "balance_loss_mlp": 1.03350139, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.7226250915847214, + "language_loss": 0.81869441, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.83990985, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6171875, + "step": 16569, + "time_per_iteration": 2.458204507827759 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.01803255, + "balance_loss_mlp": 1.03522015, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.6857476071497695, + "language_loss": 0.70389342, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72517037, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 16570, + "time_per_iteration": 2.4504377841949463 + }, + { + "auxiliary_loss_clip": 0.01098766, + "auxiliary_loss_mlp": 0.01026096, + "balance_loss_clip": 1.01469421, + "balance_loss_mlp": 1.0342046, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.498051028683254, + "language_loss": 0.74896741, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77021599, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 16571, + "time_per_iteration": 2.4885144233703613 + }, + { + "auxiliary_loss_clip": 0.01098144, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.018659, + "balance_loss_mlp": 1.0332427, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 4.293039734836271, + "language_loss": 0.79286802, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81415516, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 16572, + "time_per_iteration": 2.422837495803833 + }, + { + "auxiliary_loss_clip": 0.01098765, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.01985526, + "balance_loss_mlp": 1.03418255, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.7068316194851922, + "language_loss": 0.70099813, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72229803, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 16573, + "time_per_iteration": 2.474600076675415 + }, + { + "auxiliary_loss_clip": 0.0110281, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.02057242, + "balance_loss_mlp": 1.03448069, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 3.1405745847261892, + "language_loss": 0.63359118, + "learning_rate": 1.275618614968721e-10, + "loss": 0.6549474, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 16574, + "time_per_iteration": 2.49106502532959 + }, + { + "auxiliary_loss_clip": 0.01105742, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01886439, + "balance_loss_mlp": 1.03643692, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.227500988407702, + "language_loss": 0.76397538, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78534675, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 16575, + "time_per_iteration": 2.4262399673461914 + }, + { + "auxiliary_loss_clip": 0.01098555, + "auxiliary_loss_mlp": 0.01026648, + "balance_loss_clip": 1.01506138, + "balance_loss_mlp": 1.03410196, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 4.577550443890641, + "language_loss": 0.70150673, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72275877, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 16576, + "time_per_iteration": 2.451935291290283 + }, + { + "auxiliary_loss_clip": 0.01096621, + "auxiliary_loss_mlp": 0.01025454, + "balance_loss_clip": 1.01396775, + "balance_loss_mlp": 1.03349376, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.6323404485871098, + "language_loss": 0.71913862, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74035937, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.62890625, + "step": 16577, + "time_per_iteration": 2.4703454971313477 + }, + { + "auxiliary_loss_clip": 0.01097745, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.02063036, + "balance_loss_mlp": 1.03272855, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 2.0651068777650927, + "language_loss": 0.78223175, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80352592, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16578, + "time_per_iteration": 2.426480770111084 + }, + { + "auxiliary_loss_clip": 0.01099677, + "auxiliary_loss_mlp": 0.01027559, + "balance_loss_clip": 1.01651478, + "balance_loss_mlp": 1.03442752, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.8452406195625735, + "language_loss": 0.76049864, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.78177106, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 16579, + "time_per_iteration": 2.432826042175293 + }, + { + "auxiliary_loss_clip": 0.01102874, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.02306962, + "balance_loss_mlp": 1.03675032, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 2.4609323658511135, + "language_loss": 0.69146717, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71286243, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.66015625, + "step": 16580, + "time_per_iteration": 2.581434726715088 + }, + { + "auxiliary_loss_clip": 0.01098839, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.01583827, + "balance_loss_mlp": 1.03371871, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.8507015089446737, + "language_loss": 0.79869235, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81994891, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16581, + "time_per_iteration": 2.494203567504883 + }, + { + "auxiliary_loss_clip": 0.01096077, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.01657975, + "balance_loss_mlp": 1.03305769, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.767690643264238, + "language_loss": 0.80186617, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82309449, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 16582, + "time_per_iteration": 2.5011415481567383 + }, + { + "auxiliary_loss_clip": 0.0109477, + "auxiliary_loss_mlp": 0.0102737, + "balance_loss_clip": 1.01649189, + "balance_loss_mlp": 1.03291821, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.569716612794735, + "language_loss": 0.60461831, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62583971, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6171875, + "step": 16583, + "time_per_iteration": 2.4676523208618164 + }, + { + "auxiliary_loss_clip": 0.01099003, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.02270365, + "balance_loss_mlp": 1.03365004, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.7140467406862439, + "language_loss": 0.77781087, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79914105, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 16584, + "time_per_iteration": 2.4774999618530273 + }, + { + "auxiliary_loss_clip": 0.01094708, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.01621103, + "balance_loss_mlp": 1.03186727, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.4787158618998437, + "language_loss": 0.69567794, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71689224, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 16585, + "time_per_iteration": 2.44138240814209 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.018399, + "balance_loss_mlp": 1.03426003, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.2707359935417797, + "language_loss": 0.81493002, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83620816, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 16586, + "time_per_iteration": 2.421546697616577 + }, + { + "auxiliary_loss_clip": 0.01099853, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.01990139, + "balance_loss_mlp": 1.03359437, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.5732795893174074, + "language_loss": 0.778898, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80020607, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 16587, + "time_per_iteration": 2.4762072563171387 + }, + { + "auxiliary_loss_clip": 0.01102564, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.02090156, + "balance_loss_mlp": 1.03507805, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.9015065345921054, + "language_loss": 0.72213399, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74348676, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 16588, + "time_per_iteration": 2.4634549617767334 + }, + { + "auxiliary_loss_clip": 0.01104899, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.01331103, + "balance_loss_mlp": 1.03560162, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.669215149095081, + "language_loss": 0.82404584, + "learning_rate": 7.011385585031781e-11, + "loss": 0.8453486, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 16589, + "time_per_iteration": 2.4119436740875244 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.02065957, + "balance_loss_mlp": 1.0352869, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 1.9948573332908004, + "language_loss": 0.70658422, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72795141, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16590, + "time_per_iteration": 2.4617862701416016 + }, + { + "auxiliary_loss_clip": 0.01102056, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.01451993, + "balance_loss_mlp": 1.03504336, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.7363819209230626, + "language_loss": 0.63469762, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65598178, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 16591, + "time_per_iteration": 2.520002841949463 + }, + { + "auxiliary_loss_clip": 0.0109632, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.01592362, + "balance_loss_mlp": 1.0312531, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 2.943724599512804, + "language_loss": 0.7296713, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75090384, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16592, + "time_per_iteration": 2.5603761672973633 + }, + { + "auxiliary_loss_clip": 0.01100374, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01680732, + "balance_loss_mlp": 1.03396702, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.5542464002042724, + "language_loss": 0.85096574, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87225676, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 16593, + "time_per_iteration": 2.429603099822998 + }, + { + "auxiliary_loss_clip": 0.01095115, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.01640582, + "balance_loss_mlp": 1.03307366, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.6719903303496852, + "language_loss": 0.69481122, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71603376, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62109375, + "step": 16594, + "time_per_iteration": 2.43540620803833 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.01338243, + "balance_loss_mlp": 1.03504872, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.1178675700771166, + "language_loss": 0.72752357, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74879265, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 16595, + "time_per_iteration": 3.8551931381225586 + }, + { + "auxiliary_loss_clip": 0.01021727, + "auxiliary_loss_mlp": 0.01002197, + "balance_loss_clip": 1.00119519, + "balance_loss_mlp": 1.00168085, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7875605849663777, + "language_loss": 0.60373664, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62397587, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20117188, + "step": 16596, + "time_per_iteration": 2.9350359439849854 + }, + { + "auxiliary_loss_clip": 0.01099895, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.0180645, + "balance_loss_mlp": 1.03539467, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 3.029028334744603, + "language_loss": 0.77209026, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79338735, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 16597, + "time_per_iteration": 3.942023515701294 + }, + { + "auxiliary_loss_clip": 0.01099625, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.01690507, + "balance_loss_mlp": 1.03322697, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 2.635377639422332, + "language_loss": 0.82367396, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84495604, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 16598, + "time_per_iteration": 3.864666700363159 + }, + { + "auxiliary_loss_clip": 0.011023, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.02425003, + "balance_loss_mlp": 1.03410494, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.796907251913606, + "language_loss": 0.65109944, + "learning_rate": 4.129484715709175e-11, + "loss": 0.67249256, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16599, + "time_per_iteration": 2.4528656005859375 + }, + { + "auxiliary_loss_clip": 0.01021765, + "auxiliary_loss_mlp": 0.0100066, + "balance_loss_clip": 0.99964696, + "balance_loss_mlp": 1.00174737, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8527082276246827, + "language_loss": 0.62352717, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64375138, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20019531, + "step": 16600, + "time_per_iteration": 2.9641520977020264 + }, + { + "auxiliary_loss_clip": 0.01098012, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01810217, + "balance_loss_mlp": 1.03394604, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.8751595217258485, + "language_loss": 0.78435218, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80561531, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 16601, + "time_per_iteration": 2.4237499237060547 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01854587, + "balance_loss_mlp": 1.0346154, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 1.7881542375847135, + "language_loss": 0.82285678, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84418672, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 16602, + "time_per_iteration": 2.4423704147338867 + }, + { + "auxiliary_loss_clip": 0.01101136, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.02311897, + "balance_loss_mlp": 1.03557825, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.4534705060674966, + "language_loss": 0.62488025, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64623755, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16603, + "time_per_iteration": 3.9773411750793457 + }, + { + "auxiliary_loss_clip": 0.01099863, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.01466465, + "balance_loss_mlp": 1.0336225, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 2.745471042635087, + "language_loss": 0.71030104, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73156202, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 16604, + "time_per_iteration": 2.465850353240967 + }, + { + "auxiliary_loss_clip": 0.01097913, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.01399732, + "balance_loss_mlp": 1.03297126, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.5031183797260619, + "language_loss": 0.64503157, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66627014, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 16605, + "time_per_iteration": 2.513810873031616 + }, + { + "auxiliary_loss_clip": 0.01096491, + "auxiliary_loss_mlp": 0.01025316, + "balance_loss_clip": 1.01468289, + "balance_loss_mlp": 1.03334665, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.7165174426414616, + "language_loss": 0.71259665, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73381472, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 16606, + "time_per_iteration": 2.466052770614624 + }, + { + "auxiliary_loss_clip": 0.01100332, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01647663, + "balance_loss_mlp": 1.03490186, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 2.2852105791991284, + "language_loss": 0.81897211, + "learning_rate": 2.370001590090709e-11, + "loss": 0.84025532, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 16607, + "time_per_iteration": 2.420513868331909 + }, + { + "auxiliary_loss_clip": 0.01098993, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.01795566, + "balance_loss_mlp": 1.03150964, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.5818378676370355, + "language_loss": 0.67044789, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69173878, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 16608, + "time_per_iteration": 2.5308494567871094 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.01673532, + "balance_loss_mlp": 1.03548384, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 1.9858303042603545, + "language_loss": 0.80386388, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82515574, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 16609, + "time_per_iteration": 2.4074668884277344 + }, + { + "auxiliary_loss_clip": 0.01098865, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.02219498, + "balance_loss_mlp": 1.03368092, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.4407637215037619, + "language_loss": 0.625763, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.64708972, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 16610, + "time_per_iteration": 2.4297850131988525 + }, + { + "auxiliary_loss_clip": 0.01098855, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.02262878, + "balance_loss_mlp": 1.03305161, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.0766037550542165, + "language_loss": 0.67106199, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69238824, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 16611, + "time_per_iteration": 2.44804310798645 + }, + { + "auxiliary_loss_clip": 0.01098691, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.02062511, + "balance_loss_mlp": 1.03359318, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.3989424316298207, + "language_loss": 0.69802946, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71934032, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 16612, + "time_per_iteration": 2.50903582572937 + }, + { + "auxiliary_loss_clip": 0.01096405, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.01591265, + "balance_loss_mlp": 1.03385317, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.4328824978933166, + "language_loss": 0.74061179, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76184332, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 16613, + "time_per_iteration": 2.4886481761932373 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.01697528, + "balance_loss_mlp": 1.03376675, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 1.906372378951036, + "language_loss": 0.73438096, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.7556749, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 16614, + "time_per_iteration": 2.400599718093872 + }, + { + "auxiliary_loss_clip": 0.01098843, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01953483, + "balance_loss_mlp": 1.03430223, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 2.131115079088909, + "language_loss": 0.72789717, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.74919045, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 16615, + "time_per_iteration": 2.446946382522583 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.01719022, + "balance_loss_mlp": 1.03495038, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 2.2411165544017155, + "language_loss": 0.77020514, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79151082, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 16616, + "time_per_iteration": 2.412311553955078 + }, + { + "auxiliary_loss_clip": 0.01099813, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.01558042, + "balance_loss_mlp": 1.03469288, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 1.899199296891262, + "language_loss": 0.83130789, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85257542, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 16617, + "time_per_iteration": 2.410187005996704 + }, + { + "auxiliary_loss_clip": 0.01098748, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.01609123, + "balance_loss_mlp": 1.03413057, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.546516443425981, + "language_loss": 0.78751385, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80877197, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16618, + "time_per_iteration": 2.492341995239258 + }, + { + "auxiliary_loss_clip": 0.01096564, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.01755726, + "balance_loss_mlp": 1.0325985, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.6586785536525817, + "language_loss": 0.75025094, + "learning_rate": 6.408493534060255e-12, + "loss": 0.77150214, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 16619, + "time_per_iteration": 2.425902843475342 + }, + { + "auxiliary_loss_clip": 0.0109568, + "auxiliary_loss_mlp": 0.01024454, + "balance_loss_clip": 1.01400542, + "balance_loss_mlp": 1.03293276, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 2.192896469394689, + "language_loss": 0.86634326, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88754463, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.625, + "step": 16620, + "time_per_iteration": 2.4410362243652344 + }, + { + "auxiliary_loss_clip": 0.01094412, + "auxiliary_loss_mlp": 0.01023895, + "balance_loss_clip": 1.01315451, + "balance_loss_mlp": 1.03186941, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.495339856007808, + "language_loss": 0.72616214, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74734521, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 16621, + "time_per_iteration": 2.4640777111053467 + }, + { + "auxiliary_loss_clip": 0.01021492, + "auxiliary_loss_mlp": 0.01001851, + "balance_loss_clip": 1.00083733, + "balance_loss_mlp": 1.00158083, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.742951217082793, + "language_loss": 0.56556338, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58579683, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 16622, + "time_per_iteration": 3.2356338500976562 + }, + { + "auxiliary_loss_clip": 0.01099663, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.01551473, + "balance_loss_mlp": 1.03389001, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 1.8117566744046234, + "language_loss": 0.71488571, + "learning_rate": 3.071527340914315e-12, + "loss": 0.736148, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 16623, + "time_per_iteration": 2.4421582221984863 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.0171864, + "balance_loss_mlp": 1.03373384, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 2.5657378240797284, + "language_loss": 0.75026071, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.77153361, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.640625, + "step": 16624, + "time_per_iteration": 2.433236598968506 + }, + { + "auxiliary_loss_clip": 0.01097329, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01688886, + "balance_loss_mlp": 1.03219914, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.446379729117076, + "language_loss": 0.73516172, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75642407, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 16625, + "time_per_iteration": 2.4915857315063477 + }, + { + "auxiliary_loss_clip": 0.01095797, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.02048731, + "balance_loss_mlp": 1.03293371, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.480327678913786, + "language_loss": 0.76776922, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.78904152, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 16626, + "time_per_iteration": 2.4423091411590576 + }, + { + "auxiliary_loss_clip": 0.01097122, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.02014768, + "balance_loss_mlp": 1.03396559, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.7033964365476697, + "language_loss": 0.82272637, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84401143, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 16627, + "time_per_iteration": 2.486177444458008 + }, + { + "auxiliary_loss_clip": 0.0110276, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.02122557, + "balance_loss_mlp": 1.03508389, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.0058857890661663, + "language_loss": 0.71033239, + "learning_rate": 6.067215747584952e-13, + "loss": 0.73169816, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 16628, + "time_per_iteration": 2.4887261390686035 + }, + { + "auxiliary_loss_clip": 0.01099628, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.01746821, + "balance_loss_mlp": 1.03319097, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.6302666729818955, + "language_loss": 0.7536037, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77489209, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 16629, + "time_per_iteration": 2.4460158348083496 + }, + { + "auxiliary_loss_clip": 0.01102553, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.01974154, + "balance_loss_mlp": 1.03498006, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 2.824647811709247, + "language_loss": 0.60427022, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62561107, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 16630, + "time_per_iteration": 2.4238274097442627 + }, + { + "auxiliary_loss_clip": 0.01100925, + "auxiliary_loss_mlp": 0.0102694, + "balance_loss_clip": 1.01544178, + "balance_loss_mlp": 1.03468835, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 2.316151523286849, + "language_loss": 0.60503012, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62630868, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16631, + "time_per_iteration": 2.4369335174560547 + }, + { + "auxiliary_loss_clip": 0.0109617, + "auxiliary_loss_mlp": 0.01023912, + "balance_loss_clip": 1.01379728, + "balance_loss_mlp": 1.03328824, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.9730275609263277, + "language_loss": 0.72405601, + "learning_rate": 0.0, + "loss": 0.74525678, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62890625, + "step": 16632, + "time_per_iteration": 2.38352370262146 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992168911420785e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/training_args.bin b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b9a73eb97a1ef37776f0d97a0590d802e6f8d5a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a0c59c7a64d6e018f6d41a91f3e718772a260e91597586a7ce64cd9f7d3d0c6 +size 7992 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/zero_to_fp32.py b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-16632/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/added_tokens.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97409ed874967d8d79c126c028d286e8fe8e1484 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/generation_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/latest b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/latest new file mode 100644 index 0000000000000000000000000000000000000000..2c27d5aabecd1a20f5d8e01a05251ed2cf0a7fec --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/latest @@ -0,0 +1 @@ +global_step3328 \ No newline at end of file diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00001-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..fc4e6b43cd92544add4ba1a627e935caafe3fa01 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c102003233b4fe2d4bf67e0f73bfe5c14e428178b49f45f520b2d7ceac939104 +size 4972489328 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00002-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c65be2592f2dfcd25b89eb350c9e96b232528bd9 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5f5636ae30d8869b2cf01736842b5217e026a53d2f1ad2ad324492e4d48911a +size 4985529648 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00003-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aa2ee65df33a47c4ab5def8ac4398aaeca1d1c94 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81b1e2f1b252ccb038d0c7727c6b6fb8d4d122aaf2a8b4eda41e8bd7693792aa +size 248943552 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model.safetensors.index.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..aa54419fc0a3eab502aa7c4ad974dca52ed10803 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10206819456 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_0.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1e6773e944015af0e83161fa2d20fe7d469fd7f --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22271cc36f268c0b3e870b3930ac590fd40a4a3cd3a88aed74f78e5f8790aceb +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_1.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a834a7be015ebd36883cec3bb92a8657936cd0a6 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19435e9be5d4b837d96fc2e9286e23e27344bb6ad3222ef1b9d207e6b2bb8c78 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_2.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f1b991258d274ff5481ace768d5b6702d919d50 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2902ec572b1b2f1a6a78f8979353bf31953eacdc78b129cc34a9f04c1de9b8d5 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_3.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee742fbd21912a77c2d25fe5ca60af4403668637 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a011e80ba323d1fcabf31eaea4d2bc397efadb23603b4248f0067ff8ca3987 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/special_tokens_map.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/tokenizer.model b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/tokenizer_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/trainer_state.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d4582dfd390b69d4f782ef8b95717b303397cb40 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/trainer_state.json @@ -0,0 +1,56609 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.20009018487900196, + "eval_steps": 500, + "global_step": 3328, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05015663, + "auxiliary_loss_mlp": 0.02215404, + "balance_loss_clip": 1.76946592, + "balance_loss_mlp": 2.42247009, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.31846269900138, + "language_loss": 2.84849024, + "learning_rate": 0.0, + "loss": 1.94356799, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 26.0, + "step": 1, + "time_per_iteration": 14.062297821044922 + }, + { + "auxiliary_loss_clip": 0.03371575, + "auxiliary_loss_mlp": 0.01459085, + "balance_loss_clip": 1.18919563, + "balance_loss_mlp": 1.61943495, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.71678092445231, + "language_loss": 1.82690942, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87521601, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 17.5, + "step": 2, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.03311525, + "auxiliary_loss_mlp": 0.014397, + "balance_loss_clip": 1.18697679, + "balance_loss_mlp": 1.61685562, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 34.59102075188436, + "language_loss": 1.57529902, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62281132, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 17.0, + "step": 3, + "time_per_iteration": 2.4145541191101074 + }, + { + "auxiliary_loss_clip": 0.03353861, + "auxiliary_loss_mlp": 0.01449549, + "balance_loss_clip": 1.15390992, + "balance_loss_mlp": 1.61571431, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.728740512395206, + "language_loss": 1.67595887, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72399294, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.375, + "step": 4, + "time_per_iteration": 2.466392993927002 + }, + { + "auxiliary_loss_clip": 0.03393634, + "auxiliary_loss_mlp": 0.01505687, + "balance_loss_clip": 1.21710527, + "balance_loss_mlp": 1.61638641, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.74196654651921, + "language_loss": 1.90851176, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95750499, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 17.75, + "step": 5, + "time_per_iteration": 2.6828246116638184 + }, + { + "auxiliary_loss_clip": 0.03361898, + "auxiliary_loss_mlp": 0.01518906, + "balance_loss_clip": 1.22441149, + "balance_loss_mlp": 1.60614848, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.32400799743486, + "language_loss": 1.6094954, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6583035, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.5, + "step": 6, + "time_per_iteration": 2.660855770111084 + }, + { + "auxiliary_loss_clip": 0.03345758, + "auxiliary_loss_mlp": 0.01485904, + "balance_loss_clip": 1.20209074, + "balance_loss_mlp": 1.60783124, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.76365346454933, + "language_loss": 1.53346825, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58178496, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.375, + "step": 7, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.03312894, + "auxiliary_loss_mlp": 0.01444018, + "balance_loss_clip": 1.16630852, + "balance_loss_mlp": 1.60320723, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.923588970831496, + "language_loss": 1.43687642, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48444545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 17.0, + "step": 8, + "time_per_iteration": 2.779961109161377 + }, + { + "auxiliary_loss_clip": 0.03360351, + "auxiliary_loss_mlp": 0.01496215, + "balance_loss_clip": 1.21144783, + "balance_loss_mlp": 1.60258842, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.084887526361417, + "language_loss": 1.49955618, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54812181, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.5, + "step": 9, + "time_per_iteration": 2.799635887145996 + }, + { + "auxiliary_loss_clip": 0.03302188, + "auxiliary_loss_mlp": 0.01477479, + "balance_loss_clip": 1.20797062, + "balance_loss_mlp": 1.6070832, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.45187310710616, + "language_loss": 1.44727731, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49507403, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 17.0, + "step": 10, + "time_per_iteration": 2.6989152431488037 + }, + { + "auxiliary_loss_clip": 0.03356835, + "auxiliary_loss_mlp": 0.01493566, + "balance_loss_clip": 1.21928966, + "balance_loss_mlp": 1.61121845, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.63867113279811, + "language_loss": 1.45021069, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.4987148, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 17.5, + "step": 11, + "time_per_iteration": 2.6820693016052246 + }, + { + "auxiliary_loss_clip": 0.0328584, + "auxiliary_loss_mlp": 0.01449969, + "balance_loss_clip": 1.17378449, + "balance_loss_mlp": 1.59900761, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.861449854609447, + "language_loss": 1.45122719, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49858522, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 16.875, + "step": 12, + "time_per_iteration": 2.631218910217285 + }, + { + "auxiliary_loss_clip": 0.03313605, + "auxiliary_loss_mlp": 0.01404342, + "balance_loss_clip": 1.14589679, + "balance_loss_mlp": 1.60898232, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.176593153687291, + "language_loss": 1.24100113, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28818083, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 17.125, + "step": 13, + "time_per_iteration": 2.6961779594421387 + }, + { + "auxiliary_loss_clip": 0.03282163, + "auxiliary_loss_mlp": 0.01472629, + "balance_loss_clip": 1.20464635, + "balance_loss_mlp": 1.60534358, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.7580183597057975, + "language_loss": 1.20611417, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25366211, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 16.75, + "step": 14, + "time_per_iteration": 2.6555092334747314 + }, + { + "auxiliary_loss_clip": 0.0326835, + "auxiliary_loss_mlp": 0.01431945, + "balance_loss_clip": 1.16815877, + "balance_loss_mlp": 1.6104542, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.4839782289009085, + "language_loss": 1.12832427, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.1753273, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 16.5, + "step": 15, + "time_per_iteration": 2.717512607574463 + }, + { + "auxiliary_loss_clip": 0.03231722, + "auxiliary_loss_mlp": 0.01412441, + "balance_loss_clip": 1.16257811, + "balance_loss_mlp": 1.59521294, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.584872954405151, + "language_loss": 1.1119349, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15837646, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 16.375, + "step": 16, + "time_per_iteration": 2.7170701026916504 + }, + { + "auxiliary_loss_clip": 0.03220058, + "auxiliary_loss_mlp": 0.0141779, + "balance_loss_clip": 1.17784595, + "balance_loss_mlp": 1.60289145, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.285773165398426, + "language_loss": 1.1253047, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17168307, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 16.125, + "step": 17, + "time_per_iteration": 2.6125564575195312 + }, + { + "auxiliary_loss_clip": 0.0315575, + "auxiliary_loss_mlp": 0.01378857, + "balance_loss_clip": 1.14730477, + "balance_loss_mlp": 1.60051179, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.8094646515897193, + "language_loss": 1.08149433, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12684035, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 15.5625, + "step": 18, + "time_per_iteration": 5.593315362930298 + }, + { + "auxiliary_loss_clip": 0.03181327, + "auxiliary_loss_mlp": 0.01400224, + "balance_loss_clip": 1.13548398, + "balance_loss_mlp": 1.59901524, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.551402579460018, + "language_loss": 1.02296436, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06877995, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 15.8125, + "step": 19, + "time_per_iteration": 2.6462903022766113 + }, + { + "auxiliary_loss_clip": 0.0312444, + "auxiliary_loss_mlp": 0.01341166, + "balance_loss_clip": 1.12096262, + "balance_loss_mlp": 1.60122275, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.049985155187145, + "language_loss": 1.16660511, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21126115, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 15.25, + "step": 20, + "time_per_iteration": 2.687962293624878 + }, + { + "auxiliary_loss_clip": 0.03111088, + "auxiliary_loss_mlp": 0.01380381, + "balance_loss_clip": 1.13109064, + "balance_loss_mlp": 1.58184814, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 8.855966691950416, + "language_loss": 1.06044388, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.1053586, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 15.3125, + "step": 21, + "time_per_iteration": 2.705784320831299 + }, + { + "auxiliary_loss_clip": 0.03006166, + "auxiliary_loss_mlp": 0.0138104, + "balance_loss_clip": 1.14758062, + "balance_loss_mlp": 1.56386232, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.503731577984969, + "language_loss": 1.05752254, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10139465, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 14.4375, + "step": 22, + "time_per_iteration": 2.714902400970459 + }, + { + "auxiliary_loss_clip": 0.02958535, + "auxiliary_loss_mlp": 0.01337723, + "balance_loss_clip": 1.12743819, + "balance_loss_mlp": 1.56545472, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8887485842740657, + "language_loss": 0.91820848, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96117103, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 13.9375, + "step": 23, + "time_per_iteration": 2.6802501678466797 + }, + { + "auxiliary_loss_clip": 0.02925568, + "auxiliary_loss_mlp": 0.0136327, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.55789983, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.888412626700388, + "language_loss": 1.08090949, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12379789, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 13.6875, + "step": 24, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.02818042, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.11892343, + "balance_loss_mlp": 1.55278993, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.5526652768314877, + "language_loss": 1.01197755, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05345201, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 12.6875, + "step": 25, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.02811065, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 1.10196424, + "balance_loss_mlp": 1.55557573, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8866965715457127, + "language_loss": 1.0650332, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10625291, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 12.5625, + "step": 26, + "time_per_iteration": 2.6561954021453857 + }, + { + "auxiliary_loss_clip": 0.02754337, + "auxiliary_loss_mlp": 0.01325989, + "balance_loss_clip": 1.12600398, + "balance_loss_mlp": 1.54593086, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 8.480879524297928, + "language_loss": 0.95465469, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99545801, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 12.0625, + "step": 27, + "time_per_iteration": 2.717332363128662 + }, + { + "auxiliary_loss_clip": 0.02732017, + "auxiliary_loss_mlp": 0.0131313, + "balance_loss_clip": 1.13174081, + "balance_loss_mlp": 1.55085063, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7582152185230338, + "language_loss": 1.06276608, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1032176, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 11.8125, + "step": 28, + "time_per_iteration": 2.6645846366882324 + }, + { + "auxiliary_loss_clip": 0.02698877, + "auxiliary_loss_mlp": 0.01319704, + "balance_loss_clip": 1.1339283, + "balance_loss_mlp": 1.5357703, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.703793609192777, + "language_loss": 1.02653611, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06672192, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 11.625, + "step": 29, + "time_per_iteration": 2.6647088527679443 + }, + { + "auxiliary_loss_clip": 0.02692806, + "auxiliary_loss_mlp": 0.01313595, + "balance_loss_clip": 1.12667465, + "balance_loss_mlp": 1.53252506, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.824163422844594, + "language_loss": 1.1929419, + "learning_rate": 2.189868360711334e-06, + "loss": 1.233006, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 11.625, + "step": 30, + "time_per_iteration": 2.6305816173553467 + }, + { + "auxiliary_loss_clip": 0.02610821, + "auxiliary_loss_mlp": 0.01338782, + "balance_loss_clip": 1.15748882, + "balance_loss_mlp": 1.51829374, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.55861683808779, + "language_loss": 1.02499342, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06448936, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 10.9375, + "step": 31, + "time_per_iteration": 2.71045184135437 + }, + { + "auxiliary_loss_clip": 0.02583705, + "auxiliary_loss_mlp": 0.01332414, + "balance_loss_clip": 1.15245557, + "balance_loss_mlp": 1.52035046, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.526137445187824, + "language_loss": 0.95697796, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99613917, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 10.625, + "step": 32, + "time_per_iteration": 2.626783847808838 + }, + { + "auxiliary_loss_clip": 0.02566919, + "auxiliary_loss_mlp": 0.01304168, + "balance_loss_clip": 1.13670313, + "balance_loss_mlp": 1.51655078, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.344933729659458, + "language_loss": 0.95465255, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99336338, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 10.5, + "step": 33, + "time_per_iteration": 2.645725727081299 + }, + { + "auxiliary_loss_clip": 0.02433039, + "auxiliary_loss_mlp": 0.013041, + "balance_loss_clip": 1.14569449, + "balance_loss_mlp": 1.48877192, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 4.808068329548225, + "language_loss": 0.91556877, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95294011, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 9.4375, + "step": 34, + "time_per_iteration": 2.7327146530151367 + }, + { + "auxiliary_loss_clip": 0.02385913, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.1172576, + "balance_loss_mlp": 1.45172572, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.948252640490764, + "language_loss": 0.76639408, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80298984, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 9.375, + "step": 35, + "time_per_iteration": 2.940739870071411 + }, + { + "auxiliary_loss_clip": 0.02360979, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 1.12769413, + "balance_loss_mlp": 1.46427846, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.1659182072135064, + "language_loss": 0.89043307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92678845, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 8.9375, + "step": 36, + "time_per_iteration": 2.693335771560669 + }, + { + "auxiliary_loss_clip": 0.02305413, + "auxiliary_loss_mlp": 0.01335093, + "balance_loss_clip": 1.18574798, + "balance_loss_mlp": 1.45221901, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.3248653771669416, + "language_loss": 0.93231332, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96871841, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 8.5, + "step": 37, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.02264412, + "auxiliary_loss_mlp": 0.01277806, + "balance_loss_clip": 1.15373349, + "balance_loss_mlp": 1.44697845, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1191864106647906, + "language_loss": 1.04275775, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07817996, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 8.1875, + "step": 38, + "time_per_iteration": 2.674187183380127 + }, + { + "auxiliary_loss_clip": 0.02234117, + "auxiliary_loss_mlp": 0.01257339, + "balance_loss_clip": 1.13164425, + "balance_loss_mlp": 1.44101977, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.2707505194681685, + "language_loss": 0.85635245, + "learning_rate": 2.358792165262154e-06, + "loss": 0.891267, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 7.9375, + "step": 39, + "time_per_iteration": 2.716417074203491 + }, + { + "auxiliary_loss_clip": 0.02209554, + "auxiliary_loss_mlp": 0.01248677, + "balance_loss_clip": 1.1173557, + "balance_loss_mlp": 1.43176007, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.874633531970748, + "language_loss": 0.90416026, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93874258, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 7.78125, + "step": 40, + "time_per_iteration": 2.621108055114746 + }, + { + "auxiliary_loss_clip": 0.02158681, + "auxiliary_loss_mlp": 0.01271709, + "balance_loss_clip": 1.15626693, + "balance_loss_mlp": 1.42207694, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.842521317695652, + "language_loss": 0.93497038, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96927428, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 7.375, + "step": 41, + "time_per_iteration": 2.66089129447937 + }, + { + "auxiliary_loss_clip": 0.0212207, + "auxiliary_loss_mlp": 0.0125263, + "balance_loss_clip": 1.14720106, + "balance_loss_mlp": 1.41368401, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.5963223670672635, + "language_loss": 0.97454929, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00829637, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 7.09375, + "step": 42, + "time_per_iteration": 2.63149094581604 + }, + { + "auxiliary_loss_clip": 0.02082851, + "auxiliary_loss_mlp": 0.01298258, + "balance_loss_clip": 1.18939614, + "balance_loss_mlp": 1.41430426, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.9545418034556814, + "language_loss": 0.97656071, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01037169, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 6.6875, + "step": 43, + "time_per_iteration": 2.7244436740875244 + }, + { + "auxiliary_loss_clip": 0.02102024, + "auxiliary_loss_mlp": 0.01311792, + "balance_loss_clip": 1.19706488, + "balance_loss_mlp": 1.4130851, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0531245010632473, + "language_loss": 0.93701768, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97115582, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 6.875, + "step": 44, + "time_per_iteration": 2.6628317832946777 + }, + { + "auxiliary_loss_clip": 0.02065563, + "auxiliary_loss_mlp": 0.01272457, + "balance_loss_clip": 1.17236853, + "balance_loss_mlp": 1.41084957, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 9.3374631511207, + "language_loss": 0.98937047, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02275062, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 6.5625, + "step": 45, + "time_per_iteration": 2.7355775833129883 + }, + { + "auxiliary_loss_clip": 0.02040064, + "auxiliary_loss_mlp": 0.01227769, + "balance_loss_clip": 1.13831401, + "balance_loss_mlp": 1.39673805, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8055823424878037, + "language_loss": 1.02792716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06060553, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 6.4375, + "step": 46, + "time_per_iteration": 2.7488839626312256 + }, + { + "auxiliary_loss_clip": 0.02002379, + "auxiliary_loss_mlp": 0.01270193, + "balance_loss_clip": 1.17773402, + "balance_loss_mlp": 1.38648152, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.971366079361506, + "language_loss": 0.88043427, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91315997, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 6.15625, + "step": 47, + "time_per_iteration": 2.845005512237549 + }, + { + "auxiliary_loss_clip": 0.01963914, + "auxiliary_loss_mlp": 0.01252908, + "balance_loss_clip": 1.16493094, + "balance_loss_mlp": 1.37624073, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.070099145794898, + "language_loss": 0.87949276, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91166103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 5.875, + "step": 48, + "time_per_iteration": 2.7514398097991943 + }, + { + "auxiliary_loss_clip": 0.01962956, + "auxiliary_loss_mlp": 0.01244481, + "balance_loss_clip": 1.15078259, + "balance_loss_mlp": 1.36602139, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.366138839739612, + "language_loss": 0.89877701, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93085134, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 6.0, + "step": 49, + "time_per_iteration": 2.743236541748047 + }, + { + "auxiliary_loss_clip": 0.01955947, + "auxiliary_loss_mlp": 0.01232227, + "balance_loss_clip": 1.14534748, + "balance_loss_mlp": 1.36045313, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.8158483763506914, + "language_loss": 0.91078663, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94266832, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 5.9375, + "step": 50, + "time_per_iteration": 2.6860456466674805 + }, + { + "auxiliary_loss_clip": 0.01953364, + "auxiliary_loss_mlp": 0.01201227, + "balance_loss_clip": 1.11778045, + "balance_loss_mlp": 1.36547732, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 3.5299735782100026, + "language_loss": 0.87144494, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90299082, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 5.875, + "step": 51, + "time_per_iteration": 2.7481534481048584 + }, + { + "auxiliary_loss_clip": 0.01909154, + "auxiliary_loss_mlp": 0.01207037, + "balance_loss_clip": 1.12707186, + "balance_loss_mlp": 1.35597348, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0262044932375836, + "language_loss": 0.95253396, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98369586, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 5.53125, + "step": 52, + "time_per_iteration": 2.8958797454833984 + }, + { + "auxiliary_loss_clip": 0.01904814, + "auxiliary_loss_mlp": 0.01243661, + "balance_loss_clip": 1.16274214, + "balance_loss_mlp": 1.35173535, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.3193539013945546, + "language_loss": 0.92261833, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95410311, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 5.53125, + "step": 53, + "time_per_iteration": 2.7579286098480225 + }, + { + "auxiliary_loss_clip": 0.01893968, + "auxiliary_loss_mlp": 0.01196907, + "balance_loss_clip": 1.11489081, + "balance_loss_mlp": 1.35535884, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.2021865200163, + "language_loss": 0.82945669, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86036545, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 5.375, + "step": 54, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.01889572, + "auxiliary_loss_mlp": 0.01211293, + "balance_loss_clip": 1.13113666, + "balance_loss_mlp": 1.34359026, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4060188817442487, + "language_loss": 0.81305432, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84406298, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.4375, + "step": 55, + "time_per_iteration": 2.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01879346, + "auxiliary_loss_mlp": 0.01199903, + "balance_loss_clip": 1.11926973, + "balance_loss_mlp": 1.33773279, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.497299649397407, + "language_loss": 0.87261844, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90341091, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.40625, + "step": 56, + "time_per_iteration": 2.7031195163726807 + }, + { + "auxiliary_loss_clip": 0.01879922, + "auxiliary_loss_mlp": 0.01161266, + "balance_loss_clip": 1.0864507, + "balance_loss_mlp": 1.33024335, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.4089458733946882, + "language_loss": 0.92949611, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95990801, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 5.5, + "step": 57, + "time_per_iteration": 2.8580281734466553 + }, + { + "auxiliary_loss_clip": 0.01843074, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_clip": 1.14395308, + "balance_loss_mlp": 1.33453596, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.105168727735643, + "language_loss": 0.99725533, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02785611, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 5.09375, + "step": 58, + "time_per_iteration": 2.687504529953003 + }, + { + "auxiliary_loss_clip": 0.01822907, + "auxiliary_loss_mlp": 0.01195384, + "balance_loss_clip": 1.12319088, + "balance_loss_mlp": 1.32094967, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1328304194940855, + "language_loss": 0.8821373, + "learning_rate": 2.625331386578098e-06, + "loss": 0.9123202, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 5.03125, + "step": 59, + "time_per_iteration": 6.997380495071411 + }, + { + "auxiliary_loss_clip": 0.01844896, + "auxiliary_loss_mlp": 0.01162144, + "balance_loss_clip": 1.08885431, + "balance_loss_mlp": 1.32932925, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.097582115586327, + "language_loss": 0.93430054, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96437097, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 5.15625, + "step": 60, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0182307, + "auxiliary_loss_mlp": 0.01172385, + "balance_loss_clip": 1.10376787, + "balance_loss_mlp": 1.31307459, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 4.241258673484683, + "language_loss": 0.90090871, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93086326, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 5.09375, + "step": 61, + "time_per_iteration": 2.707247257232666 + }, + { + "auxiliary_loss_clip": 0.01806801, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_clip": 1.07475519, + "balance_loss_mlp": 1.31002319, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 3.0487456468745586, + "language_loss": 0.88434047, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9138341, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.96875, + "step": 62, + "time_per_iteration": 2.736107587814331 + }, + { + "auxiliary_loss_clip": 0.01787131, + "auxiliary_loss_mlp": 0.01161947, + "balance_loss_clip": 1.09132755, + "balance_loss_mlp": 1.30018497, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.6509198595432406, + "language_loss": 0.96265876, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99214947, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.875, + "step": 63, + "time_per_iteration": 2.6760194301605225 + }, + { + "auxiliary_loss_clip": 0.01795174, + "auxiliary_loss_mlp": 0.01169703, + "balance_loss_clip": 1.10284996, + "balance_loss_mlp": 1.30725491, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.677484479433752, + "language_loss": 0.99141657, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02106524, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.875, + "step": 64, + "time_per_iteration": 2.675295114517212 + }, + { + "auxiliary_loss_clip": 0.01802087, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.08134842, + "balance_loss_mlp": 1.30652797, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.45939593962701, + "language_loss": 0.85358196, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88309723, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.9375, + "step": 65, + "time_per_iteration": 2.647696018218994 + }, + { + "auxiliary_loss_clip": 0.01779034, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0886445, + "balance_loss_mlp": 1.29322505, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.8561979494145033, + "language_loss": 0.85224223, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88160038, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.875, + "step": 66, + "time_per_iteration": 2.617143392562866 + }, + { + "auxiliary_loss_clip": 0.01782156, + "auxiliary_loss_mlp": 0.01152634, + "balance_loss_clip": 1.07648349, + "balance_loss_mlp": 1.29168975, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.246759082278279, + "language_loss": 0.96454394, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99389184, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 4.90625, + "step": 67, + "time_per_iteration": 2.6343421936035156 + }, + { + "auxiliary_loss_clip": 0.01753238, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.08340704, + "balance_loss_mlp": 1.28524387, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.549207131743101, + "language_loss": 0.94534445, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97443378, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 4.6875, + "step": 68, + "time_per_iteration": 2.614696741104126 + }, + { + "auxiliary_loss_clip": 0.01748377, + "auxiliary_loss_mlp": 0.01156697, + "balance_loss_clip": 1.08717394, + "balance_loss_mlp": 1.28268003, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 1.9922029239060344, + "language_loss": 0.95657748, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98562825, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.65625, + "step": 69, + "time_per_iteration": 2.6637492179870605 + }, + { + "auxiliary_loss_clip": 0.01742428, + "auxiliary_loss_mlp": 0.01160645, + "balance_loss_clip": 1.09598637, + "balance_loss_mlp": 1.2855866, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4176731159017075, + "language_loss": 0.98073572, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00976658, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 4.5625, + "step": 70, + "time_per_iteration": 2.6395556926727295 + }, + { + "auxiliary_loss_clip": 0.01748999, + "auxiliary_loss_mlp": 0.01146397, + "balance_loss_clip": 1.07673144, + "balance_loss_mlp": 1.2760632, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.71386904393857, + "language_loss": 0.93927777, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96823174, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 4.75, + "step": 71, + "time_per_iteration": 2.628272294998169 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01327632, + "balance_loss_clip": 1.28967619, + "balance_loss_mlp": 1.43997037, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4194543250518663, + "language_loss": 0.65655279, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68821681, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 4.0, + "step": 72, + "time_per_iteration": 3.104635000228882 + }, + { + "auxiliary_loss_clip": 0.01820285, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.25824571, + "balance_loss_mlp": 1.43420911, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2482458517722455, + "language_loss": 0.63711512, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66827047, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 3.859375, + "step": 73, + "time_per_iteration": 3.208836793899536 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.07382631, + "balance_loss_mlp": 1.26790953, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.4515337577309424, + "language_loss": 0.85899854, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88765126, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.5625, + "step": 74, + "time_per_iteration": 2.6287550926208496 + }, + { + "auxiliary_loss_clip": 0.01725734, + "auxiliary_loss_mlp": 0.01165418, + "balance_loss_clip": 1.09584761, + "balance_loss_mlp": 1.26750898, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.110493434952054, + "language_loss": 0.9716984, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00060987, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.5625, + "step": 75, + "time_per_iteration": 2.635618209838867 + }, + { + "auxiliary_loss_clip": 0.01704277, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.07875705, + "balance_loss_mlp": 1.26302838, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.60583579179481, + "language_loss": 0.87675405, + "learning_rate": 2.788352117317012e-06, + "loss": 0.9052462, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.4375, + "step": 76, + "time_per_iteration": 2.6379826068878174 + }, + { + "auxiliary_loss_clip": 0.01705571, + "auxiliary_loss_mlp": 0.0114831, + "balance_loss_clip": 1.07845366, + "balance_loss_mlp": 1.26138341, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.9080158042054207, + "language_loss": 0.91751724, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94605613, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.4375, + "step": 77, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01694222, + "auxiliary_loss_mlp": 0.01165235, + "balance_loss_clip": 1.09494948, + "balance_loss_mlp": 1.26167083, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1229280552318803, + "language_loss": 0.92189825, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95049286, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.3125, + "step": 78, + "time_per_iteration": 2.598590850830078 + }, + { + "auxiliary_loss_clip": 0.01690635, + "auxiliary_loss_mlp": 0.01155594, + "balance_loss_clip": 1.08735824, + "balance_loss_mlp": 1.25696921, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.280813483182965, + "language_loss": 0.82480371, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85326606, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 4.34375, + "step": 79, + "time_per_iteration": 2.6215708255767822 + }, + { + "auxiliary_loss_clip": 0.01705122, + "auxiliary_loss_mlp": 0.01133248, + "balance_loss_clip": 1.06315339, + "balance_loss_mlp": 1.26029253, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.4809717100134616, + "language_loss": 0.91311121, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94149494, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.4375, + "step": 80, + "time_per_iteration": 2.639841079711914 + }, + { + "auxiliary_loss_clip": 0.01674552, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.07254159, + "balance_loss_mlp": 1.25350285, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.165091554789383, + "language_loss": 0.94981706, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97799134, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.21875, + "step": 81, + "time_per_iteration": 2.6689717769622803 + }, + { + "auxiliary_loss_clip": 0.01688803, + "auxiliary_loss_mlp": 0.01148831, + "balance_loss_clip": 1.08269382, + "balance_loss_mlp": 1.25745821, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9914678747629226, + "language_loss": 0.96341741, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99179375, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 4.3125, + "step": 82, + "time_per_iteration": 2.629596471786499 + }, + { + "auxiliary_loss_clip": 0.01671229, + "auxiliary_loss_mlp": 0.01159801, + "balance_loss_clip": 1.09013557, + "balance_loss_mlp": 1.24528587, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.533591741594043, + "language_loss": 0.8664127, + "learning_rate": 2.84508017388607e-06, + "loss": 0.894723, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.25, + "step": 83, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01664198, + "auxiliary_loss_mlp": 0.01156919, + "balance_loss_clip": 1.08663368, + "balance_loss_mlp": 1.24647975, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.373799694341511, + "language_loss": 0.91779828, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94600952, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.1875, + "step": 84, + "time_per_iteration": 2.62187123298645 + }, + { + "auxiliary_loss_clip": 0.01645783, + "auxiliary_loss_mlp": 0.01205663, + "balance_loss_clip": 1.17075825, + "balance_loss_mlp": 1.34984684, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4266053341540552, + "language_loss": 0.62504542, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65355992, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.96875, + "step": 85, + "time_per_iteration": 3.190223217010498 + }, + { + "auxiliary_loss_clip": 0.0165122, + "auxiliary_loss_mlp": 0.01127154, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.23674285, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7428139018461835, + "language_loss": 0.90836501, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93614876, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.15625, + "step": 86, + "time_per_iteration": 2.66162109375 + }, + { + "auxiliary_loss_clip": 0.01655877, + "auxiliary_loss_mlp": 0.01161945, + "balance_loss_clip": 1.09065783, + "balance_loss_mlp": 1.24282312, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.38275425723773, + "language_loss": 0.8209877, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.125, + "step": 87, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.01644726, + "auxiliary_loss_mlp": 0.01154792, + "balance_loss_clip": 1.08617568, + "balance_loss_mlp": 1.24127626, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8918921085406437, + "language_loss": 0.95630223, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98429739, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 4.03125, + "step": 88, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01661198, + "auxiliary_loss_mlp": 0.0114963, + "balance_loss_clip": 1.08230066, + "balance_loss_mlp": 1.24101663, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1.9438908009999392, + "language_loss": 0.85920149, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88730979, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.1875, + "step": 89, + "time_per_iteration": 2.6486849784851074 + }, + { + "auxiliary_loss_clip": 0.01648909, + "auxiliary_loss_mlp": 0.01132231, + "balance_loss_clip": 1.06547391, + "balance_loss_mlp": 1.23491406, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 4.519706664825811, + "language_loss": 0.91517568, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94298708, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 4.125, + "step": 90, + "time_per_iteration": 2.658997058868408 + }, + { + "auxiliary_loss_clip": 0.01630542, + "auxiliary_loss_mlp": 0.0113282, + "balance_loss_clip": 1.06496572, + "balance_loss_mlp": 1.23102689, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.2090932400382486, + "language_loss": 0.8587057, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88633931, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 3.984375, + "step": 91, + "time_per_iteration": 2.619231939315796 + }, + { + "auxiliary_loss_clip": 0.01629785, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_clip": 1.07458866, + "balance_loss_mlp": 1.22673059, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.769705373909222, + "language_loss": 0.86930025, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89700729, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.03125, + "step": 92, + "time_per_iteration": 2.646968126296997 + }, + { + "auxiliary_loss_clip": 0.01621216, + "auxiliary_loss_mlp": 0.01179948, + "balance_loss_clip": 1.1122849, + "balance_loss_mlp": 1.21872091, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.5030178409929, + "language_loss": 0.92042911, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94844079, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 4.03125, + "step": 93, + "time_per_iteration": 2.59853196144104 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01152159, + "balance_loss_clip": 1.08120561, + "balance_loss_mlp": 1.22512126, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.2071592078672198, + "language_loss": 0.87372428, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90158784, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.09375, + "step": 94, + "time_per_iteration": 2.587707281112671 + }, + { + "auxiliary_loss_clip": 0.01562532, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.03243279, + "balance_loss_mlp": 1.30452466, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3851210442303683, + "language_loss": 0.6813519, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70765626, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.578125, + "step": 95, + "time_per_iteration": 3.067047595977783 + }, + { + "auxiliary_loss_clip": 0.01611383, + "auxiliary_loss_mlp": 0.01154317, + "balance_loss_clip": 1.08693981, + "balance_loss_mlp": 1.21303511, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.5109536438971976, + "language_loss": 0.89978027, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92743719, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 3.984375, + "step": 96, + "time_per_iteration": 2.590522289276123 + }, + { + "auxiliary_loss_clip": 0.01603776, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.08108413, + "balance_loss_mlp": 1.21597803, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.825781473558237, + "language_loss": 0.89798892, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92545933, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.875, + "step": 97, + "time_per_iteration": 2.630364179611206 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_clip": 1.07103181, + "balance_loss_mlp": 1.20754981, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.1954130163748573, + "language_loss": 0.76553786, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79283404, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.8125, + "step": 98, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01531856, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.01250362, + "balance_loss_mlp": 1.28449416, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0389188302362988, + "language_loss": 0.65464473, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68043554, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.46875, + "step": 99, + "time_per_iteration": 3.196779251098633 + }, + { + "auxiliary_loss_clip": 0.0159215, + "auxiliary_loss_mlp": 0.01143603, + "balance_loss_clip": 1.07312632, + "balance_loss_mlp": 1.20754516, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02393591458392, + "language_loss": 0.90861535, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93597281, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 3.84375, + "step": 100, + "time_per_iteration": 2.659716844558716 + }, + { + "auxiliary_loss_clip": 0.01602583, + "auxiliary_loss_mlp": 0.01150362, + "balance_loss_clip": 1.08360529, + "balance_loss_mlp": 1.21008992, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 9.149928686451464, + "language_loss": 0.91165614, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93918556, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 3.921875, + "step": 101, + "time_per_iteration": 5.522722959518433 + }, + { + "auxiliary_loss_clip": 0.01592164, + "auxiliary_loss_mlp": 0.01153598, + "balance_loss_clip": 1.08273995, + "balance_loss_mlp": 1.21078956, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.149611483260168, + "language_loss": 0.90634245, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.9338001, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.8125, + "step": 102, + "time_per_iteration": 2.7264201641082764 + }, + { + "auxiliary_loss_clip": 0.01586171, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.06986046, + "balance_loss_mlp": 1.20794034, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.4455555336324135, + "language_loss": 0.87990314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.9071129, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 3.78125, + "step": 103, + "time_per_iteration": 2.6332345008850098 + }, + { + "auxiliary_loss_clip": 0.01586169, + "auxiliary_loss_mlp": 0.01136721, + "balance_loss_clip": 1.07015502, + "balance_loss_mlp": 1.2100153, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9653003456434248, + "language_loss": 0.93796182, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96519077, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.765625, + "step": 104, + "time_per_iteration": 2.5763180255889893 + }, + { + "auxiliary_loss_clip": 0.01576682, + "auxiliary_loss_mlp": 0.01148107, + "balance_loss_clip": 1.08382916, + "balance_loss_mlp": 1.20004964, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.978383813748495, + "language_loss": 0.96302718, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99027503, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.765625, + "step": 105, + "time_per_iteration": 2.598074197769165 + }, + { + "auxiliary_loss_clip": 0.01576054, + "auxiliary_loss_mlp": 0.01157995, + "balance_loss_clip": 1.08618331, + "balance_loss_mlp": 1.20040035, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.254409296180574, + "language_loss": 0.86981636, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89715683, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 3.75, + "step": 106, + "time_per_iteration": 2.620400905609131 + }, + { + "auxiliary_loss_clip": 0.01558878, + "auxiliary_loss_mlp": 0.01142953, + "balance_loss_clip": 1.07462192, + "balance_loss_mlp": 1.18650925, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.299900982703377, + "language_loss": 0.8342824, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86130083, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 3.71875, + "step": 107, + "time_per_iteration": 2.6031439304351807 + }, + { + "auxiliary_loss_clip": 0.01473949, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01294351, + "balance_loss_mlp": 1.24969411, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9921074222226888, + "language_loss": 0.64829654, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67348593, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.25, + "step": 108, + "time_per_iteration": 3.1797876358032227 + }, + { + "auxiliary_loss_clip": 0.01549803, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_clip": 1.0634706, + "balance_loss_mlp": 1.18794155, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 3.0292528917398895, + "language_loss": 0.97705221, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00387263, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.625, + "step": 109, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01548304, + "auxiliary_loss_mlp": 0.01143686, + "balance_loss_clip": 1.07759643, + "balance_loss_mlp": 1.18955791, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7037490209774204, + "language_loss": 0.84119976, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86811972, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 110, + "time_per_iteration": 2.612900495529175 + }, + { + "auxiliary_loss_clip": 0.01543027, + "auxiliary_loss_mlp": 0.01148771, + "balance_loss_clip": 1.08287191, + "balance_loss_mlp": 1.18348098, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.0686651571732186, + "language_loss": 0.83053756, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85745549, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 111, + "time_per_iteration": 2.648775815963745 + }, + { + "auxiliary_loss_clip": 0.01543945, + "auxiliary_loss_mlp": 0.01132291, + "balance_loss_clip": 1.06906247, + "balance_loss_mlp": 1.18600404, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9360906695559799, + "language_loss": 0.94064176, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96740413, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.59375, + "step": 112, + "time_per_iteration": 2.5952305793762207 + }, + { + "auxiliary_loss_clip": 0.01547241, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.07342076, + "balance_loss_mlp": 1.18214464, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4119047199233594, + "language_loss": 0.79298341, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81983036, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.65625, + "step": 113, + "time_per_iteration": 2.524744987487793 + }, + { + "auxiliary_loss_clip": 0.01535171, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.06460583, + "balance_loss_mlp": 1.1784718, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.1108584765070924, + "language_loss": 0.93168736, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95834035, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 3.5625, + "step": 114, + "time_per_iteration": 2.6716785430908203 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.01138267, + "balance_loss_clip": 1.07828045, + "balance_loss_mlp": 1.17785645, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.266348661789013, + "language_loss": 0.94440514, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97120523, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.640625, + "step": 115, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01536673, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.052122, + "balance_loss_mlp": 1.1758287, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 12.665326776351556, + "language_loss": 0.81903678, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84558797, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.609375, + "step": 116, + "time_per_iteration": 2.577003240585327 + }, + { + "auxiliary_loss_clip": 0.01526673, + "auxiliary_loss_mlp": 0.01127935, + "balance_loss_clip": 1.06375241, + "balance_loss_mlp": 1.17504787, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.0071741256932794, + "language_loss": 0.88063896, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90718508, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.515625, + "step": 117, + "time_per_iteration": 2.611503839492798 + }, + { + "auxiliary_loss_clip": 0.01525448, + "auxiliary_loss_mlp": 0.01143736, + "balance_loss_clip": 1.07840896, + "balance_loss_mlp": 1.17308259, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5473368597875594, + "language_loss": 0.84470415, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87139601, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 3.53125, + "step": 118, + "time_per_iteration": 2.577461004257202 + }, + { + "auxiliary_loss_clip": 0.01536798, + "auxiliary_loss_mlp": 0.01163532, + "balance_loss_clip": 1.09930205, + "balance_loss_mlp": 1.1748507, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.057592918726277, + "language_loss": 0.99470234, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02170563, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.625, + "step": 119, + "time_per_iteration": 2.549661636352539 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.0111939, + "balance_loss_clip": 1.05701971, + "balance_loss_mlp": 1.16968298, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.410205702357196, + "language_loss": 0.89085704, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91742492, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.6875, + "step": 120, + "time_per_iteration": 2.583630084991455 + }, + { + "auxiliary_loss_clip": 0.01524337, + "auxiliary_loss_mlp": 0.01130091, + "balance_loss_clip": 1.06667209, + "balance_loss_mlp": 1.17169607, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.904240324338801, + "language_loss": 0.93491054, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145487, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.53125, + "step": 121, + "time_per_iteration": 2.6146788597106934 + }, + { + "auxiliary_loss_clip": 0.01523412, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_clip": 1.08382273, + "balance_loss_mlp": 1.17073464, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.352658173167552, + "language_loss": 0.90176952, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92846411, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.53125, + "step": 122, + "time_per_iteration": 2.566470146179199 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.07634664, + "balance_loss_mlp": 1.16606736, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.7249964127160764, + "language_loss": 0.92516506, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95179617, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.546875, + "step": 123, + "time_per_iteration": 2.6002941131591797 + }, + { + "auxiliary_loss_clip": 0.01517776, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.06433022, + "balance_loss_mlp": 1.1609534, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 7.583203404073904, + "language_loss": 0.71128142, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73771715, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.5625, + "step": 124, + "time_per_iteration": 2.79618763923645 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01124615, + "balance_loss_clip": 1.06157708, + "balance_loss_mlp": 1.16223335, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.4227692366027855, + "language_loss": 0.88482195, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9111228, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.4375, + "step": 125, + "time_per_iteration": 2.6131536960601807 + }, + { + "auxiliary_loss_clip": 0.0152071, + "auxiliary_loss_mlp": 0.01140137, + "balance_loss_clip": 1.07762396, + "balance_loss_mlp": 1.16211164, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.993097477973623, + "language_loss": 0.82384819, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.8504566, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.59375, + "step": 126, + "time_per_iteration": 2.595423936843872 + }, + { + "auxiliary_loss_clip": 0.01514354, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_clip": 1.077981, + "balance_loss_mlp": 1.16128385, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.7264016399601534, + "language_loss": 0.67276633, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69930243, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 3.53125, + "step": 127, + "time_per_iteration": 2.620950937271118 + }, + { + "auxiliary_loss_clip": 0.01504536, + "auxiliary_loss_mlp": 0.01128822, + "balance_loss_clip": 1.06640375, + "balance_loss_mlp": 1.16422939, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.6226937306152496, + "language_loss": 0.8815757, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90790927, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 128, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.01505804, + "auxiliary_loss_mlp": 0.01141266, + "balance_loss_clip": 1.07870471, + "balance_loss_mlp": 1.15920687, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.875185485357673, + "language_loss": 0.84581351, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87228423, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.46875, + "step": 129, + "time_per_iteration": 2.611762285232544 + }, + { + "auxiliary_loss_clip": 0.01499869, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.07122934, + "balance_loss_mlp": 1.1588279, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023668494136832, + "language_loss": 0.9742806, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061572, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 130, + "time_per_iteration": 2.599639415740967 + }, + { + "auxiliary_loss_clip": 0.01493155, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_clip": 1.07109392, + "balance_loss_mlp": 1.15518749, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1876581172480285, + "language_loss": 0.82624269, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 131, + "time_per_iteration": 2.6086065769195557 + }, + { + "auxiliary_loss_clip": 0.01502593, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.15800536, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.4868851395581677, + "language_loss": 0.82762384, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85392648, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 3.4375, + "step": 132, + "time_per_iteration": 2.673790454864502 + }, + { + "auxiliary_loss_clip": 0.01493849, + "auxiliary_loss_mlp": 0.01128197, + "balance_loss_clip": 1.06716144, + "balance_loss_mlp": 1.15264463, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.7432419346617443, + "language_loss": 0.95486552, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98108596, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.40625, + "step": 133, + "time_per_iteration": 2.6287872791290283 + }, + { + "auxiliary_loss_clip": 0.01490198, + "auxiliary_loss_mlp": 0.01125526, + "balance_loss_clip": 1.06725681, + "balance_loss_mlp": 1.16143155, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7764051426707919, + "language_loss": 0.73316634, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7593236, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.296875, + "step": 134, + "time_per_iteration": 2.6728081703186035 + }, + { + "auxiliary_loss_clip": 0.01486213, + "auxiliary_loss_mlp": 0.01130543, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.14955854, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.090234736760587, + "language_loss": 0.88808328, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91425079, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 135, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.06789732, + "balance_loss_mlp": 1.15456343, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.008171494368998, + "language_loss": 0.89123899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9174456, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.34375, + "step": 136, + "time_per_iteration": 2.555936813354492 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.01108223, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.14870429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 5.8712537379963345, + "language_loss": 0.8400104, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86595905, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.375, + "step": 137, + "time_per_iteration": 2.6225337982177734 + }, + { + "auxiliary_loss_clip": 0.01482624, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.05796409, + "balance_loss_mlp": 1.14842129, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.6861384534946333, + "language_loss": 0.90170664, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9276967, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.34375, + "step": 138, + "time_per_iteration": 2.653205156326294 + }, + { + "auxiliary_loss_clip": 0.01472312, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.0568912, + "balance_loss_mlp": 1.1478796, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.679342832062188, + "language_loss": 0.91253459, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93845713, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.234375, + "step": 139, + "time_per_iteration": 2.6182503700256348 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01123997, + "balance_loss_clip": 1.06229401, + "balance_loss_mlp": 1.154405, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.5553770836970675, + "language_loss": 0.85446793, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.34375, + "step": 140, + "time_per_iteration": 2.649454116821289 + }, + { + "auxiliary_loss_clip": 0.01476267, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.0594281, + "balance_loss_mlp": 1.14865911, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.379593217845822, + "language_loss": 0.84156519, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86751676, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.28125, + "step": 141, + "time_per_iteration": 2.608603000640869 + }, + { + "auxiliary_loss_clip": 0.01480312, + "auxiliary_loss_mlp": 0.01134333, + "balance_loss_clip": 1.07320273, + "balance_loss_mlp": 1.14624739, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3283494467369965, + "language_loss": 0.81387591, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84002233, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.34375, + "step": 142, + "time_per_iteration": 4.023308753967285 + }, + { + "auxiliary_loss_clip": 0.01378722, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00621629, + "balance_loss_mlp": 1.1918689, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0451783350372967, + "language_loss": 0.66831523, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69242978, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.8671875, + "step": 143, + "time_per_iteration": 4.718023777008057 + }, + { + "auxiliary_loss_clip": 0.01472184, + "auxiliary_loss_mlp": 0.0112263, + "balance_loss_clip": 1.06283474, + "balance_loss_mlp": 1.14625573, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2608538764922295, + "language_loss": 0.83954072, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86548889, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.25, + "step": 144, + "time_per_iteration": 2.5878453254699707 + }, + { + "auxiliary_loss_clip": 0.01457808, + "auxiliary_loss_mlp": 0.01111605, + "balance_loss_clip": 1.04890084, + "balance_loss_mlp": 1.13930941, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.241812154138119, + "language_loss": 0.88511693, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91081107, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.1875, + "step": 145, + "time_per_iteration": 2.586512565612793 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01124002, + "balance_loss_clip": 1.06153631, + "balance_loss_mlp": 1.14211285, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.792984011276012, + "language_loss": 0.85949898, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88549542, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.34375, + "step": 146, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01359324, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.00082254, + "balance_loss_mlp": 1.17825258, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8557738136673508, + "language_loss": 0.60047674, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62433958, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.8125, + "step": 147, + "time_per_iteration": 3.2522764205932617 + }, + { + "auxiliary_loss_clip": 0.01465546, + "auxiliary_loss_mlp": 0.01124118, + "balance_loss_clip": 1.06670642, + "balance_loss_mlp": 1.14550173, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8343461268862185, + "language_loss": 0.8454501, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87134671, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 3.203125, + "step": 148, + "time_per_iteration": 2.635499954223633 + }, + { + "auxiliary_loss_clip": 0.0147086, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_clip": 1.07914925, + "balance_loss_mlp": 1.14693797, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2581185064103404, + "language_loss": 0.88802874, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91416872, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.234375, + "step": 149, + "time_per_iteration": 2.5458836555480957 + }, + { + "auxiliary_loss_clip": 0.01466862, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.14131117, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.7760320197047097, + "language_loss": 0.93054724, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95633656, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 150, + "time_per_iteration": 2.648111343383789 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01109463, + "balance_loss_clip": 1.05391192, + "balance_loss_mlp": 1.13663483, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.9005080345968057, + "language_loss": 0.74303263, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76867104, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.171875, + "step": 151, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.014664, + "auxiliary_loss_mlp": 0.01125146, + "balance_loss_clip": 1.06735289, + "balance_loss_mlp": 1.14143276, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.6241423805649298, + "language_loss": 0.88251799, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90843344, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 152, + "time_per_iteration": 2.6034231185913086 + }, + { + "auxiliary_loss_clip": 0.01466383, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_clip": 1.0628314, + "balance_loss_mlp": 1.14757276, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.4062301864690196, + "language_loss": 0.83957756, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86545384, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 153, + "time_per_iteration": 2.6023271083831787 + }, + { + "auxiliary_loss_clip": 0.01456394, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.0765202, + "balance_loss_mlp": 1.13805962, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9441527650945287, + "language_loss": 0.89881843, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92474556, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.1875, + "step": 154, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01460439, + "auxiliary_loss_mlp": 0.01154617, + "balance_loss_clip": 1.09577537, + "balance_loss_mlp": 1.14094579, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0692323216259187, + "language_loss": 0.89471745, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92086804, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 155, + "time_per_iteration": 2.6336286067962646 + }, + { + "auxiliary_loss_clip": 0.01463585, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.05894589, + "balance_loss_mlp": 1.13895822, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 3.3077298720636255, + "language_loss": 0.86882627, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89462447, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.25, + "step": 156, + "time_per_iteration": 2.5539867877960205 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.01121969, + "balance_loss_clip": 1.06408143, + "balance_loss_mlp": 1.14298415, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4916444524903527, + "language_loss": 0.99553013, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02137065, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.1875, + "step": 157, + "time_per_iteration": 2.5249693393707275 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01139016, + "balance_loss_clip": 1.08146214, + "balance_loss_mlp": 1.1366899, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0302475566757225, + "language_loss": 0.8847568, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91060334, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.09375, + "step": 158, + "time_per_iteration": 2.6009252071380615 + }, + { + "auxiliary_loss_clip": 0.01452439, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_clip": 1.06555486, + "balance_loss_mlp": 1.13677907, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 4.310723443959545, + "language_loss": 0.86534697, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89111388, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.15625, + "step": 159, + "time_per_iteration": 2.6107394695281982 + }, + { + "auxiliary_loss_clip": 0.01442093, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.07340288, + "balance_loss_mlp": 1.13145089, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.790220267572532, + "language_loss": 0.86825597, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89400506, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.109375, + "step": 160, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01449537, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.06318271, + "balance_loss_mlp": 1.13704872, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.6107931748588893, + "language_loss": 0.91542315, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94109678, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.125, + "step": 161, + "time_per_iteration": 2.550865650177002 + }, + { + "auxiliary_loss_clip": 0.01454094, + "auxiliary_loss_mlp": 0.01109765, + "balance_loss_clip": 1.05488133, + "balance_loss_mlp": 1.13759339, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.2107920101940994, + "language_loss": 0.91690832, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94254684, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.15625, + "step": 162, + "time_per_iteration": 2.5527970790863037 + }, + { + "auxiliary_loss_clip": 0.01312712, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.00331306, + "balance_loss_mlp": 1.14560354, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2615279464106541, + "language_loss": 0.72354776, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74694741, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.671875, + "step": 163, + "time_per_iteration": 3.143763542175293 + }, + { + "auxiliary_loss_clip": 0.01440764, + "auxiliary_loss_mlp": 0.01113881, + "balance_loss_clip": 1.05804312, + "balance_loss_mlp": 1.13505006, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1923315312730374, + "language_loss": 0.8427155, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86826193, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0625, + "step": 164, + "time_per_iteration": 2.5536584854125977 + }, + { + "auxiliary_loss_clip": 0.01429878, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.04585135, + "balance_loss_mlp": 1.12637794, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.006756380443377, + "language_loss": 0.89215541, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91745919, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.03125, + "step": 165, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01434156, + "auxiliary_loss_mlp": 0.01127756, + "balance_loss_clip": 1.0692482, + "balance_loss_mlp": 1.12764359, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 6.432940691763592, + "language_loss": 0.80138129, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82700044, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.0625, + "step": 166, + "time_per_iteration": 2.6461095809936523 + }, + { + "auxiliary_loss_clip": 0.01438531, + "auxiliary_loss_mlp": 0.01125189, + "balance_loss_clip": 1.06749213, + "balance_loss_mlp": 1.13121533, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.334124726802297, + "language_loss": 0.9190954, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94473255, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.078125, + "step": 167, + "time_per_iteration": 2.655597448348999 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.07997894, + "balance_loss_mlp": 1.12960708, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.1870046541457873, + "language_loss": 0.90852308, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93417776, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 3.0, + "step": 168, + "time_per_iteration": 2.5387983322143555 + }, + { + "auxiliary_loss_clip": 0.01424973, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.05072391, + "balance_loss_mlp": 1.12456727, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 2.0495813916191077, + "language_loss": 0.87094414, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89626241, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 3.0, + "step": 169, + "time_per_iteration": 2.6448419094085693 + }, + { + "auxiliary_loss_clip": 0.01426284, + "auxiliary_loss_mlp": 0.01111393, + "balance_loss_clip": 1.05548358, + "balance_loss_mlp": 1.12704372, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 3.0203817486241973, + "language_loss": 0.84758192, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87295866, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 170, + "time_per_iteration": 2.5596489906311035 + }, + { + "auxiliary_loss_clip": 0.01435879, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.07963061, + "balance_loss_mlp": 1.12765205, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.124400250788896, + "language_loss": 0.89896494, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92468935, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.078125, + "step": 171, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01108406, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.1300813, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.015577645060998, + "language_loss": 0.88978243, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91516334, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.0, + "step": 172, + "time_per_iteration": 2.6193771362304688 + }, + { + "auxiliary_loss_clip": 0.01419105, + "auxiliary_loss_mlp": 0.01124801, + "balance_loss_clip": 1.06986928, + "balance_loss_mlp": 1.12354624, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6868779107262128, + "language_loss": 0.81148165, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83692074, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.953125, + "step": 173, + "time_per_iteration": 2.656935691833496 + }, + { + "auxiliary_loss_clip": 0.01430653, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.05496693, + "balance_loss_mlp": 1.12733519, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1134597687554244, + "language_loss": 0.82498932, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85036767, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 3.03125, + "step": 174, + "time_per_iteration": 2.6050753593444824 + }, + { + "auxiliary_loss_clip": 0.01425822, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_clip": 1.06984437, + "balance_loss_mlp": 1.12589645, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.6035215697191965, + "language_loss": 0.72699076, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75249052, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 3.0, + "step": 175, + "time_per_iteration": 2.6859946250915527 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.07571054, + "balance_loss_mlp": 1.12603855, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.402827576481816, + "language_loss": 0.98082507, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00642931, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 176, + "time_per_iteration": 2.5405664443969727 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01132631, + "balance_loss_clip": 1.08005941, + "balance_loss_mlp": 1.12270594, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3427037211777115, + "language_loss": 0.76749414, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79294884, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 177, + "time_per_iteration": 2.555553674697876 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.0507797, + "balance_loss_mlp": 1.12089574, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.4108248963401464, + "language_loss": 0.76824659, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79352522, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.015625, + "step": 178, + "time_per_iteration": 2.5799388885498047 + }, + { + "auxiliary_loss_clip": 0.01429506, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.05224717, + "balance_loss_mlp": 1.12586653, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1918052506036174, + "language_loss": 0.84004253, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86541891, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.03125, + "step": 179, + "time_per_iteration": 2.5387184619903564 + }, + { + "auxiliary_loss_clip": 0.01420983, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.05677247, + "balance_loss_mlp": 1.12062979, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.90488055395076, + "language_loss": 0.83719397, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86252916, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 180, + "time_per_iteration": 2.6149253845214844 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.06503046, + "balance_loss_mlp": 1.1226536, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.784573507260413, + "language_loss": 0.7774682, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80288756, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.0, + "step": 181, + "time_per_iteration": 2.5769712924957275 + }, + { + "auxiliary_loss_clip": 0.01417045, + "auxiliary_loss_mlp": 0.01131731, + "balance_loss_clip": 1.07732356, + "balance_loss_mlp": 1.11938787, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.1835165271024377, + "language_loss": 0.76440376, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78989148, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.96875, + "step": 182, + "time_per_iteration": 2.5641353130340576 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.01127012, + "balance_loss_clip": 1.07122183, + "balance_loss_mlp": 1.11758399, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.172025067133121, + "language_loss": 0.87377435, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89917147, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.953125, + "step": 183, + "time_per_iteration": 2.567457914352417 + }, + { + "auxiliary_loss_clip": 0.01415124, + "auxiliary_loss_mlp": 0.01114516, + "balance_loss_clip": 1.06397092, + "balance_loss_mlp": 1.1209594, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2669267607504255, + "language_loss": 0.86875558, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.953125, + "step": 184, + "time_per_iteration": 5.380701780319214 + }, + { + "auxiliary_loss_clip": 0.01411555, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.05308247, + "balance_loss_mlp": 1.12176847, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8448371257401488, + "language_loss": 0.83683228, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86202729, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.90625, + "step": 185, + "time_per_iteration": 2.5522208213806152 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.01109712, + "balance_loss_clip": 1.05253971, + "balance_loss_mlp": 1.11964798, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.4162416092451475, + "language_loss": 0.71111757, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73642373, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 186, + "time_per_iteration": 2.536498546600342 + }, + { + "auxiliary_loss_clip": 0.01416319, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.11923158, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 3.342492581434835, + "language_loss": 1.02028871, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04552388, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.96875, + "step": 187, + "time_per_iteration": 2.5189080238342285 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.05597997, + "balance_loss_mlp": 1.11834478, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.6787333311747052, + "language_loss": 0.75107503, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7762351, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.875, + "step": 188, + "time_per_iteration": 2.73420786857605 + }, + { + "auxiliary_loss_clip": 0.01292523, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01273942, + "balance_loss_mlp": 1.13387585, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7700467396195164, + "language_loss": 0.56216431, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5854305, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.5859375, + "step": 189, + "time_per_iteration": 3.176280975341797 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01121834, + "balance_loss_clip": 1.06742704, + "balance_loss_mlp": 1.1134795, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.292403028528975, + "language_loss": 0.94771594, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97296059, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.90625, + "step": 190, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01403317, + "auxiliary_loss_mlp": 0.01101291, + "balance_loss_clip": 1.04964972, + "balance_loss_mlp": 1.11493886, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.993049163405909, + "language_loss": 0.84462845, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8696745, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.875, + "step": 191, + "time_per_iteration": 2.569664716720581 + }, + { + "auxiliary_loss_clip": 0.01402316, + "auxiliary_loss_mlp": 0.01121031, + "balance_loss_clip": 1.0698905, + "balance_loss_mlp": 1.11087692, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0097697123850593, + "language_loss": 0.91439575, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93962914, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 192, + "time_per_iteration": 2.6416900157928467 + }, + { + "auxiliary_loss_clip": 0.0139743, + "auxiliary_loss_mlp": 0.01113461, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.11231375, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.095754720056515, + "language_loss": 0.86849445, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89360332, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.84375, + "step": 193, + "time_per_iteration": 2.569899797439575 + }, + { + "auxiliary_loss_clip": 0.01399232, + "auxiliary_loss_mlp": 0.01095137, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.10937476, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.446553756436178, + "language_loss": 0.92399615, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9489398, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 194, + "time_per_iteration": 2.6078743934631348 + }, + { + "auxiliary_loss_clip": 0.01405837, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.11522019, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.1413620570060052, + "language_loss": 0.89698559, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92208374, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 195, + "time_per_iteration": 2.5785820484161377 + }, + { + "auxiliary_loss_clip": 0.01400897, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.06345916, + "balance_loss_mlp": 1.11416054, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.0173579296668813, + "language_loss": 0.8577168, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88290232, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.875, + "step": 196, + "time_per_iteration": 2.5492773056030273 + }, + { + "auxiliary_loss_clip": 0.01397107, + "auxiliary_loss_mlp": 0.01106206, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.10991478, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.86264810097015, + "language_loss": 0.93367243, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95870566, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.875, + "step": 197, + "time_per_iteration": 2.5488431453704834 + }, + { + "auxiliary_loss_clip": 0.01394686, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.05781317, + "balance_loss_mlp": 1.1120131, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1872318454948045, + "language_loss": 0.79184073, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81688625, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.828125, + "step": 198, + "time_per_iteration": 2.6208834648132324 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06545901, + "balance_loss_mlp": 1.11265802, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 3.3720724842630663, + "language_loss": 0.88065112, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90571868, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.765625, + "step": 199, + "time_per_iteration": 2.5257043838500977 + }, + { + "auxiliary_loss_clip": 0.01403414, + "auxiliary_loss_mlp": 0.01121968, + "balance_loss_clip": 1.0658679, + "balance_loss_mlp": 1.11557496, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8432610551497841, + "language_loss": 0.81327617, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83853, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.875, + "step": 200, + "time_per_iteration": 2.593231201171875 + }, + { + "auxiliary_loss_clip": 0.01400536, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.11138511, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.758923223370522, + "language_loss": 0.87688923, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90190548, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.90625, + "step": 201, + "time_per_iteration": 2.5057122707366943 + }, + { + "auxiliary_loss_clip": 0.01401128, + "auxiliary_loss_mlp": 0.01110995, + "balance_loss_clip": 1.05751753, + "balance_loss_mlp": 1.1152513, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 3.7927516715708736, + "language_loss": 0.84123611, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86635733, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.859375, + "step": 202, + "time_per_iteration": 2.555680751800537 + }, + { + "auxiliary_loss_clip": 0.01388205, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.05639839, + "balance_loss_mlp": 1.10674798, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9040504717952067, + "language_loss": 0.90116632, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.926139, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.8125, + "step": 203, + "time_per_iteration": 2.526937484741211 + }, + { + "auxiliary_loss_clip": 0.01281494, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.03138971, + "balance_loss_mlp": 1.12054539, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0150955472927095, + "language_loss": 0.61259121, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63593745, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.609375, + "step": 204, + "time_per_iteration": 3.051469326019287 + }, + { + "auxiliary_loss_clip": 0.01398264, + "auxiliary_loss_mlp": 0.01111819, + "balance_loss_clip": 1.0593431, + "balance_loss_mlp": 1.11035323, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.269022633654934, + "language_loss": 0.91206741, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93716824, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.875, + "step": 205, + "time_per_iteration": 2.5105605125427246 + }, + { + "auxiliary_loss_clip": 0.01406073, + "auxiliary_loss_mlp": 0.01120568, + "balance_loss_clip": 1.06675649, + "balance_loss_mlp": 1.11524296, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.2813283317886497, + "language_loss": 0.89215505, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91742146, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.90625, + "step": 206, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05039215, + "balance_loss_mlp": 1.10848641, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.502758142715096, + "language_loss": 0.95368809, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97865611, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.859375, + "step": 207, + "time_per_iteration": 2.5147407054901123 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.06416512, + "balance_loss_mlp": 1.11335945, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.4565104125033232, + "language_loss": 0.75770479, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78280723, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.8125, + "step": 208, + "time_per_iteration": 2.5426721572875977 + }, + { + "auxiliary_loss_clip": 0.01382601, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.05497861, + "balance_loss_mlp": 1.10796773, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.79364384939249, + "language_loss": 0.98718858, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01208818, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 209, + "time_per_iteration": 2.607238292694092 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.05971253, + "balance_loss_mlp": 1.11020541, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 7.039976369418198, + "language_loss": 0.85444254, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87945753, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.78125, + "step": 210, + "time_per_iteration": 2.67632794380188 + }, + { + "auxiliary_loss_clip": 0.01385349, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.07042408, + "balance_loss_mlp": 1.1073029, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2334441604414783, + "language_loss": 0.97016168, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99521822, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.78125, + "step": 211, + "time_per_iteration": 2.5733633041381836 + }, + { + "auxiliary_loss_clip": 0.01394963, + "auxiliary_loss_mlp": 0.01114691, + "balance_loss_clip": 1.0616188, + "balance_loss_mlp": 1.11342549, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 3.6563211355425453, + "language_loss": 0.95188707, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97698367, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.8125, + "step": 212, + "time_per_iteration": 2.5224313735961914 + }, + { + "auxiliary_loss_clip": 0.01383511, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.10996664, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.0395830195466504, + "language_loss": 0.76049221, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78549099, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.734375, + "step": 213, + "time_per_iteration": 2.76625919342041 + }, + { + "auxiliary_loss_clip": 0.0138732, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.10833097, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 8.414558483522654, + "language_loss": 0.86754733, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89245206, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.78125, + "step": 214, + "time_per_iteration": 2.500417470932007 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01127788, + "balance_loss_clip": 1.07397687, + "balance_loss_mlp": 1.11549139, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3854037050744057, + "language_loss": 0.77357471, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79872084, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 215, + "time_per_iteration": 2.6116256713867188 + }, + { + "auxiliary_loss_clip": 0.01394912, + "auxiliary_loss_mlp": 0.01111048, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.11393261, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44498430810385, + "language_loss": 0.90545797, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93051755, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.8125, + "step": 216, + "time_per_iteration": 2.5903706550598145 + }, + { + "auxiliary_loss_clip": 0.0138678, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.05622888, + "balance_loss_mlp": 1.10772836, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.630220300857062, + "language_loss": 0.93660516, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96154928, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.78125, + "step": 217, + "time_per_iteration": 2.5109100341796875 + }, + { + "auxiliary_loss_clip": 0.01381618, + "auxiliary_loss_mlp": 0.01107152, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.10700643, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9165712032980975, + "language_loss": 0.93656206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96144974, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.75, + "step": 218, + "time_per_iteration": 2.6586077213287354 + }, + { + "auxiliary_loss_clip": 0.01376505, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.05820787, + "balance_loss_mlp": 1.10663593, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.916363531530835, + "language_loss": 0.86148179, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88633436, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.703125, + "step": 219, + "time_per_iteration": 2.584040880203247 + }, + { + "auxiliary_loss_clip": 0.01383955, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.05056047, + "balance_loss_mlp": 1.110309, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7731463199764816, + "language_loss": 0.87598741, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90083969, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.75, + "step": 220, + "time_per_iteration": 2.6294186115264893 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.10389161, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.991547522293572, + "language_loss": 0.86413074, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88890207, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.6875, + "step": 221, + "time_per_iteration": 2.606137990951538 + }, + { + "auxiliary_loss_clip": 0.0137878, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.10240269, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.017045003530743, + "language_loss": 0.92153138, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94641757, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.765625, + "step": 222, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.01377393, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.10672021, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.7127164790698606, + "language_loss": 0.95539695, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98022527, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.71875, + "step": 223, + "time_per_iteration": 2.679387092590332 + }, + { + "auxiliary_loss_clip": 0.01377947, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_clip": 1.05612004, + "balance_loss_mlp": 1.10671806, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5543531214735586, + "language_loss": 0.88022512, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90507382, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.71875, + "step": 224, + "time_per_iteration": 2.6327528953552246 + }, + { + "auxiliary_loss_clip": 0.0137715, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.10632586, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0812591886363183, + "language_loss": 0.89642018, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92121875, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 225, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01374075, + "auxiliary_loss_mlp": 0.01115854, + "balance_loss_clip": 1.06273401, + "balance_loss_mlp": 1.10547256, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.1555099546542142, + "language_loss": 0.99022663, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01512599, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.6875, + "step": 226, + "time_per_iteration": 5.38438868522644 + }, + { + "auxiliary_loss_clip": 0.0137773, + "auxiliary_loss_mlp": 0.01111487, + "balance_loss_clip": 1.0584867, + "balance_loss_mlp": 1.10696185, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 15.523681056640678, + "language_loss": 0.91210413, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93699628, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 227, + "time_per_iteration": 2.5391762256622314 + }, + { + "auxiliary_loss_clip": 0.01252818, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.00666487, + "balance_loss_mlp": 1.10911703, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.99230217192713, + "language_loss": 0.57680154, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59958327, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.4375, + "step": 228, + "time_per_iteration": 3.1981163024902344 + }, + { + "auxiliary_loss_clip": 0.0136686, + "auxiliary_loss_mlp": 0.01110654, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.10228515, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.2779006264878374, + "language_loss": 0.8759563, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90073144, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 229, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01377631, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.10486007, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.280679608747667, + "language_loss": 0.84247303, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8672685, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.734375, + "step": 230, + "time_per_iteration": 2.501218557357788 + }, + { + "auxiliary_loss_clip": 0.01375417, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.06671298, + "balance_loss_mlp": 1.10600948, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.78964280876859, + "language_loss": 0.90378422, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92870116, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.6875, + "step": 231, + "time_per_iteration": 2.541137456893921 + }, + { + "auxiliary_loss_clip": 0.01377441, + "auxiliary_loss_mlp": 0.01108629, + "balance_loss_clip": 1.05941916, + "balance_loss_mlp": 1.10821056, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.031489983297281, + "language_loss": 0.83706695, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86192763, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.6875, + "step": 232, + "time_per_iteration": 2.5444753170013428 + }, + { + "auxiliary_loss_clip": 0.0137977, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04125488, + "balance_loss_mlp": 1.10017753, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.928489064169697, + "language_loss": 0.74033689, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76505834, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.796875, + "step": 233, + "time_per_iteration": 2.5364952087402344 + }, + { + "auxiliary_loss_clip": 0.01382965, + "auxiliary_loss_mlp": 0.0112384, + "balance_loss_clip": 1.07141209, + "balance_loss_mlp": 1.10741055, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.287774019631123, + "language_loss": 0.85867143, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88373953, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 234, + "time_per_iteration": 2.532949209213257 + }, + { + "auxiliary_loss_clip": 0.01375298, + "auxiliary_loss_mlp": 0.01106064, + "balance_loss_clip": 1.05683041, + "balance_loss_mlp": 1.10759592, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6527993685177154, + "language_loss": 0.89144391, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9162575, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.671875, + "step": 235, + "time_per_iteration": 2.509592294692993 + }, + { + "auxiliary_loss_clip": 0.0137416, + "auxiliary_loss_mlp": 0.01119384, + "balance_loss_clip": 1.06874382, + "balance_loss_mlp": 1.10830367, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 4.054998173736759, + "language_loss": 0.85780042, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88273585, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.65625, + "step": 236, + "time_per_iteration": 2.744925022125244 + }, + { + "auxiliary_loss_clip": 0.0137118, + "auxiliary_loss_mlp": 0.01099258, + "balance_loss_clip": 1.04871392, + "balance_loss_mlp": 1.10178149, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.128422813257453, + "language_loss": 0.82452404, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84922838, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.6875, + "step": 237, + "time_per_iteration": 2.67307710647583 + }, + { + "auxiliary_loss_clip": 0.01369116, + "auxiliary_loss_mlp": 0.01116968, + "balance_loss_clip": 1.0643487, + "balance_loss_mlp": 1.10451889, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 3.103781307849977, + "language_loss": 0.77321362, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79807448, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.65625, + "step": 238, + "time_per_iteration": 2.4973809719085693 + }, + { + "auxiliary_loss_clip": 0.01368178, + "auxiliary_loss_mlp": 0.01112367, + "balance_loss_clip": 1.06566119, + "balance_loss_mlp": 1.10654771, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.992064896075991, + "language_loss": 0.87370872, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89851415, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.609375, + "step": 239, + "time_per_iteration": 2.554222583770752 + }, + { + "auxiliary_loss_clip": 0.01352979, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.09776592, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.2433371609956283, + "language_loss": 0.93297911, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95751429, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.5625, + "step": 240, + "time_per_iteration": 2.588529348373413 + }, + { + "auxiliary_loss_clip": 0.01362634, + "auxiliary_loss_mlp": 0.01104045, + "balance_loss_clip": 1.05736244, + "balance_loss_mlp": 1.10324717, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.299780828803648, + "language_loss": 0.85129881, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8759656, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.59375, + "step": 241, + "time_per_iteration": 2.607272148132324 + }, + { + "auxiliary_loss_clip": 0.01360778, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.10865557, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.927287768398498, + "language_loss": 0.88410223, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90887022, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.53125, + "step": 242, + "time_per_iteration": 2.522657632827759 + }, + { + "auxiliary_loss_clip": 0.013595, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.04981756, + "balance_loss_mlp": 1.10147619, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.6384412969740922, + "language_loss": 0.86817086, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89276373, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.578125, + "step": 243, + "time_per_iteration": 2.5738751888275146 + }, + { + "auxiliary_loss_clip": 0.01366378, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.05574584, + "balance_loss_mlp": 1.10421979, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.576084931358892, + "language_loss": 0.84271425, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86743093, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 244, + "time_per_iteration": 2.51370906829834 + }, + { + "auxiliary_loss_clip": 0.01374385, + "auxiliary_loss_mlp": 0.01115077, + "balance_loss_clip": 1.06403196, + "balance_loss_mlp": 1.10701251, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.2775099056278916, + "language_loss": 0.78689361, + "learning_rate": 3.54199711087864e-06, + "loss": 0.8117882, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.671875, + "step": 245, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01372772, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.10232484, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2330220282190685, + "language_loss": 0.84241545, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86717069, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 246, + "time_per_iteration": 2.565614700317383 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01097455, + "balance_loss_clip": 1.04722059, + "balance_loss_mlp": 1.10181057, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9335653980079095, + "language_loss": 0.9014703, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92611909, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 247, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01365881, + "auxiliary_loss_mlp": 0.01097755, + "balance_loss_clip": 1.04952252, + "balance_loss_mlp": 1.09689593, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.1205098484246734, + "language_loss": 0.78058362, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80521989, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.6875, + "step": 248, + "time_per_iteration": 2.5365517139434814 + }, + { + "auxiliary_loss_clip": 0.0136687, + "auxiliary_loss_mlp": 0.01105288, + "balance_loss_clip": 1.05552983, + "balance_loss_mlp": 1.10545397, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1747011613954177, + "language_loss": 0.83849227, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86321384, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.609375, + "step": 249, + "time_per_iteration": 2.6142020225524902 + }, + { + "auxiliary_loss_clip": 0.01360073, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.05806887, + "balance_loss_mlp": 1.09971058, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.2137591284686455, + "language_loss": 0.93476778, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95942914, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 2.609375, + "step": 250, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01369254, + "auxiliary_loss_mlp": 0.01114661, + "balance_loss_clip": 1.06351972, + "balance_loss_mlp": 1.10460913, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.2612141068319622, + "language_loss": 0.97030997, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99514914, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.640625, + "step": 251, + "time_per_iteration": 2.5887296199798584 + }, + { + "auxiliary_loss_clip": 0.01362288, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.05723596, + "balance_loss_mlp": 1.09872079, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.0465178965121136, + "language_loss": 0.8428089, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86748511, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.640625, + "step": 252, + "time_per_iteration": 2.5749199390411377 + }, + { + "auxiliary_loss_clip": 0.01357969, + "auxiliary_loss_mlp": 0.01114738, + "balance_loss_clip": 1.06569552, + "balance_loss_mlp": 1.10169089, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.482990993198259, + "language_loss": 0.98208833, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00681543, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.5625, + "step": 253, + "time_per_iteration": 2.5639333724975586 + }, + { + "auxiliary_loss_clip": 0.01233728, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.00855541, + "balance_loss_mlp": 1.09965372, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8505459641429172, + "language_loss": 0.55672622, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57933319, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.34375, + "step": 254, + "time_per_iteration": 3.1063449382781982 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.06687438, + "balance_loss_mlp": 1.09652638, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.4360968938917065, + "language_loss": 0.90453845, + "learning_rate": 3.567754632921479e-06, + "loss": 0.9293263, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 255, + "time_per_iteration": 2.5746912956237793 + }, + { + "auxiliary_loss_clip": 0.01358909, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.07568169, + "balance_loss_mlp": 1.09931397, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.2666703391376903, + "language_loss": 0.8562001, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8810457, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.59375, + "step": 256, + "time_per_iteration": 2.6095149517059326 + }, + { + "auxiliary_loss_clip": 0.01366413, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_clip": 1.06305718, + "balance_loss_mlp": 1.09961021, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.7442871984488386, + "language_loss": 0.71504897, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73983842, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 257, + "time_per_iteration": 2.5939691066741943 + }, + { + "auxiliary_loss_clip": 0.01357007, + "auxiliary_loss_mlp": 0.01100177, + "balance_loss_clip": 1.05087197, + "balance_loss_mlp": 1.09875202, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9522192109187282, + "language_loss": 0.94659579, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97116768, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.578125, + "step": 258, + "time_per_iteration": 2.7119739055633545 + }, + { + "auxiliary_loss_clip": 0.01356701, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.05615926, + "balance_loss_mlp": 1.09608126, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.167214789879638, + "language_loss": 0.93174207, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95635182, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.59375, + "step": 259, + "time_per_iteration": 2.6776607036590576 + }, + { + "auxiliary_loss_clip": 0.01351639, + "auxiliary_loss_mlp": 0.010988, + "balance_loss_clip": 1.05297637, + "balance_loss_mlp": 1.10035825, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.1226725879970605, + "language_loss": 0.97360909, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99811351, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 2.515625, + "step": 260, + "time_per_iteration": 2.520759105682373 + }, + { + "auxiliary_loss_clip": 0.01365989, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.06282747, + "balance_loss_mlp": 1.10060608, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3569711169381, + "language_loss": 0.87644511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90120584, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.65625, + "step": 261, + "time_per_iteration": 2.5837602615356445 + }, + { + "auxiliary_loss_clip": 0.0135711, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.05613816, + "balance_loss_mlp": 1.09709311, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.9926513495738176, + "language_loss": 0.67226446, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69688779, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.59375, + "step": 262, + "time_per_iteration": 2.5490784645080566 + }, + { + "auxiliary_loss_clip": 0.01354995, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.07145, + "balance_loss_mlp": 1.0984714, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.3019763169045637, + "language_loss": 0.68570435, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71047044, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.5625, + "step": 263, + "time_per_iteration": 2.5207104682922363 + }, + { + "auxiliary_loss_clip": 0.01355963, + "auxiliary_loss_mlp": 0.01105396, + "balance_loss_clip": 1.055686, + "balance_loss_mlp": 1.09446979, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.705792502973735, + "language_loss": 0.85120308, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87581658, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 264, + "time_per_iteration": 2.559406280517578 + }, + { + "auxiliary_loss_clip": 0.01361439, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.10003614, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.651007312001026, + "language_loss": 1.04371059, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06825411, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.625, + "step": 265, + "time_per_iteration": 2.5076427459716797 + }, + { + "auxiliary_loss_clip": 0.01364923, + "auxiliary_loss_mlp": 0.01114141, + "balance_loss_clip": 1.06266677, + "balance_loss_mlp": 1.10278761, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2797174203272705, + "language_loss": 0.75153112, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77632177, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.625, + "step": 266, + "time_per_iteration": 2.52923583984375 + }, + { + "auxiliary_loss_clip": 0.01351984, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.05321336, + "balance_loss_mlp": 1.10004377, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7047265515665009, + "language_loss": 0.90568709, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93022615, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 2.515625, + "step": 267, + "time_per_iteration": 4.033226251602173 + }, + { + "auxiliary_loss_clip": 0.01359316, + "auxiliary_loss_mlp": 0.01118854, + "balance_loss_clip": 1.07143235, + "balance_loss_mlp": 1.09878063, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.258126572730018, + "language_loss": 0.86044276, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88522446, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.609375, + "step": 268, + "time_per_iteration": 3.9120936393737793 + }, + { + "auxiliary_loss_clip": 0.01352601, + "auxiliary_loss_mlp": 0.01098281, + "balance_loss_clip": 1.05186045, + "balance_loss_mlp": 1.10092831, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.4793793476816335, + "language_loss": 0.88284534, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90735412, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 2.515625, + "step": 269, + "time_per_iteration": 2.5170347690582275 + }, + { + "auxiliary_loss_clip": 0.01357286, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.04901874, + "balance_loss_mlp": 1.09723783, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.082153756456244, + "language_loss": 0.97073388, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99530637, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.59375, + "step": 270, + "time_per_iteration": 2.4856350421905518 + }, + { + "auxiliary_loss_clip": 0.01357366, + "auxiliary_loss_mlp": 0.01117767, + "balance_loss_clip": 1.07001138, + "balance_loss_mlp": 1.10259032, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1071719511680755, + "language_loss": 0.85919821, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88394946, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.546875, + "step": 271, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01355041, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.05201519, + "balance_loss_mlp": 1.09418058, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.6330072162998523, + "language_loss": 0.81509304, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83964115, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.609375, + "step": 272, + "time_per_iteration": 2.563840389251709 + }, + { + "auxiliary_loss_clip": 0.01348825, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.07229137, + "balance_loss_mlp": 1.09649634, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4112371858801436, + "language_loss": 0.81101978, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83568847, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.515625, + "step": 273, + "time_per_iteration": 2.504791736602783 + }, + { + "auxiliary_loss_clip": 0.01348205, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.05825627, + "balance_loss_mlp": 1.0930239, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.3125197915452387, + "language_loss": 0.91599321, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94053519, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.5625, + "step": 274, + "time_per_iteration": 2.530883312225342 + }, + { + "auxiliary_loss_clip": 0.01346841, + "auxiliary_loss_mlp": 0.01110058, + "balance_loss_clip": 1.06154013, + "balance_loss_mlp": 1.09588742, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8828740595481548, + "language_loss": 0.87952697, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90409595, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 2.515625, + "step": 275, + "time_per_iteration": 2.6067841053009033 + }, + { + "auxiliary_loss_clip": 0.01349399, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.05481219, + "balance_loss_mlp": 1.09579742, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.8814357547622875, + "language_loss": 0.80717576, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83170903, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.53125, + "step": 276, + "time_per_iteration": 2.5251641273498535 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.0561676, + "balance_loss_mlp": 1.0946306, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.7238418569970533, + "language_loss": 0.81033546, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83474076, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.46875, + "step": 277, + "time_per_iteration": 2.6796398162841797 + }, + { + "auxiliary_loss_clip": 0.01338755, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.08789539, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.810922211495867, + "language_loss": 0.80307728, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82741719, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.515625, + "step": 278, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01343866, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.05634809, + "balance_loss_mlp": 1.09381282, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.7778988036026468, + "language_loss": 0.90482658, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92928004, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 279, + "time_per_iteration": 2.571439504623413 + }, + { + "auxiliary_loss_clip": 0.01348727, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.06872559, + "balance_loss_mlp": 1.09391451, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.0477743200742387, + "language_loss": 0.94153798, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96618605, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.546875, + "step": 280, + "time_per_iteration": 2.5161728858947754 + }, + { + "auxiliary_loss_clip": 0.0134865, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.05864, + "balance_loss_mlp": 1.09245062, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 3.578687135351882, + "language_loss": 0.73929775, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76385343, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 2.5625, + "step": 281, + "time_per_iteration": 2.616241931915283 + }, + { + "auxiliary_loss_clip": 0.01343434, + "auxiliary_loss_mlp": 0.0111488, + "balance_loss_clip": 1.06977129, + "balance_loss_mlp": 1.09390783, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.679798242609796, + "language_loss": 0.80207133, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82665443, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.5, + "step": 282, + "time_per_iteration": 2.5421135425567627 + }, + { + "auxiliary_loss_clip": 0.01348806, + "auxiliary_loss_mlp": 0.01117348, + "balance_loss_clip": 1.0704273, + "balance_loss_mlp": 1.09599137, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1184562475367916, + "language_loss": 0.77788174, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80254328, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.53125, + "step": 283, + "time_per_iteration": 2.516474485397339 + }, + { + "auxiliary_loss_clip": 0.01349252, + "auxiliary_loss_mlp": 0.01091995, + "balance_loss_clip": 1.04788804, + "balance_loss_mlp": 1.09700751, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1009174504018544, + "language_loss": 0.84172702, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86613953, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.515625, + "step": 284, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01339164, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_clip": 1.05216956, + "balance_loss_mlp": 1.09148788, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.014395623363928, + "language_loss": 0.96993905, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99432468, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.46875, + "step": 285, + "time_per_iteration": 2.5412731170654297 + }, + { + "auxiliary_loss_clip": 0.01342544, + "auxiliary_loss_mlp": 0.01093983, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.09407294, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.2067050643741433, + "language_loss": 0.93951917, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96388453, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.484375, + "step": 286, + "time_per_iteration": 2.5895566940307617 + }, + { + "auxiliary_loss_clip": 0.0133546, + "auxiliary_loss_mlp": 0.01090331, + "balance_loss_clip": 1.04503167, + "balance_loss_mlp": 1.08924019, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.8729510510678706, + "language_loss": 0.92157722, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94583511, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 287, + "time_per_iteration": 2.6144802570343018 + }, + { + "auxiliary_loss_clip": 0.01338793, + "auxiliary_loss_mlp": 0.01089685, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.08859432, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2271144452092564, + "language_loss": 1.02026963, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04455447, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 288, + "time_per_iteration": 2.488274097442627 + }, + { + "auxiliary_loss_clip": 0.01222501, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02000237, + "balance_loss_mlp": 1.09325862, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9131614435254132, + "language_loss": 0.63915455, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66174459, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 1.296875, + "step": 289, + "time_per_iteration": 3.222426652908325 + }, + { + "auxiliary_loss_clip": 0.01341104, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.06379664, + "balance_loss_mlp": 1.09403992, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4014361624695173, + "language_loss": 0.88569438, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91018069, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 290, + "time_per_iteration": 2.49294114112854 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01091523, + "balance_loss_clip": 1.04631877, + "balance_loss_mlp": 1.09248078, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.156562479490788, + "language_loss": 0.84578067, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87007844, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.453125, + "step": 291, + "time_per_iteration": 2.5356485843658447 + }, + { + "auxiliary_loss_clip": 0.01345108, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.05897939, + "balance_loss_mlp": 1.10042334, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6617628708439536, + "language_loss": 0.72766221, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75218308, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.453125, + "step": 292, + "time_per_iteration": 2.6524176597595215 + }, + { + "auxiliary_loss_clip": 0.01333825, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.09236324, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2014441192179866, + "language_loss": 0.8726995, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89705306, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.40625, + "step": 293, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01334314, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.05959213, + "balance_loss_mlp": 1.09177744, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.3120260424061367, + "language_loss": 0.81276119, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83714324, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.4375, + "step": 294, + "time_per_iteration": 2.568784236907959 + }, + { + "auxiliary_loss_clip": 0.01334452, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.06274807, + "balance_loss_mlp": 1.08824301, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.9227055740425705, + "language_loss": 0.83710909, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86153215, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.46875, + "step": 295, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01339817, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.09874845, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.5339269047951727, + "language_loss": 0.84620988, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87071538, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.40625, + "step": 296, + "time_per_iteration": 2.5243051052093506 + }, + { + "auxiliary_loss_clip": 0.01338756, + "auxiliary_loss_mlp": 0.01097832, + "balance_loss_clip": 1.05417752, + "balance_loss_mlp": 1.09317493, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.123858619871597, + "language_loss": 0.87729871, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.453125, + "step": 297, + "time_per_iteration": 2.5186710357666016 + }, + { + "auxiliary_loss_clip": 0.01337139, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.05713463, + "balance_loss_mlp": 1.09108877, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.170328911832355, + "language_loss": 0.88528925, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90966904, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 298, + "time_per_iteration": 2.5320143699645996 + }, + { + "auxiliary_loss_clip": 0.0133273, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.07234538, + "balance_loss_mlp": 1.09249902, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8938405886263965, + "language_loss": 0.88666737, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91117901, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.40625, + "step": 299, + "time_per_iteration": 2.588275671005249 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01105829, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.09275746, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.2936483356677253, + "language_loss": 0.64349103, + "learning_rate": 3.672392800539357e-06, + "loss": 0.66795039, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 300, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01338706, + "auxiliary_loss_mlp": 0.01105447, + "balance_loss_clip": 1.05986142, + "balance_loss_mlp": 1.09540462, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.310898752337597, + "language_loss": 0.88330823, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774977, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.4375, + "step": 301, + "time_per_iteration": 2.499481439590454 + }, + { + "auxiliary_loss_clip": 0.01214573, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.00932336, + "balance_loss_mlp": 1.08753991, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8370211186232274, + "language_loss": 0.62198341, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64437497, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 1.265625, + "step": 302, + "time_per_iteration": 3.259997844696045 + }, + { + "auxiliary_loss_clip": 0.01329895, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.06198907, + "balance_loss_mlp": 1.08938098, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.491293816938874, + "language_loss": 0.89617372, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92054749, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 2.40625, + "step": 303, + "time_per_iteration": 2.536773920059204 + }, + { + "auxiliary_loss_clip": 0.01336859, + "auxiliary_loss_mlp": 0.01114111, + "balance_loss_clip": 1.06778669, + "balance_loss_mlp": 1.09363747, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 4.887297609803561, + "language_loss": 0.80314684, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.4375, + "step": 304, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.05144823, + "balance_loss_mlp": 1.09657788, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.8235558005033383, + "language_loss": 0.82894015, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85320443, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.34375, + "step": 305, + "time_per_iteration": 2.5195910930633545 + }, + { + "auxiliary_loss_clip": 0.01332168, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_clip": 1.04993677, + "balance_loss_mlp": 1.08868921, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9087210074301977, + "language_loss": 0.90843809, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93269092, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 306, + "time_per_iteration": 2.501276969909668 + }, + { + "auxiliary_loss_clip": 0.01324982, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.08638549, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1762826783898586, + "language_loss": 0.86435306, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88850832, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.390625, + "step": 307, + "time_per_iteration": 2.6048038005828857 + }, + { + "auxiliary_loss_clip": 0.01325097, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.05817199, + "balance_loss_mlp": 1.09046888, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.221444292833677, + "language_loss": 0.71723771, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74155033, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.34375, + "step": 308, + "time_per_iteration": 2.513774871826172 + }, + { + "auxiliary_loss_clip": 0.01331987, + "auxiliary_loss_mlp": 0.01102938, + "balance_loss_clip": 1.05904555, + "balance_loss_mlp": 1.08861351, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2254161740825293, + "language_loss": 0.91952753, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94387674, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 2.4375, + "step": 309, + "time_per_iteration": 5.224750280380249 + }, + { + "auxiliary_loss_clip": 0.01338325, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.08840334, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.8056803187702135, + "language_loss": 0.72399509, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74842793, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 310, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01330325, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_clip": 1.06790328, + "balance_loss_mlp": 1.09306264, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 12.392698164772181, + "language_loss": 0.74104297, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76546776, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.375, + "step": 311, + "time_per_iteration": 2.734072208404541 + }, + { + "auxiliary_loss_clip": 0.01337963, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.06619668, + "balance_loss_mlp": 1.09045064, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.2753160661232603, + "language_loss": 0.91518372, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93965685, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.46875, + "step": 312, + "time_per_iteration": 2.5117411613464355 + }, + { + "auxiliary_loss_clip": 0.01336169, + "auxiliary_loss_mlp": 0.01112089, + "balance_loss_clip": 1.06609774, + "balance_loss_mlp": 1.09088099, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.320247917383294, + "language_loss": 0.89746982, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92195237, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.453125, + "step": 313, + "time_per_iteration": 2.4761838912963867 + }, + { + "auxiliary_loss_clip": 0.01340305, + "auxiliary_loss_mlp": 0.01098393, + "balance_loss_clip": 1.05230689, + "balance_loss_mlp": 1.09061432, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3404867001555236, + "language_loss": 0.73099983, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75538683, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 314, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.01326469, + "auxiliary_loss_mlp": 0.01103837, + "balance_loss_clip": 1.06101751, + "balance_loss_mlp": 1.08694446, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.192553769026804, + "language_loss": 0.89887041, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92317349, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 315, + "time_per_iteration": 2.5857741832733154 + }, + { + "auxiliary_loss_clip": 0.01329672, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.05170512, + "balance_loss_mlp": 1.08870411, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8364758613144732, + "language_loss": 0.80796063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83221763, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.40625, + "step": 316, + "time_per_iteration": 2.5222342014312744 + }, + { + "auxiliary_loss_clip": 0.01324399, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.05131364, + "balance_loss_mlp": 1.08633423, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.1363686538021236, + "language_loss": 0.90357143, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92776608, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.375, + "step": 317, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01319895, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.0515281, + "balance_loss_mlp": 1.0845592, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.5260192321083794, + "language_loss": 0.90939772, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93355227, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.34375, + "step": 318, + "time_per_iteration": 2.488128185272217 + }, + { + "auxiliary_loss_clip": 0.01324457, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.04706657, + "balance_loss_mlp": 1.08574772, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.626221841877022, + "language_loss": 0.93980259, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96393579, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 319, + "time_per_iteration": 2.5184502601623535 + }, + { + "auxiliary_loss_clip": 0.01205117, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_clip": 1.06586683, + "balance_loss_mlp": 1.07482553, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9345393611259016, + "language_loss": 0.59860981, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62146461, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 1.296875, + "step": 320, + "time_per_iteration": 3.0250258445739746 + }, + { + "auxiliary_loss_clip": 0.01320993, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.05827808, + "balance_loss_mlp": 1.08425927, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.0799113353921572, + "language_loss": 0.89622325, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92044175, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.375, + "step": 321, + "time_per_iteration": 2.476439952850342 + }, + { + "auxiliary_loss_clip": 0.01332068, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.07620978, + "balance_loss_mlp": 1.08993089, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.068543890023447, + "language_loss": 0.82884163, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85337007, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 2.421875, + "step": 322, + "time_per_iteration": 2.556302309036255 + }, + { + "auxiliary_loss_clip": 0.01332156, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.04828596, + "balance_loss_mlp": 1.08754158, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.2506232399398245, + "language_loss": 0.72734368, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75156873, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.453125, + "step": 323, + "time_per_iteration": 2.5033397674560547 + }, + { + "auxiliary_loss_clip": 0.01318896, + "auxiliary_loss_mlp": 0.01090622, + "balance_loss_clip": 1.04763484, + "balance_loss_mlp": 1.08184087, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.023515622890843, + "language_loss": 0.92639947, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95049465, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.375, + "step": 324, + "time_per_iteration": 2.5194544792175293 + }, + { + "auxiliary_loss_clip": 0.01328869, + "auxiliary_loss_mlp": 0.01089838, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.08943164, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 4.018466874717804, + "language_loss": 0.65336061, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67754775, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.390625, + "step": 325, + "time_per_iteration": 2.5107386112213135 + }, + { + "auxiliary_loss_clip": 0.0132709, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.093485, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.921455060851243, + "language_loss": 0.76449442, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78877723, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.34375, + "step": 326, + "time_per_iteration": 2.5080325603485107 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.06015599, + "balance_loss_mlp": 1.08845115, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1551163890972123, + "language_loss": 0.79176939, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8160091, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 327, + "time_per_iteration": 2.5449633598327637 + }, + { + "auxiliary_loss_clip": 0.01326802, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.06984949, + "balance_loss_mlp": 1.08873606, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.1574079642063246, + "language_loss": 0.80725288, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83164048, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.390625, + "step": 328, + "time_per_iteration": 2.5418970584869385 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.08396721, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.245263087715646, + "language_loss": 0.93704766, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96127105, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.40625, + "step": 329, + "time_per_iteration": 2.4910004138946533 + }, + { + "auxiliary_loss_clip": 0.01332781, + "auxiliary_loss_mlp": 0.01105781, + "balance_loss_clip": 1.06253231, + "balance_loss_mlp": 1.08930123, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.9776357674257365, + "language_loss": 0.74277973, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7671653, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 330, + "time_per_iteration": 2.51430082321167 + }, + { + "auxiliary_loss_clip": 0.01328701, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.07814097, + "balance_loss_mlp": 1.08762872, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.972763157156593, + "language_loss": 0.93870068, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96319681, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 331, + "time_per_iteration": 2.4759159088134766 + }, + { + "auxiliary_loss_clip": 0.01316192, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.04938233, + "balance_loss_mlp": 1.0853951, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.6958694906457836, + "language_loss": 0.92730892, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95136791, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 332, + "time_per_iteration": 2.49817156791687 + }, + { + "auxiliary_loss_clip": 0.01325132, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.04903162, + "balance_loss_mlp": 1.09081161, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.6289067025313777, + "language_loss": 0.75589794, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78007442, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.34375, + "step": 333, + "time_per_iteration": 2.5180609226226807 + }, + { + "auxiliary_loss_clip": 0.01323371, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.08625877, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1766901409232426, + "language_loss": 0.78768885, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81179881, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.375, + "step": 334, + "time_per_iteration": 2.614708423614502 + }, + { + "auxiliary_loss_clip": 0.01324397, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.08276975, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.4059127888346916, + "language_loss": 0.83083838, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85503072, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 2.421875, + "step": 335, + "time_per_iteration": 2.495260000228882 + }, + { + "auxiliary_loss_clip": 0.01320649, + "auxiliary_loss_mlp": 0.01090782, + "balance_loss_clip": 1.04934454, + "balance_loss_mlp": 1.08585882, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.166489879958422, + "language_loss": 0.92639577, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95051014, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.34375, + "step": 336, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01321744, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.04139614, + "balance_loss_mlp": 1.08352447, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.825762702383362, + "language_loss": 0.88474333, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90879244, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 337, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01310297, + "auxiliary_loss_mlp": 0.01101804, + "balance_loss_clip": 1.05836427, + "balance_loss_mlp": 1.08001363, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5415234153999902, + "language_loss": 0.89914495, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92326593, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 338, + "time_per_iteration": 2.5795979499816895 + }, + { + "auxiliary_loss_clip": 0.01324391, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.05742574, + "balance_loss_mlp": 1.08479571, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.047046576054304, + "language_loss": 0.84801471, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87225461, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.40625, + "step": 339, + "time_per_iteration": 2.4558403491973877 + }, + { + "auxiliary_loss_clip": 0.01326609, + "auxiliary_loss_mlp": 0.01093427, + "balance_loss_clip": 1.05001152, + "balance_loss_mlp": 1.08709431, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7544231793273473, + "language_loss": 0.88913274, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91333312, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.40625, + "step": 340, + "time_per_iteration": 2.5330188274383545 + }, + { + "auxiliary_loss_clip": 0.01323557, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.0859195, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.2340783182785975, + "language_loss": 0.88071406, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 2.375, + "step": 341, + "time_per_iteration": 2.502161979675293 + }, + { + "auxiliary_loss_clip": 0.01325847, + "auxiliary_loss_mlp": 0.01099304, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.08389783, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 3.2005009235922572, + "language_loss": 0.80293322, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82718468, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.421875, + "step": 342, + "time_per_iteration": 2.5315535068511963 + }, + { + "auxiliary_loss_clip": 0.0131301, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.05293417, + "balance_loss_mlp": 1.08132875, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.399130254204822, + "language_loss": 0.89451253, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91862881, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.3125, + "step": 343, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01325104, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_clip": 1.05342627, + "balance_loss_mlp": 1.08973229, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3234219523507296, + "language_loss": 0.78252918, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80672336, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.359375, + "step": 344, + "time_per_iteration": 2.514665365219116 + }, + { + "auxiliary_loss_clip": 0.01309596, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.05730188, + "balance_loss_mlp": 1.08079529, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8371023099908983, + "language_loss": 0.75138956, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77549529, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.28125, + "step": 345, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01318525, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.05806339, + "balance_loss_mlp": 1.08789146, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.0741733748571565, + "language_loss": 0.90269232, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92688763, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.3125, + "step": 346, + "time_per_iteration": 2.5487060546875 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01095048, + "balance_loss_clip": 1.05527973, + "balance_loss_mlp": 1.08358788, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0766581400667, + "language_loss": 0.78869188, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.3125, + "step": 347, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.01317315, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.05335259, + "balance_loss_mlp": 1.08719826, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.4234628631287927, + "language_loss": 0.71424043, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7383827, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.3125, + "step": 348, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01319638, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.0595324, + "balance_loss_mlp": 1.08435416, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 4.002924557181807, + "language_loss": 0.76819432, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79240972, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.34375, + "step": 349, + "time_per_iteration": 2.4884049892425537 + }, + { + "auxiliary_loss_clip": 0.0130292, + "auxiliary_loss_mlp": 0.0109884, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.08141851, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.9115672624672835, + "language_loss": 0.85271406, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87673163, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 350, + "time_per_iteration": 2.559812307357788 + }, + { + "auxiliary_loss_clip": 0.01311162, + "auxiliary_loss_mlp": 0.01089483, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.08571863, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.3355222976898764, + "language_loss": 0.80104828, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82505476, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.25, + "step": 351, + "time_per_iteration": 5.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01318524, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.06048024, + "balance_loss_mlp": 1.08623564, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8473493260702125, + "language_loss": 0.87258279, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89680254, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 352, + "time_per_iteration": 2.4787278175354004 + }, + { + "auxiliary_loss_clip": 0.01312545, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.06248152, + "balance_loss_mlp": 1.08574009, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8920106465676412, + "language_loss": 0.82386625, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84804279, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.265625, + "step": 353, + "time_per_iteration": 2.5428433418273926 + }, + { + "auxiliary_loss_clip": 0.01307832, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.05133069, + "balance_loss_mlp": 1.08353949, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.0636001035279694, + "language_loss": 0.8102631, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83425963, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.25, + "step": 354, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01315043, + "auxiliary_loss_mlp": 0.01092413, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.08190715, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.8065821662627575, + "language_loss": 0.80764574, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83172029, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 355, + "time_per_iteration": 2.56968355178833 + }, + { + "auxiliary_loss_clip": 0.01310125, + "auxiliary_loss_mlp": 0.01086869, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.08140039, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2488803729957, + "language_loss": 0.89553398, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91950381, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 356, + "time_per_iteration": 2.5510213375091553 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.08451605, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.7055681522526522, + "language_loss": 0.80032516, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82424533, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.234375, + "step": 357, + "time_per_iteration": 2.5834848880767822 + }, + { + "auxiliary_loss_clip": 0.01311386, + "auxiliary_loss_mlp": 0.0108216, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.08195996, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3810225918991827, + "language_loss": 0.7661376, + "learning_rate": 3.786194003461506e-06, + "loss": 0.7900731, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.296875, + "step": 358, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01308618, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.04574156, + "balance_loss_mlp": 1.08024073, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 3.004949550769694, + "language_loss": 0.88491321, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90888453, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.28125, + "step": 359, + "time_per_iteration": 2.452698230743408 + }, + { + "auxiliary_loss_clip": 0.01316066, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.05000377, + "balance_loss_mlp": 1.08438587, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.789884231725057, + "language_loss": 0.76007903, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.3125, + "step": 360, + "time_per_iteration": 2.490006685256958 + }, + { + "auxiliary_loss_clip": 0.01189834, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06634831, + "balance_loss_mlp": 1.06162107, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8685264055585812, + "language_loss": 0.64943242, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67212784, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 1.28125, + "step": 361, + "time_per_iteration": 3.1978280544281006 + }, + { + "auxiliary_loss_clip": 0.01307066, + "auxiliary_loss_mlp": 0.01088482, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.0776422, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.6839093883440213, + "language_loss": 0.78157276, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80552828, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.296875, + "step": 362, + "time_per_iteration": 2.5401153564453125 + }, + { + "auxiliary_loss_clip": 0.0131339, + "auxiliary_loss_mlp": 0.01092034, + "balance_loss_clip": 1.05171776, + "balance_loss_mlp": 1.08265781, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.163466714708112, + "language_loss": 0.92508751, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94914174, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 363, + "time_per_iteration": 2.4868171215057373 + }, + { + "auxiliary_loss_clip": 0.01307593, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_clip": 1.06270981, + "balance_loss_mlp": 1.08121252, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.137373361500905, + "language_loss": 0.89611077, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92020839, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 364, + "time_per_iteration": 2.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01313873, + "auxiliary_loss_mlp": 0.01094072, + "balance_loss_clip": 1.05232477, + "balance_loss_mlp": 1.08512843, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.0040846596101867, + "language_loss": 0.79597497, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82005441, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.28125, + "step": 365, + "time_per_iteration": 2.5358779430389404 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_clip": 1.05218291, + "balance_loss_mlp": 1.08262253, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.4198695758814126, + "language_loss": 0.84312123, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86713445, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.25, + "step": 366, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01314411, + "auxiliary_loss_mlp": 0.01089093, + "balance_loss_clip": 1.05008757, + "balance_loss_mlp": 1.08409071, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.4790438398014114, + "language_loss": 0.87009263, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89412761, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.296875, + "step": 367, + "time_per_iteration": 2.486476421356201 + }, + { + "auxiliary_loss_clip": 0.01315695, + "auxiliary_loss_mlp": 0.01094559, + "balance_loss_clip": 1.05247772, + "balance_loss_mlp": 1.08183074, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.1787846704720906, + "language_loss": 0.84725291, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87135541, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.34375, + "step": 368, + "time_per_iteration": 2.522035837173462 + }, + { + "auxiliary_loss_clip": 0.01314671, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.04559815, + "balance_loss_mlp": 1.07997978, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.205334425353566, + "language_loss": 0.75328851, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77728999, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.34375, + "step": 369, + "time_per_iteration": 2.5247385501861572 + }, + { + "auxiliary_loss_clip": 0.01309465, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.06241453, + "balance_loss_mlp": 1.08204889, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.195001895084689, + "language_loss": 0.82444763, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.84857059, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.28125, + "step": 370, + "time_per_iteration": 2.556654453277588 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.01102256, + "balance_loss_clip": 1.06186807, + "balance_loss_mlp": 1.08148122, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.701167396379405, + "language_loss": 0.81576145, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83986878, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.265625, + "step": 371, + "time_per_iteration": 2.5303707122802734 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01097647, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.08685589, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.421527930745161, + "language_loss": 0.83273733, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85685182, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 372, + "time_per_iteration": 2.528141975402832 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.01093239, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.08068216, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.9515576064335742, + "language_loss": 0.78448784, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80846798, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.234375, + "step": 373, + "time_per_iteration": 2.4879236221313477 + }, + { + "auxiliary_loss_clip": 0.01310159, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.08387947, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.577150517784044, + "language_loss": 0.77507353, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79906291, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.265625, + "step": 374, + "time_per_iteration": 2.467660665512085 + }, + { + "auxiliary_loss_clip": 0.01300907, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_clip": 1.03415811, + "balance_loss_mlp": 1.07458413, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 2.1361288872426187, + "language_loss": 0.85989249, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.8836568, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.265625, + "step": 375, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01307901, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.05767775, + "balance_loss_mlp": 1.08341241, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 5.5735447387306785, + "language_loss": 0.89170349, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91578341, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.25, + "step": 376, + "time_per_iteration": 2.53151798248291 + }, + { + "auxiliary_loss_clip": 0.01309113, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.04908752, + "balance_loss_mlp": 1.07899499, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.261190841992283, + "language_loss": 0.74947262, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77344215, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.3125, + "step": 377, + "time_per_iteration": 2.463115692138672 + }, + { + "auxiliary_loss_clip": 0.0129987, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.05009794, + "balance_loss_mlp": 1.08131123, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 9.398931100052017, + "language_loss": 0.99195766, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01586914, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 2.1875, + "step": 378, + "time_per_iteration": 2.4765851497650146 + }, + { + "auxiliary_loss_clip": 0.01180245, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.10910404, + "balance_loss_mlp": 1.06006432, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9843357397114052, + "language_loss": 0.75457036, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77759647, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.203125, + "step": 379, + "time_per_iteration": 3.113067388534546 + }, + { + "auxiliary_loss_clip": 0.01308809, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.0448581, + "balance_loss_mlp": 1.07811105, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 4.195302770466088, + "language_loss": 0.78423429, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80815697, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.3125, + "step": 380, + "time_per_iteration": 2.6457204818725586 + }, + { + "auxiliary_loss_clip": 0.01302565, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.04527259, + "balance_loss_mlp": 1.08019924, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.272240555091753, + "language_loss": 0.9679752, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99183118, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.21875, + "step": 381, + "time_per_iteration": 2.485316038131714 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.04501581, + "balance_loss_mlp": 1.08177519, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.322972014312181, + "language_loss": 0.88035834, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90432727, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.3125, + "step": 382, + "time_per_iteration": 2.5361156463623047 + }, + { + "auxiliary_loss_clip": 0.01306631, + "auxiliary_loss_mlp": 0.01099641, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.08242524, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.197151340607638, + "language_loss": 0.84830511, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87236774, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.25, + "step": 383, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.01303681, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.06673658, + "balance_loss_mlp": 1.08259249, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2992198386883116, + "language_loss": 0.83199835, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85609907, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.203125, + "step": 384, + "time_per_iteration": 2.5008413791656494 + }, + { + "auxiliary_loss_clip": 0.01303616, + "auxiliary_loss_mlp": 0.0109643, + "balance_loss_clip": 1.06030965, + "balance_loss_mlp": 1.08539534, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8570399395654076, + "language_loss": 0.89240694, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91640741, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.1875, + "step": 385, + "time_per_iteration": 2.4913859367370605 + }, + { + "auxiliary_loss_clip": 0.01306859, + "auxiliary_loss_mlp": 0.01121647, + "balance_loss_clip": 1.08397639, + "balance_loss_mlp": 1.0826149, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.2576284783670357, + "language_loss": 0.70096415, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72524917, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.234375, + "step": 386, + "time_per_iteration": 2.5017154216766357 + }, + { + "auxiliary_loss_clip": 0.01308067, + "auxiliary_loss_mlp": 0.01098351, + "balance_loss_clip": 1.06072879, + "balance_loss_mlp": 1.08460176, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9470877788533054, + "language_loss": 0.87909782, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90316188, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.234375, + "step": 387, + "time_per_iteration": 2.5142157077789307 + }, + { + "auxiliary_loss_clip": 0.01308318, + "auxiliary_loss_mlp": 0.01085815, + "balance_loss_clip": 1.04666662, + "balance_loss_mlp": 1.08291698, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.441105853176172, + "language_loss": 0.83429295, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85823429, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.25, + "step": 388, + "time_per_iteration": 2.591242790222168 + }, + { + "auxiliary_loss_clip": 0.01305661, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_clip": 1.05754054, + "balance_loss_mlp": 1.08271885, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2646980282386644, + "language_loss": 0.93823689, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96223652, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.21875, + "step": 389, + "time_per_iteration": 2.5427236557006836 + }, + { + "auxiliary_loss_clip": 0.01299094, + "auxiliary_loss_mlp": 0.01087693, + "balance_loss_clip": 1.04954624, + "balance_loss_mlp": 1.08334351, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.08298220488583, + "language_loss": 0.87901413, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90288198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.15625, + "step": 390, + "time_per_iteration": 2.53519606590271 + }, + { + "auxiliary_loss_clip": 0.01304239, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_clip": 1.05927861, + "balance_loss_mlp": 1.08334053, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.2293869448662362, + "language_loss": 0.89346433, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91746497, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.203125, + "step": 391, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01302453, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.08116579, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.022763227206087, + "language_loss": 0.86065882, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88441086, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.21875, + "step": 392, + "time_per_iteration": 4.050429105758667 + }, + { + "auxiliary_loss_clip": 0.01297975, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.08006191, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9628480690926318, + "language_loss": 0.88900077, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91284919, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.1875, + "step": 393, + "time_per_iteration": 3.9293932914733887 + }, + { + "auxiliary_loss_clip": 0.01309989, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.06449771, + "balance_loss_mlp": 1.087502, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0531375516435943, + "language_loss": 0.81400156, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83814055, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.21875, + "step": 394, + "time_per_iteration": 2.552100658416748 + }, + { + "auxiliary_loss_clip": 0.01299653, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.04611897, + "balance_loss_mlp": 1.08043575, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.0447414784698092, + "language_loss": 0.86189264, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88573563, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.1875, + "step": 395, + "time_per_iteration": 2.536823272705078 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.0590049, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9487784547172928, + "language_loss": 0.63808912, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66028047, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.15625, + "step": 396, + "time_per_iteration": 2.935506582260132 + }, + { + "auxiliary_loss_clip": 0.01296295, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_clip": 1.03252339, + "balance_loss_mlp": 1.07895613, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6168641306315172, + "language_loss": 0.83744055, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86109853, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.171875, + "step": 397, + "time_per_iteration": 2.5051028728485107 + }, + { + "auxiliary_loss_clip": 0.01302535, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05288601, + "balance_loss_mlp": 1.08300877, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.535145802301163, + "language_loss": 0.84050488, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86444056, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.1875, + "step": 398, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01300466, + "auxiliary_loss_mlp": 0.0108273, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.07864475, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.904470095612531, + "language_loss": 0.85865271, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88248467, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.21875, + "step": 399, + "time_per_iteration": 2.4674201011657715 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.08021355, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.016759933832732, + "language_loss": 0.86157769, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88546383, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.15625, + "step": 400, + "time_per_iteration": 2.554075241088867 + }, + { + "auxiliary_loss_clip": 0.01303599, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.04652512, + "balance_loss_mlp": 1.0848943, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 3.068890951588493, + "language_loss": 0.79142016, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.8152917, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.1875, + "step": 401, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01297911, + "auxiliary_loss_mlp": 0.01096359, + "balance_loss_clip": 1.05968988, + "balance_loss_mlp": 1.07987046, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.2009554384450154, + "language_loss": 0.78456193, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80850464, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.1875, + "step": 402, + "time_per_iteration": 2.5531415939331055 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.04529142, + "balance_loss_mlp": 1.07989287, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7198213535828923, + "language_loss": 0.94637424, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97023368, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 403, + "time_per_iteration": 2.4873671531677246 + }, + { + "auxiliary_loss_clip": 0.01306025, + "auxiliary_loss_mlp": 0.01095616, + "balance_loss_clip": 1.05620587, + "balance_loss_mlp": 1.07952547, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3706875621243246, + "language_loss": 0.99751151, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02152789, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 404, + "time_per_iteration": 2.5400550365448 + }, + { + "auxiliary_loss_clip": 0.01304501, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.06716657, + "balance_loss_mlp": 1.08213115, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.480197457162756, + "language_loss": 0.87603909, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90012866, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.21875, + "step": 405, + "time_per_iteration": 2.4698479175567627 + }, + { + "auxiliary_loss_clip": 0.01314075, + "auxiliary_loss_mlp": 0.01107285, + "balance_loss_clip": 1.06835127, + "balance_loss_mlp": 1.08775485, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 3.242686201363518, + "language_loss": 0.93258083, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9567945, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.265625, + "step": 406, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.01092168, + "balance_loss_clip": 1.05330622, + "balance_loss_mlp": 1.08378315, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.059728688773918, + "language_loss": 0.87446553, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89843762, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.21875, + "step": 407, + "time_per_iteration": 2.5017173290252686 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.04814506, + "balance_loss_mlp": 1.08445001, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.439524495250932, + "language_loss": 0.7404871, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76435596, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.171875, + "step": 408, + "time_per_iteration": 2.6097092628479004 + }, + { + "auxiliary_loss_clip": 0.013061, + "auxiliary_loss_mlp": 0.01096961, + "balance_loss_clip": 1.05771768, + "balance_loss_mlp": 1.08381224, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.750776221383638, + "language_loss": 0.92393035, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94796097, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.21875, + "step": 409, + "time_per_iteration": 2.5198304653167725 + }, + { + "auxiliary_loss_clip": 0.01304769, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_clip": 1.04488206, + "balance_loss_mlp": 1.0854609, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 1.9763435283924244, + "language_loss": 0.82926536, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85311788, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.1875, + "step": 410, + "time_per_iteration": 2.624333143234253 + }, + { + "auxiliary_loss_clip": 0.01307118, + "auxiliary_loss_mlp": 0.01089288, + "balance_loss_clip": 1.05164146, + "balance_loss_mlp": 1.08556843, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 4.176812441051998, + "language_loss": 0.77715993, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80112404, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.21875, + "step": 411, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01303549, + "auxiliary_loss_mlp": 0.01102238, + "balance_loss_clip": 1.06311393, + "balance_loss_mlp": 1.08078265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.1103060729449883, + "language_loss": 0.86276567, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88682353, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 412, + "time_per_iteration": 2.4968833923339844 + }, + { + "auxiliary_loss_clip": 0.01168305, + "auxiliary_loss_mlp": 0.01068817, + "balance_loss_clip": 1.05632353, + "balance_loss_mlp": 1.05478358, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8568818905087673, + "language_loss": 0.58512402, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60749531, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 1.1328125, + "step": 413, + "time_per_iteration": 3.1763217449188232 + }, + { + "auxiliary_loss_clip": 0.01296528, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.07941055, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7554792190049524, + "language_loss": 0.80704832, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83093566, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.171875, + "step": 414, + "time_per_iteration": 2.5954627990722656 + }, + { + "auxiliary_loss_clip": 0.01292737, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.07739186, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3796689224247904, + "language_loss": 0.80473328, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82859504, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.15625, + "step": 415, + "time_per_iteration": 2.471665620803833 + }, + { + "auxiliary_loss_clip": 0.0131185, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.0481931, + "balance_loss_mlp": 1.08601356, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 5.333540620494007, + "language_loss": 0.96179891, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98577416, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.25, + "step": 416, + "time_per_iteration": 2.5133068561553955 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03702867, + "balance_loss_mlp": 1.0806849, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.409464042642492, + "language_loss": 0.77541196, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79917544, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 417, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.01297091, + "auxiliary_loss_mlp": 0.01092626, + "balance_loss_clip": 1.05512297, + "balance_loss_mlp": 1.08281994, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6345521849457858, + "language_loss": 0.7689445, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 418, + "time_per_iteration": 2.6002862453460693 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.08383846, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.548681745998596, + "language_loss": 0.81088459, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83468759, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.203125, + "step": 419, + "time_per_iteration": 2.5097553730010986 + }, + { + "auxiliary_loss_clip": 0.01298642, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.0453577, + "balance_loss_mlp": 1.08236253, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9166879875817555, + "language_loss": 0.73812175, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.761962, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 2.15625, + "step": 420, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01298409, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.06086528, + "balance_loss_mlp": 1.0791508, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7246544027149788, + "language_loss": 0.78928417, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8132515, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.1875, + "step": 421, + "time_per_iteration": 2.583979845046997 + }, + { + "auxiliary_loss_clip": 0.01300301, + "auxiliary_loss_mlp": 0.01095113, + "balance_loss_clip": 1.05589294, + "balance_loss_mlp": 1.08374381, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.879256315405443, + "language_loss": 0.81915486, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84310895, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.171875, + "step": 422, + "time_per_iteration": 2.5834591388702393 + }, + { + "auxiliary_loss_clip": 0.01299282, + "auxiliary_loss_mlp": 0.01079788, + "balance_loss_clip": 1.0445497, + "balance_loss_mlp": 1.07925105, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9426129656279463, + "language_loss": 0.83468062, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85847133, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.203125, + "step": 423, + "time_per_iteration": 2.5526318550109863 + }, + { + "auxiliary_loss_clip": 0.01293361, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.04978371, + "balance_loss_mlp": 1.07668817, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.7010989411926367, + "language_loss": 0.74435121, + "learning_rate": 3.895134094768415e-06, + "loss": 0.768152, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.171875, + "step": 424, + "time_per_iteration": 2.606895923614502 + }, + { + "auxiliary_loss_clip": 0.01303473, + "auxiliary_loss_mlp": 0.01097188, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.08349586, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.227147445366898, + "language_loss": 0.83008313, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85408974, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.203125, + "step": 425, + "time_per_iteration": 2.522517442703247 + }, + { + "auxiliary_loss_clip": 0.01299491, + "auxiliary_loss_mlp": 0.01096328, + "balance_loss_clip": 1.05691719, + "balance_loss_mlp": 1.07528758, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.394258070540652, + "language_loss": 0.85481966, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87877786, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.25, + "step": 426, + "time_per_iteration": 2.5039095878601074 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.04526472, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8962322500302954, + "language_loss": 0.57186544, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5939464, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 1.1484375, + "step": 427, + "time_per_iteration": 3.2289342880249023 + }, + { + "auxiliary_loss_clip": 0.01297452, + "auxiliary_loss_mlp": 0.01095521, + "balance_loss_clip": 1.05849457, + "balance_loss_mlp": 1.0838623, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6536896946259816, + "language_loss": 0.88190198, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90583158, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.125, + "step": 428, + "time_per_iteration": 2.500389814376831 + }, + { + "auxiliary_loss_clip": 0.01290417, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.07718623, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6692033855414803, + "language_loss": 0.85672665, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88041949, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.125, + "step": 429, + "time_per_iteration": 2.605687379837036 + }, + { + "auxiliary_loss_clip": 0.01297427, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_clip": 1.04373491, + "balance_loss_mlp": 1.07673144, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.5023850128037672, + "language_loss": 0.88384748, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90764678, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.203125, + "step": 430, + "time_per_iteration": 2.593492269515991 + }, + { + "auxiliary_loss_clip": 0.01298542, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.04799962, + "balance_loss_mlp": 1.08428442, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9811912271744876, + "language_loss": 0.84202254, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86584389, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.140625, + "step": 431, + "time_per_iteration": 2.5326902866363525 + }, + { + "auxiliary_loss_clip": 0.01291302, + "auxiliary_loss_mlp": 0.01073914, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0772872, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.686150654607635, + "language_loss": 0.86775959, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89141178, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.140625, + "step": 432, + "time_per_iteration": 2.4793269634246826 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.04491723, + "balance_loss_mlp": 1.08109105, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.6953453355349684, + "language_loss": 0.76074433, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78451484, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.15625, + "step": 433, + "time_per_iteration": 2.6125545501708984 + }, + { + "auxiliary_loss_clip": 0.01296292, + "auxiliary_loss_mlp": 0.0109282, + "balance_loss_clip": 1.05312383, + "balance_loss_mlp": 1.07772529, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.2540618473103247, + "language_loss": 0.89764363, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153478, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.1875, + "step": 434, + "time_per_iteration": 5.3097922801971436 + }, + { + "auxiliary_loss_clip": 0.01297376, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.06404209, + "balance_loss_mlp": 1.08362865, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 6.328317132251919, + "language_loss": 0.7985189, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82252169, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 435, + "time_per_iteration": 3.9629530906677246 + }, + { + "auxiliary_loss_clip": 0.01291104, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.0750463, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.559504815450524, + "language_loss": 0.86357677, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739926, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.15625, + "step": 436, + "time_per_iteration": 2.479033946990967 + }, + { + "auxiliary_loss_clip": 0.01296325, + "auxiliary_loss_mlp": 0.01099771, + "balance_loss_clip": 1.06214869, + "balance_loss_mlp": 1.07964039, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.6168892141891944, + "language_loss": 0.75002837, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77398932, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.171875, + "step": 437, + "time_per_iteration": 2.508769989013672 + }, + { + "auxiliary_loss_clip": 0.01293849, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.08015561, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.3031145987765758, + "language_loss": 0.91467845, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93865746, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.140625, + "step": 438, + "time_per_iteration": 2.4693844318389893 + }, + { + "auxiliary_loss_clip": 0.01155458, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_clip": 1.05276346, + "balance_loss_mlp": 1.0448494, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.877669139368542, + "language_loss": 0.62577796, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64797509, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 1.109375, + "step": 439, + "time_per_iteration": 3.162259101867676 + }, + { + "auxiliary_loss_clip": 0.01303989, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_clip": 1.05873275, + "balance_loss_mlp": 1.08440769, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1384369611317493, + "language_loss": 0.75629139, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78031218, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.203125, + "step": 440, + "time_per_iteration": 2.5541677474975586 + }, + { + "auxiliary_loss_clip": 0.01294139, + "auxiliary_loss_mlp": 0.01082398, + "balance_loss_clip": 1.04408443, + "balance_loss_mlp": 1.08003163, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9583565981573345, + "language_loss": 0.83186466, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85563004, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 441, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01298235, + "auxiliary_loss_mlp": 0.01092726, + "balance_loss_clip": 1.05286217, + "balance_loss_mlp": 1.07855892, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.035076381127293, + "language_loss": 0.7850582, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80896777, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.203125, + "step": 442, + "time_per_iteration": 2.477555990219116 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01012445, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.04045749, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9584767110468104, + "language_loss": 0.64475185, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66633147, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 1.046875, + "step": 443, + "time_per_iteration": 2.9838714599609375 + }, + { + "auxiliary_loss_clip": 0.01297944, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.04941845, + "balance_loss_mlp": 1.08318424, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.4335650573352483, + "language_loss": 0.82707053, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85092688, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 444, + "time_per_iteration": 2.4520323276519775 + }, + { + "auxiliary_loss_clip": 0.0130195, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0440464, + "balance_loss_mlp": 1.08103406, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.6903851096875733, + "language_loss": 0.95400113, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97787213, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 445, + "time_per_iteration": 2.5113518238067627 + }, + { + "auxiliary_loss_clip": 0.01296406, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.08177555, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.416617421630428, + "language_loss": 0.91790259, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94183153, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.15625, + "step": 446, + "time_per_iteration": 2.4585111141204834 + }, + { + "auxiliary_loss_clip": 0.01293099, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.04718637, + "balance_loss_mlp": 1.08102632, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.3983095061811635, + "language_loss": 0.80024058, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82402921, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 447, + "time_per_iteration": 2.509643316268921 + }, + { + "auxiliary_loss_clip": 0.01292768, + "auxiliary_loss_mlp": 0.01072511, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.07935369, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.4579217038825423, + "language_loss": 0.86773896, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89139175, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 448, + "time_per_iteration": 2.477384328842163 + }, + { + "auxiliary_loss_clip": 0.01287268, + "auxiliary_loss_mlp": 0.01093327, + "balance_loss_clip": 1.0583508, + "balance_loss_mlp": 1.07870793, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.1426472419274503, + "language_loss": 0.88779259, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91159856, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.078125, + "step": 449, + "time_per_iteration": 2.50108003616333 + }, + { + "auxiliary_loss_clip": 0.01298718, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.08056545, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9975703664508544, + "language_loss": 0.80516291, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82902944, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 450, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.01291132, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.08217299, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.7768383062811637, + "language_loss": 0.81500483, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83869088, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.09375, + "step": 451, + "time_per_iteration": 2.530539035797119 + }, + { + "auxiliary_loss_clip": 0.01289442, + "auxiliary_loss_mlp": 0.0109125, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.08151317, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.5925691418309382, + "language_loss": 0.76994318, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79375011, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.078125, + "step": 452, + "time_per_iteration": 2.5138871669769287 + }, + { + "auxiliary_loss_clip": 0.01292925, + "auxiliary_loss_mlp": 0.01088314, + "balance_loss_clip": 1.0507158, + "balance_loss_mlp": 1.08201516, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.9334646917545748, + "language_loss": 0.73053265, + "learning_rate": 3.937730499067294e-06, + "loss": 0.754345, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.109375, + "step": 453, + "time_per_iteration": 2.5271401405334473 + }, + { + "auxiliary_loss_clip": 0.01288113, + "auxiliary_loss_mlp": 0.01086026, + "balance_loss_clip": 1.04952383, + "balance_loss_mlp": 1.08018303, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.845498968311748, + "language_loss": 0.82439983, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84814119, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 454, + "time_per_iteration": 2.6724069118499756 + }, + { + "auxiliary_loss_clip": 0.01290287, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_clip": 1.04491115, + "balance_loss_mlp": 1.0808264, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.1414002490484005, + "language_loss": 0.75815403, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78184646, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 2.09375, + "step": 455, + "time_per_iteration": 2.496913194656372 + }, + { + "auxiliary_loss_clip": 0.01290624, + "auxiliary_loss_mlp": 0.01097119, + "balance_loss_clip": 1.06114161, + "balance_loss_mlp": 1.07846022, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.102028743174525, + "language_loss": 0.80576169, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82963914, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 456, + "time_per_iteration": 2.4748263359069824 + }, + { + "auxiliary_loss_clip": 0.01286184, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_clip": 1.04152811, + "balance_loss_mlp": 1.07863176, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.479828414472028, + "language_loss": 0.81621009, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83985978, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 457, + "time_per_iteration": 2.5122945308685303 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.07828617, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0492464691581476, + "language_loss": 0.94062889, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96436661, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.109375, + "step": 458, + "time_per_iteration": 2.542919874191284 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.01093849, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.07926297, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4293190258203774, + "language_loss": 0.79353511, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81735277, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.09375, + "step": 459, + "time_per_iteration": 2.472830295562744 + }, + { + "auxiliary_loss_clip": 0.01293203, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.08543491, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8472887331493792, + "language_loss": 0.83103061, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85478914, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.078125, + "step": 460, + "time_per_iteration": 2.5376338958740234 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.03488147, + "balance_loss_mlp": 1.03798664, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5738760379538346, + "language_loss": 0.73565412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7574963, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08837891, + "router_z_loss_mlp": 1.0234375, + "step": 461, + "time_per_iteration": 3.0358285903930664 + }, + { + "auxiliary_loss_clip": 0.01289208, + "auxiliary_loss_mlp": 0.01081781, + "balance_loss_clip": 1.04735351, + "balance_loss_mlp": 1.086905, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.85425781388422, + "language_loss": 0.81291741, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83662736, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.015625, + "step": 462, + "time_per_iteration": 2.6079564094543457 + }, + { + "auxiliary_loss_clip": 0.01287586, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_clip": 1.04096127, + "balance_loss_mlp": 1.08167982, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2822341634579195, + "language_loss": 0.90235889, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92597055, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0625, + "step": 463, + "time_per_iteration": 2.4881155490875244 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01014393, + "balance_loss_clip": 1.00561893, + "balance_loss_mlp": 1.03824747, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8835585057209928, + "language_loss": 0.59031862, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61183739, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.9921875, + "step": 464, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.01299905, + "auxiliary_loss_mlp": 0.01097461, + "balance_loss_clip": 1.06081581, + "balance_loss_mlp": 1.08716702, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.8663863440598525, + "language_loss": 0.81203198, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83600569, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.125, + "step": 465, + "time_per_iteration": 2.5197718143463135 + }, + { + "auxiliary_loss_clip": 0.01286546, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.08028877, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.004656273762408, + "language_loss": 0.78560221, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80929601, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.0625, + "step": 466, + "time_per_iteration": 2.5151565074920654 + }, + { + "auxiliary_loss_clip": 0.01285777, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.05075812, + "balance_loss_mlp": 1.0816046, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.05931728393333, + "language_loss": 0.87548482, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89919734, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.03125, + "step": 467, + "time_per_iteration": 2.4994542598724365 + }, + { + "auxiliary_loss_clip": 0.01289137, + "auxiliary_loss_mlp": 0.01106554, + "balance_loss_clip": 1.06969416, + "balance_loss_mlp": 1.08202362, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.728881931821799, + "language_loss": 0.86217642, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88613331, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.0625, + "step": 468, + "time_per_iteration": 2.482377767562866 + }, + { + "auxiliary_loss_clip": 0.01287545, + "auxiliary_loss_mlp": 0.01081999, + "balance_loss_clip": 1.0447104, + "balance_loss_mlp": 1.07984936, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 3.6924571591440762, + "language_loss": 0.91605878, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.93975413, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 469, + "time_per_iteration": 2.471510648727417 + }, + { + "auxiliary_loss_clip": 0.01286876, + "auxiliary_loss_mlp": 0.01096778, + "balance_loss_clip": 1.06106234, + "balance_loss_mlp": 1.08290672, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 8.38112094971343, + "language_loss": 0.81587195, + "learning_rate": 3.96145038000181e-06, + "loss": 0.83970851, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 470, + "time_per_iteration": 2.5398614406585693 + }, + { + "auxiliary_loss_clip": 0.01286572, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.07859015, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8437898933227894, + "language_loss": 0.93147206, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551928, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.078125, + "step": 471, + "time_per_iteration": 2.5005030632019043 + }, + { + "auxiliary_loss_clip": 0.0128173, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.05885458, + "balance_loss_mlp": 1.07808042, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.89303735573371, + "language_loss": 0.757568, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78133243, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 472, + "time_per_iteration": 2.597637176513672 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.07699013, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 3.986951446490631, + "language_loss": 0.93354845, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95722055, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.125, + "step": 473, + "time_per_iteration": 2.4882545471191406 + }, + { + "auxiliary_loss_clip": 0.01293922, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.08134401, + "balance_loss_mlp": 1.08149064, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.845992674029067, + "language_loss": 0.88586211, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90995455, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.125, + "step": 474, + "time_per_iteration": 2.483210563659668 + }, + { + "auxiliary_loss_clip": 0.01284496, + "auxiliary_loss_mlp": 0.01091761, + "balance_loss_clip": 1.05559278, + "balance_loss_mlp": 1.07983565, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.474550917046853, + "language_loss": 0.78771299, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81147563, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.046875, + "step": 475, + "time_per_iteration": 2.5462486743927 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_clip": 1.06647348, + "balance_loss_mlp": 1.03907108, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9304884927077405, + "language_loss": 0.66880804, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6909551, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08935547, + "router_z_loss_mlp": 1.0, + "step": 476, + "time_per_iteration": 5.8287513256073 + }, + { + "auxiliary_loss_clip": 0.01286666, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.0796659, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.9569520931335775, + "language_loss": 0.83852398, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86220837, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 477, + "time_per_iteration": 2.5179195404052734 + }, + { + "auxiliary_loss_clip": 0.01293161, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.05164671, + "balance_loss_mlp": 1.08298135, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.2048636254017504, + "language_loss": 0.82267237, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84648502, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.09375, + "step": 478, + "time_per_iteration": 2.495760679244995 + }, + { + "auxiliary_loss_clip": 0.01283274, + "auxiliary_loss_mlp": 0.01076252, + "balance_loss_clip": 1.0409658, + "balance_loss_mlp": 1.07707858, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.28603697529264, + "language_loss": 0.81010443, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8336997, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 2.0625, + "step": 479, + "time_per_iteration": 2.491910934448242 + }, + { + "auxiliary_loss_clip": 0.01281719, + "auxiliary_loss_mlp": 0.01080307, + "balance_loss_clip": 1.04323328, + "balance_loss_mlp": 1.07729793, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.2385690137770715, + "language_loss": 0.73465097, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75827128, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.03125, + "step": 480, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.01280408, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_clip": 1.03945768, + "balance_loss_mlp": 1.07837129, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.6612342828976938, + "language_loss": 0.87719476, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90071172, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 2.03125, + "step": 481, + "time_per_iteration": 2.534792184829712 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.01367593, + "balance_loss_mlp": 1.03470159, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8259666239631118, + "language_loss": 0.66064727, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68227088, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 1.046875, + "step": 482, + "time_per_iteration": 2.8219997882843018 + }, + { + "auxiliary_loss_clip": 0.01295379, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.04014635, + "balance_loss_mlp": 1.08159328, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.373570732629757, + "language_loss": 0.78743541, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81112754, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.140625, + "step": 483, + "time_per_iteration": 2.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01293434, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.0548625, + "balance_loss_mlp": 1.08311069, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.520023812901894, + "language_loss": 0.75405324, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77789688, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.109375, + "step": 484, + "time_per_iteration": 2.466634750366211 + }, + { + "auxiliary_loss_clip": 0.01288089, + "auxiliary_loss_mlp": 0.01078618, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.08002305, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0323982063196153, + "language_loss": 0.84021544, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86388254, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.078125, + "step": 485, + "time_per_iteration": 2.511415719985962 + }, + { + "auxiliary_loss_clip": 0.01293039, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.08659554, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 1.9066132168030567, + "language_loss": 0.84465218, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86840165, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 2.0625, + "step": 486, + "time_per_iteration": 2.453583002090454 + }, + { + "auxiliary_loss_clip": 0.01284719, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.07841349, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.9228432408219163, + "language_loss": 0.8891986, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91288453, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.0625, + "step": 487, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0128758, + "auxiliary_loss_mlp": 0.01070867, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.08095598, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5260996981700456, + "language_loss": 0.87981069, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90339512, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0625, + "step": 488, + "time_per_iteration": 2.5299952030181885 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01079627, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.07794333, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1862911790042543, + "language_loss": 0.88956475, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9131943, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.046875, + "step": 489, + "time_per_iteration": 2.545240879058838 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01078157, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.07402337, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.0397830948196756, + "language_loss": 0.88539088, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90894926, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.03125, + "step": 490, + "time_per_iteration": 2.4727838039398193 + }, + { + "auxiliary_loss_clip": 0.01284238, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.07731342, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.230679327742206, + "language_loss": 0.91299963, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93665713, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 2.0625, + "step": 491, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01274874, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.0749476, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.419480988494796, + "language_loss": 0.85232413, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87577969, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0, + "step": 492, + "time_per_iteration": 2.457188844680786 + }, + { + "auxiliary_loss_clip": 0.0128558, + "auxiliary_loss_mlp": 0.01093772, + "balance_loss_clip": 1.05939209, + "balance_loss_mlp": 1.08082771, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.826333733481051, + "language_loss": 0.83989829, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86369187, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.046875, + "step": 493, + "time_per_iteration": 2.4821553230285645 + }, + { + "auxiliary_loss_clip": 0.01280126, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_clip": 1.04586005, + "balance_loss_mlp": 1.07578444, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8259196989393787, + "language_loss": 0.86575663, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88934839, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 494, + "time_per_iteration": 2.507068395614624 + }, + { + "auxiliary_loss_clip": 0.01286409, + "auxiliary_loss_mlp": 0.01082408, + "balance_loss_clip": 1.05084157, + "balance_loss_mlp": 1.07973599, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 4.414490317498679, + "language_loss": 0.86250752, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88619578, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.0625, + "step": 495, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.01274095, + "auxiliary_loss_mlp": 0.0107342, + "balance_loss_clip": 1.04144859, + "balance_loss_mlp": 1.07653904, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.893732744603442, + "language_loss": 0.6230706, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64654577, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9765625, + "step": 496, + "time_per_iteration": 2.499669313430786 + }, + { + "auxiliary_loss_clip": 0.01276388, + "auxiliary_loss_mlp": 0.01085353, + "balance_loss_clip": 1.05314219, + "balance_loss_mlp": 1.07830799, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.8423417765009742, + "language_loss": 0.88582325, + "learning_rate": 3.997414244783595e-06, + "loss": 0.90944064, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.984375, + "step": 497, + "time_per_iteration": 2.5570924282073975 + }, + { + "auxiliary_loss_clip": 0.01282787, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.07822609, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.4064142479622377, + "language_loss": 0.85174376, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87537515, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 498, + "time_per_iteration": 2.513601541519165 + }, + { + "auxiliary_loss_clip": 0.01281177, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.07829463, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 37.23719619981942, + "language_loss": 0.78152531, + "learning_rate": 4e-06, + "loss": 0.80516517, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 499, + "time_per_iteration": 2.4924824237823486 + }, + { + "auxiliary_loss_clip": 0.01282354, + "auxiliary_loss_mlp": 0.01080564, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.08037949, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.687829420060643, + "language_loss": 0.8271451, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85077423, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.015625, + "step": 500, + "time_per_iteration": 2.494333028793335 + }, + { + "auxiliary_loss_clip": 0.01274571, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.07541978, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.6096117253121447, + "language_loss": 0.88464928, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90823889, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.9921875, + "step": 501, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.01283018, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.04158127, + "balance_loss_mlp": 1.07912767, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.304054979465899, + "language_loss": 0.86586684, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88942778, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 502, + "time_per_iteration": 2.4574413299560547 + }, + { + "auxiliary_loss_clip": 0.01278734, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.07952762, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6244890775354976, + "language_loss": 0.84661186, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87017757, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9921875, + "step": 503, + "time_per_iteration": 2.4406938552856445 + }, + { + "auxiliary_loss_clip": 0.0127278, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.07727659, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6755724800263092, + "language_loss": 0.88215417, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90570992, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 504, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.01274883, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05556226, + "balance_loss_mlp": 1.07692564, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.2080583468347, + "language_loss": 0.78446162, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9765625, + "step": 505, + "time_per_iteration": 2.4724690914154053 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.00927854, + "balance_loss_mlp": 1.04092085, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8461866637376847, + "language_loss": 0.55057126, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57211095, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 0.9453125, + "step": 506, + "time_per_iteration": 3.2490124702453613 + }, + { + "auxiliary_loss_clip": 0.01274292, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.0756762, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.9034614277572226, + "language_loss": 0.83767861, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8612929, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 507, + "time_per_iteration": 2.48811674118042 + }, + { + "auxiliary_loss_clip": 0.01280318, + "auxiliary_loss_mlp": 0.01080114, + "balance_loss_clip": 1.04778421, + "balance_loss_mlp": 1.07709789, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5950154193771526, + "language_loss": 0.88689649, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91050076, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 508, + "time_per_iteration": 2.4966533184051514 + }, + { + "auxiliary_loss_clip": 0.01281637, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.07728887, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2339008285543227, + "language_loss": 0.71499902, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73845309, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 509, + "time_per_iteration": 2.5966317653656006 + }, + { + "auxiliary_loss_clip": 0.01274736, + "auxiliary_loss_mlp": 0.01072718, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.07770133, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.064360756351981, + "language_loss": 0.82369828, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8471728, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9765625, + "step": 510, + "time_per_iteration": 2.5276355743408203 + }, + { + "auxiliary_loss_clip": 0.01280977, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.04984498, + "balance_loss_mlp": 1.08235979, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.1614325499153693, + "language_loss": 0.83621502, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85985172, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 511, + "time_per_iteration": 2.503779888153076 + }, + { + "auxiliary_loss_clip": 0.01278507, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.07648492, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1059740170821515, + "language_loss": 0.82234836, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8459124, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 512, + "time_per_iteration": 2.5306975841522217 + }, + { + "auxiliary_loss_clip": 0.01276149, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0769974, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.9256325141107502, + "language_loss": 0.87030005, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89384103, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.9921875, + "step": 513, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01281572, + "auxiliary_loss_mlp": 0.01080973, + "balance_loss_clip": 1.04840553, + "balance_loss_mlp": 1.07869625, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.202753983864072, + "language_loss": 0.79141152, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81503695, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 514, + "time_per_iteration": 2.515496015548706 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.01063014, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.07966864, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.5461002634459216, + "language_loss": 0.77459693, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79799432, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 515, + "time_per_iteration": 2.481903553009033 + }, + { + "auxiliary_loss_clip": 0.01272098, + "auxiliary_loss_mlp": 0.0106896, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.07318711, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.901518391780262, + "language_loss": 0.82729101, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85070157, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9921875, + "step": 516, + "time_per_iteration": 2.699577808380127 + }, + { + "auxiliary_loss_clip": 0.01272185, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_clip": 1.03760433, + "balance_loss_mlp": 1.07659435, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.071844032637654, + "language_loss": 0.79009813, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81352293, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 517, + "time_per_iteration": 4.0190205574035645 + }, + { + "auxiliary_loss_clip": 0.01269009, + "auxiliary_loss_mlp": 0.01072314, + "balance_loss_clip": 1.04069996, + "balance_loss_mlp": 1.07610774, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.58218863781409, + "language_loss": 0.90778029, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93119347, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9296875, + "step": 518, + "time_per_iteration": 4.080751657485962 + }, + { + "auxiliary_loss_clip": 0.0128372, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.08518016, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 3.008779144342936, + "language_loss": 0.86396456, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88773847, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.984375, + "step": 519, + "time_per_iteration": 2.510267734527588 + }, + { + "auxiliary_loss_clip": 0.01278708, + "auxiliary_loss_mlp": 0.01092513, + "balance_loss_clip": 1.06092215, + "balance_loss_mlp": 1.07567024, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0313723427087216, + "language_loss": 0.87156898, + "learning_rate": 3.999983277259057e-06, + "loss": 0.8952812, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 520, + "time_per_iteration": 2.4891066551208496 + }, + { + "auxiliary_loss_clip": 0.01281744, + "auxiliary_loss_mlp": 0.01089643, + "balance_loss_clip": 1.05633557, + "balance_loss_mlp": 1.07832289, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.6802829394342778, + "language_loss": 0.89362079, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91733468, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.03125, + "step": 521, + "time_per_iteration": 2.508524179458618 + }, + { + "auxiliary_loss_clip": 0.01274208, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.07795191, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.273639697525746, + "language_loss": 0.71327078, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73684484, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9609375, + "step": 522, + "time_per_iteration": 2.49629282951355 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.05150533, + "balance_loss_mlp": 1.07655358, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1208656196394706, + "language_loss": 0.84886295, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87248302, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.015625, + "step": 523, + "time_per_iteration": 2.4674315452575684 + }, + { + "auxiliary_loss_clip": 0.01280597, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.04249442, + "balance_loss_mlp": 1.07655168, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 1.9693639011355857, + "language_loss": 0.90419745, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92775881, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.046875, + "step": 524, + "time_per_iteration": 2.480764627456665 + }, + { + "auxiliary_loss_clip": 0.01285248, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.05034757, + "balance_loss_mlp": 1.08102393, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4392367222760276, + "language_loss": 0.80040443, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8240968, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.046875, + "step": 525, + "time_per_iteration": 2.5409629344940186 + }, + { + "auxiliary_loss_clip": 0.01277675, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.05025804, + "balance_loss_mlp": 1.07571197, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.8378410017413658, + "language_loss": 0.80693865, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83054531, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.03125, + "step": 526, + "time_per_iteration": 2.4509081840515137 + }, + { + "auxiliary_loss_clip": 0.01285808, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.03655052, + "balance_loss_mlp": 1.08127069, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.27970800213601, + "language_loss": 0.81417823, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83775997, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.046875, + "step": 527, + "time_per_iteration": 2.4760756492614746 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01080634, + "balance_loss_clip": 1.04651666, + "balance_loss_mlp": 1.07408452, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.59751390244888, + "language_loss": 0.93932182, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96286595, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.0, + "step": 528, + "time_per_iteration": 2.4721155166625977 + }, + { + "auxiliary_loss_clip": 0.01273884, + "auxiliary_loss_mlp": 0.01073354, + "balance_loss_clip": 1.04083371, + "balance_loss_mlp": 1.07427406, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8844039207994492, + "language_loss": 0.84143054, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86490291, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 529, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.01278919, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.08254409, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.130233453276154, + "language_loss": 0.90547037, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92913085, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.96875, + "step": 530, + "time_per_iteration": 2.5096359252929688 + }, + { + "auxiliary_loss_clip": 0.0127291, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07199419, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.12169085676626, + "language_loss": 0.76197046, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78543139, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.015625, + "step": 531, + "time_per_iteration": 2.503265142440796 + }, + { + "auxiliary_loss_clip": 0.01272973, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.03030038, + "balance_loss_mlp": 1.07424712, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.621085079916904, + "language_loss": 0.9073056, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9306798, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 532, + "time_per_iteration": 2.506220817565918 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01010615, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0428524, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7306749876416057, + "language_loss": 0.57931173, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60079145, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.08642578, + "router_z_loss_mlp": 0.9453125, + "step": 533, + "time_per_iteration": 3.154953956604004 + }, + { + "auxiliary_loss_clip": 0.01271016, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.07378936, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8972625930530718, + "language_loss": 0.86725944, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89081717, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.96875, + "step": 534, + "time_per_iteration": 2.5384750366210938 + }, + { + "auxiliary_loss_clip": 0.01271847, + "auxiliary_loss_mlp": 0.01066511, + "balance_loss_clip": 1.03468204, + "balance_loss_mlp": 1.07573223, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.118212102173022, + "language_loss": 0.77352351, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79690707, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.9609375, + "step": 535, + "time_per_iteration": 2.517940044403076 + }, + { + "auxiliary_loss_clip": 0.01274503, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_clip": 1.05151725, + "balance_loss_mlp": 1.07644773, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.176836888233088, + "language_loss": 0.8074764, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83105373, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.984375, + "step": 536, + "time_per_iteration": 2.546128034591675 + }, + { + "auxiliary_loss_clip": 0.01275643, + "auxiliary_loss_mlp": 0.01077633, + "balance_loss_clip": 1.04361033, + "balance_loss_mlp": 1.07698941, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.3353202427960627, + "language_loss": 0.70118421, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72471696, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 537, + "time_per_iteration": 2.578101634979248 + }, + { + "auxiliary_loss_clip": 0.01274556, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.04877353, + "balance_loss_mlp": 1.08040798, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1000918694055044, + "language_loss": 0.8250435, + "learning_rate": 3.999942323804607e-06, + "loss": 0.84860539, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9375, + "step": 538, + "time_per_iteration": 2.4822683334350586 + }, + { + "auxiliary_loss_clip": 0.01280793, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_clip": 1.0458765, + "balance_loss_mlp": 1.0775007, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8128048759039839, + "language_loss": 0.78999949, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81359327, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 539, + "time_per_iteration": 2.5495705604553223 + }, + { + "auxiliary_loss_clip": 0.01274183, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_clip": 1.03284597, + "balance_loss_mlp": 1.0766232, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.6651388031929835, + "language_loss": 0.77802742, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80143911, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.9765625, + "step": 540, + "time_per_iteration": 2.5547144412994385 + }, + { + "auxiliary_loss_clip": 0.01282159, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.08122253, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.2422114385304845, + "language_loss": 0.85410464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8776263, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 541, + "time_per_iteration": 2.517545700073242 + }, + { + "auxiliary_loss_clip": 0.01271503, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.07759655, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.210152212848466, + "language_loss": 0.89072484, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91427547, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9375, + "step": 542, + "time_per_iteration": 2.437566041946411 + }, + { + "auxiliary_loss_clip": 0.01272694, + "auxiliary_loss_mlp": 0.01075801, + "balance_loss_clip": 1.04289961, + "balance_loss_mlp": 1.07649362, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3494598042187236, + "language_loss": 0.71096039, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73444533, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9609375, + "step": 543, + "time_per_iteration": 2.5121288299560547 + }, + { + "auxiliary_loss_clip": 0.0127171, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.07139826, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.6617228213889375, + "language_loss": 0.91273057, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93631637, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0, + "step": 544, + "time_per_iteration": 2.529536008834839 + }, + { + "auxiliary_loss_clip": 0.01274727, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.07790041, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.144073602630947, + "language_loss": 0.6640051, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68757957, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 545, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01272187, + "auxiliary_loss_mlp": 0.01069604, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.07393909, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.6288964335615805, + "language_loss": 0.91857421, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94199216, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.984375, + "step": 546, + "time_per_iteration": 2.4893922805786133 + }, + { + "auxiliary_loss_clip": 0.0126813, + "auxiliary_loss_mlp": 0.01071134, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.07095337, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.4455611041839127, + "language_loss": 0.82002354, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84341609, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 547, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01070995, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.07550538, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.015836198351779, + "language_loss": 0.80919325, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83261865, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9609375, + "step": 548, + "time_per_iteration": 2.501983404159546 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01079421, + "balance_loss_clip": 1.04499304, + "balance_loss_mlp": 1.07411838, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.9904289991591217, + "language_loss": 0.67330974, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69681287, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 549, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01265753, + "auxiliary_loss_mlp": 0.01075673, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.07537639, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.081726350608672, + "language_loss": 0.86137938, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88479364, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.90625, + "step": 550, + "time_per_iteration": 2.435030221939087 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01089379, + "balance_loss_clip": 1.05712056, + "balance_loss_mlp": 1.07876444, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.0024940554917534, + "language_loss": 0.81302834, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83663994, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9296875, + "step": 551, + "time_per_iteration": 2.474317789077759 + }, + { + "auxiliary_loss_clip": 0.01278525, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.0786469, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.5540153370218697, + "language_loss": 0.85907811, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88266373, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.0, + "step": 552, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.01276099, + "auxiliary_loss_mlp": 0.01077197, + "balance_loss_clip": 1.0428648, + "balance_loss_mlp": 1.07894135, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3148388677976253, + "language_loss": 0.928128, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95166099, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 553, + "time_per_iteration": 2.4860291481018066 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.01072703, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.0755136, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.234190064541142, + "language_loss": 0.78874755, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81218415, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.953125, + "step": 554, + "time_per_iteration": 2.4878416061401367 + }, + { + "auxiliary_loss_clip": 0.0126611, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_clip": 1.04838455, + "balance_loss_mlp": 1.07417822, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1365458646452424, + "language_loss": 0.82297659, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9140625, + "step": 555, + "time_per_iteration": 2.4846394062042236 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.01075464, + "balance_loss_clip": 1.04156113, + "balance_loss_mlp": 1.07390678, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.781828445596944, + "language_loss": 0.88624835, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90970379, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 556, + "time_per_iteration": 2.5788674354553223 + }, + { + "auxiliary_loss_clip": 0.01269545, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.04531527, + "balance_loss_mlp": 1.07534254, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0860752820949586, + "language_loss": 0.83492053, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85840911, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9375, + "step": 557, + "time_per_iteration": 2.5352954864501953 + }, + { + "auxiliary_loss_clip": 0.01275093, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.07979858, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 9.145612151583265, + "language_loss": 0.94169575, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96511185, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.953125, + "step": 558, + "time_per_iteration": 2.4541964530944824 + }, + { + "auxiliary_loss_clip": 0.01265501, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.07178497, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.48174106566098, + "language_loss": 0.7735827, + "learning_rate": 3.99986348919176e-06, + "loss": 0.7969684, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9375, + "step": 559, + "time_per_iteration": 5.362890005111694 + }, + { + "auxiliary_loss_clip": 0.01268387, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.07386613, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.071149038386511, + "language_loss": 0.87681198, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90028548, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.953125, + "step": 560, + "time_per_iteration": 3.9536426067352295 + }, + { + "auxiliary_loss_clip": 0.01264547, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.07323277, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2284071587683463, + "language_loss": 0.81380183, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83712727, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9140625, + "step": 561, + "time_per_iteration": 2.49826717376709 + }, + { + "auxiliary_loss_clip": 0.01263917, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.07403696, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7768341081574646, + "language_loss": 0.82018232, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84353203, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.90625, + "step": 562, + "time_per_iteration": 2.503990888595581 + }, + { + "auxiliary_loss_clip": 0.01269896, + "auxiliary_loss_mlp": 0.01075498, + "balance_loss_clip": 1.04352641, + "balance_loss_mlp": 1.07592142, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.966221896086353, + "language_loss": 0.84028983, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86374378, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9375, + "step": 563, + "time_per_iteration": 2.464571952819824 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.04866886, + "balance_loss_mlp": 1.07648492, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.359913311978066, + "language_loss": 0.94194812, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96543193, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.921875, + "step": 564, + "time_per_iteration": 2.423762798309326 + }, + { + "auxiliary_loss_clip": 0.01267204, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.07225537, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.7666153248687277, + "language_loss": 0.94089758, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96426964, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.953125, + "step": 565, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01714468, + "balance_loss_mlp": 1.04934859, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1198796781785882, + "language_loss": 0.54823005, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.569884, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.91015625, + "step": 566, + "time_per_iteration": 3.1322038173675537 + }, + { + "auxiliary_loss_clip": 0.01270043, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.0753262, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.6603630269915683, + "language_loss": 0.76780868, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79123116, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.9453125, + "step": 567, + "time_per_iteration": 2.5351951122283936 + }, + { + "auxiliary_loss_clip": 0.01261299, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.04809463, + "balance_loss_mlp": 1.07400167, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 4.563520524929296, + "language_loss": 0.80796623, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83135819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.875, + "step": 568, + "time_per_iteration": 2.558093309402466 + }, + { + "auxiliary_loss_clip": 0.01263323, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.04836476, + "balance_loss_mlp": 1.07628214, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.809578126153619, + "language_loss": 0.86777622, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89120281, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.875, + "step": 569, + "time_per_iteration": 2.500319719314575 + }, + { + "auxiliary_loss_clip": 0.01264002, + "auxiliary_loss_mlp": 0.01073079, + "balance_loss_clip": 1.04227519, + "balance_loss_mlp": 1.07425416, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8164454228173497, + "language_loss": 0.95802778, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98139858, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.8984375, + "step": 570, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.01264689, + "auxiliary_loss_mlp": 0.01080759, + "balance_loss_clip": 1.04733253, + "balance_loss_mlp": 1.07053721, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.217921822086313, + "language_loss": 0.79522127, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81867576, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9375, + "step": 571, + "time_per_iteration": 2.48317813873291 + }, + { + "auxiliary_loss_clip": 0.01265335, + "auxiliary_loss_mlp": 0.01076969, + "balance_loss_clip": 1.04490221, + "balance_loss_mlp": 1.07593679, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.3471183659940555, + "language_loss": 0.79962778, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.890625, + "step": 572, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.01270326, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.07574439, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.9544136074887903, + "language_loss": 0.84374899, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86714697, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.9453125, + "step": 573, + "time_per_iteration": 2.474212408065796 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.03460276, + "balance_loss_mlp": 1.07282329, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.553507560277694, + "language_loss": 0.76376265, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78707206, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 574, + "time_per_iteration": 2.4510116577148438 + }, + { + "auxiliary_loss_clip": 0.01264596, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.0731982, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5278817664157343, + "language_loss": 0.83801597, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86130619, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.9140625, + "step": 575, + "time_per_iteration": 2.459693193435669 + }, + { + "auxiliary_loss_clip": 0.01260171, + "auxiliary_loss_mlp": 0.01067742, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.07501364, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.241383472398266, + "language_loss": 0.83726245, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86054158, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 576, + "time_per_iteration": 2.47292423248291 + }, + { + "auxiliary_loss_clip": 0.01267718, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.04582155, + "balance_loss_mlp": 1.08247435, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.0876645490308334, + "language_loss": 0.8640908, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 577, + "time_per_iteration": 2.529500961303711 + }, + { + "auxiliary_loss_clip": 0.01262371, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0769875, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 3.2017547958107784, + "language_loss": 0.72333407, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74665576, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.859375, + "step": 578, + "time_per_iteration": 2.4868762493133545 + }, + { + "auxiliary_loss_clip": 0.01263036, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.04050565, + "balance_loss_mlp": 1.07441878, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.8544904120227406, + "language_loss": 0.77664137, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79998243, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.8828125, + "step": 579, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.07355189, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.5351053977844136, + "language_loss": 0.86927247, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89265645, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.875, + "step": 580, + "time_per_iteration": 2.505908966064453 + }, + { + "auxiliary_loss_clip": 0.01266331, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.03536677, + "balance_loss_mlp": 1.07510614, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 4.565959491833327, + "language_loss": 0.82161844, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84492135, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.9140625, + "step": 581, + "time_per_iteration": 2.4735610485076904 + }, + { + "auxiliary_loss_clip": 0.01263493, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.07712197, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2699668532214377, + "language_loss": 0.77498174, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79828823, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8671875, + "step": 582, + "time_per_iteration": 2.4596173763275146 + }, + { + "auxiliary_loss_clip": 0.01261728, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_clip": 1.04467332, + "balance_loss_mlp": 1.07715631, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.0991939318744692, + "language_loss": 0.87632537, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89969933, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 583, + "time_per_iteration": 2.46062970161438 + }, + { + "auxiliary_loss_clip": 0.01268555, + "auxiliary_loss_mlp": 0.01082553, + "balance_loss_clip": 1.05167794, + "balance_loss_mlp": 1.07587278, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3581841085942004, + "language_loss": 0.80997103, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83348215, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.921875, + "step": 584, + "time_per_iteration": 2.4776926040649414 + }, + { + "auxiliary_loss_clip": 0.01262257, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.03326654, + "balance_loss_mlp": 1.0725317, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.6245680316153743, + "language_loss": 0.92654932, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94980395, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.8984375, + "step": 585, + "time_per_iteration": 2.486678123474121 + }, + { + "auxiliary_loss_clip": 0.01262479, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.07368612, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4855014647160245, + "language_loss": 0.87484592, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89817297, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.890625, + "step": 586, + "time_per_iteration": 2.457772970199585 + }, + { + "auxiliary_loss_clip": 0.01269677, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04137754, + "balance_loss_mlp": 1.07875896, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.7854143394247532, + "language_loss": 0.76574278, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78915149, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.90625, + "step": 587, + "time_per_iteration": 2.4794015884399414 + }, + { + "auxiliary_loss_clip": 0.01269924, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02991772, + "balance_loss_mlp": 1.07701528, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6805414217886456, + "language_loss": 0.78441286, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80772316, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.9296875, + "step": 588, + "time_per_iteration": 2.4755733013153076 + }, + { + "auxiliary_loss_clip": 0.01267146, + "auxiliary_loss_mlp": 0.01071411, + "balance_loss_clip": 1.03850961, + "balance_loss_mlp": 1.07600832, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6477303031273185, + "language_loss": 0.94003904, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96342462, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9140625, + "step": 589, + "time_per_iteration": 2.515296459197998 + }, + { + "auxiliary_loss_clip": 0.01269747, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.07632184, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.4870139863099157, + "language_loss": 0.84060037, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86397475, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 590, + "time_per_iteration": 2.583080291748047 + }, + { + "auxiliary_loss_clip": 0.01259593, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.07476449, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.031404841890899, + "language_loss": 0.86889851, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89212072, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 591, + "time_per_iteration": 2.497912883758545 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01070221, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.07271862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 3.1144902928375586, + "language_loss": 0.82980722, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85315537, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.921875, + "step": 592, + "time_per_iteration": 2.463977813720703 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.03881407, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8806680605255408, + "language_loss": 0.59741807, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61892909, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.8984375, + "step": 593, + "time_per_iteration": 3.1275696754455566 + }, + { + "auxiliary_loss_clip": 0.01262803, + "auxiliary_loss_mlp": 0.01070928, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.07810974, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8853114596204945, + "language_loss": 0.87042278, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89376009, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 594, + "time_per_iteration": 2.522805690765381 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.07309461, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.3431313884364395, + "language_loss": 0.83481348, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85809088, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8984375, + "step": 595, + "time_per_iteration": 2.565220832824707 + }, + { + "auxiliary_loss_clip": 0.01261367, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.04216576, + "balance_loss_mlp": 1.07610273, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.1278930526147426, + "language_loss": 0.96185803, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98519421, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.859375, + "step": 596, + "time_per_iteration": 2.460515260696411 + }, + { + "auxiliary_loss_clip": 0.0126361, + "auxiliary_loss_mlp": 0.0107037, + "balance_loss_clip": 1.04185498, + "balance_loss_mlp": 1.07627654, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.2167421176017204, + "language_loss": 0.82718551, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85052526, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.875, + "step": 597, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.01261023, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.0784421, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.168981908539252, + "language_loss": 0.81386817, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.828125, + "step": 598, + "time_per_iteration": 2.531188726425171 + }, + { + "auxiliary_loss_clip": 0.01254264, + "auxiliary_loss_mlp": 0.0106961, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.07570839, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.9075541218278638, + "language_loss": 0.81387949, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83711827, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.7890625, + "step": 599, + "time_per_iteration": 2.511871576309204 + }, + { + "auxiliary_loss_clip": 0.01262476, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.07350755, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1528215266255604, + "language_loss": 0.86115932, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88452661, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.890625, + "step": 600, + "time_per_iteration": 2.50054669380188 + }, + { + "auxiliary_loss_clip": 0.01254617, + "auxiliary_loss_mlp": 0.01080731, + "balance_loss_clip": 1.05133438, + "balance_loss_mlp": 1.06909621, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 3.928737875146519, + "language_loss": 0.82175761, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84511113, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8515625, + "step": 601, + "time_per_iteration": 6.795202255249023 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.02666831, + "balance_loss_mlp": 1.07096183, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.2629653513719252, + "language_loss": 0.75467926, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77777481, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8203125, + "step": 602, + "time_per_iteration": 2.503629446029663 + }, + { + "auxiliary_loss_clip": 0.01252806, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.02833962, + "balance_loss_mlp": 1.07078326, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9962737747137984, + "language_loss": 0.80078572, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82388449, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 603, + "time_per_iteration": 2.568368911743164 + }, + { + "auxiliary_loss_clip": 0.01258325, + "auxiliary_loss_mlp": 0.01061531, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.07597041, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.9836566776981934, + "language_loss": 0.86801207, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8203125, + "step": 604, + "time_per_iteration": 2.496415376663208 + }, + { + "auxiliary_loss_clip": 0.01260423, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.07688427, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.252638522711271, + "language_loss": 0.81078291, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83404416, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 605, + "time_per_iteration": 2.46071457862854 + }, + { + "auxiliary_loss_clip": 0.01255946, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.04012406, + "balance_loss_mlp": 1.07317901, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2162807408147964, + "language_loss": 0.85624671, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87947738, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.828125, + "step": 606, + "time_per_iteration": 2.450775623321533 + }, + { + "auxiliary_loss_clip": 0.01262483, + "auxiliary_loss_mlp": 0.01072166, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.07551849, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1498788116147125, + "language_loss": 0.82370651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84705305, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 607, + "time_per_iteration": 2.4969747066497803 + }, + { + "auxiliary_loss_clip": 0.01255757, + "auxiliary_loss_mlp": 0.01063348, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.07488835, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 3.329641026295442, + "language_loss": 0.8315016, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8546927, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.8046875, + "step": 608, + "time_per_iteration": 2.4648640155792236 + }, + { + "auxiliary_loss_clip": 0.01260127, + "auxiliary_loss_mlp": 0.0106578, + "balance_loss_clip": 1.03533435, + "balance_loss_mlp": 1.0769459, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.072924568315734, + "language_loss": 0.82258713, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84584618, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.828125, + "step": 609, + "time_per_iteration": 2.4761714935302734 + }, + { + "auxiliary_loss_clip": 0.01266536, + "auxiliary_loss_mlp": 0.01080333, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.08229148, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.279075715646142, + "language_loss": 0.7924515, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81592017, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.84375, + "step": 610, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01265179, + "auxiliary_loss_mlp": 0.01076881, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.07819688, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.108980449215705, + "language_loss": 0.87263799, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89605856, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 611, + "time_per_iteration": 2.488800525665283 + }, + { + "auxiliary_loss_clip": 0.01257304, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.05577183, + "balance_loss_mlp": 1.0769043, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.0539399448943145, + "language_loss": 0.72783852, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75125557, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8046875, + "step": 612, + "time_per_iteration": 2.4950740337371826 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.03999329, + "balance_loss_mlp": 1.07377708, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 2.903841869182041, + "language_loss": 0.7909385, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81421661, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 613, + "time_per_iteration": 2.4849369525909424 + }, + { + "auxiliary_loss_clip": 0.01253943, + "auxiliary_loss_mlp": 0.01079095, + "balance_loss_clip": 1.05141413, + "balance_loss_mlp": 1.07326341, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.273957434397869, + "language_loss": 0.93266213, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95599246, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8125, + "step": 614, + "time_per_iteration": 2.4639992713928223 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.01075313, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.07938302, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.901964177226116, + "language_loss": 0.72534943, + "learning_rate": 3.999489768826041e-06, + "loss": 0.74873829, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.84375, + "step": 615, + "time_per_iteration": 2.601372480392456 + }, + { + "auxiliary_loss_clip": 0.01258092, + "auxiliary_loss_mlp": 0.01071353, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.07278967, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.023635364571096, + "language_loss": 0.81449711, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83779156, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 616, + "time_per_iteration": 2.5325467586517334 + }, + { + "auxiliary_loss_clip": 0.01256707, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.03643894, + "balance_loss_mlp": 1.07431316, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9753277492127743, + "language_loss": 0.67868775, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7018863, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.828125, + "step": 617, + "time_per_iteration": 2.5784177780151367 + }, + { + "auxiliary_loss_clip": 0.01263095, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0349381, + "balance_loss_mlp": 1.07892454, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.556814357499394, + "language_loss": 0.80340034, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8266772, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.84375, + "step": 618, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01261829, + "auxiliary_loss_mlp": 0.01070874, + "balance_loss_clip": 1.04059458, + "balance_loss_mlp": 1.07458091, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.355648226269084, + "language_loss": 0.91115171, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93447876, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.875, + "step": 619, + "time_per_iteration": 2.4804162979125977 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01077134, + "balance_loss_clip": 1.04871452, + "balance_loss_mlp": 1.07845378, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.218621959424752, + "language_loss": 0.94397002, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96734041, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8125, + "step": 620, + "time_per_iteration": 2.4592232704162598 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.01077616, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.07455909, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.8159025601621845, + "language_loss": 0.77105826, + "learning_rate": 3.999435623772008e-06, + "loss": 0.7944091, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 621, + "time_per_iteration": 2.53365159034729 + }, + { + "auxiliary_loss_clip": 0.01255819, + "auxiliary_loss_mlp": 0.01059811, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.07761526, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.793013868715132, + "language_loss": 0.86895752, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89211386, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 622, + "time_per_iteration": 2.472726583480835 + }, + { + "auxiliary_loss_clip": 0.01258428, + "auxiliary_loss_mlp": 0.01064577, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.07622766, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.261361439009279, + "language_loss": 0.90376818, + "learning_rate": 3.999416968866581e-06, + "loss": 0.9269982, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 623, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.0125978, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.04626298, + "balance_loss_mlp": 1.07841158, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9910669563462169, + "language_loss": 0.84149444, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86484373, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8125, + "step": 624, + "time_per_iteration": 2.4514520168304443 + }, + { + "auxiliary_loss_clip": 0.01261437, + "auxiliary_loss_mlp": 0.01067743, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.0750618, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.4867963928692554, + "language_loss": 0.66228586, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68557763, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8671875, + "step": 625, + "time_per_iteration": 2.5765273571014404 + }, + { + "auxiliary_loss_clip": 0.01253583, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.03697979, + "balance_loss_mlp": 1.07435441, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.071255255654034, + "language_loss": 0.77375329, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79696059, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7890625, + "step": 626, + "time_per_iteration": 2.5022406578063965 + }, + { + "auxiliary_loss_clip": 0.01258684, + "auxiliary_loss_mlp": 0.01074389, + "balance_loss_clip": 1.04499173, + "balance_loss_mlp": 1.07735705, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.2077512286027288, + "language_loss": 0.81357861, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83690929, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 627, + "time_per_iteration": 2.4750607013702393 + }, + { + "auxiliary_loss_clip": 0.01261632, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.07859111, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 3.546199216596373, + "language_loss": 0.88572276, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90910852, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 628, + "time_per_iteration": 2.571899890899658 + }, + { + "auxiliary_loss_clip": 0.01253553, + "auxiliary_loss_mlp": 0.01067038, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.07086658, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.488861546346732, + "language_loss": 0.79683006, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82003593, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.828125, + "step": 629, + "time_per_iteration": 2.486675262451172 + }, + { + "auxiliary_loss_clip": 0.01258011, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.07458425, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7117761504495859, + "language_loss": 0.76808703, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79134536, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.8359375, + "step": 630, + "time_per_iteration": 2.494297742843628 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.01070638, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.07651484, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.6765452133705403, + "language_loss": 0.91492796, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93826187, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.859375, + "step": 631, + "time_per_iteration": 2.4605348110198975 + }, + { + "auxiliary_loss_clip": 0.01252436, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.04560196, + "balance_loss_mlp": 1.07244229, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.669704350294595, + "language_loss": 0.9207651, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94405663, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.796875, + "step": 632, + "time_per_iteration": 2.518498659133911 + }, + { + "auxiliary_loss_clip": 0.01255106, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.07462335, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0828864645498872, + "language_loss": 0.8341018, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85723758, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8046875, + "step": 633, + "time_per_iteration": 2.5217537879943848 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.01071025, + "balance_loss_clip": 1.04153264, + "balance_loss_mlp": 1.07408428, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6987522649376106, + "language_loss": 0.69638437, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71967685, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.84375, + "step": 634, + "time_per_iteration": 2.5694239139556885 + }, + { + "auxiliary_loss_clip": 0.01127675, + "auxiliary_loss_mlp": 0.01017483, + "balance_loss_clip": 1.0106163, + "balance_loss_mlp": 1.04225707, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8852243261294688, + "language_loss": 0.61585373, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63730532, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.8515625, + "step": 635, + "time_per_iteration": 3.1059212684631348 + }, + { + "auxiliary_loss_clip": 0.01253433, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07354546, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2313569204055246, + "language_loss": 0.83721048, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86043108, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.796875, + "step": 636, + "time_per_iteration": 2.4975383281707764 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.04852867, + "balance_loss_mlp": 1.07623935, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.4018992949787847, + "language_loss": 0.79327047, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8166306, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8203125, + "step": 637, + "time_per_iteration": 2.4560744762420654 + }, + { + "auxiliary_loss_clip": 0.01258084, + "auxiliary_loss_mlp": 0.01073075, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.07309079, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.8779285506389924, + "language_loss": 0.8410306, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86434221, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 638, + "time_per_iteration": 2.504343271255493 + }, + { + "auxiliary_loss_clip": 0.01263348, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.07495832, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5416523890288976, + "language_loss": 0.70099992, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72431237, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.890625, + "step": 639, + "time_per_iteration": 2.52817964553833 + }, + { + "auxiliary_loss_clip": 0.01259266, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.04463232, + "balance_loss_mlp": 1.07514286, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.42201861797838, + "language_loss": 0.85030365, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8736524, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 640, + "time_per_iteration": 2.503262758255005 + }, + { + "auxiliary_loss_clip": 0.0126167, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04725742, + "balance_loss_mlp": 1.07574821, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3722848939528953, + "language_loss": 0.82117289, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84458065, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.859375, + "step": 641, + "time_per_iteration": 2.51052188873291 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.01008303, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.03414774, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9008353353488252, + "language_loss": 0.6540072, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67528021, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.06225586, + "router_z_loss_mlp": 0.8515625, + "step": 642, + "time_per_iteration": 4.430839538574219 + }, + { + "auxiliary_loss_clip": 0.01256856, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.07364345, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9870813050305103, + "language_loss": 0.79512584, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81832051, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8359375, + "step": 643, + "time_per_iteration": 5.386199951171875 + }, + { + "auxiliary_loss_clip": 0.01255871, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.07266629, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.074949815918338, + "language_loss": 0.82926929, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85257208, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.828125, + "step": 644, + "time_per_iteration": 2.45499587059021 + }, + { + "auxiliary_loss_clip": 0.01260265, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.07482159, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.258008571643512, + "language_loss": 0.82131916, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84458399, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.859375, + "step": 645, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.0126099, + "auxiliary_loss_mlp": 0.01070847, + "balance_loss_clip": 1.04121125, + "balance_loss_mlp": 1.07544899, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4729923618605554, + "language_loss": 0.82006776, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 646, + "time_per_iteration": 2.4771342277526855 + }, + { + "auxiliary_loss_clip": 0.01260575, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.07928514, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8327945326632593, + "language_loss": 0.81973422, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84314579, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 647, + "time_per_iteration": 2.522347927093506 + }, + { + "auxiliary_loss_clip": 0.01260388, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03965366, + "balance_loss_mlp": 1.07776546, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9222642653000834, + "language_loss": 0.84699827, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87029266, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 648, + "time_per_iteration": 2.561929941177368 + }, + { + "auxiliary_loss_clip": 0.01258218, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.07636404, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7283662397985053, + "language_loss": 0.84446943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86776626, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8203125, + "step": 649, + "time_per_iteration": 2.477027416229248 + }, + { + "auxiliary_loss_clip": 0.01259496, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.03977561, + "balance_loss_mlp": 1.07551885, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8508721849532739, + "language_loss": 0.79670662, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8200019, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.84375, + "step": 650, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.0125375, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_clip": 1.04314423, + "balance_loss_mlp": 1.07259929, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.708739352564946, + "language_loss": 0.78509629, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80836356, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 651, + "time_per_iteration": 2.4757516384124756 + }, + { + "auxiliary_loss_clip": 0.01255418, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_clip": 1.05004883, + "balance_loss_mlp": 1.0719974, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.7896665115169244, + "language_loss": 0.88031149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90369117, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 652, + "time_per_iteration": 2.4425668716430664 + }, + { + "auxiliary_loss_clip": 0.01249027, + "auxiliary_loss_mlp": 0.01069663, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.07108784, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.185528651545475, + "language_loss": 0.79044777, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363463, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.78125, + "step": 653, + "time_per_iteration": 2.5651934146881104 + }, + { + "auxiliary_loss_clip": 0.01264568, + "auxiliary_loss_mlp": 0.01070462, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.07603264, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.207303268368246, + "language_loss": 0.86304128, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88639158, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8828125, + "step": 654, + "time_per_iteration": 2.533297061920166 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01012751, + "balance_loss_clip": 1.00710094, + "balance_loss_mlp": 1.03246427, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7205066186016396, + "language_loss": 0.49900642, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5202843, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.82421875, + "step": 655, + "time_per_iteration": 3.1399919986724854 + }, + { + "auxiliary_loss_clip": 0.01251012, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.03866971, + "balance_loss_mlp": 1.07330465, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.4228021909793918, + "language_loss": 0.80845964, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83163846, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.78125, + "step": 656, + "time_per_iteration": 2.5063297748565674 + }, + { + "auxiliary_loss_clip": 0.01264211, + "auxiliary_loss_mlp": 0.0109165, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.07672703, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.8602268717749526, + "language_loss": 0.76602596, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78958458, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.875, + "step": 657, + "time_per_iteration": 2.4405555725097656 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01071967, + "balance_loss_clip": 1.04192615, + "balance_loss_mlp": 1.07452726, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.1526815744488945, + "language_loss": 0.81690443, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84020746, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.84375, + "step": 658, + "time_per_iteration": 2.5383949279785156 + }, + { + "auxiliary_loss_clip": 0.01252051, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.04091132, + "balance_loss_mlp": 1.07283425, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2075021313123777, + "language_loss": 0.91331315, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93656039, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.796875, + "step": 659, + "time_per_iteration": 2.4678854942321777 + }, + { + "auxiliary_loss_clip": 0.01259034, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.07427669, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.5412719342676215, + "language_loss": 0.79241848, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81567293, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 660, + "time_per_iteration": 2.5135834217071533 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.07534087, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6909533460123631, + "language_loss": 0.81942898, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84269351, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.796875, + "step": 661, + "time_per_iteration": 2.513702154159546 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01010967, + "balance_loss_clip": 1.00519753, + "balance_loss_mlp": 1.03039932, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9113020435813882, + "language_loss": 0.69376045, + "learning_rate": 3.998992585439272e-06, + "loss": 0.7149995, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.82421875, + "step": 662, + "time_per_iteration": 3.2435107231140137 + }, + { + "auxiliary_loss_clip": 0.01260063, + "auxiliary_loss_mlp": 0.01071537, + "balance_loss_clip": 1.04113865, + "balance_loss_mlp": 1.0779382, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.025040011333182, + "language_loss": 0.83253002, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85584599, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.8125, + "step": 663, + "time_per_iteration": 2.5213887691497803 + }, + { + "auxiliary_loss_clip": 0.01261822, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.032125, + "balance_loss_mlp": 1.07768416, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.8595031628608143, + "language_loss": 0.87538105, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89862621, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.84375, + "step": 664, + "time_per_iteration": 2.516810655593872 + }, + { + "auxiliary_loss_clip": 0.0125116, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.07347679, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.3519362819230625, + "language_loss": 0.84738994, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87050784, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.7734375, + "step": 665, + "time_per_iteration": 2.4348978996276855 + }, + { + "auxiliary_loss_clip": 0.01263346, + "auxiliary_loss_mlp": 0.01087391, + "balance_loss_clip": 1.05525231, + "balance_loss_mlp": 1.07680821, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.1279588772882687, + "language_loss": 0.81491798, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83842534, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.8671875, + "step": 666, + "time_per_iteration": 2.564187526702881 + }, + { + "auxiliary_loss_clip": 0.01252779, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.07225358, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.9939634291419526, + "language_loss": 0.87121451, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89449108, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.8046875, + "step": 667, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.07692444, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.627098567014159, + "language_loss": 0.80619991, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82938576, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7578125, + "step": 668, + "time_per_iteration": 2.441667079925537 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01011229, + "balance_loss_clip": 1.0053643, + "balance_loss_mlp": 1.02968836, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7872457900726799, + "language_loss": 0.60042131, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62164247, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.8125, + "step": 669, + "time_per_iteration": 3.200874090194702 + }, + { + "auxiliary_loss_clip": 0.01253738, + "auxiliary_loss_mlp": 0.0107276, + "balance_loss_clip": 1.0431962, + "balance_loss_mlp": 1.07228541, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7415828974469272, + "language_loss": 0.86405391, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88731897, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 670, + "time_per_iteration": 2.5169434547424316 + }, + { + "auxiliary_loss_clip": 0.0124964, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.0414381, + "balance_loss_mlp": 1.07305872, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.9261739939324196, + "language_loss": 0.752123, + "learning_rate": 3.998878276622692e-06, + "loss": 0.7753256, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.765625, + "step": 671, + "time_per_iteration": 2.514566421508789 + }, + { + "auxiliary_loss_clip": 0.01259516, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.04472136, + "balance_loss_mlp": 1.0774349, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0846907245314688, + "language_loss": 0.92279977, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94614637, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8203125, + "step": 672, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01253491, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_clip": 1.03921115, + "balance_loss_mlp": 1.07329202, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.816355722874097, + "language_loss": 0.90220857, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92545515, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.796875, + "step": 673, + "time_per_iteration": 2.450547456741333 + }, + { + "auxiliary_loss_clip": 0.01249229, + "auxiliary_loss_mlp": 0.01077482, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.07150948, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.117589951798075, + "language_loss": 0.74881005, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77207714, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.78125, + "step": 674, + "time_per_iteration": 2.5444436073303223 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01061202, + "balance_loss_clip": 1.03036261, + "balance_loss_mlp": 1.07609737, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.2422867770418797, + "language_loss": 0.78305578, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80627763, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 675, + "time_per_iteration": 2.4525954723358154 + }, + { + "auxiliary_loss_clip": 0.01252319, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.04578447, + "balance_loss_mlp": 1.07254028, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7072695919905723, + "language_loss": 0.76650077, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78981006, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.796875, + "step": 676, + "time_per_iteration": 2.530043840408325 + }, + { + "auxiliary_loss_clip": 0.01258388, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.0750767, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.3168648577819138, + "language_loss": 0.85182011, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87516803, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.828125, + "step": 677, + "time_per_iteration": 2.4390082359313965 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.03804517, + "balance_loss_mlp": 1.071486, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.7808730288109123, + "language_loss": 0.76348364, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78666306, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.78125, + "step": 678, + "time_per_iteration": 2.5151596069335938 + }, + { + "auxiliary_loss_clip": 0.01250603, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_clip": 1.03807509, + "balance_loss_mlp": 1.07162285, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9938089142752387, + "language_loss": 0.82114184, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84431279, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7890625, + "step": 679, + "time_per_iteration": 2.5701568126678467 + }, + { + "auxiliary_loss_clip": 0.01255726, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.07693028, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.893911305727382, + "language_loss": 0.76349533, + "learning_rate": 3.998757828196835e-06, + "loss": 0.7866298, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7890625, + "step": 680, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01255007, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.03305268, + "balance_loss_mlp": 1.07167506, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.7999776318515568, + "language_loss": 0.83315849, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.8563633, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 681, + "time_per_iteration": 2.5313305854797363 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.03302324, + "balance_loss_mlp": 1.07082057, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.6690976928218293, + "language_loss": 0.71312869, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73630697, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.828125, + "step": 682, + "time_per_iteration": 2.5190017223358154 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.07090235, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7744847161326498, + "language_loss": 0.72373003, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74692667, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8046875, + "step": 683, + "time_per_iteration": 2.473156690597534 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.01075324, + "balance_loss_clip": 1.04540253, + "balance_loss_mlp": 1.07707, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.316908811268422, + "language_loss": 0.81263745, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83589774, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 684, + "time_per_iteration": 5.34027099609375 + }, + { + "auxiliary_loss_clip": 0.01251905, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.07572865, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5327144156887007, + "language_loss": 0.90501672, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92825842, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.765625, + "step": 685, + "time_per_iteration": 3.918776750564575 + }, + { + "auxiliary_loss_clip": 0.01253389, + "auxiliary_loss_mlp": 0.010703, + "balance_loss_clip": 1.04002118, + "balance_loss_mlp": 1.07458997, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.0402082016953234, + "language_loss": 0.87871253, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90194941, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.7890625, + "step": 686, + "time_per_iteration": 2.481177806854248 + }, + { + "auxiliary_loss_clip": 0.01258153, + "auxiliary_loss_mlp": 0.01071669, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.07474661, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.7716861202834375, + "language_loss": 0.71645427, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73975253, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8359375, + "step": 687, + "time_per_iteration": 2.4720261096954346 + }, + { + "auxiliary_loss_clip": 0.01252382, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.04262209, + "balance_loss_mlp": 1.07918715, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.117746024922212, + "language_loss": 0.8642537, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88748431, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.734375, + "step": 688, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01249454, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07534754, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.6487514234328304, + "language_loss": 0.83326006, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85658503, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7421875, + "step": 689, + "time_per_iteration": 2.4689462184906006 + }, + { + "auxiliary_loss_clip": 0.01248134, + "auxiliary_loss_mlp": 0.01077255, + "balance_loss_clip": 1.04847789, + "balance_loss_mlp": 1.07176828, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.7821885346326607, + "language_loss": 0.68391848, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70717239, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.765625, + "step": 690, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.012458, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.04197323, + "balance_loss_mlp": 1.07094526, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.747700039366933, + "language_loss": 0.74933273, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77250373, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 691, + "time_per_iteration": 2.4566729068756104 + }, + { + "auxiliary_loss_clip": 0.01246178, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04890203, + "balance_loss_mlp": 1.07268727, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.450885846250815, + "language_loss": 0.84518701, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86843991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.734375, + "step": 692, + "time_per_iteration": 2.4667932987213135 + }, + { + "auxiliary_loss_clip": 0.01252043, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.07099986, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 9.166238009589804, + "language_loss": 0.89107299, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9143213, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.8125, + "step": 693, + "time_per_iteration": 2.4823052883148193 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.03637171, + "balance_loss_mlp": 1.07755136, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.1462970179067646, + "language_loss": 0.82179356, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84500182, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 694, + "time_per_iteration": 2.564098834991455 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.07214785, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.057768586122239, + "language_loss": 0.83656573, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85977334, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 695, + "time_per_iteration": 2.5122969150543213 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01073319, + "balance_loss_clip": 1.04270577, + "balance_loss_mlp": 1.07313716, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.138642052855673, + "language_loss": 0.8441087, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86734056, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.765625, + "step": 696, + "time_per_iteration": 2.462756872177124 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.01073791, + "balance_loss_clip": 1.04253471, + "balance_loss_mlp": 1.07146811, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.042298821772003, + "language_loss": 0.93134123, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95455778, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.765625, + "step": 697, + "time_per_iteration": 2.5189502239227295 + }, + { + "auxiliary_loss_clip": 0.0124398, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.07146859, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.2837511795811207, + "language_loss": 0.83989406, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86302388, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.71875, + "step": 698, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.01247569, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.05551505, + "balance_loss_mlp": 1.0711751, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9405760650289445, + "language_loss": 0.91369909, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93704206, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.7578125, + "step": 699, + "time_per_iteration": 2.4667766094207764 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.01842487, + "balance_loss_mlp": 1.03384757, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8964375713204716, + "language_loss": 0.67850006, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69987792, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.796875, + "step": 700, + "time_per_iteration": 3.1214911937713623 + }, + { + "auxiliary_loss_clip": 0.01254452, + "auxiliary_loss_mlp": 0.01078478, + "balance_loss_clip": 1.04695964, + "balance_loss_mlp": 1.07502532, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.6789371965697524, + "language_loss": 0.89020562, + "learning_rate": 3.998452907725016e-06, + "loss": 0.913535, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 701, + "time_per_iteration": 2.46085524559021 + }, + { + "auxiliary_loss_clip": 0.01250018, + "auxiliary_loss_mlp": 0.0107128, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.07681179, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2592774096130794, + "language_loss": 0.67494118, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69815421, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 702, + "time_per_iteration": 2.5170979499816895 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01006834, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.03296542, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8426087453226233, + "language_loss": 0.60777819, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62897617, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.80078125, + "step": 703, + "time_per_iteration": 3.155794143676758 + }, + { + "auxiliary_loss_clip": 0.01112196, + "auxiliary_loss_mlp": 0.01010352, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 1.03251982, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167549333074237, + "language_loss": 0.5776214, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.796875, + "step": 704, + "time_per_iteration": 2.95633602142334 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01070665, + "balance_loss_clip": 1.0397656, + "balance_loss_mlp": 1.07432342, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1970745802550624, + "language_loss": 0.87708455, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90031266, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 705, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.01238458, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.06876624, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7054575923778923, + "language_loss": 0.71612352, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73913229, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 706, + "time_per_iteration": 2.464270830154419 + }, + { + "auxiliary_loss_clip": 0.01243119, + "auxiliary_loss_mlp": 0.01068207, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.07029784, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0927829932503714, + "language_loss": 0.93480223, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95791554, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 707, + "time_per_iteration": 2.5087966918945312 + }, + { + "auxiliary_loss_clip": 0.01245928, + "auxiliary_loss_mlp": 0.01065311, + "balance_loss_clip": 1.03441203, + "balance_loss_mlp": 1.0676806, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.3244890877745883, + "language_loss": 0.81275034, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83586276, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 708, + "time_per_iteration": 2.557119607925415 + }, + { + "auxiliary_loss_clip": 0.01251091, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_clip": 1.04239082, + "balance_loss_mlp": 1.07195199, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.2553269788690224, + "language_loss": 0.82229173, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84553528, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.796875, + "step": 709, + "time_per_iteration": 2.4828600883483887 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.01064315, + "balance_loss_clip": 1.03389335, + "balance_loss_mlp": 1.07517564, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.534138916450152, + "language_loss": 0.85063422, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87383747, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8125, + "step": 710, + "time_per_iteration": 2.453641653060913 + }, + { + "auxiliary_loss_clip": 0.01254724, + "auxiliary_loss_mlp": 0.01070713, + "balance_loss_clip": 1.04114938, + "balance_loss_mlp": 1.07757199, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.316207411440496, + "language_loss": 0.84996349, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87321782, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7734375, + "step": 711, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01246695, + "auxiliary_loss_mlp": 0.01069917, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.07044697, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 2.000925777751644, + "language_loss": 0.85439169, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87755781, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.765625, + "step": 712, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.0107294, + "balance_loss_clip": 1.0445205, + "balance_loss_mlp": 1.0701685, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.2453781921901728, + "language_loss": 0.90829903, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9315542, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8203125, + "step": 713, + "time_per_iteration": 2.4908998012542725 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01017546, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.0288384, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8777811618173876, + "language_loss": 0.63746506, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65872955, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.80078125, + "step": 714, + "time_per_iteration": 3.158921480178833 + }, + { + "auxiliary_loss_clip": 0.01249012, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_clip": 1.05076694, + "balance_loss_mlp": 1.07545531, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.1622955343434382, + "language_loss": 0.74528754, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76858354, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 715, + "time_per_iteration": 2.5759642124176025 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.07450986, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2108029839954213, + "language_loss": 0.72630137, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74957311, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7578125, + "step": 716, + "time_per_iteration": 2.5973668098449707 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.02687418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9052113850529176, + "language_loss": 0.65557301, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67669141, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.78125, + "step": 717, + "time_per_iteration": 3.114957571029663 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 0.99780369, + "balance_loss_mlp": 1.02667391, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9880116621267147, + "language_loss": 0.58762264, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60871184, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.78125, + "step": 718, + "time_per_iteration": 2.910278797149658 + }, + { + "auxiliary_loss_clip": 0.01248398, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.04314709, + "balance_loss_mlp": 1.0758605, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8513004644505335, + "language_loss": 0.91198725, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93521935, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7265625, + "step": 719, + "time_per_iteration": 2.492509126663208 + }, + { + "auxiliary_loss_clip": 0.01244347, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.04208493, + "balance_loss_mlp": 1.06931555, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.803377327315558, + "language_loss": 0.66468138, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68783891, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 720, + "time_per_iteration": 2.6061203479766846 + }, + { + "auxiliary_loss_clip": 0.01244682, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.04895782, + "balance_loss_mlp": 1.07152998, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.8832143461121282, + "language_loss": 0.77743989, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80068195, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 721, + "time_per_iteration": 2.5255632400512695 + }, + { + "auxiliary_loss_clip": 0.01251204, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.05879569, + "balance_loss_mlp": 1.07584524, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.027898330451403, + "language_loss": 0.87873065, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90212011, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.75, + "step": 722, + "time_per_iteration": 2.536283493041992 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.01075404, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.0758208, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 30.376200688873947, + "language_loss": 0.84770942, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87099999, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 723, + "time_per_iteration": 2.5167360305786133 + }, + { + "auxiliary_loss_clip": 0.01256754, + "auxiliary_loss_mlp": 0.01076494, + "balance_loss_clip": 1.04638171, + "balance_loss_mlp": 1.07828176, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.9203333396820472, + "language_loss": 0.82793808, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85127056, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.78125, + "step": 724, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.0125067, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_clip": 1.05975556, + "balance_loss_mlp": 1.07561088, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.8200683460759586, + "language_loss": 0.79530561, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81871551, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.75, + "step": 725, + "time_per_iteration": 2.4551918506622314 + }, + { + "auxiliary_loss_clip": 0.0126067, + "auxiliary_loss_mlp": 0.01076358, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.07715642, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.8106150104808485, + "language_loss": 0.87100697, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89437729, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.8359375, + "step": 726, + "time_per_iteration": 5.350574731826782 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01011263, + "balance_loss_clip": 1.00542223, + "balance_loss_mlp": 1.02866364, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9088619113669424, + "language_loss": 0.5587045, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57988632, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.78125, + "step": 727, + "time_per_iteration": 3.1539440155029297 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01076851, + "balance_loss_clip": 1.04676282, + "balance_loss_mlp": 1.07453549, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.397861957488019, + "language_loss": 0.82248902, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84576982, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.765625, + "step": 728, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01244631, + "auxiliary_loss_mlp": 0.01068516, + "balance_loss_clip": 1.0395956, + "balance_loss_mlp": 1.07265663, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2715062050859745, + "language_loss": 0.77187145, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79500294, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.71875, + "step": 729, + "time_per_iteration": 2.5091514587402344 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_clip": 1.04502177, + "balance_loss_mlp": 1.07452357, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.258754879989397, + "language_loss": 0.9515503, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97482038, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.78125, + "step": 730, + "time_per_iteration": 2.4795522689819336 + }, + { + "auxiliary_loss_clip": 0.0124716, + "auxiliary_loss_mlp": 0.01065838, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.07000017, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.2097226025839483, + "language_loss": 0.88016784, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90329784, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.7734375, + "step": 731, + "time_per_iteration": 2.4678709506988525 + }, + { + "auxiliary_loss_clip": 0.01251191, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.07521737, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.3707184473936587, + "language_loss": 0.88656235, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90980744, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7578125, + "step": 732, + "time_per_iteration": 2.4135851860046387 + }, + { + "auxiliary_loss_clip": 0.01251086, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.04288554, + "balance_loss_mlp": 1.07443762, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.131822645051773, + "language_loss": 0.86010063, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88334322, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.765625, + "step": 733, + "time_per_iteration": 2.491278648376465 + }, + { + "auxiliary_loss_clip": 0.01256254, + "auxiliary_loss_mlp": 0.01078649, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.07624841, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.0564057381838885, + "language_loss": 0.91515708, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93850613, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 734, + "time_per_iteration": 2.451258897781372 + }, + { + "auxiliary_loss_clip": 0.01247796, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.03696656, + "balance_loss_mlp": 1.07613921, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8863467898976456, + "language_loss": 0.77831066, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8014316, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.71875, + "step": 735, + "time_per_iteration": 2.558958053588867 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.03526342, + "balance_loss_mlp": 1.06886315, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.1337917025346074, + "language_loss": 0.88456166, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90760267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 736, + "time_per_iteration": 2.5100033283233643 + }, + { + "auxiliary_loss_clip": 0.01242163, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.07473993, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.954630170969084, + "language_loss": 0.84155536, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86464787, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 737, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01251899, + "auxiliary_loss_mlp": 0.01072468, + "balance_loss_clip": 1.04077065, + "balance_loss_mlp": 1.07667851, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0376910697928947, + "language_loss": 0.8518666, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87511027, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.75, + "step": 738, + "time_per_iteration": 2.5576610565185547 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04048622, + "balance_loss_mlp": 1.03298163, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8684121686227821, + "language_loss": 0.59110028, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61268163, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.7890625, + "step": 739, + "time_per_iteration": 3.0643718242645264 + }, + { + "auxiliary_loss_clip": 0.0124678, + "auxiliary_loss_mlp": 0.01070548, + "balance_loss_clip": 1.04220033, + "balance_loss_mlp": 1.07513726, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1860888775648695, + "language_loss": 0.91622591, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93939924, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.71875, + "step": 740, + "time_per_iteration": 2.5448389053344727 + }, + { + "auxiliary_loss_clip": 0.01252276, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.07766986, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.01644947055736, + "language_loss": 0.71842492, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.7421875, + "step": 741, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01240373, + "auxiliary_loss_mlp": 0.01073056, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.07044411, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.722907957661965, + "language_loss": 0.88555831, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9086926, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.703125, + "step": 742, + "time_per_iteration": 2.6367549896240234 + }, + { + "auxiliary_loss_clip": 0.0124233, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.07209873, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0306401320231693, + "language_loss": 0.83823264, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86127412, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.703125, + "step": 743, + "time_per_iteration": 2.516587972640991 + }, + { + "auxiliary_loss_clip": 0.01249271, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.07474804, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 3.0889105946672704, + "language_loss": 0.79948521, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8226651, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.75, + "step": 744, + "time_per_iteration": 2.44805645942688 + }, + { + "auxiliary_loss_clip": 0.01243449, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.04037201, + "balance_loss_mlp": 1.07279778, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.388036535067576, + "language_loss": 0.85400093, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87710881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.703125, + "step": 745, + "time_per_iteration": 2.4790430068969727 + }, + { + "auxiliary_loss_clip": 0.01242131, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.0714339, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.1215702602167688, + "language_loss": 0.6866799, + "learning_rate": 3.997686978575302e-06, + "loss": 0.70974648, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.7109375, + "step": 746, + "time_per_iteration": 2.5645759105682373 + }, + { + "auxiliary_loss_clip": 0.01250748, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.04748797, + "balance_loss_mlp": 1.0783143, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.1376273799467547, + "language_loss": 0.68823957, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71152306, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 747, + "time_per_iteration": 2.5267317295074463 + }, + { + "auxiliary_loss_clip": 0.01248685, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.07314527, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9669744064389407, + "language_loss": 0.66721869, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69050002, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.75, + "step": 748, + "time_per_iteration": 2.4818925857543945 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.01082391, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.07779491, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.650057046326624, + "language_loss": 0.76540357, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78872949, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.7265625, + "step": 749, + "time_per_iteration": 2.4454426765441895 + }, + { + "auxiliary_loss_clip": 0.01251335, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.0770005, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.0345099055640317, + "language_loss": 0.88970172, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91298997, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 750, + "time_per_iteration": 2.458716630935669 + }, + { + "auxiliary_loss_clip": 0.01247033, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.07139015, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.3716924268159367, + "language_loss": 0.74869245, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77190608, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.7578125, + "step": 751, + "time_per_iteration": 2.5231218338012695 + }, + { + "auxiliary_loss_clip": 0.01243504, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.04573071, + "balance_loss_mlp": 1.07175446, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.2224468826240975, + "language_loss": 0.69360238, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7167744, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 752, + "time_per_iteration": 2.4620048999786377 + }, + { + "auxiliary_loss_clip": 0.01244736, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.07154715, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.984649176219999, + "language_loss": 0.91634125, + "learning_rate": 3.997554045527305e-06, + "loss": 0.9393605, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.734375, + "step": 753, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01249124, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04728031, + "balance_loss_mlp": 1.07501864, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2056938633592975, + "language_loss": 0.91197902, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93522525, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.7421875, + "step": 754, + "time_per_iteration": 2.472975492477417 + }, + { + "auxiliary_loss_clip": 0.01238249, + "auxiliary_loss_mlp": 0.0107062, + "balance_loss_clip": 1.04144955, + "balance_loss_mlp": 1.07163191, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.234660546964849, + "language_loss": 0.78528345, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80837214, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.671875, + "step": 755, + "time_per_iteration": 2.4704174995422363 + }, + { + "auxiliary_loss_clip": 0.01248815, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.05100918, + "balance_loss_mlp": 1.07416105, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9667934561660614, + "language_loss": 0.78451371, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80779994, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.75, + "step": 756, + "time_per_iteration": 2.4873547554016113 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.010118, + "balance_loss_clip": 1.00600612, + "balance_loss_mlp": 1.03558636, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8118987787253854, + "language_loss": 0.62730747, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64860779, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.828125, + "step": 757, + "time_per_iteration": 3.1292941570281982 + }, + { + "auxiliary_loss_clip": 0.01242797, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_clip": 1.04220784, + "balance_loss_mlp": 1.0731318, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5194495460848947, + "language_loss": 0.84329176, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86641645, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 758, + "time_per_iteration": 2.498905658721924 + }, + { + "auxiliary_loss_clip": 0.01237511, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.06733441, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.0933163310434963, + "language_loss": 0.88315606, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90620202, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 759, + "time_per_iteration": 2.5122711658477783 + }, + { + "auxiliary_loss_clip": 0.01248241, + "auxiliary_loss_mlp": 0.01075804, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.075526, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.170817451496144, + "language_loss": 0.73644727, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75968778, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7265625, + "step": 760, + "time_per_iteration": 2.511322021484375 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.04630077, + "balance_loss_mlp": 1.07509935, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.192184725657734, + "language_loss": 0.82177126, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84495443, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6953125, + "step": 761, + "time_per_iteration": 2.4831535816192627 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.06961203, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7986428347309282, + "language_loss": 0.79732436, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82041955, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6953125, + "step": 762, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.03112733, + "balance_loss_mlp": 1.03455913, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.008821564963746, + "language_loss": 0.58659625, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60813344, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.82421875, + "step": 763, + "time_per_iteration": 3.1429429054260254 + }, + { + "auxiliary_loss_clip": 0.01245459, + "auxiliary_loss_mlp": 0.01081866, + "balance_loss_clip": 1.05381632, + "balance_loss_mlp": 1.07288039, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8717486924500517, + "language_loss": 0.87752867, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.9008019, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.7265625, + "step": 764, + "time_per_iteration": 2.4727554321289062 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01079864, + "balance_loss_clip": 1.05192137, + "balance_loss_mlp": 1.07565248, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9426139778845304, + "language_loss": 0.86118066, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88445938, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 765, + "time_per_iteration": 2.5370731353759766 + }, + { + "auxiliary_loss_clip": 0.01248646, + "auxiliary_loss_mlp": 0.01070241, + "balance_loss_clip": 1.03912735, + "balance_loss_mlp": 1.07336497, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.0624701923152453, + "language_loss": 0.87846982, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90165865, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.75, + "step": 766, + "time_per_iteration": 2.475677013397217 + }, + { + "auxiliary_loss_clip": 0.01239894, + "auxiliary_loss_mlp": 0.01067957, + "balance_loss_clip": 1.03979921, + "balance_loss_mlp": 1.06896472, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.616885530601855, + "language_loss": 0.84314167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86622024, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 767, + "time_per_iteration": 2.465449810028076 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.01060911, + "balance_loss_clip": 1.03249097, + "balance_loss_mlp": 1.07569289, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.056931367891973, + "language_loss": 0.87013769, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89320099, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.703125, + "step": 768, + "time_per_iteration": 5.441957235336304 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.01069073, + "balance_loss_clip": 1.04184508, + "balance_loss_mlp": 1.06768477, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.1199205591749033, + "language_loss": 0.75022334, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77329946, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.703125, + "step": 769, + "time_per_iteration": 2.5294675827026367 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.03734684, + "balance_loss_mlp": 1.07084632, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.886534334963383, + "language_loss": 0.86162585, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88464236, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.671875, + "step": 770, + "time_per_iteration": 2.4646449089050293 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0385015, + "balance_loss_mlp": 1.07160687, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.8625416592988477, + "language_loss": 0.87259042, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89573061, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.765625, + "step": 771, + "time_per_iteration": 2.512622117996216 + }, + { + "auxiliary_loss_clip": 0.01246333, + "auxiliary_loss_mlp": 0.01076832, + "balance_loss_clip": 1.04726815, + "balance_loss_mlp": 1.06911707, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.3640102097360587, + "language_loss": 0.83736801, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86059964, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 772, + "time_per_iteration": 2.509572982788086 + }, + { + "auxiliary_loss_clip": 0.01243608, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.07392263, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3097217333215694, + "language_loss": 0.73399591, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75707257, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 773, + "time_per_iteration": 2.5539331436157227 + }, + { + "auxiliary_loss_clip": 0.01240234, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_clip": 1.03624654, + "balance_loss_mlp": 1.07288945, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.066531290075925, + "language_loss": 0.78523052, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80828828, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.671875, + "step": 774, + "time_per_iteration": 2.5350210666656494 + }, + { + "auxiliary_loss_clip": 0.01239038, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.03955007, + "balance_loss_mlp": 1.07101154, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.187480231527322, + "language_loss": 0.73357666, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75662553, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6796875, + "step": 775, + "time_per_iteration": 2.6102981567382812 + }, + { + "auxiliary_loss_clip": 0.01240703, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.03096998, + "balance_loss_mlp": 1.06996655, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.5904648869830247, + "language_loss": 0.77037287, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79337239, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.703125, + "step": 776, + "time_per_iteration": 2.4713642597198486 + }, + { + "auxiliary_loss_clip": 0.01236202, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03790653, + "balance_loss_mlp": 1.06914115, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9751950676431418, + "language_loss": 0.70967531, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73267508, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.671875, + "step": 777, + "time_per_iteration": 2.540151596069336 + }, + { + "auxiliary_loss_clip": 0.01242182, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.07221043, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9852588200641685, + "language_loss": 0.76756501, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79076868, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 778, + "time_per_iteration": 2.5299642086029053 + }, + { + "auxiliary_loss_clip": 0.01236882, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.06948996, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.9364819041983576, + "language_loss": 0.78900939, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81206226, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.671875, + "step": 779, + "time_per_iteration": 2.4999477863311768 + }, + { + "auxiliary_loss_clip": 0.01230899, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.03997588, + "balance_loss_mlp": 1.06776333, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.7037291099106273, + "language_loss": 0.77051055, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7934795, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 780, + "time_per_iteration": 2.54770565032959 + }, + { + "auxiliary_loss_clip": 0.01235667, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.07070863, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.6789342331958745, + "language_loss": 0.76432645, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6484375, + "step": 781, + "time_per_iteration": 2.5040361881256104 + }, + { + "auxiliary_loss_clip": 0.01245917, + "auxiliary_loss_mlp": 0.01072818, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.07423282, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2171800145032736, + "language_loss": 0.74027473, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76346207, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 782, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.01241991, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.07483578, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 6.219089205177081, + "language_loss": 0.8032757, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82630414, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.671875, + "step": 783, + "time_per_iteration": 2.4866061210632324 + }, + { + "auxiliary_loss_clip": 0.01232605, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.03417742, + "balance_loss_mlp": 1.07062817, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0172272756643816, + "language_loss": 0.81289953, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83582091, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 784, + "time_per_iteration": 2.476659059524536 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.04597473, + "balance_loss_mlp": 1.0683856, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.171254656371271, + "language_loss": 0.8076694, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83078098, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 785, + "time_per_iteration": 2.493598461151123 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0762614, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.444819858404617, + "language_loss": 0.89981294, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92284781, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.65625, + "step": 786, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.012413, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.06742501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.379381752409287, + "language_loss": 0.76639462, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78950763, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.734375, + "step": 787, + "time_per_iteration": 2.447611093521118 + }, + { + "auxiliary_loss_clip": 0.01247236, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.04431772, + "balance_loss_mlp": 1.0765723, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.4642209511959403, + "language_loss": 0.80851126, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83170098, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7109375, + "step": 788, + "time_per_iteration": 2.4679956436157227 + }, + { + "auxiliary_loss_clip": 0.01236983, + "auxiliary_loss_mlp": 0.01074337, + "balance_loss_clip": 1.04551244, + "balance_loss_mlp": 1.07285857, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.2318634793178127, + "language_loss": 0.84819949, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87131274, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.640625, + "step": 789, + "time_per_iteration": 2.4982516765594482 + }, + { + "auxiliary_loss_clip": 0.01242053, + "auxiliary_loss_mlp": 0.01066276, + "balance_loss_clip": 1.04006219, + "balance_loss_mlp": 1.07367456, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.948517450129577, + "language_loss": 0.82196069, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84504396, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6875, + "step": 790, + "time_per_iteration": 2.4380602836608887 + }, + { + "auxiliary_loss_clip": 0.01236299, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.03524029, + "balance_loss_mlp": 1.06857598, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.8806939749630054, + "language_loss": 0.88245451, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90544093, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 791, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01239952, + "auxiliary_loss_mlp": 0.010655, + "balance_loss_clip": 1.03826034, + "balance_loss_mlp": 1.07212687, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.2584516419561464, + "language_loss": 0.90245461, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92550921, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 792, + "time_per_iteration": 2.4627771377563477 + }, + { + "auxiliary_loss_clip": 0.01241845, + "auxiliary_loss_mlp": 0.01074856, + "balance_loss_clip": 1.04874945, + "balance_loss_mlp": 1.07157969, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9386484459236437, + "language_loss": 0.7310667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 793, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01067209, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.07207203, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0117940746735123, + "language_loss": 0.86102074, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88411266, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.703125, + "step": 794, + "time_per_iteration": 2.510611057281494 + }, + { + "auxiliary_loss_clip": 0.0123999, + "auxiliary_loss_mlp": 0.01074174, + "balance_loss_clip": 1.04701805, + "balance_loss_mlp": 1.06925917, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.4118642482115384, + "language_loss": 0.69812739, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72126907, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.7109375, + "step": 795, + "time_per_iteration": 2.500420093536377 + }, + { + "auxiliary_loss_clip": 0.01236981, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.0432204, + "balance_loss_mlp": 1.06999111, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.0479238599532135, + "language_loss": 0.81053579, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 796, + "time_per_iteration": 2.4838409423828125 + }, + { + "auxiliary_loss_clip": 0.0124002, + "auxiliary_loss_mlp": 0.01058331, + "balance_loss_clip": 1.03129458, + "balance_loss_mlp": 1.07190371, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.3456590334750858, + "language_loss": 0.81249642, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83547997, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6796875, + "step": 797, + "time_per_iteration": 2.466343402862549 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.0100279, + "balance_loss_clip": 0.9972828, + "balance_loss_mlp": 1.03672731, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9120921080635288, + "language_loss": 0.64447635, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66572458, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.8515625, + "step": 798, + "time_per_iteration": 3.0081863403320312 + }, + { + "auxiliary_loss_clip": 0.01243937, + "auxiliary_loss_mlp": 0.01070197, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.06894708, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 7.0153313624744005, + "language_loss": 0.90794134, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93108267, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.75, + "step": 799, + "time_per_iteration": 2.4872424602508545 + }, + { + "auxiliary_loss_clip": 0.01242621, + "auxiliary_loss_mlp": 0.01069655, + "balance_loss_clip": 1.04220068, + "balance_loss_mlp": 1.07567, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.1467314479540818, + "language_loss": 0.86701, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89013278, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 800, + "time_per_iteration": 2.477720022201538 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.0106979, + "balance_loss_clip": 1.04362369, + "balance_loss_mlp": 1.07207572, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 7.517902152046504, + "language_loss": 0.84513009, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86826313, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.71875, + "step": 801, + "time_per_iteration": 2.487889528274536 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.07289147, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.97564705550146, + "language_loss": 0.79967415, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82280934, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 802, + "time_per_iteration": 2.6496224403381348 + }, + { + "auxiliary_loss_clip": 0.01238875, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_clip": 1.03963101, + "balance_loss_mlp": 1.07270598, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.8331626885697725, + "language_loss": 0.86420751, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88725173, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 803, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01238315, + "auxiliary_loss_mlp": 0.01061166, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.07398677, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.229653749186784, + "language_loss": 0.85436332, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87735808, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 804, + "time_per_iteration": 2.458303213119507 + }, + { + "auxiliary_loss_clip": 0.01239413, + "auxiliary_loss_mlp": 0.01066878, + "balance_loss_clip": 1.04099822, + "balance_loss_mlp": 1.07286024, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.2509331098011645, + "language_loss": 0.86119306, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88425595, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 805, + "time_per_iteration": 2.5021419525146484 + }, + { + "auxiliary_loss_clip": 0.01235031, + "auxiliary_loss_mlp": 0.01067273, + "balance_loss_clip": 1.04115391, + "balance_loss_mlp": 1.06942892, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8866019303880346, + "language_loss": 0.68034315, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70336622, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.65625, + "step": 806, + "time_per_iteration": 2.4904568195343018 + }, + { + "auxiliary_loss_clip": 0.01235579, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.03847933, + "balance_loss_mlp": 1.07208037, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.221107161276338, + "language_loss": 0.7716608, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79466188, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 807, + "time_per_iteration": 2.498624563217163 + }, + { + "auxiliary_loss_clip": 0.01232532, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.06831741, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.844770488216335, + "language_loss": 0.86509991, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88814163, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.640625, + "step": 808, + "time_per_iteration": 2.444673538208008 + }, + { + "auxiliary_loss_clip": 0.01242847, + "auxiliary_loss_mlp": 0.01070908, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.07261682, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9428867449931826, + "language_loss": 0.90154302, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92468053, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 809, + "time_per_iteration": 5.353702545166016 + }, + { + "auxiliary_loss_clip": 0.01242102, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.07577538, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.12821080633451, + "language_loss": 0.84360719, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86672825, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 810, + "time_per_iteration": 3.8935022354125977 + }, + { + "auxiliary_loss_clip": 0.01240735, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.07189715, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7610993085905569, + "language_loss": 0.80875039, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8318274, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6875, + "step": 811, + "time_per_iteration": 2.5000643730163574 + }, + { + "auxiliary_loss_clip": 0.01232044, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.04991412, + "balance_loss_mlp": 1.06997907, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.0417171226218715, + "language_loss": 0.74768531, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77075845, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.625, + "step": 812, + "time_per_iteration": 2.4853179454803467 + }, + { + "auxiliary_loss_clip": 0.01233917, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.07263327, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8904091966919716, + "language_loss": 0.89845109, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92153537, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 813, + "time_per_iteration": 2.6731016635894775 + }, + { + "auxiliary_loss_clip": 0.01232344, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.07083082, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.028357820963791, + "language_loss": 0.74551463, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76842451, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.6171875, + "step": 814, + "time_per_iteration": 2.509963035583496 + }, + { + "auxiliary_loss_clip": 0.01235531, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.07073569, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.3605733083261464, + "language_loss": 0.83740532, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86043149, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6484375, + "step": 815, + "time_per_iteration": 2.5490894317626953 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.03396082, + "balance_loss_mlp": 1.07326484, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.271155414035229, + "language_loss": 0.90803105, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93103218, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6640625, + "step": 816, + "time_per_iteration": 2.5273053646087646 + }, + { + "auxiliary_loss_clip": 0.01240454, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.0732162, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 3.2321750342473603, + "language_loss": 0.79924619, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.671875, + "step": 817, + "time_per_iteration": 2.5095019340515137 + }, + { + "auxiliary_loss_clip": 0.0123455, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.04864395, + "balance_loss_mlp": 1.07184172, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8264850687392937, + "language_loss": 0.84520394, + "learning_rate": 3.996142453363656e-06, + "loss": 0.86829674, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 818, + "time_per_iteration": 2.5476157665252686 + }, + { + "auxiliary_loss_clip": 0.01243386, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.07401037, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.779535734169796, + "language_loss": 0.75307131, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77617967, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6953125, + "step": 819, + "time_per_iteration": 2.5486624240875244 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.07577193, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.1475545017813853, + "language_loss": 0.85166955, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87468207, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.640625, + "step": 820, + "time_per_iteration": 2.4565298557281494 + }, + { + "auxiliary_loss_clip": 0.0123627, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03602266, + "balance_loss_mlp": 1.07061315, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.902695357085614, + "language_loss": 0.9041872, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92716837, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.65625, + "step": 821, + "time_per_iteration": 2.5412514209747314 + }, + { + "auxiliary_loss_clip": 0.01233424, + "auxiliary_loss_mlp": 0.01073041, + "balance_loss_clip": 1.04773307, + "balance_loss_mlp": 1.06951392, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0531707528144274, + "language_loss": 0.8941884, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91725308, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.640625, + "step": 822, + "time_per_iteration": 2.5171031951904297 + }, + { + "auxiliary_loss_clip": 0.01237258, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.03295374, + "balance_loss_mlp": 1.0742538, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.060390808888412, + "language_loss": 0.67537785, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69834983, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 823, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01120581, + "auxiliary_loss_mlp": 0.01008389, + "balance_loss_clip": 1.00323892, + "balance_loss_mlp": 1.04174662, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3777513990451415, + "language_loss": 0.62206292, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64335263, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.7890625, + "step": 824, + "time_per_iteration": 3.13708758354187 + }, + { + "auxiliary_loss_clip": 0.01240025, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.07293963, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.021638376413324, + "language_loss": 0.90364408, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92674464, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.671875, + "step": 825, + "time_per_iteration": 2.519487142562866 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01064311, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.0713625, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 23.06748840114486, + "language_loss": 0.66790086, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69091535, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.65625, + "step": 826, + "time_per_iteration": 2.486837387084961 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.07166433, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.4656671498779845, + "language_loss": 0.78386623, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80685055, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.640625, + "step": 827, + "time_per_iteration": 2.517092704772949 + }, + { + "auxiliary_loss_clip": 0.0124052, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.07333767, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.8940457048653916, + "language_loss": 0.78592682, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80905491, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.671875, + "step": 828, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01227721, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.06777728, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.330577425067274, + "language_loss": 0.83493364, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 829, + "time_per_iteration": 2.5744268894195557 + }, + { + "auxiliary_loss_clip": 0.01235678, + "auxiliary_loss_mlp": 0.01073434, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.07021666, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.2375926111489743, + "language_loss": 0.75055873, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.65625, + "step": 830, + "time_per_iteration": 2.5045461654663086 + }, + { + "auxiliary_loss_clip": 0.01233457, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.06966341, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.962979792887244, + "language_loss": 0.79379636, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81679052, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 831, + "time_per_iteration": 2.5924267768859863 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01070014, + "balance_loss_clip": 1.04487276, + "balance_loss_mlp": 1.07213569, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.758266217871517, + "language_loss": 0.91538632, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93846321, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.65625, + "step": 832, + "time_per_iteration": 2.653150796890259 + }, + { + "auxiliary_loss_clip": 0.01230534, + "auxiliary_loss_mlp": 0.01081981, + "balance_loss_clip": 1.05747163, + "balance_loss_mlp": 1.07053018, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9700093948003867, + "language_loss": 0.83139837, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85452354, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 833, + "time_per_iteration": 2.73848819732666 + }, + { + "auxiliary_loss_clip": 0.0123523, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.06913459, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.433665596415918, + "language_loss": 0.8254565, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84839165, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.65625, + "step": 834, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01236789, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.07138014, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.099554255469436, + "language_loss": 0.91758966, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94059587, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 835, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.0123437, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.06699944, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.4903656252358735, + "language_loss": 0.76346481, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78652561, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.671875, + "step": 836, + "time_per_iteration": 2.4839258193969727 + }, + { + "auxiliary_loss_clip": 0.01229978, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.07100809, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1380784235063066, + "language_loss": 0.8360337, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85906136, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5859375, + "step": 837, + "time_per_iteration": 2.5140485763549805 + }, + { + "auxiliary_loss_clip": 0.01233502, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.07245386, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.225982034212064, + "language_loss": 0.73137468, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75436556, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 838, + "time_per_iteration": 2.5128419399261475 + }, + { + "auxiliary_loss_clip": 0.01229023, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.06636167, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.886796600099776, + "language_loss": 0.83328462, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85625362, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 839, + "time_per_iteration": 2.499415874481201 + }, + { + "auxiliary_loss_clip": 0.01228207, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_clip": 1.04128349, + "balance_loss_mlp": 1.06866539, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.2056506497336765, + "language_loss": 0.85777193, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8807205, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 840, + "time_per_iteration": 2.522038698196411 + }, + { + "auxiliary_loss_clip": 0.01235877, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.07246661, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.034102412822674, + "language_loss": 0.94658732, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96958393, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 841, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01234454, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.07130527, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.030819255438432, + "language_loss": 0.77387047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79687953, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6328125, + "step": 842, + "time_per_iteration": 2.6253628730773926 + }, + { + "auxiliary_loss_clip": 0.01238804, + "auxiliary_loss_mlp": 0.01067813, + "balance_loss_clip": 1.041659, + "balance_loss_mlp": 1.07278991, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.283727909175907, + "language_loss": 0.78014457, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80321074, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6640625, + "step": 843, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.01237695, + "auxiliary_loss_mlp": 0.01061566, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.07266212, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.249210505837228, + "language_loss": 0.82952344, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85251611, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.6484375, + "step": 844, + "time_per_iteration": 2.6476500034332275 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.03935087, + "balance_loss_mlp": 1.06871867, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.3236550986537368, + "language_loss": 0.76042783, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78337395, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 845, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01075672, + "balance_loss_clip": 1.04924285, + "balance_loss_mlp": 1.06694174, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.2528566199281905, + "language_loss": 0.87468004, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89773357, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 846, + "time_per_iteration": 2.5271859169006348 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.04521692, + "balance_loss_mlp": 1.06982791, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.95159927266484, + "language_loss": 0.87571466, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89872456, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 847, + "time_per_iteration": 2.4566030502319336 + }, + { + "auxiliary_loss_clip": 0.01226009, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.04489946, + "balance_loss_mlp": 1.06883907, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.141846591022022, + "language_loss": 0.81706643, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84003675, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5703125, + "step": 848, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0123182, + "auxiliary_loss_mlp": 0.01077851, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.07167053, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.898868752622741, + "language_loss": 0.87266076, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89575738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 849, + "time_per_iteration": 2.5472936630249023 + }, + { + "auxiliary_loss_clip": 0.0122487, + "auxiliary_loss_mlp": 0.01062562, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.06569946, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8637209623848903, + "language_loss": 0.83340889, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85628319, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.59375, + "step": 850, + "time_per_iteration": 2.493814468383789 + }, + { + "auxiliary_loss_clip": 0.01229016, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.03847528, + "balance_loss_mlp": 1.06816506, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.1400408414194154, + "language_loss": 0.6501807, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.609375, + "step": 851, + "time_per_iteration": 5.443026065826416 + }, + { + "auxiliary_loss_clip": 0.01228781, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.04424942, + "balance_loss_mlp": 1.0674876, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2562645326336686, + "language_loss": 0.8376134, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86061573, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 852, + "time_per_iteration": 2.4753623008728027 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.06879044, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9405819970113303, + "language_loss": 0.80252314, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82552135, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 853, + "time_per_iteration": 2.5048112869262695 + }, + { + "auxiliary_loss_clip": 0.01226539, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.03432584, + "balance_loss_mlp": 1.06710184, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8237562231360178, + "language_loss": 0.75846469, + "learning_rate": 3.995223022193999e-06, + "loss": 0.7813375, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 854, + "time_per_iteration": 2.53165602684021 + }, + { + "auxiliary_loss_clip": 0.01233418, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_clip": 1.03678393, + "balance_loss_mlp": 1.07139039, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.718422527893707, + "language_loss": 0.81173462, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83470446, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 855, + "time_per_iteration": 2.5610744953155518 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01020682, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.03902698, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0071030268205712, + "language_loss": 0.65609074, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67743033, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.7421875, + "step": 856, + "time_per_iteration": 3.0546581745147705 + }, + { + "auxiliary_loss_clip": 0.01224884, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.02823043, + "balance_loss_mlp": 1.06811357, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8111088050205955, + "language_loss": 0.76996124, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79274821, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5625, + "step": 857, + "time_per_iteration": 2.6051554679870605 + }, + { + "auxiliary_loss_clip": 0.01229705, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03130889, + "balance_loss_mlp": 1.06846082, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 3.7937823779894377, + "language_loss": 0.88893878, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91181171, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6171875, + "step": 858, + "time_per_iteration": 2.4517769813537598 + }, + { + "auxiliary_loss_clip": 0.01228685, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.03000832, + "balance_loss_mlp": 1.06902003, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.9531750101692102, + "language_loss": 0.75199753, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77484941, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 859, + "time_per_iteration": 2.5090014934539795 + }, + { + "auxiliary_loss_clip": 0.01237239, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.04280758, + "balance_loss_mlp": 1.06980002, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.092452223155828, + "language_loss": 0.90812773, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93120927, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.671875, + "step": 860, + "time_per_iteration": 2.437220335006714 + }, + { + "auxiliary_loss_clip": 0.01231057, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.03568769, + "balance_loss_mlp": 1.0717634, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.9189860758016508, + "language_loss": 0.82252973, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8454473, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.59375, + "step": 861, + "time_per_iteration": 2.50883412361145 + }, + { + "auxiliary_loss_clip": 0.01233216, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.03177071, + "balance_loss_mlp": 1.0704143, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.0352629197197762, + "language_loss": 0.78607392, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.625, + "step": 862, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01229413, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.07291067, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9841587361763113, + "language_loss": 0.88999134, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91296881, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5625, + "step": 863, + "time_per_iteration": 2.506289005279541 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.07635331, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.003999649515418, + "language_loss": 0.7575798, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.625, + "step": 864, + "time_per_iteration": 2.515944480895996 + }, + { + "auxiliary_loss_clip": 0.01236545, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.04228067, + "balance_loss_mlp": 1.07355332, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9298630836237705, + "language_loss": 0.7919569, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81501746, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6328125, + "step": 865, + "time_per_iteration": 2.485499620437622 + }, + { + "auxiliary_loss_clip": 0.0123268, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.04144871, + "balance_loss_mlp": 1.07079291, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.366131428952597, + "language_loss": 0.85700798, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88000321, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 866, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01242589, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_clip": 1.03910398, + "balance_loss_mlp": 1.0804987, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.535209572965093, + "language_loss": 0.8680315, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89111662, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 867, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01231644, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.05021977, + "balance_loss_mlp": 1.07513499, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.64188364663517, + "language_loss": 0.63562089, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65867579, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.5625, + "step": 868, + "time_per_iteration": 2.567958354949951 + }, + { + "auxiliary_loss_clip": 0.01236968, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.07263327, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1448269109564198, + "language_loss": 0.83076257, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85379148, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.6484375, + "step": 869, + "time_per_iteration": 2.5021841526031494 + }, + { + "auxiliary_loss_clip": 0.01237154, + "auxiliary_loss_mlp": 0.01057742, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.07245827, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.352948725027126, + "language_loss": 0.87544227, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89839119, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6484375, + "step": 870, + "time_per_iteration": 2.459662437438965 + }, + { + "auxiliary_loss_clip": 0.01238457, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_clip": 1.04135191, + "balance_loss_mlp": 1.07536197, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9212028950510787, + "language_loss": 0.80554998, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82860637, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6328125, + "step": 871, + "time_per_iteration": 2.4701170921325684 + }, + { + "auxiliary_loss_clip": 0.01234905, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.07576704, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.5975290841395262, + "language_loss": 0.81374049, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8367365, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.59375, + "step": 872, + "time_per_iteration": 2.4886369705200195 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.0101489, + "balance_loss_clip": 1.00952566, + "balance_loss_mlp": 1.03955865, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8879269166117758, + "language_loss": 0.61579192, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63705552, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.71875, + "step": 873, + "time_per_iteration": 2.9913430213928223 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01058247, + "balance_loss_clip": 1.03245032, + "balance_loss_mlp": 1.07107997, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8426182555123698, + "language_loss": 0.88426232, + "learning_rate": 3.994669855111643e-06, + "loss": 0.90716141, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 874, + "time_per_iteration": 2.4794461727142334 + }, + { + "auxiliary_loss_clip": 0.0123222, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.06908488, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 2.2494767595307628, + "language_loss": 0.74779439, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77073956, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 875, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.01228414, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.06905699, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.052141253618648, + "language_loss": 0.92836702, + "learning_rate": 3.99461287422531e-06, + "loss": 0.951262, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.59375, + "step": 876, + "time_per_iteration": 2.535587787628174 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01009036, + "balance_loss_clip": 1.00379074, + "balance_loss_mlp": 1.03698087, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.854570032578524, + "language_loss": 0.62934959, + "learning_rate": 3.994584270327722e-06, + "loss": 0.6505053, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.6953125, + "step": 877, + "time_per_iteration": 3.094581127166748 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.06975055, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.154366240232031, + "language_loss": 0.85691291, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 878, + "time_per_iteration": 2.5052285194396973 + }, + { + "auxiliary_loss_clip": 0.01232133, + "auxiliary_loss_mlp": 0.01063559, + "balance_loss_clip": 1.03754723, + "balance_loss_mlp": 1.06974411, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.0833089409086942, + "language_loss": 0.82790506, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85086197, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.625, + "step": 879, + "time_per_iteration": 2.564312219619751 + }, + { + "auxiliary_loss_clip": 0.01227867, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04011488, + "balance_loss_mlp": 1.06966615, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 4.271066320440391, + "language_loss": 0.84404933, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86699677, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 880, + "time_per_iteration": 2.4854133129119873 + }, + { + "auxiliary_loss_clip": 0.01233797, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.04360688, + "balance_loss_mlp": 1.07206059, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 3.515636761469604, + "language_loss": 0.87156737, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89460361, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 881, + "time_per_iteration": 2.476846933364868 + }, + { + "auxiliary_loss_clip": 0.01228751, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.06813371, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9345214626214409, + "language_loss": 0.87682849, + "learning_rate": 3.994440116339046e-06, + "loss": 0.89977539, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.609375, + "step": 882, + "time_per_iteration": 2.6449031829833984 + }, + { + "auxiliary_loss_clip": 0.01233714, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_clip": 1.03825057, + "balance_loss_mlp": 1.07030129, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.7245054008776814, + "language_loss": 0.68869275, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71168661, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6328125, + "step": 883, + "time_per_iteration": 2.620363235473633 + }, + { + "auxiliary_loss_clip": 0.01225388, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.06937146, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.9628498458506696, + "language_loss": 0.75887203, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78173113, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5546875, + "step": 884, + "time_per_iteration": 2.4948067665100098 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.06921601, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.00306560312032, + "language_loss": 0.85323638, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 885, + "time_per_iteration": 2.5159530639648438 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01068973, + "balance_loss_clip": 1.04205549, + "balance_loss_mlp": 1.06673646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6316893825734344, + "language_loss": 0.85726082, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88023585, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6171875, + "step": 886, + "time_per_iteration": 2.4650700092315674 + }, + { + "auxiliary_loss_clip": 0.01226585, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.06944001, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1517488326805214, + "language_loss": 0.89229804, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91522843, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5703125, + "step": 887, + "time_per_iteration": 2.5020337104797363 + }, + { + "auxiliary_loss_clip": 0.01227687, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.04270935, + "balance_loss_mlp": 1.06604195, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.2836036404275593, + "language_loss": 0.75076836, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77375484, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6171875, + "step": 888, + "time_per_iteration": 2.5055694580078125 + }, + { + "auxiliary_loss_clip": 0.0123028, + "auxiliary_loss_mlp": 0.0107639, + "balance_loss_clip": 1.0480895, + "balance_loss_mlp": 1.07113457, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.032914331295681, + "language_loss": 0.88330352, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90637028, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.59375, + "step": 889, + "time_per_iteration": 2.5147650241851807 + }, + { + "auxiliary_loss_clip": 0.01222875, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.02748489, + "balance_loss_mlp": 1.06732821, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9726085703824752, + "language_loss": 0.88269985, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90546036, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5546875, + "step": 890, + "time_per_iteration": 2.490300416946411 + }, + { + "auxiliary_loss_clip": 0.01225662, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.04612982, + "balance_loss_mlp": 1.06690812, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.320271972022273, + "language_loss": 0.93251556, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95548671, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 891, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.06682086, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.238230674372026, + "language_loss": 0.71759057, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74046671, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5859375, + "step": 892, + "time_per_iteration": 2.5544779300689697 + }, + { + "auxiliary_loss_clip": 0.01229119, + "auxiliary_loss_mlp": 0.01067529, + "balance_loss_clip": 1.0421617, + "balance_loss_mlp": 1.06946719, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.3204520758070037, + "language_loss": 0.82304287, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84600937, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6015625, + "step": 893, + "time_per_iteration": 5.3903117179870605 + }, + { + "auxiliary_loss_clip": 0.0122945, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.04430699, + "balance_loss_mlp": 1.0679965, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.3808217776212937, + "language_loss": 0.81695569, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83995366, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.609375, + "step": 894, + "time_per_iteration": 2.52809476852417 + }, + { + "auxiliary_loss_clip": 0.01227471, + "auxiliary_loss_mlp": 0.01065449, + "balance_loss_clip": 1.03915119, + "balance_loss_mlp": 1.06881404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.5337894710206093, + "language_loss": 0.76043701, + "learning_rate": 3.994056467679221e-06, + "loss": 0.7833662, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 895, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.01238307, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03022122, + "balance_loss_mlp": 1.07260597, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.2065839001211156, + "language_loss": 0.86456096, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88751751, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.65625, + "step": 896, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01231325, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.03495908, + "balance_loss_mlp": 1.06809413, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.1680285530564274, + "language_loss": 0.87949234, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90243232, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6328125, + "step": 897, + "time_per_iteration": 2.457918167114258 + }, + { + "auxiliary_loss_clip": 0.0122574, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.06723523, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7359050724031848, + "language_loss": 0.9035244, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9264195, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.5859375, + "step": 898, + "time_per_iteration": 2.4593143463134766 + }, + { + "auxiliary_loss_clip": 0.01234899, + "auxiliary_loss_mlp": 0.01084595, + "balance_loss_clip": 1.05808282, + "balance_loss_mlp": 1.07024622, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.958355519485596, + "language_loss": 0.91756964, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94076455, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6484375, + "step": 899, + "time_per_iteration": 2.4461729526519775 + }, + { + "auxiliary_loss_clip": 0.01225208, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.03964233, + "balance_loss_mlp": 1.06601286, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.6493739136310643, + "language_loss": 0.75594276, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77884829, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 900, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.01231903, + "auxiliary_loss_mlp": 0.01059763, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.06860638, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.2496787705299908, + "language_loss": 0.7377668, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76068342, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.6328125, + "step": 901, + "time_per_iteration": 2.49638032913208 + }, + { + "auxiliary_loss_clip": 0.01221671, + "auxiliary_loss_mlp": 0.01074944, + "balance_loss_clip": 1.04982698, + "balance_loss_mlp": 1.06662059, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.0553503619333586, + "language_loss": 0.85004938, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87301552, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 902, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01226177, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.06769705, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.0002475654879195, + "language_loss": 0.8655951, + "learning_rate": 3.993814024394569e-06, + "loss": 0.8884868, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 903, + "time_per_iteration": 2.522193670272827 + }, + { + "auxiliary_loss_clip": 0.01227512, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.06904316, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.4298091072226855, + "language_loss": 0.74835998, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77125704, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.578125, + "step": 904, + "time_per_iteration": 2.456969976425171 + }, + { + "auxiliary_loss_clip": 0.0123038, + "auxiliary_loss_mlp": 0.01073252, + "balance_loss_clip": 1.04685879, + "balance_loss_mlp": 1.06905615, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.0843949675352356, + "language_loss": 0.85750329, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8805396, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.609375, + "step": 905, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01227222, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.05317712, + "balance_loss_mlp": 1.07247257, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7937911991915148, + "language_loss": 0.74028552, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76334012, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 906, + "time_per_iteration": 2.468331813812256 + }, + { + "auxiliary_loss_clip": 0.01228766, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.03909731, + "balance_loss_mlp": 1.06858826, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.220044948377472, + "language_loss": 0.87410975, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89705634, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6015625, + "step": 907, + "time_per_iteration": 2.5177390575408936 + }, + { + "auxiliary_loss_clip": 0.01227557, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.04521942, + "balance_loss_mlp": 1.07002556, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.8689281211501179, + "language_loss": 0.86915505, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89214909, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.578125, + "step": 908, + "time_per_iteration": 2.45135498046875 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.06842148, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.409525813232516, + "language_loss": 0.89454836, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91748714, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 909, + "time_per_iteration": 2.4702625274658203 + }, + { + "auxiliary_loss_clip": 0.01231345, + "auxiliary_loss_mlp": 0.01075786, + "balance_loss_clip": 1.04773629, + "balance_loss_mlp": 1.06930447, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.4022545211155593, + "language_loss": 0.70942473, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73249602, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.625, + "step": 910, + "time_per_iteration": 2.4530797004699707 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01055797, + "balance_loss_clip": 1.03002357, + "balance_loss_mlp": 1.06815219, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.0100188286094745, + "language_loss": 0.8349818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85778737, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5625, + "step": 911, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.01224017, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.06649613, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.746196883211308, + "language_loss": 0.76096344, + "learning_rate": 3.993535491899736e-06, + "loss": 0.7839244, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 912, + "time_per_iteration": 2.4651522636413574 + }, + { + "auxiliary_loss_clip": 0.01219912, + "auxiliary_loss_mlp": 0.01052416, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.06664968, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.385296939765248, + "language_loss": 0.82667339, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84939671, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 913, + "time_per_iteration": 2.475384473800659 + }, + { + "auxiliary_loss_clip": 0.01224168, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.07065797, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.227172084037845, + "language_loss": 0.83470452, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85756505, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 914, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.03324902, + "balance_loss_mlp": 1.07264161, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.897688985464872, + "language_loss": 0.9010309, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92390066, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5625, + "step": 915, + "time_per_iteration": 2.492981433868408 + }, + { + "auxiliary_loss_clip": 0.01225584, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.03046489, + "balance_loss_mlp": 1.0708915, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.870109983937874, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91836905, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 916, + "time_per_iteration": 2.4621188640594482 + }, + { + "auxiliary_loss_clip": 0.01228011, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.04593801, + "balance_loss_mlp": 1.06942379, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7933741103180343, + "language_loss": 0.80085957, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386243, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 917, + "time_per_iteration": 2.49455189704895 + }, + { + "auxiliary_loss_clip": 0.01225592, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.06678224, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9216560267302982, + "language_loss": 0.79673612, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81957722, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 918, + "time_per_iteration": 2.504734516143799 + }, + { + "auxiliary_loss_clip": 0.01223712, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.06658053, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9394116717498289, + "language_loss": 0.89132315, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91415823, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5703125, + "step": 919, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01219042, + "auxiliary_loss_mlp": 0.01068553, + "balance_loss_clip": 1.0427916, + "balance_loss_mlp": 1.06515777, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.688355226699252, + "language_loss": 0.87421197, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 920, + "time_per_iteration": 2.536914348602295 + }, + { + "auxiliary_loss_clip": 0.01223828, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.06937671, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1255302161497704, + "language_loss": 0.65921712, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68208569, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.546875, + "step": 921, + "time_per_iteration": 2.643416166305542 + }, + { + "auxiliary_loss_clip": 0.01229793, + "auxiliary_loss_mlp": 0.0106877, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.0698204, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.143682946402907, + "language_loss": 0.71841472, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74140036, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.6015625, + "step": 922, + "time_per_iteration": 2.4544074535369873 + }, + { + "auxiliary_loss_clip": 0.0122536, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.0669136, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.766492717488127, + "language_loss": 0.82548857, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84844404, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 923, + "time_per_iteration": 2.490915536880493 + }, + { + "auxiliary_loss_clip": 0.01221243, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.06429458, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2095756655687397, + "language_loss": 0.78808558, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81097853, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5625, + "step": 924, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.0121918, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.06480467, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9513803878946447, + "language_loss": 1.02250028, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04528582, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 925, + "time_per_iteration": 2.5296268463134766 + }, + { + "auxiliary_loss_clip": 0.01220429, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02975261, + "balance_loss_mlp": 1.0634799, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.3756260245044687, + "language_loss": 0.80808276, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83084333, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 926, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01229405, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.06743848, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.4713559623940924, + "language_loss": 0.73378903, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75676566, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 927, + "time_per_iteration": 2.5607478618621826 + }, + { + "auxiliary_loss_clip": 0.01103967, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.00837731, + "balance_loss_mlp": 1.03639269, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7814837823676635, + "language_loss": 0.5989722, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62015712, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.67578125, + "step": 928, + "time_per_iteration": 3.0945305824279785 + }, + { + "auxiliary_loss_clip": 0.01223562, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.04035151, + "balance_loss_mlp": 1.06729245, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.3037954576101587, + "language_loss": 0.95011377, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97301698, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5625, + "step": 929, + "time_per_iteration": 2.527270555496216 + }, + { + "auxiliary_loss_clip": 0.01221186, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.06494856, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1540114832188553, + "language_loss": 0.71827871, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74116725, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.5625, + "step": 930, + "time_per_iteration": 2.57513689994812 + }, + { + "auxiliary_loss_clip": 0.01227654, + "auxiliary_loss_mlp": 0.01062398, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.06905401, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.336481182624628, + "language_loss": 0.85333288, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87623346, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5859375, + "step": 931, + "time_per_iteration": 2.459167957305908 + }, + { + "auxiliary_loss_clip": 0.01224553, + "auxiliary_loss_mlp": 0.01072004, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.06556344, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.9723738142749898, + "language_loss": 0.83577204, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85873753, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.59375, + "step": 932, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.01223225, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.04012322, + "balance_loss_mlp": 1.06712675, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.411257667891357, + "language_loss": 0.73405433, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5625, + "step": 933, + "time_per_iteration": 2.526521682739258 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.04328358, + "balance_loss_mlp": 1.06432819, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.577929883809357, + "language_loss": 0.86850882, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89141059, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5546875, + "step": 934, + "time_per_iteration": 5.338034391403198 + }, + { + "auxiliary_loss_clip": 0.01220003, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.06842983, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2060919587088965, + "language_loss": 0.80243224, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82532918, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 935, + "time_per_iteration": 3.8198087215423584 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01008984, + "balance_loss_clip": 1.00321388, + "balance_loss_mlp": 1.02876139, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8225714537835027, + "language_loss": 0.69179416, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71282923, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.65625, + "step": 936, + "time_per_iteration": 2.9585764408111572 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.01067113, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.06387568, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.5168182860703237, + "language_loss": 0.75900578, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78188324, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 937, + "time_per_iteration": 2.4891855716705322 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01002728, + "balance_loss_clip": 0.99738711, + "balance_loss_mlp": 1.02642298, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8631606334327763, + "language_loss": 0.64287508, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66381979, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.65625, + "step": 938, + "time_per_iteration": 3.0239782333374023 + }, + { + "auxiliary_loss_clip": 0.01226335, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.04487348, + "balance_loss_mlp": 1.06571174, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.570077538128457, + "language_loss": 0.7903074, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81329048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 939, + "time_per_iteration": 2.494706630706787 + }, + { + "auxiliary_loss_clip": 0.012214, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03868759, + "balance_loss_mlp": 1.0669229, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.950609958048397, + "language_loss": 0.73893893, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76179242, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 940, + "time_per_iteration": 2.5279061794281006 + }, + { + "auxiliary_loss_clip": 0.01220257, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_clip": 1.03776574, + "balance_loss_mlp": 1.06722569, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.9142676693922898, + "language_loss": 0.70475829, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72760499, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 941, + "time_per_iteration": 2.551604747772217 + }, + { + "auxiliary_loss_clip": 0.01218348, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.06624675, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.3045436850665917, + "language_loss": 0.80928791, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.515625, + "step": 942, + "time_per_iteration": 2.515646457672119 + }, + { + "auxiliary_loss_clip": 0.01214197, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.062042, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.7900678467193205, + "language_loss": 0.88067353, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.9033941, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 943, + "time_per_iteration": 2.674614191055298 + }, + { + "auxiliary_loss_clip": 0.01220399, + "auxiliary_loss_mlp": 0.01056577, + "balance_loss_clip": 1.03182912, + "balance_loss_mlp": 1.06757212, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.6837069047913924, + "language_loss": 0.75092185, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77369165, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5234375, + "step": 944, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01215674, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.0385294, + "balance_loss_mlp": 1.06267428, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7462690351912153, + "language_loss": 0.79321784, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8160013, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 945, + "time_per_iteration": 2.695613384246826 + }, + { + "auxiliary_loss_clip": 0.01218347, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.03628159, + "balance_loss_mlp": 1.06407309, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.1794845223078556, + "language_loss": 0.82465631, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84745914, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 946, + "time_per_iteration": 2.6081790924072266 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.06615055, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.7693395657309297, + "language_loss": 0.7904911, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8133198, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5703125, + "step": 947, + "time_per_iteration": 2.460472822189331 + }, + { + "auxiliary_loss_clip": 0.01227462, + "auxiliary_loss_mlp": 0.01065027, + "balance_loss_clip": 1.03890848, + "balance_loss_mlp": 1.06883287, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 7.046260534289203, + "language_loss": 0.85772789, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88065279, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 948, + "time_per_iteration": 2.4560892581939697 + }, + { + "auxiliary_loss_clip": 0.01217019, + "auxiliary_loss_mlp": 0.01060985, + "balance_loss_clip": 1.03374553, + "balance_loss_mlp": 1.06329989, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.8055084405958775, + "language_loss": 0.87044799, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89322805, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5390625, + "step": 949, + "time_per_iteration": 2.4843316078186035 + }, + { + "auxiliary_loss_clip": 0.01212611, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.06284809, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.230679935648155, + "language_loss": 0.79035759, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81314665, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4921875, + "step": 950, + "time_per_iteration": 2.468172311782837 + }, + { + "auxiliary_loss_clip": 0.01221984, + "auxiliary_loss_mlp": 0.01068601, + "balance_loss_clip": 1.04365039, + "balance_loss_mlp": 1.06574106, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0871877141587682, + "language_loss": 0.8244521, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 951, + "time_per_iteration": 2.5418505668640137 + }, + { + "auxiliary_loss_clip": 0.01215404, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.06129527, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.5400916768099426, + "language_loss": 0.86685216, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88963258, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5390625, + "step": 952, + "time_per_iteration": 2.513356924057007 + }, + { + "auxiliary_loss_clip": 0.0122001, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.03415656, + "balance_loss_mlp": 1.06145215, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.725154467975805, + "language_loss": 0.79043579, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81326544, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5859375, + "step": 953, + "time_per_iteration": 2.490940570831299 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.03592086, + "balance_loss_mlp": 1.06757712, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2937199779067106, + "language_loss": 0.87086606, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89373398, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5625, + "step": 954, + "time_per_iteration": 2.495039701461792 + }, + { + "auxiliary_loss_clip": 0.01221375, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_clip": 1.03707159, + "balance_loss_mlp": 1.06446028, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3514674671771933, + "language_loss": 0.87789929, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90073651, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 955, + "time_per_iteration": 2.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.06217909, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.7193659196918576, + "language_loss": 0.89682388, + "learning_rate": 3.992085650224914e-06, + "loss": 0.919631, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 956, + "time_per_iteration": 2.43306565284729 + }, + { + "auxiliary_loss_clip": 0.01212174, + "auxiliary_loss_mlp": 0.0105844, + "balance_loss_clip": 1.03232098, + "balance_loss_mlp": 1.06344521, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7667772588634594, + "language_loss": 0.75335747, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77606356, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.484375, + "step": 957, + "time_per_iteration": 2.469240665435791 + }, + { + "auxiliary_loss_clip": 0.01218166, + "auxiliary_loss_mlp": 0.01075955, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.06214452, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8974624224625587, + "language_loss": 0.79871029, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82165146, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5625, + "step": 958, + "time_per_iteration": 2.5016849040985107 + }, + { + "auxiliary_loss_clip": 0.01214009, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.06024444, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5702669091422234, + "language_loss": 0.88410264, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90686285, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.5390625, + "step": 959, + "time_per_iteration": 2.4830191135406494 + }, + { + "auxiliary_loss_clip": 0.01211651, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.0626018, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.6997220185951347, + "language_loss": 0.78556621, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083173, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4921875, + "step": 960, + "time_per_iteration": 2.569218397140503 + }, + { + "auxiliary_loss_clip": 0.01217172, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.06168103, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 4.159271492638429, + "language_loss": 0.932491, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95529813, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5546875, + "step": 961, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01215042, + "auxiliary_loss_mlp": 0.01070899, + "balance_loss_clip": 1.04411268, + "balance_loss_mlp": 1.06039667, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.532017623976099, + "language_loss": 0.6822986, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70515805, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.546875, + "step": 962, + "time_per_iteration": 2.544498920440674 + }, + { + "auxiliary_loss_clip": 0.01214012, + "auxiliary_loss_mlp": 0.01068596, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.06268489, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.445305128304827, + "language_loss": 0.88187808, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90470415, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.515625, + "step": 963, + "time_per_iteration": 2.459487199783325 + }, + { + "auxiliary_loss_clip": 0.01222623, + "auxiliary_loss_mlp": 0.01058866, + "balance_loss_clip": 1.03337944, + "balance_loss_mlp": 1.06633568, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.5656796350524473, + "language_loss": 0.84858835, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87140322, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 964, + "time_per_iteration": 2.5268235206604004 + }, + { + "auxiliary_loss_clip": 0.01216658, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_clip": 1.04157782, + "balance_loss_mlp": 1.06309247, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 2.846103019544017, + "language_loss": 0.77748007, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80032492, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5390625, + "step": 965, + "time_per_iteration": 2.4572315216064453 + }, + { + "auxiliary_loss_clip": 0.01211478, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.04424393, + "balance_loss_mlp": 1.0614084, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4479010977704463, + "language_loss": 0.80922461, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83202475, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5, + "step": 966, + "time_per_iteration": 2.4682776927948 + }, + { + "auxiliary_loss_clip": 0.01212307, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.06173599, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8643875206872442, + "language_loss": 0.76291096, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78565276, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.5, + "step": 967, + "time_per_iteration": 2.453474521636963 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.02152586, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7926144837125159, + "language_loss": 0.57362092, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59487474, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.05102539, + "router_z_loss_mlp": 0.6328125, + "step": 968, + "time_per_iteration": 2.994419574737549 + }, + { + "auxiliary_loss_clip": 0.01218807, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02865148, + "balance_loss_mlp": 1.06574845, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.057389892616485, + "language_loss": 0.82289147, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84563303, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 969, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01217673, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.03105259, + "balance_loss_mlp": 1.06392384, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1897875503845725, + "language_loss": 0.780442, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 970, + "time_per_iteration": 2.5632171630859375 + }, + { + "auxiliary_loss_clip": 0.01216631, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_clip": 1.02809155, + "balance_loss_mlp": 1.06188202, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6802242915962, + "language_loss": 0.92492616, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94764245, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 971, + "time_per_iteration": 2.4642531871795654 + }, + { + "auxiliary_loss_clip": 0.01210603, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.03439212, + "balance_loss_mlp": 1.05865097, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 3.0470884327064276, + "language_loss": 0.86133701, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88404, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 972, + "time_per_iteration": 2.5298526287078857 + }, + { + "auxiliary_loss_clip": 0.01212752, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_clip": 1.04038596, + "balance_loss_mlp": 1.0636549, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0754734138997906, + "language_loss": 0.87340444, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89617872, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4921875, + "step": 973, + "time_per_iteration": 2.5198311805725098 + }, + { + "auxiliary_loss_clip": 0.01213937, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_clip": 1.04070425, + "balance_loss_mlp": 1.06140256, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.2539468590332707, + "language_loss": 0.74868345, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77149546, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5234375, + "step": 974, + "time_per_iteration": 2.465268850326538 + }, + { + "auxiliary_loss_clip": 0.0121359, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.03308713, + "balance_loss_mlp": 1.06260133, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.7891188847385684, + "language_loss": 0.76707923, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78980577, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 975, + "time_per_iteration": 2.633850336074829 + }, + { + "auxiliary_loss_clip": 0.01216778, + "auxiliary_loss_mlp": 0.01068456, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.0621978, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0981769673049326, + "language_loss": 0.76878488, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79163718, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 976, + "time_per_iteration": 6.8309245109558105 + }, + { + "auxiliary_loss_clip": 0.01210296, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.0585494, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8109666318996334, + "language_loss": 0.87465948, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89737761, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 977, + "time_per_iteration": 2.5693395137786865 + }, + { + "auxiliary_loss_clip": 0.01213396, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.06246471, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.7886661734827753, + "language_loss": 0.79517525, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5078125, + "step": 978, + "time_per_iteration": 2.51609206199646 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.04339027, + "balance_loss_mlp": 1.06304932, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.6270410794651102, + "language_loss": 0.80902123, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.546875, + "step": 979, + "time_per_iteration": 2.527127504348755 + }, + { + "auxiliary_loss_clip": 0.01085971, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.0044651, + "balance_loss_mlp": 1.02304745, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.94528472512207, + "language_loss": 0.59059429, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61154944, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.62890625, + "step": 980, + "time_per_iteration": 2.9545915126800537 + }, + { + "auxiliary_loss_clip": 0.01210703, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.03747201, + "balance_loss_mlp": 1.0622623, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3915266710240917, + "language_loss": 0.86397457, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88672185, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.484375, + "step": 981, + "time_per_iteration": 2.4726293087005615 + }, + { + "auxiliary_loss_clip": 0.01212695, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.06214404, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 1.9485203495729437, + "language_loss": 0.79623365, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81893563, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.5078125, + "step": 982, + "time_per_iteration": 2.5271458625793457 + }, + { + "auxiliary_loss_clip": 0.01219179, + "auxiliary_loss_mlp": 0.01060762, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.06248748, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.5320957946125437, + "language_loss": 0.84376037, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86655974, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 983, + "time_per_iteration": 2.526364803314209 + }, + { + "auxiliary_loss_clip": 0.01212847, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_clip": 1.04361129, + "balance_loss_mlp": 1.06317604, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8446015864025267, + "language_loss": 0.84607553, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86887848, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.4921875, + "step": 984, + "time_per_iteration": 2.456803321838379 + }, + { + "auxiliary_loss_clip": 0.01211466, + "auxiliary_loss_mlp": 0.01059154, + "balance_loss_clip": 1.03551483, + "balance_loss_mlp": 1.06338882, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.3276500524021495, + "language_loss": 0.77875566, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80146182, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.484375, + "step": 985, + "time_per_iteration": 2.504096508026123 + }, + { + "auxiliary_loss_clip": 0.01215785, + "auxiliary_loss_mlp": 0.01061307, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.06191659, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.294716701848832, + "language_loss": 0.90598249, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92875338, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.5390625, + "step": 986, + "time_per_iteration": 2.4882049560546875 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.01062373, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.06017947, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.8181645576894256, + "language_loss": 0.7589798, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78175771, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 987, + "time_per_iteration": 2.492412805557251 + }, + { + "auxiliary_loss_clip": 0.01216653, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.03491902, + "balance_loss_mlp": 1.06059265, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.1447391932017843, + "language_loss": 0.71525705, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73802304, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 988, + "time_per_iteration": 2.6386756896972656 + }, + { + "auxiliary_loss_clip": 0.01081383, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_clip": 1.00680876, + "balance_loss_mlp": 1.01888978, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9344259157338769, + "language_loss": 0.71159971, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73253405, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.625, + "step": 989, + "time_per_iteration": 2.903996706008911 + }, + { + "auxiliary_loss_clip": 0.01219656, + "auxiliary_loss_mlp": 0.01067443, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.06221163, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.89069901477269, + "language_loss": 0.78102934, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80390036, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.578125, + "step": 990, + "time_per_iteration": 2.6252431869506836 + }, + { + "auxiliary_loss_clip": 0.01208224, + "auxiliary_loss_mlp": 0.0105602, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05700588, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.077710223302236, + "language_loss": 0.86406755, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88671005, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.515625, + "step": 991, + "time_per_iteration": 2.483853340148926 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04030573, + "balance_loss_mlp": 1.06190968, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.866628977756486, + "language_loss": 0.76876801, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79158413, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 992, + "time_per_iteration": 2.5149648189544678 + }, + { + "auxiliary_loss_clip": 0.01214781, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03701937, + "balance_loss_mlp": 1.06251192, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.726921793738851, + "language_loss": 0.74594641, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76869899, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.5234375, + "step": 993, + "time_per_iteration": 2.4739816188812256 + }, + { + "auxiliary_loss_clip": 0.01214249, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.06326771, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 3.2517233877247396, + "language_loss": 0.78911841, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81197453, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 994, + "time_per_iteration": 2.5408835411071777 + }, + { + "auxiliary_loss_clip": 0.01214677, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.05768251, + "balance_loss_mlp": 1.06170893, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.42517884603863, + "language_loss": 0.79639304, + "learning_rate": 3.99067829878596e-06, + "loss": 0.81936711, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 995, + "time_per_iteration": 2.5062758922576904 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.05969059, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.536496545288829, + "language_loss": 0.86939722, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89217806, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 996, + "time_per_iteration": 2.5236001014709473 + }, + { + "auxiliary_loss_clip": 0.01217352, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.06309104, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.013698471354103, + "language_loss": 0.88192105, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90479505, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.546875, + "step": 997, + "time_per_iteration": 2.483116626739502 + }, + { + "auxiliary_loss_clip": 0.01079761, + "auxiliary_loss_mlp": 0.01004407, + "balance_loss_clip": 0.9993524, + "balance_loss_mlp": 1.01837301, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.020759515587473, + "language_loss": 0.75442117, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77526283, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.6171875, + "step": 998, + "time_per_iteration": 3.152331590652466 + }, + { + "auxiliary_loss_clip": 0.01213812, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.0626508, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8375420281697645, + "language_loss": 0.75796127, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7807765, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 999, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01212853, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.03575778, + "balance_loss_mlp": 1.05894446, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.9091686508511199, + "language_loss": 0.82658899, + "learning_rate": 3.990489563834943e-06, + "loss": 0.8493402, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5390625, + "step": 1000, + "time_per_iteration": 2.5369935035705566 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.06143069, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.4065508827059783, + "language_loss": 0.85644853, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8791759, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5390625, + "step": 1001, + "time_per_iteration": 2.4972190856933594 + }, + { + "auxiliary_loss_clip": 0.0120879, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.0587517, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.156321640703371, + "language_loss": 0.74386394, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5, + "step": 1002, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01211576, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.04019034, + "balance_loss_mlp": 1.06015134, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.1165374575777145, + "language_loss": 0.75346643, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77624118, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1003, + "time_per_iteration": 2.508817434310913 + }, + { + "auxiliary_loss_clip": 0.01219434, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.06255794, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2578292515807603, + "language_loss": 0.70071733, + "learning_rate": 3.990337217233437e-06, + "loss": 0.723571, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 1004, + "time_per_iteration": 2.480116844177246 + }, + { + "auxiliary_loss_clip": 0.01218526, + "auxiliary_loss_mlp": 0.01073584, + "balance_loss_clip": 1.04810917, + "balance_loss_mlp": 1.06360686, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.248554137518493, + "language_loss": 0.83246684, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85538793, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 1005, + "time_per_iteration": 2.449733018875122 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00306416, + "balance_loss_mlp": 1.0157814, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8959746990508154, + "language_loss": 0.59000289, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61085355, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.6171875, + "step": 1006, + "time_per_iteration": 3.1583423614501953 + }, + { + "auxiliary_loss_clip": 0.01209886, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.03203392, + "balance_loss_mlp": 1.05658197, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.271524805944984, + "language_loss": 0.7428897, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.53125, + "step": 1007, + "time_per_iteration": 2.49139666557312 + }, + { + "auxiliary_loss_clip": 0.01212867, + "auxiliary_loss_mlp": 0.01055047, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.05897522, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8583948299039934, + "language_loss": 0.80739897, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83007812, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 1008, + "time_per_iteration": 2.4990036487579346 + }, + { + "auxiliary_loss_clip": 0.01213893, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.06254637, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.935763632111394, + "language_loss": 0.77840835, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80110532, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.515625, + "step": 1009, + "time_per_iteration": 2.4785048961639404 + }, + { + "auxiliary_loss_clip": 0.01210213, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.06082368, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1058592784097567, + "language_loss": 0.93059653, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95329368, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4921875, + "step": 1010, + "time_per_iteration": 2.507596015930176 + }, + { + "auxiliary_loss_clip": 0.01219036, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.05885124, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1716667034247843, + "language_loss": 0.71846473, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74131954, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6015625, + "step": 1011, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01214432, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.05902421, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.5871469840663535, + "language_loss": 0.87542284, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89827204, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5546875, + "step": 1012, + "time_per_iteration": 2.4876151084899902 + }, + { + "auxiliary_loss_clip": 0.01206171, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05505085, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8956263482043672, + "language_loss": 0.76679665, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78946191, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 1013, + "time_per_iteration": 2.4874446392059326 + }, + { + "auxiliary_loss_clip": 0.01215089, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.05924904, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.654718290448769, + "language_loss": 0.85651302, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87933445, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5546875, + "step": 1014, + "time_per_iteration": 2.483774423599243 + }, + { + "auxiliary_loss_clip": 0.0122011, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_clip": 1.03996944, + "balance_loss_mlp": 1.06207335, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4287988001966028, + "language_loss": 0.72807163, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75094855, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.578125, + "step": 1015, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01207162, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.0576005, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6555956389633335, + "language_loss": 0.79197502, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8147307, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4921875, + "step": 1016, + "time_per_iteration": 2.5177054405212402 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.0571332, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.934405213560846, + "language_loss": 0.76170123, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78440881, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.53125, + "step": 1017, + "time_per_iteration": 2.517730951309204 + }, + { + "auxiliary_loss_clip": 0.01220983, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.06240773, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.873264658326973, + "language_loss": 0.86145842, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436329, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 1018, + "time_per_iteration": 5.324457883834839 + }, + { + "auxiliary_loss_clip": 0.01206709, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.045012, + "balance_loss_mlp": 1.05659163, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.696758126666256, + "language_loss": 0.77535981, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79814154, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5, + "step": 1019, + "time_per_iteration": 2.453768253326416 + }, + { + "auxiliary_loss_clip": 0.01210848, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.05749679, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.8458417378275351, + "language_loss": 0.84254557, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86526895, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 1020, + "time_per_iteration": 2.5126123428344727 + }, + { + "auxiliary_loss_clip": 0.01217116, + "auxiliary_loss_mlp": 0.01060663, + "balance_loss_clip": 1.0352596, + "balance_loss_mlp": 1.06234074, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.186416819505148, + "language_loss": 0.79234397, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81512177, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1021, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.01207219, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.05748677, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.2026341390443434, + "language_loss": 0.87493509, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89765131, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.5, + "step": 1022, + "time_per_iteration": 2.441298007965088 + }, + { + "auxiliary_loss_clip": 0.01213359, + "auxiliary_loss_mlp": 0.0106856, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.06052542, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.752710779550117, + "language_loss": 0.82776564, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85058486, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 1023, + "time_per_iteration": 2.5027952194213867 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 0.99944335, + "balance_loss_mlp": 1.01796818, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8999264202466762, + "language_loss": 0.65078986, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67162001, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.0456543, + "router_z_loss_mlp": 0.609375, + "step": 1024, + "time_per_iteration": 3.0969655513763428 + }, + { + "auxiliary_loss_clip": 0.01212272, + "auxiliary_loss_mlp": 0.01066841, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.05936897, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.9303372998519377, + "language_loss": 0.88293028, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90572149, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 1025, + "time_per_iteration": 2.5229876041412354 + }, + { + "auxiliary_loss_clip": 0.01212316, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.05916524, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.519276165786755, + "language_loss": 0.84567487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86839235, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 1026, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.01212365, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.04614556, + "balance_loss_mlp": 1.05798197, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9431802827698534, + "language_loss": 0.82320756, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84604132, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 1027, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.01209611, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.05799866, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.262386050001272, + "language_loss": 0.84232426, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86500365, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1028, + "time_per_iteration": 2.4485137462615967 + }, + { + "auxiliary_loss_clip": 0.01077664, + "auxiliary_loss_mlp": 0.01009618, + "balance_loss_clip": 1.00544536, + "balance_loss_mlp": 1.01686025, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9476883841381922, + "language_loss": 0.60497737, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6258502, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.609375, + "step": 1029, + "time_per_iteration": 2.8714137077331543 + }, + { + "auxiliary_loss_clip": 0.0120304, + "auxiliary_loss_mlp": 0.01066238, + "balance_loss_clip": 1.0419786, + "balance_loss_mlp": 1.05338669, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.297452518318954, + "language_loss": 0.82309926, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84579194, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4921875, + "step": 1030, + "time_per_iteration": 2.4705348014831543 + }, + { + "auxiliary_loss_clip": 0.01214194, + "auxiliary_loss_mlp": 0.01071397, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.06025672, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.391039807046215, + "language_loss": 0.80262065, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82547653, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1031, + "time_per_iteration": 2.447964906692505 + }, + { + "auxiliary_loss_clip": 0.0121101, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.05865717, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.6245278130098144, + "language_loss": 0.77141201, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79427713, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5234375, + "step": 1032, + "time_per_iteration": 2.475891590118408 + }, + { + "auxiliary_loss_clip": 0.01205906, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_clip": 1.04350805, + "balance_loss_mlp": 1.05307126, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.949793190746779, + "language_loss": 0.89276892, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91552204, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1033, + "time_per_iteration": 2.5332658290863037 + }, + { + "auxiliary_loss_clip": 0.01212647, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03437293, + "balance_loss_mlp": 1.05739737, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.160025730572359, + "language_loss": 0.84795135, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87066996, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5546875, + "step": 1034, + "time_per_iteration": 2.507636785507202 + }, + { + "auxiliary_loss_clip": 0.01202421, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.03399241, + "balance_loss_mlp": 1.05694687, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 3.176440156188905, + "language_loss": 0.81156218, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83418697, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.453125, + "step": 1035, + "time_per_iteration": 2.624635696411133 + }, + { + "auxiliary_loss_clip": 0.01212161, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_clip": 1.04051828, + "balance_loss_mlp": 1.06080353, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.252599829484163, + "language_loss": 0.78701359, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80981934, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.515625, + "step": 1036, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01204167, + "auxiliary_loss_mlp": 0.01068533, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.05620134, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.670767972712633, + "language_loss": 0.86802149, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8907485, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1037, + "time_per_iteration": 2.506011724472046 + }, + { + "auxiliary_loss_clip": 0.01206019, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.02990723, + "balance_loss_mlp": 1.05728471, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.1914513209480933, + "language_loss": 0.81051469, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83314991, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1038, + "time_per_iteration": 2.486758232116699 + }, + { + "auxiliary_loss_clip": 0.01205947, + "auxiliary_loss_mlp": 0.01072566, + "balance_loss_clip": 1.04587555, + "balance_loss_mlp": 1.05856836, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.3663261426095965, + "language_loss": 0.85336804, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87615323, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1039, + "time_per_iteration": 2.489241123199463 + }, + { + "auxiliary_loss_clip": 0.01207559, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.05744672, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.9774289629637263, + "language_loss": 0.80853289, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83128488, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5, + "step": 1040, + "time_per_iteration": 2.480022668838501 + }, + { + "auxiliary_loss_clip": 0.01213203, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.06227219, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.535271913081881, + "language_loss": 0.69440711, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71721661, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5078125, + "step": 1041, + "time_per_iteration": 2.5417978763580322 + }, + { + "auxiliary_loss_clip": 0.01210541, + "auxiliary_loss_mlp": 0.0106006, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.05743289, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9564735382917973, + "language_loss": 0.80983013, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83253616, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.53125, + "step": 1042, + "time_per_iteration": 2.478926181793213 + }, + { + "auxiliary_loss_clip": 0.01210242, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.05925727, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.9466384226705415, + "language_loss": 0.76463902, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78732038, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.515625, + "step": 1043, + "time_per_iteration": 2.6262781620025635 + }, + { + "auxiliary_loss_clip": 0.01203702, + "auxiliary_loss_mlp": 0.01066445, + "balance_loss_clip": 1.04174471, + "balance_loss_mlp": 1.05835676, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8860277298285366, + "language_loss": 0.92454541, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94724691, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1044, + "time_per_iteration": 2.4886953830718994 + }, + { + "auxiliary_loss_clip": 0.01204359, + "auxiliary_loss_mlp": 0.01073486, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.05475259, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.9539908597303346, + "language_loss": 0.8581354, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88091385, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5, + "step": 1045, + "time_per_iteration": 2.5382347106933594 + }, + { + "auxiliary_loss_clip": 0.01203094, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.04900479, + "balance_loss_mlp": 1.05618775, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.0798822187092094, + "language_loss": 0.77675486, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.79952335, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.46875, + "step": 1046, + "time_per_iteration": 2.548157215118408 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.01074859, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.05837655, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.197016946040243, + "language_loss": 0.77317166, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79598629, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4765625, + "step": 1047, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.0121283, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.05874014, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.3103480986625753, + "language_loss": 0.7696203, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79236162, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1048, + "time_per_iteration": 2.636072874069214 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.0583266, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.2069714466600656, + "language_loss": 0.77757037, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80039394, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1049, + "time_per_iteration": 2.5173420906066895 + }, + { + "auxiliary_loss_clip": 0.01207985, + "auxiliary_loss_mlp": 0.01065489, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.05734015, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.316298014027776, + "language_loss": 0.83165503, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85438979, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5078125, + "step": 1050, + "time_per_iteration": 2.4742541313171387 + }, + { + "auxiliary_loss_clip": 0.01204381, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.05776763, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.1475970013183563, + "language_loss": 0.76909173, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79176152, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1051, + "time_per_iteration": 2.4629740715026855 + }, + { + "auxiliary_loss_clip": 0.01207556, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.05788827, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.090947022989376, + "language_loss": 0.80053556, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82331514, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4921875, + "step": 1052, + "time_per_iteration": 2.4729230403900146 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01064567, + "balance_loss_clip": 1.03911567, + "balance_loss_mlp": 1.05839717, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.21177767113968, + "language_loss": 0.78088665, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362272, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5078125, + "step": 1053, + "time_per_iteration": 2.433969736099243 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.0578481, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8421697124920164, + "language_loss": 0.84737611, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8700186, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.515625, + "step": 1054, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.01205973, + "auxiliary_loss_mlp": 0.01065192, + "balance_loss_clip": 1.04051518, + "balance_loss_mlp": 1.05870843, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.9255333357469135, + "language_loss": 0.8566432, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87935483, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4765625, + "step": 1055, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.0121179, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_clip": 1.0451932, + "balance_loss_mlp": 1.05891657, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.390503126540762, + "language_loss": 0.80966836, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83249724, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1056, + "time_per_iteration": 2.4944088459014893 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.05503476, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.70684555522199, + "language_loss": 0.81153649, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83431304, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 1057, + "time_per_iteration": 2.5327882766723633 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03401923, + "balance_loss_mlp": 1.054492, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 2.2830641052403826, + "language_loss": 0.8369416, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85947585, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.421875, + "step": 1058, + "time_per_iteration": 2.4742424488067627 + }, + { + "auxiliary_loss_clip": 0.01208572, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.05714464, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 1.9712110015930453, + "language_loss": 0.87264961, + "learning_rate": 3.988120036328651e-06, + "loss": 0.8954125, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.515625, + "step": 1059, + "time_per_iteration": 5.514882564544678 + }, + { + "auxiliary_loss_clip": 0.01213823, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.04273927, + "balance_loss_mlp": 1.06130195, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.227642611819728, + "language_loss": 0.9117676, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9345876, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 1060, + "time_per_iteration": 3.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01204952, + "auxiliary_loss_mlp": 0.01062848, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.05582809, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9159755464944204, + "language_loss": 0.87713706, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89981508, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4921875, + "step": 1061, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01213048, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.05683804, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.167309005799961, + "language_loss": 0.771905, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79469687, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5625, + "step": 1062, + "time_per_iteration": 2.5576398372650146 + }, + { + "auxiliary_loss_clip": 0.01206834, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 1.05504322, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.0414192004570872, + "language_loss": 0.86835265, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89105946, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1063, + "time_per_iteration": 2.472473382949829 + }, + { + "auxiliary_loss_clip": 0.01206458, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.05539751, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.0150359019026185, + "language_loss": 0.8051579, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82785529, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1064, + "time_per_iteration": 2.478205919265747 + }, + { + "auxiliary_loss_clip": 0.01207278, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.04409075, + "balance_loss_mlp": 1.05682254, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.254194289767691, + "language_loss": 0.84650666, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1065, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01207067, + "auxiliary_loss_mlp": 0.01055171, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.05966115, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.66169186591579, + "language_loss": 0.68201709, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70463943, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.46875, + "step": 1066, + "time_per_iteration": 2.6294829845428467 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01076738, + "balance_loss_clip": 1.05003476, + "balance_loss_mlp": 1.05877519, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.704601442813356, + "language_loss": 0.90345579, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9262861, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1067, + "time_per_iteration": 2.459721565246582 + }, + { + "auxiliary_loss_clip": 0.01207052, + "auxiliary_loss_mlp": 0.01068129, + "balance_loss_clip": 1.04364336, + "balance_loss_mlp": 1.05625772, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.8684947664405436, + "language_loss": 0.8343029, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.515625, + "step": 1068, + "time_per_iteration": 2.4611129760742188 + }, + { + "auxiliary_loss_clip": 0.01205753, + "auxiliary_loss_mlp": 0.01064379, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.05991328, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.4683216708617053, + "language_loss": 0.89402264, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91672397, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.453125, + "step": 1069, + "time_per_iteration": 2.486241340637207 + }, + { + "auxiliary_loss_clip": 0.01206711, + "auxiliary_loss_mlp": 0.01082225, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 1.05718124, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6076700233042396, + "language_loss": 0.95764256, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98053193, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5, + "step": 1070, + "time_per_iteration": 2.413357734680176 + }, + { + "auxiliary_loss_clip": 0.01209924, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.0309608, + "balance_loss_mlp": 1.05859673, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8004745601001504, + "language_loss": 0.8819589, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90463126, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.515625, + "step": 1071, + "time_per_iteration": 2.4717295169830322 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.056633, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6498592642907823, + "language_loss": 0.75996184, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78252238, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.4765625, + "step": 1072, + "time_per_iteration": 2.486936330795288 + }, + { + "auxiliary_loss_clip": 0.01207782, + "auxiliary_loss_mlp": 0.010661, + "balance_loss_clip": 1.03951669, + "balance_loss_mlp": 1.05679154, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.95165590675185, + "language_loss": 0.80415034, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82688916, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1073, + "time_per_iteration": 2.476189613342285 + }, + { + "auxiliary_loss_clip": 0.01200054, + "auxiliary_loss_mlp": 0.01059954, + "balance_loss_clip": 1.03563547, + "balance_loss_mlp": 1.05634785, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7105520573330508, + "language_loss": 0.80205524, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82465529, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4375, + "step": 1074, + "time_per_iteration": 2.499133586883545 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.03469074, + "balance_loss_mlp": 1.05560029, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.398999995550556, + "language_loss": 0.79203326, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81468183, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1075, + "time_per_iteration": 2.46777606010437 + }, + { + "auxiliary_loss_clip": 0.01207545, + "auxiliary_loss_mlp": 0.01064646, + "balance_loss_clip": 1.04086363, + "balance_loss_mlp": 1.05960226, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.7671348430420712, + "language_loss": 0.87819242, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90091443, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.484375, + "step": 1076, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01199028, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.02918351, + "balance_loss_mlp": 1.05429745, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.1388407300528534, + "language_loss": 0.80692923, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82945681, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1077, + "time_per_iteration": 2.4290995597839355 + }, + { + "auxiliary_loss_clip": 0.01211867, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.04566646, + "balance_loss_mlp": 1.05862093, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.618517400605346, + "language_loss": 0.91640681, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93924248, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.53125, + "step": 1078, + "time_per_iteration": 2.500995635986328 + }, + { + "auxiliary_loss_clip": 0.01212712, + "auxiliary_loss_mlp": 0.01062475, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.05874825, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.106125999672554, + "language_loss": 0.78772497, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81047684, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1079, + "time_per_iteration": 2.4510841369628906 + }, + { + "auxiliary_loss_clip": 0.01204732, + "auxiliary_loss_mlp": 0.01061369, + "balance_loss_clip": 1.03619218, + "balance_loss_mlp": 1.05602205, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.051955253501364, + "language_loss": 0.69555283, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7182138, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1080, + "time_per_iteration": 2.5024302005767822 + }, + { + "auxiliary_loss_clip": 0.01204586, + "auxiliary_loss_mlp": 0.01063302, + "balance_loss_clip": 1.03649211, + "balance_loss_mlp": 1.05477285, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.85895294752556, + "language_loss": 0.72094852, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74362737, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5, + "step": 1081, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01201777, + "auxiliary_loss_mlp": 0.01064533, + "balance_loss_clip": 1.03867674, + "balance_loss_mlp": 1.0554111, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6422342029105863, + "language_loss": 0.84621316, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86887628, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.46875, + "step": 1082, + "time_per_iteration": 2.459564447402954 + }, + { + "auxiliary_loss_clip": 0.01214386, + "auxiliary_loss_mlp": 0.01067955, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.05817008, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.177850298461163, + "language_loss": 0.8303026, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85312605, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5625, + "step": 1083, + "time_per_iteration": 2.504584550857544 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.05794787, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6002614807121227, + "language_loss": 0.79689312, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81960905, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.484375, + "step": 1084, + "time_per_iteration": 2.4530820846557617 + }, + { + "auxiliary_loss_clip": 0.01204762, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.05634058, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.1191367521188074, + "language_loss": 0.66211331, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68476963, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1085, + "time_per_iteration": 2.5733256340026855 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.05400848, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.9997547556569089, + "language_loss": 0.76998973, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79266769, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1086, + "time_per_iteration": 2.4958763122558594 + }, + { + "auxiliary_loss_clip": 0.01199669, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.03763306, + "balance_loss_mlp": 1.05291176, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1546414392836977, + "language_loss": 0.85154319, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87417287, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1087, + "time_per_iteration": 2.4456934928894043 + }, + { + "auxiliary_loss_clip": 0.01204231, + "auxiliary_loss_mlp": 0.01061167, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.05594206, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7775330808837086, + "language_loss": 0.77970594, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80235994, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1088, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01204134, + "auxiliary_loss_mlp": 0.01066637, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.05602646, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.9036978890371752, + "language_loss": 0.71191919, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73462689, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.484375, + "step": 1089, + "time_per_iteration": 2.4569168090820312 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.03566289, + "balance_loss_mlp": 1.05729651, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7924808842614686, + "language_loss": 0.85504186, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.8776831, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1090, + "time_per_iteration": 2.4624812602996826 + }, + { + "auxiliary_loss_clip": 0.01204567, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.05594897, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.2382380061135945, + "language_loss": 0.72027361, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74294031, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.484375, + "step": 1091, + "time_per_iteration": 2.4911999702453613 + }, + { + "auxiliary_loss_clip": 0.01201014, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05507159, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.7948943762047525, + "language_loss": 0.82525271, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8478815, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4609375, + "step": 1092, + "time_per_iteration": 2.510835886001587 + }, + { + "auxiliary_loss_clip": 0.01205888, + "auxiliary_loss_mlp": 0.01064535, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.05484402, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 4.994634192306823, + "language_loss": 0.71286589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73557013, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.515625, + "step": 1093, + "time_per_iteration": 2.528994560241699 + }, + { + "auxiliary_loss_clip": 0.01204526, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.05701041, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.8259988866114194, + "language_loss": 0.87971264, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90238965, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1094, + "time_per_iteration": 2.50201678276062 + }, + { + "auxiliary_loss_clip": 0.01205803, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_clip": 1.0350548, + "balance_loss_mlp": 1.0575459, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6349502946236962, + "language_loss": 0.81364405, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83632231, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.484375, + "step": 1095, + "time_per_iteration": 2.4947729110717773 + }, + { + "auxiliary_loss_clip": 0.01200923, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03414297, + "balance_loss_mlp": 1.05544913, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4379029944224215, + "language_loss": 0.69712919, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7197119, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.453125, + "step": 1096, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01206873, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.04451883, + "balance_loss_mlp": 1.0592947, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7715259730160258, + "language_loss": 0.77498722, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79775411, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1097, + "time_per_iteration": 2.4872820377349854 + }, + { + "auxiliary_loss_clip": 0.0120653, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03814423, + "balance_loss_mlp": 1.05785179, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.7376479388989727, + "language_loss": 0.77846545, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80116618, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.484375, + "step": 1098, + "time_per_iteration": 2.583075761795044 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.04483891, + "balance_loss_mlp": 1.05739522, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9398633669636132, + "language_loss": 0.81675154, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83951151, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1099, + "time_per_iteration": 2.446969985961914 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.02801692, + "balance_loss_mlp": 1.0519135, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0018625732470245, + "language_loss": 0.82619941, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84868616, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4375, + "step": 1100, + "time_per_iteration": 2.4545743465423584 + }, + { + "auxiliary_loss_clip": 0.01204382, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.05827951, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.362466383483127, + "language_loss": 0.73439336, + "learning_rate": 3.986273334538702e-06, + "loss": 0.7570439, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4609375, + "step": 1101, + "time_per_iteration": 6.740786790847778 + }, + { + "auxiliary_loss_clip": 0.0119874, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_clip": 1.03829539, + "balance_loss_mlp": 1.05373132, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.46656505058328, + "language_loss": 0.86047602, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88308758, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1102, + "time_per_iteration": 2.4480903148651123 + }, + { + "auxiliary_loss_clip": 0.01200394, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.05588222, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0494810685505995, + "language_loss": 0.81707513, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83965349, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1103, + "time_per_iteration": 2.4419338703155518 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.05891824, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.7865556655629211, + "language_loss": 0.82059169, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84326148, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.4453125, + "step": 1104, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01195268, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.05232382, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6089454783719872, + "language_loss": 0.80542791, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82785821, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1105, + "time_per_iteration": 2.524385929107666 + }, + { + "auxiliary_loss_clip": 0.01197193, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.05697632, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8452117827451007, + "language_loss": 0.96738935, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98996383, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.40625, + "step": 1106, + "time_per_iteration": 2.455122470855713 + }, + { + "auxiliary_loss_clip": 0.01204143, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.03436136, + "balance_loss_mlp": 1.05509543, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9568581550144768, + "language_loss": 0.82766026, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85030258, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4921875, + "step": 1107, + "time_per_iteration": 2.4554357528686523 + }, + { + "auxiliary_loss_clip": 0.01077187, + "auxiliary_loss_mlp": 0.01010186, + "balance_loss_clip": 1.0061568, + "balance_loss_mlp": 1.01696265, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8235952583150978, + "language_loss": 0.56729984, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58817357, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.6015625, + "step": 1108, + "time_per_iteration": 3.0248770713806152 + }, + { + "auxiliary_loss_clip": 0.01200435, + "auxiliary_loss_mlp": 0.01065514, + "balance_loss_clip": 1.04034865, + "balance_loss_mlp": 1.05397463, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.4203653272420693, + "language_loss": 0.72493321, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74759269, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1109, + "time_per_iteration": 2.4559717178344727 + }, + { + "auxiliary_loss_clip": 0.01197389, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.05389571, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.084593088047962, + "language_loss": 0.78256035, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1110, + "time_per_iteration": 2.4989912509918213 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.05598152, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.197430378352105, + "language_loss": 0.71290207, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73549128, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1111, + "time_per_iteration": 2.5445287227630615 + }, + { + "auxiliary_loss_clip": 0.0120524, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.03833365, + "balance_loss_mlp": 1.05788755, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8078370838130353, + "language_loss": 0.78315711, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80583429, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4765625, + "step": 1112, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01203172, + "auxiliary_loss_mlp": 0.01058254, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.05794001, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.0430507180103943, + "language_loss": 0.78819263, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1113, + "time_per_iteration": 2.4637296199798584 + }, + { + "auxiliary_loss_clip": 0.01195153, + "auxiliary_loss_mlp": 0.01053406, + "balance_loss_clip": 1.03056598, + "balance_loss_mlp": 1.05255365, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.035611213247421, + "language_loss": 0.82393003, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84641558, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.4296875, + "step": 1114, + "time_per_iteration": 2.434006452560425 + }, + { + "auxiliary_loss_clip": 0.01076, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00463712, + "balance_loss_mlp": 1.0165143, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8339607525511222, + "language_loss": 0.58126414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60211033, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.59375, + "step": 1115, + "time_per_iteration": 3.020782709121704 + }, + { + "auxiliary_loss_clip": 0.01200335, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.03427422, + "balance_loss_mlp": 1.05479646, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8263674595854464, + "language_loss": 0.91123891, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93383968, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1116, + "time_per_iteration": 2.446439504623413 + }, + { + "auxiliary_loss_clip": 0.01209259, + "auxiliary_loss_mlp": 0.01067721, + "balance_loss_clip": 1.04323506, + "balance_loss_mlp": 1.06065357, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.019283248682947, + "language_loss": 0.8709814, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89375114, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.484375, + "step": 1117, + "time_per_iteration": 2.486212968826294 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.00250196, + "balance_loss_mlp": 1.01550937, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9454776991467404, + "language_loss": 0.59798217, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6187892, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.5859375, + "step": 1118, + "time_per_iteration": 3.0197594165802 + }, + { + "auxiliary_loss_clip": 0.01201284, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.05418777, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.7568577616727468, + "language_loss": 0.83498162, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85755914, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4765625, + "step": 1119, + "time_per_iteration": 2.4535257816314697 + }, + { + "auxiliary_loss_clip": 0.01199216, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.0562222, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.8165724331790314, + "language_loss": 0.8480413, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87062794, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.4296875, + "step": 1120, + "time_per_iteration": 2.533182382583618 + }, + { + "auxiliary_loss_clip": 0.01208718, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.04269981, + "balance_loss_mlp": 1.0602659, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.032922437281707, + "language_loss": 0.78959441, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81235266, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.484375, + "step": 1121, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_clip": 1.00033593, + "balance_loss_mlp": 1.0132587, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7615352754050735, + "language_loss": 0.58346939, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60423702, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.5859375, + "step": 1122, + "time_per_iteration": 3.2087855339050293 + }, + { + "auxiliary_loss_clip": 0.0120309, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.0584271, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0430211727412098, + "language_loss": 0.71546745, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73815745, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4453125, + "step": 1123, + "time_per_iteration": 2.5017640590667725 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01057362, + "balance_loss_clip": 1.03216124, + "balance_loss_mlp": 1.05484593, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8376842720828679, + "language_loss": 0.79288971, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81548035, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1124, + "time_per_iteration": 2.4980688095092773 + }, + { + "auxiliary_loss_clip": 0.01196564, + "auxiliary_loss_mlp": 0.01054377, + "balance_loss_clip": 1.03204954, + "balance_loss_mlp": 1.05469489, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.0983993205372253, + "language_loss": 0.71198726, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73449671, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.421875, + "step": 1125, + "time_per_iteration": 2.4704325199127197 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.04247451, + "balance_loss_mlp": 1.05620742, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.9171204901367243, + "language_loss": 0.80814254, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83081663, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.421875, + "step": 1126, + "time_per_iteration": 2.5046803951263428 + }, + { + "auxiliary_loss_clip": 0.01070877, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 0.9986586, + "balance_loss_mlp": 1.01286924, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7804116507992601, + "language_loss": 0.59733766, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61807376, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.578125, + "step": 1127, + "time_per_iteration": 3.0877249240875244 + }, + { + "auxiliary_loss_clip": 0.01199514, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 1.05723238, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.13286114653412, + "language_loss": 0.81392133, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83648497, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.421875, + "step": 1128, + "time_per_iteration": 2.5406885147094727 + }, + { + "auxiliary_loss_clip": 0.01208088, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.02692807, + "balance_loss_mlp": 1.0598706, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 3.047918834731733, + "language_loss": 0.76034033, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78294069, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.484375, + "step": 1129, + "time_per_iteration": 2.486829996109009 + }, + { + "auxiliary_loss_clip": 0.01201584, + "auxiliary_loss_mlp": 0.01061333, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.05536139, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8621491947103987, + "language_loss": 0.72340226, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74603146, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4609375, + "step": 1130, + "time_per_iteration": 2.6195991039276123 + }, + { + "auxiliary_loss_clip": 0.01197626, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.05584192, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.3479224842049917, + "language_loss": 0.80624223, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82885444, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.421875, + "step": 1131, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.05550814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1673533627141652, + "language_loss": 0.8104949, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83313811, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.40625, + "step": 1132, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01199688, + "auxiliary_loss_mlp": 0.01069367, + "balance_loss_clip": 1.04525137, + "balance_loss_mlp": 1.05629563, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.450417149602266, + "language_loss": 0.63629937, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65898991, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4296875, + "step": 1133, + "time_per_iteration": 2.7164230346679688 + }, + { + "auxiliary_loss_clip": 0.01203203, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.05427325, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.5027083277203963, + "language_loss": 0.74811196, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77073789, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1134, + "time_per_iteration": 2.420506000518799 + }, + { + "auxiliary_loss_clip": 0.01201452, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.05952573, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0759609389962037, + "language_loss": 0.87245119, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89510942, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.421875, + "step": 1135, + "time_per_iteration": 2.464738607406616 + }, + { + "auxiliary_loss_clip": 0.01200809, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.05388534, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.383261313924855, + "language_loss": 0.78335494, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80591798, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.46875, + "step": 1136, + "time_per_iteration": 2.4486002922058105 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.06089664, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 3.2008110915617207, + "language_loss": 0.83941948, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86222148, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.453125, + "step": 1137, + "time_per_iteration": 2.5714635848999023 + }, + { + "auxiliary_loss_clip": 0.01199575, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.05628538, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.067587662099544, + "language_loss": 0.78669268, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80930662, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1138, + "time_per_iteration": 2.459437370300293 + }, + { + "auxiliary_loss_clip": 0.01202271, + "auxiliary_loss_mlp": 0.01059469, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.05729747, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.606905885529735, + "language_loss": 0.85683703, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87945449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1139, + "time_per_iteration": 2.5198936462402344 + }, + { + "auxiliary_loss_clip": 0.01201061, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.03297663, + "balance_loss_mlp": 1.05803108, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7528507300348692, + "language_loss": 0.74826896, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77085567, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4296875, + "step": 1140, + "time_per_iteration": 2.6609106063842773 + }, + { + "auxiliary_loss_clip": 0.01198151, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.03698146, + "balance_loss_mlp": 1.05620885, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.210262717529583, + "language_loss": 0.68083167, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70343632, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.421875, + "step": 1141, + "time_per_iteration": 2.5661122798919678 + }, + { + "auxiliary_loss_clip": 0.01205913, + "auxiliary_loss_mlp": 0.0106664, + "balance_loss_clip": 1.04098654, + "balance_loss_mlp": 1.05848837, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.82433360121009, + "language_loss": 0.79399014, + "learning_rate": 3.984342445114538e-06, + "loss": 0.8167156, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1142, + "time_per_iteration": 2.5499107837677 + }, + { + "auxiliary_loss_clip": 0.0120232, + "auxiliary_loss_mlp": 0.01061074, + "balance_loss_clip": 1.03650475, + "balance_loss_mlp": 1.05730164, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6821535193321122, + "language_loss": 0.68701231, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70964622, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1143, + "time_per_iteration": 5.380373239517212 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03670955, + "balance_loss_mlp": 1.05885804, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.8434796401844256, + "language_loss": 0.74694496, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76950091, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.375, + "step": 1144, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.01204332, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.05654943, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.296493270147659, + "language_loss": 0.91720247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93988806, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4765625, + "step": 1145, + "time_per_iteration": 2.44307017326355 + }, + { + "auxiliary_loss_clip": 0.01206887, + "auxiliary_loss_mlp": 0.01067692, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.05779576, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4650333910918865, + "language_loss": 0.82189268, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84463847, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.4921875, + "step": 1146, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.01198651, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.05755806, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5935722439127744, + "language_loss": 0.85150343, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87410891, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.4140625, + "step": 1147, + "time_per_iteration": 2.48410701751709 + }, + { + "auxiliary_loss_clip": 0.01201275, + "auxiliary_loss_mlp": 0.01064743, + "balance_loss_clip": 1.03988767, + "balance_loss_mlp": 1.05699074, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3932988353276645, + "language_loss": 0.86235052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88501072, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1148, + "time_per_iteration": 2.455441951751709 + }, + { + "auxiliary_loss_clip": 0.01199305, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.0337863, + "balance_loss_mlp": 1.05560231, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.070658514783469, + "language_loss": 0.69185412, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71442747, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4375, + "step": 1149, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.0289495, + "balance_loss_mlp": 1.05679548, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.828663566846353, + "language_loss": 0.84069788, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86328113, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4609375, + "step": 1150, + "time_per_iteration": 2.509122371673584 + }, + { + "auxiliary_loss_clip": 0.01206199, + "auxiliary_loss_mlp": 0.01058671, + "balance_loss_clip": 1.03453135, + "balance_loss_mlp": 1.06116164, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.57752822218259, + "language_loss": 0.82044697, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84309566, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1151, + "time_per_iteration": 2.420128345489502 + }, + { + "auxiliary_loss_clip": 0.01201904, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.06011868, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8279979065740934, + "language_loss": 0.85587418, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87851566, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4140625, + "step": 1152, + "time_per_iteration": 2.498180866241455 + }, + { + "auxiliary_loss_clip": 0.01198565, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03747797, + "balance_loss_mlp": 1.05767703, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1251557516582995, + "language_loss": 0.90536988, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92796487, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1153, + "time_per_iteration": 2.422480821609497 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.03035152, + "balance_loss_mlp": 1.05790865, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.190017778582164, + "language_loss": 0.81363368, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83618748, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4296875, + "step": 1154, + "time_per_iteration": 2.528118848800659 + }, + { + "auxiliary_loss_clip": 0.01202754, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_clip": 1.04476249, + "balance_loss_mlp": 1.06078768, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 32.79102955334026, + "language_loss": 0.7560131, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77872109, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.421875, + "step": 1155, + "time_per_iteration": 2.5010287761688232 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01059268, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.05511975, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.6800097473238784, + "language_loss": 0.71119213, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73374593, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1156, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.05711889, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.0301788984863918, + "language_loss": 0.75299567, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77569139, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1157, + "time_per_iteration": 2.4654574394226074 + }, + { + "auxiliary_loss_clip": 0.0119867, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.03240204, + "balance_loss_mlp": 1.0551796, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6687264459000366, + "language_loss": 0.71895158, + "learning_rate": 3.983554608032982e-06, + "loss": 0.7415098, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4375, + "step": 1158, + "time_per_iteration": 2.53495454788208 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.03764284, + "balance_loss_mlp": 1.05718327, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9777890540291267, + "language_loss": 0.79796576, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82061857, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1159, + "time_per_iteration": 2.511402130126953 + }, + { + "auxiliary_loss_clip": 0.01205534, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03938961, + "balance_loss_mlp": 1.05860782, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 5.094070474761981, + "language_loss": 0.810929, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83364576, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1160, + "time_per_iteration": 2.4580883979797363 + }, + { + "auxiliary_loss_clip": 0.01197544, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.05382752, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.8746427931419856, + "language_loss": 0.75958532, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78215194, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1161, + "time_per_iteration": 2.5046370029449463 + }, + { + "auxiliary_loss_clip": 0.01195466, + "auxiliary_loss_mlp": 0.01062077, + "balance_loss_clip": 1.03642368, + "balance_loss_mlp": 1.05299318, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.806880077375887, + "language_loss": 0.8285073, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85108274, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1162, + "time_per_iteration": 2.4779040813446045 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01055987, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.05355024, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.8779282806609423, + "language_loss": 0.79095101, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81345057, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1163, + "time_per_iteration": 2.515899181365967 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.03728819, + "balance_loss_mlp": 1.05438375, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1142628107327233, + "language_loss": 0.79552305, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81814498, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4375, + "step": 1164, + "time_per_iteration": 2.476428747177124 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.0401659, + "balance_loss_mlp": 1.05587661, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4863162511761774, + "language_loss": 0.73198837, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75463963, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4375, + "step": 1165, + "time_per_iteration": 2.5053012371063232 + }, + { + "auxiliary_loss_clip": 0.01196916, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.03225732, + "balance_loss_mlp": 1.05550849, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.690867173089168, + "language_loss": 0.81019437, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83273077, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4140625, + "step": 1166, + "time_per_iteration": 2.5378963947296143 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0534389, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.886682439277329, + "language_loss": 0.84443307, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86687052, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1167, + "time_per_iteration": 2.5244622230529785 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.05693448, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.687845484368313, + "language_loss": 0.89423364, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9168179, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1168, + "time_per_iteration": 2.49411678314209 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.05737031, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.629371766417224, + "language_loss": 0.88661098, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9093399, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.46875, + "step": 1169, + "time_per_iteration": 2.4795143604278564 + }, + { + "auxiliary_loss_clip": 0.01203226, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.05864179, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0154006947860705, + "language_loss": 0.84000075, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86272925, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4375, + "step": 1170, + "time_per_iteration": 2.501016616821289 + }, + { + "auxiliary_loss_clip": 0.01199625, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.05753505, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.945268169582358, + "language_loss": 0.75220597, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77485222, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.421875, + "step": 1171, + "time_per_iteration": 2.4456748962402344 + }, + { + "auxiliary_loss_clip": 0.01199689, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.05765915, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.2481396571627923, + "language_loss": 0.88848841, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91106689, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1172, + "time_per_iteration": 2.4970321655273438 + }, + { + "auxiliary_loss_clip": 0.01202846, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_clip": 1.02776241, + "balance_loss_mlp": 1.05584753, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6229718682058278, + "language_loss": 0.8212136, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84377271, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1173, + "time_per_iteration": 2.485822916030884 + }, + { + "auxiliary_loss_clip": 0.01200818, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.05786848, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.056745883983527, + "language_loss": 0.81825697, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.840877, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1174, + "time_per_iteration": 2.4564759731292725 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.0569849, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.925446476900023, + "language_loss": 0.8511939, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87379438, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.421875, + "step": 1175, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.0120243, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.05922508, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.9716433558257507, + "language_loss": 0.8303746, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4375, + "step": 1176, + "time_per_iteration": 2.511456251144409 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.05717707, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.3318965992312, + "language_loss": 0.74563694, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76822478, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.40625, + "step": 1177, + "time_per_iteration": 2.460102081298828 + }, + { + "auxiliary_loss_clip": 0.01207406, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.033476, + "balance_loss_mlp": 1.06167924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.2206541819979995, + "language_loss": 0.86031914, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88298053, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4609375, + "step": 1178, + "time_per_iteration": 2.4605627059936523 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_clip": 1.00349271, + "balance_loss_mlp": 1.02766943, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8386980392448491, + "language_loss": 0.63242435, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65337497, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.59375, + "step": 1179, + "time_per_iteration": 3.156688690185547 + }, + { + "auxiliary_loss_clip": 0.01207076, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.06038809, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.3853497849810945, + "language_loss": 0.83326972, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85596782, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.46875, + "step": 1180, + "time_per_iteration": 2.4823896884918213 + }, + { + "auxiliary_loss_clip": 0.01200915, + "auxiliary_loss_mlp": 0.01065839, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.05910683, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1921067510196446, + "language_loss": 0.88595563, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90862316, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.421875, + "step": 1181, + "time_per_iteration": 2.505908727645874 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.03607869, + "balance_loss_mlp": 1.05944347, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2303634282095257, + "language_loss": 0.83314365, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85575759, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4140625, + "step": 1182, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01199287, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_clip": 1.04006529, + "balance_loss_mlp": 1.06100821, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.671395976555463, + "language_loss": 0.7925818, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81523037, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3828125, + "step": 1183, + "time_per_iteration": 2.5057172775268555 + }, + { + "auxiliary_loss_clip": 0.01201972, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.05550563, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6492838430830963, + "language_loss": 0.78910172, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8117131, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.46875, + "step": 1184, + "time_per_iteration": 5.494150638580322 + }, + { + "auxiliary_loss_clip": 0.01196982, + "auxiliary_loss_mlp": 0.01057128, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.05884266, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.546293211356889, + "language_loss": 0.7696892, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79223031, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.375, + "step": 1185, + "time_per_iteration": 3.8873486518859863 + }, + { + "auxiliary_loss_clip": 0.01200052, + "auxiliary_loss_mlp": 0.01065088, + "balance_loss_clip": 1.0408771, + "balance_loss_mlp": 1.05808377, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.519913974657541, + "language_loss": 0.65896261, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68161404, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1186, + "time_per_iteration": 2.44986891746521 + }, + { + "auxiliary_loss_clip": 0.01198722, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.03234124, + "balance_loss_mlp": 1.05906928, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.0047668871213205, + "language_loss": 0.69673246, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71928233, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3984375, + "step": 1187, + "time_per_iteration": 2.517432451248169 + }, + { + "auxiliary_loss_clip": 0.01194056, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.05690861, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.6848541171122307, + "language_loss": 0.78598166, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80852079, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.375, + "step": 1188, + "time_per_iteration": 2.4682350158691406 + }, + { + "auxiliary_loss_clip": 0.01197809, + "auxiliary_loss_mlp": 0.01056931, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.0588758, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0343008635273834, + "language_loss": 0.84854662, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87109399, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.390625, + "step": 1189, + "time_per_iteration": 2.451464891433716 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.05589187, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.7193907035784557, + "language_loss": 0.77021295, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79277021, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.375, + "step": 1190, + "time_per_iteration": 2.5028254985809326 + }, + { + "auxiliary_loss_clip": 0.01200514, + "auxiliary_loss_mlp": 0.01065982, + "balance_loss_clip": 1.04018509, + "balance_loss_mlp": 1.0585537, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3385605637591302, + "language_loss": 0.75145626, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77412122, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1191, + "time_per_iteration": 2.461045980453491 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.03147578, + "balance_loss_mlp": 1.05730891, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.3332115059632583, + "language_loss": 0.7360636, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75860614, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1192, + "time_per_iteration": 2.4944753646850586 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 1.05358601, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.1652973689026176, + "language_loss": 0.7830255, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80548704, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1193, + "time_per_iteration": 2.487025737762451 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01053593, + "balance_loss_clip": 1.02786815, + "balance_loss_mlp": 1.06034899, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9678931818636167, + "language_loss": 0.85748619, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88004816, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1194, + "time_per_iteration": 2.493823766708374 + }, + { + "auxiliary_loss_clip": 0.01197363, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.05782473, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9701258602591958, + "language_loss": 0.81425989, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83685976, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3984375, + "step": 1195, + "time_per_iteration": 2.5168802738189697 + }, + { + "auxiliary_loss_clip": 0.01195742, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.02979064, + "balance_loss_mlp": 1.05720496, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.9269272748189905, + "language_loss": 0.79917538, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82164884, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3828125, + "step": 1196, + "time_per_iteration": 2.4749536514282227 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01069477, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.05655897, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 8.862292558474625, + "language_loss": 0.71015084, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73278111, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3671875, + "step": 1197, + "time_per_iteration": 2.520514726638794 + }, + { + "auxiliary_loss_clip": 0.01192449, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.02845871, + "balance_loss_mlp": 1.05429292, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0584524946763767, + "language_loss": 0.86034989, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88279593, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3828125, + "step": 1198, + "time_per_iteration": 2.441458225250244 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.05664325, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.7240513490380307, + "language_loss": 0.83822477, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8607856, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3828125, + "step": 1199, + "time_per_iteration": 2.462790012359619 + }, + { + "auxiliary_loss_clip": 0.01201627, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.06159616, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.0725431151836453, + "language_loss": 0.76464498, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78722042, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3984375, + "step": 1200, + "time_per_iteration": 2.5007636547088623 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01061794, + "balance_loss_clip": 1.0376662, + "balance_loss_mlp": 1.05783701, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.959995672067427, + "language_loss": 0.82965535, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85223711, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.390625, + "step": 1201, + "time_per_iteration": 2.4968512058258057 + }, + { + "auxiliary_loss_clip": 0.01198607, + "auxiliary_loss_mlp": 0.01059493, + "balance_loss_clip": 1.03372014, + "balance_loss_mlp": 1.05568862, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.411287508312223, + "language_loss": 0.69041032, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71299136, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1202, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01196785, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03804839, + "balance_loss_mlp": 1.05721354, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9411904343348254, + "language_loss": 0.87723774, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89984161, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3984375, + "step": 1203, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.0546416, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.819852916387131, + "language_loss": 0.7844671, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4375, + "step": 1204, + "time_per_iteration": 2.449265480041504 + }, + { + "auxiliary_loss_clip": 0.01194984, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.05605316, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8514893306986777, + "language_loss": 0.81960398, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.390625, + "step": 1205, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01067422, + "balance_loss_clip": 1.04250705, + "balance_loss_mlp": 1.05852747, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.0830735488163254, + "language_loss": 0.76702261, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78969669, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4140625, + "step": 1206, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.01193529, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.03071594, + "balance_loss_mlp": 1.05481935, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8430962541821914, + "language_loss": 0.77246201, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79495007, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3828125, + "step": 1207, + "time_per_iteration": 2.4895267486572266 + }, + { + "auxiliary_loss_clip": 0.01194673, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.02816105, + "balance_loss_mlp": 1.05703962, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 5.768853045708734, + "language_loss": 0.79723513, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81967664, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1208, + "time_per_iteration": 2.509073495864868 + }, + { + "auxiliary_loss_clip": 0.0119292, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03204679, + "balance_loss_mlp": 1.05551386, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.6873449148768063, + "language_loss": 0.78595626, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80843151, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.375, + "step": 1209, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.03409529, + "balance_loss_mlp": 1.05510461, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.6193169355932104, + "language_loss": 0.81117678, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83368045, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.390625, + "step": 1210, + "time_per_iteration": 2.4985666275024414 + }, + { + "auxiliary_loss_clip": 0.01192388, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_clip": 1.03688109, + "balance_loss_mlp": 1.0565064, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.2378435782703834, + "language_loss": 0.84350932, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.359375, + "step": 1211, + "time_per_iteration": 2.4971728324890137 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01051238, + "balance_loss_clip": 1.02931547, + "balance_loss_mlp": 1.05233216, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.2910402501943516, + "language_loss": 0.90813953, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.375, + "step": 1212, + "time_per_iteration": 2.424874782562256 + }, + { + "auxiliary_loss_clip": 0.01191621, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0286777, + "balance_loss_mlp": 1.05457211, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.346480404505952, + "language_loss": 0.7238096, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74623883, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1213, + "time_per_iteration": 2.443542003631592 + }, + { + "auxiliary_loss_clip": 0.0119423, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.05338192, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9141465843449694, + "language_loss": 0.84441102, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86686933, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1214, + "time_per_iteration": 2.500112295150757 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.03468192, + "balance_loss_mlp": 1.05678558, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.82775499028919, + "language_loss": 0.83929181, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86184609, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.40625, + "step": 1215, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01194493, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.05474758, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.8082751516232567, + "language_loss": 0.80984753, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83240259, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1216, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01196444, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.02863717, + "balance_loss_mlp": 1.05746269, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.8100743600713276, + "language_loss": 0.76112509, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78359497, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1217, + "time_per_iteration": 2.513061046600342 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0347029, + "balance_loss_mlp": 1.05546904, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0751842608938142, + "language_loss": 0.86442709, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.375, + "step": 1218, + "time_per_iteration": 2.4514572620391846 + }, + { + "auxiliary_loss_clip": 0.01193593, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.05405331, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.9036635750322874, + "language_loss": 0.86757988, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.8901403, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.3984375, + "step": 1219, + "time_per_iteration": 2.4501893520355225 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01058106, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.05260015, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.320539289810395, + "language_loss": 0.84721315, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86969984, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.375, + "step": 1220, + "time_per_iteration": 2.4651544094085693 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01062531, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.05455709, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.803787378453645, + "language_loss": 0.76840538, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.390625, + "step": 1221, + "time_per_iteration": 2.4643850326538086 + }, + { + "auxiliary_loss_clip": 0.01195957, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.0369482, + "balance_loss_mlp": 1.05698907, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 4.111967976062365, + "language_loss": 0.92201889, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94457251, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.390625, + "step": 1222, + "time_per_iteration": 2.461393117904663 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.03440046, + "balance_loss_mlp": 1.05795276, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.739326433562924, + "language_loss": 0.91106719, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9336018, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1223, + "time_per_iteration": 2.4616212844848633 + }, + { + "auxiliary_loss_clip": 0.01194512, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.05628467, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.5538951271380395, + "language_loss": 0.81946027, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84211743, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3828125, + "step": 1224, + "time_per_iteration": 2.555060386657715 + }, + { + "auxiliary_loss_clip": 0.01191919, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.05385065, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.209826315991058, + "language_loss": 0.83313572, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8555935, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.375, + "step": 1225, + "time_per_iteration": 2.5317656993865967 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.0300144, + "balance_loss_mlp": 1.05566537, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.0864455990649144, + "language_loss": 0.9037565, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92621917, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3828125, + "step": 1226, + "time_per_iteration": 5.374137878417969 + }, + { + "auxiliary_loss_clip": 0.01201048, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.05401981, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8833434676543, + "language_loss": 0.76944947, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1227, + "time_per_iteration": 2.4528942108154297 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02720916, + "balance_loss_mlp": 1.05810142, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6041059240123434, + "language_loss": 0.85634637, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87876499, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.34375, + "step": 1228, + "time_per_iteration": 2.5452229976654053 + }, + { + "auxiliary_loss_clip": 0.01194537, + "auxiliary_loss_mlp": 0.01061009, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.05448794, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 4.251776538682485, + "language_loss": 0.79688829, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81944382, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3984375, + "step": 1229, + "time_per_iteration": 2.501086711883545 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.05632436, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.028375336194412, + "language_loss": 0.78218549, + "learning_rate": 3.979771170004287e-06, + "loss": 0.8047595, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3984375, + "step": 1230, + "time_per_iteration": 2.4474098682403564 + }, + { + "auxiliary_loss_clip": 0.01193092, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.02554393, + "balance_loss_mlp": 1.05599403, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.924374124094053, + "language_loss": 0.81301343, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83543187, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1231, + "time_per_iteration": 2.4861042499542236 + }, + { + "auxiliary_loss_clip": 0.01198041, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.04277539, + "balance_loss_mlp": 1.05443811, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.4882746298902343, + "language_loss": 0.95111585, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97376096, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1232, + "time_per_iteration": 2.5074143409729004 + }, + { + "auxiliary_loss_clip": 0.01194092, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.03938031, + "balance_loss_mlp": 1.05667329, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.246534337547551, + "language_loss": 0.80640733, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82895458, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1233, + "time_per_iteration": 2.490816831588745 + }, + { + "auxiliary_loss_clip": 0.01198611, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.05483365, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.357402762223285, + "language_loss": 0.70458734, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72717696, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1234, + "time_per_iteration": 2.605139970779419 + }, + { + "auxiliary_loss_clip": 0.01195848, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.03665543, + "balance_loss_mlp": 1.05792761, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1034220776692765, + "language_loss": 0.77058101, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79313564, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3828125, + "step": 1235, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01189622, + "auxiliary_loss_mlp": 0.01053872, + "balance_loss_clip": 1.03123438, + "balance_loss_mlp": 1.05414248, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 5.584514149172867, + "language_loss": 0.82648033, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84891528, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1236, + "time_per_iteration": 2.462069511413574 + }, + { + "auxiliary_loss_clip": 0.0119681, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03385794, + "balance_loss_mlp": 1.05572712, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.2536643652174724, + "language_loss": 0.75702679, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77956861, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1237, + "time_per_iteration": 2.5572054386138916 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.03817141, + "balance_loss_mlp": 1.05427146, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.878495773650564, + "language_loss": 0.7740556, + "learning_rate": 3.979326750654053e-06, + "loss": 0.7965883, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.3828125, + "step": 1238, + "time_per_iteration": 2.5915493965148926 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.01055743, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.05435395, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.0695087378138455, + "language_loss": 0.86322856, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88576937, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.4375, + "step": 1239, + "time_per_iteration": 2.4961507320404053 + }, + { + "auxiliary_loss_clip": 0.01194884, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02401412, + "balance_loss_mlp": 1.05433989, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.179426429753772, + "language_loss": 0.89070082, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91314042, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.40625, + "step": 1240, + "time_per_iteration": 2.456801176071167 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.03325772, + "balance_loss_mlp": 1.05600643, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.2624482063672513, + "language_loss": 0.88586551, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90842468, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4140625, + "step": 1241, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.018507, + "balance_loss_mlp": 1.02113318, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9233978594431768, + "language_loss": 0.63032585, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65135366, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.59375, + "step": 1242, + "time_per_iteration": 3.1321358680725098 + }, + { + "auxiliary_loss_clip": 0.012088, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.05792046, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.8956100556858004, + "language_loss": 0.62917286, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65185821, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5078125, + "step": 1243, + "time_per_iteration": 2.5571463108062744 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01052045, + "balance_loss_clip": 1.0280956, + "balance_loss_mlp": 1.05710852, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.504235331520048, + "language_loss": 0.76465732, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78713971, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1244, + "time_per_iteration": 2.501621723175049 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03462183, + "balance_loss_mlp": 1.05684423, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 2.8968513367461495, + "language_loss": 0.69149882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.714064, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1245, + "time_per_iteration": 2.417921781539917 + }, + { + "auxiliary_loss_clip": 0.01196347, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.02768707, + "balance_loss_mlp": 1.05663347, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9272496045423029, + "language_loss": 0.88344061, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90592474, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1246, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01205457, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.05656838, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.4755370190447064, + "language_loss": 0.87921643, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90194321, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4921875, + "step": 1247, + "time_per_iteration": 2.4602389335632324 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.03502667, + "balance_loss_mlp": 1.05565107, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2039165223770194, + "language_loss": 0.6477375, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67027843, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3828125, + "step": 1248, + "time_per_iteration": 2.4408388137817383 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.0575254, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.0641418493429713, + "language_loss": 0.73964334, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76219749, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1249, + "time_per_iteration": 2.443767547607422 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01068388, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.05842972, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.607815988938315, + "language_loss": 0.81845009, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84114683, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4296875, + "step": 1250, + "time_per_iteration": 2.491236448287964 + }, + { + "auxiliary_loss_clip": 0.01197565, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.05932856, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.308634463940828, + "language_loss": 0.66713893, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68972456, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1251, + "time_per_iteration": 2.5437874794006348 + }, + { + "auxiliary_loss_clip": 0.0107681, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 0.99946529, + "balance_loss_mlp": 1.02021933, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8978558428983584, + "language_loss": 0.70356798, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72436458, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.56640625, + "step": 1252, + "time_per_iteration": 3.1170923709869385 + }, + { + "auxiliary_loss_clip": 0.01194007, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.03698599, + "balance_loss_mlp": 1.05419612, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9290655276351045, + "language_loss": 0.79516673, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81771958, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3984375, + "step": 1253, + "time_per_iteration": 2.4821414947509766 + }, + { + "auxiliary_loss_clip": 0.01199953, + "auxiliary_loss_mlp": 0.01065033, + "balance_loss_clip": 1.04125071, + "balance_loss_mlp": 1.05829906, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.5751371148477995, + "language_loss": 0.93441045, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95706034, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.421875, + "step": 1254, + "time_per_iteration": 2.4245519638061523 + }, + { + "auxiliary_loss_clip": 0.01191058, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.05566263, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.866823394820361, + "language_loss": 0.88030314, + "learning_rate": 3.97836641143877e-06, + "loss": 0.902834, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1255, + "time_per_iteration": 2.5579185485839844 + }, + { + "auxiliary_loss_clip": 0.01192242, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.04009795, + "balance_loss_mlp": 1.05518413, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.7574194703288544, + "language_loss": 0.79516619, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81773484, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.3671875, + "step": 1256, + "time_per_iteration": 2.4203784465789795 + }, + { + "auxiliary_loss_clip": 0.01074137, + "auxiliary_loss_mlp": 0.01007102, + "balance_loss_clip": 1.00378788, + "balance_loss_mlp": 1.01769829, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.8283025846018472, + "language_loss": 0.58016127, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60097361, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.5625, + "step": 1257, + "time_per_iteration": 3.1732118129730225 + }, + { + "auxiliary_loss_clip": 0.0119581, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.03272927, + "balance_loss_mlp": 1.05982757, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 3.1336739114125107, + "language_loss": 0.89859951, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92112058, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1258, + "time_per_iteration": 2.516925811767578 + }, + { + "auxiliary_loss_clip": 0.01192364, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.05663717, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.28312942247731, + "language_loss": 0.81211507, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458376, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.359375, + "step": 1259, + "time_per_iteration": 2.449533224105835 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.03593481, + "balance_loss_mlp": 1.05662787, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9172803769558988, + "language_loss": 0.75733984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.77986467, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.375, + "step": 1260, + "time_per_iteration": 2.5003559589385986 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03550828, + "balance_loss_mlp": 1.0552032, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8260195606442358, + "language_loss": 0.84695768, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86947775, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1261, + "time_per_iteration": 2.4633476734161377 + }, + { + "auxiliary_loss_clip": 0.01200376, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.03828108, + "balance_loss_mlp": 1.05969536, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.3160282321136334, + "language_loss": 0.8266682, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84928167, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.40625, + "step": 1262, + "time_per_iteration": 2.5256471633911133 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01061195, + "balance_loss_clip": 1.03703153, + "balance_loss_mlp": 1.0540688, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.4581964181262776, + "language_loss": 0.8255769, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84810972, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3828125, + "step": 1263, + "time_per_iteration": 2.470656633377075 + }, + { + "auxiliary_loss_clip": 0.01195735, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.05504882, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.324943057092889, + "language_loss": 0.7591399, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.40625, + "step": 1264, + "time_per_iteration": 2.4715359210968018 + }, + { + "auxiliary_loss_clip": 0.0119596, + "auxiliary_loss_mlp": 0.01062168, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.05711412, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.1997185871944356, + "language_loss": 0.81106204, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83364332, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.390625, + "step": 1265, + "time_per_iteration": 2.440000295639038 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03408241, + "balance_loss_mlp": 1.05631864, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.141616369936441, + "language_loss": 0.64935738, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67187923, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.390625, + "step": 1266, + "time_per_iteration": 2.495001792907715 + }, + { + "auxiliary_loss_clip": 0.01194799, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.05550349, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2514277899416606, + "language_loss": 0.79527593, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81783378, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.390625, + "step": 1267, + "time_per_iteration": 2.4763970375061035 + }, + { + "auxiliary_loss_clip": 0.01194511, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02975869, + "balance_loss_mlp": 1.05526185, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.2740159695832682, + "language_loss": 0.7253381, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74780059, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.390625, + "step": 1268, + "time_per_iteration": 3.8910977840423584 + }, + { + "auxiliary_loss_clip": 0.01192554, + "auxiliary_loss_mlp": 0.01057239, + "balance_loss_clip": 1.03447044, + "balance_loss_mlp": 1.05342031, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.163449384012833, + "language_loss": 0.81891817, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84141612, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.390625, + "step": 1269, + "time_per_iteration": 3.8643741607666016 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.05559695, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 3.2383492700687078, + "language_loss": 0.88135087, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90382218, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1270, + "time_per_iteration": 2.4746575355529785 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.0105921, + "balance_loss_clip": 1.03559494, + "balance_loss_mlp": 1.05707884, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.188682914143081, + "language_loss": 0.71113384, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73370755, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.4140625, + "step": 1271, + "time_per_iteration": 2.529632091522217 + }, + { + "auxiliary_loss_clip": 0.01189637, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_clip": 1.04351556, + "balance_loss_mlp": 1.05675423, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.9018984880968814, + "language_loss": 0.82745486, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85001469, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1272, + "time_per_iteration": 2.4950368404388428 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01061838, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.05351079, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0211474255264643, + "language_loss": 0.79951203, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82204533, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3828125, + "step": 1273, + "time_per_iteration": 2.490281105041504 + }, + { + "auxiliary_loss_clip": 0.01194744, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_clip": 1.03858376, + "balance_loss_mlp": 1.05600715, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.848359088284866, + "language_loss": 0.81545758, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83802712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1274, + "time_per_iteration": 2.499799966812134 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_clip": 1.04430115, + "balance_loss_mlp": 1.05469346, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.991418246716423, + "language_loss": 0.73099387, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75359869, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1275, + "time_per_iteration": 2.557973623275757 + }, + { + "auxiliary_loss_clip": 0.011939, + "auxiliary_loss_mlp": 0.01061514, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.05536842, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1093684912214545, + "language_loss": 0.79584897, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81840312, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.390625, + "step": 1276, + "time_per_iteration": 2.4329752922058105 + }, + { + "auxiliary_loss_clip": 0.01196395, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02838457, + "balance_loss_mlp": 1.05656397, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.623540269613024, + "language_loss": 0.59020305, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61268032, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3984375, + "step": 1277, + "time_per_iteration": 2.5318989753723145 + }, + { + "auxiliary_loss_clip": 0.01200985, + "auxiliary_loss_mlp": 0.01057464, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.05805659, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.2944749333347096, + "language_loss": 0.74846482, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77104926, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.4296875, + "step": 1278, + "time_per_iteration": 2.448615789413452 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_clip": 1.02943182, + "balance_loss_mlp": 1.05475163, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.0999470067777075, + "language_loss": 0.88656616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90898478, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1279, + "time_per_iteration": 2.4883790016174316 + }, + { + "auxiliary_loss_clip": 0.01189256, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_clip": 1.03973901, + "balance_loss_mlp": 1.05507362, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.4596954186847393, + "language_loss": 0.82899994, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1280, + "time_per_iteration": 2.459294319152832 + }, + { + "auxiliary_loss_clip": 0.01188755, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03874409, + "balance_loss_mlp": 1.05492759, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 1.9224222656998016, + "language_loss": 0.76059222, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78309786, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3359375, + "step": 1281, + "time_per_iteration": 2.453183650970459 + }, + { + "auxiliary_loss_clip": 0.0119548, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.05448353, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8937081587754587, + "language_loss": 0.75307631, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77557921, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1282, + "time_per_iteration": 2.4526116847991943 + }, + { + "auxiliary_loss_clip": 0.01190337, + "auxiliary_loss_mlp": 0.01070616, + "balance_loss_clip": 1.04734671, + "balance_loss_mlp": 1.054286, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0304459145795963, + "language_loss": 0.8428033, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86541283, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1283, + "time_per_iteration": 2.468101739883423 + }, + { + "auxiliary_loss_clip": 0.01192768, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.05560803, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.622403612740989, + "language_loss": 0.75031364, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77286887, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1284, + "time_per_iteration": 2.451749801635742 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.03947222, + "balance_loss_mlp": 1.05330253, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6448065546510353, + "language_loss": 0.75934827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78185129, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1285, + "time_per_iteration": 2.664769411087036 + }, + { + "auxiliary_loss_clip": 0.01196484, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.05862105, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8165785508620624, + "language_loss": 0.84204662, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86464012, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.375, + "step": 1286, + "time_per_iteration": 2.550670862197876 + }, + { + "auxiliary_loss_clip": 0.01196192, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.05582845, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 4.521300853065514, + "language_loss": 0.76725763, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1287, + "time_per_iteration": 2.455627918243408 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.03636777, + "balance_loss_mlp": 1.05476642, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6785000972571258, + "language_loss": 0.84509134, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86758095, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1288, + "time_per_iteration": 2.500218629837036 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.05364347, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.192533837519805, + "language_loss": 0.85769016, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88018203, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.390625, + "step": 1289, + "time_per_iteration": 2.4759440422058105 + }, + { + "auxiliary_loss_clip": 0.01189023, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.02563989, + "balance_loss_mlp": 1.05300641, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.8877463184856607, + "language_loss": 0.85053366, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87290049, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1290, + "time_per_iteration": 2.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.01185369, + "auxiliary_loss_mlp": 0.01059291, + "balance_loss_clip": 1.03541303, + "balance_loss_mlp": 1.05397463, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3980248629455834, + "language_loss": 0.90562832, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92807496, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3125, + "step": 1291, + "time_per_iteration": 2.4760262966156006 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01008288, + "balance_loss_clip": 1.00456893, + "balance_loss_mlp": 1.01656318, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.9429671936579762, + "language_loss": 0.64993972, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67073375, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.546875, + "step": 1292, + "time_per_iteration": 3.1508371829986572 + }, + { + "auxiliary_loss_clip": 0.0118873, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.03716707, + "balance_loss_mlp": 1.05293965, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7960778456946043, + "language_loss": 0.87610948, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89858699, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1293, + "time_per_iteration": 2.6359729766845703 + }, + { + "auxiliary_loss_clip": 0.01193413, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.05659533, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.312065886688882, + "language_loss": 0.85111046, + "learning_rate": 3.976081376263239e-06, + "loss": 0.873667, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3671875, + "step": 1294, + "time_per_iteration": 2.5151314735412598 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01054926, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.05702615, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.728225366024782, + "language_loss": 0.79202414, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81451285, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3671875, + "step": 1295, + "time_per_iteration": 2.459510326385498 + }, + { + "auxiliary_loss_clip": 0.01188808, + "auxiliary_loss_mlp": 0.01052849, + "balance_loss_clip": 1.02966261, + "balance_loss_mlp": 1.05383039, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.8222308711400834, + "language_loss": 0.88216382, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90458035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1296, + "time_per_iteration": 2.492892026901245 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01058016, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.05591464, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 3.2140473454082086, + "language_loss": 0.96160841, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98411804, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1297, + "time_per_iteration": 2.4668915271759033 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01054366, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.05289149, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.460261972702069, + "language_loss": 0.76087165, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78331399, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3671875, + "step": 1298, + "time_per_iteration": 2.5059781074523926 + }, + { + "auxiliary_loss_clip": 0.01192131, + "auxiliary_loss_mlp": 0.01061793, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.05696058, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.8752674736144914, + "language_loss": 0.80755305, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83009231, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3515625, + "step": 1299, + "time_per_iteration": 2.5036020278930664 + }, + { + "auxiliary_loss_clip": 0.01183493, + "auxiliary_loss_mlp": 0.01056623, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.05226159, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.1903498852009813, + "language_loss": 0.86459941, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88700056, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1300, + "time_per_iteration": 2.4866278171539307 + }, + { + "auxiliary_loss_clip": 0.0118988, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.0284245, + "balance_loss_mlp": 1.05393028, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.909902293479526, + "language_loss": 0.71778899, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74020839, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.359375, + "step": 1301, + "time_per_iteration": 2.6491336822509766 + }, + { + "auxiliary_loss_clip": 0.01196178, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.0586772, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5624281437346959, + "language_loss": 0.70860815, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1302, + "time_per_iteration": 2.635430335998535 + }, + { + "auxiliary_loss_clip": 0.01188946, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03702378, + "balance_loss_mlp": 1.05438161, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.152945758623263, + "language_loss": 0.8192755, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84177226, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.34375, + "step": 1303, + "time_per_iteration": 2.4861090183258057 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.05351233, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8425530042965788, + "language_loss": 0.7497822, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77228731, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1304, + "time_per_iteration": 2.464087963104248 + }, + { + "auxiliary_loss_clip": 0.01191658, + "auxiliary_loss_mlp": 0.0106237, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.05645108, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.696211405930565, + "language_loss": 0.76397038, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78651059, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.359375, + "step": 1305, + "time_per_iteration": 2.486093521118164 + }, + { + "auxiliary_loss_clip": 0.01192283, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.04087615, + "balance_loss_mlp": 1.05527782, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 2.2926357932273866, + "language_loss": 0.85035503, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87292361, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1306, + "time_per_iteration": 2.496265172958374 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.05652416, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.0575778567802976, + "language_loss": 0.90087706, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92322135, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.34375, + "step": 1307, + "time_per_iteration": 2.5122623443603516 + }, + { + "auxiliary_loss_clip": 0.01189263, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.03295124, + "balance_loss_mlp": 1.05417371, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8540925974151201, + "language_loss": 0.83408689, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85655046, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3515625, + "step": 1308, + "time_per_iteration": 2.4686944484710693 + }, + { + "auxiliary_loss_clip": 0.01186004, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.03177738, + "balance_loss_mlp": 1.05289674, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.6283340971904061, + "language_loss": 0.77841777, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80081415, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.328125, + "step": 1309, + "time_per_iteration": 5.444388151168823 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.05386913, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9656388899868151, + "language_loss": 0.80146122, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82402575, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.40625, + "step": 1310, + "time_per_iteration": 3.8553466796875 + }, + { + "auxiliary_loss_clip": 0.01185305, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.03067899, + "balance_loss_mlp": 1.05544043, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7115323272474947, + "language_loss": 0.73069102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75307012, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1311, + "time_per_iteration": 2.5299458503723145 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_clip": 1.04861844, + "balance_loss_mlp": 1.05650353, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9161215374898264, + "language_loss": 0.85871482, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88134789, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1312, + "time_per_iteration": 2.5490031242370605 + }, + { + "auxiliary_loss_clip": 0.01186476, + "auxiliary_loss_mlp": 0.01059916, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.0555284, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7542323177910393, + "language_loss": 0.81968379, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84214771, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3125, + "step": 1313, + "time_per_iteration": 2.507046699523926 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105996, + "balance_loss_clip": 1.03528404, + "balance_loss_mlp": 1.05271506, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.109477065223649, + "language_loss": 0.73372161, + "learning_rate": 3.97486534441264e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3984375, + "step": 1314, + "time_per_iteration": 2.4396395683288574 + }, + { + "auxiliary_loss_clip": 0.01185115, + "auxiliary_loss_mlp": 0.01058505, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.05120206, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.579996187361532, + "language_loss": 0.79460657, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81704271, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.34375, + "step": 1315, + "time_per_iteration": 2.493365526199341 + }, + { + "auxiliary_loss_clip": 0.011877, + "auxiliary_loss_mlp": 0.01060931, + "balance_loss_clip": 1.03592062, + "balance_loss_mlp": 1.05232, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9411836832725016, + "language_loss": 0.73614991, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1316, + "time_per_iteration": 2.4696316719055176 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.03940618, + "balance_loss_mlp": 1.05415511, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.862910173072837, + "language_loss": 0.65148681, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67404836, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.390625, + "step": 1317, + "time_per_iteration": 2.447847843170166 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.05774999, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.3478172138868967, + "language_loss": 0.7324174, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75502789, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1318, + "time_per_iteration": 2.497406482696533 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01057875, + "balance_loss_clip": 1.03557122, + "balance_loss_mlp": 1.05335736, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.92969491679129, + "language_loss": 0.90610284, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92856491, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3515625, + "step": 1319, + "time_per_iteration": 2.5007200241088867 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01054126, + "balance_loss_clip": 1.03086793, + "balance_loss_mlp": 1.05155873, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.95797867210378, + "language_loss": 0.79765761, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82008684, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1320, + "time_per_iteration": 2.4683783054351807 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.05700457, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6163787894008363, + "language_loss": 0.69574934, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71822894, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.34375, + "step": 1321, + "time_per_iteration": 2.466911554336548 + }, + { + "auxiliary_loss_clip": 0.01184231, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.03313756, + "balance_loss_mlp": 1.05313718, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.926313653502779, + "language_loss": 0.83559513, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.857997, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1322, + "time_per_iteration": 2.465885639190674 + }, + { + "auxiliary_loss_clip": 0.01188233, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.03544521, + "balance_loss_mlp": 1.05104756, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8863777031262867, + "language_loss": 0.90437615, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92684615, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1323, + "time_per_iteration": 2.465841293334961 + }, + { + "auxiliary_loss_clip": 0.0118735, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.05414796, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.6454981938510795, + "language_loss": 0.82583225, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84827733, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.328125, + "step": 1324, + "time_per_iteration": 2.475486993789673 + }, + { + "auxiliary_loss_clip": 0.01188398, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0255841, + "balance_loss_mlp": 1.05264676, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.416918252865386, + "language_loss": 0.79654729, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81892562, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.359375, + "step": 1325, + "time_per_iteration": 2.482555389404297 + }, + { + "auxiliary_loss_clip": 0.01190127, + "auxiliary_loss_mlp": 0.01064919, + "balance_loss_clip": 1.03989661, + "balance_loss_mlp": 1.05474687, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.170521767048619, + "language_loss": 0.8812806, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90383106, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1326, + "time_per_iteration": 2.466742753982544 + }, + { + "auxiliary_loss_clip": 0.01182901, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.02823424, + "balance_loss_mlp": 1.05014396, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.3992518634606164, + "language_loss": 0.83013594, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8524875, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.328125, + "step": 1327, + "time_per_iteration": 2.4989237785339355 + }, + { + "auxiliary_loss_clip": 0.0119143, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.03013575, + "balance_loss_mlp": 1.05436027, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.1664091533416587, + "language_loss": 0.78452092, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80697763, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.375, + "step": 1328, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01191637, + "auxiliary_loss_mlp": 0.01053331, + "balance_loss_clip": 1.02969217, + "balance_loss_mlp": 1.05460131, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.484533735051083, + "language_loss": 0.74277186, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76522154, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.375, + "step": 1329, + "time_per_iteration": 2.425388813018799 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.05096054, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.5753219993175995, + "language_loss": 0.81090498, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83336312, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3515625, + "step": 1330, + "time_per_iteration": 2.4831247329711914 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.03924823, + "balance_loss_mlp": 1.05348384, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.322034822225311, + "language_loss": 0.88790143, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91043401, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1331, + "time_per_iteration": 2.4410722255706787 + }, + { + "auxiliary_loss_clip": 0.01193336, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.03414834, + "balance_loss_mlp": 1.05288279, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.577873328737783, + "language_loss": 0.73332524, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75584114, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.40625, + "step": 1332, + "time_per_iteration": 2.6054465770721436 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.05179858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.9568005204239032, + "language_loss": 0.82994795, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1333, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01187412, + "auxiliary_loss_mlp": 0.01055323, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.05115032, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.7771179443818466, + "language_loss": 0.74698973, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76941711, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1334, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01187182, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.05457497, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.0216765528325635, + "language_loss": 0.80279201, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82527244, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1335, + "time_per_iteration": 2.538670301437378 + }, + { + "auxiliary_loss_clip": 0.01078994, + "auxiliary_loss_mlp": 0.01011272, + "balance_loss_clip": 1.00802934, + "balance_loss_mlp": 1.02308655, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7427722697577622, + "language_loss": 0.56020629, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58110893, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.5625, + "step": 1336, + "time_per_iteration": 3.125026226043701 + }, + { + "auxiliary_loss_clip": 0.01188939, + "auxiliary_loss_mlp": 0.01054834, + "balance_loss_clip": 1.0331738, + "balance_loss_mlp": 1.05371606, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.050916847484745, + "language_loss": 0.67764497, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70008272, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3515625, + "step": 1337, + "time_per_iteration": 2.506103038787842 + }, + { + "auxiliary_loss_clip": 0.01188826, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.04313135, + "balance_loss_mlp": 1.05480385, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8692893317328456, + "language_loss": 0.86701488, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88955414, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1338, + "time_per_iteration": 2.5451908111572266 + }, + { + "auxiliary_loss_clip": 0.01188004, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.0368793, + "balance_loss_mlp": 1.05142283, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.6265473040924725, + "language_loss": 0.87246621, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89494807, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.359375, + "step": 1339, + "time_per_iteration": 2.450932502746582 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02660179, + "balance_loss_mlp": 1.05106449, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.428441908593999, + "language_loss": 0.88819683, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91048771, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1340, + "time_per_iteration": 2.4539895057678223 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 0.99951285, + "balance_loss_mlp": 1.01727247, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8886760882983712, + "language_loss": 0.64806795, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66882515, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.5546875, + "step": 1341, + "time_per_iteration": 3.0034360885620117 + }, + { + "auxiliary_loss_clip": 0.01193907, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03577328, + "balance_loss_mlp": 1.05301166, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.817345215565239, + "language_loss": 0.89616883, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91871732, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1342, + "time_per_iteration": 2.479701042175293 + }, + { + "auxiliary_loss_clip": 0.01194936, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.05721259, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.7453135307928216, + "language_loss": 0.76378155, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78631246, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.375, + "step": 1343, + "time_per_iteration": 2.4969120025634766 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01008036, + "balance_loss_clip": 1.00446022, + "balance_loss_mlp": 1.01791215, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8963318804352591, + "language_loss": 0.57395822, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59476054, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.54296875, + "step": 1344, + "time_per_iteration": 2.9917871952056885 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.0105771, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.05523396, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.677643541218582, + "language_loss": 0.86665964, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88914657, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1345, + "time_per_iteration": 2.4601447582244873 + }, + { + "auxiliary_loss_clip": 0.01187459, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.05403256, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7098835991166323, + "language_loss": 0.87242532, + "learning_rate": 3.972857395313042e-06, + "loss": 0.894849, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1346, + "time_per_iteration": 2.4809892177581787 + }, + { + "auxiliary_loss_clip": 0.01185898, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.03256202, + "balance_loss_mlp": 1.05219567, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6659805361601863, + "language_loss": 0.92606491, + "learning_rate": 3.972793412113439e-06, + "loss": 0.94847363, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3359375, + "step": 1347, + "time_per_iteration": 2.4802379608154297 + }, + { + "auxiliary_loss_clip": 0.0118757, + "auxiliary_loss_mlp": 0.01057822, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.05471659, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 9.453605004454174, + "language_loss": 0.89181751, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91427147, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.328125, + "step": 1348, + "time_per_iteration": 2.4610300064086914 + }, + { + "auxiliary_loss_clip": 0.01185296, + "auxiliary_loss_mlp": 0.01056008, + "balance_loss_clip": 1.03420484, + "balance_loss_mlp": 1.05543983, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 2.4916215003739355, + "language_loss": 0.76796132, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7903744, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.296875, + "step": 1349, + "time_per_iteration": 2.4789178371429443 + }, + { + "auxiliary_loss_clip": 0.01187103, + "auxiliary_loss_mlp": 0.0105974, + "balance_loss_clip": 1.03669679, + "balance_loss_mlp": 1.05236626, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.126949034470324, + "language_loss": 0.88571703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90818548, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.34375, + "step": 1350, + "time_per_iteration": 2.43094539642334 + }, + { + "auxiliary_loss_clip": 0.01184059, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.05228257, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.044220866897066, + "language_loss": 0.82058489, + "learning_rate": 3.972536731254092e-06, + "loss": 0.843036, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1351, + "time_per_iteration": 6.688653469085693 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.02917862, + "balance_loss_mlp": 1.04863417, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.9894600711485977, + "language_loss": 0.75347674, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77585584, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.359375, + "step": 1352, + "time_per_iteration": 2.4888412952423096 + }, + { + "auxiliary_loss_clip": 0.01192461, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.03163338, + "balance_loss_mlp": 1.05483341, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.7603053493114211, + "language_loss": 0.82833469, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85081488, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1353, + "time_per_iteration": 2.522960901260376 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01010471, + "balance_loss_clip": 1.00694275, + "balance_loss_mlp": 1.01996851, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8931676068679675, + "language_loss": 0.5970993, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61793786, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.53125, + "step": 1354, + "time_per_iteration": 3.0639474391937256 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.03764629, + "balance_loss_mlp": 1.05431724, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7981329827127455, + "language_loss": 0.82785606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85033101, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1355, + "time_per_iteration": 2.4664132595062256 + }, + { + "auxiliary_loss_clip": 0.01186535, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.03619206, + "balance_loss_mlp": 1.05146575, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9123465925299232, + "language_loss": 0.70799643, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73048234, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3515625, + "step": 1356, + "time_per_iteration": 2.509061813354492 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.03169644, + "balance_loss_mlp": 1.05148005, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.53580294551395, + "language_loss": 0.70255458, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72499657, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3671875, + "step": 1357, + "time_per_iteration": 2.476951837539673 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_clip": 1.03067684, + "balance_loss_mlp": 1.05488217, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.6163823683714953, + "language_loss": 0.84186697, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86431682, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1358, + "time_per_iteration": 2.457376480102539 + }, + { + "auxiliary_loss_clip": 0.01190093, + "auxiliary_loss_mlp": 0.01056216, + "balance_loss_clip": 1.0310626, + "balance_loss_mlp": 1.05484545, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9894839389786314, + "language_loss": 1.02294087, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04540396, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3515625, + "step": 1359, + "time_per_iteration": 2.4723212718963623 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03344035, + "balance_loss_mlp": 1.0511415, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0666688933075963, + "language_loss": 0.82969773, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85212988, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1360, + "time_per_iteration": 2.5143508911132812 + }, + { + "auxiliary_loss_clip": 0.01190184, + "auxiliary_loss_mlp": 0.01062181, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.05335808, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.14797754608813, + "language_loss": 0.72352278, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74604642, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3671875, + "step": 1361, + "time_per_iteration": 2.458034038543701 + }, + { + "auxiliary_loss_clip": 0.01179057, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.04741335, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.8589819193374515, + "language_loss": 0.76781029, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79017377, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.3125, + "step": 1362, + "time_per_iteration": 2.472259759902954 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.0291419, + "balance_loss_mlp": 1.05449164, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.631594675791475, + "language_loss": 0.72409523, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74649096, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1363, + "time_per_iteration": 2.4447264671325684 + }, + { + "auxiliary_loss_clip": 0.01189235, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.03603828, + "balance_loss_mlp": 1.05607057, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 3.9166951523525464, + "language_loss": 0.77459586, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79710352, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.328125, + "step": 1364, + "time_per_iteration": 2.475141763687134 + }, + { + "auxiliary_loss_clip": 0.01190144, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.03745019, + "balance_loss_mlp": 1.05500793, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.6241179536013033, + "language_loss": 0.82025397, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1365, + "time_per_iteration": 2.493732452392578 + }, + { + "auxiliary_loss_clip": 0.0118713, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.05614781, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.3261283913074884, + "language_loss": 0.82173789, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84418333, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1366, + "time_per_iteration": 2.4809322357177734 + }, + { + "auxiliary_loss_clip": 0.01186928, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.05126381, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.8403828718649033, + "language_loss": 0.81534755, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83780599, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1367, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.03724277, + "balance_loss_mlp": 1.05413651, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3540874203263358, + "language_loss": 0.83644414, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85897589, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3828125, + "step": 1368, + "time_per_iteration": 2.453547716140747 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02694988, + "balance_loss_mlp": 1.05349994, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7360129433802456, + "language_loss": 0.81245828, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83476603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.28125, + "step": 1369, + "time_per_iteration": 2.527573585510254 + }, + { + "auxiliary_loss_clip": 0.01185735, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.02979898, + "balance_loss_mlp": 1.05528903, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.240857135161324, + "language_loss": 0.74790901, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77027786, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3046875, + "step": 1370, + "time_per_iteration": 2.5205185413360596 + }, + { + "auxiliary_loss_clip": 0.01189372, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.05480862, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6313231263601415, + "language_loss": 0.74633086, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76883852, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1371, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.01188254, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.05410123, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.0830704741847423, + "language_loss": 0.71080554, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73330408, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.34375, + "step": 1372, + "time_per_iteration": 2.574457883834839 + }, + { + "auxiliary_loss_clip": 0.0118845, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02750254, + "balance_loss_mlp": 1.05397415, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 3.137320584176607, + "language_loss": 0.88010907, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90251154, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.34375, + "step": 1373, + "time_per_iteration": 2.485727310180664 + }, + { + "auxiliary_loss_clip": 0.01186594, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.05331743, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7727067520163604, + "language_loss": 0.82349706, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84595209, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.328125, + "step": 1374, + "time_per_iteration": 2.5223724842071533 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.0032891, + "balance_loss_mlp": 1.02371156, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8248734910296001, + "language_loss": 0.60630989, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62714875, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.5234375, + "step": 1375, + "time_per_iteration": 3.0909183025360107 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.01006319, + "balance_loss_clip": 1.00221813, + "balance_loss_mlp": 1.02162504, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9071425511101782, + "language_loss": 0.62149519, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64230067, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.52734375, + "step": 1376, + "time_per_iteration": 2.991158962249756 + }, + { + "auxiliary_loss_clip": 0.01195866, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_clip": 1.04624534, + "balance_loss_mlp": 1.05995989, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9826192893196872, + "language_loss": 0.82601643, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84866917, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.359375, + "step": 1377, + "time_per_iteration": 2.5851728916168213 + }, + { + "auxiliary_loss_clip": 0.01188463, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.05601847, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.8760965133588865, + "language_loss": 0.84516692, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86762691, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1378, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.05516553, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.9551783234852504, + "language_loss": 0.87725681, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89978123, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3515625, + "step": 1379, + "time_per_iteration": 2.5428385734558105 + }, + { + "auxiliary_loss_clip": 0.01189534, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.03336358, + "balance_loss_mlp": 1.05776525, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 1.7573789229703745, + "language_loss": 0.78658688, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80904275, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1380, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.01061393, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.05878401, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2395713763978002, + "language_loss": 0.86146504, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88398302, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.3125, + "step": 1381, + "time_per_iteration": 2.470153331756592 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01060106, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.06063581, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.795546136319442, + "language_loss": 0.8817445, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90433335, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1382, + "time_per_iteration": 2.4352822303771973 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.0569818, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.6234570747150734, + "language_loss": 0.77606535, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79856908, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.34375, + "step": 1383, + "time_per_iteration": 2.45939040184021 + }, + { + "auxiliary_loss_clip": 0.01194291, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.05730414, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.1508484512905945, + "language_loss": 0.8293128, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85181862, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1384, + "time_per_iteration": 2.4773356914520264 + }, + { + "auxiliary_loss_clip": 0.01198678, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.02937245, + "balance_loss_mlp": 1.05890989, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.4890613364481893, + "language_loss": 0.84828049, + "learning_rate": 3.970306639845e-06, + "loss": 0.87081897, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3984375, + "step": 1385, + "time_per_iteration": 2.5084009170532227 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01066074, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.05825758, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.123672194513448, + "language_loss": 0.68744183, + "learning_rate": 3.970239740938835e-06, + "loss": 0.7100516, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3671875, + "step": 1386, + "time_per_iteration": 2.477592945098877 + }, + { + "auxiliary_loss_clip": 0.01191265, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03186047, + "balance_loss_mlp": 1.05579662, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7726596290820096, + "language_loss": 0.82067239, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84314626, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.359375, + "step": 1387, + "time_per_iteration": 2.529261350631714 + }, + { + "auxiliary_loss_clip": 0.01196512, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.05739772, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.385304875072474, + "language_loss": 0.77194649, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79461324, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.390625, + "step": 1388, + "time_per_iteration": 2.4517693519592285 + }, + { + "auxiliary_loss_clip": 0.01187734, + "auxiliary_loss_mlp": 0.01059717, + "balance_loss_clip": 1.0351125, + "balance_loss_mlp": 1.0574429, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.246368739161805, + "language_loss": 0.79078835, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81326282, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3046875, + "step": 1389, + "time_per_iteration": 2.4999983310699463 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.03368866, + "balance_loss_mlp": 1.05773938, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 4.533904477221136, + "language_loss": 0.87495124, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89746046, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.359375, + "step": 1390, + "time_per_iteration": 2.438126802444458 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.05621624, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6928828016377326, + "language_loss": 0.86753631, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89007682, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.34375, + "step": 1391, + "time_per_iteration": 2.5615429878234863 + }, + { + "auxiliary_loss_clip": 0.01198327, + "auxiliary_loss_mlp": 0.01071606, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.05904424, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 4.090701354718017, + "language_loss": 0.87550449, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89820385, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1392, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.03682983, + "balance_loss_mlp": 1.05556941, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.9857894096842457, + "language_loss": 0.80519998, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82771087, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1393, + "time_per_iteration": 3.9978342056274414 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01054176, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.05832088, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8413427873168604, + "language_loss": 0.84738398, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86984503, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3359375, + "step": 1394, + "time_per_iteration": 3.995389461517334 + }, + { + "auxiliary_loss_clip": 0.01193271, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.0296433, + "balance_loss_mlp": 1.05856824, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.7688902284368797, + "language_loss": 0.82957625, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85204601, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1395, + "time_per_iteration": 2.5080416202545166 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.03683722, + "balance_loss_mlp": 1.05833054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9626395114639965, + "language_loss": 0.82492781, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84750068, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3671875, + "step": 1396, + "time_per_iteration": 2.51763653755188 + }, + { + "auxiliary_loss_clip": 0.01191589, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.03253114, + "balance_loss_mlp": 1.05944824, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3756879295671367, + "language_loss": 0.7702114, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79271495, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.3203125, + "step": 1397, + "time_per_iteration": 2.522019624710083 + }, + { + "auxiliary_loss_clip": 0.01191257, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02635193, + "balance_loss_mlp": 1.05688787, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1333990758799795, + "language_loss": 0.77589226, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79831308, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.34375, + "step": 1398, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.03366995, + "balance_loss_mlp": 1.05604136, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 6.547707007931562, + "language_loss": 0.94411373, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96655744, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3125, + "step": 1399, + "time_per_iteration": 2.458564043045044 + }, + { + "auxiliary_loss_clip": 0.01192876, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.05564523, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3313569082148637, + "language_loss": 0.82052553, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84306407, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1400, + "time_per_iteration": 2.511075258255005 + }, + { + "auxiliary_loss_clip": 0.01191821, + "auxiliary_loss_mlp": 0.01061122, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.05681479, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.6029570836648723, + "language_loss": 0.86615682, + "learning_rate": 3.969227293371099e-06, + "loss": 0.8886863, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1401, + "time_per_iteration": 2.5328855514526367 + }, + { + "auxiliary_loss_clip": 0.01190636, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.05496573, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2778357332658543, + "language_loss": 0.87128234, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89382625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1402, + "time_per_iteration": 2.4695520401000977 + }, + { + "auxiliary_loss_clip": 0.0118725, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.03340352, + "balance_loss_mlp": 1.0542388, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.954964391273458, + "language_loss": 0.88680542, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90924418, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.328125, + "step": 1403, + "time_per_iteration": 2.6655161380767822 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01056388, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.05429792, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.9645692036725415, + "language_loss": 0.80325729, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82571673, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1404, + "time_per_iteration": 2.5011603832244873 + }, + { + "auxiliary_loss_clip": 0.01195719, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_clip": 1.04089534, + "balance_loss_mlp": 1.05798006, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1059643070764027, + "language_loss": 0.83845061, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86106849, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1405, + "time_per_iteration": 2.4612858295440674 + }, + { + "auxiliary_loss_clip": 0.01188265, + "auxiliary_loss_mlp": 0.01056168, + "balance_loss_clip": 1.03314888, + "balance_loss_mlp": 1.05381966, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.7581309060245893, + "language_loss": 0.80343008, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82587439, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.34375, + "step": 1406, + "time_per_iteration": 2.496676206588745 + }, + { + "auxiliary_loss_clip": 0.01192497, + "auxiliary_loss_mlp": 0.01065969, + "balance_loss_clip": 1.0421989, + "balance_loss_mlp": 1.05858994, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8241253914082192, + "language_loss": 0.79411483, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3359375, + "step": 1407, + "time_per_iteration": 2.491055727005005 + }, + { + "auxiliary_loss_clip": 0.01188371, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.03234673, + "balance_loss_mlp": 1.05521655, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 4.541456574357825, + "language_loss": 0.91929626, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94173807, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.328125, + "step": 1408, + "time_per_iteration": 2.44599986076355 + }, + { + "auxiliary_loss_clip": 0.01074059, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.02074611, + "balance_loss_mlp": 1.02193737, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8980094129226197, + "language_loss": 0.61861706, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63960779, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.5234375, + "step": 1409, + "time_per_iteration": 3.1084799766540527 + }, + { + "auxiliary_loss_clip": 0.01180993, + "auxiliary_loss_mlp": 0.01060196, + "balance_loss_clip": 1.03784466, + "balance_loss_mlp": 1.05419254, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.25814404402445, + "language_loss": 0.86819237, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89060426, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.265625, + "step": 1410, + "time_per_iteration": 2.4854791164398193 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.0309782, + "balance_loss_mlp": 1.05453801, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.048224684561652, + "language_loss": 0.74138093, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76383173, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3359375, + "step": 1411, + "time_per_iteration": 2.484879970550537 + }, + { + "auxiliary_loss_clip": 0.01068033, + "auxiliary_loss_mlp": 0.01005767, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.01640451, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9041737870208939, + "language_loss": 0.56723791, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58797586, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.515625, + "step": 1412, + "time_per_iteration": 3.003227949142456 + }, + { + "auxiliary_loss_clip": 0.01183878, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 1.05354273, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.0338814511208883, + "language_loss": 0.89084172, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91330159, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3046875, + "step": 1413, + "time_per_iteration": 2.4545698165893555 + }, + { + "auxiliary_loss_clip": 0.01186591, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03019929, + "balance_loss_mlp": 1.0562067, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1859301398641415, + "language_loss": 0.8807795, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90319026, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3046875, + "step": 1414, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01056984, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.0540117, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.82577143383273, + "language_loss": 0.77434587, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79677355, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3125, + "step": 1415, + "time_per_iteration": 2.510671615600586 + }, + { + "auxiliary_loss_clip": 0.01185616, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.05612898, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.848223104879299, + "language_loss": 0.70859981, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73111296, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.296875, + "step": 1416, + "time_per_iteration": 2.827016592025757 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.05693281, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 1.9370001986884609, + "language_loss": 0.74855268, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77108514, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1417, + "time_per_iteration": 2.51518177986145 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.03157723, + "balance_loss_mlp": 1.05394006, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.566029486363868, + "language_loss": 0.82460356, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84700227, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3046875, + "step": 1418, + "time_per_iteration": 2.4632515907287598 + }, + { + "auxiliary_loss_clip": 0.01078096, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.02221191, + "balance_loss_mlp": 1.0269177, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8662062784105238, + "language_loss": 0.56616145, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58720386, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.51171875, + "step": 1419, + "time_per_iteration": 3.0262646675109863 + }, + { + "auxiliary_loss_clip": 0.01185611, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_clip": 1.03858972, + "balance_loss_mlp": 1.05284262, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.301787344693911, + "language_loss": 0.69764268, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72012818, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.328125, + "step": 1420, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01182824, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02912498, + "balance_loss_mlp": 1.05232763, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.56579546013663, + "language_loss": 0.87886292, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90121067, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1421, + "time_per_iteration": 2.498198986053467 + }, + { + "auxiliary_loss_clip": 0.01069987, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.01909983, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7935144939089421, + "language_loss": 0.63490081, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65564084, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.5078125, + "step": 1422, + "time_per_iteration": 3.050874948501587 + }, + { + "auxiliary_loss_clip": 0.01182797, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.04182768, + "balance_loss_mlp": 1.05538559, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.040119561169685, + "language_loss": 0.83427018, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85674852, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1423, + "time_per_iteration": 2.525075674057007 + }, + { + "auxiliary_loss_clip": 0.01190455, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.05613029, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.7627385415604107, + "language_loss": 0.74945033, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77194929, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1424, + "time_per_iteration": 2.523231029510498 + }, + { + "auxiliary_loss_clip": 0.01185893, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.05510807, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9395290082560723, + "language_loss": 0.7574805, + "learning_rate": 3.96757243383196e-06, + "loss": 0.7799021, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1425, + "time_per_iteration": 2.441420793533325 + }, + { + "auxiliary_loss_clip": 0.01183386, + "auxiliary_loss_mlp": 0.01053965, + "balance_loss_clip": 1.03092194, + "balance_loss_mlp": 1.05407834, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.579491371045568, + "language_loss": 0.93504989, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95742333, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1426, + "time_per_iteration": 2.4703657627105713 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.05764198, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.235647808517122, + "language_loss": 0.75003266, + "learning_rate": 3.967432588494471e-06, + "loss": 0.772614, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.34375, + "step": 1427, + "time_per_iteration": 2.4430549144744873 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01061112, + "balance_loss_clip": 1.03907049, + "balance_loss_mlp": 1.05315089, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.3372587699614726, + "language_loss": 0.81915152, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84158677, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1428, + "time_per_iteration": 2.454441785812378 + }, + { + "auxiliary_loss_clip": 0.01189987, + "auxiliary_loss_mlp": 0.01066735, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.05586076, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.395570851050941, + "language_loss": 0.79697371, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81954098, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.34375, + "step": 1429, + "time_per_iteration": 2.5411579608917236 + }, + { + "auxiliary_loss_clip": 0.0119024, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.05773449, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.301464625204156, + "language_loss": 0.88055587, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90308148, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1430, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01184535, + "auxiliary_loss_mlp": 0.01072949, + "balance_loss_clip": 1.04995334, + "balance_loss_mlp": 1.05712664, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7504719201320615, + "language_loss": 0.81914723, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84172201, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2734375, + "step": 1431, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01184756, + "auxiliary_loss_mlp": 0.01056491, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.05376828, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.9949655353101803, + "language_loss": 0.77759397, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80000651, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1432, + "time_per_iteration": 2.5344104766845703 + }, + { + "auxiliary_loss_clip": 0.0118072, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.03497803, + "balance_loss_mlp": 1.05027151, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.2873036973179603, + "language_loss": 0.73330259, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75570011, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3046875, + "step": 1433, + "time_per_iteration": 2.4787938594818115 + }, + { + "auxiliary_loss_clip": 0.01188497, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.05464733, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.615593579271415, + "language_loss": 0.85741955, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87989259, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3359375, + "step": 1434, + "time_per_iteration": 5.500946998596191 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.05177212, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 3.0513138823403825, + "language_loss": 0.78913063, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81149966, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1435, + "time_per_iteration": 3.899777412414551 + }, + { + "auxiliary_loss_clip": 0.01070575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02010655, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8910926846424677, + "language_loss": 0.57930011, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60028332, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.5078125, + "step": 1436, + "time_per_iteration": 3.179255247116089 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.02633083, + "balance_loss_mlp": 1.05314159, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.429993259280604, + "language_loss": 0.68775386, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71010828, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.328125, + "step": 1437, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.01185365, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.02806163, + "balance_loss_mlp": 1.05388093, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.5641138848438163, + "language_loss": 0.7274068, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74976349, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3125, + "step": 1438, + "time_per_iteration": 2.4840176105499268 + }, + { + "auxiliary_loss_clip": 0.01183596, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_clip": 1.02594447, + "balance_loss_mlp": 1.05472374, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.681614476681305, + "language_loss": 0.64628494, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66861117, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2890625, + "step": 1439, + "time_per_iteration": 2.61686372756958 + }, + { + "auxiliary_loss_clip": 0.01187197, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.05638909, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.062065757985673, + "language_loss": 0.87748063, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89990479, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3125, + "step": 1440, + "time_per_iteration": 2.5116493701934814 + }, + { + "auxiliary_loss_clip": 0.01188419, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.03538251, + "balance_loss_mlp": 1.0540843, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.4102507257620363, + "language_loss": 0.83243793, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491961, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1441, + "time_per_iteration": 2.5058300495147705 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01009256, + "balance_loss_clip": 1.00525022, + "balance_loss_mlp": 1.01939523, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8461220926791603, + "language_loss": 0.60426581, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62505859, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.5078125, + "step": 1442, + "time_per_iteration": 3.1946628093719482 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01057232, + "balance_loss_clip": 1.03379524, + "balance_loss_mlp": 1.05709028, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.2809405592870835, + "language_loss": 0.79264277, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81513512, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.3515625, + "step": 1443, + "time_per_iteration": 2.477691411972046 + }, + { + "auxiliary_loss_clip": 0.01185255, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.03170311, + "balance_loss_mlp": 1.05261874, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.764762918327591, + "language_loss": 0.82248437, + "learning_rate": 3.966231856532584e-06, + "loss": 0.8448779, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1444, + "time_per_iteration": 2.584773063659668 + }, + { + "auxiliary_loss_clip": 0.01189581, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.02745867, + "balance_loss_mlp": 1.05537939, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.945627197742621, + "language_loss": 0.86856627, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89096129, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1445, + "time_per_iteration": 2.506258964538574 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01054326, + "balance_loss_clip": 1.03303528, + "balance_loss_mlp": 1.05808067, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9763924186655837, + "language_loss": 0.81639445, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8388319, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.3125, + "step": 1446, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.01005416, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.0147202, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.728477241136595, + "language_loss": 0.54725462, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56795579, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.5, + "step": 1447, + "time_per_iteration": 3.1009976863861084 + }, + { + "auxiliary_loss_clip": 0.01178637, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.03104973, + "balance_loss_mlp": 1.05198455, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.2332818090387243, + "language_loss": 0.84593046, + "learning_rate": 3.965946199367804e-06, + "loss": 0.8682456, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1448, + "time_per_iteration": 2.483792543411255 + }, + { + "auxiliary_loss_clip": 0.01185215, + "auxiliary_loss_mlp": 0.01056358, + "balance_loss_clip": 1.03386295, + "balance_loss_mlp": 1.0524509, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.099884448391289, + "language_loss": 0.80688727, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82930297, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1449, + "time_per_iteration": 2.4637081623077393 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.02862501, + "balance_loss_mlp": 1.05370414, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 4.183651889411507, + "language_loss": 0.71012592, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73244655, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1450, + "time_per_iteration": 2.6521542072296143 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.03057098, + "balance_loss_mlp": 1.05502534, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.8266796466048172, + "language_loss": 0.83492875, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85729253, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1451, + "time_per_iteration": 2.4866271018981934 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.05371869, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.850339391564711, + "language_loss": 0.74351519, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76589811, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2734375, + "step": 1452, + "time_per_iteration": 2.5450925827026367 + }, + { + "auxiliary_loss_clip": 0.01182798, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_clip": 1.03840256, + "balance_loss_mlp": 1.05121017, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.3421371051734474, + "language_loss": 0.79840016, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82084292, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1453, + "time_per_iteration": 2.49350643157959 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_clip": 1.04213262, + "balance_loss_mlp": 1.0545752, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.982640213979625, + "language_loss": 0.71298045, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73545539, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.28125, + "step": 1454, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02608728, + "balance_loss_mlp": 1.02026677, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7993884765543664, + "language_loss": 0.58655661, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6075514, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.0300293, + "router_z_loss_mlp": 0.5, + "step": 1455, + "time_per_iteration": 3.088113307952881 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05210626, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5590098662562957, + "language_loss": 0.77404714, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79646254, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3046875, + "step": 1456, + "time_per_iteration": 2.6145191192626953 + }, + { + "auxiliary_loss_clip": 0.01182283, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.02888715, + "balance_loss_mlp": 1.05235434, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.3657198267749777, + "language_loss": 0.72391665, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74625528, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1457, + "time_per_iteration": 2.6438605785369873 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.02623844, + "balance_loss_mlp": 1.05207849, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5929331180335078, + "language_loss": 0.86215973, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88442671, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1458, + "time_per_iteration": 2.539658546447754 + }, + { + "auxiliary_loss_clip": 0.01189161, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.05887103, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.660016084678777, + "language_loss": 0.80662763, + "learning_rate": 3.965154492406486e-06, + "loss": 0.8291173, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1459, + "time_per_iteration": 2.4880902767181396 + }, + { + "auxiliary_loss_clip": 0.01187526, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.05512893, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.474003232718447, + "language_loss": 0.84058738, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86300415, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.328125, + "step": 1460, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01178547, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.05051732, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.696872821623283, + "language_loss": 0.81030595, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83263445, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.28125, + "step": 1461, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.01187345, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_clip": 1.03795433, + "balance_loss_mlp": 1.05579305, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.656505593412751, + "language_loss": 0.76405656, + "learning_rate": 3.964937007276932e-06, + "loss": 0.786529, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3125, + "step": 1462, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01190578, + "auxiliary_loss_mlp": 0.01058183, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.05753493, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.4277854967530663, + "language_loss": 0.74615479, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76864231, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.328125, + "step": 1463, + "time_per_iteration": 2.46510648727417 + }, + { + "auxiliary_loss_clip": 0.01189177, + "auxiliary_loss_mlp": 0.0106376, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.05380559, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.09054267836168, + "language_loss": 0.83423382, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85676318, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3515625, + "step": 1464, + "time_per_iteration": 2.5343735218048096 + }, + { + "auxiliary_loss_clip": 0.01183588, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.05336595, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 4.267071209901202, + "language_loss": 0.78351951, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80604541, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.296875, + "step": 1465, + "time_per_iteration": 2.4745209217071533 + }, + { + "auxiliary_loss_clip": 0.01190864, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_clip": 1.0371089, + "balance_loss_mlp": 1.05628061, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8950228405880263, + "language_loss": 0.84698099, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.86948144, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.34375, + "step": 1466, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.0105874, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.05407715, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 3.8136580791310783, + "language_loss": 0.84233636, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86477506, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1467, + "time_per_iteration": 2.5413413047790527 + }, + { + "auxiliary_loss_clip": 0.01183856, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7481416698073104, + "language_loss": 0.75517243, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7775712, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1468, + "time_per_iteration": 2.496363878250122 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.05570245, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.7579385887345491, + "language_loss": 0.80601043, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82842672, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2890625, + "step": 1469, + "time_per_iteration": 2.5486512184143066 + }, + { + "auxiliary_loss_clip": 0.01187777, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.0321182, + "balance_loss_mlp": 1.05454695, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 3.202810753535508, + "language_loss": 0.77607989, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7985025, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3359375, + "step": 1470, + "time_per_iteration": 2.6632297039031982 + }, + { + "auxiliary_loss_clip": 0.01182287, + "auxiliary_loss_mlp": 0.0106647, + "balance_loss_clip": 1.04266429, + "balance_loss_mlp": 1.05412459, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.774803600242038, + "language_loss": 0.84233272, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86482024, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.28125, + "step": 1471, + "time_per_iteration": 2.5040950775146484 + }, + { + "auxiliary_loss_clip": 0.01178062, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.05459309, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6761790638208889, + "language_loss": 0.83481324, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85712093, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.234375, + "step": 1472, + "time_per_iteration": 2.5079073905944824 + }, + { + "auxiliary_loss_clip": 0.01185739, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.05491877, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.396127276436556, + "language_loss": 0.828246, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85069156, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1473, + "time_per_iteration": 2.4919679164886475 + }, + { + "auxiliary_loss_clip": 0.01183368, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.04040098, + "balance_loss_mlp": 1.05414963, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.8346488607114506, + "language_loss": 0.78871369, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81116265, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1474, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01057333, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.05450511, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.918961213895669, + "language_loss": 0.79045832, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81284976, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1475, + "time_per_iteration": 2.495753765106201 + }, + { + "auxiliary_loss_clip": 0.01184034, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.05443335, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6510632676992876, + "language_loss": 0.73973525, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76205671, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1476, + "time_per_iteration": 6.925957679748535 + }, + { + "auxiliary_loss_clip": 0.0118493, + "auxiliary_loss_mlp": 0.01060562, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.05454326, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.527991814504802, + "language_loss": 0.74644423, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76889908, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3046875, + "step": 1477, + "time_per_iteration": 2.6033589839935303 + }, + { + "auxiliary_loss_clip": 0.01181345, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.02571976, + "balance_loss_mlp": 1.05315852, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.4237564416671002, + "language_loss": 0.86488914, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88718438, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1478, + "time_per_iteration": 2.5188398361206055 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.03599334, + "balance_loss_mlp": 1.05417609, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 7.715019285918926, + "language_loss": 0.77988106, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80228484, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.28125, + "step": 1479, + "time_per_iteration": 2.50730562210083 + }, + { + "auxiliary_loss_clip": 0.01180801, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 1.05275774, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.3628139464189815, + "language_loss": 0.78267598, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80501914, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1480, + "time_per_iteration": 2.512730360031128 + }, + { + "auxiliary_loss_clip": 0.01185027, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.05357075, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 3.1949876590170825, + "language_loss": 0.66627192, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68875289, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3125, + "step": 1481, + "time_per_iteration": 2.4874138832092285 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.03040504, + "balance_loss_mlp": 1.05519605, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.9560930463008703, + "language_loss": 0.9644348, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98677909, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.2734375, + "step": 1482, + "time_per_iteration": 2.484274387359619 + }, + { + "auxiliary_loss_clip": 0.01190541, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 1.0577234, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.358614174414972, + "language_loss": 0.78436875, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80683142, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.328125, + "step": 1483, + "time_per_iteration": 2.566199779510498 + }, + { + "auxiliary_loss_clip": 0.01183147, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.04891825, + "balance_loss_mlp": 1.05463076, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.232834813834399, + "language_loss": 0.86091626, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88347292, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1484, + "time_per_iteration": 2.4742467403411865 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.04152799, + "balance_loss_mlp": 1.0570302, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7135103732453094, + "language_loss": 0.80460989, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82716757, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.34375, + "step": 1485, + "time_per_iteration": 2.5808591842651367 + }, + { + "auxiliary_loss_clip": 0.01182644, + "auxiliary_loss_mlp": 0.01059654, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.05256486, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.0833446931013144, + "language_loss": 0.8295821, + "learning_rate": 3.96317299108688e-06, + "loss": 0.852005, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1486, + "time_per_iteration": 2.5060923099517822 + }, + { + "auxiliary_loss_clip": 0.01184012, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.05506349, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6673763915473876, + "language_loss": 0.76653707, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78897893, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1487, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.01181982, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.05203557, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.360836711926668, + "language_loss": 0.83246535, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85491836, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.296875, + "step": 1488, + "time_per_iteration": 2.48189377784729 + }, + { + "auxiliary_loss_clip": 0.01180173, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.05375743, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9508187836998312, + "language_loss": 0.71647823, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73879659, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.265625, + "step": 1489, + "time_per_iteration": 2.701035737991333 + }, + { + "auxiliary_loss_clip": 0.01178824, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.05088401, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8144641128553483, + "language_loss": 0.89490288, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91722786, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1490, + "time_per_iteration": 2.676098108291626 + }, + { + "auxiliary_loss_clip": 0.01187914, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_clip": 1.0416671, + "balance_loss_mlp": 1.05264366, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.165908760559946, + "language_loss": 0.73276365, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75528657, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3515625, + "step": 1491, + "time_per_iteration": 2.5531163215637207 + }, + { + "auxiliary_loss_clip": 0.01181575, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.02984166, + "balance_loss_mlp": 1.05362582, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6884120279290091, + "language_loss": 0.77121007, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79353207, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.28125, + "step": 1492, + "time_per_iteration": 2.485531806945801 + }, + { + "auxiliary_loss_clip": 0.01180742, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.03324914, + "balance_loss_mlp": 1.05471706, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.0059524225222414, + "language_loss": 0.71168351, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73404551, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2578125, + "step": 1493, + "time_per_iteration": 2.5819149017333984 + }, + { + "auxiliary_loss_clip": 0.01184961, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.03304577, + "balance_loss_mlp": 1.05477107, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.7443337417031568, + "language_loss": 0.86910093, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89151227, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1494, + "time_per_iteration": 2.491126775741577 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.05289626, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7176751495851263, + "language_loss": 0.83065581, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1495, + "time_per_iteration": 2.463747501373291 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.05825078, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.861203767183833, + "language_loss": 0.69813877, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72057784, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1496, + "time_per_iteration": 2.4409985542297363 + }, + { + "auxiliary_loss_clip": 0.01180533, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03035557, + "balance_loss_mlp": 1.05325341, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6091347390483586, + "language_loss": 0.79913563, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82145333, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2734375, + "step": 1497, + "time_per_iteration": 2.492732048034668 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.0105809, + "balance_loss_clip": 1.03484416, + "balance_loss_mlp": 1.05299318, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.3611651581227915, + "language_loss": 0.8262192, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84866548, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3359375, + "step": 1498, + "time_per_iteration": 2.492124080657959 + }, + { + "auxiliary_loss_clip": 0.01188542, + "auxiliary_loss_mlp": 0.01061597, + "balance_loss_clip": 1.0402112, + "balance_loss_mlp": 1.05628157, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.316244908481527, + "language_loss": 0.7849865, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80748791, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 1499, + "time_per_iteration": 2.455986738204956 + }, + { + "auxiliary_loss_clip": 0.0117942, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.05351877, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.652937184766999, + "language_loss": 0.93453979, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95688522, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1500, + "time_per_iteration": 2.481450080871582 + }, + { + "auxiliary_loss_clip": 0.01182931, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.05170345, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.9257189866461966, + "language_loss": 0.74465239, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76699102, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3125, + "step": 1501, + "time_per_iteration": 2.4806344509124756 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 0.99992049, + "balance_loss_mlp": 1.02834833, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7322723529864947, + "language_loss": 0.58304042, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60384637, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.48828125, + "step": 1502, + "time_per_iteration": 3.066755771636963 + }, + { + "auxiliary_loss_clip": 0.01178455, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.02655029, + "balance_loss_mlp": 1.05134845, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.407651446444188, + "language_loss": 0.69502187, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71728474, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2734375, + "step": 1503, + "time_per_iteration": 2.608006000518799 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.0508244, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.015182939383952, + "language_loss": 0.86142361, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88378185, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.3046875, + "step": 1504, + "time_per_iteration": 2.489906072616577 + }, + { + "auxiliary_loss_clip": 0.01188306, + "auxiliary_loss_mlp": 0.01064134, + "balance_loss_clip": 1.03871906, + "balance_loss_mlp": 1.05330658, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.9466916160800904, + "language_loss": 0.72267938, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74520379, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1505, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01179818, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.05332816, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3726339000283447, + "language_loss": 0.80946511, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83180916, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.265625, + "step": 1506, + "time_per_iteration": 2.4512932300567627 + }, + { + "auxiliary_loss_clip": 0.01179114, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.0531404, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1533698580433254, + "language_loss": 0.76043189, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78271914, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.2578125, + "step": 1507, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01009923, + "balance_loss_clip": 1.00679994, + "balance_loss_mlp": 1.01922798, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7312512202665958, + "language_loss": 0.57670546, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59747648, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.03112793, + "router_z_loss_mlp": 0.48046875, + "step": 1508, + "time_per_iteration": 2.9330992698669434 + }, + { + "auxiliary_loss_clip": 0.01182207, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.0313319, + "balance_loss_mlp": 1.05309892, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.072562238387217, + "language_loss": 0.85046542, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87281442, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1509, + "time_per_iteration": 2.475606918334961 + }, + { + "auxiliary_loss_clip": 0.01189974, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.05606115, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.413703760690829, + "language_loss": 0.84302551, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3359375, + "step": 1510, + "time_per_iteration": 2.576070785522461 + }, + { + "auxiliary_loss_clip": 0.01184002, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.0335387, + "balance_loss_mlp": 1.05408144, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9204492801986277, + "language_loss": 0.85558611, + "learning_rate": 3.961289878108262e-06, + "loss": 0.8779816, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.296875, + "step": 1511, + "time_per_iteration": 2.5085484981536865 + }, + { + "auxiliary_loss_clip": 0.01181957, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.02690685, + "balance_loss_mlp": 1.05469918, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5775523407684693, + "language_loss": 0.84897017, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87127548, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2734375, + "step": 1512, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01175178, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.02888274, + "balance_loss_mlp": 1.05033123, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.9006324958480167, + "language_loss": 0.86704344, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88929009, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.25, + "step": 1513, + "time_per_iteration": 2.475271701812744 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.0536902, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6716164971548293, + "language_loss": 0.86379707, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8861233, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.2734375, + "step": 1514, + "time_per_iteration": 2.5317347049713135 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.05550981, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.9279276264910965, + "language_loss": 0.89882755, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92124808, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.265625, + "step": 1515, + "time_per_iteration": 2.5507757663726807 + }, + { + "auxiliary_loss_clip": 0.011822, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.03174293, + "balance_loss_mlp": 1.05321527, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0145121179505905, + "language_loss": 0.85567206, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87803847, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1516, + "time_per_iteration": 2.524787425994873 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.05217946, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5232376391767188, + "language_loss": 0.81104374, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83340514, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.3125, + "step": 1517, + "time_per_iteration": 2.5781173706054688 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.01068952, + "balance_loss_clip": 1.04729199, + "balance_loss_mlp": 1.05378699, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6026665805728266, + "language_loss": 0.78008473, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80262554, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3125, + "step": 1518, + "time_per_iteration": 4.000938653945923 + }, + { + "auxiliary_loss_clip": 0.01179619, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.03851235, + "balance_loss_mlp": 1.05189955, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.883609624415087, + "language_loss": 0.86375809, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88615477, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.28125, + "step": 1519, + "time_per_iteration": 3.945183277130127 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.01053198, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.05196333, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.4149150298084425, + "language_loss": 0.73425877, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75659597, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.28125, + "step": 1520, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01181506, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.0525614, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6960755220153825, + "language_loss": 0.85296613, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87533194, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2890625, + "step": 1521, + "time_per_iteration": 2.478440761566162 + }, + { + "auxiliary_loss_clip": 0.01183058, + "auxiliary_loss_mlp": 0.01057495, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.05319118, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.1543470058122876, + "language_loss": 0.83979875, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86220425, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.296875, + "step": 1522, + "time_per_iteration": 2.4761834144592285 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.03500533, + "balance_loss_mlp": 1.05125594, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.174137545904809, + "language_loss": 0.810691, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83301324, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.234375, + "step": 1523, + "time_per_iteration": 2.525385618209839 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.05365944, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.529065997296093, + "language_loss": 0.74591744, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76838291, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.296875, + "step": 1524, + "time_per_iteration": 2.4293112754821777 + }, + { + "auxiliary_loss_clip": 0.01181121, + "auxiliary_loss_mlp": 0.01060116, + "balance_loss_clip": 1.03822935, + "balance_loss_mlp": 1.05373263, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0870290485059586, + "language_loss": 0.861516, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88392842, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1525, + "time_per_iteration": 2.449774980545044 + }, + { + "auxiliary_loss_clip": 0.01181752, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.02577078, + "balance_loss_mlp": 1.05424511, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.3600448138049597, + "language_loss": 0.74690467, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76919985, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1526, + "time_per_iteration": 2.5295088291168213 + }, + { + "auxiliary_loss_clip": 0.01177679, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03058743, + "balance_loss_mlp": 1.05291057, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.970734062299861, + "language_loss": 0.7736311, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79592943, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1527, + "time_per_iteration": 2.465484142303467 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.05090261, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.9755082573034908, + "language_loss": 0.78465801, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80698651, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1528, + "time_per_iteration": 2.5257718563079834 + }, + { + "auxiliary_loss_clip": 0.01177926, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.0276351, + "balance_loss_mlp": 1.05085492, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.6736868569465813, + "language_loss": 0.76880527, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79107177, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2734375, + "step": 1529, + "time_per_iteration": 2.4417288303375244 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.0306139, + "balance_loss_mlp": 1.05037212, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.767002219307874, + "language_loss": 0.83118784, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85352623, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.296875, + "step": 1530, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01173477, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.03723454, + "balance_loss_mlp": 1.05024123, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.058190265763826, + "language_loss": 0.8408612, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86318833, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1531, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.01177383, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.02728868, + "balance_loss_mlp": 1.05083799, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.8347699676368683, + "language_loss": 0.81135088, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83361435, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1532, + "time_per_iteration": 2.506875991821289 + }, + { + "auxiliary_loss_clip": 0.01179012, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.03044105, + "balance_loss_mlp": 1.05169332, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.8650949584676202, + "language_loss": 0.83489287, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85721242, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2734375, + "step": 1533, + "time_per_iteration": 2.5279369354248047 + }, + { + "auxiliary_loss_clip": 0.01181754, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.02662432, + "balance_loss_mlp": 1.05468941, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.8226281566677605, + "language_loss": 0.89789164, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92019475, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1534, + "time_per_iteration": 2.498732089996338 + }, + { + "auxiliary_loss_clip": 0.01178154, + "auxiliary_loss_mlp": 0.01064045, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.04994035, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.6410414613778777, + "language_loss": 0.75911283, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78153479, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.28125, + "step": 1535, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.04907823, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8388387816947327, + "language_loss": 0.81344318, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83558822, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1536, + "time_per_iteration": 2.5075631141662598 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.04995418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.109198419692537, + "language_loss": 0.8921392, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91439736, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1537, + "time_per_iteration": 2.4454562664031982 + }, + { + "auxiliary_loss_clip": 0.01177438, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.03638315, + "balance_loss_mlp": 1.05164456, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.1959440535625285, + "language_loss": 0.8072964, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82966185, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2578125, + "step": 1538, + "time_per_iteration": 2.50838303565979 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.0101212, + "balance_loss_clip": 1.0091517, + "balance_loss_mlp": 1.01794529, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.74443800558722, + "language_loss": 0.57375526, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59453678, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.48046875, + "step": 1539, + "time_per_iteration": 3.16038179397583 + }, + { + "auxiliary_loss_clip": 0.01179737, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.05291581, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.903908071477431, + "language_loss": 0.67164814, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69395947, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.265625, + "step": 1540, + "time_per_iteration": 2.488809585571289 + }, + { + "auxiliary_loss_clip": 0.01178592, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02154827, + "balance_loss_mlp": 1.05285096, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.797248436862791, + "language_loss": 0.83666921, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85888791, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1541, + "time_per_iteration": 2.5406758785247803 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01009011, + "balance_loss_clip": 1.0061146, + "balance_loss_mlp": 1.01339245, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8904869203130611, + "language_loss": 0.6196329, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64032996, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.47265625, + "step": 1542, + "time_per_iteration": 3.0973262786865234 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03486192, + "balance_loss_mlp": 1.05283189, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.711071573157868, + "language_loss": 0.82672381, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84905624, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.25, + "step": 1543, + "time_per_iteration": 2.489415168762207 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.03462195, + "balance_loss_mlp": 1.05128777, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6169278883375504, + "language_loss": 0.72058821, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74287981, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1544, + "time_per_iteration": 2.7986748218536377 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.05111873, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7012123784712243, + "language_loss": 0.77617419, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79842126, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1545, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01050414, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0525856, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.736353511607615, + "language_loss": 0.74531418, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76755565, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1546, + "time_per_iteration": 2.456806182861328 + }, + { + "auxiliary_loss_clip": 0.01180806, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.02968979, + "balance_loss_mlp": 1.05292201, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.1086065935537284, + "language_loss": 0.84392273, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86624783, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1547, + "time_per_iteration": 2.5041439533233643 + }, + { + "auxiliary_loss_clip": 0.01177454, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.05125856, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 7.120670718523448, + "language_loss": 0.67616034, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6984657, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1548, + "time_per_iteration": 2.513141393661499 + }, + { + "auxiliary_loss_clip": 0.01178735, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.05175209, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.761700755369037, + "language_loss": 0.83445251, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85676992, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.265625, + "step": 1549, + "time_per_iteration": 2.4822285175323486 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.05560291, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7822943519837542, + "language_loss": 0.75744081, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77969635, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2578125, + "step": 1550, + "time_per_iteration": 2.5503265857696533 + }, + { + "auxiliary_loss_clip": 0.01179426, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.04579496, + "balance_loss_mlp": 1.05118561, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.0184762942100876, + "language_loss": 0.83272278, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85520893, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.28125, + "step": 1551, + "time_per_iteration": 2.4962081909179688 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_clip": 1.0051949, + "balance_loss_mlp": 1.01350796, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7800746873014213, + "language_loss": 0.6182366, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6389209, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.47070312, + "step": 1552, + "time_per_iteration": 3.2178378105163574 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01003668, + "balance_loss_clip": 1.00099754, + "balance_loss_mlp": 1.01257896, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8330449834122059, + "language_loss": 0.5895977, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61022902, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.46875, + "step": 1553, + "time_per_iteration": 3.220923900604248 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.05040002, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.0753391269624797, + "language_loss": 0.87452686, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89689714, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.28125, + "step": 1554, + "time_per_iteration": 2.5448763370513916 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.0310595, + "balance_loss_mlp": 1.05265594, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.2438919833216913, + "language_loss": 0.81355709, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83583468, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1555, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.00581956, + "balance_loss_mlp": 1.01259685, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8701907042199977, + "language_loss": 0.59583747, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61651003, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4609375, + "step": 1556, + "time_per_iteration": 3.0923824310302734 + }, + { + "auxiliary_loss_clip": 0.01177126, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.03518105, + "balance_loss_mlp": 1.05278862, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5891177576034032, + "language_loss": 0.84455961, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86689359, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1557, + "time_per_iteration": 2.5973968505859375 + }, + { + "auxiliary_loss_clip": 0.01175988, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03799307, + "balance_loss_mlp": 1.05065048, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.787574567308206, + "language_loss": 0.77987397, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80224895, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.25, + "step": 1558, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01178258, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.05035424, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0310113035260873, + "language_loss": 0.7998119, + "learning_rate": 3.957544040455379e-06, + "loss": 0.822142, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1559, + "time_per_iteration": 5.3233802318573 + }, + { + "auxiliary_loss_clip": 0.01172855, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.05015147, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.9877315441152976, + "language_loss": 0.76720232, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78956437, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1560, + "time_per_iteration": 3.863935947418213 + }, + { + "auxiliary_loss_clip": 0.01180546, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.05101645, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6628394684514, + "language_loss": 0.81219828, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83460152, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1561, + "time_per_iteration": 2.5050160884857178 + }, + { + "auxiliary_loss_clip": 0.01175131, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.02641547, + "balance_loss_mlp": 1.04764926, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.307547697406205, + "language_loss": 0.61553764, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63777232, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1562, + "time_per_iteration": 2.5884838104248047 + }, + { + "auxiliary_loss_clip": 0.01177686, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.0552876, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.5948914783661468, + "language_loss": 0.84981585, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87219155, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1563, + "time_per_iteration": 2.427928924560547 + }, + { + "auxiliary_loss_clip": 0.01172512, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.02767134, + "balance_loss_mlp": 1.05013323, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.8141046481233785, + "language_loss": 0.76106739, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78327298, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.21875, + "step": 1564, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01055133, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.05290008, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0134268414891388, + "language_loss": 0.7971766, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81950086, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.25, + "step": 1565, + "time_per_iteration": 2.470870018005371 + }, + { + "auxiliary_loss_clip": 0.01175133, + "auxiliary_loss_mlp": 0.01069432, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.0497129, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8353632925340597, + "language_loss": 0.75241816, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77486378, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1566, + "time_per_iteration": 2.4962053298950195 + }, + { + "auxiliary_loss_clip": 0.0117411, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.03746092, + "balance_loss_mlp": 1.04822683, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.55149440594841, + "language_loss": 0.77724433, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79957557, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1567, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01178494, + "auxiliary_loss_mlp": 0.01054706, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.05183101, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.293964487000622, + "language_loss": 0.82571244, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8480444, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.265625, + "step": 1568, + "time_per_iteration": 2.5221774578094482 + }, + { + "auxiliary_loss_clip": 0.01179838, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.02527881, + "balance_loss_mlp": 1.05191278, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 4.3822924949764515, + "language_loss": 0.7658236, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78810549, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.28125, + "step": 1569, + "time_per_iteration": 2.464019775390625 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.03004718, + "balance_loss_mlp": 1.04984534, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.11394347406088, + "language_loss": 0.86315012, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88538271, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1570, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.01177967, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.05340183, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6480791038221163, + "language_loss": 0.76531005, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78758156, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1571, + "time_per_iteration": 2.5270462036132812 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.02848995, + "balance_loss_mlp": 1.0496099, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.780883866775424, + "language_loss": 0.79518712, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81737661, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1572, + "time_per_iteration": 2.477403163909912 + }, + { + "auxiliary_loss_clip": 0.01172702, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03288805, + "balance_loss_mlp": 1.05036175, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8458711299535766, + "language_loss": 0.87948155, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90174723, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1573, + "time_per_iteration": 2.5164122581481934 + }, + { + "auxiliary_loss_clip": 0.01177194, + "auxiliary_loss_mlp": 0.01059795, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.05045378, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.770313323609274, + "language_loss": 0.81827116, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84064102, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.265625, + "step": 1574, + "time_per_iteration": 2.5540831089019775 + }, + { + "auxiliary_loss_clip": 0.01178056, + "auxiliary_loss_mlp": 0.01052269, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.05359375, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.139236970889498, + "language_loss": 0.80922085, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83152413, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1575, + "time_per_iteration": 2.4874608516693115 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01063693, + "balance_loss_clip": 1.04184198, + "balance_loss_mlp": 1.05048943, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.1107661515601, + "language_loss": 0.86745369, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88981628, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1576, + "time_per_iteration": 2.514961004257202 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.01272786, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9836929902555142, + "language_loss": 0.65832257, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67916429, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.45703125, + "step": 1577, + "time_per_iteration": 3.042998790740967 + }, + { + "auxiliary_loss_clip": 0.01175806, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.02504635, + "balance_loss_mlp": 1.05083144, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 3.158821122445177, + "language_loss": 0.79113019, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81334484, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1578, + "time_per_iteration": 2.492605447769165 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.04935408, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6941125689582233, + "language_loss": 0.77994359, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80223954, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1579, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01176838, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.0317533, + "balance_loss_mlp": 1.05228639, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.3012950697800747, + "language_loss": 0.73576474, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75807726, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2421875, + "step": 1580, + "time_per_iteration": 2.500426769256592 + }, + { + "auxiliary_loss_clip": 0.01171524, + "auxiliary_loss_mlp": 0.01053034, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 1.05162525, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.820694860574998, + "language_loss": 0.77813822, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80038381, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1581, + "time_per_iteration": 2.569086790084839 + }, + { + "auxiliary_loss_clip": 0.01177083, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.05315304, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.1718701740895443, + "language_loss": 0.86914808, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89150703, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.234375, + "step": 1582, + "time_per_iteration": 2.476386785507202 + }, + { + "auxiliary_loss_clip": 0.01178411, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.05487967, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.7496793522695477, + "language_loss": 0.66838771, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6907438, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.234375, + "step": 1583, + "time_per_iteration": 2.4433302879333496 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01052141, + "balance_loss_clip": 1.02919281, + "balance_loss_mlp": 1.0555923, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8272679383640855, + "language_loss": 0.70314872, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7254771, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.25, + "step": 1584, + "time_per_iteration": 2.5352938175201416 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.0527246, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5429491827095454, + "language_loss": 0.80649364, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82881135, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2421875, + "step": 1585, + "time_per_iteration": 2.5006139278411865 + }, + { + "auxiliary_loss_clip": 0.01179471, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.05324364, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.5763794615860258, + "language_loss": 0.7156626, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73802292, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.265625, + "step": 1586, + "time_per_iteration": 2.510941982269287 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.010121, + "balance_loss_clip": 1.00946522, + "balance_loss_mlp": 1.01272035, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8128409972345002, + "language_loss": 0.55392706, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57462931, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.453125, + "step": 1587, + "time_per_iteration": 2.8747992515563965 + }, + { + "auxiliary_loss_clip": 0.0118109, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.0345006, + "balance_loss_mlp": 1.0550952, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.813611272618652, + "language_loss": 0.81023234, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83260405, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1588, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.01178114, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.05471849, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 2.1843830695972835, + "language_loss": 0.81552076, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83785045, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1589, + "time_per_iteration": 2.4995651245117188 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.05340207, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.4497838373443381, + "language_loss": 0.65005404, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1590, + "time_per_iteration": 2.7222375869750977 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01051921, + "balance_loss_clip": 1.03036785, + "balance_loss_mlp": 1.05178595, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.714913693600035, + "language_loss": 0.83272862, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85498345, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1591, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.05280709, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.268244689889179, + "language_loss": 0.74068749, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76304293, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.265625, + "step": 1592, + "time_per_iteration": 2.446272373199463 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.0282129, + "balance_loss_mlp": 1.05028248, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.9287746031752921, + "language_loss": 0.74135411, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1593, + "time_per_iteration": 2.518038749694824 + }, + { + "auxiliary_loss_clip": 0.01175652, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.03061128, + "balance_loss_mlp": 1.05365515, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8251705146793997, + "language_loss": 0.69907188, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72134066, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.21875, + "step": 1594, + "time_per_iteration": 2.5454983711242676 + }, + { + "auxiliary_loss_clip": 0.01174594, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.03062999, + "balance_loss_mlp": 1.05023921, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.596137828422853, + "language_loss": 0.82464099, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84689802, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1595, + "time_per_iteration": 2.472062826156616 + }, + { + "auxiliary_loss_clip": 0.01176658, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.02803886, + "balance_loss_mlp": 1.05217803, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.0311987750358953, + "language_loss": 0.84673214, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86900425, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2421875, + "step": 1596, + "time_per_iteration": 2.4801599979400635 + }, + { + "auxiliary_loss_clip": 0.01181467, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.02871156, + "balance_loss_mlp": 1.05628884, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.531539932785817, + "language_loss": 0.68993127, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71225667, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1597, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01175632, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.02370429, + "balance_loss_mlp": 1.04902959, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.18946094151333, + "language_loss": 0.74929029, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77149749, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1598, + "time_per_iteration": 2.474071502685547 + }, + { + "auxiliary_loss_clip": 0.01179079, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.05284083, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6350676424235815, + "language_loss": 0.69002283, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7122978, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1599, + "time_per_iteration": 2.5599992275238037 + }, + { + "auxiliary_loss_clip": 0.01174972, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.05169392, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.013538613147854, + "language_loss": 0.840271, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8625865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1600, + "time_per_iteration": 2.4882116317749023 + }, + { + "auxiliary_loss_clip": 0.01174537, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.03162694, + "balance_loss_mlp": 1.05098653, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.038904015519863, + "language_loss": 0.8034178, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82569081, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.234375, + "step": 1601, + "time_per_iteration": 5.328745365142822 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.03031266, + "balance_loss_mlp": 1.05090928, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.183236390866488, + "language_loss": 0.82405198, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84635913, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.2734375, + "step": 1602, + "time_per_iteration": 2.4609556198120117 + }, + { + "auxiliary_loss_clip": 0.01172805, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05170703, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.882331764966583, + "language_loss": 0.62527591, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64752185, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1603, + "time_per_iteration": 2.4974379539489746 + }, + { + "auxiliary_loss_clip": 0.01178105, + "auxiliary_loss_mlp": 0.01054363, + "balance_loss_clip": 1.03049707, + "balance_loss_mlp": 1.05224609, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.347327571135852, + "language_loss": 0.71259016, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73491484, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2578125, + "step": 1604, + "time_per_iteration": 2.5012693405151367 + }, + { + "auxiliary_loss_clip": 0.01172586, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.0368669, + "balance_loss_mlp": 1.05051208, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.129697971326249, + "language_loss": 0.79487669, + "learning_rate": 3.953793790294527e-06, + "loss": 0.8171708, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.21875, + "step": 1605, + "time_per_iteration": 2.5392873287200928 + }, + { + "auxiliary_loss_clip": 0.01176232, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.04916394, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 3.698123586343809, + "language_loss": 0.74810207, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77030694, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2734375, + "step": 1606, + "time_per_iteration": 2.4922726154327393 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02391386, + "balance_loss_mlp": 1.05243278, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.649703340967918, + "language_loss": 0.75382137, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77603066, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.234375, + "step": 1607, + "time_per_iteration": 2.4787087440490723 + }, + { + "auxiliary_loss_clip": 0.0117289, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.02206647, + "balance_loss_mlp": 1.04831934, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.262571531890369, + "language_loss": 0.86648059, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88863426, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.25, + "step": 1608, + "time_per_iteration": 2.435391664505005 + }, + { + "auxiliary_loss_clip": 0.01183391, + "auxiliary_loss_mlp": 0.01056654, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.05276418, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.2277980990408297, + "language_loss": 0.70968121, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73208165, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.3046875, + "step": 1609, + "time_per_iteration": 2.599719762802124 + }, + { + "auxiliary_loss_clip": 0.01173214, + "auxiliary_loss_mlp": 0.01054271, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.04860282, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.7787270736621674, + "language_loss": 0.84566712, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86794198, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1610, + "time_per_iteration": 2.446676254272461 + }, + { + "auxiliary_loss_clip": 0.01177531, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.02781224, + "balance_loss_mlp": 1.05382621, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.0483419743874682, + "language_loss": 0.67360532, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69587982, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1611, + "time_per_iteration": 2.520211696624756 + }, + { + "auxiliary_loss_clip": 0.01177545, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.03000879, + "balance_loss_mlp": 1.05313492, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6625909003061596, + "language_loss": 0.81166416, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83394641, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2421875, + "step": 1612, + "time_per_iteration": 2.449491262435913 + }, + { + "auxiliary_loss_clip": 0.01180036, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.05431938, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.509420249413084, + "language_loss": 0.80708754, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82950538, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1613, + "time_per_iteration": 2.4753763675689697 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.03019738, + "balance_loss_mlp": 1.05074048, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.0025313344872484, + "language_loss": 0.84173608, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86399966, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2265625, + "step": 1614, + "time_per_iteration": 2.5492141246795654 + }, + { + "auxiliary_loss_clip": 0.01065917, + "auxiliary_loss_mlp": 0.010187, + "balance_loss_clip": 1.01610088, + "balance_loss_mlp": 1.019063, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7078098108364695, + "language_loss": 0.54584575, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56669194, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.46875, + "step": 1615, + "time_per_iteration": 3.1041057109832764 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00491357, + "balance_loss_mlp": 1.01844954, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637649269659756, + "language_loss": 0.5822649, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60299873, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.47460938, + "step": 1616, + "time_per_iteration": 3.215376377105713 + }, + { + "auxiliary_loss_clip": 0.01178513, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.05275226, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.690325520565165, + "language_loss": 0.69293094, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71527421, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2578125, + "step": 1617, + "time_per_iteration": 2.458017587661743 + }, + { + "auxiliary_loss_clip": 0.01176727, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.03116739, + "balance_loss_mlp": 1.05130577, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.7927692696889819, + "language_loss": 0.80748308, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8298068, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.25, + "step": 1618, + "time_per_iteration": 2.5471577644348145 + }, + { + "auxiliary_loss_clip": 0.01169902, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03077149, + "balance_loss_mlp": 1.04996848, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.5831304278494804, + "language_loss": 0.9288674, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9510712, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1953125, + "step": 1619, + "time_per_iteration": 2.515282392501831 + }, + { + "auxiliary_loss_clip": 0.01171299, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.03150594, + "balance_loss_mlp": 1.05216622, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.7974961209450113, + "language_loss": 0.88785303, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.910092, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1620, + "time_per_iteration": 2.556744337081909 + }, + { + "auxiliary_loss_clip": 0.01175309, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.05045033, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.90931759761679, + "language_loss": 0.77130795, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79362905, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.25, + "step": 1621, + "time_per_iteration": 2.491441011428833 + }, + { + "auxiliary_loss_clip": 0.01171563, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.04859447, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9170880538391684, + "language_loss": 0.77856946, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80084509, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2265625, + "step": 1622, + "time_per_iteration": 2.4379701614379883 + }, + { + "auxiliary_loss_clip": 0.01177415, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.05105746, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.9420709042223125, + "language_loss": 0.85783195, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88017344, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1623, + "time_per_iteration": 2.51741099357605 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.03498316, + "balance_loss_mlp": 1.05181813, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2833168401589656, + "language_loss": 0.80328369, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8255735, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1624, + "time_per_iteration": 2.4646618366241455 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.02450192, + "balance_loss_mlp": 1.04799926, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.7577002662180954, + "language_loss": 0.8575626, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87971735, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.234375, + "step": 1625, + "time_per_iteration": 2.452615976333618 + }, + { + "auxiliary_loss_clip": 0.01176501, + "auxiliary_loss_mlp": 0.01055325, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.05226421, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 3.258883448957912, + "language_loss": 0.8539601, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87627834, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1626, + "time_per_iteration": 2.4931013584136963 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.05541551, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.979888643217431, + "language_loss": 0.83329904, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85568601, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2578125, + "step": 1627, + "time_per_iteration": 2.5056917667388916 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.03065729, + "balance_loss_mlp": 1.0488416, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7873285490487296, + "language_loss": 0.84291327, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.2109375, + "step": 1628, + "time_per_iteration": 2.4835076332092285 + }, + { + "auxiliary_loss_clip": 0.01169153, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.04880238, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.6092149858605884, + "language_loss": 0.75609362, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77831334, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1629, + "time_per_iteration": 2.4959983825683594 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.02412319, + "balance_loss_mlp": 1.0530107, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.5982247062153871, + "language_loss": 0.78224194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1630, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.01177321, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0292381, + "balance_loss_mlp": 1.05457997, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.9134334701620013, + "language_loss": 0.86704385, + "learning_rate": 3.951604717916228e-06, + "loss": 0.8893311, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1631, + "time_per_iteration": 2.443878173828125 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01050929, + "balance_loss_clip": 1.03065109, + "balance_loss_mlp": 1.05258322, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.096430969489036, + "language_loss": 0.83111286, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85334921, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1632, + "time_per_iteration": 2.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01174956, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0368464, + "balance_loss_mlp": 1.05281615, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.5107232822128822, + "language_loss": 0.7877655, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81008065, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.21875, + "step": 1633, + "time_per_iteration": 2.447930097579956 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.02833819, + "balance_loss_mlp": 1.04989707, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.0663591821232865, + "language_loss": 0.73159611, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75378191, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1634, + "time_per_iteration": 2.460265636444092 + }, + { + "auxiliary_loss_clip": 0.01179893, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.0516957, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7516342600991868, + "language_loss": 0.72714394, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74957043, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1635, + "time_per_iteration": 2.4835710525512695 + }, + { + "auxiliary_loss_clip": 0.01177592, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.05253148, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8458745824258636, + "language_loss": 0.7819975, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80432636, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.25, + "step": 1636, + "time_per_iteration": 2.53061842918396 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.05113721, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.2976115041381386, + "language_loss": 0.70005965, + "learning_rate": 3.951092440828715e-06, + "loss": 0.722363, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1637, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.03175139, + "balance_loss_mlp": 1.05108416, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.115587702667026, + "language_loss": 0.77395654, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79622668, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2265625, + "step": 1638, + "time_per_iteration": 2.4725139141082764 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.02524579, + "balance_loss_mlp": 1.05077171, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4162008179950134, + "language_loss": 0.7263118, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74847507, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1953125, + "step": 1639, + "time_per_iteration": 2.5534512996673584 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.01943696, + "balance_loss_mlp": 1.05003214, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8280373897837945, + "language_loss": 0.88669002, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90882927, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1640, + "time_per_iteration": 2.4868786334991455 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01685774, + "balance_loss_mlp": 1.05164635, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.1859335509376527, + "language_loss": 0.8086108, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83072555, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1641, + "time_per_iteration": 2.5081584453582764 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03130805, + "balance_loss_mlp": 1.05067503, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.4983515693134417, + "language_loss": 0.85826755, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88054669, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1642, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01177694, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.05365527, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7421144196917664, + "language_loss": 0.80859929, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83091342, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1643, + "time_per_iteration": 3.9550716876983643 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01063125, + "balance_loss_clip": 1.04138088, + "balance_loss_mlp": 1.0494256, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9624417465121429, + "language_loss": 0.8262763, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84861231, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1644, + "time_per_iteration": 3.8253817558288574 + }, + { + "auxiliary_loss_clip": 0.01169448, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.02733469, + "balance_loss_mlp": 1.05048347, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7099323885745632, + "language_loss": 0.6819675, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70414758, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1645, + "time_per_iteration": 2.4549567699432373 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.0206517, + "balance_loss_mlp": 1.01924491, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9514884974425206, + "language_loss": 0.60854232, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62943053, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.46289062, + "step": 1646, + "time_per_iteration": 2.9953765869140625 + }, + { + "auxiliary_loss_clip": 0.01170253, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04880357, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.5496486678231425, + "language_loss": 0.73046064, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75266314, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2109375, + "step": 1647, + "time_per_iteration": 2.5241641998291016 + }, + { + "auxiliary_loss_clip": 0.01171762, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.04955053, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8237647662791463, + "language_loss": 0.84120429, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86348635, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.21875, + "step": 1648, + "time_per_iteration": 2.467717170715332 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01009658, + "balance_loss_clip": 1.00701165, + "balance_loss_mlp": 1.0159142, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7437092318732932, + "language_loss": 0.55674303, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57745123, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.453125, + "step": 1649, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01165781, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.02598572, + "balance_loss_mlp": 1.04597533, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.284847215884091, + "language_loss": 0.89930248, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92142689, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1650, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00105858, + "balance_loss_mlp": 1.01395106, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8031298543824162, + "language_loss": 0.63733649, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65795547, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.44140625, + "step": 1651, + "time_per_iteration": 3.217806100845337 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03318655, + "balance_loss_mlp": 1.04885435, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9462006377707899, + "language_loss": 0.88288587, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90512443, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1652, + "time_per_iteration": 2.5014448165893555 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01057611, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.05190849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9500387106757973, + "language_loss": 0.82206833, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84438825, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2265625, + "step": 1653, + "time_per_iteration": 2.4881839752197266 + }, + { + "auxiliary_loss_clip": 0.01172582, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.04984093, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0314065071494136, + "language_loss": 0.79399735, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81626815, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2265625, + "step": 1654, + "time_per_iteration": 2.5269205570220947 + }, + { + "auxiliary_loss_clip": 0.01167439, + "auxiliary_loss_mlp": 0.01055854, + "balance_loss_clip": 1.03700721, + "balance_loss_mlp": 1.05072093, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5637423809135174, + "language_loss": 0.8088094, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83104229, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.171875, + "step": 1655, + "time_per_iteration": 2.4652602672576904 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.04891443, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9082198159511756, + "language_loss": 0.80947387, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83170521, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1656, + "time_per_iteration": 2.4966416358947754 + }, + { + "auxiliary_loss_clip": 0.01170477, + "auxiliary_loss_mlp": 0.0106116, + "balance_loss_clip": 1.04066813, + "balance_loss_mlp": 1.05147541, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6268850155063674, + "language_loss": 0.88850212, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91081852, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1875, + "step": 1657, + "time_per_iteration": 2.446124792098999 + }, + { + "auxiliary_loss_clip": 0.01175951, + "auxiliary_loss_mlp": 0.01063236, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.05091214, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0057694643168302, + "language_loss": 0.84758937, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86998123, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1658, + "time_per_iteration": 2.457902669906616 + }, + { + "auxiliary_loss_clip": 0.01054631, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_clip": 1.07460773, + "balance_loss_mlp": 1.0110395, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9153195332104517, + "language_loss": 0.60843968, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62975848, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1659, + "time_per_iteration": 3.077805519104004 + }, + { + "auxiliary_loss_clip": 0.01170517, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.04999721, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8691655756599186, + "language_loss": 0.85116851, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87340325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2109375, + "step": 1660, + "time_per_iteration": 2.49082612991333 + }, + { + "auxiliary_loss_clip": 0.01171003, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.03393948, + "balance_loss_mlp": 1.05291247, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.130922035700174, + "language_loss": 0.80037123, + "learning_rate": 3.949016704705836e-06, + "loss": 0.8226431, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1796875, + "step": 1661, + "time_per_iteration": 2.4412636756896973 + }, + { + "auxiliary_loss_clip": 0.01175671, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.05002224, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.8939661728963775, + "language_loss": 0.83592767, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85818553, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2578125, + "step": 1662, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01171098, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.02972281, + "balance_loss_mlp": 1.05104828, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1063962968477, + "language_loss": 0.88696563, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90920055, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1663, + "time_per_iteration": 2.42790150642395 + }, + { + "auxiliary_loss_clip": 0.01174901, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.05225635, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6888490247303796, + "language_loss": 0.7034179, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72569644, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1664, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01173831, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.02903676, + "balance_loss_mlp": 1.0535655, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.1773983349048804, + "language_loss": 0.7878316, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81007671, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1665, + "time_per_iteration": 2.4271252155303955 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.01061559, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.05681181, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.663243771388797, + "language_loss": 0.70152062, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72392094, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.21875, + "step": 1666, + "time_per_iteration": 2.499131202697754 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03777063, + "balance_loss_mlp": 1.0506525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8121915129470096, + "language_loss": 0.791031, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8133781, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.234375, + "step": 1667, + "time_per_iteration": 2.4429264068603516 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.02810836, + "balance_loss_mlp": 1.05261493, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.9507555712476945, + "language_loss": 0.7715596, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79379785, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.203125, + "step": 1668, + "time_per_iteration": 2.5223031044006348 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.05256963, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.9809152554972944, + "language_loss": 0.77852714, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80083561, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2109375, + "step": 1669, + "time_per_iteration": 2.5082881450653076 + }, + { + "auxiliary_loss_clip": 0.01181618, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.03641593, + "balance_loss_mlp": 1.05464602, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.145889566444559, + "language_loss": 0.85461181, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87702769, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.2734375, + "step": 1670, + "time_per_iteration": 2.5235135555267334 + }, + { + "auxiliary_loss_clip": 0.01166248, + "auxiliary_loss_mlp": 0.01057789, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0501771, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5986093935623644, + "language_loss": 0.76899171, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79123211, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1671, + "time_per_iteration": 2.505441665649414 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.01598763, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7900846916321359, + "language_loss": 0.60719293, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62802076, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.43945312, + "step": 1672, + "time_per_iteration": 3.07255482673645 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.05045998, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.0407855091156377, + "language_loss": 0.77119517, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.234375, + "step": 1673, + "time_per_iteration": 2.4693222045898438 + }, + { + "auxiliary_loss_clip": 0.01171478, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.03066778, + "balance_loss_mlp": 1.04964709, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.2570599367002835, + "language_loss": 0.72829556, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75053144, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1674, + "time_per_iteration": 2.4534130096435547 + }, + { + "auxiliary_loss_clip": 0.01170516, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03047729, + "balance_loss_mlp": 1.04903197, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.043409325490185, + "language_loss": 0.79386973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81608635, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1675, + "time_per_iteration": 2.496504545211792 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_clip": 1.04449606, + "balance_loss_mlp": 1.04908013, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.0305638084579294, + "language_loss": 0.81565315, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1676, + "time_per_iteration": 2.5022919178009033 + }, + { + "auxiliary_loss_clip": 0.01174395, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.03713369, + "balance_loss_mlp": 1.05283856, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.134524944411931, + "language_loss": 0.86155027, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88388026, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2109375, + "step": 1677, + "time_per_iteration": 2.44887113571167 + }, + { + "auxiliary_loss_clip": 0.01171962, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.03452563, + "balance_loss_mlp": 1.05113602, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 5.349815535910457, + "language_loss": 0.86318195, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88545489, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2109375, + "step": 1678, + "time_per_iteration": 2.4373903274536133 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.03359675, + "balance_loss_mlp": 1.05214512, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.6897314721028867, + "language_loss": 0.89726269, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91953766, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1679, + "time_per_iteration": 2.493959903717041 + }, + { + "auxiliary_loss_clip": 0.01056795, + "auxiliary_loss_mlp": 0.01017317, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01327634, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7831657514235874, + "language_loss": 0.53018153, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55092263, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1680, + "time_per_iteration": 3.15899658203125 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.04983318, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.657625192327098, + "language_loss": 0.76889706, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79113436, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1681, + "time_per_iteration": 2.446937322616577 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.03423131, + "balance_loss_mlp": 1.04937744, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.135292201068385, + "language_loss": 0.93928307, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96162128, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.2578125, + "step": 1682, + "time_per_iteration": 2.4357759952545166 + }, + { + "auxiliary_loss_clip": 0.01172101, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.03315091, + "balance_loss_mlp": 1.05045152, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 5.112669241194533, + "language_loss": 0.87866408, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90092492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1683, + "time_per_iteration": 2.427802562713623 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.03888798, + "balance_loss_mlp": 1.05144525, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7718228637860187, + "language_loss": 0.74768114, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76997328, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1796875, + "step": 1684, + "time_per_iteration": 5.332470417022705 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01011499, + "balance_loss_clip": 1.00863802, + "balance_loss_mlp": 1.01624751, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.760003339390084, + "language_loss": 0.61090153, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6316117, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.43359375, + "step": 1685, + "time_per_iteration": 4.508171081542969 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.02769828, + "balance_loss_mlp": 1.04891801, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.3224629698824075, + "language_loss": 0.61664945, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63883317, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1686, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01173787, + "auxiliary_loss_mlp": 0.01054454, + "balance_loss_clip": 1.03238797, + "balance_loss_mlp": 1.0545882, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1992592502117443, + "language_loss": 0.81408226, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83636469, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1687, + "time_per_iteration": 2.5495810508728027 + }, + { + "auxiliary_loss_clip": 0.01173812, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.03055501, + "balance_loss_mlp": 1.0514555, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.783489688966995, + "language_loss": 0.72360015, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74585676, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1688, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.05043888, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9893327907397977, + "language_loss": 0.86880058, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8910439, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1875, + "step": 1689, + "time_per_iteration": 2.5283408164978027 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.04692245, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8972643802531153, + "language_loss": 0.88054395, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90265882, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1690, + "time_per_iteration": 2.5732247829437256 + }, + { + "auxiliary_loss_clip": 0.01170509, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.02961624, + "balance_loss_mlp": 1.04965854, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.8841763324380914, + "language_loss": 0.83124495, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85346603, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.203125, + "step": 1691, + "time_per_iteration": 2.453263282775879 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.028579, + "balance_loss_mlp": 1.05049825, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.648035623213742, + "language_loss": 0.66938514, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69161713, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1692, + "time_per_iteration": 2.5865867137908936 + }, + { + "auxiliary_loss_clip": 0.01167535, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_clip": 1.04540372, + "balance_loss_mlp": 1.0471102, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.649284734670808, + "language_loss": 0.75387824, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77622634, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1693, + "time_per_iteration": 2.499476194381714 + }, + { + "auxiliary_loss_clip": 0.01171507, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02685595, + "balance_loss_mlp": 1.04984784, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6930931596653784, + "language_loss": 0.87206519, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89427543, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1694, + "time_per_iteration": 2.483264923095703 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01052768, + "balance_loss_clip": 1.03013015, + "balance_loss_mlp": 1.05056214, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 3.1999162319303274, + "language_loss": 0.79579329, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81809288, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1695, + "time_per_iteration": 2.4574177265167236 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.04648614, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7546035908378184, + "language_loss": 0.86581397, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88805294, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1696, + "time_per_iteration": 2.4986772537231445 + }, + { + "auxiliary_loss_clip": 0.01168623, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.04927731, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.97196247739744, + "language_loss": 0.82034266, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84259629, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1953125, + "step": 1697, + "time_per_iteration": 2.483682155609131 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.0477041, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 1.9483747561194416, + "language_loss": 0.80650747, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82870358, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2265625, + "step": 1698, + "time_per_iteration": 2.4512858390808105 + }, + { + "auxiliary_loss_clip": 0.01172882, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.05113077, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 4.641294823605382, + "language_loss": 0.75680709, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77902329, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1699, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.01171952, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.02606726, + "balance_loss_mlp": 1.05093145, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.7623204527071121, + "language_loss": 0.79777479, + "learning_rate": 3.945552859553516e-06, + "loss": 0.81997555, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 1700, + "time_per_iteration": 2.4692423343658447 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.02411532, + "balance_loss_mlp": 1.04850125, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8827887870563835, + "language_loss": 0.76854098, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79070842, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1701, + "time_per_iteration": 2.5015852451324463 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 1.05213511, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1180628790190927, + "language_loss": 0.78123891, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80349147, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2421875, + "step": 1702, + "time_per_iteration": 2.4999852180480957 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01046251, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.0487566, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.3091523831758765, + "language_loss": 0.94838184, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97052652, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1953125, + "step": 1703, + "time_per_iteration": 2.4586100578308105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.01077867, + "balance_loss_mlp": 1.01462317, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8800585598511617, + "language_loss": 0.55092424, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57163775, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43554688, + "step": 1704, + "time_per_iteration": 2.998384952545166 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.02790844, + "balance_loss_mlp": 1.04962945, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 3.5257555777633174, + "language_loss": 0.83979154, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86200017, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2109375, + "step": 1705, + "time_per_iteration": 2.4242281913757324 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.01514411, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7733309182053202, + "language_loss": 0.60434854, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62497854, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.4296875, + "step": 1706, + "time_per_iteration": 3.127495765686035 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 1.05214357, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.0444921886168284, + "language_loss": 0.85967243, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.234375, + "step": 1707, + "time_per_iteration": 2.4486777782440186 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02654099, + "balance_loss_mlp": 1.04891372, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.6368034329364625, + "language_loss": 0.72840983, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75057685, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.2109375, + "step": 1708, + "time_per_iteration": 2.5019850730895996 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01054619, + "balance_loss_clip": 1.0325532, + "balance_loss_mlp": 1.0493356, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.9016884094819633, + "language_loss": 0.90944314, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93167639, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1953125, + "step": 1709, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01173643, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03357422, + "balance_loss_mlp": 1.05296373, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 3.826538703219267, + "language_loss": 0.8828221, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90510881, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1710, + "time_per_iteration": 2.533165216445923 + }, + { + "auxiliary_loss_clip": 0.01167248, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.04937959, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.824520485293549, + "language_loss": 0.79264998, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485879, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 1711, + "time_per_iteration": 2.4947102069854736 + }, + { + "auxiliary_loss_clip": 0.01171963, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.05005431, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.0689984646996016, + "language_loss": 0.73589319, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7581948, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1712, + "time_per_iteration": 2.521899461746216 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.04961872, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8460865361447714, + "language_loss": 0.86673403, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8889854, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1640625, + "step": 1713, + "time_per_iteration": 2.467824935913086 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01059926, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.04741335, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.6691144860495126, + "language_loss": 0.72610664, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74837124, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1714, + "time_per_iteration": 2.478605031967163 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03542209, + "balance_loss_mlp": 1.04920006, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.3323118637090605, + "language_loss": 0.91395295, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93626636, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2421875, + "step": 1715, + "time_per_iteration": 2.5223729610443115 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04737377, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.9302110224144968, + "language_loss": 0.75736755, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77957708, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1716, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.01171415, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.04601645, + "balance_loss_mlp": 1.04868793, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1161503252482747, + "language_loss": 0.85214567, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87454176, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1717, + "time_per_iteration": 2.500964879989624 + }, + { + "auxiliary_loss_clip": 0.01169937, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.05102515, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.0308520014155746, + "language_loss": 0.82883167, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85109091, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1875, + "step": 1718, + "time_per_iteration": 2.436836004257202 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.03058898, + "balance_loss_mlp": 1.05092025, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8725763890619624, + "language_loss": 0.73192763, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75414634, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1719, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01172065, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05197001, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.356604748076592, + "language_loss": 0.92601806, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94820189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.203125, + "step": 1720, + "time_per_iteration": 2.4628992080688477 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.04656935, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.8075298743139174, + "language_loss": 0.79416633, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81638062, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2109375, + "step": 1721, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01056525, + "balance_loss_clip": 1.03317165, + "balance_loss_mlp": 1.05172479, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.496468299898097, + "language_loss": 0.80755401, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82988858, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.25, + "step": 1722, + "time_per_iteration": 2.4676520824432373 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.04772782, + "balance_loss_mlp": 1.013726, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9564367479099696, + "language_loss": 0.67185652, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69292337, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.4296875, + "step": 1723, + "time_per_iteration": 2.8474721908569336 + }, + { + "auxiliary_loss_clip": 0.01170693, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.04747462, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.780632359822339, + "language_loss": 0.77922273, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1724, + "time_per_iteration": 2.4311840534210205 + }, + { + "auxiliary_loss_clip": 0.01175556, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.05101144, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8180629527722856, + "language_loss": 0.74894094, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77122545, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1725, + "time_per_iteration": 2.6802284717559814 + }, + { + "auxiliary_loss_clip": 0.01170353, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.02852905, + "balance_loss_mlp": 1.05098462, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.4392097975248244, + "language_loss": 0.75290418, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77510113, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1726, + "time_per_iteration": 5.461729049682617 + }, + { + "auxiliary_loss_clip": 0.01174745, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03765321, + "balance_loss_mlp": 1.0527426, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8824890959349092, + "language_loss": 0.73943913, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76178271, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1727, + "time_per_iteration": 3.883134126663208 + }, + { + "auxiliary_loss_clip": 0.01169505, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03023946, + "balance_loss_mlp": 1.04815936, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.187385195417556, + "language_loss": 0.84670323, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86891311, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1728, + "time_per_iteration": 2.4405598640441895 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.02980709, + "balance_loss_mlp": 1.05098438, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4528097766615677, + "language_loss": 0.70985407, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73207992, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1729, + "time_per_iteration": 2.465688467025757 + }, + { + "auxiliary_loss_clip": 0.01170997, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.05000722, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5788681057232625, + "language_loss": 0.81288344, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.8351925, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1730, + "time_per_iteration": 2.4582717418670654 + }, + { + "auxiliary_loss_clip": 0.01167657, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.03190255, + "balance_loss_mlp": 1.04836845, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 2.1021084439253723, + "language_loss": 0.75932384, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78151548, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1953125, + "step": 1731, + "time_per_iteration": 2.4650096893310547 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.04899907, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8082651510271561, + "language_loss": 0.82679468, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84891117, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1732, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01169252, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.02881873, + "balance_loss_mlp": 1.05052853, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.755876599624297, + "language_loss": 0.82947195, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85164732, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1733, + "time_per_iteration": 2.4426257610321045 + }, + { + "auxiliary_loss_clip": 0.01171007, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.04982805, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4971959439308336, + "language_loss": 0.76446331, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78669679, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.2109375, + "step": 1734, + "time_per_iteration": 2.4556663036346436 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.02795696, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9861442095390862, + "language_loss": 0.74962163, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1796875, + "step": 1735, + "time_per_iteration": 2.4961798191070557 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.02724743, + "balance_loss_mlp": 1.05081487, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9829662552727403, + "language_loss": 0.79049939, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8127073, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1736, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.0491184, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8105684861006923, + "language_loss": 0.70339012, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72563159, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.171875, + "step": 1737, + "time_per_iteration": 2.4789419174194336 + }, + { + "auxiliary_loss_clip": 0.01170601, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.02758932, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.004598680960266, + "language_loss": 0.81483257, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83704984, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.234375, + "step": 1738, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.0116919, + "auxiliary_loss_mlp": 0.01058357, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.04712963, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 4.442978598454381, + "language_loss": 0.750579, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77285445, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1739, + "time_per_iteration": 2.4544031620025635 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.0349865, + "balance_loss_mlp": 1.04893625, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.676051388115223, + "language_loss": 0.77279431, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79503429, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1740, + "time_per_iteration": 2.489302635192871 + }, + { + "auxiliary_loss_clip": 0.01169756, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.02820003, + "balance_loss_mlp": 1.05093944, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.1911967502326775, + "language_loss": 0.85983682, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88201964, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1741, + "time_per_iteration": 2.4571211338043213 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03091609, + "balance_loss_mlp": 1.04901385, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 4.086245960730198, + "language_loss": 0.74991679, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77216244, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1742, + "time_per_iteration": 2.4919426441192627 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_clip": 1.03914368, + "balance_loss_mlp": 1.05323386, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9550995481311175, + "language_loss": 0.87150526, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89381945, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1743, + "time_per_iteration": 2.470841884613037 + }, + { + "auxiliary_loss_clip": 0.01168292, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.02760363, + "balance_loss_mlp": 1.04964471, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.1176645115958923, + "language_loss": 0.75532508, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77750671, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1744, + "time_per_iteration": 2.4725873470306396 + }, + { + "auxiliary_loss_clip": 0.01171079, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.05184436, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.151699961275852, + "language_loss": 0.79306591, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530583, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1745, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_clip": 1.04194999, + "balance_loss_mlp": 1.047683, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.087314316255438, + "language_loss": 0.82382894, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8461262, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1746, + "time_per_iteration": 2.520306348800659 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01062461, + "balance_loss_clip": 1.04186153, + "balance_loss_mlp": 1.05198646, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.645771273172373, + "language_loss": 0.69951761, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7218436, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1747, + "time_per_iteration": 2.618581771850586 + }, + { + "auxiliary_loss_clip": 0.01176288, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04140496, + "balance_loss_mlp": 1.05136323, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.3385484358742192, + "language_loss": 0.84245849, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86484385, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1748, + "time_per_iteration": 2.539386034011841 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.02797103, + "balance_loss_mlp": 1.04729426, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.8953667439120294, + "language_loss": 0.71491921, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7370674, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1796875, + "step": 1749, + "time_per_iteration": 2.481700897216797 + }, + { + "auxiliary_loss_clip": 0.01166695, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.04953468, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.9760411129591238, + "language_loss": 0.81960011, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84187424, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1750, + "time_per_iteration": 2.4454832077026367 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.05259562, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3402404294313524, + "language_loss": 0.91871023, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94105875, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1751, + "time_per_iteration": 2.416607141494751 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.02698207, + "balance_loss_mlp": 1.04889047, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.265296057434122, + "language_loss": 0.79560149, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81774485, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1796875, + "step": 1752, + "time_per_iteration": 2.46063494682312 + }, + { + "auxiliary_loss_clip": 0.01167711, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.03202033, + "balance_loss_mlp": 1.05050862, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.1401152378303867, + "language_loss": 0.75782037, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78002656, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1753, + "time_per_iteration": 2.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01172527, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.04939532, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0790136174876546, + "language_loss": 0.84048498, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86278164, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.234375, + "step": 1754, + "time_per_iteration": 2.4683756828308105 + }, + { + "auxiliary_loss_clip": 0.01175207, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.05438888, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8736094439376645, + "language_loss": 0.68956709, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71185535, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1755, + "time_per_iteration": 2.45597243309021 + }, + { + "auxiliary_loss_clip": 0.01172827, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.03989983, + "balance_loss_mlp": 1.05102587, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 5.502613786824721, + "language_loss": 0.76718754, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78953344, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1756, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.03712726, + "balance_loss_mlp": 1.04982626, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7784869470084927, + "language_loss": 0.80162531, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82392669, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1757, + "time_per_iteration": 2.4551706314086914 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.03499317, + "balance_loss_mlp": 1.05132246, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.631431596421375, + "language_loss": 0.78800333, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81028521, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1758, + "time_per_iteration": 2.7955896854400635 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_clip": 1.02865982, + "balance_loss_mlp": 1.05364573, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.609159841262955, + "language_loss": 0.9189958, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94127536, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.234375, + "step": 1759, + "time_per_iteration": 2.4853782653808594 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.03276575, + "balance_loss_mlp": 1.04970741, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.498568213886603, + "language_loss": 0.76932353, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79161119, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.234375, + "step": 1760, + "time_per_iteration": 2.470705509185791 + }, + { + "auxiliary_loss_clip": 0.01173982, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.03825736, + "balance_loss_mlp": 1.05152941, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.349800445259612, + "language_loss": 0.89282435, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91517675, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1761, + "time_per_iteration": 2.491501569747925 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0518589, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.4794664397863877, + "language_loss": 0.78304708, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80538261, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1762, + "time_per_iteration": 2.5563831329345703 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01014008, + "balance_loss_clip": 1.0110991, + "balance_loss_mlp": 1.02000487, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.753444103392694, + "language_loss": 0.60481733, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62557811, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.421875, + "step": 1763, + "time_per_iteration": 3.2239294052124023 + }, + { + "auxiliary_loss_clip": 0.01170891, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02777529, + "balance_loss_mlp": 1.04924011, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.054980370260194, + "language_loss": 0.8010751, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82327372, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1764, + "time_per_iteration": 2.476325273513794 + }, + { + "auxiliary_loss_clip": 0.01169028, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.02745855, + "balance_loss_mlp": 1.04961264, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7621956234955212, + "language_loss": 0.7999962, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82217997, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1765, + "time_per_iteration": 2.446593999862671 + }, + { + "auxiliary_loss_clip": 0.01167126, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.03394008, + "balance_loss_mlp": 1.04794002, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.867239621884004, + "language_loss": 0.76693732, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78915727, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1766, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01170332, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.05017042, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6580981789618001, + "language_loss": 0.77319431, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1767, + "time_per_iteration": 2.542797088623047 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01000453, + "balance_loss_clip": 0.99785471, + "balance_loss_mlp": 1.01804066, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6789245534488961, + "language_loss": 0.57902765, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59963286, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.421875, + "step": 1768, + "time_per_iteration": 6.071596384048462 + }, + { + "auxiliary_loss_clip": 0.01172748, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.0219171, + "balance_loss_mlp": 1.05201912, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.446404125156032, + "language_loss": 0.86796767, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89011335, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1769, + "time_per_iteration": 2.5106868743896484 + }, + { + "auxiliary_loss_clip": 0.01175908, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.05300689, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.766851816283336, + "language_loss": 0.61890501, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64123213, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1770, + "time_per_iteration": 2.5770323276519775 + }, + { + "auxiliary_loss_clip": 0.01061292, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00058925, + "balance_loss_mlp": 1.01873469, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8864779346546747, + "language_loss": 0.57095039, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59159505, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.42578125, + "step": 1771, + "time_per_iteration": 2.957993507385254 + }, + { + "auxiliary_loss_clip": 0.01174087, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.05443954, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6398085638646198, + "language_loss": 0.88530469, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90767658, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1772, + "time_per_iteration": 2.520744562149048 + }, + { + "auxiliary_loss_clip": 0.01176768, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.05091381, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.8236986107629094, + "language_loss": 0.76021719, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78262091, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.2578125, + "step": 1773, + "time_per_iteration": 2.4228129386901855 + }, + { + "auxiliary_loss_clip": 0.01171647, + "auxiliary_loss_mlp": 0.01063224, + "balance_loss_clip": 1.04087257, + "balance_loss_mlp": 1.05147731, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1931291175477177, + "language_loss": 0.83184093, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85418963, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1774, + "time_per_iteration": 2.5613787174224854 + }, + { + "auxiliary_loss_clip": 0.01177598, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.05220413, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.683505024819064, + "language_loss": 0.76297373, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78529418, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.25, + "step": 1775, + "time_per_iteration": 2.437676429748535 + }, + { + "auxiliary_loss_clip": 0.01057587, + "auxiliary_loss_mlp": 0.01006639, + "balance_loss_clip": 1.00413537, + "balance_loss_mlp": 1.01520467, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8253045983972309, + "language_loss": 0.57443953, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59508181, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.42382812, + "step": 1776, + "time_per_iteration": 3.101378917694092 + }, + { + "auxiliary_loss_clip": 0.01176962, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.0365653, + "balance_loss_mlp": 1.05411029, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6481869723516467, + "language_loss": 0.83374244, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8561098, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2265625, + "step": 1777, + "time_per_iteration": 2.5109002590179443 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.03371584, + "balance_loss_mlp": 1.05298579, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.6420984425067013, + "language_loss": 0.87275863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501154, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1778, + "time_per_iteration": 2.503103494644165 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.032354, + "balance_loss_mlp": 1.05328, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.213225731734914, + "language_loss": 0.83970487, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86199337, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1779, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01169562, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.03086162, + "balance_loss_mlp": 1.04975557, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.4959309518827655, + "language_loss": 0.67064941, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69286621, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1780, + "time_per_iteration": 2.447756052017212 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046888, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.05183458, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.25546613947904, + "language_loss": 0.91667759, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93886495, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1781, + "time_per_iteration": 2.4367144107818604 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01046071, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 1.05302, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.202402738572802, + "language_loss": 0.79505372, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81726873, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2265625, + "step": 1782, + "time_per_iteration": 2.4340877532958984 + }, + { + "auxiliary_loss_clip": 0.01175468, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.03055024, + "balance_loss_mlp": 1.0517509, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0445491568240994, + "language_loss": 0.78994977, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81222689, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.234375, + "step": 1783, + "time_per_iteration": 2.434527635574341 + }, + { + "auxiliary_loss_clip": 0.01176375, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.02991986, + "balance_loss_mlp": 1.0529108, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8050343336808015, + "language_loss": 0.85956216, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88184798, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1784, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.01174134, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.05080986, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.0774406347184806, + "language_loss": 1.00899053, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03127265, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.234375, + "step": 1785, + "time_per_iteration": 2.46663498878479 + }, + { + "auxiliary_loss_clip": 0.01171119, + "auxiliary_loss_mlp": 0.01057254, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05306709, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4495948735276882, + "language_loss": 0.85070992, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87299371, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1796875, + "step": 1786, + "time_per_iteration": 2.505018949508667 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.04750311, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8788886178726656, + "language_loss": 0.78817046, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81046188, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1787, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01176938, + "auxiliary_loss_mlp": 0.01049821, + "balance_loss_clip": 1.02768385, + "balance_loss_mlp": 1.0517112, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.551869220071384, + "language_loss": 0.82557851, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84784609, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.25, + "step": 1788, + "time_per_iteration": 2.4807305335998535 + }, + { + "auxiliary_loss_clip": 0.01170019, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.04939878, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.778852512980128, + "language_loss": 0.77794182, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80027628, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1789, + "time_per_iteration": 2.482330322265625 + }, + { + "auxiliary_loss_clip": 0.01173111, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.05133712, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.434124451319009, + "language_loss": 0.74467903, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76702261, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.21875, + "step": 1790, + "time_per_iteration": 2.5921239852905273 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.05428767, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.5839507236364554, + "language_loss": 0.78495383, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80745554, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.234375, + "step": 1791, + "time_per_iteration": 2.5242488384246826 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01053897, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.05112934, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.8605555947944812, + "language_loss": 0.70855284, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73076522, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1792, + "time_per_iteration": 2.5260751247406006 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.02330506, + "balance_loss_mlp": 1.05109024, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.973355145299492, + "language_loss": 0.76029646, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78251767, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1793, + "time_per_iteration": 2.5037007331848145 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.03793848, + "balance_loss_mlp": 1.0537113, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7251623627880495, + "language_loss": 0.85158944, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87391031, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1794, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01180393, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.02954292, + "balance_loss_mlp": 1.05342674, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9245153565321482, + "language_loss": 0.74914879, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77148265, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.265625, + "step": 1795, + "time_per_iteration": 2.486111879348755 + }, + { + "auxiliary_loss_clip": 0.0117609, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.03863525, + "balance_loss_mlp": 1.05227423, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.917857918230487, + "language_loss": 0.8116014, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83397192, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1796, + "time_per_iteration": 2.4357504844665527 + }, + { + "auxiliary_loss_clip": 0.01177296, + "auxiliary_loss_mlp": 0.01075942, + "balance_loss_clip": 1.05260134, + "balance_loss_mlp": 1.05476594, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.4043777768562293, + "language_loss": 0.73476732, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75729972, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1797, + "time_per_iteration": 2.477867841720581 + }, + { + "auxiliary_loss_clip": 0.01172695, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.03081274, + "balance_loss_mlp": 1.05260658, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 3.1892188654982396, + "language_loss": 0.81348622, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83572453, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1798, + "time_per_iteration": 2.5060064792633057 + }, + { + "auxiliary_loss_clip": 0.011719, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03075755, + "balance_loss_mlp": 1.0508821, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4195393058725623, + "language_loss": 0.85180116, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87405908, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2109375, + "step": 1799, + "time_per_iteration": 2.4546945095062256 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.04271412, + "balance_loss_mlp": 1.0546999, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.2474252534922265, + "language_loss": 0.77365196, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79602301, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.1796875, + "step": 1800, + "time_per_iteration": 2.4650769233703613 + }, + { + "auxiliary_loss_clip": 0.01168665, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.02443254, + "balance_loss_mlp": 1.05136347, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 2.2954016650766844, + "language_loss": 0.7287963, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7509284, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1801, + "time_per_iteration": 2.5045113563537598 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.05259442, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.8364602771794378, + "language_loss": 0.66427058, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68653458, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1802, + "time_per_iteration": 2.5547947883605957 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.05202222, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7898565484043845, + "language_loss": 0.8136133, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83590758, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1803, + "time_per_iteration": 2.4758658409118652 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.031106, + "balance_loss_mlp": 1.05126929, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 2.61974519761109, + "language_loss": 0.9122982, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93452168, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1875, + "step": 1804, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.01175328, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03031349, + "balance_loss_mlp": 1.05401301, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0091269076806078, + "language_loss": 0.7623654, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78464663, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1805, + "time_per_iteration": 2.5379836559295654 + }, + { + "auxiliary_loss_clip": 0.01172079, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.0535754, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8192828849331855, + "language_loss": 0.860416, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88261837, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1806, + "time_per_iteration": 2.5523955821990967 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01053704, + "balance_loss_clip": 1.03275895, + "balance_loss_mlp": 1.05068612, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 5.439462316727856, + "language_loss": 0.80572915, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82797557, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1807, + "time_per_iteration": 2.514390230178833 + }, + { + "auxiliary_loss_clip": 0.01171878, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.05415583, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7684897552837426, + "language_loss": 0.78731525, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80954707, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.171875, + "step": 1808, + "time_per_iteration": 2.5084331035614014 + }, + { + "auxiliary_loss_clip": 0.01176105, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.03090501, + "balance_loss_mlp": 1.05633223, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6609588216066864, + "language_loss": 0.78927523, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81155634, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1953125, + "step": 1809, + "time_per_iteration": 5.368049621582031 + }, + { + "auxiliary_loss_clip": 0.01171492, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.03318286, + "balance_loss_mlp": 1.05087388, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0370215842844197, + "language_loss": 0.8468523, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86910635, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1810, + "time_per_iteration": 3.904432535171509 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.03111291, + "balance_loss_mlp": 1.05665135, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.9531179942167565, + "language_loss": 0.63677633, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6591261, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.234375, + "step": 1811, + "time_per_iteration": 2.523650646209717 + }, + { + "auxiliary_loss_clip": 0.01171345, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.02737319, + "balance_loss_mlp": 1.05139136, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.279966127836369, + "language_loss": 0.74238914, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76459008, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1812, + "time_per_iteration": 2.5579042434692383 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.05391026, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.9314487748153213, + "language_loss": 0.72647583, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74868566, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1875, + "step": 1813, + "time_per_iteration": 2.488762617111206 + }, + { + "auxiliary_loss_clip": 0.01174675, + "auxiliary_loss_mlp": 0.01051455, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.05744648, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 10.097396236718186, + "language_loss": 0.82224226, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84450358, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1814, + "time_per_iteration": 2.495798349380493 + }, + { + "auxiliary_loss_clip": 0.01176897, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.05595291, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3551509805271422, + "language_loss": 0.84218144, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86452949, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2109375, + "step": 1815, + "time_per_iteration": 2.462663173675537 + }, + { + "auxiliary_loss_clip": 0.01175955, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.03054035, + "balance_loss_mlp": 1.05833483, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3954928768695027, + "language_loss": 0.71048725, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73277813, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.171875, + "step": 1816, + "time_per_iteration": 2.465953826904297 + }, + { + "auxiliary_loss_clip": 0.01178612, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.04061651, + "balance_loss_mlp": 1.056674, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0063973144433067, + "language_loss": 0.72811669, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75053406, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1817, + "time_per_iteration": 2.5323143005371094 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_clip": 1.03167605, + "balance_loss_mlp": 1.05709267, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.767365755633268, + "language_loss": 0.67279243, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6951232, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1818, + "time_per_iteration": 2.5450243949890137 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.04123521, + "balance_loss_mlp": 1.05534315, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.0802139312896744, + "language_loss": 0.72992313, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75232661, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1819, + "time_per_iteration": 2.487644910812378 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02994883, + "balance_loss_mlp": 1.06090236, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 7.240077427900601, + "language_loss": 0.73943537, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76175541, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.203125, + "step": 1820, + "time_per_iteration": 2.5064899921417236 + }, + { + "auxiliary_loss_clip": 0.01177081, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02765381, + "balance_loss_mlp": 1.05699766, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.1677198782015887, + "language_loss": 0.82586408, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84814322, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 1821, + "time_per_iteration": 2.4487218856811523 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.0280906, + "balance_loss_mlp": 1.05549288, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.4783722356243065, + "language_loss": 0.76171732, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78395414, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1822, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01175357, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.03070986, + "balance_loss_mlp": 1.05751145, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.9066217775511896, + "language_loss": 0.79275787, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81502879, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1796875, + "step": 1823, + "time_per_iteration": 2.5665249824523926 + }, + { + "auxiliary_loss_clip": 0.01176588, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.02583015, + "balance_loss_mlp": 1.05788529, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.7066251744315906, + "language_loss": 0.79424715, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81649172, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1824, + "time_per_iteration": 2.5238118171691895 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.05610347, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.2183246130345, + "language_loss": 0.87992203, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90220273, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1825, + "time_per_iteration": 2.48294734954834 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.05362988, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8404731426595848, + "language_loss": 0.76462233, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78689909, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1826, + "time_per_iteration": 2.6397035121917725 + }, + { + "auxiliary_loss_clip": 0.01066703, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 0.9983961, + "balance_loss_mlp": 1.02257371, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8361632453995619, + "language_loss": 0.54999328, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57067442, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.44140625, + "step": 1827, + "time_per_iteration": 3.065896511077881 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01003719, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.02098036, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7348311418426204, + "language_loss": 0.55346334, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57414544, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.43359375, + "step": 1828, + "time_per_iteration": 3.0850460529327393 + }, + { + "auxiliary_loss_clip": 0.01180205, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.05754089, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.992065013624077, + "language_loss": 0.84191215, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86435115, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2265625, + "step": 1829, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01057367, + "balance_loss_clip": 1.03348923, + "balance_loss_mlp": 1.05845475, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.654516298718269, + "language_loss": 0.8878119, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91019976, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2265625, + "step": 1830, + "time_per_iteration": 2.6912100315093994 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01017752, + "balance_loss_clip": 1.01497495, + "balance_loss_mlp": 1.01824236, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6883241829767079, + "language_loss": 0.55492055, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570827, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.42773438, + "step": 1831, + "time_per_iteration": 3.075678825378418 + }, + { + "auxiliary_loss_clip": 0.01183643, + "auxiliary_loss_mlp": 0.01072422, + "balance_loss_clip": 1.04829443, + "balance_loss_mlp": 1.05867732, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.054835171188452, + "language_loss": 0.90726995, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92983055, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.25, + "step": 1832, + "time_per_iteration": 2.5084948539733887 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01015171, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01603723, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7159549093535102, + "language_loss": 0.59889859, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61963969, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.4296875, + "step": 1833, + "time_per_iteration": 3.0748977661132812 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_clip": 1.0277946, + "balance_loss_mlp": 1.05353165, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6030857455850303, + "language_loss": 0.8095156, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83177137, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1834, + "time_per_iteration": 2.452131509780884 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.0302285, + "balance_loss_mlp": 1.05899858, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.5262438386564807, + "language_loss": 0.90514123, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9274807, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2109375, + "step": 1835, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01177237, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.03445005, + "balance_loss_mlp": 1.05625033, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.0785934228774003, + "language_loss": 0.63590646, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65826416, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2109375, + "step": 1836, + "time_per_iteration": 2.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01173672, + "auxiliary_loss_mlp": 0.01057372, + "balance_loss_clip": 1.03547311, + "balance_loss_mlp": 1.05388379, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9330421575083043, + "language_loss": 0.72814602, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75045645, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1837, + "time_per_iteration": 2.594910144805908 + }, + { + "auxiliary_loss_clip": 0.01179947, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.132041599419941, + "language_loss": 0.79049784, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81289005, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1838, + "time_per_iteration": 2.4922690391540527 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.05623114, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 4.130442583787946, + "language_loss": 0.71453696, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73690271, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1839, + "time_per_iteration": 2.5151031017303467 + }, + { + "auxiliary_loss_clip": 0.01183988, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.03082371, + "balance_loss_mlp": 1.05938148, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.064820600929851, + "language_loss": 0.79099703, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1840, + "time_per_iteration": 2.487116575241089 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01053863, + "balance_loss_clip": 1.03234553, + "balance_loss_mlp": 1.05329347, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 2.0204098875762293, + "language_loss": 0.87691998, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89917111, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1796875, + "step": 1841, + "time_per_iteration": 2.5141868591308594 + }, + { + "auxiliary_loss_clip": 0.01170262, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.03126192, + "balance_loss_mlp": 1.05365491, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.3404569451138535, + "language_loss": 0.90582979, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92806768, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1842, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.0117179, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.03080761, + "balance_loss_mlp": 1.05210185, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.171204868901281, + "language_loss": 0.85597986, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87821329, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1843, + "time_per_iteration": 2.4801278114318848 + }, + { + "auxiliary_loss_clip": 0.01174004, + "auxiliary_loss_mlp": 0.01060021, + "balance_loss_clip": 1.03645349, + "balance_loss_mlp": 1.05622911, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.47825888700324, + "language_loss": 0.7494424, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77178264, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1796875, + "step": 1844, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01173241, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.0304563, + "balance_loss_mlp": 1.05405343, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0036363505702433, + "language_loss": 0.75732028, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77959603, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.1953125, + "step": 1845, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03341389, + "balance_loss_mlp": 1.05351365, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.6129010657048202, + "language_loss": 0.76336479, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7856214, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.171875, + "step": 1846, + "time_per_iteration": 2.465045928955078 + }, + { + "auxiliary_loss_clip": 0.01175917, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.03262115, + "balance_loss_mlp": 1.05392015, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.9965527726637577, + "language_loss": 0.85611343, + "learning_rate": 3.931489981933584e-06, + "loss": 0.87841111, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1847, + "time_per_iteration": 2.4493908882141113 + }, + { + "auxiliary_loss_clip": 0.01174539, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03018796, + "balance_loss_mlp": 1.05326366, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.3740806549350086, + "language_loss": 0.76464605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78692293, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.2109375, + "step": 1848, + "time_per_iteration": 2.4647536277770996 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02754378, + "balance_loss_mlp": 1.05833888, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 2.0750561163348173, + "language_loss": 0.77849847, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8007198, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1849, + "time_per_iteration": 2.514777660369873 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.03496861, + "balance_loss_mlp": 1.05422294, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.6662643697478066, + "language_loss": 0.71315688, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73548102, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1850, + "time_per_iteration": 2.4420053958892822 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.05444217, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2721050151861912, + "language_loss": 0.81174368, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83405614, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 1851, + "time_per_iteration": 5.341679811477661 + }, + { + "auxiliary_loss_clip": 0.01173679, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.03126621, + "balance_loss_mlp": 1.05519962, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.240427658931177, + "language_loss": 0.88860446, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91085827, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1852, + "time_per_iteration": 3.8281352519989014 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.03514326, + "balance_loss_mlp": 1.05636191, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0685366180695848, + "language_loss": 0.72092974, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74327302, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1853, + "time_per_iteration": 2.4896738529205322 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 0.99923038, + "balance_loss_mlp": 1.0132798, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7783537669608381, + "language_loss": 0.53647029, + "learning_rate": 3.930780749680273e-06, + "loss": 0.5570485, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.42578125, + "step": 1854, + "time_per_iteration": 3.0189781188964844 + }, + { + "auxiliary_loss_clip": 0.01184355, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02937746, + "balance_loss_mlp": 1.057657, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.006296213399466, + "language_loss": 0.8394689, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.861835, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.265625, + "step": 1855, + "time_per_iteration": 2.4908485412597656 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.0106694, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.05353498, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2091175797191815, + "language_loss": 0.82098675, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84339261, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.203125, + "step": 1856, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02631092, + "balance_loss_mlp": 1.05662763, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.9605277294776, + "language_loss": 0.8305279, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85274535, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1857, + "time_per_iteration": 2.5205907821655273 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.0279119, + "balance_loss_mlp": 1.05195725, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3309612964817923, + "language_loss": 0.83037764, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85260725, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.21875, + "step": 1858, + "time_per_iteration": 2.4567432403564453 + }, + { + "auxiliary_loss_clip": 0.01175678, + "auxiliary_loss_mlp": 0.01062921, + "balance_loss_clip": 1.04205894, + "balance_loss_mlp": 1.05549788, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.004830650729854, + "language_loss": 0.91120583, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93359184, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1859, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.02983618, + "balance_loss_mlp": 1.05344319, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.4768392741235306, + "language_loss": 0.81709313, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83934522, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1860, + "time_per_iteration": 2.4747087955474854 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.0361197, + "balance_loss_mlp": 1.05388653, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.1256274007234937, + "language_loss": 0.75203162, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77430284, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1861, + "time_per_iteration": 2.4773240089416504 + }, + { + "auxiliary_loss_clip": 0.01169857, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.03318143, + "balance_loss_mlp": 1.05338371, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.0016824982414776, + "language_loss": 0.88759935, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90982509, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1640625, + "step": 1862, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01173358, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.05597067, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1858127473987015, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89302975, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 1863, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051873, + "balance_loss_clip": 1.0283289, + "balance_loss_mlp": 1.05463171, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0887108243102976, + "language_loss": 0.64630157, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66856015, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.1953125, + "step": 1864, + "time_per_iteration": 2.4843807220458984 + }, + { + "auxiliary_loss_clip": 0.01171142, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.03169096, + "balance_loss_mlp": 1.05504417, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0715232833306874, + "language_loss": 0.73895639, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76117796, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1865, + "time_per_iteration": 2.4509596824645996 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02928007, + "balance_loss_mlp": 1.05253589, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.190736679244475, + "language_loss": 0.84019023, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86240977, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 1866, + "time_per_iteration": 2.473715305328369 + }, + { + "auxiliary_loss_clip": 0.01169711, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.02737069, + "balance_loss_mlp": 1.05260134, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5419857436109028, + "language_loss": 0.81424987, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83643156, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1867, + "time_per_iteration": 2.5367391109466553 + }, + { + "auxiliary_loss_clip": 0.01172987, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.05594015, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.5308159777425976, + "language_loss": 0.86677599, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88905597, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1868, + "time_per_iteration": 2.5044100284576416 + }, + { + "auxiliary_loss_clip": 0.01172172, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.05724931, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.333499600894065, + "language_loss": 0.68059367, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70281279, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.1484375, + "step": 1869, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01061629, + "balance_loss_clip": 1.03969407, + "balance_loss_mlp": 1.05456114, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 2.049754856307833, + "language_loss": 0.7735095, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79589236, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1870, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.03177094, + "balance_loss_mlp": 1.05264199, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8085683914823212, + "language_loss": 0.75747174, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77974463, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1871, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.05448031, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.4822417703451265, + "language_loss": 0.81949306, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84170687, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.171875, + "step": 1872, + "time_per_iteration": 2.4984912872314453 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.05497694, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7250665555581937, + "language_loss": 0.83564019, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85789096, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1796875, + "step": 1873, + "time_per_iteration": 2.480172872543335 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.05352998, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.2103217259008985, + "language_loss": 0.91925669, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9415459, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1874, + "time_per_iteration": 2.5193865299224854 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.0528394, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5656160151577971, + "language_loss": 0.7534616, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.15625, + "step": 1875, + "time_per_iteration": 2.509000062942505 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.05498421, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.875753927893446, + "language_loss": 0.71727258, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73950088, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1328125, + "step": 1876, + "time_per_iteration": 2.5222911834716797 + }, + { + "auxiliary_loss_clip": 0.01170022, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.03036463, + "balance_loss_mlp": 1.05574679, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.408917627715415, + "language_loss": 0.76760256, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78981495, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 1877, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01173931, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.05530918, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.569804002246691, + "language_loss": 0.88132238, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1878, + "time_per_iteration": 2.4562089443206787 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.02628088, + "balance_loss_mlp": 1.05382609, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2792620862185036, + "language_loss": 0.81521666, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83739763, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.171875, + "step": 1879, + "time_per_iteration": 2.515162944793701 + }, + { + "auxiliary_loss_clip": 0.01174903, + "auxiliary_loss_mlp": 0.01056113, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.05591071, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.9729184409385376, + "language_loss": 0.70101768, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72332788, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1880, + "time_per_iteration": 2.5420267581939697 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.05396068, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.7442831242084353, + "language_loss": 0.72337204, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74552047, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1881, + "time_per_iteration": 2.4648680686950684 + }, + { + "auxiliary_loss_clip": 0.01172977, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.03452694, + "balance_loss_mlp": 1.05385113, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4452990726029533, + "language_loss": 0.74243963, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76474178, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1953125, + "step": 1882, + "time_per_iteration": 2.459181547164917 + }, + { + "auxiliary_loss_clip": 0.01171271, + "auxiliary_loss_mlp": 0.01052266, + "balance_loss_clip": 1.03045106, + "balance_loss_mlp": 1.05493677, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.8641228673356873, + "language_loss": 0.79328096, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81551635, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 1883, + "time_per_iteration": 2.5236945152282715 + }, + { + "auxiliary_loss_clip": 0.01173507, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.03271067, + "balance_loss_mlp": 1.05288672, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.0524763398538193, + "language_loss": 0.77151698, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79379749, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1884, + "time_per_iteration": 2.4974489212036133 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01012749, + "balance_loss_clip": 1.0102694, + "balance_loss_mlp": 1.02156711, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7928734254501784, + "language_loss": 0.55183071, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5725978, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.42382812, + "step": 1885, + "time_per_iteration": 2.9756290912628174 + }, + { + "auxiliary_loss_clip": 0.01166272, + "auxiliary_loss_mlp": 0.01047922, + "balance_loss_clip": 1.02758515, + "balance_loss_mlp": 1.0534817, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 5.752063942495911, + "language_loss": 0.90240276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92454469, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 1886, + "time_per_iteration": 2.5031139850616455 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03018832, + "balance_loss_mlp": 1.05306387, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0267704425546036, + "language_loss": 0.85101235, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87321353, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1887, + "time_per_iteration": 2.5177412033081055 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01061982, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.05554259, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.5783153731033055, + "language_loss": 0.76168925, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1888, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.03542566, + "balance_loss_mlp": 1.05632472, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.114301103868513, + "language_loss": 0.68039739, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70275331, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.21875, + "step": 1889, + "time_per_iteration": 2.643867015838623 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.05620956, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.158184033346157, + "language_loss": 0.84414917, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86635208, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 1890, + "time_per_iteration": 2.5018270015716553 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02855039, + "balance_loss_mlp": 1.05288363, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.2859967152973373, + "language_loss": 0.65099049, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67317504, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 1891, + "time_per_iteration": 2.4870762825012207 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.05397856, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.358390081637715, + "language_loss": 0.87789619, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90005904, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1953125, + "step": 1892, + "time_per_iteration": 2.469215154647827 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.04509139, + "balance_loss_mlp": 1.05419993, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4185703679999775, + "language_loss": 0.72724342, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7496407, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 1893, + "time_per_iteration": 4.021688222885132 + }, + { + "auxiliary_loss_clip": 0.01169367, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.05175805, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.254020248775613, + "language_loss": 0.79367435, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81595069, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.171875, + "step": 1894, + "time_per_iteration": 3.9190711975097656 + }, + { + "auxiliary_loss_clip": 0.01176791, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.05530715, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.587114905294773, + "language_loss": 0.78868139, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.21875, + "step": 1895, + "time_per_iteration": 2.5924861431121826 + }, + { + "auxiliary_loss_clip": 0.0106161, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 0.99917758, + "balance_loss_mlp": 1.01840448, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8005582337036792, + "language_loss": 0.63316774, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65380025, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43359375, + "step": 1896, + "time_per_iteration": 3.143843412399292 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_clip": 1.03600097, + "balance_loss_mlp": 1.05385494, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6058527618620146, + "language_loss": 0.84707338, + "learning_rate": 3.926345380796821e-06, + "loss": 0.86936897, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.15625, + "step": 1897, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.0117262, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.03159046, + "balance_loss_mlp": 1.05385423, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3286063431421926, + "language_loss": 0.79776239, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8200193, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1875, + "step": 1898, + "time_per_iteration": 2.5186216831207275 + }, + { + "auxiliary_loss_clip": 0.01174476, + "auxiliary_loss_mlp": 0.01056562, + "balance_loss_clip": 1.03330398, + "balance_loss_mlp": 1.05247831, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.996095488823442, + "language_loss": 0.73049861, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75280899, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1899, + "time_per_iteration": 2.484767198562622 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.0167762, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9092154832512579, + "language_loss": 0.63432097, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65496433, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.4296875, + "step": 1900, + "time_per_iteration": 3.0239956378936768 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.05181098, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6715138036124124, + "language_loss": 0.78116465, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80344748, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.1875, + "step": 1901, + "time_per_iteration": 2.5007457733154297 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.05482793, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9023337273707566, + "language_loss": 0.83676988, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85908997, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1902, + "time_per_iteration": 2.4389002323150635 + }, + { + "auxiliary_loss_clip": 0.0117356, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05356252, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6958297254772137, + "language_loss": 0.77551281, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79775804, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1903, + "time_per_iteration": 2.503164768218994 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.05437744, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.553861289811236, + "language_loss": 0.75704938, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77922231, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.171875, + "step": 1904, + "time_per_iteration": 2.5097854137420654 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.05519056, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.146045336495955, + "language_loss": 0.92476678, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94702017, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1905, + "time_per_iteration": 2.4905850887298584 + }, + { + "auxiliary_loss_clip": 0.0117632, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.0496794, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.457773566764277, + "language_loss": 0.77108872, + "learning_rate": 3.925399944279861e-06, + "loss": 0.7933597, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.265625, + "step": 1906, + "time_per_iteration": 2.4469265937805176 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.03072143, + "balance_loss_mlp": 1.05375302, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.4555636334810593, + "language_loss": 0.81855345, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84082305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1907, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01173651, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.05599511, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.041607412488977, + "language_loss": 0.84798187, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87037772, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1908, + "time_per_iteration": 2.468519687652588 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.01344705, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9477470057539497, + "language_loss": 0.6100027, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63061339, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.43164062, + "step": 1909, + "time_per_iteration": 2.8313472270965576 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.05660319, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.135894642259737, + "language_loss": 0.78793955, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8102057, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1910, + "time_per_iteration": 2.4613592624664307 + }, + { + "auxiliary_loss_clip": 0.01178149, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.03005373, + "balance_loss_mlp": 1.05803406, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.035949872237615, + "language_loss": 0.76787984, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79017925, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1911, + "time_per_iteration": 2.475069761276245 + }, + { + "auxiliary_loss_clip": 0.01166349, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.03048277, + "balance_loss_mlp": 1.05284548, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.1144124150337023, + "language_loss": 0.7927531, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81493074, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 1912, + "time_per_iteration": 2.543607473373413 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_clip": 1.0369364, + "balance_loss_mlp": 1.05352569, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9322037304643997, + "language_loss": 0.7777245, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80000544, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 1913, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01169525, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.02889776, + "balance_loss_mlp": 1.05118954, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 3.783180746712747, + "language_loss": 0.70389271, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72609949, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1914, + "time_per_iteration": 2.5099785327911377 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 0.99943084, + "balance_loss_mlp": 1.01452589, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7556045547130329, + "language_loss": 0.61044526, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63105142, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.44140625, + "step": 1915, + "time_per_iteration": 3.1735148429870605 + }, + { + "auxiliary_loss_clip": 0.01172283, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_clip": 1.03273964, + "balance_loss_mlp": 1.05674434, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.822924091618307, + "language_loss": 0.9323889, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95465934, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.15625, + "step": 1916, + "time_per_iteration": 2.4806342124938965 + }, + { + "auxiliary_loss_clip": 0.01171086, + "auxiliary_loss_mlp": 0.01061893, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.05340374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8768677942494545, + "language_loss": 0.72286755, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7451973, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.171875, + "step": 1917, + "time_per_iteration": 2.519758701324463 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.05521619, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.2984335892825594, + "language_loss": 0.74389827, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76610279, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 1918, + "time_per_iteration": 2.4867136478424072 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.02394044, + "balance_loss_mlp": 1.05273843, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1981507651696193, + "language_loss": 0.86515707, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88735056, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1919, + "time_per_iteration": 2.4838428497314453 + }, + { + "auxiliary_loss_clip": 0.01168988, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.03190136, + "balance_loss_mlp": 1.05291939, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.516832715272094, + "language_loss": 0.86640596, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88864017, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.15625, + "step": 1920, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01167627, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.05360281, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.2143351457696525, + "language_loss": 0.79792106, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82007331, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 1921, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.01174597, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.03331947, + "balance_loss_mlp": 1.05358851, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 8.96706495073623, + "language_loss": 0.78418177, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8064878, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2109375, + "step": 1922, + "time_per_iteration": 2.5293705463409424 + }, + { + "auxiliary_loss_clip": 0.01174074, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.03910375, + "balance_loss_mlp": 1.05410469, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8482726295091094, + "language_loss": 0.84187758, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86422473, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.203125, + "step": 1923, + "time_per_iteration": 2.5203118324279785 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01053498, + "balance_loss_clip": 1.03074098, + "balance_loss_mlp": 1.05742192, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0576366068601666, + "language_loss": 0.80471247, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1796875, + "step": 1924, + "time_per_iteration": 2.48531436920166 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 0.99917841, + "balance_loss_mlp": 1.0154866, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.810907468185892, + "language_loss": 0.6115036, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6321063, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 3.112396478652954 + }, + { + "auxiliary_loss_clip": 0.01173159, + "auxiliary_loss_mlp": 0.01076027, + "balance_loss_clip": 1.05304384, + "balance_loss_mlp": 1.05447614, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.806943429185086, + "language_loss": 0.7482335, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77072537, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.1875, + "step": 1926, + "time_per_iteration": 2.4890315532684326 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.03873897, + "balance_loss_mlp": 1.0552361, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.429758451090488, + "language_loss": 0.73112315, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7535038, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.203125, + "step": 1927, + "time_per_iteration": 2.4673402309417725 + }, + { + "auxiliary_loss_clip": 0.0117016, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.03244913, + "balance_loss_mlp": 1.05291271, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.854021270140142, + "language_loss": 0.86824137, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89049077, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 1928, + "time_per_iteration": 2.530325412750244 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.03289056, + "balance_loss_mlp": 1.05469573, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.71243688867153, + "language_loss": 0.77567977, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79796684, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1929, + "time_per_iteration": 2.489664316177368 + }, + { + "auxiliary_loss_clip": 0.01168882, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.05385804, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6293868207273203, + "language_loss": 0.76724243, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78955561, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1484375, + "step": 1930, + "time_per_iteration": 2.5867533683776855 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03027928, + "balance_loss_mlp": 1.05313969, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9270697111110349, + "language_loss": 0.72114342, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74335825, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1931, + "time_per_iteration": 2.5218429565429688 + }, + { + "auxiliary_loss_clip": 0.01173627, + "auxiliary_loss_mlp": 0.0105412, + "balance_loss_clip": 1.03168511, + "balance_loss_mlp": 1.05528855, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5295866923660926, + "language_loss": 0.82133794, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84361541, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 1932, + "time_per_iteration": 2.4879212379455566 + }, + { + "auxiliary_loss_clip": 0.01053319, + "auxiliary_loss_mlp": 0.01007659, + "balance_loss_clip": 1.00539386, + "balance_loss_mlp": 1.0111897, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7701959329661775, + "language_loss": 0.61053753, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63114727, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.421875, + "step": 1933, + "time_per_iteration": 2.960437059402466 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03248382, + "balance_loss_mlp": 1.05259895, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.2263920275904425, + "language_loss": 0.85587192, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87813795, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1953125, + "step": 1934, + "time_per_iteration": 5.3810875415802 + }, + { + "auxiliary_loss_clip": 0.01178805, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.05852652, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.481370623449466, + "language_loss": 0.65555394, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67793274, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1935, + "time_per_iteration": 2.483814239501953 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.05533004, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8046174937009931, + "language_loss": 0.75469184, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77699012, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.2109375, + "step": 1936, + "time_per_iteration": 3.8786003589630127 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.05320179, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9600676544166102, + "language_loss": 0.84061754, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86291301, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1953125, + "step": 1937, + "time_per_iteration": 2.5084798336029053 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.05254185, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.0067941571917927, + "language_loss": 0.76479459, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78692102, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.140625, + "step": 1938, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01177239, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.02963328, + "balance_loss_mlp": 1.05566061, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.0274312317590084, + "language_loss": 0.79127967, + "learning_rate": 3.921882769138696e-06, + "loss": 0.8135649, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1939, + "time_per_iteration": 2.5020864009857178 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.05530274, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.7077039427391343, + "language_loss": 0.86712289, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88937664, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1940, + "time_per_iteration": 2.484750270843506 + }, + { + "auxiliary_loss_clip": 0.01172427, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.03699601, + "balance_loss_mlp": 1.05674481, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4506595925957548, + "language_loss": 0.75750297, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7798053, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1941, + "time_per_iteration": 2.7000842094421387 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.05215478, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.1675787105273256, + "language_loss": 0.8828994, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90516704, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.15625, + "step": 1942, + "time_per_iteration": 2.460014581680298 + }, + { + "auxiliary_loss_clip": 0.01170106, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.02839422, + "balance_loss_mlp": 1.05465341, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.688985931696262, + "language_loss": 0.67729998, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69948429, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.15625, + "step": 1943, + "time_per_iteration": 2.5899837017059326 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01046897, + "balance_loss_clip": 1.02586865, + "balance_loss_mlp": 1.05437136, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.2767867948110263, + "language_loss": 0.69852126, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72069371, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1944, + "time_per_iteration": 2.6237125396728516 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.05112338, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.1059371232711572, + "language_loss": 0.82477605, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84690094, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.15625, + "step": 1945, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.05241919, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.378189536328268, + "language_loss": 0.7640717, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.7863518, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 1946, + "time_per_iteration": 2.516782283782959 + }, + { + "auxiliary_loss_clip": 0.01169578, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03249717, + "balance_loss_mlp": 1.05597568, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.040115867247402, + "language_loss": 0.68749321, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70971209, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 1947, + "time_per_iteration": 2.443979501724243 + }, + { + "auxiliary_loss_clip": 0.01173266, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.05761504, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.892409556337103, + "language_loss": 0.84730887, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86967146, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 1948, + "time_per_iteration": 2.456883192062378 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 0.99784815, + "balance_loss_mlp": 1.01743388, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8146373030628324, + "language_loss": 0.65102834, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6716392, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.43359375, + "step": 1949, + "time_per_iteration": 3.083716630935669 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.05524707, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.7265339558443402, + "language_loss": 0.71616268, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73841476, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1328125, + "step": 1950, + "time_per_iteration": 2.5140750408172607 + }, + { + "auxiliary_loss_clip": 0.01174036, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.05524027, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 13.047142281747327, + "language_loss": 0.76811576, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79044604, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1951, + "time_per_iteration": 2.4511098861694336 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.0106409, + "balance_loss_clip": 1.04351449, + "balance_loss_mlp": 1.05736876, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 2.4689531190361858, + "language_loss": 0.75770319, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1952, + "time_per_iteration": 2.5249404907226562 + }, + { + "auxiliary_loss_clip": 0.01170041, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.05350161, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8929141854364566, + "language_loss": 0.71838403, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74068928, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1953, + "time_per_iteration": 2.5321006774902344 + }, + { + "auxiliary_loss_clip": 0.01178671, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.03186345, + "balance_loss_mlp": 1.05794597, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.5505654209141317, + "language_loss": 0.7939415, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 1954, + "time_per_iteration": 2.477182149887085 + }, + { + "auxiliary_loss_clip": 0.01174109, + "auxiliary_loss_mlp": 0.01060284, + "balance_loss_clip": 1.03859961, + "balance_loss_mlp": 1.05628419, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1305529461824344, + "language_loss": 0.85609406, + "learning_rate": 3.920148894924246e-06, + "loss": 0.878438, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1796875, + "step": 1955, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.02949762, + "balance_loss_mlp": 1.05551839, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.149612339355701, + "language_loss": 0.77626467, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79848516, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.171875, + "step": 1956, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.01169266, + "auxiliary_loss_mlp": 0.01054147, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.05667603, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 4.253665449575931, + "language_loss": 0.80333984, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 1957, + "time_per_iteration": 2.508272886276245 + }, + { + "auxiliary_loss_clip": 0.01176684, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.05895627, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 3.1587185145349737, + "language_loss": 0.77638769, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79865301, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1796875, + "step": 1958, + "time_per_iteration": 2.48563551902771 + }, + { + "auxiliary_loss_clip": 0.01174636, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.05859971, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0966272081131985, + "language_loss": 0.76906043, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79128981, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.15625, + "step": 1959, + "time_per_iteration": 2.4826674461364746 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_clip": 1.03128934, + "balance_loss_mlp": 1.05581582, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 3.13785825532277, + "language_loss": 0.69989765, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72212446, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.15625, + "step": 1960, + "time_per_iteration": 2.4965405464172363 + }, + { + "auxiliary_loss_clip": 0.01178622, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.03704309, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.5802576751796327, + "language_loss": 0.81135678, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83372575, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1961, + "time_per_iteration": 2.456537961959839 + }, + { + "auxiliary_loss_clip": 0.01167569, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.05682623, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 3.5009623449342206, + "language_loss": 0.92335653, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94558799, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.109375, + "step": 1962, + "time_per_iteration": 2.4831955432891846 + }, + { + "auxiliary_loss_clip": 0.01175087, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.05849361, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1891263418172353, + "language_loss": 0.87132198, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89361322, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1640625, + "step": 1963, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02764392, + "balance_loss_mlp": 1.05800569, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.1122466665000155, + "language_loss": 0.84163988, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86385566, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1640625, + "step": 1964, + "time_per_iteration": 2.496471405029297 + }, + { + "auxiliary_loss_clip": 0.01178376, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.06327403, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.965243610427017, + "language_loss": 0.82994169, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85229176, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1484375, + "step": 1965, + "time_per_iteration": 2.46545672416687 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05948591, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6968751772896917, + "language_loss": 0.74517393, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76741493, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 1966, + "time_per_iteration": 2.730928421020508 + }, + { + "auxiliary_loss_clip": 0.01185811, + "auxiliary_loss_mlp": 0.01055482, + "balance_loss_clip": 1.03552604, + "balance_loss_mlp": 1.0661025, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.573953561090722, + "language_loss": 0.725128, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74754095, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1953125, + "step": 1967, + "time_per_iteration": 2.459409713745117 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.02409899, + "balance_loss_mlp": 1.0596199, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.07735233424318, + "language_loss": 0.87874025, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90092969, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1968, + "time_per_iteration": 2.474860191345215 + }, + { + "auxiliary_loss_clip": 0.0117476, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.05980873, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.3710109771053904, + "language_loss": 0.66827953, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69053805, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1484375, + "step": 1969, + "time_per_iteration": 2.5025057792663574 + }, + { + "auxiliary_loss_clip": 0.01177024, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.03172874, + "balance_loss_mlp": 1.06375933, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0668162562591013, + "language_loss": 0.81199527, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83428723, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 1970, + "time_per_iteration": 2.6005184650421143 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02306354, + "balance_loss_mlp": 1.02803779, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8059191438251484, + "language_loss": 0.66145539, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68243253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.4375, + "step": 1971, + "time_per_iteration": 3.0580737590789795 + }, + { + "auxiliary_loss_clip": 0.01173379, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03112769, + "balance_loss_mlp": 1.0578413, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9720310647047086, + "language_loss": 0.79760695, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81984764, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 1972, + "time_per_iteration": 2.5330677032470703 + }, + { + "auxiliary_loss_clip": 0.01174806, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 1.06013465, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.451560144092476, + "language_loss": 0.72162819, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74390036, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1973, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.0117035, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02407408, + "balance_loss_mlp": 1.05802357, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.2680636805256897, + "language_loss": 0.71724641, + "learning_rate": 3.918065710622832e-06, + "loss": 0.73938787, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 1974, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01937568, + "balance_loss_mlp": 1.05660915, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.192039880981389, + "language_loss": 0.77186036, + "learning_rate": 3.917955341761128e-06, + "loss": 0.7939533, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.140625, + "step": 1975, + "time_per_iteration": 2.4483766555786133 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.03212273, + "balance_loss_mlp": 1.06021976, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.2667330410251596, + "language_loss": 0.7498399, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77208138, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.125, + "step": 1976, + "time_per_iteration": 3.9421374797821045 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02544367, + "balance_loss_mlp": 1.05979395, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6192257034176818, + "language_loss": 0.75191766, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77408761, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.125, + "step": 1977, + "time_per_iteration": 3.9506070613861084 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.05777454, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 7.387040580957373, + "language_loss": 0.7393533, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76161528, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.15625, + "step": 1978, + "time_per_iteration": 2.4754912853240967 + }, + { + "auxiliary_loss_clip": 0.01168854, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.05782461, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.709416576437117, + "language_loss": 0.73273945, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75491059, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 1979, + "time_per_iteration": 2.478938579559326 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.05735934, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.877436937799078, + "language_loss": 0.98387957, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00608468, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1484375, + "step": 1980, + "time_per_iteration": 2.5758843421936035 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.02580202, + "balance_loss_mlp": 1.05741775, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8930015682875676, + "language_loss": 0.85929906, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88150084, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1981, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01057237, + "balance_loss_clip": 1.03601766, + "balance_loss_mlp": 1.057832, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9242535829958574, + "language_loss": 0.85007018, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87236911, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1982, + "time_per_iteration": 2.513012409210205 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02667475, + "balance_loss_mlp": 1.05463564, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.926275276354154, + "language_loss": 0.85026526, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87239939, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 1983, + "time_per_iteration": 2.4627623558044434 + }, + { + "auxiliary_loss_clip": 0.01172266, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05581713, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.2679367356540894, + "language_loss": 0.77020949, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79243064, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1984, + "time_per_iteration": 2.466224193572998 + }, + { + "auxiliary_loss_clip": 0.01168386, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.03542554, + "balance_loss_mlp": 1.05464029, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7272493982968635, + "language_loss": 0.83323789, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85547268, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 1985, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01162278, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.02659011, + "balance_loss_mlp": 1.05230284, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9847962315308523, + "language_loss": 0.7379061, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75999391, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1015625, + "step": 1986, + "time_per_iteration": 2.4477651119232178 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03664303, + "balance_loss_mlp": 1.05418456, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0940320364759573, + "language_loss": 0.7209813, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74321216, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.125, + "step": 1987, + "time_per_iteration": 2.528564929962158 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.03256774, + "balance_loss_mlp": 1.05243921, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.544292945564917, + "language_loss": 0.72455966, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74676454, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1988, + "time_per_iteration": 2.482295274734497 + }, + { + "auxiliary_loss_clip": 0.01168039, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.05425191, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.3919568417846544, + "language_loss": 0.80848205, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83079755, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 1989, + "time_per_iteration": 2.5321335792541504 + }, + { + "auxiliary_loss_clip": 0.01171171, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.05518925, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7848130249027077, + "language_loss": 0.76000333, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78222507, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1990, + "time_per_iteration": 2.4608383178710938 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.0101675, + "balance_loss_clip": 1.01392448, + "balance_loss_mlp": 1.01813149, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8877551125762418, + "language_loss": 0.55219597, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57296449, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.41992188, + "step": 1991, + "time_per_iteration": 3.0575883388519287 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01048947, + "balance_loss_clip": 1.02905095, + "balance_loss_mlp": 1.05472517, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2244739837006797, + "language_loss": 0.78156978, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8037256, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1171875, + "step": 1992, + "time_per_iteration": 2.5395517349243164 + }, + { + "auxiliary_loss_clip": 0.01170251, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02819777, + "balance_loss_mlp": 1.0534482, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.898510109378507, + "language_loss": 0.78694016, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1993, + "time_per_iteration": 2.5264625549316406 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.05150676, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.7590613991113047, + "language_loss": 0.82287014, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8451097, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 1994, + "time_per_iteration": 2.4871127605438232 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.03551149, + "balance_loss_mlp": 1.05389762, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1035856813409786, + "language_loss": 0.87953222, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9017483, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.125, + "step": 1995, + "time_per_iteration": 2.46690034866333 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.05346155, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.783456627489481, + "language_loss": 0.74206698, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76433849, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1996, + "time_per_iteration": 2.5115768909454346 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.03138888, + "balance_loss_mlp": 1.05337763, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9342712291191904, + "language_loss": 0.88266122, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90486217, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 1997, + "time_per_iteration": 2.4716532230377197 + }, + { + "auxiliary_loss_clip": 0.01167703, + "auxiliary_loss_mlp": 0.01063842, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.05315256, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 3.8633631849497054, + "language_loss": 0.78929418, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81160963, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1998, + "time_per_iteration": 2.4798996448516846 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.05610394, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.053047413592738, + "language_loss": 0.73435485, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75654793, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1999, + "time_per_iteration": 2.5017611980438232 + }, + { + "auxiliary_loss_clip": 0.01169689, + "auxiliary_loss_mlp": 0.01065385, + "balance_loss_clip": 1.04436839, + "balance_loss_mlp": 1.05347967, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 3.6093884580795677, + "language_loss": 0.74955112, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77190185, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 2000, + "time_per_iteration": 2.5060245990753174 + }, + { + "auxiliary_loss_clip": 0.01170552, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.02997398, + "balance_loss_mlp": 1.05408299, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5368563042333518, + "language_loss": 0.84667969, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86889356, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 2001, + "time_per_iteration": 2.499922752380371 + }, + { + "auxiliary_loss_clip": 0.01168457, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.03176236, + "balance_loss_mlp": 1.05330753, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.074842616733997, + "language_loss": 0.73982531, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76202351, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 2002, + "time_per_iteration": 2.486853837966919 + }, + { + "auxiliary_loss_clip": 0.01175825, + "auxiliary_loss_mlp": 0.01058049, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.05508709, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.832741043586106, + "language_loss": 0.78091669, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80325544, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 2003, + "time_per_iteration": 2.4740982055664062 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.0521121, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.9652989098821625, + "language_loss": 0.72093791, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74310923, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2004, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.03877997, + "balance_loss_mlp": 1.0546937, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.2150760255497945, + "language_loss": 0.78260767, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80496937, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 2005, + "time_per_iteration": 2.4991190433502197 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.01496482, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9233110616682776, + "language_loss": 0.58020771, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60082525, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.8520798683166504 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 1.05345094, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7247761793975513, + "language_loss": 0.76275218, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78490144, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.125, + "step": 2007, + "time_per_iteration": 2.50325083732605 + }, + { + "auxiliary_loss_clip": 0.01170732, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.05348623, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.332475401193337, + "language_loss": 0.82973194, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85202336, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2008, + "time_per_iteration": 2.4650609493255615 + }, + { + "auxiliary_loss_clip": 0.0116834, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.03319979, + "balance_loss_mlp": 1.05225682, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.236244219024357, + "language_loss": 0.84184098, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86406672, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2009, + "time_per_iteration": 2.4602744579315186 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.01053411, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.0551877, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7312486930792712, + "language_loss": 0.83945864, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86169434, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.1484375, + "step": 2010, + "time_per_iteration": 2.480238437652588 + }, + { + "auxiliary_loss_clip": 0.01171814, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.03437304, + "balance_loss_mlp": 1.05634403, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.658807365911602, + "language_loss": 0.84157598, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8638559, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 2011, + "time_per_iteration": 2.454406499862671 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03386891, + "balance_loss_mlp": 1.055547, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.879921554869875, + "language_loss": 0.96007967, + "learning_rate": 3.913820600882834e-06, + "loss": 0.9823519, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.171875, + "step": 2012, + "time_per_iteration": 2.479583740234375 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.026914, + "balance_loss_mlp": 1.05365777, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.6055417591736036, + "language_loss": 0.80619711, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82833993, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2013, + "time_per_iteration": 2.538651704788208 + }, + { + "auxiliary_loss_clip": 0.01172968, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.05412138, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.9791821612033953, + "language_loss": 0.77157021, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79376847, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 2014, + "time_per_iteration": 2.4411396980285645 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.02509499, + "balance_loss_mlp": 1.05448556, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.028780359370303, + "language_loss": 0.86930937, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89146852, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2015, + "time_per_iteration": 2.4546844959259033 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.0268662, + "balance_loss_mlp": 1.04779112, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.0866681231001762, + "language_loss": 0.69274801, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71481836, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2016, + "time_per_iteration": 2.469177007675171 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.02042413, + "balance_loss_mlp": 1.05407953, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 3.095255398319528, + "language_loss": 0.80049825, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82262057, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.15625, + "step": 2017, + "time_per_iteration": 2.459447145462036 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.0292666, + "balance_loss_mlp": 1.05315137, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.364451122732105, + "language_loss": 0.69343489, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71563143, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2018, + "time_per_iteration": 3.919508695602417 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.05712008, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.162901456551013, + "language_loss": 0.72318506, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74541652, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 2019, + "time_per_iteration": 3.910888433456421 + }, + { + "auxiliary_loss_clip": 0.01168573, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.03615856, + "balance_loss_mlp": 1.05187333, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8061721544245042, + "language_loss": 0.92484713, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94711161, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2020, + "time_per_iteration": 2.5007998943328857 + }, + { + "auxiliary_loss_clip": 0.01168404, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.05388308, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9478588429028871, + "language_loss": 0.77149868, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79369152, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2021, + "time_per_iteration": 2.522216796875 + }, + { + "auxiliary_loss_clip": 0.01165897, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.02586901, + "balance_loss_mlp": 1.05312037, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0305604143992944, + "language_loss": 0.80324662, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82537007, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2022, + "time_per_iteration": 2.518737316131592 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01051897, + "balance_loss_clip": 1.03094029, + "balance_loss_mlp": 1.057019, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9019957932594662, + "language_loss": 0.8458122, + "learning_rate": 3.912572184769108e-06, + "loss": 0.86806649, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2023, + "time_per_iteration": 2.4534339904785156 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.02916241, + "balance_loss_mlp": 1.05421007, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2004951084054234, + "language_loss": 0.85155022, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87374109, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 2024, + "time_per_iteration": 2.436833143234253 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.02974951, + "balance_loss_mlp": 1.04884946, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.043367551334066, + "language_loss": 0.71662712, + "learning_rate": 3.912344257028954e-06, + "loss": 0.73876667, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.15625, + "step": 2025, + "time_per_iteration": 2.541215658187866 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.02564383, + "balance_loss_mlp": 1.05309796, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.0848974538483755, + "language_loss": 0.75976777, + "learning_rate": 3.912230184382286e-06, + "loss": 0.7819097, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2026, + "time_per_iteration": 2.529049873352051 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.05251837, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.6572777094172597, + "language_loss": 0.88875067, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9108817, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2027, + "time_per_iteration": 2.472158432006836 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03375518, + "balance_loss_mlp": 1.05316114, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 2.343330799439898, + "language_loss": 0.75515145, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77732611, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.125, + "step": 2028, + "time_per_iteration": 2.5286035537719727 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.05089998, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.270604294931249, + "language_loss": 0.766294, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78852487, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2029, + "time_per_iteration": 2.479799747467041 + }, + { + "auxiliary_loss_clip": 0.0116289, + "auxiliary_loss_mlp": 0.01051159, + "balance_loss_clip": 1.03113246, + "balance_loss_mlp": 1.05001879, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.2290592341985747, + "language_loss": 0.7955277, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81766814, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.125, + "step": 2030, + "time_per_iteration": 2.479250431060791 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.03301597, + "balance_loss_mlp": 1.0526309, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9595633959777694, + "language_loss": 0.74556369, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2031, + "time_per_iteration": 2.4966888427734375 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.0269599, + "balance_loss_mlp": 1.05319047, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.9774178696035418, + "language_loss": 0.75045705, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.1328125, + "step": 2032, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.04844511, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6143118682838826, + "language_loss": 0.88853258, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91053319, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0859375, + "step": 2033, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01170793, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03059459, + "balance_loss_mlp": 1.05660009, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1152048244965096, + "language_loss": 0.65517056, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67738092, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 2034, + "time_per_iteration": 2.4647884368896484 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.05399358, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.59634219760927, + "language_loss": 0.76435542, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78657782, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2035, + "time_per_iteration": 2.483016014099121 + }, + { + "auxiliary_loss_clip": 0.01169828, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.03104889, + "balance_loss_mlp": 1.0543201, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8316823187763973, + "language_loss": 0.71407682, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73628777, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2036, + "time_per_iteration": 2.476471185684204 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.0309397, + "balance_loss_mlp": 1.05532706, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.632988910709452, + "language_loss": 0.83352619, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85572863, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2037, + "time_per_iteration": 2.476040840148926 + }, + { + "auxiliary_loss_clip": 0.0117386, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.03625405, + "balance_loss_mlp": 1.05652785, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.722283338591856, + "language_loss": 0.80255699, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82487655, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2038, + "time_per_iteration": 2.5043163299560547 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.0051837, + "balance_loss_mlp": 1.01638949, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.832889593555193, + "language_loss": 0.58671033, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60737002, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.421875, + "step": 2039, + "time_per_iteration": 2.9495608806610107 + }, + { + "auxiliary_loss_clip": 0.01172242, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.0277977, + "balance_loss_mlp": 1.05559754, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.6229044060505298, + "language_loss": 0.80485016, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82706642, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.171875, + "step": 2040, + "time_per_iteration": 2.4483039379119873 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.05270815, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8235003945490114, + "language_loss": 0.82753873, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84970617, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2041, + "time_per_iteration": 2.4804372787475586 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.05399048, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7522185366152092, + "language_loss": 0.66806722, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69026893, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2042, + "time_per_iteration": 2.4683480262756348 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.03032589, + "balance_loss_mlp": 1.05184031, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.8478924147346443, + "language_loss": 0.81661081, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83877933, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2043, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01049773, + "balance_loss_clip": 1.02792168, + "balance_loss_mlp": 1.05028844, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.0920421188484095, + "language_loss": 0.8049221, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82708442, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 2044, + "time_per_iteration": 2.45843768119812 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.02577674, + "balance_loss_mlp": 1.05169511, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7057283877293323, + "language_loss": 0.7796452, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8017351, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.109375, + "step": 2045, + "time_per_iteration": 2.5117220878601074 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.04210341, + "balance_loss_mlp": 1.05461311, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.0659302798736436, + "language_loss": 0.67135215, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69371116, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 2046, + "time_per_iteration": 2.466304063796997 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03793919, + "balance_loss_mlp": 1.05408335, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3143924335245654, + "language_loss": 0.72491664, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7471717, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2047, + "time_per_iteration": 2.4625275135040283 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.0106421, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.05105257, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6458989790549132, + "language_loss": 0.76394033, + "learning_rate": 3.909702248319597e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2048, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.05322123, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.118548028298143, + "language_loss": 0.84626836, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86841822, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.125, + "step": 2049, + "time_per_iteration": 2.538325071334839 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.05051267, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 3.176509780932849, + "language_loss": 0.75351131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77569222, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.140625, + "step": 2050, + "time_per_iteration": 2.499915599822998 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054604, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.05127048, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.9728027261326873, + "language_loss": 0.80877042, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83097064, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.140625, + "step": 2051, + "time_per_iteration": 2.5018789768218994 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.03338933, + "balance_loss_mlp": 1.05348301, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7756923294305167, + "language_loss": 0.79991698, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82209337, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.109375, + "step": 2052, + "time_per_iteration": 2.4962196350097656 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.03793955, + "balance_loss_mlp": 1.0515492, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.071130498978609, + "language_loss": 0.73757279, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75983769, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2053, + "time_per_iteration": 2.4748997688293457 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.03337085, + "balance_loss_mlp": 1.04912996, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.5139588428492408, + "language_loss": 0.73835206, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76054543, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2054, + "time_per_iteration": 2.7009665966033936 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02381933, + "balance_loss_mlp": 1.04980421, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.0020033330801863, + "language_loss": 0.85107529, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87311363, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.1171875, + "step": 2055, + "time_per_iteration": 2.5038392543792725 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.03445673, + "balance_loss_mlp": 1.05093932, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9818000135561404, + "language_loss": 0.77465194, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79683125, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.140625, + "step": 2056, + "time_per_iteration": 2.5265629291534424 + }, + { + "auxiliary_loss_clip": 0.01162241, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.04937708, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9976131339644834, + "language_loss": 0.83188522, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85405934, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2057, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.0116756, + "auxiliary_loss_mlp": 0.01053922, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.05169332, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.751792200322901, + "language_loss": 0.78356105, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80577588, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2058, + "time_per_iteration": 2.5236053466796875 + }, + { + "auxiliary_loss_clip": 0.01167574, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.03086066, + "balance_loss_mlp": 1.05105174, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.1327254817813124, + "language_loss": 0.83191061, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85410988, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2059, + "time_per_iteration": 5.313246726989746 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.0344671, + "balance_loss_mlp": 1.05206418, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.990324814625926, + "language_loss": 0.81387389, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83613217, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 2060, + "time_per_iteration": 3.8617331981658936 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.02928221, + "balance_loss_mlp": 1.04859161, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.0129231677956105, + "language_loss": 0.86278749, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88492751, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2061, + "time_per_iteration": 2.4531033039093018 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.05163288, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.146204871859891, + "language_loss": 0.84992719, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87201917, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 2062, + "time_per_iteration": 2.475050449371338 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01057701, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.05348217, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.194910982672458, + "language_loss": 0.78651118, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80875909, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2063, + "time_per_iteration": 2.4638655185699463 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.05330634, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.133219584666701, + "language_loss": 0.79411167, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81636381, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1328125, + "step": 2064, + "time_per_iteration": 2.4441418647766113 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03083503, + "balance_loss_mlp": 1.04955256, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.2298036351802533, + "language_loss": 0.92358226, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9457252, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2065, + "time_per_iteration": 2.4909794330596924 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02880335, + "balance_loss_mlp": 1.05061674, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7134253508315578, + "language_loss": 0.8042016, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82636184, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.15625, + "step": 2066, + "time_per_iteration": 2.484276056289673 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00111723, + "balance_loss_mlp": 1.01144505, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8687209975293121, + "language_loss": 0.63275361, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65331256, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.41015625, + "step": 2067, + "time_per_iteration": 3.0286524295806885 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03103137, + "balance_loss_mlp": 1.05087852, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9774411847970965, + "language_loss": 0.93209147, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95427418, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.15625, + "step": 2068, + "time_per_iteration": 2.4971697330474854 + }, + { + "auxiliary_loss_clip": 0.01167817, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.053213, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9835561743386452, + "language_loss": 0.81277847, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83494884, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.1484375, + "step": 2069, + "time_per_iteration": 2.4772391319274902 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.03519261, + "balance_loss_mlp": 1.05177176, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.606173275168009, + "language_loss": 0.77390277, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79612398, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2070, + "time_per_iteration": 2.4962410926818848 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.05637431, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.418044156181854, + "language_loss": 0.80847198, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83066666, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1640625, + "step": 2071, + "time_per_iteration": 2.452148199081421 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.03063262, + "balance_loss_mlp": 1.05134583, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.802846280579791, + "language_loss": 0.77933639, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80147374, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2072, + "time_per_iteration": 2.5763509273529053 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03480363, + "balance_loss_mlp": 1.05423427, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.6278132513508976, + "language_loss": 0.74839735, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77060658, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.125, + "step": 2073, + "time_per_iteration": 2.4676945209503174 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01050014, + "balance_loss_clip": 1.02904546, + "balance_loss_mlp": 1.04915833, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9457561725453951, + "language_loss": 0.90556443, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92768592, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2074, + "time_per_iteration": 2.4873156547546387 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.05183172, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3814572559525877, + "language_loss": 0.83753067, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 2075, + "time_per_iteration": 2.500657320022583 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.05080831, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1638910845289567, + "language_loss": 0.73802024, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76022947, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2076, + "time_per_iteration": 2.5686564445495605 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.05219531, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.967733683791653, + "language_loss": 0.7551648, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77721083, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.09375, + "step": 2077, + "time_per_iteration": 2.489954710006714 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.05015802, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 4.043491061132511, + "language_loss": 0.82077563, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84293842, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1328125, + "step": 2078, + "time_per_iteration": 2.445270299911499 + }, + { + "auxiliary_loss_clip": 0.01168396, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.05372512, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.023726857078381, + "language_loss": 0.75024784, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2079, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01173002, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.05697465, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9314739831996124, + "language_loss": 0.83961046, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86190951, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2080, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.05275226, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.0357346796271307, + "language_loss": 0.84575123, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8679868, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1328125, + "step": 2081, + "time_per_iteration": 2.4380433559417725 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.05154538, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.660916229819668, + "language_loss": 0.76882648, + "learning_rate": 3.905726514814646e-06, + "loss": 0.790923, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2082, + "time_per_iteration": 2.454939842224121 + }, + { + "auxiliary_loss_clip": 0.01182882, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.03117347, + "balance_loss_mlp": 1.06035674, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.833832134330164, + "language_loss": 0.78994107, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81229836, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2265625, + "step": 2083, + "time_per_iteration": 2.4439167976379395 + }, + { + "auxiliary_loss_clip": 0.01168103, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.02855682, + "balance_loss_mlp": 1.05132031, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.376124844090109, + "language_loss": 0.89690113, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2084, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.05379784, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9059704425119062, + "language_loss": 0.79718572, + "learning_rate": 3.905371701516869e-06, + "loss": 0.81937099, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1484375, + "step": 2085, + "time_per_iteration": 2.5295538902282715 + }, + { + "auxiliary_loss_clip": 0.0116658, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.05235541, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.9580642243137214, + "language_loss": 0.88227898, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90446126, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2086, + "time_per_iteration": 2.4508614540100098 + }, + { + "auxiliary_loss_clip": 0.01162238, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.02541506, + "balance_loss_mlp": 1.05238986, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.3707303368435957, + "language_loss": 0.87088495, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89295745, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2087, + "time_per_iteration": 2.4342494010925293 + }, + { + "auxiliary_loss_clip": 0.01166252, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.03444421, + "balance_loss_mlp": 1.05230761, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 3.239876707553976, + "language_loss": 0.73480451, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75703704, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.140625, + "step": 2088, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01004858, + "balance_loss_clip": 1.00259304, + "balance_loss_mlp": 1.01231122, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.759594920780347, + "language_loss": 0.61699253, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63757795, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.4140625, + "step": 2089, + "time_per_iteration": 3.0373222827911377 + }, + { + "auxiliary_loss_clip": 0.01165987, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.05317736, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.0159960445234746, + "language_loss": 0.78266793, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80490106, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.125, + "step": 2090, + "time_per_iteration": 2.5307860374450684 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005786, + "balance_loss_clip": 1.00381935, + "balance_loss_mlp": 1.01062346, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.749206069507312, + "language_loss": 0.59394926, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61451876, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.40625, + "step": 2091, + "time_per_iteration": 2.976081609725952 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03016472, + "balance_loss_mlp": 1.0538522, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8692826570762828, + "language_loss": 0.63588953, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6580565, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2092, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.05095637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 3.3800613541528257, + "language_loss": 0.80149096, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82378066, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1875, + "step": 2093, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.05323935, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7179534274341421, + "language_loss": 0.75928843, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78160632, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2094, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01163905, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.03322637, + "balance_loss_mlp": 1.05116057, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.654740537988477, + "language_loss": 0.76833487, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79050487, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2095, + "time_per_iteration": 2.669593095779419 + }, + { + "auxiliary_loss_clip": 0.01166425, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.04330409, + "balance_loss_mlp": 1.05012596, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.7658625824396568, + "language_loss": 0.8312341, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85354173, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2096, + "time_per_iteration": 2.446169853210449 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.05236387, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.9365429623482773, + "language_loss": 0.7532599, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77547324, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 2097, + "time_per_iteration": 2.46520733833313 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.0105919, + "balance_loss_clip": 1.0399375, + "balance_loss_mlp": 1.05366278, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0415683165998004, + "language_loss": 0.8696878, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89196146, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1484375, + "step": 2098, + "time_per_iteration": 2.488985061645508 + }, + { + "auxiliary_loss_clip": 0.01171506, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_clip": 1.03984964, + "balance_loss_mlp": 1.05263424, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8810788789855342, + "language_loss": 0.69538295, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71773493, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.1875, + "step": 2099, + "time_per_iteration": 2.4791061878204346 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01058165, + "balance_loss_clip": 1.03538442, + "balance_loss_mlp": 1.05016196, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.489186386071109, + "language_loss": 0.81622505, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83848113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2100, + "time_per_iteration": 2.4970083236694336 + }, + { + "auxiliary_loss_clip": 0.01170444, + "auxiliary_loss_mlp": 0.01056399, + "balance_loss_clip": 1.03558493, + "balance_loss_mlp": 1.05375385, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 21.240028764463403, + "language_loss": 0.80653214, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82880062, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1640625, + "step": 2101, + "time_per_iteration": 5.441275596618652 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01009667, + "balance_loss_clip": 1.00753367, + "balance_loss_mlp": 1.01423335, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7055092704674581, + "language_loss": 0.57077372, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59140933, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.39648438, + "step": 2102, + "time_per_iteration": 4.4595959186553955 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.05443108, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9163731362545673, + "language_loss": 0.93033105, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9526242, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 2103, + "time_per_iteration": 2.4612908363342285 + }, + { + "auxiliary_loss_clip": 0.01160763, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01929784, + "balance_loss_mlp": 1.05146646, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.70771861982282, + "language_loss": 0.7804687, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80246699, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2104, + "time_per_iteration": 2.556351661682129 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.01056721, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.05698192, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9983303318130716, + "language_loss": 0.81274837, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83504581, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 2105, + "time_per_iteration": 2.4998059272766113 + }, + { + "auxiliary_loss_clip": 0.01177911, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.05756688, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.6618923007939728, + "language_loss": 0.83258855, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85494161, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 2106, + "time_per_iteration": 2.4816856384277344 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.02755296, + "balance_loss_mlp": 1.05664992, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.017673348074064, + "language_loss": 0.73717511, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75936514, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2107, + "time_per_iteration": 2.503575325012207 + }, + { + "auxiliary_loss_clip": 0.01166119, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.02683651, + "balance_loss_mlp": 1.05330598, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.8409726657459213, + "language_loss": 0.79492414, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81705213, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2108, + "time_per_iteration": 2.448009967803955 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.02635407, + "balance_loss_mlp": 1.05213785, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.700834997101356, + "language_loss": 0.75458848, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77675259, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2109, + "time_per_iteration": 2.463996171951294 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.02641523, + "balance_loss_mlp": 1.05309939, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 5.620565406896926, + "language_loss": 0.82876229, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85087943, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2110, + "time_per_iteration": 2.4536476135253906 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03271818, + "balance_loss_mlp": 1.0524385, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8108257578185059, + "language_loss": 0.78553301, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80775553, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.140625, + "step": 2111, + "time_per_iteration": 2.4898500442504883 + }, + { + "auxiliary_loss_clip": 0.01178398, + "auxiliary_loss_mlp": 0.01069762, + "balance_loss_clip": 1.04634905, + "balance_loss_mlp": 1.05599511, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2255287569010567, + "language_loss": 0.76852119, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.2265625, + "step": 2112, + "time_per_iteration": 2.534062623977661 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.05138493, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.901101750436338, + "language_loss": 0.85764933, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 2113, + "time_per_iteration": 2.4980924129486084 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.05287683, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.270588429793272, + "language_loss": 0.74000478, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76224494, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1328125, + "step": 2114, + "time_per_iteration": 2.422631025314331 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.03504217, + "balance_loss_mlp": 1.05601084, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7902572486589996, + "language_loss": 0.83236456, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85464966, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.1484375, + "step": 2115, + "time_per_iteration": 2.4601340293884277 + }, + { + "auxiliary_loss_clip": 0.01169954, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.05397201, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.532692301262898, + "language_loss": 0.86615002, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88845563, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2116, + "time_per_iteration": 2.5315732955932617 + }, + { + "auxiliary_loss_clip": 0.01164638, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.03062534, + "balance_loss_mlp": 1.05188024, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.8525451323112498, + "language_loss": 0.70492947, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72708428, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2117, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01168229, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_clip": 1.03346658, + "balance_loss_mlp": 1.05461121, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.4058915352959294, + "language_loss": 0.86858076, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89081407, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2118, + "time_per_iteration": 2.4760360717773438 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.0279547, + "balance_loss_mlp": 1.0518508, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7933295144796901, + "language_loss": 0.87325591, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89538383, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2119, + "time_per_iteration": 2.547213315963745 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.03024805, + "balance_loss_mlp": 1.05369782, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4444945117671018, + "language_loss": 0.8769815, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89917719, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2120, + "time_per_iteration": 2.4568872451782227 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.02553487, + "balance_loss_mlp": 1.05405664, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8558714180118523, + "language_loss": 0.75193042, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77408671, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2121, + "time_per_iteration": 2.508117437362671 + }, + { + "auxiliary_loss_clip": 0.01167335, + "auxiliary_loss_mlp": 0.01050063, + "balance_loss_clip": 1.02895081, + "balance_loss_mlp": 1.05228865, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.458066848563671, + "language_loss": 0.8294577, + "learning_rate": 3.900942242309978e-06, + "loss": 0.8516317, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2122, + "time_per_iteration": 2.4878990650177 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.02924609, + "balance_loss_mlp": 1.05379128, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.1208761223769375, + "language_loss": 0.79040462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81259328, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2123, + "time_per_iteration": 2.512085199356079 + }, + { + "auxiliary_loss_clip": 0.0117181, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.03411841, + "balance_loss_mlp": 1.05565643, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7846776317234667, + "language_loss": 0.79227948, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81456017, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 2124, + "time_per_iteration": 2.4865264892578125 + }, + { + "auxiliary_loss_clip": 0.01168084, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03267348, + "balance_loss_mlp": 1.05149364, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.8175561910153215, + "language_loss": 0.75565529, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77787793, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2125, + "time_per_iteration": 2.514455795288086 + }, + { + "auxiliary_loss_clip": 0.01166899, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.02645469, + "balance_loss_mlp": 1.05262208, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.1990589160087493, + "language_loss": 0.77811432, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2126, + "time_per_iteration": 2.556657075881958 + }, + { + "auxiliary_loss_clip": 0.01167875, + "auxiliary_loss_mlp": 0.01050746, + "balance_loss_clip": 1.03124356, + "balance_loss_mlp": 1.05559683, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.702389562623477, + "language_loss": 0.69255161, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71473777, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2127, + "time_per_iteration": 2.629990339279175 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.00326061, + "balance_loss_mlp": 1.01139402, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8552720802624753, + "language_loss": 0.62738979, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64794946, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.39257812, + "step": 2128, + "time_per_iteration": 3.1237356662750244 + }, + { + "auxiliary_loss_clip": 0.01168478, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.05287039, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.3711218915030368, + "language_loss": 0.77148604, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79365802, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2129, + "time_per_iteration": 2.4499564170837402 + }, + { + "auxiliary_loss_clip": 0.01179121, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02902186, + "balance_loss_mlp": 1.05744195, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.273395516882369, + "language_loss": 0.79321349, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81552559, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.21875, + "step": 2130, + "time_per_iteration": 2.4536893367767334 + }, + { + "auxiliary_loss_clip": 0.0116812, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.05328345, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.267455405666958, + "language_loss": 0.70879477, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73092055, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1484375, + "step": 2131, + "time_per_iteration": 2.514155149459839 + }, + { + "auxiliary_loss_clip": 0.01166691, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.03007698, + "balance_loss_mlp": 1.05375445, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2952793086030376, + "language_loss": 0.72266257, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74484742, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2132, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01163765, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.03396928, + "balance_loss_mlp": 1.05281162, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1162344308699828, + "language_loss": 0.82306767, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84525442, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2133, + "time_per_iteration": 2.488302230834961 + }, + { + "auxiliary_loss_clip": 0.01174206, + "auxiliary_loss_mlp": 0.01061813, + "balance_loss_clip": 1.03871, + "balance_loss_mlp": 1.05329132, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.538367341661163, + "language_loss": 0.79631573, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81867594, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 2134, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_clip": 1.02393413, + "balance_loss_mlp": 1.05650806, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.033800341734765, + "language_loss": 0.83015293, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85233301, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2135, + "time_per_iteration": 2.4743056297302246 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_clip": 1.03842425, + "balance_loss_mlp": 1.05173945, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.9021762622464853, + "language_loss": 0.77293968, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79520839, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.140625, + "step": 2136, + "time_per_iteration": 2.4412362575531006 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 0.99983084, + "balance_loss_mlp": 1.01248765, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8943310105061408, + "language_loss": 0.59115362, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61168963, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.39257812, + "step": 2137, + "time_per_iteration": 3.2407264709472656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.03207743, + "balance_loss_mlp": 1.04970789, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4694787743163404, + "language_loss": 0.81923193, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84140748, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.15625, + "step": 2138, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01170897, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.05353928, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.804990264663657, + "language_loss": 0.79418135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81644583, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.171875, + "step": 2139, + "time_per_iteration": 2.5321907997131348 + }, + { + "auxiliary_loss_clip": 0.01169458, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02488446, + "balance_loss_mlp": 1.05315363, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1742564972583667, + "language_loss": 0.84761363, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.86976337, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1640625, + "step": 2140, + "time_per_iteration": 2.469543933868408 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.02524316, + "balance_loss_mlp": 1.05079114, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.376703775404894, + "language_loss": 0.85850012, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88059902, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2141, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.0116884, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 1.05059922, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.411777854813752, + "language_loss": 0.68245387, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7046324, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1796875, + "step": 2142, + "time_per_iteration": 2.5327556133270264 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02702951, + "balance_loss_mlp": 1.05430341, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0295098459565692, + "language_loss": 0.82883704, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85104507, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2143, + "time_per_iteration": 4.014873743057251 + }, + { + "auxiliary_loss_clip": 0.01171398, + "auxiliary_loss_mlp": 0.01053828, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.05572712, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7367706894947552, + "language_loss": 0.81788546, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84013772, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.15625, + "step": 2144, + "time_per_iteration": 4.002255439758301 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.04864693, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 3.8817809862500727, + "language_loss": 0.78257203, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80476135, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1640625, + "step": 2145, + "time_per_iteration": 2.4952287673950195 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.02825832, + "balance_loss_mlp": 1.05031526, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.1659704609946897, + "language_loss": 0.82622325, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84839463, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 2146, + "time_per_iteration": 2.4898681640625 + }, + { + "auxiliary_loss_clip": 0.01165601, + "auxiliary_loss_mlp": 0.01051615, + "balance_loss_clip": 1.02959681, + "balance_loss_mlp": 1.05129158, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.717320122986492, + "language_loss": 0.70446974, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72664189, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 2147, + "time_per_iteration": 2.5964484214782715 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.03069305, + "balance_loss_mlp": 1.05166912, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.443887417123452, + "language_loss": 0.71685153, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73902297, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.140625, + "step": 2148, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.02681684, + "balance_loss_mlp": 1.05413008, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.666574129953403, + "language_loss": 0.79379606, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81592482, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1171875, + "step": 2149, + "time_per_iteration": 2.495443820953369 + }, + { + "auxiliary_loss_clip": 0.01167493, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.05306077, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.1379132369478313, + "language_loss": 0.76475441, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78689277, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2150, + "time_per_iteration": 2.524395704269409 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.05094671, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.417935370690141, + "language_loss": 0.70735669, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72954249, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1484375, + "step": 2151, + "time_per_iteration": 2.5213184356689453 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02502, + "balance_loss_mlp": 1.05457592, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9866869590783298, + "language_loss": 0.84050369, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86260849, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2152, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.01167192, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.05128813, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.226463520109079, + "language_loss": 0.78646791, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2153, + "time_per_iteration": 2.46975040435791 + }, + { + "auxiliary_loss_clip": 0.01163518, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.05069268, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.482522823334948, + "language_loss": 0.80135351, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82351738, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2154, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01170487, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.05522227, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0988715261553774, + "language_loss": 0.83128881, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85350406, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2155, + "time_per_iteration": 2.476299524307251 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.02961075, + "balance_loss_mlp": 1.05010283, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.842594732542889, + "language_loss": 0.76062953, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2156, + "time_per_iteration": 2.6024632453918457 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.05121815, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.9934077258859366, + "language_loss": 0.86546719, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88760191, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.109375, + "step": 2157, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.03745282, + "balance_loss_mlp": 1.04796743, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.339899004847696, + "language_loss": 0.80590808, + "learning_rate": 3.896537778333651e-06, + "loss": 0.82814288, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2158, + "time_per_iteration": 2.5332443714141846 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.03510916, + "balance_loss_mlp": 1.05294585, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.254282600322574, + "language_loss": 0.74603379, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76828635, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2159, + "time_per_iteration": 2.469038963317871 + }, + { + "auxiliary_loss_clip": 0.01158286, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0323323, + "balance_loss_mlp": 1.04777908, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.1260113568932746, + "language_loss": 0.8227706, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84488213, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2160, + "time_per_iteration": 2.516723155975342 + }, + { + "auxiliary_loss_clip": 0.01159917, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.05318654, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6308358458278915, + "language_loss": 0.81877828, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8408196, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2161, + "time_per_iteration": 2.4677131175994873 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01056265, + "balance_loss_clip": 1.03479493, + "balance_loss_mlp": 1.05035043, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.2782308625037686, + "language_loss": 0.82592809, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84810847, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2162, + "time_per_iteration": 2.5702993869781494 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.03454113, + "balance_loss_mlp": 1.04993796, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.619296712638915, + "language_loss": 0.72762972, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7498191, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2163, + "time_per_iteration": 2.4531478881835938 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.03183889, + "balance_loss_mlp": 1.05107188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0773433264348435, + "language_loss": 0.81498116, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83718032, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2164, + "time_per_iteration": 2.497072458267212 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02381766, + "balance_loss_mlp": 1.05107093, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2040156749440523, + "language_loss": 0.72564822, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.7477203, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.1328125, + "step": 2165, + "time_per_iteration": 2.515026807785034 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.05286038, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8786436091142913, + "language_loss": 0.74697578, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76912814, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1484375, + "step": 2166, + "time_per_iteration": 2.5301709175109863 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.05156064, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5708346768068926, + "language_loss": 0.83053899, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85266984, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 2167, + "time_per_iteration": 2.632035732269287 + }, + { + "auxiliary_loss_clip": 0.01163335, + "auxiliary_loss_mlp": 0.01060394, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.05201721, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.9158171210349437, + "language_loss": 0.83286303, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85510027, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2168, + "time_per_iteration": 2.4766387939453125 + }, + { + "auxiliary_loss_clip": 0.0116626, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.03075409, + "balance_loss_mlp": 1.05258656, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.304013454801214, + "language_loss": 0.80027354, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82245922, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.140625, + "step": 2169, + "time_per_iteration": 2.5185413360595703 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02329922, + "balance_loss_mlp": 1.05451608, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 4.565704621626811, + "language_loss": 0.66456163, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68668246, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2170, + "time_per_iteration": 2.5556788444519043 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.02397573, + "balance_loss_mlp": 1.05294132, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.848772151746763, + "language_loss": 0.66935396, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69145024, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2171, + "time_per_iteration": 2.553422451019287 + }, + { + "auxiliary_loss_clip": 0.01164709, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.05211711, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9479804069383955, + "language_loss": 0.71952963, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74165899, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2172, + "time_per_iteration": 2.4801840782165527 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02508521, + "balance_loss_mlp": 1.05435848, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8616776845407013, + "language_loss": 0.75547618, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77752787, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0703125, + "step": 2173, + "time_per_iteration": 2.4639194011688232 + }, + { + "auxiliary_loss_clip": 0.01165867, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.03406715, + "balance_loss_mlp": 1.05319107, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.777389952877741, + "language_loss": 0.70484382, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72705513, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.125, + "step": 2174, + "time_per_iteration": 2.4914908409118652 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01011158, + "balance_loss_clip": 1.0087378, + "balance_loss_mlp": 1.0165, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8022263951171452, + "language_loss": 0.59071571, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61137754, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.38671875, + "step": 2175, + "time_per_iteration": 3.244633913040161 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.05474329, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.247504257537708, + "language_loss": 0.79946023, + "learning_rate": 3.894300581166417e-06, + "loss": 0.8216269, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1015625, + "step": 2176, + "time_per_iteration": 2.439883232116699 + }, + { + "auxiliary_loss_clip": 0.01163907, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.02806199, + "balance_loss_mlp": 1.05234194, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8562517641565577, + "language_loss": 0.74595284, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76809454, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2177, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.024472, + "balance_loss_mlp": 1.05222929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.128567307625778, + "language_loss": 0.81855309, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84065676, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1171875, + "step": 2178, + "time_per_iteration": 2.458812713623047 + }, + { + "auxiliary_loss_clip": 0.01166111, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.02695179, + "balance_loss_mlp": 1.05466795, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.66972533149016, + "language_loss": 0.74942935, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77156973, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.109375, + "step": 2179, + "time_per_iteration": 2.4679782390594482 + }, + { + "auxiliary_loss_clip": 0.01161603, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.03079903, + "balance_loss_mlp": 1.05280709, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0560779031919636, + "language_loss": 0.84319234, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86531377, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0859375, + "step": 2180, + "time_per_iteration": 2.567873477935791 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.05700839, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.214126283525484, + "language_loss": 0.8987745, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92098325, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2181, + "time_per_iteration": 2.4802486896514893 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.0557189, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8993602522657917, + "language_loss": 0.68657839, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70867944, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.109375, + "step": 2182, + "time_per_iteration": 2.460148572921753 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.05504203, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.6442759836393277, + "language_loss": 0.78435183, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.125, + "step": 2183, + "time_per_iteration": 2.5462143421173096 + }, + { + "auxiliary_loss_clip": 0.01162472, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02375841, + "balance_loss_mlp": 1.05238128, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.788927255894662, + "language_loss": 0.85543215, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87749588, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2184, + "time_per_iteration": 3.8904993534088135 + }, + { + "auxiliary_loss_clip": 0.01165934, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.0529201, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.505672435211917, + "language_loss": 0.82206696, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84420282, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1328125, + "step": 2185, + "time_per_iteration": 5.3855485916137695 + }, + { + "auxiliary_loss_clip": 0.01168066, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.03323543, + "balance_loss_mlp": 1.05440092, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.0294565364346235, + "language_loss": 0.73037684, + "learning_rate": 3.893047635600818e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1328125, + "step": 2186, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01048235, + "balance_loss_clip": 1.02601433, + "balance_loss_mlp": 1.05449164, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.0525608711513614, + "language_loss": 0.80174023, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82388186, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.109375, + "step": 2187, + "time_per_iteration": 2.463906764984131 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01005839, + "balance_loss_clip": 1.00344312, + "balance_loss_mlp": 1.01508641, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8435449169341035, + "language_loss": 0.58977342, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61036563, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.3828125, + "step": 2188, + "time_per_iteration": 3.1052041053771973 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.05918622, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.1443848583942846, + "language_loss": 0.74199927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76420546, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2189, + "time_per_iteration": 2.5137264728546143 + }, + { + "auxiliary_loss_clip": 0.01166605, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.02894759, + "balance_loss_mlp": 1.05678558, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.7642431940848833, + "language_loss": 0.72561657, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74777287, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2190, + "time_per_iteration": 2.5053412914276123 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_clip": 1.03094649, + "balance_loss_mlp": 1.05706906, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 8.700182749243472, + "language_loss": 0.74395585, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76616025, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1171875, + "step": 2191, + "time_per_iteration": 2.507687568664551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.05689156, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0250128968483403, + "language_loss": 0.79286075, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1171875, + "step": 2192, + "time_per_iteration": 2.5068893432617188 + }, + { + "auxiliary_loss_clip": 0.01168449, + "auxiliary_loss_mlp": 0.01053422, + "balance_loss_clip": 1.03290629, + "balance_loss_mlp": 1.05564141, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9285179647135495, + "language_loss": 0.84827602, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87049472, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.125, + "step": 2193, + "time_per_iteration": 2.456409215927124 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01002145, + "balance_loss_clip": 0.99976075, + "balance_loss_mlp": 1.0179081, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7727203010194038, + "language_loss": 0.54049635, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56107628, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.37890625, + "step": 2194, + "time_per_iteration": 3.0569794178009033 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.02566671, + "balance_loss_mlp": 1.05514359, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7688784093808256, + "language_loss": 0.72086227, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74298465, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2195, + "time_per_iteration": 2.527435541152954 + }, + { + "auxiliary_loss_clip": 0.01173804, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.05663633, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.7664998702658374, + "language_loss": 0.78195536, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2196, + "time_per_iteration": 2.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02714252, + "balance_loss_mlp": 1.05638218, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.1663119445052295, + "language_loss": 0.74861938, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77078474, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1328125, + "step": 2197, + "time_per_iteration": 2.489504814147949 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.05543399, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4378795089069674, + "language_loss": 0.8011694, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82332516, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2198, + "time_per_iteration": 2.437718391418457 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04173732, + "balance_loss_mlp": 1.05483699, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4514815632850038, + "language_loss": 0.82552117, + "learning_rate": 3.891408075291425e-06, + "loss": 0.847803, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2199, + "time_per_iteration": 2.47356915473938 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.02724838, + "balance_loss_mlp": 1.05458844, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.465688895758548, + "language_loss": 0.68963099, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71178007, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2200, + "time_per_iteration": 2.5828843116760254 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.05397916, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.591612522060186, + "language_loss": 0.84600091, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86822116, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2201, + "time_per_iteration": 2.5546202659606934 + }, + { + "auxiliary_loss_clip": 0.01168797, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.05466592, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.901870031688447, + "language_loss": 0.86978126, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89200991, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2202, + "time_per_iteration": 2.509300470352173 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02364576, + "balance_loss_mlp": 1.05389142, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.3614014237187084, + "language_loss": 0.72746712, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74954367, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.109375, + "step": 2203, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01167891, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.05453348, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.5436302639516, + "language_loss": 0.73248756, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75473428, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1328125, + "step": 2204, + "time_per_iteration": 2.5298051834106445 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.05558085, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.7540271848273767, + "language_loss": 0.78627133, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80849254, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2205, + "time_per_iteration": 2.5343189239501953 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01053788, + "balance_loss_clip": 1.033391, + "balance_loss_mlp": 1.05560231, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.234297854715259, + "language_loss": 0.78748876, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80969107, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2206, + "time_per_iteration": 2.473229169845581 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.05758011, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.3028539815574494, + "language_loss": 0.73993444, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76210898, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.125, + "step": 2207, + "time_per_iteration": 2.479421854019165 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.02822387, + "balance_loss_mlp": 1.05323017, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.4105539478543454, + "language_loss": 0.84151787, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86361182, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0703125, + "step": 2208, + "time_per_iteration": 2.501969337463379 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.05553222, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9362156368998853, + "language_loss": 0.85323346, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87540877, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2209, + "time_per_iteration": 2.509761333465576 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.05628705, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.055387861012722, + "language_loss": 0.81545013, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83761609, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2210, + "time_per_iteration": 2.4920527935028076 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01003994, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.02205014, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7616894664797615, + "language_loss": 0.57984382, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004886, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3828125, + "step": 2211, + "time_per_iteration": 3.1735260486602783 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.00312614, + "balance_loss_mlp": 1.02081084, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7970523185699088, + "language_loss": 0.55364317, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57429421, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.38671875, + "step": 2212, + "time_per_iteration": 3.142056465148926 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.056463, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 4.2694742121271645, + "language_loss": 0.74779308, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77002227, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2213, + "time_per_iteration": 2.4599013328552246 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.02889609, + "balance_loss_mlp": 1.05235839, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.0343460890824927, + "language_loss": 0.79269958, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81476456, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0625, + "step": 2214, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.03062189, + "balance_loss_mlp": 1.05593503, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.0399610331480407, + "language_loss": 0.69410872, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71628523, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2215, + "time_per_iteration": 2.5798754692077637 + }, + { + "auxiliary_loss_clip": 0.01166771, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.05576539, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.4518621177772175, + "language_loss": 0.81136751, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83350337, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2216, + "time_per_iteration": 2.49601674079895 + }, + { + "auxiliary_loss_clip": 0.01166215, + "auxiliary_loss_mlp": 0.01057297, + "balance_loss_clip": 1.03668606, + "balance_loss_mlp": 1.05610895, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.5729384628186307, + "language_loss": 0.87350845, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89574361, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1015625, + "step": 2217, + "time_per_iteration": 2.435224771499634 + }, + { + "auxiliary_loss_clip": 0.01165439, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.02716112, + "balance_loss_mlp": 1.05609739, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6393181601709057, + "language_loss": 0.73460543, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2218, + "time_per_iteration": 2.4984188079833984 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02401495, + "balance_loss_mlp": 1.05406141, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.803125703936159, + "language_loss": 0.87483871, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89692807, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2219, + "time_per_iteration": 2.4761111736297607 + }, + { + "auxiliary_loss_clip": 0.01166927, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.03198123, + "balance_loss_mlp": 1.05804753, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5604165479120375, + "language_loss": 0.77241862, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79459906, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0859375, + "step": 2220, + "time_per_iteration": 2.5172770023345947 + }, + { + "auxiliary_loss_clip": 0.01158357, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.05065227, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.752699726256429, + "language_loss": 0.79361391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81564224, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.078125, + "step": 2221, + "time_per_iteration": 2.4729459285736084 + }, + { + "auxiliary_loss_clip": 0.01056162, + "auxiliary_loss_mlp": 0.01004009, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.01797867, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9620212456786271, + "language_loss": 0.6890744, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70967615, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.3828125, + "step": 2222, + "time_per_iteration": 2.9102694988250732 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.02885592, + "balance_loss_mlp": 1.05645049, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8990549263762904, + "language_loss": 0.66966134, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69180298, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1015625, + "step": 2223, + "time_per_iteration": 2.4860363006591797 + }, + { + "auxiliary_loss_clip": 0.01162257, + "auxiliary_loss_mlp": 0.01055999, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.05173874, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.0940561003244738, + "language_loss": 0.82572883, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84791142, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2224, + "time_per_iteration": 2.453310966491699 + }, + { + "auxiliary_loss_clip": 0.01167505, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.05410361, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0797940389634624, + "language_loss": 0.66006851, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68221462, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2225, + "time_per_iteration": 2.505760669708252 + }, + { + "auxiliary_loss_clip": 0.01164479, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03491461, + "balance_loss_mlp": 1.05366707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2490181158076545, + "language_loss": 0.89484501, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91703951, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2226, + "time_per_iteration": 3.827432632446289 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.03132319, + "balance_loss_mlp": 1.05492473, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0692514385202947, + "language_loss": 0.73874348, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76091796, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1171875, + "step": 2227, + "time_per_iteration": 5.469221115112305 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.02971888, + "balance_loss_mlp": 1.05582607, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.597241668203809, + "language_loss": 0.8519839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87414384, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2228, + "time_per_iteration": 2.449289560317993 + }, + { + "auxiliary_loss_clip": 0.01162737, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0275687, + "balance_loss_mlp": 1.05501461, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.700498827765594, + "language_loss": 0.8100034, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83210707, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2229, + "time_per_iteration": 2.454185962677002 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.05576682, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.350850930683171, + "language_loss": 0.73814881, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76035661, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2230, + "time_per_iteration": 2.538679838180542 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.03551102, + "balance_loss_mlp": 1.0541544, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 8.27737726970052, + "language_loss": 0.79914325, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82135391, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1171875, + "step": 2231, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05716896, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9954658779127024, + "language_loss": 0.72341192, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74558049, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2232, + "time_per_iteration": 2.5315330028533936 + }, + { + "auxiliary_loss_clip": 0.01169038, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.02664888, + "balance_loss_mlp": 1.05505097, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.53729194427275, + "language_loss": 0.65508974, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67725778, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2233, + "time_per_iteration": 2.480006694793701 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.02687883, + "balance_loss_mlp": 1.05011904, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 4.541384002557222, + "language_loss": 0.81492066, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8370105, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1171875, + "step": 2234, + "time_per_iteration": 2.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.03466105, + "balance_loss_mlp": 1.05424869, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9481483268780417, + "language_loss": 0.82361299, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84581894, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1171875, + "step": 2235, + "time_per_iteration": 2.4478979110717773 + }, + { + "auxiliary_loss_clip": 0.0116322, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.03378713, + "balance_loss_mlp": 1.05170834, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6568048404288893, + "language_loss": 0.86399209, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88618279, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2236, + "time_per_iteration": 2.534761428833008 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.05506372, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5401183277834882, + "language_loss": 0.76936173, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79150563, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2237, + "time_per_iteration": 2.454881191253662 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.02974725, + "balance_loss_mlp": 1.05312407, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.976295310563951, + "language_loss": 0.78737688, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80954033, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2238, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03057706, + "balance_loss_mlp": 1.0530107, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3078790626960246, + "language_loss": 0.67977941, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70191795, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.09375, + "step": 2239, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.05296254, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.7482132203763245, + "language_loss": 0.81085825, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83300203, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2240, + "time_per_iteration": 2.458702802658081 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.05302262, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.775061814751768, + "language_loss": 0.77491653, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79708141, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2241, + "time_per_iteration": 2.4814610481262207 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.0265156, + "balance_loss_mlp": 1.05368328, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.708340264075402, + "language_loss": 0.83106101, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85311437, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0625, + "step": 2242, + "time_per_iteration": 2.531010627746582 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.05465889, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.594763109819468, + "language_loss": 0.64927268, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67146331, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.125, + "step": 2243, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.05214143, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.6702464572283469, + "language_loss": 0.72275442, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74479383, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2244, + "time_per_iteration": 2.572275161743164 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.03339577, + "balance_loss_mlp": 1.0510093, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6251739599249553, + "language_loss": 0.86419517, + "learning_rate": 3.88550929909221e-06, + "loss": 0.886334, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1015625, + "step": 2245, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.0534606, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.986035604010071, + "language_loss": 0.79054129, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81263721, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2246, + "time_per_iteration": 2.521500825881958 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01006436, + "balance_loss_clip": 1.00378919, + "balance_loss_mlp": 1.01705432, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7498554605470831, + "language_loss": 0.60597092, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6265747, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.3671875, + "step": 2247, + "time_per_iteration": 3.209567070007324 + }, + { + "auxiliary_loss_clip": 0.0117261, + "auxiliary_loss_mlp": 0.01058621, + "balance_loss_clip": 1.03629315, + "balance_loss_mlp": 1.05673957, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.930333372025318, + "language_loss": 0.81250268, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83481503, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2248, + "time_per_iteration": 2.5274717807769775 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.02503014, + "balance_loss_mlp": 1.0515008, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.1598236051462383, + "language_loss": 0.77427459, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79628301, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0625, + "step": 2249, + "time_per_iteration": 2.475325345993042 + }, + { + "auxiliary_loss_clip": 0.01161564, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03477216, + "balance_loss_mlp": 1.05408192, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4620260499768896, + "language_loss": 0.84598488, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86813927, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0703125, + "step": 2250, + "time_per_iteration": 2.5579018592834473 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.02586317, + "balance_loss_mlp": 1.05311561, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9830962049575767, + "language_loss": 0.8213973, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84349537, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1015625, + "step": 2251, + "time_per_iteration": 2.459254503250122 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.03144348, + "balance_loss_mlp": 1.05075097, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.6927381248236872, + "language_loss": 0.85981321, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88194835, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.09375, + "step": 2252, + "time_per_iteration": 2.508246421813965 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.0100648, + "balance_loss_clip": 1.00398886, + "balance_loss_mlp": 1.01368976, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7502755191421498, + "language_loss": 0.61736262, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63793439, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.37109375, + "step": 2253, + "time_per_iteration": 3.1357691287994385 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.0357219, + "balance_loss_mlp": 1.05454588, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.033104819567641, + "language_loss": 0.89383745, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91603261, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2254, + "time_per_iteration": 2.4983997344970703 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.02786362, + "balance_loss_mlp": 1.05202925, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.0851597725495843, + "language_loss": 0.84461302, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86678338, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.140625, + "step": 2255, + "time_per_iteration": 2.4466094970703125 + }, + { + "auxiliary_loss_clip": 0.01159898, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.05059099, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.8290739743459126, + "language_loss": 0.7493006, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77136725, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.09375, + "step": 2256, + "time_per_iteration": 2.49464750289917 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.05080438, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.107811937736733, + "language_loss": 0.83023381, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85237086, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 2257, + "time_per_iteration": 2.4069128036499023 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.03272712, + "balance_loss_mlp": 1.05211377, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.2162023158830655, + "language_loss": 0.82266492, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84489298, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.15625, + "step": 2258, + "time_per_iteration": 2.4187939167022705 + }, + { + "auxiliary_loss_clip": 0.01161942, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.02890849, + "balance_loss_mlp": 1.05117583, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3528312033652434, + "language_loss": 0.82556236, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84770095, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.109375, + "step": 2259, + "time_per_iteration": 2.4182498455047607 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0313561, + "balance_loss_mlp": 1.05370188, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9951846625000045, + "language_loss": 0.73434722, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75647175, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0859375, + "step": 2260, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01160597, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.03389525, + "balance_loss_mlp": 1.05164778, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.6406640236232826, + "language_loss": 0.75450647, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77664864, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2261, + "time_per_iteration": 2.4773809909820557 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02887654, + "balance_loss_mlp": 1.05329657, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.9984757312973846, + "language_loss": 0.63141024, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1171875, + "step": 2262, + "time_per_iteration": 2.5423331260681152 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.598036861128168, + "language_loss": 0.82363462, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84568739, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2263, + "time_per_iteration": 2.472050428390503 + }, + { + "auxiliary_loss_clip": 0.01166147, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.05306447, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7757676532235749, + "language_loss": 0.87984985, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90212959, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1328125, + "step": 2264, + "time_per_iteration": 2.4857943058013916 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.02700329, + "balance_loss_mlp": 1.05115557, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 2.9904691281538693, + "language_loss": 0.7103616, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73248434, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2265, + "time_per_iteration": 2.428753614425659 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02187812, + "balance_loss_mlp": 1.05258036, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.049615390343222, + "language_loss": 0.66760135, + "learning_rate": 3.882766051566027e-06, + "loss": 0.689623, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2266, + "time_per_iteration": 2.4990508556365967 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.05220675, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7538751206895893, + "language_loss": 0.76376909, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78596711, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2267, + "time_per_iteration": 2.485907554626465 + }, + { + "auxiliary_loss_clip": 0.0116058, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.05051804, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.002795226804265, + "language_loss": 0.81781995, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83988714, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1015625, + "step": 2268, + "time_per_iteration": 3.890936851501465 + }, + { + "auxiliary_loss_clip": 0.01161581, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.0542717, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.071095479959133, + "language_loss": 0.76078153, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78285825, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2269, + "time_per_iteration": 4.03081202507019 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.02612138, + "balance_loss_mlp": 1.05518508, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.039865659244694, + "language_loss": 0.80856502, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83068502, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2270, + "time_per_iteration": 2.431426525115967 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.05227089, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.715242097566801, + "language_loss": 0.75720018, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77940053, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.125, + "step": 2271, + "time_per_iteration": 2.440701961517334 + }, + { + "auxiliary_loss_clip": 0.01161613, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.05171776, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2487551674667565, + "language_loss": 0.80084515, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82298499, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1015625, + "step": 2272, + "time_per_iteration": 2.4305598735809326 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01011943, + "balance_loss_clip": 1.00937963, + "balance_loss_mlp": 1.01818228, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7032235049035468, + "language_loss": 0.60682511, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62750536, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.37890625, + "step": 2273, + "time_per_iteration": 3.1601598262786865 + }, + { + "auxiliary_loss_clip": 0.01158579, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.05170178, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7482195510707834, + "language_loss": 0.77978206, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80184555, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2274, + "time_per_iteration": 2.448374032974243 + }, + { + "auxiliary_loss_clip": 0.01163563, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.0536654, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.152740159395537, + "language_loss": 0.78435361, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80645764, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2275, + "time_per_iteration": 2.4761078357696533 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02003431, + "balance_loss_mlp": 1.05312562, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.879456622893362, + "language_loss": 0.81436646, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0859375, + "step": 2276, + "time_per_iteration": 2.453623056411743 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.03082716, + "balance_loss_mlp": 1.05443549, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7308629221608576, + "language_loss": 0.69347179, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71571183, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.15625, + "step": 2277, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01162034, + "auxiliary_loss_mlp": 0.01051118, + "balance_loss_clip": 1.03056657, + "balance_loss_mlp": 1.05136657, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.1796180013972384, + "language_loss": 0.80487186, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2278, + "time_per_iteration": 2.478158950805664 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.02788246, + "balance_loss_mlp": 1.05658543, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.2222454745927744, + "language_loss": 0.74863833, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2279, + "time_per_iteration": 2.5930991172790527 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.05331779, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.3437990696634916, + "language_loss": 0.76614088, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78833258, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1328125, + "step": 2280, + "time_per_iteration": 2.527808666229248 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.0272876, + "balance_loss_mlp": 1.04930711, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7035700975942816, + "language_loss": 0.79808372, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82011348, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.078125, + "step": 2281, + "time_per_iteration": 2.5486884117126465 + }, + { + "auxiliary_loss_clip": 0.01167882, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.03618872, + "balance_loss_mlp": 1.05488086, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.697672260024265, + "language_loss": 0.83955061, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86178571, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2282, + "time_per_iteration": 2.4731719493865967 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.05028629, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.8152250836173982, + "language_loss": 0.73821312, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76034367, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0703125, + "step": 2283, + "time_per_iteration": 2.5041310787200928 + }, + { + "auxiliary_loss_clip": 0.01161767, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02991104, + "balance_loss_mlp": 1.05546188, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.845966051455131, + "language_loss": 0.83875519, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86085427, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2284, + "time_per_iteration": 2.489459991455078 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.05016088, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.9356174938409232, + "language_loss": 0.74778754, + "learning_rate": 3.880256934503974e-06, + "loss": 0.76991928, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 2285, + "time_per_iteration": 2.542114734649658 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.02680647, + "balance_loss_mlp": 1.05192137, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.7476035379248278, + "language_loss": 0.74461651, + "learning_rate": 3.880124162414689e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0703125, + "step": 2286, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01165905, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.02407491, + "balance_loss_mlp": 1.05466056, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4229799840234936, + "language_loss": 0.86074513, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88285446, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2287, + "time_per_iteration": 2.5267093181610107 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.05281329, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.1686670508464783, + "language_loss": 0.68304116, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70512998, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.09375, + "step": 2288, + "time_per_iteration": 2.6589176654815674 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.03410959, + "balance_loss_mlp": 1.05404294, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 3.8263362529629896, + "language_loss": 0.87251699, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89468765, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2289, + "time_per_iteration": 2.4834415912628174 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.02735722, + "balance_loss_mlp": 1.0496552, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.801469753111382, + "language_loss": 0.74045157, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76245451, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2290, + "time_per_iteration": 2.4901175498962402 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01003238, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.01923215, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7021136788609851, + "language_loss": 0.5160234, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53662229, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.375, + "step": 2291, + "time_per_iteration": 3.1141176223754883 + }, + { + "auxiliary_loss_clip": 0.01158988, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05007744, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.104305633549435, + "language_loss": 0.7090801, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73116004, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.09375, + "step": 2292, + "time_per_iteration": 2.5535075664520264 + }, + { + "auxiliary_loss_clip": 0.01160381, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.05272794, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.835181445389694, + "language_loss": 0.79774708, + "learning_rate": 3.879192761826071e-06, + "loss": 0.81979978, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.078125, + "step": 2293, + "time_per_iteration": 2.4434242248535156 + }, + { + "auxiliary_loss_clip": 0.01159833, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_clip": 1.03065419, + "balance_loss_mlp": 1.0489893, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.8100583587938566, + "language_loss": 0.78455698, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80665964, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2294, + "time_per_iteration": 2.5279018878936768 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.02679634, + "balance_loss_mlp": 1.05053687, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.844605455172751, + "language_loss": 0.80448526, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82649422, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0546875, + "step": 2295, + "time_per_iteration": 2.46471905708313 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.02604938, + "balance_loss_mlp": 1.04990947, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905081494696058, + "language_loss": 0.78027165, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80231106, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0859375, + "step": 2296, + "time_per_iteration": 2.489081859588623 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03356612, + "balance_loss_mlp": 1.05272174, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8577842545242083, + "language_loss": 0.78632545, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80845773, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2297, + "time_per_iteration": 2.479617118835449 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.03187263, + "balance_loss_mlp": 1.05133367, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.1383795008624946, + "language_loss": 0.69005466, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71213776, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2298, + "time_per_iteration": 2.4894726276397705 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03075552, + "balance_loss_mlp": 1.05287397, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7932718261070644, + "language_loss": 0.86958891, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89172935, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2299, + "time_per_iteration": 2.4343175888061523 + }, + { + "auxiliary_loss_clip": 0.01158457, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.03221393, + "balance_loss_mlp": 1.05076718, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.6477233854648015, + "language_loss": 0.7542398, + "learning_rate": 3.878257869538267e-06, + "loss": 0.7763505, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.078125, + "step": 2300, + "time_per_iteration": 2.5398943424224854 + }, + { + "auxiliary_loss_clip": 0.01160789, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.03088915, + "balance_loss_mlp": 1.05409729, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.6084363319634956, + "language_loss": 0.82612532, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8482368, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0625, + "step": 2301, + "time_per_iteration": 2.435732841491699 + }, + { + "auxiliary_loss_clip": 0.01155849, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.02461374, + "balance_loss_mlp": 1.04986811, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.0886382571109987, + "language_loss": 0.85972583, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8817209, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0625, + "step": 2302, + "time_per_iteration": 2.504011869430542 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.01009124, + "balance_loss_clip": 1.00688314, + "balance_loss_mlp": 1.0189817, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7554932596602951, + "language_loss": 0.65648526, + "learning_rate": 3.877856132957667e-06, + "loss": 0.677131, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.36328125, + "step": 2303, + "time_per_iteration": 3.2563750743865967 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 1.05022073, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.0694955360834912, + "language_loss": 0.78234196, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80427974, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2304, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01991165, + "balance_loss_mlp": 1.05225086, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.838077080535218, + "language_loss": 0.77824223, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8002485, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.09375, + "step": 2305, + "time_per_iteration": 2.468254804611206 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02732027, + "balance_loss_mlp": 1.04923558, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 3.2063314507866947, + "language_loss": 0.87484217, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89684129, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2306, + "time_per_iteration": 2.4840242862701416 + }, + { + "auxiliary_loss_clip": 0.0105475, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.01749539, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8793018572536648, + "language_loss": 0.59049129, + "learning_rate": 3.877319487288387e-06, + "loss": 0.6111598, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.37304688, + "step": 2307, + "time_per_iteration": 3.1098880767822266 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.05279016, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7539420555734833, + "language_loss": 0.79683769, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81892413, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2308, + "time_per_iteration": 2.5119385719299316 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1876242684272342, + "language_loss": 0.78186178, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80388331, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2309, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.05319023, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9671645437439387, + "language_loss": 0.67473733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69683367, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2310, + "time_per_iteration": 5.331011056900024 + }, + { + "auxiliary_loss_clip": 0.01159907, + "auxiliary_loss_mlp": 0.01051301, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.0511837, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8339330301012977, + "language_loss": 0.83962393, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86173606, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0859375, + "step": 2311, + "time_per_iteration": 2.4287211894989014 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.05262017, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.2677083380951997, + "language_loss": 0.81788063, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83999264, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2312, + "time_per_iteration": 2.5261852741241455 + }, + { + "auxiliary_loss_clip": 0.01165344, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.05353236, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.1868066623869202, + "language_loss": 0.86641061, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88851982, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1171875, + "step": 2313, + "time_per_iteration": 2.491847515106201 + }, + { + "auxiliary_loss_clip": 0.0116138, + "auxiliary_loss_mlp": 0.01052556, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.05377281, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.199884337980412, + "language_loss": 0.79629153, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8184309, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2314, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.0116003, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.02668452, + "balance_loss_mlp": 1.05130863, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.30759926974498, + "language_loss": 0.86246645, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88453007, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0859375, + "step": 2315, + "time_per_iteration": 2.4236056804656982 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_clip": 1.03192866, + "balance_loss_mlp": 1.05146074, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.162038852448813, + "language_loss": 0.77074778, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79286408, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.078125, + "step": 2316, + "time_per_iteration": 2.4574813842773438 + }, + { + "auxiliary_loss_clip": 0.01157842, + "auxiliary_loss_mlp": 0.01058721, + "balance_loss_clip": 1.03733492, + "balance_loss_mlp": 1.05045736, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6719823206156588, + "language_loss": 0.76972795, + "learning_rate": 3.875972890659349e-06, + "loss": 0.7918936, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.078125, + "step": 2317, + "time_per_iteration": 2.448096990585327 + }, + { + "auxiliary_loss_clip": 0.01162372, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.02993095, + "balance_loss_mlp": 1.05272126, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.004328537884534, + "language_loss": 0.80159998, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82372165, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2318, + "time_per_iteration": 2.5152556896209717 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.00027394, + "balance_loss_mlp": 1.01373565, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8654041988705774, + "language_loss": 0.59008324, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61061358, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.3671875, + "step": 2319, + "time_per_iteration": 3.101083993911743 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01053809, + "balance_loss_clip": 1.03365111, + "balance_loss_mlp": 1.05213809, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2307371496542356, + "language_loss": 0.65372109, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67588449, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2320, + "time_per_iteration": 2.580655336380005 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.0507009, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6249908375914148, + "language_loss": 0.70695353, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72896051, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2321, + "time_per_iteration": 2.4594380855560303 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.01055348, + "balance_loss_clip": 1.0345459, + "balance_loss_mlp": 1.04883599, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 43.01057366099128, + "language_loss": 0.86161166, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88375086, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2322, + "time_per_iteration": 2.4912750720977783 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.04840016, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.7187096085030618, + "language_loss": 0.6682983, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69038773, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2323, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0116621, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.03068066, + "balance_loss_mlp": 1.05250573, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0268681764850665, + "language_loss": 0.89011461, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91228795, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2324, + "time_per_iteration": 2.458172559738159 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01056649, + "balance_loss_clip": 1.03626466, + "balance_loss_mlp": 1.04949069, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 4.4201897818475775, + "language_loss": 0.70700991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7291714, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2325, + "time_per_iteration": 2.4608585834503174 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01055057, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 1.05384755, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8512202881484865, + "language_loss": 0.81165004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2326, + "time_per_iteration": 2.474729537963867 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 1.05092621, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.806872548679543, + "language_loss": 0.88955671, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9115777, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0390625, + "step": 2327, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.03790593, + "balance_loss_mlp": 1.05021226, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.4750320646827992, + "language_loss": 0.85236871, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87450516, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2328, + "time_per_iteration": 2.4724884033203125 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.05120313, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.653872228613324, + "language_loss": 0.74084997, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76292944, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2329, + "time_per_iteration": 2.5238442420959473 + }, + { + "auxiliary_loss_clip": 0.01156952, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.03070641, + "balance_loss_mlp": 1.04729962, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 1.840223813628444, + "language_loss": 0.77969897, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80177212, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2330, + "time_per_iteration": 2.468606948852539 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02738333, + "balance_loss_mlp": 1.0495398, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.6993483396219506, + "language_loss": 0.72030222, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74232423, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0625, + "step": 2331, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.05008936, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.585347596838152, + "language_loss": 0.72609055, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74813151, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2332, + "time_per_iteration": 2.4244635105133057 + }, + { + "auxiliary_loss_clip": 0.01047328, + "auxiliary_loss_mlp": 0.01002801, + "balance_loss_clip": 1.00048828, + "balance_loss_mlp": 1.01059568, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8290843953692559, + "language_loss": 0.56071591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58121729, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.3671875, + "step": 2333, + "time_per_iteration": 2.8934712409973145 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.05001664, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7851490004805215, + "language_loss": 0.82529652, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2334, + "time_per_iteration": 2.495786428451538 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.0293529, + "balance_loss_mlp": 1.05012262, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8251700419130605, + "language_loss": 0.81237197, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83440989, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2335, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01163426, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.02829087, + "balance_loss_mlp": 1.05328035, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.83822789048078, + "language_loss": 0.82159901, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8437475, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.1015625, + "step": 2336, + "time_per_iteration": 2.4732770919799805 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.02526581, + "balance_loss_mlp": 1.05202782, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8506426201256954, + "language_loss": 0.80081403, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82283843, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2337, + "time_per_iteration": 2.4599671363830566 + }, + { + "auxiliary_loss_clip": 0.01155582, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02911353, + "balance_loss_mlp": 1.04861474, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.2474896580124963, + "language_loss": 0.7927807, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81482291, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2338, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.05685067, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.148660398501072, + "language_loss": 0.79827893, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82039273, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2339, + "time_per_iteration": 2.4672555923461914 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.0527122, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.7979240482106922, + "language_loss": 0.6582588, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68040884, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2340, + "time_per_iteration": 2.614506483078003 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.02936912, + "balance_loss_mlp": 1.05242825, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.5431372850663334, + "language_loss": 0.78670812, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80874836, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2341, + "time_per_iteration": 2.4420077800750732 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_clip": 1.03058767, + "balance_loss_mlp": 1.05246425, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 11.570217446637303, + "language_loss": 0.80154169, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82360554, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2342, + "time_per_iteration": 2.4961190223693848 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.05673313, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9358851833739352, + "language_loss": 0.77974075, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80176884, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2343, + "time_per_iteration": 2.479679584503174 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0025475, + "balance_loss_mlp": 1.01255798, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8341361150670269, + "language_loss": 0.6155628, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63610566, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3671875, + "step": 2344, + "time_per_iteration": 3.048691987991333 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.02309346, + "balance_loss_mlp": 1.04911709, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.886714907416039, + "language_loss": 0.64591062, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6678347, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0234375, + "step": 2345, + "time_per_iteration": 2.509552240371704 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.023803, + "balance_loss_mlp": 1.05019534, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.502398022219224, + "language_loss": 0.736485, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7585566, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1171875, + "step": 2346, + "time_per_iteration": 2.4962430000305176 + }, + { + "auxiliary_loss_clip": 0.01160187, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.05144429, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.4324488814849703, + "language_loss": 0.77868927, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80075288, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2347, + "time_per_iteration": 2.4663050174713135 + }, + { + "auxiliary_loss_clip": 0.01155281, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02852941, + "balance_loss_mlp": 1.04918981, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7514485331985392, + "language_loss": 0.76446569, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78648651, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2348, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.02346444, + "balance_loss_mlp": 1.0508523, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8350283773112115, + "language_loss": 0.8686446, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89063132, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2349, + "time_per_iteration": 2.4446985721588135 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.05220377, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.7285118920158233, + "language_loss": 0.8895669, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9115696, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2350, + "time_per_iteration": 2.49725341796875 + }, + { + "auxiliary_loss_clip": 0.0115943, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.02542889, + "balance_loss_mlp": 1.05281878, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8656651100546833, + "language_loss": 0.814816, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83687228, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0703125, + "step": 2351, + "time_per_iteration": 3.941416025161743 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.05032706, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6782915885510286, + "language_loss": 0.82935351, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85132694, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0703125, + "step": 2352, + "time_per_iteration": 5.431722640991211 + }, + { + "auxiliary_loss_clip": 0.01047453, + "auxiliary_loss_mlp": 0.01006216, + "balance_loss_clip": 1.00387907, + "balance_loss_mlp": 1.01053333, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.90864091090638, + "language_loss": 0.61894125, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63947791, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.36914062, + "step": 2353, + "time_per_iteration": 3.0071640014648438 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.05024958, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.8535903324814498, + "language_loss": 0.87264848, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89466572, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2354, + "time_per_iteration": 2.4613726139068604 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02607965, + "balance_loss_mlp": 1.04953241, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9651075901387003, + "language_loss": 0.74872321, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.078125, + "step": 2355, + "time_per_iteration": 2.442379951477051 + }, + { + "auxiliary_loss_clip": 0.01047047, + "auxiliary_loss_mlp": 0.01002716, + "balance_loss_clip": 1.00052261, + "balance_loss_mlp": 1.01023293, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6790475533637321, + "language_loss": 0.5182299, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53872752, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2356, + "time_per_iteration": 2.9892258644104004 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03495359, + "balance_loss_mlp": 1.05080867, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 3.0630792396255053, + "language_loss": 0.70576489, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72786456, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2357, + "time_per_iteration": 2.421844005584717 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.05012453, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8720076771552743, + "language_loss": 0.82205695, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84416115, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.09375, + "step": 2358, + "time_per_iteration": 2.4519011974334717 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.02663624, + "balance_loss_mlp": 1.051018, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 6.439592826280342, + "language_loss": 0.7129705, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73505127, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1015625, + "step": 2359, + "time_per_iteration": 2.4797613620758057 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02374041, + "balance_loss_mlp": 1.04988599, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 5.514404455287625, + "language_loss": 0.76040578, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78239685, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2360, + "time_per_iteration": 2.4538815021514893 + }, + { + "auxiliary_loss_clip": 0.011559, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.02173233, + "balance_loss_mlp": 1.05221295, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.1535632205539135, + "language_loss": 0.8188749, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84085315, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2361, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01152529, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.02826524, + "balance_loss_mlp": 1.04964995, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.775663525053056, + "language_loss": 0.74489617, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76689464, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2362, + "time_per_iteration": 2.530163049697876 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.0265274, + "balance_loss_mlp": 1.05187464, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 4.478599792998506, + "language_loss": 0.73748112, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75952733, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2363, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.02605534, + "balance_loss_mlp": 1.05005693, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8353407682080387, + "language_loss": 0.72971261, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75172973, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2364, + "time_per_iteration": 2.5670576095581055 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.05015445, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 4.452075303519762, + "language_loss": 0.90230036, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92430955, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 1.015625, + "step": 2365, + "time_per_iteration": 2.5130062103271484 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.02735198, + "balance_loss_mlp": 1.04896259, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.956458588852685, + "language_loss": 0.65377176, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67579615, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2366, + "time_per_iteration": 2.5081095695495605 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.03640223, + "balance_loss_mlp": 1.04979372, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.190613479881076, + "language_loss": 0.80414236, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82623357, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2367, + "time_per_iteration": 2.4398317337036133 + }, + { + "auxiliary_loss_clip": 0.01158941, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.05221498, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.898581267606924, + "language_loss": 0.82619941, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84833181, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2368, + "time_per_iteration": 2.512401580810547 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.05165803, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.7587049982231675, + "language_loss": 0.86971414, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89178908, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2369, + "time_per_iteration": 2.444784164428711 + }, + { + "auxiliary_loss_clip": 0.01158905, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.04913163, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4370193327140612, + "language_loss": 0.75704634, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77906322, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2370, + "time_per_iteration": 2.527740240097046 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0384295, + "balance_loss_mlp": 1.04879546, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7319048865171518, + "language_loss": 0.82923144, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85136044, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2371, + "time_per_iteration": 2.4644808769226074 + }, + { + "auxiliary_loss_clip": 0.01158835, + "auxiliary_loss_mlp": 0.01051346, + "balance_loss_clip": 1.03171265, + "balance_loss_mlp": 1.05157602, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.956158386855541, + "language_loss": 0.82575452, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84785628, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0703125, + "step": 2372, + "time_per_iteration": 2.42240047454834 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.03175569, + "balance_loss_mlp": 1.05134308, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.19442784605527, + "language_loss": 0.8396256, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86171949, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2373, + "time_per_iteration": 2.444695472717285 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03294528, + "balance_loss_mlp": 1.05012143, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.034088541649992, + "language_loss": 0.86271042, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88476801, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.046875, + "step": 2374, + "time_per_iteration": 2.428062915802002 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03024197, + "balance_loss_mlp": 1.05125451, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 4.612229602439842, + "language_loss": 0.7919687, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2375, + "time_per_iteration": 2.526838541030884 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.05240607, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.386247922788535, + "language_loss": 0.76400912, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78615618, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2376, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01156552, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.05075741, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.9035160782842753, + "language_loss": 0.93037754, + "learning_rate": 3.867744103671717e-06, + "loss": 0.952438, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2377, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02991319, + "balance_loss_mlp": 1.05085003, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9751577144221115, + "language_loss": 0.91598773, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93807983, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.0703125, + "step": 2378, + "time_per_iteration": 2.558563470840454 + }, + { + "auxiliary_loss_clip": 0.01159674, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.051296, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.745891074970689, + "language_loss": 0.73947102, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76151079, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2379, + "time_per_iteration": 2.511359214782715 + }, + { + "auxiliary_loss_clip": 0.01156473, + "auxiliary_loss_mlp": 0.01056109, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05014992, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8640465231226504, + "language_loss": 0.79013336, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81225914, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2380, + "time_per_iteration": 2.466219663619995 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.05528164, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.3244590707621073, + "language_loss": 0.87958229, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90172088, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.078125, + "step": 2381, + "time_per_iteration": 2.4476850032806396 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.03084123, + "balance_loss_mlp": 1.0517571, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.599935932772449, + "language_loss": 0.76852649, + "learning_rate": 3.867046846740299e-06, + "loss": 0.7906065, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2382, + "time_per_iteration": 2.4389045238494873 + }, + { + "auxiliary_loss_clip": 0.01157847, + "auxiliary_loss_mlp": 0.01053474, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.05068171, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.461149819336849, + "language_loss": 0.76948071, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79159391, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0703125, + "step": 2383, + "time_per_iteration": 2.516038179397583 + }, + { + "auxiliary_loss_clip": 0.01158748, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0299803, + "balance_loss_mlp": 1.05114412, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.169581662424978, + "language_loss": 0.88202822, + "learning_rate": 3.866767448340471e-06, + "loss": 0.9041245, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.078125, + "step": 2384, + "time_per_iteration": 2.42138934135437 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.02780819, + "balance_loss_mlp": 1.05382657, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 4.175812514986151, + "language_loss": 0.79225606, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81439185, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2385, + "time_per_iteration": 2.4439244270324707 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.02692771, + "balance_loss_mlp": 1.04881537, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.9672730758223058, + "language_loss": 0.74989617, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77192366, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2386, + "time_per_iteration": 2.533304214477539 + }, + { + "auxiliary_loss_clip": 0.01159067, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02777338, + "balance_loss_mlp": 1.05180025, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5174427688568626, + "language_loss": 0.78475344, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80681831, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0703125, + "step": 2387, + "time_per_iteration": 2.4568724632263184 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.05092847, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.559937991009886, + "language_loss": 0.82087159, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84299791, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2388, + "time_per_iteration": 2.5136237144470215 + }, + { + "auxiliary_loss_clip": 0.01161514, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.05393136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.471836270672028, + "language_loss": 0.82267237, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84473729, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.078125, + "step": 2389, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03020322, + "balance_loss_mlp": 1.05032301, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.165584666776674, + "language_loss": 0.82654548, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84867263, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2390, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_clip": 1.03406334, + "balance_loss_mlp": 1.0510571, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 3.0575281215329086, + "language_loss": 0.74616158, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76828718, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.078125, + "step": 2391, + "time_per_iteration": 2.5368545055389404 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01014282, + "balance_loss_clip": 1.0121367, + "balance_loss_mlp": 1.01461065, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8732258813949081, + "language_loss": 0.61769497, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63834715, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.36328125, + "step": 2392, + "time_per_iteration": 2.9315476417541504 + }, + { + "auxiliary_loss_clip": 0.01161818, + "auxiliary_loss_mlp": 0.01056559, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.04981267, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.638581894381379, + "language_loss": 0.76172751, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78391123, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2393, + "time_per_iteration": 3.857799530029297 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_clip": 1.02908611, + "balance_loss_mlp": 1.05249143, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8778469598095298, + "language_loss": 0.76782668, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78993082, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2394, + "time_per_iteration": 3.979130983352661 + }, + { + "auxiliary_loss_clip": 0.01158023, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.02836156, + "balance_loss_mlp": 1.05062532, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.605706810552395, + "language_loss": 0.85831755, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88038385, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.078125, + "step": 2395, + "time_per_iteration": 2.652092933654785 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.03040648, + "balance_loss_mlp": 1.05241179, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5230484666362787, + "language_loss": 0.82984561, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85192204, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0546875, + "step": 2396, + "time_per_iteration": 2.4647467136383057 + }, + { + "auxiliary_loss_clip": 0.01152766, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.02691364, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.435366869769497, + "language_loss": 0.82564163, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8476299, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2397, + "time_per_iteration": 2.4151055812835693 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.05216622, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6104109289920625, + "language_loss": 0.79418427, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81627429, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2398, + "time_per_iteration": 2.4470388889312744 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.05359757, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.7500042291552433, + "language_loss": 0.64847696, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67058688, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2399, + "time_per_iteration": 2.5123891830444336 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.05306005, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.4896130870957418, + "language_loss": 0.82329226, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84531689, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2400, + "time_per_iteration": 2.4825797080993652 + }, + { + "auxiliary_loss_clip": 0.01162323, + "auxiliary_loss_mlp": 0.01052957, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.053689, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.540874002782335, + "language_loss": 0.74606794, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76822078, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0859375, + "step": 2401, + "time_per_iteration": 2.507983684539795 + }, + { + "auxiliary_loss_clip": 0.01156636, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.0269084, + "balance_loss_mlp": 1.05109596, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7568662987329828, + "language_loss": 0.80577219, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82780313, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2402, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.01156436, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.02880669, + "balance_loss_mlp": 1.05137098, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.1115432529250753, + "language_loss": 0.84918672, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87124002, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.046875, + "step": 2403, + "time_per_iteration": 2.4267525672912598 + }, + { + "auxiliary_loss_clip": 0.01155438, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.03312445, + "balance_loss_mlp": 1.04934669, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.423742001713465, + "language_loss": 0.70017314, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72226501, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2404, + "time_per_iteration": 2.487827777862549 + }, + { + "auxiliary_loss_clip": 0.01151274, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.0473218, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.368746641876408, + "language_loss": 0.72847003, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75046992, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0390625, + "step": 2405, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02879858, + "balance_loss_mlp": 1.04891181, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2064790582144473, + "language_loss": 0.73115766, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75316191, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2406, + "time_per_iteration": 2.4501168727874756 + }, + { + "auxiliary_loss_clip": 0.0115439, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.03161645, + "balance_loss_mlp": 1.04889357, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.997473868200426, + "language_loss": 0.75399184, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77606416, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2407, + "time_per_iteration": 2.482008934020996 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.02418649, + "balance_loss_mlp": 1.04607177, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.6082248834480546, + "language_loss": 0.79472804, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81668091, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0625, + "step": 2408, + "time_per_iteration": 2.4657323360443115 + }, + { + "auxiliary_loss_clip": 0.01155517, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.05088127, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1979655558708893, + "language_loss": 0.82594806, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84802014, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.046875, + "step": 2409, + "time_per_iteration": 2.450345039367676 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_clip": 1.03411365, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.954409921875598, + "language_loss": 0.74561608, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7677173, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0625, + "step": 2410, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02883387, + "balance_loss_mlp": 1.04858971, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.3844352739280597, + "language_loss": 0.81135416, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83336866, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0546875, + "step": 2411, + "time_per_iteration": 2.4708898067474365 + }, + { + "auxiliary_loss_clip": 0.0115486, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.03403103, + "balance_loss_mlp": 1.05123138, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.954560524414831, + "language_loss": 0.69816971, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7202487, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2412, + "time_per_iteration": 2.5614776611328125 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.05100143, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.1541085269745803, + "language_loss": 0.77347231, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79548067, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2413, + "time_per_iteration": 2.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01049286, + "auxiliary_loss_mlp": 0.0100403, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.01294982, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152840666775347, + "language_loss": 0.58887923, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60941237, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.36328125, + "step": 2414, + "time_per_iteration": 2.9752402305603027 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 0.99943656, + "balance_loss_mlp": 1.01240802, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8348908268898737, + "language_loss": 0.6218617, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64236534, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.36328125, + "step": 2415, + "time_per_iteration": 3.039710521697998 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_clip": 1.02637458, + "balance_loss_mlp": 1.04699647, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.8743578134099377, + "language_loss": 0.72001135, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74199259, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2416, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.0104556, + "auxiliary_loss_mlp": 0.01005813, + "balance_loss_clip": 1.00379848, + "balance_loss_mlp": 1.01002693, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.711670432605859, + "language_loss": 0.60392165, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62443542, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.35546875, + "step": 2417, + "time_per_iteration": 3.0824739933013916 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.03639972, + "balance_loss_mlp": 1.04795754, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9144560714513363, + "language_loss": 0.79237175, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8144896, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2418, + "time_per_iteration": 2.564497947692871 + }, + { + "auxiliary_loss_clip": 0.01150145, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.03267837, + "balance_loss_mlp": 1.04712582, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8755047341617508, + "language_loss": 0.72032261, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74234051, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2419, + "time_per_iteration": 2.457617998123169 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.0335021, + "balance_loss_mlp": 1.05042267, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3659429121693525, + "language_loss": 0.90125811, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92333627, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.03125, + "step": 2420, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.04868603, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.2940003535379057, + "language_loss": 0.83309549, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85520703, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0546875, + "step": 2421, + "time_per_iteration": 2.441432476043701 + }, + { + "auxiliary_loss_clip": 0.01153189, + "auxiliary_loss_mlp": 0.01053683, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.04684627, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.6167157199382733, + "language_loss": 0.81511533, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83718407, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2422, + "time_per_iteration": 2.473010540008545 + }, + { + "auxiliary_loss_clip": 0.01046424, + "auxiliary_loss_mlp": 0.01017838, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.01065397, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9226410759759552, + "language_loss": 0.63245702, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65309966, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.35742188, + "step": 2423, + "time_per_iteration": 3.0516433715820312 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.02756512, + "balance_loss_mlp": 1.05096769, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7656587875688796, + "language_loss": 0.8267172, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84872198, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.046875, + "step": 2424, + "time_per_iteration": 2.4918792247772217 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03071296, + "balance_loss_mlp": 1.04970837, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.0603730404595915, + "language_loss": 0.79317909, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81520677, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2425, + "time_per_iteration": 2.4607083797454834 + }, + { + "auxiliary_loss_clip": 0.01155851, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.05136847, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4026453111661703, + "language_loss": 0.83269531, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85473925, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2426, + "time_per_iteration": 2.4615883827209473 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.02420735, + "balance_loss_mlp": 1.05100346, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.78851961601388, + "language_loss": 0.86878085, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89073801, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0234375, + "step": 2427, + "time_per_iteration": 2.46846866607666 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.05060291, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9424277979169204, + "language_loss": 0.66795039, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69001138, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2428, + "time_per_iteration": 2.4277987480163574 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.02870345, + "balance_loss_mlp": 1.05036306, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7447652065053452, + "language_loss": 0.8363744, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2429, + "time_per_iteration": 2.5208661556243896 + }, + { + "auxiliary_loss_clip": 0.01152615, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.04804671, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.723947749216575, + "language_loss": 0.78811824, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8101294, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2430, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0115835, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03014231, + "balance_loss_mlp": 1.0529238, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.3723861833809767, + "language_loss": 0.83178174, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85385835, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2431, + "time_per_iteration": 2.468472480773926 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.03244448, + "balance_loss_mlp": 1.05131185, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7402379411604871, + "language_loss": 0.78777766, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80989814, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.078125, + "step": 2432, + "time_per_iteration": 2.4618513584136963 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.03158331, + "balance_loss_mlp": 1.04917812, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9105383938395448, + "language_loss": 0.79940903, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82146859, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2433, + "time_per_iteration": 2.4901435375213623 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.01051119, + "balance_loss_clip": 1.03149712, + "balance_loss_mlp": 1.05186844, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.8984055506020234, + "language_loss": 0.78421938, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80625868, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2434, + "time_per_iteration": 3.833007335662842 + }, + { + "auxiliary_loss_clip": 0.01046525, + "auxiliary_loss_mlp": 0.01005945, + "balance_loss_clip": 1.00356054, + "balance_loss_mlp": 1.01038933, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8674820067375166, + "language_loss": 0.58373666, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60426134, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.36132812, + "step": 2435, + "time_per_iteration": 5.911077499389648 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.02620411, + "balance_loss_mlp": 1.04662895, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.2832294661951753, + "language_loss": 0.88395989, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90589368, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2436, + "time_per_iteration": 2.440303325653076 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.05032742, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.0196076648737, + "language_loss": 0.74832988, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7703594, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2437, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.04798007, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.346268485469047, + "language_loss": 0.73932636, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76131272, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2438, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.02310383, + "balance_loss_mlp": 1.05231857, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.8289443089735578, + "language_loss": 0.74791402, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76987189, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2439, + "time_per_iteration": 2.4596338272094727 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.02872145, + "balance_loss_mlp": 1.04913521, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.106046924266039, + "language_loss": 0.74542844, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742673, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 2440, + "time_per_iteration": 2.613889217376709 + }, + { + "auxiliary_loss_clip": 0.01146734, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02964425, + "balance_loss_mlp": 1.04660702, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6151911954653986, + "language_loss": 0.83047861, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242939, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2441, + "time_per_iteration": 2.508570432662964 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.0275681, + "balance_loss_mlp": 1.04952955, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 3.362343971731744, + "language_loss": 0.71562135, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73766863, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2442, + "time_per_iteration": 2.4903416633605957 + }, + { + "auxiliary_loss_clip": 0.01160717, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.0510819, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.2762909335645043, + "language_loss": 0.80804002, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83006966, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2443, + "time_per_iteration": 2.424539089202881 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.0504694, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 2.077049554342068, + "language_loss": 0.8297509, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85179389, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2444, + "time_per_iteration": 2.4937214851379395 + }, + { + "auxiliary_loss_clip": 0.01154781, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.05025554, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.763635964291881, + "language_loss": 0.71218902, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73422623, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2445, + "time_per_iteration": 2.491645336151123 + }, + { + "auxiliary_loss_clip": 0.01045345, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.00942683, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8232649654452494, + "language_loss": 0.63138294, + "learning_rate": 3.857965866494923e-06, + "loss": 0.6521225, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.359375, + "step": 2446, + "time_per_iteration": 2.9610531330108643 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.02355385, + "balance_loss_mlp": 1.05348802, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8119571313268434, + "language_loss": 0.74937665, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7713967, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2447, + "time_per_iteration": 2.547112226486206 + }, + { + "auxiliary_loss_clip": 0.0115445, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02838051, + "balance_loss_mlp": 1.04998112, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.0554455972062744, + "language_loss": 0.85722244, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87923658, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2448, + "time_per_iteration": 2.519530773162842 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.0081377, + "balance_loss_mlp": 1.00952029, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7649510042513386, + "language_loss": 0.56836212, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58892155, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.359375, + "step": 2449, + "time_per_iteration": 3.0049068927764893 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.04850447, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.900224172693126, + "language_loss": 0.85544562, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87738931, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2450, + "time_per_iteration": 2.5826945304870605 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.05074143, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.029178420182481, + "language_loss": 0.74693608, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76899183, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2451, + "time_per_iteration": 2.4345250129699707 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.02092934, + "balance_loss_mlp": 1.04758763, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6073898366987713, + "language_loss": 0.82240498, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8442679, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2452, + "time_per_iteration": 2.468869924545288 + }, + { + "auxiliary_loss_clip": 0.01154267, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02032936, + "balance_loss_mlp": 1.05154371, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.7191329381743174, + "language_loss": 0.74021572, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76214325, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2453, + "time_per_iteration": 2.433424472808838 + }, + { + "auxiliary_loss_clip": 0.01154761, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.03048682, + "balance_loss_mlp": 1.04918802, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.171465059586897, + "language_loss": 0.76326835, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78531623, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2454, + "time_per_iteration": 2.419368028640747 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.04922831, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.006370127686132, + "language_loss": 0.8301537, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85209435, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2455, + "time_per_iteration": 2.426819324493408 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.04846048, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.442844218228049, + "language_loss": 0.83938581, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8613984, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.09375, + "step": 2456, + "time_per_iteration": 2.525296926498413 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.02459574, + "balance_loss_mlp": 1.04980254, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8839437807359896, + "language_loss": 0.84325618, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86520827, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2457, + "time_per_iteration": 2.471068859100342 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.04932761, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9862753985638202, + "language_loss": 0.75645256, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77835512, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2458, + "time_per_iteration": 2.44146466255188 + }, + { + "auxiliary_loss_clip": 0.01160318, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.0284996, + "balance_loss_mlp": 1.05119324, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.405388225865701, + "language_loss": 0.83817005, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86026746, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2459, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.0489651, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.6666731923680733, + "language_loss": 0.75856471, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78047681, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2460, + "time_per_iteration": 2.4294657707214355 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.05102873, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6904429322803973, + "language_loss": 0.81591463, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83791113, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0703125, + "step": 2461, + "time_per_iteration": 2.4993178844451904 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.05356562, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2471604819605036, + "language_loss": 0.65689576, + "learning_rate": 3.855650475213761e-06, + "loss": 0.678958, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2462, + "time_per_iteration": 2.4197235107421875 + }, + { + "auxiliary_loss_clip": 0.0115574, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.02929282, + "balance_loss_mlp": 1.05148113, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4717210360784851, + "language_loss": 0.67368174, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69572735, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0390625, + "step": 2463, + "time_per_iteration": 2.774268865585327 + }, + { + "auxiliary_loss_clip": 0.01154664, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 1.04978383, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.177919724516607, + "language_loss": 0.76567936, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78772676, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2464, + "time_per_iteration": 2.4522674083709717 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.03089297, + "balance_loss_mlp": 1.05009413, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.623144605896263, + "language_loss": 0.79623306, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81824923, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0234375, + "step": 2465, + "time_per_iteration": 2.4946794509887695 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02543747, + "balance_loss_mlp": 1.0522809, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.8838905575360925, + "language_loss": 0.76230991, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78436887, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2466, + "time_per_iteration": 2.4722483158111572 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01613474, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8013334536894682, + "language_loss": 0.60022712, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62095666, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.3671875, + "step": 2467, + "time_per_iteration": 3.0702927112579346 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.05059397, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.3345318496369405, + "language_loss": 0.87671721, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89869595, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.046875, + "step": 2468, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.0322901, + "balance_loss_mlp": 1.05078602, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 4.884804263226826, + "language_loss": 0.75884396, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78094912, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2469, + "time_per_iteration": 2.4750967025756836 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.03425384, + "balance_loss_mlp": 1.04954958, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.457578452134473, + "language_loss": 0.76183128, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78390741, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2470, + "time_per_iteration": 2.4312937259674072 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.05050206, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9398758609720104, + "language_loss": 0.72121894, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74320322, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2471, + "time_per_iteration": 2.519866466522217 + }, + { + "auxiliary_loss_clip": 0.01160204, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.0272181, + "balance_loss_mlp": 1.0499022, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.11598070664324, + "language_loss": 0.89739621, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91947466, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1015625, + "step": 2472, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_clip": 1.030123, + "balance_loss_mlp": 1.05059123, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 4.013793804030176, + "language_loss": 0.80734539, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82939184, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2473, + "time_per_iteration": 2.4329466819763184 + }, + { + "auxiliary_loss_clip": 0.0115911, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.05129409, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5981192604624526, + "language_loss": 0.77540123, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79762381, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2474, + "time_per_iteration": 2.453432083129883 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.04955983, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8103491271764227, + "language_loss": 0.82315612, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84531218, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0859375, + "step": 2475, + "time_per_iteration": 2.4591174125671387 + }, + { + "auxiliary_loss_clip": 0.01157844, + "auxiliary_loss_mlp": 0.01058234, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.05399168, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9240192853863896, + "language_loss": 0.80811602, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0390625, + "step": 2476, + "time_per_iteration": 3.810553789138794 + }, + { + "auxiliary_loss_clip": 0.01148934, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.03467607, + "balance_loss_mlp": 1.05016851, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.8396010916090604, + "language_loss": 0.77889222, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80091178, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98828125, + "step": 2477, + "time_per_iteration": 4.031312942504883 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.01018076, + "balance_loss_clip": 1.01581085, + "balance_loss_mlp": 1.01302671, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8050876444063699, + "language_loss": 0.60130364, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197196, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.35742188, + "step": 2478, + "time_per_iteration": 3.1073787212371826 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.05078554, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.232556799389181, + "language_loss": 0.70951897, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7315169, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2479, + "time_per_iteration": 2.475215435028076 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.0313679, + "balance_loss_mlp": 1.04886127, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5896653051626852, + "language_loss": 0.80748487, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2480, + "time_per_iteration": 2.4618492126464844 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.02838397, + "balance_loss_mlp": 1.05017209, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.4101793906634894, + "language_loss": 0.84132183, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2481, + "time_per_iteration": 2.437391519546509 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03046227, + "balance_loss_mlp": 1.04808569, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 3.194199563979109, + "language_loss": 0.77347398, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79551256, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.046875, + "step": 2482, + "time_per_iteration": 2.4710068702697754 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01939583, + "balance_loss_mlp": 1.05186439, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.968394626295353, + "language_loss": 0.78719991, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80922014, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2483, + "time_per_iteration": 2.5075182914733887 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.04774714, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.642113570978582, + "language_loss": 0.70521605, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72709513, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 1.0, + "step": 2484, + "time_per_iteration": 2.4810657501220703 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02049971, + "balance_loss_mlp": 1.04769683, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.5518326423103654, + "language_loss": 0.84396368, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86592442, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0703125, + "step": 2485, + "time_per_iteration": 2.47004771232605 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.04906201, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1854599778658663, + "language_loss": 0.84902173, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87102306, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2486, + "time_per_iteration": 2.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.04672825, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.4579579723442855, + "language_loss": 0.74329305, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76516318, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 1.015625, + "step": 2487, + "time_per_iteration": 2.436316967010498 + }, + { + "auxiliary_loss_clip": 0.01148703, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.02934861, + "balance_loss_mlp": 1.04707325, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.1423480103066375, + "language_loss": 0.71837348, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74034101, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2488, + "time_per_iteration": 2.649794816970825 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02780962, + "balance_loss_mlp": 1.04946375, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.5167610907777513, + "language_loss": 0.70519507, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72722483, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0703125, + "step": 2489, + "time_per_iteration": 2.416708469390869 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.02637911, + "balance_loss_mlp": 1.04785299, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 6.063777716142612, + "language_loss": 0.81789696, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83988589, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2490, + "time_per_iteration": 2.433284282684326 + }, + { + "auxiliary_loss_clip": 0.0115747, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.02357852, + "balance_loss_mlp": 1.05097246, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.781748843431282, + "language_loss": 0.79878485, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82078111, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2491, + "time_per_iteration": 2.616642475128174 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.02491403, + "balance_loss_mlp": 1.04683256, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.263792295832721, + "language_loss": 0.90779251, + "learning_rate": 3.851260581551727e-06, + "loss": 0.9297986, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.078125, + "step": 2492, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.04883122, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.7210225604175116, + "language_loss": 0.79162109, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8137427, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2493, + "time_per_iteration": 2.4228014945983887 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02299631, + "balance_loss_mlp": 1.04643607, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.8095511996528297, + "language_loss": 0.80186284, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82380015, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2494, + "time_per_iteration": 2.4774162769317627 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.04731214, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9697458415941205, + "language_loss": 0.65825832, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68021536, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.015625, + "step": 2495, + "time_per_iteration": 2.87758207321167 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 0.99992257, + "balance_loss_mlp": 1.01668406, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 1.1924806916138095, + "language_loss": 0.59488082, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61543506, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2496, + "time_per_iteration": 3.0807061195373535 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0468092, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.296903755979897, + "language_loss": 0.65457296, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0546875, + "step": 2497, + "time_per_iteration": 2.4403655529022217 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.03021121, + "balance_loss_mlp": 1.05125117, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4500790349521295, + "language_loss": 0.75247943, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77452457, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2498, + "time_per_iteration": 2.5286927223205566 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04910398, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.1627878003877257, + "language_loss": 0.72073609, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74272656, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2499, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.03001857, + "balance_loss_mlp": 1.04765654, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.7935878764928508, + "language_loss": 0.7195605, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74158442, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2500, + "time_per_iteration": 2.5504300594329834 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.04960001, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.491284008551419, + "language_loss": 0.64973354, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67184103, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.03125, + "step": 2501, + "time_per_iteration": 2.587292432785034 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03940582, + "balance_loss_mlp": 1.04861319, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0240839018319, + "language_loss": 0.83043593, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85256565, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2502, + "time_per_iteration": 2.470350980758667 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01050766, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.04702473, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.3174234065433597, + "language_loss": 0.77197748, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2503, + "time_per_iteration": 2.6598432064056396 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.04901898, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1270494317377007, + "language_loss": 0.85432625, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87628305, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2504, + "time_per_iteration": 2.7323355674743652 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04855871, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6383963769174188, + "language_loss": 0.83226919, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85418344, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.015625, + "step": 2505, + "time_per_iteration": 2.4866323471069336 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.04672468, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.268670074130615, + "language_loss": 0.7639147, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78588635, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0390625, + "step": 2506, + "time_per_iteration": 2.4266390800476074 + }, + { + "auxiliary_loss_clip": 0.01156061, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.04987144, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 4.189374997051622, + "language_loss": 0.76202261, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78401417, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2507, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.02519584, + "balance_loss_mlp": 1.04538798, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.4120052182021503, + "language_loss": 0.69041586, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71230054, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2508, + "time_per_iteration": 2.4462738037109375 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.02870142, + "balance_loss_mlp": 1.05190873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8904486830015208, + "language_loss": 0.77516425, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79719174, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2509, + "time_per_iteration": 2.47723126411438 + }, + { + "auxiliary_loss_clip": 0.01160822, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.0307281, + "balance_loss_mlp": 1.05027628, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.607083522867767, + "language_loss": 0.80497003, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82710105, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1015625, + "step": 2510, + "time_per_iteration": 2.4445176124572754 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.0336144, + "balance_loss_mlp": 1.05078745, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.033214689307001, + "language_loss": 0.73913604, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76124156, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2511, + "time_per_iteration": 2.4372222423553467 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.04880548, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.077792778828972, + "language_loss": 0.6935091, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71543926, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.03125, + "step": 2512, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01154623, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.05130434, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 3.0703205269170364, + "language_loss": 0.73833334, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76034975, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.03125, + "step": 2513, + "time_per_iteration": 2.5560262203216553 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 0.99995023, + "balance_loss_mlp": 1.01588845, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8742342414591, + "language_loss": 0.64759278, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6681329, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.36328125, + "step": 2514, + "time_per_iteration": 3.0147135257720947 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.02588964, + "balance_loss_mlp": 1.04910421, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.6951033245551597, + "language_loss": 0.73257691, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75452447, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2515, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04967082, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8637331039353218, + "language_loss": 0.76990104, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79184443, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2516, + "time_per_iteration": 2.4672725200653076 + }, + { + "auxiliary_loss_clip": 0.01049641, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.01351547, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.745436195681612, + "language_loss": 0.54673135, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56726485, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36132812, + "step": 2517, + "time_per_iteration": 3.0677855014801025 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02126849, + "balance_loss_mlp": 1.04780149, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.2326216563166983, + "language_loss": 0.78515786, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8070842, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2518, + "time_per_iteration": 3.8305110931396484 + }, + { + "auxiliary_loss_clip": 0.01159011, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.05163026, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1364726943924772, + "language_loss": 0.70153689, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72361219, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2519, + "time_per_iteration": 3.9920616149902344 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.04812384, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9802508383478334, + "language_loss": 0.79219216, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81415105, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2520, + "time_per_iteration": 2.4853925704956055 + }, + { + "auxiliary_loss_clip": 0.01155647, + "auxiliary_loss_mlp": 0.01050752, + "balance_loss_clip": 1.03090394, + "balance_loss_mlp": 1.05067897, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.075013959426641, + "language_loss": 0.74324691, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76531088, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2521, + "time_per_iteration": 2.6154706478118896 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.05273759, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.7623729867934737, + "language_loss": 0.81996739, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84203184, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.078125, + "step": 2522, + "time_per_iteration": 2.4873530864715576 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01001752, + "balance_loss_clip": 0.99982071, + "balance_loss_mlp": 1.01252866, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.824359498034346, + "language_loss": 0.57915509, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59966022, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36328125, + "step": 2523, + "time_per_iteration": 2.998990774154663 + }, + { + "auxiliary_loss_clip": 0.01153336, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.03147376, + "balance_loss_mlp": 1.04972816, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.970015434384356, + "language_loss": 0.7485956, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77063495, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2524, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.0115237, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.0488894, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8388163356316347, + "language_loss": 0.74780655, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76977956, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2525, + "time_per_iteration": 2.431143283843994 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.02432156, + "balance_loss_mlp": 1.05145812, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8962457769996104, + "language_loss": 0.79644465, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81845224, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2526, + "time_per_iteration": 2.5167391300201416 + }, + { + "auxiliary_loss_clip": 0.01151222, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.0271492, + "balance_loss_mlp": 1.05228162, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8025865198757494, + "language_loss": 0.84928662, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87124068, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9921875, + "step": 2527, + "time_per_iteration": 2.4550719261169434 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02321947, + "balance_loss_mlp": 1.04876995, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2810224367730156, + "language_loss": 0.69326001, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71518755, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.03125, + "step": 2528, + "time_per_iteration": 2.610042095184326 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0288415, + "balance_loss_mlp": 1.05137038, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.490892546855648, + "language_loss": 0.86502308, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88703495, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2529, + "time_per_iteration": 2.4695634841918945 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.04683101, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.8772276619965056, + "language_loss": 0.83002013, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85188091, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2530, + "time_per_iteration": 2.476238489151001 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.02550209, + "balance_loss_mlp": 1.04987955, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.344030506991615, + "language_loss": 0.80540878, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82738853, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2531, + "time_per_iteration": 2.443617105484009 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.03138137, + "balance_loss_mlp": 1.04895151, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0816362099746017, + "language_loss": 0.79241651, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81440473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.0078125, + "step": 2532, + "time_per_iteration": 2.5071239471435547 + }, + { + "auxiliary_loss_clip": 0.0115001, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.02694106, + "balance_loss_mlp": 1.04952455, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.8298502444413876, + "language_loss": 0.87712961, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89909488, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2533, + "time_per_iteration": 2.5262463092803955 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02932572, + "balance_loss_mlp": 1.04766071, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.2606742211331556, + "language_loss": 0.79057097, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81255192, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.03125, + "step": 2534, + "time_per_iteration": 2.4421815872192383 + }, + { + "auxiliary_loss_clip": 0.01156406, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.02177238, + "balance_loss_mlp": 1.04847312, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.8989864742133933, + "language_loss": 0.76862979, + "learning_rate": 3.844858260274702e-06, + "loss": 0.7906096, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2535, + "time_per_iteration": 2.4193530082702637 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02153718, + "balance_loss_mlp": 1.04885459, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.234687708038525, + "language_loss": 0.78185135, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80381751, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0703125, + "step": 2536, + "time_per_iteration": 2.478066921234131 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.03305459, + "balance_loss_mlp": 1.05067229, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.124557148089124, + "language_loss": 0.74979979, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77181387, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2537, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01152934, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02545929, + "balance_loss_mlp": 1.04965043, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.005826380833244, + "language_loss": 0.77631724, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79828459, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2538, + "time_per_iteration": 2.527730941772461 + }, + { + "auxiliary_loss_clip": 0.01147714, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.02308786, + "balance_loss_mlp": 1.04806781, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6961003069906246, + "language_loss": 0.89707708, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9189558, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.99609375, + "step": 2539, + "time_per_iteration": 2.485410451889038 + }, + { + "auxiliary_loss_clip": 0.01151642, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.05028892, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1834515010765627, + "language_loss": 0.93514961, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95709753, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.015625, + "step": 2540, + "time_per_iteration": 2.5399627685546875 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0266571, + "balance_loss_mlp": 1.04625463, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.9271166035098393, + "language_loss": 0.75039941, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77228808, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2541, + "time_per_iteration": 2.516559362411499 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.03025603, + "balance_loss_mlp": 1.04787207, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7480154890803248, + "language_loss": 0.81308234, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83504558, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.99609375, + "step": 2542, + "time_per_iteration": 2.4681694507598877 + }, + { + "auxiliary_loss_clip": 0.01150381, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.03213799, + "balance_loss_mlp": 1.04772067, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.009812895323552, + "language_loss": 0.77568293, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79769456, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2543, + "time_per_iteration": 2.4899120330810547 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.04692626, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.3128696364379935, + "language_loss": 0.86483204, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675725, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2544, + "time_per_iteration": 2.4774844646453857 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.03287029, + "balance_loss_mlp": 1.04675508, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0966566192890106, + "language_loss": 0.8228749, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84493077, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0546875, + "step": 2545, + "time_per_iteration": 2.4526925086975098 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.04802954, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.540509049886226, + "language_loss": 0.70711339, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72905338, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2546, + "time_per_iteration": 2.5009732246398926 + }, + { + "auxiliary_loss_clip": 0.01152354, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.02423596, + "balance_loss_mlp": 1.04967904, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5770850469719229, + "language_loss": 0.77521312, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2547, + "time_per_iteration": 2.6822421550750732 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.01047861, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 1.04904902, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0900989153424976, + "language_loss": 0.73985445, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76185566, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2548, + "time_per_iteration": 2.59080171585083 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03158915, + "balance_loss_mlp": 1.04806828, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.499185349529517, + "language_loss": 0.80589813, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82791066, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2549, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01048572, + "balance_loss_clip": 1.02951026, + "balance_loss_mlp": 1.04750037, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.687491024735964, + "language_loss": 0.74760693, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76959932, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2550, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01153822, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.02035773, + "balance_loss_mlp": 1.04903841, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.205632522725416, + "language_loss": 0.76839805, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79033309, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2551, + "time_per_iteration": 2.468886375427246 + }, + { + "auxiliary_loss_clip": 0.01045401, + "auxiliary_loss_mlp": 0.01020401, + "balance_loss_clip": 1.01873255, + "balance_loss_mlp": 1.0102303, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9464853846906186, + "language_loss": 0.56666422, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58732224, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.3515625, + "step": 2552, + "time_per_iteration": 3.0059380531311035 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.04793155, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.2490122092843947, + "language_loss": 0.88505352, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703511, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2553, + "time_per_iteration": 2.4523322582244873 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.02858269, + "balance_loss_mlp": 1.04771137, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8003580088176259, + "language_loss": 0.78462374, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80663538, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2554, + "time_per_iteration": 2.48526668548584 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03212881, + "balance_loss_mlp": 1.04941773, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.4926146542113763, + "language_loss": 0.78344929, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80551672, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2555, + "time_per_iteration": 2.4687228202819824 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053536, + "balance_loss_clip": 1.03543973, + "balance_loss_mlp": 1.04890609, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6634961059278193, + "language_loss": 0.76901627, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.7910428, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2556, + "time_per_iteration": 2.5006635189056396 + }, + { + "auxiliary_loss_clip": 0.01145988, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.02362633, + "balance_loss_mlp": 1.04657805, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8623555031997667, + "language_loss": 0.89489496, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9167788, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2557, + "time_per_iteration": 2.4434657096862793 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.0263536, + "balance_loss_mlp": 1.04834831, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.0348980361104587, + "language_loss": 0.7119934, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73397368, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2558, + "time_per_iteration": 2.490226984024048 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.04888546, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2935483083292705, + "language_loss": 0.92370701, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94570613, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2559, + "time_per_iteration": 3.885131597518921 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_clip": 1.03331971, + "balance_loss_mlp": 1.05068171, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 5.140445938018919, + "language_loss": 0.63637704, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65846419, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2560, + "time_per_iteration": 5.343889236450195 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02725124, + "balance_loss_mlp": 1.04950392, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8613162525264346, + "language_loss": 0.88230681, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90431374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2561, + "time_per_iteration": 2.4648611545562744 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.02433765, + "balance_loss_mlp": 1.0477581, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8458305826175445, + "language_loss": 0.82909077, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85096323, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 2562, + "time_per_iteration": 2.4327874183654785 + }, + { + "auxiliary_loss_clip": 0.01160792, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.02614117, + "balance_loss_mlp": 1.05274105, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8513620412223286, + "language_loss": 0.74713194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7692166, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.078125, + "step": 2563, + "time_per_iteration": 2.4246435165405273 + }, + { + "auxiliary_loss_clip": 0.01152598, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.04708791, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 4.308351588789828, + "language_loss": 0.75896233, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2564, + "time_per_iteration": 2.5528018474578857 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.03233564, + "balance_loss_mlp": 1.04782677, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.9915177170702032, + "language_loss": 0.70825899, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73026133, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2565, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.04728019, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.308308002927142, + "language_loss": 0.71535969, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73736489, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0234375, + "step": 2566, + "time_per_iteration": 2.498033285140991 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.02379811, + "balance_loss_mlp": 1.04381752, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7584763964610812, + "language_loss": 0.85129261, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87315124, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0, + "step": 2567, + "time_per_iteration": 2.46708083152771 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.0491097, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.4904852760766127, + "language_loss": 0.78025472, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80226958, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2568, + "time_per_iteration": 2.476029634475708 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.01974905, + "balance_loss_mlp": 1.04835856, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.967048361077992, + "language_loss": 0.70183134, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72373807, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2569, + "time_per_iteration": 2.4566383361816406 + }, + { + "auxiliary_loss_clip": 0.011445, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.04563344, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7954711420319855, + "language_loss": 0.76502788, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78690279, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2570, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.04811645, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 7.2402617485583525, + "language_loss": 0.77214551, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7940833, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2571, + "time_per_iteration": 2.4532222747802734 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.04835165, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.949392721600437, + "language_loss": 0.82254899, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84454399, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2572, + "time_per_iteration": 2.4919703006744385 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.02838445, + "balance_loss_mlp": 1.04827368, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.621727953381826, + "language_loss": 0.90506172, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92705798, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2573, + "time_per_iteration": 2.4679911136627197 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.03724563, + "balance_loss_mlp": 1.04919529, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7899098306423509, + "language_loss": 0.70378339, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72587025, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2574, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01150284, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 1.04641008, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.761755301023602, + "language_loss": 0.82718939, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84917951, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 2575, + "time_per_iteration": 2.4515788555145264 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.0456214, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.21774000772259, + "language_loss": 0.84661531, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86859256, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2576, + "time_per_iteration": 2.5052003860473633 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02414751, + "balance_loss_mlp": 1.04679108, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.816317520286285, + "language_loss": 0.81942815, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84135079, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2577, + "time_per_iteration": 2.5133895874023438 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.04980743, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.384736720709717, + "language_loss": 0.76260924, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78462768, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2578, + "time_per_iteration": 2.5140793323516846 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.02630556, + "balance_loss_mlp": 1.04832911, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.651100693067537, + "language_loss": 0.82420707, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84617954, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2579, + "time_per_iteration": 2.4410548210144043 + }, + { + "auxiliary_loss_clip": 0.01152359, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.03056741, + "balance_loss_mlp": 1.05137682, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6356270056083286, + "language_loss": 0.80460835, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2580, + "time_per_iteration": 2.457929849624634 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01014696, + "balance_loss_clip": 1.0128479, + "balance_loss_mlp": 1.01473403, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.84873853717235, + "language_loss": 0.58840239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60905427, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.35742188, + "step": 2581, + "time_per_iteration": 3.1725480556488037 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02615237, + "balance_loss_mlp": 1.04869819, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8637973548327127, + "language_loss": 0.85214508, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87412429, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2582, + "time_per_iteration": 2.486454963684082 + }, + { + "auxiliary_loss_clip": 0.01150766, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.04837251, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.457099081417407, + "language_loss": 0.78432047, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80638009, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0234375, + "step": 2583, + "time_per_iteration": 2.468686580657959 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.04853427, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6727812592242826, + "language_loss": 0.76121294, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78327382, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2584, + "time_per_iteration": 2.5471444129943848 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02746594, + "balance_loss_mlp": 1.04740906, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0194610159936324, + "language_loss": 0.75623107, + "learning_rate": 3.837251082205368e-06, + "loss": 0.7781868, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2585, + "time_per_iteration": 2.4448020458221436 + }, + { + "auxiliary_loss_clip": 0.01146182, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03101528, + "balance_loss_mlp": 1.04662418, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.233481730992117, + "language_loss": 0.611651, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63361114, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2586, + "time_per_iteration": 2.4375994205474854 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02814651, + "balance_loss_mlp": 1.04623449, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8194244944539537, + "language_loss": 0.8108865, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83286583, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.046875, + "step": 2587, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.04851258, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.8978014455674168, + "language_loss": 0.88844347, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91058075, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.0625, + "step": 2588, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01150101, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.03351235, + "balance_loss_mlp": 1.04859662, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.6765596364055266, + "language_loss": 0.64950025, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6715408, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.015625, + "step": 2589, + "time_per_iteration": 2.5106732845306396 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.0483036, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7614316666112095, + "language_loss": 0.82610166, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84805739, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2590, + "time_per_iteration": 2.519573211669922 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.02761662, + "balance_loss_mlp": 1.04740536, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1478399705358195, + "language_loss": 0.78919029, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81117558, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2591, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01051358, + "balance_loss_clip": 1.03271413, + "balance_loss_mlp": 1.04902434, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 1.9877262596002243, + "language_loss": 0.64780253, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66981632, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2592, + "time_per_iteration": 2.5992095470428467 + }, + { + "auxiliary_loss_clip": 0.01156577, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.03195322, + "balance_loss_mlp": 1.0518856, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.6077304694487062, + "language_loss": 0.81806099, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84015012, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2593, + "time_per_iteration": 2.4317471981048584 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02876306, + "balance_loss_mlp": 1.04862404, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.3131099691306445, + "language_loss": 0.72585857, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7478416, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0, + "step": 2594, + "time_per_iteration": 2.454946994781494 + }, + { + "auxiliary_loss_clip": 0.01145676, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.02514088, + "balance_loss_mlp": 1.0476191, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 1.980280068020953, + "language_loss": 0.8170377, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83893895, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 2595, + "time_per_iteration": 2.4859232902526855 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.04722846, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.3729637830877177, + "language_loss": 0.86587811, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88784146, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2596, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.02558839, + "balance_loss_mlp": 1.04831815, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6624104890405602, + "language_loss": 0.68610018, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70800316, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2597, + "time_per_iteration": 2.447265625 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.02210891, + "balance_loss_mlp": 1.04714298, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.638980754682227, + "language_loss": 0.79885375, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82070029, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2598, + "time_per_iteration": 2.4641571044921875 + }, + { + "auxiliary_loss_clip": 0.01141262, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02003777, + "balance_loss_mlp": 1.04484367, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.19687533686526, + "language_loss": 0.82877028, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85057342, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96484375, + "step": 2599, + "time_per_iteration": 2.419464111328125 + }, + { + "auxiliary_loss_clip": 0.01155461, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.04991198, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.412785735027946, + "language_loss": 0.81813747, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84021574, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2600, + "time_per_iteration": 2.408848524093628 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.02813435, + "balance_loss_mlp": 1.05145574, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8570517134994367, + "language_loss": 0.8869983, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90900552, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2601, + "time_per_iteration": 3.8960022926330566 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.04250216, + "balance_loss_mlp": 1.05294669, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6572791804428935, + "language_loss": 0.78657669, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80877781, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0234375, + "step": 2602, + "time_per_iteration": 5.330498456954956 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.02178836, + "balance_loss_mlp": 1.04872918, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.9481072701353659, + "language_loss": 0.73668396, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75858229, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.99609375, + "step": 2603, + "time_per_iteration": 2.4632985591888428 + }, + { + "auxiliary_loss_clip": 0.01152236, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.03205693, + "balance_loss_mlp": 1.05066442, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.4624008692922583, + "language_loss": 0.87223339, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89427507, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2604, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.02404523, + "balance_loss_mlp": 1.04892218, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.883819023069068, + "language_loss": 0.85465723, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87660539, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2605, + "time_per_iteration": 2.4958839416503906 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0274334, + "balance_loss_mlp": 1.04840827, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.4518366617864897, + "language_loss": 0.72954321, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75154853, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2606, + "time_per_iteration": 2.5142898559570312 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.05257165, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.9820673877795116, + "language_loss": 0.7643044, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78635812, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2607, + "time_per_iteration": 2.433779239654541 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0282656, + "balance_loss_mlp": 1.05097091, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7850270515341367, + "language_loss": 0.8191157, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8410849, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2608, + "time_per_iteration": 2.4599456787109375 + }, + { + "auxiliary_loss_clip": 0.0115477, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.05087662, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.762197880640894, + "language_loss": 0.72479111, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74684954, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0390625, + "step": 2609, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.02415729, + "balance_loss_mlp": 1.04881263, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8833233307981396, + "language_loss": 0.71974212, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74171209, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.046875, + "step": 2610, + "time_per_iteration": 2.468616247177124 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03119481, + "balance_loss_mlp": 1.04865789, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0486839750324117, + "language_loss": 0.72148776, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2611, + "time_per_iteration": 2.4812967777252197 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.05081797, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1526303920645153, + "language_loss": 0.70732605, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72930443, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2612, + "time_per_iteration": 2.4659905433654785 + }, + { + "auxiliary_loss_clip": 0.0115345, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.05112672, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.98698506128839, + "language_loss": 0.75649011, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77856034, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2613, + "time_per_iteration": 2.5053935050964355 + }, + { + "auxiliary_loss_clip": 0.01150247, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.03454411, + "balance_loss_mlp": 1.04870725, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7256548803860323, + "language_loss": 0.6593504, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68139917, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2614, + "time_per_iteration": 2.49568772315979 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01050381, + "balance_loss_clip": 1.02972233, + "balance_loss_mlp": 1.04979289, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1509467282749055, + "language_loss": 0.7554003, + "learning_rate": 3.832603126688072e-06, + "loss": 0.7774539, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0546875, + "step": 2615, + "time_per_iteration": 2.529383420944214 + }, + { + "auxiliary_loss_clip": 0.0115204, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.03374028, + "balance_loss_mlp": 1.05295634, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.616950748432624, + "language_loss": 0.72989607, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75194162, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9921875, + "step": 2616, + "time_per_iteration": 2.5096960067749023 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.03453839, + "balance_loss_mlp": 1.04991412, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.5663633553154774, + "language_loss": 0.72316766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74524403, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2617, + "time_per_iteration": 2.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01151577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.02453637, + "balance_loss_mlp": 1.05169988, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.0296559288157563, + "language_loss": 0.74336463, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76531827, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2618, + "time_per_iteration": 2.4584109783172607 + }, + { + "auxiliary_loss_clip": 0.01156356, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02976644, + "balance_loss_mlp": 1.05079079, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.116136233608656, + "language_loss": 0.78624105, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80832201, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0546875, + "step": 2619, + "time_per_iteration": 2.481902837753296 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.05213726, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.705564128099723, + "language_loss": 0.76632881, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78837597, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2620, + "time_per_iteration": 2.432645082473755 + }, + { + "auxiliary_loss_clip": 0.01153614, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.02901375, + "balance_loss_mlp": 1.05096626, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7942321132139696, + "language_loss": 0.70836174, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73039794, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2621, + "time_per_iteration": 2.5259244441986084 + }, + { + "auxiliary_loss_clip": 0.01156472, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.0277524, + "balance_loss_mlp": 1.05222857, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.5825564073202467, + "language_loss": 0.71880406, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74086076, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2622, + "time_per_iteration": 2.738351583480835 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02826762, + "balance_loss_mlp": 1.05162704, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7275011876813262, + "language_loss": 0.87603116, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89804244, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2623, + "time_per_iteration": 2.439276695251465 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.05301619, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7488793041913886, + "language_loss": 0.82132548, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84332693, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0078125, + "step": 2624, + "time_per_iteration": 2.5011823177337646 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.02720022, + "balance_loss_mlp": 1.0518285, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.213311097116894, + "language_loss": 0.79965818, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82170242, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2625, + "time_per_iteration": 2.469705581665039 + }, + { + "auxiliary_loss_clip": 0.01152837, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.03215635, + "balance_loss_mlp": 1.05189955, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0497226184185044, + "language_loss": 0.80393386, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82597172, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2626, + "time_per_iteration": 2.4822630882263184 + }, + { + "auxiliary_loss_clip": 0.01157567, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.02703679, + "balance_loss_mlp": 1.05660009, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8439314798963051, + "language_loss": 0.73819017, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76023501, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0078125, + "step": 2627, + "time_per_iteration": 2.5146384239196777 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01055765, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.05136025, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.581375347872909, + "language_loss": 0.84926289, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87135696, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0234375, + "step": 2628, + "time_per_iteration": 2.476461172103882 + }, + { + "auxiliary_loss_clip": 0.01152526, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02045107, + "balance_loss_mlp": 1.05181646, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9330212081502065, + "language_loss": 0.76414472, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2629, + "time_per_iteration": 2.4604575634002686 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03277516, + "balance_loss_mlp": 1.05376625, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.3335878107949624, + "language_loss": 0.73786485, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7599746, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0390625, + "step": 2630, + "time_per_iteration": 2.4556961059570312 + }, + { + "auxiliary_loss_clip": 0.01159154, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02934527, + "balance_loss_mlp": 1.05278432, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.0799062126580385, + "language_loss": 0.83732498, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85941184, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2631, + "time_per_iteration": 2.46466326713562 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 1.05072045, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8231521117013414, + "language_loss": 0.78509778, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80711424, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2632, + "time_per_iteration": 2.4678170680999756 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01058182, + "balance_loss_clip": 1.03766572, + "balance_loss_mlp": 1.05516291, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.1429957658458374, + "language_loss": 0.83250827, + "learning_rate": 3.829784322464594e-06, + "loss": 0.8546921, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2633, + "time_per_iteration": 2.4329495429992676 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.02641928, + "balance_loss_mlp": 1.05591452, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.9651575849984717, + "language_loss": 0.77401066, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79609084, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2634, + "time_per_iteration": 2.4989452362060547 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.05281138, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.178604932363088, + "language_loss": 0.89144027, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91352272, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0546875, + "step": 2635, + "time_per_iteration": 2.45926570892334 + }, + { + "auxiliary_loss_clip": 0.0115666, + "auxiliary_loss_mlp": 0.0105615, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.05145168, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.07071202721755, + "language_loss": 0.75814605, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78027415, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2636, + "time_per_iteration": 2.4601919651031494 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.05383635, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.192817266182781, + "language_loss": 0.72065628, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74272561, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.0625, + "step": 2637, + "time_per_iteration": 2.6509416103363037 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.05307317, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.9644709833035638, + "language_loss": 0.77938193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80135739, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2638, + "time_per_iteration": 2.516597032546997 + }, + { + "auxiliary_loss_clip": 0.01160159, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.05348861, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.8473853011869859, + "language_loss": 0.75521988, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77744359, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0625, + "step": 2639, + "time_per_iteration": 2.5517024993896484 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04115009, + "balance_loss_mlp": 1.0541048, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.7935559917311212, + "language_loss": 0.81487972, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83708692, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0546875, + "step": 2640, + "time_per_iteration": 2.5613112449645996 + }, + { + "auxiliary_loss_clip": 0.01152653, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.05107331, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4887809421561018, + "language_loss": 0.67051661, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69255233, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2641, + "time_per_iteration": 2.5603220462799072 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01057677, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.05338526, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.894977763056953, + "language_loss": 0.7508198, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77302957, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2642, + "time_per_iteration": 2.4783003330230713 + }, + { + "auxiliary_loss_clip": 0.01154514, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.0343703, + "balance_loss_mlp": 1.05342579, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1233146618452046, + "language_loss": 0.70096999, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72305882, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2643, + "time_per_iteration": 3.8417530059814453 + }, + { + "auxiliary_loss_clip": 0.0115474, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.05399418, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.266510625665779, + "language_loss": 0.78172421, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80374151, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2644, + "time_per_iteration": 3.918332099914551 + }, + { + "auxiliary_loss_clip": 0.01155626, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.03432608, + "balance_loss_mlp": 1.05189228, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8745538844001242, + "language_loss": 0.82203078, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84413457, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2645, + "time_per_iteration": 2.484264373779297 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01055562, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.05192447, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.974907168100252, + "language_loss": 0.69778836, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71991032, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2646, + "time_per_iteration": 2.5406665802001953 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.05206954, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5066454352116914, + "language_loss": 0.62659109, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64856541, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 2647, + "time_per_iteration": 2.442711353302002 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03226066, + "balance_loss_mlp": 1.05410099, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.1253745247586204, + "language_loss": 0.8942067, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91628385, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2648, + "time_per_iteration": 2.4649319648742676 + }, + { + "auxiliary_loss_clip": 0.01152722, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.05391204, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8651001097947648, + "language_loss": 0.91716385, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93918669, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 2649, + "time_per_iteration": 2.451261520385742 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.02802217, + "balance_loss_mlp": 1.05272281, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3226984417644028, + "language_loss": 0.71273595, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73485881, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1015625, + "step": 2650, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01153823, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.02695203, + "balance_loss_mlp": 1.05372715, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.4247432930640898, + "language_loss": 0.71116996, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73315561, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0, + "step": 2651, + "time_per_iteration": 2.467451572418213 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02912855, + "balance_loss_mlp": 1.0513978, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.909821572556346, + "language_loss": 0.7997523, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82179999, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2652, + "time_per_iteration": 2.519624948501587 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02665794, + "balance_loss_mlp": 1.05385149, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.695147262103697, + "language_loss": 0.70050812, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72250587, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2653, + "time_per_iteration": 2.439445972442627 + }, + { + "auxiliary_loss_clip": 0.01154814, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02309155, + "balance_loss_mlp": 1.05308652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.046273350718209, + "language_loss": 0.76509416, + "learning_rate": 3.826467306608095e-06, + "loss": 0.7870729, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2654, + "time_per_iteration": 2.529644012451172 + }, + { + "auxiliary_loss_clip": 0.01154147, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02750051, + "balance_loss_mlp": 1.0526185, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.961582700797155, + "language_loss": 0.8208828, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84289569, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2655, + "time_per_iteration": 2.4841158390045166 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.05125904, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.077546195878165, + "language_loss": 0.73565602, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75770259, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2656, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02766216, + "balance_loss_mlp": 1.05170095, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.884771930829773, + "language_loss": 0.77508467, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79704326, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2657, + "time_per_iteration": 2.801560401916504 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01048143, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.05459499, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6493844029380673, + "language_loss": 0.74807733, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77010089, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.99609375, + "step": 2658, + "time_per_iteration": 2.4434328079223633 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02912521, + "balance_loss_mlp": 1.05291355, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.8153435843839463, + "language_loss": 0.75194407, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77400887, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2659, + "time_per_iteration": 2.587700366973877 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.03295422, + "balance_loss_mlp": 1.05531979, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.4521775760186526, + "language_loss": 0.90417045, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92629218, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2660, + "time_per_iteration": 2.45237398147583 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.0300889, + "balance_loss_mlp": 1.05822825, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.0123178843036373, + "language_loss": 0.77552611, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79764044, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2661, + "time_per_iteration": 2.574652910232544 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_clip": 1.02913153, + "balance_loss_mlp": 1.05460262, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7348749157972516, + "language_loss": 0.74735796, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76943737, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2662, + "time_per_iteration": 2.506974935531616 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_clip": 1.03233898, + "balance_loss_mlp": 1.05416894, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.0770925688556074, + "language_loss": 0.82047677, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84257245, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2663, + "time_per_iteration": 2.459630012512207 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.02978826, + "balance_loss_mlp": 1.05576038, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5815812177362454, + "language_loss": 0.7910682, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81316602, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2664, + "time_per_iteration": 2.4978790283203125 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.8148985254226184, + "language_loss": 0.93767202, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95974499, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2665, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.02750635, + "balance_loss_mlp": 1.05352151, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.9534389472193405, + "language_loss": 0.85255575, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87460762, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.046875, + "step": 2666, + "time_per_iteration": 2.4229867458343506 + }, + { + "auxiliary_loss_clip": 0.01155877, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.02899504, + "balance_loss_mlp": 1.05404496, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.873987360542769, + "language_loss": 0.81461811, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83665401, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2667, + "time_per_iteration": 2.4989583492279053 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.03104627, + "balance_loss_mlp": 1.05707479, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.676276626789842, + "language_loss": 0.74079859, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76287973, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0078125, + "step": 2668, + "time_per_iteration": 2.463395357131958 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.05527806, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6382268793433732, + "language_loss": 0.77214229, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79424524, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2669, + "time_per_iteration": 2.5107781887054443 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01008943, + "balance_loss_clip": 1.0062964, + "balance_loss_mlp": 1.0249362, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072457077707946, + "language_loss": 0.55571371, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57640231, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.34960938, + "step": 2670, + "time_per_iteration": 2.964386463165283 + }, + { + "auxiliary_loss_clip": 0.01157188, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.05379438, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 8.31640977393562, + "language_loss": 0.77088535, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79289663, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2671, + "time_per_iteration": 2.4722845554351807 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.02684164, + "balance_loss_mlp": 1.05666459, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9636142117953166, + "language_loss": 0.64497644, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66702545, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2672, + "time_per_iteration": 2.5702145099639893 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.05270457, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.885579538712505, + "language_loss": 0.8533771, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87537158, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2673, + "time_per_iteration": 2.4754209518432617 + }, + { + "auxiliary_loss_clip": 0.01156938, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.03718424, + "balance_loss_mlp": 1.05537605, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.484212796080384, + "language_loss": 0.72797197, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75009739, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2674, + "time_per_iteration": 2.4771230220794678 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02745771, + "balance_loss_mlp": 1.05242229, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.0917218572710143, + "language_loss": 0.84550452, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86751789, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2675, + "time_per_iteration": 2.4583237171173096 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02890563, + "balance_loss_mlp": 1.0566349, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.979365293626276, + "language_loss": 0.82605797, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84813964, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0234375, + "step": 2676, + "time_per_iteration": 2.5966403484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.05701363, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.9372140801278581, + "language_loss": 0.73252106, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75459909, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2677, + "time_per_iteration": 2.459545135498047 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.05381799, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.4714871699848, + "language_loss": 0.76175338, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78375852, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2678, + "time_per_iteration": 2.6220550537109375 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.05157948, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.6939354956764687, + "language_loss": 0.70202518, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72405231, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2679, + "time_per_iteration": 2.580995559692383 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.02026391, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8161414687228778, + "language_loss": 0.51844025, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5392195, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.34375, + "step": 2680, + "time_per_iteration": 3.105682849884033 + }, + { + "auxiliary_loss_clip": 0.01155604, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02750874, + "balance_loss_mlp": 1.05157876, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.8335073832427007, + "language_loss": 0.80319828, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82523119, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2681, + "time_per_iteration": 2.4695565700531006 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01058971, + "balance_loss_clip": 1.04031444, + "balance_loss_mlp": 1.05258918, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8021457293712753, + "language_loss": 0.69142133, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71352148, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.984375, + "step": 2682, + "time_per_iteration": 2.5027854442596436 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.0559957, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8107912193408944, + "language_loss": 0.87568235, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89774084, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2683, + "time_per_iteration": 2.461944341659546 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.05452991, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.5824209574719035, + "language_loss": 0.74160969, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76372838, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2684, + "time_per_iteration": 4.005981206893921 + }, + { + "auxiliary_loss_clip": 0.01159701, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 1.05543995, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.919238603617177, + "language_loss": 0.70244128, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72452366, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2685, + "time_per_iteration": 5.387023448944092 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.0282284, + "balance_loss_mlp": 1.0518229, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8016019482814314, + "language_loss": 0.71518582, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73716336, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 2686, + "time_per_iteration": 2.5451064109802246 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.03191292, + "balance_loss_mlp": 1.05551481, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8631629169214377, + "language_loss": 0.81521869, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83730221, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2687, + "time_per_iteration": 2.4542620182037354 + }, + { + "auxiliary_loss_clip": 0.01155843, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.02327275, + "balance_loss_mlp": 1.04894984, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.8081463969498348, + "language_loss": 0.71823454, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74023592, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.0625, + "step": 2688, + "time_per_iteration": 2.493476152420044 + }, + { + "auxiliary_loss_clip": 0.0115191, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.05067098, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2392978206929555, + "language_loss": 0.76041406, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78239101, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.015625, + "step": 2689, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.05417943, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.9258973882551216, + "language_loss": 0.87260234, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89462292, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2690, + "time_per_iteration": 2.496943473815918 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05211663, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.562024048541713, + "language_loss": 0.87728393, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.89927632, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 2691, + "time_per_iteration": 2.510960817337036 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.02764988, + "balance_loss_mlp": 1.05021381, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.082856606872889, + "language_loss": 0.82327259, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84533525, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2692, + "time_per_iteration": 2.481032371520996 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02835155, + "balance_loss_mlp": 1.05069244, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.0763505181853454, + "language_loss": 0.80942917, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83149081, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2693, + "time_per_iteration": 2.493278980255127 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01054953, + "balance_loss_clip": 1.03670192, + "balance_loss_mlp": 1.05223358, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.7139740211881158, + "language_loss": 0.83639967, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85845578, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2694, + "time_per_iteration": 2.5051510334014893 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.0509156, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9608549080280004, + "language_loss": 0.69125426, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71329916, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0546875, + "step": 2695, + "time_per_iteration": 2.495098352432251 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01054922, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.05520689, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.466913217352614, + "language_loss": 0.82403111, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84617984, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2696, + "time_per_iteration": 2.484523296356201 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.05316591, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.9982919021229957, + "language_loss": 0.8852337, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90741605, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2697, + "time_per_iteration": 2.4806151390075684 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.01046149, + "balance_loss_clip": 1.02756453, + "balance_loss_mlp": 1.04989469, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4702975792509376, + "language_loss": 0.80172735, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82366014, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 2698, + "time_per_iteration": 2.532137393951416 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.02957439, + "balance_loss_mlp": 1.05167758, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5576448961090323, + "language_loss": 0.77258182, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79456997, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 2699, + "time_per_iteration": 2.514084577560425 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.0251497, + "balance_loss_mlp": 1.04891944, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.038036982956784, + "language_loss": 0.85697722, + "learning_rate": 3.81909481076994e-06, + "loss": 0.87891692, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2700, + "time_per_iteration": 2.4434289932250977 + }, + { + "auxiliary_loss_clip": 0.01147712, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.0247376, + "balance_loss_mlp": 1.04878318, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.6982179557795123, + "language_loss": 0.80378878, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82572436, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.98828125, + "step": 2701, + "time_per_iteration": 2.5267322063446045 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.02945244, + "balance_loss_mlp": 1.05514598, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.5999982166608073, + "language_loss": 0.73006868, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75212055, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2702, + "time_per_iteration": 2.44750714302063 + }, + { + "auxiliary_loss_clip": 0.01153204, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02986753, + "balance_loss_mlp": 1.05053687, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.5386207662450464, + "language_loss": 0.73164749, + "learning_rate": 3.81860891934076e-06, + "loss": 0.7536869, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0234375, + "step": 2703, + "time_per_iteration": 2.469242811203003 + }, + { + "auxiliary_loss_clip": 0.01150736, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.04765964, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9216464968932823, + "language_loss": 0.70681584, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72879231, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2704, + "time_per_iteration": 2.5236263275146484 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00091982, + "balance_loss_mlp": 1.01563144, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7797469934396678, + "language_loss": 0.53369009, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55422795, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.34765625, + "step": 2705, + "time_per_iteration": 3.0887868404388428 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.02873373, + "balance_loss_mlp": 1.05151534, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4525976943058896, + "language_loss": 0.75060308, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77264655, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2706, + "time_per_iteration": 2.439283847808838 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 1.05240536, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9153778871117788, + "language_loss": 0.7234174, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74547994, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2707, + "time_per_iteration": 2.51819109916687 + }, + { + "auxiliary_loss_clip": 0.01155215, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.05275822, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.869992791268662, + "language_loss": 0.83790398, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85995972, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2708, + "time_per_iteration": 2.4592010974884033 + }, + { + "auxiliary_loss_clip": 0.0115992, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.02768469, + "balance_loss_mlp": 1.05268705, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.162290718142945, + "language_loss": 0.86529553, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88738573, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2709, + "time_per_iteration": 2.4745054244995117 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01054439, + "balance_loss_clip": 1.0353297, + "balance_loss_mlp": 1.05096519, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.6782807127870958, + "language_loss": 0.91449893, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93659306, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2710, + "time_per_iteration": 2.4846651554107666 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.05447197, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 1.99410407833921, + "language_loss": 0.8129673, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83507168, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2711, + "time_per_iteration": 2.4878618717193604 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04737568, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7794575527068077, + "language_loss": 0.81605875, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83806038, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2712, + "time_per_iteration": 2.4479072093963623 + }, + { + "auxiliary_loss_clip": 0.01158025, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.03858864, + "balance_loss_mlp": 1.05211174, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.1959953506899774, + "language_loss": 0.76885653, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79102206, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2713, + "time_per_iteration": 2.493394374847412 + }, + { + "auxiliary_loss_clip": 0.01155185, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.05623782, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.081844956712308, + "language_loss": 0.78926778, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8114453, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 2714, + "time_per_iteration": 2.442214012145996 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01059199, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.05286288, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.259619309439112, + "language_loss": 0.78143466, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80357969, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2715, + "time_per_iteration": 2.499178409576416 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.04868412, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.0916631483814783, + "language_loss": 0.81397748, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8359617, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2716, + "time_per_iteration": 2.5004689693450928 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01057354, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.05482328, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8793848003912939, + "language_loss": 0.86203027, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88418794, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2717, + "time_per_iteration": 2.5112617015838623 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.05153894, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.652261986612604, + "language_loss": 0.76514149, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78711915, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2718, + "time_per_iteration": 2.549654245376587 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02696729, + "balance_loss_mlp": 1.05180717, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.080955072975882, + "language_loss": 0.73027492, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75229508, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2719, + "time_per_iteration": 2.4911599159240723 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.03084862, + "balance_loss_mlp": 1.0492239, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6610037254914274, + "language_loss": 0.72384167, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74585563, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2720, + "time_per_iteration": 2.4733760356903076 + }, + { + "auxiliary_loss_clip": 0.01150132, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.02789283, + "balance_loss_mlp": 1.05076206, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2797021453727893, + "language_loss": 0.75100243, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77298641, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9921875, + "step": 2721, + "time_per_iteration": 2.44942569732666 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02893853, + "balance_loss_mlp": 1.0502317, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.74959220753002, + "language_loss": 0.79254043, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81458461, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2722, + "time_per_iteration": 2.4775915145874023 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.05248678, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.0539311275727634, + "language_loss": 0.8477816, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86986339, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0625, + "step": 2723, + "time_per_iteration": 2.5084922313690186 + }, + { + "auxiliary_loss_clip": 0.01146914, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.0177772, + "balance_loss_mlp": 1.04940808, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0049787201865503, + "language_loss": 0.70883536, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73067659, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 2724, + "time_per_iteration": 2.5094263553619385 + }, + { + "auxiliary_loss_clip": 0.01150034, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.05113125, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.04326868324577, + "language_loss": 0.70914948, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73109186, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 2725, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02437937, + "balance_loss_mlp": 1.05219352, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9381311422505, + "language_loss": 0.8873682, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90929163, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2726, + "time_per_iteration": 3.983738660812378 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.05406547, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.8502717081228044, + "language_loss": 0.7439661, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76602715, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2727, + "time_per_iteration": 5.52494215965271 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.03429413, + "balance_loss_mlp": 1.05145037, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6814144838265654, + "language_loss": 0.82321334, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84523886, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9921875, + "step": 2728, + "time_per_iteration": 2.4621498584747314 + }, + { + "auxiliary_loss_clip": 0.01156146, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03131044, + "balance_loss_mlp": 1.05167341, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.4458707176630425, + "language_loss": 0.84766865, + "learning_rate": 3.814371879489633e-06, + "loss": 0.86973941, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0390625, + "step": 2729, + "time_per_iteration": 2.459495782852173 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.02661061, + "balance_loss_mlp": 1.04923487, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9327126112676087, + "language_loss": 0.72569054, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74764991, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2730, + "time_per_iteration": 2.451016902923584 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.03243709, + "balance_loss_mlp": 1.04862678, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.2141787283307854, + "language_loss": 0.74431163, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76637596, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.046875, + "step": 2731, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.0115844, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.02818894, + "balance_loss_mlp": 1.05408466, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.15833206643789, + "language_loss": 0.78783584, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.80990839, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2732, + "time_per_iteration": 2.44146728515625 + }, + { + "auxiliary_loss_clip": 0.01155842, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.05211556, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9937390498547816, + "language_loss": 0.68943298, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71150857, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0390625, + "step": 2733, + "time_per_iteration": 2.4981601238250732 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.02792621, + "balance_loss_mlp": 1.05054927, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.20018793155086, + "language_loss": 0.80626202, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8282572, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0078125, + "step": 2734, + "time_per_iteration": 2.495030641555786 + }, + { + "auxiliary_loss_clip": 0.01152713, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.03638041, + "balance_loss_mlp": 1.05143905, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 4.0691467716051175, + "language_loss": 0.82265377, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84474081, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2735, + "time_per_iteration": 2.5911896228790283 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.02556753, + "balance_loss_mlp": 1.05158913, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.5735103485950077, + "language_loss": 0.78697491, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80891526, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.984375, + "step": 2736, + "time_per_iteration": 2.4699559211730957 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.05231023, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.680513335410081, + "language_loss": 0.81409019, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83616614, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2737, + "time_per_iteration": 2.4892401695251465 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.02819467, + "balance_loss_mlp": 1.05107307, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8393773079816103, + "language_loss": 0.87291563, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89492232, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2738, + "time_per_iteration": 2.54569935798645 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.05139303, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.0122721864238438, + "language_loss": 0.72351867, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74562055, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2739, + "time_per_iteration": 2.5309460163116455 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02991343, + "balance_loss_mlp": 1.04766631, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.690107638621115, + "language_loss": 0.81735384, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8393271, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2740, + "time_per_iteration": 2.5005404949188232 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01053239, + "balance_loss_clip": 1.03176928, + "balance_loss_mlp": 1.05347896, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.8033984026588756, + "language_loss": 0.69098473, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0390625, + "step": 2741, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02588463, + "balance_loss_mlp": 1.04987025, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 2.1078448839323167, + "language_loss": 0.79967189, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2742, + "time_per_iteration": 2.4471442699432373 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.03373659, + "balance_loss_mlp": 1.05117011, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.1468697804747823, + "language_loss": 0.84769481, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86974156, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0, + "step": 2743, + "time_per_iteration": 2.459146022796631 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.05074859, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5853616537097488, + "language_loss": 0.85723281, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87925285, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 2744, + "time_per_iteration": 2.4920642375946045 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.0283947, + "balance_loss_mlp": 1.05124998, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7300129139105382, + "language_loss": 0.82973897, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85167319, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 2745, + "time_per_iteration": 2.490399122238159 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.05477679, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.19754759855213, + "language_loss": 0.76411253, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78622997, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2746, + "time_per_iteration": 2.46258282661438 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01052583, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.05164099, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5978428663850568, + "language_loss": 0.80686736, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2747, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01158238, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.02848577, + "balance_loss_mlp": 1.05559731, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.853069559467639, + "language_loss": 0.69463658, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71670008, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0234375, + "step": 2748, + "time_per_iteration": 2.4235999584198 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.03314471, + "balance_loss_mlp": 1.05482006, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.265414403061137, + "language_loss": 0.87653661, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89860809, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0078125, + "step": 2749, + "time_per_iteration": 2.4706709384918213 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.0509429, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.3451981357461444, + "language_loss": 0.79248077, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81450188, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2750, + "time_per_iteration": 2.4588990211486816 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.05188382, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7653411133265118, + "language_loss": 0.95010567, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9720822, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.99609375, + "step": 2751, + "time_per_iteration": 2.4776439666748047 + }, + { + "auxiliary_loss_clip": 0.01152135, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.02762985, + "balance_loss_mlp": 1.05480134, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9833662518999209, + "language_loss": 0.71080822, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73278749, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 2752, + "time_per_iteration": 2.4609227180480957 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 1.01785779, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7698122762266473, + "language_loss": 0.54079807, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56152999, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.33984375, + "step": 2753, + "time_per_iteration": 3.161339282989502 + }, + { + "auxiliary_loss_clip": 0.01152964, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.05254793, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.9686645345026932, + "language_loss": 0.75467873, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77662838, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2754, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01060834, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.05358946, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 3.81944507319113, + "language_loss": 0.87154973, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89376527, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0703125, + "step": 2755, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01148695, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.04862666, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.859731734913831, + "language_loss": 0.73258269, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7545948, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2756, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.02999544, + "balance_loss_mlp": 1.05331099, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6628427585054586, + "language_loss": 0.74967468, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77166092, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9609375, + "step": 2757, + "time_per_iteration": 2.5122530460357666 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01044644, + "balance_loss_clip": 1.02590466, + "balance_loss_mlp": 1.05359447, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 2.101183789218018, + "language_loss": 0.84532511, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86731303, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2758, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.01153935, + "auxiliary_loss_mlp": 0.01051485, + "balance_loss_clip": 1.03382993, + "balance_loss_mlp": 1.05355358, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 3.016772390052645, + "language_loss": 0.79003322, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81208748, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 1.0, + "step": 2759, + "time_per_iteration": 2.468798875808716 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.02953088, + "balance_loss_mlp": 1.05121255, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 4.81235802271706, + "language_loss": 0.75059134, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77259254, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2760, + "time_per_iteration": 2.459453582763672 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02139056, + "balance_loss_mlp": 1.05363011, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.843496656605, + "language_loss": 0.73409051, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75607204, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2761, + "time_per_iteration": 2.473264455795288 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.05460942, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.076392836835936, + "language_loss": 0.89255953, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91456699, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2762, + "time_per_iteration": 2.4917852878570557 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.0517025, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6634533311047424, + "language_loss": 0.87782222, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.89988291, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2763, + "time_per_iteration": 2.48002028465271 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01013596, + "balance_loss_clip": 1.01105642, + "balance_loss_mlp": 1.01786494, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7771287992078079, + "language_loss": 0.59777391, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61842799, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2764, + "time_per_iteration": 3.0722031593322754 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.03183234, + "balance_loss_mlp": 1.05292118, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8564974944455146, + "language_loss": 0.82349414, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8455686, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.015625, + "step": 2765, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.02758563, + "balance_loss_mlp": 1.05308914, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.1954568630881566, + "language_loss": 0.70029616, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72239733, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.078125, + "step": 2766, + "time_per_iteration": 2.417538642883301 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.05449462, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3642497854018174, + "language_loss": 0.88693011, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90891409, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2767, + "time_per_iteration": 2.447087287902832 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.0051651, + "balance_loss_mlp": 1.01474071, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.659533193053428, + "language_loss": 0.52894622, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54950953, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.33984375, + "step": 2768, + "time_per_iteration": 4.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01156575, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.03196931, + "balance_loss_mlp": 1.05233693, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4421243199538543, + "language_loss": 0.84964579, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87173045, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2769, + "time_per_iteration": 3.9888546466827393 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.00486565, + "balance_loss_mlp": 1.01284146, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.809970645404753, + "language_loss": 0.57417655, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59471762, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2770, + "time_per_iteration": 2.909212350845337 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01004174, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.0120976, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8642108743281017, + "language_loss": 0.5621168, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58261615, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.3359375, + "step": 2771, + "time_per_iteration": 2.9000375270843506 + }, + { + "auxiliary_loss_clip": 0.01152287, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.03741515, + "balance_loss_mlp": 1.05137527, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.4271023422086593, + "language_loss": 0.70461071, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72671425, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0078125, + "step": 2772, + "time_per_iteration": 2.45868182182312 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01052488, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.04914951, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.8764675289735346, + "language_loss": 0.86201918, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8840462, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2773, + "time_per_iteration": 2.513784885406494 + }, + { + "auxiliary_loss_clip": 0.01152492, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.02808821, + "balance_loss_mlp": 1.05230188, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.2216439453760595, + "language_loss": 0.81859678, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84058398, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2774, + "time_per_iteration": 2.4288830757141113 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.02955508, + "balance_loss_mlp": 1.05290627, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1125697386324576, + "language_loss": 0.83287829, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85492939, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0234375, + "step": 2775, + "time_per_iteration": 2.4773504734039307 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.03599668, + "balance_loss_mlp": 1.0527029, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9011936520028738, + "language_loss": 0.80721045, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82925946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 2776, + "time_per_iteration": 2.4736995697021484 + }, + { + "auxiliary_loss_clip": 0.01147621, + "auxiliary_loss_mlp": 0.01053383, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.05260348, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.7922512358148395, + "language_loss": 0.798361, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82037103, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.953125, + "step": 2777, + "time_per_iteration": 2.4625258445739746 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.0313735, + "balance_loss_mlp": 1.05002642, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.8218923631286437, + "language_loss": 0.85132945, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87332618, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 2778, + "time_per_iteration": 2.4819412231445312 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.05222583, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.6489491047564826, + "language_loss": 0.74133682, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76333386, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2779, + "time_per_iteration": 2.510207176208496 + }, + { + "auxiliary_loss_clip": 0.0115174, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.02625358, + "balance_loss_mlp": 1.05116367, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.2761441742273663, + "language_loss": 0.65382051, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67579395, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2780, + "time_per_iteration": 2.5250439643859863 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.02856088, + "balance_loss_mlp": 1.05120933, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0602280440022382, + "language_loss": 0.78563058, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80761701, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9921875, + "step": 2781, + "time_per_iteration": 2.4921979904174805 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.05227423, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.480266857331911, + "language_loss": 0.75262564, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77465487, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2782, + "time_per_iteration": 2.468590021133423 + }, + { + "auxiliary_loss_clip": 0.01159372, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.05443954, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.999958464394936, + "language_loss": 0.67841566, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70053571, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2783, + "time_per_iteration": 2.5312225818634033 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.0538497, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.209785525271013, + "language_loss": 0.70028126, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72232759, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2784, + "time_per_iteration": 2.4932820796966553 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.05120277, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9638597335511054, + "language_loss": 0.60441053, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62647516, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2785, + "time_per_iteration": 2.527010440826416 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.02681625, + "balance_loss_mlp": 1.01595187, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8343482124814343, + "language_loss": 0.588, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60878569, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.33007812, + "step": 2786, + "time_per_iteration": 3.1062281131744385 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.05108333, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.9494651562196093, + "language_loss": 0.75846571, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78044844, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2787, + "time_per_iteration": 2.51383900642395 + }, + { + "auxiliary_loss_clip": 0.0115361, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.02287841, + "balance_loss_mlp": 1.05218899, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.088538847955111, + "language_loss": 0.77615869, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79811174, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2788, + "time_per_iteration": 2.4926373958587646 + }, + { + "auxiliary_loss_clip": 0.01048965, + "auxiliary_loss_mlp": 0.01004104, + "balance_loss_clip": 1.00154078, + "balance_loss_mlp": 1.01582766, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.861309286667726, + "language_loss": 0.59360403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61413473, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.33203125, + "step": 2789, + "time_per_iteration": 2.9390883445739746 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.03262937, + "balance_loss_mlp": 1.05115533, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.8582032581880512, + "language_loss": 0.70117038, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72323185, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2790, + "time_per_iteration": 2.6337287425994873 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.03852975, + "balance_loss_mlp": 1.05254579, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9267324208283758, + "language_loss": 0.7914235, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81353921, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0234375, + "step": 2791, + "time_per_iteration": 2.4992258548736572 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.02807093, + "balance_loss_mlp": 1.05311096, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.670563786806713, + "language_loss": 0.71465087, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73666936, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2792, + "time_per_iteration": 2.5886104106903076 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.05179656, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.423044729867527, + "language_loss": 0.72166264, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74366981, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2793, + "time_per_iteration": 2.5197043418884277 + }, + { + "auxiliary_loss_clip": 0.01153184, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.05135465, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.9474647186442988, + "language_loss": 0.77305138, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79512912, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2794, + "time_per_iteration": 2.467292547225952 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.05253601, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2742759048834578, + "language_loss": 0.71613103, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7382195, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2795, + "time_per_iteration": 2.5272278785705566 + }, + { + "auxiliary_loss_clip": 0.01149377, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.02592218, + "balance_loss_mlp": 1.04932868, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.942494339721957, + "language_loss": 0.83784455, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8597846, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2796, + "time_per_iteration": 2.448528289794922 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01059215, + "balance_loss_clip": 1.03911614, + "balance_loss_mlp": 1.04904127, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6778887705488965, + "language_loss": 0.8109591, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83307993, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2797, + "time_per_iteration": 2.5044567584991455 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02766752, + "balance_loss_mlp": 1.05142093, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4189820060365406, + "language_loss": 0.74740726, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76932257, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.95703125, + "step": 2798, + "time_per_iteration": 2.4913666248321533 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03250098, + "balance_loss_mlp": 1.05462337, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.8962576537558784, + "language_loss": 0.79592311, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81796914, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 1.0, + "step": 2799, + "time_per_iteration": 2.4844021797180176 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.02597189, + "balance_loss_mlp": 1.04983997, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.7819182919151455, + "language_loss": 0.70778632, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72978926, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2800, + "time_per_iteration": 2.548715829849243 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02365637, + "balance_loss_mlp": 1.04882574, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9135359518782422, + "language_loss": 0.83549178, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85741478, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2801, + "time_per_iteration": 2.456601858139038 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.03355145, + "balance_loss_mlp": 1.04947591, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.757874152621573, + "language_loss": 0.822721, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84474415, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2802, + "time_per_iteration": 2.4426534175872803 + }, + { + "auxiliary_loss_clip": 0.01153107, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02764344, + "balance_loss_mlp": 1.05123353, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4855905624355255, + "language_loss": 0.81064272, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83265072, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2803, + "time_per_iteration": 2.5615930557250977 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02693641, + "balance_loss_mlp": 1.05246449, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2799183114600545, + "language_loss": 0.7645762, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78653532, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 2804, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01045818, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.02452028, + "balance_loss_mlp": 1.01328063, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8620881286764229, + "language_loss": 0.55414748, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57487267, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 2805, + "time_per_iteration": 3.033358573913574 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.02161169, + "balance_loss_mlp": 1.04741919, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9122963285347783, + "language_loss": 0.73038024, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75221276, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 2806, + "time_per_iteration": 2.4699463844299316 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.02712786, + "balance_loss_mlp": 1.05072176, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.9407491705316076, + "language_loss": 0.69966477, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7216025, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2807, + "time_per_iteration": 2.4583139419555664 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.03196526, + "balance_loss_mlp": 1.05013919, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.8028706291815912, + "language_loss": 0.70265883, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72467327, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9921875, + "step": 2808, + "time_per_iteration": 2.4724719524383545 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02792549, + "balance_loss_mlp": 1.05130935, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.1293629398657954, + "language_loss": 0.80103064, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8230511, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2809, + "time_per_iteration": 3.844451427459717 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.050385, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0909159229075245, + "language_loss": 0.88465077, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.9067235, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2810, + "time_per_iteration": 5.43256688117981 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.05188894, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.324870160833927, + "language_loss": 0.92483926, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94690794, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2811, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.03154814, + "balance_loss_mlp": 1.05537057, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 20.150047321728213, + "language_loss": 0.78719699, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80926931, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2812, + "time_per_iteration": 2.475893974304199 + }, + { + "auxiliary_loss_clip": 0.01154531, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.0353322, + "balance_loss_mlp": 1.05427527, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.3708558754635103, + "language_loss": 0.7492249, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7713027, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.0078125, + "step": 2813, + "time_per_iteration": 2.4622457027435303 + }, + { + "auxiliary_loss_clip": 0.01155154, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02862835, + "balance_loss_mlp": 1.05231524, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.6643465032783955, + "language_loss": 0.69000697, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71203601, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2814, + "time_per_iteration": 2.442352771759033 + }, + { + "auxiliary_loss_clip": 0.01152587, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.03032494, + "balance_loss_mlp": 1.05269694, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.3683342322522543, + "language_loss": 0.61842358, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64043844, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2815, + "time_per_iteration": 2.4859516620635986 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03179121, + "balance_loss_mlp": 1.05104065, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9947957584318596, + "language_loss": 0.81983805, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84183884, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 2816, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.03072321, + "balance_loss_mlp": 1.05379295, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.502019531770294, + "language_loss": 0.8722589, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89431584, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2817, + "time_per_iteration": 2.4906835556030273 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.03200889, + "balance_loss_mlp": 1.05302715, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7787508021643152, + "language_loss": 0.81666476, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2818, + "time_per_iteration": 2.4673428535461426 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.05154157, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 5.791836374282792, + "language_loss": 0.80712807, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8291707, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0, + "step": 2819, + "time_per_iteration": 2.43947434425354 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.00504076, + "balance_loss_mlp": 1.01552486, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9491282523447765, + "language_loss": 0.61080176, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63136268, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 2820, + "time_per_iteration": 3.008953809738159 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.031335, + "balance_loss_mlp": 1.05163527, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 2.1013484538112097, + "language_loss": 0.78625357, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.808281, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2821, + "time_per_iteration": 2.5363481044769287 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.03667343, + "balance_loss_mlp": 1.05229986, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.87583667245789, + "language_loss": 0.78450388, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80659759, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0078125, + "step": 2822, + "time_per_iteration": 2.4969065189361572 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03299582, + "balance_loss_mlp": 1.04956698, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9220487825624015, + "language_loss": 0.75016022, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77214515, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2823, + "time_per_iteration": 2.491588830947876 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.05209637, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9648811068121905, + "language_loss": 0.60514438, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62718117, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.99609375, + "step": 2824, + "time_per_iteration": 2.6178910732269287 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02030015, + "balance_loss_mlp": 1.05367076, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6856049786717988, + "language_loss": 0.73004806, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75196874, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98046875, + "step": 2825, + "time_per_iteration": 2.559774398803711 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.03321934, + "balance_loss_mlp": 1.0505774, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.7849035157466668, + "language_loss": 0.85660541, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87870789, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2826, + "time_per_iteration": 2.4860360622406006 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.0515151, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3205594057943175, + "language_loss": 0.8232255, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84528267, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2827, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.02965498, + "balance_loss_mlp": 1.05059743, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 2.393760877815214, + "language_loss": 0.73652613, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75855708, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2828, + "time_per_iteration": 2.5726237297058105 + }, + { + "auxiliary_loss_clip": 0.01046718, + "auxiliary_loss_mlp": 0.01008554, + "balance_loss_clip": 1.00625372, + "balance_loss_mlp": 1.01360035, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.76062911359866, + "language_loss": 0.56446254, + "learning_rate": 3.797643101661336e-06, + "loss": 0.5850153, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.33203125, + "step": 2829, + "time_per_iteration": 3.1035284996032715 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.02912867, + "balance_loss_mlp": 1.04916263, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7229604876305038, + "language_loss": 0.83673382, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85870743, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.99609375, + "step": 2830, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.02792013, + "balance_loss_mlp": 1.04919207, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.0065309441313337, + "language_loss": 0.77852297, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80051666, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.015625, + "step": 2831, + "time_per_iteration": 2.524578094482422 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.04948521, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1211873867699285, + "language_loss": 0.79345167, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81548154, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0234375, + "step": 2832, + "time_per_iteration": 2.459954261779785 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.02836847, + "balance_loss_mlp": 1.05050385, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.9382017652854369, + "language_loss": 0.89026237, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91225392, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2833, + "time_per_iteration": 2.4812114238739014 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02963924, + "balance_loss_mlp": 1.05124569, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.853060698790674, + "language_loss": 0.72425497, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74627328, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2834, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01156378, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.05294132, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9954265429463485, + "language_loss": 0.86434042, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88648909, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2835, + "time_per_iteration": 2.4804999828338623 + }, + { + "auxiliary_loss_clip": 0.01155592, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.03007674, + "balance_loss_mlp": 1.05081642, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1.9180646463430515, + "language_loss": 0.73242748, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.046875, + "step": 2836, + "time_per_iteration": 2.4694178104400635 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02599072, + "balance_loss_mlp": 1.05033076, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1253309510576717, + "language_loss": 0.79653537, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81858897, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0703125, + "step": 2837, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01150588, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.02550185, + "balance_loss_mlp": 1.05232143, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.19906443062711, + "language_loss": 0.83575213, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85771078, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 2838, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.05069315, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7423496230624245, + "language_loss": 0.93620354, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95814586, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2839, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.0277859, + "balance_loss_mlp": 1.05050242, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.8052720148780894, + "language_loss": 0.83847374, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86050916, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.0390625, + "step": 2840, + "time_per_iteration": 2.5449130535125732 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01047778, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.05213881, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.014300966058614, + "language_loss": 0.76390004, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78593302, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.03125, + "step": 2841, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03243482, + "balance_loss_mlp": 1.04932261, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8874127741110907, + "language_loss": 0.77000463, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79205, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2842, + "time_per_iteration": 2.5051841735839844 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 1.0497905, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.05566421297988, + "language_loss": 0.86086738, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88281423, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98828125, + "step": 2843, + "time_per_iteration": 2.4487509727478027 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02696228, + "balance_loss_mlp": 1.05090249, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.8875494657309706, + "language_loss": 0.6826812, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70464289, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 2844, + "time_per_iteration": 2.4429779052734375 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.05040824, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.8058232236820264, + "language_loss": 0.78258789, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80463862, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0, + "step": 2845, + "time_per_iteration": 2.4377951622009277 + }, + { + "auxiliary_loss_clip": 0.01151786, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.05064154, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.746386155528142, + "language_loss": 0.77959955, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 2846, + "time_per_iteration": 2.4196622371673584 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.02895534, + "balance_loss_mlp": 1.05158973, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7441395807388675, + "language_loss": 0.7942031, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81620383, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2847, + "time_per_iteration": 2.504087448120117 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.02913523, + "balance_loss_mlp": 1.04612017, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.239997254259111, + "language_loss": 0.86818451, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89015555, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2848, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.02438748, + "balance_loss_mlp": 1.05133212, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.715396677859901, + "language_loss": 0.75223613, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77421153, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2849, + "time_per_iteration": 2.4918415546417236 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.00107098, + "balance_loss_mlp": 1.01492834, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7949737728021388, + "language_loss": 0.57471085, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59522074, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.328125, + "step": 2850, + "time_per_iteration": 3.057778835296631 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.04852295, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4364727127987704, + "language_loss": 0.80988616, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83187693, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 2851, + "time_per_iteration": 3.887600898742676 + }, + { + "auxiliary_loss_clip": 0.01146778, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.04858351, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.035620688428962, + "language_loss": 0.93063158, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95253623, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2852, + "time_per_iteration": 3.920153856277466 + }, + { + "auxiliary_loss_clip": 0.01149404, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.04728949, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8406206656402175, + "language_loss": 0.69480836, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71683311, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2853, + "time_per_iteration": 2.4457037448883057 + }, + { + "auxiliary_loss_clip": 0.0115316, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03150904, + "balance_loss_mlp": 1.05059445, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.187977199847503, + "language_loss": 0.66505128, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68709248, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0234375, + "step": 2854, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01144359, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.04574227, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8257227486643586, + "language_loss": 0.89394444, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91582847, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2855, + "time_per_iteration": 2.4601552486419678 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.04792452, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.515892939250119, + "language_loss": 0.83822739, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86022681, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2856, + "time_per_iteration": 2.4747347831726074 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05112195, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9053156238546485, + "language_loss": 0.8645792, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88658297, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2857, + "time_per_iteration": 2.4460220336914062 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0105234, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.04805577, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.139076633770832, + "language_loss": 0.77919662, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80120051, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2858, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01058687, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.04760742, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.423579883765011, + "language_loss": 0.77235049, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0859375, + "step": 2859, + "time_per_iteration": 2.43471360206604 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.04920983, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 3.774880148287903, + "language_loss": 0.77179611, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79378301, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2860, + "time_per_iteration": 2.463344097137451 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.04703689, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1505291491255463, + "language_loss": 0.81964719, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84165227, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2861, + "time_per_iteration": 2.4505395889282227 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.03163123, + "balance_loss_mlp": 1.04897118, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 4.22955926449596, + "language_loss": 0.85649675, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87849623, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2862, + "time_per_iteration": 2.4392077922821045 + }, + { + "auxiliary_loss_clip": 0.01144423, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.02867651, + "balance_loss_mlp": 1.04785109, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.3146804122881037, + "language_loss": 0.77874523, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80065054, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 2863, + "time_per_iteration": 2.4745166301727295 + }, + { + "auxiliary_loss_clip": 0.01147347, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02935052, + "balance_loss_mlp": 1.04726493, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7012031973405044, + "language_loss": 0.72191179, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74386668, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2864, + "time_per_iteration": 2.496522903442383 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.04935968, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6688219876641972, + "language_loss": 0.72896975, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75101948, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2865, + "time_per_iteration": 2.468726396560669 + }, + { + "auxiliary_loss_clip": 0.01151587, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02608538, + "balance_loss_mlp": 1.05194211, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.1747822479918764, + "language_loss": 0.79011786, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81208247, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2866, + "time_per_iteration": 2.445716381072998 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.04966402, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.797659045411876, + "language_loss": 0.79865277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2867, + "time_per_iteration": 2.4745590686798096 + }, + { + "auxiliary_loss_clip": 0.0114836, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.04821014, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.717941409951427, + "language_loss": 0.79707634, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81893444, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2868, + "time_per_iteration": 2.4545693397521973 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.0538218, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9332967921770021, + "language_loss": 0.84265673, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86467719, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2869, + "time_per_iteration": 2.445429563522339 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.03226328, + "balance_loss_mlp": 1.04971075, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.3539211413688954, + "language_loss": 0.77522051, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79725653, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2870, + "time_per_iteration": 2.4975087642669678 + }, + { + "auxiliary_loss_clip": 0.01146931, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.02609706, + "balance_loss_mlp": 1.05132568, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.897031493968697, + "language_loss": 0.7680704, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.78997254, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.95703125, + "step": 2871, + "time_per_iteration": 2.4777348041534424 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02442563, + "balance_loss_mlp": 1.05061746, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.240934958328371, + "language_loss": 0.74448204, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76642466, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2872, + "time_per_iteration": 2.5021097660064697 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.05127549, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8155923086100165, + "language_loss": 0.82694656, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84881938, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 2873, + "time_per_iteration": 2.4852540493011475 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0229888, + "balance_loss_mlp": 1.049196, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0464410919173814, + "language_loss": 0.75083232, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77274048, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.984375, + "step": 2874, + "time_per_iteration": 2.440610885620117 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0515728, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9230852666364326, + "language_loss": 0.8067199, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8286736, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2875, + "time_per_iteration": 2.478473424911499 + }, + { + "auxiliary_loss_clip": 0.01153488, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02984488, + "balance_loss_mlp": 1.05083489, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.5699127680633542, + "language_loss": 0.87525117, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89728516, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2876, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.02384901, + "balance_loss_mlp": 1.05273616, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.9567138745888089, + "language_loss": 0.84561193, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86754125, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 2877, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.02179909, + "balance_loss_mlp": 1.05281305, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 3.0724129461132406, + "language_loss": 0.79527134, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81719756, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.984375, + "step": 2878, + "time_per_iteration": 2.4739902019500732 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.02450228, + "balance_loss_mlp": 1.04968572, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9694378769308076, + "language_loss": 0.70306808, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72496772, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2879, + "time_per_iteration": 2.5014665126800537 + }, + { + "auxiliary_loss_clip": 0.01151101, + "auxiliary_loss_mlp": 0.01050497, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.05038834, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4431111997211734, + "language_loss": 0.83465785, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85667384, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2880, + "time_per_iteration": 2.433776378631592 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.0250026, + "balance_loss_mlp": 1.05171311, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.135155165507549, + "language_loss": 0.80866969, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8306427, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0078125, + "step": 2881, + "time_per_iteration": 2.4944772720336914 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.05030859, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.5502275528368066, + "language_loss": 0.77372867, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79565454, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 2882, + "time_per_iteration": 2.5426836013793945 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.03276825, + "balance_loss_mlp": 1.05005169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8718611847068298, + "language_loss": 0.76652586, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78852415, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2883, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.04944682, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.945193845574475, + "language_loss": 0.85463524, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87654424, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 2884, + "time_per_iteration": 2.4708735942840576 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.02122355, + "balance_loss_mlp": 1.05114794, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6148586475999513, + "language_loss": 0.73758793, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75947917, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2885, + "time_per_iteration": 2.5266878604888916 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.05269074, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.9690054244815705, + "language_loss": 0.70377076, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72569054, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 2886, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01146959, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.04799545, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9665325510573808, + "language_loss": 0.69294798, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7148186, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98828125, + "step": 2887, + "time_per_iteration": 2.4787776470184326 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.03180945, + "balance_loss_mlp": 1.05075955, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.791000255721863, + "language_loss": 0.85391176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87590909, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 2888, + "time_per_iteration": 2.4234085083007812 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.02668667, + "balance_loss_mlp": 1.05046952, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.660213605651755, + "language_loss": 0.78465497, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80662042, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.9921875, + "step": 2889, + "time_per_iteration": 2.5042123794555664 + }, + { + "auxiliary_loss_clip": 0.01146581, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.05222893, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.9081348702485723, + "language_loss": 0.83860242, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86054766, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9453125, + "step": 2890, + "time_per_iteration": 2.4698500633239746 + }, + { + "auxiliary_loss_clip": 0.01150813, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.02880502, + "balance_loss_mlp": 1.05083108, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9935479009749588, + "language_loss": 0.82253492, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84451687, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2891, + "time_per_iteration": 2.4478886127471924 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04824781, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.3073165362682873, + "language_loss": 0.81479478, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8367548, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2892, + "time_per_iteration": 2.4094645977020264 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.03519785, + "balance_loss_mlp": 1.05379355, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.24459564009462, + "language_loss": 0.74480057, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76690638, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2893, + "time_per_iteration": 3.8296191692352295 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.05193436, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.117368029368179, + "language_loss": 0.83073241, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85268712, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2894, + "time_per_iteration": 3.9817075729370117 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.05032384, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.157907065313142, + "language_loss": 0.74051547, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76249242, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0, + "step": 2895, + "time_per_iteration": 2.461857318878174 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00467134, + "balance_loss_mlp": 1.01600659, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8715266336267762, + "language_loss": 0.6273998, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64795506, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.32421875, + "step": 2896, + "time_per_iteration": 3.1462173461914062 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.02160895, + "balance_loss_mlp": 1.04787612, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 2.3238967096174923, + "language_loss": 0.75600475, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77790749, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2897, + "time_per_iteration": 2.4974682331085205 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.02354646, + "balance_loss_mlp": 1.05000067, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9004029304223122, + "language_loss": 0.69384712, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71575105, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2898, + "time_per_iteration": 2.5650558471679688 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01049615, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.05215359, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.315885710988465, + "language_loss": 0.76069367, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78272319, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2899, + "time_per_iteration": 2.5006191730499268 + }, + { + "auxiliary_loss_clip": 0.01145178, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.02571905, + "balance_loss_mlp": 1.04929495, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9440585306650153, + "language_loss": 0.72821134, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75011557, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9609375, + "step": 2900, + "time_per_iteration": 2.5199801921844482 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.03261876, + "balance_loss_mlp": 1.04989529, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.6677330343015109, + "language_loss": 0.70085949, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72287238, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2901, + "time_per_iteration": 2.624864101409912 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.05087507, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7643324639769489, + "language_loss": 0.76549768, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78750718, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 2902, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.04885221, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.129298660499851, + "language_loss": 0.81787169, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.8399415, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2903, + "time_per_iteration": 2.436877727508545 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02255297, + "balance_loss_mlp": 1.04978609, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.1703016783079327, + "language_loss": 0.73228866, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75418955, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2904, + "time_per_iteration": 2.462775707244873 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.02719879, + "balance_loss_mlp": 1.04777265, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9374721445221084, + "language_loss": 0.64526325, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6671921, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2905, + "time_per_iteration": 2.468395233154297 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.05202341, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.804147248272645, + "language_loss": 0.79236615, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81444013, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0546875, + "step": 2906, + "time_per_iteration": 2.4632725715637207 + }, + { + "auxiliary_loss_clip": 0.01150693, + "auxiliary_loss_mlp": 0.01055346, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.05044913, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7929508882228948, + "language_loss": 0.81010377, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83216417, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2907, + "time_per_iteration": 2.4214229583740234 + }, + { + "auxiliary_loss_clip": 0.01152007, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.05040026, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.7402312811515515, + "language_loss": 0.81315112, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83517587, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2908, + "time_per_iteration": 2.4340970516204834 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.03047633, + "balance_loss_mlp": 1.04978228, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.419675279893618, + "language_loss": 0.80399191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82600915, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0078125, + "step": 2909, + "time_per_iteration": 2.4170033931732178 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.03319383, + "balance_loss_mlp": 1.05133021, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6998329053727648, + "language_loss": 0.76530939, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78737426, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2910, + "time_per_iteration": 2.457628011703491 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02901077, + "balance_loss_mlp": 1.05060935, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6502133484544155, + "language_loss": 0.87255991, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89456993, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2911, + "time_per_iteration": 2.5302672386169434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.04746377, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.260601647926804, + "language_loss": 0.89586449, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91789353, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0078125, + "step": 2912, + "time_per_iteration": 2.447650194168091 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.0302161, + "balance_loss_mlp": 1.04871392, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.8836544870459813, + "language_loss": 0.7262938, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74830252, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2913, + "time_per_iteration": 2.423595666885376 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01047113, + "balance_loss_clip": 1.02738369, + "balance_loss_mlp": 1.0522244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.243393227782369, + "language_loss": 0.68799925, + "learning_rate": 3.782887439295741e-06, + "loss": 0.70997757, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 2914, + "time_per_iteration": 2.46085262298584 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 1.05143356, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8218690011087264, + "language_loss": 0.93755293, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95961595, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.98046875, + "step": 2915, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01143068, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.02744889, + "balance_loss_mlp": 1.04722261, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8144768789670476, + "language_loss": 0.80869162, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.95703125, + "step": 2916, + "time_per_iteration": 2.4740476608276367 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03663611, + "balance_loss_mlp": 1.04854608, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.67512565222408, + "language_loss": 0.73645711, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75852591, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2917, + "time_per_iteration": 2.4484915733337402 + }, + { + "auxiliary_loss_clip": 0.01144993, + "auxiliary_loss_mlp": 0.01055794, + "balance_loss_clip": 1.03517044, + "balance_loss_mlp": 1.04897738, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 12.675743752905372, + "language_loss": 0.77019119, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79219908, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.9609375, + "step": 2918, + "time_per_iteration": 2.4723429679870605 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.0260129, + "balance_loss_mlp": 1.05131745, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.415786226656528, + "language_loss": 0.74196291, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76396644, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2919, + "time_per_iteration": 2.5049829483032227 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03080809, + "balance_loss_mlp": 1.05090559, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7754050788280298, + "language_loss": 0.74211872, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76416576, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2920, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0219686, + "balance_loss_mlp": 1.04717219, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3164139995284834, + "language_loss": 0.7949307, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81677347, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.97265625, + "step": 2921, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01153986, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.05029321, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6170497741380607, + "language_loss": 0.87493849, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89693457, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2922, + "time_per_iteration": 2.5042173862457275 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03074801, + "balance_loss_mlp": 1.04808784, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.3811708545321735, + "language_loss": 0.62097687, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64297503, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2923, + "time_per_iteration": 2.5067484378814697 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.05287814, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1344206016331797, + "language_loss": 0.80602306, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2924, + "time_per_iteration": 2.453174114227295 + }, + { + "auxiliary_loss_clip": 0.0115147, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.04809761, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 3.672968077353321, + "language_loss": 0.70954067, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73159206, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.03125, + "step": 2925, + "time_per_iteration": 2.4666385650634766 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.02538979, + "balance_loss_mlp": 1.05147243, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6622274839000213, + "language_loss": 0.71700275, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73893416, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.97265625, + "step": 2926, + "time_per_iteration": 2.50289249420166 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04857433, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.8916391197618272, + "language_loss": 0.84433806, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86627805, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0234375, + "step": 2927, + "time_per_iteration": 2.447207450866699 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.0506475, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8156588356210406, + "language_loss": 0.71879232, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74072987, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 2928, + "time_per_iteration": 2.585942029953003 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.028934, + "balance_loss_mlp": 1.05230594, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0402577824357886, + "language_loss": 0.83222824, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85421479, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9765625, + "step": 2929, + "time_per_iteration": 2.461101770401001 + }, + { + "auxiliary_loss_clip": 0.01149627, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.02298999, + "balance_loss_mlp": 1.0493536, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.817429721867852, + "language_loss": 0.7933988, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81531239, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2930, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.02671921, + "balance_loss_mlp": 1.05319881, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.194829469856105, + "language_loss": 0.76142448, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78343737, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0078125, + "step": 2931, + "time_per_iteration": 2.4907379150390625 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.05108666, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.8261445455709153, + "language_loss": 0.74740392, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7693212, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 2932, + "time_per_iteration": 2.4252588748931885 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.05086923, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.4930669650063355, + "language_loss": 0.8968839, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.9188894, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0625, + "step": 2933, + "time_per_iteration": 2.4334278106689453 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02980459, + "balance_loss_mlp": 1.05053639, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6616334836184845, + "language_loss": 0.88273364, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90468836, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9765625, + "step": 2934, + "time_per_iteration": 3.891472578048706 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02827823, + "balance_loss_mlp": 1.04972959, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.7575209177187046, + "language_loss": 0.70843625, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2935, + "time_per_iteration": 5.650984287261963 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.05251908, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2448658169111795, + "language_loss": 0.69255942, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71456659, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0546875, + "step": 2936, + "time_per_iteration": 2.4864091873168945 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.02758646, + "balance_loss_mlp": 1.05530488, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.883537128373794, + "language_loss": 0.71391022, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73591107, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.99609375, + "step": 2937, + "time_per_iteration": 2.5096240043640137 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.02461779, + "balance_loss_mlp": 1.05530524, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.165923066719211, + "language_loss": 0.7584855, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78052241, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2938, + "time_per_iteration": 2.475069284439087 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 1.05156195, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.20477923303766, + "language_loss": 0.71130306, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73326623, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2939, + "time_per_iteration": 2.4806766510009766 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.0538342, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.125031265469358, + "language_loss": 0.73781312, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7597841, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 2940, + "time_per_iteration": 2.5438694953918457 + }, + { + "auxiliary_loss_clip": 0.01154904, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.05372643, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.4976558026918703, + "language_loss": 0.85003591, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87204242, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2941, + "time_per_iteration": 2.4616622924804688 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.02687514, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.199835477442084, + "language_loss": 0.7711162, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79311877, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2942, + "time_per_iteration": 2.512493848800659 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.05181623, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.9811917296629065, + "language_loss": 0.80591762, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82790613, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2943, + "time_per_iteration": 2.4898416996002197 + }, + { + "auxiliary_loss_clip": 0.01154834, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.02953053, + "balance_loss_mlp": 1.05046725, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.9958912509352866, + "language_loss": 0.80558729, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82764459, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2944, + "time_per_iteration": 2.533968448638916 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.05239737, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.992535786356086, + "language_loss": 0.73450243, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75667548, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2945, + "time_per_iteration": 2.641890287399292 + }, + { + "auxiliary_loss_clip": 0.01152525, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.03179753, + "balance_loss_mlp": 1.05274916, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.3259800829895028, + "language_loss": 0.7778489, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79987633, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.99609375, + "step": 2946, + "time_per_iteration": 2.420511484146118 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.05060697, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9846715459481197, + "language_loss": 0.76240218, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78441978, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2947, + "time_per_iteration": 2.485795259475708 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.02725959, + "balance_loss_mlp": 1.04881549, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.7031010106606654, + "language_loss": 0.71890748, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74085903, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.99609375, + "step": 2948, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03313947, + "balance_loss_mlp": 1.05261326, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.809900152556277, + "language_loss": 0.81843233, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8404634, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.98828125, + "step": 2949, + "time_per_iteration": 2.496962547302246 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01007586, + "balance_loss_clip": 1.00552368, + "balance_loss_mlp": 1.01889789, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7669309197050882, + "language_loss": 0.64973593, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.670331, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.33007812, + "step": 2950, + "time_per_iteration": 3.1220879554748535 + }, + { + "auxiliary_loss_clip": 0.01145274, + "auxiliary_loss_mlp": 0.01049164, + "balance_loss_clip": 1.02992332, + "balance_loss_mlp": 1.04777181, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.9502306021254343, + "language_loss": 0.83540517, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85734957, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2951, + "time_per_iteration": 2.5360641479492188 + }, + { + "auxiliary_loss_clip": 0.01153398, + "auxiliary_loss_mlp": 0.01055919, + "balance_loss_clip": 1.03710794, + "balance_loss_mlp": 1.04963326, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 3.175759961241781, + "language_loss": 0.80564123, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2952, + "time_per_iteration": 2.478635787963867 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.03157318, + "balance_loss_mlp": 1.05045855, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.157061982289712, + "language_loss": 0.79982865, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82184678, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2953, + "time_per_iteration": 2.5143978595733643 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.03003526, + "balance_loss_mlp": 1.05173969, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8943960347088487, + "language_loss": 0.88006002, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90207046, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2954, + "time_per_iteration": 2.575603485107422 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.0327127, + "balance_loss_mlp": 1.05101538, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.123866524492404, + "language_loss": 0.84441978, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86644602, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2955, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.04843807, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 2.0229859139182382, + "language_loss": 0.71172267, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73364747, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2956, + "time_per_iteration": 2.4795608520507812 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.03226662, + "balance_loss_mlp": 1.04974461, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.891261769499534, + "language_loss": 0.82908547, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85109639, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9921875, + "step": 2957, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.02819514, + "balance_loss_mlp": 1.04814482, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.926043663168548, + "language_loss": 0.75286758, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7747997, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2958, + "time_per_iteration": 2.532339572906494 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.05278933, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0794730574663265, + "language_loss": 0.79558724, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.8175559, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2959, + "time_per_iteration": 2.45941424369812 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.04968762, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.284306220471852, + "language_loss": 0.52288693, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5448702, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2960, + "time_per_iteration": 2.4603421688079834 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.05185843, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9615261009939866, + "language_loss": 0.89047921, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9125818, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2961, + "time_per_iteration": 2.475848913192749 + }, + { + "auxiliary_loss_clip": 0.01151915, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.0313381, + "balance_loss_mlp": 1.04849648, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 2.2193748892921517, + "language_loss": 0.79186273, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81389749, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2962, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.011535, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.03175986, + "balance_loss_mlp": 1.0524615, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9550413638631114, + "language_loss": 0.74514943, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76719993, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2963, + "time_per_iteration": 2.4414234161376953 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.02822399, + "balance_loss_mlp": 1.05221784, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.541363360665875, + "language_loss": 0.78624183, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80828238, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2964, + "time_per_iteration": 2.502497911453247 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.05026746, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.923237578914178, + "language_loss": 0.81686175, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83892715, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2965, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01147349, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.03273785, + "balance_loss_mlp": 1.04941893, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2408088539265183, + "language_loss": 0.94580686, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96777868, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.98046875, + "step": 2966, + "time_per_iteration": 2.43082332611084 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.05002928, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.145285080590972, + "language_loss": 0.72469354, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74664342, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2967, + "time_per_iteration": 2.5735998153686523 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.02613282, + "balance_loss_mlp": 1.04889679, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.088672387523525, + "language_loss": 0.76831949, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79021615, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 2968, + "time_per_iteration": 2.437344789505005 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.0287044, + "balance_loss_mlp": 1.04982233, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.3962137266502075, + "language_loss": 0.75934523, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78129619, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2969, + "time_per_iteration": 2.5003507137298584 + }, + { + "auxiliary_loss_clip": 0.01047445, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 0.99940914, + "balance_loss_mlp": 1.01426291, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8459028719848601, + "language_loss": 0.69080526, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7112956, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.33203125, + "step": 2970, + "time_per_iteration": 3.1193249225616455 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.0229373, + "balance_loss_mlp": 1.0498271, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.0858657386647614, + "language_loss": 0.67452097, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69647527, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2971, + "time_per_iteration": 2.580946683883667 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.03059363, + "balance_loss_mlp": 1.04643905, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.713623966203784, + "language_loss": 0.89631712, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91827983, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.99609375, + "step": 2972, + "time_per_iteration": 2.491608142852783 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.05058205, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.067523530387673, + "language_loss": 0.88030291, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90236795, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2973, + "time_per_iteration": 2.4357106685638428 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03282917, + "balance_loss_mlp": 1.04979324, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1406659419236176, + "language_loss": 0.75648922, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.77848881, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2974, + "time_per_iteration": 2.484236478805542 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.02560067, + "balance_loss_mlp": 1.04925394, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.8019304252630453, + "language_loss": 0.74556506, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76749712, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 2975, + "time_per_iteration": 2.4658849239349365 + }, + { + "auxiliary_loss_clip": 0.01145454, + "auxiliary_loss_mlp": 0.0104533, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.04805982, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5963289978134585, + "language_loss": 0.73245859, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.7543664, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 2976, + "time_per_iteration": 3.921170949935913 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02455354, + "balance_loss_mlp": 1.04732931, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.5556273460638488, + "language_loss": 0.77324069, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79505193, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9375, + "step": 2977, + "time_per_iteration": 5.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.03352153, + "balance_loss_mlp": 1.05327988, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.814268655584857, + "language_loss": 0.79470795, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81672966, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 2978, + "time_per_iteration": 2.4917376041412354 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.022892, + "balance_loss_mlp": 1.04982674, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.6585859201367117, + "language_loss": 0.76166439, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78360581, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2979, + "time_per_iteration": 2.5283753871917725 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.04760695, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 2.3100878996861014, + "language_loss": 0.69246143, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7143684, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 2980, + "time_per_iteration": 2.452199935913086 + }, + { + "auxiliary_loss_clip": 0.01148553, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02703881, + "balance_loss_mlp": 1.04957294, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.6769030770257147, + "language_loss": 0.7077347, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.98828125, + "step": 2981, + "time_per_iteration": 2.453328847885132 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.03929293, + "balance_loss_mlp": 1.05124855, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.4096510966801916, + "language_loss": 0.82313269, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84522492, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0, + "step": 2982, + "time_per_iteration": 2.4727423191070557 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.04754186, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 2.0170018574221404, + "language_loss": 0.82899523, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85093689, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2983, + "time_per_iteration": 2.5544486045837402 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03234076, + "balance_loss_mlp": 1.04676509, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0025677466759175, + "language_loss": 0.84977567, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87177408, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2984, + "time_per_iteration": 2.461451530456543 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.02558494, + "balance_loss_mlp": 1.04734373, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.5972673531528874, + "language_loss": 0.89526331, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91717398, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2985, + "time_per_iteration": 2.5644643306732178 + }, + { + "auxiliary_loss_clip": 0.01142965, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02879906, + "balance_loss_mlp": 1.0478375, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9029387971382474, + "language_loss": 0.69863129, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72051299, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 2986, + "time_per_iteration": 2.4629499912261963 + }, + { + "auxiliary_loss_clip": 0.01144523, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02507591, + "balance_loss_mlp": 1.04828227, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.203273814413497, + "language_loss": 0.77872753, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80060714, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96484375, + "step": 2987, + "time_per_iteration": 2.524712562561035 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.02412844, + "balance_loss_mlp": 1.04834962, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.225668764256514, + "language_loss": 0.78012109, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.8020528, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2988, + "time_per_iteration": 2.4608163833618164 + }, + { + "auxiliary_loss_clip": 0.01048374, + "auxiliary_loss_mlp": 0.01007691, + "balance_loss_clip": 1.00570035, + "balance_loss_mlp": 1.0154314, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7961406236538413, + "language_loss": 0.62767559, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64823627, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33007812, + "step": 2989, + "time_per_iteration": 2.9831957817077637 + }, + { + "auxiliary_loss_clip": 0.01146079, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04836369, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4434429944335525, + "language_loss": 0.70464563, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72651821, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.98046875, + "step": 2990, + "time_per_iteration": 2.556100606918335 + }, + { + "auxiliary_loss_clip": 0.01146243, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.0210464, + "balance_loss_mlp": 1.04735422, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7649502456354873, + "language_loss": 0.68110204, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70295459, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 2991, + "time_per_iteration": 2.6224544048309326 + }, + { + "auxiliary_loss_clip": 0.01145545, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04794931, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5716432326573742, + "language_loss": 0.82754636, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84940296, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2992, + "time_per_iteration": 2.51824951171875 + }, + { + "auxiliary_loss_clip": 0.01138637, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.02301776, + "balance_loss_mlp": 1.04464579, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.1353598877924806, + "language_loss": 0.81958085, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84137177, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 2993, + "time_per_iteration": 2.4349074363708496 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.02889621, + "balance_loss_mlp": 1.04586673, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7254805142405878, + "language_loss": 0.78390837, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80581975, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2994, + "time_per_iteration": 2.4898691177368164 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.04966068, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.8689491925476576, + "language_loss": 0.80392146, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82584035, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2995, + "time_per_iteration": 2.4521572589874268 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.04679298, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5635152056288029, + "language_loss": 0.84467834, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86658335, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.97265625, + "step": 2996, + "time_per_iteration": 2.46993088722229 + }, + { + "auxiliary_loss_clip": 0.01139788, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.04656756, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.9197857622903793, + "language_loss": 0.88254511, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90436304, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 2997, + "time_per_iteration": 2.470113515853882 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.02435279, + "balance_loss_mlp": 1.04666877, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.5635961030192935, + "language_loss": 0.8504566, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87237728, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2998, + "time_per_iteration": 2.5252864360809326 + }, + { + "auxiliary_loss_clip": 0.0114547, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783537, + "balance_loss_mlp": 1.05022454, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8695557812200347, + "language_loss": 0.84270376, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86460871, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 2999, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.01143823, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.02577412, + "balance_loss_mlp": 1.04662383, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7700032623605295, + "language_loss": 0.74753368, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76941276, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 3000, + "time_per_iteration": 2.4800922870635986 + }, + { + "auxiliary_loss_clip": 0.01143329, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.04111791, + "balance_loss_mlp": 1.04825568, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.369063359757221, + "language_loss": 0.71625632, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73829091, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3001, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.03010964, + "balance_loss_mlp": 1.04815364, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.7240097708601225, + "language_loss": 0.87795258, + "learning_rate": 3.767096425420011e-06, + "loss": 0.89992571, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 3002, + "time_per_iteration": 2.4881784915924072 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02689672, + "balance_loss_mlp": 1.04694915, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6880476069492312, + "language_loss": 0.80563951, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8275311, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9765625, + "step": 3003, + "time_per_iteration": 2.452103614807129 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.04780829, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.4630533980116804, + "language_loss": 0.66931474, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69124347, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3004, + "time_per_iteration": 2.5085701942443848 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.04860806, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.8927608809249736, + "language_loss": 0.85172975, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87370586, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.984375, + "step": 3005, + "time_per_iteration": 2.44529128074646 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02611172, + "balance_loss_mlp": 1.04684031, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.553419886600377, + "language_loss": 0.82951266, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85135704, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94921875, + "step": 3006, + "time_per_iteration": 2.532597780227661 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.0315007, + "balance_loss_mlp": 1.04581141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6363768703600998, + "language_loss": 0.76883924, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79078454, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.98046875, + "step": 3007, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.01358199, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8067080511403597, + "language_loss": 0.56949043, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59000474, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 3008, + "time_per_iteration": 3.1923961639404297 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.04951596, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8063105677439477, + "language_loss": 0.67226636, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69423479, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3009, + "time_per_iteration": 2.467525005340576 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02993655, + "balance_loss_mlp": 1.04874969, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.842230928142314, + "language_loss": 0.75573891, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77769208, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.98828125, + "step": 3010, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.02585649, + "balance_loss_mlp": 1.04816866, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6130539386655762, + "language_loss": 0.66672593, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6885612, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3011, + "time_per_iteration": 2.461749792098999 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.0309006, + "balance_loss_mlp": 1.04706419, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.1517129990512927, + "language_loss": 0.71184897, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73375839, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3012, + "time_per_iteration": 2.7380943298339844 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.03045654, + "balance_loss_mlp": 1.05109787, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.2489260815019447, + "language_loss": 0.62039113, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64232826, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3013, + "time_per_iteration": 2.5800936222076416 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.04870379, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.5535403171237991, + "language_loss": 0.76026124, + "learning_rate": 3.764902795998309e-06, + "loss": 0.7822203, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3014, + "time_per_iteration": 2.5049405097961426 + }, + { + "auxiliary_loss_clip": 0.01151342, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.02697504, + "balance_loss_mlp": 1.05086446, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7733972454950666, + "language_loss": 0.65696967, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67894971, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3015, + "time_per_iteration": 2.52614426612854 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.0262742, + "balance_loss_mlp": 1.0490694, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7500400577379265, + "language_loss": 0.7809943, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80287266, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3016, + "time_per_iteration": 2.4736039638519287 + }, + { + "auxiliary_loss_clip": 0.01152649, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.03214788, + "balance_loss_mlp": 1.05294776, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6390488083316745, + "language_loss": 0.83498454, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85701871, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 3017, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01142751, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.0486486, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2301629944757964, + "language_loss": 0.67067724, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69249976, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3018, + "time_per_iteration": 3.950299024581909 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.04928112, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.174717508383113, + "language_loss": 0.75745898, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77930045, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 3019, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02653718, + "balance_loss_mlp": 1.05230832, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1373464597463574, + "language_loss": 0.81687438, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83882844, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3020, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02373672, + "balance_loss_mlp": 1.05124021, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.9178918869439654, + "language_loss": 0.77220714, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.96875, + "step": 3021, + "time_per_iteration": 2.4856297969818115 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.04617524, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7066661124221545, + "language_loss": 0.84841502, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87025082, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3022, + "time_per_iteration": 2.4933700561523438 + }, + { + "auxiliary_loss_clip": 0.01148694, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.0491302, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9452352079001236, + "language_loss": 0.69178426, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7136941, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3023, + "time_per_iteration": 2.495107412338257 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.04748738, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9417078000950883, + "language_loss": 0.73956865, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76145792, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3024, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.0490942, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.344564071286257, + "language_loss": 0.88167858, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90356255, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3025, + "time_per_iteration": 2.4708051681518555 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01052568, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.05046904, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.755473586939447, + "language_loss": 0.79284346, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.8148272, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3026, + "time_per_iteration": 2.482987403869629 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.03424227, + "balance_loss_mlp": 1.0502665, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6571051349992714, + "language_loss": 0.76047945, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78250599, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98828125, + "step": 3027, + "time_per_iteration": 2.4952149391174316 + }, + { + "auxiliary_loss_clip": 0.01151758, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.03055763, + "balance_loss_mlp": 1.05106115, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.7989426432275553, + "language_loss": 0.85400331, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87601155, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3028, + "time_per_iteration": 2.438113212585449 + }, + { + "auxiliary_loss_clip": 0.01144845, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.04937243, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8205418995180693, + "language_loss": 0.82655656, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84843719, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3029, + "time_per_iteration": 2.4866995811462402 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.02640462, + "balance_loss_mlp": 1.05306637, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0975281503542433, + "language_loss": 0.78150737, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80348092, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3030, + "time_per_iteration": 2.458627700805664 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.02495515, + "balance_loss_mlp": 1.05141127, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.955618442063123, + "language_loss": 0.85318518, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87512928, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.99609375, + "step": 3031, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01045881, + "balance_loss_clip": 1.02824974, + "balance_loss_mlp": 1.05232072, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8744751837074634, + "language_loss": 0.79713088, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3032, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.0260191, + "balance_loss_mlp": 1.05395341, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.0774072235136964, + "language_loss": 0.81420642, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8362143, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0234375, + "step": 3033, + "time_per_iteration": 2.47562575340271 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01006645, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01995599, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8883360043233282, + "language_loss": 0.63479006, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6553843, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.328125, + "step": 3034, + "time_per_iteration": 2.9712142944335938 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.0263083, + "balance_loss_mlp": 1.05033147, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.0132790953316113, + "language_loss": 0.79684323, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81876773, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3035, + "time_per_iteration": 2.4517030715942383 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.02792096, + "balance_loss_mlp": 1.05231702, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.217606261766961, + "language_loss": 0.84895855, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87087989, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3036, + "time_per_iteration": 2.5017378330230713 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.02524662, + "balance_loss_mlp": 1.04940438, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.216717642760365, + "language_loss": 0.79836094, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82021284, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3037, + "time_per_iteration": 2.4591338634490967 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.05208671, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.68131613553598, + "language_loss": 0.79450762, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81647676, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.9765625, + "step": 3038, + "time_per_iteration": 2.440664768218994 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.03069699, + "balance_loss_mlp": 1.05140162, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3213350225315748, + "language_loss": 0.67311364, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69506919, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3039, + "time_per_iteration": 2.573272466659546 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03323567, + "balance_loss_mlp": 1.05112875, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.9125298187860031, + "language_loss": 0.73687911, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75888336, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3040, + "time_per_iteration": 2.771242618560791 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.04849768, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.8780343880464916, + "language_loss": 0.60176188, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62363702, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3041, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.03756928, + "balance_loss_mlp": 1.05012786, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7488247873746179, + "language_loss": 0.60361505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.6256364, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3042, + "time_per_iteration": 2.7942960262298584 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.04945385, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.6831322617730042, + "language_loss": 0.8769263, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.8988626, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94921875, + "step": 3043, + "time_per_iteration": 2.524871587753296 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.05107832, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9464603469819268, + "language_loss": 0.707008, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72899425, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3044, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01055406, + "balance_loss_clip": 1.03552175, + "balance_loss_mlp": 1.04929996, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.0901220952627497, + "language_loss": 0.64385587, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66591471, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 3045, + "time_per_iteration": 2.592855453491211 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.04977548, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.998731206361719, + "language_loss": 0.79165137, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81365317, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3046, + "time_per_iteration": 2.5034587383270264 + }, + { + "auxiliary_loss_clip": 0.01146985, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.04764223, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3529268295267016, + "language_loss": 0.78991181, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81186271, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 3047, + "time_per_iteration": 2.5140535831451416 + }, + { + "auxiliary_loss_clip": 0.01145799, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02923381, + "balance_loss_mlp": 1.05111742, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5613113238500957, + "language_loss": 0.80888635, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83081341, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3048, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.02283192, + "balance_loss_mlp": 1.0502528, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8161394933049422, + "language_loss": 0.86232805, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88422966, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9765625, + "step": 3049, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.05159521, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.2703740748038066, + "language_loss": 0.77160966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79358685, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 3050, + "time_per_iteration": 2.4525256156921387 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.02683592, + "balance_loss_mlp": 1.04867804, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.010292972394078, + "language_loss": 0.99174476, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.0136615, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3051, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01145751, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.050529, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5992624239842805, + "language_loss": 0.86153144, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3052, + "time_per_iteration": 2.559396505355835 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.03267264, + "balance_loss_mlp": 1.04985499, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8182752776897229, + "language_loss": 0.73004341, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75200558, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3053, + "time_per_iteration": 2.4481074810028076 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03051662, + "balance_loss_mlp": 1.05208337, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6467304764216655, + "language_loss": 0.62212563, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64412701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 3054, + "time_per_iteration": 2.5701377391815186 + }, + { + "auxiliary_loss_clip": 0.01146023, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.04962707, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2210920593094325, + "language_loss": 0.78501689, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80690485, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3055, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.04779387, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.894881128028073, + "language_loss": 0.70218527, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72414786, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3056, + "time_per_iteration": 2.541361093521118 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.02612543, + "balance_loss_mlp": 1.05066419, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4932354373853338, + "language_loss": 0.8028152, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82474422, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3057, + "time_per_iteration": 2.4718995094299316 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.02864265, + "balance_loss_mlp": 1.04847729, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0112890674266914, + "language_loss": 0.82289785, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84491444, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 3058, + "time_per_iteration": 2.4653379917144775 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_clip": 1.02818882, + "balance_loss_mlp": 1.04893029, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.205773819593527, + "language_loss": 0.85894352, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88088906, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 3059, + "time_per_iteration": 4.0151047706604 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.02724195, + "balance_loss_mlp": 1.04931092, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.70952354928268, + "language_loss": 0.72799402, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74990445, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3060, + "time_per_iteration": 5.466471195220947 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.05253565, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7373746338425942, + "language_loss": 0.72797298, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74991357, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.98828125, + "step": 3061, + "time_per_iteration": 2.5244035720825195 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.02697313, + "balance_loss_mlp": 1.05087519, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8714044833418495, + "language_loss": 0.81622046, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83814156, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3062, + "time_per_iteration": 2.4767649173736572 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.02681041, + "balance_loss_mlp": 1.05394542, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.7582970194369052, + "language_loss": 0.72718614, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74918652, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3063, + "time_per_iteration": 2.5082144737243652 + }, + { + "auxiliary_loss_clip": 0.01146453, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.04935837, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.1216519555610183, + "language_loss": 0.65496099, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6768434, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3064, + "time_per_iteration": 2.523141622543335 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.03081298, + "balance_loss_mlp": 1.05274165, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.6163412642887947, + "language_loss": 0.68768656, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70966971, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3065, + "time_per_iteration": 2.5244293212890625 + }, + { + "auxiliary_loss_clip": 0.01151353, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.02819824, + "balance_loss_mlp": 1.05120087, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 4.932084281869228, + "language_loss": 0.72561431, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74760187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3066, + "time_per_iteration": 2.5428919792175293 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.05074954, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9825677919996112, + "language_loss": 0.82477474, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84669906, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3067, + "time_per_iteration": 2.4500880241394043 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00535476, + "balance_loss_mlp": 1.01668859, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7924805733675573, + "language_loss": 0.59706604, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61763, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32226562, + "step": 3068, + "time_per_iteration": 2.9375104904174805 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.05714762, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8708990955689164, + "language_loss": 0.76227212, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78420615, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3069, + "time_per_iteration": 2.462446451187134 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.02233863, + "balance_loss_mlp": 1.05299067, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7428293735192475, + "language_loss": 0.84803855, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86996043, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3070, + "time_per_iteration": 2.4887194633483887 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.02462363, + "balance_loss_mlp": 1.05298758, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.9722863584187038, + "language_loss": 0.77370453, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79565221, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 3071, + "time_per_iteration": 2.482213258743286 + }, + { + "auxiliary_loss_clip": 0.01152228, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.05342758, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.705053980849468, + "language_loss": 0.77691031, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79891801, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 3072, + "time_per_iteration": 2.466387987136841 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.05013216, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8173375196390826, + "language_loss": 0.8607235, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88264889, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3073, + "time_per_iteration": 2.4510810375213623 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.05339348, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2059027996031877, + "language_loss": 0.92005521, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9420172, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.97265625, + "step": 3074, + "time_per_iteration": 2.473710298538208 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.02490735, + "balance_loss_mlp": 1.05028176, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9913742546968862, + "language_loss": 0.65041798, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67233044, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3075, + "time_per_iteration": 2.533724784851074 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.053177, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.709240712607824, + "language_loss": 0.72323918, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74516779, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3076, + "time_per_iteration": 2.4544899463653564 + }, + { + "auxiliary_loss_clip": 0.01153692, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.0280292, + "balance_loss_mlp": 1.05341136, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4900368363969854, + "language_loss": 0.80860448, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83060181, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3077, + "time_per_iteration": 2.45137882232666 + }, + { + "auxiliary_loss_clip": 0.01146798, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02816749, + "balance_loss_mlp": 1.05103469, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7908770900539794, + "language_loss": 0.78764129, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8095665, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3078, + "time_per_iteration": 2.477393865585327 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02714717, + "balance_loss_mlp": 1.05057585, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8549646444276375, + "language_loss": 0.7758081, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773009, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9765625, + "step": 3079, + "time_per_iteration": 2.5069448947906494 + }, + { + "auxiliary_loss_clip": 0.01143899, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.02581406, + "balance_loss_mlp": 1.04723024, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.3452692712375893, + "language_loss": 0.81668431, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83855557, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3080, + "time_per_iteration": 2.688206911087036 + }, + { + "auxiliary_loss_clip": 0.01149652, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.02812803, + "balance_loss_mlp": 1.05079699, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.0276132956863764, + "language_loss": 0.7435087, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.7654745, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3081, + "time_per_iteration": 2.5003983974456787 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.03124547, + "balance_loss_mlp": 1.05527234, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.070281784994394, + "language_loss": 0.71532816, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73734742, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9609375, + "step": 3082, + "time_per_iteration": 2.514004707336426 + }, + { + "auxiliary_loss_clip": 0.011545, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03155267, + "balance_loss_mlp": 1.05488813, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 1.869200996989063, + "language_loss": 0.69338834, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71543807, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3083, + "time_per_iteration": 2.446418523788452 + }, + { + "auxiliary_loss_clip": 0.0114679, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03187287, + "balance_loss_mlp": 1.05216169, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 4.022344342016001, + "language_loss": 0.68854296, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71050388, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3084, + "time_per_iteration": 2.5964090824127197 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.04961908, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5883609883793584, + "language_loss": 0.77831411, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80020249, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3085, + "time_per_iteration": 2.500401020050049 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.04887915, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8880953488015286, + "language_loss": 0.73488086, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7568658, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3086, + "time_per_iteration": 2.5121798515319824 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.02949429, + "balance_loss_mlp": 1.05223882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 4.074676999617497, + "language_loss": 0.70087367, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72282737, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.953125, + "step": 3087, + "time_per_iteration": 2.469980001449585 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.05118215, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.299065028063824, + "language_loss": 0.72731185, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74929065, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3088, + "time_per_iteration": 2.4569249153137207 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02733839, + "balance_loss_mlp": 1.05015588, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.023411505730453, + "language_loss": 0.91849768, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94039273, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94140625, + "step": 3089, + "time_per_iteration": 2.5086276531219482 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02573323, + "balance_loss_mlp": 1.05124271, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.7535733421879174, + "language_loss": 0.57406759, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59596992, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.953125, + "step": 3090, + "time_per_iteration": 2.544934034347534 + }, + { + "auxiliary_loss_clip": 0.011443, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03133333, + "balance_loss_mlp": 1.04945779, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9526543189913628, + "language_loss": 0.82229531, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84423304, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3091, + "time_per_iteration": 2.5339536666870117 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03165662, + "balance_loss_mlp": 1.05212235, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0588011246991127, + "language_loss": 0.83561456, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85760063, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3092, + "time_per_iteration": 2.5091474056243896 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.05010569, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 4.142827775979207, + "language_loss": 0.93487823, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95683277, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 3093, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01146588, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.02115917, + "balance_loss_mlp": 1.05090082, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.060946690404802, + "language_loss": 0.77380008, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79564774, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3094, + "time_per_iteration": 2.4520375728607178 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03098452, + "balance_loss_mlp": 1.05099964, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6535165555915046, + "language_loss": 0.69985378, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72180283, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3095, + "time_per_iteration": 2.7395834922790527 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01045107, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05169249, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.9053555001005595, + "language_loss": 0.8077082, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82965505, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.98046875, + "step": 3096, + "time_per_iteration": 2.4506232738494873 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05086875, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.061308652340225, + "language_loss": 0.75101036, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77295941, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3097, + "time_per_iteration": 2.46639347076416 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.05196047, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.5365100966912664, + "language_loss": 0.66038394, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68231571, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3098, + "time_per_iteration": 2.46763014793396 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.02973545, + "balance_loss_mlp": 1.04978585, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6025275160282182, + "language_loss": 0.69907904, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72105503, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 3099, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.03290749, + "balance_loss_mlp": 1.04985309, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4888180158498334, + "language_loss": 0.71623552, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73823702, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 3100, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.03104091, + "balance_loss_mlp": 1.05147338, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.2181859131844757, + "language_loss": 0.80163074, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82364118, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3101, + "time_per_iteration": 4.007607936859131 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.05100489, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.082156961368248, + "language_loss": 0.76803768, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78991693, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3102, + "time_per_iteration": 5.438685894012451 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02322865, + "balance_loss_mlp": 1.04973269, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.5595226686006565, + "language_loss": 0.76962835, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79151165, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3103, + "time_per_iteration": 2.4742202758789062 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02540123, + "balance_loss_mlp": 1.05014729, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.966347666558745, + "language_loss": 0.79074025, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81264877, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3104, + "time_per_iteration": 2.4873924255371094 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.02653468, + "balance_loss_mlp": 1.05237842, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.943867006204371, + "language_loss": 0.8519029, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87382948, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3105, + "time_per_iteration": 2.488638162612915 + }, + { + "auxiliary_loss_clip": 0.01152184, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.02872288, + "balance_loss_mlp": 1.0491997, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.7838474228223986, + "language_loss": 0.86952424, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89152563, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 3106, + "time_per_iteration": 2.5103402137756348 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.05296755, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9680738799082358, + "language_loss": 0.78253353, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80451989, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 3107, + "time_per_iteration": 2.44567608833313 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.03179181, + "balance_loss_mlp": 1.05040216, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.9125203241398734, + "language_loss": 0.74114668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76316506, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3108, + "time_per_iteration": 2.5254971981048584 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.02684629, + "balance_loss_mlp": 1.05332017, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6536820415924105, + "language_loss": 0.74707133, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76903957, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98046875, + "step": 3109, + "time_per_iteration": 2.426945924758911 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.02845001, + "balance_loss_mlp": 1.05078959, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.4293009008592994, + "language_loss": 0.84324062, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86519247, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3110, + "time_per_iteration": 2.4744956493377686 + }, + { + "auxiliary_loss_clip": 0.01154611, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02472341, + "balance_loss_mlp": 1.05598927, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.633662412254079, + "language_loss": 0.84753799, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86951482, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3111, + "time_per_iteration": 2.4757230281829834 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.02408528, + "balance_loss_mlp": 1.05231404, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8513735900463348, + "language_loss": 0.76565534, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78757566, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9765625, + "step": 3112, + "time_per_iteration": 2.465552806854248 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.0516355, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8580615351340177, + "language_loss": 0.64277315, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66475397, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3113, + "time_per_iteration": 2.491805076599121 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.0528996, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.238258329288858, + "language_loss": 0.81043601, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83247173, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 3114, + "time_per_iteration": 2.4947290420532227 + }, + { + "auxiliary_loss_clip": 0.01153492, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.05319226, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2102322241331467, + "language_loss": 0.57819968, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0, + "step": 3115, + "time_per_iteration": 2.4892075061798096 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.05434299, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8141768865365742, + "language_loss": 0.71160758, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73368567, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96484375, + "step": 3116, + "time_per_iteration": 2.4705467224121094 + }, + { + "auxiliary_loss_clip": 0.01142667, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01751065, + "balance_loss_mlp": 1.04771161, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.8736078530078255, + "language_loss": 0.78733885, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80912256, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3117, + "time_per_iteration": 2.418527126312256 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02898717, + "balance_loss_mlp": 1.05421317, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.743274375857092, + "language_loss": 0.83945131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86145031, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.97265625, + "step": 3118, + "time_per_iteration": 2.5691416263580322 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.03109384, + "balance_loss_mlp": 1.0525409, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7594323212393352, + "language_loss": 0.76151264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78351927, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3119, + "time_per_iteration": 2.459648847579956 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.05181718, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.593515591831454, + "language_loss": 0.81975627, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84180319, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3120, + "time_per_iteration": 2.478870153427124 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.05178094, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.7598733043788508, + "language_loss": 0.8513701, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.873285, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3121, + "time_per_iteration": 2.5178277492523193 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.02976704, + "balance_loss_mlp": 1.05281448, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.163070382320244, + "language_loss": 0.70038795, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72237968, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 3122, + "time_per_iteration": 2.5523242950439453 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02524245, + "balance_loss_mlp": 1.05194402, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.352571744641408, + "language_loss": 0.7034744, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72541201, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9609375, + "step": 3123, + "time_per_iteration": 2.4145569801330566 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.05238771, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.0330816469172097, + "language_loss": 0.73851109, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76047611, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3124, + "time_per_iteration": 2.497352123260498 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.05275774, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9990758157966066, + "language_loss": 0.80601895, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82805508, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0, + "step": 3125, + "time_per_iteration": 2.605851411819458 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01009024, + "balance_loss_clip": 1.00697315, + "balance_loss_mlp": 1.02352476, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9386177249275542, + "language_loss": 0.63591504, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656781, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.328125, + "step": 3126, + "time_per_iteration": 3.0943961143493652 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.02366543, + "balance_loss_mlp": 1.05439222, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.7984129752859428, + "language_loss": 0.81274688, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83466977, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3127, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01054784, + "auxiliary_loss_mlp": 0.0100739, + "balance_loss_clip": 1.00543487, + "balance_loss_mlp": 1.02235639, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7620779230288282, + "language_loss": 0.6191628, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63978451, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.32421875, + "step": 3128, + "time_per_iteration": 3.1384503841400146 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.05182266, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.171302965646948, + "language_loss": 0.71237707, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73433876, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 3129, + "time_per_iteration": 2.560601234436035 + }, + { + "auxiliary_loss_clip": 0.01149923, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.05224252, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.040923932078449, + "language_loss": 0.85375232, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87576246, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3130, + "time_per_iteration": 2.4366040229797363 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.04844868, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9842347260172397, + "language_loss": 0.77227372, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0, + "step": 3131, + "time_per_iteration": 2.503112554550171 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 1.05402517, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8095346888628816, + "language_loss": 0.81244844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.834436, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.96875, + "step": 3132, + "time_per_iteration": 2.5265986919403076 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.02939904, + "balance_loss_mlp": 1.05395401, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.3595669444771135, + "language_loss": 0.79035556, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81238532, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3133, + "time_per_iteration": 2.500927209854126 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.03224421, + "balance_loss_mlp": 1.05204821, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 4.024150314183157, + "language_loss": 0.82826144, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3134, + "time_per_iteration": 2.4773380756378174 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.05027199, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4735244825899, + "language_loss": 0.82783771, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8497771, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96484375, + "step": 3135, + "time_per_iteration": 2.4957115650177 + }, + { + "auxiliary_loss_clip": 0.01149872, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.03343356, + "balance_loss_mlp": 1.0503304, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8513380433423674, + "language_loss": 0.79031271, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81233823, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9921875, + "step": 3136, + "time_per_iteration": 2.556800127029419 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.05327463, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9366242888645147, + "language_loss": 0.81049621, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 3137, + "time_per_iteration": 2.487513542175293 + }, + { + "auxiliary_loss_clip": 0.01151307, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.05406666, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5870634004860276, + "language_loss": 0.8119483, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83403158, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.97265625, + "step": 3138, + "time_per_iteration": 2.4554855823516846 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.05190897, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.760814692015778, + "language_loss": 0.636096, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6581319, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 3139, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01146092, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.03046215, + "balance_loss_mlp": 1.04812348, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.433795452320061, + "language_loss": 0.71546841, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73742986, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98046875, + "step": 3140, + "time_per_iteration": 2.4519457817077637 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 1.04848385, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.1391974719951574, + "language_loss": 0.87001872, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89196658, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98828125, + "step": 3141, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.02460694, + "balance_loss_mlp": 1.05144691, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.021325930100965, + "language_loss": 0.77418405, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79616946, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0234375, + "step": 3142, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.05104184, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6841374820722228, + "language_loss": 0.78446913, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80637825, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.97265625, + "step": 3143, + "time_per_iteration": 3.9074132442474365 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_clip": 1.03081727, + "balance_loss_mlp": 1.05069065, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 4.1822349926512485, + "language_loss": 0.71507585, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73707104, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 3144, + "time_per_iteration": 3.981715679168701 + }, + { + "auxiliary_loss_clip": 0.01152034, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.0513736, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.6203593578621893, + "language_loss": 0.73683178, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75880861, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3145, + "time_per_iteration": 2.5101706981658936 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.02783298, + "balance_loss_mlp": 1.04759097, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.6756165752276027, + "language_loss": 0.77081764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79271269, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3146, + "time_per_iteration": 2.4278056621551514 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.02811205, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.794796296308648, + "language_loss": 0.78377169, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80571997, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3147, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.0310235, + "balance_loss_mlp": 1.0499115, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.2769360880247853, + "language_loss": 0.67016155, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69212711, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3148, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01145427, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02037382, + "balance_loss_mlp": 1.04898858, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.644784357412393, + "language_loss": 0.75978655, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78161824, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3149, + "time_per_iteration": 2.4768459796905518 + }, + { + "auxiliary_loss_clip": 0.01143839, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.05033517, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.9181295874949735, + "language_loss": 0.81229341, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83420789, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3150, + "time_per_iteration": 2.42832088470459 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02886271, + "balance_loss_mlp": 1.05068374, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.066054594612055, + "language_loss": 0.84966886, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87161517, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3151, + "time_per_iteration": 2.458054542541504 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 1.04896331, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9128881662164896, + "language_loss": 0.7443462, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76635695, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.99609375, + "step": 3152, + "time_per_iteration": 2.4904792308807373 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.02937067, + "balance_loss_mlp": 1.0502255, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8661622565083957, + "language_loss": 0.75719136, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77914143, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3153, + "time_per_iteration": 2.5026283264160156 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.04962945, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.8393709351558127, + "language_loss": 0.79529279, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81725931, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 3154, + "time_per_iteration": 2.4544081687927246 + }, + { + "auxiliary_loss_clip": 0.01151316, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02919698, + "balance_loss_mlp": 1.04986668, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.673670363277482, + "language_loss": 0.72798991, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74998182, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 3155, + "time_per_iteration": 2.425431728363037 + }, + { + "auxiliary_loss_clip": 0.01145009, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.03042662, + "balance_loss_mlp": 1.04930019, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.676026678838244, + "language_loss": 0.73911691, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76105046, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3156, + "time_per_iteration": 2.4683640003204346 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.03281915, + "balance_loss_mlp": 1.05195308, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.5984593201401434, + "language_loss": 0.68251741, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70451397, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9765625, + "step": 3157, + "time_per_iteration": 2.472182512283325 + }, + { + "auxiliary_loss_clip": 0.01146139, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.02539706, + "balance_loss_mlp": 1.04914486, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.9937577865402571, + "language_loss": 0.80197155, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82386756, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3158, + "time_per_iteration": 2.4978723526000977 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02624583, + "balance_loss_mlp": 1.05201745, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.9065090881698699, + "language_loss": 0.71940476, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74138498, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 3159, + "time_per_iteration": 2.503129720687866 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.05255282, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8597759984302606, + "language_loss": 0.85071993, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8727113, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3160, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.03235734, + "balance_loss_mlp": 1.050807, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7120140162377986, + "language_loss": 0.73554128, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75746381, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3161, + "time_per_iteration": 2.5551726818084717 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.02982974, + "balance_loss_mlp": 1.05420387, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.554139282497156, + "language_loss": 0.80939364, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83137655, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3162, + "time_per_iteration": 2.609764337539673 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.02486265, + "balance_loss_mlp": 1.05257571, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 1.8884975109329094, + "language_loss": 0.75600141, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77792686, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3163, + "time_per_iteration": 2.4494824409484863 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 1.05577397, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5980783305445414, + "language_loss": 0.74197054, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76386476, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.94140625, + "step": 3164, + "time_per_iteration": 2.5901739597320557 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.02151656, + "balance_loss_mlp": 1.05402589, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5830796140792522, + "language_loss": 0.66913098, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69101042, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3165, + "time_per_iteration": 2.899500608444214 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.05282831, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1716027754337257, + "language_loss": 0.7452209, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76715726, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3166, + "time_per_iteration": 2.4325685501098633 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01017161, + "balance_loss_clip": 1.01490772, + "balance_loss_mlp": 1.02902174, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8067170187870535, + "language_loss": 0.50396568, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52476352, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.3359375, + "step": 3167, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.05208659, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7496006549093657, + "language_loss": 0.74235475, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76431435, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3168, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01004786, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.02649927, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8615778549663292, + "language_loss": 0.60097563, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6216197, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.33203125, + "step": 3169, + "time_per_iteration": 2.958176851272583 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.03371537, + "balance_loss_mlp": 1.05302989, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.550337238497042, + "language_loss": 0.77976263, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80180222, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.99609375, + "step": 3170, + "time_per_iteration": 2.5174756050109863 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.02960575, + "balance_loss_mlp": 1.05185819, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7193055204742105, + "language_loss": 0.78597021, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80789012, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3171, + "time_per_iteration": 2.4895551204681396 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.05111575, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 3.5246110250440386, + "language_loss": 0.78578937, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80772865, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3172, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.05253482, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.581476317811461, + "language_loss": 0.80126482, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82329178, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3173, + "time_per_iteration": 2.464979410171509 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.03432083, + "balance_loss_mlp": 1.05250478, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9222394249434893, + "language_loss": 0.78740567, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.8094219, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3174, + "time_per_iteration": 2.540959358215332 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.05367374, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8458147293094664, + "language_loss": 0.80757344, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82954776, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3175, + "time_per_iteration": 2.441190481185913 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03821599, + "balance_loss_mlp": 1.0521791, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.3562328324004445, + "language_loss": 0.85142022, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87347412, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3176, + "time_per_iteration": 2.4397072792053223 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.036515, + "balance_loss_mlp": 1.05395234, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.002060812172469, + "language_loss": 0.81206596, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415473, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3177, + "time_per_iteration": 2.4980266094207764 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.03073931, + "balance_loss_mlp": 1.0503974, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 1.9374450898751996, + "language_loss": 0.74628592, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3178, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.05001104, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8429055258583904, + "language_loss": 0.8167876, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83865643, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3179, + "time_per_iteration": 2.452310800552368 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02892482, + "balance_loss_mlp": 1.05279994, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.1508657656276484, + "language_loss": 0.7946887, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81664455, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3180, + "time_per_iteration": 2.451066732406616 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.02770984, + "balance_loss_mlp": 1.04780042, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.916741655382754, + "language_loss": 0.79891652, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82080674, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3181, + "time_per_iteration": 2.4310615062713623 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.03623664, + "balance_loss_mlp": 1.04858851, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7607714952320546, + "language_loss": 0.73820639, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76020634, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3182, + "time_per_iteration": 2.4712350368499756 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.03093314, + "balance_loss_mlp": 1.05187011, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8018319163421928, + "language_loss": 0.6486634, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67063105, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 3183, + "time_per_iteration": 2.440232753753662 + }, + { + "auxiliary_loss_clip": 0.01145449, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.02920759, + "balance_loss_mlp": 1.04864669, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.760716170695104, + "language_loss": 0.73234087, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7542752, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3184, + "time_per_iteration": 3.9211573600769043 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0281471, + "balance_loss_mlp": 1.04738748, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.1066155051108315, + "language_loss": 0.8784132, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9003495, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 3185, + "time_per_iteration": 5.396124601364136 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.02803612, + "balance_loss_mlp": 1.04899192, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.10102369978198, + "language_loss": 0.72667789, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74857807, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3186, + "time_per_iteration": 2.498241901397705 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02244437, + "balance_loss_mlp": 1.054919, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.264264166459479, + "language_loss": 0.83865881, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86061311, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 3187, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01051867, + "auxiliary_loss_mlp": 0.01015636, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01988959, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8634842964488614, + "language_loss": 0.55803859, + "learning_rate": 3.732018351516544e-06, + "loss": 0.5787136, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3188, + "time_per_iteration": 3.0815136432647705 + }, + { + "auxiliary_loss_clip": 0.01145462, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.04972625, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.71302722892552, + "language_loss": 0.70180511, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72381759, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.95703125, + "step": 3189, + "time_per_iteration": 2.5380465984344482 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.04853344, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.222159201352765, + "language_loss": 0.74234986, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76410198, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3190, + "time_per_iteration": 2.5862700939178467 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.04965627, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.8818377537371913, + "language_loss": 0.8394708, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86146975, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3191, + "time_per_iteration": 2.5077905654907227 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.04766488, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7694679756443132, + "language_loss": 0.89325655, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91504252, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3192, + "time_per_iteration": 2.4738776683807373 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.0531472, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.352703418633998, + "language_loss": 0.74830496, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77034831, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9765625, + "step": 3193, + "time_per_iteration": 2.47143816947937 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.04918766, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7283890992056894, + "language_loss": 0.74733245, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7692579, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9765625, + "step": 3194, + "time_per_iteration": 2.5001959800720215 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.00319958, + "balance_loss_mlp": 1.01851392, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7975785668902318, + "language_loss": 0.68455988, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70511955, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3203125, + "step": 3195, + "time_per_iteration": 3.014677047729492 + }, + { + "auxiliary_loss_clip": 0.01146296, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0254823, + "balance_loss_mlp": 1.05066323, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 1.9672517867074575, + "language_loss": 0.72712696, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74902254, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.95703125, + "step": 3196, + "time_per_iteration": 2.4855856895446777 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.0284251, + "balance_loss_mlp": 1.05643284, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8158077484015336, + "language_loss": 0.83774233, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85972691, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.953125, + "step": 3197, + "time_per_iteration": 2.4530181884765625 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02565312, + "balance_loss_mlp": 1.05036283, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.295881830513264, + "language_loss": 0.80459738, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82650983, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3198, + "time_per_iteration": 2.4882590770721436 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01050243, + "balance_loss_clip": 1.03090763, + "balance_loss_mlp": 1.04984999, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9800701307051174, + "language_loss": 0.7862891, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80827522, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3199, + "time_per_iteration": 2.507227659225464 + }, + { + "auxiliary_loss_clip": 0.01146428, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.05150342, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.05190707233933, + "language_loss": 0.83391261, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85580671, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.94921875, + "step": 3200, + "time_per_iteration": 2.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01051054, + "balance_loss_clip": 1.03286231, + "balance_loss_mlp": 1.0524931, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.0233550639398428, + "language_loss": 0.78678542, + "learning_rate": 3.729481161172443e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.95703125, + "step": 3201, + "time_per_iteration": 2.435478448867798 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.02874875, + "balance_loss_mlp": 1.05050445, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.1716175760371814, + "language_loss": 0.69168961, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71364617, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3202, + "time_per_iteration": 2.4596354961395264 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.01790023, + "balance_loss_mlp": 1.05140352, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7015130302687178, + "language_loss": 0.91123176, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93303871, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3203, + "time_per_iteration": 2.4425902366638184 + }, + { + "auxiliary_loss_clip": 0.01147002, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.03127956, + "balance_loss_mlp": 1.05008471, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.129263396651385, + "language_loss": 0.81766933, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83964062, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96875, + "step": 3204, + "time_per_iteration": 2.4466230869293213 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.03022599, + "balance_loss_mlp": 1.0497942, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.065510679734303, + "language_loss": 0.75797462, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77988648, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3205, + "time_per_iteration": 2.439906358718872 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02958953, + "balance_loss_mlp": 1.05312991, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.4125731541540465, + "language_loss": 0.83020669, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85218459, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 3206, + "time_per_iteration": 2.463888168334961 + }, + { + "auxiliary_loss_clip": 0.0104957, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.01731467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8499440783854421, + "language_loss": 0.60609913, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62663066, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 3207, + "time_per_iteration": 2.8865902423858643 + }, + { + "auxiliary_loss_clip": 0.01147085, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.02789569, + "balance_loss_mlp": 1.05069125, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4047527057594564, + "language_loss": 0.75119245, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77312136, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3208, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01146825, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.04890394, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.3372356299161696, + "language_loss": 0.60567236, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62762815, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3209, + "time_per_iteration": 2.4695677757263184 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.04981887, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9457412312791633, + "language_loss": 0.80153656, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82352048, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 3210, + "time_per_iteration": 2.6459405422210693 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.02437103, + "balance_loss_mlp": 1.04580569, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.107646167575127, + "language_loss": 0.82575119, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84755266, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3211, + "time_per_iteration": 2.454702615737915 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01012694, + "balance_loss_clip": 1.01057243, + "balance_loss_mlp": 1.01463401, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9758169311408023, + "language_loss": 0.63670558, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729511, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.31640625, + "step": 3212, + "time_per_iteration": 2.914459705352783 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.05140018, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5978218597026725, + "language_loss": 0.76514798, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78707075, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3213, + "time_per_iteration": 2.47961163520813 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.02823281, + "balance_loss_mlp": 1.04934072, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.5461953882780115, + "language_loss": 0.70799339, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72993791, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98046875, + "step": 3214, + "time_per_iteration": 2.4547488689422607 + }, + { + "auxiliary_loss_clip": 0.01142593, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.02808392, + "balance_loss_mlp": 1.0470041, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2459266127411848, + "language_loss": 0.75352395, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77541864, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3215, + "time_per_iteration": 2.4477176666259766 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.04626155, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.304207478946857, + "language_loss": 0.88559556, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90752971, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3216, + "time_per_iteration": 2.499464988708496 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.0474, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.5978066249985532, + "language_loss": 0.79762065, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8195231, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3217, + "time_per_iteration": 2.4428889751434326 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0294652, + "balance_loss_mlp": 1.0504688, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.6606972104147673, + "language_loss": 0.61408496, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63605893, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3218, + "time_per_iteration": 2.4313230514526367 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04883909, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.6811153728366703, + "language_loss": 0.80158418, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82342821, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3219, + "time_per_iteration": 2.4347593784332275 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.03010237, + "balance_loss_mlp": 1.05070114, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.032012314604138, + "language_loss": 0.85781908, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87976086, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3220, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.0477736, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.087292049011103, + "language_loss": 0.84617937, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86794209, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3221, + "time_per_iteration": 2.4601354598999023 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.05009556, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.075109928662421, + "language_loss": 0.85929954, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88121927, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3222, + "time_per_iteration": 2.433027505874634 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.02212656, + "balance_loss_mlp": 1.04663789, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 3.9278404759018053, + "language_loss": 0.78207982, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3223, + "time_per_iteration": 2.4451496601104736 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01047584, + "balance_loss_clip": 1.03013206, + "balance_loss_mlp": 1.04896808, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8200574771064912, + "language_loss": 0.75589085, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77776659, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3224, + "time_per_iteration": 2.4390981197357178 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.02274644, + "balance_loss_mlp": 1.04741263, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.092202382915022, + "language_loss": 0.71141279, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73321629, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3225, + "time_per_iteration": 2.6690707206726074 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.0279572, + "balance_loss_mlp": 1.04787326, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.058354492672399, + "language_loss": 0.6915803, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71344984, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9453125, + "step": 3226, + "time_per_iteration": 3.906217336654663 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.02562809, + "balance_loss_mlp": 1.05274427, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6131772564475266, + "language_loss": 0.76138854, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78327405, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 3227, + "time_per_iteration": 4.168737411499023 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.02547467, + "balance_loss_mlp": 1.04588878, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8539897665707572, + "language_loss": 0.69154215, + "learning_rate": 3.724176216414662e-06, + "loss": 0.7133761, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94921875, + "step": 3228, + "time_per_iteration": 2.4857404232025146 + }, + { + "auxiliary_loss_clip": 0.01142054, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02698135, + "balance_loss_mlp": 1.04929864, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9069922854616745, + "language_loss": 0.7428174, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76467812, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3229, + "time_per_iteration": 2.5357918739318848 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04832351, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6963766145995596, + "language_loss": 0.65157712, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67341059, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3230, + "time_per_iteration": 2.4796855449676514 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.0268054, + "balance_loss_mlp": 1.04652202, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8877471342298004, + "language_loss": 0.8184334, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84025759, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3231, + "time_per_iteration": 2.5315961837768555 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.02045608, + "balance_loss_mlp": 1.05067456, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.787689187471357, + "language_loss": 0.86743605, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88928306, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94921875, + "step": 3232, + "time_per_iteration": 2.4916152954101562 + }, + { + "auxiliary_loss_clip": 0.01141636, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.05008495, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5602267859616314, + "language_loss": 0.8513217, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87326247, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3233, + "time_per_iteration": 2.526118040084839 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.03603804, + "balance_loss_mlp": 1.04827857, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6631942166294669, + "language_loss": 0.89191484, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91390419, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96875, + "step": 3234, + "time_per_iteration": 2.4783849716186523 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.04675341, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.1776085062187374, + "language_loss": 0.78503513, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80690718, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3235, + "time_per_iteration": 2.4414284229278564 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.02545178, + "balance_loss_mlp": 1.05288744, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.115791514531618, + "language_loss": 0.7937218, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81560451, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.93359375, + "step": 3236, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02204323, + "balance_loss_mlp": 1.05156302, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.445233321344346, + "language_loss": 0.75936478, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78121042, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9296875, + "step": 3237, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01147227, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.03005719, + "balance_loss_mlp": 1.05079889, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.0921387862929586, + "language_loss": 0.75056225, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77250135, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96484375, + "step": 3238, + "time_per_iteration": 2.4795806407928467 + }, + { + "auxiliary_loss_clip": 0.01147117, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.05317962, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8233855681516762, + "language_loss": 0.73016453, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75208122, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94140625, + "step": 3239, + "time_per_iteration": 2.4695816040039062 + }, + { + "auxiliary_loss_clip": 0.01144581, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.03026247, + "balance_loss_mlp": 1.0505631, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.897973355517785, + "language_loss": 0.73792124, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75985241, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3240, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.05221701, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8797415358152445, + "language_loss": 0.66685343, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68873608, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94140625, + "step": 3241, + "time_per_iteration": 2.5644116401672363 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0280863, + "balance_loss_mlp": 1.05193758, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4346271942222966, + "language_loss": 0.82889283, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85078967, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3242, + "time_per_iteration": 2.476043701171875 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01010683, + "balance_loss_clip": 1.00856066, + "balance_loss_mlp": 1.02379096, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8482804620416572, + "language_loss": 0.57572454, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59637845, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.30859375, + "step": 3243, + "time_per_iteration": 3.1217525005340576 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.05099249, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.02063631868758, + "language_loss": 0.83243412, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85431218, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3244, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.05495024, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8275576625869878, + "language_loss": 0.77049786, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79245341, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3245, + "time_per_iteration": 2.5539040565490723 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.04852772, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.8639596298576055, + "language_loss": 0.84020388, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86203486, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3246, + "time_per_iteration": 2.5018341541290283 + }, + { + "auxiliary_loss_clip": 0.0114444, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.02834511, + "balance_loss_mlp": 1.04978824, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1267063345385777, + "language_loss": 0.7636531, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78555036, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9453125, + "step": 3247, + "time_per_iteration": 2.4512898921966553 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02228367, + "balance_loss_mlp": 1.05077446, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.4744510548582124, + "language_loss": 0.75330198, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77513552, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3248, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.04661679, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.9881324270373204, + "language_loss": 0.78316575, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80499399, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3249, + "time_per_iteration": 2.475385904312134 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.0316205, + "balance_loss_mlp": 1.04948914, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.839405294960197, + "language_loss": 0.73238158, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7543031, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3250, + "time_per_iteration": 2.4548323154449463 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02206779, + "balance_loss_mlp": 1.04583359, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9014920395959154, + "language_loss": 0.79582441, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8175652, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3251, + "time_per_iteration": 2.4597084522247314 + }, + { + "auxiliary_loss_clip": 0.01142571, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.04888558, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.2143497379473613, + "language_loss": 0.83534026, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85717964, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3252, + "time_per_iteration": 2.4245967864990234 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.03026652, + "balance_loss_mlp": 1.04651105, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7806404718622555, + "language_loss": 0.73870194, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76062191, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3253, + "time_per_iteration": 2.5752809047698975 + }, + { + "auxiliary_loss_clip": 0.01142809, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.02352846, + "balance_loss_mlp": 1.04619944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.833285648050628, + "language_loss": 0.76684111, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78867137, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.96484375, + "step": 3254, + "time_per_iteration": 2.533993721008301 + }, + { + "auxiliary_loss_clip": 0.01044914, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00268674, + "balance_loss_mlp": 1.01349974, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7652407497357797, + "language_loss": 0.55344874, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5739454, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.3125, + "step": 3255, + "time_per_iteration": 3.164173126220703 + }, + { + "auxiliary_loss_clip": 0.01144973, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02407217, + "balance_loss_mlp": 1.05057478, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.650975615707017, + "language_loss": 0.7066443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7285077, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3256, + "time_per_iteration": 2.496424436569214 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.04647136, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.256610935254856, + "language_loss": 0.80055118, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82237899, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3257, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05034149, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.9567741269254724, + "language_loss": 0.74843282, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77029151, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3258, + "time_per_iteration": 2.6177120208740234 + }, + { + "auxiliary_loss_clip": 0.01142767, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.01932144, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7410781544458231, + "language_loss": 0.74462247, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7664147, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3259, + "time_per_iteration": 2.54068660736084 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01923943, + "balance_loss_mlp": 1.04965675, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.380592438675979, + "language_loss": 0.77040654, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7922256, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3260, + "time_per_iteration": 2.4983303546905518 + }, + { + "auxiliary_loss_clip": 0.01143361, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.02459061, + "balance_loss_mlp": 1.0486325, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.011568492365706, + "language_loss": 0.82168972, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84354162, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3261, + "time_per_iteration": 2.52164626121521 + }, + { + "auxiliary_loss_clip": 0.01144228, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02510428, + "balance_loss_mlp": 1.05130327, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 2.1812525814986112, + "language_loss": 0.76691413, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78878343, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 3262, + "time_per_iteration": 2.513619899749756 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.05290008, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7175684177653927, + "language_loss": 0.8667773, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88867593, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3263, + "time_per_iteration": 2.49373459815979 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.04784787, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5660143494742738, + "language_loss": 0.74136549, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76319206, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9296875, + "step": 3264, + "time_per_iteration": 2.4891843795776367 + }, + { + "auxiliary_loss_clip": 0.0114591, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.03340793, + "balance_loss_mlp": 1.05435038, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 4.0742741532711975, + "language_loss": 0.78590196, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8078593, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3265, + "time_per_iteration": 2.4226529598236084 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01014864, + "balance_loss_clip": 1.01292133, + "balance_loss_mlp": 1.01652646, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7852387786228787, + "language_loss": 0.53459084, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55521357, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.30859375, + "step": 3266, + "time_per_iteration": 3.0519652366638184 + }, + { + "auxiliary_loss_clip": 0.01145434, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02263319, + "balance_loss_mlp": 1.04800785, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9728388819613873, + "language_loss": 0.80503136, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82690066, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3267, + "time_per_iteration": 2.436455011367798 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.02726591, + "balance_loss_mlp": 1.04780269, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.528633756775916, + "language_loss": 0.87031806, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89213896, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91015625, + "step": 3268, + "time_per_iteration": 5.348580360412598 + }, + { + "auxiliary_loss_clip": 0.01141651, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02865744, + "balance_loss_mlp": 1.04996669, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.7845337804652086, + "language_loss": 0.69331455, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71518886, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3269, + "time_per_iteration": 3.9386346340179443 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.03361702, + "balance_loss_mlp": 1.0530045, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 2.4386480468071086, + "language_loss": 0.80760634, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82960677, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3270, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.04726839, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.857854204827715, + "language_loss": 0.83918732, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86103886, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3271, + "time_per_iteration": 2.4522581100463867 + }, + { + "auxiliary_loss_clip": 0.01139583, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.0297302, + "balance_loss_mlp": 1.04943895, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.1376155358713835, + "language_loss": 0.80162311, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82348382, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 3272, + "time_per_iteration": 2.4968738555908203 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02766371, + "balance_loss_mlp": 1.05075002, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7855512393811417, + "language_loss": 0.80728978, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82919937, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3273, + "time_per_iteration": 2.525407552719116 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.04807115, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 5.081990879764466, + "language_loss": 0.7791425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80108881, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3274, + "time_per_iteration": 2.527858018875122 + }, + { + "auxiliary_loss_clip": 0.01141542, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03440571, + "balance_loss_mlp": 1.04765558, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.1984029701042367, + "language_loss": 0.81144857, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83338642, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9375, + "step": 3275, + "time_per_iteration": 2.451392412185669 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.02934027, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.90284229785688, + "language_loss": 0.81104618, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83295637, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3276, + "time_per_iteration": 2.462033748626709 + }, + { + "auxiliary_loss_clip": 0.01142306, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.02132106, + "balance_loss_mlp": 1.04889154, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.0909421048868126, + "language_loss": 0.89347923, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91528654, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3277, + "time_per_iteration": 2.4887003898620605 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02804041, + "balance_loss_mlp": 1.04832077, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.9974095646387573, + "language_loss": 0.62265754, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64459741, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3278, + "time_per_iteration": 2.560401201248169 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.03562284, + "balance_loss_mlp": 1.04910243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 3.1131920881239936, + "language_loss": 0.73664343, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75863284, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3279, + "time_per_iteration": 2.5036048889160156 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02816486, + "balance_loss_mlp": 1.04906511, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6310774806952162, + "language_loss": 0.82451236, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84641075, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.95703125, + "step": 3280, + "time_per_iteration": 2.499962091445923 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.05086279, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.847926035637751, + "language_loss": 0.77581155, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79770064, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3281, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.0507971, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.593504057665797, + "language_loss": 0.79502213, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81686652, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3282, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.0279994, + "balance_loss_mlp": 1.05359089, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.157912578421005, + "language_loss": 0.71937042, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7413193, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3283, + "time_per_iteration": 2.5070157051086426 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.04858577, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.741034644212953, + "language_loss": 0.78832877, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81017548, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3284, + "time_per_iteration": 2.436530113220215 + }, + { + "auxiliary_loss_clip": 0.01147439, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.02952087, + "balance_loss_mlp": 1.05069387, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0622477624774325, + "language_loss": 0.86366653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88561547, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96875, + "step": 3285, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.01143401, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02571201, + "balance_loss_mlp": 1.0520879, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.7361177014734372, + "language_loss": 0.88680863, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90866709, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3286, + "time_per_iteration": 2.472475290298462 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.03036189, + "balance_loss_mlp": 1.05260301, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2372981039860833, + "language_loss": 0.78297567, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80495083, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3287, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02674246, + "balance_loss_mlp": 1.04974318, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.197025185749627, + "language_loss": 0.81252837, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83444452, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96484375, + "step": 3288, + "time_per_iteration": 2.4107155799865723 + }, + { + "auxiliary_loss_clip": 0.01139417, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.04890108, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7615970311636253, + "language_loss": 0.72502065, + "learning_rate": 3.712015717627374e-06, + "loss": 0.74691164, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3289, + "time_per_iteration": 2.4479291439056396 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.0500598, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.0523474932115833, + "language_loss": 0.7944051, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81629974, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3290, + "time_per_iteration": 2.499950408935547 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.01002976, + "balance_loss_clip": 1.00056827, + "balance_loss_mlp": 1.01336336, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9098407078047199, + "language_loss": 0.60440773, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62489194, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.3203125, + "step": 3291, + "time_per_iteration": 3.1538305282592773 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.02639592, + "balance_loss_mlp": 1.04670751, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.151168561582294, + "language_loss": 0.81352198, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83541822, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3292, + "time_per_iteration": 2.539417028427124 + }, + { + "auxiliary_loss_clip": 0.01137712, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03051507, + "balance_loss_mlp": 1.04855824, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.212806192124084, + "language_loss": 0.82146955, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84332335, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 3293, + "time_per_iteration": 2.438809394836426 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.02988923, + "balance_loss_mlp": 1.05333924, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.10438249616411, + "language_loss": 0.61268854, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63468528, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3294, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01143209, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.02681279, + "balance_loss_mlp": 1.05004907, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.028666267444235, + "language_loss": 0.86983609, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89170212, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3295, + "time_per_iteration": 2.416771411895752 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.04786801, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 13.771873008268457, + "language_loss": 0.80491048, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82684338, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9375, + "step": 3296, + "time_per_iteration": 2.450934648513794 + }, + { + "auxiliary_loss_clip": 0.01145402, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.02637851, + "balance_loss_mlp": 1.0482688, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.0804115334054134, + "language_loss": 0.68406892, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70597816, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.97265625, + "step": 3297, + "time_per_iteration": 2.5111610889434814 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.02440548, + "balance_loss_mlp": 1.04895413, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7575465421519259, + "language_loss": 0.81232154, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83411407, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 3298, + "time_per_iteration": 2.472025156021118 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.02634668, + "balance_loss_mlp": 1.05001056, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.343960149367745, + "language_loss": 0.85115641, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.9765625, + "step": 3299, + "time_per_iteration": 2.4725356101989746 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.00097358, + "balance_loss_mlp": 1.0131526, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7731212371218976, + "language_loss": 0.53215671, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55264044, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3300, + "time_per_iteration": 3.004054069519043 + }, + { + "auxiliary_loss_clip": 0.01142157, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.04772329, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6138936044346288, + "language_loss": 0.73150593, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75344324, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9453125, + "step": 3301, + "time_per_iteration": 2.4547884464263916 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.02191293, + "balance_loss_mlp": 1.04811358, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.437382428027231, + "language_loss": 0.88445318, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90624458, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3302, + "time_per_iteration": 2.429579019546509 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.02940273, + "balance_loss_mlp": 1.04750872, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.9503370408087137, + "language_loss": 0.73907369, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76096445, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3303, + "time_per_iteration": 2.627835273742676 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.04874539, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8172241344194675, + "language_loss": 0.74761099, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76950562, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3304, + "time_per_iteration": 2.551241397857666 + }, + { + "auxiliary_loss_clip": 0.01139854, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02333784, + "balance_loss_mlp": 1.04763281, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 2.605019982075021, + "language_loss": 0.85717452, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.87896717, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3305, + "time_per_iteration": 2.432363986968994 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.04600525, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.7555780714506408, + "language_loss": 0.68014234, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70195889, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.953125, + "step": 3306, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01137681, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.02098584, + "balance_loss_mlp": 1.0453912, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4744708200758283, + "language_loss": 0.76455241, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.921875, + "step": 3307, + "time_per_iteration": 2.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.02520776, + "balance_loss_mlp": 1.04866791, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8666050855147507, + "language_loss": 0.75933248, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78115153, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3308, + "time_per_iteration": 2.483060121536255 + }, + { + "auxiliary_loss_clip": 0.01141228, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.02426159, + "balance_loss_mlp": 1.04736626, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6368693105847256, + "language_loss": 0.75640005, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7782228, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94140625, + "step": 3309, + "time_per_iteration": 3.8069632053375244 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.02869844, + "balance_loss_mlp": 1.04665506, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.6858420956549012, + "language_loss": 0.87646699, + "learning_rate": 3.707773333313917e-06, + "loss": 0.8983165, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9140625, + "step": 3310, + "time_per_iteration": 3.9299721717834473 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.02431977, + "balance_loss_mlp": 1.04637599, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.6845239503362412, + "language_loss": 0.64166129, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66346431, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3311, + "time_per_iteration": 2.5747337341308594 + }, + { + "auxiliary_loss_clip": 0.01143032, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.02559805, + "balance_loss_mlp": 1.04768658, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.462607887220823, + "language_loss": 0.74053729, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.953125, + "step": 3312, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.03060961, + "balance_loss_mlp": 1.04843581, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2841450786746016, + "language_loss": 0.83511955, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8569997, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3313, + "time_per_iteration": 2.4846627712249756 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.04944849, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.438256379955746, + "language_loss": 0.80930895, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83115256, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3314, + "time_per_iteration": 2.525754928588867 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.0280745, + "balance_loss_mlp": 1.04706359, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5137591341622172, + "language_loss": 0.87549174, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89729953, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3315, + "time_per_iteration": 2.5170931816101074 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.03032112, + "balance_loss_mlp": 1.04808092, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5984895942740787, + "language_loss": 0.71255141, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73443246, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3316, + "time_per_iteration": 2.520071029663086 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02646089, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8439111854473917, + "language_loss": 0.66260874, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68341696, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.33203125, + "step": 3317, + "time_per_iteration": 3.1460416316986084 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01052969, + "balance_loss_clip": 1.03557682, + "balance_loss_mlp": 1.04575253, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.672944172124665, + "language_loss": 0.74319738, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76515001, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3318, + "time_per_iteration": 2.6139748096466064 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.04536486, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.900050251198073, + "language_loss": 0.78860074, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81038487, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.89453125, + "step": 3319, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.04806578, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0962453666662073, + "language_loss": 0.75462162, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.93359375, + "step": 3320, + "time_per_iteration": 2.739485263824463 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02613819, + "balance_loss_mlp": 1.04714417, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.167317842134812, + "language_loss": 0.80547488, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3321, + "time_per_iteration": 2.581353187561035 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.01003433, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.01694489, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.880630206553271, + "language_loss": 0.65178835, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67231572, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.32421875, + "step": 3322, + "time_per_iteration": 2.9042704105377197 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 1.01724231, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7916622121471568, + "language_loss": 0.56975091, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028506, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.328125, + "step": 3323, + "time_per_iteration": 3.2141411304473877 + }, + { + "auxiliary_loss_clip": 0.01139547, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.04839373, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.9849201654975537, + "language_loss": 0.80526733, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82701647, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3324, + "time_per_iteration": 2.5455262660980225 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.04540765, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.8681208438308643, + "language_loss": 0.53681695, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55859387, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91796875, + "step": 3325, + "time_per_iteration": 2.581782102584839 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.02337289, + "balance_loss_mlp": 1.04565668, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0672953846254027, + "language_loss": 0.86169922, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88347936, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3326, + "time_per_iteration": 2.494718551635742 + }, + { + "auxiliary_loss_clip": 0.01138244, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.04851878, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8653522915536895, + "language_loss": 0.71835959, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74012172, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3327, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02368546, + "balance_loss_mlp": 1.04750776, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.83111198959611, + "language_loss": 0.76588571, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78772372, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3328, + "time_per_iteration": 2.5083916187286377 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 71900940, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.8067294895459533e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/training_args.bin b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..d5146bc02eb18c01fbb1631d22c9f876fbbc43c1 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5772a84283358851660a992e3ef3d119802bea563a14b9beb853b7645b76d38 +size 7992 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/zero_to_fp32.py b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-3328/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/added_tokens.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97409ed874967d8d79c126c028d286e8fe8e1484 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/generation_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/latest b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/latest new file mode 100644 index 0000000000000000000000000000000000000000..24f37f789c4e6eb86270647db8ff45788e484aa2 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/latest @@ -0,0 +1 @@ +global_step6656 \ No newline at end of file diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00001-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bf7764e05f40249d234ba6cfd1b88a871d6d1e53 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d9a5de9e4e7f7928c12c2e0ee0e4a5c78cb8224e62590abe707c09ea47d8243 +size 4972489328 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00002-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..af6b2cb9b5fa833fb053ea6e1fd668f5823466e3 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:963e957df91dea87d02256f10c7783844c5ba82a6d0e96d72a6acfdf88b520ea +size 4985529648 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00003-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..43d444ecef6a86f4743b0f579ee3573724416f85 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5eca10d8ec0a8889bfe8f3eaaf7d5ffbb48c8849b19bdd21e939a1003a0c510 +size 248943552 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model.safetensors.index.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..aa54419fc0a3eab502aa7c4ad974dca52ed10803 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10206819456 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_0.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1e6773e944015af0e83161fa2d20fe7d469fd7f --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22271cc36f268c0b3e870b3930ac590fd40a4a3cd3a88aed74f78e5f8790aceb +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_1.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a834a7be015ebd36883cec3bb92a8657936cd0a6 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19435e9be5d4b837d96fc2e9286e23e27344bb6ad3222ef1b9d207e6b2bb8c78 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_2.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f1b991258d274ff5481ace768d5b6702d919d50 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2902ec572b1b2f1a6a78f8979353bf31953eacdc78b129cc34a9f04c1de9b8d5 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_3.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee742fbd21912a77c2d25fe5ca60af4403668637 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a011e80ba323d1fcabf31eaea4d2bc397efadb23603b4248f0067ff8ca3987 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/special_tokens_map.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/tokenizer.model b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/tokenizer_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/trainer_state.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..71e2573b86e8bd65dd0d028ba4b172d4f4338b3c --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/trainer_state.json @@ -0,0 +1,113185 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4001803697580039, + "eval_steps": 500, + "global_step": 6656, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05015663, + "auxiliary_loss_mlp": 0.02215404, + "balance_loss_clip": 1.76946592, + "balance_loss_mlp": 2.42247009, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.31846269900138, + "language_loss": 2.84849024, + "learning_rate": 0.0, + "loss": 1.94356799, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 26.0, + "step": 1, + "time_per_iteration": 14.062297821044922 + }, + { + "auxiliary_loss_clip": 0.03371575, + "auxiliary_loss_mlp": 0.01459085, + "balance_loss_clip": 1.18919563, + "balance_loss_mlp": 1.61943495, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.71678092445231, + "language_loss": 1.82690942, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87521601, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 17.5, + "step": 2, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.03311525, + "auxiliary_loss_mlp": 0.014397, + "balance_loss_clip": 1.18697679, + "balance_loss_mlp": 1.61685562, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 34.59102075188436, + "language_loss": 1.57529902, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62281132, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 17.0, + "step": 3, + "time_per_iteration": 2.4145541191101074 + }, + { + "auxiliary_loss_clip": 0.03353861, + "auxiliary_loss_mlp": 0.01449549, + "balance_loss_clip": 1.15390992, + "balance_loss_mlp": 1.61571431, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.728740512395206, + "language_loss": 1.67595887, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72399294, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.375, + "step": 4, + "time_per_iteration": 2.466392993927002 + }, + { + "auxiliary_loss_clip": 0.03393634, + "auxiliary_loss_mlp": 0.01505687, + "balance_loss_clip": 1.21710527, + "balance_loss_mlp": 1.61638641, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.74196654651921, + "language_loss": 1.90851176, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95750499, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 17.75, + "step": 5, + "time_per_iteration": 2.6828246116638184 + }, + { + "auxiliary_loss_clip": 0.03361898, + "auxiliary_loss_mlp": 0.01518906, + "balance_loss_clip": 1.22441149, + "balance_loss_mlp": 1.60614848, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.32400799743486, + "language_loss": 1.6094954, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6583035, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.5, + "step": 6, + "time_per_iteration": 2.660855770111084 + }, + { + "auxiliary_loss_clip": 0.03345758, + "auxiliary_loss_mlp": 0.01485904, + "balance_loss_clip": 1.20209074, + "balance_loss_mlp": 1.60783124, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.76365346454933, + "language_loss": 1.53346825, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58178496, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.375, + "step": 7, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.03312894, + "auxiliary_loss_mlp": 0.01444018, + "balance_loss_clip": 1.16630852, + "balance_loss_mlp": 1.60320723, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.923588970831496, + "language_loss": 1.43687642, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48444545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 17.0, + "step": 8, + "time_per_iteration": 2.779961109161377 + }, + { + "auxiliary_loss_clip": 0.03360351, + "auxiliary_loss_mlp": 0.01496215, + "balance_loss_clip": 1.21144783, + "balance_loss_mlp": 1.60258842, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.084887526361417, + "language_loss": 1.49955618, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54812181, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.5, + "step": 9, + "time_per_iteration": 2.799635887145996 + }, + { + "auxiliary_loss_clip": 0.03302188, + "auxiliary_loss_mlp": 0.01477479, + "balance_loss_clip": 1.20797062, + "balance_loss_mlp": 1.6070832, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.45187310710616, + "language_loss": 1.44727731, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49507403, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 17.0, + "step": 10, + "time_per_iteration": 2.6989152431488037 + }, + { + "auxiliary_loss_clip": 0.03356835, + "auxiliary_loss_mlp": 0.01493566, + "balance_loss_clip": 1.21928966, + "balance_loss_mlp": 1.61121845, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.63867113279811, + "language_loss": 1.45021069, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.4987148, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 17.5, + "step": 11, + "time_per_iteration": 2.6820693016052246 + }, + { + "auxiliary_loss_clip": 0.0328584, + "auxiliary_loss_mlp": 0.01449969, + "balance_loss_clip": 1.17378449, + "balance_loss_mlp": 1.59900761, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.861449854609447, + "language_loss": 1.45122719, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49858522, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 16.875, + "step": 12, + "time_per_iteration": 2.631218910217285 + }, + { + "auxiliary_loss_clip": 0.03313605, + "auxiliary_loss_mlp": 0.01404342, + "balance_loss_clip": 1.14589679, + "balance_loss_mlp": 1.60898232, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.176593153687291, + "language_loss": 1.24100113, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28818083, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 17.125, + "step": 13, + "time_per_iteration": 2.6961779594421387 + }, + { + "auxiliary_loss_clip": 0.03282163, + "auxiliary_loss_mlp": 0.01472629, + "balance_loss_clip": 1.20464635, + "balance_loss_mlp": 1.60534358, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.7580183597057975, + "language_loss": 1.20611417, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25366211, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 16.75, + "step": 14, + "time_per_iteration": 2.6555092334747314 + }, + { + "auxiliary_loss_clip": 0.0326835, + "auxiliary_loss_mlp": 0.01431945, + "balance_loss_clip": 1.16815877, + "balance_loss_mlp": 1.6104542, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.4839782289009085, + "language_loss": 1.12832427, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.1753273, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 16.5, + "step": 15, + "time_per_iteration": 2.717512607574463 + }, + { + "auxiliary_loss_clip": 0.03231722, + "auxiliary_loss_mlp": 0.01412441, + "balance_loss_clip": 1.16257811, + "balance_loss_mlp": 1.59521294, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.584872954405151, + "language_loss": 1.1119349, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15837646, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 16.375, + "step": 16, + "time_per_iteration": 2.7170701026916504 + }, + { + "auxiliary_loss_clip": 0.03220058, + "auxiliary_loss_mlp": 0.0141779, + "balance_loss_clip": 1.17784595, + "balance_loss_mlp": 1.60289145, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.285773165398426, + "language_loss": 1.1253047, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17168307, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 16.125, + "step": 17, + "time_per_iteration": 2.6125564575195312 + }, + { + "auxiliary_loss_clip": 0.0315575, + "auxiliary_loss_mlp": 0.01378857, + "balance_loss_clip": 1.14730477, + "balance_loss_mlp": 1.60051179, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.8094646515897193, + "language_loss": 1.08149433, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12684035, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 15.5625, + "step": 18, + "time_per_iteration": 5.593315362930298 + }, + { + "auxiliary_loss_clip": 0.03181327, + "auxiliary_loss_mlp": 0.01400224, + "balance_loss_clip": 1.13548398, + "balance_loss_mlp": 1.59901524, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.551402579460018, + "language_loss": 1.02296436, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06877995, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 15.8125, + "step": 19, + "time_per_iteration": 2.6462903022766113 + }, + { + "auxiliary_loss_clip": 0.0312444, + "auxiliary_loss_mlp": 0.01341166, + "balance_loss_clip": 1.12096262, + "balance_loss_mlp": 1.60122275, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.049985155187145, + "language_loss": 1.16660511, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21126115, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 15.25, + "step": 20, + "time_per_iteration": 2.687962293624878 + }, + { + "auxiliary_loss_clip": 0.03111088, + "auxiliary_loss_mlp": 0.01380381, + "balance_loss_clip": 1.13109064, + "balance_loss_mlp": 1.58184814, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 8.855966691950416, + "language_loss": 1.06044388, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.1053586, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 15.3125, + "step": 21, + "time_per_iteration": 2.705784320831299 + }, + { + "auxiliary_loss_clip": 0.03006166, + "auxiliary_loss_mlp": 0.0138104, + "balance_loss_clip": 1.14758062, + "balance_loss_mlp": 1.56386232, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.503731577984969, + "language_loss": 1.05752254, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10139465, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 14.4375, + "step": 22, + "time_per_iteration": 2.714902400970459 + }, + { + "auxiliary_loss_clip": 0.02958535, + "auxiliary_loss_mlp": 0.01337723, + "balance_loss_clip": 1.12743819, + "balance_loss_mlp": 1.56545472, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8887485842740657, + "language_loss": 0.91820848, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96117103, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 13.9375, + "step": 23, + "time_per_iteration": 2.6802501678466797 + }, + { + "auxiliary_loss_clip": 0.02925568, + "auxiliary_loss_mlp": 0.0136327, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.55789983, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.888412626700388, + "language_loss": 1.08090949, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12379789, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 13.6875, + "step": 24, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.02818042, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.11892343, + "balance_loss_mlp": 1.55278993, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.5526652768314877, + "language_loss": 1.01197755, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05345201, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 12.6875, + "step": 25, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.02811065, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 1.10196424, + "balance_loss_mlp": 1.55557573, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8866965715457127, + "language_loss": 1.0650332, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10625291, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 12.5625, + "step": 26, + "time_per_iteration": 2.6561954021453857 + }, + { + "auxiliary_loss_clip": 0.02754337, + "auxiliary_loss_mlp": 0.01325989, + "balance_loss_clip": 1.12600398, + "balance_loss_mlp": 1.54593086, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 8.480879524297928, + "language_loss": 0.95465469, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99545801, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 12.0625, + "step": 27, + "time_per_iteration": 2.717332363128662 + }, + { + "auxiliary_loss_clip": 0.02732017, + "auxiliary_loss_mlp": 0.0131313, + "balance_loss_clip": 1.13174081, + "balance_loss_mlp": 1.55085063, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7582152185230338, + "language_loss": 1.06276608, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1032176, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 11.8125, + "step": 28, + "time_per_iteration": 2.6645846366882324 + }, + { + "auxiliary_loss_clip": 0.02698877, + "auxiliary_loss_mlp": 0.01319704, + "balance_loss_clip": 1.1339283, + "balance_loss_mlp": 1.5357703, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.703793609192777, + "language_loss": 1.02653611, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06672192, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 11.625, + "step": 29, + "time_per_iteration": 2.6647088527679443 + }, + { + "auxiliary_loss_clip": 0.02692806, + "auxiliary_loss_mlp": 0.01313595, + "balance_loss_clip": 1.12667465, + "balance_loss_mlp": 1.53252506, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.824163422844594, + "language_loss": 1.1929419, + "learning_rate": 2.189868360711334e-06, + "loss": 1.233006, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 11.625, + "step": 30, + "time_per_iteration": 2.6305816173553467 + }, + { + "auxiliary_loss_clip": 0.02610821, + "auxiliary_loss_mlp": 0.01338782, + "balance_loss_clip": 1.15748882, + "balance_loss_mlp": 1.51829374, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.55861683808779, + "language_loss": 1.02499342, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06448936, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 10.9375, + "step": 31, + "time_per_iteration": 2.71045184135437 + }, + { + "auxiliary_loss_clip": 0.02583705, + "auxiliary_loss_mlp": 0.01332414, + "balance_loss_clip": 1.15245557, + "balance_loss_mlp": 1.52035046, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.526137445187824, + "language_loss": 0.95697796, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99613917, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 10.625, + "step": 32, + "time_per_iteration": 2.626783847808838 + }, + { + "auxiliary_loss_clip": 0.02566919, + "auxiliary_loss_mlp": 0.01304168, + "balance_loss_clip": 1.13670313, + "balance_loss_mlp": 1.51655078, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.344933729659458, + "language_loss": 0.95465255, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99336338, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 10.5, + "step": 33, + "time_per_iteration": 2.645725727081299 + }, + { + "auxiliary_loss_clip": 0.02433039, + "auxiliary_loss_mlp": 0.013041, + "balance_loss_clip": 1.14569449, + "balance_loss_mlp": 1.48877192, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 4.808068329548225, + "language_loss": 0.91556877, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95294011, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 9.4375, + "step": 34, + "time_per_iteration": 2.7327146530151367 + }, + { + "auxiliary_loss_clip": 0.02385913, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.1172576, + "balance_loss_mlp": 1.45172572, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.948252640490764, + "language_loss": 0.76639408, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80298984, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 9.375, + "step": 35, + "time_per_iteration": 2.940739870071411 + }, + { + "auxiliary_loss_clip": 0.02360979, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 1.12769413, + "balance_loss_mlp": 1.46427846, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.1659182072135064, + "language_loss": 0.89043307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92678845, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 8.9375, + "step": 36, + "time_per_iteration": 2.693335771560669 + }, + { + "auxiliary_loss_clip": 0.02305413, + "auxiliary_loss_mlp": 0.01335093, + "balance_loss_clip": 1.18574798, + "balance_loss_mlp": 1.45221901, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.3248653771669416, + "language_loss": 0.93231332, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96871841, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 8.5, + "step": 37, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.02264412, + "auxiliary_loss_mlp": 0.01277806, + "balance_loss_clip": 1.15373349, + "balance_loss_mlp": 1.44697845, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1191864106647906, + "language_loss": 1.04275775, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07817996, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 8.1875, + "step": 38, + "time_per_iteration": 2.674187183380127 + }, + { + "auxiliary_loss_clip": 0.02234117, + "auxiliary_loss_mlp": 0.01257339, + "balance_loss_clip": 1.13164425, + "balance_loss_mlp": 1.44101977, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.2707505194681685, + "language_loss": 0.85635245, + "learning_rate": 2.358792165262154e-06, + "loss": 0.891267, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 7.9375, + "step": 39, + "time_per_iteration": 2.716417074203491 + }, + { + "auxiliary_loss_clip": 0.02209554, + "auxiliary_loss_mlp": 0.01248677, + "balance_loss_clip": 1.1173557, + "balance_loss_mlp": 1.43176007, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.874633531970748, + "language_loss": 0.90416026, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93874258, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 7.78125, + "step": 40, + "time_per_iteration": 2.621108055114746 + }, + { + "auxiliary_loss_clip": 0.02158681, + "auxiliary_loss_mlp": 0.01271709, + "balance_loss_clip": 1.15626693, + "balance_loss_mlp": 1.42207694, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.842521317695652, + "language_loss": 0.93497038, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96927428, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 7.375, + "step": 41, + "time_per_iteration": 2.66089129447937 + }, + { + "auxiliary_loss_clip": 0.0212207, + "auxiliary_loss_mlp": 0.0125263, + "balance_loss_clip": 1.14720106, + "balance_loss_mlp": 1.41368401, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.5963223670672635, + "language_loss": 0.97454929, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00829637, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 7.09375, + "step": 42, + "time_per_iteration": 2.63149094581604 + }, + { + "auxiliary_loss_clip": 0.02082851, + "auxiliary_loss_mlp": 0.01298258, + "balance_loss_clip": 1.18939614, + "balance_loss_mlp": 1.41430426, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.9545418034556814, + "language_loss": 0.97656071, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01037169, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 6.6875, + "step": 43, + "time_per_iteration": 2.7244436740875244 + }, + { + "auxiliary_loss_clip": 0.02102024, + "auxiliary_loss_mlp": 0.01311792, + "balance_loss_clip": 1.19706488, + "balance_loss_mlp": 1.4130851, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0531245010632473, + "language_loss": 0.93701768, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97115582, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 6.875, + "step": 44, + "time_per_iteration": 2.6628317832946777 + }, + { + "auxiliary_loss_clip": 0.02065563, + "auxiliary_loss_mlp": 0.01272457, + "balance_loss_clip": 1.17236853, + "balance_loss_mlp": 1.41084957, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 9.3374631511207, + "language_loss": 0.98937047, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02275062, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 6.5625, + "step": 45, + "time_per_iteration": 2.7355775833129883 + }, + { + "auxiliary_loss_clip": 0.02040064, + "auxiliary_loss_mlp": 0.01227769, + "balance_loss_clip": 1.13831401, + "balance_loss_mlp": 1.39673805, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8055823424878037, + "language_loss": 1.02792716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06060553, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 6.4375, + "step": 46, + "time_per_iteration": 2.7488839626312256 + }, + { + "auxiliary_loss_clip": 0.02002379, + "auxiliary_loss_mlp": 0.01270193, + "balance_loss_clip": 1.17773402, + "balance_loss_mlp": 1.38648152, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.971366079361506, + "language_loss": 0.88043427, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91315997, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 6.15625, + "step": 47, + "time_per_iteration": 2.845005512237549 + }, + { + "auxiliary_loss_clip": 0.01963914, + "auxiliary_loss_mlp": 0.01252908, + "balance_loss_clip": 1.16493094, + "balance_loss_mlp": 1.37624073, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.070099145794898, + "language_loss": 0.87949276, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91166103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 5.875, + "step": 48, + "time_per_iteration": 2.7514398097991943 + }, + { + "auxiliary_loss_clip": 0.01962956, + "auxiliary_loss_mlp": 0.01244481, + "balance_loss_clip": 1.15078259, + "balance_loss_mlp": 1.36602139, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.366138839739612, + "language_loss": 0.89877701, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93085134, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 6.0, + "step": 49, + "time_per_iteration": 2.743236541748047 + }, + { + "auxiliary_loss_clip": 0.01955947, + "auxiliary_loss_mlp": 0.01232227, + "balance_loss_clip": 1.14534748, + "balance_loss_mlp": 1.36045313, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.8158483763506914, + "language_loss": 0.91078663, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94266832, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 5.9375, + "step": 50, + "time_per_iteration": 2.6860456466674805 + }, + { + "auxiliary_loss_clip": 0.01953364, + "auxiliary_loss_mlp": 0.01201227, + "balance_loss_clip": 1.11778045, + "balance_loss_mlp": 1.36547732, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 3.5299735782100026, + "language_loss": 0.87144494, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90299082, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 5.875, + "step": 51, + "time_per_iteration": 2.7481534481048584 + }, + { + "auxiliary_loss_clip": 0.01909154, + "auxiliary_loss_mlp": 0.01207037, + "balance_loss_clip": 1.12707186, + "balance_loss_mlp": 1.35597348, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0262044932375836, + "language_loss": 0.95253396, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98369586, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 5.53125, + "step": 52, + "time_per_iteration": 2.8958797454833984 + }, + { + "auxiliary_loss_clip": 0.01904814, + "auxiliary_loss_mlp": 0.01243661, + "balance_loss_clip": 1.16274214, + "balance_loss_mlp": 1.35173535, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.3193539013945546, + "language_loss": 0.92261833, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95410311, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 5.53125, + "step": 53, + "time_per_iteration": 2.7579286098480225 + }, + { + "auxiliary_loss_clip": 0.01893968, + "auxiliary_loss_mlp": 0.01196907, + "balance_loss_clip": 1.11489081, + "balance_loss_mlp": 1.35535884, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.2021865200163, + "language_loss": 0.82945669, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86036545, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 5.375, + "step": 54, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.01889572, + "auxiliary_loss_mlp": 0.01211293, + "balance_loss_clip": 1.13113666, + "balance_loss_mlp": 1.34359026, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4060188817442487, + "language_loss": 0.81305432, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84406298, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.4375, + "step": 55, + "time_per_iteration": 2.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01879346, + "auxiliary_loss_mlp": 0.01199903, + "balance_loss_clip": 1.11926973, + "balance_loss_mlp": 1.33773279, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.497299649397407, + "language_loss": 0.87261844, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90341091, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.40625, + "step": 56, + "time_per_iteration": 2.7031195163726807 + }, + { + "auxiliary_loss_clip": 0.01879922, + "auxiliary_loss_mlp": 0.01161266, + "balance_loss_clip": 1.0864507, + "balance_loss_mlp": 1.33024335, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.4089458733946882, + "language_loss": 0.92949611, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95990801, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 5.5, + "step": 57, + "time_per_iteration": 2.8580281734466553 + }, + { + "auxiliary_loss_clip": 0.01843074, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_clip": 1.14395308, + "balance_loss_mlp": 1.33453596, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.105168727735643, + "language_loss": 0.99725533, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02785611, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 5.09375, + "step": 58, + "time_per_iteration": 2.687504529953003 + }, + { + "auxiliary_loss_clip": 0.01822907, + "auxiliary_loss_mlp": 0.01195384, + "balance_loss_clip": 1.12319088, + "balance_loss_mlp": 1.32094967, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1328304194940855, + "language_loss": 0.8821373, + "learning_rate": 2.625331386578098e-06, + "loss": 0.9123202, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 5.03125, + "step": 59, + "time_per_iteration": 6.997380495071411 + }, + { + "auxiliary_loss_clip": 0.01844896, + "auxiliary_loss_mlp": 0.01162144, + "balance_loss_clip": 1.08885431, + "balance_loss_mlp": 1.32932925, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.097582115586327, + "language_loss": 0.93430054, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96437097, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 5.15625, + "step": 60, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0182307, + "auxiliary_loss_mlp": 0.01172385, + "balance_loss_clip": 1.10376787, + "balance_loss_mlp": 1.31307459, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 4.241258673484683, + "language_loss": 0.90090871, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93086326, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 5.09375, + "step": 61, + "time_per_iteration": 2.707247257232666 + }, + { + "auxiliary_loss_clip": 0.01806801, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_clip": 1.07475519, + "balance_loss_mlp": 1.31002319, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 3.0487456468745586, + "language_loss": 0.88434047, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9138341, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.96875, + "step": 62, + "time_per_iteration": 2.736107587814331 + }, + { + "auxiliary_loss_clip": 0.01787131, + "auxiliary_loss_mlp": 0.01161947, + "balance_loss_clip": 1.09132755, + "balance_loss_mlp": 1.30018497, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.6509198595432406, + "language_loss": 0.96265876, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99214947, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.875, + "step": 63, + "time_per_iteration": 2.6760194301605225 + }, + { + "auxiliary_loss_clip": 0.01795174, + "auxiliary_loss_mlp": 0.01169703, + "balance_loss_clip": 1.10284996, + "balance_loss_mlp": 1.30725491, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.677484479433752, + "language_loss": 0.99141657, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02106524, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.875, + "step": 64, + "time_per_iteration": 2.675295114517212 + }, + { + "auxiliary_loss_clip": 0.01802087, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.08134842, + "balance_loss_mlp": 1.30652797, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.45939593962701, + "language_loss": 0.85358196, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88309723, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.9375, + "step": 65, + "time_per_iteration": 2.647696018218994 + }, + { + "auxiliary_loss_clip": 0.01779034, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0886445, + "balance_loss_mlp": 1.29322505, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.8561979494145033, + "language_loss": 0.85224223, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88160038, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.875, + "step": 66, + "time_per_iteration": 2.617143392562866 + }, + { + "auxiliary_loss_clip": 0.01782156, + "auxiliary_loss_mlp": 0.01152634, + "balance_loss_clip": 1.07648349, + "balance_loss_mlp": 1.29168975, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.246759082278279, + "language_loss": 0.96454394, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99389184, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 4.90625, + "step": 67, + "time_per_iteration": 2.6343421936035156 + }, + { + "auxiliary_loss_clip": 0.01753238, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.08340704, + "balance_loss_mlp": 1.28524387, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.549207131743101, + "language_loss": 0.94534445, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97443378, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 4.6875, + "step": 68, + "time_per_iteration": 2.614696741104126 + }, + { + "auxiliary_loss_clip": 0.01748377, + "auxiliary_loss_mlp": 0.01156697, + "balance_loss_clip": 1.08717394, + "balance_loss_mlp": 1.28268003, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 1.9922029239060344, + "language_loss": 0.95657748, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98562825, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.65625, + "step": 69, + "time_per_iteration": 2.6637492179870605 + }, + { + "auxiliary_loss_clip": 0.01742428, + "auxiliary_loss_mlp": 0.01160645, + "balance_loss_clip": 1.09598637, + "balance_loss_mlp": 1.2855866, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4176731159017075, + "language_loss": 0.98073572, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00976658, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 4.5625, + "step": 70, + "time_per_iteration": 2.6395556926727295 + }, + { + "auxiliary_loss_clip": 0.01748999, + "auxiliary_loss_mlp": 0.01146397, + "balance_loss_clip": 1.07673144, + "balance_loss_mlp": 1.2760632, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.71386904393857, + "language_loss": 0.93927777, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96823174, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 4.75, + "step": 71, + "time_per_iteration": 2.628272294998169 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01327632, + "balance_loss_clip": 1.28967619, + "balance_loss_mlp": 1.43997037, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4194543250518663, + "language_loss": 0.65655279, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68821681, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 4.0, + "step": 72, + "time_per_iteration": 3.104635000228882 + }, + { + "auxiliary_loss_clip": 0.01820285, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.25824571, + "balance_loss_mlp": 1.43420911, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2482458517722455, + "language_loss": 0.63711512, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66827047, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 3.859375, + "step": 73, + "time_per_iteration": 3.208836793899536 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.07382631, + "balance_loss_mlp": 1.26790953, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.4515337577309424, + "language_loss": 0.85899854, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88765126, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.5625, + "step": 74, + "time_per_iteration": 2.6287550926208496 + }, + { + "auxiliary_loss_clip": 0.01725734, + "auxiliary_loss_mlp": 0.01165418, + "balance_loss_clip": 1.09584761, + "balance_loss_mlp": 1.26750898, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.110493434952054, + "language_loss": 0.9716984, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00060987, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.5625, + "step": 75, + "time_per_iteration": 2.635618209838867 + }, + { + "auxiliary_loss_clip": 0.01704277, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.07875705, + "balance_loss_mlp": 1.26302838, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.60583579179481, + "language_loss": 0.87675405, + "learning_rate": 2.788352117317012e-06, + "loss": 0.9052462, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.4375, + "step": 76, + "time_per_iteration": 2.6379826068878174 + }, + { + "auxiliary_loss_clip": 0.01705571, + "auxiliary_loss_mlp": 0.0114831, + "balance_loss_clip": 1.07845366, + "balance_loss_mlp": 1.26138341, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.9080158042054207, + "language_loss": 0.91751724, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94605613, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.4375, + "step": 77, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01694222, + "auxiliary_loss_mlp": 0.01165235, + "balance_loss_clip": 1.09494948, + "balance_loss_mlp": 1.26167083, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1229280552318803, + "language_loss": 0.92189825, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95049286, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.3125, + "step": 78, + "time_per_iteration": 2.598590850830078 + }, + { + "auxiliary_loss_clip": 0.01690635, + "auxiliary_loss_mlp": 0.01155594, + "balance_loss_clip": 1.08735824, + "balance_loss_mlp": 1.25696921, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.280813483182965, + "language_loss": 0.82480371, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85326606, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 4.34375, + "step": 79, + "time_per_iteration": 2.6215708255767822 + }, + { + "auxiliary_loss_clip": 0.01705122, + "auxiliary_loss_mlp": 0.01133248, + "balance_loss_clip": 1.06315339, + "balance_loss_mlp": 1.26029253, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.4809717100134616, + "language_loss": 0.91311121, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94149494, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.4375, + "step": 80, + "time_per_iteration": 2.639841079711914 + }, + { + "auxiliary_loss_clip": 0.01674552, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.07254159, + "balance_loss_mlp": 1.25350285, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.165091554789383, + "language_loss": 0.94981706, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97799134, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.21875, + "step": 81, + "time_per_iteration": 2.6689717769622803 + }, + { + "auxiliary_loss_clip": 0.01688803, + "auxiliary_loss_mlp": 0.01148831, + "balance_loss_clip": 1.08269382, + "balance_loss_mlp": 1.25745821, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9914678747629226, + "language_loss": 0.96341741, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99179375, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 4.3125, + "step": 82, + "time_per_iteration": 2.629596471786499 + }, + { + "auxiliary_loss_clip": 0.01671229, + "auxiliary_loss_mlp": 0.01159801, + "balance_loss_clip": 1.09013557, + "balance_loss_mlp": 1.24528587, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.533591741594043, + "language_loss": 0.8664127, + "learning_rate": 2.84508017388607e-06, + "loss": 0.894723, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.25, + "step": 83, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01664198, + "auxiliary_loss_mlp": 0.01156919, + "balance_loss_clip": 1.08663368, + "balance_loss_mlp": 1.24647975, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.373799694341511, + "language_loss": 0.91779828, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94600952, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.1875, + "step": 84, + "time_per_iteration": 2.62187123298645 + }, + { + "auxiliary_loss_clip": 0.01645783, + "auxiliary_loss_mlp": 0.01205663, + "balance_loss_clip": 1.17075825, + "balance_loss_mlp": 1.34984684, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4266053341540552, + "language_loss": 0.62504542, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65355992, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.96875, + "step": 85, + "time_per_iteration": 3.190223217010498 + }, + { + "auxiliary_loss_clip": 0.0165122, + "auxiliary_loss_mlp": 0.01127154, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.23674285, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7428139018461835, + "language_loss": 0.90836501, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93614876, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.15625, + "step": 86, + "time_per_iteration": 2.66162109375 + }, + { + "auxiliary_loss_clip": 0.01655877, + "auxiliary_loss_mlp": 0.01161945, + "balance_loss_clip": 1.09065783, + "balance_loss_mlp": 1.24282312, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.38275425723773, + "language_loss": 0.8209877, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.125, + "step": 87, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.01644726, + "auxiliary_loss_mlp": 0.01154792, + "balance_loss_clip": 1.08617568, + "balance_loss_mlp": 1.24127626, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8918921085406437, + "language_loss": 0.95630223, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98429739, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 4.03125, + "step": 88, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01661198, + "auxiliary_loss_mlp": 0.0114963, + "balance_loss_clip": 1.08230066, + "balance_loss_mlp": 1.24101663, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1.9438908009999392, + "language_loss": 0.85920149, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88730979, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.1875, + "step": 89, + "time_per_iteration": 2.6486849784851074 + }, + { + "auxiliary_loss_clip": 0.01648909, + "auxiliary_loss_mlp": 0.01132231, + "balance_loss_clip": 1.06547391, + "balance_loss_mlp": 1.23491406, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 4.519706664825811, + "language_loss": 0.91517568, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94298708, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 4.125, + "step": 90, + "time_per_iteration": 2.658997058868408 + }, + { + "auxiliary_loss_clip": 0.01630542, + "auxiliary_loss_mlp": 0.0113282, + "balance_loss_clip": 1.06496572, + "balance_loss_mlp": 1.23102689, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.2090932400382486, + "language_loss": 0.8587057, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88633931, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 3.984375, + "step": 91, + "time_per_iteration": 2.619231939315796 + }, + { + "auxiliary_loss_clip": 0.01629785, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_clip": 1.07458866, + "balance_loss_mlp": 1.22673059, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.769705373909222, + "language_loss": 0.86930025, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89700729, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.03125, + "step": 92, + "time_per_iteration": 2.646968126296997 + }, + { + "auxiliary_loss_clip": 0.01621216, + "auxiliary_loss_mlp": 0.01179948, + "balance_loss_clip": 1.1122849, + "balance_loss_mlp": 1.21872091, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.5030178409929, + "language_loss": 0.92042911, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94844079, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 4.03125, + "step": 93, + "time_per_iteration": 2.59853196144104 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01152159, + "balance_loss_clip": 1.08120561, + "balance_loss_mlp": 1.22512126, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.2071592078672198, + "language_loss": 0.87372428, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90158784, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.09375, + "step": 94, + "time_per_iteration": 2.587707281112671 + }, + { + "auxiliary_loss_clip": 0.01562532, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.03243279, + "balance_loss_mlp": 1.30452466, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3851210442303683, + "language_loss": 0.6813519, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70765626, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.578125, + "step": 95, + "time_per_iteration": 3.067047595977783 + }, + { + "auxiliary_loss_clip": 0.01611383, + "auxiliary_loss_mlp": 0.01154317, + "balance_loss_clip": 1.08693981, + "balance_loss_mlp": 1.21303511, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.5109536438971976, + "language_loss": 0.89978027, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92743719, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 3.984375, + "step": 96, + "time_per_iteration": 2.590522289276123 + }, + { + "auxiliary_loss_clip": 0.01603776, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.08108413, + "balance_loss_mlp": 1.21597803, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.825781473558237, + "language_loss": 0.89798892, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92545933, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.875, + "step": 97, + "time_per_iteration": 2.630364179611206 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_clip": 1.07103181, + "balance_loss_mlp": 1.20754981, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.1954130163748573, + "language_loss": 0.76553786, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79283404, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.8125, + "step": 98, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01531856, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.01250362, + "balance_loss_mlp": 1.28449416, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0389188302362988, + "language_loss": 0.65464473, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68043554, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.46875, + "step": 99, + "time_per_iteration": 3.196779251098633 + }, + { + "auxiliary_loss_clip": 0.0159215, + "auxiliary_loss_mlp": 0.01143603, + "balance_loss_clip": 1.07312632, + "balance_loss_mlp": 1.20754516, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02393591458392, + "language_loss": 0.90861535, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93597281, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 3.84375, + "step": 100, + "time_per_iteration": 2.659716844558716 + }, + { + "auxiliary_loss_clip": 0.01602583, + "auxiliary_loss_mlp": 0.01150362, + "balance_loss_clip": 1.08360529, + "balance_loss_mlp": 1.21008992, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 9.149928686451464, + "language_loss": 0.91165614, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93918556, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 3.921875, + "step": 101, + "time_per_iteration": 5.522722959518433 + }, + { + "auxiliary_loss_clip": 0.01592164, + "auxiliary_loss_mlp": 0.01153598, + "balance_loss_clip": 1.08273995, + "balance_loss_mlp": 1.21078956, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.149611483260168, + "language_loss": 0.90634245, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.9338001, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.8125, + "step": 102, + "time_per_iteration": 2.7264201641082764 + }, + { + "auxiliary_loss_clip": 0.01586171, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.06986046, + "balance_loss_mlp": 1.20794034, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.4455555336324135, + "language_loss": 0.87990314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.9071129, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 3.78125, + "step": 103, + "time_per_iteration": 2.6332345008850098 + }, + { + "auxiliary_loss_clip": 0.01586169, + "auxiliary_loss_mlp": 0.01136721, + "balance_loss_clip": 1.07015502, + "balance_loss_mlp": 1.2100153, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9653003456434248, + "language_loss": 0.93796182, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96519077, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.765625, + "step": 104, + "time_per_iteration": 2.5763180255889893 + }, + { + "auxiliary_loss_clip": 0.01576682, + "auxiliary_loss_mlp": 0.01148107, + "balance_loss_clip": 1.08382916, + "balance_loss_mlp": 1.20004964, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.978383813748495, + "language_loss": 0.96302718, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99027503, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.765625, + "step": 105, + "time_per_iteration": 2.598074197769165 + }, + { + "auxiliary_loss_clip": 0.01576054, + "auxiliary_loss_mlp": 0.01157995, + "balance_loss_clip": 1.08618331, + "balance_loss_mlp": 1.20040035, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.254409296180574, + "language_loss": 0.86981636, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89715683, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 3.75, + "step": 106, + "time_per_iteration": 2.620400905609131 + }, + { + "auxiliary_loss_clip": 0.01558878, + "auxiliary_loss_mlp": 0.01142953, + "balance_loss_clip": 1.07462192, + "balance_loss_mlp": 1.18650925, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.299900982703377, + "language_loss": 0.8342824, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86130083, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 3.71875, + "step": 107, + "time_per_iteration": 2.6031439304351807 + }, + { + "auxiliary_loss_clip": 0.01473949, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01294351, + "balance_loss_mlp": 1.24969411, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9921074222226888, + "language_loss": 0.64829654, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67348593, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.25, + "step": 108, + "time_per_iteration": 3.1797876358032227 + }, + { + "auxiliary_loss_clip": 0.01549803, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_clip": 1.0634706, + "balance_loss_mlp": 1.18794155, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 3.0292528917398895, + "language_loss": 0.97705221, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00387263, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.625, + "step": 109, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01548304, + "auxiliary_loss_mlp": 0.01143686, + "balance_loss_clip": 1.07759643, + "balance_loss_mlp": 1.18955791, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7037490209774204, + "language_loss": 0.84119976, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86811972, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 110, + "time_per_iteration": 2.612900495529175 + }, + { + "auxiliary_loss_clip": 0.01543027, + "auxiliary_loss_mlp": 0.01148771, + "balance_loss_clip": 1.08287191, + "balance_loss_mlp": 1.18348098, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.0686651571732186, + "language_loss": 0.83053756, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85745549, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 111, + "time_per_iteration": 2.648775815963745 + }, + { + "auxiliary_loss_clip": 0.01543945, + "auxiliary_loss_mlp": 0.01132291, + "balance_loss_clip": 1.06906247, + "balance_loss_mlp": 1.18600404, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9360906695559799, + "language_loss": 0.94064176, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96740413, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.59375, + "step": 112, + "time_per_iteration": 2.5952305793762207 + }, + { + "auxiliary_loss_clip": 0.01547241, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.07342076, + "balance_loss_mlp": 1.18214464, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4119047199233594, + "language_loss": 0.79298341, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81983036, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.65625, + "step": 113, + "time_per_iteration": 2.524744987487793 + }, + { + "auxiliary_loss_clip": 0.01535171, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.06460583, + "balance_loss_mlp": 1.1784718, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.1108584765070924, + "language_loss": 0.93168736, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95834035, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 3.5625, + "step": 114, + "time_per_iteration": 2.6716785430908203 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.01138267, + "balance_loss_clip": 1.07828045, + "balance_loss_mlp": 1.17785645, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.266348661789013, + "language_loss": 0.94440514, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97120523, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.640625, + "step": 115, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01536673, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.052122, + "balance_loss_mlp": 1.1758287, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 12.665326776351556, + "language_loss": 0.81903678, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84558797, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.609375, + "step": 116, + "time_per_iteration": 2.577003240585327 + }, + { + "auxiliary_loss_clip": 0.01526673, + "auxiliary_loss_mlp": 0.01127935, + "balance_loss_clip": 1.06375241, + "balance_loss_mlp": 1.17504787, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.0071741256932794, + "language_loss": 0.88063896, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90718508, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.515625, + "step": 117, + "time_per_iteration": 2.611503839492798 + }, + { + "auxiliary_loss_clip": 0.01525448, + "auxiliary_loss_mlp": 0.01143736, + "balance_loss_clip": 1.07840896, + "balance_loss_mlp": 1.17308259, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5473368597875594, + "language_loss": 0.84470415, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87139601, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 3.53125, + "step": 118, + "time_per_iteration": 2.577461004257202 + }, + { + "auxiliary_loss_clip": 0.01536798, + "auxiliary_loss_mlp": 0.01163532, + "balance_loss_clip": 1.09930205, + "balance_loss_mlp": 1.1748507, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.057592918726277, + "language_loss": 0.99470234, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02170563, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.625, + "step": 119, + "time_per_iteration": 2.549661636352539 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.0111939, + "balance_loss_clip": 1.05701971, + "balance_loss_mlp": 1.16968298, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.410205702357196, + "language_loss": 0.89085704, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91742492, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.6875, + "step": 120, + "time_per_iteration": 2.583630084991455 + }, + { + "auxiliary_loss_clip": 0.01524337, + "auxiliary_loss_mlp": 0.01130091, + "balance_loss_clip": 1.06667209, + "balance_loss_mlp": 1.17169607, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.904240324338801, + "language_loss": 0.93491054, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145487, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.53125, + "step": 121, + "time_per_iteration": 2.6146788597106934 + }, + { + "auxiliary_loss_clip": 0.01523412, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_clip": 1.08382273, + "balance_loss_mlp": 1.17073464, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.352658173167552, + "language_loss": 0.90176952, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92846411, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.53125, + "step": 122, + "time_per_iteration": 2.566470146179199 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.07634664, + "balance_loss_mlp": 1.16606736, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.7249964127160764, + "language_loss": 0.92516506, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95179617, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.546875, + "step": 123, + "time_per_iteration": 2.6002941131591797 + }, + { + "auxiliary_loss_clip": 0.01517776, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.06433022, + "balance_loss_mlp": 1.1609534, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 7.583203404073904, + "language_loss": 0.71128142, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73771715, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.5625, + "step": 124, + "time_per_iteration": 2.79618763923645 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01124615, + "balance_loss_clip": 1.06157708, + "balance_loss_mlp": 1.16223335, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.4227692366027855, + "language_loss": 0.88482195, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9111228, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.4375, + "step": 125, + "time_per_iteration": 2.6131536960601807 + }, + { + "auxiliary_loss_clip": 0.0152071, + "auxiliary_loss_mlp": 0.01140137, + "balance_loss_clip": 1.07762396, + "balance_loss_mlp": 1.16211164, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.993097477973623, + "language_loss": 0.82384819, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.8504566, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.59375, + "step": 126, + "time_per_iteration": 2.595423936843872 + }, + { + "auxiliary_loss_clip": 0.01514354, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_clip": 1.077981, + "balance_loss_mlp": 1.16128385, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.7264016399601534, + "language_loss": 0.67276633, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69930243, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 3.53125, + "step": 127, + "time_per_iteration": 2.620950937271118 + }, + { + "auxiliary_loss_clip": 0.01504536, + "auxiliary_loss_mlp": 0.01128822, + "balance_loss_clip": 1.06640375, + "balance_loss_mlp": 1.16422939, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.6226937306152496, + "language_loss": 0.8815757, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90790927, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 128, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.01505804, + "auxiliary_loss_mlp": 0.01141266, + "balance_loss_clip": 1.07870471, + "balance_loss_mlp": 1.15920687, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.875185485357673, + "language_loss": 0.84581351, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87228423, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.46875, + "step": 129, + "time_per_iteration": 2.611762285232544 + }, + { + "auxiliary_loss_clip": 0.01499869, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.07122934, + "balance_loss_mlp": 1.1588279, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023668494136832, + "language_loss": 0.9742806, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061572, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 130, + "time_per_iteration": 2.599639415740967 + }, + { + "auxiliary_loss_clip": 0.01493155, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_clip": 1.07109392, + "balance_loss_mlp": 1.15518749, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1876581172480285, + "language_loss": 0.82624269, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 131, + "time_per_iteration": 2.6086065769195557 + }, + { + "auxiliary_loss_clip": 0.01502593, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.15800536, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.4868851395581677, + "language_loss": 0.82762384, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85392648, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 3.4375, + "step": 132, + "time_per_iteration": 2.673790454864502 + }, + { + "auxiliary_loss_clip": 0.01493849, + "auxiliary_loss_mlp": 0.01128197, + "balance_loss_clip": 1.06716144, + "balance_loss_mlp": 1.15264463, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.7432419346617443, + "language_loss": 0.95486552, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98108596, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.40625, + "step": 133, + "time_per_iteration": 2.6287872791290283 + }, + { + "auxiliary_loss_clip": 0.01490198, + "auxiliary_loss_mlp": 0.01125526, + "balance_loss_clip": 1.06725681, + "balance_loss_mlp": 1.16143155, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7764051426707919, + "language_loss": 0.73316634, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7593236, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.296875, + "step": 134, + "time_per_iteration": 2.6728081703186035 + }, + { + "auxiliary_loss_clip": 0.01486213, + "auxiliary_loss_mlp": 0.01130543, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.14955854, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.090234736760587, + "language_loss": 0.88808328, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91425079, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 135, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.06789732, + "balance_loss_mlp": 1.15456343, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.008171494368998, + "language_loss": 0.89123899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9174456, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.34375, + "step": 136, + "time_per_iteration": 2.555936813354492 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.01108223, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.14870429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 5.8712537379963345, + "language_loss": 0.8400104, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86595905, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.375, + "step": 137, + "time_per_iteration": 2.6225337982177734 + }, + { + "auxiliary_loss_clip": 0.01482624, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.05796409, + "balance_loss_mlp": 1.14842129, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.6861384534946333, + "language_loss": 0.90170664, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9276967, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.34375, + "step": 138, + "time_per_iteration": 2.653205156326294 + }, + { + "auxiliary_loss_clip": 0.01472312, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.0568912, + "balance_loss_mlp": 1.1478796, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.679342832062188, + "language_loss": 0.91253459, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93845713, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.234375, + "step": 139, + "time_per_iteration": 2.6182503700256348 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01123997, + "balance_loss_clip": 1.06229401, + "balance_loss_mlp": 1.154405, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.5553770836970675, + "language_loss": 0.85446793, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.34375, + "step": 140, + "time_per_iteration": 2.649454116821289 + }, + { + "auxiliary_loss_clip": 0.01476267, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.0594281, + "balance_loss_mlp": 1.14865911, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.379593217845822, + "language_loss": 0.84156519, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86751676, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.28125, + "step": 141, + "time_per_iteration": 2.608603000640869 + }, + { + "auxiliary_loss_clip": 0.01480312, + "auxiliary_loss_mlp": 0.01134333, + "balance_loss_clip": 1.07320273, + "balance_loss_mlp": 1.14624739, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3283494467369965, + "language_loss": 0.81387591, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84002233, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.34375, + "step": 142, + "time_per_iteration": 4.023308753967285 + }, + { + "auxiliary_loss_clip": 0.01378722, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00621629, + "balance_loss_mlp": 1.1918689, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0451783350372967, + "language_loss": 0.66831523, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69242978, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.8671875, + "step": 143, + "time_per_iteration": 4.718023777008057 + }, + { + "auxiliary_loss_clip": 0.01472184, + "auxiliary_loss_mlp": 0.0112263, + "balance_loss_clip": 1.06283474, + "balance_loss_mlp": 1.14625573, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2608538764922295, + "language_loss": 0.83954072, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86548889, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.25, + "step": 144, + "time_per_iteration": 2.5878453254699707 + }, + { + "auxiliary_loss_clip": 0.01457808, + "auxiliary_loss_mlp": 0.01111605, + "balance_loss_clip": 1.04890084, + "balance_loss_mlp": 1.13930941, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.241812154138119, + "language_loss": 0.88511693, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91081107, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.1875, + "step": 145, + "time_per_iteration": 2.586512565612793 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01124002, + "balance_loss_clip": 1.06153631, + "balance_loss_mlp": 1.14211285, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.792984011276012, + "language_loss": 0.85949898, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88549542, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.34375, + "step": 146, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01359324, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.00082254, + "balance_loss_mlp": 1.17825258, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8557738136673508, + "language_loss": 0.60047674, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62433958, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.8125, + "step": 147, + "time_per_iteration": 3.2522764205932617 + }, + { + "auxiliary_loss_clip": 0.01465546, + "auxiliary_loss_mlp": 0.01124118, + "balance_loss_clip": 1.06670642, + "balance_loss_mlp": 1.14550173, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8343461268862185, + "language_loss": 0.8454501, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87134671, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 3.203125, + "step": 148, + "time_per_iteration": 2.635499954223633 + }, + { + "auxiliary_loss_clip": 0.0147086, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_clip": 1.07914925, + "balance_loss_mlp": 1.14693797, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2581185064103404, + "language_loss": 0.88802874, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91416872, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.234375, + "step": 149, + "time_per_iteration": 2.5458836555480957 + }, + { + "auxiliary_loss_clip": 0.01466862, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.14131117, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.7760320197047097, + "language_loss": 0.93054724, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95633656, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 150, + "time_per_iteration": 2.648111343383789 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01109463, + "balance_loss_clip": 1.05391192, + "balance_loss_mlp": 1.13663483, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.9005080345968057, + "language_loss": 0.74303263, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76867104, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.171875, + "step": 151, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.014664, + "auxiliary_loss_mlp": 0.01125146, + "balance_loss_clip": 1.06735289, + "balance_loss_mlp": 1.14143276, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.6241423805649298, + "language_loss": 0.88251799, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90843344, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 152, + "time_per_iteration": 2.6034231185913086 + }, + { + "auxiliary_loss_clip": 0.01466383, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_clip": 1.0628314, + "balance_loss_mlp": 1.14757276, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.4062301864690196, + "language_loss": 0.83957756, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86545384, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 153, + "time_per_iteration": 2.6023271083831787 + }, + { + "auxiliary_loss_clip": 0.01456394, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.0765202, + "balance_loss_mlp": 1.13805962, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9441527650945287, + "language_loss": 0.89881843, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92474556, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.1875, + "step": 154, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01460439, + "auxiliary_loss_mlp": 0.01154617, + "balance_loss_clip": 1.09577537, + "balance_loss_mlp": 1.14094579, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0692323216259187, + "language_loss": 0.89471745, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92086804, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 155, + "time_per_iteration": 2.6336286067962646 + }, + { + "auxiliary_loss_clip": 0.01463585, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.05894589, + "balance_loss_mlp": 1.13895822, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 3.3077298720636255, + "language_loss": 0.86882627, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89462447, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.25, + "step": 156, + "time_per_iteration": 2.5539867877960205 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.01121969, + "balance_loss_clip": 1.06408143, + "balance_loss_mlp": 1.14298415, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4916444524903527, + "language_loss": 0.99553013, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02137065, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.1875, + "step": 157, + "time_per_iteration": 2.5249693393707275 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01139016, + "balance_loss_clip": 1.08146214, + "balance_loss_mlp": 1.1366899, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0302475566757225, + "language_loss": 0.8847568, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91060334, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.09375, + "step": 158, + "time_per_iteration": 2.6009252071380615 + }, + { + "auxiliary_loss_clip": 0.01452439, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_clip": 1.06555486, + "balance_loss_mlp": 1.13677907, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 4.310723443959545, + "language_loss": 0.86534697, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89111388, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.15625, + "step": 159, + "time_per_iteration": 2.6107394695281982 + }, + { + "auxiliary_loss_clip": 0.01442093, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.07340288, + "balance_loss_mlp": 1.13145089, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.790220267572532, + "language_loss": 0.86825597, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89400506, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.109375, + "step": 160, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01449537, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.06318271, + "balance_loss_mlp": 1.13704872, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.6107931748588893, + "language_loss": 0.91542315, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94109678, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.125, + "step": 161, + "time_per_iteration": 2.550865650177002 + }, + { + "auxiliary_loss_clip": 0.01454094, + "auxiliary_loss_mlp": 0.01109765, + "balance_loss_clip": 1.05488133, + "balance_loss_mlp": 1.13759339, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.2107920101940994, + "language_loss": 0.91690832, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94254684, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.15625, + "step": 162, + "time_per_iteration": 2.5527970790863037 + }, + { + "auxiliary_loss_clip": 0.01312712, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.00331306, + "balance_loss_mlp": 1.14560354, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2615279464106541, + "language_loss": 0.72354776, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74694741, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.671875, + "step": 163, + "time_per_iteration": 3.143763542175293 + }, + { + "auxiliary_loss_clip": 0.01440764, + "auxiliary_loss_mlp": 0.01113881, + "balance_loss_clip": 1.05804312, + "balance_loss_mlp": 1.13505006, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1923315312730374, + "language_loss": 0.8427155, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86826193, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0625, + "step": 164, + "time_per_iteration": 2.5536584854125977 + }, + { + "auxiliary_loss_clip": 0.01429878, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.04585135, + "balance_loss_mlp": 1.12637794, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.006756380443377, + "language_loss": 0.89215541, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91745919, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.03125, + "step": 165, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01434156, + "auxiliary_loss_mlp": 0.01127756, + "balance_loss_clip": 1.0692482, + "balance_loss_mlp": 1.12764359, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 6.432940691763592, + "language_loss": 0.80138129, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82700044, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.0625, + "step": 166, + "time_per_iteration": 2.6461095809936523 + }, + { + "auxiliary_loss_clip": 0.01438531, + "auxiliary_loss_mlp": 0.01125189, + "balance_loss_clip": 1.06749213, + "balance_loss_mlp": 1.13121533, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.334124726802297, + "language_loss": 0.9190954, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94473255, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.078125, + "step": 167, + "time_per_iteration": 2.655597448348999 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.07997894, + "balance_loss_mlp": 1.12960708, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.1870046541457873, + "language_loss": 0.90852308, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93417776, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 3.0, + "step": 168, + "time_per_iteration": 2.5387983322143555 + }, + { + "auxiliary_loss_clip": 0.01424973, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.05072391, + "balance_loss_mlp": 1.12456727, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 2.0495813916191077, + "language_loss": 0.87094414, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89626241, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 3.0, + "step": 169, + "time_per_iteration": 2.6448419094085693 + }, + { + "auxiliary_loss_clip": 0.01426284, + "auxiliary_loss_mlp": 0.01111393, + "balance_loss_clip": 1.05548358, + "balance_loss_mlp": 1.12704372, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 3.0203817486241973, + "language_loss": 0.84758192, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87295866, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 170, + "time_per_iteration": 2.5596489906311035 + }, + { + "auxiliary_loss_clip": 0.01435879, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.07963061, + "balance_loss_mlp": 1.12765205, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.124400250788896, + "language_loss": 0.89896494, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92468935, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.078125, + "step": 171, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01108406, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.1300813, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.015577645060998, + "language_loss": 0.88978243, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91516334, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.0, + "step": 172, + "time_per_iteration": 2.6193771362304688 + }, + { + "auxiliary_loss_clip": 0.01419105, + "auxiliary_loss_mlp": 0.01124801, + "balance_loss_clip": 1.06986928, + "balance_loss_mlp": 1.12354624, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6868779107262128, + "language_loss": 0.81148165, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83692074, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.953125, + "step": 173, + "time_per_iteration": 2.656935691833496 + }, + { + "auxiliary_loss_clip": 0.01430653, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.05496693, + "balance_loss_mlp": 1.12733519, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1134597687554244, + "language_loss": 0.82498932, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85036767, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 3.03125, + "step": 174, + "time_per_iteration": 2.6050753593444824 + }, + { + "auxiliary_loss_clip": 0.01425822, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_clip": 1.06984437, + "balance_loss_mlp": 1.12589645, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.6035215697191965, + "language_loss": 0.72699076, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75249052, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 3.0, + "step": 175, + "time_per_iteration": 2.6859946250915527 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.07571054, + "balance_loss_mlp": 1.12603855, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.402827576481816, + "language_loss": 0.98082507, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00642931, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 176, + "time_per_iteration": 2.5405664443969727 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01132631, + "balance_loss_clip": 1.08005941, + "balance_loss_mlp": 1.12270594, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3427037211777115, + "language_loss": 0.76749414, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79294884, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 177, + "time_per_iteration": 2.555553674697876 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.0507797, + "balance_loss_mlp": 1.12089574, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.4108248963401464, + "language_loss": 0.76824659, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79352522, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.015625, + "step": 178, + "time_per_iteration": 2.5799388885498047 + }, + { + "auxiliary_loss_clip": 0.01429506, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.05224717, + "balance_loss_mlp": 1.12586653, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1918052506036174, + "language_loss": 0.84004253, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86541891, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.03125, + "step": 179, + "time_per_iteration": 2.5387184619903564 + }, + { + "auxiliary_loss_clip": 0.01420983, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.05677247, + "balance_loss_mlp": 1.12062979, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.90488055395076, + "language_loss": 0.83719397, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86252916, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 180, + "time_per_iteration": 2.6149253845214844 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.06503046, + "balance_loss_mlp": 1.1226536, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.784573507260413, + "language_loss": 0.7774682, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80288756, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.0, + "step": 181, + "time_per_iteration": 2.5769712924957275 + }, + { + "auxiliary_loss_clip": 0.01417045, + "auxiliary_loss_mlp": 0.01131731, + "balance_loss_clip": 1.07732356, + "balance_loss_mlp": 1.11938787, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.1835165271024377, + "language_loss": 0.76440376, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78989148, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.96875, + "step": 182, + "time_per_iteration": 2.5641353130340576 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.01127012, + "balance_loss_clip": 1.07122183, + "balance_loss_mlp": 1.11758399, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.172025067133121, + "language_loss": 0.87377435, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89917147, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.953125, + "step": 183, + "time_per_iteration": 2.567457914352417 + }, + { + "auxiliary_loss_clip": 0.01415124, + "auxiliary_loss_mlp": 0.01114516, + "balance_loss_clip": 1.06397092, + "balance_loss_mlp": 1.1209594, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2669267607504255, + "language_loss": 0.86875558, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.953125, + "step": 184, + "time_per_iteration": 5.380701780319214 + }, + { + "auxiliary_loss_clip": 0.01411555, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.05308247, + "balance_loss_mlp": 1.12176847, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8448371257401488, + "language_loss": 0.83683228, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86202729, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.90625, + "step": 185, + "time_per_iteration": 2.5522208213806152 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.01109712, + "balance_loss_clip": 1.05253971, + "balance_loss_mlp": 1.11964798, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.4162416092451475, + "language_loss": 0.71111757, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73642373, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 186, + "time_per_iteration": 2.536498546600342 + }, + { + "auxiliary_loss_clip": 0.01416319, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.11923158, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 3.342492581434835, + "language_loss": 1.02028871, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04552388, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.96875, + "step": 187, + "time_per_iteration": 2.5189080238342285 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.05597997, + "balance_loss_mlp": 1.11834478, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.6787333311747052, + "language_loss": 0.75107503, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7762351, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.875, + "step": 188, + "time_per_iteration": 2.73420786857605 + }, + { + "auxiliary_loss_clip": 0.01292523, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01273942, + "balance_loss_mlp": 1.13387585, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7700467396195164, + "language_loss": 0.56216431, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5854305, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.5859375, + "step": 189, + "time_per_iteration": 3.176280975341797 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01121834, + "balance_loss_clip": 1.06742704, + "balance_loss_mlp": 1.1134795, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.292403028528975, + "language_loss": 0.94771594, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97296059, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.90625, + "step": 190, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01403317, + "auxiliary_loss_mlp": 0.01101291, + "balance_loss_clip": 1.04964972, + "balance_loss_mlp": 1.11493886, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.993049163405909, + "language_loss": 0.84462845, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8696745, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.875, + "step": 191, + "time_per_iteration": 2.569664716720581 + }, + { + "auxiliary_loss_clip": 0.01402316, + "auxiliary_loss_mlp": 0.01121031, + "balance_loss_clip": 1.0698905, + "balance_loss_mlp": 1.11087692, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0097697123850593, + "language_loss": 0.91439575, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93962914, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 192, + "time_per_iteration": 2.6416900157928467 + }, + { + "auxiliary_loss_clip": 0.0139743, + "auxiliary_loss_mlp": 0.01113461, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.11231375, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.095754720056515, + "language_loss": 0.86849445, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89360332, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.84375, + "step": 193, + "time_per_iteration": 2.569899797439575 + }, + { + "auxiliary_loss_clip": 0.01399232, + "auxiliary_loss_mlp": 0.01095137, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.10937476, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.446553756436178, + "language_loss": 0.92399615, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9489398, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 194, + "time_per_iteration": 2.6078743934631348 + }, + { + "auxiliary_loss_clip": 0.01405837, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.11522019, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.1413620570060052, + "language_loss": 0.89698559, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92208374, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 195, + "time_per_iteration": 2.5785820484161377 + }, + { + "auxiliary_loss_clip": 0.01400897, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.06345916, + "balance_loss_mlp": 1.11416054, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.0173579296668813, + "language_loss": 0.8577168, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88290232, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.875, + "step": 196, + "time_per_iteration": 2.5492773056030273 + }, + { + "auxiliary_loss_clip": 0.01397107, + "auxiliary_loss_mlp": 0.01106206, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.10991478, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.86264810097015, + "language_loss": 0.93367243, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95870566, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.875, + "step": 197, + "time_per_iteration": 2.5488431453704834 + }, + { + "auxiliary_loss_clip": 0.01394686, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.05781317, + "balance_loss_mlp": 1.1120131, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1872318454948045, + "language_loss": 0.79184073, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81688625, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.828125, + "step": 198, + "time_per_iteration": 2.6208834648132324 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06545901, + "balance_loss_mlp": 1.11265802, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 3.3720724842630663, + "language_loss": 0.88065112, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90571868, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.765625, + "step": 199, + "time_per_iteration": 2.5257043838500977 + }, + { + "auxiliary_loss_clip": 0.01403414, + "auxiliary_loss_mlp": 0.01121968, + "balance_loss_clip": 1.0658679, + "balance_loss_mlp": 1.11557496, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8432610551497841, + "language_loss": 0.81327617, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83853, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.875, + "step": 200, + "time_per_iteration": 2.593231201171875 + }, + { + "auxiliary_loss_clip": 0.01400536, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.11138511, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.758923223370522, + "language_loss": 0.87688923, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90190548, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.90625, + "step": 201, + "time_per_iteration": 2.5057122707366943 + }, + { + "auxiliary_loss_clip": 0.01401128, + "auxiliary_loss_mlp": 0.01110995, + "balance_loss_clip": 1.05751753, + "balance_loss_mlp": 1.1152513, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 3.7927516715708736, + "language_loss": 0.84123611, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86635733, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.859375, + "step": 202, + "time_per_iteration": 2.555680751800537 + }, + { + "auxiliary_loss_clip": 0.01388205, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.05639839, + "balance_loss_mlp": 1.10674798, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9040504717952067, + "language_loss": 0.90116632, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.926139, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.8125, + "step": 203, + "time_per_iteration": 2.526937484741211 + }, + { + "auxiliary_loss_clip": 0.01281494, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.03138971, + "balance_loss_mlp": 1.12054539, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0150955472927095, + "language_loss": 0.61259121, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63593745, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.609375, + "step": 204, + "time_per_iteration": 3.051469326019287 + }, + { + "auxiliary_loss_clip": 0.01398264, + "auxiliary_loss_mlp": 0.01111819, + "balance_loss_clip": 1.0593431, + "balance_loss_mlp": 1.11035323, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.269022633654934, + "language_loss": 0.91206741, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93716824, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.875, + "step": 205, + "time_per_iteration": 2.5105605125427246 + }, + { + "auxiliary_loss_clip": 0.01406073, + "auxiliary_loss_mlp": 0.01120568, + "balance_loss_clip": 1.06675649, + "balance_loss_mlp": 1.11524296, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.2813283317886497, + "language_loss": 0.89215505, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91742146, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.90625, + "step": 206, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05039215, + "balance_loss_mlp": 1.10848641, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.502758142715096, + "language_loss": 0.95368809, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97865611, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.859375, + "step": 207, + "time_per_iteration": 2.5147407054901123 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.06416512, + "balance_loss_mlp": 1.11335945, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.4565104125033232, + "language_loss": 0.75770479, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78280723, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.8125, + "step": 208, + "time_per_iteration": 2.5426721572875977 + }, + { + "auxiliary_loss_clip": 0.01382601, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.05497861, + "balance_loss_mlp": 1.10796773, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.79364384939249, + "language_loss": 0.98718858, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01208818, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 209, + "time_per_iteration": 2.607238292694092 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.05971253, + "balance_loss_mlp": 1.11020541, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 7.039976369418198, + "language_loss": 0.85444254, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87945753, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.78125, + "step": 210, + "time_per_iteration": 2.67632794380188 + }, + { + "auxiliary_loss_clip": 0.01385349, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.07042408, + "balance_loss_mlp": 1.1073029, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2334441604414783, + "language_loss": 0.97016168, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99521822, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.78125, + "step": 211, + "time_per_iteration": 2.5733633041381836 + }, + { + "auxiliary_loss_clip": 0.01394963, + "auxiliary_loss_mlp": 0.01114691, + "balance_loss_clip": 1.0616188, + "balance_loss_mlp": 1.11342549, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 3.6563211355425453, + "language_loss": 0.95188707, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97698367, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.8125, + "step": 212, + "time_per_iteration": 2.5224313735961914 + }, + { + "auxiliary_loss_clip": 0.01383511, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.10996664, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.0395830195466504, + "language_loss": 0.76049221, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78549099, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.734375, + "step": 213, + "time_per_iteration": 2.76625919342041 + }, + { + "auxiliary_loss_clip": 0.0138732, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.10833097, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 8.414558483522654, + "language_loss": 0.86754733, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89245206, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.78125, + "step": 214, + "time_per_iteration": 2.500417470932007 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01127788, + "balance_loss_clip": 1.07397687, + "balance_loss_mlp": 1.11549139, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3854037050744057, + "language_loss": 0.77357471, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79872084, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 215, + "time_per_iteration": 2.6116256713867188 + }, + { + "auxiliary_loss_clip": 0.01394912, + "auxiliary_loss_mlp": 0.01111048, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.11393261, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44498430810385, + "language_loss": 0.90545797, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93051755, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.8125, + "step": 216, + "time_per_iteration": 2.5903706550598145 + }, + { + "auxiliary_loss_clip": 0.0138678, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.05622888, + "balance_loss_mlp": 1.10772836, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.630220300857062, + "language_loss": 0.93660516, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96154928, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.78125, + "step": 217, + "time_per_iteration": 2.5109100341796875 + }, + { + "auxiliary_loss_clip": 0.01381618, + "auxiliary_loss_mlp": 0.01107152, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.10700643, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9165712032980975, + "language_loss": 0.93656206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96144974, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.75, + "step": 218, + "time_per_iteration": 2.6586077213287354 + }, + { + "auxiliary_loss_clip": 0.01376505, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.05820787, + "balance_loss_mlp": 1.10663593, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.916363531530835, + "language_loss": 0.86148179, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88633436, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.703125, + "step": 219, + "time_per_iteration": 2.584040880203247 + }, + { + "auxiliary_loss_clip": 0.01383955, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.05056047, + "balance_loss_mlp": 1.110309, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7731463199764816, + "language_loss": 0.87598741, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90083969, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.75, + "step": 220, + "time_per_iteration": 2.6294186115264893 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.10389161, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.991547522293572, + "language_loss": 0.86413074, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88890207, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.6875, + "step": 221, + "time_per_iteration": 2.606137990951538 + }, + { + "auxiliary_loss_clip": 0.0137878, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.10240269, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.017045003530743, + "language_loss": 0.92153138, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94641757, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.765625, + "step": 222, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.01377393, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.10672021, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.7127164790698606, + "language_loss": 0.95539695, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98022527, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.71875, + "step": 223, + "time_per_iteration": 2.679387092590332 + }, + { + "auxiliary_loss_clip": 0.01377947, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_clip": 1.05612004, + "balance_loss_mlp": 1.10671806, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5543531214735586, + "language_loss": 0.88022512, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90507382, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.71875, + "step": 224, + "time_per_iteration": 2.6327528953552246 + }, + { + "auxiliary_loss_clip": 0.0137715, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.10632586, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0812591886363183, + "language_loss": 0.89642018, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92121875, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 225, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01374075, + "auxiliary_loss_mlp": 0.01115854, + "balance_loss_clip": 1.06273401, + "balance_loss_mlp": 1.10547256, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.1555099546542142, + "language_loss": 0.99022663, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01512599, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.6875, + "step": 226, + "time_per_iteration": 5.38438868522644 + }, + { + "auxiliary_loss_clip": 0.0137773, + "auxiliary_loss_mlp": 0.01111487, + "balance_loss_clip": 1.0584867, + "balance_loss_mlp": 1.10696185, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 15.523681056640678, + "language_loss": 0.91210413, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93699628, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 227, + "time_per_iteration": 2.5391762256622314 + }, + { + "auxiliary_loss_clip": 0.01252818, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.00666487, + "balance_loss_mlp": 1.10911703, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.99230217192713, + "language_loss": 0.57680154, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59958327, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.4375, + "step": 228, + "time_per_iteration": 3.1981163024902344 + }, + { + "auxiliary_loss_clip": 0.0136686, + "auxiliary_loss_mlp": 0.01110654, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.10228515, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.2779006264878374, + "language_loss": 0.8759563, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90073144, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 229, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01377631, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.10486007, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.280679608747667, + "language_loss": 0.84247303, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8672685, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.734375, + "step": 230, + "time_per_iteration": 2.501218557357788 + }, + { + "auxiliary_loss_clip": 0.01375417, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.06671298, + "balance_loss_mlp": 1.10600948, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.78964280876859, + "language_loss": 0.90378422, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92870116, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.6875, + "step": 231, + "time_per_iteration": 2.541137456893921 + }, + { + "auxiliary_loss_clip": 0.01377441, + "auxiliary_loss_mlp": 0.01108629, + "balance_loss_clip": 1.05941916, + "balance_loss_mlp": 1.10821056, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.031489983297281, + "language_loss": 0.83706695, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86192763, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.6875, + "step": 232, + "time_per_iteration": 2.5444753170013428 + }, + { + "auxiliary_loss_clip": 0.0137977, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04125488, + "balance_loss_mlp": 1.10017753, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.928489064169697, + "language_loss": 0.74033689, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76505834, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.796875, + "step": 233, + "time_per_iteration": 2.5364952087402344 + }, + { + "auxiliary_loss_clip": 0.01382965, + "auxiliary_loss_mlp": 0.0112384, + "balance_loss_clip": 1.07141209, + "balance_loss_mlp": 1.10741055, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.287774019631123, + "language_loss": 0.85867143, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88373953, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 234, + "time_per_iteration": 2.532949209213257 + }, + { + "auxiliary_loss_clip": 0.01375298, + "auxiliary_loss_mlp": 0.01106064, + "balance_loss_clip": 1.05683041, + "balance_loss_mlp": 1.10759592, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6527993685177154, + "language_loss": 0.89144391, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9162575, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.671875, + "step": 235, + "time_per_iteration": 2.509592294692993 + }, + { + "auxiliary_loss_clip": 0.0137416, + "auxiliary_loss_mlp": 0.01119384, + "balance_loss_clip": 1.06874382, + "balance_loss_mlp": 1.10830367, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 4.054998173736759, + "language_loss": 0.85780042, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88273585, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.65625, + "step": 236, + "time_per_iteration": 2.744925022125244 + }, + { + "auxiliary_loss_clip": 0.0137118, + "auxiliary_loss_mlp": 0.01099258, + "balance_loss_clip": 1.04871392, + "balance_loss_mlp": 1.10178149, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.128422813257453, + "language_loss": 0.82452404, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84922838, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.6875, + "step": 237, + "time_per_iteration": 2.67307710647583 + }, + { + "auxiliary_loss_clip": 0.01369116, + "auxiliary_loss_mlp": 0.01116968, + "balance_loss_clip": 1.0643487, + "balance_loss_mlp": 1.10451889, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 3.103781307849977, + "language_loss": 0.77321362, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79807448, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.65625, + "step": 238, + "time_per_iteration": 2.4973809719085693 + }, + { + "auxiliary_loss_clip": 0.01368178, + "auxiliary_loss_mlp": 0.01112367, + "balance_loss_clip": 1.06566119, + "balance_loss_mlp": 1.10654771, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.992064896075991, + "language_loss": 0.87370872, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89851415, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.609375, + "step": 239, + "time_per_iteration": 2.554222583770752 + }, + { + "auxiliary_loss_clip": 0.01352979, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.09776592, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.2433371609956283, + "language_loss": 0.93297911, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95751429, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.5625, + "step": 240, + "time_per_iteration": 2.588529348373413 + }, + { + "auxiliary_loss_clip": 0.01362634, + "auxiliary_loss_mlp": 0.01104045, + "balance_loss_clip": 1.05736244, + "balance_loss_mlp": 1.10324717, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.299780828803648, + "language_loss": 0.85129881, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8759656, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.59375, + "step": 241, + "time_per_iteration": 2.607272148132324 + }, + { + "auxiliary_loss_clip": 0.01360778, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.10865557, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.927287768398498, + "language_loss": 0.88410223, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90887022, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.53125, + "step": 242, + "time_per_iteration": 2.522657632827759 + }, + { + "auxiliary_loss_clip": 0.013595, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.04981756, + "balance_loss_mlp": 1.10147619, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.6384412969740922, + "language_loss": 0.86817086, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89276373, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.578125, + "step": 243, + "time_per_iteration": 2.5738751888275146 + }, + { + "auxiliary_loss_clip": 0.01366378, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.05574584, + "balance_loss_mlp": 1.10421979, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.576084931358892, + "language_loss": 0.84271425, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86743093, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 244, + "time_per_iteration": 2.51370906829834 + }, + { + "auxiliary_loss_clip": 0.01374385, + "auxiliary_loss_mlp": 0.01115077, + "balance_loss_clip": 1.06403196, + "balance_loss_mlp": 1.10701251, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.2775099056278916, + "language_loss": 0.78689361, + "learning_rate": 3.54199711087864e-06, + "loss": 0.8117882, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.671875, + "step": 245, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01372772, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.10232484, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2330220282190685, + "language_loss": 0.84241545, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86717069, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 246, + "time_per_iteration": 2.565614700317383 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01097455, + "balance_loss_clip": 1.04722059, + "balance_loss_mlp": 1.10181057, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9335653980079095, + "language_loss": 0.9014703, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92611909, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 247, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01365881, + "auxiliary_loss_mlp": 0.01097755, + "balance_loss_clip": 1.04952252, + "balance_loss_mlp": 1.09689593, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.1205098484246734, + "language_loss": 0.78058362, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80521989, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.6875, + "step": 248, + "time_per_iteration": 2.5365517139434814 + }, + { + "auxiliary_loss_clip": 0.0136687, + "auxiliary_loss_mlp": 0.01105288, + "balance_loss_clip": 1.05552983, + "balance_loss_mlp": 1.10545397, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1747011613954177, + "language_loss": 0.83849227, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86321384, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.609375, + "step": 249, + "time_per_iteration": 2.6142020225524902 + }, + { + "auxiliary_loss_clip": 0.01360073, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.05806887, + "balance_loss_mlp": 1.09971058, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.2137591284686455, + "language_loss": 0.93476778, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95942914, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 2.609375, + "step": 250, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01369254, + "auxiliary_loss_mlp": 0.01114661, + "balance_loss_clip": 1.06351972, + "balance_loss_mlp": 1.10460913, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.2612141068319622, + "language_loss": 0.97030997, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99514914, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.640625, + "step": 251, + "time_per_iteration": 2.5887296199798584 + }, + { + "auxiliary_loss_clip": 0.01362288, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.05723596, + "balance_loss_mlp": 1.09872079, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.0465178965121136, + "language_loss": 0.8428089, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86748511, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.640625, + "step": 252, + "time_per_iteration": 2.5749199390411377 + }, + { + "auxiliary_loss_clip": 0.01357969, + "auxiliary_loss_mlp": 0.01114738, + "balance_loss_clip": 1.06569552, + "balance_loss_mlp": 1.10169089, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.482990993198259, + "language_loss": 0.98208833, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00681543, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.5625, + "step": 253, + "time_per_iteration": 2.5639333724975586 + }, + { + "auxiliary_loss_clip": 0.01233728, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.00855541, + "balance_loss_mlp": 1.09965372, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8505459641429172, + "language_loss": 0.55672622, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57933319, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.34375, + "step": 254, + "time_per_iteration": 3.1063449382781982 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.06687438, + "balance_loss_mlp": 1.09652638, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.4360968938917065, + "language_loss": 0.90453845, + "learning_rate": 3.567754632921479e-06, + "loss": 0.9293263, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 255, + "time_per_iteration": 2.5746912956237793 + }, + { + "auxiliary_loss_clip": 0.01358909, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.07568169, + "balance_loss_mlp": 1.09931397, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.2666703391376903, + "language_loss": 0.8562001, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8810457, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.59375, + "step": 256, + "time_per_iteration": 2.6095149517059326 + }, + { + "auxiliary_loss_clip": 0.01366413, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_clip": 1.06305718, + "balance_loss_mlp": 1.09961021, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.7442871984488386, + "language_loss": 0.71504897, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73983842, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 257, + "time_per_iteration": 2.5939691066741943 + }, + { + "auxiliary_loss_clip": 0.01357007, + "auxiliary_loss_mlp": 0.01100177, + "balance_loss_clip": 1.05087197, + "balance_loss_mlp": 1.09875202, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9522192109187282, + "language_loss": 0.94659579, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97116768, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.578125, + "step": 258, + "time_per_iteration": 2.7119739055633545 + }, + { + "auxiliary_loss_clip": 0.01356701, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.05615926, + "balance_loss_mlp": 1.09608126, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.167214789879638, + "language_loss": 0.93174207, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95635182, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.59375, + "step": 259, + "time_per_iteration": 2.6776607036590576 + }, + { + "auxiliary_loss_clip": 0.01351639, + "auxiliary_loss_mlp": 0.010988, + "balance_loss_clip": 1.05297637, + "balance_loss_mlp": 1.10035825, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.1226725879970605, + "language_loss": 0.97360909, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99811351, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 2.515625, + "step": 260, + "time_per_iteration": 2.520759105682373 + }, + { + "auxiliary_loss_clip": 0.01365989, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.06282747, + "balance_loss_mlp": 1.10060608, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3569711169381, + "language_loss": 0.87644511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90120584, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.65625, + "step": 261, + "time_per_iteration": 2.5837602615356445 + }, + { + "auxiliary_loss_clip": 0.0135711, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.05613816, + "balance_loss_mlp": 1.09709311, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.9926513495738176, + "language_loss": 0.67226446, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69688779, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.59375, + "step": 262, + "time_per_iteration": 2.5490784645080566 + }, + { + "auxiliary_loss_clip": 0.01354995, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.07145, + "balance_loss_mlp": 1.0984714, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.3019763169045637, + "language_loss": 0.68570435, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71047044, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.5625, + "step": 263, + "time_per_iteration": 2.5207104682922363 + }, + { + "auxiliary_loss_clip": 0.01355963, + "auxiliary_loss_mlp": 0.01105396, + "balance_loss_clip": 1.055686, + "balance_loss_mlp": 1.09446979, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.705792502973735, + "language_loss": 0.85120308, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87581658, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 264, + "time_per_iteration": 2.559406280517578 + }, + { + "auxiliary_loss_clip": 0.01361439, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.10003614, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.651007312001026, + "language_loss": 1.04371059, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06825411, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.625, + "step": 265, + "time_per_iteration": 2.5076427459716797 + }, + { + "auxiliary_loss_clip": 0.01364923, + "auxiliary_loss_mlp": 0.01114141, + "balance_loss_clip": 1.06266677, + "balance_loss_mlp": 1.10278761, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2797174203272705, + "language_loss": 0.75153112, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77632177, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.625, + "step": 266, + "time_per_iteration": 2.52923583984375 + }, + { + "auxiliary_loss_clip": 0.01351984, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.05321336, + "balance_loss_mlp": 1.10004377, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7047265515665009, + "language_loss": 0.90568709, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93022615, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 2.515625, + "step": 267, + "time_per_iteration": 4.033226251602173 + }, + { + "auxiliary_loss_clip": 0.01359316, + "auxiliary_loss_mlp": 0.01118854, + "balance_loss_clip": 1.07143235, + "balance_loss_mlp": 1.09878063, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.258126572730018, + "language_loss": 0.86044276, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88522446, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.609375, + "step": 268, + "time_per_iteration": 3.9120936393737793 + }, + { + "auxiliary_loss_clip": 0.01352601, + "auxiliary_loss_mlp": 0.01098281, + "balance_loss_clip": 1.05186045, + "balance_loss_mlp": 1.10092831, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.4793793476816335, + "language_loss": 0.88284534, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90735412, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 2.515625, + "step": 269, + "time_per_iteration": 2.5170347690582275 + }, + { + "auxiliary_loss_clip": 0.01357286, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.04901874, + "balance_loss_mlp": 1.09723783, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.082153756456244, + "language_loss": 0.97073388, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99530637, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.59375, + "step": 270, + "time_per_iteration": 2.4856350421905518 + }, + { + "auxiliary_loss_clip": 0.01357366, + "auxiliary_loss_mlp": 0.01117767, + "balance_loss_clip": 1.07001138, + "balance_loss_mlp": 1.10259032, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1071719511680755, + "language_loss": 0.85919821, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88394946, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.546875, + "step": 271, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01355041, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.05201519, + "balance_loss_mlp": 1.09418058, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.6330072162998523, + "language_loss": 0.81509304, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83964115, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.609375, + "step": 272, + "time_per_iteration": 2.563840389251709 + }, + { + "auxiliary_loss_clip": 0.01348825, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.07229137, + "balance_loss_mlp": 1.09649634, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4112371858801436, + "language_loss": 0.81101978, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83568847, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.515625, + "step": 273, + "time_per_iteration": 2.504791736602783 + }, + { + "auxiliary_loss_clip": 0.01348205, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.05825627, + "balance_loss_mlp": 1.0930239, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.3125197915452387, + "language_loss": 0.91599321, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94053519, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.5625, + "step": 274, + "time_per_iteration": 2.530883312225342 + }, + { + "auxiliary_loss_clip": 0.01346841, + "auxiliary_loss_mlp": 0.01110058, + "balance_loss_clip": 1.06154013, + "balance_loss_mlp": 1.09588742, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8828740595481548, + "language_loss": 0.87952697, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90409595, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 2.515625, + "step": 275, + "time_per_iteration": 2.6067841053009033 + }, + { + "auxiliary_loss_clip": 0.01349399, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.05481219, + "balance_loss_mlp": 1.09579742, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.8814357547622875, + "language_loss": 0.80717576, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83170903, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.53125, + "step": 276, + "time_per_iteration": 2.5251641273498535 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.0561676, + "balance_loss_mlp": 1.0946306, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.7238418569970533, + "language_loss": 0.81033546, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83474076, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.46875, + "step": 277, + "time_per_iteration": 2.6796398162841797 + }, + { + "auxiliary_loss_clip": 0.01338755, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.08789539, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.810922211495867, + "language_loss": 0.80307728, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82741719, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.515625, + "step": 278, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01343866, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.05634809, + "balance_loss_mlp": 1.09381282, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.7778988036026468, + "language_loss": 0.90482658, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92928004, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 279, + "time_per_iteration": 2.571439504623413 + }, + { + "auxiliary_loss_clip": 0.01348727, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.06872559, + "balance_loss_mlp": 1.09391451, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.0477743200742387, + "language_loss": 0.94153798, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96618605, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.546875, + "step": 280, + "time_per_iteration": 2.5161728858947754 + }, + { + "auxiliary_loss_clip": 0.0134865, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.05864, + "balance_loss_mlp": 1.09245062, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 3.578687135351882, + "language_loss": 0.73929775, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76385343, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 2.5625, + "step": 281, + "time_per_iteration": 2.616241931915283 + }, + { + "auxiliary_loss_clip": 0.01343434, + "auxiliary_loss_mlp": 0.0111488, + "balance_loss_clip": 1.06977129, + "balance_loss_mlp": 1.09390783, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.679798242609796, + "language_loss": 0.80207133, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82665443, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.5, + "step": 282, + "time_per_iteration": 2.5421135425567627 + }, + { + "auxiliary_loss_clip": 0.01348806, + "auxiliary_loss_mlp": 0.01117348, + "balance_loss_clip": 1.0704273, + "balance_loss_mlp": 1.09599137, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1184562475367916, + "language_loss": 0.77788174, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80254328, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.53125, + "step": 283, + "time_per_iteration": 2.516474485397339 + }, + { + "auxiliary_loss_clip": 0.01349252, + "auxiliary_loss_mlp": 0.01091995, + "balance_loss_clip": 1.04788804, + "balance_loss_mlp": 1.09700751, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1009174504018544, + "language_loss": 0.84172702, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86613953, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.515625, + "step": 284, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01339164, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_clip": 1.05216956, + "balance_loss_mlp": 1.09148788, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.014395623363928, + "language_loss": 0.96993905, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99432468, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.46875, + "step": 285, + "time_per_iteration": 2.5412731170654297 + }, + { + "auxiliary_loss_clip": 0.01342544, + "auxiliary_loss_mlp": 0.01093983, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.09407294, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.2067050643741433, + "language_loss": 0.93951917, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96388453, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.484375, + "step": 286, + "time_per_iteration": 2.5895566940307617 + }, + { + "auxiliary_loss_clip": 0.0133546, + "auxiliary_loss_mlp": 0.01090331, + "balance_loss_clip": 1.04503167, + "balance_loss_mlp": 1.08924019, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.8729510510678706, + "language_loss": 0.92157722, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94583511, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 287, + "time_per_iteration": 2.6144802570343018 + }, + { + "auxiliary_loss_clip": 0.01338793, + "auxiliary_loss_mlp": 0.01089685, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.08859432, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2271144452092564, + "language_loss": 1.02026963, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04455447, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 288, + "time_per_iteration": 2.488274097442627 + }, + { + "auxiliary_loss_clip": 0.01222501, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02000237, + "balance_loss_mlp": 1.09325862, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9131614435254132, + "language_loss": 0.63915455, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66174459, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 1.296875, + "step": 289, + "time_per_iteration": 3.222426652908325 + }, + { + "auxiliary_loss_clip": 0.01341104, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.06379664, + "balance_loss_mlp": 1.09403992, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4014361624695173, + "language_loss": 0.88569438, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91018069, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 290, + "time_per_iteration": 2.49294114112854 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01091523, + "balance_loss_clip": 1.04631877, + "balance_loss_mlp": 1.09248078, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.156562479490788, + "language_loss": 0.84578067, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87007844, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.453125, + "step": 291, + "time_per_iteration": 2.5356485843658447 + }, + { + "auxiliary_loss_clip": 0.01345108, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.05897939, + "balance_loss_mlp": 1.10042334, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6617628708439536, + "language_loss": 0.72766221, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75218308, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.453125, + "step": 292, + "time_per_iteration": 2.6524176597595215 + }, + { + "auxiliary_loss_clip": 0.01333825, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.09236324, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2014441192179866, + "language_loss": 0.8726995, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89705306, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.40625, + "step": 293, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01334314, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.05959213, + "balance_loss_mlp": 1.09177744, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.3120260424061367, + "language_loss": 0.81276119, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83714324, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.4375, + "step": 294, + "time_per_iteration": 2.568784236907959 + }, + { + "auxiliary_loss_clip": 0.01334452, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.06274807, + "balance_loss_mlp": 1.08824301, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.9227055740425705, + "language_loss": 0.83710909, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86153215, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.46875, + "step": 295, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01339817, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.09874845, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.5339269047951727, + "language_loss": 0.84620988, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87071538, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.40625, + "step": 296, + "time_per_iteration": 2.5243051052093506 + }, + { + "auxiliary_loss_clip": 0.01338756, + "auxiliary_loss_mlp": 0.01097832, + "balance_loss_clip": 1.05417752, + "balance_loss_mlp": 1.09317493, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.123858619871597, + "language_loss": 0.87729871, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.453125, + "step": 297, + "time_per_iteration": 2.5186710357666016 + }, + { + "auxiliary_loss_clip": 0.01337139, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.05713463, + "balance_loss_mlp": 1.09108877, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.170328911832355, + "language_loss": 0.88528925, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90966904, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 298, + "time_per_iteration": 2.5320143699645996 + }, + { + "auxiliary_loss_clip": 0.0133273, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.07234538, + "balance_loss_mlp": 1.09249902, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8938405886263965, + "language_loss": 0.88666737, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91117901, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.40625, + "step": 299, + "time_per_iteration": 2.588275671005249 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01105829, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.09275746, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.2936483356677253, + "language_loss": 0.64349103, + "learning_rate": 3.672392800539357e-06, + "loss": 0.66795039, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 300, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01338706, + "auxiliary_loss_mlp": 0.01105447, + "balance_loss_clip": 1.05986142, + "balance_loss_mlp": 1.09540462, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.310898752337597, + "language_loss": 0.88330823, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774977, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.4375, + "step": 301, + "time_per_iteration": 2.499481439590454 + }, + { + "auxiliary_loss_clip": 0.01214573, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.00932336, + "balance_loss_mlp": 1.08753991, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8370211186232274, + "language_loss": 0.62198341, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64437497, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 1.265625, + "step": 302, + "time_per_iteration": 3.259997844696045 + }, + { + "auxiliary_loss_clip": 0.01329895, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.06198907, + "balance_loss_mlp": 1.08938098, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.491293816938874, + "language_loss": 0.89617372, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92054749, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 2.40625, + "step": 303, + "time_per_iteration": 2.536773920059204 + }, + { + "auxiliary_loss_clip": 0.01336859, + "auxiliary_loss_mlp": 0.01114111, + "balance_loss_clip": 1.06778669, + "balance_loss_mlp": 1.09363747, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 4.887297609803561, + "language_loss": 0.80314684, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.4375, + "step": 304, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.05144823, + "balance_loss_mlp": 1.09657788, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.8235558005033383, + "language_loss": 0.82894015, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85320443, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.34375, + "step": 305, + "time_per_iteration": 2.5195910930633545 + }, + { + "auxiliary_loss_clip": 0.01332168, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_clip": 1.04993677, + "balance_loss_mlp": 1.08868921, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9087210074301977, + "language_loss": 0.90843809, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93269092, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 306, + "time_per_iteration": 2.501276969909668 + }, + { + "auxiliary_loss_clip": 0.01324982, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.08638549, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1762826783898586, + "language_loss": 0.86435306, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88850832, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.390625, + "step": 307, + "time_per_iteration": 2.6048038005828857 + }, + { + "auxiliary_loss_clip": 0.01325097, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.05817199, + "balance_loss_mlp": 1.09046888, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.221444292833677, + "language_loss": 0.71723771, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74155033, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.34375, + "step": 308, + "time_per_iteration": 2.513774871826172 + }, + { + "auxiliary_loss_clip": 0.01331987, + "auxiliary_loss_mlp": 0.01102938, + "balance_loss_clip": 1.05904555, + "balance_loss_mlp": 1.08861351, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2254161740825293, + "language_loss": 0.91952753, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94387674, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 2.4375, + "step": 309, + "time_per_iteration": 5.224750280380249 + }, + { + "auxiliary_loss_clip": 0.01338325, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.08840334, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.8056803187702135, + "language_loss": 0.72399509, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74842793, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 310, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01330325, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_clip": 1.06790328, + "balance_loss_mlp": 1.09306264, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 12.392698164772181, + "language_loss": 0.74104297, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76546776, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.375, + "step": 311, + "time_per_iteration": 2.734072208404541 + }, + { + "auxiliary_loss_clip": 0.01337963, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.06619668, + "balance_loss_mlp": 1.09045064, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.2753160661232603, + "language_loss": 0.91518372, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93965685, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.46875, + "step": 312, + "time_per_iteration": 2.5117411613464355 + }, + { + "auxiliary_loss_clip": 0.01336169, + "auxiliary_loss_mlp": 0.01112089, + "balance_loss_clip": 1.06609774, + "balance_loss_mlp": 1.09088099, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.320247917383294, + "language_loss": 0.89746982, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92195237, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.453125, + "step": 313, + "time_per_iteration": 2.4761838912963867 + }, + { + "auxiliary_loss_clip": 0.01340305, + "auxiliary_loss_mlp": 0.01098393, + "balance_loss_clip": 1.05230689, + "balance_loss_mlp": 1.09061432, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3404867001555236, + "language_loss": 0.73099983, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75538683, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 314, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.01326469, + "auxiliary_loss_mlp": 0.01103837, + "balance_loss_clip": 1.06101751, + "balance_loss_mlp": 1.08694446, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.192553769026804, + "language_loss": 0.89887041, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92317349, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 315, + "time_per_iteration": 2.5857741832733154 + }, + { + "auxiliary_loss_clip": 0.01329672, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.05170512, + "balance_loss_mlp": 1.08870411, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8364758613144732, + "language_loss": 0.80796063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83221763, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.40625, + "step": 316, + "time_per_iteration": 2.5222342014312744 + }, + { + "auxiliary_loss_clip": 0.01324399, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.05131364, + "balance_loss_mlp": 1.08633423, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.1363686538021236, + "language_loss": 0.90357143, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92776608, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.375, + "step": 317, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01319895, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.0515281, + "balance_loss_mlp": 1.0845592, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.5260192321083794, + "language_loss": 0.90939772, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93355227, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.34375, + "step": 318, + "time_per_iteration": 2.488128185272217 + }, + { + "auxiliary_loss_clip": 0.01324457, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.04706657, + "balance_loss_mlp": 1.08574772, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.626221841877022, + "language_loss": 0.93980259, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96393579, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 319, + "time_per_iteration": 2.5184502601623535 + }, + { + "auxiliary_loss_clip": 0.01205117, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_clip": 1.06586683, + "balance_loss_mlp": 1.07482553, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9345393611259016, + "language_loss": 0.59860981, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62146461, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 1.296875, + "step": 320, + "time_per_iteration": 3.0250258445739746 + }, + { + "auxiliary_loss_clip": 0.01320993, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.05827808, + "balance_loss_mlp": 1.08425927, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.0799113353921572, + "language_loss": 0.89622325, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92044175, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.375, + "step": 321, + "time_per_iteration": 2.476439952850342 + }, + { + "auxiliary_loss_clip": 0.01332068, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.07620978, + "balance_loss_mlp": 1.08993089, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.068543890023447, + "language_loss": 0.82884163, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85337007, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 2.421875, + "step": 322, + "time_per_iteration": 2.556302309036255 + }, + { + "auxiliary_loss_clip": 0.01332156, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.04828596, + "balance_loss_mlp": 1.08754158, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.2506232399398245, + "language_loss": 0.72734368, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75156873, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.453125, + "step": 323, + "time_per_iteration": 2.5033397674560547 + }, + { + "auxiliary_loss_clip": 0.01318896, + "auxiliary_loss_mlp": 0.01090622, + "balance_loss_clip": 1.04763484, + "balance_loss_mlp": 1.08184087, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.023515622890843, + "language_loss": 0.92639947, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95049465, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.375, + "step": 324, + "time_per_iteration": 2.5194544792175293 + }, + { + "auxiliary_loss_clip": 0.01328869, + "auxiliary_loss_mlp": 0.01089838, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.08943164, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 4.018466874717804, + "language_loss": 0.65336061, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67754775, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.390625, + "step": 325, + "time_per_iteration": 2.5107386112213135 + }, + { + "auxiliary_loss_clip": 0.0132709, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.093485, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.921455060851243, + "language_loss": 0.76449442, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78877723, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.34375, + "step": 326, + "time_per_iteration": 2.5080325603485107 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.06015599, + "balance_loss_mlp": 1.08845115, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1551163890972123, + "language_loss": 0.79176939, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8160091, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 327, + "time_per_iteration": 2.5449633598327637 + }, + { + "auxiliary_loss_clip": 0.01326802, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.06984949, + "balance_loss_mlp": 1.08873606, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.1574079642063246, + "language_loss": 0.80725288, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83164048, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.390625, + "step": 328, + "time_per_iteration": 2.5418970584869385 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.08396721, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.245263087715646, + "language_loss": 0.93704766, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96127105, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.40625, + "step": 329, + "time_per_iteration": 2.4910004138946533 + }, + { + "auxiliary_loss_clip": 0.01332781, + "auxiliary_loss_mlp": 0.01105781, + "balance_loss_clip": 1.06253231, + "balance_loss_mlp": 1.08930123, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.9776357674257365, + "language_loss": 0.74277973, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7671653, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 330, + "time_per_iteration": 2.51430082321167 + }, + { + "auxiliary_loss_clip": 0.01328701, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.07814097, + "balance_loss_mlp": 1.08762872, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.972763157156593, + "language_loss": 0.93870068, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96319681, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 331, + "time_per_iteration": 2.4759159088134766 + }, + { + "auxiliary_loss_clip": 0.01316192, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.04938233, + "balance_loss_mlp": 1.0853951, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.6958694906457836, + "language_loss": 0.92730892, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95136791, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 332, + "time_per_iteration": 2.49817156791687 + }, + { + "auxiliary_loss_clip": 0.01325132, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.04903162, + "balance_loss_mlp": 1.09081161, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.6289067025313777, + "language_loss": 0.75589794, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78007442, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.34375, + "step": 333, + "time_per_iteration": 2.5180609226226807 + }, + { + "auxiliary_loss_clip": 0.01323371, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.08625877, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1766901409232426, + "language_loss": 0.78768885, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81179881, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.375, + "step": 334, + "time_per_iteration": 2.614708423614502 + }, + { + "auxiliary_loss_clip": 0.01324397, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.08276975, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.4059127888346916, + "language_loss": 0.83083838, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85503072, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 2.421875, + "step": 335, + "time_per_iteration": 2.495260000228882 + }, + { + "auxiliary_loss_clip": 0.01320649, + "auxiliary_loss_mlp": 0.01090782, + "balance_loss_clip": 1.04934454, + "balance_loss_mlp": 1.08585882, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.166489879958422, + "language_loss": 0.92639577, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95051014, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.34375, + "step": 336, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01321744, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.04139614, + "balance_loss_mlp": 1.08352447, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.825762702383362, + "language_loss": 0.88474333, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90879244, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 337, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01310297, + "auxiliary_loss_mlp": 0.01101804, + "balance_loss_clip": 1.05836427, + "balance_loss_mlp": 1.08001363, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5415234153999902, + "language_loss": 0.89914495, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92326593, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 338, + "time_per_iteration": 2.5795979499816895 + }, + { + "auxiliary_loss_clip": 0.01324391, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.05742574, + "balance_loss_mlp": 1.08479571, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.047046576054304, + "language_loss": 0.84801471, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87225461, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.40625, + "step": 339, + "time_per_iteration": 2.4558403491973877 + }, + { + "auxiliary_loss_clip": 0.01326609, + "auxiliary_loss_mlp": 0.01093427, + "balance_loss_clip": 1.05001152, + "balance_loss_mlp": 1.08709431, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7544231793273473, + "language_loss": 0.88913274, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91333312, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.40625, + "step": 340, + "time_per_iteration": 2.5330188274383545 + }, + { + "auxiliary_loss_clip": 0.01323557, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.0859195, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.2340783182785975, + "language_loss": 0.88071406, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 2.375, + "step": 341, + "time_per_iteration": 2.502161979675293 + }, + { + "auxiliary_loss_clip": 0.01325847, + "auxiliary_loss_mlp": 0.01099304, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.08389783, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 3.2005009235922572, + "language_loss": 0.80293322, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82718468, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.421875, + "step": 342, + "time_per_iteration": 2.5315535068511963 + }, + { + "auxiliary_loss_clip": 0.0131301, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.05293417, + "balance_loss_mlp": 1.08132875, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.399130254204822, + "language_loss": 0.89451253, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91862881, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.3125, + "step": 343, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01325104, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_clip": 1.05342627, + "balance_loss_mlp": 1.08973229, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3234219523507296, + "language_loss": 0.78252918, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80672336, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.359375, + "step": 344, + "time_per_iteration": 2.514665365219116 + }, + { + "auxiliary_loss_clip": 0.01309596, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.05730188, + "balance_loss_mlp": 1.08079529, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8371023099908983, + "language_loss": 0.75138956, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77549529, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.28125, + "step": 345, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01318525, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.05806339, + "balance_loss_mlp": 1.08789146, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.0741733748571565, + "language_loss": 0.90269232, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92688763, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.3125, + "step": 346, + "time_per_iteration": 2.5487060546875 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01095048, + "balance_loss_clip": 1.05527973, + "balance_loss_mlp": 1.08358788, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0766581400667, + "language_loss": 0.78869188, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.3125, + "step": 347, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.01317315, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.05335259, + "balance_loss_mlp": 1.08719826, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.4234628631287927, + "language_loss": 0.71424043, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7383827, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.3125, + "step": 348, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01319638, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.0595324, + "balance_loss_mlp": 1.08435416, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 4.002924557181807, + "language_loss": 0.76819432, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79240972, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.34375, + "step": 349, + "time_per_iteration": 2.4884049892425537 + }, + { + "auxiliary_loss_clip": 0.0130292, + "auxiliary_loss_mlp": 0.0109884, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.08141851, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.9115672624672835, + "language_loss": 0.85271406, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87673163, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 350, + "time_per_iteration": 2.559812307357788 + }, + { + "auxiliary_loss_clip": 0.01311162, + "auxiliary_loss_mlp": 0.01089483, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.08571863, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.3355222976898764, + "language_loss": 0.80104828, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82505476, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.25, + "step": 351, + "time_per_iteration": 5.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01318524, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.06048024, + "balance_loss_mlp": 1.08623564, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8473493260702125, + "language_loss": 0.87258279, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89680254, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 352, + "time_per_iteration": 2.4787278175354004 + }, + { + "auxiliary_loss_clip": 0.01312545, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.06248152, + "balance_loss_mlp": 1.08574009, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8920106465676412, + "language_loss": 0.82386625, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84804279, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.265625, + "step": 353, + "time_per_iteration": 2.5428433418273926 + }, + { + "auxiliary_loss_clip": 0.01307832, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.05133069, + "balance_loss_mlp": 1.08353949, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.0636001035279694, + "language_loss": 0.8102631, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83425963, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.25, + "step": 354, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01315043, + "auxiliary_loss_mlp": 0.01092413, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.08190715, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.8065821662627575, + "language_loss": 0.80764574, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83172029, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 355, + "time_per_iteration": 2.56968355178833 + }, + { + "auxiliary_loss_clip": 0.01310125, + "auxiliary_loss_mlp": 0.01086869, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.08140039, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2488803729957, + "language_loss": 0.89553398, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91950381, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 356, + "time_per_iteration": 2.5510213375091553 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.08451605, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.7055681522526522, + "language_loss": 0.80032516, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82424533, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.234375, + "step": 357, + "time_per_iteration": 2.5834848880767822 + }, + { + "auxiliary_loss_clip": 0.01311386, + "auxiliary_loss_mlp": 0.0108216, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.08195996, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3810225918991827, + "language_loss": 0.7661376, + "learning_rate": 3.786194003461506e-06, + "loss": 0.7900731, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.296875, + "step": 358, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01308618, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.04574156, + "balance_loss_mlp": 1.08024073, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 3.004949550769694, + "language_loss": 0.88491321, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90888453, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.28125, + "step": 359, + "time_per_iteration": 2.452698230743408 + }, + { + "auxiliary_loss_clip": 0.01316066, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.05000377, + "balance_loss_mlp": 1.08438587, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.789884231725057, + "language_loss": 0.76007903, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.3125, + "step": 360, + "time_per_iteration": 2.490006685256958 + }, + { + "auxiliary_loss_clip": 0.01189834, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06634831, + "balance_loss_mlp": 1.06162107, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8685264055585812, + "language_loss": 0.64943242, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67212784, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 1.28125, + "step": 361, + "time_per_iteration": 3.1978280544281006 + }, + { + "auxiliary_loss_clip": 0.01307066, + "auxiliary_loss_mlp": 0.01088482, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.0776422, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.6839093883440213, + "language_loss": 0.78157276, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80552828, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.296875, + "step": 362, + "time_per_iteration": 2.5401153564453125 + }, + { + "auxiliary_loss_clip": 0.0131339, + "auxiliary_loss_mlp": 0.01092034, + "balance_loss_clip": 1.05171776, + "balance_loss_mlp": 1.08265781, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.163466714708112, + "language_loss": 0.92508751, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94914174, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 363, + "time_per_iteration": 2.4868171215057373 + }, + { + "auxiliary_loss_clip": 0.01307593, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_clip": 1.06270981, + "balance_loss_mlp": 1.08121252, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.137373361500905, + "language_loss": 0.89611077, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92020839, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 364, + "time_per_iteration": 2.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01313873, + "auxiliary_loss_mlp": 0.01094072, + "balance_loss_clip": 1.05232477, + "balance_loss_mlp": 1.08512843, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.0040846596101867, + "language_loss": 0.79597497, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82005441, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.28125, + "step": 365, + "time_per_iteration": 2.5358779430389404 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_clip": 1.05218291, + "balance_loss_mlp": 1.08262253, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.4198695758814126, + "language_loss": 0.84312123, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86713445, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.25, + "step": 366, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01314411, + "auxiliary_loss_mlp": 0.01089093, + "balance_loss_clip": 1.05008757, + "balance_loss_mlp": 1.08409071, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.4790438398014114, + "language_loss": 0.87009263, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89412761, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.296875, + "step": 367, + "time_per_iteration": 2.486476421356201 + }, + { + "auxiliary_loss_clip": 0.01315695, + "auxiliary_loss_mlp": 0.01094559, + "balance_loss_clip": 1.05247772, + "balance_loss_mlp": 1.08183074, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.1787846704720906, + "language_loss": 0.84725291, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87135541, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.34375, + "step": 368, + "time_per_iteration": 2.522035837173462 + }, + { + "auxiliary_loss_clip": 0.01314671, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.04559815, + "balance_loss_mlp": 1.07997978, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.205334425353566, + "language_loss": 0.75328851, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77728999, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.34375, + "step": 369, + "time_per_iteration": 2.5247385501861572 + }, + { + "auxiliary_loss_clip": 0.01309465, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.06241453, + "balance_loss_mlp": 1.08204889, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.195001895084689, + "language_loss": 0.82444763, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.84857059, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.28125, + "step": 370, + "time_per_iteration": 2.556654453277588 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.01102256, + "balance_loss_clip": 1.06186807, + "balance_loss_mlp": 1.08148122, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.701167396379405, + "language_loss": 0.81576145, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83986878, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.265625, + "step": 371, + "time_per_iteration": 2.5303707122802734 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01097647, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.08685589, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.421527930745161, + "language_loss": 0.83273733, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85685182, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 372, + "time_per_iteration": 2.528141975402832 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.01093239, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.08068216, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.9515576064335742, + "language_loss": 0.78448784, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80846798, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.234375, + "step": 373, + "time_per_iteration": 2.4879236221313477 + }, + { + "auxiliary_loss_clip": 0.01310159, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.08387947, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.577150517784044, + "language_loss": 0.77507353, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79906291, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.265625, + "step": 374, + "time_per_iteration": 2.467660665512085 + }, + { + "auxiliary_loss_clip": 0.01300907, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_clip": 1.03415811, + "balance_loss_mlp": 1.07458413, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 2.1361288872426187, + "language_loss": 0.85989249, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.8836568, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.265625, + "step": 375, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01307901, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.05767775, + "balance_loss_mlp": 1.08341241, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 5.5735447387306785, + "language_loss": 0.89170349, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91578341, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.25, + "step": 376, + "time_per_iteration": 2.53151798248291 + }, + { + "auxiliary_loss_clip": 0.01309113, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.04908752, + "balance_loss_mlp": 1.07899499, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.261190841992283, + "language_loss": 0.74947262, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77344215, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.3125, + "step": 377, + "time_per_iteration": 2.463115692138672 + }, + { + "auxiliary_loss_clip": 0.0129987, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.05009794, + "balance_loss_mlp": 1.08131123, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 9.398931100052017, + "language_loss": 0.99195766, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01586914, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 2.1875, + "step": 378, + "time_per_iteration": 2.4765851497650146 + }, + { + "auxiliary_loss_clip": 0.01180245, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.10910404, + "balance_loss_mlp": 1.06006432, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9843357397114052, + "language_loss": 0.75457036, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77759647, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.203125, + "step": 379, + "time_per_iteration": 3.113067388534546 + }, + { + "auxiliary_loss_clip": 0.01308809, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.0448581, + "balance_loss_mlp": 1.07811105, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 4.195302770466088, + "language_loss": 0.78423429, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80815697, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.3125, + "step": 380, + "time_per_iteration": 2.6457204818725586 + }, + { + "auxiliary_loss_clip": 0.01302565, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.04527259, + "balance_loss_mlp": 1.08019924, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.272240555091753, + "language_loss": 0.9679752, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99183118, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.21875, + "step": 381, + "time_per_iteration": 2.485316038131714 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.04501581, + "balance_loss_mlp": 1.08177519, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.322972014312181, + "language_loss": 0.88035834, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90432727, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.3125, + "step": 382, + "time_per_iteration": 2.5361156463623047 + }, + { + "auxiliary_loss_clip": 0.01306631, + "auxiliary_loss_mlp": 0.01099641, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.08242524, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.197151340607638, + "language_loss": 0.84830511, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87236774, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.25, + "step": 383, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.01303681, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.06673658, + "balance_loss_mlp": 1.08259249, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2992198386883116, + "language_loss": 0.83199835, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85609907, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.203125, + "step": 384, + "time_per_iteration": 2.5008413791656494 + }, + { + "auxiliary_loss_clip": 0.01303616, + "auxiliary_loss_mlp": 0.0109643, + "balance_loss_clip": 1.06030965, + "balance_loss_mlp": 1.08539534, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8570399395654076, + "language_loss": 0.89240694, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91640741, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.1875, + "step": 385, + "time_per_iteration": 2.4913859367370605 + }, + { + "auxiliary_loss_clip": 0.01306859, + "auxiliary_loss_mlp": 0.01121647, + "balance_loss_clip": 1.08397639, + "balance_loss_mlp": 1.0826149, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.2576284783670357, + "language_loss": 0.70096415, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72524917, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.234375, + "step": 386, + "time_per_iteration": 2.5017154216766357 + }, + { + "auxiliary_loss_clip": 0.01308067, + "auxiliary_loss_mlp": 0.01098351, + "balance_loss_clip": 1.06072879, + "balance_loss_mlp": 1.08460176, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9470877788533054, + "language_loss": 0.87909782, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90316188, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.234375, + "step": 387, + "time_per_iteration": 2.5142157077789307 + }, + { + "auxiliary_loss_clip": 0.01308318, + "auxiliary_loss_mlp": 0.01085815, + "balance_loss_clip": 1.04666662, + "balance_loss_mlp": 1.08291698, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.441105853176172, + "language_loss": 0.83429295, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85823429, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.25, + "step": 388, + "time_per_iteration": 2.591242790222168 + }, + { + "auxiliary_loss_clip": 0.01305661, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_clip": 1.05754054, + "balance_loss_mlp": 1.08271885, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2646980282386644, + "language_loss": 0.93823689, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96223652, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.21875, + "step": 389, + "time_per_iteration": 2.5427236557006836 + }, + { + "auxiliary_loss_clip": 0.01299094, + "auxiliary_loss_mlp": 0.01087693, + "balance_loss_clip": 1.04954624, + "balance_loss_mlp": 1.08334351, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.08298220488583, + "language_loss": 0.87901413, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90288198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.15625, + "step": 390, + "time_per_iteration": 2.53519606590271 + }, + { + "auxiliary_loss_clip": 0.01304239, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_clip": 1.05927861, + "balance_loss_mlp": 1.08334053, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.2293869448662362, + "language_loss": 0.89346433, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91746497, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.203125, + "step": 391, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01302453, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.08116579, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.022763227206087, + "language_loss": 0.86065882, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88441086, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.21875, + "step": 392, + "time_per_iteration": 4.050429105758667 + }, + { + "auxiliary_loss_clip": 0.01297975, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.08006191, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9628480690926318, + "language_loss": 0.88900077, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91284919, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.1875, + "step": 393, + "time_per_iteration": 3.9293932914733887 + }, + { + "auxiliary_loss_clip": 0.01309989, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.06449771, + "balance_loss_mlp": 1.087502, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0531375516435943, + "language_loss": 0.81400156, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83814055, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.21875, + "step": 394, + "time_per_iteration": 2.552100658416748 + }, + { + "auxiliary_loss_clip": 0.01299653, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.04611897, + "balance_loss_mlp": 1.08043575, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.0447414784698092, + "language_loss": 0.86189264, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88573563, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.1875, + "step": 395, + "time_per_iteration": 2.536823272705078 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.0590049, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9487784547172928, + "language_loss": 0.63808912, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66028047, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.15625, + "step": 396, + "time_per_iteration": 2.935506582260132 + }, + { + "auxiliary_loss_clip": 0.01296295, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_clip": 1.03252339, + "balance_loss_mlp": 1.07895613, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6168641306315172, + "language_loss": 0.83744055, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86109853, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.171875, + "step": 397, + "time_per_iteration": 2.5051028728485107 + }, + { + "auxiliary_loss_clip": 0.01302535, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05288601, + "balance_loss_mlp": 1.08300877, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.535145802301163, + "language_loss": 0.84050488, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86444056, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.1875, + "step": 398, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01300466, + "auxiliary_loss_mlp": 0.0108273, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.07864475, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.904470095612531, + "language_loss": 0.85865271, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88248467, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.21875, + "step": 399, + "time_per_iteration": 2.4674201011657715 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.08021355, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.016759933832732, + "language_loss": 0.86157769, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88546383, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.15625, + "step": 400, + "time_per_iteration": 2.554075241088867 + }, + { + "auxiliary_loss_clip": 0.01303599, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.04652512, + "balance_loss_mlp": 1.0848943, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 3.068890951588493, + "language_loss": 0.79142016, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.8152917, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.1875, + "step": 401, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01297911, + "auxiliary_loss_mlp": 0.01096359, + "balance_loss_clip": 1.05968988, + "balance_loss_mlp": 1.07987046, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.2009554384450154, + "language_loss": 0.78456193, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80850464, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.1875, + "step": 402, + "time_per_iteration": 2.5531415939331055 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.04529142, + "balance_loss_mlp": 1.07989287, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7198213535828923, + "language_loss": 0.94637424, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97023368, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 403, + "time_per_iteration": 2.4873671531677246 + }, + { + "auxiliary_loss_clip": 0.01306025, + "auxiliary_loss_mlp": 0.01095616, + "balance_loss_clip": 1.05620587, + "balance_loss_mlp": 1.07952547, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3706875621243246, + "language_loss": 0.99751151, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02152789, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 404, + "time_per_iteration": 2.5400550365448 + }, + { + "auxiliary_loss_clip": 0.01304501, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.06716657, + "balance_loss_mlp": 1.08213115, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.480197457162756, + "language_loss": 0.87603909, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90012866, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.21875, + "step": 405, + "time_per_iteration": 2.4698479175567627 + }, + { + "auxiliary_loss_clip": 0.01314075, + "auxiliary_loss_mlp": 0.01107285, + "balance_loss_clip": 1.06835127, + "balance_loss_mlp": 1.08775485, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 3.242686201363518, + "language_loss": 0.93258083, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9567945, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.265625, + "step": 406, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.01092168, + "balance_loss_clip": 1.05330622, + "balance_loss_mlp": 1.08378315, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.059728688773918, + "language_loss": 0.87446553, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89843762, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.21875, + "step": 407, + "time_per_iteration": 2.5017173290252686 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.04814506, + "balance_loss_mlp": 1.08445001, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.439524495250932, + "language_loss": 0.7404871, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76435596, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.171875, + "step": 408, + "time_per_iteration": 2.6097092628479004 + }, + { + "auxiliary_loss_clip": 0.013061, + "auxiliary_loss_mlp": 0.01096961, + "balance_loss_clip": 1.05771768, + "balance_loss_mlp": 1.08381224, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.750776221383638, + "language_loss": 0.92393035, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94796097, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.21875, + "step": 409, + "time_per_iteration": 2.5198304653167725 + }, + { + "auxiliary_loss_clip": 0.01304769, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_clip": 1.04488206, + "balance_loss_mlp": 1.0854609, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 1.9763435283924244, + "language_loss": 0.82926536, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85311788, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.1875, + "step": 410, + "time_per_iteration": 2.624333143234253 + }, + { + "auxiliary_loss_clip": 0.01307118, + "auxiliary_loss_mlp": 0.01089288, + "balance_loss_clip": 1.05164146, + "balance_loss_mlp": 1.08556843, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 4.176812441051998, + "language_loss": 0.77715993, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80112404, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.21875, + "step": 411, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01303549, + "auxiliary_loss_mlp": 0.01102238, + "balance_loss_clip": 1.06311393, + "balance_loss_mlp": 1.08078265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.1103060729449883, + "language_loss": 0.86276567, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88682353, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 412, + "time_per_iteration": 2.4968833923339844 + }, + { + "auxiliary_loss_clip": 0.01168305, + "auxiliary_loss_mlp": 0.01068817, + "balance_loss_clip": 1.05632353, + "balance_loss_mlp": 1.05478358, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8568818905087673, + "language_loss": 0.58512402, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60749531, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 1.1328125, + "step": 413, + "time_per_iteration": 3.1763217449188232 + }, + { + "auxiliary_loss_clip": 0.01296528, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.07941055, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7554792190049524, + "language_loss": 0.80704832, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83093566, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.171875, + "step": 414, + "time_per_iteration": 2.5954627990722656 + }, + { + "auxiliary_loss_clip": 0.01292737, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.07739186, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3796689224247904, + "language_loss": 0.80473328, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82859504, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.15625, + "step": 415, + "time_per_iteration": 2.471665620803833 + }, + { + "auxiliary_loss_clip": 0.0131185, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.0481931, + "balance_loss_mlp": 1.08601356, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 5.333540620494007, + "language_loss": 0.96179891, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98577416, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.25, + "step": 416, + "time_per_iteration": 2.5133068561553955 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03702867, + "balance_loss_mlp": 1.0806849, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.409464042642492, + "language_loss": 0.77541196, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79917544, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 417, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.01297091, + "auxiliary_loss_mlp": 0.01092626, + "balance_loss_clip": 1.05512297, + "balance_loss_mlp": 1.08281994, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6345521849457858, + "language_loss": 0.7689445, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 418, + "time_per_iteration": 2.6002862453460693 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.08383846, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.548681745998596, + "language_loss": 0.81088459, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83468759, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.203125, + "step": 419, + "time_per_iteration": 2.5097553730010986 + }, + { + "auxiliary_loss_clip": 0.01298642, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.0453577, + "balance_loss_mlp": 1.08236253, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9166879875817555, + "language_loss": 0.73812175, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.761962, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 2.15625, + "step": 420, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01298409, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.06086528, + "balance_loss_mlp": 1.0791508, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7246544027149788, + "language_loss": 0.78928417, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8132515, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.1875, + "step": 421, + "time_per_iteration": 2.583979845046997 + }, + { + "auxiliary_loss_clip": 0.01300301, + "auxiliary_loss_mlp": 0.01095113, + "balance_loss_clip": 1.05589294, + "balance_loss_mlp": 1.08374381, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.879256315405443, + "language_loss": 0.81915486, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84310895, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.171875, + "step": 422, + "time_per_iteration": 2.5834591388702393 + }, + { + "auxiliary_loss_clip": 0.01299282, + "auxiliary_loss_mlp": 0.01079788, + "balance_loss_clip": 1.0445497, + "balance_loss_mlp": 1.07925105, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9426129656279463, + "language_loss": 0.83468062, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85847133, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.203125, + "step": 423, + "time_per_iteration": 2.5526318550109863 + }, + { + "auxiliary_loss_clip": 0.01293361, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.04978371, + "balance_loss_mlp": 1.07668817, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.7010989411926367, + "language_loss": 0.74435121, + "learning_rate": 3.895134094768415e-06, + "loss": 0.768152, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.171875, + "step": 424, + "time_per_iteration": 2.606895923614502 + }, + { + "auxiliary_loss_clip": 0.01303473, + "auxiliary_loss_mlp": 0.01097188, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.08349586, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.227147445366898, + "language_loss": 0.83008313, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85408974, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.203125, + "step": 425, + "time_per_iteration": 2.522517442703247 + }, + { + "auxiliary_loss_clip": 0.01299491, + "auxiliary_loss_mlp": 0.01096328, + "balance_loss_clip": 1.05691719, + "balance_loss_mlp": 1.07528758, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.394258070540652, + "language_loss": 0.85481966, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87877786, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.25, + "step": 426, + "time_per_iteration": 2.5039095878601074 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.04526472, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8962322500302954, + "language_loss": 0.57186544, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5939464, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 1.1484375, + "step": 427, + "time_per_iteration": 3.2289342880249023 + }, + { + "auxiliary_loss_clip": 0.01297452, + "auxiliary_loss_mlp": 0.01095521, + "balance_loss_clip": 1.05849457, + "balance_loss_mlp": 1.0838623, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6536896946259816, + "language_loss": 0.88190198, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90583158, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.125, + "step": 428, + "time_per_iteration": 2.500389814376831 + }, + { + "auxiliary_loss_clip": 0.01290417, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.07718623, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6692033855414803, + "language_loss": 0.85672665, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88041949, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.125, + "step": 429, + "time_per_iteration": 2.605687379837036 + }, + { + "auxiliary_loss_clip": 0.01297427, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_clip": 1.04373491, + "balance_loss_mlp": 1.07673144, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.5023850128037672, + "language_loss": 0.88384748, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90764678, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.203125, + "step": 430, + "time_per_iteration": 2.593492269515991 + }, + { + "auxiliary_loss_clip": 0.01298542, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.04799962, + "balance_loss_mlp": 1.08428442, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9811912271744876, + "language_loss": 0.84202254, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86584389, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.140625, + "step": 431, + "time_per_iteration": 2.5326902866363525 + }, + { + "auxiliary_loss_clip": 0.01291302, + "auxiliary_loss_mlp": 0.01073914, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0772872, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.686150654607635, + "language_loss": 0.86775959, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89141178, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.140625, + "step": 432, + "time_per_iteration": 2.4793269634246826 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.04491723, + "balance_loss_mlp": 1.08109105, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.6953453355349684, + "language_loss": 0.76074433, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78451484, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.15625, + "step": 433, + "time_per_iteration": 2.6125545501708984 + }, + { + "auxiliary_loss_clip": 0.01296292, + "auxiliary_loss_mlp": 0.0109282, + "balance_loss_clip": 1.05312383, + "balance_loss_mlp": 1.07772529, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.2540618473103247, + "language_loss": 0.89764363, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153478, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.1875, + "step": 434, + "time_per_iteration": 5.3097922801971436 + }, + { + "auxiliary_loss_clip": 0.01297376, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.06404209, + "balance_loss_mlp": 1.08362865, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 6.328317132251919, + "language_loss": 0.7985189, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82252169, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 435, + "time_per_iteration": 3.9629530906677246 + }, + { + "auxiliary_loss_clip": 0.01291104, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.0750463, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.559504815450524, + "language_loss": 0.86357677, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739926, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.15625, + "step": 436, + "time_per_iteration": 2.479033946990967 + }, + { + "auxiliary_loss_clip": 0.01296325, + "auxiliary_loss_mlp": 0.01099771, + "balance_loss_clip": 1.06214869, + "balance_loss_mlp": 1.07964039, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.6168892141891944, + "language_loss": 0.75002837, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77398932, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.171875, + "step": 437, + "time_per_iteration": 2.508769989013672 + }, + { + "auxiliary_loss_clip": 0.01293849, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.08015561, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.3031145987765758, + "language_loss": 0.91467845, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93865746, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.140625, + "step": 438, + "time_per_iteration": 2.4693844318389893 + }, + { + "auxiliary_loss_clip": 0.01155458, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_clip": 1.05276346, + "balance_loss_mlp": 1.0448494, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.877669139368542, + "language_loss": 0.62577796, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64797509, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 1.109375, + "step": 439, + "time_per_iteration": 3.162259101867676 + }, + { + "auxiliary_loss_clip": 0.01303989, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_clip": 1.05873275, + "balance_loss_mlp": 1.08440769, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1384369611317493, + "language_loss": 0.75629139, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78031218, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.203125, + "step": 440, + "time_per_iteration": 2.5541677474975586 + }, + { + "auxiliary_loss_clip": 0.01294139, + "auxiliary_loss_mlp": 0.01082398, + "balance_loss_clip": 1.04408443, + "balance_loss_mlp": 1.08003163, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9583565981573345, + "language_loss": 0.83186466, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85563004, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 441, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01298235, + "auxiliary_loss_mlp": 0.01092726, + "balance_loss_clip": 1.05286217, + "balance_loss_mlp": 1.07855892, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.035076381127293, + "language_loss": 0.7850582, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80896777, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.203125, + "step": 442, + "time_per_iteration": 2.477555990219116 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01012445, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.04045749, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9584767110468104, + "language_loss": 0.64475185, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66633147, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 1.046875, + "step": 443, + "time_per_iteration": 2.9838714599609375 + }, + { + "auxiliary_loss_clip": 0.01297944, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.04941845, + "balance_loss_mlp": 1.08318424, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.4335650573352483, + "language_loss": 0.82707053, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85092688, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 444, + "time_per_iteration": 2.4520323276519775 + }, + { + "auxiliary_loss_clip": 0.0130195, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0440464, + "balance_loss_mlp": 1.08103406, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.6903851096875733, + "language_loss": 0.95400113, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97787213, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 445, + "time_per_iteration": 2.5113518238067627 + }, + { + "auxiliary_loss_clip": 0.01296406, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.08177555, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.416617421630428, + "language_loss": 0.91790259, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94183153, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.15625, + "step": 446, + "time_per_iteration": 2.4585111141204834 + }, + { + "auxiliary_loss_clip": 0.01293099, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.04718637, + "balance_loss_mlp": 1.08102632, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.3983095061811635, + "language_loss": 0.80024058, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82402921, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 447, + "time_per_iteration": 2.509643316268921 + }, + { + "auxiliary_loss_clip": 0.01292768, + "auxiliary_loss_mlp": 0.01072511, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.07935369, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.4579217038825423, + "language_loss": 0.86773896, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89139175, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 448, + "time_per_iteration": 2.477384328842163 + }, + { + "auxiliary_loss_clip": 0.01287268, + "auxiliary_loss_mlp": 0.01093327, + "balance_loss_clip": 1.0583508, + "balance_loss_mlp": 1.07870793, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.1426472419274503, + "language_loss": 0.88779259, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91159856, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.078125, + "step": 449, + "time_per_iteration": 2.50108003616333 + }, + { + "auxiliary_loss_clip": 0.01298718, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.08056545, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9975703664508544, + "language_loss": 0.80516291, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82902944, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 450, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.01291132, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.08217299, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.7768383062811637, + "language_loss": 0.81500483, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83869088, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.09375, + "step": 451, + "time_per_iteration": 2.530539035797119 + }, + { + "auxiliary_loss_clip": 0.01289442, + "auxiliary_loss_mlp": 0.0109125, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.08151317, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.5925691418309382, + "language_loss": 0.76994318, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79375011, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.078125, + "step": 452, + "time_per_iteration": 2.5138871669769287 + }, + { + "auxiliary_loss_clip": 0.01292925, + "auxiliary_loss_mlp": 0.01088314, + "balance_loss_clip": 1.0507158, + "balance_loss_mlp": 1.08201516, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.9334646917545748, + "language_loss": 0.73053265, + "learning_rate": 3.937730499067294e-06, + "loss": 0.754345, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.109375, + "step": 453, + "time_per_iteration": 2.5271401405334473 + }, + { + "auxiliary_loss_clip": 0.01288113, + "auxiliary_loss_mlp": 0.01086026, + "balance_loss_clip": 1.04952383, + "balance_loss_mlp": 1.08018303, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.845498968311748, + "language_loss": 0.82439983, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84814119, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 454, + "time_per_iteration": 2.6724069118499756 + }, + { + "auxiliary_loss_clip": 0.01290287, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_clip": 1.04491115, + "balance_loss_mlp": 1.0808264, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.1414002490484005, + "language_loss": 0.75815403, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78184646, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 2.09375, + "step": 455, + "time_per_iteration": 2.496913194656372 + }, + { + "auxiliary_loss_clip": 0.01290624, + "auxiliary_loss_mlp": 0.01097119, + "balance_loss_clip": 1.06114161, + "balance_loss_mlp": 1.07846022, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.102028743174525, + "language_loss": 0.80576169, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82963914, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 456, + "time_per_iteration": 2.4748263359069824 + }, + { + "auxiliary_loss_clip": 0.01286184, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_clip": 1.04152811, + "balance_loss_mlp": 1.07863176, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.479828414472028, + "language_loss": 0.81621009, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83985978, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 457, + "time_per_iteration": 2.5122945308685303 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.07828617, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0492464691581476, + "language_loss": 0.94062889, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96436661, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.109375, + "step": 458, + "time_per_iteration": 2.542919874191284 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.01093849, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.07926297, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4293190258203774, + "language_loss": 0.79353511, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81735277, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.09375, + "step": 459, + "time_per_iteration": 2.472830295562744 + }, + { + "auxiliary_loss_clip": 0.01293203, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.08543491, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8472887331493792, + "language_loss": 0.83103061, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85478914, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.078125, + "step": 460, + "time_per_iteration": 2.5376338958740234 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.03488147, + "balance_loss_mlp": 1.03798664, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5738760379538346, + "language_loss": 0.73565412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7574963, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08837891, + "router_z_loss_mlp": 1.0234375, + "step": 461, + "time_per_iteration": 3.0358285903930664 + }, + { + "auxiliary_loss_clip": 0.01289208, + "auxiliary_loss_mlp": 0.01081781, + "balance_loss_clip": 1.04735351, + "balance_loss_mlp": 1.086905, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.85425781388422, + "language_loss": 0.81291741, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83662736, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.015625, + "step": 462, + "time_per_iteration": 2.6079564094543457 + }, + { + "auxiliary_loss_clip": 0.01287586, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_clip": 1.04096127, + "balance_loss_mlp": 1.08167982, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2822341634579195, + "language_loss": 0.90235889, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92597055, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0625, + "step": 463, + "time_per_iteration": 2.4881155490875244 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01014393, + "balance_loss_clip": 1.00561893, + "balance_loss_mlp": 1.03824747, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8835585057209928, + "language_loss": 0.59031862, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61183739, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.9921875, + "step": 464, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.01299905, + "auxiliary_loss_mlp": 0.01097461, + "balance_loss_clip": 1.06081581, + "balance_loss_mlp": 1.08716702, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.8663863440598525, + "language_loss": 0.81203198, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83600569, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.125, + "step": 465, + "time_per_iteration": 2.5197718143463135 + }, + { + "auxiliary_loss_clip": 0.01286546, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.08028877, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.004656273762408, + "language_loss": 0.78560221, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80929601, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.0625, + "step": 466, + "time_per_iteration": 2.5151565074920654 + }, + { + "auxiliary_loss_clip": 0.01285777, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.05075812, + "balance_loss_mlp": 1.0816046, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.05931728393333, + "language_loss": 0.87548482, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89919734, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.03125, + "step": 467, + "time_per_iteration": 2.4994542598724365 + }, + { + "auxiliary_loss_clip": 0.01289137, + "auxiliary_loss_mlp": 0.01106554, + "balance_loss_clip": 1.06969416, + "balance_loss_mlp": 1.08202362, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.728881931821799, + "language_loss": 0.86217642, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88613331, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.0625, + "step": 468, + "time_per_iteration": 2.482377767562866 + }, + { + "auxiliary_loss_clip": 0.01287545, + "auxiliary_loss_mlp": 0.01081999, + "balance_loss_clip": 1.0447104, + "balance_loss_mlp": 1.07984936, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 3.6924571591440762, + "language_loss": 0.91605878, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.93975413, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 469, + "time_per_iteration": 2.471510648727417 + }, + { + "auxiliary_loss_clip": 0.01286876, + "auxiliary_loss_mlp": 0.01096778, + "balance_loss_clip": 1.06106234, + "balance_loss_mlp": 1.08290672, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 8.38112094971343, + "language_loss": 0.81587195, + "learning_rate": 3.96145038000181e-06, + "loss": 0.83970851, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 470, + "time_per_iteration": 2.5398614406585693 + }, + { + "auxiliary_loss_clip": 0.01286572, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.07859015, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8437898933227894, + "language_loss": 0.93147206, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551928, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.078125, + "step": 471, + "time_per_iteration": 2.5005030632019043 + }, + { + "auxiliary_loss_clip": 0.0128173, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.05885458, + "balance_loss_mlp": 1.07808042, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.89303735573371, + "language_loss": 0.757568, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78133243, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 472, + "time_per_iteration": 2.597637176513672 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.07699013, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 3.986951446490631, + "language_loss": 0.93354845, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95722055, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.125, + "step": 473, + "time_per_iteration": 2.4882545471191406 + }, + { + "auxiliary_loss_clip": 0.01293922, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.08134401, + "balance_loss_mlp": 1.08149064, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.845992674029067, + "language_loss": 0.88586211, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90995455, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.125, + "step": 474, + "time_per_iteration": 2.483210563659668 + }, + { + "auxiliary_loss_clip": 0.01284496, + "auxiliary_loss_mlp": 0.01091761, + "balance_loss_clip": 1.05559278, + "balance_loss_mlp": 1.07983565, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.474550917046853, + "language_loss": 0.78771299, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81147563, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.046875, + "step": 475, + "time_per_iteration": 2.5462486743927 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_clip": 1.06647348, + "balance_loss_mlp": 1.03907108, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9304884927077405, + "language_loss": 0.66880804, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6909551, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08935547, + "router_z_loss_mlp": 1.0, + "step": 476, + "time_per_iteration": 5.8287513256073 + }, + { + "auxiliary_loss_clip": 0.01286666, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.0796659, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.9569520931335775, + "language_loss": 0.83852398, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86220837, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 477, + "time_per_iteration": 2.5179195404052734 + }, + { + "auxiliary_loss_clip": 0.01293161, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.05164671, + "balance_loss_mlp": 1.08298135, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.2048636254017504, + "language_loss": 0.82267237, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84648502, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.09375, + "step": 478, + "time_per_iteration": 2.495760679244995 + }, + { + "auxiliary_loss_clip": 0.01283274, + "auxiliary_loss_mlp": 0.01076252, + "balance_loss_clip": 1.0409658, + "balance_loss_mlp": 1.07707858, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.28603697529264, + "language_loss": 0.81010443, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8336997, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 2.0625, + "step": 479, + "time_per_iteration": 2.491910934448242 + }, + { + "auxiliary_loss_clip": 0.01281719, + "auxiliary_loss_mlp": 0.01080307, + "balance_loss_clip": 1.04323328, + "balance_loss_mlp": 1.07729793, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.2385690137770715, + "language_loss": 0.73465097, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75827128, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.03125, + "step": 480, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.01280408, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_clip": 1.03945768, + "balance_loss_mlp": 1.07837129, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.6612342828976938, + "language_loss": 0.87719476, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90071172, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 2.03125, + "step": 481, + "time_per_iteration": 2.534792184829712 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.01367593, + "balance_loss_mlp": 1.03470159, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8259666239631118, + "language_loss": 0.66064727, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68227088, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 1.046875, + "step": 482, + "time_per_iteration": 2.8219997882843018 + }, + { + "auxiliary_loss_clip": 0.01295379, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.04014635, + "balance_loss_mlp": 1.08159328, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.373570732629757, + "language_loss": 0.78743541, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81112754, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.140625, + "step": 483, + "time_per_iteration": 2.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01293434, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.0548625, + "balance_loss_mlp": 1.08311069, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.520023812901894, + "language_loss": 0.75405324, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77789688, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.109375, + "step": 484, + "time_per_iteration": 2.466634750366211 + }, + { + "auxiliary_loss_clip": 0.01288089, + "auxiliary_loss_mlp": 0.01078618, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.08002305, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0323982063196153, + "language_loss": 0.84021544, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86388254, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.078125, + "step": 485, + "time_per_iteration": 2.511415719985962 + }, + { + "auxiliary_loss_clip": 0.01293039, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.08659554, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 1.9066132168030567, + "language_loss": 0.84465218, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86840165, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 2.0625, + "step": 486, + "time_per_iteration": 2.453583002090454 + }, + { + "auxiliary_loss_clip": 0.01284719, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.07841349, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.9228432408219163, + "language_loss": 0.8891986, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91288453, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.0625, + "step": 487, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0128758, + "auxiliary_loss_mlp": 0.01070867, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.08095598, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5260996981700456, + "language_loss": 0.87981069, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90339512, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0625, + "step": 488, + "time_per_iteration": 2.5299952030181885 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01079627, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.07794333, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1862911790042543, + "language_loss": 0.88956475, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9131943, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.046875, + "step": 489, + "time_per_iteration": 2.545240879058838 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01078157, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.07402337, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.0397830948196756, + "language_loss": 0.88539088, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90894926, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.03125, + "step": 490, + "time_per_iteration": 2.4727838039398193 + }, + { + "auxiliary_loss_clip": 0.01284238, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.07731342, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.230679327742206, + "language_loss": 0.91299963, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93665713, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 2.0625, + "step": 491, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01274874, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.0749476, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.419480988494796, + "language_loss": 0.85232413, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87577969, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0, + "step": 492, + "time_per_iteration": 2.457188844680786 + }, + { + "auxiliary_loss_clip": 0.0128558, + "auxiliary_loss_mlp": 0.01093772, + "balance_loss_clip": 1.05939209, + "balance_loss_mlp": 1.08082771, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.826333733481051, + "language_loss": 0.83989829, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86369187, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.046875, + "step": 493, + "time_per_iteration": 2.4821553230285645 + }, + { + "auxiliary_loss_clip": 0.01280126, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_clip": 1.04586005, + "balance_loss_mlp": 1.07578444, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8259196989393787, + "language_loss": 0.86575663, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88934839, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 494, + "time_per_iteration": 2.507068395614624 + }, + { + "auxiliary_loss_clip": 0.01286409, + "auxiliary_loss_mlp": 0.01082408, + "balance_loss_clip": 1.05084157, + "balance_loss_mlp": 1.07973599, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 4.414490317498679, + "language_loss": 0.86250752, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88619578, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.0625, + "step": 495, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.01274095, + "auxiliary_loss_mlp": 0.0107342, + "balance_loss_clip": 1.04144859, + "balance_loss_mlp": 1.07653904, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.893732744603442, + "language_loss": 0.6230706, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64654577, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9765625, + "step": 496, + "time_per_iteration": 2.499669313430786 + }, + { + "auxiliary_loss_clip": 0.01276388, + "auxiliary_loss_mlp": 0.01085353, + "balance_loss_clip": 1.05314219, + "balance_loss_mlp": 1.07830799, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.8423417765009742, + "language_loss": 0.88582325, + "learning_rate": 3.997414244783595e-06, + "loss": 0.90944064, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.984375, + "step": 497, + "time_per_iteration": 2.5570924282073975 + }, + { + "auxiliary_loss_clip": 0.01282787, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.07822609, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.4064142479622377, + "language_loss": 0.85174376, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87537515, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 498, + "time_per_iteration": 2.513601541519165 + }, + { + "auxiliary_loss_clip": 0.01281177, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.07829463, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 37.23719619981942, + "language_loss": 0.78152531, + "learning_rate": 4e-06, + "loss": 0.80516517, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 499, + "time_per_iteration": 2.4924824237823486 + }, + { + "auxiliary_loss_clip": 0.01282354, + "auxiliary_loss_mlp": 0.01080564, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.08037949, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.687829420060643, + "language_loss": 0.8271451, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85077423, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.015625, + "step": 500, + "time_per_iteration": 2.494333028793335 + }, + { + "auxiliary_loss_clip": 0.01274571, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.07541978, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.6096117253121447, + "language_loss": 0.88464928, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90823889, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.9921875, + "step": 501, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.01283018, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.04158127, + "balance_loss_mlp": 1.07912767, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.304054979465899, + "language_loss": 0.86586684, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88942778, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 502, + "time_per_iteration": 2.4574413299560547 + }, + { + "auxiliary_loss_clip": 0.01278734, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.07952762, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6244890775354976, + "language_loss": 0.84661186, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87017757, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9921875, + "step": 503, + "time_per_iteration": 2.4406938552856445 + }, + { + "auxiliary_loss_clip": 0.0127278, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.07727659, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6755724800263092, + "language_loss": 0.88215417, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90570992, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 504, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.01274883, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05556226, + "balance_loss_mlp": 1.07692564, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.2080583468347, + "language_loss": 0.78446162, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9765625, + "step": 505, + "time_per_iteration": 2.4724690914154053 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.00927854, + "balance_loss_mlp": 1.04092085, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8461866637376847, + "language_loss": 0.55057126, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57211095, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 0.9453125, + "step": 506, + "time_per_iteration": 3.2490124702453613 + }, + { + "auxiliary_loss_clip": 0.01274292, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.0756762, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.9034614277572226, + "language_loss": 0.83767861, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8612929, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 507, + "time_per_iteration": 2.48811674118042 + }, + { + "auxiliary_loss_clip": 0.01280318, + "auxiliary_loss_mlp": 0.01080114, + "balance_loss_clip": 1.04778421, + "balance_loss_mlp": 1.07709789, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5950154193771526, + "language_loss": 0.88689649, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91050076, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 508, + "time_per_iteration": 2.4966533184051514 + }, + { + "auxiliary_loss_clip": 0.01281637, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.07728887, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2339008285543227, + "language_loss": 0.71499902, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73845309, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 509, + "time_per_iteration": 2.5966317653656006 + }, + { + "auxiliary_loss_clip": 0.01274736, + "auxiliary_loss_mlp": 0.01072718, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.07770133, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.064360756351981, + "language_loss": 0.82369828, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8471728, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9765625, + "step": 510, + "time_per_iteration": 2.5276355743408203 + }, + { + "auxiliary_loss_clip": 0.01280977, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.04984498, + "balance_loss_mlp": 1.08235979, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.1614325499153693, + "language_loss": 0.83621502, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85985172, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 511, + "time_per_iteration": 2.503779888153076 + }, + { + "auxiliary_loss_clip": 0.01278507, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.07648492, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1059740170821515, + "language_loss": 0.82234836, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8459124, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 512, + "time_per_iteration": 2.5306975841522217 + }, + { + "auxiliary_loss_clip": 0.01276149, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0769974, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.9256325141107502, + "language_loss": 0.87030005, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89384103, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.9921875, + "step": 513, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01281572, + "auxiliary_loss_mlp": 0.01080973, + "balance_loss_clip": 1.04840553, + "balance_loss_mlp": 1.07869625, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.202753983864072, + "language_loss": 0.79141152, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81503695, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 514, + "time_per_iteration": 2.515496015548706 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.01063014, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.07966864, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.5461002634459216, + "language_loss": 0.77459693, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79799432, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 515, + "time_per_iteration": 2.481903553009033 + }, + { + "auxiliary_loss_clip": 0.01272098, + "auxiliary_loss_mlp": 0.0106896, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.07318711, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.901518391780262, + "language_loss": 0.82729101, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85070157, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9921875, + "step": 516, + "time_per_iteration": 2.699577808380127 + }, + { + "auxiliary_loss_clip": 0.01272185, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_clip": 1.03760433, + "balance_loss_mlp": 1.07659435, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.071844032637654, + "language_loss": 0.79009813, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81352293, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 517, + "time_per_iteration": 4.0190205574035645 + }, + { + "auxiliary_loss_clip": 0.01269009, + "auxiliary_loss_mlp": 0.01072314, + "balance_loss_clip": 1.04069996, + "balance_loss_mlp": 1.07610774, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.58218863781409, + "language_loss": 0.90778029, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93119347, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9296875, + "step": 518, + "time_per_iteration": 4.080751657485962 + }, + { + "auxiliary_loss_clip": 0.0128372, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.08518016, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 3.008779144342936, + "language_loss": 0.86396456, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88773847, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.984375, + "step": 519, + "time_per_iteration": 2.510267734527588 + }, + { + "auxiliary_loss_clip": 0.01278708, + "auxiliary_loss_mlp": 0.01092513, + "balance_loss_clip": 1.06092215, + "balance_loss_mlp": 1.07567024, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0313723427087216, + "language_loss": 0.87156898, + "learning_rate": 3.999983277259057e-06, + "loss": 0.8952812, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 520, + "time_per_iteration": 2.4891066551208496 + }, + { + "auxiliary_loss_clip": 0.01281744, + "auxiliary_loss_mlp": 0.01089643, + "balance_loss_clip": 1.05633557, + "balance_loss_mlp": 1.07832289, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.6802829394342778, + "language_loss": 0.89362079, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91733468, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.03125, + "step": 521, + "time_per_iteration": 2.508524179458618 + }, + { + "auxiliary_loss_clip": 0.01274208, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.07795191, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.273639697525746, + "language_loss": 0.71327078, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73684484, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9609375, + "step": 522, + "time_per_iteration": 2.49629282951355 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.05150533, + "balance_loss_mlp": 1.07655358, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1208656196394706, + "language_loss": 0.84886295, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87248302, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.015625, + "step": 523, + "time_per_iteration": 2.4674315452575684 + }, + { + "auxiliary_loss_clip": 0.01280597, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.04249442, + "balance_loss_mlp": 1.07655168, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 1.9693639011355857, + "language_loss": 0.90419745, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92775881, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.046875, + "step": 524, + "time_per_iteration": 2.480764627456665 + }, + { + "auxiliary_loss_clip": 0.01285248, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.05034757, + "balance_loss_mlp": 1.08102393, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4392367222760276, + "language_loss": 0.80040443, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8240968, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.046875, + "step": 525, + "time_per_iteration": 2.5409629344940186 + }, + { + "auxiliary_loss_clip": 0.01277675, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.05025804, + "balance_loss_mlp": 1.07571197, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.8378410017413658, + "language_loss": 0.80693865, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83054531, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.03125, + "step": 526, + "time_per_iteration": 2.4509081840515137 + }, + { + "auxiliary_loss_clip": 0.01285808, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.03655052, + "balance_loss_mlp": 1.08127069, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.27970800213601, + "language_loss": 0.81417823, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83775997, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.046875, + "step": 527, + "time_per_iteration": 2.4760756492614746 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01080634, + "balance_loss_clip": 1.04651666, + "balance_loss_mlp": 1.07408452, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.59751390244888, + "language_loss": 0.93932182, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96286595, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.0, + "step": 528, + "time_per_iteration": 2.4721155166625977 + }, + { + "auxiliary_loss_clip": 0.01273884, + "auxiliary_loss_mlp": 0.01073354, + "balance_loss_clip": 1.04083371, + "balance_loss_mlp": 1.07427406, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8844039207994492, + "language_loss": 0.84143054, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86490291, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 529, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.01278919, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.08254409, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.130233453276154, + "language_loss": 0.90547037, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92913085, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.96875, + "step": 530, + "time_per_iteration": 2.5096359252929688 + }, + { + "auxiliary_loss_clip": 0.0127291, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07199419, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.12169085676626, + "language_loss": 0.76197046, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78543139, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.015625, + "step": 531, + "time_per_iteration": 2.503265142440796 + }, + { + "auxiliary_loss_clip": 0.01272973, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.03030038, + "balance_loss_mlp": 1.07424712, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.621085079916904, + "language_loss": 0.9073056, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9306798, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 532, + "time_per_iteration": 2.506220817565918 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01010615, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0428524, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7306749876416057, + "language_loss": 0.57931173, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60079145, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.08642578, + "router_z_loss_mlp": 0.9453125, + "step": 533, + "time_per_iteration": 3.154953956604004 + }, + { + "auxiliary_loss_clip": 0.01271016, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.07378936, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8972625930530718, + "language_loss": 0.86725944, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89081717, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.96875, + "step": 534, + "time_per_iteration": 2.5384750366210938 + }, + { + "auxiliary_loss_clip": 0.01271847, + "auxiliary_loss_mlp": 0.01066511, + "balance_loss_clip": 1.03468204, + "balance_loss_mlp": 1.07573223, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.118212102173022, + "language_loss": 0.77352351, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79690707, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.9609375, + "step": 535, + "time_per_iteration": 2.517940044403076 + }, + { + "auxiliary_loss_clip": 0.01274503, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_clip": 1.05151725, + "balance_loss_mlp": 1.07644773, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.176836888233088, + "language_loss": 0.8074764, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83105373, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.984375, + "step": 536, + "time_per_iteration": 2.546128034591675 + }, + { + "auxiliary_loss_clip": 0.01275643, + "auxiliary_loss_mlp": 0.01077633, + "balance_loss_clip": 1.04361033, + "balance_loss_mlp": 1.07698941, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.3353202427960627, + "language_loss": 0.70118421, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72471696, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 537, + "time_per_iteration": 2.578101634979248 + }, + { + "auxiliary_loss_clip": 0.01274556, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.04877353, + "balance_loss_mlp": 1.08040798, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1000918694055044, + "language_loss": 0.8250435, + "learning_rate": 3.999942323804607e-06, + "loss": 0.84860539, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9375, + "step": 538, + "time_per_iteration": 2.4822683334350586 + }, + { + "auxiliary_loss_clip": 0.01280793, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_clip": 1.0458765, + "balance_loss_mlp": 1.0775007, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8128048759039839, + "language_loss": 0.78999949, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81359327, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 539, + "time_per_iteration": 2.5495705604553223 + }, + { + "auxiliary_loss_clip": 0.01274183, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_clip": 1.03284597, + "balance_loss_mlp": 1.0766232, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.6651388031929835, + "language_loss": 0.77802742, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80143911, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.9765625, + "step": 540, + "time_per_iteration": 2.5547144412994385 + }, + { + "auxiliary_loss_clip": 0.01282159, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.08122253, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.2422114385304845, + "language_loss": 0.85410464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8776263, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 541, + "time_per_iteration": 2.517545700073242 + }, + { + "auxiliary_loss_clip": 0.01271503, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.07759655, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.210152212848466, + "language_loss": 0.89072484, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91427547, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9375, + "step": 542, + "time_per_iteration": 2.437566041946411 + }, + { + "auxiliary_loss_clip": 0.01272694, + "auxiliary_loss_mlp": 0.01075801, + "balance_loss_clip": 1.04289961, + "balance_loss_mlp": 1.07649362, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3494598042187236, + "language_loss": 0.71096039, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73444533, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9609375, + "step": 543, + "time_per_iteration": 2.5121288299560547 + }, + { + "auxiliary_loss_clip": 0.0127171, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.07139826, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.6617228213889375, + "language_loss": 0.91273057, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93631637, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0, + "step": 544, + "time_per_iteration": 2.529536008834839 + }, + { + "auxiliary_loss_clip": 0.01274727, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.07790041, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.144073602630947, + "language_loss": 0.6640051, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68757957, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 545, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01272187, + "auxiliary_loss_mlp": 0.01069604, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.07393909, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.6288964335615805, + "language_loss": 0.91857421, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94199216, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.984375, + "step": 546, + "time_per_iteration": 2.4893922805786133 + }, + { + "auxiliary_loss_clip": 0.0126813, + "auxiliary_loss_mlp": 0.01071134, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.07095337, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.4455611041839127, + "language_loss": 0.82002354, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84341609, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 547, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01070995, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.07550538, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.015836198351779, + "language_loss": 0.80919325, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83261865, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9609375, + "step": 548, + "time_per_iteration": 2.501983404159546 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01079421, + "balance_loss_clip": 1.04499304, + "balance_loss_mlp": 1.07411838, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.9904289991591217, + "language_loss": 0.67330974, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69681287, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 549, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01265753, + "auxiliary_loss_mlp": 0.01075673, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.07537639, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.081726350608672, + "language_loss": 0.86137938, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88479364, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.90625, + "step": 550, + "time_per_iteration": 2.435030221939087 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01089379, + "balance_loss_clip": 1.05712056, + "balance_loss_mlp": 1.07876444, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.0024940554917534, + "language_loss": 0.81302834, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83663994, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9296875, + "step": 551, + "time_per_iteration": 2.474317789077759 + }, + { + "auxiliary_loss_clip": 0.01278525, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.0786469, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.5540153370218697, + "language_loss": 0.85907811, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88266373, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.0, + "step": 552, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.01276099, + "auxiliary_loss_mlp": 0.01077197, + "balance_loss_clip": 1.0428648, + "balance_loss_mlp": 1.07894135, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3148388677976253, + "language_loss": 0.928128, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95166099, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 553, + "time_per_iteration": 2.4860291481018066 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.01072703, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.0755136, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.234190064541142, + "language_loss": 0.78874755, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81218415, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.953125, + "step": 554, + "time_per_iteration": 2.4878416061401367 + }, + { + "auxiliary_loss_clip": 0.0126611, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_clip": 1.04838455, + "balance_loss_mlp": 1.07417822, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1365458646452424, + "language_loss": 0.82297659, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9140625, + "step": 555, + "time_per_iteration": 2.4846394062042236 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.01075464, + "balance_loss_clip": 1.04156113, + "balance_loss_mlp": 1.07390678, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.781828445596944, + "language_loss": 0.88624835, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90970379, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 556, + "time_per_iteration": 2.5788674354553223 + }, + { + "auxiliary_loss_clip": 0.01269545, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.04531527, + "balance_loss_mlp": 1.07534254, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0860752820949586, + "language_loss": 0.83492053, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85840911, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9375, + "step": 557, + "time_per_iteration": 2.5352954864501953 + }, + { + "auxiliary_loss_clip": 0.01275093, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.07979858, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 9.145612151583265, + "language_loss": 0.94169575, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96511185, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.953125, + "step": 558, + "time_per_iteration": 2.4541964530944824 + }, + { + "auxiliary_loss_clip": 0.01265501, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.07178497, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.48174106566098, + "language_loss": 0.7735827, + "learning_rate": 3.99986348919176e-06, + "loss": 0.7969684, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9375, + "step": 559, + "time_per_iteration": 5.362890005111694 + }, + { + "auxiliary_loss_clip": 0.01268387, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.07386613, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.071149038386511, + "language_loss": 0.87681198, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90028548, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.953125, + "step": 560, + "time_per_iteration": 3.9536426067352295 + }, + { + "auxiliary_loss_clip": 0.01264547, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.07323277, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2284071587683463, + "language_loss": 0.81380183, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83712727, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9140625, + "step": 561, + "time_per_iteration": 2.49826717376709 + }, + { + "auxiliary_loss_clip": 0.01263917, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.07403696, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7768341081574646, + "language_loss": 0.82018232, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84353203, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.90625, + "step": 562, + "time_per_iteration": 2.503990888595581 + }, + { + "auxiliary_loss_clip": 0.01269896, + "auxiliary_loss_mlp": 0.01075498, + "balance_loss_clip": 1.04352641, + "balance_loss_mlp": 1.07592142, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.966221896086353, + "language_loss": 0.84028983, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86374378, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9375, + "step": 563, + "time_per_iteration": 2.464571952819824 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.04866886, + "balance_loss_mlp": 1.07648492, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.359913311978066, + "language_loss": 0.94194812, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96543193, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.921875, + "step": 564, + "time_per_iteration": 2.423762798309326 + }, + { + "auxiliary_loss_clip": 0.01267204, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.07225537, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.7666153248687277, + "language_loss": 0.94089758, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96426964, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.953125, + "step": 565, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01714468, + "balance_loss_mlp": 1.04934859, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1198796781785882, + "language_loss": 0.54823005, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.569884, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.91015625, + "step": 566, + "time_per_iteration": 3.1322038173675537 + }, + { + "auxiliary_loss_clip": 0.01270043, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.0753262, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.6603630269915683, + "language_loss": 0.76780868, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79123116, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.9453125, + "step": 567, + "time_per_iteration": 2.5351951122283936 + }, + { + "auxiliary_loss_clip": 0.01261299, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.04809463, + "balance_loss_mlp": 1.07400167, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 4.563520524929296, + "language_loss": 0.80796623, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83135819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.875, + "step": 568, + "time_per_iteration": 2.558093309402466 + }, + { + "auxiliary_loss_clip": 0.01263323, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.04836476, + "balance_loss_mlp": 1.07628214, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.809578126153619, + "language_loss": 0.86777622, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89120281, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.875, + "step": 569, + "time_per_iteration": 2.500319719314575 + }, + { + "auxiliary_loss_clip": 0.01264002, + "auxiliary_loss_mlp": 0.01073079, + "balance_loss_clip": 1.04227519, + "balance_loss_mlp": 1.07425416, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8164454228173497, + "language_loss": 0.95802778, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98139858, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.8984375, + "step": 570, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.01264689, + "auxiliary_loss_mlp": 0.01080759, + "balance_loss_clip": 1.04733253, + "balance_loss_mlp": 1.07053721, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.217921822086313, + "language_loss": 0.79522127, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81867576, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9375, + "step": 571, + "time_per_iteration": 2.48317813873291 + }, + { + "auxiliary_loss_clip": 0.01265335, + "auxiliary_loss_mlp": 0.01076969, + "balance_loss_clip": 1.04490221, + "balance_loss_mlp": 1.07593679, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.3471183659940555, + "language_loss": 0.79962778, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.890625, + "step": 572, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.01270326, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.07574439, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.9544136074887903, + "language_loss": 0.84374899, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86714697, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.9453125, + "step": 573, + "time_per_iteration": 2.474212408065796 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.03460276, + "balance_loss_mlp": 1.07282329, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.553507560277694, + "language_loss": 0.76376265, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78707206, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 574, + "time_per_iteration": 2.4510116577148438 + }, + { + "auxiliary_loss_clip": 0.01264596, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.0731982, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5278817664157343, + "language_loss": 0.83801597, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86130619, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.9140625, + "step": 575, + "time_per_iteration": 2.459693193435669 + }, + { + "auxiliary_loss_clip": 0.01260171, + "auxiliary_loss_mlp": 0.01067742, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.07501364, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.241383472398266, + "language_loss": 0.83726245, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86054158, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 576, + "time_per_iteration": 2.47292423248291 + }, + { + "auxiliary_loss_clip": 0.01267718, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.04582155, + "balance_loss_mlp": 1.08247435, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.0876645490308334, + "language_loss": 0.8640908, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 577, + "time_per_iteration": 2.529500961303711 + }, + { + "auxiliary_loss_clip": 0.01262371, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0769875, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 3.2017547958107784, + "language_loss": 0.72333407, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74665576, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.859375, + "step": 578, + "time_per_iteration": 2.4868762493133545 + }, + { + "auxiliary_loss_clip": 0.01263036, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.04050565, + "balance_loss_mlp": 1.07441878, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.8544904120227406, + "language_loss": 0.77664137, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79998243, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.8828125, + "step": 579, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.07355189, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.5351053977844136, + "language_loss": 0.86927247, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89265645, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.875, + "step": 580, + "time_per_iteration": 2.505908966064453 + }, + { + "auxiliary_loss_clip": 0.01266331, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.03536677, + "balance_loss_mlp": 1.07510614, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 4.565959491833327, + "language_loss": 0.82161844, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84492135, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.9140625, + "step": 581, + "time_per_iteration": 2.4735610485076904 + }, + { + "auxiliary_loss_clip": 0.01263493, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.07712197, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2699668532214377, + "language_loss": 0.77498174, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79828823, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8671875, + "step": 582, + "time_per_iteration": 2.4596173763275146 + }, + { + "auxiliary_loss_clip": 0.01261728, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_clip": 1.04467332, + "balance_loss_mlp": 1.07715631, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.0991939318744692, + "language_loss": 0.87632537, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89969933, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 583, + "time_per_iteration": 2.46062970161438 + }, + { + "auxiliary_loss_clip": 0.01268555, + "auxiliary_loss_mlp": 0.01082553, + "balance_loss_clip": 1.05167794, + "balance_loss_mlp": 1.07587278, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3581841085942004, + "language_loss": 0.80997103, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83348215, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.921875, + "step": 584, + "time_per_iteration": 2.4776926040649414 + }, + { + "auxiliary_loss_clip": 0.01262257, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.03326654, + "balance_loss_mlp": 1.0725317, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.6245680316153743, + "language_loss": 0.92654932, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94980395, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.8984375, + "step": 585, + "time_per_iteration": 2.486678123474121 + }, + { + "auxiliary_loss_clip": 0.01262479, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.07368612, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4855014647160245, + "language_loss": 0.87484592, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89817297, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.890625, + "step": 586, + "time_per_iteration": 2.457772970199585 + }, + { + "auxiliary_loss_clip": 0.01269677, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04137754, + "balance_loss_mlp": 1.07875896, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.7854143394247532, + "language_loss": 0.76574278, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78915149, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.90625, + "step": 587, + "time_per_iteration": 2.4794015884399414 + }, + { + "auxiliary_loss_clip": 0.01269924, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02991772, + "balance_loss_mlp": 1.07701528, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6805414217886456, + "language_loss": 0.78441286, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80772316, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.9296875, + "step": 588, + "time_per_iteration": 2.4755733013153076 + }, + { + "auxiliary_loss_clip": 0.01267146, + "auxiliary_loss_mlp": 0.01071411, + "balance_loss_clip": 1.03850961, + "balance_loss_mlp": 1.07600832, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6477303031273185, + "language_loss": 0.94003904, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96342462, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9140625, + "step": 589, + "time_per_iteration": 2.515296459197998 + }, + { + "auxiliary_loss_clip": 0.01269747, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.07632184, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.4870139863099157, + "language_loss": 0.84060037, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86397475, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 590, + "time_per_iteration": 2.583080291748047 + }, + { + "auxiliary_loss_clip": 0.01259593, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.07476449, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.031404841890899, + "language_loss": 0.86889851, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89212072, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 591, + "time_per_iteration": 2.497912883758545 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01070221, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.07271862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 3.1144902928375586, + "language_loss": 0.82980722, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85315537, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.921875, + "step": 592, + "time_per_iteration": 2.463977813720703 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.03881407, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8806680605255408, + "language_loss": 0.59741807, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61892909, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.8984375, + "step": 593, + "time_per_iteration": 3.1275696754455566 + }, + { + "auxiliary_loss_clip": 0.01262803, + "auxiliary_loss_mlp": 0.01070928, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.07810974, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8853114596204945, + "language_loss": 0.87042278, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89376009, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 594, + "time_per_iteration": 2.522805690765381 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.07309461, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.3431313884364395, + "language_loss": 0.83481348, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85809088, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8984375, + "step": 595, + "time_per_iteration": 2.565220832824707 + }, + { + "auxiliary_loss_clip": 0.01261367, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.04216576, + "balance_loss_mlp": 1.07610273, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.1278930526147426, + "language_loss": 0.96185803, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98519421, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.859375, + "step": 596, + "time_per_iteration": 2.460515260696411 + }, + { + "auxiliary_loss_clip": 0.0126361, + "auxiliary_loss_mlp": 0.0107037, + "balance_loss_clip": 1.04185498, + "balance_loss_mlp": 1.07627654, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.2167421176017204, + "language_loss": 0.82718551, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85052526, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.875, + "step": 597, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.01261023, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.0784421, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.168981908539252, + "language_loss": 0.81386817, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.828125, + "step": 598, + "time_per_iteration": 2.531188726425171 + }, + { + "auxiliary_loss_clip": 0.01254264, + "auxiliary_loss_mlp": 0.0106961, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.07570839, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.9075541218278638, + "language_loss": 0.81387949, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83711827, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.7890625, + "step": 599, + "time_per_iteration": 2.511871576309204 + }, + { + "auxiliary_loss_clip": 0.01262476, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.07350755, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1528215266255604, + "language_loss": 0.86115932, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88452661, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.890625, + "step": 600, + "time_per_iteration": 2.50054669380188 + }, + { + "auxiliary_loss_clip": 0.01254617, + "auxiliary_loss_mlp": 0.01080731, + "balance_loss_clip": 1.05133438, + "balance_loss_mlp": 1.06909621, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 3.928737875146519, + "language_loss": 0.82175761, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84511113, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8515625, + "step": 601, + "time_per_iteration": 6.795202255249023 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.02666831, + "balance_loss_mlp": 1.07096183, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.2629653513719252, + "language_loss": 0.75467926, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77777481, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8203125, + "step": 602, + "time_per_iteration": 2.503629446029663 + }, + { + "auxiliary_loss_clip": 0.01252806, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.02833962, + "balance_loss_mlp": 1.07078326, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9962737747137984, + "language_loss": 0.80078572, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82388449, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 603, + "time_per_iteration": 2.568368911743164 + }, + { + "auxiliary_loss_clip": 0.01258325, + "auxiliary_loss_mlp": 0.01061531, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.07597041, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.9836566776981934, + "language_loss": 0.86801207, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8203125, + "step": 604, + "time_per_iteration": 2.496415376663208 + }, + { + "auxiliary_loss_clip": 0.01260423, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.07688427, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.252638522711271, + "language_loss": 0.81078291, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83404416, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 605, + "time_per_iteration": 2.46071457862854 + }, + { + "auxiliary_loss_clip": 0.01255946, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.04012406, + "balance_loss_mlp": 1.07317901, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2162807408147964, + "language_loss": 0.85624671, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87947738, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.828125, + "step": 606, + "time_per_iteration": 2.450775623321533 + }, + { + "auxiliary_loss_clip": 0.01262483, + "auxiliary_loss_mlp": 0.01072166, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.07551849, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1498788116147125, + "language_loss": 0.82370651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84705305, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 607, + "time_per_iteration": 2.4969747066497803 + }, + { + "auxiliary_loss_clip": 0.01255757, + "auxiliary_loss_mlp": 0.01063348, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.07488835, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 3.329641026295442, + "language_loss": 0.8315016, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8546927, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.8046875, + "step": 608, + "time_per_iteration": 2.4648640155792236 + }, + { + "auxiliary_loss_clip": 0.01260127, + "auxiliary_loss_mlp": 0.0106578, + "balance_loss_clip": 1.03533435, + "balance_loss_mlp": 1.0769459, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.072924568315734, + "language_loss": 0.82258713, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84584618, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.828125, + "step": 609, + "time_per_iteration": 2.4761714935302734 + }, + { + "auxiliary_loss_clip": 0.01266536, + "auxiliary_loss_mlp": 0.01080333, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.08229148, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.279075715646142, + "language_loss": 0.7924515, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81592017, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.84375, + "step": 610, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01265179, + "auxiliary_loss_mlp": 0.01076881, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.07819688, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.108980449215705, + "language_loss": 0.87263799, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89605856, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 611, + "time_per_iteration": 2.488800525665283 + }, + { + "auxiliary_loss_clip": 0.01257304, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.05577183, + "balance_loss_mlp": 1.0769043, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.0539399448943145, + "language_loss": 0.72783852, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75125557, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8046875, + "step": 612, + "time_per_iteration": 2.4950740337371826 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.03999329, + "balance_loss_mlp": 1.07377708, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 2.903841869182041, + "language_loss": 0.7909385, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81421661, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 613, + "time_per_iteration": 2.4849369525909424 + }, + { + "auxiliary_loss_clip": 0.01253943, + "auxiliary_loss_mlp": 0.01079095, + "balance_loss_clip": 1.05141413, + "balance_loss_mlp": 1.07326341, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.273957434397869, + "language_loss": 0.93266213, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95599246, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8125, + "step": 614, + "time_per_iteration": 2.4639992713928223 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.01075313, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.07938302, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.901964177226116, + "language_loss": 0.72534943, + "learning_rate": 3.999489768826041e-06, + "loss": 0.74873829, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.84375, + "step": 615, + "time_per_iteration": 2.601372480392456 + }, + { + "auxiliary_loss_clip": 0.01258092, + "auxiliary_loss_mlp": 0.01071353, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.07278967, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.023635364571096, + "language_loss": 0.81449711, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83779156, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 616, + "time_per_iteration": 2.5325467586517334 + }, + { + "auxiliary_loss_clip": 0.01256707, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.03643894, + "balance_loss_mlp": 1.07431316, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9753277492127743, + "language_loss": 0.67868775, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7018863, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.828125, + "step": 617, + "time_per_iteration": 2.5784177780151367 + }, + { + "auxiliary_loss_clip": 0.01263095, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0349381, + "balance_loss_mlp": 1.07892454, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.556814357499394, + "language_loss": 0.80340034, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8266772, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.84375, + "step": 618, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01261829, + "auxiliary_loss_mlp": 0.01070874, + "balance_loss_clip": 1.04059458, + "balance_loss_mlp": 1.07458091, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.355648226269084, + "language_loss": 0.91115171, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93447876, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.875, + "step": 619, + "time_per_iteration": 2.4804162979125977 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01077134, + "balance_loss_clip": 1.04871452, + "balance_loss_mlp": 1.07845378, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.218621959424752, + "language_loss": 0.94397002, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96734041, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8125, + "step": 620, + "time_per_iteration": 2.4592232704162598 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.01077616, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.07455909, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.8159025601621845, + "language_loss": 0.77105826, + "learning_rate": 3.999435623772008e-06, + "loss": 0.7944091, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 621, + "time_per_iteration": 2.53365159034729 + }, + { + "auxiliary_loss_clip": 0.01255819, + "auxiliary_loss_mlp": 0.01059811, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.07761526, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.793013868715132, + "language_loss": 0.86895752, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89211386, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 622, + "time_per_iteration": 2.472726583480835 + }, + { + "auxiliary_loss_clip": 0.01258428, + "auxiliary_loss_mlp": 0.01064577, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.07622766, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.261361439009279, + "language_loss": 0.90376818, + "learning_rate": 3.999416968866581e-06, + "loss": 0.9269982, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 623, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.0125978, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.04626298, + "balance_loss_mlp": 1.07841158, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9910669563462169, + "language_loss": 0.84149444, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86484373, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8125, + "step": 624, + "time_per_iteration": 2.4514520168304443 + }, + { + "auxiliary_loss_clip": 0.01261437, + "auxiliary_loss_mlp": 0.01067743, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.0750618, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.4867963928692554, + "language_loss": 0.66228586, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68557763, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8671875, + "step": 625, + "time_per_iteration": 2.5765273571014404 + }, + { + "auxiliary_loss_clip": 0.01253583, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.03697979, + "balance_loss_mlp": 1.07435441, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.071255255654034, + "language_loss": 0.77375329, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79696059, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7890625, + "step": 626, + "time_per_iteration": 2.5022406578063965 + }, + { + "auxiliary_loss_clip": 0.01258684, + "auxiliary_loss_mlp": 0.01074389, + "balance_loss_clip": 1.04499173, + "balance_loss_mlp": 1.07735705, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.2077512286027288, + "language_loss": 0.81357861, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83690929, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 627, + "time_per_iteration": 2.4750607013702393 + }, + { + "auxiliary_loss_clip": 0.01261632, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.07859111, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 3.546199216596373, + "language_loss": 0.88572276, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90910852, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 628, + "time_per_iteration": 2.571899890899658 + }, + { + "auxiliary_loss_clip": 0.01253553, + "auxiliary_loss_mlp": 0.01067038, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.07086658, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.488861546346732, + "language_loss": 0.79683006, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82003593, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.828125, + "step": 629, + "time_per_iteration": 2.486675262451172 + }, + { + "auxiliary_loss_clip": 0.01258011, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.07458425, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7117761504495859, + "language_loss": 0.76808703, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79134536, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.8359375, + "step": 630, + "time_per_iteration": 2.494297742843628 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.01070638, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.07651484, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.6765452133705403, + "language_loss": 0.91492796, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93826187, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.859375, + "step": 631, + "time_per_iteration": 2.4605348110198975 + }, + { + "auxiliary_loss_clip": 0.01252436, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.04560196, + "balance_loss_mlp": 1.07244229, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.669704350294595, + "language_loss": 0.9207651, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94405663, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.796875, + "step": 632, + "time_per_iteration": 2.518498659133911 + }, + { + "auxiliary_loss_clip": 0.01255106, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.07462335, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0828864645498872, + "language_loss": 0.8341018, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85723758, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8046875, + "step": 633, + "time_per_iteration": 2.5217537879943848 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.01071025, + "balance_loss_clip": 1.04153264, + "balance_loss_mlp": 1.07408428, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6987522649376106, + "language_loss": 0.69638437, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71967685, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.84375, + "step": 634, + "time_per_iteration": 2.5694239139556885 + }, + { + "auxiliary_loss_clip": 0.01127675, + "auxiliary_loss_mlp": 0.01017483, + "balance_loss_clip": 1.0106163, + "balance_loss_mlp": 1.04225707, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8852243261294688, + "language_loss": 0.61585373, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63730532, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.8515625, + "step": 635, + "time_per_iteration": 3.1059212684631348 + }, + { + "auxiliary_loss_clip": 0.01253433, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07354546, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2313569204055246, + "language_loss": 0.83721048, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86043108, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.796875, + "step": 636, + "time_per_iteration": 2.4975383281707764 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.04852867, + "balance_loss_mlp": 1.07623935, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.4018992949787847, + "language_loss": 0.79327047, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8166306, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8203125, + "step": 637, + "time_per_iteration": 2.4560744762420654 + }, + { + "auxiliary_loss_clip": 0.01258084, + "auxiliary_loss_mlp": 0.01073075, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.07309079, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.8779285506389924, + "language_loss": 0.8410306, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86434221, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 638, + "time_per_iteration": 2.504343271255493 + }, + { + "auxiliary_loss_clip": 0.01263348, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.07495832, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5416523890288976, + "language_loss": 0.70099992, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72431237, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.890625, + "step": 639, + "time_per_iteration": 2.52817964553833 + }, + { + "auxiliary_loss_clip": 0.01259266, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.04463232, + "balance_loss_mlp": 1.07514286, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.42201861797838, + "language_loss": 0.85030365, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8736524, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 640, + "time_per_iteration": 2.503262758255005 + }, + { + "auxiliary_loss_clip": 0.0126167, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04725742, + "balance_loss_mlp": 1.07574821, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3722848939528953, + "language_loss": 0.82117289, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84458065, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.859375, + "step": 641, + "time_per_iteration": 2.51052188873291 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.01008303, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.03414774, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9008353353488252, + "language_loss": 0.6540072, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67528021, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.06225586, + "router_z_loss_mlp": 0.8515625, + "step": 642, + "time_per_iteration": 4.430839538574219 + }, + { + "auxiliary_loss_clip": 0.01256856, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.07364345, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9870813050305103, + "language_loss": 0.79512584, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81832051, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8359375, + "step": 643, + "time_per_iteration": 5.386199951171875 + }, + { + "auxiliary_loss_clip": 0.01255871, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.07266629, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.074949815918338, + "language_loss": 0.82926929, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85257208, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.828125, + "step": 644, + "time_per_iteration": 2.45499587059021 + }, + { + "auxiliary_loss_clip": 0.01260265, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.07482159, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.258008571643512, + "language_loss": 0.82131916, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84458399, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.859375, + "step": 645, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.0126099, + "auxiliary_loss_mlp": 0.01070847, + "balance_loss_clip": 1.04121125, + "balance_loss_mlp": 1.07544899, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4729923618605554, + "language_loss": 0.82006776, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 646, + "time_per_iteration": 2.4771342277526855 + }, + { + "auxiliary_loss_clip": 0.01260575, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.07928514, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8327945326632593, + "language_loss": 0.81973422, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84314579, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 647, + "time_per_iteration": 2.522347927093506 + }, + { + "auxiliary_loss_clip": 0.01260388, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03965366, + "balance_loss_mlp": 1.07776546, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9222642653000834, + "language_loss": 0.84699827, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87029266, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 648, + "time_per_iteration": 2.561929941177368 + }, + { + "auxiliary_loss_clip": 0.01258218, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.07636404, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7283662397985053, + "language_loss": 0.84446943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86776626, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8203125, + "step": 649, + "time_per_iteration": 2.477027416229248 + }, + { + "auxiliary_loss_clip": 0.01259496, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.03977561, + "balance_loss_mlp": 1.07551885, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8508721849532739, + "language_loss": 0.79670662, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8200019, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.84375, + "step": 650, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.0125375, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_clip": 1.04314423, + "balance_loss_mlp": 1.07259929, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.708739352564946, + "language_loss": 0.78509629, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80836356, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 651, + "time_per_iteration": 2.4757516384124756 + }, + { + "auxiliary_loss_clip": 0.01255418, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_clip": 1.05004883, + "balance_loss_mlp": 1.0719974, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.7896665115169244, + "language_loss": 0.88031149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90369117, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 652, + "time_per_iteration": 2.4425668716430664 + }, + { + "auxiliary_loss_clip": 0.01249027, + "auxiliary_loss_mlp": 0.01069663, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.07108784, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.185528651545475, + "language_loss": 0.79044777, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363463, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.78125, + "step": 653, + "time_per_iteration": 2.5651934146881104 + }, + { + "auxiliary_loss_clip": 0.01264568, + "auxiliary_loss_mlp": 0.01070462, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.07603264, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.207303268368246, + "language_loss": 0.86304128, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88639158, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8828125, + "step": 654, + "time_per_iteration": 2.533297061920166 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01012751, + "balance_loss_clip": 1.00710094, + "balance_loss_mlp": 1.03246427, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7205066186016396, + "language_loss": 0.49900642, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5202843, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.82421875, + "step": 655, + "time_per_iteration": 3.1399919986724854 + }, + { + "auxiliary_loss_clip": 0.01251012, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.03866971, + "balance_loss_mlp": 1.07330465, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.4228021909793918, + "language_loss": 0.80845964, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83163846, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.78125, + "step": 656, + "time_per_iteration": 2.5063297748565674 + }, + { + "auxiliary_loss_clip": 0.01264211, + "auxiliary_loss_mlp": 0.0109165, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.07672703, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.8602268717749526, + "language_loss": 0.76602596, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78958458, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.875, + "step": 657, + "time_per_iteration": 2.4405555725097656 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01071967, + "balance_loss_clip": 1.04192615, + "balance_loss_mlp": 1.07452726, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.1526815744488945, + "language_loss": 0.81690443, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84020746, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.84375, + "step": 658, + "time_per_iteration": 2.5383949279785156 + }, + { + "auxiliary_loss_clip": 0.01252051, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.04091132, + "balance_loss_mlp": 1.07283425, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2075021313123777, + "language_loss": 0.91331315, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93656039, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.796875, + "step": 659, + "time_per_iteration": 2.4678854942321777 + }, + { + "auxiliary_loss_clip": 0.01259034, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.07427669, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.5412719342676215, + "language_loss": 0.79241848, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81567293, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 660, + "time_per_iteration": 2.5135834217071533 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.07534087, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6909533460123631, + "language_loss": 0.81942898, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84269351, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.796875, + "step": 661, + "time_per_iteration": 2.513702154159546 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01010967, + "balance_loss_clip": 1.00519753, + "balance_loss_mlp": 1.03039932, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9113020435813882, + "language_loss": 0.69376045, + "learning_rate": 3.998992585439272e-06, + "loss": 0.7149995, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.82421875, + "step": 662, + "time_per_iteration": 3.2435107231140137 + }, + { + "auxiliary_loss_clip": 0.01260063, + "auxiliary_loss_mlp": 0.01071537, + "balance_loss_clip": 1.04113865, + "balance_loss_mlp": 1.0779382, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.025040011333182, + "language_loss": 0.83253002, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85584599, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.8125, + "step": 663, + "time_per_iteration": 2.5213887691497803 + }, + { + "auxiliary_loss_clip": 0.01261822, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.032125, + "balance_loss_mlp": 1.07768416, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.8595031628608143, + "language_loss": 0.87538105, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89862621, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.84375, + "step": 664, + "time_per_iteration": 2.516810655593872 + }, + { + "auxiliary_loss_clip": 0.0125116, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.07347679, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.3519362819230625, + "language_loss": 0.84738994, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87050784, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.7734375, + "step": 665, + "time_per_iteration": 2.4348978996276855 + }, + { + "auxiliary_loss_clip": 0.01263346, + "auxiliary_loss_mlp": 0.01087391, + "balance_loss_clip": 1.05525231, + "balance_loss_mlp": 1.07680821, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.1279588772882687, + "language_loss": 0.81491798, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83842534, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.8671875, + "step": 666, + "time_per_iteration": 2.564187526702881 + }, + { + "auxiliary_loss_clip": 0.01252779, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.07225358, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.9939634291419526, + "language_loss": 0.87121451, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89449108, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.8046875, + "step": 667, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.07692444, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.627098567014159, + "language_loss": 0.80619991, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82938576, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7578125, + "step": 668, + "time_per_iteration": 2.441667079925537 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01011229, + "balance_loss_clip": 1.0053643, + "balance_loss_mlp": 1.02968836, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7872457900726799, + "language_loss": 0.60042131, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62164247, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.8125, + "step": 669, + "time_per_iteration": 3.200874090194702 + }, + { + "auxiliary_loss_clip": 0.01253738, + "auxiliary_loss_mlp": 0.0107276, + "balance_loss_clip": 1.0431962, + "balance_loss_mlp": 1.07228541, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7415828974469272, + "language_loss": 0.86405391, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88731897, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 670, + "time_per_iteration": 2.5169434547424316 + }, + { + "auxiliary_loss_clip": 0.0124964, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.0414381, + "balance_loss_mlp": 1.07305872, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.9261739939324196, + "language_loss": 0.752123, + "learning_rate": 3.998878276622692e-06, + "loss": 0.7753256, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.765625, + "step": 671, + "time_per_iteration": 2.514566421508789 + }, + { + "auxiliary_loss_clip": 0.01259516, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.04472136, + "balance_loss_mlp": 1.0774349, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0846907245314688, + "language_loss": 0.92279977, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94614637, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8203125, + "step": 672, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01253491, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_clip": 1.03921115, + "balance_loss_mlp": 1.07329202, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.816355722874097, + "language_loss": 0.90220857, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92545515, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.796875, + "step": 673, + "time_per_iteration": 2.450547456741333 + }, + { + "auxiliary_loss_clip": 0.01249229, + "auxiliary_loss_mlp": 0.01077482, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.07150948, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.117589951798075, + "language_loss": 0.74881005, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77207714, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.78125, + "step": 674, + "time_per_iteration": 2.5444436073303223 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01061202, + "balance_loss_clip": 1.03036261, + "balance_loss_mlp": 1.07609737, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.2422867770418797, + "language_loss": 0.78305578, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80627763, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 675, + "time_per_iteration": 2.4525954723358154 + }, + { + "auxiliary_loss_clip": 0.01252319, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.04578447, + "balance_loss_mlp": 1.07254028, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7072695919905723, + "language_loss": 0.76650077, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78981006, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.796875, + "step": 676, + "time_per_iteration": 2.530043840408325 + }, + { + "auxiliary_loss_clip": 0.01258388, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.0750767, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.3168648577819138, + "language_loss": 0.85182011, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87516803, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.828125, + "step": 677, + "time_per_iteration": 2.4390082359313965 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.03804517, + "balance_loss_mlp": 1.071486, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.7808730288109123, + "language_loss": 0.76348364, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78666306, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.78125, + "step": 678, + "time_per_iteration": 2.5151596069335938 + }, + { + "auxiliary_loss_clip": 0.01250603, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_clip": 1.03807509, + "balance_loss_mlp": 1.07162285, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9938089142752387, + "language_loss": 0.82114184, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84431279, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7890625, + "step": 679, + "time_per_iteration": 2.5701568126678467 + }, + { + "auxiliary_loss_clip": 0.01255726, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.07693028, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.893911305727382, + "language_loss": 0.76349533, + "learning_rate": 3.998757828196835e-06, + "loss": 0.7866298, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7890625, + "step": 680, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01255007, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.03305268, + "balance_loss_mlp": 1.07167506, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.7999776318515568, + "language_loss": 0.83315849, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.8563633, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 681, + "time_per_iteration": 2.5313305854797363 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.03302324, + "balance_loss_mlp": 1.07082057, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.6690976928218293, + "language_loss": 0.71312869, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73630697, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.828125, + "step": 682, + "time_per_iteration": 2.5190017223358154 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.07090235, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7744847161326498, + "language_loss": 0.72373003, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74692667, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8046875, + "step": 683, + "time_per_iteration": 2.473156690597534 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.01075324, + "balance_loss_clip": 1.04540253, + "balance_loss_mlp": 1.07707, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.316908811268422, + "language_loss": 0.81263745, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83589774, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 684, + "time_per_iteration": 5.34027099609375 + }, + { + "auxiliary_loss_clip": 0.01251905, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.07572865, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5327144156887007, + "language_loss": 0.90501672, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92825842, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.765625, + "step": 685, + "time_per_iteration": 3.918776750564575 + }, + { + "auxiliary_loss_clip": 0.01253389, + "auxiliary_loss_mlp": 0.010703, + "balance_loss_clip": 1.04002118, + "balance_loss_mlp": 1.07458997, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.0402082016953234, + "language_loss": 0.87871253, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90194941, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.7890625, + "step": 686, + "time_per_iteration": 2.481177806854248 + }, + { + "auxiliary_loss_clip": 0.01258153, + "auxiliary_loss_mlp": 0.01071669, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.07474661, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.7716861202834375, + "language_loss": 0.71645427, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73975253, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8359375, + "step": 687, + "time_per_iteration": 2.4720261096954346 + }, + { + "auxiliary_loss_clip": 0.01252382, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.04262209, + "balance_loss_mlp": 1.07918715, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.117746024922212, + "language_loss": 0.8642537, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88748431, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.734375, + "step": 688, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01249454, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07534754, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.6487514234328304, + "language_loss": 0.83326006, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85658503, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7421875, + "step": 689, + "time_per_iteration": 2.4689462184906006 + }, + { + "auxiliary_loss_clip": 0.01248134, + "auxiliary_loss_mlp": 0.01077255, + "balance_loss_clip": 1.04847789, + "balance_loss_mlp": 1.07176828, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.7821885346326607, + "language_loss": 0.68391848, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70717239, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.765625, + "step": 690, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.012458, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.04197323, + "balance_loss_mlp": 1.07094526, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.747700039366933, + "language_loss": 0.74933273, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77250373, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 691, + "time_per_iteration": 2.4566729068756104 + }, + { + "auxiliary_loss_clip": 0.01246178, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04890203, + "balance_loss_mlp": 1.07268727, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.450885846250815, + "language_loss": 0.84518701, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86843991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.734375, + "step": 692, + "time_per_iteration": 2.4667932987213135 + }, + { + "auxiliary_loss_clip": 0.01252043, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.07099986, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 9.166238009589804, + "language_loss": 0.89107299, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9143213, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.8125, + "step": 693, + "time_per_iteration": 2.4823052883148193 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.03637171, + "balance_loss_mlp": 1.07755136, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.1462970179067646, + "language_loss": 0.82179356, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84500182, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 694, + "time_per_iteration": 2.564098834991455 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.07214785, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.057768586122239, + "language_loss": 0.83656573, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85977334, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 695, + "time_per_iteration": 2.5122969150543213 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01073319, + "balance_loss_clip": 1.04270577, + "balance_loss_mlp": 1.07313716, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.138642052855673, + "language_loss": 0.8441087, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86734056, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.765625, + "step": 696, + "time_per_iteration": 2.462756872177124 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.01073791, + "balance_loss_clip": 1.04253471, + "balance_loss_mlp": 1.07146811, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.042298821772003, + "language_loss": 0.93134123, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95455778, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.765625, + "step": 697, + "time_per_iteration": 2.5189502239227295 + }, + { + "auxiliary_loss_clip": 0.0124398, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.07146859, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.2837511795811207, + "language_loss": 0.83989406, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86302388, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.71875, + "step": 698, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.01247569, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.05551505, + "balance_loss_mlp": 1.0711751, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9405760650289445, + "language_loss": 0.91369909, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93704206, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.7578125, + "step": 699, + "time_per_iteration": 2.4667766094207764 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.01842487, + "balance_loss_mlp": 1.03384757, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8964375713204716, + "language_loss": 0.67850006, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69987792, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.796875, + "step": 700, + "time_per_iteration": 3.1214911937713623 + }, + { + "auxiliary_loss_clip": 0.01254452, + "auxiliary_loss_mlp": 0.01078478, + "balance_loss_clip": 1.04695964, + "balance_loss_mlp": 1.07502532, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.6789371965697524, + "language_loss": 0.89020562, + "learning_rate": 3.998452907725016e-06, + "loss": 0.913535, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 701, + "time_per_iteration": 2.46085524559021 + }, + { + "auxiliary_loss_clip": 0.01250018, + "auxiliary_loss_mlp": 0.0107128, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.07681179, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2592774096130794, + "language_loss": 0.67494118, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69815421, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 702, + "time_per_iteration": 2.5170979499816895 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01006834, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.03296542, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8426087453226233, + "language_loss": 0.60777819, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62897617, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.80078125, + "step": 703, + "time_per_iteration": 3.155794143676758 + }, + { + "auxiliary_loss_clip": 0.01112196, + "auxiliary_loss_mlp": 0.01010352, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 1.03251982, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167549333074237, + "language_loss": 0.5776214, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.796875, + "step": 704, + "time_per_iteration": 2.95633602142334 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01070665, + "balance_loss_clip": 1.0397656, + "balance_loss_mlp": 1.07432342, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1970745802550624, + "language_loss": 0.87708455, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90031266, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 705, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.01238458, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.06876624, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7054575923778923, + "language_loss": 0.71612352, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73913229, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 706, + "time_per_iteration": 2.464270830154419 + }, + { + "auxiliary_loss_clip": 0.01243119, + "auxiliary_loss_mlp": 0.01068207, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.07029784, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0927829932503714, + "language_loss": 0.93480223, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95791554, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 707, + "time_per_iteration": 2.5087966918945312 + }, + { + "auxiliary_loss_clip": 0.01245928, + "auxiliary_loss_mlp": 0.01065311, + "balance_loss_clip": 1.03441203, + "balance_loss_mlp": 1.0676806, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.3244890877745883, + "language_loss": 0.81275034, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83586276, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 708, + "time_per_iteration": 2.557119607925415 + }, + { + "auxiliary_loss_clip": 0.01251091, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_clip": 1.04239082, + "balance_loss_mlp": 1.07195199, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.2553269788690224, + "language_loss": 0.82229173, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84553528, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.796875, + "step": 709, + "time_per_iteration": 2.4828600883483887 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.01064315, + "balance_loss_clip": 1.03389335, + "balance_loss_mlp": 1.07517564, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.534138916450152, + "language_loss": 0.85063422, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87383747, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8125, + "step": 710, + "time_per_iteration": 2.453641653060913 + }, + { + "auxiliary_loss_clip": 0.01254724, + "auxiliary_loss_mlp": 0.01070713, + "balance_loss_clip": 1.04114938, + "balance_loss_mlp": 1.07757199, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.316207411440496, + "language_loss": 0.84996349, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87321782, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7734375, + "step": 711, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01246695, + "auxiliary_loss_mlp": 0.01069917, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.07044697, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 2.000925777751644, + "language_loss": 0.85439169, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87755781, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.765625, + "step": 712, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.0107294, + "balance_loss_clip": 1.0445205, + "balance_loss_mlp": 1.0701685, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.2453781921901728, + "language_loss": 0.90829903, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9315542, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8203125, + "step": 713, + "time_per_iteration": 2.4908998012542725 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01017546, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.0288384, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8777811618173876, + "language_loss": 0.63746506, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65872955, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.80078125, + "step": 714, + "time_per_iteration": 3.158921480178833 + }, + { + "auxiliary_loss_clip": 0.01249012, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_clip": 1.05076694, + "balance_loss_mlp": 1.07545531, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.1622955343434382, + "language_loss": 0.74528754, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76858354, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 715, + "time_per_iteration": 2.5759642124176025 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.07450986, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2108029839954213, + "language_loss": 0.72630137, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74957311, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7578125, + "step": 716, + "time_per_iteration": 2.5973668098449707 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.02687418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9052113850529176, + "language_loss": 0.65557301, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67669141, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.78125, + "step": 717, + "time_per_iteration": 3.114957571029663 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 0.99780369, + "balance_loss_mlp": 1.02667391, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9880116621267147, + "language_loss": 0.58762264, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60871184, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.78125, + "step": 718, + "time_per_iteration": 2.910278797149658 + }, + { + "auxiliary_loss_clip": 0.01248398, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.04314709, + "balance_loss_mlp": 1.0758605, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8513004644505335, + "language_loss": 0.91198725, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93521935, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7265625, + "step": 719, + "time_per_iteration": 2.492509126663208 + }, + { + "auxiliary_loss_clip": 0.01244347, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.04208493, + "balance_loss_mlp": 1.06931555, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.803377327315558, + "language_loss": 0.66468138, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68783891, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 720, + "time_per_iteration": 2.6061203479766846 + }, + { + "auxiliary_loss_clip": 0.01244682, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.04895782, + "balance_loss_mlp": 1.07152998, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.8832143461121282, + "language_loss": 0.77743989, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80068195, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 721, + "time_per_iteration": 2.5255632400512695 + }, + { + "auxiliary_loss_clip": 0.01251204, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.05879569, + "balance_loss_mlp": 1.07584524, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.027898330451403, + "language_loss": 0.87873065, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90212011, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.75, + "step": 722, + "time_per_iteration": 2.536283493041992 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.01075404, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.0758208, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 30.376200688873947, + "language_loss": 0.84770942, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87099999, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 723, + "time_per_iteration": 2.5167360305786133 + }, + { + "auxiliary_loss_clip": 0.01256754, + "auxiliary_loss_mlp": 0.01076494, + "balance_loss_clip": 1.04638171, + "balance_loss_mlp": 1.07828176, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.9203333396820472, + "language_loss": 0.82793808, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85127056, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.78125, + "step": 724, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.0125067, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_clip": 1.05975556, + "balance_loss_mlp": 1.07561088, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.8200683460759586, + "language_loss": 0.79530561, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81871551, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.75, + "step": 725, + "time_per_iteration": 2.4551918506622314 + }, + { + "auxiliary_loss_clip": 0.0126067, + "auxiliary_loss_mlp": 0.01076358, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.07715642, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.8106150104808485, + "language_loss": 0.87100697, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89437729, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.8359375, + "step": 726, + "time_per_iteration": 5.350574731826782 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01011263, + "balance_loss_clip": 1.00542223, + "balance_loss_mlp": 1.02866364, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9088619113669424, + "language_loss": 0.5587045, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57988632, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.78125, + "step": 727, + "time_per_iteration": 3.1539440155029297 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01076851, + "balance_loss_clip": 1.04676282, + "balance_loss_mlp": 1.07453549, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.397861957488019, + "language_loss": 0.82248902, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84576982, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.765625, + "step": 728, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01244631, + "auxiliary_loss_mlp": 0.01068516, + "balance_loss_clip": 1.0395956, + "balance_loss_mlp": 1.07265663, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2715062050859745, + "language_loss": 0.77187145, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79500294, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.71875, + "step": 729, + "time_per_iteration": 2.5091514587402344 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_clip": 1.04502177, + "balance_loss_mlp": 1.07452357, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.258754879989397, + "language_loss": 0.9515503, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97482038, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.78125, + "step": 730, + "time_per_iteration": 2.4795522689819336 + }, + { + "auxiliary_loss_clip": 0.0124716, + "auxiliary_loss_mlp": 0.01065838, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.07000017, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.2097226025839483, + "language_loss": 0.88016784, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90329784, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.7734375, + "step": 731, + "time_per_iteration": 2.4678709506988525 + }, + { + "auxiliary_loss_clip": 0.01251191, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.07521737, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.3707184473936587, + "language_loss": 0.88656235, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90980744, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7578125, + "step": 732, + "time_per_iteration": 2.4135851860046387 + }, + { + "auxiliary_loss_clip": 0.01251086, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.04288554, + "balance_loss_mlp": 1.07443762, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.131822645051773, + "language_loss": 0.86010063, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88334322, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.765625, + "step": 733, + "time_per_iteration": 2.491278648376465 + }, + { + "auxiliary_loss_clip": 0.01256254, + "auxiliary_loss_mlp": 0.01078649, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.07624841, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.0564057381838885, + "language_loss": 0.91515708, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93850613, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 734, + "time_per_iteration": 2.451258897781372 + }, + { + "auxiliary_loss_clip": 0.01247796, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.03696656, + "balance_loss_mlp": 1.07613921, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8863467898976456, + "language_loss": 0.77831066, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8014316, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.71875, + "step": 735, + "time_per_iteration": 2.558958053588867 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.03526342, + "balance_loss_mlp": 1.06886315, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.1337917025346074, + "language_loss": 0.88456166, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90760267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 736, + "time_per_iteration": 2.5100033283233643 + }, + { + "auxiliary_loss_clip": 0.01242163, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.07473993, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.954630170969084, + "language_loss": 0.84155536, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86464787, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 737, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01251899, + "auxiliary_loss_mlp": 0.01072468, + "balance_loss_clip": 1.04077065, + "balance_loss_mlp": 1.07667851, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0376910697928947, + "language_loss": 0.8518666, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87511027, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.75, + "step": 738, + "time_per_iteration": 2.5576610565185547 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04048622, + "balance_loss_mlp": 1.03298163, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8684121686227821, + "language_loss": 0.59110028, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61268163, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.7890625, + "step": 739, + "time_per_iteration": 3.0643718242645264 + }, + { + "auxiliary_loss_clip": 0.0124678, + "auxiliary_loss_mlp": 0.01070548, + "balance_loss_clip": 1.04220033, + "balance_loss_mlp": 1.07513726, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1860888775648695, + "language_loss": 0.91622591, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93939924, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.71875, + "step": 740, + "time_per_iteration": 2.5448389053344727 + }, + { + "auxiliary_loss_clip": 0.01252276, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.07766986, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.01644947055736, + "language_loss": 0.71842492, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.7421875, + "step": 741, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01240373, + "auxiliary_loss_mlp": 0.01073056, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.07044411, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.722907957661965, + "language_loss": 0.88555831, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9086926, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.703125, + "step": 742, + "time_per_iteration": 2.6367549896240234 + }, + { + "auxiliary_loss_clip": 0.0124233, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.07209873, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0306401320231693, + "language_loss": 0.83823264, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86127412, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.703125, + "step": 743, + "time_per_iteration": 2.516587972640991 + }, + { + "auxiliary_loss_clip": 0.01249271, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.07474804, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 3.0889105946672704, + "language_loss": 0.79948521, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8226651, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.75, + "step": 744, + "time_per_iteration": 2.44805645942688 + }, + { + "auxiliary_loss_clip": 0.01243449, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.04037201, + "balance_loss_mlp": 1.07279778, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.388036535067576, + "language_loss": 0.85400093, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87710881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.703125, + "step": 745, + "time_per_iteration": 2.4790430068969727 + }, + { + "auxiliary_loss_clip": 0.01242131, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.0714339, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.1215702602167688, + "language_loss": 0.6866799, + "learning_rate": 3.997686978575302e-06, + "loss": 0.70974648, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.7109375, + "step": 746, + "time_per_iteration": 2.5645759105682373 + }, + { + "auxiliary_loss_clip": 0.01250748, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.04748797, + "balance_loss_mlp": 1.0783143, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.1376273799467547, + "language_loss": 0.68823957, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71152306, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 747, + "time_per_iteration": 2.5267317295074463 + }, + { + "auxiliary_loss_clip": 0.01248685, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.07314527, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9669744064389407, + "language_loss": 0.66721869, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69050002, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.75, + "step": 748, + "time_per_iteration": 2.4818925857543945 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.01082391, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.07779491, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.650057046326624, + "language_loss": 0.76540357, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78872949, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.7265625, + "step": 749, + "time_per_iteration": 2.4454426765441895 + }, + { + "auxiliary_loss_clip": 0.01251335, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.0770005, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.0345099055640317, + "language_loss": 0.88970172, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91298997, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 750, + "time_per_iteration": 2.458716630935669 + }, + { + "auxiliary_loss_clip": 0.01247033, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.07139015, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.3716924268159367, + "language_loss": 0.74869245, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77190608, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.7578125, + "step": 751, + "time_per_iteration": 2.5231218338012695 + }, + { + "auxiliary_loss_clip": 0.01243504, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.04573071, + "balance_loss_mlp": 1.07175446, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.2224468826240975, + "language_loss": 0.69360238, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7167744, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 752, + "time_per_iteration": 2.4620048999786377 + }, + { + "auxiliary_loss_clip": 0.01244736, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.07154715, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.984649176219999, + "language_loss": 0.91634125, + "learning_rate": 3.997554045527305e-06, + "loss": 0.9393605, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.734375, + "step": 753, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01249124, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04728031, + "balance_loss_mlp": 1.07501864, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2056938633592975, + "language_loss": 0.91197902, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93522525, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.7421875, + "step": 754, + "time_per_iteration": 2.472975492477417 + }, + { + "auxiliary_loss_clip": 0.01238249, + "auxiliary_loss_mlp": 0.0107062, + "balance_loss_clip": 1.04144955, + "balance_loss_mlp": 1.07163191, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.234660546964849, + "language_loss": 0.78528345, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80837214, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.671875, + "step": 755, + "time_per_iteration": 2.4704174995422363 + }, + { + "auxiliary_loss_clip": 0.01248815, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.05100918, + "balance_loss_mlp": 1.07416105, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9667934561660614, + "language_loss": 0.78451371, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80779994, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.75, + "step": 756, + "time_per_iteration": 2.4873547554016113 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.010118, + "balance_loss_clip": 1.00600612, + "balance_loss_mlp": 1.03558636, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8118987787253854, + "language_loss": 0.62730747, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64860779, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.828125, + "step": 757, + "time_per_iteration": 3.1292941570281982 + }, + { + "auxiliary_loss_clip": 0.01242797, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_clip": 1.04220784, + "balance_loss_mlp": 1.0731318, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5194495460848947, + "language_loss": 0.84329176, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86641645, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 758, + "time_per_iteration": 2.498905658721924 + }, + { + "auxiliary_loss_clip": 0.01237511, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.06733441, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.0933163310434963, + "language_loss": 0.88315606, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90620202, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 759, + "time_per_iteration": 2.5122711658477783 + }, + { + "auxiliary_loss_clip": 0.01248241, + "auxiliary_loss_mlp": 0.01075804, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.075526, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.170817451496144, + "language_loss": 0.73644727, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75968778, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7265625, + "step": 760, + "time_per_iteration": 2.511322021484375 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.04630077, + "balance_loss_mlp": 1.07509935, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.192184725657734, + "language_loss": 0.82177126, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84495443, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6953125, + "step": 761, + "time_per_iteration": 2.4831535816192627 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.06961203, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7986428347309282, + "language_loss": 0.79732436, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82041955, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6953125, + "step": 762, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.03112733, + "balance_loss_mlp": 1.03455913, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.008821564963746, + "language_loss": 0.58659625, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60813344, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.82421875, + "step": 763, + "time_per_iteration": 3.1429429054260254 + }, + { + "auxiliary_loss_clip": 0.01245459, + "auxiliary_loss_mlp": 0.01081866, + "balance_loss_clip": 1.05381632, + "balance_loss_mlp": 1.07288039, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8717486924500517, + "language_loss": 0.87752867, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.9008019, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.7265625, + "step": 764, + "time_per_iteration": 2.4727554321289062 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01079864, + "balance_loss_clip": 1.05192137, + "balance_loss_mlp": 1.07565248, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9426139778845304, + "language_loss": 0.86118066, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88445938, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 765, + "time_per_iteration": 2.5370731353759766 + }, + { + "auxiliary_loss_clip": 0.01248646, + "auxiliary_loss_mlp": 0.01070241, + "balance_loss_clip": 1.03912735, + "balance_loss_mlp": 1.07336497, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.0624701923152453, + "language_loss": 0.87846982, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90165865, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.75, + "step": 766, + "time_per_iteration": 2.475677013397217 + }, + { + "auxiliary_loss_clip": 0.01239894, + "auxiliary_loss_mlp": 0.01067957, + "balance_loss_clip": 1.03979921, + "balance_loss_mlp": 1.06896472, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.616885530601855, + "language_loss": 0.84314167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86622024, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 767, + "time_per_iteration": 2.465449810028076 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.01060911, + "balance_loss_clip": 1.03249097, + "balance_loss_mlp": 1.07569289, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.056931367891973, + "language_loss": 0.87013769, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89320099, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.703125, + "step": 768, + "time_per_iteration": 5.441957235336304 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.01069073, + "balance_loss_clip": 1.04184508, + "balance_loss_mlp": 1.06768477, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.1199205591749033, + "language_loss": 0.75022334, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77329946, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.703125, + "step": 769, + "time_per_iteration": 2.5294675827026367 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.03734684, + "balance_loss_mlp": 1.07084632, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.886534334963383, + "language_loss": 0.86162585, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88464236, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.671875, + "step": 770, + "time_per_iteration": 2.4646449089050293 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0385015, + "balance_loss_mlp": 1.07160687, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.8625416592988477, + "language_loss": 0.87259042, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89573061, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.765625, + "step": 771, + "time_per_iteration": 2.512622117996216 + }, + { + "auxiliary_loss_clip": 0.01246333, + "auxiliary_loss_mlp": 0.01076832, + "balance_loss_clip": 1.04726815, + "balance_loss_mlp": 1.06911707, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.3640102097360587, + "language_loss": 0.83736801, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86059964, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 772, + "time_per_iteration": 2.509572982788086 + }, + { + "auxiliary_loss_clip": 0.01243608, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.07392263, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3097217333215694, + "language_loss": 0.73399591, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75707257, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 773, + "time_per_iteration": 2.5539331436157227 + }, + { + "auxiliary_loss_clip": 0.01240234, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_clip": 1.03624654, + "balance_loss_mlp": 1.07288945, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.066531290075925, + "language_loss": 0.78523052, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80828828, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.671875, + "step": 774, + "time_per_iteration": 2.5350210666656494 + }, + { + "auxiliary_loss_clip": 0.01239038, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.03955007, + "balance_loss_mlp": 1.07101154, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.187480231527322, + "language_loss": 0.73357666, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75662553, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6796875, + "step": 775, + "time_per_iteration": 2.6102981567382812 + }, + { + "auxiliary_loss_clip": 0.01240703, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.03096998, + "balance_loss_mlp": 1.06996655, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.5904648869830247, + "language_loss": 0.77037287, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79337239, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.703125, + "step": 776, + "time_per_iteration": 2.4713642597198486 + }, + { + "auxiliary_loss_clip": 0.01236202, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03790653, + "balance_loss_mlp": 1.06914115, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9751950676431418, + "language_loss": 0.70967531, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73267508, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.671875, + "step": 777, + "time_per_iteration": 2.540151596069336 + }, + { + "auxiliary_loss_clip": 0.01242182, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.07221043, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9852588200641685, + "language_loss": 0.76756501, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79076868, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 778, + "time_per_iteration": 2.5299642086029053 + }, + { + "auxiliary_loss_clip": 0.01236882, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.06948996, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.9364819041983576, + "language_loss": 0.78900939, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81206226, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.671875, + "step": 779, + "time_per_iteration": 2.4999477863311768 + }, + { + "auxiliary_loss_clip": 0.01230899, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.03997588, + "balance_loss_mlp": 1.06776333, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.7037291099106273, + "language_loss": 0.77051055, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7934795, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 780, + "time_per_iteration": 2.54770565032959 + }, + { + "auxiliary_loss_clip": 0.01235667, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.07070863, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.6789342331958745, + "language_loss": 0.76432645, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6484375, + "step": 781, + "time_per_iteration": 2.5040361881256104 + }, + { + "auxiliary_loss_clip": 0.01245917, + "auxiliary_loss_mlp": 0.01072818, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.07423282, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2171800145032736, + "language_loss": 0.74027473, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76346207, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 782, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.01241991, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.07483578, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 6.219089205177081, + "language_loss": 0.8032757, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82630414, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.671875, + "step": 783, + "time_per_iteration": 2.4866061210632324 + }, + { + "auxiliary_loss_clip": 0.01232605, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.03417742, + "balance_loss_mlp": 1.07062817, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0172272756643816, + "language_loss": 0.81289953, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83582091, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 784, + "time_per_iteration": 2.476659059524536 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.04597473, + "balance_loss_mlp": 1.0683856, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.171254656371271, + "language_loss": 0.8076694, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83078098, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 785, + "time_per_iteration": 2.493598461151123 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0762614, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.444819858404617, + "language_loss": 0.89981294, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92284781, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.65625, + "step": 786, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.012413, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.06742501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.379381752409287, + "language_loss": 0.76639462, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78950763, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.734375, + "step": 787, + "time_per_iteration": 2.447611093521118 + }, + { + "auxiliary_loss_clip": 0.01247236, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.04431772, + "balance_loss_mlp": 1.0765723, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.4642209511959403, + "language_loss": 0.80851126, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83170098, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7109375, + "step": 788, + "time_per_iteration": 2.4679956436157227 + }, + { + "auxiliary_loss_clip": 0.01236983, + "auxiliary_loss_mlp": 0.01074337, + "balance_loss_clip": 1.04551244, + "balance_loss_mlp": 1.07285857, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.2318634793178127, + "language_loss": 0.84819949, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87131274, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.640625, + "step": 789, + "time_per_iteration": 2.4982516765594482 + }, + { + "auxiliary_loss_clip": 0.01242053, + "auxiliary_loss_mlp": 0.01066276, + "balance_loss_clip": 1.04006219, + "balance_loss_mlp": 1.07367456, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.948517450129577, + "language_loss": 0.82196069, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84504396, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6875, + "step": 790, + "time_per_iteration": 2.4380602836608887 + }, + { + "auxiliary_loss_clip": 0.01236299, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.03524029, + "balance_loss_mlp": 1.06857598, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.8806939749630054, + "language_loss": 0.88245451, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90544093, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 791, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01239952, + "auxiliary_loss_mlp": 0.010655, + "balance_loss_clip": 1.03826034, + "balance_loss_mlp": 1.07212687, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.2584516419561464, + "language_loss": 0.90245461, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92550921, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 792, + "time_per_iteration": 2.4627771377563477 + }, + { + "auxiliary_loss_clip": 0.01241845, + "auxiliary_loss_mlp": 0.01074856, + "balance_loss_clip": 1.04874945, + "balance_loss_mlp": 1.07157969, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9386484459236437, + "language_loss": 0.7310667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 793, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01067209, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.07207203, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0117940746735123, + "language_loss": 0.86102074, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88411266, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.703125, + "step": 794, + "time_per_iteration": 2.510611057281494 + }, + { + "auxiliary_loss_clip": 0.0123999, + "auxiliary_loss_mlp": 0.01074174, + "balance_loss_clip": 1.04701805, + "balance_loss_mlp": 1.06925917, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.4118642482115384, + "language_loss": 0.69812739, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72126907, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.7109375, + "step": 795, + "time_per_iteration": 2.500420093536377 + }, + { + "auxiliary_loss_clip": 0.01236981, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.0432204, + "balance_loss_mlp": 1.06999111, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.0479238599532135, + "language_loss": 0.81053579, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 796, + "time_per_iteration": 2.4838409423828125 + }, + { + "auxiliary_loss_clip": 0.0124002, + "auxiliary_loss_mlp": 0.01058331, + "balance_loss_clip": 1.03129458, + "balance_loss_mlp": 1.07190371, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.3456590334750858, + "language_loss": 0.81249642, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83547997, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6796875, + "step": 797, + "time_per_iteration": 2.466343402862549 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.0100279, + "balance_loss_clip": 0.9972828, + "balance_loss_mlp": 1.03672731, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9120921080635288, + "language_loss": 0.64447635, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66572458, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.8515625, + "step": 798, + "time_per_iteration": 3.0081863403320312 + }, + { + "auxiliary_loss_clip": 0.01243937, + "auxiliary_loss_mlp": 0.01070197, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.06894708, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 7.0153313624744005, + "language_loss": 0.90794134, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93108267, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.75, + "step": 799, + "time_per_iteration": 2.4872424602508545 + }, + { + "auxiliary_loss_clip": 0.01242621, + "auxiliary_loss_mlp": 0.01069655, + "balance_loss_clip": 1.04220068, + "balance_loss_mlp": 1.07567, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.1467314479540818, + "language_loss": 0.86701, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89013278, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 800, + "time_per_iteration": 2.477720022201538 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.0106979, + "balance_loss_clip": 1.04362369, + "balance_loss_mlp": 1.07207572, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 7.517902152046504, + "language_loss": 0.84513009, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86826313, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.71875, + "step": 801, + "time_per_iteration": 2.487889528274536 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.07289147, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.97564705550146, + "language_loss": 0.79967415, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82280934, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 802, + "time_per_iteration": 2.6496224403381348 + }, + { + "auxiliary_loss_clip": 0.01238875, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_clip": 1.03963101, + "balance_loss_mlp": 1.07270598, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.8331626885697725, + "language_loss": 0.86420751, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88725173, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 803, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01238315, + "auxiliary_loss_mlp": 0.01061166, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.07398677, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.229653749186784, + "language_loss": 0.85436332, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87735808, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 804, + "time_per_iteration": 2.458303213119507 + }, + { + "auxiliary_loss_clip": 0.01239413, + "auxiliary_loss_mlp": 0.01066878, + "balance_loss_clip": 1.04099822, + "balance_loss_mlp": 1.07286024, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.2509331098011645, + "language_loss": 0.86119306, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88425595, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 805, + "time_per_iteration": 2.5021419525146484 + }, + { + "auxiliary_loss_clip": 0.01235031, + "auxiliary_loss_mlp": 0.01067273, + "balance_loss_clip": 1.04115391, + "balance_loss_mlp": 1.06942892, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8866019303880346, + "language_loss": 0.68034315, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70336622, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.65625, + "step": 806, + "time_per_iteration": 2.4904568195343018 + }, + { + "auxiliary_loss_clip": 0.01235579, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.03847933, + "balance_loss_mlp": 1.07208037, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.221107161276338, + "language_loss": 0.7716608, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79466188, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 807, + "time_per_iteration": 2.498624563217163 + }, + { + "auxiliary_loss_clip": 0.01232532, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.06831741, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.844770488216335, + "language_loss": 0.86509991, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88814163, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.640625, + "step": 808, + "time_per_iteration": 2.444673538208008 + }, + { + "auxiliary_loss_clip": 0.01242847, + "auxiliary_loss_mlp": 0.01070908, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.07261682, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9428867449931826, + "language_loss": 0.90154302, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92468053, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 809, + "time_per_iteration": 5.353702545166016 + }, + { + "auxiliary_loss_clip": 0.01242102, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.07577538, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.12821080633451, + "language_loss": 0.84360719, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86672825, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 810, + "time_per_iteration": 3.8935022354125977 + }, + { + "auxiliary_loss_clip": 0.01240735, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.07189715, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7610993085905569, + "language_loss": 0.80875039, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8318274, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6875, + "step": 811, + "time_per_iteration": 2.5000643730163574 + }, + { + "auxiliary_loss_clip": 0.01232044, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.04991412, + "balance_loss_mlp": 1.06997907, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.0417171226218715, + "language_loss": 0.74768531, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77075845, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.625, + "step": 812, + "time_per_iteration": 2.4853179454803467 + }, + { + "auxiliary_loss_clip": 0.01233917, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.07263327, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8904091966919716, + "language_loss": 0.89845109, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92153537, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 813, + "time_per_iteration": 2.6731016635894775 + }, + { + "auxiliary_loss_clip": 0.01232344, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.07083082, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.028357820963791, + "language_loss": 0.74551463, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76842451, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.6171875, + "step": 814, + "time_per_iteration": 2.509963035583496 + }, + { + "auxiliary_loss_clip": 0.01235531, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.07073569, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.3605733083261464, + "language_loss": 0.83740532, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86043149, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6484375, + "step": 815, + "time_per_iteration": 2.5490894317626953 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.03396082, + "balance_loss_mlp": 1.07326484, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.271155414035229, + "language_loss": 0.90803105, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93103218, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6640625, + "step": 816, + "time_per_iteration": 2.5273053646087646 + }, + { + "auxiliary_loss_clip": 0.01240454, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.0732162, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 3.2321750342473603, + "language_loss": 0.79924619, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.671875, + "step": 817, + "time_per_iteration": 2.5095019340515137 + }, + { + "auxiliary_loss_clip": 0.0123455, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.04864395, + "balance_loss_mlp": 1.07184172, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8264850687392937, + "language_loss": 0.84520394, + "learning_rate": 3.996142453363656e-06, + "loss": 0.86829674, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 818, + "time_per_iteration": 2.5476157665252686 + }, + { + "auxiliary_loss_clip": 0.01243386, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.07401037, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.779535734169796, + "language_loss": 0.75307131, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77617967, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6953125, + "step": 819, + "time_per_iteration": 2.5486624240875244 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.07577193, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.1475545017813853, + "language_loss": 0.85166955, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87468207, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.640625, + "step": 820, + "time_per_iteration": 2.4565298557281494 + }, + { + "auxiliary_loss_clip": 0.0123627, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03602266, + "balance_loss_mlp": 1.07061315, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.902695357085614, + "language_loss": 0.9041872, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92716837, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.65625, + "step": 821, + "time_per_iteration": 2.5412514209747314 + }, + { + "auxiliary_loss_clip": 0.01233424, + "auxiliary_loss_mlp": 0.01073041, + "balance_loss_clip": 1.04773307, + "balance_loss_mlp": 1.06951392, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0531707528144274, + "language_loss": 0.8941884, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91725308, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.640625, + "step": 822, + "time_per_iteration": 2.5171031951904297 + }, + { + "auxiliary_loss_clip": 0.01237258, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.03295374, + "balance_loss_mlp": 1.0742538, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.060390808888412, + "language_loss": 0.67537785, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69834983, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 823, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01120581, + "auxiliary_loss_mlp": 0.01008389, + "balance_loss_clip": 1.00323892, + "balance_loss_mlp": 1.04174662, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3777513990451415, + "language_loss": 0.62206292, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64335263, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.7890625, + "step": 824, + "time_per_iteration": 3.13708758354187 + }, + { + "auxiliary_loss_clip": 0.01240025, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.07293963, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.021638376413324, + "language_loss": 0.90364408, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92674464, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.671875, + "step": 825, + "time_per_iteration": 2.519487142562866 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01064311, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.0713625, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 23.06748840114486, + "language_loss": 0.66790086, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69091535, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.65625, + "step": 826, + "time_per_iteration": 2.486837387084961 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.07166433, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.4656671498779845, + "language_loss": 0.78386623, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80685055, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.640625, + "step": 827, + "time_per_iteration": 2.517092704772949 + }, + { + "auxiliary_loss_clip": 0.0124052, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.07333767, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.8940457048653916, + "language_loss": 0.78592682, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80905491, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.671875, + "step": 828, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01227721, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.06777728, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.330577425067274, + "language_loss": 0.83493364, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 829, + "time_per_iteration": 2.5744268894195557 + }, + { + "auxiliary_loss_clip": 0.01235678, + "auxiliary_loss_mlp": 0.01073434, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.07021666, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.2375926111489743, + "language_loss": 0.75055873, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.65625, + "step": 830, + "time_per_iteration": 2.5045461654663086 + }, + { + "auxiliary_loss_clip": 0.01233457, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.06966341, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.962979792887244, + "language_loss": 0.79379636, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81679052, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 831, + "time_per_iteration": 2.5924267768859863 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01070014, + "balance_loss_clip": 1.04487276, + "balance_loss_mlp": 1.07213569, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.758266217871517, + "language_loss": 0.91538632, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93846321, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.65625, + "step": 832, + "time_per_iteration": 2.653150796890259 + }, + { + "auxiliary_loss_clip": 0.01230534, + "auxiliary_loss_mlp": 0.01081981, + "balance_loss_clip": 1.05747163, + "balance_loss_mlp": 1.07053018, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9700093948003867, + "language_loss": 0.83139837, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85452354, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 833, + "time_per_iteration": 2.73848819732666 + }, + { + "auxiliary_loss_clip": 0.0123523, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.06913459, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.433665596415918, + "language_loss": 0.8254565, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84839165, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.65625, + "step": 834, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01236789, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.07138014, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.099554255469436, + "language_loss": 0.91758966, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94059587, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 835, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.0123437, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.06699944, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.4903656252358735, + "language_loss": 0.76346481, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78652561, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.671875, + "step": 836, + "time_per_iteration": 2.4839258193969727 + }, + { + "auxiliary_loss_clip": 0.01229978, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.07100809, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1380784235063066, + "language_loss": 0.8360337, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85906136, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5859375, + "step": 837, + "time_per_iteration": 2.5140485763549805 + }, + { + "auxiliary_loss_clip": 0.01233502, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.07245386, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.225982034212064, + "language_loss": 0.73137468, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75436556, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 838, + "time_per_iteration": 2.5128419399261475 + }, + { + "auxiliary_loss_clip": 0.01229023, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.06636167, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.886796600099776, + "language_loss": 0.83328462, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85625362, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 839, + "time_per_iteration": 2.499415874481201 + }, + { + "auxiliary_loss_clip": 0.01228207, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_clip": 1.04128349, + "balance_loss_mlp": 1.06866539, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.2056506497336765, + "language_loss": 0.85777193, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8807205, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 840, + "time_per_iteration": 2.522038698196411 + }, + { + "auxiliary_loss_clip": 0.01235877, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.07246661, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.034102412822674, + "language_loss": 0.94658732, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96958393, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 841, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01234454, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.07130527, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.030819255438432, + "language_loss": 0.77387047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79687953, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6328125, + "step": 842, + "time_per_iteration": 2.6253628730773926 + }, + { + "auxiliary_loss_clip": 0.01238804, + "auxiliary_loss_mlp": 0.01067813, + "balance_loss_clip": 1.041659, + "balance_loss_mlp": 1.07278991, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.283727909175907, + "language_loss": 0.78014457, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80321074, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6640625, + "step": 843, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.01237695, + "auxiliary_loss_mlp": 0.01061566, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.07266212, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.249210505837228, + "language_loss": 0.82952344, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85251611, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.6484375, + "step": 844, + "time_per_iteration": 2.6476500034332275 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.03935087, + "balance_loss_mlp": 1.06871867, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.3236550986537368, + "language_loss": 0.76042783, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78337395, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 845, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01075672, + "balance_loss_clip": 1.04924285, + "balance_loss_mlp": 1.06694174, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.2528566199281905, + "language_loss": 0.87468004, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89773357, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 846, + "time_per_iteration": 2.5271859169006348 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.04521692, + "balance_loss_mlp": 1.06982791, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.95159927266484, + "language_loss": 0.87571466, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89872456, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 847, + "time_per_iteration": 2.4566030502319336 + }, + { + "auxiliary_loss_clip": 0.01226009, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.04489946, + "balance_loss_mlp": 1.06883907, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.141846591022022, + "language_loss": 0.81706643, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84003675, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5703125, + "step": 848, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0123182, + "auxiliary_loss_mlp": 0.01077851, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.07167053, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.898868752622741, + "language_loss": 0.87266076, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89575738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 849, + "time_per_iteration": 2.5472936630249023 + }, + { + "auxiliary_loss_clip": 0.0122487, + "auxiliary_loss_mlp": 0.01062562, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.06569946, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8637209623848903, + "language_loss": 0.83340889, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85628319, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.59375, + "step": 850, + "time_per_iteration": 2.493814468383789 + }, + { + "auxiliary_loss_clip": 0.01229016, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.03847528, + "balance_loss_mlp": 1.06816506, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.1400408414194154, + "language_loss": 0.6501807, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.609375, + "step": 851, + "time_per_iteration": 5.443026065826416 + }, + { + "auxiliary_loss_clip": 0.01228781, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.04424942, + "balance_loss_mlp": 1.0674876, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2562645326336686, + "language_loss": 0.8376134, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86061573, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 852, + "time_per_iteration": 2.4753623008728027 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.06879044, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9405819970113303, + "language_loss": 0.80252314, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82552135, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 853, + "time_per_iteration": 2.5048112869262695 + }, + { + "auxiliary_loss_clip": 0.01226539, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.03432584, + "balance_loss_mlp": 1.06710184, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8237562231360178, + "language_loss": 0.75846469, + "learning_rate": 3.995223022193999e-06, + "loss": 0.7813375, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 854, + "time_per_iteration": 2.53165602684021 + }, + { + "auxiliary_loss_clip": 0.01233418, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_clip": 1.03678393, + "balance_loss_mlp": 1.07139039, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.718422527893707, + "language_loss": 0.81173462, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83470446, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 855, + "time_per_iteration": 2.5610744953155518 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01020682, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.03902698, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0071030268205712, + "language_loss": 0.65609074, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67743033, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.7421875, + "step": 856, + "time_per_iteration": 3.0546581745147705 + }, + { + "auxiliary_loss_clip": 0.01224884, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.02823043, + "balance_loss_mlp": 1.06811357, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8111088050205955, + "language_loss": 0.76996124, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79274821, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5625, + "step": 857, + "time_per_iteration": 2.6051554679870605 + }, + { + "auxiliary_loss_clip": 0.01229705, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03130889, + "balance_loss_mlp": 1.06846082, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 3.7937823779894377, + "language_loss": 0.88893878, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91181171, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6171875, + "step": 858, + "time_per_iteration": 2.4517769813537598 + }, + { + "auxiliary_loss_clip": 0.01228685, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.03000832, + "balance_loss_mlp": 1.06902003, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.9531750101692102, + "language_loss": 0.75199753, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77484941, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 859, + "time_per_iteration": 2.5090014934539795 + }, + { + "auxiliary_loss_clip": 0.01237239, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.04280758, + "balance_loss_mlp": 1.06980002, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.092452223155828, + "language_loss": 0.90812773, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93120927, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.671875, + "step": 860, + "time_per_iteration": 2.437220335006714 + }, + { + "auxiliary_loss_clip": 0.01231057, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.03568769, + "balance_loss_mlp": 1.0717634, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.9189860758016508, + "language_loss": 0.82252973, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8454473, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.59375, + "step": 861, + "time_per_iteration": 2.50883412361145 + }, + { + "auxiliary_loss_clip": 0.01233216, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.03177071, + "balance_loss_mlp": 1.0704143, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.0352629197197762, + "language_loss": 0.78607392, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.625, + "step": 862, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01229413, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.07291067, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9841587361763113, + "language_loss": 0.88999134, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91296881, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5625, + "step": 863, + "time_per_iteration": 2.506289005279541 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.07635331, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.003999649515418, + "language_loss": 0.7575798, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.625, + "step": 864, + "time_per_iteration": 2.515944480895996 + }, + { + "auxiliary_loss_clip": 0.01236545, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.04228067, + "balance_loss_mlp": 1.07355332, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9298630836237705, + "language_loss": 0.7919569, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81501746, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6328125, + "step": 865, + "time_per_iteration": 2.485499620437622 + }, + { + "auxiliary_loss_clip": 0.0123268, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.04144871, + "balance_loss_mlp": 1.07079291, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.366131428952597, + "language_loss": 0.85700798, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88000321, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 866, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01242589, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_clip": 1.03910398, + "balance_loss_mlp": 1.0804987, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.535209572965093, + "language_loss": 0.8680315, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89111662, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 867, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01231644, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.05021977, + "balance_loss_mlp": 1.07513499, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.64188364663517, + "language_loss": 0.63562089, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65867579, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.5625, + "step": 868, + "time_per_iteration": 2.567958354949951 + }, + { + "auxiliary_loss_clip": 0.01236968, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.07263327, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1448269109564198, + "language_loss": 0.83076257, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85379148, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.6484375, + "step": 869, + "time_per_iteration": 2.5021841526031494 + }, + { + "auxiliary_loss_clip": 0.01237154, + "auxiliary_loss_mlp": 0.01057742, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.07245827, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.352948725027126, + "language_loss": 0.87544227, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89839119, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6484375, + "step": 870, + "time_per_iteration": 2.459662437438965 + }, + { + "auxiliary_loss_clip": 0.01238457, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_clip": 1.04135191, + "balance_loss_mlp": 1.07536197, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9212028950510787, + "language_loss": 0.80554998, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82860637, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6328125, + "step": 871, + "time_per_iteration": 2.4701170921325684 + }, + { + "auxiliary_loss_clip": 0.01234905, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.07576704, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.5975290841395262, + "language_loss": 0.81374049, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8367365, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.59375, + "step": 872, + "time_per_iteration": 2.4886369705200195 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.0101489, + "balance_loss_clip": 1.00952566, + "balance_loss_mlp": 1.03955865, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8879269166117758, + "language_loss": 0.61579192, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63705552, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.71875, + "step": 873, + "time_per_iteration": 2.9913430213928223 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01058247, + "balance_loss_clip": 1.03245032, + "balance_loss_mlp": 1.07107997, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8426182555123698, + "language_loss": 0.88426232, + "learning_rate": 3.994669855111643e-06, + "loss": 0.90716141, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 874, + "time_per_iteration": 2.4794461727142334 + }, + { + "auxiliary_loss_clip": 0.0123222, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.06908488, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 2.2494767595307628, + "language_loss": 0.74779439, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77073956, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 875, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.01228414, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.06905699, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.052141253618648, + "language_loss": 0.92836702, + "learning_rate": 3.99461287422531e-06, + "loss": 0.951262, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.59375, + "step": 876, + "time_per_iteration": 2.535587787628174 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01009036, + "balance_loss_clip": 1.00379074, + "balance_loss_mlp": 1.03698087, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.854570032578524, + "language_loss": 0.62934959, + "learning_rate": 3.994584270327722e-06, + "loss": 0.6505053, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.6953125, + "step": 877, + "time_per_iteration": 3.094581127166748 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.06975055, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.154366240232031, + "language_loss": 0.85691291, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 878, + "time_per_iteration": 2.5052285194396973 + }, + { + "auxiliary_loss_clip": 0.01232133, + "auxiliary_loss_mlp": 0.01063559, + "balance_loss_clip": 1.03754723, + "balance_loss_mlp": 1.06974411, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.0833089409086942, + "language_loss": 0.82790506, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85086197, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.625, + "step": 879, + "time_per_iteration": 2.564312219619751 + }, + { + "auxiliary_loss_clip": 0.01227867, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04011488, + "balance_loss_mlp": 1.06966615, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 4.271066320440391, + "language_loss": 0.84404933, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86699677, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 880, + "time_per_iteration": 2.4854133129119873 + }, + { + "auxiliary_loss_clip": 0.01233797, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.04360688, + "balance_loss_mlp": 1.07206059, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 3.515636761469604, + "language_loss": 0.87156737, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89460361, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 881, + "time_per_iteration": 2.476846933364868 + }, + { + "auxiliary_loss_clip": 0.01228751, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.06813371, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9345214626214409, + "language_loss": 0.87682849, + "learning_rate": 3.994440116339046e-06, + "loss": 0.89977539, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.609375, + "step": 882, + "time_per_iteration": 2.6449031829833984 + }, + { + "auxiliary_loss_clip": 0.01233714, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_clip": 1.03825057, + "balance_loss_mlp": 1.07030129, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.7245054008776814, + "language_loss": 0.68869275, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71168661, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6328125, + "step": 883, + "time_per_iteration": 2.620363235473633 + }, + { + "auxiliary_loss_clip": 0.01225388, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.06937146, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.9628498458506696, + "language_loss": 0.75887203, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78173113, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5546875, + "step": 884, + "time_per_iteration": 2.4948067665100098 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.06921601, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.00306560312032, + "language_loss": 0.85323638, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 885, + "time_per_iteration": 2.5159530639648438 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01068973, + "balance_loss_clip": 1.04205549, + "balance_loss_mlp": 1.06673646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6316893825734344, + "language_loss": 0.85726082, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88023585, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6171875, + "step": 886, + "time_per_iteration": 2.4650700092315674 + }, + { + "auxiliary_loss_clip": 0.01226585, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.06944001, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1517488326805214, + "language_loss": 0.89229804, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91522843, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5703125, + "step": 887, + "time_per_iteration": 2.5020337104797363 + }, + { + "auxiliary_loss_clip": 0.01227687, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.04270935, + "balance_loss_mlp": 1.06604195, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.2836036404275593, + "language_loss": 0.75076836, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77375484, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6171875, + "step": 888, + "time_per_iteration": 2.5055694580078125 + }, + { + "auxiliary_loss_clip": 0.0123028, + "auxiliary_loss_mlp": 0.0107639, + "balance_loss_clip": 1.0480895, + "balance_loss_mlp": 1.07113457, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.032914331295681, + "language_loss": 0.88330352, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90637028, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.59375, + "step": 889, + "time_per_iteration": 2.5147650241851807 + }, + { + "auxiliary_loss_clip": 0.01222875, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.02748489, + "balance_loss_mlp": 1.06732821, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9726085703824752, + "language_loss": 0.88269985, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90546036, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5546875, + "step": 890, + "time_per_iteration": 2.490300416946411 + }, + { + "auxiliary_loss_clip": 0.01225662, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.04612982, + "balance_loss_mlp": 1.06690812, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.320271972022273, + "language_loss": 0.93251556, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95548671, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 891, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.06682086, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.238230674372026, + "language_loss": 0.71759057, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74046671, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5859375, + "step": 892, + "time_per_iteration": 2.5544779300689697 + }, + { + "auxiliary_loss_clip": 0.01229119, + "auxiliary_loss_mlp": 0.01067529, + "balance_loss_clip": 1.0421617, + "balance_loss_mlp": 1.06946719, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.3204520758070037, + "language_loss": 0.82304287, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84600937, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6015625, + "step": 893, + "time_per_iteration": 5.3903117179870605 + }, + { + "auxiliary_loss_clip": 0.0122945, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.04430699, + "balance_loss_mlp": 1.0679965, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.3808217776212937, + "language_loss": 0.81695569, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83995366, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.609375, + "step": 894, + "time_per_iteration": 2.52809476852417 + }, + { + "auxiliary_loss_clip": 0.01227471, + "auxiliary_loss_mlp": 0.01065449, + "balance_loss_clip": 1.03915119, + "balance_loss_mlp": 1.06881404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.5337894710206093, + "language_loss": 0.76043701, + "learning_rate": 3.994056467679221e-06, + "loss": 0.7833662, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 895, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.01238307, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03022122, + "balance_loss_mlp": 1.07260597, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.2065839001211156, + "language_loss": 0.86456096, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88751751, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.65625, + "step": 896, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01231325, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.03495908, + "balance_loss_mlp": 1.06809413, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.1680285530564274, + "language_loss": 0.87949234, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90243232, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6328125, + "step": 897, + "time_per_iteration": 2.457918167114258 + }, + { + "auxiliary_loss_clip": 0.0122574, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.06723523, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7359050724031848, + "language_loss": 0.9035244, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9264195, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.5859375, + "step": 898, + "time_per_iteration": 2.4593143463134766 + }, + { + "auxiliary_loss_clip": 0.01234899, + "auxiliary_loss_mlp": 0.01084595, + "balance_loss_clip": 1.05808282, + "balance_loss_mlp": 1.07024622, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.958355519485596, + "language_loss": 0.91756964, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94076455, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6484375, + "step": 899, + "time_per_iteration": 2.4461729526519775 + }, + { + "auxiliary_loss_clip": 0.01225208, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.03964233, + "balance_loss_mlp": 1.06601286, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.6493739136310643, + "language_loss": 0.75594276, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77884829, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 900, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.01231903, + "auxiliary_loss_mlp": 0.01059763, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.06860638, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.2496787705299908, + "language_loss": 0.7377668, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76068342, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.6328125, + "step": 901, + "time_per_iteration": 2.49638032913208 + }, + { + "auxiliary_loss_clip": 0.01221671, + "auxiliary_loss_mlp": 0.01074944, + "balance_loss_clip": 1.04982698, + "balance_loss_mlp": 1.06662059, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.0553503619333586, + "language_loss": 0.85004938, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87301552, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 902, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01226177, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.06769705, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.0002475654879195, + "language_loss": 0.8655951, + "learning_rate": 3.993814024394569e-06, + "loss": 0.8884868, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 903, + "time_per_iteration": 2.522193670272827 + }, + { + "auxiliary_loss_clip": 0.01227512, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.06904316, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.4298091072226855, + "language_loss": 0.74835998, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77125704, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.578125, + "step": 904, + "time_per_iteration": 2.456969976425171 + }, + { + "auxiliary_loss_clip": 0.0123038, + "auxiliary_loss_mlp": 0.01073252, + "balance_loss_clip": 1.04685879, + "balance_loss_mlp": 1.06905615, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.0843949675352356, + "language_loss": 0.85750329, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8805396, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.609375, + "step": 905, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01227222, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.05317712, + "balance_loss_mlp": 1.07247257, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7937911991915148, + "language_loss": 0.74028552, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76334012, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 906, + "time_per_iteration": 2.468331813812256 + }, + { + "auxiliary_loss_clip": 0.01228766, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.03909731, + "balance_loss_mlp": 1.06858826, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.220044948377472, + "language_loss": 0.87410975, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89705634, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6015625, + "step": 907, + "time_per_iteration": 2.5177390575408936 + }, + { + "auxiliary_loss_clip": 0.01227557, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.04521942, + "balance_loss_mlp": 1.07002556, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.8689281211501179, + "language_loss": 0.86915505, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89214909, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.578125, + "step": 908, + "time_per_iteration": 2.45135498046875 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.06842148, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.409525813232516, + "language_loss": 0.89454836, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91748714, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 909, + "time_per_iteration": 2.4702625274658203 + }, + { + "auxiliary_loss_clip": 0.01231345, + "auxiliary_loss_mlp": 0.01075786, + "balance_loss_clip": 1.04773629, + "balance_loss_mlp": 1.06930447, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.4022545211155593, + "language_loss": 0.70942473, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73249602, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.625, + "step": 910, + "time_per_iteration": 2.4530797004699707 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01055797, + "balance_loss_clip": 1.03002357, + "balance_loss_mlp": 1.06815219, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.0100188286094745, + "language_loss": 0.8349818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85778737, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5625, + "step": 911, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.01224017, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.06649613, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.746196883211308, + "language_loss": 0.76096344, + "learning_rate": 3.993535491899736e-06, + "loss": 0.7839244, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 912, + "time_per_iteration": 2.4651522636413574 + }, + { + "auxiliary_loss_clip": 0.01219912, + "auxiliary_loss_mlp": 0.01052416, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.06664968, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.385296939765248, + "language_loss": 0.82667339, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84939671, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 913, + "time_per_iteration": 2.475384473800659 + }, + { + "auxiliary_loss_clip": 0.01224168, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.07065797, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.227172084037845, + "language_loss": 0.83470452, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85756505, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 914, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.03324902, + "balance_loss_mlp": 1.07264161, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.897688985464872, + "language_loss": 0.9010309, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92390066, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5625, + "step": 915, + "time_per_iteration": 2.492981433868408 + }, + { + "auxiliary_loss_clip": 0.01225584, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.03046489, + "balance_loss_mlp": 1.0708915, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.870109983937874, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91836905, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 916, + "time_per_iteration": 2.4621188640594482 + }, + { + "auxiliary_loss_clip": 0.01228011, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.04593801, + "balance_loss_mlp": 1.06942379, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7933741103180343, + "language_loss": 0.80085957, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386243, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 917, + "time_per_iteration": 2.49455189704895 + }, + { + "auxiliary_loss_clip": 0.01225592, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.06678224, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9216560267302982, + "language_loss": 0.79673612, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81957722, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 918, + "time_per_iteration": 2.504734516143799 + }, + { + "auxiliary_loss_clip": 0.01223712, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.06658053, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9394116717498289, + "language_loss": 0.89132315, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91415823, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5703125, + "step": 919, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01219042, + "auxiliary_loss_mlp": 0.01068553, + "balance_loss_clip": 1.0427916, + "balance_loss_mlp": 1.06515777, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.688355226699252, + "language_loss": 0.87421197, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 920, + "time_per_iteration": 2.536914348602295 + }, + { + "auxiliary_loss_clip": 0.01223828, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.06937671, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1255302161497704, + "language_loss": 0.65921712, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68208569, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.546875, + "step": 921, + "time_per_iteration": 2.643416166305542 + }, + { + "auxiliary_loss_clip": 0.01229793, + "auxiliary_loss_mlp": 0.0106877, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.0698204, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.143682946402907, + "language_loss": 0.71841472, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74140036, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.6015625, + "step": 922, + "time_per_iteration": 2.4544074535369873 + }, + { + "auxiliary_loss_clip": 0.0122536, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.0669136, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.766492717488127, + "language_loss": 0.82548857, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84844404, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 923, + "time_per_iteration": 2.490915536880493 + }, + { + "auxiliary_loss_clip": 0.01221243, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.06429458, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2095756655687397, + "language_loss": 0.78808558, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81097853, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5625, + "step": 924, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.0121918, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.06480467, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9513803878946447, + "language_loss": 1.02250028, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04528582, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 925, + "time_per_iteration": 2.5296268463134766 + }, + { + "auxiliary_loss_clip": 0.01220429, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02975261, + "balance_loss_mlp": 1.0634799, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.3756260245044687, + "language_loss": 0.80808276, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83084333, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 926, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01229405, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.06743848, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.4713559623940924, + "language_loss": 0.73378903, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75676566, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 927, + "time_per_iteration": 2.5607478618621826 + }, + { + "auxiliary_loss_clip": 0.01103967, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.00837731, + "balance_loss_mlp": 1.03639269, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7814837823676635, + "language_loss": 0.5989722, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62015712, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.67578125, + "step": 928, + "time_per_iteration": 3.0945305824279785 + }, + { + "auxiliary_loss_clip": 0.01223562, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.04035151, + "balance_loss_mlp": 1.06729245, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.3037954576101587, + "language_loss": 0.95011377, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97301698, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5625, + "step": 929, + "time_per_iteration": 2.527270555496216 + }, + { + "auxiliary_loss_clip": 0.01221186, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.06494856, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1540114832188553, + "language_loss": 0.71827871, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74116725, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.5625, + "step": 930, + "time_per_iteration": 2.57513689994812 + }, + { + "auxiliary_loss_clip": 0.01227654, + "auxiliary_loss_mlp": 0.01062398, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.06905401, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.336481182624628, + "language_loss": 0.85333288, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87623346, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5859375, + "step": 931, + "time_per_iteration": 2.459167957305908 + }, + { + "auxiliary_loss_clip": 0.01224553, + "auxiliary_loss_mlp": 0.01072004, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.06556344, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.9723738142749898, + "language_loss": 0.83577204, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85873753, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.59375, + "step": 932, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.01223225, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.04012322, + "balance_loss_mlp": 1.06712675, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.411257667891357, + "language_loss": 0.73405433, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5625, + "step": 933, + "time_per_iteration": 2.526521682739258 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.04328358, + "balance_loss_mlp": 1.06432819, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.577929883809357, + "language_loss": 0.86850882, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89141059, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5546875, + "step": 934, + "time_per_iteration": 5.338034391403198 + }, + { + "auxiliary_loss_clip": 0.01220003, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.06842983, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2060919587088965, + "language_loss": 0.80243224, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82532918, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 935, + "time_per_iteration": 3.8198087215423584 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01008984, + "balance_loss_clip": 1.00321388, + "balance_loss_mlp": 1.02876139, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8225714537835027, + "language_loss": 0.69179416, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71282923, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.65625, + "step": 936, + "time_per_iteration": 2.9585764408111572 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.01067113, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.06387568, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.5168182860703237, + "language_loss": 0.75900578, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78188324, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 937, + "time_per_iteration": 2.4891855716705322 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01002728, + "balance_loss_clip": 0.99738711, + "balance_loss_mlp": 1.02642298, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8631606334327763, + "language_loss": 0.64287508, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66381979, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.65625, + "step": 938, + "time_per_iteration": 3.0239782333374023 + }, + { + "auxiliary_loss_clip": 0.01226335, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.04487348, + "balance_loss_mlp": 1.06571174, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.570077538128457, + "language_loss": 0.7903074, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81329048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 939, + "time_per_iteration": 2.494706630706787 + }, + { + "auxiliary_loss_clip": 0.012214, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03868759, + "balance_loss_mlp": 1.0669229, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.950609958048397, + "language_loss": 0.73893893, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76179242, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 940, + "time_per_iteration": 2.5279061794281006 + }, + { + "auxiliary_loss_clip": 0.01220257, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_clip": 1.03776574, + "balance_loss_mlp": 1.06722569, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.9142676693922898, + "language_loss": 0.70475829, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72760499, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 941, + "time_per_iteration": 2.551604747772217 + }, + { + "auxiliary_loss_clip": 0.01218348, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.06624675, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.3045436850665917, + "language_loss": 0.80928791, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.515625, + "step": 942, + "time_per_iteration": 2.515646457672119 + }, + { + "auxiliary_loss_clip": 0.01214197, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.062042, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.7900678467193205, + "language_loss": 0.88067353, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.9033941, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 943, + "time_per_iteration": 2.674614191055298 + }, + { + "auxiliary_loss_clip": 0.01220399, + "auxiliary_loss_mlp": 0.01056577, + "balance_loss_clip": 1.03182912, + "balance_loss_mlp": 1.06757212, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.6837069047913924, + "language_loss": 0.75092185, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77369165, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5234375, + "step": 944, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01215674, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.0385294, + "balance_loss_mlp": 1.06267428, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7462690351912153, + "language_loss": 0.79321784, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8160013, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 945, + "time_per_iteration": 2.695613384246826 + }, + { + "auxiliary_loss_clip": 0.01218347, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.03628159, + "balance_loss_mlp": 1.06407309, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.1794845223078556, + "language_loss": 0.82465631, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84745914, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 946, + "time_per_iteration": 2.6081790924072266 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.06615055, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.7693395657309297, + "language_loss": 0.7904911, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8133198, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5703125, + "step": 947, + "time_per_iteration": 2.460472822189331 + }, + { + "auxiliary_loss_clip": 0.01227462, + "auxiliary_loss_mlp": 0.01065027, + "balance_loss_clip": 1.03890848, + "balance_loss_mlp": 1.06883287, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 7.046260534289203, + "language_loss": 0.85772789, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88065279, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 948, + "time_per_iteration": 2.4560892581939697 + }, + { + "auxiliary_loss_clip": 0.01217019, + "auxiliary_loss_mlp": 0.01060985, + "balance_loss_clip": 1.03374553, + "balance_loss_mlp": 1.06329989, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.8055084405958775, + "language_loss": 0.87044799, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89322805, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5390625, + "step": 949, + "time_per_iteration": 2.4843316078186035 + }, + { + "auxiliary_loss_clip": 0.01212611, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.06284809, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.230679935648155, + "language_loss": 0.79035759, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81314665, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4921875, + "step": 950, + "time_per_iteration": 2.468172311782837 + }, + { + "auxiliary_loss_clip": 0.01221984, + "auxiliary_loss_mlp": 0.01068601, + "balance_loss_clip": 1.04365039, + "balance_loss_mlp": 1.06574106, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0871877141587682, + "language_loss": 0.8244521, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 951, + "time_per_iteration": 2.5418505668640137 + }, + { + "auxiliary_loss_clip": 0.01215404, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.06129527, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.5400916768099426, + "language_loss": 0.86685216, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88963258, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5390625, + "step": 952, + "time_per_iteration": 2.513356924057007 + }, + { + "auxiliary_loss_clip": 0.0122001, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.03415656, + "balance_loss_mlp": 1.06145215, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.725154467975805, + "language_loss": 0.79043579, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81326544, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5859375, + "step": 953, + "time_per_iteration": 2.490940570831299 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.03592086, + "balance_loss_mlp": 1.06757712, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2937199779067106, + "language_loss": 0.87086606, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89373398, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5625, + "step": 954, + "time_per_iteration": 2.495039701461792 + }, + { + "auxiliary_loss_clip": 0.01221375, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_clip": 1.03707159, + "balance_loss_mlp": 1.06446028, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3514674671771933, + "language_loss": 0.87789929, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90073651, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 955, + "time_per_iteration": 2.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.06217909, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.7193659196918576, + "language_loss": 0.89682388, + "learning_rate": 3.992085650224914e-06, + "loss": 0.919631, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 956, + "time_per_iteration": 2.43306565284729 + }, + { + "auxiliary_loss_clip": 0.01212174, + "auxiliary_loss_mlp": 0.0105844, + "balance_loss_clip": 1.03232098, + "balance_loss_mlp": 1.06344521, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7667772588634594, + "language_loss": 0.75335747, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77606356, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.484375, + "step": 957, + "time_per_iteration": 2.469240665435791 + }, + { + "auxiliary_loss_clip": 0.01218166, + "auxiliary_loss_mlp": 0.01075955, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.06214452, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8974624224625587, + "language_loss": 0.79871029, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82165146, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5625, + "step": 958, + "time_per_iteration": 2.5016849040985107 + }, + { + "auxiliary_loss_clip": 0.01214009, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.06024444, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5702669091422234, + "language_loss": 0.88410264, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90686285, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.5390625, + "step": 959, + "time_per_iteration": 2.4830191135406494 + }, + { + "auxiliary_loss_clip": 0.01211651, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.0626018, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.6997220185951347, + "language_loss": 0.78556621, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083173, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4921875, + "step": 960, + "time_per_iteration": 2.569218397140503 + }, + { + "auxiliary_loss_clip": 0.01217172, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.06168103, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 4.159271492638429, + "language_loss": 0.932491, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95529813, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5546875, + "step": 961, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01215042, + "auxiliary_loss_mlp": 0.01070899, + "balance_loss_clip": 1.04411268, + "balance_loss_mlp": 1.06039667, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.532017623976099, + "language_loss": 0.6822986, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70515805, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.546875, + "step": 962, + "time_per_iteration": 2.544498920440674 + }, + { + "auxiliary_loss_clip": 0.01214012, + "auxiliary_loss_mlp": 0.01068596, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.06268489, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.445305128304827, + "language_loss": 0.88187808, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90470415, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.515625, + "step": 963, + "time_per_iteration": 2.459487199783325 + }, + { + "auxiliary_loss_clip": 0.01222623, + "auxiliary_loss_mlp": 0.01058866, + "balance_loss_clip": 1.03337944, + "balance_loss_mlp": 1.06633568, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.5656796350524473, + "language_loss": 0.84858835, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87140322, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 964, + "time_per_iteration": 2.5268235206604004 + }, + { + "auxiliary_loss_clip": 0.01216658, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_clip": 1.04157782, + "balance_loss_mlp": 1.06309247, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 2.846103019544017, + "language_loss": 0.77748007, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80032492, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5390625, + "step": 965, + "time_per_iteration": 2.4572315216064453 + }, + { + "auxiliary_loss_clip": 0.01211478, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.04424393, + "balance_loss_mlp": 1.0614084, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4479010977704463, + "language_loss": 0.80922461, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83202475, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5, + "step": 966, + "time_per_iteration": 2.4682776927948 + }, + { + "auxiliary_loss_clip": 0.01212307, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.06173599, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8643875206872442, + "language_loss": 0.76291096, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78565276, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.5, + "step": 967, + "time_per_iteration": 2.453474521636963 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.02152586, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7926144837125159, + "language_loss": 0.57362092, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59487474, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.05102539, + "router_z_loss_mlp": 0.6328125, + "step": 968, + "time_per_iteration": 2.994419574737549 + }, + { + "auxiliary_loss_clip": 0.01218807, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02865148, + "balance_loss_mlp": 1.06574845, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.057389892616485, + "language_loss": 0.82289147, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84563303, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 969, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01217673, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.03105259, + "balance_loss_mlp": 1.06392384, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1897875503845725, + "language_loss": 0.780442, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 970, + "time_per_iteration": 2.5632171630859375 + }, + { + "auxiliary_loss_clip": 0.01216631, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_clip": 1.02809155, + "balance_loss_mlp": 1.06188202, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6802242915962, + "language_loss": 0.92492616, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94764245, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 971, + "time_per_iteration": 2.4642531871795654 + }, + { + "auxiliary_loss_clip": 0.01210603, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.03439212, + "balance_loss_mlp": 1.05865097, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 3.0470884327064276, + "language_loss": 0.86133701, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88404, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 972, + "time_per_iteration": 2.5298526287078857 + }, + { + "auxiliary_loss_clip": 0.01212752, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_clip": 1.04038596, + "balance_loss_mlp": 1.0636549, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0754734138997906, + "language_loss": 0.87340444, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89617872, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4921875, + "step": 973, + "time_per_iteration": 2.5198311805725098 + }, + { + "auxiliary_loss_clip": 0.01213937, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_clip": 1.04070425, + "balance_loss_mlp": 1.06140256, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.2539468590332707, + "language_loss": 0.74868345, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77149546, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5234375, + "step": 974, + "time_per_iteration": 2.465268850326538 + }, + { + "auxiliary_loss_clip": 0.0121359, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.03308713, + "balance_loss_mlp": 1.06260133, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.7891188847385684, + "language_loss": 0.76707923, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78980577, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 975, + "time_per_iteration": 2.633850336074829 + }, + { + "auxiliary_loss_clip": 0.01216778, + "auxiliary_loss_mlp": 0.01068456, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.0621978, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0981769673049326, + "language_loss": 0.76878488, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79163718, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 976, + "time_per_iteration": 6.8309245109558105 + }, + { + "auxiliary_loss_clip": 0.01210296, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.0585494, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8109666318996334, + "language_loss": 0.87465948, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89737761, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 977, + "time_per_iteration": 2.5693395137786865 + }, + { + "auxiliary_loss_clip": 0.01213396, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.06246471, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.7886661734827753, + "language_loss": 0.79517525, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5078125, + "step": 978, + "time_per_iteration": 2.51609206199646 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.04339027, + "balance_loss_mlp": 1.06304932, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.6270410794651102, + "language_loss": 0.80902123, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.546875, + "step": 979, + "time_per_iteration": 2.527127504348755 + }, + { + "auxiliary_loss_clip": 0.01085971, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.0044651, + "balance_loss_mlp": 1.02304745, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.94528472512207, + "language_loss": 0.59059429, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61154944, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.62890625, + "step": 980, + "time_per_iteration": 2.9545915126800537 + }, + { + "auxiliary_loss_clip": 0.01210703, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.03747201, + "balance_loss_mlp": 1.0622623, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3915266710240917, + "language_loss": 0.86397457, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88672185, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.484375, + "step": 981, + "time_per_iteration": 2.4726293087005615 + }, + { + "auxiliary_loss_clip": 0.01212695, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.06214404, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 1.9485203495729437, + "language_loss": 0.79623365, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81893563, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.5078125, + "step": 982, + "time_per_iteration": 2.5271458625793457 + }, + { + "auxiliary_loss_clip": 0.01219179, + "auxiliary_loss_mlp": 0.01060762, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.06248748, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.5320957946125437, + "language_loss": 0.84376037, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86655974, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 983, + "time_per_iteration": 2.526364803314209 + }, + { + "auxiliary_loss_clip": 0.01212847, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_clip": 1.04361129, + "balance_loss_mlp": 1.06317604, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8446015864025267, + "language_loss": 0.84607553, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86887848, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.4921875, + "step": 984, + "time_per_iteration": 2.456803321838379 + }, + { + "auxiliary_loss_clip": 0.01211466, + "auxiliary_loss_mlp": 0.01059154, + "balance_loss_clip": 1.03551483, + "balance_loss_mlp": 1.06338882, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.3276500524021495, + "language_loss": 0.77875566, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80146182, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.484375, + "step": 985, + "time_per_iteration": 2.504096508026123 + }, + { + "auxiliary_loss_clip": 0.01215785, + "auxiliary_loss_mlp": 0.01061307, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.06191659, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.294716701848832, + "language_loss": 0.90598249, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92875338, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.5390625, + "step": 986, + "time_per_iteration": 2.4882049560546875 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.01062373, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.06017947, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.8181645576894256, + "language_loss": 0.7589798, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78175771, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 987, + "time_per_iteration": 2.492412805557251 + }, + { + "auxiliary_loss_clip": 0.01216653, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.03491902, + "balance_loss_mlp": 1.06059265, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.1447391932017843, + "language_loss": 0.71525705, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73802304, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 988, + "time_per_iteration": 2.6386756896972656 + }, + { + "auxiliary_loss_clip": 0.01081383, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_clip": 1.00680876, + "balance_loss_mlp": 1.01888978, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9344259157338769, + "language_loss": 0.71159971, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73253405, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.625, + "step": 989, + "time_per_iteration": 2.903996706008911 + }, + { + "auxiliary_loss_clip": 0.01219656, + "auxiliary_loss_mlp": 0.01067443, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.06221163, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.89069901477269, + "language_loss": 0.78102934, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80390036, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.578125, + "step": 990, + "time_per_iteration": 2.6252431869506836 + }, + { + "auxiliary_loss_clip": 0.01208224, + "auxiliary_loss_mlp": 0.0105602, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05700588, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.077710223302236, + "language_loss": 0.86406755, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88671005, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.515625, + "step": 991, + "time_per_iteration": 2.483853340148926 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04030573, + "balance_loss_mlp": 1.06190968, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.866628977756486, + "language_loss": 0.76876801, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79158413, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 992, + "time_per_iteration": 2.5149648189544678 + }, + { + "auxiliary_loss_clip": 0.01214781, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03701937, + "balance_loss_mlp": 1.06251192, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.726921793738851, + "language_loss": 0.74594641, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76869899, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.5234375, + "step": 993, + "time_per_iteration": 2.4739816188812256 + }, + { + "auxiliary_loss_clip": 0.01214249, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.06326771, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 3.2517233877247396, + "language_loss": 0.78911841, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81197453, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 994, + "time_per_iteration": 2.5408835411071777 + }, + { + "auxiliary_loss_clip": 0.01214677, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.05768251, + "balance_loss_mlp": 1.06170893, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.42517884603863, + "language_loss": 0.79639304, + "learning_rate": 3.99067829878596e-06, + "loss": 0.81936711, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 995, + "time_per_iteration": 2.5062758922576904 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.05969059, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.536496545288829, + "language_loss": 0.86939722, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89217806, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 996, + "time_per_iteration": 2.5236001014709473 + }, + { + "auxiliary_loss_clip": 0.01217352, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.06309104, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.013698471354103, + "language_loss": 0.88192105, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90479505, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.546875, + "step": 997, + "time_per_iteration": 2.483116626739502 + }, + { + "auxiliary_loss_clip": 0.01079761, + "auxiliary_loss_mlp": 0.01004407, + "balance_loss_clip": 0.9993524, + "balance_loss_mlp": 1.01837301, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.020759515587473, + "language_loss": 0.75442117, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77526283, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.6171875, + "step": 998, + "time_per_iteration": 3.152331590652466 + }, + { + "auxiliary_loss_clip": 0.01213812, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.0626508, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8375420281697645, + "language_loss": 0.75796127, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7807765, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 999, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01212853, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.03575778, + "balance_loss_mlp": 1.05894446, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.9091686508511199, + "language_loss": 0.82658899, + "learning_rate": 3.990489563834943e-06, + "loss": 0.8493402, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5390625, + "step": 1000, + "time_per_iteration": 2.5369935035705566 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.06143069, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.4065508827059783, + "language_loss": 0.85644853, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8791759, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5390625, + "step": 1001, + "time_per_iteration": 2.4972190856933594 + }, + { + "auxiliary_loss_clip": 0.0120879, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.0587517, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.156321640703371, + "language_loss": 0.74386394, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5, + "step": 1002, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01211576, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.04019034, + "balance_loss_mlp": 1.06015134, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.1165374575777145, + "language_loss": 0.75346643, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77624118, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1003, + "time_per_iteration": 2.508817434310913 + }, + { + "auxiliary_loss_clip": 0.01219434, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.06255794, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2578292515807603, + "language_loss": 0.70071733, + "learning_rate": 3.990337217233437e-06, + "loss": 0.723571, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 1004, + "time_per_iteration": 2.480116844177246 + }, + { + "auxiliary_loss_clip": 0.01218526, + "auxiliary_loss_mlp": 0.01073584, + "balance_loss_clip": 1.04810917, + "balance_loss_mlp": 1.06360686, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.248554137518493, + "language_loss": 0.83246684, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85538793, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 1005, + "time_per_iteration": 2.449733018875122 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00306416, + "balance_loss_mlp": 1.0157814, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8959746990508154, + "language_loss": 0.59000289, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61085355, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.6171875, + "step": 1006, + "time_per_iteration": 3.1583423614501953 + }, + { + "auxiliary_loss_clip": 0.01209886, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.03203392, + "balance_loss_mlp": 1.05658197, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.271524805944984, + "language_loss": 0.7428897, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.53125, + "step": 1007, + "time_per_iteration": 2.49139666557312 + }, + { + "auxiliary_loss_clip": 0.01212867, + "auxiliary_loss_mlp": 0.01055047, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.05897522, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8583948299039934, + "language_loss": 0.80739897, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83007812, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 1008, + "time_per_iteration": 2.4990036487579346 + }, + { + "auxiliary_loss_clip": 0.01213893, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.06254637, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.935763632111394, + "language_loss": 0.77840835, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80110532, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.515625, + "step": 1009, + "time_per_iteration": 2.4785048961639404 + }, + { + "auxiliary_loss_clip": 0.01210213, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.06082368, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1058592784097567, + "language_loss": 0.93059653, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95329368, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4921875, + "step": 1010, + "time_per_iteration": 2.507596015930176 + }, + { + "auxiliary_loss_clip": 0.01219036, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.05885124, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1716667034247843, + "language_loss": 0.71846473, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74131954, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6015625, + "step": 1011, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01214432, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.05902421, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.5871469840663535, + "language_loss": 0.87542284, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89827204, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5546875, + "step": 1012, + "time_per_iteration": 2.4876151084899902 + }, + { + "auxiliary_loss_clip": 0.01206171, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05505085, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8956263482043672, + "language_loss": 0.76679665, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78946191, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 1013, + "time_per_iteration": 2.4874446392059326 + }, + { + "auxiliary_loss_clip": 0.01215089, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.05924904, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.654718290448769, + "language_loss": 0.85651302, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87933445, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5546875, + "step": 1014, + "time_per_iteration": 2.483774423599243 + }, + { + "auxiliary_loss_clip": 0.0122011, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_clip": 1.03996944, + "balance_loss_mlp": 1.06207335, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4287988001966028, + "language_loss": 0.72807163, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75094855, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.578125, + "step": 1015, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01207162, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.0576005, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6555956389633335, + "language_loss": 0.79197502, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8147307, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4921875, + "step": 1016, + "time_per_iteration": 2.5177054405212402 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.0571332, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.934405213560846, + "language_loss": 0.76170123, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78440881, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.53125, + "step": 1017, + "time_per_iteration": 2.517730951309204 + }, + { + "auxiliary_loss_clip": 0.01220983, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.06240773, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.873264658326973, + "language_loss": 0.86145842, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436329, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 1018, + "time_per_iteration": 5.324457883834839 + }, + { + "auxiliary_loss_clip": 0.01206709, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.045012, + "balance_loss_mlp": 1.05659163, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.696758126666256, + "language_loss": 0.77535981, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79814154, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5, + "step": 1019, + "time_per_iteration": 2.453768253326416 + }, + { + "auxiliary_loss_clip": 0.01210848, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.05749679, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.8458417378275351, + "language_loss": 0.84254557, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86526895, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 1020, + "time_per_iteration": 2.5126123428344727 + }, + { + "auxiliary_loss_clip": 0.01217116, + "auxiliary_loss_mlp": 0.01060663, + "balance_loss_clip": 1.0352596, + "balance_loss_mlp": 1.06234074, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.186416819505148, + "language_loss": 0.79234397, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81512177, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1021, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.01207219, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.05748677, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.2026341390443434, + "language_loss": 0.87493509, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89765131, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.5, + "step": 1022, + "time_per_iteration": 2.441298007965088 + }, + { + "auxiliary_loss_clip": 0.01213359, + "auxiliary_loss_mlp": 0.0106856, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.06052542, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.752710779550117, + "language_loss": 0.82776564, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85058486, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 1023, + "time_per_iteration": 2.5027952194213867 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 0.99944335, + "balance_loss_mlp": 1.01796818, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8999264202466762, + "language_loss": 0.65078986, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67162001, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.0456543, + "router_z_loss_mlp": 0.609375, + "step": 1024, + "time_per_iteration": 3.0969655513763428 + }, + { + "auxiliary_loss_clip": 0.01212272, + "auxiliary_loss_mlp": 0.01066841, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.05936897, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.9303372998519377, + "language_loss": 0.88293028, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90572149, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 1025, + "time_per_iteration": 2.5229876041412354 + }, + { + "auxiliary_loss_clip": 0.01212316, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.05916524, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.519276165786755, + "language_loss": 0.84567487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86839235, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 1026, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.01212365, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.04614556, + "balance_loss_mlp": 1.05798197, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9431802827698534, + "language_loss": 0.82320756, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84604132, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 1027, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.01209611, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.05799866, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.262386050001272, + "language_loss": 0.84232426, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86500365, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1028, + "time_per_iteration": 2.4485137462615967 + }, + { + "auxiliary_loss_clip": 0.01077664, + "auxiliary_loss_mlp": 0.01009618, + "balance_loss_clip": 1.00544536, + "balance_loss_mlp": 1.01686025, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9476883841381922, + "language_loss": 0.60497737, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6258502, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.609375, + "step": 1029, + "time_per_iteration": 2.8714137077331543 + }, + { + "auxiliary_loss_clip": 0.0120304, + "auxiliary_loss_mlp": 0.01066238, + "balance_loss_clip": 1.0419786, + "balance_loss_mlp": 1.05338669, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.297452518318954, + "language_loss": 0.82309926, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84579194, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4921875, + "step": 1030, + "time_per_iteration": 2.4705348014831543 + }, + { + "auxiliary_loss_clip": 0.01214194, + "auxiliary_loss_mlp": 0.01071397, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.06025672, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.391039807046215, + "language_loss": 0.80262065, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82547653, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1031, + "time_per_iteration": 2.447964906692505 + }, + { + "auxiliary_loss_clip": 0.0121101, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.05865717, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.6245278130098144, + "language_loss": 0.77141201, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79427713, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5234375, + "step": 1032, + "time_per_iteration": 2.475891590118408 + }, + { + "auxiliary_loss_clip": 0.01205906, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_clip": 1.04350805, + "balance_loss_mlp": 1.05307126, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.949793190746779, + "language_loss": 0.89276892, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91552204, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1033, + "time_per_iteration": 2.5332658290863037 + }, + { + "auxiliary_loss_clip": 0.01212647, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03437293, + "balance_loss_mlp": 1.05739737, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.160025730572359, + "language_loss": 0.84795135, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87066996, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5546875, + "step": 1034, + "time_per_iteration": 2.507636785507202 + }, + { + "auxiliary_loss_clip": 0.01202421, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.03399241, + "balance_loss_mlp": 1.05694687, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 3.176440156188905, + "language_loss": 0.81156218, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83418697, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.453125, + "step": 1035, + "time_per_iteration": 2.624635696411133 + }, + { + "auxiliary_loss_clip": 0.01212161, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_clip": 1.04051828, + "balance_loss_mlp": 1.06080353, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.252599829484163, + "language_loss": 0.78701359, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80981934, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.515625, + "step": 1036, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01204167, + "auxiliary_loss_mlp": 0.01068533, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.05620134, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.670767972712633, + "language_loss": 0.86802149, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8907485, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1037, + "time_per_iteration": 2.506011724472046 + }, + { + "auxiliary_loss_clip": 0.01206019, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.02990723, + "balance_loss_mlp": 1.05728471, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.1914513209480933, + "language_loss": 0.81051469, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83314991, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1038, + "time_per_iteration": 2.486758232116699 + }, + { + "auxiliary_loss_clip": 0.01205947, + "auxiliary_loss_mlp": 0.01072566, + "balance_loss_clip": 1.04587555, + "balance_loss_mlp": 1.05856836, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.3663261426095965, + "language_loss": 0.85336804, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87615323, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1039, + "time_per_iteration": 2.489241123199463 + }, + { + "auxiliary_loss_clip": 0.01207559, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.05744672, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.9774289629637263, + "language_loss": 0.80853289, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83128488, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5, + "step": 1040, + "time_per_iteration": 2.480022668838501 + }, + { + "auxiliary_loss_clip": 0.01213203, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.06227219, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.535271913081881, + "language_loss": 0.69440711, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71721661, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5078125, + "step": 1041, + "time_per_iteration": 2.5417978763580322 + }, + { + "auxiliary_loss_clip": 0.01210541, + "auxiliary_loss_mlp": 0.0106006, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.05743289, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9564735382917973, + "language_loss": 0.80983013, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83253616, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.53125, + "step": 1042, + "time_per_iteration": 2.478926181793213 + }, + { + "auxiliary_loss_clip": 0.01210242, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.05925727, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.9466384226705415, + "language_loss": 0.76463902, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78732038, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.515625, + "step": 1043, + "time_per_iteration": 2.6262781620025635 + }, + { + "auxiliary_loss_clip": 0.01203702, + "auxiliary_loss_mlp": 0.01066445, + "balance_loss_clip": 1.04174471, + "balance_loss_mlp": 1.05835676, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8860277298285366, + "language_loss": 0.92454541, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94724691, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1044, + "time_per_iteration": 2.4886953830718994 + }, + { + "auxiliary_loss_clip": 0.01204359, + "auxiliary_loss_mlp": 0.01073486, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.05475259, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.9539908597303346, + "language_loss": 0.8581354, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88091385, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5, + "step": 1045, + "time_per_iteration": 2.5382347106933594 + }, + { + "auxiliary_loss_clip": 0.01203094, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.04900479, + "balance_loss_mlp": 1.05618775, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.0798822187092094, + "language_loss": 0.77675486, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.79952335, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.46875, + "step": 1046, + "time_per_iteration": 2.548157215118408 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.01074859, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.05837655, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.197016946040243, + "language_loss": 0.77317166, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79598629, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4765625, + "step": 1047, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.0121283, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.05874014, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.3103480986625753, + "language_loss": 0.7696203, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79236162, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1048, + "time_per_iteration": 2.636072874069214 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.0583266, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.2069714466600656, + "language_loss": 0.77757037, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80039394, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1049, + "time_per_iteration": 2.5173420906066895 + }, + { + "auxiliary_loss_clip": 0.01207985, + "auxiliary_loss_mlp": 0.01065489, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.05734015, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.316298014027776, + "language_loss": 0.83165503, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85438979, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5078125, + "step": 1050, + "time_per_iteration": 2.4742541313171387 + }, + { + "auxiliary_loss_clip": 0.01204381, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.05776763, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.1475970013183563, + "language_loss": 0.76909173, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79176152, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1051, + "time_per_iteration": 2.4629740715026855 + }, + { + "auxiliary_loss_clip": 0.01207556, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.05788827, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.090947022989376, + "language_loss": 0.80053556, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82331514, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4921875, + "step": 1052, + "time_per_iteration": 2.4729230403900146 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01064567, + "balance_loss_clip": 1.03911567, + "balance_loss_mlp": 1.05839717, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.21177767113968, + "language_loss": 0.78088665, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362272, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5078125, + "step": 1053, + "time_per_iteration": 2.433969736099243 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.0578481, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8421697124920164, + "language_loss": 0.84737611, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8700186, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.515625, + "step": 1054, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.01205973, + "auxiliary_loss_mlp": 0.01065192, + "balance_loss_clip": 1.04051518, + "balance_loss_mlp": 1.05870843, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.9255333357469135, + "language_loss": 0.8566432, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87935483, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4765625, + "step": 1055, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.0121179, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_clip": 1.0451932, + "balance_loss_mlp": 1.05891657, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.390503126540762, + "language_loss": 0.80966836, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83249724, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1056, + "time_per_iteration": 2.4944088459014893 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.05503476, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.70684555522199, + "language_loss": 0.81153649, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83431304, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 1057, + "time_per_iteration": 2.5327882766723633 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03401923, + "balance_loss_mlp": 1.054492, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 2.2830641052403826, + "language_loss": 0.8369416, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85947585, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.421875, + "step": 1058, + "time_per_iteration": 2.4742424488067627 + }, + { + "auxiliary_loss_clip": 0.01208572, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.05714464, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 1.9712110015930453, + "language_loss": 0.87264961, + "learning_rate": 3.988120036328651e-06, + "loss": 0.8954125, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.515625, + "step": 1059, + "time_per_iteration": 5.514882564544678 + }, + { + "auxiliary_loss_clip": 0.01213823, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.04273927, + "balance_loss_mlp": 1.06130195, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.227642611819728, + "language_loss": 0.9117676, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9345876, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 1060, + "time_per_iteration": 3.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01204952, + "auxiliary_loss_mlp": 0.01062848, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.05582809, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9159755464944204, + "language_loss": 0.87713706, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89981508, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4921875, + "step": 1061, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01213048, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.05683804, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.167309005799961, + "language_loss": 0.771905, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79469687, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5625, + "step": 1062, + "time_per_iteration": 2.5576398372650146 + }, + { + "auxiliary_loss_clip": 0.01206834, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 1.05504322, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.0414192004570872, + "language_loss": 0.86835265, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89105946, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1063, + "time_per_iteration": 2.472473382949829 + }, + { + "auxiliary_loss_clip": 0.01206458, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.05539751, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.0150359019026185, + "language_loss": 0.8051579, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82785529, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1064, + "time_per_iteration": 2.478205919265747 + }, + { + "auxiliary_loss_clip": 0.01207278, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.04409075, + "balance_loss_mlp": 1.05682254, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.254194289767691, + "language_loss": 0.84650666, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1065, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01207067, + "auxiliary_loss_mlp": 0.01055171, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.05966115, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.66169186591579, + "language_loss": 0.68201709, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70463943, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.46875, + "step": 1066, + "time_per_iteration": 2.6294829845428467 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01076738, + "balance_loss_clip": 1.05003476, + "balance_loss_mlp": 1.05877519, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.704601442813356, + "language_loss": 0.90345579, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9262861, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1067, + "time_per_iteration": 2.459721565246582 + }, + { + "auxiliary_loss_clip": 0.01207052, + "auxiliary_loss_mlp": 0.01068129, + "balance_loss_clip": 1.04364336, + "balance_loss_mlp": 1.05625772, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.8684947664405436, + "language_loss": 0.8343029, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.515625, + "step": 1068, + "time_per_iteration": 2.4611129760742188 + }, + { + "auxiliary_loss_clip": 0.01205753, + "auxiliary_loss_mlp": 0.01064379, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.05991328, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.4683216708617053, + "language_loss": 0.89402264, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91672397, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.453125, + "step": 1069, + "time_per_iteration": 2.486241340637207 + }, + { + "auxiliary_loss_clip": 0.01206711, + "auxiliary_loss_mlp": 0.01082225, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 1.05718124, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6076700233042396, + "language_loss": 0.95764256, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98053193, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5, + "step": 1070, + "time_per_iteration": 2.413357734680176 + }, + { + "auxiliary_loss_clip": 0.01209924, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.0309608, + "balance_loss_mlp": 1.05859673, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8004745601001504, + "language_loss": 0.8819589, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90463126, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.515625, + "step": 1071, + "time_per_iteration": 2.4717295169830322 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.056633, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6498592642907823, + "language_loss": 0.75996184, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78252238, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.4765625, + "step": 1072, + "time_per_iteration": 2.486936330795288 + }, + { + "auxiliary_loss_clip": 0.01207782, + "auxiliary_loss_mlp": 0.010661, + "balance_loss_clip": 1.03951669, + "balance_loss_mlp": 1.05679154, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.95165590675185, + "language_loss": 0.80415034, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82688916, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1073, + "time_per_iteration": 2.476189613342285 + }, + { + "auxiliary_loss_clip": 0.01200054, + "auxiliary_loss_mlp": 0.01059954, + "balance_loss_clip": 1.03563547, + "balance_loss_mlp": 1.05634785, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7105520573330508, + "language_loss": 0.80205524, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82465529, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4375, + "step": 1074, + "time_per_iteration": 2.499133586883545 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.03469074, + "balance_loss_mlp": 1.05560029, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.398999995550556, + "language_loss": 0.79203326, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81468183, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1075, + "time_per_iteration": 2.46777606010437 + }, + { + "auxiliary_loss_clip": 0.01207545, + "auxiliary_loss_mlp": 0.01064646, + "balance_loss_clip": 1.04086363, + "balance_loss_mlp": 1.05960226, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.7671348430420712, + "language_loss": 0.87819242, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90091443, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.484375, + "step": 1076, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01199028, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.02918351, + "balance_loss_mlp": 1.05429745, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.1388407300528534, + "language_loss": 0.80692923, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82945681, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1077, + "time_per_iteration": 2.4290995597839355 + }, + { + "auxiliary_loss_clip": 0.01211867, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.04566646, + "balance_loss_mlp": 1.05862093, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.618517400605346, + "language_loss": 0.91640681, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93924248, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.53125, + "step": 1078, + "time_per_iteration": 2.500995635986328 + }, + { + "auxiliary_loss_clip": 0.01212712, + "auxiliary_loss_mlp": 0.01062475, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.05874825, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.106125999672554, + "language_loss": 0.78772497, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81047684, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1079, + "time_per_iteration": 2.4510841369628906 + }, + { + "auxiliary_loss_clip": 0.01204732, + "auxiliary_loss_mlp": 0.01061369, + "balance_loss_clip": 1.03619218, + "balance_loss_mlp": 1.05602205, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.051955253501364, + "language_loss": 0.69555283, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7182138, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1080, + "time_per_iteration": 2.5024302005767822 + }, + { + "auxiliary_loss_clip": 0.01204586, + "auxiliary_loss_mlp": 0.01063302, + "balance_loss_clip": 1.03649211, + "balance_loss_mlp": 1.05477285, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.85895294752556, + "language_loss": 0.72094852, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74362737, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5, + "step": 1081, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01201777, + "auxiliary_loss_mlp": 0.01064533, + "balance_loss_clip": 1.03867674, + "balance_loss_mlp": 1.0554111, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6422342029105863, + "language_loss": 0.84621316, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86887628, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.46875, + "step": 1082, + "time_per_iteration": 2.459564447402954 + }, + { + "auxiliary_loss_clip": 0.01214386, + "auxiliary_loss_mlp": 0.01067955, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.05817008, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.177850298461163, + "language_loss": 0.8303026, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85312605, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5625, + "step": 1083, + "time_per_iteration": 2.504584550857544 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.05794787, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6002614807121227, + "language_loss": 0.79689312, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81960905, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.484375, + "step": 1084, + "time_per_iteration": 2.4530820846557617 + }, + { + "auxiliary_loss_clip": 0.01204762, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.05634058, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.1191367521188074, + "language_loss": 0.66211331, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68476963, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1085, + "time_per_iteration": 2.5733256340026855 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.05400848, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.9997547556569089, + "language_loss": 0.76998973, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79266769, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1086, + "time_per_iteration": 2.4958763122558594 + }, + { + "auxiliary_loss_clip": 0.01199669, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.03763306, + "balance_loss_mlp": 1.05291176, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1546414392836977, + "language_loss": 0.85154319, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87417287, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1087, + "time_per_iteration": 2.4456934928894043 + }, + { + "auxiliary_loss_clip": 0.01204231, + "auxiliary_loss_mlp": 0.01061167, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.05594206, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7775330808837086, + "language_loss": 0.77970594, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80235994, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1088, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01204134, + "auxiliary_loss_mlp": 0.01066637, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.05602646, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.9036978890371752, + "language_loss": 0.71191919, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73462689, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.484375, + "step": 1089, + "time_per_iteration": 2.4569168090820312 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.03566289, + "balance_loss_mlp": 1.05729651, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7924808842614686, + "language_loss": 0.85504186, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.8776831, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1090, + "time_per_iteration": 2.4624812602996826 + }, + { + "auxiliary_loss_clip": 0.01204567, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.05594897, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.2382380061135945, + "language_loss": 0.72027361, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74294031, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.484375, + "step": 1091, + "time_per_iteration": 2.4911999702453613 + }, + { + "auxiliary_loss_clip": 0.01201014, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05507159, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.7948943762047525, + "language_loss": 0.82525271, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8478815, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4609375, + "step": 1092, + "time_per_iteration": 2.510835886001587 + }, + { + "auxiliary_loss_clip": 0.01205888, + "auxiliary_loss_mlp": 0.01064535, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.05484402, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 4.994634192306823, + "language_loss": 0.71286589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73557013, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.515625, + "step": 1093, + "time_per_iteration": 2.528994560241699 + }, + { + "auxiliary_loss_clip": 0.01204526, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.05701041, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.8259988866114194, + "language_loss": 0.87971264, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90238965, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1094, + "time_per_iteration": 2.50201678276062 + }, + { + "auxiliary_loss_clip": 0.01205803, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_clip": 1.0350548, + "balance_loss_mlp": 1.0575459, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6349502946236962, + "language_loss": 0.81364405, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83632231, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.484375, + "step": 1095, + "time_per_iteration": 2.4947729110717773 + }, + { + "auxiliary_loss_clip": 0.01200923, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03414297, + "balance_loss_mlp": 1.05544913, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4379029944224215, + "language_loss": 0.69712919, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7197119, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.453125, + "step": 1096, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01206873, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.04451883, + "balance_loss_mlp": 1.0592947, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7715259730160258, + "language_loss": 0.77498722, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79775411, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1097, + "time_per_iteration": 2.4872820377349854 + }, + { + "auxiliary_loss_clip": 0.0120653, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03814423, + "balance_loss_mlp": 1.05785179, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.7376479388989727, + "language_loss": 0.77846545, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80116618, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.484375, + "step": 1098, + "time_per_iteration": 2.583075761795044 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.04483891, + "balance_loss_mlp": 1.05739522, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9398633669636132, + "language_loss": 0.81675154, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83951151, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1099, + "time_per_iteration": 2.446969985961914 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.02801692, + "balance_loss_mlp": 1.0519135, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0018625732470245, + "language_loss": 0.82619941, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84868616, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4375, + "step": 1100, + "time_per_iteration": 2.4545743465423584 + }, + { + "auxiliary_loss_clip": 0.01204382, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.05827951, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.362466383483127, + "language_loss": 0.73439336, + "learning_rate": 3.986273334538702e-06, + "loss": 0.7570439, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4609375, + "step": 1101, + "time_per_iteration": 6.740786790847778 + }, + { + "auxiliary_loss_clip": 0.0119874, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_clip": 1.03829539, + "balance_loss_mlp": 1.05373132, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.46656505058328, + "language_loss": 0.86047602, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88308758, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1102, + "time_per_iteration": 2.4480903148651123 + }, + { + "auxiliary_loss_clip": 0.01200394, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.05588222, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0494810685505995, + "language_loss": 0.81707513, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83965349, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1103, + "time_per_iteration": 2.4419338703155518 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.05891824, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.7865556655629211, + "language_loss": 0.82059169, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84326148, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.4453125, + "step": 1104, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01195268, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.05232382, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6089454783719872, + "language_loss": 0.80542791, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82785821, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1105, + "time_per_iteration": 2.524385929107666 + }, + { + "auxiliary_loss_clip": 0.01197193, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.05697632, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8452117827451007, + "language_loss": 0.96738935, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98996383, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.40625, + "step": 1106, + "time_per_iteration": 2.455122470855713 + }, + { + "auxiliary_loss_clip": 0.01204143, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.03436136, + "balance_loss_mlp": 1.05509543, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9568581550144768, + "language_loss": 0.82766026, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85030258, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4921875, + "step": 1107, + "time_per_iteration": 2.4554357528686523 + }, + { + "auxiliary_loss_clip": 0.01077187, + "auxiliary_loss_mlp": 0.01010186, + "balance_loss_clip": 1.0061568, + "balance_loss_mlp": 1.01696265, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8235952583150978, + "language_loss": 0.56729984, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58817357, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.6015625, + "step": 1108, + "time_per_iteration": 3.0248770713806152 + }, + { + "auxiliary_loss_clip": 0.01200435, + "auxiliary_loss_mlp": 0.01065514, + "balance_loss_clip": 1.04034865, + "balance_loss_mlp": 1.05397463, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.4203653272420693, + "language_loss": 0.72493321, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74759269, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1109, + "time_per_iteration": 2.4559717178344727 + }, + { + "auxiliary_loss_clip": 0.01197389, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.05389571, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.084593088047962, + "language_loss": 0.78256035, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1110, + "time_per_iteration": 2.4989912509918213 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.05598152, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.197430378352105, + "language_loss": 0.71290207, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73549128, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1111, + "time_per_iteration": 2.5445287227630615 + }, + { + "auxiliary_loss_clip": 0.0120524, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.03833365, + "balance_loss_mlp": 1.05788755, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8078370838130353, + "language_loss": 0.78315711, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80583429, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4765625, + "step": 1112, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01203172, + "auxiliary_loss_mlp": 0.01058254, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.05794001, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.0430507180103943, + "language_loss": 0.78819263, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1113, + "time_per_iteration": 2.4637296199798584 + }, + { + "auxiliary_loss_clip": 0.01195153, + "auxiliary_loss_mlp": 0.01053406, + "balance_loss_clip": 1.03056598, + "balance_loss_mlp": 1.05255365, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.035611213247421, + "language_loss": 0.82393003, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84641558, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.4296875, + "step": 1114, + "time_per_iteration": 2.434006452560425 + }, + { + "auxiliary_loss_clip": 0.01076, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00463712, + "balance_loss_mlp": 1.0165143, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8339607525511222, + "language_loss": 0.58126414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60211033, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.59375, + "step": 1115, + "time_per_iteration": 3.020782709121704 + }, + { + "auxiliary_loss_clip": 0.01200335, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.03427422, + "balance_loss_mlp": 1.05479646, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8263674595854464, + "language_loss": 0.91123891, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93383968, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1116, + "time_per_iteration": 2.446439504623413 + }, + { + "auxiliary_loss_clip": 0.01209259, + "auxiliary_loss_mlp": 0.01067721, + "balance_loss_clip": 1.04323506, + "balance_loss_mlp": 1.06065357, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.019283248682947, + "language_loss": 0.8709814, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89375114, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.484375, + "step": 1117, + "time_per_iteration": 2.486212968826294 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.00250196, + "balance_loss_mlp": 1.01550937, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9454776991467404, + "language_loss": 0.59798217, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6187892, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.5859375, + "step": 1118, + "time_per_iteration": 3.0197594165802 + }, + { + "auxiliary_loss_clip": 0.01201284, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.05418777, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.7568577616727468, + "language_loss": 0.83498162, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85755914, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4765625, + "step": 1119, + "time_per_iteration": 2.4535257816314697 + }, + { + "auxiliary_loss_clip": 0.01199216, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.0562222, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.8165724331790314, + "language_loss": 0.8480413, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87062794, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.4296875, + "step": 1120, + "time_per_iteration": 2.533182382583618 + }, + { + "auxiliary_loss_clip": 0.01208718, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.04269981, + "balance_loss_mlp": 1.0602659, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.032922437281707, + "language_loss": 0.78959441, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81235266, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.484375, + "step": 1121, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_clip": 1.00033593, + "balance_loss_mlp": 1.0132587, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7615352754050735, + "language_loss": 0.58346939, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60423702, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.5859375, + "step": 1122, + "time_per_iteration": 3.2087855339050293 + }, + { + "auxiliary_loss_clip": 0.0120309, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.0584271, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0430211727412098, + "language_loss": 0.71546745, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73815745, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4453125, + "step": 1123, + "time_per_iteration": 2.5017640590667725 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01057362, + "balance_loss_clip": 1.03216124, + "balance_loss_mlp": 1.05484593, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8376842720828679, + "language_loss": 0.79288971, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81548035, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1124, + "time_per_iteration": 2.4980688095092773 + }, + { + "auxiliary_loss_clip": 0.01196564, + "auxiliary_loss_mlp": 0.01054377, + "balance_loss_clip": 1.03204954, + "balance_loss_mlp": 1.05469489, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.0983993205372253, + "language_loss": 0.71198726, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73449671, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.421875, + "step": 1125, + "time_per_iteration": 2.4704325199127197 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.04247451, + "balance_loss_mlp": 1.05620742, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.9171204901367243, + "language_loss": 0.80814254, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83081663, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.421875, + "step": 1126, + "time_per_iteration": 2.5046803951263428 + }, + { + "auxiliary_loss_clip": 0.01070877, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 0.9986586, + "balance_loss_mlp": 1.01286924, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7804116507992601, + "language_loss": 0.59733766, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61807376, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.578125, + "step": 1127, + "time_per_iteration": 3.0877249240875244 + }, + { + "auxiliary_loss_clip": 0.01199514, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 1.05723238, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.13286114653412, + "language_loss": 0.81392133, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83648497, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.421875, + "step": 1128, + "time_per_iteration": 2.5406885147094727 + }, + { + "auxiliary_loss_clip": 0.01208088, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.02692807, + "balance_loss_mlp": 1.0598706, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 3.047918834731733, + "language_loss": 0.76034033, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78294069, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.484375, + "step": 1129, + "time_per_iteration": 2.486829996109009 + }, + { + "auxiliary_loss_clip": 0.01201584, + "auxiliary_loss_mlp": 0.01061333, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.05536139, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8621491947103987, + "language_loss": 0.72340226, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74603146, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4609375, + "step": 1130, + "time_per_iteration": 2.6195991039276123 + }, + { + "auxiliary_loss_clip": 0.01197626, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.05584192, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.3479224842049917, + "language_loss": 0.80624223, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82885444, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.421875, + "step": 1131, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.05550814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1673533627141652, + "language_loss": 0.8104949, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83313811, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.40625, + "step": 1132, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01199688, + "auxiliary_loss_mlp": 0.01069367, + "balance_loss_clip": 1.04525137, + "balance_loss_mlp": 1.05629563, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.450417149602266, + "language_loss": 0.63629937, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65898991, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4296875, + "step": 1133, + "time_per_iteration": 2.7164230346679688 + }, + { + "auxiliary_loss_clip": 0.01203203, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.05427325, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.5027083277203963, + "language_loss": 0.74811196, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77073789, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1134, + "time_per_iteration": 2.420506000518799 + }, + { + "auxiliary_loss_clip": 0.01201452, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.05952573, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0759609389962037, + "language_loss": 0.87245119, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89510942, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.421875, + "step": 1135, + "time_per_iteration": 2.464738607406616 + }, + { + "auxiliary_loss_clip": 0.01200809, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.05388534, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.383261313924855, + "language_loss": 0.78335494, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80591798, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.46875, + "step": 1136, + "time_per_iteration": 2.4486002922058105 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.06089664, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 3.2008110915617207, + "language_loss": 0.83941948, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86222148, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.453125, + "step": 1137, + "time_per_iteration": 2.5714635848999023 + }, + { + "auxiliary_loss_clip": 0.01199575, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.05628538, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.067587662099544, + "language_loss": 0.78669268, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80930662, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1138, + "time_per_iteration": 2.459437370300293 + }, + { + "auxiliary_loss_clip": 0.01202271, + "auxiliary_loss_mlp": 0.01059469, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.05729747, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.606905885529735, + "language_loss": 0.85683703, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87945449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1139, + "time_per_iteration": 2.5198936462402344 + }, + { + "auxiliary_loss_clip": 0.01201061, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.03297663, + "balance_loss_mlp": 1.05803108, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7528507300348692, + "language_loss": 0.74826896, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77085567, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4296875, + "step": 1140, + "time_per_iteration": 2.6609106063842773 + }, + { + "auxiliary_loss_clip": 0.01198151, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.03698146, + "balance_loss_mlp": 1.05620885, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.210262717529583, + "language_loss": 0.68083167, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70343632, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.421875, + "step": 1141, + "time_per_iteration": 2.5661122798919678 + }, + { + "auxiliary_loss_clip": 0.01205913, + "auxiliary_loss_mlp": 0.0106664, + "balance_loss_clip": 1.04098654, + "balance_loss_mlp": 1.05848837, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.82433360121009, + "language_loss": 0.79399014, + "learning_rate": 3.984342445114538e-06, + "loss": 0.8167156, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1142, + "time_per_iteration": 2.5499107837677 + }, + { + "auxiliary_loss_clip": 0.0120232, + "auxiliary_loss_mlp": 0.01061074, + "balance_loss_clip": 1.03650475, + "balance_loss_mlp": 1.05730164, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6821535193321122, + "language_loss": 0.68701231, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70964622, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1143, + "time_per_iteration": 5.380373239517212 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03670955, + "balance_loss_mlp": 1.05885804, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.8434796401844256, + "language_loss": 0.74694496, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76950091, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.375, + "step": 1144, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.01204332, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.05654943, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.296493270147659, + "language_loss": 0.91720247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93988806, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4765625, + "step": 1145, + "time_per_iteration": 2.44307017326355 + }, + { + "auxiliary_loss_clip": 0.01206887, + "auxiliary_loss_mlp": 0.01067692, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.05779576, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4650333910918865, + "language_loss": 0.82189268, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84463847, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.4921875, + "step": 1146, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.01198651, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.05755806, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5935722439127744, + "language_loss": 0.85150343, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87410891, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.4140625, + "step": 1147, + "time_per_iteration": 2.48410701751709 + }, + { + "auxiliary_loss_clip": 0.01201275, + "auxiliary_loss_mlp": 0.01064743, + "balance_loss_clip": 1.03988767, + "balance_loss_mlp": 1.05699074, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3932988353276645, + "language_loss": 0.86235052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88501072, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1148, + "time_per_iteration": 2.455441951751709 + }, + { + "auxiliary_loss_clip": 0.01199305, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.0337863, + "balance_loss_mlp": 1.05560231, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.070658514783469, + "language_loss": 0.69185412, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71442747, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4375, + "step": 1149, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.0289495, + "balance_loss_mlp": 1.05679548, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.828663566846353, + "language_loss": 0.84069788, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86328113, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4609375, + "step": 1150, + "time_per_iteration": 2.509122371673584 + }, + { + "auxiliary_loss_clip": 0.01206199, + "auxiliary_loss_mlp": 0.01058671, + "balance_loss_clip": 1.03453135, + "balance_loss_mlp": 1.06116164, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.57752822218259, + "language_loss": 0.82044697, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84309566, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1151, + "time_per_iteration": 2.420128345489502 + }, + { + "auxiliary_loss_clip": 0.01201904, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.06011868, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8279979065740934, + "language_loss": 0.85587418, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87851566, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4140625, + "step": 1152, + "time_per_iteration": 2.498180866241455 + }, + { + "auxiliary_loss_clip": 0.01198565, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03747797, + "balance_loss_mlp": 1.05767703, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1251557516582995, + "language_loss": 0.90536988, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92796487, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1153, + "time_per_iteration": 2.422480821609497 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.03035152, + "balance_loss_mlp": 1.05790865, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.190017778582164, + "language_loss": 0.81363368, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83618748, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4296875, + "step": 1154, + "time_per_iteration": 2.528118848800659 + }, + { + "auxiliary_loss_clip": 0.01202754, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_clip": 1.04476249, + "balance_loss_mlp": 1.06078768, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 32.79102955334026, + "language_loss": 0.7560131, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77872109, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.421875, + "step": 1155, + "time_per_iteration": 2.5010287761688232 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01059268, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.05511975, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.6800097473238784, + "language_loss": 0.71119213, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73374593, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1156, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.05711889, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.0301788984863918, + "language_loss": 0.75299567, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77569139, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1157, + "time_per_iteration": 2.4654574394226074 + }, + { + "auxiliary_loss_clip": 0.0119867, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.03240204, + "balance_loss_mlp": 1.0551796, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6687264459000366, + "language_loss": 0.71895158, + "learning_rate": 3.983554608032982e-06, + "loss": 0.7415098, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4375, + "step": 1158, + "time_per_iteration": 2.53495454788208 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.03764284, + "balance_loss_mlp": 1.05718327, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9777890540291267, + "language_loss": 0.79796576, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82061857, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1159, + "time_per_iteration": 2.511402130126953 + }, + { + "auxiliary_loss_clip": 0.01205534, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03938961, + "balance_loss_mlp": 1.05860782, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 5.094070474761981, + "language_loss": 0.810929, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83364576, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1160, + "time_per_iteration": 2.4580883979797363 + }, + { + "auxiliary_loss_clip": 0.01197544, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.05382752, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.8746427931419856, + "language_loss": 0.75958532, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78215194, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1161, + "time_per_iteration": 2.5046370029449463 + }, + { + "auxiliary_loss_clip": 0.01195466, + "auxiliary_loss_mlp": 0.01062077, + "balance_loss_clip": 1.03642368, + "balance_loss_mlp": 1.05299318, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.806880077375887, + "language_loss": 0.8285073, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85108274, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1162, + "time_per_iteration": 2.4779040813446045 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01055987, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.05355024, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.8779282806609423, + "language_loss": 0.79095101, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81345057, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1163, + "time_per_iteration": 2.515899181365967 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.03728819, + "balance_loss_mlp": 1.05438375, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1142628107327233, + "language_loss": 0.79552305, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81814498, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4375, + "step": 1164, + "time_per_iteration": 2.476428747177124 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.0401659, + "balance_loss_mlp": 1.05587661, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4863162511761774, + "language_loss": 0.73198837, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75463963, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4375, + "step": 1165, + "time_per_iteration": 2.5053012371063232 + }, + { + "auxiliary_loss_clip": 0.01196916, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.03225732, + "balance_loss_mlp": 1.05550849, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.690867173089168, + "language_loss": 0.81019437, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83273077, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4140625, + "step": 1166, + "time_per_iteration": 2.5378963947296143 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0534389, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.886682439277329, + "language_loss": 0.84443307, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86687052, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1167, + "time_per_iteration": 2.5244622230529785 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.05693448, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.687845484368313, + "language_loss": 0.89423364, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9168179, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1168, + "time_per_iteration": 2.49411678314209 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.05737031, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.629371766417224, + "language_loss": 0.88661098, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9093399, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.46875, + "step": 1169, + "time_per_iteration": 2.4795143604278564 + }, + { + "auxiliary_loss_clip": 0.01203226, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.05864179, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0154006947860705, + "language_loss": 0.84000075, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86272925, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4375, + "step": 1170, + "time_per_iteration": 2.501016616821289 + }, + { + "auxiliary_loss_clip": 0.01199625, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.05753505, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.945268169582358, + "language_loss": 0.75220597, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77485222, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.421875, + "step": 1171, + "time_per_iteration": 2.4456748962402344 + }, + { + "auxiliary_loss_clip": 0.01199689, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.05765915, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.2481396571627923, + "language_loss": 0.88848841, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91106689, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1172, + "time_per_iteration": 2.4970321655273438 + }, + { + "auxiliary_loss_clip": 0.01202846, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_clip": 1.02776241, + "balance_loss_mlp": 1.05584753, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6229718682058278, + "language_loss": 0.8212136, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84377271, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1173, + "time_per_iteration": 2.485822916030884 + }, + { + "auxiliary_loss_clip": 0.01200818, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.05786848, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.056745883983527, + "language_loss": 0.81825697, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.840877, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1174, + "time_per_iteration": 2.4564759731292725 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.0569849, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.925446476900023, + "language_loss": 0.8511939, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87379438, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.421875, + "step": 1175, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.0120243, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.05922508, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.9716433558257507, + "language_loss": 0.8303746, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4375, + "step": 1176, + "time_per_iteration": 2.511456251144409 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.05717707, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.3318965992312, + "language_loss": 0.74563694, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76822478, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.40625, + "step": 1177, + "time_per_iteration": 2.460102081298828 + }, + { + "auxiliary_loss_clip": 0.01207406, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.033476, + "balance_loss_mlp": 1.06167924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.2206541819979995, + "language_loss": 0.86031914, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88298053, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4609375, + "step": 1178, + "time_per_iteration": 2.4605627059936523 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_clip": 1.00349271, + "balance_loss_mlp": 1.02766943, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8386980392448491, + "language_loss": 0.63242435, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65337497, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.59375, + "step": 1179, + "time_per_iteration": 3.156688690185547 + }, + { + "auxiliary_loss_clip": 0.01207076, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.06038809, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.3853497849810945, + "language_loss": 0.83326972, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85596782, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.46875, + "step": 1180, + "time_per_iteration": 2.4823896884918213 + }, + { + "auxiliary_loss_clip": 0.01200915, + "auxiliary_loss_mlp": 0.01065839, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.05910683, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1921067510196446, + "language_loss": 0.88595563, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90862316, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.421875, + "step": 1181, + "time_per_iteration": 2.505908727645874 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.03607869, + "balance_loss_mlp": 1.05944347, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2303634282095257, + "language_loss": 0.83314365, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85575759, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4140625, + "step": 1182, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01199287, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_clip": 1.04006529, + "balance_loss_mlp": 1.06100821, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.671395976555463, + "language_loss": 0.7925818, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81523037, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3828125, + "step": 1183, + "time_per_iteration": 2.5057172775268555 + }, + { + "auxiliary_loss_clip": 0.01201972, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.05550563, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6492838430830963, + "language_loss": 0.78910172, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8117131, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.46875, + "step": 1184, + "time_per_iteration": 5.494150638580322 + }, + { + "auxiliary_loss_clip": 0.01196982, + "auxiliary_loss_mlp": 0.01057128, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.05884266, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.546293211356889, + "language_loss": 0.7696892, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79223031, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.375, + "step": 1185, + "time_per_iteration": 3.8873486518859863 + }, + { + "auxiliary_loss_clip": 0.01200052, + "auxiliary_loss_mlp": 0.01065088, + "balance_loss_clip": 1.0408771, + "balance_loss_mlp": 1.05808377, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.519913974657541, + "language_loss": 0.65896261, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68161404, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1186, + "time_per_iteration": 2.44986891746521 + }, + { + "auxiliary_loss_clip": 0.01198722, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.03234124, + "balance_loss_mlp": 1.05906928, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.0047668871213205, + "language_loss": 0.69673246, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71928233, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3984375, + "step": 1187, + "time_per_iteration": 2.517432451248169 + }, + { + "auxiliary_loss_clip": 0.01194056, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.05690861, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.6848541171122307, + "language_loss": 0.78598166, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80852079, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.375, + "step": 1188, + "time_per_iteration": 2.4682350158691406 + }, + { + "auxiliary_loss_clip": 0.01197809, + "auxiliary_loss_mlp": 0.01056931, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.0588758, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0343008635273834, + "language_loss": 0.84854662, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87109399, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.390625, + "step": 1189, + "time_per_iteration": 2.451464891433716 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.05589187, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.7193907035784557, + "language_loss": 0.77021295, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79277021, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.375, + "step": 1190, + "time_per_iteration": 2.5028254985809326 + }, + { + "auxiliary_loss_clip": 0.01200514, + "auxiliary_loss_mlp": 0.01065982, + "balance_loss_clip": 1.04018509, + "balance_loss_mlp": 1.0585537, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3385605637591302, + "language_loss": 0.75145626, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77412122, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1191, + "time_per_iteration": 2.461045980453491 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.03147578, + "balance_loss_mlp": 1.05730891, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.3332115059632583, + "language_loss": 0.7360636, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75860614, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1192, + "time_per_iteration": 2.4944753646850586 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 1.05358601, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.1652973689026176, + "language_loss": 0.7830255, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80548704, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1193, + "time_per_iteration": 2.487025737762451 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01053593, + "balance_loss_clip": 1.02786815, + "balance_loss_mlp": 1.06034899, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9678931818636167, + "language_loss": 0.85748619, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88004816, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1194, + "time_per_iteration": 2.493823766708374 + }, + { + "auxiliary_loss_clip": 0.01197363, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.05782473, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9701258602591958, + "language_loss": 0.81425989, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83685976, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3984375, + "step": 1195, + "time_per_iteration": 2.5168802738189697 + }, + { + "auxiliary_loss_clip": 0.01195742, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.02979064, + "balance_loss_mlp": 1.05720496, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.9269272748189905, + "language_loss": 0.79917538, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82164884, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3828125, + "step": 1196, + "time_per_iteration": 2.4749536514282227 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01069477, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.05655897, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 8.862292558474625, + "language_loss": 0.71015084, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73278111, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3671875, + "step": 1197, + "time_per_iteration": 2.520514726638794 + }, + { + "auxiliary_loss_clip": 0.01192449, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.02845871, + "balance_loss_mlp": 1.05429292, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0584524946763767, + "language_loss": 0.86034989, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88279593, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3828125, + "step": 1198, + "time_per_iteration": 2.441458225250244 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.05664325, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.7240513490380307, + "language_loss": 0.83822477, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8607856, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3828125, + "step": 1199, + "time_per_iteration": 2.462790012359619 + }, + { + "auxiliary_loss_clip": 0.01201627, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.06159616, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.0725431151836453, + "language_loss": 0.76464498, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78722042, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3984375, + "step": 1200, + "time_per_iteration": 2.5007636547088623 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01061794, + "balance_loss_clip": 1.0376662, + "balance_loss_mlp": 1.05783701, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.959995672067427, + "language_loss": 0.82965535, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85223711, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.390625, + "step": 1201, + "time_per_iteration": 2.4968512058258057 + }, + { + "auxiliary_loss_clip": 0.01198607, + "auxiliary_loss_mlp": 0.01059493, + "balance_loss_clip": 1.03372014, + "balance_loss_mlp": 1.05568862, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.411287508312223, + "language_loss": 0.69041032, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71299136, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1202, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01196785, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03804839, + "balance_loss_mlp": 1.05721354, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9411904343348254, + "language_loss": 0.87723774, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89984161, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3984375, + "step": 1203, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.0546416, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.819852916387131, + "language_loss": 0.7844671, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4375, + "step": 1204, + "time_per_iteration": 2.449265480041504 + }, + { + "auxiliary_loss_clip": 0.01194984, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.05605316, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8514893306986777, + "language_loss": 0.81960398, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.390625, + "step": 1205, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01067422, + "balance_loss_clip": 1.04250705, + "balance_loss_mlp": 1.05852747, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.0830735488163254, + "language_loss": 0.76702261, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78969669, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4140625, + "step": 1206, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.01193529, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.03071594, + "balance_loss_mlp": 1.05481935, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8430962541821914, + "language_loss": 0.77246201, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79495007, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3828125, + "step": 1207, + "time_per_iteration": 2.4895267486572266 + }, + { + "auxiliary_loss_clip": 0.01194673, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.02816105, + "balance_loss_mlp": 1.05703962, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 5.768853045708734, + "language_loss": 0.79723513, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81967664, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1208, + "time_per_iteration": 2.509073495864868 + }, + { + "auxiliary_loss_clip": 0.0119292, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03204679, + "balance_loss_mlp": 1.05551386, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.6873449148768063, + "language_loss": 0.78595626, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80843151, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.375, + "step": 1209, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.03409529, + "balance_loss_mlp": 1.05510461, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.6193169355932104, + "language_loss": 0.81117678, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83368045, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.390625, + "step": 1210, + "time_per_iteration": 2.4985666275024414 + }, + { + "auxiliary_loss_clip": 0.01192388, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_clip": 1.03688109, + "balance_loss_mlp": 1.0565064, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.2378435782703834, + "language_loss": 0.84350932, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.359375, + "step": 1211, + "time_per_iteration": 2.4971728324890137 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01051238, + "balance_loss_clip": 1.02931547, + "balance_loss_mlp": 1.05233216, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.2910402501943516, + "language_loss": 0.90813953, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.375, + "step": 1212, + "time_per_iteration": 2.424874782562256 + }, + { + "auxiliary_loss_clip": 0.01191621, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0286777, + "balance_loss_mlp": 1.05457211, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.346480404505952, + "language_loss": 0.7238096, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74623883, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1213, + "time_per_iteration": 2.443542003631592 + }, + { + "auxiliary_loss_clip": 0.0119423, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.05338192, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9141465843449694, + "language_loss": 0.84441102, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86686933, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1214, + "time_per_iteration": 2.500112295150757 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.03468192, + "balance_loss_mlp": 1.05678558, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.82775499028919, + "language_loss": 0.83929181, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86184609, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.40625, + "step": 1215, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01194493, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.05474758, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.8082751516232567, + "language_loss": 0.80984753, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83240259, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1216, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01196444, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.02863717, + "balance_loss_mlp": 1.05746269, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.8100743600713276, + "language_loss": 0.76112509, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78359497, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1217, + "time_per_iteration": 2.513061046600342 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0347029, + "balance_loss_mlp": 1.05546904, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0751842608938142, + "language_loss": 0.86442709, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.375, + "step": 1218, + "time_per_iteration": 2.4514572620391846 + }, + { + "auxiliary_loss_clip": 0.01193593, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.05405331, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.9036635750322874, + "language_loss": 0.86757988, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.8901403, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.3984375, + "step": 1219, + "time_per_iteration": 2.4501893520355225 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01058106, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.05260015, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.320539289810395, + "language_loss": 0.84721315, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86969984, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.375, + "step": 1220, + "time_per_iteration": 2.4651544094085693 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01062531, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.05455709, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.803787378453645, + "language_loss": 0.76840538, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.390625, + "step": 1221, + "time_per_iteration": 2.4643850326538086 + }, + { + "auxiliary_loss_clip": 0.01195957, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.0369482, + "balance_loss_mlp": 1.05698907, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 4.111967976062365, + "language_loss": 0.92201889, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94457251, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.390625, + "step": 1222, + "time_per_iteration": 2.461393117904663 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.03440046, + "balance_loss_mlp": 1.05795276, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.739326433562924, + "language_loss": 0.91106719, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9336018, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1223, + "time_per_iteration": 2.4616212844848633 + }, + { + "auxiliary_loss_clip": 0.01194512, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.05628467, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.5538951271380395, + "language_loss": 0.81946027, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84211743, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3828125, + "step": 1224, + "time_per_iteration": 2.555060386657715 + }, + { + "auxiliary_loss_clip": 0.01191919, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.05385065, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.209826315991058, + "language_loss": 0.83313572, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8555935, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.375, + "step": 1225, + "time_per_iteration": 2.5317656993865967 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.0300144, + "balance_loss_mlp": 1.05566537, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.0864455990649144, + "language_loss": 0.9037565, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92621917, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3828125, + "step": 1226, + "time_per_iteration": 5.374137878417969 + }, + { + "auxiliary_loss_clip": 0.01201048, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.05401981, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8833434676543, + "language_loss": 0.76944947, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1227, + "time_per_iteration": 2.4528942108154297 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02720916, + "balance_loss_mlp": 1.05810142, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6041059240123434, + "language_loss": 0.85634637, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87876499, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.34375, + "step": 1228, + "time_per_iteration": 2.5452229976654053 + }, + { + "auxiliary_loss_clip": 0.01194537, + "auxiliary_loss_mlp": 0.01061009, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.05448794, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 4.251776538682485, + "language_loss": 0.79688829, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81944382, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3984375, + "step": 1229, + "time_per_iteration": 2.501086711883545 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.05632436, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.028375336194412, + "language_loss": 0.78218549, + "learning_rate": 3.979771170004287e-06, + "loss": 0.8047595, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3984375, + "step": 1230, + "time_per_iteration": 2.4474098682403564 + }, + { + "auxiliary_loss_clip": 0.01193092, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.02554393, + "balance_loss_mlp": 1.05599403, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.924374124094053, + "language_loss": 0.81301343, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83543187, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1231, + "time_per_iteration": 2.4861042499542236 + }, + { + "auxiliary_loss_clip": 0.01198041, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.04277539, + "balance_loss_mlp": 1.05443811, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.4882746298902343, + "language_loss": 0.95111585, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97376096, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1232, + "time_per_iteration": 2.5074143409729004 + }, + { + "auxiliary_loss_clip": 0.01194092, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.03938031, + "balance_loss_mlp": 1.05667329, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.246534337547551, + "language_loss": 0.80640733, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82895458, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1233, + "time_per_iteration": 2.490816831588745 + }, + { + "auxiliary_loss_clip": 0.01198611, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.05483365, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.357402762223285, + "language_loss": 0.70458734, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72717696, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1234, + "time_per_iteration": 2.605139970779419 + }, + { + "auxiliary_loss_clip": 0.01195848, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.03665543, + "balance_loss_mlp": 1.05792761, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1034220776692765, + "language_loss": 0.77058101, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79313564, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3828125, + "step": 1235, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01189622, + "auxiliary_loss_mlp": 0.01053872, + "balance_loss_clip": 1.03123438, + "balance_loss_mlp": 1.05414248, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 5.584514149172867, + "language_loss": 0.82648033, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84891528, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1236, + "time_per_iteration": 2.462069511413574 + }, + { + "auxiliary_loss_clip": 0.0119681, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03385794, + "balance_loss_mlp": 1.05572712, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.2536643652174724, + "language_loss": 0.75702679, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77956861, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1237, + "time_per_iteration": 2.5572054386138916 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.03817141, + "balance_loss_mlp": 1.05427146, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.878495773650564, + "language_loss": 0.7740556, + "learning_rate": 3.979326750654053e-06, + "loss": 0.7965883, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.3828125, + "step": 1238, + "time_per_iteration": 2.5915493965148926 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.01055743, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.05435395, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.0695087378138455, + "language_loss": 0.86322856, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88576937, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.4375, + "step": 1239, + "time_per_iteration": 2.4961507320404053 + }, + { + "auxiliary_loss_clip": 0.01194884, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02401412, + "balance_loss_mlp": 1.05433989, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.179426429753772, + "language_loss": 0.89070082, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91314042, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.40625, + "step": 1240, + "time_per_iteration": 2.456801176071167 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.03325772, + "balance_loss_mlp": 1.05600643, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.2624482063672513, + "language_loss": 0.88586551, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90842468, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4140625, + "step": 1241, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.018507, + "balance_loss_mlp": 1.02113318, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9233978594431768, + "language_loss": 0.63032585, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65135366, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.59375, + "step": 1242, + "time_per_iteration": 3.1321358680725098 + }, + { + "auxiliary_loss_clip": 0.012088, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.05792046, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.8956100556858004, + "language_loss": 0.62917286, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65185821, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5078125, + "step": 1243, + "time_per_iteration": 2.5571463108062744 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01052045, + "balance_loss_clip": 1.0280956, + "balance_loss_mlp": 1.05710852, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.504235331520048, + "language_loss": 0.76465732, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78713971, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1244, + "time_per_iteration": 2.501621723175049 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03462183, + "balance_loss_mlp": 1.05684423, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 2.8968513367461495, + "language_loss": 0.69149882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.714064, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1245, + "time_per_iteration": 2.417921781539917 + }, + { + "auxiliary_loss_clip": 0.01196347, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.02768707, + "balance_loss_mlp": 1.05663347, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9272496045423029, + "language_loss": 0.88344061, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90592474, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1246, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01205457, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.05656838, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.4755370190447064, + "language_loss": 0.87921643, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90194321, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4921875, + "step": 1247, + "time_per_iteration": 2.4602389335632324 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.03502667, + "balance_loss_mlp": 1.05565107, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2039165223770194, + "language_loss": 0.6477375, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67027843, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3828125, + "step": 1248, + "time_per_iteration": 2.4408388137817383 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.0575254, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.0641418493429713, + "language_loss": 0.73964334, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76219749, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1249, + "time_per_iteration": 2.443767547607422 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01068388, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.05842972, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.607815988938315, + "language_loss": 0.81845009, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84114683, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4296875, + "step": 1250, + "time_per_iteration": 2.491236448287964 + }, + { + "auxiliary_loss_clip": 0.01197565, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.05932856, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.308634463940828, + "language_loss": 0.66713893, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68972456, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1251, + "time_per_iteration": 2.5437874794006348 + }, + { + "auxiliary_loss_clip": 0.0107681, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 0.99946529, + "balance_loss_mlp": 1.02021933, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8978558428983584, + "language_loss": 0.70356798, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72436458, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.56640625, + "step": 1252, + "time_per_iteration": 3.1170923709869385 + }, + { + "auxiliary_loss_clip": 0.01194007, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.03698599, + "balance_loss_mlp": 1.05419612, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9290655276351045, + "language_loss": 0.79516673, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81771958, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3984375, + "step": 1253, + "time_per_iteration": 2.4821414947509766 + }, + { + "auxiliary_loss_clip": 0.01199953, + "auxiliary_loss_mlp": 0.01065033, + "balance_loss_clip": 1.04125071, + "balance_loss_mlp": 1.05829906, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.5751371148477995, + "language_loss": 0.93441045, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95706034, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.421875, + "step": 1254, + "time_per_iteration": 2.4245519638061523 + }, + { + "auxiliary_loss_clip": 0.01191058, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.05566263, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.866823394820361, + "language_loss": 0.88030314, + "learning_rate": 3.97836641143877e-06, + "loss": 0.902834, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1255, + "time_per_iteration": 2.5579185485839844 + }, + { + "auxiliary_loss_clip": 0.01192242, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.04009795, + "balance_loss_mlp": 1.05518413, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.7574194703288544, + "language_loss": 0.79516619, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81773484, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.3671875, + "step": 1256, + "time_per_iteration": 2.4203784465789795 + }, + { + "auxiliary_loss_clip": 0.01074137, + "auxiliary_loss_mlp": 0.01007102, + "balance_loss_clip": 1.00378788, + "balance_loss_mlp": 1.01769829, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.8283025846018472, + "language_loss": 0.58016127, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60097361, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.5625, + "step": 1257, + "time_per_iteration": 3.1732118129730225 + }, + { + "auxiliary_loss_clip": 0.0119581, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.03272927, + "balance_loss_mlp": 1.05982757, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 3.1336739114125107, + "language_loss": 0.89859951, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92112058, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1258, + "time_per_iteration": 2.516925811767578 + }, + { + "auxiliary_loss_clip": 0.01192364, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.05663717, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.28312942247731, + "language_loss": 0.81211507, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458376, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.359375, + "step": 1259, + "time_per_iteration": 2.449533224105835 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.03593481, + "balance_loss_mlp": 1.05662787, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9172803769558988, + "language_loss": 0.75733984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.77986467, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.375, + "step": 1260, + "time_per_iteration": 2.5003559589385986 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03550828, + "balance_loss_mlp": 1.0552032, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8260195606442358, + "language_loss": 0.84695768, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86947775, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1261, + "time_per_iteration": 2.4633476734161377 + }, + { + "auxiliary_loss_clip": 0.01200376, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.03828108, + "balance_loss_mlp": 1.05969536, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.3160282321136334, + "language_loss": 0.8266682, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84928167, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.40625, + "step": 1262, + "time_per_iteration": 2.5256471633911133 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01061195, + "balance_loss_clip": 1.03703153, + "balance_loss_mlp": 1.0540688, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.4581964181262776, + "language_loss": 0.8255769, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84810972, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3828125, + "step": 1263, + "time_per_iteration": 2.470656633377075 + }, + { + "auxiliary_loss_clip": 0.01195735, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.05504882, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.324943057092889, + "language_loss": 0.7591399, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.40625, + "step": 1264, + "time_per_iteration": 2.4715359210968018 + }, + { + "auxiliary_loss_clip": 0.0119596, + "auxiliary_loss_mlp": 0.01062168, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.05711412, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.1997185871944356, + "language_loss": 0.81106204, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83364332, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.390625, + "step": 1265, + "time_per_iteration": 2.440000295639038 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03408241, + "balance_loss_mlp": 1.05631864, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.141616369936441, + "language_loss": 0.64935738, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67187923, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.390625, + "step": 1266, + "time_per_iteration": 2.495001792907715 + }, + { + "auxiliary_loss_clip": 0.01194799, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.05550349, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2514277899416606, + "language_loss": 0.79527593, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81783378, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.390625, + "step": 1267, + "time_per_iteration": 2.4763970375061035 + }, + { + "auxiliary_loss_clip": 0.01194511, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02975869, + "balance_loss_mlp": 1.05526185, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.2740159695832682, + "language_loss": 0.7253381, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74780059, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.390625, + "step": 1268, + "time_per_iteration": 3.8910977840423584 + }, + { + "auxiliary_loss_clip": 0.01192554, + "auxiliary_loss_mlp": 0.01057239, + "balance_loss_clip": 1.03447044, + "balance_loss_mlp": 1.05342031, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.163449384012833, + "language_loss": 0.81891817, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84141612, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.390625, + "step": 1269, + "time_per_iteration": 3.8643741607666016 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.05559695, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 3.2383492700687078, + "language_loss": 0.88135087, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90382218, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1270, + "time_per_iteration": 2.4746575355529785 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.0105921, + "balance_loss_clip": 1.03559494, + "balance_loss_mlp": 1.05707884, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.188682914143081, + "language_loss": 0.71113384, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73370755, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.4140625, + "step": 1271, + "time_per_iteration": 2.529632091522217 + }, + { + "auxiliary_loss_clip": 0.01189637, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_clip": 1.04351556, + "balance_loss_mlp": 1.05675423, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.9018984880968814, + "language_loss": 0.82745486, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85001469, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1272, + "time_per_iteration": 2.4950368404388428 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01061838, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.05351079, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0211474255264643, + "language_loss": 0.79951203, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82204533, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3828125, + "step": 1273, + "time_per_iteration": 2.490281105041504 + }, + { + "auxiliary_loss_clip": 0.01194744, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_clip": 1.03858376, + "balance_loss_mlp": 1.05600715, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.848359088284866, + "language_loss": 0.81545758, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83802712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1274, + "time_per_iteration": 2.499799966812134 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_clip": 1.04430115, + "balance_loss_mlp": 1.05469346, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.991418246716423, + "language_loss": 0.73099387, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75359869, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1275, + "time_per_iteration": 2.557973623275757 + }, + { + "auxiliary_loss_clip": 0.011939, + "auxiliary_loss_mlp": 0.01061514, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.05536842, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1093684912214545, + "language_loss": 0.79584897, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81840312, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.390625, + "step": 1276, + "time_per_iteration": 2.4329752922058105 + }, + { + "auxiliary_loss_clip": 0.01196395, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02838457, + "balance_loss_mlp": 1.05656397, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.623540269613024, + "language_loss": 0.59020305, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61268032, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3984375, + "step": 1277, + "time_per_iteration": 2.5318989753723145 + }, + { + "auxiliary_loss_clip": 0.01200985, + "auxiliary_loss_mlp": 0.01057464, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.05805659, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.2944749333347096, + "language_loss": 0.74846482, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77104926, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.4296875, + "step": 1278, + "time_per_iteration": 2.448615789413452 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_clip": 1.02943182, + "balance_loss_mlp": 1.05475163, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.0999470067777075, + "language_loss": 0.88656616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90898478, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1279, + "time_per_iteration": 2.4883790016174316 + }, + { + "auxiliary_loss_clip": 0.01189256, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_clip": 1.03973901, + "balance_loss_mlp": 1.05507362, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.4596954186847393, + "language_loss": 0.82899994, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1280, + "time_per_iteration": 2.459294319152832 + }, + { + "auxiliary_loss_clip": 0.01188755, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03874409, + "balance_loss_mlp": 1.05492759, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 1.9224222656998016, + "language_loss": 0.76059222, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78309786, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3359375, + "step": 1281, + "time_per_iteration": 2.453183650970459 + }, + { + "auxiliary_loss_clip": 0.0119548, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.05448353, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8937081587754587, + "language_loss": 0.75307631, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77557921, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1282, + "time_per_iteration": 2.4526116847991943 + }, + { + "auxiliary_loss_clip": 0.01190337, + "auxiliary_loss_mlp": 0.01070616, + "balance_loss_clip": 1.04734671, + "balance_loss_mlp": 1.054286, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0304459145795963, + "language_loss": 0.8428033, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86541283, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1283, + "time_per_iteration": 2.468101739883423 + }, + { + "auxiliary_loss_clip": 0.01192768, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.05560803, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.622403612740989, + "language_loss": 0.75031364, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77286887, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1284, + "time_per_iteration": 2.451749801635742 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.03947222, + "balance_loss_mlp": 1.05330253, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6448065546510353, + "language_loss": 0.75934827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78185129, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1285, + "time_per_iteration": 2.664769411087036 + }, + { + "auxiliary_loss_clip": 0.01196484, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.05862105, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8165785508620624, + "language_loss": 0.84204662, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86464012, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.375, + "step": 1286, + "time_per_iteration": 2.550670862197876 + }, + { + "auxiliary_loss_clip": 0.01196192, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.05582845, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 4.521300853065514, + "language_loss": 0.76725763, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1287, + "time_per_iteration": 2.455627918243408 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.03636777, + "balance_loss_mlp": 1.05476642, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6785000972571258, + "language_loss": 0.84509134, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86758095, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1288, + "time_per_iteration": 2.500218629837036 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.05364347, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.192533837519805, + "language_loss": 0.85769016, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88018203, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.390625, + "step": 1289, + "time_per_iteration": 2.4759440422058105 + }, + { + "auxiliary_loss_clip": 0.01189023, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.02563989, + "balance_loss_mlp": 1.05300641, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.8877463184856607, + "language_loss": 0.85053366, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87290049, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1290, + "time_per_iteration": 2.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.01185369, + "auxiliary_loss_mlp": 0.01059291, + "balance_loss_clip": 1.03541303, + "balance_loss_mlp": 1.05397463, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3980248629455834, + "language_loss": 0.90562832, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92807496, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3125, + "step": 1291, + "time_per_iteration": 2.4760262966156006 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01008288, + "balance_loss_clip": 1.00456893, + "balance_loss_mlp": 1.01656318, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.9429671936579762, + "language_loss": 0.64993972, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67073375, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.546875, + "step": 1292, + "time_per_iteration": 3.1508371829986572 + }, + { + "auxiliary_loss_clip": 0.0118873, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.03716707, + "balance_loss_mlp": 1.05293965, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7960778456946043, + "language_loss": 0.87610948, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89858699, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1293, + "time_per_iteration": 2.6359729766845703 + }, + { + "auxiliary_loss_clip": 0.01193413, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.05659533, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.312065886688882, + "language_loss": 0.85111046, + "learning_rate": 3.976081376263239e-06, + "loss": 0.873667, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3671875, + "step": 1294, + "time_per_iteration": 2.5151314735412598 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01054926, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.05702615, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.728225366024782, + "language_loss": 0.79202414, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81451285, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3671875, + "step": 1295, + "time_per_iteration": 2.459510326385498 + }, + { + "auxiliary_loss_clip": 0.01188808, + "auxiliary_loss_mlp": 0.01052849, + "balance_loss_clip": 1.02966261, + "balance_loss_mlp": 1.05383039, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.8222308711400834, + "language_loss": 0.88216382, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90458035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1296, + "time_per_iteration": 2.492892026901245 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01058016, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.05591464, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 3.2140473454082086, + "language_loss": 0.96160841, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98411804, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1297, + "time_per_iteration": 2.4668915271759033 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01054366, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.05289149, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.460261972702069, + "language_loss": 0.76087165, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78331399, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3671875, + "step": 1298, + "time_per_iteration": 2.5059781074523926 + }, + { + "auxiliary_loss_clip": 0.01192131, + "auxiliary_loss_mlp": 0.01061793, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.05696058, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.8752674736144914, + "language_loss": 0.80755305, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83009231, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3515625, + "step": 1299, + "time_per_iteration": 2.5036020278930664 + }, + { + "auxiliary_loss_clip": 0.01183493, + "auxiliary_loss_mlp": 0.01056623, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.05226159, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.1903498852009813, + "language_loss": 0.86459941, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88700056, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1300, + "time_per_iteration": 2.4866278171539307 + }, + { + "auxiliary_loss_clip": 0.0118988, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.0284245, + "balance_loss_mlp": 1.05393028, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.909902293479526, + "language_loss": 0.71778899, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74020839, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.359375, + "step": 1301, + "time_per_iteration": 2.6491336822509766 + }, + { + "auxiliary_loss_clip": 0.01196178, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.0586772, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5624281437346959, + "language_loss": 0.70860815, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1302, + "time_per_iteration": 2.635430335998535 + }, + { + "auxiliary_loss_clip": 0.01188946, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03702378, + "balance_loss_mlp": 1.05438161, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.152945758623263, + "language_loss": 0.8192755, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84177226, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.34375, + "step": 1303, + "time_per_iteration": 2.4861090183258057 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.05351233, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8425530042965788, + "language_loss": 0.7497822, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77228731, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1304, + "time_per_iteration": 2.464087963104248 + }, + { + "auxiliary_loss_clip": 0.01191658, + "auxiliary_loss_mlp": 0.0106237, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.05645108, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.696211405930565, + "language_loss": 0.76397038, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78651059, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.359375, + "step": 1305, + "time_per_iteration": 2.486093521118164 + }, + { + "auxiliary_loss_clip": 0.01192283, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.04087615, + "balance_loss_mlp": 1.05527782, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 2.2926357932273866, + "language_loss": 0.85035503, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87292361, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1306, + "time_per_iteration": 2.496265172958374 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.05652416, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.0575778567802976, + "language_loss": 0.90087706, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92322135, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.34375, + "step": 1307, + "time_per_iteration": 2.5122623443603516 + }, + { + "auxiliary_loss_clip": 0.01189263, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.03295124, + "balance_loss_mlp": 1.05417371, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8540925974151201, + "language_loss": 0.83408689, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85655046, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3515625, + "step": 1308, + "time_per_iteration": 2.4686944484710693 + }, + { + "auxiliary_loss_clip": 0.01186004, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.03177738, + "balance_loss_mlp": 1.05289674, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.6283340971904061, + "language_loss": 0.77841777, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80081415, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.328125, + "step": 1309, + "time_per_iteration": 5.444388151168823 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.05386913, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9656388899868151, + "language_loss": 0.80146122, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82402575, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.40625, + "step": 1310, + "time_per_iteration": 3.8553466796875 + }, + { + "auxiliary_loss_clip": 0.01185305, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.03067899, + "balance_loss_mlp": 1.05544043, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7115323272474947, + "language_loss": 0.73069102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75307012, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1311, + "time_per_iteration": 2.5299458503723145 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_clip": 1.04861844, + "balance_loss_mlp": 1.05650353, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9161215374898264, + "language_loss": 0.85871482, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88134789, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1312, + "time_per_iteration": 2.5490031242370605 + }, + { + "auxiliary_loss_clip": 0.01186476, + "auxiliary_loss_mlp": 0.01059916, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.0555284, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7542323177910393, + "language_loss": 0.81968379, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84214771, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3125, + "step": 1313, + "time_per_iteration": 2.507046699523926 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105996, + "balance_loss_clip": 1.03528404, + "balance_loss_mlp": 1.05271506, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.109477065223649, + "language_loss": 0.73372161, + "learning_rate": 3.97486534441264e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3984375, + "step": 1314, + "time_per_iteration": 2.4396395683288574 + }, + { + "auxiliary_loss_clip": 0.01185115, + "auxiliary_loss_mlp": 0.01058505, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.05120206, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.579996187361532, + "language_loss": 0.79460657, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81704271, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.34375, + "step": 1315, + "time_per_iteration": 2.493365526199341 + }, + { + "auxiliary_loss_clip": 0.011877, + "auxiliary_loss_mlp": 0.01060931, + "balance_loss_clip": 1.03592062, + "balance_loss_mlp": 1.05232, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9411836832725016, + "language_loss": 0.73614991, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1316, + "time_per_iteration": 2.4696316719055176 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.03940618, + "balance_loss_mlp": 1.05415511, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.862910173072837, + "language_loss": 0.65148681, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67404836, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.390625, + "step": 1317, + "time_per_iteration": 2.447847843170166 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.05774999, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.3478172138868967, + "language_loss": 0.7324174, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75502789, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1318, + "time_per_iteration": 2.497406482696533 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01057875, + "balance_loss_clip": 1.03557122, + "balance_loss_mlp": 1.05335736, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.92969491679129, + "language_loss": 0.90610284, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92856491, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3515625, + "step": 1319, + "time_per_iteration": 2.5007200241088867 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01054126, + "balance_loss_clip": 1.03086793, + "balance_loss_mlp": 1.05155873, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.95797867210378, + "language_loss": 0.79765761, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82008684, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1320, + "time_per_iteration": 2.4683783054351807 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.05700457, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6163787894008363, + "language_loss": 0.69574934, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71822894, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.34375, + "step": 1321, + "time_per_iteration": 2.466911554336548 + }, + { + "auxiliary_loss_clip": 0.01184231, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.03313756, + "balance_loss_mlp": 1.05313718, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.926313653502779, + "language_loss": 0.83559513, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.857997, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1322, + "time_per_iteration": 2.465885639190674 + }, + { + "auxiliary_loss_clip": 0.01188233, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.03544521, + "balance_loss_mlp": 1.05104756, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8863777031262867, + "language_loss": 0.90437615, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92684615, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1323, + "time_per_iteration": 2.465841293334961 + }, + { + "auxiliary_loss_clip": 0.0118735, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.05414796, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.6454981938510795, + "language_loss": 0.82583225, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84827733, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.328125, + "step": 1324, + "time_per_iteration": 2.475486993789673 + }, + { + "auxiliary_loss_clip": 0.01188398, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0255841, + "balance_loss_mlp": 1.05264676, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.416918252865386, + "language_loss": 0.79654729, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81892562, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.359375, + "step": 1325, + "time_per_iteration": 2.482555389404297 + }, + { + "auxiliary_loss_clip": 0.01190127, + "auxiliary_loss_mlp": 0.01064919, + "balance_loss_clip": 1.03989661, + "balance_loss_mlp": 1.05474687, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.170521767048619, + "language_loss": 0.8812806, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90383106, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1326, + "time_per_iteration": 2.466742753982544 + }, + { + "auxiliary_loss_clip": 0.01182901, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.02823424, + "balance_loss_mlp": 1.05014396, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.3992518634606164, + "language_loss": 0.83013594, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8524875, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.328125, + "step": 1327, + "time_per_iteration": 2.4989237785339355 + }, + { + "auxiliary_loss_clip": 0.0119143, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.03013575, + "balance_loss_mlp": 1.05436027, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.1664091533416587, + "language_loss": 0.78452092, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80697763, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.375, + "step": 1328, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01191637, + "auxiliary_loss_mlp": 0.01053331, + "balance_loss_clip": 1.02969217, + "balance_loss_mlp": 1.05460131, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.484533735051083, + "language_loss": 0.74277186, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76522154, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.375, + "step": 1329, + "time_per_iteration": 2.425388813018799 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.05096054, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.5753219993175995, + "language_loss": 0.81090498, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83336312, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3515625, + "step": 1330, + "time_per_iteration": 2.4831247329711914 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.03924823, + "balance_loss_mlp": 1.05348384, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.322034822225311, + "language_loss": 0.88790143, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91043401, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1331, + "time_per_iteration": 2.4410722255706787 + }, + { + "auxiliary_loss_clip": 0.01193336, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.03414834, + "balance_loss_mlp": 1.05288279, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.577873328737783, + "language_loss": 0.73332524, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75584114, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.40625, + "step": 1332, + "time_per_iteration": 2.6054465770721436 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.05179858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.9568005204239032, + "language_loss": 0.82994795, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1333, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01187412, + "auxiliary_loss_mlp": 0.01055323, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.05115032, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.7771179443818466, + "language_loss": 0.74698973, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76941711, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1334, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01187182, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.05457497, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.0216765528325635, + "language_loss": 0.80279201, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82527244, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1335, + "time_per_iteration": 2.538670301437378 + }, + { + "auxiliary_loss_clip": 0.01078994, + "auxiliary_loss_mlp": 0.01011272, + "balance_loss_clip": 1.00802934, + "balance_loss_mlp": 1.02308655, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7427722697577622, + "language_loss": 0.56020629, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58110893, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.5625, + "step": 1336, + "time_per_iteration": 3.125026226043701 + }, + { + "auxiliary_loss_clip": 0.01188939, + "auxiliary_loss_mlp": 0.01054834, + "balance_loss_clip": 1.0331738, + "balance_loss_mlp": 1.05371606, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.050916847484745, + "language_loss": 0.67764497, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70008272, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3515625, + "step": 1337, + "time_per_iteration": 2.506103038787842 + }, + { + "auxiliary_loss_clip": 0.01188826, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.04313135, + "balance_loss_mlp": 1.05480385, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8692893317328456, + "language_loss": 0.86701488, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88955414, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1338, + "time_per_iteration": 2.5451908111572266 + }, + { + "auxiliary_loss_clip": 0.01188004, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.0368793, + "balance_loss_mlp": 1.05142283, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.6265473040924725, + "language_loss": 0.87246621, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89494807, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.359375, + "step": 1339, + "time_per_iteration": 2.450932502746582 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02660179, + "balance_loss_mlp": 1.05106449, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.428441908593999, + "language_loss": 0.88819683, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91048771, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1340, + "time_per_iteration": 2.4539895057678223 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 0.99951285, + "balance_loss_mlp": 1.01727247, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8886760882983712, + "language_loss": 0.64806795, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66882515, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.5546875, + "step": 1341, + "time_per_iteration": 3.0034360885620117 + }, + { + "auxiliary_loss_clip": 0.01193907, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03577328, + "balance_loss_mlp": 1.05301166, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.817345215565239, + "language_loss": 0.89616883, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91871732, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1342, + "time_per_iteration": 2.479701042175293 + }, + { + "auxiliary_loss_clip": 0.01194936, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.05721259, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.7453135307928216, + "language_loss": 0.76378155, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78631246, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.375, + "step": 1343, + "time_per_iteration": 2.4969120025634766 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01008036, + "balance_loss_clip": 1.00446022, + "balance_loss_mlp": 1.01791215, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8963318804352591, + "language_loss": 0.57395822, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59476054, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.54296875, + "step": 1344, + "time_per_iteration": 2.9917871952056885 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.0105771, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.05523396, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.677643541218582, + "language_loss": 0.86665964, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88914657, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1345, + "time_per_iteration": 2.4601447582244873 + }, + { + "auxiliary_loss_clip": 0.01187459, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.05403256, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7098835991166323, + "language_loss": 0.87242532, + "learning_rate": 3.972857395313042e-06, + "loss": 0.894849, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1346, + "time_per_iteration": 2.4809892177581787 + }, + { + "auxiliary_loss_clip": 0.01185898, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.03256202, + "balance_loss_mlp": 1.05219567, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6659805361601863, + "language_loss": 0.92606491, + "learning_rate": 3.972793412113439e-06, + "loss": 0.94847363, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3359375, + "step": 1347, + "time_per_iteration": 2.4802379608154297 + }, + { + "auxiliary_loss_clip": 0.0118757, + "auxiliary_loss_mlp": 0.01057822, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.05471659, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 9.453605004454174, + "language_loss": 0.89181751, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91427147, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.328125, + "step": 1348, + "time_per_iteration": 2.4610300064086914 + }, + { + "auxiliary_loss_clip": 0.01185296, + "auxiliary_loss_mlp": 0.01056008, + "balance_loss_clip": 1.03420484, + "balance_loss_mlp": 1.05543983, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 2.4916215003739355, + "language_loss": 0.76796132, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7903744, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.296875, + "step": 1349, + "time_per_iteration": 2.4789178371429443 + }, + { + "auxiliary_loss_clip": 0.01187103, + "auxiliary_loss_mlp": 0.0105974, + "balance_loss_clip": 1.03669679, + "balance_loss_mlp": 1.05236626, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.126949034470324, + "language_loss": 0.88571703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90818548, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.34375, + "step": 1350, + "time_per_iteration": 2.43094539642334 + }, + { + "auxiliary_loss_clip": 0.01184059, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.05228257, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.044220866897066, + "language_loss": 0.82058489, + "learning_rate": 3.972536731254092e-06, + "loss": 0.843036, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1351, + "time_per_iteration": 6.688653469085693 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.02917862, + "balance_loss_mlp": 1.04863417, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.9894600711485977, + "language_loss": 0.75347674, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77585584, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.359375, + "step": 1352, + "time_per_iteration": 2.4888412952423096 + }, + { + "auxiliary_loss_clip": 0.01192461, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.03163338, + "balance_loss_mlp": 1.05483341, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.7603053493114211, + "language_loss": 0.82833469, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85081488, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1353, + "time_per_iteration": 2.522960901260376 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01010471, + "balance_loss_clip": 1.00694275, + "balance_loss_mlp": 1.01996851, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8931676068679675, + "language_loss": 0.5970993, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61793786, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.53125, + "step": 1354, + "time_per_iteration": 3.0639474391937256 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.03764629, + "balance_loss_mlp": 1.05431724, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7981329827127455, + "language_loss": 0.82785606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85033101, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1355, + "time_per_iteration": 2.4664132595062256 + }, + { + "auxiliary_loss_clip": 0.01186535, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.03619206, + "balance_loss_mlp": 1.05146575, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9123465925299232, + "language_loss": 0.70799643, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73048234, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3515625, + "step": 1356, + "time_per_iteration": 2.509061813354492 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.03169644, + "balance_loss_mlp": 1.05148005, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.53580294551395, + "language_loss": 0.70255458, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72499657, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3671875, + "step": 1357, + "time_per_iteration": 2.476951837539673 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_clip": 1.03067684, + "balance_loss_mlp": 1.05488217, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.6163823683714953, + "language_loss": 0.84186697, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86431682, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1358, + "time_per_iteration": 2.457376480102539 + }, + { + "auxiliary_loss_clip": 0.01190093, + "auxiliary_loss_mlp": 0.01056216, + "balance_loss_clip": 1.0310626, + "balance_loss_mlp": 1.05484545, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9894839389786314, + "language_loss": 1.02294087, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04540396, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3515625, + "step": 1359, + "time_per_iteration": 2.4723212718963623 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03344035, + "balance_loss_mlp": 1.0511415, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0666688933075963, + "language_loss": 0.82969773, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85212988, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1360, + "time_per_iteration": 2.5143508911132812 + }, + { + "auxiliary_loss_clip": 0.01190184, + "auxiliary_loss_mlp": 0.01062181, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.05335808, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.14797754608813, + "language_loss": 0.72352278, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74604642, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3671875, + "step": 1361, + "time_per_iteration": 2.458034038543701 + }, + { + "auxiliary_loss_clip": 0.01179057, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.04741335, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.8589819193374515, + "language_loss": 0.76781029, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79017377, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.3125, + "step": 1362, + "time_per_iteration": 2.472259759902954 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.0291419, + "balance_loss_mlp": 1.05449164, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.631594675791475, + "language_loss": 0.72409523, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74649096, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1363, + "time_per_iteration": 2.4447264671325684 + }, + { + "auxiliary_loss_clip": 0.01189235, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.03603828, + "balance_loss_mlp": 1.05607057, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 3.9166951523525464, + "language_loss": 0.77459586, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79710352, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.328125, + "step": 1364, + "time_per_iteration": 2.475141763687134 + }, + { + "auxiliary_loss_clip": 0.01190144, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.03745019, + "balance_loss_mlp": 1.05500793, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.6241179536013033, + "language_loss": 0.82025397, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1365, + "time_per_iteration": 2.493732452392578 + }, + { + "auxiliary_loss_clip": 0.0118713, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.05614781, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.3261283913074884, + "language_loss": 0.82173789, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84418333, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1366, + "time_per_iteration": 2.4809322357177734 + }, + { + "auxiliary_loss_clip": 0.01186928, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.05126381, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.8403828718649033, + "language_loss": 0.81534755, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83780599, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1367, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.03724277, + "balance_loss_mlp": 1.05413651, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3540874203263358, + "language_loss": 0.83644414, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85897589, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3828125, + "step": 1368, + "time_per_iteration": 2.453547716140747 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02694988, + "balance_loss_mlp": 1.05349994, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7360129433802456, + "language_loss": 0.81245828, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83476603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.28125, + "step": 1369, + "time_per_iteration": 2.527573585510254 + }, + { + "auxiliary_loss_clip": 0.01185735, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.02979898, + "balance_loss_mlp": 1.05528903, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.240857135161324, + "language_loss": 0.74790901, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77027786, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3046875, + "step": 1370, + "time_per_iteration": 2.5205185413360596 + }, + { + "auxiliary_loss_clip": 0.01189372, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.05480862, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6313231263601415, + "language_loss": 0.74633086, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76883852, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1371, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.01188254, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.05410123, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.0830704741847423, + "language_loss": 0.71080554, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73330408, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.34375, + "step": 1372, + "time_per_iteration": 2.574457883834839 + }, + { + "auxiliary_loss_clip": 0.0118845, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02750254, + "balance_loss_mlp": 1.05397415, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 3.137320584176607, + "language_loss": 0.88010907, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90251154, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.34375, + "step": 1373, + "time_per_iteration": 2.485727310180664 + }, + { + "auxiliary_loss_clip": 0.01186594, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.05331743, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7727067520163604, + "language_loss": 0.82349706, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84595209, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.328125, + "step": 1374, + "time_per_iteration": 2.5223724842071533 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.0032891, + "balance_loss_mlp": 1.02371156, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8248734910296001, + "language_loss": 0.60630989, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62714875, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.5234375, + "step": 1375, + "time_per_iteration": 3.0909183025360107 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.01006319, + "balance_loss_clip": 1.00221813, + "balance_loss_mlp": 1.02162504, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9071425511101782, + "language_loss": 0.62149519, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64230067, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.52734375, + "step": 1376, + "time_per_iteration": 2.991158962249756 + }, + { + "auxiliary_loss_clip": 0.01195866, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_clip": 1.04624534, + "balance_loss_mlp": 1.05995989, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9826192893196872, + "language_loss": 0.82601643, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84866917, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.359375, + "step": 1377, + "time_per_iteration": 2.5851728916168213 + }, + { + "auxiliary_loss_clip": 0.01188463, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.05601847, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.8760965133588865, + "language_loss": 0.84516692, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86762691, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1378, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.05516553, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.9551783234852504, + "language_loss": 0.87725681, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89978123, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3515625, + "step": 1379, + "time_per_iteration": 2.5428385734558105 + }, + { + "auxiliary_loss_clip": 0.01189534, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.03336358, + "balance_loss_mlp": 1.05776525, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 1.7573789229703745, + "language_loss": 0.78658688, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80904275, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1380, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.01061393, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.05878401, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2395713763978002, + "language_loss": 0.86146504, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88398302, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.3125, + "step": 1381, + "time_per_iteration": 2.470153331756592 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01060106, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.06063581, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.795546136319442, + "language_loss": 0.8817445, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90433335, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1382, + "time_per_iteration": 2.4352822303771973 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.0569818, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.6234570747150734, + "language_loss": 0.77606535, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79856908, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.34375, + "step": 1383, + "time_per_iteration": 2.45939040184021 + }, + { + "auxiliary_loss_clip": 0.01194291, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.05730414, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.1508484512905945, + "language_loss": 0.8293128, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85181862, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1384, + "time_per_iteration": 2.4773356914520264 + }, + { + "auxiliary_loss_clip": 0.01198678, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.02937245, + "balance_loss_mlp": 1.05890989, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.4890613364481893, + "language_loss": 0.84828049, + "learning_rate": 3.970306639845e-06, + "loss": 0.87081897, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3984375, + "step": 1385, + "time_per_iteration": 2.5084009170532227 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01066074, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.05825758, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.123672194513448, + "language_loss": 0.68744183, + "learning_rate": 3.970239740938835e-06, + "loss": 0.7100516, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3671875, + "step": 1386, + "time_per_iteration": 2.477592945098877 + }, + { + "auxiliary_loss_clip": 0.01191265, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03186047, + "balance_loss_mlp": 1.05579662, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7726596290820096, + "language_loss": 0.82067239, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84314626, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.359375, + "step": 1387, + "time_per_iteration": 2.529261350631714 + }, + { + "auxiliary_loss_clip": 0.01196512, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.05739772, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.385304875072474, + "language_loss": 0.77194649, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79461324, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.390625, + "step": 1388, + "time_per_iteration": 2.4517693519592285 + }, + { + "auxiliary_loss_clip": 0.01187734, + "auxiliary_loss_mlp": 0.01059717, + "balance_loss_clip": 1.0351125, + "balance_loss_mlp": 1.0574429, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.246368739161805, + "language_loss": 0.79078835, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81326282, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3046875, + "step": 1389, + "time_per_iteration": 2.4999983310699463 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.03368866, + "balance_loss_mlp": 1.05773938, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 4.533904477221136, + "language_loss": 0.87495124, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89746046, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.359375, + "step": 1390, + "time_per_iteration": 2.438126802444458 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.05621624, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6928828016377326, + "language_loss": 0.86753631, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89007682, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.34375, + "step": 1391, + "time_per_iteration": 2.5615429878234863 + }, + { + "auxiliary_loss_clip": 0.01198327, + "auxiliary_loss_mlp": 0.01071606, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.05904424, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 4.090701354718017, + "language_loss": 0.87550449, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89820385, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1392, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.03682983, + "balance_loss_mlp": 1.05556941, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.9857894096842457, + "language_loss": 0.80519998, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82771087, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1393, + "time_per_iteration": 3.9978342056274414 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01054176, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.05832088, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8413427873168604, + "language_loss": 0.84738398, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86984503, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3359375, + "step": 1394, + "time_per_iteration": 3.995389461517334 + }, + { + "auxiliary_loss_clip": 0.01193271, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.0296433, + "balance_loss_mlp": 1.05856824, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.7688902284368797, + "language_loss": 0.82957625, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85204601, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1395, + "time_per_iteration": 2.5080416202545166 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.03683722, + "balance_loss_mlp": 1.05833054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9626395114639965, + "language_loss": 0.82492781, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84750068, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3671875, + "step": 1396, + "time_per_iteration": 2.51763653755188 + }, + { + "auxiliary_loss_clip": 0.01191589, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.03253114, + "balance_loss_mlp": 1.05944824, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3756879295671367, + "language_loss": 0.7702114, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79271495, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.3203125, + "step": 1397, + "time_per_iteration": 2.522019624710083 + }, + { + "auxiliary_loss_clip": 0.01191257, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02635193, + "balance_loss_mlp": 1.05688787, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1333990758799795, + "language_loss": 0.77589226, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79831308, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.34375, + "step": 1398, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.03366995, + "balance_loss_mlp": 1.05604136, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 6.547707007931562, + "language_loss": 0.94411373, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96655744, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3125, + "step": 1399, + "time_per_iteration": 2.458564043045044 + }, + { + "auxiliary_loss_clip": 0.01192876, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.05564523, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3313569082148637, + "language_loss": 0.82052553, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84306407, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1400, + "time_per_iteration": 2.511075258255005 + }, + { + "auxiliary_loss_clip": 0.01191821, + "auxiliary_loss_mlp": 0.01061122, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.05681479, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.6029570836648723, + "language_loss": 0.86615682, + "learning_rate": 3.969227293371099e-06, + "loss": 0.8886863, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1401, + "time_per_iteration": 2.5328855514526367 + }, + { + "auxiliary_loss_clip": 0.01190636, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.05496573, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2778357332658543, + "language_loss": 0.87128234, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89382625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1402, + "time_per_iteration": 2.4695520401000977 + }, + { + "auxiliary_loss_clip": 0.0118725, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.03340352, + "balance_loss_mlp": 1.0542388, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.954964391273458, + "language_loss": 0.88680542, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90924418, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.328125, + "step": 1403, + "time_per_iteration": 2.6655161380767822 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01056388, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.05429792, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.9645692036725415, + "language_loss": 0.80325729, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82571673, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1404, + "time_per_iteration": 2.5011603832244873 + }, + { + "auxiliary_loss_clip": 0.01195719, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_clip": 1.04089534, + "balance_loss_mlp": 1.05798006, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1059643070764027, + "language_loss": 0.83845061, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86106849, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1405, + "time_per_iteration": 2.4612858295440674 + }, + { + "auxiliary_loss_clip": 0.01188265, + "auxiliary_loss_mlp": 0.01056168, + "balance_loss_clip": 1.03314888, + "balance_loss_mlp": 1.05381966, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.7581309060245893, + "language_loss": 0.80343008, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82587439, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.34375, + "step": 1406, + "time_per_iteration": 2.496676206588745 + }, + { + "auxiliary_loss_clip": 0.01192497, + "auxiliary_loss_mlp": 0.01065969, + "balance_loss_clip": 1.0421989, + "balance_loss_mlp": 1.05858994, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8241253914082192, + "language_loss": 0.79411483, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3359375, + "step": 1407, + "time_per_iteration": 2.491055727005005 + }, + { + "auxiliary_loss_clip": 0.01188371, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.03234673, + "balance_loss_mlp": 1.05521655, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 4.541456574357825, + "language_loss": 0.91929626, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94173807, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.328125, + "step": 1408, + "time_per_iteration": 2.44599986076355 + }, + { + "auxiliary_loss_clip": 0.01074059, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.02074611, + "balance_loss_mlp": 1.02193737, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8980094129226197, + "language_loss": 0.61861706, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63960779, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.5234375, + "step": 1409, + "time_per_iteration": 3.1084799766540527 + }, + { + "auxiliary_loss_clip": 0.01180993, + "auxiliary_loss_mlp": 0.01060196, + "balance_loss_clip": 1.03784466, + "balance_loss_mlp": 1.05419254, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.25814404402445, + "language_loss": 0.86819237, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89060426, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.265625, + "step": 1410, + "time_per_iteration": 2.4854791164398193 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.0309782, + "balance_loss_mlp": 1.05453801, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.048224684561652, + "language_loss": 0.74138093, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76383173, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3359375, + "step": 1411, + "time_per_iteration": 2.484879970550537 + }, + { + "auxiliary_loss_clip": 0.01068033, + "auxiliary_loss_mlp": 0.01005767, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.01640451, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9041737870208939, + "language_loss": 0.56723791, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58797586, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.515625, + "step": 1412, + "time_per_iteration": 3.003227949142456 + }, + { + "auxiliary_loss_clip": 0.01183878, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 1.05354273, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.0338814511208883, + "language_loss": 0.89084172, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91330159, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3046875, + "step": 1413, + "time_per_iteration": 2.4545698165893555 + }, + { + "auxiliary_loss_clip": 0.01186591, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03019929, + "balance_loss_mlp": 1.0562067, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1859301398641415, + "language_loss": 0.8807795, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90319026, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3046875, + "step": 1414, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01056984, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.0540117, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.82577143383273, + "language_loss": 0.77434587, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79677355, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3125, + "step": 1415, + "time_per_iteration": 2.510671615600586 + }, + { + "auxiliary_loss_clip": 0.01185616, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.05612898, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.848223104879299, + "language_loss": 0.70859981, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73111296, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.296875, + "step": 1416, + "time_per_iteration": 2.827016592025757 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.05693281, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 1.9370001986884609, + "language_loss": 0.74855268, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77108514, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1417, + "time_per_iteration": 2.51518177986145 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.03157723, + "balance_loss_mlp": 1.05394006, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.566029486363868, + "language_loss": 0.82460356, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84700227, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3046875, + "step": 1418, + "time_per_iteration": 2.4632515907287598 + }, + { + "auxiliary_loss_clip": 0.01078096, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.02221191, + "balance_loss_mlp": 1.0269177, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8662062784105238, + "language_loss": 0.56616145, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58720386, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.51171875, + "step": 1419, + "time_per_iteration": 3.0262646675109863 + }, + { + "auxiliary_loss_clip": 0.01185611, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_clip": 1.03858972, + "balance_loss_mlp": 1.05284262, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.301787344693911, + "language_loss": 0.69764268, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72012818, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.328125, + "step": 1420, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01182824, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02912498, + "balance_loss_mlp": 1.05232763, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.56579546013663, + "language_loss": 0.87886292, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90121067, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1421, + "time_per_iteration": 2.498198986053467 + }, + { + "auxiliary_loss_clip": 0.01069987, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.01909983, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7935144939089421, + "language_loss": 0.63490081, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65564084, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.5078125, + "step": 1422, + "time_per_iteration": 3.050874948501587 + }, + { + "auxiliary_loss_clip": 0.01182797, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.04182768, + "balance_loss_mlp": 1.05538559, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.040119561169685, + "language_loss": 0.83427018, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85674852, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1423, + "time_per_iteration": 2.525075674057007 + }, + { + "auxiliary_loss_clip": 0.01190455, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.05613029, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.7627385415604107, + "language_loss": 0.74945033, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77194929, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1424, + "time_per_iteration": 2.523231029510498 + }, + { + "auxiliary_loss_clip": 0.01185893, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.05510807, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9395290082560723, + "language_loss": 0.7574805, + "learning_rate": 3.96757243383196e-06, + "loss": 0.7799021, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1425, + "time_per_iteration": 2.441420793533325 + }, + { + "auxiliary_loss_clip": 0.01183386, + "auxiliary_loss_mlp": 0.01053965, + "balance_loss_clip": 1.03092194, + "balance_loss_mlp": 1.05407834, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.579491371045568, + "language_loss": 0.93504989, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95742333, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1426, + "time_per_iteration": 2.4703657627105713 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.05764198, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.235647808517122, + "language_loss": 0.75003266, + "learning_rate": 3.967432588494471e-06, + "loss": 0.772614, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.34375, + "step": 1427, + "time_per_iteration": 2.4430549144744873 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01061112, + "balance_loss_clip": 1.03907049, + "balance_loss_mlp": 1.05315089, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.3372587699614726, + "language_loss": 0.81915152, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84158677, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1428, + "time_per_iteration": 2.454441785812378 + }, + { + "auxiliary_loss_clip": 0.01189987, + "auxiliary_loss_mlp": 0.01066735, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.05586076, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.395570851050941, + "language_loss": 0.79697371, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81954098, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.34375, + "step": 1429, + "time_per_iteration": 2.5411579608917236 + }, + { + "auxiliary_loss_clip": 0.0119024, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.05773449, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.301464625204156, + "language_loss": 0.88055587, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90308148, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1430, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01184535, + "auxiliary_loss_mlp": 0.01072949, + "balance_loss_clip": 1.04995334, + "balance_loss_mlp": 1.05712664, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7504719201320615, + "language_loss": 0.81914723, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84172201, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2734375, + "step": 1431, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01184756, + "auxiliary_loss_mlp": 0.01056491, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.05376828, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.9949655353101803, + "language_loss": 0.77759397, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80000651, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1432, + "time_per_iteration": 2.5344104766845703 + }, + { + "auxiliary_loss_clip": 0.0118072, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.03497803, + "balance_loss_mlp": 1.05027151, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.2873036973179603, + "language_loss": 0.73330259, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75570011, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3046875, + "step": 1433, + "time_per_iteration": 2.4787938594818115 + }, + { + "auxiliary_loss_clip": 0.01188497, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.05464733, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.615593579271415, + "language_loss": 0.85741955, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87989259, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3359375, + "step": 1434, + "time_per_iteration": 5.500946998596191 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.05177212, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 3.0513138823403825, + "language_loss": 0.78913063, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81149966, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1435, + "time_per_iteration": 3.899777412414551 + }, + { + "auxiliary_loss_clip": 0.01070575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02010655, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8910926846424677, + "language_loss": 0.57930011, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60028332, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.5078125, + "step": 1436, + "time_per_iteration": 3.179255247116089 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.02633083, + "balance_loss_mlp": 1.05314159, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.429993259280604, + "language_loss": 0.68775386, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71010828, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.328125, + "step": 1437, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.01185365, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.02806163, + "balance_loss_mlp": 1.05388093, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.5641138848438163, + "language_loss": 0.7274068, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74976349, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3125, + "step": 1438, + "time_per_iteration": 2.4840176105499268 + }, + { + "auxiliary_loss_clip": 0.01183596, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_clip": 1.02594447, + "balance_loss_mlp": 1.05472374, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.681614476681305, + "language_loss": 0.64628494, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66861117, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2890625, + "step": 1439, + "time_per_iteration": 2.61686372756958 + }, + { + "auxiliary_loss_clip": 0.01187197, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.05638909, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.062065757985673, + "language_loss": 0.87748063, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89990479, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3125, + "step": 1440, + "time_per_iteration": 2.5116493701934814 + }, + { + "auxiliary_loss_clip": 0.01188419, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.03538251, + "balance_loss_mlp": 1.0540843, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.4102507257620363, + "language_loss": 0.83243793, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491961, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1441, + "time_per_iteration": 2.5058300495147705 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01009256, + "balance_loss_clip": 1.00525022, + "balance_loss_mlp": 1.01939523, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8461220926791603, + "language_loss": 0.60426581, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62505859, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.5078125, + "step": 1442, + "time_per_iteration": 3.1946628093719482 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01057232, + "balance_loss_clip": 1.03379524, + "balance_loss_mlp": 1.05709028, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.2809405592870835, + "language_loss": 0.79264277, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81513512, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.3515625, + "step": 1443, + "time_per_iteration": 2.477691411972046 + }, + { + "auxiliary_loss_clip": 0.01185255, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.03170311, + "balance_loss_mlp": 1.05261874, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.764762918327591, + "language_loss": 0.82248437, + "learning_rate": 3.966231856532584e-06, + "loss": 0.8448779, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1444, + "time_per_iteration": 2.584773063659668 + }, + { + "auxiliary_loss_clip": 0.01189581, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.02745867, + "balance_loss_mlp": 1.05537939, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.945627197742621, + "language_loss": 0.86856627, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89096129, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1445, + "time_per_iteration": 2.506258964538574 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01054326, + "balance_loss_clip": 1.03303528, + "balance_loss_mlp": 1.05808067, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9763924186655837, + "language_loss": 0.81639445, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8388319, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.3125, + "step": 1446, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.01005416, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.0147202, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.728477241136595, + "language_loss": 0.54725462, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56795579, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.5, + "step": 1447, + "time_per_iteration": 3.1009976863861084 + }, + { + "auxiliary_loss_clip": 0.01178637, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.03104973, + "balance_loss_mlp": 1.05198455, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.2332818090387243, + "language_loss": 0.84593046, + "learning_rate": 3.965946199367804e-06, + "loss": 0.8682456, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1448, + "time_per_iteration": 2.483792543411255 + }, + { + "auxiliary_loss_clip": 0.01185215, + "auxiliary_loss_mlp": 0.01056358, + "balance_loss_clip": 1.03386295, + "balance_loss_mlp": 1.0524509, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.099884448391289, + "language_loss": 0.80688727, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82930297, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1449, + "time_per_iteration": 2.4637081623077393 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.02862501, + "balance_loss_mlp": 1.05370414, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 4.183651889411507, + "language_loss": 0.71012592, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73244655, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1450, + "time_per_iteration": 2.6521542072296143 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.03057098, + "balance_loss_mlp": 1.05502534, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.8266796466048172, + "language_loss": 0.83492875, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85729253, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1451, + "time_per_iteration": 2.4866271018981934 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.05371869, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.850339391564711, + "language_loss": 0.74351519, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76589811, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2734375, + "step": 1452, + "time_per_iteration": 2.5450925827026367 + }, + { + "auxiliary_loss_clip": 0.01182798, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_clip": 1.03840256, + "balance_loss_mlp": 1.05121017, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.3421371051734474, + "language_loss": 0.79840016, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82084292, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1453, + "time_per_iteration": 2.49350643157959 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_clip": 1.04213262, + "balance_loss_mlp": 1.0545752, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.982640213979625, + "language_loss": 0.71298045, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73545539, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.28125, + "step": 1454, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02608728, + "balance_loss_mlp": 1.02026677, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7993884765543664, + "language_loss": 0.58655661, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6075514, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.0300293, + "router_z_loss_mlp": 0.5, + "step": 1455, + "time_per_iteration": 3.088113307952881 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05210626, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5590098662562957, + "language_loss": 0.77404714, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79646254, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3046875, + "step": 1456, + "time_per_iteration": 2.6145191192626953 + }, + { + "auxiliary_loss_clip": 0.01182283, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.02888715, + "balance_loss_mlp": 1.05235434, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.3657198267749777, + "language_loss": 0.72391665, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74625528, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1457, + "time_per_iteration": 2.6438605785369873 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.02623844, + "balance_loss_mlp": 1.05207849, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5929331180335078, + "language_loss": 0.86215973, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88442671, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1458, + "time_per_iteration": 2.539658546447754 + }, + { + "auxiliary_loss_clip": 0.01189161, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.05887103, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.660016084678777, + "language_loss": 0.80662763, + "learning_rate": 3.965154492406486e-06, + "loss": 0.8291173, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1459, + "time_per_iteration": 2.4880902767181396 + }, + { + "auxiliary_loss_clip": 0.01187526, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.05512893, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.474003232718447, + "language_loss": 0.84058738, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86300415, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.328125, + "step": 1460, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01178547, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.05051732, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.696872821623283, + "language_loss": 0.81030595, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83263445, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.28125, + "step": 1461, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.01187345, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_clip": 1.03795433, + "balance_loss_mlp": 1.05579305, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.656505593412751, + "language_loss": 0.76405656, + "learning_rate": 3.964937007276932e-06, + "loss": 0.786529, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3125, + "step": 1462, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01190578, + "auxiliary_loss_mlp": 0.01058183, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.05753493, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.4277854967530663, + "language_loss": 0.74615479, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76864231, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.328125, + "step": 1463, + "time_per_iteration": 2.46510648727417 + }, + { + "auxiliary_loss_clip": 0.01189177, + "auxiliary_loss_mlp": 0.0106376, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.05380559, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.09054267836168, + "language_loss": 0.83423382, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85676318, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3515625, + "step": 1464, + "time_per_iteration": 2.5343735218048096 + }, + { + "auxiliary_loss_clip": 0.01183588, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.05336595, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 4.267071209901202, + "language_loss": 0.78351951, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80604541, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.296875, + "step": 1465, + "time_per_iteration": 2.4745209217071533 + }, + { + "auxiliary_loss_clip": 0.01190864, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_clip": 1.0371089, + "balance_loss_mlp": 1.05628061, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8950228405880263, + "language_loss": 0.84698099, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.86948144, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.34375, + "step": 1466, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.0105874, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.05407715, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 3.8136580791310783, + "language_loss": 0.84233636, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86477506, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1467, + "time_per_iteration": 2.5413413047790527 + }, + { + "auxiliary_loss_clip": 0.01183856, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7481416698073104, + "language_loss": 0.75517243, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7775712, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1468, + "time_per_iteration": 2.496363878250122 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.05570245, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.7579385887345491, + "language_loss": 0.80601043, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82842672, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2890625, + "step": 1469, + "time_per_iteration": 2.5486512184143066 + }, + { + "auxiliary_loss_clip": 0.01187777, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.0321182, + "balance_loss_mlp": 1.05454695, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 3.202810753535508, + "language_loss": 0.77607989, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7985025, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3359375, + "step": 1470, + "time_per_iteration": 2.6632297039031982 + }, + { + "auxiliary_loss_clip": 0.01182287, + "auxiliary_loss_mlp": 0.0106647, + "balance_loss_clip": 1.04266429, + "balance_loss_mlp": 1.05412459, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.774803600242038, + "language_loss": 0.84233272, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86482024, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.28125, + "step": 1471, + "time_per_iteration": 2.5040950775146484 + }, + { + "auxiliary_loss_clip": 0.01178062, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.05459309, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6761790638208889, + "language_loss": 0.83481324, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85712093, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.234375, + "step": 1472, + "time_per_iteration": 2.5079073905944824 + }, + { + "auxiliary_loss_clip": 0.01185739, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.05491877, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.396127276436556, + "language_loss": 0.828246, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85069156, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1473, + "time_per_iteration": 2.4919679164886475 + }, + { + "auxiliary_loss_clip": 0.01183368, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.04040098, + "balance_loss_mlp": 1.05414963, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.8346488607114506, + "language_loss": 0.78871369, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81116265, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1474, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01057333, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.05450511, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.918961213895669, + "language_loss": 0.79045832, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81284976, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1475, + "time_per_iteration": 2.495753765106201 + }, + { + "auxiliary_loss_clip": 0.01184034, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.05443335, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6510632676992876, + "language_loss": 0.73973525, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76205671, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1476, + "time_per_iteration": 6.925957679748535 + }, + { + "auxiliary_loss_clip": 0.0118493, + "auxiliary_loss_mlp": 0.01060562, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.05454326, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.527991814504802, + "language_loss": 0.74644423, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76889908, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3046875, + "step": 1477, + "time_per_iteration": 2.6033589839935303 + }, + { + "auxiliary_loss_clip": 0.01181345, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.02571976, + "balance_loss_mlp": 1.05315852, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.4237564416671002, + "language_loss": 0.86488914, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88718438, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1478, + "time_per_iteration": 2.5188398361206055 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.03599334, + "balance_loss_mlp": 1.05417609, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 7.715019285918926, + "language_loss": 0.77988106, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80228484, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.28125, + "step": 1479, + "time_per_iteration": 2.50730562210083 + }, + { + "auxiliary_loss_clip": 0.01180801, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 1.05275774, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.3628139464189815, + "language_loss": 0.78267598, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80501914, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1480, + "time_per_iteration": 2.512730360031128 + }, + { + "auxiliary_loss_clip": 0.01185027, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.05357075, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 3.1949876590170825, + "language_loss": 0.66627192, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68875289, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3125, + "step": 1481, + "time_per_iteration": 2.4874138832092285 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.03040504, + "balance_loss_mlp": 1.05519605, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.9560930463008703, + "language_loss": 0.9644348, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98677909, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.2734375, + "step": 1482, + "time_per_iteration": 2.484274387359619 + }, + { + "auxiliary_loss_clip": 0.01190541, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 1.0577234, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.358614174414972, + "language_loss": 0.78436875, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80683142, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.328125, + "step": 1483, + "time_per_iteration": 2.566199779510498 + }, + { + "auxiliary_loss_clip": 0.01183147, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.04891825, + "balance_loss_mlp": 1.05463076, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.232834813834399, + "language_loss": 0.86091626, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88347292, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1484, + "time_per_iteration": 2.4742467403411865 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.04152799, + "balance_loss_mlp": 1.0570302, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7135103732453094, + "language_loss": 0.80460989, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82716757, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.34375, + "step": 1485, + "time_per_iteration": 2.5808591842651367 + }, + { + "auxiliary_loss_clip": 0.01182644, + "auxiliary_loss_mlp": 0.01059654, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.05256486, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.0833446931013144, + "language_loss": 0.8295821, + "learning_rate": 3.96317299108688e-06, + "loss": 0.852005, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1486, + "time_per_iteration": 2.5060923099517822 + }, + { + "auxiliary_loss_clip": 0.01184012, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.05506349, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6673763915473876, + "language_loss": 0.76653707, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78897893, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1487, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.01181982, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.05203557, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.360836711926668, + "language_loss": 0.83246535, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85491836, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.296875, + "step": 1488, + "time_per_iteration": 2.48189377784729 + }, + { + "auxiliary_loss_clip": 0.01180173, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.05375743, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9508187836998312, + "language_loss": 0.71647823, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73879659, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.265625, + "step": 1489, + "time_per_iteration": 2.701035737991333 + }, + { + "auxiliary_loss_clip": 0.01178824, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.05088401, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8144641128553483, + "language_loss": 0.89490288, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91722786, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1490, + "time_per_iteration": 2.676098108291626 + }, + { + "auxiliary_loss_clip": 0.01187914, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_clip": 1.0416671, + "balance_loss_mlp": 1.05264366, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.165908760559946, + "language_loss": 0.73276365, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75528657, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3515625, + "step": 1491, + "time_per_iteration": 2.5531163215637207 + }, + { + "auxiliary_loss_clip": 0.01181575, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.02984166, + "balance_loss_mlp": 1.05362582, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6884120279290091, + "language_loss": 0.77121007, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79353207, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.28125, + "step": 1492, + "time_per_iteration": 2.485531806945801 + }, + { + "auxiliary_loss_clip": 0.01180742, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.03324914, + "balance_loss_mlp": 1.05471706, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.0059524225222414, + "language_loss": 0.71168351, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73404551, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2578125, + "step": 1493, + "time_per_iteration": 2.5819149017333984 + }, + { + "auxiliary_loss_clip": 0.01184961, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.03304577, + "balance_loss_mlp": 1.05477107, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.7443337417031568, + "language_loss": 0.86910093, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89151227, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1494, + "time_per_iteration": 2.491126775741577 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.05289626, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7176751495851263, + "language_loss": 0.83065581, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1495, + "time_per_iteration": 2.463747501373291 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.05825078, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.861203767183833, + "language_loss": 0.69813877, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72057784, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1496, + "time_per_iteration": 2.4409985542297363 + }, + { + "auxiliary_loss_clip": 0.01180533, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03035557, + "balance_loss_mlp": 1.05325341, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6091347390483586, + "language_loss": 0.79913563, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82145333, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2734375, + "step": 1497, + "time_per_iteration": 2.492732048034668 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.0105809, + "balance_loss_clip": 1.03484416, + "balance_loss_mlp": 1.05299318, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.3611651581227915, + "language_loss": 0.8262192, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84866548, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3359375, + "step": 1498, + "time_per_iteration": 2.492124080657959 + }, + { + "auxiliary_loss_clip": 0.01188542, + "auxiliary_loss_mlp": 0.01061597, + "balance_loss_clip": 1.0402112, + "balance_loss_mlp": 1.05628157, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.316244908481527, + "language_loss": 0.7849865, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80748791, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 1499, + "time_per_iteration": 2.455986738204956 + }, + { + "auxiliary_loss_clip": 0.0117942, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.05351877, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.652937184766999, + "language_loss": 0.93453979, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95688522, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1500, + "time_per_iteration": 2.481450080871582 + }, + { + "auxiliary_loss_clip": 0.01182931, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.05170345, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.9257189866461966, + "language_loss": 0.74465239, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76699102, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3125, + "step": 1501, + "time_per_iteration": 2.4806344509124756 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 0.99992049, + "balance_loss_mlp": 1.02834833, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7322723529864947, + "language_loss": 0.58304042, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60384637, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.48828125, + "step": 1502, + "time_per_iteration": 3.066755771636963 + }, + { + "auxiliary_loss_clip": 0.01178455, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.02655029, + "balance_loss_mlp": 1.05134845, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.407651446444188, + "language_loss": 0.69502187, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71728474, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2734375, + "step": 1503, + "time_per_iteration": 2.608006000518799 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.0508244, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.015182939383952, + "language_loss": 0.86142361, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88378185, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.3046875, + "step": 1504, + "time_per_iteration": 2.489906072616577 + }, + { + "auxiliary_loss_clip": 0.01188306, + "auxiliary_loss_mlp": 0.01064134, + "balance_loss_clip": 1.03871906, + "balance_loss_mlp": 1.05330658, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.9466916160800904, + "language_loss": 0.72267938, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74520379, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1505, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01179818, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.05332816, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3726339000283447, + "language_loss": 0.80946511, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83180916, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.265625, + "step": 1506, + "time_per_iteration": 2.4512932300567627 + }, + { + "auxiliary_loss_clip": 0.01179114, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.0531404, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1533698580433254, + "language_loss": 0.76043189, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78271914, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.2578125, + "step": 1507, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01009923, + "balance_loss_clip": 1.00679994, + "balance_loss_mlp": 1.01922798, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7312512202665958, + "language_loss": 0.57670546, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59747648, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.03112793, + "router_z_loss_mlp": 0.48046875, + "step": 1508, + "time_per_iteration": 2.9330992698669434 + }, + { + "auxiliary_loss_clip": 0.01182207, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.0313319, + "balance_loss_mlp": 1.05309892, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.072562238387217, + "language_loss": 0.85046542, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87281442, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1509, + "time_per_iteration": 2.475606918334961 + }, + { + "auxiliary_loss_clip": 0.01189974, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.05606115, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.413703760690829, + "language_loss": 0.84302551, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3359375, + "step": 1510, + "time_per_iteration": 2.576070785522461 + }, + { + "auxiliary_loss_clip": 0.01184002, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.0335387, + "balance_loss_mlp": 1.05408144, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9204492801986277, + "language_loss": 0.85558611, + "learning_rate": 3.961289878108262e-06, + "loss": 0.8779816, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.296875, + "step": 1511, + "time_per_iteration": 2.5085484981536865 + }, + { + "auxiliary_loss_clip": 0.01181957, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.02690685, + "balance_loss_mlp": 1.05469918, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5775523407684693, + "language_loss": 0.84897017, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87127548, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2734375, + "step": 1512, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01175178, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.02888274, + "balance_loss_mlp": 1.05033123, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.9006324958480167, + "language_loss": 0.86704344, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88929009, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.25, + "step": 1513, + "time_per_iteration": 2.475271701812744 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.0536902, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6716164971548293, + "language_loss": 0.86379707, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8861233, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.2734375, + "step": 1514, + "time_per_iteration": 2.5317347049713135 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.05550981, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.9279276264910965, + "language_loss": 0.89882755, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92124808, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.265625, + "step": 1515, + "time_per_iteration": 2.5507757663726807 + }, + { + "auxiliary_loss_clip": 0.011822, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.03174293, + "balance_loss_mlp": 1.05321527, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0145121179505905, + "language_loss": 0.85567206, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87803847, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1516, + "time_per_iteration": 2.524787425994873 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.05217946, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5232376391767188, + "language_loss": 0.81104374, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83340514, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.3125, + "step": 1517, + "time_per_iteration": 2.5781173706054688 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.01068952, + "balance_loss_clip": 1.04729199, + "balance_loss_mlp": 1.05378699, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6026665805728266, + "language_loss": 0.78008473, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80262554, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3125, + "step": 1518, + "time_per_iteration": 4.000938653945923 + }, + { + "auxiliary_loss_clip": 0.01179619, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.03851235, + "balance_loss_mlp": 1.05189955, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.883609624415087, + "language_loss": 0.86375809, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88615477, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.28125, + "step": 1519, + "time_per_iteration": 3.945183277130127 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.01053198, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.05196333, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.4149150298084425, + "language_loss": 0.73425877, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75659597, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.28125, + "step": 1520, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01181506, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.0525614, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6960755220153825, + "language_loss": 0.85296613, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87533194, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2890625, + "step": 1521, + "time_per_iteration": 2.478440761566162 + }, + { + "auxiliary_loss_clip": 0.01183058, + "auxiliary_loss_mlp": 0.01057495, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.05319118, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.1543470058122876, + "language_loss": 0.83979875, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86220425, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.296875, + "step": 1522, + "time_per_iteration": 2.4761834144592285 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.03500533, + "balance_loss_mlp": 1.05125594, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.174137545904809, + "language_loss": 0.810691, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83301324, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.234375, + "step": 1523, + "time_per_iteration": 2.525385618209839 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.05365944, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.529065997296093, + "language_loss": 0.74591744, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76838291, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.296875, + "step": 1524, + "time_per_iteration": 2.4293112754821777 + }, + { + "auxiliary_loss_clip": 0.01181121, + "auxiliary_loss_mlp": 0.01060116, + "balance_loss_clip": 1.03822935, + "balance_loss_mlp": 1.05373263, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0870290485059586, + "language_loss": 0.861516, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88392842, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1525, + "time_per_iteration": 2.449774980545044 + }, + { + "auxiliary_loss_clip": 0.01181752, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.02577078, + "balance_loss_mlp": 1.05424511, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.3600448138049597, + "language_loss": 0.74690467, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76919985, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1526, + "time_per_iteration": 2.5295088291168213 + }, + { + "auxiliary_loss_clip": 0.01177679, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03058743, + "balance_loss_mlp": 1.05291057, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.970734062299861, + "language_loss": 0.7736311, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79592943, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1527, + "time_per_iteration": 2.465484142303467 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.05090261, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.9755082573034908, + "language_loss": 0.78465801, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80698651, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1528, + "time_per_iteration": 2.5257718563079834 + }, + { + "auxiliary_loss_clip": 0.01177926, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.0276351, + "balance_loss_mlp": 1.05085492, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.6736868569465813, + "language_loss": 0.76880527, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79107177, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2734375, + "step": 1529, + "time_per_iteration": 2.4417288303375244 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.0306139, + "balance_loss_mlp": 1.05037212, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.767002219307874, + "language_loss": 0.83118784, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85352623, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.296875, + "step": 1530, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01173477, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.03723454, + "balance_loss_mlp": 1.05024123, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.058190265763826, + "language_loss": 0.8408612, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86318833, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1531, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.01177383, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.02728868, + "balance_loss_mlp": 1.05083799, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.8347699676368683, + "language_loss": 0.81135088, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83361435, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1532, + "time_per_iteration": 2.506875991821289 + }, + { + "auxiliary_loss_clip": 0.01179012, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.03044105, + "balance_loss_mlp": 1.05169332, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.8650949584676202, + "language_loss": 0.83489287, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85721242, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2734375, + "step": 1533, + "time_per_iteration": 2.5279369354248047 + }, + { + "auxiliary_loss_clip": 0.01181754, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.02662432, + "balance_loss_mlp": 1.05468941, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.8226281566677605, + "language_loss": 0.89789164, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92019475, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1534, + "time_per_iteration": 2.498732089996338 + }, + { + "auxiliary_loss_clip": 0.01178154, + "auxiliary_loss_mlp": 0.01064045, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.04994035, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.6410414613778777, + "language_loss": 0.75911283, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78153479, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.28125, + "step": 1535, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.04907823, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8388387816947327, + "language_loss": 0.81344318, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83558822, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1536, + "time_per_iteration": 2.5075631141662598 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.04995418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.109198419692537, + "language_loss": 0.8921392, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91439736, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1537, + "time_per_iteration": 2.4454562664031982 + }, + { + "auxiliary_loss_clip": 0.01177438, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.03638315, + "balance_loss_mlp": 1.05164456, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.1959440535625285, + "language_loss": 0.8072964, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82966185, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2578125, + "step": 1538, + "time_per_iteration": 2.50838303565979 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.0101212, + "balance_loss_clip": 1.0091517, + "balance_loss_mlp": 1.01794529, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.74443800558722, + "language_loss": 0.57375526, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59453678, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.48046875, + "step": 1539, + "time_per_iteration": 3.16038179397583 + }, + { + "auxiliary_loss_clip": 0.01179737, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.05291581, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.903908071477431, + "language_loss": 0.67164814, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69395947, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.265625, + "step": 1540, + "time_per_iteration": 2.488809585571289 + }, + { + "auxiliary_loss_clip": 0.01178592, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02154827, + "balance_loss_mlp": 1.05285096, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.797248436862791, + "language_loss": 0.83666921, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85888791, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1541, + "time_per_iteration": 2.5406758785247803 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01009011, + "balance_loss_clip": 1.0061146, + "balance_loss_mlp": 1.01339245, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8904869203130611, + "language_loss": 0.6196329, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64032996, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.47265625, + "step": 1542, + "time_per_iteration": 3.0973262786865234 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03486192, + "balance_loss_mlp": 1.05283189, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.711071573157868, + "language_loss": 0.82672381, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84905624, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.25, + "step": 1543, + "time_per_iteration": 2.489415168762207 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.03462195, + "balance_loss_mlp": 1.05128777, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6169278883375504, + "language_loss": 0.72058821, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74287981, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1544, + "time_per_iteration": 2.7986748218536377 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.05111873, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7012123784712243, + "language_loss": 0.77617419, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79842126, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1545, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01050414, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0525856, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.736353511607615, + "language_loss": 0.74531418, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76755565, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1546, + "time_per_iteration": 2.456806182861328 + }, + { + "auxiliary_loss_clip": 0.01180806, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.02968979, + "balance_loss_mlp": 1.05292201, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.1086065935537284, + "language_loss": 0.84392273, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86624783, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1547, + "time_per_iteration": 2.5041439533233643 + }, + { + "auxiliary_loss_clip": 0.01177454, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.05125856, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 7.120670718523448, + "language_loss": 0.67616034, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6984657, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1548, + "time_per_iteration": 2.513141393661499 + }, + { + "auxiliary_loss_clip": 0.01178735, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.05175209, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.761700755369037, + "language_loss": 0.83445251, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85676992, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.265625, + "step": 1549, + "time_per_iteration": 2.4822285175323486 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.05560291, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7822943519837542, + "language_loss": 0.75744081, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77969635, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2578125, + "step": 1550, + "time_per_iteration": 2.5503265857696533 + }, + { + "auxiliary_loss_clip": 0.01179426, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.04579496, + "balance_loss_mlp": 1.05118561, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.0184762942100876, + "language_loss": 0.83272278, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85520893, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.28125, + "step": 1551, + "time_per_iteration": 2.4962081909179688 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_clip": 1.0051949, + "balance_loss_mlp": 1.01350796, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7800746873014213, + "language_loss": 0.6182366, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6389209, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.47070312, + "step": 1552, + "time_per_iteration": 3.2178378105163574 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01003668, + "balance_loss_clip": 1.00099754, + "balance_loss_mlp": 1.01257896, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8330449834122059, + "language_loss": 0.5895977, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61022902, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.46875, + "step": 1553, + "time_per_iteration": 3.220923900604248 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.05040002, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.0753391269624797, + "language_loss": 0.87452686, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89689714, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.28125, + "step": 1554, + "time_per_iteration": 2.5448763370513916 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.0310595, + "balance_loss_mlp": 1.05265594, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.2438919833216913, + "language_loss": 0.81355709, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83583468, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1555, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.00581956, + "balance_loss_mlp": 1.01259685, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8701907042199977, + "language_loss": 0.59583747, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61651003, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4609375, + "step": 1556, + "time_per_iteration": 3.0923824310302734 + }, + { + "auxiliary_loss_clip": 0.01177126, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.03518105, + "balance_loss_mlp": 1.05278862, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5891177576034032, + "language_loss": 0.84455961, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86689359, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1557, + "time_per_iteration": 2.5973968505859375 + }, + { + "auxiliary_loss_clip": 0.01175988, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03799307, + "balance_loss_mlp": 1.05065048, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.787574567308206, + "language_loss": 0.77987397, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80224895, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.25, + "step": 1558, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01178258, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.05035424, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0310113035260873, + "language_loss": 0.7998119, + "learning_rate": 3.957544040455379e-06, + "loss": 0.822142, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1559, + "time_per_iteration": 5.3233802318573 + }, + { + "auxiliary_loss_clip": 0.01172855, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.05015147, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.9877315441152976, + "language_loss": 0.76720232, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78956437, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1560, + "time_per_iteration": 3.863935947418213 + }, + { + "auxiliary_loss_clip": 0.01180546, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.05101645, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6628394684514, + "language_loss": 0.81219828, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83460152, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1561, + "time_per_iteration": 2.5050160884857178 + }, + { + "auxiliary_loss_clip": 0.01175131, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.02641547, + "balance_loss_mlp": 1.04764926, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.307547697406205, + "language_loss": 0.61553764, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63777232, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1562, + "time_per_iteration": 2.5884838104248047 + }, + { + "auxiliary_loss_clip": 0.01177686, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.0552876, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.5948914783661468, + "language_loss": 0.84981585, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87219155, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1563, + "time_per_iteration": 2.427928924560547 + }, + { + "auxiliary_loss_clip": 0.01172512, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.02767134, + "balance_loss_mlp": 1.05013323, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.8141046481233785, + "language_loss": 0.76106739, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78327298, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.21875, + "step": 1564, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01055133, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.05290008, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0134268414891388, + "language_loss": 0.7971766, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81950086, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.25, + "step": 1565, + "time_per_iteration": 2.470870018005371 + }, + { + "auxiliary_loss_clip": 0.01175133, + "auxiliary_loss_mlp": 0.01069432, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.0497129, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8353632925340597, + "language_loss": 0.75241816, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77486378, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1566, + "time_per_iteration": 2.4962053298950195 + }, + { + "auxiliary_loss_clip": 0.0117411, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.03746092, + "balance_loss_mlp": 1.04822683, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.55149440594841, + "language_loss": 0.77724433, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79957557, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1567, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01178494, + "auxiliary_loss_mlp": 0.01054706, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.05183101, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.293964487000622, + "language_loss": 0.82571244, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8480444, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.265625, + "step": 1568, + "time_per_iteration": 2.5221774578094482 + }, + { + "auxiliary_loss_clip": 0.01179838, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.02527881, + "balance_loss_mlp": 1.05191278, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 4.3822924949764515, + "language_loss": 0.7658236, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78810549, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.28125, + "step": 1569, + "time_per_iteration": 2.464019775390625 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.03004718, + "balance_loss_mlp": 1.04984534, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.11394347406088, + "language_loss": 0.86315012, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88538271, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1570, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.01177967, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.05340183, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6480791038221163, + "language_loss": 0.76531005, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78758156, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1571, + "time_per_iteration": 2.5270462036132812 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.02848995, + "balance_loss_mlp": 1.0496099, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.780883866775424, + "language_loss": 0.79518712, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81737661, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1572, + "time_per_iteration": 2.477403163909912 + }, + { + "auxiliary_loss_clip": 0.01172702, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03288805, + "balance_loss_mlp": 1.05036175, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8458711299535766, + "language_loss": 0.87948155, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90174723, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1573, + "time_per_iteration": 2.5164122581481934 + }, + { + "auxiliary_loss_clip": 0.01177194, + "auxiliary_loss_mlp": 0.01059795, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.05045378, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.770313323609274, + "language_loss": 0.81827116, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84064102, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.265625, + "step": 1574, + "time_per_iteration": 2.5540831089019775 + }, + { + "auxiliary_loss_clip": 0.01178056, + "auxiliary_loss_mlp": 0.01052269, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.05359375, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.139236970889498, + "language_loss": 0.80922085, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83152413, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1575, + "time_per_iteration": 2.4874608516693115 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01063693, + "balance_loss_clip": 1.04184198, + "balance_loss_mlp": 1.05048943, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.1107661515601, + "language_loss": 0.86745369, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88981628, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1576, + "time_per_iteration": 2.514961004257202 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.01272786, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9836929902555142, + "language_loss": 0.65832257, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67916429, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.45703125, + "step": 1577, + "time_per_iteration": 3.042998790740967 + }, + { + "auxiliary_loss_clip": 0.01175806, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.02504635, + "balance_loss_mlp": 1.05083144, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 3.158821122445177, + "language_loss": 0.79113019, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81334484, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1578, + "time_per_iteration": 2.492605447769165 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.04935408, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6941125689582233, + "language_loss": 0.77994359, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80223954, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1579, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01176838, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.0317533, + "balance_loss_mlp": 1.05228639, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.3012950697800747, + "language_loss": 0.73576474, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75807726, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2421875, + "step": 1580, + "time_per_iteration": 2.500426769256592 + }, + { + "auxiliary_loss_clip": 0.01171524, + "auxiliary_loss_mlp": 0.01053034, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 1.05162525, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.820694860574998, + "language_loss": 0.77813822, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80038381, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1581, + "time_per_iteration": 2.569086790084839 + }, + { + "auxiliary_loss_clip": 0.01177083, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.05315304, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.1718701740895443, + "language_loss": 0.86914808, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89150703, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.234375, + "step": 1582, + "time_per_iteration": 2.476386785507202 + }, + { + "auxiliary_loss_clip": 0.01178411, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.05487967, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.7496793522695477, + "language_loss": 0.66838771, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6907438, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.234375, + "step": 1583, + "time_per_iteration": 2.4433302879333496 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01052141, + "balance_loss_clip": 1.02919281, + "balance_loss_mlp": 1.0555923, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8272679383640855, + "language_loss": 0.70314872, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7254771, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.25, + "step": 1584, + "time_per_iteration": 2.5352938175201416 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.0527246, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5429491827095454, + "language_loss": 0.80649364, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82881135, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2421875, + "step": 1585, + "time_per_iteration": 2.5006139278411865 + }, + { + "auxiliary_loss_clip": 0.01179471, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.05324364, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.5763794615860258, + "language_loss": 0.7156626, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73802292, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.265625, + "step": 1586, + "time_per_iteration": 2.510941982269287 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.010121, + "balance_loss_clip": 1.00946522, + "balance_loss_mlp": 1.01272035, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8128409972345002, + "language_loss": 0.55392706, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57462931, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.453125, + "step": 1587, + "time_per_iteration": 2.8747992515563965 + }, + { + "auxiliary_loss_clip": 0.0118109, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.0345006, + "balance_loss_mlp": 1.0550952, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.813611272618652, + "language_loss": 0.81023234, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83260405, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1588, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.01178114, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.05471849, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 2.1843830695972835, + "language_loss": 0.81552076, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83785045, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1589, + "time_per_iteration": 2.4995651245117188 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.05340207, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.4497838373443381, + "language_loss": 0.65005404, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1590, + "time_per_iteration": 2.7222375869750977 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01051921, + "balance_loss_clip": 1.03036785, + "balance_loss_mlp": 1.05178595, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.714913693600035, + "language_loss": 0.83272862, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85498345, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1591, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.05280709, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.268244689889179, + "language_loss": 0.74068749, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76304293, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.265625, + "step": 1592, + "time_per_iteration": 2.446272373199463 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.0282129, + "balance_loss_mlp": 1.05028248, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.9287746031752921, + "language_loss": 0.74135411, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1593, + "time_per_iteration": 2.518038749694824 + }, + { + "auxiliary_loss_clip": 0.01175652, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.03061128, + "balance_loss_mlp": 1.05365515, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8251705146793997, + "language_loss": 0.69907188, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72134066, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.21875, + "step": 1594, + "time_per_iteration": 2.5454983711242676 + }, + { + "auxiliary_loss_clip": 0.01174594, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.03062999, + "balance_loss_mlp": 1.05023921, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.596137828422853, + "language_loss": 0.82464099, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84689802, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1595, + "time_per_iteration": 2.472062826156616 + }, + { + "auxiliary_loss_clip": 0.01176658, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.02803886, + "balance_loss_mlp": 1.05217803, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.0311987750358953, + "language_loss": 0.84673214, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86900425, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2421875, + "step": 1596, + "time_per_iteration": 2.4801599979400635 + }, + { + "auxiliary_loss_clip": 0.01181467, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.02871156, + "balance_loss_mlp": 1.05628884, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.531539932785817, + "language_loss": 0.68993127, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71225667, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1597, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01175632, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.02370429, + "balance_loss_mlp": 1.04902959, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.18946094151333, + "language_loss": 0.74929029, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77149749, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1598, + "time_per_iteration": 2.474071502685547 + }, + { + "auxiliary_loss_clip": 0.01179079, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.05284083, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6350676424235815, + "language_loss": 0.69002283, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7122978, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1599, + "time_per_iteration": 2.5599992275238037 + }, + { + "auxiliary_loss_clip": 0.01174972, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.05169392, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.013538613147854, + "language_loss": 0.840271, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8625865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1600, + "time_per_iteration": 2.4882116317749023 + }, + { + "auxiliary_loss_clip": 0.01174537, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.03162694, + "balance_loss_mlp": 1.05098653, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.038904015519863, + "language_loss": 0.8034178, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82569081, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.234375, + "step": 1601, + "time_per_iteration": 5.328745365142822 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.03031266, + "balance_loss_mlp": 1.05090928, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.183236390866488, + "language_loss": 0.82405198, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84635913, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.2734375, + "step": 1602, + "time_per_iteration": 2.4609556198120117 + }, + { + "auxiliary_loss_clip": 0.01172805, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05170703, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.882331764966583, + "language_loss": 0.62527591, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64752185, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1603, + "time_per_iteration": 2.4974379539489746 + }, + { + "auxiliary_loss_clip": 0.01178105, + "auxiliary_loss_mlp": 0.01054363, + "balance_loss_clip": 1.03049707, + "balance_loss_mlp": 1.05224609, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.347327571135852, + "language_loss": 0.71259016, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73491484, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2578125, + "step": 1604, + "time_per_iteration": 2.5012693405151367 + }, + { + "auxiliary_loss_clip": 0.01172586, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.0368669, + "balance_loss_mlp": 1.05051208, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.129697971326249, + "language_loss": 0.79487669, + "learning_rate": 3.953793790294527e-06, + "loss": 0.8171708, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.21875, + "step": 1605, + "time_per_iteration": 2.5392873287200928 + }, + { + "auxiliary_loss_clip": 0.01176232, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.04916394, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 3.698123586343809, + "language_loss": 0.74810207, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77030694, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2734375, + "step": 1606, + "time_per_iteration": 2.4922726154327393 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02391386, + "balance_loss_mlp": 1.05243278, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.649703340967918, + "language_loss": 0.75382137, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77603066, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.234375, + "step": 1607, + "time_per_iteration": 2.4787087440490723 + }, + { + "auxiliary_loss_clip": 0.0117289, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.02206647, + "balance_loss_mlp": 1.04831934, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.262571531890369, + "language_loss": 0.86648059, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88863426, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.25, + "step": 1608, + "time_per_iteration": 2.435391664505005 + }, + { + "auxiliary_loss_clip": 0.01183391, + "auxiliary_loss_mlp": 0.01056654, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.05276418, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.2277980990408297, + "language_loss": 0.70968121, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73208165, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.3046875, + "step": 1609, + "time_per_iteration": 2.599719762802124 + }, + { + "auxiliary_loss_clip": 0.01173214, + "auxiliary_loss_mlp": 0.01054271, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.04860282, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.7787270736621674, + "language_loss": 0.84566712, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86794198, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1610, + "time_per_iteration": 2.446676254272461 + }, + { + "auxiliary_loss_clip": 0.01177531, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.02781224, + "balance_loss_mlp": 1.05382621, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.0483419743874682, + "language_loss": 0.67360532, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69587982, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1611, + "time_per_iteration": 2.520211696624756 + }, + { + "auxiliary_loss_clip": 0.01177545, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.03000879, + "balance_loss_mlp": 1.05313492, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6625909003061596, + "language_loss": 0.81166416, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83394641, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2421875, + "step": 1612, + "time_per_iteration": 2.449491262435913 + }, + { + "auxiliary_loss_clip": 0.01180036, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.05431938, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.509420249413084, + "language_loss": 0.80708754, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82950538, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1613, + "time_per_iteration": 2.4753763675689697 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.03019738, + "balance_loss_mlp": 1.05074048, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.0025313344872484, + "language_loss": 0.84173608, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86399966, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2265625, + "step": 1614, + "time_per_iteration": 2.5492141246795654 + }, + { + "auxiliary_loss_clip": 0.01065917, + "auxiliary_loss_mlp": 0.010187, + "balance_loss_clip": 1.01610088, + "balance_loss_mlp": 1.019063, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7078098108364695, + "language_loss": 0.54584575, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56669194, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.46875, + "step": 1615, + "time_per_iteration": 3.1041057109832764 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00491357, + "balance_loss_mlp": 1.01844954, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637649269659756, + "language_loss": 0.5822649, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60299873, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.47460938, + "step": 1616, + "time_per_iteration": 3.215376377105713 + }, + { + "auxiliary_loss_clip": 0.01178513, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.05275226, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.690325520565165, + "language_loss": 0.69293094, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71527421, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2578125, + "step": 1617, + "time_per_iteration": 2.458017587661743 + }, + { + "auxiliary_loss_clip": 0.01176727, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.03116739, + "balance_loss_mlp": 1.05130577, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.7927692696889819, + "language_loss": 0.80748308, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8298068, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.25, + "step": 1618, + "time_per_iteration": 2.5471577644348145 + }, + { + "auxiliary_loss_clip": 0.01169902, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03077149, + "balance_loss_mlp": 1.04996848, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.5831304278494804, + "language_loss": 0.9288674, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9510712, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1953125, + "step": 1619, + "time_per_iteration": 2.515282392501831 + }, + { + "auxiliary_loss_clip": 0.01171299, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.03150594, + "balance_loss_mlp": 1.05216622, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.7974961209450113, + "language_loss": 0.88785303, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.910092, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1620, + "time_per_iteration": 2.556744337081909 + }, + { + "auxiliary_loss_clip": 0.01175309, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.05045033, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.90931759761679, + "language_loss": 0.77130795, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79362905, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.25, + "step": 1621, + "time_per_iteration": 2.491441011428833 + }, + { + "auxiliary_loss_clip": 0.01171563, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.04859447, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9170880538391684, + "language_loss": 0.77856946, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80084509, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2265625, + "step": 1622, + "time_per_iteration": 2.4379701614379883 + }, + { + "auxiliary_loss_clip": 0.01177415, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.05105746, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.9420709042223125, + "language_loss": 0.85783195, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88017344, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1623, + "time_per_iteration": 2.51741099357605 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.03498316, + "balance_loss_mlp": 1.05181813, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2833168401589656, + "language_loss": 0.80328369, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8255735, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1624, + "time_per_iteration": 2.4646618366241455 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.02450192, + "balance_loss_mlp": 1.04799926, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.7577002662180954, + "language_loss": 0.8575626, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87971735, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.234375, + "step": 1625, + "time_per_iteration": 2.452615976333618 + }, + { + "auxiliary_loss_clip": 0.01176501, + "auxiliary_loss_mlp": 0.01055325, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.05226421, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 3.258883448957912, + "language_loss": 0.8539601, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87627834, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1626, + "time_per_iteration": 2.4931013584136963 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.05541551, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.979888643217431, + "language_loss": 0.83329904, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85568601, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2578125, + "step": 1627, + "time_per_iteration": 2.5056917667388916 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.03065729, + "balance_loss_mlp": 1.0488416, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7873285490487296, + "language_loss": 0.84291327, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.2109375, + "step": 1628, + "time_per_iteration": 2.4835076332092285 + }, + { + "auxiliary_loss_clip": 0.01169153, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.04880238, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.6092149858605884, + "language_loss": 0.75609362, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77831334, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1629, + "time_per_iteration": 2.4959983825683594 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.02412319, + "balance_loss_mlp": 1.0530107, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.5982247062153871, + "language_loss": 0.78224194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1630, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.01177321, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0292381, + "balance_loss_mlp": 1.05457997, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.9134334701620013, + "language_loss": 0.86704385, + "learning_rate": 3.951604717916228e-06, + "loss": 0.8893311, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1631, + "time_per_iteration": 2.443878173828125 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01050929, + "balance_loss_clip": 1.03065109, + "balance_loss_mlp": 1.05258322, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.096430969489036, + "language_loss": 0.83111286, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85334921, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1632, + "time_per_iteration": 2.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01174956, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0368464, + "balance_loss_mlp": 1.05281615, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.5107232822128822, + "language_loss": 0.7877655, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81008065, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.21875, + "step": 1633, + "time_per_iteration": 2.447930097579956 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.02833819, + "balance_loss_mlp": 1.04989707, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.0663591821232865, + "language_loss": 0.73159611, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75378191, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1634, + "time_per_iteration": 2.460265636444092 + }, + { + "auxiliary_loss_clip": 0.01179893, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.0516957, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7516342600991868, + "language_loss": 0.72714394, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74957043, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1635, + "time_per_iteration": 2.4835710525512695 + }, + { + "auxiliary_loss_clip": 0.01177592, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.05253148, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8458745824258636, + "language_loss": 0.7819975, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80432636, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.25, + "step": 1636, + "time_per_iteration": 2.53061842918396 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.05113721, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.2976115041381386, + "language_loss": 0.70005965, + "learning_rate": 3.951092440828715e-06, + "loss": 0.722363, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1637, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.03175139, + "balance_loss_mlp": 1.05108416, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.115587702667026, + "language_loss": 0.77395654, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79622668, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2265625, + "step": 1638, + "time_per_iteration": 2.4725139141082764 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.02524579, + "balance_loss_mlp": 1.05077171, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4162008179950134, + "language_loss": 0.7263118, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74847507, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1953125, + "step": 1639, + "time_per_iteration": 2.5534512996673584 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.01943696, + "balance_loss_mlp": 1.05003214, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8280373897837945, + "language_loss": 0.88669002, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90882927, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1640, + "time_per_iteration": 2.4868786334991455 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01685774, + "balance_loss_mlp": 1.05164635, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.1859335509376527, + "language_loss": 0.8086108, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83072555, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1641, + "time_per_iteration": 2.5081584453582764 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03130805, + "balance_loss_mlp": 1.05067503, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.4983515693134417, + "language_loss": 0.85826755, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88054669, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1642, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01177694, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.05365527, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7421144196917664, + "language_loss": 0.80859929, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83091342, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1643, + "time_per_iteration": 3.9550716876983643 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01063125, + "balance_loss_clip": 1.04138088, + "balance_loss_mlp": 1.0494256, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9624417465121429, + "language_loss": 0.8262763, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84861231, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1644, + "time_per_iteration": 3.8253817558288574 + }, + { + "auxiliary_loss_clip": 0.01169448, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.02733469, + "balance_loss_mlp": 1.05048347, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7099323885745632, + "language_loss": 0.6819675, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70414758, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1645, + "time_per_iteration": 2.4549567699432373 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.0206517, + "balance_loss_mlp": 1.01924491, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9514884974425206, + "language_loss": 0.60854232, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62943053, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.46289062, + "step": 1646, + "time_per_iteration": 2.9953765869140625 + }, + { + "auxiliary_loss_clip": 0.01170253, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04880357, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.5496486678231425, + "language_loss": 0.73046064, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75266314, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2109375, + "step": 1647, + "time_per_iteration": 2.5241641998291016 + }, + { + "auxiliary_loss_clip": 0.01171762, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.04955053, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8237647662791463, + "language_loss": 0.84120429, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86348635, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.21875, + "step": 1648, + "time_per_iteration": 2.467717170715332 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01009658, + "balance_loss_clip": 1.00701165, + "balance_loss_mlp": 1.0159142, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7437092318732932, + "language_loss": 0.55674303, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57745123, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.453125, + "step": 1649, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01165781, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.02598572, + "balance_loss_mlp": 1.04597533, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.284847215884091, + "language_loss": 0.89930248, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92142689, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1650, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00105858, + "balance_loss_mlp": 1.01395106, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8031298543824162, + "language_loss": 0.63733649, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65795547, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.44140625, + "step": 1651, + "time_per_iteration": 3.217806100845337 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03318655, + "balance_loss_mlp": 1.04885435, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9462006377707899, + "language_loss": 0.88288587, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90512443, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1652, + "time_per_iteration": 2.5014448165893555 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01057611, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.05190849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9500387106757973, + "language_loss": 0.82206833, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84438825, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2265625, + "step": 1653, + "time_per_iteration": 2.4881839752197266 + }, + { + "auxiliary_loss_clip": 0.01172582, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.04984093, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0314065071494136, + "language_loss": 0.79399735, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81626815, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2265625, + "step": 1654, + "time_per_iteration": 2.5269205570220947 + }, + { + "auxiliary_loss_clip": 0.01167439, + "auxiliary_loss_mlp": 0.01055854, + "balance_loss_clip": 1.03700721, + "balance_loss_mlp": 1.05072093, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5637423809135174, + "language_loss": 0.8088094, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83104229, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.171875, + "step": 1655, + "time_per_iteration": 2.4652602672576904 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.04891443, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9082198159511756, + "language_loss": 0.80947387, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83170521, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1656, + "time_per_iteration": 2.4966416358947754 + }, + { + "auxiliary_loss_clip": 0.01170477, + "auxiliary_loss_mlp": 0.0106116, + "balance_loss_clip": 1.04066813, + "balance_loss_mlp": 1.05147541, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6268850155063674, + "language_loss": 0.88850212, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91081852, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1875, + "step": 1657, + "time_per_iteration": 2.446124792098999 + }, + { + "auxiliary_loss_clip": 0.01175951, + "auxiliary_loss_mlp": 0.01063236, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.05091214, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0057694643168302, + "language_loss": 0.84758937, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86998123, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1658, + "time_per_iteration": 2.457902669906616 + }, + { + "auxiliary_loss_clip": 0.01054631, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_clip": 1.07460773, + "balance_loss_mlp": 1.0110395, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9153195332104517, + "language_loss": 0.60843968, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62975848, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1659, + "time_per_iteration": 3.077805519104004 + }, + { + "auxiliary_loss_clip": 0.01170517, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.04999721, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8691655756599186, + "language_loss": 0.85116851, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87340325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2109375, + "step": 1660, + "time_per_iteration": 2.49082612991333 + }, + { + "auxiliary_loss_clip": 0.01171003, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.03393948, + "balance_loss_mlp": 1.05291247, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.130922035700174, + "language_loss": 0.80037123, + "learning_rate": 3.949016704705836e-06, + "loss": 0.8226431, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1796875, + "step": 1661, + "time_per_iteration": 2.4412636756896973 + }, + { + "auxiliary_loss_clip": 0.01175671, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.05002224, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.8939661728963775, + "language_loss": 0.83592767, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85818553, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2578125, + "step": 1662, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01171098, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.02972281, + "balance_loss_mlp": 1.05104828, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1063962968477, + "language_loss": 0.88696563, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90920055, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1663, + "time_per_iteration": 2.42790150642395 + }, + { + "auxiliary_loss_clip": 0.01174901, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.05225635, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6888490247303796, + "language_loss": 0.7034179, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72569644, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1664, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01173831, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.02903676, + "balance_loss_mlp": 1.0535655, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.1773983349048804, + "language_loss": 0.7878316, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81007671, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1665, + "time_per_iteration": 2.4271252155303955 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.01061559, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.05681181, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.663243771388797, + "language_loss": 0.70152062, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72392094, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.21875, + "step": 1666, + "time_per_iteration": 2.499131202697754 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03777063, + "balance_loss_mlp": 1.0506525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8121915129470096, + "language_loss": 0.791031, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8133781, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.234375, + "step": 1667, + "time_per_iteration": 2.4429264068603516 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.02810836, + "balance_loss_mlp": 1.05261493, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.9507555712476945, + "language_loss": 0.7715596, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79379785, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.203125, + "step": 1668, + "time_per_iteration": 2.5223031044006348 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.05256963, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.9809152554972944, + "language_loss": 0.77852714, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80083561, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2109375, + "step": 1669, + "time_per_iteration": 2.5082881450653076 + }, + { + "auxiliary_loss_clip": 0.01181618, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.03641593, + "balance_loss_mlp": 1.05464602, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.145889566444559, + "language_loss": 0.85461181, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87702769, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.2734375, + "step": 1670, + "time_per_iteration": 2.5235135555267334 + }, + { + "auxiliary_loss_clip": 0.01166248, + "auxiliary_loss_mlp": 0.01057789, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0501771, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5986093935623644, + "language_loss": 0.76899171, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79123211, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1671, + "time_per_iteration": 2.505441665649414 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.01598763, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7900846916321359, + "language_loss": 0.60719293, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62802076, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.43945312, + "step": 1672, + "time_per_iteration": 3.07255482673645 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.05045998, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.0407855091156377, + "language_loss": 0.77119517, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.234375, + "step": 1673, + "time_per_iteration": 2.4693222045898438 + }, + { + "auxiliary_loss_clip": 0.01171478, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.03066778, + "balance_loss_mlp": 1.04964709, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.2570599367002835, + "language_loss": 0.72829556, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75053144, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1674, + "time_per_iteration": 2.4534130096435547 + }, + { + "auxiliary_loss_clip": 0.01170516, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03047729, + "balance_loss_mlp": 1.04903197, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.043409325490185, + "language_loss": 0.79386973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81608635, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1675, + "time_per_iteration": 2.496504545211792 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_clip": 1.04449606, + "balance_loss_mlp": 1.04908013, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.0305638084579294, + "language_loss": 0.81565315, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1676, + "time_per_iteration": 2.5022919178009033 + }, + { + "auxiliary_loss_clip": 0.01174395, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.03713369, + "balance_loss_mlp": 1.05283856, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.134524944411931, + "language_loss": 0.86155027, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88388026, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2109375, + "step": 1677, + "time_per_iteration": 2.44887113571167 + }, + { + "auxiliary_loss_clip": 0.01171962, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.03452563, + "balance_loss_mlp": 1.05113602, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 5.349815535910457, + "language_loss": 0.86318195, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88545489, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2109375, + "step": 1678, + "time_per_iteration": 2.4373903274536133 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.03359675, + "balance_loss_mlp": 1.05214512, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.6897314721028867, + "language_loss": 0.89726269, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91953766, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1679, + "time_per_iteration": 2.493959903717041 + }, + { + "auxiliary_loss_clip": 0.01056795, + "auxiliary_loss_mlp": 0.01017317, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01327634, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7831657514235874, + "language_loss": 0.53018153, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55092263, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1680, + "time_per_iteration": 3.15899658203125 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.04983318, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.657625192327098, + "language_loss": 0.76889706, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79113436, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1681, + "time_per_iteration": 2.446937322616577 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.03423131, + "balance_loss_mlp": 1.04937744, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.135292201068385, + "language_loss": 0.93928307, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96162128, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.2578125, + "step": 1682, + "time_per_iteration": 2.4357759952545166 + }, + { + "auxiliary_loss_clip": 0.01172101, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.03315091, + "balance_loss_mlp": 1.05045152, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 5.112669241194533, + "language_loss": 0.87866408, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90092492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1683, + "time_per_iteration": 2.427802562713623 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.03888798, + "balance_loss_mlp": 1.05144525, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7718228637860187, + "language_loss": 0.74768114, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76997328, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1796875, + "step": 1684, + "time_per_iteration": 5.332470417022705 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01011499, + "balance_loss_clip": 1.00863802, + "balance_loss_mlp": 1.01624751, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.760003339390084, + "language_loss": 0.61090153, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6316117, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.43359375, + "step": 1685, + "time_per_iteration": 4.508171081542969 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.02769828, + "balance_loss_mlp": 1.04891801, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.3224629698824075, + "language_loss": 0.61664945, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63883317, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1686, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01173787, + "auxiliary_loss_mlp": 0.01054454, + "balance_loss_clip": 1.03238797, + "balance_loss_mlp": 1.0545882, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1992592502117443, + "language_loss": 0.81408226, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83636469, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1687, + "time_per_iteration": 2.5495810508728027 + }, + { + "auxiliary_loss_clip": 0.01173812, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.03055501, + "balance_loss_mlp": 1.0514555, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.783489688966995, + "language_loss": 0.72360015, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74585676, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1688, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.05043888, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9893327907397977, + "language_loss": 0.86880058, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8910439, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1875, + "step": 1689, + "time_per_iteration": 2.5283408164978027 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.04692245, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8972643802531153, + "language_loss": 0.88054395, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90265882, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1690, + "time_per_iteration": 2.5732247829437256 + }, + { + "auxiliary_loss_clip": 0.01170509, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.02961624, + "balance_loss_mlp": 1.04965854, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.8841763324380914, + "language_loss": 0.83124495, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85346603, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.203125, + "step": 1691, + "time_per_iteration": 2.453263282775879 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.028579, + "balance_loss_mlp": 1.05049825, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.648035623213742, + "language_loss": 0.66938514, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69161713, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1692, + "time_per_iteration": 2.5865867137908936 + }, + { + "auxiliary_loss_clip": 0.01167535, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_clip": 1.04540372, + "balance_loss_mlp": 1.0471102, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.649284734670808, + "language_loss": 0.75387824, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77622634, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1693, + "time_per_iteration": 2.499476194381714 + }, + { + "auxiliary_loss_clip": 0.01171507, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02685595, + "balance_loss_mlp": 1.04984784, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6930931596653784, + "language_loss": 0.87206519, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89427543, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1694, + "time_per_iteration": 2.483264923095703 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01052768, + "balance_loss_clip": 1.03013015, + "balance_loss_mlp": 1.05056214, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 3.1999162319303274, + "language_loss": 0.79579329, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81809288, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1695, + "time_per_iteration": 2.4574177265167236 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.04648614, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7546035908378184, + "language_loss": 0.86581397, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88805294, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1696, + "time_per_iteration": 2.4986772537231445 + }, + { + "auxiliary_loss_clip": 0.01168623, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.04927731, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.97196247739744, + "language_loss": 0.82034266, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84259629, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1953125, + "step": 1697, + "time_per_iteration": 2.483682155609131 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.0477041, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 1.9483747561194416, + "language_loss": 0.80650747, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82870358, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2265625, + "step": 1698, + "time_per_iteration": 2.4512858390808105 + }, + { + "auxiliary_loss_clip": 0.01172882, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.05113077, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 4.641294823605382, + "language_loss": 0.75680709, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77902329, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1699, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.01171952, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.02606726, + "balance_loss_mlp": 1.05093145, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.7623204527071121, + "language_loss": 0.79777479, + "learning_rate": 3.945552859553516e-06, + "loss": 0.81997555, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 1700, + "time_per_iteration": 2.4692423343658447 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.02411532, + "balance_loss_mlp": 1.04850125, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8827887870563835, + "language_loss": 0.76854098, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79070842, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1701, + "time_per_iteration": 2.5015852451324463 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 1.05213511, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1180628790190927, + "language_loss": 0.78123891, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80349147, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2421875, + "step": 1702, + "time_per_iteration": 2.4999852180480957 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01046251, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.0487566, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.3091523831758765, + "language_loss": 0.94838184, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97052652, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1953125, + "step": 1703, + "time_per_iteration": 2.4586100578308105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.01077867, + "balance_loss_mlp": 1.01462317, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8800585598511617, + "language_loss": 0.55092424, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57163775, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43554688, + "step": 1704, + "time_per_iteration": 2.998384952545166 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.02790844, + "balance_loss_mlp": 1.04962945, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 3.5257555777633174, + "language_loss": 0.83979154, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86200017, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2109375, + "step": 1705, + "time_per_iteration": 2.4242281913757324 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.01514411, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7733309182053202, + "language_loss": 0.60434854, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62497854, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.4296875, + "step": 1706, + "time_per_iteration": 3.127495765686035 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 1.05214357, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.0444921886168284, + "language_loss": 0.85967243, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.234375, + "step": 1707, + "time_per_iteration": 2.4486777782440186 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02654099, + "balance_loss_mlp": 1.04891372, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.6368034329364625, + "language_loss": 0.72840983, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75057685, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.2109375, + "step": 1708, + "time_per_iteration": 2.5019850730895996 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01054619, + "balance_loss_clip": 1.0325532, + "balance_loss_mlp": 1.0493356, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.9016884094819633, + "language_loss": 0.90944314, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93167639, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1953125, + "step": 1709, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01173643, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03357422, + "balance_loss_mlp": 1.05296373, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 3.826538703219267, + "language_loss": 0.8828221, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90510881, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1710, + "time_per_iteration": 2.533165216445923 + }, + { + "auxiliary_loss_clip": 0.01167248, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.04937959, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.824520485293549, + "language_loss": 0.79264998, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485879, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 1711, + "time_per_iteration": 2.4947102069854736 + }, + { + "auxiliary_loss_clip": 0.01171963, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.05005431, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.0689984646996016, + "language_loss": 0.73589319, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7581948, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1712, + "time_per_iteration": 2.521899461746216 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.04961872, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8460865361447714, + "language_loss": 0.86673403, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8889854, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1640625, + "step": 1713, + "time_per_iteration": 2.467824935913086 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01059926, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.04741335, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.6691144860495126, + "language_loss": 0.72610664, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74837124, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1714, + "time_per_iteration": 2.478605031967163 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03542209, + "balance_loss_mlp": 1.04920006, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.3323118637090605, + "language_loss": 0.91395295, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93626636, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2421875, + "step": 1715, + "time_per_iteration": 2.5223729610443115 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04737377, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.9302110224144968, + "language_loss": 0.75736755, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77957708, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1716, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.01171415, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.04601645, + "balance_loss_mlp": 1.04868793, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1161503252482747, + "language_loss": 0.85214567, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87454176, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1717, + "time_per_iteration": 2.500964879989624 + }, + { + "auxiliary_loss_clip": 0.01169937, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.05102515, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.0308520014155746, + "language_loss": 0.82883167, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85109091, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1875, + "step": 1718, + "time_per_iteration": 2.436836004257202 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.03058898, + "balance_loss_mlp": 1.05092025, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8725763890619624, + "language_loss": 0.73192763, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75414634, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1719, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01172065, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05197001, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.356604748076592, + "language_loss": 0.92601806, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94820189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.203125, + "step": 1720, + "time_per_iteration": 2.4628992080688477 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.04656935, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.8075298743139174, + "language_loss": 0.79416633, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81638062, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2109375, + "step": 1721, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01056525, + "balance_loss_clip": 1.03317165, + "balance_loss_mlp": 1.05172479, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.496468299898097, + "language_loss": 0.80755401, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82988858, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.25, + "step": 1722, + "time_per_iteration": 2.4676520824432373 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.04772782, + "balance_loss_mlp": 1.013726, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9564367479099696, + "language_loss": 0.67185652, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69292337, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.4296875, + "step": 1723, + "time_per_iteration": 2.8474721908569336 + }, + { + "auxiliary_loss_clip": 0.01170693, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.04747462, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.780632359822339, + "language_loss": 0.77922273, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1724, + "time_per_iteration": 2.4311840534210205 + }, + { + "auxiliary_loss_clip": 0.01175556, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.05101144, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8180629527722856, + "language_loss": 0.74894094, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77122545, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1725, + "time_per_iteration": 2.6802284717559814 + }, + { + "auxiliary_loss_clip": 0.01170353, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.02852905, + "balance_loss_mlp": 1.05098462, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.4392097975248244, + "language_loss": 0.75290418, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77510113, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1726, + "time_per_iteration": 5.461729049682617 + }, + { + "auxiliary_loss_clip": 0.01174745, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03765321, + "balance_loss_mlp": 1.0527426, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8824890959349092, + "language_loss": 0.73943913, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76178271, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1727, + "time_per_iteration": 3.883134126663208 + }, + { + "auxiliary_loss_clip": 0.01169505, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03023946, + "balance_loss_mlp": 1.04815936, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.187385195417556, + "language_loss": 0.84670323, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86891311, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1728, + "time_per_iteration": 2.4405598640441895 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.02980709, + "balance_loss_mlp": 1.05098438, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4528097766615677, + "language_loss": 0.70985407, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73207992, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1729, + "time_per_iteration": 2.465688467025757 + }, + { + "auxiliary_loss_clip": 0.01170997, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.05000722, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5788681057232625, + "language_loss": 0.81288344, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.8351925, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1730, + "time_per_iteration": 2.4582717418670654 + }, + { + "auxiliary_loss_clip": 0.01167657, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.03190255, + "balance_loss_mlp": 1.04836845, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 2.1021084439253723, + "language_loss": 0.75932384, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78151548, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1953125, + "step": 1731, + "time_per_iteration": 2.4650096893310547 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.04899907, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8082651510271561, + "language_loss": 0.82679468, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84891117, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1732, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01169252, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.02881873, + "balance_loss_mlp": 1.05052853, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.755876599624297, + "language_loss": 0.82947195, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85164732, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1733, + "time_per_iteration": 2.4426257610321045 + }, + { + "auxiliary_loss_clip": 0.01171007, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.04982805, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4971959439308336, + "language_loss": 0.76446331, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78669679, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.2109375, + "step": 1734, + "time_per_iteration": 2.4556663036346436 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.02795696, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9861442095390862, + "language_loss": 0.74962163, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1796875, + "step": 1735, + "time_per_iteration": 2.4961798191070557 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.02724743, + "balance_loss_mlp": 1.05081487, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9829662552727403, + "language_loss": 0.79049939, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8127073, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1736, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.0491184, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8105684861006923, + "language_loss": 0.70339012, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72563159, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.171875, + "step": 1737, + "time_per_iteration": 2.4789419174194336 + }, + { + "auxiliary_loss_clip": 0.01170601, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.02758932, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.004598680960266, + "language_loss": 0.81483257, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83704984, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.234375, + "step": 1738, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.0116919, + "auxiliary_loss_mlp": 0.01058357, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.04712963, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 4.442978598454381, + "language_loss": 0.750579, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77285445, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1739, + "time_per_iteration": 2.4544031620025635 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.0349865, + "balance_loss_mlp": 1.04893625, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.676051388115223, + "language_loss": 0.77279431, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79503429, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1740, + "time_per_iteration": 2.489302635192871 + }, + { + "auxiliary_loss_clip": 0.01169756, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.02820003, + "balance_loss_mlp": 1.05093944, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.1911967502326775, + "language_loss": 0.85983682, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88201964, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1741, + "time_per_iteration": 2.4571211338043213 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03091609, + "balance_loss_mlp": 1.04901385, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 4.086245960730198, + "language_loss": 0.74991679, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77216244, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1742, + "time_per_iteration": 2.4919426441192627 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_clip": 1.03914368, + "balance_loss_mlp": 1.05323386, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9550995481311175, + "language_loss": 0.87150526, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89381945, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1743, + "time_per_iteration": 2.470841884613037 + }, + { + "auxiliary_loss_clip": 0.01168292, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.02760363, + "balance_loss_mlp": 1.04964471, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.1176645115958923, + "language_loss": 0.75532508, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77750671, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1744, + "time_per_iteration": 2.4725873470306396 + }, + { + "auxiliary_loss_clip": 0.01171079, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.05184436, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.151699961275852, + "language_loss": 0.79306591, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530583, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1745, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_clip": 1.04194999, + "balance_loss_mlp": 1.047683, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.087314316255438, + "language_loss": 0.82382894, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8461262, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1746, + "time_per_iteration": 2.520306348800659 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01062461, + "balance_loss_clip": 1.04186153, + "balance_loss_mlp": 1.05198646, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.645771273172373, + "language_loss": 0.69951761, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7218436, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1747, + "time_per_iteration": 2.618581771850586 + }, + { + "auxiliary_loss_clip": 0.01176288, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04140496, + "balance_loss_mlp": 1.05136323, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.3385484358742192, + "language_loss": 0.84245849, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86484385, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1748, + "time_per_iteration": 2.539386034011841 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.02797103, + "balance_loss_mlp": 1.04729426, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.8953667439120294, + "language_loss": 0.71491921, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7370674, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1796875, + "step": 1749, + "time_per_iteration": 2.481700897216797 + }, + { + "auxiliary_loss_clip": 0.01166695, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.04953468, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.9760411129591238, + "language_loss": 0.81960011, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84187424, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1750, + "time_per_iteration": 2.4454832077026367 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.05259562, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3402404294313524, + "language_loss": 0.91871023, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94105875, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1751, + "time_per_iteration": 2.416607141494751 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.02698207, + "balance_loss_mlp": 1.04889047, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.265296057434122, + "language_loss": 0.79560149, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81774485, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1796875, + "step": 1752, + "time_per_iteration": 2.46063494682312 + }, + { + "auxiliary_loss_clip": 0.01167711, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.03202033, + "balance_loss_mlp": 1.05050862, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.1401152378303867, + "language_loss": 0.75782037, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78002656, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1753, + "time_per_iteration": 2.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01172527, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.04939532, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0790136174876546, + "language_loss": 0.84048498, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86278164, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.234375, + "step": 1754, + "time_per_iteration": 2.4683756828308105 + }, + { + "auxiliary_loss_clip": 0.01175207, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.05438888, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8736094439376645, + "language_loss": 0.68956709, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71185535, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1755, + "time_per_iteration": 2.45597243309021 + }, + { + "auxiliary_loss_clip": 0.01172827, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.03989983, + "balance_loss_mlp": 1.05102587, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 5.502613786824721, + "language_loss": 0.76718754, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78953344, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1756, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.03712726, + "balance_loss_mlp": 1.04982626, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7784869470084927, + "language_loss": 0.80162531, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82392669, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1757, + "time_per_iteration": 2.4551706314086914 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.03499317, + "balance_loss_mlp": 1.05132246, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.631431596421375, + "language_loss": 0.78800333, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81028521, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1758, + "time_per_iteration": 2.7955896854400635 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_clip": 1.02865982, + "balance_loss_mlp": 1.05364573, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.609159841262955, + "language_loss": 0.9189958, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94127536, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.234375, + "step": 1759, + "time_per_iteration": 2.4853782653808594 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.03276575, + "balance_loss_mlp": 1.04970741, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.498568213886603, + "language_loss": 0.76932353, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79161119, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.234375, + "step": 1760, + "time_per_iteration": 2.470705509185791 + }, + { + "auxiliary_loss_clip": 0.01173982, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.03825736, + "balance_loss_mlp": 1.05152941, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.349800445259612, + "language_loss": 0.89282435, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91517675, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1761, + "time_per_iteration": 2.491501569747925 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0518589, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.4794664397863877, + "language_loss": 0.78304708, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80538261, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1762, + "time_per_iteration": 2.5563831329345703 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01014008, + "balance_loss_clip": 1.0110991, + "balance_loss_mlp": 1.02000487, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.753444103392694, + "language_loss": 0.60481733, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62557811, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.421875, + "step": 1763, + "time_per_iteration": 3.2239294052124023 + }, + { + "auxiliary_loss_clip": 0.01170891, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02777529, + "balance_loss_mlp": 1.04924011, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.054980370260194, + "language_loss": 0.8010751, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82327372, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1764, + "time_per_iteration": 2.476325273513794 + }, + { + "auxiliary_loss_clip": 0.01169028, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.02745855, + "balance_loss_mlp": 1.04961264, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7621956234955212, + "language_loss": 0.7999962, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82217997, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1765, + "time_per_iteration": 2.446593999862671 + }, + { + "auxiliary_loss_clip": 0.01167126, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.03394008, + "balance_loss_mlp": 1.04794002, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.867239621884004, + "language_loss": 0.76693732, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78915727, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1766, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01170332, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.05017042, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6580981789618001, + "language_loss": 0.77319431, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1767, + "time_per_iteration": 2.542797088623047 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01000453, + "balance_loss_clip": 0.99785471, + "balance_loss_mlp": 1.01804066, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6789245534488961, + "language_loss": 0.57902765, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59963286, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.421875, + "step": 1768, + "time_per_iteration": 6.071596384048462 + }, + { + "auxiliary_loss_clip": 0.01172748, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.0219171, + "balance_loss_mlp": 1.05201912, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.446404125156032, + "language_loss": 0.86796767, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89011335, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1769, + "time_per_iteration": 2.5106868743896484 + }, + { + "auxiliary_loss_clip": 0.01175908, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.05300689, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.766851816283336, + "language_loss": 0.61890501, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64123213, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1770, + "time_per_iteration": 2.5770323276519775 + }, + { + "auxiliary_loss_clip": 0.01061292, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00058925, + "balance_loss_mlp": 1.01873469, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8864779346546747, + "language_loss": 0.57095039, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59159505, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.42578125, + "step": 1771, + "time_per_iteration": 2.957993507385254 + }, + { + "auxiliary_loss_clip": 0.01174087, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.05443954, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6398085638646198, + "language_loss": 0.88530469, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90767658, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1772, + "time_per_iteration": 2.520744562149048 + }, + { + "auxiliary_loss_clip": 0.01176768, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.05091381, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.8236986107629094, + "language_loss": 0.76021719, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78262091, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.2578125, + "step": 1773, + "time_per_iteration": 2.4228129386901855 + }, + { + "auxiliary_loss_clip": 0.01171647, + "auxiliary_loss_mlp": 0.01063224, + "balance_loss_clip": 1.04087257, + "balance_loss_mlp": 1.05147731, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1931291175477177, + "language_loss": 0.83184093, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85418963, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1774, + "time_per_iteration": 2.5613787174224854 + }, + { + "auxiliary_loss_clip": 0.01177598, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.05220413, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.683505024819064, + "language_loss": 0.76297373, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78529418, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.25, + "step": 1775, + "time_per_iteration": 2.437676429748535 + }, + { + "auxiliary_loss_clip": 0.01057587, + "auxiliary_loss_mlp": 0.01006639, + "balance_loss_clip": 1.00413537, + "balance_loss_mlp": 1.01520467, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8253045983972309, + "language_loss": 0.57443953, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59508181, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.42382812, + "step": 1776, + "time_per_iteration": 3.101378917694092 + }, + { + "auxiliary_loss_clip": 0.01176962, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.0365653, + "balance_loss_mlp": 1.05411029, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6481869723516467, + "language_loss": 0.83374244, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8561098, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2265625, + "step": 1777, + "time_per_iteration": 2.5109002590179443 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.03371584, + "balance_loss_mlp": 1.05298579, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.6420984425067013, + "language_loss": 0.87275863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501154, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1778, + "time_per_iteration": 2.503103494644165 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.032354, + "balance_loss_mlp": 1.05328, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.213225731734914, + "language_loss": 0.83970487, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86199337, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1779, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01169562, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.03086162, + "balance_loss_mlp": 1.04975557, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.4959309518827655, + "language_loss": 0.67064941, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69286621, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1780, + "time_per_iteration": 2.447756052017212 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046888, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.05183458, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.25546613947904, + "language_loss": 0.91667759, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93886495, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1781, + "time_per_iteration": 2.4367144107818604 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01046071, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 1.05302, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.202402738572802, + "language_loss": 0.79505372, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81726873, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2265625, + "step": 1782, + "time_per_iteration": 2.4340877532958984 + }, + { + "auxiliary_loss_clip": 0.01175468, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.03055024, + "balance_loss_mlp": 1.0517509, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0445491568240994, + "language_loss": 0.78994977, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81222689, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.234375, + "step": 1783, + "time_per_iteration": 2.434527635574341 + }, + { + "auxiliary_loss_clip": 0.01176375, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.02991986, + "balance_loss_mlp": 1.0529108, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8050343336808015, + "language_loss": 0.85956216, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88184798, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1784, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.01174134, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.05080986, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.0774406347184806, + "language_loss": 1.00899053, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03127265, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.234375, + "step": 1785, + "time_per_iteration": 2.46663498878479 + }, + { + "auxiliary_loss_clip": 0.01171119, + "auxiliary_loss_mlp": 0.01057254, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05306709, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4495948735276882, + "language_loss": 0.85070992, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87299371, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1796875, + "step": 1786, + "time_per_iteration": 2.505018949508667 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.04750311, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8788886178726656, + "language_loss": 0.78817046, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81046188, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1787, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01176938, + "auxiliary_loss_mlp": 0.01049821, + "balance_loss_clip": 1.02768385, + "balance_loss_mlp": 1.0517112, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.551869220071384, + "language_loss": 0.82557851, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84784609, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.25, + "step": 1788, + "time_per_iteration": 2.4807305335998535 + }, + { + "auxiliary_loss_clip": 0.01170019, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.04939878, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.778852512980128, + "language_loss": 0.77794182, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80027628, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1789, + "time_per_iteration": 2.482330322265625 + }, + { + "auxiliary_loss_clip": 0.01173111, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.05133712, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.434124451319009, + "language_loss": 0.74467903, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76702261, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.21875, + "step": 1790, + "time_per_iteration": 2.5921239852905273 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.05428767, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.5839507236364554, + "language_loss": 0.78495383, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80745554, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.234375, + "step": 1791, + "time_per_iteration": 2.5242488384246826 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01053897, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.05112934, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.8605555947944812, + "language_loss": 0.70855284, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73076522, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1792, + "time_per_iteration": 2.5260751247406006 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.02330506, + "balance_loss_mlp": 1.05109024, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.973355145299492, + "language_loss": 0.76029646, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78251767, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1793, + "time_per_iteration": 2.5037007331848145 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.03793848, + "balance_loss_mlp": 1.0537113, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7251623627880495, + "language_loss": 0.85158944, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87391031, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1794, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01180393, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.02954292, + "balance_loss_mlp": 1.05342674, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9245153565321482, + "language_loss": 0.74914879, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77148265, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.265625, + "step": 1795, + "time_per_iteration": 2.486111879348755 + }, + { + "auxiliary_loss_clip": 0.0117609, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.03863525, + "balance_loss_mlp": 1.05227423, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.917857918230487, + "language_loss": 0.8116014, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83397192, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1796, + "time_per_iteration": 2.4357504844665527 + }, + { + "auxiliary_loss_clip": 0.01177296, + "auxiliary_loss_mlp": 0.01075942, + "balance_loss_clip": 1.05260134, + "balance_loss_mlp": 1.05476594, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.4043777768562293, + "language_loss": 0.73476732, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75729972, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1797, + "time_per_iteration": 2.477867841720581 + }, + { + "auxiliary_loss_clip": 0.01172695, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.03081274, + "balance_loss_mlp": 1.05260658, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 3.1892188654982396, + "language_loss": 0.81348622, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83572453, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1798, + "time_per_iteration": 2.5060064792633057 + }, + { + "auxiliary_loss_clip": 0.011719, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03075755, + "balance_loss_mlp": 1.0508821, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4195393058725623, + "language_loss": 0.85180116, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87405908, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2109375, + "step": 1799, + "time_per_iteration": 2.4546945095062256 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.04271412, + "balance_loss_mlp": 1.0546999, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.2474252534922265, + "language_loss": 0.77365196, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79602301, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.1796875, + "step": 1800, + "time_per_iteration": 2.4650769233703613 + }, + { + "auxiliary_loss_clip": 0.01168665, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.02443254, + "balance_loss_mlp": 1.05136347, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 2.2954016650766844, + "language_loss": 0.7287963, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7509284, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1801, + "time_per_iteration": 2.5045113563537598 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.05259442, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.8364602771794378, + "language_loss": 0.66427058, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68653458, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1802, + "time_per_iteration": 2.5547947883605957 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.05202222, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7898565484043845, + "language_loss": 0.8136133, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83590758, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1803, + "time_per_iteration": 2.4758658409118652 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.031106, + "balance_loss_mlp": 1.05126929, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 2.61974519761109, + "language_loss": 0.9122982, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93452168, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1875, + "step": 1804, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.01175328, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03031349, + "balance_loss_mlp": 1.05401301, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0091269076806078, + "language_loss": 0.7623654, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78464663, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1805, + "time_per_iteration": 2.5379836559295654 + }, + { + "auxiliary_loss_clip": 0.01172079, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.0535754, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8192828849331855, + "language_loss": 0.860416, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88261837, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1806, + "time_per_iteration": 2.5523955821990967 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01053704, + "balance_loss_clip": 1.03275895, + "balance_loss_mlp": 1.05068612, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 5.439462316727856, + "language_loss": 0.80572915, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82797557, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1807, + "time_per_iteration": 2.514390230178833 + }, + { + "auxiliary_loss_clip": 0.01171878, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.05415583, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7684897552837426, + "language_loss": 0.78731525, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80954707, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.171875, + "step": 1808, + "time_per_iteration": 2.5084331035614014 + }, + { + "auxiliary_loss_clip": 0.01176105, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.03090501, + "balance_loss_mlp": 1.05633223, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6609588216066864, + "language_loss": 0.78927523, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81155634, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1953125, + "step": 1809, + "time_per_iteration": 5.368049621582031 + }, + { + "auxiliary_loss_clip": 0.01171492, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.03318286, + "balance_loss_mlp": 1.05087388, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0370215842844197, + "language_loss": 0.8468523, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86910635, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1810, + "time_per_iteration": 3.904432535171509 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.03111291, + "balance_loss_mlp": 1.05665135, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.9531179942167565, + "language_loss": 0.63677633, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6591261, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.234375, + "step": 1811, + "time_per_iteration": 2.523650646209717 + }, + { + "auxiliary_loss_clip": 0.01171345, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.02737319, + "balance_loss_mlp": 1.05139136, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.279966127836369, + "language_loss": 0.74238914, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76459008, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1812, + "time_per_iteration": 2.5579042434692383 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.05391026, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.9314487748153213, + "language_loss": 0.72647583, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74868566, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1875, + "step": 1813, + "time_per_iteration": 2.488762617111206 + }, + { + "auxiliary_loss_clip": 0.01174675, + "auxiliary_loss_mlp": 0.01051455, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.05744648, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 10.097396236718186, + "language_loss": 0.82224226, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84450358, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1814, + "time_per_iteration": 2.495798349380493 + }, + { + "auxiliary_loss_clip": 0.01176897, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.05595291, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3551509805271422, + "language_loss": 0.84218144, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86452949, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2109375, + "step": 1815, + "time_per_iteration": 2.462663173675537 + }, + { + "auxiliary_loss_clip": 0.01175955, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.03054035, + "balance_loss_mlp": 1.05833483, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3954928768695027, + "language_loss": 0.71048725, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73277813, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.171875, + "step": 1816, + "time_per_iteration": 2.465953826904297 + }, + { + "auxiliary_loss_clip": 0.01178612, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.04061651, + "balance_loss_mlp": 1.056674, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0063973144433067, + "language_loss": 0.72811669, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75053406, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1817, + "time_per_iteration": 2.5323143005371094 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_clip": 1.03167605, + "balance_loss_mlp": 1.05709267, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.767365755633268, + "language_loss": 0.67279243, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6951232, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1818, + "time_per_iteration": 2.5450243949890137 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.04123521, + "balance_loss_mlp": 1.05534315, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.0802139312896744, + "language_loss": 0.72992313, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75232661, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1819, + "time_per_iteration": 2.487644910812378 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02994883, + "balance_loss_mlp": 1.06090236, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 7.240077427900601, + "language_loss": 0.73943537, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76175541, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.203125, + "step": 1820, + "time_per_iteration": 2.5064899921417236 + }, + { + "auxiliary_loss_clip": 0.01177081, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02765381, + "balance_loss_mlp": 1.05699766, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.1677198782015887, + "language_loss": 0.82586408, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84814322, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 1821, + "time_per_iteration": 2.4487218856811523 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.0280906, + "balance_loss_mlp": 1.05549288, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.4783722356243065, + "language_loss": 0.76171732, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78395414, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1822, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01175357, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.03070986, + "balance_loss_mlp": 1.05751145, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.9066217775511896, + "language_loss": 0.79275787, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81502879, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1796875, + "step": 1823, + "time_per_iteration": 2.5665249824523926 + }, + { + "auxiliary_loss_clip": 0.01176588, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.02583015, + "balance_loss_mlp": 1.05788529, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.7066251744315906, + "language_loss": 0.79424715, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81649172, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1824, + "time_per_iteration": 2.5238118171691895 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.05610347, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.2183246130345, + "language_loss": 0.87992203, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90220273, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1825, + "time_per_iteration": 2.48294734954834 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.05362988, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8404731426595848, + "language_loss": 0.76462233, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78689909, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1826, + "time_per_iteration": 2.6397035121917725 + }, + { + "auxiliary_loss_clip": 0.01066703, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 0.9983961, + "balance_loss_mlp": 1.02257371, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8361632453995619, + "language_loss": 0.54999328, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57067442, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.44140625, + "step": 1827, + "time_per_iteration": 3.065896511077881 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01003719, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.02098036, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7348311418426204, + "language_loss": 0.55346334, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57414544, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.43359375, + "step": 1828, + "time_per_iteration": 3.0850460529327393 + }, + { + "auxiliary_loss_clip": 0.01180205, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.05754089, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.992065013624077, + "language_loss": 0.84191215, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86435115, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2265625, + "step": 1829, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01057367, + "balance_loss_clip": 1.03348923, + "balance_loss_mlp": 1.05845475, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.654516298718269, + "language_loss": 0.8878119, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91019976, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2265625, + "step": 1830, + "time_per_iteration": 2.6912100315093994 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01017752, + "balance_loss_clip": 1.01497495, + "balance_loss_mlp": 1.01824236, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6883241829767079, + "language_loss": 0.55492055, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570827, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.42773438, + "step": 1831, + "time_per_iteration": 3.075678825378418 + }, + { + "auxiliary_loss_clip": 0.01183643, + "auxiliary_loss_mlp": 0.01072422, + "balance_loss_clip": 1.04829443, + "balance_loss_mlp": 1.05867732, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.054835171188452, + "language_loss": 0.90726995, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92983055, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.25, + "step": 1832, + "time_per_iteration": 2.5084948539733887 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01015171, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01603723, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7159549093535102, + "language_loss": 0.59889859, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61963969, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.4296875, + "step": 1833, + "time_per_iteration": 3.0748977661132812 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_clip": 1.0277946, + "balance_loss_mlp": 1.05353165, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6030857455850303, + "language_loss": 0.8095156, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83177137, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1834, + "time_per_iteration": 2.452131509780884 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.0302285, + "balance_loss_mlp": 1.05899858, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.5262438386564807, + "language_loss": 0.90514123, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9274807, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2109375, + "step": 1835, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01177237, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.03445005, + "balance_loss_mlp": 1.05625033, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.0785934228774003, + "language_loss": 0.63590646, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65826416, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2109375, + "step": 1836, + "time_per_iteration": 2.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01173672, + "auxiliary_loss_mlp": 0.01057372, + "balance_loss_clip": 1.03547311, + "balance_loss_mlp": 1.05388379, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9330421575083043, + "language_loss": 0.72814602, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75045645, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1837, + "time_per_iteration": 2.594910144805908 + }, + { + "auxiliary_loss_clip": 0.01179947, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.132041599419941, + "language_loss": 0.79049784, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81289005, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1838, + "time_per_iteration": 2.4922690391540527 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.05623114, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 4.130442583787946, + "language_loss": 0.71453696, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73690271, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1839, + "time_per_iteration": 2.5151031017303467 + }, + { + "auxiliary_loss_clip": 0.01183988, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.03082371, + "balance_loss_mlp": 1.05938148, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.064820600929851, + "language_loss": 0.79099703, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1840, + "time_per_iteration": 2.487116575241089 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01053863, + "balance_loss_clip": 1.03234553, + "balance_loss_mlp": 1.05329347, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 2.0204098875762293, + "language_loss": 0.87691998, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89917111, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1796875, + "step": 1841, + "time_per_iteration": 2.5141868591308594 + }, + { + "auxiliary_loss_clip": 0.01170262, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.03126192, + "balance_loss_mlp": 1.05365491, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.3404569451138535, + "language_loss": 0.90582979, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92806768, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1842, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.0117179, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.03080761, + "balance_loss_mlp": 1.05210185, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.171204868901281, + "language_loss": 0.85597986, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87821329, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1843, + "time_per_iteration": 2.4801278114318848 + }, + { + "auxiliary_loss_clip": 0.01174004, + "auxiliary_loss_mlp": 0.01060021, + "balance_loss_clip": 1.03645349, + "balance_loss_mlp": 1.05622911, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.47825888700324, + "language_loss": 0.7494424, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77178264, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1796875, + "step": 1844, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01173241, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.0304563, + "balance_loss_mlp": 1.05405343, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0036363505702433, + "language_loss": 0.75732028, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77959603, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.1953125, + "step": 1845, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03341389, + "balance_loss_mlp": 1.05351365, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.6129010657048202, + "language_loss": 0.76336479, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7856214, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.171875, + "step": 1846, + "time_per_iteration": 2.465045928955078 + }, + { + "auxiliary_loss_clip": 0.01175917, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.03262115, + "balance_loss_mlp": 1.05392015, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.9965527726637577, + "language_loss": 0.85611343, + "learning_rate": 3.931489981933584e-06, + "loss": 0.87841111, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1847, + "time_per_iteration": 2.4493908882141113 + }, + { + "auxiliary_loss_clip": 0.01174539, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03018796, + "balance_loss_mlp": 1.05326366, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.3740806549350086, + "language_loss": 0.76464605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78692293, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.2109375, + "step": 1848, + "time_per_iteration": 2.4647536277770996 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02754378, + "balance_loss_mlp": 1.05833888, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 2.0750561163348173, + "language_loss": 0.77849847, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8007198, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1849, + "time_per_iteration": 2.514777660369873 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.03496861, + "balance_loss_mlp": 1.05422294, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.6662643697478066, + "language_loss": 0.71315688, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73548102, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1850, + "time_per_iteration": 2.4420053958892822 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.05444217, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2721050151861912, + "language_loss": 0.81174368, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83405614, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 1851, + "time_per_iteration": 5.341679811477661 + }, + { + "auxiliary_loss_clip": 0.01173679, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.03126621, + "balance_loss_mlp": 1.05519962, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.240427658931177, + "language_loss": 0.88860446, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91085827, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1852, + "time_per_iteration": 3.8281352519989014 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.03514326, + "balance_loss_mlp": 1.05636191, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0685366180695848, + "language_loss": 0.72092974, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74327302, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1853, + "time_per_iteration": 2.4896738529205322 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 0.99923038, + "balance_loss_mlp": 1.0132798, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7783537669608381, + "language_loss": 0.53647029, + "learning_rate": 3.930780749680273e-06, + "loss": 0.5570485, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.42578125, + "step": 1854, + "time_per_iteration": 3.0189781188964844 + }, + { + "auxiliary_loss_clip": 0.01184355, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02937746, + "balance_loss_mlp": 1.057657, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.006296213399466, + "language_loss": 0.8394689, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.861835, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.265625, + "step": 1855, + "time_per_iteration": 2.4908485412597656 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.0106694, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.05353498, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2091175797191815, + "language_loss": 0.82098675, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84339261, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.203125, + "step": 1856, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02631092, + "balance_loss_mlp": 1.05662763, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.9605277294776, + "language_loss": 0.8305279, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85274535, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1857, + "time_per_iteration": 2.5205907821655273 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.0279119, + "balance_loss_mlp": 1.05195725, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3309612964817923, + "language_loss": 0.83037764, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85260725, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.21875, + "step": 1858, + "time_per_iteration": 2.4567432403564453 + }, + { + "auxiliary_loss_clip": 0.01175678, + "auxiliary_loss_mlp": 0.01062921, + "balance_loss_clip": 1.04205894, + "balance_loss_mlp": 1.05549788, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.004830650729854, + "language_loss": 0.91120583, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93359184, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1859, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.02983618, + "balance_loss_mlp": 1.05344319, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.4768392741235306, + "language_loss": 0.81709313, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83934522, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1860, + "time_per_iteration": 2.4747087955474854 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.0361197, + "balance_loss_mlp": 1.05388653, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.1256274007234937, + "language_loss": 0.75203162, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77430284, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1861, + "time_per_iteration": 2.4773240089416504 + }, + { + "auxiliary_loss_clip": 0.01169857, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.03318143, + "balance_loss_mlp": 1.05338371, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.0016824982414776, + "language_loss": 0.88759935, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90982509, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1640625, + "step": 1862, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01173358, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.05597067, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1858127473987015, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89302975, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 1863, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051873, + "balance_loss_clip": 1.0283289, + "balance_loss_mlp": 1.05463171, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0887108243102976, + "language_loss": 0.64630157, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66856015, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.1953125, + "step": 1864, + "time_per_iteration": 2.4843807220458984 + }, + { + "auxiliary_loss_clip": 0.01171142, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.03169096, + "balance_loss_mlp": 1.05504417, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0715232833306874, + "language_loss": 0.73895639, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76117796, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1865, + "time_per_iteration": 2.4509596824645996 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02928007, + "balance_loss_mlp": 1.05253589, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.190736679244475, + "language_loss": 0.84019023, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86240977, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 1866, + "time_per_iteration": 2.473715305328369 + }, + { + "auxiliary_loss_clip": 0.01169711, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.02737069, + "balance_loss_mlp": 1.05260134, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5419857436109028, + "language_loss": 0.81424987, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83643156, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1867, + "time_per_iteration": 2.5367391109466553 + }, + { + "auxiliary_loss_clip": 0.01172987, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.05594015, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.5308159777425976, + "language_loss": 0.86677599, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88905597, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1868, + "time_per_iteration": 2.5044100284576416 + }, + { + "auxiliary_loss_clip": 0.01172172, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.05724931, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.333499600894065, + "language_loss": 0.68059367, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70281279, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.1484375, + "step": 1869, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01061629, + "balance_loss_clip": 1.03969407, + "balance_loss_mlp": 1.05456114, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 2.049754856307833, + "language_loss": 0.7735095, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79589236, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1870, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.03177094, + "balance_loss_mlp": 1.05264199, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8085683914823212, + "language_loss": 0.75747174, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77974463, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1871, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.05448031, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.4822417703451265, + "language_loss": 0.81949306, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84170687, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.171875, + "step": 1872, + "time_per_iteration": 2.4984912872314453 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.05497694, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7250665555581937, + "language_loss": 0.83564019, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85789096, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1796875, + "step": 1873, + "time_per_iteration": 2.480172872543335 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.05352998, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.2103217259008985, + "language_loss": 0.91925669, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9415459, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1874, + "time_per_iteration": 2.5193865299224854 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.0528394, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5656160151577971, + "language_loss": 0.7534616, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.15625, + "step": 1875, + "time_per_iteration": 2.509000062942505 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.05498421, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.875753927893446, + "language_loss": 0.71727258, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73950088, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1328125, + "step": 1876, + "time_per_iteration": 2.5222911834716797 + }, + { + "auxiliary_loss_clip": 0.01170022, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.03036463, + "balance_loss_mlp": 1.05574679, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.408917627715415, + "language_loss": 0.76760256, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78981495, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 1877, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01173931, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.05530918, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.569804002246691, + "language_loss": 0.88132238, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1878, + "time_per_iteration": 2.4562089443206787 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.02628088, + "balance_loss_mlp": 1.05382609, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2792620862185036, + "language_loss": 0.81521666, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83739763, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.171875, + "step": 1879, + "time_per_iteration": 2.515162944793701 + }, + { + "auxiliary_loss_clip": 0.01174903, + "auxiliary_loss_mlp": 0.01056113, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.05591071, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.9729184409385376, + "language_loss": 0.70101768, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72332788, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1880, + "time_per_iteration": 2.5420267581939697 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.05396068, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.7442831242084353, + "language_loss": 0.72337204, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74552047, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1881, + "time_per_iteration": 2.4648680686950684 + }, + { + "auxiliary_loss_clip": 0.01172977, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.03452694, + "balance_loss_mlp": 1.05385113, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4452990726029533, + "language_loss": 0.74243963, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76474178, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1953125, + "step": 1882, + "time_per_iteration": 2.459181547164917 + }, + { + "auxiliary_loss_clip": 0.01171271, + "auxiliary_loss_mlp": 0.01052266, + "balance_loss_clip": 1.03045106, + "balance_loss_mlp": 1.05493677, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.8641228673356873, + "language_loss": 0.79328096, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81551635, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 1883, + "time_per_iteration": 2.5236945152282715 + }, + { + "auxiliary_loss_clip": 0.01173507, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.03271067, + "balance_loss_mlp": 1.05288672, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.0524763398538193, + "language_loss": 0.77151698, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79379749, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1884, + "time_per_iteration": 2.4974489212036133 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01012749, + "balance_loss_clip": 1.0102694, + "balance_loss_mlp": 1.02156711, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7928734254501784, + "language_loss": 0.55183071, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5725978, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.42382812, + "step": 1885, + "time_per_iteration": 2.9756290912628174 + }, + { + "auxiliary_loss_clip": 0.01166272, + "auxiliary_loss_mlp": 0.01047922, + "balance_loss_clip": 1.02758515, + "balance_loss_mlp": 1.0534817, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 5.752063942495911, + "language_loss": 0.90240276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92454469, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 1886, + "time_per_iteration": 2.5031139850616455 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03018832, + "balance_loss_mlp": 1.05306387, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0267704425546036, + "language_loss": 0.85101235, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87321353, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1887, + "time_per_iteration": 2.5177412033081055 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01061982, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.05554259, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.5783153731033055, + "language_loss": 0.76168925, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1888, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.03542566, + "balance_loss_mlp": 1.05632472, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.114301103868513, + "language_loss": 0.68039739, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70275331, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.21875, + "step": 1889, + "time_per_iteration": 2.643867015838623 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.05620956, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.158184033346157, + "language_loss": 0.84414917, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86635208, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 1890, + "time_per_iteration": 2.5018270015716553 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02855039, + "balance_loss_mlp": 1.05288363, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.2859967152973373, + "language_loss": 0.65099049, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67317504, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 1891, + "time_per_iteration": 2.4870762825012207 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.05397856, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.358390081637715, + "language_loss": 0.87789619, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90005904, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1953125, + "step": 1892, + "time_per_iteration": 2.469215154647827 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.04509139, + "balance_loss_mlp": 1.05419993, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4185703679999775, + "language_loss": 0.72724342, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7496407, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 1893, + "time_per_iteration": 4.021688222885132 + }, + { + "auxiliary_loss_clip": 0.01169367, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.05175805, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.254020248775613, + "language_loss": 0.79367435, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81595069, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.171875, + "step": 1894, + "time_per_iteration": 3.9190711975097656 + }, + { + "auxiliary_loss_clip": 0.01176791, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.05530715, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.587114905294773, + "language_loss": 0.78868139, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.21875, + "step": 1895, + "time_per_iteration": 2.5924861431121826 + }, + { + "auxiliary_loss_clip": 0.0106161, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 0.99917758, + "balance_loss_mlp": 1.01840448, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8005582337036792, + "language_loss": 0.63316774, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65380025, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43359375, + "step": 1896, + "time_per_iteration": 3.143843412399292 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_clip": 1.03600097, + "balance_loss_mlp": 1.05385494, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6058527618620146, + "language_loss": 0.84707338, + "learning_rate": 3.926345380796821e-06, + "loss": 0.86936897, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.15625, + "step": 1897, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.0117262, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.03159046, + "balance_loss_mlp": 1.05385423, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3286063431421926, + "language_loss": 0.79776239, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8200193, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1875, + "step": 1898, + "time_per_iteration": 2.5186216831207275 + }, + { + "auxiliary_loss_clip": 0.01174476, + "auxiliary_loss_mlp": 0.01056562, + "balance_loss_clip": 1.03330398, + "balance_loss_mlp": 1.05247831, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.996095488823442, + "language_loss": 0.73049861, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75280899, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1899, + "time_per_iteration": 2.484767198562622 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.0167762, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9092154832512579, + "language_loss": 0.63432097, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65496433, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.4296875, + "step": 1900, + "time_per_iteration": 3.0239956378936768 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.05181098, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6715138036124124, + "language_loss": 0.78116465, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80344748, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.1875, + "step": 1901, + "time_per_iteration": 2.5007457733154297 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.05482793, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9023337273707566, + "language_loss": 0.83676988, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85908997, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1902, + "time_per_iteration": 2.4389002323150635 + }, + { + "auxiliary_loss_clip": 0.0117356, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05356252, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6958297254772137, + "language_loss": 0.77551281, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79775804, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1903, + "time_per_iteration": 2.503164768218994 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.05437744, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.553861289811236, + "language_loss": 0.75704938, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77922231, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.171875, + "step": 1904, + "time_per_iteration": 2.5097854137420654 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.05519056, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.146045336495955, + "language_loss": 0.92476678, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94702017, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1905, + "time_per_iteration": 2.4905850887298584 + }, + { + "auxiliary_loss_clip": 0.0117632, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.0496794, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.457773566764277, + "language_loss": 0.77108872, + "learning_rate": 3.925399944279861e-06, + "loss": 0.7933597, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.265625, + "step": 1906, + "time_per_iteration": 2.4469265937805176 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.03072143, + "balance_loss_mlp": 1.05375302, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.4555636334810593, + "language_loss": 0.81855345, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84082305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1907, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01173651, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.05599511, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.041607412488977, + "language_loss": 0.84798187, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87037772, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1908, + "time_per_iteration": 2.468519687652588 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.01344705, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9477470057539497, + "language_loss": 0.6100027, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63061339, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.43164062, + "step": 1909, + "time_per_iteration": 2.8313472270965576 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.05660319, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.135894642259737, + "language_loss": 0.78793955, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8102057, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1910, + "time_per_iteration": 2.4613592624664307 + }, + { + "auxiliary_loss_clip": 0.01178149, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.03005373, + "balance_loss_mlp": 1.05803406, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.035949872237615, + "language_loss": 0.76787984, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79017925, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1911, + "time_per_iteration": 2.475069761276245 + }, + { + "auxiliary_loss_clip": 0.01166349, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.03048277, + "balance_loss_mlp": 1.05284548, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.1144124150337023, + "language_loss": 0.7927531, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81493074, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 1912, + "time_per_iteration": 2.543607473373413 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_clip": 1.0369364, + "balance_loss_mlp": 1.05352569, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9322037304643997, + "language_loss": 0.7777245, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80000544, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 1913, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01169525, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.02889776, + "balance_loss_mlp": 1.05118954, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 3.783180746712747, + "language_loss": 0.70389271, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72609949, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1914, + "time_per_iteration": 2.5099785327911377 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 0.99943084, + "balance_loss_mlp": 1.01452589, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7556045547130329, + "language_loss": 0.61044526, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63105142, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.44140625, + "step": 1915, + "time_per_iteration": 3.1735148429870605 + }, + { + "auxiliary_loss_clip": 0.01172283, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_clip": 1.03273964, + "balance_loss_mlp": 1.05674434, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.822924091618307, + "language_loss": 0.9323889, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95465934, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.15625, + "step": 1916, + "time_per_iteration": 2.4806342124938965 + }, + { + "auxiliary_loss_clip": 0.01171086, + "auxiliary_loss_mlp": 0.01061893, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.05340374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8768677942494545, + "language_loss": 0.72286755, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7451973, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.171875, + "step": 1917, + "time_per_iteration": 2.519758701324463 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.05521619, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.2984335892825594, + "language_loss": 0.74389827, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76610279, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 1918, + "time_per_iteration": 2.4867136478424072 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.02394044, + "balance_loss_mlp": 1.05273843, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1981507651696193, + "language_loss": 0.86515707, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88735056, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1919, + "time_per_iteration": 2.4838428497314453 + }, + { + "auxiliary_loss_clip": 0.01168988, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.03190136, + "balance_loss_mlp": 1.05291939, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.516832715272094, + "language_loss": 0.86640596, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88864017, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.15625, + "step": 1920, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01167627, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.05360281, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.2143351457696525, + "language_loss": 0.79792106, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82007331, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 1921, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.01174597, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.03331947, + "balance_loss_mlp": 1.05358851, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 8.96706495073623, + "language_loss": 0.78418177, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8064878, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2109375, + "step": 1922, + "time_per_iteration": 2.5293705463409424 + }, + { + "auxiliary_loss_clip": 0.01174074, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.03910375, + "balance_loss_mlp": 1.05410469, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8482726295091094, + "language_loss": 0.84187758, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86422473, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.203125, + "step": 1923, + "time_per_iteration": 2.5203118324279785 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01053498, + "balance_loss_clip": 1.03074098, + "balance_loss_mlp": 1.05742192, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0576366068601666, + "language_loss": 0.80471247, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1796875, + "step": 1924, + "time_per_iteration": 2.48531436920166 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 0.99917841, + "balance_loss_mlp": 1.0154866, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.810907468185892, + "language_loss": 0.6115036, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6321063, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 3.112396478652954 + }, + { + "auxiliary_loss_clip": 0.01173159, + "auxiliary_loss_mlp": 0.01076027, + "balance_loss_clip": 1.05304384, + "balance_loss_mlp": 1.05447614, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.806943429185086, + "language_loss": 0.7482335, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77072537, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.1875, + "step": 1926, + "time_per_iteration": 2.4890315532684326 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.03873897, + "balance_loss_mlp": 1.0552361, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.429758451090488, + "language_loss": 0.73112315, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7535038, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.203125, + "step": 1927, + "time_per_iteration": 2.4673402309417725 + }, + { + "auxiliary_loss_clip": 0.0117016, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.03244913, + "balance_loss_mlp": 1.05291271, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.854021270140142, + "language_loss": 0.86824137, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89049077, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 1928, + "time_per_iteration": 2.530325412750244 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.03289056, + "balance_loss_mlp": 1.05469573, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.71243688867153, + "language_loss": 0.77567977, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79796684, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1929, + "time_per_iteration": 2.489664316177368 + }, + { + "auxiliary_loss_clip": 0.01168882, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.05385804, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6293868207273203, + "language_loss": 0.76724243, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78955561, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1484375, + "step": 1930, + "time_per_iteration": 2.5867533683776855 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03027928, + "balance_loss_mlp": 1.05313969, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9270697111110349, + "language_loss": 0.72114342, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74335825, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1931, + "time_per_iteration": 2.5218429565429688 + }, + { + "auxiliary_loss_clip": 0.01173627, + "auxiliary_loss_mlp": 0.0105412, + "balance_loss_clip": 1.03168511, + "balance_loss_mlp": 1.05528855, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5295866923660926, + "language_loss": 0.82133794, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84361541, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 1932, + "time_per_iteration": 2.4879212379455566 + }, + { + "auxiliary_loss_clip": 0.01053319, + "auxiliary_loss_mlp": 0.01007659, + "balance_loss_clip": 1.00539386, + "balance_loss_mlp": 1.0111897, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7701959329661775, + "language_loss": 0.61053753, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63114727, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.421875, + "step": 1933, + "time_per_iteration": 2.960437059402466 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03248382, + "balance_loss_mlp": 1.05259895, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.2263920275904425, + "language_loss": 0.85587192, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87813795, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1953125, + "step": 1934, + "time_per_iteration": 5.3810875415802 + }, + { + "auxiliary_loss_clip": 0.01178805, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.05852652, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.481370623449466, + "language_loss": 0.65555394, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67793274, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1935, + "time_per_iteration": 2.483814239501953 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.05533004, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8046174937009931, + "language_loss": 0.75469184, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77699012, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.2109375, + "step": 1936, + "time_per_iteration": 3.8786003589630127 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.05320179, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9600676544166102, + "language_loss": 0.84061754, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86291301, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1953125, + "step": 1937, + "time_per_iteration": 2.5084798336029053 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.05254185, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.0067941571917927, + "language_loss": 0.76479459, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78692102, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.140625, + "step": 1938, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01177239, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.02963328, + "balance_loss_mlp": 1.05566061, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.0274312317590084, + "language_loss": 0.79127967, + "learning_rate": 3.921882769138696e-06, + "loss": 0.8135649, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1939, + "time_per_iteration": 2.5020864009857178 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.05530274, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.7077039427391343, + "language_loss": 0.86712289, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88937664, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1940, + "time_per_iteration": 2.484750270843506 + }, + { + "auxiliary_loss_clip": 0.01172427, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.03699601, + "balance_loss_mlp": 1.05674481, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4506595925957548, + "language_loss": 0.75750297, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7798053, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1941, + "time_per_iteration": 2.7000842094421387 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.05215478, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.1675787105273256, + "language_loss": 0.8828994, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90516704, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.15625, + "step": 1942, + "time_per_iteration": 2.460014581680298 + }, + { + "auxiliary_loss_clip": 0.01170106, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.02839422, + "balance_loss_mlp": 1.05465341, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.688985931696262, + "language_loss": 0.67729998, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69948429, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.15625, + "step": 1943, + "time_per_iteration": 2.5899837017059326 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01046897, + "balance_loss_clip": 1.02586865, + "balance_loss_mlp": 1.05437136, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.2767867948110263, + "language_loss": 0.69852126, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72069371, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1944, + "time_per_iteration": 2.6237125396728516 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.05112338, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.1059371232711572, + "language_loss": 0.82477605, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84690094, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.15625, + "step": 1945, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.05241919, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.378189536328268, + "language_loss": 0.7640717, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.7863518, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 1946, + "time_per_iteration": 2.516782283782959 + }, + { + "auxiliary_loss_clip": 0.01169578, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03249717, + "balance_loss_mlp": 1.05597568, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.040115867247402, + "language_loss": 0.68749321, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70971209, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 1947, + "time_per_iteration": 2.443979501724243 + }, + { + "auxiliary_loss_clip": 0.01173266, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.05761504, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.892409556337103, + "language_loss": 0.84730887, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86967146, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 1948, + "time_per_iteration": 2.456883192062378 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 0.99784815, + "balance_loss_mlp": 1.01743388, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8146373030628324, + "language_loss": 0.65102834, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6716392, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.43359375, + "step": 1949, + "time_per_iteration": 3.083716630935669 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.05524707, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.7265339558443402, + "language_loss": 0.71616268, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73841476, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1328125, + "step": 1950, + "time_per_iteration": 2.5140750408172607 + }, + { + "auxiliary_loss_clip": 0.01174036, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.05524027, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 13.047142281747327, + "language_loss": 0.76811576, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79044604, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1951, + "time_per_iteration": 2.4511098861694336 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.0106409, + "balance_loss_clip": 1.04351449, + "balance_loss_mlp": 1.05736876, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 2.4689531190361858, + "language_loss": 0.75770319, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1952, + "time_per_iteration": 2.5249404907226562 + }, + { + "auxiliary_loss_clip": 0.01170041, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.05350161, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8929141854364566, + "language_loss": 0.71838403, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74068928, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1953, + "time_per_iteration": 2.5321006774902344 + }, + { + "auxiliary_loss_clip": 0.01178671, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.03186345, + "balance_loss_mlp": 1.05794597, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.5505654209141317, + "language_loss": 0.7939415, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 1954, + "time_per_iteration": 2.477182149887085 + }, + { + "auxiliary_loss_clip": 0.01174109, + "auxiliary_loss_mlp": 0.01060284, + "balance_loss_clip": 1.03859961, + "balance_loss_mlp": 1.05628419, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1305529461824344, + "language_loss": 0.85609406, + "learning_rate": 3.920148894924246e-06, + "loss": 0.878438, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1796875, + "step": 1955, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.02949762, + "balance_loss_mlp": 1.05551839, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.149612339355701, + "language_loss": 0.77626467, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79848516, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.171875, + "step": 1956, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.01169266, + "auxiliary_loss_mlp": 0.01054147, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.05667603, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 4.253665449575931, + "language_loss": 0.80333984, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 1957, + "time_per_iteration": 2.508272886276245 + }, + { + "auxiliary_loss_clip": 0.01176684, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.05895627, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 3.1587185145349737, + "language_loss": 0.77638769, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79865301, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1796875, + "step": 1958, + "time_per_iteration": 2.48563551902771 + }, + { + "auxiliary_loss_clip": 0.01174636, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.05859971, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0966272081131985, + "language_loss": 0.76906043, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79128981, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.15625, + "step": 1959, + "time_per_iteration": 2.4826674461364746 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_clip": 1.03128934, + "balance_loss_mlp": 1.05581582, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 3.13785825532277, + "language_loss": 0.69989765, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72212446, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.15625, + "step": 1960, + "time_per_iteration": 2.4965405464172363 + }, + { + "auxiliary_loss_clip": 0.01178622, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.03704309, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.5802576751796327, + "language_loss": 0.81135678, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83372575, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1961, + "time_per_iteration": 2.456537961959839 + }, + { + "auxiliary_loss_clip": 0.01167569, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.05682623, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 3.5009623449342206, + "language_loss": 0.92335653, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94558799, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.109375, + "step": 1962, + "time_per_iteration": 2.4831955432891846 + }, + { + "auxiliary_loss_clip": 0.01175087, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.05849361, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1891263418172353, + "language_loss": 0.87132198, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89361322, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1640625, + "step": 1963, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02764392, + "balance_loss_mlp": 1.05800569, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.1122466665000155, + "language_loss": 0.84163988, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86385566, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1640625, + "step": 1964, + "time_per_iteration": 2.496471405029297 + }, + { + "auxiliary_loss_clip": 0.01178376, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.06327403, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.965243610427017, + "language_loss": 0.82994169, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85229176, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1484375, + "step": 1965, + "time_per_iteration": 2.46545672416687 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05948591, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6968751772896917, + "language_loss": 0.74517393, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76741493, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 1966, + "time_per_iteration": 2.730928421020508 + }, + { + "auxiliary_loss_clip": 0.01185811, + "auxiliary_loss_mlp": 0.01055482, + "balance_loss_clip": 1.03552604, + "balance_loss_mlp": 1.0661025, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.573953561090722, + "language_loss": 0.725128, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74754095, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1953125, + "step": 1967, + "time_per_iteration": 2.459409713745117 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.02409899, + "balance_loss_mlp": 1.0596199, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.07735233424318, + "language_loss": 0.87874025, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90092969, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1968, + "time_per_iteration": 2.474860191345215 + }, + { + "auxiliary_loss_clip": 0.0117476, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.05980873, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.3710109771053904, + "language_loss": 0.66827953, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69053805, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1484375, + "step": 1969, + "time_per_iteration": 2.5025057792663574 + }, + { + "auxiliary_loss_clip": 0.01177024, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.03172874, + "balance_loss_mlp": 1.06375933, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0668162562591013, + "language_loss": 0.81199527, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83428723, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 1970, + "time_per_iteration": 2.6005184650421143 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02306354, + "balance_loss_mlp": 1.02803779, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8059191438251484, + "language_loss": 0.66145539, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68243253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.4375, + "step": 1971, + "time_per_iteration": 3.0580737590789795 + }, + { + "auxiliary_loss_clip": 0.01173379, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03112769, + "balance_loss_mlp": 1.0578413, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9720310647047086, + "language_loss": 0.79760695, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81984764, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 1972, + "time_per_iteration": 2.5330677032470703 + }, + { + "auxiliary_loss_clip": 0.01174806, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 1.06013465, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.451560144092476, + "language_loss": 0.72162819, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74390036, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1973, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.0117035, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02407408, + "balance_loss_mlp": 1.05802357, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.2680636805256897, + "language_loss": 0.71724641, + "learning_rate": 3.918065710622832e-06, + "loss": 0.73938787, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 1974, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01937568, + "balance_loss_mlp": 1.05660915, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.192039880981389, + "language_loss": 0.77186036, + "learning_rate": 3.917955341761128e-06, + "loss": 0.7939533, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.140625, + "step": 1975, + "time_per_iteration": 2.4483766555786133 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.03212273, + "balance_loss_mlp": 1.06021976, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.2667330410251596, + "language_loss": 0.7498399, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77208138, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.125, + "step": 1976, + "time_per_iteration": 3.9421374797821045 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02544367, + "balance_loss_mlp": 1.05979395, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6192257034176818, + "language_loss": 0.75191766, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77408761, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.125, + "step": 1977, + "time_per_iteration": 3.9506070613861084 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.05777454, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 7.387040580957373, + "language_loss": 0.7393533, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76161528, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.15625, + "step": 1978, + "time_per_iteration": 2.4754912853240967 + }, + { + "auxiliary_loss_clip": 0.01168854, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.05782461, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.709416576437117, + "language_loss": 0.73273945, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75491059, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 1979, + "time_per_iteration": 2.478938579559326 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.05735934, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.877436937799078, + "language_loss": 0.98387957, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00608468, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1484375, + "step": 1980, + "time_per_iteration": 2.5758843421936035 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.02580202, + "balance_loss_mlp": 1.05741775, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8930015682875676, + "language_loss": 0.85929906, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88150084, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1981, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01057237, + "balance_loss_clip": 1.03601766, + "balance_loss_mlp": 1.057832, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9242535829958574, + "language_loss": 0.85007018, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87236911, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1982, + "time_per_iteration": 2.513012409210205 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02667475, + "balance_loss_mlp": 1.05463564, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.926275276354154, + "language_loss": 0.85026526, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87239939, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 1983, + "time_per_iteration": 2.4627623558044434 + }, + { + "auxiliary_loss_clip": 0.01172266, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05581713, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.2679367356540894, + "language_loss": 0.77020949, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79243064, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1984, + "time_per_iteration": 2.466224193572998 + }, + { + "auxiliary_loss_clip": 0.01168386, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.03542554, + "balance_loss_mlp": 1.05464029, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7272493982968635, + "language_loss": 0.83323789, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85547268, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 1985, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01162278, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.02659011, + "balance_loss_mlp": 1.05230284, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9847962315308523, + "language_loss": 0.7379061, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75999391, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1015625, + "step": 1986, + "time_per_iteration": 2.4477651119232178 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03664303, + "balance_loss_mlp": 1.05418456, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0940320364759573, + "language_loss": 0.7209813, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74321216, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.125, + "step": 1987, + "time_per_iteration": 2.528564929962158 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.03256774, + "balance_loss_mlp": 1.05243921, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.544292945564917, + "language_loss": 0.72455966, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74676454, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1988, + "time_per_iteration": 2.482295274734497 + }, + { + "auxiliary_loss_clip": 0.01168039, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.05425191, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.3919568417846544, + "language_loss": 0.80848205, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83079755, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 1989, + "time_per_iteration": 2.5321335792541504 + }, + { + "auxiliary_loss_clip": 0.01171171, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.05518925, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7848130249027077, + "language_loss": 0.76000333, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78222507, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1990, + "time_per_iteration": 2.4608383178710938 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.0101675, + "balance_loss_clip": 1.01392448, + "balance_loss_mlp": 1.01813149, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8877551125762418, + "language_loss": 0.55219597, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57296449, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.41992188, + "step": 1991, + "time_per_iteration": 3.0575883388519287 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01048947, + "balance_loss_clip": 1.02905095, + "balance_loss_mlp": 1.05472517, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2244739837006797, + "language_loss": 0.78156978, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8037256, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1171875, + "step": 1992, + "time_per_iteration": 2.5395517349243164 + }, + { + "auxiliary_loss_clip": 0.01170251, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02819777, + "balance_loss_mlp": 1.0534482, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.898510109378507, + "language_loss": 0.78694016, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1993, + "time_per_iteration": 2.5264625549316406 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.05150676, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.7590613991113047, + "language_loss": 0.82287014, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8451097, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 1994, + "time_per_iteration": 2.4871127605438232 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.03551149, + "balance_loss_mlp": 1.05389762, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1035856813409786, + "language_loss": 0.87953222, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9017483, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.125, + "step": 1995, + "time_per_iteration": 2.46690034866333 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.05346155, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.783456627489481, + "language_loss": 0.74206698, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76433849, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1996, + "time_per_iteration": 2.5115768909454346 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.03138888, + "balance_loss_mlp": 1.05337763, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9342712291191904, + "language_loss": 0.88266122, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90486217, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 1997, + "time_per_iteration": 2.4716532230377197 + }, + { + "auxiliary_loss_clip": 0.01167703, + "auxiliary_loss_mlp": 0.01063842, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.05315256, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 3.8633631849497054, + "language_loss": 0.78929418, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81160963, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1998, + "time_per_iteration": 2.4798996448516846 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.05610394, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.053047413592738, + "language_loss": 0.73435485, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75654793, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1999, + "time_per_iteration": 2.5017611980438232 + }, + { + "auxiliary_loss_clip": 0.01169689, + "auxiliary_loss_mlp": 0.01065385, + "balance_loss_clip": 1.04436839, + "balance_loss_mlp": 1.05347967, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 3.6093884580795677, + "language_loss": 0.74955112, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77190185, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 2000, + "time_per_iteration": 2.5060245990753174 + }, + { + "auxiliary_loss_clip": 0.01170552, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.02997398, + "balance_loss_mlp": 1.05408299, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5368563042333518, + "language_loss": 0.84667969, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86889356, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 2001, + "time_per_iteration": 2.499922752380371 + }, + { + "auxiliary_loss_clip": 0.01168457, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.03176236, + "balance_loss_mlp": 1.05330753, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.074842616733997, + "language_loss": 0.73982531, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76202351, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 2002, + "time_per_iteration": 2.486853837966919 + }, + { + "auxiliary_loss_clip": 0.01175825, + "auxiliary_loss_mlp": 0.01058049, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.05508709, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.832741043586106, + "language_loss": 0.78091669, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80325544, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 2003, + "time_per_iteration": 2.4740982055664062 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.0521121, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.9652989098821625, + "language_loss": 0.72093791, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74310923, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2004, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.03877997, + "balance_loss_mlp": 1.0546937, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.2150760255497945, + "language_loss": 0.78260767, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80496937, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 2005, + "time_per_iteration": 2.4991190433502197 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.01496482, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9233110616682776, + "language_loss": 0.58020771, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60082525, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.8520798683166504 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 1.05345094, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7247761793975513, + "language_loss": 0.76275218, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78490144, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.125, + "step": 2007, + "time_per_iteration": 2.50325083732605 + }, + { + "auxiliary_loss_clip": 0.01170732, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.05348623, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.332475401193337, + "language_loss": 0.82973194, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85202336, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2008, + "time_per_iteration": 2.4650609493255615 + }, + { + "auxiliary_loss_clip": 0.0116834, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.03319979, + "balance_loss_mlp": 1.05225682, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.236244219024357, + "language_loss": 0.84184098, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86406672, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2009, + "time_per_iteration": 2.4602744579315186 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.01053411, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.0551877, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7312486930792712, + "language_loss": 0.83945864, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86169434, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.1484375, + "step": 2010, + "time_per_iteration": 2.480238437652588 + }, + { + "auxiliary_loss_clip": 0.01171814, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.03437304, + "balance_loss_mlp": 1.05634403, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.658807365911602, + "language_loss": 0.84157598, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8638559, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 2011, + "time_per_iteration": 2.454406499862671 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03386891, + "balance_loss_mlp": 1.055547, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.879921554869875, + "language_loss": 0.96007967, + "learning_rate": 3.913820600882834e-06, + "loss": 0.9823519, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.171875, + "step": 2012, + "time_per_iteration": 2.479583740234375 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.026914, + "balance_loss_mlp": 1.05365777, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.6055417591736036, + "language_loss": 0.80619711, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82833993, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2013, + "time_per_iteration": 2.538651704788208 + }, + { + "auxiliary_loss_clip": 0.01172968, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.05412138, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.9791821612033953, + "language_loss": 0.77157021, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79376847, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 2014, + "time_per_iteration": 2.4411396980285645 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.02509499, + "balance_loss_mlp": 1.05448556, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.028780359370303, + "language_loss": 0.86930937, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89146852, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2015, + "time_per_iteration": 2.4546844959259033 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.0268662, + "balance_loss_mlp": 1.04779112, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.0866681231001762, + "language_loss": 0.69274801, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71481836, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2016, + "time_per_iteration": 2.469177007675171 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.02042413, + "balance_loss_mlp": 1.05407953, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 3.095255398319528, + "language_loss": 0.80049825, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82262057, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.15625, + "step": 2017, + "time_per_iteration": 2.459447145462036 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.0292666, + "balance_loss_mlp": 1.05315137, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.364451122732105, + "language_loss": 0.69343489, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71563143, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2018, + "time_per_iteration": 3.919508695602417 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.05712008, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.162901456551013, + "language_loss": 0.72318506, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74541652, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 2019, + "time_per_iteration": 3.910888433456421 + }, + { + "auxiliary_loss_clip": 0.01168573, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.03615856, + "balance_loss_mlp": 1.05187333, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8061721544245042, + "language_loss": 0.92484713, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94711161, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2020, + "time_per_iteration": 2.5007998943328857 + }, + { + "auxiliary_loss_clip": 0.01168404, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.05388308, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9478588429028871, + "language_loss": 0.77149868, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79369152, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2021, + "time_per_iteration": 2.522216796875 + }, + { + "auxiliary_loss_clip": 0.01165897, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.02586901, + "balance_loss_mlp": 1.05312037, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0305604143992944, + "language_loss": 0.80324662, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82537007, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2022, + "time_per_iteration": 2.518737316131592 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01051897, + "balance_loss_clip": 1.03094029, + "balance_loss_mlp": 1.057019, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9019957932594662, + "language_loss": 0.8458122, + "learning_rate": 3.912572184769108e-06, + "loss": 0.86806649, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2023, + "time_per_iteration": 2.4534339904785156 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.02916241, + "balance_loss_mlp": 1.05421007, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2004951084054234, + "language_loss": 0.85155022, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87374109, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 2024, + "time_per_iteration": 2.436833143234253 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.02974951, + "balance_loss_mlp": 1.04884946, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.043367551334066, + "language_loss": 0.71662712, + "learning_rate": 3.912344257028954e-06, + "loss": 0.73876667, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.15625, + "step": 2025, + "time_per_iteration": 2.541215658187866 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.02564383, + "balance_loss_mlp": 1.05309796, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.0848974538483755, + "language_loss": 0.75976777, + "learning_rate": 3.912230184382286e-06, + "loss": 0.7819097, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2026, + "time_per_iteration": 2.529049873352051 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.05251837, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.6572777094172597, + "language_loss": 0.88875067, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9108817, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2027, + "time_per_iteration": 2.472158432006836 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03375518, + "balance_loss_mlp": 1.05316114, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 2.343330799439898, + "language_loss": 0.75515145, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77732611, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.125, + "step": 2028, + "time_per_iteration": 2.5286035537719727 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.05089998, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.270604294931249, + "language_loss": 0.766294, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78852487, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2029, + "time_per_iteration": 2.479799747467041 + }, + { + "auxiliary_loss_clip": 0.0116289, + "auxiliary_loss_mlp": 0.01051159, + "balance_loss_clip": 1.03113246, + "balance_loss_mlp": 1.05001879, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.2290592341985747, + "language_loss": 0.7955277, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81766814, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.125, + "step": 2030, + "time_per_iteration": 2.479250431060791 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.03301597, + "balance_loss_mlp": 1.0526309, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9595633959777694, + "language_loss": 0.74556369, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2031, + "time_per_iteration": 2.4966888427734375 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.0269599, + "balance_loss_mlp": 1.05319047, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.9774178696035418, + "language_loss": 0.75045705, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.1328125, + "step": 2032, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.04844511, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6143118682838826, + "language_loss": 0.88853258, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91053319, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0859375, + "step": 2033, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01170793, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03059459, + "balance_loss_mlp": 1.05660009, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1152048244965096, + "language_loss": 0.65517056, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67738092, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 2034, + "time_per_iteration": 2.4647884368896484 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.05399358, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.59634219760927, + "language_loss": 0.76435542, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78657782, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2035, + "time_per_iteration": 2.483016014099121 + }, + { + "auxiliary_loss_clip": 0.01169828, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.03104889, + "balance_loss_mlp": 1.0543201, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8316823187763973, + "language_loss": 0.71407682, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73628777, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2036, + "time_per_iteration": 2.476471185684204 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.0309397, + "balance_loss_mlp": 1.05532706, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.632988910709452, + "language_loss": 0.83352619, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85572863, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2037, + "time_per_iteration": 2.476040840148926 + }, + { + "auxiliary_loss_clip": 0.0117386, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.03625405, + "balance_loss_mlp": 1.05652785, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.722283338591856, + "language_loss": 0.80255699, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82487655, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2038, + "time_per_iteration": 2.5043163299560547 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.0051837, + "balance_loss_mlp": 1.01638949, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.832889593555193, + "language_loss": 0.58671033, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60737002, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.421875, + "step": 2039, + "time_per_iteration": 2.9495608806610107 + }, + { + "auxiliary_loss_clip": 0.01172242, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.0277977, + "balance_loss_mlp": 1.05559754, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.6229044060505298, + "language_loss": 0.80485016, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82706642, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.171875, + "step": 2040, + "time_per_iteration": 2.4483039379119873 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.05270815, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8235003945490114, + "language_loss": 0.82753873, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84970617, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2041, + "time_per_iteration": 2.4804372787475586 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.05399048, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7522185366152092, + "language_loss": 0.66806722, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69026893, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2042, + "time_per_iteration": 2.4683480262756348 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.03032589, + "balance_loss_mlp": 1.05184031, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.8478924147346443, + "language_loss": 0.81661081, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83877933, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2043, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01049773, + "balance_loss_clip": 1.02792168, + "balance_loss_mlp": 1.05028844, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.0920421188484095, + "language_loss": 0.8049221, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82708442, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 2044, + "time_per_iteration": 2.45843768119812 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.02577674, + "balance_loss_mlp": 1.05169511, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7057283877293323, + "language_loss": 0.7796452, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8017351, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.109375, + "step": 2045, + "time_per_iteration": 2.5117220878601074 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.04210341, + "balance_loss_mlp": 1.05461311, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.0659302798736436, + "language_loss": 0.67135215, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69371116, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 2046, + "time_per_iteration": 2.466304063796997 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03793919, + "balance_loss_mlp": 1.05408335, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3143924335245654, + "language_loss": 0.72491664, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7471717, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2047, + "time_per_iteration": 2.4625275135040283 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.0106421, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.05105257, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6458989790549132, + "language_loss": 0.76394033, + "learning_rate": 3.909702248319597e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2048, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.05322123, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.118548028298143, + "language_loss": 0.84626836, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86841822, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.125, + "step": 2049, + "time_per_iteration": 2.538325071334839 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.05051267, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 3.176509780932849, + "language_loss": 0.75351131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77569222, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.140625, + "step": 2050, + "time_per_iteration": 2.499915599822998 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054604, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.05127048, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.9728027261326873, + "language_loss": 0.80877042, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83097064, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.140625, + "step": 2051, + "time_per_iteration": 2.5018789768218994 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.03338933, + "balance_loss_mlp": 1.05348301, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7756923294305167, + "language_loss": 0.79991698, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82209337, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.109375, + "step": 2052, + "time_per_iteration": 2.4962196350097656 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.03793955, + "balance_loss_mlp": 1.0515492, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.071130498978609, + "language_loss": 0.73757279, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75983769, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2053, + "time_per_iteration": 2.4748997688293457 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.03337085, + "balance_loss_mlp": 1.04912996, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.5139588428492408, + "language_loss": 0.73835206, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76054543, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2054, + "time_per_iteration": 2.7009665966033936 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02381933, + "balance_loss_mlp": 1.04980421, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.0020033330801863, + "language_loss": 0.85107529, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87311363, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.1171875, + "step": 2055, + "time_per_iteration": 2.5038392543792725 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.03445673, + "balance_loss_mlp": 1.05093932, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9818000135561404, + "language_loss": 0.77465194, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79683125, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.140625, + "step": 2056, + "time_per_iteration": 2.5265629291534424 + }, + { + "auxiliary_loss_clip": 0.01162241, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.04937708, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9976131339644834, + "language_loss": 0.83188522, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85405934, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2057, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.0116756, + "auxiliary_loss_mlp": 0.01053922, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.05169332, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.751792200322901, + "language_loss": 0.78356105, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80577588, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2058, + "time_per_iteration": 2.5236053466796875 + }, + { + "auxiliary_loss_clip": 0.01167574, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.03086066, + "balance_loss_mlp": 1.05105174, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.1327254817813124, + "language_loss": 0.83191061, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85410988, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2059, + "time_per_iteration": 5.313246726989746 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.0344671, + "balance_loss_mlp": 1.05206418, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.990324814625926, + "language_loss": 0.81387389, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83613217, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 2060, + "time_per_iteration": 3.8617331981658936 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.02928221, + "balance_loss_mlp": 1.04859161, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.0129231677956105, + "language_loss": 0.86278749, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88492751, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2061, + "time_per_iteration": 2.4531033039093018 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.05163288, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.146204871859891, + "language_loss": 0.84992719, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87201917, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 2062, + "time_per_iteration": 2.475050449371338 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01057701, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.05348217, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.194910982672458, + "language_loss": 0.78651118, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80875909, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2063, + "time_per_iteration": 2.4638655185699463 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.05330634, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.133219584666701, + "language_loss": 0.79411167, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81636381, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1328125, + "step": 2064, + "time_per_iteration": 2.4441418647766113 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03083503, + "balance_loss_mlp": 1.04955256, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.2298036351802533, + "language_loss": 0.92358226, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9457252, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2065, + "time_per_iteration": 2.4909794330596924 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02880335, + "balance_loss_mlp": 1.05061674, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7134253508315578, + "language_loss": 0.8042016, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82636184, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.15625, + "step": 2066, + "time_per_iteration": 2.484276056289673 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00111723, + "balance_loss_mlp": 1.01144505, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8687209975293121, + "language_loss": 0.63275361, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65331256, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.41015625, + "step": 2067, + "time_per_iteration": 3.0286524295806885 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03103137, + "balance_loss_mlp": 1.05087852, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9774411847970965, + "language_loss": 0.93209147, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95427418, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.15625, + "step": 2068, + "time_per_iteration": 2.4971697330474854 + }, + { + "auxiliary_loss_clip": 0.01167817, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.053213, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9835561743386452, + "language_loss": 0.81277847, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83494884, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.1484375, + "step": 2069, + "time_per_iteration": 2.4772391319274902 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.03519261, + "balance_loss_mlp": 1.05177176, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.606173275168009, + "language_loss": 0.77390277, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79612398, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2070, + "time_per_iteration": 2.4962410926818848 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.05637431, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.418044156181854, + "language_loss": 0.80847198, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83066666, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1640625, + "step": 2071, + "time_per_iteration": 2.452148199081421 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.03063262, + "balance_loss_mlp": 1.05134583, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.802846280579791, + "language_loss": 0.77933639, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80147374, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2072, + "time_per_iteration": 2.5763509273529053 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03480363, + "balance_loss_mlp": 1.05423427, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.6278132513508976, + "language_loss": 0.74839735, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77060658, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.125, + "step": 2073, + "time_per_iteration": 2.4676945209503174 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01050014, + "balance_loss_clip": 1.02904546, + "balance_loss_mlp": 1.04915833, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9457561725453951, + "language_loss": 0.90556443, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92768592, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2074, + "time_per_iteration": 2.4873156547546387 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.05183172, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3814572559525877, + "language_loss": 0.83753067, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 2075, + "time_per_iteration": 2.500657320022583 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.05080831, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1638910845289567, + "language_loss": 0.73802024, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76022947, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2076, + "time_per_iteration": 2.5686564445495605 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.05219531, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.967733683791653, + "language_loss": 0.7551648, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77721083, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.09375, + "step": 2077, + "time_per_iteration": 2.489954710006714 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.05015802, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 4.043491061132511, + "language_loss": 0.82077563, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84293842, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1328125, + "step": 2078, + "time_per_iteration": 2.445270299911499 + }, + { + "auxiliary_loss_clip": 0.01168396, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.05372512, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.023726857078381, + "language_loss": 0.75024784, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2079, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01173002, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.05697465, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9314739831996124, + "language_loss": 0.83961046, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86190951, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2080, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.05275226, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.0357346796271307, + "language_loss": 0.84575123, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8679868, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1328125, + "step": 2081, + "time_per_iteration": 2.4380433559417725 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.05154538, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.660916229819668, + "language_loss": 0.76882648, + "learning_rate": 3.905726514814646e-06, + "loss": 0.790923, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2082, + "time_per_iteration": 2.454939842224121 + }, + { + "auxiliary_loss_clip": 0.01182882, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.03117347, + "balance_loss_mlp": 1.06035674, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.833832134330164, + "language_loss": 0.78994107, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81229836, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2265625, + "step": 2083, + "time_per_iteration": 2.4439167976379395 + }, + { + "auxiliary_loss_clip": 0.01168103, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.02855682, + "balance_loss_mlp": 1.05132031, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.376124844090109, + "language_loss": 0.89690113, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2084, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.05379784, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9059704425119062, + "language_loss": 0.79718572, + "learning_rate": 3.905371701516869e-06, + "loss": 0.81937099, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1484375, + "step": 2085, + "time_per_iteration": 2.5295538902282715 + }, + { + "auxiliary_loss_clip": 0.0116658, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.05235541, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.9580642243137214, + "language_loss": 0.88227898, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90446126, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2086, + "time_per_iteration": 2.4508614540100098 + }, + { + "auxiliary_loss_clip": 0.01162238, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.02541506, + "balance_loss_mlp": 1.05238986, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.3707303368435957, + "language_loss": 0.87088495, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89295745, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2087, + "time_per_iteration": 2.4342494010925293 + }, + { + "auxiliary_loss_clip": 0.01166252, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.03444421, + "balance_loss_mlp": 1.05230761, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 3.239876707553976, + "language_loss": 0.73480451, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75703704, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.140625, + "step": 2088, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01004858, + "balance_loss_clip": 1.00259304, + "balance_loss_mlp": 1.01231122, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.759594920780347, + "language_loss": 0.61699253, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63757795, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.4140625, + "step": 2089, + "time_per_iteration": 3.0373222827911377 + }, + { + "auxiliary_loss_clip": 0.01165987, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.05317736, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.0159960445234746, + "language_loss": 0.78266793, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80490106, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.125, + "step": 2090, + "time_per_iteration": 2.5307860374450684 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005786, + "balance_loss_clip": 1.00381935, + "balance_loss_mlp": 1.01062346, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.749206069507312, + "language_loss": 0.59394926, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61451876, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.40625, + "step": 2091, + "time_per_iteration": 2.976081609725952 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03016472, + "balance_loss_mlp": 1.0538522, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8692826570762828, + "language_loss": 0.63588953, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6580565, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2092, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.05095637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 3.3800613541528257, + "language_loss": 0.80149096, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82378066, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1875, + "step": 2093, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.05323935, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7179534274341421, + "language_loss": 0.75928843, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78160632, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2094, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01163905, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.03322637, + "balance_loss_mlp": 1.05116057, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.654740537988477, + "language_loss": 0.76833487, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79050487, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2095, + "time_per_iteration": 2.669593095779419 + }, + { + "auxiliary_loss_clip": 0.01166425, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.04330409, + "balance_loss_mlp": 1.05012596, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.7658625824396568, + "language_loss": 0.8312341, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85354173, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2096, + "time_per_iteration": 2.446169853210449 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.05236387, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.9365429623482773, + "language_loss": 0.7532599, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77547324, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 2097, + "time_per_iteration": 2.46520733833313 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.0105919, + "balance_loss_clip": 1.0399375, + "balance_loss_mlp": 1.05366278, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0415683165998004, + "language_loss": 0.8696878, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89196146, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1484375, + "step": 2098, + "time_per_iteration": 2.488985061645508 + }, + { + "auxiliary_loss_clip": 0.01171506, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_clip": 1.03984964, + "balance_loss_mlp": 1.05263424, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8810788789855342, + "language_loss": 0.69538295, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71773493, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.1875, + "step": 2099, + "time_per_iteration": 2.4791061878204346 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01058165, + "balance_loss_clip": 1.03538442, + "balance_loss_mlp": 1.05016196, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.489186386071109, + "language_loss": 0.81622505, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83848113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2100, + "time_per_iteration": 2.4970083236694336 + }, + { + "auxiliary_loss_clip": 0.01170444, + "auxiliary_loss_mlp": 0.01056399, + "balance_loss_clip": 1.03558493, + "balance_loss_mlp": 1.05375385, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 21.240028764463403, + "language_loss": 0.80653214, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82880062, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1640625, + "step": 2101, + "time_per_iteration": 5.441275596618652 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01009667, + "balance_loss_clip": 1.00753367, + "balance_loss_mlp": 1.01423335, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7055092704674581, + "language_loss": 0.57077372, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59140933, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.39648438, + "step": 2102, + "time_per_iteration": 4.4595959186553955 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.05443108, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9163731362545673, + "language_loss": 0.93033105, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9526242, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 2103, + "time_per_iteration": 2.4612908363342285 + }, + { + "auxiliary_loss_clip": 0.01160763, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01929784, + "balance_loss_mlp": 1.05146646, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.70771861982282, + "language_loss": 0.7804687, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80246699, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2104, + "time_per_iteration": 2.556351661682129 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.01056721, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.05698192, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9983303318130716, + "language_loss": 0.81274837, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83504581, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 2105, + "time_per_iteration": 2.4998059272766113 + }, + { + "auxiliary_loss_clip": 0.01177911, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.05756688, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.6618923007939728, + "language_loss": 0.83258855, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85494161, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 2106, + "time_per_iteration": 2.4816856384277344 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.02755296, + "balance_loss_mlp": 1.05664992, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.017673348074064, + "language_loss": 0.73717511, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75936514, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2107, + "time_per_iteration": 2.503575325012207 + }, + { + "auxiliary_loss_clip": 0.01166119, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.02683651, + "balance_loss_mlp": 1.05330598, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.8409726657459213, + "language_loss": 0.79492414, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81705213, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2108, + "time_per_iteration": 2.448009967803955 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.02635407, + "balance_loss_mlp": 1.05213785, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.700834997101356, + "language_loss": 0.75458848, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77675259, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2109, + "time_per_iteration": 2.463996171951294 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.02641523, + "balance_loss_mlp": 1.05309939, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 5.620565406896926, + "language_loss": 0.82876229, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85087943, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2110, + "time_per_iteration": 2.4536476135253906 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03271818, + "balance_loss_mlp": 1.0524385, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8108257578185059, + "language_loss": 0.78553301, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80775553, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.140625, + "step": 2111, + "time_per_iteration": 2.4898500442504883 + }, + { + "auxiliary_loss_clip": 0.01178398, + "auxiliary_loss_mlp": 0.01069762, + "balance_loss_clip": 1.04634905, + "balance_loss_mlp": 1.05599511, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2255287569010567, + "language_loss": 0.76852119, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.2265625, + "step": 2112, + "time_per_iteration": 2.534062623977661 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.05138493, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.901101750436338, + "language_loss": 0.85764933, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 2113, + "time_per_iteration": 2.4980924129486084 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.05287683, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.270588429793272, + "language_loss": 0.74000478, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76224494, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1328125, + "step": 2114, + "time_per_iteration": 2.422631025314331 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.03504217, + "balance_loss_mlp": 1.05601084, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7902572486589996, + "language_loss": 0.83236456, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85464966, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.1484375, + "step": 2115, + "time_per_iteration": 2.4601340293884277 + }, + { + "auxiliary_loss_clip": 0.01169954, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.05397201, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.532692301262898, + "language_loss": 0.86615002, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88845563, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2116, + "time_per_iteration": 2.5315732955932617 + }, + { + "auxiliary_loss_clip": 0.01164638, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.03062534, + "balance_loss_mlp": 1.05188024, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.8525451323112498, + "language_loss": 0.70492947, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72708428, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2117, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01168229, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_clip": 1.03346658, + "balance_loss_mlp": 1.05461121, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.4058915352959294, + "language_loss": 0.86858076, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89081407, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2118, + "time_per_iteration": 2.4760360717773438 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.0279547, + "balance_loss_mlp": 1.0518508, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7933295144796901, + "language_loss": 0.87325591, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89538383, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2119, + "time_per_iteration": 2.547213315963745 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.03024805, + "balance_loss_mlp": 1.05369782, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4444945117671018, + "language_loss": 0.8769815, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89917719, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2120, + "time_per_iteration": 2.4568872451782227 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.02553487, + "balance_loss_mlp": 1.05405664, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8558714180118523, + "language_loss": 0.75193042, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77408671, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2121, + "time_per_iteration": 2.508117437362671 + }, + { + "auxiliary_loss_clip": 0.01167335, + "auxiliary_loss_mlp": 0.01050063, + "balance_loss_clip": 1.02895081, + "balance_loss_mlp": 1.05228865, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.458066848563671, + "language_loss": 0.8294577, + "learning_rate": 3.900942242309978e-06, + "loss": 0.8516317, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2122, + "time_per_iteration": 2.4878990650177 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.02924609, + "balance_loss_mlp": 1.05379128, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.1208761223769375, + "language_loss": 0.79040462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81259328, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2123, + "time_per_iteration": 2.512085199356079 + }, + { + "auxiliary_loss_clip": 0.0117181, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.03411841, + "balance_loss_mlp": 1.05565643, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7846776317234667, + "language_loss": 0.79227948, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81456017, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 2124, + "time_per_iteration": 2.4865264892578125 + }, + { + "auxiliary_loss_clip": 0.01168084, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03267348, + "balance_loss_mlp": 1.05149364, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.8175561910153215, + "language_loss": 0.75565529, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77787793, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2125, + "time_per_iteration": 2.514455795288086 + }, + { + "auxiliary_loss_clip": 0.01166899, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.02645469, + "balance_loss_mlp": 1.05262208, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.1990589160087493, + "language_loss": 0.77811432, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2126, + "time_per_iteration": 2.556657075881958 + }, + { + "auxiliary_loss_clip": 0.01167875, + "auxiliary_loss_mlp": 0.01050746, + "balance_loss_clip": 1.03124356, + "balance_loss_mlp": 1.05559683, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.702389562623477, + "language_loss": 0.69255161, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71473777, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2127, + "time_per_iteration": 2.629990339279175 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.00326061, + "balance_loss_mlp": 1.01139402, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8552720802624753, + "language_loss": 0.62738979, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64794946, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.39257812, + "step": 2128, + "time_per_iteration": 3.1237356662750244 + }, + { + "auxiliary_loss_clip": 0.01168478, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.05287039, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.3711218915030368, + "language_loss": 0.77148604, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79365802, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2129, + "time_per_iteration": 2.4499564170837402 + }, + { + "auxiliary_loss_clip": 0.01179121, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02902186, + "balance_loss_mlp": 1.05744195, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.273395516882369, + "language_loss": 0.79321349, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81552559, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.21875, + "step": 2130, + "time_per_iteration": 2.4536893367767334 + }, + { + "auxiliary_loss_clip": 0.0116812, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.05328345, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.267455405666958, + "language_loss": 0.70879477, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73092055, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1484375, + "step": 2131, + "time_per_iteration": 2.514155149459839 + }, + { + "auxiliary_loss_clip": 0.01166691, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.03007698, + "balance_loss_mlp": 1.05375445, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2952793086030376, + "language_loss": 0.72266257, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74484742, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2132, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01163765, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.03396928, + "balance_loss_mlp": 1.05281162, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1162344308699828, + "language_loss": 0.82306767, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84525442, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2133, + "time_per_iteration": 2.488302230834961 + }, + { + "auxiliary_loss_clip": 0.01174206, + "auxiliary_loss_mlp": 0.01061813, + "balance_loss_clip": 1.03871, + "balance_loss_mlp": 1.05329132, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.538367341661163, + "language_loss": 0.79631573, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81867594, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 2134, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_clip": 1.02393413, + "balance_loss_mlp": 1.05650806, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.033800341734765, + "language_loss": 0.83015293, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85233301, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2135, + "time_per_iteration": 2.4743056297302246 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_clip": 1.03842425, + "balance_loss_mlp": 1.05173945, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.9021762622464853, + "language_loss": 0.77293968, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79520839, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.140625, + "step": 2136, + "time_per_iteration": 2.4412362575531006 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 0.99983084, + "balance_loss_mlp": 1.01248765, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8943310105061408, + "language_loss": 0.59115362, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61168963, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.39257812, + "step": 2137, + "time_per_iteration": 3.2407264709472656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.03207743, + "balance_loss_mlp": 1.04970789, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4694787743163404, + "language_loss": 0.81923193, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84140748, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.15625, + "step": 2138, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01170897, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.05353928, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.804990264663657, + "language_loss": 0.79418135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81644583, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.171875, + "step": 2139, + "time_per_iteration": 2.5321907997131348 + }, + { + "auxiliary_loss_clip": 0.01169458, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02488446, + "balance_loss_mlp": 1.05315363, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1742564972583667, + "language_loss": 0.84761363, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.86976337, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1640625, + "step": 2140, + "time_per_iteration": 2.469543933868408 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.02524316, + "balance_loss_mlp": 1.05079114, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.376703775404894, + "language_loss": 0.85850012, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88059902, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2141, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.0116884, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 1.05059922, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.411777854813752, + "language_loss": 0.68245387, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7046324, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1796875, + "step": 2142, + "time_per_iteration": 2.5327556133270264 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02702951, + "balance_loss_mlp": 1.05430341, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0295098459565692, + "language_loss": 0.82883704, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85104507, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2143, + "time_per_iteration": 4.014873743057251 + }, + { + "auxiliary_loss_clip": 0.01171398, + "auxiliary_loss_mlp": 0.01053828, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.05572712, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7367706894947552, + "language_loss": 0.81788546, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84013772, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.15625, + "step": 2144, + "time_per_iteration": 4.002255439758301 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.04864693, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 3.8817809862500727, + "language_loss": 0.78257203, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80476135, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1640625, + "step": 2145, + "time_per_iteration": 2.4952287673950195 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.02825832, + "balance_loss_mlp": 1.05031526, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.1659704609946897, + "language_loss": 0.82622325, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84839463, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 2146, + "time_per_iteration": 2.4898681640625 + }, + { + "auxiliary_loss_clip": 0.01165601, + "auxiliary_loss_mlp": 0.01051615, + "balance_loss_clip": 1.02959681, + "balance_loss_mlp": 1.05129158, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.717320122986492, + "language_loss": 0.70446974, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72664189, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 2147, + "time_per_iteration": 2.5964484214782715 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.03069305, + "balance_loss_mlp": 1.05166912, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.443887417123452, + "language_loss": 0.71685153, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73902297, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.140625, + "step": 2148, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.02681684, + "balance_loss_mlp": 1.05413008, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.666574129953403, + "language_loss": 0.79379606, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81592482, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1171875, + "step": 2149, + "time_per_iteration": 2.495443820953369 + }, + { + "auxiliary_loss_clip": 0.01167493, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.05306077, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.1379132369478313, + "language_loss": 0.76475441, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78689277, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2150, + "time_per_iteration": 2.524395704269409 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.05094671, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.417935370690141, + "language_loss": 0.70735669, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72954249, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1484375, + "step": 2151, + "time_per_iteration": 2.5213184356689453 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02502, + "balance_loss_mlp": 1.05457592, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9866869590783298, + "language_loss": 0.84050369, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86260849, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2152, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.01167192, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.05128813, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.226463520109079, + "language_loss": 0.78646791, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2153, + "time_per_iteration": 2.46975040435791 + }, + { + "auxiliary_loss_clip": 0.01163518, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.05069268, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.482522823334948, + "language_loss": 0.80135351, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82351738, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2154, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01170487, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.05522227, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0988715261553774, + "language_loss": 0.83128881, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85350406, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2155, + "time_per_iteration": 2.476299524307251 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.02961075, + "balance_loss_mlp": 1.05010283, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.842594732542889, + "language_loss": 0.76062953, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2156, + "time_per_iteration": 2.6024632453918457 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.05121815, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.9934077258859366, + "language_loss": 0.86546719, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88760191, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.109375, + "step": 2157, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.03745282, + "balance_loss_mlp": 1.04796743, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.339899004847696, + "language_loss": 0.80590808, + "learning_rate": 3.896537778333651e-06, + "loss": 0.82814288, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2158, + "time_per_iteration": 2.5332443714141846 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.03510916, + "balance_loss_mlp": 1.05294585, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.254282600322574, + "language_loss": 0.74603379, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76828635, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2159, + "time_per_iteration": 2.469038963317871 + }, + { + "auxiliary_loss_clip": 0.01158286, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0323323, + "balance_loss_mlp": 1.04777908, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.1260113568932746, + "language_loss": 0.8227706, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84488213, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2160, + "time_per_iteration": 2.516723155975342 + }, + { + "auxiliary_loss_clip": 0.01159917, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.05318654, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6308358458278915, + "language_loss": 0.81877828, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8408196, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2161, + "time_per_iteration": 2.4677131175994873 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01056265, + "balance_loss_clip": 1.03479493, + "balance_loss_mlp": 1.05035043, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.2782308625037686, + "language_loss": 0.82592809, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84810847, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2162, + "time_per_iteration": 2.5702993869781494 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.03454113, + "balance_loss_mlp": 1.04993796, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.619296712638915, + "language_loss": 0.72762972, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7498191, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2163, + "time_per_iteration": 2.4531478881835938 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.03183889, + "balance_loss_mlp": 1.05107188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0773433264348435, + "language_loss": 0.81498116, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83718032, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2164, + "time_per_iteration": 2.497072458267212 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02381766, + "balance_loss_mlp": 1.05107093, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2040156749440523, + "language_loss": 0.72564822, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.7477203, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.1328125, + "step": 2165, + "time_per_iteration": 2.515026807785034 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.05286038, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8786436091142913, + "language_loss": 0.74697578, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76912814, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1484375, + "step": 2166, + "time_per_iteration": 2.5301709175109863 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.05156064, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5708346768068926, + "language_loss": 0.83053899, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85266984, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 2167, + "time_per_iteration": 2.632035732269287 + }, + { + "auxiliary_loss_clip": 0.01163335, + "auxiliary_loss_mlp": 0.01060394, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.05201721, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.9158171210349437, + "language_loss": 0.83286303, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85510027, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2168, + "time_per_iteration": 2.4766387939453125 + }, + { + "auxiliary_loss_clip": 0.0116626, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.03075409, + "balance_loss_mlp": 1.05258656, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.304013454801214, + "language_loss": 0.80027354, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82245922, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.140625, + "step": 2169, + "time_per_iteration": 2.5185413360595703 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02329922, + "balance_loss_mlp": 1.05451608, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 4.565704621626811, + "language_loss": 0.66456163, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68668246, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2170, + "time_per_iteration": 2.5556788444519043 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.02397573, + "balance_loss_mlp": 1.05294132, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.848772151746763, + "language_loss": 0.66935396, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69145024, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2171, + "time_per_iteration": 2.553422451019287 + }, + { + "auxiliary_loss_clip": 0.01164709, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.05211711, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9479804069383955, + "language_loss": 0.71952963, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74165899, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2172, + "time_per_iteration": 2.4801840782165527 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02508521, + "balance_loss_mlp": 1.05435848, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8616776845407013, + "language_loss": 0.75547618, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77752787, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0703125, + "step": 2173, + "time_per_iteration": 2.4639194011688232 + }, + { + "auxiliary_loss_clip": 0.01165867, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.03406715, + "balance_loss_mlp": 1.05319107, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.777389952877741, + "language_loss": 0.70484382, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72705513, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.125, + "step": 2174, + "time_per_iteration": 2.4914908409118652 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01011158, + "balance_loss_clip": 1.0087378, + "balance_loss_mlp": 1.0165, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8022263951171452, + "language_loss": 0.59071571, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61137754, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.38671875, + "step": 2175, + "time_per_iteration": 3.244633913040161 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.05474329, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.247504257537708, + "language_loss": 0.79946023, + "learning_rate": 3.894300581166417e-06, + "loss": 0.8216269, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1015625, + "step": 2176, + "time_per_iteration": 2.439883232116699 + }, + { + "auxiliary_loss_clip": 0.01163907, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.02806199, + "balance_loss_mlp": 1.05234194, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8562517641565577, + "language_loss": 0.74595284, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76809454, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2177, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.024472, + "balance_loss_mlp": 1.05222929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.128567307625778, + "language_loss": 0.81855309, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84065676, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1171875, + "step": 2178, + "time_per_iteration": 2.458812713623047 + }, + { + "auxiliary_loss_clip": 0.01166111, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.02695179, + "balance_loss_mlp": 1.05466795, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.66972533149016, + "language_loss": 0.74942935, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77156973, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.109375, + "step": 2179, + "time_per_iteration": 2.4679782390594482 + }, + { + "auxiliary_loss_clip": 0.01161603, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.03079903, + "balance_loss_mlp": 1.05280709, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0560779031919636, + "language_loss": 0.84319234, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86531377, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0859375, + "step": 2180, + "time_per_iteration": 2.567873477935791 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.05700839, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.214126283525484, + "language_loss": 0.8987745, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92098325, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2181, + "time_per_iteration": 2.4802486896514893 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.0557189, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8993602522657917, + "language_loss": 0.68657839, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70867944, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.109375, + "step": 2182, + "time_per_iteration": 2.460148572921753 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.05504203, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.6442759836393277, + "language_loss": 0.78435183, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.125, + "step": 2183, + "time_per_iteration": 2.5462143421173096 + }, + { + "auxiliary_loss_clip": 0.01162472, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02375841, + "balance_loss_mlp": 1.05238128, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.788927255894662, + "language_loss": 0.85543215, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87749588, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2184, + "time_per_iteration": 3.8904993534088135 + }, + { + "auxiliary_loss_clip": 0.01165934, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.0529201, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.505672435211917, + "language_loss": 0.82206696, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84420282, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1328125, + "step": 2185, + "time_per_iteration": 5.3855485916137695 + }, + { + "auxiliary_loss_clip": 0.01168066, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.03323543, + "balance_loss_mlp": 1.05440092, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.0294565364346235, + "language_loss": 0.73037684, + "learning_rate": 3.893047635600818e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1328125, + "step": 2186, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01048235, + "balance_loss_clip": 1.02601433, + "balance_loss_mlp": 1.05449164, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.0525608711513614, + "language_loss": 0.80174023, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82388186, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.109375, + "step": 2187, + "time_per_iteration": 2.463906764984131 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01005839, + "balance_loss_clip": 1.00344312, + "balance_loss_mlp": 1.01508641, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8435449169341035, + "language_loss": 0.58977342, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61036563, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.3828125, + "step": 2188, + "time_per_iteration": 3.1052041053771973 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.05918622, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.1443848583942846, + "language_loss": 0.74199927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76420546, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2189, + "time_per_iteration": 2.5137264728546143 + }, + { + "auxiliary_loss_clip": 0.01166605, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.02894759, + "balance_loss_mlp": 1.05678558, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.7642431940848833, + "language_loss": 0.72561657, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74777287, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2190, + "time_per_iteration": 2.5053412914276123 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_clip": 1.03094649, + "balance_loss_mlp": 1.05706906, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 8.700182749243472, + "language_loss": 0.74395585, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76616025, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1171875, + "step": 2191, + "time_per_iteration": 2.507687568664551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.05689156, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0250128968483403, + "language_loss": 0.79286075, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1171875, + "step": 2192, + "time_per_iteration": 2.5068893432617188 + }, + { + "auxiliary_loss_clip": 0.01168449, + "auxiliary_loss_mlp": 0.01053422, + "balance_loss_clip": 1.03290629, + "balance_loss_mlp": 1.05564141, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9285179647135495, + "language_loss": 0.84827602, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87049472, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.125, + "step": 2193, + "time_per_iteration": 2.456409215927124 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01002145, + "balance_loss_clip": 0.99976075, + "balance_loss_mlp": 1.0179081, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7727203010194038, + "language_loss": 0.54049635, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56107628, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.37890625, + "step": 2194, + "time_per_iteration": 3.0569794178009033 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.02566671, + "balance_loss_mlp": 1.05514359, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7688784093808256, + "language_loss": 0.72086227, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74298465, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2195, + "time_per_iteration": 2.527435541152954 + }, + { + "auxiliary_loss_clip": 0.01173804, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.05663633, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.7664998702658374, + "language_loss": 0.78195536, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2196, + "time_per_iteration": 2.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02714252, + "balance_loss_mlp": 1.05638218, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.1663119445052295, + "language_loss": 0.74861938, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77078474, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1328125, + "step": 2197, + "time_per_iteration": 2.489504814147949 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.05543399, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4378795089069674, + "language_loss": 0.8011694, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82332516, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2198, + "time_per_iteration": 2.437718391418457 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04173732, + "balance_loss_mlp": 1.05483699, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4514815632850038, + "language_loss": 0.82552117, + "learning_rate": 3.891408075291425e-06, + "loss": 0.847803, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2199, + "time_per_iteration": 2.47356915473938 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.02724838, + "balance_loss_mlp": 1.05458844, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.465688895758548, + "language_loss": 0.68963099, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71178007, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2200, + "time_per_iteration": 2.5828843116760254 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.05397916, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.591612522060186, + "language_loss": 0.84600091, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86822116, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2201, + "time_per_iteration": 2.5546202659606934 + }, + { + "auxiliary_loss_clip": 0.01168797, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.05466592, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.901870031688447, + "language_loss": 0.86978126, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89200991, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2202, + "time_per_iteration": 2.509300470352173 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02364576, + "balance_loss_mlp": 1.05389142, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.3614014237187084, + "language_loss": 0.72746712, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74954367, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.109375, + "step": 2203, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01167891, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.05453348, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.5436302639516, + "language_loss": 0.73248756, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75473428, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1328125, + "step": 2204, + "time_per_iteration": 2.5298051834106445 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.05558085, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.7540271848273767, + "language_loss": 0.78627133, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80849254, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2205, + "time_per_iteration": 2.5343189239501953 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01053788, + "balance_loss_clip": 1.033391, + "balance_loss_mlp": 1.05560231, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.234297854715259, + "language_loss": 0.78748876, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80969107, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2206, + "time_per_iteration": 2.473229169845581 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.05758011, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.3028539815574494, + "language_loss": 0.73993444, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76210898, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.125, + "step": 2207, + "time_per_iteration": 2.479421854019165 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.02822387, + "balance_loss_mlp": 1.05323017, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.4105539478543454, + "language_loss": 0.84151787, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86361182, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0703125, + "step": 2208, + "time_per_iteration": 2.501969337463379 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.05553222, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9362156368998853, + "language_loss": 0.85323346, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87540877, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2209, + "time_per_iteration": 2.509761333465576 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.05628705, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.055387861012722, + "language_loss": 0.81545013, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83761609, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2210, + "time_per_iteration": 2.4920527935028076 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01003994, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.02205014, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7616894664797615, + "language_loss": 0.57984382, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004886, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3828125, + "step": 2211, + "time_per_iteration": 3.1735260486602783 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.00312614, + "balance_loss_mlp": 1.02081084, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7970523185699088, + "language_loss": 0.55364317, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57429421, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.38671875, + "step": 2212, + "time_per_iteration": 3.142056465148926 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.056463, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 4.2694742121271645, + "language_loss": 0.74779308, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77002227, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2213, + "time_per_iteration": 2.4599013328552246 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.02889609, + "balance_loss_mlp": 1.05235839, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.0343460890824927, + "language_loss": 0.79269958, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81476456, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0625, + "step": 2214, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.03062189, + "balance_loss_mlp": 1.05593503, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.0399610331480407, + "language_loss": 0.69410872, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71628523, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2215, + "time_per_iteration": 2.5798754692077637 + }, + { + "auxiliary_loss_clip": 0.01166771, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.05576539, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.4518621177772175, + "language_loss": 0.81136751, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83350337, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2216, + "time_per_iteration": 2.49601674079895 + }, + { + "auxiliary_loss_clip": 0.01166215, + "auxiliary_loss_mlp": 0.01057297, + "balance_loss_clip": 1.03668606, + "balance_loss_mlp": 1.05610895, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.5729384628186307, + "language_loss": 0.87350845, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89574361, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1015625, + "step": 2217, + "time_per_iteration": 2.435224771499634 + }, + { + "auxiliary_loss_clip": 0.01165439, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.02716112, + "balance_loss_mlp": 1.05609739, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6393181601709057, + "language_loss": 0.73460543, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2218, + "time_per_iteration": 2.4984188079833984 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02401495, + "balance_loss_mlp": 1.05406141, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.803125703936159, + "language_loss": 0.87483871, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89692807, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2219, + "time_per_iteration": 2.4761111736297607 + }, + { + "auxiliary_loss_clip": 0.01166927, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.03198123, + "balance_loss_mlp": 1.05804753, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5604165479120375, + "language_loss": 0.77241862, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79459906, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0859375, + "step": 2220, + "time_per_iteration": 2.5172770023345947 + }, + { + "auxiliary_loss_clip": 0.01158357, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.05065227, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.752699726256429, + "language_loss": 0.79361391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81564224, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.078125, + "step": 2221, + "time_per_iteration": 2.4729459285736084 + }, + { + "auxiliary_loss_clip": 0.01056162, + "auxiliary_loss_mlp": 0.01004009, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.01797867, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9620212456786271, + "language_loss": 0.6890744, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70967615, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.3828125, + "step": 2222, + "time_per_iteration": 2.9102694988250732 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.02885592, + "balance_loss_mlp": 1.05645049, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8990549263762904, + "language_loss": 0.66966134, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69180298, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1015625, + "step": 2223, + "time_per_iteration": 2.4860363006591797 + }, + { + "auxiliary_loss_clip": 0.01162257, + "auxiliary_loss_mlp": 0.01055999, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.05173874, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.0940561003244738, + "language_loss": 0.82572883, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84791142, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2224, + "time_per_iteration": 2.453310966491699 + }, + { + "auxiliary_loss_clip": 0.01167505, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.05410361, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0797940389634624, + "language_loss": 0.66006851, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68221462, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2225, + "time_per_iteration": 2.505760669708252 + }, + { + "auxiliary_loss_clip": 0.01164479, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03491461, + "balance_loss_mlp": 1.05366707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2490181158076545, + "language_loss": 0.89484501, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91703951, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2226, + "time_per_iteration": 3.827432632446289 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.03132319, + "balance_loss_mlp": 1.05492473, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0692514385202947, + "language_loss": 0.73874348, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76091796, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1171875, + "step": 2227, + "time_per_iteration": 5.469221115112305 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.02971888, + "balance_loss_mlp": 1.05582607, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.597241668203809, + "language_loss": 0.8519839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87414384, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2228, + "time_per_iteration": 2.449289560317993 + }, + { + "auxiliary_loss_clip": 0.01162737, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0275687, + "balance_loss_mlp": 1.05501461, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.700498827765594, + "language_loss": 0.8100034, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83210707, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2229, + "time_per_iteration": 2.454185962677002 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.05576682, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.350850930683171, + "language_loss": 0.73814881, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76035661, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2230, + "time_per_iteration": 2.538679838180542 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.03551102, + "balance_loss_mlp": 1.0541544, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 8.27737726970052, + "language_loss": 0.79914325, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82135391, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1171875, + "step": 2231, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05716896, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9954658779127024, + "language_loss": 0.72341192, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74558049, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2232, + "time_per_iteration": 2.5315330028533936 + }, + { + "auxiliary_loss_clip": 0.01169038, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.02664888, + "balance_loss_mlp": 1.05505097, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.53729194427275, + "language_loss": 0.65508974, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67725778, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2233, + "time_per_iteration": 2.480006694793701 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.02687883, + "balance_loss_mlp": 1.05011904, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 4.541384002557222, + "language_loss": 0.81492066, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8370105, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1171875, + "step": 2234, + "time_per_iteration": 2.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.03466105, + "balance_loss_mlp": 1.05424869, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9481483268780417, + "language_loss": 0.82361299, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84581894, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1171875, + "step": 2235, + "time_per_iteration": 2.4478979110717773 + }, + { + "auxiliary_loss_clip": 0.0116322, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.03378713, + "balance_loss_mlp": 1.05170834, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6568048404288893, + "language_loss": 0.86399209, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88618279, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2236, + "time_per_iteration": 2.534761428833008 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.05506372, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5401183277834882, + "language_loss": 0.76936173, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79150563, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2237, + "time_per_iteration": 2.454881191253662 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.02974725, + "balance_loss_mlp": 1.05312407, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.976295310563951, + "language_loss": 0.78737688, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80954033, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2238, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03057706, + "balance_loss_mlp": 1.0530107, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3078790626960246, + "language_loss": 0.67977941, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70191795, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.09375, + "step": 2239, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.05296254, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.7482132203763245, + "language_loss": 0.81085825, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83300203, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2240, + "time_per_iteration": 2.458702802658081 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.05302262, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.775061814751768, + "language_loss": 0.77491653, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79708141, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2241, + "time_per_iteration": 2.4814610481262207 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.0265156, + "balance_loss_mlp": 1.05368328, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.708340264075402, + "language_loss": 0.83106101, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85311437, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0625, + "step": 2242, + "time_per_iteration": 2.531010627746582 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.05465889, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.594763109819468, + "language_loss": 0.64927268, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67146331, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.125, + "step": 2243, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.05214143, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.6702464572283469, + "language_loss": 0.72275442, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74479383, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2244, + "time_per_iteration": 2.572275161743164 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.03339577, + "balance_loss_mlp": 1.0510093, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6251739599249553, + "language_loss": 0.86419517, + "learning_rate": 3.88550929909221e-06, + "loss": 0.886334, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1015625, + "step": 2245, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.0534606, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.986035604010071, + "language_loss": 0.79054129, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81263721, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2246, + "time_per_iteration": 2.521500825881958 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01006436, + "balance_loss_clip": 1.00378919, + "balance_loss_mlp": 1.01705432, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7498554605470831, + "language_loss": 0.60597092, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6265747, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.3671875, + "step": 2247, + "time_per_iteration": 3.209567070007324 + }, + { + "auxiliary_loss_clip": 0.0117261, + "auxiliary_loss_mlp": 0.01058621, + "balance_loss_clip": 1.03629315, + "balance_loss_mlp": 1.05673957, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.930333372025318, + "language_loss": 0.81250268, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83481503, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2248, + "time_per_iteration": 2.5274717807769775 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.02503014, + "balance_loss_mlp": 1.0515008, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.1598236051462383, + "language_loss": 0.77427459, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79628301, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0625, + "step": 2249, + "time_per_iteration": 2.475325345993042 + }, + { + "auxiliary_loss_clip": 0.01161564, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03477216, + "balance_loss_mlp": 1.05408192, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4620260499768896, + "language_loss": 0.84598488, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86813927, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0703125, + "step": 2250, + "time_per_iteration": 2.5579018592834473 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.02586317, + "balance_loss_mlp": 1.05311561, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9830962049575767, + "language_loss": 0.8213973, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84349537, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1015625, + "step": 2251, + "time_per_iteration": 2.459254503250122 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.03144348, + "balance_loss_mlp": 1.05075097, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.6927381248236872, + "language_loss": 0.85981321, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88194835, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.09375, + "step": 2252, + "time_per_iteration": 2.508246421813965 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.0100648, + "balance_loss_clip": 1.00398886, + "balance_loss_mlp": 1.01368976, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7502755191421498, + "language_loss": 0.61736262, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63793439, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.37109375, + "step": 2253, + "time_per_iteration": 3.1357691287994385 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.0357219, + "balance_loss_mlp": 1.05454588, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.033104819567641, + "language_loss": 0.89383745, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91603261, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2254, + "time_per_iteration": 2.4983997344970703 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.02786362, + "balance_loss_mlp": 1.05202925, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.0851597725495843, + "language_loss": 0.84461302, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86678338, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.140625, + "step": 2255, + "time_per_iteration": 2.4466094970703125 + }, + { + "auxiliary_loss_clip": 0.01159898, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.05059099, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.8290739743459126, + "language_loss": 0.7493006, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77136725, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.09375, + "step": 2256, + "time_per_iteration": 2.49464750289917 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.05080438, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.107811937736733, + "language_loss": 0.83023381, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85237086, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 2257, + "time_per_iteration": 2.4069128036499023 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.03272712, + "balance_loss_mlp": 1.05211377, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.2162023158830655, + "language_loss": 0.82266492, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84489298, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.15625, + "step": 2258, + "time_per_iteration": 2.4187939167022705 + }, + { + "auxiliary_loss_clip": 0.01161942, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.02890849, + "balance_loss_mlp": 1.05117583, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3528312033652434, + "language_loss": 0.82556236, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84770095, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.109375, + "step": 2259, + "time_per_iteration": 2.4182498455047607 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0313561, + "balance_loss_mlp": 1.05370188, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9951846625000045, + "language_loss": 0.73434722, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75647175, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0859375, + "step": 2260, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01160597, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.03389525, + "balance_loss_mlp": 1.05164778, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.6406640236232826, + "language_loss": 0.75450647, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77664864, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2261, + "time_per_iteration": 2.4773809909820557 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02887654, + "balance_loss_mlp": 1.05329657, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.9984757312973846, + "language_loss": 0.63141024, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1171875, + "step": 2262, + "time_per_iteration": 2.5423331260681152 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.598036861128168, + "language_loss": 0.82363462, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84568739, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2263, + "time_per_iteration": 2.472050428390503 + }, + { + "auxiliary_loss_clip": 0.01166147, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.05306447, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7757676532235749, + "language_loss": 0.87984985, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90212959, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1328125, + "step": 2264, + "time_per_iteration": 2.4857943058013916 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.02700329, + "balance_loss_mlp": 1.05115557, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 2.9904691281538693, + "language_loss": 0.7103616, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73248434, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2265, + "time_per_iteration": 2.428753614425659 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02187812, + "balance_loss_mlp": 1.05258036, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.049615390343222, + "language_loss": 0.66760135, + "learning_rate": 3.882766051566027e-06, + "loss": 0.689623, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2266, + "time_per_iteration": 2.4990508556365967 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.05220675, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7538751206895893, + "language_loss": 0.76376909, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78596711, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2267, + "time_per_iteration": 2.485907554626465 + }, + { + "auxiliary_loss_clip": 0.0116058, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.05051804, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.002795226804265, + "language_loss": 0.81781995, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83988714, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1015625, + "step": 2268, + "time_per_iteration": 3.890936851501465 + }, + { + "auxiliary_loss_clip": 0.01161581, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.0542717, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.071095479959133, + "language_loss": 0.76078153, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78285825, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2269, + "time_per_iteration": 4.03081202507019 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.02612138, + "balance_loss_mlp": 1.05518508, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.039865659244694, + "language_loss": 0.80856502, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83068502, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2270, + "time_per_iteration": 2.431426525115967 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.05227089, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.715242097566801, + "language_loss": 0.75720018, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77940053, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.125, + "step": 2271, + "time_per_iteration": 2.440701961517334 + }, + { + "auxiliary_loss_clip": 0.01161613, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.05171776, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2487551674667565, + "language_loss": 0.80084515, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82298499, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1015625, + "step": 2272, + "time_per_iteration": 2.4305598735809326 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01011943, + "balance_loss_clip": 1.00937963, + "balance_loss_mlp": 1.01818228, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7032235049035468, + "language_loss": 0.60682511, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62750536, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.37890625, + "step": 2273, + "time_per_iteration": 3.1601598262786865 + }, + { + "auxiliary_loss_clip": 0.01158579, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.05170178, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7482195510707834, + "language_loss": 0.77978206, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80184555, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2274, + "time_per_iteration": 2.448374032974243 + }, + { + "auxiliary_loss_clip": 0.01163563, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.0536654, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.152740159395537, + "language_loss": 0.78435361, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80645764, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2275, + "time_per_iteration": 2.4761078357696533 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02003431, + "balance_loss_mlp": 1.05312562, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.879456622893362, + "language_loss": 0.81436646, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0859375, + "step": 2276, + "time_per_iteration": 2.453623056411743 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.03082716, + "balance_loss_mlp": 1.05443549, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7308629221608576, + "language_loss": 0.69347179, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71571183, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.15625, + "step": 2277, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01162034, + "auxiliary_loss_mlp": 0.01051118, + "balance_loss_clip": 1.03056657, + "balance_loss_mlp": 1.05136657, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.1796180013972384, + "language_loss": 0.80487186, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2278, + "time_per_iteration": 2.478158950805664 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.02788246, + "balance_loss_mlp": 1.05658543, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.2222454745927744, + "language_loss": 0.74863833, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2279, + "time_per_iteration": 2.5930991172790527 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.05331779, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.3437990696634916, + "language_loss": 0.76614088, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78833258, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1328125, + "step": 2280, + "time_per_iteration": 2.527808666229248 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.0272876, + "balance_loss_mlp": 1.04930711, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7035700975942816, + "language_loss": 0.79808372, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82011348, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.078125, + "step": 2281, + "time_per_iteration": 2.5486884117126465 + }, + { + "auxiliary_loss_clip": 0.01167882, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.03618872, + "balance_loss_mlp": 1.05488086, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.697672260024265, + "language_loss": 0.83955061, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86178571, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2282, + "time_per_iteration": 2.4731719493865967 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.05028629, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.8152250836173982, + "language_loss": 0.73821312, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76034367, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0703125, + "step": 2283, + "time_per_iteration": 2.5041310787200928 + }, + { + "auxiliary_loss_clip": 0.01161767, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02991104, + "balance_loss_mlp": 1.05546188, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.845966051455131, + "language_loss": 0.83875519, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86085427, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2284, + "time_per_iteration": 2.489459991455078 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.05016088, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.9356174938409232, + "language_loss": 0.74778754, + "learning_rate": 3.880256934503974e-06, + "loss": 0.76991928, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 2285, + "time_per_iteration": 2.542114734649658 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.02680647, + "balance_loss_mlp": 1.05192137, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.7476035379248278, + "language_loss": 0.74461651, + "learning_rate": 3.880124162414689e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0703125, + "step": 2286, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01165905, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.02407491, + "balance_loss_mlp": 1.05466056, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4229799840234936, + "language_loss": 0.86074513, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88285446, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2287, + "time_per_iteration": 2.5267093181610107 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.05281329, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.1686670508464783, + "language_loss": 0.68304116, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70512998, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.09375, + "step": 2288, + "time_per_iteration": 2.6589176654815674 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.03410959, + "balance_loss_mlp": 1.05404294, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 3.8263362529629896, + "language_loss": 0.87251699, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89468765, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2289, + "time_per_iteration": 2.4834415912628174 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.02735722, + "balance_loss_mlp": 1.0496552, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.801469753111382, + "language_loss": 0.74045157, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76245451, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2290, + "time_per_iteration": 2.4901175498962402 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01003238, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.01923215, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7021136788609851, + "language_loss": 0.5160234, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53662229, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.375, + "step": 2291, + "time_per_iteration": 3.1141176223754883 + }, + { + "auxiliary_loss_clip": 0.01158988, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05007744, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.104305633549435, + "language_loss": 0.7090801, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73116004, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.09375, + "step": 2292, + "time_per_iteration": 2.5535075664520264 + }, + { + "auxiliary_loss_clip": 0.01160381, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.05272794, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.835181445389694, + "language_loss": 0.79774708, + "learning_rate": 3.879192761826071e-06, + "loss": 0.81979978, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.078125, + "step": 2293, + "time_per_iteration": 2.4434242248535156 + }, + { + "auxiliary_loss_clip": 0.01159833, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_clip": 1.03065419, + "balance_loss_mlp": 1.0489893, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.8100583587938566, + "language_loss": 0.78455698, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80665964, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2294, + "time_per_iteration": 2.5279018878936768 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.02679634, + "balance_loss_mlp": 1.05053687, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.844605455172751, + "language_loss": 0.80448526, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82649422, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0546875, + "step": 2295, + "time_per_iteration": 2.46471905708313 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.02604938, + "balance_loss_mlp": 1.04990947, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905081494696058, + "language_loss": 0.78027165, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80231106, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0859375, + "step": 2296, + "time_per_iteration": 2.489081859588623 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03356612, + "balance_loss_mlp": 1.05272174, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8577842545242083, + "language_loss": 0.78632545, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80845773, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2297, + "time_per_iteration": 2.479617118835449 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.03187263, + "balance_loss_mlp": 1.05133367, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.1383795008624946, + "language_loss": 0.69005466, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71213776, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2298, + "time_per_iteration": 2.4894726276397705 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03075552, + "balance_loss_mlp": 1.05287397, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7932718261070644, + "language_loss": 0.86958891, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89172935, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2299, + "time_per_iteration": 2.4343175888061523 + }, + { + "auxiliary_loss_clip": 0.01158457, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.03221393, + "balance_loss_mlp": 1.05076718, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.6477233854648015, + "language_loss": 0.7542398, + "learning_rate": 3.878257869538267e-06, + "loss": 0.7763505, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.078125, + "step": 2300, + "time_per_iteration": 2.5398943424224854 + }, + { + "auxiliary_loss_clip": 0.01160789, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.03088915, + "balance_loss_mlp": 1.05409729, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.6084363319634956, + "language_loss": 0.82612532, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8482368, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0625, + "step": 2301, + "time_per_iteration": 2.435732841491699 + }, + { + "auxiliary_loss_clip": 0.01155849, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.02461374, + "balance_loss_mlp": 1.04986811, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.0886382571109987, + "language_loss": 0.85972583, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8817209, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0625, + "step": 2302, + "time_per_iteration": 2.504011869430542 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.01009124, + "balance_loss_clip": 1.00688314, + "balance_loss_mlp": 1.0189817, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7554932596602951, + "language_loss": 0.65648526, + "learning_rate": 3.877856132957667e-06, + "loss": 0.677131, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.36328125, + "step": 2303, + "time_per_iteration": 3.2563750743865967 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 1.05022073, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.0694955360834912, + "language_loss": 0.78234196, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80427974, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2304, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01991165, + "balance_loss_mlp": 1.05225086, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.838077080535218, + "language_loss": 0.77824223, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8002485, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.09375, + "step": 2305, + "time_per_iteration": 2.468254804611206 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02732027, + "balance_loss_mlp": 1.04923558, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 3.2063314507866947, + "language_loss": 0.87484217, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89684129, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2306, + "time_per_iteration": 2.4840242862701416 + }, + { + "auxiliary_loss_clip": 0.0105475, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.01749539, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8793018572536648, + "language_loss": 0.59049129, + "learning_rate": 3.877319487288387e-06, + "loss": 0.6111598, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.37304688, + "step": 2307, + "time_per_iteration": 3.1098880767822266 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.05279016, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7539420555734833, + "language_loss": 0.79683769, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81892413, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2308, + "time_per_iteration": 2.5119385719299316 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1876242684272342, + "language_loss": 0.78186178, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80388331, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2309, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.05319023, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9671645437439387, + "language_loss": 0.67473733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69683367, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2310, + "time_per_iteration": 5.331011056900024 + }, + { + "auxiliary_loss_clip": 0.01159907, + "auxiliary_loss_mlp": 0.01051301, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.0511837, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8339330301012977, + "language_loss": 0.83962393, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86173606, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0859375, + "step": 2311, + "time_per_iteration": 2.4287211894989014 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.05262017, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.2677083380951997, + "language_loss": 0.81788063, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83999264, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2312, + "time_per_iteration": 2.5261852741241455 + }, + { + "auxiliary_loss_clip": 0.01165344, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.05353236, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.1868066623869202, + "language_loss": 0.86641061, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88851982, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1171875, + "step": 2313, + "time_per_iteration": 2.491847515106201 + }, + { + "auxiliary_loss_clip": 0.0116138, + "auxiliary_loss_mlp": 0.01052556, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.05377281, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.199884337980412, + "language_loss": 0.79629153, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8184309, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2314, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.0116003, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.02668452, + "balance_loss_mlp": 1.05130863, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.30759926974498, + "language_loss": 0.86246645, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88453007, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0859375, + "step": 2315, + "time_per_iteration": 2.4236056804656982 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_clip": 1.03192866, + "balance_loss_mlp": 1.05146074, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.162038852448813, + "language_loss": 0.77074778, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79286408, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.078125, + "step": 2316, + "time_per_iteration": 2.4574813842773438 + }, + { + "auxiliary_loss_clip": 0.01157842, + "auxiliary_loss_mlp": 0.01058721, + "balance_loss_clip": 1.03733492, + "balance_loss_mlp": 1.05045736, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6719823206156588, + "language_loss": 0.76972795, + "learning_rate": 3.875972890659349e-06, + "loss": 0.7918936, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.078125, + "step": 2317, + "time_per_iteration": 2.448096990585327 + }, + { + "auxiliary_loss_clip": 0.01162372, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.02993095, + "balance_loss_mlp": 1.05272126, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.004328537884534, + "language_loss": 0.80159998, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82372165, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2318, + "time_per_iteration": 2.5152556896209717 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.00027394, + "balance_loss_mlp": 1.01373565, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8654041988705774, + "language_loss": 0.59008324, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61061358, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.3671875, + "step": 2319, + "time_per_iteration": 3.101083993911743 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01053809, + "balance_loss_clip": 1.03365111, + "balance_loss_mlp": 1.05213809, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2307371496542356, + "language_loss": 0.65372109, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67588449, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2320, + "time_per_iteration": 2.580655336380005 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.0507009, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6249908375914148, + "language_loss": 0.70695353, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72896051, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2321, + "time_per_iteration": 2.4594380855560303 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.01055348, + "balance_loss_clip": 1.0345459, + "balance_loss_mlp": 1.04883599, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 43.01057366099128, + "language_loss": 0.86161166, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88375086, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2322, + "time_per_iteration": 2.4912750720977783 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.04840016, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.7187096085030618, + "language_loss": 0.6682983, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69038773, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2323, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0116621, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.03068066, + "balance_loss_mlp": 1.05250573, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0268681764850665, + "language_loss": 0.89011461, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91228795, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2324, + "time_per_iteration": 2.458172559738159 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01056649, + "balance_loss_clip": 1.03626466, + "balance_loss_mlp": 1.04949069, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 4.4201897818475775, + "language_loss": 0.70700991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7291714, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2325, + "time_per_iteration": 2.4608585834503174 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01055057, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 1.05384755, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8512202881484865, + "language_loss": 0.81165004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2326, + "time_per_iteration": 2.474729537963867 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 1.05092621, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.806872548679543, + "language_loss": 0.88955671, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9115777, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0390625, + "step": 2327, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.03790593, + "balance_loss_mlp": 1.05021226, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.4750320646827992, + "language_loss": 0.85236871, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87450516, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2328, + "time_per_iteration": 2.4724884033203125 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.05120313, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.653872228613324, + "language_loss": 0.74084997, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76292944, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2329, + "time_per_iteration": 2.5238442420959473 + }, + { + "auxiliary_loss_clip": 0.01156952, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.03070641, + "balance_loss_mlp": 1.04729962, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 1.840223813628444, + "language_loss": 0.77969897, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80177212, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2330, + "time_per_iteration": 2.468606948852539 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02738333, + "balance_loss_mlp": 1.0495398, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.6993483396219506, + "language_loss": 0.72030222, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74232423, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0625, + "step": 2331, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.05008936, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.585347596838152, + "language_loss": 0.72609055, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74813151, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2332, + "time_per_iteration": 2.4244635105133057 + }, + { + "auxiliary_loss_clip": 0.01047328, + "auxiliary_loss_mlp": 0.01002801, + "balance_loss_clip": 1.00048828, + "balance_loss_mlp": 1.01059568, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8290843953692559, + "language_loss": 0.56071591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58121729, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.3671875, + "step": 2333, + "time_per_iteration": 2.8934712409973145 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.05001664, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7851490004805215, + "language_loss": 0.82529652, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2334, + "time_per_iteration": 2.495786428451538 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.0293529, + "balance_loss_mlp": 1.05012262, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8251700419130605, + "language_loss": 0.81237197, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83440989, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2335, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01163426, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.02829087, + "balance_loss_mlp": 1.05328035, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.83822789048078, + "language_loss": 0.82159901, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8437475, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.1015625, + "step": 2336, + "time_per_iteration": 2.4732770919799805 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.02526581, + "balance_loss_mlp": 1.05202782, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8506426201256954, + "language_loss": 0.80081403, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82283843, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2337, + "time_per_iteration": 2.4599671363830566 + }, + { + "auxiliary_loss_clip": 0.01155582, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02911353, + "balance_loss_mlp": 1.04861474, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.2474896580124963, + "language_loss": 0.7927807, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81482291, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2338, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.05685067, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.148660398501072, + "language_loss": 0.79827893, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82039273, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2339, + "time_per_iteration": 2.4672555923461914 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.0527122, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.7979240482106922, + "language_loss": 0.6582588, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68040884, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2340, + "time_per_iteration": 2.614506483078003 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.02936912, + "balance_loss_mlp": 1.05242825, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.5431372850663334, + "language_loss": 0.78670812, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80874836, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2341, + "time_per_iteration": 2.4420077800750732 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_clip": 1.03058767, + "balance_loss_mlp": 1.05246425, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 11.570217446637303, + "language_loss": 0.80154169, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82360554, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2342, + "time_per_iteration": 2.4961190223693848 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.05673313, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9358851833739352, + "language_loss": 0.77974075, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80176884, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2343, + "time_per_iteration": 2.479679584503174 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0025475, + "balance_loss_mlp": 1.01255798, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8341361150670269, + "language_loss": 0.6155628, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63610566, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3671875, + "step": 2344, + "time_per_iteration": 3.048691987991333 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.02309346, + "balance_loss_mlp": 1.04911709, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.886714907416039, + "language_loss": 0.64591062, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6678347, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0234375, + "step": 2345, + "time_per_iteration": 2.509552240371704 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.023803, + "balance_loss_mlp": 1.05019534, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.502398022219224, + "language_loss": 0.736485, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7585566, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1171875, + "step": 2346, + "time_per_iteration": 2.4962430000305176 + }, + { + "auxiliary_loss_clip": 0.01160187, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.05144429, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.4324488814849703, + "language_loss": 0.77868927, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80075288, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2347, + "time_per_iteration": 2.4663050174713135 + }, + { + "auxiliary_loss_clip": 0.01155281, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02852941, + "balance_loss_mlp": 1.04918981, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7514485331985392, + "language_loss": 0.76446569, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78648651, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2348, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.02346444, + "balance_loss_mlp": 1.0508523, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8350283773112115, + "language_loss": 0.8686446, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89063132, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2349, + "time_per_iteration": 2.4446985721588135 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.05220377, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.7285118920158233, + "language_loss": 0.8895669, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9115696, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2350, + "time_per_iteration": 2.49725341796875 + }, + { + "auxiliary_loss_clip": 0.0115943, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.02542889, + "balance_loss_mlp": 1.05281878, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8656651100546833, + "language_loss": 0.814816, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83687228, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0703125, + "step": 2351, + "time_per_iteration": 3.941416025161743 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.05032706, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6782915885510286, + "language_loss": 0.82935351, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85132694, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0703125, + "step": 2352, + "time_per_iteration": 5.431722640991211 + }, + { + "auxiliary_loss_clip": 0.01047453, + "auxiliary_loss_mlp": 0.01006216, + "balance_loss_clip": 1.00387907, + "balance_loss_mlp": 1.01053333, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.90864091090638, + "language_loss": 0.61894125, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63947791, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.36914062, + "step": 2353, + "time_per_iteration": 3.0071640014648438 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.05024958, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.8535903324814498, + "language_loss": 0.87264848, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89466572, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2354, + "time_per_iteration": 2.4613726139068604 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02607965, + "balance_loss_mlp": 1.04953241, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9651075901387003, + "language_loss": 0.74872321, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.078125, + "step": 2355, + "time_per_iteration": 2.442379951477051 + }, + { + "auxiliary_loss_clip": 0.01047047, + "auxiliary_loss_mlp": 0.01002716, + "balance_loss_clip": 1.00052261, + "balance_loss_mlp": 1.01023293, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6790475533637321, + "language_loss": 0.5182299, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53872752, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2356, + "time_per_iteration": 2.9892258644104004 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03495359, + "balance_loss_mlp": 1.05080867, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 3.0630792396255053, + "language_loss": 0.70576489, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72786456, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2357, + "time_per_iteration": 2.421844005584717 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.05012453, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8720076771552743, + "language_loss": 0.82205695, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84416115, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.09375, + "step": 2358, + "time_per_iteration": 2.4519011974334717 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.02663624, + "balance_loss_mlp": 1.051018, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 6.439592826280342, + "language_loss": 0.7129705, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73505127, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1015625, + "step": 2359, + "time_per_iteration": 2.4797613620758057 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02374041, + "balance_loss_mlp": 1.04988599, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 5.514404455287625, + "language_loss": 0.76040578, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78239685, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2360, + "time_per_iteration": 2.4538815021514893 + }, + { + "auxiliary_loss_clip": 0.011559, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.02173233, + "balance_loss_mlp": 1.05221295, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.1535632205539135, + "language_loss": 0.8188749, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84085315, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2361, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01152529, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.02826524, + "balance_loss_mlp": 1.04964995, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.775663525053056, + "language_loss": 0.74489617, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76689464, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2362, + "time_per_iteration": 2.530163049697876 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.0265274, + "balance_loss_mlp": 1.05187464, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 4.478599792998506, + "language_loss": 0.73748112, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75952733, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2363, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.02605534, + "balance_loss_mlp": 1.05005693, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8353407682080387, + "language_loss": 0.72971261, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75172973, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2364, + "time_per_iteration": 2.5670576095581055 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.05015445, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 4.452075303519762, + "language_loss": 0.90230036, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92430955, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 1.015625, + "step": 2365, + "time_per_iteration": 2.5130062103271484 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.02735198, + "balance_loss_mlp": 1.04896259, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.956458588852685, + "language_loss": 0.65377176, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67579615, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2366, + "time_per_iteration": 2.5081095695495605 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.03640223, + "balance_loss_mlp": 1.04979372, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.190613479881076, + "language_loss": 0.80414236, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82623357, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2367, + "time_per_iteration": 2.4398317337036133 + }, + { + "auxiliary_loss_clip": 0.01158941, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.05221498, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.898581267606924, + "language_loss": 0.82619941, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84833181, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2368, + "time_per_iteration": 2.512401580810547 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.05165803, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.7587049982231675, + "language_loss": 0.86971414, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89178908, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2369, + "time_per_iteration": 2.444784164428711 + }, + { + "auxiliary_loss_clip": 0.01158905, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.04913163, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4370193327140612, + "language_loss": 0.75704634, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77906322, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2370, + "time_per_iteration": 2.527740240097046 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0384295, + "balance_loss_mlp": 1.04879546, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7319048865171518, + "language_loss": 0.82923144, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85136044, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2371, + "time_per_iteration": 2.4644808769226074 + }, + { + "auxiliary_loss_clip": 0.01158835, + "auxiliary_loss_mlp": 0.01051346, + "balance_loss_clip": 1.03171265, + "balance_loss_mlp": 1.05157602, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.956158386855541, + "language_loss": 0.82575452, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84785628, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0703125, + "step": 2372, + "time_per_iteration": 2.42240047454834 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.03175569, + "balance_loss_mlp": 1.05134308, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.19442784605527, + "language_loss": 0.8396256, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86171949, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2373, + "time_per_iteration": 2.444695472717285 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03294528, + "balance_loss_mlp": 1.05012143, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.034088541649992, + "language_loss": 0.86271042, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88476801, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.046875, + "step": 2374, + "time_per_iteration": 2.428062915802002 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03024197, + "balance_loss_mlp": 1.05125451, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 4.612229602439842, + "language_loss": 0.7919687, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2375, + "time_per_iteration": 2.526838541030884 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.05240607, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.386247922788535, + "language_loss": 0.76400912, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78615618, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2376, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01156552, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.05075741, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.9035160782842753, + "language_loss": 0.93037754, + "learning_rate": 3.867744103671717e-06, + "loss": 0.952438, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2377, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02991319, + "balance_loss_mlp": 1.05085003, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9751577144221115, + "language_loss": 0.91598773, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93807983, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.0703125, + "step": 2378, + "time_per_iteration": 2.558563470840454 + }, + { + "auxiliary_loss_clip": 0.01159674, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.051296, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.745891074970689, + "language_loss": 0.73947102, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76151079, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2379, + "time_per_iteration": 2.511359214782715 + }, + { + "auxiliary_loss_clip": 0.01156473, + "auxiliary_loss_mlp": 0.01056109, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05014992, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8640465231226504, + "language_loss": 0.79013336, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81225914, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2380, + "time_per_iteration": 2.466219663619995 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.05528164, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.3244590707621073, + "language_loss": 0.87958229, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90172088, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.078125, + "step": 2381, + "time_per_iteration": 2.4476850032806396 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.03084123, + "balance_loss_mlp": 1.0517571, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.599935932772449, + "language_loss": 0.76852649, + "learning_rate": 3.867046846740299e-06, + "loss": 0.7906065, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2382, + "time_per_iteration": 2.4389045238494873 + }, + { + "auxiliary_loss_clip": 0.01157847, + "auxiliary_loss_mlp": 0.01053474, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.05068171, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.461149819336849, + "language_loss": 0.76948071, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79159391, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0703125, + "step": 2383, + "time_per_iteration": 2.516038179397583 + }, + { + "auxiliary_loss_clip": 0.01158748, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0299803, + "balance_loss_mlp": 1.05114412, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.169581662424978, + "language_loss": 0.88202822, + "learning_rate": 3.866767448340471e-06, + "loss": 0.9041245, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.078125, + "step": 2384, + "time_per_iteration": 2.42138934135437 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.02780819, + "balance_loss_mlp": 1.05382657, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 4.175812514986151, + "language_loss": 0.79225606, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81439185, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2385, + "time_per_iteration": 2.4439244270324707 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.02692771, + "balance_loss_mlp": 1.04881537, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.9672730758223058, + "language_loss": 0.74989617, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77192366, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2386, + "time_per_iteration": 2.533304214477539 + }, + { + "auxiliary_loss_clip": 0.01159067, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02777338, + "balance_loss_mlp": 1.05180025, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5174427688568626, + "language_loss": 0.78475344, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80681831, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0703125, + "step": 2387, + "time_per_iteration": 2.4568724632263184 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.05092847, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.559937991009886, + "language_loss": 0.82087159, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84299791, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2388, + "time_per_iteration": 2.5136237144470215 + }, + { + "auxiliary_loss_clip": 0.01161514, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.05393136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.471836270672028, + "language_loss": 0.82267237, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84473729, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.078125, + "step": 2389, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03020322, + "balance_loss_mlp": 1.05032301, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.165584666776674, + "language_loss": 0.82654548, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84867263, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2390, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_clip": 1.03406334, + "balance_loss_mlp": 1.0510571, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 3.0575281215329086, + "language_loss": 0.74616158, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76828718, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.078125, + "step": 2391, + "time_per_iteration": 2.5368545055389404 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01014282, + "balance_loss_clip": 1.0121367, + "balance_loss_mlp": 1.01461065, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8732258813949081, + "language_loss": 0.61769497, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63834715, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.36328125, + "step": 2392, + "time_per_iteration": 2.9315476417541504 + }, + { + "auxiliary_loss_clip": 0.01161818, + "auxiliary_loss_mlp": 0.01056559, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.04981267, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.638581894381379, + "language_loss": 0.76172751, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78391123, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2393, + "time_per_iteration": 3.857799530029297 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_clip": 1.02908611, + "balance_loss_mlp": 1.05249143, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8778469598095298, + "language_loss": 0.76782668, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78993082, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2394, + "time_per_iteration": 3.979130983352661 + }, + { + "auxiliary_loss_clip": 0.01158023, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.02836156, + "balance_loss_mlp": 1.05062532, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.605706810552395, + "language_loss": 0.85831755, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88038385, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.078125, + "step": 2395, + "time_per_iteration": 2.652092933654785 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.03040648, + "balance_loss_mlp": 1.05241179, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5230484666362787, + "language_loss": 0.82984561, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85192204, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0546875, + "step": 2396, + "time_per_iteration": 2.4647467136383057 + }, + { + "auxiliary_loss_clip": 0.01152766, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.02691364, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.435366869769497, + "language_loss": 0.82564163, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8476299, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2397, + "time_per_iteration": 2.4151055812835693 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.05216622, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6104109289920625, + "language_loss": 0.79418427, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81627429, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2398, + "time_per_iteration": 2.4470388889312744 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.05359757, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.7500042291552433, + "language_loss": 0.64847696, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67058688, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2399, + "time_per_iteration": 2.5123891830444336 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.05306005, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.4896130870957418, + "language_loss": 0.82329226, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84531689, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2400, + "time_per_iteration": 2.4825797080993652 + }, + { + "auxiliary_loss_clip": 0.01162323, + "auxiliary_loss_mlp": 0.01052957, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.053689, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.540874002782335, + "language_loss": 0.74606794, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76822078, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0859375, + "step": 2401, + "time_per_iteration": 2.507983684539795 + }, + { + "auxiliary_loss_clip": 0.01156636, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.0269084, + "balance_loss_mlp": 1.05109596, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7568662987329828, + "language_loss": 0.80577219, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82780313, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2402, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.01156436, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.02880669, + "balance_loss_mlp": 1.05137098, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.1115432529250753, + "language_loss": 0.84918672, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87124002, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.046875, + "step": 2403, + "time_per_iteration": 2.4267525672912598 + }, + { + "auxiliary_loss_clip": 0.01155438, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.03312445, + "balance_loss_mlp": 1.04934669, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.423742001713465, + "language_loss": 0.70017314, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72226501, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2404, + "time_per_iteration": 2.487827777862549 + }, + { + "auxiliary_loss_clip": 0.01151274, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.0473218, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.368746641876408, + "language_loss": 0.72847003, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75046992, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0390625, + "step": 2405, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02879858, + "balance_loss_mlp": 1.04891181, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2064790582144473, + "language_loss": 0.73115766, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75316191, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2406, + "time_per_iteration": 2.4501168727874756 + }, + { + "auxiliary_loss_clip": 0.0115439, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.03161645, + "balance_loss_mlp": 1.04889357, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.997473868200426, + "language_loss": 0.75399184, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77606416, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2407, + "time_per_iteration": 2.482008934020996 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.02418649, + "balance_loss_mlp": 1.04607177, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.6082248834480546, + "language_loss": 0.79472804, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81668091, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0625, + "step": 2408, + "time_per_iteration": 2.4657323360443115 + }, + { + "auxiliary_loss_clip": 0.01155517, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.05088127, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1979655558708893, + "language_loss": 0.82594806, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84802014, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.046875, + "step": 2409, + "time_per_iteration": 2.450345039367676 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_clip": 1.03411365, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.954409921875598, + "language_loss": 0.74561608, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7677173, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0625, + "step": 2410, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02883387, + "balance_loss_mlp": 1.04858971, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.3844352739280597, + "language_loss": 0.81135416, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83336866, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0546875, + "step": 2411, + "time_per_iteration": 2.4708898067474365 + }, + { + "auxiliary_loss_clip": 0.0115486, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.03403103, + "balance_loss_mlp": 1.05123138, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.954560524414831, + "language_loss": 0.69816971, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7202487, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2412, + "time_per_iteration": 2.5614776611328125 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.05100143, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.1541085269745803, + "language_loss": 0.77347231, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79548067, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2413, + "time_per_iteration": 2.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01049286, + "auxiliary_loss_mlp": 0.0100403, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.01294982, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152840666775347, + "language_loss": 0.58887923, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60941237, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.36328125, + "step": 2414, + "time_per_iteration": 2.9752402305603027 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 0.99943656, + "balance_loss_mlp": 1.01240802, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8348908268898737, + "language_loss": 0.6218617, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64236534, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.36328125, + "step": 2415, + "time_per_iteration": 3.039710521697998 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_clip": 1.02637458, + "balance_loss_mlp": 1.04699647, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.8743578134099377, + "language_loss": 0.72001135, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74199259, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2416, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.0104556, + "auxiliary_loss_mlp": 0.01005813, + "balance_loss_clip": 1.00379848, + "balance_loss_mlp": 1.01002693, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.711670432605859, + "language_loss": 0.60392165, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62443542, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.35546875, + "step": 2417, + "time_per_iteration": 3.0824739933013916 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.03639972, + "balance_loss_mlp": 1.04795754, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9144560714513363, + "language_loss": 0.79237175, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8144896, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2418, + "time_per_iteration": 2.564497947692871 + }, + { + "auxiliary_loss_clip": 0.01150145, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.03267837, + "balance_loss_mlp": 1.04712582, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8755047341617508, + "language_loss": 0.72032261, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74234051, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2419, + "time_per_iteration": 2.457617998123169 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.0335021, + "balance_loss_mlp": 1.05042267, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3659429121693525, + "language_loss": 0.90125811, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92333627, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.03125, + "step": 2420, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.04868603, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.2940003535379057, + "language_loss": 0.83309549, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85520703, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0546875, + "step": 2421, + "time_per_iteration": 2.441432476043701 + }, + { + "auxiliary_loss_clip": 0.01153189, + "auxiliary_loss_mlp": 0.01053683, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.04684627, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.6167157199382733, + "language_loss": 0.81511533, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83718407, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2422, + "time_per_iteration": 2.473010540008545 + }, + { + "auxiliary_loss_clip": 0.01046424, + "auxiliary_loss_mlp": 0.01017838, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.01065397, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9226410759759552, + "language_loss": 0.63245702, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65309966, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.35742188, + "step": 2423, + "time_per_iteration": 3.0516433715820312 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.02756512, + "balance_loss_mlp": 1.05096769, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7656587875688796, + "language_loss": 0.8267172, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84872198, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.046875, + "step": 2424, + "time_per_iteration": 2.4918792247772217 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03071296, + "balance_loss_mlp": 1.04970837, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.0603730404595915, + "language_loss": 0.79317909, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81520677, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2425, + "time_per_iteration": 2.4607083797454834 + }, + { + "auxiliary_loss_clip": 0.01155851, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.05136847, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4026453111661703, + "language_loss": 0.83269531, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85473925, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2426, + "time_per_iteration": 2.4615883827209473 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.02420735, + "balance_loss_mlp": 1.05100346, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.78851961601388, + "language_loss": 0.86878085, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89073801, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0234375, + "step": 2427, + "time_per_iteration": 2.46846866607666 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.05060291, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9424277979169204, + "language_loss": 0.66795039, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69001138, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2428, + "time_per_iteration": 2.4277987480163574 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.02870345, + "balance_loss_mlp": 1.05036306, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7447652065053452, + "language_loss": 0.8363744, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2429, + "time_per_iteration": 2.5208661556243896 + }, + { + "auxiliary_loss_clip": 0.01152615, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.04804671, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.723947749216575, + "language_loss": 0.78811824, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8101294, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2430, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0115835, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03014231, + "balance_loss_mlp": 1.0529238, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.3723861833809767, + "language_loss": 0.83178174, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85385835, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2431, + "time_per_iteration": 2.468472480773926 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.03244448, + "balance_loss_mlp": 1.05131185, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7402379411604871, + "language_loss": 0.78777766, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80989814, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.078125, + "step": 2432, + "time_per_iteration": 2.4618513584136963 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.03158331, + "balance_loss_mlp": 1.04917812, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9105383938395448, + "language_loss": 0.79940903, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82146859, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2433, + "time_per_iteration": 2.4901435375213623 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.01051119, + "balance_loss_clip": 1.03149712, + "balance_loss_mlp": 1.05186844, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.8984055506020234, + "language_loss": 0.78421938, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80625868, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2434, + "time_per_iteration": 3.833007335662842 + }, + { + "auxiliary_loss_clip": 0.01046525, + "auxiliary_loss_mlp": 0.01005945, + "balance_loss_clip": 1.00356054, + "balance_loss_mlp": 1.01038933, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8674820067375166, + "language_loss": 0.58373666, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60426134, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.36132812, + "step": 2435, + "time_per_iteration": 5.911077499389648 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.02620411, + "balance_loss_mlp": 1.04662895, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.2832294661951753, + "language_loss": 0.88395989, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90589368, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2436, + "time_per_iteration": 2.440303325653076 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.05032742, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.0196076648737, + "language_loss": 0.74832988, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7703594, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2437, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.04798007, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.346268485469047, + "language_loss": 0.73932636, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76131272, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2438, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.02310383, + "balance_loss_mlp": 1.05231857, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.8289443089735578, + "language_loss": 0.74791402, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76987189, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2439, + "time_per_iteration": 2.4596338272094727 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.02872145, + "balance_loss_mlp": 1.04913521, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.106046924266039, + "language_loss": 0.74542844, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742673, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 2440, + "time_per_iteration": 2.613889217376709 + }, + { + "auxiliary_loss_clip": 0.01146734, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02964425, + "balance_loss_mlp": 1.04660702, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6151911954653986, + "language_loss": 0.83047861, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242939, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2441, + "time_per_iteration": 2.508570432662964 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.0275681, + "balance_loss_mlp": 1.04952955, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 3.362343971731744, + "language_loss": 0.71562135, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73766863, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2442, + "time_per_iteration": 2.4903416633605957 + }, + { + "auxiliary_loss_clip": 0.01160717, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.0510819, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.2762909335645043, + "language_loss": 0.80804002, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83006966, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2443, + "time_per_iteration": 2.424539089202881 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.0504694, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 2.077049554342068, + "language_loss": 0.8297509, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85179389, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2444, + "time_per_iteration": 2.4937214851379395 + }, + { + "auxiliary_loss_clip": 0.01154781, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.05025554, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.763635964291881, + "language_loss": 0.71218902, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73422623, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2445, + "time_per_iteration": 2.491645336151123 + }, + { + "auxiliary_loss_clip": 0.01045345, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.00942683, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8232649654452494, + "language_loss": 0.63138294, + "learning_rate": 3.857965866494923e-06, + "loss": 0.6521225, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.359375, + "step": 2446, + "time_per_iteration": 2.9610531330108643 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.02355385, + "balance_loss_mlp": 1.05348802, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8119571313268434, + "language_loss": 0.74937665, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7713967, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2447, + "time_per_iteration": 2.547112226486206 + }, + { + "auxiliary_loss_clip": 0.0115445, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02838051, + "balance_loss_mlp": 1.04998112, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.0554455972062744, + "language_loss": 0.85722244, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87923658, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2448, + "time_per_iteration": 2.519530773162842 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.0081377, + "balance_loss_mlp": 1.00952029, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7649510042513386, + "language_loss": 0.56836212, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58892155, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.359375, + "step": 2449, + "time_per_iteration": 3.0049068927764893 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.04850447, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.900224172693126, + "language_loss": 0.85544562, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87738931, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2450, + "time_per_iteration": 2.5826945304870605 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.05074143, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.029178420182481, + "language_loss": 0.74693608, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76899183, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2451, + "time_per_iteration": 2.4345250129699707 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.02092934, + "balance_loss_mlp": 1.04758763, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6073898366987713, + "language_loss": 0.82240498, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8442679, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2452, + "time_per_iteration": 2.468869924545288 + }, + { + "auxiliary_loss_clip": 0.01154267, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02032936, + "balance_loss_mlp": 1.05154371, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.7191329381743174, + "language_loss": 0.74021572, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76214325, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2453, + "time_per_iteration": 2.433424472808838 + }, + { + "auxiliary_loss_clip": 0.01154761, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.03048682, + "balance_loss_mlp": 1.04918802, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.171465059586897, + "language_loss": 0.76326835, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78531623, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2454, + "time_per_iteration": 2.419368028640747 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.04922831, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.006370127686132, + "language_loss": 0.8301537, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85209435, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2455, + "time_per_iteration": 2.426819324493408 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.04846048, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.442844218228049, + "language_loss": 0.83938581, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8613984, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.09375, + "step": 2456, + "time_per_iteration": 2.525296926498413 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.02459574, + "balance_loss_mlp": 1.04980254, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8839437807359896, + "language_loss": 0.84325618, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86520827, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2457, + "time_per_iteration": 2.471068859100342 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.04932761, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9862753985638202, + "language_loss": 0.75645256, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77835512, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2458, + "time_per_iteration": 2.44146466255188 + }, + { + "auxiliary_loss_clip": 0.01160318, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.0284996, + "balance_loss_mlp": 1.05119324, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.405388225865701, + "language_loss": 0.83817005, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86026746, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2459, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.0489651, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.6666731923680733, + "language_loss": 0.75856471, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78047681, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2460, + "time_per_iteration": 2.4294657707214355 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.05102873, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6904429322803973, + "language_loss": 0.81591463, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83791113, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0703125, + "step": 2461, + "time_per_iteration": 2.4993178844451904 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.05356562, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2471604819605036, + "language_loss": 0.65689576, + "learning_rate": 3.855650475213761e-06, + "loss": 0.678958, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2462, + "time_per_iteration": 2.4197235107421875 + }, + { + "auxiliary_loss_clip": 0.0115574, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.02929282, + "balance_loss_mlp": 1.05148113, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4717210360784851, + "language_loss": 0.67368174, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69572735, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0390625, + "step": 2463, + "time_per_iteration": 2.774268865585327 + }, + { + "auxiliary_loss_clip": 0.01154664, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 1.04978383, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.177919724516607, + "language_loss": 0.76567936, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78772676, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2464, + "time_per_iteration": 2.4522674083709717 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.03089297, + "balance_loss_mlp": 1.05009413, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.623144605896263, + "language_loss": 0.79623306, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81824923, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0234375, + "step": 2465, + "time_per_iteration": 2.4946794509887695 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02543747, + "balance_loss_mlp": 1.0522809, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.8838905575360925, + "language_loss": 0.76230991, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78436887, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2466, + "time_per_iteration": 2.4722483158111572 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01613474, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8013334536894682, + "language_loss": 0.60022712, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62095666, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.3671875, + "step": 2467, + "time_per_iteration": 3.0702927112579346 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.05059397, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.3345318496369405, + "language_loss": 0.87671721, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89869595, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.046875, + "step": 2468, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.0322901, + "balance_loss_mlp": 1.05078602, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 4.884804263226826, + "language_loss": 0.75884396, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78094912, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2469, + "time_per_iteration": 2.4750967025756836 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.03425384, + "balance_loss_mlp": 1.04954958, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.457578452134473, + "language_loss": 0.76183128, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78390741, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2470, + "time_per_iteration": 2.4312937259674072 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.05050206, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9398758609720104, + "language_loss": 0.72121894, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74320322, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2471, + "time_per_iteration": 2.519866466522217 + }, + { + "auxiliary_loss_clip": 0.01160204, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.0272181, + "balance_loss_mlp": 1.0499022, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.11598070664324, + "language_loss": 0.89739621, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91947466, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1015625, + "step": 2472, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_clip": 1.030123, + "balance_loss_mlp": 1.05059123, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 4.013793804030176, + "language_loss": 0.80734539, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82939184, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2473, + "time_per_iteration": 2.4329466819763184 + }, + { + "auxiliary_loss_clip": 0.0115911, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.05129409, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5981192604624526, + "language_loss": 0.77540123, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79762381, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2474, + "time_per_iteration": 2.453432083129883 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.04955983, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8103491271764227, + "language_loss": 0.82315612, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84531218, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0859375, + "step": 2475, + "time_per_iteration": 2.4591174125671387 + }, + { + "auxiliary_loss_clip": 0.01157844, + "auxiliary_loss_mlp": 0.01058234, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.05399168, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9240192853863896, + "language_loss": 0.80811602, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0390625, + "step": 2476, + "time_per_iteration": 3.810553789138794 + }, + { + "auxiliary_loss_clip": 0.01148934, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.03467607, + "balance_loss_mlp": 1.05016851, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.8396010916090604, + "language_loss": 0.77889222, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80091178, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98828125, + "step": 2477, + "time_per_iteration": 4.031312942504883 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.01018076, + "balance_loss_clip": 1.01581085, + "balance_loss_mlp": 1.01302671, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8050876444063699, + "language_loss": 0.60130364, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197196, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.35742188, + "step": 2478, + "time_per_iteration": 3.1073787212371826 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.05078554, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.232556799389181, + "language_loss": 0.70951897, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7315169, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2479, + "time_per_iteration": 2.475215435028076 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.0313679, + "balance_loss_mlp": 1.04886127, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5896653051626852, + "language_loss": 0.80748487, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2480, + "time_per_iteration": 2.4618492126464844 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.02838397, + "balance_loss_mlp": 1.05017209, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.4101793906634894, + "language_loss": 0.84132183, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2481, + "time_per_iteration": 2.437391519546509 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03046227, + "balance_loss_mlp": 1.04808569, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 3.194199563979109, + "language_loss": 0.77347398, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79551256, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.046875, + "step": 2482, + "time_per_iteration": 2.4710068702697754 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01939583, + "balance_loss_mlp": 1.05186439, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.968394626295353, + "language_loss": 0.78719991, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80922014, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2483, + "time_per_iteration": 2.5075182914733887 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.04774714, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.642113570978582, + "language_loss": 0.70521605, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72709513, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 1.0, + "step": 2484, + "time_per_iteration": 2.4810657501220703 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02049971, + "balance_loss_mlp": 1.04769683, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.5518326423103654, + "language_loss": 0.84396368, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86592442, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0703125, + "step": 2485, + "time_per_iteration": 2.47004771232605 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.04906201, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1854599778658663, + "language_loss": 0.84902173, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87102306, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2486, + "time_per_iteration": 2.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.04672825, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.4579579723442855, + "language_loss": 0.74329305, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76516318, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 1.015625, + "step": 2487, + "time_per_iteration": 2.436316967010498 + }, + { + "auxiliary_loss_clip": 0.01148703, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.02934861, + "balance_loss_mlp": 1.04707325, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.1423480103066375, + "language_loss": 0.71837348, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74034101, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2488, + "time_per_iteration": 2.649794816970825 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02780962, + "balance_loss_mlp": 1.04946375, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.5167610907777513, + "language_loss": 0.70519507, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72722483, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0703125, + "step": 2489, + "time_per_iteration": 2.416708469390869 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.02637911, + "balance_loss_mlp": 1.04785299, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 6.063777716142612, + "language_loss": 0.81789696, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83988589, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2490, + "time_per_iteration": 2.433284282684326 + }, + { + "auxiliary_loss_clip": 0.0115747, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.02357852, + "balance_loss_mlp": 1.05097246, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.781748843431282, + "language_loss": 0.79878485, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82078111, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2491, + "time_per_iteration": 2.616642475128174 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.02491403, + "balance_loss_mlp": 1.04683256, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.263792295832721, + "language_loss": 0.90779251, + "learning_rate": 3.851260581551727e-06, + "loss": 0.9297986, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.078125, + "step": 2492, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.04883122, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.7210225604175116, + "language_loss": 0.79162109, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8137427, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2493, + "time_per_iteration": 2.4228014945983887 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02299631, + "balance_loss_mlp": 1.04643607, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.8095511996528297, + "language_loss": 0.80186284, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82380015, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2494, + "time_per_iteration": 2.4774162769317627 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.04731214, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9697458415941205, + "language_loss": 0.65825832, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68021536, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.015625, + "step": 2495, + "time_per_iteration": 2.87758207321167 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 0.99992257, + "balance_loss_mlp": 1.01668406, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 1.1924806916138095, + "language_loss": 0.59488082, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61543506, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2496, + "time_per_iteration": 3.0807061195373535 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0468092, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.296903755979897, + "language_loss": 0.65457296, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0546875, + "step": 2497, + "time_per_iteration": 2.4403655529022217 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.03021121, + "balance_loss_mlp": 1.05125117, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4500790349521295, + "language_loss": 0.75247943, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77452457, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2498, + "time_per_iteration": 2.5286927223205566 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04910398, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.1627878003877257, + "language_loss": 0.72073609, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74272656, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2499, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.03001857, + "balance_loss_mlp": 1.04765654, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.7935878764928508, + "language_loss": 0.7195605, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74158442, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2500, + "time_per_iteration": 2.5504300594329834 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.04960001, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.491284008551419, + "language_loss": 0.64973354, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67184103, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.03125, + "step": 2501, + "time_per_iteration": 2.587292432785034 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03940582, + "balance_loss_mlp": 1.04861319, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0240839018319, + "language_loss": 0.83043593, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85256565, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2502, + "time_per_iteration": 2.470350980758667 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01050766, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.04702473, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.3174234065433597, + "language_loss": 0.77197748, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2503, + "time_per_iteration": 2.6598432064056396 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.04901898, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1270494317377007, + "language_loss": 0.85432625, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87628305, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2504, + "time_per_iteration": 2.7323355674743652 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04855871, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6383963769174188, + "language_loss": 0.83226919, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85418344, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.015625, + "step": 2505, + "time_per_iteration": 2.4866323471069336 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.04672468, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.268670074130615, + "language_loss": 0.7639147, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78588635, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0390625, + "step": 2506, + "time_per_iteration": 2.4266390800476074 + }, + { + "auxiliary_loss_clip": 0.01156061, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.04987144, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 4.189374997051622, + "language_loss": 0.76202261, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78401417, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2507, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.02519584, + "balance_loss_mlp": 1.04538798, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.4120052182021503, + "language_loss": 0.69041586, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71230054, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2508, + "time_per_iteration": 2.4462738037109375 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.02870142, + "balance_loss_mlp": 1.05190873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8904486830015208, + "language_loss": 0.77516425, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79719174, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2509, + "time_per_iteration": 2.47723126411438 + }, + { + "auxiliary_loss_clip": 0.01160822, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.0307281, + "balance_loss_mlp": 1.05027628, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.607083522867767, + "language_loss": 0.80497003, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82710105, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1015625, + "step": 2510, + "time_per_iteration": 2.4445176124572754 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.0336144, + "balance_loss_mlp": 1.05078745, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.033214689307001, + "language_loss": 0.73913604, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76124156, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2511, + "time_per_iteration": 2.4372222423553467 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.04880548, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.077792778828972, + "language_loss": 0.6935091, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71543926, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.03125, + "step": 2512, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01154623, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.05130434, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 3.0703205269170364, + "language_loss": 0.73833334, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76034975, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.03125, + "step": 2513, + "time_per_iteration": 2.5560262203216553 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 0.99995023, + "balance_loss_mlp": 1.01588845, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8742342414591, + "language_loss": 0.64759278, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6681329, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.36328125, + "step": 2514, + "time_per_iteration": 3.0147135257720947 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.02588964, + "balance_loss_mlp": 1.04910421, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.6951033245551597, + "language_loss": 0.73257691, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75452447, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2515, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04967082, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8637331039353218, + "language_loss": 0.76990104, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79184443, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2516, + "time_per_iteration": 2.4672725200653076 + }, + { + "auxiliary_loss_clip": 0.01049641, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.01351547, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.745436195681612, + "language_loss": 0.54673135, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56726485, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36132812, + "step": 2517, + "time_per_iteration": 3.0677855014801025 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02126849, + "balance_loss_mlp": 1.04780149, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.2326216563166983, + "language_loss": 0.78515786, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8070842, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2518, + "time_per_iteration": 3.8305110931396484 + }, + { + "auxiliary_loss_clip": 0.01159011, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.05163026, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1364726943924772, + "language_loss": 0.70153689, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72361219, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2519, + "time_per_iteration": 3.9920616149902344 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.04812384, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9802508383478334, + "language_loss": 0.79219216, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81415105, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2520, + "time_per_iteration": 2.4853925704956055 + }, + { + "auxiliary_loss_clip": 0.01155647, + "auxiliary_loss_mlp": 0.01050752, + "balance_loss_clip": 1.03090394, + "balance_loss_mlp": 1.05067897, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.075013959426641, + "language_loss": 0.74324691, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76531088, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2521, + "time_per_iteration": 2.6154706478118896 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.05273759, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.7623729867934737, + "language_loss": 0.81996739, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84203184, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.078125, + "step": 2522, + "time_per_iteration": 2.4873530864715576 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01001752, + "balance_loss_clip": 0.99982071, + "balance_loss_mlp": 1.01252866, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.824359498034346, + "language_loss": 0.57915509, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59966022, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36328125, + "step": 2523, + "time_per_iteration": 2.998990774154663 + }, + { + "auxiliary_loss_clip": 0.01153336, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.03147376, + "balance_loss_mlp": 1.04972816, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.970015434384356, + "language_loss": 0.7485956, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77063495, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2524, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.0115237, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.0488894, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8388163356316347, + "language_loss": 0.74780655, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76977956, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2525, + "time_per_iteration": 2.431143283843994 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.02432156, + "balance_loss_mlp": 1.05145812, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8962457769996104, + "language_loss": 0.79644465, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81845224, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2526, + "time_per_iteration": 2.5167391300201416 + }, + { + "auxiliary_loss_clip": 0.01151222, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.0271492, + "balance_loss_mlp": 1.05228162, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8025865198757494, + "language_loss": 0.84928662, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87124068, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9921875, + "step": 2527, + "time_per_iteration": 2.4550719261169434 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02321947, + "balance_loss_mlp": 1.04876995, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2810224367730156, + "language_loss": 0.69326001, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71518755, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.03125, + "step": 2528, + "time_per_iteration": 2.610042095184326 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0288415, + "balance_loss_mlp": 1.05137038, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.490892546855648, + "language_loss": 0.86502308, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88703495, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2529, + "time_per_iteration": 2.4695634841918945 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.04683101, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.8772276619965056, + "language_loss": 0.83002013, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85188091, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2530, + "time_per_iteration": 2.476238489151001 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.02550209, + "balance_loss_mlp": 1.04987955, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.344030506991615, + "language_loss": 0.80540878, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82738853, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2531, + "time_per_iteration": 2.443617105484009 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.03138137, + "balance_loss_mlp": 1.04895151, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0816362099746017, + "language_loss": 0.79241651, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81440473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.0078125, + "step": 2532, + "time_per_iteration": 2.5071239471435547 + }, + { + "auxiliary_loss_clip": 0.0115001, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.02694106, + "balance_loss_mlp": 1.04952455, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.8298502444413876, + "language_loss": 0.87712961, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89909488, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2533, + "time_per_iteration": 2.5262463092803955 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02932572, + "balance_loss_mlp": 1.04766071, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.2606742211331556, + "language_loss": 0.79057097, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81255192, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.03125, + "step": 2534, + "time_per_iteration": 2.4421815872192383 + }, + { + "auxiliary_loss_clip": 0.01156406, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.02177238, + "balance_loss_mlp": 1.04847312, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.8989864742133933, + "language_loss": 0.76862979, + "learning_rate": 3.844858260274702e-06, + "loss": 0.7906096, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2535, + "time_per_iteration": 2.4193530082702637 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02153718, + "balance_loss_mlp": 1.04885459, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.234687708038525, + "language_loss": 0.78185135, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80381751, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0703125, + "step": 2536, + "time_per_iteration": 2.478066921234131 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.03305459, + "balance_loss_mlp": 1.05067229, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.124557148089124, + "language_loss": 0.74979979, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77181387, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2537, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01152934, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02545929, + "balance_loss_mlp": 1.04965043, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.005826380833244, + "language_loss": 0.77631724, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79828459, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2538, + "time_per_iteration": 2.527730941772461 + }, + { + "auxiliary_loss_clip": 0.01147714, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.02308786, + "balance_loss_mlp": 1.04806781, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6961003069906246, + "language_loss": 0.89707708, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9189558, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.99609375, + "step": 2539, + "time_per_iteration": 2.485410451889038 + }, + { + "auxiliary_loss_clip": 0.01151642, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.05028892, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1834515010765627, + "language_loss": 0.93514961, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95709753, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.015625, + "step": 2540, + "time_per_iteration": 2.5399627685546875 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0266571, + "balance_loss_mlp": 1.04625463, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.9271166035098393, + "language_loss": 0.75039941, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77228808, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2541, + "time_per_iteration": 2.516559362411499 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.03025603, + "balance_loss_mlp": 1.04787207, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7480154890803248, + "language_loss": 0.81308234, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83504558, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.99609375, + "step": 2542, + "time_per_iteration": 2.4681694507598877 + }, + { + "auxiliary_loss_clip": 0.01150381, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.03213799, + "balance_loss_mlp": 1.04772067, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.009812895323552, + "language_loss": 0.77568293, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79769456, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2543, + "time_per_iteration": 2.4899120330810547 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.04692626, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.3128696364379935, + "language_loss": 0.86483204, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675725, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2544, + "time_per_iteration": 2.4774844646453857 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.03287029, + "balance_loss_mlp": 1.04675508, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0966566192890106, + "language_loss": 0.8228749, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84493077, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0546875, + "step": 2545, + "time_per_iteration": 2.4526925086975098 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.04802954, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.540509049886226, + "language_loss": 0.70711339, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72905338, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2546, + "time_per_iteration": 2.5009732246398926 + }, + { + "auxiliary_loss_clip": 0.01152354, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.02423596, + "balance_loss_mlp": 1.04967904, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5770850469719229, + "language_loss": 0.77521312, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2547, + "time_per_iteration": 2.6822421550750732 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.01047861, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 1.04904902, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0900989153424976, + "language_loss": 0.73985445, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76185566, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2548, + "time_per_iteration": 2.59080171585083 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03158915, + "balance_loss_mlp": 1.04806828, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.499185349529517, + "language_loss": 0.80589813, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82791066, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2549, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01048572, + "balance_loss_clip": 1.02951026, + "balance_loss_mlp": 1.04750037, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.687491024735964, + "language_loss": 0.74760693, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76959932, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2550, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01153822, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.02035773, + "balance_loss_mlp": 1.04903841, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.205632522725416, + "language_loss": 0.76839805, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79033309, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2551, + "time_per_iteration": 2.468886375427246 + }, + { + "auxiliary_loss_clip": 0.01045401, + "auxiliary_loss_mlp": 0.01020401, + "balance_loss_clip": 1.01873255, + "balance_loss_mlp": 1.0102303, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9464853846906186, + "language_loss": 0.56666422, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58732224, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.3515625, + "step": 2552, + "time_per_iteration": 3.0059380531311035 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.04793155, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.2490122092843947, + "language_loss": 0.88505352, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703511, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2553, + "time_per_iteration": 2.4523322582244873 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.02858269, + "balance_loss_mlp": 1.04771137, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8003580088176259, + "language_loss": 0.78462374, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80663538, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2554, + "time_per_iteration": 2.48526668548584 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03212881, + "balance_loss_mlp": 1.04941773, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.4926146542113763, + "language_loss": 0.78344929, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80551672, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2555, + "time_per_iteration": 2.4687228202819824 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053536, + "balance_loss_clip": 1.03543973, + "balance_loss_mlp": 1.04890609, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6634961059278193, + "language_loss": 0.76901627, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.7910428, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2556, + "time_per_iteration": 2.5006635189056396 + }, + { + "auxiliary_loss_clip": 0.01145988, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.02362633, + "balance_loss_mlp": 1.04657805, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8623555031997667, + "language_loss": 0.89489496, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9167788, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2557, + "time_per_iteration": 2.4434657096862793 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.0263536, + "balance_loss_mlp": 1.04834831, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.0348980361104587, + "language_loss": 0.7119934, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73397368, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2558, + "time_per_iteration": 2.490226984024048 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.04888546, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2935483083292705, + "language_loss": 0.92370701, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94570613, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2559, + "time_per_iteration": 3.885131597518921 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_clip": 1.03331971, + "balance_loss_mlp": 1.05068171, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 5.140445938018919, + "language_loss": 0.63637704, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65846419, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2560, + "time_per_iteration": 5.343889236450195 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02725124, + "balance_loss_mlp": 1.04950392, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8613162525264346, + "language_loss": 0.88230681, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90431374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2561, + "time_per_iteration": 2.4648611545562744 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.02433765, + "balance_loss_mlp": 1.0477581, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8458305826175445, + "language_loss": 0.82909077, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85096323, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 2562, + "time_per_iteration": 2.4327874183654785 + }, + { + "auxiliary_loss_clip": 0.01160792, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.02614117, + "balance_loss_mlp": 1.05274105, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8513620412223286, + "language_loss": 0.74713194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7692166, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.078125, + "step": 2563, + "time_per_iteration": 2.4246435165405273 + }, + { + "auxiliary_loss_clip": 0.01152598, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.04708791, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 4.308351588789828, + "language_loss": 0.75896233, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2564, + "time_per_iteration": 2.5528018474578857 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.03233564, + "balance_loss_mlp": 1.04782677, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.9915177170702032, + "language_loss": 0.70825899, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73026133, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2565, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.04728019, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.308308002927142, + "language_loss": 0.71535969, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73736489, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0234375, + "step": 2566, + "time_per_iteration": 2.498033285140991 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.02379811, + "balance_loss_mlp": 1.04381752, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7584763964610812, + "language_loss": 0.85129261, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87315124, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0, + "step": 2567, + "time_per_iteration": 2.46708083152771 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.0491097, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.4904852760766127, + "language_loss": 0.78025472, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80226958, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2568, + "time_per_iteration": 2.476029634475708 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.01974905, + "balance_loss_mlp": 1.04835856, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.967048361077992, + "language_loss": 0.70183134, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72373807, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2569, + "time_per_iteration": 2.4566383361816406 + }, + { + "auxiliary_loss_clip": 0.011445, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.04563344, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7954711420319855, + "language_loss": 0.76502788, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78690279, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2570, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.04811645, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 7.2402617485583525, + "language_loss": 0.77214551, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7940833, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2571, + "time_per_iteration": 2.4532222747802734 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.04835165, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.949392721600437, + "language_loss": 0.82254899, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84454399, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2572, + "time_per_iteration": 2.4919703006744385 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.02838445, + "balance_loss_mlp": 1.04827368, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.621727953381826, + "language_loss": 0.90506172, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92705798, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2573, + "time_per_iteration": 2.4679911136627197 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.03724563, + "balance_loss_mlp": 1.04919529, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7899098306423509, + "language_loss": 0.70378339, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72587025, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2574, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01150284, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 1.04641008, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.761755301023602, + "language_loss": 0.82718939, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84917951, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 2575, + "time_per_iteration": 2.4515788555145264 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.0456214, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.21774000772259, + "language_loss": 0.84661531, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86859256, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2576, + "time_per_iteration": 2.5052003860473633 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02414751, + "balance_loss_mlp": 1.04679108, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.816317520286285, + "language_loss": 0.81942815, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84135079, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2577, + "time_per_iteration": 2.5133895874023438 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.04980743, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.384736720709717, + "language_loss": 0.76260924, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78462768, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2578, + "time_per_iteration": 2.5140793323516846 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.02630556, + "balance_loss_mlp": 1.04832911, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.651100693067537, + "language_loss": 0.82420707, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84617954, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2579, + "time_per_iteration": 2.4410548210144043 + }, + { + "auxiliary_loss_clip": 0.01152359, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.03056741, + "balance_loss_mlp": 1.05137682, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6356270056083286, + "language_loss": 0.80460835, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2580, + "time_per_iteration": 2.457929849624634 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01014696, + "balance_loss_clip": 1.0128479, + "balance_loss_mlp": 1.01473403, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.84873853717235, + "language_loss": 0.58840239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60905427, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.35742188, + "step": 2581, + "time_per_iteration": 3.1725480556488037 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02615237, + "balance_loss_mlp": 1.04869819, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8637973548327127, + "language_loss": 0.85214508, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87412429, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2582, + "time_per_iteration": 2.486454963684082 + }, + { + "auxiliary_loss_clip": 0.01150766, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.04837251, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.457099081417407, + "language_loss": 0.78432047, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80638009, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0234375, + "step": 2583, + "time_per_iteration": 2.468686580657959 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.04853427, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6727812592242826, + "language_loss": 0.76121294, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78327382, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2584, + "time_per_iteration": 2.5471444129943848 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02746594, + "balance_loss_mlp": 1.04740906, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0194610159936324, + "language_loss": 0.75623107, + "learning_rate": 3.837251082205368e-06, + "loss": 0.7781868, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2585, + "time_per_iteration": 2.4448020458221436 + }, + { + "auxiliary_loss_clip": 0.01146182, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03101528, + "balance_loss_mlp": 1.04662418, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.233481730992117, + "language_loss": 0.611651, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63361114, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2586, + "time_per_iteration": 2.4375994205474854 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02814651, + "balance_loss_mlp": 1.04623449, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8194244944539537, + "language_loss": 0.8108865, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83286583, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.046875, + "step": 2587, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.04851258, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.8978014455674168, + "language_loss": 0.88844347, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91058075, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.0625, + "step": 2588, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01150101, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.03351235, + "balance_loss_mlp": 1.04859662, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.6765596364055266, + "language_loss": 0.64950025, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6715408, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.015625, + "step": 2589, + "time_per_iteration": 2.5106732845306396 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.0483036, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7614316666112095, + "language_loss": 0.82610166, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84805739, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2590, + "time_per_iteration": 2.519573211669922 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.02761662, + "balance_loss_mlp": 1.04740536, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1478399705358195, + "language_loss": 0.78919029, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81117558, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2591, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01051358, + "balance_loss_clip": 1.03271413, + "balance_loss_mlp": 1.04902434, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 1.9877262596002243, + "language_loss": 0.64780253, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66981632, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2592, + "time_per_iteration": 2.5992095470428467 + }, + { + "auxiliary_loss_clip": 0.01156577, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.03195322, + "balance_loss_mlp": 1.0518856, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.6077304694487062, + "language_loss": 0.81806099, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84015012, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2593, + "time_per_iteration": 2.4317471981048584 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02876306, + "balance_loss_mlp": 1.04862404, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.3131099691306445, + "language_loss": 0.72585857, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7478416, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0, + "step": 2594, + "time_per_iteration": 2.454946994781494 + }, + { + "auxiliary_loss_clip": 0.01145676, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.02514088, + "balance_loss_mlp": 1.0476191, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 1.980280068020953, + "language_loss": 0.8170377, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83893895, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 2595, + "time_per_iteration": 2.4859232902526855 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.04722846, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.3729637830877177, + "language_loss": 0.86587811, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88784146, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2596, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.02558839, + "balance_loss_mlp": 1.04831815, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6624104890405602, + "language_loss": 0.68610018, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70800316, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2597, + "time_per_iteration": 2.447265625 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.02210891, + "balance_loss_mlp": 1.04714298, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.638980754682227, + "language_loss": 0.79885375, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82070029, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2598, + "time_per_iteration": 2.4641571044921875 + }, + { + "auxiliary_loss_clip": 0.01141262, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02003777, + "balance_loss_mlp": 1.04484367, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.19687533686526, + "language_loss": 0.82877028, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85057342, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96484375, + "step": 2599, + "time_per_iteration": 2.419464111328125 + }, + { + "auxiliary_loss_clip": 0.01155461, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.04991198, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.412785735027946, + "language_loss": 0.81813747, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84021574, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2600, + "time_per_iteration": 2.408848524093628 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.02813435, + "balance_loss_mlp": 1.05145574, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8570517134994367, + "language_loss": 0.8869983, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90900552, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2601, + "time_per_iteration": 3.8960022926330566 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.04250216, + "balance_loss_mlp": 1.05294669, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6572791804428935, + "language_loss": 0.78657669, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80877781, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0234375, + "step": 2602, + "time_per_iteration": 5.330498456954956 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.02178836, + "balance_loss_mlp": 1.04872918, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.9481072701353659, + "language_loss": 0.73668396, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75858229, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.99609375, + "step": 2603, + "time_per_iteration": 2.4632985591888428 + }, + { + "auxiliary_loss_clip": 0.01152236, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.03205693, + "balance_loss_mlp": 1.05066442, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.4624008692922583, + "language_loss": 0.87223339, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89427507, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2604, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.02404523, + "balance_loss_mlp": 1.04892218, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.883819023069068, + "language_loss": 0.85465723, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87660539, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2605, + "time_per_iteration": 2.4958839416503906 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0274334, + "balance_loss_mlp": 1.04840827, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.4518366617864897, + "language_loss": 0.72954321, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75154853, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2606, + "time_per_iteration": 2.5142898559570312 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.05257165, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.9820673877795116, + "language_loss": 0.7643044, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78635812, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2607, + "time_per_iteration": 2.433779239654541 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0282656, + "balance_loss_mlp": 1.05097091, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7850270515341367, + "language_loss": 0.8191157, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8410849, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2608, + "time_per_iteration": 2.4599456787109375 + }, + { + "auxiliary_loss_clip": 0.0115477, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.05087662, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.762197880640894, + "language_loss": 0.72479111, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74684954, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0390625, + "step": 2609, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.02415729, + "balance_loss_mlp": 1.04881263, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8833233307981396, + "language_loss": 0.71974212, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74171209, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.046875, + "step": 2610, + "time_per_iteration": 2.468616247177124 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03119481, + "balance_loss_mlp": 1.04865789, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0486839750324117, + "language_loss": 0.72148776, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2611, + "time_per_iteration": 2.4812967777252197 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.05081797, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1526303920645153, + "language_loss": 0.70732605, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72930443, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2612, + "time_per_iteration": 2.4659905433654785 + }, + { + "auxiliary_loss_clip": 0.0115345, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.05112672, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.98698506128839, + "language_loss": 0.75649011, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77856034, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2613, + "time_per_iteration": 2.5053935050964355 + }, + { + "auxiliary_loss_clip": 0.01150247, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.03454411, + "balance_loss_mlp": 1.04870725, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7256548803860323, + "language_loss": 0.6593504, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68139917, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2614, + "time_per_iteration": 2.49568772315979 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01050381, + "balance_loss_clip": 1.02972233, + "balance_loss_mlp": 1.04979289, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1509467282749055, + "language_loss": 0.7554003, + "learning_rate": 3.832603126688072e-06, + "loss": 0.7774539, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0546875, + "step": 2615, + "time_per_iteration": 2.529383420944214 + }, + { + "auxiliary_loss_clip": 0.0115204, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.03374028, + "balance_loss_mlp": 1.05295634, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.616950748432624, + "language_loss": 0.72989607, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75194162, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9921875, + "step": 2616, + "time_per_iteration": 2.5096960067749023 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.03453839, + "balance_loss_mlp": 1.04991412, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.5663633553154774, + "language_loss": 0.72316766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74524403, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2617, + "time_per_iteration": 2.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01151577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.02453637, + "balance_loss_mlp": 1.05169988, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.0296559288157563, + "language_loss": 0.74336463, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76531827, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2618, + "time_per_iteration": 2.4584109783172607 + }, + { + "auxiliary_loss_clip": 0.01156356, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02976644, + "balance_loss_mlp": 1.05079079, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.116136233608656, + "language_loss": 0.78624105, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80832201, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0546875, + "step": 2619, + "time_per_iteration": 2.481902837753296 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.05213726, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.705564128099723, + "language_loss": 0.76632881, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78837597, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2620, + "time_per_iteration": 2.432645082473755 + }, + { + "auxiliary_loss_clip": 0.01153614, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.02901375, + "balance_loss_mlp": 1.05096626, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7942321132139696, + "language_loss": 0.70836174, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73039794, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2621, + "time_per_iteration": 2.5259244441986084 + }, + { + "auxiliary_loss_clip": 0.01156472, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.0277524, + "balance_loss_mlp": 1.05222857, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.5825564073202467, + "language_loss": 0.71880406, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74086076, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2622, + "time_per_iteration": 2.738351583480835 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02826762, + "balance_loss_mlp": 1.05162704, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7275011876813262, + "language_loss": 0.87603116, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89804244, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2623, + "time_per_iteration": 2.439276695251465 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.05301619, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7488793041913886, + "language_loss": 0.82132548, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84332693, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0078125, + "step": 2624, + "time_per_iteration": 2.5011823177337646 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.02720022, + "balance_loss_mlp": 1.0518285, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.213311097116894, + "language_loss": 0.79965818, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82170242, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2625, + "time_per_iteration": 2.469705581665039 + }, + { + "auxiliary_loss_clip": 0.01152837, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.03215635, + "balance_loss_mlp": 1.05189955, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0497226184185044, + "language_loss": 0.80393386, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82597172, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2626, + "time_per_iteration": 2.4822630882263184 + }, + { + "auxiliary_loss_clip": 0.01157567, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.02703679, + "balance_loss_mlp": 1.05660009, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8439314798963051, + "language_loss": 0.73819017, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76023501, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0078125, + "step": 2627, + "time_per_iteration": 2.5146384239196777 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01055765, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.05136025, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.581375347872909, + "language_loss": 0.84926289, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87135696, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0234375, + "step": 2628, + "time_per_iteration": 2.476461172103882 + }, + { + "auxiliary_loss_clip": 0.01152526, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02045107, + "balance_loss_mlp": 1.05181646, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9330212081502065, + "language_loss": 0.76414472, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2629, + "time_per_iteration": 2.4604575634002686 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03277516, + "balance_loss_mlp": 1.05376625, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.3335878107949624, + "language_loss": 0.73786485, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7599746, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0390625, + "step": 2630, + "time_per_iteration": 2.4556961059570312 + }, + { + "auxiliary_loss_clip": 0.01159154, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02934527, + "balance_loss_mlp": 1.05278432, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.0799062126580385, + "language_loss": 0.83732498, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85941184, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2631, + "time_per_iteration": 2.46466326713562 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 1.05072045, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8231521117013414, + "language_loss": 0.78509778, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80711424, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2632, + "time_per_iteration": 2.4678170680999756 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01058182, + "balance_loss_clip": 1.03766572, + "balance_loss_mlp": 1.05516291, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.1429957658458374, + "language_loss": 0.83250827, + "learning_rate": 3.829784322464594e-06, + "loss": 0.8546921, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2633, + "time_per_iteration": 2.4329495429992676 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.02641928, + "balance_loss_mlp": 1.05591452, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.9651575849984717, + "language_loss": 0.77401066, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79609084, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2634, + "time_per_iteration": 2.4989452362060547 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.05281138, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.178604932363088, + "language_loss": 0.89144027, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91352272, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0546875, + "step": 2635, + "time_per_iteration": 2.45926570892334 + }, + { + "auxiliary_loss_clip": 0.0115666, + "auxiliary_loss_mlp": 0.0105615, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.05145168, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.07071202721755, + "language_loss": 0.75814605, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78027415, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2636, + "time_per_iteration": 2.4601919651031494 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.05383635, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.192817266182781, + "language_loss": 0.72065628, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74272561, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.0625, + "step": 2637, + "time_per_iteration": 2.6509416103363037 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.05307317, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.9644709833035638, + "language_loss": 0.77938193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80135739, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2638, + "time_per_iteration": 2.516597032546997 + }, + { + "auxiliary_loss_clip": 0.01160159, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.05348861, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.8473853011869859, + "language_loss": 0.75521988, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77744359, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0625, + "step": 2639, + "time_per_iteration": 2.5517024993896484 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04115009, + "balance_loss_mlp": 1.0541048, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.7935559917311212, + "language_loss": 0.81487972, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83708692, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0546875, + "step": 2640, + "time_per_iteration": 2.5613112449645996 + }, + { + "auxiliary_loss_clip": 0.01152653, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.05107331, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4887809421561018, + "language_loss": 0.67051661, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69255233, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2641, + "time_per_iteration": 2.5603220462799072 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01057677, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.05338526, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.894977763056953, + "language_loss": 0.7508198, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77302957, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2642, + "time_per_iteration": 2.4783003330230713 + }, + { + "auxiliary_loss_clip": 0.01154514, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.0343703, + "balance_loss_mlp": 1.05342579, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1233146618452046, + "language_loss": 0.70096999, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72305882, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2643, + "time_per_iteration": 3.8417530059814453 + }, + { + "auxiliary_loss_clip": 0.0115474, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.05399418, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.266510625665779, + "language_loss": 0.78172421, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80374151, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2644, + "time_per_iteration": 3.918332099914551 + }, + { + "auxiliary_loss_clip": 0.01155626, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.03432608, + "balance_loss_mlp": 1.05189228, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8745538844001242, + "language_loss": 0.82203078, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84413457, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2645, + "time_per_iteration": 2.484264373779297 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01055562, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.05192447, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.974907168100252, + "language_loss": 0.69778836, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71991032, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2646, + "time_per_iteration": 2.5406665802001953 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.05206954, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5066454352116914, + "language_loss": 0.62659109, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64856541, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 2647, + "time_per_iteration": 2.442711353302002 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03226066, + "balance_loss_mlp": 1.05410099, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.1253745247586204, + "language_loss": 0.8942067, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91628385, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2648, + "time_per_iteration": 2.4649319648742676 + }, + { + "auxiliary_loss_clip": 0.01152722, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.05391204, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8651001097947648, + "language_loss": 0.91716385, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93918669, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 2649, + "time_per_iteration": 2.451261520385742 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.02802217, + "balance_loss_mlp": 1.05272281, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3226984417644028, + "language_loss": 0.71273595, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73485881, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1015625, + "step": 2650, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01153823, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.02695203, + "balance_loss_mlp": 1.05372715, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.4247432930640898, + "language_loss": 0.71116996, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73315561, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0, + "step": 2651, + "time_per_iteration": 2.467451572418213 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02912855, + "balance_loss_mlp": 1.0513978, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.909821572556346, + "language_loss": 0.7997523, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82179999, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2652, + "time_per_iteration": 2.519624948501587 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02665794, + "balance_loss_mlp": 1.05385149, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.695147262103697, + "language_loss": 0.70050812, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72250587, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2653, + "time_per_iteration": 2.439445972442627 + }, + { + "auxiliary_loss_clip": 0.01154814, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02309155, + "balance_loss_mlp": 1.05308652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.046273350718209, + "language_loss": 0.76509416, + "learning_rate": 3.826467306608095e-06, + "loss": 0.7870729, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2654, + "time_per_iteration": 2.529644012451172 + }, + { + "auxiliary_loss_clip": 0.01154147, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02750051, + "balance_loss_mlp": 1.0526185, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.961582700797155, + "language_loss": 0.8208828, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84289569, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2655, + "time_per_iteration": 2.4841158390045166 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.05125904, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.077546195878165, + "language_loss": 0.73565602, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75770259, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2656, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02766216, + "balance_loss_mlp": 1.05170095, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.884771930829773, + "language_loss": 0.77508467, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79704326, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2657, + "time_per_iteration": 2.801560401916504 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01048143, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.05459499, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6493844029380673, + "language_loss": 0.74807733, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77010089, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.99609375, + "step": 2658, + "time_per_iteration": 2.4434328079223633 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02912521, + "balance_loss_mlp": 1.05291355, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.8153435843839463, + "language_loss": 0.75194407, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77400887, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2659, + "time_per_iteration": 2.587700366973877 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.03295422, + "balance_loss_mlp": 1.05531979, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.4521775760186526, + "language_loss": 0.90417045, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92629218, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2660, + "time_per_iteration": 2.45237398147583 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.0300889, + "balance_loss_mlp": 1.05822825, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.0123178843036373, + "language_loss": 0.77552611, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79764044, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2661, + "time_per_iteration": 2.574652910232544 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_clip": 1.02913153, + "balance_loss_mlp": 1.05460262, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7348749157972516, + "language_loss": 0.74735796, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76943737, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2662, + "time_per_iteration": 2.506974935531616 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_clip": 1.03233898, + "balance_loss_mlp": 1.05416894, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.0770925688556074, + "language_loss": 0.82047677, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84257245, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2663, + "time_per_iteration": 2.459630012512207 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.02978826, + "balance_loss_mlp": 1.05576038, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5815812177362454, + "language_loss": 0.7910682, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81316602, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2664, + "time_per_iteration": 2.4978790283203125 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.8148985254226184, + "language_loss": 0.93767202, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95974499, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2665, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.02750635, + "balance_loss_mlp": 1.05352151, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.9534389472193405, + "language_loss": 0.85255575, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87460762, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.046875, + "step": 2666, + "time_per_iteration": 2.4229867458343506 + }, + { + "auxiliary_loss_clip": 0.01155877, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.02899504, + "balance_loss_mlp": 1.05404496, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.873987360542769, + "language_loss": 0.81461811, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83665401, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2667, + "time_per_iteration": 2.4989583492279053 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.03104627, + "balance_loss_mlp": 1.05707479, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.676276626789842, + "language_loss": 0.74079859, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76287973, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0078125, + "step": 2668, + "time_per_iteration": 2.463395357131958 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.05527806, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6382268793433732, + "language_loss": 0.77214229, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79424524, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2669, + "time_per_iteration": 2.5107781887054443 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01008943, + "balance_loss_clip": 1.0062964, + "balance_loss_mlp": 1.0249362, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072457077707946, + "language_loss": 0.55571371, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57640231, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.34960938, + "step": 2670, + "time_per_iteration": 2.964386463165283 + }, + { + "auxiliary_loss_clip": 0.01157188, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.05379438, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 8.31640977393562, + "language_loss": 0.77088535, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79289663, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2671, + "time_per_iteration": 2.4722845554351807 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.02684164, + "balance_loss_mlp": 1.05666459, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9636142117953166, + "language_loss": 0.64497644, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66702545, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2672, + "time_per_iteration": 2.5702145099639893 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.05270457, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.885579538712505, + "language_loss": 0.8533771, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87537158, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2673, + "time_per_iteration": 2.4754209518432617 + }, + { + "auxiliary_loss_clip": 0.01156938, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.03718424, + "balance_loss_mlp": 1.05537605, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.484212796080384, + "language_loss": 0.72797197, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75009739, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2674, + "time_per_iteration": 2.4771230220794678 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02745771, + "balance_loss_mlp": 1.05242229, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.0917218572710143, + "language_loss": 0.84550452, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86751789, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2675, + "time_per_iteration": 2.4583237171173096 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02890563, + "balance_loss_mlp": 1.0566349, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.979365293626276, + "language_loss": 0.82605797, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84813964, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0234375, + "step": 2676, + "time_per_iteration": 2.5966403484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.05701363, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.9372140801278581, + "language_loss": 0.73252106, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75459909, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2677, + "time_per_iteration": 2.459545135498047 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.05381799, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.4714871699848, + "language_loss": 0.76175338, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78375852, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2678, + "time_per_iteration": 2.6220550537109375 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.05157948, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.6939354956764687, + "language_loss": 0.70202518, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72405231, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2679, + "time_per_iteration": 2.580995559692383 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.02026391, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8161414687228778, + "language_loss": 0.51844025, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5392195, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.34375, + "step": 2680, + "time_per_iteration": 3.105682849884033 + }, + { + "auxiliary_loss_clip": 0.01155604, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02750874, + "balance_loss_mlp": 1.05157876, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.8335073832427007, + "language_loss": 0.80319828, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82523119, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2681, + "time_per_iteration": 2.4695565700531006 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01058971, + "balance_loss_clip": 1.04031444, + "balance_loss_mlp": 1.05258918, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8021457293712753, + "language_loss": 0.69142133, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71352148, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.984375, + "step": 2682, + "time_per_iteration": 2.5027854442596436 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.0559957, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8107912193408944, + "language_loss": 0.87568235, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89774084, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2683, + "time_per_iteration": 2.461944341659546 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.05452991, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.5824209574719035, + "language_loss": 0.74160969, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76372838, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2684, + "time_per_iteration": 4.005981206893921 + }, + { + "auxiliary_loss_clip": 0.01159701, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 1.05543995, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.919238603617177, + "language_loss": 0.70244128, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72452366, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2685, + "time_per_iteration": 5.387023448944092 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.0282284, + "balance_loss_mlp": 1.0518229, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8016019482814314, + "language_loss": 0.71518582, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73716336, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 2686, + "time_per_iteration": 2.5451064109802246 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.03191292, + "balance_loss_mlp": 1.05551481, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8631629169214377, + "language_loss": 0.81521869, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83730221, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2687, + "time_per_iteration": 2.4542620182037354 + }, + { + "auxiliary_loss_clip": 0.01155843, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.02327275, + "balance_loss_mlp": 1.04894984, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.8081463969498348, + "language_loss": 0.71823454, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74023592, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.0625, + "step": 2688, + "time_per_iteration": 2.493476152420044 + }, + { + "auxiliary_loss_clip": 0.0115191, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.05067098, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2392978206929555, + "language_loss": 0.76041406, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78239101, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.015625, + "step": 2689, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.05417943, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.9258973882551216, + "language_loss": 0.87260234, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89462292, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2690, + "time_per_iteration": 2.496943473815918 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05211663, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.562024048541713, + "language_loss": 0.87728393, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.89927632, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 2691, + "time_per_iteration": 2.510960817337036 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.02764988, + "balance_loss_mlp": 1.05021381, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.082856606872889, + "language_loss": 0.82327259, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84533525, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2692, + "time_per_iteration": 2.481032371520996 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02835155, + "balance_loss_mlp": 1.05069244, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.0763505181853454, + "language_loss": 0.80942917, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83149081, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2693, + "time_per_iteration": 2.493278980255127 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01054953, + "balance_loss_clip": 1.03670192, + "balance_loss_mlp": 1.05223358, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.7139740211881158, + "language_loss": 0.83639967, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85845578, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2694, + "time_per_iteration": 2.5051510334014893 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.0509156, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9608549080280004, + "language_loss": 0.69125426, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71329916, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0546875, + "step": 2695, + "time_per_iteration": 2.495098352432251 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01054922, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.05520689, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.466913217352614, + "language_loss": 0.82403111, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84617984, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2696, + "time_per_iteration": 2.484523296356201 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.05316591, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.9982919021229957, + "language_loss": 0.8852337, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90741605, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2697, + "time_per_iteration": 2.4806151390075684 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.01046149, + "balance_loss_clip": 1.02756453, + "balance_loss_mlp": 1.04989469, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4702975792509376, + "language_loss": 0.80172735, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82366014, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 2698, + "time_per_iteration": 2.532137393951416 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.02957439, + "balance_loss_mlp": 1.05167758, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5576448961090323, + "language_loss": 0.77258182, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79456997, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 2699, + "time_per_iteration": 2.514084577560425 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.0251497, + "balance_loss_mlp": 1.04891944, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.038036982956784, + "language_loss": 0.85697722, + "learning_rate": 3.81909481076994e-06, + "loss": 0.87891692, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2700, + "time_per_iteration": 2.4434289932250977 + }, + { + "auxiliary_loss_clip": 0.01147712, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.0247376, + "balance_loss_mlp": 1.04878318, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.6982179557795123, + "language_loss": 0.80378878, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82572436, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.98828125, + "step": 2701, + "time_per_iteration": 2.5267322063446045 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.02945244, + "balance_loss_mlp": 1.05514598, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.5999982166608073, + "language_loss": 0.73006868, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75212055, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2702, + "time_per_iteration": 2.44750714302063 + }, + { + "auxiliary_loss_clip": 0.01153204, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02986753, + "balance_loss_mlp": 1.05053687, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.5386207662450464, + "language_loss": 0.73164749, + "learning_rate": 3.81860891934076e-06, + "loss": 0.7536869, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0234375, + "step": 2703, + "time_per_iteration": 2.469242811203003 + }, + { + "auxiliary_loss_clip": 0.01150736, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.04765964, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9216464968932823, + "language_loss": 0.70681584, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72879231, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2704, + "time_per_iteration": 2.5236263275146484 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00091982, + "balance_loss_mlp": 1.01563144, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7797469934396678, + "language_loss": 0.53369009, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55422795, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.34765625, + "step": 2705, + "time_per_iteration": 3.0887868404388428 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.02873373, + "balance_loss_mlp": 1.05151534, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4525976943058896, + "language_loss": 0.75060308, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77264655, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2706, + "time_per_iteration": 2.439283847808838 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 1.05240536, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9153778871117788, + "language_loss": 0.7234174, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74547994, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2707, + "time_per_iteration": 2.51819109916687 + }, + { + "auxiliary_loss_clip": 0.01155215, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.05275822, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.869992791268662, + "language_loss": 0.83790398, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85995972, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2708, + "time_per_iteration": 2.4592010974884033 + }, + { + "auxiliary_loss_clip": 0.0115992, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.02768469, + "balance_loss_mlp": 1.05268705, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.162290718142945, + "language_loss": 0.86529553, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88738573, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2709, + "time_per_iteration": 2.4745054244995117 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01054439, + "balance_loss_clip": 1.0353297, + "balance_loss_mlp": 1.05096519, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.6782807127870958, + "language_loss": 0.91449893, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93659306, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2710, + "time_per_iteration": 2.4846651554107666 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.05447197, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 1.99410407833921, + "language_loss": 0.8129673, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83507168, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2711, + "time_per_iteration": 2.4878618717193604 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04737568, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7794575527068077, + "language_loss": 0.81605875, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83806038, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2712, + "time_per_iteration": 2.4479072093963623 + }, + { + "auxiliary_loss_clip": 0.01158025, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.03858864, + "balance_loss_mlp": 1.05211174, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.1959953506899774, + "language_loss": 0.76885653, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79102206, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2713, + "time_per_iteration": 2.493394374847412 + }, + { + "auxiliary_loss_clip": 0.01155185, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.05623782, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.081844956712308, + "language_loss": 0.78926778, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8114453, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 2714, + "time_per_iteration": 2.442214012145996 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01059199, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.05286288, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.259619309439112, + "language_loss": 0.78143466, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80357969, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2715, + "time_per_iteration": 2.499178409576416 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.04868412, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.0916631483814783, + "language_loss": 0.81397748, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8359617, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2716, + "time_per_iteration": 2.5004689693450928 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01057354, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.05482328, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8793848003912939, + "language_loss": 0.86203027, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88418794, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2717, + "time_per_iteration": 2.5112617015838623 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.05153894, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.652261986612604, + "language_loss": 0.76514149, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78711915, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2718, + "time_per_iteration": 2.549654245376587 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02696729, + "balance_loss_mlp": 1.05180717, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.080955072975882, + "language_loss": 0.73027492, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75229508, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2719, + "time_per_iteration": 2.4911599159240723 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.03084862, + "balance_loss_mlp": 1.0492239, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6610037254914274, + "language_loss": 0.72384167, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74585563, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2720, + "time_per_iteration": 2.4733760356903076 + }, + { + "auxiliary_loss_clip": 0.01150132, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.02789283, + "balance_loss_mlp": 1.05076206, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2797021453727893, + "language_loss": 0.75100243, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77298641, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9921875, + "step": 2721, + "time_per_iteration": 2.44942569732666 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02893853, + "balance_loss_mlp": 1.0502317, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.74959220753002, + "language_loss": 0.79254043, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81458461, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2722, + "time_per_iteration": 2.4775915145874023 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.05248678, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.0539311275727634, + "language_loss": 0.8477816, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86986339, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0625, + "step": 2723, + "time_per_iteration": 2.5084922313690186 + }, + { + "auxiliary_loss_clip": 0.01146914, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.0177772, + "balance_loss_mlp": 1.04940808, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0049787201865503, + "language_loss": 0.70883536, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73067659, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 2724, + "time_per_iteration": 2.5094263553619385 + }, + { + "auxiliary_loss_clip": 0.01150034, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.05113125, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.04326868324577, + "language_loss": 0.70914948, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73109186, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 2725, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02437937, + "balance_loss_mlp": 1.05219352, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9381311422505, + "language_loss": 0.8873682, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90929163, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2726, + "time_per_iteration": 3.983738660812378 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.05406547, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.8502717081228044, + "language_loss": 0.7439661, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76602715, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2727, + "time_per_iteration": 5.52494215965271 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.03429413, + "balance_loss_mlp": 1.05145037, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6814144838265654, + "language_loss": 0.82321334, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84523886, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9921875, + "step": 2728, + "time_per_iteration": 2.4621498584747314 + }, + { + "auxiliary_loss_clip": 0.01156146, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03131044, + "balance_loss_mlp": 1.05167341, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.4458707176630425, + "language_loss": 0.84766865, + "learning_rate": 3.814371879489633e-06, + "loss": 0.86973941, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0390625, + "step": 2729, + "time_per_iteration": 2.459495782852173 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.02661061, + "balance_loss_mlp": 1.04923487, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9327126112676087, + "language_loss": 0.72569054, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74764991, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2730, + "time_per_iteration": 2.451016902923584 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.03243709, + "balance_loss_mlp": 1.04862678, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.2141787283307854, + "language_loss": 0.74431163, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76637596, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.046875, + "step": 2731, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.0115844, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.02818894, + "balance_loss_mlp": 1.05408466, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.15833206643789, + "language_loss": 0.78783584, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.80990839, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2732, + "time_per_iteration": 2.44146728515625 + }, + { + "auxiliary_loss_clip": 0.01155842, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.05211556, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9937390498547816, + "language_loss": 0.68943298, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71150857, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0390625, + "step": 2733, + "time_per_iteration": 2.4981601238250732 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.02792621, + "balance_loss_mlp": 1.05054927, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.20018793155086, + "language_loss": 0.80626202, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8282572, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0078125, + "step": 2734, + "time_per_iteration": 2.495030641555786 + }, + { + "auxiliary_loss_clip": 0.01152713, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.03638041, + "balance_loss_mlp": 1.05143905, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 4.0691467716051175, + "language_loss": 0.82265377, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84474081, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2735, + "time_per_iteration": 2.5911896228790283 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.02556753, + "balance_loss_mlp": 1.05158913, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.5735103485950077, + "language_loss": 0.78697491, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80891526, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.984375, + "step": 2736, + "time_per_iteration": 2.4699559211730957 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.05231023, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.680513335410081, + "language_loss": 0.81409019, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83616614, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2737, + "time_per_iteration": 2.4892401695251465 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.02819467, + "balance_loss_mlp": 1.05107307, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8393773079816103, + "language_loss": 0.87291563, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89492232, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2738, + "time_per_iteration": 2.54569935798645 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.05139303, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.0122721864238438, + "language_loss": 0.72351867, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74562055, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2739, + "time_per_iteration": 2.5309460163116455 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02991343, + "balance_loss_mlp": 1.04766631, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.690107638621115, + "language_loss": 0.81735384, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8393271, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2740, + "time_per_iteration": 2.5005404949188232 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01053239, + "balance_loss_clip": 1.03176928, + "balance_loss_mlp": 1.05347896, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.8033984026588756, + "language_loss": 0.69098473, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0390625, + "step": 2741, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02588463, + "balance_loss_mlp": 1.04987025, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 2.1078448839323167, + "language_loss": 0.79967189, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2742, + "time_per_iteration": 2.4471442699432373 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.03373659, + "balance_loss_mlp": 1.05117011, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.1468697804747823, + "language_loss": 0.84769481, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86974156, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0, + "step": 2743, + "time_per_iteration": 2.459146022796631 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.05074859, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5853616537097488, + "language_loss": 0.85723281, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87925285, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 2744, + "time_per_iteration": 2.4920642375946045 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.0283947, + "balance_loss_mlp": 1.05124998, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7300129139105382, + "language_loss": 0.82973897, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85167319, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 2745, + "time_per_iteration": 2.490399122238159 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.05477679, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.19754759855213, + "language_loss": 0.76411253, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78622997, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2746, + "time_per_iteration": 2.46258282661438 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01052583, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.05164099, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5978428663850568, + "language_loss": 0.80686736, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2747, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01158238, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.02848577, + "balance_loss_mlp": 1.05559731, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.853069559467639, + "language_loss": 0.69463658, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71670008, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0234375, + "step": 2748, + "time_per_iteration": 2.4235999584198 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.03314471, + "balance_loss_mlp": 1.05482006, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.265414403061137, + "language_loss": 0.87653661, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89860809, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0078125, + "step": 2749, + "time_per_iteration": 2.4706709384918213 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.0509429, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.3451981357461444, + "language_loss": 0.79248077, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81450188, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2750, + "time_per_iteration": 2.4588990211486816 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.05188382, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7653411133265118, + "language_loss": 0.95010567, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9720822, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.99609375, + "step": 2751, + "time_per_iteration": 2.4776439666748047 + }, + { + "auxiliary_loss_clip": 0.01152135, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.02762985, + "balance_loss_mlp": 1.05480134, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9833662518999209, + "language_loss": 0.71080822, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73278749, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 2752, + "time_per_iteration": 2.4609227180480957 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 1.01785779, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7698122762266473, + "language_loss": 0.54079807, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56152999, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.33984375, + "step": 2753, + "time_per_iteration": 3.161339282989502 + }, + { + "auxiliary_loss_clip": 0.01152964, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.05254793, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.9686645345026932, + "language_loss": 0.75467873, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77662838, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2754, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01060834, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.05358946, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 3.81944507319113, + "language_loss": 0.87154973, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89376527, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0703125, + "step": 2755, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01148695, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.04862666, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.859731734913831, + "language_loss": 0.73258269, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7545948, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2756, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.02999544, + "balance_loss_mlp": 1.05331099, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6628427585054586, + "language_loss": 0.74967468, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77166092, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9609375, + "step": 2757, + "time_per_iteration": 2.5122530460357666 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01044644, + "balance_loss_clip": 1.02590466, + "balance_loss_mlp": 1.05359447, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 2.101183789218018, + "language_loss": 0.84532511, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86731303, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2758, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.01153935, + "auxiliary_loss_mlp": 0.01051485, + "balance_loss_clip": 1.03382993, + "balance_loss_mlp": 1.05355358, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 3.016772390052645, + "language_loss": 0.79003322, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81208748, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 1.0, + "step": 2759, + "time_per_iteration": 2.468798875808716 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.02953088, + "balance_loss_mlp": 1.05121255, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 4.81235802271706, + "language_loss": 0.75059134, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77259254, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2760, + "time_per_iteration": 2.459453582763672 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02139056, + "balance_loss_mlp": 1.05363011, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.843496656605, + "language_loss": 0.73409051, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75607204, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2761, + "time_per_iteration": 2.473264455795288 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.05460942, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.076392836835936, + "language_loss": 0.89255953, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91456699, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2762, + "time_per_iteration": 2.4917852878570557 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.0517025, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6634533311047424, + "language_loss": 0.87782222, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.89988291, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2763, + "time_per_iteration": 2.48002028465271 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01013596, + "balance_loss_clip": 1.01105642, + "balance_loss_mlp": 1.01786494, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7771287992078079, + "language_loss": 0.59777391, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61842799, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2764, + "time_per_iteration": 3.0722031593322754 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.03183234, + "balance_loss_mlp": 1.05292118, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8564974944455146, + "language_loss": 0.82349414, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8455686, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.015625, + "step": 2765, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.02758563, + "balance_loss_mlp": 1.05308914, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.1954568630881566, + "language_loss": 0.70029616, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72239733, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.078125, + "step": 2766, + "time_per_iteration": 2.417538642883301 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.05449462, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3642497854018174, + "language_loss": 0.88693011, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90891409, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2767, + "time_per_iteration": 2.447087287902832 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.0051651, + "balance_loss_mlp": 1.01474071, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.659533193053428, + "language_loss": 0.52894622, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54950953, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.33984375, + "step": 2768, + "time_per_iteration": 4.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01156575, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.03196931, + "balance_loss_mlp": 1.05233693, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4421243199538543, + "language_loss": 0.84964579, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87173045, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2769, + "time_per_iteration": 3.9888546466827393 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.00486565, + "balance_loss_mlp": 1.01284146, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.809970645404753, + "language_loss": 0.57417655, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59471762, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2770, + "time_per_iteration": 2.909212350845337 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01004174, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.0120976, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8642108743281017, + "language_loss": 0.5621168, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58261615, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.3359375, + "step": 2771, + "time_per_iteration": 2.9000375270843506 + }, + { + "auxiliary_loss_clip": 0.01152287, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.03741515, + "balance_loss_mlp": 1.05137527, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.4271023422086593, + "language_loss": 0.70461071, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72671425, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0078125, + "step": 2772, + "time_per_iteration": 2.45868182182312 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01052488, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.04914951, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.8764675289735346, + "language_loss": 0.86201918, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8840462, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2773, + "time_per_iteration": 2.513784885406494 + }, + { + "auxiliary_loss_clip": 0.01152492, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.02808821, + "balance_loss_mlp": 1.05230188, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.2216439453760595, + "language_loss": 0.81859678, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84058398, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2774, + "time_per_iteration": 2.4288830757141113 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.02955508, + "balance_loss_mlp": 1.05290627, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1125697386324576, + "language_loss": 0.83287829, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85492939, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0234375, + "step": 2775, + "time_per_iteration": 2.4773504734039307 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.03599668, + "balance_loss_mlp": 1.0527029, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9011936520028738, + "language_loss": 0.80721045, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82925946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 2776, + "time_per_iteration": 2.4736995697021484 + }, + { + "auxiliary_loss_clip": 0.01147621, + "auxiliary_loss_mlp": 0.01053383, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.05260348, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.7922512358148395, + "language_loss": 0.798361, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82037103, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.953125, + "step": 2777, + "time_per_iteration": 2.4625258445739746 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.0313735, + "balance_loss_mlp": 1.05002642, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.8218923631286437, + "language_loss": 0.85132945, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87332618, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 2778, + "time_per_iteration": 2.4819412231445312 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.05222583, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.6489491047564826, + "language_loss": 0.74133682, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76333386, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2779, + "time_per_iteration": 2.510207176208496 + }, + { + "auxiliary_loss_clip": 0.0115174, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.02625358, + "balance_loss_mlp": 1.05116367, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.2761441742273663, + "language_loss": 0.65382051, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67579395, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2780, + "time_per_iteration": 2.5250439643859863 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.02856088, + "balance_loss_mlp": 1.05120933, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0602280440022382, + "language_loss": 0.78563058, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80761701, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9921875, + "step": 2781, + "time_per_iteration": 2.4921979904174805 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.05227423, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.480266857331911, + "language_loss": 0.75262564, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77465487, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2782, + "time_per_iteration": 2.468590021133423 + }, + { + "auxiliary_loss_clip": 0.01159372, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.05443954, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.999958464394936, + "language_loss": 0.67841566, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70053571, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2783, + "time_per_iteration": 2.5312225818634033 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.0538497, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.209785525271013, + "language_loss": 0.70028126, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72232759, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2784, + "time_per_iteration": 2.4932820796966553 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.05120277, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9638597335511054, + "language_loss": 0.60441053, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62647516, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2785, + "time_per_iteration": 2.527010440826416 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.02681625, + "balance_loss_mlp": 1.01595187, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8343482124814343, + "language_loss": 0.588, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60878569, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.33007812, + "step": 2786, + "time_per_iteration": 3.1062281131744385 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.05108333, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.9494651562196093, + "language_loss": 0.75846571, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78044844, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2787, + "time_per_iteration": 2.51383900642395 + }, + { + "auxiliary_loss_clip": 0.0115361, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.02287841, + "balance_loss_mlp": 1.05218899, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.088538847955111, + "language_loss": 0.77615869, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79811174, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2788, + "time_per_iteration": 2.4926373958587646 + }, + { + "auxiliary_loss_clip": 0.01048965, + "auxiliary_loss_mlp": 0.01004104, + "balance_loss_clip": 1.00154078, + "balance_loss_mlp": 1.01582766, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.861309286667726, + "language_loss": 0.59360403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61413473, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.33203125, + "step": 2789, + "time_per_iteration": 2.9390883445739746 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.03262937, + "balance_loss_mlp": 1.05115533, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.8582032581880512, + "language_loss": 0.70117038, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72323185, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2790, + "time_per_iteration": 2.6337287425994873 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.03852975, + "balance_loss_mlp": 1.05254579, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9267324208283758, + "language_loss": 0.7914235, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81353921, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0234375, + "step": 2791, + "time_per_iteration": 2.4992258548736572 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.02807093, + "balance_loss_mlp": 1.05311096, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.670563786806713, + "language_loss": 0.71465087, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73666936, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2792, + "time_per_iteration": 2.5886104106903076 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.05179656, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.423044729867527, + "language_loss": 0.72166264, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74366981, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2793, + "time_per_iteration": 2.5197043418884277 + }, + { + "auxiliary_loss_clip": 0.01153184, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.05135465, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.9474647186442988, + "language_loss": 0.77305138, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79512912, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2794, + "time_per_iteration": 2.467292547225952 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.05253601, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2742759048834578, + "language_loss": 0.71613103, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7382195, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2795, + "time_per_iteration": 2.5272278785705566 + }, + { + "auxiliary_loss_clip": 0.01149377, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.02592218, + "balance_loss_mlp": 1.04932868, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.942494339721957, + "language_loss": 0.83784455, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8597846, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2796, + "time_per_iteration": 2.448528289794922 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01059215, + "balance_loss_clip": 1.03911614, + "balance_loss_mlp": 1.04904127, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6778887705488965, + "language_loss": 0.8109591, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83307993, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2797, + "time_per_iteration": 2.5044567584991455 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02766752, + "balance_loss_mlp": 1.05142093, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4189820060365406, + "language_loss": 0.74740726, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76932257, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.95703125, + "step": 2798, + "time_per_iteration": 2.4913666248321533 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03250098, + "balance_loss_mlp": 1.05462337, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.8962576537558784, + "language_loss": 0.79592311, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81796914, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 1.0, + "step": 2799, + "time_per_iteration": 2.4844021797180176 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.02597189, + "balance_loss_mlp": 1.04983997, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.7819182919151455, + "language_loss": 0.70778632, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72978926, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2800, + "time_per_iteration": 2.548715829849243 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02365637, + "balance_loss_mlp": 1.04882574, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9135359518782422, + "language_loss": 0.83549178, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85741478, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2801, + "time_per_iteration": 2.456601858139038 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.03355145, + "balance_loss_mlp": 1.04947591, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.757874152621573, + "language_loss": 0.822721, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84474415, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2802, + "time_per_iteration": 2.4426534175872803 + }, + { + "auxiliary_loss_clip": 0.01153107, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02764344, + "balance_loss_mlp": 1.05123353, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4855905624355255, + "language_loss": 0.81064272, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83265072, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2803, + "time_per_iteration": 2.5615930557250977 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02693641, + "balance_loss_mlp": 1.05246449, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2799183114600545, + "language_loss": 0.7645762, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78653532, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 2804, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01045818, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.02452028, + "balance_loss_mlp": 1.01328063, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8620881286764229, + "language_loss": 0.55414748, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57487267, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 2805, + "time_per_iteration": 3.033358573913574 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.02161169, + "balance_loss_mlp": 1.04741919, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9122963285347783, + "language_loss": 0.73038024, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75221276, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 2806, + "time_per_iteration": 2.4699463844299316 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.02712786, + "balance_loss_mlp": 1.05072176, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.9407491705316076, + "language_loss": 0.69966477, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7216025, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2807, + "time_per_iteration": 2.4583139419555664 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.03196526, + "balance_loss_mlp": 1.05013919, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.8028706291815912, + "language_loss": 0.70265883, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72467327, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9921875, + "step": 2808, + "time_per_iteration": 2.4724719524383545 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02792549, + "balance_loss_mlp": 1.05130935, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.1293629398657954, + "language_loss": 0.80103064, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8230511, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2809, + "time_per_iteration": 3.844451427459717 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.050385, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0909159229075245, + "language_loss": 0.88465077, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.9067235, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2810, + "time_per_iteration": 5.43256688117981 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.05188894, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.324870160833927, + "language_loss": 0.92483926, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94690794, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2811, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.03154814, + "balance_loss_mlp": 1.05537057, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 20.150047321728213, + "language_loss": 0.78719699, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80926931, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2812, + "time_per_iteration": 2.475893974304199 + }, + { + "auxiliary_loss_clip": 0.01154531, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.0353322, + "balance_loss_mlp": 1.05427527, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.3708558754635103, + "language_loss": 0.7492249, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7713027, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.0078125, + "step": 2813, + "time_per_iteration": 2.4622457027435303 + }, + { + "auxiliary_loss_clip": 0.01155154, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02862835, + "balance_loss_mlp": 1.05231524, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.6643465032783955, + "language_loss": 0.69000697, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71203601, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2814, + "time_per_iteration": 2.442352771759033 + }, + { + "auxiliary_loss_clip": 0.01152587, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.03032494, + "balance_loss_mlp": 1.05269694, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.3683342322522543, + "language_loss": 0.61842358, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64043844, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2815, + "time_per_iteration": 2.4859516620635986 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03179121, + "balance_loss_mlp": 1.05104065, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9947957584318596, + "language_loss": 0.81983805, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84183884, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 2816, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.03072321, + "balance_loss_mlp": 1.05379295, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.502019531770294, + "language_loss": 0.8722589, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89431584, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2817, + "time_per_iteration": 2.4906835556030273 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.03200889, + "balance_loss_mlp": 1.05302715, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7787508021643152, + "language_loss": 0.81666476, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2818, + "time_per_iteration": 2.4673428535461426 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.05154157, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 5.791836374282792, + "language_loss": 0.80712807, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8291707, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0, + "step": 2819, + "time_per_iteration": 2.43947434425354 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.00504076, + "balance_loss_mlp": 1.01552486, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9491282523447765, + "language_loss": 0.61080176, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63136268, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 2820, + "time_per_iteration": 3.008953809738159 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.031335, + "balance_loss_mlp": 1.05163527, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 2.1013484538112097, + "language_loss": 0.78625357, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.808281, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2821, + "time_per_iteration": 2.5363481044769287 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.03667343, + "balance_loss_mlp": 1.05229986, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.87583667245789, + "language_loss": 0.78450388, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80659759, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0078125, + "step": 2822, + "time_per_iteration": 2.4969065189361572 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03299582, + "balance_loss_mlp": 1.04956698, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9220487825624015, + "language_loss": 0.75016022, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77214515, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2823, + "time_per_iteration": 2.491588830947876 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.05209637, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9648811068121905, + "language_loss": 0.60514438, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62718117, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.99609375, + "step": 2824, + "time_per_iteration": 2.6178910732269287 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02030015, + "balance_loss_mlp": 1.05367076, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6856049786717988, + "language_loss": 0.73004806, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75196874, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98046875, + "step": 2825, + "time_per_iteration": 2.559774398803711 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.03321934, + "balance_loss_mlp": 1.0505774, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.7849035157466668, + "language_loss": 0.85660541, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87870789, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2826, + "time_per_iteration": 2.4860360622406006 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.0515151, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3205594057943175, + "language_loss": 0.8232255, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84528267, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2827, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.02965498, + "balance_loss_mlp": 1.05059743, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 2.393760877815214, + "language_loss": 0.73652613, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75855708, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2828, + "time_per_iteration": 2.5726237297058105 + }, + { + "auxiliary_loss_clip": 0.01046718, + "auxiliary_loss_mlp": 0.01008554, + "balance_loss_clip": 1.00625372, + "balance_loss_mlp": 1.01360035, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.76062911359866, + "language_loss": 0.56446254, + "learning_rate": 3.797643101661336e-06, + "loss": 0.5850153, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.33203125, + "step": 2829, + "time_per_iteration": 3.1035284996032715 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.02912867, + "balance_loss_mlp": 1.04916263, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7229604876305038, + "language_loss": 0.83673382, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85870743, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.99609375, + "step": 2830, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.02792013, + "balance_loss_mlp": 1.04919207, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.0065309441313337, + "language_loss": 0.77852297, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80051666, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.015625, + "step": 2831, + "time_per_iteration": 2.524578094482422 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.04948521, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1211873867699285, + "language_loss": 0.79345167, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81548154, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0234375, + "step": 2832, + "time_per_iteration": 2.459954261779785 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.02836847, + "balance_loss_mlp": 1.05050385, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.9382017652854369, + "language_loss": 0.89026237, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91225392, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2833, + "time_per_iteration": 2.4812114238739014 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02963924, + "balance_loss_mlp": 1.05124569, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.853060698790674, + "language_loss": 0.72425497, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74627328, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2834, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01156378, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.05294132, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9954265429463485, + "language_loss": 0.86434042, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88648909, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2835, + "time_per_iteration": 2.4804999828338623 + }, + { + "auxiliary_loss_clip": 0.01155592, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.03007674, + "balance_loss_mlp": 1.05081642, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1.9180646463430515, + "language_loss": 0.73242748, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.046875, + "step": 2836, + "time_per_iteration": 2.4694178104400635 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02599072, + "balance_loss_mlp": 1.05033076, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1253309510576717, + "language_loss": 0.79653537, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81858897, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0703125, + "step": 2837, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01150588, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.02550185, + "balance_loss_mlp": 1.05232143, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.19906443062711, + "language_loss": 0.83575213, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85771078, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 2838, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.05069315, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7423496230624245, + "language_loss": 0.93620354, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95814586, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2839, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.0277859, + "balance_loss_mlp": 1.05050242, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.8052720148780894, + "language_loss": 0.83847374, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86050916, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.0390625, + "step": 2840, + "time_per_iteration": 2.5449130535125732 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01047778, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.05213881, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.014300966058614, + "language_loss": 0.76390004, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78593302, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.03125, + "step": 2841, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03243482, + "balance_loss_mlp": 1.04932261, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8874127741110907, + "language_loss": 0.77000463, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79205, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2842, + "time_per_iteration": 2.5051841735839844 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 1.0497905, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.05566421297988, + "language_loss": 0.86086738, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88281423, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98828125, + "step": 2843, + "time_per_iteration": 2.4487509727478027 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02696228, + "balance_loss_mlp": 1.05090249, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.8875494657309706, + "language_loss": 0.6826812, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70464289, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 2844, + "time_per_iteration": 2.4429779052734375 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.05040824, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.8058232236820264, + "language_loss": 0.78258789, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80463862, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0, + "step": 2845, + "time_per_iteration": 2.4377951622009277 + }, + { + "auxiliary_loss_clip": 0.01151786, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.05064154, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.746386155528142, + "language_loss": 0.77959955, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 2846, + "time_per_iteration": 2.4196622371673584 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.02895534, + "balance_loss_mlp": 1.05158973, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7441395807388675, + "language_loss": 0.7942031, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81620383, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2847, + "time_per_iteration": 2.504087448120117 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.02913523, + "balance_loss_mlp": 1.04612017, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.239997254259111, + "language_loss": 0.86818451, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89015555, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2848, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.02438748, + "balance_loss_mlp": 1.05133212, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.715396677859901, + "language_loss": 0.75223613, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77421153, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2849, + "time_per_iteration": 2.4918415546417236 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.00107098, + "balance_loss_mlp": 1.01492834, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7949737728021388, + "language_loss": 0.57471085, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59522074, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.328125, + "step": 2850, + "time_per_iteration": 3.057778835296631 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.04852295, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4364727127987704, + "language_loss": 0.80988616, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83187693, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 2851, + "time_per_iteration": 3.887600898742676 + }, + { + "auxiliary_loss_clip": 0.01146778, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.04858351, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.035620688428962, + "language_loss": 0.93063158, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95253623, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2852, + "time_per_iteration": 3.920153856277466 + }, + { + "auxiliary_loss_clip": 0.01149404, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.04728949, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8406206656402175, + "language_loss": 0.69480836, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71683311, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2853, + "time_per_iteration": 2.4457037448883057 + }, + { + "auxiliary_loss_clip": 0.0115316, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03150904, + "balance_loss_mlp": 1.05059445, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.187977199847503, + "language_loss": 0.66505128, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68709248, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0234375, + "step": 2854, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01144359, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.04574227, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8257227486643586, + "language_loss": 0.89394444, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91582847, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2855, + "time_per_iteration": 2.4601552486419678 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.04792452, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.515892939250119, + "language_loss": 0.83822739, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86022681, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2856, + "time_per_iteration": 2.4747347831726074 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05112195, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9053156238546485, + "language_loss": 0.8645792, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88658297, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2857, + "time_per_iteration": 2.4460220336914062 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0105234, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.04805577, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.139076633770832, + "language_loss": 0.77919662, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80120051, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2858, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01058687, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.04760742, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.423579883765011, + "language_loss": 0.77235049, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0859375, + "step": 2859, + "time_per_iteration": 2.43471360206604 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.04920983, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 3.774880148287903, + "language_loss": 0.77179611, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79378301, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2860, + "time_per_iteration": 2.463344097137451 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.04703689, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1505291491255463, + "language_loss": 0.81964719, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84165227, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2861, + "time_per_iteration": 2.4505395889282227 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.03163123, + "balance_loss_mlp": 1.04897118, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 4.22955926449596, + "language_loss": 0.85649675, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87849623, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2862, + "time_per_iteration": 2.4392077922821045 + }, + { + "auxiliary_loss_clip": 0.01144423, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.02867651, + "balance_loss_mlp": 1.04785109, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.3146804122881037, + "language_loss": 0.77874523, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80065054, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 2863, + "time_per_iteration": 2.4745166301727295 + }, + { + "auxiliary_loss_clip": 0.01147347, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02935052, + "balance_loss_mlp": 1.04726493, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7012031973405044, + "language_loss": 0.72191179, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74386668, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2864, + "time_per_iteration": 2.496522903442383 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.04935968, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6688219876641972, + "language_loss": 0.72896975, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75101948, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2865, + "time_per_iteration": 2.468726396560669 + }, + { + "auxiliary_loss_clip": 0.01151587, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02608538, + "balance_loss_mlp": 1.05194211, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.1747822479918764, + "language_loss": 0.79011786, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81208247, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2866, + "time_per_iteration": 2.445716381072998 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.04966402, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.797659045411876, + "language_loss": 0.79865277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2867, + "time_per_iteration": 2.4745590686798096 + }, + { + "auxiliary_loss_clip": 0.0114836, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.04821014, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.717941409951427, + "language_loss": 0.79707634, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81893444, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2868, + "time_per_iteration": 2.4545693397521973 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.0538218, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9332967921770021, + "language_loss": 0.84265673, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86467719, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2869, + "time_per_iteration": 2.445429563522339 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.03226328, + "balance_loss_mlp": 1.04971075, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.3539211413688954, + "language_loss": 0.77522051, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79725653, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2870, + "time_per_iteration": 2.4975087642669678 + }, + { + "auxiliary_loss_clip": 0.01146931, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.02609706, + "balance_loss_mlp": 1.05132568, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.897031493968697, + "language_loss": 0.7680704, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.78997254, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.95703125, + "step": 2871, + "time_per_iteration": 2.4777348041534424 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02442563, + "balance_loss_mlp": 1.05061746, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.240934958328371, + "language_loss": 0.74448204, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76642466, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2872, + "time_per_iteration": 2.5021097660064697 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.05127549, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8155923086100165, + "language_loss": 0.82694656, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84881938, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 2873, + "time_per_iteration": 2.4852540493011475 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0229888, + "balance_loss_mlp": 1.049196, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0464410919173814, + "language_loss": 0.75083232, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77274048, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.984375, + "step": 2874, + "time_per_iteration": 2.440610885620117 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0515728, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9230852666364326, + "language_loss": 0.8067199, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8286736, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2875, + "time_per_iteration": 2.478473424911499 + }, + { + "auxiliary_loss_clip": 0.01153488, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02984488, + "balance_loss_mlp": 1.05083489, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.5699127680633542, + "language_loss": 0.87525117, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89728516, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2876, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.02384901, + "balance_loss_mlp": 1.05273616, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.9567138745888089, + "language_loss": 0.84561193, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86754125, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 2877, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.02179909, + "balance_loss_mlp": 1.05281305, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 3.0724129461132406, + "language_loss": 0.79527134, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81719756, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.984375, + "step": 2878, + "time_per_iteration": 2.4739902019500732 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.02450228, + "balance_loss_mlp": 1.04968572, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9694378769308076, + "language_loss": 0.70306808, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72496772, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2879, + "time_per_iteration": 2.5014665126800537 + }, + { + "auxiliary_loss_clip": 0.01151101, + "auxiliary_loss_mlp": 0.01050497, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.05038834, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4431111997211734, + "language_loss": 0.83465785, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85667384, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2880, + "time_per_iteration": 2.433776378631592 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.0250026, + "balance_loss_mlp": 1.05171311, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.135155165507549, + "language_loss": 0.80866969, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8306427, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0078125, + "step": 2881, + "time_per_iteration": 2.4944772720336914 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.05030859, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.5502275528368066, + "language_loss": 0.77372867, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79565454, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 2882, + "time_per_iteration": 2.5426836013793945 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.03276825, + "balance_loss_mlp": 1.05005169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8718611847068298, + "language_loss": 0.76652586, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78852415, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2883, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.04944682, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.945193845574475, + "language_loss": 0.85463524, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87654424, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 2884, + "time_per_iteration": 2.4708735942840576 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.02122355, + "balance_loss_mlp": 1.05114794, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6148586475999513, + "language_loss": 0.73758793, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75947917, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2885, + "time_per_iteration": 2.5266878604888916 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.05269074, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.9690054244815705, + "language_loss": 0.70377076, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72569054, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 2886, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01146959, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.04799545, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9665325510573808, + "language_loss": 0.69294798, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7148186, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98828125, + "step": 2887, + "time_per_iteration": 2.4787776470184326 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.03180945, + "balance_loss_mlp": 1.05075955, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.791000255721863, + "language_loss": 0.85391176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87590909, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 2888, + "time_per_iteration": 2.4234085083007812 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.02668667, + "balance_loss_mlp": 1.05046952, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.660213605651755, + "language_loss": 0.78465497, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80662042, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.9921875, + "step": 2889, + "time_per_iteration": 2.5042123794555664 + }, + { + "auxiliary_loss_clip": 0.01146581, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.05222893, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.9081348702485723, + "language_loss": 0.83860242, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86054766, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9453125, + "step": 2890, + "time_per_iteration": 2.4698500633239746 + }, + { + "auxiliary_loss_clip": 0.01150813, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.02880502, + "balance_loss_mlp": 1.05083108, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9935479009749588, + "language_loss": 0.82253492, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84451687, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2891, + "time_per_iteration": 2.4478886127471924 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04824781, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.3073165362682873, + "language_loss": 0.81479478, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8367548, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2892, + "time_per_iteration": 2.4094645977020264 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.03519785, + "balance_loss_mlp": 1.05379355, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.24459564009462, + "language_loss": 0.74480057, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76690638, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2893, + "time_per_iteration": 3.8296191692352295 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.05193436, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.117368029368179, + "language_loss": 0.83073241, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85268712, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2894, + "time_per_iteration": 3.9817075729370117 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.05032384, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.157907065313142, + "language_loss": 0.74051547, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76249242, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0, + "step": 2895, + "time_per_iteration": 2.461857318878174 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00467134, + "balance_loss_mlp": 1.01600659, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8715266336267762, + "language_loss": 0.6273998, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64795506, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.32421875, + "step": 2896, + "time_per_iteration": 3.1462173461914062 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.02160895, + "balance_loss_mlp": 1.04787612, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 2.3238967096174923, + "language_loss": 0.75600475, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77790749, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2897, + "time_per_iteration": 2.4974682331085205 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.02354646, + "balance_loss_mlp": 1.05000067, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9004029304223122, + "language_loss": 0.69384712, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71575105, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2898, + "time_per_iteration": 2.5650558471679688 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01049615, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.05215359, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.315885710988465, + "language_loss": 0.76069367, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78272319, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2899, + "time_per_iteration": 2.5006191730499268 + }, + { + "auxiliary_loss_clip": 0.01145178, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.02571905, + "balance_loss_mlp": 1.04929495, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9440585306650153, + "language_loss": 0.72821134, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75011557, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9609375, + "step": 2900, + "time_per_iteration": 2.5199801921844482 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.03261876, + "balance_loss_mlp": 1.04989529, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.6677330343015109, + "language_loss": 0.70085949, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72287238, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2901, + "time_per_iteration": 2.624864101409912 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.05087507, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7643324639769489, + "language_loss": 0.76549768, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78750718, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 2902, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.04885221, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.129298660499851, + "language_loss": 0.81787169, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.8399415, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2903, + "time_per_iteration": 2.436877727508545 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02255297, + "balance_loss_mlp": 1.04978609, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.1703016783079327, + "language_loss": 0.73228866, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75418955, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2904, + "time_per_iteration": 2.462775707244873 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.02719879, + "balance_loss_mlp": 1.04777265, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9374721445221084, + "language_loss": 0.64526325, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6671921, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2905, + "time_per_iteration": 2.468395233154297 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.05202341, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.804147248272645, + "language_loss": 0.79236615, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81444013, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0546875, + "step": 2906, + "time_per_iteration": 2.4632725715637207 + }, + { + "auxiliary_loss_clip": 0.01150693, + "auxiliary_loss_mlp": 0.01055346, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.05044913, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7929508882228948, + "language_loss": 0.81010377, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83216417, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2907, + "time_per_iteration": 2.4214229583740234 + }, + { + "auxiliary_loss_clip": 0.01152007, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.05040026, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.7402312811515515, + "language_loss": 0.81315112, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83517587, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2908, + "time_per_iteration": 2.4340970516204834 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.03047633, + "balance_loss_mlp": 1.04978228, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.419675279893618, + "language_loss": 0.80399191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82600915, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0078125, + "step": 2909, + "time_per_iteration": 2.4170033931732178 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.03319383, + "balance_loss_mlp": 1.05133021, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6998329053727648, + "language_loss": 0.76530939, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78737426, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2910, + "time_per_iteration": 2.457628011703491 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02901077, + "balance_loss_mlp": 1.05060935, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6502133484544155, + "language_loss": 0.87255991, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89456993, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2911, + "time_per_iteration": 2.5302672386169434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.04746377, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.260601647926804, + "language_loss": 0.89586449, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91789353, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0078125, + "step": 2912, + "time_per_iteration": 2.447650194168091 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.0302161, + "balance_loss_mlp": 1.04871392, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.8836544870459813, + "language_loss": 0.7262938, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74830252, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2913, + "time_per_iteration": 2.423595666885376 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01047113, + "balance_loss_clip": 1.02738369, + "balance_loss_mlp": 1.0522244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.243393227782369, + "language_loss": 0.68799925, + "learning_rate": 3.782887439295741e-06, + "loss": 0.70997757, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 2914, + "time_per_iteration": 2.46085262298584 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 1.05143356, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8218690011087264, + "language_loss": 0.93755293, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95961595, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.98046875, + "step": 2915, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01143068, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.02744889, + "balance_loss_mlp": 1.04722261, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8144768789670476, + "language_loss": 0.80869162, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.95703125, + "step": 2916, + "time_per_iteration": 2.4740476608276367 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03663611, + "balance_loss_mlp": 1.04854608, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.67512565222408, + "language_loss": 0.73645711, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75852591, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2917, + "time_per_iteration": 2.4484915733337402 + }, + { + "auxiliary_loss_clip": 0.01144993, + "auxiliary_loss_mlp": 0.01055794, + "balance_loss_clip": 1.03517044, + "balance_loss_mlp": 1.04897738, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 12.675743752905372, + "language_loss": 0.77019119, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79219908, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.9609375, + "step": 2918, + "time_per_iteration": 2.4723429679870605 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.0260129, + "balance_loss_mlp": 1.05131745, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.415786226656528, + "language_loss": 0.74196291, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76396644, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2919, + "time_per_iteration": 2.5049829483032227 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03080809, + "balance_loss_mlp": 1.05090559, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7754050788280298, + "language_loss": 0.74211872, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76416576, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2920, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0219686, + "balance_loss_mlp": 1.04717219, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3164139995284834, + "language_loss": 0.7949307, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81677347, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.97265625, + "step": 2921, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01153986, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.05029321, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6170497741380607, + "language_loss": 0.87493849, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89693457, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2922, + "time_per_iteration": 2.5042173862457275 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03074801, + "balance_loss_mlp": 1.04808784, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.3811708545321735, + "language_loss": 0.62097687, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64297503, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2923, + "time_per_iteration": 2.5067484378814697 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.05287814, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1344206016331797, + "language_loss": 0.80602306, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2924, + "time_per_iteration": 2.453174114227295 + }, + { + "auxiliary_loss_clip": 0.0115147, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.04809761, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 3.672968077353321, + "language_loss": 0.70954067, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73159206, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.03125, + "step": 2925, + "time_per_iteration": 2.4666385650634766 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.02538979, + "balance_loss_mlp": 1.05147243, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6622274839000213, + "language_loss": 0.71700275, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73893416, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.97265625, + "step": 2926, + "time_per_iteration": 2.50289249420166 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04857433, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.8916391197618272, + "language_loss": 0.84433806, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86627805, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0234375, + "step": 2927, + "time_per_iteration": 2.447207450866699 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.0506475, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8156588356210406, + "language_loss": 0.71879232, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74072987, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 2928, + "time_per_iteration": 2.585942029953003 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.028934, + "balance_loss_mlp": 1.05230594, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0402577824357886, + "language_loss": 0.83222824, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85421479, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9765625, + "step": 2929, + "time_per_iteration": 2.461101770401001 + }, + { + "auxiliary_loss_clip": 0.01149627, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.02298999, + "balance_loss_mlp": 1.0493536, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.817429721867852, + "language_loss": 0.7933988, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81531239, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2930, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.02671921, + "balance_loss_mlp": 1.05319881, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.194829469856105, + "language_loss": 0.76142448, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78343737, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0078125, + "step": 2931, + "time_per_iteration": 2.4907379150390625 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.05108666, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.8261445455709153, + "language_loss": 0.74740392, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7693212, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 2932, + "time_per_iteration": 2.4252588748931885 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.05086923, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.4930669650063355, + "language_loss": 0.8968839, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.9188894, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0625, + "step": 2933, + "time_per_iteration": 2.4334278106689453 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02980459, + "balance_loss_mlp": 1.05053639, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6616334836184845, + "language_loss": 0.88273364, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90468836, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9765625, + "step": 2934, + "time_per_iteration": 3.891472578048706 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02827823, + "balance_loss_mlp": 1.04972959, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.7575209177187046, + "language_loss": 0.70843625, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2935, + "time_per_iteration": 5.650984287261963 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.05251908, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2448658169111795, + "language_loss": 0.69255942, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71456659, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0546875, + "step": 2936, + "time_per_iteration": 2.4864091873168945 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.02758646, + "balance_loss_mlp": 1.05530488, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.883537128373794, + "language_loss": 0.71391022, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73591107, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.99609375, + "step": 2937, + "time_per_iteration": 2.5096240043640137 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.02461779, + "balance_loss_mlp": 1.05530524, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.165923066719211, + "language_loss": 0.7584855, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78052241, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2938, + "time_per_iteration": 2.475069284439087 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 1.05156195, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.20477923303766, + "language_loss": 0.71130306, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73326623, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2939, + "time_per_iteration": 2.4806766510009766 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.0538342, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.125031265469358, + "language_loss": 0.73781312, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7597841, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 2940, + "time_per_iteration": 2.5438694953918457 + }, + { + "auxiliary_loss_clip": 0.01154904, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.05372643, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.4976558026918703, + "language_loss": 0.85003591, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87204242, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2941, + "time_per_iteration": 2.4616622924804688 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.02687514, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.199835477442084, + "language_loss": 0.7711162, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79311877, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2942, + "time_per_iteration": 2.512493848800659 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.05181623, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.9811917296629065, + "language_loss": 0.80591762, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82790613, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2943, + "time_per_iteration": 2.4898416996002197 + }, + { + "auxiliary_loss_clip": 0.01154834, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.02953053, + "balance_loss_mlp": 1.05046725, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.9958912509352866, + "language_loss": 0.80558729, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82764459, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2944, + "time_per_iteration": 2.533968448638916 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.05239737, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.992535786356086, + "language_loss": 0.73450243, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75667548, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2945, + "time_per_iteration": 2.641890287399292 + }, + { + "auxiliary_loss_clip": 0.01152525, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.03179753, + "balance_loss_mlp": 1.05274916, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.3259800829895028, + "language_loss": 0.7778489, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79987633, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.99609375, + "step": 2946, + "time_per_iteration": 2.420511484146118 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.05060697, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9846715459481197, + "language_loss": 0.76240218, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78441978, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2947, + "time_per_iteration": 2.485795259475708 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.02725959, + "balance_loss_mlp": 1.04881549, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.7031010106606654, + "language_loss": 0.71890748, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74085903, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.99609375, + "step": 2948, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03313947, + "balance_loss_mlp": 1.05261326, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.809900152556277, + "language_loss": 0.81843233, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8404634, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.98828125, + "step": 2949, + "time_per_iteration": 2.496962547302246 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01007586, + "balance_loss_clip": 1.00552368, + "balance_loss_mlp": 1.01889789, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7669309197050882, + "language_loss": 0.64973593, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.670331, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.33007812, + "step": 2950, + "time_per_iteration": 3.1220879554748535 + }, + { + "auxiliary_loss_clip": 0.01145274, + "auxiliary_loss_mlp": 0.01049164, + "balance_loss_clip": 1.02992332, + "balance_loss_mlp": 1.04777181, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.9502306021254343, + "language_loss": 0.83540517, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85734957, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2951, + "time_per_iteration": 2.5360641479492188 + }, + { + "auxiliary_loss_clip": 0.01153398, + "auxiliary_loss_mlp": 0.01055919, + "balance_loss_clip": 1.03710794, + "balance_loss_mlp": 1.04963326, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 3.175759961241781, + "language_loss": 0.80564123, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2952, + "time_per_iteration": 2.478635787963867 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.03157318, + "balance_loss_mlp": 1.05045855, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.157061982289712, + "language_loss": 0.79982865, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82184678, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2953, + "time_per_iteration": 2.5143978595733643 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.03003526, + "balance_loss_mlp": 1.05173969, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8943960347088487, + "language_loss": 0.88006002, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90207046, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2954, + "time_per_iteration": 2.575603485107422 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.0327127, + "balance_loss_mlp": 1.05101538, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.123866524492404, + "language_loss": 0.84441978, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86644602, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2955, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.04843807, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 2.0229859139182382, + "language_loss": 0.71172267, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73364747, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2956, + "time_per_iteration": 2.4795608520507812 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.03226662, + "balance_loss_mlp": 1.04974461, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.891261769499534, + "language_loss": 0.82908547, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85109639, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9921875, + "step": 2957, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.02819514, + "balance_loss_mlp": 1.04814482, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.926043663168548, + "language_loss": 0.75286758, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7747997, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2958, + "time_per_iteration": 2.532339572906494 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.05278933, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0794730574663265, + "language_loss": 0.79558724, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.8175559, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2959, + "time_per_iteration": 2.45941424369812 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.04968762, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.284306220471852, + "language_loss": 0.52288693, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5448702, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2960, + "time_per_iteration": 2.4603421688079834 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.05185843, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9615261009939866, + "language_loss": 0.89047921, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9125818, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2961, + "time_per_iteration": 2.475848913192749 + }, + { + "auxiliary_loss_clip": 0.01151915, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.0313381, + "balance_loss_mlp": 1.04849648, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 2.2193748892921517, + "language_loss": 0.79186273, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81389749, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2962, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.011535, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.03175986, + "balance_loss_mlp": 1.0524615, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9550413638631114, + "language_loss": 0.74514943, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76719993, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2963, + "time_per_iteration": 2.4414234161376953 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.02822399, + "balance_loss_mlp": 1.05221784, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.541363360665875, + "language_loss": 0.78624183, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80828238, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2964, + "time_per_iteration": 2.502497911453247 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.05026746, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.923237578914178, + "language_loss": 0.81686175, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83892715, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2965, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01147349, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.03273785, + "balance_loss_mlp": 1.04941893, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2408088539265183, + "language_loss": 0.94580686, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96777868, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.98046875, + "step": 2966, + "time_per_iteration": 2.43082332611084 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.05002928, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.145285080590972, + "language_loss": 0.72469354, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74664342, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2967, + "time_per_iteration": 2.5735998153686523 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.02613282, + "balance_loss_mlp": 1.04889679, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.088672387523525, + "language_loss": 0.76831949, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79021615, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 2968, + "time_per_iteration": 2.437344789505005 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.0287044, + "balance_loss_mlp": 1.04982233, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.3962137266502075, + "language_loss": 0.75934523, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78129619, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2969, + "time_per_iteration": 2.5003507137298584 + }, + { + "auxiliary_loss_clip": 0.01047445, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 0.99940914, + "balance_loss_mlp": 1.01426291, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8459028719848601, + "language_loss": 0.69080526, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7112956, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.33203125, + "step": 2970, + "time_per_iteration": 3.1193249225616455 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.0229373, + "balance_loss_mlp": 1.0498271, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.0858657386647614, + "language_loss": 0.67452097, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69647527, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2971, + "time_per_iteration": 2.580946683883667 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.03059363, + "balance_loss_mlp": 1.04643905, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.713623966203784, + "language_loss": 0.89631712, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91827983, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.99609375, + "step": 2972, + "time_per_iteration": 2.491608142852783 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.05058205, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.067523530387673, + "language_loss": 0.88030291, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90236795, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2973, + "time_per_iteration": 2.4357106685638428 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03282917, + "balance_loss_mlp": 1.04979324, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1406659419236176, + "language_loss": 0.75648922, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.77848881, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2974, + "time_per_iteration": 2.484236478805542 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.02560067, + "balance_loss_mlp": 1.04925394, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.8019304252630453, + "language_loss": 0.74556506, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76749712, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 2975, + "time_per_iteration": 2.4658849239349365 + }, + { + "auxiliary_loss_clip": 0.01145454, + "auxiliary_loss_mlp": 0.0104533, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.04805982, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5963289978134585, + "language_loss": 0.73245859, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.7543664, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 2976, + "time_per_iteration": 3.921170949935913 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02455354, + "balance_loss_mlp": 1.04732931, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.5556273460638488, + "language_loss": 0.77324069, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79505193, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9375, + "step": 2977, + "time_per_iteration": 5.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.03352153, + "balance_loss_mlp": 1.05327988, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.814268655584857, + "language_loss": 0.79470795, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81672966, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 2978, + "time_per_iteration": 2.4917376041412354 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.022892, + "balance_loss_mlp": 1.04982674, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.6585859201367117, + "language_loss": 0.76166439, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78360581, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2979, + "time_per_iteration": 2.5283753871917725 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.04760695, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 2.3100878996861014, + "language_loss": 0.69246143, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7143684, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 2980, + "time_per_iteration": 2.452199935913086 + }, + { + "auxiliary_loss_clip": 0.01148553, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02703881, + "balance_loss_mlp": 1.04957294, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.6769030770257147, + "language_loss": 0.7077347, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.98828125, + "step": 2981, + "time_per_iteration": 2.453328847885132 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.03929293, + "balance_loss_mlp": 1.05124855, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.4096510966801916, + "language_loss": 0.82313269, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84522492, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0, + "step": 2982, + "time_per_iteration": 2.4727423191070557 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.04754186, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 2.0170018574221404, + "language_loss": 0.82899523, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85093689, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2983, + "time_per_iteration": 2.5544486045837402 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03234076, + "balance_loss_mlp": 1.04676509, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0025677466759175, + "language_loss": 0.84977567, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87177408, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2984, + "time_per_iteration": 2.461451530456543 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.02558494, + "balance_loss_mlp": 1.04734373, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.5972673531528874, + "language_loss": 0.89526331, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91717398, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2985, + "time_per_iteration": 2.5644643306732178 + }, + { + "auxiliary_loss_clip": 0.01142965, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02879906, + "balance_loss_mlp": 1.0478375, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9029387971382474, + "language_loss": 0.69863129, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72051299, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 2986, + "time_per_iteration": 2.4629499912261963 + }, + { + "auxiliary_loss_clip": 0.01144523, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02507591, + "balance_loss_mlp": 1.04828227, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.203273814413497, + "language_loss": 0.77872753, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80060714, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96484375, + "step": 2987, + "time_per_iteration": 2.524712562561035 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.02412844, + "balance_loss_mlp": 1.04834962, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.225668764256514, + "language_loss": 0.78012109, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.8020528, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2988, + "time_per_iteration": 2.4608163833618164 + }, + { + "auxiliary_loss_clip": 0.01048374, + "auxiliary_loss_mlp": 0.01007691, + "balance_loss_clip": 1.00570035, + "balance_loss_mlp": 1.0154314, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7961406236538413, + "language_loss": 0.62767559, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64823627, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33007812, + "step": 2989, + "time_per_iteration": 2.9831957817077637 + }, + { + "auxiliary_loss_clip": 0.01146079, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04836369, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4434429944335525, + "language_loss": 0.70464563, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72651821, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.98046875, + "step": 2990, + "time_per_iteration": 2.556100606918335 + }, + { + "auxiliary_loss_clip": 0.01146243, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.0210464, + "balance_loss_mlp": 1.04735422, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7649502456354873, + "language_loss": 0.68110204, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70295459, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 2991, + "time_per_iteration": 2.6224544048309326 + }, + { + "auxiliary_loss_clip": 0.01145545, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04794931, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5716432326573742, + "language_loss": 0.82754636, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84940296, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2992, + "time_per_iteration": 2.51824951171875 + }, + { + "auxiliary_loss_clip": 0.01138637, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.02301776, + "balance_loss_mlp": 1.04464579, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.1353598877924806, + "language_loss": 0.81958085, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84137177, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 2993, + "time_per_iteration": 2.4349074363708496 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.02889621, + "balance_loss_mlp": 1.04586673, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7254805142405878, + "language_loss": 0.78390837, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80581975, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2994, + "time_per_iteration": 2.4898691177368164 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.04966068, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.8689491925476576, + "language_loss": 0.80392146, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82584035, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2995, + "time_per_iteration": 2.4521572589874268 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.04679298, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5635152056288029, + "language_loss": 0.84467834, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86658335, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.97265625, + "step": 2996, + "time_per_iteration": 2.46993088722229 + }, + { + "auxiliary_loss_clip": 0.01139788, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.04656756, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.9197857622903793, + "language_loss": 0.88254511, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90436304, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 2997, + "time_per_iteration": 2.470113515853882 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.02435279, + "balance_loss_mlp": 1.04666877, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.5635961030192935, + "language_loss": 0.8504566, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87237728, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2998, + "time_per_iteration": 2.5252864360809326 + }, + { + "auxiliary_loss_clip": 0.0114547, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783537, + "balance_loss_mlp": 1.05022454, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8695557812200347, + "language_loss": 0.84270376, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86460871, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 2999, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.01143823, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.02577412, + "balance_loss_mlp": 1.04662383, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7700032623605295, + "language_loss": 0.74753368, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76941276, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 3000, + "time_per_iteration": 2.4800922870635986 + }, + { + "auxiliary_loss_clip": 0.01143329, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.04111791, + "balance_loss_mlp": 1.04825568, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.369063359757221, + "language_loss": 0.71625632, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73829091, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3001, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.03010964, + "balance_loss_mlp": 1.04815364, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.7240097708601225, + "language_loss": 0.87795258, + "learning_rate": 3.767096425420011e-06, + "loss": 0.89992571, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 3002, + "time_per_iteration": 2.4881784915924072 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02689672, + "balance_loss_mlp": 1.04694915, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6880476069492312, + "language_loss": 0.80563951, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8275311, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9765625, + "step": 3003, + "time_per_iteration": 2.452103614807129 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.04780829, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.4630533980116804, + "language_loss": 0.66931474, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69124347, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3004, + "time_per_iteration": 2.5085701942443848 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.04860806, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.8927608809249736, + "language_loss": 0.85172975, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87370586, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.984375, + "step": 3005, + "time_per_iteration": 2.44529128074646 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02611172, + "balance_loss_mlp": 1.04684031, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.553419886600377, + "language_loss": 0.82951266, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85135704, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94921875, + "step": 3006, + "time_per_iteration": 2.532597780227661 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.0315007, + "balance_loss_mlp": 1.04581141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6363768703600998, + "language_loss": 0.76883924, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79078454, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.98046875, + "step": 3007, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.01358199, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8067080511403597, + "language_loss": 0.56949043, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59000474, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 3008, + "time_per_iteration": 3.1923961639404297 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.04951596, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8063105677439477, + "language_loss": 0.67226636, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69423479, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3009, + "time_per_iteration": 2.467525005340576 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02993655, + "balance_loss_mlp": 1.04874969, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.842230928142314, + "language_loss": 0.75573891, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77769208, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.98828125, + "step": 3010, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.02585649, + "balance_loss_mlp": 1.04816866, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6130539386655762, + "language_loss": 0.66672593, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6885612, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3011, + "time_per_iteration": 2.461749792098999 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.0309006, + "balance_loss_mlp": 1.04706419, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.1517129990512927, + "language_loss": 0.71184897, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73375839, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3012, + "time_per_iteration": 2.7380943298339844 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.03045654, + "balance_loss_mlp": 1.05109787, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.2489260815019447, + "language_loss": 0.62039113, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64232826, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3013, + "time_per_iteration": 2.5800936222076416 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.04870379, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.5535403171237991, + "language_loss": 0.76026124, + "learning_rate": 3.764902795998309e-06, + "loss": 0.7822203, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3014, + "time_per_iteration": 2.5049405097961426 + }, + { + "auxiliary_loss_clip": 0.01151342, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.02697504, + "balance_loss_mlp": 1.05086446, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7733972454950666, + "language_loss": 0.65696967, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67894971, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3015, + "time_per_iteration": 2.52614426612854 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.0262742, + "balance_loss_mlp": 1.0490694, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7500400577379265, + "language_loss": 0.7809943, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80287266, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3016, + "time_per_iteration": 2.4736039638519287 + }, + { + "auxiliary_loss_clip": 0.01152649, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.03214788, + "balance_loss_mlp": 1.05294776, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6390488083316745, + "language_loss": 0.83498454, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85701871, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 3017, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01142751, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.0486486, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2301629944757964, + "language_loss": 0.67067724, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69249976, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3018, + "time_per_iteration": 3.950299024581909 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.04928112, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.174717508383113, + "language_loss": 0.75745898, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77930045, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 3019, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02653718, + "balance_loss_mlp": 1.05230832, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1373464597463574, + "language_loss": 0.81687438, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83882844, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3020, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02373672, + "balance_loss_mlp": 1.05124021, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.9178918869439654, + "language_loss": 0.77220714, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.96875, + "step": 3021, + "time_per_iteration": 2.4856297969818115 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.04617524, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7066661124221545, + "language_loss": 0.84841502, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87025082, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3022, + "time_per_iteration": 2.4933700561523438 + }, + { + "auxiliary_loss_clip": 0.01148694, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.0491302, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9452352079001236, + "language_loss": 0.69178426, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7136941, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3023, + "time_per_iteration": 2.495107412338257 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.04748738, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9417078000950883, + "language_loss": 0.73956865, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76145792, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3024, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.0490942, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.344564071286257, + "language_loss": 0.88167858, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90356255, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3025, + "time_per_iteration": 2.4708051681518555 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01052568, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.05046904, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.755473586939447, + "language_loss": 0.79284346, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.8148272, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3026, + "time_per_iteration": 2.482987403869629 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.03424227, + "balance_loss_mlp": 1.0502665, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6571051349992714, + "language_loss": 0.76047945, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78250599, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98828125, + "step": 3027, + "time_per_iteration": 2.4952149391174316 + }, + { + "auxiliary_loss_clip": 0.01151758, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.03055763, + "balance_loss_mlp": 1.05106115, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.7989426432275553, + "language_loss": 0.85400331, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87601155, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3028, + "time_per_iteration": 2.438113212585449 + }, + { + "auxiliary_loss_clip": 0.01144845, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.04937243, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8205418995180693, + "language_loss": 0.82655656, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84843719, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3029, + "time_per_iteration": 2.4866995811462402 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.02640462, + "balance_loss_mlp": 1.05306637, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0975281503542433, + "language_loss": 0.78150737, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80348092, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3030, + "time_per_iteration": 2.458627700805664 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.02495515, + "balance_loss_mlp": 1.05141127, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.955618442063123, + "language_loss": 0.85318518, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87512928, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.99609375, + "step": 3031, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01045881, + "balance_loss_clip": 1.02824974, + "balance_loss_mlp": 1.05232072, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8744751837074634, + "language_loss": 0.79713088, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3032, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.0260191, + "balance_loss_mlp": 1.05395341, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.0774072235136964, + "language_loss": 0.81420642, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8362143, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0234375, + "step": 3033, + "time_per_iteration": 2.47562575340271 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01006645, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01995599, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8883360043233282, + "language_loss": 0.63479006, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6553843, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.328125, + "step": 3034, + "time_per_iteration": 2.9712142944335938 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.0263083, + "balance_loss_mlp": 1.05033147, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.0132790953316113, + "language_loss": 0.79684323, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81876773, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3035, + "time_per_iteration": 2.4517030715942383 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.02792096, + "balance_loss_mlp": 1.05231702, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.217606261766961, + "language_loss": 0.84895855, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87087989, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3036, + "time_per_iteration": 2.5017378330230713 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.02524662, + "balance_loss_mlp": 1.04940438, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.216717642760365, + "language_loss": 0.79836094, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82021284, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3037, + "time_per_iteration": 2.4591338634490967 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.05208671, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.68131613553598, + "language_loss": 0.79450762, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81647676, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.9765625, + "step": 3038, + "time_per_iteration": 2.440664768218994 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.03069699, + "balance_loss_mlp": 1.05140162, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3213350225315748, + "language_loss": 0.67311364, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69506919, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3039, + "time_per_iteration": 2.573272466659546 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03323567, + "balance_loss_mlp": 1.05112875, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.9125298187860031, + "language_loss": 0.73687911, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75888336, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3040, + "time_per_iteration": 2.771242618560791 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.04849768, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.8780343880464916, + "language_loss": 0.60176188, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62363702, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3041, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.03756928, + "balance_loss_mlp": 1.05012786, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7488247873746179, + "language_loss": 0.60361505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.6256364, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3042, + "time_per_iteration": 2.7942960262298584 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.04945385, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.6831322617730042, + "language_loss": 0.8769263, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.8988626, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94921875, + "step": 3043, + "time_per_iteration": 2.524871587753296 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.05107832, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9464603469819268, + "language_loss": 0.707008, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72899425, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3044, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01055406, + "balance_loss_clip": 1.03552175, + "balance_loss_mlp": 1.04929996, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.0901220952627497, + "language_loss": 0.64385587, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66591471, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 3045, + "time_per_iteration": 2.592855453491211 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.04977548, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.998731206361719, + "language_loss": 0.79165137, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81365317, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3046, + "time_per_iteration": 2.5034587383270264 + }, + { + "auxiliary_loss_clip": 0.01146985, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.04764223, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3529268295267016, + "language_loss": 0.78991181, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81186271, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 3047, + "time_per_iteration": 2.5140535831451416 + }, + { + "auxiliary_loss_clip": 0.01145799, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02923381, + "balance_loss_mlp": 1.05111742, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5613113238500957, + "language_loss": 0.80888635, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83081341, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3048, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.02283192, + "balance_loss_mlp": 1.0502528, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8161394933049422, + "language_loss": 0.86232805, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88422966, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9765625, + "step": 3049, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.05159521, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.2703740748038066, + "language_loss": 0.77160966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79358685, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 3050, + "time_per_iteration": 2.4525256156921387 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.02683592, + "balance_loss_mlp": 1.04867804, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.010292972394078, + "language_loss": 0.99174476, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.0136615, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3051, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01145751, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.050529, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5992624239842805, + "language_loss": 0.86153144, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3052, + "time_per_iteration": 2.559396505355835 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.03267264, + "balance_loss_mlp": 1.04985499, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8182752776897229, + "language_loss": 0.73004341, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75200558, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3053, + "time_per_iteration": 2.4481074810028076 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03051662, + "balance_loss_mlp": 1.05208337, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6467304764216655, + "language_loss": 0.62212563, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64412701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 3054, + "time_per_iteration": 2.5701377391815186 + }, + { + "auxiliary_loss_clip": 0.01146023, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.04962707, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2210920593094325, + "language_loss": 0.78501689, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80690485, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3055, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.04779387, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.894881128028073, + "language_loss": 0.70218527, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72414786, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3056, + "time_per_iteration": 2.541361093521118 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.02612543, + "balance_loss_mlp": 1.05066419, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4932354373853338, + "language_loss": 0.8028152, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82474422, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3057, + "time_per_iteration": 2.4718995094299316 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.02864265, + "balance_loss_mlp": 1.04847729, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0112890674266914, + "language_loss": 0.82289785, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84491444, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 3058, + "time_per_iteration": 2.4653379917144775 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_clip": 1.02818882, + "balance_loss_mlp": 1.04893029, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.205773819593527, + "language_loss": 0.85894352, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88088906, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 3059, + "time_per_iteration": 4.0151047706604 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.02724195, + "balance_loss_mlp": 1.04931092, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.70952354928268, + "language_loss": 0.72799402, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74990445, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3060, + "time_per_iteration": 5.466471195220947 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.05253565, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7373746338425942, + "language_loss": 0.72797298, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74991357, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.98828125, + "step": 3061, + "time_per_iteration": 2.5244035720825195 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.02697313, + "balance_loss_mlp": 1.05087519, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8714044833418495, + "language_loss": 0.81622046, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83814156, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3062, + "time_per_iteration": 2.4767649173736572 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.02681041, + "balance_loss_mlp": 1.05394542, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.7582970194369052, + "language_loss": 0.72718614, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74918652, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3063, + "time_per_iteration": 2.5082144737243652 + }, + { + "auxiliary_loss_clip": 0.01146453, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.04935837, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.1216519555610183, + "language_loss": 0.65496099, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6768434, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3064, + "time_per_iteration": 2.523141622543335 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.03081298, + "balance_loss_mlp": 1.05274165, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.6163412642887947, + "language_loss": 0.68768656, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70966971, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3065, + "time_per_iteration": 2.5244293212890625 + }, + { + "auxiliary_loss_clip": 0.01151353, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.02819824, + "balance_loss_mlp": 1.05120087, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 4.932084281869228, + "language_loss": 0.72561431, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74760187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3066, + "time_per_iteration": 2.5428919792175293 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.05074954, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9825677919996112, + "language_loss": 0.82477474, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84669906, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3067, + "time_per_iteration": 2.4500880241394043 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00535476, + "balance_loss_mlp": 1.01668859, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7924805733675573, + "language_loss": 0.59706604, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61763, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32226562, + "step": 3068, + "time_per_iteration": 2.9375104904174805 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.05714762, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8708990955689164, + "language_loss": 0.76227212, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78420615, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3069, + "time_per_iteration": 2.462446451187134 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.02233863, + "balance_loss_mlp": 1.05299067, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7428293735192475, + "language_loss": 0.84803855, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86996043, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3070, + "time_per_iteration": 2.4887194633483887 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.02462363, + "balance_loss_mlp": 1.05298758, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.9722863584187038, + "language_loss": 0.77370453, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79565221, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 3071, + "time_per_iteration": 2.482213258743286 + }, + { + "auxiliary_loss_clip": 0.01152228, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.05342758, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.705053980849468, + "language_loss": 0.77691031, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79891801, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 3072, + "time_per_iteration": 2.466387987136841 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.05013216, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8173375196390826, + "language_loss": 0.8607235, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88264889, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3073, + "time_per_iteration": 2.4510810375213623 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.05339348, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2059027996031877, + "language_loss": 0.92005521, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9420172, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.97265625, + "step": 3074, + "time_per_iteration": 2.473710298538208 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.02490735, + "balance_loss_mlp": 1.05028176, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9913742546968862, + "language_loss": 0.65041798, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67233044, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3075, + "time_per_iteration": 2.533724784851074 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.053177, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.709240712607824, + "language_loss": 0.72323918, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74516779, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3076, + "time_per_iteration": 2.4544899463653564 + }, + { + "auxiliary_loss_clip": 0.01153692, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.0280292, + "balance_loss_mlp": 1.05341136, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4900368363969854, + "language_loss": 0.80860448, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83060181, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3077, + "time_per_iteration": 2.45137882232666 + }, + { + "auxiliary_loss_clip": 0.01146798, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02816749, + "balance_loss_mlp": 1.05103469, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7908770900539794, + "language_loss": 0.78764129, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8095665, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3078, + "time_per_iteration": 2.477393865585327 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02714717, + "balance_loss_mlp": 1.05057585, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8549646444276375, + "language_loss": 0.7758081, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773009, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9765625, + "step": 3079, + "time_per_iteration": 2.5069448947906494 + }, + { + "auxiliary_loss_clip": 0.01143899, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.02581406, + "balance_loss_mlp": 1.04723024, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.3452692712375893, + "language_loss": 0.81668431, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83855557, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3080, + "time_per_iteration": 2.688206911087036 + }, + { + "auxiliary_loss_clip": 0.01149652, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.02812803, + "balance_loss_mlp": 1.05079699, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.0276132956863764, + "language_loss": 0.7435087, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.7654745, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3081, + "time_per_iteration": 2.5003983974456787 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.03124547, + "balance_loss_mlp": 1.05527234, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.070281784994394, + "language_loss": 0.71532816, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73734742, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9609375, + "step": 3082, + "time_per_iteration": 2.514004707336426 + }, + { + "auxiliary_loss_clip": 0.011545, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03155267, + "balance_loss_mlp": 1.05488813, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 1.869200996989063, + "language_loss": 0.69338834, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71543807, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3083, + "time_per_iteration": 2.446418523788452 + }, + { + "auxiliary_loss_clip": 0.0114679, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03187287, + "balance_loss_mlp": 1.05216169, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 4.022344342016001, + "language_loss": 0.68854296, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71050388, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3084, + "time_per_iteration": 2.5964090824127197 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.04961908, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5883609883793584, + "language_loss": 0.77831411, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80020249, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3085, + "time_per_iteration": 2.500401020050049 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.04887915, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8880953488015286, + "language_loss": 0.73488086, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7568658, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3086, + "time_per_iteration": 2.5121798515319824 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.02949429, + "balance_loss_mlp": 1.05223882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 4.074676999617497, + "language_loss": 0.70087367, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72282737, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.953125, + "step": 3087, + "time_per_iteration": 2.469980001449585 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.05118215, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.299065028063824, + "language_loss": 0.72731185, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74929065, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3088, + "time_per_iteration": 2.4569249153137207 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02733839, + "balance_loss_mlp": 1.05015588, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.023411505730453, + "language_loss": 0.91849768, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94039273, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94140625, + "step": 3089, + "time_per_iteration": 2.5086276531219482 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02573323, + "balance_loss_mlp": 1.05124271, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.7535733421879174, + "language_loss": 0.57406759, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59596992, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.953125, + "step": 3090, + "time_per_iteration": 2.544934034347534 + }, + { + "auxiliary_loss_clip": 0.011443, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03133333, + "balance_loss_mlp": 1.04945779, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9526543189913628, + "language_loss": 0.82229531, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84423304, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3091, + "time_per_iteration": 2.5339536666870117 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03165662, + "balance_loss_mlp": 1.05212235, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0588011246991127, + "language_loss": 0.83561456, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85760063, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3092, + "time_per_iteration": 2.5091474056243896 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.05010569, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 4.142827775979207, + "language_loss": 0.93487823, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95683277, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 3093, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01146588, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.02115917, + "balance_loss_mlp": 1.05090082, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.060946690404802, + "language_loss": 0.77380008, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79564774, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3094, + "time_per_iteration": 2.4520375728607178 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03098452, + "balance_loss_mlp": 1.05099964, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6535165555915046, + "language_loss": 0.69985378, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72180283, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3095, + "time_per_iteration": 2.7395834922790527 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01045107, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05169249, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.9053555001005595, + "language_loss": 0.8077082, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82965505, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.98046875, + "step": 3096, + "time_per_iteration": 2.4506232738494873 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05086875, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.061308652340225, + "language_loss": 0.75101036, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77295941, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3097, + "time_per_iteration": 2.46639347076416 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.05196047, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.5365100966912664, + "language_loss": 0.66038394, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68231571, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3098, + "time_per_iteration": 2.46763014793396 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.02973545, + "balance_loss_mlp": 1.04978585, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6025275160282182, + "language_loss": 0.69907904, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72105503, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 3099, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.03290749, + "balance_loss_mlp": 1.04985309, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4888180158498334, + "language_loss": 0.71623552, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73823702, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 3100, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.03104091, + "balance_loss_mlp": 1.05147338, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.2181859131844757, + "language_loss": 0.80163074, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82364118, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3101, + "time_per_iteration": 4.007607936859131 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.05100489, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.082156961368248, + "language_loss": 0.76803768, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78991693, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3102, + "time_per_iteration": 5.438685894012451 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02322865, + "balance_loss_mlp": 1.04973269, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.5595226686006565, + "language_loss": 0.76962835, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79151165, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3103, + "time_per_iteration": 2.4742202758789062 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02540123, + "balance_loss_mlp": 1.05014729, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.966347666558745, + "language_loss": 0.79074025, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81264877, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3104, + "time_per_iteration": 2.4873924255371094 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.02653468, + "balance_loss_mlp": 1.05237842, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.943867006204371, + "language_loss": 0.8519029, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87382948, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3105, + "time_per_iteration": 2.488638162612915 + }, + { + "auxiliary_loss_clip": 0.01152184, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.02872288, + "balance_loss_mlp": 1.0491997, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.7838474228223986, + "language_loss": 0.86952424, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89152563, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 3106, + "time_per_iteration": 2.5103402137756348 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.05296755, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9680738799082358, + "language_loss": 0.78253353, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80451989, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 3107, + "time_per_iteration": 2.44567608833313 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.03179181, + "balance_loss_mlp": 1.05040216, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.9125203241398734, + "language_loss": 0.74114668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76316506, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3108, + "time_per_iteration": 2.5254971981048584 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.02684629, + "balance_loss_mlp": 1.05332017, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6536820415924105, + "language_loss": 0.74707133, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76903957, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98046875, + "step": 3109, + "time_per_iteration": 2.426945924758911 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.02845001, + "balance_loss_mlp": 1.05078959, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.4293009008592994, + "language_loss": 0.84324062, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86519247, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3110, + "time_per_iteration": 2.4744956493377686 + }, + { + "auxiliary_loss_clip": 0.01154611, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02472341, + "balance_loss_mlp": 1.05598927, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.633662412254079, + "language_loss": 0.84753799, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86951482, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3111, + "time_per_iteration": 2.4757230281829834 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.02408528, + "balance_loss_mlp": 1.05231404, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8513735900463348, + "language_loss": 0.76565534, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78757566, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9765625, + "step": 3112, + "time_per_iteration": 2.465552806854248 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.0516355, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8580615351340177, + "language_loss": 0.64277315, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66475397, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3113, + "time_per_iteration": 2.491805076599121 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.0528996, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.238258329288858, + "language_loss": 0.81043601, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83247173, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 3114, + "time_per_iteration": 2.4947290420532227 + }, + { + "auxiliary_loss_clip": 0.01153492, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.05319226, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2102322241331467, + "language_loss": 0.57819968, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0, + "step": 3115, + "time_per_iteration": 2.4892075061798096 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.05434299, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8141768865365742, + "language_loss": 0.71160758, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73368567, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96484375, + "step": 3116, + "time_per_iteration": 2.4705467224121094 + }, + { + "auxiliary_loss_clip": 0.01142667, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01751065, + "balance_loss_mlp": 1.04771161, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.8736078530078255, + "language_loss": 0.78733885, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80912256, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3117, + "time_per_iteration": 2.418527126312256 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02898717, + "balance_loss_mlp": 1.05421317, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.743274375857092, + "language_loss": 0.83945131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86145031, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.97265625, + "step": 3118, + "time_per_iteration": 2.5691416263580322 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.03109384, + "balance_loss_mlp": 1.0525409, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7594323212393352, + "language_loss": 0.76151264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78351927, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3119, + "time_per_iteration": 2.459648847579956 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.05181718, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.593515591831454, + "language_loss": 0.81975627, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84180319, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3120, + "time_per_iteration": 2.478870153427124 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.05178094, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.7598733043788508, + "language_loss": 0.8513701, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.873285, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3121, + "time_per_iteration": 2.5178277492523193 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.02976704, + "balance_loss_mlp": 1.05281448, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.163070382320244, + "language_loss": 0.70038795, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72237968, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 3122, + "time_per_iteration": 2.5523242950439453 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02524245, + "balance_loss_mlp": 1.05194402, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.352571744641408, + "language_loss": 0.7034744, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72541201, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9609375, + "step": 3123, + "time_per_iteration": 2.4145569801330566 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.05238771, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.0330816469172097, + "language_loss": 0.73851109, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76047611, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3124, + "time_per_iteration": 2.497352123260498 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.05275774, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9990758157966066, + "language_loss": 0.80601895, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82805508, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0, + "step": 3125, + "time_per_iteration": 2.605851411819458 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01009024, + "balance_loss_clip": 1.00697315, + "balance_loss_mlp": 1.02352476, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9386177249275542, + "language_loss": 0.63591504, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656781, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.328125, + "step": 3126, + "time_per_iteration": 3.0943961143493652 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.02366543, + "balance_loss_mlp": 1.05439222, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.7984129752859428, + "language_loss": 0.81274688, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83466977, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3127, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01054784, + "auxiliary_loss_mlp": 0.0100739, + "balance_loss_clip": 1.00543487, + "balance_loss_mlp": 1.02235639, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7620779230288282, + "language_loss": 0.6191628, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63978451, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.32421875, + "step": 3128, + "time_per_iteration": 3.1384503841400146 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.05182266, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.171302965646948, + "language_loss": 0.71237707, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73433876, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 3129, + "time_per_iteration": 2.560601234436035 + }, + { + "auxiliary_loss_clip": 0.01149923, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.05224252, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.040923932078449, + "language_loss": 0.85375232, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87576246, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3130, + "time_per_iteration": 2.4366040229797363 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.04844868, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9842347260172397, + "language_loss": 0.77227372, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0, + "step": 3131, + "time_per_iteration": 2.503112554550171 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 1.05402517, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8095346888628816, + "language_loss": 0.81244844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.834436, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.96875, + "step": 3132, + "time_per_iteration": 2.5265986919403076 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.02939904, + "balance_loss_mlp": 1.05395401, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.3595669444771135, + "language_loss": 0.79035556, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81238532, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3133, + "time_per_iteration": 2.500927209854126 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.03224421, + "balance_loss_mlp": 1.05204821, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 4.024150314183157, + "language_loss": 0.82826144, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3134, + "time_per_iteration": 2.4773380756378174 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.05027199, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4735244825899, + "language_loss": 0.82783771, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8497771, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96484375, + "step": 3135, + "time_per_iteration": 2.4957115650177 + }, + { + "auxiliary_loss_clip": 0.01149872, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.03343356, + "balance_loss_mlp": 1.0503304, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8513380433423674, + "language_loss": 0.79031271, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81233823, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9921875, + "step": 3136, + "time_per_iteration": 2.556800127029419 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.05327463, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9366242888645147, + "language_loss": 0.81049621, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 3137, + "time_per_iteration": 2.487513542175293 + }, + { + "auxiliary_loss_clip": 0.01151307, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.05406666, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5870634004860276, + "language_loss": 0.8119483, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83403158, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.97265625, + "step": 3138, + "time_per_iteration": 2.4554855823516846 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.05190897, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.760814692015778, + "language_loss": 0.636096, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6581319, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 3139, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01146092, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.03046215, + "balance_loss_mlp": 1.04812348, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.433795452320061, + "language_loss": 0.71546841, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73742986, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98046875, + "step": 3140, + "time_per_iteration": 2.4519457817077637 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 1.04848385, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.1391974719951574, + "language_loss": 0.87001872, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89196658, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98828125, + "step": 3141, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.02460694, + "balance_loss_mlp": 1.05144691, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.021325930100965, + "language_loss": 0.77418405, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79616946, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0234375, + "step": 3142, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.05104184, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6841374820722228, + "language_loss": 0.78446913, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80637825, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.97265625, + "step": 3143, + "time_per_iteration": 3.9074132442474365 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_clip": 1.03081727, + "balance_loss_mlp": 1.05069065, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 4.1822349926512485, + "language_loss": 0.71507585, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73707104, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 3144, + "time_per_iteration": 3.981715679168701 + }, + { + "auxiliary_loss_clip": 0.01152034, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.0513736, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.6203593578621893, + "language_loss": 0.73683178, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75880861, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3145, + "time_per_iteration": 2.5101706981658936 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.02783298, + "balance_loss_mlp": 1.04759097, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.6756165752276027, + "language_loss": 0.77081764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79271269, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3146, + "time_per_iteration": 2.4278056621551514 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.02811205, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.794796296308648, + "language_loss": 0.78377169, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80571997, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3147, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.0310235, + "balance_loss_mlp": 1.0499115, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.2769360880247853, + "language_loss": 0.67016155, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69212711, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3148, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01145427, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02037382, + "balance_loss_mlp": 1.04898858, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.644784357412393, + "language_loss": 0.75978655, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78161824, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3149, + "time_per_iteration": 2.4768459796905518 + }, + { + "auxiliary_loss_clip": 0.01143839, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.05033517, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.9181295874949735, + "language_loss": 0.81229341, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83420789, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3150, + "time_per_iteration": 2.42832088470459 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02886271, + "balance_loss_mlp": 1.05068374, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.066054594612055, + "language_loss": 0.84966886, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87161517, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3151, + "time_per_iteration": 2.458054542541504 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 1.04896331, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9128881662164896, + "language_loss": 0.7443462, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76635695, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.99609375, + "step": 3152, + "time_per_iteration": 2.4904792308807373 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.02937067, + "balance_loss_mlp": 1.0502255, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8661622565083957, + "language_loss": 0.75719136, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77914143, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3153, + "time_per_iteration": 2.5026283264160156 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.04962945, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.8393709351558127, + "language_loss": 0.79529279, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81725931, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 3154, + "time_per_iteration": 2.4544081687927246 + }, + { + "auxiliary_loss_clip": 0.01151316, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02919698, + "balance_loss_mlp": 1.04986668, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.673670363277482, + "language_loss": 0.72798991, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74998182, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 3155, + "time_per_iteration": 2.425431728363037 + }, + { + "auxiliary_loss_clip": 0.01145009, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.03042662, + "balance_loss_mlp": 1.04930019, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.676026678838244, + "language_loss": 0.73911691, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76105046, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3156, + "time_per_iteration": 2.4683640003204346 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.03281915, + "balance_loss_mlp": 1.05195308, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.5984593201401434, + "language_loss": 0.68251741, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70451397, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9765625, + "step": 3157, + "time_per_iteration": 2.472182512283325 + }, + { + "auxiliary_loss_clip": 0.01146139, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.02539706, + "balance_loss_mlp": 1.04914486, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.9937577865402571, + "language_loss": 0.80197155, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82386756, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3158, + "time_per_iteration": 2.4978723526000977 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02624583, + "balance_loss_mlp": 1.05201745, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.9065090881698699, + "language_loss": 0.71940476, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74138498, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 3159, + "time_per_iteration": 2.503129720687866 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.05255282, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8597759984302606, + "language_loss": 0.85071993, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8727113, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3160, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.03235734, + "balance_loss_mlp": 1.050807, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7120140162377986, + "language_loss": 0.73554128, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75746381, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3161, + "time_per_iteration": 2.5551726818084717 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.02982974, + "balance_loss_mlp": 1.05420387, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.554139282497156, + "language_loss": 0.80939364, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83137655, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3162, + "time_per_iteration": 2.609764337539673 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.02486265, + "balance_loss_mlp": 1.05257571, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 1.8884975109329094, + "language_loss": 0.75600141, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77792686, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3163, + "time_per_iteration": 2.4494824409484863 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 1.05577397, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5980783305445414, + "language_loss": 0.74197054, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76386476, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.94140625, + "step": 3164, + "time_per_iteration": 2.5901739597320557 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.02151656, + "balance_loss_mlp": 1.05402589, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5830796140792522, + "language_loss": 0.66913098, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69101042, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3165, + "time_per_iteration": 2.899500608444214 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.05282831, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1716027754337257, + "language_loss": 0.7452209, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76715726, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3166, + "time_per_iteration": 2.4325685501098633 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01017161, + "balance_loss_clip": 1.01490772, + "balance_loss_mlp": 1.02902174, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8067170187870535, + "language_loss": 0.50396568, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52476352, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.3359375, + "step": 3167, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.05208659, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7496006549093657, + "language_loss": 0.74235475, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76431435, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3168, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01004786, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.02649927, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8615778549663292, + "language_loss": 0.60097563, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6216197, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.33203125, + "step": 3169, + "time_per_iteration": 2.958176851272583 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.03371537, + "balance_loss_mlp": 1.05302989, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.550337238497042, + "language_loss": 0.77976263, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80180222, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.99609375, + "step": 3170, + "time_per_iteration": 2.5174756050109863 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.02960575, + "balance_loss_mlp": 1.05185819, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7193055204742105, + "language_loss": 0.78597021, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80789012, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3171, + "time_per_iteration": 2.4895551204681396 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.05111575, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 3.5246110250440386, + "language_loss": 0.78578937, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80772865, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3172, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.05253482, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.581476317811461, + "language_loss": 0.80126482, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82329178, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3173, + "time_per_iteration": 2.464979410171509 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.03432083, + "balance_loss_mlp": 1.05250478, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9222394249434893, + "language_loss": 0.78740567, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.8094219, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3174, + "time_per_iteration": 2.540959358215332 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.05367374, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8458147293094664, + "language_loss": 0.80757344, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82954776, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3175, + "time_per_iteration": 2.441190481185913 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03821599, + "balance_loss_mlp": 1.0521791, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.3562328324004445, + "language_loss": 0.85142022, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87347412, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3176, + "time_per_iteration": 2.4397072792053223 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.036515, + "balance_loss_mlp": 1.05395234, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.002060812172469, + "language_loss": 0.81206596, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415473, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3177, + "time_per_iteration": 2.4980266094207764 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.03073931, + "balance_loss_mlp": 1.0503974, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 1.9374450898751996, + "language_loss": 0.74628592, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3178, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.05001104, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8429055258583904, + "language_loss": 0.8167876, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83865643, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3179, + "time_per_iteration": 2.452310800552368 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02892482, + "balance_loss_mlp": 1.05279994, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.1508657656276484, + "language_loss": 0.7946887, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81664455, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3180, + "time_per_iteration": 2.451066732406616 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.02770984, + "balance_loss_mlp": 1.04780042, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.916741655382754, + "language_loss": 0.79891652, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82080674, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3181, + "time_per_iteration": 2.4310615062713623 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.03623664, + "balance_loss_mlp": 1.04858851, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7607714952320546, + "language_loss": 0.73820639, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76020634, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3182, + "time_per_iteration": 2.4712350368499756 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.03093314, + "balance_loss_mlp": 1.05187011, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8018319163421928, + "language_loss": 0.6486634, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67063105, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 3183, + "time_per_iteration": 2.440232753753662 + }, + { + "auxiliary_loss_clip": 0.01145449, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.02920759, + "balance_loss_mlp": 1.04864669, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.760716170695104, + "language_loss": 0.73234087, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7542752, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3184, + "time_per_iteration": 3.9211573600769043 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0281471, + "balance_loss_mlp": 1.04738748, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.1066155051108315, + "language_loss": 0.8784132, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9003495, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 3185, + "time_per_iteration": 5.396124601364136 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.02803612, + "balance_loss_mlp": 1.04899192, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.10102369978198, + "language_loss": 0.72667789, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74857807, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3186, + "time_per_iteration": 2.498241901397705 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02244437, + "balance_loss_mlp": 1.054919, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.264264166459479, + "language_loss": 0.83865881, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86061311, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 3187, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01051867, + "auxiliary_loss_mlp": 0.01015636, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01988959, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8634842964488614, + "language_loss": 0.55803859, + "learning_rate": 3.732018351516544e-06, + "loss": 0.5787136, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3188, + "time_per_iteration": 3.0815136432647705 + }, + { + "auxiliary_loss_clip": 0.01145462, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.04972625, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.71302722892552, + "language_loss": 0.70180511, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72381759, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.95703125, + "step": 3189, + "time_per_iteration": 2.5380465984344482 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.04853344, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.222159201352765, + "language_loss": 0.74234986, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76410198, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3190, + "time_per_iteration": 2.5862700939178467 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.04965627, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.8818377537371913, + "language_loss": 0.8394708, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86146975, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3191, + "time_per_iteration": 2.5077905654907227 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.04766488, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7694679756443132, + "language_loss": 0.89325655, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91504252, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3192, + "time_per_iteration": 2.4738776683807373 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.0531472, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.352703418633998, + "language_loss": 0.74830496, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77034831, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9765625, + "step": 3193, + "time_per_iteration": 2.47143816947937 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.04918766, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7283890992056894, + "language_loss": 0.74733245, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7692579, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9765625, + "step": 3194, + "time_per_iteration": 2.5001959800720215 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.00319958, + "balance_loss_mlp": 1.01851392, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7975785668902318, + "language_loss": 0.68455988, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70511955, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3203125, + "step": 3195, + "time_per_iteration": 3.014677047729492 + }, + { + "auxiliary_loss_clip": 0.01146296, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0254823, + "balance_loss_mlp": 1.05066323, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 1.9672517867074575, + "language_loss": 0.72712696, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74902254, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.95703125, + "step": 3196, + "time_per_iteration": 2.4855856895446777 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.0284251, + "balance_loss_mlp": 1.05643284, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8158077484015336, + "language_loss": 0.83774233, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85972691, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.953125, + "step": 3197, + "time_per_iteration": 2.4530181884765625 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02565312, + "balance_loss_mlp": 1.05036283, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.295881830513264, + "language_loss": 0.80459738, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82650983, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3198, + "time_per_iteration": 2.4882590770721436 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01050243, + "balance_loss_clip": 1.03090763, + "balance_loss_mlp": 1.04984999, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9800701307051174, + "language_loss": 0.7862891, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80827522, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3199, + "time_per_iteration": 2.507227659225464 + }, + { + "auxiliary_loss_clip": 0.01146428, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.05150342, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.05190707233933, + "language_loss": 0.83391261, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85580671, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.94921875, + "step": 3200, + "time_per_iteration": 2.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01051054, + "balance_loss_clip": 1.03286231, + "balance_loss_mlp": 1.0524931, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.0233550639398428, + "language_loss": 0.78678542, + "learning_rate": 3.729481161172443e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.95703125, + "step": 3201, + "time_per_iteration": 2.435478448867798 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.02874875, + "balance_loss_mlp": 1.05050445, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.1716175760371814, + "language_loss": 0.69168961, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71364617, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3202, + "time_per_iteration": 2.4596354961395264 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.01790023, + "balance_loss_mlp": 1.05140352, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7015130302687178, + "language_loss": 0.91123176, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93303871, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3203, + "time_per_iteration": 2.4425902366638184 + }, + { + "auxiliary_loss_clip": 0.01147002, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.03127956, + "balance_loss_mlp": 1.05008471, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.129263396651385, + "language_loss": 0.81766933, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83964062, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96875, + "step": 3204, + "time_per_iteration": 2.4466230869293213 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.03022599, + "balance_loss_mlp": 1.0497942, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.065510679734303, + "language_loss": 0.75797462, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77988648, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3205, + "time_per_iteration": 2.439906358718872 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02958953, + "balance_loss_mlp": 1.05312991, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.4125731541540465, + "language_loss": 0.83020669, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85218459, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 3206, + "time_per_iteration": 2.463888168334961 + }, + { + "auxiliary_loss_clip": 0.0104957, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.01731467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8499440783854421, + "language_loss": 0.60609913, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62663066, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 3207, + "time_per_iteration": 2.8865902423858643 + }, + { + "auxiliary_loss_clip": 0.01147085, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.02789569, + "balance_loss_mlp": 1.05069125, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4047527057594564, + "language_loss": 0.75119245, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77312136, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3208, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01146825, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.04890394, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.3372356299161696, + "language_loss": 0.60567236, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62762815, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3209, + "time_per_iteration": 2.4695677757263184 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.04981887, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9457412312791633, + "language_loss": 0.80153656, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82352048, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 3210, + "time_per_iteration": 2.6459405422210693 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.02437103, + "balance_loss_mlp": 1.04580569, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.107646167575127, + "language_loss": 0.82575119, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84755266, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3211, + "time_per_iteration": 2.454702615737915 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01012694, + "balance_loss_clip": 1.01057243, + "balance_loss_mlp": 1.01463401, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9758169311408023, + "language_loss": 0.63670558, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729511, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.31640625, + "step": 3212, + "time_per_iteration": 2.914459705352783 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.05140018, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5978218597026725, + "language_loss": 0.76514798, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78707075, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3213, + "time_per_iteration": 2.47961163520813 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.02823281, + "balance_loss_mlp": 1.04934072, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.5461953882780115, + "language_loss": 0.70799339, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72993791, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98046875, + "step": 3214, + "time_per_iteration": 2.4547488689422607 + }, + { + "auxiliary_loss_clip": 0.01142593, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.02808392, + "balance_loss_mlp": 1.0470041, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2459266127411848, + "language_loss": 0.75352395, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77541864, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3215, + "time_per_iteration": 2.4477176666259766 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.04626155, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.304207478946857, + "language_loss": 0.88559556, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90752971, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3216, + "time_per_iteration": 2.499464988708496 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.0474, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.5978066249985532, + "language_loss": 0.79762065, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8195231, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3217, + "time_per_iteration": 2.4428889751434326 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0294652, + "balance_loss_mlp": 1.0504688, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.6606972104147673, + "language_loss": 0.61408496, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63605893, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3218, + "time_per_iteration": 2.4313230514526367 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04883909, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.6811153728366703, + "language_loss": 0.80158418, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82342821, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3219, + "time_per_iteration": 2.4347593784332275 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.03010237, + "balance_loss_mlp": 1.05070114, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.032012314604138, + "language_loss": 0.85781908, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87976086, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3220, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.0477736, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.087292049011103, + "language_loss": 0.84617937, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86794209, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3221, + "time_per_iteration": 2.4601354598999023 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.05009556, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.075109928662421, + "language_loss": 0.85929954, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88121927, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3222, + "time_per_iteration": 2.433027505874634 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.02212656, + "balance_loss_mlp": 1.04663789, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 3.9278404759018053, + "language_loss": 0.78207982, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3223, + "time_per_iteration": 2.4451496601104736 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01047584, + "balance_loss_clip": 1.03013206, + "balance_loss_mlp": 1.04896808, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8200574771064912, + "language_loss": 0.75589085, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77776659, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3224, + "time_per_iteration": 2.4390981197357178 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.02274644, + "balance_loss_mlp": 1.04741263, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.092202382915022, + "language_loss": 0.71141279, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73321629, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3225, + "time_per_iteration": 2.6690707206726074 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.0279572, + "balance_loss_mlp": 1.04787326, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.058354492672399, + "language_loss": 0.6915803, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71344984, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9453125, + "step": 3226, + "time_per_iteration": 3.906217336654663 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.02562809, + "balance_loss_mlp": 1.05274427, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6131772564475266, + "language_loss": 0.76138854, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78327405, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 3227, + "time_per_iteration": 4.168737411499023 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.02547467, + "balance_loss_mlp": 1.04588878, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8539897665707572, + "language_loss": 0.69154215, + "learning_rate": 3.724176216414662e-06, + "loss": 0.7133761, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94921875, + "step": 3228, + "time_per_iteration": 2.4857404232025146 + }, + { + "auxiliary_loss_clip": 0.01142054, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02698135, + "balance_loss_mlp": 1.04929864, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9069922854616745, + "language_loss": 0.7428174, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76467812, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3229, + "time_per_iteration": 2.5357918739318848 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04832351, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6963766145995596, + "language_loss": 0.65157712, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67341059, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3230, + "time_per_iteration": 2.4796855449676514 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.0268054, + "balance_loss_mlp": 1.04652202, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8877471342298004, + "language_loss": 0.8184334, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84025759, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3231, + "time_per_iteration": 2.5315961837768555 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.02045608, + "balance_loss_mlp": 1.05067456, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.787689187471357, + "language_loss": 0.86743605, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88928306, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94921875, + "step": 3232, + "time_per_iteration": 2.4916152954101562 + }, + { + "auxiliary_loss_clip": 0.01141636, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.05008495, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5602267859616314, + "language_loss": 0.8513217, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87326247, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3233, + "time_per_iteration": 2.526118040084839 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.03603804, + "balance_loss_mlp": 1.04827857, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6631942166294669, + "language_loss": 0.89191484, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91390419, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96875, + "step": 3234, + "time_per_iteration": 2.4783849716186523 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.04675341, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.1776085062187374, + "language_loss": 0.78503513, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80690718, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3235, + "time_per_iteration": 2.4414284229278564 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.02545178, + "balance_loss_mlp": 1.05288744, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.115791514531618, + "language_loss": 0.7937218, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81560451, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.93359375, + "step": 3236, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02204323, + "balance_loss_mlp": 1.05156302, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.445233321344346, + "language_loss": 0.75936478, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78121042, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9296875, + "step": 3237, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01147227, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.03005719, + "balance_loss_mlp": 1.05079889, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.0921387862929586, + "language_loss": 0.75056225, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77250135, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96484375, + "step": 3238, + "time_per_iteration": 2.4795806407928467 + }, + { + "auxiliary_loss_clip": 0.01147117, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.05317962, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8233855681516762, + "language_loss": 0.73016453, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75208122, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94140625, + "step": 3239, + "time_per_iteration": 2.4695816040039062 + }, + { + "auxiliary_loss_clip": 0.01144581, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.03026247, + "balance_loss_mlp": 1.0505631, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.897973355517785, + "language_loss": 0.73792124, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75985241, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3240, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.05221701, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8797415358152445, + "language_loss": 0.66685343, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68873608, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94140625, + "step": 3241, + "time_per_iteration": 2.5644116401672363 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0280863, + "balance_loss_mlp": 1.05193758, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4346271942222966, + "language_loss": 0.82889283, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85078967, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3242, + "time_per_iteration": 2.476043701171875 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01010683, + "balance_loss_clip": 1.00856066, + "balance_loss_mlp": 1.02379096, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8482804620416572, + "language_loss": 0.57572454, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59637845, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.30859375, + "step": 3243, + "time_per_iteration": 3.1217525005340576 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.05099249, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.02063631868758, + "language_loss": 0.83243412, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85431218, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3244, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.05495024, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8275576625869878, + "language_loss": 0.77049786, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79245341, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3245, + "time_per_iteration": 2.5539040565490723 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.04852772, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.8639596298576055, + "language_loss": 0.84020388, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86203486, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3246, + "time_per_iteration": 2.5018341541290283 + }, + { + "auxiliary_loss_clip": 0.0114444, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.02834511, + "balance_loss_mlp": 1.04978824, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1267063345385777, + "language_loss": 0.7636531, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78555036, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9453125, + "step": 3247, + "time_per_iteration": 2.4512898921966553 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02228367, + "balance_loss_mlp": 1.05077446, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.4744510548582124, + "language_loss": 0.75330198, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77513552, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3248, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.04661679, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.9881324270373204, + "language_loss": 0.78316575, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80499399, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3249, + "time_per_iteration": 2.475385904312134 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.0316205, + "balance_loss_mlp": 1.04948914, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.839405294960197, + "language_loss": 0.73238158, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7543031, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3250, + "time_per_iteration": 2.4548323154449463 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02206779, + "balance_loss_mlp": 1.04583359, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9014920395959154, + "language_loss": 0.79582441, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8175652, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3251, + "time_per_iteration": 2.4597084522247314 + }, + { + "auxiliary_loss_clip": 0.01142571, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.04888558, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.2143497379473613, + "language_loss": 0.83534026, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85717964, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3252, + "time_per_iteration": 2.4245967864990234 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.03026652, + "balance_loss_mlp": 1.04651105, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7806404718622555, + "language_loss": 0.73870194, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76062191, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3253, + "time_per_iteration": 2.5752809047698975 + }, + { + "auxiliary_loss_clip": 0.01142809, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.02352846, + "balance_loss_mlp": 1.04619944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.833285648050628, + "language_loss": 0.76684111, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78867137, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.96484375, + "step": 3254, + "time_per_iteration": 2.533993721008301 + }, + { + "auxiliary_loss_clip": 0.01044914, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00268674, + "balance_loss_mlp": 1.01349974, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7652407497357797, + "language_loss": 0.55344874, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5739454, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.3125, + "step": 3255, + "time_per_iteration": 3.164173126220703 + }, + { + "auxiliary_loss_clip": 0.01144973, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02407217, + "balance_loss_mlp": 1.05057478, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.650975615707017, + "language_loss": 0.7066443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7285077, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3256, + "time_per_iteration": 2.496424436569214 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.04647136, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.256610935254856, + "language_loss": 0.80055118, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82237899, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3257, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05034149, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.9567741269254724, + "language_loss": 0.74843282, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77029151, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3258, + "time_per_iteration": 2.6177120208740234 + }, + { + "auxiliary_loss_clip": 0.01142767, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.01932144, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7410781544458231, + "language_loss": 0.74462247, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7664147, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3259, + "time_per_iteration": 2.54068660736084 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01923943, + "balance_loss_mlp": 1.04965675, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.380592438675979, + "language_loss": 0.77040654, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7922256, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3260, + "time_per_iteration": 2.4983303546905518 + }, + { + "auxiliary_loss_clip": 0.01143361, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.02459061, + "balance_loss_mlp": 1.0486325, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.011568492365706, + "language_loss": 0.82168972, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84354162, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3261, + "time_per_iteration": 2.52164626121521 + }, + { + "auxiliary_loss_clip": 0.01144228, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02510428, + "balance_loss_mlp": 1.05130327, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 2.1812525814986112, + "language_loss": 0.76691413, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78878343, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 3262, + "time_per_iteration": 2.513619899749756 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.05290008, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7175684177653927, + "language_loss": 0.8667773, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88867593, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3263, + "time_per_iteration": 2.49373459815979 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.04784787, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5660143494742738, + "language_loss": 0.74136549, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76319206, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9296875, + "step": 3264, + "time_per_iteration": 2.4891843795776367 + }, + { + "auxiliary_loss_clip": 0.0114591, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.03340793, + "balance_loss_mlp": 1.05435038, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 4.0742741532711975, + "language_loss": 0.78590196, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8078593, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3265, + "time_per_iteration": 2.4226529598236084 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01014864, + "balance_loss_clip": 1.01292133, + "balance_loss_mlp": 1.01652646, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7852387786228787, + "language_loss": 0.53459084, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55521357, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.30859375, + "step": 3266, + "time_per_iteration": 3.0519652366638184 + }, + { + "auxiliary_loss_clip": 0.01145434, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02263319, + "balance_loss_mlp": 1.04800785, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9728388819613873, + "language_loss": 0.80503136, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82690066, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3267, + "time_per_iteration": 2.436455011367798 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.02726591, + "balance_loss_mlp": 1.04780269, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.528633756775916, + "language_loss": 0.87031806, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89213896, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91015625, + "step": 3268, + "time_per_iteration": 5.348580360412598 + }, + { + "auxiliary_loss_clip": 0.01141651, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02865744, + "balance_loss_mlp": 1.04996669, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.7845337804652086, + "language_loss": 0.69331455, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71518886, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3269, + "time_per_iteration": 3.9386346340179443 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.03361702, + "balance_loss_mlp": 1.0530045, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 2.4386480468071086, + "language_loss": 0.80760634, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82960677, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3270, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.04726839, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.857854204827715, + "language_loss": 0.83918732, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86103886, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3271, + "time_per_iteration": 2.4522581100463867 + }, + { + "auxiliary_loss_clip": 0.01139583, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.0297302, + "balance_loss_mlp": 1.04943895, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.1376155358713835, + "language_loss": 0.80162311, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82348382, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 3272, + "time_per_iteration": 2.4968738555908203 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02766371, + "balance_loss_mlp": 1.05075002, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7855512393811417, + "language_loss": 0.80728978, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82919937, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3273, + "time_per_iteration": 2.525407552719116 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.04807115, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 5.081990879764466, + "language_loss": 0.7791425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80108881, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3274, + "time_per_iteration": 2.527858018875122 + }, + { + "auxiliary_loss_clip": 0.01141542, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03440571, + "balance_loss_mlp": 1.04765558, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.1984029701042367, + "language_loss": 0.81144857, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83338642, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9375, + "step": 3275, + "time_per_iteration": 2.451392412185669 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.02934027, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.90284229785688, + "language_loss": 0.81104618, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83295637, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3276, + "time_per_iteration": 2.462033748626709 + }, + { + "auxiliary_loss_clip": 0.01142306, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.02132106, + "balance_loss_mlp": 1.04889154, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.0909421048868126, + "language_loss": 0.89347923, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91528654, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3277, + "time_per_iteration": 2.4887003898620605 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02804041, + "balance_loss_mlp": 1.04832077, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.9974095646387573, + "language_loss": 0.62265754, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64459741, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3278, + "time_per_iteration": 2.560401201248169 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.03562284, + "balance_loss_mlp": 1.04910243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 3.1131920881239936, + "language_loss": 0.73664343, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75863284, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3279, + "time_per_iteration": 2.5036048889160156 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02816486, + "balance_loss_mlp": 1.04906511, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6310774806952162, + "language_loss": 0.82451236, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84641075, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.95703125, + "step": 3280, + "time_per_iteration": 2.499962091445923 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.05086279, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.847926035637751, + "language_loss": 0.77581155, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79770064, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3281, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.0507971, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.593504057665797, + "language_loss": 0.79502213, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81686652, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3282, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.0279994, + "balance_loss_mlp": 1.05359089, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.157912578421005, + "language_loss": 0.71937042, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7413193, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3283, + "time_per_iteration": 2.5070157051086426 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.04858577, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.741034644212953, + "language_loss": 0.78832877, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81017548, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3284, + "time_per_iteration": 2.436530113220215 + }, + { + "auxiliary_loss_clip": 0.01147439, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.02952087, + "balance_loss_mlp": 1.05069387, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0622477624774325, + "language_loss": 0.86366653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88561547, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96875, + "step": 3285, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.01143401, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02571201, + "balance_loss_mlp": 1.0520879, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.7361177014734372, + "language_loss": 0.88680863, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90866709, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3286, + "time_per_iteration": 2.472475290298462 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.03036189, + "balance_loss_mlp": 1.05260301, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2372981039860833, + "language_loss": 0.78297567, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80495083, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3287, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02674246, + "balance_loss_mlp": 1.04974318, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.197025185749627, + "language_loss": 0.81252837, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83444452, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96484375, + "step": 3288, + "time_per_iteration": 2.4107155799865723 + }, + { + "auxiliary_loss_clip": 0.01139417, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.04890108, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7615970311636253, + "language_loss": 0.72502065, + "learning_rate": 3.712015717627374e-06, + "loss": 0.74691164, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3289, + "time_per_iteration": 2.4479291439056396 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.0500598, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.0523474932115833, + "language_loss": 0.7944051, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81629974, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3290, + "time_per_iteration": 2.499950408935547 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.01002976, + "balance_loss_clip": 1.00056827, + "balance_loss_mlp": 1.01336336, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9098407078047199, + "language_loss": 0.60440773, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62489194, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.3203125, + "step": 3291, + "time_per_iteration": 3.1538305282592773 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.02639592, + "balance_loss_mlp": 1.04670751, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.151168561582294, + "language_loss": 0.81352198, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83541822, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3292, + "time_per_iteration": 2.539417028427124 + }, + { + "auxiliary_loss_clip": 0.01137712, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03051507, + "balance_loss_mlp": 1.04855824, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.212806192124084, + "language_loss": 0.82146955, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84332335, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 3293, + "time_per_iteration": 2.438809394836426 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.02988923, + "balance_loss_mlp": 1.05333924, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.10438249616411, + "language_loss": 0.61268854, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63468528, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3294, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01143209, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.02681279, + "balance_loss_mlp": 1.05004907, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.028666267444235, + "language_loss": 0.86983609, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89170212, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3295, + "time_per_iteration": 2.416771411895752 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.04786801, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 13.771873008268457, + "language_loss": 0.80491048, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82684338, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9375, + "step": 3296, + "time_per_iteration": 2.450934648513794 + }, + { + "auxiliary_loss_clip": 0.01145402, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.02637851, + "balance_loss_mlp": 1.0482688, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.0804115334054134, + "language_loss": 0.68406892, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70597816, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.97265625, + "step": 3297, + "time_per_iteration": 2.5111610889434814 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.02440548, + "balance_loss_mlp": 1.04895413, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7575465421519259, + "language_loss": 0.81232154, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83411407, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 3298, + "time_per_iteration": 2.472025156021118 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.02634668, + "balance_loss_mlp": 1.05001056, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.343960149367745, + "language_loss": 0.85115641, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.9765625, + "step": 3299, + "time_per_iteration": 2.4725356101989746 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.00097358, + "balance_loss_mlp": 1.0131526, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7731212371218976, + "language_loss": 0.53215671, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55264044, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3300, + "time_per_iteration": 3.004054069519043 + }, + { + "auxiliary_loss_clip": 0.01142157, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.04772329, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6138936044346288, + "language_loss": 0.73150593, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75344324, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9453125, + "step": 3301, + "time_per_iteration": 2.4547884464263916 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.02191293, + "balance_loss_mlp": 1.04811358, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.437382428027231, + "language_loss": 0.88445318, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90624458, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3302, + "time_per_iteration": 2.429579019546509 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.02940273, + "balance_loss_mlp": 1.04750872, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.9503370408087137, + "language_loss": 0.73907369, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76096445, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3303, + "time_per_iteration": 2.627835273742676 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.04874539, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8172241344194675, + "language_loss": 0.74761099, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76950562, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3304, + "time_per_iteration": 2.551241397857666 + }, + { + "auxiliary_loss_clip": 0.01139854, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02333784, + "balance_loss_mlp": 1.04763281, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 2.605019982075021, + "language_loss": 0.85717452, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.87896717, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3305, + "time_per_iteration": 2.432363986968994 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.04600525, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.7555780714506408, + "language_loss": 0.68014234, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70195889, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.953125, + "step": 3306, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01137681, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.02098584, + "balance_loss_mlp": 1.0453912, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4744708200758283, + "language_loss": 0.76455241, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.921875, + "step": 3307, + "time_per_iteration": 2.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.02520776, + "balance_loss_mlp": 1.04866791, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8666050855147507, + "language_loss": 0.75933248, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78115153, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3308, + "time_per_iteration": 2.483060121536255 + }, + { + "auxiliary_loss_clip": 0.01141228, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.02426159, + "balance_loss_mlp": 1.04736626, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6368693105847256, + "language_loss": 0.75640005, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7782228, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94140625, + "step": 3309, + "time_per_iteration": 3.8069632053375244 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.02869844, + "balance_loss_mlp": 1.04665506, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.6858420956549012, + "language_loss": 0.87646699, + "learning_rate": 3.707773333313917e-06, + "loss": 0.8983165, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9140625, + "step": 3310, + "time_per_iteration": 3.9299721717834473 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.02431977, + "balance_loss_mlp": 1.04637599, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.6845239503362412, + "language_loss": 0.64166129, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66346431, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3311, + "time_per_iteration": 2.5747337341308594 + }, + { + "auxiliary_loss_clip": 0.01143032, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.02559805, + "balance_loss_mlp": 1.04768658, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.462607887220823, + "language_loss": 0.74053729, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.953125, + "step": 3312, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.03060961, + "balance_loss_mlp": 1.04843581, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2841450786746016, + "language_loss": 0.83511955, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8569997, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3313, + "time_per_iteration": 2.4846627712249756 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.04944849, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.438256379955746, + "language_loss": 0.80930895, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83115256, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3314, + "time_per_iteration": 2.525754928588867 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.0280745, + "balance_loss_mlp": 1.04706359, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5137591341622172, + "language_loss": 0.87549174, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89729953, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3315, + "time_per_iteration": 2.5170931816101074 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.03032112, + "balance_loss_mlp": 1.04808092, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5984895942740787, + "language_loss": 0.71255141, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73443246, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3316, + "time_per_iteration": 2.520071029663086 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02646089, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8439111854473917, + "language_loss": 0.66260874, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68341696, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.33203125, + "step": 3317, + "time_per_iteration": 3.1460416316986084 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01052969, + "balance_loss_clip": 1.03557682, + "balance_loss_mlp": 1.04575253, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.672944172124665, + "language_loss": 0.74319738, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76515001, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3318, + "time_per_iteration": 2.6139748096466064 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.04536486, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.900050251198073, + "language_loss": 0.78860074, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81038487, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.89453125, + "step": 3319, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.04806578, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0962453666662073, + "language_loss": 0.75462162, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.93359375, + "step": 3320, + "time_per_iteration": 2.739485263824463 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02613819, + "balance_loss_mlp": 1.04714417, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.167317842134812, + "language_loss": 0.80547488, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3321, + "time_per_iteration": 2.581353187561035 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.01003433, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.01694489, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.880630206553271, + "language_loss": 0.65178835, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67231572, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.32421875, + "step": 3322, + "time_per_iteration": 2.9042704105377197 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 1.01724231, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7916622121471568, + "language_loss": 0.56975091, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028506, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.328125, + "step": 3323, + "time_per_iteration": 3.2141411304473877 + }, + { + "auxiliary_loss_clip": 0.01139547, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.04839373, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.9849201654975537, + "language_loss": 0.80526733, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82701647, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3324, + "time_per_iteration": 2.5455262660980225 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.04540765, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.8681208438308643, + "language_loss": 0.53681695, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55859387, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91796875, + "step": 3325, + "time_per_iteration": 2.581782102584839 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.02337289, + "balance_loss_mlp": 1.04565668, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0672953846254027, + "language_loss": 0.86169922, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88347936, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3326, + "time_per_iteration": 2.494718551635742 + }, + { + "auxiliary_loss_clip": 0.01138244, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.04851878, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8653522915536895, + "language_loss": 0.71835959, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74012172, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3327, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02368546, + "balance_loss_mlp": 1.04750776, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.83111198959611, + "language_loss": 0.76588571, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78772372, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3328, + "time_per_iteration": 2.5083916187286377 + }, + { + "auxiliary_loss_clip": 0.01698253, + "auxiliary_loss_mlp": 0.01552284, + "balance_loss_clip": 1.52980089, + "balance_loss_mlp": 1.56677365, + "epoch": 0.20015030813166992, + "flos": 28106162236800.0, + "grad_norm": 1.6482454448342019, + "language_loss": 1.03044438, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.7143048, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3125, + "step": 3329, + "time_per_iteration": 15.37552785873413 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.03149772, + "balance_loss_mlp": 1.0504123, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.5519947176183269, + "language_loss": 0.81297028, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.8349371, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9609375, + "step": 3330, + "time_per_iteration": 2.500103712081909 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.01994956, + "balance_loss_mlp": 1.04669356, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.032272994312633, + "language_loss": 0.76649368, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78827626, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3331, + "time_per_iteration": 2.4018712043762207 + }, + { + "auxiliary_loss_clip": 0.01141733, + "auxiliary_loss_mlp": 0.01045779, + "balance_loss_clip": 1.02819538, + "balance_loss_mlp": 1.04608667, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.6582018653132529, + "language_loss": 0.79261309, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81448817, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3332, + "time_per_iteration": 2.4550859928131104 + }, + { + "auxiliary_loss_clip": 0.01045684, + "auxiliary_loss_mlp": 0.01005368, + "balance_loss_clip": 1.0036391, + "balance_loss_mlp": 1.01433849, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9315137515082259, + "language_loss": 0.61990142, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64041197, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.31445312, + "step": 3333, + "time_per_iteration": 2.9623756408691406 + }, + { + "auxiliary_loss_clip": 0.01139681, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.04501462, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1372355522021893, + "language_loss": 0.81203878, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.8338846, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9453125, + "step": 3334, + "time_per_iteration": 2.49924373626709 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_clip": 1.02938735, + "balance_loss_mlp": 1.04878521, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.1564721635267516, + "language_loss": 0.74261904, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76455814, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3335, + "time_per_iteration": 2.634608745574951 + }, + { + "auxiliary_loss_clip": 0.01150022, + "auxiliary_loss_mlp": 0.01058924, + "balance_loss_clip": 1.04205632, + "balance_loss_mlp": 1.05375338, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6943946878944693, + "language_loss": 0.79839814, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82048762, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3336, + "time_per_iteration": 2.7025394439697266 + }, + { + "auxiliary_loss_clip": 0.01145798, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.02744317, + "balance_loss_mlp": 1.04703879, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9043375292422164, + "language_loss": 0.78031212, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80223, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 3337, + "time_per_iteration": 2.5718014240264893 + }, + { + "auxiliary_loss_clip": 0.01143827, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_clip": 1.02708244, + "balance_loss_mlp": 1.0486424, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 1.9983960159800889, + "language_loss": 0.6873948, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.70928884, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94921875, + "step": 3338, + "time_per_iteration": 2.5848047733306885 + }, + { + "auxiliary_loss_clip": 0.01143098, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.03263819, + "balance_loss_mlp": 1.04853702, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.1061075345379576, + "language_loss": 0.68823779, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71016049, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9453125, + "step": 3339, + "time_per_iteration": 2.523771047592163 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.02217603, + "balance_loss_mlp": 1.05197799, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.3080693694415872, + "language_loss": 0.66263533, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68451655, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9765625, + "step": 3340, + "time_per_iteration": 2.647495985031128 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.01570475, + "balance_loss_mlp": 1.0457145, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.8472305033219696, + "language_loss": 0.74124628, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76300496, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9609375, + "step": 3341, + "time_per_iteration": 2.511585235595703 + }, + { + "auxiliary_loss_clip": 0.01140464, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.02689481, + "balance_loss_mlp": 1.04846787, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.1698717951472326, + "language_loss": 0.71578503, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73762101, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3342, + "time_per_iteration": 2.561998128890991 + }, + { + "auxiliary_loss_clip": 0.01142187, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.02871895, + "balance_loss_mlp": 1.04746354, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.9864957062525024, + "language_loss": 0.73130047, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75317556, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3343, + "time_per_iteration": 4.046127557754517 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01050047, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.04738092, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9813453341923526, + "language_loss": 0.81026411, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83218634, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94921875, + "step": 3344, + "time_per_iteration": 2.520765542984009 + }, + { + "auxiliary_loss_clip": 0.01141139, + "auxiliary_loss_mlp": 0.01050367, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.04661858, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.7491478080862684, + "language_loss": 0.83503234, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85694736, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3345, + "time_per_iteration": 4.064355373382568 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.02689624, + "balance_loss_mlp": 1.0464828, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.723487885242635, + "language_loss": 0.67909771, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70086718, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.88671875, + "step": 3346, + "time_per_iteration": 2.521949291229248 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01048866, + "balance_loss_clip": 1.03233206, + "balance_loss_mlp": 1.04726124, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.272845003166824, + "language_loss": 0.73496711, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75686157, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3347, + "time_per_iteration": 2.5316877365112305 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_clip": 1.03179908, + "balance_loss_mlp": 1.04827023, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.7467826588499227, + "language_loss": 0.86716485, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88904649, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.92578125, + "step": 3348, + "time_per_iteration": 2.5123202800750732 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.02335036, + "balance_loss_mlp": 1.04729295, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.5886148695932183, + "language_loss": 0.71200913, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73381227, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 3349, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.01144556, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.03016067, + "balance_loss_mlp": 1.04982185, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.649154800785762, + "language_loss": 0.71079665, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73272741, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9453125, + "step": 3350, + "time_per_iteration": 2.4927315711975098 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02612305, + "balance_loss_mlp": 1.05045485, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 3.2873247390310554, + "language_loss": 0.76327842, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78518331, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.953125, + "step": 3351, + "time_per_iteration": 2.5077342987060547 + }, + { + "auxiliary_loss_clip": 0.01146641, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.02555871, + "balance_loss_mlp": 1.05069637, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.662758000066145, + "language_loss": 0.80545723, + "learning_rate": 3.699202960155748e-06, + "loss": 0.8273598, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3352, + "time_per_iteration": 2.5717766284942627 + }, + { + "auxiliary_loss_clip": 0.01146315, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.05210721, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7179856660366186, + "language_loss": 0.8027631, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82462192, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3353, + "time_per_iteration": 2.6415467262268066 + }, + { + "auxiliary_loss_clip": 0.01140403, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.02512455, + "balance_loss_mlp": 1.04978478, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.050762039112588, + "language_loss": 0.8946988, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91651917, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 3354, + "time_per_iteration": 2.4780237674713135 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01011403, + "balance_loss_clip": 1.00948358, + "balance_loss_mlp": 1.0202148, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.830112597874188, + "language_loss": 0.55839282, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57902759, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.31835938, + "step": 3355, + "time_per_iteration": 3.0224292278289795 + }, + { + "auxiliary_loss_clip": 0.01140957, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02891648, + "balance_loss_mlp": 1.05068707, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5257876958196368, + "language_loss": 0.84076762, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86262929, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3356, + "time_per_iteration": 2.510615348815918 + }, + { + "auxiliary_loss_clip": 0.01152963, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.05356848, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.9361880537925584, + "language_loss": 0.688007, + "learning_rate": 3.698175095398085e-06, + "loss": 0.70997024, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 3357, + "time_per_iteration": 2.460022211074829 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.02280617, + "balance_loss_mlp": 1.0492487, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7490617907772006, + "language_loss": 0.71748042, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73933733, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3358, + "time_per_iteration": 2.563767194747925 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01054955, + "balance_loss_clip": 1.03818202, + "balance_loss_mlp": 1.04849517, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.042998238377631, + "language_loss": 0.83104217, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85298896, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3359, + "time_per_iteration": 2.531332015991211 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01001012, + "balance_loss_clip": 0.99911654, + "balance_loss_mlp": 1.02214265, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 12.853939959466139, + "language_loss": 0.5895561, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61009508, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30859375, + "step": 3360, + "time_per_iteration": 3.0536341667175293 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.03117216, + "balance_loss_mlp": 1.05149043, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.4416015649532286, + "language_loss": 0.62138069, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64334983, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3361, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0114522, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.05156183, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 2.0025961231737526, + "language_loss": 0.75524926, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77726126, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3362, + "time_per_iteration": 2.555492639541626 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.02926481, + "balance_loss_mlp": 1.05209327, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6135185744423872, + "language_loss": 0.76400363, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78592181, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9375, + "step": 3363, + "time_per_iteration": 2.486969470977783 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_clip": 1.03686023, + "balance_loss_mlp": 1.04736471, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 2.0495916908721434, + "language_loss": 0.74606001, + "learning_rate": 3.696733380367391e-06, + "loss": 0.76800275, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9375, + "step": 3364, + "time_per_iteration": 2.58673095703125 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.03390145, + "balance_loss_mlp": 1.04865253, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.1992700083841084, + "language_loss": 0.71451771, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73647857, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3365, + "time_per_iteration": 2.522470712661743 + }, + { + "auxiliary_loss_clip": 0.01147339, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.03516757, + "balance_loss_mlp": 1.05331004, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.9561618637344158, + "language_loss": 0.85770535, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87970054, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94140625, + "step": 3366, + "time_per_iteration": 2.536529541015625 + }, + { + "auxiliary_loss_clip": 0.01143453, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.0499506, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.628387041142295, + "language_loss": 0.69651556, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7183941, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3367, + "time_per_iteration": 2.5608372688293457 + }, + { + "auxiliary_loss_clip": 0.01145892, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.0235498, + "balance_loss_mlp": 1.04696274, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.963599898430263, + "language_loss": 0.68230569, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70419657, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3368, + "time_per_iteration": 2.66802978515625 + }, + { + "auxiliary_loss_clip": 0.01143607, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.03178596, + "balance_loss_mlp": 1.0505259, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 7.849671101524798, + "language_loss": 0.77025628, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79218459, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3369, + "time_per_iteration": 2.5143446922302246 + }, + { + "auxiliary_loss_clip": 0.01145287, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.04029489, + "balance_loss_mlp": 1.04800487, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 4.298107611861754, + "language_loss": 0.65408337, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67610943, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3370, + "time_per_iteration": 2.503589630126953 + }, + { + "auxiliary_loss_clip": 0.01048919, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.02313519, + "balance_loss_mlp": 1.01856685, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6799262329378595, + "language_loss": 0.58101869, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60175562, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.3046875, + "step": 3371, + "time_per_iteration": 3.1626369953155518 + }, + { + "auxiliary_loss_clip": 0.01143688, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.0254668, + "balance_loss_mlp": 1.04866266, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.766606164011739, + "language_loss": 0.92068136, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94254309, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3372, + "time_per_iteration": 2.578045129776001 + }, + { + "auxiliary_loss_clip": 0.0114549, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.05037856, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.6491924635250923, + "language_loss": 0.78632712, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80822217, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 3373, + "time_per_iteration": 2.5762507915496826 + }, + { + "auxiliary_loss_clip": 0.01137806, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.04629672, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.8751465027713456, + "language_loss": 0.71102971, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73280156, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3374, + "time_per_iteration": 2.6212260723114014 + }, + { + "auxiliary_loss_clip": 0.01048807, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00001132, + "balance_loss_mlp": 1.01811993, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9912238676598704, + "language_loss": 0.62450445, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64501071, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.30859375, + "step": 3375, + "time_per_iteration": 3.0768048763275146 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.02722621, + "balance_loss_mlp": 1.04769731, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.6669967725054042, + "language_loss": 0.82450807, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84635985, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3376, + "time_per_iteration": 2.5632758140563965 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.02110839, + "balance_loss_mlp": 1.04692364, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.2640770034372006, + "language_loss": 0.81587797, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83771032, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3377, + "time_per_iteration": 2.6376402378082275 + }, + { + "auxiliary_loss_clip": 0.01139097, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.02786779, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 4.046949512949318, + "language_loss": 0.769104, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79095268, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3378, + "time_per_iteration": 2.532942056655884 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.02493691, + "balance_loss_mlp": 1.04772687, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9870266088444717, + "language_loss": 0.79710048, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81896979, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3379, + "time_per_iteration": 2.5187509059906006 + }, + { + "auxiliary_loss_clip": 0.01137083, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.02162337, + "balance_loss_mlp": 1.04698288, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7003196517483214, + "language_loss": 0.86949915, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.89125347, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3380, + "time_per_iteration": 2.5350420475006104 + }, + { + "auxiliary_loss_clip": 0.01143485, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.02905154, + "balance_loss_mlp": 1.05103135, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 1.9133898096862498, + "language_loss": 0.74515057, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76705158, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3381, + "time_per_iteration": 2.5428466796875 + }, + { + "auxiliary_loss_clip": 0.01143807, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.04754519, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.741042372938858, + "language_loss": 0.79304886, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81492472, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3382, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.01146625, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.02123427, + "balance_loss_mlp": 1.04849267, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.8514394244027284, + "language_loss": 0.80188596, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82376015, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3383, + "time_per_iteration": 2.5047500133514404 + }, + { + "auxiliary_loss_clip": 0.0113964, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02401257, + "balance_loss_mlp": 1.04616201, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 6.482166974991387, + "language_loss": 0.74195492, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76377177, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3384, + "time_per_iteration": 2.4931931495666504 + }, + { + "auxiliary_loss_clip": 0.01147866, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.04929996, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.292912234818254, + "language_loss": 0.76429737, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78621089, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3385, + "time_per_iteration": 3.9999845027923584 + }, + { + "auxiliary_loss_clip": 0.01139546, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.04031098, + "balance_loss_mlp": 1.04538202, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8347755395186154, + "language_loss": 0.68259251, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70457751, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3386, + "time_per_iteration": 2.525538682937622 + }, + { + "auxiliary_loss_clip": 0.01143921, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.0348835, + "balance_loss_mlp": 1.04785144, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.949323793812955, + "language_loss": 0.81000078, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83198166, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9609375, + "step": 3387, + "time_per_iteration": 4.122355222702026 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01040694, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.04754424, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.869822824167972, + "language_loss": 0.79960001, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82138139, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 3388, + "time_per_iteration": 2.498455047607422 + }, + { + "auxiliary_loss_clip": 0.01142619, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02120411, + "balance_loss_mlp": 1.04757476, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.6489636222716584, + "language_loss": 0.71810246, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.73992884, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.94921875, + "step": 3389, + "time_per_iteration": 2.4751241207122803 + }, + { + "auxiliary_loss_clip": 0.01140457, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.03108239, + "balance_loss_mlp": 1.04812241, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.7476252287205662, + "language_loss": 0.87431413, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89620328, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3390, + "time_per_iteration": 2.5229172706604004 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02673888, + "balance_loss_mlp": 1.04638386, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 3.0399462437196743, + "language_loss": 0.71092427, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73275584, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.92578125, + "step": 3391, + "time_per_iteration": 2.528003454208374 + }, + { + "auxiliary_loss_clip": 0.01137362, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02592552, + "balance_loss_mlp": 1.04483938, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.517550673127581, + "language_loss": 0.85993969, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88174999, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3392, + "time_per_iteration": 2.5080008506774902 + }, + { + "auxiliary_loss_clip": 0.01143294, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.03231716, + "balance_loss_mlp": 1.04759896, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.5067582134175779, + "language_loss": 0.80730146, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82923234, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.95703125, + "step": 3393, + "time_per_iteration": 2.5464906692504883 + }, + { + "auxiliary_loss_clip": 0.01139211, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02236915, + "balance_loss_mlp": 1.0471251, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.882536464234473, + "language_loss": 0.86276352, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88454658, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3394, + "time_per_iteration": 2.495544195175171 + }, + { + "auxiliary_loss_clip": 0.01139364, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02640033, + "balance_loss_mlp": 1.04756498, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.9880936155816324, + "language_loss": 0.83455038, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85637033, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3395, + "time_per_iteration": 2.4636099338531494 + }, + { + "auxiliary_loss_clip": 0.01144564, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.01753616, + "balance_loss_mlp": 1.04799199, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.0105247570422877, + "language_loss": 0.83632553, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85812247, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3396, + "time_per_iteration": 2.507140636444092 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.02470088, + "balance_loss_mlp": 1.04775488, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9261630392212734, + "language_loss": 0.77139032, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79321325, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91796875, + "step": 3397, + "time_per_iteration": 2.5000061988830566 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.027771, + "balance_loss_mlp": 1.0482713, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.6022565941655285, + "language_loss": 0.87048233, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89232147, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3398, + "time_per_iteration": 2.4879262447357178 + }, + { + "auxiliary_loss_clip": 0.01146457, + "auxiliary_loss_mlp": 0.01045529, + "balance_loss_clip": 1.02855396, + "balance_loss_mlp": 1.05200124, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.0446998950436273, + "language_loss": 0.77973163, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8016516, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3399, + "time_per_iteration": 2.4417104721069336 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.02180338, + "balance_loss_mlp": 1.0471437, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 1.9372936252349278, + "language_loss": 0.76201475, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78383702, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.953125, + "step": 3400, + "time_per_iteration": 2.513378858566284 + }, + { + "auxiliary_loss_clip": 0.01140509, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.02709138, + "balance_loss_mlp": 1.04937315, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6590163779918286, + "language_loss": 0.79357922, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81542361, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 3401, + "time_per_iteration": 2.5628185272216797 + }, + { + "auxiliary_loss_clip": 0.01141107, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.04659653, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.682072453203677, + "language_loss": 0.69205511, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71388066, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3402, + "time_per_iteration": 2.653932571411133 + }, + { + "auxiliary_loss_clip": 0.01144935, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.05008948, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.6906490082479086, + "language_loss": 0.81077826, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83266115, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3403, + "time_per_iteration": 2.518402099609375 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.02873933, + "balance_loss_mlp": 1.05067933, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.7308307985558895, + "language_loss": 0.83497006, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85688084, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3404, + "time_per_iteration": 2.5041427612304688 + }, + { + "auxiliary_loss_clip": 0.0114107, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.03018808, + "balance_loss_mlp": 1.04686713, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.717424757849508, + "language_loss": 0.86319768, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88507974, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3405, + "time_per_iteration": 2.5019404888153076 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02558827, + "balance_loss_mlp": 1.04664326, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.0734152439752327, + "language_loss": 0.84731919, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86912251, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3406, + "time_per_iteration": 2.508274793624878 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02601814, + "balance_loss_mlp": 1.04885817, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0579137112366332, + "language_loss": 0.68086451, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70268458, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3407, + "time_per_iteration": 2.4675915241241455 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.03039861, + "balance_loss_mlp": 1.0469842, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.4520435823789857, + "language_loss": 0.84025276, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86210054, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3408, + "time_per_iteration": 2.4996185302734375 + }, + { + "auxiliary_loss_clip": 0.01144748, + "auxiliary_loss_mlp": 0.01045034, + "balance_loss_clip": 1.02851176, + "balance_loss_mlp": 1.05156052, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.726731275915995, + "language_loss": 0.64288676, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66478455, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3409, + "time_per_iteration": 2.469758987426758 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.0295676, + "balance_loss_mlp": 1.04638147, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.319045584705984, + "language_loss": 0.80357087, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82542145, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3410, + "time_per_iteration": 2.5167293548583984 + }, + { + "auxiliary_loss_clip": 0.01140553, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05014896, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.259997857874164, + "language_loss": 0.75796056, + "learning_rate": 3.686971778678803e-06, + "loss": 0.7798292, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3411, + "time_per_iteration": 2.5411264896392822 + }, + { + "auxiliary_loss_clip": 0.01144909, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.02817273, + "balance_loss_mlp": 1.05220985, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.0004173274373183, + "language_loss": 0.73696554, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75885755, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3412, + "time_per_iteration": 2.5047144889831543 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.03015614, + "balance_loss_mlp": 1.04735541, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.0925027501904228, + "language_loss": 0.77863461, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.96484375, + "step": 3413, + "time_per_iteration": 2.5472991466522217 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.04989886, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.912987525537943, + "language_loss": 0.84719825, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86901337, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3414, + "time_per_iteration": 2.478729724884033 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.02496636, + "balance_loss_mlp": 1.04659235, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.9076108002018353, + "language_loss": 0.80448711, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82628626, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3415, + "time_per_iteration": 2.5366415977478027 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02154934, + "balance_loss_mlp": 1.04796863, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.7629792917286327, + "language_loss": 0.72893143, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75068092, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3416, + "time_per_iteration": 2.5656492710113525 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02520072, + "balance_loss_mlp": 1.04695165, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.5523210605949425, + "language_loss": 0.78623438, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80805844, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3417, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01140114, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.02948236, + "balance_loss_mlp": 1.04842472, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.178207343470702, + "language_loss": 0.87390542, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89577365, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.91796875, + "step": 3418, + "time_per_iteration": 2.4900615215301514 + }, + { + "auxiliary_loss_clip": 0.01139839, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.0251534, + "balance_loss_mlp": 1.04798996, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.115759049165993, + "language_loss": 0.62156075, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64337492, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3419, + "time_per_iteration": 2.527057647705078 + }, + { + "auxiliary_loss_clip": 0.01143982, + "auxiliary_loss_mlp": 0.0104893, + "balance_loss_clip": 1.02977359, + "balance_loss_mlp": 1.04905963, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.2865688080492466, + "language_loss": 0.86502206, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88695121, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3420, + "time_per_iteration": 2.532512664794922 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02533531, + "balance_loss_mlp": 1.04659796, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.535685660701584, + "language_loss": 0.70904821, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73084807, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91796875, + "step": 3421, + "time_per_iteration": 2.5924150943756104 + }, + { + "auxiliary_loss_clip": 0.0113664, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.02099967, + "balance_loss_mlp": 1.04581738, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 3.5707952740494235, + "language_loss": 0.70370102, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72545266, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3422, + "time_per_iteration": 2.499041795730591 + }, + { + "auxiliary_loss_clip": 0.01060302, + "auxiliary_loss_mlp": 0.01012319, + "balance_loss_clip": 1.01001859, + "balance_loss_mlp": 1.02983248, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7605512778953217, + "language_loss": 0.55499864, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57572484, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3046875, + "step": 3423, + "time_per_iteration": 3.1569108963012695 + }, + { + "auxiliary_loss_clip": 0.0114215, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.04882169, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7754304652232902, + "language_loss": 0.71701574, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73886526, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9296875, + "step": 3424, + "time_per_iteration": 2.58278751373291 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.02920699, + "balance_loss_mlp": 1.05022514, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.563470220797352, + "language_loss": 0.75031066, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77218151, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3425, + "time_per_iteration": 2.518050193786621 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01057037, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.0545603, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.8081006382856646, + "language_loss": 0.88246548, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90449566, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3426, + "time_per_iteration": 2.5141823291778564 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.02927566, + "balance_loss_mlp": 1.04961991, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.8273097367093476, + "language_loss": 0.76748925, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78934193, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3427, + "time_per_iteration": 4.068110227584839 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.03143609, + "balance_loss_mlp": 1.04978716, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.6956079848027177, + "language_loss": 0.73914266, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76106334, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3428, + "time_per_iteration": 2.5296199321746826 + }, + { + "auxiliary_loss_clip": 0.0113987, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.03188777, + "balance_loss_mlp": 1.04691577, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 3.779292361126499, + "language_loss": 0.73553443, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75743121, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3429, + "time_per_iteration": 3.979640483856201 + }, + { + "auxiliary_loss_clip": 0.01146724, + "auxiliary_loss_mlp": 0.01041423, + "balance_loss_clip": 1.0242331, + "balance_loss_mlp": 1.05180049, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.8474903397728304, + "language_loss": 0.85301876, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87490022, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3430, + "time_per_iteration": 2.532275438308716 + }, + { + "auxiliary_loss_clip": 0.0114587, + "auxiliary_loss_mlp": 0.01052093, + "balance_loss_clip": 1.03411579, + "balance_loss_mlp": 1.05116892, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.4715876867440674, + "language_loss": 0.69369543, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.715675, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3431, + "time_per_iteration": 2.4857282638549805 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.01015472, + "balance_loss_clip": 1.01329005, + "balance_loss_mlp": 1.02078724, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8322663536180677, + "language_loss": 0.60249984, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62317169, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.30859375, + "step": 3432, + "time_per_iteration": 3.250966787338257 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.03021789, + "balance_loss_mlp": 1.05125713, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7869258470827205, + "language_loss": 0.72495091, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74685854, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3433, + "time_per_iteration": 2.528576135635376 + }, + { + "auxiliary_loss_clip": 0.01143793, + "auxiliary_loss_mlp": 0.01050396, + "balance_loss_clip": 1.03295541, + "balance_loss_mlp": 1.04886997, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.715054190412472, + "language_loss": 0.8721565, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8940984, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3434, + "time_per_iteration": 2.507589101791382 + }, + { + "auxiliary_loss_clip": 0.01144514, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_clip": 1.0269376, + "balance_loss_mlp": 1.04833162, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.6274854163318595, + "language_loss": 0.69133317, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71321636, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3435, + "time_per_iteration": 2.587930679321289 + }, + { + "auxiliary_loss_clip": 0.01140929, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.0241158, + "balance_loss_mlp": 1.04983366, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7028603597643168, + "language_loss": 0.8922776, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91410363, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3436, + "time_per_iteration": 2.57295298576355 + }, + { + "auxiliary_loss_clip": 0.01144451, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.05126333, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.8990861512322268, + "language_loss": 0.76659, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78839004, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3437, + "time_per_iteration": 2.5819849967956543 + }, + { + "auxiliary_loss_clip": 0.01142266, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.04877901, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.7925672188665596, + "language_loss": 0.77611911, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79794395, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3438, + "time_per_iteration": 2.5091731548309326 + }, + { + "auxiliary_loss_clip": 0.01047915, + "auxiliary_loss_mlp": 0.01005377, + "balance_loss_clip": 1.00348175, + "balance_loss_mlp": 1.01723933, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8367234589951487, + "language_loss": 0.67141807, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69195092, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30664062, + "step": 3439, + "time_per_iteration": 3.0797181129455566 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.04791629, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.0580501207842428, + "language_loss": 0.83931267, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86111259, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94921875, + "step": 3440, + "time_per_iteration": 2.5015172958374023 + }, + { + "auxiliary_loss_clip": 0.01143016, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_clip": 1.02584338, + "balance_loss_mlp": 1.05009377, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.9416657792651912, + "language_loss": 0.84825736, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87010437, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3441, + "time_per_iteration": 2.4866137504577637 + }, + { + "auxiliary_loss_clip": 0.01140001, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.02778697, + "balance_loss_mlp": 1.0502038, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.6577892844013908, + "language_loss": 0.85889506, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88074249, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 3442, + "time_per_iteration": 2.5914649963378906 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.02305317, + "balance_loss_mlp": 1.05208063, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.9070439101703558, + "language_loss": 0.72829354, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75015128, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3443, + "time_per_iteration": 2.5210063457489014 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.02879703, + "balance_loss_mlp": 1.0496819, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.5056876708900186, + "language_loss": 0.85428166, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87612224, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.90625, + "step": 3444, + "time_per_iteration": 2.528493881225586 + }, + { + "auxiliary_loss_clip": 0.01047325, + "auxiliary_loss_mlp": 0.0100746, + "balance_loss_clip": 1.00537384, + "balance_loss_mlp": 1.01688242, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6978715278146553, + "language_loss": 0.57091653, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.5914644, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.3046875, + "step": 3445, + "time_per_iteration": 3.086552619934082 + }, + { + "auxiliary_loss_clip": 0.01140085, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03356516, + "balance_loss_mlp": 1.04968095, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5621496076246746, + "language_loss": 0.78459281, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80650306, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 3446, + "time_per_iteration": 2.4844422340393066 + }, + { + "auxiliary_loss_clip": 0.01148285, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02846456, + "balance_loss_mlp": 1.05057228, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.157476270385918, + "language_loss": 0.62436825, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64633256, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3447, + "time_per_iteration": 2.592799663543701 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.03071666, + "balance_loss_mlp": 1.04810297, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.740614876967074, + "language_loss": 0.86066437, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88256097, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3448, + "time_per_iteration": 2.5054237842559814 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.02576649, + "balance_loss_mlp": 1.04814398, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.1117492515519665, + "language_loss": 0.75452864, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77637869, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.92578125, + "step": 3449, + "time_per_iteration": 2.506657838821411 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.03194678, + "balance_loss_mlp": 1.04896426, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.7877143934577313, + "language_loss": 0.76703656, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78899819, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3450, + "time_per_iteration": 2.479090929031372 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.0302192, + "balance_loss_mlp": 1.04780531, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5227053471466307, + "language_loss": 0.822101, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84401715, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3451, + "time_per_iteration": 2.5465826988220215 + }, + { + "auxiliary_loss_clip": 0.01047156, + "auxiliary_loss_mlp": 0.01003865, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.01645589, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7930757504147553, + "language_loss": 0.56569821, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58620846, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3452, + "time_per_iteration": 2.979168653488159 + }, + { + "auxiliary_loss_clip": 0.01144097, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.02765203, + "balance_loss_mlp": 1.0492605, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.970927529953097, + "language_loss": 0.88332593, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90522313, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3453, + "time_per_iteration": 2.5404746532440186 + }, + { + "auxiliary_loss_clip": 0.01145334, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.05121803, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.6193396769615114, + "language_loss": 0.80056196, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82244939, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94140625, + "step": 3454, + "time_per_iteration": 2.536154270172119 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04881716, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.767477329453147, + "language_loss": 0.76424366, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78615135, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3455, + "time_per_iteration": 2.502450466156006 + }, + { + "auxiliary_loss_clip": 0.01141184, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.03247654, + "balance_loss_mlp": 1.04867601, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 2.1876724852466163, + "language_loss": 0.80599815, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82790661, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3456, + "time_per_iteration": 2.495405673980713 + }, + { + "auxiliary_loss_clip": 0.01147485, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_clip": 1.02447069, + "balance_loss_mlp": 1.05180097, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5859267830694757, + "language_loss": 0.77988815, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80179226, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.95703125, + "step": 3457, + "time_per_iteration": 2.5625829696655273 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01054167, + "balance_loss_clip": 1.03461635, + "balance_loss_mlp": 1.05195451, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 2.0073788397072136, + "language_loss": 0.83581042, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85784483, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.96875, + "step": 3458, + "time_per_iteration": 2.470740556716919 + }, + { + "auxiliary_loss_clip": 0.01142717, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02443111, + "balance_loss_mlp": 1.05063045, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.732611194718632, + "language_loss": 0.76041365, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78225368, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3459, + "time_per_iteration": 2.5753207206726074 + }, + { + "auxiliary_loss_clip": 0.01138446, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.02451003, + "balance_loss_mlp": 1.04829502, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.1264218253084386, + "language_loss": 0.77302521, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79482168, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3460, + "time_per_iteration": 2.498760938644409 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01050405, + "balance_loss_clip": 1.03284574, + "balance_loss_mlp": 1.04819179, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.1644839576228296, + "language_loss": 0.75785947, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77979982, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3461, + "time_per_iteration": 2.5850372314453125 + }, + { + "auxiliary_loss_clip": 0.01145604, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02645624, + "balance_loss_mlp": 1.0469749, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8484421465162717, + "language_loss": 0.88227051, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90417254, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3462, + "time_per_iteration": 2.558375358581543 + }, + { + "auxiliary_loss_clip": 0.01043601, + "auxiliary_loss_mlp": 0.0101247, + "balance_loss_clip": 1.01059818, + "balance_loss_mlp": 1.01278758, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7627714646141646, + "language_loss": 0.59057152, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6111322, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.30859375, + "step": 3463, + "time_per_iteration": 3.2280492782592773 + }, + { + "auxiliary_loss_clip": 0.01144566, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.03056765, + "balance_loss_mlp": 1.04713821, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.542529703880477, + "language_loss": 0.65831709, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68025607, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3464, + "time_per_iteration": 2.5706918239593506 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01050812, + "balance_loss_clip": 1.03160763, + "balance_loss_mlp": 1.0492928, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.848617339554035, + "language_loss": 0.83536243, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85734928, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3465, + "time_per_iteration": 2.535473585128784 + }, + { + "auxiliary_loss_clip": 0.01143191, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.02767932, + "balance_loss_mlp": 1.04802513, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 3.628659863163492, + "language_loss": 0.81463158, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.83651215, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3466, + "time_per_iteration": 2.535311222076416 + }, + { + "auxiliary_loss_clip": 0.01146517, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.02863586, + "balance_loss_mlp": 1.05303347, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.967186340276973, + "language_loss": 0.81678396, + "learning_rate": 3.675156514448716e-06, + "loss": 0.83869636, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9375, + "step": 3467, + "time_per_iteration": 2.4783830642700195 + }, + { + "auxiliary_loss_clip": 0.01142574, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02469158, + "balance_loss_mlp": 1.05200005, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.0682841758185235, + "language_loss": 0.8186093, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84045184, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3468, + "time_per_iteration": 2.5275001525878906 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.02677095, + "balance_loss_mlp": 1.05024171, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.9832892060266627, + "language_loss": 0.90227246, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92421412, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9765625, + "step": 3469, + "time_per_iteration": 3.999607563018799 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.03329682, + "balance_loss_mlp": 1.0530771, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.764094275638393, + "language_loss": 0.7643016, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78628922, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3470, + "time_per_iteration": 4.186570405960083 + }, + { + "auxiliary_loss_clip": 0.0114555, + "auxiliary_loss_mlp": 0.01048445, + "balance_loss_clip": 1.03039646, + "balance_loss_mlp": 1.05154145, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.7254586081909284, + "language_loss": 0.7592454, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78118539, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3471, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03488564, + "balance_loss_mlp": 1.04796982, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.907022336492936, + "language_loss": 0.75515926, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77719313, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3472, + "time_per_iteration": 2.555927038192749 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_clip": 1.02790928, + "balance_loss_mlp": 1.05051231, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.9877478939715982, + "language_loss": 0.84168947, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86358976, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3473, + "time_per_iteration": 2.5261759757995605 + }, + { + "auxiliary_loss_clip": 0.01043725, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00027776, + "balance_loss_mlp": 1.01290703, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8792852781400284, + "language_loss": 0.63631999, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65678006, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30859375, + "step": 3474, + "time_per_iteration": 3.025831460952759 + }, + { + "auxiliary_loss_clip": 0.01146356, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.02334285, + "balance_loss_mlp": 1.04993105, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.882119897934913, + "language_loss": 0.69867098, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72054696, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3475, + "time_per_iteration": 2.751676559448242 + }, + { + "auxiliary_loss_clip": 0.01146508, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02961278, + "balance_loss_mlp": 1.05162299, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4951270147360183, + "language_loss": 0.70032048, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72226411, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3476, + "time_per_iteration": 2.5493083000183105 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.02850533, + "balance_loss_mlp": 1.05099094, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0857679152031716, + "language_loss": 0.89590299, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91780925, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3477, + "time_per_iteration": 2.506962537765503 + }, + { + "auxiliary_loss_clip": 0.01142894, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.03005815, + "balance_loss_mlp": 1.04896593, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 4.245750786990739, + "language_loss": 0.67988396, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70179135, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9375, + "step": 3478, + "time_per_iteration": 2.57366681098938 + }, + { + "auxiliary_loss_clip": 0.01143008, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.02334023, + "balance_loss_mlp": 1.04826832, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.137628491911851, + "language_loss": 0.85035646, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87220371, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94921875, + "step": 3479, + "time_per_iteration": 2.4716267585754395 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.03156328, + "balance_loss_mlp": 1.04972577, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.2225866030569175, + "language_loss": 0.73807257, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76003599, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3480, + "time_per_iteration": 2.4856386184692383 + }, + { + "auxiliary_loss_clip": 0.01141126, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_clip": 1.03113592, + "balance_loss_mlp": 1.04844785, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.023418551380918, + "language_loss": 0.75601453, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77789831, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3481, + "time_per_iteration": 2.4812443256378174 + }, + { + "auxiliary_loss_clip": 0.01145872, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.03051996, + "balance_loss_mlp": 1.05047393, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 3.5251666716598273, + "language_loss": 0.85337639, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87531281, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3482, + "time_per_iteration": 2.521284580230713 + }, + { + "auxiliary_loss_clip": 0.01145664, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_clip": 1.02940559, + "balance_loss_mlp": 1.05097377, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.8936854891166743, + "language_loss": 0.70626152, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3483, + "time_per_iteration": 2.5876524448394775 + }, + { + "auxiliary_loss_clip": 0.01146142, + "auxiliary_loss_mlp": 0.01060474, + "balance_loss_clip": 1.04193723, + "balance_loss_mlp": 1.04891169, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8606830424584557, + "language_loss": 0.74988431, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77195048, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3484, + "time_per_iteration": 2.49701189994812 + }, + { + "auxiliary_loss_clip": 0.01143763, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.02991378, + "balance_loss_mlp": 1.05028141, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8378150509428508, + "language_loss": 0.70690203, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7288202, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3485, + "time_per_iteration": 2.5692059993743896 + }, + { + "auxiliary_loss_clip": 0.01146857, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.02712297, + "balance_loss_mlp": 1.05028093, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.9069158447471781, + "language_loss": 0.82965356, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85157764, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3486, + "time_per_iteration": 2.569308042526245 + }, + { + "auxiliary_loss_clip": 0.0114472, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.03309095, + "balance_loss_mlp": 1.04790449, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 3.843984040964354, + "language_loss": 0.8699702, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89192313, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3487, + "time_per_iteration": 2.608441114425659 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.0287739, + "balance_loss_mlp": 1.04695904, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.4377115915778713, + "language_loss": 0.72369969, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74558127, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94140625, + "step": 3488, + "time_per_iteration": 2.529233694076538 + }, + { + "auxiliary_loss_clip": 0.01144055, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.02812946, + "balance_loss_mlp": 1.04897618, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.6657941113460764, + "language_loss": 0.80726898, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82916641, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3489, + "time_per_iteration": 2.4847962856292725 + }, + { + "auxiliary_loss_clip": 0.01142088, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.0253495, + "balance_loss_mlp": 1.04718399, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7888402521564877, + "language_loss": 0.72827011, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75011659, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3490, + "time_per_iteration": 2.543064594268799 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.03437209, + "balance_loss_mlp": 1.04955435, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 5.073894522138561, + "language_loss": 0.70159817, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72350967, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3491, + "time_per_iteration": 2.4785172939300537 + }, + { + "auxiliary_loss_clip": 0.01142629, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.04678369, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 3.7459720995568557, + "language_loss": 0.7931999, + "learning_rate": 3.669817442854444e-06, + "loss": 0.8150776, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3492, + "time_per_iteration": 2.5213027000427246 + }, + { + "auxiliary_loss_clip": 0.01144565, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.04977345, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9629392465329358, + "language_loss": 0.86883962, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89069605, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3493, + "time_per_iteration": 2.499797821044922 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.04791212, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.8525794886403055, + "language_loss": 0.68810928, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70991009, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3494, + "time_per_iteration": 2.5374889373779297 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.02054656, + "balance_loss_mlp": 1.05010796, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7465496854212388, + "language_loss": 0.78900456, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81085044, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96484375, + "step": 3495, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02505755, + "balance_loss_mlp": 1.04696178, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7459726457298623, + "language_loss": 0.77192879, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79377842, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3496, + "time_per_iteration": 2.552386522293091 + }, + { + "auxiliary_loss_clip": 0.01145605, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.03297126, + "balance_loss_mlp": 1.04933989, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.0396086665216777, + "language_loss": 0.82009852, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84206975, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3497, + "time_per_iteration": 2.498359441757202 + }, + { + "auxiliary_loss_clip": 0.01146873, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_clip": 1.03091133, + "balance_loss_mlp": 1.04979134, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 2.5223195218779577, + "language_loss": 0.67314029, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69509119, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96875, + "step": 3498, + "time_per_iteration": 2.540766716003418 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.02679563, + "balance_loss_mlp": 1.04782224, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 2.2477271783909414, + "language_loss": 0.80623376, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82813752, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 3499, + "time_per_iteration": 2.5283098220825195 + }, + { + "auxiliary_loss_clip": 0.0114621, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.02612233, + "balance_loss_mlp": 1.05201602, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.776862664007905, + "language_loss": 0.78366566, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80555797, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3500, + "time_per_iteration": 2.5419158935546875 + }, + { + "auxiliary_loss_clip": 0.01142389, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02524185, + "balance_loss_mlp": 1.0480907, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.628727093990466, + "language_loss": 0.73989725, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76174867, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3501, + "time_per_iteration": 2.535419464111328 + }, + { + "auxiliary_loss_clip": 0.01140428, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02429342, + "balance_loss_mlp": 1.04671168, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6206913905571714, + "language_loss": 0.75292969, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77475226, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3502, + "time_per_iteration": 2.508277654647827 + }, + { + "auxiliary_loss_clip": 0.01141546, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.02796102, + "balance_loss_mlp": 1.0475595, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.9145063235338367, + "language_loss": 0.77090263, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7927739, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.94140625, + "step": 3503, + "time_per_iteration": 2.5607948303222656 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.02813029, + "balance_loss_mlp": 1.048738, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.3817148130730144, + "language_loss": 0.77991742, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80189341, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.9921875, + "step": 3504, + "time_per_iteration": 2.495028018951416 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.02952361, + "balance_loss_mlp": 1.0473187, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5529728217373517, + "language_loss": 0.77045631, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79238534, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 3505, + "time_per_iteration": 2.5408663749694824 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.02931666, + "balance_loss_mlp": 1.04786968, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9911708078552777, + "language_loss": 0.63704473, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65889871, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91015625, + "step": 3506, + "time_per_iteration": 2.564246892929077 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.02904439, + "balance_loss_mlp": 1.04773796, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.8633964271687153, + "language_loss": 0.81863034, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84050006, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3507, + "time_per_iteration": 2.6049435138702393 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.0299232, + "balance_loss_mlp": 1.04645514, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 2.0263301336255135, + "language_loss": 0.75496012, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77683949, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.93359375, + "step": 3508, + "time_per_iteration": 2.5366437435150146 + }, + { + "auxiliary_loss_clip": 0.01144539, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02395463, + "balance_loss_mlp": 1.04809749, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.1922875924351115, + "language_loss": 0.85395098, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87581778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3509, + "time_per_iteration": 2.4895167350769043 + }, + { + "auxiliary_loss_clip": 0.01146568, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.02503562, + "balance_loss_mlp": 1.04908204, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.5522473876542349, + "language_loss": 0.67803288, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69993746, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3510, + "time_per_iteration": 4.065294027328491 + }, + { + "auxiliary_loss_clip": 0.01143018, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02381003, + "balance_loss_mlp": 1.04653811, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.9784941086490475, + "language_loss": 0.7240749, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74591982, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96484375, + "step": 3511, + "time_per_iteration": 2.5701003074645996 + }, + { + "auxiliary_loss_clip": 0.01148402, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.05022192, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.3544542512902322, + "language_loss": 0.69737375, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71925306, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3512, + "time_per_iteration": 3.9019229412078857 + }, + { + "auxiliary_loss_clip": 0.01143526, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.04680824, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.6756724017558497, + "language_loss": 0.73159289, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.7535044, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.96875, + "step": 3513, + "time_per_iteration": 2.5643980503082275 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.02212906, + "balance_loss_mlp": 1.04916954, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.635076517146385, + "language_loss": 0.74235332, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76414299, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3514, + "time_per_iteration": 2.5240070819854736 + }, + { + "auxiliary_loss_clip": 0.01144119, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.02347541, + "balance_loss_mlp": 1.0482856, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7928371848293583, + "language_loss": 0.76707381, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78892195, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3515, + "time_per_iteration": 2.526527166366577 + }, + { + "auxiliary_loss_clip": 0.0114362, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.02517664, + "balance_loss_mlp": 1.04956555, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.8516547188762509, + "language_loss": 0.68242604, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70428967, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3516, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.01145197, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.02883935, + "balance_loss_mlp": 1.04901481, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.653683865815189, + "language_loss": 0.85012519, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87204921, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3517, + "time_per_iteration": 2.5080301761627197 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01965201, + "balance_loss_mlp": 1.04722667, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.26725319642869, + "language_loss": 0.62925792, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65104288, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3518, + "time_per_iteration": 2.5949900150299072 + }, + { + "auxiliary_loss_clip": 0.01142565, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04891765, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8284325952385483, + "language_loss": 0.88772321, + "learning_rate": 3.664006799041303e-06, + "loss": 0.90964293, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3519, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.03184235, + "balance_loss_mlp": 1.04866135, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.5988506078375424, + "language_loss": 0.81066215, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83259952, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3520, + "time_per_iteration": 2.5069239139556885 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02926779, + "balance_loss_mlp": 1.0469681, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.592359744312873, + "language_loss": 0.76163614, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78347969, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3521, + "time_per_iteration": 2.560037851333618 + }, + { + "auxiliary_loss_clip": 0.0113934, + "auxiliary_loss_mlp": 0.0104393, + "balance_loss_clip": 1.02842069, + "balance_loss_mlp": 1.04592443, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0717596449561024, + "language_loss": 0.75950933, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78134197, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.93359375, + "step": 3522, + "time_per_iteration": 2.4758715629577637 + }, + { + "auxiliary_loss_clip": 0.01141462, + "auxiliary_loss_mlp": 0.01049727, + "balance_loss_clip": 1.03176177, + "balance_loss_mlp": 1.04737353, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.026497436525855, + "language_loss": 0.70436251, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72627443, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3523, + "time_per_iteration": 2.5368640422821045 + }, + { + "auxiliary_loss_clip": 0.01140964, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.02786803, + "balance_loss_mlp": 1.04820895, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.216886450348082, + "language_loss": 0.76683456, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.7886939, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3524, + "time_per_iteration": 2.5932695865631104 + }, + { + "auxiliary_loss_clip": 0.01139634, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.04276347, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.020092904399728, + "language_loss": 0.81433582, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83615232, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3525, + "time_per_iteration": 2.5425641536712646 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04668331, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 2.1031950889850655, + "language_loss": 0.75104785, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77285308, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3526, + "time_per_iteration": 2.533210515975952 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.02466083, + "balance_loss_mlp": 1.04663801, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.9135764326712537, + "language_loss": 0.77385598, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79569542, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3527, + "time_per_iteration": 2.53898286819458 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.02797842, + "balance_loss_mlp": 1.0461328, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.981008674330079, + "language_loss": 0.78037727, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80223083, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3528, + "time_per_iteration": 2.5360231399536133 + }, + { + "auxiliary_loss_clip": 0.01138776, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.04611731, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7275367809487383, + "language_loss": 0.8170321, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83889693, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3529, + "time_per_iteration": 2.531228542327881 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.028234, + "balance_loss_mlp": 1.04647708, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.1603106904513547, + "language_loss": 0.76616383, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78802443, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3530, + "time_per_iteration": 2.5361740589141846 + }, + { + "auxiliary_loss_clip": 0.01136983, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.02593338, + "balance_loss_mlp": 1.0451746, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.3391242970409873, + "language_loss": 0.82978404, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85157299, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3531, + "time_per_iteration": 2.571411609649658 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04744506, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.416019676502894, + "language_loss": 0.73473567, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75654608, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.94140625, + "step": 3532, + "time_per_iteration": 2.473006248474121 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02485681, + "balance_loss_mlp": 1.04561734, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.7353898898315339, + "language_loss": 0.73855233, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76036394, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.92578125, + "step": 3533, + "time_per_iteration": 2.526780366897583 + }, + { + "auxiliary_loss_clip": 0.01140469, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.02833724, + "balance_loss_mlp": 1.04576015, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.8944995629732337, + "language_loss": 0.7098999, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73175949, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3534, + "time_per_iteration": 2.6947309970855713 + }, + { + "auxiliary_loss_clip": 0.01141409, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.02032161, + "balance_loss_mlp": 1.04669714, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9387778569542722, + "language_loss": 0.71567297, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73746949, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3535, + "time_per_iteration": 2.6022329330444336 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.0299238, + "balance_loss_mlp": 1.04549336, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 1.8756666540330442, + "language_loss": 0.7040931, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72592747, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 3536, + "time_per_iteration": 2.6005256175994873 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.02706444, + "balance_loss_mlp": 1.04512393, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9573194210103453, + "language_loss": 0.88217437, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90402472, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3537, + "time_per_iteration": 2.5565810203552246 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02425885, + "balance_loss_mlp": 1.0437026, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.957058885696691, + "language_loss": 0.80129743, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82304639, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3538, + "time_per_iteration": 2.5501785278320312 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.025653, + "balance_loss_mlp": 1.0446775, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.587715235485788, + "language_loss": 0.87131894, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89308405, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.90625, + "step": 3539, + "time_per_iteration": 2.5751259326934814 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02914476, + "balance_loss_mlp": 1.04718518, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.3040839486156184, + "language_loss": 0.57464051, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59648788, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3540, + "time_per_iteration": 2.4746458530426025 + }, + { + "auxiliary_loss_clip": 0.01140156, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.02330637, + "balance_loss_mlp": 1.04658604, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 5.8376417218282874, + "language_loss": 0.76062799, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78243208, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3541, + "time_per_iteration": 2.5111818313598633 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02517211, + "balance_loss_mlp": 1.04530454, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.9190227230034667, + "language_loss": 0.69458514, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71635908, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3542, + "time_per_iteration": 2.556300401687622 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01945567, + "balance_loss_mlp": 1.04443789, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.8172219669397587, + "language_loss": 0.75591409, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77760351, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 3543, + "time_per_iteration": 2.54424786567688 + }, + { + "auxiliary_loss_clip": 0.01138428, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04843175, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 2.1531603349332915, + "language_loss": 0.66787028, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68964195, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3544, + "time_per_iteration": 2.516359329223633 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.0245831, + "balance_loss_mlp": 1.04379654, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.9827170900636153, + "language_loss": 0.71089172, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73265821, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.92578125, + "step": 3545, + "time_per_iteration": 2.5377357006073 + }, + { + "auxiliary_loss_clip": 0.01138848, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_clip": 1.03095567, + "balance_loss_mlp": 1.04571509, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.730364240275379, + "language_loss": 0.72334421, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74519908, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9296875, + "step": 3546, + "time_per_iteration": 2.5640652179718018 + }, + { + "auxiliary_loss_clip": 0.0113929, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02695596, + "balance_loss_mlp": 1.0467453, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.889324350950523, + "language_loss": 0.80698627, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82881093, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3547, + "time_per_iteration": 2.527398109436035 + }, + { + "auxiliary_loss_clip": 0.01140759, + "auxiliary_loss_mlp": 0.0104395, + "balance_loss_clip": 1.02702212, + "balance_loss_mlp": 1.04538703, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 3.232228952830713, + "language_loss": 0.74496448, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76681155, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3548, + "time_per_iteration": 2.5493834018707275 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.02719641, + "balance_loss_mlp": 1.04663396, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.0441969792992265, + "language_loss": 0.74135804, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76323086, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3549, + "time_per_iteration": 2.514817476272583 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.03063631, + "balance_loss_mlp": 1.04963064, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6981522694050752, + "language_loss": 0.80653727, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82842982, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3550, + "time_per_iteration": 2.541501045227051 + }, + { + "auxiliary_loss_clip": 0.01136887, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.02883255, + "balance_loss_mlp": 1.04706621, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.615115943492657, + "language_loss": 0.88341218, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90522182, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8984375, + "step": 3551, + "time_per_iteration": 2.5310463905334473 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.02887464, + "balance_loss_mlp": 1.04430258, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.1215125327645152, + "language_loss": 0.83415043, + "learning_rate": 3.656842449140983e-06, + "loss": 0.8559624, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3552, + "time_per_iteration": 3.974120616912842 + }, + { + "auxiliary_loss_clip": 0.0113546, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_clip": 1.03164101, + "balance_loss_mlp": 1.04522753, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7556537525349103, + "language_loss": 0.76692683, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78876388, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 3553, + "time_per_iteration": 3.964289903640747 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.02520156, + "balance_loss_mlp": 1.04556942, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.6502841430946371, + "language_loss": 0.72946119, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75122207, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 3554, + "time_per_iteration": 2.5141818523406982 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02121508, + "balance_loss_mlp": 1.04672861, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9371755733444218, + "language_loss": 0.6745261, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69627374, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.91015625, + "step": 3555, + "time_per_iteration": 2.6116089820861816 + }, + { + "auxiliary_loss_clip": 0.01138406, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04564714, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.2550763051095752, + "language_loss": 0.64778429, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66956222, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3556, + "time_per_iteration": 2.553746223449707 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02893996, + "balance_loss_mlp": 1.04656768, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.6295299556205536, + "language_loss": 0.72333252, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74518251, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3557, + "time_per_iteration": 2.6562533378601074 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.02189136, + "balance_loss_mlp": 1.04716706, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6722734443717013, + "language_loss": 0.67139357, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6932168, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3558, + "time_per_iteration": 2.5435290336608887 + }, + { + "auxiliary_loss_clip": 0.01142773, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.01943386, + "balance_loss_mlp": 1.04542494, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8839208997443517, + "language_loss": 0.79702216, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81881285, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3559, + "time_per_iteration": 2.5535330772399902 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04436731, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6653874224583467, + "language_loss": 0.67549068, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69730377, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9296875, + "step": 3560, + "time_per_iteration": 2.5241451263427734 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.04846883, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8721878156787213, + "language_loss": 0.72995424, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75179613, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3561, + "time_per_iteration": 2.5678720474243164 + }, + { + "auxiliary_loss_clip": 0.01142897, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02817965, + "balance_loss_mlp": 1.04897678, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.2783713689110243, + "language_loss": 0.77110738, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79298586, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3562, + "time_per_iteration": 2.4598803520202637 + }, + { + "auxiliary_loss_clip": 0.01140561, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.02160454, + "balance_loss_mlp": 1.04795694, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5929440625910447, + "language_loss": 0.84534913, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.867136, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.92578125, + "step": 3563, + "time_per_iteration": 2.5654757022857666 + }, + { + "auxiliary_loss_clip": 0.0114087, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.02227342, + "balance_loss_mlp": 1.04757166, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.6134338415520206, + "language_loss": 0.76727796, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78907001, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.93359375, + "step": 3564, + "time_per_iteration": 2.591064214706421 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.02560401, + "balance_loss_mlp": 1.0467248, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.880454163642384, + "language_loss": 0.88260084, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90440416, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3565, + "time_per_iteration": 2.571242094039917 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01009282, + "balance_loss_clip": 1.00739813, + "balance_loss_mlp": 1.0192101, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8403524328969202, + "language_loss": 0.52300179, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54360026, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3125, + "step": 3566, + "time_per_iteration": 3.055588722229004 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02093637, + "balance_loss_mlp": 1.04677701, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.91490691342046, + "language_loss": 0.67412555, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69585192, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3567, + "time_per_iteration": 2.5511529445648193 + }, + { + "auxiliary_loss_clip": 0.01135888, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.02630615, + "balance_loss_mlp": 1.04691041, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6974661731729381, + "language_loss": 0.74437779, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.7661534, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 3568, + "time_per_iteration": 2.613090753555298 + }, + { + "auxiliary_loss_clip": 0.01137867, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.02710819, + "balance_loss_mlp": 1.04578757, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.7479940521784256, + "language_loss": 0.77864397, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80045569, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3569, + "time_per_iteration": 2.567439317703247 + }, + { + "auxiliary_loss_clip": 0.01147794, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.01981413, + "balance_loss_mlp": 1.05039883, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.3364918832975317, + "language_loss": 0.69533777, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.71719933, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3570, + "time_per_iteration": 2.489550828933716 + }, + { + "auxiliary_loss_clip": 0.01144243, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.02703631, + "balance_loss_mlp": 1.0480299, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.465398793786977, + "language_loss": 0.78108835, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80296826, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3571, + "time_per_iteration": 2.527509927749634 + }, + { + "auxiliary_loss_clip": 0.01143428, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.02556705, + "balance_loss_mlp": 1.0501976, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.5347995603010767, + "language_loss": 0.82851684, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85038722, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3572, + "time_per_iteration": 2.491955280303955 + }, + { + "auxiliary_loss_clip": 0.01144597, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.02248025, + "balance_loss_mlp": 1.04700291, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.35018592277076, + "language_loss": 0.64916813, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67100847, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3573, + "time_per_iteration": 2.5238969326019287 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.02481413, + "balance_loss_mlp": 1.04417133, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.2164535787006705, + "language_loss": 0.75577438, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77751815, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3574, + "time_per_iteration": 2.5497734546661377 + }, + { + "auxiliary_loss_clip": 0.01137499, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.01752853, + "balance_loss_mlp": 1.04568887, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.959683075701339, + "language_loss": 0.72380054, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74552631, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91796875, + "step": 3575, + "time_per_iteration": 2.539255142211914 + }, + { + "auxiliary_loss_clip": 0.01141362, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.02097976, + "balance_loss_mlp": 1.04890776, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.6473570004326006, + "language_loss": 0.68102455, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70280713, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3576, + "time_per_iteration": 2.515245199203491 + }, + { + "auxiliary_loss_clip": 0.01144679, + "auxiliary_loss_mlp": 0.01041063, + "balance_loss_clip": 1.02347922, + "balance_loss_mlp": 1.04820943, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.1450103743023936, + "language_loss": 0.88840854, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91026592, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3577, + "time_per_iteration": 2.4426753520965576 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01006787, + "balance_loss_clip": 1.00466526, + "balance_loss_mlp": 1.02252448, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8177210285410575, + "language_loss": 0.56242883, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.5830456, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32421875, + "step": 3578, + "time_per_iteration": 3.0434820652008057 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.02345788, + "balance_loss_mlp": 1.04957211, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6812319537870581, + "language_loss": 0.88500881, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90683413, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3579, + "time_per_iteration": 2.4646458625793457 + }, + { + "auxiliary_loss_clip": 0.01140846, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.04618824, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.7668055337606152, + "language_loss": 0.78238297, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80421615, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3580, + "time_per_iteration": 2.5029854774475098 + }, + { + "auxiliary_loss_clip": 0.01138764, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.02557576, + "balance_loss_mlp": 1.04757452, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.7955176576656944, + "language_loss": 0.73129165, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75310302, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3581, + "time_per_iteration": 2.503103733062744 + }, + { + "auxiliary_loss_clip": 0.01137091, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.02723205, + "balance_loss_mlp": 1.04665411, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.610409860459302, + "language_loss": 0.70739609, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72922659, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.90234375, + "step": 3582, + "time_per_iteration": 2.4840197563171387 + }, + { + "auxiliary_loss_clip": 0.01137402, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.04602027, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.8570718584923633, + "language_loss": 0.84140432, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86319172, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3583, + "time_per_iteration": 2.4435312747955322 + }, + { + "auxiliary_loss_clip": 0.01143933, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.02834046, + "balance_loss_mlp": 1.04859185, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 3.180305067245919, + "language_loss": 0.83226246, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.8541553, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3584, + "time_per_iteration": 2.521476984024048 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.02816272, + "balance_loss_mlp": 1.04518461, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.0358477693345667, + "language_loss": 0.90233314, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92416549, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.91796875, + "step": 3585, + "time_per_iteration": 2.464745283126831 + }, + { + "auxiliary_loss_clip": 0.01140925, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04832685, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.8296186032289348, + "language_loss": 0.74414444, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76597619, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3586, + "time_per_iteration": 2.5062146186828613 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.02668393, + "balance_loss_mlp": 1.04796743, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.1680236591426416, + "language_loss": 0.83055526, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85239077, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3587, + "time_per_iteration": 2.4784295558929443 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.02438986, + "balance_loss_mlp": 1.04664946, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.8176747371086701, + "language_loss": 0.75756669, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77937388, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3588, + "time_per_iteration": 2.5896053314208984 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02294254, + "balance_loss_mlp": 1.04534698, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8272464683057401, + "language_loss": 0.81006658, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83183837, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3589, + "time_per_iteration": 2.540090799331665 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.02341199, + "balance_loss_mlp": 1.04792953, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.6129530472479154, + "language_loss": 0.72591126, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74772674, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.94140625, + "step": 3590, + "time_per_iteration": 2.5113861560821533 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.02769351, + "balance_loss_mlp": 1.04830956, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.0133132975130477, + "language_loss": 0.83914638, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86106646, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96875, + "step": 3591, + "time_per_iteration": 2.488309621810913 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03167534, + "balance_loss_mlp": 1.04884136, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.271326779903827, + "language_loss": 0.69294131, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71490723, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3592, + "time_per_iteration": 2.571373462677002 + }, + { + "auxiliary_loss_clip": 0.01143219, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.03150403, + "balance_loss_mlp": 1.04881072, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.3999192225546677, + "language_loss": 0.84150124, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86343014, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3593, + "time_per_iteration": 2.4590611457824707 + }, + { + "auxiliary_loss_clip": 0.01144804, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03210783, + "balance_loss_mlp": 1.04839182, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.850380650061706, + "language_loss": 0.75163305, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77357584, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3594, + "time_per_iteration": 3.9338901042938232 + }, + { + "auxiliary_loss_clip": 0.01139476, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.02666509, + "balance_loss_mlp": 1.04763508, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 2.0680180645872057, + "language_loss": 0.80541027, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82724094, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3595, + "time_per_iteration": 3.9857921600341797 + }, + { + "auxiliary_loss_clip": 0.01146272, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.0259887, + "balance_loss_mlp": 1.04883027, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.3330392864683347, + "language_loss": 0.78089929, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80279487, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.97265625, + "step": 3596, + "time_per_iteration": 2.4515480995178223 + }, + { + "auxiliary_loss_clip": 0.01138472, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.03006268, + "balance_loss_mlp": 1.04786897, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.9545740457841054, + "language_loss": 0.83011472, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85196126, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3597, + "time_per_iteration": 2.504703998565674 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.02756798, + "balance_loss_mlp": 1.05029655, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5849845027976412, + "language_loss": 0.80171728, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82361513, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3598, + "time_per_iteration": 2.487013101577759 + }, + { + "auxiliary_loss_clip": 0.0114385, + "auxiliary_loss_mlp": 0.01045551, + "balance_loss_clip": 1.02745485, + "balance_loss_mlp": 1.0476619, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.8175927270691912, + "language_loss": 0.82054996, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.842444, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3599, + "time_per_iteration": 2.5515315532684326 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.0253613, + "balance_loss_mlp": 1.04831243, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 3.186477441139726, + "language_loss": 0.7654863, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78729272, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3600, + "time_per_iteration": 2.5067033767700195 + }, + { + "auxiliary_loss_clip": 0.01139528, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.02842712, + "balance_loss_mlp": 1.04657555, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.9514188507385115, + "language_loss": 0.80026001, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82209218, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.9296875, + "step": 3601, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.01142747, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.0306437, + "balance_loss_mlp": 1.04938436, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.8096424478422806, + "language_loss": 0.83358335, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85548466, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3602, + "time_per_iteration": 2.525151491165161 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.03065276, + "balance_loss_mlp": 1.04670155, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.6489882186888527, + "language_loss": 0.74271673, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76460266, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3603, + "time_per_iteration": 2.5083842277526855 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.02723289, + "balance_loss_mlp": 1.05022252, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.5478742891076147, + "language_loss": 0.73956323, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76139832, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3604, + "time_per_iteration": 2.5100204944610596 + }, + { + "auxiliary_loss_clip": 0.01137594, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.02598965, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 2.2268823896980376, + "language_loss": 0.80375803, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82556069, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.890625, + "step": 3605, + "time_per_iteration": 2.5182228088378906 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01010449, + "balance_loss_clip": 1.0086962, + "balance_loss_mlp": 1.02975249, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6948121220218867, + "language_loss": 0.58376318, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60450989, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.34375, + "step": 3606, + "time_per_iteration": 3.1655373573303223 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.02921534, + "balance_loss_mlp": 1.04939568, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.6754398361548613, + "language_loss": 0.73210037, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75402147, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3607, + "time_per_iteration": 2.508920431137085 + }, + { + "auxiliary_loss_clip": 0.01146221, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_clip": 1.02995718, + "balance_loss_mlp": 1.04935443, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.5718647894236053, + "language_loss": 0.76626337, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78820717, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3608, + "time_per_iteration": 2.440258502960205 + }, + { + "auxiliary_loss_clip": 0.01144868, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.02985787, + "balance_loss_mlp": 1.04866827, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.796333172920123, + "language_loss": 0.74395084, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76586002, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3609, + "time_per_iteration": 2.5326688289642334 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.04871368, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.781486329059154, + "language_loss": 0.88848329, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91040266, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3610, + "time_per_iteration": 2.4611029624938965 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.05045652, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.036787917991119, + "language_loss": 0.77587712, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79770797, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3611, + "time_per_iteration": 2.5187723636627197 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.02829766, + "balance_loss_mlp": 1.04609489, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.067133307741882, + "language_loss": 0.63197911, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65378946, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3612, + "time_per_iteration": 2.4585959911346436 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.02504194, + "balance_loss_mlp": 1.04799449, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.9312736490377453, + "language_loss": 0.75120652, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77304518, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9375, + "step": 3613, + "time_per_iteration": 2.4866983890533447 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.02900767, + "balance_loss_mlp": 1.04560208, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 3.0184875495721, + "language_loss": 0.70767504, + "learning_rate": 3.643197365185261e-06, + "loss": 0.72950327, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 3614, + "time_per_iteration": 2.4454689025878906 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_clip": 1.0288837, + "balance_loss_mlp": 1.0491401, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.8064523730299737, + "language_loss": 0.7314586, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75334036, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.92578125, + "step": 3615, + "time_per_iteration": 2.488711357116699 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02832997, + "balance_loss_mlp": 1.04751146, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.7876016160510377, + "language_loss": 0.90045536, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92239082, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3616, + "time_per_iteration": 2.4552054405212402 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02356279, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.4503731233397383, + "language_loss": 0.8111589, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83300173, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3617, + "time_per_iteration": 2.465254068374634 + }, + { + "auxiliary_loss_clip": 0.01143954, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.02928162, + "balance_loss_mlp": 1.04851139, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.7784831572545423, + "language_loss": 0.75509727, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77699506, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3618, + "time_per_iteration": 2.5263705253601074 + }, + { + "auxiliary_loss_clip": 0.0114255, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.03103614, + "balance_loss_mlp": 1.04738426, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.9247647214638754, + "language_loss": 0.69221723, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71413535, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3619, + "time_per_iteration": 2.4615654945373535 + }, + { + "auxiliary_loss_clip": 0.01145954, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.02723491, + "balance_loss_mlp": 1.04906762, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.7662634429670958, + "language_loss": 0.78337491, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80528164, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3620, + "time_per_iteration": 2.4954700469970703 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.02116966, + "balance_loss_mlp": 1.04363799, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.0129000326388695, + "language_loss": 0.79769373, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81940717, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3621, + "time_per_iteration": 2.490427255630493 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04595852, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.7548460288059653, + "language_loss": 0.87967801, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90146828, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3622, + "time_per_iteration": 2.484462022781372 + }, + { + "auxiliary_loss_clip": 0.01142961, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.02067459, + "balance_loss_mlp": 1.04766297, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 4.811459611972859, + "language_loss": 0.76945633, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79128814, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.94921875, + "step": 3623, + "time_per_iteration": 2.4476547241210938 + }, + { + "auxiliary_loss_clip": 0.0114403, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.02992439, + "balance_loss_mlp": 1.04891419, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.1152987510548615, + "language_loss": 0.84886312, + "learning_rate": 3.640974061218741e-06, + "loss": 0.8707844, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3624, + "time_per_iteration": 2.444913387298584 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.010571, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.0487287, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.345969751242133, + "language_loss": 0.77035248, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79236794, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3625, + "time_per_iteration": 2.4511115550994873 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01000008, + "balance_loss_clip": 0.99836272, + "balance_loss_mlp": 1.02361774, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8233389824181596, + "language_loss": 0.60720766, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62780088, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.35546875, + "step": 3626, + "time_per_iteration": 3.21004319190979 + }, + { + "auxiliary_loss_clip": 0.0114194, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.02858984, + "balance_loss_mlp": 1.04572678, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8978511257882154, + "language_loss": 0.90608853, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92797917, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3627, + "time_per_iteration": 2.4744250774383545 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.02346826, + "balance_loss_mlp": 1.04541492, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8495097769686537, + "language_loss": 0.73612916, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75792623, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3628, + "time_per_iteration": 2.4595446586608887 + }, + { + "auxiliary_loss_clip": 0.01137064, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02232444, + "balance_loss_mlp": 1.04432046, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.99633175048199, + "language_loss": 0.76800162, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.78976429, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3629, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01140004, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.02514172, + "balance_loss_mlp": 1.04701388, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5547294213075904, + "language_loss": 0.71320152, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73501503, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3630, + "time_per_iteration": 2.608846426010132 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.0277338, + "balance_loss_mlp": 1.04635286, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8110131954886999, + "language_loss": 0.76331747, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78508776, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3631, + "time_per_iteration": 2.53765869140625 + }, + { + "auxiliary_loss_clip": 0.01138964, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.0237397, + "balance_loss_mlp": 1.0455693, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.0710075205659906, + "language_loss": 0.74879777, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77058685, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3632, + "time_per_iteration": 2.484896421432495 + }, + { + "auxiliary_loss_clip": 0.01136054, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.04511309, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.966664682342333, + "language_loss": 0.83337629, + "learning_rate": 3.638967767095249e-06, + "loss": 0.8550964, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.91015625, + "step": 3633, + "time_per_iteration": 2.4721779823303223 + }, + { + "auxiliary_loss_clip": 0.01136294, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.03228879, + "balance_loss_mlp": 1.04592657, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.8655293845238095, + "language_loss": 0.81782126, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83966839, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3634, + "time_per_iteration": 2.5514795780181885 + }, + { + "auxiliary_loss_clip": 0.01144011, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.02468133, + "balance_loss_mlp": 1.04863131, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.010090632845536, + "language_loss": 0.75077927, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77262932, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.953125, + "step": 3635, + "time_per_iteration": 4.07889199256897 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01053987, + "balance_loss_clip": 1.03775024, + "balance_loss_mlp": 1.04744601, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.2167396678675155, + "language_loss": 0.87881035, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90072685, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3636, + "time_per_iteration": 3.9134533405303955 + }, + { + "auxiliary_loss_clip": 0.01138959, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.03406608, + "balance_loss_mlp": 1.0456109, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 1.9800006249435054, + "language_loss": 0.75948632, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78138912, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3637, + "time_per_iteration": 2.5531604290008545 + }, + { + "auxiliary_loss_clip": 0.01143812, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.02445328, + "balance_loss_mlp": 1.04728055, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 4.376345077988984, + "language_loss": 0.89677018, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91863406, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3638, + "time_per_iteration": 2.435544967651367 + }, + { + "auxiliary_loss_clip": 0.01140476, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.03377736, + "balance_loss_mlp": 1.04854274, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.382131601644944, + "language_loss": 0.89958721, + "learning_rate": 3.637627440557275e-06, + "loss": 0.9214983, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3639, + "time_per_iteration": 2.448150634765625 + }, + { + "auxiliary_loss_clip": 0.01138473, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.02972686, + "balance_loss_mlp": 1.04632282, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7796744672676124, + "language_loss": 0.79038727, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81222755, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3640, + "time_per_iteration": 2.544577121734619 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01051502, + "balance_loss_clip": 1.03291786, + "balance_loss_mlp": 1.05100346, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.046383525913898, + "language_loss": 0.72049212, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74243474, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.91796875, + "step": 3641, + "time_per_iteration": 2.465439558029175 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.05203855, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.4771917366671, + "language_loss": 0.80913448, + "learning_rate": 3.63695643883745e-06, + "loss": 0.8309828, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3642, + "time_per_iteration": 2.4598801136016846 + }, + { + "auxiliary_loss_clip": 0.01144439, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.02319944, + "balance_loss_mlp": 1.05089164, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.0352379603627684, + "language_loss": 0.71573192, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73758006, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3643, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01144262, + "auxiliary_loss_mlp": 0.01048332, + "balance_loss_clip": 1.03115392, + "balance_loss_mlp": 1.05041492, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.9224514767679763, + "language_loss": 0.68172711, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70365304, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3644, + "time_per_iteration": 2.721107244491577 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.0252583, + "balance_loss_mlp": 1.04905653, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.1869112310362504, + "language_loss": 0.77744782, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79931343, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9609375, + "step": 3645, + "time_per_iteration": 2.4838709831237793 + }, + { + "auxiliary_loss_clip": 0.01140139, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.03275371, + "balance_loss_mlp": 1.04988873, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.575077237748942, + "language_loss": 0.82405865, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84594363, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90234375, + "step": 3646, + "time_per_iteration": 2.467958927154541 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02050591, + "balance_loss_mlp": 1.04901123, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7225223193128734, + "language_loss": 0.83016759, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85191214, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3647, + "time_per_iteration": 2.4670159816741943 + }, + { + "auxiliary_loss_clip": 0.01137396, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.02991438, + "balance_loss_mlp": 1.04734278, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.5879018059409027, + "language_loss": 0.72555232, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74738657, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3648, + "time_per_iteration": 2.5572352409362793 + }, + { + "auxiliary_loss_clip": 0.01140287, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04563618, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3666125536095612, + "language_loss": 0.74363017, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76548404, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3649, + "time_per_iteration": 2.4465692043304443 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.02869856, + "balance_loss_mlp": 1.04609215, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.0558746559562953, + "language_loss": 0.86408567, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88586134, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3650, + "time_per_iteration": 2.4408226013183594 + }, + { + "auxiliary_loss_clip": 0.01137285, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.02747929, + "balance_loss_mlp": 1.04549015, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.0425834927064934, + "language_loss": 0.83693743, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85874897, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3651, + "time_per_iteration": 2.502694845199585 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02941298, + "balance_loss_mlp": 1.04595184, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8702009414404626, + "language_loss": 0.74629313, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76812911, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3652, + "time_per_iteration": 2.4422640800476074 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01011234, + "balance_loss_clip": 1.00946999, + "balance_loss_mlp": 1.0194056, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7344385056765022, + "language_loss": 0.51548386, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53612262, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.33203125, + "step": 3653, + "time_per_iteration": 3.0743935108184814 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.03215361, + "balance_loss_mlp": 1.05115473, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.781801507589209, + "language_loss": 0.75256276, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77447224, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3654, + "time_per_iteration": 2.4826300144195557 + }, + { + "auxiliary_loss_clip": 0.01143131, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.02499056, + "balance_loss_mlp": 1.04988194, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.9986760770887892, + "language_loss": 0.72757828, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74942386, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3655, + "time_per_iteration": 2.494662284851074 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.02860177, + "balance_loss_mlp": 1.04802227, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6963533722566047, + "language_loss": 0.80971813, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83156729, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3656, + "time_per_iteration": 2.465020179748535 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.02267933, + "balance_loss_mlp": 1.05085039, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.205234752003223, + "language_loss": 0.84668207, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86849183, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3657, + "time_per_iteration": 2.4626548290252686 + }, + { + "auxiliary_loss_clip": 0.01138622, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.02126312, + "balance_loss_mlp": 1.0460434, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.714181577212399, + "language_loss": 0.80485702, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.8266257, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3658, + "time_per_iteration": 2.492835521697998 + }, + { + "auxiliary_loss_clip": 0.01053481, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.00065756, + "balance_loss_mlp": 1.02029002, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.8995084923077876, + "language_loss": 0.58224851, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60280788, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.33203125, + "step": 3659, + "time_per_iteration": 3.1709213256835938 + }, + { + "auxiliary_loss_clip": 0.01140235, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.04958415, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.4575828715719177, + "language_loss": 0.74535513, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76715136, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3660, + "time_per_iteration": 2.474397897720337 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.02336597, + "balance_loss_mlp": 1.04723859, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 2.0332694306983723, + "language_loss": 0.81225419, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83404779, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91796875, + "step": 3661, + "time_per_iteration": 2.4926669597625732 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.02639949, + "balance_loss_mlp": 1.04773009, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.8682139743879211, + "language_loss": 0.73236209, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75417411, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3662, + "time_per_iteration": 2.5111234188079834 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.03333092, + "balance_loss_mlp": 1.05132473, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6440107639340105, + "language_loss": 0.77800119, + "learning_rate": 3.632243797111929e-06, + "loss": 0.79989552, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3663, + "time_per_iteration": 2.485520601272583 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.02581656, + "balance_loss_mlp": 1.05125535, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 3.566897500342904, + "language_loss": 0.80484056, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8267172, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3664, + "time_per_iteration": 2.4827098846435547 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01042617, + "balance_loss_clip": 1.02354348, + "balance_loss_mlp": 1.04959095, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.530665000734818, + "language_loss": 0.76296824, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78485775, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.96875, + "step": 3665, + "time_per_iteration": 2.5118229389190674 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_clip": 1.0282042, + "balance_loss_mlp": 1.04779172, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.7337119989610468, + "language_loss": 0.97959125, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00143182, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3666, + "time_per_iteration": 2.4461512565612793 + }, + { + "auxiliary_loss_clip": 0.01136729, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.02241421, + "balance_loss_mlp": 1.04582953, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 2.115803047817727, + "language_loss": 0.80494016, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82670087, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3667, + "time_per_iteration": 2.65198016166687 + }, + { + "auxiliary_loss_clip": 0.01144733, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.02776945, + "balance_loss_mlp": 1.04882097, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.916720089378095, + "language_loss": 0.77463895, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79655218, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3668, + "time_per_iteration": 2.459141254425049 + }, + { + "auxiliary_loss_clip": 0.0114207, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02568614, + "balance_loss_mlp": 1.05058837, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.730318389149699, + "language_loss": 0.71514869, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73699689, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3669, + "time_per_iteration": 2.550732135772705 + }, + { + "auxiliary_loss_clip": 0.01139227, + "auxiliary_loss_mlp": 0.01037839, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.04615474, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.0994504177928826, + "language_loss": 0.85294032, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87471098, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3670, + "time_per_iteration": 2.4727606773376465 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.02360499, + "balance_loss_mlp": 1.05130565, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.775856591734502, + "language_loss": 0.76796275, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.789846, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3671, + "time_per_iteration": 2.613104820251465 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.01758265, + "balance_loss_mlp": 1.0487864, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.8820912362302202, + "language_loss": 0.80472648, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82648075, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3672, + "time_per_iteration": 2.4365992546081543 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.03129566, + "balance_loss_mlp": 1.05145025, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8912849075471436, + "language_loss": 0.736193, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75811654, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3673, + "time_per_iteration": 2.4908931255340576 + }, + { + "auxiliary_loss_clip": 0.01145514, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02064395, + "balance_loss_mlp": 1.05221379, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9375944290288487, + "language_loss": 0.76505005, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78688282, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3674, + "time_per_iteration": 2.569312572479248 + }, + { + "auxiliary_loss_clip": 0.01142786, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.02518344, + "balance_loss_mlp": 1.05025005, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.0287396146216055, + "language_loss": 0.74786556, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76972854, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.92578125, + "step": 3675, + "time_per_iteration": 2.4762706756591797 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.02395034, + "balance_loss_mlp": 1.0473659, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.7527405009289938, + "language_loss": 0.80050498, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82232398, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3676, + "time_per_iteration": 2.5846786499023438 + }, + { + "auxiliary_loss_clip": 0.0114147, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.03106666, + "balance_loss_mlp": 1.0474596, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.974355382670518, + "language_loss": 0.75501895, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77690685, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3677, + "time_per_iteration": 4.02753758430481 + }, + { + "auxiliary_loss_clip": 0.01135837, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.02455878, + "balance_loss_mlp": 1.0449332, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 2.0397766719275494, + "language_loss": 0.83412457, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85589325, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3678, + "time_per_iteration": 3.9455032348632812 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.02903211, + "balance_loss_mlp": 1.04866314, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.7724652071984504, + "language_loss": 0.89272189, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91459215, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3679, + "time_per_iteration": 2.548166036605835 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_clip": 1.03517246, + "balance_loss_mlp": 1.04887235, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.4577897330130773, + "language_loss": 0.86718571, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88914388, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3680, + "time_per_iteration": 2.468712329864502 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.02809739, + "balance_loss_mlp": 1.05175805, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.0752123015423556, + "language_loss": 0.81897914, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84083802, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3681, + "time_per_iteration": 2.532210350036621 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.02076972, + "balance_loss_mlp": 1.04784071, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.44274183004677, + "language_loss": 0.79908317, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82081306, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 3682, + "time_per_iteration": 2.491135358810425 + }, + { + "auxiliary_loss_clip": 0.01140313, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04739022, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 2.2064811404605376, + "language_loss": 0.77283889, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79466248, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 3683, + "time_per_iteration": 2.503041982650757 + }, + { + "auxiliary_loss_clip": 0.01141417, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.02824235, + "balance_loss_mlp": 1.04623342, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.114071962716483, + "language_loss": 0.72779894, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74966961, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3684, + "time_per_iteration": 2.521495819091797 + }, + { + "auxiliary_loss_clip": 0.01142849, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.02598643, + "balance_loss_mlp": 1.05060613, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.9389187138945425, + "language_loss": 0.80108052, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82294679, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3685, + "time_per_iteration": 2.436958074569702 + }, + { + "auxiliary_loss_clip": 0.01135153, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.02504683, + "balance_loss_mlp": 1.04634571, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.5568750132404718, + "language_loss": 0.87128556, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89303845, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 3686, + "time_per_iteration": 2.5519070625305176 + }, + { + "auxiliary_loss_clip": 0.01138026, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.02552581, + "balance_loss_mlp": 1.04762685, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.942015126167962, + "language_loss": 0.77953136, + "learning_rate": 3.626824502298707e-06, + "loss": 0.8013379, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3687, + "time_per_iteration": 2.495084285736084 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01048705, + "balance_loss_clip": 1.03085971, + "balance_loss_mlp": 1.05057812, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8313314390802422, + "language_loss": 0.84722549, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86917698, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3688, + "time_per_iteration": 2.5029165744781494 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.02897787, + "balance_loss_mlp": 1.05005932, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.7913489877281905, + "language_loss": 0.81395769, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83588976, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3689, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.0114216, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.02758622, + "balance_loss_mlp": 1.04985952, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 2.5504206662352082, + "language_loss": 0.70040542, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72227693, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3690, + "time_per_iteration": 2.5005807876586914 + }, + { + "auxiliary_loss_clip": 0.01145048, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.04890513, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.7318147752747124, + "language_loss": 0.72394359, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74577713, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3691, + "time_per_iteration": 2.4835989475250244 + }, + { + "auxiliary_loss_clip": 0.01145815, + "auxiliary_loss_mlp": 0.01049746, + "balance_loss_clip": 1.03169739, + "balance_loss_mlp": 1.05317688, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.1843836481793057, + "language_loss": 0.71611524, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73807085, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.92578125, + "step": 3692, + "time_per_iteration": 2.515230655670166 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01044658, + "balance_loss_clip": 1.02750337, + "balance_loss_mlp": 1.05008483, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 2.7650002202849113, + "language_loss": 0.87580657, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89772147, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.96875, + "step": 3693, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.02086258, + "balance_loss_mlp": 1.04947054, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 3.031177285152565, + "language_loss": 0.85307622, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87482512, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.890625, + "step": 3694, + "time_per_iteration": 2.4828481674194336 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9517253418741858, + "language_loss": 0.69055748, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71244752, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 3695, + "time_per_iteration": 2.49957537651062 + }, + { + "auxiliary_loss_clip": 0.01141491, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02438951, + "balance_loss_mlp": 1.05095696, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4867456423055678, + "language_loss": 0.71710318, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73891842, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 3696, + "time_per_iteration": 2.5991299152374268 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.02387977, + "balance_loss_mlp": 1.0483942, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.901791440824732, + "language_loss": 0.87694812, + "learning_rate": 3.624555968803217e-06, + "loss": 0.8987658, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3697, + "time_per_iteration": 2.524841547012329 + }, + { + "auxiliary_loss_clip": 0.01134138, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.03020072, + "balance_loss_mlp": 1.04646909, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.985465494359005, + "language_loss": 0.66109681, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68289793, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3698, + "time_per_iteration": 2.6806552410125732 + }, + { + "auxiliary_loss_clip": 0.01143188, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.049245, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9701476357110561, + "language_loss": 0.82699466, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84881532, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9375, + "step": 3699, + "time_per_iteration": 2.620795965194702 + }, + { + "auxiliary_loss_clip": 0.01141189, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02789021, + "balance_loss_mlp": 1.04960978, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6593732889446324, + "language_loss": 0.79488564, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81674713, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3700, + "time_per_iteration": 2.4886739253997803 + }, + { + "auxiliary_loss_clip": 0.01148421, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_clip": 1.02639139, + "balance_loss_mlp": 1.05154204, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 7.082418544009014, + "language_loss": 0.72063768, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74257213, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96875, + "step": 3701, + "time_per_iteration": 2.7293899059295654 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.02520323, + "balance_loss_mlp": 1.04706395, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.9269634413479926, + "language_loss": 0.79704928, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81886196, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3702, + "time_per_iteration": 2.5527849197387695 + }, + { + "auxiliary_loss_clip": 0.01132932, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02487028, + "balance_loss_mlp": 1.04518211, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 2.7410709876553447, + "language_loss": 0.78632712, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80807453, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 3703, + "time_per_iteration": 2.4955005645751953 + }, + { + "auxiliary_loss_clip": 0.01140692, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.01810527, + "balance_loss_mlp": 1.0468421, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.8479834568020117, + "language_loss": 0.74212444, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7639066, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9375, + "step": 3704, + "time_per_iteration": 2.5000903606414795 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.02618146, + "balance_loss_mlp": 1.05030012, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.7361108874663713, + "language_loss": 0.64372134, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66553271, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3705, + "time_per_iteration": 2.6993744373321533 + }, + { + "auxiliary_loss_clip": 0.01064369, + "auxiliary_loss_mlp": 0.01006302, + "balance_loss_clip": 1.00454926, + "balance_loss_mlp": 1.03098035, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.353184132187748, + "language_loss": 0.65301311, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67371976, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.33398438, + "step": 3706, + "time_per_iteration": 2.9832844734191895 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02076256, + "balance_loss_mlp": 1.0461061, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 3.09427451037038, + "language_loss": 0.80608439, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82783049, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91796875, + "step": 3707, + "time_per_iteration": 2.5236454010009766 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.02184916, + "balance_loss_mlp": 1.04706407, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.0318896185848057, + "language_loss": 0.78124011, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80301505, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3708, + "time_per_iteration": 2.5254104137420654 + }, + { + "auxiliary_loss_clip": 0.01142891, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.02291107, + "balance_loss_mlp": 1.04897153, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.913582269302705, + "language_loss": 0.79989487, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82172012, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3709, + "time_per_iteration": 2.5528371334075928 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.02913201, + "balance_loss_mlp": 1.04580092, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.062693768306912, + "language_loss": 0.68752408, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70937693, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3710, + "time_per_iteration": 2.511275053024292 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.0221858, + "balance_loss_mlp": 1.04812646, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.3083581079415216, + "language_loss": 0.90696692, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92880082, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3711, + "time_per_iteration": 2.4757487773895264 + }, + { + "auxiliary_loss_clip": 0.01138091, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.03406, + "balance_loss_mlp": 1.04603434, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.758927620438821, + "language_loss": 0.89628232, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91818309, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.921875, + "step": 3712, + "time_per_iteration": 2.3870105743408203 + }, + { + "auxiliary_loss_clip": 0.01139482, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.03356993, + "balance_loss_mlp": 1.04956841, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.039950461935961, + "language_loss": 0.74859631, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77050602, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.8984375, + "step": 3713, + "time_per_iteration": 2.4336304664611816 + }, + { + "auxiliary_loss_clip": 0.01138793, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.02543497, + "balance_loss_mlp": 1.048329, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.8221921578975473, + "language_loss": 0.62592143, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64772761, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3714, + "time_per_iteration": 2.6230995655059814 + }, + { + "auxiliary_loss_clip": 0.01139199, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.04734552, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.9329837891440178, + "language_loss": 0.79052407, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81228578, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3715, + "time_per_iteration": 2.510436534881592 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03372216, + "balance_loss_mlp": 1.05021942, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.6633570096565886, + "language_loss": 0.77182817, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79375589, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3716, + "time_per_iteration": 2.4398605823516846 + }, + { + "auxiliary_loss_clip": 0.01141179, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.03046429, + "balance_loss_mlp": 1.04845762, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.028714583879474, + "language_loss": 0.79209757, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81397963, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3717, + "time_per_iteration": 2.456042766571045 + }, + { + "auxiliary_loss_clip": 0.01143546, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.04934192, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.2103373086531115, + "language_loss": 0.68029571, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70214242, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3718, + "time_per_iteration": 2.4818973541259766 + }, + { + "auxiliary_loss_clip": 0.01142458, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.02067208, + "balance_loss_mlp": 1.04784536, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.9912565029374794, + "language_loss": 0.80194163, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8237524, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9453125, + "step": 3719, + "time_per_iteration": 3.985903263092041 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01052597, + "balance_loss_clip": 1.03396416, + "balance_loss_mlp": 1.04785836, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.0930960597239707, + "language_loss": 0.86421579, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88619983, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3720, + "time_per_iteration": 3.914626359939575 + }, + { + "auxiliary_loss_clip": 0.0114136, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.01682639, + "balance_loss_mlp": 1.05105066, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.6398614781610892, + "language_loss": 0.74860299, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77035284, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 3721, + "time_per_iteration": 2.485271453857422 + }, + { + "auxiliary_loss_clip": 0.011451, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.0256865, + "balance_loss_mlp": 1.0494988, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.928465692067959, + "language_loss": 0.78943181, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81131673, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3722, + "time_per_iteration": 2.471928834915161 + }, + { + "auxiliary_loss_clip": 0.01140042, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02551126, + "balance_loss_mlp": 1.05004597, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2482737248582247, + "language_loss": 0.82315016, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84496701, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3723, + "time_per_iteration": 2.4540791511535645 + }, + { + "auxiliary_loss_clip": 0.01144828, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.02729177, + "balance_loss_mlp": 1.05062389, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.154682666342997, + "language_loss": 0.84433442, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86622941, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3724, + "time_per_iteration": 2.526204824447632 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.04889762, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.178002887638817, + "language_loss": 0.79036546, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81216478, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9140625, + "step": 3725, + "time_per_iteration": 2.513136625289917 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02271581, + "balance_loss_mlp": 1.04898071, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.6732241790302085, + "language_loss": 0.77158499, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79337394, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90625, + "step": 3726, + "time_per_iteration": 2.5645246505737305 + }, + { + "auxiliary_loss_clip": 0.01150164, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.02677917, + "balance_loss_mlp": 1.05054045, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.7042555627132296, + "language_loss": 0.72376108, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74571931, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 3727, + "time_per_iteration": 2.4437429904937744 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02140689, + "balance_loss_mlp": 1.04682648, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.2876633759350327, + "language_loss": 0.86584771, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88769633, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3728, + "time_per_iteration": 2.496020793914795 + }, + { + "auxiliary_loss_clip": 0.01143576, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02838981, + "balance_loss_mlp": 1.05045211, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.0817566504616734, + "language_loss": 0.80479026, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82670236, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9296875, + "step": 3729, + "time_per_iteration": 2.4733448028564453 + }, + { + "auxiliary_loss_clip": 0.01136706, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_clip": 1.03019357, + "balance_loss_mlp": 1.04672551, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 2.3054621640206205, + "language_loss": 0.86468041, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88651037, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3730, + "time_per_iteration": 2.5348362922668457 + }, + { + "auxiliary_loss_clip": 0.01136756, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.01917958, + "balance_loss_mlp": 1.04737782, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.75673058423422, + "language_loss": 0.73293322, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75465709, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 3731, + "time_per_iteration": 2.4397478103637695 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.02370882, + "balance_loss_mlp": 1.04893279, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.4044438539905575, + "language_loss": 0.75237334, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77418989, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3732, + "time_per_iteration": 2.476630926132202 + }, + { + "auxiliary_loss_clip": 0.01141784, + "auxiliary_loss_mlp": 0.01058138, + "balance_loss_clip": 1.04106712, + "balance_loss_mlp": 1.0494858, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.8584104659795708, + "language_loss": 0.88037199, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90237123, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3733, + "time_per_iteration": 2.4723222255706787 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.02473271, + "balance_loss_mlp": 1.04564941, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6767676579772364, + "language_loss": 0.84200239, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86380494, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3734, + "time_per_iteration": 2.5214619636535645 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03170574, + "balance_loss_mlp": 1.0513525, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6368426378189131, + "language_loss": 0.76838279, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79030693, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3735, + "time_per_iteration": 2.5025858879089355 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.032215, + "balance_loss_mlp": 1.04791164, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 3.6998773026048046, + "language_loss": 0.84505916, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86688507, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 3736, + "time_per_iteration": 2.581409454345703 + }, + { + "auxiliary_loss_clip": 0.0114079, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.02306545, + "balance_loss_mlp": 1.04848719, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 2.2208030259376192, + "language_loss": 0.86398852, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88579136, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3737, + "time_per_iteration": 2.4498212337493896 + }, + { + "auxiliary_loss_clip": 0.01141365, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.02917397, + "balance_loss_mlp": 1.0476644, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 2.434824168439142, + "language_loss": 0.79145718, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81334245, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3738, + "time_per_iteration": 2.5505504608154297 + }, + { + "auxiliary_loss_clip": 0.01140019, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.0471611, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 2.2711438439691314, + "language_loss": 0.75895345, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78076756, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3739, + "time_per_iteration": 2.458307981491089 + }, + { + "auxiliary_loss_clip": 0.01137257, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.04610491, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.9782758832921432, + "language_loss": 0.74705702, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76885068, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3740, + "time_per_iteration": 2.5424981117248535 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.04691672, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.174963459036685, + "language_loss": 0.76083958, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78261012, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3741, + "time_per_iteration": 2.4539613723754883 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02263021, + "balance_loss_mlp": 1.05022252, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.917686629559915, + "language_loss": 0.87458241, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89636862, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3742, + "time_per_iteration": 2.483146905899048 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.03403831, + "balance_loss_mlp": 1.04824293, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.0726823880461116, + "language_loss": 0.81939828, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84128648, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3743, + "time_per_iteration": 2.4786789417266846 + }, + { + "auxiliary_loss_clip": 0.01140562, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.02241504, + "balance_loss_mlp": 1.04843307, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 3.9980575521347697, + "language_loss": 0.63616955, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65796053, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.921875, + "step": 3744, + "time_per_iteration": 2.4746344089508057 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.02634597, + "balance_loss_mlp": 1.04524422, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 3.3106228370485806, + "language_loss": 0.75711048, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77891332, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3745, + "time_per_iteration": 2.4295878410339355 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02142549, + "balance_loss_mlp": 1.04637384, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.8117958881819525, + "language_loss": 0.80839783, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83013999, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3746, + "time_per_iteration": 2.4423928260803223 + }, + { + "auxiliary_loss_clip": 0.01138701, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.02903056, + "balance_loss_mlp": 1.04503584, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.508960709641407, + "language_loss": 0.86067426, + "learning_rate": 3.613121069229862e-06, + "loss": 0.8825202, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3747, + "time_per_iteration": 2.471223831176758 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.01789808, + "balance_loss_mlp": 1.04515314, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.812236682782158, + "language_loss": 0.76358509, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78529495, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.91796875, + "step": 3748, + "time_per_iteration": 2.525108575820923 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.0274291, + "balance_loss_mlp": 1.04882264, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.7339876982656162, + "language_loss": 0.79497123, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81683606, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3749, + "time_per_iteration": 2.4881162643432617 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.04609084, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6101192523185979, + "language_loss": 0.8009423, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82267606, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8828125, + "step": 3750, + "time_per_iteration": 2.4656643867492676 + }, + { + "auxiliary_loss_clip": 0.01140861, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.02733183, + "balance_loss_mlp": 1.04821157, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.418289881699729, + "language_loss": 0.81336129, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83521116, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3751, + "time_per_iteration": 2.4960029125213623 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.02242589, + "balance_loss_mlp": 1.04915667, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.757449596716865, + "language_loss": 0.83989275, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86169416, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3752, + "time_per_iteration": 2.4668636322021484 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.02375996, + "balance_loss_mlp": 1.04671109, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.7780915453784651, + "language_loss": 0.78616595, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80792689, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.90234375, + "step": 3753, + "time_per_iteration": 2.4305062294006348 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.02113724, + "balance_loss_mlp": 1.04717183, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.990408742554116, + "language_loss": 0.78284466, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80460101, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3754, + "time_per_iteration": 2.584170341491699 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.0243969, + "balance_loss_mlp": 1.04882884, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.915767444367904, + "language_loss": 0.70267534, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72444952, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 3755, + "time_per_iteration": 2.458731174468994 + }, + { + "auxiliary_loss_clip": 0.01145193, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.031744, + "balance_loss_mlp": 1.0502069, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.7446757969812783, + "language_loss": 0.77373838, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79567063, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3756, + "time_per_iteration": 2.5073161125183105 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_clip": 1.02498841, + "balance_loss_mlp": 1.05014277, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.8909279955578986, + "language_loss": 0.82552433, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.847399, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3757, + "time_per_iteration": 2.471353054046631 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.02258492, + "balance_loss_mlp": 1.04810619, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8410990661161322, + "language_loss": 0.73181808, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.7536208, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3758, + "time_per_iteration": 2.5376477241516113 + }, + { + "auxiliary_loss_clip": 0.01144551, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.02985883, + "balance_loss_mlp": 1.04991663, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.0967514749881015, + "language_loss": 0.77208662, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79399836, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3759, + "time_per_iteration": 2.447608709335327 + }, + { + "auxiliary_loss_clip": 0.01141959, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02643597, + "balance_loss_mlp": 1.04806697, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.9036057015372598, + "language_loss": 0.78638428, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80824387, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3760, + "time_per_iteration": 4.231990098953247 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01007925, + "balance_loss_clip": 1.00607765, + "balance_loss_mlp": 1.02028942, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9344871733021222, + "language_loss": 0.60090166, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62152445, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.33984375, + "step": 3761, + "time_per_iteration": 4.482504367828369 + }, + { + "auxiliary_loss_clip": 0.0114253, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.02678633, + "balance_loss_mlp": 1.0478611, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.386395888426225, + "language_loss": 0.77400732, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79587454, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3762, + "time_per_iteration": 2.5162198543548584 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02395821, + "balance_loss_mlp": 1.05073345, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.10132066013886, + "language_loss": 0.78800118, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.80984461, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3763, + "time_per_iteration": 2.4578778743743896 + }, + { + "auxiliary_loss_clip": 0.01145794, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.03583384, + "balance_loss_mlp": 1.05000031, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.8659674868358982, + "language_loss": 0.91363662, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93563628, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.95703125, + "step": 3764, + "time_per_iteration": 2.536231517791748 + }, + { + "auxiliary_loss_clip": 0.01138186, + "auxiliary_loss_mlp": 0.01054666, + "balance_loss_clip": 1.03740454, + "balance_loss_mlp": 1.04773271, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.6188972360392109, + "language_loss": 0.75211406, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77404261, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 3765, + "time_per_iteration": 2.516646146774292 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.02868426, + "balance_loss_mlp": 1.04855943, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.9315012383394614, + "language_loss": 0.89618981, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91804343, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3766, + "time_per_iteration": 2.4829306602478027 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.02568591, + "balance_loss_mlp": 1.04891181, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6662033714223943, + "language_loss": 0.74710411, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76891464, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 3767, + "time_per_iteration": 2.4989218711853027 + }, + { + "auxiliary_loss_clip": 0.011397, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.02664912, + "balance_loss_mlp": 1.04619229, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4804117361030718, + "language_loss": 0.7156831, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73752159, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3768, + "time_per_iteration": 2.5078160762786865 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.05247319, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.80046116612075, + "language_loss": 0.78268003, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80466181, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3769, + "time_per_iteration": 2.5122978687286377 + }, + { + "auxiliary_loss_clip": 0.01142038, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.02465522, + "balance_loss_mlp": 1.0467639, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.7393050758681738, + "language_loss": 0.68427956, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70612001, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3770, + "time_per_iteration": 2.557098150253296 + }, + { + "auxiliary_loss_clip": 0.01141766, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02537811, + "balance_loss_mlp": 1.04682195, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6251414008252867, + "language_loss": 0.80370939, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82554382, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3771, + "time_per_iteration": 2.5156240463256836 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.0290848, + "balance_loss_mlp": 1.04606724, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.567346312954514, + "language_loss": 0.78844583, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81025243, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 3772, + "time_per_iteration": 2.539632558822632 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01005348, + "balance_loss_clip": 1.00351191, + "balance_loss_mlp": 1.02012253, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6518085485856671, + "language_loss": 0.54334348, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56392735, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.33007812, + "step": 3773, + "time_per_iteration": 3.1463003158569336 + }, + { + "auxiliary_loss_clip": 0.01136639, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02222633, + "balance_loss_mlp": 1.04712117, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.9230264173849037, + "language_loss": 0.70101082, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72276813, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3774, + "time_per_iteration": 2.5099127292633057 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.02232277, + "balance_loss_mlp": 1.04480648, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.4369678263863057, + "language_loss": 0.74585366, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76758826, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 3775, + "time_per_iteration": 2.4441745281219482 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.02311933, + "balance_loss_mlp": 1.04534245, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.175545430509675, + "language_loss": 0.8256253, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8473829, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3776, + "time_per_iteration": 2.4418301582336426 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.02125907, + "balance_loss_mlp": 1.04619908, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.75835757539417, + "language_loss": 0.83031607, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85209382, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3777, + "time_per_iteration": 2.5585062503814697 + }, + { + "auxiliary_loss_clip": 0.01137385, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.02232909, + "balance_loss_mlp": 1.04596353, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.6678368583827288, + "language_loss": 0.72658038, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74834561, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3778, + "time_per_iteration": 2.5019333362579346 + }, + { + "auxiliary_loss_clip": 0.0113896, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.02386749, + "balance_loss_mlp": 1.04576886, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.229609453971581, + "language_loss": 0.6414392, + "learning_rate": 3.605722410602591e-06, + "loss": 0.663234, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3779, + "time_per_iteration": 2.5082859992980957 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.02794909, + "balance_loss_mlp": 1.04837573, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.9715072832436495, + "language_loss": 0.70546824, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72728658, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3780, + "time_per_iteration": 2.4703643321990967 + }, + { + "auxiliary_loss_clip": 0.01140054, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_clip": 1.02689338, + "balance_loss_mlp": 1.0489254, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 2.5454366084291133, + "language_loss": 0.89717996, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91902977, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 3781, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.0113992, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.02436364, + "balance_loss_mlp": 1.04648304, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.4601522898780805, + "language_loss": 0.7434786, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76529634, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3782, + "time_per_iteration": 2.4665582180023193 + }, + { + "auxiliary_loss_clip": 0.01136804, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02587914, + "balance_loss_mlp": 1.04467201, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.6148985015615094, + "language_loss": 0.82393098, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84571576, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3783, + "time_per_iteration": 2.4820034503936768 + }, + { + "auxiliary_loss_clip": 0.01137013, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.02310586, + "balance_loss_mlp": 1.04418266, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 2.4165791890347714, + "language_loss": 0.75874048, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78051311, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3784, + "time_per_iteration": 2.5087246894836426 + }, + { + "auxiliary_loss_clip": 0.01134704, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02303135, + "balance_loss_mlp": 1.04345798, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6490497895559066, + "language_loss": 0.70716858, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72891551, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3785, + "time_per_iteration": 2.4733574390411377 + }, + { + "auxiliary_loss_clip": 0.01051525, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00252521, + "balance_loss_mlp": 1.01740241, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8187947911361427, + "language_loss": 0.61915314, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63971269, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.34179688, + "step": 3786, + "time_per_iteration": 3.0474631786346436 + }, + { + "auxiliary_loss_clip": 0.01143523, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02488649, + "balance_loss_mlp": 1.04777002, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.6740153696427247, + "language_loss": 0.86285794, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88471758, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3787, + "time_per_iteration": 2.4331281185150146 + }, + { + "auxiliary_loss_clip": 0.01137002, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04612255, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.2844293081892826, + "language_loss": 0.72555876, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74733031, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 3788, + "time_per_iteration": 2.5378167629241943 + }, + { + "auxiliary_loss_clip": 0.01136486, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.01929688, + "balance_loss_mlp": 1.04552293, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.4737623033533587, + "language_loss": 0.67524469, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69697154, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3789, + "time_per_iteration": 2.412086248397827 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.02416384, + "balance_loss_mlp": 1.04507327, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.1501364843402335, + "language_loss": 0.76075745, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78253406, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 3790, + "time_per_iteration": 2.503600835800171 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02174211, + "balance_loss_mlp": 1.04253387, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.0794940610838397, + "language_loss": 0.90613973, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92787266, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3791, + "time_per_iteration": 2.4503557682037354 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02000308, + "balance_loss_mlp": 1.04407096, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.8390004860332834, + "language_loss": 0.82869208, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85044241, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3792, + "time_per_iteration": 2.5451550483703613 + }, + { + "auxiliary_loss_clip": 0.01045824, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.01015747, + "balance_loss_mlp": 1.01168287, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1436128607221614, + "language_loss": 0.65615487, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67673355, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.34179688, + "step": 3793, + "time_per_iteration": 2.7929015159606934 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01050177, + "balance_loss_clip": 1.03241456, + "balance_loss_mlp": 1.04557967, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.282271850248546, + "language_loss": 0.77100229, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79292452, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 3794, + "time_per_iteration": 2.4882023334503174 + }, + { + "auxiliary_loss_clip": 0.01139112, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02146518, + "balance_loss_mlp": 1.04517698, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.1931228295055716, + "language_loss": 0.80724937, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82902336, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3795, + "time_per_iteration": 2.475311279296875 + }, + { + "auxiliary_loss_clip": 0.0113575, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02747297, + "balance_loss_mlp": 1.04336488, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.8416311408581074, + "language_loss": 0.77002209, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79182816, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3796, + "time_per_iteration": 2.4734761714935303 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.02236056, + "balance_loss_mlp": 1.04312813, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.506500245398156, + "language_loss": 0.9594354, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98118514, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3797, + "time_per_iteration": 2.4146203994750977 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.04537892, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6428427275001165, + "language_loss": 0.81446218, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83624852, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3798, + "time_per_iteration": 2.490849733352661 + }, + { + "auxiliary_loss_clip": 0.01137089, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.02080309, + "balance_loss_mlp": 1.04262519, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.3515161945239833, + "language_loss": 0.78744864, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.80920684, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3799, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.0332408, + "balance_loss_mlp": 1.04381084, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.655995083326211, + "language_loss": 0.75234401, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77421868, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3800, + "time_per_iteration": 2.510788917541504 + }, + { + "auxiliary_loss_clip": 0.01137174, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.04583156, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.661997570582357, + "language_loss": 0.63433349, + "learning_rate": 3.600599647297484e-06, + "loss": 0.6560958, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3801, + "time_per_iteration": 2.503643035888672 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.02027762, + "balance_loss_mlp": 1.04721296, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7846583359688928, + "language_loss": 0.81602335, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83774745, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3802, + "time_per_iteration": 4.002788782119751 + }, + { + "auxiliary_loss_clip": 0.01138233, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04454207, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7683413549342115, + "language_loss": 0.78830242, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81015933, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3803, + "time_per_iteration": 3.9494168758392334 + }, + { + "auxiliary_loss_clip": 0.01135958, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.04115725, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.6939241338011581, + "language_loss": 0.85561395, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87740004, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3804, + "time_per_iteration": 2.4504544734954834 + }, + { + "auxiliary_loss_clip": 0.01139159, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.02744436, + "balance_loss_mlp": 1.04339862, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.1651494765134736, + "language_loss": 0.76485813, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3805, + "time_per_iteration": 2.4578893184661865 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.02560234, + "balance_loss_mlp": 1.04387915, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.4014048134005628, + "language_loss": 0.79309744, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81492996, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3806, + "time_per_iteration": 2.415726900100708 + }, + { + "auxiliary_loss_clip": 0.01139425, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.02717948, + "balance_loss_mlp": 1.04547703, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.230394288716221, + "language_loss": 0.69194484, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71377647, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3807, + "time_per_iteration": 2.6051764488220215 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.03484392, + "balance_loss_mlp": 1.04811931, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.5207266425605668, + "language_loss": 0.65717816, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67915517, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3808, + "time_per_iteration": 2.463885545730591 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.02564931, + "balance_loss_mlp": 1.04470515, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8002654314964242, + "language_loss": 0.74498177, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76677001, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3809, + "time_per_iteration": 2.4587652683258057 + }, + { + "auxiliary_loss_clip": 0.01138179, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.02695227, + "balance_loss_mlp": 1.04707646, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.6413135962032894, + "language_loss": 0.81699908, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83881009, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3810, + "time_per_iteration": 2.454545736312866 + }, + { + "auxiliary_loss_clip": 0.01135521, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.02448893, + "balance_loss_mlp": 1.04428005, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.1876822434942245, + "language_loss": 0.78671384, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8084712, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9140625, + "step": 3811, + "time_per_iteration": 2.4564197063446045 + }, + { + "auxiliary_loss_clip": 0.01135961, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.0296042, + "balance_loss_mlp": 1.04317403, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.8120535445273127, + "language_loss": 0.82811391, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84994221, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3812, + "time_per_iteration": 2.4357566833496094 + }, + { + "auxiliary_loss_clip": 0.01144518, + "auxiliary_loss_mlp": 0.01051465, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04750013, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.041111828111396, + "language_loss": 0.82337058, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84533036, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.96875, + "step": 3813, + "time_per_iteration": 2.4521987438201904 + }, + { + "auxiliary_loss_clip": 0.01139715, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.04595184, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 3.1740680187078896, + "language_loss": 0.69927102, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72113466, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3814, + "time_per_iteration": 2.5528602600097656 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.02773738, + "balance_loss_mlp": 1.04310775, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.479981906508555, + "language_loss": 0.67106915, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69285899, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3815, + "time_per_iteration": 2.4768760204315186 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01052238, + "balance_loss_clip": 1.03593004, + "balance_loss_mlp": 1.04644942, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8467960453518941, + "language_loss": 0.83103681, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85295475, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3816, + "time_per_iteration": 2.507967710494995 + }, + { + "auxiliary_loss_clip": 0.0113842, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.04643357, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.1171855882825636, + "language_loss": 0.86756372, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8893379, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3817, + "time_per_iteration": 2.4445815086364746 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03278852, + "balance_loss_mlp": 1.04829407, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.403232678237585, + "language_loss": 0.75039381, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77230936, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3818, + "time_per_iteration": 2.508527994155884 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.02426732, + "balance_loss_mlp": 1.04769611, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6537639427714739, + "language_loss": 0.74597251, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76779795, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3819, + "time_per_iteration": 2.5009493827819824 + }, + { + "auxiliary_loss_clip": 0.01138376, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 1.04632187, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.815385500594849, + "language_loss": 0.80775046, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.8295821, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3820, + "time_per_iteration": 2.5374531745910645 + }, + { + "auxiliary_loss_clip": 0.01142613, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.04725921, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.0886359367899763, + "language_loss": 0.69226766, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71411359, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3821, + "time_per_iteration": 2.4539082050323486 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.01912975, + "balance_loss_mlp": 1.0466336, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.623620301878745, + "language_loss": 0.82655883, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.84831905, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3822, + "time_per_iteration": 2.5025360584259033 + }, + { + "auxiliary_loss_clip": 0.01137437, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.02264285, + "balance_loss_mlp": 1.04520607, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.581563173789708, + "language_loss": 0.66093826, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68272227, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.921875, + "step": 3823, + "time_per_iteration": 2.500643253326416 + }, + { + "auxiliary_loss_clip": 0.0104753, + "auxiliary_loss_mlp": 0.01001124, + "balance_loss_clip": 0.99913329, + "balance_loss_mlp": 1.01448655, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8191682875264555, + "language_loss": 0.56770015, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58818674, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33203125, + "step": 3824, + "time_per_iteration": 3.1365485191345215 + }, + { + "auxiliary_loss_clip": 0.01135805, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04575276, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.487273324074565, + "language_loss": 0.72840559, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75015926, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3825, + "time_per_iteration": 2.444730758666992 + }, + { + "auxiliary_loss_clip": 0.01143286, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.02810407, + "balance_loss_mlp": 1.04978526, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8892090994393747, + "language_loss": 0.87760615, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89949936, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3826, + "time_per_iteration": 2.492682456970215 + }, + { + "auxiliary_loss_clip": 0.01142911, + "auxiliary_loss_mlp": 0.01043844, + "balance_loss_clip": 1.0258677, + "balance_loss_mlp": 1.04683542, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.6663888482282623, + "language_loss": 0.81568289, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8375504, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3827, + "time_per_iteration": 2.488593578338623 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.02750623, + "balance_loss_mlp": 1.04553437, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.8456206141648608, + "language_loss": 0.86791205, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88970977, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3828, + "time_per_iteration": 2.4386606216430664 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01049169, + "balance_loss_clip": 1.03147864, + "balance_loss_mlp": 1.04512644, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.106420485404446, + "language_loss": 0.70638877, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72826439, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.93359375, + "step": 3829, + "time_per_iteration": 2.475399971008301 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02241933, + "balance_loss_mlp": 1.05011487, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5719627508253273, + "language_loss": 0.84045994, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86223942, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3830, + "time_per_iteration": 2.4943718910217285 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.03172636, + "balance_loss_mlp": 1.04637957, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.733206127117623, + "language_loss": 0.66863495, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69051576, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3831, + "time_per_iteration": 2.6513662338256836 + }, + { + "auxiliary_loss_clip": 0.01141151, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.04735637, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.238850649877041, + "language_loss": 0.75253022, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77436894, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3832, + "time_per_iteration": 2.4889180660247803 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.02515745, + "balance_loss_mlp": 1.04709673, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.8583815246829203, + "language_loss": 0.87474239, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89656878, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.92578125, + "step": 3833, + "time_per_iteration": 2.46744966506958 + }, + { + "auxiliary_loss_clip": 0.01140821, + "auxiliary_loss_mlp": 0.01047215, + "balance_loss_clip": 1.02950096, + "balance_loss_mlp": 1.0478369, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 3.2120713643012206, + "language_loss": 0.74875945, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77063978, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3834, + "time_per_iteration": 2.485504627227783 + }, + { + "auxiliary_loss_clip": 0.0113943, + "auxiliary_loss_mlp": 0.01051682, + "balance_loss_clip": 1.03408706, + "balance_loss_mlp": 1.0484879, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.820281268490984, + "language_loss": 0.85338157, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87529278, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3835, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.01146651, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_clip": 1.03142083, + "balance_loss_mlp": 1.04814029, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.799799470431086, + "language_loss": 0.81974924, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84170854, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.984375, + "step": 3836, + "time_per_iteration": 2.464657783508301 + }, + { + "auxiliary_loss_clip": 0.0114557, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.02726793, + "balance_loss_mlp": 1.05202293, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.7793450137018207, + "language_loss": 0.79603267, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81792545, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3837, + "time_per_iteration": 2.4715559482574463 + }, + { + "auxiliary_loss_clip": 0.01055276, + "auxiliary_loss_mlp": 0.01017826, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.02046371, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9409846751082755, + "language_loss": 0.65487945, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67561042, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.34765625, + "step": 3838, + "time_per_iteration": 2.9852375984191895 + }, + { + "auxiliary_loss_clip": 0.01139351, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.03131008, + "balance_loss_mlp": 1.04721856, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.6310373190732648, + "language_loss": 0.7527796, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77465028, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3839, + "time_per_iteration": 2.4290778636932373 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.02486694, + "balance_loss_mlp": 1.0510757, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 4.016837458595543, + "language_loss": 0.68691337, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70878816, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3840, + "time_per_iteration": 2.456422805786133 + }, + { + "auxiliary_loss_clip": 0.01143425, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.04936612, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.1574295618121426, + "language_loss": 0.79412574, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81595719, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9375, + "step": 3841, + "time_per_iteration": 2.4762818813323975 + }, + { + "auxiliary_loss_clip": 0.01141641, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02557695, + "balance_loss_mlp": 1.04872346, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 5.070488540070664, + "language_loss": 0.83171731, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85354722, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3842, + "time_per_iteration": 2.4908032417297363 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.0255568, + "balance_loss_mlp": 1.04567111, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.6842769818445011, + "language_loss": 0.66523731, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68706262, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3843, + "time_per_iteration": 2.6503937244415283 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.02642977, + "balance_loss_mlp": 1.04793119, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.8910129932977493, + "language_loss": 0.77445257, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79627681, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 3844, + "time_per_iteration": 5.4645676612854 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.04744804, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6615026518232119, + "language_loss": 0.77974623, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80158317, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3845, + "time_per_iteration": 2.467289686203003 + }, + { + "auxiliary_loss_clip": 0.01133475, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.01687717, + "balance_loss_mlp": 1.04577661, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.1438137502119425, + "language_loss": 0.76064527, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78230006, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 3846, + "time_per_iteration": 2.4985382556915283 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.02638626, + "balance_loss_mlp": 1.04725194, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.4609763976845556, + "language_loss": 0.69493651, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71677887, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3847, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.01048129, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.00074661, + "balance_loss_mlp": 1.01598144, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7927409416341922, + "language_loss": 0.61051595, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63102281, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3203125, + "step": 3848, + "time_per_iteration": 2.981518030166626 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.04593349, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.568019101440284, + "language_loss": 0.7746805, + "learning_rate": 3.589320871234923e-06, + "loss": 0.79651785, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3849, + "time_per_iteration": 2.450693130493164 + }, + { + "auxiliary_loss_clip": 0.01139635, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04533124, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.9223002445017061, + "language_loss": 0.71673942, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73856068, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3850, + "time_per_iteration": 2.589395761489868 + }, + { + "auxiliary_loss_clip": 0.01137166, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.01870215, + "balance_loss_mlp": 1.04362154, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 3.8422038584857665, + "language_loss": 0.75846308, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78018856, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3851, + "time_per_iteration": 2.495729446411133 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.0234046, + "balance_loss_mlp": 1.04747272, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.914141324585442, + "language_loss": 0.69797802, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71977121, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3852, + "time_per_iteration": 2.478408098220825 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.0206399, + "balance_loss_mlp": 1.04643583, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.1861380100726144, + "language_loss": 0.67030561, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69208378, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94140625, + "step": 3853, + "time_per_iteration": 2.4445838928222656 + }, + { + "auxiliary_loss_clip": 0.01141194, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.01815248, + "balance_loss_mlp": 1.04680121, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.6671703506367506, + "language_loss": 0.79851103, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82027847, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3854, + "time_per_iteration": 2.5455782413482666 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04726899, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 3.8560715318244556, + "language_loss": 0.64987147, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67176461, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 3855, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01140829, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.02753139, + "balance_loss_mlp": 1.04570055, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.1096123404526623, + "language_loss": 0.70711654, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.72896564, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3856, + "time_per_iteration": 2.5024092197418213 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02062488, + "balance_loss_mlp": 1.0464257, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 6.089384897844753, + "language_loss": 0.76997125, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79170084, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.91015625, + "step": 3857, + "time_per_iteration": 2.5962576866149902 + }, + { + "auxiliary_loss_clip": 0.01143962, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.03018308, + "balance_loss_mlp": 1.0477798, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 3.478057752262005, + "language_loss": 0.91006696, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93199098, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.96484375, + "step": 3858, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01136894, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.04679012, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 2.1437168922033747, + "language_loss": 0.75995493, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78175128, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 3859, + "time_per_iteration": 2.485426187515259 + }, + { + "auxiliary_loss_clip": 0.01136619, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.01962614, + "balance_loss_mlp": 1.04423487, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.9055462071213993, + "language_loss": 0.84061682, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86234951, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3860, + "time_per_iteration": 2.4607324600219727 + }, + { + "auxiliary_loss_clip": 0.01137991, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.02317619, + "balance_loss_mlp": 1.04656291, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.1337823805291047, + "language_loss": 0.82972974, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85151279, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3861, + "time_per_iteration": 2.451805591583252 + }, + { + "auxiliary_loss_clip": 0.01142125, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.02702415, + "balance_loss_mlp": 1.04800034, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.8456518711772996, + "language_loss": 0.85918242, + "learning_rate": 3.586242265438576e-06, + "loss": 0.8810426, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3862, + "time_per_iteration": 2.4582395553588867 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.0468179, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.3833481647146872, + "language_loss": 0.7492758, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7710824, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8984375, + "step": 3863, + "time_per_iteration": 2.496985912322998 + }, + { + "auxiliary_loss_clip": 0.01139904, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.02723408, + "balance_loss_mlp": 1.05037498, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 2.003739732436234, + "language_loss": 0.74640852, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76823521, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 3864, + "time_per_iteration": 2.440204381942749 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.01952517, + "balance_loss_mlp": 1.0468204, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 3.940820538439298, + "language_loss": 0.70690906, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72865754, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3865, + "time_per_iteration": 2.598194122314453 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01049623, + "balance_loss_clip": 1.03091884, + "balance_loss_mlp": 1.04987264, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.9658537667403149, + "language_loss": 0.94853866, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97052193, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3866, + "time_per_iteration": 2.496276617050171 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.04817796, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.6667540210019123, + "language_loss": 0.72528732, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.74707949, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 3867, + "time_per_iteration": 2.4933414459228516 + }, + { + "auxiliary_loss_clip": 0.01140693, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.02625418, + "balance_loss_mlp": 1.04734945, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8421111702540602, + "language_loss": 0.82411921, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84596509, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.93359375, + "step": 3868, + "time_per_iteration": 2.4994540214538574 + }, + { + "auxiliary_loss_clip": 0.01136829, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.02433491, + "balance_loss_mlp": 1.04700828, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.815886356300666, + "language_loss": 0.73335075, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75512362, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3869, + "time_per_iteration": 2.4486095905303955 + }, + { + "auxiliary_loss_clip": 0.01139645, + "auxiliary_loss_mlp": 0.01047185, + "balance_loss_clip": 1.03108525, + "balance_loss_mlp": 1.04929376, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.6948965109205438, + "language_loss": 0.79564929, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81751764, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3870, + "time_per_iteration": 2.506114959716797 + }, + { + "auxiliary_loss_clip": 0.01143066, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.02574801, + "balance_loss_mlp": 1.04845953, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 3.2368167151878797, + "language_loss": 0.70599115, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72785389, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3871, + "time_per_iteration": 2.455266237258911 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.01055983, + "balance_loss_clip": 1.03674293, + "balance_loss_mlp": 1.05011845, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.2694181422477313, + "language_loss": 0.69087327, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71289003, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.95703125, + "step": 3872, + "time_per_iteration": 2.482089042663574 + }, + { + "auxiliary_loss_clip": 0.01147162, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.0271188, + "balance_loss_mlp": 1.04984593, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.4965805681858408, + "language_loss": 0.78046703, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80239034, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 3873, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.01009657, + "balance_loss_clip": 1.00777328, + "balance_loss_mlp": 1.02347898, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.841863213022928, + "language_loss": 0.60519493, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6258297, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3046875, + "step": 3874, + "time_per_iteration": 2.955524444580078 + }, + { + "auxiliary_loss_clip": 0.01142096, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.02695727, + "balance_loss_mlp": 1.04998708, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.0817330720741287, + "language_loss": 0.8082279, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83009154, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3875, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01141065, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.04931068, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.6586054731564495, + "language_loss": 0.60997009, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63178027, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.91796875, + "step": 3876, + "time_per_iteration": 2.5234174728393555 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.0104992, + "balance_loss_clip": 1.0319072, + "balance_loss_mlp": 1.05151403, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.9912662806979935, + "language_loss": 0.70357525, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72551912, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3877, + "time_per_iteration": 2.5117876529693604 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_clip": 1.02984059, + "balance_loss_mlp": 1.04846656, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.20617127152986, + "language_loss": 0.81169856, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83360565, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3878, + "time_per_iteration": 2.418745517730713 + }, + { + "auxiliary_loss_clip": 0.01145943, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.04905999, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 2.449565501872003, + "language_loss": 0.74765849, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76959032, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3879, + "time_per_iteration": 2.627453088760376 + }, + { + "auxiliary_loss_clip": 0.0113984, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02597582, + "balance_loss_mlp": 1.04611635, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.3281305870509685, + "language_loss": 0.89896512, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92079782, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3880, + "time_per_iteration": 2.529181957244873 + }, + { + "auxiliary_loss_clip": 0.01144521, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05019975, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.7300006336865508, + "language_loss": 0.72026277, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74217331, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3881, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.01138796, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02798867, + "balance_loss_mlp": 1.04610527, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5765664683306326, + "language_loss": 0.67988127, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70171714, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3882, + "time_per_iteration": 2.5134541988372803 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04020119, + "balance_loss_mlp": 1.0481658, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 3.2831975264627116, + "language_loss": 0.76596051, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78797704, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3883, + "time_per_iteration": 2.5556836128234863 + }, + { + "auxiliary_loss_clip": 0.01046918, + "auxiliary_loss_mlp": 0.01002528, + "balance_loss_clip": 1.00059688, + "balance_loss_mlp": 1.01619315, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7953130928556094, + "language_loss": 0.59102494, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6115194, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3884, + "time_per_iteration": 3.210090398788452 + }, + { + "auxiliary_loss_clip": 0.01139917, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.0196687, + "balance_loss_mlp": 1.04723644, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 3.4795297654408617, + "language_loss": 0.80128157, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82303953, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3885, + "time_per_iteration": 4.129857301712036 + }, + { + "auxiliary_loss_clip": 0.01139579, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04763317, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.392049069504846, + "language_loss": 0.88482237, + "learning_rate": 3.580531993380261e-06, + "loss": 0.9066177, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3886, + "time_per_iteration": 4.002579689025879 + }, + { + "auxiliary_loss_clip": 0.01143892, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02452922, + "balance_loss_mlp": 1.04953825, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 2.2740188667520815, + "language_loss": 0.73199034, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75384426, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3887, + "time_per_iteration": 2.5730721950531006 + }, + { + "auxiliary_loss_clip": 0.0114256, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02709508, + "balance_loss_mlp": 1.04827881, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8689872769958875, + "language_loss": 0.84098816, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86285174, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.94140625, + "step": 3888, + "time_per_iteration": 2.526090145111084 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01051088, + "balance_loss_clip": 1.03400528, + "balance_loss_mlp": 1.04775357, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 5.34722340994348, + "language_loss": 0.87174153, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89365447, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3889, + "time_per_iteration": 2.465535879135132 + }, + { + "auxiliary_loss_clip": 0.01143335, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.02659607, + "balance_loss_mlp": 1.04914057, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 4.26980733686294, + "language_loss": 0.7660414, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78790414, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.94140625, + "step": 3890, + "time_per_iteration": 2.4164645671844482 + }, + { + "auxiliary_loss_clip": 0.01144006, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.02940536, + "balance_loss_mlp": 1.05018783, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 3.12388753004446, + "language_loss": 0.73396742, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75587177, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3891, + "time_per_iteration": 2.692251443862915 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04672241, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.6638493558493535, + "language_loss": 0.82791233, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84968084, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8984375, + "step": 3892, + "time_per_iteration": 2.4657654762268066 + }, + { + "auxiliary_loss_clip": 0.01143467, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_clip": 1.02550626, + "balance_loss_mlp": 1.04892194, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 2.124834647136637, + "language_loss": 0.64928782, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67114866, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3893, + "time_per_iteration": 2.6640076637268066 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02056575, + "balance_loss_mlp": 1.04930127, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.3013698222001753, + "language_loss": 0.79011095, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81188488, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 3894, + "time_per_iteration": 2.4596238136291504 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02772284, + "balance_loss_mlp": 1.0473485, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4729608662155413, + "language_loss": 0.81608742, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83793032, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3895, + "time_per_iteration": 2.5229499340057373 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.03210139, + "balance_loss_mlp": 1.04895353, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.370345363223057, + "language_loss": 0.79861861, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82052004, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3896, + "time_per_iteration": 2.4219553470611572 + }, + { + "auxiliary_loss_clip": 0.01142956, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02507555, + "balance_loss_mlp": 1.04863656, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6083647422684384, + "language_loss": 0.83279634, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85465348, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3897, + "time_per_iteration": 2.497347593307495 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.03093636, + "balance_loss_mlp": 1.04880857, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 2.0551194275294784, + "language_loss": 0.79281437, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8147409, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3898, + "time_per_iteration": 2.4275295734405518 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.0316844, + "balance_loss_mlp": 1.05034626, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 3.329769754331659, + "language_loss": 0.73955798, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76142585, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 3899, + "time_per_iteration": 2.5017077922821045 + }, + { + "auxiliary_loss_clip": 0.01141437, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.03412604, + "balance_loss_mlp": 1.04896975, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.8374782290855665, + "language_loss": 0.75695914, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77888358, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3900, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.03224266, + "balance_loss_mlp": 1.04685295, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.9641187800197561, + "language_loss": 0.66949147, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69135845, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3901, + "time_per_iteration": 2.5052907466888428 + }, + { + "auxiliary_loss_clip": 0.01050259, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01646185, + "balance_loss_mlp": 1.01950026, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7670843237762338, + "language_loss": 0.58209252, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6027782, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.30859375, + "step": 3902, + "time_per_iteration": 3.0522701740264893 + }, + { + "auxiliary_loss_clip": 0.01140756, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.02684176, + "balance_loss_mlp": 1.04932666, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.9913375770157136, + "language_loss": 0.80411339, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82596278, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 3903, + "time_per_iteration": 2.515796184539795 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.02274299, + "balance_loss_mlp": 1.04670942, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 3.712536549247666, + "language_loss": 0.82183945, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84362817, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3904, + "time_per_iteration": 2.48119854927063 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.0104346, + "balance_loss_clip": 1.02642536, + "balance_loss_mlp": 1.05013537, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9990680719867946, + "language_loss": 0.7137326, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7355758, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3905, + "time_per_iteration": 2.494558811187744 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.04811251, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.865256832649412, + "language_loss": 0.70834756, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73007655, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3906, + "time_per_iteration": 2.5057764053344727 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.04728532, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.129912307166789, + "language_loss": 0.73542202, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75724012, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3907, + "time_per_iteration": 2.5734074115753174 + }, + { + "auxiliary_loss_clip": 0.01141507, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.03004074, + "balance_loss_mlp": 1.04927719, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.7646530569469054, + "language_loss": 0.72807813, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74996883, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3908, + "time_per_iteration": 2.438422441482544 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01002133, + "balance_loss_clip": 1.00030959, + "balance_loss_mlp": 1.01835775, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0194055540826834, + "language_loss": 0.73271406, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75322783, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.30859375, + "step": 3909, + "time_per_iteration": 2.8451788425445557 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02607155, + "balance_loss_mlp": 1.04842734, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5487453833335116, + "language_loss": 0.87906706, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9008913, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3910, + "time_per_iteration": 2.4648385047912598 + }, + { + "auxiliary_loss_clip": 0.01141916, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02370465, + "balance_loss_mlp": 1.04950166, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.1910966534760297, + "language_loss": 0.75809109, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.7799111, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3911, + "time_per_iteration": 2.4715898036956787 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02729297, + "balance_loss_mlp": 1.04807627, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.9083148186883727, + "language_loss": 0.81775904, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83955097, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 3912, + "time_per_iteration": 2.4627628326416016 + }, + { + "auxiliary_loss_clip": 0.01139949, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04939759, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.7554989092460516, + "language_loss": 0.71664345, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73854995, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 3913, + "time_per_iteration": 2.5080020427703857 + }, + { + "auxiliary_loss_clip": 0.01147528, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.02691996, + "balance_loss_mlp": 1.05220175, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7040704955860875, + "language_loss": 0.75903499, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78096056, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3914, + "time_per_iteration": 2.487429618835449 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02307451, + "balance_loss_mlp": 1.05093837, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.554647654086476, + "language_loss": 0.89353001, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9153496, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 3915, + "time_per_iteration": 2.500753402709961 + }, + { + "auxiliary_loss_clip": 0.01044736, + "auxiliary_loss_mlp": 0.01003661, + "balance_loss_clip": 1.00158656, + "balance_loss_mlp": 1.0141747, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8049654288159457, + "language_loss": 0.5935356, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61401957, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.3046875, + "step": 3916, + "time_per_iteration": 2.9926259517669678 + }, + { + "auxiliary_loss_clip": 0.01042644, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00075865, + "balance_loss_mlp": 1.01226258, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7742950949727582, + "language_loss": 0.49486533, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51532036, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.3046875, + "step": 3917, + "time_per_iteration": 3.085294723510742 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.03615093, + "balance_loss_mlp": 1.04923129, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.000752484300541, + "language_loss": 0.76012552, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78207517, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 3918, + "time_per_iteration": 2.4883201122283936 + }, + { + "auxiliary_loss_clip": 0.01145359, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.02805305, + "balance_loss_mlp": 1.04997587, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.209135495431813, + "language_loss": 0.68728662, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.709185, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 3919, + "time_per_iteration": 2.4489476680755615 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.02216101, + "balance_loss_mlp": 1.04864836, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.8210843900818243, + "language_loss": 0.70324695, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72501087, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 3920, + "time_per_iteration": 2.6011908054351807 + }, + { + "auxiliary_loss_clip": 0.01141332, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.03128195, + "balance_loss_mlp": 1.05122209, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6333300745229378, + "language_loss": 0.77596343, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79784632, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3921, + "time_per_iteration": 2.498924732208252 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.02615058, + "balance_loss_mlp": 1.05108023, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.5438781918161375, + "language_loss": 0.7561245, + "learning_rate": 3.571901895946612e-06, + "loss": 0.7779727, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3922, + "time_per_iteration": 2.467103958129883 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02583599, + "balance_loss_mlp": 1.0489881, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.3317912313524625, + "language_loss": 0.80016744, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82196772, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3923, + "time_per_iteration": 2.5075273513793945 + }, + { + "auxiliary_loss_clip": 0.01138213, + "auxiliary_loss_mlp": 0.01046068, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 1.04845715, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.9692150152538963, + "language_loss": 0.74753797, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76938081, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3924, + "time_per_iteration": 2.442448377609253 + }, + { + "auxiliary_loss_clip": 0.01141205, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.03013766, + "balance_loss_mlp": 1.04995513, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 2.1681544357284093, + "language_loss": 0.82770467, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84957814, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3925, + "time_per_iteration": 2.44718337059021 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01047882, + "balance_loss_clip": 1.03100252, + "balance_loss_mlp": 1.04645014, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.8844556004317345, + "language_loss": 0.59408414, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61594486, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91796875, + "step": 3926, + "time_per_iteration": 2.4840757846832275 + }, + { + "auxiliary_loss_clip": 0.01135063, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.02977526, + "balance_loss_mlp": 1.04721665, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.967091588265342, + "language_loss": 0.71317631, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73498082, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 3927, + "time_per_iteration": 4.117234945297241 + }, + { + "auxiliary_loss_clip": 0.01137568, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_clip": 1.0295651, + "balance_loss_mlp": 1.04787612, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8263460078369782, + "language_loss": 0.75102496, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77284467, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8984375, + "step": 3928, + "time_per_iteration": 3.9637200832366943 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01048389, + "balance_loss_clip": 1.03086567, + "balance_loss_mlp": 1.04693556, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.885999758146942, + "language_loss": 0.81520462, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83706343, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3929, + "time_per_iteration": 2.499310255050659 + }, + { + "auxiliary_loss_clip": 0.01146116, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.02998328, + "balance_loss_mlp": 1.04974854, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 4.669381706210694, + "language_loss": 0.7194528, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74139249, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3930, + "time_per_iteration": 2.4964945316314697 + }, + { + "auxiliary_loss_clip": 0.01137432, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.02245224, + "balance_loss_mlp": 1.046561, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.489267518834959, + "language_loss": 0.73764896, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7594136, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3931, + "time_per_iteration": 2.6283528804779053 + }, + { + "auxiliary_loss_clip": 0.01140852, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.02245522, + "balance_loss_mlp": 1.04971111, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.06419219579993, + "language_loss": 0.8026945, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82450092, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3932, + "time_per_iteration": 2.4901018142700195 + }, + { + "auxiliary_loss_clip": 0.01138855, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02483916, + "balance_loss_mlp": 1.05032694, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5491195596348342, + "language_loss": 0.85760093, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87938541, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8828125, + "step": 3933, + "time_per_iteration": 2.5625483989715576 + }, + { + "auxiliary_loss_clip": 0.01146232, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.02008784, + "balance_loss_mlp": 1.0532943, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0322099534023685, + "language_loss": 0.8277775, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84961879, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3934, + "time_per_iteration": 2.512068748474121 + }, + { + "auxiliary_loss_clip": 0.01141394, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.04977798, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.774494675769988, + "language_loss": 0.7864846, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80827636, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 3935, + "time_per_iteration": 2.4996352195739746 + }, + { + "auxiliary_loss_clip": 0.01138141, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.01913905, + "balance_loss_mlp": 1.04973102, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7164724890649055, + "language_loss": 0.79656923, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81830108, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3936, + "time_per_iteration": 2.4868710041046143 + }, + { + "auxiliary_loss_clip": 0.01138439, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.0192436, + "balance_loss_mlp": 1.04798818, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4334555797897097, + "language_loss": 0.78783411, + "learning_rate": 3.568283198083826e-06, + "loss": 0.80958092, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3937, + "time_per_iteration": 2.499565362930298 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02244997, + "balance_loss_mlp": 1.04970455, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.078138882715826, + "language_loss": 0.85105085, + "learning_rate": 3.568041475462147e-06, + "loss": 0.8727901, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 3938, + "time_per_iteration": 2.449214220046997 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.0285933, + "balance_loss_mlp": 1.04824734, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.4851234695326423, + "language_loss": 0.93872499, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96052349, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3939, + "time_per_iteration": 2.415891647338867 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02598429, + "balance_loss_mlp": 1.04769599, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6764835140151866, + "language_loss": 0.8238095, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84565216, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3940, + "time_per_iteration": 2.47468900680542 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01042824, + "balance_loss_clip": 1.02592003, + "balance_loss_mlp": 1.04990602, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2107440191497054, + "language_loss": 0.88986713, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91174555, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3941, + "time_per_iteration": 2.455631971359253 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.02580976, + "balance_loss_mlp": 1.04538155, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.1526885300024072, + "language_loss": 0.84676927, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86856836, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3942, + "time_per_iteration": 2.43743634223938 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.04840159, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8547641010298248, + "language_loss": 0.80905575, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83091086, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.921875, + "step": 3943, + "time_per_iteration": 2.5058658123016357 + }, + { + "auxiliary_loss_clip": 0.01143585, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.02084267, + "balance_loss_mlp": 1.04731488, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.308079684052438, + "language_loss": 0.67493033, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69675827, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3944, + "time_per_iteration": 2.4276273250579834 + }, + { + "auxiliary_loss_clip": 0.01144217, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.02116549, + "balance_loss_mlp": 1.05084419, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 2.061169456768298, + "language_loss": 0.75421506, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77604151, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3945, + "time_per_iteration": 2.474323272705078 + }, + { + "auxiliary_loss_clip": 0.01137318, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02304149, + "balance_loss_mlp": 1.0469377, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.6081639136691026, + "language_loss": 0.63469779, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65646303, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3946, + "time_per_iteration": 2.5087931156158447 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.02186346, + "balance_loss_mlp": 1.04692435, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.27613511663784, + "language_loss": 0.77508283, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79684764, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 3947, + "time_per_iteration": 2.4716949462890625 + }, + { + "auxiliary_loss_clip": 0.01141281, + "auxiliary_loss_mlp": 0.0104192, + "balance_loss_clip": 1.02496827, + "balance_loss_mlp": 1.05008841, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.6255497375782806, + "language_loss": 0.80575311, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8275851, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3948, + "time_per_iteration": 2.5750784873962402 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.02715611, + "balance_loss_mlp": 1.04736018, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.0638215262656696, + "language_loss": 0.80578661, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82761467, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3949, + "time_per_iteration": 2.512665271759033 + }, + { + "auxiliary_loss_clip": 0.01138607, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.02382135, + "balance_loss_mlp": 1.04584646, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.8976071400358168, + "language_loss": 0.73124689, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75303924, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3950, + "time_per_iteration": 2.4842302799224854 + }, + { + "auxiliary_loss_clip": 0.01135058, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.01944709, + "balance_loss_mlp": 1.04712903, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.0688047231241247, + "language_loss": 0.73064256, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75233537, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3951, + "time_per_iteration": 2.5215439796447754 + }, + { + "auxiliary_loss_clip": 0.01144126, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.0507673, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7591828710207016, + "language_loss": 0.73658371, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75842535, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3952, + "time_per_iteration": 2.550182342529297 + }, + { + "auxiliary_loss_clip": 0.0113686, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.02213275, + "balance_loss_mlp": 1.04537988, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.6791264380286672, + "language_loss": 0.71064484, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73239112, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9140625, + "step": 3953, + "time_per_iteration": 2.530381202697754 + }, + { + "auxiliary_loss_clip": 0.01140701, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.02522552, + "balance_loss_mlp": 1.04806364, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9696108021357461, + "language_loss": 0.81686246, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83869451, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3954, + "time_per_iteration": 2.491629123687744 + }, + { + "auxiliary_loss_clip": 0.01141999, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_clip": 1.02915251, + "balance_loss_mlp": 1.04870319, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.155968963382196, + "language_loss": 0.65756261, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.67945445, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3955, + "time_per_iteration": 2.4659719467163086 + }, + { + "auxiliary_loss_clip": 0.01138242, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.04739583, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.3846492045019327, + "language_loss": 0.83788121, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85979581, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3956, + "time_per_iteration": 2.48734712600708 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.02471578, + "balance_loss_mlp": 1.04647636, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.1805686912335656, + "language_loss": 0.85228634, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.8740322, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3957, + "time_per_iteration": 2.50199294090271 + }, + { + "auxiliary_loss_clip": 0.01139099, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.02686596, + "balance_loss_mlp": 1.04807806, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0218180107915757, + "language_loss": 0.70133704, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72314632, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.91015625, + "step": 3958, + "time_per_iteration": 2.4798173904418945 + }, + { + "auxiliary_loss_clip": 0.01136893, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.02530742, + "balance_loss_mlp": 1.04581285, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 3.373562251556634, + "language_loss": 0.65834582, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68014508, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 3959, + "time_per_iteration": 2.4558637142181396 + }, + { + "auxiliary_loss_clip": 0.01138452, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.02565885, + "balance_loss_mlp": 1.04832602, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7230243338870097, + "language_loss": 0.72128749, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74308968, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3960, + "time_per_iteration": 2.4831748008728027 + }, + { + "auxiliary_loss_clip": 0.01139565, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.0249052, + "balance_loss_mlp": 1.04867244, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.8711627571775973, + "language_loss": 0.74181205, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7636202, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.91015625, + "step": 3961, + "time_per_iteration": 2.5167927742004395 + }, + { + "auxiliary_loss_clip": 0.01138898, + "auxiliary_loss_mlp": 0.0105126, + "balance_loss_clip": 1.03318763, + "balance_loss_mlp": 1.04605162, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.643011810367893, + "language_loss": 0.66067994, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68258154, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 3962, + "time_per_iteration": 2.4900338649749756 + }, + { + "auxiliary_loss_clip": 0.01138484, + "auxiliary_loss_mlp": 0.01050468, + "balance_loss_clip": 1.03387976, + "balance_loss_mlp": 1.04738379, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.7740384877146562, + "language_loss": 0.74581182, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76770139, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3963, + "time_per_iteration": 2.5409018993377686 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.02899039, + "balance_loss_mlp": 1.0498383, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.0190521185084753, + "language_loss": 0.76898873, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79087293, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3964, + "time_per_iteration": 2.492861270904541 + }, + { + "auxiliary_loss_clip": 0.01137102, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.02911341, + "balance_loss_mlp": 1.04792333, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.0459212281672956, + "language_loss": 0.71593058, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73775077, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 3965, + "time_per_iteration": 2.5120911598205566 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.03158259, + "balance_loss_mlp": 1.04674065, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.8902557347099018, + "language_loss": 0.78008091, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80190015, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3966, + "time_per_iteration": 2.4576594829559326 + }, + { + "auxiliary_loss_clip": 0.01135801, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.02656794, + "balance_loss_mlp": 1.04652119, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.8460709531404, + "language_loss": 0.68860286, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71038377, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.89453125, + "step": 3967, + "time_per_iteration": 2.484840154647827 + }, + { + "auxiliary_loss_clip": 0.01137019, + "auxiliary_loss_mlp": 0.01053581, + "balance_loss_clip": 1.03739274, + "balance_loss_mlp": 1.04645443, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 2.11266161128335, + "language_loss": 0.67849773, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70040375, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3968, + "time_per_iteration": 2.441445827484131 + }, + { + "auxiliary_loss_clip": 0.01134651, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.02318573, + "balance_loss_mlp": 1.0451827, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.8948052650888014, + "language_loss": 0.76742399, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78916001, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.89453125, + "step": 3969, + "time_per_iteration": 5.413191318511963 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.04734492, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.7243772241637263, + "language_loss": 0.76300085, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78475308, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3970, + "time_per_iteration": 2.4792258739471436 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02545094, + "balance_loss_mlp": 1.04645324, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 3.3207921386663584, + "language_loss": 0.85399735, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87580258, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3971, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01017218, + "balance_loss_clip": 1.01547742, + "balance_loss_mlp": 1.02590835, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7461637295582213, + "language_loss": 0.62814003, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64887029, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.29882812, + "step": 3972, + "time_per_iteration": 3.173640012741089 + }, + { + "auxiliary_loss_clip": 0.0113938, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.02119696, + "balance_loss_mlp": 1.04922092, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.9456864585596687, + "language_loss": 0.8170895, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.8388539, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90234375, + "step": 3973, + "time_per_iteration": 2.4529452323913574 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.02849591, + "balance_loss_mlp": 1.04869485, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.6994626560625323, + "language_loss": 0.79299271, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81481481, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 3974, + "time_per_iteration": 2.5395772457122803 + }, + { + "auxiliary_loss_clip": 0.01139215, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.04858148, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.8925619228877844, + "language_loss": 0.84428573, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86606121, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 3975, + "time_per_iteration": 2.430361032485962 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.02215612, + "balance_loss_mlp": 1.0471369, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.06825719132721, + "language_loss": 0.8375293, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85925817, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87890625, + "step": 3976, + "time_per_iteration": 2.480534791946411 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01798213, + "balance_loss_mlp": 1.04606938, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.747752931490835, + "language_loss": 0.74532628, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76697731, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8671875, + "step": 3977, + "time_per_iteration": 2.4641239643096924 + }, + { + "auxiliary_loss_clip": 0.01138905, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.02818775, + "balance_loss_mlp": 1.04930067, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6638092474338306, + "language_loss": 0.72395146, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74579227, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 3978, + "time_per_iteration": 2.5007903575897217 + }, + { + "auxiliary_loss_clip": 0.01143288, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.05204654, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.0169903221822683, + "language_loss": 0.78654587, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80840027, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3979, + "time_per_iteration": 2.5006349086761475 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.02769148, + "balance_loss_mlp": 1.04762173, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6987462202935262, + "language_loss": 0.81945407, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84125668, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 3980, + "time_per_iteration": 2.5287020206451416 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.01974702, + "balance_loss_mlp": 1.04967999, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.749461413213386, + "language_loss": 0.8401112, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86183953, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 3981, + "time_per_iteration": 2.466660261154175 + }, + { + "auxiliary_loss_clip": 0.01141626, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04951072, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 3.6241006318049864, + "language_loss": 0.76872683, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79059052, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 3982, + "time_per_iteration": 2.558145046234131 + }, + { + "auxiliary_loss_clip": 0.01135351, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.02811027, + "balance_loss_mlp": 1.04844236, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 6.059829142106342, + "language_loss": 0.77878481, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80057788, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 3983, + "time_per_iteration": 2.4443132877349854 + }, + { + "auxiliary_loss_clip": 0.01136897, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.02446592, + "balance_loss_mlp": 1.04759789, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9981470653963032, + "language_loss": 0.73163629, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75341582, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3984, + "time_per_iteration": 2.491344690322876 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.02860713, + "balance_loss_mlp": 1.04674625, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.064924146489818, + "language_loss": 0.79049474, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81232572, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3985, + "time_per_iteration": 2.4587738513946533 + }, + { + "auxiliary_loss_clip": 0.01139616, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04980683, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.0182764415160563, + "language_loss": 0.73312742, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7549386, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 3986, + "time_per_iteration": 2.5608811378479004 + }, + { + "auxiliary_loss_clip": 0.0114189, + "auxiliary_loss_mlp": 0.01051013, + "balance_loss_clip": 1.03495562, + "balance_loss_mlp": 1.04923773, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.2624046500679333, + "language_loss": 0.87836051, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90028954, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.92578125, + "step": 3987, + "time_per_iteration": 2.461778402328491 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.02161288, + "balance_loss_mlp": 1.04831004, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.3750633167266306, + "language_loss": 0.8308624, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85254467, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 3988, + "time_per_iteration": 2.4527788162231445 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.04686844, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.649806875732991, + "language_loss": 0.85145879, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87320346, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 3989, + "time_per_iteration": 2.43949031829834 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02249837, + "balance_loss_mlp": 1.04763699, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.0784071273800944, + "language_loss": 0.84493041, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86665809, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 3990, + "time_per_iteration": 2.4476051330566406 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.02507186, + "balance_loss_mlp": 1.0463922, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 3.585202907729512, + "language_loss": 0.75312221, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77485824, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 3991, + "time_per_iteration": 2.502324104309082 + }, + { + "auxiliary_loss_clip": 0.01050073, + "auxiliary_loss_mlp": 0.01009423, + "balance_loss_clip": 1.00774217, + "balance_loss_mlp": 1.02049088, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8894590829003932, + "language_loss": 0.63734841, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65794337, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.296875, + "step": 3992, + "time_per_iteration": 3.0017786026000977 + }, + { + "auxiliary_loss_clip": 0.01050397, + "auxiliary_loss_mlp": 0.01010168, + "balance_loss_clip": 1.00857067, + "balance_loss_mlp": 1.02071452, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7530514643625366, + "language_loss": 0.62963343, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65023899, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.296875, + "step": 3993, + "time_per_iteration": 3.176184892654419 + }, + { + "auxiliary_loss_clip": 0.01140668, + "auxiliary_loss_mlp": 0.01047015, + "balance_loss_clip": 1.03085065, + "balance_loss_mlp": 1.05099177, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.6383486345725178, + "language_loss": 0.76938868, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79126549, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3994, + "time_per_iteration": 2.4940826892852783 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.0305258, + "balance_loss_mlp": 1.04680216, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.7751147523393542, + "language_loss": 0.78457522, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80641341, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.890625, + "step": 3995, + "time_per_iteration": 2.5075032711029053 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01003894, + "balance_loss_clip": 1.00232053, + "balance_loss_mlp": 1.01837659, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8913570860108078, + "language_loss": 0.63479292, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65530908, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.29296875, + "step": 3996, + "time_per_iteration": 3.1365764141082764 + }, + { + "auxiliary_loss_clip": 0.01137569, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.02916384, + "balance_loss_mlp": 1.04678392, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.906997418482602, + "language_loss": 0.7009505, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72278345, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3997, + "time_per_iteration": 2.464714765548706 + }, + { + "auxiliary_loss_clip": 0.01134848, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.04642928, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.9831176119326495, + "language_loss": 0.87292743, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89470112, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3998, + "time_per_iteration": 2.4639480113983154 + }, + { + "auxiliary_loss_clip": 0.01134933, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02306032, + "balance_loss_mlp": 1.04208946, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.9745565965944727, + "language_loss": 0.75798607, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77972972, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3999, + "time_per_iteration": 2.4753127098083496 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02317488, + "balance_loss_mlp": 1.04545271, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.9306579449884984, + "language_loss": 0.72642016, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74812865, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.875, + "step": 4000, + "time_per_iteration": 2.5172412395477295 + }, + { + "auxiliary_loss_clip": 0.01140243, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02025795, + "balance_loss_mlp": 1.04728866, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.4587541869300824, + "language_loss": 0.65991902, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68169051, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4001, + "time_per_iteration": 2.511198043823242 + }, + { + "auxiliary_loss_clip": 0.01131233, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.02330589, + "balance_loss_mlp": 1.0427444, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.6796652593661903, + "language_loss": 0.82567388, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84739041, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4002, + "time_per_iteration": 2.5147531032562256 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.02382851, + "balance_loss_mlp": 1.04682446, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.8902513751119636, + "language_loss": 0.82875729, + "learning_rate": 3.552202383898897e-06, + "loss": 0.8505069, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4003, + "time_per_iteration": 2.508004665374756 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0214386, + "balance_loss_mlp": 1.04608846, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0497424292602835, + "language_loss": 0.87504768, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89677334, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4004, + "time_per_iteration": 2.4581985473632812 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.04228568, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9722136456468877, + "language_loss": 0.77630293, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79812533, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4005, + "time_per_iteration": 2.556365728378296 + }, + { + "auxiliary_loss_clip": 0.01130485, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02141535, + "balance_loss_mlp": 1.04398429, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.7295620858093623, + "language_loss": 0.78973985, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81141412, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4006, + "time_per_iteration": 2.460961103439331 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.02072108, + "balance_loss_mlp": 1.04375279, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 2.2017624810959346, + "language_loss": 0.71201313, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73377299, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 4007, + "time_per_iteration": 2.5169517993927 + }, + { + "auxiliary_loss_clip": 0.01131131, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.04453456, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.805972702734942, + "language_loss": 0.75857127, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7802788, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 4008, + "time_per_iteration": 2.4489922523498535 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01940203, + "balance_loss_mlp": 1.04296207, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.356516377050019, + "language_loss": 0.73922294, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76088601, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4009, + "time_per_iteration": 2.4701087474823 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.02664948, + "balance_loss_mlp": 1.04632092, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.636895821506206, + "language_loss": 0.79938453, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82113993, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4010, + "time_per_iteration": 3.9670608043670654 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01047202, + "balance_loss_clip": 1.02923679, + "balance_loss_mlp": 1.04108143, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.295886994366384, + "language_loss": 0.70799017, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72977829, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4011, + "time_per_iteration": 3.9544472694396973 + }, + { + "auxiliary_loss_clip": 0.01131445, + "auxiliary_loss_mlp": 0.01039733, + "balance_loss_clip": 1.02429593, + "balance_loss_mlp": 1.04258561, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6166610897431488, + "language_loss": 0.69062299, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71233475, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4012, + "time_per_iteration": 2.501347303390503 + }, + { + "auxiliary_loss_clip": 0.01133874, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02145839, + "balance_loss_mlp": 1.04454589, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.0861437601678303, + "language_loss": 0.73424822, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75598073, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4013, + "time_per_iteration": 2.6360883712768555 + }, + { + "auxiliary_loss_clip": 0.01133872, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.02110672, + "balance_loss_mlp": 1.04450822, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8416541794010313, + "language_loss": 0.88554955, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9072544, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4014, + "time_per_iteration": 2.4663264751434326 + }, + { + "auxiliary_loss_clip": 0.01137985, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.02643979, + "balance_loss_mlp": 1.04453659, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.755357499792604, + "language_loss": 0.94270647, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96452308, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 4015, + "time_per_iteration": 2.470952033996582 + }, + { + "auxiliary_loss_clip": 0.01133849, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.02174377, + "balance_loss_mlp": 1.04334664, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.8402084517778015, + "language_loss": 0.82513833, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84685838, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4016, + "time_per_iteration": 2.4922966957092285 + }, + { + "auxiliary_loss_clip": 0.01127395, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02423143, + "balance_loss_mlp": 1.04197156, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 3.4212830828584386, + "language_loss": 0.69553781, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71721268, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4017, + "time_per_iteration": 2.596977710723877 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02959788, + "balance_loss_mlp": 1.04421043, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 2.0038503347112084, + "language_loss": 0.85114455, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87296432, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 4018, + "time_per_iteration": 2.440749406814575 + }, + { + "auxiliary_loss_clip": 0.01046553, + "auxiliary_loss_mlp": 0.01012788, + "balance_loss_clip": 1.0109762, + "balance_loss_mlp": 1.01676679, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8182663934779763, + "language_loss": 0.60620981, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62680322, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.296875, + "step": 4019, + "time_per_iteration": 3.112665891647339 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.04433608, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.880182475838635, + "language_loss": 0.73690915, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75863391, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4020, + "time_per_iteration": 2.5049281120300293 + }, + { + "auxiliary_loss_clip": 0.01134711, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.02199471, + "balance_loss_mlp": 1.04660118, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.9671591580269927, + "language_loss": 0.82012737, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84185052, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4021, + "time_per_iteration": 2.464092493057251 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.03036344, + "balance_loss_mlp": 1.04551053, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.9434993168468309, + "language_loss": 0.76464498, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78650689, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.921875, + "step": 4022, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01140564, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.04610109, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.893594506248005, + "language_loss": 0.75172901, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77355558, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 4023, + "time_per_iteration": 2.442469358444214 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.0288136, + "balance_loss_mlp": 1.04636168, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.7406117596406352, + "language_loss": 0.81464303, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.83643848, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4024, + "time_per_iteration": 2.45035719871521 + }, + { + "auxiliary_loss_clip": 0.01134068, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.02951503, + "balance_loss_mlp": 1.0462923, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.8550338864746303, + "language_loss": 0.85851878, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88031757, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4025, + "time_per_iteration": 2.4191699028015137 + }, + { + "auxiliary_loss_clip": 0.01136643, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.03154194, + "balance_loss_mlp": 1.04397535, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.9498897834730646, + "language_loss": 0.71243072, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73428357, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 4026, + "time_per_iteration": 2.476792812347412 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.0259757, + "balance_loss_mlp": 1.04589748, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.8853181761927913, + "language_loss": 0.64215046, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66389644, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4027, + "time_per_iteration": 2.443652868270874 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.04601741, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.5479611354975007, + "language_loss": 0.70294374, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72468793, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.91015625, + "step": 4028, + "time_per_iteration": 2.48252534866333 + }, + { + "auxiliary_loss_clip": 0.01044866, + "auxiliary_loss_mlp": 0.01007457, + "balance_loss_clip": 1.00585961, + "balance_loss_mlp": 1.01464319, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8570499142131055, + "language_loss": 0.55407649, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57459968, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.30078125, + "step": 4029, + "time_per_iteration": 3.094402551651001 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.02567101, + "balance_loss_mlp": 1.04526591, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.019101437715354, + "language_loss": 0.73829788, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76008832, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90234375, + "step": 4030, + "time_per_iteration": 2.5176522731781006 + }, + { + "auxiliary_loss_clip": 0.01135714, + "auxiliary_loss_mlp": 0.01053146, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.04541922, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.062987020241499, + "language_loss": 0.76440287, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 4031, + "time_per_iteration": 2.4774179458618164 + }, + { + "auxiliary_loss_clip": 0.01140068, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_clip": 1.02974856, + "balance_loss_mlp": 1.0464952, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 7.078640241023749, + "language_loss": 0.65947008, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68133402, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 4032, + "time_per_iteration": 2.500488519668579 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.04175007, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 3.1167913511387995, + "language_loss": 0.81353086, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83530146, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4033, + "time_per_iteration": 2.434652805328369 + }, + { + "auxiliary_loss_clip": 0.0113019, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.02310205, + "balance_loss_mlp": 1.04302979, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.0372289343003023, + "language_loss": 0.69200158, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71369547, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4034, + "time_per_iteration": 2.583693027496338 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.01868999, + "balance_loss_mlp": 1.04278564, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.4913709616978554, + "language_loss": 0.95772272, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.97941571, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4035, + "time_per_iteration": 2.4757437705993652 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01047233, + "balance_loss_clip": 1.03220701, + "balance_loss_mlp": 1.04172754, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.0212510419571794, + "language_loss": 0.77875686, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80049908, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4036, + "time_per_iteration": 2.5642547607421875 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.01043471, + "balance_loss_clip": 1.02642441, + "balance_loss_mlp": 1.04447269, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.648393445666421, + "language_loss": 0.74427915, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76606166, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4037, + "time_per_iteration": 2.4529507160186768 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.02497733, + "balance_loss_mlp": 1.04398596, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.7681997598872656, + "language_loss": 0.76223898, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78399336, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4038, + "time_per_iteration": 2.4618003368377686 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_clip": 1.02742147, + "balance_loss_mlp": 1.04415751, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.0050890767905645, + "language_loss": 0.72632921, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74812889, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 4039, + "time_per_iteration": 2.4261560440063477 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.02631593, + "balance_loss_mlp": 1.04608393, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6810247735848671, + "language_loss": 0.78330719, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80509198, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4040, + "time_per_iteration": 2.4808037281036377 + }, + { + "auxiliary_loss_clip": 0.01128006, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.01912999, + "balance_loss_mlp": 1.04237986, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8145876332629047, + "language_loss": 0.80390251, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82552278, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4041, + "time_per_iteration": 2.482576847076416 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.02769041, + "balance_loss_mlp": 1.04653025, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 4.455498217071982, + "language_loss": 0.76670969, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78848314, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4042, + "time_per_iteration": 2.4944398403167725 + }, + { + "auxiliary_loss_clip": 0.01130826, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.01815128, + "balance_loss_mlp": 1.04393744, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7591863299055037, + "language_loss": 0.8139993, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83563864, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 4043, + "time_per_iteration": 2.4965035915374756 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.02053475, + "balance_loss_mlp": 1.04298007, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.448799092011911, + "language_loss": 0.73345625, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75519013, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 4044, + "time_per_iteration": 2.42809796333313 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.0252496, + "balance_loss_mlp": 1.04730773, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.780616714891853, + "language_loss": 0.83562207, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85740674, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4045, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.02944136, + "balance_loss_mlp": 1.04542089, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 2.1598753545738663, + "language_loss": 0.86787856, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88967973, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4046, + "time_per_iteration": 2.5126357078552246 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.02526259, + "balance_loss_mlp": 1.04252553, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.9885516182116696, + "language_loss": 0.7281425, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4047, + "time_per_iteration": 2.4886271953582764 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.02243662, + "balance_loss_mlp": 1.0435816, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.411807088840578, + "language_loss": 0.72845596, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75018883, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4048, + "time_per_iteration": 2.522012710571289 + }, + { + "auxiliary_loss_clip": 0.01132229, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.02720952, + "balance_loss_mlp": 1.04504991, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 4.923738678144707, + "language_loss": 0.72984087, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75158751, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.875, + "step": 4049, + "time_per_iteration": 2.4399380683898926 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01041944, + "balance_loss_clip": 1.02654243, + "balance_loss_mlp": 1.04297137, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 6.058583880667159, + "language_loss": 0.7388249, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.760535, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4050, + "time_per_iteration": 2.4589998722076416 + }, + { + "auxiliary_loss_clip": 0.01128476, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02925062, + "balance_loss_mlp": 1.04373455, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.083460080669968, + "language_loss": 0.74948591, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77121294, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4051, + "time_per_iteration": 2.4284183979034424 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02251768, + "balance_loss_mlp": 1.04273975, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.420510968298769, + "language_loss": 0.70638204, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72805327, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4052, + "time_per_iteration": 5.468756675720215 + }, + { + "auxiliary_loss_clip": 0.01131368, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.03215313, + "balance_loss_mlp": 1.04370522, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.61331134721481, + "language_loss": 0.81265736, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83445215, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.875, + "step": 4053, + "time_per_iteration": 2.5280394554138184 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01895714, + "balance_loss_mlp": 1.04522192, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5461481286352234, + "language_loss": 0.77842951, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80013186, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4054, + "time_per_iteration": 2.424604892730713 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.02990091, + "balance_loss_mlp": 1.04097724, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.616998838355979, + "language_loss": 0.83784473, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85957456, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4055, + "time_per_iteration": 2.4814612865448 + }, + { + "auxiliary_loss_clip": 0.0113426, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02194405, + "balance_loss_mlp": 1.04221749, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.407480313131798, + "language_loss": 0.55291057, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57463974, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 4056, + "time_per_iteration": 2.5356216430664062 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02626896, + "balance_loss_mlp": 1.04361272, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.24663888381965, + "language_loss": 0.79832959, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82009363, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4057, + "time_per_iteration": 2.4915707111358643 + }, + { + "auxiliary_loss_clip": 0.01128391, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.02195764, + "balance_loss_mlp": 1.04218984, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.7432058239394113, + "language_loss": 0.78817719, + "learning_rate": 3.538605738554673e-06, + "loss": 0.80983889, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4058, + "time_per_iteration": 2.426687002182007 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.02366126, + "balance_loss_mlp": 1.04273307, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.688831116872718, + "language_loss": 0.85133582, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 4059, + "time_per_iteration": 2.499464511871338 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.04288411, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6494662829711617, + "language_loss": 0.73770267, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.75933278, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4060, + "time_per_iteration": 2.4955050945281982 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.03262711, + "balance_loss_mlp": 1.04506934, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.8597953216817902, + "language_loss": 0.73587501, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75775993, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.92578125, + "step": 4061, + "time_per_iteration": 2.5002825260162354 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.02248669, + "balance_loss_mlp": 1.04437923, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6775055914479682, + "language_loss": 0.76006806, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78173012, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8515625, + "step": 4062, + "time_per_iteration": 2.478625535964966 + }, + { + "auxiliary_loss_clip": 0.01126984, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.04376316, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.7282475931571, + "language_loss": 0.85710216, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87872803, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4063, + "time_per_iteration": 2.5161943435668945 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.02603722, + "balance_loss_mlp": 1.04589176, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 6.32752237165424, + "language_loss": 0.68127096, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70305437, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4064, + "time_per_iteration": 2.4434523582458496 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02083397, + "balance_loss_mlp": 1.04318714, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5178524812834733, + "language_loss": 0.7003206, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72204536, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4065, + "time_per_iteration": 2.513827085494995 + }, + { + "auxiliary_loss_clip": 0.01136726, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.02960134, + "balance_loss_mlp": 1.04461718, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.0517728790430048, + "language_loss": 0.83912247, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86096847, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4066, + "time_per_iteration": 2.4601314067840576 + }, + { + "auxiliary_loss_clip": 0.01053849, + "auxiliary_loss_mlp": 0.01006665, + "balance_loss_clip": 1.00455475, + "balance_loss_mlp": 1.02389407, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7387464995159381, + "language_loss": 0.52291965, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54352474, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.29882812, + "step": 4067, + "time_per_iteration": 2.9973862171173096 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.0242008, + "balance_loss_mlp": 1.04483843, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.022186633601072, + "language_loss": 0.71927387, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74101913, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4068, + "time_per_iteration": 2.4484708309173584 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.02457666, + "balance_loss_mlp": 1.04505873, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.494083672668599, + "language_loss": 0.77513826, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79687262, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4069, + "time_per_iteration": 2.5724000930786133 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.03101087, + "balance_loss_mlp": 1.04646873, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.9755919994455295, + "language_loss": 0.80163878, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82344782, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4070, + "time_per_iteration": 2.4932186603546143 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.03018379, + "balance_loss_mlp": 1.04351497, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6271146290001441, + "language_loss": 0.8410303, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86279482, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.875, + "step": 4071, + "time_per_iteration": 2.5299296379089355 + }, + { + "auxiliary_loss_clip": 0.0113627, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.02795792, + "balance_loss_mlp": 1.04406631, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.720640728536457, + "language_loss": 0.79751229, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81932867, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4072, + "time_per_iteration": 2.470327854156494 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.02906084, + "balance_loss_mlp": 1.04437995, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.615929332251483, + "language_loss": 0.70322561, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7249524, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4073, + "time_per_iteration": 2.4951980113983154 + }, + { + "auxiliary_loss_clip": 0.01129351, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.02662683, + "balance_loss_mlp": 1.04456043, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.5968867848691133, + "language_loss": 0.67692697, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69863164, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4074, + "time_per_iteration": 2.4697325229644775 + }, + { + "auxiliary_loss_clip": 0.01052266, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00504053, + "balance_loss_mlp": 1.0222578, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.896032421619399, + "language_loss": 0.68665123, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70724261, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.30078125, + "step": 4075, + "time_per_iteration": 3.1993846893310547 + }, + { + "auxiliary_loss_clip": 0.01131428, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.02548659, + "balance_loss_mlp": 1.04603517, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.243483207404797, + "language_loss": 0.79306483, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81478369, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4076, + "time_per_iteration": 2.542245388031006 + }, + { + "auxiliary_loss_clip": 0.01134594, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.02749884, + "balance_loss_mlp": 1.04342794, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.0630196459837618, + "language_loss": 0.82211018, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84390688, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 4077, + "time_per_iteration": 2.5165140628814697 + }, + { + "auxiliary_loss_clip": 0.01132098, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02695799, + "balance_loss_mlp": 1.04380083, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 12.782264679420269, + "language_loss": 0.61930454, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64107114, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4078, + "time_per_iteration": 2.5202372074127197 + }, + { + "auxiliary_loss_clip": 0.01129452, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02035594, + "balance_loss_mlp": 1.04474652, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.7044874550491866, + "language_loss": 0.75514519, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77679932, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4079, + "time_per_iteration": 2.483339309692383 + }, + { + "auxiliary_loss_clip": 0.01129188, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.04370368, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8257477744529516, + "language_loss": 0.74925131, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77097261, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 4080, + "time_per_iteration": 2.4843389987945557 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.04129529, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 2.211780780293779, + "language_loss": 0.82807517, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84972572, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4081, + "time_per_iteration": 2.4753835201263428 + }, + { + "auxiliary_loss_clip": 0.01128982, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.04313576, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 4.1574914526272515, + "language_loss": 0.73153239, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75321424, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4082, + "time_per_iteration": 2.5975396633148193 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02837873, + "balance_loss_mlp": 1.04274178, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.543838453785988, + "language_loss": 0.71628594, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73798621, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84375, + "step": 4083, + "time_per_iteration": 2.471519947052002 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.02285206, + "balance_loss_mlp": 1.04234004, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.1941070650453094, + "language_loss": 0.74700832, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76872808, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4084, + "time_per_iteration": 2.4286506175994873 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01986289, + "balance_loss_mlp": 1.04189909, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.042106499003273, + "language_loss": 0.85206825, + "learning_rate": 3.531866337826471e-06, + "loss": 0.8736847, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4085, + "time_per_iteration": 2.4283318519592285 + }, + { + "auxiliary_loss_clip": 0.01130256, + "auxiliary_loss_mlp": 0.01048422, + "balance_loss_clip": 1.03209007, + "balance_loss_mlp": 1.04266381, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.8090063737063005, + "language_loss": 0.7876097, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.80939639, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4086, + "time_per_iteration": 2.478954792022705 + }, + { + "auxiliary_loss_clip": 0.01126651, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02669752, + "balance_loss_mlp": 1.04330873, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.6669278195562474, + "language_loss": 0.75269985, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77438211, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4087, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.01132319, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.02364135, + "balance_loss_mlp": 1.04574418, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.5789657141026, + "language_loss": 0.79284519, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81457937, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8671875, + "step": 4088, + "time_per_iteration": 2.479841709136963 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.0196631, + "balance_loss_mlp": 1.04091823, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6187757849670203, + "language_loss": 0.7736612, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79523408, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.828125, + "step": 4089, + "time_per_iteration": 2.483436346054077 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02440071, + "balance_loss_mlp": 1.04232669, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 3.8690522662716416, + "language_loss": 0.81463957, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83634108, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4090, + "time_per_iteration": 2.657944917678833 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03265369, + "balance_loss_mlp": 1.04411578, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.9398667366019489, + "language_loss": 0.72874928, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75057453, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.88671875, + "step": 4091, + "time_per_iteration": 2.448307991027832 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.02166772, + "balance_loss_mlp": 1.04811478, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9209724672120978, + "language_loss": 0.76486623, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78656, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4092, + "time_per_iteration": 2.510815143585205 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.04404068, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.544549098738024, + "language_loss": 0.80905128, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83075017, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4093, + "time_per_iteration": 2.4658117294311523 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.04285693, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.9793331271335382, + "language_loss": 0.87355959, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89532292, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4094, + "time_per_iteration": 5.436578035354614 + }, + { + "auxiliary_loss_clip": 0.01055645, + "auxiliary_loss_mlp": 0.01004731, + "balance_loss_clip": 1.00285995, + "balance_loss_mlp": 1.02449679, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7591937233735362, + "language_loss": 0.57501638, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59562016, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.3125, + "step": 4095, + "time_per_iteration": 3.1966967582702637 + }, + { + "auxiliary_loss_clip": 0.01055105, + "auxiliary_loss_mlp": 0.01001708, + "balance_loss_clip": 0.99987203, + "balance_loss_mlp": 1.02336812, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.643968481445629, + "language_loss": 0.56195372, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58252186, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.31640625, + "step": 4096, + "time_per_iteration": 3.187084436416626 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.04697204, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 2.0390556104017907, + "language_loss": 0.77674699, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79844701, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4097, + "time_per_iteration": 2.5585436820983887 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.02608228, + "balance_loss_mlp": 1.04491377, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.135816170269485, + "language_loss": 0.76393569, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78572309, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.890625, + "step": 4098, + "time_per_iteration": 2.478665828704834 + }, + { + "auxiliary_loss_clip": 0.01133268, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.02568507, + "balance_loss_mlp": 1.04479909, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.152719854213413, + "language_loss": 0.68733507, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70907569, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 4099, + "time_per_iteration": 2.515821933746338 + }, + { + "auxiliary_loss_clip": 0.01124761, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02621734, + "balance_loss_mlp": 1.04163074, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.459538616056665, + "language_loss": 0.65975124, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68141258, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4100, + "time_per_iteration": 2.562962532043457 + }, + { + "auxiliary_loss_clip": 0.01051305, + "auxiliary_loss_mlp": 0.01002462, + "balance_loss_clip": 1.00055432, + "balance_loss_mlp": 1.02057505, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7078763540659354, + "language_loss": 0.61549371, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63603139, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.30859375, + "step": 4101, + "time_per_iteration": 3.1617352962493896 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.01893687, + "balance_loss_mlp": 1.04385781, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7154022892986804, + "language_loss": 0.73020113, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75183737, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4102, + "time_per_iteration": 2.5522637367248535 + }, + { + "auxiliary_loss_clip": 0.01132375, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02463281, + "balance_loss_mlp": 1.04294777, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.2979425011191528, + "language_loss": 0.75574934, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.7774744, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4103, + "time_per_iteration": 2.5117204189300537 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.02647424, + "balance_loss_mlp": 1.04096079, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.002646106823912, + "language_loss": 0.78701174, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80874026, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4104, + "time_per_iteration": 2.5791869163513184 + }, + { + "auxiliary_loss_clip": 0.011264, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02333593, + "balance_loss_mlp": 1.0411272, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.7283937272898544, + "language_loss": 0.83567655, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85735631, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.8515625, + "step": 4105, + "time_per_iteration": 2.447399854660034 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02768457, + "balance_loss_mlp": 1.04806173, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7373974977996043, + "language_loss": 0.7646578, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78643101, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4106, + "time_per_iteration": 2.519059658050537 + }, + { + "auxiliary_loss_clip": 0.01127139, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.02539492, + "balance_loss_mlp": 1.04087114, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.1665884513414513, + "language_loss": 0.72764528, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74933887, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4107, + "time_per_iteration": 2.4489266872406006 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.0454886, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.3712774609847274, + "language_loss": 0.65420353, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67600369, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4108, + "time_per_iteration": 2.5401792526245117 + }, + { + "auxiliary_loss_clip": 0.01131766, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.02961504, + "balance_loss_mlp": 1.04324555, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.174268382145969, + "language_loss": 0.72611141, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74788952, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4109, + "time_per_iteration": 2.593358278274536 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02788687, + "balance_loss_mlp": 1.04414606, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.7026194733932167, + "language_loss": 0.79302657, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81480682, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4110, + "time_per_iteration": 2.4776864051818848 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01618171, + "balance_loss_mlp": 1.04541993, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.5002063230568545, + "language_loss": 0.80653715, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.82819968, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4111, + "time_per_iteration": 2.4957237243652344 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.02517819, + "balance_loss_mlp": 1.04273677, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 2.4547784256207663, + "language_loss": 0.75205207, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77375102, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4112, + "time_per_iteration": 2.481778860092163 + }, + { + "auxiliary_loss_clip": 0.01130648, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02868617, + "balance_loss_mlp": 1.04366612, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9927491285660106, + "language_loss": 0.82454932, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.8462984, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4113, + "time_per_iteration": 2.4658617973327637 + }, + { + "auxiliary_loss_clip": 0.01129834, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.0205375, + "balance_loss_mlp": 1.0423646, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.834925175676511, + "language_loss": 0.87073094, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89239764, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4114, + "time_per_iteration": 2.4575555324554443 + }, + { + "auxiliary_loss_clip": 0.01130204, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.02274156, + "balance_loss_mlp": 1.04354906, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.804779626044085, + "language_loss": 0.753479, + "learning_rate": 3.524328457352734e-06, + "loss": 0.7751627, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4115, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01002125, + "balance_loss_clip": 1.00016963, + "balance_loss_mlp": 1.02261877, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6664049604648837, + "language_loss": 0.58203655, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60258663, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30273438, + "step": 4116, + "time_per_iteration": 3.172032117843628 + }, + { + "auxiliary_loss_clip": 0.01130845, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.01992679, + "balance_loss_mlp": 1.04510772, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.6806447251481575, + "language_loss": 0.83616889, + "learning_rate": 3.523824079451235e-06, + "loss": 0.8578285, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.85546875, + "step": 4117, + "time_per_iteration": 2.5228748321533203 + }, + { + "auxiliary_loss_clip": 0.01053619, + "auxiliary_loss_mlp": 0.0100274, + "balance_loss_clip": 1.00073707, + "balance_loss_mlp": 1.02337885, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9069522642789956, + "language_loss": 0.63507527, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65563887, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30078125, + "step": 4118, + "time_per_iteration": 2.9459333419799805 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02470684, + "balance_loss_mlp": 1.04544902, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5050779056214143, + "language_loss": 0.79252797, + "learning_rate": 3.523319470415491e-06, + "loss": 0.8142485, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4119, + "time_per_iteration": 2.438519239425659 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.02359676, + "balance_loss_mlp": 1.04430819, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.9430586352888408, + "language_loss": 0.73955107, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76124215, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4120, + "time_per_iteration": 2.4728164672851562 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.03010893, + "balance_loss_mlp": 1.0446558, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 3.4886461941998563, + "language_loss": 0.88028777, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90208006, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4121, + "time_per_iteration": 2.4117653369903564 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.02102745, + "balance_loss_mlp": 1.04516518, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.7360865086006285, + "language_loss": 0.69088298, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71260709, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4122, + "time_per_iteration": 2.484830617904663 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.02224231, + "balance_loss_mlp": 1.04380226, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.016808492688271, + "language_loss": 0.80196065, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82369387, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.890625, + "step": 4123, + "time_per_iteration": 2.43839955329895 + }, + { + "auxiliary_loss_clip": 0.01130784, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.02620113, + "balance_loss_mlp": 1.04464054, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.3250466211888745, + "language_loss": 0.74919629, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77091914, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 4124, + "time_per_iteration": 2.4909141063690186 + }, + { + "auxiliary_loss_clip": 0.01127616, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.01922846, + "balance_loss_mlp": 1.0432241, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.6909299882519486, + "language_loss": 0.73759794, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75921559, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4125, + "time_per_iteration": 2.6068458557128906 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.02914929, + "balance_loss_mlp": 1.04383993, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.376019449241759, + "language_loss": 0.69416726, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71598125, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4126, + "time_per_iteration": 2.4516806602478027 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.01047803, + "balance_loss_clip": 1.03112614, + "balance_loss_mlp": 1.04299593, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.081795572279456, + "language_loss": 0.81602275, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83780402, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4127, + "time_per_iteration": 2.482492446899414 + }, + { + "auxiliary_loss_clip": 0.01134053, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04527378, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 5.2721581441441465, + "language_loss": 0.84604752, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86784381, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.88671875, + "step": 4128, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02779055, + "balance_loss_mlp": 1.04397762, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 3.598051635390234, + "language_loss": 0.65576231, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67752188, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4129, + "time_per_iteration": 2.498321294784546 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.04308498, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 2.23477186449736, + "language_loss": 0.75251818, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77425677, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4130, + "time_per_iteration": 2.534014940261841 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.03198647, + "balance_loss_mlp": 1.04404271, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.282827015603824, + "language_loss": 0.77323985, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79505157, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4131, + "time_per_iteration": 2.3971383571624756 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02241063, + "balance_loss_mlp": 1.0428257, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5452946340590639, + "language_loss": 0.83932686, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86097032, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.84375, + "step": 4132, + "time_per_iteration": 2.552804470062256 + }, + { + "auxiliary_loss_clip": 0.01129759, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02601552, + "balance_loss_mlp": 1.04280567, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.4707160060639857, + "language_loss": 0.71077073, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73249108, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4133, + "time_per_iteration": 2.40258526802063 + }, + { + "auxiliary_loss_clip": 0.01138495, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.02486503, + "balance_loss_mlp": 1.0454644, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.206352055564895, + "language_loss": 0.61492884, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63675898, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9296875, + "step": 4134, + "time_per_iteration": 2.476027250289917 + }, + { + "auxiliary_loss_clip": 0.01133349, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.02063298, + "balance_loss_mlp": 1.04393268, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.276340033899988, + "language_loss": 0.78899026, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81069505, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4135, + "time_per_iteration": 3.9668710231781006 + }, + { + "auxiliary_loss_clip": 0.01136879, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 1.04908156, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.12923907223803, + "language_loss": 0.82729924, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84898853, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.87890625, + "step": 4136, + "time_per_iteration": 3.8651821613311768 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02919221, + "balance_loss_mlp": 1.04593039, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7063584090687087, + "language_loss": 0.70454097, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72635514, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4137, + "time_per_iteration": 2.581270456314087 + }, + { + "auxiliary_loss_clip": 0.01135031, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.0193553, + "balance_loss_mlp": 1.04428291, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.0340803052703236, + "language_loss": 0.66840076, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69010115, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4138, + "time_per_iteration": 2.438858985900879 + }, + { + "auxiliary_loss_clip": 0.01130089, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01977062, + "balance_loss_mlp": 1.0451256, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.467393625239628, + "language_loss": 0.83937073, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86102176, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4139, + "time_per_iteration": 2.4858012199401855 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.04416132, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.5320149755260415, + "language_loss": 0.7864905, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80825365, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4140, + "time_per_iteration": 2.4608240127563477 + }, + { + "auxiliary_loss_clip": 0.01058216, + "auxiliary_loss_mlp": 0.01013447, + "balance_loss_clip": 1.01150382, + "balance_loss_mlp": 1.02780879, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8230161703115366, + "language_loss": 0.60980695, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63052356, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.3046875, + "step": 4141, + "time_per_iteration": 3.1306700706481934 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02537298, + "balance_loss_mlp": 1.04692519, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.804889663143828, + "language_loss": 0.72997624, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75176597, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 4142, + "time_per_iteration": 2.60341215133667 + }, + { + "auxiliary_loss_clip": 0.011336, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02335465, + "balance_loss_mlp": 1.04601634, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 2.0852522280017873, + "language_loss": 0.80985868, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83158958, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4143, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.04291701, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.8417531415701045, + "language_loss": 0.5884496, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61008459, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4144, + "time_per_iteration": 2.5253236293792725 + }, + { + "auxiliary_loss_clip": 0.0113091, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.02135301, + "balance_loss_mlp": 1.04400194, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.2350400575734146, + "language_loss": 0.78882402, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81050527, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4145, + "time_per_iteration": 2.500868797302246 + }, + { + "auxiliary_loss_clip": 0.01141282, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.02981293, + "balance_loss_mlp": 1.04593182, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.0986803435557415, + "language_loss": 0.65651333, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.678424, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.953125, + "step": 4146, + "time_per_iteration": 2.482405424118042 + }, + { + "auxiliary_loss_clip": 0.01048172, + "auxiliary_loss_mlp": 0.01006681, + "balance_loss_clip": 1.00482178, + "balance_loss_mlp": 1.01849687, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 3.0854856510049458, + "language_loss": 0.67327654, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69382501, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.296875, + "step": 4147, + "time_per_iteration": 3.1769258975982666 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.02930617, + "balance_loss_mlp": 1.04857254, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 2.0368820911017025, + "language_loss": 0.8893261, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91115361, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4148, + "time_per_iteration": 2.5202085971832275 + }, + { + "auxiliary_loss_clip": 0.0113885, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02649307, + "balance_loss_mlp": 1.04754162, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8605307211390085, + "language_loss": 0.68053228, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70237827, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9140625, + "step": 4149, + "time_per_iteration": 2.455733060836792 + }, + { + "auxiliary_loss_clip": 0.01133288, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02291596, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 2.99652773874907, + "language_loss": 0.71235985, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73408163, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4150, + "time_per_iteration": 2.514190196990967 + }, + { + "auxiliary_loss_clip": 0.01134014, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.03256035, + "balance_loss_mlp": 1.04471052, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.862035570914478, + "language_loss": 0.72954226, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75137556, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4151, + "time_per_iteration": 2.4198975563049316 + }, + { + "auxiliary_loss_clip": 0.01141172, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.03213382, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 4.099427504771762, + "language_loss": 0.62436807, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64627266, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94140625, + "step": 4152, + "time_per_iteration": 2.563032865524292 + }, + { + "auxiliary_loss_clip": 0.01131413, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.02188039, + "balance_loss_mlp": 1.04631066, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.3735561607913596, + "language_loss": 0.77219248, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79388708, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4153, + "time_per_iteration": 2.5059967041015625 + }, + { + "auxiliary_loss_clip": 0.01132512, + "auxiliary_loss_mlp": 0.01040076, + "balance_loss_clip": 1.0242573, + "balance_loss_mlp": 1.04642224, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.164577963489155, + "language_loss": 0.76443702, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78616285, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4154, + "time_per_iteration": 2.48317551612854 + }, + { + "auxiliary_loss_clip": 0.01138697, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.02586532, + "balance_loss_mlp": 1.04451203, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.2000943153895722, + "language_loss": 0.70740849, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72924054, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 4155, + "time_per_iteration": 2.498227834701538 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.03379464, + "balance_loss_mlp": 1.04736114, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.8252469259439843, + "language_loss": 0.7499637, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77184427, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4156, + "time_per_iteration": 2.473536729812622 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.0244987, + "balance_loss_mlp": 1.04498601, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.1247768054564333, + "language_loss": 0.76757634, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78929752, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4157, + "time_per_iteration": 2.476402759552002 + }, + { + "auxiliary_loss_clip": 0.01135567, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02821517, + "balance_loss_mlp": 1.04551077, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6430173172536622, + "language_loss": 0.81497854, + "learning_rate": 3.513433506130942e-06, + "loss": 0.8367821, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4158, + "time_per_iteration": 2.4706146717071533 + }, + { + "auxiliary_loss_clip": 0.01134661, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01533163, + "balance_loss_mlp": 1.04511046, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.425058111765743, + "language_loss": 0.75573325, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77739644, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.89453125, + "step": 4159, + "time_per_iteration": 2.447530746459961 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.01928759, + "balance_loss_mlp": 1.04643881, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.3851333770237044, + "language_loss": 0.71434534, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73608989, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 4160, + "time_per_iteration": 2.4909448623657227 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01008394, + "balance_loss_clip": 1.0062604, + "balance_loss_mlp": 1.01615632, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7574731626167057, + "language_loss": 0.56755257, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58809221, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.29492188, + "step": 4161, + "time_per_iteration": 3.1169064044952393 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04854345, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.822598728260487, + "language_loss": 0.8071059, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82899845, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 4162, + "time_per_iteration": 2.4679477214813232 + }, + { + "auxiliary_loss_clip": 0.01136921, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02212739, + "balance_loss_mlp": 1.04364812, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.543272880301035, + "language_loss": 0.87439299, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89615595, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 4163, + "time_per_iteration": 2.411324977874756 + }, + { + "auxiliary_loss_clip": 0.01135069, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.02013874, + "balance_loss_mlp": 1.04609334, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.8835095650007205, + "language_loss": 0.83242726, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85414505, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4164, + "time_per_iteration": 2.4910058975219727 + }, + { + "auxiliary_loss_clip": 0.01130392, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_clip": 1.03235698, + "balance_loss_mlp": 1.04616356, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.7333709529875627, + "language_loss": 0.74548686, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76726139, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 4165, + "time_per_iteration": 2.4566714763641357 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.02819216, + "balance_loss_mlp": 1.04689348, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 5.301488379412456, + "language_loss": 0.74214685, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76400197, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4166, + "time_per_iteration": 2.462092161178589 + }, + { + "auxiliary_loss_clip": 0.01134276, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.02898526, + "balance_loss_mlp": 1.04551435, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.9752225074857819, + "language_loss": 0.82011521, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84191239, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4167, + "time_per_iteration": 2.482534885406494 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.0280689, + "balance_loss_mlp": 1.04616201, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.42679689243218, + "language_loss": 0.79602242, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81781083, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4168, + "time_per_iteration": 2.463700532913208 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.02353752, + "balance_loss_mlp": 1.04523754, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.966293758738445, + "language_loss": 0.70029891, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72211224, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9375, + "step": 4169, + "time_per_iteration": 2.6148693561553955 + }, + { + "auxiliary_loss_clip": 0.01131562, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.02926338, + "balance_loss_mlp": 1.0446701, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8138505316100015, + "language_loss": 0.77564663, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79741603, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4170, + "time_per_iteration": 2.522921562194824 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.02731323, + "balance_loss_mlp": 1.04796529, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.4512078878938404, + "language_loss": 0.76246989, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78427839, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8984375, + "step": 4171, + "time_per_iteration": 2.4322195053100586 + }, + { + "auxiliary_loss_clip": 0.01046694, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.01739454, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8497756598481241, + "language_loss": 0.60047227, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62115091, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29296875, + "step": 4172, + "time_per_iteration": 3.1110994815826416 + }, + { + "auxiliary_loss_clip": 0.01137052, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.04652381, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4442293166181488, + "language_loss": 0.78647727, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80827463, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 4173, + "time_per_iteration": 2.481062889099121 + }, + { + "auxiliary_loss_clip": 0.01140203, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.0300796, + "balance_loss_mlp": 1.05017626, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.4202296115923883, + "language_loss": 0.83543748, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85730493, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4174, + "time_per_iteration": 2.4566147327423096 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02299595, + "balance_loss_mlp": 1.04786515, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.0903096624482624, + "language_loss": 0.71291864, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73470795, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90234375, + "step": 4175, + "time_per_iteration": 2.4616360664367676 + }, + { + "auxiliary_loss_clip": 0.01138348, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.01866269, + "balance_loss_mlp": 1.0460453, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.247188920587568, + "language_loss": 0.80564427, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82739055, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4176, + "time_per_iteration": 2.525686740875244 + }, + { + "auxiliary_loss_clip": 0.01138723, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.02553427, + "balance_loss_mlp": 1.04782593, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.362252442770041, + "language_loss": 0.83099151, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8528198, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.90625, + "step": 4177, + "time_per_iteration": 5.424759387969971 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02675891, + "balance_loss_mlp": 1.04777622, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.9753996759374846, + "language_loss": 0.8209883, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84278357, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87890625, + "step": 4178, + "time_per_iteration": 2.451418161392212 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04445124, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.6671564243834505, + "language_loss": 0.75406277, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77579463, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4179, + "time_per_iteration": 2.4710347652435303 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.03050375, + "balance_loss_mlp": 1.04526711, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.189208999533023, + "language_loss": 0.70452499, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72636557, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.90625, + "step": 4180, + "time_per_iteration": 2.433922290802002 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02485168, + "balance_loss_mlp": 1.04449701, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 2.0603947372587244, + "language_loss": 0.85379761, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.875539, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4181, + "time_per_iteration": 2.4513771533966064 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01042648, + "balance_loss_clip": 1.02608991, + "balance_loss_mlp": 1.0464716, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.9568163341605829, + "language_loss": 0.67662674, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69840884, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4182, + "time_per_iteration": 2.588513135910034 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.01008874, + "balance_loss_clip": 1.00675201, + "balance_loss_mlp": 1.01742792, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8484678873575391, + "language_loss": 0.70098495, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72154456, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.296875, + "step": 4183, + "time_per_iteration": 3.0990090370178223 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02088118, + "balance_loss_mlp": 1.04070854, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.7162399200173233, + "language_loss": 0.7452544, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76694012, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4184, + "time_per_iteration": 2.4367544651031494 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.02888608, + "balance_loss_mlp": 1.04825735, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.9130230292696613, + "language_loss": 0.82872695, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85055834, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4185, + "time_per_iteration": 2.4604692459106445 + }, + { + "auxiliary_loss_clip": 0.01047588, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.01820421, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7885291752286397, + "language_loss": 0.61534387, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63585937, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.29296875, + "step": 4186, + "time_per_iteration": 2.9629924297332764 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02320647, + "balance_loss_mlp": 1.04432559, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.1070381215060308, + "language_loss": 0.79260957, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81435084, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4187, + "time_per_iteration": 2.454988479614258 + }, + { + "auxiliary_loss_clip": 0.01136483, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.03235006, + "balance_loss_mlp": 1.04733062, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5254881034867085, + "language_loss": 0.79854965, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82040906, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4188, + "time_per_iteration": 2.4807493686676025 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_clip": 1.03022218, + "balance_loss_mlp": 1.04635882, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.727912733373243, + "language_loss": 0.74509478, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76691031, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4189, + "time_per_iteration": 2.4887545108795166 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.02914619, + "balance_loss_mlp": 1.04616165, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.957544272457229, + "language_loss": 0.84454727, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86630988, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4190, + "time_per_iteration": 2.4629735946655273 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.02633452, + "balance_loss_mlp": 1.04529381, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.9468541382775664, + "language_loss": 0.75593925, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77772641, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.88671875, + "step": 4191, + "time_per_iteration": 2.451493263244629 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.0101771, + "balance_loss_clip": 1.01577878, + "balance_loss_mlp": 1.01320672, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7165761170014687, + "language_loss": 0.57155997, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59216374, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29492188, + "step": 4192, + "time_per_iteration": 3.1455304622650146 + }, + { + "auxiliary_loss_clip": 0.01132992, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.02593958, + "balance_loss_mlp": 1.04640245, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.0419031963399434, + "language_loss": 0.76306844, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78481936, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4193, + "time_per_iteration": 2.46201491355896 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.03115189, + "balance_loss_mlp": 1.04506373, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.1192679618590007, + "language_loss": 0.84261906, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86446548, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4194, + "time_per_iteration": 2.4525146484375 + }, + { + "auxiliary_loss_clip": 0.01138489, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.03404951, + "balance_loss_mlp": 1.04636192, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.488794247862028, + "language_loss": 0.88176262, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90364158, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.921875, + "step": 4195, + "time_per_iteration": 2.507788896560669 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.02323329, + "balance_loss_mlp": 1.04540074, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.8940350432545787, + "language_loss": 0.85288155, + "learning_rate": 3.503717062883053e-06, + "loss": 0.87466824, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.90625, + "step": 4196, + "time_per_iteration": 2.4843344688415527 + }, + { + "auxiliary_loss_clip": 0.01135455, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02644312, + "balance_loss_mlp": 1.0454607, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6596186150335415, + "language_loss": 0.83368516, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85546911, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4197, + "time_per_iteration": 2.480834484100342 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.02840698, + "balance_loss_mlp": 1.04775643, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.7573342641631093, + "language_loss": 0.72406292, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74593097, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9140625, + "step": 4198, + "time_per_iteration": 2.6081368923187256 + }, + { + "auxiliary_loss_clip": 0.01139571, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.03469038, + "balance_loss_mlp": 1.0462662, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9511850390779815, + "language_loss": 0.76798427, + "learning_rate": 3.50294646148888e-06, + "loss": 0.7899096, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.93359375, + "step": 4199, + "time_per_iteration": 2.463322162628174 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.02334595, + "balance_loss_mlp": 1.04600453, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.6881838085079777, + "language_loss": 0.727651, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74941385, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 4200, + "time_per_iteration": 2.586298942565918 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.03206062, + "balance_loss_mlp": 1.04300654, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.7166145531144803, + "language_loss": 0.82271791, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84454548, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.91796875, + "step": 4201, + "time_per_iteration": 2.6430721282958984 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.02960861, + "balance_loss_mlp": 1.04680324, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8945534984036327, + "language_loss": 0.74844849, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77029681, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4202, + "time_per_iteration": 2.477376699447632 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.02545786, + "balance_loss_mlp": 1.04550529, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.8769942277842264, + "language_loss": 0.73058856, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75234556, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 4203, + "time_per_iteration": 2.4526968002319336 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.02403569, + "balance_loss_mlp": 1.04434335, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.7217444479200419, + "language_loss": 0.77377844, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79553127, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90234375, + "step": 4204, + "time_per_iteration": 2.540573835372925 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04443574, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 3.2226665017353655, + "language_loss": 0.72443974, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74631095, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4205, + "time_per_iteration": 2.405823230743408 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.02304697, + "balance_loss_mlp": 1.04673433, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4419344159614245, + "language_loss": 0.75674903, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77844942, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4206, + "time_per_iteration": 2.7117254734039307 + }, + { + "auxiliary_loss_clip": 0.01134608, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02502584, + "balance_loss_mlp": 1.04381466, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.8459801280493204, + "language_loss": 0.79013956, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81190026, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4207, + "time_per_iteration": 2.4338433742523193 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.02688169, + "balance_loss_mlp": 1.04521704, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5263501886522268, + "language_loss": 0.76010746, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78184819, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4208, + "time_per_iteration": 2.4712774753570557 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.02559781, + "balance_loss_mlp": 1.04407811, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.8494822470113228, + "language_loss": 0.6965062, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.71824062, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87109375, + "step": 4209, + "time_per_iteration": 2.4723262786865234 + }, + { + "auxiliary_loss_clip": 0.01046036, + "auxiliary_loss_mlp": 0.00999993, + "balance_loss_clip": 0.99819291, + "balance_loss_mlp": 1.01643014, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7581785291884388, + "language_loss": 0.55080217, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57126248, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.296875, + "step": 4210, + "time_per_iteration": 3.141958236694336 + }, + { + "auxiliary_loss_clip": 0.0113523, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.04541481, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 2.0581011511690606, + "language_loss": 0.8021341, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82383299, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4211, + "time_per_iteration": 2.4423909187316895 + }, + { + "auxiliary_loss_clip": 0.01128499, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02402079, + "balance_loss_mlp": 1.04284227, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.6375033978461933, + "language_loss": 0.78310406, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80478293, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4212, + "time_per_iteration": 2.535416841506958 + }, + { + "auxiliary_loss_clip": 0.01131331, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.04314673, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.0206536972721088, + "language_loss": 0.53393918, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55565375, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4213, + "time_per_iteration": 2.488844871520996 + }, + { + "auxiliary_loss_clip": 0.01132972, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02566385, + "balance_loss_mlp": 1.04508567, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.6682600080383816, + "language_loss": 0.65329081, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67504859, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4214, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01043385, + "auxiliary_loss_mlp": 0.0100812, + "balance_loss_clip": 1.00630808, + "balance_loss_mlp": 1.0142169, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8479929036578698, + "language_loss": 0.58049941, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60101438, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29296875, + "step": 4215, + "time_per_iteration": 2.824084997177124 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.02636075, + "balance_loss_mlp": 1.04583967, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7693463876532338, + "language_loss": 0.83949232, + "learning_rate": 3.498570039373066e-06, + "loss": 0.86126143, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.875, + "step": 4216, + "time_per_iteration": 2.650329828262329 + }, + { + "auxiliary_loss_clip": 0.01134428, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02504706, + "balance_loss_mlp": 1.04571652, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.7652170119003572, + "language_loss": 0.80028123, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82204342, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4217, + "time_per_iteration": 2.49381160736084 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.02422011, + "balance_loss_mlp": 1.04193234, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.1701414828965464, + "language_loss": 0.75014293, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.7718327, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87890625, + "step": 4218, + "time_per_iteration": 2.4794864654541016 + }, + { + "auxiliary_loss_clip": 0.01135591, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.02462721, + "balance_loss_mlp": 1.04470503, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.8718582993796022, + "language_loss": 0.74483025, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76660055, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4219, + "time_per_iteration": 5.428370952606201 + }, + { + "auxiliary_loss_clip": 0.01137942, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.04695058, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 2.1507448030921057, + "language_loss": 0.81194967, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83385527, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4220, + "time_per_iteration": 2.454045534133911 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.03007603, + "balance_loss_mlp": 1.04596126, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.058400170489012, + "language_loss": 0.70873475, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73056173, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4221, + "time_per_iteration": 2.4728429317474365 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.02309537, + "balance_loss_mlp": 1.0444454, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 2.3290205392002847, + "language_loss": 0.62039649, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64213717, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4222, + "time_per_iteration": 2.4465436935424805 + }, + { + "auxiliary_loss_clip": 0.01137839, + "auxiliary_loss_mlp": 0.01052178, + "balance_loss_clip": 1.0352385, + "balance_loss_mlp": 1.04635429, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.6514367228652884, + "language_loss": 0.74686599, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76876616, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4223, + "time_per_iteration": 2.449887275695801 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.02001095, + "balance_loss_mlp": 1.04763556, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.7274606282993847, + "language_loss": 0.79782087, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81952935, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4224, + "time_per_iteration": 2.4809348583221436 + }, + { + "auxiliary_loss_clip": 0.01129812, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.02668035, + "balance_loss_mlp": 1.04306865, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.7388314634599362, + "language_loss": 0.77813148, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79986417, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4225, + "time_per_iteration": 2.4813735485076904 + }, + { + "auxiliary_loss_clip": 0.01135622, + "auxiliary_loss_mlp": 0.01054065, + "balance_loss_clip": 1.03661263, + "balance_loss_mlp": 1.04603362, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6070040517314534, + "language_loss": 0.84763634, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86953318, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.89453125, + "step": 4226, + "time_per_iteration": 2.4583990573883057 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.0277667, + "balance_loss_mlp": 1.04317141, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 2.4872704745527168, + "language_loss": 0.70759654, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.72934765, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8671875, + "step": 4227, + "time_per_iteration": 2.532057762145996 + }, + { + "auxiliary_loss_clip": 0.01041509, + "auxiliary_loss_mlp": 0.01000975, + "balance_loss_clip": 0.9989962, + "balance_loss_mlp": 1.01186037, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9701035361715339, + "language_loss": 0.61865914, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63908398, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.296875, + "step": 4228, + "time_per_iteration": 2.9040682315826416 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.02026105, + "balance_loss_mlp": 1.04564357, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 4.885618231754604, + "language_loss": 0.86024547, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88198459, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 4229, + "time_per_iteration": 2.404157876968384 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.02696753, + "balance_loss_mlp": 1.0466435, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.8862111092995248, + "language_loss": 0.77280557, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79459918, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4230, + "time_per_iteration": 2.4956207275390625 + }, + { + "auxiliary_loss_clip": 0.01133757, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.02980483, + "balance_loss_mlp": 1.04598594, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.9381647251913205, + "language_loss": 0.75116754, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77297449, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4231, + "time_per_iteration": 2.4570302963256836 + }, + { + "auxiliary_loss_clip": 0.0113225, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.02253127, + "balance_loss_mlp": 1.04484463, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.3236339630790916, + "language_loss": 0.74055511, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76226532, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4232, + "time_per_iteration": 2.4537932872772217 + }, + { + "auxiliary_loss_clip": 0.01134838, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.04658151, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.8521853851823955, + "language_loss": 0.86557174, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88733703, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4233, + "time_per_iteration": 2.4943323135375977 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.02570057, + "balance_loss_mlp": 1.04215169, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.5280608213400515, + "language_loss": 0.74841732, + "learning_rate": 3.493918281539737e-06, + "loss": 0.7700814, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 4234, + "time_per_iteration": 2.541349172592163 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.02661681, + "balance_loss_mlp": 1.04286838, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.542232814469661, + "language_loss": 0.7489568, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77071816, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.90625, + "step": 4235, + "time_per_iteration": 2.5059099197387695 + }, + { + "auxiliary_loss_clip": 0.01141785, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02570069, + "balance_loss_mlp": 1.04655004, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0015253194085645, + "language_loss": 0.64487904, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6667403, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 4236, + "time_per_iteration": 2.512286424636841 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.01757693, + "balance_loss_mlp": 1.04509079, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5430935122242522, + "language_loss": 0.67046815, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69211423, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 4237, + "time_per_iteration": 2.455911636352539 + }, + { + "auxiliary_loss_clip": 0.01134325, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.02916634, + "balance_loss_mlp": 1.04509199, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9754127990153556, + "language_loss": 0.74863333, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77043563, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4238, + "time_per_iteration": 2.4770114421844482 + }, + { + "auxiliary_loss_clip": 0.01136693, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.02848125, + "balance_loss_mlp": 1.04734778, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8397193389954023, + "language_loss": 0.8033936, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82522523, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4239, + "time_per_iteration": 2.5087499618530273 + }, + { + "auxiliary_loss_clip": 0.01131893, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.02457762, + "balance_loss_mlp": 1.04512548, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.749971041952711, + "language_loss": 0.77208781, + "learning_rate": 3.492363614004407e-06, + "loss": 0.7938236, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4240, + "time_per_iteration": 2.4757072925567627 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.02463925, + "balance_loss_mlp": 1.04773092, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 2.0511352101670126, + "language_loss": 0.83254647, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85438156, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.93359375, + "step": 4241, + "time_per_iteration": 2.5062708854675293 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.02761221, + "balance_loss_mlp": 1.0463624, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6663950411566644, + "language_loss": 0.73410285, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75590432, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4242, + "time_per_iteration": 2.5570173263549805 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02249646, + "balance_loss_mlp": 1.04695976, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.4092613771466453, + "language_loss": 0.72371018, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74545956, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4243, + "time_per_iteration": 2.440492868423462 + }, + { + "auxiliary_loss_clip": 0.01136318, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02507675, + "balance_loss_mlp": 1.04668963, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 2.3937572910440847, + "language_loss": 0.81865323, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84043133, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4244, + "time_per_iteration": 2.4728784561157227 + }, + { + "auxiliary_loss_clip": 0.01044231, + "auxiliary_loss_mlp": 0.01002536, + "balance_loss_clip": 1.00084293, + "balance_loss_mlp": 1.01474202, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7400094393930867, + "language_loss": 0.5777986, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5982663, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.29492188, + "step": 4245, + "time_per_iteration": 3.155487537384033 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.03248656, + "balance_loss_mlp": 1.04526567, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 1.9776048921576397, + "language_loss": 0.65246034, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67430878, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90234375, + "step": 4246, + "time_per_iteration": 2.4889461994171143 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04366493, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.748925776992144, + "language_loss": 0.81467927, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83637214, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4247, + "time_per_iteration": 2.4680213928222656 + }, + { + "auxiliary_loss_clip": 0.0114026, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.02718902, + "balance_loss_mlp": 1.04570985, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.9702547035135165, + "language_loss": 0.83062297, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85248411, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9453125, + "step": 4248, + "time_per_iteration": 2.446810245513916 + }, + { + "auxiliary_loss_clip": 0.01136577, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.02793586, + "balance_loss_mlp": 1.04672599, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.21885342952208, + "language_loss": 0.84529531, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86711109, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4249, + "time_per_iteration": 2.4372382164001465 + }, + { + "auxiliary_loss_clip": 0.01044447, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.01503897, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7531523874953217, + "language_loss": 0.56312215, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58360648, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29492188, + "step": 4250, + "time_per_iteration": 3.047654628753662 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.02139914, + "balance_loss_mlp": 1.04434705, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.1374171101673243, + "language_loss": 0.80306417, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82478344, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4251, + "time_per_iteration": 2.4866387844085693 + }, + { + "auxiliary_loss_clip": 0.01042955, + "auxiliary_loss_mlp": 0.01004928, + "balance_loss_clip": 1.00307989, + "balance_loss_mlp": 1.01383376, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7958061962206047, + "language_loss": 0.66077995, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6812588, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.29296875, + "step": 4252, + "time_per_iteration": 3.117496967315674 + }, + { + "auxiliary_loss_clip": 0.011309, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.04373813, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.169743717969613, + "language_loss": 0.73382849, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75550812, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4253, + "time_per_iteration": 2.5709948539733887 + }, + { + "auxiliary_loss_clip": 0.01134729, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.02873516, + "balance_loss_mlp": 1.04698956, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9741012093631007, + "language_loss": 0.72927308, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75106484, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4254, + "time_per_iteration": 2.509932518005371 + }, + { + "auxiliary_loss_clip": 0.01133463, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.02636361, + "balance_loss_mlp": 1.04452896, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7290530974650873, + "language_loss": 0.80863065, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.8304013, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4255, + "time_per_iteration": 2.4473092555999756 + }, + { + "auxiliary_loss_clip": 0.01133499, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.02681875, + "balance_loss_mlp": 1.04673088, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.512169748685899, + "language_loss": 0.85572308, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87749302, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4256, + "time_per_iteration": 2.500788927078247 + }, + { + "auxiliary_loss_clip": 0.01136428, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.04482555, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 4.026866255210063, + "language_loss": 0.74821836, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77006626, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4257, + "time_per_iteration": 2.4511358737945557 + }, + { + "auxiliary_loss_clip": 0.01040508, + "auxiliary_loss_mlp": 0.01009541, + "balance_loss_clip": 1.00763345, + "balance_loss_mlp": 1.01154876, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8061088541165783, + "language_loss": 0.65227318, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67277366, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.2890625, + "step": 4258, + "time_per_iteration": 2.9953789710998535 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.04548264, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.622828615893818, + "language_loss": 0.7647177, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78641111, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.859375, + "step": 4259, + "time_per_iteration": 2.5079360008239746 + }, + { + "auxiliary_loss_clip": 0.01038142, + "auxiliary_loss_mlp": 0.01004188, + "balance_loss_clip": 1.00237584, + "balance_loss_mlp": 1.0093925, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7946947905759578, + "language_loss": 0.58501768, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60544097, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.28710938, + "step": 4260, + "time_per_iteration": 4.636982202529907 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.02759969, + "balance_loss_mlp": 1.04300261, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8728817118968701, + "language_loss": 0.76659095, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.7883479, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4261, + "time_per_iteration": 3.974956750869751 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.02280843, + "balance_loss_mlp": 1.04460573, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.6516780840688012, + "language_loss": 0.8323037, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85399115, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4262, + "time_per_iteration": 2.5251948833465576 + }, + { + "auxiliary_loss_clip": 0.01136997, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.03123951, + "balance_loss_mlp": 1.04404712, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.7380780768968016, + "language_loss": 0.74153852, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76339698, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 4263, + "time_per_iteration": 2.42657208442688 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01051758, + "balance_loss_clip": 1.03533101, + "balance_loss_mlp": 1.04720163, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.7828084139599185, + "language_loss": 0.82793939, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84979165, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4264, + "time_per_iteration": 2.534097194671631 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.02506804, + "balance_loss_mlp": 1.04660988, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.7080317762970965, + "language_loss": 0.7443161, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76608008, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 4265, + "time_per_iteration": 2.51088809967041 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01679027, + "balance_loss_mlp": 1.0425024, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.644190377842657, + "language_loss": 0.8153013, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83692515, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4266, + "time_per_iteration": 2.4706335067749023 + }, + { + "auxiliary_loss_clip": 0.01128054, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.01909137, + "balance_loss_mlp": 1.04252076, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6333370834261398, + "language_loss": 0.79287028, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81450343, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4267, + "time_per_iteration": 2.4819366931915283 + }, + { + "auxiliary_loss_clip": 0.01127366, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.02480555, + "balance_loss_mlp": 1.04406714, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.7559000109968124, + "language_loss": 0.78708017, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.80876482, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4268, + "time_per_iteration": 2.4778378009796143 + }, + { + "auxiliary_loss_clip": 0.0113239, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.02198434, + "balance_loss_mlp": 1.04507172, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 2.2514359992660204, + "language_loss": 0.68120348, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70290613, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4269, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01134604, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.0249877, + "balance_loss_mlp": 1.04593778, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 4.018282830570473, + "language_loss": 0.78496158, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80672824, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4270, + "time_per_iteration": 2.418912172317505 + }, + { + "auxiliary_loss_clip": 0.01139603, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.04711556, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.0502449379686256, + "language_loss": 0.68136632, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70314038, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4271, + "time_per_iteration": 2.5410749912261963 + }, + { + "auxiliary_loss_clip": 0.01137314, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.0325973, + "balance_loss_mlp": 1.04592848, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 4.518410893879739, + "language_loss": 0.8741951, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8960675, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4272, + "time_per_iteration": 2.5022568702697754 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02734506, + "balance_loss_mlp": 1.04770613, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.953603621991432, + "language_loss": 0.81442308, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83624303, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4273, + "time_per_iteration": 2.453834295272827 + }, + { + "auxiliary_loss_clip": 0.01131691, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.02492929, + "balance_loss_mlp": 1.04724693, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.682161023261006, + "language_loss": 0.77215779, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79389334, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4274, + "time_per_iteration": 2.486238956451416 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02061856, + "balance_loss_mlp": 1.04450369, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.8548211040661395, + "language_loss": 0.8401829, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86185247, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4275, + "time_per_iteration": 2.5145719051361084 + }, + { + "auxiliary_loss_clip": 0.01133209, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.02078438, + "balance_loss_mlp": 1.04492021, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 3.0116628321367678, + "language_loss": 0.78124094, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80294812, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4276, + "time_per_iteration": 2.533989906311035 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.02723646, + "balance_loss_mlp": 1.04575086, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.750550841347414, + "language_loss": 0.79439288, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81616199, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4277, + "time_per_iteration": 2.5131442546844482 + }, + { + "auxiliary_loss_clip": 0.01134263, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.01981688, + "balance_loss_mlp": 1.04671657, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.0431628844466543, + "language_loss": 0.78804862, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80975372, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4278, + "time_per_iteration": 2.4813432693481445 + }, + { + "auxiliary_loss_clip": 0.01137794, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.02299643, + "balance_loss_mlp": 1.04657972, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.020871128069371, + "language_loss": 0.74624676, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76802039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4279, + "time_per_iteration": 2.4989213943481445 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.03472984, + "balance_loss_mlp": 1.04528475, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.295268067844067, + "language_loss": 0.85406947, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87595296, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4280, + "time_per_iteration": 2.479163408279419 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.02362585, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.2211313624852447, + "language_loss": 0.78780186, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80957377, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4281, + "time_per_iteration": 2.463003158569336 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02277303, + "balance_loss_mlp": 1.0472312, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.9444978312753, + "language_loss": 0.87356091, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89530122, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4282, + "time_per_iteration": 2.5049889087677 + }, + { + "auxiliary_loss_clip": 0.01137104, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04648709, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5754049466604292, + "language_loss": 0.70172656, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72352946, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 4283, + "time_per_iteration": 2.520315408706665 + }, + { + "auxiliary_loss_clip": 0.01132284, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.02698922, + "balance_loss_mlp": 1.04772711, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 2.712350413324169, + "language_loss": 0.80323613, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82498109, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 4284, + "time_per_iteration": 2.483292579650879 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.02538466, + "balance_loss_mlp": 1.04674387, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.1742402973432893, + "language_loss": 0.70485193, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72659695, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4285, + "time_per_iteration": 2.564211130142212 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.02282071, + "balance_loss_mlp": 1.04953337, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.328286971317511, + "language_loss": 0.58380014, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60555518, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87890625, + "step": 4286, + "time_per_iteration": 2.4425430297851562 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02702272, + "balance_loss_mlp": 1.04858327, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.6452331987585218, + "language_loss": 0.64191288, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66374773, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 4287, + "time_per_iteration": 2.470015287399292 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04739881, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.0830358142366148, + "language_loss": 0.72029591, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74209672, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4288, + "time_per_iteration": 2.4983417987823486 + }, + { + "auxiliary_loss_clip": 0.01135736, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.02263355, + "balance_loss_mlp": 1.04882312, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.9870049696680936, + "language_loss": 0.76965904, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79140055, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4289, + "time_per_iteration": 2.4997475147247314 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02311635, + "balance_loss_mlp": 1.04562807, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.946897603323323, + "language_loss": 0.85123539, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87298238, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4290, + "time_per_iteration": 2.454871416091919 + }, + { + "auxiliary_loss_clip": 0.01140117, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.03159952, + "balance_loss_mlp": 1.04959655, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.195715426849753, + "language_loss": 0.72170424, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74361074, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4291, + "time_per_iteration": 2.4512693881988525 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02436781, + "balance_loss_mlp": 1.05002344, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4805881311796423, + "language_loss": 0.80718195, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82901633, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4292, + "time_per_iteration": 2.469034433364868 + }, + { + "auxiliary_loss_clip": 0.01141659, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02767503, + "balance_loss_mlp": 1.05171072, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 7.501455001056755, + "language_loss": 0.67646754, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69833219, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4293, + "time_per_iteration": 2.5785787105560303 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.02660704, + "balance_loss_mlp": 1.04503, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.9136357435420137, + "language_loss": 0.75409257, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77581787, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4294, + "time_per_iteration": 2.5044636726379395 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.02974749, + "balance_loss_mlp": 1.04808116, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.168244565891273, + "language_loss": 0.81049722, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83233249, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4295, + "time_per_iteration": 2.5875558853149414 + }, + { + "auxiliary_loss_clip": 0.01140472, + "auxiliary_loss_mlp": 0.01046123, + "balance_loss_clip": 1.02797985, + "balance_loss_mlp": 1.04796624, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.1973562505628026, + "language_loss": 0.72515166, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74701762, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.92578125, + "step": 4296, + "time_per_iteration": 2.535693407058716 + }, + { + "auxiliary_loss_clip": 0.01138613, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.02322531, + "balance_loss_mlp": 1.04918242, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.8330269406357795, + "language_loss": 0.86766148, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88944662, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4297, + "time_per_iteration": 2.5001306533813477 + }, + { + "auxiliary_loss_clip": 0.01137068, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.03208232, + "balance_loss_mlp": 1.04755223, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 2.2622150737063955, + "language_loss": 0.84706259, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86891592, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4298, + "time_per_iteration": 2.489917278289795 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02069676, + "balance_loss_mlp": 1.04739285, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.0676974538336266, + "language_loss": 0.83596241, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85770899, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4299, + "time_per_iteration": 2.4274845123291016 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.01985788, + "balance_loss_mlp": 1.04795814, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.477231855960524, + "language_loss": 0.82685435, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84856081, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4300, + "time_per_iteration": 2.4730846881866455 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.02492332, + "balance_loss_mlp": 1.04620934, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.2046546957653077, + "language_loss": 0.67186987, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69365752, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 4301, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.02191997, + "balance_loss_mlp": 1.04805672, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.459016606739088, + "language_loss": 0.80929118, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83110034, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 4302, + "time_per_iteration": 5.438407897949219 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.02535129, + "balance_loss_mlp": 1.04789591, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 2.9925401825996545, + "language_loss": 0.92246419, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94426608, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4303, + "time_per_iteration": 2.514573574066162 + }, + { + "auxiliary_loss_clip": 0.01139372, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.020046, + "balance_loss_mlp": 1.04932761, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8070234866344623, + "language_loss": 0.67034984, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69210964, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4304, + "time_per_iteration": 2.540682315826416 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03355694, + "balance_loss_mlp": 1.04595923, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.73594521825367, + "language_loss": 0.72829735, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75018799, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4305, + "time_per_iteration": 2.580801248550415 + }, + { + "auxiliary_loss_clip": 0.01138565, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 1.04731607, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.196623082948333, + "language_loss": 0.75595653, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77775478, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4306, + "time_per_iteration": 2.44267201423645 + }, + { + "auxiliary_loss_clip": 0.01045399, + "auxiliary_loss_mlp": 0.01003539, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.01567113, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8506593293873899, + "language_loss": 0.5717386, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59222794, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.296875, + "step": 4307, + "time_per_iteration": 3.0457189083099365 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.02389181, + "balance_loss_mlp": 1.04729199, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.7695447826328226, + "language_loss": 0.71543598, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73719311, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4308, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.0113812, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.02439809, + "balance_loss_mlp": 1.04625905, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 2.097007373458932, + "language_loss": 0.84195936, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86375141, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4309, + "time_per_iteration": 2.458937883377075 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02609062, + "balance_loss_mlp": 1.04893243, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.520786669442297, + "language_loss": 0.8451637, + "learning_rate": 3.474075855228966e-06, + "loss": 0.8669641, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4310, + "time_per_iteration": 2.453946828842163 + }, + { + "auxiliary_loss_clip": 0.0113925, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.02706194, + "balance_loss_mlp": 1.04705715, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.3904067628525305, + "language_loss": 0.77478111, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79660702, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 4311, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.0248189, + "balance_loss_mlp": 1.04691362, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 3.1447136536803852, + "language_loss": 0.72220832, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74400491, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 4312, + "time_per_iteration": 2.5275332927703857 + }, + { + "auxiliary_loss_clip": 0.01134993, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.02434921, + "balance_loss_mlp": 1.04480851, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 2.2264539824076683, + "language_loss": 0.69908661, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72084355, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4313, + "time_per_iteration": 2.479011058807373 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02800131, + "balance_loss_mlp": 1.04467726, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.7186396349483555, + "language_loss": 0.80486274, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82663202, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4314, + "time_per_iteration": 2.443934679031372 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.03030992, + "balance_loss_mlp": 1.04506671, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.0498851814527863, + "language_loss": 0.6687156, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69057429, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 4315, + "time_per_iteration": 2.5375983715057373 + }, + { + "auxiliary_loss_clip": 0.01132586, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.04426146, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5303062780919283, + "language_loss": 0.7911852, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81291974, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4316, + "time_per_iteration": 2.448997735977173 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.02333546, + "balance_loss_mlp": 1.0446136, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.687308210321376, + "language_loss": 0.77601087, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79777247, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4317, + "time_per_iteration": 2.5545339584350586 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.03160882, + "balance_loss_mlp": 1.04599953, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.5535432929686883, + "language_loss": 0.77773315, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79958701, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4318, + "time_per_iteration": 2.450573682785034 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02717471, + "balance_loss_mlp": 1.04450393, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.801084946435003, + "language_loss": 0.76197278, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78376144, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4319, + "time_per_iteration": 2.5243709087371826 + }, + { + "auxiliary_loss_clip": 0.01131874, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04500592, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8511829127720039, + "language_loss": 0.76338619, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78507876, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4320, + "time_per_iteration": 2.4792070388793945 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.02205038, + "balance_loss_mlp": 1.04641151, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.7592602092397844, + "language_loss": 0.71143925, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73317981, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4321, + "time_per_iteration": 2.5381112098693848 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.02813125, + "balance_loss_mlp": 1.04517424, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.037757848326605, + "language_loss": 0.74483943, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76666641, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4322, + "time_per_iteration": 2.4379777908325195 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.03059244, + "balance_loss_mlp": 1.04368353, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.9467125010752846, + "language_loss": 0.73674595, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75856531, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4323, + "time_per_iteration": 2.517399549484253 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.0263952, + "balance_loss_mlp": 1.04524922, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.197207179409235, + "language_loss": 0.6710211, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69287789, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 4324, + "time_per_iteration": 2.478419303894043 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.02839708, + "balance_loss_mlp": 1.04456055, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.3342631450552838, + "language_loss": 0.70809424, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72985667, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8828125, + "step": 4325, + "time_per_iteration": 2.5444648265838623 + }, + { + "auxiliary_loss_clip": 0.01133012, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02139568, + "balance_loss_mlp": 1.04295206, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 2.476658211689484, + "language_loss": 0.73041123, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7521174, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4326, + "time_per_iteration": 2.5281147956848145 + }, + { + "auxiliary_loss_clip": 0.01127256, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.02424729, + "balance_loss_mlp": 1.04237306, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.820673081097861, + "language_loss": 0.8661378, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88779688, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 4327, + "time_per_iteration": 2.4929087162017822 + }, + { + "auxiliary_loss_clip": 0.01138344, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.03121042, + "balance_loss_mlp": 1.04679346, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 2.002075266566112, + "language_loss": 0.80111909, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82299662, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 4328, + "time_per_iteration": 2.451131582260132 + }, + { + "auxiliary_loss_clip": 0.0112995, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04219353, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.514483384647774, + "language_loss": 0.87428784, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89598739, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4329, + "time_per_iteration": 2.522368907928467 + }, + { + "auxiliary_loss_clip": 0.01132983, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.04585731, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.096665977126354, + "language_loss": 0.77746803, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79917884, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4330, + "time_per_iteration": 2.4771482944488525 + }, + { + "auxiliary_loss_clip": 0.01134796, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.04525268, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 2.4595446714184654, + "language_loss": 0.75248575, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77430975, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4331, + "time_per_iteration": 2.5284199714660645 + }, + { + "auxiliary_loss_clip": 0.01137533, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02158558, + "balance_loss_mlp": 1.05026567, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3491085383994963, + "language_loss": 0.69003588, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71178281, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4332, + "time_per_iteration": 2.476125478744507 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.02453184, + "balance_loss_mlp": 1.04542089, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.3270567941112854, + "language_loss": 0.79674375, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81851673, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91015625, + "step": 4333, + "time_per_iteration": 2.5234756469726562 + }, + { + "auxiliary_loss_clip": 0.01129676, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.02791548, + "balance_loss_mlp": 1.04336357, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.7608965931322442, + "language_loss": 0.80725265, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82898307, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4334, + "time_per_iteration": 2.4361026287078857 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02128482, + "balance_loss_mlp": 1.04452491, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.8337144126432974, + "language_loss": 0.80039275, + "learning_rate": 3.46747795800024e-06, + "loss": 0.822101, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4335, + "time_per_iteration": 2.5246174335479736 + }, + { + "auxiliary_loss_clip": 0.01043695, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.02024579, + "balance_loss_mlp": 1.01431763, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.849908687169067, + "language_loss": 0.60851145, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62916911, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.29296875, + "step": 4336, + "time_per_iteration": 3.0349080562591553 + }, + { + "auxiliary_loss_clip": 0.01136323, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_clip": 1.03172541, + "balance_loss_mlp": 1.04599738, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 6.860825703537795, + "language_loss": 0.77407634, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79591858, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 4337, + "time_per_iteration": 2.4549763202667236 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02755404, + "balance_loss_mlp": 1.04531193, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1553767319060646, + "language_loss": 0.74116468, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76296723, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4338, + "time_per_iteration": 2.4109654426574707 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.02495456, + "balance_loss_mlp": 1.0451895, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.414973208379154, + "language_loss": 0.80645537, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82825273, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 4339, + "time_per_iteration": 2.4671595096588135 + }, + { + "auxiliary_loss_clip": 0.01133141, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.02287948, + "balance_loss_mlp": 1.04559159, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5844023841754464, + "language_loss": 0.76694596, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78865802, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4340, + "time_per_iteration": 2.4803388118743896 + }, + { + "auxiliary_loss_clip": 0.01137352, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02185678, + "balance_loss_mlp": 1.04666209, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.5290989424491332, + "language_loss": 0.82436979, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84612167, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90625, + "step": 4341, + "time_per_iteration": 2.5263681411743164 + }, + { + "auxiliary_loss_clip": 0.01134552, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.04563117, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.4125290221035773, + "language_loss": 0.76542389, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78716314, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4342, + "time_per_iteration": 2.5043585300445557 + }, + { + "auxiliary_loss_clip": 0.01132446, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.01570523, + "balance_loss_mlp": 1.04324019, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8018778201456855, + "language_loss": 0.66747689, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68912935, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 4343, + "time_per_iteration": 2.6470234394073486 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02467799, + "balance_loss_mlp": 1.04494977, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0339901471708646, + "language_loss": 0.73817015, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75994843, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4344, + "time_per_iteration": 5.431513071060181 + }, + { + "auxiliary_loss_clip": 0.0113578, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.02257776, + "balance_loss_mlp": 1.04692459, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 3.7636245605224072, + "language_loss": 0.86394477, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88568532, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 4345, + "time_per_iteration": 2.4908552169799805 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02422452, + "balance_loss_mlp": 1.04427588, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.88977116996907, + "language_loss": 0.7612443, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78293997, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.85546875, + "step": 4346, + "time_per_iteration": 2.4966983795166016 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02547407, + "balance_loss_mlp": 1.04483962, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5621162347417301, + "language_loss": 0.75868237, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78042835, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4347, + "time_per_iteration": 2.5392181873321533 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04549503, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4125954345922265, + "language_loss": 0.73354399, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75522006, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4348, + "time_per_iteration": 2.5206878185272217 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.02286005, + "balance_loss_mlp": 1.04503882, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.8182616406273437, + "language_loss": 0.91063923, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93238091, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4349, + "time_per_iteration": 2.526134967803955 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.02663279, + "balance_loss_mlp": 1.0461632, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7312169360414529, + "language_loss": 0.79879099, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82054067, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4350, + "time_per_iteration": 2.4420506954193115 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.02388072, + "balance_loss_mlp": 1.04430401, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8647374515536046, + "language_loss": 0.62139511, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64308536, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4351, + "time_per_iteration": 2.4613640308380127 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02241123, + "balance_loss_mlp": 1.04469466, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.09308554357217, + "language_loss": 0.83596927, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85769767, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4352, + "time_per_iteration": 2.4712979793548584 + }, + { + "auxiliary_loss_clip": 0.01045226, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 0.9986006, + "balance_loss_mlp": 1.01526213, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8010954727993301, + "language_loss": 0.70645392, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72690976, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.29882812, + "step": 4353, + "time_per_iteration": 3.026418447494507 + }, + { + "auxiliary_loss_clip": 0.01132608, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.03045464, + "balance_loss_mlp": 1.04494369, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.7700850953213416, + "language_loss": 0.77393121, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79573292, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4354, + "time_per_iteration": 2.535482883453369 + }, + { + "auxiliary_loss_clip": 0.01138552, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.02826262, + "balance_loss_mlp": 1.04513574, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.1625978203859826, + "language_loss": 0.68280292, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70463413, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 4355, + "time_per_iteration": 2.5276527404785156 + }, + { + "auxiliary_loss_clip": 0.01130838, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.02207148, + "balance_loss_mlp": 1.04375613, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9702640724114775, + "language_loss": 0.67509294, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69679523, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4356, + "time_per_iteration": 2.454436779022217 + }, + { + "auxiliary_loss_clip": 0.01043638, + "auxiliary_loss_mlp": 0.01003266, + "balance_loss_clip": 1.00139415, + "balance_loss_mlp": 1.01376009, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6781381277043278, + "language_loss": 0.53156137, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55203032, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.29882812, + "step": 4357, + "time_per_iteration": 2.99239444732666 + }, + { + "auxiliary_loss_clip": 0.01138081, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02955151, + "balance_loss_mlp": 1.04608119, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.843205511563007, + "language_loss": 0.84329486, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86513096, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.921875, + "step": 4358, + "time_per_iteration": 2.511441707611084 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02476251, + "balance_loss_mlp": 1.0450834, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.1805365254718367, + "language_loss": 0.67303276, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69484085, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4359, + "time_per_iteration": 2.5318756103515625 + }, + { + "auxiliary_loss_clip": 0.0113089, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02725601, + "balance_loss_mlp": 1.04242957, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.947910834650985, + "language_loss": 0.78673261, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80846429, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4360, + "time_per_iteration": 2.4551331996917725 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04250073, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.9921513845886445, + "language_loss": 0.68169516, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70338809, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4361, + "time_per_iteration": 2.57106351852417 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.04022598, + "balance_loss_mlp": 1.04400647, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.9312179198305752, + "language_loss": 0.84310883, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86503732, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4362, + "time_per_iteration": 2.430020570755005 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.02282345, + "balance_loss_mlp": 1.04637551, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.792780117353334, + "language_loss": 0.65294504, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67468411, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4363, + "time_per_iteration": 2.546393632888794 + }, + { + "auxiliary_loss_clip": 0.01042076, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.00252998, + "balance_loss_mlp": 1.0123173, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8867533167936222, + "language_loss": 0.61098528, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63144922, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.296875, + "step": 4364, + "time_per_iteration": 3.150812864303589 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.03358722, + "balance_loss_mlp": 1.0468297, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.424942653514092, + "language_loss": 0.71549827, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73739558, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4365, + "time_per_iteration": 2.493540048599243 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01045115, + "balance_loss_clip": 1.02917075, + "balance_loss_mlp": 1.04654169, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.8316261966241354, + "language_loss": 0.76925993, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79106045, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4366, + "time_per_iteration": 2.536853313446045 + }, + { + "auxiliary_loss_clip": 0.01135829, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.02684951, + "balance_loss_mlp": 1.04666197, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 2.2091260788228975, + "language_loss": 0.75838757, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78017008, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.890625, + "step": 4367, + "time_per_iteration": 2.4576163291931152 + }, + { + "auxiliary_loss_clip": 0.01131307, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.02454233, + "balance_loss_mlp": 1.04452682, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.1913456464974392, + "language_loss": 0.69633925, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71805596, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4368, + "time_per_iteration": 2.4301586151123047 + }, + { + "auxiliary_loss_clip": 0.01130278, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.02970243, + "balance_loss_mlp": 1.04319167, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.7035150195415922, + "language_loss": 0.78589904, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80766863, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8671875, + "step": 4369, + "time_per_iteration": 2.489316701889038 + }, + { + "auxiliary_loss_clip": 0.01132105, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.02653408, + "balance_loss_mlp": 1.04431546, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 2.0413446884893047, + "language_loss": 0.83486217, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85661036, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4370, + "time_per_iteration": 2.4422430992126465 + }, + { + "auxiliary_loss_clip": 0.01136913, + "auxiliary_loss_mlp": 0.01050187, + "balance_loss_clip": 1.03060055, + "balance_loss_mlp": 1.04530215, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 2.3340239620956287, + "language_loss": 0.70963454, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73150551, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9140625, + "step": 4371, + "time_per_iteration": 2.5099778175354004 + }, + { + "auxiliary_loss_clip": 0.01043374, + "auxiliary_loss_mlp": 0.00999769, + "balance_loss_clip": 0.99784929, + "balance_loss_mlp": 1.01338005, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7657034729714577, + "language_loss": 0.56477904, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58521044, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.30078125, + "step": 4372, + "time_per_iteration": 3.244558572769165 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.02283084, + "balance_loss_mlp": 1.04335582, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.7597219251079876, + "language_loss": 0.77415234, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79583991, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4373, + "time_per_iteration": 2.517784833908081 + }, + { + "auxiliary_loss_clip": 0.01131814, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.04454422, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 4.0873872332994905, + "language_loss": 0.71538949, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73712265, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4374, + "time_per_iteration": 2.442124605178833 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_clip": 1.02435732, + "balance_loss_mlp": 1.0458709, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 2.271567992891854, + "language_loss": 0.80945283, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83121061, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4375, + "time_per_iteration": 2.4889678955078125 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.0265336, + "balance_loss_mlp": 1.04366982, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.3689389683703, + "language_loss": 0.65721256, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.67893362, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4376, + "time_per_iteration": 2.563701629638672 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.0104592, + "balance_loss_clip": 1.02940989, + "balance_loss_mlp": 1.04445267, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.8646607453842572, + "language_loss": 0.69517326, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71697748, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4377, + "time_per_iteration": 2.486117124557495 + }, + { + "auxiliary_loss_clip": 0.01134243, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.02786613, + "balance_loss_mlp": 1.04500914, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.711844873276418, + "language_loss": 0.7866202, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.80840576, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4378, + "time_per_iteration": 2.7608227729797363 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.03000844, + "balance_loss_mlp": 1.04554546, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.6216377344963004, + "language_loss": 0.76320505, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78498781, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4379, + "time_per_iteration": 2.4329168796539307 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.04633284, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.3003567904549156, + "language_loss": 0.78237861, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.8041752, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.91796875, + "step": 4380, + "time_per_iteration": 2.5423548221588135 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02419698, + "balance_loss_mlp": 1.0444113, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.288842357619654, + "language_loss": 0.63811409, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65987766, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4381, + "time_per_iteration": 2.5096213817596436 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.02148831, + "balance_loss_mlp": 1.04359913, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8729093449566216, + "language_loss": 0.82822418, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84991652, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4382, + "time_per_iteration": 2.4691555500030518 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.02902842, + "balance_loss_mlp": 1.04550982, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.126733729537993, + "language_loss": 0.69686437, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71871686, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 4383, + "time_per_iteration": 2.5923891067504883 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.02732468, + "balance_loss_mlp": 1.04591441, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.929045699346076, + "language_loss": 0.69191134, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71369672, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 4384, + "time_per_iteration": 2.5067081451416016 + }, + { + "auxiliary_loss_clip": 0.01132133, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.02319217, + "balance_loss_mlp": 1.04400492, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.1647401570854075, + "language_loss": 0.6994158, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72113448, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4385, + "time_per_iteration": 4.062510251998901 + }, + { + "auxiliary_loss_clip": 0.01138578, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02551472, + "balance_loss_mlp": 1.04978371, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 2.0926426044309543, + "language_loss": 0.85188037, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87369245, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4386, + "time_per_iteration": 3.9604547023773193 + }, + { + "auxiliary_loss_clip": 0.0113699, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02487707, + "balance_loss_mlp": 1.04755282, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.2248904155103637, + "language_loss": 0.77169371, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79347688, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4387, + "time_per_iteration": 2.472367286682129 + }, + { + "auxiliary_loss_clip": 0.01137279, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0305903, + "balance_loss_mlp": 1.04989982, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 3.996041212149396, + "language_loss": 0.76269597, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78453362, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4388, + "time_per_iteration": 2.4858386516571045 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01924086, + "balance_loss_mlp": 1.04387724, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.9510825560869567, + "language_loss": 0.86210662, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88379163, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4389, + "time_per_iteration": 2.508162260055542 + }, + { + "auxiliary_loss_clip": 0.0104392, + "auxiliary_loss_mlp": 0.01009323, + "balance_loss_clip": 1.00736833, + "balance_loss_mlp": 1.01341343, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8096176904924934, + "language_loss": 0.60333931, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6238718, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3046875, + "step": 4390, + "time_per_iteration": 3.0593924522399902 + }, + { + "auxiliary_loss_clip": 0.01135834, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.02213633, + "balance_loss_mlp": 1.04522729, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.7836890720002585, + "language_loss": 0.77702433, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79876828, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4391, + "time_per_iteration": 2.5331051349639893 + }, + { + "auxiliary_loss_clip": 0.0104332, + "auxiliary_loss_mlp": 0.01003932, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.01322889, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9020745061185262, + "language_loss": 0.58752227, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60799479, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.30078125, + "step": 4392, + "time_per_iteration": 3.047438144683838 + }, + { + "auxiliary_loss_clip": 0.01140884, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.03039694, + "balance_loss_mlp": 1.04925656, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 2.5811541881681697, + "language_loss": 0.68459845, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70647496, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 4393, + "time_per_iteration": 2.5537288188934326 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.04662204, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.8702197697463565, + "language_loss": 0.83116519, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85297221, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.93359375, + "step": 4394, + "time_per_iteration": 2.421211004257202 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.02357125, + "balance_loss_mlp": 1.04951847, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.049654769643576, + "language_loss": 0.70211649, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72397399, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9375, + "step": 4395, + "time_per_iteration": 2.522111654281616 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.04784906, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.822626622734132, + "language_loss": 0.86866504, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89038229, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4396, + "time_per_iteration": 2.4450392723083496 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.02226114, + "balance_loss_mlp": 1.01312816, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7917805441344085, + "language_loss": 0.54999918, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57066846, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4397, + "time_per_iteration": 2.8438708782196045 + }, + { + "auxiliary_loss_clip": 0.01134821, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03567195, + "balance_loss_mlp": 1.04701614, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0493441687219724, + "language_loss": 0.77840483, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80027676, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4398, + "time_per_iteration": 2.562499523162842 + }, + { + "auxiliary_loss_clip": 0.01141073, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.02262306, + "balance_loss_mlp": 1.05005002, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 2.041566803030235, + "language_loss": 0.67037976, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69219166, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4399, + "time_per_iteration": 2.487778663635254 + }, + { + "auxiliary_loss_clip": 0.01128661, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02288818, + "balance_loss_mlp": 1.04565811, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 2.1160884119586303, + "language_loss": 0.86152196, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88318777, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4400, + "time_per_iteration": 2.4837841987609863 + }, + { + "auxiliary_loss_clip": 0.01138875, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.02188635, + "balance_loss_mlp": 1.04813862, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.751022626956878, + "language_loss": 0.75779396, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77957898, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4401, + "time_per_iteration": 2.548297166824341 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.03236771, + "balance_loss_mlp": 1.04606974, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.9215434150559794, + "language_loss": 0.88267732, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90456831, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4402, + "time_per_iteration": 2.4422647953033447 + }, + { + "auxiliary_loss_clip": 0.01135603, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.03151679, + "balance_loss_mlp": 1.04594266, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8196807161845878, + "language_loss": 0.78123331, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80306977, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4403, + "time_per_iteration": 2.587623357772827 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.04440784, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9946669841411302, + "language_loss": 0.87767446, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.89943182, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 4404, + "time_per_iteration": 2.492913246154785 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.04683399, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7395093434050468, + "language_loss": 0.7593658, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78111804, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 4405, + "time_per_iteration": 2.508970260620117 + }, + { + "auxiliary_loss_clip": 0.01138042, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02166891, + "balance_loss_mlp": 1.04870844, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9681610481113616, + "language_loss": 0.69979274, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72156149, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4406, + "time_per_iteration": 2.4548041820526123 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.03255999, + "balance_loss_mlp": 1.04781294, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7455123192469384, + "language_loss": 0.83764267, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85946929, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4407, + "time_per_iteration": 2.5359292030334473 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01566172, + "balance_loss_mlp": 1.04678226, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.7942044569518307, + "language_loss": 0.76068008, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78235412, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4408, + "time_per_iteration": 2.6124041080474854 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.04918611, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8724720588087471, + "language_loss": 0.70920485, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73091388, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4409, + "time_per_iteration": 2.6539366245269775 + }, + { + "auxiliary_loss_clip": 0.01136441, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.04666233, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.7884535623295956, + "language_loss": 0.73085511, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75258988, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 4410, + "time_per_iteration": 2.545083999633789 + }, + { + "auxiliary_loss_clip": 0.01139704, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.03218508, + "balance_loss_mlp": 1.04741001, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.9280641145018393, + "language_loss": 0.73272175, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75461018, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4411, + "time_per_iteration": 2.4818248748779297 + }, + { + "auxiliary_loss_clip": 0.01137094, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.04815316, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.073752901007566, + "language_loss": 0.82294202, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84474051, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.88671875, + "step": 4412, + "time_per_iteration": 2.56634521484375 + }, + { + "auxiliary_loss_clip": 0.01134293, + "auxiliary_loss_mlp": 0.01047936, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.04541004, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.721718037322793, + "language_loss": 0.74245501, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76427728, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4413, + "time_per_iteration": 2.4994029998779297 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.01659799, + "balance_loss_mlp": 1.0160358, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8825812455559224, + "language_loss": 0.56986731, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59051728, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.30664062, + "step": 4414, + "time_per_iteration": 2.9884986877441406 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02605712, + "balance_loss_mlp": 1.04307461, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8727128035200367, + "language_loss": 0.74535894, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76705366, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4415, + "time_per_iteration": 2.5531253814697266 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.04656732, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.3504707987247917, + "language_loss": 0.86662048, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88844568, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4416, + "time_per_iteration": 2.4751384258270264 + }, + { + "auxiliary_loss_clip": 0.0113975, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02439594, + "balance_loss_mlp": 1.0492208, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6281293305848954, + "language_loss": 0.76152384, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78334266, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4417, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01135215, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.02167702, + "balance_loss_mlp": 1.04778051, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.7397383944852411, + "language_loss": 0.79984045, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82159042, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4418, + "time_per_iteration": 2.539454460144043 + }, + { + "auxiliary_loss_clip": 0.01138688, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.02983057, + "balance_loss_mlp": 1.04861307, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.7780034581995965, + "language_loss": 0.67397833, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69583082, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 4419, + "time_per_iteration": 2.461444616317749 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_clip": 1.02739358, + "balance_loss_mlp": 1.04920876, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.097903587873874, + "language_loss": 0.79365611, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81550193, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8984375, + "step": 4420, + "time_per_iteration": 2.5908427238464355 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01049212, + "balance_loss_clip": 1.02990031, + "balance_loss_mlp": 1.0493983, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.1223383047232933, + "language_loss": 0.81612432, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83803296, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.921875, + "step": 4421, + "time_per_iteration": 2.4869320392608643 + }, + { + "auxiliary_loss_clip": 0.01134642, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02027202, + "balance_loss_mlp": 1.04734015, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5724937400793966, + "language_loss": 0.65278006, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67449689, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4422, + "time_per_iteration": 2.7370638847351074 + }, + { + "auxiliary_loss_clip": 0.01138513, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02965808, + "balance_loss_mlp": 1.04750621, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.411979213410041, + "language_loss": 0.73841226, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76025832, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 4423, + "time_per_iteration": 2.5510191917419434 + }, + { + "auxiliary_loss_clip": 0.01136367, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.03017163, + "balance_loss_mlp": 1.04504442, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6317340067044743, + "language_loss": 0.77703154, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79886127, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4424, + "time_per_iteration": 2.809495449066162 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.02675951, + "balance_loss_mlp": 1.04695249, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.064218808714238, + "language_loss": 0.79345673, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 4425, + "time_per_iteration": 2.48149037361145 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_clip": 1.03138816, + "balance_loss_mlp": 1.04685736, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.774406296589384, + "language_loss": 0.80463314, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82643557, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4426, + "time_per_iteration": 2.5968613624572754 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.03180957, + "balance_loss_mlp": 1.04982209, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.8207507571493768, + "language_loss": 0.77337295, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79524601, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4427, + "time_per_iteration": 4.045380353927612 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01760316, + "balance_loss_mlp": 1.04737306, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.8400253790543033, + "language_loss": 0.76800078, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78966737, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4428, + "time_per_iteration": 4.018831491470337 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.01858354, + "balance_loss_mlp": 1.04529297, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.9075878866801723, + "language_loss": 0.83010298, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8517977, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4429, + "time_per_iteration": 2.576535940170288 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.02644563, + "balance_loss_mlp": 1.04664719, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 3.2197583620662082, + "language_loss": 0.72143924, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74320537, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87109375, + "step": 4430, + "time_per_iteration": 2.5262365341186523 + }, + { + "auxiliary_loss_clip": 0.01136153, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.04667306, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.382555523964676, + "language_loss": 0.81635833, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83814788, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4431, + "time_per_iteration": 2.5135624408721924 + }, + { + "auxiliary_loss_clip": 0.01142285, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03360736, + "balance_loss_mlp": 1.04865289, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.34486467491615, + "language_loss": 0.76153386, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78346616, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 4432, + "time_per_iteration": 2.469515562057495 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.02711606, + "balance_loss_mlp": 1.04703665, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.148919041496035, + "language_loss": 0.82521772, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84703225, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4433, + "time_per_iteration": 2.540174961090088 + }, + { + "auxiliary_loss_clip": 0.01138849, + "auxiliary_loss_mlp": 0.01048222, + "balance_loss_clip": 1.03065097, + "balance_loss_mlp": 1.04955435, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.091984027516481, + "language_loss": 0.76638913, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78825986, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4434, + "time_per_iteration": 2.549769878387451 + }, + { + "auxiliary_loss_clip": 0.01133542, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.02770376, + "balance_loss_mlp": 1.04645348, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.251252650424801, + "language_loss": 0.82632279, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84808934, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4435, + "time_per_iteration": 2.5329744815826416 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.0105698, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.04742312, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.2611652281579397, + "language_loss": 0.87278962, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89476645, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9296875, + "step": 4436, + "time_per_iteration": 2.5375254154205322 + }, + { + "auxiliary_loss_clip": 0.01136328, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.02670658, + "balance_loss_mlp": 1.04566383, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4304916595737875, + "language_loss": 0.78941, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81120378, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4437, + "time_per_iteration": 2.591017007827759 + }, + { + "auxiliary_loss_clip": 0.01134502, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.02533066, + "balance_loss_mlp": 1.04595256, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.0731379310987412, + "language_loss": 0.63412011, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65588087, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4438, + "time_per_iteration": 2.6429452896118164 + }, + { + "auxiliary_loss_clip": 0.01137556, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.02125144, + "balance_loss_mlp": 1.04869223, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8133794638407341, + "language_loss": 0.75628942, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77803481, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4439, + "time_per_iteration": 2.5296032428741455 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02052629, + "balance_loss_mlp": 1.04913759, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.7792140134846064, + "language_loss": 0.71444011, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.7362318, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9140625, + "step": 4440, + "time_per_iteration": 2.5714335441589355 + }, + { + "auxiliary_loss_clip": 0.01139576, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.02757502, + "balance_loss_mlp": 1.04816949, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.8363906583736056, + "language_loss": 0.66291904, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 4441, + "time_per_iteration": 2.522589683532715 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.02654862, + "balance_loss_mlp": 1.04803538, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5597318548365904, + "language_loss": 0.76451373, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78633213, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.89453125, + "step": 4442, + "time_per_iteration": 2.5659492015838623 + }, + { + "auxiliary_loss_clip": 0.01060214, + "auxiliary_loss_mlp": 0.0100059, + "balance_loss_clip": 0.99855101, + "balance_loss_mlp": 1.02895594, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.912864167592289, + "language_loss": 0.61270142, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63330936, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.3125, + "step": 4443, + "time_per_iteration": 3.0256776809692383 + }, + { + "auxiliary_loss_clip": 0.01140806, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01968026, + "balance_loss_mlp": 1.0495882, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.5525166591100914, + "language_loss": 0.76200545, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78377306, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91015625, + "step": 4444, + "time_per_iteration": 2.7414674758911133 + }, + { + "auxiliary_loss_clip": 0.0114013, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.04932773, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 3.16165776963455, + "language_loss": 0.80212528, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82393491, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4445, + "time_per_iteration": 2.5349111557006836 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.02232134, + "balance_loss_mlp": 1.04797101, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.3952290716593825, + "language_loss": 0.89144397, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 4446, + "time_per_iteration": 2.5512521266937256 + }, + { + "auxiliary_loss_clip": 0.01140462, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.0311892, + "balance_loss_mlp": 1.04977763, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.831363923725005, + "language_loss": 0.68259656, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70447719, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4447, + "time_per_iteration": 2.5752837657928467 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.04972827, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9862084341014827, + "language_loss": 0.82976532, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85157394, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4448, + "time_per_iteration": 2.6524059772491455 + }, + { + "auxiliary_loss_clip": 0.01137667, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.03110301, + "balance_loss_mlp": 1.04973495, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 2.185461436072074, + "language_loss": 0.84288895, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86475068, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87890625, + "step": 4449, + "time_per_iteration": 2.5167598724365234 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02892506, + "balance_loss_mlp": 1.05114913, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.9936425417360089, + "language_loss": 0.84260273, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86456501, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.97265625, + "step": 4450, + "time_per_iteration": 2.555941343307495 + }, + { + "auxiliary_loss_clip": 0.01133946, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.04674196, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.900524277018137, + "language_loss": 0.81065774, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83240664, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4451, + "time_per_iteration": 2.5289859771728516 + }, + { + "auxiliary_loss_clip": 0.01140947, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.02593148, + "balance_loss_mlp": 1.05186319, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8040621200757803, + "language_loss": 0.86401796, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88584578, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4452, + "time_per_iteration": 2.617918014526367 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.02055311, + "balance_loss_mlp": 1.05132198, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.9731948573099198, + "language_loss": 0.83129871, + "learning_rate": 3.435869031622194e-06, + "loss": 0.8531099, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4453, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.0113897, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.02936745, + "balance_loss_mlp": 1.04995108, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.62656613015929, + "language_loss": 0.79744816, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81930768, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4454, + "time_per_iteration": 2.537853717803955 + }, + { + "auxiliary_loss_clip": 0.01141821, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04989707, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.7640316216704761, + "language_loss": 0.7215519, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74339664, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4455, + "time_per_iteration": 2.5023562908172607 + }, + { + "auxiliary_loss_clip": 0.01137457, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02406991, + "balance_loss_mlp": 1.05066276, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5496021720121687, + "language_loss": 0.74044335, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76221603, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4456, + "time_per_iteration": 2.487581729888916 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.02121687, + "balance_loss_mlp": 1.04937947, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.2089309948453697, + "language_loss": 0.70965469, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73145425, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4457, + "time_per_iteration": 2.4584691524505615 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.05237103, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.29797460876898, + "language_loss": 0.79029202, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81216174, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 4458, + "time_per_iteration": 2.6079578399658203 + }, + { + "auxiliary_loss_clip": 0.01052787, + "auxiliary_loss_mlp": 0.01006207, + "balance_loss_clip": 1.00439513, + "balance_loss_mlp": 1.02259135, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8640508796264214, + "language_loss": 0.58716619, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60775614, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.30078125, + "step": 4459, + "time_per_iteration": 3.0725412368774414 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.02444053, + "balance_loss_mlp": 1.04671741, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.0778557825519055, + "language_loss": 0.85224575, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87398744, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4460, + "time_per_iteration": 2.483299732208252 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.02653205, + "balance_loss_mlp": 1.04752469, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.805871571962145, + "language_loss": 0.68256581, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70435691, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 4461, + "time_per_iteration": 2.439304828643799 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_clip": 1.03198409, + "balance_loss_mlp": 1.0470686, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5557483279788171, + "language_loss": 0.67342007, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69526774, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4462, + "time_per_iteration": 2.5081140995025635 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.02413619, + "balance_loss_mlp": 1.04865909, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.8707784514564991, + "language_loss": 0.6927141, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71449935, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4463, + "time_per_iteration": 2.5280556678771973 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02414584, + "balance_loss_mlp": 1.04812574, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4976114648735304, + "language_loss": 0.77389008, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79570508, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4464, + "time_per_iteration": 2.469650983810425 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.02215123, + "balance_loss_mlp": 1.0458076, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.844577670487785, + "language_loss": 0.70796561, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72970498, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4465, + "time_per_iteration": 2.5685060024261475 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.0100238, + "balance_loss_clip": 1.00067484, + "balance_loss_mlp": 1.01956284, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6800540965400289, + "language_loss": 0.53096056, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55148005, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.30078125, + "step": 4466, + "time_per_iteration": 3.2327654361724854 + }, + { + "auxiliary_loss_clip": 0.01133624, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.02219653, + "balance_loss_mlp": 1.04600596, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0764143418179213, + "language_loss": 0.7343837, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75611472, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4467, + "time_per_iteration": 2.5052013397216797 + }, + { + "auxiliary_loss_clip": 0.01138792, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.0278548, + "balance_loss_mlp": 1.04801464, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.5834152956256555, + "language_loss": 0.80703115, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82887346, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4468, + "time_per_iteration": 2.4547622203826904 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.01005617, + "balance_loss_clip": 1.00407946, + "balance_loss_mlp": 1.01768315, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8449159500606429, + "language_loss": 0.59532088, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61585438, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.30078125, + "step": 4469, + "time_per_iteration": 4.6310715675354 + }, + { + "auxiliary_loss_clip": 0.01137988, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.02687383, + "balance_loss_mlp": 1.04844749, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.3316897890333954, + "language_loss": 0.81785607, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83968771, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4470, + "time_per_iteration": 2.5501935482025146 + }, + { + "auxiliary_loss_clip": 0.01129268, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.02091098, + "balance_loss_mlp": 1.04484963, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6194658793917844, + "language_loss": 0.82648492, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84815365, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 4471, + "time_per_iteration": 2.559220552444458 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02042806, + "balance_loss_mlp": 1.04853129, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 8.458966217412893, + "language_loss": 0.69382554, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71553975, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 4472, + "time_per_iteration": 2.561326742172241 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02045035, + "balance_loss_mlp": 1.04783702, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.596928542569954, + "language_loss": 0.67870784, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70042771, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4473, + "time_per_iteration": 2.5437636375427246 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.02844238, + "balance_loss_mlp": 1.04768729, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.8504576821316179, + "language_loss": 0.82971931, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85149777, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4474, + "time_per_iteration": 2.474095582962036 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01046818, + "balance_loss_clip": 1.03042698, + "balance_loss_mlp": 1.04697323, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.0689967373005977, + "language_loss": 0.70303237, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72482622, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.85546875, + "step": 4475, + "time_per_iteration": 2.4865996837615967 + }, + { + "auxiliary_loss_clip": 0.01135068, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.02335167, + "balance_loss_mlp": 1.04614162, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7721029234489851, + "language_loss": 0.73711979, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75887156, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.890625, + "step": 4476, + "time_per_iteration": 2.477308988571167 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.02456927, + "balance_loss_mlp": 1.04561102, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.720914514753409, + "language_loss": 0.80110955, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8228178, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4477, + "time_per_iteration": 2.497809648513794 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.04442573, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.9038830637231319, + "language_loss": 0.64580482, + "learning_rate": 3.429074332770984e-06, + "loss": 0.66756433, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4478, + "time_per_iteration": 2.6485564708709717 + }, + { + "auxiliary_loss_clip": 0.01130767, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.02876592, + "balance_loss_mlp": 1.04380882, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8571100614964546, + "language_loss": 0.80653036, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82828909, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4479, + "time_per_iteration": 2.4851014614105225 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01043964, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04611528, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.4630797167742458, + "language_loss": 0.80834484, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83014214, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4480, + "time_per_iteration": 2.490147590637207 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.02066684, + "balance_loss_mlp": 1.04153395, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.7677898796301312, + "language_loss": 0.77612787, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79773796, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 4481, + "time_per_iteration": 2.4699158668518066 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01044694, + "balance_loss_clip": 1.02737296, + "balance_loss_mlp": 1.04591584, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 2.5981026313468525, + "language_loss": 0.74701524, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76880491, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4482, + "time_per_iteration": 2.556087017059326 + }, + { + "auxiliary_loss_clip": 0.01135034, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02198792, + "balance_loss_mlp": 1.04693186, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.852738059166697, + "language_loss": 0.72176206, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74350333, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4483, + "time_per_iteration": 2.4762344360351562 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.02635717, + "balance_loss_mlp": 1.04290676, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.626283812761087, + "language_loss": 0.87107188, + "learning_rate": 3.427438559239605e-06, + "loss": 0.8928411, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4484, + "time_per_iteration": 2.486185073852539 + }, + { + "auxiliary_loss_clip": 0.01131969, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02447212, + "balance_loss_mlp": 1.04373026, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.901905407661022, + "language_loss": 0.66389644, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68561947, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4485, + "time_per_iteration": 2.5674586296081543 + }, + { + "auxiliary_loss_clip": 0.01133447, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02371132, + "balance_loss_mlp": 1.0445261, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.8933932068842783, + "language_loss": 0.72378826, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74552536, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4486, + "time_per_iteration": 2.471036434173584 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.02257311, + "balance_loss_mlp": 1.04809284, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.8546648123058087, + "language_loss": 0.83810318, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.85986561, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 4487, + "time_per_iteration": 2.4867916107177734 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02749646, + "balance_loss_mlp": 1.0477773, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.2079504028023598, + "language_loss": 0.71220767, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73403245, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4488, + "time_per_iteration": 2.5174567699432373 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.02851868, + "balance_loss_mlp": 1.04792523, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6338784898376273, + "language_loss": 0.83736706, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85919023, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4489, + "time_per_iteration": 2.5314295291900635 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_clip": 1.03696203, + "balance_loss_mlp": 1.04693484, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.5551945574509176, + "language_loss": 0.89805245, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.91996753, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4490, + "time_per_iteration": 2.4975826740264893 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02741122, + "balance_loss_mlp": 1.04349554, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.8455290723250308, + "language_loss": 0.73354411, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75525427, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4491, + "time_per_iteration": 2.6303470134735107 + }, + { + "auxiliary_loss_clip": 0.01138617, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.02613568, + "balance_loss_mlp": 1.04974079, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 3.089516252272487, + "language_loss": 0.74379975, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7656163, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4492, + "time_per_iteration": 2.5124619007110596 + }, + { + "auxiliary_loss_clip": 0.01133231, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_clip": 1.0241406, + "balance_loss_mlp": 1.04671812, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.896651323252439, + "language_loss": 0.88740528, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.90913987, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4493, + "time_per_iteration": 2.480473756790161 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.02564538, + "balance_loss_mlp": 1.04676843, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.468971775969503, + "language_loss": 0.70976114, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73151839, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4494, + "time_per_iteration": 2.5703446865081787 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01920152, + "balance_loss_mlp": 1.04545951, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.0322990364449325, + "language_loss": 0.86294192, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88457918, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4495, + "time_per_iteration": 2.5428457260131836 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.02968764, + "balance_loss_mlp": 1.04731214, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.8698467905293557, + "language_loss": 0.76562083, + "learning_rate": 3.424161168522959e-06, + "loss": 0.7874167, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4496, + "time_per_iteration": 2.5074446201324463 + }, + { + "auxiliary_loss_clip": 0.01048323, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.01042128, + "balance_loss_mlp": 1.01925802, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7221920911850954, + "language_loss": 0.50221699, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52282125, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2890625, + "step": 4497, + "time_per_iteration": 3.110724687576294 + }, + { + "auxiliary_loss_clip": 0.01137008, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.03011322, + "balance_loss_mlp": 1.05020094, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.6519561002314052, + "language_loss": 0.72420043, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74602675, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4498, + "time_per_iteration": 2.522507429122925 + }, + { + "auxiliary_loss_clip": 0.01047265, + "auxiliary_loss_mlp": 0.0100549, + "balance_loss_clip": 1.0038569, + "balance_loss_mlp": 1.0182879, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7584910907853958, + "language_loss": 0.59222841, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61275595, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2890625, + "step": 4499, + "time_per_iteration": 3.1193060874938965 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02085209, + "balance_loss_mlp": 1.04637063, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.0468109740969576, + "language_loss": 0.7361812, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75787735, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4500, + "time_per_iteration": 2.5073533058166504 + }, + { + "auxiliary_loss_clip": 0.01130893, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04379177, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.2528800155878765, + "language_loss": 0.80392325, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82567519, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4501, + "time_per_iteration": 2.4665989875793457 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0278666, + "balance_loss_mlp": 1.04683352, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.9148884605164396, + "language_loss": 0.72832727, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75011796, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4502, + "time_per_iteration": 2.511070489883423 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0209558, + "balance_loss_mlp": 1.04282784, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.724044037192685, + "language_loss": 0.68474984, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70647895, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 4503, + "time_per_iteration": 2.6554527282714844 + }, + { + "auxiliary_loss_clip": 0.01133759, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.02425468, + "balance_loss_mlp": 1.04659927, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.0245220791315655, + "language_loss": 0.68488902, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.7066294, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4504, + "time_per_iteration": 2.4813036918640137 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.02811766, + "balance_loss_mlp": 1.05043292, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.7616188880043606, + "language_loss": 0.75553012, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77731931, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4505, + "time_per_iteration": 2.482228994369507 + }, + { + "auxiliary_loss_clip": 0.01138199, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.03271127, + "balance_loss_mlp": 1.047171, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 1.8888030992954683, + "language_loss": 0.73508286, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4506, + "time_per_iteration": 2.493534803390503 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.02390218, + "balance_loss_mlp": 1.04818904, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.012438120988393, + "language_loss": 0.80958861, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83136857, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4507, + "time_per_iteration": 2.488477945327759 + }, + { + "auxiliary_loss_clip": 0.01046128, + "auxiliary_loss_mlp": 0.01011944, + "balance_loss_clip": 1.0102514, + "balance_loss_mlp": 1.01738429, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7384209784394716, + "language_loss": 0.50892401, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52950472, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.28710938, + "step": 4508, + "time_per_iteration": 3.005894660949707 + }, + { + "auxiliary_loss_clip": 0.01129132, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.02413416, + "balance_loss_mlp": 1.04509401, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 4.914093534195162, + "language_loss": 0.74373507, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76542306, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4509, + "time_per_iteration": 2.555645227432251 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.02418542, + "balance_loss_mlp": 1.04368544, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.7859895301291084, + "language_loss": 0.71706283, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73872381, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4510, + "time_per_iteration": 2.469756841659546 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01921451, + "balance_loss_mlp": 1.04728365, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 4.171230322312489, + "language_loss": 0.70698422, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72866517, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 4511, + "time_per_iteration": 3.9261832237243652 + }, + { + "auxiliary_loss_clip": 0.01133865, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.02660656, + "balance_loss_mlp": 1.04600286, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.0859148079323564, + "language_loss": 0.80823237, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83000243, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4512, + "time_per_iteration": 2.5112404823303223 + }, + { + "auxiliary_loss_clip": 0.01128916, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02108788, + "balance_loss_mlp": 1.04543233, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.880665339674376, + "language_loss": 0.80508482, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82672697, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8359375, + "step": 4513, + "time_per_iteration": 2.5550525188446045 + }, + { + "auxiliary_loss_clip": 0.01132709, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02668297, + "balance_loss_mlp": 1.04505134, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8883190176483522, + "language_loss": 0.88062817, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90237576, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4514, + "time_per_iteration": 2.4411823749542236 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.02475166, + "balance_loss_mlp": 1.04799736, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 2.468440108941068, + "language_loss": 0.92064375, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94239157, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4515, + "time_per_iteration": 2.507073402404785 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.03202391, + "balance_loss_mlp": 1.04952395, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.5869205534481017, + "language_loss": 0.73691195, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75882661, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9140625, + "step": 4516, + "time_per_iteration": 2.4427852630615234 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.02352417, + "balance_loss_mlp": 1.0466857, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 6.588152355110397, + "language_loss": 0.76239699, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78414017, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4517, + "time_per_iteration": 2.4891836643218994 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02201402, + "balance_loss_mlp": 1.0473218, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2012309941627066, + "language_loss": 0.76785064, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78957808, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4518, + "time_per_iteration": 2.503117561340332 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0241586, + "balance_loss_mlp": 1.04699707, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.6415373198141725, + "language_loss": 0.68314338, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7048738, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4519, + "time_per_iteration": 2.573230028152466 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04631245, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.6734918677628685, + "language_loss": 0.755759, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7774297, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4520, + "time_per_iteration": 2.535546064376831 + }, + { + "auxiliary_loss_clip": 0.01138716, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.02628946, + "balance_loss_mlp": 1.0501039, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.5201661256644523, + "language_loss": 0.76219606, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78402621, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4521, + "time_per_iteration": 2.491654396057129 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_clip": 1.03102481, + "balance_loss_mlp": 1.04803133, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.3970894391693967, + "language_loss": 0.75911158, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78095901, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4522, + "time_per_iteration": 2.471673011779785 + }, + { + "auxiliary_loss_clip": 0.01137707, + "auxiliary_loss_mlp": 0.0103702, + "balance_loss_clip": 1.02042627, + "balance_loss_mlp": 1.04873872, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 5.0666434109354075, + "language_loss": 0.72895801, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75070536, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4523, + "time_per_iteration": 2.5152363777160645 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02208352, + "balance_loss_mlp": 1.04448104, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.5338044020439772, + "language_loss": 0.74324989, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76492846, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 4524, + "time_per_iteration": 2.495253562927246 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02380621, + "balance_loss_mlp": 1.04772878, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 2.881398237919427, + "language_loss": 0.76651889, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78826964, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4525, + "time_per_iteration": 2.511634111404419 + }, + { + "auxiliary_loss_clip": 0.01131035, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_clip": 1.0334518, + "balance_loss_mlp": 1.04626358, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8599028556429251, + "language_loss": 0.81914634, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84094906, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4526, + "time_per_iteration": 2.495011568069458 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02770483, + "balance_loss_mlp": 1.0466783, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 3.313629745591453, + "language_loss": 0.77007318, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79190063, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4527, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.04637635, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1845797146290784, + "language_loss": 0.81825048, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84000921, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4528, + "time_per_iteration": 2.469916582107544 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.0273608, + "balance_loss_mlp": 1.04669189, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.6672454466706952, + "language_loss": 0.77123594, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79297841, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4529, + "time_per_iteration": 2.5379140377044678 + }, + { + "auxiliary_loss_clip": 0.01133862, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_clip": 1.02900243, + "balance_loss_mlp": 1.04580855, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.4153957329893228, + "language_loss": 0.8195889, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84136933, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4530, + "time_per_iteration": 2.5363659858703613 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.02010226, + "balance_loss_mlp": 1.04630172, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.1797176655983432, + "language_loss": 0.91650689, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93820047, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4531, + "time_per_iteration": 2.508429765701294 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.03159511, + "balance_loss_mlp": 1.04611766, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.532443443519077, + "language_loss": 0.76107466, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78290069, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.88671875, + "step": 4532, + "time_per_iteration": 2.499457359313965 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.01861846, + "balance_loss_mlp": 1.04643464, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 3.1928401528407746, + "language_loss": 0.89197671, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91362166, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4533, + "time_per_iteration": 2.508202075958252 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.02118278, + "balance_loss_mlp": 1.04587626, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.096334750916122, + "language_loss": 0.7125262, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73419642, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4534, + "time_per_iteration": 2.5111024379730225 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04651427, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.9613498766130548, + "language_loss": 0.91064882, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93239939, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4535, + "time_per_iteration": 2.5509371757507324 + }, + { + "auxiliary_loss_clip": 0.01138846, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.02262712, + "balance_loss_mlp": 1.05108571, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.5906078149456282, + "language_loss": 0.72618866, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74796963, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4536, + "time_per_iteration": 2.5106241703033447 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.02302337, + "balance_loss_mlp": 1.04617631, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.839444357786457, + "language_loss": 0.7144469, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73617887, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4537, + "time_per_iteration": 2.588439464569092 + }, + { + "auxiliary_loss_clip": 0.01132537, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.02363503, + "balance_loss_mlp": 1.04501796, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.431092364938405, + "language_loss": 0.78177559, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80350113, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4538, + "time_per_iteration": 2.438603639602661 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02609527, + "balance_loss_mlp": 1.04698634, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4794812227008705, + "language_loss": 0.90038705, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92214489, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4539, + "time_per_iteration": 2.5052709579467773 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_clip": 1.02414095, + "balance_loss_mlp": 1.04627967, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.0712338481270884, + "language_loss": 0.88711655, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90885842, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.859375, + "step": 4540, + "time_per_iteration": 2.457939624786377 + }, + { + "auxiliary_loss_clip": 0.01133918, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01771343, + "balance_loss_mlp": 1.04666936, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9363402300433894, + "language_loss": 0.81993663, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84161294, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4541, + "time_per_iteration": 2.461517333984375 + }, + { + "auxiliary_loss_clip": 0.01133224, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.02484596, + "balance_loss_mlp": 1.04623377, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8882731025231656, + "language_loss": 0.7925449, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81429487, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4542, + "time_per_iteration": 2.487905979156494 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.02441418, + "balance_loss_mlp": 1.04965162, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.197105758262293, + "language_loss": 0.89471424, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91648328, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4543, + "time_per_iteration": 2.4903039932250977 + }, + { + "auxiliary_loss_clip": 0.01137887, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.02955735, + "balance_loss_mlp": 1.04841042, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.084938235366164, + "language_loss": 0.63666493, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65851355, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.89453125, + "step": 4544, + "time_per_iteration": 2.4529080390930176 + }, + { + "auxiliary_loss_clip": 0.01137894, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.03043687, + "balance_loss_mlp": 1.05032265, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.5170655618085727, + "language_loss": 0.6996637, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72151983, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4545, + "time_per_iteration": 2.6089117527008057 + }, + { + "auxiliary_loss_clip": 0.01048793, + "auxiliary_loss_mlp": 0.01019944, + "balance_loss_clip": 1.01828671, + "balance_loss_mlp": 1.01938868, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7273987605446792, + "language_loss": 0.61571473, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63640207, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.29296875, + "step": 4546, + "time_per_iteration": 3.1125431060791016 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.03199649, + "balance_loss_mlp": 1.05012798, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.9369682323358774, + "language_loss": 0.64982706, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67167711, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4547, + "time_per_iteration": 2.497563600540161 + }, + { + "auxiliary_loss_clip": 0.01132998, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.0262835, + "balance_loss_mlp": 1.04765081, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.2377196076559183, + "language_loss": 0.77178854, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.7935344, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4548, + "time_per_iteration": 2.536813259124756 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.02854848, + "balance_loss_mlp": 1.04827595, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8894391736419274, + "language_loss": 0.82382214, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84559321, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 4549, + "time_per_iteration": 2.5156633853912354 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.02744722, + "balance_loss_mlp": 1.04482448, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.2209993145005793, + "language_loss": 0.70675868, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.72853404, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4550, + "time_per_iteration": 2.4510462284088135 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.03272784, + "balance_loss_mlp": 1.04789186, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.43111621366583, + "language_loss": 0.78738058, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80917984, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8515625, + "step": 4551, + "time_per_iteration": 2.470520496368408 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01051474, + "balance_loss_clip": 1.03548765, + "balance_loss_mlp": 1.04601097, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.681171335598487, + "language_loss": 0.70585275, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72769368, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4552, + "time_per_iteration": 3.9179859161376953 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02936506, + "balance_loss_mlp": 1.04864776, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.3865688662341005, + "language_loss": 0.71857619, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7403903, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 4553, + "time_per_iteration": 4.032766342163086 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.02853942, + "balance_loss_mlp": 1.04585433, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5870570208244068, + "language_loss": 0.59154749, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61331522, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4554, + "time_per_iteration": 2.549534320831299 + }, + { + "auxiliary_loss_clip": 0.01138763, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.04893517, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7727518382715788, + "language_loss": 0.73820007, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76000404, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4555, + "time_per_iteration": 2.5162432193756104 + }, + { + "auxiliary_loss_clip": 0.01136837, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02348125, + "balance_loss_mlp": 1.04923606, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.956724452661134, + "language_loss": 0.7785511, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80031419, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4556, + "time_per_iteration": 2.5205135345458984 + }, + { + "auxiliary_loss_clip": 0.01145391, + "auxiliary_loss_mlp": 0.0105386, + "balance_loss_clip": 1.03640783, + "balance_loss_mlp": 1.04952264, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.7956202604517526, + "language_loss": 0.82272434, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84471685, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9609375, + "step": 4557, + "time_per_iteration": 2.486485719680786 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.02691972, + "balance_loss_mlp": 1.04657316, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7971714372597054, + "language_loss": 0.72697943, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74873614, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4558, + "time_per_iteration": 2.5272727012634277 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.04504418, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.1318143008079686, + "language_loss": 0.6804775, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70228577, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4559, + "time_per_iteration": 2.4787509441375732 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02750015, + "balance_loss_mlp": 1.04517901, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 3.5500966853689673, + "language_loss": 0.71847737, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74022651, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4560, + "time_per_iteration": 2.490152359008789 + }, + { + "auxiliary_loss_clip": 0.0113572, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.02642488, + "balance_loss_mlp": 1.04779601, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.7948619898284635, + "language_loss": 0.80998009, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83175689, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 4561, + "time_per_iteration": 2.554872512817383 + }, + { + "auxiliary_loss_clip": 0.01136406, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.02600157, + "balance_loss_mlp": 1.04711854, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.7370289005889625, + "language_loss": 0.7531321, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77491164, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.890625, + "step": 4562, + "time_per_iteration": 2.4925429821014404 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02173424, + "balance_loss_mlp": 1.04701662, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.598166418515773, + "language_loss": 0.74503827, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76674795, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4563, + "time_per_iteration": 2.5514259338378906 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.02798915, + "balance_loss_mlp": 1.04708612, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.8271759108968861, + "language_loss": 0.62526429, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64710456, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4564, + "time_per_iteration": 2.479156494140625 + }, + { + "auxiliary_loss_clip": 0.01136574, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.02513587, + "balance_loss_mlp": 1.04808652, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.9245884320117708, + "language_loss": 0.78135669, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80314934, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4565, + "time_per_iteration": 2.714069366455078 + }, + { + "auxiliary_loss_clip": 0.01133378, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04669619, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.3377831889988547, + "language_loss": 0.68350124, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70523381, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4566, + "time_per_iteration": 2.469357967376709 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.03211665, + "balance_loss_mlp": 1.04901338, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.7938914020631171, + "language_loss": 0.60886472, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63066101, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.84375, + "step": 4567, + "time_per_iteration": 2.5856754779815674 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.02597237, + "balance_loss_mlp": 1.04754972, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.7650663548751138, + "language_loss": 0.82787997, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84965092, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.86328125, + "step": 4568, + "time_per_iteration": 2.476353168487549 + }, + { + "auxiliary_loss_clip": 0.0113839, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.01868141, + "balance_loss_mlp": 1.05012584, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.0155686346894415, + "language_loss": 0.68656778, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7082985, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4569, + "time_per_iteration": 2.5027451515197754 + }, + { + "auxiliary_loss_clip": 0.01133852, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.01947594, + "balance_loss_mlp": 1.0464673, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.247407128453888, + "language_loss": 0.71138883, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73308867, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4570, + "time_per_iteration": 2.466845750808716 + }, + { + "auxiliary_loss_clip": 0.0104735, + "auxiliary_loss_mlp": 0.01010434, + "balance_loss_clip": 1.00881279, + "balance_loss_mlp": 1.01781416, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7344992896847644, + "language_loss": 0.55774754, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57832539, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.296875, + "step": 4571, + "time_per_iteration": 3.192523241043091 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.02805328, + "balance_loss_mlp": 1.05039406, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 3.6883594473706482, + "language_loss": 0.77785081, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79969662, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 4572, + "time_per_iteration": 2.4755914211273193 + }, + { + "auxiliary_loss_clip": 0.01129408, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.02200866, + "balance_loss_mlp": 1.04679561, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.7042315716847805, + "language_loss": 0.81357443, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83523262, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4573, + "time_per_iteration": 2.540905237197876 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02038062, + "balance_loss_mlp": 1.04580402, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7927939239771835, + "language_loss": 0.79077196, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81243324, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83984375, + "step": 4574, + "time_per_iteration": 2.451016664505005 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.02889121, + "balance_loss_mlp": 1.04886127, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.232643844604772, + "language_loss": 0.74191976, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76372731, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4575, + "time_per_iteration": 2.5744149684906006 + }, + { + "auxiliary_loss_clip": 0.01131901, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.02353263, + "balance_loss_mlp": 1.04711711, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.8105072672356382, + "language_loss": 0.71877766, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7404812, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4576, + "time_per_iteration": 2.634305715560913 + }, + { + "auxiliary_loss_clip": 0.01134924, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.04823232, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.7690392048384511, + "language_loss": 0.73200434, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75377214, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4577, + "time_per_iteration": 2.5365946292877197 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02182376, + "balance_loss_mlp": 1.04931974, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 5.099060573221768, + "language_loss": 0.75943893, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78119946, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4578, + "time_per_iteration": 2.5121536254882812 + }, + { + "auxiliary_loss_clip": 0.01135832, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 1.0475626, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 2.3614458833507603, + "language_loss": 0.66299897, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68482184, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.8828125, + "step": 4579, + "time_per_iteration": 2.5445947647094727 + }, + { + "auxiliary_loss_clip": 0.01137742, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_clip": 1.03841197, + "balance_loss_mlp": 1.04862928, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.9384727438162337, + "language_loss": 0.8013078, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82324862, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4580, + "time_per_iteration": 2.4895741939544678 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.02581632, + "balance_loss_mlp": 1.05140579, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4702192551629332, + "language_loss": 0.67702103, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.698852, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.87109375, + "step": 4581, + "time_per_iteration": 2.5905539989471436 + }, + { + "auxiliary_loss_clip": 0.01137135, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.03024602, + "balance_loss_mlp": 1.04847145, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.8568978026073784, + "language_loss": 0.78120708, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80303848, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.88671875, + "step": 4582, + "time_per_iteration": 2.467210531234741 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.02537727, + "balance_loss_mlp": 1.04905152, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 2.5358708072067406, + "language_loss": 0.84527528, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86701977, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4583, + "time_per_iteration": 2.511457920074463 + }, + { + "auxiliary_loss_clip": 0.01138165, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.04905808, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 2.037294788318467, + "language_loss": 0.67308438, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69487947, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 4584, + "time_per_iteration": 2.5193254947662354 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.02645802, + "balance_loss_mlp": 1.04761386, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.737999785464117, + "language_loss": 0.77330101, + "learning_rate": 3.399612333050327e-06, + "loss": 0.7950455, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4585, + "time_per_iteration": 2.5393707752227783 + }, + { + "auxiliary_loss_clip": 0.0114213, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.02227354, + "balance_loss_mlp": 1.0530591, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.654604836009794, + "language_loss": 0.71854031, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74035466, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4586, + "time_per_iteration": 2.534979820251465 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.01996541, + "balance_loss_mlp": 1.04988265, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5248017982775213, + "language_loss": 0.80546939, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82719147, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4587, + "time_per_iteration": 2.5424065589904785 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.02356219, + "balance_loss_mlp": 1.04939508, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 2.136921841599078, + "language_loss": 0.82694119, + "learning_rate": 3.398777478523316e-06, + "loss": 0.8486715, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4588, + "time_per_iteration": 2.467923879623413 + }, + { + "auxiliary_loss_clip": 0.01132148, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.0228622, + "balance_loss_mlp": 1.04754925, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3980423175693042, + "language_loss": 0.75352502, + "learning_rate": 3.398499087583342e-06, + "loss": 0.775231, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4589, + "time_per_iteration": 2.535837173461914 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.04686022, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7720046877472317, + "language_loss": 0.88438141, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90612471, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8515625, + "step": 4590, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.01135164, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.02946877, + "balance_loss_mlp": 1.04789972, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6299691755620427, + "language_loss": 0.7129395, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73474467, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4591, + "time_per_iteration": 2.6112425327301025 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.03268862, + "balance_loss_mlp": 1.04847574, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8477043284936983, + "language_loss": 0.80190659, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82375979, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4592, + "time_per_iteration": 2.483894109725952 + }, + { + "auxiliary_loss_clip": 0.01048363, + "auxiliary_loss_mlp": 0.01005872, + "balance_loss_clip": 1.00416684, + "balance_loss_mlp": 1.0189774, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7716758671018623, + "language_loss": 0.61627746, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63681984, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.29296875, + "step": 4593, + "time_per_iteration": 3.0616326332092285 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01045597, + "balance_loss_clip": 1.02965856, + "balance_loss_mlp": 1.04938328, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.8877557773606983, + "language_loss": 0.77589142, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79769808, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4594, + "time_per_iteration": 4.043708086013794 + }, + { + "auxiliary_loss_clip": 0.01134807, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.01769793, + "balance_loss_mlp": 1.04991734, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.7681451067423914, + "language_loss": 0.91645586, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93813777, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4595, + "time_per_iteration": 3.973101854324341 + }, + { + "auxiliary_loss_clip": 0.01138485, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.03034675, + "balance_loss_mlp": 1.05122674, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7288059110569738, + "language_loss": 0.69101036, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71286798, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4596, + "time_per_iteration": 2.509199380874634 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.0249939, + "balance_loss_mlp": 1.04883707, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.01522187594791, + "language_loss": 0.63536406, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65717971, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9140625, + "step": 4597, + "time_per_iteration": 2.5944221019744873 + }, + { + "auxiliary_loss_clip": 0.01133967, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02629232, + "balance_loss_mlp": 1.05002272, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.1842552390134586, + "language_loss": 0.86612505, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88788456, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 4598, + "time_per_iteration": 2.4870996475219727 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02586544, + "balance_loss_mlp": 1.04847229, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.0694668215518996, + "language_loss": 0.79822165, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82000202, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4599, + "time_per_iteration": 2.4923834800720215 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.02965581, + "balance_loss_mlp": 1.04958415, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.9049018096400723, + "language_loss": 0.78357869, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80543864, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 4600, + "time_per_iteration": 2.496173620223999 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01046211, + "balance_loss_clip": 1.03007007, + "balance_loss_mlp": 1.04887986, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.9474431855639402, + "language_loss": 0.73361742, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75546992, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4601, + "time_per_iteration": 2.475919246673584 + }, + { + "auxiliary_loss_clip": 0.01135661, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02883482, + "balance_loss_mlp": 1.04879355, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.9134344988482315, + "language_loss": 0.79341739, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81522876, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4602, + "time_per_iteration": 2.511716842651367 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_clip": 1.03349614, + "balance_loss_mlp": 1.04920423, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.260382216699142, + "language_loss": 0.76887643, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79079276, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4603, + "time_per_iteration": 2.4667811393737793 + }, + { + "auxiliary_loss_clip": 0.0112975, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.0252831, + "balance_loss_mlp": 1.04736543, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.7288101924316703, + "language_loss": 0.81411278, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83581114, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 4604, + "time_per_iteration": 2.4586222171783447 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01516712, + "balance_loss_mlp": 1.04756212, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.7513688477785454, + "language_loss": 0.69912565, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72079831, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4605, + "time_per_iteration": 2.5138533115386963 + }, + { + "auxiliary_loss_clip": 0.01045677, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.00033224, + "balance_loss_mlp": 1.01580858, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7252635192802935, + "language_loss": 0.57151282, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59198874, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.296875, + "step": 4606, + "time_per_iteration": 3.184955596923828 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.03234947, + "balance_loss_mlp": 1.0481658, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.0717297663627825, + "language_loss": 0.69666946, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71853042, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4607, + "time_per_iteration": 2.5373001098632812 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.04721832, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 3.332085537790215, + "language_loss": 0.6982615, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71991682, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4608, + "time_per_iteration": 2.5396809577941895 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.02185202, + "balance_loss_mlp": 1.04715931, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.8242818121189563, + "language_loss": 0.72541273, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74715054, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 4609, + "time_per_iteration": 2.5383543968200684 + }, + { + "auxiliary_loss_clip": 0.01134274, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.03226149, + "balance_loss_mlp": 1.04623055, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.2576811985082967, + "language_loss": 0.84010947, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86194062, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4610, + "time_per_iteration": 2.4456827640533447 + }, + { + "auxiliary_loss_clip": 0.01141086, + "auxiliary_loss_mlp": 0.01051097, + "balance_loss_clip": 1.03344178, + "balance_loss_mlp": 1.04996872, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.570198611472629, + "language_loss": 0.68948054, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71140236, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9140625, + "step": 4611, + "time_per_iteration": 2.5342319011688232 + }, + { + "auxiliary_loss_clip": 0.01130823, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.0212357, + "balance_loss_mlp": 1.04892015, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.354058548299899, + "language_loss": 0.73450744, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75618565, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 4612, + "time_per_iteration": 2.472200632095337 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03378379, + "balance_loss_mlp": 1.04807258, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.166254073057622, + "language_loss": 0.66736221, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68924516, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4613, + "time_per_iteration": 2.5313632488250732 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.02721334, + "balance_loss_mlp": 1.04604864, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.8826548789840187, + "language_loss": 0.79452634, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4614, + "time_per_iteration": 2.4869751930236816 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.02684534, + "balance_loss_mlp": 1.0477469, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.573597172535304, + "language_loss": 0.80251336, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.8243044, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4615, + "time_per_iteration": 2.521615505218506 + }, + { + "auxiliary_loss_clip": 0.01135416, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.04627132, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.403593727320557, + "language_loss": 0.63926548, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66105354, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4616, + "time_per_iteration": 2.439410448074341 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02593398, + "balance_loss_mlp": 1.04661143, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.8467628074440183, + "language_loss": 0.82283223, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84458935, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4617, + "time_per_iteration": 2.49495792388916 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02587175, + "balance_loss_mlp": 1.04613662, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.1015666973838942, + "language_loss": 0.76835418, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79010552, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4618, + "time_per_iteration": 2.4882123470306396 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02501273, + "balance_loss_mlp": 1.0495801, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.6700061931983001, + "language_loss": 0.84698343, + "learning_rate": 3.390122747388459e-06, + "loss": 0.868756, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4619, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.02340662, + "balance_loss_mlp": 1.04523671, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4068177028172657, + "language_loss": 0.76720011, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78886724, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 4620, + "time_per_iteration": 2.4851698875427246 + }, + { + "auxiliary_loss_clip": 0.01130943, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02126586, + "balance_loss_mlp": 1.04728413, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.4956264272783084, + "language_loss": 0.78746819, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80914462, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4621, + "time_per_iteration": 2.543513774871826 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.04871762, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 1.9988562622182164, + "language_loss": 0.87520665, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89702857, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4622, + "time_per_iteration": 2.4818174839019775 + }, + { + "auxiliary_loss_clip": 0.01133366, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 1.04635906, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.9062066208333321, + "language_loss": 0.81094646, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83274019, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4623, + "time_per_iteration": 2.509218692779541 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.02357817, + "balance_loss_mlp": 1.04981863, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 1.93503772017796, + "language_loss": 0.81099498, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83275431, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 4624, + "time_per_iteration": 2.470041513442993 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.05091214, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 3.184384520938543, + "language_loss": 0.76514304, + "learning_rate": 3.388441777121191e-06, + "loss": 0.7869125, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84765625, + "step": 4625, + "time_per_iteration": 2.4965567588806152 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02439606, + "balance_loss_mlp": 1.04835677, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 2.5511238477154095, + "language_loss": 0.70091927, + "learning_rate": 3.388161431073511e-06, + "loss": 0.7226674, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 4626, + "time_per_iteration": 2.462007522583008 + }, + { + "auxiliary_loss_clip": 0.01142353, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.05177855, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.1576082410571704, + "language_loss": 0.92738312, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94917607, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4627, + "time_per_iteration": 2.5731146335601807 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.02640903, + "balance_loss_mlp": 1.04856014, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 4.44086075484182, + "language_loss": 0.85802954, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87982047, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4628, + "time_per_iteration": 2.502816915512085 + }, + { + "auxiliary_loss_clip": 0.01136721, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.02358079, + "balance_loss_mlp": 1.05035257, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.4685731198996637, + "language_loss": 0.79003006, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81178927, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4629, + "time_per_iteration": 2.544255256652832 + }, + { + "auxiliary_loss_clip": 0.01132865, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02419138, + "balance_loss_mlp": 1.05083036, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.4531737557023054, + "language_loss": 0.84322643, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86494124, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4630, + "time_per_iteration": 2.514413833618164 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02343392, + "balance_loss_mlp": 1.04834175, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.1800575167200997, + "language_loss": 0.80845618, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83021843, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4631, + "time_per_iteration": 2.530393123626709 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.05319762, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.154319840219951, + "language_loss": 0.71817827, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74009514, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4632, + "time_per_iteration": 2.504826307296753 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.02571952, + "balance_loss_mlp": 1.05240536, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8401586776799086, + "language_loss": 0.82518554, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84694839, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4633, + "time_per_iteration": 2.484894037246704 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.02622163, + "balance_loss_mlp": 1.05006409, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.740894494158558, + "language_loss": 0.87933433, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90116417, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4634, + "time_per_iteration": 2.465115785598755 + }, + { + "auxiliary_loss_clip": 0.01139508, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02182591, + "balance_loss_mlp": 1.05175185, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5848956099548452, + "language_loss": 0.77060932, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79239166, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4635, + "time_per_iteration": 2.5032925605773926 + }, + { + "auxiliary_loss_clip": 0.01137724, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.02843595, + "balance_loss_mlp": 1.04919934, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.7277393232375848, + "language_loss": 0.65047133, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67230225, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4636, + "time_per_iteration": 4.078390121459961 + }, + { + "auxiliary_loss_clip": 0.01137292, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02249098, + "balance_loss_mlp": 1.04898095, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.3949865449269034, + "language_loss": 0.84131932, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86309206, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8828125, + "step": 4637, + "time_per_iteration": 3.9023706912994385 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.02468669, + "balance_loss_mlp": 1.04683113, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.9572077756422592, + "language_loss": 0.75880706, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78052455, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4638, + "time_per_iteration": 2.5291664600372314 + }, + { + "auxiliary_loss_clip": 0.01137756, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_clip": 1.0281812, + "balance_loss_mlp": 1.04918075, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.663593201704466, + "language_loss": 0.71469444, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73651695, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4639, + "time_per_iteration": 2.4396321773529053 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01814222, + "balance_loss_mlp": 1.0477488, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.020838508390905, + "language_loss": 0.65634811, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67805016, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4640, + "time_per_iteration": 2.524146556854248 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.02278829, + "balance_loss_mlp": 1.04838169, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.8663182251903623, + "language_loss": 0.71682954, + "learning_rate": 3.383949929609804e-06, + "loss": 0.738572, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4641, + "time_per_iteration": 2.45416522026062 + }, + { + "auxiliary_loss_clip": 0.01137426, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.02620697, + "balance_loss_mlp": 1.04805887, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.721157258136314, + "language_loss": 0.74843872, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77024734, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4642, + "time_per_iteration": 2.498901128768921 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.0241071, + "balance_loss_mlp": 1.04755557, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7771181879405247, + "language_loss": 0.85500491, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87677723, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4643, + "time_per_iteration": 2.4678151607513428 + }, + { + "auxiliary_loss_clip": 0.01135774, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04914284, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.8372365182177028, + "language_loss": 0.8320173, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85382092, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4644, + "time_per_iteration": 2.4989511966705322 + }, + { + "auxiliary_loss_clip": 0.01137034, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.02340162, + "balance_loss_mlp": 1.04927874, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.1578284197730246, + "language_loss": 0.7905547, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81232202, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4645, + "time_per_iteration": 2.444539785385132 + }, + { + "auxiliary_loss_clip": 0.01045698, + "auxiliary_loss_mlp": 0.01013694, + "balance_loss_clip": 1.01202476, + "balance_loss_mlp": 1.01603949, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7789852310638867, + "language_loss": 0.62276232, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64335632, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4646, + "time_per_iteration": 3.0487425327301025 + }, + { + "auxiliary_loss_clip": 0.01130687, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.02039671, + "balance_loss_mlp": 1.04760003, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6043045349905556, + "language_loss": 0.89379698, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91545647, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83203125, + "step": 4647, + "time_per_iteration": 2.537818193435669 + }, + { + "auxiliary_loss_clip": 0.01137315, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.02559125, + "balance_loss_mlp": 1.04848313, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6404696751402497, + "language_loss": 0.87119055, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89298457, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4648, + "time_per_iteration": 2.490755081176758 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.01993406, + "balance_loss_mlp": 1.04894495, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 4.859667262510518, + "language_loss": 0.72424746, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74599725, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4649, + "time_per_iteration": 2.551149368286133 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02275741, + "balance_loss_mlp": 1.04667568, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.198213539311656, + "language_loss": 0.80241156, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8241663, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 4650, + "time_per_iteration": 2.495481252670288 + }, + { + "auxiliary_loss_clip": 0.01043234, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.00015628, + "balance_loss_mlp": 1.01336908, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 1.2001935939690993, + "language_loss": 0.58821332, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60866392, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4651, + "time_per_iteration": 3.089278221130371 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.0239383, + "balance_loss_mlp": 1.04576242, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.6305345142383205, + "language_loss": 0.74335963, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76514173, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4652, + "time_per_iteration": 2.5034215450286865 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.02826357, + "balance_loss_mlp": 1.05137777, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.1744902530470527, + "language_loss": 0.79703641, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81889254, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4653, + "time_per_iteration": 2.654989242553711 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.02851391, + "balance_loss_mlp": 1.04782343, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.2131663157599597, + "language_loss": 0.79123974, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81304365, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4654, + "time_per_iteration": 2.4707679748535156 + }, + { + "auxiliary_loss_clip": 0.01139148, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.02370811, + "balance_loss_mlp": 1.04861951, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.2002818233708497, + "language_loss": 0.80829996, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83010256, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4655, + "time_per_iteration": 2.513359546661377 + }, + { + "auxiliary_loss_clip": 0.01135255, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0232811, + "balance_loss_mlp": 1.04709148, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5763016498426998, + "language_loss": 0.8125751, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8343333, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4656, + "time_per_iteration": 2.519552707672119 + }, + { + "auxiliary_loss_clip": 0.01135028, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.02292323, + "balance_loss_mlp": 1.04802632, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6475258015019663, + "language_loss": 0.83235347, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85410285, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4657, + "time_per_iteration": 2.533052444458008 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.0267477, + "balance_loss_mlp": 1.04885554, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.9420207304275756, + "language_loss": 0.63918132, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66097504, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4658, + "time_per_iteration": 2.577223777770996 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.03177238, + "balance_loss_mlp": 1.04906631, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.71469006603513, + "language_loss": 0.78447223, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80633128, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4659, + "time_per_iteration": 2.5102882385253906 + }, + { + "auxiliary_loss_clip": 0.01141159, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.03565836, + "balance_loss_mlp": 1.05118299, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.8275002529569282, + "language_loss": 0.79481149, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81674838, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4660, + "time_per_iteration": 2.478348731994629 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.02289653, + "balance_loss_mlp": 1.04855609, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.7763153734220711, + "language_loss": 0.80286032, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82459545, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4661, + "time_per_iteration": 2.514369249343872 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.01055451, + "balance_loss_clip": 1.03888094, + "balance_loss_mlp": 1.05259752, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5344085017366311, + "language_loss": 0.78856266, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.8105247, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4662, + "time_per_iteration": 2.6068239212036133 + }, + { + "auxiliary_loss_clip": 0.01142079, + "auxiliary_loss_mlp": 0.01052002, + "balance_loss_clip": 1.03345299, + "balance_loss_mlp": 1.04998207, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 2.3559784459233923, + "language_loss": 0.70354843, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72548926, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4663, + "time_per_iteration": 2.530852794647217 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01053723, + "balance_loss_clip": 1.03522193, + "balance_loss_mlp": 1.05016875, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.7070620658846938, + "language_loss": 0.77552772, + "learning_rate": 3.377469372935791e-06, + "loss": 0.7974633, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.8984375, + "step": 4664, + "time_per_iteration": 2.5026586055755615 + }, + { + "auxiliary_loss_clip": 0.01132144, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02688766, + "balance_loss_mlp": 1.04697514, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.9676420802042491, + "language_loss": 0.79575229, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81750983, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8515625, + "step": 4665, + "time_per_iteration": 2.496948003768921 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04934978, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 8.778135585709748, + "language_loss": 0.80523062, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82701844, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4666, + "time_per_iteration": 2.4551992416381836 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.03710806, + "balance_loss_mlp": 1.05058241, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.0519370530418493, + "language_loss": 0.84514672, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86708617, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4667, + "time_per_iteration": 2.503024101257324 + }, + { + "auxiliary_loss_clip": 0.01141868, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.05165899, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.59556786146991, + "language_loss": 0.79110259, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81296772, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90234375, + "step": 4668, + "time_per_iteration": 2.5109217166900635 + }, + { + "auxiliary_loss_clip": 0.01137821, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783513, + "balance_loss_mlp": 1.0472095, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 5.202292388628492, + "language_loss": 0.7594949, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78132337, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4669, + "time_per_iteration": 2.5443029403686523 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.03276944, + "balance_loss_mlp": 1.05060363, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.249572842905479, + "language_loss": 0.78818107, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81007588, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8828125, + "step": 4670, + "time_per_iteration": 2.4583303928375244 + }, + { + "auxiliary_loss_clip": 0.01142576, + "auxiliary_loss_mlp": 0.010505, + "balance_loss_clip": 1.03272545, + "balance_loss_mlp": 1.05169237, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.1344815005037323, + "language_loss": 0.78915119, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81108201, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4671, + "time_per_iteration": 2.576904296875 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02419102, + "balance_loss_mlp": 1.05212355, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.655300005604084, + "language_loss": 0.74891758, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77067947, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4672, + "time_per_iteration": 2.5101001262664795 + }, + { + "auxiliary_loss_clip": 0.01139664, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.0297612, + "balance_loss_mlp": 1.05017138, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.377632390973165, + "language_loss": 0.7485683, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77045226, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.89453125, + "step": 4673, + "time_per_iteration": 2.5559215545654297 + }, + { + "auxiliary_loss_clip": 0.0113758, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_clip": 1.02367294, + "balance_loss_mlp": 1.04911065, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.162495737742732, + "language_loss": 0.72274792, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74453062, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4674, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.01142202, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.02487254, + "balance_loss_mlp": 1.05152214, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.8501022214838438, + "language_loss": 0.77636325, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79821539, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.90625, + "step": 4675, + "time_per_iteration": 2.5076191425323486 + }, + { + "auxiliary_loss_clip": 0.011417, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.03059506, + "balance_loss_mlp": 1.05080581, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 4.743769816525981, + "language_loss": 0.7033428, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72524506, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4676, + "time_per_iteration": 2.4664652347564697 + }, + { + "auxiliary_loss_clip": 0.01136213, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.02637279, + "balance_loss_mlp": 1.05219054, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6504598517134752, + "language_loss": 0.70294476, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7247287, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 4677, + "time_per_iteration": 3.9926962852478027 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.02779067, + "balance_loss_mlp": 1.05172849, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.7155329144241396, + "language_loss": 0.63506716, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65694547, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.890625, + "step": 4678, + "time_per_iteration": 5.452545642852783 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.031039, + "balance_loss_mlp": 1.05193949, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.4644682748892532, + "language_loss": 0.70249045, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7243771, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4679, + "time_per_iteration": 2.557156801223755 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.02276742, + "balance_loss_mlp": 1.05024076, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.8307759218313573, + "language_loss": 0.74600148, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76779038, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4680, + "time_per_iteration": 2.478760004043579 + }, + { + "auxiliary_loss_clip": 0.01140599, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.0268507, + "balance_loss_mlp": 1.0514679, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.8069902018568411, + "language_loss": 0.77090317, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79274386, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4681, + "time_per_iteration": 2.5532946586608887 + }, + { + "auxiliary_loss_clip": 0.01142988, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02353942, + "balance_loss_mlp": 1.05301392, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 4.33574203258507, + "language_loss": 0.74047244, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76231277, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8984375, + "step": 4682, + "time_per_iteration": 2.450707197189331 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 1.04989302, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4103030378304897, + "language_loss": 0.80830532, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.8301093, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4683, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.01142223, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.02984428, + "balance_loss_mlp": 1.05146146, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.6936052100643573, + "language_loss": 0.76107442, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78297454, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4684, + "time_per_iteration": 2.4734883308410645 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.02196348, + "balance_loss_mlp": 1.04849601, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9675146174992446, + "language_loss": 0.7601878, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.7819227, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4685, + "time_per_iteration": 2.521883010864258 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.05083728, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.003036282603561, + "language_loss": 0.7616905, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78348768, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4686, + "time_per_iteration": 2.5261688232421875 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01051506, + "balance_loss_clip": 1.03319538, + "balance_loss_mlp": 1.04916072, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.230965321609006, + "language_loss": 0.63345516, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65537149, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.90625, + "step": 4687, + "time_per_iteration": 2.473508834838867 + }, + { + "auxiliary_loss_clip": 0.0114172, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05180609, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 1.9761865692880811, + "language_loss": 0.76504958, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.7869947, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4688, + "time_per_iteration": 2.4815330505371094 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.02364409, + "balance_loss_mlp": 1.04902148, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.291650314126009, + "language_loss": 0.78333032, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80508631, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4689, + "time_per_iteration": 2.464221239089966 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02566934, + "balance_loss_mlp": 1.04886627, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 2.2251394110426896, + "language_loss": 0.77819848, + "learning_rate": 3.37011026022934e-06, + "loss": 0.79999155, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87890625, + "step": 4690, + "time_per_iteration": 2.4802086353302 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.02809191, + "balance_loss_mlp": 1.04984617, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.762007121853784, + "language_loss": 0.8775022, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89933336, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.890625, + "step": 4691, + "time_per_iteration": 2.5098307132720947 + }, + { + "auxiliary_loss_clip": 0.01144357, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02714467, + "balance_loss_mlp": 1.0519383, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.144178457094415, + "language_loss": 0.81952238, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84140503, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 4692, + "time_per_iteration": 2.501150131225586 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.02284956, + "balance_loss_mlp": 1.04852128, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.7100054669520195, + "language_loss": 0.74535745, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7671268, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4693, + "time_per_iteration": 2.581108808517456 + }, + { + "auxiliary_loss_clip": 0.01139239, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04924035, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6174705324311944, + "language_loss": 0.7761777, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79793274, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4694, + "time_per_iteration": 2.479616403579712 + }, + { + "auxiliary_loss_clip": 0.01136707, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.05057073, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.0658621313481604, + "language_loss": 0.66812259, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68987906, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4695, + "time_per_iteration": 2.560234546661377 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_clip": 1.02859259, + "balance_loss_mlp": 1.05084562, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.206840044366299, + "language_loss": 0.75868189, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78057176, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4696, + "time_per_iteration": 2.484731674194336 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.05234432, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 4.801168729119655, + "language_loss": 0.62373543, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64565253, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4697, + "time_per_iteration": 2.6771903038024902 + }, + { + "auxiliary_loss_clip": 0.01131406, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02003598, + "balance_loss_mlp": 1.0468322, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.6839402690923742, + "language_loss": 0.73317522, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75484592, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4698, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.029091, + "balance_loss_mlp": 1.0463903, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 2.1160143892835275, + "language_loss": 0.74896884, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77072334, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4699, + "time_per_iteration": 2.5613014698028564 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.02369165, + "balance_loss_mlp": 1.05032122, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.187545417707515, + "language_loss": 0.80256712, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8243804, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4700, + "time_per_iteration": 2.4355719089508057 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.03461456, + "balance_loss_mlp": 1.05022645, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.7483881606912919, + "language_loss": 0.81309319, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.8349061, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 4701, + "time_per_iteration": 2.590824842453003 + }, + { + "auxiliary_loss_clip": 0.0113653, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02301347, + "balance_loss_mlp": 1.05007911, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.214271940066586, + "language_loss": 0.73758674, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75934035, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4702, + "time_per_iteration": 2.496689796447754 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.02251232, + "balance_loss_mlp": 1.05127287, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.7981890053968508, + "language_loss": 0.78189409, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.8036449, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4703, + "time_per_iteration": 2.5225300788879395 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.02923465, + "balance_loss_mlp": 1.0484302, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.6026897384097336, + "language_loss": 0.6944623, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71628278, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 4704, + "time_per_iteration": 2.5721168518066406 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.02685118, + "balance_loss_mlp": 1.05374229, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.9868129767490792, + "language_loss": 0.69884789, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.7206769, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.85546875, + "step": 4705, + "time_per_iteration": 2.532034397125244 + }, + { + "auxiliary_loss_clip": 0.01057982, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.02761459, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7396595768854823, + "language_loss": 0.59243953, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61305463, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.3046875, + "step": 4706, + "time_per_iteration": 3.1149942874908447 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.02586842, + "balance_loss_mlp": 1.05135274, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.3972451569930537, + "language_loss": 0.82227451, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84403402, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4707, + "time_per_iteration": 2.5387215614318848 + }, + { + "auxiliary_loss_clip": 0.01137999, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.02199709, + "balance_loss_mlp": 1.04914331, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.4509576382878049, + "language_loss": 0.80561262, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82739007, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4708, + "time_per_iteration": 2.5140204429626465 + }, + { + "auxiliary_loss_clip": 0.0105521, + "auxiliary_loss_mlp": 0.01000508, + "balance_loss_clip": 0.99875605, + "balance_loss_mlp": 1.02517498, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.9117312370003612, + "language_loss": 0.62801576, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64857292, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.30078125, + "step": 4709, + "time_per_iteration": 2.936171054840088 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02320743, + "balance_loss_mlp": 1.04888415, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3738384560226649, + "language_loss": 0.73850632, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76022816, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4710, + "time_per_iteration": 2.4954519271850586 + }, + { + "auxiliary_loss_clip": 0.01137863, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.03191566, + "balance_loss_mlp": 1.04925823, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.9168276099157815, + "language_loss": 0.79272872, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81460476, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.88671875, + "step": 4711, + "time_per_iteration": 2.4867448806762695 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02646244, + "balance_loss_mlp": 1.04965401, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.0504814559042064, + "language_loss": 0.71246219, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73428476, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.88671875, + "step": 4712, + "time_per_iteration": 2.575636863708496 + }, + { + "auxiliary_loss_clip": 0.01138441, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.03291881, + "balance_loss_mlp": 1.05000687, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.8055678270358249, + "language_loss": 0.82008445, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84196651, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4713, + "time_per_iteration": 2.493767499923706 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.02401519, + "balance_loss_mlp": 1.05028057, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7485744544400377, + "language_loss": 0.75356781, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77534491, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4714, + "time_per_iteration": 2.505153179168701 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02967894, + "balance_loss_mlp": 1.04942465, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4087892826571713, + "language_loss": 0.78411347, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80593348, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4715, + "time_per_iteration": 2.554814100265503 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02042472, + "balance_loss_mlp": 1.04960322, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.6801208741854476, + "language_loss": 0.73694074, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.758663, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4716, + "time_per_iteration": 2.5286571979522705 + }, + { + "auxiliary_loss_clip": 0.01139786, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.02437401, + "balance_loss_mlp": 1.04774714, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.328876822443367, + "language_loss": 0.74648547, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76830298, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4717, + "time_per_iteration": 2.46952223777771 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.02846563, + "balance_loss_mlp": 1.04963374, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.4913957575980352, + "language_loss": 0.669999, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69183862, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4718, + "time_per_iteration": 2.4831228256225586 + }, + { + "auxiliary_loss_clip": 0.01137489, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.03000975, + "balance_loss_mlp": 1.04782009, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.8756812569885382, + "language_loss": 0.72633672, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74818015, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4719, + "time_per_iteration": 4.022828102111816 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02554655, + "balance_loss_mlp": 1.04928601, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.5135010931827333, + "language_loss": 0.80621493, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82798427, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4720, + "time_per_iteration": 5.367753505706787 + }, + { + "auxiliary_loss_clip": 0.0113932, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.05115819, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.7029911565101727, + "language_loss": 0.79467577, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81651098, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4721, + "time_per_iteration": 2.50327730178833 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01929736, + "balance_loss_mlp": 1.04810679, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.0644081658079343, + "language_loss": 0.82823032, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84991974, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4722, + "time_per_iteration": 2.4968478679656982 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02364612, + "balance_loss_mlp": 1.05073261, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.6187910677092856, + "language_loss": 0.70086461, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72264171, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4723, + "time_per_iteration": 2.4899258613586426 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.02747679, + "balance_loss_mlp": 1.04938078, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.736224288784384, + "language_loss": 0.78556609, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8073647, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.859375, + "step": 4724, + "time_per_iteration": 2.496594190597534 + }, + { + "auxiliary_loss_clip": 0.01139767, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.02660346, + "balance_loss_mlp": 1.05093193, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6232572980988387, + "language_loss": 0.92404163, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94587529, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4725, + "time_per_iteration": 2.511526584625244 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.05020452, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 2.0539060112221645, + "language_loss": 0.88626051, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90809256, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4726, + "time_per_iteration": 2.5431292057037354 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.03090727, + "balance_loss_mlp": 1.05034256, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.15176079657567, + "language_loss": 0.78793001, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80981243, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.89453125, + "step": 4727, + "time_per_iteration": 2.7037220001220703 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.04985464, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.258515630996078, + "language_loss": 0.66358554, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68529654, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4728, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02941179, + "balance_loss_mlp": 1.04727221, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.756924339447767, + "language_loss": 0.75958216, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78138363, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4729, + "time_per_iteration": 2.4989402294158936 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_clip": 1.03383398, + "balance_loss_mlp": 1.05095756, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.9682162336594704, + "language_loss": 0.66691023, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68882596, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4730, + "time_per_iteration": 2.509514570236206 + }, + { + "auxiliary_loss_clip": 0.01138579, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02232122, + "balance_loss_mlp": 1.05049443, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.7814838549320247, + "language_loss": 0.74382442, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76560116, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4731, + "time_per_iteration": 2.547813653945923 + }, + { + "auxiliary_loss_clip": 0.01136629, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.04890573, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.530013147894791, + "language_loss": 0.83553517, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85723549, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 4732, + "time_per_iteration": 2.5120863914489746 + }, + { + "auxiliary_loss_clip": 0.01136161, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.03301716, + "balance_loss_mlp": 1.04855001, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.9723104549008028, + "language_loss": 0.79331958, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81518835, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4733, + "time_per_iteration": 2.5007243156433105 + }, + { + "auxiliary_loss_clip": 0.01142281, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.02999151, + "balance_loss_mlp": 1.05076027, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.3591023601535834, + "language_loss": 0.71619761, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73809481, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 4734, + "time_per_iteration": 2.482696771621704 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02036047, + "balance_loss_mlp": 1.04631829, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8927344989841068, + "language_loss": 0.73762977, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.75930858, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 4735, + "time_per_iteration": 2.4837005138397217 + }, + { + "auxiliary_loss_clip": 0.011344, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.04755783, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.630230460143418, + "language_loss": 0.79573876, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81754053, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4736, + "time_per_iteration": 2.4434666633605957 + }, + { + "auxiliary_loss_clip": 0.01139538, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.02740479, + "balance_loss_mlp": 1.05133057, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.7860738328288637, + "language_loss": 0.59551513, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61735177, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4737, + "time_per_iteration": 2.580573558807373 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.02334046, + "balance_loss_mlp": 1.04766428, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7923236486738074, + "language_loss": 0.86353856, + "learning_rate": 3.356432075047052e-06, + "loss": 0.8852607, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4738, + "time_per_iteration": 2.483482837677002 + }, + { + "auxiliary_loss_clip": 0.0113957, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_clip": 1.02778435, + "balance_loss_mlp": 1.04864287, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.438418234236932, + "language_loss": 0.89730442, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91915256, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4739, + "time_per_iteration": 2.4746406078338623 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.02450418, + "balance_loss_mlp": 1.05253863, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3849266219761887, + "language_loss": 0.7207197, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74250996, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4740, + "time_per_iteration": 2.49682879447937 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0209707, + "balance_loss_mlp": 1.04970956, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.6055473402712246, + "language_loss": 0.77937335, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80109143, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4741, + "time_per_iteration": 2.51096248626709 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.02828324, + "balance_loss_mlp": 1.04566443, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6279093143019605, + "language_loss": 0.76295173, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78477085, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4742, + "time_per_iteration": 2.462972402572632 + }, + { + "auxiliary_loss_clip": 0.01139125, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.03039074, + "balance_loss_mlp": 1.04792476, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.8587468959738758, + "language_loss": 0.5772593, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59914023, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 4743, + "time_per_iteration": 2.511903762817383 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01054233, + "balance_loss_clip": 1.03724515, + "balance_loss_mlp": 1.05195451, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 2.12515026406258, + "language_loss": 0.74454999, + "learning_rate": 3.354713944700797e-06, + "loss": 0.7665062, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 4744, + "time_per_iteration": 2.48883318901062 + }, + { + "auxiliary_loss_clip": 0.01135189, + "auxiliary_loss_mlp": 0.01043767, + "balance_loss_clip": 1.02801967, + "balance_loss_mlp": 1.04948175, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.362002737479584, + "language_loss": 0.77483714, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79662669, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 4745, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01130558, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02363896, + "balance_loss_mlp": 1.04884791, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.753549870597739, + "language_loss": 0.83101368, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85271305, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 4746, + "time_per_iteration": 2.4236245155334473 + }, + { + "auxiliary_loss_clip": 0.01138419, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.02243769, + "balance_loss_mlp": 1.04718721, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.6977094615171933, + "language_loss": 0.79818654, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81996572, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4747, + "time_per_iteration": 2.47261118888855 + }, + { + "auxiliary_loss_clip": 0.01044617, + "auxiliary_loss_mlp": 0.01004042, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.01364255, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7754058718106229, + "language_loss": 0.60505557, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62554216, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30859375, + "step": 4748, + "time_per_iteration": 3.087096691131592 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02596188, + "balance_loss_mlp": 1.04764485, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.177788697298361, + "language_loss": 0.80300528, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82477033, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4749, + "time_per_iteration": 2.4132721424102783 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.022488, + "balance_loss_mlp": 1.04882109, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.910787577049047, + "language_loss": 0.7067076, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72844481, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.86328125, + "step": 4750, + "time_per_iteration": 2.5576114654541016 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.02121782, + "balance_loss_mlp": 1.04961181, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.569446011166348, + "language_loss": 0.81798106, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.83968079, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.83203125, + "step": 4751, + "time_per_iteration": 2.5805511474609375 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.02847314, + "balance_loss_mlp": 1.04876757, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.8824724995030706, + "language_loss": 0.80753136, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82931828, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4752, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.02258289, + "balance_loss_mlp": 1.04778147, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.8943096426553439, + "language_loss": 0.78827929, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81001288, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4753, + "time_per_iteration": 2.4775567054748535 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.02559114, + "balance_loss_mlp": 1.05078959, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.205371578508451, + "language_loss": 0.89809895, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91994447, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.8984375, + "step": 4754, + "time_per_iteration": 2.486128091812134 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02616322, + "balance_loss_mlp": 1.04897058, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.932227485650823, + "language_loss": 0.8234359, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84519303, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4755, + "time_per_iteration": 2.491184711456299 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.02915466, + "balance_loss_mlp": 1.04667544, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4908389000148254, + "language_loss": 0.83846784, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86025268, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4756, + "time_per_iteration": 2.5934014320373535 + }, + { + "auxiliary_loss_clip": 0.01048134, + "auxiliary_loss_mlp": 0.01008558, + "balance_loss_clip": 1.0067457, + "balance_loss_mlp": 1.01677859, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8659269702666513, + "language_loss": 0.61012161, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63068855, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3125, + "step": 4757, + "time_per_iteration": 3.2122225761413574 + }, + { + "auxiliary_loss_clip": 0.01137202, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.0173862, + "balance_loss_mlp": 1.05204773, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.9457322051707677, + "language_loss": 0.65794766, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67965055, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4758, + "time_per_iteration": 2.60023832321167 + }, + { + "auxiliary_loss_clip": 0.01134399, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.02027392, + "balance_loss_mlp": 1.04756904, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.560843999265526, + "language_loss": 0.62950313, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65121412, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4759, + "time_per_iteration": 2.6352102756500244 + }, + { + "auxiliary_loss_clip": 0.0113658, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.05098844, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.76909488275169, + "language_loss": 0.7385608, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76035368, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4760, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.04949427, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.9401243114633073, + "language_loss": 0.72422945, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74593776, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4761, + "time_per_iteration": 4.029369592666626 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.0303421, + "balance_loss_mlp": 1.04875946, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.026540334724573, + "language_loss": 0.74605787, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76787788, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4762, + "time_per_iteration": 3.9056994915008545 + }, + { + "auxiliary_loss_clip": 0.01134836, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.02963901, + "balance_loss_mlp": 1.05027771, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.79451974437327, + "language_loss": 0.76088154, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78268445, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4763, + "time_per_iteration": 2.521223545074463 + }, + { + "auxiliary_loss_clip": 0.01133105, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.04712808, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.9430054907967222, + "language_loss": 0.76937616, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79106188, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4764, + "time_per_iteration": 2.4924814701080322 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.02354538, + "balance_loss_mlp": 1.04996395, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8973954036904035, + "language_loss": 0.71061826, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73240352, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.87109375, + "step": 4765, + "time_per_iteration": 2.509204387664795 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04705501, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5129940587619137, + "language_loss": 0.75756145, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77925038, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4766, + "time_per_iteration": 2.562422513961792 + }, + { + "auxiliary_loss_clip": 0.01135318, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01960635, + "balance_loss_mlp": 1.05073392, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.5780141248071407, + "language_loss": 0.77556801, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79727697, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 4767, + "time_per_iteration": 2.5476057529449463 + }, + { + "auxiliary_loss_clip": 0.01133832, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.04878676, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.169490874338027, + "language_loss": 0.6494413, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67119616, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4768, + "time_per_iteration": 2.4961044788360596 + }, + { + "auxiliary_loss_clip": 0.01137611, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.02807736, + "balance_loss_mlp": 1.04944301, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5253191671074575, + "language_loss": 0.70345664, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72527587, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4769, + "time_per_iteration": 2.5243568420410156 + }, + { + "auxiliary_loss_clip": 0.01136117, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.01992261, + "balance_loss_mlp": 1.04866219, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.7483868508562144, + "language_loss": 0.75552189, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77723145, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.875, + "step": 4770, + "time_per_iteration": 2.468655586242676 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02745509, + "balance_loss_mlp": 1.0500282, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 3.1666126901900107, + "language_loss": 0.6730839, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69490194, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4771, + "time_per_iteration": 2.5334818363189697 + }, + { + "auxiliary_loss_clip": 0.01046415, + "auxiliary_loss_mlp": 0.01005401, + "balance_loss_clip": 1.00367248, + "balance_loss_mlp": 1.01655006, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7694277286160668, + "language_loss": 0.56883639, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58935452, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.29882812, + "step": 4772, + "time_per_iteration": 3.0373501777648926 + }, + { + "auxiliary_loss_clip": 0.01136901, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.02567768, + "balance_loss_mlp": 1.05014777, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.6517872983988844, + "language_loss": 0.83356023, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85534406, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4773, + "time_per_iteration": 2.477537155151367 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.02159762, + "balance_loss_mlp": 1.04630029, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.6367186533355356, + "language_loss": 0.77910906, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80083102, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4774, + "time_per_iteration": 2.514545440673828 + }, + { + "auxiliary_loss_clip": 0.01136368, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.05010271, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.7122435327393783, + "language_loss": 0.73488462, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75662589, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4775, + "time_per_iteration": 2.4526851177215576 + }, + { + "auxiliary_loss_clip": 0.0113744, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.05033445, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.655187901014976, + "language_loss": 0.88345891, + "learning_rate": 3.34551940668778e-06, + "loss": 0.905213, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4776, + "time_per_iteration": 2.5487112998962402 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02170587, + "balance_loss_mlp": 1.05060029, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7920640817181568, + "language_loss": 0.74046421, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76219237, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4777, + "time_per_iteration": 2.4858744144439697 + }, + { + "auxiliary_loss_clip": 0.01143681, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.02763224, + "balance_loss_mlp": 1.05306673, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9679293284940167, + "language_loss": 0.80052459, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82240558, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4778, + "time_per_iteration": 2.536553382873535 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.03032279, + "balance_loss_mlp": 1.05058503, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.6265242751714746, + "language_loss": 0.73940611, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76121908, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4779, + "time_per_iteration": 2.5068604946136475 + }, + { + "auxiliary_loss_clip": 0.01139025, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.02791739, + "balance_loss_mlp": 1.05089593, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5791887497798731, + "language_loss": 0.76378506, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78561842, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4780, + "time_per_iteration": 2.6357336044311523 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.02789187, + "balance_loss_mlp": 1.04874134, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.8554557560955622, + "language_loss": 0.81367111, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83542168, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 4781, + "time_per_iteration": 2.484217405319214 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.02688909, + "balance_loss_mlp": 1.0511862, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.9124031057386872, + "language_loss": 0.86249948, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88433063, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4782, + "time_per_iteration": 2.4822945594787598 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.02842641, + "balance_loss_mlp": 1.05222881, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.5584901619772236, + "language_loss": 0.71195668, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4783, + "time_per_iteration": 2.4959099292755127 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.05179179, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 3.6731562407195932, + "language_loss": 0.77011871, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79189384, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4784, + "time_per_iteration": 2.55037784576416 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.04896331, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.5223386635016902, + "language_loss": 0.75859249, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.7803328, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4785, + "time_per_iteration": 2.526587724685669 + }, + { + "auxiliary_loss_clip": 0.01135192, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.02724528, + "balance_loss_mlp": 1.04946601, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.9982438427344784, + "language_loss": 0.83033895, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85211748, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4786, + "time_per_iteration": 2.5786821842193604 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01899481, + "balance_loss_mlp": 1.04868317, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.95457297040312, + "language_loss": 0.80007184, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82174993, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 4787, + "time_per_iteration": 2.4734396934509277 + }, + { + "auxiliary_loss_clip": 0.01136278, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.02397585, + "balance_loss_mlp": 1.04906642, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 2.6671828195015044, + "language_loss": 0.83666658, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85842675, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4788, + "time_per_iteration": 2.5388548374176025 + }, + { + "auxiliary_loss_clip": 0.01137234, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.05051816, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.8168797658695668, + "language_loss": 0.73769903, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75953662, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4789, + "time_per_iteration": 2.5259692668914795 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.02819657, + "balance_loss_mlp": 1.0466274, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7572733449240283, + "language_loss": 0.83982229, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86155128, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4790, + "time_per_iteration": 2.5347094535827637 + }, + { + "auxiliary_loss_clip": 0.01136016, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02267063, + "balance_loss_mlp": 1.05011547, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.8137236403798864, + "language_loss": 0.77924603, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80099815, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4791, + "time_per_iteration": 2.475328207015991 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01854002, + "balance_loss_mlp": 1.04824567, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.933659829708973, + "language_loss": 0.70760292, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72931719, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.88671875, + "step": 4792, + "time_per_iteration": 2.4705538749694824 + }, + { + "auxiliary_loss_clip": 0.01135222, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.02065361, + "balance_loss_mlp": 1.04968917, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.08648870526395, + "language_loss": 0.79392564, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81563771, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4793, + "time_per_iteration": 2.509697914123535 + }, + { + "auxiliary_loss_clip": 0.01131221, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04920101, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.6269924793239006, + "language_loss": 0.77731872, + "learning_rate": 3.340324496161797e-06, + "loss": 0.7990309, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 4794, + "time_per_iteration": 2.6943047046661377 + }, + { + "auxiliary_loss_clip": 0.01134923, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.02819395, + "balance_loss_mlp": 1.04913807, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.663854929830155, + "language_loss": 0.8254813, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84727538, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 4795, + "time_per_iteration": 2.4633255004882812 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02387166, + "balance_loss_mlp": 1.04899204, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.661730786650402, + "language_loss": 0.74650323, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76819038, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80859375, + "step": 4796, + "time_per_iteration": 2.5179266929626465 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.02334583, + "balance_loss_mlp": 1.04789257, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.8865626242662115, + "language_loss": 0.72797763, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74977362, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4797, + "time_per_iteration": 2.4910430908203125 + }, + { + "auxiliary_loss_clip": 0.01135339, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02475667, + "balance_loss_mlp": 1.04989898, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.109884297899412, + "language_loss": 0.74219149, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76395118, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4798, + "time_per_iteration": 2.472590923309326 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.02631509, + "balance_loss_mlp": 1.04689598, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7660889265500996, + "language_loss": 0.64920753, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67099464, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.87890625, + "step": 4799, + "time_per_iteration": 2.4816339015960693 + }, + { + "auxiliary_loss_clip": 0.01136164, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.02784538, + "balance_loss_mlp": 1.04912758, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 2.0794132014970272, + "language_loss": 0.82202137, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84382272, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4800, + "time_per_iteration": 2.5249674320220947 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.02312899, + "balance_loss_mlp": 1.04702258, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.639042715490093, + "language_loss": 0.90946537, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93113768, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4801, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.0221796, + "balance_loss_mlp": 1.04792547, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.176318344562637, + "language_loss": 0.73644328, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75816047, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4802, + "time_per_iteration": 4.080524444580078 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.01006047, + "balance_loss_clip": 1.00423479, + "balance_loss_mlp": 1.01114249, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7820100192493779, + "language_loss": 0.63009298, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65055525, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.2890625, + "step": 4803, + "time_per_iteration": 4.464243412017822 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.03008461, + "balance_loss_mlp": 1.04523563, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7581002683255658, + "language_loss": 0.70800668, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72975886, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4804, + "time_per_iteration": 2.4655730724334717 + }, + { + "auxiliary_loss_clip": 0.01134858, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.02464128, + "balance_loss_mlp": 1.04650438, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.8916446417141755, + "language_loss": 0.68253011, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70430195, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 4805, + "time_per_iteration": 2.53932523727417 + }, + { + "auxiliary_loss_clip": 0.01133301, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.0262022, + "balance_loss_mlp": 1.04706144, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.968490446816616, + "language_loss": 0.69469118, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71644211, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4806, + "time_per_iteration": 2.558811902999878 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.031461, + "balance_loss_mlp": 1.04788303, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.428284074184194, + "language_loss": 0.71372461, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73549926, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4807, + "time_per_iteration": 2.5614373683929443 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04677331, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.7487230864068215, + "language_loss": 0.81519878, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83695877, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4808, + "time_per_iteration": 2.4744319915771484 + }, + { + "auxiliary_loss_clip": 0.01128992, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.03418779, + "balance_loss_mlp": 1.04669142, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.636259514454852, + "language_loss": 0.78387201, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80566621, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 4809, + "time_per_iteration": 2.4998364448547363 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.02908349, + "balance_loss_mlp": 1.04490733, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.6563631129995537, + "language_loss": 0.78611737, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80792195, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4810, + "time_per_iteration": 2.4702351093292236 + }, + { + "auxiliary_loss_clip": 0.01129985, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.02966762, + "balance_loss_mlp": 1.04653728, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 2.008599276638055, + "language_loss": 0.77134252, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79309338, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4811, + "time_per_iteration": 2.502671718597412 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.03385544, + "balance_loss_mlp": 1.0460732, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3273574459957262, + "language_loss": 0.76748705, + "learning_rate": 3.335113118275117e-06, + "loss": 0.78930271, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4812, + "time_per_iteration": 2.5386435985565186 + }, + { + "auxiliary_loss_clip": 0.01038211, + "auxiliary_loss_mlp": 0.01023073, + "balance_loss_clip": 1.02121317, + "balance_loss_mlp": 1.00933552, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8452992206378583, + "language_loss": 0.60239071, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62300354, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2890625, + "step": 4813, + "time_per_iteration": 3.227616548538208 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02727079, + "balance_loss_mlp": 1.04549837, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.8826759768804342, + "language_loss": 0.81616402, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.83789915, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4814, + "time_per_iteration": 2.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_clip": 1.0297873, + "balance_loss_mlp": 1.04464495, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6532361717230013, + "language_loss": 0.72615647, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74794197, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4815, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.01129383, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.03274667, + "balance_loss_mlp": 1.04815507, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.520143184033477, + "language_loss": 0.70801306, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72978652, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4816, + "time_per_iteration": 2.5287740230560303 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.02590585, + "balance_loss_mlp": 1.04615664, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 3.3715101323822174, + "language_loss": 0.74736607, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76915157, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 4817, + "time_per_iteration": 2.4828009605407715 + }, + { + "auxiliary_loss_clip": 0.01134031, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.03637469, + "balance_loss_mlp": 1.0465169, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.754631597755812, + "language_loss": 0.76169789, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78357232, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.875, + "step": 4818, + "time_per_iteration": 2.5453133583068848 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.04606366, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8604375380991018, + "language_loss": 0.79827082, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81994408, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4819, + "time_per_iteration": 2.4516472816467285 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.02553141, + "balance_loss_mlp": 1.04452121, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.6026789889191464, + "language_loss": 0.78726941, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80905426, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.90625, + "step": 4820, + "time_per_iteration": 2.512927770614624 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.0202527, + "balance_loss_mlp": 1.04560018, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.352701358428358, + "language_loss": 0.73083222, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75253224, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4821, + "time_per_iteration": 2.4575939178466797 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.01979387, + "balance_loss_mlp": 1.04503322, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.843174914976853, + "language_loss": 0.72629523, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74796605, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.85546875, + "step": 4822, + "time_per_iteration": 2.4981486797332764 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_clip": 1.03044343, + "balance_loss_mlp": 1.04679179, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.7581642571514904, + "language_loss": 0.66571164, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68751729, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.86328125, + "step": 4823, + "time_per_iteration": 2.4363584518432617 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.02018452, + "balance_loss_mlp": 1.04382014, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 3.6840420234688684, + "language_loss": 0.80786806, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82951754, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 4824, + "time_per_iteration": 2.4978654384613037 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.02547669, + "balance_loss_mlp": 1.04512334, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.8817460080316075, + "language_loss": 0.72507697, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74683976, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4825, + "time_per_iteration": 2.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.01136872, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.01760566, + "balance_loss_mlp": 1.04886889, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.3450778905142813, + "language_loss": 0.73504382, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75676298, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4826, + "time_per_iteration": 2.4689221382141113 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.02041411, + "balance_loss_mlp": 1.04524112, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 3.139827505949132, + "language_loss": 0.68472409, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70640838, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4827, + "time_per_iteration": 2.5236809253692627 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.04921937, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8651963869616242, + "language_loss": 0.80072737, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82245356, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.87109375, + "step": 4828, + "time_per_iteration": 2.491584300994873 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.026335, + "balance_loss_mlp": 1.0482254, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 2.2252387209358666, + "language_loss": 0.80475402, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82650864, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4829, + "time_per_iteration": 2.473210334777832 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.02482176, + "balance_loss_mlp": 1.04794419, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.544892870636461, + "language_loss": 0.82288766, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84461534, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4830, + "time_per_iteration": 2.52874755859375 + }, + { + "auxiliary_loss_clip": 0.01136051, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.02992344, + "balance_loss_mlp": 1.04847991, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.366175746199002, + "language_loss": 0.78858435, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81041145, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4831, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.02356744, + "balance_loss_mlp": 1.045138, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.8105888440812088, + "language_loss": 0.74415791, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76582563, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4832, + "time_per_iteration": 2.6398987770080566 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01806784, + "balance_loss_mlp": 1.04516697, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6051950803449415, + "language_loss": 0.75986588, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78149348, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 4833, + "time_per_iteration": 2.4772675037384033 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.02108264, + "balance_loss_mlp": 1.04542434, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.807689816327527, + "language_loss": 0.64523911, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6669057, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 4834, + "time_per_iteration": 2.4944729804992676 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.01674771, + "balance_loss_mlp": 1.04650283, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5516449013863105, + "language_loss": 0.71436119, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73597211, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4835, + "time_per_iteration": 2.5122785568237305 + }, + { + "auxiliary_loss_clip": 0.01129192, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.02430248, + "balance_loss_mlp": 1.04510283, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.123413568873549, + "language_loss": 0.79669547, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81837618, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4836, + "time_per_iteration": 2.533221483230591 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.02338338, + "balance_loss_mlp": 1.04589558, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6671781935549963, + "language_loss": 0.80777872, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82946539, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4837, + "time_per_iteration": 2.4579083919525146 + }, + { + "auxiliary_loss_clip": 0.01131777, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.02170265, + "balance_loss_mlp": 1.04491532, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8624538054458508, + "language_loss": 0.67733121, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69902468, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4838, + "time_per_iteration": 2.613682270050049 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.02485621, + "balance_loss_mlp": 1.04893696, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.6135989987029238, + "language_loss": 0.71288264, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73466504, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4839, + "time_per_iteration": 2.506908416748047 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.02341795, + "balance_loss_mlp": 1.04433274, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.876317037835641, + "language_loss": 0.75619674, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77787805, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4840, + "time_per_iteration": 2.6259472370147705 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.02688372, + "balance_loss_mlp": 1.0469749, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.9955793585576265, + "language_loss": 0.60459495, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62632966, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4841, + "time_per_iteration": 2.5497686862945557 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.0215385, + "balance_loss_mlp": 1.0483892, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.7266193979009703, + "language_loss": 0.71366, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73538262, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 4842, + "time_per_iteration": 2.5817017555236816 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02188897, + "balance_loss_mlp": 1.04632473, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5806493177236067, + "language_loss": 0.72846174, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.7501446, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.85546875, + "step": 4843, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.0223223, + "balance_loss_mlp": 1.04598284, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.0237546438656393, + "language_loss": 0.5840022, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60570586, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4844, + "time_per_iteration": 3.9377825260162354 + }, + { + "auxiliary_loss_clip": 0.01136792, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.02518439, + "balance_loss_mlp": 1.04942751, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.1502970284536493, + "language_loss": 0.86360186, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88539243, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4845, + "time_per_iteration": 5.415091276168823 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.04779911, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7275133095664568, + "language_loss": 0.66684157, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.68870938, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4846, + "time_per_iteration": 2.495901584625244 + }, + { + "auxiliary_loss_clip": 0.01131044, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02700055, + "balance_loss_mlp": 1.04691291, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.7117272730106567, + "language_loss": 0.70501876, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72675455, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4847, + "time_per_iteration": 2.50537109375 + }, + { + "auxiliary_loss_clip": 0.01131589, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.01918232, + "balance_loss_mlp": 1.04682243, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 2.14972579950547, + "language_loss": 0.73494464, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75661629, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 4848, + "time_per_iteration": 2.506683111190796 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.02569222, + "balance_loss_mlp": 1.04670119, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.106691725132959, + "language_loss": 0.76689458, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.78864431, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4849, + "time_per_iteration": 2.475512742996216 + }, + { + "auxiliary_loss_clip": 0.01134647, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.02732718, + "balance_loss_mlp": 1.04683709, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7698868684834754, + "language_loss": 0.78437513, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80615485, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4850, + "time_per_iteration": 2.4774062633514404 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02231026, + "balance_loss_mlp": 1.04620552, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.7416717517415665, + "language_loss": 0.75775445, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77944064, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4851, + "time_per_iteration": 2.4973719120025635 + }, + { + "auxiliary_loss_clip": 0.01130818, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.03071558, + "balance_loss_mlp": 1.04819655, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.378687766604426, + "language_loss": 0.77111661, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79287988, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 4852, + "time_per_iteration": 2.5339767932891846 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.0296402, + "balance_loss_mlp": 1.04735672, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5345579183576068, + "language_loss": 0.78385615, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80563664, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4853, + "time_per_iteration": 2.511125087738037 + }, + { + "auxiliary_loss_clip": 0.0113401, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.04668474, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.984154109703724, + "language_loss": 0.87946999, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90119541, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4854, + "time_per_iteration": 2.4654700756073 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.03352284, + "balance_loss_mlp": 1.04678071, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.8357290509449282, + "language_loss": 0.86585724, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88770819, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.85546875, + "step": 4855, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01043355, + "auxiliary_loss_mlp": 0.01004722, + "balance_loss_clip": 1.00283837, + "balance_loss_mlp": 1.01374364, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8090362112321295, + "language_loss": 0.60199535, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6224761, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4856, + "time_per_iteration": 3.164905309677124 + }, + { + "auxiliary_loss_clip": 0.01130578, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.04626632, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 2.394144218040463, + "language_loss": 0.67995465, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70166028, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4857, + "time_per_iteration": 2.4615678787231445 + }, + { + "auxiliary_loss_clip": 0.01129998, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.0210768, + "balance_loss_mlp": 1.04613733, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 2.1807634638236566, + "language_loss": 0.83958411, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86124158, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4858, + "time_per_iteration": 2.561347723007202 + }, + { + "auxiliary_loss_clip": 0.01131346, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.02478647, + "balance_loss_mlp": 1.04746854, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0714117361066298, + "language_loss": 0.77547097, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79718083, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4859, + "time_per_iteration": 2.4801361560821533 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02764332, + "balance_loss_mlp": 1.04424477, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.0548529873010564, + "language_loss": 0.68948561, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71125209, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4860, + "time_per_iteration": 2.531022071838379 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.0267868, + "balance_loss_mlp": 1.04821134, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.771004145303475, + "language_loss": 0.75952631, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78123146, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.81640625, + "step": 4861, + "time_per_iteration": 2.619257926940918 + }, + { + "auxiliary_loss_clip": 0.01129568, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.03350759, + "balance_loss_mlp": 1.04631817, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.7230129115334698, + "language_loss": 0.91648388, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93826073, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4862, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.02336144, + "balance_loss_mlp": 1.04544663, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.061794510539927, + "language_loss": 0.73736131, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75904131, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 4863, + "time_per_iteration": 2.4478728771209717 + }, + { + "auxiliary_loss_clip": 0.01125934, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.02474487, + "balance_loss_mlp": 1.04584527, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.6779515608592832, + "language_loss": 0.78057373, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80222106, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 4864, + "time_per_iteration": 2.487544059753418 + }, + { + "auxiliary_loss_clip": 0.0113348, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.02927482, + "balance_loss_mlp": 1.04763806, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 2.699456605470703, + "language_loss": 0.81919956, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8409909, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4865, + "time_per_iteration": 2.486553192138672 + }, + { + "auxiliary_loss_clip": 0.01130825, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02523327, + "balance_loss_mlp": 1.04592669, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.8393536761495908, + "language_loss": 0.85281575, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87453377, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4866, + "time_per_iteration": 2.4981276988983154 + }, + { + "auxiliary_loss_clip": 0.01124877, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.02263868, + "balance_loss_mlp": 1.04323506, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.627734535935432, + "language_loss": 0.755858, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77747923, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 4867, + "time_per_iteration": 2.5813703536987305 + }, + { + "auxiliary_loss_clip": 0.01129928, + "auxiliary_loss_mlp": 0.01049325, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.04375887, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 4.179606236398783, + "language_loss": 0.73403615, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75582874, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4868, + "time_per_iteration": 2.48374342918396 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.01857829, + "balance_loss_mlp": 1.04520726, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3015957921166281, + "language_loss": 0.74555755, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76717293, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4869, + "time_per_iteration": 2.458434820175171 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.02390289, + "balance_loss_mlp": 1.04639244, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.465584897312906, + "language_loss": 0.76539874, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78709823, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4870, + "time_per_iteration": 2.5194873809814453 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.03040564, + "balance_loss_mlp": 1.04584765, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.259080578005736, + "language_loss": 0.67315602, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69495422, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4871, + "time_per_iteration": 2.4556169509887695 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.04283524, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.8081222369362746, + "language_loss": 0.76924586, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79089642, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4872, + "time_per_iteration": 2.575421094894409 + }, + { + "auxiliary_loss_clip": 0.01131072, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.0271883, + "balance_loss_mlp": 1.04527128, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.2968152323379347, + "language_loss": 0.72835052, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75009787, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4873, + "time_per_iteration": 2.4370815753936768 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02518392, + "balance_loss_mlp": 1.04519463, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8384173868300016, + "language_loss": 0.77871835, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80046785, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4874, + "time_per_iteration": 2.512613534927368 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.02962041, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.084283832751276, + "language_loss": 0.77047002, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79228717, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4875, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.04909277, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6806867883636405, + "language_loss": 0.69183826, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71354383, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4876, + "time_per_iteration": 2.4764888286590576 + }, + { + "auxiliary_loss_clip": 0.01128897, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.03027201, + "balance_loss_mlp": 1.04482532, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.3621737524413913, + "language_loss": 0.8195532, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84129333, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4877, + "time_per_iteration": 2.4738340377807617 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02765405, + "balance_loss_mlp": 1.04704273, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.8654341954981455, + "language_loss": 0.67843962, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70020854, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 4878, + "time_per_iteration": 2.4606332778930664 + }, + { + "auxiliary_loss_clip": 0.01130502, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02624929, + "balance_loss_mlp": 1.04562759, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8286598598322423, + "language_loss": 0.7351383, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.7568571, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 4879, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.02881706, + "balance_loss_mlp": 1.0484302, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.0641755158914634, + "language_loss": 0.65864384, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68047822, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4880, + "time_per_iteration": 2.5785939693450928 + }, + { + "auxiliary_loss_clip": 0.01130839, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.02962136, + "balance_loss_mlp": 1.04453218, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.157512175932489, + "language_loss": 0.70518327, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72694737, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4881, + "time_per_iteration": 2.4913742542266846 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.02336192, + "balance_loss_mlp": 1.0471015, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 2.112776228996839, + "language_loss": 0.83907056, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86079299, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4882, + "time_per_iteration": 2.4955010414123535 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.02495086, + "balance_loss_mlp": 1.0470233, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.198383771985309, + "language_loss": 0.71811014, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73988116, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4883, + "time_per_iteration": 2.474574089050293 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02225327, + "balance_loss_mlp": 1.04580843, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 3.497082861184858, + "language_loss": 0.92629534, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94800568, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4884, + "time_per_iteration": 2.4947426319122314 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.03032374, + "balance_loss_mlp": 1.05094171, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.2315982417854876, + "language_loss": 0.73729408, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75913155, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4885, + "time_per_iteration": 2.5076494216918945 + }, + { + "auxiliary_loss_clip": 0.01132864, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.03185511, + "balance_loss_mlp": 1.0468272, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.1964333946604135, + "language_loss": 0.85011208, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87192315, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4886, + "time_per_iteration": 3.911407232284546 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.02624702, + "balance_loss_mlp": 1.04678059, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.1393217933297657, + "language_loss": 0.77027792, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79204369, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.88671875, + "step": 4887, + "time_per_iteration": 3.906132936477661 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.04697633, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1952396364021536, + "language_loss": 0.79558414, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.8174094, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 4888, + "time_per_iteration": 2.4338221549987793 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.0173831, + "balance_loss_mlp": 1.04529762, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.4299668586503376, + "language_loss": 0.55301261, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57464457, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4889, + "time_per_iteration": 2.637645959854126 + }, + { + "auxiliary_loss_clip": 0.01134449, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.02314413, + "balance_loss_mlp": 1.0465076, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.9477461279926194, + "language_loss": 0.84309214, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86483455, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4890, + "time_per_iteration": 2.445218801498413 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.04780436, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.9951401673219091, + "language_loss": 0.72357798, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74532759, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4891, + "time_per_iteration": 2.434298515319824 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.02910721, + "balance_loss_mlp": 1.04683042, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.9834299238301316, + "language_loss": 0.77230573, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79410005, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4892, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01130172, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.04514182, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.7053650125053033, + "language_loss": 0.7846024, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80626166, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4893, + "time_per_iteration": 2.505946159362793 + }, + { + "auxiliary_loss_clip": 0.01132333, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.03252435, + "balance_loss_mlp": 1.04651928, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8389301673785101, + "language_loss": 0.85052156, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87232608, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4894, + "time_per_iteration": 2.5221872329711914 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.02120304, + "balance_loss_mlp": 1.04568195, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.595597690193387, + "language_loss": 0.9027828, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92447418, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4895, + "time_per_iteration": 2.4466798305511475 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.02434874, + "balance_loss_mlp": 1.04720199, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 3.001231056574592, + "language_loss": 0.86597103, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88773751, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4896, + "time_per_iteration": 2.459611654281616 + }, + { + "auxiliary_loss_clip": 0.01134294, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.03402412, + "balance_loss_mlp": 1.04802299, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 2.652800133974417, + "language_loss": 0.73196733, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75381136, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4897, + "time_per_iteration": 2.4981348514556885 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02546394, + "balance_loss_mlp": 1.0458895, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.867954953207583, + "language_loss": 0.73798919, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75977707, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4898, + "time_per_iteration": 2.439952850341797 + }, + { + "auxiliary_loss_clip": 0.01142949, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.02471972, + "balance_loss_mlp": 1.05136585, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.6754375338801477, + "language_loss": 0.70309317, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72495157, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9140625, + "step": 4899, + "time_per_iteration": 2.4757347106933594 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.02458405, + "balance_loss_mlp": 1.0484879, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.9063479453414416, + "language_loss": 0.79007781, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.8118515, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4900, + "time_per_iteration": 2.50555419921875 + }, + { + "auxiliary_loss_clip": 0.01131673, + "auxiliary_loss_mlp": 0.01042831, + "balance_loss_clip": 1.02720261, + "balance_loss_mlp": 1.04425764, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.709443882500664, + "language_loss": 0.80718857, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.8289336, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4901, + "time_per_iteration": 2.481768846511841 + }, + { + "auxiliary_loss_clip": 0.01127885, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.02062666, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 1.9567596526300628, + "language_loss": 0.57923675, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60086584, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4902, + "time_per_iteration": 2.491337299346924 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.02416682, + "balance_loss_mlp": 1.045946, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6713771638909152, + "language_loss": 0.75298065, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77468932, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4903, + "time_per_iteration": 2.4884400367736816 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.02560806, + "balance_loss_mlp": 1.04630995, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.8012466742437707, + "language_loss": 0.6254617, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64721614, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4904, + "time_per_iteration": 2.5288941860198975 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.02252424, + "balance_loss_mlp": 1.04603219, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.5173763027357385, + "language_loss": 0.7301079, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75183994, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 4905, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02577305, + "balance_loss_mlp": 1.0456897, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6701950888056076, + "language_loss": 0.81584871, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.8375839, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4906, + "time_per_iteration": 2.473604202270508 + }, + { + "auxiliary_loss_clip": 0.01128251, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.04443395, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9494272179492087, + "language_loss": 0.87158448, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89320892, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4907, + "time_per_iteration": 2.490842819213867 + }, + { + "auxiliary_loss_clip": 0.01135464, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.02623653, + "balance_loss_mlp": 1.04758191, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.3387997458884833, + "language_loss": 0.81563503, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83741152, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4908, + "time_per_iteration": 2.4586410522460938 + }, + { + "auxiliary_loss_clip": 0.01054339, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.03970814, + "balance_loss_mlp": 1.0157342, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7811313355607663, + "language_loss": 0.57214808, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59311211, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.38671875, + "step": 4909, + "time_per_iteration": 2.9739394187927246 + }, + { + "auxiliary_loss_clip": 0.01136234, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.02340245, + "balance_loss_mlp": 1.05156505, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.44395719574742, + "language_loss": 0.86585498, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88760138, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4910, + "time_per_iteration": 2.4779117107391357 + }, + { + "auxiliary_loss_clip": 0.01126914, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02551222, + "balance_loss_mlp": 1.04549575, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8630755123750513, + "language_loss": 0.72632295, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.74799585, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 4911, + "time_per_iteration": 2.4959700107574463 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02239108, + "balance_loss_mlp": 1.04823601, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.774615067737937, + "language_loss": 0.8988539, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92053854, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4912, + "time_per_iteration": 2.4532997608184814 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.03022075, + "balance_loss_mlp": 1.04712319, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.458226475428025, + "language_loss": 0.83448595, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85626793, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 4913, + "time_per_iteration": 2.515580654144287 + }, + { + "auxiliary_loss_clip": 0.01129704, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.02385521, + "balance_loss_mlp": 1.0438993, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.6602062940724112, + "language_loss": 0.77029538, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79198408, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4914, + "time_per_iteration": 2.457158088684082 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.02302349, + "balance_loss_mlp": 1.04553497, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.9027466376674422, + "language_loss": 0.81550008, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83717597, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 4915, + "time_per_iteration": 2.6669511795043945 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.02215445, + "balance_loss_mlp": 1.0477066, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.9544787473030132, + "language_loss": 0.84415555, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8658756, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4916, + "time_per_iteration": 2.473867416381836 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.01874673, + "balance_loss_mlp": 1.04477537, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 3.5737730841451225, + "language_loss": 0.69611692, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71773368, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4917, + "time_per_iteration": 2.5078670978546143 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.02550471, + "balance_loss_mlp": 1.04932523, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1750223310256507, + "language_loss": 0.90840054, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93015605, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 4918, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.02504885, + "balance_loss_mlp": 1.04929781, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9164121886210477, + "language_loss": 0.72399461, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74574864, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4919, + "time_per_iteration": 2.5533134937286377 + }, + { + "auxiliary_loss_clip": 0.01130751, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.02661633, + "balance_loss_mlp": 1.04704165, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7148380002351797, + "language_loss": 0.75758076, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77931356, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4920, + "time_per_iteration": 2.4288933277130127 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_clip": 1.02953875, + "balance_loss_mlp": 1.05214858, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.2591712667141075, + "language_loss": 0.68327153, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.7051155, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4921, + "time_per_iteration": 2.5978074073791504 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02580023, + "balance_loss_mlp": 1.04953861, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8781945072150448, + "language_loss": 0.74265885, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76444781, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4922, + "time_per_iteration": 2.4518954753875732 + }, + { + "auxiliary_loss_clip": 0.0113841, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.01932716, + "balance_loss_mlp": 1.04900336, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 2.178664992776949, + "language_loss": 0.76679426, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78853875, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4923, + "time_per_iteration": 2.5565848350524902 + }, + { + "auxiliary_loss_clip": 0.0113218, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.02359807, + "balance_loss_mlp": 1.04730439, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.616043641477794, + "language_loss": 0.86307567, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88479245, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4924, + "time_per_iteration": 2.5081374645233154 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.04767513, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.3983202546472309, + "language_loss": 0.8180936, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.83976275, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4925, + "time_per_iteration": 2.5473146438598633 + }, + { + "auxiliary_loss_clip": 0.01132696, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02175128, + "balance_loss_mlp": 1.04893184, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5479827750219735, + "language_loss": 0.85168374, + "learning_rate": 3.301729463727452e-06, + "loss": 0.87337816, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4926, + "time_per_iteration": 2.4603803157806396 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.02391791, + "balance_loss_mlp": 1.04658842, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.1014080951069913, + "language_loss": 0.85908806, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88081133, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4927, + "time_per_iteration": 2.4724504947662354 + }, + { + "auxiliary_loss_clip": 0.01129564, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.02434492, + "balance_loss_mlp": 1.04636681, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.8730507383843338, + "language_loss": 0.80967462, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83136487, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4928, + "time_per_iteration": 5.46146297454834 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.023211, + "balance_loss_mlp": 1.04749835, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.002605920988437, + "language_loss": 0.72472513, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7465297, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.91015625, + "step": 4929, + "time_per_iteration": 2.4938502311706543 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02631676, + "balance_loss_mlp": 1.04823208, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.429634231323073, + "language_loss": 0.72424346, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74603939, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8828125, + "step": 4930, + "time_per_iteration": 2.486492156982422 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01002568, + "balance_loss_clip": 1.00047004, + "balance_loss_mlp": 1.0186131, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8134562784526058, + "language_loss": 0.60710716, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.627729, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.41015625, + "step": 4931, + "time_per_iteration": 3.002444267272949 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 0.99926931, + "balance_loss_mlp": 1.01823413, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7413672345708404, + "language_loss": 0.52383232, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54443383, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.40625, + "step": 4932, + "time_per_iteration": 2.974777936935425 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 1.04449248, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 3.155895790893495, + "language_loss": 0.81622797, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83789599, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4933, + "time_per_iteration": 2.518906593322754 + }, + { + "auxiliary_loss_clip": 0.0112788, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.01854897, + "balance_loss_mlp": 1.04651821, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.671865304120784, + "language_loss": 0.75257647, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77419287, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4934, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.02837849, + "balance_loss_mlp": 1.04699588, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.752558919138232, + "language_loss": 0.62510157, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64690268, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4935, + "time_per_iteration": 2.462982654571533 + }, + { + "auxiliary_loss_clip": 0.01129673, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04613912, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4993711353436514, + "language_loss": 0.79789758, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81961262, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 4936, + "time_per_iteration": 2.5267326831817627 + }, + { + "auxiliary_loss_clip": 0.01132719, + "auxiliary_loss_mlp": 0.01045272, + "balance_loss_clip": 1.02854681, + "balance_loss_mlp": 1.04649782, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8807271027259396, + "language_loss": 0.74074632, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76252627, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4937, + "time_per_iteration": 2.607790946960449 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.0224793, + "balance_loss_mlp": 1.04839468, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.629632810423829, + "language_loss": 0.7804476, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80216354, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 4938, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01134705, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.02543736, + "balance_loss_mlp": 1.04814208, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.041677851061636, + "language_loss": 0.77017808, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79193771, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4939, + "time_per_iteration": 2.453615427017212 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.04958081, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5588161926919628, + "language_loss": 0.78206903, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80380619, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4940, + "time_per_iteration": 2.5125393867492676 + }, + { + "auxiliary_loss_clip": 0.01133351, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.0233798, + "balance_loss_mlp": 1.04633832, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 3.9307439231373884, + "language_loss": 0.75487554, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77661633, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4941, + "time_per_iteration": 2.5308516025543213 + }, + { + "auxiliary_loss_clip": 0.0113684, + "auxiliary_loss_mlp": 0.01050296, + "balance_loss_clip": 1.03295147, + "balance_loss_mlp": 1.04803753, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.557458362521145, + "language_loss": 0.73998737, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7618587, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.890625, + "step": 4942, + "time_per_iteration": 2.6214303970336914 + }, + { + "auxiliary_loss_clip": 0.0113696, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.03017855, + "balance_loss_mlp": 1.04778039, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.997792424787015, + "language_loss": 0.70484138, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72668344, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4943, + "time_per_iteration": 2.533313751220703 + }, + { + "auxiliary_loss_clip": 0.01137748, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.02657795, + "balance_loss_mlp": 1.04838014, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 1.9523342898428475, + "language_loss": 0.80111414, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82292169, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 4944, + "time_per_iteration": 2.464364528656006 + }, + { + "auxiliary_loss_clip": 0.01129992, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.02979231, + "balance_loss_mlp": 1.04640603, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.1633352367153105, + "language_loss": 0.83451837, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85626531, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4945, + "time_per_iteration": 2.4981510639190674 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.02353168, + "balance_loss_mlp": 1.04738569, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.2158088930062747, + "language_loss": 0.66624904, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68795776, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4946, + "time_per_iteration": 2.526228666305542 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.01880383, + "balance_loss_mlp": 1.0509392, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.7941079108563611, + "language_loss": 0.73766255, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75938767, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4947, + "time_per_iteration": 2.5380265712738037 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_clip": 1.02774215, + "balance_loss_mlp": 1.04653597, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.40735653244717, + "language_loss": 0.7330308, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75483221, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4948, + "time_per_iteration": 2.5096492767333984 + }, + { + "auxiliary_loss_clip": 0.01129361, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01824629, + "balance_loss_mlp": 1.04442465, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0973131899278825, + "language_loss": 0.84031421, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86194396, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4949, + "time_per_iteration": 2.4650402069091797 + }, + { + "auxiliary_loss_clip": 0.01129505, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02019429, + "balance_loss_mlp": 1.04509461, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.77267818675948, + "language_loss": 0.71322602, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4950, + "time_per_iteration": 2.491163969039917 + }, + { + "auxiliary_loss_clip": 0.01127031, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.02792883, + "balance_loss_mlp": 1.04543924, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 1.7996518465212372, + "language_loss": 0.82192945, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84363329, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4951, + "time_per_iteration": 2.5001299381256104 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01936841, + "balance_loss_mlp": 1.04211378, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.6816702718299763, + "language_loss": 0.73421168, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.75584191, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 4952, + "time_per_iteration": 2.4888715744018555 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.04677546, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.7548041314188605, + "language_loss": 0.83702904, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85876799, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4953, + "time_per_iteration": 2.486267566680908 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02330005, + "balance_loss_mlp": 1.04566419, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.078619348093555, + "language_loss": 0.74560732, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.7673102, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4954, + "time_per_iteration": 2.454066276550293 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.0450201, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9786208165821892, + "language_loss": 0.75643009, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77808911, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4955, + "time_per_iteration": 2.487297773361206 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.01980329, + "balance_loss_mlp": 1.04604173, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 3.347495877937089, + "language_loss": 0.72235912, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74404275, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4956, + "time_per_iteration": 2.453639507293701 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.02737164, + "balance_loss_mlp": 1.04482651, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6786835957024704, + "language_loss": 0.79504669, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81683344, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4957, + "time_per_iteration": 2.4680192470550537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.04692602, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5505958112028584, + "language_loss": 0.70515305, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.7268889, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4958, + "time_per_iteration": 2.463550090789795 + }, + { + "auxiliary_loss_clip": 0.01130665, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.02685153, + "balance_loss_mlp": 1.04660892, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.6483091075690746, + "language_loss": 0.78709656, + "learning_rate": 3.291945317082743e-06, + "loss": 0.8088336, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4959, + "time_per_iteration": 2.4896273612976074 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.03010738, + "balance_loss_mlp": 1.04477429, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.8058675414038505, + "language_loss": 0.79814601, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81990159, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4960, + "time_per_iteration": 2.4524307250976562 + }, + { + "auxiliary_loss_clip": 0.01133353, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04504156, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.8105894923901418, + "language_loss": 0.73709917, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75890362, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4961, + "time_per_iteration": 2.463160991668701 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.0466218, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.946317435202559, + "language_loss": 0.62041843, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64212298, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 4962, + "time_per_iteration": 2.4734280109405518 + }, + { + "auxiliary_loss_clip": 0.0113099, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02143037, + "balance_loss_mlp": 1.04580986, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.708438122809617, + "language_loss": 0.83075964, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85244966, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 4963, + "time_per_iteration": 2.4676647186279297 + }, + { + "auxiliary_loss_clip": 0.01132139, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.02108073, + "balance_loss_mlp": 1.04811728, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.8539744131594924, + "language_loss": 0.66537225, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68706906, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 4964, + "time_per_iteration": 2.425261974334717 + }, + { + "auxiliary_loss_clip": 0.01128116, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.01994288, + "balance_loss_mlp": 1.04498291, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.6142193033036512, + "language_loss": 0.70836121, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.72998774, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83203125, + "step": 4965, + "time_per_iteration": 2.468221664428711 + }, + { + "auxiliary_loss_clip": 0.01137695, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_clip": 1.0309124, + "balance_loss_mlp": 1.05098724, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.501073720290292, + "language_loss": 0.66185117, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68369937, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4966, + "time_per_iteration": 2.479327440261841 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.02595615, + "balance_loss_mlp": 1.04869342, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.7651343279829215, + "language_loss": 0.74186444, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76362395, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4967, + "time_per_iteration": 2.4752163887023926 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02035177, + "balance_loss_mlp": 1.04422212, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9700123684688966, + "language_loss": 0.71222222, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73386747, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8515625, + "step": 4968, + "time_per_iteration": 2.448028564453125 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.01884651, + "balance_loss_mlp": 1.04596853, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.0898000655075752, + "language_loss": 0.77127141, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79292667, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 4969, + "time_per_iteration": 2.5737853050231934 + }, + { + "auxiliary_loss_clip": 0.01131698, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.022578, + "balance_loss_mlp": 1.04641569, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.5683816051841135, + "language_loss": 0.69798505, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.71967924, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4970, + "time_per_iteration": 5.428143501281738 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04582572, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.0403310419369314, + "language_loss": 0.85269564, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.8745082, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4971, + "time_per_iteration": 2.4557158946990967 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.02905178, + "balance_loss_mlp": 1.0487361, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.8300460221108372, + "language_loss": 0.79116535, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81292605, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4972, + "time_per_iteration": 2.492119550704956 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.02457476, + "balance_loss_mlp": 1.0491786, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.9080397703774756, + "language_loss": 0.85019803, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87194014, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4973, + "time_per_iteration": 2.4409923553466797 + }, + { + "auxiliary_loss_clip": 0.01128243, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.04866779, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5302170897903997, + "language_loss": 0.77397263, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79562438, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 4974, + "time_per_iteration": 2.4786176681518555 + }, + { + "auxiliary_loss_clip": 0.01135129, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02723432, + "balance_loss_mlp": 1.04905188, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 2.0911748108299015, + "language_loss": 0.72264957, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74442089, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 4975, + "time_per_iteration": 2.5267655849456787 + }, + { + "auxiliary_loss_clip": 0.01133427, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02204621, + "balance_loss_mlp": 1.0501368, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 4.957635138610608, + "language_loss": 0.76028466, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78199953, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 4976, + "time_per_iteration": 2.46476149559021 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01039669, + "balance_loss_clip": 1.02464914, + "balance_loss_mlp": 1.04786563, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.141179611311424, + "language_loss": 0.86060619, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88231456, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 4977, + "time_per_iteration": 2.4342682361602783 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.02605033, + "balance_loss_mlp": 1.0510987, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6147948075287948, + "language_loss": 0.68286109, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7046386, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4978, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.01138133, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.053123, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.9781984123500023, + "language_loss": 0.7654568, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78720796, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4979, + "time_per_iteration": 2.4865188598632812 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.02265859, + "balance_loss_mlp": 1.04520524, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7578947600277828, + "language_loss": 0.68300819, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70469534, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4980, + "time_per_iteration": 2.6137757301330566 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02396405, + "balance_loss_mlp": 1.05068171, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.9242198828448243, + "language_loss": 0.73239923, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75411171, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 4981, + "time_per_iteration": 2.5342931747436523 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.02227712, + "balance_loss_mlp": 1.04691803, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.470312251429405, + "language_loss": 0.86429024, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8860175, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4982, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.01138194, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04975057, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.2481661066872904, + "language_loss": 0.86378068, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88557541, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4983, + "time_per_iteration": 2.4477322101593018 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.02577138, + "balance_loss_mlp": 1.0483377, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.8474343514891325, + "language_loss": 0.78286207, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80460417, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4984, + "time_per_iteration": 2.490079402923584 + }, + { + "auxiliary_loss_clip": 0.01136807, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.05052662, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.555514289558953, + "language_loss": 0.78418988, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80601943, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4985, + "time_per_iteration": 2.5188379287719727 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.02115583, + "balance_loss_mlp": 1.05010915, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 3.8074401298215905, + "language_loss": 0.72157449, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74333715, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4986, + "time_per_iteration": 2.7730660438537598 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.02523577, + "balance_loss_mlp": 1.04813981, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.7357810931981628, + "language_loss": 0.73332191, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75509989, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4987, + "time_per_iteration": 2.4857406616210938 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02162147, + "balance_loss_mlp": 1.04787469, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 2.6184059112472817, + "language_loss": 0.80173379, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82341629, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4988, + "time_per_iteration": 2.477614641189575 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.02874756, + "balance_loss_mlp": 1.04897678, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.615528223125509, + "language_loss": 0.70302641, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72481132, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4989, + "time_per_iteration": 2.4942874908447266 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02382946, + "balance_loss_mlp": 1.05045295, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 2.0547136882256654, + "language_loss": 0.85636222, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87814367, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87109375, + "step": 4990, + "time_per_iteration": 2.455134391784668 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.04822564, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.6641511475566748, + "language_loss": 0.67125142, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69296378, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4991, + "time_per_iteration": 2.4928019046783447 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.02165437, + "balance_loss_mlp": 1.0479908, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.5868946812173, + "language_loss": 0.78707612, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80883896, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4992, + "time_per_iteration": 2.5030534267425537 + }, + { + "auxiliary_loss_clip": 0.01135049, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04976213, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.8035914694742925, + "language_loss": 0.824085, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84579718, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4993, + "time_per_iteration": 2.475588083267212 + }, + { + "auxiliary_loss_clip": 0.01138101, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.02522802, + "balance_loss_mlp": 1.04808736, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.0505124462232898, + "language_loss": 0.85850489, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88031358, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4994, + "time_per_iteration": 2.47881817817688 + }, + { + "auxiliary_loss_clip": 0.0113641, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.05017769, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5183999234373478, + "language_loss": 0.8111707, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83289921, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4995, + "time_per_iteration": 2.5481183528900146 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.05089867, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7074459415862762, + "language_loss": 0.67098773, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69274354, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 4996, + "time_per_iteration": 2.6810193061828613 + }, + { + "auxiliary_loss_clip": 0.01134671, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.02392912, + "balance_loss_mlp": 1.04883564, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7509046873587113, + "language_loss": 0.75304276, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77479029, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4997, + "time_per_iteration": 2.472226858139038 + }, + { + "auxiliary_loss_clip": 0.01132042, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.02372646, + "balance_loss_mlp": 1.04816282, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.9401125864941864, + "language_loss": 0.77664721, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79835731, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83984375, + "step": 4998, + "time_per_iteration": 2.495087146759033 + }, + { + "auxiliary_loss_clip": 0.01129805, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02469468, + "balance_loss_mlp": 1.04812598, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5996751316274151, + "language_loss": 0.73429006, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75598228, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 4999, + "time_per_iteration": 2.491774082183838 + }, + { + "auxiliary_loss_clip": 0.01134839, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.02250576, + "balance_loss_mlp": 1.0498935, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6017930279588588, + "language_loss": 0.756015, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77774298, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5000, + "time_per_iteration": 2.572003126144409 + }, + { + "auxiliary_loss_clip": 0.01131295, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.02329731, + "balance_loss_mlp": 1.04886353, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.977226227337592, + "language_loss": 0.81681275, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83849311, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.82421875, + "step": 5001, + "time_per_iteration": 2.4240355491638184 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02538288, + "balance_loss_mlp": 1.05103087, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5846802536013025, + "language_loss": 0.8056432, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82741892, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 5002, + "time_per_iteration": 2.5848264694213867 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.02240372, + "balance_loss_mlp": 1.04907179, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.6918091030667293, + "language_loss": 0.71209854, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73384899, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 5003, + "time_per_iteration": 2.4672186374664307 + }, + { + "auxiliary_loss_clip": 0.01136595, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.05050564, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.8725932973877313, + "language_loss": 0.70613277, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72788501, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5004, + "time_per_iteration": 2.579941511154175 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.04977477, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.8634075898885767, + "language_loss": 0.81359464, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83539397, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5005, + "time_per_iteration": 2.4043233394622803 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04792035, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.9232502202927266, + "language_loss": 0.74906754, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77080745, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5006, + "time_per_iteration": 2.5169718265533447 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02712977, + "balance_loss_mlp": 1.04745531, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.157802275476472, + "language_loss": 0.70810544, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72982514, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5007, + "time_per_iteration": 2.500135898590088 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.02453065, + "balance_loss_mlp": 1.04947257, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.301214894203853, + "language_loss": 0.76435697, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78609765, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5008, + "time_per_iteration": 2.5071120262145996 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.02464485, + "balance_loss_mlp": 1.04823518, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.7973688674758703, + "language_loss": 0.84830707, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87002409, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5009, + "time_per_iteration": 2.531024694442749 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.0211432, + "balance_loss_mlp": 1.04830122, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9976209282841157, + "language_loss": 0.83813334, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85986781, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 5010, + "time_per_iteration": 2.4690375328063965 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02003646, + "balance_loss_mlp": 1.04724431, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 1.9597018241269177, + "language_loss": 0.85013181, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87178147, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5011, + "time_per_iteration": 2.501708745956421 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.02335644, + "balance_loss_mlp": 1.04754543, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.0524404295798013, + "language_loss": 0.71966654, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74139971, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5012, + "time_per_iteration": 3.979128360748291 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.0229032, + "balance_loss_mlp": 1.04721081, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 1.9997819947408795, + "language_loss": 0.87396109, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89565563, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 5013, + "time_per_iteration": 2.467177629470825 + }, + { + "auxiliary_loss_clip": 0.01136565, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.0200367, + "balance_loss_mlp": 1.04842985, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 3.4702040063697313, + "language_loss": 0.83367115, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85540557, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 5014, + "time_per_iteration": 2.4654901027679443 + }, + { + "auxiliary_loss_clip": 0.01128425, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.01896727, + "balance_loss_mlp": 1.0471499, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6346146355602116, + "language_loss": 0.68218327, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70380276, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5015, + "time_per_iteration": 2.4994328022003174 + }, + { + "auxiliary_loss_clip": 0.01132371, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.04864407, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.7110353723362635, + "language_loss": 0.74712509, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76883423, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5016, + "time_per_iteration": 2.5168755054473877 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 1.0498333, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.6963436015958502, + "language_loss": 0.65179884, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67351693, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5017, + "time_per_iteration": 2.543577194213867 + }, + { + "auxiliary_loss_clip": 0.01134511, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.02860379, + "balance_loss_mlp": 1.05030179, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.078433105892768, + "language_loss": 0.69045079, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71223348, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5018, + "time_per_iteration": 2.498060464859009 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.02772546, + "balance_loss_mlp": 1.04842138, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9198297669603306, + "language_loss": 0.78841144, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81011814, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5019, + "time_per_iteration": 2.4873573780059814 + }, + { + "auxiliary_loss_clip": 0.01134625, + "auxiliary_loss_mlp": 0.01041878, + "balance_loss_clip": 1.02695298, + "balance_loss_mlp": 1.05073094, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.24109756344656, + "language_loss": 0.69867152, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72043651, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5020, + "time_per_iteration": 2.493370532989502 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.04941368, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9013759847828555, + "language_loss": 0.78134364, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80310869, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8671875, + "step": 5021, + "time_per_iteration": 2.4670474529266357 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.01527357, + "balance_loss_mlp": 1.04964936, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.3821225807179696, + "language_loss": 0.76075405, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78238434, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5022, + "time_per_iteration": 2.4737884998321533 + }, + { + "auxiliary_loss_clip": 0.01133657, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.02791631, + "balance_loss_mlp": 1.04880631, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.7684005868111572, + "language_loss": 0.69896525, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72073108, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5023, + "time_per_iteration": 2.4453155994415283 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02673686, + "balance_loss_mlp": 1.04927671, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.0912728997662127, + "language_loss": 0.71588898, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73766768, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5024, + "time_per_iteration": 2.4998810291290283 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.04858792, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.6483742353836974, + "language_loss": 0.73955721, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76133543, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5025, + "time_per_iteration": 2.5167019367218018 + }, + { + "auxiliary_loss_clip": 0.0113244, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.02800322, + "balance_loss_mlp": 1.0518502, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.4799709397217862, + "language_loss": 0.67022824, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.6919747, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5026, + "time_per_iteration": 2.5326507091522217 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.02799106, + "balance_loss_mlp": 1.05083036, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.6876842646939136, + "language_loss": 0.85252607, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87429863, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5027, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01129327, + "auxiliary_loss_mlp": 0.01045705, + "balance_loss_clip": 1.03187656, + "balance_loss_mlp": 1.04739702, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.665552114762065, + "language_loss": 0.78757018, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80932051, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5028, + "time_per_iteration": 2.5677576065063477 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.0295043, + "balance_loss_mlp": 1.04922223, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 2.0260385179345346, + "language_loss": 0.76721144, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78898472, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.83203125, + "step": 5029, + "time_per_iteration": 2.611917734146118 + }, + { + "auxiliary_loss_clip": 0.01133028, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.02642775, + "balance_loss_mlp": 1.04855132, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.944959289407135, + "language_loss": 0.81868339, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84044701, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.84375, + "step": 5030, + "time_per_iteration": 2.605531930923462 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01045193, + "balance_loss_clip": 1.02982664, + "balance_loss_mlp": 1.04754734, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.748277903644489, + "language_loss": 0.69869608, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72047728, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 5031, + "time_per_iteration": 2.496833086013794 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.02103615, + "balance_loss_mlp": 1.04892659, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.8467264077922103, + "language_loss": 0.82302773, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84471118, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5032, + "time_per_iteration": 2.5062966346740723 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.02991903, + "balance_loss_mlp": 1.05332685, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.10117653020426, + "language_loss": 0.73383862, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75570583, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5033, + "time_per_iteration": 2.561467170715332 + }, + { + "auxiliary_loss_clip": 0.01130223, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03000593, + "balance_loss_mlp": 1.04782772, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.437497934350084, + "language_loss": 0.74057245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76232684, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5034, + "time_per_iteration": 2.511861801147461 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.04825819, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9672144407329994, + "language_loss": 0.71617639, + "learning_rate": 3.269209883493352e-06, + "loss": 0.73783064, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5035, + "time_per_iteration": 2.545917272567749 + }, + { + "auxiliary_loss_clip": 0.0113067, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01835537, + "balance_loss_mlp": 1.04876685, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.774174351542542, + "language_loss": 0.87232339, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89395267, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5036, + "time_per_iteration": 2.5197184085845947 + }, + { + "auxiliary_loss_clip": 0.01131426, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.02679288, + "balance_loss_mlp": 1.04866219, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.2121077897300134, + "language_loss": 0.77760899, + "learning_rate": 3.268607806688536e-06, + "loss": 0.7993536, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5037, + "time_per_iteration": 2.5372917652130127 + }, + { + "auxiliary_loss_clip": 0.01133533, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02603984, + "balance_loss_mlp": 1.04973745, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.4260021818478634, + "language_loss": 0.77920854, + "learning_rate": 3.268306696121816e-06, + "loss": 0.80095863, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5038, + "time_per_iteration": 2.4360761642456055 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.02073669, + "balance_loss_mlp": 1.04859674, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.8428508909689656, + "language_loss": 0.74134624, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76298141, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5039, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01129815, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.02426052, + "balance_loss_mlp": 1.05003977, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.8268154911840482, + "language_loss": 0.80263746, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82431436, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5040, + "time_per_iteration": 2.469822406768799 + }, + { + "auxiliary_loss_clip": 0.01131744, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.02227795, + "balance_loss_mlp": 1.05101466, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.5747579863116856, + "language_loss": 0.81914759, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8408277, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5041, + "time_per_iteration": 2.5240108966827393 + }, + { + "auxiliary_loss_clip": 0.01062494, + "auxiliary_loss_mlp": 0.01003022, + "balance_loss_clip": 1.00106716, + "balance_loss_mlp": 1.02890241, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7678965945904674, + "language_loss": 0.59521127, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61586642, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3359375, + "step": 5042, + "time_per_iteration": 3.169004440307617 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02271986, + "balance_loss_mlp": 1.05006266, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.6113397759888244, + "language_loss": 0.71136838, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73308468, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5043, + "time_per_iteration": 2.5217440128326416 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.04824769, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.6644669890018773, + "language_loss": 0.69351244, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71521056, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5044, + "time_per_iteration": 2.4741897583007812 + }, + { + "auxiliary_loss_clip": 0.01129908, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.02252388, + "balance_loss_mlp": 1.04823565, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3748845619029404, + "language_loss": 0.77210236, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79377484, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5045, + "time_per_iteration": 2.5023043155670166 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.02240646, + "balance_loss_mlp": 1.04892182, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.538768377317596, + "language_loss": 0.72444695, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74615347, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5046, + "time_per_iteration": 2.5163753032684326 + }, + { + "auxiliary_loss_clip": 0.01134062, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.04859519, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 3.2419373644374176, + "language_loss": 0.80737638, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82914352, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5047, + "time_per_iteration": 2.547245979309082 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02569556, + "balance_loss_mlp": 1.04871237, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.9357354539113198, + "language_loss": 0.72334075, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74505508, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5048, + "time_per_iteration": 2.494016170501709 + }, + { + "auxiliary_loss_clip": 0.01129755, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.02093613, + "balance_loss_mlp": 1.04574537, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.7731178616486785, + "language_loss": 0.75098324, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.7726388, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5049, + "time_per_iteration": 2.502979040145874 + }, + { + "auxiliary_loss_clip": 0.01133123, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.0226109, + "balance_loss_mlp": 1.04864645, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6762363098185904, + "language_loss": 0.8194561, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84116459, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5050, + "time_per_iteration": 2.5254666805267334 + }, + { + "auxiliary_loss_clip": 0.01132852, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.02299261, + "balance_loss_mlp": 1.04868484, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.8996577335854625, + "language_loss": 0.73712784, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7588439, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 5051, + "time_per_iteration": 2.511455774307251 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.01967764, + "balance_loss_mlp": 1.04650712, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.5939626777548828, + "language_loss": 0.76463652, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78628969, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5052, + "time_per_iteration": 2.478046417236328 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.04609728, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.8043694132732864, + "language_loss": 0.82780337, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84952009, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5053, + "time_per_iteration": 3.983353614807129 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.02661896, + "balance_loss_mlp": 1.04685903, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5364375285570075, + "language_loss": 0.70702368, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.72875059, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5054, + "time_per_iteration": 2.4379446506500244 + }, + { + "auxiliary_loss_clip": 0.01132155, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01768088, + "balance_loss_mlp": 1.04817367, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.8280069054430388, + "language_loss": 0.69543922, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71709108, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5055, + "time_per_iteration": 2.5247206687927246 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04682207, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.038005952710024, + "language_loss": 0.67502165, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69670427, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5056, + "time_per_iteration": 2.4767425060272217 + }, + { + "auxiliary_loss_clip": 0.01130078, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02434373, + "balance_loss_mlp": 1.04886115, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.5579435169669187, + "language_loss": 0.82500231, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84669387, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5057, + "time_per_iteration": 2.499105453491211 + }, + { + "auxiliary_loss_clip": 0.01129487, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02171588, + "balance_loss_mlp": 1.04686213, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 3.274565054245196, + "language_loss": 0.89040101, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91205966, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5058, + "time_per_iteration": 2.4966368675231934 + }, + { + "auxiliary_loss_clip": 0.01131903, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.04829955, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.2189779437975274, + "language_loss": 0.71709251, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73883629, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5059, + "time_per_iteration": 2.5429141521453857 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.0233928, + "balance_loss_mlp": 1.04720807, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 10.158939103063299, + "language_loss": 0.73069966, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75238669, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5060, + "time_per_iteration": 2.529862403869629 + }, + { + "auxiliary_loss_clip": 0.01132671, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.01979291, + "balance_loss_mlp": 1.04885316, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.8510962431794071, + "language_loss": 0.76926744, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79093957, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5061, + "time_per_iteration": 2.496739149093628 + }, + { + "auxiliary_loss_clip": 0.01138048, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.02312136, + "balance_loss_mlp": 1.0527482, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.264413063412747, + "language_loss": 0.82064837, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84242392, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5062, + "time_per_iteration": 2.476290702819824 + }, + { + "auxiliary_loss_clip": 0.01128914, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.04721808, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.7072945635391377, + "language_loss": 0.74737656, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76899219, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5063, + "time_per_iteration": 2.5384082794189453 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.0214901, + "balance_loss_mlp": 1.04908288, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.8176932093217915, + "language_loss": 0.84120226, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86290407, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83203125, + "step": 5064, + "time_per_iteration": 2.5108115673065186 + }, + { + "auxiliary_loss_clip": 0.01131651, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.01888871, + "balance_loss_mlp": 1.04751444, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.7759562417820063, + "language_loss": 0.75990027, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78156507, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 5065, + "time_per_iteration": 2.5061376094818115 + }, + { + "auxiliary_loss_clip": 0.01133071, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02470005, + "balance_loss_mlp": 1.04716659, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 2.0133457948817406, + "language_loss": 0.62271762, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64445394, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5066, + "time_per_iteration": 2.6000661849975586 + }, + { + "auxiliary_loss_clip": 0.01140413, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_clip": 1.03385913, + "balance_loss_mlp": 1.05344141, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.7828452375691122, + "language_loss": 0.82887459, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85077155, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5067, + "time_per_iteration": 2.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02829516, + "balance_loss_mlp": 1.04839194, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.0779895110277535, + "language_loss": 0.62978256, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65152222, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5068, + "time_per_iteration": 2.4957847595214844 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01633286, + "balance_loss_mlp": 1.04544926, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.6700683770947133, + "language_loss": 0.75058538, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77217996, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5069, + "time_per_iteration": 2.487473964691162 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01043362, + "balance_loss_clip": 1.02798414, + "balance_loss_mlp": 1.04746199, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.839652658151057, + "language_loss": 0.75732648, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7790432, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5070, + "time_per_iteration": 2.500335216522217 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.04640067, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.7318177446844936, + "language_loss": 0.81738281, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83910567, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 5071, + "time_per_iteration": 2.5726318359375 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.02507651, + "balance_loss_mlp": 1.04737437, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.5942809817556516, + "language_loss": 0.76252651, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78428996, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5072, + "time_per_iteration": 2.5147287845611572 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.03059769, + "balance_loss_mlp": 1.04904687, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.176920469303851, + "language_loss": 0.71318722, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73496878, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83203125, + "step": 5073, + "time_per_iteration": 2.4736156463623047 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_clip": 1.02974713, + "balance_loss_mlp": 1.04842663, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.146618897096623, + "language_loss": 0.7663309, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78814638, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5074, + "time_per_iteration": 2.4547433853149414 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.02015638, + "balance_loss_mlp": 1.04879379, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8636036931869358, + "language_loss": 0.73939347, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76106244, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5075, + "time_per_iteration": 2.4922661781311035 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.04769778, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.14961805392919, + "language_loss": 0.75488788, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77666509, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5076, + "time_per_iteration": 2.471381187438965 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02407038, + "balance_loss_mlp": 1.05137944, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.505999917432091, + "language_loss": 0.79183954, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81357688, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5077, + "time_per_iteration": 2.5000534057617188 + }, + { + "auxiliary_loss_clip": 0.01127394, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01739252, + "balance_loss_mlp": 1.0478642, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.9961733055656423, + "language_loss": 0.74662113, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76820433, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5078, + "time_per_iteration": 2.4746944904327393 + }, + { + "auxiliary_loss_clip": 0.01130678, + "auxiliary_loss_mlp": 0.01047379, + "balance_loss_clip": 1.03203678, + "balance_loss_mlp": 1.04787958, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.113994612729099, + "language_loss": 0.67216343, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69394398, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5079, + "time_per_iteration": 2.4575493335723877 + }, + { + "auxiliary_loss_clip": 0.01130366, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.02034521, + "balance_loss_mlp": 1.04758203, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.7438542216491464, + "language_loss": 0.80291754, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82457113, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5080, + "time_per_iteration": 2.490842342376709 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.02475858, + "balance_loss_mlp": 1.04612935, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.2926909410882903, + "language_loss": 0.80971938, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83141345, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5081, + "time_per_iteration": 2.5298712253570557 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.01952672, + "balance_loss_mlp": 1.04690182, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.691443128795128, + "language_loss": 0.71810889, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73975313, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5082, + "time_per_iteration": 2.5567750930786133 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.0468955, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9955003311475592, + "language_loss": 0.73615241, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75787055, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 5083, + "time_per_iteration": 2.5083980560302734 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.02184248, + "balance_loss_mlp": 1.04441404, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 3.7957379738132517, + "language_loss": 0.70895267, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73062611, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 5084, + "time_per_iteration": 2.477665424346924 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.01631355, + "balance_loss_mlp": 1.04818797, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.0055460894973933, + "language_loss": 0.78791595, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80958885, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5085, + "time_per_iteration": 2.475783586502075 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.02187788, + "balance_loss_mlp": 1.04529142, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.5510153728860234, + "language_loss": 0.77846372, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80010617, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5086, + "time_per_iteration": 2.514472007751465 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.04930758, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.7256556540888637, + "language_loss": 0.77121228, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79295856, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 5087, + "time_per_iteration": 2.4817616939544678 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.02346563, + "balance_loss_mlp": 1.04716742, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 2.0600622883478517, + "language_loss": 0.72582048, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74754953, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 5088, + "time_per_iteration": 2.538318395614624 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02300477, + "balance_loss_mlp": 1.04673004, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.417480227404851, + "language_loss": 0.7889666, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81070858, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 5089, + "time_per_iteration": 2.4561989307403564 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.02328289, + "balance_loss_mlp": 1.04813027, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 2.044405318996134, + "language_loss": 0.77061844, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79231811, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5090, + "time_per_iteration": 2.5215258598327637 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.02999353, + "balance_loss_mlp": 1.04693675, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.7474050348479595, + "language_loss": 0.76481628, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78662336, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5091, + "time_per_iteration": 2.535468578338623 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.01955616, + "balance_loss_mlp": 1.04671383, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.638842582319787, + "language_loss": 0.71933579, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.7410261, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 5092, + "time_per_iteration": 2.512096405029297 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.04765177, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.9362192703697652, + "language_loss": 0.8216877, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84335721, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5093, + "time_per_iteration": 2.464477300643921 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02239954, + "balance_loss_mlp": 1.04639721, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6957020618246583, + "language_loss": 0.75365555, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77531368, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5094, + "time_per_iteration": 2.5149855613708496 + }, + { + "auxiliary_loss_clip": 0.01128293, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02464378, + "balance_loss_mlp": 1.04530072, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.922814039194465, + "language_loss": 0.76033115, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78201067, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5095, + "time_per_iteration": 5.438723802566528 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02325845, + "balance_loss_mlp": 1.04581833, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7577098515851188, + "language_loss": 0.8050971, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82675582, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.81640625, + "step": 5096, + "time_per_iteration": 2.4706614017486572 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.02052069, + "balance_loss_mlp": 1.04556763, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.0672553061960586, + "language_loss": 0.8209089, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84256178, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5097, + "time_per_iteration": 2.457242250442505 + }, + { + "auxiliary_loss_clip": 0.0112984, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02227616, + "balance_loss_mlp": 1.04537082, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.9081721986815667, + "language_loss": 0.77858478, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80027401, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5098, + "time_per_iteration": 2.4709839820861816 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02057385, + "balance_loss_mlp": 1.0466584, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.9501450681008343, + "language_loss": 0.83948421, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86113107, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5099, + "time_per_iteration": 2.537771224975586 + }, + { + "auxiliary_loss_clip": 0.01130145, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 1.04364753, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 2.2273819247618376, + "language_loss": 0.85744429, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87916839, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5100, + "time_per_iteration": 2.5103259086608887 + }, + { + "auxiliary_loss_clip": 0.01129277, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.02259541, + "balance_loss_mlp": 1.04542243, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8863659276771934, + "language_loss": 0.79225194, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81392968, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5101, + "time_per_iteration": 2.4733920097351074 + }, + { + "auxiliary_loss_clip": 0.01131914, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.02869534, + "balance_loss_mlp": 1.04708326, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7393564952665503, + "language_loss": 0.79405224, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.81583011, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5102, + "time_per_iteration": 2.4608778953552246 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.02135825, + "balance_loss_mlp": 1.04940438, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7201607461659805, + "language_loss": 0.88999605, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.9117263, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.84765625, + "step": 5103, + "time_per_iteration": 2.5295228958129883 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.02549076, + "balance_loss_mlp": 1.04700959, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.6453097169103326, + "language_loss": 0.74079049, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76251674, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5104, + "time_per_iteration": 2.4923107624053955 + }, + { + "auxiliary_loss_clip": 0.01132054, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.02690291, + "balance_loss_mlp": 1.04555643, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8308515164246026, + "language_loss": 0.73333633, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75508481, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 5105, + "time_per_iteration": 2.542391777038574 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_clip": 1.03058875, + "balance_loss_mlp": 1.04582942, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 5.5167708582846515, + "language_loss": 0.8714695, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89325809, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 5106, + "time_per_iteration": 2.5054032802581787 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04750919, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.142568748510771, + "language_loss": 0.71183497, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73367596, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 5107, + "time_per_iteration": 2.4980053901672363 + }, + { + "auxiliary_loss_clip": 0.01125715, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.03372955, + "balance_loss_mlp": 1.04304433, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 1.7923615416213727, + "language_loss": 0.72302651, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74478543, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 5108, + "time_per_iteration": 2.4588091373443604 + }, + { + "auxiliary_loss_clip": 0.01129796, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.02435362, + "balance_loss_mlp": 1.04538584, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5361542639570684, + "language_loss": 0.85768104, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87937832, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5109, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.02176476, + "balance_loss_mlp": 1.04534364, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.6710196569280569, + "language_loss": 0.67220587, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69386709, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5110, + "time_per_iteration": 2.5019631385803223 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.0220511, + "balance_loss_mlp": 1.04472136, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5071731281437177, + "language_loss": 0.76981276, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79144323, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5111, + "time_per_iteration": 2.544111490249634 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.02486551, + "balance_loss_mlp": 1.04580235, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.9077726149637915, + "language_loss": 0.67174292, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69344485, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5112, + "time_per_iteration": 2.5171637535095215 + }, + { + "auxiliary_loss_clip": 0.01136791, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.04846382, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.8925702151041777, + "language_loss": 0.798181, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81996036, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 5113, + "time_per_iteration": 2.55889892578125 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.04549623, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.951625458848465, + "language_loss": 0.77243912, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79416221, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5114, + "time_per_iteration": 2.4328107833862305 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.02079093, + "balance_loss_mlp": 1.04755759, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.8985095809631356, + "language_loss": 0.62356925, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64527011, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5115, + "time_per_iteration": 2.480536699295044 + }, + { + "auxiliary_loss_clip": 0.01132859, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.02033865, + "balance_loss_mlp": 1.04663444, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 3.0190652682973176, + "language_loss": 0.82743216, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84912288, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5116, + "time_per_iteration": 2.5121662616729736 + }, + { + "auxiliary_loss_clip": 0.01131907, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.02322841, + "balance_loss_mlp": 1.04825926, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.8681947014951163, + "language_loss": 0.75772393, + "learning_rate": 3.244367924446952e-06, + "loss": 0.77942991, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5117, + "time_per_iteration": 2.48750376701355 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.05018401, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.225887232792708, + "language_loss": 0.71873093, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74044484, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5118, + "time_per_iteration": 2.4745492935180664 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.02442479, + "balance_loss_mlp": 1.04630661, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.5789952404099556, + "language_loss": 0.74312431, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76483381, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5119, + "time_per_iteration": 2.5185489654541016 + }, + { + "auxiliary_loss_clip": 0.01136122, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.04891181, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.733023320063412, + "language_loss": 0.80267692, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82455289, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 5120, + "time_per_iteration": 2.5592849254608154 + }, + { + "auxiliary_loss_clip": 0.01127219, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.03299093, + "balance_loss_mlp": 1.04384947, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.564134517039273, + "language_loss": 0.80110037, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82285464, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5121, + "time_per_iteration": 2.440516948699951 + }, + { + "auxiliary_loss_clip": 0.0113076, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01981497, + "balance_loss_mlp": 1.0480212, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5001896125792977, + "language_loss": 0.82594395, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84760171, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5122, + "time_per_iteration": 2.510576009750366 + }, + { + "auxiliary_loss_clip": 0.01050329, + "auxiliary_loss_mlp": 0.01017411, + "balance_loss_clip": 1.01562333, + "balance_loss_mlp": 1.01982307, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7473381596642288, + "language_loss": 0.58639288, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60707027, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.3046875, + "step": 5123, + "time_per_iteration": 3.2167654037475586 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.01917315, + "balance_loss_mlp": 1.04640436, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5767520801619384, + "language_loss": 0.83622873, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85793942, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.87890625, + "step": 5124, + "time_per_iteration": 2.474625587463379 + }, + { + "auxiliary_loss_clip": 0.01135515, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.04945302, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8429802725909379, + "language_loss": 0.78703862, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.80879092, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.859375, + "step": 5125, + "time_per_iteration": 2.5806493759155273 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.02050948, + "balance_loss_mlp": 1.04717779, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8928574451074776, + "language_loss": 0.6450479, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66676342, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5126, + "time_per_iteration": 2.467099666595459 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.02367234, + "balance_loss_mlp": 1.04831636, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5538294270453243, + "language_loss": 0.86619091, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88788408, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.81640625, + "step": 5127, + "time_per_iteration": 2.543095111846924 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02439952, + "balance_loss_mlp": 1.04648781, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 2.186420023793508, + "language_loss": 0.68816996, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70987189, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 5128, + "time_per_iteration": 2.525390863418579 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.02571476, + "balance_loss_mlp": 1.04763198, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.801256837086347, + "language_loss": 0.71226776, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7340306, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5129, + "time_per_iteration": 2.5417068004608154 + }, + { + "auxiliary_loss_clip": 0.01045915, + "auxiliary_loss_mlp": 0.01008464, + "balance_loss_clip": 1.00633001, + "balance_loss_mlp": 1.01580441, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.9000157132793972, + "language_loss": 0.59171313, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61225688, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.30078125, + "step": 5130, + "time_per_iteration": 3.024799108505249 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02233696, + "balance_loss_mlp": 1.0485276, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.1422150520884773, + "language_loss": 0.72951442, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75124997, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 5131, + "time_per_iteration": 2.5145480632781982 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.02222049, + "balance_loss_mlp": 1.04737425, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.759562546324366, + "language_loss": 0.71208251, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73375452, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5132, + "time_per_iteration": 2.4997506141662598 + }, + { + "auxiliary_loss_clip": 0.01128489, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.02580929, + "balance_loss_mlp": 1.04823279, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7072095629792627, + "language_loss": 0.8999784, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92166698, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5133, + "time_per_iteration": 2.4972143173217773 + }, + { + "auxiliary_loss_clip": 0.01136466, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.03186607, + "balance_loss_mlp": 1.04911399, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.8506383958840185, + "language_loss": 0.67226613, + "learning_rate": 3.239177844626102e-06, + "loss": 0.6941101, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5134, + "time_per_iteration": 2.5700669288635254 + }, + { + "auxiliary_loss_clip": 0.0113384, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.0317775, + "balance_loss_mlp": 1.04718161, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.423009332179396, + "language_loss": 0.82865155, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85046244, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5135, + "time_per_iteration": 2.4712367057800293 + }, + { + "auxiliary_loss_clip": 0.0104583, + "auxiliary_loss_mlp": 0.01004049, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.015975, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7120747448350507, + "language_loss": 0.55243868, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57293749, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.29882812, + "step": 5136, + "time_per_iteration": 3.1432137489318848 + }, + { + "auxiliary_loss_clip": 0.01132561, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04724097, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.9824711220984585, + "language_loss": 0.76057774, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78230941, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5137, + "time_per_iteration": 5.764686822891235 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.02310133, + "balance_loss_mlp": 1.04696631, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 2.0179579208290264, + "language_loss": 0.79909992, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8207891, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.84765625, + "step": 5138, + "time_per_iteration": 2.45621657371521 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.02666378, + "balance_loss_mlp": 1.04560494, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4272945699581137, + "language_loss": 0.81220984, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83396351, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 5139, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.01137198, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.1565991279061736, + "language_loss": 0.77528149, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79707557, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 5140, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01044008, + "balance_loss_clip": 1.02920234, + "balance_loss_mlp": 1.04757929, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.2023621297160156, + "language_loss": 0.78595555, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80768663, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5141, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01047349, + "balance_loss_clip": 1.03046894, + "balance_loss_mlp": 1.04716825, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.127714885761315, + "language_loss": 0.87142885, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89324611, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 5142, + "time_per_iteration": 2.4362974166870117 + }, + { + "auxiliary_loss_clip": 0.01131531, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.03220749, + "balance_loss_mlp": 1.04556274, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7972015737501748, + "language_loss": 0.7877624, + "learning_rate": 3.23642465389567e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 5143, + "time_per_iteration": 2.459317445755005 + }, + { + "auxiliary_loss_clip": 0.01130331, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.02742219, + "balance_loss_mlp": 1.04593444, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.9461458902951219, + "language_loss": 0.72098875, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74272639, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5144, + "time_per_iteration": 2.4872243404388428 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.02418947, + "balance_loss_mlp": 1.04587483, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.7305751805857612, + "language_loss": 0.74054307, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76227629, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5145, + "time_per_iteration": 2.524683952331543 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.02951622, + "balance_loss_mlp": 1.04737079, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6607552662218326, + "language_loss": 0.76461762, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78640091, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 5146, + "time_per_iteration": 2.4848198890686035 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.02213407, + "balance_loss_mlp": 1.04672074, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.385312171088194, + "language_loss": 0.66755533, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68922937, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5147, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.01135751, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02517319, + "balance_loss_mlp": 1.04931486, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0402709532397205, + "language_loss": 0.75148058, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77323824, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5148, + "time_per_iteration": 2.505180597305298 + }, + { + "auxiliary_loss_clip": 0.01139245, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02815318, + "balance_loss_mlp": 1.04876494, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.1288750992632677, + "language_loss": 0.72576058, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74759942, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 5149, + "time_per_iteration": 2.4605252742767334 + }, + { + "auxiliary_loss_clip": 0.01133233, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.02442312, + "balance_loss_mlp": 1.0457058, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.112154456836484, + "language_loss": 0.84981489, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87155974, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.875, + "step": 5150, + "time_per_iteration": 2.4866578578948975 + }, + { + "auxiliary_loss_clip": 0.01131574, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.02515531, + "balance_loss_mlp": 1.04593086, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.9529089609254688, + "language_loss": 0.79053164, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81226349, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5151, + "time_per_iteration": 2.4936540126800537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.02580595, + "balance_loss_mlp": 1.0471015, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 3.1311630498810774, + "language_loss": 0.67020154, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69196552, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5152, + "time_per_iteration": 2.429640054702759 + }, + { + "auxiliary_loss_clip": 0.01132623, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.0275166, + "balance_loss_mlp": 1.04688787, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 12.57465651148819, + "language_loss": 0.82058132, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84234464, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5153, + "time_per_iteration": 2.578856945037842 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02788973, + "balance_loss_mlp": 1.04822588, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.7956706783057126, + "language_loss": 0.73902357, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76079118, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5154, + "time_per_iteration": 2.5063655376434326 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02287924, + "balance_loss_mlp": 1.04747653, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.516871287947693, + "language_loss": 0.76051688, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78224009, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5155, + "time_per_iteration": 2.4838123321533203 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.04871869, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7492301646526522, + "language_loss": 0.7883296, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81011862, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 5156, + "time_per_iteration": 2.4420597553253174 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02894902, + "balance_loss_mlp": 1.04688191, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 3.007667649484548, + "language_loss": 0.75094402, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5157, + "time_per_iteration": 2.4922094345092773 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.04701662, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.583276716554569, + "language_loss": 0.69391131, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71560085, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5158, + "time_per_iteration": 2.5119874477386475 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.0260725, + "balance_loss_mlp": 1.04802489, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.8674515495135584, + "language_loss": 0.84731698, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86904848, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5159, + "time_per_iteration": 2.5553479194641113 + }, + { + "auxiliary_loss_clip": 0.01130577, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.0215224, + "balance_loss_mlp": 1.04617286, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.6286624468626467, + "language_loss": 0.85222661, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87390554, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5160, + "time_per_iteration": 2.4521608352661133 + }, + { + "auxiliary_loss_clip": 0.01131067, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.02385354, + "balance_loss_mlp": 1.04720986, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.1323719792042404, + "language_loss": 0.76438844, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78609127, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5161, + "time_per_iteration": 2.4705073833465576 + }, + { + "auxiliary_loss_clip": 0.01133183, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.04661226, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9681741891595628, + "language_loss": 0.81644946, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83818257, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5162, + "time_per_iteration": 2.4359090328216553 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.0231998, + "balance_loss_mlp": 1.04580498, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.6668116654420786, + "language_loss": 0.82879269, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85046029, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8359375, + "step": 5163, + "time_per_iteration": 2.536198854446411 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.02670264, + "balance_loss_mlp": 1.04848182, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.61479678935284, + "language_loss": 0.76103258, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78280413, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5164, + "time_per_iteration": 2.4736320972442627 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02492666, + "balance_loss_mlp": 1.04932189, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.73414256762253, + "language_loss": 0.74515426, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76691169, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 5165, + "time_per_iteration": 2.4788122177124023 + }, + { + "auxiliary_loss_clip": 0.01132367, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02627063, + "balance_loss_mlp": 1.0472759, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 2.461614607097325, + "language_loss": 0.75987816, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78162187, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5166, + "time_per_iteration": 2.4461371898651123 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.02939892, + "balance_loss_mlp": 1.04844868, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.4324780660218557, + "language_loss": 0.73424876, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75604147, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 5167, + "time_per_iteration": 2.4301631450653076 + }, + { + "auxiliary_loss_clip": 0.01047334, + "auxiliary_loss_mlp": 0.01006703, + "balance_loss_clip": 1.00467682, + "balance_loss_mlp": 1.01844001, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.725291341239906, + "language_loss": 0.53031516, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55085552, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.2890625, + "step": 5168, + "time_per_iteration": 3.1146020889282227 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.0269258, + "balance_loss_mlp": 1.0465318, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.782356602828545, + "language_loss": 0.78745592, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80922985, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5169, + "time_per_iteration": 2.4755852222442627 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02337217, + "balance_loss_mlp": 1.04640126, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.536235209485244, + "language_loss": 0.6414057, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66312397, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5170, + "time_per_iteration": 2.5690839290618896 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.02635252, + "balance_loss_mlp": 1.04721069, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.41080559035864, + "language_loss": 0.77698815, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79874456, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 5171, + "time_per_iteration": 2.558258295059204 + }, + { + "auxiliary_loss_clip": 0.01132946, + "auxiliary_loss_mlp": 0.01053954, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.04645526, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.9319520361735263, + "language_loss": 0.83802366, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85989261, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5172, + "time_per_iteration": 2.4601597785949707 + }, + { + "auxiliary_loss_clip": 0.01133186, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.02948654, + "balance_loss_mlp": 1.0467186, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.9586589765002733, + "language_loss": 0.84225619, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86404574, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 5173, + "time_per_iteration": 2.501840591430664 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02038455, + "balance_loss_mlp": 1.04595959, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.622637298809784, + "language_loss": 0.83323705, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85486829, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5174, + "time_per_iteration": 2.507127285003662 + }, + { + "auxiliary_loss_clip": 0.01131648, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.03155434, + "balance_loss_mlp": 1.04670012, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.3340025504670003, + "language_loss": 0.84681082, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.86859798, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5175, + "time_per_iteration": 2.4853246212005615 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_clip": 1.03029919, + "balance_loss_mlp": 1.04996502, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.6466695594130172, + "language_loss": 0.83448446, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85629338, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8359375, + "step": 5176, + "time_per_iteration": 2.4759509563446045 + }, + { + "auxiliary_loss_clip": 0.01128858, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.04442942, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.7899579393784935, + "language_loss": 0.80820966, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8299427, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5177, + "time_per_iteration": 2.5106611251831055 + }, + { + "auxiliary_loss_clip": 0.0113295, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.048877, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.9871899212943351, + "language_loss": 0.80703342, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82878101, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5178, + "time_per_iteration": 4.0482330322265625 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03310347, + "balance_loss_mlp": 1.04518402, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.8347450184704097, + "language_loss": 0.81340981, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83520925, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5179, + "time_per_iteration": 3.82991886138916 + }, + { + "auxiliary_loss_clip": 0.01132507, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0338006, + "balance_loss_mlp": 1.04824936, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.599561013411363, + "language_loss": 0.78199375, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.8038168, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5180, + "time_per_iteration": 2.4656291007995605 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.01915836, + "balance_loss_mlp": 1.04672408, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.6380256774064115, + "language_loss": 0.83046079, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85212088, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5181, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.01128181, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.02903986, + "balance_loss_mlp": 1.0464232, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.700886032828765, + "language_loss": 0.74084079, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76255929, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5182, + "time_per_iteration": 2.5913209915161133 + }, + { + "auxiliary_loss_clip": 0.01136348, + "auxiliary_loss_mlp": 0.01050649, + "balance_loss_clip": 1.03479409, + "balance_loss_mlp": 1.04858768, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8010906920491343, + "language_loss": 0.70658493, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72845489, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 5183, + "time_per_iteration": 2.4991438388824463 + }, + { + "auxiliary_loss_clip": 0.01045533, + "auxiliary_loss_mlp": 0.01014757, + "balance_loss_clip": 1.01301634, + "balance_loss_mlp": 1.01690507, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9414003998762589, + "language_loss": 0.59602594, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61662877, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.28515625, + "step": 5184, + "time_per_iteration": 3.0754520893096924 + }, + { + "auxiliary_loss_clip": 0.01130364, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.03264058, + "balance_loss_mlp": 1.04596519, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.811836993883612, + "language_loss": 0.69750082, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71927822, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5185, + "time_per_iteration": 2.435033082962036 + }, + { + "auxiliary_loss_clip": 0.01136749, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.0323875, + "balance_loss_mlp": 1.05073345, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.346024133586612, + "language_loss": 0.63920057, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66104954, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5186, + "time_per_iteration": 2.463900327682495 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.01049347, + "balance_loss_clip": 1.03219295, + "balance_loss_mlp": 1.04886758, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.108066194391345, + "language_loss": 0.86249322, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88435853, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5187, + "time_per_iteration": 2.4854979515075684 + }, + { + "auxiliary_loss_clip": 0.01129847, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.03048384, + "balance_loss_mlp": 1.0451926, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.7445298378798078, + "language_loss": 0.62983185, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.6515975, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5188, + "time_per_iteration": 2.6161019802093506 + }, + { + "auxiliary_loss_clip": 0.01135744, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_clip": 1.02961564, + "balance_loss_mlp": 1.05116081, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1633857437120256, + "language_loss": 0.8347863, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85659939, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5189, + "time_per_iteration": 2.4360432624816895 + }, + { + "auxiliary_loss_clip": 0.01129905, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.0213753, + "balance_loss_mlp": 1.04657507, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.6712014044776404, + "language_loss": 0.7916308, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81329739, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83203125, + "step": 5190, + "time_per_iteration": 2.472668170928955 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01986194, + "balance_loss_mlp": 1.04946673, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.4545499288259176, + "language_loss": 0.75318813, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77487987, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5191, + "time_per_iteration": 2.486673355102539 + }, + { + "auxiliary_loss_clip": 0.01049091, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.00025892, + "balance_loss_mlp": 1.02067924, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8451593954944295, + "language_loss": 0.63957787, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66009092, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.28515625, + "step": 5192, + "time_per_iteration": 3.1464638710021973 + }, + { + "auxiliary_loss_clip": 0.01134311, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04795599, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.6164756923867671, + "language_loss": 0.80154347, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82329667, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.86328125, + "step": 5193, + "time_per_iteration": 2.5156989097595215 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.02610445, + "balance_loss_mlp": 1.045856, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8140889441731107, + "language_loss": 0.72050476, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74224722, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.85546875, + "step": 5194, + "time_per_iteration": 2.519972801208496 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02471924, + "balance_loss_mlp": 1.04870749, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.3544515008303952, + "language_loss": 0.76475823, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78648859, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5195, + "time_per_iteration": 2.512247323989868 + }, + { + "auxiliary_loss_clip": 0.01131656, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.02718091, + "balance_loss_mlp": 1.0449183, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.3526234536893298, + "language_loss": 0.7817502, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80349314, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5196, + "time_per_iteration": 2.528002977371216 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99860841, + "balance_loss_mlp": 1.01643729, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7752479618797538, + "language_loss": 0.54834789, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56879622, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.27929688, + "step": 5197, + "time_per_iteration": 3.0728254318237305 + }, + { + "auxiliary_loss_clip": 0.01130689, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.02328372, + "balance_loss_mlp": 1.0477525, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6543672060788046, + "language_loss": 0.66300559, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68469381, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5198, + "time_per_iteration": 2.4312028884887695 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.02156413, + "balance_loss_mlp": 1.0472604, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.083859755504136, + "language_loss": 0.69763082, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71935886, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5199, + "time_per_iteration": 2.454464912414551 + }, + { + "auxiliary_loss_clip": 0.01131797, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.0336132, + "balance_loss_mlp": 1.04692471, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8982997112015956, + "language_loss": 0.79004937, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81186306, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 5200, + "time_per_iteration": 2.4382827281951904 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.02187347, + "balance_loss_mlp": 1.04621911, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.042457973745699, + "language_loss": 0.83946276, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86110914, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5201, + "time_per_iteration": 2.475511074066162 + }, + { + "auxiliary_loss_clip": 0.01134303, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.02990484, + "balance_loss_mlp": 1.04985881, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.37604325800411, + "language_loss": 0.69560832, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71741533, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84375, + "step": 5202, + "time_per_iteration": 2.4265501499176025 + }, + { + "auxiliary_loss_clip": 0.01133329, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.02737963, + "balance_loss_mlp": 1.04759419, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.800546738819683, + "language_loss": 0.84001613, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86176282, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.85546875, + "step": 5203, + "time_per_iteration": 2.480233907699585 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.03176749, + "balance_loss_mlp": 1.04697657, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.9129021624211417, + "language_loss": 0.60623944, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62803102, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5204, + "time_per_iteration": 2.50688099861145 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.02672338, + "balance_loss_mlp": 1.04707503, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.6006708998064776, + "language_loss": 0.65964866, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68135834, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5205, + "time_per_iteration": 2.4824163913726807 + }, + { + "auxiliary_loss_clip": 0.01132532, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.0476222, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.9498647702732133, + "language_loss": 0.76618874, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78794622, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84765625, + "step": 5206, + "time_per_iteration": 2.4947307109832764 + }, + { + "auxiliary_loss_clip": 0.0112786, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.02416039, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 3.088705810465425, + "language_loss": 0.83287984, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85455215, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5207, + "time_per_iteration": 2.4767825603485107 + }, + { + "auxiliary_loss_clip": 0.01128039, + "auxiliary_loss_mlp": 0.01041894, + "balance_loss_clip": 1.02784562, + "balance_loss_mlp": 1.04694057, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5219202808663073, + "language_loss": 0.71293664, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73463601, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5208, + "time_per_iteration": 2.4853296279907227 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.02534437, + "balance_loss_mlp": 1.04957032, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.8332946649412374, + "language_loss": 0.74547577, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76721835, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5209, + "time_per_iteration": 2.5162742137908936 + }, + { + "auxiliary_loss_clip": 0.0113008, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02695489, + "balance_loss_mlp": 1.04557538, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.818845882779476, + "language_loss": 0.77656835, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79827774, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84375, + "step": 5210, + "time_per_iteration": 2.4701180458068848 + }, + { + "auxiliary_loss_clip": 0.01125909, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02443743, + "balance_loss_mlp": 1.04593706, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8627745841798442, + "language_loss": 0.79177994, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81343371, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 5211, + "time_per_iteration": 2.482102870941162 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02448201, + "balance_loss_mlp": 1.04849112, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.64859412039223, + "language_loss": 0.79837513, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82005984, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5212, + "time_per_iteration": 2.460986852645874 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.03395939, + "balance_loss_mlp": 1.04740417, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.096287390218497, + "language_loss": 0.71467483, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73650539, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5213, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.01135204, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02660799, + "balance_loss_mlp": 1.05014026, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 5.183832853627301, + "language_loss": 0.77595121, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79771841, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5214, + "time_per_iteration": 2.453228712081909 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.04599309, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6576138068605464, + "language_loss": 0.82562625, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84724051, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5215, + "time_per_iteration": 2.544684886932373 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.02242613, + "balance_loss_mlp": 1.04732776, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.510877303679677, + "language_loss": 0.79557931, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81727695, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5216, + "time_per_iteration": 2.4559943675994873 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.03042984, + "balance_loss_mlp": 1.04632115, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 2.0079960226100293, + "language_loss": 0.68489361, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70668793, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.859375, + "step": 5217, + "time_per_iteration": 2.524624824523926 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02095652, + "balance_loss_mlp": 1.04952598, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.8597778329644077, + "language_loss": 0.80357039, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82527065, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5218, + "time_per_iteration": 2.437819480895996 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.025931, + "balance_loss_mlp": 1.04692423, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.311414379590861, + "language_loss": 0.68608415, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70780772, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5219, + "time_per_iteration": 2.4811697006225586 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.02585125, + "balance_loss_mlp": 1.05002093, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 1.886141735907444, + "language_loss": 0.7973401, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81906897, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.828125, + "step": 5220, + "time_per_iteration": 5.5014426708221436 + }, + { + "auxiliary_loss_clip": 0.01129795, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02036917, + "balance_loss_mlp": 1.0470016, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.696615671785811, + "language_loss": 0.72865409, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75029969, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5221, + "time_per_iteration": 2.4286248683929443 + }, + { + "auxiliary_loss_clip": 0.01129062, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.02370405, + "balance_loss_mlp": 1.0478735, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.5798649053475948, + "language_loss": 0.8195132, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84118003, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8125, + "step": 5222, + "time_per_iteration": 2.453622817993164 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.02744806, + "balance_loss_mlp": 1.04833627, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.948806511089887, + "language_loss": 0.70150459, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.723288, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5223, + "time_per_iteration": 2.442513942718506 + }, + { + "auxiliary_loss_clip": 0.01130042, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.01980042, + "balance_loss_mlp": 1.04643512, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6111281957709347, + "language_loss": 0.80361176, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82525527, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5224, + "time_per_iteration": 2.5533599853515625 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02615976, + "balance_loss_mlp": 1.05134106, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 1.9819108050216143, + "language_loss": 0.58416283, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60598099, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 5225, + "time_per_iteration": 2.493633508682251 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.01826406, + "balance_loss_mlp": 1.04575014, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.9016989590060558, + "language_loss": 0.81870753, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84028322, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5226, + "time_per_iteration": 2.455474376678467 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.0245285, + "balance_loss_mlp": 1.04804921, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 3.2929472014065864, + "language_loss": 0.73947561, + "learning_rate": 3.210546210126141e-06, + "loss": 0.7611953, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5227, + "time_per_iteration": 2.4582889080047607 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02783334, + "balance_loss_mlp": 1.04827404, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.9061545786481, + "language_loss": 0.67636049, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69811898, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5228, + "time_per_iteration": 2.572122573852539 + }, + { + "auxiliary_loss_clip": 0.01130676, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.02567399, + "balance_loss_mlp": 1.04645872, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.857425256773369, + "language_loss": 0.79938543, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82109284, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5229, + "time_per_iteration": 2.4785192012786865 + }, + { + "auxiliary_loss_clip": 0.01129346, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.02474797, + "balance_loss_mlp": 1.04716849, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8246409730399047, + "language_loss": 0.70264775, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72434002, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5230, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.02805161, + "balance_loss_mlp": 1.04486191, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.680902640440715, + "language_loss": 0.79707456, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81880474, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5231, + "time_per_iteration": 2.535352945327759 + }, + { + "auxiliary_loss_clip": 0.01129002, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.02675736, + "balance_loss_mlp": 1.04756021, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0146998384070254, + "language_loss": 0.8507638, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87248111, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5232, + "time_per_iteration": 2.5626280307769775 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01049783, + "balance_loss_clip": 1.03439283, + "balance_loss_mlp": 1.0461762, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5681064196444345, + "language_loss": 0.7984041, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82017469, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5233, + "time_per_iteration": 2.4478254318237305 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.0233047, + "balance_loss_mlp": 1.04861724, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.628646597563271, + "language_loss": 0.70788991, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72960073, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5234, + "time_per_iteration": 2.775871992111206 + }, + { + "auxiliary_loss_clip": 0.01131513, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.0205102, + "balance_loss_mlp": 1.04739237, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8519873535555593, + "language_loss": 0.72068667, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74236101, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5235, + "time_per_iteration": 2.515869617462158 + }, + { + "auxiliary_loss_clip": 0.01126993, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.02204823, + "balance_loss_mlp": 1.04428434, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.06424580772138, + "language_loss": 0.7832365, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80487001, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5236, + "time_per_iteration": 2.5591800212860107 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02609372, + "balance_loss_mlp": 1.04730821, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.44778330648976, + "language_loss": 0.75856584, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78033078, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 5237, + "time_per_iteration": 2.5414791107177734 + }, + { + "auxiliary_loss_clip": 0.01125329, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.02424169, + "balance_loss_mlp": 1.04500508, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.1889759499940813, + "language_loss": 0.79916662, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82079864, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8046875, + "step": 5238, + "time_per_iteration": 2.484102725982666 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.0100711, + "balance_loss_clip": 1.0053103, + "balance_loss_mlp": 1.01739836, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8333107882681854, + "language_loss": 0.67920464, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69972724, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.27734375, + "step": 5239, + "time_per_iteration": 3.0362496376037598 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.02197254, + "balance_loss_mlp": 1.04535258, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.0536997136778847, + "language_loss": 0.82329869, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84499264, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5240, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01039014, + "balance_loss_clip": 1.02451253, + "balance_loss_mlp": 1.04874361, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.2630790499207962, + "language_loss": 0.80981195, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83150375, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5241, + "time_per_iteration": 2.5001909732818604 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04834199, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.5804052674973608, + "language_loss": 0.74575627, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76740676, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5242, + "time_per_iteration": 2.530768871307373 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0189085, + "balance_loss_mlp": 1.04601228, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9335835713568477, + "language_loss": 0.74171245, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.7633546, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 5243, + "time_per_iteration": 2.495138168334961 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.02221215, + "balance_loss_mlp": 1.04677868, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 3.400707627247709, + "language_loss": 0.64608908, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66775823, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83203125, + "step": 5244, + "time_per_iteration": 2.4930343627929688 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.02088022, + "balance_loss_mlp": 1.04716229, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.1590647535644965, + "language_loss": 0.91464043, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93632007, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5245, + "time_per_iteration": 2.4007837772369385 + }, + { + "auxiliary_loss_clip": 0.0113079, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.02636433, + "balance_loss_mlp": 1.04643655, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 9.888646015204756, + "language_loss": 0.75272042, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77444315, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5246, + "time_per_iteration": 2.4886202812194824 + }, + { + "auxiliary_loss_clip": 0.01131208, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.02524352, + "balance_loss_mlp": 1.04602718, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.4670109155165818, + "language_loss": 0.6160199, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63773286, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5247, + "time_per_iteration": 2.567185640335083 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.02283072, + "balance_loss_mlp": 1.04756081, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.2084660310503526, + "language_loss": 0.82410538, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84581077, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5248, + "time_per_iteration": 2.52426815032959 + }, + { + "auxiliary_loss_clip": 0.01129578, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.04662156, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8083364563285407, + "language_loss": 0.85017586, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87197179, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5249, + "time_per_iteration": 2.4549005031585693 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.02245772, + "balance_loss_mlp": 1.04802227, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.8090626711780673, + "language_loss": 0.85569501, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87739837, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5250, + "time_per_iteration": 2.502629041671753 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.04532385, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 4.215523946509053, + "language_loss": 0.68559456, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70730722, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5251, + "time_per_iteration": 2.4467368125915527 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02487266, + "balance_loss_mlp": 1.04848695, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7890606859490685, + "language_loss": 0.78783, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80953479, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5252, + "time_per_iteration": 2.5056369304656982 + }, + { + "auxiliary_loss_clip": 0.01129131, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.02635264, + "balance_loss_mlp": 1.04820085, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.7467438086499925, + "language_loss": 0.74374568, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76544189, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5253, + "time_per_iteration": 2.485865592956543 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.04530692, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.6622002067810395, + "language_loss": 0.73305148, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75473285, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5254, + "time_per_iteration": 2.5044641494750977 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.02293146, + "balance_loss_mlp": 1.04714012, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9319514966089122, + "language_loss": 0.78156364, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80326211, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5255, + "time_per_iteration": 2.4380881786346436 + }, + { + "auxiliary_loss_clip": 0.01130732, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02565229, + "balance_loss_mlp": 1.04770398, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.551434599641695, + "language_loss": 0.78019011, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80192077, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.828125, + "step": 5256, + "time_per_iteration": 2.517211437225342 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.02159786, + "balance_loss_mlp": 1.04710865, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.6136648036258991, + "language_loss": 0.71117795, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73278391, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 5257, + "time_per_iteration": 2.4690449237823486 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.04662931, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.9672329013590102, + "language_loss": 0.77098101, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79265225, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5258, + "time_per_iteration": 2.4586384296417236 + }, + { + "auxiliary_loss_clip": 0.01130533, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.02291536, + "balance_loss_mlp": 1.04706669, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 4.102208009404704, + "language_loss": 0.72829109, + "learning_rate": 3.200602180731467e-06, + "loss": 0.7499727, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5259, + "time_per_iteration": 2.463867425918579 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.03382003, + "balance_loss_mlp": 1.04840684, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.940451679167918, + "language_loss": 0.66212165, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68394214, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.85546875, + "step": 5260, + "time_per_iteration": 2.498173475265503 + }, + { + "auxiliary_loss_clip": 0.01125905, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.01806808, + "balance_loss_mlp": 1.04255199, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.9564366458132632, + "language_loss": 0.72557104, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74715853, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5261, + "time_per_iteration": 4.0577170848846436 + }, + { + "auxiliary_loss_clip": 0.01040968, + "auxiliary_loss_mlp": 0.01005761, + "balance_loss_clip": 1.00365114, + "balance_loss_mlp": 1.01333809, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7495327099187281, + "language_loss": 0.50639355, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52686083, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5262, + "time_per_iteration": 5.9139063358306885 + }, + { + "auxiliary_loss_clip": 0.01133191, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.02408338, + "balance_loss_mlp": 1.04845881, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4936033884005069, + "language_loss": 0.85241222, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87412858, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.84765625, + "step": 5263, + "time_per_iteration": 2.4966084957122803 + }, + { + "auxiliary_loss_clip": 0.01127359, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.03201818, + "balance_loss_mlp": 1.04657304, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4671140059184749, + "language_loss": 0.81675243, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83848464, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5264, + "time_per_iteration": 2.5126495361328125 + }, + { + "auxiliary_loss_clip": 0.01133844, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.02107441, + "balance_loss_mlp": 1.0484283, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.6829803459821215, + "language_loss": 0.79974926, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82145512, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5265, + "time_per_iteration": 2.444263219833374 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.02361572, + "balance_loss_mlp": 1.04815876, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5672890574859826, + "language_loss": 0.74875605, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77048463, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5266, + "time_per_iteration": 2.5323407649993896 + }, + { + "auxiliary_loss_clip": 0.01131974, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02851653, + "balance_loss_mlp": 1.04640543, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.021043754719528, + "language_loss": 0.78872609, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81047654, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 5267, + "time_per_iteration": 2.4591164588928223 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01004279, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.01493907, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7322532755123746, + "language_loss": 0.57800645, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59847558, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5268, + "time_per_iteration": 3.061121702194214 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.02291262, + "balance_loss_mlp": 1.04683709, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 1.8728828385616285, + "language_loss": 0.72881675, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75051844, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5269, + "time_per_iteration": 2.4871747493743896 + }, + { + "auxiliary_loss_clip": 0.0113037, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04689598, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.0592855460289394, + "language_loss": 0.79914796, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82084477, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5270, + "time_per_iteration": 2.502607822418213 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.02582264, + "balance_loss_mlp": 1.04792333, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 1.9728362515560998, + "language_loss": 0.79207718, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.8138411, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5271, + "time_per_iteration": 2.4412505626678467 + }, + { + "auxiliary_loss_clip": 0.0113132, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02440262, + "balance_loss_mlp": 1.04685235, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.769221166791082, + "language_loss": 0.73264146, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75436121, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5272, + "time_per_iteration": 2.4992945194244385 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.02262676, + "balance_loss_mlp": 1.04613161, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 1.9537759660060814, + "language_loss": 0.69159341, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71332633, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 5273, + "time_per_iteration": 2.6510114669799805 + }, + { + "auxiliary_loss_clip": 0.01128979, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02110016, + "balance_loss_mlp": 1.04609132, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.4826309074588198, + "language_loss": 0.67691469, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69856858, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5274, + "time_per_iteration": 2.5467329025268555 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02092862, + "balance_loss_mlp": 1.04432762, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5251182195487059, + "language_loss": 0.80846918, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83006656, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5275, + "time_per_iteration": 2.511544704437256 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.0286448, + "balance_loss_mlp": 1.04539275, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.952892513614063, + "language_loss": 0.72608984, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.7478506, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5276, + "time_per_iteration": 2.5273983478546143 + }, + { + "auxiliary_loss_clip": 0.01124489, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 1.04455817, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.3590988237701342, + "language_loss": 0.77843654, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80003512, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5277, + "time_per_iteration": 2.51247501373291 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.04444003, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.8256288285105424, + "language_loss": 0.78756094, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80919981, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5278, + "time_per_iteration": 2.5376405715942383 + }, + { + "auxiliary_loss_clip": 0.01037546, + "auxiliary_loss_mlp": 0.01002993, + "balance_loss_clip": 1.0011332, + "balance_loss_mlp": 1.00972891, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8755672893463982, + "language_loss": 0.62821174, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64861709, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.27734375, + "step": 5279, + "time_per_iteration": 2.823489189147949 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.0242753, + "balance_loss_mlp": 1.04568505, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.6672726712999033, + "language_loss": 0.8099947, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83173573, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 5280, + "time_per_iteration": 2.490154981613159 + }, + { + "auxiliary_loss_clip": 0.01130309, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.02963543, + "balance_loss_mlp": 1.04713202, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.444928497123541, + "language_loss": 0.77968711, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5281, + "time_per_iteration": 2.590106248855591 + }, + { + "auxiliary_loss_clip": 0.01129621, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.02001119, + "balance_loss_mlp": 1.0464325, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6441690082428626, + "language_loss": 0.78319824, + "learning_rate": 3.193426091467179e-06, + "loss": 0.8048507, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 5282, + "time_per_iteration": 2.4879021644592285 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.02429008, + "balance_loss_mlp": 1.04685783, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.066002014025373, + "language_loss": 0.66989815, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69162953, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 5283, + "time_per_iteration": 2.4914467334747314 + }, + { + "auxiliary_loss_clip": 0.01037416, + "auxiliary_loss_mlp": 0.01002537, + "balance_loss_clip": 1.00047421, + "balance_loss_mlp": 1.00956297, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7287723120729913, + "language_loss": 0.52796859, + "learning_rate": 3.192800950261958e-06, + "loss": 0.5483681, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.27734375, + "step": 5284, + "time_per_iteration": 3.0077779293060303 + }, + { + "auxiliary_loss_clip": 0.01137201, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02314341, + "balance_loss_mlp": 1.04976773, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.732541053937659, + "language_loss": 0.7061168, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72786701, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 5285, + "time_per_iteration": 2.4796152114868164 + }, + { + "auxiliary_loss_clip": 0.0103775, + "auxiliary_loss_mlp": 0.01003604, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00987303, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8184329386673247, + "language_loss": 0.60497808, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.6253916, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27929688, + "step": 5286, + "time_per_iteration": 3.060959815979004 + }, + { + "auxiliary_loss_clip": 0.01131379, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.02701449, + "balance_loss_mlp": 1.04520202, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.8142745455991967, + "language_loss": 0.72112805, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74286544, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 5287, + "time_per_iteration": 2.480926752090454 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03083003, + "balance_loss_mlp": 1.04454064, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.8467549942081902, + "language_loss": 0.75335222, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77514231, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 5288, + "time_per_iteration": 2.4506337642669678 + }, + { + "auxiliary_loss_clip": 0.01123463, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.02344155, + "balance_loss_mlp": 1.04175711, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 2.214262263159222, + "language_loss": 0.87642509, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89802694, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8203125, + "step": 5289, + "time_per_iteration": 2.4887404441833496 + }, + { + "auxiliary_loss_clip": 0.01127988, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.04635859, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.8563377401537928, + "language_loss": 0.67677546, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69844842, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5290, + "time_per_iteration": 2.4699981212615967 + }, + { + "auxiliary_loss_clip": 0.01130209, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.02983999, + "balance_loss_mlp": 1.04348135, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.9889060202243536, + "language_loss": 0.79926544, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82102132, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 5291, + "time_per_iteration": 2.5350663661956787 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.02160883, + "balance_loss_mlp": 1.04684091, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.2851564798864694, + "language_loss": 0.79887748, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82058293, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5292, + "time_per_iteration": 2.4561853408813477 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.02173245, + "balance_loss_mlp": 1.04506028, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6321803022225574, + "language_loss": 0.74406421, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.76565492, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5293, + "time_per_iteration": 2.562264919281006 + }, + { + "auxiliary_loss_clip": 0.01127349, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02864981, + "balance_loss_mlp": 1.04655647, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.669926034583184, + "language_loss": 0.74003655, + "learning_rate": 3.189672532265379e-06, + "loss": 0.7617321, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.80859375, + "step": 5294, + "time_per_iteration": 2.511491537094116 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04616928, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.856323864882145, + "language_loss": 0.76211727, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78377414, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5295, + "time_per_iteration": 2.482302665710449 + }, + { + "auxiliary_loss_clip": 0.01134404, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.02765322, + "balance_loss_mlp": 1.04831004, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6316405915506296, + "language_loss": 0.69476807, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71653676, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5296, + "time_per_iteration": 2.4972259998321533 + }, + { + "auxiliary_loss_clip": 0.01129364, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04513788, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 2.3772504575271367, + "language_loss": 0.77559733, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79728031, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5297, + "time_per_iteration": 2.5681862831115723 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01533866, + "balance_loss_mlp": 1.04480934, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.9869765921291695, + "language_loss": 0.79451257, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81608367, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5298, + "time_per_iteration": 2.4990038871765137 + }, + { + "auxiliary_loss_clip": 0.01132136, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.04609096, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 2.132815699592654, + "language_loss": 0.7431671, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.7648803, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 5299, + "time_per_iteration": 2.4902234077453613 + }, + { + "auxiliary_loss_clip": 0.01130922, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02775824, + "balance_loss_mlp": 1.04395795, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 5.1444082132017925, + "language_loss": 0.7834971, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80523366, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5300, + "time_per_iteration": 2.476113796234131 + }, + { + "auxiliary_loss_clip": 0.01127423, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.02245879, + "balance_loss_mlp": 1.04332328, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 4.220537638442504, + "language_loss": 0.8416568, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86331153, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5301, + "time_per_iteration": 2.4672341346740723 + }, + { + "auxiliary_loss_clip": 0.01132761, + "auxiliary_loss_mlp": 0.01045513, + "balance_loss_clip": 1.0299325, + "balance_loss_mlp": 1.05064154, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4555807672502277, + "language_loss": 0.77689236, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79867512, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5302, + "time_per_iteration": 2.4480254650115967 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02197289, + "balance_loss_mlp": 1.0458461, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.6601771821563076, + "language_loss": 0.79729378, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81892729, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8046875, + "step": 5303, + "time_per_iteration": 5.451193809509277 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.04810047, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.065727829234295, + "language_loss": 0.72734123, + "learning_rate": 3.186539603020047e-06, + "loss": 0.74916923, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 5304, + "time_per_iteration": 3.835230588912964 + }, + { + "auxiliary_loss_clip": 0.01126733, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.02546668, + "balance_loss_mlp": 1.04595399, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.8866410100018438, + "language_loss": 0.71773344, + "learning_rate": 3.186226062434068e-06, + "loss": 0.73939252, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80859375, + "step": 5305, + "time_per_iteration": 2.5330212116241455 + }, + { + "auxiliary_loss_clip": 0.01129402, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.02209806, + "balance_loss_mlp": 1.0472002, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.6861128411196662, + "language_loss": 0.64708328, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66873765, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5306, + "time_per_iteration": 2.4788570404052734 + }, + { + "auxiliary_loss_clip": 0.01135221, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.02714205, + "balance_loss_mlp": 1.05026746, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.161280639112344, + "language_loss": 0.79625881, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81803662, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5307, + "time_per_iteration": 2.5614371299743652 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02412832, + "balance_loss_mlp": 1.04311657, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.727529620646192, + "language_loss": 0.77898794, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.80062222, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 5308, + "time_per_iteration": 2.4443254470825195 + }, + { + "auxiliary_loss_clip": 0.01142678, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_clip": 1.03182518, + "balance_loss_mlp": 1.05046844, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 5.1649453810283426, + "language_loss": 0.74302876, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76494527, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 5309, + "time_per_iteration": 2.494800090789795 + }, + { + "auxiliary_loss_clip": 0.0112957, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.01998436, + "balance_loss_mlp": 1.04589248, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.754429841361115, + "language_loss": 0.82606339, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84770352, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5310, + "time_per_iteration": 2.4630603790283203 + }, + { + "auxiliary_loss_clip": 0.01129012, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.02762246, + "balance_loss_mlp": 1.04536486, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4405475768569584, + "language_loss": 0.78319013, + "learning_rate": 3.184343874716412e-06, + "loss": 0.8048929, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8359375, + "step": 5311, + "time_per_iteration": 2.5892724990844727 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01040389, + "balance_loss_clip": 1.02419996, + "balance_loss_mlp": 1.04695129, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.475613964939968, + "language_loss": 0.84316272, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86487615, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 5312, + "time_per_iteration": 2.4625802040100098 + }, + { + "auxiliary_loss_clip": 0.01137215, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.02808809, + "balance_loss_mlp": 1.0480628, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.3910939905221302, + "language_loss": 0.78584075, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 5313, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.02133918, + "balance_loss_mlp": 1.04814112, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.1643333364087582, + "language_loss": 0.85868084, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88036746, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5314, + "time_per_iteration": 2.4721946716308594 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02502346, + "balance_loss_mlp": 1.04725409, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7188296838329389, + "language_loss": 0.79836512, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82008839, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5315, + "time_per_iteration": 2.512554407119751 + }, + { + "auxiliary_loss_clip": 0.01135172, + "auxiliary_loss_mlp": 0.01049715, + "balance_loss_clip": 1.03331804, + "balance_loss_mlp": 1.0493269, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 6.566744634036759, + "language_loss": 0.67652613, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69837505, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5316, + "time_per_iteration": 2.4364819526672363 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.02105474, + "balance_loss_mlp": 1.04888916, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4751284993654519, + "language_loss": 0.69336772, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71505511, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84765625, + "step": 5317, + "time_per_iteration": 2.6055562496185303 + }, + { + "auxiliary_loss_clip": 0.01043016, + "auxiliary_loss_mlp": 0.0100349, + "balance_loss_clip": 1.00143993, + "balance_loss_mlp": 1.01474404, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7259742625655435, + "language_loss": 0.53048342, + "learning_rate": 3.182145945801628e-06, + "loss": 0.5509485, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.28320312, + "step": 5318, + "time_per_iteration": 3.200087308883667 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.02311563, + "balance_loss_mlp": 1.04900801, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.839211184718713, + "language_loss": 0.83865941, + "learning_rate": 3.181831776553012e-06, + "loss": 0.8603549, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5319, + "time_per_iteration": 2.471498966217041 + }, + { + "auxiliary_loss_clip": 0.01131434, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.0279578, + "balance_loss_mlp": 1.04728413, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.3959306603032393, + "language_loss": 0.63542199, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65716517, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5320, + "time_per_iteration": 2.5526087284088135 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.02528036, + "balance_loss_mlp": 1.04970324, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.9943779690432752, + "language_loss": 0.70519614, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 5321, + "time_per_iteration": 2.5262763500213623 + }, + { + "auxiliary_loss_clip": 0.01141108, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.04030156, + "balance_loss_mlp": 1.05110431, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.2234904552907238, + "language_loss": 0.86543447, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88741434, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 5322, + "time_per_iteration": 2.4432008266448975 + }, + { + "auxiliary_loss_clip": 0.01132235, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02119207, + "balance_loss_mlp": 1.04827893, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7854648356549414, + "language_loss": 0.82820231, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.84988427, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5323, + "time_per_iteration": 2.554539680480957 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02157664, + "balance_loss_mlp": 1.04700553, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8735349940723531, + "language_loss": 0.77858555, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.8002646, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5324, + "time_per_iteration": 2.452894687652588 + }, + { + "auxiliary_loss_clip": 0.0113163, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.04770339, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.8150910160625646, + "language_loss": 0.80162597, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82328951, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5325, + "time_per_iteration": 2.5261802673339844 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02690446, + "balance_loss_mlp": 1.04872847, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.8959189814779316, + "language_loss": 0.75171864, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77346826, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5326, + "time_per_iteration": 2.5300135612487793 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02285552, + "balance_loss_mlp": 1.04836321, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4421847054475023, + "language_loss": 0.80826092, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82993662, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5327, + "time_per_iteration": 2.5393614768981934 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04888535, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5386676468863185, + "language_loss": 0.77926928, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80099857, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5328, + "time_per_iteration": 2.471806287765503 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 1.04632294, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 2.9951100938200765, + "language_loss": 0.73971635, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76145625, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 5329, + "time_per_iteration": 2.52327561378479 + }, + { + "auxiliary_loss_clip": 0.01127399, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02012336, + "balance_loss_mlp": 1.04675198, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 2.060461898980319, + "language_loss": 0.71036464, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73197591, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8046875, + "step": 5330, + "time_per_iteration": 2.4405477046966553 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01049965, + "balance_loss_clip": 1.03343058, + "balance_loss_mlp": 1.0474323, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.7909305839918348, + "language_loss": 0.80022657, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82208663, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 5331, + "time_per_iteration": 2.5934245586395264 + }, + { + "auxiliary_loss_clip": 0.01037799, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.01001608, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8366333048595008, + "language_loss": 0.57806182, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59848487, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.27734375, + "step": 5332, + "time_per_iteration": 2.9984278678894043 + }, + { + "auxiliary_loss_clip": 0.01134361, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02450585, + "balance_loss_mlp": 1.04747975, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7943987990453594, + "language_loss": 0.73309821, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75483477, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.87109375, + "step": 5333, + "time_per_iteration": 2.554401159286499 + }, + { + "auxiliary_loss_clip": 0.01133668, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.02686942, + "balance_loss_mlp": 1.04836345, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.5896288664703238, + "language_loss": 0.71050882, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.73227012, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5334, + "time_per_iteration": 2.468472957611084 + }, + { + "auxiliary_loss_clip": 0.01132404, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.02658951, + "balance_loss_mlp": 1.04644001, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9528247502362917, + "language_loss": 0.77601135, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.797755, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5335, + "time_per_iteration": 2.524211883544922 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02519548, + "balance_loss_mlp": 1.04687452, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.5197552931214375, + "language_loss": 0.68353152, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70525241, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 5336, + "time_per_iteration": 2.5674326419830322 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01045646, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.04688144, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7063748564330914, + "language_loss": 0.7895453, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81131858, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5337, + "time_per_iteration": 2.5010595321655273 + }, + { + "auxiliary_loss_clip": 0.01124535, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.02194548, + "balance_loss_mlp": 1.04505002, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.7193225847880926, + "language_loss": 0.73997593, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76157737, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5338, + "time_per_iteration": 2.4961647987365723 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.02633142, + "balance_loss_mlp": 1.04477298, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.8336519924948942, + "language_loss": 0.63149244, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65323097, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5339, + "time_per_iteration": 2.5218987464904785 + }, + { + "auxiliary_loss_clip": 0.01130495, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.02409506, + "balance_loss_mlp": 1.04546928, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 1.814332726776551, + "language_loss": 0.81917858, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84087962, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5340, + "time_per_iteration": 2.427483558654785 + }, + { + "auxiliary_loss_clip": 0.0113181, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.02847123, + "balance_loss_mlp": 1.04696941, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.7172536004624983, + "language_loss": 0.7620244, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78377569, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 5341, + "time_per_iteration": 2.4785468578338623 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02154231, + "balance_loss_mlp": 1.04897809, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.9213308470980235, + "language_loss": 0.78627086, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.80794168, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5342, + "time_per_iteration": 2.4524106979370117 + }, + { + "auxiliary_loss_clip": 0.01133398, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02290499, + "balance_loss_mlp": 1.04772902, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.762302479650767, + "language_loss": 0.74934483, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77106899, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5343, + "time_per_iteration": 2.4744415283203125 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.02040279, + "balance_loss_mlp": 1.04623377, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.6135516142824962, + "language_loss": 0.82859504, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85026079, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5344, + "time_per_iteration": 2.47578763961792 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.02407503, + "balance_loss_mlp": 1.04474425, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.07297685310976, + "language_loss": 0.79812628, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81983352, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5345, + "time_per_iteration": 5.33889365196228 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.02149296, + "balance_loss_mlp": 1.04473424, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.8810220564208493, + "language_loss": 0.83404821, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85571885, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.859375, + "step": 5346, + "time_per_iteration": 2.500577688217163 + }, + { + "auxiliary_loss_clip": 0.01131977, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02362108, + "balance_loss_mlp": 1.04492784, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4095386913443633, + "language_loss": 0.81571388, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83742809, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 5347, + "time_per_iteration": 2.4491653442382812 + }, + { + "auxiliary_loss_clip": 0.01130206, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.04715562, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.9965712334987884, + "language_loss": 0.79898697, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82067955, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5348, + "time_per_iteration": 2.471261501312256 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.04691792, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 1.9690807455187813, + "language_loss": 0.8506968, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87250197, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5349, + "time_per_iteration": 2.4376416206359863 + }, + { + "auxiliary_loss_clip": 0.01130553, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.02215409, + "balance_loss_mlp": 1.04589188, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.7092259574450879, + "language_loss": 0.80862331, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83030069, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5350, + "time_per_iteration": 2.463998794555664 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.04548049, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.7709203173786705, + "language_loss": 0.79856229, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82025862, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 5351, + "time_per_iteration": 2.5017340183258057 + }, + { + "auxiliary_loss_clip": 0.01129171, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.02396047, + "balance_loss_mlp": 1.04505897, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.701097630272038, + "language_loss": 0.75491166, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77660662, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5352, + "time_per_iteration": 2.4916653633117676 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02179837, + "balance_loss_mlp": 1.0472436, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.8428416092094815, + "language_loss": 0.8174473, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83915108, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5353, + "time_per_iteration": 2.4554946422576904 + }, + { + "auxiliary_loss_clip": 0.01127699, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.02147865, + "balance_loss_mlp": 1.04577875, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.533417142425662, + "language_loss": 0.73054826, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75219929, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5354, + "time_per_iteration": 2.521679639816284 + }, + { + "auxiliary_loss_clip": 0.01129194, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.01830053, + "balance_loss_mlp": 1.04482782, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5056594732405602, + "language_loss": 0.8349731, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.8565954, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5355, + "time_per_iteration": 2.4590871334075928 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.0299834, + "balance_loss_mlp": 1.04840243, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.2450583198173737, + "language_loss": 0.71577442, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73757267, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 5356, + "time_per_iteration": 2.4499382972717285 + }, + { + "auxiliary_loss_clip": 0.01137452, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.0196538, + "balance_loss_mlp": 1.04720378, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5072162620412968, + "language_loss": 0.68480343, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70654052, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 5357, + "time_per_iteration": 2.449125289916992 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01002103, + "balance_loss_clip": 1.00029111, + "balance_loss_mlp": 1.01435876, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7023861387911429, + "language_loss": 0.58256829, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60301042, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.27734375, + "step": 5358, + "time_per_iteration": 3.1561930179595947 + }, + { + "auxiliary_loss_clip": 0.01130123, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.02506542, + "balance_loss_mlp": 1.04423356, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 5.918956850418863, + "language_loss": 0.83524048, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85695517, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5359, + "time_per_iteration": 2.4850337505340576 + }, + { + "auxiliary_loss_clip": 0.01132117, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.019122, + "balance_loss_mlp": 1.04514802, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.5557598040672038, + "language_loss": 0.79817981, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5360, + "time_per_iteration": 2.476698637008667 + }, + { + "auxiliary_loss_clip": 0.01040711, + "auxiliary_loss_mlp": 0.00999439, + "balance_loss_clip": 0.99754351, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.750004294413456, + "language_loss": 0.5697335, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59013498, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27539062, + "step": 5361, + "time_per_iteration": 2.933368444442749 + }, + { + "auxiliary_loss_clip": 0.01129938, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.02452111, + "balance_loss_mlp": 1.04625082, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.730134050345621, + "language_loss": 0.71349204, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73518884, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5362, + "time_per_iteration": 2.508444309234619 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.04685211, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6566995758494631, + "language_loss": 0.74008292, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76178837, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8359375, + "step": 5363, + "time_per_iteration": 2.530428409576416 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.02481735, + "balance_loss_mlp": 1.04535139, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.5112112412179624, + "language_loss": 0.77012563, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79187649, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 5364, + "time_per_iteration": 2.475532054901123 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.02747917, + "balance_loss_mlp": 1.04455853, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.1198351151285992, + "language_loss": 0.77043676, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79215652, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5365, + "time_per_iteration": 2.4466004371643066 + }, + { + "auxiliary_loss_clip": 0.01133051, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03444982, + "balance_loss_mlp": 1.04861832, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5183743876703555, + "language_loss": 0.76853883, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79036558, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5366, + "time_per_iteration": 2.4716286659240723 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04463363, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.6325357922005805, + "language_loss": 0.7200039, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74173188, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5367, + "time_per_iteration": 2.4936037063598633 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.02759588, + "balance_loss_mlp": 1.04335558, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.8801069032327764, + "language_loss": 0.7456941, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76737112, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5368, + "time_per_iteration": 2.436897039413452 + }, + { + "auxiliary_loss_clip": 0.01125271, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.02592432, + "balance_loss_mlp": 1.04390144, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.5502047591083525, + "language_loss": 0.79212499, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81378186, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5369, + "time_per_iteration": 2.516191244125366 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02042747, + "balance_loss_mlp": 1.04432988, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8370527927944635, + "language_loss": 0.83173579, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85333049, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5370, + "time_per_iteration": 2.423494338989258 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.02367377, + "balance_loss_mlp": 1.04524064, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 1.743608915284185, + "language_loss": 0.83372939, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85539752, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5371, + "time_per_iteration": 2.481677532196045 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01048903, + "balance_loss_clip": 1.0323211, + "balance_loss_mlp": 1.04514813, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.043238736788368, + "language_loss": 0.88539696, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90720367, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5372, + "time_per_iteration": 2.434785842895508 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.03011537, + "balance_loss_mlp": 1.04532862, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9701661898720624, + "language_loss": 0.73064935, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75238496, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5373, + "time_per_iteration": 2.509288787841797 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.0217371, + "balance_loss_mlp": 1.04496944, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.118108535598075, + "language_loss": 0.81306481, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83469176, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5374, + "time_per_iteration": 2.43719744682312 + }, + { + "auxiliary_loss_clip": 0.01122361, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.02135515, + "balance_loss_mlp": 1.04158425, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.0253542373007223, + "language_loss": 0.87507123, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89665556, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80859375, + "step": 5375, + "time_per_iteration": 2.5192272663116455 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04312396, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8491566525281582, + "language_loss": 0.75873786, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78040886, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5376, + "time_per_iteration": 2.463103771209717 + }, + { + "auxiliary_loss_clip": 0.01123814, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.04269242, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.5890241026671568, + "language_loss": 0.67173672, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69330645, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5377, + "time_per_iteration": 2.5341343879699707 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02922571, + "balance_loss_mlp": 1.04433763, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.5071806558198568, + "language_loss": 0.7231617, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74489522, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5378, + "time_per_iteration": 2.4838621616363525 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.0174818, + "balance_loss_mlp": 1.04056036, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9527598104570445, + "language_loss": 0.82083338, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84239388, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5379, + "time_per_iteration": 2.5433154106140137 + }, + { + "auxiliary_loss_clip": 0.01127314, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.04230165, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.9705325619840932, + "language_loss": 0.78379917, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80539739, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 5380, + "time_per_iteration": 2.5306878089904785 + }, + { + "auxiliary_loss_clip": 0.0112988, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.02207887, + "balance_loss_mlp": 1.04637241, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.5992937517204726, + "language_loss": 0.76871669, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79037952, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5381, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02228761, + "balance_loss_mlp": 1.04212475, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.912812068704809, + "language_loss": 0.71864545, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74021101, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5382, + "time_per_iteration": 2.488344430923462 + }, + { + "auxiliary_loss_clip": 0.01127382, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.0424943, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.8562908675977754, + "language_loss": 0.70752692, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72914088, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5383, + "time_per_iteration": 2.5236711502075195 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0259378, + "balance_loss_mlp": 1.0442363, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.094388352971362, + "language_loss": 0.78742963, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80905938, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 5384, + "time_per_iteration": 2.4685723781585693 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.0222249, + "balance_loss_mlp": 1.04443073, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.069351852322995, + "language_loss": 0.74553645, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76720881, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 5385, + "time_per_iteration": 2.46968936920166 + }, + { + "auxiliary_loss_clip": 0.01127931, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02101183, + "balance_loss_mlp": 1.04604125, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.8196037573439483, + "language_loss": 0.72068852, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74232352, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5386, + "time_per_iteration": 2.559480667114258 + }, + { + "auxiliary_loss_clip": 0.01128094, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.02119136, + "balance_loss_mlp": 1.04176617, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.8525904099951498, + "language_loss": 0.94343817, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96508765, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 5387, + "time_per_iteration": 5.378048896789551 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.02227962, + "balance_loss_mlp": 1.04373097, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.7647642243142747, + "language_loss": 0.77544433, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79709506, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5388, + "time_per_iteration": 2.4804563522338867 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01554048, + "balance_loss_mlp": 1.04277194, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.092216766577811, + "language_loss": 0.71867704, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74025786, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5389, + "time_per_iteration": 2.5753331184387207 + }, + { + "auxiliary_loss_clip": 0.01128194, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.0233078, + "balance_loss_mlp": 1.04672205, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 2.0374979548818497, + "language_loss": 0.80883735, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83050573, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 5390, + "time_per_iteration": 2.479557991027832 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.02911294, + "balance_loss_mlp": 1.04798484, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.0682587448682384, + "language_loss": 0.72983515, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75158268, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5391, + "time_per_iteration": 2.4689247608184814 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.02728176, + "balance_loss_mlp": 1.04465139, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.6356435132494873, + "language_loss": 0.77357036, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79523861, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5392, + "time_per_iteration": 2.4942643642425537 + }, + { + "auxiliary_loss_clip": 0.01129141, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04454243, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.035025217222515, + "language_loss": 0.62445068, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64614469, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5393, + "time_per_iteration": 2.5294058322906494 + }, + { + "auxiliary_loss_clip": 0.01127178, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.02292883, + "balance_loss_mlp": 1.0455395, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.541011228274946, + "language_loss": 0.8250984, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84674609, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5394, + "time_per_iteration": 2.5204803943634033 + }, + { + "auxiliary_loss_clip": 0.01125244, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.03089094, + "balance_loss_mlp": 1.04596353, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.8431569167236632, + "language_loss": 0.81585443, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83754981, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.79296875, + "step": 5395, + "time_per_iteration": 2.481722116470337 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.02844906, + "balance_loss_mlp": 1.04834461, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.644291671680186, + "language_loss": 0.83163011, + "learning_rate": 3.157507073287417e-06, + "loss": 0.8533138, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5396, + "time_per_iteration": 2.5014734268188477 + }, + { + "auxiliary_loss_clip": 0.01133358, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.02392137, + "balance_loss_mlp": 1.04687238, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.8637158339296453, + "language_loss": 0.75718713, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77891421, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5397, + "time_per_iteration": 2.475958824157715 + }, + { + "auxiliary_loss_clip": 0.01125578, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.01953566, + "balance_loss_mlp": 1.04540443, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.571224523552484, + "language_loss": 0.66835862, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68995398, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5398, + "time_per_iteration": 2.447065830230713 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.0183022, + "balance_loss_mlp": 1.04326463, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4279244162742584, + "language_loss": 0.73232102, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75389397, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8203125, + "step": 5399, + "time_per_iteration": 2.466137409210205 + }, + { + "auxiliary_loss_clip": 0.01129831, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.02016079, + "balance_loss_mlp": 1.04749155, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.110147681467196, + "language_loss": 0.71391356, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73556215, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5400, + "time_per_iteration": 2.484243631362915 + }, + { + "auxiliary_loss_clip": 0.01128373, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.04439175, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 3.048924003265154, + "language_loss": 0.79583031, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81746894, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5401, + "time_per_iteration": 2.5695505142211914 + }, + { + "auxiliary_loss_clip": 0.01130508, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.04700303, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4209306386542333, + "language_loss": 0.87675726, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89848959, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 5402, + "time_per_iteration": 2.4811201095581055 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.04369164, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.934597728175988, + "language_loss": 0.84513289, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86672628, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5403, + "time_per_iteration": 2.418501377105713 + }, + { + "auxiliary_loss_clip": 0.01129275, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.02852631, + "balance_loss_mlp": 1.05024314, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.0001546098828955, + "language_loss": 0.87642342, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89813483, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5404, + "time_per_iteration": 2.5094971656799316 + }, + { + "auxiliary_loss_clip": 0.01125879, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.02413273, + "balance_loss_mlp": 1.04579973, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6336968005079966, + "language_loss": 0.72491479, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74656296, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5405, + "time_per_iteration": 2.4927978515625 + }, + { + "auxiliary_loss_clip": 0.01125757, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.02147698, + "balance_loss_mlp": 1.04514825, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8637721662214948, + "language_loss": 0.83356953, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85518444, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80859375, + "step": 5406, + "time_per_iteration": 2.534508228302002 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.02241969, + "balance_loss_mlp": 1.0469048, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.836635199790601, + "language_loss": 0.8826412, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90428072, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5407, + "time_per_iteration": 2.4199326038360596 + }, + { + "auxiliary_loss_clip": 0.01127405, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.04602861, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.5140887230520799, + "language_loss": 0.69643426, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71806979, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5408, + "time_per_iteration": 2.5646731853485107 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.04438102, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.6429750268405912, + "language_loss": 0.77442145, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79608637, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 5409, + "time_per_iteration": 2.450200080871582 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.03142262, + "balance_loss_mlp": 1.04331136, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.3862040562488716, + "language_loss": 0.83582234, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85758531, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5410, + "time_per_iteration": 2.5161662101745605 + }, + { + "auxiliary_loss_clip": 0.01121858, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02089429, + "balance_loss_mlp": 1.04224813, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.5577179591930796, + "language_loss": 0.71270931, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73427641, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5411, + "time_per_iteration": 2.4465057849884033 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02544606, + "balance_loss_mlp": 1.04381669, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6024997274503978, + "language_loss": 0.83103073, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85267961, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.81640625, + "step": 5412, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.01129762, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.01963782, + "balance_loss_mlp": 1.04417348, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 2.3149031646834577, + "language_loss": 0.80794364, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82959628, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5413, + "time_per_iteration": 2.483309030532837 + }, + { + "auxiliary_loss_clip": 0.01128818, + "auxiliary_loss_mlp": 0.01042957, + "balance_loss_clip": 1.02723312, + "balance_loss_mlp": 1.04606462, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.5892127721025033, + "language_loss": 0.76887989, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79059768, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5414, + "time_per_iteration": 2.4696640968322754 + }, + { + "auxiliary_loss_clip": 0.01039619, + "auxiliary_loss_mlp": 0.01008091, + "balance_loss_clip": 1.00601661, + "balance_loss_mlp": 1.01271892, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9084647328862615, + "language_loss": 0.64009887, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66057593, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.26953125, + "step": 5415, + "time_per_iteration": 2.982389450073242 + }, + { + "auxiliary_loss_clip": 0.01124624, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.02214265, + "balance_loss_mlp": 1.04286838, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 2.942597496869342, + "language_loss": 0.74265057, + "learning_rate": 3.151146171224075e-06, + "loss": 0.764265, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5416, + "time_per_iteration": 2.526956558227539 + }, + { + "auxiliary_loss_clip": 0.01039656, + "auxiliary_loss_mlp": 0.01005548, + "balance_loss_clip": 1.00335431, + "balance_loss_mlp": 1.01254702, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7736939008633222, + "language_loss": 0.57947183, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59992385, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.26953125, + "step": 5417, + "time_per_iteration": 3.1500296592712402 + }, + { + "auxiliary_loss_clip": 0.01038219, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.0002141, + "balance_loss_mlp": 1.01140058, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.9133944403169288, + "language_loss": 0.63476181, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65516579, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.26953125, + "step": 5418, + "time_per_iteration": 3.1724026203155518 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.02739, + "balance_loss_mlp": 1.0441196, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 3.240595355482155, + "language_loss": 0.69061959, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71229619, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5419, + "time_per_iteration": 2.4643847942352295 + }, + { + "auxiliary_loss_clip": 0.01125895, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.01951957, + "balance_loss_mlp": 1.04326844, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 2.1209544014848443, + "language_loss": 0.77064359, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79225302, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5420, + "time_per_iteration": 2.5241270065307617 + }, + { + "auxiliary_loss_clip": 0.01128645, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.04400003, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.4823274263144444, + "language_loss": 0.80134791, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82298517, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5421, + "time_per_iteration": 2.5376439094543457 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02359045, + "balance_loss_mlp": 1.04254711, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5045024534641303, + "language_loss": 0.75446749, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77606434, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5422, + "time_per_iteration": 2.5713820457458496 + }, + { + "auxiliary_loss_clip": 0.01128336, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02324986, + "balance_loss_mlp": 1.04553628, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.780294141224906, + "language_loss": 0.62795889, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64963388, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5423, + "time_per_iteration": 2.4667959213256836 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.04085255, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 4.488088575635961, + "language_loss": 0.74664211, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76814055, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 5424, + "time_per_iteration": 2.488187313079834 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.02231038, + "balance_loss_mlp": 1.04298568, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6359586167011877, + "language_loss": 0.76958472, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79116821, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5425, + "time_per_iteration": 2.5025157928466797 + }, + { + "auxiliary_loss_clip": 0.01127865, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.03051138, + "balance_loss_mlp": 1.04193544, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 4.663874352034687, + "language_loss": 0.78857136, + "learning_rate": 3.147959166423428e-06, + "loss": 0.8103227, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5426, + "time_per_iteration": 2.484064817428589 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.02116871, + "balance_loss_mlp": 1.04324198, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.7688447582142532, + "language_loss": 0.74363142, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76525187, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.81640625, + "step": 5427, + "time_per_iteration": 2.4785962104797363 + }, + { + "auxiliary_loss_clip": 0.0112706, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_clip": 1.02742934, + "balance_loss_mlp": 1.04290414, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.911492416062928, + "language_loss": 0.79305124, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8147524, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83984375, + "step": 5428, + "time_per_iteration": 3.9864413738250732 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.02597678, + "balance_loss_mlp": 1.04084587, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7222830625250152, + "language_loss": 0.71369523, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73534036, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5429, + "time_per_iteration": 3.8856096267700195 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.02448976, + "balance_loss_mlp": 1.04308093, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.889570703315701, + "language_loss": 0.78612322, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80775696, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5430, + "time_per_iteration": 2.4374818801879883 + }, + { + "auxiliary_loss_clip": 0.01128219, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02077675, + "balance_loss_mlp": 1.04359281, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.8594684871120744, + "language_loss": 0.83897448, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86063492, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84765625, + "step": 5431, + "time_per_iteration": 2.4513139724731445 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.02431297, + "balance_loss_mlp": 1.04116321, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.7565110160676718, + "language_loss": 0.70459324, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72619462, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5432, + "time_per_iteration": 2.529365301132202 + }, + { + "auxiliary_loss_clip": 0.01123519, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.02182746, + "balance_loss_mlp": 1.04076195, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4205622330102, + "language_loss": 0.84161848, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86321318, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5433, + "time_per_iteration": 2.4302597045898438 + }, + { + "auxiliary_loss_clip": 0.01123612, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.02132881, + "balance_loss_mlp": 1.0439055, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.4699213962063424, + "language_loss": 0.85906386, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88065541, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 5434, + "time_per_iteration": 2.496676445007324 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.02083361, + "balance_loss_mlp": 1.04468119, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.8331918492971015, + "language_loss": 0.87817061, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89981961, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5435, + "time_per_iteration": 2.51159405708313 + }, + { + "auxiliary_loss_clip": 0.0112533, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.02140474, + "balance_loss_mlp": 1.04326773, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.5496215899058443, + "language_loss": 0.76460963, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78622043, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5436, + "time_per_iteration": 2.43637752532959 + }, + { + "auxiliary_loss_clip": 0.01125315, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02040625, + "balance_loss_mlp": 1.04435849, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.5905557916714361, + "language_loss": 0.72127515, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74287689, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5437, + "time_per_iteration": 2.493673086166382 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.02236819, + "balance_loss_mlp": 1.04143524, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.6336098458574233, + "language_loss": 0.64049256, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66214842, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 5438, + "time_per_iteration": 2.5062596797943115 + }, + { + "auxiliary_loss_clip": 0.01126505, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01943088, + "balance_loss_mlp": 1.04510128, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.5452802319075516, + "language_loss": 0.74544024, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76704717, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5439, + "time_per_iteration": 2.501279830932617 + }, + { + "auxiliary_loss_clip": 0.01126727, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.02985907, + "balance_loss_mlp": 1.04374349, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.6196339079167323, + "language_loss": 0.75183308, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77355272, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5440, + "time_per_iteration": 2.507341146469116 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.03317571, + "balance_loss_mlp": 1.04308057, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9066250681455874, + "language_loss": 0.84613734, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86785924, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5441, + "time_per_iteration": 2.4737346172332764 + }, + { + "auxiliary_loss_clip": 0.01126255, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.02743292, + "balance_loss_mlp": 1.04209113, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.9602585650153952, + "language_loss": 0.8673979, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88908899, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5442, + "time_per_iteration": 2.4779980182647705 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02193677, + "balance_loss_mlp": 1.04526424, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.8849886885636646, + "language_loss": 0.77500421, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79669178, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8515625, + "step": 5443, + "time_per_iteration": 2.5263850688934326 + }, + { + "auxiliary_loss_clip": 0.01126577, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02428412, + "balance_loss_mlp": 1.04207098, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.0180593262473487, + "language_loss": 0.81630802, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83796823, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5444, + "time_per_iteration": 2.447061061859131 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.02335095, + "balance_loss_mlp": 1.04356718, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.9587875585664523, + "language_loss": 0.59421074, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61585242, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5445, + "time_per_iteration": 2.4542667865753174 + }, + { + "auxiliary_loss_clip": 0.01128674, + "auxiliary_loss_mlp": 0.01039464, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.04482532, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.043321690225375, + "language_loss": 0.88286638, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90454781, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8359375, + "step": 5446, + "time_per_iteration": 2.4518625736236572 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.02577102, + "balance_loss_mlp": 1.04609275, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.9059445881205361, + "language_loss": 0.78455317, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80631441, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87109375, + "step": 5447, + "time_per_iteration": 2.488555669784546 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.02621138, + "balance_loss_mlp": 1.04297531, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.7948266966340543, + "language_loss": 0.73349774, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75515163, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.82421875, + "step": 5448, + "time_per_iteration": 2.460759162902832 + }, + { + "auxiliary_loss_clip": 0.01125074, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.02788281, + "balance_loss_mlp": 1.04221821, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.3797343272994427, + "language_loss": 0.66896623, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69065142, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5449, + "time_per_iteration": 2.5101547241210938 + }, + { + "auxiliary_loss_clip": 0.01125182, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.02111173, + "balance_loss_mlp": 1.04373384, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.3889431777217922, + "language_loss": 0.65617704, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67778659, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5450, + "time_per_iteration": 2.4815587997436523 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.02644145, + "balance_loss_mlp": 1.04330397, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5376267502191867, + "language_loss": 0.77276003, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.7944392, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5451, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.0112906, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02348745, + "balance_loss_mlp": 1.04470944, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.4373215337565015, + "language_loss": 0.7011131, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72279859, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5452, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01121729, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01944947, + "balance_loss_mlp": 1.04188132, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.7019757848824575, + "language_loss": 0.78734571, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80890715, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5453, + "time_per_iteration": 2.493701219558716 + }, + { + "auxiliary_loss_clip": 0.01126073, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.01610184, + "balance_loss_mlp": 1.04306984, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.2894918901687333, + "language_loss": 0.75428879, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77585566, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5454, + "time_per_iteration": 2.5295286178588867 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02382326, + "balance_loss_mlp": 1.04198301, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.0725507665811826, + "language_loss": 0.77059573, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79217887, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5455, + "time_per_iteration": 2.426988124847412 + }, + { + "auxiliary_loss_clip": 0.0112837, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02304173, + "balance_loss_mlp": 1.04281068, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.669914346129418, + "language_loss": 0.74029738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76197511, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.85546875, + "step": 5456, + "time_per_iteration": 2.512131929397583 + }, + { + "auxiliary_loss_clip": 0.01126084, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03444123, + "balance_loss_mlp": 1.04250574, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.518027485126158, + "language_loss": 0.78283882, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80459797, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5457, + "time_per_iteration": 2.4819135665893555 + }, + { + "auxiliary_loss_clip": 0.0112739, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.02432334, + "balance_loss_mlp": 1.04155684, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.199350012619834, + "language_loss": 0.79332864, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81499034, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5458, + "time_per_iteration": 2.4749457836151123 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.01988721, + "balance_loss_mlp": 1.04204702, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 4.694290331797846, + "language_loss": 0.72896576, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75055289, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5459, + "time_per_iteration": 2.4506032466888428 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.02303815, + "balance_loss_mlp": 1.04444695, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8402325574836436, + "language_loss": 0.84511495, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86677814, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5460, + "time_per_iteration": 2.521491527557373 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02176023, + "balance_loss_mlp": 1.0420599, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7736363390075318, + "language_loss": 0.76822042, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78982782, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.83203125, + "step": 5461, + "time_per_iteration": 2.4919962882995605 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02015376, + "balance_loss_mlp": 1.04589903, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.6989905310418616, + "language_loss": 0.62835252, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65001822, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 5462, + "time_per_iteration": 2.6128923892974854 + }, + { + "auxiliary_loss_clip": 0.0112585, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.04426169, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.8014296603715538, + "language_loss": 0.78155506, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80315304, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5463, + "time_per_iteration": 2.5255165100097656 + }, + { + "auxiliary_loss_clip": 0.0112647, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.02001238, + "balance_loss_mlp": 1.04409099, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.049558292675733, + "language_loss": 0.7029627, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72457188, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5464, + "time_per_iteration": 2.460951089859009 + }, + { + "auxiliary_loss_clip": 0.01127719, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02505457, + "balance_loss_mlp": 1.04683673, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6142145677103121, + "language_loss": 0.72746348, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74913716, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5465, + "time_per_iteration": 2.4767887592315674 + }, + { + "auxiliary_loss_clip": 0.01128882, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.02208447, + "balance_loss_mlp": 1.04690027, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.6282981827525145, + "language_loss": 0.82756901, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.84922415, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5466, + "time_per_iteration": 2.463127613067627 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.02343404, + "balance_loss_mlp": 1.04421949, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.6977355395672606, + "language_loss": 0.79485095, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81649983, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5467, + "time_per_iteration": 2.482245683670044 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.02011502, + "balance_loss_mlp": 1.0452255, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5356074654715184, + "language_loss": 0.74795353, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76958692, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5468, + "time_per_iteration": 2.4828743934631348 + }, + { + "auxiliary_loss_clip": 0.01136832, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.02467322, + "balance_loss_mlp": 1.04996455, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.8525214053644714, + "language_loss": 0.78469932, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8064791, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5469, + "time_per_iteration": 2.455672264099121 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.02008545, + "balance_loss_mlp": 1.04602098, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.646072726718358, + "language_loss": 0.82014406, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84178579, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5470, + "time_per_iteration": 5.531651020050049 + }, + { + "auxiliary_loss_clip": 0.0112936, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.02315605, + "balance_loss_mlp": 1.04359245, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.806312825179731, + "language_loss": 0.67675972, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69843686, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5471, + "time_per_iteration": 2.7400858402252197 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.04856122, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.6357076803377442, + "language_loss": 0.65059721, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67237478, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5472, + "time_per_iteration": 2.530604124069214 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.0271014, + "balance_loss_mlp": 1.04821706, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6631612231063349, + "language_loss": 0.88497955, + "learning_rate": 3.13292213457912e-06, + "loss": 0.9067443, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 5473, + "time_per_iteration": 2.521026611328125 + }, + { + "auxiliary_loss_clip": 0.01133162, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.02669442, + "balance_loss_mlp": 1.0483191, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 2.3087074790673423, + "language_loss": 0.78349268, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80525613, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 5474, + "time_per_iteration": 2.4769628047943115 + }, + { + "auxiliary_loss_clip": 0.01047146, + "auxiliary_loss_mlp": 0.00999487, + "balance_loss_clip": 0.99740046, + "balance_loss_mlp": 1.02056372, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.888273800575083, + "language_loss": 0.60237771, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62284404, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.265625, + "step": 5475, + "time_per_iteration": 3.039971351623535 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01051514, + "balance_loss_clip": 1.03437138, + "balance_loss_mlp": 1.04512429, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5350164106808766, + "language_loss": 0.76634103, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78818846, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5476, + "time_per_iteration": 2.488698959350586 + }, + { + "auxiliary_loss_clip": 0.01131587, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.02640307, + "balance_loss_mlp": 1.04819024, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.8435246505513339, + "language_loss": 0.74520677, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76693243, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5477, + "time_per_iteration": 2.533641815185547 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.02036786, + "balance_loss_mlp": 1.04507232, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 1.9138938380730264, + "language_loss": 0.75581098, + "learning_rate": 3.131316843357713e-06, + "loss": 0.7773999, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5478, + "time_per_iteration": 2.4541866779327393 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.02218664, + "balance_loss_mlp": 1.04736805, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.6780134795902322, + "language_loss": 0.80241555, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82407916, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5479, + "time_per_iteration": 2.5348050594329834 + }, + { + "auxiliary_loss_clip": 0.01046129, + "auxiliary_loss_mlp": 0.01003977, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.01921439, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7411588561506779, + "language_loss": 0.56543052, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58593154, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.26953125, + "step": 5480, + "time_per_iteration": 3.121812343597412 + }, + { + "auxiliary_loss_clip": 0.01128951, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02871847, + "balance_loss_mlp": 1.04606879, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.656023636160042, + "language_loss": 0.77029848, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79203057, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5481, + "time_per_iteration": 2.4819936752319336 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02028024, + "balance_loss_mlp": 1.04622722, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.8057287203311059, + "language_loss": 0.78732938, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80897224, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5482, + "time_per_iteration": 2.501615285873413 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02601135, + "balance_loss_mlp": 1.04573894, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.6414395423474737, + "language_loss": 0.74055123, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76226085, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5483, + "time_per_iteration": 2.5213518142700195 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0187676, + "balance_loss_mlp": 1.04614615, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8373674608308554, + "language_loss": 0.75627816, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77788723, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5484, + "time_per_iteration": 2.543795108795166 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04699099, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.1329507570753243, + "language_loss": 0.7209897, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74267334, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5485, + "time_per_iteration": 2.4598846435546875 + }, + { + "auxiliary_loss_clip": 0.01124565, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.04448354, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.7963509228415293, + "language_loss": 0.80416954, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8258158, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5486, + "time_per_iteration": 2.5368754863739014 + }, + { + "auxiliary_loss_clip": 0.011236, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.02264309, + "balance_loss_mlp": 1.04300976, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3473245188806056, + "language_loss": 0.84351611, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86512625, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5487, + "time_per_iteration": 2.5140841007232666 + }, + { + "auxiliary_loss_clip": 0.01131842, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.02440929, + "balance_loss_mlp": 1.04636502, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.289610395509379, + "language_loss": 0.74163198, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76335323, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5488, + "time_per_iteration": 2.4159257411956787 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.02519917, + "balance_loss_mlp": 1.04548192, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.3379517114480004, + "language_loss": 0.72564352, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74732298, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5489, + "time_per_iteration": 2.4810056686401367 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.04076719, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.5348585918072235, + "language_loss": 0.88752508, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90908241, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5490, + "time_per_iteration": 2.448437452316284 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.02022719, + "balance_loss_mlp": 1.0403626, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.9493471797358817, + "language_loss": 0.83395195, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85551059, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5491, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.01126063, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.02492499, + "balance_loss_mlp": 1.04421842, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.715750342336911, + "language_loss": 0.77514994, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79680943, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5492, + "time_per_iteration": 2.4870479106903076 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.0269649, + "balance_loss_mlp": 1.04629827, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.2776411561569265, + "language_loss": 0.7450884, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76683223, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5493, + "time_per_iteration": 2.4506607055664062 + }, + { + "auxiliary_loss_clip": 0.01045286, + "auxiliary_loss_mlp": 0.01012729, + "balance_loss_clip": 1.01074982, + "balance_loss_mlp": 1.01881337, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7955029917088393, + "language_loss": 0.53910893, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55968904, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.265625, + "step": 5494, + "time_per_iteration": 3.0042550563812256 + }, + { + "auxiliary_loss_clip": 0.01124159, + "auxiliary_loss_mlp": 0.01037133, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.04378355, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.6073630563578136, + "language_loss": 0.87087989, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89249277, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5495, + "time_per_iteration": 2.4716837406158447 + }, + { + "auxiliary_loss_clip": 0.01128875, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.03133559, + "balance_loss_mlp": 1.04508138, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 3.5655917637781784, + "language_loss": 0.73526418, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75703049, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8359375, + "step": 5496, + "time_per_iteration": 2.531670570373535 + }, + { + "auxiliary_loss_clip": 0.01124295, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.01509058, + "balance_loss_mlp": 1.04384971, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.1703031984353514, + "language_loss": 0.72764325, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74917477, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5497, + "time_per_iteration": 2.5148839950561523 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.04340625, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.5654673530164307, + "language_loss": 0.80193126, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82350206, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5498, + "time_per_iteration": 2.517765522003174 + }, + { + "auxiliary_loss_clip": 0.01123393, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02397776, + "balance_loss_mlp": 1.03977811, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1435474357237405, + "language_loss": 0.76491725, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78653955, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5499, + "time_per_iteration": 2.5006067752838135 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 1.04131985, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.506886865759599, + "language_loss": 0.79332948, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81487471, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5500, + "time_per_iteration": 2.4859495162963867 + }, + { + "auxiliary_loss_clip": 0.01129022, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01607347, + "balance_loss_mlp": 1.04564214, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.164639953437845, + "language_loss": 0.66065335, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68225485, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 5501, + "time_per_iteration": 2.6189892292022705 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.04285216, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.260615362067107, + "language_loss": 0.77580702, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79748642, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5502, + "time_per_iteration": 2.4086782932281494 + }, + { + "auxiliary_loss_clip": 0.01130061, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.04632545, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 2.045089737815956, + "language_loss": 0.72346115, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74515176, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8359375, + "step": 5503, + "time_per_iteration": 2.5176749229431152 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01865053, + "balance_loss_mlp": 1.04248357, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.5402224202893484, + "language_loss": 0.75216055, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77374506, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5504, + "time_per_iteration": 2.530212879180908 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02438283, + "balance_loss_mlp": 1.04382253, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.6148817370045387, + "language_loss": 0.70049053, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72214913, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5505, + "time_per_iteration": 2.5212292671203613 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.02720845, + "balance_loss_mlp": 1.04601455, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.586520967819923, + "language_loss": 0.81541443, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83709103, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5506, + "time_per_iteration": 2.5494561195373535 + }, + { + "auxiliary_loss_clip": 0.01128621, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.02277398, + "balance_loss_mlp": 1.04704857, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.929478423939084, + "language_loss": 0.79097712, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81264055, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5507, + "time_per_iteration": 2.498659610748291 + }, + { + "auxiliary_loss_clip": 0.01123401, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.02594829, + "balance_loss_mlp": 1.04136062, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.6667627205960738, + "language_loss": 0.71733725, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73897743, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5508, + "time_per_iteration": 2.478593111038208 + }, + { + "auxiliary_loss_clip": 0.01124563, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01566064, + "balance_loss_mlp": 1.04539418, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.030813517097255, + "language_loss": 0.72023594, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74177837, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5509, + "time_per_iteration": 2.539806842803955 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.01975, + "balance_loss_mlp": 1.04503942, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.5191607241878, + "language_loss": 0.73049426, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75209701, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5510, + "time_per_iteration": 2.536083698272705 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.0429213, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.1286159820346984, + "language_loss": 0.87371129, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89530391, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5511, + "time_per_iteration": 2.4380695819854736 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.01986468, + "balance_loss_mlp": 1.04396749, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6025966363766477, + "language_loss": 0.72926772, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7507937, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5512, + "time_per_iteration": 5.464786767959595 + }, + { + "auxiliary_loss_clip": 0.01124343, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.02759719, + "balance_loss_mlp": 1.04466701, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.8365879467062751, + "language_loss": 0.72230887, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.7439692, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5513, + "time_per_iteration": 2.6175873279571533 + }, + { + "auxiliary_loss_clip": 0.01128264, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.01972222, + "balance_loss_mlp": 1.04398656, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.8557947519919487, + "language_loss": 0.68629253, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70792234, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5514, + "time_per_iteration": 2.4340810775756836 + }, + { + "auxiliary_loss_clip": 0.01127749, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.0251019, + "balance_loss_mlp": 1.04505849, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.411486097564539, + "language_loss": 0.66439879, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.6860956, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5515, + "time_per_iteration": 2.4983339309692383 + }, + { + "auxiliary_loss_clip": 0.01124572, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01960468, + "balance_loss_mlp": 1.04258537, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.4970111675637168, + "language_loss": 0.69111156, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71270084, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5516, + "time_per_iteration": 2.515367031097412 + }, + { + "auxiliary_loss_clip": 0.0112502, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.02154398, + "balance_loss_mlp": 1.04021645, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.365933570102145, + "language_loss": 0.80287617, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82448685, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 5517, + "time_per_iteration": 2.5149497985839844 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.04258931, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 2.188422581245926, + "language_loss": 0.74551105, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76709294, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5518, + "time_per_iteration": 2.450188159942627 + }, + { + "auxiliary_loss_clip": 0.01048984, + "auxiliary_loss_mlp": 0.01008888, + "balance_loss_clip": 1.00682592, + "balance_loss_mlp": 1.02244139, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6172932492598038, + "language_loss": 0.54346693, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56404567, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.265625, + "step": 5519, + "time_per_iteration": 3.167750358581543 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.0239042, + "balance_loss_mlp": 1.0434345, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 3.8105825888408855, + "language_loss": 0.78854358, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81018245, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5520, + "time_per_iteration": 2.451781988143921 + }, + { + "auxiliary_loss_clip": 0.01121269, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01835227, + "balance_loss_mlp": 1.04244733, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.656623957411012, + "language_loss": 0.76576293, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78729689, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7890625, + "step": 5521, + "time_per_iteration": 2.525865077972412 + }, + { + "auxiliary_loss_clip": 0.01126792, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.02932894, + "balance_loss_mlp": 1.04259682, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 3.3004720611075964, + "language_loss": 0.70353854, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72525376, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5522, + "time_per_iteration": 2.472001791000366 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.01739514, + "balance_loss_mlp": 1.04362595, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7154852702320889, + "language_loss": 0.74052203, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76206541, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5523, + "time_per_iteration": 2.4924776554107666 + }, + { + "auxiliary_loss_clip": 0.01122263, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.0211792, + "balance_loss_mlp": 1.04308188, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.6905303226226114, + "language_loss": 0.82272083, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84430826, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 5524, + "time_per_iteration": 2.439711332321167 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.0251627, + "balance_loss_mlp": 1.04402184, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6540586406432352, + "language_loss": 0.8307848, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85240501, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.79296875, + "step": 5525, + "time_per_iteration": 2.4927310943603516 + }, + { + "auxiliary_loss_clip": 0.01044531, + "auxiliary_loss_mlp": 0.01006175, + "balance_loss_clip": 1.00405347, + "balance_loss_mlp": 1.01804781, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7716933739699889, + "language_loss": 0.5260945, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54660153, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.265625, + "step": 5526, + "time_per_iteration": 3.0598835945129395 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.02724671, + "balance_loss_mlp": 1.04371929, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.1037159361855737, + "language_loss": 0.77490491, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79659784, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 5527, + "time_per_iteration": 2.4878480434417725 + }, + { + "auxiliary_loss_clip": 0.01126946, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.03025246, + "balance_loss_mlp": 1.04651201, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 2.9813221594214494, + "language_loss": 0.72143763, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74314719, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5528, + "time_per_iteration": 2.4562795162200928 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02256346, + "balance_loss_mlp": 1.04463542, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.7054310511699202, + "language_loss": 0.82638806, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5529, + "time_per_iteration": 2.474243640899658 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.02223659, + "balance_loss_mlp": 1.04554248, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.9738718949190572, + "language_loss": 0.69718957, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71884924, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83984375, + "step": 5530, + "time_per_iteration": 2.471686840057373 + }, + { + "auxiliary_loss_clip": 0.01127236, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02996182, + "balance_loss_mlp": 1.04500127, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.4616968900166643, + "language_loss": 0.7616601, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78338665, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5531, + "time_per_iteration": 2.473328113555908 + }, + { + "auxiliary_loss_clip": 0.01128043, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.04481292, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7553607817915955, + "language_loss": 0.73413068, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75578588, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5532, + "time_per_iteration": 2.4864931106567383 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.01321709, + "balance_loss_mlp": 1.04721618, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.2280638741168057, + "language_loss": 0.65813714, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67969465, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8203125, + "step": 5533, + "time_per_iteration": 2.5232229232788086 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.02541876, + "balance_loss_mlp": 1.04451632, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.9248590192503388, + "language_loss": 0.70790148, + "learning_rate": 3.113264663362451e-06, + "loss": 0.72957367, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5534, + "time_per_iteration": 2.418875217437744 + }, + { + "auxiliary_loss_clip": 0.01125629, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01890588, + "balance_loss_mlp": 1.04565191, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.8142926842561948, + "language_loss": 0.6684956, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69008601, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5535, + "time_per_iteration": 2.5031726360321045 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02450383, + "balance_loss_mlp": 1.04416704, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.1308907042960525, + "language_loss": 0.72915065, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75080466, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5536, + "time_per_iteration": 2.494007110595703 + }, + { + "auxiliary_loss_clip": 0.01126771, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02474046, + "balance_loss_mlp": 1.0450089, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.6653416647198893, + "language_loss": 0.81801486, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83966869, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5537, + "time_per_iteration": 2.611788272857666 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.02805638, + "balance_loss_mlp": 1.04771638, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.938500745409862, + "language_loss": 0.71606827, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73780894, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83984375, + "step": 5538, + "time_per_iteration": 2.538574695587158 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01642489, + "balance_loss_mlp": 1.04461074, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.0173985756025417, + "language_loss": 0.7442342, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76578778, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8046875, + "step": 5539, + "time_per_iteration": 2.539393424987793 + }, + { + "auxiliary_loss_clip": 0.01132315, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.03062367, + "balance_loss_mlp": 1.04543138, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.8798801752229715, + "language_loss": 0.70726681, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.72904468, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5540, + "time_per_iteration": 2.460745096206665 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.02156138, + "balance_loss_mlp": 1.04151917, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.212860979219503, + "language_loss": 0.60678709, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62837738, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5541, + "time_per_iteration": 2.643308162689209 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.0256207, + "balance_loss_mlp": 1.04428339, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.7250198470895146, + "language_loss": 0.68636936, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70806885, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 5542, + "time_per_iteration": 2.472029209136963 + }, + { + "auxiliary_loss_clip": 0.0112742, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02362895, + "balance_loss_mlp": 1.04488277, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6472310915335262, + "language_loss": 0.75526464, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77691472, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5543, + "time_per_iteration": 2.453550100326538 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02342415, + "balance_loss_mlp": 1.04834402, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.6694578175563026, + "language_loss": 0.75282717, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77452493, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5544, + "time_per_iteration": 2.486992835998535 + }, + { + "auxiliary_loss_clip": 0.01124934, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01910329, + "balance_loss_mlp": 1.04350412, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.4864809930890506, + "language_loss": 0.70886022, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73044181, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5545, + "time_per_iteration": 2.5813279151916504 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.02333164, + "balance_loss_mlp": 1.04530168, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.7150542013191912, + "language_loss": 0.69300294, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7146256, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5546, + "time_per_iteration": 2.4564788341522217 + }, + { + "auxiliary_loss_clip": 0.01129119, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04343665, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.6632006519185205, + "language_loss": 0.64804697, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66971648, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5547, + "time_per_iteration": 2.554959774017334 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.01782918, + "balance_loss_mlp": 1.0467664, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.454082693277369, + "language_loss": 0.856148, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87773478, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.8125, + "step": 5548, + "time_per_iteration": 2.451032876968384 + }, + { + "auxiliary_loss_clip": 0.01129139, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.02100003, + "balance_loss_mlp": 1.04508662, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.024965729715467, + "language_loss": 0.74754196, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76919919, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 5549, + "time_per_iteration": 2.6875991821289062 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.02362955, + "balance_loss_mlp": 1.04486775, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.8150391856089545, + "language_loss": 0.68361247, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70528769, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83984375, + "step": 5550, + "time_per_iteration": 2.640758752822876 + }, + { + "auxiliary_loss_clip": 0.0112866, + "auxiliary_loss_mlp": 0.01039899, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.04545677, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.742869766825136, + "language_loss": 0.60666394, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62834954, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.83203125, + "step": 5551, + "time_per_iteration": 2.454871654510498 + }, + { + "auxiliary_loss_clip": 0.01127389, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.0459497, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6119589143573256, + "language_loss": 0.70450759, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72618788, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5552, + "time_per_iteration": 2.4226949214935303 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02297902, + "balance_loss_mlp": 1.04462051, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.0022942324560145, + "language_loss": 0.8289907, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85063589, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.828125, + "step": 5553, + "time_per_iteration": 3.8951358795166016 + }, + { + "auxiliary_loss_clip": 0.01128647, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.04528964, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.095475541363027, + "language_loss": 0.81220448, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83385921, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83203125, + "step": 5554, + "time_per_iteration": 3.8097896575927734 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.02811968, + "balance_loss_mlp": 1.0457983, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4459560856203526, + "language_loss": 0.81277251, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83448291, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5555, + "time_per_iteration": 2.51686954498291 + }, + { + "auxiliary_loss_clip": 0.01126865, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02469552, + "balance_loss_mlp": 1.04441357, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.713035899616047, + "language_loss": 0.74563497, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76728898, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.82421875, + "step": 5556, + "time_per_iteration": 2.550630807876587 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.02459431, + "balance_loss_mlp": 1.04586554, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.4096864083862861, + "language_loss": 0.82588691, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84755093, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5557, + "time_per_iteration": 2.498108148574829 + }, + { + "auxiliary_loss_clip": 0.01129625, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.03055513, + "balance_loss_mlp": 1.04486346, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.7414701325609587, + "language_loss": 0.80056083, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82230997, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84765625, + "step": 5558, + "time_per_iteration": 2.5519607067108154 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02159929, + "balance_loss_mlp": 1.04537535, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.595273660638049, + "language_loss": 0.81953323, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84117764, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.84375, + "step": 5559, + "time_per_iteration": 2.5202248096466064 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02384293, + "balance_loss_mlp": 1.04450536, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.784570608011319, + "language_loss": 0.72027284, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74191785, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5560, + "time_per_iteration": 2.453016757965088 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03118193, + "balance_loss_mlp": 1.04679513, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 2.584817000325422, + "language_loss": 0.74888778, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77068788, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5561, + "time_per_iteration": 2.526980400085449 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02410603, + "balance_loss_mlp": 1.04610825, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.2689753945529176, + "language_loss": 0.69638503, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71806127, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5562, + "time_per_iteration": 2.483530282974243 + }, + { + "auxiliary_loss_clip": 0.01127212, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.02821374, + "balance_loss_mlp": 1.04549575, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.5595683236821118, + "language_loss": 0.65407914, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67576528, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8203125, + "step": 5563, + "time_per_iteration": 2.489734649658203 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.027843, + "balance_loss_mlp": 1.0464654, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 3.650208894964183, + "language_loss": 0.74457055, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5564, + "time_per_iteration": 2.7312686443328857 + }, + { + "auxiliary_loss_clip": 0.01049511, + "auxiliary_loss_mlp": 0.00999253, + "balance_loss_clip": 0.99735802, + "balance_loss_mlp": 1.02280784, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7800603717209338, + "language_loss": 0.55489159, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57537925, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.265625, + "step": 5565, + "time_per_iteration": 3.0266246795654297 + }, + { + "auxiliary_loss_clip": 0.01126829, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02271366, + "balance_loss_mlp": 1.04589689, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7346222757402157, + "language_loss": 0.64754677, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66918564, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80859375, + "step": 5566, + "time_per_iteration": 2.5819363594055176 + }, + { + "auxiliary_loss_clip": 0.01128946, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0259037, + "balance_loss_mlp": 1.04706717, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.73011072762743, + "language_loss": 0.77735972, + "learning_rate": 3.102564641030016e-06, + "loss": 0.7990548, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5567, + "time_per_iteration": 2.508108377456665 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.02480745, + "balance_loss_mlp": 1.04583585, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.719738804733239, + "language_loss": 0.76512182, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78683186, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5568, + "time_per_iteration": 2.4344217777252197 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02973104, + "balance_loss_mlp": 1.04528308, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.265483767853782, + "language_loss": 0.71277773, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73452842, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5569, + "time_per_iteration": 2.462592840194702 + }, + { + "auxiliary_loss_clip": 0.0112772, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.04275155, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.859999754882374, + "language_loss": 0.90291858, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9245472, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5570, + "time_per_iteration": 2.432124614715576 + }, + { + "auxiliary_loss_clip": 0.0112712, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01634383, + "balance_loss_mlp": 1.04461455, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.7333982724081918, + "language_loss": 0.80038494, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82196403, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5571, + "time_per_iteration": 2.52752947807312 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.00998336, + "balance_loss_clip": 0.99651235, + "balance_loss_mlp": 1.01880455, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.9063074837999179, + "language_loss": 0.55948162, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5799194, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5572, + "time_per_iteration": 3.0247979164123535 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.03212237, + "balance_loss_mlp": 1.04797339, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.028320341949736, + "language_loss": 0.78112698, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80290151, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5573, + "time_per_iteration": 2.5152878761291504 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_clip": 1.03143215, + "balance_loss_mlp": 1.04525197, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1279768530108503, + "language_loss": 0.72473001, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7465024, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5574, + "time_per_iteration": 2.543531656265259 + }, + { + "auxiliary_loss_clip": 0.01125319, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.017308, + "balance_loss_mlp": 1.04292774, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 2.78085640379241, + "language_loss": 0.87911499, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90068293, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.82421875, + "step": 5575, + "time_per_iteration": 2.546952724456787 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02899647, + "balance_loss_mlp": 1.04479516, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.569353520757799, + "language_loss": 0.82441479, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84619927, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5576, + "time_per_iteration": 2.414294958114624 + }, + { + "auxiliary_loss_clip": 0.01129312, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.0286808, + "balance_loss_mlp": 1.043697, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 3.008815557703919, + "language_loss": 0.73384887, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75559115, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 5577, + "time_per_iteration": 2.50136399269104 + }, + { + "auxiliary_loss_clip": 0.01131921, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.02667177, + "balance_loss_mlp": 1.04811549, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.7225109171896533, + "language_loss": 0.81555498, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8372944, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5578, + "time_per_iteration": 2.431365728378296 + }, + { + "auxiliary_loss_clip": 0.01125183, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.02277184, + "balance_loss_mlp": 1.04578936, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.8947087551065327, + "language_loss": 0.71785814, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73948246, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 5579, + "time_per_iteration": 2.4519495964050293 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.02191353, + "balance_loss_mlp": 1.0456152, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.0306401350469225, + "language_loss": 0.81084043, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83252287, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5580, + "time_per_iteration": 2.427481174468994 + }, + { + "auxiliary_loss_clip": 0.01130056, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.04496789, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.8687829543354073, + "language_loss": 0.77912092, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80078757, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5581, + "time_per_iteration": 2.5320229530334473 + }, + { + "auxiliary_loss_clip": 0.01132086, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.02735782, + "balance_loss_mlp": 1.04367673, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 5.02896087449, + "language_loss": 0.74623251, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76800376, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 5582, + "time_per_iteration": 2.421482801437378 + }, + { + "auxiliary_loss_clip": 0.0113015, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.02528524, + "balance_loss_mlp": 1.04456937, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.790512330860928, + "language_loss": 0.82143587, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84315073, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 5583, + "time_per_iteration": 2.4543566703796387 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01044573, + "balance_loss_clip": 1.02991009, + "balance_loss_mlp": 1.04491317, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9267692381394996, + "language_loss": 0.7779209, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79964256, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5584, + "time_per_iteration": 2.6100947856903076 + }, + { + "auxiliary_loss_clip": 0.01129164, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02343249, + "balance_loss_mlp": 1.04359186, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.4758908421399493, + "language_loss": 0.75978506, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78145868, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.85546875, + "step": 5585, + "time_per_iteration": 2.4898715019226074 + }, + { + "auxiliary_loss_clip": 0.01121936, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02170694, + "balance_loss_mlp": 1.04066801, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.4987207146888684, + "language_loss": 0.77731383, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79890364, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5586, + "time_per_iteration": 2.4825005531311035 + }, + { + "auxiliary_loss_clip": 0.01136236, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.03070199, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6235624689574053, + "language_loss": 0.81027555, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83212399, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8828125, + "step": 5587, + "time_per_iteration": 2.486459493637085 + }, + { + "auxiliary_loss_clip": 0.01125436, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.0270915, + "balance_loss_mlp": 1.04548144, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7952449023594161, + "language_loss": 0.67014575, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69180894, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 5588, + "time_per_iteration": 2.435070753097534 + }, + { + "auxiliary_loss_clip": 0.01130516, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02784824, + "balance_loss_mlp": 1.04568088, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.6839710852868943, + "language_loss": 0.69882601, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72057241, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5589, + "time_per_iteration": 2.548051118850708 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.02709961, + "balance_loss_mlp": 1.04461861, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 2.1328325025080987, + "language_loss": 0.66886735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69060349, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 5590, + "time_per_iteration": 2.4735047817230225 + }, + { + "auxiliary_loss_clip": 0.01126204, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.02993059, + "balance_loss_mlp": 1.04570127, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.8322479695472769, + "language_loss": 0.73409903, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75581712, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 5591, + "time_per_iteration": 2.4736244678497314 + }, + { + "auxiliary_loss_clip": 0.01127166, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.04408562, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.9183925576882788, + "language_loss": 0.69446647, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.71615982, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5592, + "time_per_iteration": 2.4232676029205322 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.02366149, + "balance_loss_mlp": 1.0442183, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.4700576130478367, + "language_loss": 0.76281321, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78444564, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5593, + "time_per_iteration": 2.4856812953948975 + }, + { + "auxiliary_loss_clip": 0.01128845, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.02703261, + "balance_loss_mlp": 1.04333365, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.2267028217655516, + "language_loss": 0.71435678, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73609149, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8515625, + "step": 5594, + "time_per_iteration": 2.437554359436035 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.02985501, + "balance_loss_mlp": 1.04690135, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.637052204404589, + "language_loss": 0.80350173, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82528448, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5595, + "time_per_iteration": 5.51651668548584 + }, + { + "auxiliary_loss_clip": 0.0112868, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.01964831, + "balance_loss_mlp": 1.04542542, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.8244163047079407, + "language_loss": 0.81611145, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83773112, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5596, + "time_per_iteration": 2.4959781169891357 + }, + { + "auxiliary_loss_clip": 0.01128091, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.02508509, + "balance_loss_mlp": 1.04461718, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.7014468319312177, + "language_loss": 0.76001227, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78168839, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5597, + "time_per_iteration": 2.4965333938598633 + }, + { + "auxiliary_loss_clip": 0.01126223, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.0233258, + "balance_loss_mlp": 1.04597533, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.8007239192940239, + "language_loss": 0.78937811, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81101304, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 5598, + "time_per_iteration": 2.587813377380371 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02036011, + "balance_loss_mlp": 1.04606342, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4664560154247552, + "language_loss": 0.64197004, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66366023, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 5599, + "time_per_iteration": 2.647618293762207 + }, + { + "auxiliary_loss_clip": 0.0113527, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.02685726, + "balance_loss_mlp": 1.0468514, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.652004853610392, + "language_loss": 0.8172245, + "learning_rate": 3.091819088459249e-06, + "loss": 0.83900994, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 5600, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01050016, + "balance_loss_clip": 1.03369582, + "balance_loss_mlp": 1.04399288, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 3.359102963412802, + "language_loss": 0.82717538, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84898043, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 5601, + "time_per_iteration": 2.4369428157806396 + }, + { + "auxiliary_loss_clip": 0.01127768, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.01911497, + "balance_loss_mlp": 1.04890418, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.6511579237160083, + "language_loss": 0.82726496, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.84887075, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5602, + "time_per_iteration": 2.463291645050049 + }, + { + "auxiliary_loss_clip": 0.01130933, + "auxiliary_loss_mlp": 0.01055384, + "balance_loss_clip": 1.04039955, + "balance_loss_mlp": 1.04712546, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.700541242008466, + "language_loss": 0.70208776, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72395098, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5603, + "time_per_iteration": 2.4309756755828857 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.02392292, + "balance_loss_mlp": 1.04724145, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.625433979180813, + "language_loss": 0.82925308, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8509745, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.86328125, + "step": 5604, + "time_per_iteration": 2.4980010986328125 + }, + { + "auxiliary_loss_clip": 0.01129789, + "auxiliary_loss_mlp": 0.01042861, + "balance_loss_clip": 1.02782226, + "balance_loss_mlp": 1.0447166, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 3.2518642032613654, + "language_loss": 0.73756403, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75929046, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 5605, + "time_per_iteration": 2.4563212394714355 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02520752, + "balance_loss_mlp": 1.04604197, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.772980532366942, + "language_loss": 0.83487791, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85660958, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 5606, + "time_per_iteration": 2.456441640853882 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02549469, + "balance_loss_mlp": 1.0414753, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.7790448991820722, + "language_loss": 0.67335433, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69499022, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5607, + "time_per_iteration": 2.4964821338653564 + }, + { + "auxiliary_loss_clip": 0.01130916, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.02694631, + "balance_loss_mlp": 1.04507923, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.032375572186737, + "language_loss": 0.71093041, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 5608, + "time_per_iteration": 2.5247933864593506 + }, + { + "auxiliary_loss_clip": 0.01128549, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0236311, + "balance_loss_mlp": 1.0446682, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.8968208773724307, + "language_loss": 0.79062563, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83984375, + "step": 5609, + "time_per_iteration": 2.439502477645874 + }, + { + "auxiliary_loss_clip": 0.01129667, + "auxiliary_loss_mlp": 0.01042877, + "balance_loss_clip": 1.02706969, + "balance_loss_mlp": 1.04544735, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 2.0456898754189354, + "language_loss": 0.82218611, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84391159, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5610, + "time_per_iteration": 2.502028226852417 + }, + { + "auxiliary_loss_clip": 0.01123686, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.04264688, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8264685829582996, + "language_loss": 0.81998217, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84162486, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5611, + "time_per_iteration": 2.4255177974700928 + }, + { + "auxiliary_loss_clip": 0.01130986, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02728975, + "balance_loss_mlp": 1.04550552, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.5753494383615703, + "language_loss": 0.79407716, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81583023, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5612, + "time_per_iteration": 2.537048578262329 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.02212596, + "balance_loss_mlp": 1.04021907, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.519050824799004, + "language_loss": 0.70024467, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72185683, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5613, + "time_per_iteration": 2.570373773574829 + }, + { + "auxiliary_loss_clip": 0.01129945, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.02203548, + "balance_loss_mlp": 1.04490113, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.6646408753448763, + "language_loss": 0.79615057, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81782216, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5614, + "time_per_iteration": 2.4379053115844727 + }, + { + "auxiliary_loss_clip": 0.01126744, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02331161, + "balance_loss_mlp": 1.04260945, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.8534958586083128, + "language_loss": 0.90879035, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93045861, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5615, + "time_per_iteration": 2.4876632690429688 + }, + { + "auxiliary_loss_clip": 0.0112252, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.04105914, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6970154369052728, + "language_loss": 0.80636102, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82798827, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5616, + "time_per_iteration": 2.476569175720215 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04379678, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.5053489219363754, + "language_loss": 0.84079826, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86255258, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 5617, + "time_per_iteration": 2.4204065799713135 + }, + { + "auxiliary_loss_clip": 0.01125211, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.02190411, + "balance_loss_mlp": 1.04171932, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.648273719366553, + "language_loss": 0.80173457, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82335079, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5618, + "time_per_iteration": 2.4789302349090576 + }, + { + "auxiliary_loss_clip": 0.01128326, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.01895535, + "balance_loss_mlp": 1.04367077, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9548255306646998, + "language_loss": 0.70458674, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72621119, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5619, + "time_per_iteration": 2.4674489498138428 + }, + { + "auxiliary_loss_clip": 0.01127452, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_clip": 1.0322814, + "balance_loss_mlp": 1.04403424, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 5.009208052913787, + "language_loss": 0.69223797, + "learning_rate": 3.085284660993821e-06, + "loss": 0.7139833, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5620, + "time_per_iteration": 2.475889205932617 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01046185, + "balance_loss_clip": 1.03159392, + "balance_loss_mlp": 1.04497766, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.0914960236262075, + "language_loss": 0.67498147, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69671446, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5621, + "time_per_iteration": 2.4732306003570557 + }, + { + "auxiliary_loss_clip": 0.01124388, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.02258897, + "balance_loss_mlp": 1.04336381, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.811430245584347, + "language_loss": 0.82714671, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84875631, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 5622, + "time_per_iteration": 2.5028531551361084 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.0279355, + "balance_loss_mlp": 1.04111528, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4271980952069887, + "language_loss": 0.73785996, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75950313, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5623, + "time_per_iteration": 2.483354091644287 + }, + { + "auxiliary_loss_clip": 0.01044412, + "auxiliary_loss_mlp": 0.01001556, + "balance_loss_clip": 0.99976796, + "balance_loss_mlp": 1.01787817, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7308868621653948, + "language_loss": 0.54898107, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56944072, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.265625, + "step": 5624, + "time_per_iteration": 3.2154293060302734 + }, + { + "auxiliary_loss_clip": 0.01128701, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.02536166, + "balance_loss_mlp": 1.04464245, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.114382300094, + "language_loss": 0.73013008, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75182486, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5625, + "time_per_iteration": 2.4632089138031006 + }, + { + "auxiliary_loss_clip": 0.01129587, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02315879, + "balance_loss_mlp": 1.04408085, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.7442247016960708, + "language_loss": 0.70501375, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72669238, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5626, + "time_per_iteration": 2.4782652854919434 + }, + { + "auxiliary_loss_clip": 0.01123049, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.04265583, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.496721640957227, + "language_loss": 0.81184483, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83341312, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5627, + "time_per_iteration": 2.48683762550354 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.02332532, + "balance_loss_mlp": 1.04643917, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.112092075284961, + "language_loss": 0.80725849, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82897604, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5628, + "time_per_iteration": 2.485978841781616 + }, + { + "auxiliary_loss_clip": 0.01125942, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.01716328, + "balance_loss_mlp": 1.04272234, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9378827683544937, + "language_loss": 0.77360773, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79518872, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 5629, + "time_per_iteration": 2.459749937057495 + }, + { + "auxiliary_loss_clip": 0.0112767, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02604353, + "balance_loss_mlp": 1.0426172, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.750727836719773, + "language_loss": 0.84873146, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87043452, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.84765625, + "step": 5630, + "time_per_iteration": 2.502168655395508 + }, + { + "auxiliary_loss_clip": 0.01128287, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02593017, + "balance_loss_mlp": 1.04496086, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.44277401951878, + "language_loss": 0.71778762, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73946661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5631, + "time_per_iteration": 2.4541988372802734 + }, + { + "auxiliary_loss_clip": 0.01044995, + "auxiliary_loss_mlp": 0.01006836, + "balance_loss_clip": 1.0050118, + "balance_loss_mlp": 1.01844144, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.84858361279948, + "language_loss": 0.56171906, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58223736, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5632, + "time_per_iteration": 3.130112409591675 + }, + { + "auxiliary_loss_clip": 0.01126092, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01925933, + "balance_loss_mlp": 1.04301071, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.4746675536042473, + "language_loss": 0.80288029, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82448882, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5633, + "time_per_iteration": 2.4772210121154785 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01671278, + "balance_loss_mlp": 1.04355168, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.3860801146544692, + "language_loss": 0.59222949, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61380345, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5634, + "time_per_iteration": 2.490783214569092 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.01930678, + "balance_loss_mlp": 1.04328096, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.758176339753219, + "language_loss": 0.92591304, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94749641, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5635, + "time_per_iteration": 2.4895272254943848 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.01594758, + "balance_loss_mlp": 1.04428411, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7397877385381144, + "language_loss": 0.74791968, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.76945299, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5636, + "time_per_iteration": 2.4868686199188232 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.02264357, + "balance_loss_mlp": 1.04291928, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.533650755617547, + "language_loss": 0.83216572, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85377115, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5637, + "time_per_iteration": 5.43249249458313 + }, + { + "auxiliary_loss_clip": 0.0112926, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.02837586, + "balance_loss_mlp": 1.04624391, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.6200031021198193, + "language_loss": 0.70037901, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72211778, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5638, + "time_per_iteration": 2.430814504623413 + }, + { + "auxiliary_loss_clip": 0.01128885, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.02993131, + "balance_loss_mlp": 1.0461942, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.644027939558444, + "language_loss": 0.80699074, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82872897, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5639, + "time_per_iteration": 2.5219810009002686 + }, + { + "auxiliary_loss_clip": 0.01129363, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.02714002, + "balance_loss_mlp": 1.044734, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.006873412015597, + "language_loss": 0.67907631, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70079535, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5640, + "time_per_iteration": 2.4252562522888184 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.02229738, + "balance_loss_mlp": 1.0432744, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.561334672972187, + "language_loss": 0.70158339, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72319156, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5641, + "time_per_iteration": 2.4703073501586914 + }, + { + "auxiliary_loss_clip": 0.01129782, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.02881122, + "balance_loss_mlp": 1.04692698, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7323035027878293, + "language_loss": 0.87336594, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89509839, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5642, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01119376, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.04361117, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.899951429632433, + "language_loss": 0.83783317, + "learning_rate": 3.077749724868924e-06, + "loss": 0.85933256, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 5643, + "time_per_iteration": 2.454176902770996 + }, + { + "auxiliary_loss_clip": 0.01122874, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02779329, + "balance_loss_mlp": 1.04303253, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6286036888414737, + "language_loss": 0.76940101, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79104799, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5644, + "time_per_iteration": 2.46893048286438 + }, + { + "auxiliary_loss_clip": 0.01124612, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.02898121, + "balance_loss_mlp": 1.04242706, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.638882451456986, + "language_loss": 0.62893367, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65061837, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5645, + "time_per_iteration": 2.4539859294891357 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02033067, + "balance_loss_mlp": 1.04122853, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.1237754414429637, + "language_loss": 0.76276195, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78429914, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5646, + "time_per_iteration": 2.4913554191589355 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.04360342, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 1.9547585113359744, + "language_loss": 0.79175937, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81348741, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.85546875, + "step": 5647, + "time_per_iteration": 2.521603584289551 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.02541864, + "balance_loss_mlp": 1.04706085, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.87789373580567, + "language_loss": 0.77358377, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79527068, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 5648, + "time_per_iteration": 2.4812231063842773 + }, + { + "auxiliary_loss_clip": 0.0104448, + "auxiliary_loss_mlp": 0.01001624, + "balance_loss_clip": 0.99964541, + "balance_loss_mlp": 1.01817107, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7825270224300925, + "language_loss": 0.56261832, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5830794, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.26171875, + "step": 5649, + "time_per_iteration": 3.1050350666046143 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.02804756, + "balance_loss_mlp": 1.0422622, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.5021179324123226, + "language_loss": 0.85269898, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87436557, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5650, + "time_per_iteration": 2.5013816356658936 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.01540327, + "balance_loss_mlp": 1.04317355, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.6954461839420942, + "language_loss": 0.70868433, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73020875, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5651, + "time_per_iteration": 2.579455852508545 + }, + { + "auxiliary_loss_clip": 0.01123721, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.02354813, + "balance_loss_mlp": 1.04347372, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.7042541017727943, + "language_loss": 0.81267643, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83428693, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5652, + "time_per_iteration": 2.4690871238708496 + }, + { + "auxiliary_loss_clip": 0.01128696, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02670693, + "balance_loss_mlp": 1.04464078, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.8642865553854127, + "language_loss": 0.77315342, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79485226, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5653, + "time_per_iteration": 2.4836156368255615 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.02225959, + "balance_loss_mlp": 1.04310441, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 4.3033812467068895, + "language_loss": 0.85072839, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87232912, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5654, + "time_per_iteration": 2.4139702320098877 + }, + { + "auxiliary_loss_clip": 0.01122836, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.02853489, + "balance_loss_mlp": 1.04074049, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.132089356193866, + "language_loss": 0.65128249, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67293918, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5655, + "time_per_iteration": 2.475292444229126 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.02781832, + "balance_loss_mlp": 1.04365194, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4436453355930483, + "language_loss": 0.76766688, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78933358, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5656, + "time_per_iteration": 2.550999879837036 + }, + { + "auxiliary_loss_clip": 0.01130894, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.04413342, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.5863892165941962, + "language_loss": 0.82438695, + "learning_rate": 3.073152647447525e-06, + "loss": 0.84608912, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5657, + "time_per_iteration": 2.4573473930358887 + }, + { + "auxiliary_loss_clip": 0.01122831, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.02675629, + "balance_loss_mlp": 1.04342616, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.6511746791476316, + "language_loss": 0.85153604, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87317222, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 5658, + "time_per_iteration": 2.505319833755493 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.0001955, + "balance_loss_mlp": 1.01611352, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8147477326465351, + "language_loss": 0.60012162, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62056863, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.26171875, + "step": 5659, + "time_per_iteration": 3.024125814437866 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.02190423, + "balance_loss_mlp": 1.04398155, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.936270792227836, + "language_loss": 0.67855251, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70013559, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 5660, + "time_per_iteration": 2.5009706020355225 + }, + { + "auxiliary_loss_clip": 0.01125634, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.0307138, + "balance_loss_mlp": 1.04558277, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.6106101267942714, + "language_loss": 0.67213613, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69384885, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80078125, + "step": 5661, + "time_per_iteration": 2.501034736633301 + }, + { + "auxiliary_loss_clip": 0.01123137, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.0241766, + "balance_loss_mlp": 1.04442382, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.9145784194305409, + "language_loss": 0.78845918, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81006938, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5662, + "time_per_iteration": 2.4689018726348877 + }, + { + "auxiliary_loss_clip": 0.01123734, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.02395773, + "balance_loss_mlp": 1.04277706, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.9415115692891318, + "language_loss": 0.73675144, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75838, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5663, + "time_per_iteration": 2.4802587032318115 + }, + { + "auxiliary_loss_clip": 0.01121343, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.02101541, + "balance_loss_mlp": 1.04342198, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 2.0753473798431608, + "language_loss": 0.85900557, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88056058, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.77734375, + "step": 5664, + "time_per_iteration": 2.46343731880188 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.0459125, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.782528704092853, + "language_loss": 0.69047546, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71208799, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.81640625, + "step": 5665, + "time_per_iteration": 2.4448721408843994 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.02006817, + "balance_loss_mlp": 1.04218054, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.296518315240935, + "language_loss": 0.72806692, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.74966413, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8203125, + "step": 5666, + "time_per_iteration": 2.4749717712402344 + }, + { + "auxiliary_loss_clip": 0.01126484, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.02236485, + "balance_loss_mlp": 1.04428983, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5083890198292058, + "language_loss": 0.73306108, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75469005, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5667, + "time_per_iteration": 2.467684030532837 + }, + { + "auxiliary_loss_clip": 0.0104148, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01518095, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8424548288565059, + "language_loss": 0.6331358, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65357018, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.26367188, + "step": 5668, + "time_per_iteration": 3.233991861343384 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.02460372, + "balance_loss_mlp": 1.04407477, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.1457172939364892, + "language_loss": 0.72030753, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74194676, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 5669, + "time_per_iteration": 2.4226186275482178 + }, + { + "auxiliary_loss_clip": 0.01127607, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02078128, + "balance_loss_mlp": 1.04468203, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9050671295461388, + "language_loss": 0.80285168, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82448041, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5670, + "time_per_iteration": 2.4354984760284424 + }, + { + "auxiliary_loss_clip": 0.01127772, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02122176, + "balance_loss_mlp": 1.04374027, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.5994061750955757, + "language_loss": 0.76886785, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79050225, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5671, + "time_per_iteration": 2.4775397777557373 + }, + { + "auxiliary_loss_clip": 0.01125342, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.02513266, + "balance_loss_mlp": 1.04437792, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.9602332848552635, + "language_loss": 0.74416959, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7658239, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5672, + "time_per_iteration": 2.5027272701263428 + }, + { + "auxiliary_loss_clip": 0.01127201, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.04523087, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.991076139860355, + "language_loss": 0.73781157, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75940639, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.8203125, + "step": 5673, + "time_per_iteration": 2.424955368041992 + }, + { + "auxiliary_loss_clip": 0.01123926, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.02243853, + "balance_loss_mlp": 1.04432535, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.774655206888726, + "language_loss": 0.79900169, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8206054, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5674, + "time_per_iteration": 2.490407705307007 + }, + { + "auxiliary_loss_clip": 0.01041345, + "auxiliary_loss_mlp": 0.01001058, + "balance_loss_clip": 0.99942493, + "balance_loss_mlp": 1.01517344, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7963469989165133, + "language_loss": 0.56096685, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58139086, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 5675, + "time_per_iteration": 3.223119020462036 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.0270282, + "balance_loss_mlp": 1.04428756, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6179892480447855, + "language_loss": 0.79029286, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81193566, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5676, + "time_per_iteration": 2.4798848628997803 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01836538, + "balance_loss_mlp": 1.0424788, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8072554320592242, + "language_loss": 0.85598934, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87755597, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5677, + "time_per_iteration": 2.4501733779907227 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.009404852791833, + "language_loss": 0.79283166, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81447315, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5678, + "time_per_iteration": 4.054651260375977 + }, + { + "auxiliary_loss_clip": 0.01123013, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.01983547, + "balance_loss_mlp": 1.04135132, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.8818653655236122, + "language_loss": 0.74546856, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76703185, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.81640625, + "step": 5679, + "time_per_iteration": 3.9024462699890137 + }, + { + "auxiliary_loss_clip": 0.01042201, + "auxiliary_loss_mlp": 0.01001255, + "balance_loss_clip": 0.99970549, + "balance_loss_mlp": 1.01624846, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7519133883291979, + "language_loss": 0.59481025, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61524487, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.25976562, + "step": 5680, + "time_per_iteration": 3.152480125427246 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.01953864, + "balance_loss_mlp": 1.04320455, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 2.208026502208574, + "language_loss": 0.7233687, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74491525, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5681, + "time_per_iteration": 2.4450337886810303 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.02798879, + "balance_loss_mlp": 1.04110432, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.0075854608407058, + "language_loss": 0.7144351, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7360431, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5682, + "time_per_iteration": 2.53000807762146 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.02446055, + "balance_loss_mlp": 1.04079127, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.4570201559150766, + "language_loss": 0.8396616, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86125666, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5683, + "time_per_iteration": 2.511646270751953 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.03189898, + "balance_loss_mlp": 1.04384482, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.5567263249521965, + "language_loss": 0.70622635, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72796011, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.82421875, + "step": 5684, + "time_per_iteration": 2.58811616897583 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02119195, + "balance_loss_mlp": 1.0428822, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.480860615854928, + "language_loss": 0.75386423, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77541268, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.78125, + "step": 5685, + "time_per_iteration": 2.485405445098877 + }, + { + "auxiliary_loss_clip": 0.01120925, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.02399325, + "balance_loss_mlp": 1.04268134, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.6707381387615057, + "language_loss": 0.70186603, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72344351, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.78125, + "step": 5686, + "time_per_iteration": 2.5536224842071533 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.02681327, + "balance_loss_mlp": 1.04087019, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.6880234800017844, + "language_loss": 0.77629769, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79793721, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5687, + "time_per_iteration": 2.4526383876800537 + }, + { + "auxiliary_loss_clip": 0.01122013, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.01947296, + "balance_loss_mlp": 1.04425466, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.7522626505921908, + "language_loss": 0.86505169, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88661563, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 5688, + "time_per_iteration": 2.457873821258545 + }, + { + "auxiliary_loss_clip": 0.01129554, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.02274323, + "balance_loss_mlp": 1.04438853, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6277101200549902, + "language_loss": 0.79875666, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82043588, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5689, + "time_per_iteration": 2.4494895935058594 + }, + { + "auxiliary_loss_clip": 0.01124588, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.04300821, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.0745412821804057, + "language_loss": 0.7351048, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75673485, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5690, + "time_per_iteration": 2.448133945465088 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02270865, + "balance_loss_mlp": 1.03998768, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.433761635396741, + "language_loss": 0.7631194, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78468573, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8046875, + "step": 5691, + "time_per_iteration": 2.479569435119629 + }, + { + "auxiliary_loss_clip": 0.01120907, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.02782106, + "balance_loss_mlp": 1.0415988, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5387604656502187, + "language_loss": 0.68159282, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70321631, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 5692, + "time_per_iteration": 2.490466356277466 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01042741, + "balance_loss_clip": 1.02682638, + "balance_loss_mlp": 1.04275179, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.6924087388900606, + "language_loss": 0.72292894, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74460298, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5693, + "time_per_iteration": 2.451026439666748 + }, + { + "auxiliary_loss_clip": 0.01122133, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.02408743, + "balance_loss_mlp": 1.0417974, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7157866574439644, + "language_loss": 0.75877678, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.78037089, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8046875, + "step": 5694, + "time_per_iteration": 2.499997615814209 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.02320051, + "balance_loss_mlp": 1.04253125, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.9697512050835562, + "language_loss": 0.79815507, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81972229, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 5695, + "time_per_iteration": 2.4279983043670654 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.0248661, + "balance_loss_mlp": 1.04168487, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.7314755849975545, + "language_loss": 0.73487073, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75648957, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5696, + "time_per_iteration": 2.507782459259033 + }, + { + "auxiliary_loss_clip": 0.01122963, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.03301835, + "balance_loss_mlp": 1.0419805, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6676891559017708, + "language_loss": 0.70874155, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73044771, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5697, + "time_per_iteration": 2.4868175983428955 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01949954, + "balance_loss_mlp": 1.04456246, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.6712097888676536, + "language_loss": 0.81875223, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84031999, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 5698, + "time_per_iteration": 2.500499725341797 + }, + { + "auxiliary_loss_clip": 0.01121288, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02712393, + "balance_loss_mlp": 1.03982306, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 1.9988541020523172, + "language_loss": 0.69163442, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71328437, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8125, + "step": 5699, + "time_per_iteration": 2.4522063732147217 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.0424068, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.0139701241951196, + "language_loss": 0.72246462, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74404591, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5700, + "time_per_iteration": 2.4942879676818848 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02018046, + "balance_loss_mlp": 1.04403377, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.026861038115517, + "language_loss": 0.81818259, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83976114, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5701, + "time_per_iteration": 2.4650135040283203 + }, + { + "auxiliary_loss_clip": 0.01124816, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.01886129, + "balance_loss_mlp": 1.04328442, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.616013756330385, + "language_loss": 0.71818215, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73975766, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5702, + "time_per_iteration": 2.446018695831299 + }, + { + "auxiliary_loss_clip": 0.01038258, + "auxiliary_loss_mlp": 0.01007974, + "balance_loss_clip": 1.00623345, + "balance_loss_mlp": 1.01261425, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.78067456401119, + "language_loss": 0.57387871, + "learning_rate": 3.057991990435309e-06, + "loss": 0.5943411, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.2578125, + "step": 5703, + "time_per_iteration": 2.9596943855285645 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.04436553, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.8868866692845514, + "language_loss": 0.74849427, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77017069, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5704, + "time_per_iteration": 2.475206136703491 + }, + { + "auxiliary_loss_clip": 0.01122188, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.0218513, + "balance_loss_mlp": 1.0432725, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.4058395538044572, + "language_loss": 0.73303944, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75461364, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5705, + "time_per_iteration": 2.435140609741211 + }, + { + "auxiliary_loss_clip": 0.0112299, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.0204711, + "balance_loss_mlp": 1.04320812, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 3.54760070735666, + "language_loss": 0.79599071, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81757367, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5706, + "time_per_iteration": 2.4922068119049072 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.04497337, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 1.9921713202453553, + "language_loss": 0.83170593, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85330999, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5707, + "time_per_iteration": 2.441812753677368 + }, + { + "auxiliary_loss_clip": 0.01126551, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.01919019, + "balance_loss_mlp": 1.04623604, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5424527465289883, + "language_loss": 0.75429368, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77589571, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5708, + "time_per_iteration": 2.448415756225586 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02046943, + "balance_loss_mlp": 1.04284358, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6552343197625845, + "language_loss": 0.81159383, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83314145, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 5709, + "time_per_iteration": 2.488879919052124 + }, + { + "auxiliary_loss_clip": 0.01125291, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.0235213, + "balance_loss_mlp": 1.04413152, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.1306910299424677, + "language_loss": 0.79152101, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81316978, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5710, + "time_per_iteration": 2.487224817276001 + }, + { + "auxiliary_loss_clip": 0.01124884, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0209142, + "balance_loss_mlp": 1.04181814, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.821164645381994, + "language_loss": 0.69994622, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72155762, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5711, + "time_per_iteration": 2.471989631652832 + }, + { + "auxiliary_loss_clip": 0.01123068, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02455926, + "balance_loss_mlp": 1.04235482, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.7360043656013842, + "language_loss": 0.68002397, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70164913, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 5712, + "time_per_iteration": 2.440960168838501 + }, + { + "auxiliary_loss_clip": 0.01036814, + "auxiliary_loss_mlp": 0.01007067, + "balance_loss_clip": 1.00550556, + "balance_loss_mlp": 1.011006, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8415582534154722, + "language_loss": 0.58101094, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60144973, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.2578125, + "step": 5713, + "time_per_iteration": 3.018573045730591 + }, + { + "auxiliary_loss_clip": 0.01122962, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.04283524, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6636797952259372, + "language_loss": 0.80745685, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82911092, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5714, + "time_per_iteration": 2.4916322231292725 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02519822, + "balance_loss_mlp": 1.04508591, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.759201097406795, + "language_loss": 0.71844554, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7401129, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5715, + "time_per_iteration": 2.468292474746704 + }, + { + "auxiliary_loss_clip": 0.01036063, + "auxiliary_loss_mlp": 0.01006756, + "balance_loss_clip": 1.00499201, + "balance_loss_mlp": 1.01020741, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8941035310387452, + "language_loss": 0.65942305, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67985129, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 5716, + "time_per_iteration": 3.101933717727661 + }, + { + "auxiliary_loss_clip": 0.0112152, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.02705014, + "balance_loss_mlp": 1.04254961, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.0405702698755657, + "language_loss": 0.74612904, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76775646, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5717, + "time_per_iteration": 2.426793098449707 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01042851, + "balance_loss_clip": 1.02894473, + "balance_loss_mlp": 1.0413748, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.6999619338826393, + "language_loss": 0.7507081, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77236706, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5718, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02563679, + "balance_loss_mlp": 1.04245746, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9991347741656986, + "language_loss": 0.63971305, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66137218, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5719, + "time_per_iteration": 2.5236892700195312 + }, + { + "auxiliary_loss_clip": 0.01124826, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.0418756, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.111950804429908, + "language_loss": 0.73612356, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75775748, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 5720, + "time_per_iteration": 5.3536376953125 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02520978, + "balance_loss_mlp": 1.04300022, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.805745396214866, + "language_loss": 0.74198145, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76362252, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5721, + "time_per_iteration": 2.4301607608795166 + }, + { + "auxiliary_loss_clip": 0.01126876, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.0286088, + "balance_loss_mlp": 1.04481733, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 3.5063882769532313, + "language_loss": 0.80132651, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82303661, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5722, + "time_per_iteration": 2.411731243133545 + }, + { + "auxiliary_loss_clip": 0.01122709, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01809728, + "balance_loss_mlp": 1.04312289, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5863267197766868, + "language_loss": 0.8194539, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84100199, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5723, + "time_per_iteration": 2.476672410964966 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.02559686, + "balance_loss_mlp": 1.0428493, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.852885568649272, + "language_loss": 0.8147676, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83640903, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5724, + "time_per_iteration": 2.4115889072418213 + }, + { + "auxiliary_loss_clip": 0.01125316, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.0210526, + "balance_loss_mlp": 1.04397368, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.759268883551978, + "language_loss": 0.6919744, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71358848, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5725, + "time_per_iteration": 2.589571714401245 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.02578139, + "balance_loss_mlp": 1.04464412, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4578739764018875, + "language_loss": 0.69519544, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71692783, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5726, + "time_per_iteration": 2.4600956439971924 + }, + { + "auxiliary_loss_clip": 0.01123936, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.02222002, + "balance_loss_mlp": 1.0427928, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.656148044371735, + "language_loss": 0.73426235, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.7558654, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5727, + "time_per_iteration": 2.5102531909942627 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02636731, + "balance_loss_mlp": 1.04398954, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.8280399137078096, + "language_loss": 0.87897557, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90064341, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5728, + "time_per_iteration": 2.4304542541503906 + }, + { + "auxiliary_loss_clip": 0.01122947, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.03106284, + "balance_loss_mlp": 1.04264569, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.0505664478102426, + "language_loss": 0.70451075, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72619152, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5729, + "time_per_iteration": 2.4979374408721924 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01952362, + "balance_loss_mlp": 1.0427525, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.7284434335955414, + "language_loss": 0.73995942, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7615242, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5730, + "time_per_iteration": 2.4471776485443115 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.02693152, + "balance_loss_mlp": 1.04263377, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.104777326243209, + "language_loss": 0.80005515, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82170659, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5731, + "time_per_iteration": 2.454735279083252 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.02953923, + "balance_loss_mlp": 1.04394484, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.039149215632527, + "language_loss": 0.78837991, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81006193, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 5732, + "time_per_iteration": 2.4177064895629883 + }, + { + "auxiliary_loss_clip": 0.01043649, + "auxiliary_loss_mlp": 0.01003776, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.01788378, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7440231134556253, + "language_loss": 0.53498071, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55545497, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.2578125, + "step": 5733, + "time_per_iteration": 3.0976667404174805 + }, + { + "auxiliary_loss_clip": 0.0112691, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.02806389, + "balance_loss_mlp": 1.04630947, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.6025085195413686, + "language_loss": 0.83345532, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85515279, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5734, + "time_per_iteration": 2.462327718734741 + }, + { + "auxiliary_loss_clip": 0.01125766, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.04382658, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.7233898634254525, + "language_loss": 0.9245038, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94610149, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5735, + "time_per_iteration": 2.600933790206909 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.02607846, + "balance_loss_mlp": 1.04662871, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.628548106881684, + "language_loss": 0.76666284, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78837371, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5736, + "time_per_iteration": 2.4607973098754883 + }, + { + "auxiliary_loss_clip": 0.0113014, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.02948046, + "balance_loss_mlp": 1.04773998, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.59823002014571, + "language_loss": 0.78745639, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80919576, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5737, + "time_per_iteration": 2.5059142112731934 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.04445243, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.0456946138928767, + "language_loss": 0.71714234, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73884267, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5738, + "time_per_iteration": 2.4374310970306396 + }, + { + "auxiliary_loss_clip": 0.01129235, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.02727044, + "balance_loss_mlp": 1.04496205, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.8999072115309161, + "language_loss": 0.81518626, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83690214, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5739, + "time_per_iteration": 2.559990406036377 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.04620492, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.6856273454827275, + "language_loss": 0.8322401, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85397214, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5740, + "time_per_iteration": 2.4684722423553467 + }, + { + "auxiliary_loss_clip": 0.01127563, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04611385, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.03424253553345, + "language_loss": 0.77135098, + "learning_rate": 3.045403886269181e-06, + "loss": 0.7930122, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8125, + "step": 5741, + "time_per_iteration": 2.48624587059021 + }, + { + "auxiliary_loss_clip": 0.01125981, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.02226019, + "balance_loss_mlp": 1.04276562, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.4993687582247586, + "language_loss": 0.77224493, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79387349, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 5742, + "time_per_iteration": 2.5046300888061523 + }, + { + "auxiliary_loss_clip": 0.01123657, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04310095, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.823337430242114, + "language_loss": 0.76346177, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78509557, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5743, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.01124183, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.02278566, + "balance_loss_mlp": 1.04435802, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.5691807126711539, + "language_loss": 0.70255435, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72416371, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5744, + "time_per_iteration": 2.497314929962158 + }, + { + "auxiliary_loss_clip": 0.01121947, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.02205133, + "balance_loss_mlp": 1.04318309, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.629619176768893, + "language_loss": 0.79692256, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81850678, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 5745, + "time_per_iteration": 2.5154099464416504 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.04556072, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.7858540966841563, + "language_loss": 0.88775939, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9094578, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5746, + "time_per_iteration": 2.436028003692627 + }, + { + "auxiliary_loss_clip": 0.01129654, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.02438855, + "balance_loss_mlp": 1.04509354, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8755596522528313, + "language_loss": 0.64010286, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66179693, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 5747, + "time_per_iteration": 2.465817451477051 + }, + { + "auxiliary_loss_clip": 0.0112633, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02193761, + "balance_loss_mlp": 1.04486203, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5413680181151455, + "language_loss": 0.72813559, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74975884, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5748, + "time_per_iteration": 2.566849946975708 + }, + { + "auxiliary_loss_clip": 0.01123147, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.020715, + "balance_loss_mlp": 1.04517043, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6451707518978071, + "language_loss": 0.75697249, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77854693, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.77734375, + "step": 5749, + "time_per_iteration": 2.5068271160125732 + }, + { + "auxiliary_loss_clip": 0.01036655, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 1.00293088, + "balance_loss_mlp": 1.01066136, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8931526891439046, + "language_loss": 0.62754983, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64796478, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.25976562, + "step": 5750, + "time_per_iteration": 2.930236577987671 + }, + { + "auxiliary_loss_clip": 0.01119501, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.01933062, + "balance_loss_mlp": 1.04268134, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.1199041216122314, + "language_loss": 0.80762947, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82915652, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5751, + "time_per_iteration": 2.4710936546325684 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.03268027, + "balance_loss_mlp": 1.04408574, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 3.882107217624466, + "language_loss": 0.83630323, + "learning_rate": 3.041749247409439e-06, + "loss": 0.85798407, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 5752, + "time_per_iteration": 2.421095132827759 + }, + { + "auxiliary_loss_clip": 0.01036836, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_clip": 1.00014234, + "balance_loss_mlp": 1.01131189, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7425573992046552, + "language_loss": 0.63106978, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.6514585, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.25585938, + "step": 5753, + "time_per_iteration": 2.960430383682251 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.0252701, + "balance_loss_mlp": 1.0433172, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7337780765213762, + "language_loss": 0.70964289, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73127007, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5754, + "time_per_iteration": 2.473090171813965 + }, + { + "auxiliary_loss_clip": 0.01126645, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01889825, + "balance_loss_mlp": 1.04436386, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 3.1958037374869357, + "language_loss": 0.72880316, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75040269, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5755, + "time_per_iteration": 2.486187219619751 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.01985335, + "balance_loss_mlp": 1.04448533, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6620890991055186, + "language_loss": 0.72366977, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74523616, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5756, + "time_per_iteration": 2.6883044242858887 + }, + { + "auxiliary_loss_clip": 0.01036738, + "auxiliary_loss_mlp": 0.01004698, + "balance_loss_clip": 1.00295758, + "balance_loss_mlp": 1.01152658, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7127234008063932, + "language_loss": 0.62522227, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64563662, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25195312, + "step": 5757, + "time_per_iteration": 3.0644619464874268 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01006374, + "balance_loss_clip": 1.00465703, + "balance_loss_mlp": 1.01123941, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8513650993905141, + "language_loss": 0.59153563, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61196613, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.25390625, + "step": 5758, + "time_per_iteration": 3.0601916313171387 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02576697, + "balance_loss_mlp": 1.04562724, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8469236817688628, + "language_loss": 0.71498728, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73664641, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5759, + "time_per_iteration": 2.4722588062286377 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.03079295, + "balance_loss_mlp": 1.04248834, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.8727439754442439, + "language_loss": 0.83008277, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85175675, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 5760, + "time_per_iteration": 2.5002012252807617 + }, + { + "auxiliary_loss_clip": 0.01035648, + "auxiliary_loss_mlp": 0.01005512, + "balance_loss_clip": 1.00358045, + "balance_loss_mlp": 1.01033783, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8745886359800412, + "language_loss": 0.5653646, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58577621, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.25390625, + "step": 5761, + "time_per_iteration": 3.0950896739959717 + }, + { + "auxiliary_loss_clip": 0.01120096, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.04127657, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.0018538772922883, + "language_loss": 0.95053494, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97212291, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 5762, + "time_per_iteration": 5.290884256362915 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.01935804, + "balance_loss_mlp": 1.0417943, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 2.194288284173203, + "language_loss": 0.69335818, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71493888, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5763, + "time_per_iteration": 2.5411787033081055 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.02862906, + "balance_loss_mlp": 1.0458554, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.0099592928074497, + "language_loss": 0.83589876, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85765183, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5764, + "time_per_iteration": 2.48040771484375 + }, + { + "auxiliary_loss_clip": 0.01123556, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04343057, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.159805793212971, + "language_loss": 0.67403859, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69560707, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5765, + "time_per_iteration": 2.502297878265381 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.02291703, + "balance_loss_mlp": 1.04937232, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 2.083918060213648, + "language_loss": 0.77861524, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80028939, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5766, + "time_per_iteration": 2.465325355529785 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.0215292, + "balance_loss_mlp": 1.04335451, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5377908130541305, + "language_loss": 0.73529994, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75687665, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5767, + "time_per_iteration": 2.4656143188476562 + }, + { + "auxiliary_loss_clip": 0.01127128, + "auxiliary_loss_mlp": 0.01040956, + "balance_loss_clip": 1.02538764, + "balance_loss_mlp": 1.04720497, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.233359981487989, + "language_loss": 0.77795279, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79963356, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.796875, + "step": 5768, + "time_per_iteration": 2.4951131343841553 + }, + { + "auxiliary_loss_clip": 0.0103542, + "auxiliary_loss_mlp": 0.01008769, + "balance_loss_clip": 1.00693345, + "balance_loss_mlp": 1.01015306, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7739728920865777, + "language_loss": 0.57404095, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59448284, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.25195312, + "step": 5769, + "time_per_iteration": 3.0867085456848145 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_clip": 1.02577174, + "balance_loss_mlp": 1.04723847, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.3808887206764244, + "language_loss": 0.85625517, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87804437, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 5770, + "time_per_iteration": 2.4296391010284424 + }, + { + "auxiliary_loss_clip": 0.0103532, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00152194, + "balance_loss_mlp": 1.01001954, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7779481231658855, + "language_loss": 0.59827816, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61866474, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 5771, + "time_per_iteration": 2.858952522277832 + }, + { + "auxiliary_loss_clip": 0.0112466, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.04478061, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 2.6949016474557475, + "language_loss": 0.71790159, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73961502, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5772, + "time_per_iteration": 2.629441976547241 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04398608, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4939658014033708, + "language_loss": 0.76165307, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78332114, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5773, + "time_per_iteration": 2.5281848907470703 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_clip": 1.02811444, + "balance_loss_mlp": 1.04447389, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.0748415381607717, + "language_loss": 0.70428938, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72599673, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5774, + "time_per_iteration": 2.4930198192596436 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.02630043, + "balance_loss_mlp": 1.04615033, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.6801460468757594, + "language_loss": 0.76410925, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78576738, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5775, + "time_per_iteration": 2.501793622970581 + }, + { + "auxiliary_loss_clip": 0.01129926, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.02778447, + "balance_loss_mlp": 1.04408336, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.2786937073337956, + "language_loss": 0.78098702, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.8027252, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5776, + "time_per_iteration": 2.547508716583252 + }, + { + "auxiliary_loss_clip": 0.01034004, + "auxiliary_loss_mlp": 0.01012403, + "balance_loss_clip": 1.01073408, + "balance_loss_mlp": 1.00864577, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8366551978688649, + "language_loss": 0.63353252, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65399659, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.25390625, + "step": 5777, + "time_per_iteration": 3.118314743041992 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.03253984, + "balance_loss_mlp": 1.04198289, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.1982821508403956, + "language_loss": 0.64399695, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66572136, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5778, + "time_per_iteration": 2.5438621044158936 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.03947175, + "balance_loss_mlp": 1.04425573, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.7264375706792277, + "language_loss": 0.71190178, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73372632, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5779, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.01128331, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04354596, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.874853063849031, + "language_loss": 0.62552947, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64729369, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5780, + "time_per_iteration": 2.5024712085723877 + }, + { + "auxiliary_loss_clip": 0.01124027, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_clip": 1.0278883, + "balance_loss_mlp": 1.04260445, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.604616792806945, + "language_loss": 0.72373253, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74539268, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5781, + "time_per_iteration": 2.471235513687134 + }, + { + "auxiliary_loss_clip": 0.01125801, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.03416181, + "balance_loss_mlp": 1.04316914, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.0942988164582266, + "language_loss": 0.76741016, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78917271, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.828125, + "step": 5782, + "time_per_iteration": 2.4831414222717285 + }, + { + "auxiliary_loss_clip": 0.01123989, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.02140737, + "balance_loss_mlp": 1.04221606, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9917493867858045, + "language_loss": 0.62131268, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64291537, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5783, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.01119293, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.01833832, + "balance_loss_mlp": 1.0410347, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.6546414102961637, + "language_loss": 0.88575971, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90727258, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 5784, + "time_per_iteration": 2.5281262397766113 + }, + { + "auxiliary_loss_clip": 0.01121731, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.02219379, + "balance_loss_mlp": 1.04283547, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.7834042756277195, + "language_loss": 0.81664282, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83822948, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 5785, + "time_per_iteration": 2.444279432296753 + }, + { + "auxiliary_loss_clip": 0.01126224, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.04558039, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.6236713309130966, + "language_loss": 0.80679643, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82843316, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5786, + "time_per_iteration": 2.506639242172241 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.01860058, + "balance_loss_mlp": 1.0443275, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5789553434659291, + "language_loss": 0.74868137, + "learning_rate": 3.030089132216836e-06, + "loss": 0.77025199, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5787, + "time_per_iteration": 2.4305543899536133 + }, + { + "auxiliary_loss_clip": 0.01122978, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.02276862, + "balance_loss_mlp": 1.04133916, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.685205733624188, + "language_loss": 0.81207466, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83367729, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.81640625, + "step": 5788, + "time_per_iteration": 2.58461332321167 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.02333927, + "balance_loss_mlp": 1.04716599, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.7599288417752579, + "language_loss": 0.85399663, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87569183, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5789, + "time_per_iteration": 2.4460527896881104 + }, + { + "auxiliary_loss_clip": 0.01127788, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.03517616, + "balance_loss_mlp": 1.04420161, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9217222904205502, + "language_loss": 0.84973574, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87151778, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5790, + "time_per_iteration": 2.4690423011779785 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.0295074, + "balance_loss_mlp": 1.04403305, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.0218239222922785, + "language_loss": 0.82098949, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8426879, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5791, + "time_per_iteration": 2.4949092864990234 + }, + { + "auxiliary_loss_clip": 0.01124824, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02386248, + "balance_loss_mlp": 1.04235744, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.7691925727921667, + "language_loss": 0.77531552, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79695195, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5792, + "time_per_iteration": 2.5464468002319336 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.04100966, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5041206153246893, + "language_loss": 0.81592953, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83745086, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5793, + "time_per_iteration": 2.454220771789551 + }, + { + "auxiliary_loss_clip": 0.01126572, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.04426205, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7524057524538565, + "language_loss": 0.76222527, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78395712, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5794, + "time_per_iteration": 2.485077142715454 + }, + { + "auxiliary_loss_clip": 0.01121136, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02165866, + "balance_loss_mlp": 1.04168189, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.2347385462744165, + "language_loss": 0.56926, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59083712, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5795, + "time_per_iteration": 2.4378490447998047 + }, + { + "auxiliary_loss_clip": 0.01121205, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02250659, + "balance_loss_mlp": 1.04285967, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.137832792929428, + "language_loss": 0.82437253, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84595084, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 5796, + "time_per_iteration": 2.5187671184539795 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.0153811, + "balance_loss_mlp": 1.043782, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.7817355656860259, + "language_loss": 0.83580989, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85730731, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5797, + "time_per_iteration": 2.518832206726074 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02017224, + "balance_loss_mlp": 1.04206371, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7199370679887815, + "language_loss": 0.73215538, + "learning_rate": 3.026414616539167e-06, + "loss": 0.7537021, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5798, + "time_per_iteration": 2.499967575073242 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.02660251, + "balance_loss_mlp": 1.04203498, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.0872044860332597, + "language_loss": 0.75936413, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78101552, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5799, + "time_per_iteration": 2.4452474117279053 + }, + { + "auxiliary_loss_clip": 0.01121272, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.0248909, + "balance_loss_mlp": 1.04197407, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7461935027983841, + "language_loss": 0.75557071, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7771703, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.79296875, + "step": 5800, + "time_per_iteration": 2.4526796340942383 + }, + { + "auxiliary_loss_clip": 0.01129939, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.02854276, + "balance_loss_mlp": 1.04578733, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.3150001070935127, + "language_loss": 0.67645729, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69820327, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5801, + "time_per_iteration": 2.644601821899414 + }, + { + "auxiliary_loss_clip": 0.01122812, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04446411, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.9000140831486088, + "language_loss": 0.76785576, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78948951, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78515625, + "step": 5802, + "time_per_iteration": 2.46921968460083 + }, + { + "auxiliary_loss_clip": 0.01118956, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.01905692, + "balance_loss_mlp": 1.04294538, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.750768588632487, + "language_loss": 0.78868455, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81021172, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 5803, + "time_per_iteration": 3.979863405227661 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02266085, + "balance_loss_mlp": 1.0410372, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.9657380954946277, + "language_loss": 0.67745399, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69905275, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8203125, + "step": 5804, + "time_per_iteration": 3.8562989234924316 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_clip": 1.03001559, + "balance_loss_mlp": 1.0454638, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 2.669385195944029, + "language_loss": 0.76021814, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78187871, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 5805, + "time_per_iteration": 2.458235263824463 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.02199244, + "balance_loss_mlp": 1.0451802, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 3.0752866237359884, + "language_loss": 0.67804134, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69965458, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5806, + "time_per_iteration": 2.4840877056121826 + }, + { + "auxiliary_loss_clip": 0.01126527, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.02523851, + "balance_loss_mlp": 1.04571056, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.4876164360326454, + "language_loss": 0.71957624, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74124348, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5807, + "time_per_iteration": 2.542815685272217 + }, + { + "auxiliary_loss_clip": 0.01123687, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02295971, + "balance_loss_mlp": 1.04158592, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.7054576034597768, + "language_loss": 0.74218416, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.7638061, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5808, + "time_per_iteration": 2.503438949584961 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 1.04479396, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.5095416937429198, + "language_loss": 0.84245461, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86416149, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5809, + "time_per_iteration": 2.4860358238220215 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.02354026, + "balance_loss_mlp": 1.04322374, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.8434153763939258, + "language_loss": 0.80251479, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82407832, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 5810, + "time_per_iteration": 2.481653928756714 + }, + { + "auxiliary_loss_clip": 0.01124044, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.04406404, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.967526444092296, + "language_loss": 0.75335366, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77499199, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5811, + "time_per_iteration": 2.534524440765381 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02543986, + "balance_loss_mlp": 1.04616523, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4977831051483896, + "language_loss": 0.80070162, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82238293, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5812, + "time_per_iteration": 2.503074884414673 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02746272, + "balance_loss_mlp": 1.04195547, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.9471141693502576, + "language_loss": 0.6923517, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71401167, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5813, + "time_per_iteration": 2.4503591060638428 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.4036318537481334, + "language_loss": 0.77007949, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79169858, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 5814, + "time_per_iteration": 2.4173405170440674 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.02692485, + "balance_loss_mlp": 1.04406822, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5090517849605465, + "language_loss": 0.84283173, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86451852, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5815, + "time_per_iteration": 2.5173141956329346 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.04368711, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.123091285603595, + "language_loss": 0.77423191, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79580915, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80859375, + "step": 5816, + "time_per_iteration": 2.413438558578491 + }, + { + "auxiliary_loss_clip": 0.01128865, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.02761126, + "balance_loss_mlp": 1.0468061, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.144763996717865, + "language_loss": 0.58441401, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60612863, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8203125, + "step": 5817, + "time_per_iteration": 2.5161447525024414 + }, + { + "auxiliary_loss_clip": 0.01042618, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_clip": 1.00957632, + "balance_loss_mlp": 1.01738954, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8658844915790124, + "language_loss": 0.59855008, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61908889, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25195312, + "step": 5818, + "time_per_iteration": 3.105595111846924 + }, + { + "auxiliary_loss_clip": 0.01123632, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02226782, + "balance_loss_mlp": 1.04561055, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 3.0068929936640103, + "language_loss": 0.83458424, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85618806, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5819, + "time_per_iteration": 2.47537899017334 + }, + { + "auxiliary_loss_clip": 0.01123279, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.01655149, + "balance_loss_mlp": 1.04359841, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 3.6330435008795483, + "language_loss": 0.70765841, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.7291975, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5820, + "time_per_iteration": 2.4817428588867188 + }, + { + "auxiliary_loss_clip": 0.01125706, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04544306, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 2.1579309336976547, + "language_loss": 0.70112801, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7227428, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.80078125, + "step": 5821, + "time_per_iteration": 2.578753709793091 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.02328706, + "balance_loss_mlp": 1.04798198, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.9634934958204076, + "language_loss": 0.73591399, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75762403, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 5822, + "time_per_iteration": 2.469041109085083 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.02150989, + "balance_loss_mlp": 1.0447278, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5203539526389718, + "language_loss": 0.78104019, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80268037, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5823, + "time_per_iteration": 2.4932196140289307 + }, + { + "auxiliary_loss_clip": 0.01038228, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 0.99964237, + "balance_loss_mlp": 1.01332808, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 1.4438996436497689, + "language_loss": 0.59237444, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61277008, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.24902344, + "step": 5824, + "time_per_iteration": 3.109966278076172 + }, + { + "auxiliary_loss_clip": 0.01125511, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.0213685, + "balance_loss_mlp": 1.04462993, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.8425293735622459, + "language_loss": 0.84740114, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86902225, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5825, + "time_per_iteration": 2.4780030250549316 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.02292657, + "balance_loss_mlp": 1.04522121, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.24584207136959, + "language_loss": 0.82778502, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84941804, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5826, + "time_per_iteration": 2.4147045612335205 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.02587962, + "balance_loss_mlp": 1.04480314, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5075773428374344, + "language_loss": 0.80714649, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.8288269, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5827, + "time_per_iteration": 2.4650330543518066 + }, + { + "auxiliary_loss_clip": 0.01123347, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.0220902, + "balance_loss_mlp": 1.04475152, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.7582821019631836, + "language_loss": 0.70936024, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73095214, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 5828, + "time_per_iteration": 2.4710564613342285 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.02666616, + "balance_loss_mlp": 1.04788435, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.628373483521701, + "language_loss": 0.79397106, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81571716, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.828125, + "step": 5829, + "time_per_iteration": 2.5081264972686768 + }, + { + "auxiliary_loss_clip": 0.01129997, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.02900994, + "balance_loss_mlp": 1.04607642, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.7135270810407168, + "language_loss": 0.72111332, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74286962, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 5830, + "time_per_iteration": 2.507263422012329 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.04352021, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.0188022258715996, + "language_loss": 0.88740343, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90896189, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5831, + "time_per_iteration": 2.4769816398620605 + }, + { + "auxiliary_loss_clip": 0.01122435, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.02560508, + "balance_loss_mlp": 1.04128802, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9377344606434141, + "language_loss": 0.78478962, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80642164, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8125, + "step": 5832, + "time_per_iteration": 2.458019971847534 + }, + { + "auxiliary_loss_clip": 0.01125835, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.02744734, + "balance_loss_mlp": 1.04360127, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.8976688118149017, + "language_loss": 0.70859557, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73029065, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 5833, + "time_per_iteration": 2.494739055633545 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.04384482, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3472514068868482, + "language_loss": 0.80878949, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83030844, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5834, + "time_per_iteration": 2.521343231201172 + }, + { + "auxiliary_loss_clip": 0.01124914, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.02823853, + "balance_loss_mlp": 1.04525888, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.219662071096021, + "language_loss": 0.83629, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.8579731, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 5835, + "time_per_iteration": 2.53587007522583 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.02932119, + "balance_loss_mlp": 1.04351568, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.120648036265282, + "language_loss": 0.76607329, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78774178, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 5836, + "time_per_iteration": 2.54390549659729 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02382731, + "balance_loss_mlp": 1.04872775, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.2292749531356986, + "language_loss": 0.77354801, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79521459, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5837, + "time_per_iteration": 2.4478273391723633 + }, + { + "auxiliary_loss_clip": 0.01123898, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02998376, + "balance_loss_mlp": 1.04441822, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.6098451794116821, + "language_loss": 0.68129408, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70298064, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5838, + "time_per_iteration": 2.505833864212036 + }, + { + "auxiliary_loss_clip": 0.01122037, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01945019, + "balance_loss_mlp": 1.04240978, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0937603738721173, + "language_loss": 0.83561182, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85717571, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5839, + "time_per_iteration": 2.4378576278686523 + }, + { + "auxiliary_loss_clip": 0.01126069, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.02775335, + "balance_loss_mlp": 1.04351032, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.6277808139419232, + "language_loss": 0.58590645, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60759622, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.828125, + "step": 5840, + "time_per_iteration": 2.4883387088775635 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02015984, + "balance_loss_mlp": 1.04445219, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.7790843018814058, + "language_loss": 0.87061596, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89222413, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5841, + "time_per_iteration": 2.5035836696624756 + }, + { + "auxiliary_loss_clip": 0.01128185, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02551615, + "balance_loss_mlp": 1.0455035, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.6842451001577108, + "language_loss": 0.74924648, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77094764, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.828125, + "step": 5842, + "time_per_iteration": 2.4677891731262207 + }, + { + "auxiliary_loss_clip": 0.01125535, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02552199, + "balance_loss_mlp": 1.04403496, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 3.45436030057014, + "language_loss": 0.68184745, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70351034, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5843, + "time_per_iteration": 2.4356935024261475 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_clip": 1.02734041, + "balance_loss_mlp": 1.04418659, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 3.71115813366519, + "language_loss": 0.65957326, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68123138, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5844, + "time_per_iteration": 2.4961743354797363 + }, + { + "auxiliary_loss_clip": 0.01124887, + "auxiliary_loss_mlp": 0.01040447, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.04466677, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.036064641334285, + "language_loss": 0.75629944, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77795279, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5845, + "time_per_iteration": 5.325402498245239 + }, + { + "auxiliary_loss_clip": 0.01123091, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.04537153, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.494167784966283, + "language_loss": 0.73075795, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75238299, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 5846, + "time_per_iteration": 2.4515323638916016 + }, + { + "auxiliary_loss_clip": 0.01127959, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.02587426, + "balance_loss_mlp": 1.04755926, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6229430725765215, + "language_loss": 0.75876832, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78045619, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5847, + "time_per_iteration": 2.4869656562805176 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02067161, + "balance_loss_mlp": 1.04212832, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.14189752244475, + "language_loss": 0.72070903, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74227905, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5848, + "time_per_iteration": 2.5580503940582275 + }, + { + "auxiliary_loss_clip": 0.01127957, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.02953017, + "balance_loss_mlp": 1.04648554, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.252970750126207, + "language_loss": 0.89321303, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91493851, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5849, + "time_per_iteration": 2.4167070388793945 + }, + { + "auxiliary_loss_clip": 0.01123705, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.04373825, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.8040734708025026, + "language_loss": 0.74810916, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76967371, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5850, + "time_per_iteration": 2.457970142364502 + }, + { + "auxiliary_loss_clip": 0.0112382, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.04618788, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.5003899492593988, + "language_loss": 0.7563765, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77794087, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 5851, + "time_per_iteration": 2.48270845413208 + }, + { + "auxiliary_loss_clip": 0.01126446, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.0219928, + "balance_loss_mlp": 1.04683256, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.074837490144385, + "language_loss": 0.87552518, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89715755, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5852, + "time_per_iteration": 2.4690029621124268 + }, + { + "auxiliary_loss_clip": 0.01122074, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04361391, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.0973347969099048, + "language_loss": 0.67880064, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70038116, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5853, + "time_per_iteration": 2.4953458309173584 + }, + { + "auxiliary_loss_clip": 0.01125495, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.0195092, + "balance_loss_mlp": 1.04545975, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.6680659623481517, + "language_loss": 0.8122859, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83388329, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5854, + "time_per_iteration": 2.4702916145324707 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.04566765, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.6003148952985655, + "language_loss": 0.73131359, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75284624, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 5855, + "time_per_iteration": 2.4895823001861572 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.02549887, + "balance_loss_mlp": 1.04334307, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 3.701560840262617, + "language_loss": 0.70894778, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73054588, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5856, + "time_per_iteration": 2.5133585929870605 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.02881038, + "balance_loss_mlp": 1.0456897, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.8086114170356375, + "language_loss": 0.60915685, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63086259, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80859375, + "step": 5857, + "time_per_iteration": 2.723238468170166 + }, + { + "auxiliary_loss_clip": 0.01123346, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.02434421, + "balance_loss_mlp": 1.04425693, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.754440516271971, + "language_loss": 0.73341751, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75504428, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5858, + "time_per_iteration": 2.509556293487549 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.02720821, + "balance_loss_mlp": 1.04428148, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.4710047028379252, + "language_loss": 0.76090813, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7825768, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5859, + "time_per_iteration": 2.584312677383423 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.0230875, + "balance_loss_mlp": 1.04828274, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.6944630123418771, + "language_loss": 0.71475387, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73646474, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5860, + "time_per_iteration": 2.5120623111724854 + }, + { + "auxiliary_loss_clip": 0.01125655, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.02506578, + "balance_loss_mlp": 1.04208136, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.10777684168558, + "language_loss": 0.6624974, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68416381, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5861, + "time_per_iteration": 2.4927096366882324 + }, + { + "auxiliary_loss_clip": 0.01123555, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.04497313, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.1064993181157843, + "language_loss": 0.66780227, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68938088, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5862, + "time_per_iteration": 2.4275379180908203 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.02132034, + "balance_loss_mlp": 1.04420304, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.0193315360348842, + "language_loss": 0.77049166, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79211187, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5863, + "time_per_iteration": 2.504391670227051 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02027822, + "balance_loss_mlp": 1.04449666, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.7341123556359297, + "language_loss": 0.75018549, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77178371, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5864, + "time_per_iteration": 2.4962751865386963 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.03110838, + "balance_loss_mlp": 1.04376507, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 1.9972182581193567, + "language_loss": 0.79051632, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81220651, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5865, + "time_per_iteration": 2.5369789600372314 + }, + { + "auxiliary_loss_clip": 0.01130515, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.02901387, + "balance_loss_mlp": 1.04835618, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 1.8375125007543296, + "language_loss": 0.81622374, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.8379811, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 5866, + "time_per_iteration": 2.497587203979492 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.02081871, + "balance_loss_mlp": 1.04493296, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.1796505180833696, + "language_loss": 0.84552217, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.867208, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.86328125, + "step": 5867, + "time_per_iteration": 2.5673649311065674 + }, + { + "auxiliary_loss_clip": 0.01126594, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02764452, + "balance_loss_mlp": 1.04441357, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.2018810166756873, + "language_loss": 0.74618357, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76788092, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5868, + "time_per_iteration": 2.4571762084960938 + }, + { + "auxiliary_loss_clip": 0.01127392, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 1.04489541, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.0366485396940615, + "language_loss": 0.61648643, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63815421, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5869, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.04286385, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.290977208251557, + "language_loss": 0.74328029, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76495212, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5870, + "time_per_iteration": 2.4636306762695312 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04412317, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.44010977521146, + "language_loss": 0.71498513, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73659372, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5871, + "time_per_iteration": 2.629002094268799 + }, + { + "auxiliary_loss_clip": 0.01120822, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01999545, + "balance_loss_mlp": 1.04340768, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.215441176085892, + "language_loss": 0.74219513, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76374042, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5872, + "time_per_iteration": 2.4672691822052 + }, + { + "auxiliary_loss_clip": 0.01121667, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02291727, + "balance_loss_mlp": 1.04295182, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.6120105579455812, + "language_loss": 0.82492435, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84651101, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5873, + "time_per_iteration": 2.549706220626831 + }, + { + "auxiliary_loss_clip": 0.01128213, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.02800715, + "balance_loss_mlp": 1.04399252, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.8495868157058504, + "language_loss": 0.6583339, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68006265, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 5874, + "time_per_iteration": 2.4949634075164795 + }, + { + "auxiliary_loss_clip": 0.01044147, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00510025, + "balance_loss_mlp": 1.01915693, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.771003921858337, + "language_loss": 0.61583531, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63634658, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.25, + "step": 5875, + "time_per_iteration": 2.9931485652923584 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.02995443, + "balance_loss_mlp": 1.04544568, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.6836782364007539, + "language_loss": 0.800933, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82261944, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5876, + "time_per_iteration": 2.443178415298462 + }, + { + "auxiliary_loss_clip": 0.01041911, + "auxiliary_loss_mlp": 0.01006634, + "balance_loss_clip": 1.00477409, + "balance_loss_mlp": 1.01663578, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 1.6287450036197537, + "language_loss": 0.5674026, + "learning_rate": 2.999887569990088e-06, + "loss": 0.587888, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.25195312, + "step": 5877, + "time_per_iteration": 3.1782116889953613 + }, + { + "auxiliary_loss_clip": 0.01124291, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.04401922, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.5579095187110108, + "language_loss": 0.71649593, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73804337, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5878, + "time_per_iteration": 2.4984474182128906 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04198527, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9384917614544617, + "language_loss": 0.78492844, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80655217, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5879, + "time_per_iteration": 2.5369913578033447 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.02870536, + "balance_loss_mlp": 1.04373121, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0656781659104917, + "language_loss": 0.63695049, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65867293, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83203125, + "step": 5880, + "time_per_iteration": 2.457787036895752 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02078009, + "balance_loss_mlp": 1.04375386, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 3.125568384757795, + "language_loss": 0.65818816, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67980647, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5881, + "time_per_iteration": 2.5198867321014404 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04197288, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3882047203281038, + "language_loss": 0.75280428, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77431458, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5882, + "time_per_iteration": 2.4526872634887695 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.02266037, + "balance_loss_mlp": 1.04543018, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.123888211837838, + "language_loss": 0.70349854, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72514224, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5883, + "time_per_iteration": 2.538865566253662 + }, + { + "auxiliary_loss_clip": 0.01129519, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0228982, + "balance_loss_mlp": 1.04584253, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.009195754637657, + "language_loss": 0.78500903, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80668598, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5884, + "time_per_iteration": 2.4410510063171387 + }, + { + "auxiliary_loss_clip": 0.0112138, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.01901007, + "balance_loss_mlp": 1.04336667, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.8922441591552446, + "language_loss": 0.75478536, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77632499, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5885, + "time_per_iteration": 2.555816650390625 + }, + { + "auxiliary_loss_clip": 0.01127447, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.04478371, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.2081606315958635, + "language_loss": 0.82679224, + "learning_rate": 2.996850368809606e-06, + "loss": 0.84848893, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.82421875, + "step": 5886, + "time_per_iteration": 2.482151985168457 + }, + { + "auxiliary_loss_clip": 0.01124743, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01717782, + "balance_loss_mlp": 1.04533887, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.4580910750403775, + "language_loss": 0.78723359, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80880398, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 5887, + "time_per_iteration": 5.388309001922607 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.0269978, + "balance_loss_mlp": 1.04226518, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 3.1093010737907867, + "language_loss": 0.65404654, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67568314, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5888, + "time_per_iteration": 2.4438626766204834 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02248812, + "balance_loss_mlp": 1.04373193, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.6702882106954304, + "language_loss": 0.76662588, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.78821993, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5889, + "time_per_iteration": 2.503023624420166 + }, + { + "auxiliary_loss_clip": 0.01125083, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.02329397, + "balance_loss_mlp": 1.0469135, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.7418080185903937, + "language_loss": 0.80142188, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82305038, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5890, + "time_per_iteration": 2.4669902324676514 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02204418, + "balance_loss_mlp": 1.04123974, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.4765808553545194, + "language_loss": 0.79590207, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81743479, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7734375, + "step": 5891, + "time_per_iteration": 2.491048812866211 + }, + { + "auxiliary_loss_clip": 0.01123501, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.02260685, + "balance_loss_mlp": 1.04425383, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.0747162768055616, + "language_loss": 0.73339593, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.7550028, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5892, + "time_per_iteration": 2.497422695159912 + }, + { + "auxiliary_loss_clip": 0.01124613, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.04473233, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 1.9338165898472526, + "language_loss": 0.66916019, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69079423, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5893, + "time_per_iteration": 2.4516420364379883 + }, + { + "auxiliary_loss_clip": 0.01123079, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04405212, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.878049090913109, + "language_loss": 0.69472313, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71633029, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5894, + "time_per_iteration": 2.479174852371216 + }, + { + "auxiliary_loss_clip": 0.01123499, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.04524636, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6954645527360779, + "language_loss": 0.74891931, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77048504, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 5895, + "time_per_iteration": 2.4786908626556396 + }, + { + "auxiliary_loss_clip": 0.01122907, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.04388869, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.0548310630504854, + "language_loss": 0.83688253, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85848963, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5896, + "time_per_iteration": 2.4765214920043945 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.03348279, + "balance_loss_mlp": 1.0444181, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.6634726813042469, + "language_loss": 0.70031154, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7220217, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 5897, + "time_per_iteration": 2.5142548084259033 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.02430916, + "balance_loss_mlp": 1.04337025, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.7331024671064506, + "language_loss": 0.82091749, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84251857, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5898, + "time_per_iteration": 2.4900712966918945 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0234853, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4876974136883365, + "language_loss": 0.73901182, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76058269, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 5899, + "time_per_iteration": 2.498659133911133 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_clip": 1.02825308, + "balance_loss_mlp": 1.04316258, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.69682390123668, + "language_loss": 0.79345262, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81510079, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 5900, + "time_per_iteration": 2.548612594604492 + }, + { + "auxiliary_loss_clip": 0.01123598, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.02556252, + "balance_loss_mlp": 1.04530048, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.7758743329418227, + "language_loss": 0.81637204, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83801091, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 5901, + "time_per_iteration": 2.6031999588012695 + }, + { + "auxiliary_loss_clip": 0.01127681, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02063251, + "balance_loss_mlp": 1.04535294, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.92677562296577, + "language_loss": 0.75667071, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77829683, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5902, + "time_per_iteration": 2.528026819229126 + }, + { + "auxiliary_loss_clip": 0.0112195, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02359962, + "balance_loss_mlp": 1.04320014, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.7304108811682997, + "language_loss": 0.70582771, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72741467, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7890625, + "step": 5903, + "time_per_iteration": 2.423454999923706 + }, + { + "auxiliary_loss_clip": 0.01124223, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.0205555, + "balance_loss_mlp": 1.04234982, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.52210089781831, + "language_loss": 0.74574983, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76734024, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5904, + "time_per_iteration": 2.462024688720703 + }, + { + "auxiliary_loss_clip": 0.0112423, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.04362941, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.0389703534000443, + "language_loss": 0.78855121, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81020248, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8046875, + "step": 5905, + "time_per_iteration": 2.418665885925293 + }, + { + "auxiliary_loss_clip": 0.0111773, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.0190388, + "balance_loss_mlp": 1.04383469, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 2.1398902938273547, + "language_loss": 0.72515827, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74664938, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 5906, + "time_per_iteration": 2.441795825958252 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.02187109, + "balance_loss_mlp": 1.04545534, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.0230910533888107, + "language_loss": 0.74762344, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.7692821, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5907, + "time_per_iteration": 2.4404122829437256 + }, + { + "auxiliary_loss_clip": 0.01123497, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01939988, + "balance_loss_mlp": 1.04492426, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7742327577799557, + "language_loss": 0.75751841, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77909136, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5908, + "time_per_iteration": 2.5631895065307617 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01961696, + "balance_loss_mlp": 1.04734707, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.7057235578436956, + "language_loss": 0.68026733, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70187092, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5909, + "time_per_iteration": 2.480511426925659 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.01937413, + "balance_loss_mlp": 1.04523396, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 3.5777269988287297, + "language_loss": 0.78628188, + "learning_rate": 2.988736221969144e-06, + "loss": 0.8078106, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 5910, + "time_per_iteration": 2.4763131141662598 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.02545595, + "balance_loss_mlp": 1.04625309, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.525011794663279, + "language_loss": 0.70639479, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72808856, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5911, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01119575, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.04294884, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 1.9668748220600272, + "language_loss": 0.87014282, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89169508, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 5912, + "time_per_iteration": 2.461251735687256 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.018556, + "balance_loss_mlp": 1.04507196, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.7619620740638822, + "language_loss": 0.7701745, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79172838, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5913, + "time_per_iteration": 2.4517738819122314 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02001143, + "balance_loss_mlp": 1.04793298, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3300117090522248, + "language_loss": 0.82507938, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84666395, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 5914, + "time_per_iteration": 2.4964141845703125 + }, + { + "auxiliary_loss_clip": 0.01124534, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.01771307, + "balance_loss_mlp": 1.04573739, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.1657623831524604, + "language_loss": 0.70703268, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72859794, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 5915, + "time_per_iteration": 2.5425658226013184 + }, + { + "auxiliary_loss_clip": 0.01120767, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.0249182, + "balance_loss_mlp": 1.04248476, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.7489130528457595, + "language_loss": 0.76365829, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78525031, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 5916, + "time_per_iteration": 2.49629545211792 + }, + { + "auxiliary_loss_clip": 0.01128234, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02360404, + "balance_loss_mlp": 1.04853928, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.96232440030472, + "language_loss": 0.88380635, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90545923, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.796875, + "step": 5917, + "time_per_iteration": 2.4549498558044434 + }, + { + "auxiliary_loss_clip": 0.01123557, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01144493, + "balance_loss_mlp": 1.04562521, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.0473051476373048, + "language_loss": 0.74389327, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76538098, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5918, + "time_per_iteration": 2.448164701461792 + }, + { + "auxiliary_loss_clip": 0.01039303, + "auxiliary_loss_mlp": 0.01015071, + "balance_loss_clip": 1.01344931, + "balance_loss_mlp": 1.01430607, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 1.0267040132589962, + "language_loss": 0.63732457, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65786839, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.25, + "step": 5919, + "time_per_iteration": 2.837815999984741 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01805615, + "balance_loss_mlp": 1.04376245, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.8747663216478503, + "language_loss": 0.73868048, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76025695, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5920, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02015972, + "balance_loss_mlp": 1.04353166, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.659561193633535, + "language_loss": 0.77124226, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79279101, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5921, + "time_per_iteration": 2.461014986038208 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.01968277, + "balance_loss_mlp": 1.04409981, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 3.1644779785561563, + "language_loss": 0.67710596, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69866371, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5922, + "time_per_iteration": 2.495504140853882 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01567268, + "balance_loss_mlp": 1.04373431, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 1.9745978513449503, + "language_loss": 0.79269004, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81421471, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5923, + "time_per_iteration": 2.4515416622161865 + }, + { + "auxiliary_loss_clip": 0.01124058, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02631271, + "balance_loss_mlp": 1.04502511, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7698063934253627, + "language_loss": 0.85475516, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87638795, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7890625, + "step": 5924, + "time_per_iteration": 2.4790685176849365 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.02344394, + "balance_loss_mlp": 1.04368067, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 1.844353158814239, + "language_loss": 0.77513188, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79672253, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 5925, + "time_per_iteration": 2.5064613819122314 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.0301789, + "balance_loss_mlp": 1.04067063, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.7016119178915972, + "language_loss": 0.75874609, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78037679, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5926, + "time_per_iteration": 2.451852798461914 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02194762, + "balance_loss_mlp": 1.04408717, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0486133546267737, + "language_loss": 0.69321811, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71483439, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5927, + "time_per_iteration": 2.4770915508270264 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.0237366, + "balance_loss_mlp": 1.0428226, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.8762651107969224, + "language_loss": 0.79633021, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81789798, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 5928, + "time_per_iteration": 4.019433259963989 + }, + { + "auxiliary_loss_clip": 0.01120965, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.02581263, + "balance_loss_mlp": 1.04338682, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4128421638180557, + "language_loss": 0.81568098, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83728826, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5929, + "time_per_iteration": 3.869184970855713 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02172661, + "balance_loss_mlp": 1.0402571, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7650523310611956, + "language_loss": 0.69981778, + "learning_rate": 2.981957928520201e-06, + "loss": 0.7213279, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5930, + "time_per_iteration": 2.418992519378662 + }, + { + "auxiliary_loss_clip": 0.01123929, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.02858853, + "balance_loss_mlp": 1.04340863, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 1.9164187115059894, + "language_loss": 0.67766178, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69933271, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5931, + "time_per_iteration": 2.4688074588775635 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.0203712, + "balance_loss_mlp": 1.04403675, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.736290109138699, + "language_loss": 0.67451715, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69607264, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5932, + "time_per_iteration": 2.4908299446105957 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.04304647, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.602576254435761, + "language_loss": 0.7878592, + "learning_rate": 2.980939897348969e-06, + "loss": 0.8093667, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 5933, + "time_per_iteration": 2.442464590072632 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.02893806, + "balance_loss_mlp": 1.04176354, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4946029259135472, + "language_loss": 0.69271672, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71436697, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5934, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02636409, + "balance_loss_mlp": 1.04726946, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.881720756405168, + "language_loss": 0.71268845, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73441839, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5935, + "time_per_iteration": 2.460548162460327 + }, + { + "auxiliary_loss_clip": 0.01124043, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.02476776, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.474293421119334, + "language_loss": 0.78293073, + "learning_rate": 2.979921531401692e-06, + "loss": 0.8045634, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5936, + "time_per_iteration": 2.4517645835876465 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.02472031, + "balance_loss_mlp": 1.04367638, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4518862241402966, + "language_loss": 0.64218014, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66379213, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 5937, + "time_per_iteration": 2.5837321281433105 + }, + { + "auxiliary_loss_clip": 0.01124449, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.04442978, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5143509931773553, + "language_loss": 0.77877963, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80041015, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5938, + "time_per_iteration": 2.4190945625305176 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02890944, + "balance_loss_mlp": 1.04582071, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.8770011073758637, + "language_loss": 0.80256367, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82424247, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5939, + "time_per_iteration": 2.5029094219207764 + }, + { + "auxiliary_loss_clip": 0.01126611, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.0213412, + "balance_loss_mlp": 1.04299128, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.6875415435298406, + "language_loss": 0.79203522, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81365997, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5940, + "time_per_iteration": 2.526545524597168 + }, + { + "auxiliary_loss_clip": 0.01124522, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.01704049, + "balance_loss_mlp": 1.0441246, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.480743427796476, + "language_loss": 0.72739166, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74895537, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5941, + "time_per_iteration": 2.4599413871765137 + }, + { + "auxiliary_loss_clip": 0.01123947, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.02546012, + "balance_loss_mlp": 1.04480743, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.979069530543237, + "language_loss": 0.64202702, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66367018, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 5942, + "time_per_iteration": 2.5174636840820312 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.04385567, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.2469009256176053, + "language_loss": 0.74055374, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76215225, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5943, + "time_per_iteration": 2.5392913818359375 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01008275, + "balance_loss_clip": 1.00640345, + "balance_loss_mlp": 1.01455188, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7872915284740177, + "language_loss": 0.60689372, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62737316, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25, + "step": 5944, + "time_per_iteration": 3.17051100730896 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02003157, + "balance_loss_mlp": 1.04313469, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.033108996495456, + "language_loss": 0.72646821, + "learning_rate": 2.976864428379655e-06, + "loss": 0.7480244, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5945, + "time_per_iteration": 2.444373846054077 + }, + { + "auxiliary_loss_clip": 0.01121962, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04313612, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7423109631574678, + "language_loss": 0.81255424, + "learning_rate": 2.976524564880326e-06, + "loss": 0.8341651, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 5946, + "time_per_iteration": 2.470513343811035 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.02808666, + "balance_loss_mlp": 1.04524601, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.9099881709146462, + "language_loss": 0.68893784, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71061212, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5947, + "time_per_iteration": 2.4653477668762207 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04441905, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.655085874443405, + "language_loss": 0.75428057, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77588153, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 5948, + "time_per_iteration": 2.4385483264923096 + }, + { + "auxiliary_loss_clip": 0.01119692, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.02650094, + "balance_loss_mlp": 1.04049134, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.354345427402619, + "language_loss": 0.70556438, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72717237, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5949, + "time_per_iteration": 2.4992663860321045 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02567744, + "balance_loss_mlp": 1.04348552, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.8941983472442732, + "language_loss": 0.77248389, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79408723, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 5950, + "time_per_iteration": 2.4295101165771484 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.02394795, + "balance_loss_mlp": 1.04274225, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5707876816938207, + "language_loss": 0.72766685, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74928057, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5951, + "time_per_iteration": 2.444349765777588 + }, + { + "auxiliary_loss_clip": 0.0112562, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.04390478, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 1.9955959935597258, + "language_loss": 0.69730532, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71895468, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5952, + "time_per_iteration": 2.49656081199646 + }, + { + "auxiliary_loss_clip": 0.01120518, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.02497923, + "balance_loss_mlp": 1.04271066, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.0583657570083416, + "language_loss": 0.69432503, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71591723, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5953, + "time_per_iteration": 2.6221721172332764 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01641417, + "balance_loss_mlp": 1.04322994, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.5429391611916807, + "language_loss": 0.66673422, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68824828, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 5954, + "time_per_iteration": 2.465116262435913 + }, + { + "auxiliary_loss_clip": 0.01123263, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02383566, + "balance_loss_mlp": 1.04475307, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.7040470297828096, + "language_loss": 0.74838006, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76998997, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 5955, + "time_per_iteration": 2.4968783855438232 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.0219382, + "balance_loss_mlp": 1.04289603, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.6820855707774873, + "language_loss": 0.76043999, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78197372, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 5956, + "time_per_iteration": 2.498699903488159 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.0214982, + "balance_loss_mlp": 1.04263568, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.7390523407913014, + "language_loss": 0.73059452, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75211895, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 5957, + "time_per_iteration": 2.4503817558288574 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.02197433, + "balance_loss_mlp": 1.04503369, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.990259024529503, + "language_loss": 0.70640051, + "learning_rate": 2.972443318242726e-06, + "loss": 0.7279774, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5958, + "time_per_iteration": 2.4611945152282715 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.0165484, + "balance_loss_mlp": 1.0413444, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.7206269565580243, + "language_loss": 0.88610697, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90757084, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 5959, + "time_per_iteration": 2.5129401683807373 + }, + { + "auxiliary_loss_clip": 0.01121057, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.02281785, + "balance_loss_mlp": 1.04400599, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.483187088646708, + "language_loss": 0.58103061, + "learning_rate": 2.971762593615679e-06, + "loss": 0.6026091, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 5960, + "time_per_iteration": 2.5110409259796143 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02201176, + "balance_loss_mlp": 1.04267251, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 1.9323395592862886, + "language_loss": 0.76102602, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78260595, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 5961, + "time_per_iteration": 2.46943736076355 + }, + { + "auxiliary_loss_clip": 0.01121367, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.01862621, + "balance_loss_mlp": 1.04458857, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8327349140058107, + "language_loss": 0.69974017, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72128505, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 5962, + "time_per_iteration": 2.5654361248016357 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0210433, + "balance_loss_mlp": 1.04321802, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5613001239774846, + "language_loss": 0.74749398, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76901346, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.75390625, + "step": 5963, + "time_per_iteration": 2.5135319232940674 + }, + { + "auxiliary_loss_clip": 0.01122172, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02149796, + "balance_loss_mlp": 1.04597044, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5825069258384938, + "language_loss": 0.78811383, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80968547, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 5964, + "time_per_iteration": 2.493169069290161 + }, + { + "auxiliary_loss_clip": 0.01124119, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.01870322, + "balance_loss_mlp": 1.04482806, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.8296471859577264, + "language_loss": 0.66694742, + "learning_rate": 2.970060137410626e-06, + "loss": 0.6885215, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5965, + "time_per_iteration": 2.4995884895324707 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.04270399, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 4.210402322068537, + "language_loss": 0.79008359, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81161171, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5966, + "time_per_iteration": 2.485438346862793 + }, + { + "auxiliary_loss_clip": 0.01121545, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02238536, + "balance_loss_mlp": 1.04341781, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 5.107721360348662, + "language_loss": 0.90911728, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93070352, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 5967, + "time_per_iteration": 2.547287702560425 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02648592, + "balance_loss_mlp": 1.04528475, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.7620117516801617, + "language_loss": 0.79739827, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.81907177, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 5968, + "time_per_iteration": 2.4543471336364746 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01052904, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.04604244, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.0044885906540424, + "language_loss": 0.83642054, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85822409, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5969, + "time_per_iteration": 2.502815008163452 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04245603, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.876228198696561, + "language_loss": 0.72377515, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74528718, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5970, + "time_per_iteration": 4.051819086074829 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.02178049, + "balance_loss_mlp": 1.0424037, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6844020581036279, + "language_loss": 0.79522693, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81676805, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5971, + "time_per_iteration": 3.8910434246063232 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.02427924, + "balance_loss_mlp": 1.0402174, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.924864359347905, + "language_loss": 0.78594625, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80753887, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5972, + "time_per_iteration": 2.4272611141204834 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.02378309, + "balance_loss_mlp": 1.04185021, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 3.2741380987368327, + "language_loss": 0.81252539, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83410573, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5973, + "time_per_iteration": 2.469438314437866 + }, + { + "auxiliary_loss_clip": 0.0103695, + "auxiliary_loss_mlp": 0.01001955, + "balance_loss_clip": 0.9999882, + "balance_loss_mlp": 1.01160312, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9181567019376142, + "language_loss": 0.56828684, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58867586, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.25390625, + "step": 5974, + "time_per_iteration": 2.918166399002075 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02548242, + "balance_loss_mlp": 1.04407859, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6252506462115286, + "language_loss": 0.68750453, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7091189, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78515625, + "step": 5975, + "time_per_iteration": 2.4578702449798584 + }, + { + "auxiliary_loss_clip": 0.01119888, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.02132642, + "balance_loss_mlp": 1.04269934, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.7542310571392548, + "language_loss": 0.79961413, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82115752, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 5976, + "time_per_iteration": 2.494723081588745 + }, + { + "auxiliary_loss_clip": 0.01119534, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.04172039, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.7409485188517788, + "language_loss": 0.79081398, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81242788, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 5977, + "time_per_iteration": 2.4949100017547607 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02151847, + "balance_loss_mlp": 1.04029524, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7920092294573908, + "language_loss": 0.80654621, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82805401, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 5978, + "time_per_iteration": 2.445866584777832 + }, + { + "auxiliary_loss_clip": 0.01122409, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.04394007, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5382295990908517, + "language_loss": 0.67741489, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69898772, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5979, + "time_per_iteration": 2.538585662841797 + }, + { + "auxiliary_loss_clip": 0.01119324, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.02478838, + "balance_loss_mlp": 1.04136634, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.3207911240165697, + "language_loss": 0.67176729, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69334549, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5980, + "time_per_iteration": 2.4896938800811768 + }, + { + "auxiliary_loss_clip": 0.01123377, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.02377748, + "balance_loss_mlp": 1.0416832, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8107777091561479, + "language_loss": 0.71148199, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73310816, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.81640625, + "step": 5981, + "time_per_iteration": 2.49064302444458 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.02387476, + "balance_loss_mlp": 1.0432086, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.7933500913622242, + "language_loss": 0.71331298, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73492229, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5982, + "time_per_iteration": 2.5167934894561768 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02361536, + "balance_loss_mlp": 1.0427959, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6761533335073455, + "language_loss": 0.75808942, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77962971, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 5983, + "time_per_iteration": 2.4915101528167725 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.02771819, + "balance_loss_mlp": 1.04474413, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.1804669018597043, + "language_loss": 0.76302433, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78472364, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5984, + "time_per_iteration": 2.436640501022339 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.02356207, + "balance_loss_mlp": 1.0420785, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.837904559260202, + "language_loss": 0.86617446, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88773406, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 5985, + "time_per_iteration": 2.476853609085083 + }, + { + "auxiliary_loss_clip": 0.0111809, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02130079, + "balance_loss_mlp": 1.04078126, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.416236209566339, + "language_loss": 0.72801065, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74955392, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 5986, + "time_per_iteration": 2.443871021270752 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02354908, + "balance_loss_mlp": 1.04230642, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.880079313238184, + "language_loss": 0.73711401, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75873649, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5987, + "time_per_iteration": 2.517045736312866 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.04161966, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 1.8583263097896845, + "language_loss": 0.69824201, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71982217, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5988, + "time_per_iteration": 2.484654426574707 + }, + { + "auxiliary_loss_clip": 0.01125207, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.02675915, + "balance_loss_mlp": 1.04297233, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.851186734533378, + "language_loss": 0.72918314, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75084746, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5989, + "time_per_iteration": 2.464378833770752 + }, + { + "auxiliary_loss_clip": 0.01120868, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.0194943, + "balance_loss_mlp": 1.04283333, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.8425061302669492, + "language_loss": 0.79664916, + "learning_rate": 2.961534094403931e-06, + "loss": 0.81818593, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.78125, + "step": 5990, + "time_per_iteration": 2.4947755336761475 + }, + { + "auxiliary_loss_clip": 0.01121243, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.04281235, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.9352260247419832, + "language_loss": 0.84225297, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86375415, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 5991, + "time_per_iteration": 2.4728991985321045 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.02490079, + "balance_loss_mlp": 1.04197788, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9640325518662143, + "language_loss": 0.75616056, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77778924, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.81640625, + "step": 5992, + "time_per_iteration": 2.4422738552093506 + }, + { + "auxiliary_loss_clip": 0.01119253, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02022302, + "balance_loss_mlp": 1.04177451, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 6.32582004359923, + "language_loss": 0.77500135, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79654288, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 5993, + "time_per_iteration": 2.4513776302337646 + }, + { + "auxiliary_loss_clip": 0.01124951, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.02281737, + "balance_loss_mlp": 1.04405534, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.9096274983436938, + "language_loss": 0.74686468, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7684797, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80859375, + "step": 5994, + "time_per_iteration": 2.4278860092163086 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.02506554, + "balance_loss_mlp": 1.04320991, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8397117218597796, + "language_loss": 0.68890274, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71053243, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5995, + "time_per_iteration": 2.462557554244995 + }, + { + "auxiliary_loss_clip": 0.01124519, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.02548289, + "balance_loss_mlp": 1.04238582, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7352965040741237, + "language_loss": 0.82057822, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84222531, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8203125, + "step": 5996, + "time_per_iteration": 2.4284703731536865 + }, + { + "auxiliary_loss_clip": 0.01119849, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.04242694, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.56212250683249, + "language_loss": 0.73570979, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75725353, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5997, + "time_per_iteration": 2.4418485164642334 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.04307055, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.1655767572067637, + "language_loss": 0.68651283, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.70807832, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5998, + "time_per_iteration": 2.435884475708008 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.01983321, + "balance_loss_mlp": 1.04494119, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.6750874406601914, + "language_loss": 0.77190387, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79348445, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5999, + "time_per_iteration": 2.415649175643921 + }, + { + "auxiliary_loss_clip": 0.01123679, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.02381015, + "balance_loss_mlp": 1.04481769, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 2.719833162653021, + "language_loss": 0.78307509, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80469108, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 6000, + "time_per_iteration": 2.450085401535034 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.02061474, + "balance_loss_mlp": 1.04283905, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6917067376727954, + "language_loss": 0.78621352, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80777717, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6001, + "time_per_iteration": 2.4247405529022217 + }, + { + "auxiliary_loss_clip": 0.01119251, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.01761651, + "balance_loss_mlp": 1.04341698, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.9017223481518102, + "language_loss": 0.83743405, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85893983, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6002, + "time_per_iteration": 2.4587790966033936 + }, + { + "auxiliary_loss_clip": 0.01117677, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.01753855, + "balance_loss_mlp": 1.04298413, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.101850625944426, + "language_loss": 0.90627617, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92775667, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6003, + "time_per_iteration": 2.450408697128296 + }, + { + "auxiliary_loss_clip": 0.01040628, + "auxiliary_loss_mlp": 0.01013073, + "balance_loss_clip": 1.01102221, + "balance_loss_mlp": 1.01496768, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8843653445723816, + "language_loss": 0.53374904, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55428606, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.25585938, + "step": 6004, + "time_per_iteration": 3.005659341812134 + }, + { + "auxiliary_loss_clip": 0.01121195, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02152824, + "balance_loss_mlp": 1.04164577, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7248099575523852, + "language_loss": 0.77609527, + "learning_rate": 2.956407517225883e-06, + "loss": 0.7976777, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.796875, + "step": 6005, + "time_per_iteration": 2.4916067123413086 + }, + { + "auxiliary_loss_clip": 0.01124405, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.04700613, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.24467290311728, + "language_loss": 0.79267776, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81428248, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 6006, + "time_per_iteration": 2.4366166591644287 + }, + { + "auxiliary_loss_clip": 0.01124848, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02309775, + "balance_loss_mlp": 1.04587984, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.7888636143213261, + "language_loss": 0.84360719, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86524487, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.7890625, + "step": 6007, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.04622328, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.0771979180574425, + "language_loss": 0.72564125, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74731576, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 6008, + "time_per_iteration": 2.4473018646240234 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02066362, + "balance_loss_mlp": 1.04255283, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.9836274680059969, + "language_loss": 0.8284781, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85002339, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 6009, + "time_per_iteration": 2.470031261444092 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.02945232, + "balance_loss_mlp": 1.04598057, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.714442270200285, + "language_loss": 0.76139152, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78308332, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6010, + "time_per_iteration": 2.446833848953247 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0236752, + "balance_loss_mlp": 1.04619896, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.905716478313633, + "language_loss": 0.82946253, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85107422, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6011, + "time_per_iteration": 2.508147716522217 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.0264287, + "balance_loss_mlp": 1.0491302, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8484903271380355, + "language_loss": 0.62762833, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64936543, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 6012, + "time_per_iteration": 5.36588454246521 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02112007, + "balance_loss_mlp": 1.04337454, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.8689670235824563, + "language_loss": 0.84111822, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86265635, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6013, + "time_per_iteration": 2.494051933288574 + }, + { + "auxiliary_loss_clip": 0.01124804, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02690601, + "balance_loss_mlp": 1.04570448, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.7351999387675028, + "language_loss": 0.91496456, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93662584, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6014, + "time_per_iteration": 2.4356749057769775 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_clip": 1.03078914, + "balance_loss_mlp": 1.04549718, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.727703603585928, + "language_loss": 0.73830914, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75999045, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6015, + "time_per_iteration": 2.4990644454956055 + }, + { + "auxiliary_loss_clip": 0.01125644, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02394915, + "balance_loss_mlp": 1.04633307, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7277224025907603, + "language_loss": 0.65316677, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67480516, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6016, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.01126131, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02522171, + "balance_loss_mlp": 1.04727304, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.744160138264151, + "language_loss": 0.72101283, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74268931, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6017, + "time_per_iteration": 2.638683795928955 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.02413559, + "balance_loss_mlp": 1.04454577, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.9120538903838002, + "language_loss": 0.73590356, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75755334, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 6018, + "time_per_iteration": 2.4477858543395996 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02013361, + "balance_loss_mlp": 1.04458487, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.754547200149591, + "language_loss": 0.69080901, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71234632, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6019, + "time_per_iteration": 2.519831657409668 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.01980555, + "balance_loss_mlp": 1.0443728, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.5467952079219929, + "language_loss": 0.76299942, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78459549, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6020, + "time_per_iteration": 2.4692177772521973 + }, + { + "auxiliary_loss_clip": 0.01125932, + "auxiliary_loss_mlp": 0.01043324, + "balance_loss_clip": 1.02814841, + "balance_loss_mlp": 1.04721653, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.938447153390643, + "language_loss": 0.73921824, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76091087, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6021, + "time_per_iteration": 2.5069808959960938 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.02059376, + "balance_loss_mlp": 1.04596186, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.8648032073369731, + "language_loss": 0.80978441, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83135605, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 6022, + "time_per_iteration": 2.4620115756988525 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.02212477, + "balance_loss_mlp": 1.04778302, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6799220656127192, + "language_loss": 0.81351119, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83508855, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6023, + "time_per_iteration": 2.4969308376312256 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01975274, + "balance_loss_mlp": 1.04494548, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8793265875700644, + "language_loss": 0.79767907, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81927156, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6024, + "time_per_iteration": 2.468369245529175 + }, + { + "auxiliary_loss_clip": 0.01119855, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.01629043, + "balance_loss_mlp": 1.04456711, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.7897574616215441, + "language_loss": 0.74720407, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7687006, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6025, + "time_per_iteration": 2.4410412311553955 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01849484, + "balance_loss_mlp": 1.04340899, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.5522426900619628, + "language_loss": 0.72055018, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74207234, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6026, + "time_per_iteration": 2.4997596740722656 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.02645707, + "balance_loss_mlp": 1.04604256, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.401846993246305, + "language_loss": 0.79332775, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81502712, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 6027, + "time_per_iteration": 2.5326383113861084 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.02576041, + "balance_loss_mlp": 1.04399586, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7548337209278033, + "language_loss": 0.67809385, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69973445, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6028, + "time_per_iteration": 2.548088788986206 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.01922584, + "balance_loss_mlp": 1.04415894, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.63067637662311, + "language_loss": 0.85700679, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.8785423, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 6029, + "time_per_iteration": 2.429720878601074 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.02412939, + "balance_loss_mlp": 1.04442835, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.6511023563359555, + "language_loss": 0.72693753, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74851942, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6030, + "time_per_iteration": 2.4299302101135254 + }, + { + "auxiliary_loss_clip": 0.01123199, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.04264557, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.02536170930057, + "language_loss": 0.73986644, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76151514, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8046875, + "step": 6031, + "time_per_iteration": 2.4376232624053955 + }, + { + "auxiliary_loss_clip": 0.01120355, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0177424, + "balance_loss_mlp": 1.04309845, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.8475328889194098, + "language_loss": 0.73286617, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75438625, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6032, + "time_per_iteration": 2.4811155796051025 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.02412748, + "balance_loss_mlp": 1.0427382, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.684246043345259, + "language_loss": 0.77953577, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80113035, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 6033, + "time_per_iteration": 2.4283456802368164 + }, + { + "auxiliary_loss_clip": 0.01040416, + "auxiliary_loss_mlp": 0.01019079, + "balance_loss_clip": 1.01733828, + "balance_loss_mlp": 1.01487339, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.786107382559835, + "language_loss": 0.64822888, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66882384, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25585938, + "step": 6034, + "time_per_iteration": 3.1253511905670166 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.02139246, + "balance_loss_mlp": 1.04131126, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.4985312456135769, + "language_loss": 0.90059769, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92213392, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6035, + "time_per_iteration": 2.4888923168182373 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.04239392, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7493433732375512, + "language_loss": 0.73526931, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7568388, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6036, + "time_per_iteration": 2.445058822631836 + }, + { + "auxiliary_loss_clip": 0.01124436, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01500189, + "balance_loss_mlp": 1.04274487, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.901551926176817, + "language_loss": 0.75938255, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78091925, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.81640625, + "step": 6037, + "time_per_iteration": 2.422229766845703 + }, + { + "auxiliary_loss_clip": 0.0111661, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0238812, + "balance_loss_mlp": 1.04227912, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6899683541385933, + "language_loss": 0.78120697, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80275297, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6038, + "time_per_iteration": 2.4582855701446533 + }, + { + "auxiliary_loss_clip": 0.0103994, + "auxiliary_loss_mlp": 0.01006466, + "balance_loss_clip": 1.00467765, + "balance_loss_mlp": 1.01452303, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8286249809211084, + "language_loss": 0.63413143, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65459549, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.25390625, + "step": 6039, + "time_per_iteration": 3.1417860984802246 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04391789, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 1.9215128015710738, + "language_loss": 0.70857447, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73013067, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6040, + "time_per_iteration": 2.505627155303955 + }, + { + "auxiliary_loss_clip": 0.0112497, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.01827383, + "balance_loss_mlp": 1.04445744, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.327350689124367, + "language_loss": 0.81322253, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83479762, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6041, + "time_per_iteration": 2.4475231170654297 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.01526928, + "balance_loss_mlp": 1.04150891, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.252727008735842, + "language_loss": 0.83721769, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85872102, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6042, + "time_per_iteration": 2.461111545562744 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.02583623, + "balance_loss_mlp": 1.04390788, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6644116234057968, + "language_loss": 0.78122932, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80283511, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6043, + "time_per_iteration": 2.477030038833618 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.02017403, + "balance_loss_mlp": 1.04266226, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 3.8032713581650515, + "language_loss": 0.65792918, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67945337, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.74609375, + "step": 6044, + "time_per_iteration": 2.471221446990967 + }, + { + "auxiliary_loss_clip": 0.01118191, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01706135, + "balance_loss_mlp": 1.04186332, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.74112377533005, + "language_loss": 0.80978471, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83127558, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6045, + "time_per_iteration": 2.482147693634033 + }, + { + "auxiliary_loss_clip": 0.01122131, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.04342091, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.7414472049280392, + "language_loss": 0.64214617, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66375309, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6046, + "time_per_iteration": 2.593209743499756 + }, + { + "auxiliary_loss_clip": 0.01119542, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02142787, + "balance_loss_mlp": 1.04214859, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.623453692259123, + "language_loss": 0.77366132, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.7952106, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6047, + "time_per_iteration": 2.4650797843933105 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.02609777, + "balance_loss_mlp": 1.04148006, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.508802610673932, + "language_loss": 0.79679012, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81846434, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8359375, + "step": 6048, + "time_per_iteration": 2.5329999923706055 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99839348, + "balance_loss_mlp": 1.0124383, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7564639677567045, + "language_loss": 0.52584642, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54622656, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25390625, + "step": 6049, + "time_per_iteration": 3.1051762104034424 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.02172136, + "balance_loss_mlp": 1.04254675, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0453292842004833, + "language_loss": 0.86365628, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88522977, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6050, + "time_per_iteration": 2.469092845916748 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04309154, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7649295268136813, + "language_loss": 0.7855531, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80711287, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6051, + "time_per_iteration": 2.425166368484497 + }, + { + "auxiliary_loss_clip": 0.0111821, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02044129, + "balance_loss_mlp": 1.04047346, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 2.0280679706971423, + "language_loss": 0.83024764, + "learning_rate": 2.940291602812822e-06, + "loss": 0.8517735, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6052, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02146947, + "balance_loss_mlp": 1.03992438, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 3.055248278017369, + "language_loss": 0.72156489, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74305683, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6053, + "time_per_iteration": 4.030078887939453 + }, + { + "auxiliary_loss_clip": 0.01038499, + "auxiliary_loss_mlp": 0.01000787, + "balance_loss_clip": 0.99893934, + "balance_loss_mlp": 1.01315093, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7695228081579073, + "language_loss": 0.61234874, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63274157, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.25390625, + "step": 6054, + "time_per_iteration": 4.498634576797485 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.0425837, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.9647165397438333, + "language_loss": 0.75846946, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78007108, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6055, + "time_per_iteration": 2.46478271484375 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.0240891, + "balance_loss_mlp": 1.04369521, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.6567803669377452, + "language_loss": 0.75263339, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7742365, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6056, + "time_per_iteration": 2.4739041328430176 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.04331231, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.0844054878938607, + "language_loss": 0.80676425, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82835501, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6057, + "time_per_iteration": 2.4778594970703125 + }, + { + "auxiliary_loss_clip": 0.01119344, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02457452, + "balance_loss_mlp": 1.04333091, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8744131952209395, + "language_loss": 0.79986346, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82144856, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6058, + "time_per_iteration": 2.5267081260681152 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02142191, + "balance_loss_mlp": 1.04207647, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.8448855765347556, + "language_loss": 0.8485254, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87007678, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6059, + "time_per_iteration": 2.4876210689544678 + }, + { + "auxiliary_loss_clip": 0.01123355, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.02527666, + "balance_loss_mlp": 1.04397857, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.4958849024653313, + "language_loss": 0.8783946, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90002865, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6060, + "time_per_iteration": 2.516439199447632 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02583957, + "balance_loss_mlp": 1.04366183, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.6600271028380824, + "language_loss": 0.67965293, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70130551, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6061, + "time_per_iteration": 2.4436440467834473 + }, + { + "auxiliary_loss_clip": 0.01127668, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.02444267, + "balance_loss_mlp": 1.04622734, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.142951671935031, + "language_loss": 0.75072217, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77239573, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 6062, + "time_per_iteration": 2.4325368404388428 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01910567, + "balance_loss_mlp": 1.04460645, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.6782897381106048, + "language_loss": 0.72632384, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74789596, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6063, + "time_per_iteration": 2.498168468475342 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.0249579, + "balance_loss_mlp": 1.04365671, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8702732296649918, + "language_loss": 0.68128121, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70288265, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6064, + "time_per_iteration": 2.4951584339141846 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01046801, + "balance_loss_clip": 1.03205502, + "balance_loss_mlp": 1.04549003, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.679838788119498, + "language_loss": 0.74604851, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76777375, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6065, + "time_per_iteration": 2.4980344772338867 + }, + { + "auxiliary_loss_clip": 0.01125488, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.02544403, + "balance_loss_mlp": 1.04464209, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.8520658730284223, + "language_loss": 0.75248677, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77415788, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6066, + "time_per_iteration": 2.5525264739990234 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02101445, + "balance_loss_mlp": 1.04115653, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.55479391525507, + "language_loss": 0.76988614, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79139876, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6067, + "time_per_iteration": 2.440595865249634 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02422583, + "balance_loss_mlp": 1.04442596, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.1532465459722574, + "language_loss": 0.70826519, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72984099, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6068, + "time_per_iteration": 2.4555468559265137 + }, + { + "auxiliary_loss_clip": 0.01123082, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.04301953, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8428063971352102, + "language_loss": 0.73987395, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76148373, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 6069, + "time_per_iteration": 2.4380593299865723 + }, + { + "auxiliary_loss_clip": 0.01124432, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.0203104, + "balance_loss_mlp": 1.04434299, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.740540431199334, + "language_loss": 0.66149801, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68309319, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 6070, + "time_per_iteration": 2.4852278232574463 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.02225685, + "balance_loss_mlp": 1.04412127, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.5531027619052142, + "language_loss": 0.74474913, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76631367, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6071, + "time_per_iteration": 2.483961820602417 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.04232538, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 2.0347636440980277, + "language_loss": 0.88132894, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90287089, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6072, + "time_per_iteration": 2.4083876609802246 + }, + { + "auxiliary_loss_clip": 0.01121735, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.02184379, + "balance_loss_mlp": 1.04389739, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.230203116909298, + "language_loss": 0.72432441, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74589849, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6073, + "time_per_iteration": 2.4769015312194824 + }, + { + "auxiliary_loss_clip": 0.01123465, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.02520275, + "balance_loss_mlp": 1.04425395, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.8811318432297164, + "language_loss": 0.66584921, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68747932, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6074, + "time_per_iteration": 2.4474194049835205 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01891208, + "balance_loss_mlp": 1.04079318, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.5068114870819531, + "language_loss": 0.72946787, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75097322, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6075, + "time_per_iteration": 2.5063765048980713 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.04484594, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.7314154698808113, + "language_loss": 0.8938573, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91555977, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 6076, + "time_per_iteration": 2.4518303871154785 + }, + { + "auxiliary_loss_clip": 0.01121617, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02438378, + "balance_loss_mlp": 1.04457617, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.2164690925931976, + "language_loss": 0.69506466, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71667087, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6077, + "time_per_iteration": 2.447659730911255 + }, + { + "auxiliary_loss_clip": 0.01043202, + "auxiliary_loss_mlp": 0.01009421, + "balance_loss_clip": 1.00758541, + "balance_loss_mlp": 1.01693892, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7520139059893192, + "language_loss": 0.61798048, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63850671, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.26171875, + "step": 6078, + "time_per_iteration": 3.1669509410858154 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02238369, + "balance_loss_mlp": 1.04217839, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.8851740765331422, + "language_loss": 0.78088033, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80244297, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6079, + "time_per_iteration": 2.4570510387420654 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02517176, + "balance_loss_mlp": 1.04497504, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.129422570654268, + "language_loss": 0.62885886, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65051121, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6080, + "time_per_iteration": 2.65580415725708 + }, + { + "auxiliary_loss_clip": 0.01122781, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.04280567, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.4061972925673385, + "language_loss": 0.67665905, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69823289, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6081, + "time_per_iteration": 2.4747202396392822 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.02287912, + "balance_loss_mlp": 1.04305673, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.9082106177767983, + "language_loss": 0.74747473, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76910245, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 6082, + "time_per_iteration": 2.5238633155822754 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01576877, + "balance_loss_mlp": 1.04598689, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.8091692998669453, + "language_loss": 0.82823056, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84978318, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8046875, + "step": 6083, + "time_per_iteration": 2.517704963684082 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.00026309, + "balance_loss_mlp": 1.01621974, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8152809684063654, + "language_loss": 0.59372437, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61416495, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25585938, + "step": 6084, + "time_per_iteration": 3.126275062561035 + }, + { + "auxiliary_loss_clip": 0.01121734, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02273536, + "balance_loss_mlp": 1.04410744, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.719357970509058, + "language_loss": 0.73096633, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75255334, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6085, + "time_per_iteration": 2.436722755432129 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01485801, + "balance_loss_mlp": 1.0447793, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 4.360512376704014, + "language_loss": 0.7831111, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80462652, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 6086, + "time_per_iteration": 2.557521104812622 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0150919, + "balance_loss_mlp": 1.0403074, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.7974113126538098, + "language_loss": 0.77105325, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79248881, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6087, + "time_per_iteration": 2.544868230819702 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01587856, + "balance_loss_mlp": 1.04190612, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 5.741725291334025, + "language_loss": 0.70710862, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72863311, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6088, + "time_per_iteration": 2.491933822631836 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 1.04569137, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.6695945607154594, + "language_loss": 0.79878473, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82043338, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 6089, + "time_per_iteration": 2.666814088821411 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01816463, + "balance_loss_mlp": 1.04267049, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.7190941707632215, + "language_loss": 0.71335226, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73486418, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 6090, + "time_per_iteration": 2.5138063430786133 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02757084, + "balance_loss_mlp": 1.04391527, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.9784029627642763, + "language_loss": 0.74276829, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76437145, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6091, + "time_per_iteration": 2.437126636505127 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.04396391, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.8707748404117035, + "language_loss": 0.72492194, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74652249, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6092, + "time_per_iteration": 2.5038540363311768 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03072441, + "balance_loss_mlp": 1.04359424, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9548617375197639, + "language_loss": 0.78251863, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.8041966, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6093, + "time_per_iteration": 2.453854560852051 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.02226686, + "balance_loss_mlp": 1.04095936, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.7535936892187265, + "language_loss": 0.74123377, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76279384, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 6094, + "time_per_iteration": 2.5953075885772705 + }, + { + "auxiliary_loss_clip": 0.01125058, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.0381875, + "balance_loss_mlp": 1.04492939, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.5564182913572622, + "language_loss": 0.79226458, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81404281, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80078125, + "step": 6095, + "time_per_iteration": 5.4338037967681885 + }, + { + "auxiliary_loss_clip": 0.01125087, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.02055264, + "balance_loss_mlp": 1.04422212, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.287741364035224, + "language_loss": 0.73586392, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75747252, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 6096, + "time_per_iteration": 3.923590660095215 + }, + { + "auxiliary_loss_clip": 0.0112257, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.0195781, + "balance_loss_mlp": 1.04206252, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.2038030169597875, + "language_loss": 0.67285162, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69441259, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6097, + "time_per_iteration": 2.4843504428863525 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.0249629, + "balance_loss_mlp": 1.04401898, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.6404590263223953, + "language_loss": 0.77676886, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79839253, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 6098, + "time_per_iteration": 2.5663979053497314 + }, + { + "auxiliary_loss_clip": 0.0111895, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.02467644, + "balance_loss_mlp": 1.04334557, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.7512654587161538, + "language_loss": 0.73807114, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.7596488, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6099, + "time_per_iteration": 2.442549705505371 + }, + { + "auxiliary_loss_clip": 0.01116483, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.03380322, + "balance_loss_mlp": 1.04073739, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.739052204204903, + "language_loss": 0.84383607, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86547315, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6100, + "time_per_iteration": 2.4783878326416016 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02046633, + "balance_loss_mlp": 1.04215789, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.450199870045222, + "language_loss": 0.70504647, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72663701, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 6101, + "time_per_iteration": 2.4591257572174072 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.02629983, + "balance_loss_mlp": 1.04228854, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0513606804107543, + "language_loss": 0.76049435, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78212953, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.79296875, + "step": 6102, + "time_per_iteration": 2.491046190261841 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04445052, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.6383228145690705, + "language_loss": 0.69930172, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72093487, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 6103, + "time_per_iteration": 2.676790952682495 + }, + { + "auxiliary_loss_clip": 0.01121704, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02213061, + "balance_loss_mlp": 1.0423454, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8701272650505458, + "language_loss": 0.71414149, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73572791, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6104, + "time_per_iteration": 2.438197374343872 + }, + { + "auxiliary_loss_clip": 0.01121263, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.04288161, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.0275913231037923, + "language_loss": 0.81653488, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83807302, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6105, + "time_per_iteration": 2.437201976776123 + }, + { + "auxiliary_loss_clip": 0.0112675, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.02255476, + "balance_loss_mlp": 1.0441767, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7477833912391936, + "language_loss": 0.81079835, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83243787, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 6106, + "time_per_iteration": 2.5447771549224854 + }, + { + "auxiliary_loss_clip": 0.01041229, + "auxiliary_loss_mlp": 0.0100622, + "balance_loss_clip": 1.00440836, + "balance_loss_mlp": 1.01511836, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6829750500510474, + "language_loss": 0.59212124, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.6125958, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.26171875, + "step": 6107, + "time_per_iteration": 3.0983083248138428 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.01756859, + "balance_loss_mlp": 1.04195333, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.5524752326282045, + "language_loss": 0.74417794, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7656877, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6108, + "time_per_iteration": 2.5146114826202393 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02218354, + "balance_loss_mlp": 1.04104972, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.0732100862766294, + "language_loss": 0.73141801, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7529856, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 6109, + "time_per_iteration": 2.4597368240356445 + }, + { + "auxiliary_loss_clip": 0.01118669, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02015734, + "balance_loss_mlp": 1.0407654, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.8280489650426288, + "language_loss": 0.53282952, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55435723, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6110, + "time_per_iteration": 2.5454814434051514 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.01949728, + "balance_loss_mlp": 1.04360104, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4515242715586747, + "language_loss": 0.8026799, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82422882, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76953125, + "step": 6111, + "time_per_iteration": 2.4838016033172607 + }, + { + "auxiliary_loss_clip": 0.01119124, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02637434, + "balance_loss_mlp": 1.04195952, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7574831080907656, + "language_loss": 0.72220403, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74380273, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6112, + "time_per_iteration": 2.590109348297119 + }, + { + "auxiliary_loss_clip": 0.01120572, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.04220295, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6166739673118746, + "language_loss": 0.85398543, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87558413, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6113, + "time_per_iteration": 2.4480674266815186 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.02228022, + "balance_loss_mlp": 1.04214144, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.8814317352542869, + "language_loss": 0.78741604, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80901164, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 6114, + "time_per_iteration": 2.4870779514312744 + }, + { + "auxiliary_loss_clip": 0.01125295, + "auxiliary_loss_mlp": 0.01044195, + "balance_loss_clip": 1.0278033, + "balance_loss_mlp": 1.04344988, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.5830307408310422, + "language_loss": 0.66854429, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69023919, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 6115, + "time_per_iteration": 2.4361841678619385 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.01953745, + "balance_loss_mlp": 1.03984118, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.406761648754093, + "language_loss": 0.76663208, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78811574, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6116, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01119646, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.04111099, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.9705222106020779, + "language_loss": 0.62811542, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64971662, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 6117, + "time_per_iteration": 2.443798065185547 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.019261, + "balance_loss_mlp": 1.04137671, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9131647495504847, + "language_loss": 0.72974634, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75126612, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6118, + "time_per_iteration": 2.531804084777832 + }, + { + "auxiliary_loss_clip": 0.01123956, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.02694678, + "balance_loss_mlp": 1.04156733, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.002097677722335, + "language_loss": 0.72413695, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7457996, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 6119, + "time_per_iteration": 2.4641144275665283 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.0179317, + "balance_loss_mlp": 1.04397964, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.84976209385018, + "language_loss": 0.79848421, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82002181, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6120, + "time_per_iteration": 2.487030029296875 + }, + { + "auxiliary_loss_clip": 0.01117761, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.02612031, + "balance_loss_mlp": 1.04084468, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8961465807450149, + "language_loss": 0.63855267, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66013169, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6121, + "time_per_iteration": 2.4573564529418945 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.02755642, + "balance_loss_mlp": 1.0431416, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.8845840511442051, + "language_loss": 0.71209222, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73374552, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6122, + "time_per_iteration": 2.5197854042053223 + }, + { + "auxiliary_loss_clip": 0.01116909, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02091694, + "balance_loss_mlp": 1.04319501, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8566190114316727, + "language_loss": 0.69493115, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71644878, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6123, + "time_per_iteration": 2.5585381984710693 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.04312396, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.995367064863914, + "language_loss": 0.73392212, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.7555719, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6124, + "time_per_iteration": 2.56925368309021 + }, + { + "auxiliary_loss_clip": 0.01121929, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02465916, + "balance_loss_mlp": 1.04337013, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.997016319446362, + "language_loss": 0.74426562, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76589334, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.7890625, + "step": 6125, + "time_per_iteration": 2.493232488632202 + }, + { + "auxiliary_loss_clip": 0.01124729, + "auxiliary_loss_mlp": 0.01046169, + "balance_loss_clip": 1.03009367, + "balance_loss_mlp": 1.04400194, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.8135805598812564, + "language_loss": 0.78254056, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80424947, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6126, + "time_per_iteration": 2.4767327308654785 + }, + { + "auxiliary_loss_clip": 0.01123227, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.02125943, + "balance_loss_mlp": 1.04164457, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.275366104968191, + "language_loss": 0.66100526, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68261528, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.81640625, + "step": 6127, + "time_per_iteration": 2.4442801475524902 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02783585, + "balance_loss_mlp": 1.04527378, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.809419798014635, + "language_loss": 0.70553637, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72722864, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6128, + "time_per_iteration": 2.6163570880889893 + }, + { + "auxiliary_loss_clip": 0.01121361, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.0266788, + "balance_loss_mlp": 1.04374862, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.366686546837111, + "language_loss": 0.75425905, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77588773, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6129, + "time_per_iteration": 2.418318510055542 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.02652466, + "balance_loss_mlp": 1.0419023, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.583632674026135, + "language_loss": 0.84801334, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86962497, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6130, + "time_per_iteration": 2.4933249950408936 + }, + { + "auxiliary_loss_clip": 0.01041681, + "auxiliary_loss_mlp": 0.010081, + "balance_loss_clip": 1.00623989, + "balance_loss_mlp": 1.01602125, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8093683158704721, + "language_loss": 0.60352623, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62402403, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2578125, + "step": 6131, + "time_per_iteration": 3.1686718463897705 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01657534, + "balance_loss_mlp": 1.04083943, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5510352980860918, + "language_loss": 0.72903317, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75052321, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6132, + "time_per_iteration": 2.54154109954834 + }, + { + "auxiliary_loss_clip": 0.01124361, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02221942, + "balance_loss_mlp": 1.04263651, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.665822939326855, + "language_loss": 0.74255228, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76417446, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.81640625, + "step": 6133, + "time_per_iteration": 2.501119375228882 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.02420318, + "balance_loss_mlp": 1.04308438, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.60564703390979, + "language_loss": 0.71415824, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73572183, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6134, + "time_per_iteration": 2.472978353500366 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.0202322, + "balance_loss_mlp": 1.04333591, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.501197032587339, + "language_loss": 0.74985242, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77141684, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.77734375, + "step": 6135, + "time_per_iteration": 2.458523750305176 + }, + { + "auxiliary_loss_clip": 0.01043215, + "auxiliary_loss_mlp": 0.01004045, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.01762199, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8063752733434837, + "language_loss": 0.5878793, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60835183, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.25585938, + "step": 6136, + "time_per_iteration": 2.9917385578155518 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.02095878, + "balance_loss_mlp": 1.04477668, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8816926848284692, + "language_loss": 0.78812146, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80970407, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6137, + "time_per_iteration": 6.900243520736694 + }, + { + "auxiliary_loss_clip": 0.01122666, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.02594304, + "balance_loss_mlp": 1.04392326, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0278297083458345, + "language_loss": 0.74142605, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6138, + "time_per_iteration": 2.5056889057159424 + }, + { + "auxiliary_loss_clip": 0.01127012, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.02365959, + "balance_loss_mlp": 1.04482222, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.957735157830462, + "language_loss": 0.64818108, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66984075, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6139, + "time_per_iteration": 2.5345380306243896 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04279661, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0918485574433734, + "language_loss": 0.71384197, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73543906, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6140, + "time_per_iteration": 2.4318323135375977 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.00999596, + "balance_loss_clip": 0.99771231, + "balance_loss_mlp": 1.01712704, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7479140823872853, + "language_loss": 0.59281325, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61323869, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.2578125, + "step": 6141, + "time_per_iteration": 3.1505937576293945 + }, + { + "auxiliary_loss_clip": 0.01122987, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02499199, + "balance_loss_mlp": 1.04369187, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.768624510630746, + "language_loss": 0.7473368, + "learning_rate": 2.909212678216192e-06, + "loss": 0.76896417, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6142, + "time_per_iteration": 2.4768457412719727 + }, + { + "auxiliary_loss_clip": 0.01119694, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.02291358, + "balance_loss_mlp": 1.04270506, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.5385068391341603, + "language_loss": 0.76985848, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79142308, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6143, + "time_per_iteration": 2.4604313373565674 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02289248, + "balance_loss_mlp": 1.04277074, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4994482416842545, + "language_loss": 0.81616801, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.83771598, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6144, + "time_per_iteration": 2.529298782348633 + }, + { + "auxiliary_loss_clip": 0.0112261, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.02425694, + "balance_loss_mlp": 1.04323006, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 1.9122738225408384, + "language_loss": 0.77019674, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79180729, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.79296875, + "step": 6145, + "time_per_iteration": 2.4642515182495117 + }, + { + "auxiliary_loss_clip": 0.01123051, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.01760387, + "balance_loss_mlp": 1.04384804, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7518336089815172, + "language_loss": 0.76903462, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79058653, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.79296875, + "step": 6146, + "time_per_iteration": 2.49208927154541 + }, + { + "auxiliary_loss_clip": 0.01125412, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02746034, + "balance_loss_mlp": 1.04481673, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7861503855196468, + "language_loss": 0.80794239, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82962638, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6147, + "time_per_iteration": 2.417968988418579 + }, + { + "auxiliary_loss_clip": 0.01120028, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.0238626, + "balance_loss_mlp": 1.04083371, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7356953572419536, + "language_loss": 0.83196342, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85353833, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.79296875, + "step": 6148, + "time_per_iteration": 2.4493086338043213 + }, + { + "auxiliary_loss_clip": 0.01118838, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.02424645, + "balance_loss_mlp": 1.04304922, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.337121678381176, + "language_loss": 0.74373478, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76530743, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6149, + "time_per_iteration": 2.4594686031341553 + }, + { + "auxiliary_loss_clip": 0.01124701, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02210915, + "balance_loss_mlp": 1.04449439, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.7562888589836316, + "language_loss": 0.70538592, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72701365, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6150, + "time_per_iteration": 2.5232975482940674 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04390609, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.6469943204532072, + "language_loss": 0.82023048, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84183264, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6151, + "time_per_iteration": 2.448066473007202 + }, + { + "auxiliary_loss_clip": 0.01036606, + "auxiliary_loss_mlp": 0.01001329, + "balance_loss_clip": 0.99951726, + "balance_loss_mlp": 1.01119328, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.838014312453704, + "language_loss": 0.63083476, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65121406, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 6152, + "time_per_iteration": 3.170707941055298 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.02347398, + "balance_loss_mlp": 1.0429337, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 1.8166659348284784, + "language_loss": 0.70360208, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72515202, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6153, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01123537, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02420688, + "balance_loss_mlp": 1.04319179, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.0600031325492107, + "language_loss": 0.72201782, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74364597, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6154, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0111958, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01624274, + "balance_loss_mlp": 1.04201758, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.8383479148193087, + "language_loss": 0.67877179, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70026708, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6155, + "time_per_iteration": 2.454582929611206 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.01518905, + "balance_loss_mlp": 1.0420723, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.7213710867444976, + "language_loss": 0.67835188, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.6998316, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6156, + "time_per_iteration": 2.456244707107544 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.02461255, + "balance_loss_mlp": 1.04180884, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7871024658649661, + "language_loss": 0.82324016, + "learning_rate": 2.904005448099916e-06, + "loss": 0.8447994, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6157, + "time_per_iteration": 2.467258930206299 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.04224074, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.319348977212497, + "language_loss": 0.76519799, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78679597, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6158, + "time_per_iteration": 2.4462850093841553 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.02276468, + "balance_loss_mlp": 1.04128695, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.3237426114128903, + "language_loss": 0.6888833, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71047246, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 6159, + "time_per_iteration": 2.444615364074707 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.02298164, + "balance_loss_mlp": 1.04054952, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7829911261722147, + "language_loss": 0.7101602, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73170245, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 6160, + "time_per_iteration": 2.4807472229003906 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01808465, + "balance_loss_mlp": 1.04033566, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.5671410195286926, + "language_loss": 0.79049259, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81194532, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6161, + "time_per_iteration": 2.445615768432617 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.02266204, + "balance_loss_mlp": 1.04217172, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.6578530571842398, + "language_loss": 0.7961942, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81776464, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6162, + "time_per_iteration": 2.474179267883301 + }, + { + "auxiliary_loss_clip": 0.01118518, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.02122831, + "balance_loss_mlp": 1.04136944, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.751569507310971, + "language_loss": 0.79592955, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81746811, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6163, + "time_per_iteration": 2.429410696029663 + }, + { + "auxiliary_loss_clip": 0.01121642, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.01815772, + "balance_loss_mlp": 1.04239571, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6995697719291154, + "language_loss": 0.68002689, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70157188, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6164, + "time_per_iteration": 2.4500439167022705 + }, + { + "auxiliary_loss_clip": 0.01125233, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.0206207, + "balance_loss_mlp": 1.04507017, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.4697759057606197, + "language_loss": 0.82807398, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.84968388, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6165, + "time_per_iteration": 2.4863715171813965 + }, + { + "auxiliary_loss_clip": 0.01125688, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02060854, + "balance_loss_mlp": 1.04388845, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.8224972170046692, + "language_loss": 0.69500774, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71663356, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.81640625, + "step": 6166, + "time_per_iteration": 2.560605049133301 + }, + { + "auxiliary_loss_clip": 0.01038031, + "auxiliary_loss_mlp": 0.01006399, + "balance_loss_clip": 1.00471771, + "balance_loss_mlp": 1.01302195, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.8093247029889314, + "language_loss": 0.56892115, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58936548, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6167, + "time_per_iteration": 2.922917127609253 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01992154, + "balance_loss_mlp": 1.04288507, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.945139483069219, + "language_loss": 0.75539452, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77691436, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6168, + "time_per_iteration": 2.4489872455596924 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.02202857, + "balance_loss_mlp": 1.04180634, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.5586684776543853, + "language_loss": 0.7432459, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76480508, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6169, + "time_per_iteration": 2.4537463188171387 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01891184, + "balance_loss_mlp": 1.04480267, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.3706540261028175, + "language_loss": 0.79311681, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81465161, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6170, + "time_per_iteration": 2.4723992347717285 + }, + { + "auxiliary_loss_clip": 0.01122845, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.02390242, + "balance_loss_mlp": 1.04451621, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.6235616399590074, + "language_loss": 0.76385272, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78546989, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6171, + "time_per_iteration": 2.5364768505096436 + }, + { + "auxiliary_loss_clip": 0.01123724, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.01663446, + "balance_loss_mlp": 1.04594254, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9768297571305458, + "language_loss": 0.80696416, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82852054, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6172, + "time_per_iteration": 2.451099395751953 + }, + { + "auxiliary_loss_clip": 0.01124197, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.0219543, + "balance_loss_mlp": 1.04385138, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.2157067962534875, + "language_loss": 0.59447742, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61609542, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 6173, + "time_per_iteration": 2.5750677585601807 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.02122533, + "balance_loss_mlp": 1.04391754, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.9248503394254857, + "language_loss": 0.81157243, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83315188, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6174, + "time_per_iteration": 2.421182155609131 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.02035165, + "balance_loss_mlp": 1.04281855, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.8542839121663495, + "language_loss": 0.79834068, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81985891, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6175, + "time_per_iteration": 2.533447027206421 + }, + { + "auxiliary_loss_clip": 0.01124428, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.03194535, + "balance_loss_mlp": 1.04644537, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.6734071315129293, + "language_loss": 0.88764346, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90935433, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6176, + "time_per_iteration": 2.486224412918091 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02174938, + "balance_loss_mlp": 1.04402244, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5809846817738957, + "language_loss": 0.73293233, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75451624, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6177, + "time_per_iteration": 2.492033004760742 + }, + { + "auxiliary_loss_clip": 0.01119881, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04359818, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8832415058442271, + "language_loss": 0.75425023, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77584344, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6178, + "time_per_iteration": 4.005537748336792 + }, + { + "auxiliary_loss_clip": 0.01123036, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.01896191, + "balance_loss_mlp": 1.04618645, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.761738877644596, + "language_loss": 0.7228415, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74440265, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6179, + "time_per_iteration": 5.333393812179565 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.01987052, + "balance_loss_mlp": 1.04356897, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.1666258639633518, + "language_loss": 0.69705212, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71862751, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6180, + "time_per_iteration": 2.4896974563598633 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.01695561, + "balance_loss_mlp": 1.04157031, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7756296340851163, + "language_loss": 0.77702844, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79851079, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6181, + "time_per_iteration": 2.4324231147766113 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.0202775, + "balance_loss_mlp": 1.04225945, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.8526172549307973, + "language_loss": 0.78767365, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80920726, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6182, + "time_per_iteration": 2.47566819190979 + }, + { + "auxiliary_loss_clip": 0.01036072, + "auxiliary_loss_mlp": 0.01008449, + "balance_loss_clip": 1.00650644, + "balance_loss_mlp": 1.01082778, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7841437663574693, + "language_loss": 0.5748502, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59529543, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25195312, + "step": 6183, + "time_per_iteration": 3.0538721084594727 + }, + { + "auxiliary_loss_clip": 0.01124733, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02772832, + "balance_loss_mlp": 1.04238844, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.1996761862640715, + "language_loss": 0.76940209, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79108441, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.82421875, + "step": 6184, + "time_per_iteration": 2.4653987884521484 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.02268612, + "balance_loss_mlp": 1.04353404, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 13.965274526936179, + "language_loss": 0.72047049, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74203539, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6185, + "time_per_iteration": 2.458340644836426 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.01282895, + "balance_loss_mlp": 1.04169369, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.55661462109525, + "language_loss": 0.7702297, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79167652, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6186, + "time_per_iteration": 2.4665393829345703 + }, + { + "auxiliary_loss_clip": 0.01125099, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.0436089, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8483894715485976, + "language_loss": 0.83475709, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85642433, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8125, + "step": 6187, + "time_per_iteration": 2.520294427871704 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02228546, + "balance_loss_mlp": 1.0421021, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 2.555128723697134, + "language_loss": 0.84544367, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86700106, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6188, + "time_per_iteration": 2.4926793575286865 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01966548, + "balance_loss_mlp": 1.04392672, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6829112555225307, + "language_loss": 0.65646267, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67802715, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7734375, + "step": 6189, + "time_per_iteration": 2.447175979614258 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.02514815, + "balance_loss_mlp": 1.04456878, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.6073714147883162, + "language_loss": 0.83948457, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.8611058, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6190, + "time_per_iteration": 2.4410126209259033 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.01949084, + "balance_loss_mlp": 1.04337156, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.3404623023220643, + "language_loss": 0.88506198, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90665835, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 6191, + "time_per_iteration": 2.452972650527954 + }, + { + "auxiliary_loss_clip": 0.01123549, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.01648057, + "balance_loss_mlp": 1.04218102, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.570395080331924, + "language_loss": 0.74228191, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76384884, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8125, + "step": 6192, + "time_per_iteration": 2.6486353874206543 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02246475, + "balance_loss_mlp": 1.0427109, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.4820365699908944, + "language_loss": 0.79760754, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81916732, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6193, + "time_per_iteration": 2.525973081588745 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.02436423, + "balance_loss_mlp": 1.043504, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7895472081978328, + "language_loss": 0.84495157, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86657262, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6194, + "time_per_iteration": 2.419099807739258 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02080166, + "balance_loss_mlp": 1.04037666, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.9207659578016463, + "language_loss": 0.77555239, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79708451, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 6195, + "time_per_iteration": 2.3995044231414795 + }, + { + "auxiliary_loss_clip": 0.01119279, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01814234, + "balance_loss_mlp": 1.0428412, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.677102671463593, + "language_loss": 0.79111922, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81263697, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 6196, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.02235723, + "balance_loss_mlp": 1.04315817, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.8393036550873767, + "language_loss": 0.8332746, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85483867, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6197, + "time_per_iteration": 2.392005443572998 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0202899, + "balance_loss_mlp": 1.04070568, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 2.267147370646453, + "language_loss": 0.64613056, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66764355, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 6198, + "time_per_iteration": 2.4624876976013184 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.02989507, + "balance_loss_mlp": 1.04129016, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 2.4815957641530084, + "language_loss": 0.7439245, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76552987, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6199, + "time_per_iteration": 2.454932689666748 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.04112601, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.569210214205425, + "language_loss": 0.80711329, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82861221, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 6200, + "time_per_iteration": 2.853854179382324 + }, + { + "auxiliary_loss_clip": 0.01118801, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.02588272, + "balance_loss_mlp": 1.04248428, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 2.046105641958108, + "language_loss": 0.60723466, + "learning_rate": 2.88868657651991e-06, + "loss": 0.6288271, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6201, + "time_per_iteration": 2.58642315864563 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01813745, + "balance_loss_mlp": 1.04334736, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.5967185311646992, + "language_loss": 0.72980845, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75135767, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6202, + "time_per_iteration": 2.461116075515747 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.04372942, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.8761852736669793, + "language_loss": 0.739654, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76120287, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6203, + "time_per_iteration": 2.4199976921081543 + }, + { + "auxiliary_loss_clip": 0.01113815, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.03933048, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6894031212763305, + "language_loss": 0.81359541, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83506644, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 6204, + "time_per_iteration": 2.527442693710327 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.02538753, + "balance_loss_mlp": 1.04287875, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.5818895271767701, + "language_loss": 0.75028086, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77190769, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6205, + "time_per_iteration": 2.515028953552246 + }, + { + "auxiliary_loss_clip": 0.01118084, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02002704, + "balance_loss_mlp": 1.04183412, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8699710225203796, + "language_loss": 0.78044879, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80197906, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.76171875, + "step": 6206, + "time_per_iteration": 2.433136224746704 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.04182768, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.1358392378140487, + "language_loss": 0.93595111, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95747221, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6207, + "time_per_iteration": 2.422592878341675 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01774943, + "balance_loss_mlp": 1.04154027, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.238385364236049, + "language_loss": 0.82666922, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84819084, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6208, + "time_per_iteration": 2.5171287059783936 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01911068, + "balance_loss_mlp": 1.04320371, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 1.7601988102738153, + "language_loss": 0.73197794, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75355148, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6209, + "time_per_iteration": 2.480943202972412 + }, + { + "auxiliary_loss_clip": 0.01120081, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02178252, + "balance_loss_mlp": 1.0430553, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.4781766070975684, + "language_loss": 0.69951272, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72108591, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6210, + "time_per_iteration": 2.5063016414642334 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01533842, + "balance_loss_mlp": 1.04171228, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.960293983782413, + "language_loss": 0.77729124, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79881245, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6211, + "time_per_iteration": 2.4845266342163086 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.0240593, + "balance_loss_mlp": 1.04219186, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.9911666037414828, + "language_loss": 0.73026669, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75187218, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6212, + "time_per_iteration": 2.615323066711426 + }, + { + "auxiliary_loss_clip": 0.01130473, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.03231955, + "balance_loss_mlp": 1.04560018, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 4.00760557025762, + "language_loss": 0.81895888, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84074175, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84765625, + "step": 6213, + "time_per_iteration": 2.4621500968933105 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.04143643, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.2631910468903014, + "language_loss": 0.7890203, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81060612, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6214, + "time_per_iteration": 2.5582997798919678 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.02859902, + "balance_loss_mlp": 1.04069364, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.7789401165216012, + "language_loss": 0.84881294, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87041962, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6215, + "time_per_iteration": 2.6216535568237305 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.02595592, + "balance_loss_mlp": 1.04088581, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 5.614431195109344, + "language_loss": 0.67669535, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69832802, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80859375, + "step": 6216, + "time_per_iteration": 2.4592814445495605 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.02500176, + "balance_loss_mlp": 1.04252148, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.041107256757408, + "language_loss": 0.65695626, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67857617, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6217, + "time_per_iteration": 2.50801420211792 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.02378845, + "balance_loss_mlp": 1.04290843, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 3.2488334570714725, + "language_loss": 0.80776107, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82938731, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80078125, + "step": 6218, + "time_per_iteration": 2.469524383544922 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.01851249, + "balance_loss_mlp": 1.04241216, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.3682227753048604, + "language_loss": 0.78710622, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80860579, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.74609375, + "step": 6219, + "time_per_iteration": 2.595862627029419 + }, + { + "auxiliary_loss_clip": 0.01119648, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.02776265, + "balance_loss_mlp": 1.0430454, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 2.1916352692915217, + "language_loss": 0.76985866, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79148126, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6220, + "time_per_iteration": 6.68864631652832 + }, + { + "auxiliary_loss_clip": 0.01120187, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02635062, + "balance_loss_mlp": 1.04149485, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.921342744454882, + "language_loss": 0.82958305, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85120487, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6221, + "time_per_iteration": 3.9474618434906006 + }, + { + "auxiliary_loss_clip": 0.0111979, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.02852452, + "balance_loss_mlp": 1.04195023, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6461952088047174, + "language_loss": 0.75817096, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.7797966, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6222, + "time_per_iteration": 2.43192720413208 + }, + { + "auxiliary_loss_clip": 0.01121141, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.0191592, + "balance_loss_mlp": 1.04333961, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.6728060456550218, + "language_loss": 0.70215583, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72370636, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.77734375, + "step": 6223, + "time_per_iteration": 2.4719529151916504 + }, + { + "auxiliary_loss_clip": 0.01123096, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.04556298, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.209456781749309, + "language_loss": 0.69100869, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71258163, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6224, + "time_per_iteration": 2.6382336616516113 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.01942348, + "balance_loss_mlp": 1.04488885, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.8205395187863704, + "language_loss": 0.69828689, + "learning_rate": 2.880303258086228e-06, + "loss": 0.71983123, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6225, + "time_per_iteration": 2.501041889190674 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.02376127, + "balance_loss_mlp": 1.04357982, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.305559014636685, + "language_loss": 0.79056358, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81214118, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 6226, + "time_per_iteration": 2.485196113586426 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.02517128, + "balance_loss_mlp": 1.04342556, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.1155280603994546, + "language_loss": 0.68059194, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70221007, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6227, + "time_per_iteration": 2.553396463394165 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02044404, + "balance_loss_mlp": 1.04391932, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.719573737271176, + "language_loss": 0.82955533, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85109973, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6228, + "time_per_iteration": 2.449979305267334 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.0244565, + "balance_loss_mlp": 1.0452075, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.610770216359874, + "language_loss": 0.74802738, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76962447, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6229, + "time_per_iteration": 2.4768621921539307 + }, + { + "auxiliary_loss_clip": 0.01121137, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.02384853, + "balance_loss_mlp": 1.04209936, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.8233250091751425, + "language_loss": 0.83350682, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85510933, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6230, + "time_per_iteration": 2.4503889083862305 + }, + { + "auxiliary_loss_clip": 0.01125186, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.02383518, + "balance_loss_mlp": 1.04665947, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8327028169227884, + "language_loss": 0.73589134, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75753438, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6231, + "time_per_iteration": 2.5793888568878174 + }, + { + "auxiliary_loss_clip": 0.01126351, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.02754807, + "balance_loss_mlp": 1.04669595, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.0748427868287536, + "language_loss": 0.72982037, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75151008, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6232, + "time_per_iteration": 2.5400028228759766 + }, + { + "auxiliary_loss_clip": 0.01120736, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.01927304, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.7557793199484253, + "language_loss": 0.77042818, + "learning_rate": 2.877504536769561e-06, + "loss": 0.791982, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6233, + "time_per_iteration": 2.6110641956329346 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.02521205, + "balance_loss_mlp": 1.04520559, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.733253645903673, + "language_loss": 0.68936831, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71100628, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6234, + "time_per_iteration": 2.4476797580718994 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.02685833, + "balance_loss_mlp": 1.04514599, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.8436539021155727, + "language_loss": 0.82329285, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84491062, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 6235, + "time_per_iteration": 2.4766016006469727 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.01823175, + "balance_loss_mlp": 1.04744995, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8082481713782126, + "language_loss": 0.77776909, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79937214, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6236, + "time_per_iteration": 2.440678596496582 + }, + { + "auxiliary_loss_clip": 0.01124108, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.03139293, + "balance_loss_mlp": 1.04308259, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0063576687211704, + "language_loss": 0.73203218, + "learning_rate": 2.876104377085234e-06, + "loss": 0.7537601, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.80859375, + "step": 6237, + "time_per_iteration": 2.5782086849212646 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02257562, + "balance_loss_mlp": 1.04084682, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.2861902523152935, + "language_loss": 0.93017888, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.9517675, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6238, + "time_per_iteration": 2.514997720718384 + }, + { + "auxiliary_loss_clip": 0.01121834, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.01821709, + "balance_loss_mlp": 1.04316592, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9811721217026943, + "language_loss": 0.71066076, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73221493, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6239, + "time_per_iteration": 2.5054962635040283 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01659262, + "balance_loss_mlp": 1.04635918, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.6550300124553972, + "language_loss": 0.6566934, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67827761, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6240, + "time_per_iteration": 2.5776519775390625 + }, + { + "auxiliary_loss_clip": 0.01124905, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01703799, + "balance_loss_mlp": 1.04560649, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.0148493018475877, + "language_loss": 0.75634778, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77791047, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 6241, + "time_per_iteration": 2.503861904144287 + }, + { + "auxiliary_loss_clip": 0.01123464, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.04321361, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.5579725641576876, + "language_loss": 0.83610159, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85773861, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.80078125, + "step": 6242, + "time_per_iteration": 2.4933042526245117 + }, + { + "auxiliary_loss_clip": 0.01122935, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.02435803, + "balance_loss_mlp": 1.04265308, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.190530656574709, + "language_loss": 0.67888391, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70049673, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6243, + "time_per_iteration": 2.543820381164551 + }, + { + "auxiliary_loss_clip": 0.01121963, + "auxiliary_loss_mlp": 0.01038078, + "balance_loss_clip": 1.02241397, + "balance_loss_mlp": 1.04404676, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7974063962239055, + "language_loss": 0.84275806, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.86435848, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6244, + "time_per_iteration": 2.4710450172424316 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02229667, + "balance_loss_mlp": 1.0436101, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.387588700969948, + "language_loss": 0.83019805, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85175467, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6245, + "time_per_iteration": 2.4594197273254395 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.02207565, + "balance_loss_mlp": 1.04337263, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 1.94802763897559, + "language_loss": 0.64043313, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66203153, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6246, + "time_per_iteration": 2.4522809982299805 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0231837, + "balance_loss_mlp": 1.04382014, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7195896287931138, + "language_loss": 0.75146973, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77310807, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6247, + "time_per_iteration": 2.4527103900909424 + }, + { + "auxiliary_loss_clip": 0.01122539, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02281308, + "balance_loss_mlp": 1.04276609, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.472354315090956, + "language_loss": 0.55157161, + "learning_rate": 2.872251199697598e-06, + "loss": 0.5731746, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6248, + "time_per_iteration": 2.4399521350860596 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.02334976, + "balance_loss_mlp": 1.04241502, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.875026035710993, + "language_loss": 0.84247208, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86404997, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6249, + "time_per_iteration": 2.529763698577881 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.018713, + "balance_loss_mlp": 1.0427655, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.7253468577749267, + "language_loss": 0.68124413, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70278323, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6250, + "time_per_iteration": 2.572439193725586 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.02791047, + "balance_loss_mlp": 1.04538727, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.0419035804756716, + "language_loss": 0.77633286, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79799771, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6251, + "time_per_iteration": 2.58437442779541 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.01910138, + "balance_loss_mlp": 1.04232824, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.137051103462404, + "language_loss": 0.58463252, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60616934, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6252, + "time_per_iteration": 2.6117262840270996 + }, + { + "auxiliary_loss_clip": 0.01124494, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.02730918, + "balance_loss_mlp": 1.04393482, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.9959533965383836, + "language_loss": 0.89689183, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91856694, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 6253, + "time_per_iteration": 2.5241925716400146 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.0227623, + "balance_loss_mlp": 1.04618073, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9568868773694639, + "language_loss": 0.76368916, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78528988, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6254, + "time_per_iteration": 2.44631028175354 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.02906847, + "balance_loss_mlp": 1.04640615, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.019237604940679, + "language_loss": 0.61830014, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6400153, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6255, + "time_per_iteration": 2.474303960800171 + }, + { + "auxiliary_loss_clip": 0.01125813, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.02204537, + "balance_loss_mlp": 1.0434109, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.4357923747979675, + "language_loss": 0.74234015, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76397753, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.82421875, + "step": 6256, + "time_per_iteration": 2.4332830905914307 + }, + { + "auxiliary_loss_clip": 0.01129168, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.03029239, + "balance_loss_mlp": 1.04842019, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.807318668329893, + "language_loss": 0.70297635, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72472662, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80859375, + "step": 6257, + "time_per_iteration": 2.600249767303467 + }, + { + "auxiliary_loss_clip": 0.01123849, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01998889, + "balance_loss_mlp": 1.04582894, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.8628634379537026, + "language_loss": 0.84647095, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86805254, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6258, + "time_per_iteration": 2.443833351135254 + }, + { + "auxiliary_loss_clip": 0.01122949, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.02936888, + "balance_loss_mlp": 1.04430962, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.514941849696829, + "language_loss": 0.81009686, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83176237, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6259, + "time_per_iteration": 2.5727832317352295 + }, + { + "auxiliary_loss_clip": 0.01130377, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02660608, + "balance_loss_mlp": 1.04775453, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.8915772167347047, + "language_loss": 0.71919596, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.74092221, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 6260, + "time_per_iteration": 2.5225539207458496 + }, + { + "auxiliary_loss_clip": 0.0112693, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.02061951, + "balance_loss_mlp": 1.04538989, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.725193491542272, + "language_loss": 0.78423822, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80586827, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 6261, + "time_per_iteration": 2.4926671981811523 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01043226, + "balance_loss_clip": 1.02784848, + "balance_loss_mlp": 1.04861188, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.7544905551461754, + "language_loss": 0.80327791, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82503211, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 6262, + "time_per_iteration": 6.861605167388916 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.01796031, + "balance_loss_mlp": 1.04471791, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.7128267856657793, + "language_loss": 0.80543715, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82698023, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6263, + "time_per_iteration": 2.6574654579162598 + }, + { + "auxiliary_loss_clip": 0.01128017, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.02705324, + "balance_loss_mlp": 1.04757583, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.7435231382382033, + "language_loss": 0.80158919, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82328904, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6264, + "time_per_iteration": 2.4326720237731934 + }, + { + "auxiliary_loss_clip": 0.01122852, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03120613, + "balance_loss_mlp": 1.04323912, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.2579254623504585, + "language_loss": 0.73604524, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75773823, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6265, + "time_per_iteration": 2.481248617172241 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.02525079, + "balance_loss_mlp": 1.04878664, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.6798839148056366, + "language_loss": 0.68685853, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70850861, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6266, + "time_per_iteration": 2.517972946166992 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.02746832, + "balance_loss_mlp": 1.04570127, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 21.71943634627446, + "language_loss": 0.6330213, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65474188, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 6267, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.01049589, + "auxiliary_loss_mlp": 0.01002617, + "balance_loss_clip": 1.00076914, + "balance_loss_mlp": 1.02342653, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7181832227527338, + "language_loss": 0.58946306, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60998511, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.26171875, + "step": 6268, + "time_per_iteration": 3.168419361114502 + }, + { + "auxiliary_loss_clip": 0.011283, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.02268982, + "balance_loss_mlp": 1.04734302, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4797604992869704, + "language_loss": 0.65026355, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67193449, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8125, + "step": 6269, + "time_per_iteration": 2.5472333431243896 + }, + { + "auxiliary_loss_clip": 0.01127949, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.02263296, + "balance_loss_mlp": 1.05022144, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.46875421159053, + "language_loss": 0.70592397, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72758961, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6270, + "time_per_iteration": 2.4763948917388916 + }, + { + "auxiliary_loss_clip": 0.01045864, + "auxiliary_loss_mlp": 0.0100198, + "balance_loss_clip": 1.00021577, + "balance_loss_mlp": 1.02014744, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7024360778923162, + "language_loss": 0.56136239, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58184087, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 6271, + "time_per_iteration": 3.0738816261291504 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.02326441, + "balance_loss_mlp": 1.04638743, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.066611127756055, + "language_loss": 0.79340166, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.81503969, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.78125, + "step": 6272, + "time_per_iteration": 2.4686055183410645 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01802933, + "balance_loss_mlp": 1.04578209, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.4641670728096365, + "language_loss": 0.74172843, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76326972, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6273, + "time_per_iteration": 2.5079009532928467 + }, + { + "auxiliary_loss_clip": 0.01124789, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.03112721, + "balance_loss_mlp": 1.04621577, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4163029825487425, + "language_loss": 0.71801323, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73972082, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6274, + "time_per_iteration": 2.460338592529297 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.02277732, + "balance_loss_mlp": 1.04794264, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.663376044288712, + "language_loss": 0.83692443, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.85857534, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6275, + "time_per_iteration": 2.48319149017334 + }, + { + "auxiliary_loss_clip": 0.01121629, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.01949656, + "balance_loss_mlp": 1.04532933, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4340123311349162, + "language_loss": 0.75342453, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77496612, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6276, + "time_per_iteration": 2.5773236751556396 + }, + { + "auxiliary_loss_clip": 0.01127758, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.02724338, + "balance_loss_mlp": 1.04667568, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.858122502551201, + "language_loss": 0.85519129, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87689614, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6277, + "time_per_iteration": 2.5827369689941406 + }, + { + "auxiliary_loss_clip": 0.01123645, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.01833546, + "balance_loss_mlp": 1.04713118, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.807350675061797, + "language_loss": 0.78055024, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80210936, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6278, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01128448, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_clip": 1.02795196, + "balance_loss_mlp": 1.04698181, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.933979010172509, + "language_loss": 0.82702643, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.84875309, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6279, + "time_per_iteration": 2.538426160812378 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.02310467, + "balance_loss_mlp": 1.04578614, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.0225623598483358, + "language_loss": 0.74985826, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77148765, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 6280, + "time_per_iteration": 2.5161032676696777 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02283478, + "balance_loss_mlp": 1.04662085, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4438938373085308, + "language_loss": 0.76017272, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78177071, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6281, + "time_per_iteration": 2.504711151123047 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.02049732, + "balance_loss_mlp": 1.04368496, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.7476205657776698, + "language_loss": 0.8391279, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86070192, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6282, + "time_per_iteration": 2.4668593406677246 + }, + { + "auxiliary_loss_clip": 0.01120742, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.02337587, + "balance_loss_mlp": 1.04434681, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.8037618077250128, + "language_loss": 0.70150751, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72309422, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6283, + "time_per_iteration": 2.481948137283325 + }, + { + "auxiliary_loss_clip": 0.0112321, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.04516089, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.804590454145544, + "language_loss": 0.76529062, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78697532, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6284, + "time_per_iteration": 2.462968349456787 + }, + { + "auxiliary_loss_clip": 0.01130082, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.0466392, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.0529722445272167, + "language_loss": 0.85851312, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88015962, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 6285, + "time_per_iteration": 2.4435150623321533 + }, + { + "auxiliary_loss_clip": 0.01125611, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.0240438, + "balance_loss_mlp": 1.04457164, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.9682053367320125, + "language_loss": 0.83967972, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86133921, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6286, + "time_per_iteration": 2.4270951747894287 + }, + { + "auxiliary_loss_clip": 0.01123272, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.0268203, + "balance_loss_mlp": 1.04474115, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.20319687907872, + "language_loss": 0.81550682, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83715904, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6287, + "time_per_iteration": 2.4504740238189697 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.02559495, + "balance_loss_mlp": 1.04340911, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.428511311582982, + "language_loss": 0.73038173, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75200516, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6288, + "time_per_iteration": 2.4988601207733154 + }, + { + "auxiliary_loss_clip": 0.01126071, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.02103162, + "balance_loss_mlp": 1.04705048, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.726028925404572, + "language_loss": 0.75453335, + "learning_rate": 2.857854239668352e-06, + "loss": 0.7761566, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6289, + "time_per_iteration": 2.5323870182037354 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.04395676, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.9121243331279245, + "language_loss": 0.7341041, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75570655, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6290, + "time_per_iteration": 2.4703667163848877 + }, + { + "auxiliary_loss_clip": 0.01126076, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02388752, + "balance_loss_mlp": 1.0441103, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.4130424762969502, + "language_loss": 0.79729307, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81895649, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8203125, + "step": 6291, + "time_per_iteration": 2.590517520904541 + }, + { + "auxiliary_loss_clip": 0.01124797, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.02038157, + "balance_loss_mlp": 1.04347014, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7851511943573266, + "language_loss": 0.76090503, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78251249, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8125, + "step": 6292, + "time_per_iteration": 2.486375570297241 + }, + { + "auxiliary_loss_clip": 0.0112214, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_clip": 1.02708387, + "balance_loss_mlp": 1.04380596, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.8744506208430416, + "language_loss": 0.69510674, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71675801, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6293, + "time_per_iteration": 2.477025032043457 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.02140629, + "balance_loss_mlp": 1.04180205, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.812028848861632, + "language_loss": 0.71631789, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73788714, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6294, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.01128463, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.02050054, + "balance_loss_mlp": 1.04522586, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0852903309957815, + "language_loss": 0.8254326, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84707516, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 6295, + "time_per_iteration": 2.4684417247772217 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.01870751, + "balance_loss_mlp": 1.04352689, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.687128097470698, + "language_loss": 0.71806532, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73963046, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6296, + "time_per_iteration": 2.515676975250244 + }, + { + "auxiliary_loss_clip": 0.01119269, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02523708, + "balance_loss_mlp": 1.04370534, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.741193546240543, + "language_loss": 0.77094543, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79253769, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6297, + "time_per_iteration": 2.4617502689361572 + }, + { + "auxiliary_loss_clip": 0.01123428, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.02042699, + "balance_loss_mlp": 1.04360187, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.034703790395703, + "language_loss": 0.79179847, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81338429, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6298, + "time_per_iteration": 2.4516994953155518 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02001977, + "balance_loss_mlp": 1.04453242, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 2.0947541210526466, + "language_loss": 0.84758198, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86914611, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6299, + "time_per_iteration": 2.4814558029174805 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02153504, + "balance_loss_mlp": 1.04462421, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.218392777517032, + "language_loss": 0.7657811, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78737932, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 6300, + "time_per_iteration": 2.4615044593811035 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.02135265, + "balance_loss_mlp": 1.04486537, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.28104869272164, + "language_loss": 0.82490808, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84657955, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.828125, + "step": 6301, + "time_per_iteration": 2.4864752292633057 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.02967012, + "balance_loss_mlp": 1.04097867, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8461206090891127, + "language_loss": 0.67669666, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69833434, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6302, + "time_per_iteration": 2.501873016357422 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02617788, + "balance_loss_mlp": 1.04561174, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9271400579859064, + "language_loss": 0.68487787, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.7064997, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6303, + "time_per_iteration": 4.003960371017456 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02055335, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.8915662489351535, + "language_loss": 0.77611423, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79765135, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6304, + "time_per_iteration": 5.393261432647705 + }, + { + "auxiliary_loss_clip": 0.01127431, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.04611588, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.1278904960845724, + "language_loss": 0.80447114, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82612252, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6305, + "time_per_iteration": 2.471761703491211 + }, + { + "auxiliary_loss_clip": 0.01041012, + "auxiliary_loss_mlp": 0.0101182, + "balance_loss_clip": 1.01011562, + "balance_loss_mlp": 1.01491702, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9794242329238577, + "language_loss": 0.64524716, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66577548, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.26171875, + "step": 6306, + "time_per_iteration": 2.9702882766723633 + }, + { + "auxiliary_loss_clip": 0.01126961, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.03371215, + "balance_loss_mlp": 1.04693508, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.6253037153644523, + "language_loss": 0.73722827, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75898677, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6307, + "time_per_iteration": 2.508127450942993 + }, + { + "auxiliary_loss_clip": 0.01124488, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.02550268, + "balance_loss_mlp": 1.04390907, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.494726737463818, + "language_loss": 0.78469551, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80634576, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6308, + "time_per_iteration": 2.453012466430664 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.04146767, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.8302348181917263, + "language_loss": 0.73083341, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75244319, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6309, + "time_per_iteration": 2.495020866394043 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.02509165, + "balance_loss_mlp": 1.04503894, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4661467923449947, + "language_loss": 0.78449893, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80611867, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6310, + "time_per_iteration": 2.466533899307251 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.02562881, + "balance_loss_mlp": 1.04319441, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.894743489836823, + "language_loss": 0.76103079, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7826463, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6311, + "time_per_iteration": 2.4859142303466797 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.01906657, + "balance_loss_mlp": 1.04379332, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4829862533126659, + "language_loss": 0.71025705, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73180288, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6312, + "time_per_iteration": 2.4632480144500732 + }, + { + "auxiliary_loss_clip": 0.01041554, + "auxiliary_loss_mlp": 0.01005886, + "balance_loss_clip": 1.00425243, + "balance_loss_mlp": 1.01538157, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7762054489660294, + "language_loss": 0.56084001, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58131444, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 6313, + "time_per_iteration": 3.0646302700042725 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.02876949, + "balance_loss_mlp": 1.04362202, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 4.480184070608776, + "language_loss": 0.7158128, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73747301, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6314, + "time_per_iteration": 2.5263309478759766 + }, + { + "auxiliary_loss_clip": 0.01126357, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02331841, + "balance_loss_mlp": 1.04427075, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.7655759267809688, + "language_loss": 0.73132306, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75297308, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6315, + "time_per_iteration": 2.4753336906433105 + }, + { + "auxiliary_loss_clip": 0.0111862, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.02454782, + "balance_loss_mlp": 1.04206967, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0286726324195477, + "language_loss": 0.71049547, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73207021, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6316, + "time_per_iteration": 2.636176824569702 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.04524136, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8086467732489355, + "language_loss": 0.65270519, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67431247, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6317, + "time_per_iteration": 2.595952033996582 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.02174878, + "balance_loss_mlp": 1.04161143, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0501625369641867, + "language_loss": 0.85361171, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87515211, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6318, + "time_per_iteration": 2.4805264472961426 + }, + { + "auxiliary_loss_clip": 0.01124758, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04483223, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.489676718863087, + "language_loss": 0.76274204, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.784392, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6319, + "time_per_iteration": 2.4780025482177734 + }, + { + "auxiliary_loss_clip": 0.01123743, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02961218, + "balance_loss_mlp": 1.04587555, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6998661229427972, + "language_loss": 0.63923568, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66091597, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6320, + "time_per_iteration": 2.4700872898101807 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02375042, + "balance_loss_mlp": 1.04365289, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.883216130529445, + "language_loss": 0.7112022, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73279351, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6321, + "time_per_iteration": 2.5686967372894287 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02372098, + "balance_loss_mlp": 1.04298186, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.720302384597662, + "language_loss": 0.74730933, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76892447, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6322, + "time_per_iteration": 2.5368685722351074 + }, + { + "auxiliary_loss_clip": 0.01121658, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.02155948, + "balance_loss_mlp": 1.04405749, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.6715016816856787, + "language_loss": 0.84910119, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87068772, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 6323, + "time_per_iteration": 2.483771562576294 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.04395103, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.3955157937634586, + "language_loss": 0.73466647, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75625694, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.79296875, + "step": 6324, + "time_per_iteration": 2.4709885120391846 + }, + { + "auxiliary_loss_clip": 0.0112564, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02020979, + "balance_loss_mlp": 1.045573, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.6580896914625747, + "language_loss": 0.84147018, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86308414, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6325, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.01122273, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01833439, + "balance_loss_mlp": 1.04476464, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.7291759572194114, + "language_loss": 0.79642469, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81796801, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6326, + "time_per_iteration": 2.4206631183624268 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.04261708, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.8040593924859922, + "language_loss": 0.72696453, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74854851, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6327, + "time_per_iteration": 2.5964794158935547 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.02165246, + "balance_loss_mlp": 1.04614949, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6287717027141382, + "language_loss": 0.83090091, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85249579, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6328, + "time_per_iteration": 2.4602181911468506 + }, + { + "auxiliary_loss_clip": 0.01120102, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.01746464, + "balance_loss_mlp": 1.04347932, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.31755328246291, + "language_loss": 0.61384171, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63536435, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6329, + "time_per_iteration": 2.5268959999084473 + }, + { + "auxiliary_loss_clip": 0.01124125, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04603863, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.7232754549878644, + "language_loss": 0.5586049, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58026338, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6330, + "time_per_iteration": 2.450221061706543 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.02255917, + "balance_loss_mlp": 1.04540074, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.7778053530951745, + "language_loss": 0.65694439, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67849582, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6331, + "time_per_iteration": 2.544187545776367 + }, + { + "auxiliary_loss_clip": 0.01126283, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.02635133, + "balance_loss_mlp": 1.04744291, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.725296368277029, + "language_loss": 0.75737906, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77905744, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6332, + "time_per_iteration": 2.443654775619507 + }, + { + "auxiliary_loss_clip": 0.01123212, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.02546334, + "balance_loss_mlp": 1.04323936, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.2212054448627425, + "language_loss": 0.81889552, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84053469, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6333, + "time_per_iteration": 2.467007637023926 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.0190227, + "balance_loss_mlp": 1.04437923, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.583221243495577, + "language_loss": 0.86192155, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88346696, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6334, + "time_per_iteration": 2.521341323852539 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.02130485, + "balance_loss_mlp": 1.04498506, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.2115670432842847, + "language_loss": 0.79179001, + "learning_rate": 2.841636505323321e-06, + "loss": 0.8133806, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6335, + "time_per_iteration": 2.4648449420928955 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02027273, + "balance_loss_mlp": 1.04485524, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.872233235491229, + "language_loss": 0.72775364, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74935251, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6336, + "time_per_iteration": 2.443255662918091 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.0180763, + "balance_loss_mlp": 1.0430727, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.9910419737037044, + "language_loss": 0.69146657, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71297657, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6337, + "time_per_iteration": 2.4838876724243164 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02016187, + "balance_loss_mlp": 1.04606009, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9033617326941272, + "language_loss": 0.63247615, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65407151, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6338, + "time_per_iteration": 2.5538294315338135 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.02492189, + "balance_loss_mlp": 1.04498446, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8718033662194862, + "language_loss": 0.69288802, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71452975, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7890625, + "step": 6339, + "time_per_iteration": 2.490813970565796 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.0256902, + "balance_loss_mlp": 1.0461787, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.5980221539464914, + "language_loss": 0.68312418, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70477575, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6340, + "time_per_iteration": 2.4576282501220703 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02026618, + "balance_loss_mlp": 1.04393721, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.141170258916756, + "language_loss": 0.89404309, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91565144, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80859375, + "step": 6341, + "time_per_iteration": 2.4688920974731445 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.02477455, + "balance_loss_mlp": 1.04559851, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.5516456894508346, + "language_loss": 0.74665564, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76832652, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6342, + "time_per_iteration": 2.4610931873321533 + }, + { + "auxiliary_loss_clip": 0.0112203, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.04325199, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.6121504127073445, + "language_loss": 0.83334327, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85490513, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6343, + "time_per_iteration": 2.490952730178833 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.04305577, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.5912858717665679, + "language_loss": 0.76965082, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79125255, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6344, + "time_per_iteration": 2.458669424057007 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.02413464, + "balance_loss_mlp": 1.04601693, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.369132092535199, + "language_loss": 0.72790027, + "learning_rate": 2.838101929752593e-06, + "loss": 0.7495544, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6345, + "time_per_iteration": 5.361874341964722 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.02172494, + "balance_loss_mlp": 1.04348969, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.723509048793367, + "language_loss": 0.69687438, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71844268, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6346, + "time_per_iteration": 3.8780832290649414 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02351856, + "balance_loss_mlp": 1.04639161, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8691929226070287, + "language_loss": 0.75860906, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78024441, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6347, + "time_per_iteration": 2.4724838733673096 + }, + { + "auxiliary_loss_clip": 0.01121549, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.04272556, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.5494744961647557, + "language_loss": 0.74775678, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76933861, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6348, + "time_per_iteration": 2.4360201358795166 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.04346061, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.012782025185047, + "language_loss": 0.86987114, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89142847, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6349, + "time_per_iteration": 2.4653983116149902 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.02576792, + "balance_loss_mlp": 1.04300261, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 3.1419886249283624, + "language_loss": 0.76335979, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78497744, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6350, + "time_per_iteration": 2.4111151695251465 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01693547, + "balance_loss_mlp": 1.04389453, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.0441694615934325, + "language_loss": 0.76182568, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78337657, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.78125, + "step": 6351, + "time_per_iteration": 2.449831485748291 + }, + { + "auxiliary_loss_clip": 0.0112512, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.0235939, + "balance_loss_mlp": 1.04464602, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6974231581634962, + "language_loss": 0.74360836, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76525676, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6352, + "time_per_iteration": 2.5342295169830322 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02103508, + "balance_loss_mlp": 1.04153097, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.834359776939538, + "language_loss": 0.64362574, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66514015, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6353, + "time_per_iteration": 2.434100866317749 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02020061, + "balance_loss_mlp": 1.04363215, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.6268216674771125, + "language_loss": 0.83035302, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85189331, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6354, + "time_per_iteration": 2.4903476238250732 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02154744, + "balance_loss_mlp": 1.04571426, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.7360324347242302, + "language_loss": 0.8071996, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82876635, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6355, + "time_per_iteration": 2.5086817741394043 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.04464841, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.7080815693685156, + "language_loss": 0.75032043, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77187097, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6356, + "time_per_iteration": 2.471919298171997 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.02251887, + "balance_loss_mlp": 1.04420352, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8091380313160346, + "language_loss": 0.81251574, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83409309, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6357, + "time_per_iteration": 2.5302257537841797 + }, + { + "auxiliary_loss_clip": 0.01127375, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02465415, + "balance_loss_mlp": 1.04773057, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 3.08273691075534, + "language_loss": 0.77903318, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80071545, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.796875, + "step": 6358, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02576303, + "balance_loss_mlp": 1.0432725, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.070211767582473, + "language_loss": 0.78700459, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80863374, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6359, + "time_per_iteration": 2.4555094242095947 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02698255, + "balance_loss_mlp": 1.04290545, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.6399902686671113, + "language_loss": 0.69392359, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.7155236, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6360, + "time_per_iteration": 2.736069440841675 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.01632452, + "balance_loss_mlp": 1.04197633, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.9168722583294633, + "language_loss": 0.78836095, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80986238, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6361, + "time_per_iteration": 2.511254072189331 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.02274048, + "balance_loss_mlp": 1.04114652, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4566170801765106, + "language_loss": 0.65315771, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67468172, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6362, + "time_per_iteration": 2.632784128189087 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.04175615, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.8527291741217293, + "language_loss": 0.82063204, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84214544, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 6363, + "time_per_iteration": 2.4478373527526855 + }, + { + "auxiliary_loss_clip": 0.01119064, + "auxiliary_loss_mlp": 0.01042512, + "balance_loss_clip": 1.02837944, + "balance_loss_mlp": 1.0446111, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.811422380776527, + "language_loss": 0.58428323, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60589898, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6364, + "time_per_iteration": 2.655128002166748 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.02496374, + "balance_loss_mlp": 1.04423463, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.1451175401130893, + "language_loss": 0.68881112, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71043533, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6365, + "time_per_iteration": 2.51526141166687 + }, + { + "auxiliary_loss_clip": 0.01121408, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02065301, + "balance_loss_mlp": 1.04057527, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 4.555943608034253, + "language_loss": 0.73442698, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75600111, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8046875, + "step": 6366, + "time_per_iteration": 2.448585033416748 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02026677, + "balance_loss_mlp": 1.04226327, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.0234001922769327, + "language_loss": 0.68829554, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70985115, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6367, + "time_per_iteration": 2.569301128387451 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.04202485, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 4.344593393004367, + "language_loss": 0.6481666, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66967463, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 6368, + "time_per_iteration": 2.4531960487365723 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02053833, + "balance_loss_mlp": 1.04277039, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.315785818077373, + "language_loss": 0.68389189, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70544434, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6369, + "time_per_iteration": 2.5403318405151367 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.02774167, + "balance_loss_mlp": 1.04172897, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7184057003296296, + "language_loss": 0.78214431, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80374157, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 6370, + "time_per_iteration": 2.4397096633911133 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02569818, + "balance_loss_mlp": 1.04368424, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.8055794910549525, + "language_loss": 0.64556968, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66716546, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6371, + "time_per_iteration": 2.5470147132873535 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.0221653, + "balance_loss_mlp": 1.04452634, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.8238449128176952, + "language_loss": 0.72682339, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7484479, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6372, + "time_per_iteration": 2.47695255279541 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.02325058, + "balance_loss_mlp": 1.04308939, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.5970403518130607, + "language_loss": 0.84758627, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.86918551, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6373, + "time_per_iteration": 2.514571189880371 + }, + { + "auxiliary_loss_clip": 0.01124014, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02784181, + "balance_loss_mlp": 1.04392529, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 4.718004058381721, + "language_loss": 0.74721354, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76888537, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6374, + "time_per_iteration": 2.5505032539367676 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.04414058, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.8586580554057472, + "language_loss": 0.75701195, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77867097, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 6375, + "time_per_iteration": 2.467555522918701 + }, + { + "auxiliary_loss_clip": 0.01122331, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02300918, + "balance_loss_mlp": 1.04375613, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.257221103761015, + "language_loss": 0.72827101, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.7498709, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6376, + "time_per_iteration": 2.4082555770874023 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.04245007, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5879949283042905, + "language_loss": 0.67586625, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69745058, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.76171875, + "step": 6377, + "time_per_iteration": 2.54896879196167 + }, + { + "auxiliary_loss_clip": 0.01124961, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.02448511, + "balance_loss_mlp": 1.04608607, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.1973025079181117, + "language_loss": 0.72991705, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75156534, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6378, + "time_per_iteration": 2.4442975521087646 + }, + { + "auxiliary_loss_clip": 0.01121801, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02755964, + "balance_loss_mlp": 1.04327178, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.6808845830991803, + "language_loss": 0.69162869, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71326876, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6379, + "time_per_iteration": 2.529088258743286 + }, + { + "auxiliary_loss_clip": 0.01121458, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02344704, + "balance_loss_mlp": 1.04552865, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.6321901167852362, + "language_loss": 0.82979369, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85139024, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6380, + "time_per_iteration": 2.4336190223693848 + }, + { + "auxiliary_loss_clip": 0.01120843, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02090406, + "balance_loss_mlp": 1.04595208, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4297951270127425, + "language_loss": 0.81347466, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83503115, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6381, + "time_per_iteration": 2.5029306411743164 + }, + { + "auxiliary_loss_clip": 0.0104681, + "auxiliary_loss_mlp": 0.01005882, + "balance_loss_clip": 1.00420141, + "balance_loss_mlp": 1.02098966, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.796129115027233, + "language_loss": 0.60459685, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.6251238, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2578125, + "step": 6382, + "time_per_iteration": 3.0525829792022705 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.02186477, + "balance_loss_mlp": 1.04358447, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.302869327575685, + "language_loss": 0.66052485, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68212986, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6383, + "time_per_iteration": 2.5166289806365967 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.01944149, + "balance_loss_mlp": 1.04657924, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 2.2385812040155932, + "language_loss": 0.74811673, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76970243, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6384, + "time_per_iteration": 2.4451465606689453 + }, + { + "auxiliary_loss_clip": 0.01120418, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02219081, + "balance_loss_mlp": 1.04429483, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4336247312181014, + "language_loss": 0.75883526, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78040409, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6385, + "time_per_iteration": 2.4994513988494873 + }, + { + "auxiliary_loss_clip": 0.01046845, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 1.0002346, + "balance_loss_mlp": 1.02044809, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9433326566144719, + "language_loss": 0.67094183, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69143105, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.265625, + "step": 6386, + "time_per_iteration": 2.938122272491455 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.01933384, + "balance_loss_mlp": 1.0465281, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7796918810721745, + "language_loss": 0.72464442, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74619704, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6387, + "time_per_iteration": 5.465053081512451 + }, + { + "auxiliary_loss_clip": 0.01120429, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.0451014, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.6321565887315352, + "language_loss": 0.81181073, + "learning_rate": 2.822867208702932e-06, + "loss": 0.8334049, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6388, + "time_per_iteration": 3.940337657928467 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.02183485, + "balance_loss_mlp": 1.04249692, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6383752800672902, + "language_loss": 0.76158738, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78311884, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6389, + "time_per_iteration": 2.4720914363861084 + }, + { + "auxiliary_loss_clip": 0.01125023, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.04541564, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5616719605863645, + "language_loss": 0.76284117, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78453434, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6390, + "time_per_iteration": 2.4576520919799805 + }, + { + "auxiliary_loss_clip": 0.01124413, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.03295112, + "balance_loss_mlp": 1.04433882, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6285452565530243, + "language_loss": 0.70119178, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72292501, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6391, + "time_per_iteration": 2.5657877922058105 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.01978421, + "balance_loss_mlp": 1.04267848, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.938766253942268, + "language_loss": 0.84100312, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86256641, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6392, + "time_per_iteration": 2.4366884231567383 + }, + { + "auxiliary_loss_clip": 0.01120524, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.0215621, + "balance_loss_mlp": 1.04348612, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.11211623143903, + "language_loss": 0.61170864, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63326931, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6393, + "time_per_iteration": 2.428238868713379 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.04589796, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 2.3555579295861775, + "language_loss": 0.71295553, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73459029, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 6394, + "time_per_iteration": 2.483506679534912 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01760483, + "balance_loss_mlp": 1.04732203, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.3366242235467047, + "language_loss": 0.81172824, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83336329, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 6395, + "time_per_iteration": 2.471301317214966 + }, + { + "auxiliary_loss_clip": 0.01126851, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.02880275, + "balance_loss_mlp": 1.04770553, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 3.9526859148826707, + "language_loss": 0.70642132, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72812212, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6396, + "time_per_iteration": 2.4121108055114746 + }, + { + "auxiliary_loss_clip": 0.01046507, + "auxiliary_loss_mlp": 0.00999241, + "balance_loss_clip": 0.99745274, + "balance_loss_mlp": 1.01972008, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8889613923167966, + "language_loss": 0.59708536, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61754286, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.26757812, + "step": 6397, + "time_per_iteration": 3.1453351974487305 + }, + { + "auxiliary_loss_clip": 0.01123309, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.0459342, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.8498202803423767, + "language_loss": 0.84868926, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87023783, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6398, + "time_per_iteration": 2.488083839416504 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.0444839, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.942979036208199, + "language_loss": 0.79634017, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81787992, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6399, + "time_per_iteration": 2.4537224769592285 + }, + { + "auxiliary_loss_clip": 0.01124087, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.02149892, + "balance_loss_mlp": 1.04439902, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8928366067789952, + "language_loss": 0.67337728, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69498605, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.796875, + "step": 6400, + "time_per_iteration": 2.434598207473755 + }, + { + "auxiliary_loss_clip": 0.0112665, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_clip": 1.030936, + "balance_loss_mlp": 1.04645705, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.6542190438860391, + "language_loss": 0.73004973, + "learning_rate": 2.81824995589303e-06, + "loss": 0.7517767, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6401, + "time_per_iteration": 2.4963061809539795 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.02329874, + "balance_loss_mlp": 1.045017, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.9430058457885813, + "language_loss": 0.71920168, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74082762, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6402, + "time_per_iteration": 2.426349639892578 + }, + { + "auxiliary_loss_clip": 0.01118079, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.02007246, + "balance_loss_mlp": 1.04232907, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.7846208976590752, + "language_loss": 0.82449806, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84602368, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6403, + "time_per_iteration": 2.4700570106506348 + }, + { + "auxiliary_loss_clip": 0.0112163, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04500651, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.8891944292176732, + "language_loss": 0.82468271, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84628773, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.765625, + "step": 6404, + "time_per_iteration": 2.481968402862549 + }, + { + "auxiliary_loss_clip": 0.01122268, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.02136576, + "balance_loss_mlp": 1.04299283, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6334992055527433, + "language_loss": 0.69588619, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71746749, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6405, + "time_per_iteration": 2.5947635173797607 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.02475476, + "balance_loss_mlp": 1.04411674, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.9268009005119906, + "language_loss": 0.79068285, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81226277, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6406, + "time_per_iteration": 2.4195396900177 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02247298, + "balance_loss_mlp": 1.04682863, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.277779532957622, + "language_loss": 0.8438794, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86551487, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 6407, + "time_per_iteration": 2.4518916606903076 + }, + { + "auxiliary_loss_clip": 0.01043854, + "auxiliary_loss_mlp": 0.01007721, + "balance_loss_clip": 1.00623727, + "balance_loss_mlp": 1.01778841, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8214817017046727, + "language_loss": 0.64868087, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66919661, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.25976562, + "step": 6408, + "time_per_iteration": 3.090940475463867 + }, + { + "auxiliary_loss_clip": 0.01123062, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.02676785, + "balance_loss_mlp": 1.04405272, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.5501960898767924, + "language_loss": 0.73628408, + "learning_rate": 2.8154059613008e-06, + "loss": 0.7579453, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6409, + "time_per_iteration": 2.4831972122192383 + }, + { + "auxiliary_loss_clip": 0.01129844, + "auxiliary_loss_mlp": 0.01049195, + "balance_loss_clip": 1.03255367, + "balance_loss_mlp": 1.04574656, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.0394333066705874, + "language_loss": 0.70208335, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72387373, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 6410, + "time_per_iteration": 2.430617332458496 + }, + { + "auxiliary_loss_clip": 0.01043682, + "auxiliary_loss_mlp": 0.01003736, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.01802111, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6794214350275563, + "language_loss": 0.60311568, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62358987, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.2578125, + "step": 6411, + "time_per_iteration": 3.1681244373321533 + }, + { + "auxiliary_loss_clip": 0.01118542, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.04146707, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9543275921913768, + "language_loss": 0.7770192, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79849613, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6412, + "time_per_iteration": 2.4670822620391846 + }, + { + "auxiliary_loss_clip": 0.01124348, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.02093506, + "balance_loss_mlp": 1.0437274, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7609162802618283, + "language_loss": 0.78148544, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80310041, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6413, + "time_per_iteration": 2.4506192207336426 + }, + { + "auxiliary_loss_clip": 0.01040458, + "auxiliary_loss_mlp": 0.01006495, + "balance_loss_clip": 1.00485027, + "balance_loss_mlp": 1.01477003, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8068957555662655, + "language_loss": 0.61344963, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63391918, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.2578125, + "step": 6414, + "time_per_iteration": 2.897420883178711 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.02712834, + "balance_loss_mlp": 1.04452538, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.3808373048749543, + "language_loss": 0.77121973, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79288626, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6415, + "time_per_iteration": 2.455246686935425 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.01916933, + "balance_loss_mlp": 1.04303658, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6468091717833364, + "language_loss": 0.79597795, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81745458, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6416, + "time_per_iteration": 2.5162863731384277 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02100587, + "balance_loss_mlp": 1.04190922, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.6816352340920986, + "language_loss": 0.7957328, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81726366, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76953125, + "step": 6417, + "time_per_iteration": 2.462679862976074 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02369118, + "balance_loss_mlp": 1.03945839, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 9.924006648688666, + "language_loss": 0.80246758, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82400978, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6418, + "time_per_iteration": 2.4485208988189697 + }, + { + "auxiliary_loss_clip": 0.01114184, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.01992905, + "balance_loss_mlp": 1.03939319, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.9958339666442106, + "language_loss": 0.79694712, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81842011, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6419, + "time_per_iteration": 2.4360008239746094 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.01972449, + "balance_loss_mlp": 1.04120576, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0553625572614678, + "language_loss": 0.67804086, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69954103, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.73828125, + "step": 6420, + "time_per_iteration": 2.489661931991577 + }, + { + "auxiliary_loss_clip": 0.01116038, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.02286029, + "balance_loss_mlp": 1.04163957, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.4512212791744576, + "language_loss": 0.81831443, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83983916, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6421, + "time_per_iteration": 2.4278934001922607 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01888454, + "balance_loss_mlp": 1.04031229, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2431145476637266, + "language_loss": 0.72079587, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74231195, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6422, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02482104, + "balance_loss_mlp": 1.0425638, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6611822537555545, + "language_loss": 0.65814191, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.6796822, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6423, + "time_per_iteration": 2.4211878776550293 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.0439117, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.965242475874499, + "language_loss": 0.68746173, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70906854, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6424, + "time_per_iteration": 2.5804436206817627 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.04261661, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3808875353222407, + "language_loss": 0.72237349, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74393135, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 6425, + "time_per_iteration": 2.4568634033203125 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.02413344, + "balance_loss_mlp": 1.0424571, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.566599175889616, + "language_loss": 0.80062914, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82223159, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6426, + "time_per_iteration": 2.5236575603485107 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.0222559, + "balance_loss_mlp": 1.04582727, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.32293087490025, + "language_loss": 0.74624443, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7678405, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6427, + "time_per_iteration": 2.467555046081543 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02530599, + "balance_loss_mlp": 1.04256904, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 1.6951631816528543, + "language_loss": 0.69630527, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71788281, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6428, + "time_per_iteration": 2.4336817264556885 + }, + { + "auxiliary_loss_clip": 0.01120968, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.02787971, + "balance_loss_mlp": 1.0427897, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.175868568260599, + "language_loss": 0.84272587, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86435586, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6429, + "time_per_iteration": 5.324048757553101 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.02358222, + "balance_loss_mlp": 1.04458523, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.0434704200334726, + "language_loss": 0.808312, + "learning_rate": 2.807931078076015e-06, + "loss": 0.82989526, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6430, + "time_per_iteration": 3.8362674713134766 + }, + { + "auxiliary_loss_clip": 0.01037896, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00019002, + "balance_loss_mlp": 1.01247668, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7147232834997996, + "language_loss": 0.58793551, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60833132, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.25390625, + "step": 6431, + "time_per_iteration": 3.1054275035858154 + }, + { + "auxiliary_loss_clip": 0.01123522, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.0213275, + "balance_loss_mlp": 1.04425848, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8418420222570902, + "language_loss": 0.78914982, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81074637, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6432, + "time_per_iteration": 2.441103458404541 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.04033065, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 3.1335187433073006, + "language_loss": 0.80734611, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82898408, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6433, + "time_per_iteration": 2.4334840774536133 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.02167201, + "balance_loss_mlp": 1.04427695, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.9920607209076013, + "language_loss": 0.70712543, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72871572, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6434, + "time_per_iteration": 2.4485912322998047 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.02259684, + "balance_loss_mlp": 1.04096544, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 3.1146547904297615, + "language_loss": 0.77674437, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79833651, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 6435, + "time_per_iteration": 2.4734902381896973 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.01877558, + "balance_loss_mlp": 1.04157901, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6626735995393465, + "language_loss": 0.79557228, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81706917, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 6436, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01115966, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01937521, + "balance_loss_mlp": 1.04099202, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7213495950653388, + "language_loss": 0.77057981, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79206884, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6437, + "time_per_iteration": 2.506342649459839 + }, + { + "auxiliary_loss_clip": 0.01118581, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.02759838, + "balance_loss_mlp": 1.0425818, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0991099349261013, + "language_loss": 0.8199805, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84157896, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6438, + "time_per_iteration": 2.4236960411071777 + }, + { + "auxiliary_loss_clip": 0.01119447, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.02413225, + "balance_loss_mlp": 1.04198575, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.4416179830694351, + "language_loss": 0.75274503, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77432954, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6439, + "time_per_iteration": 2.4746499061584473 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.01804042, + "balance_loss_mlp": 1.04231787, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4591961315755648, + "language_loss": 0.74029297, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76176178, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6440, + "time_per_iteration": 2.470442056655884 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02755535, + "balance_loss_mlp": 1.04172719, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.882594032026591, + "language_loss": 0.82420492, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84582806, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6441, + "time_per_iteration": 2.4857184886932373 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.03132594, + "balance_loss_mlp": 1.04210794, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.099147848905264, + "language_loss": 0.81835496, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83998901, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6442, + "time_per_iteration": 2.4149296283721924 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04025602, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.5694674536603201, + "language_loss": 0.83847654, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85999727, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6443, + "time_per_iteration": 2.4515957832336426 + }, + { + "auxiliary_loss_clip": 0.01039021, + "auxiliary_loss_mlp": 0.01007024, + "balance_loss_clip": 1.00551593, + "balance_loss_mlp": 1.0140909, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7719544775144753, + "language_loss": 0.50268674, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52314723, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24902344, + "step": 6444, + "time_per_iteration": 3.092834711074829 + }, + { + "auxiliary_loss_clip": 0.01115245, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02502251, + "balance_loss_mlp": 1.04225266, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.537835026490341, + "language_loss": 0.78736365, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80889541, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6445, + "time_per_iteration": 2.435347557067871 + }, + { + "auxiliary_loss_clip": 0.01115913, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02055407, + "balance_loss_mlp": 1.04211605, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.672895701432963, + "language_loss": 0.81121695, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83271456, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6446, + "time_per_iteration": 2.469536781311035 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02434087, + "balance_loss_mlp": 1.03933239, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.877585125713849, + "language_loss": 0.77093089, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79244608, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6447, + "time_per_iteration": 2.428525447845459 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.04256356, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5240627220637166, + "language_loss": 0.75767821, + "learning_rate": 2.801513277056671e-06, + "loss": 0.7791642, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6448, + "time_per_iteration": 2.4325876235961914 + }, + { + "auxiliary_loss_clip": 0.01115196, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.02023029, + "balance_loss_mlp": 1.04179466, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.6442003276819328, + "language_loss": 0.75754648, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.77903593, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6449, + "time_per_iteration": 2.435208320617676 + }, + { + "auxiliary_loss_clip": 0.0111808, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.0194999, + "balance_loss_mlp": 1.03956699, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.5394171504545016, + "language_loss": 0.78183508, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80335045, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6450, + "time_per_iteration": 2.467933177947998 + }, + { + "auxiliary_loss_clip": 0.0112145, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.02190948, + "balance_loss_mlp": 1.04104686, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.1284571270947263, + "language_loss": 0.77706474, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79863995, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6451, + "time_per_iteration": 2.513192892074585 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01337111, + "balance_loss_mlp": 1.03988457, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.5965207120841256, + "language_loss": 0.7642619, + "learning_rate": 2.800085758962812e-06, + "loss": 0.7856546, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6452, + "time_per_iteration": 2.453756809234619 + }, + { + "auxiliary_loss_clip": 0.01118677, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02721739, + "balance_loss_mlp": 1.04313231, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5417712426283914, + "language_loss": 0.79843581, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82002515, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6453, + "time_per_iteration": 2.434788465499878 + }, + { + "auxiliary_loss_clip": 0.01126032, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02257931, + "balance_loss_mlp": 1.0456028, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.779502658436086, + "language_loss": 0.71759796, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73922884, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6454, + "time_per_iteration": 2.456637382507324 + }, + { + "auxiliary_loss_clip": 0.01122488, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.02489531, + "balance_loss_mlp": 1.04253364, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.1246626443539216, + "language_loss": 0.77918947, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80081153, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6455, + "time_per_iteration": 2.4589757919311523 + }, + { + "auxiliary_loss_clip": 0.01118002, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.02040577, + "balance_loss_mlp": 1.04232621, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.6339807395025958, + "language_loss": 0.75865024, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78017759, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6456, + "time_per_iteration": 2.4390318393707275 + }, + { + "auxiliary_loss_clip": 0.01121145, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.01944995, + "balance_loss_mlp": 1.04276633, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.085241252102015, + "language_loss": 0.60518527, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62672919, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 6457, + "time_per_iteration": 2.459535837173462 + }, + { + "auxiliary_loss_clip": 0.01121291, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.02167547, + "balance_loss_mlp": 1.04195237, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.1234505206368475, + "language_loss": 0.80247247, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82405996, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6458, + "time_per_iteration": 2.425049066543579 + }, + { + "auxiliary_loss_clip": 0.01120771, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.02072167, + "balance_loss_mlp": 1.04291797, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8371533851039183, + "language_loss": 0.81683058, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83838403, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6459, + "time_per_iteration": 2.5234129428863525 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.04261899, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 3.3845315312390643, + "language_loss": 0.61609662, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63761353, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6460, + "time_per_iteration": 2.4271440505981445 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04498553, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.999840896697599, + "language_loss": 0.85928953, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88084352, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.76171875, + "step": 6461, + "time_per_iteration": 2.4874932765960693 + }, + { + "auxiliary_loss_clip": 0.01121067, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.02712059, + "balance_loss_mlp": 1.04198229, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 5.6194775515218085, + "language_loss": 0.71397054, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73559368, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6462, + "time_per_iteration": 2.4839894771575928 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02274752, + "balance_loss_mlp": 1.04190457, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.13487298932128, + "language_loss": 0.7582581, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77982807, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6463, + "time_per_iteration": 2.4897215366363525 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.0305022, + "balance_loss_mlp": 1.04482341, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9442764767857983, + "language_loss": 0.70078236, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72249663, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6464, + "time_per_iteration": 2.4519219398498535 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02258134, + "balance_loss_mlp": 1.04280329, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8350923871455525, + "language_loss": 0.69608724, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.717641, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6465, + "time_per_iteration": 2.524698495864868 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02772307, + "balance_loss_mlp": 1.04204226, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.02186972310505, + "language_loss": 0.77957165, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80120802, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6466, + "time_per_iteration": 2.4420318603515625 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.04476476, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.578436157089315, + "language_loss": 0.69438803, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71602929, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6467, + "time_per_iteration": 2.526315212249756 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.04374123, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.7189933074164316, + "language_loss": 0.83444071, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85615414, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 6468, + "time_per_iteration": 2.433612108230591 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.02720666, + "balance_loss_mlp": 1.04250181, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 5.890128393718138, + "language_loss": 0.84300733, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86460519, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6469, + "time_per_iteration": 2.501368284225464 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02871704, + "balance_loss_mlp": 1.0433706, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.6566744770772097, + "language_loss": 0.74790764, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76954335, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6470, + "time_per_iteration": 5.350924015045166 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.02632678, + "balance_loss_mlp": 1.04234362, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5254918915202156, + "language_loss": 0.74916464, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77078122, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6471, + "time_per_iteration": 5.323298215866089 + }, + { + "auxiliary_loss_clip": 0.01121653, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.0310601, + "balance_loss_mlp": 1.04548645, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.9258613787227117, + "language_loss": 0.68053186, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70220202, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6472, + "time_per_iteration": 2.453610420227051 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01046672, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.04305148, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.6233097762345425, + "language_loss": 0.76542008, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.7870928, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6473, + "time_per_iteration": 2.487966775894165 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01046447, + "balance_loss_clip": 1.03157008, + "balance_loss_mlp": 1.04532015, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.8986671727726652, + "language_loss": 0.70897496, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73067403, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6474, + "time_per_iteration": 2.4192309379577637 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.02607441, + "balance_loss_mlp": 1.04441047, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.400231739949646, + "language_loss": 0.68822956, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70984024, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 6475, + "time_per_iteration": 2.508747100830078 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.0104873, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.04747105, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 2.0025883037810055, + "language_loss": 0.76052523, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78231013, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 6476, + "time_per_iteration": 2.4432644844055176 + }, + { + "auxiliary_loss_clip": 0.01040957, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.01581097, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7803986728659921, + "language_loss": 0.58254546, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60299176, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6477, + "time_per_iteration": 3.0704691410064697 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.02279997, + "balance_loss_mlp": 1.04507279, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.75333723767605, + "language_loss": 0.77916539, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80078721, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6478, + "time_per_iteration": 2.488922357559204 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.0212301, + "balance_loss_mlp": 1.04128957, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 1.928920480761015, + "language_loss": 0.82250136, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8440311, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 6479, + "time_per_iteration": 2.4171228408813477 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.02262461, + "balance_loss_mlp": 1.04175949, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7024032073041733, + "language_loss": 0.80111545, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82266629, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6480, + "time_per_iteration": 2.4750797748565674 + }, + { + "auxiliary_loss_clip": 0.01117569, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01428056, + "balance_loss_mlp": 1.04215932, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.550121095479633, + "language_loss": 0.83083898, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85229063, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6481, + "time_per_iteration": 2.4715166091918945 + }, + { + "auxiliary_loss_clip": 0.01117656, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02437592, + "balance_loss_mlp": 1.04459131, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.557560720892756, + "language_loss": 0.75559932, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77715063, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6482, + "time_per_iteration": 2.4623568058013916 + }, + { + "auxiliary_loss_clip": 0.01119557, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01972985, + "balance_loss_mlp": 1.04252028, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 3.29893715214875, + "language_loss": 0.79150903, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81303906, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6483, + "time_per_iteration": 2.4530816078186035 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.0135119, + "balance_loss_mlp": 1.04091668, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4423872752445677, + "language_loss": 0.79842782, + "learning_rate": 2.788648211572067e-06, + "loss": 0.81989002, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6484, + "time_per_iteration": 2.511016845703125 + }, + { + "auxiliary_loss_clip": 0.01121595, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.02905726, + "balance_loss_mlp": 1.04556251, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.7756536915325172, + "language_loss": 0.78321344, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80487472, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6485, + "time_per_iteration": 2.443439245223999 + }, + { + "auxiliary_loss_clip": 0.01121432, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.02298832, + "balance_loss_mlp": 1.0427072, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.7221954850945425, + "language_loss": 0.85305119, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87464917, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6486, + "time_per_iteration": 2.5056657791137695 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.01942706, + "balance_loss_mlp": 1.04115701, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.7551040773297495, + "language_loss": 0.85345674, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87499964, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 6487, + "time_per_iteration": 2.577178478240967 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01737833, + "balance_loss_mlp": 1.04198551, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5246902220393208, + "language_loss": 0.73225224, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75375092, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.75390625, + "step": 6488, + "time_per_iteration": 2.523616075515747 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.04519773, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.5708303691917815, + "language_loss": 0.68585873, + "learning_rate": 2.786858317231779e-06, + "loss": 0.7074241, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6489, + "time_per_iteration": 2.478531837463379 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02680993, + "balance_loss_mlp": 1.04124475, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.801271673710844, + "language_loss": 0.81112868, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83269042, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 6490, + "time_per_iteration": 2.511854887008667 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.0193367, + "balance_loss_mlp": 1.04286718, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.9146492238240407, + "language_loss": 0.89305747, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91461056, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6491, + "time_per_iteration": 2.460026264190674 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02187026, + "balance_loss_mlp": 1.04215312, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8200320241713732, + "language_loss": 0.78811067, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80968064, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 6492, + "time_per_iteration": 2.529750108718872 + }, + { + "auxiliary_loss_clip": 0.01122151, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.02316093, + "balance_loss_mlp": 1.04309416, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.613220074099035, + "language_loss": 0.74635601, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76794928, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6493, + "time_per_iteration": 2.506000280380249 + }, + { + "auxiliary_loss_clip": 0.01123496, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.02160168, + "balance_loss_mlp": 1.04215276, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.9992899078543964, + "language_loss": 0.76100057, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78260159, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 6494, + "time_per_iteration": 2.4696662425994873 + }, + { + "auxiliary_loss_clip": 0.01128232, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03233767, + "balance_loss_mlp": 1.04337156, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.027559897328472, + "language_loss": 0.74284697, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76461446, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 6495, + "time_per_iteration": 2.4156551361083984 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_clip": 1.02683187, + "balance_loss_mlp": 1.04346669, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.725682312794404, + "language_loss": 0.67885542, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70049238, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6496, + "time_per_iteration": 2.530111789703369 + }, + { + "auxiliary_loss_clip": 0.01038749, + "auxiliary_loss_mlp": 0.01000219, + "balance_loss_clip": 0.99871743, + "balance_loss_mlp": 1.01313972, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6624336186281815, + "language_loss": 0.53998011, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56036979, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.25585938, + "step": 6497, + "time_per_iteration": 3.140427589416504 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.02404737, + "balance_loss_mlp": 1.04236674, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.818865741362812, + "language_loss": 0.68966502, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71124697, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6498, + "time_per_iteration": 2.4631001949310303 + }, + { + "auxiliary_loss_clip": 0.01037794, + "auxiliary_loss_mlp": 0.01003613, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.0124712, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 1.032001330091421, + "language_loss": 0.51830518, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5387193, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.25390625, + "step": 6499, + "time_per_iteration": 3.1206116676330566 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.0266552, + "balance_loss_mlp": 1.04158521, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.8695650437594764, + "language_loss": 0.73693466, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75859112, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.80859375, + "step": 6500, + "time_per_iteration": 2.5413036346435547 + }, + { + "auxiliary_loss_clip": 0.01125544, + "auxiliary_loss_mlp": 0.01041341, + "balance_loss_clip": 1.02720869, + "balance_loss_mlp": 1.04501247, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.5451317073491353, + "language_loss": 0.68355215, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70522094, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6501, + "time_per_iteration": 2.4725823402404785 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6766627212042646, + "language_loss": 0.79162323, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81320089, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6502, + "time_per_iteration": 2.4758012294769287 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02378237, + "balance_loss_mlp": 1.0435648, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.24722484247342, + "language_loss": 0.79379106, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81534874, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6503, + "time_per_iteration": 2.510356903076172 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02076924, + "balance_loss_mlp": 1.03882694, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8991979162106922, + "language_loss": 0.71695077, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73842514, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6504, + "time_per_iteration": 2.474257230758667 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02404082, + "balance_loss_mlp": 1.03938556, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4403698273396093, + "language_loss": 0.83054864, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85209668, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6505, + "time_per_iteration": 2.4917776584625244 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.02379465, + "balance_loss_mlp": 1.04268944, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9728617659661118, + "language_loss": 0.71202552, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73360288, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7578125, + "step": 6506, + "time_per_iteration": 2.4846489429473877 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.04129732, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.0442674369719547, + "language_loss": 0.74914789, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77068931, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6507, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01036092, + "auxiliary_loss_mlp": 0.01010532, + "balance_loss_clip": 1.00900638, + "balance_loss_mlp": 1.01097417, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7697412763639314, + "language_loss": 0.56554615, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58601236, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.25195312, + "step": 6508, + "time_per_iteration": 3.222599744796753 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.0256958, + "balance_loss_mlp": 1.04224479, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8903485988869968, + "language_loss": 0.7639432, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78552431, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6509, + "time_per_iteration": 2.4504122734069824 + }, + { + "auxiliary_loss_clip": 0.01119308, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02919126, + "balance_loss_mlp": 1.04120517, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.5320410479027284, + "language_loss": 0.82538676, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84704286, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.78125, + "step": 6510, + "time_per_iteration": 2.4280829429626465 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02202439, + "balance_loss_mlp": 1.04137504, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.9726874536239134, + "language_loss": 0.76478642, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78633761, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6511, + "time_per_iteration": 2.438093662261963 + }, + { + "auxiliary_loss_clip": 0.01035954, + "auxiliary_loss_mlp": 0.01004811, + "balance_loss_clip": 1.0033921, + "balance_loss_mlp": 1.01070499, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7278620231464888, + "language_loss": 0.57780313, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59821081, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.25390625, + "step": 6512, + "time_per_iteration": 6.094903230667114 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.02039289, + "balance_loss_mlp": 1.04215658, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6857291908308145, + "language_loss": 0.69891763, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72048545, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6513, + "time_per_iteration": 3.8939309120178223 + }, + { + "auxiliary_loss_clip": 0.01122702, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.02474439, + "balance_loss_mlp": 1.04184556, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.2930968868818606, + "language_loss": 0.76267236, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7842921, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 6514, + "time_per_iteration": 2.4622693061828613 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02168727, + "balance_loss_mlp": 1.04042864, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.7838082674219136, + "language_loss": 0.77452338, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79606491, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6515, + "time_per_iteration": 2.4336462020874023 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02564025, + "balance_loss_mlp": 1.03940558, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4542421972503212, + "language_loss": 0.79846406, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81998634, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 6516, + "time_per_iteration": 2.500826597213745 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01891923, + "balance_loss_mlp": 1.04082477, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 2.228742695866407, + "language_loss": 0.70205939, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72357762, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6517, + "time_per_iteration": 2.425739288330078 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.0218817, + "balance_loss_mlp": 1.03986263, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.595983335780194, + "language_loss": 0.72092575, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74247015, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6518, + "time_per_iteration": 2.559140205383301 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.0179677, + "balance_loss_mlp": 1.04041731, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.563160017416143, + "language_loss": 0.61668754, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63819885, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6519, + "time_per_iteration": 2.5673322677612305 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.02800775, + "balance_loss_mlp": 1.04341698, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.4564373100444232, + "language_loss": 0.6693083, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6909942, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6520, + "time_per_iteration": 2.487650156021118 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.01958799, + "balance_loss_mlp": 1.03966665, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.7599889377917473, + "language_loss": 0.78522319, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80671263, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6521, + "time_per_iteration": 2.418458938598633 + }, + { + "auxiliary_loss_clip": 0.0112345, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.01853049, + "balance_loss_mlp": 1.04218912, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.4256865138527353, + "language_loss": 0.70340407, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7250011, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8125, + "step": 6522, + "time_per_iteration": 2.435802936553955 + }, + { + "auxiliary_loss_clip": 0.01120666, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.02013338, + "balance_loss_mlp": 1.04137838, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.8374103087918643, + "language_loss": 0.76740485, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78895748, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6523, + "time_per_iteration": 2.4279329776763916 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.04124415, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.339335808739943, + "language_loss": 0.61661494, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63821173, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6524, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.02181363, + "balance_loss_mlp": 1.03898454, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6728206813409823, + "language_loss": 0.73940414, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76095104, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6525, + "time_per_iteration": 2.4897830486297607 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.02224112, + "balance_loss_mlp": 1.03882146, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0942212479104363, + "language_loss": 0.81385779, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83539373, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 6526, + "time_per_iteration": 2.442091226577759 + }, + { + "auxiliary_loss_clip": 0.01115953, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.02265131, + "balance_loss_mlp": 1.03931344, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.56527231709598, + "language_loss": 0.69802964, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.71955633, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6527, + "time_per_iteration": 2.465498924255371 + }, + { + "auxiliary_loss_clip": 0.01116064, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.0154264, + "balance_loss_mlp": 1.04067612, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.4439619967755983, + "language_loss": 0.82215756, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84361446, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6528, + "time_per_iteration": 2.488581418991089 + }, + { + "auxiliary_loss_clip": 0.01114295, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02282465, + "balance_loss_mlp": 1.04024255, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.4897772961790412, + "language_loss": 0.68726033, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70877492, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 6529, + "time_per_iteration": 2.5409562587738037 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.02561271, + "balance_loss_mlp": 1.04070282, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.9003920421281926, + "language_loss": 0.79728955, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.81887889, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6530, + "time_per_iteration": 2.514547109603882 + }, + { + "auxiliary_loss_clip": 0.01112608, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02267241, + "balance_loss_mlp": 1.03750181, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6221630004730245, + "language_loss": 0.75564003, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77713549, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6531, + "time_per_iteration": 2.4572982788085938 + }, + { + "auxiliary_loss_clip": 0.01038893, + "auxiliary_loss_mlp": 0.0100286, + "balance_loss_clip": 1.00127435, + "balance_loss_mlp": 1.01370025, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8170127744653651, + "language_loss": 0.60378772, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62420523, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.25195312, + "step": 6532, + "time_per_iteration": 2.929732084274292 + }, + { + "auxiliary_loss_clip": 0.01036987, + "auxiliary_loss_mlp": 0.01003862, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7837299971611431, + "language_loss": 0.55545104, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57585955, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.25195312, + "step": 6533, + "time_per_iteration": 3.1820483207702637 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.02316761, + "balance_loss_mlp": 1.04170942, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.9313522305780093, + "language_loss": 0.75972468, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78130615, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6534, + "time_per_iteration": 2.5650813579559326 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.02376163, + "balance_loss_mlp": 1.04177046, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.213634574223379, + "language_loss": 0.78067005, + "learning_rate": 2.770356507494851e-06, + "loss": 0.802279, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 6535, + "time_per_iteration": 2.447950839996338 + }, + { + "auxiliary_loss_clip": 0.01113628, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.01950026, + "balance_loss_mlp": 1.03985262, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.091132286884177, + "language_loss": 0.68613565, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70759845, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 6536, + "time_per_iteration": 2.4873242378234863 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.03908086, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.7105256577096235, + "language_loss": 0.69052541, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71199811, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 6537, + "time_per_iteration": 2.5867457389831543 + }, + { + "auxiliary_loss_clip": 0.01117392, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.02420986, + "balance_loss_mlp": 1.04011965, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6628056753547982, + "language_loss": 0.79044384, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81200254, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6538, + "time_per_iteration": 2.437757968902588 + }, + { + "auxiliary_loss_clip": 0.01034351, + "auxiliary_loss_mlp": 0.01009828, + "balance_loss_clip": 1.0084635, + "balance_loss_mlp": 1.00972295, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8042725449961473, + "language_loss": 0.61871827, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63916004, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24609375, + "step": 6539, + "time_per_iteration": 2.9012601375579834 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.02417326, + "balance_loss_mlp": 1.03897023, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.1025744829352306, + "language_loss": 0.68334043, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70487964, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6540, + "time_per_iteration": 2.617544412612915 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.02167249, + "balance_loss_mlp": 1.0387044, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.7155589252050778, + "language_loss": 0.72714561, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74864328, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6541, + "time_per_iteration": 2.5576202869415283 + }, + { + "auxiliary_loss_clip": 0.01034882, + "auxiliary_loss_mlp": 0.01010056, + "balance_loss_clip": 1.00863171, + "balance_loss_mlp": 1.0103662, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8254504926360222, + "language_loss": 0.60302341, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62347269, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24511719, + "step": 6542, + "time_per_iteration": 2.921311378479004 + }, + { + "auxiliary_loss_clip": 0.01115263, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.02204013, + "balance_loss_mlp": 1.03968477, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.9294145782355336, + "language_loss": 0.82255107, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84406084, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6543, + "time_per_iteration": 2.5267767906188965 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.03692436, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.6066266241550669, + "language_loss": 0.69336796, + "learning_rate": 2.767120621015908e-06, + "loss": 0.7148419, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6544, + "time_per_iteration": 2.5192980766296387 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.02729011, + "balance_loss_mlp": 1.03997457, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.880723151689185, + "language_loss": 0.75104976, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77266246, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6545, + "time_per_iteration": 2.5483953952789307 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.04072022, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4191511939867936, + "language_loss": 0.74600172, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76748097, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 6546, + "time_per_iteration": 2.435189962387085 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02256405, + "balance_loss_mlp": 1.03998446, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.8050093889996326, + "language_loss": 0.81520575, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83677876, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 6547, + "time_per_iteration": 2.5359435081481934 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01444387, + "balance_loss_mlp": 1.03795588, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 2.282095961224954, + "language_loss": 0.84300089, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86442673, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6548, + "time_per_iteration": 2.430497407913208 + }, + { + "auxiliary_loss_clip": 0.01110548, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01784039, + "balance_loss_mlp": 1.0382576, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5261467823901598, + "language_loss": 0.72481942, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74623168, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6549, + "time_per_iteration": 2.484938383102417 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02025914, + "balance_loss_mlp": 1.04114747, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.525417369659451, + "language_loss": 0.77678335, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79829538, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6550, + "time_per_iteration": 2.4533822536468506 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01695979, + "balance_loss_mlp": 1.03770638, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6825180459961226, + "language_loss": 0.81065381, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83207965, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6551, + "time_per_iteration": 2.4740419387817383 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.02310574, + "balance_loss_mlp": 1.03833413, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.2350138021364003, + "language_loss": 0.80241704, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82394373, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6552, + "time_per_iteration": 2.4066245555877686 + }, + { + "auxiliary_loss_clip": 0.01118032, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.02449059, + "balance_loss_mlp": 1.04108357, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.2028177738118884, + "language_loss": 0.71154666, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73311305, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 6553, + "time_per_iteration": 2.454035997390747 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.0409205, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.9276274050376605, + "language_loss": 0.63445336, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65595293, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6554, + "time_per_iteration": 5.467530250549316 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.02620983, + "balance_loss_mlp": 1.041237, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.7325305725381703, + "language_loss": 0.79567587, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81722915, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 6555, + "time_per_iteration": 3.9707608222961426 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.0209887, + "balance_loss_mlp": 1.04194546, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.8303237809157376, + "language_loss": 0.71571302, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73726678, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6556, + "time_per_iteration": 2.5013363361358643 + }, + { + "auxiliary_loss_clip": 0.01115996, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03954887, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 2.056709462434603, + "language_loss": 0.83915412, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86063957, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6557, + "time_per_iteration": 2.7162060737609863 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02195859, + "balance_loss_mlp": 1.04014397, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 3.2694171829217953, + "language_loss": 0.80285048, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8243624, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6558, + "time_per_iteration": 2.466904401779175 + }, + { + "auxiliary_loss_clip": 0.01115408, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02037418, + "balance_loss_mlp": 1.04165912, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.7254990423790144, + "language_loss": 0.71022832, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73171461, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6559, + "time_per_iteration": 2.474142551422119 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02111125, + "balance_loss_mlp": 1.04030299, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.8853849407225942, + "language_loss": 0.80391413, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82548964, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6560, + "time_per_iteration": 2.4220218658447266 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.022223, + "balance_loss_mlp": 1.04395843, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 3.2514761912447283, + "language_loss": 0.83440554, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85599601, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 6561, + "time_per_iteration": 2.458305835723877 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.02458477, + "balance_loss_mlp": 1.04098439, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.862241713271481, + "language_loss": 0.79548055, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81703943, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6562, + "time_per_iteration": 2.4390974044799805 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02301359, + "balance_loss_mlp": 1.04043949, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.512260767998718, + "language_loss": 0.81355608, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83506453, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 6563, + "time_per_iteration": 2.518843650817871 + }, + { + "auxiliary_loss_clip": 0.0111742, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.041682, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9438463538262531, + "language_loss": 0.69416577, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71574247, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6564, + "time_per_iteration": 2.446140766143799 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02079892, + "balance_loss_mlp": 1.04157352, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 3.234298893133154, + "language_loss": 0.83141822, + "learning_rate": 2.759561073299676e-06, + "loss": 0.8529489, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6565, + "time_per_iteration": 2.474611520767212 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.02002859, + "balance_loss_mlp": 1.04039359, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.7678460287206497, + "language_loss": 0.82917452, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85065943, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6566, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.02493143, + "balance_loss_mlp": 1.04225016, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.357536272997057, + "language_loss": 0.7778033, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79942119, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6567, + "time_per_iteration": 2.5020110607147217 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.02689242, + "balance_loss_mlp": 1.04026425, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 2.0625384967809546, + "language_loss": 0.80381507, + "learning_rate": 2.758480098067182e-06, + "loss": 0.8253268, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 6568, + "time_per_iteration": 2.464186429977417 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.02282655, + "balance_loss_mlp": 1.04130197, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.6625556258765348, + "language_loss": 0.84206939, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86359489, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 6569, + "time_per_iteration": 2.4947829246520996 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.02418959, + "balance_loss_mlp": 1.04450357, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.920459843417803, + "language_loss": 0.74973899, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77130127, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6570, + "time_per_iteration": 2.50211763381958 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04104555, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.649568183340291, + "language_loss": 0.79813123, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81967843, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6571, + "time_per_iteration": 2.477740526199341 + }, + { + "auxiliary_loss_clip": 0.01116017, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.02846146, + "balance_loss_mlp": 1.04203689, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.628324795196944, + "language_loss": 0.77873337, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80031145, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6572, + "time_per_iteration": 2.4463839530944824 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02636874, + "balance_loss_mlp": 1.0404911, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.6456702645470058, + "language_loss": 0.7506038, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77218664, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6573, + "time_per_iteration": 2.501692295074463 + }, + { + "auxiliary_loss_clip": 0.01114036, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02165031, + "balance_loss_mlp": 1.04046559, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.4003162240803297, + "language_loss": 0.67956495, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70104533, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 6574, + "time_per_iteration": 2.6566920280456543 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01978612, + "balance_loss_mlp": 1.04216623, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.170019312223073, + "language_loss": 0.71719187, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73873657, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6575, + "time_per_iteration": 2.463792085647583 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02859664, + "balance_loss_mlp": 1.04105997, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.0080051897694324, + "language_loss": 0.73535955, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75698036, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6576, + "time_per_iteration": 2.409817934036255 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.03979337, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.36733568983198, + "language_loss": 0.83294857, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.8544715, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7578125, + "step": 6577, + "time_per_iteration": 2.4421181678771973 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.01860428, + "balance_loss_mlp": 1.04138541, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.8530294325048984, + "language_loss": 0.89916354, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92063785, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6578, + "time_per_iteration": 2.470369577407837 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04030561, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.7408596896151103, + "language_loss": 0.77871025, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80027139, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6579, + "time_per_iteration": 2.4619040489196777 + }, + { + "auxiliary_loss_clip": 0.01119633, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01760054, + "balance_loss_mlp": 1.0407021, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.037188254408411, + "language_loss": 0.68324131, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70475388, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6580, + "time_per_iteration": 2.4363577365875244 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01499939, + "balance_loss_mlp": 1.04099488, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.613777567548473, + "language_loss": 0.58620721, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60764229, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6581, + "time_per_iteration": 2.5704734325408936 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.02479148, + "balance_loss_mlp": 1.04165769, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.015576445189345, + "language_loss": 0.698632, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72021002, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6582, + "time_per_iteration": 2.4640939235687256 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.02167404, + "balance_loss_mlp": 1.0415566, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.285451965985758, + "language_loss": 0.76454568, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78608364, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6583, + "time_per_iteration": 2.437396287918091 + }, + { + "auxiliary_loss_clip": 0.01118401, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.01708043, + "balance_loss_mlp": 1.04192805, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5685917359515968, + "language_loss": 0.65989023, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68138266, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6584, + "time_per_iteration": 2.4562485218048096 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.02262115, + "balance_loss_mlp": 1.04122627, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.6735523944320136, + "language_loss": 0.72423065, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 6585, + "time_per_iteration": 2.517333984375 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01811159, + "balance_loss_mlp": 1.04010367, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.986310622320223, + "language_loss": 0.73430967, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75579244, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6586, + "time_per_iteration": 2.513847827911377 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01738322, + "balance_loss_mlp": 1.04139459, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.2420315368265915, + "language_loss": 0.71627617, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73776209, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6587, + "time_per_iteration": 2.498534917831421 + }, + { + "auxiliary_loss_clip": 0.01038457, + "auxiliary_loss_mlp": 0.01003592, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.01416993, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9067384171744824, + "language_loss": 0.61162889, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63204944, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.01531982, + "router_z_loss_mlp": 0.2421875, + "step": 6588, + "time_per_iteration": 2.9129557609558105 + }, + { + "auxiliary_loss_clip": 0.01117429, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.0215075, + "balance_loss_mlp": 1.04087436, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.9745840784771536, + "language_loss": 0.81579673, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83732545, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6589, + "time_per_iteration": 2.487581253051758 + }, + { + "auxiliary_loss_clip": 0.01118186, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01858449, + "balance_loss_mlp": 1.04102254, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.0157149751951606, + "language_loss": 0.70171028, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72322464, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6590, + "time_per_iteration": 2.4837629795074463 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.02517259, + "balance_loss_mlp": 1.04276454, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.6568331410473631, + "language_loss": 0.76061213, + "learning_rate": 2.750184048805956e-06, + "loss": 0.7821902, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6591, + "time_per_iteration": 2.574401617050171 + }, + { + "auxiliary_loss_clip": 0.01119076, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.02803326, + "balance_loss_mlp": 1.04253912, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.7800794685008139, + "language_loss": 0.79121935, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81283081, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6592, + "time_per_iteration": 2.5065057277679443 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.01945305, + "balance_loss_mlp": 1.04020298, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.6584377020479992, + "language_loss": 0.69372392, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71518123, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6593, + "time_per_iteration": 2.691351890563965 + }, + { + "auxiliary_loss_clip": 0.01119923, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.02389932, + "balance_loss_mlp": 1.04100418, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.6545825162449217, + "language_loss": 0.77913815, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80072421, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6594, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.01038921, + "auxiliary_loss_mlp": 0.01002091, + "balance_loss_clip": 1.0005945, + "balance_loss_mlp": 1.0146898, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9454940833877284, + "language_loss": 0.63038307, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65079319, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.2421875, + "step": 6595, + "time_per_iteration": 6.018520355224609 + }, + { + "auxiliary_loss_clip": 0.01121925, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.02908421, + "balance_loss_mlp": 1.04294038, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.072222886004575, + "language_loss": 0.6329869, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65464759, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6596, + "time_per_iteration": 5.302752494812012 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02110457, + "balance_loss_mlp": 1.04157937, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0492451282774273, + "language_loss": 0.78553772, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80704355, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6597, + "time_per_iteration": 2.457028388977051 + }, + { + "auxiliary_loss_clip": 0.01121814, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.02558672, + "balance_loss_mlp": 1.04262114, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.95592503590265, + "language_loss": 0.67559552, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69721651, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6598, + "time_per_iteration": 2.4448981285095215 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02153933, + "balance_loss_mlp": 1.0411458, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.3323846151329235, + "language_loss": 0.78922117, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81074429, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6599, + "time_per_iteration": 2.4799394607543945 + }, + { + "auxiliary_loss_clip": 0.01117884, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.04196167, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.67964508136209, + "language_loss": 0.72716624, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74866593, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7578125, + "step": 6600, + "time_per_iteration": 2.4940543174743652 + }, + { + "auxiliary_loss_clip": 0.01115602, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01864827, + "balance_loss_mlp": 1.03997052, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 1.9442093512958227, + "language_loss": 0.85773253, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87920988, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6601, + "time_per_iteration": 2.4826369285583496 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02191091, + "balance_loss_mlp": 1.04298782, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.3202277168625054, + "language_loss": 0.70015699, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72178292, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6602, + "time_per_iteration": 2.4452199935913086 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02458847, + "balance_loss_mlp": 1.04225206, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.564497124514123, + "language_loss": 0.83408487, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85566461, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6603, + "time_per_iteration": 2.50046968460083 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.04076076, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.4733286794124776, + "language_loss": 0.72804213, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.74952281, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6604, + "time_per_iteration": 2.435645580291748 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.02366602, + "balance_loss_mlp": 1.0427258, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.694386771997249, + "language_loss": 0.82919562, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85070789, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 6605, + "time_per_iteration": 2.538792371749878 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.02053654, + "balance_loss_mlp": 1.04017544, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.515379376113219, + "language_loss": 0.73755872, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75901884, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 6606, + "time_per_iteration": 2.4766290187835693 + }, + { + "auxiliary_loss_clip": 0.0111968, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.0207423, + "balance_loss_mlp": 1.04279184, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.9669838489657716, + "language_loss": 0.73925817, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76079941, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6607, + "time_per_iteration": 2.550140380859375 + }, + { + "auxiliary_loss_clip": 0.01121372, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02045417, + "balance_loss_mlp": 1.04417753, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5241940789626238, + "language_loss": 0.67978024, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70133507, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6608, + "time_per_iteration": 2.70333194732666 + }, + { + "auxiliary_loss_clip": 0.01120221, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.0263803, + "balance_loss_mlp": 1.04247403, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 2.3779993769587486, + "language_loss": 0.74649572, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76812196, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6609, + "time_per_iteration": 2.4810678958892822 + }, + { + "auxiliary_loss_clip": 0.01119236, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.01909387, + "balance_loss_mlp": 1.04284418, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 4.182923272039756, + "language_loss": 0.71530509, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73682511, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6610, + "time_per_iteration": 2.483358860015869 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01881909, + "balance_loss_mlp": 1.03868747, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6591621928280806, + "language_loss": 0.7848928, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80632162, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 6611, + "time_per_iteration": 2.4707412719726562 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.0234127, + "balance_loss_mlp": 1.04496026, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7910222988347433, + "language_loss": 0.78681552, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.80838501, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6612, + "time_per_iteration": 2.552384614944458 + }, + { + "auxiliary_loss_clip": 0.01042423, + "auxiliary_loss_mlp": 0.01023175, + "balance_loss_clip": 1.02180374, + "balance_loss_mlp": 1.01794136, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8703127674216669, + "language_loss": 0.64956641, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6702224, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.24414062, + "step": 6613, + "time_per_iteration": 2.978494882583618 + }, + { + "auxiliary_loss_clip": 0.01116625, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01683927, + "balance_loss_mlp": 1.04148316, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.0550022834902797, + "language_loss": 0.71538055, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73685759, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6614, + "time_per_iteration": 2.4898061752319336 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01644266, + "balance_loss_mlp": 1.04124689, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8540793086422767, + "language_loss": 0.81317735, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83464336, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6615, + "time_per_iteration": 2.4708592891693115 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.02074313, + "balance_loss_mlp": 1.04221725, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.466828000769562, + "language_loss": 0.67015827, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69165838, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 6616, + "time_per_iteration": 2.4453790187835693 + }, + { + "auxiliary_loss_clip": 0.01120268, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02032995, + "balance_loss_mlp": 1.04185963, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.097035382924748, + "language_loss": 0.83857769, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86012185, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6617, + "time_per_iteration": 2.4740309715270996 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04305041, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6139116519566428, + "language_loss": 0.72253633, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74403095, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 6618, + "time_per_iteration": 2.451362371444702 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.04263783, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.9091502235972209, + "language_loss": 0.65847683, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6800065, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.74609375, + "step": 6619, + "time_per_iteration": 2.5479021072387695 + }, + { + "auxiliary_loss_clip": 0.01041684, + "auxiliary_loss_mlp": 0.00999907, + "balance_loss_clip": 0.99843466, + "balance_loss_mlp": 1.0170114, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7720250582246381, + "language_loss": 0.58222711, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60264301, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.24609375, + "step": 6620, + "time_per_iteration": 3.0502688884735107 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.02383971, + "balance_loss_mlp": 1.04254556, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5861085047038441, + "language_loss": 0.79551339, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81703556, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 6621, + "time_per_iteration": 2.4595162868499756 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01709294, + "balance_loss_mlp": 1.04198873, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.9955210259775171, + "language_loss": 0.78070045, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80215347, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 6622, + "time_per_iteration": 2.487805128097534 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.01685607, + "balance_loss_mlp": 1.04132223, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.5290489885204759, + "language_loss": 0.75010175, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77156758, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6623, + "time_per_iteration": 2.464571714401245 + }, + { + "auxiliary_loss_clip": 0.01116211, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.02278805, + "balance_loss_mlp": 1.04220378, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.7278538768787957, + "language_loss": 0.79535556, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81688213, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6624, + "time_per_iteration": 2.4550037384033203 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.02627707, + "balance_loss_mlp": 1.04234707, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.035642441182755, + "language_loss": 0.83558613, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85720372, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6625, + "time_per_iteration": 2.456171989440918 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.02839124, + "balance_loss_mlp": 1.04085207, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.051687002705142, + "language_loss": 0.86593187, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88750064, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6626, + "time_per_iteration": 2.4335460662841797 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.02238643, + "balance_loss_mlp": 1.04094946, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3900066005878386, + "language_loss": 0.83897698, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86049473, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7421875, + "step": 6627, + "time_per_iteration": 2.4269766807556152 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.03955984, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4398151096773946, + "language_loss": 0.82760668, + "learning_rate": 2.736806725217998e-06, + "loss": 0.8491019, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6628, + "time_per_iteration": 2.529315948486328 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.027421, + "balance_loss_mlp": 1.04130399, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.8256672588255014, + "language_loss": 0.70683473, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.72839677, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6629, + "time_per_iteration": 2.5025413036346436 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.02155161, + "balance_loss_mlp": 1.04309297, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 4.278612279497538, + "language_loss": 0.80683714, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82833099, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 6630, + "time_per_iteration": 2.4792280197143555 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01485634, + "balance_loss_mlp": 1.04143131, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.8749880656247468, + "language_loss": 0.75354141, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.7749849, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6631, + "time_per_iteration": 2.417546272277832 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.0218699, + "balance_loss_mlp": 1.04246461, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.3246230169523194, + "language_loss": 0.7156167, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73713982, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 6632, + "time_per_iteration": 2.446089744567871 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.02449358, + "balance_loss_mlp": 1.03939462, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.8450465759001686, + "language_loss": 0.74742806, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76891041, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6633, + "time_per_iteration": 2.431104898452759 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01455402, + "balance_loss_mlp": 1.03961205, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.781985159362602, + "language_loss": 0.808864, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83027852, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 6634, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.01856947, + "balance_loss_mlp": 1.04252565, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.7295196741572958, + "language_loss": 0.74605262, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.7675429, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6635, + "time_per_iteration": 2.4630682468414307 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.02930093, + "balance_loss_mlp": 1.04096711, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.9670463450002986, + "language_loss": 0.66429746, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68594521, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6636, + "time_per_iteration": 2.454789876937866 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02131867, + "balance_loss_mlp": 1.0403626, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.695302941119513, + "language_loss": 0.81410646, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83558261, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6637, + "time_per_iteration": 5.387745380401611 + }, + { + "auxiliary_loss_clip": 0.01040567, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99893045, + "balance_loss_mlp": 1.0159328, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7139106827959352, + "language_loss": 0.53211641, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55252659, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.01513672, + "router_z_loss_mlp": 0.24609375, + "step": 6638, + "time_per_iteration": 4.465191125869751 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.02260959, + "balance_loss_mlp": 1.04064405, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.9403504228046689, + "language_loss": 0.75377512, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77527201, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6639, + "time_per_iteration": 2.4947104454040527 + }, + { + "auxiliary_loss_clip": 0.01112086, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.01788926, + "balance_loss_mlp": 1.04078937, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.632879790681491, + "language_loss": 0.76217377, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78360093, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 6640, + "time_per_iteration": 2.524815320968628 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.03855717, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.5962495804033794, + "language_loss": 0.82264209, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84414506, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6641, + "time_per_iteration": 2.4753921031951904 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01891971, + "balance_loss_mlp": 1.04188418, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.015070946619467, + "language_loss": 0.7685014, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78999245, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6642, + "time_per_iteration": 2.431239604949951 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.019642, + "balance_loss_mlp": 1.03963089, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.2960488262105145, + "language_loss": 0.7247656, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74624097, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6643, + "time_per_iteration": 2.4759740829467773 + }, + { + "auxiliary_loss_clip": 0.01115242, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.0214113, + "balance_loss_mlp": 1.04014993, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.5171926718970592, + "language_loss": 0.65988386, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68139005, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6644, + "time_per_iteration": 2.437404155731201 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.0235281, + "balance_loss_mlp": 1.0386616, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.956427678643188, + "language_loss": 0.78470129, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80620331, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6645, + "time_per_iteration": 2.44826078414917 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02034187, + "balance_loss_mlp": 1.04042077, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.409098570486763, + "language_loss": 0.69889182, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72038329, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6646, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.03869605, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7478077072518943, + "language_loss": 0.72165501, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74314553, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6647, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.03874063, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4937426139380796, + "language_loss": 0.74371958, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76518434, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 6648, + "time_per_iteration": 2.4970345497131348 + }, + { + "auxiliary_loss_clip": 0.01115329, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.02300286, + "balance_loss_mlp": 1.04061389, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.209642859907432, + "language_loss": 0.66124469, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68276298, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6649, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.02280378, + "balance_loss_mlp": 1.0420115, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.57860522688022, + "language_loss": 0.75273359, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77425814, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6650, + "time_per_iteration": 2.5091254711151123 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.03905869, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.4583647344722164, + "language_loss": 0.71954048, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74104279, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 6651, + "time_per_iteration": 2.4820897579193115 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02220726, + "balance_loss_mlp": 1.03815126, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.787132664616244, + "language_loss": 0.72906494, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75055599, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6652, + "time_per_iteration": 2.4568119049072266 + }, + { + "auxiliary_loss_clip": 0.01039541, + "auxiliary_loss_mlp": 0.00999581, + "balance_loss_clip": 0.99819815, + "balance_loss_mlp": 1.01483345, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8299860195083637, + "language_loss": 0.61066198, + "learning_rate": 2.727746297241862e-06, + "loss": 0.63105321, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24707031, + "step": 6653, + "time_per_iteration": 3.0071723461151123 + }, + { + "auxiliary_loss_clip": 0.01113323, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.02607179, + "balance_loss_mlp": 1.04303741, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.127427836980077, + "language_loss": 0.67038172, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.6919046, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 6654, + "time_per_iteration": 2.442049026489258 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.02741051, + "balance_loss_mlp": 1.03887355, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.299433298478917, + "language_loss": 0.89737195, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91889656, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.74609375, + "step": 6655, + "time_per_iteration": 2.4836323261260986 + }, + { + "auxiliary_loss_clip": 0.01110377, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.01865685, + "balance_loss_mlp": 1.04077053, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5855954082229138, + "language_loss": 0.73497427, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75637716, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6953125, + "step": 6656, + "time_per_iteration": 2.5071847438812256 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 142978040, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.575800476999352e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/training_args.bin b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b9a73eb97a1ef37776f0d97a0590d802e6f8d5a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a0c59c7a64d6e018f6d41a91f3e718772a260e91597586a7ce64cd9f7d3d0c6 +size 7992 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/zero_to_fp32.py b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-6656/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/added_tokens.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/added_tokens.json new file mode 100644 index 0000000000000000000000000000000000000000..c9d3d3a1b74d87e381e471f7b33784015d2dc0ea --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/added_tokens.json @@ -0,0 +1,13 @@ +{ + "<|assistant|>": 32001, + "<|endoftext|>": 32000, + "<|end|>": 32007, + "<|placeholder1|>": 32002, + "<|placeholder2|>": 32003, + "<|placeholder3|>": 32004, + "<|placeholder4|>": 32005, + "<|placeholder5|>": 32008, + "<|placeholder6|>": 32009, + "<|system|>": 32006, + "<|user|>": 32010 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/config.json new file mode 100644 index 0000000000000000000000000000000000000000..97409ed874967d8d79c126c028d286e8fe8e1484 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": false, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/generation_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/latest b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/latest new file mode 100644 index 0000000000000000000000000000000000000000..6c2bd85bc7c6d33e172c9c565d8517bade9572ca --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/latest @@ -0,0 +1 @@ +global_step9984 \ No newline at end of file diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00001-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..99ce51aa7db798f439e525a22f1cdfbb42a02af5 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db3e29d7513ff775ec5998733ec39857f619b98ab52122c7b9a5556b522b9477 +size 4972489328 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00002-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..57b2813e17b0f24ba50ef3bd6a0723b8c33f8c8a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61881ebd377fe77ccf6a6225d31e198c80dc4c27f102a6a394870304f2889f00 +size 4985529648 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00003-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0fe14f79a75bf5be4ad688643a052e9a7fd62b91 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d4a00b538b4788b22a692a59274bd15c4031f33752c79a882f38e8957076c48 +size 248943552 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model.safetensors.index.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..aa54419fc0a3eab502aa7c4ad974dca52ed10803 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10206819456 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_0.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..c1e6773e944015af0e83161fa2d20fe7d469fd7f --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22271cc36f268c0b3e870b3930ac590fd40a4a3cd3a88aed74f78e5f8790aceb +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_1.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..a834a7be015ebd36883cec3bb92a8657936cd0a6 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19435e9be5d4b837d96fc2e9286e23e27344bb6ad3222ef1b9d207e6b2bb8c78 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_2.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..1f1b991258d274ff5481ace768d5b6702d919d50 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2902ec572b1b2f1a6a78f8979353bf31953eacdc78b129cc34a9f04c1de9b8d5 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_3.pth b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..ee742fbd21912a77c2d25fe5ca60af4403668637 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a011e80ba323d1fcabf31eaea4d2bc397efadb23603b4248f0067ff8ca3987 +size 15024 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/special_tokens_map.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/tokenizer.model b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/tokenizer_config.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/trainer_state.json b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..eebd1fc18ada455f8f724258c6ec03ce7e1cdc05 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/trainer_state.json @@ -0,0 +1,169761 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.6002705546370058, + "eval_steps": 500, + "global_step": 9984, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05015663, + "auxiliary_loss_mlp": 0.02215404, + "balance_loss_clip": 1.76946592, + "balance_loss_mlp": 2.42247009, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.31846269900138, + "language_loss": 2.84849024, + "learning_rate": 0.0, + "loss": 1.94356799, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 26.0, + "step": 1, + "time_per_iteration": 14.062297821044922 + }, + { + "auxiliary_loss_clip": 0.03371575, + "auxiliary_loss_mlp": 0.01459085, + "balance_loss_clip": 1.18919563, + "balance_loss_mlp": 1.61943495, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.71678092445231, + "language_loss": 1.82690942, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87521601, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 17.5, + "step": 2, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.03311525, + "auxiliary_loss_mlp": 0.014397, + "balance_loss_clip": 1.18697679, + "balance_loss_mlp": 1.61685562, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 34.59102075188436, + "language_loss": 1.57529902, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62281132, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 17.0, + "step": 3, + "time_per_iteration": 2.4145541191101074 + }, + { + "auxiliary_loss_clip": 0.03353861, + "auxiliary_loss_mlp": 0.01449549, + "balance_loss_clip": 1.15390992, + "balance_loss_mlp": 1.61571431, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.728740512395206, + "language_loss": 1.67595887, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72399294, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.375, + "step": 4, + "time_per_iteration": 2.466392993927002 + }, + { + "auxiliary_loss_clip": 0.03393634, + "auxiliary_loss_mlp": 0.01505687, + "balance_loss_clip": 1.21710527, + "balance_loss_mlp": 1.61638641, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.74196654651921, + "language_loss": 1.90851176, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95750499, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 17.75, + "step": 5, + "time_per_iteration": 2.6828246116638184 + }, + { + "auxiliary_loss_clip": 0.03361898, + "auxiliary_loss_mlp": 0.01518906, + "balance_loss_clip": 1.22441149, + "balance_loss_mlp": 1.60614848, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.32400799743486, + "language_loss": 1.6094954, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6583035, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.5, + "step": 6, + "time_per_iteration": 2.660855770111084 + }, + { + "auxiliary_loss_clip": 0.03345758, + "auxiliary_loss_mlp": 0.01485904, + "balance_loss_clip": 1.20209074, + "balance_loss_mlp": 1.60783124, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.76365346454933, + "language_loss": 1.53346825, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58178496, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.375, + "step": 7, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.03312894, + "auxiliary_loss_mlp": 0.01444018, + "balance_loss_clip": 1.16630852, + "balance_loss_mlp": 1.60320723, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.923588970831496, + "language_loss": 1.43687642, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48444545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 17.0, + "step": 8, + "time_per_iteration": 2.779961109161377 + }, + { + "auxiliary_loss_clip": 0.03360351, + "auxiliary_loss_mlp": 0.01496215, + "balance_loss_clip": 1.21144783, + "balance_loss_mlp": 1.60258842, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.084887526361417, + "language_loss": 1.49955618, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54812181, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.5, + "step": 9, + "time_per_iteration": 2.799635887145996 + }, + { + "auxiliary_loss_clip": 0.03302188, + "auxiliary_loss_mlp": 0.01477479, + "balance_loss_clip": 1.20797062, + "balance_loss_mlp": 1.6070832, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.45187310710616, + "language_loss": 1.44727731, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49507403, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 17.0, + "step": 10, + "time_per_iteration": 2.6989152431488037 + }, + { + "auxiliary_loss_clip": 0.03356835, + "auxiliary_loss_mlp": 0.01493566, + "balance_loss_clip": 1.21928966, + "balance_loss_mlp": 1.61121845, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.63867113279811, + "language_loss": 1.45021069, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.4987148, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 17.5, + "step": 11, + "time_per_iteration": 2.6820693016052246 + }, + { + "auxiliary_loss_clip": 0.0328584, + "auxiliary_loss_mlp": 0.01449969, + "balance_loss_clip": 1.17378449, + "balance_loss_mlp": 1.59900761, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.861449854609447, + "language_loss": 1.45122719, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49858522, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 16.875, + "step": 12, + "time_per_iteration": 2.631218910217285 + }, + { + "auxiliary_loss_clip": 0.03313605, + "auxiliary_loss_mlp": 0.01404342, + "balance_loss_clip": 1.14589679, + "balance_loss_mlp": 1.60898232, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.176593153687291, + "language_loss": 1.24100113, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28818083, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 17.125, + "step": 13, + "time_per_iteration": 2.6961779594421387 + }, + { + "auxiliary_loss_clip": 0.03282163, + "auxiliary_loss_mlp": 0.01472629, + "balance_loss_clip": 1.20464635, + "balance_loss_mlp": 1.60534358, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.7580183597057975, + "language_loss": 1.20611417, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25366211, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 16.75, + "step": 14, + "time_per_iteration": 2.6555092334747314 + }, + { + "auxiliary_loss_clip": 0.0326835, + "auxiliary_loss_mlp": 0.01431945, + "balance_loss_clip": 1.16815877, + "balance_loss_mlp": 1.6104542, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.4839782289009085, + "language_loss": 1.12832427, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.1753273, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 16.5, + "step": 15, + "time_per_iteration": 2.717512607574463 + }, + { + "auxiliary_loss_clip": 0.03231722, + "auxiliary_loss_mlp": 0.01412441, + "balance_loss_clip": 1.16257811, + "balance_loss_mlp": 1.59521294, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.584872954405151, + "language_loss": 1.1119349, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15837646, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 16.375, + "step": 16, + "time_per_iteration": 2.7170701026916504 + }, + { + "auxiliary_loss_clip": 0.03220058, + "auxiliary_loss_mlp": 0.0141779, + "balance_loss_clip": 1.17784595, + "balance_loss_mlp": 1.60289145, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.285773165398426, + "language_loss": 1.1253047, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17168307, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 16.125, + "step": 17, + "time_per_iteration": 2.6125564575195312 + }, + { + "auxiliary_loss_clip": 0.0315575, + "auxiliary_loss_mlp": 0.01378857, + "balance_loss_clip": 1.14730477, + "balance_loss_mlp": 1.60051179, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.8094646515897193, + "language_loss": 1.08149433, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12684035, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 15.5625, + "step": 18, + "time_per_iteration": 5.593315362930298 + }, + { + "auxiliary_loss_clip": 0.03181327, + "auxiliary_loss_mlp": 0.01400224, + "balance_loss_clip": 1.13548398, + "balance_loss_mlp": 1.59901524, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.551402579460018, + "language_loss": 1.02296436, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06877995, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 15.8125, + "step": 19, + "time_per_iteration": 2.6462903022766113 + }, + { + "auxiliary_loss_clip": 0.0312444, + "auxiliary_loss_mlp": 0.01341166, + "balance_loss_clip": 1.12096262, + "balance_loss_mlp": 1.60122275, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.049985155187145, + "language_loss": 1.16660511, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21126115, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 15.25, + "step": 20, + "time_per_iteration": 2.687962293624878 + }, + { + "auxiliary_loss_clip": 0.03111088, + "auxiliary_loss_mlp": 0.01380381, + "balance_loss_clip": 1.13109064, + "balance_loss_mlp": 1.58184814, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 8.855966691950416, + "language_loss": 1.06044388, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.1053586, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 15.3125, + "step": 21, + "time_per_iteration": 2.705784320831299 + }, + { + "auxiliary_loss_clip": 0.03006166, + "auxiliary_loss_mlp": 0.0138104, + "balance_loss_clip": 1.14758062, + "balance_loss_mlp": 1.56386232, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.503731577984969, + "language_loss": 1.05752254, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10139465, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 14.4375, + "step": 22, + "time_per_iteration": 2.714902400970459 + }, + { + "auxiliary_loss_clip": 0.02958535, + "auxiliary_loss_mlp": 0.01337723, + "balance_loss_clip": 1.12743819, + "balance_loss_mlp": 1.56545472, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8887485842740657, + "language_loss": 0.91820848, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96117103, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 13.9375, + "step": 23, + "time_per_iteration": 2.6802501678466797 + }, + { + "auxiliary_loss_clip": 0.02925568, + "auxiliary_loss_mlp": 0.0136327, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.55789983, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.888412626700388, + "language_loss": 1.08090949, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12379789, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 13.6875, + "step": 24, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.02818042, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.11892343, + "balance_loss_mlp": 1.55278993, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.5526652768314877, + "language_loss": 1.01197755, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05345201, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 12.6875, + "step": 25, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.02811065, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 1.10196424, + "balance_loss_mlp": 1.55557573, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8866965715457127, + "language_loss": 1.0650332, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10625291, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 12.5625, + "step": 26, + "time_per_iteration": 2.6561954021453857 + }, + { + "auxiliary_loss_clip": 0.02754337, + "auxiliary_loss_mlp": 0.01325989, + "balance_loss_clip": 1.12600398, + "balance_loss_mlp": 1.54593086, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 8.480879524297928, + "language_loss": 0.95465469, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99545801, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 12.0625, + "step": 27, + "time_per_iteration": 2.717332363128662 + }, + { + "auxiliary_loss_clip": 0.02732017, + "auxiliary_loss_mlp": 0.0131313, + "balance_loss_clip": 1.13174081, + "balance_loss_mlp": 1.55085063, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7582152185230338, + "language_loss": 1.06276608, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1032176, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 11.8125, + "step": 28, + "time_per_iteration": 2.6645846366882324 + }, + { + "auxiliary_loss_clip": 0.02698877, + "auxiliary_loss_mlp": 0.01319704, + "balance_loss_clip": 1.1339283, + "balance_loss_mlp": 1.5357703, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.703793609192777, + "language_loss": 1.02653611, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06672192, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 11.625, + "step": 29, + "time_per_iteration": 2.6647088527679443 + }, + { + "auxiliary_loss_clip": 0.02692806, + "auxiliary_loss_mlp": 0.01313595, + "balance_loss_clip": 1.12667465, + "balance_loss_mlp": 1.53252506, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.824163422844594, + "language_loss": 1.1929419, + "learning_rate": 2.189868360711334e-06, + "loss": 1.233006, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 11.625, + "step": 30, + "time_per_iteration": 2.6305816173553467 + }, + { + "auxiliary_loss_clip": 0.02610821, + "auxiliary_loss_mlp": 0.01338782, + "balance_loss_clip": 1.15748882, + "balance_loss_mlp": 1.51829374, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.55861683808779, + "language_loss": 1.02499342, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06448936, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 10.9375, + "step": 31, + "time_per_iteration": 2.71045184135437 + }, + { + "auxiliary_loss_clip": 0.02583705, + "auxiliary_loss_mlp": 0.01332414, + "balance_loss_clip": 1.15245557, + "balance_loss_mlp": 1.52035046, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.526137445187824, + "language_loss": 0.95697796, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99613917, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 10.625, + "step": 32, + "time_per_iteration": 2.626783847808838 + }, + { + "auxiliary_loss_clip": 0.02566919, + "auxiliary_loss_mlp": 0.01304168, + "balance_loss_clip": 1.13670313, + "balance_loss_mlp": 1.51655078, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.344933729659458, + "language_loss": 0.95465255, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99336338, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 10.5, + "step": 33, + "time_per_iteration": 2.645725727081299 + }, + { + "auxiliary_loss_clip": 0.02433039, + "auxiliary_loss_mlp": 0.013041, + "balance_loss_clip": 1.14569449, + "balance_loss_mlp": 1.48877192, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 4.808068329548225, + "language_loss": 0.91556877, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95294011, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 9.4375, + "step": 34, + "time_per_iteration": 2.7327146530151367 + }, + { + "auxiliary_loss_clip": 0.02385913, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.1172576, + "balance_loss_mlp": 1.45172572, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.948252640490764, + "language_loss": 0.76639408, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80298984, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 9.375, + "step": 35, + "time_per_iteration": 2.940739870071411 + }, + { + "auxiliary_loss_clip": 0.02360979, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 1.12769413, + "balance_loss_mlp": 1.46427846, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.1659182072135064, + "language_loss": 0.89043307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92678845, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 8.9375, + "step": 36, + "time_per_iteration": 2.693335771560669 + }, + { + "auxiliary_loss_clip": 0.02305413, + "auxiliary_loss_mlp": 0.01335093, + "balance_loss_clip": 1.18574798, + "balance_loss_mlp": 1.45221901, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.3248653771669416, + "language_loss": 0.93231332, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96871841, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 8.5, + "step": 37, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.02264412, + "auxiliary_loss_mlp": 0.01277806, + "balance_loss_clip": 1.15373349, + "balance_loss_mlp": 1.44697845, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1191864106647906, + "language_loss": 1.04275775, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07817996, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 8.1875, + "step": 38, + "time_per_iteration": 2.674187183380127 + }, + { + "auxiliary_loss_clip": 0.02234117, + "auxiliary_loss_mlp": 0.01257339, + "balance_loss_clip": 1.13164425, + "balance_loss_mlp": 1.44101977, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.2707505194681685, + "language_loss": 0.85635245, + "learning_rate": 2.358792165262154e-06, + "loss": 0.891267, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 7.9375, + "step": 39, + "time_per_iteration": 2.716417074203491 + }, + { + "auxiliary_loss_clip": 0.02209554, + "auxiliary_loss_mlp": 0.01248677, + "balance_loss_clip": 1.1173557, + "balance_loss_mlp": 1.43176007, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.874633531970748, + "language_loss": 0.90416026, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93874258, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 7.78125, + "step": 40, + "time_per_iteration": 2.621108055114746 + }, + { + "auxiliary_loss_clip": 0.02158681, + "auxiliary_loss_mlp": 0.01271709, + "balance_loss_clip": 1.15626693, + "balance_loss_mlp": 1.42207694, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.842521317695652, + "language_loss": 0.93497038, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96927428, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 7.375, + "step": 41, + "time_per_iteration": 2.66089129447937 + }, + { + "auxiliary_loss_clip": 0.0212207, + "auxiliary_loss_mlp": 0.0125263, + "balance_loss_clip": 1.14720106, + "balance_loss_mlp": 1.41368401, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.5963223670672635, + "language_loss": 0.97454929, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00829637, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 7.09375, + "step": 42, + "time_per_iteration": 2.63149094581604 + }, + { + "auxiliary_loss_clip": 0.02082851, + "auxiliary_loss_mlp": 0.01298258, + "balance_loss_clip": 1.18939614, + "balance_loss_mlp": 1.41430426, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.9545418034556814, + "language_loss": 0.97656071, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01037169, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 6.6875, + "step": 43, + "time_per_iteration": 2.7244436740875244 + }, + { + "auxiliary_loss_clip": 0.02102024, + "auxiliary_loss_mlp": 0.01311792, + "balance_loss_clip": 1.19706488, + "balance_loss_mlp": 1.4130851, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0531245010632473, + "language_loss": 0.93701768, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97115582, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 6.875, + "step": 44, + "time_per_iteration": 2.6628317832946777 + }, + { + "auxiliary_loss_clip": 0.02065563, + "auxiliary_loss_mlp": 0.01272457, + "balance_loss_clip": 1.17236853, + "balance_loss_mlp": 1.41084957, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 9.3374631511207, + "language_loss": 0.98937047, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02275062, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 6.5625, + "step": 45, + "time_per_iteration": 2.7355775833129883 + }, + { + "auxiliary_loss_clip": 0.02040064, + "auxiliary_loss_mlp": 0.01227769, + "balance_loss_clip": 1.13831401, + "balance_loss_mlp": 1.39673805, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8055823424878037, + "language_loss": 1.02792716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06060553, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 6.4375, + "step": 46, + "time_per_iteration": 2.7488839626312256 + }, + { + "auxiliary_loss_clip": 0.02002379, + "auxiliary_loss_mlp": 0.01270193, + "balance_loss_clip": 1.17773402, + "balance_loss_mlp": 1.38648152, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.971366079361506, + "language_loss": 0.88043427, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91315997, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 6.15625, + "step": 47, + "time_per_iteration": 2.845005512237549 + }, + { + "auxiliary_loss_clip": 0.01963914, + "auxiliary_loss_mlp": 0.01252908, + "balance_loss_clip": 1.16493094, + "balance_loss_mlp": 1.37624073, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.070099145794898, + "language_loss": 0.87949276, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91166103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 5.875, + "step": 48, + "time_per_iteration": 2.7514398097991943 + }, + { + "auxiliary_loss_clip": 0.01962956, + "auxiliary_loss_mlp": 0.01244481, + "balance_loss_clip": 1.15078259, + "balance_loss_mlp": 1.36602139, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.366138839739612, + "language_loss": 0.89877701, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93085134, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 6.0, + "step": 49, + "time_per_iteration": 2.743236541748047 + }, + { + "auxiliary_loss_clip": 0.01955947, + "auxiliary_loss_mlp": 0.01232227, + "balance_loss_clip": 1.14534748, + "balance_loss_mlp": 1.36045313, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.8158483763506914, + "language_loss": 0.91078663, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94266832, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 5.9375, + "step": 50, + "time_per_iteration": 2.6860456466674805 + }, + { + "auxiliary_loss_clip": 0.01953364, + "auxiliary_loss_mlp": 0.01201227, + "balance_loss_clip": 1.11778045, + "balance_loss_mlp": 1.36547732, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 3.5299735782100026, + "language_loss": 0.87144494, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90299082, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 5.875, + "step": 51, + "time_per_iteration": 2.7481534481048584 + }, + { + "auxiliary_loss_clip": 0.01909154, + "auxiliary_loss_mlp": 0.01207037, + "balance_loss_clip": 1.12707186, + "balance_loss_mlp": 1.35597348, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0262044932375836, + "language_loss": 0.95253396, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98369586, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 5.53125, + "step": 52, + "time_per_iteration": 2.8958797454833984 + }, + { + "auxiliary_loss_clip": 0.01904814, + "auxiliary_loss_mlp": 0.01243661, + "balance_loss_clip": 1.16274214, + "balance_loss_mlp": 1.35173535, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.3193539013945546, + "language_loss": 0.92261833, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95410311, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 5.53125, + "step": 53, + "time_per_iteration": 2.7579286098480225 + }, + { + "auxiliary_loss_clip": 0.01893968, + "auxiliary_loss_mlp": 0.01196907, + "balance_loss_clip": 1.11489081, + "balance_loss_mlp": 1.35535884, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.2021865200163, + "language_loss": 0.82945669, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86036545, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 5.375, + "step": 54, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.01889572, + "auxiliary_loss_mlp": 0.01211293, + "balance_loss_clip": 1.13113666, + "balance_loss_mlp": 1.34359026, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4060188817442487, + "language_loss": 0.81305432, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84406298, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.4375, + "step": 55, + "time_per_iteration": 2.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01879346, + "auxiliary_loss_mlp": 0.01199903, + "balance_loss_clip": 1.11926973, + "balance_loss_mlp": 1.33773279, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.497299649397407, + "language_loss": 0.87261844, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90341091, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.40625, + "step": 56, + "time_per_iteration": 2.7031195163726807 + }, + { + "auxiliary_loss_clip": 0.01879922, + "auxiliary_loss_mlp": 0.01161266, + "balance_loss_clip": 1.0864507, + "balance_loss_mlp": 1.33024335, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.4089458733946882, + "language_loss": 0.92949611, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95990801, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 5.5, + "step": 57, + "time_per_iteration": 2.8580281734466553 + }, + { + "auxiliary_loss_clip": 0.01843074, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_clip": 1.14395308, + "balance_loss_mlp": 1.33453596, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.105168727735643, + "language_loss": 0.99725533, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02785611, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 5.09375, + "step": 58, + "time_per_iteration": 2.687504529953003 + }, + { + "auxiliary_loss_clip": 0.01822907, + "auxiliary_loss_mlp": 0.01195384, + "balance_loss_clip": 1.12319088, + "balance_loss_mlp": 1.32094967, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1328304194940855, + "language_loss": 0.8821373, + "learning_rate": 2.625331386578098e-06, + "loss": 0.9123202, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 5.03125, + "step": 59, + "time_per_iteration": 6.997380495071411 + }, + { + "auxiliary_loss_clip": 0.01844896, + "auxiliary_loss_mlp": 0.01162144, + "balance_loss_clip": 1.08885431, + "balance_loss_mlp": 1.32932925, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.097582115586327, + "language_loss": 0.93430054, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96437097, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 5.15625, + "step": 60, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0182307, + "auxiliary_loss_mlp": 0.01172385, + "balance_loss_clip": 1.10376787, + "balance_loss_mlp": 1.31307459, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 4.241258673484683, + "language_loss": 0.90090871, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93086326, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 5.09375, + "step": 61, + "time_per_iteration": 2.707247257232666 + }, + { + "auxiliary_loss_clip": 0.01806801, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_clip": 1.07475519, + "balance_loss_mlp": 1.31002319, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 3.0487456468745586, + "language_loss": 0.88434047, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9138341, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.96875, + "step": 62, + "time_per_iteration": 2.736107587814331 + }, + { + "auxiliary_loss_clip": 0.01787131, + "auxiliary_loss_mlp": 0.01161947, + "balance_loss_clip": 1.09132755, + "balance_loss_mlp": 1.30018497, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.6509198595432406, + "language_loss": 0.96265876, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99214947, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.875, + "step": 63, + "time_per_iteration": 2.6760194301605225 + }, + { + "auxiliary_loss_clip": 0.01795174, + "auxiliary_loss_mlp": 0.01169703, + "balance_loss_clip": 1.10284996, + "balance_loss_mlp": 1.30725491, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.677484479433752, + "language_loss": 0.99141657, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02106524, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.875, + "step": 64, + "time_per_iteration": 2.675295114517212 + }, + { + "auxiliary_loss_clip": 0.01802087, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.08134842, + "balance_loss_mlp": 1.30652797, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.45939593962701, + "language_loss": 0.85358196, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88309723, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.9375, + "step": 65, + "time_per_iteration": 2.647696018218994 + }, + { + "auxiliary_loss_clip": 0.01779034, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0886445, + "balance_loss_mlp": 1.29322505, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.8561979494145033, + "language_loss": 0.85224223, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88160038, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.875, + "step": 66, + "time_per_iteration": 2.617143392562866 + }, + { + "auxiliary_loss_clip": 0.01782156, + "auxiliary_loss_mlp": 0.01152634, + "balance_loss_clip": 1.07648349, + "balance_loss_mlp": 1.29168975, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.246759082278279, + "language_loss": 0.96454394, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99389184, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 4.90625, + "step": 67, + "time_per_iteration": 2.6343421936035156 + }, + { + "auxiliary_loss_clip": 0.01753238, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.08340704, + "balance_loss_mlp": 1.28524387, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.549207131743101, + "language_loss": 0.94534445, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97443378, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 4.6875, + "step": 68, + "time_per_iteration": 2.614696741104126 + }, + { + "auxiliary_loss_clip": 0.01748377, + "auxiliary_loss_mlp": 0.01156697, + "balance_loss_clip": 1.08717394, + "balance_loss_mlp": 1.28268003, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 1.9922029239060344, + "language_loss": 0.95657748, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98562825, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.65625, + "step": 69, + "time_per_iteration": 2.6637492179870605 + }, + { + "auxiliary_loss_clip": 0.01742428, + "auxiliary_loss_mlp": 0.01160645, + "balance_loss_clip": 1.09598637, + "balance_loss_mlp": 1.2855866, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4176731159017075, + "language_loss": 0.98073572, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00976658, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 4.5625, + "step": 70, + "time_per_iteration": 2.6395556926727295 + }, + { + "auxiliary_loss_clip": 0.01748999, + "auxiliary_loss_mlp": 0.01146397, + "balance_loss_clip": 1.07673144, + "balance_loss_mlp": 1.2760632, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.71386904393857, + "language_loss": 0.93927777, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96823174, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 4.75, + "step": 71, + "time_per_iteration": 2.628272294998169 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01327632, + "balance_loss_clip": 1.28967619, + "balance_loss_mlp": 1.43997037, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4194543250518663, + "language_loss": 0.65655279, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68821681, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 4.0, + "step": 72, + "time_per_iteration": 3.104635000228882 + }, + { + "auxiliary_loss_clip": 0.01820285, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.25824571, + "balance_loss_mlp": 1.43420911, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2482458517722455, + "language_loss": 0.63711512, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66827047, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 3.859375, + "step": 73, + "time_per_iteration": 3.208836793899536 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.07382631, + "balance_loss_mlp": 1.26790953, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.4515337577309424, + "language_loss": 0.85899854, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88765126, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.5625, + "step": 74, + "time_per_iteration": 2.6287550926208496 + }, + { + "auxiliary_loss_clip": 0.01725734, + "auxiliary_loss_mlp": 0.01165418, + "balance_loss_clip": 1.09584761, + "balance_loss_mlp": 1.26750898, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.110493434952054, + "language_loss": 0.9716984, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00060987, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.5625, + "step": 75, + "time_per_iteration": 2.635618209838867 + }, + { + "auxiliary_loss_clip": 0.01704277, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.07875705, + "balance_loss_mlp": 1.26302838, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.60583579179481, + "language_loss": 0.87675405, + "learning_rate": 2.788352117317012e-06, + "loss": 0.9052462, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.4375, + "step": 76, + "time_per_iteration": 2.6379826068878174 + }, + { + "auxiliary_loss_clip": 0.01705571, + "auxiliary_loss_mlp": 0.0114831, + "balance_loss_clip": 1.07845366, + "balance_loss_mlp": 1.26138341, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.9080158042054207, + "language_loss": 0.91751724, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94605613, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.4375, + "step": 77, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01694222, + "auxiliary_loss_mlp": 0.01165235, + "balance_loss_clip": 1.09494948, + "balance_loss_mlp": 1.26167083, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1229280552318803, + "language_loss": 0.92189825, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95049286, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.3125, + "step": 78, + "time_per_iteration": 2.598590850830078 + }, + { + "auxiliary_loss_clip": 0.01690635, + "auxiliary_loss_mlp": 0.01155594, + "balance_loss_clip": 1.08735824, + "balance_loss_mlp": 1.25696921, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.280813483182965, + "language_loss": 0.82480371, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85326606, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 4.34375, + "step": 79, + "time_per_iteration": 2.6215708255767822 + }, + { + "auxiliary_loss_clip": 0.01705122, + "auxiliary_loss_mlp": 0.01133248, + "balance_loss_clip": 1.06315339, + "balance_loss_mlp": 1.26029253, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.4809717100134616, + "language_loss": 0.91311121, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94149494, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.4375, + "step": 80, + "time_per_iteration": 2.639841079711914 + }, + { + "auxiliary_loss_clip": 0.01674552, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.07254159, + "balance_loss_mlp": 1.25350285, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.165091554789383, + "language_loss": 0.94981706, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97799134, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.21875, + "step": 81, + "time_per_iteration": 2.6689717769622803 + }, + { + "auxiliary_loss_clip": 0.01688803, + "auxiliary_loss_mlp": 0.01148831, + "balance_loss_clip": 1.08269382, + "balance_loss_mlp": 1.25745821, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9914678747629226, + "language_loss": 0.96341741, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99179375, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 4.3125, + "step": 82, + "time_per_iteration": 2.629596471786499 + }, + { + "auxiliary_loss_clip": 0.01671229, + "auxiliary_loss_mlp": 0.01159801, + "balance_loss_clip": 1.09013557, + "balance_loss_mlp": 1.24528587, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.533591741594043, + "language_loss": 0.8664127, + "learning_rate": 2.84508017388607e-06, + "loss": 0.894723, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.25, + "step": 83, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01664198, + "auxiliary_loss_mlp": 0.01156919, + "balance_loss_clip": 1.08663368, + "balance_loss_mlp": 1.24647975, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.373799694341511, + "language_loss": 0.91779828, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94600952, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.1875, + "step": 84, + "time_per_iteration": 2.62187123298645 + }, + { + "auxiliary_loss_clip": 0.01645783, + "auxiliary_loss_mlp": 0.01205663, + "balance_loss_clip": 1.17075825, + "balance_loss_mlp": 1.34984684, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4266053341540552, + "language_loss": 0.62504542, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65355992, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.96875, + "step": 85, + "time_per_iteration": 3.190223217010498 + }, + { + "auxiliary_loss_clip": 0.0165122, + "auxiliary_loss_mlp": 0.01127154, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.23674285, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7428139018461835, + "language_loss": 0.90836501, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93614876, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.15625, + "step": 86, + "time_per_iteration": 2.66162109375 + }, + { + "auxiliary_loss_clip": 0.01655877, + "auxiliary_loss_mlp": 0.01161945, + "balance_loss_clip": 1.09065783, + "balance_loss_mlp": 1.24282312, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.38275425723773, + "language_loss": 0.8209877, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.125, + "step": 87, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.01644726, + "auxiliary_loss_mlp": 0.01154792, + "balance_loss_clip": 1.08617568, + "balance_loss_mlp": 1.24127626, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8918921085406437, + "language_loss": 0.95630223, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98429739, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 4.03125, + "step": 88, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01661198, + "auxiliary_loss_mlp": 0.0114963, + "balance_loss_clip": 1.08230066, + "balance_loss_mlp": 1.24101663, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1.9438908009999392, + "language_loss": 0.85920149, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88730979, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.1875, + "step": 89, + "time_per_iteration": 2.6486849784851074 + }, + { + "auxiliary_loss_clip": 0.01648909, + "auxiliary_loss_mlp": 0.01132231, + "balance_loss_clip": 1.06547391, + "balance_loss_mlp": 1.23491406, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 4.519706664825811, + "language_loss": 0.91517568, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94298708, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 4.125, + "step": 90, + "time_per_iteration": 2.658997058868408 + }, + { + "auxiliary_loss_clip": 0.01630542, + "auxiliary_loss_mlp": 0.0113282, + "balance_loss_clip": 1.06496572, + "balance_loss_mlp": 1.23102689, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.2090932400382486, + "language_loss": 0.8587057, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88633931, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 3.984375, + "step": 91, + "time_per_iteration": 2.619231939315796 + }, + { + "auxiliary_loss_clip": 0.01629785, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_clip": 1.07458866, + "balance_loss_mlp": 1.22673059, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.769705373909222, + "language_loss": 0.86930025, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89700729, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.03125, + "step": 92, + "time_per_iteration": 2.646968126296997 + }, + { + "auxiliary_loss_clip": 0.01621216, + "auxiliary_loss_mlp": 0.01179948, + "balance_loss_clip": 1.1122849, + "balance_loss_mlp": 1.21872091, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.5030178409929, + "language_loss": 0.92042911, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94844079, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 4.03125, + "step": 93, + "time_per_iteration": 2.59853196144104 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01152159, + "balance_loss_clip": 1.08120561, + "balance_loss_mlp": 1.22512126, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.2071592078672198, + "language_loss": 0.87372428, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90158784, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.09375, + "step": 94, + "time_per_iteration": 2.587707281112671 + }, + { + "auxiliary_loss_clip": 0.01562532, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.03243279, + "balance_loss_mlp": 1.30452466, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3851210442303683, + "language_loss": 0.6813519, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70765626, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.578125, + "step": 95, + "time_per_iteration": 3.067047595977783 + }, + { + "auxiliary_loss_clip": 0.01611383, + "auxiliary_loss_mlp": 0.01154317, + "balance_loss_clip": 1.08693981, + "balance_loss_mlp": 1.21303511, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.5109536438971976, + "language_loss": 0.89978027, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92743719, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 3.984375, + "step": 96, + "time_per_iteration": 2.590522289276123 + }, + { + "auxiliary_loss_clip": 0.01603776, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.08108413, + "balance_loss_mlp": 1.21597803, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.825781473558237, + "language_loss": 0.89798892, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92545933, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.875, + "step": 97, + "time_per_iteration": 2.630364179611206 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_clip": 1.07103181, + "balance_loss_mlp": 1.20754981, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.1954130163748573, + "language_loss": 0.76553786, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79283404, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.8125, + "step": 98, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01531856, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.01250362, + "balance_loss_mlp": 1.28449416, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0389188302362988, + "language_loss": 0.65464473, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68043554, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.46875, + "step": 99, + "time_per_iteration": 3.196779251098633 + }, + { + "auxiliary_loss_clip": 0.0159215, + "auxiliary_loss_mlp": 0.01143603, + "balance_loss_clip": 1.07312632, + "balance_loss_mlp": 1.20754516, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02393591458392, + "language_loss": 0.90861535, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93597281, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 3.84375, + "step": 100, + "time_per_iteration": 2.659716844558716 + }, + { + "auxiliary_loss_clip": 0.01602583, + "auxiliary_loss_mlp": 0.01150362, + "balance_loss_clip": 1.08360529, + "balance_loss_mlp": 1.21008992, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 9.149928686451464, + "language_loss": 0.91165614, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93918556, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 3.921875, + "step": 101, + "time_per_iteration": 5.522722959518433 + }, + { + "auxiliary_loss_clip": 0.01592164, + "auxiliary_loss_mlp": 0.01153598, + "balance_loss_clip": 1.08273995, + "balance_loss_mlp": 1.21078956, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.149611483260168, + "language_loss": 0.90634245, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.9338001, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.8125, + "step": 102, + "time_per_iteration": 2.7264201641082764 + }, + { + "auxiliary_loss_clip": 0.01586171, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.06986046, + "balance_loss_mlp": 1.20794034, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.4455555336324135, + "language_loss": 0.87990314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.9071129, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 3.78125, + "step": 103, + "time_per_iteration": 2.6332345008850098 + }, + { + "auxiliary_loss_clip": 0.01586169, + "auxiliary_loss_mlp": 0.01136721, + "balance_loss_clip": 1.07015502, + "balance_loss_mlp": 1.2100153, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9653003456434248, + "language_loss": 0.93796182, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96519077, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.765625, + "step": 104, + "time_per_iteration": 2.5763180255889893 + }, + { + "auxiliary_loss_clip": 0.01576682, + "auxiliary_loss_mlp": 0.01148107, + "balance_loss_clip": 1.08382916, + "balance_loss_mlp": 1.20004964, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.978383813748495, + "language_loss": 0.96302718, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99027503, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.765625, + "step": 105, + "time_per_iteration": 2.598074197769165 + }, + { + "auxiliary_loss_clip": 0.01576054, + "auxiliary_loss_mlp": 0.01157995, + "balance_loss_clip": 1.08618331, + "balance_loss_mlp": 1.20040035, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.254409296180574, + "language_loss": 0.86981636, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89715683, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 3.75, + "step": 106, + "time_per_iteration": 2.620400905609131 + }, + { + "auxiliary_loss_clip": 0.01558878, + "auxiliary_loss_mlp": 0.01142953, + "balance_loss_clip": 1.07462192, + "balance_loss_mlp": 1.18650925, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.299900982703377, + "language_loss": 0.8342824, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86130083, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 3.71875, + "step": 107, + "time_per_iteration": 2.6031439304351807 + }, + { + "auxiliary_loss_clip": 0.01473949, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01294351, + "balance_loss_mlp": 1.24969411, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9921074222226888, + "language_loss": 0.64829654, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67348593, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.25, + "step": 108, + "time_per_iteration": 3.1797876358032227 + }, + { + "auxiliary_loss_clip": 0.01549803, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_clip": 1.0634706, + "balance_loss_mlp": 1.18794155, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 3.0292528917398895, + "language_loss": 0.97705221, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00387263, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.625, + "step": 109, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01548304, + "auxiliary_loss_mlp": 0.01143686, + "balance_loss_clip": 1.07759643, + "balance_loss_mlp": 1.18955791, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7037490209774204, + "language_loss": 0.84119976, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86811972, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 110, + "time_per_iteration": 2.612900495529175 + }, + { + "auxiliary_loss_clip": 0.01543027, + "auxiliary_loss_mlp": 0.01148771, + "balance_loss_clip": 1.08287191, + "balance_loss_mlp": 1.18348098, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.0686651571732186, + "language_loss": 0.83053756, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85745549, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 111, + "time_per_iteration": 2.648775815963745 + }, + { + "auxiliary_loss_clip": 0.01543945, + "auxiliary_loss_mlp": 0.01132291, + "balance_loss_clip": 1.06906247, + "balance_loss_mlp": 1.18600404, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9360906695559799, + "language_loss": 0.94064176, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96740413, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.59375, + "step": 112, + "time_per_iteration": 2.5952305793762207 + }, + { + "auxiliary_loss_clip": 0.01547241, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.07342076, + "balance_loss_mlp": 1.18214464, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4119047199233594, + "language_loss": 0.79298341, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81983036, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.65625, + "step": 113, + "time_per_iteration": 2.524744987487793 + }, + { + "auxiliary_loss_clip": 0.01535171, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.06460583, + "balance_loss_mlp": 1.1784718, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.1108584765070924, + "language_loss": 0.93168736, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95834035, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 3.5625, + "step": 114, + "time_per_iteration": 2.6716785430908203 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.01138267, + "balance_loss_clip": 1.07828045, + "balance_loss_mlp": 1.17785645, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.266348661789013, + "language_loss": 0.94440514, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97120523, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.640625, + "step": 115, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01536673, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.052122, + "balance_loss_mlp": 1.1758287, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 12.665326776351556, + "language_loss": 0.81903678, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84558797, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.609375, + "step": 116, + "time_per_iteration": 2.577003240585327 + }, + { + "auxiliary_loss_clip": 0.01526673, + "auxiliary_loss_mlp": 0.01127935, + "balance_loss_clip": 1.06375241, + "balance_loss_mlp": 1.17504787, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.0071741256932794, + "language_loss": 0.88063896, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90718508, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.515625, + "step": 117, + "time_per_iteration": 2.611503839492798 + }, + { + "auxiliary_loss_clip": 0.01525448, + "auxiliary_loss_mlp": 0.01143736, + "balance_loss_clip": 1.07840896, + "balance_loss_mlp": 1.17308259, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5473368597875594, + "language_loss": 0.84470415, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87139601, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 3.53125, + "step": 118, + "time_per_iteration": 2.577461004257202 + }, + { + "auxiliary_loss_clip": 0.01536798, + "auxiliary_loss_mlp": 0.01163532, + "balance_loss_clip": 1.09930205, + "balance_loss_mlp": 1.1748507, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.057592918726277, + "language_loss": 0.99470234, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02170563, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.625, + "step": 119, + "time_per_iteration": 2.549661636352539 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.0111939, + "balance_loss_clip": 1.05701971, + "balance_loss_mlp": 1.16968298, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.410205702357196, + "language_loss": 0.89085704, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91742492, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.6875, + "step": 120, + "time_per_iteration": 2.583630084991455 + }, + { + "auxiliary_loss_clip": 0.01524337, + "auxiliary_loss_mlp": 0.01130091, + "balance_loss_clip": 1.06667209, + "balance_loss_mlp": 1.17169607, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.904240324338801, + "language_loss": 0.93491054, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145487, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.53125, + "step": 121, + "time_per_iteration": 2.6146788597106934 + }, + { + "auxiliary_loss_clip": 0.01523412, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_clip": 1.08382273, + "balance_loss_mlp": 1.17073464, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.352658173167552, + "language_loss": 0.90176952, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92846411, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.53125, + "step": 122, + "time_per_iteration": 2.566470146179199 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.07634664, + "balance_loss_mlp": 1.16606736, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.7249964127160764, + "language_loss": 0.92516506, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95179617, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.546875, + "step": 123, + "time_per_iteration": 2.6002941131591797 + }, + { + "auxiliary_loss_clip": 0.01517776, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.06433022, + "balance_loss_mlp": 1.1609534, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 7.583203404073904, + "language_loss": 0.71128142, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73771715, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.5625, + "step": 124, + "time_per_iteration": 2.79618763923645 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01124615, + "balance_loss_clip": 1.06157708, + "balance_loss_mlp": 1.16223335, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.4227692366027855, + "language_loss": 0.88482195, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9111228, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.4375, + "step": 125, + "time_per_iteration": 2.6131536960601807 + }, + { + "auxiliary_loss_clip": 0.0152071, + "auxiliary_loss_mlp": 0.01140137, + "balance_loss_clip": 1.07762396, + "balance_loss_mlp": 1.16211164, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.993097477973623, + "language_loss": 0.82384819, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.8504566, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.59375, + "step": 126, + "time_per_iteration": 2.595423936843872 + }, + { + "auxiliary_loss_clip": 0.01514354, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_clip": 1.077981, + "balance_loss_mlp": 1.16128385, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.7264016399601534, + "language_loss": 0.67276633, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69930243, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 3.53125, + "step": 127, + "time_per_iteration": 2.620950937271118 + }, + { + "auxiliary_loss_clip": 0.01504536, + "auxiliary_loss_mlp": 0.01128822, + "balance_loss_clip": 1.06640375, + "balance_loss_mlp": 1.16422939, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.6226937306152496, + "language_loss": 0.8815757, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90790927, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 128, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.01505804, + "auxiliary_loss_mlp": 0.01141266, + "balance_loss_clip": 1.07870471, + "balance_loss_mlp": 1.15920687, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.875185485357673, + "language_loss": 0.84581351, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87228423, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.46875, + "step": 129, + "time_per_iteration": 2.611762285232544 + }, + { + "auxiliary_loss_clip": 0.01499869, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.07122934, + "balance_loss_mlp": 1.1588279, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023668494136832, + "language_loss": 0.9742806, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061572, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 130, + "time_per_iteration": 2.599639415740967 + }, + { + "auxiliary_loss_clip": 0.01493155, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_clip": 1.07109392, + "balance_loss_mlp": 1.15518749, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1876581172480285, + "language_loss": 0.82624269, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 131, + "time_per_iteration": 2.6086065769195557 + }, + { + "auxiliary_loss_clip": 0.01502593, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.15800536, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.4868851395581677, + "language_loss": 0.82762384, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85392648, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 3.4375, + "step": 132, + "time_per_iteration": 2.673790454864502 + }, + { + "auxiliary_loss_clip": 0.01493849, + "auxiliary_loss_mlp": 0.01128197, + "balance_loss_clip": 1.06716144, + "balance_loss_mlp": 1.15264463, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.7432419346617443, + "language_loss": 0.95486552, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98108596, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.40625, + "step": 133, + "time_per_iteration": 2.6287872791290283 + }, + { + "auxiliary_loss_clip": 0.01490198, + "auxiliary_loss_mlp": 0.01125526, + "balance_loss_clip": 1.06725681, + "balance_loss_mlp": 1.16143155, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7764051426707919, + "language_loss": 0.73316634, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7593236, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.296875, + "step": 134, + "time_per_iteration": 2.6728081703186035 + }, + { + "auxiliary_loss_clip": 0.01486213, + "auxiliary_loss_mlp": 0.01130543, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.14955854, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.090234736760587, + "language_loss": 0.88808328, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91425079, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 135, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.06789732, + "balance_loss_mlp": 1.15456343, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.008171494368998, + "language_loss": 0.89123899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9174456, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.34375, + "step": 136, + "time_per_iteration": 2.555936813354492 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.01108223, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.14870429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 5.8712537379963345, + "language_loss": 0.8400104, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86595905, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.375, + "step": 137, + "time_per_iteration": 2.6225337982177734 + }, + { + "auxiliary_loss_clip": 0.01482624, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.05796409, + "balance_loss_mlp": 1.14842129, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.6861384534946333, + "language_loss": 0.90170664, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9276967, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.34375, + "step": 138, + "time_per_iteration": 2.653205156326294 + }, + { + "auxiliary_loss_clip": 0.01472312, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.0568912, + "balance_loss_mlp": 1.1478796, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.679342832062188, + "language_loss": 0.91253459, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93845713, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.234375, + "step": 139, + "time_per_iteration": 2.6182503700256348 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01123997, + "balance_loss_clip": 1.06229401, + "balance_loss_mlp": 1.154405, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.5553770836970675, + "language_loss": 0.85446793, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.34375, + "step": 140, + "time_per_iteration": 2.649454116821289 + }, + { + "auxiliary_loss_clip": 0.01476267, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.0594281, + "balance_loss_mlp": 1.14865911, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.379593217845822, + "language_loss": 0.84156519, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86751676, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.28125, + "step": 141, + "time_per_iteration": 2.608603000640869 + }, + { + "auxiliary_loss_clip": 0.01480312, + "auxiliary_loss_mlp": 0.01134333, + "balance_loss_clip": 1.07320273, + "balance_loss_mlp": 1.14624739, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3283494467369965, + "language_loss": 0.81387591, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84002233, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.34375, + "step": 142, + "time_per_iteration": 4.023308753967285 + }, + { + "auxiliary_loss_clip": 0.01378722, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00621629, + "balance_loss_mlp": 1.1918689, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0451783350372967, + "language_loss": 0.66831523, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69242978, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.8671875, + "step": 143, + "time_per_iteration": 4.718023777008057 + }, + { + "auxiliary_loss_clip": 0.01472184, + "auxiliary_loss_mlp": 0.0112263, + "balance_loss_clip": 1.06283474, + "balance_loss_mlp": 1.14625573, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2608538764922295, + "language_loss": 0.83954072, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86548889, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.25, + "step": 144, + "time_per_iteration": 2.5878453254699707 + }, + { + "auxiliary_loss_clip": 0.01457808, + "auxiliary_loss_mlp": 0.01111605, + "balance_loss_clip": 1.04890084, + "balance_loss_mlp": 1.13930941, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.241812154138119, + "language_loss": 0.88511693, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91081107, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.1875, + "step": 145, + "time_per_iteration": 2.586512565612793 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01124002, + "balance_loss_clip": 1.06153631, + "balance_loss_mlp": 1.14211285, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.792984011276012, + "language_loss": 0.85949898, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88549542, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.34375, + "step": 146, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01359324, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.00082254, + "balance_loss_mlp": 1.17825258, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8557738136673508, + "language_loss": 0.60047674, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62433958, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.8125, + "step": 147, + "time_per_iteration": 3.2522764205932617 + }, + { + "auxiliary_loss_clip": 0.01465546, + "auxiliary_loss_mlp": 0.01124118, + "balance_loss_clip": 1.06670642, + "balance_loss_mlp": 1.14550173, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8343461268862185, + "language_loss": 0.8454501, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87134671, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 3.203125, + "step": 148, + "time_per_iteration": 2.635499954223633 + }, + { + "auxiliary_loss_clip": 0.0147086, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_clip": 1.07914925, + "balance_loss_mlp": 1.14693797, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2581185064103404, + "language_loss": 0.88802874, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91416872, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.234375, + "step": 149, + "time_per_iteration": 2.5458836555480957 + }, + { + "auxiliary_loss_clip": 0.01466862, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.14131117, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.7760320197047097, + "language_loss": 0.93054724, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95633656, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 150, + "time_per_iteration": 2.648111343383789 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01109463, + "balance_loss_clip": 1.05391192, + "balance_loss_mlp": 1.13663483, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.9005080345968057, + "language_loss": 0.74303263, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76867104, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.171875, + "step": 151, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.014664, + "auxiliary_loss_mlp": 0.01125146, + "balance_loss_clip": 1.06735289, + "balance_loss_mlp": 1.14143276, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.6241423805649298, + "language_loss": 0.88251799, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90843344, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 152, + "time_per_iteration": 2.6034231185913086 + }, + { + "auxiliary_loss_clip": 0.01466383, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_clip": 1.0628314, + "balance_loss_mlp": 1.14757276, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.4062301864690196, + "language_loss": 0.83957756, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86545384, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 153, + "time_per_iteration": 2.6023271083831787 + }, + { + "auxiliary_loss_clip": 0.01456394, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.0765202, + "balance_loss_mlp": 1.13805962, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9441527650945287, + "language_loss": 0.89881843, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92474556, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.1875, + "step": 154, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01460439, + "auxiliary_loss_mlp": 0.01154617, + "balance_loss_clip": 1.09577537, + "balance_loss_mlp": 1.14094579, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0692323216259187, + "language_loss": 0.89471745, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92086804, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 155, + "time_per_iteration": 2.6336286067962646 + }, + { + "auxiliary_loss_clip": 0.01463585, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.05894589, + "balance_loss_mlp": 1.13895822, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 3.3077298720636255, + "language_loss": 0.86882627, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89462447, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.25, + "step": 156, + "time_per_iteration": 2.5539867877960205 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.01121969, + "balance_loss_clip": 1.06408143, + "balance_loss_mlp": 1.14298415, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4916444524903527, + "language_loss": 0.99553013, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02137065, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.1875, + "step": 157, + "time_per_iteration": 2.5249693393707275 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01139016, + "balance_loss_clip": 1.08146214, + "balance_loss_mlp": 1.1366899, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0302475566757225, + "language_loss": 0.8847568, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91060334, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.09375, + "step": 158, + "time_per_iteration": 2.6009252071380615 + }, + { + "auxiliary_loss_clip": 0.01452439, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_clip": 1.06555486, + "balance_loss_mlp": 1.13677907, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 4.310723443959545, + "language_loss": 0.86534697, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89111388, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.15625, + "step": 159, + "time_per_iteration": 2.6107394695281982 + }, + { + "auxiliary_loss_clip": 0.01442093, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.07340288, + "balance_loss_mlp": 1.13145089, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.790220267572532, + "language_loss": 0.86825597, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89400506, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.109375, + "step": 160, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01449537, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.06318271, + "balance_loss_mlp": 1.13704872, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.6107931748588893, + "language_loss": 0.91542315, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94109678, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.125, + "step": 161, + "time_per_iteration": 2.550865650177002 + }, + { + "auxiliary_loss_clip": 0.01454094, + "auxiliary_loss_mlp": 0.01109765, + "balance_loss_clip": 1.05488133, + "balance_loss_mlp": 1.13759339, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.2107920101940994, + "language_loss": 0.91690832, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94254684, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.15625, + "step": 162, + "time_per_iteration": 2.5527970790863037 + }, + { + "auxiliary_loss_clip": 0.01312712, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.00331306, + "balance_loss_mlp": 1.14560354, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2615279464106541, + "language_loss": 0.72354776, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74694741, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.671875, + "step": 163, + "time_per_iteration": 3.143763542175293 + }, + { + "auxiliary_loss_clip": 0.01440764, + "auxiliary_loss_mlp": 0.01113881, + "balance_loss_clip": 1.05804312, + "balance_loss_mlp": 1.13505006, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1923315312730374, + "language_loss": 0.8427155, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86826193, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0625, + "step": 164, + "time_per_iteration": 2.5536584854125977 + }, + { + "auxiliary_loss_clip": 0.01429878, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.04585135, + "balance_loss_mlp": 1.12637794, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.006756380443377, + "language_loss": 0.89215541, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91745919, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.03125, + "step": 165, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01434156, + "auxiliary_loss_mlp": 0.01127756, + "balance_loss_clip": 1.0692482, + "balance_loss_mlp": 1.12764359, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 6.432940691763592, + "language_loss": 0.80138129, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82700044, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.0625, + "step": 166, + "time_per_iteration": 2.6461095809936523 + }, + { + "auxiliary_loss_clip": 0.01438531, + "auxiliary_loss_mlp": 0.01125189, + "balance_loss_clip": 1.06749213, + "balance_loss_mlp": 1.13121533, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.334124726802297, + "language_loss": 0.9190954, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94473255, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.078125, + "step": 167, + "time_per_iteration": 2.655597448348999 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.07997894, + "balance_loss_mlp": 1.12960708, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.1870046541457873, + "language_loss": 0.90852308, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93417776, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 3.0, + "step": 168, + "time_per_iteration": 2.5387983322143555 + }, + { + "auxiliary_loss_clip": 0.01424973, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.05072391, + "balance_loss_mlp": 1.12456727, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 2.0495813916191077, + "language_loss": 0.87094414, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89626241, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 3.0, + "step": 169, + "time_per_iteration": 2.6448419094085693 + }, + { + "auxiliary_loss_clip": 0.01426284, + "auxiliary_loss_mlp": 0.01111393, + "balance_loss_clip": 1.05548358, + "balance_loss_mlp": 1.12704372, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 3.0203817486241973, + "language_loss": 0.84758192, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87295866, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 170, + "time_per_iteration": 2.5596489906311035 + }, + { + "auxiliary_loss_clip": 0.01435879, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.07963061, + "balance_loss_mlp": 1.12765205, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.124400250788896, + "language_loss": 0.89896494, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92468935, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.078125, + "step": 171, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01108406, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.1300813, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.015577645060998, + "language_loss": 0.88978243, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91516334, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.0, + "step": 172, + "time_per_iteration": 2.6193771362304688 + }, + { + "auxiliary_loss_clip": 0.01419105, + "auxiliary_loss_mlp": 0.01124801, + "balance_loss_clip": 1.06986928, + "balance_loss_mlp": 1.12354624, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6868779107262128, + "language_loss": 0.81148165, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83692074, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.953125, + "step": 173, + "time_per_iteration": 2.656935691833496 + }, + { + "auxiliary_loss_clip": 0.01430653, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.05496693, + "balance_loss_mlp": 1.12733519, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1134597687554244, + "language_loss": 0.82498932, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85036767, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 3.03125, + "step": 174, + "time_per_iteration": 2.6050753593444824 + }, + { + "auxiliary_loss_clip": 0.01425822, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_clip": 1.06984437, + "balance_loss_mlp": 1.12589645, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.6035215697191965, + "language_loss": 0.72699076, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75249052, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 3.0, + "step": 175, + "time_per_iteration": 2.6859946250915527 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.07571054, + "balance_loss_mlp": 1.12603855, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.402827576481816, + "language_loss": 0.98082507, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00642931, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 176, + "time_per_iteration": 2.5405664443969727 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01132631, + "balance_loss_clip": 1.08005941, + "balance_loss_mlp": 1.12270594, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3427037211777115, + "language_loss": 0.76749414, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79294884, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 177, + "time_per_iteration": 2.555553674697876 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.0507797, + "balance_loss_mlp": 1.12089574, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.4108248963401464, + "language_loss": 0.76824659, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79352522, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.015625, + "step": 178, + "time_per_iteration": 2.5799388885498047 + }, + { + "auxiliary_loss_clip": 0.01429506, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.05224717, + "balance_loss_mlp": 1.12586653, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1918052506036174, + "language_loss": 0.84004253, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86541891, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.03125, + "step": 179, + "time_per_iteration": 2.5387184619903564 + }, + { + "auxiliary_loss_clip": 0.01420983, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.05677247, + "balance_loss_mlp": 1.12062979, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.90488055395076, + "language_loss": 0.83719397, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86252916, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 180, + "time_per_iteration": 2.6149253845214844 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.06503046, + "balance_loss_mlp": 1.1226536, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.784573507260413, + "language_loss": 0.7774682, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80288756, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.0, + "step": 181, + "time_per_iteration": 2.5769712924957275 + }, + { + "auxiliary_loss_clip": 0.01417045, + "auxiliary_loss_mlp": 0.01131731, + "balance_loss_clip": 1.07732356, + "balance_loss_mlp": 1.11938787, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.1835165271024377, + "language_loss": 0.76440376, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78989148, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.96875, + "step": 182, + "time_per_iteration": 2.5641353130340576 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.01127012, + "balance_loss_clip": 1.07122183, + "balance_loss_mlp": 1.11758399, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.172025067133121, + "language_loss": 0.87377435, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89917147, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.953125, + "step": 183, + "time_per_iteration": 2.567457914352417 + }, + { + "auxiliary_loss_clip": 0.01415124, + "auxiliary_loss_mlp": 0.01114516, + "balance_loss_clip": 1.06397092, + "balance_loss_mlp": 1.1209594, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2669267607504255, + "language_loss": 0.86875558, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.953125, + "step": 184, + "time_per_iteration": 5.380701780319214 + }, + { + "auxiliary_loss_clip": 0.01411555, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.05308247, + "balance_loss_mlp": 1.12176847, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8448371257401488, + "language_loss": 0.83683228, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86202729, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.90625, + "step": 185, + "time_per_iteration": 2.5522208213806152 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.01109712, + "balance_loss_clip": 1.05253971, + "balance_loss_mlp": 1.11964798, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.4162416092451475, + "language_loss": 0.71111757, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73642373, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 186, + "time_per_iteration": 2.536498546600342 + }, + { + "auxiliary_loss_clip": 0.01416319, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.11923158, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 3.342492581434835, + "language_loss": 1.02028871, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04552388, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.96875, + "step": 187, + "time_per_iteration": 2.5189080238342285 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.05597997, + "balance_loss_mlp": 1.11834478, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.6787333311747052, + "language_loss": 0.75107503, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7762351, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.875, + "step": 188, + "time_per_iteration": 2.73420786857605 + }, + { + "auxiliary_loss_clip": 0.01292523, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01273942, + "balance_loss_mlp": 1.13387585, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7700467396195164, + "language_loss": 0.56216431, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5854305, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.5859375, + "step": 189, + "time_per_iteration": 3.176280975341797 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01121834, + "balance_loss_clip": 1.06742704, + "balance_loss_mlp": 1.1134795, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.292403028528975, + "language_loss": 0.94771594, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97296059, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.90625, + "step": 190, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01403317, + "auxiliary_loss_mlp": 0.01101291, + "balance_loss_clip": 1.04964972, + "balance_loss_mlp": 1.11493886, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.993049163405909, + "language_loss": 0.84462845, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8696745, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.875, + "step": 191, + "time_per_iteration": 2.569664716720581 + }, + { + "auxiliary_loss_clip": 0.01402316, + "auxiliary_loss_mlp": 0.01121031, + "balance_loss_clip": 1.0698905, + "balance_loss_mlp": 1.11087692, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0097697123850593, + "language_loss": 0.91439575, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93962914, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 192, + "time_per_iteration": 2.6416900157928467 + }, + { + "auxiliary_loss_clip": 0.0139743, + "auxiliary_loss_mlp": 0.01113461, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.11231375, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.095754720056515, + "language_loss": 0.86849445, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89360332, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.84375, + "step": 193, + "time_per_iteration": 2.569899797439575 + }, + { + "auxiliary_loss_clip": 0.01399232, + "auxiliary_loss_mlp": 0.01095137, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.10937476, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.446553756436178, + "language_loss": 0.92399615, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9489398, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 194, + "time_per_iteration": 2.6078743934631348 + }, + { + "auxiliary_loss_clip": 0.01405837, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.11522019, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.1413620570060052, + "language_loss": 0.89698559, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92208374, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 195, + "time_per_iteration": 2.5785820484161377 + }, + { + "auxiliary_loss_clip": 0.01400897, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.06345916, + "balance_loss_mlp": 1.11416054, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.0173579296668813, + "language_loss": 0.8577168, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88290232, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.875, + "step": 196, + "time_per_iteration": 2.5492773056030273 + }, + { + "auxiliary_loss_clip": 0.01397107, + "auxiliary_loss_mlp": 0.01106206, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.10991478, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.86264810097015, + "language_loss": 0.93367243, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95870566, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.875, + "step": 197, + "time_per_iteration": 2.5488431453704834 + }, + { + "auxiliary_loss_clip": 0.01394686, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.05781317, + "balance_loss_mlp": 1.1120131, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1872318454948045, + "language_loss": 0.79184073, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81688625, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.828125, + "step": 198, + "time_per_iteration": 2.6208834648132324 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06545901, + "balance_loss_mlp": 1.11265802, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 3.3720724842630663, + "language_loss": 0.88065112, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90571868, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.765625, + "step": 199, + "time_per_iteration": 2.5257043838500977 + }, + { + "auxiliary_loss_clip": 0.01403414, + "auxiliary_loss_mlp": 0.01121968, + "balance_loss_clip": 1.0658679, + "balance_loss_mlp": 1.11557496, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8432610551497841, + "language_loss": 0.81327617, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83853, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.875, + "step": 200, + "time_per_iteration": 2.593231201171875 + }, + { + "auxiliary_loss_clip": 0.01400536, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.11138511, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.758923223370522, + "language_loss": 0.87688923, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90190548, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.90625, + "step": 201, + "time_per_iteration": 2.5057122707366943 + }, + { + "auxiliary_loss_clip": 0.01401128, + "auxiliary_loss_mlp": 0.01110995, + "balance_loss_clip": 1.05751753, + "balance_loss_mlp": 1.1152513, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 3.7927516715708736, + "language_loss": 0.84123611, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86635733, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.859375, + "step": 202, + "time_per_iteration": 2.555680751800537 + }, + { + "auxiliary_loss_clip": 0.01388205, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.05639839, + "balance_loss_mlp": 1.10674798, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9040504717952067, + "language_loss": 0.90116632, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.926139, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.8125, + "step": 203, + "time_per_iteration": 2.526937484741211 + }, + { + "auxiliary_loss_clip": 0.01281494, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.03138971, + "balance_loss_mlp": 1.12054539, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0150955472927095, + "language_loss": 0.61259121, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63593745, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.609375, + "step": 204, + "time_per_iteration": 3.051469326019287 + }, + { + "auxiliary_loss_clip": 0.01398264, + "auxiliary_loss_mlp": 0.01111819, + "balance_loss_clip": 1.0593431, + "balance_loss_mlp": 1.11035323, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.269022633654934, + "language_loss": 0.91206741, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93716824, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.875, + "step": 205, + "time_per_iteration": 2.5105605125427246 + }, + { + "auxiliary_loss_clip": 0.01406073, + "auxiliary_loss_mlp": 0.01120568, + "balance_loss_clip": 1.06675649, + "balance_loss_mlp": 1.11524296, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.2813283317886497, + "language_loss": 0.89215505, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91742146, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.90625, + "step": 206, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05039215, + "balance_loss_mlp": 1.10848641, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.502758142715096, + "language_loss": 0.95368809, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97865611, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.859375, + "step": 207, + "time_per_iteration": 2.5147407054901123 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.06416512, + "balance_loss_mlp": 1.11335945, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.4565104125033232, + "language_loss": 0.75770479, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78280723, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.8125, + "step": 208, + "time_per_iteration": 2.5426721572875977 + }, + { + "auxiliary_loss_clip": 0.01382601, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.05497861, + "balance_loss_mlp": 1.10796773, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.79364384939249, + "language_loss": 0.98718858, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01208818, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 209, + "time_per_iteration": 2.607238292694092 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.05971253, + "balance_loss_mlp": 1.11020541, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 7.039976369418198, + "language_loss": 0.85444254, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87945753, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.78125, + "step": 210, + "time_per_iteration": 2.67632794380188 + }, + { + "auxiliary_loss_clip": 0.01385349, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.07042408, + "balance_loss_mlp": 1.1073029, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2334441604414783, + "language_loss": 0.97016168, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99521822, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.78125, + "step": 211, + "time_per_iteration": 2.5733633041381836 + }, + { + "auxiliary_loss_clip": 0.01394963, + "auxiliary_loss_mlp": 0.01114691, + "balance_loss_clip": 1.0616188, + "balance_loss_mlp": 1.11342549, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 3.6563211355425453, + "language_loss": 0.95188707, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97698367, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.8125, + "step": 212, + "time_per_iteration": 2.5224313735961914 + }, + { + "auxiliary_loss_clip": 0.01383511, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.10996664, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.0395830195466504, + "language_loss": 0.76049221, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78549099, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.734375, + "step": 213, + "time_per_iteration": 2.76625919342041 + }, + { + "auxiliary_loss_clip": 0.0138732, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.10833097, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 8.414558483522654, + "language_loss": 0.86754733, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89245206, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.78125, + "step": 214, + "time_per_iteration": 2.500417470932007 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01127788, + "balance_loss_clip": 1.07397687, + "balance_loss_mlp": 1.11549139, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3854037050744057, + "language_loss": 0.77357471, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79872084, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 215, + "time_per_iteration": 2.6116256713867188 + }, + { + "auxiliary_loss_clip": 0.01394912, + "auxiliary_loss_mlp": 0.01111048, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.11393261, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44498430810385, + "language_loss": 0.90545797, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93051755, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.8125, + "step": 216, + "time_per_iteration": 2.5903706550598145 + }, + { + "auxiliary_loss_clip": 0.0138678, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.05622888, + "balance_loss_mlp": 1.10772836, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.630220300857062, + "language_loss": 0.93660516, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96154928, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.78125, + "step": 217, + "time_per_iteration": 2.5109100341796875 + }, + { + "auxiliary_loss_clip": 0.01381618, + "auxiliary_loss_mlp": 0.01107152, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.10700643, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9165712032980975, + "language_loss": 0.93656206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96144974, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.75, + "step": 218, + "time_per_iteration": 2.6586077213287354 + }, + { + "auxiliary_loss_clip": 0.01376505, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.05820787, + "balance_loss_mlp": 1.10663593, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.916363531530835, + "language_loss": 0.86148179, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88633436, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.703125, + "step": 219, + "time_per_iteration": 2.584040880203247 + }, + { + "auxiliary_loss_clip": 0.01383955, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.05056047, + "balance_loss_mlp": 1.110309, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7731463199764816, + "language_loss": 0.87598741, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90083969, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.75, + "step": 220, + "time_per_iteration": 2.6294186115264893 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.10389161, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.991547522293572, + "language_loss": 0.86413074, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88890207, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.6875, + "step": 221, + "time_per_iteration": 2.606137990951538 + }, + { + "auxiliary_loss_clip": 0.0137878, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.10240269, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.017045003530743, + "language_loss": 0.92153138, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94641757, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.765625, + "step": 222, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.01377393, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.10672021, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.7127164790698606, + "language_loss": 0.95539695, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98022527, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.71875, + "step": 223, + "time_per_iteration": 2.679387092590332 + }, + { + "auxiliary_loss_clip": 0.01377947, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_clip": 1.05612004, + "balance_loss_mlp": 1.10671806, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5543531214735586, + "language_loss": 0.88022512, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90507382, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.71875, + "step": 224, + "time_per_iteration": 2.6327528953552246 + }, + { + "auxiliary_loss_clip": 0.0137715, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.10632586, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0812591886363183, + "language_loss": 0.89642018, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92121875, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 225, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01374075, + "auxiliary_loss_mlp": 0.01115854, + "balance_loss_clip": 1.06273401, + "balance_loss_mlp": 1.10547256, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.1555099546542142, + "language_loss": 0.99022663, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01512599, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.6875, + "step": 226, + "time_per_iteration": 5.38438868522644 + }, + { + "auxiliary_loss_clip": 0.0137773, + "auxiliary_loss_mlp": 0.01111487, + "balance_loss_clip": 1.0584867, + "balance_loss_mlp": 1.10696185, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 15.523681056640678, + "language_loss": 0.91210413, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93699628, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 227, + "time_per_iteration": 2.5391762256622314 + }, + { + "auxiliary_loss_clip": 0.01252818, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.00666487, + "balance_loss_mlp": 1.10911703, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.99230217192713, + "language_loss": 0.57680154, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59958327, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.4375, + "step": 228, + "time_per_iteration": 3.1981163024902344 + }, + { + "auxiliary_loss_clip": 0.0136686, + "auxiliary_loss_mlp": 0.01110654, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.10228515, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.2779006264878374, + "language_loss": 0.8759563, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90073144, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 229, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01377631, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.10486007, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.280679608747667, + "language_loss": 0.84247303, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8672685, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.734375, + "step": 230, + "time_per_iteration": 2.501218557357788 + }, + { + "auxiliary_loss_clip": 0.01375417, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.06671298, + "balance_loss_mlp": 1.10600948, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.78964280876859, + "language_loss": 0.90378422, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92870116, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.6875, + "step": 231, + "time_per_iteration": 2.541137456893921 + }, + { + "auxiliary_loss_clip": 0.01377441, + "auxiliary_loss_mlp": 0.01108629, + "balance_loss_clip": 1.05941916, + "balance_loss_mlp": 1.10821056, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.031489983297281, + "language_loss": 0.83706695, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86192763, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.6875, + "step": 232, + "time_per_iteration": 2.5444753170013428 + }, + { + "auxiliary_loss_clip": 0.0137977, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04125488, + "balance_loss_mlp": 1.10017753, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.928489064169697, + "language_loss": 0.74033689, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76505834, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.796875, + "step": 233, + "time_per_iteration": 2.5364952087402344 + }, + { + "auxiliary_loss_clip": 0.01382965, + "auxiliary_loss_mlp": 0.0112384, + "balance_loss_clip": 1.07141209, + "balance_loss_mlp": 1.10741055, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.287774019631123, + "language_loss": 0.85867143, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88373953, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 234, + "time_per_iteration": 2.532949209213257 + }, + { + "auxiliary_loss_clip": 0.01375298, + "auxiliary_loss_mlp": 0.01106064, + "balance_loss_clip": 1.05683041, + "balance_loss_mlp": 1.10759592, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6527993685177154, + "language_loss": 0.89144391, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9162575, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.671875, + "step": 235, + "time_per_iteration": 2.509592294692993 + }, + { + "auxiliary_loss_clip": 0.0137416, + "auxiliary_loss_mlp": 0.01119384, + "balance_loss_clip": 1.06874382, + "balance_loss_mlp": 1.10830367, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 4.054998173736759, + "language_loss": 0.85780042, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88273585, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.65625, + "step": 236, + "time_per_iteration": 2.744925022125244 + }, + { + "auxiliary_loss_clip": 0.0137118, + "auxiliary_loss_mlp": 0.01099258, + "balance_loss_clip": 1.04871392, + "balance_loss_mlp": 1.10178149, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.128422813257453, + "language_loss": 0.82452404, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84922838, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.6875, + "step": 237, + "time_per_iteration": 2.67307710647583 + }, + { + "auxiliary_loss_clip": 0.01369116, + "auxiliary_loss_mlp": 0.01116968, + "balance_loss_clip": 1.0643487, + "balance_loss_mlp": 1.10451889, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 3.103781307849977, + "language_loss": 0.77321362, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79807448, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.65625, + "step": 238, + "time_per_iteration": 2.4973809719085693 + }, + { + "auxiliary_loss_clip": 0.01368178, + "auxiliary_loss_mlp": 0.01112367, + "balance_loss_clip": 1.06566119, + "balance_loss_mlp": 1.10654771, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.992064896075991, + "language_loss": 0.87370872, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89851415, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.609375, + "step": 239, + "time_per_iteration": 2.554222583770752 + }, + { + "auxiliary_loss_clip": 0.01352979, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.09776592, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.2433371609956283, + "language_loss": 0.93297911, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95751429, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.5625, + "step": 240, + "time_per_iteration": 2.588529348373413 + }, + { + "auxiliary_loss_clip": 0.01362634, + "auxiliary_loss_mlp": 0.01104045, + "balance_loss_clip": 1.05736244, + "balance_loss_mlp": 1.10324717, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.299780828803648, + "language_loss": 0.85129881, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8759656, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.59375, + "step": 241, + "time_per_iteration": 2.607272148132324 + }, + { + "auxiliary_loss_clip": 0.01360778, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.10865557, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.927287768398498, + "language_loss": 0.88410223, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90887022, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.53125, + "step": 242, + "time_per_iteration": 2.522657632827759 + }, + { + "auxiliary_loss_clip": 0.013595, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.04981756, + "balance_loss_mlp": 1.10147619, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.6384412969740922, + "language_loss": 0.86817086, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89276373, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.578125, + "step": 243, + "time_per_iteration": 2.5738751888275146 + }, + { + "auxiliary_loss_clip": 0.01366378, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.05574584, + "balance_loss_mlp": 1.10421979, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.576084931358892, + "language_loss": 0.84271425, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86743093, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 244, + "time_per_iteration": 2.51370906829834 + }, + { + "auxiliary_loss_clip": 0.01374385, + "auxiliary_loss_mlp": 0.01115077, + "balance_loss_clip": 1.06403196, + "balance_loss_mlp": 1.10701251, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.2775099056278916, + "language_loss": 0.78689361, + "learning_rate": 3.54199711087864e-06, + "loss": 0.8117882, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.671875, + "step": 245, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01372772, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.10232484, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2330220282190685, + "language_loss": 0.84241545, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86717069, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 246, + "time_per_iteration": 2.565614700317383 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01097455, + "balance_loss_clip": 1.04722059, + "balance_loss_mlp": 1.10181057, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9335653980079095, + "language_loss": 0.9014703, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92611909, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 247, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01365881, + "auxiliary_loss_mlp": 0.01097755, + "balance_loss_clip": 1.04952252, + "balance_loss_mlp": 1.09689593, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.1205098484246734, + "language_loss": 0.78058362, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80521989, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.6875, + "step": 248, + "time_per_iteration": 2.5365517139434814 + }, + { + "auxiliary_loss_clip": 0.0136687, + "auxiliary_loss_mlp": 0.01105288, + "balance_loss_clip": 1.05552983, + "balance_loss_mlp": 1.10545397, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1747011613954177, + "language_loss": 0.83849227, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86321384, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.609375, + "step": 249, + "time_per_iteration": 2.6142020225524902 + }, + { + "auxiliary_loss_clip": 0.01360073, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.05806887, + "balance_loss_mlp": 1.09971058, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.2137591284686455, + "language_loss": 0.93476778, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95942914, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 2.609375, + "step": 250, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01369254, + "auxiliary_loss_mlp": 0.01114661, + "balance_loss_clip": 1.06351972, + "balance_loss_mlp": 1.10460913, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.2612141068319622, + "language_loss": 0.97030997, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99514914, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.640625, + "step": 251, + "time_per_iteration": 2.5887296199798584 + }, + { + "auxiliary_loss_clip": 0.01362288, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.05723596, + "balance_loss_mlp": 1.09872079, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.0465178965121136, + "language_loss": 0.8428089, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86748511, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.640625, + "step": 252, + "time_per_iteration": 2.5749199390411377 + }, + { + "auxiliary_loss_clip": 0.01357969, + "auxiliary_loss_mlp": 0.01114738, + "balance_loss_clip": 1.06569552, + "balance_loss_mlp": 1.10169089, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.482990993198259, + "language_loss": 0.98208833, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00681543, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.5625, + "step": 253, + "time_per_iteration": 2.5639333724975586 + }, + { + "auxiliary_loss_clip": 0.01233728, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.00855541, + "balance_loss_mlp": 1.09965372, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8505459641429172, + "language_loss": 0.55672622, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57933319, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.34375, + "step": 254, + "time_per_iteration": 3.1063449382781982 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.06687438, + "balance_loss_mlp": 1.09652638, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.4360968938917065, + "language_loss": 0.90453845, + "learning_rate": 3.567754632921479e-06, + "loss": 0.9293263, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 255, + "time_per_iteration": 2.5746912956237793 + }, + { + "auxiliary_loss_clip": 0.01358909, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.07568169, + "balance_loss_mlp": 1.09931397, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.2666703391376903, + "language_loss": 0.8562001, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8810457, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.59375, + "step": 256, + "time_per_iteration": 2.6095149517059326 + }, + { + "auxiliary_loss_clip": 0.01366413, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_clip": 1.06305718, + "balance_loss_mlp": 1.09961021, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.7442871984488386, + "language_loss": 0.71504897, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73983842, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 257, + "time_per_iteration": 2.5939691066741943 + }, + { + "auxiliary_loss_clip": 0.01357007, + "auxiliary_loss_mlp": 0.01100177, + "balance_loss_clip": 1.05087197, + "balance_loss_mlp": 1.09875202, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9522192109187282, + "language_loss": 0.94659579, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97116768, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.578125, + "step": 258, + "time_per_iteration": 2.7119739055633545 + }, + { + "auxiliary_loss_clip": 0.01356701, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.05615926, + "balance_loss_mlp": 1.09608126, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.167214789879638, + "language_loss": 0.93174207, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95635182, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.59375, + "step": 259, + "time_per_iteration": 2.6776607036590576 + }, + { + "auxiliary_loss_clip": 0.01351639, + "auxiliary_loss_mlp": 0.010988, + "balance_loss_clip": 1.05297637, + "balance_loss_mlp": 1.10035825, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.1226725879970605, + "language_loss": 0.97360909, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99811351, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 2.515625, + "step": 260, + "time_per_iteration": 2.520759105682373 + }, + { + "auxiliary_loss_clip": 0.01365989, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.06282747, + "balance_loss_mlp": 1.10060608, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3569711169381, + "language_loss": 0.87644511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90120584, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.65625, + "step": 261, + "time_per_iteration": 2.5837602615356445 + }, + { + "auxiliary_loss_clip": 0.0135711, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.05613816, + "balance_loss_mlp": 1.09709311, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.9926513495738176, + "language_loss": 0.67226446, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69688779, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.59375, + "step": 262, + "time_per_iteration": 2.5490784645080566 + }, + { + "auxiliary_loss_clip": 0.01354995, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.07145, + "balance_loss_mlp": 1.0984714, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.3019763169045637, + "language_loss": 0.68570435, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71047044, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.5625, + "step": 263, + "time_per_iteration": 2.5207104682922363 + }, + { + "auxiliary_loss_clip": 0.01355963, + "auxiliary_loss_mlp": 0.01105396, + "balance_loss_clip": 1.055686, + "balance_loss_mlp": 1.09446979, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.705792502973735, + "language_loss": 0.85120308, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87581658, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 264, + "time_per_iteration": 2.559406280517578 + }, + { + "auxiliary_loss_clip": 0.01361439, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.10003614, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.651007312001026, + "language_loss": 1.04371059, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06825411, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.625, + "step": 265, + "time_per_iteration": 2.5076427459716797 + }, + { + "auxiliary_loss_clip": 0.01364923, + "auxiliary_loss_mlp": 0.01114141, + "balance_loss_clip": 1.06266677, + "balance_loss_mlp": 1.10278761, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2797174203272705, + "language_loss": 0.75153112, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77632177, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.625, + "step": 266, + "time_per_iteration": 2.52923583984375 + }, + { + "auxiliary_loss_clip": 0.01351984, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.05321336, + "balance_loss_mlp": 1.10004377, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7047265515665009, + "language_loss": 0.90568709, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93022615, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 2.515625, + "step": 267, + "time_per_iteration": 4.033226251602173 + }, + { + "auxiliary_loss_clip": 0.01359316, + "auxiliary_loss_mlp": 0.01118854, + "balance_loss_clip": 1.07143235, + "balance_loss_mlp": 1.09878063, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.258126572730018, + "language_loss": 0.86044276, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88522446, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.609375, + "step": 268, + "time_per_iteration": 3.9120936393737793 + }, + { + "auxiliary_loss_clip": 0.01352601, + "auxiliary_loss_mlp": 0.01098281, + "balance_loss_clip": 1.05186045, + "balance_loss_mlp": 1.10092831, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.4793793476816335, + "language_loss": 0.88284534, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90735412, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 2.515625, + "step": 269, + "time_per_iteration": 2.5170347690582275 + }, + { + "auxiliary_loss_clip": 0.01357286, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.04901874, + "balance_loss_mlp": 1.09723783, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.082153756456244, + "language_loss": 0.97073388, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99530637, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.59375, + "step": 270, + "time_per_iteration": 2.4856350421905518 + }, + { + "auxiliary_loss_clip": 0.01357366, + "auxiliary_loss_mlp": 0.01117767, + "balance_loss_clip": 1.07001138, + "balance_loss_mlp": 1.10259032, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1071719511680755, + "language_loss": 0.85919821, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88394946, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.546875, + "step": 271, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01355041, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.05201519, + "balance_loss_mlp": 1.09418058, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.6330072162998523, + "language_loss": 0.81509304, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83964115, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.609375, + "step": 272, + "time_per_iteration": 2.563840389251709 + }, + { + "auxiliary_loss_clip": 0.01348825, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.07229137, + "balance_loss_mlp": 1.09649634, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4112371858801436, + "language_loss": 0.81101978, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83568847, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.515625, + "step": 273, + "time_per_iteration": 2.504791736602783 + }, + { + "auxiliary_loss_clip": 0.01348205, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.05825627, + "balance_loss_mlp": 1.0930239, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.3125197915452387, + "language_loss": 0.91599321, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94053519, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.5625, + "step": 274, + "time_per_iteration": 2.530883312225342 + }, + { + "auxiliary_loss_clip": 0.01346841, + "auxiliary_loss_mlp": 0.01110058, + "balance_loss_clip": 1.06154013, + "balance_loss_mlp": 1.09588742, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8828740595481548, + "language_loss": 0.87952697, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90409595, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 2.515625, + "step": 275, + "time_per_iteration": 2.6067841053009033 + }, + { + "auxiliary_loss_clip": 0.01349399, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.05481219, + "balance_loss_mlp": 1.09579742, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.8814357547622875, + "language_loss": 0.80717576, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83170903, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.53125, + "step": 276, + "time_per_iteration": 2.5251641273498535 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.0561676, + "balance_loss_mlp": 1.0946306, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.7238418569970533, + "language_loss": 0.81033546, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83474076, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.46875, + "step": 277, + "time_per_iteration": 2.6796398162841797 + }, + { + "auxiliary_loss_clip": 0.01338755, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.08789539, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.810922211495867, + "language_loss": 0.80307728, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82741719, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.515625, + "step": 278, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01343866, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.05634809, + "balance_loss_mlp": 1.09381282, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.7778988036026468, + "language_loss": 0.90482658, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92928004, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 279, + "time_per_iteration": 2.571439504623413 + }, + { + "auxiliary_loss_clip": 0.01348727, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.06872559, + "balance_loss_mlp": 1.09391451, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.0477743200742387, + "language_loss": 0.94153798, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96618605, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.546875, + "step": 280, + "time_per_iteration": 2.5161728858947754 + }, + { + "auxiliary_loss_clip": 0.0134865, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.05864, + "balance_loss_mlp": 1.09245062, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 3.578687135351882, + "language_loss": 0.73929775, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76385343, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 2.5625, + "step": 281, + "time_per_iteration": 2.616241931915283 + }, + { + "auxiliary_loss_clip": 0.01343434, + "auxiliary_loss_mlp": 0.0111488, + "balance_loss_clip": 1.06977129, + "balance_loss_mlp": 1.09390783, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.679798242609796, + "language_loss": 0.80207133, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82665443, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.5, + "step": 282, + "time_per_iteration": 2.5421135425567627 + }, + { + "auxiliary_loss_clip": 0.01348806, + "auxiliary_loss_mlp": 0.01117348, + "balance_loss_clip": 1.0704273, + "balance_loss_mlp": 1.09599137, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1184562475367916, + "language_loss": 0.77788174, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80254328, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.53125, + "step": 283, + "time_per_iteration": 2.516474485397339 + }, + { + "auxiliary_loss_clip": 0.01349252, + "auxiliary_loss_mlp": 0.01091995, + "balance_loss_clip": 1.04788804, + "balance_loss_mlp": 1.09700751, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1009174504018544, + "language_loss": 0.84172702, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86613953, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.515625, + "step": 284, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01339164, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_clip": 1.05216956, + "balance_loss_mlp": 1.09148788, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.014395623363928, + "language_loss": 0.96993905, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99432468, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.46875, + "step": 285, + "time_per_iteration": 2.5412731170654297 + }, + { + "auxiliary_loss_clip": 0.01342544, + "auxiliary_loss_mlp": 0.01093983, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.09407294, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.2067050643741433, + "language_loss": 0.93951917, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96388453, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.484375, + "step": 286, + "time_per_iteration": 2.5895566940307617 + }, + { + "auxiliary_loss_clip": 0.0133546, + "auxiliary_loss_mlp": 0.01090331, + "balance_loss_clip": 1.04503167, + "balance_loss_mlp": 1.08924019, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.8729510510678706, + "language_loss": 0.92157722, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94583511, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 287, + "time_per_iteration": 2.6144802570343018 + }, + { + "auxiliary_loss_clip": 0.01338793, + "auxiliary_loss_mlp": 0.01089685, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.08859432, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2271144452092564, + "language_loss": 1.02026963, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04455447, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 288, + "time_per_iteration": 2.488274097442627 + }, + { + "auxiliary_loss_clip": 0.01222501, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02000237, + "balance_loss_mlp": 1.09325862, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9131614435254132, + "language_loss": 0.63915455, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66174459, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 1.296875, + "step": 289, + "time_per_iteration": 3.222426652908325 + }, + { + "auxiliary_loss_clip": 0.01341104, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.06379664, + "balance_loss_mlp": 1.09403992, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4014361624695173, + "language_loss": 0.88569438, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91018069, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 290, + "time_per_iteration": 2.49294114112854 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01091523, + "balance_loss_clip": 1.04631877, + "balance_loss_mlp": 1.09248078, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.156562479490788, + "language_loss": 0.84578067, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87007844, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.453125, + "step": 291, + "time_per_iteration": 2.5356485843658447 + }, + { + "auxiliary_loss_clip": 0.01345108, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.05897939, + "balance_loss_mlp": 1.10042334, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6617628708439536, + "language_loss": 0.72766221, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75218308, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.453125, + "step": 292, + "time_per_iteration": 2.6524176597595215 + }, + { + "auxiliary_loss_clip": 0.01333825, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.09236324, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2014441192179866, + "language_loss": 0.8726995, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89705306, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.40625, + "step": 293, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01334314, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.05959213, + "balance_loss_mlp": 1.09177744, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.3120260424061367, + "language_loss": 0.81276119, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83714324, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.4375, + "step": 294, + "time_per_iteration": 2.568784236907959 + }, + { + "auxiliary_loss_clip": 0.01334452, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.06274807, + "balance_loss_mlp": 1.08824301, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.9227055740425705, + "language_loss": 0.83710909, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86153215, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.46875, + "step": 295, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01339817, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.09874845, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.5339269047951727, + "language_loss": 0.84620988, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87071538, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.40625, + "step": 296, + "time_per_iteration": 2.5243051052093506 + }, + { + "auxiliary_loss_clip": 0.01338756, + "auxiliary_loss_mlp": 0.01097832, + "balance_loss_clip": 1.05417752, + "balance_loss_mlp": 1.09317493, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.123858619871597, + "language_loss": 0.87729871, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.453125, + "step": 297, + "time_per_iteration": 2.5186710357666016 + }, + { + "auxiliary_loss_clip": 0.01337139, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.05713463, + "balance_loss_mlp": 1.09108877, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.170328911832355, + "language_loss": 0.88528925, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90966904, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 298, + "time_per_iteration": 2.5320143699645996 + }, + { + "auxiliary_loss_clip": 0.0133273, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.07234538, + "balance_loss_mlp": 1.09249902, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8938405886263965, + "language_loss": 0.88666737, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91117901, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.40625, + "step": 299, + "time_per_iteration": 2.588275671005249 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01105829, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.09275746, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.2936483356677253, + "language_loss": 0.64349103, + "learning_rate": 3.672392800539357e-06, + "loss": 0.66795039, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 300, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01338706, + "auxiliary_loss_mlp": 0.01105447, + "balance_loss_clip": 1.05986142, + "balance_loss_mlp": 1.09540462, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.310898752337597, + "language_loss": 0.88330823, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774977, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.4375, + "step": 301, + "time_per_iteration": 2.499481439590454 + }, + { + "auxiliary_loss_clip": 0.01214573, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.00932336, + "balance_loss_mlp": 1.08753991, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8370211186232274, + "language_loss": 0.62198341, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64437497, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 1.265625, + "step": 302, + "time_per_iteration": 3.259997844696045 + }, + { + "auxiliary_loss_clip": 0.01329895, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.06198907, + "balance_loss_mlp": 1.08938098, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.491293816938874, + "language_loss": 0.89617372, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92054749, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 2.40625, + "step": 303, + "time_per_iteration": 2.536773920059204 + }, + { + "auxiliary_loss_clip": 0.01336859, + "auxiliary_loss_mlp": 0.01114111, + "balance_loss_clip": 1.06778669, + "balance_loss_mlp": 1.09363747, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 4.887297609803561, + "language_loss": 0.80314684, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.4375, + "step": 304, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.05144823, + "balance_loss_mlp": 1.09657788, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.8235558005033383, + "language_loss": 0.82894015, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85320443, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.34375, + "step": 305, + "time_per_iteration": 2.5195910930633545 + }, + { + "auxiliary_loss_clip": 0.01332168, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_clip": 1.04993677, + "balance_loss_mlp": 1.08868921, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9087210074301977, + "language_loss": 0.90843809, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93269092, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 306, + "time_per_iteration": 2.501276969909668 + }, + { + "auxiliary_loss_clip": 0.01324982, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.08638549, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1762826783898586, + "language_loss": 0.86435306, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88850832, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.390625, + "step": 307, + "time_per_iteration": 2.6048038005828857 + }, + { + "auxiliary_loss_clip": 0.01325097, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.05817199, + "balance_loss_mlp": 1.09046888, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.221444292833677, + "language_loss": 0.71723771, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74155033, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.34375, + "step": 308, + "time_per_iteration": 2.513774871826172 + }, + { + "auxiliary_loss_clip": 0.01331987, + "auxiliary_loss_mlp": 0.01102938, + "balance_loss_clip": 1.05904555, + "balance_loss_mlp": 1.08861351, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2254161740825293, + "language_loss": 0.91952753, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94387674, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 2.4375, + "step": 309, + "time_per_iteration": 5.224750280380249 + }, + { + "auxiliary_loss_clip": 0.01338325, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.08840334, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.8056803187702135, + "language_loss": 0.72399509, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74842793, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 310, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01330325, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_clip": 1.06790328, + "balance_loss_mlp": 1.09306264, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 12.392698164772181, + "language_loss": 0.74104297, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76546776, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.375, + "step": 311, + "time_per_iteration": 2.734072208404541 + }, + { + "auxiliary_loss_clip": 0.01337963, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.06619668, + "balance_loss_mlp": 1.09045064, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.2753160661232603, + "language_loss": 0.91518372, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93965685, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.46875, + "step": 312, + "time_per_iteration": 2.5117411613464355 + }, + { + "auxiliary_loss_clip": 0.01336169, + "auxiliary_loss_mlp": 0.01112089, + "balance_loss_clip": 1.06609774, + "balance_loss_mlp": 1.09088099, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.320247917383294, + "language_loss": 0.89746982, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92195237, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.453125, + "step": 313, + "time_per_iteration": 2.4761838912963867 + }, + { + "auxiliary_loss_clip": 0.01340305, + "auxiliary_loss_mlp": 0.01098393, + "balance_loss_clip": 1.05230689, + "balance_loss_mlp": 1.09061432, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3404867001555236, + "language_loss": 0.73099983, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75538683, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 314, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.01326469, + "auxiliary_loss_mlp": 0.01103837, + "balance_loss_clip": 1.06101751, + "balance_loss_mlp": 1.08694446, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.192553769026804, + "language_loss": 0.89887041, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92317349, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 315, + "time_per_iteration": 2.5857741832733154 + }, + { + "auxiliary_loss_clip": 0.01329672, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.05170512, + "balance_loss_mlp": 1.08870411, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8364758613144732, + "language_loss": 0.80796063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83221763, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.40625, + "step": 316, + "time_per_iteration": 2.5222342014312744 + }, + { + "auxiliary_loss_clip": 0.01324399, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.05131364, + "balance_loss_mlp": 1.08633423, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.1363686538021236, + "language_loss": 0.90357143, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92776608, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.375, + "step": 317, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01319895, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.0515281, + "balance_loss_mlp": 1.0845592, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.5260192321083794, + "language_loss": 0.90939772, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93355227, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.34375, + "step": 318, + "time_per_iteration": 2.488128185272217 + }, + { + "auxiliary_loss_clip": 0.01324457, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.04706657, + "balance_loss_mlp": 1.08574772, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.626221841877022, + "language_loss": 0.93980259, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96393579, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 319, + "time_per_iteration": 2.5184502601623535 + }, + { + "auxiliary_loss_clip": 0.01205117, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_clip": 1.06586683, + "balance_loss_mlp": 1.07482553, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9345393611259016, + "language_loss": 0.59860981, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62146461, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 1.296875, + "step": 320, + "time_per_iteration": 3.0250258445739746 + }, + { + "auxiliary_loss_clip": 0.01320993, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.05827808, + "balance_loss_mlp": 1.08425927, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.0799113353921572, + "language_loss": 0.89622325, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92044175, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.375, + "step": 321, + "time_per_iteration": 2.476439952850342 + }, + { + "auxiliary_loss_clip": 0.01332068, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.07620978, + "balance_loss_mlp": 1.08993089, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.068543890023447, + "language_loss": 0.82884163, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85337007, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 2.421875, + "step": 322, + "time_per_iteration": 2.556302309036255 + }, + { + "auxiliary_loss_clip": 0.01332156, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.04828596, + "balance_loss_mlp": 1.08754158, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.2506232399398245, + "language_loss": 0.72734368, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75156873, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.453125, + "step": 323, + "time_per_iteration": 2.5033397674560547 + }, + { + "auxiliary_loss_clip": 0.01318896, + "auxiliary_loss_mlp": 0.01090622, + "balance_loss_clip": 1.04763484, + "balance_loss_mlp": 1.08184087, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.023515622890843, + "language_loss": 0.92639947, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95049465, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.375, + "step": 324, + "time_per_iteration": 2.5194544792175293 + }, + { + "auxiliary_loss_clip": 0.01328869, + "auxiliary_loss_mlp": 0.01089838, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.08943164, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 4.018466874717804, + "language_loss": 0.65336061, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67754775, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.390625, + "step": 325, + "time_per_iteration": 2.5107386112213135 + }, + { + "auxiliary_loss_clip": 0.0132709, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.093485, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.921455060851243, + "language_loss": 0.76449442, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78877723, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.34375, + "step": 326, + "time_per_iteration": 2.5080325603485107 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.06015599, + "balance_loss_mlp": 1.08845115, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1551163890972123, + "language_loss": 0.79176939, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8160091, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 327, + "time_per_iteration": 2.5449633598327637 + }, + { + "auxiliary_loss_clip": 0.01326802, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.06984949, + "balance_loss_mlp": 1.08873606, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.1574079642063246, + "language_loss": 0.80725288, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83164048, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.390625, + "step": 328, + "time_per_iteration": 2.5418970584869385 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.08396721, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.245263087715646, + "language_loss": 0.93704766, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96127105, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.40625, + "step": 329, + "time_per_iteration": 2.4910004138946533 + }, + { + "auxiliary_loss_clip": 0.01332781, + "auxiliary_loss_mlp": 0.01105781, + "balance_loss_clip": 1.06253231, + "balance_loss_mlp": 1.08930123, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.9776357674257365, + "language_loss": 0.74277973, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7671653, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 330, + "time_per_iteration": 2.51430082321167 + }, + { + "auxiliary_loss_clip": 0.01328701, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.07814097, + "balance_loss_mlp": 1.08762872, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.972763157156593, + "language_loss": 0.93870068, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96319681, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 331, + "time_per_iteration": 2.4759159088134766 + }, + { + "auxiliary_loss_clip": 0.01316192, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.04938233, + "balance_loss_mlp": 1.0853951, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.6958694906457836, + "language_loss": 0.92730892, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95136791, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 332, + "time_per_iteration": 2.49817156791687 + }, + { + "auxiliary_loss_clip": 0.01325132, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.04903162, + "balance_loss_mlp": 1.09081161, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.6289067025313777, + "language_loss": 0.75589794, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78007442, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.34375, + "step": 333, + "time_per_iteration": 2.5180609226226807 + }, + { + "auxiliary_loss_clip": 0.01323371, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.08625877, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1766901409232426, + "language_loss": 0.78768885, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81179881, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.375, + "step": 334, + "time_per_iteration": 2.614708423614502 + }, + { + "auxiliary_loss_clip": 0.01324397, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.08276975, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.4059127888346916, + "language_loss": 0.83083838, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85503072, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 2.421875, + "step": 335, + "time_per_iteration": 2.495260000228882 + }, + { + "auxiliary_loss_clip": 0.01320649, + "auxiliary_loss_mlp": 0.01090782, + "balance_loss_clip": 1.04934454, + "balance_loss_mlp": 1.08585882, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.166489879958422, + "language_loss": 0.92639577, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95051014, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.34375, + "step": 336, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01321744, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.04139614, + "balance_loss_mlp": 1.08352447, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.825762702383362, + "language_loss": 0.88474333, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90879244, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 337, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01310297, + "auxiliary_loss_mlp": 0.01101804, + "balance_loss_clip": 1.05836427, + "balance_loss_mlp": 1.08001363, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5415234153999902, + "language_loss": 0.89914495, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92326593, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 338, + "time_per_iteration": 2.5795979499816895 + }, + { + "auxiliary_loss_clip": 0.01324391, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.05742574, + "balance_loss_mlp": 1.08479571, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.047046576054304, + "language_loss": 0.84801471, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87225461, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.40625, + "step": 339, + "time_per_iteration": 2.4558403491973877 + }, + { + "auxiliary_loss_clip": 0.01326609, + "auxiliary_loss_mlp": 0.01093427, + "balance_loss_clip": 1.05001152, + "balance_loss_mlp": 1.08709431, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7544231793273473, + "language_loss": 0.88913274, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91333312, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.40625, + "step": 340, + "time_per_iteration": 2.5330188274383545 + }, + { + "auxiliary_loss_clip": 0.01323557, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.0859195, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.2340783182785975, + "language_loss": 0.88071406, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 2.375, + "step": 341, + "time_per_iteration": 2.502161979675293 + }, + { + "auxiliary_loss_clip": 0.01325847, + "auxiliary_loss_mlp": 0.01099304, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.08389783, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 3.2005009235922572, + "language_loss": 0.80293322, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82718468, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.421875, + "step": 342, + "time_per_iteration": 2.5315535068511963 + }, + { + "auxiliary_loss_clip": 0.0131301, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.05293417, + "balance_loss_mlp": 1.08132875, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.399130254204822, + "language_loss": 0.89451253, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91862881, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.3125, + "step": 343, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01325104, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_clip": 1.05342627, + "balance_loss_mlp": 1.08973229, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3234219523507296, + "language_loss": 0.78252918, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80672336, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.359375, + "step": 344, + "time_per_iteration": 2.514665365219116 + }, + { + "auxiliary_loss_clip": 0.01309596, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.05730188, + "balance_loss_mlp": 1.08079529, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8371023099908983, + "language_loss": 0.75138956, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77549529, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.28125, + "step": 345, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01318525, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.05806339, + "balance_loss_mlp": 1.08789146, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.0741733748571565, + "language_loss": 0.90269232, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92688763, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.3125, + "step": 346, + "time_per_iteration": 2.5487060546875 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01095048, + "balance_loss_clip": 1.05527973, + "balance_loss_mlp": 1.08358788, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0766581400667, + "language_loss": 0.78869188, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.3125, + "step": 347, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.01317315, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.05335259, + "balance_loss_mlp": 1.08719826, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.4234628631287927, + "language_loss": 0.71424043, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7383827, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.3125, + "step": 348, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01319638, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.0595324, + "balance_loss_mlp": 1.08435416, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 4.002924557181807, + "language_loss": 0.76819432, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79240972, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.34375, + "step": 349, + "time_per_iteration": 2.4884049892425537 + }, + { + "auxiliary_loss_clip": 0.0130292, + "auxiliary_loss_mlp": 0.0109884, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.08141851, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.9115672624672835, + "language_loss": 0.85271406, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87673163, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 350, + "time_per_iteration": 2.559812307357788 + }, + { + "auxiliary_loss_clip": 0.01311162, + "auxiliary_loss_mlp": 0.01089483, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.08571863, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.3355222976898764, + "language_loss": 0.80104828, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82505476, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.25, + "step": 351, + "time_per_iteration": 5.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01318524, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.06048024, + "balance_loss_mlp": 1.08623564, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8473493260702125, + "language_loss": 0.87258279, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89680254, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 352, + "time_per_iteration": 2.4787278175354004 + }, + { + "auxiliary_loss_clip": 0.01312545, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.06248152, + "balance_loss_mlp": 1.08574009, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8920106465676412, + "language_loss": 0.82386625, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84804279, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.265625, + "step": 353, + "time_per_iteration": 2.5428433418273926 + }, + { + "auxiliary_loss_clip": 0.01307832, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.05133069, + "balance_loss_mlp": 1.08353949, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.0636001035279694, + "language_loss": 0.8102631, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83425963, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.25, + "step": 354, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01315043, + "auxiliary_loss_mlp": 0.01092413, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.08190715, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.8065821662627575, + "language_loss": 0.80764574, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83172029, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 355, + "time_per_iteration": 2.56968355178833 + }, + { + "auxiliary_loss_clip": 0.01310125, + "auxiliary_loss_mlp": 0.01086869, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.08140039, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2488803729957, + "language_loss": 0.89553398, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91950381, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 356, + "time_per_iteration": 2.5510213375091553 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.08451605, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.7055681522526522, + "language_loss": 0.80032516, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82424533, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.234375, + "step": 357, + "time_per_iteration": 2.5834848880767822 + }, + { + "auxiliary_loss_clip": 0.01311386, + "auxiliary_loss_mlp": 0.0108216, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.08195996, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3810225918991827, + "language_loss": 0.7661376, + "learning_rate": 3.786194003461506e-06, + "loss": 0.7900731, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.296875, + "step": 358, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01308618, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.04574156, + "balance_loss_mlp": 1.08024073, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 3.004949550769694, + "language_loss": 0.88491321, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90888453, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.28125, + "step": 359, + "time_per_iteration": 2.452698230743408 + }, + { + "auxiliary_loss_clip": 0.01316066, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.05000377, + "balance_loss_mlp": 1.08438587, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.789884231725057, + "language_loss": 0.76007903, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.3125, + "step": 360, + "time_per_iteration": 2.490006685256958 + }, + { + "auxiliary_loss_clip": 0.01189834, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06634831, + "balance_loss_mlp": 1.06162107, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8685264055585812, + "language_loss": 0.64943242, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67212784, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 1.28125, + "step": 361, + "time_per_iteration": 3.1978280544281006 + }, + { + "auxiliary_loss_clip": 0.01307066, + "auxiliary_loss_mlp": 0.01088482, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.0776422, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.6839093883440213, + "language_loss": 0.78157276, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80552828, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.296875, + "step": 362, + "time_per_iteration": 2.5401153564453125 + }, + { + "auxiliary_loss_clip": 0.0131339, + "auxiliary_loss_mlp": 0.01092034, + "balance_loss_clip": 1.05171776, + "balance_loss_mlp": 1.08265781, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.163466714708112, + "language_loss": 0.92508751, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94914174, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 363, + "time_per_iteration": 2.4868171215057373 + }, + { + "auxiliary_loss_clip": 0.01307593, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_clip": 1.06270981, + "balance_loss_mlp": 1.08121252, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.137373361500905, + "language_loss": 0.89611077, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92020839, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 364, + "time_per_iteration": 2.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01313873, + "auxiliary_loss_mlp": 0.01094072, + "balance_loss_clip": 1.05232477, + "balance_loss_mlp": 1.08512843, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.0040846596101867, + "language_loss": 0.79597497, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82005441, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.28125, + "step": 365, + "time_per_iteration": 2.5358779430389404 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_clip": 1.05218291, + "balance_loss_mlp": 1.08262253, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.4198695758814126, + "language_loss": 0.84312123, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86713445, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.25, + "step": 366, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01314411, + "auxiliary_loss_mlp": 0.01089093, + "balance_loss_clip": 1.05008757, + "balance_loss_mlp": 1.08409071, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.4790438398014114, + "language_loss": 0.87009263, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89412761, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.296875, + "step": 367, + "time_per_iteration": 2.486476421356201 + }, + { + "auxiliary_loss_clip": 0.01315695, + "auxiliary_loss_mlp": 0.01094559, + "balance_loss_clip": 1.05247772, + "balance_loss_mlp": 1.08183074, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.1787846704720906, + "language_loss": 0.84725291, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87135541, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.34375, + "step": 368, + "time_per_iteration": 2.522035837173462 + }, + { + "auxiliary_loss_clip": 0.01314671, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.04559815, + "balance_loss_mlp": 1.07997978, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.205334425353566, + "language_loss": 0.75328851, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77728999, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.34375, + "step": 369, + "time_per_iteration": 2.5247385501861572 + }, + { + "auxiliary_loss_clip": 0.01309465, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.06241453, + "balance_loss_mlp": 1.08204889, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.195001895084689, + "language_loss": 0.82444763, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.84857059, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.28125, + "step": 370, + "time_per_iteration": 2.556654453277588 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.01102256, + "balance_loss_clip": 1.06186807, + "balance_loss_mlp": 1.08148122, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.701167396379405, + "language_loss": 0.81576145, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83986878, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.265625, + "step": 371, + "time_per_iteration": 2.5303707122802734 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01097647, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.08685589, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.421527930745161, + "language_loss": 0.83273733, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85685182, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 372, + "time_per_iteration": 2.528141975402832 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.01093239, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.08068216, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.9515576064335742, + "language_loss": 0.78448784, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80846798, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.234375, + "step": 373, + "time_per_iteration": 2.4879236221313477 + }, + { + "auxiliary_loss_clip": 0.01310159, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.08387947, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.577150517784044, + "language_loss": 0.77507353, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79906291, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.265625, + "step": 374, + "time_per_iteration": 2.467660665512085 + }, + { + "auxiliary_loss_clip": 0.01300907, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_clip": 1.03415811, + "balance_loss_mlp": 1.07458413, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 2.1361288872426187, + "language_loss": 0.85989249, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.8836568, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.265625, + "step": 375, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01307901, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.05767775, + "balance_loss_mlp": 1.08341241, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 5.5735447387306785, + "language_loss": 0.89170349, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91578341, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.25, + "step": 376, + "time_per_iteration": 2.53151798248291 + }, + { + "auxiliary_loss_clip": 0.01309113, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.04908752, + "balance_loss_mlp": 1.07899499, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.261190841992283, + "language_loss": 0.74947262, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77344215, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.3125, + "step": 377, + "time_per_iteration": 2.463115692138672 + }, + { + "auxiliary_loss_clip": 0.0129987, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.05009794, + "balance_loss_mlp": 1.08131123, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 9.398931100052017, + "language_loss": 0.99195766, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01586914, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 2.1875, + "step": 378, + "time_per_iteration": 2.4765851497650146 + }, + { + "auxiliary_loss_clip": 0.01180245, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.10910404, + "balance_loss_mlp": 1.06006432, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9843357397114052, + "language_loss": 0.75457036, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77759647, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.203125, + "step": 379, + "time_per_iteration": 3.113067388534546 + }, + { + "auxiliary_loss_clip": 0.01308809, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.0448581, + "balance_loss_mlp": 1.07811105, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 4.195302770466088, + "language_loss": 0.78423429, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80815697, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.3125, + "step": 380, + "time_per_iteration": 2.6457204818725586 + }, + { + "auxiliary_loss_clip": 0.01302565, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.04527259, + "balance_loss_mlp": 1.08019924, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.272240555091753, + "language_loss": 0.9679752, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99183118, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.21875, + "step": 381, + "time_per_iteration": 2.485316038131714 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.04501581, + "balance_loss_mlp": 1.08177519, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.322972014312181, + "language_loss": 0.88035834, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90432727, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.3125, + "step": 382, + "time_per_iteration": 2.5361156463623047 + }, + { + "auxiliary_loss_clip": 0.01306631, + "auxiliary_loss_mlp": 0.01099641, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.08242524, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.197151340607638, + "language_loss": 0.84830511, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87236774, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.25, + "step": 383, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.01303681, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.06673658, + "balance_loss_mlp": 1.08259249, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2992198386883116, + "language_loss": 0.83199835, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85609907, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.203125, + "step": 384, + "time_per_iteration": 2.5008413791656494 + }, + { + "auxiliary_loss_clip": 0.01303616, + "auxiliary_loss_mlp": 0.0109643, + "balance_loss_clip": 1.06030965, + "balance_loss_mlp": 1.08539534, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8570399395654076, + "language_loss": 0.89240694, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91640741, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.1875, + "step": 385, + "time_per_iteration": 2.4913859367370605 + }, + { + "auxiliary_loss_clip": 0.01306859, + "auxiliary_loss_mlp": 0.01121647, + "balance_loss_clip": 1.08397639, + "balance_loss_mlp": 1.0826149, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.2576284783670357, + "language_loss": 0.70096415, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72524917, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.234375, + "step": 386, + "time_per_iteration": 2.5017154216766357 + }, + { + "auxiliary_loss_clip": 0.01308067, + "auxiliary_loss_mlp": 0.01098351, + "balance_loss_clip": 1.06072879, + "balance_loss_mlp": 1.08460176, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9470877788533054, + "language_loss": 0.87909782, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90316188, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.234375, + "step": 387, + "time_per_iteration": 2.5142157077789307 + }, + { + "auxiliary_loss_clip": 0.01308318, + "auxiliary_loss_mlp": 0.01085815, + "balance_loss_clip": 1.04666662, + "balance_loss_mlp": 1.08291698, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.441105853176172, + "language_loss": 0.83429295, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85823429, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.25, + "step": 388, + "time_per_iteration": 2.591242790222168 + }, + { + "auxiliary_loss_clip": 0.01305661, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_clip": 1.05754054, + "balance_loss_mlp": 1.08271885, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2646980282386644, + "language_loss": 0.93823689, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96223652, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.21875, + "step": 389, + "time_per_iteration": 2.5427236557006836 + }, + { + "auxiliary_loss_clip": 0.01299094, + "auxiliary_loss_mlp": 0.01087693, + "balance_loss_clip": 1.04954624, + "balance_loss_mlp": 1.08334351, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.08298220488583, + "language_loss": 0.87901413, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90288198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.15625, + "step": 390, + "time_per_iteration": 2.53519606590271 + }, + { + "auxiliary_loss_clip": 0.01304239, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_clip": 1.05927861, + "balance_loss_mlp": 1.08334053, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.2293869448662362, + "language_loss": 0.89346433, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91746497, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.203125, + "step": 391, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01302453, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.08116579, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.022763227206087, + "language_loss": 0.86065882, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88441086, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.21875, + "step": 392, + "time_per_iteration": 4.050429105758667 + }, + { + "auxiliary_loss_clip": 0.01297975, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.08006191, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9628480690926318, + "language_loss": 0.88900077, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91284919, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.1875, + "step": 393, + "time_per_iteration": 3.9293932914733887 + }, + { + "auxiliary_loss_clip": 0.01309989, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.06449771, + "balance_loss_mlp": 1.087502, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0531375516435943, + "language_loss": 0.81400156, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83814055, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.21875, + "step": 394, + "time_per_iteration": 2.552100658416748 + }, + { + "auxiliary_loss_clip": 0.01299653, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.04611897, + "balance_loss_mlp": 1.08043575, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.0447414784698092, + "language_loss": 0.86189264, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88573563, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.1875, + "step": 395, + "time_per_iteration": 2.536823272705078 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.0590049, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9487784547172928, + "language_loss": 0.63808912, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66028047, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.15625, + "step": 396, + "time_per_iteration": 2.935506582260132 + }, + { + "auxiliary_loss_clip": 0.01296295, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_clip": 1.03252339, + "balance_loss_mlp": 1.07895613, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6168641306315172, + "language_loss": 0.83744055, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86109853, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.171875, + "step": 397, + "time_per_iteration": 2.5051028728485107 + }, + { + "auxiliary_loss_clip": 0.01302535, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05288601, + "balance_loss_mlp": 1.08300877, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.535145802301163, + "language_loss": 0.84050488, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86444056, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.1875, + "step": 398, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01300466, + "auxiliary_loss_mlp": 0.0108273, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.07864475, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.904470095612531, + "language_loss": 0.85865271, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88248467, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.21875, + "step": 399, + "time_per_iteration": 2.4674201011657715 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.08021355, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.016759933832732, + "language_loss": 0.86157769, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88546383, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.15625, + "step": 400, + "time_per_iteration": 2.554075241088867 + }, + { + "auxiliary_loss_clip": 0.01303599, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.04652512, + "balance_loss_mlp": 1.0848943, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 3.068890951588493, + "language_loss": 0.79142016, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.8152917, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.1875, + "step": 401, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01297911, + "auxiliary_loss_mlp": 0.01096359, + "balance_loss_clip": 1.05968988, + "balance_loss_mlp": 1.07987046, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.2009554384450154, + "language_loss": 0.78456193, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80850464, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.1875, + "step": 402, + "time_per_iteration": 2.5531415939331055 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.04529142, + "balance_loss_mlp": 1.07989287, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7198213535828923, + "language_loss": 0.94637424, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97023368, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 403, + "time_per_iteration": 2.4873671531677246 + }, + { + "auxiliary_loss_clip": 0.01306025, + "auxiliary_loss_mlp": 0.01095616, + "balance_loss_clip": 1.05620587, + "balance_loss_mlp": 1.07952547, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3706875621243246, + "language_loss": 0.99751151, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02152789, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 404, + "time_per_iteration": 2.5400550365448 + }, + { + "auxiliary_loss_clip": 0.01304501, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.06716657, + "balance_loss_mlp": 1.08213115, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.480197457162756, + "language_loss": 0.87603909, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90012866, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.21875, + "step": 405, + "time_per_iteration": 2.4698479175567627 + }, + { + "auxiliary_loss_clip": 0.01314075, + "auxiliary_loss_mlp": 0.01107285, + "balance_loss_clip": 1.06835127, + "balance_loss_mlp": 1.08775485, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 3.242686201363518, + "language_loss": 0.93258083, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9567945, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.265625, + "step": 406, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.01092168, + "balance_loss_clip": 1.05330622, + "balance_loss_mlp": 1.08378315, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.059728688773918, + "language_loss": 0.87446553, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89843762, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.21875, + "step": 407, + "time_per_iteration": 2.5017173290252686 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.04814506, + "balance_loss_mlp": 1.08445001, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.439524495250932, + "language_loss": 0.7404871, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76435596, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.171875, + "step": 408, + "time_per_iteration": 2.6097092628479004 + }, + { + "auxiliary_loss_clip": 0.013061, + "auxiliary_loss_mlp": 0.01096961, + "balance_loss_clip": 1.05771768, + "balance_loss_mlp": 1.08381224, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.750776221383638, + "language_loss": 0.92393035, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94796097, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.21875, + "step": 409, + "time_per_iteration": 2.5198304653167725 + }, + { + "auxiliary_loss_clip": 0.01304769, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_clip": 1.04488206, + "balance_loss_mlp": 1.0854609, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 1.9763435283924244, + "language_loss": 0.82926536, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85311788, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.1875, + "step": 410, + "time_per_iteration": 2.624333143234253 + }, + { + "auxiliary_loss_clip": 0.01307118, + "auxiliary_loss_mlp": 0.01089288, + "balance_loss_clip": 1.05164146, + "balance_loss_mlp": 1.08556843, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 4.176812441051998, + "language_loss": 0.77715993, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80112404, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.21875, + "step": 411, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01303549, + "auxiliary_loss_mlp": 0.01102238, + "balance_loss_clip": 1.06311393, + "balance_loss_mlp": 1.08078265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.1103060729449883, + "language_loss": 0.86276567, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88682353, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 412, + "time_per_iteration": 2.4968833923339844 + }, + { + "auxiliary_loss_clip": 0.01168305, + "auxiliary_loss_mlp": 0.01068817, + "balance_loss_clip": 1.05632353, + "balance_loss_mlp": 1.05478358, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8568818905087673, + "language_loss": 0.58512402, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60749531, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 1.1328125, + "step": 413, + "time_per_iteration": 3.1763217449188232 + }, + { + "auxiliary_loss_clip": 0.01296528, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.07941055, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7554792190049524, + "language_loss": 0.80704832, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83093566, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.171875, + "step": 414, + "time_per_iteration": 2.5954627990722656 + }, + { + "auxiliary_loss_clip": 0.01292737, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.07739186, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3796689224247904, + "language_loss": 0.80473328, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82859504, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.15625, + "step": 415, + "time_per_iteration": 2.471665620803833 + }, + { + "auxiliary_loss_clip": 0.0131185, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.0481931, + "balance_loss_mlp": 1.08601356, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 5.333540620494007, + "language_loss": 0.96179891, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98577416, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.25, + "step": 416, + "time_per_iteration": 2.5133068561553955 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03702867, + "balance_loss_mlp": 1.0806849, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.409464042642492, + "language_loss": 0.77541196, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79917544, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 417, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.01297091, + "auxiliary_loss_mlp": 0.01092626, + "balance_loss_clip": 1.05512297, + "balance_loss_mlp": 1.08281994, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6345521849457858, + "language_loss": 0.7689445, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 418, + "time_per_iteration": 2.6002862453460693 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.08383846, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.548681745998596, + "language_loss": 0.81088459, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83468759, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.203125, + "step": 419, + "time_per_iteration": 2.5097553730010986 + }, + { + "auxiliary_loss_clip": 0.01298642, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.0453577, + "balance_loss_mlp": 1.08236253, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9166879875817555, + "language_loss": 0.73812175, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.761962, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 2.15625, + "step": 420, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01298409, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.06086528, + "balance_loss_mlp": 1.0791508, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7246544027149788, + "language_loss": 0.78928417, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8132515, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.1875, + "step": 421, + "time_per_iteration": 2.583979845046997 + }, + { + "auxiliary_loss_clip": 0.01300301, + "auxiliary_loss_mlp": 0.01095113, + "balance_loss_clip": 1.05589294, + "balance_loss_mlp": 1.08374381, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.879256315405443, + "language_loss": 0.81915486, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84310895, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.171875, + "step": 422, + "time_per_iteration": 2.5834591388702393 + }, + { + "auxiliary_loss_clip": 0.01299282, + "auxiliary_loss_mlp": 0.01079788, + "balance_loss_clip": 1.0445497, + "balance_loss_mlp": 1.07925105, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9426129656279463, + "language_loss": 0.83468062, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85847133, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.203125, + "step": 423, + "time_per_iteration": 2.5526318550109863 + }, + { + "auxiliary_loss_clip": 0.01293361, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.04978371, + "balance_loss_mlp": 1.07668817, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.7010989411926367, + "language_loss": 0.74435121, + "learning_rate": 3.895134094768415e-06, + "loss": 0.768152, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.171875, + "step": 424, + "time_per_iteration": 2.606895923614502 + }, + { + "auxiliary_loss_clip": 0.01303473, + "auxiliary_loss_mlp": 0.01097188, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.08349586, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.227147445366898, + "language_loss": 0.83008313, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85408974, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.203125, + "step": 425, + "time_per_iteration": 2.522517442703247 + }, + { + "auxiliary_loss_clip": 0.01299491, + "auxiliary_loss_mlp": 0.01096328, + "balance_loss_clip": 1.05691719, + "balance_loss_mlp": 1.07528758, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.394258070540652, + "language_loss": 0.85481966, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87877786, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.25, + "step": 426, + "time_per_iteration": 2.5039095878601074 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.04526472, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8962322500302954, + "language_loss": 0.57186544, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5939464, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 1.1484375, + "step": 427, + "time_per_iteration": 3.2289342880249023 + }, + { + "auxiliary_loss_clip": 0.01297452, + "auxiliary_loss_mlp": 0.01095521, + "balance_loss_clip": 1.05849457, + "balance_loss_mlp": 1.0838623, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6536896946259816, + "language_loss": 0.88190198, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90583158, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.125, + "step": 428, + "time_per_iteration": 2.500389814376831 + }, + { + "auxiliary_loss_clip": 0.01290417, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.07718623, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6692033855414803, + "language_loss": 0.85672665, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88041949, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.125, + "step": 429, + "time_per_iteration": 2.605687379837036 + }, + { + "auxiliary_loss_clip": 0.01297427, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_clip": 1.04373491, + "balance_loss_mlp": 1.07673144, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.5023850128037672, + "language_loss": 0.88384748, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90764678, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.203125, + "step": 430, + "time_per_iteration": 2.593492269515991 + }, + { + "auxiliary_loss_clip": 0.01298542, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.04799962, + "balance_loss_mlp": 1.08428442, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9811912271744876, + "language_loss": 0.84202254, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86584389, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.140625, + "step": 431, + "time_per_iteration": 2.5326902866363525 + }, + { + "auxiliary_loss_clip": 0.01291302, + "auxiliary_loss_mlp": 0.01073914, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0772872, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.686150654607635, + "language_loss": 0.86775959, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89141178, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.140625, + "step": 432, + "time_per_iteration": 2.4793269634246826 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.04491723, + "balance_loss_mlp": 1.08109105, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.6953453355349684, + "language_loss": 0.76074433, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78451484, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.15625, + "step": 433, + "time_per_iteration": 2.6125545501708984 + }, + { + "auxiliary_loss_clip": 0.01296292, + "auxiliary_loss_mlp": 0.0109282, + "balance_loss_clip": 1.05312383, + "balance_loss_mlp": 1.07772529, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.2540618473103247, + "language_loss": 0.89764363, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153478, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.1875, + "step": 434, + "time_per_iteration": 5.3097922801971436 + }, + { + "auxiliary_loss_clip": 0.01297376, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.06404209, + "balance_loss_mlp": 1.08362865, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 6.328317132251919, + "language_loss": 0.7985189, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82252169, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 435, + "time_per_iteration": 3.9629530906677246 + }, + { + "auxiliary_loss_clip": 0.01291104, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.0750463, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.559504815450524, + "language_loss": 0.86357677, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739926, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.15625, + "step": 436, + "time_per_iteration": 2.479033946990967 + }, + { + "auxiliary_loss_clip": 0.01296325, + "auxiliary_loss_mlp": 0.01099771, + "balance_loss_clip": 1.06214869, + "balance_loss_mlp": 1.07964039, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.6168892141891944, + "language_loss": 0.75002837, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77398932, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.171875, + "step": 437, + "time_per_iteration": 2.508769989013672 + }, + { + "auxiliary_loss_clip": 0.01293849, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.08015561, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.3031145987765758, + "language_loss": 0.91467845, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93865746, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.140625, + "step": 438, + "time_per_iteration": 2.4693844318389893 + }, + { + "auxiliary_loss_clip": 0.01155458, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_clip": 1.05276346, + "balance_loss_mlp": 1.0448494, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.877669139368542, + "language_loss": 0.62577796, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64797509, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 1.109375, + "step": 439, + "time_per_iteration": 3.162259101867676 + }, + { + "auxiliary_loss_clip": 0.01303989, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_clip": 1.05873275, + "balance_loss_mlp": 1.08440769, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1384369611317493, + "language_loss": 0.75629139, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78031218, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.203125, + "step": 440, + "time_per_iteration": 2.5541677474975586 + }, + { + "auxiliary_loss_clip": 0.01294139, + "auxiliary_loss_mlp": 0.01082398, + "balance_loss_clip": 1.04408443, + "balance_loss_mlp": 1.08003163, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9583565981573345, + "language_loss": 0.83186466, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85563004, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 441, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01298235, + "auxiliary_loss_mlp": 0.01092726, + "balance_loss_clip": 1.05286217, + "balance_loss_mlp": 1.07855892, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.035076381127293, + "language_loss": 0.7850582, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80896777, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.203125, + "step": 442, + "time_per_iteration": 2.477555990219116 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01012445, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.04045749, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9584767110468104, + "language_loss": 0.64475185, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66633147, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 1.046875, + "step": 443, + "time_per_iteration": 2.9838714599609375 + }, + { + "auxiliary_loss_clip": 0.01297944, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.04941845, + "balance_loss_mlp": 1.08318424, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.4335650573352483, + "language_loss": 0.82707053, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85092688, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 444, + "time_per_iteration": 2.4520323276519775 + }, + { + "auxiliary_loss_clip": 0.0130195, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0440464, + "balance_loss_mlp": 1.08103406, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.6903851096875733, + "language_loss": 0.95400113, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97787213, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 445, + "time_per_iteration": 2.5113518238067627 + }, + { + "auxiliary_loss_clip": 0.01296406, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.08177555, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.416617421630428, + "language_loss": 0.91790259, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94183153, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.15625, + "step": 446, + "time_per_iteration": 2.4585111141204834 + }, + { + "auxiliary_loss_clip": 0.01293099, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.04718637, + "balance_loss_mlp": 1.08102632, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.3983095061811635, + "language_loss": 0.80024058, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82402921, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 447, + "time_per_iteration": 2.509643316268921 + }, + { + "auxiliary_loss_clip": 0.01292768, + "auxiliary_loss_mlp": 0.01072511, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.07935369, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.4579217038825423, + "language_loss": 0.86773896, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89139175, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 448, + "time_per_iteration": 2.477384328842163 + }, + { + "auxiliary_loss_clip": 0.01287268, + "auxiliary_loss_mlp": 0.01093327, + "balance_loss_clip": 1.0583508, + "balance_loss_mlp": 1.07870793, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.1426472419274503, + "language_loss": 0.88779259, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91159856, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.078125, + "step": 449, + "time_per_iteration": 2.50108003616333 + }, + { + "auxiliary_loss_clip": 0.01298718, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.08056545, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9975703664508544, + "language_loss": 0.80516291, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82902944, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 450, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.01291132, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.08217299, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.7768383062811637, + "language_loss": 0.81500483, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83869088, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.09375, + "step": 451, + "time_per_iteration": 2.530539035797119 + }, + { + "auxiliary_loss_clip": 0.01289442, + "auxiliary_loss_mlp": 0.0109125, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.08151317, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.5925691418309382, + "language_loss": 0.76994318, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79375011, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.078125, + "step": 452, + "time_per_iteration": 2.5138871669769287 + }, + { + "auxiliary_loss_clip": 0.01292925, + "auxiliary_loss_mlp": 0.01088314, + "balance_loss_clip": 1.0507158, + "balance_loss_mlp": 1.08201516, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.9334646917545748, + "language_loss": 0.73053265, + "learning_rate": 3.937730499067294e-06, + "loss": 0.754345, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.109375, + "step": 453, + "time_per_iteration": 2.5271401405334473 + }, + { + "auxiliary_loss_clip": 0.01288113, + "auxiliary_loss_mlp": 0.01086026, + "balance_loss_clip": 1.04952383, + "balance_loss_mlp": 1.08018303, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.845498968311748, + "language_loss": 0.82439983, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84814119, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 454, + "time_per_iteration": 2.6724069118499756 + }, + { + "auxiliary_loss_clip": 0.01290287, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_clip": 1.04491115, + "balance_loss_mlp": 1.0808264, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.1414002490484005, + "language_loss": 0.75815403, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78184646, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 2.09375, + "step": 455, + "time_per_iteration": 2.496913194656372 + }, + { + "auxiliary_loss_clip": 0.01290624, + "auxiliary_loss_mlp": 0.01097119, + "balance_loss_clip": 1.06114161, + "balance_loss_mlp": 1.07846022, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.102028743174525, + "language_loss": 0.80576169, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82963914, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 456, + "time_per_iteration": 2.4748263359069824 + }, + { + "auxiliary_loss_clip": 0.01286184, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_clip": 1.04152811, + "balance_loss_mlp": 1.07863176, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.479828414472028, + "language_loss": 0.81621009, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83985978, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 457, + "time_per_iteration": 2.5122945308685303 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.07828617, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0492464691581476, + "language_loss": 0.94062889, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96436661, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.109375, + "step": 458, + "time_per_iteration": 2.542919874191284 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.01093849, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.07926297, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4293190258203774, + "language_loss": 0.79353511, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81735277, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.09375, + "step": 459, + "time_per_iteration": 2.472830295562744 + }, + { + "auxiliary_loss_clip": 0.01293203, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.08543491, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8472887331493792, + "language_loss": 0.83103061, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85478914, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.078125, + "step": 460, + "time_per_iteration": 2.5376338958740234 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.03488147, + "balance_loss_mlp": 1.03798664, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5738760379538346, + "language_loss": 0.73565412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7574963, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08837891, + "router_z_loss_mlp": 1.0234375, + "step": 461, + "time_per_iteration": 3.0358285903930664 + }, + { + "auxiliary_loss_clip": 0.01289208, + "auxiliary_loss_mlp": 0.01081781, + "balance_loss_clip": 1.04735351, + "balance_loss_mlp": 1.086905, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.85425781388422, + "language_loss": 0.81291741, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83662736, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.015625, + "step": 462, + "time_per_iteration": 2.6079564094543457 + }, + { + "auxiliary_loss_clip": 0.01287586, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_clip": 1.04096127, + "balance_loss_mlp": 1.08167982, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2822341634579195, + "language_loss": 0.90235889, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92597055, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0625, + "step": 463, + "time_per_iteration": 2.4881155490875244 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01014393, + "balance_loss_clip": 1.00561893, + "balance_loss_mlp": 1.03824747, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8835585057209928, + "language_loss": 0.59031862, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61183739, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.9921875, + "step": 464, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.01299905, + "auxiliary_loss_mlp": 0.01097461, + "balance_loss_clip": 1.06081581, + "balance_loss_mlp": 1.08716702, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.8663863440598525, + "language_loss": 0.81203198, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83600569, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.125, + "step": 465, + "time_per_iteration": 2.5197718143463135 + }, + { + "auxiliary_loss_clip": 0.01286546, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.08028877, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.004656273762408, + "language_loss": 0.78560221, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80929601, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.0625, + "step": 466, + "time_per_iteration": 2.5151565074920654 + }, + { + "auxiliary_loss_clip": 0.01285777, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.05075812, + "balance_loss_mlp": 1.0816046, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.05931728393333, + "language_loss": 0.87548482, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89919734, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.03125, + "step": 467, + "time_per_iteration": 2.4994542598724365 + }, + { + "auxiliary_loss_clip": 0.01289137, + "auxiliary_loss_mlp": 0.01106554, + "balance_loss_clip": 1.06969416, + "balance_loss_mlp": 1.08202362, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.728881931821799, + "language_loss": 0.86217642, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88613331, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.0625, + "step": 468, + "time_per_iteration": 2.482377767562866 + }, + { + "auxiliary_loss_clip": 0.01287545, + "auxiliary_loss_mlp": 0.01081999, + "balance_loss_clip": 1.0447104, + "balance_loss_mlp": 1.07984936, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 3.6924571591440762, + "language_loss": 0.91605878, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.93975413, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 469, + "time_per_iteration": 2.471510648727417 + }, + { + "auxiliary_loss_clip": 0.01286876, + "auxiliary_loss_mlp": 0.01096778, + "balance_loss_clip": 1.06106234, + "balance_loss_mlp": 1.08290672, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 8.38112094971343, + "language_loss": 0.81587195, + "learning_rate": 3.96145038000181e-06, + "loss": 0.83970851, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 470, + "time_per_iteration": 2.5398614406585693 + }, + { + "auxiliary_loss_clip": 0.01286572, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.07859015, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8437898933227894, + "language_loss": 0.93147206, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551928, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.078125, + "step": 471, + "time_per_iteration": 2.5005030632019043 + }, + { + "auxiliary_loss_clip": 0.0128173, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.05885458, + "balance_loss_mlp": 1.07808042, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.89303735573371, + "language_loss": 0.757568, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78133243, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 472, + "time_per_iteration": 2.597637176513672 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.07699013, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 3.986951446490631, + "language_loss": 0.93354845, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95722055, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.125, + "step": 473, + "time_per_iteration": 2.4882545471191406 + }, + { + "auxiliary_loss_clip": 0.01293922, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.08134401, + "balance_loss_mlp": 1.08149064, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.845992674029067, + "language_loss": 0.88586211, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90995455, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.125, + "step": 474, + "time_per_iteration": 2.483210563659668 + }, + { + "auxiliary_loss_clip": 0.01284496, + "auxiliary_loss_mlp": 0.01091761, + "balance_loss_clip": 1.05559278, + "balance_loss_mlp": 1.07983565, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.474550917046853, + "language_loss": 0.78771299, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81147563, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.046875, + "step": 475, + "time_per_iteration": 2.5462486743927 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_clip": 1.06647348, + "balance_loss_mlp": 1.03907108, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9304884927077405, + "language_loss": 0.66880804, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6909551, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08935547, + "router_z_loss_mlp": 1.0, + "step": 476, + "time_per_iteration": 5.8287513256073 + }, + { + "auxiliary_loss_clip": 0.01286666, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.0796659, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.9569520931335775, + "language_loss": 0.83852398, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86220837, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 477, + "time_per_iteration": 2.5179195404052734 + }, + { + "auxiliary_loss_clip": 0.01293161, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.05164671, + "balance_loss_mlp": 1.08298135, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.2048636254017504, + "language_loss": 0.82267237, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84648502, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.09375, + "step": 478, + "time_per_iteration": 2.495760679244995 + }, + { + "auxiliary_loss_clip": 0.01283274, + "auxiliary_loss_mlp": 0.01076252, + "balance_loss_clip": 1.0409658, + "balance_loss_mlp": 1.07707858, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.28603697529264, + "language_loss": 0.81010443, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8336997, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 2.0625, + "step": 479, + "time_per_iteration": 2.491910934448242 + }, + { + "auxiliary_loss_clip": 0.01281719, + "auxiliary_loss_mlp": 0.01080307, + "balance_loss_clip": 1.04323328, + "balance_loss_mlp": 1.07729793, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.2385690137770715, + "language_loss": 0.73465097, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75827128, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.03125, + "step": 480, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.01280408, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_clip": 1.03945768, + "balance_loss_mlp": 1.07837129, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.6612342828976938, + "language_loss": 0.87719476, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90071172, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 2.03125, + "step": 481, + "time_per_iteration": 2.534792184829712 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.01367593, + "balance_loss_mlp": 1.03470159, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8259666239631118, + "language_loss": 0.66064727, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68227088, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 1.046875, + "step": 482, + "time_per_iteration": 2.8219997882843018 + }, + { + "auxiliary_loss_clip": 0.01295379, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.04014635, + "balance_loss_mlp": 1.08159328, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.373570732629757, + "language_loss": 0.78743541, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81112754, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.140625, + "step": 483, + "time_per_iteration": 2.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01293434, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.0548625, + "balance_loss_mlp": 1.08311069, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.520023812901894, + "language_loss": 0.75405324, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77789688, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.109375, + "step": 484, + "time_per_iteration": 2.466634750366211 + }, + { + "auxiliary_loss_clip": 0.01288089, + "auxiliary_loss_mlp": 0.01078618, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.08002305, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0323982063196153, + "language_loss": 0.84021544, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86388254, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.078125, + "step": 485, + "time_per_iteration": 2.511415719985962 + }, + { + "auxiliary_loss_clip": 0.01293039, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.08659554, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 1.9066132168030567, + "language_loss": 0.84465218, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86840165, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 2.0625, + "step": 486, + "time_per_iteration": 2.453583002090454 + }, + { + "auxiliary_loss_clip": 0.01284719, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.07841349, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.9228432408219163, + "language_loss": 0.8891986, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91288453, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.0625, + "step": 487, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0128758, + "auxiliary_loss_mlp": 0.01070867, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.08095598, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5260996981700456, + "language_loss": 0.87981069, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90339512, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0625, + "step": 488, + "time_per_iteration": 2.5299952030181885 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01079627, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.07794333, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1862911790042543, + "language_loss": 0.88956475, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9131943, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.046875, + "step": 489, + "time_per_iteration": 2.545240879058838 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01078157, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.07402337, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.0397830948196756, + "language_loss": 0.88539088, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90894926, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.03125, + "step": 490, + "time_per_iteration": 2.4727838039398193 + }, + { + "auxiliary_loss_clip": 0.01284238, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.07731342, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.230679327742206, + "language_loss": 0.91299963, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93665713, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 2.0625, + "step": 491, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01274874, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.0749476, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.419480988494796, + "language_loss": 0.85232413, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87577969, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0, + "step": 492, + "time_per_iteration": 2.457188844680786 + }, + { + "auxiliary_loss_clip": 0.0128558, + "auxiliary_loss_mlp": 0.01093772, + "balance_loss_clip": 1.05939209, + "balance_loss_mlp": 1.08082771, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.826333733481051, + "language_loss": 0.83989829, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86369187, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.046875, + "step": 493, + "time_per_iteration": 2.4821553230285645 + }, + { + "auxiliary_loss_clip": 0.01280126, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_clip": 1.04586005, + "balance_loss_mlp": 1.07578444, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8259196989393787, + "language_loss": 0.86575663, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88934839, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 494, + "time_per_iteration": 2.507068395614624 + }, + { + "auxiliary_loss_clip": 0.01286409, + "auxiliary_loss_mlp": 0.01082408, + "balance_loss_clip": 1.05084157, + "balance_loss_mlp": 1.07973599, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 4.414490317498679, + "language_loss": 0.86250752, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88619578, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.0625, + "step": 495, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.01274095, + "auxiliary_loss_mlp": 0.0107342, + "balance_loss_clip": 1.04144859, + "balance_loss_mlp": 1.07653904, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.893732744603442, + "language_loss": 0.6230706, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64654577, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9765625, + "step": 496, + "time_per_iteration": 2.499669313430786 + }, + { + "auxiliary_loss_clip": 0.01276388, + "auxiliary_loss_mlp": 0.01085353, + "balance_loss_clip": 1.05314219, + "balance_loss_mlp": 1.07830799, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.8423417765009742, + "language_loss": 0.88582325, + "learning_rate": 3.997414244783595e-06, + "loss": 0.90944064, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.984375, + "step": 497, + "time_per_iteration": 2.5570924282073975 + }, + { + "auxiliary_loss_clip": 0.01282787, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.07822609, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.4064142479622377, + "language_loss": 0.85174376, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87537515, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 498, + "time_per_iteration": 2.513601541519165 + }, + { + "auxiliary_loss_clip": 0.01281177, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.07829463, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 37.23719619981942, + "language_loss": 0.78152531, + "learning_rate": 4e-06, + "loss": 0.80516517, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 499, + "time_per_iteration": 2.4924824237823486 + }, + { + "auxiliary_loss_clip": 0.01282354, + "auxiliary_loss_mlp": 0.01080564, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.08037949, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.687829420060643, + "language_loss": 0.8271451, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85077423, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.015625, + "step": 500, + "time_per_iteration": 2.494333028793335 + }, + { + "auxiliary_loss_clip": 0.01274571, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.07541978, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.6096117253121447, + "language_loss": 0.88464928, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90823889, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.9921875, + "step": 501, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.01283018, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.04158127, + "balance_loss_mlp": 1.07912767, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.304054979465899, + "language_loss": 0.86586684, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88942778, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 502, + "time_per_iteration": 2.4574413299560547 + }, + { + "auxiliary_loss_clip": 0.01278734, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.07952762, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6244890775354976, + "language_loss": 0.84661186, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87017757, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9921875, + "step": 503, + "time_per_iteration": 2.4406938552856445 + }, + { + "auxiliary_loss_clip": 0.0127278, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.07727659, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6755724800263092, + "language_loss": 0.88215417, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90570992, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 504, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.01274883, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05556226, + "balance_loss_mlp": 1.07692564, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.2080583468347, + "language_loss": 0.78446162, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9765625, + "step": 505, + "time_per_iteration": 2.4724690914154053 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.00927854, + "balance_loss_mlp": 1.04092085, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8461866637376847, + "language_loss": 0.55057126, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57211095, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 0.9453125, + "step": 506, + "time_per_iteration": 3.2490124702453613 + }, + { + "auxiliary_loss_clip": 0.01274292, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.0756762, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.9034614277572226, + "language_loss": 0.83767861, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8612929, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 507, + "time_per_iteration": 2.48811674118042 + }, + { + "auxiliary_loss_clip": 0.01280318, + "auxiliary_loss_mlp": 0.01080114, + "balance_loss_clip": 1.04778421, + "balance_loss_mlp": 1.07709789, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5950154193771526, + "language_loss": 0.88689649, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91050076, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 508, + "time_per_iteration": 2.4966533184051514 + }, + { + "auxiliary_loss_clip": 0.01281637, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.07728887, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2339008285543227, + "language_loss": 0.71499902, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73845309, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 509, + "time_per_iteration": 2.5966317653656006 + }, + { + "auxiliary_loss_clip": 0.01274736, + "auxiliary_loss_mlp": 0.01072718, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.07770133, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.064360756351981, + "language_loss": 0.82369828, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8471728, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9765625, + "step": 510, + "time_per_iteration": 2.5276355743408203 + }, + { + "auxiliary_loss_clip": 0.01280977, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.04984498, + "balance_loss_mlp": 1.08235979, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.1614325499153693, + "language_loss": 0.83621502, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85985172, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 511, + "time_per_iteration": 2.503779888153076 + }, + { + "auxiliary_loss_clip": 0.01278507, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.07648492, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1059740170821515, + "language_loss": 0.82234836, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8459124, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 512, + "time_per_iteration": 2.5306975841522217 + }, + { + "auxiliary_loss_clip": 0.01276149, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0769974, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.9256325141107502, + "language_loss": 0.87030005, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89384103, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.9921875, + "step": 513, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01281572, + "auxiliary_loss_mlp": 0.01080973, + "balance_loss_clip": 1.04840553, + "balance_loss_mlp": 1.07869625, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.202753983864072, + "language_loss": 0.79141152, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81503695, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 514, + "time_per_iteration": 2.515496015548706 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.01063014, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.07966864, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.5461002634459216, + "language_loss": 0.77459693, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79799432, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 515, + "time_per_iteration": 2.481903553009033 + }, + { + "auxiliary_loss_clip": 0.01272098, + "auxiliary_loss_mlp": 0.0106896, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.07318711, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.901518391780262, + "language_loss": 0.82729101, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85070157, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9921875, + "step": 516, + "time_per_iteration": 2.699577808380127 + }, + { + "auxiliary_loss_clip": 0.01272185, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_clip": 1.03760433, + "balance_loss_mlp": 1.07659435, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.071844032637654, + "language_loss": 0.79009813, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81352293, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 517, + "time_per_iteration": 4.0190205574035645 + }, + { + "auxiliary_loss_clip": 0.01269009, + "auxiliary_loss_mlp": 0.01072314, + "balance_loss_clip": 1.04069996, + "balance_loss_mlp": 1.07610774, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.58218863781409, + "language_loss": 0.90778029, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93119347, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9296875, + "step": 518, + "time_per_iteration": 4.080751657485962 + }, + { + "auxiliary_loss_clip": 0.0128372, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.08518016, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 3.008779144342936, + "language_loss": 0.86396456, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88773847, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.984375, + "step": 519, + "time_per_iteration": 2.510267734527588 + }, + { + "auxiliary_loss_clip": 0.01278708, + "auxiliary_loss_mlp": 0.01092513, + "balance_loss_clip": 1.06092215, + "balance_loss_mlp": 1.07567024, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0313723427087216, + "language_loss": 0.87156898, + "learning_rate": 3.999983277259057e-06, + "loss": 0.8952812, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 520, + "time_per_iteration": 2.4891066551208496 + }, + { + "auxiliary_loss_clip": 0.01281744, + "auxiliary_loss_mlp": 0.01089643, + "balance_loss_clip": 1.05633557, + "balance_loss_mlp": 1.07832289, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.6802829394342778, + "language_loss": 0.89362079, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91733468, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.03125, + "step": 521, + "time_per_iteration": 2.508524179458618 + }, + { + "auxiliary_loss_clip": 0.01274208, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.07795191, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.273639697525746, + "language_loss": 0.71327078, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73684484, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9609375, + "step": 522, + "time_per_iteration": 2.49629282951355 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.05150533, + "balance_loss_mlp": 1.07655358, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1208656196394706, + "language_loss": 0.84886295, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87248302, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.015625, + "step": 523, + "time_per_iteration": 2.4674315452575684 + }, + { + "auxiliary_loss_clip": 0.01280597, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.04249442, + "balance_loss_mlp": 1.07655168, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 1.9693639011355857, + "language_loss": 0.90419745, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92775881, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.046875, + "step": 524, + "time_per_iteration": 2.480764627456665 + }, + { + "auxiliary_loss_clip": 0.01285248, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.05034757, + "balance_loss_mlp": 1.08102393, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4392367222760276, + "language_loss": 0.80040443, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8240968, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.046875, + "step": 525, + "time_per_iteration": 2.5409629344940186 + }, + { + "auxiliary_loss_clip": 0.01277675, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.05025804, + "balance_loss_mlp": 1.07571197, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.8378410017413658, + "language_loss": 0.80693865, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83054531, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.03125, + "step": 526, + "time_per_iteration": 2.4509081840515137 + }, + { + "auxiliary_loss_clip": 0.01285808, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.03655052, + "balance_loss_mlp": 1.08127069, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.27970800213601, + "language_loss": 0.81417823, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83775997, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.046875, + "step": 527, + "time_per_iteration": 2.4760756492614746 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01080634, + "balance_loss_clip": 1.04651666, + "balance_loss_mlp": 1.07408452, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.59751390244888, + "language_loss": 0.93932182, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96286595, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.0, + "step": 528, + "time_per_iteration": 2.4721155166625977 + }, + { + "auxiliary_loss_clip": 0.01273884, + "auxiliary_loss_mlp": 0.01073354, + "balance_loss_clip": 1.04083371, + "balance_loss_mlp": 1.07427406, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8844039207994492, + "language_loss": 0.84143054, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86490291, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 529, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.01278919, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.08254409, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.130233453276154, + "language_loss": 0.90547037, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92913085, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.96875, + "step": 530, + "time_per_iteration": 2.5096359252929688 + }, + { + "auxiliary_loss_clip": 0.0127291, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07199419, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.12169085676626, + "language_loss": 0.76197046, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78543139, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.015625, + "step": 531, + "time_per_iteration": 2.503265142440796 + }, + { + "auxiliary_loss_clip": 0.01272973, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.03030038, + "balance_loss_mlp": 1.07424712, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.621085079916904, + "language_loss": 0.9073056, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9306798, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 532, + "time_per_iteration": 2.506220817565918 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01010615, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0428524, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7306749876416057, + "language_loss": 0.57931173, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60079145, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.08642578, + "router_z_loss_mlp": 0.9453125, + "step": 533, + "time_per_iteration": 3.154953956604004 + }, + { + "auxiliary_loss_clip": 0.01271016, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.07378936, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8972625930530718, + "language_loss": 0.86725944, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89081717, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.96875, + "step": 534, + "time_per_iteration": 2.5384750366210938 + }, + { + "auxiliary_loss_clip": 0.01271847, + "auxiliary_loss_mlp": 0.01066511, + "balance_loss_clip": 1.03468204, + "balance_loss_mlp": 1.07573223, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.118212102173022, + "language_loss": 0.77352351, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79690707, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.9609375, + "step": 535, + "time_per_iteration": 2.517940044403076 + }, + { + "auxiliary_loss_clip": 0.01274503, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_clip": 1.05151725, + "balance_loss_mlp": 1.07644773, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.176836888233088, + "language_loss": 0.8074764, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83105373, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.984375, + "step": 536, + "time_per_iteration": 2.546128034591675 + }, + { + "auxiliary_loss_clip": 0.01275643, + "auxiliary_loss_mlp": 0.01077633, + "balance_loss_clip": 1.04361033, + "balance_loss_mlp": 1.07698941, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.3353202427960627, + "language_loss": 0.70118421, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72471696, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 537, + "time_per_iteration": 2.578101634979248 + }, + { + "auxiliary_loss_clip": 0.01274556, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.04877353, + "balance_loss_mlp": 1.08040798, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1000918694055044, + "language_loss": 0.8250435, + "learning_rate": 3.999942323804607e-06, + "loss": 0.84860539, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9375, + "step": 538, + "time_per_iteration": 2.4822683334350586 + }, + { + "auxiliary_loss_clip": 0.01280793, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_clip": 1.0458765, + "balance_loss_mlp": 1.0775007, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8128048759039839, + "language_loss": 0.78999949, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81359327, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 539, + "time_per_iteration": 2.5495705604553223 + }, + { + "auxiliary_loss_clip": 0.01274183, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_clip": 1.03284597, + "balance_loss_mlp": 1.0766232, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.6651388031929835, + "language_loss": 0.77802742, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80143911, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.9765625, + "step": 540, + "time_per_iteration": 2.5547144412994385 + }, + { + "auxiliary_loss_clip": 0.01282159, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.08122253, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.2422114385304845, + "language_loss": 0.85410464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8776263, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 541, + "time_per_iteration": 2.517545700073242 + }, + { + "auxiliary_loss_clip": 0.01271503, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.07759655, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.210152212848466, + "language_loss": 0.89072484, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91427547, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9375, + "step": 542, + "time_per_iteration": 2.437566041946411 + }, + { + "auxiliary_loss_clip": 0.01272694, + "auxiliary_loss_mlp": 0.01075801, + "balance_loss_clip": 1.04289961, + "balance_loss_mlp": 1.07649362, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3494598042187236, + "language_loss": 0.71096039, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73444533, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9609375, + "step": 543, + "time_per_iteration": 2.5121288299560547 + }, + { + "auxiliary_loss_clip": 0.0127171, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.07139826, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.6617228213889375, + "language_loss": 0.91273057, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93631637, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0, + "step": 544, + "time_per_iteration": 2.529536008834839 + }, + { + "auxiliary_loss_clip": 0.01274727, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.07790041, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.144073602630947, + "language_loss": 0.6640051, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68757957, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 545, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01272187, + "auxiliary_loss_mlp": 0.01069604, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.07393909, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.6288964335615805, + "language_loss": 0.91857421, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94199216, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.984375, + "step": 546, + "time_per_iteration": 2.4893922805786133 + }, + { + "auxiliary_loss_clip": 0.0126813, + "auxiliary_loss_mlp": 0.01071134, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.07095337, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.4455611041839127, + "language_loss": 0.82002354, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84341609, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 547, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01070995, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.07550538, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.015836198351779, + "language_loss": 0.80919325, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83261865, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9609375, + "step": 548, + "time_per_iteration": 2.501983404159546 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01079421, + "balance_loss_clip": 1.04499304, + "balance_loss_mlp": 1.07411838, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.9904289991591217, + "language_loss": 0.67330974, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69681287, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 549, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01265753, + "auxiliary_loss_mlp": 0.01075673, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.07537639, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.081726350608672, + "language_loss": 0.86137938, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88479364, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.90625, + "step": 550, + "time_per_iteration": 2.435030221939087 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01089379, + "balance_loss_clip": 1.05712056, + "balance_loss_mlp": 1.07876444, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.0024940554917534, + "language_loss": 0.81302834, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83663994, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9296875, + "step": 551, + "time_per_iteration": 2.474317789077759 + }, + { + "auxiliary_loss_clip": 0.01278525, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.0786469, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.5540153370218697, + "language_loss": 0.85907811, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88266373, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.0, + "step": 552, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.01276099, + "auxiliary_loss_mlp": 0.01077197, + "balance_loss_clip": 1.0428648, + "balance_loss_mlp": 1.07894135, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3148388677976253, + "language_loss": 0.928128, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95166099, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 553, + "time_per_iteration": 2.4860291481018066 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.01072703, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.0755136, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.234190064541142, + "language_loss": 0.78874755, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81218415, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.953125, + "step": 554, + "time_per_iteration": 2.4878416061401367 + }, + { + "auxiliary_loss_clip": 0.0126611, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_clip": 1.04838455, + "balance_loss_mlp": 1.07417822, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1365458646452424, + "language_loss": 0.82297659, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9140625, + "step": 555, + "time_per_iteration": 2.4846394062042236 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.01075464, + "balance_loss_clip": 1.04156113, + "balance_loss_mlp": 1.07390678, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.781828445596944, + "language_loss": 0.88624835, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90970379, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 556, + "time_per_iteration": 2.5788674354553223 + }, + { + "auxiliary_loss_clip": 0.01269545, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.04531527, + "balance_loss_mlp": 1.07534254, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0860752820949586, + "language_loss": 0.83492053, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85840911, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9375, + "step": 557, + "time_per_iteration": 2.5352954864501953 + }, + { + "auxiliary_loss_clip": 0.01275093, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.07979858, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 9.145612151583265, + "language_loss": 0.94169575, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96511185, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.953125, + "step": 558, + "time_per_iteration": 2.4541964530944824 + }, + { + "auxiliary_loss_clip": 0.01265501, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.07178497, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.48174106566098, + "language_loss": 0.7735827, + "learning_rate": 3.99986348919176e-06, + "loss": 0.7969684, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9375, + "step": 559, + "time_per_iteration": 5.362890005111694 + }, + { + "auxiliary_loss_clip": 0.01268387, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.07386613, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.071149038386511, + "language_loss": 0.87681198, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90028548, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.953125, + "step": 560, + "time_per_iteration": 3.9536426067352295 + }, + { + "auxiliary_loss_clip": 0.01264547, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.07323277, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2284071587683463, + "language_loss": 0.81380183, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83712727, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9140625, + "step": 561, + "time_per_iteration": 2.49826717376709 + }, + { + "auxiliary_loss_clip": 0.01263917, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.07403696, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7768341081574646, + "language_loss": 0.82018232, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84353203, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.90625, + "step": 562, + "time_per_iteration": 2.503990888595581 + }, + { + "auxiliary_loss_clip": 0.01269896, + "auxiliary_loss_mlp": 0.01075498, + "balance_loss_clip": 1.04352641, + "balance_loss_mlp": 1.07592142, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.966221896086353, + "language_loss": 0.84028983, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86374378, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9375, + "step": 563, + "time_per_iteration": 2.464571952819824 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.04866886, + "balance_loss_mlp": 1.07648492, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.359913311978066, + "language_loss": 0.94194812, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96543193, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.921875, + "step": 564, + "time_per_iteration": 2.423762798309326 + }, + { + "auxiliary_loss_clip": 0.01267204, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.07225537, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.7666153248687277, + "language_loss": 0.94089758, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96426964, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.953125, + "step": 565, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01714468, + "balance_loss_mlp": 1.04934859, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1198796781785882, + "language_loss": 0.54823005, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.569884, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.91015625, + "step": 566, + "time_per_iteration": 3.1322038173675537 + }, + { + "auxiliary_loss_clip": 0.01270043, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.0753262, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.6603630269915683, + "language_loss": 0.76780868, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79123116, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.9453125, + "step": 567, + "time_per_iteration": 2.5351951122283936 + }, + { + "auxiliary_loss_clip": 0.01261299, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.04809463, + "balance_loss_mlp": 1.07400167, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 4.563520524929296, + "language_loss": 0.80796623, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83135819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.875, + "step": 568, + "time_per_iteration": 2.558093309402466 + }, + { + "auxiliary_loss_clip": 0.01263323, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.04836476, + "balance_loss_mlp": 1.07628214, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.809578126153619, + "language_loss": 0.86777622, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89120281, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.875, + "step": 569, + "time_per_iteration": 2.500319719314575 + }, + { + "auxiliary_loss_clip": 0.01264002, + "auxiliary_loss_mlp": 0.01073079, + "balance_loss_clip": 1.04227519, + "balance_loss_mlp": 1.07425416, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8164454228173497, + "language_loss": 0.95802778, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98139858, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.8984375, + "step": 570, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.01264689, + "auxiliary_loss_mlp": 0.01080759, + "balance_loss_clip": 1.04733253, + "balance_loss_mlp": 1.07053721, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.217921822086313, + "language_loss": 0.79522127, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81867576, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9375, + "step": 571, + "time_per_iteration": 2.48317813873291 + }, + { + "auxiliary_loss_clip": 0.01265335, + "auxiliary_loss_mlp": 0.01076969, + "balance_loss_clip": 1.04490221, + "balance_loss_mlp": 1.07593679, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.3471183659940555, + "language_loss": 0.79962778, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.890625, + "step": 572, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.01270326, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.07574439, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.9544136074887903, + "language_loss": 0.84374899, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86714697, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.9453125, + "step": 573, + "time_per_iteration": 2.474212408065796 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.03460276, + "balance_loss_mlp": 1.07282329, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.553507560277694, + "language_loss": 0.76376265, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78707206, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 574, + "time_per_iteration": 2.4510116577148438 + }, + { + "auxiliary_loss_clip": 0.01264596, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.0731982, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5278817664157343, + "language_loss": 0.83801597, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86130619, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.9140625, + "step": 575, + "time_per_iteration": 2.459693193435669 + }, + { + "auxiliary_loss_clip": 0.01260171, + "auxiliary_loss_mlp": 0.01067742, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.07501364, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.241383472398266, + "language_loss": 0.83726245, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86054158, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 576, + "time_per_iteration": 2.47292423248291 + }, + { + "auxiliary_loss_clip": 0.01267718, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.04582155, + "balance_loss_mlp": 1.08247435, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.0876645490308334, + "language_loss": 0.8640908, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 577, + "time_per_iteration": 2.529500961303711 + }, + { + "auxiliary_loss_clip": 0.01262371, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0769875, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 3.2017547958107784, + "language_loss": 0.72333407, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74665576, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.859375, + "step": 578, + "time_per_iteration": 2.4868762493133545 + }, + { + "auxiliary_loss_clip": 0.01263036, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.04050565, + "balance_loss_mlp": 1.07441878, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.8544904120227406, + "language_loss": 0.77664137, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79998243, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.8828125, + "step": 579, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.07355189, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.5351053977844136, + "language_loss": 0.86927247, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89265645, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.875, + "step": 580, + "time_per_iteration": 2.505908966064453 + }, + { + "auxiliary_loss_clip": 0.01266331, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.03536677, + "balance_loss_mlp": 1.07510614, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 4.565959491833327, + "language_loss": 0.82161844, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84492135, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.9140625, + "step": 581, + "time_per_iteration": 2.4735610485076904 + }, + { + "auxiliary_loss_clip": 0.01263493, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.07712197, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2699668532214377, + "language_loss": 0.77498174, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79828823, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8671875, + "step": 582, + "time_per_iteration": 2.4596173763275146 + }, + { + "auxiliary_loss_clip": 0.01261728, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_clip": 1.04467332, + "balance_loss_mlp": 1.07715631, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.0991939318744692, + "language_loss": 0.87632537, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89969933, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 583, + "time_per_iteration": 2.46062970161438 + }, + { + "auxiliary_loss_clip": 0.01268555, + "auxiliary_loss_mlp": 0.01082553, + "balance_loss_clip": 1.05167794, + "balance_loss_mlp": 1.07587278, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3581841085942004, + "language_loss": 0.80997103, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83348215, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.921875, + "step": 584, + "time_per_iteration": 2.4776926040649414 + }, + { + "auxiliary_loss_clip": 0.01262257, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.03326654, + "balance_loss_mlp": 1.0725317, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.6245680316153743, + "language_loss": 0.92654932, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94980395, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.8984375, + "step": 585, + "time_per_iteration": 2.486678123474121 + }, + { + "auxiliary_loss_clip": 0.01262479, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.07368612, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4855014647160245, + "language_loss": 0.87484592, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89817297, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.890625, + "step": 586, + "time_per_iteration": 2.457772970199585 + }, + { + "auxiliary_loss_clip": 0.01269677, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04137754, + "balance_loss_mlp": 1.07875896, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.7854143394247532, + "language_loss": 0.76574278, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78915149, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.90625, + "step": 587, + "time_per_iteration": 2.4794015884399414 + }, + { + "auxiliary_loss_clip": 0.01269924, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02991772, + "balance_loss_mlp": 1.07701528, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6805414217886456, + "language_loss": 0.78441286, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80772316, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.9296875, + "step": 588, + "time_per_iteration": 2.4755733013153076 + }, + { + "auxiliary_loss_clip": 0.01267146, + "auxiliary_loss_mlp": 0.01071411, + "balance_loss_clip": 1.03850961, + "balance_loss_mlp": 1.07600832, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6477303031273185, + "language_loss": 0.94003904, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96342462, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9140625, + "step": 589, + "time_per_iteration": 2.515296459197998 + }, + { + "auxiliary_loss_clip": 0.01269747, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.07632184, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.4870139863099157, + "language_loss": 0.84060037, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86397475, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 590, + "time_per_iteration": 2.583080291748047 + }, + { + "auxiliary_loss_clip": 0.01259593, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.07476449, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.031404841890899, + "language_loss": 0.86889851, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89212072, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 591, + "time_per_iteration": 2.497912883758545 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01070221, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.07271862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 3.1144902928375586, + "language_loss": 0.82980722, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85315537, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.921875, + "step": 592, + "time_per_iteration": 2.463977813720703 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.03881407, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8806680605255408, + "language_loss": 0.59741807, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61892909, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.8984375, + "step": 593, + "time_per_iteration": 3.1275696754455566 + }, + { + "auxiliary_loss_clip": 0.01262803, + "auxiliary_loss_mlp": 0.01070928, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.07810974, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8853114596204945, + "language_loss": 0.87042278, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89376009, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 594, + "time_per_iteration": 2.522805690765381 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.07309461, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.3431313884364395, + "language_loss": 0.83481348, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85809088, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8984375, + "step": 595, + "time_per_iteration": 2.565220832824707 + }, + { + "auxiliary_loss_clip": 0.01261367, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.04216576, + "balance_loss_mlp": 1.07610273, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.1278930526147426, + "language_loss": 0.96185803, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98519421, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.859375, + "step": 596, + "time_per_iteration": 2.460515260696411 + }, + { + "auxiliary_loss_clip": 0.0126361, + "auxiliary_loss_mlp": 0.0107037, + "balance_loss_clip": 1.04185498, + "balance_loss_mlp": 1.07627654, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.2167421176017204, + "language_loss": 0.82718551, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85052526, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.875, + "step": 597, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.01261023, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.0784421, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.168981908539252, + "language_loss": 0.81386817, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.828125, + "step": 598, + "time_per_iteration": 2.531188726425171 + }, + { + "auxiliary_loss_clip": 0.01254264, + "auxiliary_loss_mlp": 0.0106961, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.07570839, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.9075541218278638, + "language_loss": 0.81387949, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83711827, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.7890625, + "step": 599, + "time_per_iteration": 2.511871576309204 + }, + { + "auxiliary_loss_clip": 0.01262476, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.07350755, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1528215266255604, + "language_loss": 0.86115932, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88452661, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.890625, + "step": 600, + "time_per_iteration": 2.50054669380188 + }, + { + "auxiliary_loss_clip": 0.01254617, + "auxiliary_loss_mlp": 0.01080731, + "balance_loss_clip": 1.05133438, + "balance_loss_mlp": 1.06909621, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 3.928737875146519, + "language_loss": 0.82175761, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84511113, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8515625, + "step": 601, + "time_per_iteration": 6.795202255249023 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.02666831, + "balance_loss_mlp": 1.07096183, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.2629653513719252, + "language_loss": 0.75467926, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77777481, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8203125, + "step": 602, + "time_per_iteration": 2.503629446029663 + }, + { + "auxiliary_loss_clip": 0.01252806, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.02833962, + "balance_loss_mlp": 1.07078326, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9962737747137984, + "language_loss": 0.80078572, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82388449, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 603, + "time_per_iteration": 2.568368911743164 + }, + { + "auxiliary_loss_clip": 0.01258325, + "auxiliary_loss_mlp": 0.01061531, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.07597041, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.9836566776981934, + "language_loss": 0.86801207, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8203125, + "step": 604, + "time_per_iteration": 2.496415376663208 + }, + { + "auxiliary_loss_clip": 0.01260423, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.07688427, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.252638522711271, + "language_loss": 0.81078291, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83404416, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 605, + "time_per_iteration": 2.46071457862854 + }, + { + "auxiliary_loss_clip": 0.01255946, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.04012406, + "balance_loss_mlp": 1.07317901, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2162807408147964, + "language_loss": 0.85624671, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87947738, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.828125, + "step": 606, + "time_per_iteration": 2.450775623321533 + }, + { + "auxiliary_loss_clip": 0.01262483, + "auxiliary_loss_mlp": 0.01072166, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.07551849, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1498788116147125, + "language_loss": 0.82370651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84705305, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 607, + "time_per_iteration": 2.4969747066497803 + }, + { + "auxiliary_loss_clip": 0.01255757, + "auxiliary_loss_mlp": 0.01063348, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.07488835, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 3.329641026295442, + "language_loss": 0.8315016, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8546927, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.8046875, + "step": 608, + "time_per_iteration": 2.4648640155792236 + }, + { + "auxiliary_loss_clip": 0.01260127, + "auxiliary_loss_mlp": 0.0106578, + "balance_loss_clip": 1.03533435, + "balance_loss_mlp": 1.0769459, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.072924568315734, + "language_loss": 0.82258713, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84584618, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.828125, + "step": 609, + "time_per_iteration": 2.4761714935302734 + }, + { + "auxiliary_loss_clip": 0.01266536, + "auxiliary_loss_mlp": 0.01080333, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.08229148, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.279075715646142, + "language_loss": 0.7924515, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81592017, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.84375, + "step": 610, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01265179, + "auxiliary_loss_mlp": 0.01076881, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.07819688, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.108980449215705, + "language_loss": 0.87263799, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89605856, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 611, + "time_per_iteration": 2.488800525665283 + }, + { + "auxiliary_loss_clip": 0.01257304, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.05577183, + "balance_loss_mlp": 1.0769043, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.0539399448943145, + "language_loss": 0.72783852, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75125557, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8046875, + "step": 612, + "time_per_iteration": 2.4950740337371826 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.03999329, + "balance_loss_mlp": 1.07377708, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 2.903841869182041, + "language_loss": 0.7909385, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81421661, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 613, + "time_per_iteration": 2.4849369525909424 + }, + { + "auxiliary_loss_clip": 0.01253943, + "auxiliary_loss_mlp": 0.01079095, + "balance_loss_clip": 1.05141413, + "balance_loss_mlp": 1.07326341, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.273957434397869, + "language_loss": 0.93266213, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95599246, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8125, + "step": 614, + "time_per_iteration": 2.4639992713928223 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.01075313, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.07938302, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.901964177226116, + "language_loss": 0.72534943, + "learning_rate": 3.999489768826041e-06, + "loss": 0.74873829, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.84375, + "step": 615, + "time_per_iteration": 2.601372480392456 + }, + { + "auxiliary_loss_clip": 0.01258092, + "auxiliary_loss_mlp": 0.01071353, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.07278967, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.023635364571096, + "language_loss": 0.81449711, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83779156, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 616, + "time_per_iteration": 2.5325467586517334 + }, + { + "auxiliary_loss_clip": 0.01256707, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.03643894, + "balance_loss_mlp": 1.07431316, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9753277492127743, + "language_loss": 0.67868775, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7018863, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.828125, + "step": 617, + "time_per_iteration": 2.5784177780151367 + }, + { + "auxiliary_loss_clip": 0.01263095, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0349381, + "balance_loss_mlp": 1.07892454, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.556814357499394, + "language_loss": 0.80340034, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8266772, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.84375, + "step": 618, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01261829, + "auxiliary_loss_mlp": 0.01070874, + "balance_loss_clip": 1.04059458, + "balance_loss_mlp": 1.07458091, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.355648226269084, + "language_loss": 0.91115171, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93447876, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.875, + "step": 619, + "time_per_iteration": 2.4804162979125977 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01077134, + "balance_loss_clip": 1.04871452, + "balance_loss_mlp": 1.07845378, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.218621959424752, + "language_loss": 0.94397002, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96734041, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8125, + "step": 620, + "time_per_iteration": 2.4592232704162598 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.01077616, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.07455909, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.8159025601621845, + "language_loss": 0.77105826, + "learning_rate": 3.999435623772008e-06, + "loss": 0.7944091, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 621, + "time_per_iteration": 2.53365159034729 + }, + { + "auxiliary_loss_clip": 0.01255819, + "auxiliary_loss_mlp": 0.01059811, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.07761526, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.793013868715132, + "language_loss": 0.86895752, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89211386, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 622, + "time_per_iteration": 2.472726583480835 + }, + { + "auxiliary_loss_clip": 0.01258428, + "auxiliary_loss_mlp": 0.01064577, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.07622766, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.261361439009279, + "language_loss": 0.90376818, + "learning_rate": 3.999416968866581e-06, + "loss": 0.9269982, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 623, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.0125978, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.04626298, + "balance_loss_mlp": 1.07841158, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9910669563462169, + "language_loss": 0.84149444, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86484373, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8125, + "step": 624, + "time_per_iteration": 2.4514520168304443 + }, + { + "auxiliary_loss_clip": 0.01261437, + "auxiliary_loss_mlp": 0.01067743, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.0750618, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.4867963928692554, + "language_loss": 0.66228586, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68557763, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8671875, + "step": 625, + "time_per_iteration": 2.5765273571014404 + }, + { + "auxiliary_loss_clip": 0.01253583, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.03697979, + "balance_loss_mlp": 1.07435441, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.071255255654034, + "language_loss": 0.77375329, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79696059, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7890625, + "step": 626, + "time_per_iteration": 2.5022406578063965 + }, + { + "auxiliary_loss_clip": 0.01258684, + "auxiliary_loss_mlp": 0.01074389, + "balance_loss_clip": 1.04499173, + "balance_loss_mlp": 1.07735705, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.2077512286027288, + "language_loss": 0.81357861, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83690929, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 627, + "time_per_iteration": 2.4750607013702393 + }, + { + "auxiliary_loss_clip": 0.01261632, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.07859111, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 3.546199216596373, + "language_loss": 0.88572276, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90910852, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 628, + "time_per_iteration": 2.571899890899658 + }, + { + "auxiliary_loss_clip": 0.01253553, + "auxiliary_loss_mlp": 0.01067038, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.07086658, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.488861546346732, + "language_loss": 0.79683006, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82003593, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.828125, + "step": 629, + "time_per_iteration": 2.486675262451172 + }, + { + "auxiliary_loss_clip": 0.01258011, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.07458425, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7117761504495859, + "language_loss": 0.76808703, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79134536, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.8359375, + "step": 630, + "time_per_iteration": 2.494297742843628 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.01070638, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.07651484, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.6765452133705403, + "language_loss": 0.91492796, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93826187, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.859375, + "step": 631, + "time_per_iteration": 2.4605348110198975 + }, + { + "auxiliary_loss_clip": 0.01252436, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.04560196, + "balance_loss_mlp": 1.07244229, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.669704350294595, + "language_loss": 0.9207651, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94405663, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.796875, + "step": 632, + "time_per_iteration": 2.518498659133911 + }, + { + "auxiliary_loss_clip": 0.01255106, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.07462335, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0828864645498872, + "language_loss": 0.8341018, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85723758, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8046875, + "step": 633, + "time_per_iteration": 2.5217537879943848 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.01071025, + "balance_loss_clip": 1.04153264, + "balance_loss_mlp": 1.07408428, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6987522649376106, + "language_loss": 0.69638437, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71967685, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.84375, + "step": 634, + "time_per_iteration": 2.5694239139556885 + }, + { + "auxiliary_loss_clip": 0.01127675, + "auxiliary_loss_mlp": 0.01017483, + "balance_loss_clip": 1.0106163, + "balance_loss_mlp": 1.04225707, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8852243261294688, + "language_loss": 0.61585373, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63730532, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.8515625, + "step": 635, + "time_per_iteration": 3.1059212684631348 + }, + { + "auxiliary_loss_clip": 0.01253433, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07354546, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2313569204055246, + "language_loss": 0.83721048, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86043108, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.796875, + "step": 636, + "time_per_iteration": 2.4975383281707764 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.04852867, + "balance_loss_mlp": 1.07623935, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.4018992949787847, + "language_loss": 0.79327047, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8166306, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8203125, + "step": 637, + "time_per_iteration": 2.4560744762420654 + }, + { + "auxiliary_loss_clip": 0.01258084, + "auxiliary_loss_mlp": 0.01073075, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.07309079, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.8779285506389924, + "language_loss": 0.8410306, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86434221, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 638, + "time_per_iteration": 2.504343271255493 + }, + { + "auxiliary_loss_clip": 0.01263348, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.07495832, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5416523890288976, + "language_loss": 0.70099992, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72431237, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.890625, + "step": 639, + "time_per_iteration": 2.52817964553833 + }, + { + "auxiliary_loss_clip": 0.01259266, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.04463232, + "balance_loss_mlp": 1.07514286, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.42201861797838, + "language_loss": 0.85030365, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8736524, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 640, + "time_per_iteration": 2.503262758255005 + }, + { + "auxiliary_loss_clip": 0.0126167, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04725742, + "balance_loss_mlp": 1.07574821, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3722848939528953, + "language_loss": 0.82117289, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84458065, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.859375, + "step": 641, + "time_per_iteration": 2.51052188873291 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.01008303, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.03414774, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9008353353488252, + "language_loss": 0.6540072, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67528021, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.06225586, + "router_z_loss_mlp": 0.8515625, + "step": 642, + "time_per_iteration": 4.430839538574219 + }, + { + "auxiliary_loss_clip": 0.01256856, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.07364345, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9870813050305103, + "language_loss": 0.79512584, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81832051, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8359375, + "step": 643, + "time_per_iteration": 5.386199951171875 + }, + { + "auxiliary_loss_clip": 0.01255871, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.07266629, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.074949815918338, + "language_loss": 0.82926929, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85257208, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.828125, + "step": 644, + "time_per_iteration": 2.45499587059021 + }, + { + "auxiliary_loss_clip": 0.01260265, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.07482159, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.258008571643512, + "language_loss": 0.82131916, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84458399, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.859375, + "step": 645, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.0126099, + "auxiliary_loss_mlp": 0.01070847, + "balance_loss_clip": 1.04121125, + "balance_loss_mlp": 1.07544899, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4729923618605554, + "language_loss": 0.82006776, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 646, + "time_per_iteration": 2.4771342277526855 + }, + { + "auxiliary_loss_clip": 0.01260575, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.07928514, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8327945326632593, + "language_loss": 0.81973422, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84314579, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 647, + "time_per_iteration": 2.522347927093506 + }, + { + "auxiliary_loss_clip": 0.01260388, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03965366, + "balance_loss_mlp": 1.07776546, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9222642653000834, + "language_loss": 0.84699827, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87029266, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 648, + "time_per_iteration": 2.561929941177368 + }, + { + "auxiliary_loss_clip": 0.01258218, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.07636404, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7283662397985053, + "language_loss": 0.84446943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86776626, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8203125, + "step": 649, + "time_per_iteration": 2.477027416229248 + }, + { + "auxiliary_loss_clip": 0.01259496, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.03977561, + "balance_loss_mlp": 1.07551885, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8508721849532739, + "language_loss": 0.79670662, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8200019, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.84375, + "step": 650, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.0125375, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_clip": 1.04314423, + "balance_loss_mlp": 1.07259929, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.708739352564946, + "language_loss": 0.78509629, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80836356, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 651, + "time_per_iteration": 2.4757516384124756 + }, + { + "auxiliary_loss_clip": 0.01255418, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_clip": 1.05004883, + "balance_loss_mlp": 1.0719974, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.7896665115169244, + "language_loss": 0.88031149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90369117, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 652, + "time_per_iteration": 2.4425668716430664 + }, + { + "auxiliary_loss_clip": 0.01249027, + "auxiliary_loss_mlp": 0.01069663, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.07108784, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.185528651545475, + "language_loss": 0.79044777, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363463, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.78125, + "step": 653, + "time_per_iteration": 2.5651934146881104 + }, + { + "auxiliary_loss_clip": 0.01264568, + "auxiliary_loss_mlp": 0.01070462, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.07603264, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.207303268368246, + "language_loss": 0.86304128, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88639158, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8828125, + "step": 654, + "time_per_iteration": 2.533297061920166 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01012751, + "balance_loss_clip": 1.00710094, + "balance_loss_mlp": 1.03246427, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7205066186016396, + "language_loss": 0.49900642, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5202843, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.82421875, + "step": 655, + "time_per_iteration": 3.1399919986724854 + }, + { + "auxiliary_loss_clip": 0.01251012, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.03866971, + "balance_loss_mlp": 1.07330465, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.4228021909793918, + "language_loss": 0.80845964, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83163846, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.78125, + "step": 656, + "time_per_iteration": 2.5063297748565674 + }, + { + "auxiliary_loss_clip": 0.01264211, + "auxiliary_loss_mlp": 0.0109165, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.07672703, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.8602268717749526, + "language_loss": 0.76602596, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78958458, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.875, + "step": 657, + "time_per_iteration": 2.4405555725097656 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01071967, + "balance_loss_clip": 1.04192615, + "balance_loss_mlp": 1.07452726, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.1526815744488945, + "language_loss": 0.81690443, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84020746, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.84375, + "step": 658, + "time_per_iteration": 2.5383949279785156 + }, + { + "auxiliary_loss_clip": 0.01252051, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.04091132, + "balance_loss_mlp": 1.07283425, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2075021313123777, + "language_loss": 0.91331315, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93656039, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.796875, + "step": 659, + "time_per_iteration": 2.4678854942321777 + }, + { + "auxiliary_loss_clip": 0.01259034, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.07427669, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.5412719342676215, + "language_loss": 0.79241848, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81567293, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 660, + "time_per_iteration": 2.5135834217071533 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.07534087, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6909533460123631, + "language_loss": 0.81942898, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84269351, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.796875, + "step": 661, + "time_per_iteration": 2.513702154159546 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01010967, + "balance_loss_clip": 1.00519753, + "balance_loss_mlp": 1.03039932, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9113020435813882, + "language_loss": 0.69376045, + "learning_rate": 3.998992585439272e-06, + "loss": 0.7149995, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.82421875, + "step": 662, + "time_per_iteration": 3.2435107231140137 + }, + { + "auxiliary_loss_clip": 0.01260063, + "auxiliary_loss_mlp": 0.01071537, + "balance_loss_clip": 1.04113865, + "balance_loss_mlp": 1.0779382, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.025040011333182, + "language_loss": 0.83253002, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85584599, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.8125, + "step": 663, + "time_per_iteration": 2.5213887691497803 + }, + { + "auxiliary_loss_clip": 0.01261822, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.032125, + "balance_loss_mlp": 1.07768416, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.8595031628608143, + "language_loss": 0.87538105, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89862621, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.84375, + "step": 664, + "time_per_iteration": 2.516810655593872 + }, + { + "auxiliary_loss_clip": 0.0125116, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.07347679, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.3519362819230625, + "language_loss": 0.84738994, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87050784, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.7734375, + "step": 665, + "time_per_iteration": 2.4348978996276855 + }, + { + "auxiliary_loss_clip": 0.01263346, + "auxiliary_loss_mlp": 0.01087391, + "balance_loss_clip": 1.05525231, + "balance_loss_mlp": 1.07680821, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.1279588772882687, + "language_loss": 0.81491798, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83842534, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.8671875, + "step": 666, + "time_per_iteration": 2.564187526702881 + }, + { + "auxiliary_loss_clip": 0.01252779, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.07225358, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.9939634291419526, + "language_loss": 0.87121451, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89449108, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.8046875, + "step": 667, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.07692444, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.627098567014159, + "language_loss": 0.80619991, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82938576, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7578125, + "step": 668, + "time_per_iteration": 2.441667079925537 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01011229, + "balance_loss_clip": 1.0053643, + "balance_loss_mlp": 1.02968836, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7872457900726799, + "language_loss": 0.60042131, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62164247, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.8125, + "step": 669, + "time_per_iteration": 3.200874090194702 + }, + { + "auxiliary_loss_clip": 0.01253738, + "auxiliary_loss_mlp": 0.0107276, + "balance_loss_clip": 1.0431962, + "balance_loss_mlp": 1.07228541, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7415828974469272, + "language_loss": 0.86405391, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88731897, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 670, + "time_per_iteration": 2.5169434547424316 + }, + { + "auxiliary_loss_clip": 0.0124964, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.0414381, + "balance_loss_mlp": 1.07305872, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.9261739939324196, + "language_loss": 0.752123, + "learning_rate": 3.998878276622692e-06, + "loss": 0.7753256, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.765625, + "step": 671, + "time_per_iteration": 2.514566421508789 + }, + { + "auxiliary_loss_clip": 0.01259516, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.04472136, + "balance_loss_mlp": 1.0774349, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0846907245314688, + "language_loss": 0.92279977, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94614637, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8203125, + "step": 672, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01253491, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_clip": 1.03921115, + "balance_loss_mlp": 1.07329202, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.816355722874097, + "language_loss": 0.90220857, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92545515, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.796875, + "step": 673, + "time_per_iteration": 2.450547456741333 + }, + { + "auxiliary_loss_clip": 0.01249229, + "auxiliary_loss_mlp": 0.01077482, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.07150948, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.117589951798075, + "language_loss": 0.74881005, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77207714, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.78125, + "step": 674, + "time_per_iteration": 2.5444436073303223 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01061202, + "balance_loss_clip": 1.03036261, + "balance_loss_mlp": 1.07609737, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.2422867770418797, + "language_loss": 0.78305578, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80627763, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 675, + "time_per_iteration": 2.4525954723358154 + }, + { + "auxiliary_loss_clip": 0.01252319, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.04578447, + "balance_loss_mlp": 1.07254028, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7072695919905723, + "language_loss": 0.76650077, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78981006, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.796875, + "step": 676, + "time_per_iteration": 2.530043840408325 + }, + { + "auxiliary_loss_clip": 0.01258388, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.0750767, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.3168648577819138, + "language_loss": 0.85182011, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87516803, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.828125, + "step": 677, + "time_per_iteration": 2.4390082359313965 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.03804517, + "balance_loss_mlp": 1.071486, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.7808730288109123, + "language_loss": 0.76348364, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78666306, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.78125, + "step": 678, + "time_per_iteration": 2.5151596069335938 + }, + { + "auxiliary_loss_clip": 0.01250603, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_clip": 1.03807509, + "balance_loss_mlp": 1.07162285, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9938089142752387, + "language_loss": 0.82114184, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84431279, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7890625, + "step": 679, + "time_per_iteration": 2.5701568126678467 + }, + { + "auxiliary_loss_clip": 0.01255726, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.07693028, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.893911305727382, + "language_loss": 0.76349533, + "learning_rate": 3.998757828196835e-06, + "loss": 0.7866298, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7890625, + "step": 680, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01255007, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.03305268, + "balance_loss_mlp": 1.07167506, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.7999776318515568, + "language_loss": 0.83315849, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.8563633, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 681, + "time_per_iteration": 2.5313305854797363 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.03302324, + "balance_loss_mlp": 1.07082057, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.6690976928218293, + "language_loss": 0.71312869, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73630697, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.828125, + "step": 682, + "time_per_iteration": 2.5190017223358154 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.07090235, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7744847161326498, + "language_loss": 0.72373003, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74692667, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8046875, + "step": 683, + "time_per_iteration": 2.473156690597534 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.01075324, + "balance_loss_clip": 1.04540253, + "balance_loss_mlp": 1.07707, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.316908811268422, + "language_loss": 0.81263745, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83589774, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 684, + "time_per_iteration": 5.34027099609375 + }, + { + "auxiliary_loss_clip": 0.01251905, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.07572865, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5327144156887007, + "language_loss": 0.90501672, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92825842, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.765625, + "step": 685, + "time_per_iteration": 3.918776750564575 + }, + { + "auxiliary_loss_clip": 0.01253389, + "auxiliary_loss_mlp": 0.010703, + "balance_loss_clip": 1.04002118, + "balance_loss_mlp": 1.07458997, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.0402082016953234, + "language_loss": 0.87871253, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90194941, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.7890625, + "step": 686, + "time_per_iteration": 2.481177806854248 + }, + { + "auxiliary_loss_clip": 0.01258153, + "auxiliary_loss_mlp": 0.01071669, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.07474661, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.7716861202834375, + "language_loss": 0.71645427, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73975253, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8359375, + "step": 687, + "time_per_iteration": 2.4720261096954346 + }, + { + "auxiliary_loss_clip": 0.01252382, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.04262209, + "balance_loss_mlp": 1.07918715, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.117746024922212, + "language_loss": 0.8642537, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88748431, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.734375, + "step": 688, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01249454, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07534754, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.6487514234328304, + "language_loss": 0.83326006, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85658503, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7421875, + "step": 689, + "time_per_iteration": 2.4689462184906006 + }, + { + "auxiliary_loss_clip": 0.01248134, + "auxiliary_loss_mlp": 0.01077255, + "balance_loss_clip": 1.04847789, + "balance_loss_mlp": 1.07176828, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.7821885346326607, + "language_loss": 0.68391848, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70717239, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.765625, + "step": 690, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.012458, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.04197323, + "balance_loss_mlp": 1.07094526, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.747700039366933, + "language_loss": 0.74933273, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77250373, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 691, + "time_per_iteration": 2.4566729068756104 + }, + { + "auxiliary_loss_clip": 0.01246178, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04890203, + "balance_loss_mlp": 1.07268727, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.450885846250815, + "language_loss": 0.84518701, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86843991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.734375, + "step": 692, + "time_per_iteration": 2.4667932987213135 + }, + { + "auxiliary_loss_clip": 0.01252043, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.07099986, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 9.166238009589804, + "language_loss": 0.89107299, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9143213, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.8125, + "step": 693, + "time_per_iteration": 2.4823052883148193 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.03637171, + "balance_loss_mlp": 1.07755136, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.1462970179067646, + "language_loss": 0.82179356, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84500182, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 694, + "time_per_iteration": 2.564098834991455 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.07214785, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.057768586122239, + "language_loss": 0.83656573, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85977334, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 695, + "time_per_iteration": 2.5122969150543213 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01073319, + "balance_loss_clip": 1.04270577, + "balance_loss_mlp": 1.07313716, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.138642052855673, + "language_loss": 0.8441087, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86734056, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.765625, + "step": 696, + "time_per_iteration": 2.462756872177124 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.01073791, + "balance_loss_clip": 1.04253471, + "balance_loss_mlp": 1.07146811, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.042298821772003, + "language_loss": 0.93134123, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95455778, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.765625, + "step": 697, + "time_per_iteration": 2.5189502239227295 + }, + { + "auxiliary_loss_clip": 0.0124398, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.07146859, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.2837511795811207, + "language_loss": 0.83989406, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86302388, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.71875, + "step": 698, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.01247569, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.05551505, + "balance_loss_mlp": 1.0711751, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9405760650289445, + "language_loss": 0.91369909, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93704206, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.7578125, + "step": 699, + "time_per_iteration": 2.4667766094207764 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.01842487, + "balance_loss_mlp": 1.03384757, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8964375713204716, + "language_loss": 0.67850006, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69987792, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.796875, + "step": 700, + "time_per_iteration": 3.1214911937713623 + }, + { + "auxiliary_loss_clip": 0.01254452, + "auxiliary_loss_mlp": 0.01078478, + "balance_loss_clip": 1.04695964, + "balance_loss_mlp": 1.07502532, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.6789371965697524, + "language_loss": 0.89020562, + "learning_rate": 3.998452907725016e-06, + "loss": 0.913535, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 701, + "time_per_iteration": 2.46085524559021 + }, + { + "auxiliary_loss_clip": 0.01250018, + "auxiliary_loss_mlp": 0.0107128, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.07681179, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2592774096130794, + "language_loss": 0.67494118, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69815421, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 702, + "time_per_iteration": 2.5170979499816895 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01006834, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.03296542, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8426087453226233, + "language_loss": 0.60777819, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62897617, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.80078125, + "step": 703, + "time_per_iteration": 3.155794143676758 + }, + { + "auxiliary_loss_clip": 0.01112196, + "auxiliary_loss_mlp": 0.01010352, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 1.03251982, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167549333074237, + "language_loss": 0.5776214, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.796875, + "step": 704, + "time_per_iteration": 2.95633602142334 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01070665, + "balance_loss_clip": 1.0397656, + "balance_loss_mlp": 1.07432342, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1970745802550624, + "language_loss": 0.87708455, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90031266, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 705, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.01238458, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.06876624, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7054575923778923, + "language_loss": 0.71612352, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73913229, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 706, + "time_per_iteration": 2.464270830154419 + }, + { + "auxiliary_loss_clip": 0.01243119, + "auxiliary_loss_mlp": 0.01068207, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.07029784, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0927829932503714, + "language_loss": 0.93480223, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95791554, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 707, + "time_per_iteration": 2.5087966918945312 + }, + { + "auxiliary_loss_clip": 0.01245928, + "auxiliary_loss_mlp": 0.01065311, + "balance_loss_clip": 1.03441203, + "balance_loss_mlp": 1.0676806, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.3244890877745883, + "language_loss": 0.81275034, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83586276, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 708, + "time_per_iteration": 2.557119607925415 + }, + { + "auxiliary_loss_clip": 0.01251091, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_clip": 1.04239082, + "balance_loss_mlp": 1.07195199, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.2553269788690224, + "language_loss": 0.82229173, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84553528, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.796875, + "step": 709, + "time_per_iteration": 2.4828600883483887 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.01064315, + "balance_loss_clip": 1.03389335, + "balance_loss_mlp": 1.07517564, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.534138916450152, + "language_loss": 0.85063422, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87383747, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8125, + "step": 710, + "time_per_iteration": 2.453641653060913 + }, + { + "auxiliary_loss_clip": 0.01254724, + "auxiliary_loss_mlp": 0.01070713, + "balance_loss_clip": 1.04114938, + "balance_loss_mlp": 1.07757199, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.316207411440496, + "language_loss": 0.84996349, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87321782, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7734375, + "step": 711, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01246695, + "auxiliary_loss_mlp": 0.01069917, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.07044697, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 2.000925777751644, + "language_loss": 0.85439169, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87755781, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.765625, + "step": 712, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.0107294, + "balance_loss_clip": 1.0445205, + "balance_loss_mlp": 1.0701685, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.2453781921901728, + "language_loss": 0.90829903, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9315542, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8203125, + "step": 713, + "time_per_iteration": 2.4908998012542725 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01017546, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.0288384, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8777811618173876, + "language_loss": 0.63746506, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65872955, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.80078125, + "step": 714, + "time_per_iteration": 3.158921480178833 + }, + { + "auxiliary_loss_clip": 0.01249012, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_clip": 1.05076694, + "balance_loss_mlp": 1.07545531, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.1622955343434382, + "language_loss": 0.74528754, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76858354, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 715, + "time_per_iteration": 2.5759642124176025 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.07450986, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2108029839954213, + "language_loss": 0.72630137, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74957311, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7578125, + "step": 716, + "time_per_iteration": 2.5973668098449707 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.02687418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9052113850529176, + "language_loss": 0.65557301, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67669141, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.78125, + "step": 717, + "time_per_iteration": 3.114957571029663 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 0.99780369, + "balance_loss_mlp": 1.02667391, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9880116621267147, + "language_loss": 0.58762264, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60871184, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.78125, + "step": 718, + "time_per_iteration": 2.910278797149658 + }, + { + "auxiliary_loss_clip": 0.01248398, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.04314709, + "balance_loss_mlp": 1.0758605, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8513004644505335, + "language_loss": 0.91198725, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93521935, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7265625, + "step": 719, + "time_per_iteration": 2.492509126663208 + }, + { + "auxiliary_loss_clip": 0.01244347, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.04208493, + "balance_loss_mlp": 1.06931555, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.803377327315558, + "language_loss": 0.66468138, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68783891, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 720, + "time_per_iteration": 2.6061203479766846 + }, + { + "auxiliary_loss_clip": 0.01244682, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.04895782, + "balance_loss_mlp": 1.07152998, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.8832143461121282, + "language_loss": 0.77743989, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80068195, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 721, + "time_per_iteration": 2.5255632400512695 + }, + { + "auxiliary_loss_clip": 0.01251204, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.05879569, + "balance_loss_mlp": 1.07584524, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.027898330451403, + "language_loss": 0.87873065, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90212011, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.75, + "step": 722, + "time_per_iteration": 2.536283493041992 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.01075404, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.0758208, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 30.376200688873947, + "language_loss": 0.84770942, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87099999, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 723, + "time_per_iteration": 2.5167360305786133 + }, + { + "auxiliary_loss_clip": 0.01256754, + "auxiliary_loss_mlp": 0.01076494, + "balance_loss_clip": 1.04638171, + "balance_loss_mlp": 1.07828176, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.9203333396820472, + "language_loss": 0.82793808, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85127056, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.78125, + "step": 724, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.0125067, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_clip": 1.05975556, + "balance_loss_mlp": 1.07561088, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.8200683460759586, + "language_loss": 0.79530561, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81871551, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.75, + "step": 725, + "time_per_iteration": 2.4551918506622314 + }, + { + "auxiliary_loss_clip": 0.0126067, + "auxiliary_loss_mlp": 0.01076358, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.07715642, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.8106150104808485, + "language_loss": 0.87100697, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89437729, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.8359375, + "step": 726, + "time_per_iteration": 5.350574731826782 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01011263, + "balance_loss_clip": 1.00542223, + "balance_loss_mlp": 1.02866364, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9088619113669424, + "language_loss": 0.5587045, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57988632, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.78125, + "step": 727, + "time_per_iteration": 3.1539440155029297 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01076851, + "balance_loss_clip": 1.04676282, + "balance_loss_mlp": 1.07453549, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.397861957488019, + "language_loss": 0.82248902, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84576982, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.765625, + "step": 728, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01244631, + "auxiliary_loss_mlp": 0.01068516, + "balance_loss_clip": 1.0395956, + "balance_loss_mlp": 1.07265663, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2715062050859745, + "language_loss": 0.77187145, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79500294, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.71875, + "step": 729, + "time_per_iteration": 2.5091514587402344 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_clip": 1.04502177, + "balance_loss_mlp": 1.07452357, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.258754879989397, + "language_loss": 0.9515503, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97482038, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.78125, + "step": 730, + "time_per_iteration": 2.4795522689819336 + }, + { + "auxiliary_loss_clip": 0.0124716, + "auxiliary_loss_mlp": 0.01065838, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.07000017, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.2097226025839483, + "language_loss": 0.88016784, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90329784, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.7734375, + "step": 731, + "time_per_iteration": 2.4678709506988525 + }, + { + "auxiliary_loss_clip": 0.01251191, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.07521737, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.3707184473936587, + "language_loss": 0.88656235, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90980744, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7578125, + "step": 732, + "time_per_iteration": 2.4135851860046387 + }, + { + "auxiliary_loss_clip": 0.01251086, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.04288554, + "balance_loss_mlp": 1.07443762, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.131822645051773, + "language_loss": 0.86010063, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88334322, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.765625, + "step": 733, + "time_per_iteration": 2.491278648376465 + }, + { + "auxiliary_loss_clip": 0.01256254, + "auxiliary_loss_mlp": 0.01078649, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.07624841, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.0564057381838885, + "language_loss": 0.91515708, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93850613, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 734, + "time_per_iteration": 2.451258897781372 + }, + { + "auxiliary_loss_clip": 0.01247796, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.03696656, + "balance_loss_mlp": 1.07613921, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8863467898976456, + "language_loss": 0.77831066, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8014316, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.71875, + "step": 735, + "time_per_iteration": 2.558958053588867 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.03526342, + "balance_loss_mlp": 1.06886315, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.1337917025346074, + "language_loss": 0.88456166, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90760267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 736, + "time_per_iteration": 2.5100033283233643 + }, + { + "auxiliary_loss_clip": 0.01242163, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.07473993, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.954630170969084, + "language_loss": 0.84155536, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86464787, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 737, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01251899, + "auxiliary_loss_mlp": 0.01072468, + "balance_loss_clip": 1.04077065, + "balance_loss_mlp": 1.07667851, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0376910697928947, + "language_loss": 0.8518666, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87511027, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.75, + "step": 738, + "time_per_iteration": 2.5576610565185547 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04048622, + "balance_loss_mlp": 1.03298163, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8684121686227821, + "language_loss": 0.59110028, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61268163, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.7890625, + "step": 739, + "time_per_iteration": 3.0643718242645264 + }, + { + "auxiliary_loss_clip": 0.0124678, + "auxiliary_loss_mlp": 0.01070548, + "balance_loss_clip": 1.04220033, + "balance_loss_mlp": 1.07513726, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1860888775648695, + "language_loss": 0.91622591, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93939924, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.71875, + "step": 740, + "time_per_iteration": 2.5448389053344727 + }, + { + "auxiliary_loss_clip": 0.01252276, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.07766986, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.01644947055736, + "language_loss": 0.71842492, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.7421875, + "step": 741, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01240373, + "auxiliary_loss_mlp": 0.01073056, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.07044411, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.722907957661965, + "language_loss": 0.88555831, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9086926, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.703125, + "step": 742, + "time_per_iteration": 2.6367549896240234 + }, + { + "auxiliary_loss_clip": 0.0124233, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.07209873, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0306401320231693, + "language_loss": 0.83823264, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86127412, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.703125, + "step": 743, + "time_per_iteration": 2.516587972640991 + }, + { + "auxiliary_loss_clip": 0.01249271, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.07474804, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 3.0889105946672704, + "language_loss": 0.79948521, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8226651, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.75, + "step": 744, + "time_per_iteration": 2.44805645942688 + }, + { + "auxiliary_loss_clip": 0.01243449, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.04037201, + "balance_loss_mlp": 1.07279778, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.388036535067576, + "language_loss": 0.85400093, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87710881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.703125, + "step": 745, + "time_per_iteration": 2.4790430068969727 + }, + { + "auxiliary_loss_clip": 0.01242131, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.0714339, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.1215702602167688, + "language_loss": 0.6866799, + "learning_rate": 3.997686978575302e-06, + "loss": 0.70974648, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.7109375, + "step": 746, + "time_per_iteration": 2.5645759105682373 + }, + { + "auxiliary_loss_clip": 0.01250748, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.04748797, + "balance_loss_mlp": 1.0783143, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.1376273799467547, + "language_loss": 0.68823957, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71152306, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 747, + "time_per_iteration": 2.5267317295074463 + }, + { + "auxiliary_loss_clip": 0.01248685, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.07314527, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9669744064389407, + "language_loss": 0.66721869, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69050002, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.75, + "step": 748, + "time_per_iteration": 2.4818925857543945 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.01082391, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.07779491, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.650057046326624, + "language_loss": 0.76540357, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78872949, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.7265625, + "step": 749, + "time_per_iteration": 2.4454426765441895 + }, + { + "auxiliary_loss_clip": 0.01251335, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.0770005, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.0345099055640317, + "language_loss": 0.88970172, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91298997, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 750, + "time_per_iteration": 2.458716630935669 + }, + { + "auxiliary_loss_clip": 0.01247033, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.07139015, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.3716924268159367, + "language_loss": 0.74869245, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77190608, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.7578125, + "step": 751, + "time_per_iteration": 2.5231218338012695 + }, + { + "auxiliary_loss_clip": 0.01243504, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.04573071, + "balance_loss_mlp": 1.07175446, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.2224468826240975, + "language_loss": 0.69360238, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7167744, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 752, + "time_per_iteration": 2.4620048999786377 + }, + { + "auxiliary_loss_clip": 0.01244736, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.07154715, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.984649176219999, + "language_loss": 0.91634125, + "learning_rate": 3.997554045527305e-06, + "loss": 0.9393605, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.734375, + "step": 753, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01249124, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04728031, + "balance_loss_mlp": 1.07501864, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2056938633592975, + "language_loss": 0.91197902, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93522525, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.7421875, + "step": 754, + "time_per_iteration": 2.472975492477417 + }, + { + "auxiliary_loss_clip": 0.01238249, + "auxiliary_loss_mlp": 0.0107062, + "balance_loss_clip": 1.04144955, + "balance_loss_mlp": 1.07163191, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.234660546964849, + "language_loss": 0.78528345, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80837214, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.671875, + "step": 755, + "time_per_iteration": 2.4704174995422363 + }, + { + "auxiliary_loss_clip": 0.01248815, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.05100918, + "balance_loss_mlp": 1.07416105, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9667934561660614, + "language_loss": 0.78451371, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80779994, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.75, + "step": 756, + "time_per_iteration": 2.4873547554016113 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.010118, + "balance_loss_clip": 1.00600612, + "balance_loss_mlp": 1.03558636, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8118987787253854, + "language_loss": 0.62730747, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64860779, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.828125, + "step": 757, + "time_per_iteration": 3.1292941570281982 + }, + { + "auxiliary_loss_clip": 0.01242797, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_clip": 1.04220784, + "balance_loss_mlp": 1.0731318, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5194495460848947, + "language_loss": 0.84329176, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86641645, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 758, + "time_per_iteration": 2.498905658721924 + }, + { + "auxiliary_loss_clip": 0.01237511, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.06733441, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.0933163310434963, + "language_loss": 0.88315606, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90620202, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 759, + "time_per_iteration": 2.5122711658477783 + }, + { + "auxiliary_loss_clip": 0.01248241, + "auxiliary_loss_mlp": 0.01075804, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.075526, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.170817451496144, + "language_loss": 0.73644727, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75968778, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7265625, + "step": 760, + "time_per_iteration": 2.511322021484375 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.04630077, + "balance_loss_mlp": 1.07509935, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.192184725657734, + "language_loss": 0.82177126, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84495443, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6953125, + "step": 761, + "time_per_iteration": 2.4831535816192627 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.06961203, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7986428347309282, + "language_loss": 0.79732436, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82041955, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6953125, + "step": 762, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.03112733, + "balance_loss_mlp": 1.03455913, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.008821564963746, + "language_loss": 0.58659625, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60813344, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.82421875, + "step": 763, + "time_per_iteration": 3.1429429054260254 + }, + { + "auxiliary_loss_clip": 0.01245459, + "auxiliary_loss_mlp": 0.01081866, + "balance_loss_clip": 1.05381632, + "balance_loss_mlp": 1.07288039, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8717486924500517, + "language_loss": 0.87752867, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.9008019, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.7265625, + "step": 764, + "time_per_iteration": 2.4727554321289062 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01079864, + "balance_loss_clip": 1.05192137, + "balance_loss_mlp": 1.07565248, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9426139778845304, + "language_loss": 0.86118066, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88445938, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 765, + "time_per_iteration": 2.5370731353759766 + }, + { + "auxiliary_loss_clip": 0.01248646, + "auxiliary_loss_mlp": 0.01070241, + "balance_loss_clip": 1.03912735, + "balance_loss_mlp": 1.07336497, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.0624701923152453, + "language_loss": 0.87846982, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90165865, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.75, + "step": 766, + "time_per_iteration": 2.475677013397217 + }, + { + "auxiliary_loss_clip": 0.01239894, + "auxiliary_loss_mlp": 0.01067957, + "balance_loss_clip": 1.03979921, + "balance_loss_mlp": 1.06896472, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.616885530601855, + "language_loss": 0.84314167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86622024, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 767, + "time_per_iteration": 2.465449810028076 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.01060911, + "balance_loss_clip": 1.03249097, + "balance_loss_mlp": 1.07569289, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.056931367891973, + "language_loss": 0.87013769, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89320099, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.703125, + "step": 768, + "time_per_iteration": 5.441957235336304 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.01069073, + "balance_loss_clip": 1.04184508, + "balance_loss_mlp": 1.06768477, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.1199205591749033, + "language_loss": 0.75022334, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77329946, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.703125, + "step": 769, + "time_per_iteration": 2.5294675827026367 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.03734684, + "balance_loss_mlp": 1.07084632, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.886534334963383, + "language_loss": 0.86162585, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88464236, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.671875, + "step": 770, + "time_per_iteration": 2.4646449089050293 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0385015, + "balance_loss_mlp": 1.07160687, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.8625416592988477, + "language_loss": 0.87259042, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89573061, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.765625, + "step": 771, + "time_per_iteration": 2.512622117996216 + }, + { + "auxiliary_loss_clip": 0.01246333, + "auxiliary_loss_mlp": 0.01076832, + "balance_loss_clip": 1.04726815, + "balance_loss_mlp": 1.06911707, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.3640102097360587, + "language_loss": 0.83736801, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86059964, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 772, + "time_per_iteration": 2.509572982788086 + }, + { + "auxiliary_loss_clip": 0.01243608, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.07392263, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3097217333215694, + "language_loss": 0.73399591, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75707257, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 773, + "time_per_iteration": 2.5539331436157227 + }, + { + "auxiliary_loss_clip": 0.01240234, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_clip": 1.03624654, + "balance_loss_mlp": 1.07288945, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.066531290075925, + "language_loss": 0.78523052, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80828828, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.671875, + "step": 774, + "time_per_iteration": 2.5350210666656494 + }, + { + "auxiliary_loss_clip": 0.01239038, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.03955007, + "balance_loss_mlp": 1.07101154, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.187480231527322, + "language_loss": 0.73357666, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75662553, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6796875, + "step": 775, + "time_per_iteration": 2.6102981567382812 + }, + { + "auxiliary_loss_clip": 0.01240703, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.03096998, + "balance_loss_mlp": 1.06996655, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.5904648869830247, + "language_loss": 0.77037287, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79337239, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.703125, + "step": 776, + "time_per_iteration": 2.4713642597198486 + }, + { + "auxiliary_loss_clip": 0.01236202, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03790653, + "balance_loss_mlp": 1.06914115, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9751950676431418, + "language_loss": 0.70967531, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73267508, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.671875, + "step": 777, + "time_per_iteration": 2.540151596069336 + }, + { + "auxiliary_loss_clip": 0.01242182, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.07221043, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9852588200641685, + "language_loss": 0.76756501, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79076868, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 778, + "time_per_iteration": 2.5299642086029053 + }, + { + "auxiliary_loss_clip": 0.01236882, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.06948996, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.9364819041983576, + "language_loss": 0.78900939, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81206226, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.671875, + "step": 779, + "time_per_iteration": 2.4999477863311768 + }, + { + "auxiliary_loss_clip": 0.01230899, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.03997588, + "balance_loss_mlp": 1.06776333, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.7037291099106273, + "language_loss": 0.77051055, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7934795, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 780, + "time_per_iteration": 2.54770565032959 + }, + { + "auxiliary_loss_clip": 0.01235667, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.07070863, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.6789342331958745, + "language_loss": 0.76432645, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6484375, + "step": 781, + "time_per_iteration": 2.5040361881256104 + }, + { + "auxiliary_loss_clip": 0.01245917, + "auxiliary_loss_mlp": 0.01072818, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.07423282, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2171800145032736, + "language_loss": 0.74027473, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76346207, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 782, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.01241991, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.07483578, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 6.219089205177081, + "language_loss": 0.8032757, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82630414, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.671875, + "step": 783, + "time_per_iteration": 2.4866061210632324 + }, + { + "auxiliary_loss_clip": 0.01232605, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.03417742, + "balance_loss_mlp": 1.07062817, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0172272756643816, + "language_loss": 0.81289953, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83582091, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 784, + "time_per_iteration": 2.476659059524536 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.04597473, + "balance_loss_mlp": 1.0683856, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.171254656371271, + "language_loss": 0.8076694, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83078098, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 785, + "time_per_iteration": 2.493598461151123 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0762614, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.444819858404617, + "language_loss": 0.89981294, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92284781, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.65625, + "step": 786, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.012413, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.06742501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.379381752409287, + "language_loss": 0.76639462, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78950763, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.734375, + "step": 787, + "time_per_iteration": 2.447611093521118 + }, + { + "auxiliary_loss_clip": 0.01247236, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.04431772, + "balance_loss_mlp": 1.0765723, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.4642209511959403, + "language_loss": 0.80851126, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83170098, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7109375, + "step": 788, + "time_per_iteration": 2.4679956436157227 + }, + { + "auxiliary_loss_clip": 0.01236983, + "auxiliary_loss_mlp": 0.01074337, + "balance_loss_clip": 1.04551244, + "balance_loss_mlp": 1.07285857, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.2318634793178127, + "language_loss": 0.84819949, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87131274, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.640625, + "step": 789, + "time_per_iteration": 2.4982516765594482 + }, + { + "auxiliary_loss_clip": 0.01242053, + "auxiliary_loss_mlp": 0.01066276, + "balance_loss_clip": 1.04006219, + "balance_loss_mlp": 1.07367456, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.948517450129577, + "language_loss": 0.82196069, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84504396, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6875, + "step": 790, + "time_per_iteration": 2.4380602836608887 + }, + { + "auxiliary_loss_clip": 0.01236299, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.03524029, + "balance_loss_mlp": 1.06857598, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.8806939749630054, + "language_loss": 0.88245451, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90544093, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 791, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01239952, + "auxiliary_loss_mlp": 0.010655, + "balance_loss_clip": 1.03826034, + "balance_loss_mlp": 1.07212687, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.2584516419561464, + "language_loss": 0.90245461, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92550921, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 792, + "time_per_iteration": 2.4627771377563477 + }, + { + "auxiliary_loss_clip": 0.01241845, + "auxiliary_loss_mlp": 0.01074856, + "balance_loss_clip": 1.04874945, + "balance_loss_mlp": 1.07157969, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9386484459236437, + "language_loss": 0.7310667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 793, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01067209, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.07207203, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0117940746735123, + "language_loss": 0.86102074, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88411266, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.703125, + "step": 794, + "time_per_iteration": 2.510611057281494 + }, + { + "auxiliary_loss_clip": 0.0123999, + "auxiliary_loss_mlp": 0.01074174, + "balance_loss_clip": 1.04701805, + "balance_loss_mlp": 1.06925917, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.4118642482115384, + "language_loss": 0.69812739, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72126907, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.7109375, + "step": 795, + "time_per_iteration": 2.500420093536377 + }, + { + "auxiliary_loss_clip": 0.01236981, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.0432204, + "balance_loss_mlp": 1.06999111, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.0479238599532135, + "language_loss": 0.81053579, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 796, + "time_per_iteration": 2.4838409423828125 + }, + { + "auxiliary_loss_clip": 0.0124002, + "auxiliary_loss_mlp": 0.01058331, + "balance_loss_clip": 1.03129458, + "balance_loss_mlp": 1.07190371, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.3456590334750858, + "language_loss": 0.81249642, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83547997, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6796875, + "step": 797, + "time_per_iteration": 2.466343402862549 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.0100279, + "balance_loss_clip": 0.9972828, + "balance_loss_mlp": 1.03672731, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9120921080635288, + "language_loss": 0.64447635, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66572458, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.8515625, + "step": 798, + "time_per_iteration": 3.0081863403320312 + }, + { + "auxiliary_loss_clip": 0.01243937, + "auxiliary_loss_mlp": 0.01070197, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.06894708, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 7.0153313624744005, + "language_loss": 0.90794134, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93108267, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.75, + "step": 799, + "time_per_iteration": 2.4872424602508545 + }, + { + "auxiliary_loss_clip": 0.01242621, + "auxiliary_loss_mlp": 0.01069655, + "balance_loss_clip": 1.04220068, + "balance_loss_mlp": 1.07567, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.1467314479540818, + "language_loss": 0.86701, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89013278, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 800, + "time_per_iteration": 2.477720022201538 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.0106979, + "balance_loss_clip": 1.04362369, + "balance_loss_mlp": 1.07207572, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 7.517902152046504, + "language_loss": 0.84513009, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86826313, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.71875, + "step": 801, + "time_per_iteration": 2.487889528274536 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.07289147, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.97564705550146, + "language_loss": 0.79967415, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82280934, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 802, + "time_per_iteration": 2.6496224403381348 + }, + { + "auxiliary_loss_clip": 0.01238875, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_clip": 1.03963101, + "balance_loss_mlp": 1.07270598, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.8331626885697725, + "language_loss": 0.86420751, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88725173, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 803, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01238315, + "auxiliary_loss_mlp": 0.01061166, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.07398677, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.229653749186784, + "language_loss": 0.85436332, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87735808, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 804, + "time_per_iteration": 2.458303213119507 + }, + { + "auxiliary_loss_clip": 0.01239413, + "auxiliary_loss_mlp": 0.01066878, + "balance_loss_clip": 1.04099822, + "balance_loss_mlp": 1.07286024, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.2509331098011645, + "language_loss": 0.86119306, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88425595, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 805, + "time_per_iteration": 2.5021419525146484 + }, + { + "auxiliary_loss_clip": 0.01235031, + "auxiliary_loss_mlp": 0.01067273, + "balance_loss_clip": 1.04115391, + "balance_loss_mlp": 1.06942892, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8866019303880346, + "language_loss": 0.68034315, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70336622, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.65625, + "step": 806, + "time_per_iteration": 2.4904568195343018 + }, + { + "auxiliary_loss_clip": 0.01235579, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.03847933, + "balance_loss_mlp": 1.07208037, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.221107161276338, + "language_loss": 0.7716608, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79466188, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 807, + "time_per_iteration": 2.498624563217163 + }, + { + "auxiliary_loss_clip": 0.01232532, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.06831741, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.844770488216335, + "language_loss": 0.86509991, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88814163, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.640625, + "step": 808, + "time_per_iteration": 2.444673538208008 + }, + { + "auxiliary_loss_clip": 0.01242847, + "auxiliary_loss_mlp": 0.01070908, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.07261682, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9428867449931826, + "language_loss": 0.90154302, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92468053, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 809, + "time_per_iteration": 5.353702545166016 + }, + { + "auxiliary_loss_clip": 0.01242102, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.07577538, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.12821080633451, + "language_loss": 0.84360719, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86672825, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 810, + "time_per_iteration": 3.8935022354125977 + }, + { + "auxiliary_loss_clip": 0.01240735, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.07189715, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7610993085905569, + "language_loss": 0.80875039, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8318274, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6875, + "step": 811, + "time_per_iteration": 2.5000643730163574 + }, + { + "auxiliary_loss_clip": 0.01232044, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.04991412, + "balance_loss_mlp": 1.06997907, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.0417171226218715, + "language_loss": 0.74768531, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77075845, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.625, + "step": 812, + "time_per_iteration": 2.4853179454803467 + }, + { + "auxiliary_loss_clip": 0.01233917, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.07263327, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8904091966919716, + "language_loss": 0.89845109, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92153537, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 813, + "time_per_iteration": 2.6731016635894775 + }, + { + "auxiliary_loss_clip": 0.01232344, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.07083082, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.028357820963791, + "language_loss": 0.74551463, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76842451, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.6171875, + "step": 814, + "time_per_iteration": 2.509963035583496 + }, + { + "auxiliary_loss_clip": 0.01235531, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.07073569, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.3605733083261464, + "language_loss": 0.83740532, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86043149, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6484375, + "step": 815, + "time_per_iteration": 2.5490894317626953 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.03396082, + "balance_loss_mlp": 1.07326484, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.271155414035229, + "language_loss": 0.90803105, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93103218, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6640625, + "step": 816, + "time_per_iteration": 2.5273053646087646 + }, + { + "auxiliary_loss_clip": 0.01240454, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.0732162, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 3.2321750342473603, + "language_loss": 0.79924619, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.671875, + "step": 817, + "time_per_iteration": 2.5095019340515137 + }, + { + "auxiliary_loss_clip": 0.0123455, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.04864395, + "balance_loss_mlp": 1.07184172, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8264850687392937, + "language_loss": 0.84520394, + "learning_rate": 3.996142453363656e-06, + "loss": 0.86829674, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 818, + "time_per_iteration": 2.5476157665252686 + }, + { + "auxiliary_loss_clip": 0.01243386, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.07401037, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.779535734169796, + "language_loss": 0.75307131, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77617967, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6953125, + "step": 819, + "time_per_iteration": 2.5486624240875244 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.07577193, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.1475545017813853, + "language_loss": 0.85166955, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87468207, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.640625, + "step": 820, + "time_per_iteration": 2.4565298557281494 + }, + { + "auxiliary_loss_clip": 0.0123627, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03602266, + "balance_loss_mlp": 1.07061315, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.902695357085614, + "language_loss": 0.9041872, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92716837, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.65625, + "step": 821, + "time_per_iteration": 2.5412514209747314 + }, + { + "auxiliary_loss_clip": 0.01233424, + "auxiliary_loss_mlp": 0.01073041, + "balance_loss_clip": 1.04773307, + "balance_loss_mlp": 1.06951392, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0531707528144274, + "language_loss": 0.8941884, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91725308, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.640625, + "step": 822, + "time_per_iteration": 2.5171031951904297 + }, + { + "auxiliary_loss_clip": 0.01237258, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.03295374, + "balance_loss_mlp": 1.0742538, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.060390808888412, + "language_loss": 0.67537785, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69834983, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 823, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01120581, + "auxiliary_loss_mlp": 0.01008389, + "balance_loss_clip": 1.00323892, + "balance_loss_mlp": 1.04174662, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3777513990451415, + "language_loss": 0.62206292, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64335263, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.7890625, + "step": 824, + "time_per_iteration": 3.13708758354187 + }, + { + "auxiliary_loss_clip": 0.01240025, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.07293963, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.021638376413324, + "language_loss": 0.90364408, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92674464, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.671875, + "step": 825, + "time_per_iteration": 2.519487142562866 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01064311, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.0713625, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 23.06748840114486, + "language_loss": 0.66790086, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69091535, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.65625, + "step": 826, + "time_per_iteration": 2.486837387084961 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.07166433, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.4656671498779845, + "language_loss": 0.78386623, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80685055, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.640625, + "step": 827, + "time_per_iteration": 2.517092704772949 + }, + { + "auxiliary_loss_clip": 0.0124052, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.07333767, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.8940457048653916, + "language_loss": 0.78592682, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80905491, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.671875, + "step": 828, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01227721, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.06777728, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.330577425067274, + "language_loss": 0.83493364, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 829, + "time_per_iteration": 2.5744268894195557 + }, + { + "auxiliary_loss_clip": 0.01235678, + "auxiliary_loss_mlp": 0.01073434, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.07021666, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.2375926111489743, + "language_loss": 0.75055873, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.65625, + "step": 830, + "time_per_iteration": 2.5045461654663086 + }, + { + "auxiliary_loss_clip": 0.01233457, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.06966341, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.962979792887244, + "language_loss": 0.79379636, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81679052, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 831, + "time_per_iteration": 2.5924267768859863 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01070014, + "balance_loss_clip": 1.04487276, + "balance_loss_mlp": 1.07213569, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.758266217871517, + "language_loss": 0.91538632, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93846321, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.65625, + "step": 832, + "time_per_iteration": 2.653150796890259 + }, + { + "auxiliary_loss_clip": 0.01230534, + "auxiliary_loss_mlp": 0.01081981, + "balance_loss_clip": 1.05747163, + "balance_loss_mlp": 1.07053018, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9700093948003867, + "language_loss": 0.83139837, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85452354, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 833, + "time_per_iteration": 2.73848819732666 + }, + { + "auxiliary_loss_clip": 0.0123523, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.06913459, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.433665596415918, + "language_loss": 0.8254565, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84839165, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.65625, + "step": 834, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01236789, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.07138014, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.099554255469436, + "language_loss": 0.91758966, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94059587, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 835, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.0123437, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.06699944, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.4903656252358735, + "language_loss": 0.76346481, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78652561, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.671875, + "step": 836, + "time_per_iteration": 2.4839258193969727 + }, + { + "auxiliary_loss_clip": 0.01229978, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.07100809, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1380784235063066, + "language_loss": 0.8360337, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85906136, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5859375, + "step": 837, + "time_per_iteration": 2.5140485763549805 + }, + { + "auxiliary_loss_clip": 0.01233502, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.07245386, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.225982034212064, + "language_loss": 0.73137468, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75436556, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 838, + "time_per_iteration": 2.5128419399261475 + }, + { + "auxiliary_loss_clip": 0.01229023, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.06636167, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.886796600099776, + "language_loss": 0.83328462, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85625362, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 839, + "time_per_iteration": 2.499415874481201 + }, + { + "auxiliary_loss_clip": 0.01228207, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_clip": 1.04128349, + "balance_loss_mlp": 1.06866539, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.2056506497336765, + "language_loss": 0.85777193, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8807205, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 840, + "time_per_iteration": 2.522038698196411 + }, + { + "auxiliary_loss_clip": 0.01235877, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.07246661, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.034102412822674, + "language_loss": 0.94658732, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96958393, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 841, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01234454, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.07130527, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.030819255438432, + "language_loss": 0.77387047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79687953, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6328125, + "step": 842, + "time_per_iteration": 2.6253628730773926 + }, + { + "auxiliary_loss_clip": 0.01238804, + "auxiliary_loss_mlp": 0.01067813, + "balance_loss_clip": 1.041659, + "balance_loss_mlp": 1.07278991, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.283727909175907, + "language_loss": 0.78014457, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80321074, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6640625, + "step": 843, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.01237695, + "auxiliary_loss_mlp": 0.01061566, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.07266212, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.249210505837228, + "language_loss": 0.82952344, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85251611, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.6484375, + "step": 844, + "time_per_iteration": 2.6476500034332275 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.03935087, + "balance_loss_mlp": 1.06871867, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.3236550986537368, + "language_loss": 0.76042783, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78337395, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 845, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01075672, + "balance_loss_clip": 1.04924285, + "balance_loss_mlp": 1.06694174, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.2528566199281905, + "language_loss": 0.87468004, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89773357, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 846, + "time_per_iteration": 2.5271859169006348 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.04521692, + "balance_loss_mlp": 1.06982791, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.95159927266484, + "language_loss": 0.87571466, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89872456, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 847, + "time_per_iteration": 2.4566030502319336 + }, + { + "auxiliary_loss_clip": 0.01226009, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.04489946, + "balance_loss_mlp": 1.06883907, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.141846591022022, + "language_loss": 0.81706643, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84003675, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5703125, + "step": 848, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0123182, + "auxiliary_loss_mlp": 0.01077851, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.07167053, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.898868752622741, + "language_loss": 0.87266076, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89575738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 849, + "time_per_iteration": 2.5472936630249023 + }, + { + "auxiliary_loss_clip": 0.0122487, + "auxiliary_loss_mlp": 0.01062562, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.06569946, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8637209623848903, + "language_loss": 0.83340889, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85628319, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.59375, + "step": 850, + "time_per_iteration": 2.493814468383789 + }, + { + "auxiliary_loss_clip": 0.01229016, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.03847528, + "balance_loss_mlp": 1.06816506, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.1400408414194154, + "language_loss": 0.6501807, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.609375, + "step": 851, + "time_per_iteration": 5.443026065826416 + }, + { + "auxiliary_loss_clip": 0.01228781, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.04424942, + "balance_loss_mlp": 1.0674876, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2562645326336686, + "language_loss": 0.8376134, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86061573, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 852, + "time_per_iteration": 2.4753623008728027 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.06879044, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9405819970113303, + "language_loss": 0.80252314, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82552135, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 853, + "time_per_iteration": 2.5048112869262695 + }, + { + "auxiliary_loss_clip": 0.01226539, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.03432584, + "balance_loss_mlp": 1.06710184, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8237562231360178, + "language_loss": 0.75846469, + "learning_rate": 3.995223022193999e-06, + "loss": 0.7813375, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 854, + "time_per_iteration": 2.53165602684021 + }, + { + "auxiliary_loss_clip": 0.01233418, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_clip": 1.03678393, + "balance_loss_mlp": 1.07139039, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.718422527893707, + "language_loss": 0.81173462, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83470446, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 855, + "time_per_iteration": 2.5610744953155518 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01020682, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.03902698, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0071030268205712, + "language_loss": 0.65609074, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67743033, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.7421875, + "step": 856, + "time_per_iteration": 3.0546581745147705 + }, + { + "auxiliary_loss_clip": 0.01224884, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.02823043, + "balance_loss_mlp": 1.06811357, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8111088050205955, + "language_loss": 0.76996124, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79274821, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5625, + "step": 857, + "time_per_iteration": 2.6051554679870605 + }, + { + "auxiliary_loss_clip": 0.01229705, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03130889, + "balance_loss_mlp": 1.06846082, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 3.7937823779894377, + "language_loss": 0.88893878, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91181171, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6171875, + "step": 858, + "time_per_iteration": 2.4517769813537598 + }, + { + "auxiliary_loss_clip": 0.01228685, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.03000832, + "balance_loss_mlp": 1.06902003, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.9531750101692102, + "language_loss": 0.75199753, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77484941, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 859, + "time_per_iteration": 2.5090014934539795 + }, + { + "auxiliary_loss_clip": 0.01237239, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.04280758, + "balance_loss_mlp": 1.06980002, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.092452223155828, + "language_loss": 0.90812773, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93120927, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.671875, + "step": 860, + "time_per_iteration": 2.437220335006714 + }, + { + "auxiliary_loss_clip": 0.01231057, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.03568769, + "balance_loss_mlp": 1.0717634, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.9189860758016508, + "language_loss": 0.82252973, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8454473, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.59375, + "step": 861, + "time_per_iteration": 2.50883412361145 + }, + { + "auxiliary_loss_clip": 0.01233216, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.03177071, + "balance_loss_mlp": 1.0704143, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.0352629197197762, + "language_loss": 0.78607392, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.625, + "step": 862, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01229413, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.07291067, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9841587361763113, + "language_loss": 0.88999134, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91296881, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5625, + "step": 863, + "time_per_iteration": 2.506289005279541 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.07635331, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.003999649515418, + "language_loss": 0.7575798, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.625, + "step": 864, + "time_per_iteration": 2.515944480895996 + }, + { + "auxiliary_loss_clip": 0.01236545, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.04228067, + "balance_loss_mlp": 1.07355332, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9298630836237705, + "language_loss": 0.7919569, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81501746, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6328125, + "step": 865, + "time_per_iteration": 2.485499620437622 + }, + { + "auxiliary_loss_clip": 0.0123268, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.04144871, + "balance_loss_mlp": 1.07079291, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.366131428952597, + "language_loss": 0.85700798, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88000321, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 866, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01242589, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_clip": 1.03910398, + "balance_loss_mlp": 1.0804987, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.535209572965093, + "language_loss": 0.8680315, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89111662, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 867, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01231644, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.05021977, + "balance_loss_mlp": 1.07513499, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.64188364663517, + "language_loss": 0.63562089, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65867579, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.5625, + "step": 868, + "time_per_iteration": 2.567958354949951 + }, + { + "auxiliary_loss_clip": 0.01236968, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.07263327, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1448269109564198, + "language_loss": 0.83076257, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85379148, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.6484375, + "step": 869, + "time_per_iteration": 2.5021841526031494 + }, + { + "auxiliary_loss_clip": 0.01237154, + "auxiliary_loss_mlp": 0.01057742, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.07245827, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.352948725027126, + "language_loss": 0.87544227, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89839119, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6484375, + "step": 870, + "time_per_iteration": 2.459662437438965 + }, + { + "auxiliary_loss_clip": 0.01238457, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_clip": 1.04135191, + "balance_loss_mlp": 1.07536197, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9212028950510787, + "language_loss": 0.80554998, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82860637, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6328125, + "step": 871, + "time_per_iteration": 2.4701170921325684 + }, + { + "auxiliary_loss_clip": 0.01234905, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.07576704, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.5975290841395262, + "language_loss": 0.81374049, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8367365, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.59375, + "step": 872, + "time_per_iteration": 2.4886369705200195 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.0101489, + "balance_loss_clip": 1.00952566, + "balance_loss_mlp": 1.03955865, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8879269166117758, + "language_loss": 0.61579192, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63705552, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.71875, + "step": 873, + "time_per_iteration": 2.9913430213928223 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01058247, + "balance_loss_clip": 1.03245032, + "balance_loss_mlp": 1.07107997, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8426182555123698, + "language_loss": 0.88426232, + "learning_rate": 3.994669855111643e-06, + "loss": 0.90716141, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 874, + "time_per_iteration": 2.4794461727142334 + }, + { + "auxiliary_loss_clip": 0.0123222, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.06908488, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 2.2494767595307628, + "language_loss": 0.74779439, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77073956, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 875, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.01228414, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.06905699, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.052141253618648, + "language_loss": 0.92836702, + "learning_rate": 3.99461287422531e-06, + "loss": 0.951262, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.59375, + "step": 876, + "time_per_iteration": 2.535587787628174 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01009036, + "balance_loss_clip": 1.00379074, + "balance_loss_mlp": 1.03698087, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.854570032578524, + "language_loss": 0.62934959, + "learning_rate": 3.994584270327722e-06, + "loss": 0.6505053, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.6953125, + "step": 877, + "time_per_iteration": 3.094581127166748 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.06975055, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.154366240232031, + "language_loss": 0.85691291, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 878, + "time_per_iteration": 2.5052285194396973 + }, + { + "auxiliary_loss_clip": 0.01232133, + "auxiliary_loss_mlp": 0.01063559, + "balance_loss_clip": 1.03754723, + "balance_loss_mlp": 1.06974411, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.0833089409086942, + "language_loss": 0.82790506, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85086197, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.625, + "step": 879, + "time_per_iteration": 2.564312219619751 + }, + { + "auxiliary_loss_clip": 0.01227867, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04011488, + "balance_loss_mlp": 1.06966615, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 4.271066320440391, + "language_loss": 0.84404933, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86699677, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 880, + "time_per_iteration": 2.4854133129119873 + }, + { + "auxiliary_loss_clip": 0.01233797, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.04360688, + "balance_loss_mlp": 1.07206059, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 3.515636761469604, + "language_loss": 0.87156737, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89460361, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 881, + "time_per_iteration": 2.476846933364868 + }, + { + "auxiliary_loss_clip": 0.01228751, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.06813371, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9345214626214409, + "language_loss": 0.87682849, + "learning_rate": 3.994440116339046e-06, + "loss": 0.89977539, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.609375, + "step": 882, + "time_per_iteration": 2.6449031829833984 + }, + { + "auxiliary_loss_clip": 0.01233714, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_clip": 1.03825057, + "balance_loss_mlp": 1.07030129, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.7245054008776814, + "language_loss": 0.68869275, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71168661, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6328125, + "step": 883, + "time_per_iteration": 2.620363235473633 + }, + { + "auxiliary_loss_clip": 0.01225388, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.06937146, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.9628498458506696, + "language_loss": 0.75887203, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78173113, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5546875, + "step": 884, + "time_per_iteration": 2.4948067665100098 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.06921601, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.00306560312032, + "language_loss": 0.85323638, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 885, + "time_per_iteration": 2.5159530639648438 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01068973, + "balance_loss_clip": 1.04205549, + "balance_loss_mlp": 1.06673646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6316893825734344, + "language_loss": 0.85726082, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88023585, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6171875, + "step": 886, + "time_per_iteration": 2.4650700092315674 + }, + { + "auxiliary_loss_clip": 0.01226585, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.06944001, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1517488326805214, + "language_loss": 0.89229804, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91522843, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5703125, + "step": 887, + "time_per_iteration": 2.5020337104797363 + }, + { + "auxiliary_loss_clip": 0.01227687, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.04270935, + "balance_loss_mlp": 1.06604195, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.2836036404275593, + "language_loss": 0.75076836, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77375484, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6171875, + "step": 888, + "time_per_iteration": 2.5055694580078125 + }, + { + "auxiliary_loss_clip": 0.0123028, + "auxiliary_loss_mlp": 0.0107639, + "balance_loss_clip": 1.0480895, + "balance_loss_mlp": 1.07113457, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.032914331295681, + "language_loss": 0.88330352, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90637028, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.59375, + "step": 889, + "time_per_iteration": 2.5147650241851807 + }, + { + "auxiliary_loss_clip": 0.01222875, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.02748489, + "balance_loss_mlp": 1.06732821, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9726085703824752, + "language_loss": 0.88269985, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90546036, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5546875, + "step": 890, + "time_per_iteration": 2.490300416946411 + }, + { + "auxiliary_loss_clip": 0.01225662, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.04612982, + "balance_loss_mlp": 1.06690812, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.320271972022273, + "language_loss": 0.93251556, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95548671, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 891, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.06682086, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.238230674372026, + "language_loss": 0.71759057, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74046671, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5859375, + "step": 892, + "time_per_iteration": 2.5544779300689697 + }, + { + "auxiliary_loss_clip": 0.01229119, + "auxiliary_loss_mlp": 0.01067529, + "balance_loss_clip": 1.0421617, + "balance_loss_mlp": 1.06946719, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.3204520758070037, + "language_loss": 0.82304287, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84600937, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6015625, + "step": 893, + "time_per_iteration": 5.3903117179870605 + }, + { + "auxiliary_loss_clip": 0.0122945, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.04430699, + "balance_loss_mlp": 1.0679965, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.3808217776212937, + "language_loss": 0.81695569, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83995366, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.609375, + "step": 894, + "time_per_iteration": 2.52809476852417 + }, + { + "auxiliary_loss_clip": 0.01227471, + "auxiliary_loss_mlp": 0.01065449, + "balance_loss_clip": 1.03915119, + "balance_loss_mlp": 1.06881404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.5337894710206093, + "language_loss": 0.76043701, + "learning_rate": 3.994056467679221e-06, + "loss": 0.7833662, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 895, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.01238307, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03022122, + "balance_loss_mlp": 1.07260597, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.2065839001211156, + "language_loss": 0.86456096, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88751751, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.65625, + "step": 896, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01231325, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.03495908, + "balance_loss_mlp": 1.06809413, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.1680285530564274, + "language_loss": 0.87949234, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90243232, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6328125, + "step": 897, + "time_per_iteration": 2.457918167114258 + }, + { + "auxiliary_loss_clip": 0.0122574, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.06723523, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7359050724031848, + "language_loss": 0.9035244, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9264195, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.5859375, + "step": 898, + "time_per_iteration": 2.4593143463134766 + }, + { + "auxiliary_loss_clip": 0.01234899, + "auxiliary_loss_mlp": 0.01084595, + "balance_loss_clip": 1.05808282, + "balance_loss_mlp": 1.07024622, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.958355519485596, + "language_loss": 0.91756964, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94076455, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6484375, + "step": 899, + "time_per_iteration": 2.4461729526519775 + }, + { + "auxiliary_loss_clip": 0.01225208, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.03964233, + "balance_loss_mlp": 1.06601286, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.6493739136310643, + "language_loss": 0.75594276, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77884829, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 900, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.01231903, + "auxiliary_loss_mlp": 0.01059763, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.06860638, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.2496787705299908, + "language_loss": 0.7377668, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76068342, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.6328125, + "step": 901, + "time_per_iteration": 2.49638032913208 + }, + { + "auxiliary_loss_clip": 0.01221671, + "auxiliary_loss_mlp": 0.01074944, + "balance_loss_clip": 1.04982698, + "balance_loss_mlp": 1.06662059, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.0553503619333586, + "language_loss": 0.85004938, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87301552, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 902, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01226177, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.06769705, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.0002475654879195, + "language_loss": 0.8655951, + "learning_rate": 3.993814024394569e-06, + "loss": 0.8884868, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 903, + "time_per_iteration": 2.522193670272827 + }, + { + "auxiliary_loss_clip": 0.01227512, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.06904316, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.4298091072226855, + "language_loss": 0.74835998, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77125704, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.578125, + "step": 904, + "time_per_iteration": 2.456969976425171 + }, + { + "auxiliary_loss_clip": 0.0123038, + "auxiliary_loss_mlp": 0.01073252, + "balance_loss_clip": 1.04685879, + "balance_loss_mlp": 1.06905615, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.0843949675352356, + "language_loss": 0.85750329, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8805396, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.609375, + "step": 905, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01227222, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.05317712, + "balance_loss_mlp": 1.07247257, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7937911991915148, + "language_loss": 0.74028552, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76334012, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 906, + "time_per_iteration": 2.468331813812256 + }, + { + "auxiliary_loss_clip": 0.01228766, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.03909731, + "balance_loss_mlp": 1.06858826, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.220044948377472, + "language_loss": 0.87410975, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89705634, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6015625, + "step": 907, + "time_per_iteration": 2.5177390575408936 + }, + { + "auxiliary_loss_clip": 0.01227557, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.04521942, + "balance_loss_mlp": 1.07002556, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.8689281211501179, + "language_loss": 0.86915505, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89214909, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.578125, + "step": 908, + "time_per_iteration": 2.45135498046875 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.06842148, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.409525813232516, + "language_loss": 0.89454836, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91748714, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 909, + "time_per_iteration": 2.4702625274658203 + }, + { + "auxiliary_loss_clip": 0.01231345, + "auxiliary_loss_mlp": 0.01075786, + "balance_loss_clip": 1.04773629, + "balance_loss_mlp": 1.06930447, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.4022545211155593, + "language_loss": 0.70942473, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73249602, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.625, + "step": 910, + "time_per_iteration": 2.4530797004699707 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01055797, + "balance_loss_clip": 1.03002357, + "balance_loss_mlp": 1.06815219, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.0100188286094745, + "language_loss": 0.8349818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85778737, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5625, + "step": 911, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.01224017, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.06649613, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.746196883211308, + "language_loss": 0.76096344, + "learning_rate": 3.993535491899736e-06, + "loss": 0.7839244, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 912, + "time_per_iteration": 2.4651522636413574 + }, + { + "auxiliary_loss_clip": 0.01219912, + "auxiliary_loss_mlp": 0.01052416, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.06664968, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.385296939765248, + "language_loss": 0.82667339, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84939671, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 913, + "time_per_iteration": 2.475384473800659 + }, + { + "auxiliary_loss_clip": 0.01224168, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.07065797, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.227172084037845, + "language_loss": 0.83470452, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85756505, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 914, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.03324902, + "balance_loss_mlp": 1.07264161, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.897688985464872, + "language_loss": 0.9010309, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92390066, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5625, + "step": 915, + "time_per_iteration": 2.492981433868408 + }, + { + "auxiliary_loss_clip": 0.01225584, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.03046489, + "balance_loss_mlp": 1.0708915, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.870109983937874, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91836905, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 916, + "time_per_iteration": 2.4621188640594482 + }, + { + "auxiliary_loss_clip": 0.01228011, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.04593801, + "balance_loss_mlp": 1.06942379, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7933741103180343, + "language_loss": 0.80085957, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386243, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 917, + "time_per_iteration": 2.49455189704895 + }, + { + "auxiliary_loss_clip": 0.01225592, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.06678224, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9216560267302982, + "language_loss": 0.79673612, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81957722, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 918, + "time_per_iteration": 2.504734516143799 + }, + { + "auxiliary_loss_clip": 0.01223712, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.06658053, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9394116717498289, + "language_loss": 0.89132315, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91415823, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5703125, + "step": 919, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01219042, + "auxiliary_loss_mlp": 0.01068553, + "balance_loss_clip": 1.0427916, + "balance_loss_mlp": 1.06515777, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.688355226699252, + "language_loss": 0.87421197, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 920, + "time_per_iteration": 2.536914348602295 + }, + { + "auxiliary_loss_clip": 0.01223828, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.06937671, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1255302161497704, + "language_loss": 0.65921712, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68208569, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.546875, + "step": 921, + "time_per_iteration": 2.643416166305542 + }, + { + "auxiliary_loss_clip": 0.01229793, + "auxiliary_loss_mlp": 0.0106877, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.0698204, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.143682946402907, + "language_loss": 0.71841472, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74140036, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.6015625, + "step": 922, + "time_per_iteration": 2.4544074535369873 + }, + { + "auxiliary_loss_clip": 0.0122536, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.0669136, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.766492717488127, + "language_loss": 0.82548857, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84844404, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 923, + "time_per_iteration": 2.490915536880493 + }, + { + "auxiliary_loss_clip": 0.01221243, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.06429458, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2095756655687397, + "language_loss": 0.78808558, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81097853, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5625, + "step": 924, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.0121918, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.06480467, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9513803878946447, + "language_loss": 1.02250028, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04528582, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 925, + "time_per_iteration": 2.5296268463134766 + }, + { + "auxiliary_loss_clip": 0.01220429, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02975261, + "balance_loss_mlp": 1.0634799, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.3756260245044687, + "language_loss": 0.80808276, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83084333, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 926, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01229405, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.06743848, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.4713559623940924, + "language_loss": 0.73378903, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75676566, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 927, + "time_per_iteration": 2.5607478618621826 + }, + { + "auxiliary_loss_clip": 0.01103967, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.00837731, + "balance_loss_mlp": 1.03639269, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7814837823676635, + "language_loss": 0.5989722, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62015712, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.67578125, + "step": 928, + "time_per_iteration": 3.0945305824279785 + }, + { + "auxiliary_loss_clip": 0.01223562, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.04035151, + "balance_loss_mlp": 1.06729245, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.3037954576101587, + "language_loss": 0.95011377, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97301698, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5625, + "step": 929, + "time_per_iteration": 2.527270555496216 + }, + { + "auxiliary_loss_clip": 0.01221186, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.06494856, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1540114832188553, + "language_loss": 0.71827871, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74116725, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.5625, + "step": 930, + "time_per_iteration": 2.57513689994812 + }, + { + "auxiliary_loss_clip": 0.01227654, + "auxiliary_loss_mlp": 0.01062398, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.06905401, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.336481182624628, + "language_loss": 0.85333288, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87623346, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5859375, + "step": 931, + "time_per_iteration": 2.459167957305908 + }, + { + "auxiliary_loss_clip": 0.01224553, + "auxiliary_loss_mlp": 0.01072004, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.06556344, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.9723738142749898, + "language_loss": 0.83577204, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85873753, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.59375, + "step": 932, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.01223225, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.04012322, + "balance_loss_mlp": 1.06712675, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.411257667891357, + "language_loss": 0.73405433, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5625, + "step": 933, + "time_per_iteration": 2.526521682739258 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.04328358, + "balance_loss_mlp": 1.06432819, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.577929883809357, + "language_loss": 0.86850882, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89141059, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5546875, + "step": 934, + "time_per_iteration": 5.338034391403198 + }, + { + "auxiliary_loss_clip": 0.01220003, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.06842983, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2060919587088965, + "language_loss": 0.80243224, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82532918, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 935, + "time_per_iteration": 3.8198087215423584 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01008984, + "balance_loss_clip": 1.00321388, + "balance_loss_mlp": 1.02876139, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8225714537835027, + "language_loss": 0.69179416, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71282923, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.65625, + "step": 936, + "time_per_iteration": 2.9585764408111572 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.01067113, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.06387568, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.5168182860703237, + "language_loss": 0.75900578, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78188324, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 937, + "time_per_iteration": 2.4891855716705322 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01002728, + "balance_loss_clip": 0.99738711, + "balance_loss_mlp": 1.02642298, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8631606334327763, + "language_loss": 0.64287508, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66381979, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.65625, + "step": 938, + "time_per_iteration": 3.0239782333374023 + }, + { + "auxiliary_loss_clip": 0.01226335, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.04487348, + "balance_loss_mlp": 1.06571174, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.570077538128457, + "language_loss": 0.7903074, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81329048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 939, + "time_per_iteration": 2.494706630706787 + }, + { + "auxiliary_loss_clip": 0.012214, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03868759, + "balance_loss_mlp": 1.0669229, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.950609958048397, + "language_loss": 0.73893893, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76179242, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 940, + "time_per_iteration": 2.5279061794281006 + }, + { + "auxiliary_loss_clip": 0.01220257, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_clip": 1.03776574, + "balance_loss_mlp": 1.06722569, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.9142676693922898, + "language_loss": 0.70475829, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72760499, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 941, + "time_per_iteration": 2.551604747772217 + }, + { + "auxiliary_loss_clip": 0.01218348, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.06624675, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.3045436850665917, + "language_loss": 0.80928791, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.515625, + "step": 942, + "time_per_iteration": 2.515646457672119 + }, + { + "auxiliary_loss_clip": 0.01214197, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.062042, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.7900678467193205, + "language_loss": 0.88067353, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.9033941, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 943, + "time_per_iteration": 2.674614191055298 + }, + { + "auxiliary_loss_clip": 0.01220399, + "auxiliary_loss_mlp": 0.01056577, + "balance_loss_clip": 1.03182912, + "balance_loss_mlp": 1.06757212, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.6837069047913924, + "language_loss": 0.75092185, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77369165, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5234375, + "step": 944, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01215674, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.0385294, + "balance_loss_mlp": 1.06267428, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7462690351912153, + "language_loss": 0.79321784, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8160013, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 945, + "time_per_iteration": 2.695613384246826 + }, + { + "auxiliary_loss_clip": 0.01218347, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.03628159, + "balance_loss_mlp": 1.06407309, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.1794845223078556, + "language_loss": 0.82465631, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84745914, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 946, + "time_per_iteration": 2.6081790924072266 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.06615055, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.7693395657309297, + "language_loss": 0.7904911, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8133198, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5703125, + "step": 947, + "time_per_iteration": 2.460472822189331 + }, + { + "auxiliary_loss_clip": 0.01227462, + "auxiliary_loss_mlp": 0.01065027, + "balance_loss_clip": 1.03890848, + "balance_loss_mlp": 1.06883287, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 7.046260534289203, + "language_loss": 0.85772789, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88065279, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 948, + "time_per_iteration": 2.4560892581939697 + }, + { + "auxiliary_loss_clip": 0.01217019, + "auxiliary_loss_mlp": 0.01060985, + "balance_loss_clip": 1.03374553, + "balance_loss_mlp": 1.06329989, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.8055084405958775, + "language_loss": 0.87044799, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89322805, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5390625, + "step": 949, + "time_per_iteration": 2.4843316078186035 + }, + { + "auxiliary_loss_clip": 0.01212611, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.06284809, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.230679935648155, + "language_loss": 0.79035759, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81314665, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4921875, + "step": 950, + "time_per_iteration": 2.468172311782837 + }, + { + "auxiliary_loss_clip": 0.01221984, + "auxiliary_loss_mlp": 0.01068601, + "balance_loss_clip": 1.04365039, + "balance_loss_mlp": 1.06574106, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0871877141587682, + "language_loss": 0.8244521, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 951, + "time_per_iteration": 2.5418505668640137 + }, + { + "auxiliary_loss_clip": 0.01215404, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.06129527, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.5400916768099426, + "language_loss": 0.86685216, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88963258, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5390625, + "step": 952, + "time_per_iteration": 2.513356924057007 + }, + { + "auxiliary_loss_clip": 0.0122001, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.03415656, + "balance_loss_mlp": 1.06145215, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.725154467975805, + "language_loss": 0.79043579, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81326544, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5859375, + "step": 953, + "time_per_iteration": 2.490940570831299 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.03592086, + "balance_loss_mlp": 1.06757712, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2937199779067106, + "language_loss": 0.87086606, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89373398, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5625, + "step": 954, + "time_per_iteration": 2.495039701461792 + }, + { + "auxiliary_loss_clip": 0.01221375, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_clip": 1.03707159, + "balance_loss_mlp": 1.06446028, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3514674671771933, + "language_loss": 0.87789929, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90073651, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 955, + "time_per_iteration": 2.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.06217909, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.7193659196918576, + "language_loss": 0.89682388, + "learning_rate": 3.992085650224914e-06, + "loss": 0.919631, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 956, + "time_per_iteration": 2.43306565284729 + }, + { + "auxiliary_loss_clip": 0.01212174, + "auxiliary_loss_mlp": 0.0105844, + "balance_loss_clip": 1.03232098, + "balance_loss_mlp": 1.06344521, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7667772588634594, + "language_loss": 0.75335747, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77606356, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.484375, + "step": 957, + "time_per_iteration": 2.469240665435791 + }, + { + "auxiliary_loss_clip": 0.01218166, + "auxiliary_loss_mlp": 0.01075955, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.06214452, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8974624224625587, + "language_loss": 0.79871029, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82165146, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5625, + "step": 958, + "time_per_iteration": 2.5016849040985107 + }, + { + "auxiliary_loss_clip": 0.01214009, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.06024444, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5702669091422234, + "language_loss": 0.88410264, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90686285, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.5390625, + "step": 959, + "time_per_iteration": 2.4830191135406494 + }, + { + "auxiliary_loss_clip": 0.01211651, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.0626018, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.6997220185951347, + "language_loss": 0.78556621, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083173, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4921875, + "step": 960, + "time_per_iteration": 2.569218397140503 + }, + { + "auxiliary_loss_clip": 0.01217172, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.06168103, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 4.159271492638429, + "language_loss": 0.932491, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95529813, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5546875, + "step": 961, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01215042, + "auxiliary_loss_mlp": 0.01070899, + "balance_loss_clip": 1.04411268, + "balance_loss_mlp": 1.06039667, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.532017623976099, + "language_loss": 0.6822986, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70515805, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.546875, + "step": 962, + "time_per_iteration": 2.544498920440674 + }, + { + "auxiliary_loss_clip": 0.01214012, + "auxiliary_loss_mlp": 0.01068596, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.06268489, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.445305128304827, + "language_loss": 0.88187808, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90470415, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.515625, + "step": 963, + "time_per_iteration": 2.459487199783325 + }, + { + "auxiliary_loss_clip": 0.01222623, + "auxiliary_loss_mlp": 0.01058866, + "balance_loss_clip": 1.03337944, + "balance_loss_mlp": 1.06633568, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.5656796350524473, + "language_loss": 0.84858835, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87140322, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 964, + "time_per_iteration": 2.5268235206604004 + }, + { + "auxiliary_loss_clip": 0.01216658, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_clip": 1.04157782, + "balance_loss_mlp": 1.06309247, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 2.846103019544017, + "language_loss": 0.77748007, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80032492, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5390625, + "step": 965, + "time_per_iteration": 2.4572315216064453 + }, + { + "auxiliary_loss_clip": 0.01211478, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.04424393, + "balance_loss_mlp": 1.0614084, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4479010977704463, + "language_loss": 0.80922461, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83202475, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5, + "step": 966, + "time_per_iteration": 2.4682776927948 + }, + { + "auxiliary_loss_clip": 0.01212307, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.06173599, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8643875206872442, + "language_loss": 0.76291096, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78565276, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.5, + "step": 967, + "time_per_iteration": 2.453474521636963 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.02152586, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7926144837125159, + "language_loss": 0.57362092, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59487474, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.05102539, + "router_z_loss_mlp": 0.6328125, + "step": 968, + "time_per_iteration": 2.994419574737549 + }, + { + "auxiliary_loss_clip": 0.01218807, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02865148, + "balance_loss_mlp": 1.06574845, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.057389892616485, + "language_loss": 0.82289147, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84563303, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 969, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01217673, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.03105259, + "balance_loss_mlp": 1.06392384, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1897875503845725, + "language_loss": 0.780442, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 970, + "time_per_iteration": 2.5632171630859375 + }, + { + "auxiliary_loss_clip": 0.01216631, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_clip": 1.02809155, + "balance_loss_mlp": 1.06188202, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6802242915962, + "language_loss": 0.92492616, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94764245, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 971, + "time_per_iteration": 2.4642531871795654 + }, + { + "auxiliary_loss_clip": 0.01210603, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.03439212, + "balance_loss_mlp": 1.05865097, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 3.0470884327064276, + "language_loss": 0.86133701, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88404, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 972, + "time_per_iteration": 2.5298526287078857 + }, + { + "auxiliary_loss_clip": 0.01212752, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_clip": 1.04038596, + "balance_loss_mlp": 1.0636549, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0754734138997906, + "language_loss": 0.87340444, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89617872, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4921875, + "step": 973, + "time_per_iteration": 2.5198311805725098 + }, + { + "auxiliary_loss_clip": 0.01213937, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_clip": 1.04070425, + "balance_loss_mlp": 1.06140256, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.2539468590332707, + "language_loss": 0.74868345, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77149546, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5234375, + "step": 974, + "time_per_iteration": 2.465268850326538 + }, + { + "auxiliary_loss_clip": 0.0121359, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.03308713, + "balance_loss_mlp": 1.06260133, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.7891188847385684, + "language_loss": 0.76707923, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78980577, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 975, + "time_per_iteration": 2.633850336074829 + }, + { + "auxiliary_loss_clip": 0.01216778, + "auxiliary_loss_mlp": 0.01068456, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.0621978, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0981769673049326, + "language_loss": 0.76878488, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79163718, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 976, + "time_per_iteration": 6.8309245109558105 + }, + { + "auxiliary_loss_clip": 0.01210296, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.0585494, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8109666318996334, + "language_loss": 0.87465948, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89737761, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 977, + "time_per_iteration": 2.5693395137786865 + }, + { + "auxiliary_loss_clip": 0.01213396, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.06246471, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.7886661734827753, + "language_loss": 0.79517525, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5078125, + "step": 978, + "time_per_iteration": 2.51609206199646 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.04339027, + "balance_loss_mlp": 1.06304932, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.6270410794651102, + "language_loss": 0.80902123, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.546875, + "step": 979, + "time_per_iteration": 2.527127504348755 + }, + { + "auxiliary_loss_clip": 0.01085971, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.0044651, + "balance_loss_mlp": 1.02304745, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.94528472512207, + "language_loss": 0.59059429, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61154944, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.62890625, + "step": 980, + "time_per_iteration": 2.9545915126800537 + }, + { + "auxiliary_loss_clip": 0.01210703, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.03747201, + "balance_loss_mlp": 1.0622623, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3915266710240917, + "language_loss": 0.86397457, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88672185, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.484375, + "step": 981, + "time_per_iteration": 2.4726293087005615 + }, + { + "auxiliary_loss_clip": 0.01212695, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.06214404, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 1.9485203495729437, + "language_loss": 0.79623365, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81893563, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.5078125, + "step": 982, + "time_per_iteration": 2.5271458625793457 + }, + { + "auxiliary_loss_clip": 0.01219179, + "auxiliary_loss_mlp": 0.01060762, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.06248748, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.5320957946125437, + "language_loss": 0.84376037, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86655974, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 983, + "time_per_iteration": 2.526364803314209 + }, + { + "auxiliary_loss_clip": 0.01212847, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_clip": 1.04361129, + "balance_loss_mlp": 1.06317604, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8446015864025267, + "language_loss": 0.84607553, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86887848, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.4921875, + "step": 984, + "time_per_iteration": 2.456803321838379 + }, + { + "auxiliary_loss_clip": 0.01211466, + "auxiliary_loss_mlp": 0.01059154, + "balance_loss_clip": 1.03551483, + "balance_loss_mlp": 1.06338882, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.3276500524021495, + "language_loss": 0.77875566, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80146182, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.484375, + "step": 985, + "time_per_iteration": 2.504096508026123 + }, + { + "auxiliary_loss_clip": 0.01215785, + "auxiliary_loss_mlp": 0.01061307, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.06191659, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.294716701848832, + "language_loss": 0.90598249, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92875338, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.5390625, + "step": 986, + "time_per_iteration": 2.4882049560546875 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.01062373, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.06017947, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.8181645576894256, + "language_loss": 0.7589798, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78175771, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 987, + "time_per_iteration": 2.492412805557251 + }, + { + "auxiliary_loss_clip": 0.01216653, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.03491902, + "balance_loss_mlp": 1.06059265, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.1447391932017843, + "language_loss": 0.71525705, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73802304, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 988, + "time_per_iteration": 2.6386756896972656 + }, + { + "auxiliary_loss_clip": 0.01081383, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_clip": 1.00680876, + "balance_loss_mlp": 1.01888978, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9344259157338769, + "language_loss": 0.71159971, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73253405, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.625, + "step": 989, + "time_per_iteration": 2.903996706008911 + }, + { + "auxiliary_loss_clip": 0.01219656, + "auxiliary_loss_mlp": 0.01067443, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.06221163, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.89069901477269, + "language_loss": 0.78102934, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80390036, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.578125, + "step": 990, + "time_per_iteration": 2.6252431869506836 + }, + { + "auxiliary_loss_clip": 0.01208224, + "auxiliary_loss_mlp": 0.0105602, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05700588, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.077710223302236, + "language_loss": 0.86406755, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88671005, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.515625, + "step": 991, + "time_per_iteration": 2.483853340148926 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04030573, + "balance_loss_mlp": 1.06190968, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.866628977756486, + "language_loss": 0.76876801, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79158413, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 992, + "time_per_iteration": 2.5149648189544678 + }, + { + "auxiliary_loss_clip": 0.01214781, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03701937, + "balance_loss_mlp": 1.06251192, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.726921793738851, + "language_loss": 0.74594641, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76869899, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.5234375, + "step": 993, + "time_per_iteration": 2.4739816188812256 + }, + { + "auxiliary_loss_clip": 0.01214249, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.06326771, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 3.2517233877247396, + "language_loss": 0.78911841, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81197453, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 994, + "time_per_iteration": 2.5408835411071777 + }, + { + "auxiliary_loss_clip": 0.01214677, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.05768251, + "balance_loss_mlp": 1.06170893, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.42517884603863, + "language_loss": 0.79639304, + "learning_rate": 3.99067829878596e-06, + "loss": 0.81936711, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 995, + "time_per_iteration": 2.5062758922576904 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.05969059, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.536496545288829, + "language_loss": 0.86939722, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89217806, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 996, + "time_per_iteration": 2.5236001014709473 + }, + { + "auxiliary_loss_clip": 0.01217352, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.06309104, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.013698471354103, + "language_loss": 0.88192105, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90479505, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.546875, + "step": 997, + "time_per_iteration": 2.483116626739502 + }, + { + "auxiliary_loss_clip": 0.01079761, + "auxiliary_loss_mlp": 0.01004407, + "balance_loss_clip": 0.9993524, + "balance_loss_mlp": 1.01837301, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.020759515587473, + "language_loss": 0.75442117, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77526283, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.6171875, + "step": 998, + "time_per_iteration": 3.152331590652466 + }, + { + "auxiliary_loss_clip": 0.01213812, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.0626508, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8375420281697645, + "language_loss": 0.75796127, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7807765, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 999, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01212853, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.03575778, + "balance_loss_mlp": 1.05894446, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.9091686508511199, + "language_loss": 0.82658899, + "learning_rate": 3.990489563834943e-06, + "loss": 0.8493402, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5390625, + "step": 1000, + "time_per_iteration": 2.5369935035705566 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.06143069, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.4065508827059783, + "language_loss": 0.85644853, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8791759, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5390625, + "step": 1001, + "time_per_iteration": 2.4972190856933594 + }, + { + "auxiliary_loss_clip": 0.0120879, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.0587517, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.156321640703371, + "language_loss": 0.74386394, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5, + "step": 1002, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01211576, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.04019034, + "balance_loss_mlp": 1.06015134, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.1165374575777145, + "language_loss": 0.75346643, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77624118, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1003, + "time_per_iteration": 2.508817434310913 + }, + { + "auxiliary_loss_clip": 0.01219434, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.06255794, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2578292515807603, + "language_loss": 0.70071733, + "learning_rate": 3.990337217233437e-06, + "loss": 0.723571, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 1004, + "time_per_iteration": 2.480116844177246 + }, + { + "auxiliary_loss_clip": 0.01218526, + "auxiliary_loss_mlp": 0.01073584, + "balance_loss_clip": 1.04810917, + "balance_loss_mlp": 1.06360686, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.248554137518493, + "language_loss": 0.83246684, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85538793, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 1005, + "time_per_iteration": 2.449733018875122 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00306416, + "balance_loss_mlp": 1.0157814, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8959746990508154, + "language_loss": 0.59000289, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61085355, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.6171875, + "step": 1006, + "time_per_iteration": 3.1583423614501953 + }, + { + "auxiliary_loss_clip": 0.01209886, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.03203392, + "balance_loss_mlp": 1.05658197, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.271524805944984, + "language_loss": 0.7428897, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.53125, + "step": 1007, + "time_per_iteration": 2.49139666557312 + }, + { + "auxiliary_loss_clip": 0.01212867, + "auxiliary_loss_mlp": 0.01055047, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.05897522, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8583948299039934, + "language_loss": 0.80739897, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83007812, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 1008, + "time_per_iteration": 2.4990036487579346 + }, + { + "auxiliary_loss_clip": 0.01213893, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.06254637, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.935763632111394, + "language_loss": 0.77840835, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80110532, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.515625, + "step": 1009, + "time_per_iteration": 2.4785048961639404 + }, + { + "auxiliary_loss_clip": 0.01210213, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.06082368, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1058592784097567, + "language_loss": 0.93059653, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95329368, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4921875, + "step": 1010, + "time_per_iteration": 2.507596015930176 + }, + { + "auxiliary_loss_clip": 0.01219036, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.05885124, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1716667034247843, + "language_loss": 0.71846473, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74131954, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6015625, + "step": 1011, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01214432, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.05902421, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.5871469840663535, + "language_loss": 0.87542284, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89827204, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5546875, + "step": 1012, + "time_per_iteration": 2.4876151084899902 + }, + { + "auxiliary_loss_clip": 0.01206171, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05505085, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8956263482043672, + "language_loss": 0.76679665, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78946191, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 1013, + "time_per_iteration": 2.4874446392059326 + }, + { + "auxiliary_loss_clip": 0.01215089, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.05924904, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.654718290448769, + "language_loss": 0.85651302, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87933445, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5546875, + "step": 1014, + "time_per_iteration": 2.483774423599243 + }, + { + "auxiliary_loss_clip": 0.0122011, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_clip": 1.03996944, + "balance_loss_mlp": 1.06207335, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4287988001966028, + "language_loss": 0.72807163, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75094855, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.578125, + "step": 1015, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01207162, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.0576005, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6555956389633335, + "language_loss": 0.79197502, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8147307, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4921875, + "step": 1016, + "time_per_iteration": 2.5177054405212402 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.0571332, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.934405213560846, + "language_loss": 0.76170123, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78440881, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.53125, + "step": 1017, + "time_per_iteration": 2.517730951309204 + }, + { + "auxiliary_loss_clip": 0.01220983, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.06240773, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.873264658326973, + "language_loss": 0.86145842, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436329, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 1018, + "time_per_iteration": 5.324457883834839 + }, + { + "auxiliary_loss_clip": 0.01206709, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.045012, + "balance_loss_mlp": 1.05659163, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.696758126666256, + "language_loss": 0.77535981, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79814154, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5, + "step": 1019, + "time_per_iteration": 2.453768253326416 + }, + { + "auxiliary_loss_clip": 0.01210848, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.05749679, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.8458417378275351, + "language_loss": 0.84254557, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86526895, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 1020, + "time_per_iteration": 2.5126123428344727 + }, + { + "auxiliary_loss_clip": 0.01217116, + "auxiliary_loss_mlp": 0.01060663, + "balance_loss_clip": 1.0352596, + "balance_loss_mlp": 1.06234074, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.186416819505148, + "language_loss": 0.79234397, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81512177, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1021, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.01207219, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.05748677, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.2026341390443434, + "language_loss": 0.87493509, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89765131, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.5, + "step": 1022, + "time_per_iteration": 2.441298007965088 + }, + { + "auxiliary_loss_clip": 0.01213359, + "auxiliary_loss_mlp": 0.0106856, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.06052542, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.752710779550117, + "language_loss": 0.82776564, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85058486, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 1023, + "time_per_iteration": 2.5027952194213867 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 0.99944335, + "balance_loss_mlp": 1.01796818, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8999264202466762, + "language_loss": 0.65078986, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67162001, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.0456543, + "router_z_loss_mlp": 0.609375, + "step": 1024, + "time_per_iteration": 3.0969655513763428 + }, + { + "auxiliary_loss_clip": 0.01212272, + "auxiliary_loss_mlp": 0.01066841, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.05936897, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.9303372998519377, + "language_loss": 0.88293028, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90572149, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 1025, + "time_per_iteration": 2.5229876041412354 + }, + { + "auxiliary_loss_clip": 0.01212316, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.05916524, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.519276165786755, + "language_loss": 0.84567487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86839235, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 1026, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.01212365, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.04614556, + "balance_loss_mlp": 1.05798197, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9431802827698534, + "language_loss": 0.82320756, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84604132, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 1027, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.01209611, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.05799866, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.262386050001272, + "language_loss": 0.84232426, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86500365, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1028, + "time_per_iteration": 2.4485137462615967 + }, + { + "auxiliary_loss_clip": 0.01077664, + "auxiliary_loss_mlp": 0.01009618, + "balance_loss_clip": 1.00544536, + "balance_loss_mlp": 1.01686025, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9476883841381922, + "language_loss": 0.60497737, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6258502, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.609375, + "step": 1029, + "time_per_iteration": 2.8714137077331543 + }, + { + "auxiliary_loss_clip": 0.0120304, + "auxiliary_loss_mlp": 0.01066238, + "balance_loss_clip": 1.0419786, + "balance_loss_mlp": 1.05338669, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.297452518318954, + "language_loss": 0.82309926, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84579194, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4921875, + "step": 1030, + "time_per_iteration": 2.4705348014831543 + }, + { + "auxiliary_loss_clip": 0.01214194, + "auxiliary_loss_mlp": 0.01071397, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.06025672, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.391039807046215, + "language_loss": 0.80262065, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82547653, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1031, + "time_per_iteration": 2.447964906692505 + }, + { + "auxiliary_loss_clip": 0.0121101, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.05865717, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.6245278130098144, + "language_loss": 0.77141201, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79427713, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5234375, + "step": 1032, + "time_per_iteration": 2.475891590118408 + }, + { + "auxiliary_loss_clip": 0.01205906, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_clip": 1.04350805, + "balance_loss_mlp": 1.05307126, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.949793190746779, + "language_loss": 0.89276892, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91552204, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1033, + "time_per_iteration": 2.5332658290863037 + }, + { + "auxiliary_loss_clip": 0.01212647, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03437293, + "balance_loss_mlp": 1.05739737, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.160025730572359, + "language_loss": 0.84795135, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87066996, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5546875, + "step": 1034, + "time_per_iteration": 2.507636785507202 + }, + { + "auxiliary_loss_clip": 0.01202421, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.03399241, + "balance_loss_mlp": 1.05694687, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 3.176440156188905, + "language_loss": 0.81156218, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83418697, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.453125, + "step": 1035, + "time_per_iteration": 2.624635696411133 + }, + { + "auxiliary_loss_clip": 0.01212161, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_clip": 1.04051828, + "balance_loss_mlp": 1.06080353, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.252599829484163, + "language_loss": 0.78701359, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80981934, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.515625, + "step": 1036, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01204167, + "auxiliary_loss_mlp": 0.01068533, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.05620134, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.670767972712633, + "language_loss": 0.86802149, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8907485, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1037, + "time_per_iteration": 2.506011724472046 + }, + { + "auxiliary_loss_clip": 0.01206019, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.02990723, + "balance_loss_mlp": 1.05728471, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.1914513209480933, + "language_loss": 0.81051469, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83314991, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1038, + "time_per_iteration": 2.486758232116699 + }, + { + "auxiliary_loss_clip": 0.01205947, + "auxiliary_loss_mlp": 0.01072566, + "balance_loss_clip": 1.04587555, + "balance_loss_mlp": 1.05856836, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.3663261426095965, + "language_loss": 0.85336804, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87615323, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1039, + "time_per_iteration": 2.489241123199463 + }, + { + "auxiliary_loss_clip": 0.01207559, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.05744672, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.9774289629637263, + "language_loss": 0.80853289, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83128488, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5, + "step": 1040, + "time_per_iteration": 2.480022668838501 + }, + { + "auxiliary_loss_clip": 0.01213203, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.06227219, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.535271913081881, + "language_loss": 0.69440711, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71721661, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5078125, + "step": 1041, + "time_per_iteration": 2.5417978763580322 + }, + { + "auxiliary_loss_clip": 0.01210541, + "auxiliary_loss_mlp": 0.0106006, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.05743289, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9564735382917973, + "language_loss": 0.80983013, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83253616, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.53125, + "step": 1042, + "time_per_iteration": 2.478926181793213 + }, + { + "auxiliary_loss_clip": 0.01210242, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.05925727, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.9466384226705415, + "language_loss": 0.76463902, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78732038, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.515625, + "step": 1043, + "time_per_iteration": 2.6262781620025635 + }, + { + "auxiliary_loss_clip": 0.01203702, + "auxiliary_loss_mlp": 0.01066445, + "balance_loss_clip": 1.04174471, + "balance_loss_mlp": 1.05835676, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8860277298285366, + "language_loss": 0.92454541, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94724691, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1044, + "time_per_iteration": 2.4886953830718994 + }, + { + "auxiliary_loss_clip": 0.01204359, + "auxiliary_loss_mlp": 0.01073486, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.05475259, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.9539908597303346, + "language_loss": 0.8581354, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88091385, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5, + "step": 1045, + "time_per_iteration": 2.5382347106933594 + }, + { + "auxiliary_loss_clip": 0.01203094, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.04900479, + "balance_loss_mlp": 1.05618775, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.0798822187092094, + "language_loss": 0.77675486, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.79952335, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.46875, + "step": 1046, + "time_per_iteration": 2.548157215118408 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.01074859, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.05837655, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.197016946040243, + "language_loss": 0.77317166, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79598629, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4765625, + "step": 1047, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.0121283, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.05874014, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.3103480986625753, + "language_loss": 0.7696203, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79236162, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1048, + "time_per_iteration": 2.636072874069214 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.0583266, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.2069714466600656, + "language_loss": 0.77757037, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80039394, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1049, + "time_per_iteration": 2.5173420906066895 + }, + { + "auxiliary_loss_clip": 0.01207985, + "auxiliary_loss_mlp": 0.01065489, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.05734015, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.316298014027776, + "language_loss": 0.83165503, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85438979, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5078125, + "step": 1050, + "time_per_iteration": 2.4742541313171387 + }, + { + "auxiliary_loss_clip": 0.01204381, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.05776763, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.1475970013183563, + "language_loss": 0.76909173, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79176152, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1051, + "time_per_iteration": 2.4629740715026855 + }, + { + "auxiliary_loss_clip": 0.01207556, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.05788827, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.090947022989376, + "language_loss": 0.80053556, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82331514, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4921875, + "step": 1052, + "time_per_iteration": 2.4729230403900146 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01064567, + "balance_loss_clip": 1.03911567, + "balance_loss_mlp": 1.05839717, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.21177767113968, + "language_loss": 0.78088665, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362272, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5078125, + "step": 1053, + "time_per_iteration": 2.433969736099243 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.0578481, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8421697124920164, + "language_loss": 0.84737611, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8700186, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.515625, + "step": 1054, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.01205973, + "auxiliary_loss_mlp": 0.01065192, + "balance_loss_clip": 1.04051518, + "balance_loss_mlp": 1.05870843, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.9255333357469135, + "language_loss": 0.8566432, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87935483, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4765625, + "step": 1055, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.0121179, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_clip": 1.0451932, + "balance_loss_mlp": 1.05891657, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.390503126540762, + "language_loss": 0.80966836, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83249724, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1056, + "time_per_iteration": 2.4944088459014893 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.05503476, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.70684555522199, + "language_loss": 0.81153649, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83431304, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 1057, + "time_per_iteration": 2.5327882766723633 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03401923, + "balance_loss_mlp": 1.054492, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 2.2830641052403826, + "language_loss": 0.8369416, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85947585, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.421875, + "step": 1058, + "time_per_iteration": 2.4742424488067627 + }, + { + "auxiliary_loss_clip": 0.01208572, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.05714464, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 1.9712110015930453, + "language_loss": 0.87264961, + "learning_rate": 3.988120036328651e-06, + "loss": 0.8954125, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.515625, + "step": 1059, + "time_per_iteration": 5.514882564544678 + }, + { + "auxiliary_loss_clip": 0.01213823, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.04273927, + "balance_loss_mlp": 1.06130195, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.227642611819728, + "language_loss": 0.9117676, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9345876, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 1060, + "time_per_iteration": 3.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01204952, + "auxiliary_loss_mlp": 0.01062848, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.05582809, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9159755464944204, + "language_loss": 0.87713706, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89981508, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4921875, + "step": 1061, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01213048, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.05683804, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.167309005799961, + "language_loss": 0.771905, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79469687, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5625, + "step": 1062, + "time_per_iteration": 2.5576398372650146 + }, + { + "auxiliary_loss_clip": 0.01206834, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 1.05504322, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.0414192004570872, + "language_loss": 0.86835265, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89105946, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1063, + "time_per_iteration": 2.472473382949829 + }, + { + "auxiliary_loss_clip": 0.01206458, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.05539751, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.0150359019026185, + "language_loss": 0.8051579, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82785529, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1064, + "time_per_iteration": 2.478205919265747 + }, + { + "auxiliary_loss_clip": 0.01207278, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.04409075, + "balance_loss_mlp": 1.05682254, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.254194289767691, + "language_loss": 0.84650666, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1065, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01207067, + "auxiliary_loss_mlp": 0.01055171, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.05966115, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.66169186591579, + "language_loss": 0.68201709, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70463943, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.46875, + "step": 1066, + "time_per_iteration": 2.6294829845428467 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01076738, + "balance_loss_clip": 1.05003476, + "balance_loss_mlp": 1.05877519, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.704601442813356, + "language_loss": 0.90345579, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9262861, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1067, + "time_per_iteration": 2.459721565246582 + }, + { + "auxiliary_loss_clip": 0.01207052, + "auxiliary_loss_mlp": 0.01068129, + "balance_loss_clip": 1.04364336, + "balance_loss_mlp": 1.05625772, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.8684947664405436, + "language_loss": 0.8343029, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.515625, + "step": 1068, + "time_per_iteration": 2.4611129760742188 + }, + { + "auxiliary_loss_clip": 0.01205753, + "auxiliary_loss_mlp": 0.01064379, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.05991328, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.4683216708617053, + "language_loss": 0.89402264, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91672397, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.453125, + "step": 1069, + "time_per_iteration": 2.486241340637207 + }, + { + "auxiliary_loss_clip": 0.01206711, + "auxiliary_loss_mlp": 0.01082225, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 1.05718124, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6076700233042396, + "language_loss": 0.95764256, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98053193, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5, + "step": 1070, + "time_per_iteration": 2.413357734680176 + }, + { + "auxiliary_loss_clip": 0.01209924, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.0309608, + "balance_loss_mlp": 1.05859673, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8004745601001504, + "language_loss": 0.8819589, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90463126, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.515625, + "step": 1071, + "time_per_iteration": 2.4717295169830322 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.056633, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6498592642907823, + "language_loss": 0.75996184, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78252238, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.4765625, + "step": 1072, + "time_per_iteration": 2.486936330795288 + }, + { + "auxiliary_loss_clip": 0.01207782, + "auxiliary_loss_mlp": 0.010661, + "balance_loss_clip": 1.03951669, + "balance_loss_mlp": 1.05679154, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.95165590675185, + "language_loss": 0.80415034, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82688916, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1073, + "time_per_iteration": 2.476189613342285 + }, + { + "auxiliary_loss_clip": 0.01200054, + "auxiliary_loss_mlp": 0.01059954, + "balance_loss_clip": 1.03563547, + "balance_loss_mlp": 1.05634785, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7105520573330508, + "language_loss": 0.80205524, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82465529, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4375, + "step": 1074, + "time_per_iteration": 2.499133586883545 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.03469074, + "balance_loss_mlp": 1.05560029, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.398999995550556, + "language_loss": 0.79203326, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81468183, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1075, + "time_per_iteration": 2.46777606010437 + }, + { + "auxiliary_loss_clip": 0.01207545, + "auxiliary_loss_mlp": 0.01064646, + "balance_loss_clip": 1.04086363, + "balance_loss_mlp": 1.05960226, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.7671348430420712, + "language_loss": 0.87819242, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90091443, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.484375, + "step": 1076, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01199028, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.02918351, + "balance_loss_mlp": 1.05429745, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.1388407300528534, + "language_loss": 0.80692923, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82945681, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1077, + "time_per_iteration": 2.4290995597839355 + }, + { + "auxiliary_loss_clip": 0.01211867, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.04566646, + "balance_loss_mlp": 1.05862093, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.618517400605346, + "language_loss": 0.91640681, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93924248, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.53125, + "step": 1078, + "time_per_iteration": 2.500995635986328 + }, + { + "auxiliary_loss_clip": 0.01212712, + "auxiliary_loss_mlp": 0.01062475, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.05874825, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.106125999672554, + "language_loss": 0.78772497, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81047684, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1079, + "time_per_iteration": 2.4510841369628906 + }, + { + "auxiliary_loss_clip": 0.01204732, + "auxiliary_loss_mlp": 0.01061369, + "balance_loss_clip": 1.03619218, + "balance_loss_mlp": 1.05602205, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.051955253501364, + "language_loss": 0.69555283, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7182138, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1080, + "time_per_iteration": 2.5024302005767822 + }, + { + "auxiliary_loss_clip": 0.01204586, + "auxiliary_loss_mlp": 0.01063302, + "balance_loss_clip": 1.03649211, + "balance_loss_mlp": 1.05477285, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.85895294752556, + "language_loss": 0.72094852, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74362737, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5, + "step": 1081, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01201777, + "auxiliary_loss_mlp": 0.01064533, + "balance_loss_clip": 1.03867674, + "balance_loss_mlp": 1.0554111, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6422342029105863, + "language_loss": 0.84621316, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86887628, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.46875, + "step": 1082, + "time_per_iteration": 2.459564447402954 + }, + { + "auxiliary_loss_clip": 0.01214386, + "auxiliary_loss_mlp": 0.01067955, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.05817008, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.177850298461163, + "language_loss": 0.8303026, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85312605, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5625, + "step": 1083, + "time_per_iteration": 2.504584550857544 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.05794787, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6002614807121227, + "language_loss": 0.79689312, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81960905, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.484375, + "step": 1084, + "time_per_iteration": 2.4530820846557617 + }, + { + "auxiliary_loss_clip": 0.01204762, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.05634058, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.1191367521188074, + "language_loss": 0.66211331, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68476963, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1085, + "time_per_iteration": 2.5733256340026855 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.05400848, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.9997547556569089, + "language_loss": 0.76998973, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79266769, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1086, + "time_per_iteration": 2.4958763122558594 + }, + { + "auxiliary_loss_clip": 0.01199669, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.03763306, + "balance_loss_mlp": 1.05291176, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1546414392836977, + "language_loss": 0.85154319, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87417287, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1087, + "time_per_iteration": 2.4456934928894043 + }, + { + "auxiliary_loss_clip": 0.01204231, + "auxiliary_loss_mlp": 0.01061167, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.05594206, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7775330808837086, + "language_loss": 0.77970594, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80235994, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1088, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01204134, + "auxiliary_loss_mlp": 0.01066637, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.05602646, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.9036978890371752, + "language_loss": 0.71191919, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73462689, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.484375, + "step": 1089, + "time_per_iteration": 2.4569168090820312 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.03566289, + "balance_loss_mlp": 1.05729651, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7924808842614686, + "language_loss": 0.85504186, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.8776831, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1090, + "time_per_iteration": 2.4624812602996826 + }, + { + "auxiliary_loss_clip": 0.01204567, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.05594897, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.2382380061135945, + "language_loss": 0.72027361, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74294031, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.484375, + "step": 1091, + "time_per_iteration": 2.4911999702453613 + }, + { + "auxiliary_loss_clip": 0.01201014, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05507159, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.7948943762047525, + "language_loss": 0.82525271, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8478815, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4609375, + "step": 1092, + "time_per_iteration": 2.510835886001587 + }, + { + "auxiliary_loss_clip": 0.01205888, + "auxiliary_loss_mlp": 0.01064535, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.05484402, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 4.994634192306823, + "language_loss": 0.71286589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73557013, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.515625, + "step": 1093, + "time_per_iteration": 2.528994560241699 + }, + { + "auxiliary_loss_clip": 0.01204526, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.05701041, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.8259988866114194, + "language_loss": 0.87971264, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90238965, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1094, + "time_per_iteration": 2.50201678276062 + }, + { + "auxiliary_loss_clip": 0.01205803, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_clip": 1.0350548, + "balance_loss_mlp": 1.0575459, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6349502946236962, + "language_loss": 0.81364405, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83632231, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.484375, + "step": 1095, + "time_per_iteration": 2.4947729110717773 + }, + { + "auxiliary_loss_clip": 0.01200923, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03414297, + "balance_loss_mlp": 1.05544913, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4379029944224215, + "language_loss": 0.69712919, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7197119, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.453125, + "step": 1096, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01206873, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.04451883, + "balance_loss_mlp": 1.0592947, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7715259730160258, + "language_loss": 0.77498722, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79775411, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1097, + "time_per_iteration": 2.4872820377349854 + }, + { + "auxiliary_loss_clip": 0.0120653, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03814423, + "balance_loss_mlp": 1.05785179, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.7376479388989727, + "language_loss": 0.77846545, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80116618, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.484375, + "step": 1098, + "time_per_iteration": 2.583075761795044 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.04483891, + "balance_loss_mlp": 1.05739522, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9398633669636132, + "language_loss": 0.81675154, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83951151, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1099, + "time_per_iteration": 2.446969985961914 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.02801692, + "balance_loss_mlp": 1.0519135, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0018625732470245, + "language_loss": 0.82619941, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84868616, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4375, + "step": 1100, + "time_per_iteration": 2.4545743465423584 + }, + { + "auxiliary_loss_clip": 0.01204382, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.05827951, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.362466383483127, + "language_loss": 0.73439336, + "learning_rate": 3.986273334538702e-06, + "loss": 0.7570439, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4609375, + "step": 1101, + "time_per_iteration": 6.740786790847778 + }, + { + "auxiliary_loss_clip": 0.0119874, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_clip": 1.03829539, + "balance_loss_mlp": 1.05373132, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.46656505058328, + "language_loss": 0.86047602, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88308758, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1102, + "time_per_iteration": 2.4480903148651123 + }, + { + "auxiliary_loss_clip": 0.01200394, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.05588222, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0494810685505995, + "language_loss": 0.81707513, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83965349, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1103, + "time_per_iteration": 2.4419338703155518 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.05891824, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.7865556655629211, + "language_loss": 0.82059169, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84326148, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.4453125, + "step": 1104, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01195268, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.05232382, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6089454783719872, + "language_loss": 0.80542791, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82785821, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1105, + "time_per_iteration": 2.524385929107666 + }, + { + "auxiliary_loss_clip": 0.01197193, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.05697632, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8452117827451007, + "language_loss": 0.96738935, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98996383, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.40625, + "step": 1106, + "time_per_iteration": 2.455122470855713 + }, + { + "auxiliary_loss_clip": 0.01204143, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.03436136, + "balance_loss_mlp": 1.05509543, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9568581550144768, + "language_loss": 0.82766026, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85030258, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4921875, + "step": 1107, + "time_per_iteration": 2.4554357528686523 + }, + { + "auxiliary_loss_clip": 0.01077187, + "auxiliary_loss_mlp": 0.01010186, + "balance_loss_clip": 1.0061568, + "balance_loss_mlp": 1.01696265, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8235952583150978, + "language_loss": 0.56729984, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58817357, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.6015625, + "step": 1108, + "time_per_iteration": 3.0248770713806152 + }, + { + "auxiliary_loss_clip": 0.01200435, + "auxiliary_loss_mlp": 0.01065514, + "balance_loss_clip": 1.04034865, + "balance_loss_mlp": 1.05397463, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.4203653272420693, + "language_loss": 0.72493321, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74759269, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1109, + "time_per_iteration": 2.4559717178344727 + }, + { + "auxiliary_loss_clip": 0.01197389, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.05389571, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.084593088047962, + "language_loss": 0.78256035, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1110, + "time_per_iteration": 2.4989912509918213 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.05598152, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.197430378352105, + "language_loss": 0.71290207, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73549128, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1111, + "time_per_iteration": 2.5445287227630615 + }, + { + "auxiliary_loss_clip": 0.0120524, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.03833365, + "balance_loss_mlp": 1.05788755, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8078370838130353, + "language_loss": 0.78315711, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80583429, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4765625, + "step": 1112, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01203172, + "auxiliary_loss_mlp": 0.01058254, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.05794001, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.0430507180103943, + "language_loss": 0.78819263, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1113, + "time_per_iteration": 2.4637296199798584 + }, + { + "auxiliary_loss_clip": 0.01195153, + "auxiliary_loss_mlp": 0.01053406, + "balance_loss_clip": 1.03056598, + "balance_loss_mlp": 1.05255365, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.035611213247421, + "language_loss": 0.82393003, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84641558, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.4296875, + "step": 1114, + "time_per_iteration": 2.434006452560425 + }, + { + "auxiliary_loss_clip": 0.01076, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00463712, + "balance_loss_mlp": 1.0165143, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8339607525511222, + "language_loss": 0.58126414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60211033, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.59375, + "step": 1115, + "time_per_iteration": 3.020782709121704 + }, + { + "auxiliary_loss_clip": 0.01200335, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.03427422, + "balance_loss_mlp": 1.05479646, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8263674595854464, + "language_loss": 0.91123891, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93383968, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1116, + "time_per_iteration": 2.446439504623413 + }, + { + "auxiliary_loss_clip": 0.01209259, + "auxiliary_loss_mlp": 0.01067721, + "balance_loss_clip": 1.04323506, + "balance_loss_mlp": 1.06065357, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.019283248682947, + "language_loss": 0.8709814, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89375114, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.484375, + "step": 1117, + "time_per_iteration": 2.486212968826294 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.00250196, + "balance_loss_mlp": 1.01550937, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9454776991467404, + "language_loss": 0.59798217, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6187892, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.5859375, + "step": 1118, + "time_per_iteration": 3.0197594165802 + }, + { + "auxiliary_loss_clip": 0.01201284, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.05418777, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.7568577616727468, + "language_loss": 0.83498162, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85755914, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4765625, + "step": 1119, + "time_per_iteration": 2.4535257816314697 + }, + { + "auxiliary_loss_clip": 0.01199216, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.0562222, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.8165724331790314, + "language_loss": 0.8480413, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87062794, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.4296875, + "step": 1120, + "time_per_iteration": 2.533182382583618 + }, + { + "auxiliary_loss_clip": 0.01208718, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.04269981, + "balance_loss_mlp": 1.0602659, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.032922437281707, + "language_loss": 0.78959441, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81235266, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.484375, + "step": 1121, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_clip": 1.00033593, + "balance_loss_mlp": 1.0132587, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7615352754050735, + "language_loss": 0.58346939, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60423702, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.5859375, + "step": 1122, + "time_per_iteration": 3.2087855339050293 + }, + { + "auxiliary_loss_clip": 0.0120309, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.0584271, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0430211727412098, + "language_loss": 0.71546745, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73815745, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4453125, + "step": 1123, + "time_per_iteration": 2.5017640590667725 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01057362, + "balance_loss_clip": 1.03216124, + "balance_loss_mlp": 1.05484593, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8376842720828679, + "language_loss": 0.79288971, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81548035, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1124, + "time_per_iteration": 2.4980688095092773 + }, + { + "auxiliary_loss_clip": 0.01196564, + "auxiliary_loss_mlp": 0.01054377, + "balance_loss_clip": 1.03204954, + "balance_loss_mlp": 1.05469489, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.0983993205372253, + "language_loss": 0.71198726, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73449671, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.421875, + "step": 1125, + "time_per_iteration": 2.4704325199127197 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.04247451, + "balance_loss_mlp": 1.05620742, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.9171204901367243, + "language_loss": 0.80814254, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83081663, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.421875, + "step": 1126, + "time_per_iteration": 2.5046803951263428 + }, + { + "auxiliary_loss_clip": 0.01070877, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 0.9986586, + "balance_loss_mlp": 1.01286924, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7804116507992601, + "language_loss": 0.59733766, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61807376, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.578125, + "step": 1127, + "time_per_iteration": 3.0877249240875244 + }, + { + "auxiliary_loss_clip": 0.01199514, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 1.05723238, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.13286114653412, + "language_loss": 0.81392133, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83648497, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.421875, + "step": 1128, + "time_per_iteration": 2.5406885147094727 + }, + { + "auxiliary_loss_clip": 0.01208088, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.02692807, + "balance_loss_mlp": 1.0598706, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 3.047918834731733, + "language_loss": 0.76034033, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78294069, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.484375, + "step": 1129, + "time_per_iteration": 2.486829996109009 + }, + { + "auxiliary_loss_clip": 0.01201584, + "auxiliary_loss_mlp": 0.01061333, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.05536139, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8621491947103987, + "language_loss": 0.72340226, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74603146, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4609375, + "step": 1130, + "time_per_iteration": 2.6195991039276123 + }, + { + "auxiliary_loss_clip": 0.01197626, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.05584192, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.3479224842049917, + "language_loss": 0.80624223, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82885444, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.421875, + "step": 1131, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.05550814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1673533627141652, + "language_loss": 0.8104949, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83313811, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.40625, + "step": 1132, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01199688, + "auxiliary_loss_mlp": 0.01069367, + "balance_loss_clip": 1.04525137, + "balance_loss_mlp": 1.05629563, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.450417149602266, + "language_loss": 0.63629937, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65898991, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4296875, + "step": 1133, + "time_per_iteration": 2.7164230346679688 + }, + { + "auxiliary_loss_clip": 0.01203203, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.05427325, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.5027083277203963, + "language_loss": 0.74811196, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77073789, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1134, + "time_per_iteration": 2.420506000518799 + }, + { + "auxiliary_loss_clip": 0.01201452, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.05952573, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0759609389962037, + "language_loss": 0.87245119, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89510942, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.421875, + "step": 1135, + "time_per_iteration": 2.464738607406616 + }, + { + "auxiliary_loss_clip": 0.01200809, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.05388534, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.383261313924855, + "language_loss": 0.78335494, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80591798, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.46875, + "step": 1136, + "time_per_iteration": 2.4486002922058105 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.06089664, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 3.2008110915617207, + "language_loss": 0.83941948, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86222148, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.453125, + "step": 1137, + "time_per_iteration": 2.5714635848999023 + }, + { + "auxiliary_loss_clip": 0.01199575, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.05628538, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.067587662099544, + "language_loss": 0.78669268, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80930662, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1138, + "time_per_iteration": 2.459437370300293 + }, + { + "auxiliary_loss_clip": 0.01202271, + "auxiliary_loss_mlp": 0.01059469, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.05729747, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.606905885529735, + "language_loss": 0.85683703, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87945449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1139, + "time_per_iteration": 2.5198936462402344 + }, + { + "auxiliary_loss_clip": 0.01201061, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.03297663, + "balance_loss_mlp": 1.05803108, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7528507300348692, + "language_loss": 0.74826896, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77085567, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4296875, + "step": 1140, + "time_per_iteration": 2.6609106063842773 + }, + { + "auxiliary_loss_clip": 0.01198151, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.03698146, + "balance_loss_mlp": 1.05620885, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.210262717529583, + "language_loss": 0.68083167, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70343632, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.421875, + "step": 1141, + "time_per_iteration": 2.5661122798919678 + }, + { + "auxiliary_loss_clip": 0.01205913, + "auxiliary_loss_mlp": 0.0106664, + "balance_loss_clip": 1.04098654, + "balance_loss_mlp": 1.05848837, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.82433360121009, + "language_loss": 0.79399014, + "learning_rate": 3.984342445114538e-06, + "loss": 0.8167156, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1142, + "time_per_iteration": 2.5499107837677 + }, + { + "auxiliary_loss_clip": 0.0120232, + "auxiliary_loss_mlp": 0.01061074, + "balance_loss_clip": 1.03650475, + "balance_loss_mlp": 1.05730164, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6821535193321122, + "language_loss": 0.68701231, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70964622, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1143, + "time_per_iteration": 5.380373239517212 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03670955, + "balance_loss_mlp": 1.05885804, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.8434796401844256, + "language_loss": 0.74694496, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76950091, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.375, + "step": 1144, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.01204332, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.05654943, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.296493270147659, + "language_loss": 0.91720247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93988806, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4765625, + "step": 1145, + "time_per_iteration": 2.44307017326355 + }, + { + "auxiliary_loss_clip": 0.01206887, + "auxiliary_loss_mlp": 0.01067692, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.05779576, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4650333910918865, + "language_loss": 0.82189268, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84463847, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.4921875, + "step": 1146, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.01198651, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.05755806, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5935722439127744, + "language_loss": 0.85150343, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87410891, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.4140625, + "step": 1147, + "time_per_iteration": 2.48410701751709 + }, + { + "auxiliary_loss_clip": 0.01201275, + "auxiliary_loss_mlp": 0.01064743, + "balance_loss_clip": 1.03988767, + "balance_loss_mlp": 1.05699074, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3932988353276645, + "language_loss": 0.86235052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88501072, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1148, + "time_per_iteration": 2.455441951751709 + }, + { + "auxiliary_loss_clip": 0.01199305, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.0337863, + "balance_loss_mlp": 1.05560231, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.070658514783469, + "language_loss": 0.69185412, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71442747, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4375, + "step": 1149, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.0289495, + "balance_loss_mlp": 1.05679548, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.828663566846353, + "language_loss": 0.84069788, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86328113, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4609375, + "step": 1150, + "time_per_iteration": 2.509122371673584 + }, + { + "auxiliary_loss_clip": 0.01206199, + "auxiliary_loss_mlp": 0.01058671, + "balance_loss_clip": 1.03453135, + "balance_loss_mlp": 1.06116164, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.57752822218259, + "language_loss": 0.82044697, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84309566, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1151, + "time_per_iteration": 2.420128345489502 + }, + { + "auxiliary_loss_clip": 0.01201904, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.06011868, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8279979065740934, + "language_loss": 0.85587418, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87851566, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4140625, + "step": 1152, + "time_per_iteration": 2.498180866241455 + }, + { + "auxiliary_loss_clip": 0.01198565, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03747797, + "balance_loss_mlp": 1.05767703, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1251557516582995, + "language_loss": 0.90536988, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92796487, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1153, + "time_per_iteration": 2.422480821609497 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.03035152, + "balance_loss_mlp": 1.05790865, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.190017778582164, + "language_loss": 0.81363368, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83618748, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4296875, + "step": 1154, + "time_per_iteration": 2.528118848800659 + }, + { + "auxiliary_loss_clip": 0.01202754, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_clip": 1.04476249, + "balance_loss_mlp": 1.06078768, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 32.79102955334026, + "language_loss": 0.7560131, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77872109, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.421875, + "step": 1155, + "time_per_iteration": 2.5010287761688232 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01059268, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.05511975, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.6800097473238784, + "language_loss": 0.71119213, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73374593, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1156, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.05711889, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.0301788984863918, + "language_loss": 0.75299567, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77569139, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1157, + "time_per_iteration": 2.4654574394226074 + }, + { + "auxiliary_loss_clip": 0.0119867, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.03240204, + "balance_loss_mlp": 1.0551796, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6687264459000366, + "language_loss": 0.71895158, + "learning_rate": 3.983554608032982e-06, + "loss": 0.7415098, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4375, + "step": 1158, + "time_per_iteration": 2.53495454788208 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.03764284, + "balance_loss_mlp": 1.05718327, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9777890540291267, + "language_loss": 0.79796576, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82061857, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1159, + "time_per_iteration": 2.511402130126953 + }, + { + "auxiliary_loss_clip": 0.01205534, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03938961, + "balance_loss_mlp": 1.05860782, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 5.094070474761981, + "language_loss": 0.810929, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83364576, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1160, + "time_per_iteration": 2.4580883979797363 + }, + { + "auxiliary_loss_clip": 0.01197544, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.05382752, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.8746427931419856, + "language_loss": 0.75958532, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78215194, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1161, + "time_per_iteration": 2.5046370029449463 + }, + { + "auxiliary_loss_clip": 0.01195466, + "auxiliary_loss_mlp": 0.01062077, + "balance_loss_clip": 1.03642368, + "balance_loss_mlp": 1.05299318, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.806880077375887, + "language_loss": 0.8285073, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85108274, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1162, + "time_per_iteration": 2.4779040813446045 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01055987, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.05355024, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.8779282806609423, + "language_loss": 0.79095101, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81345057, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1163, + "time_per_iteration": 2.515899181365967 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.03728819, + "balance_loss_mlp": 1.05438375, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1142628107327233, + "language_loss": 0.79552305, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81814498, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4375, + "step": 1164, + "time_per_iteration": 2.476428747177124 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.0401659, + "balance_loss_mlp": 1.05587661, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4863162511761774, + "language_loss": 0.73198837, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75463963, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4375, + "step": 1165, + "time_per_iteration": 2.5053012371063232 + }, + { + "auxiliary_loss_clip": 0.01196916, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.03225732, + "balance_loss_mlp": 1.05550849, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.690867173089168, + "language_loss": 0.81019437, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83273077, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4140625, + "step": 1166, + "time_per_iteration": 2.5378963947296143 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0534389, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.886682439277329, + "language_loss": 0.84443307, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86687052, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1167, + "time_per_iteration": 2.5244622230529785 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.05693448, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.687845484368313, + "language_loss": 0.89423364, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9168179, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1168, + "time_per_iteration": 2.49411678314209 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.05737031, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.629371766417224, + "language_loss": 0.88661098, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9093399, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.46875, + "step": 1169, + "time_per_iteration": 2.4795143604278564 + }, + { + "auxiliary_loss_clip": 0.01203226, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.05864179, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0154006947860705, + "language_loss": 0.84000075, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86272925, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4375, + "step": 1170, + "time_per_iteration": 2.501016616821289 + }, + { + "auxiliary_loss_clip": 0.01199625, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.05753505, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.945268169582358, + "language_loss": 0.75220597, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77485222, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.421875, + "step": 1171, + "time_per_iteration": 2.4456748962402344 + }, + { + "auxiliary_loss_clip": 0.01199689, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.05765915, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.2481396571627923, + "language_loss": 0.88848841, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91106689, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1172, + "time_per_iteration": 2.4970321655273438 + }, + { + "auxiliary_loss_clip": 0.01202846, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_clip": 1.02776241, + "balance_loss_mlp": 1.05584753, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6229718682058278, + "language_loss": 0.8212136, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84377271, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1173, + "time_per_iteration": 2.485822916030884 + }, + { + "auxiliary_loss_clip": 0.01200818, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.05786848, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.056745883983527, + "language_loss": 0.81825697, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.840877, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1174, + "time_per_iteration": 2.4564759731292725 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.0569849, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.925446476900023, + "language_loss": 0.8511939, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87379438, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.421875, + "step": 1175, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.0120243, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.05922508, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.9716433558257507, + "language_loss": 0.8303746, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4375, + "step": 1176, + "time_per_iteration": 2.511456251144409 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.05717707, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.3318965992312, + "language_loss": 0.74563694, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76822478, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.40625, + "step": 1177, + "time_per_iteration": 2.460102081298828 + }, + { + "auxiliary_loss_clip": 0.01207406, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.033476, + "balance_loss_mlp": 1.06167924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.2206541819979995, + "language_loss": 0.86031914, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88298053, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4609375, + "step": 1178, + "time_per_iteration": 2.4605627059936523 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_clip": 1.00349271, + "balance_loss_mlp": 1.02766943, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8386980392448491, + "language_loss": 0.63242435, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65337497, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.59375, + "step": 1179, + "time_per_iteration": 3.156688690185547 + }, + { + "auxiliary_loss_clip": 0.01207076, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.06038809, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.3853497849810945, + "language_loss": 0.83326972, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85596782, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.46875, + "step": 1180, + "time_per_iteration": 2.4823896884918213 + }, + { + "auxiliary_loss_clip": 0.01200915, + "auxiliary_loss_mlp": 0.01065839, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.05910683, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1921067510196446, + "language_loss": 0.88595563, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90862316, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.421875, + "step": 1181, + "time_per_iteration": 2.505908727645874 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.03607869, + "balance_loss_mlp": 1.05944347, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2303634282095257, + "language_loss": 0.83314365, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85575759, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4140625, + "step": 1182, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01199287, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_clip": 1.04006529, + "balance_loss_mlp": 1.06100821, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.671395976555463, + "language_loss": 0.7925818, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81523037, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3828125, + "step": 1183, + "time_per_iteration": 2.5057172775268555 + }, + { + "auxiliary_loss_clip": 0.01201972, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.05550563, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6492838430830963, + "language_loss": 0.78910172, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8117131, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.46875, + "step": 1184, + "time_per_iteration": 5.494150638580322 + }, + { + "auxiliary_loss_clip": 0.01196982, + "auxiliary_loss_mlp": 0.01057128, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.05884266, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.546293211356889, + "language_loss": 0.7696892, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79223031, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.375, + "step": 1185, + "time_per_iteration": 3.8873486518859863 + }, + { + "auxiliary_loss_clip": 0.01200052, + "auxiliary_loss_mlp": 0.01065088, + "balance_loss_clip": 1.0408771, + "balance_loss_mlp": 1.05808377, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.519913974657541, + "language_loss": 0.65896261, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68161404, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1186, + "time_per_iteration": 2.44986891746521 + }, + { + "auxiliary_loss_clip": 0.01198722, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.03234124, + "balance_loss_mlp": 1.05906928, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.0047668871213205, + "language_loss": 0.69673246, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71928233, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3984375, + "step": 1187, + "time_per_iteration": 2.517432451248169 + }, + { + "auxiliary_loss_clip": 0.01194056, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.05690861, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.6848541171122307, + "language_loss": 0.78598166, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80852079, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.375, + "step": 1188, + "time_per_iteration": 2.4682350158691406 + }, + { + "auxiliary_loss_clip": 0.01197809, + "auxiliary_loss_mlp": 0.01056931, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.0588758, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0343008635273834, + "language_loss": 0.84854662, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87109399, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.390625, + "step": 1189, + "time_per_iteration": 2.451464891433716 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.05589187, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.7193907035784557, + "language_loss": 0.77021295, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79277021, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.375, + "step": 1190, + "time_per_iteration": 2.5028254985809326 + }, + { + "auxiliary_loss_clip": 0.01200514, + "auxiliary_loss_mlp": 0.01065982, + "balance_loss_clip": 1.04018509, + "balance_loss_mlp": 1.0585537, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3385605637591302, + "language_loss": 0.75145626, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77412122, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1191, + "time_per_iteration": 2.461045980453491 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.03147578, + "balance_loss_mlp": 1.05730891, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.3332115059632583, + "language_loss": 0.7360636, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75860614, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1192, + "time_per_iteration": 2.4944753646850586 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 1.05358601, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.1652973689026176, + "language_loss": 0.7830255, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80548704, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1193, + "time_per_iteration": 2.487025737762451 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01053593, + "balance_loss_clip": 1.02786815, + "balance_loss_mlp": 1.06034899, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9678931818636167, + "language_loss": 0.85748619, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88004816, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1194, + "time_per_iteration": 2.493823766708374 + }, + { + "auxiliary_loss_clip": 0.01197363, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.05782473, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9701258602591958, + "language_loss": 0.81425989, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83685976, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3984375, + "step": 1195, + "time_per_iteration": 2.5168802738189697 + }, + { + "auxiliary_loss_clip": 0.01195742, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.02979064, + "balance_loss_mlp": 1.05720496, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.9269272748189905, + "language_loss": 0.79917538, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82164884, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3828125, + "step": 1196, + "time_per_iteration": 2.4749536514282227 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01069477, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.05655897, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 8.862292558474625, + "language_loss": 0.71015084, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73278111, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3671875, + "step": 1197, + "time_per_iteration": 2.520514726638794 + }, + { + "auxiliary_loss_clip": 0.01192449, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.02845871, + "balance_loss_mlp": 1.05429292, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0584524946763767, + "language_loss": 0.86034989, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88279593, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3828125, + "step": 1198, + "time_per_iteration": 2.441458225250244 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.05664325, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.7240513490380307, + "language_loss": 0.83822477, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8607856, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3828125, + "step": 1199, + "time_per_iteration": 2.462790012359619 + }, + { + "auxiliary_loss_clip": 0.01201627, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.06159616, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.0725431151836453, + "language_loss": 0.76464498, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78722042, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3984375, + "step": 1200, + "time_per_iteration": 2.5007636547088623 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01061794, + "balance_loss_clip": 1.0376662, + "balance_loss_mlp": 1.05783701, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.959995672067427, + "language_loss": 0.82965535, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85223711, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.390625, + "step": 1201, + "time_per_iteration": 2.4968512058258057 + }, + { + "auxiliary_loss_clip": 0.01198607, + "auxiliary_loss_mlp": 0.01059493, + "balance_loss_clip": 1.03372014, + "balance_loss_mlp": 1.05568862, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.411287508312223, + "language_loss": 0.69041032, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71299136, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1202, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01196785, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03804839, + "balance_loss_mlp": 1.05721354, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9411904343348254, + "language_loss": 0.87723774, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89984161, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3984375, + "step": 1203, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.0546416, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.819852916387131, + "language_loss": 0.7844671, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4375, + "step": 1204, + "time_per_iteration": 2.449265480041504 + }, + { + "auxiliary_loss_clip": 0.01194984, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.05605316, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8514893306986777, + "language_loss": 0.81960398, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.390625, + "step": 1205, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01067422, + "balance_loss_clip": 1.04250705, + "balance_loss_mlp": 1.05852747, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.0830735488163254, + "language_loss": 0.76702261, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78969669, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4140625, + "step": 1206, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.01193529, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.03071594, + "balance_loss_mlp": 1.05481935, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8430962541821914, + "language_loss": 0.77246201, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79495007, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3828125, + "step": 1207, + "time_per_iteration": 2.4895267486572266 + }, + { + "auxiliary_loss_clip": 0.01194673, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.02816105, + "balance_loss_mlp": 1.05703962, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 5.768853045708734, + "language_loss": 0.79723513, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81967664, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1208, + "time_per_iteration": 2.509073495864868 + }, + { + "auxiliary_loss_clip": 0.0119292, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03204679, + "balance_loss_mlp": 1.05551386, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.6873449148768063, + "language_loss": 0.78595626, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80843151, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.375, + "step": 1209, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.03409529, + "balance_loss_mlp": 1.05510461, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.6193169355932104, + "language_loss": 0.81117678, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83368045, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.390625, + "step": 1210, + "time_per_iteration": 2.4985666275024414 + }, + { + "auxiliary_loss_clip": 0.01192388, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_clip": 1.03688109, + "balance_loss_mlp": 1.0565064, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.2378435782703834, + "language_loss": 0.84350932, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.359375, + "step": 1211, + "time_per_iteration": 2.4971728324890137 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01051238, + "balance_loss_clip": 1.02931547, + "balance_loss_mlp": 1.05233216, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.2910402501943516, + "language_loss": 0.90813953, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.375, + "step": 1212, + "time_per_iteration": 2.424874782562256 + }, + { + "auxiliary_loss_clip": 0.01191621, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0286777, + "balance_loss_mlp": 1.05457211, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.346480404505952, + "language_loss": 0.7238096, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74623883, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1213, + "time_per_iteration": 2.443542003631592 + }, + { + "auxiliary_loss_clip": 0.0119423, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.05338192, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9141465843449694, + "language_loss": 0.84441102, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86686933, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1214, + "time_per_iteration": 2.500112295150757 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.03468192, + "balance_loss_mlp": 1.05678558, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.82775499028919, + "language_loss": 0.83929181, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86184609, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.40625, + "step": 1215, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01194493, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.05474758, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.8082751516232567, + "language_loss": 0.80984753, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83240259, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1216, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01196444, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.02863717, + "balance_loss_mlp": 1.05746269, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.8100743600713276, + "language_loss": 0.76112509, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78359497, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1217, + "time_per_iteration": 2.513061046600342 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0347029, + "balance_loss_mlp": 1.05546904, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0751842608938142, + "language_loss": 0.86442709, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.375, + "step": 1218, + "time_per_iteration": 2.4514572620391846 + }, + { + "auxiliary_loss_clip": 0.01193593, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.05405331, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.9036635750322874, + "language_loss": 0.86757988, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.8901403, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.3984375, + "step": 1219, + "time_per_iteration": 2.4501893520355225 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01058106, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.05260015, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.320539289810395, + "language_loss": 0.84721315, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86969984, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.375, + "step": 1220, + "time_per_iteration": 2.4651544094085693 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01062531, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.05455709, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.803787378453645, + "language_loss": 0.76840538, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.390625, + "step": 1221, + "time_per_iteration": 2.4643850326538086 + }, + { + "auxiliary_loss_clip": 0.01195957, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.0369482, + "balance_loss_mlp": 1.05698907, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 4.111967976062365, + "language_loss": 0.92201889, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94457251, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.390625, + "step": 1222, + "time_per_iteration": 2.461393117904663 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.03440046, + "balance_loss_mlp": 1.05795276, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.739326433562924, + "language_loss": 0.91106719, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9336018, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1223, + "time_per_iteration": 2.4616212844848633 + }, + { + "auxiliary_loss_clip": 0.01194512, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.05628467, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.5538951271380395, + "language_loss": 0.81946027, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84211743, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3828125, + "step": 1224, + "time_per_iteration": 2.555060386657715 + }, + { + "auxiliary_loss_clip": 0.01191919, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.05385065, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.209826315991058, + "language_loss": 0.83313572, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8555935, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.375, + "step": 1225, + "time_per_iteration": 2.5317656993865967 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.0300144, + "balance_loss_mlp": 1.05566537, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.0864455990649144, + "language_loss": 0.9037565, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92621917, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3828125, + "step": 1226, + "time_per_iteration": 5.374137878417969 + }, + { + "auxiliary_loss_clip": 0.01201048, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.05401981, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8833434676543, + "language_loss": 0.76944947, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1227, + "time_per_iteration": 2.4528942108154297 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02720916, + "balance_loss_mlp": 1.05810142, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6041059240123434, + "language_loss": 0.85634637, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87876499, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.34375, + "step": 1228, + "time_per_iteration": 2.5452229976654053 + }, + { + "auxiliary_loss_clip": 0.01194537, + "auxiliary_loss_mlp": 0.01061009, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.05448794, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 4.251776538682485, + "language_loss": 0.79688829, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81944382, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3984375, + "step": 1229, + "time_per_iteration": 2.501086711883545 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.05632436, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.028375336194412, + "language_loss": 0.78218549, + "learning_rate": 3.979771170004287e-06, + "loss": 0.8047595, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3984375, + "step": 1230, + "time_per_iteration": 2.4474098682403564 + }, + { + "auxiliary_loss_clip": 0.01193092, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.02554393, + "balance_loss_mlp": 1.05599403, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.924374124094053, + "language_loss": 0.81301343, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83543187, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1231, + "time_per_iteration": 2.4861042499542236 + }, + { + "auxiliary_loss_clip": 0.01198041, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.04277539, + "balance_loss_mlp": 1.05443811, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.4882746298902343, + "language_loss": 0.95111585, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97376096, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1232, + "time_per_iteration": 2.5074143409729004 + }, + { + "auxiliary_loss_clip": 0.01194092, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.03938031, + "balance_loss_mlp": 1.05667329, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.246534337547551, + "language_loss": 0.80640733, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82895458, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1233, + "time_per_iteration": 2.490816831588745 + }, + { + "auxiliary_loss_clip": 0.01198611, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.05483365, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.357402762223285, + "language_loss": 0.70458734, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72717696, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1234, + "time_per_iteration": 2.605139970779419 + }, + { + "auxiliary_loss_clip": 0.01195848, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.03665543, + "balance_loss_mlp": 1.05792761, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1034220776692765, + "language_loss": 0.77058101, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79313564, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3828125, + "step": 1235, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01189622, + "auxiliary_loss_mlp": 0.01053872, + "balance_loss_clip": 1.03123438, + "balance_loss_mlp": 1.05414248, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 5.584514149172867, + "language_loss": 0.82648033, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84891528, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1236, + "time_per_iteration": 2.462069511413574 + }, + { + "auxiliary_loss_clip": 0.0119681, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03385794, + "balance_loss_mlp": 1.05572712, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.2536643652174724, + "language_loss": 0.75702679, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77956861, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1237, + "time_per_iteration": 2.5572054386138916 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.03817141, + "balance_loss_mlp": 1.05427146, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.878495773650564, + "language_loss": 0.7740556, + "learning_rate": 3.979326750654053e-06, + "loss": 0.7965883, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.3828125, + "step": 1238, + "time_per_iteration": 2.5915493965148926 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.01055743, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.05435395, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.0695087378138455, + "language_loss": 0.86322856, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88576937, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.4375, + "step": 1239, + "time_per_iteration": 2.4961507320404053 + }, + { + "auxiliary_loss_clip": 0.01194884, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02401412, + "balance_loss_mlp": 1.05433989, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.179426429753772, + "language_loss": 0.89070082, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91314042, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.40625, + "step": 1240, + "time_per_iteration": 2.456801176071167 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.03325772, + "balance_loss_mlp": 1.05600643, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.2624482063672513, + "language_loss": 0.88586551, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90842468, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4140625, + "step": 1241, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.018507, + "balance_loss_mlp": 1.02113318, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9233978594431768, + "language_loss": 0.63032585, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65135366, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.59375, + "step": 1242, + "time_per_iteration": 3.1321358680725098 + }, + { + "auxiliary_loss_clip": 0.012088, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.05792046, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.8956100556858004, + "language_loss": 0.62917286, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65185821, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5078125, + "step": 1243, + "time_per_iteration": 2.5571463108062744 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01052045, + "balance_loss_clip": 1.0280956, + "balance_loss_mlp": 1.05710852, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.504235331520048, + "language_loss": 0.76465732, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78713971, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1244, + "time_per_iteration": 2.501621723175049 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03462183, + "balance_loss_mlp": 1.05684423, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 2.8968513367461495, + "language_loss": 0.69149882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.714064, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1245, + "time_per_iteration": 2.417921781539917 + }, + { + "auxiliary_loss_clip": 0.01196347, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.02768707, + "balance_loss_mlp": 1.05663347, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9272496045423029, + "language_loss": 0.88344061, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90592474, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1246, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01205457, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.05656838, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.4755370190447064, + "language_loss": 0.87921643, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90194321, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4921875, + "step": 1247, + "time_per_iteration": 2.4602389335632324 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.03502667, + "balance_loss_mlp": 1.05565107, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2039165223770194, + "language_loss": 0.6477375, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67027843, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3828125, + "step": 1248, + "time_per_iteration": 2.4408388137817383 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.0575254, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.0641418493429713, + "language_loss": 0.73964334, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76219749, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1249, + "time_per_iteration": 2.443767547607422 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01068388, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.05842972, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.607815988938315, + "language_loss": 0.81845009, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84114683, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4296875, + "step": 1250, + "time_per_iteration": 2.491236448287964 + }, + { + "auxiliary_loss_clip": 0.01197565, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.05932856, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.308634463940828, + "language_loss": 0.66713893, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68972456, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1251, + "time_per_iteration": 2.5437874794006348 + }, + { + "auxiliary_loss_clip": 0.0107681, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 0.99946529, + "balance_loss_mlp": 1.02021933, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8978558428983584, + "language_loss": 0.70356798, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72436458, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.56640625, + "step": 1252, + "time_per_iteration": 3.1170923709869385 + }, + { + "auxiliary_loss_clip": 0.01194007, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.03698599, + "balance_loss_mlp": 1.05419612, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9290655276351045, + "language_loss": 0.79516673, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81771958, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3984375, + "step": 1253, + "time_per_iteration": 2.4821414947509766 + }, + { + "auxiliary_loss_clip": 0.01199953, + "auxiliary_loss_mlp": 0.01065033, + "balance_loss_clip": 1.04125071, + "balance_loss_mlp": 1.05829906, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.5751371148477995, + "language_loss": 0.93441045, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95706034, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.421875, + "step": 1254, + "time_per_iteration": 2.4245519638061523 + }, + { + "auxiliary_loss_clip": 0.01191058, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.05566263, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.866823394820361, + "language_loss": 0.88030314, + "learning_rate": 3.97836641143877e-06, + "loss": 0.902834, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1255, + "time_per_iteration": 2.5579185485839844 + }, + { + "auxiliary_loss_clip": 0.01192242, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.04009795, + "balance_loss_mlp": 1.05518413, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.7574194703288544, + "language_loss": 0.79516619, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81773484, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.3671875, + "step": 1256, + "time_per_iteration": 2.4203784465789795 + }, + { + "auxiliary_loss_clip": 0.01074137, + "auxiliary_loss_mlp": 0.01007102, + "balance_loss_clip": 1.00378788, + "balance_loss_mlp": 1.01769829, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.8283025846018472, + "language_loss": 0.58016127, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60097361, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.5625, + "step": 1257, + "time_per_iteration": 3.1732118129730225 + }, + { + "auxiliary_loss_clip": 0.0119581, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.03272927, + "balance_loss_mlp": 1.05982757, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 3.1336739114125107, + "language_loss": 0.89859951, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92112058, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1258, + "time_per_iteration": 2.516925811767578 + }, + { + "auxiliary_loss_clip": 0.01192364, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.05663717, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.28312942247731, + "language_loss": 0.81211507, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458376, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.359375, + "step": 1259, + "time_per_iteration": 2.449533224105835 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.03593481, + "balance_loss_mlp": 1.05662787, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9172803769558988, + "language_loss": 0.75733984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.77986467, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.375, + "step": 1260, + "time_per_iteration": 2.5003559589385986 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03550828, + "balance_loss_mlp": 1.0552032, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8260195606442358, + "language_loss": 0.84695768, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86947775, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1261, + "time_per_iteration": 2.4633476734161377 + }, + { + "auxiliary_loss_clip": 0.01200376, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.03828108, + "balance_loss_mlp": 1.05969536, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.3160282321136334, + "language_loss": 0.8266682, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84928167, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.40625, + "step": 1262, + "time_per_iteration": 2.5256471633911133 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01061195, + "balance_loss_clip": 1.03703153, + "balance_loss_mlp": 1.0540688, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.4581964181262776, + "language_loss": 0.8255769, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84810972, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3828125, + "step": 1263, + "time_per_iteration": 2.470656633377075 + }, + { + "auxiliary_loss_clip": 0.01195735, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.05504882, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.324943057092889, + "language_loss": 0.7591399, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.40625, + "step": 1264, + "time_per_iteration": 2.4715359210968018 + }, + { + "auxiliary_loss_clip": 0.0119596, + "auxiliary_loss_mlp": 0.01062168, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.05711412, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.1997185871944356, + "language_loss": 0.81106204, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83364332, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.390625, + "step": 1265, + "time_per_iteration": 2.440000295639038 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03408241, + "balance_loss_mlp": 1.05631864, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.141616369936441, + "language_loss": 0.64935738, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67187923, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.390625, + "step": 1266, + "time_per_iteration": 2.495001792907715 + }, + { + "auxiliary_loss_clip": 0.01194799, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.05550349, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2514277899416606, + "language_loss": 0.79527593, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81783378, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.390625, + "step": 1267, + "time_per_iteration": 2.4763970375061035 + }, + { + "auxiliary_loss_clip": 0.01194511, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02975869, + "balance_loss_mlp": 1.05526185, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.2740159695832682, + "language_loss": 0.7253381, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74780059, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.390625, + "step": 1268, + "time_per_iteration": 3.8910977840423584 + }, + { + "auxiliary_loss_clip": 0.01192554, + "auxiliary_loss_mlp": 0.01057239, + "balance_loss_clip": 1.03447044, + "balance_loss_mlp": 1.05342031, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.163449384012833, + "language_loss": 0.81891817, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84141612, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.390625, + "step": 1269, + "time_per_iteration": 3.8643741607666016 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.05559695, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 3.2383492700687078, + "language_loss": 0.88135087, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90382218, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1270, + "time_per_iteration": 2.4746575355529785 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.0105921, + "balance_loss_clip": 1.03559494, + "balance_loss_mlp": 1.05707884, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.188682914143081, + "language_loss": 0.71113384, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73370755, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.4140625, + "step": 1271, + "time_per_iteration": 2.529632091522217 + }, + { + "auxiliary_loss_clip": 0.01189637, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_clip": 1.04351556, + "balance_loss_mlp": 1.05675423, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.9018984880968814, + "language_loss": 0.82745486, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85001469, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1272, + "time_per_iteration": 2.4950368404388428 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01061838, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.05351079, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0211474255264643, + "language_loss": 0.79951203, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82204533, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3828125, + "step": 1273, + "time_per_iteration": 2.490281105041504 + }, + { + "auxiliary_loss_clip": 0.01194744, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_clip": 1.03858376, + "balance_loss_mlp": 1.05600715, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.848359088284866, + "language_loss": 0.81545758, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83802712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1274, + "time_per_iteration": 2.499799966812134 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_clip": 1.04430115, + "balance_loss_mlp": 1.05469346, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.991418246716423, + "language_loss": 0.73099387, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75359869, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1275, + "time_per_iteration": 2.557973623275757 + }, + { + "auxiliary_loss_clip": 0.011939, + "auxiliary_loss_mlp": 0.01061514, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.05536842, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1093684912214545, + "language_loss": 0.79584897, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81840312, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.390625, + "step": 1276, + "time_per_iteration": 2.4329752922058105 + }, + { + "auxiliary_loss_clip": 0.01196395, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02838457, + "balance_loss_mlp": 1.05656397, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.623540269613024, + "language_loss": 0.59020305, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61268032, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3984375, + "step": 1277, + "time_per_iteration": 2.5318989753723145 + }, + { + "auxiliary_loss_clip": 0.01200985, + "auxiliary_loss_mlp": 0.01057464, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.05805659, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.2944749333347096, + "language_loss": 0.74846482, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77104926, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.4296875, + "step": 1278, + "time_per_iteration": 2.448615789413452 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_clip": 1.02943182, + "balance_loss_mlp": 1.05475163, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.0999470067777075, + "language_loss": 0.88656616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90898478, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1279, + "time_per_iteration": 2.4883790016174316 + }, + { + "auxiliary_loss_clip": 0.01189256, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_clip": 1.03973901, + "balance_loss_mlp": 1.05507362, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.4596954186847393, + "language_loss": 0.82899994, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1280, + "time_per_iteration": 2.459294319152832 + }, + { + "auxiliary_loss_clip": 0.01188755, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03874409, + "balance_loss_mlp": 1.05492759, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 1.9224222656998016, + "language_loss": 0.76059222, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78309786, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3359375, + "step": 1281, + "time_per_iteration": 2.453183650970459 + }, + { + "auxiliary_loss_clip": 0.0119548, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.05448353, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8937081587754587, + "language_loss": 0.75307631, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77557921, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1282, + "time_per_iteration": 2.4526116847991943 + }, + { + "auxiliary_loss_clip": 0.01190337, + "auxiliary_loss_mlp": 0.01070616, + "balance_loss_clip": 1.04734671, + "balance_loss_mlp": 1.054286, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0304459145795963, + "language_loss": 0.8428033, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86541283, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1283, + "time_per_iteration": 2.468101739883423 + }, + { + "auxiliary_loss_clip": 0.01192768, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.05560803, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.622403612740989, + "language_loss": 0.75031364, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77286887, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1284, + "time_per_iteration": 2.451749801635742 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.03947222, + "balance_loss_mlp": 1.05330253, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6448065546510353, + "language_loss": 0.75934827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78185129, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1285, + "time_per_iteration": 2.664769411087036 + }, + { + "auxiliary_loss_clip": 0.01196484, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.05862105, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8165785508620624, + "language_loss": 0.84204662, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86464012, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.375, + "step": 1286, + "time_per_iteration": 2.550670862197876 + }, + { + "auxiliary_loss_clip": 0.01196192, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.05582845, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 4.521300853065514, + "language_loss": 0.76725763, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1287, + "time_per_iteration": 2.455627918243408 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.03636777, + "balance_loss_mlp": 1.05476642, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6785000972571258, + "language_loss": 0.84509134, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86758095, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1288, + "time_per_iteration": 2.500218629837036 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.05364347, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.192533837519805, + "language_loss": 0.85769016, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88018203, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.390625, + "step": 1289, + "time_per_iteration": 2.4759440422058105 + }, + { + "auxiliary_loss_clip": 0.01189023, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.02563989, + "balance_loss_mlp": 1.05300641, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.8877463184856607, + "language_loss": 0.85053366, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87290049, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1290, + "time_per_iteration": 2.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.01185369, + "auxiliary_loss_mlp": 0.01059291, + "balance_loss_clip": 1.03541303, + "balance_loss_mlp": 1.05397463, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3980248629455834, + "language_loss": 0.90562832, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92807496, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3125, + "step": 1291, + "time_per_iteration": 2.4760262966156006 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01008288, + "balance_loss_clip": 1.00456893, + "balance_loss_mlp": 1.01656318, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.9429671936579762, + "language_loss": 0.64993972, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67073375, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.546875, + "step": 1292, + "time_per_iteration": 3.1508371829986572 + }, + { + "auxiliary_loss_clip": 0.0118873, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.03716707, + "balance_loss_mlp": 1.05293965, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7960778456946043, + "language_loss": 0.87610948, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89858699, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1293, + "time_per_iteration": 2.6359729766845703 + }, + { + "auxiliary_loss_clip": 0.01193413, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.05659533, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.312065886688882, + "language_loss": 0.85111046, + "learning_rate": 3.976081376263239e-06, + "loss": 0.873667, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3671875, + "step": 1294, + "time_per_iteration": 2.5151314735412598 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01054926, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.05702615, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.728225366024782, + "language_loss": 0.79202414, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81451285, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3671875, + "step": 1295, + "time_per_iteration": 2.459510326385498 + }, + { + "auxiliary_loss_clip": 0.01188808, + "auxiliary_loss_mlp": 0.01052849, + "balance_loss_clip": 1.02966261, + "balance_loss_mlp": 1.05383039, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.8222308711400834, + "language_loss": 0.88216382, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90458035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1296, + "time_per_iteration": 2.492892026901245 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01058016, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.05591464, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 3.2140473454082086, + "language_loss": 0.96160841, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98411804, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1297, + "time_per_iteration": 2.4668915271759033 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01054366, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.05289149, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.460261972702069, + "language_loss": 0.76087165, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78331399, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3671875, + "step": 1298, + "time_per_iteration": 2.5059781074523926 + }, + { + "auxiliary_loss_clip": 0.01192131, + "auxiliary_loss_mlp": 0.01061793, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.05696058, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.8752674736144914, + "language_loss": 0.80755305, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83009231, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3515625, + "step": 1299, + "time_per_iteration": 2.5036020278930664 + }, + { + "auxiliary_loss_clip": 0.01183493, + "auxiliary_loss_mlp": 0.01056623, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.05226159, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.1903498852009813, + "language_loss": 0.86459941, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88700056, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1300, + "time_per_iteration": 2.4866278171539307 + }, + { + "auxiliary_loss_clip": 0.0118988, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.0284245, + "balance_loss_mlp": 1.05393028, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.909902293479526, + "language_loss": 0.71778899, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74020839, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.359375, + "step": 1301, + "time_per_iteration": 2.6491336822509766 + }, + { + "auxiliary_loss_clip": 0.01196178, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.0586772, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5624281437346959, + "language_loss": 0.70860815, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1302, + "time_per_iteration": 2.635430335998535 + }, + { + "auxiliary_loss_clip": 0.01188946, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03702378, + "balance_loss_mlp": 1.05438161, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.152945758623263, + "language_loss": 0.8192755, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84177226, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.34375, + "step": 1303, + "time_per_iteration": 2.4861090183258057 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.05351233, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8425530042965788, + "language_loss": 0.7497822, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77228731, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1304, + "time_per_iteration": 2.464087963104248 + }, + { + "auxiliary_loss_clip": 0.01191658, + "auxiliary_loss_mlp": 0.0106237, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.05645108, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.696211405930565, + "language_loss": 0.76397038, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78651059, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.359375, + "step": 1305, + "time_per_iteration": 2.486093521118164 + }, + { + "auxiliary_loss_clip": 0.01192283, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.04087615, + "balance_loss_mlp": 1.05527782, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 2.2926357932273866, + "language_loss": 0.85035503, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87292361, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1306, + "time_per_iteration": 2.496265172958374 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.05652416, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.0575778567802976, + "language_loss": 0.90087706, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92322135, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.34375, + "step": 1307, + "time_per_iteration": 2.5122623443603516 + }, + { + "auxiliary_loss_clip": 0.01189263, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.03295124, + "balance_loss_mlp": 1.05417371, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8540925974151201, + "language_loss": 0.83408689, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85655046, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3515625, + "step": 1308, + "time_per_iteration": 2.4686944484710693 + }, + { + "auxiliary_loss_clip": 0.01186004, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.03177738, + "balance_loss_mlp": 1.05289674, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.6283340971904061, + "language_loss": 0.77841777, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80081415, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.328125, + "step": 1309, + "time_per_iteration": 5.444388151168823 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.05386913, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9656388899868151, + "language_loss": 0.80146122, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82402575, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.40625, + "step": 1310, + "time_per_iteration": 3.8553466796875 + }, + { + "auxiliary_loss_clip": 0.01185305, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.03067899, + "balance_loss_mlp": 1.05544043, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7115323272474947, + "language_loss": 0.73069102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75307012, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1311, + "time_per_iteration": 2.5299458503723145 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_clip": 1.04861844, + "balance_loss_mlp": 1.05650353, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9161215374898264, + "language_loss": 0.85871482, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88134789, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1312, + "time_per_iteration": 2.5490031242370605 + }, + { + "auxiliary_loss_clip": 0.01186476, + "auxiliary_loss_mlp": 0.01059916, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.0555284, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7542323177910393, + "language_loss": 0.81968379, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84214771, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3125, + "step": 1313, + "time_per_iteration": 2.507046699523926 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105996, + "balance_loss_clip": 1.03528404, + "balance_loss_mlp": 1.05271506, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.109477065223649, + "language_loss": 0.73372161, + "learning_rate": 3.97486534441264e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3984375, + "step": 1314, + "time_per_iteration": 2.4396395683288574 + }, + { + "auxiliary_loss_clip": 0.01185115, + "auxiliary_loss_mlp": 0.01058505, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.05120206, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.579996187361532, + "language_loss": 0.79460657, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81704271, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.34375, + "step": 1315, + "time_per_iteration": 2.493365526199341 + }, + { + "auxiliary_loss_clip": 0.011877, + "auxiliary_loss_mlp": 0.01060931, + "balance_loss_clip": 1.03592062, + "balance_loss_mlp": 1.05232, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9411836832725016, + "language_loss": 0.73614991, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1316, + "time_per_iteration": 2.4696316719055176 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.03940618, + "balance_loss_mlp": 1.05415511, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.862910173072837, + "language_loss": 0.65148681, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67404836, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.390625, + "step": 1317, + "time_per_iteration": 2.447847843170166 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.05774999, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.3478172138868967, + "language_loss": 0.7324174, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75502789, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1318, + "time_per_iteration": 2.497406482696533 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01057875, + "balance_loss_clip": 1.03557122, + "balance_loss_mlp": 1.05335736, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.92969491679129, + "language_loss": 0.90610284, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92856491, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3515625, + "step": 1319, + "time_per_iteration": 2.5007200241088867 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01054126, + "balance_loss_clip": 1.03086793, + "balance_loss_mlp": 1.05155873, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.95797867210378, + "language_loss": 0.79765761, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82008684, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1320, + "time_per_iteration": 2.4683783054351807 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.05700457, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6163787894008363, + "language_loss": 0.69574934, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71822894, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.34375, + "step": 1321, + "time_per_iteration": 2.466911554336548 + }, + { + "auxiliary_loss_clip": 0.01184231, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.03313756, + "balance_loss_mlp": 1.05313718, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.926313653502779, + "language_loss": 0.83559513, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.857997, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1322, + "time_per_iteration": 2.465885639190674 + }, + { + "auxiliary_loss_clip": 0.01188233, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.03544521, + "balance_loss_mlp": 1.05104756, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8863777031262867, + "language_loss": 0.90437615, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92684615, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1323, + "time_per_iteration": 2.465841293334961 + }, + { + "auxiliary_loss_clip": 0.0118735, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.05414796, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.6454981938510795, + "language_loss": 0.82583225, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84827733, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.328125, + "step": 1324, + "time_per_iteration": 2.475486993789673 + }, + { + "auxiliary_loss_clip": 0.01188398, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0255841, + "balance_loss_mlp": 1.05264676, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.416918252865386, + "language_loss": 0.79654729, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81892562, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.359375, + "step": 1325, + "time_per_iteration": 2.482555389404297 + }, + { + "auxiliary_loss_clip": 0.01190127, + "auxiliary_loss_mlp": 0.01064919, + "balance_loss_clip": 1.03989661, + "balance_loss_mlp": 1.05474687, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.170521767048619, + "language_loss": 0.8812806, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90383106, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1326, + "time_per_iteration": 2.466742753982544 + }, + { + "auxiliary_loss_clip": 0.01182901, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.02823424, + "balance_loss_mlp": 1.05014396, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.3992518634606164, + "language_loss": 0.83013594, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8524875, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.328125, + "step": 1327, + "time_per_iteration": 2.4989237785339355 + }, + { + "auxiliary_loss_clip": 0.0119143, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.03013575, + "balance_loss_mlp": 1.05436027, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.1664091533416587, + "language_loss": 0.78452092, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80697763, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.375, + "step": 1328, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01191637, + "auxiliary_loss_mlp": 0.01053331, + "balance_loss_clip": 1.02969217, + "balance_loss_mlp": 1.05460131, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.484533735051083, + "language_loss": 0.74277186, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76522154, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.375, + "step": 1329, + "time_per_iteration": 2.425388813018799 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.05096054, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.5753219993175995, + "language_loss": 0.81090498, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83336312, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3515625, + "step": 1330, + "time_per_iteration": 2.4831247329711914 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.03924823, + "balance_loss_mlp": 1.05348384, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.322034822225311, + "language_loss": 0.88790143, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91043401, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1331, + "time_per_iteration": 2.4410722255706787 + }, + { + "auxiliary_loss_clip": 0.01193336, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.03414834, + "balance_loss_mlp": 1.05288279, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.577873328737783, + "language_loss": 0.73332524, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75584114, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.40625, + "step": 1332, + "time_per_iteration": 2.6054465770721436 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.05179858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.9568005204239032, + "language_loss": 0.82994795, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1333, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01187412, + "auxiliary_loss_mlp": 0.01055323, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.05115032, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.7771179443818466, + "language_loss": 0.74698973, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76941711, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1334, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01187182, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.05457497, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.0216765528325635, + "language_loss": 0.80279201, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82527244, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1335, + "time_per_iteration": 2.538670301437378 + }, + { + "auxiliary_loss_clip": 0.01078994, + "auxiliary_loss_mlp": 0.01011272, + "balance_loss_clip": 1.00802934, + "balance_loss_mlp": 1.02308655, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7427722697577622, + "language_loss": 0.56020629, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58110893, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.5625, + "step": 1336, + "time_per_iteration": 3.125026226043701 + }, + { + "auxiliary_loss_clip": 0.01188939, + "auxiliary_loss_mlp": 0.01054834, + "balance_loss_clip": 1.0331738, + "balance_loss_mlp": 1.05371606, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.050916847484745, + "language_loss": 0.67764497, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70008272, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3515625, + "step": 1337, + "time_per_iteration": 2.506103038787842 + }, + { + "auxiliary_loss_clip": 0.01188826, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.04313135, + "balance_loss_mlp": 1.05480385, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8692893317328456, + "language_loss": 0.86701488, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88955414, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1338, + "time_per_iteration": 2.5451908111572266 + }, + { + "auxiliary_loss_clip": 0.01188004, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.0368793, + "balance_loss_mlp": 1.05142283, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.6265473040924725, + "language_loss": 0.87246621, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89494807, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.359375, + "step": 1339, + "time_per_iteration": 2.450932502746582 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02660179, + "balance_loss_mlp": 1.05106449, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.428441908593999, + "language_loss": 0.88819683, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91048771, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1340, + "time_per_iteration": 2.4539895057678223 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 0.99951285, + "balance_loss_mlp": 1.01727247, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8886760882983712, + "language_loss": 0.64806795, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66882515, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.5546875, + "step": 1341, + "time_per_iteration": 3.0034360885620117 + }, + { + "auxiliary_loss_clip": 0.01193907, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03577328, + "balance_loss_mlp": 1.05301166, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.817345215565239, + "language_loss": 0.89616883, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91871732, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1342, + "time_per_iteration": 2.479701042175293 + }, + { + "auxiliary_loss_clip": 0.01194936, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.05721259, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.7453135307928216, + "language_loss": 0.76378155, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78631246, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.375, + "step": 1343, + "time_per_iteration": 2.4969120025634766 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01008036, + "balance_loss_clip": 1.00446022, + "balance_loss_mlp": 1.01791215, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8963318804352591, + "language_loss": 0.57395822, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59476054, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.54296875, + "step": 1344, + "time_per_iteration": 2.9917871952056885 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.0105771, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.05523396, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.677643541218582, + "language_loss": 0.86665964, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88914657, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1345, + "time_per_iteration": 2.4601447582244873 + }, + { + "auxiliary_loss_clip": 0.01187459, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.05403256, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7098835991166323, + "language_loss": 0.87242532, + "learning_rate": 3.972857395313042e-06, + "loss": 0.894849, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1346, + "time_per_iteration": 2.4809892177581787 + }, + { + "auxiliary_loss_clip": 0.01185898, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.03256202, + "balance_loss_mlp": 1.05219567, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6659805361601863, + "language_loss": 0.92606491, + "learning_rate": 3.972793412113439e-06, + "loss": 0.94847363, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3359375, + "step": 1347, + "time_per_iteration": 2.4802379608154297 + }, + { + "auxiliary_loss_clip": 0.0118757, + "auxiliary_loss_mlp": 0.01057822, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.05471659, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 9.453605004454174, + "language_loss": 0.89181751, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91427147, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.328125, + "step": 1348, + "time_per_iteration": 2.4610300064086914 + }, + { + "auxiliary_loss_clip": 0.01185296, + "auxiliary_loss_mlp": 0.01056008, + "balance_loss_clip": 1.03420484, + "balance_loss_mlp": 1.05543983, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 2.4916215003739355, + "language_loss": 0.76796132, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7903744, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.296875, + "step": 1349, + "time_per_iteration": 2.4789178371429443 + }, + { + "auxiliary_loss_clip": 0.01187103, + "auxiliary_loss_mlp": 0.0105974, + "balance_loss_clip": 1.03669679, + "balance_loss_mlp": 1.05236626, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.126949034470324, + "language_loss": 0.88571703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90818548, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.34375, + "step": 1350, + "time_per_iteration": 2.43094539642334 + }, + { + "auxiliary_loss_clip": 0.01184059, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.05228257, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.044220866897066, + "language_loss": 0.82058489, + "learning_rate": 3.972536731254092e-06, + "loss": 0.843036, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1351, + "time_per_iteration": 6.688653469085693 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.02917862, + "balance_loss_mlp": 1.04863417, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.9894600711485977, + "language_loss": 0.75347674, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77585584, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.359375, + "step": 1352, + "time_per_iteration": 2.4888412952423096 + }, + { + "auxiliary_loss_clip": 0.01192461, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.03163338, + "balance_loss_mlp": 1.05483341, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.7603053493114211, + "language_loss": 0.82833469, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85081488, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1353, + "time_per_iteration": 2.522960901260376 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01010471, + "balance_loss_clip": 1.00694275, + "balance_loss_mlp": 1.01996851, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8931676068679675, + "language_loss": 0.5970993, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61793786, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.53125, + "step": 1354, + "time_per_iteration": 3.0639474391937256 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.03764629, + "balance_loss_mlp": 1.05431724, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7981329827127455, + "language_loss": 0.82785606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85033101, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1355, + "time_per_iteration": 2.4664132595062256 + }, + { + "auxiliary_loss_clip": 0.01186535, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.03619206, + "balance_loss_mlp": 1.05146575, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9123465925299232, + "language_loss": 0.70799643, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73048234, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3515625, + "step": 1356, + "time_per_iteration": 2.509061813354492 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.03169644, + "balance_loss_mlp": 1.05148005, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.53580294551395, + "language_loss": 0.70255458, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72499657, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3671875, + "step": 1357, + "time_per_iteration": 2.476951837539673 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_clip": 1.03067684, + "balance_loss_mlp": 1.05488217, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.6163823683714953, + "language_loss": 0.84186697, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86431682, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1358, + "time_per_iteration": 2.457376480102539 + }, + { + "auxiliary_loss_clip": 0.01190093, + "auxiliary_loss_mlp": 0.01056216, + "balance_loss_clip": 1.0310626, + "balance_loss_mlp": 1.05484545, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9894839389786314, + "language_loss": 1.02294087, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04540396, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3515625, + "step": 1359, + "time_per_iteration": 2.4723212718963623 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03344035, + "balance_loss_mlp": 1.0511415, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0666688933075963, + "language_loss": 0.82969773, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85212988, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1360, + "time_per_iteration": 2.5143508911132812 + }, + { + "auxiliary_loss_clip": 0.01190184, + "auxiliary_loss_mlp": 0.01062181, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.05335808, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.14797754608813, + "language_loss": 0.72352278, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74604642, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3671875, + "step": 1361, + "time_per_iteration": 2.458034038543701 + }, + { + "auxiliary_loss_clip": 0.01179057, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.04741335, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.8589819193374515, + "language_loss": 0.76781029, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79017377, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.3125, + "step": 1362, + "time_per_iteration": 2.472259759902954 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.0291419, + "balance_loss_mlp": 1.05449164, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.631594675791475, + "language_loss": 0.72409523, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74649096, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1363, + "time_per_iteration": 2.4447264671325684 + }, + { + "auxiliary_loss_clip": 0.01189235, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.03603828, + "balance_loss_mlp": 1.05607057, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 3.9166951523525464, + "language_loss": 0.77459586, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79710352, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.328125, + "step": 1364, + "time_per_iteration": 2.475141763687134 + }, + { + "auxiliary_loss_clip": 0.01190144, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.03745019, + "balance_loss_mlp": 1.05500793, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.6241179536013033, + "language_loss": 0.82025397, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1365, + "time_per_iteration": 2.493732452392578 + }, + { + "auxiliary_loss_clip": 0.0118713, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.05614781, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.3261283913074884, + "language_loss": 0.82173789, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84418333, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1366, + "time_per_iteration": 2.4809322357177734 + }, + { + "auxiliary_loss_clip": 0.01186928, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.05126381, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.8403828718649033, + "language_loss": 0.81534755, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83780599, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1367, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.03724277, + "balance_loss_mlp": 1.05413651, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3540874203263358, + "language_loss": 0.83644414, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85897589, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3828125, + "step": 1368, + "time_per_iteration": 2.453547716140747 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02694988, + "balance_loss_mlp": 1.05349994, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7360129433802456, + "language_loss": 0.81245828, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83476603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.28125, + "step": 1369, + "time_per_iteration": 2.527573585510254 + }, + { + "auxiliary_loss_clip": 0.01185735, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.02979898, + "balance_loss_mlp": 1.05528903, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.240857135161324, + "language_loss": 0.74790901, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77027786, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3046875, + "step": 1370, + "time_per_iteration": 2.5205185413360596 + }, + { + "auxiliary_loss_clip": 0.01189372, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.05480862, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6313231263601415, + "language_loss": 0.74633086, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76883852, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1371, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.01188254, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.05410123, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.0830704741847423, + "language_loss": 0.71080554, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73330408, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.34375, + "step": 1372, + "time_per_iteration": 2.574457883834839 + }, + { + "auxiliary_loss_clip": 0.0118845, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02750254, + "balance_loss_mlp": 1.05397415, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 3.137320584176607, + "language_loss": 0.88010907, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90251154, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.34375, + "step": 1373, + "time_per_iteration": 2.485727310180664 + }, + { + "auxiliary_loss_clip": 0.01186594, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.05331743, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7727067520163604, + "language_loss": 0.82349706, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84595209, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.328125, + "step": 1374, + "time_per_iteration": 2.5223724842071533 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.0032891, + "balance_loss_mlp": 1.02371156, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8248734910296001, + "language_loss": 0.60630989, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62714875, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.5234375, + "step": 1375, + "time_per_iteration": 3.0909183025360107 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.01006319, + "balance_loss_clip": 1.00221813, + "balance_loss_mlp": 1.02162504, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9071425511101782, + "language_loss": 0.62149519, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64230067, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.52734375, + "step": 1376, + "time_per_iteration": 2.991158962249756 + }, + { + "auxiliary_loss_clip": 0.01195866, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_clip": 1.04624534, + "balance_loss_mlp": 1.05995989, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9826192893196872, + "language_loss": 0.82601643, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84866917, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.359375, + "step": 1377, + "time_per_iteration": 2.5851728916168213 + }, + { + "auxiliary_loss_clip": 0.01188463, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.05601847, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.8760965133588865, + "language_loss": 0.84516692, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86762691, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1378, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.05516553, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.9551783234852504, + "language_loss": 0.87725681, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89978123, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3515625, + "step": 1379, + "time_per_iteration": 2.5428385734558105 + }, + { + "auxiliary_loss_clip": 0.01189534, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.03336358, + "balance_loss_mlp": 1.05776525, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 1.7573789229703745, + "language_loss": 0.78658688, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80904275, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1380, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.01061393, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.05878401, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2395713763978002, + "language_loss": 0.86146504, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88398302, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.3125, + "step": 1381, + "time_per_iteration": 2.470153331756592 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01060106, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.06063581, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.795546136319442, + "language_loss": 0.8817445, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90433335, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1382, + "time_per_iteration": 2.4352822303771973 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.0569818, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.6234570747150734, + "language_loss": 0.77606535, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79856908, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.34375, + "step": 1383, + "time_per_iteration": 2.45939040184021 + }, + { + "auxiliary_loss_clip": 0.01194291, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.05730414, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.1508484512905945, + "language_loss": 0.8293128, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85181862, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1384, + "time_per_iteration": 2.4773356914520264 + }, + { + "auxiliary_loss_clip": 0.01198678, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.02937245, + "balance_loss_mlp": 1.05890989, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.4890613364481893, + "language_loss": 0.84828049, + "learning_rate": 3.970306639845e-06, + "loss": 0.87081897, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3984375, + "step": 1385, + "time_per_iteration": 2.5084009170532227 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01066074, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.05825758, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.123672194513448, + "language_loss": 0.68744183, + "learning_rate": 3.970239740938835e-06, + "loss": 0.7100516, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3671875, + "step": 1386, + "time_per_iteration": 2.477592945098877 + }, + { + "auxiliary_loss_clip": 0.01191265, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03186047, + "balance_loss_mlp": 1.05579662, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7726596290820096, + "language_loss": 0.82067239, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84314626, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.359375, + "step": 1387, + "time_per_iteration": 2.529261350631714 + }, + { + "auxiliary_loss_clip": 0.01196512, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.05739772, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.385304875072474, + "language_loss": 0.77194649, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79461324, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.390625, + "step": 1388, + "time_per_iteration": 2.4517693519592285 + }, + { + "auxiliary_loss_clip": 0.01187734, + "auxiliary_loss_mlp": 0.01059717, + "balance_loss_clip": 1.0351125, + "balance_loss_mlp": 1.0574429, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.246368739161805, + "language_loss": 0.79078835, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81326282, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3046875, + "step": 1389, + "time_per_iteration": 2.4999983310699463 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.03368866, + "balance_loss_mlp": 1.05773938, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 4.533904477221136, + "language_loss": 0.87495124, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89746046, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.359375, + "step": 1390, + "time_per_iteration": 2.438126802444458 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.05621624, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6928828016377326, + "language_loss": 0.86753631, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89007682, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.34375, + "step": 1391, + "time_per_iteration": 2.5615429878234863 + }, + { + "auxiliary_loss_clip": 0.01198327, + "auxiliary_loss_mlp": 0.01071606, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.05904424, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 4.090701354718017, + "language_loss": 0.87550449, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89820385, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1392, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.03682983, + "balance_loss_mlp": 1.05556941, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.9857894096842457, + "language_loss": 0.80519998, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82771087, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1393, + "time_per_iteration": 3.9978342056274414 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01054176, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.05832088, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8413427873168604, + "language_loss": 0.84738398, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86984503, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3359375, + "step": 1394, + "time_per_iteration": 3.995389461517334 + }, + { + "auxiliary_loss_clip": 0.01193271, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.0296433, + "balance_loss_mlp": 1.05856824, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.7688902284368797, + "language_loss": 0.82957625, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85204601, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1395, + "time_per_iteration": 2.5080416202545166 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.03683722, + "balance_loss_mlp": 1.05833054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9626395114639965, + "language_loss": 0.82492781, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84750068, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3671875, + "step": 1396, + "time_per_iteration": 2.51763653755188 + }, + { + "auxiliary_loss_clip": 0.01191589, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.03253114, + "balance_loss_mlp": 1.05944824, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3756879295671367, + "language_loss": 0.7702114, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79271495, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.3203125, + "step": 1397, + "time_per_iteration": 2.522019624710083 + }, + { + "auxiliary_loss_clip": 0.01191257, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02635193, + "balance_loss_mlp": 1.05688787, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1333990758799795, + "language_loss": 0.77589226, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79831308, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.34375, + "step": 1398, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.03366995, + "balance_loss_mlp": 1.05604136, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 6.547707007931562, + "language_loss": 0.94411373, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96655744, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3125, + "step": 1399, + "time_per_iteration": 2.458564043045044 + }, + { + "auxiliary_loss_clip": 0.01192876, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.05564523, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3313569082148637, + "language_loss": 0.82052553, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84306407, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1400, + "time_per_iteration": 2.511075258255005 + }, + { + "auxiliary_loss_clip": 0.01191821, + "auxiliary_loss_mlp": 0.01061122, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.05681479, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.6029570836648723, + "language_loss": 0.86615682, + "learning_rate": 3.969227293371099e-06, + "loss": 0.8886863, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1401, + "time_per_iteration": 2.5328855514526367 + }, + { + "auxiliary_loss_clip": 0.01190636, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.05496573, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2778357332658543, + "language_loss": 0.87128234, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89382625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1402, + "time_per_iteration": 2.4695520401000977 + }, + { + "auxiliary_loss_clip": 0.0118725, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.03340352, + "balance_loss_mlp": 1.0542388, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.954964391273458, + "language_loss": 0.88680542, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90924418, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.328125, + "step": 1403, + "time_per_iteration": 2.6655161380767822 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01056388, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.05429792, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.9645692036725415, + "language_loss": 0.80325729, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82571673, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1404, + "time_per_iteration": 2.5011603832244873 + }, + { + "auxiliary_loss_clip": 0.01195719, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_clip": 1.04089534, + "balance_loss_mlp": 1.05798006, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1059643070764027, + "language_loss": 0.83845061, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86106849, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1405, + "time_per_iteration": 2.4612858295440674 + }, + { + "auxiliary_loss_clip": 0.01188265, + "auxiliary_loss_mlp": 0.01056168, + "balance_loss_clip": 1.03314888, + "balance_loss_mlp": 1.05381966, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.7581309060245893, + "language_loss": 0.80343008, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82587439, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.34375, + "step": 1406, + "time_per_iteration": 2.496676206588745 + }, + { + "auxiliary_loss_clip": 0.01192497, + "auxiliary_loss_mlp": 0.01065969, + "balance_loss_clip": 1.0421989, + "balance_loss_mlp": 1.05858994, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8241253914082192, + "language_loss": 0.79411483, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3359375, + "step": 1407, + "time_per_iteration": 2.491055727005005 + }, + { + "auxiliary_loss_clip": 0.01188371, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.03234673, + "balance_loss_mlp": 1.05521655, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 4.541456574357825, + "language_loss": 0.91929626, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94173807, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.328125, + "step": 1408, + "time_per_iteration": 2.44599986076355 + }, + { + "auxiliary_loss_clip": 0.01074059, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.02074611, + "balance_loss_mlp": 1.02193737, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8980094129226197, + "language_loss": 0.61861706, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63960779, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.5234375, + "step": 1409, + "time_per_iteration": 3.1084799766540527 + }, + { + "auxiliary_loss_clip": 0.01180993, + "auxiliary_loss_mlp": 0.01060196, + "balance_loss_clip": 1.03784466, + "balance_loss_mlp": 1.05419254, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.25814404402445, + "language_loss": 0.86819237, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89060426, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.265625, + "step": 1410, + "time_per_iteration": 2.4854791164398193 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.0309782, + "balance_loss_mlp": 1.05453801, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.048224684561652, + "language_loss": 0.74138093, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76383173, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3359375, + "step": 1411, + "time_per_iteration": 2.484879970550537 + }, + { + "auxiliary_loss_clip": 0.01068033, + "auxiliary_loss_mlp": 0.01005767, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.01640451, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9041737870208939, + "language_loss": 0.56723791, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58797586, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.515625, + "step": 1412, + "time_per_iteration": 3.003227949142456 + }, + { + "auxiliary_loss_clip": 0.01183878, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 1.05354273, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.0338814511208883, + "language_loss": 0.89084172, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91330159, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3046875, + "step": 1413, + "time_per_iteration": 2.4545698165893555 + }, + { + "auxiliary_loss_clip": 0.01186591, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03019929, + "balance_loss_mlp": 1.0562067, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1859301398641415, + "language_loss": 0.8807795, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90319026, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3046875, + "step": 1414, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01056984, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.0540117, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.82577143383273, + "language_loss": 0.77434587, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79677355, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3125, + "step": 1415, + "time_per_iteration": 2.510671615600586 + }, + { + "auxiliary_loss_clip": 0.01185616, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.05612898, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.848223104879299, + "language_loss": 0.70859981, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73111296, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.296875, + "step": 1416, + "time_per_iteration": 2.827016592025757 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.05693281, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 1.9370001986884609, + "language_loss": 0.74855268, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77108514, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1417, + "time_per_iteration": 2.51518177986145 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.03157723, + "balance_loss_mlp": 1.05394006, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.566029486363868, + "language_loss": 0.82460356, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84700227, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3046875, + "step": 1418, + "time_per_iteration": 2.4632515907287598 + }, + { + "auxiliary_loss_clip": 0.01078096, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.02221191, + "balance_loss_mlp": 1.0269177, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8662062784105238, + "language_loss": 0.56616145, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58720386, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.51171875, + "step": 1419, + "time_per_iteration": 3.0262646675109863 + }, + { + "auxiliary_loss_clip": 0.01185611, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_clip": 1.03858972, + "balance_loss_mlp": 1.05284262, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.301787344693911, + "language_loss": 0.69764268, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72012818, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.328125, + "step": 1420, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01182824, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02912498, + "balance_loss_mlp": 1.05232763, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.56579546013663, + "language_loss": 0.87886292, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90121067, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1421, + "time_per_iteration": 2.498198986053467 + }, + { + "auxiliary_loss_clip": 0.01069987, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.01909983, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7935144939089421, + "language_loss": 0.63490081, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65564084, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.5078125, + "step": 1422, + "time_per_iteration": 3.050874948501587 + }, + { + "auxiliary_loss_clip": 0.01182797, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.04182768, + "balance_loss_mlp": 1.05538559, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.040119561169685, + "language_loss": 0.83427018, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85674852, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1423, + "time_per_iteration": 2.525075674057007 + }, + { + "auxiliary_loss_clip": 0.01190455, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.05613029, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.7627385415604107, + "language_loss": 0.74945033, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77194929, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1424, + "time_per_iteration": 2.523231029510498 + }, + { + "auxiliary_loss_clip": 0.01185893, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.05510807, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9395290082560723, + "language_loss": 0.7574805, + "learning_rate": 3.96757243383196e-06, + "loss": 0.7799021, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1425, + "time_per_iteration": 2.441420793533325 + }, + { + "auxiliary_loss_clip": 0.01183386, + "auxiliary_loss_mlp": 0.01053965, + "balance_loss_clip": 1.03092194, + "balance_loss_mlp": 1.05407834, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.579491371045568, + "language_loss": 0.93504989, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95742333, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1426, + "time_per_iteration": 2.4703657627105713 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.05764198, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.235647808517122, + "language_loss": 0.75003266, + "learning_rate": 3.967432588494471e-06, + "loss": 0.772614, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.34375, + "step": 1427, + "time_per_iteration": 2.4430549144744873 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01061112, + "balance_loss_clip": 1.03907049, + "balance_loss_mlp": 1.05315089, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.3372587699614726, + "language_loss": 0.81915152, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84158677, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1428, + "time_per_iteration": 2.454441785812378 + }, + { + "auxiliary_loss_clip": 0.01189987, + "auxiliary_loss_mlp": 0.01066735, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.05586076, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.395570851050941, + "language_loss": 0.79697371, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81954098, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.34375, + "step": 1429, + "time_per_iteration": 2.5411579608917236 + }, + { + "auxiliary_loss_clip": 0.0119024, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.05773449, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.301464625204156, + "language_loss": 0.88055587, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90308148, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1430, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01184535, + "auxiliary_loss_mlp": 0.01072949, + "balance_loss_clip": 1.04995334, + "balance_loss_mlp": 1.05712664, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7504719201320615, + "language_loss": 0.81914723, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84172201, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2734375, + "step": 1431, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01184756, + "auxiliary_loss_mlp": 0.01056491, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.05376828, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.9949655353101803, + "language_loss": 0.77759397, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80000651, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1432, + "time_per_iteration": 2.5344104766845703 + }, + { + "auxiliary_loss_clip": 0.0118072, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.03497803, + "balance_loss_mlp": 1.05027151, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.2873036973179603, + "language_loss": 0.73330259, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75570011, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3046875, + "step": 1433, + "time_per_iteration": 2.4787938594818115 + }, + { + "auxiliary_loss_clip": 0.01188497, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.05464733, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.615593579271415, + "language_loss": 0.85741955, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87989259, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3359375, + "step": 1434, + "time_per_iteration": 5.500946998596191 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.05177212, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 3.0513138823403825, + "language_loss": 0.78913063, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81149966, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1435, + "time_per_iteration": 3.899777412414551 + }, + { + "auxiliary_loss_clip": 0.01070575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02010655, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8910926846424677, + "language_loss": 0.57930011, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60028332, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.5078125, + "step": 1436, + "time_per_iteration": 3.179255247116089 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.02633083, + "balance_loss_mlp": 1.05314159, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.429993259280604, + "language_loss": 0.68775386, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71010828, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.328125, + "step": 1437, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.01185365, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.02806163, + "balance_loss_mlp": 1.05388093, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.5641138848438163, + "language_loss": 0.7274068, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74976349, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3125, + "step": 1438, + "time_per_iteration": 2.4840176105499268 + }, + { + "auxiliary_loss_clip": 0.01183596, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_clip": 1.02594447, + "balance_loss_mlp": 1.05472374, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.681614476681305, + "language_loss": 0.64628494, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66861117, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2890625, + "step": 1439, + "time_per_iteration": 2.61686372756958 + }, + { + "auxiliary_loss_clip": 0.01187197, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.05638909, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.062065757985673, + "language_loss": 0.87748063, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89990479, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3125, + "step": 1440, + "time_per_iteration": 2.5116493701934814 + }, + { + "auxiliary_loss_clip": 0.01188419, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.03538251, + "balance_loss_mlp": 1.0540843, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.4102507257620363, + "language_loss": 0.83243793, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491961, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1441, + "time_per_iteration": 2.5058300495147705 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01009256, + "balance_loss_clip": 1.00525022, + "balance_loss_mlp": 1.01939523, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8461220926791603, + "language_loss": 0.60426581, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62505859, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.5078125, + "step": 1442, + "time_per_iteration": 3.1946628093719482 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01057232, + "balance_loss_clip": 1.03379524, + "balance_loss_mlp": 1.05709028, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.2809405592870835, + "language_loss": 0.79264277, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81513512, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.3515625, + "step": 1443, + "time_per_iteration": 2.477691411972046 + }, + { + "auxiliary_loss_clip": 0.01185255, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.03170311, + "balance_loss_mlp": 1.05261874, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.764762918327591, + "language_loss": 0.82248437, + "learning_rate": 3.966231856532584e-06, + "loss": 0.8448779, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1444, + "time_per_iteration": 2.584773063659668 + }, + { + "auxiliary_loss_clip": 0.01189581, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.02745867, + "balance_loss_mlp": 1.05537939, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.945627197742621, + "language_loss": 0.86856627, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89096129, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1445, + "time_per_iteration": 2.506258964538574 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01054326, + "balance_loss_clip": 1.03303528, + "balance_loss_mlp": 1.05808067, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9763924186655837, + "language_loss": 0.81639445, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8388319, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.3125, + "step": 1446, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.01005416, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.0147202, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.728477241136595, + "language_loss": 0.54725462, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56795579, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.5, + "step": 1447, + "time_per_iteration": 3.1009976863861084 + }, + { + "auxiliary_loss_clip": 0.01178637, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.03104973, + "balance_loss_mlp": 1.05198455, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.2332818090387243, + "language_loss": 0.84593046, + "learning_rate": 3.965946199367804e-06, + "loss": 0.8682456, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1448, + "time_per_iteration": 2.483792543411255 + }, + { + "auxiliary_loss_clip": 0.01185215, + "auxiliary_loss_mlp": 0.01056358, + "balance_loss_clip": 1.03386295, + "balance_loss_mlp": 1.0524509, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.099884448391289, + "language_loss": 0.80688727, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82930297, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1449, + "time_per_iteration": 2.4637081623077393 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.02862501, + "balance_loss_mlp": 1.05370414, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 4.183651889411507, + "language_loss": 0.71012592, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73244655, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1450, + "time_per_iteration": 2.6521542072296143 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.03057098, + "balance_loss_mlp": 1.05502534, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.8266796466048172, + "language_loss": 0.83492875, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85729253, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1451, + "time_per_iteration": 2.4866271018981934 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.05371869, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.850339391564711, + "language_loss": 0.74351519, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76589811, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2734375, + "step": 1452, + "time_per_iteration": 2.5450925827026367 + }, + { + "auxiliary_loss_clip": 0.01182798, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_clip": 1.03840256, + "balance_loss_mlp": 1.05121017, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.3421371051734474, + "language_loss": 0.79840016, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82084292, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1453, + "time_per_iteration": 2.49350643157959 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_clip": 1.04213262, + "balance_loss_mlp": 1.0545752, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.982640213979625, + "language_loss": 0.71298045, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73545539, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.28125, + "step": 1454, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02608728, + "balance_loss_mlp": 1.02026677, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7993884765543664, + "language_loss": 0.58655661, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6075514, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.0300293, + "router_z_loss_mlp": 0.5, + "step": 1455, + "time_per_iteration": 3.088113307952881 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05210626, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5590098662562957, + "language_loss": 0.77404714, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79646254, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3046875, + "step": 1456, + "time_per_iteration": 2.6145191192626953 + }, + { + "auxiliary_loss_clip": 0.01182283, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.02888715, + "balance_loss_mlp": 1.05235434, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.3657198267749777, + "language_loss": 0.72391665, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74625528, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1457, + "time_per_iteration": 2.6438605785369873 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.02623844, + "balance_loss_mlp": 1.05207849, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5929331180335078, + "language_loss": 0.86215973, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88442671, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1458, + "time_per_iteration": 2.539658546447754 + }, + { + "auxiliary_loss_clip": 0.01189161, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.05887103, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.660016084678777, + "language_loss": 0.80662763, + "learning_rate": 3.965154492406486e-06, + "loss": 0.8291173, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1459, + "time_per_iteration": 2.4880902767181396 + }, + { + "auxiliary_loss_clip": 0.01187526, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.05512893, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.474003232718447, + "language_loss": 0.84058738, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86300415, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.328125, + "step": 1460, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01178547, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.05051732, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.696872821623283, + "language_loss": 0.81030595, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83263445, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.28125, + "step": 1461, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.01187345, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_clip": 1.03795433, + "balance_loss_mlp": 1.05579305, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.656505593412751, + "language_loss": 0.76405656, + "learning_rate": 3.964937007276932e-06, + "loss": 0.786529, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3125, + "step": 1462, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01190578, + "auxiliary_loss_mlp": 0.01058183, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.05753493, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.4277854967530663, + "language_loss": 0.74615479, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76864231, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.328125, + "step": 1463, + "time_per_iteration": 2.46510648727417 + }, + { + "auxiliary_loss_clip": 0.01189177, + "auxiliary_loss_mlp": 0.0106376, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.05380559, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.09054267836168, + "language_loss": 0.83423382, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85676318, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3515625, + "step": 1464, + "time_per_iteration": 2.5343735218048096 + }, + { + "auxiliary_loss_clip": 0.01183588, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.05336595, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 4.267071209901202, + "language_loss": 0.78351951, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80604541, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.296875, + "step": 1465, + "time_per_iteration": 2.4745209217071533 + }, + { + "auxiliary_loss_clip": 0.01190864, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_clip": 1.0371089, + "balance_loss_mlp": 1.05628061, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8950228405880263, + "language_loss": 0.84698099, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.86948144, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.34375, + "step": 1466, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.0105874, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.05407715, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 3.8136580791310783, + "language_loss": 0.84233636, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86477506, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1467, + "time_per_iteration": 2.5413413047790527 + }, + { + "auxiliary_loss_clip": 0.01183856, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7481416698073104, + "language_loss": 0.75517243, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7775712, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1468, + "time_per_iteration": 2.496363878250122 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.05570245, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.7579385887345491, + "language_loss": 0.80601043, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82842672, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2890625, + "step": 1469, + "time_per_iteration": 2.5486512184143066 + }, + { + "auxiliary_loss_clip": 0.01187777, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.0321182, + "balance_loss_mlp": 1.05454695, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 3.202810753535508, + "language_loss": 0.77607989, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7985025, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3359375, + "step": 1470, + "time_per_iteration": 2.6632297039031982 + }, + { + "auxiliary_loss_clip": 0.01182287, + "auxiliary_loss_mlp": 0.0106647, + "balance_loss_clip": 1.04266429, + "balance_loss_mlp": 1.05412459, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.774803600242038, + "language_loss": 0.84233272, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86482024, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.28125, + "step": 1471, + "time_per_iteration": 2.5040950775146484 + }, + { + "auxiliary_loss_clip": 0.01178062, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.05459309, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6761790638208889, + "language_loss": 0.83481324, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85712093, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.234375, + "step": 1472, + "time_per_iteration": 2.5079073905944824 + }, + { + "auxiliary_loss_clip": 0.01185739, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.05491877, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.396127276436556, + "language_loss": 0.828246, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85069156, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1473, + "time_per_iteration": 2.4919679164886475 + }, + { + "auxiliary_loss_clip": 0.01183368, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.04040098, + "balance_loss_mlp": 1.05414963, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.8346488607114506, + "language_loss": 0.78871369, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81116265, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1474, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01057333, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.05450511, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.918961213895669, + "language_loss": 0.79045832, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81284976, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1475, + "time_per_iteration": 2.495753765106201 + }, + { + "auxiliary_loss_clip": 0.01184034, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.05443335, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6510632676992876, + "language_loss": 0.73973525, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76205671, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1476, + "time_per_iteration": 6.925957679748535 + }, + { + "auxiliary_loss_clip": 0.0118493, + "auxiliary_loss_mlp": 0.01060562, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.05454326, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.527991814504802, + "language_loss": 0.74644423, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76889908, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3046875, + "step": 1477, + "time_per_iteration": 2.6033589839935303 + }, + { + "auxiliary_loss_clip": 0.01181345, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.02571976, + "balance_loss_mlp": 1.05315852, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.4237564416671002, + "language_loss": 0.86488914, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88718438, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1478, + "time_per_iteration": 2.5188398361206055 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.03599334, + "balance_loss_mlp": 1.05417609, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 7.715019285918926, + "language_loss": 0.77988106, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80228484, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.28125, + "step": 1479, + "time_per_iteration": 2.50730562210083 + }, + { + "auxiliary_loss_clip": 0.01180801, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 1.05275774, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.3628139464189815, + "language_loss": 0.78267598, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80501914, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1480, + "time_per_iteration": 2.512730360031128 + }, + { + "auxiliary_loss_clip": 0.01185027, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.05357075, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 3.1949876590170825, + "language_loss": 0.66627192, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68875289, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3125, + "step": 1481, + "time_per_iteration": 2.4874138832092285 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.03040504, + "balance_loss_mlp": 1.05519605, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.9560930463008703, + "language_loss": 0.9644348, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98677909, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.2734375, + "step": 1482, + "time_per_iteration": 2.484274387359619 + }, + { + "auxiliary_loss_clip": 0.01190541, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 1.0577234, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.358614174414972, + "language_loss": 0.78436875, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80683142, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.328125, + "step": 1483, + "time_per_iteration": 2.566199779510498 + }, + { + "auxiliary_loss_clip": 0.01183147, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.04891825, + "balance_loss_mlp": 1.05463076, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.232834813834399, + "language_loss": 0.86091626, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88347292, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1484, + "time_per_iteration": 2.4742467403411865 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.04152799, + "balance_loss_mlp": 1.0570302, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7135103732453094, + "language_loss": 0.80460989, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82716757, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.34375, + "step": 1485, + "time_per_iteration": 2.5808591842651367 + }, + { + "auxiliary_loss_clip": 0.01182644, + "auxiliary_loss_mlp": 0.01059654, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.05256486, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.0833446931013144, + "language_loss": 0.8295821, + "learning_rate": 3.96317299108688e-06, + "loss": 0.852005, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1486, + "time_per_iteration": 2.5060923099517822 + }, + { + "auxiliary_loss_clip": 0.01184012, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.05506349, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6673763915473876, + "language_loss": 0.76653707, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78897893, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1487, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.01181982, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.05203557, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.360836711926668, + "language_loss": 0.83246535, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85491836, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.296875, + "step": 1488, + "time_per_iteration": 2.48189377784729 + }, + { + "auxiliary_loss_clip": 0.01180173, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.05375743, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9508187836998312, + "language_loss": 0.71647823, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73879659, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.265625, + "step": 1489, + "time_per_iteration": 2.701035737991333 + }, + { + "auxiliary_loss_clip": 0.01178824, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.05088401, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8144641128553483, + "language_loss": 0.89490288, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91722786, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1490, + "time_per_iteration": 2.676098108291626 + }, + { + "auxiliary_loss_clip": 0.01187914, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_clip": 1.0416671, + "balance_loss_mlp": 1.05264366, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.165908760559946, + "language_loss": 0.73276365, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75528657, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3515625, + "step": 1491, + "time_per_iteration": 2.5531163215637207 + }, + { + "auxiliary_loss_clip": 0.01181575, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.02984166, + "balance_loss_mlp": 1.05362582, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6884120279290091, + "language_loss": 0.77121007, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79353207, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.28125, + "step": 1492, + "time_per_iteration": 2.485531806945801 + }, + { + "auxiliary_loss_clip": 0.01180742, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.03324914, + "balance_loss_mlp": 1.05471706, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.0059524225222414, + "language_loss": 0.71168351, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73404551, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2578125, + "step": 1493, + "time_per_iteration": 2.5819149017333984 + }, + { + "auxiliary_loss_clip": 0.01184961, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.03304577, + "balance_loss_mlp": 1.05477107, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.7443337417031568, + "language_loss": 0.86910093, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89151227, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1494, + "time_per_iteration": 2.491126775741577 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.05289626, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7176751495851263, + "language_loss": 0.83065581, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1495, + "time_per_iteration": 2.463747501373291 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.05825078, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.861203767183833, + "language_loss": 0.69813877, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72057784, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1496, + "time_per_iteration": 2.4409985542297363 + }, + { + "auxiliary_loss_clip": 0.01180533, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03035557, + "balance_loss_mlp": 1.05325341, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6091347390483586, + "language_loss": 0.79913563, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82145333, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2734375, + "step": 1497, + "time_per_iteration": 2.492732048034668 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.0105809, + "balance_loss_clip": 1.03484416, + "balance_loss_mlp": 1.05299318, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.3611651581227915, + "language_loss": 0.8262192, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84866548, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3359375, + "step": 1498, + "time_per_iteration": 2.492124080657959 + }, + { + "auxiliary_loss_clip": 0.01188542, + "auxiliary_loss_mlp": 0.01061597, + "balance_loss_clip": 1.0402112, + "balance_loss_mlp": 1.05628157, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.316244908481527, + "language_loss": 0.7849865, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80748791, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 1499, + "time_per_iteration": 2.455986738204956 + }, + { + "auxiliary_loss_clip": 0.0117942, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.05351877, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.652937184766999, + "language_loss": 0.93453979, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95688522, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1500, + "time_per_iteration": 2.481450080871582 + }, + { + "auxiliary_loss_clip": 0.01182931, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.05170345, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.9257189866461966, + "language_loss": 0.74465239, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76699102, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3125, + "step": 1501, + "time_per_iteration": 2.4806344509124756 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 0.99992049, + "balance_loss_mlp": 1.02834833, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7322723529864947, + "language_loss": 0.58304042, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60384637, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.48828125, + "step": 1502, + "time_per_iteration": 3.066755771636963 + }, + { + "auxiliary_loss_clip": 0.01178455, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.02655029, + "balance_loss_mlp": 1.05134845, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.407651446444188, + "language_loss": 0.69502187, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71728474, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2734375, + "step": 1503, + "time_per_iteration": 2.608006000518799 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.0508244, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.015182939383952, + "language_loss": 0.86142361, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88378185, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.3046875, + "step": 1504, + "time_per_iteration": 2.489906072616577 + }, + { + "auxiliary_loss_clip": 0.01188306, + "auxiliary_loss_mlp": 0.01064134, + "balance_loss_clip": 1.03871906, + "balance_loss_mlp": 1.05330658, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.9466916160800904, + "language_loss": 0.72267938, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74520379, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1505, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01179818, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.05332816, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3726339000283447, + "language_loss": 0.80946511, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83180916, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.265625, + "step": 1506, + "time_per_iteration": 2.4512932300567627 + }, + { + "auxiliary_loss_clip": 0.01179114, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.0531404, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1533698580433254, + "language_loss": 0.76043189, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78271914, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.2578125, + "step": 1507, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01009923, + "balance_loss_clip": 1.00679994, + "balance_loss_mlp": 1.01922798, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7312512202665958, + "language_loss": 0.57670546, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59747648, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.03112793, + "router_z_loss_mlp": 0.48046875, + "step": 1508, + "time_per_iteration": 2.9330992698669434 + }, + { + "auxiliary_loss_clip": 0.01182207, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.0313319, + "balance_loss_mlp": 1.05309892, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.072562238387217, + "language_loss": 0.85046542, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87281442, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1509, + "time_per_iteration": 2.475606918334961 + }, + { + "auxiliary_loss_clip": 0.01189974, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.05606115, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.413703760690829, + "language_loss": 0.84302551, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3359375, + "step": 1510, + "time_per_iteration": 2.576070785522461 + }, + { + "auxiliary_loss_clip": 0.01184002, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.0335387, + "balance_loss_mlp": 1.05408144, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9204492801986277, + "language_loss": 0.85558611, + "learning_rate": 3.961289878108262e-06, + "loss": 0.8779816, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.296875, + "step": 1511, + "time_per_iteration": 2.5085484981536865 + }, + { + "auxiliary_loss_clip": 0.01181957, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.02690685, + "balance_loss_mlp": 1.05469918, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5775523407684693, + "language_loss": 0.84897017, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87127548, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2734375, + "step": 1512, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01175178, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.02888274, + "balance_loss_mlp": 1.05033123, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.9006324958480167, + "language_loss": 0.86704344, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88929009, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.25, + "step": 1513, + "time_per_iteration": 2.475271701812744 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.0536902, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6716164971548293, + "language_loss": 0.86379707, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8861233, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.2734375, + "step": 1514, + "time_per_iteration": 2.5317347049713135 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.05550981, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.9279276264910965, + "language_loss": 0.89882755, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92124808, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.265625, + "step": 1515, + "time_per_iteration": 2.5507757663726807 + }, + { + "auxiliary_loss_clip": 0.011822, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.03174293, + "balance_loss_mlp": 1.05321527, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0145121179505905, + "language_loss": 0.85567206, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87803847, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1516, + "time_per_iteration": 2.524787425994873 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.05217946, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5232376391767188, + "language_loss": 0.81104374, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83340514, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.3125, + "step": 1517, + "time_per_iteration": 2.5781173706054688 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.01068952, + "balance_loss_clip": 1.04729199, + "balance_loss_mlp": 1.05378699, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6026665805728266, + "language_loss": 0.78008473, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80262554, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3125, + "step": 1518, + "time_per_iteration": 4.000938653945923 + }, + { + "auxiliary_loss_clip": 0.01179619, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.03851235, + "balance_loss_mlp": 1.05189955, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.883609624415087, + "language_loss": 0.86375809, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88615477, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.28125, + "step": 1519, + "time_per_iteration": 3.945183277130127 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.01053198, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.05196333, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.4149150298084425, + "language_loss": 0.73425877, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75659597, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.28125, + "step": 1520, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01181506, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.0525614, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6960755220153825, + "language_loss": 0.85296613, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87533194, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2890625, + "step": 1521, + "time_per_iteration": 2.478440761566162 + }, + { + "auxiliary_loss_clip": 0.01183058, + "auxiliary_loss_mlp": 0.01057495, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.05319118, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.1543470058122876, + "language_loss": 0.83979875, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86220425, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.296875, + "step": 1522, + "time_per_iteration": 2.4761834144592285 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.03500533, + "balance_loss_mlp": 1.05125594, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.174137545904809, + "language_loss": 0.810691, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83301324, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.234375, + "step": 1523, + "time_per_iteration": 2.525385618209839 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.05365944, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.529065997296093, + "language_loss": 0.74591744, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76838291, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.296875, + "step": 1524, + "time_per_iteration": 2.4293112754821777 + }, + { + "auxiliary_loss_clip": 0.01181121, + "auxiliary_loss_mlp": 0.01060116, + "balance_loss_clip": 1.03822935, + "balance_loss_mlp": 1.05373263, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0870290485059586, + "language_loss": 0.861516, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88392842, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1525, + "time_per_iteration": 2.449774980545044 + }, + { + "auxiliary_loss_clip": 0.01181752, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.02577078, + "balance_loss_mlp": 1.05424511, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.3600448138049597, + "language_loss": 0.74690467, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76919985, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1526, + "time_per_iteration": 2.5295088291168213 + }, + { + "auxiliary_loss_clip": 0.01177679, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03058743, + "balance_loss_mlp": 1.05291057, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.970734062299861, + "language_loss": 0.7736311, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79592943, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1527, + "time_per_iteration": 2.465484142303467 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.05090261, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.9755082573034908, + "language_loss": 0.78465801, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80698651, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1528, + "time_per_iteration": 2.5257718563079834 + }, + { + "auxiliary_loss_clip": 0.01177926, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.0276351, + "balance_loss_mlp": 1.05085492, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.6736868569465813, + "language_loss": 0.76880527, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79107177, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2734375, + "step": 1529, + "time_per_iteration": 2.4417288303375244 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.0306139, + "balance_loss_mlp": 1.05037212, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.767002219307874, + "language_loss": 0.83118784, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85352623, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.296875, + "step": 1530, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01173477, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.03723454, + "balance_loss_mlp": 1.05024123, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.058190265763826, + "language_loss": 0.8408612, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86318833, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1531, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.01177383, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.02728868, + "balance_loss_mlp": 1.05083799, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.8347699676368683, + "language_loss": 0.81135088, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83361435, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1532, + "time_per_iteration": 2.506875991821289 + }, + { + "auxiliary_loss_clip": 0.01179012, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.03044105, + "balance_loss_mlp": 1.05169332, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.8650949584676202, + "language_loss": 0.83489287, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85721242, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2734375, + "step": 1533, + "time_per_iteration": 2.5279369354248047 + }, + { + "auxiliary_loss_clip": 0.01181754, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.02662432, + "balance_loss_mlp": 1.05468941, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.8226281566677605, + "language_loss": 0.89789164, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92019475, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1534, + "time_per_iteration": 2.498732089996338 + }, + { + "auxiliary_loss_clip": 0.01178154, + "auxiliary_loss_mlp": 0.01064045, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.04994035, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.6410414613778777, + "language_loss": 0.75911283, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78153479, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.28125, + "step": 1535, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.04907823, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8388387816947327, + "language_loss": 0.81344318, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83558822, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1536, + "time_per_iteration": 2.5075631141662598 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.04995418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.109198419692537, + "language_loss": 0.8921392, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91439736, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1537, + "time_per_iteration": 2.4454562664031982 + }, + { + "auxiliary_loss_clip": 0.01177438, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.03638315, + "balance_loss_mlp": 1.05164456, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.1959440535625285, + "language_loss": 0.8072964, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82966185, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2578125, + "step": 1538, + "time_per_iteration": 2.50838303565979 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.0101212, + "balance_loss_clip": 1.0091517, + "balance_loss_mlp": 1.01794529, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.74443800558722, + "language_loss": 0.57375526, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59453678, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.48046875, + "step": 1539, + "time_per_iteration": 3.16038179397583 + }, + { + "auxiliary_loss_clip": 0.01179737, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.05291581, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.903908071477431, + "language_loss": 0.67164814, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69395947, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.265625, + "step": 1540, + "time_per_iteration": 2.488809585571289 + }, + { + "auxiliary_loss_clip": 0.01178592, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02154827, + "balance_loss_mlp": 1.05285096, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.797248436862791, + "language_loss": 0.83666921, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85888791, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1541, + "time_per_iteration": 2.5406758785247803 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01009011, + "balance_loss_clip": 1.0061146, + "balance_loss_mlp": 1.01339245, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8904869203130611, + "language_loss": 0.6196329, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64032996, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.47265625, + "step": 1542, + "time_per_iteration": 3.0973262786865234 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03486192, + "balance_loss_mlp": 1.05283189, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.711071573157868, + "language_loss": 0.82672381, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84905624, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.25, + "step": 1543, + "time_per_iteration": 2.489415168762207 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.03462195, + "balance_loss_mlp": 1.05128777, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6169278883375504, + "language_loss": 0.72058821, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74287981, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1544, + "time_per_iteration": 2.7986748218536377 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.05111873, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7012123784712243, + "language_loss": 0.77617419, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79842126, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1545, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01050414, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0525856, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.736353511607615, + "language_loss": 0.74531418, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76755565, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1546, + "time_per_iteration": 2.456806182861328 + }, + { + "auxiliary_loss_clip": 0.01180806, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.02968979, + "balance_loss_mlp": 1.05292201, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.1086065935537284, + "language_loss": 0.84392273, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86624783, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1547, + "time_per_iteration": 2.5041439533233643 + }, + { + "auxiliary_loss_clip": 0.01177454, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.05125856, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 7.120670718523448, + "language_loss": 0.67616034, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6984657, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1548, + "time_per_iteration": 2.513141393661499 + }, + { + "auxiliary_loss_clip": 0.01178735, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.05175209, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.761700755369037, + "language_loss": 0.83445251, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85676992, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.265625, + "step": 1549, + "time_per_iteration": 2.4822285175323486 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.05560291, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7822943519837542, + "language_loss": 0.75744081, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77969635, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2578125, + "step": 1550, + "time_per_iteration": 2.5503265857696533 + }, + { + "auxiliary_loss_clip": 0.01179426, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.04579496, + "balance_loss_mlp": 1.05118561, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.0184762942100876, + "language_loss": 0.83272278, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85520893, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.28125, + "step": 1551, + "time_per_iteration": 2.4962081909179688 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_clip": 1.0051949, + "balance_loss_mlp": 1.01350796, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7800746873014213, + "language_loss": 0.6182366, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6389209, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.47070312, + "step": 1552, + "time_per_iteration": 3.2178378105163574 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01003668, + "balance_loss_clip": 1.00099754, + "balance_loss_mlp": 1.01257896, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8330449834122059, + "language_loss": 0.5895977, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61022902, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.46875, + "step": 1553, + "time_per_iteration": 3.220923900604248 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.05040002, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.0753391269624797, + "language_loss": 0.87452686, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89689714, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.28125, + "step": 1554, + "time_per_iteration": 2.5448763370513916 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.0310595, + "balance_loss_mlp": 1.05265594, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.2438919833216913, + "language_loss": 0.81355709, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83583468, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1555, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.00581956, + "balance_loss_mlp": 1.01259685, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8701907042199977, + "language_loss": 0.59583747, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61651003, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4609375, + "step": 1556, + "time_per_iteration": 3.0923824310302734 + }, + { + "auxiliary_loss_clip": 0.01177126, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.03518105, + "balance_loss_mlp": 1.05278862, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5891177576034032, + "language_loss": 0.84455961, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86689359, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1557, + "time_per_iteration": 2.5973968505859375 + }, + { + "auxiliary_loss_clip": 0.01175988, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03799307, + "balance_loss_mlp": 1.05065048, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.787574567308206, + "language_loss": 0.77987397, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80224895, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.25, + "step": 1558, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01178258, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.05035424, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0310113035260873, + "language_loss": 0.7998119, + "learning_rate": 3.957544040455379e-06, + "loss": 0.822142, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1559, + "time_per_iteration": 5.3233802318573 + }, + { + "auxiliary_loss_clip": 0.01172855, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.05015147, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.9877315441152976, + "language_loss": 0.76720232, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78956437, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1560, + "time_per_iteration": 3.863935947418213 + }, + { + "auxiliary_loss_clip": 0.01180546, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.05101645, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6628394684514, + "language_loss": 0.81219828, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83460152, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1561, + "time_per_iteration": 2.5050160884857178 + }, + { + "auxiliary_loss_clip": 0.01175131, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.02641547, + "balance_loss_mlp": 1.04764926, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.307547697406205, + "language_loss": 0.61553764, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63777232, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1562, + "time_per_iteration": 2.5884838104248047 + }, + { + "auxiliary_loss_clip": 0.01177686, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.0552876, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.5948914783661468, + "language_loss": 0.84981585, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87219155, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1563, + "time_per_iteration": 2.427928924560547 + }, + { + "auxiliary_loss_clip": 0.01172512, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.02767134, + "balance_loss_mlp": 1.05013323, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.8141046481233785, + "language_loss": 0.76106739, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78327298, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.21875, + "step": 1564, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01055133, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.05290008, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0134268414891388, + "language_loss": 0.7971766, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81950086, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.25, + "step": 1565, + "time_per_iteration": 2.470870018005371 + }, + { + "auxiliary_loss_clip": 0.01175133, + "auxiliary_loss_mlp": 0.01069432, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.0497129, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8353632925340597, + "language_loss": 0.75241816, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77486378, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1566, + "time_per_iteration": 2.4962053298950195 + }, + { + "auxiliary_loss_clip": 0.0117411, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.03746092, + "balance_loss_mlp": 1.04822683, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.55149440594841, + "language_loss": 0.77724433, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79957557, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1567, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01178494, + "auxiliary_loss_mlp": 0.01054706, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.05183101, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.293964487000622, + "language_loss": 0.82571244, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8480444, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.265625, + "step": 1568, + "time_per_iteration": 2.5221774578094482 + }, + { + "auxiliary_loss_clip": 0.01179838, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.02527881, + "balance_loss_mlp": 1.05191278, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 4.3822924949764515, + "language_loss": 0.7658236, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78810549, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.28125, + "step": 1569, + "time_per_iteration": 2.464019775390625 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.03004718, + "balance_loss_mlp": 1.04984534, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.11394347406088, + "language_loss": 0.86315012, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88538271, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1570, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.01177967, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.05340183, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6480791038221163, + "language_loss": 0.76531005, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78758156, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1571, + "time_per_iteration": 2.5270462036132812 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.02848995, + "balance_loss_mlp": 1.0496099, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.780883866775424, + "language_loss": 0.79518712, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81737661, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1572, + "time_per_iteration": 2.477403163909912 + }, + { + "auxiliary_loss_clip": 0.01172702, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03288805, + "balance_loss_mlp": 1.05036175, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8458711299535766, + "language_loss": 0.87948155, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90174723, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1573, + "time_per_iteration": 2.5164122581481934 + }, + { + "auxiliary_loss_clip": 0.01177194, + "auxiliary_loss_mlp": 0.01059795, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.05045378, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.770313323609274, + "language_loss": 0.81827116, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84064102, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.265625, + "step": 1574, + "time_per_iteration": 2.5540831089019775 + }, + { + "auxiliary_loss_clip": 0.01178056, + "auxiliary_loss_mlp": 0.01052269, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.05359375, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.139236970889498, + "language_loss": 0.80922085, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83152413, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1575, + "time_per_iteration": 2.4874608516693115 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01063693, + "balance_loss_clip": 1.04184198, + "balance_loss_mlp": 1.05048943, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.1107661515601, + "language_loss": 0.86745369, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88981628, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1576, + "time_per_iteration": 2.514961004257202 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.01272786, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9836929902555142, + "language_loss": 0.65832257, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67916429, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.45703125, + "step": 1577, + "time_per_iteration": 3.042998790740967 + }, + { + "auxiliary_loss_clip": 0.01175806, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.02504635, + "balance_loss_mlp": 1.05083144, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 3.158821122445177, + "language_loss": 0.79113019, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81334484, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1578, + "time_per_iteration": 2.492605447769165 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.04935408, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6941125689582233, + "language_loss": 0.77994359, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80223954, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1579, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01176838, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.0317533, + "balance_loss_mlp": 1.05228639, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.3012950697800747, + "language_loss": 0.73576474, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75807726, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2421875, + "step": 1580, + "time_per_iteration": 2.500426769256592 + }, + { + "auxiliary_loss_clip": 0.01171524, + "auxiliary_loss_mlp": 0.01053034, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 1.05162525, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.820694860574998, + "language_loss": 0.77813822, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80038381, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1581, + "time_per_iteration": 2.569086790084839 + }, + { + "auxiliary_loss_clip": 0.01177083, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.05315304, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.1718701740895443, + "language_loss": 0.86914808, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89150703, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.234375, + "step": 1582, + "time_per_iteration": 2.476386785507202 + }, + { + "auxiliary_loss_clip": 0.01178411, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.05487967, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.7496793522695477, + "language_loss": 0.66838771, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6907438, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.234375, + "step": 1583, + "time_per_iteration": 2.4433302879333496 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01052141, + "balance_loss_clip": 1.02919281, + "balance_loss_mlp": 1.0555923, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8272679383640855, + "language_loss": 0.70314872, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7254771, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.25, + "step": 1584, + "time_per_iteration": 2.5352938175201416 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.0527246, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5429491827095454, + "language_loss": 0.80649364, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82881135, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2421875, + "step": 1585, + "time_per_iteration": 2.5006139278411865 + }, + { + "auxiliary_loss_clip": 0.01179471, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.05324364, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.5763794615860258, + "language_loss": 0.7156626, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73802292, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.265625, + "step": 1586, + "time_per_iteration": 2.510941982269287 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.010121, + "balance_loss_clip": 1.00946522, + "balance_loss_mlp": 1.01272035, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8128409972345002, + "language_loss": 0.55392706, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57462931, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.453125, + "step": 1587, + "time_per_iteration": 2.8747992515563965 + }, + { + "auxiliary_loss_clip": 0.0118109, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.0345006, + "balance_loss_mlp": 1.0550952, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.813611272618652, + "language_loss": 0.81023234, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83260405, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1588, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.01178114, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.05471849, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 2.1843830695972835, + "language_loss": 0.81552076, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83785045, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1589, + "time_per_iteration": 2.4995651245117188 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.05340207, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.4497838373443381, + "language_loss": 0.65005404, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1590, + "time_per_iteration": 2.7222375869750977 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01051921, + "balance_loss_clip": 1.03036785, + "balance_loss_mlp": 1.05178595, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.714913693600035, + "language_loss": 0.83272862, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85498345, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1591, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.05280709, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.268244689889179, + "language_loss": 0.74068749, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76304293, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.265625, + "step": 1592, + "time_per_iteration": 2.446272373199463 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.0282129, + "balance_loss_mlp": 1.05028248, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.9287746031752921, + "language_loss": 0.74135411, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1593, + "time_per_iteration": 2.518038749694824 + }, + { + "auxiliary_loss_clip": 0.01175652, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.03061128, + "balance_loss_mlp": 1.05365515, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8251705146793997, + "language_loss": 0.69907188, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72134066, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.21875, + "step": 1594, + "time_per_iteration": 2.5454983711242676 + }, + { + "auxiliary_loss_clip": 0.01174594, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.03062999, + "balance_loss_mlp": 1.05023921, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.596137828422853, + "language_loss": 0.82464099, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84689802, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1595, + "time_per_iteration": 2.472062826156616 + }, + { + "auxiliary_loss_clip": 0.01176658, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.02803886, + "balance_loss_mlp": 1.05217803, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.0311987750358953, + "language_loss": 0.84673214, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86900425, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2421875, + "step": 1596, + "time_per_iteration": 2.4801599979400635 + }, + { + "auxiliary_loss_clip": 0.01181467, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.02871156, + "balance_loss_mlp": 1.05628884, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.531539932785817, + "language_loss": 0.68993127, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71225667, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1597, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01175632, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.02370429, + "balance_loss_mlp": 1.04902959, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.18946094151333, + "language_loss": 0.74929029, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77149749, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1598, + "time_per_iteration": 2.474071502685547 + }, + { + "auxiliary_loss_clip": 0.01179079, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.05284083, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6350676424235815, + "language_loss": 0.69002283, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7122978, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1599, + "time_per_iteration": 2.5599992275238037 + }, + { + "auxiliary_loss_clip": 0.01174972, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.05169392, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.013538613147854, + "language_loss": 0.840271, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8625865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1600, + "time_per_iteration": 2.4882116317749023 + }, + { + "auxiliary_loss_clip": 0.01174537, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.03162694, + "balance_loss_mlp": 1.05098653, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.038904015519863, + "language_loss": 0.8034178, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82569081, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.234375, + "step": 1601, + "time_per_iteration": 5.328745365142822 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.03031266, + "balance_loss_mlp": 1.05090928, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.183236390866488, + "language_loss": 0.82405198, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84635913, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.2734375, + "step": 1602, + "time_per_iteration": 2.4609556198120117 + }, + { + "auxiliary_loss_clip": 0.01172805, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05170703, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.882331764966583, + "language_loss": 0.62527591, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64752185, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1603, + "time_per_iteration": 2.4974379539489746 + }, + { + "auxiliary_loss_clip": 0.01178105, + "auxiliary_loss_mlp": 0.01054363, + "balance_loss_clip": 1.03049707, + "balance_loss_mlp": 1.05224609, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.347327571135852, + "language_loss": 0.71259016, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73491484, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2578125, + "step": 1604, + "time_per_iteration": 2.5012693405151367 + }, + { + "auxiliary_loss_clip": 0.01172586, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.0368669, + "balance_loss_mlp": 1.05051208, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.129697971326249, + "language_loss": 0.79487669, + "learning_rate": 3.953793790294527e-06, + "loss": 0.8171708, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.21875, + "step": 1605, + "time_per_iteration": 2.5392873287200928 + }, + { + "auxiliary_loss_clip": 0.01176232, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.04916394, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 3.698123586343809, + "language_loss": 0.74810207, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77030694, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2734375, + "step": 1606, + "time_per_iteration": 2.4922726154327393 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02391386, + "balance_loss_mlp": 1.05243278, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.649703340967918, + "language_loss": 0.75382137, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77603066, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.234375, + "step": 1607, + "time_per_iteration": 2.4787087440490723 + }, + { + "auxiliary_loss_clip": 0.0117289, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.02206647, + "balance_loss_mlp": 1.04831934, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.262571531890369, + "language_loss": 0.86648059, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88863426, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.25, + "step": 1608, + "time_per_iteration": 2.435391664505005 + }, + { + "auxiliary_loss_clip": 0.01183391, + "auxiliary_loss_mlp": 0.01056654, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.05276418, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.2277980990408297, + "language_loss": 0.70968121, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73208165, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.3046875, + "step": 1609, + "time_per_iteration": 2.599719762802124 + }, + { + "auxiliary_loss_clip": 0.01173214, + "auxiliary_loss_mlp": 0.01054271, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.04860282, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.7787270736621674, + "language_loss": 0.84566712, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86794198, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1610, + "time_per_iteration": 2.446676254272461 + }, + { + "auxiliary_loss_clip": 0.01177531, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.02781224, + "balance_loss_mlp": 1.05382621, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.0483419743874682, + "language_loss": 0.67360532, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69587982, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1611, + "time_per_iteration": 2.520211696624756 + }, + { + "auxiliary_loss_clip": 0.01177545, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.03000879, + "balance_loss_mlp": 1.05313492, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6625909003061596, + "language_loss": 0.81166416, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83394641, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2421875, + "step": 1612, + "time_per_iteration": 2.449491262435913 + }, + { + "auxiliary_loss_clip": 0.01180036, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.05431938, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.509420249413084, + "language_loss": 0.80708754, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82950538, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1613, + "time_per_iteration": 2.4753763675689697 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.03019738, + "balance_loss_mlp": 1.05074048, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.0025313344872484, + "language_loss": 0.84173608, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86399966, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2265625, + "step": 1614, + "time_per_iteration": 2.5492141246795654 + }, + { + "auxiliary_loss_clip": 0.01065917, + "auxiliary_loss_mlp": 0.010187, + "balance_loss_clip": 1.01610088, + "balance_loss_mlp": 1.019063, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7078098108364695, + "language_loss": 0.54584575, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56669194, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.46875, + "step": 1615, + "time_per_iteration": 3.1041057109832764 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00491357, + "balance_loss_mlp": 1.01844954, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637649269659756, + "language_loss": 0.5822649, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60299873, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.47460938, + "step": 1616, + "time_per_iteration": 3.215376377105713 + }, + { + "auxiliary_loss_clip": 0.01178513, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.05275226, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.690325520565165, + "language_loss": 0.69293094, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71527421, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2578125, + "step": 1617, + "time_per_iteration": 2.458017587661743 + }, + { + "auxiliary_loss_clip": 0.01176727, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.03116739, + "balance_loss_mlp": 1.05130577, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.7927692696889819, + "language_loss": 0.80748308, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8298068, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.25, + "step": 1618, + "time_per_iteration": 2.5471577644348145 + }, + { + "auxiliary_loss_clip": 0.01169902, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03077149, + "balance_loss_mlp": 1.04996848, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.5831304278494804, + "language_loss": 0.9288674, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9510712, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1953125, + "step": 1619, + "time_per_iteration": 2.515282392501831 + }, + { + "auxiliary_loss_clip": 0.01171299, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.03150594, + "balance_loss_mlp": 1.05216622, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.7974961209450113, + "language_loss": 0.88785303, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.910092, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1620, + "time_per_iteration": 2.556744337081909 + }, + { + "auxiliary_loss_clip": 0.01175309, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.05045033, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.90931759761679, + "language_loss": 0.77130795, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79362905, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.25, + "step": 1621, + "time_per_iteration": 2.491441011428833 + }, + { + "auxiliary_loss_clip": 0.01171563, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.04859447, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9170880538391684, + "language_loss": 0.77856946, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80084509, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2265625, + "step": 1622, + "time_per_iteration": 2.4379701614379883 + }, + { + "auxiliary_loss_clip": 0.01177415, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.05105746, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.9420709042223125, + "language_loss": 0.85783195, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88017344, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1623, + "time_per_iteration": 2.51741099357605 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.03498316, + "balance_loss_mlp": 1.05181813, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2833168401589656, + "language_loss": 0.80328369, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8255735, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1624, + "time_per_iteration": 2.4646618366241455 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.02450192, + "balance_loss_mlp": 1.04799926, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.7577002662180954, + "language_loss": 0.8575626, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87971735, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.234375, + "step": 1625, + "time_per_iteration": 2.452615976333618 + }, + { + "auxiliary_loss_clip": 0.01176501, + "auxiliary_loss_mlp": 0.01055325, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.05226421, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 3.258883448957912, + "language_loss": 0.8539601, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87627834, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1626, + "time_per_iteration": 2.4931013584136963 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.05541551, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.979888643217431, + "language_loss": 0.83329904, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85568601, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2578125, + "step": 1627, + "time_per_iteration": 2.5056917667388916 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.03065729, + "balance_loss_mlp": 1.0488416, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7873285490487296, + "language_loss": 0.84291327, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.2109375, + "step": 1628, + "time_per_iteration": 2.4835076332092285 + }, + { + "auxiliary_loss_clip": 0.01169153, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.04880238, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.6092149858605884, + "language_loss": 0.75609362, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77831334, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1629, + "time_per_iteration": 2.4959983825683594 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.02412319, + "balance_loss_mlp": 1.0530107, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.5982247062153871, + "language_loss": 0.78224194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1630, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.01177321, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0292381, + "balance_loss_mlp": 1.05457997, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.9134334701620013, + "language_loss": 0.86704385, + "learning_rate": 3.951604717916228e-06, + "loss": 0.8893311, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1631, + "time_per_iteration": 2.443878173828125 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01050929, + "balance_loss_clip": 1.03065109, + "balance_loss_mlp": 1.05258322, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.096430969489036, + "language_loss": 0.83111286, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85334921, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1632, + "time_per_iteration": 2.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01174956, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0368464, + "balance_loss_mlp": 1.05281615, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.5107232822128822, + "language_loss": 0.7877655, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81008065, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.21875, + "step": 1633, + "time_per_iteration": 2.447930097579956 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.02833819, + "balance_loss_mlp": 1.04989707, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.0663591821232865, + "language_loss": 0.73159611, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75378191, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1634, + "time_per_iteration": 2.460265636444092 + }, + { + "auxiliary_loss_clip": 0.01179893, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.0516957, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7516342600991868, + "language_loss": 0.72714394, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74957043, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1635, + "time_per_iteration": 2.4835710525512695 + }, + { + "auxiliary_loss_clip": 0.01177592, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.05253148, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8458745824258636, + "language_loss": 0.7819975, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80432636, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.25, + "step": 1636, + "time_per_iteration": 2.53061842918396 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.05113721, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.2976115041381386, + "language_loss": 0.70005965, + "learning_rate": 3.951092440828715e-06, + "loss": 0.722363, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1637, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.03175139, + "balance_loss_mlp": 1.05108416, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.115587702667026, + "language_loss": 0.77395654, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79622668, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2265625, + "step": 1638, + "time_per_iteration": 2.4725139141082764 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.02524579, + "balance_loss_mlp": 1.05077171, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4162008179950134, + "language_loss": 0.7263118, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74847507, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1953125, + "step": 1639, + "time_per_iteration": 2.5534512996673584 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.01943696, + "balance_loss_mlp": 1.05003214, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8280373897837945, + "language_loss": 0.88669002, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90882927, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1640, + "time_per_iteration": 2.4868786334991455 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01685774, + "balance_loss_mlp": 1.05164635, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.1859335509376527, + "language_loss": 0.8086108, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83072555, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1641, + "time_per_iteration": 2.5081584453582764 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03130805, + "balance_loss_mlp": 1.05067503, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.4983515693134417, + "language_loss": 0.85826755, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88054669, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1642, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01177694, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.05365527, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7421144196917664, + "language_loss": 0.80859929, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83091342, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1643, + "time_per_iteration": 3.9550716876983643 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01063125, + "balance_loss_clip": 1.04138088, + "balance_loss_mlp": 1.0494256, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9624417465121429, + "language_loss": 0.8262763, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84861231, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1644, + "time_per_iteration": 3.8253817558288574 + }, + { + "auxiliary_loss_clip": 0.01169448, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.02733469, + "balance_loss_mlp": 1.05048347, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7099323885745632, + "language_loss": 0.6819675, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70414758, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1645, + "time_per_iteration": 2.4549567699432373 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.0206517, + "balance_loss_mlp": 1.01924491, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9514884974425206, + "language_loss": 0.60854232, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62943053, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.46289062, + "step": 1646, + "time_per_iteration": 2.9953765869140625 + }, + { + "auxiliary_loss_clip": 0.01170253, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04880357, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.5496486678231425, + "language_loss": 0.73046064, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75266314, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2109375, + "step": 1647, + "time_per_iteration": 2.5241641998291016 + }, + { + "auxiliary_loss_clip": 0.01171762, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.04955053, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8237647662791463, + "language_loss": 0.84120429, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86348635, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.21875, + "step": 1648, + "time_per_iteration": 2.467717170715332 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01009658, + "balance_loss_clip": 1.00701165, + "balance_loss_mlp": 1.0159142, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7437092318732932, + "language_loss": 0.55674303, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57745123, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.453125, + "step": 1649, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01165781, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.02598572, + "balance_loss_mlp": 1.04597533, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.284847215884091, + "language_loss": 0.89930248, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92142689, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1650, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00105858, + "balance_loss_mlp": 1.01395106, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8031298543824162, + "language_loss": 0.63733649, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65795547, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.44140625, + "step": 1651, + "time_per_iteration": 3.217806100845337 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03318655, + "balance_loss_mlp": 1.04885435, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9462006377707899, + "language_loss": 0.88288587, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90512443, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1652, + "time_per_iteration": 2.5014448165893555 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01057611, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.05190849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9500387106757973, + "language_loss": 0.82206833, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84438825, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2265625, + "step": 1653, + "time_per_iteration": 2.4881839752197266 + }, + { + "auxiliary_loss_clip": 0.01172582, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.04984093, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0314065071494136, + "language_loss": 0.79399735, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81626815, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2265625, + "step": 1654, + "time_per_iteration": 2.5269205570220947 + }, + { + "auxiliary_loss_clip": 0.01167439, + "auxiliary_loss_mlp": 0.01055854, + "balance_loss_clip": 1.03700721, + "balance_loss_mlp": 1.05072093, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5637423809135174, + "language_loss": 0.8088094, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83104229, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.171875, + "step": 1655, + "time_per_iteration": 2.4652602672576904 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.04891443, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9082198159511756, + "language_loss": 0.80947387, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83170521, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1656, + "time_per_iteration": 2.4966416358947754 + }, + { + "auxiliary_loss_clip": 0.01170477, + "auxiliary_loss_mlp": 0.0106116, + "balance_loss_clip": 1.04066813, + "balance_loss_mlp": 1.05147541, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6268850155063674, + "language_loss": 0.88850212, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91081852, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1875, + "step": 1657, + "time_per_iteration": 2.446124792098999 + }, + { + "auxiliary_loss_clip": 0.01175951, + "auxiliary_loss_mlp": 0.01063236, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.05091214, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0057694643168302, + "language_loss": 0.84758937, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86998123, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1658, + "time_per_iteration": 2.457902669906616 + }, + { + "auxiliary_loss_clip": 0.01054631, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_clip": 1.07460773, + "balance_loss_mlp": 1.0110395, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9153195332104517, + "language_loss": 0.60843968, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62975848, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1659, + "time_per_iteration": 3.077805519104004 + }, + { + "auxiliary_loss_clip": 0.01170517, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.04999721, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8691655756599186, + "language_loss": 0.85116851, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87340325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2109375, + "step": 1660, + "time_per_iteration": 2.49082612991333 + }, + { + "auxiliary_loss_clip": 0.01171003, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.03393948, + "balance_loss_mlp": 1.05291247, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.130922035700174, + "language_loss": 0.80037123, + "learning_rate": 3.949016704705836e-06, + "loss": 0.8226431, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1796875, + "step": 1661, + "time_per_iteration": 2.4412636756896973 + }, + { + "auxiliary_loss_clip": 0.01175671, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.05002224, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.8939661728963775, + "language_loss": 0.83592767, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85818553, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2578125, + "step": 1662, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01171098, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.02972281, + "balance_loss_mlp": 1.05104828, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1063962968477, + "language_loss": 0.88696563, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90920055, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1663, + "time_per_iteration": 2.42790150642395 + }, + { + "auxiliary_loss_clip": 0.01174901, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.05225635, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6888490247303796, + "language_loss": 0.7034179, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72569644, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1664, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01173831, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.02903676, + "balance_loss_mlp": 1.0535655, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.1773983349048804, + "language_loss": 0.7878316, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81007671, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1665, + "time_per_iteration": 2.4271252155303955 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.01061559, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.05681181, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.663243771388797, + "language_loss": 0.70152062, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72392094, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.21875, + "step": 1666, + "time_per_iteration": 2.499131202697754 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03777063, + "balance_loss_mlp": 1.0506525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8121915129470096, + "language_loss": 0.791031, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8133781, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.234375, + "step": 1667, + "time_per_iteration": 2.4429264068603516 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.02810836, + "balance_loss_mlp": 1.05261493, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.9507555712476945, + "language_loss": 0.7715596, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79379785, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.203125, + "step": 1668, + "time_per_iteration": 2.5223031044006348 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.05256963, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.9809152554972944, + "language_loss": 0.77852714, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80083561, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2109375, + "step": 1669, + "time_per_iteration": 2.5082881450653076 + }, + { + "auxiliary_loss_clip": 0.01181618, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.03641593, + "balance_loss_mlp": 1.05464602, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.145889566444559, + "language_loss": 0.85461181, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87702769, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.2734375, + "step": 1670, + "time_per_iteration": 2.5235135555267334 + }, + { + "auxiliary_loss_clip": 0.01166248, + "auxiliary_loss_mlp": 0.01057789, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0501771, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5986093935623644, + "language_loss": 0.76899171, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79123211, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1671, + "time_per_iteration": 2.505441665649414 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.01598763, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7900846916321359, + "language_loss": 0.60719293, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62802076, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.43945312, + "step": 1672, + "time_per_iteration": 3.07255482673645 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.05045998, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.0407855091156377, + "language_loss": 0.77119517, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.234375, + "step": 1673, + "time_per_iteration": 2.4693222045898438 + }, + { + "auxiliary_loss_clip": 0.01171478, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.03066778, + "balance_loss_mlp": 1.04964709, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.2570599367002835, + "language_loss": 0.72829556, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75053144, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1674, + "time_per_iteration": 2.4534130096435547 + }, + { + "auxiliary_loss_clip": 0.01170516, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03047729, + "balance_loss_mlp": 1.04903197, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.043409325490185, + "language_loss": 0.79386973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81608635, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1675, + "time_per_iteration": 2.496504545211792 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_clip": 1.04449606, + "balance_loss_mlp": 1.04908013, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.0305638084579294, + "language_loss": 0.81565315, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1676, + "time_per_iteration": 2.5022919178009033 + }, + { + "auxiliary_loss_clip": 0.01174395, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.03713369, + "balance_loss_mlp": 1.05283856, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.134524944411931, + "language_loss": 0.86155027, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88388026, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2109375, + "step": 1677, + "time_per_iteration": 2.44887113571167 + }, + { + "auxiliary_loss_clip": 0.01171962, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.03452563, + "balance_loss_mlp": 1.05113602, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 5.349815535910457, + "language_loss": 0.86318195, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88545489, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2109375, + "step": 1678, + "time_per_iteration": 2.4373903274536133 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.03359675, + "balance_loss_mlp": 1.05214512, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.6897314721028867, + "language_loss": 0.89726269, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91953766, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1679, + "time_per_iteration": 2.493959903717041 + }, + { + "auxiliary_loss_clip": 0.01056795, + "auxiliary_loss_mlp": 0.01017317, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01327634, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7831657514235874, + "language_loss": 0.53018153, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55092263, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1680, + "time_per_iteration": 3.15899658203125 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.04983318, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.657625192327098, + "language_loss": 0.76889706, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79113436, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1681, + "time_per_iteration": 2.446937322616577 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.03423131, + "balance_loss_mlp": 1.04937744, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.135292201068385, + "language_loss": 0.93928307, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96162128, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.2578125, + "step": 1682, + "time_per_iteration": 2.4357759952545166 + }, + { + "auxiliary_loss_clip": 0.01172101, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.03315091, + "balance_loss_mlp": 1.05045152, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 5.112669241194533, + "language_loss": 0.87866408, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90092492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1683, + "time_per_iteration": 2.427802562713623 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.03888798, + "balance_loss_mlp": 1.05144525, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7718228637860187, + "language_loss": 0.74768114, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76997328, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1796875, + "step": 1684, + "time_per_iteration": 5.332470417022705 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01011499, + "balance_loss_clip": 1.00863802, + "balance_loss_mlp": 1.01624751, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.760003339390084, + "language_loss": 0.61090153, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6316117, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.43359375, + "step": 1685, + "time_per_iteration": 4.508171081542969 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.02769828, + "balance_loss_mlp": 1.04891801, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.3224629698824075, + "language_loss": 0.61664945, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63883317, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1686, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01173787, + "auxiliary_loss_mlp": 0.01054454, + "balance_loss_clip": 1.03238797, + "balance_loss_mlp": 1.0545882, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1992592502117443, + "language_loss": 0.81408226, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83636469, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1687, + "time_per_iteration": 2.5495810508728027 + }, + { + "auxiliary_loss_clip": 0.01173812, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.03055501, + "balance_loss_mlp": 1.0514555, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.783489688966995, + "language_loss": 0.72360015, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74585676, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1688, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.05043888, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9893327907397977, + "language_loss": 0.86880058, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8910439, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1875, + "step": 1689, + "time_per_iteration": 2.5283408164978027 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.04692245, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8972643802531153, + "language_loss": 0.88054395, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90265882, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1690, + "time_per_iteration": 2.5732247829437256 + }, + { + "auxiliary_loss_clip": 0.01170509, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.02961624, + "balance_loss_mlp": 1.04965854, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.8841763324380914, + "language_loss": 0.83124495, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85346603, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.203125, + "step": 1691, + "time_per_iteration": 2.453263282775879 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.028579, + "balance_loss_mlp": 1.05049825, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.648035623213742, + "language_loss": 0.66938514, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69161713, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1692, + "time_per_iteration": 2.5865867137908936 + }, + { + "auxiliary_loss_clip": 0.01167535, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_clip": 1.04540372, + "balance_loss_mlp": 1.0471102, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.649284734670808, + "language_loss": 0.75387824, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77622634, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1693, + "time_per_iteration": 2.499476194381714 + }, + { + "auxiliary_loss_clip": 0.01171507, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02685595, + "balance_loss_mlp": 1.04984784, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6930931596653784, + "language_loss": 0.87206519, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89427543, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1694, + "time_per_iteration": 2.483264923095703 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01052768, + "balance_loss_clip": 1.03013015, + "balance_loss_mlp": 1.05056214, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 3.1999162319303274, + "language_loss": 0.79579329, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81809288, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1695, + "time_per_iteration": 2.4574177265167236 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.04648614, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7546035908378184, + "language_loss": 0.86581397, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88805294, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1696, + "time_per_iteration": 2.4986772537231445 + }, + { + "auxiliary_loss_clip": 0.01168623, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.04927731, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.97196247739744, + "language_loss": 0.82034266, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84259629, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1953125, + "step": 1697, + "time_per_iteration": 2.483682155609131 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.0477041, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 1.9483747561194416, + "language_loss": 0.80650747, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82870358, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2265625, + "step": 1698, + "time_per_iteration": 2.4512858390808105 + }, + { + "auxiliary_loss_clip": 0.01172882, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.05113077, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 4.641294823605382, + "language_loss": 0.75680709, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77902329, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1699, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.01171952, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.02606726, + "balance_loss_mlp": 1.05093145, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.7623204527071121, + "language_loss": 0.79777479, + "learning_rate": 3.945552859553516e-06, + "loss": 0.81997555, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 1700, + "time_per_iteration": 2.4692423343658447 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.02411532, + "balance_loss_mlp": 1.04850125, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8827887870563835, + "language_loss": 0.76854098, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79070842, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1701, + "time_per_iteration": 2.5015852451324463 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 1.05213511, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1180628790190927, + "language_loss": 0.78123891, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80349147, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2421875, + "step": 1702, + "time_per_iteration": 2.4999852180480957 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01046251, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.0487566, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.3091523831758765, + "language_loss": 0.94838184, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97052652, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1953125, + "step": 1703, + "time_per_iteration": 2.4586100578308105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.01077867, + "balance_loss_mlp": 1.01462317, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8800585598511617, + "language_loss": 0.55092424, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57163775, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43554688, + "step": 1704, + "time_per_iteration": 2.998384952545166 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.02790844, + "balance_loss_mlp": 1.04962945, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 3.5257555777633174, + "language_loss": 0.83979154, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86200017, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2109375, + "step": 1705, + "time_per_iteration": 2.4242281913757324 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.01514411, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7733309182053202, + "language_loss": 0.60434854, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62497854, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.4296875, + "step": 1706, + "time_per_iteration": 3.127495765686035 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 1.05214357, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.0444921886168284, + "language_loss": 0.85967243, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.234375, + "step": 1707, + "time_per_iteration": 2.4486777782440186 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02654099, + "balance_loss_mlp": 1.04891372, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.6368034329364625, + "language_loss": 0.72840983, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75057685, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.2109375, + "step": 1708, + "time_per_iteration": 2.5019850730895996 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01054619, + "balance_loss_clip": 1.0325532, + "balance_loss_mlp": 1.0493356, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.9016884094819633, + "language_loss": 0.90944314, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93167639, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1953125, + "step": 1709, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01173643, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03357422, + "balance_loss_mlp": 1.05296373, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 3.826538703219267, + "language_loss": 0.8828221, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90510881, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1710, + "time_per_iteration": 2.533165216445923 + }, + { + "auxiliary_loss_clip": 0.01167248, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.04937959, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.824520485293549, + "language_loss": 0.79264998, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485879, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 1711, + "time_per_iteration": 2.4947102069854736 + }, + { + "auxiliary_loss_clip": 0.01171963, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.05005431, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.0689984646996016, + "language_loss": 0.73589319, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7581948, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1712, + "time_per_iteration": 2.521899461746216 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.04961872, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8460865361447714, + "language_loss": 0.86673403, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8889854, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1640625, + "step": 1713, + "time_per_iteration": 2.467824935913086 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01059926, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.04741335, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.6691144860495126, + "language_loss": 0.72610664, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74837124, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1714, + "time_per_iteration": 2.478605031967163 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03542209, + "balance_loss_mlp": 1.04920006, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.3323118637090605, + "language_loss": 0.91395295, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93626636, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2421875, + "step": 1715, + "time_per_iteration": 2.5223729610443115 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04737377, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.9302110224144968, + "language_loss": 0.75736755, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77957708, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1716, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.01171415, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.04601645, + "balance_loss_mlp": 1.04868793, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1161503252482747, + "language_loss": 0.85214567, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87454176, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1717, + "time_per_iteration": 2.500964879989624 + }, + { + "auxiliary_loss_clip": 0.01169937, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.05102515, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.0308520014155746, + "language_loss": 0.82883167, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85109091, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1875, + "step": 1718, + "time_per_iteration": 2.436836004257202 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.03058898, + "balance_loss_mlp": 1.05092025, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8725763890619624, + "language_loss": 0.73192763, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75414634, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1719, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01172065, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05197001, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.356604748076592, + "language_loss": 0.92601806, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94820189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.203125, + "step": 1720, + "time_per_iteration": 2.4628992080688477 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.04656935, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.8075298743139174, + "language_loss": 0.79416633, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81638062, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2109375, + "step": 1721, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01056525, + "balance_loss_clip": 1.03317165, + "balance_loss_mlp": 1.05172479, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.496468299898097, + "language_loss": 0.80755401, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82988858, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.25, + "step": 1722, + "time_per_iteration": 2.4676520824432373 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.04772782, + "balance_loss_mlp": 1.013726, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9564367479099696, + "language_loss": 0.67185652, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69292337, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.4296875, + "step": 1723, + "time_per_iteration": 2.8474721908569336 + }, + { + "auxiliary_loss_clip": 0.01170693, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.04747462, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.780632359822339, + "language_loss": 0.77922273, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1724, + "time_per_iteration": 2.4311840534210205 + }, + { + "auxiliary_loss_clip": 0.01175556, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.05101144, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8180629527722856, + "language_loss": 0.74894094, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77122545, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1725, + "time_per_iteration": 2.6802284717559814 + }, + { + "auxiliary_loss_clip": 0.01170353, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.02852905, + "balance_loss_mlp": 1.05098462, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.4392097975248244, + "language_loss": 0.75290418, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77510113, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1726, + "time_per_iteration": 5.461729049682617 + }, + { + "auxiliary_loss_clip": 0.01174745, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03765321, + "balance_loss_mlp": 1.0527426, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8824890959349092, + "language_loss": 0.73943913, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76178271, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1727, + "time_per_iteration": 3.883134126663208 + }, + { + "auxiliary_loss_clip": 0.01169505, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03023946, + "balance_loss_mlp": 1.04815936, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.187385195417556, + "language_loss": 0.84670323, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86891311, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1728, + "time_per_iteration": 2.4405598640441895 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.02980709, + "balance_loss_mlp": 1.05098438, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4528097766615677, + "language_loss": 0.70985407, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73207992, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1729, + "time_per_iteration": 2.465688467025757 + }, + { + "auxiliary_loss_clip": 0.01170997, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.05000722, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5788681057232625, + "language_loss": 0.81288344, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.8351925, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1730, + "time_per_iteration": 2.4582717418670654 + }, + { + "auxiliary_loss_clip": 0.01167657, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.03190255, + "balance_loss_mlp": 1.04836845, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 2.1021084439253723, + "language_loss": 0.75932384, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78151548, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1953125, + "step": 1731, + "time_per_iteration": 2.4650096893310547 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.04899907, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8082651510271561, + "language_loss": 0.82679468, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84891117, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1732, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01169252, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.02881873, + "balance_loss_mlp": 1.05052853, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.755876599624297, + "language_loss": 0.82947195, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85164732, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1733, + "time_per_iteration": 2.4426257610321045 + }, + { + "auxiliary_loss_clip": 0.01171007, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.04982805, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4971959439308336, + "language_loss": 0.76446331, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78669679, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.2109375, + "step": 1734, + "time_per_iteration": 2.4556663036346436 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.02795696, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9861442095390862, + "language_loss": 0.74962163, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1796875, + "step": 1735, + "time_per_iteration": 2.4961798191070557 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.02724743, + "balance_loss_mlp": 1.05081487, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9829662552727403, + "language_loss": 0.79049939, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8127073, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1736, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.0491184, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8105684861006923, + "language_loss": 0.70339012, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72563159, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.171875, + "step": 1737, + "time_per_iteration": 2.4789419174194336 + }, + { + "auxiliary_loss_clip": 0.01170601, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.02758932, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.004598680960266, + "language_loss": 0.81483257, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83704984, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.234375, + "step": 1738, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.0116919, + "auxiliary_loss_mlp": 0.01058357, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.04712963, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 4.442978598454381, + "language_loss": 0.750579, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77285445, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1739, + "time_per_iteration": 2.4544031620025635 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.0349865, + "balance_loss_mlp": 1.04893625, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.676051388115223, + "language_loss": 0.77279431, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79503429, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1740, + "time_per_iteration": 2.489302635192871 + }, + { + "auxiliary_loss_clip": 0.01169756, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.02820003, + "balance_loss_mlp": 1.05093944, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.1911967502326775, + "language_loss": 0.85983682, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88201964, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1741, + "time_per_iteration": 2.4571211338043213 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03091609, + "balance_loss_mlp": 1.04901385, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 4.086245960730198, + "language_loss": 0.74991679, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77216244, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1742, + "time_per_iteration": 2.4919426441192627 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_clip": 1.03914368, + "balance_loss_mlp": 1.05323386, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9550995481311175, + "language_loss": 0.87150526, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89381945, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1743, + "time_per_iteration": 2.470841884613037 + }, + { + "auxiliary_loss_clip": 0.01168292, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.02760363, + "balance_loss_mlp": 1.04964471, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.1176645115958923, + "language_loss": 0.75532508, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77750671, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1744, + "time_per_iteration": 2.4725873470306396 + }, + { + "auxiliary_loss_clip": 0.01171079, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.05184436, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.151699961275852, + "language_loss": 0.79306591, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530583, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1745, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_clip": 1.04194999, + "balance_loss_mlp": 1.047683, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.087314316255438, + "language_loss": 0.82382894, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8461262, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1746, + "time_per_iteration": 2.520306348800659 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01062461, + "balance_loss_clip": 1.04186153, + "balance_loss_mlp": 1.05198646, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.645771273172373, + "language_loss": 0.69951761, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7218436, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1747, + "time_per_iteration": 2.618581771850586 + }, + { + "auxiliary_loss_clip": 0.01176288, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04140496, + "balance_loss_mlp": 1.05136323, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.3385484358742192, + "language_loss": 0.84245849, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86484385, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1748, + "time_per_iteration": 2.539386034011841 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.02797103, + "balance_loss_mlp": 1.04729426, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.8953667439120294, + "language_loss": 0.71491921, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7370674, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1796875, + "step": 1749, + "time_per_iteration": 2.481700897216797 + }, + { + "auxiliary_loss_clip": 0.01166695, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.04953468, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.9760411129591238, + "language_loss": 0.81960011, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84187424, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1750, + "time_per_iteration": 2.4454832077026367 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.05259562, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3402404294313524, + "language_loss": 0.91871023, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94105875, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1751, + "time_per_iteration": 2.416607141494751 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.02698207, + "balance_loss_mlp": 1.04889047, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.265296057434122, + "language_loss": 0.79560149, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81774485, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1796875, + "step": 1752, + "time_per_iteration": 2.46063494682312 + }, + { + "auxiliary_loss_clip": 0.01167711, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.03202033, + "balance_loss_mlp": 1.05050862, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.1401152378303867, + "language_loss": 0.75782037, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78002656, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1753, + "time_per_iteration": 2.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01172527, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.04939532, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0790136174876546, + "language_loss": 0.84048498, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86278164, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.234375, + "step": 1754, + "time_per_iteration": 2.4683756828308105 + }, + { + "auxiliary_loss_clip": 0.01175207, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.05438888, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8736094439376645, + "language_loss": 0.68956709, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71185535, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1755, + "time_per_iteration": 2.45597243309021 + }, + { + "auxiliary_loss_clip": 0.01172827, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.03989983, + "balance_loss_mlp": 1.05102587, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 5.502613786824721, + "language_loss": 0.76718754, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78953344, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1756, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.03712726, + "balance_loss_mlp": 1.04982626, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7784869470084927, + "language_loss": 0.80162531, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82392669, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1757, + "time_per_iteration": 2.4551706314086914 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.03499317, + "balance_loss_mlp": 1.05132246, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.631431596421375, + "language_loss": 0.78800333, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81028521, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1758, + "time_per_iteration": 2.7955896854400635 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_clip": 1.02865982, + "balance_loss_mlp": 1.05364573, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.609159841262955, + "language_loss": 0.9189958, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94127536, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.234375, + "step": 1759, + "time_per_iteration": 2.4853782653808594 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.03276575, + "balance_loss_mlp": 1.04970741, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.498568213886603, + "language_loss": 0.76932353, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79161119, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.234375, + "step": 1760, + "time_per_iteration": 2.470705509185791 + }, + { + "auxiliary_loss_clip": 0.01173982, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.03825736, + "balance_loss_mlp": 1.05152941, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.349800445259612, + "language_loss": 0.89282435, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91517675, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1761, + "time_per_iteration": 2.491501569747925 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0518589, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.4794664397863877, + "language_loss": 0.78304708, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80538261, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1762, + "time_per_iteration": 2.5563831329345703 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01014008, + "balance_loss_clip": 1.0110991, + "balance_loss_mlp": 1.02000487, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.753444103392694, + "language_loss": 0.60481733, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62557811, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.421875, + "step": 1763, + "time_per_iteration": 3.2239294052124023 + }, + { + "auxiliary_loss_clip": 0.01170891, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02777529, + "balance_loss_mlp": 1.04924011, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.054980370260194, + "language_loss": 0.8010751, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82327372, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1764, + "time_per_iteration": 2.476325273513794 + }, + { + "auxiliary_loss_clip": 0.01169028, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.02745855, + "balance_loss_mlp": 1.04961264, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7621956234955212, + "language_loss": 0.7999962, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82217997, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1765, + "time_per_iteration": 2.446593999862671 + }, + { + "auxiliary_loss_clip": 0.01167126, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.03394008, + "balance_loss_mlp": 1.04794002, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.867239621884004, + "language_loss": 0.76693732, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78915727, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1766, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01170332, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.05017042, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6580981789618001, + "language_loss": 0.77319431, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1767, + "time_per_iteration": 2.542797088623047 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01000453, + "balance_loss_clip": 0.99785471, + "balance_loss_mlp": 1.01804066, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6789245534488961, + "language_loss": 0.57902765, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59963286, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.421875, + "step": 1768, + "time_per_iteration": 6.071596384048462 + }, + { + "auxiliary_loss_clip": 0.01172748, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.0219171, + "balance_loss_mlp": 1.05201912, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.446404125156032, + "language_loss": 0.86796767, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89011335, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1769, + "time_per_iteration": 2.5106868743896484 + }, + { + "auxiliary_loss_clip": 0.01175908, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.05300689, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.766851816283336, + "language_loss": 0.61890501, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64123213, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1770, + "time_per_iteration": 2.5770323276519775 + }, + { + "auxiliary_loss_clip": 0.01061292, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00058925, + "balance_loss_mlp": 1.01873469, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8864779346546747, + "language_loss": 0.57095039, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59159505, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.42578125, + "step": 1771, + "time_per_iteration": 2.957993507385254 + }, + { + "auxiliary_loss_clip": 0.01174087, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.05443954, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6398085638646198, + "language_loss": 0.88530469, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90767658, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1772, + "time_per_iteration": 2.520744562149048 + }, + { + "auxiliary_loss_clip": 0.01176768, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.05091381, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.8236986107629094, + "language_loss": 0.76021719, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78262091, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.2578125, + "step": 1773, + "time_per_iteration": 2.4228129386901855 + }, + { + "auxiliary_loss_clip": 0.01171647, + "auxiliary_loss_mlp": 0.01063224, + "balance_loss_clip": 1.04087257, + "balance_loss_mlp": 1.05147731, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1931291175477177, + "language_loss": 0.83184093, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85418963, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1774, + "time_per_iteration": 2.5613787174224854 + }, + { + "auxiliary_loss_clip": 0.01177598, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.05220413, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.683505024819064, + "language_loss": 0.76297373, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78529418, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.25, + "step": 1775, + "time_per_iteration": 2.437676429748535 + }, + { + "auxiliary_loss_clip": 0.01057587, + "auxiliary_loss_mlp": 0.01006639, + "balance_loss_clip": 1.00413537, + "balance_loss_mlp": 1.01520467, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8253045983972309, + "language_loss": 0.57443953, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59508181, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.42382812, + "step": 1776, + "time_per_iteration": 3.101378917694092 + }, + { + "auxiliary_loss_clip": 0.01176962, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.0365653, + "balance_loss_mlp": 1.05411029, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6481869723516467, + "language_loss": 0.83374244, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8561098, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2265625, + "step": 1777, + "time_per_iteration": 2.5109002590179443 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.03371584, + "balance_loss_mlp": 1.05298579, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.6420984425067013, + "language_loss": 0.87275863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501154, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1778, + "time_per_iteration": 2.503103494644165 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.032354, + "balance_loss_mlp": 1.05328, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.213225731734914, + "language_loss": 0.83970487, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86199337, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1779, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01169562, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.03086162, + "balance_loss_mlp": 1.04975557, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.4959309518827655, + "language_loss": 0.67064941, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69286621, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1780, + "time_per_iteration": 2.447756052017212 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046888, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.05183458, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.25546613947904, + "language_loss": 0.91667759, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93886495, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1781, + "time_per_iteration": 2.4367144107818604 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01046071, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 1.05302, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.202402738572802, + "language_loss": 0.79505372, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81726873, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2265625, + "step": 1782, + "time_per_iteration": 2.4340877532958984 + }, + { + "auxiliary_loss_clip": 0.01175468, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.03055024, + "balance_loss_mlp": 1.0517509, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0445491568240994, + "language_loss": 0.78994977, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81222689, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.234375, + "step": 1783, + "time_per_iteration": 2.434527635574341 + }, + { + "auxiliary_loss_clip": 0.01176375, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.02991986, + "balance_loss_mlp": 1.0529108, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8050343336808015, + "language_loss": 0.85956216, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88184798, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1784, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.01174134, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.05080986, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.0774406347184806, + "language_loss": 1.00899053, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03127265, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.234375, + "step": 1785, + "time_per_iteration": 2.46663498878479 + }, + { + "auxiliary_loss_clip": 0.01171119, + "auxiliary_loss_mlp": 0.01057254, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05306709, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4495948735276882, + "language_loss": 0.85070992, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87299371, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1796875, + "step": 1786, + "time_per_iteration": 2.505018949508667 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.04750311, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8788886178726656, + "language_loss": 0.78817046, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81046188, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1787, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01176938, + "auxiliary_loss_mlp": 0.01049821, + "balance_loss_clip": 1.02768385, + "balance_loss_mlp": 1.0517112, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.551869220071384, + "language_loss": 0.82557851, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84784609, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.25, + "step": 1788, + "time_per_iteration": 2.4807305335998535 + }, + { + "auxiliary_loss_clip": 0.01170019, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.04939878, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.778852512980128, + "language_loss": 0.77794182, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80027628, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1789, + "time_per_iteration": 2.482330322265625 + }, + { + "auxiliary_loss_clip": 0.01173111, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.05133712, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.434124451319009, + "language_loss": 0.74467903, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76702261, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.21875, + "step": 1790, + "time_per_iteration": 2.5921239852905273 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.05428767, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.5839507236364554, + "language_loss": 0.78495383, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80745554, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.234375, + "step": 1791, + "time_per_iteration": 2.5242488384246826 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01053897, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.05112934, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.8605555947944812, + "language_loss": 0.70855284, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73076522, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1792, + "time_per_iteration": 2.5260751247406006 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.02330506, + "balance_loss_mlp": 1.05109024, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.973355145299492, + "language_loss": 0.76029646, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78251767, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1793, + "time_per_iteration": 2.5037007331848145 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.03793848, + "balance_loss_mlp": 1.0537113, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7251623627880495, + "language_loss": 0.85158944, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87391031, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1794, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01180393, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.02954292, + "balance_loss_mlp": 1.05342674, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9245153565321482, + "language_loss": 0.74914879, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77148265, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.265625, + "step": 1795, + "time_per_iteration": 2.486111879348755 + }, + { + "auxiliary_loss_clip": 0.0117609, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.03863525, + "balance_loss_mlp": 1.05227423, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.917857918230487, + "language_loss": 0.8116014, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83397192, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1796, + "time_per_iteration": 2.4357504844665527 + }, + { + "auxiliary_loss_clip": 0.01177296, + "auxiliary_loss_mlp": 0.01075942, + "balance_loss_clip": 1.05260134, + "balance_loss_mlp": 1.05476594, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.4043777768562293, + "language_loss": 0.73476732, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75729972, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1797, + "time_per_iteration": 2.477867841720581 + }, + { + "auxiliary_loss_clip": 0.01172695, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.03081274, + "balance_loss_mlp": 1.05260658, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 3.1892188654982396, + "language_loss": 0.81348622, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83572453, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1798, + "time_per_iteration": 2.5060064792633057 + }, + { + "auxiliary_loss_clip": 0.011719, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03075755, + "balance_loss_mlp": 1.0508821, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4195393058725623, + "language_loss": 0.85180116, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87405908, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2109375, + "step": 1799, + "time_per_iteration": 2.4546945095062256 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.04271412, + "balance_loss_mlp": 1.0546999, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.2474252534922265, + "language_loss": 0.77365196, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79602301, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.1796875, + "step": 1800, + "time_per_iteration": 2.4650769233703613 + }, + { + "auxiliary_loss_clip": 0.01168665, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.02443254, + "balance_loss_mlp": 1.05136347, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 2.2954016650766844, + "language_loss": 0.7287963, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7509284, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1801, + "time_per_iteration": 2.5045113563537598 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.05259442, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.8364602771794378, + "language_loss": 0.66427058, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68653458, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1802, + "time_per_iteration": 2.5547947883605957 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.05202222, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7898565484043845, + "language_loss": 0.8136133, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83590758, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1803, + "time_per_iteration": 2.4758658409118652 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.031106, + "balance_loss_mlp": 1.05126929, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 2.61974519761109, + "language_loss": 0.9122982, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93452168, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1875, + "step": 1804, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.01175328, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03031349, + "balance_loss_mlp": 1.05401301, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0091269076806078, + "language_loss": 0.7623654, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78464663, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1805, + "time_per_iteration": 2.5379836559295654 + }, + { + "auxiliary_loss_clip": 0.01172079, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.0535754, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8192828849331855, + "language_loss": 0.860416, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88261837, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1806, + "time_per_iteration": 2.5523955821990967 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01053704, + "balance_loss_clip": 1.03275895, + "balance_loss_mlp": 1.05068612, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 5.439462316727856, + "language_loss": 0.80572915, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82797557, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1807, + "time_per_iteration": 2.514390230178833 + }, + { + "auxiliary_loss_clip": 0.01171878, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.05415583, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7684897552837426, + "language_loss": 0.78731525, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80954707, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.171875, + "step": 1808, + "time_per_iteration": 2.5084331035614014 + }, + { + "auxiliary_loss_clip": 0.01176105, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.03090501, + "balance_loss_mlp": 1.05633223, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6609588216066864, + "language_loss": 0.78927523, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81155634, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1953125, + "step": 1809, + "time_per_iteration": 5.368049621582031 + }, + { + "auxiliary_loss_clip": 0.01171492, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.03318286, + "balance_loss_mlp": 1.05087388, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0370215842844197, + "language_loss": 0.8468523, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86910635, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1810, + "time_per_iteration": 3.904432535171509 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.03111291, + "balance_loss_mlp": 1.05665135, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.9531179942167565, + "language_loss": 0.63677633, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6591261, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.234375, + "step": 1811, + "time_per_iteration": 2.523650646209717 + }, + { + "auxiliary_loss_clip": 0.01171345, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.02737319, + "balance_loss_mlp": 1.05139136, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.279966127836369, + "language_loss": 0.74238914, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76459008, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1812, + "time_per_iteration": 2.5579042434692383 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.05391026, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.9314487748153213, + "language_loss": 0.72647583, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74868566, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1875, + "step": 1813, + "time_per_iteration": 2.488762617111206 + }, + { + "auxiliary_loss_clip": 0.01174675, + "auxiliary_loss_mlp": 0.01051455, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.05744648, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 10.097396236718186, + "language_loss": 0.82224226, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84450358, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1814, + "time_per_iteration": 2.495798349380493 + }, + { + "auxiliary_loss_clip": 0.01176897, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.05595291, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3551509805271422, + "language_loss": 0.84218144, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86452949, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2109375, + "step": 1815, + "time_per_iteration": 2.462663173675537 + }, + { + "auxiliary_loss_clip": 0.01175955, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.03054035, + "balance_loss_mlp": 1.05833483, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3954928768695027, + "language_loss": 0.71048725, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73277813, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.171875, + "step": 1816, + "time_per_iteration": 2.465953826904297 + }, + { + "auxiliary_loss_clip": 0.01178612, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.04061651, + "balance_loss_mlp": 1.056674, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0063973144433067, + "language_loss": 0.72811669, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75053406, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1817, + "time_per_iteration": 2.5323143005371094 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_clip": 1.03167605, + "balance_loss_mlp": 1.05709267, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.767365755633268, + "language_loss": 0.67279243, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6951232, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1818, + "time_per_iteration": 2.5450243949890137 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.04123521, + "balance_loss_mlp": 1.05534315, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.0802139312896744, + "language_loss": 0.72992313, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75232661, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1819, + "time_per_iteration": 2.487644910812378 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02994883, + "balance_loss_mlp": 1.06090236, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 7.240077427900601, + "language_loss": 0.73943537, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76175541, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.203125, + "step": 1820, + "time_per_iteration": 2.5064899921417236 + }, + { + "auxiliary_loss_clip": 0.01177081, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02765381, + "balance_loss_mlp": 1.05699766, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.1677198782015887, + "language_loss": 0.82586408, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84814322, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 1821, + "time_per_iteration": 2.4487218856811523 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.0280906, + "balance_loss_mlp": 1.05549288, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.4783722356243065, + "language_loss": 0.76171732, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78395414, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1822, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01175357, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.03070986, + "balance_loss_mlp": 1.05751145, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.9066217775511896, + "language_loss": 0.79275787, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81502879, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1796875, + "step": 1823, + "time_per_iteration": 2.5665249824523926 + }, + { + "auxiliary_loss_clip": 0.01176588, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.02583015, + "balance_loss_mlp": 1.05788529, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.7066251744315906, + "language_loss": 0.79424715, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81649172, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1824, + "time_per_iteration": 2.5238118171691895 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.05610347, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.2183246130345, + "language_loss": 0.87992203, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90220273, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1825, + "time_per_iteration": 2.48294734954834 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.05362988, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8404731426595848, + "language_loss": 0.76462233, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78689909, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1826, + "time_per_iteration": 2.6397035121917725 + }, + { + "auxiliary_loss_clip": 0.01066703, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 0.9983961, + "balance_loss_mlp": 1.02257371, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8361632453995619, + "language_loss": 0.54999328, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57067442, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.44140625, + "step": 1827, + "time_per_iteration": 3.065896511077881 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01003719, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.02098036, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7348311418426204, + "language_loss": 0.55346334, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57414544, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.43359375, + "step": 1828, + "time_per_iteration": 3.0850460529327393 + }, + { + "auxiliary_loss_clip": 0.01180205, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.05754089, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.992065013624077, + "language_loss": 0.84191215, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86435115, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2265625, + "step": 1829, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01057367, + "balance_loss_clip": 1.03348923, + "balance_loss_mlp": 1.05845475, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.654516298718269, + "language_loss": 0.8878119, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91019976, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2265625, + "step": 1830, + "time_per_iteration": 2.6912100315093994 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01017752, + "balance_loss_clip": 1.01497495, + "balance_loss_mlp": 1.01824236, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6883241829767079, + "language_loss": 0.55492055, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570827, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.42773438, + "step": 1831, + "time_per_iteration": 3.075678825378418 + }, + { + "auxiliary_loss_clip": 0.01183643, + "auxiliary_loss_mlp": 0.01072422, + "balance_loss_clip": 1.04829443, + "balance_loss_mlp": 1.05867732, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.054835171188452, + "language_loss": 0.90726995, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92983055, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.25, + "step": 1832, + "time_per_iteration": 2.5084948539733887 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01015171, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01603723, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7159549093535102, + "language_loss": 0.59889859, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61963969, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.4296875, + "step": 1833, + "time_per_iteration": 3.0748977661132812 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_clip": 1.0277946, + "balance_loss_mlp": 1.05353165, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6030857455850303, + "language_loss": 0.8095156, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83177137, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1834, + "time_per_iteration": 2.452131509780884 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.0302285, + "balance_loss_mlp": 1.05899858, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.5262438386564807, + "language_loss": 0.90514123, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9274807, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2109375, + "step": 1835, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01177237, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.03445005, + "balance_loss_mlp": 1.05625033, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.0785934228774003, + "language_loss": 0.63590646, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65826416, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2109375, + "step": 1836, + "time_per_iteration": 2.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01173672, + "auxiliary_loss_mlp": 0.01057372, + "balance_loss_clip": 1.03547311, + "balance_loss_mlp": 1.05388379, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9330421575083043, + "language_loss": 0.72814602, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75045645, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1837, + "time_per_iteration": 2.594910144805908 + }, + { + "auxiliary_loss_clip": 0.01179947, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.132041599419941, + "language_loss": 0.79049784, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81289005, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1838, + "time_per_iteration": 2.4922690391540527 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.05623114, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 4.130442583787946, + "language_loss": 0.71453696, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73690271, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1839, + "time_per_iteration": 2.5151031017303467 + }, + { + "auxiliary_loss_clip": 0.01183988, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.03082371, + "balance_loss_mlp": 1.05938148, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.064820600929851, + "language_loss": 0.79099703, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1840, + "time_per_iteration": 2.487116575241089 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01053863, + "balance_loss_clip": 1.03234553, + "balance_loss_mlp": 1.05329347, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 2.0204098875762293, + "language_loss": 0.87691998, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89917111, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1796875, + "step": 1841, + "time_per_iteration": 2.5141868591308594 + }, + { + "auxiliary_loss_clip": 0.01170262, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.03126192, + "balance_loss_mlp": 1.05365491, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.3404569451138535, + "language_loss": 0.90582979, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92806768, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1842, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.0117179, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.03080761, + "balance_loss_mlp": 1.05210185, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.171204868901281, + "language_loss": 0.85597986, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87821329, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1843, + "time_per_iteration": 2.4801278114318848 + }, + { + "auxiliary_loss_clip": 0.01174004, + "auxiliary_loss_mlp": 0.01060021, + "balance_loss_clip": 1.03645349, + "balance_loss_mlp": 1.05622911, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.47825888700324, + "language_loss": 0.7494424, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77178264, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1796875, + "step": 1844, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01173241, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.0304563, + "balance_loss_mlp": 1.05405343, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0036363505702433, + "language_loss": 0.75732028, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77959603, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.1953125, + "step": 1845, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03341389, + "balance_loss_mlp": 1.05351365, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.6129010657048202, + "language_loss": 0.76336479, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7856214, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.171875, + "step": 1846, + "time_per_iteration": 2.465045928955078 + }, + { + "auxiliary_loss_clip": 0.01175917, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.03262115, + "balance_loss_mlp": 1.05392015, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.9965527726637577, + "language_loss": 0.85611343, + "learning_rate": 3.931489981933584e-06, + "loss": 0.87841111, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1847, + "time_per_iteration": 2.4493908882141113 + }, + { + "auxiliary_loss_clip": 0.01174539, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03018796, + "balance_loss_mlp": 1.05326366, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.3740806549350086, + "language_loss": 0.76464605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78692293, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.2109375, + "step": 1848, + "time_per_iteration": 2.4647536277770996 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02754378, + "balance_loss_mlp": 1.05833888, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 2.0750561163348173, + "language_loss": 0.77849847, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8007198, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1849, + "time_per_iteration": 2.514777660369873 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.03496861, + "balance_loss_mlp": 1.05422294, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.6662643697478066, + "language_loss": 0.71315688, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73548102, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1850, + "time_per_iteration": 2.4420053958892822 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.05444217, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2721050151861912, + "language_loss": 0.81174368, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83405614, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 1851, + "time_per_iteration": 5.341679811477661 + }, + { + "auxiliary_loss_clip": 0.01173679, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.03126621, + "balance_loss_mlp": 1.05519962, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.240427658931177, + "language_loss": 0.88860446, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91085827, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1852, + "time_per_iteration": 3.8281352519989014 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.03514326, + "balance_loss_mlp": 1.05636191, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0685366180695848, + "language_loss": 0.72092974, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74327302, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1853, + "time_per_iteration": 2.4896738529205322 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 0.99923038, + "balance_loss_mlp": 1.0132798, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7783537669608381, + "language_loss": 0.53647029, + "learning_rate": 3.930780749680273e-06, + "loss": 0.5570485, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.42578125, + "step": 1854, + "time_per_iteration": 3.0189781188964844 + }, + { + "auxiliary_loss_clip": 0.01184355, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02937746, + "balance_loss_mlp": 1.057657, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.006296213399466, + "language_loss": 0.8394689, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.861835, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.265625, + "step": 1855, + "time_per_iteration": 2.4908485412597656 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.0106694, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.05353498, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2091175797191815, + "language_loss": 0.82098675, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84339261, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.203125, + "step": 1856, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02631092, + "balance_loss_mlp": 1.05662763, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.9605277294776, + "language_loss": 0.8305279, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85274535, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1857, + "time_per_iteration": 2.5205907821655273 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.0279119, + "balance_loss_mlp": 1.05195725, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3309612964817923, + "language_loss": 0.83037764, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85260725, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.21875, + "step": 1858, + "time_per_iteration": 2.4567432403564453 + }, + { + "auxiliary_loss_clip": 0.01175678, + "auxiliary_loss_mlp": 0.01062921, + "balance_loss_clip": 1.04205894, + "balance_loss_mlp": 1.05549788, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.004830650729854, + "language_loss": 0.91120583, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93359184, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1859, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.02983618, + "balance_loss_mlp": 1.05344319, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.4768392741235306, + "language_loss": 0.81709313, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83934522, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1860, + "time_per_iteration": 2.4747087955474854 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.0361197, + "balance_loss_mlp": 1.05388653, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.1256274007234937, + "language_loss": 0.75203162, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77430284, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1861, + "time_per_iteration": 2.4773240089416504 + }, + { + "auxiliary_loss_clip": 0.01169857, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.03318143, + "balance_loss_mlp": 1.05338371, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.0016824982414776, + "language_loss": 0.88759935, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90982509, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1640625, + "step": 1862, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01173358, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.05597067, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1858127473987015, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89302975, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 1863, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051873, + "balance_loss_clip": 1.0283289, + "balance_loss_mlp": 1.05463171, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0887108243102976, + "language_loss": 0.64630157, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66856015, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.1953125, + "step": 1864, + "time_per_iteration": 2.4843807220458984 + }, + { + "auxiliary_loss_clip": 0.01171142, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.03169096, + "balance_loss_mlp": 1.05504417, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0715232833306874, + "language_loss": 0.73895639, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76117796, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1865, + "time_per_iteration": 2.4509596824645996 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02928007, + "balance_loss_mlp": 1.05253589, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.190736679244475, + "language_loss": 0.84019023, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86240977, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 1866, + "time_per_iteration": 2.473715305328369 + }, + { + "auxiliary_loss_clip": 0.01169711, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.02737069, + "balance_loss_mlp": 1.05260134, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5419857436109028, + "language_loss": 0.81424987, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83643156, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1867, + "time_per_iteration": 2.5367391109466553 + }, + { + "auxiliary_loss_clip": 0.01172987, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.05594015, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.5308159777425976, + "language_loss": 0.86677599, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88905597, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1868, + "time_per_iteration": 2.5044100284576416 + }, + { + "auxiliary_loss_clip": 0.01172172, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.05724931, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.333499600894065, + "language_loss": 0.68059367, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70281279, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.1484375, + "step": 1869, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01061629, + "balance_loss_clip": 1.03969407, + "balance_loss_mlp": 1.05456114, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 2.049754856307833, + "language_loss": 0.7735095, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79589236, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1870, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.03177094, + "balance_loss_mlp": 1.05264199, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8085683914823212, + "language_loss": 0.75747174, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77974463, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1871, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.05448031, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.4822417703451265, + "language_loss": 0.81949306, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84170687, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.171875, + "step": 1872, + "time_per_iteration": 2.4984912872314453 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.05497694, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7250665555581937, + "language_loss": 0.83564019, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85789096, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1796875, + "step": 1873, + "time_per_iteration": 2.480172872543335 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.05352998, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.2103217259008985, + "language_loss": 0.91925669, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9415459, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1874, + "time_per_iteration": 2.5193865299224854 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.0528394, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5656160151577971, + "language_loss": 0.7534616, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.15625, + "step": 1875, + "time_per_iteration": 2.509000062942505 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.05498421, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.875753927893446, + "language_loss": 0.71727258, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73950088, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1328125, + "step": 1876, + "time_per_iteration": 2.5222911834716797 + }, + { + "auxiliary_loss_clip": 0.01170022, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.03036463, + "balance_loss_mlp": 1.05574679, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.408917627715415, + "language_loss": 0.76760256, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78981495, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 1877, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01173931, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.05530918, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.569804002246691, + "language_loss": 0.88132238, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1878, + "time_per_iteration": 2.4562089443206787 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.02628088, + "balance_loss_mlp": 1.05382609, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2792620862185036, + "language_loss": 0.81521666, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83739763, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.171875, + "step": 1879, + "time_per_iteration": 2.515162944793701 + }, + { + "auxiliary_loss_clip": 0.01174903, + "auxiliary_loss_mlp": 0.01056113, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.05591071, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.9729184409385376, + "language_loss": 0.70101768, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72332788, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1880, + "time_per_iteration": 2.5420267581939697 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.05396068, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.7442831242084353, + "language_loss": 0.72337204, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74552047, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1881, + "time_per_iteration": 2.4648680686950684 + }, + { + "auxiliary_loss_clip": 0.01172977, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.03452694, + "balance_loss_mlp": 1.05385113, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4452990726029533, + "language_loss": 0.74243963, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76474178, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1953125, + "step": 1882, + "time_per_iteration": 2.459181547164917 + }, + { + "auxiliary_loss_clip": 0.01171271, + "auxiliary_loss_mlp": 0.01052266, + "balance_loss_clip": 1.03045106, + "balance_loss_mlp": 1.05493677, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.8641228673356873, + "language_loss": 0.79328096, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81551635, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 1883, + "time_per_iteration": 2.5236945152282715 + }, + { + "auxiliary_loss_clip": 0.01173507, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.03271067, + "balance_loss_mlp": 1.05288672, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.0524763398538193, + "language_loss": 0.77151698, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79379749, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1884, + "time_per_iteration": 2.4974489212036133 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01012749, + "balance_loss_clip": 1.0102694, + "balance_loss_mlp": 1.02156711, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7928734254501784, + "language_loss": 0.55183071, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5725978, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.42382812, + "step": 1885, + "time_per_iteration": 2.9756290912628174 + }, + { + "auxiliary_loss_clip": 0.01166272, + "auxiliary_loss_mlp": 0.01047922, + "balance_loss_clip": 1.02758515, + "balance_loss_mlp": 1.0534817, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 5.752063942495911, + "language_loss": 0.90240276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92454469, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 1886, + "time_per_iteration": 2.5031139850616455 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03018832, + "balance_loss_mlp": 1.05306387, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0267704425546036, + "language_loss": 0.85101235, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87321353, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1887, + "time_per_iteration": 2.5177412033081055 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01061982, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.05554259, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.5783153731033055, + "language_loss": 0.76168925, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1888, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.03542566, + "balance_loss_mlp": 1.05632472, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.114301103868513, + "language_loss": 0.68039739, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70275331, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.21875, + "step": 1889, + "time_per_iteration": 2.643867015838623 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.05620956, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.158184033346157, + "language_loss": 0.84414917, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86635208, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 1890, + "time_per_iteration": 2.5018270015716553 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02855039, + "balance_loss_mlp": 1.05288363, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.2859967152973373, + "language_loss": 0.65099049, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67317504, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 1891, + "time_per_iteration": 2.4870762825012207 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.05397856, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.358390081637715, + "language_loss": 0.87789619, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90005904, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1953125, + "step": 1892, + "time_per_iteration": 2.469215154647827 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.04509139, + "balance_loss_mlp": 1.05419993, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4185703679999775, + "language_loss": 0.72724342, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7496407, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 1893, + "time_per_iteration": 4.021688222885132 + }, + { + "auxiliary_loss_clip": 0.01169367, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.05175805, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.254020248775613, + "language_loss": 0.79367435, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81595069, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.171875, + "step": 1894, + "time_per_iteration": 3.9190711975097656 + }, + { + "auxiliary_loss_clip": 0.01176791, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.05530715, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.587114905294773, + "language_loss": 0.78868139, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.21875, + "step": 1895, + "time_per_iteration": 2.5924861431121826 + }, + { + "auxiliary_loss_clip": 0.0106161, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 0.99917758, + "balance_loss_mlp": 1.01840448, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8005582337036792, + "language_loss": 0.63316774, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65380025, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43359375, + "step": 1896, + "time_per_iteration": 3.143843412399292 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_clip": 1.03600097, + "balance_loss_mlp": 1.05385494, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6058527618620146, + "language_loss": 0.84707338, + "learning_rate": 3.926345380796821e-06, + "loss": 0.86936897, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.15625, + "step": 1897, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.0117262, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.03159046, + "balance_loss_mlp": 1.05385423, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3286063431421926, + "language_loss": 0.79776239, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8200193, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1875, + "step": 1898, + "time_per_iteration": 2.5186216831207275 + }, + { + "auxiliary_loss_clip": 0.01174476, + "auxiliary_loss_mlp": 0.01056562, + "balance_loss_clip": 1.03330398, + "balance_loss_mlp": 1.05247831, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.996095488823442, + "language_loss": 0.73049861, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75280899, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1899, + "time_per_iteration": 2.484767198562622 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.0167762, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9092154832512579, + "language_loss": 0.63432097, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65496433, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.4296875, + "step": 1900, + "time_per_iteration": 3.0239956378936768 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.05181098, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6715138036124124, + "language_loss": 0.78116465, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80344748, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.1875, + "step": 1901, + "time_per_iteration": 2.5007457733154297 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.05482793, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9023337273707566, + "language_loss": 0.83676988, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85908997, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1902, + "time_per_iteration": 2.4389002323150635 + }, + { + "auxiliary_loss_clip": 0.0117356, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05356252, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6958297254772137, + "language_loss": 0.77551281, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79775804, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1903, + "time_per_iteration": 2.503164768218994 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.05437744, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.553861289811236, + "language_loss": 0.75704938, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77922231, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.171875, + "step": 1904, + "time_per_iteration": 2.5097854137420654 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.05519056, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.146045336495955, + "language_loss": 0.92476678, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94702017, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1905, + "time_per_iteration": 2.4905850887298584 + }, + { + "auxiliary_loss_clip": 0.0117632, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.0496794, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.457773566764277, + "language_loss": 0.77108872, + "learning_rate": 3.925399944279861e-06, + "loss": 0.7933597, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.265625, + "step": 1906, + "time_per_iteration": 2.4469265937805176 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.03072143, + "balance_loss_mlp": 1.05375302, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.4555636334810593, + "language_loss": 0.81855345, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84082305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1907, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01173651, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.05599511, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.041607412488977, + "language_loss": 0.84798187, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87037772, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1908, + "time_per_iteration": 2.468519687652588 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.01344705, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9477470057539497, + "language_loss": 0.6100027, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63061339, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.43164062, + "step": 1909, + "time_per_iteration": 2.8313472270965576 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.05660319, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.135894642259737, + "language_loss": 0.78793955, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8102057, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1910, + "time_per_iteration": 2.4613592624664307 + }, + { + "auxiliary_loss_clip": 0.01178149, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.03005373, + "balance_loss_mlp": 1.05803406, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.035949872237615, + "language_loss": 0.76787984, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79017925, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1911, + "time_per_iteration": 2.475069761276245 + }, + { + "auxiliary_loss_clip": 0.01166349, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.03048277, + "balance_loss_mlp": 1.05284548, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.1144124150337023, + "language_loss": 0.7927531, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81493074, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 1912, + "time_per_iteration": 2.543607473373413 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_clip": 1.0369364, + "balance_loss_mlp": 1.05352569, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9322037304643997, + "language_loss": 0.7777245, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80000544, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 1913, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01169525, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.02889776, + "balance_loss_mlp": 1.05118954, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 3.783180746712747, + "language_loss": 0.70389271, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72609949, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1914, + "time_per_iteration": 2.5099785327911377 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 0.99943084, + "balance_loss_mlp": 1.01452589, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7556045547130329, + "language_loss": 0.61044526, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63105142, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.44140625, + "step": 1915, + "time_per_iteration": 3.1735148429870605 + }, + { + "auxiliary_loss_clip": 0.01172283, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_clip": 1.03273964, + "balance_loss_mlp": 1.05674434, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.822924091618307, + "language_loss": 0.9323889, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95465934, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.15625, + "step": 1916, + "time_per_iteration": 2.4806342124938965 + }, + { + "auxiliary_loss_clip": 0.01171086, + "auxiliary_loss_mlp": 0.01061893, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.05340374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8768677942494545, + "language_loss": 0.72286755, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7451973, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.171875, + "step": 1917, + "time_per_iteration": 2.519758701324463 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.05521619, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.2984335892825594, + "language_loss": 0.74389827, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76610279, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 1918, + "time_per_iteration": 2.4867136478424072 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.02394044, + "balance_loss_mlp": 1.05273843, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1981507651696193, + "language_loss": 0.86515707, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88735056, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1919, + "time_per_iteration": 2.4838428497314453 + }, + { + "auxiliary_loss_clip": 0.01168988, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.03190136, + "balance_loss_mlp": 1.05291939, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.516832715272094, + "language_loss": 0.86640596, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88864017, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.15625, + "step": 1920, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01167627, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.05360281, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.2143351457696525, + "language_loss": 0.79792106, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82007331, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 1921, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.01174597, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.03331947, + "balance_loss_mlp": 1.05358851, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 8.96706495073623, + "language_loss": 0.78418177, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8064878, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2109375, + "step": 1922, + "time_per_iteration": 2.5293705463409424 + }, + { + "auxiliary_loss_clip": 0.01174074, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.03910375, + "balance_loss_mlp": 1.05410469, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8482726295091094, + "language_loss": 0.84187758, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86422473, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.203125, + "step": 1923, + "time_per_iteration": 2.5203118324279785 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01053498, + "balance_loss_clip": 1.03074098, + "balance_loss_mlp": 1.05742192, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0576366068601666, + "language_loss": 0.80471247, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1796875, + "step": 1924, + "time_per_iteration": 2.48531436920166 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 0.99917841, + "balance_loss_mlp": 1.0154866, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.810907468185892, + "language_loss": 0.6115036, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6321063, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 3.112396478652954 + }, + { + "auxiliary_loss_clip": 0.01173159, + "auxiliary_loss_mlp": 0.01076027, + "balance_loss_clip": 1.05304384, + "balance_loss_mlp": 1.05447614, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.806943429185086, + "language_loss": 0.7482335, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77072537, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.1875, + "step": 1926, + "time_per_iteration": 2.4890315532684326 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.03873897, + "balance_loss_mlp": 1.0552361, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.429758451090488, + "language_loss": 0.73112315, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7535038, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.203125, + "step": 1927, + "time_per_iteration": 2.4673402309417725 + }, + { + "auxiliary_loss_clip": 0.0117016, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.03244913, + "balance_loss_mlp": 1.05291271, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.854021270140142, + "language_loss": 0.86824137, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89049077, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 1928, + "time_per_iteration": 2.530325412750244 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.03289056, + "balance_loss_mlp": 1.05469573, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.71243688867153, + "language_loss": 0.77567977, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79796684, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1929, + "time_per_iteration": 2.489664316177368 + }, + { + "auxiliary_loss_clip": 0.01168882, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.05385804, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6293868207273203, + "language_loss": 0.76724243, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78955561, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1484375, + "step": 1930, + "time_per_iteration": 2.5867533683776855 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03027928, + "balance_loss_mlp": 1.05313969, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9270697111110349, + "language_loss": 0.72114342, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74335825, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1931, + "time_per_iteration": 2.5218429565429688 + }, + { + "auxiliary_loss_clip": 0.01173627, + "auxiliary_loss_mlp": 0.0105412, + "balance_loss_clip": 1.03168511, + "balance_loss_mlp": 1.05528855, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5295866923660926, + "language_loss": 0.82133794, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84361541, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 1932, + "time_per_iteration": 2.4879212379455566 + }, + { + "auxiliary_loss_clip": 0.01053319, + "auxiliary_loss_mlp": 0.01007659, + "balance_loss_clip": 1.00539386, + "balance_loss_mlp": 1.0111897, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7701959329661775, + "language_loss": 0.61053753, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63114727, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.421875, + "step": 1933, + "time_per_iteration": 2.960437059402466 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03248382, + "balance_loss_mlp": 1.05259895, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.2263920275904425, + "language_loss": 0.85587192, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87813795, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1953125, + "step": 1934, + "time_per_iteration": 5.3810875415802 + }, + { + "auxiliary_loss_clip": 0.01178805, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.05852652, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.481370623449466, + "language_loss": 0.65555394, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67793274, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1935, + "time_per_iteration": 2.483814239501953 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.05533004, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8046174937009931, + "language_loss": 0.75469184, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77699012, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.2109375, + "step": 1936, + "time_per_iteration": 3.8786003589630127 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.05320179, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9600676544166102, + "language_loss": 0.84061754, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86291301, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1953125, + "step": 1937, + "time_per_iteration": 2.5084798336029053 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.05254185, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.0067941571917927, + "language_loss": 0.76479459, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78692102, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.140625, + "step": 1938, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01177239, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.02963328, + "balance_loss_mlp": 1.05566061, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.0274312317590084, + "language_loss": 0.79127967, + "learning_rate": 3.921882769138696e-06, + "loss": 0.8135649, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1939, + "time_per_iteration": 2.5020864009857178 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.05530274, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.7077039427391343, + "language_loss": 0.86712289, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88937664, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1940, + "time_per_iteration": 2.484750270843506 + }, + { + "auxiliary_loss_clip": 0.01172427, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.03699601, + "balance_loss_mlp": 1.05674481, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4506595925957548, + "language_loss": 0.75750297, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7798053, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1941, + "time_per_iteration": 2.7000842094421387 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.05215478, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.1675787105273256, + "language_loss": 0.8828994, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90516704, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.15625, + "step": 1942, + "time_per_iteration": 2.460014581680298 + }, + { + "auxiliary_loss_clip": 0.01170106, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.02839422, + "balance_loss_mlp": 1.05465341, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.688985931696262, + "language_loss": 0.67729998, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69948429, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.15625, + "step": 1943, + "time_per_iteration": 2.5899837017059326 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01046897, + "balance_loss_clip": 1.02586865, + "balance_loss_mlp": 1.05437136, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.2767867948110263, + "language_loss": 0.69852126, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72069371, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1944, + "time_per_iteration": 2.6237125396728516 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.05112338, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.1059371232711572, + "language_loss": 0.82477605, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84690094, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.15625, + "step": 1945, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.05241919, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.378189536328268, + "language_loss": 0.7640717, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.7863518, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 1946, + "time_per_iteration": 2.516782283782959 + }, + { + "auxiliary_loss_clip": 0.01169578, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03249717, + "balance_loss_mlp": 1.05597568, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.040115867247402, + "language_loss": 0.68749321, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70971209, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 1947, + "time_per_iteration": 2.443979501724243 + }, + { + "auxiliary_loss_clip": 0.01173266, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.05761504, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.892409556337103, + "language_loss": 0.84730887, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86967146, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 1948, + "time_per_iteration": 2.456883192062378 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 0.99784815, + "balance_loss_mlp": 1.01743388, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8146373030628324, + "language_loss": 0.65102834, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6716392, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.43359375, + "step": 1949, + "time_per_iteration": 3.083716630935669 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.05524707, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.7265339558443402, + "language_loss": 0.71616268, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73841476, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1328125, + "step": 1950, + "time_per_iteration": 2.5140750408172607 + }, + { + "auxiliary_loss_clip": 0.01174036, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.05524027, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 13.047142281747327, + "language_loss": 0.76811576, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79044604, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1951, + "time_per_iteration": 2.4511098861694336 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.0106409, + "balance_loss_clip": 1.04351449, + "balance_loss_mlp": 1.05736876, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 2.4689531190361858, + "language_loss": 0.75770319, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1952, + "time_per_iteration": 2.5249404907226562 + }, + { + "auxiliary_loss_clip": 0.01170041, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.05350161, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8929141854364566, + "language_loss": 0.71838403, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74068928, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1953, + "time_per_iteration": 2.5321006774902344 + }, + { + "auxiliary_loss_clip": 0.01178671, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.03186345, + "balance_loss_mlp": 1.05794597, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.5505654209141317, + "language_loss": 0.7939415, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 1954, + "time_per_iteration": 2.477182149887085 + }, + { + "auxiliary_loss_clip": 0.01174109, + "auxiliary_loss_mlp": 0.01060284, + "balance_loss_clip": 1.03859961, + "balance_loss_mlp": 1.05628419, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1305529461824344, + "language_loss": 0.85609406, + "learning_rate": 3.920148894924246e-06, + "loss": 0.878438, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1796875, + "step": 1955, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.02949762, + "balance_loss_mlp": 1.05551839, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.149612339355701, + "language_loss": 0.77626467, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79848516, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.171875, + "step": 1956, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.01169266, + "auxiliary_loss_mlp": 0.01054147, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.05667603, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 4.253665449575931, + "language_loss": 0.80333984, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 1957, + "time_per_iteration": 2.508272886276245 + }, + { + "auxiliary_loss_clip": 0.01176684, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.05895627, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 3.1587185145349737, + "language_loss": 0.77638769, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79865301, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1796875, + "step": 1958, + "time_per_iteration": 2.48563551902771 + }, + { + "auxiliary_loss_clip": 0.01174636, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.05859971, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0966272081131985, + "language_loss": 0.76906043, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79128981, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.15625, + "step": 1959, + "time_per_iteration": 2.4826674461364746 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_clip": 1.03128934, + "balance_loss_mlp": 1.05581582, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 3.13785825532277, + "language_loss": 0.69989765, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72212446, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.15625, + "step": 1960, + "time_per_iteration": 2.4965405464172363 + }, + { + "auxiliary_loss_clip": 0.01178622, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.03704309, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.5802576751796327, + "language_loss": 0.81135678, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83372575, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1961, + "time_per_iteration": 2.456537961959839 + }, + { + "auxiliary_loss_clip": 0.01167569, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.05682623, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 3.5009623449342206, + "language_loss": 0.92335653, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94558799, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.109375, + "step": 1962, + "time_per_iteration": 2.4831955432891846 + }, + { + "auxiliary_loss_clip": 0.01175087, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.05849361, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1891263418172353, + "language_loss": 0.87132198, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89361322, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1640625, + "step": 1963, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02764392, + "balance_loss_mlp": 1.05800569, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.1122466665000155, + "language_loss": 0.84163988, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86385566, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1640625, + "step": 1964, + "time_per_iteration": 2.496471405029297 + }, + { + "auxiliary_loss_clip": 0.01178376, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.06327403, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.965243610427017, + "language_loss": 0.82994169, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85229176, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1484375, + "step": 1965, + "time_per_iteration": 2.46545672416687 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05948591, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6968751772896917, + "language_loss": 0.74517393, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76741493, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 1966, + "time_per_iteration": 2.730928421020508 + }, + { + "auxiliary_loss_clip": 0.01185811, + "auxiliary_loss_mlp": 0.01055482, + "balance_loss_clip": 1.03552604, + "balance_loss_mlp": 1.0661025, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.573953561090722, + "language_loss": 0.725128, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74754095, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1953125, + "step": 1967, + "time_per_iteration": 2.459409713745117 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.02409899, + "balance_loss_mlp": 1.0596199, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.07735233424318, + "language_loss": 0.87874025, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90092969, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1968, + "time_per_iteration": 2.474860191345215 + }, + { + "auxiliary_loss_clip": 0.0117476, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.05980873, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.3710109771053904, + "language_loss": 0.66827953, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69053805, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1484375, + "step": 1969, + "time_per_iteration": 2.5025057792663574 + }, + { + "auxiliary_loss_clip": 0.01177024, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.03172874, + "balance_loss_mlp": 1.06375933, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0668162562591013, + "language_loss": 0.81199527, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83428723, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 1970, + "time_per_iteration": 2.6005184650421143 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02306354, + "balance_loss_mlp": 1.02803779, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8059191438251484, + "language_loss": 0.66145539, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68243253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.4375, + "step": 1971, + "time_per_iteration": 3.0580737590789795 + }, + { + "auxiliary_loss_clip": 0.01173379, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03112769, + "balance_loss_mlp": 1.0578413, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9720310647047086, + "language_loss": 0.79760695, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81984764, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 1972, + "time_per_iteration": 2.5330677032470703 + }, + { + "auxiliary_loss_clip": 0.01174806, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 1.06013465, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.451560144092476, + "language_loss": 0.72162819, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74390036, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1973, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.0117035, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02407408, + "balance_loss_mlp": 1.05802357, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.2680636805256897, + "language_loss": 0.71724641, + "learning_rate": 3.918065710622832e-06, + "loss": 0.73938787, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 1974, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01937568, + "balance_loss_mlp": 1.05660915, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.192039880981389, + "language_loss": 0.77186036, + "learning_rate": 3.917955341761128e-06, + "loss": 0.7939533, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.140625, + "step": 1975, + "time_per_iteration": 2.4483766555786133 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.03212273, + "balance_loss_mlp": 1.06021976, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.2667330410251596, + "language_loss": 0.7498399, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77208138, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.125, + "step": 1976, + "time_per_iteration": 3.9421374797821045 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02544367, + "balance_loss_mlp": 1.05979395, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6192257034176818, + "language_loss": 0.75191766, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77408761, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.125, + "step": 1977, + "time_per_iteration": 3.9506070613861084 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.05777454, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 7.387040580957373, + "language_loss": 0.7393533, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76161528, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.15625, + "step": 1978, + "time_per_iteration": 2.4754912853240967 + }, + { + "auxiliary_loss_clip": 0.01168854, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.05782461, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.709416576437117, + "language_loss": 0.73273945, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75491059, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 1979, + "time_per_iteration": 2.478938579559326 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.05735934, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.877436937799078, + "language_loss": 0.98387957, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00608468, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1484375, + "step": 1980, + "time_per_iteration": 2.5758843421936035 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.02580202, + "balance_loss_mlp": 1.05741775, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8930015682875676, + "language_loss": 0.85929906, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88150084, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1981, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01057237, + "balance_loss_clip": 1.03601766, + "balance_loss_mlp": 1.057832, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9242535829958574, + "language_loss": 0.85007018, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87236911, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1982, + "time_per_iteration": 2.513012409210205 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02667475, + "balance_loss_mlp": 1.05463564, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.926275276354154, + "language_loss": 0.85026526, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87239939, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 1983, + "time_per_iteration": 2.4627623558044434 + }, + { + "auxiliary_loss_clip": 0.01172266, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05581713, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.2679367356540894, + "language_loss": 0.77020949, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79243064, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1984, + "time_per_iteration": 2.466224193572998 + }, + { + "auxiliary_loss_clip": 0.01168386, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.03542554, + "balance_loss_mlp": 1.05464029, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7272493982968635, + "language_loss": 0.83323789, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85547268, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 1985, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01162278, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.02659011, + "balance_loss_mlp": 1.05230284, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9847962315308523, + "language_loss": 0.7379061, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75999391, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1015625, + "step": 1986, + "time_per_iteration": 2.4477651119232178 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03664303, + "balance_loss_mlp": 1.05418456, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0940320364759573, + "language_loss": 0.7209813, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74321216, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.125, + "step": 1987, + "time_per_iteration": 2.528564929962158 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.03256774, + "balance_loss_mlp": 1.05243921, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.544292945564917, + "language_loss": 0.72455966, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74676454, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1988, + "time_per_iteration": 2.482295274734497 + }, + { + "auxiliary_loss_clip": 0.01168039, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.05425191, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.3919568417846544, + "language_loss": 0.80848205, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83079755, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 1989, + "time_per_iteration": 2.5321335792541504 + }, + { + "auxiliary_loss_clip": 0.01171171, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.05518925, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7848130249027077, + "language_loss": 0.76000333, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78222507, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1990, + "time_per_iteration": 2.4608383178710938 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.0101675, + "balance_loss_clip": 1.01392448, + "balance_loss_mlp": 1.01813149, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8877551125762418, + "language_loss": 0.55219597, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57296449, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.41992188, + "step": 1991, + "time_per_iteration": 3.0575883388519287 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01048947, + "balance_loss_clip": 1.02905095, + "balance_loss_mlp": 1.05472517, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2244739837006797, + "language_loss": 0.78156978, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8037256, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1171875, + "step": 1992, + "time_per_iteration": 2.5395517349243164 + }, + { + "auxiliary_loss_clip": 0.01170251, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02819777, + "balance_loss_mlp": 1.0534482, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.898510109378507, + "language_loss": 0.78694016, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1993, + "time_per_iteration": 2.5264625549316406 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.05150676, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.7590613991113047, + "language_loss": 0.82287014, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8451097, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 1994, + "time_per_iteration": 2.4871127605438232 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.03551149, + "balance_loss_mlp": 1.05389762, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1035856813409786, + "language_loss": 0.87953222, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9017483, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.125, + "step": 1995, + "time_per_iteration": 2.46690034866333 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.05346155, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.783456627489481, + "language_loss": 0.74206698, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76433849, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1996, + "time_per_iteration": 2.5115768909454346 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.03138888, + "balance_loss_mlp": 1.05337763, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9342712291191904, + "language_loss": 0.88266122, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90486217, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 1997, + "time_per_iteration": 2.4716532230377197 + }, + { + "auxiliary_loss_clip": 0.01167703, + "auxiliary_loss_mlp": 0.01063842, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.05315256, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 3.8633631849497054, + "language_loss": 0.78929418, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81160963, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1998, + "time_per_iteration": 2.4798996448516846 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.05610394, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.053047413592738, + "language_loss": 0.73435485, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75654793, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1999, + "time_per_iteration": 2.5017611980438232 + }, + { + "auxiliary_loss_clip": 0.01169689, + "auxiliary_loss_mlp": 0.01065385, + "balance_loss_clip": 1.04436839, + "balance_loss_mlp": 1.05347967, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 3.6093884580795677, + "language_loss": 0.74955112, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77190185, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 2000, + "time_per_iteration": 2.5060245990753174 + }, + { + "auxiliary_loss_clip": 0.01170552, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.02997398, + "balance_loss_mlp": 1.05408299, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5368563042333518, + "language_loss": 0.84667969, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86889356, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 2001, + "time_per_iteration": 2.499922752380371 + }, + { + "auxiliary_loss_clip": 0.01168457, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.03176236, + "balance_loss_mlp": 1.05330753, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.074842616733997, + "language_loss": 0.73982531, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76202351, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 2002, + "time_per_iteration": 2.486853837966919 + }, + { + "auxiliary_loss_clip": 0.01175825, + "auxiliary_loss_mlp": 0.01058049, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.05508709, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.832741043586106, + "language_loss": 0.78091669, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80325544, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 2003, + "time_per_iteration": 2.4740982055664062 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.0521121, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.9652989098821625, + "language_loss": 0.72093791, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74310923, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2004, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.03877997, + "balance_loss_mlp": 1.0546937, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.2150760255497945, + "language_loss": 0.78260767, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80496937, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 2005, + "time_per_iteration": 2.4991190433502197 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.01496482, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9233110616682776, + "language_loss": 0.58020771, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60082525, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.8520798683166504 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 1.05345094, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7247761793975513, + "language_loss": 0.76275218, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78490144, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.125, + "step": 2007, + "time_per_iteration": 2.50325083732605 + }, + { + "auxiliary_loss_clip": 0.01170732, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.05348623, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.332475401193337, + "language_loss": 0.82973194, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85202336, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2008, + "time_per_iteration": 2.4650609493255615 + }, + { + "auxiliary_loss_clip": 0.0116834, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.03319979, + "balance_loss_mlp": 1.05225682, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.236244219024357, + "language_loss": 0.84184098, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86406672, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2009, + "time_per_iteration": 2.4602744579315186 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.01053411, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.0551877, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7312486930792712, + "language_loss": 0.83945864, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86169434, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.1484375, + "step": 2010, + "time_per_iteration": 2.480238437652588 + }, + { + "auxiliary_loss_clip": 0.01171814, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.03437304, + "balance_loss_mlp": 1.05634403, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.658807365911602, + "language_loss": 0.84157598, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8638559, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 2011, + "time_per_iteration": 2.454406499862671 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03386891, + "balance_loss_mlp": 1.055547, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.879921554869875, + "language_loss": 0.96007967, + "learning_rate": 3.913820600882834e-06, + "loss": 0.9823519, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.171875, + "step": 2012, + "time_per_iteration": 2.479583740234375 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.026914, + "balance_loss_mlp": 1.05365777, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.6055417591736036, + "language_loss": 0.80619711, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82833993, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2013, + "time_per_iteration": 2.538651704788208 + }, + { + "auxiliary_loss_clip": 0.01172968, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.05412138, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.9791821612033953, + "language_loss": 0.77157021, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79376847, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 2014, + "time_per_iteration": 2.4411396980285645 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.02509499, + "balance_loss_mlp": 1.05448556, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.028780359370303, + "language_loss": 0.86930937, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89146852, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2015, + "time_per_iteration": 2.4546844959259033 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.0268662, + "balance_loss_mlp": 1.04779112, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.0866681231001762, + "language_loss": 0.69274801, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71481836, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2016, + "time_per_iteration": 2.469177007675171 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.02042413, + "balance_loss_mlp": 1.05407953, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 3.095255398319528, + "language_loss": 0.80049825, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82262057, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.15625, + "step": 2017, + "time_per_iteration": 2.459447145462036 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.0292666, + "balance_loss_mlp": 1.05315137, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.364451122732105, + "language_loss": 0.69343489, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71563143, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2018, + "time_per_iteration": 3.919508695602417 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.05712008, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.162901456551013, + "language_loss": 0.72318506, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74541652, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 2019, + "time_per_iteration": 3.910888433456421 + }, + { + "auxiliary_loss_clip": 0.01168573, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.03615856, + "balance_loss_mlp": 1.05187333, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8061721544245042, + "language_loss": 0.92484713, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94711161, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2020, + "time_per_iteration": 2.5007998943328857 + }, + { + "auxiliary_loss_clip": 0.01168404, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.05388308, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9478588429028871, + "language_loss": 0.77149868, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79369152, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2021, + "time_per_iteration": 2.522216796875 + }, + { + "auxiliary_loss_clip": 0.01165897, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.02586901, + "balance_loss_mlp": 1.05312037, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0305604143992944, + "language_loss": 0.80324662, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82537007, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2022, + "time_per_iteration": 2.518737316131592 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01051897, + "balance_loss_clip": 1.03094029, + "balance_loss_mlp": 1.057019, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9019957932594662, + "language_loss": 0.8458122, + "learning_rate": 3.912572184769108e-06, + "loss": 0.86806649, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2023, + "time_per_iteration": 2.4534339904785156 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.02916241, + "balance_loss_mlp": 1.05421007, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2004951084054234, + "language_loss": 0.85155022, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87374109, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 2024, + "time_per_iteration": 2.436833143234253 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.02974951, + "balance_loss_mlp": 1.04884946, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.043367551334066, + "language_loss": 0.71662712, + "learning_rate": 3.912344257028954e-06, + "loss": 0.73876667, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.15625, + "step": 2025, + "time_per_iteration": 2.541215658187866 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.02564383, + "balance_loss_mlp": 1.05309796, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.0848974538483755, + "language_loss": 0.75976777, + "learning_rate": 3.912230184382286e-06, + "loss": 0.7819097, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2026, + "time_per_iteration": 2.529049873352051 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.05251837, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.6572777094172597, + "language_loss": 0.88875067, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9108817, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2027, + "time_per_iteration": 2.472158432006836 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03375518, + "balance_loss_mlp": 1.05316114, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 2.343330799439898, + "language_loss": 0.75515145, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77732611, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.125, + "step": 2028, + "time_per_iteration": 2.5286035537719727 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.05089998, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.270604294931249, + "language_loss": 0.766294, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78852487, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2029, + "time_per_iteration": 2.479799747467041 + }, + { + "auxiliary_loss_clip": 0.0116289, + "auxiliary_loss_mlp": 0.01051159, + "balance_loss_clip": 1.03113246, + "balance_loss_mlp": 1.05001879, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.2290592341985747, + "language_loss": 0.7955277, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81766814, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.125, + "step": 2030, + "time_per_iteration": 2.479250431060791 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.03301597, + "balance_loss_mlp": 1.0526309, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9595633959777694, + "language_loss": 0.74556369, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2031, + "time_per_iteration": 2.4966888427734375 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.0269599, + "balance_loss_mlp": 1.05319047, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.9774178696035418, + "language_loss": 0.75045705, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.1328125, + "step": 2032, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.04844511, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6143118682838826, + "language_loss": 0.88853258, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91053319, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0859375, + "step": 2033, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01170793, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03059459, + "balance_loss_mlp": 1.05660009, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1152048244965096, + "language_loss": 0.65517056, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67738092, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 2034, + "time_per_iteration": 2.4647884368896484 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.05399358, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.59634219760927, + "language_loss": 0.76435542, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78657782, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2035, + "time_per_iteration": 2.483016014099121 + }, + { + "auxiliary_loss_clip": 0.01169828, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.03104889, + "balance_loss_mlp": 1.0543201, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8316823187763973, + "language_loss": 0.71407682, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73628777, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2036, + "time_per_iteration": 2.476471185684204 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.0309397, + "balance_loss_mlp": 1.05532706, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.632988910709452, + "language_loss": 0.83352619, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85572863, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2037, + "time_per_iteration": 2.476040840148926 + }, + { + "auxiliary_loss_clip": 0.0117386, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.03625405, + "balance_loss_mlp": 1.05652785, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.722283338591856, + "language_loss": 0.80255699, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82487655, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2038, + "time_per_iteration": 2.5043163299560547 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.0051837, + "balance_loss_mlp": 1.01638949, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.832889593555193, + "language_loss": 0.58671033, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60737002, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.421875, + "step": 2039, + "time_per_iteration": 2.9495608806610107 + }, + { + "auxiliary_loss_clip": 0.01172242, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.0277977, + "balance_loss_mlp": 1.05559754, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.6229044060505298, + "language_loss": 0.80485016, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82706642, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.171875, + "step": 2040, + "time_per_iteration": 2.4483039379119873 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.05270815, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8235003945490114, + "language_loss": 0.82753873, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84970617, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2041, + "time_per_iteration": 2.4804372787475586 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.05399048, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7522185366152092, + "language_loss": 0.66806722, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69026893, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2042, + "time_per_iteration": 2.4683480262756348 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.03032589, + "balance_loss_mlp": 1.05184031, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.8478924147346443, + "language_loss": 0.81661081, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83877933, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2043, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01049773, + "balance_loss_clip": 1.02792168, + "balance_loss_mlp": 1.05028844, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.0920421188484095, + "language_loss": 0.8049221, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82708442, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 2044, + "time_per_iteration": 2.45843768119812 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.02577674, + "balance_loss_mlp": 1.05169511, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7057283877293323, + "language_loss": 0.7796452, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8017351, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.109375, + "step": 2045, + "time_per_iteration": 2.5117220878601074 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.04210341, + "balance_loss_mlp": 1.05461311, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.0659302798736436, + "language_loss": 0.67135215, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69371116, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 2046, + "time_per_iteration": 2.466304063796997 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03793919, + "balance_loss_mlp": 1.05408335, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3143924335245654, + "language_loss": 0.72491664, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7471717, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2047, + "time_per_iteration": 2.4625275135040283 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.0106421, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.05105257, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6458989790549132, + "language_loss": 0.76394033, + "learning_rate": 3.909702248319597e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2048, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.05322123, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.118548028298143, + "language_loss": 0.84626836, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86841822, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.125, + "step": 2049, + "time_per_iteration": 2.538325071334839 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.05051267, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 3.176509780932849, + "language_loss": 0.75351131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77569222, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.140625, + "step": 2050, + "time_per_iteration": 2.499915599822998 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054604, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.05127048, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.9728027261326873, + "language_loss": 0.80877042, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83097064, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.140625, + "step": 2051, + "time_per_iteration": 2.5018789768218994 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.03338933, + "balance_loss_mlp": 1.05348301, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7756923294305167, + "language_loss": 0.79991698, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82209337, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.109375, + "step": 2052, + "time_per_iteration": 2.4962196350097656 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.03793955, + "balance_loss_mlp": 1.0515492, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.071130498978609, + "language_loss": 0.73757279, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75983769, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2053, + "time_per_iteration": 2.4748997688293457 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.03337085, + "balance_loss_mlp": 1.04912996, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.5139588428492408, + "language_loss": 0.73835206, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76054543, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2054, + "time_per_iteration": 2.7009665966033936 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02381933, + "balance_loss_mlp": 1.04980421, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.0020033330801863, + "language_loss": 0.85107529, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87311363, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.1171875, + "step": 2055, + "time_per_iteration": 2.5038392543792725 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.03445673, + "balance_loss_mlp": 1.05093932, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9818000135561404, + "language_loss": 0.77465194, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79683125, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.140625, + "step": 2056, + "time_per_iteration": 2.5265629291534424 + }, + { + "auxiliary_loss_clip": 0.01162241, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.04937708, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9976131339644834, + "language_loss": 0.83188522, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85405934, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2057, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.0116756, + "auxiliary_loss_mlp": 0.01053922, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.05169332, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.751792200322901, + "language_loss": 0.78356105, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80577588, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2058, + "time_per_iteration": 2.5236053466796875 + }, + { + "auxiliary_loss_clip": 0.01167574, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.03086066, + "balance_loss_mlp": 1.05105174, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.1327254817813124, + "language_loss": 0.83191061, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85410988, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2059, + "time_per_iteration": 5.313246726989746 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.0344671, + "balance_loss_mlp": 1.05206418, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.990324814625926, + "language_loss": 0.81387389, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83613217, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 2060, + "time_per_iteration": 3.8617331981658936 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.02928221, + "balance_loss_mlp": 1.04859161, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.0129231677956105, + "language_loss": 0.86278749, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88492751, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2061, + "time_per_iteration": 2.4531033039093018 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.05163288, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.146204871859891, + "language_loss": 0.84992719, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87201917, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 2062, + "time_per_iteration": 2.475050449371338 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01057701, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.05348217, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.194910982672458, + "language_loss": 0.78651118, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80875909, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2063, + "time_per_iteration": 2.4638655185699463 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.05330634, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.133219584666701, + "language_loss": 0.79411167, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81636381, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1328125, + "step": 2064, + "time_per_iteration": 2.4441418647766113 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03083503, + "balance_loss_mlp": 1.04955256, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.2298036351802533, + "language_loss": 0.92358226, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9457252, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2065, + "time_per_iteration": 2.4909794330596924 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02880335, + "balance_loss_mlp": 1.05061674, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7134253508315578, + "language_loss": 0.8042016, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82636184, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.15625, + "step": 2066, + "time_per_iteration": 2.484276056289673 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00111723, + "balance_loss_mlp": 1.01144505, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8687209975293121, + "language_loss": 0.63275361, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65331256, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.41015625, + "step": 2067, + "time_per_iteration": 3.0286524295806885 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03103137, + "balance_loss_mlp": 1.05087852, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9774411847970965, + "language_loss": 0.93209147, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95427418, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.15625, + "step": 2068, + "time_per_iteration": 2.4971697330474854 + }, + { + "auxiliary_loss_clip": 0.01167817, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.053213, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9835561743386452, + "language_loss": 0.81277847, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83494884, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.1484375, + "step": 2069, + "time_per_iteration": 2.4772391319274902 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.03519261, + "balance_loss_mlp": 1.05177176, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.606173275168009, + "language_loss": 0.77390277, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79612398, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2070, + "time_per_iteration": 2.4962410926818848 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.05637431, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.418044156181854, + "language_loss": 0.80847198, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83066666, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1640625, + "step": 2071, + "time_per_iteration": 2.452148199081421 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.03063262, + "balance_loss_mlp": 1.05134583, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.802846280579791, + "language_loss": 0.77933639, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80147374, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2072, + "time_per_iteration": 2.5763509273529053 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03480363, + "balance_loss_mlp": 1.05423427, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.6278132513508976, + "language_loss": 0.74839735, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77060658, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.125, + "step": 2073, + "time_per_iteration": 2.4676945209503174 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01050014, + "balance_loss_clip": 1.02904546, + "balance_loss_mlp": 1.04915833, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9457561725453951, + "language_loss": 0.90556443, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92768592, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2074, + "time_per_iteration": 2.4873156547546387 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.05183172, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3814572559525877, + "language_loss": 0.83753067, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 2075, + "time_per_iteration": 2.500657320022583 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.05080831, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1638910845289567, + "language_loss": 0.73802024, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76022947, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2076, + "time_per_iteration": 2.5686564445495605 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.05219531, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.967733683791653, + "language_loss": 0.7551648, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77721083, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.09375, + "step": 2077, + "time_per_iteration": 2.489954710006714 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.05015802, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 4.043491061132511, + "language_loss": 0.82077563, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84293842, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1328125, + "step": 2078, + "time_per_iteration": 2.445270299911499 + }, + { + "auxiliary_loss_clip": 0.01168396, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.05372512, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.023726857078381, + "language_loss": 0.75024784, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2079, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01173002, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.05697465, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9314739831996124, + "language_loss": 0.83961046, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86190951, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2080, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.05275226, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.0357346796271307, + "language_loss": 0.84575123, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8679868, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1328125, + "step": 2081, + "time_per_iteration": 2.4380433559417725 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.05154538, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.660916229819668, + "language_loss": 0.76882648, + "learning_rate": 3.905726514814646e-06, + "loss": 0.790923, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2082, + "time_per_iteration": 2.454939842224121 + }, + { + "auxiliary_loss_clip": 0.01182882, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.03117347, + "balance_loss_mlp": 1.06035674, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.833832134330164, + "language_loss": 0.78994107, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81229836, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2265625, + "step": 2083, + "time_per_iteration": 2.4439167976379395 + }, + { + "auxiliary_loss_clip": 0.01168103, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.02855682, + "balance_loss_mlp": 1.05132031, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.376124844090109, + "language_loss": 0.89690113, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2084, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.05379784, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9059704425119062, + "language_loss": 0.79718572, + "learning_rate": 3.905371701516869e-06, + "loss": 0.81937099, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1484375, + "step": 2085, + "time_per_iteration": 2.5295538902282715 + }, + { + "auxiliary_loss_clip": 0.0116658, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.05235541, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.9580642243137214, + "language_loss": 0.88227898, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90446126, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2086, + "time_per_iteration": 2.4508614540100098 + }, + { + "auxiliary_loss_clip": 0.01162238, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.02541506, + "balance_loss_mlp": 1.05238986, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.3707303368435957, + "language_loss": 0.87088495, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89295745, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2087, + "time_per_iteration": 2.4342494010925293 + }, + { + "auxiliary_loss_clip": 0.01166252, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.03444421, + "balance_loss_mlp": 1.05230761, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 3.239876707553976, + "language_loss": 0.73480451, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75703704, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.140625, + "step": 2088, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01004858, + "balance_loss_clip": 1.00259304, + "balance_loss_mlp": 1.01231122, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.759594920780347, + "language_loss": 0.61699253, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63757795, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.4140625, + "step": 2089, + "time_per_iteration": 3.0373222827911377 + }, + { + "auxiliary_loss_clip": 0.01165987, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.05317736, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.0159960445234746, + "language_loss": 0.78266793, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80490106, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.125, + "step": 2090, + "time_per_iteration": 2.5307860374450684 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005786, + "balance_loss_clip": 1.00381935, + "balance_loss_mlp": 1.01062346, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.749206069507312, + "language_loss": 0.59394926, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61451876, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.40625, + "step": 2091, + "time_per_iteration": 2.976081609725952 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03016472, + "balance_loss_mlp": 1.0538522, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8692826570762828, + "language_loss": 0.63588953, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6580565, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2092, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.05095637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 3.3800613541528257, + "language_loss": 0.80149096, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82378066, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1875, + "step": 2093, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.05323935, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7179534274341421, + "language_loss": 0.75928843, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78160632, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2094, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01163905, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.03322637, + "balance_loss_mlp": 1.05116057, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.654740537988477, + "language_loss": 0.76833487, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79050487, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2095, + "time_per_iteration": 2.669593095779419 + }, + { + "auxiliary_loss_clip": 0.01166425, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.04330409, + "balance_loss_mlp": 1.05012596, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.7658625824396568, + "language_loss": 0.8312341, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85354173, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2096, + "time_per_iteration": 2.446169853210449 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.05236387, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.9365429623482773, + "language_loss": 0.7532599, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77547324, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 2097, + "time_per_iteration": 2.46520733833313 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.0105919, + "balance_loss_clip": 1.0399375, + "balance_loss_mlp": 1.05366278, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0415683165998004, + "language_loss": 0.8696878, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89196146, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1484375, + "step": 2098, + "time_per_iteration": 2.488985061645508 + }, + { + "auxiliary_loss_clip": 0.01171506, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_clip": 1.03984964, + "balance_loss_mlp": 1.05263424, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8810788789855342, + "language_loss": 0.69538295, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71773493, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.1875, + "step": 2099, + "time_per_iteration": 2.4791061878204346 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01058165, + "balance_loss_clip": 1.03538442, + "balance_loss_mlp": 1.05016196, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.489186386071109, + "language_loss": 0.81622505, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83848113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2100, + "time_per_iteration": 2.4970083236694336 + }, + { + "auxiliary_loss_clip": 0.01170444, + "auxiliary_loss_mlp": 0.01056399, + "balance_loss_clip": 1.03558493, + "balance_loss_mlp": 1.05375385, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 21.240028764463403, + "language_loss": 0.80653214, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82880062, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1640625, + "step": 2101, + "time_per_iteration": 5.441275596618652 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01009667, + "balance_loss_clip": 1.00753367, + "balance_loss_mlp": 1.01423335, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7055092704674581, + "language_loss": 0.57077372, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59140933, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.39648438, + "step": 2102, + "time_per_iteration": 4.4595959186553955 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.05443108, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9163731362545673, + "language_loss": 0.93033105, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9526242, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 2103, + "time_per_iteration": 2.4612908363342285 + }, + { + "auxiliary_loss_clip": 0.01160763, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01929784, + "balance_loss_mlp": 1.05146646, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.70771861982282, + "language_loss": 0.7804687, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80246699, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2104, + "time_per_iteration": 2.556351661682129 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.01056721, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.05698192, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9983303318130716, + "language_loss": 0.81274837, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83504581, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 2105, + "time_per_iteration": 2.4998059272766113 + }, + { + "auxiliary_loss_clip": 0.01177911, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.05756688, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.6618923007939728, + "language_loss": 0.83258855, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85494161, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 2106, + "time_per_iteration": 2.4816856384277344 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.02755296, + "balance_loss_mlp": 1.05664992, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.017673348074064, + "language_loss": 0.73717511, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75936514, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2107, + "time_per_iteration": 2.503575325012207 + }, + { + "auxiliary_loss_clip": 0.01166119, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.02683651, + "balance_loss_mlp": 1.05330598, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.8409726657459213, + "language_loss": 0.79492414, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81705213, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2108, + "time_per_iteration": 2.448009967803955 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.02635407, + "balance_loss_mlp": 1.05213785, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.700834997101356, + "language_loss": 0.75458848, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77675259, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2109, + "time_per_iteration": 2.463996171951294 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.02641523, + "balance_loss_mlp": 1.05309939, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 5.620565406896926, + "language_loss": 0.82876229, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85087943, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2110, + "time_per_iteration": 2.4536476135253906 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03271818, + "balance_loss_mlp": 1.0524385, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8108257578185059, + "language_loss": 0.78553301, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80775553, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.140625, + "step": 2111, + "time_per_iteration": 2.4898500442504883 + }, + { + "auxiliary_loss_clip": 0.01178398, + "auxiliary_loss_mlp": 0.01069762, + "balance_loss_clip": 1.04634905, + "balance_loss_mlp": 1.05599511, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2255287569010567, + "language_loss": 0.76852119, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.2265625, + "step": 2112, + "time_per_iteration": 2.534062623977661 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.05138493, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.901101750436338, + "language_loss": 0.85764933, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 2113, + "time_per_iteration": 2.4980924129486084 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.05287683, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.270588429793272, + "language_loss": 0.74000478, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76224494, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1328125, + "step": 2114, + "time_per_iteration": 2.422631025314331 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.03504217, + "balance_loss_mlp": 1.05601084, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7902572486589996, + "language_loss": 0.83236456, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85464966, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.1484375, + "step": 2115, + "time_per_iteration": 2.4601340293884277 + }, + { + "auxiliary_loss_clip": 0.01169954, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.05397201, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.532692301262898, + "language_loss": 0.86615002, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88845563, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2116, + "time_per_iteration": 2.5315732955932617 + }, + { + "auxiliary_loss_clip": 0.01164638, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.03062534, + "balance_loss_mlp": 1.05188024, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.8525451323112498, + "language_loss": 0.70492947, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72708428, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2117, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01168229, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_clip": 1.03346658, + "balance_loss_mlp": 1.05461121, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.4058915352959294, + "language_loss": 0.86858076, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89081407, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2118, + "time_per_iteration": 2.4760360717773438 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.0279547, + "balance_loss_mlp": 1.0518508, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7933295144796901, + "language_loss": 0.87325591, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89538383, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2119, + "time_per_iteration": 2.547213315963745 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.03024805, + "balance_loss_mlp": 1.05369782, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4444945117671018, + "language_loss": 0.8769815, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89917719, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2120, + "time_per_iteration": 2.4568872451782227 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.02553487, + "balance_loss_mlp": 1.05405664, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8558714180118523, + "language_loss": 0.75193042, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77408671, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2121, + "time_per_iteration": 2.508117437362671 + }, + { + "auxiliary_loss_clip": 0.01167335, + "auxiliary_loss_mlp": 0.01050063, + "balance_loss_clip": 1.02895081, + "balance_loss_mlp": 1.05228865, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.458066848563671, + "language_loss": 0.8294577, + "learning_rate": 3.900942242309978e-06, + "loss": 0.8516317, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2122, + "time_per_iteration": 2.4878990650177 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.02924609, + "balance_loss_mlp": 1.05379128, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.1208761223769375, + "language_loss": 0.79040462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81259328, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2123, + "time_per_iteration": 2.512085199356079 + }, + { + "auxiliary_loss_clip": 0.0117181, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.03411841, + "balance_loss_mlp": 1.05565643, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7846776317234667, + "language_loss": 0.79227948, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81456017, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 2124, + "time_per_iteration": 2.4865264892578125 + }, + { + "auxiliary_loss_clip": 0.01168084, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03267348, + "balance_loss_mlp": 1.05149364, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.8175561910153215, + "language_loss": 0.75565529, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77787793, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2125, + "time_per_iteration": 2.514455795288086 + }, + { + "auxiliary_loss_clip": 0.01166899, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.02645469, + "balance_loss_mlp": 1.05262208, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.1990589160087493, + "language_loss": 0.77811432, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2126, + "time_per_iteration": 2.556657075881958 + }, + { + "auxiliary_loss_clip": 0.01167875, + "auxiliary_loss_mlp": 0.01050746, + "balance_loss_clip": 1.03124356, + "balance_loss_mlp": 1.05559683, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.702389562623477, + "language_loss": 0.69255161, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71473777, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2127, + "time_per_iteration": 2.629990339279175 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.00326061, + "balance_loss_mlp": 1.01139402, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8552720802624753, + "language_loss": 0.62738979, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64794946, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.39257812, + "step": 2128, + "time_per_iteration": 3.1237356662750244 + }, + { + "auxiliary_loss_clip": 0.01168478, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.05287039, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.3711218915030368, + "language_loss": 0.77148604, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79365802, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2129, + "time_per_iteration": 2.4499564170837402 + }, + { + "auxiliary_loss_clip": 0.01179121, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02902186, + "balance_loss_mlp": 1.05744195, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.273395516882369, + "language_loss": 0.79321349, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81552559, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.21875, + "step": 2130, + "time_per_iteration": 2.4536893367767334 + }, + { + "auxiliary_loss_clip": 0.0116812, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.05328345, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.267455405666958, + "language_loss": 0.70879477, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73092055, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1484375, + "step": 2131, + "time_per_iteration": 2.514155149459839 + }, + { + "auxiliary_loss_clip": 0.01166691, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.03007698, + "balance_loss_mlp": 1.05375445, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2952793086030376, + "language_loss": 0.72266257, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74484742, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2132, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01163765, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.03396928, + "balance_loss_mlp": 1.05281162, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1162344308699828, + "language_loss": 0.82306767, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84525442, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2133, + "time_per_iteration": 2.488302230834961 + }, + { + "auxiliary_loss_clip": 0.01174206, + "auxiliary_loss_mlp": 0.01061813, + "balance_loss_clip": 1.03871, + "balance_loss_mlp": 1.05329132, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.538367341661163, + "language_loss": 0.79631573, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81867594, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 2134, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_clip": 1.02393413, + "balance_loss_mlp": 1.05650806, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.033800341734765, + "language_loss": 0.83015293, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85233301, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2135, + "time_per_iteration": 2.4743056297302246 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_clip": 1.03842425, + "balance_loss_mlp": 1.05173945, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.9021762622464853, + "language_loss": 0.77293968, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79520839, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.140625, + "step": 2136, + "time_per_iteration": 2.4412362575531006 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 0.99983084, + "balance_loss_mlp": 1.01248765, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8943310105061408, + "language_loss": 0.59115362, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61168963, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.39257812, + "step": 2137, + "time_per_iteration": 3.2407264709472656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.03207743, + "balance_loss_mlp": 1.04970789, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4694787743163404, + "language_loss": 0.81923193, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84140748, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.15625, + "step": 2138, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01170897, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.05353928, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.804990264663657, + "language_loss": 0.79418135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81644583, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.171875, + "step": 2139, + "time_per_iteration": 2.5321907997131348 + }, + { + "auxiliary_loss_clip": 0.01169458, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02488446, + "balance_loss_mlp": 1.05315363, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1742564972583667, + "language_loss": 0.84761363, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.86976337, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1640625, + "step": 2140, + "time_per_iteration": 2.469543933868408 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.02524316, + "balance_loss_mlp": 1.05079114, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.376703775404894, + "language_loss": 0.85850012, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88059902, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2141, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.0116884, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 1.05059922, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.411777854813752, + "language_loss": 0.68245387, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7046324, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1796875, + "step": 2142, + "time_per_iteration": 2.5327556133270264 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02702951, + "balance_loss_mlp": 1.05430341, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0295098459565692, + "language_loss": 0.82883704, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85104507, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2143, + "time_per_iteration": 4.014873743057251 + }, + { + "auxiliary_loss_clip": 0.01171398, + "auxiliary_loss_mlp": 0.01053828, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.05572712, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7367706894947552, + "language_loss": 0.81788546, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84013772, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.15625, + "step": 2144, + "time_per_iteration": 4.002255439758301 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.04864693, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 3.8817809862500727, + "language_loss": 0.78257203, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80476135, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1640625, + "step": 2145, + "time_per_iteration": 2.4952287673950195 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.02825832, + "balance_loss_mlp": 1.05031526, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.1659704609946897, + "language_loss": 0.82622325, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84839463, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 2146, + "time_per_iteration": 2.4898681640625 + }, + { + "auxiliary_loss_clip": 0.01165601, + "auxiliary_loss_mlp": 0.01051615, + "balance_loss_clip": 1.02959681, + "balance_loss_mlp": 1.05129158, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.717320122986492, + "language_loss": 0.70446974, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72664189, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 2147, + "time_per_iteration": 2.5964484214782715 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.03069305, + "balance_loss_mlp": 1.05166912, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.443887417123452, + "language_loss": 0.71685153, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73902297, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.140625, + "step": 2148, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.02681684, + "balance_loss_mlp": 1.05413008, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.666574129953403, + "language_loss": 0.79379606, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81592482, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1171875, + "step": 2149, + "time_per_iteration": 2.495443820953369 + }, + { + "auxiliary_loss_clip": 0.01167493, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.05306077, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.1379132369478313, + "language_loss": 0.76475441, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78689277, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2150, + "time_per_iteration": 2.524395704269409 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.05094671, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.417935370690141, + "language_loss": 0.70735669, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72954249, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1484375, + "step": 2151, + "time_per_iteration": 2.5213184356689453 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02502, + "balance_loss_mlp": 1.05457592, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9866869590783298, + "language_loss": 0.84050369, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86260849, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2152, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.01167192, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.05128813, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.226463520109079, + "language_loss": 0.78646791, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2153, + "time_per_iteration": 2.46975040435791 + }, + { + "auxiliary_loss_clip": 0.01163518, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.05069268, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.482522823334948, + "language_loss": 0.80135351, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82351738, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2154, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01170487, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.05522227, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0988715261553774, + "language_loss": 0.83128881, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85350406, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2155, + "time_per_iteration": 2.476299524307251 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.02961075, + "balance_loss_mlp": 1.05010283, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.842594732542889, + "language_loss": 0.76062953, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2156, + "time_per_iteration": 2.6024632453918457 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.05121815, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.9934077258859366, + "language_loss": 0.86546719, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88760191, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.109375, + "step": 2157, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.03745282, + "balance_loss_mlp": 1.04796743, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.339899004847696, + "language_loss": 0.80590808, + "learning_rate": 3.896537778333651e-06, + "loss": 0.82814288, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2158, + "time_per_iteration": 2.5332443714141846 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.03510916, + "balance_loss_mlp": 1.05294585, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.254282600322574, + "language_loss": 0.74603379, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76828635, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2159, + "time_per_iteration": 2.469038963317871 + }, + { + "auxiliary_loss_clip": 0.01158286, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0323323, + "balance_loss_mlp": 1.04777908, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.1260113568932746, + "language_loss": 0.8227706, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84488213, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2160, + "time_per_iteration": 2.516723155975342 + }, + { + "auxiliary_loss_clip": 0.01159917, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.05318654, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6308358458278915, + "language_loss": 0.81877828, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8408196, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2161, + "time_per_iteration": 2.4677131175994873 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01056265, + "balance_loss_clip": 1.03479493, + "balance_loss_mlp": 1.05035043, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.2782308625037686, + "language_loss": 0.82592809, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84810847, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2162, + "time_per_iteration": 2.5702993869781494 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.03454113, + "balance_loss_mlp": 1.04993796, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.619296712638915, + "language_loss": 0.72762972, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7498191, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2163, + "time_per_iteration": 2.4531478881835938 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.03183889, + "balance_loss_mlp": 1.05107188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0773433264348435, + "language_loss": 0.81498116, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83718032, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2164, + "time_per_iteration": 2.497072458267212 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02381766, + "balance_loss_mlp": 1.05107093, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2040156749440523, + "language_loss": 0.72564822, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.7477203, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.1328125, + "step": 2165, + "time_per_iteration": 2.515026807785034 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.05286038, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8786436091142913, + "language_loss": 0.74697578, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76912814, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1484375, + "step": 2166, + "time_per_iteration": 2.5301709175109863 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.05156064, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5708346768068926, + "language_loss": 0.83053899, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85266984, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 2167, + "time_per_iteration": 2.632035732269287 + }, + { + "auxiliary_loss_clip": 0.01163335, + "auxiliary_loss_mlp": 0.01060394, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.05201721, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.9158171210349437, + "language_loss": 0.83286303, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85510027, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2168, + "time_per_iteration": 2.4766387939453125 + }, + { + "auxiliary_loss_clip": 0.0116626, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.03075409, + "balance_loss_mlp": 1.05258656, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.304013454801214, + "language_loss": 0.80027354, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82245922, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.140625, + "step": 2169, + "time_per_iteration": 2.5185413360595703 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02329922, + "balance_loss_mlp": 1.05451608, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 4.565704621626811, + "language_loss": 0.66456163, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68668246, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2170, + "time_per_iteration": 2.5556788444519043 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.02397573, + "balance_loss_mlp": 1.05294132, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.848772151746763, + "language_loss": 0.66935396, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69145024, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2171, + "time_per_iteration": 2.553422451019287 + }, + { + "auxiliary_loss_clip": 0.01164709, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.05211711, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9479804069383955, + "language_loss": 0.71952963, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74165899, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2172, + "time_per_iteration": 2.4801840782165527 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02508521, + "balance_loss_mlp": 1.05435848, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8616776845407013, + "language_loss": 0.75547618, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77752787, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0703125, + "step": 2173, + "time_per_iteration": 2.4639194011688232 + }, + { + "auxiliary_loss_clip": 0.01165867, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.03406715, + "balance_loss_mlp": 1.05319107, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.777389952877741, + "language_loss": 0.70484382, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72705513, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.125, + "step": 2174, + "time_per_iteration": 2.4914908409118652 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01011158, + "balance_loss_clip": 1.0087378, + "balance_loss_mlp": 1.0165, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8022263951171452, + "language_loss": 0.59071571, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61137754, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.38671875, + "step": 2175, + "time_per_iteration": 3.244633913040161 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.05474329, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.247504257537708, + "language_loss": 0.79946023, + "learning_rate": 3.894300581166417e-06, + "loss": 0.8216269, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1015625, + "step": 2176, + "time_per_iteration": 2.439883232116699 + }, + { + "auxiliary_loss_clip": 0.01163907, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.02806199, + "balance_loss_mlp": 1.05234194, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8562517641565577, + "language_loss": 0.74595284, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76809454, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2177, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.024472, + "balance_loss_mlp": 1.05222929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.128567307625778, + "language_loss": 0.81855309, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84065676, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1171875, + "step": 2178, + "time_per_iteration": 2.458812713623047 + }, + { + "auxiliary_loss_clip": 0.01166111, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.02695179, + "balance_loss_mlp": 1.05466795, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.66972533149016, + "language_loss": 0.74942935, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77156973, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.109375, + "step": 2179, + "time_per_iteration": 2.4679782390594482 + }, + { + "auxiliary_loss_clip": 0.01161603, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.03079903, + "balance_loss_mlp": 1.05280709, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0560779031919636, + "language_loss": 0.84319234, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86531377, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0859375, + "step": 2180, + "time_per_iteration": 2.567873477935791 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.05700839, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.214126283525484, + "language_loss": 0.8987745, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92098325, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2181, + "time_per_iteration": 2.4802486896514893 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.0557189, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8993602522657917, + "language_loss": 0.68657839, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70867944, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.109375, + "step": 2182, + "time_per_iteration": 2.460148572921753 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.05504203, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.6442759836393277, + "language_loss": 0.78435183, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.125, + "step": 2183, + "time_per_iteration": 2.5462143421173096 + }, + { + "auxiliary_loss_clip": 0.01162472, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02375841, + "balance_loss_mlp": 1.05238128, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.788927255894662, + "language_loss": 0.85543215, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87749588, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2184, + "time_per_iteration": 3.8904993534088135 + }, + { + "auxiliary_loss_clip": 0.01165934, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.0529201, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.505672435211917, + "language_loss": 0.82206696, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84420282, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1328125, + "step": 2185, + "time_per_iteration": 5.3855485916137695 + }, + { + "auxiliary_loss_clip": 0.01168066, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.03323543, + "balance_loss_mlp": 1.05440092, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.0294565364346235, + "language_loss": 0.73037684, + "learning_rate": 3.893047635600818e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1328125, + "step": 2186, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01048235, + "balance_loss_clip": 1.02601433, + "balance_loss_mlp": 1.05449164, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.0525608711513614, + "language_loss": 0.80174023, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82388186, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.109375, + "step": 2187, + "time_per_iteration": 2.463906764984131 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01005839, + "balance_loss_clip": 1.00344312, + "balance_loss_mlp": 1.01508641, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8435449169341035, + "language_loss": 0.58977342, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61036563, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.3828125, + "step": 2188, + "time_per_iteration": 3.1052041053771973 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.05918622, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.1443848583942846, + "language_loss": 0.74199927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76420546, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2189, + "time_per_iteration": 2.5137264728546143 + }, + { + "auxiliary_loss_clip": 0.01166605, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.02894759, + "balance_loss_mlp": 1.05678558, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.7642431940848833, + "language_loss": 0.72561657, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74777287, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2190, + "time_per_iteration": 2.5053412914276123 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_clip": 1.03094649, + "balance_loss_mlp": 1.05706906, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 8.700182749243472, + "language_loss": 0.74395585, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76616025, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1171875, + "step": 2191, + "time_per_iteration": 2.507687568664551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.05689156, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0250128968483403, + "language_loss": 0.79286075, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1171875, + "step": 2192, + "time_per_iteration": 2.5068893432617188 + }, + { + "auxiliary_loss_clip": 0.01168449, + "auxiliary_loss_mlp": 0.01053422, + "balance_loss_clip": 1.03290629, + "balance_loss_mlp": 1.05564141, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9285179647135495, + "language_loss": 0.84827602, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87049472, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.125, + "step": 2193, + "time_per_iteration": 2.456409215927124 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01002145, + "balance_loss_clip": 0.99976075, + "balance_loss_mlp": 1.0179081, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7727203010194038, + "language_loss": 0.54049635, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56107628, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.37890625, + "step": 2194, + "time_per_iteration": 3.0569794178009033 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.02566671, + "balance_loss_mlp": 1.05514359, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7688784093808256, + "language_loss": 0.72086227, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74298465, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2195, + "time_per_iteration": 2.527435541152954 + }, + { + "auxiliary_loss_clip": 0.01173804, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.05663633, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.7664998702658374, + "language_loss": 0.78195536, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2196, + "time_per_iteration": 2.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02714252, + "balance_loss_mlp": 1.05638218, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.1663119445052295, + "language_loss": 0.74861938, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77078474, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1328125, + "step": 2197, + "time_per_iteration": 2.489504814147949 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.05543399, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4378795089069674, + "language_loss": 0.8011694, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82332516, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2198, + "time_per_iteration": 2.437718391418457 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04173732, + "balance_loss_mlp": 1.05483699, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4514815632850038, + "language_loss": 0.82552117, + "learning_rate": 3.891408075291425e-06, + "loss": 0.847803, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2199, + "time_per_iteration": 2.47356915473938 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.02724838, + "balance_loss_mlp": 1.05458844, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.465688895758548, + "language_loss": 0.68963099, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71178007, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2200, + "time_per_iteration": 2.5828843116760254 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.05397916, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.591612522060186, + "language_loss": 0.84600091, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86822116, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2201, + "time_per_iteration": 2.5546202659606934 + }, + { + "auxiliary_loss_clip": 0.01168797, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.05466592, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.901870031688447, + "language_loss": 0.86978126, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89200991, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2202, + "time_per_iteration": 2.509300470352173 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02364576, + "balance_loss_mlp": 1.05389142, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.3614014237187084, + "language_loss": 0.72746712, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74954367, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.109375, + "step": 2203, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01167891, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.05453348, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.5436302639516, + "language_loss": 0.73248756, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75473428, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1328125, + "step": 2204, + "time_per_iteration": 2.5298051834106445 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.05558085, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.7540271848273767, + "language_loss": 0.78627133, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80849254, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2205, + "time_per_iteration": 2.5343189239501953 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01053788, + "balance_loss_clip": 1.033391, + "balance_loss_mlp": 1.05560231, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.234297854715259, + "language_loss": 0.78748876, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80969107, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2206, + "time_per_iteration": 2.473229169845581 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.05758011, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.3028539815574494, + "language_loss": 0.73993444, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76210898, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.125, + "step": 2207, + "time_per_iteration": 2.479421854019165 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.02822387, + "balance_loss_mlp": 1.05323017, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.4105539478543454, + "language_loss": 0.84151787, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86361182, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0703125, + "step": 2208, + "time_per_iteration": 2.501969337463379 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.05553222, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9362156368998853, + "language_loss": 0.85323346, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87540877, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2209, + "time_per_iteration": 2.509761333465576 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.05628705, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.055387861012722, + "language_loss": 0.81545013, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83761609, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2210, + "time_per_iteration": 2.4920527935028076 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01003994, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.02205014, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7616894664797615, + "language_loss": 0.57984382, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004886, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3828125, + "step": 2211, + "time_per_iteration": 3.1735260486602783 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.00312614, + "balance_loss_mlp": 1.02081084, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7970523185699088, + "language_loss": 0.55364317, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57429421, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.38671875, + "step": 2212, + "time_per_iteration": 3.142056465148926 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.056463, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 4.2694742121271645, + "language_loss": 0.74779308, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77002227, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2213, + "time_per_iteration": 2.4599013328552246 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.02889609, + "balance_loss_mlp": 1.05235839, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.0343460890824927, + "language_loss": 0.79269958, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81476456, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0625, + "step": 2214, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.03062189, + "balance_loss_mlp": 1.05593503, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.0399610331480407, + "language_loss": 0.69410872, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71628523, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2215, + "time_per_iteration": 2.5798754692077637 + }, + { + "auxiliary_loss_clip": 0.01166771, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.05576539, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.4518621177772175, + "language_loss": 0.81136751, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83350337, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2216, + "time_per_iteration": 2.49601674079895 + }, + { + "auxiliary_loss_clip": 0.01166215, + "auxiliary_loss_mlp": 0.01057297, + "balance_loss_clip": 1.03668606, + "balance_loss_mlp": 1.05610895, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.5729384628186307, + "language_loss": 0.87350845, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89574361, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1015625, + "step": 2217, + "time_per_iteration": 2.435224771499634 + }, + { + "auxiliary_loss_clip": 0.01165439, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.02716112, + "balance_loss_mlp": 1.05609739, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6393181601709057, + "language_loss": 0.73460543, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2218, + "time_per_iteration": 2.4984188079833984 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02401495, + "balance_loss_mlp": 1.05406141, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.803125703936159, + "language_loss": 0.87483871, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89692807, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2219, + "time_per_iteration": 2.4761111736297607 + }, + { + "auxiliary_loss_clip": 0.01166927, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.03198123, + "balance_loss_mlp": 1.05804753, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5604165479120375, + "language_loss": 0.77241862, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79459906, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0859375, + "step": 2220, + "time_per_iteration": 2.5172770023345947 + }, + { + "auxiliary_loss_clip": 0.01158357, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.05065227, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.752699726256429, + "language_loss": 0.79361391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81564224, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.078125, + "step": 2221, + "time_per_iteration": 2.4729459285736084 + }, + { + "auxiliary_loss_clip": 0.01056162, + "auxiliary_loss_mlp": 0.01004009, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.01797867, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9620212456786271, + "language_loss": 0.6890744, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70967615, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.3828125, + "step": 2222, + "time_per_iteration": 2.9102694988250732 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.02885592, + "balance_loss_mlp": 1.05645049, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8990549263762904, + "language_loss": 0.66966134, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69180298, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1015625, + "step": 2223, + "time_per_iteration": 2.4860363006591797 + }, + { + "auxiliary_loss_clip": 0.01162257, + "auxiliary_loss_mlp": 0.01055999, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.05173874, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.0940561003244738, + "language_loss": 0.82572883, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84791142, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2224, + "time_per_iteration": 2.453310966491699 + }, + { + "auxiliary_loss_clip": 0.01167505, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.05410361, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0797940389634624, + "language_loss": 0.66006851, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68221462, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2225, + "time_per_iteration": 2.505760669708252 + }, + { + "auxiliary_loss_clip": 0.01164479, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03491461, + "balance_loss_mlp": 1.05366707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2490181158076545, + "language_loss": 0.89484501, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91703951, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2226, + "time_per_iteration": 3.827432632446289 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.03132319, + "balance_loss_mlp": 1.05492473, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0692514385202947, + "language_loss": 0.73874348, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76091796, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1171875, + "step": 2227, + "time_per_iteration": 5.469221115112305 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.02971888, + "balance_loss_mlp": 1.05582607, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.597241668203809, + "language_loss": 0.8519839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87414384, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2228, + "time_per_iteration": 2.449289560317993 + }, + { + "auxiliary_loss_clip": 0.01162737, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0275687, + "balance_loss_mlp": 1.05501461, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.700498827765594, + "language_loss": 0.8100034, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83210707, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2229, + "time_per_iteration": 2.454185962677002 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.05576682, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.350850930683171, + "language_loss": 0.73814881, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76035661, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2230, + "time_per_iteration": 2.538679838180542 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.03551102, + "balance_loss_mlp": 1.0541544, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 8.27737726970052, + "language_loss": 0.79914325, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82135391, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1171875, + "step": 2231, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05716896, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9954658779127024, + "language_loss": 0.72341192, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74558049, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2232, + "time_per_iteration": 2.5315330028533936 + }, + { + "auxiliary_loss_clip": 0.01169038, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.02664888, + "balance_loss_mlp": 1.05505097, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.53729194427275, + "language_loss": 0.65508974, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67725778, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2233, + "time_per_iteration": 2.480006694793701 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.02687883, + "balance_loss_mlp": 1.05011904, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 4.541384002557222, + "language_loss": 0.81492066, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8370105, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1171875, + "step": 2234, + "time_per_iteration": 2.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.03466105, + "balance_loss_mlp": 1.05424869, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9481483268780417, + "language_loss": 0.82361299, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84581894, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1171875, + "step": 2235, + "time_per_iteration": 2.4478979110717773 + }, + { + "auxiliary_loss_clip": 0.0116322, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.03378713, + "balance_loss_mlp": 1.05170834, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6568048404288893, + "language_loss": 0.86399209, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88618279, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2236, + "time_per_iteration": 2.534761428833008 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.05506372, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5401183277834882, + "language_loss": 0.76936173, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79150563, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2237, + "time_per_iteration": 2.454881191253662 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.02974725, + "balance_loss_mlp": 1.05312407, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.976295310563951, + "language_loss": 0.78737688, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80954033, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2238, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03057706, + "balance_loss_mlp": 1.0530107, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3078790626960246, + "language_loss": 0.67977941, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70191795, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.09375, + "step": 2239, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.05296254, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.7482132203763245, + "language_loss": 0.81085825, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83300203, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2240, + "time_per_iteration": 2.458702802658081 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.05302262, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.775061814751768, + "language_loss": 0.77491653, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79708141, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2241, + "time_per_iteration": 2.4814610481262207 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.0265156, + "balance_loss_mlp": 1.05368328, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.708340264075402, + "language_loss": 0.83106101, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85311437, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0625, + "step": 2242, + "time_per_iteration": 2.531010627746582 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.05465889, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.594763109819468, + "language_loss": 0.64927268, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67146331, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.125, + "step": 2243, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.05214143, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.6702464572283469, + "language_loss": 0.72275442, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74479383, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2244, + "time_per_iteration": 2.572275161743164 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.03339577, + "balance_loss_mlp": 1.0510093, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6251739599249553, + "language_loss": 0.86419517, + "learning_rate": 3.88550929909221e-06, + "loss": 0.886334, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1015625, + "step": 2245, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.0534606, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.986035604010071, + "language_loss": 0.79054129, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81263721, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2246, + "time_per_iteration": 2.521500825881958 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01006436, + "balance_loss_clip": 1.00378919, + "balance_loss_mlp": 1.01705432, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7498554605470831, + "language_loss": 0.60597092, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6265747, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.3671875, + "step": 2247, + "time_per_iteration": 3.209567070007324 + }, + { + "auxiliary_loss_clip": 0.0117261, + "auxiliary_loss_mlp": 0.01058621, + "balance_loss_clip": 1.03629315, + "balance_loss_mlp": 1.05673957, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.930333372025318, + "language_loss": 0.81250268, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83481503, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2248, + "time_per_iteration": 2.5274717807769775 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.02503014, + "balance_loss_mlp": 1.0515008, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.1598236051462383, + "language_loss": 0.77427459, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79628301, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0625, + "step": 2249, + "time_per_iteration": 2.475325345993042 + }, + { + "auxiliary_loss_clip": 0.01161564, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03477216, + "balance_loss_mlp": 1.05408192, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4620260499768896, + "language_loss": 0.84598488, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86813927, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0703125, + "step": 2250, + "time_per_iteration": 2.5579018592834473 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.02586317, + "balance_loss_mlp": 1.05311561, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9830962049575767, + "language_loss": 0.8213973, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84349537, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1015625, + "step": 2251, + "time_per_iteration": 2.459254503250122 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.03144348, + "balance_loss_mlp": 1.05075097, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.6927381248236872, + "language_loss": 0.85981321, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88194835, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.09375, + "step": 2252, + "time_per_iteration": 2.508246421813965 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.0100648, + "balance_loss_clip": 1.00398886, + "balance_loss_mlp": 1.01368976, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7502755191421498, + "language_loss": 0.61736262, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63793439, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.37109375, + "step": 2253, + "time_per_iteration": 3.1357691287994385 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.0357219, + "balance_loss_mlp": 1.05454588, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.033104819567641, + "language_loss": 0.89383745, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91603261, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2254, + "time_per_iteration": 2.4983997344970703 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.02786362, + "balance_loss_mlp": 1.05202925, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.0851597725495843, + "language_loss": 0.84461302, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86678338, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.140625, + "step": 2255, + "time_per_iteration": 2.4466094970703125 + }, + { + "auxiliary_loss_clip": 0.01159898, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.05059099, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.8290739743459126, + "language_loss": 0.7493006, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77136725, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.09375, + "step": 2256, + "time_per_iteration": 2.49464750289917 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.05080438, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.107811937736733, + "language_loss": 0.83023381, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85237086, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 2257, + "time_per_iteration": 2.4069128036499023 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.03272712, + "balance_loss_mlp": 1.05211377, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.2162023158830655, + "language_loss": 0.82266492, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84489298, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.15625, + "step": 2258, + "time_per_iteration": 2.4187939167022705 + }, + { + "auxiliary_loss_clip": 0.01161942, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.02890849, + "balance_loss_mlp": 1.05117583, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3528312033652434, + "language_loss": 0.82556236, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84770095, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.109375, + "step": 2259, + "time_per_iteration": 2.4182498455047607 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0313561, + "balance_loss_mlp": 1.05370188, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9951846625000045, + "language_loss": 0.73434722, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75647175, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0859375, + "step": 2260, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01160597, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.03389525, + "balance_loss_mlp": 1.05164778, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.6406640236232826, + "language_loss": 0.75450647, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77664864, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2261, + "time_per_iteration": 2.4773809909820557 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02887654, + "balance_loss_mlp": 1.05329657, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.9984757312973846, + "language_loss": 0.63141024, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1171875, + "step": 2262, + "time_per_iteration": 2.5423331260681152 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.598036861128168, + "language_loss": 0.82363462, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84568739, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2263, + "time_per_iteration": 2.472050428390503 + }, + { + "auxiliary_loss_clip": 0.01166147, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.05306447, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7757676532235749, + "language_loss": 0.87984985, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90212959, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1328125, + "step": 2264, + "time_per_iteration": 2.4857943058013916 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.02700329, + "balance_loss_mlp": 1.05115557, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 2.9904691281538693, + "language_loss": 0.7103616, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73248434, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2265, + "time_per_iteration": 2.428753614425659 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02187812, + "balance_loss_mlp": 1.05258036, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.049615390343222, + "language_loss": 0.66760135, + "learning_rate": 3.882766051566027e-06, + "loss": 0.689623, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2266, + "time_per_iteration": 2.4990508556365967 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.05220675, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7538751206895893, + "language_loss": 0.76376909, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78596711, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2267, + "time_per_iteration": 2.485907554626465 + }, + { + "auxiliary_loss_clip": 0.0116058, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.05051804, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.002795226804265, + "language_loss": 0.81781995, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83988714, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1015625, + "step": 2268, + "time_per_iteration": 3.890936851501465 + }, + { + "auxiliary_loss_clip": 0.01161581, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.0542717, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.071095479959133, + "language_loss": 0.76078153, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78285825, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2269, + "time_per_iteration": 4.03081202507019 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.02612138, + "balance_loss_mlp": 1.05518508, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.039865659244694, + "language_loss": 0.80856502, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83068502, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2270, + "time_per_iteration": 2.431426525115967 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.05227089, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.715242097566801, + "language_loss": 0.75720018, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77940053, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.125, + "step": 2271, + "time_per_iteration": 2.440701961517334 + }, + { + "auxiliary_loss_clip": 0.01161613, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.05171776, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2487551674667565, + "language_loss": 0.80084515, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82298499, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1015625, + "step": 2272, + "time_per_iteration": 2.4305598735809326 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01011943, + "balance_loss_clip": 1.00937963, + "balance_loss_mlp": 1.01818228, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7032235049035468, + "language_loss": 0.60682511, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62750536, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.37890625, + "step": 2273, + "time_per_iteration": 3.1601598262786865 + }, + { + "auxiliary_loss_clip": 0.01158579, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.05170178, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7482195510707834, + "language_loss": 0.77978206, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80184555, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2274, + "time_per_iteration": 2.448374032974243 + }, + { + "auxiliary_loss_clip": 0.01163563, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.0536654, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.152740159395537, + "language_loss": 0.78435361, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80645764, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2275, + "time_per_iteration": 2.4761078357696533 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02003431, + "balance_loss_mlp": 1.05312562, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.879456622893362, + "language_loss": 0.81436646, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0859375, + "step": 2276, + "time_per_iteration": 2.453623056411743 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.03082716, + "balance_loss_mlp": 1.05443549, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7308629221608576, + "language_loss": 0.69347179, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71571183, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.15625, + "step": 2277, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01162034, + "auxiliary_loss_mlp": 0.01051118, + "balance_loss_clip": 1.03056657, + "balance_loss_mlp": 1.05136657, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.1796180013972384, + "language_loss": 0.80487186, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2278, + "time_per_iteration": 2.478158950805664 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.02788246, + "balance_loss_mlp": 1.05658543, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.2222454745927744, + "language_loss": 0.74863833, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2279, + "time_per_iteration": 2.5930991172790527 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.05331779, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.3437990696634916, + "language_loss": 0.76614088, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78833258, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1328125, + "step": 2280, + "time_per_iteration": 2.527808666229248 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.0272876, + "balance_loss_mlp": 1.04930711, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7035700975942816, + "language_loss": 0.79808372, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82011348, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.078125, + "step": 2281, + "time_per_iteration": 2.5486884117126465 + }, + { + "auxiliary_loss_clip": 0.01167882, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.03618872, + "balance_loss_mlp": 1.05488086, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.697672260024265, + "language_loss": 0.83955061, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86178571, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2282, + "time_per_iteration": 2.4731719493865967 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.05028629, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.8152250836173982, + "language_loss": 0.73821312, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76034367, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0703125, + "step": 2283, + "time_per_iteration": 2.5041310787200928 + }, + { + "auxiliary_loss_clip": 0.01161767, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02991104, + "balance_loss_mlp": 1.05546188, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.845966051455131, + "language_loss": 0.83875519, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86085427, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2284, + "time_per_iteration": 2.489459991455078 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.05016088, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.9356174938409232, + "language_loss": 0.74778754, + "learning_rate": 3.880256934503974e-06, + "loss": 0.76991928, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 2285, + "time_per_iteration": 2.542114734649658 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.02680647, + "balance_loss_mlp": 1.05192137, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.7476035379248278, + "language_loss": 0.74461651, + "learning_rate": 3.880124162414689e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0703125, + "step": 2286, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01165905, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.02407491, + "balance_loss_mlp": 1.05466056, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4229799840234936, + "language_loss": 0.86074513, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88285446, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2287, + "time_per_iteration": 2.5267093181610107 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.05281329, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.1686670508464783, + "language_loss": 0.68304116, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70512998, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.09375, + "step": 2288, + "time_per_iteration": 2.6589176654815674 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.03410959, + "balance_loss_mlp": 1.05404294, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 3.8263362529629896, + "language_loss": 0.87251699, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89468765, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2289, + "time_per_iteration": 2.4834415912628174 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.02735722, + "balance_loss_mlp": 1.0496552, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.801469753111382, + "language_loss": 0.74045157, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76245451, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2290, + "time_per_iteration": 2.4901175498962402 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01003238, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.01923215, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7021136788609851, + "language_loss": 0.5160234, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53662229, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.375, + "step": 2291, + "time_per_iteration": 3.1141176223754883 + }, + { + "auxiliary_loss_clip": 0.01158988, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05007744, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.104305633549435, + "language_loss": 0.7090801, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73116004, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.09375, + "step": 2292, + "time_per_iteration": 2.5535075664520264 + }, + { + "auxiliary_loss_clip": 0.01160381, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.05272794, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.835181445389694, + "language_loss": 0.79774708, + "learning_rate": 3.879192761826071e-06, + "loss": 0.81979978, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.078125, + "step": 2293, + "time_per_iteration": 2.4434242248535156 + }, + { + "auxiliary_loss_clip": 0.01159833, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_clip": 1.03065419, + "balance_loss_mlp": 1.0489893, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.8100583587938566, + "language_loss": 0.78455698, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80665964, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2294, + "time_per_iteration": 2.5279018878936768 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.02679634, + "balance_loss_mlp": 1.05053687, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.844605455172751, + "language_loss": 0.80448526, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82649422, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0546875, + "step": 2295, + "time_per_iteration": 2.46471905708313 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.02604938, + "balance_loss_mlp": 1.04990947, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905081494696058, + "language_loss": 0.78027165, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80231106, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0859375, + "step": 2296, + "time_per_iteration": 2.489081859588623 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03356612, + "balance_loss_mlp": 1.05272174, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8577842545242083, + "language_loss": 0.78632545, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80845773, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2297, + "time_per_iteration": 2.479617118835449 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.03187263, + "balance_loss_mlp": 1.05133367, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.1383795008624946, + "language_loss": 0.69005466, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71213776, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2298, + "time_per_iteration": 2.4894726276397705 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03075552, + "balance_loss_mlp": 1.05287397, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7932718261070644, + "language_loss": 0.86958891, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89172935, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2299, + "time_per_iteration": 2.4343175888061523 + }, + { + "auxiliary_loss_clip": 0.01158457, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.03221393, + "balance_loss_mlp": 1.05076718, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.6477233854648015, + "language_loss": 0.7542398, + "learning_rate": 3.878257869538267e-06, + "loss": 0.7763505, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.078125, + "step": 2300, + "time_per_iteration": 2.5398943424224854 + }, + { + "auxiliary_loss_clip": 0.01160789, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.03088915, + "balance_loss_mlp": 1.05409729, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.6084363319634956, + "language_loss": 0.82612532, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8482368, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0625, + "step": 2301, + "time_per_iteration": 2.435732841491699 + }, + { + "auxiliary_loss_clip": 0.01155849, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.02461374, + "balance_loss_mlp": 1.04986811, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.0886382571109987, + "language_loss": 0.85972583, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8817209, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0625, + "step": 2302, + "time_per_iteration": 2.504011869430542 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.01009124, + "balance_loss_clip": 1.00688314, + "balance_loss_mlp": 1.0189817, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7554932596602951, + "language_loss": 0.65648526, + "learning_rate": 3.877856132957667e-06, + "loss": 0.677131, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.36328125, + "step": 2303, + "time_per_iteration": 3.2563750743865967 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 1.05022073, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.0694955360834912, + "language_loss": 0.78234196, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80427974, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2304, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01991165, + "balance_loss_mlp": 1.05225086, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.838077080535218, + "language_loss": 0.77824223, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8002485, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.09375, + "step": 2305, + "time_per_iteration": 2.468254804611206 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02732027, + "balance_loss_mlp": 1.04923558, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 3.2063314507866947, + "language_loss": 0.87484217, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89684129, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2306, + "time_per_iteration": 2.4840242862701416 + }, + { + "auxiliary_loss_clip": 0.0105475, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.01749539, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8793018572536648, + "language_loss": 0.59049129, + "learning_rate": 3.877319487288387e-06, + "loss": 0.6111598, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.37304688, + "step": 2307, + "time_per_iteration": 3.1098880767822266 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.05279016, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7539420555734833, + "language_loss": 0.79683769, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81892413, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2308, + "time_per_iteration": 2.5119385719299316 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1876242684272342, + "language_loss": 0.78186178, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80388331, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2309, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.05319023, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9671645437439387, + "language_loss": 0.67473733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69683367, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2310, + "time_per_iteration": 5.331011056900024 + }, + { + "auxiliary_loss_clip": 0.01159907, + "auxiliary_loss_mlp": 0.01051301, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.0511837, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8339330301012977, + "language_loss": 0.83962393, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86173606, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0859375, + "step": 2311, + "time_per_iteration": 2.4287211894989014 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.05262017, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.2677083380951997, + "language_loss": 0.81788063, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83999264, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2312, + "time_per_iteration": 2.5261852741241455 + }, + { + "auxiliary_loss_clip": 0.01165344, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.05353236, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.1868066623869202, + "language_loss": 0.86641061, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88851982, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1171875, + "step": 2313, + "time_per_iteration": 2.491847515106201 + }, + { + "auxiliary_loss_clip": 0.0116138, + "auxiliary_loss_mlp": 0.01052556, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.05377281, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.199884337980412, + "language_loss": 0.79629153, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8184309, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2314, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.0116003, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.02668452, + "balance_loss_mlp": 1.05130863, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.30759926974498, + "language_loss": 0.86246645, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88453007, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0859375, + "step": 2315, + "time_per_iteration": 2.4236056804656982 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_clip": 1.03192866, + "balance_loss_mlp": 1.05146074, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.162038852448813, + "language_loss": 0.77074778, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79286408, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.078125, + "step": 2316, + "time_per_iteration": 2.4574813842773438 + }, + { + "auxiliary_loss_clip": 0.01157842, + "auxiliary_loss_mlp": 0.01058721, + "balance_loss_clip": 1.03733492, + "balance_loss_mlp": 1.05045736, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6719823206156588, + "language_loss": 0.76972795, + "learning_rate": 3.875972890659349e-06, + "loss": 0.7918936, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.078125, + "step": 2317, + "time_per_iteration": 2.448096990585327 + }, + { + "auxiliary_loss_clip": 0.01162372, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.02993095, + "balance_loss_mlp": 1.05272126, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.004328537884534, + "language_loss": 0.80159998, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82372165, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2318, + "time_per_iteration": 2.5152556896209717 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.00027394, + "balance_loss_mlp": 1.01373565, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8654041988705774, + "language_loss": 0.59008324, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61061358, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.3671875, + "step": 2319, + "time_per_iteration": 3.101083993911743 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01053809, + "balance_loss_clip": 1.03365111, + "balance_loss_mlp": 1.05213809, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2307371496542356, + "language_loss": 0.65372109, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67588449, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2320, + "time_per_iteration": 2.580655336380005 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.0507009, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6249908375914148, + "language_loss": 0.70695353, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72896051, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2321, + "time_per_iteration": 2.4594380855560303 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.01055348, + "balance_loss_clip": 1.0345459, + "balance_loss_mlp": 1.04883599, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 43.01057366099128, + "language_loss": 0.86161166, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88375086, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2322, + "time_per_iteration": 2.4912750720977783 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.04840016, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.7187096085030618, + "language_loss": 0.6682983, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69038773, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2323, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0116621, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.03068066, + "balance_loss_mlp": 1.05250573, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0268681764850665, + "language_loss": 0.89011461, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91228795, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2324, + "time_per_iteration": 2.458172559738159 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01056649, + "balance_loss_clip": 1.03626466, + "balance_loss_mlp": 1.04949069, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 4.4201897818475775, + "language_loss": 0.70700991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7291714, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2325, + "time_per_iteration": 2.4608585834503174 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01055057, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 1.05384755, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8512202881484865, + "language_loss": 0.81165004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2326, + "time_per_iteration": 2.474729537963867 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 1.05092621, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.806872548679543, + "language_loss": 0.88955671, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9115777, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0390625, + "step": 2327, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.03790593, + "balance_loss_mlp": 1.05021226, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.4750320646827992, + "language_loss": 0.85236871, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87450516, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2328, + "time_per_iteration": 2.4724884033203125 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.05120313, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.653872228613324, + "language_loss": 0.74084997, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76292944, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2329, + "time_per_iteration": 2.5238442420959473 + }, + { + "auxiliary_loss_clip": 0.01156952, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.03070641, + "balance_loss_mlp": 1.04729962, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 1.840223813628444, + "language_loss": 0.77969897, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80177212, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2330, + "time_per_iteration": 2.468606948852539 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02738333, + "balance_loss_mlp": 1.0495398, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.6993483396219506, + "language_loss": 0.72030222, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74232423, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0625, + "step": 2331, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.05008936, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.585347596838152, + "language_loss": 0.72609055, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74813151, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2332, + "time_per_iteration": 2.4244635105133057 + }, + { + "auxiliary_loss_clip": 0.01047328, + "auxiliary_loss_mlp": 0.01002801, + "balance_loss_clip": 1.00048828, + "balance_loss_mlp": 1.01059568, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8290843953692559, + "language_loss": 0.56071591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58121729, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.3671875, + "step": 2333, + "time_per_iteration": 2.8934712409973145 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.05001664, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7851490004805215, + "language_loss": 0.82529652, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2334, + "time_per_iteration": 2.495786428451538 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.0293529, + "balance_loss_mlp": 1.05012262, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8251700419130605, + "language_loss": 0.81237197, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83440989, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2335, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01163426, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.02829087, + "balance_loss_mlp": 1.05328035, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.83822789048078, + "language_loss": 0.82159901, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8437475, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.1015625, + "step": 2336, + "time_per_iteration": 2.4732770919799805 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.02526581, + "balance_loss_mlp": 1.05202782, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8506426201256954, + "language_loss": 0.80081403, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82283843, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2337, + "time_per_iteration": 2.4599671363830566 + }, + { + "auxiliary_loss_clip": 0.01155582, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02911353, + "balance_loss_mlp": 1.04861474, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.2474896580124963, + "language_loss": 0.7927807, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81482291, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2338, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.05685067, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.148660398501072, + "language_loss": 0.79827893, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82039273, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2339, + "time_per_iteration": 2.4672555923461914 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.0527122, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.7979240482106922, + "language_loss": 0.6582588, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68040884, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2340, + "time_per_iteration": 2.614506483078003 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.02936912, + "balance_loss_mlp": 1.05242825, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.5431372850663334, + "language_loss": 0.78670812, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80874836, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2341, + "time_per_iteration": 2.4420077800750732 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_clip": 1.03058767, + "balance_loss_mlp": 1.05246425, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 11.570217446637303, + "language_loss": 0.80154169, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82360554, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2342, + "time_per_iteration": 2.4961190223693848 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.05673313, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9358851833739352, + "language_loss": 0.77974075, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80176884, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2343, + "time_per_iteration": 2.479679584503174 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0025475, + "balance_loss_mlp": 1.01255798, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8341361150670269, + "language_loss": 0.6155628, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63610566, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3671875, + "step": 2344, + "time_per_iteration": 3.048691987991333 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.02309346, + "balance_loss_mlp": 1.04911709, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.886714907416039, + "language_loss": 0.64591062, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6678347, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0234375, + "step": 2345, + "time_per_iteration": 2.509552240371704 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.023803, + "balance_loss_mlp": 1.05019534, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.502398022219224, + "language_loss": 0.736485, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7585566, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1171875, + "step": 2346, + "time_per_iteration": 2.4962430000305176 + }, + { + "auxiliary_loss_clip": 0.01160187, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.05144429, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.4324488814849703, + "language_loss": 0.77868927, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80075288, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2347, + "time_per_iteration": 2.4663050174713135 + }, + { + "auxiliary_loss_clip": 0.01155281, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02852941, + "balance_loss_mlp": 1.04918981, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7514485331985392, + "language_loss": 0.76446569, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78648651, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2348, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.02346444, + "balance_loss_mlp": 1.0508523, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8350283773112115, + "language_loss": 0.8686446, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89063132, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2349, + "time_per_iteration": 2.4446985721588135 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.05220377, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.7285118920158233, + "language_loss": 0.8895669, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9115696, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2350, + "time_per_iteration": 2.49725341796875 + }, + { + "auxiliary_loss_clip": 0.0115943, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.02542889, + "balance_loss_mlp": 1.05281878, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8656651100546833, + "language_loss": 0.814816, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83687228, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0703125, + "step": 2351, + "time_per_iteration": 3.941416025161743 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.05032706, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6782915885510286, + "language_loss": 0.82935351, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85132694, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0703125, + "step": 2352, + "time_per_iteration": 5.431722640991211 + }, + { + "auxiliary_loss_clip": 0.01047453, + "auxiliary_loss_mlp": 0.01006216, + "balance_loss_clip": 1.00387907, + "balance_loss_mlp": 1.01053333, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.90864091090638, + "language_loss": 0.61894125, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63947791, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.36914062, + "step": 2353, + "time_per_iteration": 3.0071640014648438 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.05024958, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.8535903324814498, + "language_loss": 0.87264848, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89466572, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2354, + "time_per_iteration": 2.4613726139068604 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02607965, + "balance_loss_mlp": 1.04953241, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9651075901387003, + "language_loss": 0.74872321, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.078125, + "step": 2355, + "time_per_iteration": 2.442379951477051 + }, + { + "auxiliary_loss_clip": 0.01047047, + "auxiliary_loss_mlp": 0.01002716, + "balance_loss_clip": 1.00052261, + "balance_loss_mlp": 1.01023293, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6790475533637321, + "language_loss": 0.5182299, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53872752, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2356, + "time_per_iteration": 2.9892258644104004 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03495359, + "balance_loss_mlp": 1.05080867, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 3.0630792396255053, + "language_loss": 0.70576489, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72786456, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2357, + "time_per_iteration": 2.421844005584717 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.05012453, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8720076771552743, + "language_loss": 0.82205695, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84416115, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.09375, + "step": 2358, + "time_per_iteration": 2.4519011974334717 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.02663624, + "balance_loss_mlp": 1.051018, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 6.439592826280342, + "language_loss": 0.7129705, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73505127, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1015625, + "step": 2359, + "time_per_iteration": 2.4797613620758057 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02374041, + "balance_loss_mlp": 1.04988599, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 5.514404455287625, + "language_loss": 0.76040578, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78239685, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2360, + "time_per_iteration": 2.4538815021514893 + }, + { + "auxiliary_loss_clip": 0.011559, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.02173233, + "balance_loss_mlp": 1.05221295, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.1535632205539135, + "language_loss": 0.8188749, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84085315, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2361, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01152529, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.02826524, + "balance_loss_mlp": 1.04964995, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.775663525053056, + "language_loss": 0.74489617, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76689464, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2362, + "time_per_iteration": 2.530163049697876 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.0265274, + "balance_loss_mlp": 1.05187464, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 4.478599792998506, + "language_loss": 0.73748112, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75952733, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2363, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.02605534, + "balance_loss_mlp": 1.05005693, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8353407682080387, + "language_loss": 0.72971261, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75172973, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2364, + "time_per_iteration": 2.5670576095581055 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.05015445, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 4.452075303519762, + "language_loss": 0.90230036, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92430955, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 1.015625, + "step": 2365, + "time_per_iteration": 2.5130062103271484 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.02735198, + "balance_loss_mlp": 1.04896259, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.956458588852685, + "language_loss": 0.65377176, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67579615, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2366, + "time_per_iteration": 2.5081095695495605 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.03640223, + "balance_loss_mlp": 1.04979372, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.190613479881076, + "language_loss": 0.80414236, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82623357, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2367, + "time_per_iteration": 2.4398317337036133 + }, + { + "auxiliary_loss_clip": 0.01158941, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.05221498, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.898581267606924, + "language_loss": 0.82619941, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84833181, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2368, + "time_per_iteration": 2.512401580810547 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.05165803, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.7587049982231675, + "language_loss": 0.86971414, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89178908, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2369, + "time_per_iteration": 2.444784164428711 + }, + { + "auxiliary_loss_clip": 0.01158905, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.04913163, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4370193327140612, + "language_loss": 0.75704634, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77906322, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2370, + "time_per_iteration": 2.527740240097046 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0384295, + "balance_loss_mlp": 1.04879546, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7319048865171518, + "language_loss": 0.82923144, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85136044, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2371, + "time_per_iteration": 2.4644808769226074 + }, + { + "auxiliary_loss_clip": 0.01158835, + "auxiliary_loss_mlp": 0.01051346, + "balance_loss_clip": 1.03171265, + "balance_loss_mlp": 1.05157602, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.956158386855541, + "language_loss": 0.82575452, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84785628, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0703125, + "step": 2372, + "time_per_iteration": 2.42240047454834 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.03175569, + "balance_loss_mlp": 1.05134308, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.19442784605527, + "language_loss": 0.8396256, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86171949, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2373, + "time_per_iteration": 2.444695472717285 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03294528, + "balance_loss_mlp": 1.05012143, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.034088541649992, + "language_loss": 0.86271042, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88476801, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.046875, + "step": 2374, + "time_per_iteration": 2.428062915802002 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03024197, + "balance_loss_mlp": 1.05125451, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 4.612229602439842, + "language_loss": 0.7919687, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2375, + "time_per_iteration": 2.526838541030884 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.05240607, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.386247922788535, + "language_loss": 0.76400912, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78615618, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2376, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01156552, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.05075741, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.9035160782842753, + "language_loss": 0.93037754, + "learning_rate": 3.867744103671717e-06, + "loss": 0.952438, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2377, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02991319, + "balance_loss_mlp": 1.05085003, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9751577144221115, + "language_loss": 0.91598773, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93807983, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.0703125, + "step": 2378, + "time_per_iteration": 2.558563470840454 + }, + { + "auxiliary_loss_clip": 0.01159674, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.051296, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.745891074970689, + "language_loss": 0.73947102, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76151079, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2379, + "time_per_iteration": 2.511359214782715 + }, + { + "auxiliary_loss_clip": 0.01156473, + "auxiliary_loss_mlp": 0.01056109, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05014992, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8640465231226504, + "language_loss": 0.79013336, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81225914, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2380, + "time_per_iteration": 2.466219663619995 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.05528164, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.3244590707621073, + "language_loss": 0.87958229, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90172088, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.078125, + "step": 2381, + "time_per_iteration": 2.4476850032806396 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.03084123, + "balance_loss_mlp": 1.0517571, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.599935932772449, + "language_loss": 0.76852649, + "learning_rate": 3.867046846740299e-06, + "loss": 0.7906065, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2382, + "time_per_iteration": 2.4389045238494873 + }, + { + "auxiliary_loss_clip": 0.01157847, + "auxiliary_loss_mlp": 0.01053474, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.05068171, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.461149819336849, + "language_loss": 0.76948071, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79159391, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0703125, + "step": 2383, + "time_per_iteration": 2.516038179397583 + }, + { + "auxiliary_loss_clip": 0.01158748, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0299803, + "balance_loss_mlp": 1.05114412, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.169581662424978, + "language_loss": 0.88202822, + "learning_rate": 3.866767448340471e-06, + "loss": 0.9041245, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.078125, + "step": 2384, + "time_per_iteration": 2.42138934135437 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.02780819, + "balance_loss_mlp": 1.05382657, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 4.175812514986151, + "language_loss": 0.79225606, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81439185, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2385, + "time_per_iteration": 2.4439244270324707 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.02692771, + "balance_loss_mlp": 1.04881537, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.9672730758223058, + "language_loss": 0.74989617, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77192366, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2386, + "time_per_iteration": 2.533304214477539 + }, + { + "auxiliary_loss_clip": 0.01159067, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02777338, + "balance_loss_mlp": 1.05180025, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5174427688568626, + "language_loss": 0.78475344, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80681831, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0703125, + "step": 2387, + "time_per_iteration": 2.4568724632263184 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.05092847, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.559937991009886, + "language_loss": 0.82087159, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84299791, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2388, + "time_per_iteration": 2.5136237144470215 + }, + { + "auxiliary_loss_clip": 0.01161514, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.05393136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.471836270672028, + "language_loss": 0.82267237, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84473729, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.078125, + "step": 2389, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03020322, + "balance_loss_mlp": 1.05032301, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.165584666776674, + "language_loss": 0.82654548, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84867263, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2390, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_clip": 1.03406334, + "balance_loss_mlp": 1.0510571, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 3.0575281215329086, + "language_loss": 0.74616158, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76828718, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.078125, + "step": 2391, + "time_per_iteration": 2.5368545055389404 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01014282, + "balance_loss_clip": 1.0121367, + "balance_loss_mlp": 1.01461065, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8732258813949081, + "language_loss": 0.61769497, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63834715, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.36328125, + "step": 2392, + "time_per_iteration": 2.9315476417541504 + }, + { + "auxiliary_loss_clip": 0.01161818, + "auxiliary_loss_mlp": 0.01056559, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.04981267, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.638581894381379, + "language_loss": 0.76172751, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78391123, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2393, + "time_per_iteration": 3.857799530029297 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_clip": 1.02908611, + "balance_loss_mlp": 1.05249143, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8778469598095298, + "language_loss": 0.76782668, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78993082, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2394, + "time_per_iteration": 3.979130983352661 + }, + { + "auxiliary_loss_clip": 0.01158023, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.02836156, + "balance_loss_mlp": 1.05062532, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.605706810552395, + "language_loss": 0.85831755, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88038385, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.078125, + "step": 2395, + "time_per_iteration": 2.652092933654785 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.03040648, + "balance_loss_mlp": 1.05241179, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5230484666362787, + "language_loss": 0.82984561, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85192204, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0546875, + "step": 2396, + "time_per_iteration": 2.4647467136383057 + }, + { + "auxiliary_loss_clip": 0.01152766, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.02691364, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.435366869769497, + "language_loss": 0.82564163, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8476299, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2397, + "time_per_iteration": 2.4151055812835693 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.05216622, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6104109289920625, + "language_loss": 0.79418427, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81627429, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2398, + "time_per_iteration": 2.4470388889312744 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.05359757, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.7500042291552433, + "language_loss": 0.64847696, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67058688, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2399, + "time_per_iteration": 2.5123891830444336 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.05306005, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.4896130870957418, + "language_loss": 0.82329226, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84531689, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2400, + "time_per_iteration": 2.4825797080993652 + }, + { + "auxiliary_loss_clip": 0.01162323, + "auxiliary_loss_mlp": 0.01052957, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.053689, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.540874002782335, + "language_loss": 0.74606794, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76822078, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0859375, + "step": 2401, + "time_per_iteration": 2.507983684539795 + }, + { + "auxiliary_loss_clip": 0.01156636, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.0269084, + "balance_loss_mlp": 1.05109596, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7568662987329828, + "language_loss": 0.80577219, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82780313, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2402, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.01156436, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.02880669, + "balance_loss_mlp": 1.05137098, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.1115432529250753, + "language_loss": 0.84918672, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87124002, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.046875, + "step": 2403, + "time_per_iteration": 2.4267525672912598 + }, + { + "auxiliary_loss_clip": 0.01155438, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.03312445, + "balance_loss_mlp": 1.04934669, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.423742001713465, + "language_loss": 0.70017314, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72226501, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2404, + "time_per_iteration": 2.487827777862549 + }, + { + "auxiliary_loss_clip": 0.01151274, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.0473218, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.368746641876408, + "language_loss": 0.72847003, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75046992, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0390625, + "step": 2405, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02879858, + "balance_loss_mlp": 1.04891181, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2064790582144473, + "language_loss": 0.73115766, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75316191, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2406, + "time_per_iteration": 2.4501168727874756 + }, + { + "auxiliary_loss_clip": 0.0115439, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.03161645, + "balance_loss_mlp": 1.04889357, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.997473868200426, + "language_loss": 0.75399184, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77606416, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2407, + "time_per_iteration": 2.482008934020996 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.02418649, + "balance_loss_mlp": 1.04607177, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.6082248834480546, + "language_loss": 0.79472804, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81668091, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0625, + "step": 2408, + "time_per_iteration": 2.4657323360443115 + }, + { + "auxiliary_loss_clip": 0.01155517, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.05088127, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1979655558708893, + "language_loss": 0.82594806, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84802014, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.046875, + "step": 2409, + "time_per_iteration": 2.450345039367676 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_clip": 1.03411365, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.954409921875598, + "language_loss": 0.74561608, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7677173, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0625, + "step": 2410, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02883387, + "balance_loss_mlp": 1.04858971, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.3844352739280597, + "language_loss": 0.81135416, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83336866, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0546875, + "step": 2411, + "time_per_iteration": 2.4708898067474365 + }, + { + "auxiliary_loss_clip": 0.0115486, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.03403103, + "balance_loss_mlp": 1.05123138, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.954560524414831, + "language_loss": 0.69816971, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7202487, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2412, + "time_per_iteration": 2.5614776611328125 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.05100143, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.1541085269745803, + "language_loss": 0.77347231, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79548067, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2413, + "time_per_iteration": 2.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01049286, + "auxiliary_loss_mlp": 0.0100403, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.01294982, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152840666775347, + "language_loss": 0.58887923, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60941237, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.36328125, + "step": 2414, + "time_per_iteration": 2.9752402305603027 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 0.99943656, + "balance_loss_mlp": 1.01240802, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8348908268898737, + "language_loss": 0.6218617, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64236534, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.36328125, + "step": 2415, + "time_per_iteration": 3.039710521697998 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_clip": 1.02637458, + "balance_loss_mlp": 1.04699647, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.8743578134099377, + "language_loss": 0.72001135, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74199259, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2416, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.0104556, + "auxiliary_loss_mlp": 0.01005813, + "balance_loss_clip": 1.00379848, + "balance_loss_mlp": 1.01002693, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.711670432605859, + "language_loss": 0.60392165, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62443542, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.35546875, + "step": 2417, + "time_per_iteration": 3.0824739933013916 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.03639972, + "balance_loss_mlp": 1.04795754, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9144560714513363, + "language_loss": 0.79237175, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8144896, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2418, + "time_per_iteration": 2.564497947692871 + }, + { + "auxiliary_loss_clip": 0.01150145, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.03267837, + "balance_loss_mlp": 1.04712582, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8755047341617508, + "language_loss": 0.72032261, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74234051, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2419, + "time_per_iteration": 2.457617998123169 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.0335021, + "balance_loss_mlp": 1.05042267, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3659429121693525, + "language_loss": 0.90125811, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92333627, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.03125, + "step": 2420, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.04868603, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.2940003535379057, + "language_loss": 0.83309549, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85520703, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0546875, + "step": 2421, + "time_per_iteration": 2.441432476043701 + }, + { + "auxiliary_loss_clip": 0.01153189, + "auxiliary_loss_mlp": 0.01053683, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.04684627, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.6167157199382733, + "language_loss": 0.81511533, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83718407, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2422, + "time_per_iteration": 2.473010540008545 + }, + { + "auxiliary_loss_clip": 0.01046424, + "auxiliary_loss_mlp": 0.01017838, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.01065397, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9226410759759552, + "language_loss": 0.63245702, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65309966, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.35742188, + "step": 2423, + "time_per_iteration": 3.0516433715820312 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.02756512, + "balance_loss_mlp": 1.05096769, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7656587875688796, + "language_loss": 0.8267172, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84872198, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.046875, + "step": 2424, + "time_per_iteration": 2.4918792247772217 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03071296, + "balance_loss_mlp": 1.04970837, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.0603730404595915, + "language_loss": 0.79317909, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81520677, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2425, + "time_per_iteration": 2.4607083797454834 + }, + { + "auxiliary_loss_clip": 0.01155851, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.05136847, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4026453111661703, + "language_loss": 0.83269531, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85473925, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2426, + "time_per_iteration": 2.4615883827209473 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.02420735, + "balance_loss_mlp": 1.05100346, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.78851961601388, + "language_loss": 0.86878085, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89073801, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0234375, + "step": 2427, + "time_per_iteration": 2.46846866607666 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.05060291, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9424277979169204, + "language_loss": 0.66795039, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69001138, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2428, + "time_per_iteration": 2.4277987480163574 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.02870345, + "balance_loss_mlp": 1.05036306, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7447652065053452, + "language_loss": 0.8363744, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2429, + "time_per_iteration": 2.5208661556243896 + }, + { + "auxiliary_loss_clip": 0.01152615, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.04804671, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.723947749216575, + "language_loss": 0.78811824, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8101294, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2430, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0115835, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03014231, + "balance_loss_mlp": 1.0529238, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.3723861833809767, + "language_loss": 0.83178174, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85385835, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2431, + "time_per_iteration": 2.468472480773926 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.03244448, + "balance_loss_mlp": 1.05131185, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7402379411604871, + "language_loss": 0.78777766, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80989814, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.078125, + "step": 2432, + "time_per_iteration": 2.4618513584136963 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.03158331, + "balance_loss_mlp": 1.04917812, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9105383938395448, + "language_loss": 0.79940903, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82146859, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2433, + "time_per_iteration": 2.4901435375213623 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.01051119, + "balance_loss_clip": 1.03149712, + "balance_loss_mlp": 1.05186844, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.8984055506020234, + "language_loss": 0.78421938, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80625868, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2434, + "time_per_iteration": 3.833007335662842 + }, + { + "auxiliary_loss_clip": 0.01046525, + "auxiliary_loss_mlp": 0.01005945, + "balance_loss_clip": 1.00356054, + "balance_loss_mlp": 1.01038933, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8674820067375166, + "language_loss": 0.58373666, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60426134, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.36132812, + "step": 2435, + "time_per_iteration": 5.911077499389648 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.02620411, + "balance_loss_mlp": 1.04662895, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.2832294661951753, + "language_loss": 0.88395989, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90589368, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2436, + "time_per_iteration": 2.440303325653076 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.05032742, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.0196076648737, + "language_loss": 0.74832988, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7703594, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2437, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.04798007, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.346268485469047, + "language_loss": 0.73932636, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76131272, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2438, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.02310383, + "balance_loss_mlp": 1.05231857, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.8289443089735578, + "language_loss": 0.74791402, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76987189, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2439, + "time_per_iteration": 2.4596338272094727 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.02872145, + "balance_loss_mlp": 1.04913521, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.106046924266039, + "language_loss": 0.74542844, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742673, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 2440, + "time_per_iteration": 2.613889217376709 + }, + { + "auxiliary_loss_clip": 0.01146734, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02964425, + "balance_loss_mlp": 1.04660702, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6151911954653986, + "language_loss": 0.83047861, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242939, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2441, + "time_per_iteration": 2.508570432662964 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.0275681, + "balance_loss_mlp": 1.04952955, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 3.362343971731744, + "language_loss": 0.71562135, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73766863, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2442, + "time_per_iteration": 2.4903416633605957 + }, + { + "auxiliary_loss_clip": 0.01160717, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.0510819, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.2762909335645043, + "language_loss": 0.80804002, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83006966, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2443, + "time_per_iteration": 2.424539089202881 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.0504694, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 2.077049554342068, + "language_loss": 0.8297509, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85179389, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2444, + "time_per_iteration": 2.4937214851379395 + }, + { + "auxiliary_loss_clip": 0.01154781, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.05025554, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.763635964291881, + "language_loss": 0.71218902, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73422623, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2445, + "time_per_iteration": 2.491645336151123 + }, + { + "auxiliary_loss_clip": 0.01045345, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.00942683, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8232649654452494, + "language_loss": 0.63138294, + "learning_rate": 3.857965866494923e-06, + "loss": 0.6521225, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.359375, + "step": 2446, + "time_per_iteration": 2.9610531330108643 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.02355385, + "balance_loss_mlp": 1.05348802, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8119571313268434, + "language_loss": 0.74937665, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7713967, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2447, + "time_per_iteration": 2.547112226486206 + }, + { + "auxiliary_loss_clip": 0.0115445, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02838051, + "balance_loss_mlp": 1.04998112, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.0554455972062744, + "language_loss": 0.85722244, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87923658, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2448, + "time_per_iteration": 2.519530773162842 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.0081377, + "balance_loss_mlp": 1.00952029, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7649510042513386, + "language_loss": 0.56836212, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58892155, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.359375, + "step": 2449, + "time_per_iteration": 3.0049068927764893 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.04850447, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.900224172693126, + "language_loss": 0.85544562, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87738931, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2450, + "time_per_iteration": 2.5826945304870605 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.05074143, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.029178420182481, + "language_loss": 0.74693608, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76899183, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2451, + "time_per_iteration": 2.4345250129699707 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.02092934, + "balance_loss_mlp": 1.04758763, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6073898366987713, + "language_loss": 0.82240498, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8442679, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2452, + "time_per_iteration": 2.468869924545288 + }, + { + "auxiliary_loss_clip": 0.01154267, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02032936, + "balance_loss_mlp": 1.05154371, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.7191329381743174, + "language_loss": 0.74021572, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76214325, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2453, + "time_per_iteration": 2.433424472808838 + }, + { + "auxiliary_loss_clip": 0.01154761, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.03048682, + "balance_loss_mlp": 1.04918802, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.171465059586897, + "language_loss": 0.76326835, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78531623, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2454, + "time_per_iteration": 2.419368028640747 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.04922831, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.006370127686132, + "language_loss": 0.8301537, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85209435, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2455, + "time_per_iteration": 2.426819324493408 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.04846048, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.442844218228049, + "language_loss": 0.83938581, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8613984, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.09375, + "step": 2456, + "time_per_iteration": 2.525296926498413 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.02459574, + "balance_loss_mlp": 1.04980254, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8839437807359896, + "language_loss": 0.84325618, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86520827, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2457, + "time_per_iteration": 2.471068859100342 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.04932761, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9862753985638202, + "language_loss": 0.75645256, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77835512, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2458, + "time_per_iteration": 2.44146466255188 + }, + { + "auxiliary_loss_clip": 0.01160318, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.0284996, + "balance_loss_mlp": 1.05119324, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.405388225865701, + "language_loss": 0.83817005, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86026746, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2459, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.0489651, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.6666731923680733, + "language_loss": 0.75856471, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78047681, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2460, + "time_per_iteration": 2.4294657707214355 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.05102873, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6904429322803973, + "language_loss": 0.81591463, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83791113, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0703125, + "step": 2461, + "time_per_iteration": 2.4993178844451904 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.05356562, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2471604819605036, + "language_loss": 0.65689576, + "learning_rate": 3.855650475213761e-06, + "loss": 0.678958, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2462, + "time_per_iteration": 2.4197235107421875 + }, + { + "auxiliary_loss_clip": 0.0115574, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.02929282, + "balance_loss_mlp": 1.05148113, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4717210360784851, + "language_loss": 0.67368174, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69572735, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0390625, + "step": 2463, + "time_per_iteration": 2.774268865585327 + }, + { + "auxiliary_loss_clip": 0.01154664, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 1.04978383, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.177919724516607, + "language_loss": 0.76567936, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78772676, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2464, + "time_per_iteration": 2.4522674083709717 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.03089297, + "balance_loss_mlp": 1.05009413, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.623144605896263, + "language_loss": 0.79623306, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81824923, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0234375, + "step": 2465, + "time_per_iteration": 2.4946794509887695 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02543747, + "balance_loss_mlp": 1.0522809, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.8838905575360925, + "language_loss": 0.76230991, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78436887, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2466, + "time_per_iteration": 2.4722483158111572 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01613474, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8013334536894682, + "language_loss": 0.60022712, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62095666, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.3671875, + "step": 2467, + "time_per_iteration": 3.0702927112579346 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.05059397, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.3345318496369405, + "language_loss": 0.87671721, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89869595, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.046875, + "step": 2468, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.0322901, + "balance_loss_mlp": 1.05078602, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 4.884804263226826, + "language_loss": 0.75884396, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78094912, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2469, + "time_per_iteration": 2.4750967025756836 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.03425384, + "balance_loss_mlp": 1.04954958, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.457578452134473, + "language_loss": 0.76183128, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78390741, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2470, + "time_per_iteration": 2.4312937259674072 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.05050206, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9398758609720104, + "language_loss": 0.72121894, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74320322, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2471, + "time_per_iteration": 2.519866466522217 + }, + { + "auxiliary_loss_clip": 0.01160204, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.0272181, + "balance_loss_mlp": 1.0499022, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.11598070664324, + "language_loss": 0.89739621, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91947466, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1015625, + "step": 2472, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_clip": 1.030123, + "balance_loss_mlp": 1.05059123, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 4.013793804030176, + "language_loss": 0.80734539, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82939184, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2473, + "time_per_iteration": 2.4329466819763184 + }, + { + "auxiliary_loss_clip": 0.0115911, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.05129409, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5981192604624526, + "language_loss": 0.77540123, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79762381, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2474, + "time_per_iteration": 2.453432083129883 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.04955983, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8103491271764227, + "language_loss": 0.82315612, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84531218, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0859375, + "step": 2475, + "time_per_iteration": 2.4591174125671387 + }, + { + "auxiliary_loss_clip": 0.01157844, + "auxiliary_loss_mlp": 0.01058234, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.05399168, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9240192853863896, + "language_loss": 0.80811602, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0390625, + "step": 2476, + "time_per_iteration": 3.810553789138794 + }, + { + "auxiliary_loss_clip": 0.01148934, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.03467607, + "balance_loss_mlp": 1.05016851, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.8396010916090604, + "language_loss": 0.77889222, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80091178, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98828125, + "step": 2477, + "time_per_iteration": 4.031312942504883 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.01018076, + "balance_loss_clip": 1.01581085, + "balance_loss_mlp": 1.01302671, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8050876444063699, + "language_loss": 0.60130364, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197196, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.35742188, + "step": 2478, + "time_per_iteration": 3.1073787212371826 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.05078554, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.232556799389181, + "language_loss": 0.70951897, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7315169, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2479, + "time_per_iteration": 2.475215435028076 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.0313679, + "balance_loss_mlp": 1.04886127, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5896653051626852, + "language_loss": 0.80748487, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2480, + "time_per_iteration": 2.4618492126464844 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.02838397, + "balance_loss_mlp": 1.05017209, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.4101793906634894, + "language_loss": 0.84132183, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2481, + "time_per_iteration": 2.437391519546509 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03046227, + "balance_loss_mlp": 1.04808569, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 3.194199563979109, + "language_loss": 0.77347398, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79551256, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.046875, + "step": 2482, + "time_per_iteration": 2.4710068702697754 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01939583, + "balance_loss_mlp": 1.05186439, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.968394626295353, + "language_loss": 0.78719991, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80922014, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2483, + "time_per_iteration": 2.5075182914733887 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.04774714, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.642113570978582, + "language_loss": 0.70521605, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72709513, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 1.0, + "step": 2484, + "time_per_iteration": 2.4810657501220703 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02049971, + "balance_loss_mlp": 1.04769683, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.5518326423103654, + "language_loss": 0.84396368, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86592442, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0703125, + "step": 2485, + "time_per_iteration": 2.47004771232605 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.04906201, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1854599778658663, + "language_loss": 0.84902173, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87102306, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2486, + "time_per_iteration": 2.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.04672825, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.4579579723442855, + "language_loss": 0.74329305, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76516318, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 1.015625, + "step": 2487, + "time_per_iteration": 2.436316967010498 + }, + { + "auxiliary_loss_clip": 0.01148703, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.02934861, + "balance_loss_mlp": 1.04707325, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.1423480103066375, + "language_loss": 0.71837348, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74034101, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2488, + "time_per_iteration": 2.649794816970825 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02780962, + "balance_loss_mlp": 1.04946375, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.5167610907777513, + "language_loss": 0.70519507, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72722483, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0703125, + "step": 2489, + "time_per_iteration": 2.416708469390869 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.02637911, + "balance_loss_mlp": 1.04785299, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 6.063777716142612, + "language_loss": 0.81789696, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83988589, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2490, + "time_per_iteration": 2.433284282684326 + }, + { + "auxiliary_loss_clip": 0.0115747, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.02357852, + "balance_loss_mlp": 1.05097246, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.781748843431282, + "language_loss": 0.79878485, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82078111, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2491, + "time_per_iteration": 2.616642475128174 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.02491403, + "balance_loss_mlp": 1.04683256, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.263792295832721, + "language_loss": 0.90779251, + "learning_rate": 3.851260581551727e-06, + "loss": 0.9297986, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.078125, + "step": 2492, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.04883122, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.7210225604175116, + "language_loss": 0.79162109, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8137427, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2493, + "time_per_iteration": 2.4228014945983887 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02299631, + "balance_loss_mlp": 1.04643607, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.8095511996528297, + "language_loss": 0.80186284, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82380015, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2494, + "time_per_iteration": 2.4774162769317627 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.04731214, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9697458415941205, + "language_loss": 0.65825832, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68021536, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.015625, + "step": 2495, + "time_per_iteration": 2.87758207321167 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 0.99992257, + "balance_loss_mlp": 1.01668406, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 1.1924806916138095, + "language_loss": 0.59488082, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61543506, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2496, + "time_per_iteration": 3.0807061195373535 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0468092, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.296903755979897, + "language_loss": 0.65457296, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0546875, + "step": 2497, + "time_per_iteration": 2.4403655529022217 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.03021121, + "balance_loss_mlp": 1.05125117, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4500790349521295, + "language_loss": 0.75247943, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77452457, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2498, + "time_per_iteration": 2.5286927223205566 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04910398, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.1627878003877257, + "language_loss": 0.72073609, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74272656, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2499, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.03001857, + "balance_loss_mlp": 1.04765654, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.7935878764928508, + "language_loss": 0.7195605, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74158442, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2500, + "time_per_iteration": 2.5504300594329834 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.04960001, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.491284008551419, + "language_loss": 0.64973354, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67184103, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.03125, + "step": 2501, + "time_per_iteration": 2.587292432785034 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03940582, + "balance_loss_mlp": 1.04861319, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0240839018319, + "language_loss": 0.83043593, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85256565, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2502, + "time_per_iteration": 2.470350980758667 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01050766, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.04702473, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.3174234065433597, + "language_loss": 0.77197748, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2503, + "time_per_iteration": 2.6598432064056396 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.04901898, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1270494317377007, + "language_loss": 0.85432625, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87628305, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2504, + "time_per_iteration": 2.7323355674743652 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04855871, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6383963769174188, + "language_loss": 0.83226919, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85418344, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.015625, + "step": 2505, + "time_per_iteration": 2.4866323471069336 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.04672468, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.268670074130615, + "language_loss": 0.7639147, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78588635, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0390625, + "step": 2506, + "time_per_iteration": 2.4266390800476074 + }, + { + "auxiliary_loss_clip": 0.01156061, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.04987144, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 4.189374997051622, + "language_loss": 0.76202261, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78401417, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2507, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.02519584, + "balance_loss_mlp": 1.04538798, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.4120052182021503, + "language_loss": 0.69041586, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71230054, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2508, + "time_per_iteration": 2.4462738037109375 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.02870142, + "balance_loss_mlp": 1.05190873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8904486830015208, + "language_loss": 0.77516425, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79719174, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2509, + "time_per_iteration": 2.47723126411438 + }, + { + "auxiliary_loss_clip": 0.01160822, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.0307281, + "balance_loss_mlp": 1.05027628, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.607083522867767, + "language_loss": 0.80497003, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82710105, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1015625, + "step": 2510, + "time_per_iteration": 2.4445176124572754 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.0336144, + "balance_loss_mlp": 1.05078745, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.033214689307001, + "language_loss": 0.73913604, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76124156, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2511, + "time_per_iteration": 2.4372222423553467 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.04880548, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.077792778828972, + "language_loss": 0.6935091, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71543926, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.03125, + "step": 2512, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01154623, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.05130434, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 3.0703205269170364, + "language_loss": 0.73833334, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76034975, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.03125, + "step": 2513, + "time_per_iteration": 2.5560262203216553 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 0.99995023, + "balance_loss_mlp": 1.01588845, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8742342414591, + "language_loss": 0.64759278, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6681329, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.36328125, + "step": 2514, + "time_per_iteration": 3.0147135257720947 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.02588964, + "balance_loss_mlp": 1.04910421, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.6951033245551597, + "language_loss": 0.73257691, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75452447, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2515, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04967082, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8637331039353218, + "language_loss": 0.76990104, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79184443, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2516, + "time_per_iteration": 2.4672725200653076 + }, + { + "auxiliary_loss_clip": 0.01049641, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.01351547, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.745436195681612, + "language_loss": 0.54673135, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56726485, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36132812, + "step": 2517, + "time_per_iteration": 3.0677855014801025 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02126849, + "balance_loss_mlp": 1.04780149, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.2326216563166983, + "language_loss": 0.78515786, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8070842, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2518, + "time_per_iteration": 3.8305110931396484 + }, + { + "auxiliary_loss_clip": 0.01159011, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.05163026, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1364726943924772, + "language_loss": 0.70153689, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72361219, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2519, + "time_per_iteration": 3.9920616149902344 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.04812384, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9802508383478334, + "language_loss": 0.79219216, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81415105, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2520, + "time_per_iteration": 2.4853925704956055 + }, + { + "auxiliary_loss_clip": 0.01155647, + "auxiliary_loss_mlp": 0.01050752, + "balance_loss_clip": 1.03090394, + "balance_loss_mlp": 1.05067897, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.075013959426641, + "language_loss": 0.74324691, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76531088, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2521, + "time_per_iteration": 2.6154706478118896 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.05273759, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.7623729867934737, + "language_loss": 0.81996739, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84203184, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.078125, + "step": 2522, + "time_per_iteration": 2.4873530864715576 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01001752, + "balance_loss_clip": 0.99982071, + "balance_loss_mlp": 1.01252866, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.824359498034346, + "language_loss": 0.57915509, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59966022, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36328125, + "step": 2523, + "time_per_iteration": 2.998990774154663 + }, + { + "auxiliary_loss_clip": 0.01153336, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.03147376, + "balance_loss_mlp": 1.04972816, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.970015434384356, + "language_loss": 0.7485956, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77063495, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2524, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.0115237, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.0488894, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8388163356316347, + "language_loss": 0.74780655, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76977956, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2525, + "time_per_iteration": 2.431143283843994 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.02432156, + "balance_loss_mlp": 1.05145812, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8962457769996104, + "language_loss": 0.79644465, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81845224, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2526, + "time_per_iteration": 2.5167391300201416 + }, + { + "auxiliary_loss_clip": 0.01151222, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.0271492, + "balance_loss_mlp": 1.05228162, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8025865198757494, + "language_loss": 0.84928662, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87124068, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9921875, + "step": 2527, + "time_per_iteration": 2.4550719261169434 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02321947, + "balance_loss_mlp": 1.04876995, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2810224367730156, + "language_loss": 0.69326001, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71518755, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.03125, + "step": 2528, + "time_per_iteration": 2.610042095184326 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0288415, + "balance_loss_mlp": 1.05137038, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.490892546855648, + "language_loss": 0.86502308, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88703495, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2529, + "time_per_iteration": 2.4695634841918945 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.04683101, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.8772276619965056, + "language_loss": 0.83002013, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85188091, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2530, + "time_per_iteration": 2.476238489151001 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.02550209, + "balance_loss_mlp": 1.04987955, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.344030506991615, + "language_loss": 0.80540878, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82738853, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2531, + "time_per_iteration": 2.443617105484009 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.03138137, + "balance_loss_mlp": 1.04895151, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0816362099746017, + "language_loss": 0.79241651, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81440473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.0078125, + "step": 2532, + "time_per_iteration": 2.5071239471435547 + }, + { + "auxiliary_loss_clip": 0.0115001, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.02694106, + "balance_loss_mlp": 1.04952455, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.8298502444413876, + "language_loss": 0.87712961, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89909488, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2533, + "time_per_iteration": 2.5262463092803955 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02932572, + "balance_loss_mlp": 1.04766071, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.2606742211331556, + "language_loss": 0.79057097, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81255192, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.03125, + "step": 2534, + "time_per_iteration": 2.4421815872192383 + }, + { + "auxiliary_loss_clip": 0.01156406, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.02177238, + "balance_loss_mlp": 1.04847312, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.8989864742133933, + "language_loss": 0.76862979, + "learning_rate": 3.844858260274702e-06, + "loss": 0.7906096, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2535, + "time_per_iteration": 2.4193530082702637 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02153718, + "balance_loss_mlp": 1.04885459, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.234687708038525, + "language_loss": 0.78185135, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80381751, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0703125, + "step": 2536, + "time_per_iteration": 2.478066921234131 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.03305459, + "balance_loss_mlp": 1.05067229, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.124557148089124, + "language_loss": 0.74979979, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77181387, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2537, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01152934, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02545929, + "balance_loss_mlp": 1.04965043, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.005826380833244, + "language_loss": 0.77631724, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79828459, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2538, + "time_per_iteration": 2.527730941772461 + }, + { + "auxiliary_loss_clip": 0.01147714, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.02308786, + "balance_loss_mlp": 1.04806781, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6961003069906246, + "language_loss": 0.89707708, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9189558, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.99609375, + "step": 2539, + "time_per_iteration": 2.485410451889038 + }, + { + "auxiliary_loss_clip": 0.01151642, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.05028892, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1834515010765627, + "language_loss": 0.93514961, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95709753, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.015625, + "step": 2540, + "time_per_iteration": 2.5399627685546875 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0266571, + "balance_loss_mlp": 1.04625463, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.9271166035098393, + "language_loss": 0.75039941, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77228808, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2541, + "time_per_iteration": 2.516559362411499 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.03025603, + "balance_loss_mlp": 1.04787207, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7480154890803248, + "language_loss": 0.81308234, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83504558, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.99609375, + "step": 2542, + "time_per_iteration": 2.4681694507598877 + }, + { + "auxiliary_loss_clip": 0.01150381, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.03213799, + "balance_loss_mlp": 1.04772067, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.009812895323552, + "language_loss": 0.77568293, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79769456, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2543, + "time_per_iteration": 2.4899120330810547 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.04692626, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.3128696364379935, + "language_loss": 0.86483204, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675725, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2544, + "time_per_iteration": 2.4774844646453857 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.03287029, + "balance_loss_mlp": 1.04675508, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0966566192890106, + "language_loss": 0.8228749, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84493077, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0546875, + "step": 2545, + "time_per_iteration": 2.4526925086975098 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.04802954, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.540509049886226, + "language_loss": 0.70711339, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72905338, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2546, + "time_per_iteration": 2.5009732246398926 + }, + { + "auxiliary_loss_clip": 0.01152354, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.02423596, + "balance_loss_mlp": 1.04967904, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5770850469719229, + "language_loss": 0.77521312, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2547, + "time_per_iteration": 2.6822421550750732 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.01047861, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 1.04904902, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0900989153424976, + "language_loss": 0.73985445, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76185566, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2548, + "time_per_iteration": 2.59080171585083 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03158915, + "balance_loss_mlp": 1.04806828, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.499185349529517, + "language_loss": 0.80589813, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82791066, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2549, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01048572, + "balance_loss_clip": 1.02951026, + "balance_loss_mlp": 1.04750037, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.687491024735964, + "language_loss": 0.74760693, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76959932, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2550, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01153822, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.02035773, + "balance_loss_mlp": 1.04903841, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.205632522725416, + "language_loss": 0.76839805, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79033309, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2551, + "time_per_iteration": 2.468886375427246 + }, + { + "auxiliary_loss_clip": 0.01045401, + "auxiliary_loss_mlp": 0.01020401, + "balance_loss_clip": 1.01873255, + "balance_loss_mlp": 1.0102303, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9464853846906186, + "language_loss": 0.56666422, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58732224, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.3515625, + "step": 2552, + "time_per_iteration": 3.0059380531311035 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.04793155, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.2490122092843947, + "language_loss": 0.88505352, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703511, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2553, + "time_per_iteration": 2.4523322582244873 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.02858269, + "balance_loss_mlp": 1.04771137, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8003580088176259, + "language_loss": 0.78462374, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80663538, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2554, + "time_per_iteration": 2.48526668548584 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03212881, + "balance_loss_mlp": 1.04941773, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.4926146542113763, + "language_loss": 0.78344929, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80551672, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2555, + "time_per_iteration": 2.4687228202819824 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053536, + "balance_loss_clip": 1.03543973, + "balance_loss_mlp": 1.04890609, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6634961059278193, + "language_loss": 0.76901627, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.7910428, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2556, + "time_per_iteration": 2.5006635189056396 + }, + { + "auxiliary_loss_clip": 0.01145988, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.02362633, + "balance_loss_mlp": 1.04657805, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8623555031997667, + "language_loss": 0.89489496, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9167788, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2557, + "time_per_iteration": 2.4434657096862793 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.0263536, + "balance_loss_mlp": 1.04834831, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.0348980361104587, + "language_loss": 0.7119934, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73397368, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2558, + "time_per_iteration": 2.490226984024048 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.04888546, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2935483083292705, + "language_loss": 0.92370701, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94570613, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2559, + "time_per_iteration": 3.885131597518921 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_clip": 1.03331971, + "balance_loss_mlp": 1.05068171, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 5.140445938018919, + "language_loss": 0.63637704, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65846419, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2560, + "time_per_iteration": 5.343889236450195 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02725124, + "balance_loss_mlp": 1.04950392, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8613162525264346, + "language_loss": 0.88230681, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90431374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2561, + "time_per_iteration": 2.4648611545562744 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.02433765, + "balance_loss_mlp": 1.0477581, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8458305826175445, + "language_loss": 0.82909077, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85096323, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 2562, + "time_per_iteration": 2.4327874183654785 + }, + { + "auxiliary_loss_clip": 0.01160792, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.02614117, + "balance_loss_mlp": 1.05274105, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8513620412223286, + "language_loss": 0.74713194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7692166, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.078125, + "step": 2563, + "time_per_iteration": 2.4246435165405273 + }, + { + "auxiliary_loss_clip": 0.01152598, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.04708791, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 4.308351588789828, + "language_loss": 0.75896233, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2564, + "time_per_iteration": 2.5528018474578857 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.03233564, + "balance_loss_mlp": 1.04782677, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.9915177170702032, + "language_loss": 0.70825899, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73026133, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2565, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.04728019, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.308308002927142, + "language_loss": 0.71535969, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73736489, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0234375, + "step": 2566, + "time_per_iteration": 2.498033285140991 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.02379811, + "balance_loss_mlp": 1.04381752, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7584763964610812, + "language_loss": 0.85129261, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87315124, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0, + "step": 2567, + "time_per_iteration": 2.46708083152771 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.0491097, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.4904852760766127, + "language_loss": 0.78025472, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80226958, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2568, + "time_per_iteration": 2.476029634475708 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.01974905, + "balance_loss_mlp": 1.04835856, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.967048361077992, + "language_loss": 0.70183134, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72373807, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2569, + "time_per_iteration": 2.4566383361816406 + }, + { + "auxiliary_loss_clip": 0.011445, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.04563344, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7954711420319855, + "language_loss": 0.76502788, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78690279, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2570, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.04811645, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 7.2402617485583525, + "language_loss": 0.77214551, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7940833, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2571, + "time_per_iteration": 2.4532222747802734 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.04835165, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.949392721600437, + "language_loss": 0.82254899, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84454399, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2572, + "time_per_iteration": 2.4919703006744385 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.02838445, + "balance_loss_mlp": 1.04827368, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.621727953381826, + "language_loss": 0.90506172, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92705798, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2573, + "time_per_iteration": 2.4679911136627197 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.03724563, + "balance_loss_mlp": 1.04919529, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7899098306423509, + "language_loss": 0.70378339, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72587025, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2574, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01150284, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 1.04641008, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.761755301023602, + "language_loss": 0.82718939, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84917951, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 2575, + "time_per_iteration": 2.4515788555145264 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.0456214, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.21774000772259, + "language_loss": 0.84661531, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86859256, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2576, + "time_per_iteration": 2.5052003860473633 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02414751, + "balance_loss_mlp": 1.04679108, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.816317520286285, + "language_loss": 0.81942815, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84135079, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2577, + "time_per_iteration": 2.5133895874023438 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.04980743, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.384736720709717, + "language_loss": 0.76260924, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78462768, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2578, + "time_per_iteration": 2.5140793323516846 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.02630556, + "balance_loss_mlp": 1.04832911, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.651100693067537, + "language_loss": 0.82420707, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84617954, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2579, + "time_per_iteration": 2.4410548210144043 + }, + { + "auxiliary_loss_clip": 0.01152359, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.03056741, + "balance_loss_mlp": 1.05137682, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6356270056083286, + "language_loss": 0.80460835, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2580, + "time_per_iteration": 2.457929849624634 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01014696, + "balance_loss_clip": 1.0128479, + "balance_loss_mlp": 1.01473403, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.84873853717235, + "language_loss": 0.58840239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60905427, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.35742188, + "step": 2581, + "time_per_iteration": 3.1725480556488037 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02615237, + "balance_loss_mlp": 1.04869819, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8637973548327127, + "language_loss": 0.85214508, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87412429, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2582, + "time_per_iteration": 2.486454963684082 + }, + { + "auxiliary_loss_clip": 0.01150766, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.04837251, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.457099081417407, + "language_loss": 0.78432047, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80638009, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0234375, + "step": 2583, + "time_per_iteration": 2.468686580657959 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.04853427, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6727812592242826, + "language_loss": 0.76121294, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78327382, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2584, + "time_per_iteration": 2.5471444129943848 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02746594, + "balance_loss_mlp": 1.04740906, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0194610159936324, + "language_loss": 0.75623107, + "learning_rate": 3.837251082205368e-06, + "loss": 0.7781868, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2585, + "time_per_iteration": 2.4448020458221436 + }, + { + "auxiliary_loss_clip": 0.01146182, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03101528, + "balance_loss_mlp": 1.04662418, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.233481730992117, + "language_loss": 0.611651, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63361114, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2586, + "time_per_iteration": 2.4375994205474854 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02814651, + "balance_loss_mlp": 1.04623449, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8194244944539537, + "language_loss": 0.8108865, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83286583, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.046875, + "step": 2587, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.04851258, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.8978014455674168, + "language_loss": 0.88844347, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91058075, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.0625, + "step": 2588, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01150101, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.03351235, + "balance_loss_mlp": 1.04859662, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.6765596364055266, + "language_loss": 0.64950025, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6715408, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.015625, + "step": 2589, + "time_per_iteration": 2.5106732845306396 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.0483036, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7614316666112095, + "language_loss": 0.82610166, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84805739, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2590, + "time_per_iteration": 2.519573211669922 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.02761662, + "balance_loss_mlp": 1.04740536, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1478399705358195, + "language_loss": 0.78919029, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81117558, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2591, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01051358, + "balance_loss_clip": 1.03271413, + "balance_loss_mlp": 1.04902434, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 1.9877262596002243, + "language_loss": 0.64780253, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66981632, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2592, + "time_per_iteration": 2.5992095470428467 + }, + { + "auxiliary_loss_clip": 0.01156577, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.03195322, + "balance_loss_mlp": 1.0518856, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.6077304694487062, + "language_loss": 0.81806099, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84015012, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2593, + "time_per_iteration": 2.4317471981048584 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02876306, + "balance_loss_mlp": 1.04862404, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.3131099691306445, + "language_loss": 0.72585857, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7478416, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0, + "step": 2594, + "time_per_iteration": 2.454946994781494 + }, + { + "auxiliary_loss_clip": 0.01145676, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.02514088, + "balance_loss_mlp": 1.0476191, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 1.980280068020953, + "language_loss": 0.8170377, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83893895, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 2595, + "time_per_iteration": 2.4859232902526855 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.04722846, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.3729637830877177, + "language_loss": 0.86587811, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88784146, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2596, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.02558839, + "balance_loss_mlp": 1.04831815, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6624104890405602, + "language_loss": 0.68610018, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70800316, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2597, + "time_per_iteration": 2.447265625 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.02210891, + "balance_loss_mlp": 1.04714298, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.638980754682227, + "language_loss": 0.79885375, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82070029, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2598, + "time_per_iteration": 2.4641571044921875 + }, + { + "auxiliary_loss_clip": 0.01141262, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02003777, + "balance_loss_mlp": 1.04484367, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.19687533686526, + "language_loss": 0.82877028, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85057342, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96484375, + "step": 2599, + "time_per_iteration": 2.419464111328125 + }, + { + "auxiliary_loss_clip": 0.01155461, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.04991198, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.412785735027946, + "language_loss": 0.81813747, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84021574, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2600, + "time_per_iteration": 2.408848524093628 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.02813435, + "balance_loss_mlp": 1.05145574, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8570517134994367, + "language_loss": 0.8869983, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90900552, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2601, + "time_per_iteration": 3.8960022926330566 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.04250216, + "balance_loss_mlp": 1.05294669, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6572791804428935, + "language_loss": 0.78657669, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80877781, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0234375, + "step": 2602, + "time_per_iteration": 5.330498456954956 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.02178836, + "balance_loss_mlp": 1.04872918, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.9481072701353659, + "language_loss": 0.73668396, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75858229, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.99609375, + "step": 2603, + "time_per_iteration": 2.4632985591888428 + }, + { + "auxiliary_loss_clip": 0.01152236, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.03205693, + "balance_loss_mlp": 1.05066442, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.4624008692922583, + "language_loss": 0.87223339, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89427507, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2604, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.02404523, + "balance_loss_mlp": 1.04892218, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.883819023069068, + "language_loss": 0.85465723, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87660539, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2605, + "time_per_iteration": 2.4958839416503906 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0274334, + "balance_loss_mlp": 1.04840827, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.4518366617864897, + "language_loss": 0.72954321, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75154853, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2606, + "time_per_iteration": 2.5142898559570312 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.05257165, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.9820673877795116, + "language_loss": 0.7643044, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78635812, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2607, + "time_per_iteration": 2.433779239654541 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0282656, + "balance_loss_mlp": 1.05097091, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7850270515341367, + "language_loss": 0.8191157, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8410849, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2608, + "time_per_iteration": 2.4599456787109375 + }, + { + "auxiliary_loss_clip": 0.0115477, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.05087662, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.762197880640894, + "language_loss": 0.72479111, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74684954, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0390625, + "step": 2609, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.02415729, + "balance_loss_mlp": 1.04881263, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8833233307981396, + "language_loss": 0.71974212, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74171209, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.046875, + "step": 2610, + "time_per_iteration": 2.468616247177124 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03119481, + "balance_loss_mlp": 1.04865789, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0486839750324117, + "language_loss": 0.72148776, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2611, + "time_per_iteration": 2.4812967777252197 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.05081797, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1526303920645153, + "language_loss": 0.70732605, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72930443, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2612, + "time_per_iteration": 2.4659905433654785 + }, + { + "auxiliary_loss_clip": 0.0115345, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.05112672, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.98698506128839, + "language_loss": 0.75649011, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77856034, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2613, + "time_per_iteration": 2.5053935050964355 + }, + { + "auxiliary_loss_clip": 0.01150247, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.03454411, + "balance_loss_mlp": 1.04870725, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7256548803860323, + "language_loss": 0.6593504, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68139917, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2614, + "time_per_iteration": 2.49568772315979 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01050381, + "balance_loss_clip": 1.02972233, + "balance_loss_mlp": 1.04979289, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1509467282749055, + "language_loss": 0.7554003, + "learning_rate": 3.832603126688072e-06, + "loss": 0.7774539, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0546875, + "step": 2615, + "time_per_iteration": 2.529383420944214 + }, + { + "auxiliary_loss_clip": 0.0115204, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.03374028, + "balance_loss_mlp": 1.05295634, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.616950748432624, + "language_loss": 0.72989607, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75194162, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9921875, + "step": 2616, + "time_per_iteration": 2.5096960067749023 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.03453839, + "balance_loss_mlp": 1.04991412, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.5663633553154774, + "language_loss": 0.72316766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74524403, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2617, + "time_per_iteration": 2.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01151577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.02453637, + "balance_loss_mlp": 1.05169988, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.0296559288157563, + "language_loss": 0.74336463, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76531827, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2618, + "time_per_iteration": 2.4584109783172607 + }, + { + "auxiliary_loss_clip": 0.01156356, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02976644, + "balance_loss_mlp": 1.05079079, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.116136233608656, + "language_loss": 0.78624105, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80832201, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0546875, + "step": 2619, + "time_per_iteration": 2.481902837753296 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.05213726, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.705564128099723, + "language_loss": 0.76632881, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78837597, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2620, + "time_per_iteration": 2.432645082473755 + }, + { + "auxiliary_loss_clip": 0.01153614, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.02901375, + "balance_loss_mlp": 1.05096626, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7942321132139696, + "language_loss": 0.70836174, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73039794, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2621, + "time_per_iteration": 2.5259244441986084 + }, + { + "auxiliary_loss_clip": 0.01156472, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.0277524, + "balance_loss_mlp": 1.05222857, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.5825564073202467, + "language_loss": 0.71880406, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74086076, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2622, + "time_per_iteration": 2.738351583480835 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02826762, + "balance_loss_mlp": 1.05162704, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7275011876813262, + "language_loss": 0.87603116, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89804244, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2623, + "time_per_iteration": 2.439276695251465 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.05301619, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7488793041913886, + "language_loss": 0.82132548, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84332693, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0078125, + "step": 2624, + "time_per_iteration": 2.5011823177337646 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.02720022, + "balance_loss_mlp": 1.0518285, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.213311097116894, + "language_loss": 0.79965818, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82170242, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2625, + "time_per_iteration": 2.469705581665039 + }, + { + "auxiliary_loss_clip": 0.01152837, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.03215635, + "balance_loss_mlp": 1.05189955, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0497226184185044, + "language_loss": 0.80393386, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82597172, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2626, + "time_per_iteration": 2.4822630882263184 + }, + { + "auxiliary_loss_clip": 0.01157567, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.02703679, + "balance_loss_mlp": 1.05660009, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8439314798963051, + "language_loss": 0.73819017, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76023501, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0078125, + "step": 2627, + "time_per_iteration": 2.5146384239196777 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01055765, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.05136025, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.581375347872909, + "language_loss": 0.84926289, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87135696, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0234375, + "step": 2628, + "time_per_iteration": 2.476461172103882 + }, + { + "auxiliary_loss_clip": 0.01152526, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02045107, + "balance_loss_mlp": 1.05181646, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9330212081502065, + "language_loss": 0.76414472, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2629, + "time_per_iteration": 2.4604575634002686 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03277516, + "balance_loss_mlp": 1.05376625, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.3335878107949624, + "language_loss": 0.73786485, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7599746, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0390625, + "step": 2630, + "time_per_iteration": 2.4556961059570312 + }, + { + "auxiliary_loss_clip": 0.01159154, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02934527, + "balance_loss_mlp": 1.05278432, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.0799062126580385, + "language_loss": 0.83732498, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85941184, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2631, + "time_per_iteration": 2.46466326713562 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 1.05072045, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8231521117013414, + "language_loss": 0.78509778, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80711424, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2632, + "time_per_iteration": 2.4678170680999756 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01058182, + "balance_loss_clip": 1.03766572, + "balance_loss_mlp": 1.05516291, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.1429957658458374, + "language_loss": 0.83250827, + "learning_rate": 3.829784322464594e-06, + "loss": 0.8546921, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2633, + "time_per_iteration": 2.4329495429992676 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.02641928, + "balance_loss_mlp": 1.05591452, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.9651575849984717, + "language_loss": 0.77401066, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79609084, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2634, + "time_per_iteration": 2.4989452362060547 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.05281138, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.178604932363088, + "language_loss": 0.89144027, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91352272, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0546875, + "step": 2635, + "time_per_iteration": 2.45926570892334 + }, + { + "auxiliary_loss_clip": 0.0115666, + "auxiliary_loss_mlp": 0.0105615, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.05145168, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.07071202721755, + "language_loss": 0.75814605, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78027415, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2636, + "time_per_iteration": 2.4601919651031494 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.05383635, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.192817266182781, + "language_loss": 0.72065628, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74272561, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.0625, + "step": 2637, + "time_per_iteration": 2.6509416103363037 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.05307317, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.9644709833035638, + "language_loss": 0.77938193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80135739, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2638, + "time_per_iteration": 2.516597032546997 + }, + { + "auxiliary_loss_clip": 0.01160159, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.05348861, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.8473853011869859, + "language_loss": 0.75521988, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77744359, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0625, + "step": 2639, + "time_per_iteration": 2.5517024993896484 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04115009, + "balance_loss_mlp": 1.0541048, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.7935559917311212, + "language_loss": 0.81487972, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83708692, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0546875, + "step": 2640, + "time_per_iteration": 2.5613112449645996 + }, + { + "auxiliary_loss_clip": 0.01152653, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.05107331, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4887809421561018, + "language_loss": 0.67051661, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69255233, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2641, + "time_per_iteration": 2.5603220462799072 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01057677, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.05338526, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.894977763056953, + "language_loss": 0.7508198, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77302957, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2642, + "time_per_iteration": 2.4783003330230713 + }, + { + "auxiliary_loss_clip": 0.01154514, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.0343703, + "balance_loss_mlp": 1.05342579, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1233146618452046, + "language_loss": 0.70096999, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72305882, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2643, + "time_per_iteration": 3.8417530059814453 + }, + { + "auxiliary_loss_clip": 0.0115474, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.05399418, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.266510625665779, + "language_loss": 0.78172421, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80374151, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2644, + "time_per_iteration": 3.918332099914551 + }, + { + "auxiliary_loss_clip": 0.01155626, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.03432608, + "balance_loss_mlp": 1.05189228, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8745538844001242, + "language_loss": 0.82203078, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84413457, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2645, + "time_per_iteration": 2.484264373779297 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01055562, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.05192447, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.974907168100252, + "language_loss": 0.69778836, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71991032, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2646, + "time_per_iteration": 2.5406665802001953 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.05206954, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5066454352116914, + "language_loss": 0.62659109, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64856541, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 2647, + "time_per_iteration": 2.442711353302002 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03226066, + "balance_loss_mlp": 1.05410099, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.1253745247586204, + "language_loss": 0.8942067, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91628385, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2648, + "time_per_iteration": 2.4649319648742676 + }, + { + "auxiliary_loss_clip": 0.01152722, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.05391204, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8651001097947648, + "language_loss": 0.91716385, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93918669, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 2649, + "time_per_iteration": 2.451261520385742 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.02802217, + "balance_loss_mlp": 1.05272281, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3226984417644028, + "language_loss": 0.71273595, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73485881, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1015625, + "step": 2650, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01153823, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.02695203, + "balance_loss_mlp": 1.05372715, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.4247432930640898, + "language_loss": 0.71116996, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73315561, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0, + "step": 2651, + "time_per_iteration": 2.467451572418213 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02912855, + "balance_loss_mlp": 1.0513978, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.909821572556346, + "language_loss": 0.7997523, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82179999, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2652, + "time_per_iteration": 2.519624948501587 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02665794, + "balance_loss_mlp": 1.05385149, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.695147262103697, + "language_loss": 0.70050812, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72250587, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2653, + "time_per_iteration": 2.439445972442627 + }, + { + "auxiliary_loss_clip": 0.01154814, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02309155, + "balance_loss_mlp": 1.05308652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.046273350718209, + "language_loss": 0.76509416, + "learning_rate": 3.826467306608095e-06, + "loss": 0.7870729, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2654, + "time_per_iteration": 2.529644012451172 + }, + { + "auxiliary_loss_clip": 0.01154147, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02750051, + "balance_loss_mlp": 1.0526185, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.961582700797155, + "language_loss": 0.8208828, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84289569, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2655, + "time_per_iteration": 2.4841158390045166 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.05125904, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.077546195878165, + "language_loss": 0.73565602, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75770259, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2656, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02766216, + "balance_loss_mlp": 1.05170095, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.884771930829773, + "language_loss": 0.77508467, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79704326, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2657, + "time_per_iteration": 2.801560401916504 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01048143, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.05459499, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6493844029380673, + "language_loss": 0.74807733, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77010089, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.99609375, + "step": 2658, + "time_per_iteration": 2.4434328079223633 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02912521, + "balance_loss_mlp": 1.05291355, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.8153435843839463, + "language_loss": 0.75194407, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77400887, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2659, + "time_per_iteration": 2.587700366973877 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.03295422, + "balance_loss_mlp": 1.05531979, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.4521775760186526, + "language_loss": 0.90417045, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92629218, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2660, + "time_per_iteration": 2.45237398147583 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.0300889, + "balance_loss_mlp": 1.05822825, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.0123178843036373, + "language_loss": 0.77552611, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79764044, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2661, + "time_per_iteration": 2.574652910232544 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_clip": 1.02913153, + "balance_loss_mlp": 1.05460262, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7348749157972516, + "language_loss": 0.74735796, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76943737, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2662, + "time_per_iteration": 2.506974935531616 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_clip": 1.03233898, + "balance_loss_mlp": 1.05416894, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.0770925688556074, + "language_loss": 0.82047677, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84257245, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2663, + "time_per_iteration": 2.459630012512207 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.02978826, + "balance_loss_mlp": 1.05576038, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5815812177362454, + "language_loss": 0.7910682, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81316602, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2664, + "time_per_iteration": 2.4978790283203125 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.8148985254226184, + "language_loss": 0.93767202, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95974499, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2665, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.02750635, + "balance_loss_mlp": 1.05352151, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.9534389472193405, + "language_loss": 0.85255575, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87460762, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.046875, + "step": 2666, + "time_per_iteration": 2.4229867458343506 + }, + { + "auxiliary_loss_clip": 0.01155877, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.02899504, + "balance_loss_mlp": 1.05404496, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.873987360542769, + "language_loss": 0.81461811, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83665401, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2667, + "time_per_iteration": 2.4989583492279053 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.03104627, + "balance_loss_mlp": 1.05707479, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.676276626789842, + "language_loss": 0.74079859, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76287973, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0078125, + "step": 2668, + "time_per_iteration": 2.463395357131958 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.05527806, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6382268793433732, + "language_loss": 0.77214229, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79424524, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2669, + "time_per_iteration": 2.5107781887054443 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01008943, + "balance_loss_clip": 1.0062964, + "balance_loss_mlp": 1.0249362, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072457077707946, + "language_loss": 0.55571371, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57640231, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.34960938, + "step": 2670, + "time_per_iteration": 2.964386463165283 + }, + { + "auxiliary_loss_clip": 0.01157188, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.05379438, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 8.31640977393562, + "language_loss": 0.77088535, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79289663, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2671, + "time_per_iteration": 2.4722845554351807 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.02684164, + "balance_loss_mlp": 1.05666459, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9636142117953166, + "language_loss": 0.64497644, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66702545, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2672, + "time_per_iteration": 2.5702145099639893 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.05270457, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.885579538712505, + "language_loss": 0.8533771, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87537158, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2673, + "time_per_iteration": 2.4754209518432617 + }, + { + "auxiliary_loss_clip": 0.01156938, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.03718424, + "balance_loss_mlp": 1.05537605, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.484212796080384, + "language_loss": 0.72797197, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75009739, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2674, + "time_per_iteration": 2.4771230220794678 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02745771, + "balance_loss_mlp": 1.05242229, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.0917218572710143, + "language_loss": 0.84550452, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86751789, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2675, + "time_per_iteration": 2.4583237171173096 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02890563, + "balance_loss_mlp": 1.0566349, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.979365293626276, + "language_loss": 0.82605797, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84813964, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0234375, + "step": 2676, + "time_per_iteration": 2.5966403484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.05701363, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.9372140801278581, + "language_loss": 0.73252106, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75459909, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2677, + "time_per_iteration": 2.459545135498047 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.05381799, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.4714871699848, + "language_loss": 0.76175338, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78375852, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2678, + "time_per_iteration": 2.6220550537109375 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.05157948, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.6939354956764687, + "language_loss": 0.70202518, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72405231, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2679, + "time_per_iteration": 2.580995559692383 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.02026391, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8161414687228778, + "language_loss": 0.51844025, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5392195, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.34375, + "step": 2680, + "time_per_iteration": 3.105682849884033 + }, + { + "auxiliary_loss_clip": 0.01155604, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02750874, + "balance_loss_mlp": 1.05157876, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.8335073832427007, + "language_loss": 0.80319828, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82523119, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2681, + "time_per_iteration": 2.4695565700531006 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01058971, + "balance_loss_clip": 1.04031444, + "balance_loss_mlp": 1.05258918, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8021457293712753, + "language_loss": 0.69142133, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71352148, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.984375, + "step": 2682, + "time_per_iteration": 2.5027854442596436 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.0559957, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8107912193408944, + "language_loss": 0.87568235, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89774084, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2683, + "time_per_iteration": 2.461944341659546 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.05452991, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.5824209574719035, + "language_loss": 0.74160969, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76372838, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2684, + "time_per_iteration": 4.005981206893921 + }, + { + "auxiliary_loss_clip": 0.01159701, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 1.05543995, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.919238603617177, + "language_loss": 0.70244128, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72452366, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2685, + "time_per_iteration": 5.387023448944092 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.0282284, + "balance_loss_mlp": 1.0518229, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8016019482814314, + "language_loss": 0.71518582, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73716336, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 2686, + "time_per_iteration": 2.5451064109802246 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.03191292, + "balance_loss_mlp": 1.05551481, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8631629169214377, + "language_loss": 0.81521869, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83730221, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2687, + "time_per_iteration": 2.4542620182037354 + }, + { + "auxiliary_loss_clip": 0.01155843, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.02327275, + "balance_loss_mlp": 1.04894984, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.8081463969498348, + "language_loss": 0.71823454, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74023592, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.0625, + "step": 2688, + "time_per_iteration": 2.493476152420044 + }, + { + "auxiliary_loss_clip": 0.0115191, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.05067098, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2392978206929555, + "language_loss": 0.76041406, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78239101, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.015625, + "step": 2689, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.05417943, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.9258973882551216, + "language_loss": 0.87260234, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89462292, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2690, + "time_per_iteration": 2.496943473815918 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05211663, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.562024048541713, + "language_loss": 0.87728393, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.89927632, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 2691, + "time_per_iteration": 2.510960817337036 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.02764988, + "balance_loss_mlp": 1.05021381, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.082856606872889, + "language_loss": 0.82327259, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84533525, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2692, + "time_per_iteration": 2.481032371520996 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02835155, + "balance_loss_mlp": 1.05069244, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.0763505181853454, + "language_loss": 0.80942917, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83149081, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2693, + "time_per_iteration": 2.493278980255127 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01054953, + "balance_loss_clip": 1.03670192, + "balance_loss_mlp": 1.05223358, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.7139740211881158, + "language_loss": 0.83639967, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85845578, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2694, + "time_per_iteration": 2.5051510334014893 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.0509156, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9608549080280004, + "language_loss": 0.69125426, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71329916, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0546875, + "step": 2695, + "time_per_iteration": 2.495098352432251 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01054922, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.05520689, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.466913217352614, + "language_loss": 0.82403111, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84617984, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2696, + "time_per_iteration": 2.484523296356201 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.05316591, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.9982919021229957, + "language_loss": 0.8852337, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90741605, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2697, + "time_per_iteration": 2.4806151390075684 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.01046149, + "balance_loss_clip": 1.02756453, + "balance_loss_mlp": 1.04989469, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4702975792509376, + "language_loss": 0.80172735, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82366014, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 2698, + "time_per_iteration": 2.532137393951416 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.02957439, + "balance_loss_mlp": 1.05167758, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5576448961090323, + "language_loss": 0.77258182, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79456997, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 2699, + "time_per_iteration": 2.514084577560425 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.0251497, + "balance_loss_mlp": 1.04891944, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.038036982956784, + "language_loss": 0.85697722, + "learning_rate": 3.81909481076994e-06, + "loss": 0.87891692, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2700, + "time_per_iteration": 2.4434289932250977 + }, + { + "auxiliary_loss_clip": 0.01147712, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.0247376, + "balance_loss_mlp": 1.04878318, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.6982179557795123, + "language_loss": 0.80378878, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82572436, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.98828125, + "step": 2701, + "time_per_iteration": 2.5267322063446045 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.02945244, + "balance_loss_mlp": 1.05514598, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.5999982166608073, + "language_loss": 0.73006868, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75212055, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2702, + "time_per_iteration": 2.44750714302063 + }, + { + "auxiliary_loss_clip": 0.01153204, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02986753, + "balance_loss_mlp": 1.05053687, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.5386207662450464, + "language_loss": 0.73164749, + "learning_rate": 3.81860891934076e-06, + "loss": 0.7536869, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0234375, + "step": 2703, + "time_per_iteration": 2.469242811203003 + }, + { + "auxiliary_loss_clip": 0.01150736, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.04765964, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9216464968932823, + "language_loss": 0.70681584, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72879231, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2704, + "time_per_iteration": 2.5236263275146484 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00091982, + "balance_loss_mlp": 1.01563144, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7797469934396678, + "language_loss": 0.53369009, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55422795, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.34765625, + "step": 2705, + "time_per_iteration": 3.0887868404388428 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.02873373, + "balance_loss_mlp": 1.05151534, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4525976943058896, + "language_loss": 0.75060308, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77264655, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2706, + "time_per_iteration": 2.439283847808838 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 1.05240536, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9153778871117788, + "language_loss": 0.7234174, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74547994, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2707, + "time_per_iteration": 2.51819109916687 + }, + { + "auxiliary_loss_clip": 0.01155215, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.05275822, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.869992791268662, + "language_loss": 0.83790398, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85995972, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2708, + "time_per_iteration": 2.4592010974884033 + }, + { + "auxiliary_loss_clip": 0.0115992, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.02768469, + "balance_loss_mlp": 1.05268705, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.162290718142945, + "language_loss": 0.86529553, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88738573, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2709, + "time_per_iteration": 2.4745054244995117 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01054439, + "balance_loss_clip": 1.0353297, + "balance_loss_mlp": 1.05096519, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.6782807127870958, + "language_loss": 0.91449893, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93659306, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2710, + "time_per_iteration": 2.4846651554107666 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.05447197, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 1.99410407833921, + "language_loss": 0.8129673, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83507168, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2711, + "time_per_iteration": 2.4878618717193604 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04737568, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7794575527068077, + "language_loss": 0.81605875, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83806038, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2712, + "time_per_iteration": 2.4479072093963623 + }, + { + "auxiliary_loss_clip": 0.01158025, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.03858864, + "balance_loss_mlp": 1.05211174, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.1959953506899774, + "language_loss": 0.76885653, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79102206, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2713, + "time_per_iteration": 2.493394374847412 + }, + { + "auxiliary_loss_clip": 0.01155185, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.05623782, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.081844956712308, + "language_loss": 0.78926778, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8114453, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 2714, + "time_per_iteration": 2.442214012145996 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01059199, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.05286288, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.259619309439112, + "language_loss": 0.78143466, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80357969, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2715, + "time_per_iteration": 2.499178409576416 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.04868412, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.0916631483814783, + "language_loss": 0.81397748, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8359617, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2716, + "time_per_iteration": 2.5004689693450928 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01057354, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.05482328, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8793848003912939, + "language_loss": 0.86203027, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88418794, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2717, + "time_per_iteration": 2.5112617015838623 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.05153894, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.652261986612604, + "language_loss": 0.76514149, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78711915, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2718, + "time_per_iteration": 2.549654245376587 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02696729, + "balance_loss_mlp": 1.05180717, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.080955072975882, + "language_loss": 0.73027492, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75229508, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2719, + "time_per_iteration": 2.4911599159240723 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.03084862, + "balance_loss_mlp": 1.0492239, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6610037254914274, + "language_loss": 0.72384167, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74585563, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2720, + "time_per_iteration": 2.4733760356903076 + }, + { + "auxiliary_loss_clip": 0.01150132, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.02789283, + "balance_loss_mlp": 1.05076206, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2797021453727893, + "language_loss": 0.75100243, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77298641, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9921875, + "step": 2721, + "time_per_iteration": 2.44942569732666 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02893853, + "balance_loss_mlp": 1.0502317, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.74959220753002, + "language_loss": 0.79254043, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81458461, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2722, + "time_per_iteration": 2.4775915145874023 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.05248678, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.0539311275727634, + "language_loss": 0.8477816, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86986339, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0625, + "step": 2723, + "time_per_iteration": 2.5084922313690186 + }, + { + "auxiliary_loss_clip": 0.01146914, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.0177772, + "balance_loss_mlp": 1.04940808, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0049787201865503, + "language_loss": 0.70883536, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73067659, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 2724, + "time_per_iteration": 2.5094263553619385 + }, + { + "auxiliary_loss_clip": 0.01150034, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.05113125, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.04326868324577, + "language_loss": 0.70914948, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73109186, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 2725, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02437937, + "balance_loss_mlp": 1.05219352, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9381311422505, + "language_loss": 0.8873682, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90929163, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2726, + "time_per_iteration": 3.983738660812378 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.05406547, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.8502717081228044, + "language_loss": 0.7439661, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76602715, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2727, + "time_per_iteration": 5.52494215965271 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.03429413, + "balance_loss_mlp": 1.05145037, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6814144838265654, + "language_loss": 0.82321334, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84523886, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9921875, + "step": 2728, + "time_per_iteration": 2.4621498584747314 + }, + { + "auxiliary_loss_clip": 0.01156146, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03131044, + "balance_loss_mlp": 1.05167341, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.4458707176630425, + "language_loss": 0.84766865, + "learning_rate": 3.814371879489633e-06, + "loss": 0.86973941, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0390625, + "step": 2729, + "time_per_iteration": 2.459495782852173 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.02661061, + "balance_loss_mlp": 1.04923487, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9327126112676087, + "language_loss": 0.72569054, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74764991, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2730, + "time_per_iteration": 2.451016902923584 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.03243709, + "balance_loss_mlp": 1.04862678, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.2141787283307854, + "language_loss": 0.74431163, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76637596, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.046875, + "step": 2731, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.0115844, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.02818894, + "balance_loss_mlp": 1.05408466, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.15833206643789, + "language_loss": 0.78783584, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.80990839, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2732, + "time_per_iteration": 2.44146728515625 + }, + { + "auxiliary_loss_clip": 0.01155842, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.05211556, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9937390498547816, + "language_loss": 0.68943298, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71150857, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0390625, + "step": 2733, + "time_per_iteration": 2.4981601238250732 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.02792621, + "balance_loss_mlp": 1.05054927, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.20018793155086, + "language_loss": 0.80626202, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8282572, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0078125, + "step": 2734, + "time_per_iteration": 2.495030641555786 + }, + { + "auxiliary_loss_clip": 0.01152713, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.03638041, + "balance_loss_mlp": 1.05143905, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 4.0691467716051175, + "language_loss": 0.82265377, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84474081, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2735, + "time_per_iteration": 2.5911896228790283 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.02556753, + "balance_loss_mlp": 1.05158913, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.5735103485950077, + "language_loss": 0.78697491, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80891526, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.984375, + "step": 2736, + "time_per_iteration": 2.4699559211730957 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.05231023, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.680513335410081, + "language_loss": 0.81409019, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83616614, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2737, + "time_per_iteration": 2.4892401695251465 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.02819467, + "balance_loss_mlp": 1.05107307, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8393773079816103, + "language_loss": 0.87291563, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89492232, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2738, + "time_per_iteration": 2.54569935798645 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.05139303, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.0122721864238438, + "language_loss": 0.72351867, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74562055, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2739, + "time_per_iteration": 2.5309460163116455 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02991343, + "balance_loss_mlp": 1.04766631, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.690107638621115, + "language_loss": 0.81735384, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8393271, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2740, + "time_per_iteration": 2.5005404949188232 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01053239, + "balance_loss_clip": 1.03176928, + "balance_loss_mlp": 1.05347896, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.8033984026588756, + "language_loss": 0.69098473, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0390625, + "step": 2741, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02588463, + "balance_loss_mlp": 1.04987025, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 2.1078448839323167, + "language_loss": 0.79967189, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2742, + "time_per_iteration": 2.4471442699432373 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.03373659, + "balance_loss_mlp": 1.05117011, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.1468697804747823, + "language_loss": 0.84769481, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86974156, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0, + "step": 2743, + "time_per_iteration": 2.459146022796631 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.05074859, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5853616537097488, + "language_loss": 0.85723281, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87925285, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 2744, + "time_per_iteration": 2.4920642375946045 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.0283947, + "balance_loss_mlp": 1.05124998, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7300129139105382, + "language_loss": 0.82973897, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85167319, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 2745, + "time_per_iteration": 2.490399122238159 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.05477679, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.19754759855213, + "language_loss": 0.76411253, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78622997, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2746, + "time_per_iteration": 2.46258282661438 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01052583, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.05164099, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5978428663850568, + "language_loss": 0.80686736, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2747, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01158238, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.02848577, + "balance_loss_mlp": 1.05559731, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.853069559467639, + "language_loss": 0.69463658, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71670008, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0234375, + "step": 2748, + "time_per_iteration": 2.4235999584198 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.03314471, + "balance_loss_mlp": 1.05482006, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.265414403061137, + "language_loss": 0.87653661, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89860809, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0078125, + "step": 2749, + "time_per_iteration": 2.4706709384918213 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.0509429, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.3451981357461444, + "language_loss": 0.79248077, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81450188, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2750, + "time_per_iteration": 2.4588990211486816 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.05188382, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7653411133265118, + "language_loss": 0.95010567, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9720822, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.99609375, + "step": 2751, + "time_per_iteration": 2.4776439666748047 + }, + { + "auxiliary_loss_clip": 0.01152135, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.02762985, + "balance_loss_mlp": 1.05480134, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9833662518999209, + "language_loss": 0.71080822, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73278749, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 2752, + "time_per_iteration": 2.4609227180480957 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 1.01785779, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7698122762266473, + "language_loss": 0.54079807, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56152999, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.33984375, + "step": 2753, + "time_per_iteration": 3.161339282989502 + }, + { + "auxiliary_loss_clip": 0.01152964, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.05254793, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.9686645345026932, + "language_loss": 0.75467873, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77662838, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2754, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01060834, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.05358946, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 3.81944507319113, + "language_loss": 0.87154973, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89376527, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0703125, + "step": 2755, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01148695, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.04862666, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.859731734913831, + "language_loss": 0.73258269, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7545948, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2756, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.02999544, + "balance_loss_mlp": 1.05331099, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6628427585054586, + "language_loss": 0.74967468, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77166092, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9609375, + "step": 2757, + "time_per_iteration": 2.5122530460357666 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01044644, + "balance_loss_clip": 1.02590466, + "balance_loss_mlp": 1.05359447, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 2.101183789218018, + "language_loss": 0.84532511, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86731303, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2758, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.01153935, + "auxiliary_loss_mlp": 0.01051485, + "balance_loss_clip": 1.03382993, + "balance_loss_mlp": 1.05355358, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 3.016772390052645, + "language_loss": 0.79003322, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81208748, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 1.0, + "step": 2759, + "time_per_iteration": 2.468798875808716 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.02953088, + "balance_loss_mlp": 1.05121255, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 4.81235802271706, + "language_loss": 0.75059134, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77259254, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2760, + "time_per_iteration": 2.459453582763672 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02139056, + "balance_loss_mlp": 1.05363011, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.843496656605, + "language_loss": 0.73409051, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75607204, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2761, + "time_per_iteration": 2.473264455795288 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.05460942, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.076392836835936, + "language_loss": 0.89255953, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91456699, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2762, + "time_per_iteration": 2.4917852878570557 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.0517025, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6634533311047424, + "language_loss": 0.87782222, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.89988291, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2763, + "time_per_iteration": 2.48002028465271 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01013596, + "balance_loss_clip": 1.01105642, + "balance_loss_mlp": 1.01786494, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7771287992078079, + "language_loss": 0.59777391, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61842799, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2764, + "time_per_iteration": 3.0722031593322754 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.03183234, + "balance_loss_mlp": 1.05292118, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8564974944455146, + "language_loss": 0.82349414, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8455686, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.015625, + "step": 2765, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.02758563, + "balance_loss_mlp": 1.05308914, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.1954568630881566, + "language_loss": 0.70029616, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72239733, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.078125, + "step": 2766, + "time_per_iteration": 2.417538642883301 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.05449462, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3642497854018174, + "language_loss": 0.88693011, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90891409, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2767, + "time_per_iteration": 2.447087287902832 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.0051651, + "balance_loss_mlp": 1.01474071, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.659533193053428, + "language_loss": 0.52894622, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54950953, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.33984375, + "step": 2768, + "time_per_iteration": 4.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01156575, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.03196931, + "balance_loss_mlp": 1.05233693, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4421243199538543, + "language_loss": 0.84964579, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87173045, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2769, + "time_per_iteration": 3.9888546466827393 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.00486565, + "balance_loss_mlp": 1.01284146, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.809970645404753, + "language_loss": 0.57417655, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59471762, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2770, + "time_per_iteration": 2.909212350845337 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01004174, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.0120976, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8642108743281017, + "language_loss": 0.5621168, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58261615, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.3359375, + "step": 2771, + "time_per_iteration": 2.9000375270843506 + }, + { + "auxiliary_loss_clip": 0.01152287, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.03741515, + "balance_loss_mlp": 1.05137527, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.4271023422086593, + "language_loss": 0.70461071, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72671425, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0078125, + "step": 2772, + "time_per_iteration": 2.45868182182312 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01052488, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.04914951, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.8764675289735346, + "language_loss": 0.86201918, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8840462, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2773, + "time_per_iteration": 2.513784885406494 + }, + { + "auxiliary_loss_clip": 0.01152492, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.02808821, + "balance_loss_mlp": 1.05230188, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.2216439453760595, + "language_loss": 0.81859678, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84058398, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2774, + "time_per_iteration": 2.4288830757141113 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.02955508, + "balance_loss_mlp": 1.05290627, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1125697386324576, + "language_loss": 0.83287829, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85492939, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0234375, + "step": 2775, + "time_per_iteration": 2.4773504734039307 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.03599668, + "balance_loss_mlp": 1.0527029, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9011936520028738, + "language_loss": 0.80721045, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82925946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 2776, + "time_per_iteration": 2.4736995697021484 + }, + { + "auxiliary_loss_clip": 0.01147621, + "auxiliary_loss_mlp": 0.01053383, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.05260348, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.7922512358148395, + "language_loss": 0.798361, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82037103, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.953125, + "step": 2777, + "time_per_iteration": 2.4625258445739746 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.0313735, + "balance_loss_mlp": 1.05002642, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.8218923631286437, + "language_loss": 0.85132945, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87332618, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 2778, + "time_per_iteration": 2.4819412231445312 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.05222583, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.6489491047564826, + "language_loss": 0.74133682, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76333386, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2779, + "time_per_iteration": 2.510207176208496 + }, + { + "auxiliary_loss_clip": 0.0115174, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.02625358, + "balance_loss_mlp": 1.05116367, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.2761441742273663, + "language_loss": 0.65382051, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67579395, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2780, + "time_per_iteration": 2.5250439643859863 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.02856088, + "balance_loss_mlp": 1.05120933, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0602280440022382, + "language_loss": 0.78563058, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80761701, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9921875, + "step": 2781, + "time_per_iteration": 2.4921979904174805 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.05227423, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.480266857331911, + "language_loss": 0.75262564, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77465487, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2782, + "time_per_iteration": 2.468590021133423 + }, + { + "auxiliary_loss_clip": 0.01159372, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.05443954, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.999958464394936, + "language_loss": 0.67841566, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70053571, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2783, + "time_per_iteration": 2.5312225818634033 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.0538497, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.209785525271013, + "language_loss": 0.70028126, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72232759, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2784, + "time_per_iteration": 2.4932820796966553 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.05120277, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9638597335511054, + "language_loss": 0.60441053, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62647516, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2785, + "time_per_iteration": 2.527010440826416 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.02681625, + "balance_loss_mlp": 1.01595187, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8343482124814343, + "language_loss": 0.588, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60878569, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.33007812, + "step": 2786, + "time_per_iteration": 3.1062281131744385 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.05108333, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.9494651562196093, + "language_loss": 0.75846571, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78044844, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2787, + "time_per_iteration": 2.51383900642395 + }, + { + "auxiliary_loss_clip": 0.0115361, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.02287841, + "balance_loss_mlp": 1.05218899, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.088538847955111, + "language_loss": 0.77615869, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79811174, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2788, + "time_per_iteration": 2.4926373958587646 + }, + { + "auxiliary_loss_clip": 0.01048965, + "auxiliary_loss_mlp": 0.01004104, + "balance_loss_clip": 1.00154078, + "balance_loss_mlp": 1.01582766, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.861309286667726, + "language_loss": 0.59360403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61413473, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.33203125, + "step": 2789, + "time_per_iteration": 2.9390883445739746 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.03262937, + "balance_loss_mlp": 1.05115533, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.8582032581880512, + "language_loss": 0.70117038, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72323185, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2790, + "time_per_iteration": 2.6337287425994873 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.03852975, + "balance_loss_mlp": 1.05254579, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9267324208283758, + "language_loss": 0.7914235, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81353921, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0234375, + "step": 2791, + "time_per_iteration": 2.4992258548736572 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.02807093, + "balance_loss_mlp": 1.05311096, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.670563786806713, + "language_loss": 0.71465087, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73666936, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2792, + "time_per_iteration": 2.5886104106903076 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.05179656, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.423044729867527, + "language_loss": 0.72166264, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74366981, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2793, + "time_per_iteration": 2.5197043418884277 + }, + { + "auxiliary_loss_clip": 0.01153184, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.05135465, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.9474647186442988, + "language_loss": 0.77305138, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79512912, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2794, + "time_per_iteration": 2.467292547225952 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.05253601, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2742759048834578, + "language_loss": 0.71613103, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7382195, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2795, + "time_per_iteration": 2.5272278785705566 + }, + { + "auxiliary_loss_clip": 0.01149377, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.02592218, + "balance_loss_mlp": 1.04932868, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.942494339721957, + "language_loss": 0.83784455, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8597846, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2796, + "time_per_iteration": 2.448528289794922 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01059215, + "balance_loss_clip": 1.03911614, + "balance_loss_mlp": 1.04904127, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6778887705488965, + "language_loss": 0.8109591, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83307993, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2797, + "time_per_iteration": 2.5044567584991455 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02766752, + "balance_loss_mlp": 1.05142093, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4189820060365406, + "language_loss": 0.74740726, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76932257, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.95703125, + "step": 2798, + "time_per_iteration": 2.4913666248321533 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03250098, + "balance_loss_mlp": 1.05462337, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.8962576537558784, + "language_loss": 0.79592311, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81796914, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 1.0, + "step": 2799, + "time_per_iteration": 2.4844021797180176 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.02597189, + "balance_loss_mlp": 1.04983997, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.7819182919151455, + "language_loss": 0.70778632, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72978926, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2800, + "time_per_iteration": 2.548715829849243 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02365637, + "balance_loss_mlp": 1.04882574, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9135359518782422, + "language_loss": 0.83549178, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85741478, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2801, + "time_per_iteration": 2.456601858139038 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.03355145, + "balance_loss_mlp": 1.04947591, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.757874152621573, + "language_loss": 0.822721, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84474415, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2802, + "time_per_iteration": 2.4426534175872803 + }, + { + "auxiliary_loss_clip": 0.01153107, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02764344, + "balance_loss_mlp": 1.05123353, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4855905624355255, + "language_loss": 0.81064272, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83265072, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2803, + "time_per_iteration": 2.5615930557250977 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02693641, + "balance_loss_mlp": 1.05246449, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2799183114600545, + "language_loss": 0.7645762, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78653532, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 2804, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01045818, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.02452028, + "balance_loss_mlp": 1.01328063, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8620881286764229, + "language_loss": 0.55414748, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57487267, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 2805, + "time_per_iteration": 3.033358573913574 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.02161169, + "balance_loss_mlp": 1.04741919, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9122963285347783, + "language_loss": 0.73038024, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75221276, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 2806, + "time_per_iteration": 2.4699463844299316 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.02712786, + "balance_loss_mlp": 1.05072176, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.9407491705316076, + "language_loss": 0.69966477, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7216025, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2807, + "time_per_iteration": 2.4583139419555664 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.03196526, + "balance_loss_mlp": 1.05013919, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.8028706291815912, + "language_loss": 0.70265883, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72467327, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9921875, + "step": 2808, + "time_per_iteration": 2.4724719524383545 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02792549, + "balance_loss_mlp": 1.05130935, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.1293629398657954, + "language_loss": 0.80103064, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8230511, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2809, + "time_per_iteration": 3.844451427459717 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.050385, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0909159229075245, + "language_loss": 0.88465077, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.9067235, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2810, + "time_per_iteration": 5.43256688117981 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.05188894, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.324870160833927, + "language_loss": 0.92483926, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94690794, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2811, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.03154814, + "balance_loss_mlp": 1.05537057, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 20.150047321728213, + "language_loss": 0.78719699, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80926931, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2812, + "time_per_iteration": 2.475893974304199 + }, + { + "auxiliary_loss_clip": 0.01154531, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.0353322, + "balance_loss_mlp": 1.05427527, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.3708558754635103, + "language_loss": 0.7492249, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7713027, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.0078125, + "step": 2813, + "time_per_iteration": 2.4622457027435303 + }, + { + "auxiliary_loss_clip": 0.01155154, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02862835, + "balance_loss_mlp": 1.05231524, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.6643465032783955, + "language_loss": 0.69000697, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71203601, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2814, + "time_per_iteration": 2.442352771759033 + }, + { + "auxiliary_loss_clip": 0.01152587, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.03032494, + "balance_loss_mlp": 1.05269694, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.3683342322522543, + "language_loss": 0.61842358, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64043844, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2815, + "time_per_iteration": 2.4859516620635986 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03179121, + "balance_loss_mlp": 1.05104065, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9947957584318596, + "language_loss": 0.81983805, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84183884, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 2816, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.03072321, + "balance_loss_mlp": 1.05379295, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.502019531770294, + "language_loss": 0.8722589, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89431584, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2817, + "time_per_iteration": 2.4906835556030273 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.03200889, + "balance_loss_mlp": 1.05302715, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7787508021643152, + "language_loss": 0.81666476, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2818, + "time_per_iteration": 2.4673428535461426 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.05154157, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 5.791836374282792, + "language_loss": 0.80712807, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8291707, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0, + "step": 2819, + "time_per_iteration": 2.43947434425354 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.00504076, + "balance_loss_mlp": 1.01552486, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9491282523447765, + "language_loss": 0.61080176, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63136268, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 2820, + "time_per_iteration": 3.008953809738159 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.031335, + "balance_loss_mlp": 1.05163527, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 2.1013484538112097, + "language_loss": 0.78625357, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.808281, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2821, + "time_per_iteration": 2.5363481044769287 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.03667343, + "balance_loss_mlp": 1.05229986, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.87583667245789, + "language_loss": 0.78450388, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80659759, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0078125, + "step": 2822, + "time_per_iteration": 2.4969065189361572 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03299582, + "balance_loss_mlp": 1.04956698, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9220487825624015, + "language_loss": 0.75016022, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77214515, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2823, + "time_per_iteration": 2.491588830947876 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.05209637, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9648811068121905, + "language_loss": 0.60514438, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62718117, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.99609375, + "step": 2824, + "time_per_iteration": 2.6178910732269287 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02030015, + "balance_loss_mlp": 1.05367076, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6856049786717988, + "language_loss": 0.73004806, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75196874, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98046875, + "step": 2825, + "time_per_iteration": 2.559774398803711 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.03321934, + "balance_loss_mlp": 1.0505774, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.7849035157466668, + "language_loss": 0.85660541, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87870789, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2826, + "time_per_iteration": 2.4860360622406006 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.0515151, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3205594057943175, + "language_loss": 0.8232255, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84528267, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2827, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.02965498, + "balance_loss_mlp": 1.05059743, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 2.393760877815214, + "language_loss": 0.73652613, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75855708, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2828, + "time_per_iteration": 2.5726237297058105 + }, + { + "auxiliary_loss_clip": 0.01046718, + "auxiliary_loss_mlp": 0.01008554, + "balance_loss_clip": 1.00625372, + "balance_loss_mlp": 1.01360035, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.76062911359866, + "language_loss": 0.56446254, + "learning_rate": 3.797643101661336e-06, + "loss": 0.5850153, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.33203125, + "step": 2829, + "time_per_iteration": 3.1035284996032715 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.02912867, + "balance_loss_mlp": 1.04916263, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7229604876305038, + "language_loss": 0.83673382, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85870743, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.99609375, + "step": 2830, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.02792013, + "balance_loss_mlp": 1.04919207, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.0065309441313337, + "language_loss": 0.77852297, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80051666, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.015625, + "step": 2831, + "time_per_iteration": 2.524578094482422 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.04948521, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1211873867699285, + "language_loss": 0.79345167, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81548154, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0234375, + "step": 2832, + "time_per_iteration": 2.459954261779785 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.02836847, + "balance_loss_mlp": 1.05050385, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.9382017652854369, + "language_loss": 0.89026237, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91225392, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2833, + "time_per_iteration": 2.4812114238739014 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02963924, + "balance_loss_mlp": 1.05124569, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.853060698790674, + "language_loss": 0.72425497, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74627328, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2834, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01156378, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.05294132, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9954265429463485, + "language_loss": 0.86434042, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88648909, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2835, + "time_per_iteration": 2.4804999828338623 + }, + { + "auxiliary_loss_clip": 0.01155592, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.03007674, + "balance_loss_mlp": 1.05081642, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1.9180646463430515, + "language_loss": 0.73242748, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.046875, + "step": 2836, + "time_per_iteration": 2.4694178104400635 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02599072, + "balance_loss_mlp": 1.05033076, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1253309510576717, + "language_loss": 0.79653537, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81858897, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0703125, + "step": 2837, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01150588, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.02550185, + "balance_loss_mlp": 1.05232143, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.19906443062711, + "language_loss": 0.83575213, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85771078, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 2838, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.05069315, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7423496230624245, + "language_loss": 0.93620354, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95814586, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2839, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.0277859, + "balance_loss_mlp": 1.05050242, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.8052720148780894, + "language_loss": 0.83847374, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86050916, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.0390625, + "step": 2840, + "time_per_iteration": 2.5449130535125732 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01047778, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.05213881, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.014300966058614, + "language_loss": 0.76390004, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78593302, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.03125, + "step": 2841, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03243482, + "balance_loss_mlp": 1.04932261, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8874127741110907, + "language_loss": 0.77000463, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79205, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2842, + "time_per_iteration": 2.5051841735839844 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 1.0497905, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.05566421297988, + "language_loss": 0.86086738, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88281423, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98828125, + "step": 2843, + "time_per_iteration": 2.4487509727478027 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02696228, + "balance_loss_mlp": 1.05090249, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.8875494657309706, + "language_loss": 0.6826812, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70464289, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 2844, + "time_per_iteration": 2.4429779052734375 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.05040824, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.8058232236820264, + "language_loss": 0.78258789, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80463862, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0, + "step": 2845, + "time_per_iteration": 2.4377951622009277 + }, + { + "auxiliary_loss_clip": 0.01151786, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.05064154, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.746386155528142, + "language_loss": 0.77959955, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 2846, + "time_per_iteration": 2.4196622371673584 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.02895534, + "balance_loss_mlp": 1.05158973, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7441395807388675, + "language_loss": 0.7942031, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81620383, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2847, + "time_per_iteration": 2.504087448120117 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.02913523, + "balance_loss_mlp": 1.04612017, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.239997254259111, + "language_loss": 0.86818451, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89015555, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2848, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.02438748, + "balance_loss_mlp": 1.05133212, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.715396677859901, + "language_loss": 0.75223613, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77421153, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2849, + "time_per_iteration": 2.4918415546417236 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.00107098, + "balance_loss_mlp": 1.01492834, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7949737728021388, + "language_loss": 0.57471085, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59522074, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.328125, + "step": 2850, + "time_per_iteration": 3.057778835296631 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.04852295, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4364727127987704, + "language_loss": 0.80988616, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83187693, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 2851, + "time_per_iteration": 3.887600898742676 + }, + { + "auxiliary_loss_clip": 0.01146778, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.04858351, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.035620688428962, + "language_loss": 0.93063158, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95253623, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2852, + "time_per_iteration": 3.920153856277466 + }, + { + "auxiliary_loss_clip": 0.01149404, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.04728949, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8406206656402175, + "language_loss": 0.69480836, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71683311, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2853, + "time_per_iteration": 2.4457037448883057 + }, + { + "auxiliary_loss_clip": 0.0115316, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03150904, + "balance_loss_mlp": 1.05059445, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.187977199847503, + "language_loss": 0.66505128, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68709248, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0234375, + "step": 2854, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01144359, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.04574227, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8257227486643586, + "language_loss": 0.89394444, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91582847, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2855, + "time_per_iteration": 2.4601552486419678 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.04792452, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.515892939250119, + "language_loss": 0.83822739, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86022681, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2856, + "time_per_iteration": 2.4747347831726074 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05112195, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9053156238546485, + "language_loss": 0.8645792, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88658297, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2857, + "time_per_iteration": 2.4460220336914062 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0105234, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.04805577, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.139076633770832, + "language_loss": 0.77919662, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80120051, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2858, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01058687, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.04760742, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.423579883765011, + "language_loss": 0.77235049, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0859375, + "step": 2859, + "time_per_iteration": 2.43471360206604 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.04920983, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 3.774880148287903, + "language_loss": 0.77179611, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79378301, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2860, + "time_per_iteration": 2.463344097137451 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.04703689, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1505291491255463, + "language_loss": 0.81964719, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84165227, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2861, + "time_per_iteration": 2.4505395889282227 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.03163123, + "balance_loss_mlp": 1.04897118, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 4.22955926449596, + "language_loss": 0.85649675, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87849623, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2862, + "time_per_iteration": 2.4392077922821045 + }, + { + "auxiliary_loss_clip": 0.01144423, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.02867651, + "balance_loss_mlp": 1.04785109, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.3146804122881037, + "language_loss": 0.77874523, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80065054, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 2863, + "time_per_iteration": 2.4745166301727295 + }, + { + "auxiliary_loss_clip": 0.01147347, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02935052, + "balance_loss_mlp": 1.04726493, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7012031973405044, + "language_loss": 0.72191179, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74386668, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2864, + "time_per_iteration": 2.496522903442383 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.04935968, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6688219876641972, + "language_loss": 0.72896975, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75101948, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2865, + "time_per_iteration": 2.468726396560669 + }, + { + "auxiliary_loss_clip": 0.01151587, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02608538, + "balance_loss_mlp": 1.05194211, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.1747822479918764, + "language_loss": 0.79011786, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81208247, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2866, + "time_per_iteration": 2.445716381072998 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.04966402, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.797659045411876, + "language_loss": 0.79865277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2867, + "time_per_iteration": 2.4745590686798096 + }, + { + "auxiliary_loss_clip": 0.0114836, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.04821014, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.717941409951427, + "language_loss": 0.79707634, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81893444, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2868, + "time_per_iteration": 2.4545693397521973 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.0538218, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9332967921770021, + "language_loss": 0.84265673, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86467719, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2869, + "time_per_iteration": 2.445429563522339 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.03226328, + "balance_loss_mlp": 1.04971075, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.3539211413688954, + "language_loss": 0.77522051, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79725653, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2870, + "time_per_iteration": 2.4975087642669678 + }, + { + "auxiliary_loss_clip": 0.01146931, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.02609706, + "balance_loss_mlp": 1.05132568, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.897031493968697, + "language_loss": 0.7680704, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.78997254, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.95703125, + "step": 2871, + "time_per_iteration": 2.4777348041534424 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02442563, + "balance_loss_mlp": 1.05061746, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.240934958328371, + "language_loss": 0.74448204, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76642466, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2872, + "time_per_iteration": 2.5021097660064697 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.05127549, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8155923086100165, + "language_loss": 0.82694656, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84881938, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 2873, + "time_per_iteration": 2.4852540493011475 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0229888, + "balance_loss_mlp": 1.049196, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0464410919173814, + "language_loss": 0.75083232, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77274048, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.984375, + "step": 2874, + "time_per_iteration": 2.440610885620117 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0515728, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9230852666364326, + "language_loss": 0.8067199, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8286736, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2875, + "time_per_iteration": 2.478473424911499 + }, + { + "auxiliary_loss_clip": 0.01153488, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02984488, + "balance_loss_mlp": 1.05083489, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.5699127680633542, + "language_loss": 0.87525117, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89728516, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2876, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.02384901, + "balance_loss_mlp": 1.05273616, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.9567138745888089, + "language_loss": 0.84561193, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86754125, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 2877, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.02179909, + "balance_loss_mlp": 1.05281305, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 3.0724129461132406, + "language_loss": 0.79527134, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81719756, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.984375, + "step": 2878, + "time_per_iteration": 2.4739902019500732 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.02450228, + "balance_loss_mlp": 1.04968572, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9694378769308076, + "language_loss": 0.70306808, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72496772, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2879, + "time_per_iteration": 2.5014665126800537 + }, + { + "auxiliary_loss_clip": 0.01151101, + "auxiliary_loss_mlp": 0.01050497, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.05038834, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4431111997211734, + "language_loss": 0.83465785, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85667384, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2880, + "time_per_iteration": 2.433776378631592 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.0250026, + "balance_loss_mlp": 1.05171311, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.135155165507549, + "language_loss": 0.80866969, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8306427, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0078125, + "step": 2881, + "time_per_iteration": 2.4944772720336914 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.05030859, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.5502275528368066, + "language_loss": 0.77372867, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79565454, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 2882, + "time_per_iteration": 2.5426836013793945 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.03276825, + "balance_loss_mlp": 1.05005169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8718611847068298, + "language_loss": 0.76652586, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78852415, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2883, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.04944682, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.945193845574475, + "language_loss": 0.85463524, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87654424, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 2884, + "time_per_iteration": 2.4708735942840576 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.02122355, + "balance_loss_mlp": 1.05114794, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6148586475999513, + "language_loss": 0.73758793, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75947917, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2885, + "time_per_iteration": 2.5266878604888916 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.05269074, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.9690054244815705, + "language_loss": 0.70377076, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72569054, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 2886, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01146959, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.04799545, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9665325510573808, + "language_loss": 0.69294798, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7148186, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98828125, + "step": 2887, + "time_per_iteration": 2.4787776470184326 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.03180945, + "balance_loss_mlp": 1.05075955, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.791000255721863, + "language_loss": 0.85391176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87590909, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 2888, + "time_per_iteration": 2.4234085083007812 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.02668667, + "balance_loss_mlp": 1.05046952, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.660213605651755, + "language_loss": 0.78465497, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80662042, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.9921875, + "step": 2889, + "time_per_iteration": 2.5042123794555664 + }, + { + "auxiliary_loss_clip": 0.01146581, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.05222893, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.9081348702485723, + "language_loss": 0.83860242, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86054766, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9453125, + "step": 2890, + "time_per_iteration": 2.4698500633239746 + }, + { + "auxiliary_loss_clip": 0.01150813, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.02880502, + "balance_loss_mlp": 1.05083108, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9935479009749588, + "language_loss": 0.82253492, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84451687, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2891, + "time_per_iteration": 2.4478886127471924 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04824781, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.3073165362682873, + "language_loss": 0.81479478, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8367548, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2892, + "time_per_iteration": 2.4094645977020264 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.03519785, + "balance_loss_mlp": 1.05379355, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.24459564009462, + "language_loss": 0.74480057, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76690638, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2893, + "time_per_iteration": 3.8296191692352295 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.05193436, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.117368029368179, + "language_loss": 0.83073241, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85268712, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2894, + "time_per_iteration": 3.9817075729370117 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.05032384, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.157907065313142, + "language_loss": 0.74051547, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76249242, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0, + "step": 2895, + "time_per_iteration": 2.461857318878174 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00467134, + "balance_loss_mlp": 1.01600659, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8715266336267762, + "language_loss": 0.6273998, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64795506, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.32421875, + "step": 2896, + "time_per_iteration": 3.1462173461914062 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.02160895, + "balance_loss_mlp": 1.04787612, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 2.3238967096174923, + "language_loss": 0.75600475, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77790749, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2897, + "time_per_iteration": 2.4974682331085205 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.02354646, + "balance_loss_mlp": 1.05000067, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9004029304223122, + "language_loss": 0.69384712, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71575105, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2898, + "time_per_iteration": 2.5650558471679688 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01049615, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.05215359, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.315885710988465, + "language_loss": 0.76069367, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78272319, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2899, + "time_per_iteration": 2.5006191730499268 + }, + { + "auxiliary_loss_clip": 0.01145178, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.02571905, + "balance_loss_mlp": 1.04929495, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9440585306650153, + "language_loss": 0.72821134, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75011557, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9609375, + "step": 2900, + "time_per_iteration": 2.5199801921844482 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.03261876, + "balance_loss_mlp": 1.04989529, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.6677330343015109, + "language_loss": 0.70085949, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72287238, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2901, + "time_per_iteration": 2.624864101409912 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.05087507, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7643324639769489, + "language_loss": 0.76549768, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78750718, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 2902, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.04885221, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.129298660499851, + "language_loss": 0.81787169, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.8399415, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2903, + "time_per_iteration": 2.436877727508545 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02255297, + "balance_loss_mlp": 1.04978609, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.1703016783079327, + "language_loss": 0.73228866, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75418955, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2904, + "time_per_iteration": 2.462775707244873 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.02719879, + "balance_loss_mlp": 1.04777265, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9374721445221084, + "language_loss": 0.64526325, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6671921, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2905, + "time_per_iteration": 2.468395233154297 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.05202341, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.804147248272645, + "language_loss": 0.79236615, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81444013, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0546875, + "step": 2906, + "time_per_iteration": 2.4632725715637207 + }, + { + "auxiliary_loss_clip": 0.01150693, + "auxiliary_loss_mlp": 0.01055346, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.05044913, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7929508882228948, + "language_loss": 0.81010377, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83216417, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2907, + "time_per_iteration": 2.4214229583740234 + }, + { + "auxiliary_loss_clip": 0.01152007, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.05040026, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.7402312811515515, + "language_loss": 0.81315112, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83517587, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2908, + "time_per_iteration": 2.4340970516204834 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.03047633, + "balance_loss_mlp": 1.04978228, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.419675279893618, + "language_loss": 0.80399191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82600915, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0078125, + "step": 2909, + "time_per_iteration": 2.4170033931732178 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.03319383, + "balance_loss_mlp": 1.05133021, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6998329053727648, + "language_loss": 0.76530939, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78737426, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2910, + "time_per_iteration": 2.457628011703491 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02901077, + "balance_loss_mlp": 1.05060935, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6502133484544155, + "language_loss": 0.87255991, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89456993, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2911, + "time_per_iteration": 2.5302672386169434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.04746377, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.260601647926804, + "language_loss": 0.89586449, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91789353, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0078125, + "step": 2912, + "time_per_iteration": 2.447650194168091 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.0302161, + "balance_loss_mlp": 1.04871392, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.8836544870459813, + "language_loss": 0.7262938, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74830252, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2913, + "time_per_iteration": 2.423595666885376 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01047113, + "balance_loss_clip": 1.02738369, + "balance_loss_mlp": 1.0522244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.243393227782369, + "language_loss": 0.68799925, + "learning_rate": 3.782887439295741e-06, + "loss": 0.70997757, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 2914, + "time_per_iteration": 2.46085262298584 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 1.05143356, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8218690011087264, + "language_loss": 0.93755293, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95961595, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.98046875, + "step": 2915, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01143068, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.02744889, + "balance_loss_mlp": 1.04722261, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8144768789670476, + "language_loss": 0.80869162, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.95703125, + "step": 2916, + "time_per_iteration": 2.4740476608276367 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03663611, + "balance_loss_mlp": 1.04854608, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.67512565222408, + "language_loss": 0.73645711, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75852591, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2917, + "time_per_iteration": 2.4484915733337402 + }, + { + "auxiliary_loss_clip": 0.01144993, + "auxiliary_loss_mlp": 0.01055794, + "balance_loss_clip": 1.03517044, + "balance_loss_mlp": 1.04897738, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 12.675743752905372, + "language_loss": 0.77019119, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79219908, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.9609375, + "step": 2918, + "time_per_iteration": 2.4723429679870605 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.0260129, + "balance_loss_mlp": 1.05131745, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.415786226656528, + "language_loss": 0.74196291, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76396644, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2919, + "time_per_iteration": 2.5049829483032227 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03080809, + "balance_loss_mlp": 1.05090559, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7754050788280298, + "language_loss": 0.74211872, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76416576, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2920, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0219686, + "balance_loss_mlp": 1.04717219, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3164139995284834, + "language_loss": 0.7949307, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81677347, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.97265625, + "step": 2921, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01153986, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.05029321, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6170497741380607, + "language_loss": 0.87493849, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89693457, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2922, + "time_per_iteration": 2.5042173862457275 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03074801, + "balance_loss_mlp": 1.04808784, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.3811708545321735, + "language_loss": 0.62097687, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64297503, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2923, + "time_per_iteration": 2.5067484378814697 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.05287814, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1344206016331797, + "language_loss": 0.80602306, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2924, + "time_per_iteration": 2.453174114227295 + }, + { + "auxiliary_loss_clip": 0.0115147, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.04809761, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 3.672968077353321, + "language_loss": 0.70954067, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73159206, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.03125, + "step": 2925, + "time_per_iteration": 2.4666385650634766 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.02538979, + "balance_loss_mlp": 1.05147243, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6622274839000213, + "language_loss": 0.71700275, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73893416, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.97265625, + "step": 2926, + "time_per_iteration": 2.50289249420166 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04857433, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.8916391197618272, + "language_loss": 0.84433806, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86627805, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0234375, + "step": 2927, + "time_per_iteration": 2.447207450866699 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.0506475, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8156588356210406, + "language_loss": 0.71879232, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74072987, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 2928, + "time_per_iteration": 2.585942029953003 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.028934, + "balance_loss_mlp": 1.05230594, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0402577824357886, + "language_loss": 0.83222824, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85421479, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9765625, + "step": 2929, + "time_per_iteration": 2.461101770401001 + }, + { + "auxiliary_loss_clip": 0.01149627, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.02298999, + "balance_loss_mlp": 1.0493536, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.817429721867852, + "language_loss": 0.7933988, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81531239, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2930, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.02671921, + "balance_loss_mlp": 1.05319881, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.194829469856105, + "language_loss": 0.76142448, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78343737, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0078125, + "step": 2931, + "time_per_iteration": 2.4907379150390625 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.05108666, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.8261445455709153, + "language_loss": 0.74740392, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7693212, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 2932, + "time_per_iteration": 2.4252588748931885 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.05086923, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.4930669650063355, + "language_loss": 0.8968839, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.9188894, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0625, + "step": 2933, + "time_per_iteration": 2.4334278106689453 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02980459, + "balance_loss_mlp": 1.05053639, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6616334836184845, + "language_loss": 0.88273364, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90468836, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9765625, + "step": 2934, + "time_per_iteration": 3.891472578048706 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02827823, + "balance_loss_mlp": 1.04972959, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.7575209177187046, + "language_loss": 0.70843625, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2935, + "time_per_iteration": 5.650984287261963 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.05251908, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2448658169111795, + "language_loss": 0.69255942, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71456659, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0546875, + "step": 2936, + "time_per_iteration": 2.4864091873168945 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.02758646, + "balance_loss_mlp": 1.05530488, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.883537128373794, + "language_loss": 0.71391022, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73591107, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.99609375, + "step": 2937, + "time_per_iteration": 2.5096240043640137 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.02461779, + "balance_loss_mlp": 1.05530524, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.165923066719211, + "language_loss": 0.7584855, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78052241, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2938, + "time_per_iteration": 2.475069284439087 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 1.05156195, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.20477923303766, + "language_loss": 0.71130306, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73326623, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2939, + "time_per_iteration": 2.4806766510009766 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.0538342, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.125031265469358, + "language_loss": 0.73781312, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7597841, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 2940, + "time_per_iteration": 2.5438694953918457 + }, + { + "auxiliary_loss_clip": 0.01154904, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.05372643, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.4976558026918703, + "language_loss": 0.85003591, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87204242, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2941, + "time_per_iteration": 2.4616622924804688 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.02687514, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.199835477442084, + "language_loss": 0.7711162, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79311877, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2942, + "time_per_iteration": 2.512493848800659 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.05181623, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.9811917296629065, + "language_loss": 0.80591762, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82790613, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2943, + "time_per_iteration": 2.4898416996002197 + }, + { + "auxiliary_loss_clip": 0.01154834, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.02953053, + "balance_loss_mlp": 1.05046725, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.9958912509352866, + "language_loss": 0.80558729, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82764459, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2944, + "time_per_iteration": 2.533968448638916 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.05239737, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.992535786356086, + "language_loss": 0.73450243, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75667548, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2945, + "time_per_iteration": 2.641890287399292 + }, + { + "auxiliary_loss_clip": 0.01152525, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.03179753, + "balance_loss_mlp": 1.05274916, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.3259800829895028, + "language_loss": 0.7778489, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79987633, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.99609375, + "step": 2946, + "time_per_iteration": 2.420511484146118 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.05060697, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9846715459481197, + "language_loss": 0.76240218, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78441978, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2947, + "time_per_iteration": 2.485795259475708 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.02725959, + "balance_loss_mlp": 1.04881549, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.7031010106606654, + "language_loss": 0.71890748, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74085903, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.99609375, + "step": 2948, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03313947, + "balance_loss_mlp": 1.05261326, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.809900152556277, + "language_loss": 0.81843233, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8404634, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.98828125, + "step": 2949, + "time_per_iteration": 2.496962547302246 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01007586, + "balance_loss_clip": 1.00552368, + "balance_loss_mlp": 1.01889789, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7669309197050882, + "language_loss": 0.64973593, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.670331, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.33007812, + "step": 2950, + "time_per_iteration": 3.1220879554748535 + }, + { + "auxiliary_loss_clip": 0.01145274, + "auxiliary_loss_mlp": 0.01049164, + "balance_loss_clip": 1.02992332, + "balance_loss_mlp": 1.04777181, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.9502306021254343, + "language_loss": 0.83540517, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85734957, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2951, + "time_per_iteration": 2.5360641479492188 + }, + { + "auxiliary_loss_clip": 0.01153398, + "auxiliary_loss_mlp": 0.01055919, + "balance_loss_clip": 1.03710794, + "balance_loss_mlp": 1.04963326, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 3.175759961241781, + "language_loss": 0.80564123, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2952, + "time_per_iteration": 2.478635787963867 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.03157318, + "balance_loss_mlp": 1.05045855, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.157061982289712, + "language_loss": 0.79982865, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82184678, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2953, + "time_per_iteration": 2.5143978595733643 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.03003526, + "balance_loss_mlp": 1.05173969, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8943960347088487, + "language_loss": 0.88006002, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90207046, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2954, + "time_per_iteration": 2.575603485107422 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.0327127, + "balance_loss_mlp": 1.05101538, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.123866524492404, + "language_loss": 0.84441978, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86644602, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2955, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.04843807, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 2.0229859139182382, + "language_loss": 0.71172267, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73364747, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2956, + "time_per_iteration": 2.4795608520507812 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.03226662, + "balance_loss_mlp": 1.04974461, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.891261769499534, + "language_loss": 0.82908547, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85109639, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9921875, + "step": 2957, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.02819514, + "balance_loss_mlp": 1.04814482, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.926043663168548, + "language_loss": 0.75286758, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7747997, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2958, + "time_per_iteration": 2.532339572906494 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.05278933, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0794730574663265, + "language_loss": 0.79558724, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.8175559, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2959, + "time_per_iteration": 2.45941424369812 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.04968762, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.284306220471852, + "language_loss": 0.52288693, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5448702, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2960, + "time_per_iteration": 2.4603421688079834 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.05185843, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9615261009939866, + "language_loss": 0.89047921, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9125818, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2961, + "time_per_iteration": 2.475848913192749 + }, + { + "auxiliary_loss_clip": 0.01151915, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.0313381, + "balance_loss_mlp": 1.04849648, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 2.2193748892921517, + "language_loss": 0.79186273, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81389749, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2962, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.011535, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.03175986, + "balance_loss_mlp": 1.0524615, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9550413638631114, + "language_loss": 0.74514943, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76719993, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2963, + "time_per_iteration": 2.4414234161376953 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.02822399, + "balance_loss_mlp": 1.05221784, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.541363360665875, + "language_loss": 0.78624183, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80828238, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2964, + "time_per_iteration": 2.502497911453247 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.05026746, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.923237578914178, + "language_loss": 0.81686175, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83892715, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2965, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01147349, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.03273785, + "balance_loss_mlp": 1.04941893, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2408088539265183, + "language_loss": 0.94580686, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96777868, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.98046875, + "step": 2966, + "time_per_iteration": 2.43082332611084 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.05002928, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.145285080590972, + "language_loss": 0.72469354, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74664342, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2967, + "time_per_iteration": 2.5735998153686523 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.02613282, + "balance_loss_mlp": 1.04889679, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.088672387523525, + "language_loss": 0.76831949, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79021615, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 2968, + "time_per_iteration": 2.437344789505005 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.0287044, + "balance_loss_mlp": 1.04982233, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.3962137266502075, + "language_loss": 0.75934523, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78129619, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2969, + "time_per_iteration": 2.5003507137298584 + }, + { + "auxiliary_loss_clip": 0.01047445, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 0.99940914, + "balance_loss_mlp": 1.01426291, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8459028719848601, + "language_loss": 0.69080526, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7112956, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.33203125, + "step": 2970, + "time_per_iteration": 3.1193249225616455 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.0229373, + "balance_loss_mlp": 1.0498271, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.0858657386647614, + "language_loss": 0.67452097, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69647527, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2971, + "time_per_iteration": 2.580946683883667 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.03059363, + "balance_loss_mlp": 1.04643905, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.713623966203784, + "language_loss": 0.89631712, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91827983, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.99609375, + "step": 2972, + "time_per_iteration": 2.491608142852783 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.05058205, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.067523530387673, + "language_loss": 0.88030291, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90236795, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2973, + "time_per_iteration": 2.4357106685638428 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03282917, + "balance_loss_mlp": 1.04979324, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1406659419236176, + "language_loss": 0.75648922, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.77848881, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2974, + "time_per_iteration": 2.484236478805542 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.02560067, + "balance_loss_mlp": 1.04925394, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.8019304252630453, + "language_loss": 0.74556506, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76749712, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 2975, + "time_per_iteration": 2.4658849239349365 + }, + { + "auxiliary_loss_clip": 0.01145454, + "auxiliary_loss_mlp": 0.0104533, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.04805982, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5963289978134585, + "language_loss": 0.73245859, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.7543664, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 2976, + "time_per_iteration": 3.921170949935913 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02455354, + "balance_loss_mlp": 1.04732931, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.5556273460638488, + "language_loss": 0.77324069, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79505193, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9375, + "step": 2977, + "time_per_iteration": 5.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.03352153, + "balance_loss_mlp": 1.05327988, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.814268655584857, + "language_loss": 0.79470795, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81672966, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 2978, + "time_per_iteration": 2.4917376041412354 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.022892, + "balance_loss_mlp": 1.04982674, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.6585859201367117, + "language_loss": 0.76166439, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78360581, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2979, + "time_per_iteration": 2.5283753871917725 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.04760695, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 2.3100878996861014, + "language_loss": 0.69246143, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7143684, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 2980, + "time_per_iteration": 2.452199935913086 + }, + { + "auxiliary_loss_clip": 0.01148553, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02703881, + "balance_loss_mlp": 1.04957294, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.6769030770257147, + "language_loss": 0.7077347, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.98828125, + "step": 2981, + "time_per_iteration": 2.453328847885132 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.03929293, + "balance_loss_mlp": 1.05124855, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.4096510966801916, + "language_loss": 0.82313269, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84522492, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0, + "step": 2982, + "time_per_iteration": 2.4727423191070557 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.04754186, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 2.0170018574221404, + "language_loss": 0.82899523, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85093689, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2983, + "time_per_iteration": 2.5544486045837402 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03234076, + "balance_loss_mlp": 1.04676509, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0025677466759175, + "language_loss": 0.84977567, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87177408, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2984, + "time_per_iteration": 2.461451530456543 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.02558494, + "balance_loss_mlp": 1.04734373, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.5972673531528874, + "language_loss": 0.89526331, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91717398, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2985, + "time_per_iteration": 2.5644643306732178 + }, + { + "auxiliary_loss_clip": 0.01142965, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02879906, + "balance_loss_mlp": 1.0478375, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9029387971382474, + "language_loss": 0.69863129, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72051299, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 2986, + "time_per_iteration": 2.4629499912261963 + }, + { + "auxiliary_loss_clip": 0.01144523, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02507591, + "balance_loss_mlp": 1.04828227, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.203273814413497, + "language_loss": 0.77872753, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80060714, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96484375, + "step": 2987, + "time_per_iteration": 2.524712562561035 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.02412844, + "balance_loss_mlp": 1.04834962, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.225668764256514, + "language_loss": 0.78012109, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.8020528, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2988, + "time_per_iteration": 2.4608163833618164 + }, + { + "auxiliary_loss_clip": 0.01048374, + "auxiliary_loss_mlp": 0.01007691, + "balance_loss_clip": 1.00570035, + "balance_loss_mlp": 1.0154314, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7961406236538413, + "language_loss": 0.62767559, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64823627, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33007812, + "step": 2989, + "time_per_iteration": 2.9831957817077637 + }, + { + "auxiliary_loss_clip": 0.01146079, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04836369, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4434429944335525, + "language_loss": 0.70464563, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72651821, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.98046875, + "step": 2990, + "time_per_iteration": 2.556100606918335 + }, + { + "auxiliary_loss_clip": 0.01146243, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.0210464, + "balance_loss_mlp": 1.04735422, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7649502456354873, + "language_loss": 0.68110204, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70295459, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 2991, + "time_per_iteration": 2.6224544048309326 + }, + { + "auxiliary_loss_clip": 0.01145545, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04794931, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5716432326573742, + "language_loss": 0.82754636, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84940296, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2992, + "time_per_iteration": 2.51824951171875 + }, + { + "auxiliary_loss_clip": 0.01138637, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.02301776, + "balance_loss_mlp": 1.04464579, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.1353598877924806, + "language_loss": 0.81958085, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84137177, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 2993, + "time_per_iteration": 2.4349074363708496 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.02889621, + "balance_loss_mlp": 1.04586673, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7254805142405878, + "language_loss": 0.78390837, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80581975, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2994, + "time_per_iteration": 2.4898691177368164 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.04966068, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.8689491925476576, + "language_loss": 0.80392146, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82584035, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2995, + "time_per_iteration": 2.4521572589874268 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.04679298, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5635152056288029, + "language_loss": 0.84467834, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86658335, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.97265625, + "step": 2996, + "time_per_iteration": 2.46993088722229 + }, + { + "auxiliary_loss_clip": 0.01139788, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.04656756, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.9197857622903793, + "language_loss": 0.88254511, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90436304, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 2997, + "time_per_iteration": 2.470113515853882 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.02435279, + "balance_loss_mlp": 1.04666877, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.5635961030192935, + "language_loss": 0.8504566, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87237728, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2998, + "time_per_iteration": 2.5252864360809326 + }, + { + "auxiliary_loss_clip": 0.0114547, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783537, + "balance_loss_mlp": 1.05022454, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8695557812200347, + "language_loss": 0.84270376, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86460871, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 2999, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.01143823, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.02577412, + "balance_loss_mlp": 1.04662383, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7700032623605295, + "language_loss": 0.74753368, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76941276, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 3000, + "time_per_iteration": 2.4800922870635986 + }, + { + "auxiliary_loss_clip": 0.01143329, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.04111791, + "balance_loss_mlp": 1.04825568, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.369063359757221, + "language_loss": 0.71625632, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73829091, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3001, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.03010964, + "balance_loss_mlp": 1.04815364, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.7240097708601225, + "language_loss": 0.87795258, + "learning_rate": 3.767096425420011e-06, + "loss": 0.89992571, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 3002, + "time_per_iteration": 2.4881784915924072 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02689672, + "balance_loss_mlp": 1.04694915, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6880476069492312, + "language_loss": 0.80563951, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8275311, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9765625, + "step": 3003, + "time_per_iteration": 2.452103614807129 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.04780829, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.4630533980116804, + "language_loss": 0.66931474, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69124347, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3004, + "time_per_iteration": 2.5085701942443848 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.04860806, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.8927608809249736, + "language_loss": 0.85172975, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87370586, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.984375, + "step": 3005, + "time_per_iteration": 2.44529128074646 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02611172, + "balance_loss_mlp": 1.04684031, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.553419886600377, + "language_loss": 0.82951266, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85135704, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94921875, + "step": 3006, + "time_per_iteration": 2.532597780227661 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.0315007, + "balance_loss_mlp": 1.04581141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6363768703600998, + "language_loss": 0.76883924, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79078454, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.98046875, + "step": 3007, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.01358199, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8067080511403597, + "language_loss": 0.56949043, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59000474, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 3008, + "time_per_iteration": 3.1923961639404297 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.04951596, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8063105677439477, + "language_loss": 0.67226636, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69423479, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3009, + "time_per_iteration": 2.467525005340576 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02993655, + "balance_loss_mlp": 1.04874969, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.842230928142314, + "language_loss": 0.75573891, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77769208, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.98828125, + "step": 3010, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.02585649, + "balance_loss_mlp": 1.04816866, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6130539386655762, + "language_loss": 0.66672593, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6885612, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3011, + "time_per_iteration": 2.461749792098999 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.0309006, + "balance_loss_mlp": 1.04706419, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.1517129990512927, + "language_loss": 0.71184897, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73375839, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3012, + "time_per_iteration": 2.7380943298339844 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.03045654, + "balance_loss_mlp": 1.05109787, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.2489260815019447, + "language_loss": 0.62039113, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64232826, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3013, + "time_per_iteration": 2.5800936222076416 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.04870379, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.5535403171237991, + "language_loss": 0.76026124, + "learning_rate": 3.764902795998309e-06, + "loss": 0.7822203, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3014, + "time_per_iteration": 2.5049405097961426 + }, + { + "auxiliary_loss_clip": 0.01151342, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.02697504, + "balance_loss_mlp": 1.05086446, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7733972454950666, + "language_loss": 0.65696967, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67894971, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3015, + "time_per_iteration": 2.52614426612854 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.0262742, + "balance_loss_mlp": 1.0490694, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7500400577379265, + "language_loss": 0.7809943, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80287266, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3016, + "time_per_iteration": 2.4736039638519287 + }, + { + "auxiliary_loss_clip": 0.01152649, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.03214788, + "balance_loss_mlp": 1.05294776, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6390488083316745, + "language_loss": 0.83498454, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85701871, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 3017, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01142751, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.0486486, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2301629944757964, + "language_loss": 0.67067724, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69249976, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3018, + "time_per_iteration": 3.950299024581909 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.04928112, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.174717508383113, + "language_loss": 0.75745898, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77930045, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 3019, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02653718, + "balance_loss_mlp": 1.05230832, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1373464597463574, + "language_loss": 0.81687438, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83882844, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3020, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02373672, + "balance_loss_mlp": 1.05124021, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.9178918869439654, + "language_loss": 0.77220714, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.96875, + "step": 3021, + "time_per_iteration": 2.4856297969818115 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.04617524, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7066661124221545, + "language_loss": 0.84841502, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87025082, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3022, + "time_per_iteration": 2.4933700561523438 + }, + { + "auxiliary_loss_clip": 0.01148694, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.0491302, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9452352079001236, + "language_loss": 0.69178426, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7136941, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3023, + "time_per_iteration": 2.495107412338257 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.04748738, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9417078000950883, + "language_loss": 0.73956865, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76145792, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3024, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.0490942, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.344564071286257, + "language_loss": 0.88167858, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90356255, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3025, + "time_per_iteration": 2.4708051681518555 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01052568, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.05046904, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.755473586939447, + "language_loss": 0.79284346, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.8148272, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3026, + "time_per_iteration": 2.482987403869629 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.03424227, + "balance_loss_mlp": 1.0502665, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6571051349992714, + "language_loss": 0.76047945, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78250599, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98828125, + "step": 3027, + "time_per_iteration": 2.4952149391174316 + }, + { + "auxiliary_loss_clip": 0.01151758, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.03055763, + "balance_loss_mlp": 1.05106115, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.7989426432275553, + "language_loss": 0.85400331, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87601155, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3028, + "time_per_iteration": 2.438113212585449 + }, + { + "auxiliary_loss_clip": 0.01144845, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.04937243, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8205418995180693, + "language_loss": 0.82655656, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84843719, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3029, + "time_per_iteration": 2.4866995811462402 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.02640462, + "balance_loss_mlp": 1.05306637, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0975281503542433, + "language_loss": 0.78150737, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80348092, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3030, + "time_per_iteration": 2.458627700805664 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.02495515, + "balance_loss_mlp": 1.05141127, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.955618442063123, + "language_loss": 0.85318518, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87512928, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.99609375, + "step": 3031, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01045881, + "balance_loss_clip": 1.02824974, + "balance_loss_mlp": 1.05232072, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8744751837074634, + "language_loss": 0.79713088, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3032, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.0260191, + "balance_loss_mlp": 1.05395341, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.0774072235136964, + "language_loss": 0.81420642, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8362143, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0234375, + "step": 3033, + "time_per_iteration": 2.47562575340271 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01006645, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01995599, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8883360043233282, + "language_loss": 0.63479006, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6553843, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.328125, + "step": 3034, + "time_per_iteration": 2.9712142944335938 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.0263083, + "balance_loss_mlp": 1.05033147, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.0132790953316113, + "language_loss": 0.79684323, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81876773, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3035, + "time_per_iteration": 2.4517030715942383 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.02792096, + "balance_loss_mlp": 1.05231702, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.217606261766961, + "language_loss": 0.84895855, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87087989, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3036, + "time_per_iteration": 2.5017378330230713 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.02524662, + "balance_loss_mlp": 1.04940438, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.216717642760365, + "language_loss": 0.79836094, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82021284, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3037, + "time_per_iteration": 2.4591338634490967 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.05208671, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.68131613553598, + "language_loss": 0.79450762, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81647676, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.9765625, + "step": 3038, + "time_per_iteration": 2.440664768218994 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.03069699, + "balance_loss_mlp": 1.05140162, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3213350225315748, + "language_loss": 0.67311364, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69506919, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3039, + "time_per_iteration": 2.573272466659546 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03323567, + "balance_loss_mlp": 1.05112875, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.9125298187860031, + "language_loss": 0.73687911, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75888336, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3040, + "time_per_iteration": 2.771242618560791 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.04849768, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.8780343880464916, + "language_loss": 0.60176188, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62363702, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3041, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.03756928, + "balance_loss_mlp": 1.05012786, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7488247873746179, + "language_loss": 0.60361505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.6256364, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3042, + "time_per_iteration": 2.7942960262298584 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.04945385, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.6831322617730042, + "language_loss": 0.8769263, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.8988626, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94921875, + "step": 3043, + "time_per_iteration": 2.524871587753296 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.05107832, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9464603469819268, + "language_loss": 0.707008, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72899425, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3044, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01055406, + "balance_loss_clip": 1.03552175, + "balance_loss_mlp": 1.04929996, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.0901220952627497, + "language_loss": 0.64385587, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66591471, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 3045, + "time_per_iteration": 2.592855453491211 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.04977548, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.998731206361719, + "language_loss": 0.79165137, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81365317, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3046, + "time_per_iteration": 2.5034587383270264 + }, + { + "auxiliary_loss_clip": 0.01146985, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.04764223, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3529268295267016, + "language_loss": 0.78991181, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81186271, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 3047, + "time_per_iteration": 2.5140535831451416 + }, + { + "auxiliary_loss_clip": 0.01145799, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02923381, + "balance_loss_mlp": 1.05111742, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5613113238500957, + "language_loss": 0.80888635, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83081341, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3048, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.02283192, + "balance_loss_mlp": 1.0502528, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8161394933049422, + "language_loss": 0.86232805, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88422966, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9765625, + "step": 3049, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.05159521, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.2703740748038066, + "language_loss": 0.77160966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79358685, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 3050, + "time_per_iteration": 2.4525256156921387 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.02683592, + "balance_loss_mlp": 1.04867804, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.010292972394078, + "language_loss": 0.99174476, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.0136615, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3051, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01145751, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.050529, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5992624239842805, + "language_loss": 0.86153144, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3052, + "time_per_iteration": 2.559396505355835 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.03267264, + "balance_loss_mlp": 1.04985499, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8182752776897229, + "language_loss": 0.73004341, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75200558, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3053, + "time_per_iteration": 2.4481074810028076 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03051662, + "balance_loss_mlp": 1.05208337, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6467304764216655, + "language_loss": 0.62212563, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64412701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 3054, + "time_per_iteration": 2.5701377391815186 + }, + { + "auxiliary_loss_clip": 0.01146023, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.04962707, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2210920593094325, + "language_loss": 0.78501689, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80690485, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3055, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.04779387, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.894881128028073, + "language_loss": 0.70218527, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72414786, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3056, + "time_per_iteration": 2.541361093521118 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.02612543, + "balance_loss_mlp": 1.05066419, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4932354373853338, + "language_loss": 0.8028152, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82474422, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3057, + "time_per_iteration": 2.4718995094299316 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.02864265, + "balance_loss_mlp": 1.04847729, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0112890674266914, + "language_loss": 0.82289785, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84491444, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 3058, + "time_per_iteration": 2.4653379917144775 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_clip": 1.02818882, + "balance_loss_mlp": 1.04893029, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.205773819593527, + "language_loss": 0.85894352, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88088906, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 3059, + "time_per_iteration": 4.0151047706604 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.02724195, + "balance_loss_mlp": 1.04931092, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.70952354928268, + "language_loss": 0.72799402, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74990445, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3060, + "time_per_iteration": 5.466471195220947 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.05253565, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7373746338425942, + "language_loss": 0.72797298, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74991357, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.98828125, + "step": 3061, + "time_per_iteration": 2.5244035720825195 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.02697313, + "balance_loss_mlp": 1.05087519, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8714044833418495, + "language_loss": 0.81622046, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83814156, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3062, + "time_per_iteration": 2.4767649173736572 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.02681041, + "balance_loss_mlp": 1.05394542, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.7582970194369052, + "language_loss": 0.72718614, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74918652, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3063, + "time_per_iteration": 2.5082144737243652 + }, + { + "auxiliary_loss_clip": 0.01146453, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.04935837, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.1216519555610183, + "language_loss": 0.65496099, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6768434, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3064, + "time_per_iteration": 2.523141622543335 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.03081298, + "balance_loss_mlp": 1.05274165, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.6163412642887947, + "language_loss": 0.68768656, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70966971, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3065, + "time_per_iteration": 2.5244293212890625 + }, + { + "auxiliary_loss_clip": 0.01151353, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.02819824, + "balance_loss_mlp": 1.05120087, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 4.932084281869228, + "language_loss": 0.72561431, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74760187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3066, + "time_per_iteration": 2.5428919792175293 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.05074954, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9825677919996112, + "language_loss": 0.82477474, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84669906, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3067, + "time_per_iteration": 2.4500880241394043 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00535476, + "balance_loss_mlp": 1.01668859, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7924805733675573, + "language_loss": 0.59706604, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61763, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32226562, + "step": 3068, + "time_per_iteration": 2.9375104904174805 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.05714762, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8708990955689164, + "language_loss": 0.76227212, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78420615, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3069, + "time_per_iteration": 2.462446451187134 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.02233863, + "balance_loss_mlp": 1.05299067, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7428293735192475, + "language_loss": 0.84803855, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86996043, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3070, + "time_per_iteration": 2.4887194633483887 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.02462363, + "balance_loss_mlp": 1.05298758, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.9722863584187038, + "language_loss": 0.77370453, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79565221, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 3071, + "time_per_iteration": 2.482213258743286 + }, + { + "auxiliary_loss_clip": 0.01152228, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.05342758, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.705053980849468, + "language_loss": 0.77691031, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79891801, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 3072, + "time_per_iteration": 2.466387987136841 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.05013216, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8173375196390826, + "language_loss": 0.8607235, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88264889, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3073, + "time_per_iteration": 2.4510810375213623 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.05339348, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2059027996031877, + "language_loss": 0.92005521, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9420172, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.97265625, + "step": 3074, + "time_per_iteration": 2.473710298538208 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.02490735, + "balance_loss_mlp": 1.05028176, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9913742546968862, + "language_loss": 0.65041798, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67233044, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3075, + "time_per_iteration": 2.533724784851074 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.053177, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.709240712607824, + "language_loss": 0.72323918, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74516779, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3076, + "time_per_iteration": 2.4544899463653564 + }, + { + "auxiliary_loss_clip": 0.01153692, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.0280292, + "balance_loss_mlp": 1.05341136, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4900368363969854, + "language_loss": 0.80860448, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83060181, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3077, + "time_per_iteration": 2.45137882232666 + }, + { + "auxiliary_loss_clip": 0.01146798, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02816749, + "balance_loss_mlp": 1.05103469, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7908770900539794, + "language_loss": 0.78764129, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8095665, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3078, + "time_per_iteration": 2.477393865585327 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02714717, + "balance_loss_mlp": 1.05057585, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8549646444276375, + "language_loss": 0.7758081, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773009, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9765625, + "step": 3079, + "time_per_iteration": 2.5069448947906494 + }, + { + "auxiliary_loss_clip": 0.01143899, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.02581406, + "balance_loss_mlp": 1.04723024, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.3452692712375893, + "language_loss": 0.81668431, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83855557, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3080, + "time_per_iteration": 2.688206911087036 + }, + { + "auxiliary_loss_clip": 0.01149652, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.02812803, + "balance_loss_mlp": 1.05079699, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.0276132956863764, + "language_loss": 0.7435087, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.7654745, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3081, + "time_per_iteration": 2.5003983974456787 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.03124547, + "balance_loss_mlp": 1.05527234, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.070281784994394, + "language_loss": 0.71532816, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73734742, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9609375, + "step": 3082, + "time_per_iteration": 2.514004707336426 + }, + { + "auxiliary_loss_clip": 0.011545, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03155267, + "balance_loss_mlp": 1.05488813, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 1.869200996989063, + "language_loss": 0.69338834, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71543807, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3083, + "time_per_iteration": 2.446418523788452 + }, + { + "auxiliary_loss_clip": 0.0114679, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03187287, + "balance_loss_mlp": 1.05216169, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 4.022344342016001, + "language_loss": 0.68854296, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71050388, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3084, + "time_per_iteration": 2.5964090824127197 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.04961908, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5883609883793584, + "language_loss": 0.77831411, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80020249, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3085, + "time_per_iteration": 2.500401020050049 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.04887915, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8880953488015286, + "language_loss": 0.73488086, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7568658, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3086, + "time_per_iteration": 2.5121798515319824 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.02949429, + "balance_loss_mlp": 1.05223882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 4.074676999617497, + "language_loss": 0.70087367, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72282737, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.953125, + "step": 3087, + "time_per_iteration": 2.469980001449585 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.05118215, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.299065028063824, + "language_loss": 0.72731185, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74929065, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3088, + "time_per_iteration": 2.4569249153137207 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02733839, + "balance_loss_mlp": 1.05015588, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.023411505730453, + "language_loss": 0.91849768, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94039273, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94140625, + "step": 3089, + "time_per_iteration": 2.5086276531219482 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02573323, + "balance_loss_mlp": 1.05124271, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.7535733421879174, + "language_loss": 0.57406759, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59596992, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.953125, + "step": 3090, + "time_per_iteration": 2.544934034347534 + }, + { + "auxiliary_loss_clip": 0.011443, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03133333, + "balance_loss_mlp": 1.04945779, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9526543189913628, + "language_loss": 0.82229531, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84423304, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3091, + "time_per_iteration": 2.5339536666870117 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03165662, + "balance_loss_mlp": 1.05212235, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0588011246991127, + "language_loss": 0.83561456, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85760063, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3092, + "time_per_iteration": 2.5091474056243896 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.05010569, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 4.142827775979207, + "language_loss": 0.93487823, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95683277, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 3093, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01146588, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.02115917, + "balance_loss_mlp": 1.05090082, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.060946690404802, + "language_loss": 0.77380008, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79564774, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3094, + "time_per_iteration": 2.4520375728607178 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03098452, + "balance_loss_mlp": 1.05099964, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6535165555915046, + "language_loss": 0.69985378, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72180283, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3095, + "time_per_iteration": 2.7395834922790527 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01045107, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05169249, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.9053555001005595, + "language_loss": 0.8077082, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82965505, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.98046875, + "step": 3096, + "time_per_iteration": 2.4506232738494873 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05086875, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.061308652340225, + "language_loss": 0.75101036, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77295941, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3097, + "time_per_iteration": 2.46639347076416 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.05196047, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.5365100966912664, + "language_loss": 0.66038394, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68231571, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3098, + "time_per_iteration": 2.46763014793396 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.02973545, + "balance_loss_mlp": 1.04978585, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6025275160282182, + "language_loss": 0.69907904, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72105503, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 3099, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.03290749, + "balance_loss_mlp": 1.04985309, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4888180158498334, + "language_loss": 0.71623552, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73823702, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 3100, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.03104091, + "balance_loss_mlp": 1.05147338, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.2181859131844757, + "language_loss": 0.80163074, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82364118, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3101, + "time_per_iteration": 4.007607936859131 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.05100489, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.082156961368248, + "language_loss": 0.76803768, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78991693, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3102, + "time_per_iteration": 5.438685894012451 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02322865, + "balance_loss_mlp": 1.04973269, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.5595226686006565, + "language_loss": 0.76962835, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79151165, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3103, + "time_per_iteration": 2.4742202758789062 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02540123, + "balance_loss_mlp": 1.05014729, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.966347666558745, + "language_loss": 0.79074025, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81264877, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3104, + "time_per_iteration": 2.4873924255371094 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.02653468, + "balance_loss_mlp": 1.05237842, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.943867006204371, + "language_loss": 0.8519029, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87382948, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3105, + "time_per_iteration": 2.488638162612915 + }, + { + "auxiliary_loss_clip": 0.01152184, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.02872288, + "balance_loss_mlp": 1.0491997, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.7838474228223986, + "language_loss": 0.86952424, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89152563, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 3106, + "time_per_iteration": 2.5103402137756348 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.05296755, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9680738799082358, + "language_loss": 0.78253353, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80451989, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 3107, + "time_per_iteration": 2.44567608833313 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.03179181, + "balance_loss_mlp": 1.05040216, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.9125203241398734, + "language_loss": 0.74114668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76316506, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3108, + "time_per_iteration": 2.5254971981048584 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.02684629, + "balance_loss_mlp": 1.05332017, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6536820415924105, + "language_loss": 0.74707133, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76903957, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98046875, + "step": 3109, + "time_per_iteration": 2.426945924758911 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.02845001, + "balance_loss_mlp": 1.05078959, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.4293009008592994, + "language_loss": 0.84324062, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86519247, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3110, + "time_per_iteration": 2.4744956493377686 + }, + { + "auxiliary_loss_clip": 0.01154611, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02472341, + "balance_loss_mlp": 1.05598927, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.633662412254079, + "language_loss": 0.84753799, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86951482, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3111, + "time_per_iteration": 2.4757230281829834 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.02408528, + "balance_loss_mlp": 1.05231404, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8513735900463348, + "language_loss": 0.76565534, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78757566, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9765625, + "step": 3112, + "time_per_iteration": 2.465552806854248 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.0516355, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8580615351340177, + "language_loss": 0.64277315, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66475397, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3113, + "time_per_iteration": 2.491805076599121 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.0528996, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.238258329288858, + "language_loss": 0.81043601, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83247173, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 3114, + "time_per_iteration": 2.4947290420532227 + }, + { + "auxiliary_loss_clip": 0.01153492, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.05319226, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2102322241331467, + "language_loss": 0.57819968, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0, + "step": 3115, + "time_per_iteration": 2.4892075061798096 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.05434299, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8141768865365742, + "language_loss": 0.71160758, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73368567, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96484375, + "step": 3116, + "time_per_iteration": 2.4705467224121094 + }, + { + "auxiliary_loss_clip": 0.01142667, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01751065, + "balance_loss_mlp": 1.04771161, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.8736078530078255, + "language_loss": 0.78733885, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80912256, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3117, + "time_per_iteration": 2.418527126312256 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02898717, + "balance_loss_mlp": 1.05421317, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.743274375857092, + "language_loss": 0.83945131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86145031, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.97265625, + "step": 3118, + "time_per_iteration": 2.5691416263580322 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.03109384, + "balance_loss_mlp": 1.0525409, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7594323212393352, + "language_loss": 0.76151264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78351927, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3119, + "time_per_iteration": 2.459648847579956 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.05181718, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.593515591831454, + "language_loss": 0.81975627, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84180319, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3120, + "time_per_iteration": 2.478870153427124 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.05178094, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.7598733043788508, + "language_loss": 0.8513701, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.873285, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3121, + "time_per_iteration": 2.5178277492523193 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.02976704, + "balance_loss_mlp": 1.05281448, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.163070382320244, + "language_loss": 0.70038795, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72237968, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 3122, + "time_per_iteration": 2.5523242950439453 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02524245, + "balance_loss_mlp": 1.05194402, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.352571744641408, + "language_loss": 0.7034744, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72541201, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9609375, + "step": 3123, + "time_per_iteration": 2.4145569801330566 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.05238771, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.0330816469172097, + "language_loss": 0.73851109, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76047611, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3124, + "time_per_iteration": 2.497352123260498 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.05275774, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9990758157966066, + "language_loss": 0.80601895, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82805508, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0, + "step": 3125, + "time_per_iteration": 2.605851411819458 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01009024, + "balance_loss_clip": 1.00697315, + "balance_loss_mlp": 1.02352476, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9386177249275542, + "language_loss": 0.63591504, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656781, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.328125, + "step": 3126, + "time_per_iteration": 3.0943961143493652 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.02366543, + "balance_loss_mlp": 1.05439222, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.7984129752859428, + "language_loss": 0.81274688, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83466977, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3127, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01054784, + "auxiliary_loss_mlp": 0.0100739, + "balance_loss_clip": 1.00543487, + "balance_loss_mlp": 1.02235639, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7620779230288282, + "language_loss": 0.6191628, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63978451, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.32421875, + "step": 3128, + "time_per_iteration": 3.1384503841400146 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.05182266, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.171302965646948, + "language_loss": 0.71237707, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73433876, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 3129, + "time_per_iteration": 2.560601234436035 + }, + { + "auxiliary_loss_clip": 0.01149923, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.05224252, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.040923932078449, + "language_loss": 0.85375232, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87576246, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3130, + "time_per_iteration": 2.4366040229797363 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.04844868, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9842347260172397, + "language_loss": 0.77227372, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0, + "step": 3131, + "time_per_iteration": 2.503112554550171 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 1.05402517, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8095346888628816, + "language_loss": 0.81244844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.834436, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.96875, + "step": 3132, + "time_per_iteration": 2.5265986919403076 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.02939904, + "balance_loss_mlp": 1.05395401, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.3595669444771135, + "language_loss": 0.79035556, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81238532, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3133, + "time_per_iteration": 2.500927209854126 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.03224421, + "balance_loss_mlp": 1.05204821, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 4.024150314183157, + "language_loss": 0.82826144, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3134, + "time_per_iteration": 2.4773380756378174 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.05027199, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4735244825899, + "language_loss": 0.82783771, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8497771, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96484375, + "step": 3135, + "time_per_iteration": 2.4957115650177 + }, + { + "auxiliary_loss_clip": 0.01149872, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.03343356, + "balance_loss_mlp": 1.0503304, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8513380433423674, + "language_loss": 0.79031271, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81233823, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9921875, + "step": 3136, + "time_per_iteration": 2.556800127029419 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.05327463, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9366242888645147, + "language_loss": 0.81049621, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 3137, + "time_per_iteration": 2.487513542175293 + }, + { + "auxiliary_loss_clip": 0.01151307, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.05406666, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5870634004860276, + "language_loss": 0.8119483, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83403158, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.97265625, + "step": 3138, + "time_per_iteration": 2.4554855823516846 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.05190897, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.760814692015778, + "language_loss": 0.636096, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6581319, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 3139, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01146092, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.03046215, + "balance_loss_mlp": 1.04812348, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.433795452320061, + "language_loss": 0.71546841, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73742986, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98046875, + "step": 3140, + "time_per_iteration": 2.4519457817077637 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 1.04848385, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.1391974719951574, + "language_loss": 0.87001872, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89196658, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98828125, + "step": 3141, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.02460694, + "balance_loss_mlp": 1.05144691, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.021325930100965, + "language_loss": 0.77418405, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79616946, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0234375, + "step": 3142, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.05104184, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6841374820722228, + "language_loss": 0.78446913, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80637825, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.97265625, + "step": 3143, + "time_per_iteration": 3.9074132442474365 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_clip": 1.03081727, + "balance_loss_mlp": 1.05069065, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 4.1822349926512485, + "language_loss": 0.71507585, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73707104, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 3144, + "time_per_iteration": 3.981715679168701 + }, + { + "auxiliary_loss_clip": 0.01152034, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.0513736, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.6203593578621893, + "language_loss": 0.73683178, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75880861, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3145, + "time_per_iteration": 2.5101706981658936 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.02783298, + "balance_loss_mlp": 1.04759097, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.6756165752276027, + "language_loss": 0.77081764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79271269, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3146, + "time_per_iteration": 2.4278056621551514 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.02811205, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.794796296308648, + "language_loss": 0.78377169, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80571997, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3147, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.0310235, + "balance_loss_mlp": 1.0499115, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.2769360880247853, + "language_loss": 0.67016155, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69212711, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3148, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01145427, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02037382, + "balance_loss_mlp": 1.04898858, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.644784357412393, + "language_loss": 0.75978655, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78161824, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3149, + "time_per_iteration": 2.4768459796905518 + }, + { + "auxiliary_loss_clip": 0.01143839, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.05033517, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.9181295874949735, + "language_loss": 0.81229341, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83420789, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3150, + "time_per_iteration": 2.42832088470459 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02886271, + "balance_loss_mlp": 1.05068374, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.066054594612055, + "language_loss": 0.84966886, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87161517, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3151, + "time_per_iteration": 2.458054542541504 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 1.04896331, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9128881662164896, + "language_loss": 0.7443462, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76635695, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.99609375, + "step": 3152, + "time_per_iteration": 2.4904792308807373 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.02937067, + "balance_loss_mlp": 1.0502255, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8661622565083957, + "language_loss": 0.75719136, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77914143, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3153, + "time_per_iteration": 2.5026283264160156 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.04962945, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.8393709351558127, + "language_loss": 0.79529279, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81725931, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 3154, + "time_per_iteration": 2.4544081687927246 + }, + { + "auxiliary_loss_clip": 0.01151316, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02919698, + "balance_loss_mlp": 1.04986668, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.673670363277482, + "language_loss": 0.72798991, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74998182, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 3155, + "time_per_iteration": 2.425431728363037 + }, + { + "auxiliary_loss_clip": 0.01145009, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.03042662, + "balance_loss_mlp": 1.04930019, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.676026678838244, + "language_loss": 0.73911691, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76105046, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3156, + "time_per_iteration": 2.4683640003204346 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.03281915, + "balance_loss_mlp": 1.05195308, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.5984593201401434, + "language_loss": 0.68251741, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70451397, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9765625, + "step": 3157, + "time_per_iteration": 2.472182512283325 + }, + { + "auxiliary_loss_clip": 0.01146139, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.02539706, + "balance_loss_mlp": 1.04914486, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.9937577865402571, + "language_loss": 0.80197155, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82386756, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3158, + "time_per_iteration": 2.4978723526000977 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02624583, + "balance_loss_mlp": 1.05201745, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.9065090881698699, + "language_loss": 0.71940476, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74138498, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 3159, + "time_per_iteration": 2.503129720687866 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.05255282, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8597759984302606, + "language_loss": 0.85071993, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8727113, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3160, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.03235734, + "balance_loss_mlp": 1.050807, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7120140162377986, + "language_loss": 0.73554128, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75746381, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3161, + "time_per_iteration": 2.5551726818084717 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.02982974, + "balance_loss_mlp": 1.05420387, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.554139282497156, + "language_loss": 0.80939364, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83137655, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3162, + "time_per_iteration": 2.609764337539673 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.02486265, + "balance_loss_mlp": 1.05257571, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 1.8884975109329094, + "language_loss": 0.75600141, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77792686, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3163, + "time_per_iteration": 2.4494824409484863 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 1.05577397, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5980783305445414, + "language_loss": 0.74197054, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76386476, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.94140625, + "step": 3164, + "time_per_iteration": 2.5901739597320557 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.02151656, + "balance_loss_mlp": 1.05402589, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5830796140792522, + "language_loss": 0.66913098, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69101042, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3165, + "time_per_iteration": 2.899500608444214 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.05282831, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1716027754337257, + "language_loss": 0.7452209, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76715726, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3166, + "time_per_iteration": 2.4325685501098633 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01017161, + "balance_loss_clip": 1.01490772, + "balance_loss_mlp": 1.02902174, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8067170187870535, + "language_loss": 0.50396568, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52476352, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.3359375, + "step": 3167, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.05208659, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7496006549093657, + "language_loss": 0.74235475, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76431435, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3168, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01004786, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.02649927, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8615778549663292, + "language_loss": 0.60097563, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6216197, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.33203125, + "step": 3169, + "time_per_iteration": 2.958176851272583 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.03371537, + "balance_loss_mlp": 1.05302989, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.550337238497042, + "language_loss": 0.77976263, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80180222, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.99609375, + "step": 3170, + "time_per_iteration": 2.5174756050109863 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.02960575, + "balance_loss_mlp": 1.05185819, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7193055204742105, + "language_loss": 0.78597021, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80789012, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3171, + "time_per_iteration": 2.4895551204681396 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.05111575, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 3.5246110250440386, + "language_loss": 0.78578937, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80772865, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3172, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.05253482, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.581476317811461, + "language_loss": 0.80126482, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82329178, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3173, + "time_per_iteration": 2.464979410171509 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.03432083, + "balance_loss_mlp": 1.05250478, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9222394249434893, + "language_loss": 0.78740567, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.8094219, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3174, + "time_per_iteration": 2.540959358215332 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.05367374, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8458147293094664, + "language_loss": 0.80757344, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82954776, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3175, + "time_per_iteration": 2.441190481185913 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03821599, + "balance_loss_mlp": 1.0521791, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.3562328324004445, + "language_loss": 0.85142022, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87347412, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3176, + "time_per_iteration": 2.4397072792053223 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.036515, + "balance_loss_mlp": 1.05395234, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.002060812172469, + "language_loss": 0.81206596, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415473, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3177, + "time_per_iteration": 2.4980266094207764 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.03073931, + "balance_loss_mlp": 1.0503974, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 1.9374450898751996, + "language_loss": 0.74628592, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3178, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.05001104, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8429055258583904, + "language_loss": 0.8167876, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83865643, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3179, + "time_per_iteration": 2.452310800552368 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02892482, + "balance_loss_mlp": 1.05279994, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.1508657656276484, + "language_loss": 0.7946887, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81664455, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3180, + "time_per_iteration": 2.451066732406616 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.02770984, + "balance_loss_mlp": 1.04780042, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.916741655382754, + "language_loss": 0.79891652, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82080674, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3181, + "time_per_iteration": 2.4310615062713623 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.03623664, + "balance_loss_mlp": 1.04858851, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7607714952320546, + "language_loss": 0.73820639, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76020634, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3182, + "time_per_iteration": 2.4712350368499756 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.03093314, + "balance_loss_mlp": 1.05187011, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8018319163421928, + "language_loss": 0.6486634, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67063105, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 3183, + "time_per_iteration": 2.440232753753662 + }, + { + "auxiliary_loss_clip": 0.01145449, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.02920759, + "balance_loss_mlp": 1.04864669, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.760716170695104, + "language_loss": 0.73234087, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7542752, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3184, + "time_per_iteration": 3.9211573600769043 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0281471, + "balance_loss_mlp": 1.04738748, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.1066155051108315, + "language_loss": 0.8784132, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9003495, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 3185, + "time_per_iteration": 5.396124601364136 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.02803612, + "balance_loss_mlp": 1.04899192, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.10102369978198, + "language_loss": 0.72667789, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74857807, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3186, + "time_per_iteration": 2.498241901397705 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02244437, + "balance_loss_mlp": 1.054919, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.264264166459479, + "language_loss": 0.83865881, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86061311, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 3187, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01051867, + "auxiliary_loss_mlp": 0.01015636, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01988959, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8634842964488614, + "language_loss": 0.55803859, + "learning_rate": 3.732018351516544e-06, + "loss": 0.5787136, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3188, + "time_per_iteration": 3.0815136432647705 + }, + { + "auxiliary_loss_clip": 0.01145462, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.04972625, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.71302722892552, + "language_loss": 0.70180511, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72381759, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.95703125, + "step": 3189, + "time_per_iteration": 2.5380465984344482 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.04853344, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.222159201352765, + "language_loss": 0.74234986, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76410198, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3190, + "time_per_iteration": 2.5862700939178467 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.04965627, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.8818377537371913, + "language_loss": 0.8394708, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86146975, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3191, + "time_per_iteration": 2.5077905654907227 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.04766488, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7694679756443132, + "language_loss": 0.89325655, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91504252, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3192, + "time_per_iteration": 2.4738776683807373 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.0531472, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.352703418633998, + "language_loss": 0.74830496, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77034831, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9765625, + "step": 3193, + "time_per_iteration": 2.47143816947937 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.04918766, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7283890992056894, + "language_loss": 0.74733245, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7692579, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9765625, + "step": 3194, + "time_per_iteration": 2.5001959800720215 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.00319958, + "balance_loss_mlp": 1.01851392, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7975785668902318, + "language_loss": 0.68455988, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70511955, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3203125, + "step": 3195, + "time_per_iteration": 3.014677047729492 + }, + { + "auxiliary_loss_clip": 0.01146296, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0254823, + "balance_loss_mlp": 1.05066323, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 1.9672517867074575, + "language_loss": 0.72712696, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74902254, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.95703125, + "step": 3196, + "time_per_iteration": 2.4855856895446777 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.0284251, + "balance_loss_mlp": 1.05643284, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8158077484015336, + "language_loss": 0.83774233, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85972691, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.953125, + "step": 3197, + "time_per_iteration": 2.4530181884765625 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02565312, + "balance_loss_mlp": 1.05036283, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.295881830513264, + "language_loss": 0.80459738, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82650983, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3198, + "time_per_iteration": 2.4882590770721436 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01050243, + "balance_loss_clip": 1.03090763, + "balance_loss_mlp": 1.04984999, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9800701307051174, + "language_loss": 0.7862891, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80827522, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3199, + "time_per_iteration": 2.507227659225464 + }, + { + "auxiliary_loss_clip": 0.01146428, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.05150342, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.05190707233933, + "language_loss": 0.83391261, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85580671, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.94921875, + "step": 3200, + "time_per_iteration": 2.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01051054, + "balance_loss_clip": 1.03286231, + "balance_loss_mlp": 1.0524931, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.0233550639398428, + "language_loss": 0.78678542, + "learning_rate": 3.729481161172443e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.95703125, + "step": 3201, + "time_per_iteration": 2.435478448867798 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.02874875, + "balance_loss_mlp": 1.05050445, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.1716175760371814, + "language_loss": 0.69168961, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71364617, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3202, + "time_per_iteration": 2.4596354961395264 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.01790023, + "balance_loss_mlp": 1.05140352, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7015130302687178, + "language_loss": 0.91123176, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93303871, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3203, + "time_per_iteration": 2.4425902366638184 + }, + { + "auxiliary_loss_clip": 0.01147002, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.03127956, + "balance_loss_mlp": 1.05008471, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.129263396651385, + "language_loss": 0.81766933, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83964062, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96875, + "step": 3204, + "time_per_iteration": 2.4466230869293213 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.03022599, + "balance_loss_mlp": 1.0497942, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.065510679734303, + "language_loss": 0.75797462, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77988648, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3205, + "time_per_iteration": 2.439906358718872 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02958953, + "balance_loss_mlp": 1.05312991, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.4125731541540465, + "language_loss": 0.83020669, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85218459, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 3206, + "time_per_iteration": 2.463888168334961 + }, + { + "auxiliary_loss_clip": 0.0104957, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.01731467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8499440783854421, + "language_loss": 0.60609913, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62663066, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 3207, + "time_per_iteration": 2.8865902423858643 + }, + { + "auxiliary_loss_clip": 0.01147085, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.02789569, + "balance_loss_mlp": 1.05069125, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4047527057594564, + "language_loss": 0.75119245, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77312136, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3208, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01146825, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.04890394, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.3372356299161696, + "language_loss": 0.60567236, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62762815, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3209, + "time_per_iteration": 2.4695677757263184 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.04981887, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9457412312791633, + "language_loss": 0.80153656, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82352048, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 3210, + "time_per_iteration": 2.6459405422210693 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.02437103, + "balance_loss_mlp": 1.04580569, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.107646167575127, + "language_loss": 0.82575119, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84755266, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3211, + "time_per_iteration": 2.454702615737915 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01012694, + "balance_loss_clip": 1.01057243, + "balance_loss_mlp": 1.01463401, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9758169311408023, + "language_loss": 0.63670558, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729511, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.31640625, + "step": 3212, + "time_per_iteration": 2.914459705352783 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.05140018, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5978218597026725, + "language_loss": 0.76514798, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78707075, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3213, + "time_per_iteration": 2.47961163520813 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.02823281, + "balance_loss_mlp": 1.04934072, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.5461953882780115, + "language_loss": 0.70799339, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72993791, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98046875, + "step": 3214, + "time_per_iteration": 2.4547488689422607 + }, + { + "auxiliary_loss_clip": 0.01142593, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.02808392, + "balance_loss_mlp": 1.0470041, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2459266127411848, + "language_loss": 0.75352395, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77541864, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3215, + "time_per_iteration": 2.4477176666259766 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.04626155, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.304207478946857, + "language_loss": 0.88559556, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90752971, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3216, + "time_per_iteration": 2.499464988708496 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.0474, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.5978066249985532, + "language_loss": 0.79762065, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8195231, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3217, + "time_per_iteration": 2.4428889751434326 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0294652, + "balance_loss_mlp": 1.0504688, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.6606972104147673, + "language_loss": 0.61408496, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63605893, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3218, + "time_per_iteration": 2.4313230514526367 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04883909, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.6811153728366703, + "language_loss": 0.80158418, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82342821, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3219, + "time_per_iteration": 2.4347593784332275 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.03010237, + "balance_loss_mlp": 1.05070114, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.032012314604138, + "language_loss": 0.85781908, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87976086, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3220, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.0477736, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.087292049011103, + "language_loss": 0.84617937, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86794209, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3221, + "time_per_iteration": 2.4601354598999023 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.05009556, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.075109928662421, + "language_loss": 0.85929954, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88121927, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3222, + "time_per_iteration": 2.433027505874634 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.02212656, + "balance_loss_mlp": 1.04663789, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 3.9278404759018053, + "language_loss": 0.78207982, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3223, + "time_per_iteration": 2.4451496601104736 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01047584, + "balance_loss_clip": 1.03013206, + "balance_loss_mlp": 1.04896808, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8200574771064912, + "language_loss": 0.75589085, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77776659, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3224, + "time_per_iteration": 2.4390981197357178 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.02274644, + "balance_loss_mlp": 1.04741263, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.092202382915022, + "language_loss": 0.71141279, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73321629, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3225, + "time_per_iteration": 2.6690707206726074 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.0279572, + "balance_loss_mlp": 1.04787326, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.058354492672399, + "language_loss": 0.6915803, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71344984, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9453125, + "step": 3226, + "time_per_iteration": 3.906217336654663 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.02562809, + "balance_loss_mlp": 1.05274427, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6131772564475266, + "language_loss": 0.76138854, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78327405, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 3227, + "time_per_iteration": 4.168737411499023 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.02547467, + "balance_loss_mlp": 1.04588878, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8539897665707572, + "language_loss": 0.69154215, + "learning_rate": 3.724176216414662e-06, + "loss": 0.7133761, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94921875, + "step": 3228, + "time_per_iteration": 2.4857404232025146 + }, + { + "auxiliary_loss_clip": 0.01142054, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02698135, + "balance_loss_mlp": 1.04929864, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9069922854616745, + "language_loss": 0.7428174, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76467812, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3229, + "time_per_iteration": 2.5357918739318848 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04832351, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6963766145995596, + "language_loss": 0.65157712, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67341059, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3230, + "time_per_iteration": 2.4796855449676514 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.0268054, + "balance_loss_mlp": 1.04652202, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8877471342298004, + "language_loss": 0.8184334, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84025759, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3231, + "time_per_iteration": 2.5315961837768555 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.02045608, + "balance_loss_mlp": 1.05067456, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.787689187471357, + "language_loss": 0.86743605, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88928306, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94921875, + "step": 3232, + "time_per_iteration": 2.4916152954101562 + }, + { + "auxiliary_loss_clip": 0.01141636, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.05008495, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5602267859616314, + "language_loss": 0.8513217, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87326247, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3233, + "time_per_iteration": 2.526118040084839 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.03603804, + "balance_loss_mlp": 1.04827857, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6631942166294669, + "language_loss": 0.89191484, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91390419, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96875, + "step": 3234, + "time_per_iteration": 2.4783849716186523 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.04675341, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.1776085062187374, + "language_loss": 0.78503513, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80690718, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3235, + "time_per_iteration": 2.4414284229278564 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.02545178, + "balance_loss_mlp": 1.05288744, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.115791514531618, + "language_loss": 0.7937218, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81560451, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.93359375, + "step": 3236, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02204323, + "balance_loss_mlp": 1.05156302, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.445233321344346, + "language_loss": 0.75936478, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78121042, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9296875, + "step": 3237, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01147227, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.03005719, + "balance_loss_mlp": 1.05079889, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.0921387862929586, + "language_loss": 0.75056225, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77250135, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96484375, + "step": 3238, + "time_per_iteration": 2.4795806407928467 + }, + { + "auxiliary_loss_clip": 0.01147117, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.05317962, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8233855681516762, + "language_loss": 0.73016453, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75208122, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94140625, + "step": 3239, + "time_per_iteration": 2.4695816040039062 + }, + { + "auxiliary_loss_clip": 0.01144581, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.03026247, + "balance_loss_mlp": 1.0505631, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.897973355517785, + "language_loss": 0.73792124, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75985241, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3240, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.05221701, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8797415358152445, + "language_loss": 0.66685343, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68873608, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94140625, + "step": 3241, + "time_per_iteration": 2.5644116401672363 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0280863, + "balance_loss_mlp": 1.05193758, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4346271942222966, + "language_loss": 0.82889283, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85078967, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3242, + "time_per_iteration": 2.476043701171875 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01010683, + "balance_loss_clip": 1.00856066, + "balance_loss_mlp": 1.02379096, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8482804620416572, + "language_loss": 0.57572454, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59637845, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.30859375, + "step": 3243, + "time_per_iteration": 3.1217525005340576 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.05099249, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.02063631868758, + "language_loss": 0.83243412, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85431218, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3244, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.05495024, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8275576625869878, + "language_loss": 0.77049786, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79245341, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3245, + "time_per_iteration": 2.5539040565490723 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.04852772, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.8639596298576055, + "language_loss": 0.84020388, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86203486, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3246, + "time_per_iteration": 2.5018341541290283 + }, + { + "auxiliary_loss_clip": 0.0114444, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.02834511, + "balance_loss_mlp": 1.04978824, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1267063345385777, + "language_loss": 0.7636531, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78555036, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9453125, + "step": 3247, + "time_per_iteration": 2.4512898921966553 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02228367, + "balance_loss_mlp": 1.05077446, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.4744510548582124, + "language_loss": 0.75330198, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77513552, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3248, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.04661679, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.9881324270373204, + "language_loss": 0.78316575, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80499399, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3249, + "time_per_iteration": 2.475385904312134 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.0316205, + "balance_loss_mlp": 1.04948914, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.839405294960197, + "language_loss": 0.73238158, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7543031, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3250, + "time_per_iteration": 2.4548323154449463 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02206779, + "balance_loss_mlp": 1.04583359, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9014920395959154, + "language_loss": 0.79582441, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8175652, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3251, + "time_per_iteration": 2.4597084522247314 + }, + { + "auxiliary_loss_clip": 0.01142571, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.04888558, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.2143497379473613, + "language_loss": 0.83534026, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85717964, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3252, + "time_per_iteration": 2.4245967864990234 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.03026652, + "balance_loss_mlp": 1.04651105, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7806404718622555, + "language_loss": 0.73870194, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76062191, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3253, + "time_per_iteration": 2.5752809047698975 + }, + { + "auxiliary_loss_clip": 0.01142809, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.02352846, + "balance_loss_mlp": 1.04619944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.833285648050628, + "language_loss": 0.76684111, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78867137, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.96484375, + "step": 3254, + "time_per_iteration": 2.533993721008301 + }, + { + "auxiliary_loss_clip": 0.01044914, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00268674, + "balance_loss_mlp": 1.01349974, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7652407497357797, + "language_loss": 0.55344874, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5739454, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.3125, + "step": 3255, + "time_per_iteration": 3.164173126220703 + }, + { + "auxiliary_loss_clip": 0.01144973, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02407217, + "balance_loss_mlp": 1.05057478, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.650975615707017, + "language_loss": 0.7066443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7285077, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3256, + "time_per_iteration": 2.496424436569214 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.04647136, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.256610935254856, + "language_loss": 0.80055118, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82237899, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3257, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05034149, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.9567741269254724, + "language_loss": 0.74843282, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77029151, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3258, + "time_per_iteration": 2.6177120208740234 + }, + { + "auxiliary_loss_clip": 0.01142767, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.01932144, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7410781544458231, + "language_loss": 0.74462247, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7664147, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3259, + "time_per_iteration": 2.54068660736084 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01923943, + "balance_loss_mlp": 1.04965675, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.380592438675979, + "language_loss": 0.77040654, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7922256, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3260, + "time_per_iteration": 2.4983303546905518 + }, + { + "auxiliary_loss_clip": 0.01143361, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.02459061, + "balance_loss_mlp": 1.0486325, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.011568492365706, + "language_loss": 0.82168972, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84354162, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3261, + "time_per_iteration": 2.52164626121521 + }, + { + "auxiliary_loss_clip": 0.01144228, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02510428, + "balance_loss_mlp": 1.05130327, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 2.1812525814986112, + "language_loss": 0.76691413, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78878343, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 3262, + "time_per_iteration": 2.513619899749756 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.05290008, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7175684177653927, + "language_loss": 0.8667773, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88867593, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3263, + "time_per_iteration": 2.49373459815979 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.04784787, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5660143494742738, + "language_loss": 0.74136549, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76319206, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9296875, + "step": 3264, + "time_per_iteration": 2.4891843795776367 + }, + { + "auxiliary_loss_clip": 0.0114591, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.03340793, + "balance_loss_mlp": 1.05435038, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 4.0742741532711975, + "language_loss": 0.78590196, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8078593, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3265, + "time_per_iteration": 2.4226529598236084 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01014864, + "balance_loss_clip": 1.01292133, + "balance_loss_mlp": 1.01652646, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7852387786228787, + "language_loss": 0.53459084, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55521357, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.30859375, + "step": 3266, + "time_per_iteration": 3.0519652366638184 + }, + { + "auxiliary_loss_clip": 0.01145434, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02263319, + "balance_loss_mlp": 1.04800785, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9728388819613873, + "language_loss": 0.80503136, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82690066, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3267, + "time_per_iteration": 2.436455011367798 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.02726591, + "balance_loss_mlp": 1.04780269, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.528633756775916, + "language_loss": 0.87031806, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89213896, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91015625, + "step": 3268, + "time_per_iteration": 5.348580360412598 + }, + { + "auxiliary_loss_clip": 0.01141651, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02865744, + "balance_loss_mlp": 1.04996669, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.7845337804652086, + "language_loss": 0.69331455, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71518886, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3269, + "time_per_iteration": 3.9386346340179443 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.03361702, + "balance_loss_mlp": 1.0530045, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 2.4386480468071086, + "language_loss": 0.80760634, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82960677, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3270, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.04726839, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.857854204827715, + "language_loss": 0.83918732, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86103886, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3271, + "time_per_iteration": 2.4522581100463867 + }, + { + "auxiliary_loss_clip": 0.01139583, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.0297302, + "balance_loss_mlp": 1.04943895, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.1376155358713835, + "language_loss": 0.80162311, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82348382, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 3272, + "time_per_iteration": 2.4968738555908203 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02766371, + "balance_loss_mlp": 1.05075002, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7855512393811417, + "language_loss": 0.80728978, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82919937, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3273, + "time_per_iteration": 2.525407552719116 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.04807115, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 5.081990879764466, + "language_loss": 0.7791425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80108881, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3274, + "time_per_iteration": 2.527858018875122 + }, + { + "auxiliary_loss_clip": 0.01141542, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03440571, + "balance_loss_mlp": 1.04765558, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.1984029701042367, + "language_loss": 0.81144857, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83338642, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9375, + "step": 3275, + "time_per_iteration": 2.451392412185669 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.02934027, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.90284229785688, + "language_loss": 0.81104618, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83295637, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3276, + "time_per_iteration": 2.462033748626709 + }, + { + "auxiliary_loss_clip": 0.01142306, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.02132106, + "balance_loss_mlp": 1.04889154, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.0909421048868126, + "language_loss": 0.89347923, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91528654, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3277, + "time_per_iteration": 2.4887003898620605 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02804041, + "balance_loss_mlp": 1.04832077, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.9974095646387573, + "language_loss": 0.62265754, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64459741, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3278, + "time_per_iteration": 2.560401201248169 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.03562284, + "balance_loss_mlp": 1.04910243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 3.1131920881239936, + "language_loss": 0.73664343, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75863284, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3279, + "time_per_iteration": 2.5036048889160156 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02816486, + "balance_loss_mlp": 1.04906511, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6310774806952162, + "language_loss": 0.82451236, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84641075, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.95703125, + "step": 3280, + "time_per_iteration": 2.499962091445923 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.05086279, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.847926035637751, + "language_loss": 0.77581155, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79770064, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3281, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.0507971, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.593504057665797, + "language_loss": 0.79502213, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81686652, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3282, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.0279994, + "balance_loss_mlp": 1.05359089, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.157912578421005, + "language_loss": 0.71937042, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7413193, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3283, + "time_per_iteration": 2.5070157051086426 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.04858577, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.741034644212953, + "language_loss": 0.78832877, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81017548, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3284, + "time_per_iteration": 2.436530113220215 + }, + { + "auxiliary_loss_clip": 0.01147439, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.02952087, + "balance_loss_mlp": 1.05069387, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0622477624774325, + "language_loss": 0.86366653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88561547, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96875, + "step": 3285, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.01143401, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02571201, + "balance_loss_mlp": 1.0520879, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.7361177014734372, + "language_loss": 0.88680863, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90866709, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3286, + "time_per_iteration": 2.472475290298462 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.03036189, + "balance_loss_mlp": 1.05260301, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2372981039860833, + "language_loss": 0.78297567, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80495083, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3287, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02674246, + "balance_loss_mlp": 1.04974318, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.197025185749627, + "language_loss": 0.81252837, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83444452, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96484375, + "step": 3288, + "time_per_iteration": 2.4107155799865723 + }, + { + "auxiliary_loss_clip": 0.01139417, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.04890108, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7615970311636253, + "language_loss": 0.72502065, + "learning_rate": 3.712015717627374e-06, + "loss": 0.74691164, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3289, + "time_per_iteration": 2.4479291439056396 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.0500598, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.0523474932115833, + "language_loss": 0.7944051, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81629974, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3290, + "time_per_iteration": 2.499950408935547 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.01002976, + "balance_loss_clip": 1.00056827, + "balance_loss_mlp": 1.01336336, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9098407078047199, + "language_loss": 0.60440773, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62489194, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.3203125, + "step": 3291, + "time_per_iteration": 3.1538305282592773 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.02639592, + "balance_loss_mlp": 1.04670751, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.151168561582294, + "language_loss": 0.81352198, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83541822, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3292, + "time_per_iteration": 2.539417028427124 + }, + { + "auxiliary_loss_clip": 0.01137712, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03051507, + "balance_loss_mlp": 1.04855824, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.212806192124084, + "language_loss": 0.82146955, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84332335, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 3293, + "time_per_iteration": 2.438809394836426 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.02988923, + "balance_loss_mlp": 1.05333924, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.10438249616411, + "language_loss": 0.61268854, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63468528, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3294, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01143209, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.02681279, + "balance_loss_mlp": 1.05004907, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.028666267444235, + "language_loss": 0.86983609, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89170212, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3295, + "time_per_iteration": 2.416771411895752 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.04786801, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 13.771873008268457, + "language_loss": 0.80491048, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82684338, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9375, + "step": 3296, + "time_per_iteration": 2.450934648513794 + }, + { + "auxiliary_loss_clip": 0.01145402, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.02637851, + "balance_loss_mlp": 1.0482688, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.0804115334054134, + "language_loss": 0.68406892, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70597816, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.97265625, + "step": 3297, + "time_per_iteration": 2.5111610889434814 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.02440548, + "balance_loss_mlp": 1.04895413, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7575465421519259, + "language_loss": 0.81232154, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83411407, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 3298, + "time_per_iteration": 2.472025156021118 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.02634668, + "balance_loss_mlp": 1.05001056, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.343960149367745, + "language_loss": 0.85115641, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.9765625, + "step": 3299, + "time_per_iteration": 2.4725356101989746 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.00097358, + "balance_loss_mlp": 1.0131526, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7731212371218976, + "language_loss": 0.53215671, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55264044, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3300, + "time_per_iteration": 3.004054069519043 + }, + { + "auxiliary_loss_clip": 0.01142157, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.04772329, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6138936044346288, + "language_loss": 0.73150593, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75344324, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9453125, + "step": 3301, + "time_per_iteration": 2.4547884464263916 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.02191293, + "balance_loss_mlp": 1.04811358, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.437382428027231, + "language_loss": 0.88445318, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90624458, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3302, + "time_per_iteration": 2.429579019546509 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.02940273, + "balance_loss_mlp": 1.04750872, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.9503370408087137, + "language_loss": 0.73907369, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76096445, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3303, + "time_per_iteration": 2.627835273742676 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.04874539, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8172241344194675, + "language_loss": 0.74761099, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76950562, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3304, + "time_per_iteration": 2.551241397857666 + }, + { + "auxiliary_loss_clip": 0.01139854, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02333784, + "balance_loss_mlp": 1.04763281, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 2.605019982075021, + "language_loss": 0.85717452, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.87896717, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3305, + "time_per_iteration": 2.432363986968994 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.04600525, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.7555780714506408, + "language_loss": 0.68014234, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70195889, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.953125, + "step": 3306, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01137681, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.02098584, + "balance_loss_mlp": 1.0453912, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4744708200758283, + "language_loss": 0.76455241, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.921875, + "step": 3307, + "time_per_iteration": 2.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.02520776, + "balance_loss_mlp": 1.04866791, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8666050855147507, + "language_loss": 0.75933248, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78115153, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3308, + "time_per_iteration": 2.483060121536255 + }, + { + "auxiliary_loss_clip": 0.01141228, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.02426159, + "balance_loss_mlp": 1.04736626, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6368693105847256, + "language_loss": 0.75640005, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7782228, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94140625, + "step": 3309, + "time_per_iteration": 3.8069632053375244 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.02869844, + "balance_loss_mlp": 1.04665506, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.6858420956549012, + "language_loss": 0.87646699, + "learning_rate": 3.707773333313917e-06, + "loss": 0.8983165, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9140625, + "step": 3310, + "time_per_iteration": 3.9299721717834473 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.02431977, + "balance_loss_mlp": 1.04637599, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.6845239503362412, + "language_loss": 0.64166129, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66346431, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3311, + "time_per_iteration": 2.5747337341308594 + }, + { + "auxiliary_loss_clip": 0.01143032, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.02559805, + "balance_loss_mlp": 1.04768658, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.462607887220823, + "language_loss": 0.74053729, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.953125, + "step": 3312, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.03060961, + "balance_loss_mlp": 1.04843581, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2841450786746016, + "language_loss": 0.83511955, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8569997, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3313, + "time_per_iteration": 2.4846627712249756 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.04944849, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.438256379955746, + "language_loss": 0.80930895, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83115256, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3314, + "time_per_iteration": 2.525754928588867 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.0280745, + "balance_loss_mlp": 1.04706359, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5137591341622172, + "language_loss": 0.87549174, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89729953, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3315, + "time_per_iteration": 2.5170931816101074 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.03032112, + "balance_loss_mlp": 1.04808092, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5984895942740787, + "language_loss": 0.71255141, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73443246, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3316, + "time_per_iteration": 2.520071029663086 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02646089, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8439111854473917, + "language_loss": 0.66260874, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68341696, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.33203125, + "step": 3317, + "time_per_iteration": 3.1460416316986084 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01052969, + "balance_loss_clip": 1.03557682, + "balance_loss_mlp": 1.04575253, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.672944172124665, + "language_loss": 0.74319738, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76515001, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3318, + "time_per_iteration": 2.6139748096466064 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.04536486, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.900050251198073, + "language_loss": 0.78860074, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81038487, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.89453125, + "step": 3319, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.04806578, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0962453666662073, + "language_loss": 0.75462162, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.93359375, + "step": 3320, + "time_per_iteration": 2.739485263824463 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02613819, + "balance_loss_mlp": 1.04714417, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.167317842134812, + "language_loss": 0.80547488, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3321, + "time_per_iteration": 2.581353187561035 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.01003433, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.01694489, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.880630206553271, + "language_loss": 0.65178835, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67231572, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.32421875, + "step": 3322, + "time_per_iteration": 2.9042704105377197 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 1.01724231, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7916622121471568, + "language_loss": 0.56975091, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028506, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.328125, + "step": 3323, + "time_per_iteration": 3.2141411304473877 + }, + { + "auxiliary_loss_clip": 0.01139547, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.04839373, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.9849201654975537, + "language_loss": 0.80526733, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82701647, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3324, + "time_per_iteration": 2.5455262660980225 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.04540765, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.8681208438308643, + "language_loss": 0.53681695, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55859387, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91796875, + "step": 3325, + "time_per_iteration": 2.581782102584839 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.02337289, + "balance_loss_mlp": 1.04565668, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0672953846254027, + "language_loss": 0.86169922, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88347936, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3326, + "time_per_iteration": 2.494718551635742 + }, + { + "auxiliary_loss_clip": 0.01138244, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.04851878, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8653522915536895, + "language_loss": 0.71835959, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74012172, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3327, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02368546, + "balance_loss_mlp": 1.04750776, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.83111198959611, + "language_loss": 0.76588571, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78772372, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3328, + "time_per_iteration": 2.5083916187286377 + }, + { + "auxiliary_loss_clip": 0.01698253, + "auxiliary_loss_mlp": 0.01552284, + "balance_loss_clip": 1.52980089, + "balance_loss_mlp": 1.56677365, + "epoch": 0.20015030813166992, + "flos": 28106162236800.0, + "grad_norm": 1.6482454448342019, + "language_loss": 1.03044438, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.7143048, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3125, + "step": 3329, + "time_per_iteration": 15.37552785873413 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.03149772, + "balance_loss_mlp": 1.0504123, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.5519947176183269, + "language_loss": 0.81297028, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.8349371, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9609375, + "step": 3330, + "time_per_iteration": 2.500103712081909 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.01994956, + "balance_loss_mlp": 1.04669356, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.032272994312633, + "language_loss": 0.76649368, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78827626, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3331, + "time_per_iteration": 2.4018712043762207 + }, + { + "auxiliary_loss_clip": 0.01141733, + "auxiliary_loss_mlp": 0.01045779, + "balance_loss_clip": 1.02819538, + "balance_loss_mlp": 1.04608667, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.6582018653132529, + "language_loss": 0.79261309, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81448817, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3332, + "time_per_iteration": 2.4550859928131104 + }, + { + "auxiliary_loss_clip": 0.01045684, + "auxiliary_loss_mlp": 0.01005368, + "balance_loss_clip": 1.0036391, + "balance_loss_mlp": 1.01433849, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9315137515082259, + "language_loss": 0.61990142, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64041197, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.31445312, + "step": 3333, + "time_per_iteration": 2.9623756408691406 + }, + { + "auxiliary_loss_clip": 0.01139681, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.04501462, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1372355522021893, + "language_loss": 0.81203878, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.8338846, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9453125, + "step": 3334, + "time_per_iteration": 2.49924373626709 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_clip": 1.02938735, + "balance_loss_mlp": 1.04878521, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.1564721635267516, + "language_loss": 0.74261904, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76455814, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3335, + "time_per_iteration": 2.634608745574951 + }, + { + "auxiliary_loss_clip": 0.01150022, + "auxiliary_loss_mlp": 0.01058924, + "balance_loss_clip": 1.04205632, + "balance_loss_mlp": 1.05375338, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6943946878944693, + "language_loss": 0.79839814, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82048762, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3336, + "time_per_iteration": 2.7025394439697266 + }, + { + "auxiliary_loss_clip": 0.01145798, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.02744317, + "balance_loss_mlp": 1.04703879, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9043375292422164, + "language_loss": 0.78031212, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80223, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 3337, + "time_per_iteration": 2.5718014240264893 + }, + { + "auxiliary_loss_clip": 0.01143827, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_clip": 1.02708244, + "balance_loss_mlp": 1.0486424, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 1.9983960159800889, + "language_loss": 0.6873948, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.70928884, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94921875, + "step": 3338, + "time_per_iteration": 2.5848047733306885 + }, + { + "auxiliary_loss_clip": 0.01143098, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.03263819, + "balance_loss_mlp": 1.04853702, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.1061075345379576, + "language_loss": 0.68823779, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71016049, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9453125, + "step": 3339, + "time_per_iteration": 2.523771047592163 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.02217603, + "balance_loss_mlp": 1.05197799, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.3080693694415872, + "language_loss": 0.66263533, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68451655, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9765625, + "step": 3340, + "time_per_iteration": 2.647495985031128 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.01570475, + "balance_loss_mlp": 1.0457145, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.8472305033219696, + "language_loss": 0.74124628, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76300496, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9609375, + "step": 3341, + "time_per_iteration": 2.511585235595703 + }, + { + "auxiliary_loss_clip": 0.01140464, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.02689481, + "balance_loss_mlp": 1.04846787, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.1698717951472326, + "language_loss": 0.71578503, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73762101, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3342, + "time_per_iteration": 2.561998128890991 + }, + { + "auxiliary_loss_clip": 0.01142187, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.02871895, + "balance_loss_mlp": 1.04746354, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.9864957062525024, + "language_loss": 0.73130047, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75317556, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3343, + "time_per_iteration": 4.046127557754517 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01050047, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.04738092, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9813453341923526, + "language_loss": 0.81026411, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83218634, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94921875, + "step": 3344, + "time_per_iteration": 2.520765542984009 + }, + { + "auxiliary_loss_clip": 0.01141139, + "auxiliary_loss_mlp": 0.01050367, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.04661858, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.7491478080862684, + "language_loss": 0.83503234, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85694736, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3345, + "time_per_iteration": 4.064355373382568 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.02689624, + "balance_loss_mlp": 1.0464828, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.723487885242635, + "language_loss": 0.67909771, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70086718, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.88671875, + "step": 3346, + "time_per_iteration": 2.521949291229248 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01048866, + "balance_loss_clip": 1.03233206, + "balance_loss_mlp": 1.04726124, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.272845003166824, + "language_loss": 0.73496711, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75686157, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3347, + "time_per_iteration": 2.5316877365112305 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_clip": 1.03179908, + "balance_loss_mlp": 1.04827023, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.7467826588499227, + "language_loss": 0.86716485, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88904649, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.92578125, + "step": 3348, + "time_per_iteration": 2.5123202800750732 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.02335036, + "balance_loss_mlp": 1.04729295, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.5886148695932183, + "language_loss": 0.71200913, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73381227, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 3349, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.01144556, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.03016067, + "balance_loss_mlp": 1.04982185, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.649154800785762, + "language_loss": 0.71079665, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73272741, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9453125, + "step": 3350, + "time_per_iteration": 2.4927315711975098 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02612305, + "balance_loss_mlp": 1.05045485, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 3.2873247390310554, + "language_loss": 0.76327842, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78518331, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.953125, + "step": 3351, + "time_per_iteration": 2.5077342987060547 + }, + { + "auxiliary_loss_clip": 0.01146641, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.02555871, + "balance_loss_mlp": 1.05069637, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.662758000066145, + "language_loss": 0.80545723, + "learning_rate": 3.699202960155748e-06, + "loss": 0.8273598, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3352, + "time_per_iteration": 2.5717766284942627 + }, + { + "auxiliary_loss_clip": 0.01146315, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.05210721, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7179856660366186, + "language_loss": 0.8027631, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82462192, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3353, + "time_per_iteration": 2.6415467262268066 + }, + { + "auxiliary_loss_clip": 0.01140403, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.02512455, + "balance_loss_mlp": 1.04978478, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.050762039112588, + "language_loss": 0.8946988, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91651917, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 3354, + "time_per_iteration": 2.4780237674713135 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01011403, + "balance_loss_clip": 1.00948358, + "balance_loss_mlp": 1.0202148, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.830112597874188, + "language_loss": 0.55839282, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57902759, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.31835938, + "step": 3355, + "time_per_iteration": 3.0224292278289795 + }, + { + "auxiliary_loss_clip": 0.01140957, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02891648, + "balance_loss_mlp": 1.05068707, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5257876958196368, + "language_loss": 0.84076762, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86262929, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3356, + "time_per_iteration": 2.510615348815918 + }, + { + "auxiliary_loss_clip": 0.01152963, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.05356848, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.9361880537925584, + "language_loss": 0.688007, + "learning_rate": 3.698175095398085e-06, + "loss": 0.70997024, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 3357, + "time_per_iteration": 2.460022211074829 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.02280617, + "balance_loss_mlp": 1.0492487, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7490617907772006, + "language_loss": 0.71748042, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73933733, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3358, + "time_per_iteration": 2.563767194747925 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01054955, + "balance_loss_clip": 1.03818202, + "balance_loss_mlp": 1.04849517, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.042998238377631, + "language_loss": 0.83104217, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85298896, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3359, + "time_per_iteration": 2.531332015991211 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01001012, + "balance_loss_clip": 0.99911654, + "balance_loss_mlp": 1.02214265, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 12.853939959466139, + "language_loss": 0.5895561, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61009508, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30859375, + "step": 3360, + "time_per_iteration": 3.0536341667175293 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.03117216, + "balance_loss_mlp": 1.05149043, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.4416015649532286, + "language_loss": 0.62138069, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64334983, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3361, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0114522, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.05156183, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 2.0025961231737526, + "language_loss": 0.75524926, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77726126, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3362, + "time_per_iteration": 2.555492639541626 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.02926481, + "balance_loss_mlp": 1.05209327, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6135185744423872, + "language_loss": 0.76400363, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78592181, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9375, + "step": 3363, + "time_per_iteration": 2.486969470977783 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_clip": 1.03686023, + "balance_loss_mlp": 1.04736471, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 2.0495916908721434, + "language_loss": 0.74606001, + "learning_rate": 3.696733380367391e-06, + "loss": 0.76800275, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9375, + "step": 3364, + "time_per_iteration": 2.58673095703125 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.03390145, + "balance_loss_mlp": 1.04865253, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.1992700083841084, + "language_loss": 0.71451771, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73647857, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3365, + "time_per_iteration": 2.522470712661743 + }, + { + "auxiliary_loss_clip": 0.01147339, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.03516757, + "balance_loss_mlp": 1.05331004, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.9561618637344158, + "language_loss": 0.85770535, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87970054, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94140625, + "step": 3366, + "time_per_iteration": 2.536529541015625 + }, + { + "auxiliary_loss_clip": 0.01143453, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.0499506, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.628387041142295, + "language_loss": 0.69651556, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7183941, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3367, + "time_per_iteration": 2.5608372688293457 + }, + { + "auxiliary_loss_clip": 0.01145892, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.0235498, + "balance_loss_mlp": 1.04696274, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.963599898430263, + "language_loss": 0.68230569, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70419657, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3368, + "time_per_iteration": 2.66802978515625 + }, + { + "auxiliary_loss_clip": 0.01143607, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.03178596, + "balance_loss_mlp": 1.0505259, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 7.849671101524798, + "language_loss": 0.77025628, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79218459, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3369, + "time_per_iteration": 2.5143446922302246 + }, + { + "auxiliary_loss_clip": 0.01145287, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.04029489, + "balance_loss_mlp": 1.04800487, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 4.298107611861754, + "language_loss": 0.65408337, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67610943, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3370, + "time_per_iteration": 2.503589630126953 + }, + { + "auxiliary_loss_clip": 0.01048919, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.02313519, + "balance_loss_mlp": 1.01856685, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6799262329378595, + "language_loss": 0.58101869, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60175562, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.3046875, + "step": 3371, + "time_per_iteration": 3.1626369953155518 + }, + { + "auxiliary_loss_clip": 0.01143688, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.0254668, + "balance_loss_mlp": 1.04866266, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.766606164011739, + "language_loss": 0.92068136, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94254309, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3372, + "time_per_iteration": 2.578045129776001 + }, + { + "auxiliary_loss_clip": 0.0114549, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.05037856, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.6491924635250923, + "language_loss": 0.78632712, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80822217, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 3373, + "time_per_iteration": 2.5762507915496826 + }, + { + "auxiliary_loss_clip": 0.01137806, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.04629672, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.8751465027713456, + "language_loss": 0.71102971, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73280156, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3374, + "time_per_iteration": 2.6212260723114014 + }, + { + "auxiliary_loss_clip": 0.01048807, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00001132, + "balance_loss_mlp": 1.01811993, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9912238676598704, + "language_loss": 0.62450445, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64501071, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.30859375, + "step": 3375, + "time_per_iteration": 3.0768048763275146 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.02722621, + "balance_loss_mlp": 1.04769731, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.6669967725054042, + "language_loss": 0.82450807, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84635985, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3376, + "time_per_iteration": 2.5632758140563965 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.02110839, + "balance_loss_mlp": 1.04692364, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.2640770034372006, + "language_loss": 0.81587797, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83771032, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3377, + "time_per_iteration": 2.6376402378082275 + }, + { + "auxiliary_loss_clip": 0.01139097, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.02786779, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 4.046949512949318, + "language_loss": 0.769104, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79095268, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3378, + "time_per_iteration": 2.532942056655884 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.02493691, + "balance_loss_mlp": 1.04772687, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9870266088444717, + "language_loss": 0.79710048, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81896979, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3379, + "time_per_iteration": 2.5187509059906006 + }, + { + "auxiliary_loss_clip": 0.01137083, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.02162337, + "balance_loss_mlp": 1.04698288, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7003196517483214, + "language_loss": 0.86949915, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.89125347, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3380, + "time_per_iteration": 2.5350420475006104 + }, + { + "auxiliary_loss_clip": 0.01143485, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.02905154, + "balance_loss_mlp": 1.05103135, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 1.9133898096862498, + "language_loss": 0.74515057, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76705158, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3381, + "time_per_iteration": 2.5428466796875 + }, + { + "auxiliary_loss_clip": 0.01143807, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.04754519, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.741042372938858, + "language_loss": 0.79304886, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81492472, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3382, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.01146625, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.02123427, + "balance_loss_mlp": 1.04849267, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.8514394244027284, + "language_loss": 0.80188596, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82376015, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3383, + "time_per_iteration": 2.5047500133514404 + }, + { + "auxiliary_loss_clip": 0.0113964, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02401257, + "balance_loss_mlp": 1.04616201, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 6.482166974991387, + "language_loss": 0.74195492, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76377177, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3384, + "time_per_iteration": 2.4931931495666504 + }, + { + "auxiliary_loss_clip": 0.01147866, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.04929996, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.292912234818254, + "language_loss": 0.76429737, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78621089, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3385, + "time_per_iteration": 3.9999845027923584 + }, + { + "auxiliary_loss_clip": 0.01139546, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.04031098, + "balance_loss_mlp": 1.04538202, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8347755395186154, + "language_loss": 0.68259251, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70457751, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3386, + "time_per_iteration": 2.525538682937622 + }, + { + "auxiliary_loss_clip": 0.01143921, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.0348835, + "balance_loss_mlp": 1.04785144, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.949323793812955, + "language_loss": 0.81000078, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83198166, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9609375, + "step": 3387, + "time_per_iteration": 4.122355222702026 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01040694, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.04754424, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.869822824167972, + "language_loss": 0.79960001, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82138139, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 3388, + "time_per_iteration": 2.498455047607422 + }, + { + "auxiliary_loss_clip": 0.01142619, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02120411, + "balance_loss_mlp": 1.04757476, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.6489636222716584, + "language_loss": 0.71810246, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.73992884, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.94921875, + "step": 3389, + "time_per_iteration": 2.4751241207122803 + }, + { + "auxiliary_loss_clip": 0.01140457, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.03108239, + "balance_loss_mlp": 1.04812241, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.7476252287205662, + "language_loss": 0.87431413, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89620328, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3390, + "time_per_iteration": 2.5229172706604004 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02673888, + "balance_loss_mlp": 1.04638386, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 3.0399462437196743, + "language_loss": 0.71092427, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73275584, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.92578125, + "step": 3391, + "time_per_iteration": 2.528003454208374 + }, + { + "auxiliary_loss_clip": 0.01137362, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02592552, + "balance_loss_mlp": 1.04483938, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.517550673127581, + "language_loss": 0.85993969, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88174999, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3392, + "time_per_iteration": 2.5080008506774902 + }, + { + "auxiliary_loss_clip": 0.01143294, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.03231716, + "balance_loss_mlp": 1.04759896, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.5067582134175779, + "language_loss": 0.80730146, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82923234, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.95703125, + "step": 3393, + "time_per_iteration": 2.5464906692504883 + }, + { + "auxiliary_loss_clip": 0.01139211, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02236915, + "balance_loss_mlp": 1.0471251, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.882536464234473, + "language_loss": 0.86276352, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88454658, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3394, + "time_per_iteration": 2.495544195175171 + }, + { + "auxiliary_loss_clip": 0.01139364, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02640033, + "balance_loss_mlp": 1.04756498, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.9880936155816324, + "language_loss": 0.83455038, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85637033, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3395, + "time_per_iteration": 2.4636099338531494 + }, + { + "auxiliary_loss_clip": 0.01144564, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.01753616, + "balance_loss_mlp": 1.04799199, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.0105247570422877, + "language_loss": 0.83632553, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85812247, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3396, + "time_per_iteration": 2.507140636444092 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.02470088, + "balance_loss_mlp": 1.04775488, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9261630392212734, + "language_loss": 0.77139032, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79321325, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91796875, + "step": 3397, + "time_per_iteration": 2.5000061988830566 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.027771, + "balance_loss_mlp": 1.0482713, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.6022565941655285, + "language_loss": 0.87048233, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89232147, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3398, + "time_per_iteration": 2.4879262447357178 + }, + { + "auxiliary_loss_clip": 0.01146457, + "auxiliary_loss_mlp": 0.01045529, + "balance_loss_clip": 1.02855396, + "balance_loss_mlp": 1.05200124, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.0446998950436273, + "language_loss": 0.77973163, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8016516, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3399, + "time_per_iteration": 2.4417104721069336 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.02180338, + "balance_loss_mlp": 1.0471437, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 1.9372936252349278, + "language_loss": 0.76201475, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78383702, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.953125, + "step": 3400, + "time_per_iteration": 2.513378858566284 + }, + { + "auxiliary_loss_clip": 0.01140509, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.02709138, + "balance_loss_mlp": 1.04937315, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6590163779918286, + "language_loss": 0.79357922, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81542361, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 3401, + "time_per_iteration": 2.5628185272216797 + }, + { + "auxiliary_loss_clip": 0.01141107, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.04659653, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.682072453203677, + "language_loss": 0.69205511, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71388066, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3402, + "time_per_iteration": 2.653932571411133 + }, + { + "auxiliary_loss_clip": 0.01144935, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.05008948, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.6906490082479086, + "language_loss": 0.81077826, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83266115, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3403, + "time_per_iteration": 2.518402099609375 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.02873933, + "balance_loss_mlp": 1.05067933, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.7308307985558895, + "language_loss": 0.83497006, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85688084, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3404, + "time_per_iteration": 2.5041427612304688 + }, + { + "auxiliary_loss_clip": 0.0114107, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.03018808, + "balance_loss_mlp": 1.04686713, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.717424757849508, + "language_loss": 0.86319768, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88507974, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3405, + "time_per_iteration": 2.5019404888153076 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02558827, + "balance_loss_mlp": 1.04664326, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.0734152439752327, + "language_loss": 0.84731919, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86912251, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3406, + "time_per_iteration": 2.508274793624878 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02601814, + "balance_loss_mlp": 1.04885817, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0579137112366332, + "language_loss": 0.68086451, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70268458, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3407, + "time_per_iteration": 2.4675915241241455 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.03039861, + "balance_loss_mlp": 1.0469842, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.4520435823789857, + "language_loss": 0.84025276, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86210054, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3408, + "time_per_iteration": 2.4996185302734375 + }, + { + "auxiliary_loss_clip": 0.01144748, + "auxiliary_loss_mlp": 0.01045034, + "balance_loss_clip": 1.02851176, + "balance_loss_mlp": 1.05156052, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.726731275915995, + "language_loss": 0.64288676, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66478455, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3409, + "time_per_iteration": 2.469758987426758 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.0295676, + "balance_loss_mlp": 1.04638147, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.319045584705984, + "language_loss": 0.80357087, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82542145, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3410, + "time_per_iteration": 2.5167293548583984 + }, + { + "auxiliary_loss_clip": 0.01140553, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05014896, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.259997857874164, + "language_loss": 0.75796056, + "learning_rate": 3.686971778678803e-06, + "loss": 0.7798292, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3411, + "time_per_iteration": 2.5411264896392822 + }, + { + "auxiliary_loss_clip": 0.01144909, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.02817273, + "balance_loss_mlp": 1.05220985, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.0004173274373183, + "language_loss": 0.73696554, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75885755, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3412, + "time_per_iteration": 2.5047144889831543 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.03015614, + "balance_loss_mlp": 1.04735541, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.0925027501904228, + "language_loss": 0.77863461, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.96484375, + "step": 3413, + "time_per_iteration": 2.5472991466522217 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.04989886, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.912987525537943, + "language_loss": 0.84719825, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86901337, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3414, + "time_per_iteration": 2.478729724884033 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.02496636, + "balance_loss_mlp": 1.04659235, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.9076108002018353, + "language_loss": 0.80448711, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82628626, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3415, + "time_per_iteration": 2.5366415977478027 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02154934, + "balance_loss_mlp": 1.04796863, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.7629792917286327, + "language_loss": 0.72893143, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75068092, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3416, + "time_per_iteration": 2.5656492710113525 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02520072, + "balance_loss_mlp": 1.04695165, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.5523210605949425, + "language_loss": 0.78623438, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80805844, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3417, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01140114, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.02948236, + "balance_loss_mlp": 1.04842472, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.178207343470702, + "language_loss": 0.87390542, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89577365, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.91796875, + "step": 3418, + "time_per_iteration": 2.4900615215301514 + }, + { + "auxiliary_loss_clip": 0.01139839, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.0251534, + "balance_loss_mlp": 1.04798996, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.115759049165993, + "language_loss": 0.62156075, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64337492, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3419, + "time_per_iteration": 2.527057647705078 + }, + { + "auxiliary_loss_clip": 0.01143982, + "auxiliary_loss_mlp": 0.0104893, + "balance_loss_clip": 1.02977359, + "balance_loss_mlp": 1.04905963, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.2865688080492466, + "language_loss": 0.86502206, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88695121, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3420, + "time_per_iteration": 2.532512664794922 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02533531, + "balance_loss_mlp": 1.04659796, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.535685660701584, + "language_loss": 0.70904821, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73084807, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91796875, + "step": 3421, + "time_per_iteration": 2.5924150943756104 + }, + { + "auxiliary_loss_clip": 0.0113664, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.02099967, + "balance_loss_mlp": 1.04581738, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 3.5707952740494235, + "language_loss": 0.70370102, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72545266, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3422, + "time_per_iteration": 2.499041795730591 + }, + { + "auxiliary_loss_clip": 0.01060302, + "auxiliary_loss_mlp": 0.01012319, + "balance_loss_clip": 1.01001859, + "balance_loss_mlp": 1.02983248, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7605512778953217, + "language_loss": 0.55499864, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57572484, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3046875, + "step": 3423, + "time_per_iteration": 3.1569108963012695 + }, + { + "auxiliary_loss_clip": 0.0114215, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.04882169, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7754304652232902, + "language_loss": 0.71701574, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73886526, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9296875, + "step": 3424, + "time_per_iteration": 2.58278751373291 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.02920699, + "balance_loss_mlp": 1.05022514, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.563470220797352, + "language_loss": 0.75031066, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77218151, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3425, + "time_per_iteration": 2.518050193786621 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01057037, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.0545603, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.8081006382856646, + "language_loss": 0.88246548, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90449566, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3426, + "time_per_iteration": 2.5141823291778564 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.02927566, + "balance_loss_mlp": 1.04961991, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.8273097367093476, + "language_loss": 0.76748925, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78934193, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3427, + "time_per_iteration": 4.068110227584839 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.03143609, + "balance_loss_mlp": 1.04978716, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.6956079848027177, + "language_loss": 0.73914266, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76106334, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3428, + "time_per_iteration": 2.5296199321746826 + }, + { + "auxiliary_loss_clip": 0.0113987, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.03188777, + "balance_loss_mlp": 1.04691577, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 3.779292361126499, + "language_loss": 0.73553443, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75743121, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3429, + "time_per_iteration": 3.979640483856201 + }, + { + "auxiliary_loss_clip": 0.01146724, + "auxiliary_loss_mlp": 0.01041423, + "balance_loss_clip": 1.0242331, + "balance_loss_mlp": 1.05180049, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.8474903397728304, + "language_loss": 0.85301876, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87490022, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3430, + "time_per_iteration": 2.532275438308716 + }, + { + "auxiliary_loss_clip": 0.0114587, + "auxiliary_loss_mlp": 0.01052093, + "balance_loss_clip": 1.03411579, + "balance_loss_mlp": 1.05116892, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.4715876867440674, + "language_loss": 0.69369543, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.715675, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3431, + "time_per_iteration": 2.4857282638549805 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.01015472, + "balance_loss_clip": 1.01329005, + "balance_loss_mlp": 1.02078724, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8322663536180677, + "language_loss": 0.60249984, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62317169, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.30859375, + "step": 3432, + "time_per_iteration": 3.250966787338257 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.03021789, + "balance_loss_mlp": 1.05125713, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7869258470827205, + "language_loss": 0.72495091, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74685854, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3433, + "time_per_iteration": 2.528576135635376 + }, + { + "auxiliary_loss_clip": 0.01143793, + "auxiliary_loss_mlp": 0.01050396, + "balance_loss_clip": 1.03295541, + "balance_loss_mlp": 1.04886997, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.715054190412472, + "language_loss": 0.8721565, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8940984, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3434, + "time_per_iteration": 2.507589101791382 + }, + { + "auxiliary_loss_clip": 0.01144514, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_clip": 1.0269376, + "balance_loss_mlp": 1.04833162, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.6274854163318595, + "language_loss": 0.69133317, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71321636, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3435, + "time_per_iteration": 2.587930679321289 + }, + { + "auxiliary_loss_clip": 0.01140929, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.0241158, + "balance_loss_mlp": 1.04983366, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7028603597643168, + "language_loss": 0.8922776, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91410363, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3436, + "time_per_iteration": 2.57295298576355 + }, + { + "auxiliary_loss_clip": 0.01144451, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.05126333, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.8990861512322268, + "language_loss": 0.76659, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78839004, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3437, + "time_per_iteration": 2.5819849967956543 + }, + { + "auxiliary_loss_clip": 0.01142266, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.04877901, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.7925672188665596, + "language_loss": 0.77611911, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79794395, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3438, + "time_per_iteration": 2.5091731548309326 + }, + { + "auxiliary_loss_clip": 0.01047915, + "auxiliary_loss_mlp": 0.01005377, + "balance_loss_clip": 1.00348175, + "balance_loss_mlp": 1.01723933, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8367234589951487, + "language_loss": 0.67141807, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69195092, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30664062, + "step": 3439, + "time_per_iteration": 3.0797181129455566 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.04791629, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.0580501207842428, + "language_loss": 0.83931267, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86111259, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94921875, + "step": 3440, + "time_per_iteration": 2.5015172958374023 + }, + { + "auxiliary_loss_clip": 0.01143016, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_clip": 1.02584338, + "balance_loss_mlp": 1.05009377, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.9416657792651912, + "language_loss": 0.84825736, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87010437, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3441, + "time_per_iteration": 2.4866137504577637 + }, + { + "auxiliary_loss_clip": 0.01140001, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.02778697, + "balance_loss_mlp": 1.0502038, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.6577892844013908, + "language_loss": 0.85889506, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88074249, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 3442, + "time_per_iteration": 2.5914649963378906 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.02305317, + "balance_loss_mlp": 1.05208063, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.9070439101703558, + "language_loss": 0.72829354, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75015128, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3443, + "time_per_iteration": 2.5210063457489014 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.02879703, + "balance_loss_mlp": 1.0496819, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.5056876708900186, + "language_loss": 0.85428166, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87612224, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.90625, + "step": 3444, + "time_per_iteration": 2.528493881225586 + }, + { + "auxiliary_loss_clip": 0.01047325, + "auxiliary_loss_mlp": 0.0100746, + "balance_loss_clip": 1.00537384, + "balance_loss_mlp": 1.01688242, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6978715278146553, + "language_loss": 0.57091653, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.5914644, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.3046875, + "step": 3445, + "time_per_iteration": 3.086552619934082 + }, + { + "auxiliary_loss_clip": 0.01140085, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03356516, + "balance_loss_mlp": 1.04968095, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5621496076246746, + "language_loss": 0.78459281, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80650306, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 3446, + "time_per_iteration": 2.4844422340393066 + }, + { + "auxiliary_loss_clip": 0.01148285, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02846456, + "balance_loss_mlp": 1.05057228, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.157476270385918, + "language_loss": 0.62436825, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64633256, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3447, + "time_per_iteration": 2.592799663543701 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.03071666, + "balance_loss_mlp": 1.04810297, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.740614876967074, + "language_loss": 0.86066437, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88256097, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3448, + "time_per_iteration": 2.5054237842559814 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.02576649, + "balance_loss_mlp": 1.04814398, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.1117492515519665, + "language_loss": 0.75452864, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77637869, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.92578125, + "step": 3449, + "time_per_iteration": 2.506657838821411 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.03194678, + "balance_loss_mlp": 1.04896426, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.7877143934577313, + "language_loss": 0.76703656, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78899819, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3450, + "time_per_iteration": 2.479090929031372 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.0302192, + "balance_loss_mlp": 1.04780531, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5227053471466307, + "language_loss": 0.822101, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84401715, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3451, + "time_per_iteration": 2.5465826988220215 + }, + { + "auxiliary_loss_clip": 0.01047156, + "auxiliary_loss_mlp": 0.01003865, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.01645589, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7930757504147553, + "language_loss": 0.56569821, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58620846, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3452, + "time_per_iteration": 2.979168653488159 + }, + { + "auxiliary_loss_clip": 0.01144097, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.02765203, + "balance_loss_mlp": 1.0492605, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.970927529953097, + "language_loss": 0.88332593, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90522313, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3453, + "time_per_iteration": 2.5404746532440186 + }, + { + "auxiliary_loss_clip": 0.01145334, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.05121803, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.6193396769615114, + "language_loss": 0.80056196, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82244939, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94140625, + "step": 3454, + "time_per_iteration": 2.536154270172119 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04881716, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.767477329453147, + "language_loss": 0.76424366, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78615135, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3455, + "time_per_iteration": 2.502450466156006 + }, + { + "auxiliary_loss_clip": 0.01141184, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.03247654, + "balance_loss_mlp": 1.04867601, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 2.1876724852466163, + "language_loss": 0.80599815, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82790661, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3456, + "time_per_iteration": 2.495405673980713 + }, + { + "auxiliary_loss_clip": 0.01147485, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_clip": 1.02447069, + "balance_loss_mlp": 1.05180097, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5859267830694757, + "language_loss": 0.77988815, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80179226, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.95703125, + "step": 3457, + "time_per_iteration": 2.5625829696655273 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01054167, + "balance_loss_clip": 1.03461635, + "balance_loss_mlp": 1.05195451, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 2.0073788397072136, + "language_loss": 0.83581042, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85784483, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.96875, + "step": 3458, + "time_per_iteration": 2.470740556716919 + }, + { + "auxiliary_loss_clip": 0.01142717, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02443111, + "balance_loss_mlp": 1.05063045, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.732611194718632, + "language_loss": 0.76041365, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78225368, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3459, + "time_per_iteration": 2.5753207206726074 + }, + { + "auxiliary_loss_clip": 0.01138446, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.02451003, + "balance_loss_mlp": 1.04829502, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.1264218253084386, + "language_loss": 0.77302521, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79482168, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3460, + "time_per_iteration": 2.498760938644409 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01050405, + "balance_loss_clip": 1.03284574, + "balance_loss_mlp": 1.04819179, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.1644839576228296, + "language_loss": 0.75785947, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77979982, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3461, + "time_per_iteration": 2.5850372314453125 + }, + { + "auxiliary_loss_clip": 0.01145604, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02645624, + "balance_loss_mlp": 1.0469749, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8484421465162717, + "language_loss": 0.88227051, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90417254, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3462, + "time_per_iteration": 2.558375358581543 + }, + { + "auxiliary_loss_clip": 0.01043601, + "auxiliary_loss_mlp": 0.0101247, + "balance_loss_clip": 1.01059818, + "balance_loss_mlp": 1.01278758, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7627714646141646, + "language_loss": 0.59057152, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6111322, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.30859375, + "step": 3463, + "time_per_iteration": 3.2280492782592773 + }, + { + "auxiliary_loss_clip": 0.01144566, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.03056765, + "balance_loss_mlp": 1.04713821, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.542529703880477, + "language_loss": 0.65831709, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68025607, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3464, + "time_per_iteration": 2.5706918239593506 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01050812, + "balance_loss_clip": 1.03160763, + "balance_loss_mlp": 1.0492928, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.848617339554035, + "language_loss": 0.83536243, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85734928, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3465, + "time_per_iteration": 2.535473585128784 + }, + { + "auxiliary_loss_clip": 0.01143191, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.02767932, + "balance_loss_mlp": 1.04802513, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 3.628659863163492, + "language_loss": 0.81463158, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.83651215, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3466, + "time_per_iteration": 2.535311222076416 + }, + { + "auxiliary_loss_clip": 0.01146517, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.02863586, + "balance_loss_mlp": 1.05303347, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.967186340276973, + "language_loss": 0.81678396, + "learning_rate": 3.675156514448716e-06, + "loss": 0.83869636, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9375, + "step": 3467, + "time_per_iteration": 2.4783830642700195 + }, + { + "auxiliary_loss_clip": 0.01142574, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02469158, + "balance_loss_mlp": 1.05200005, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.0682841758185235, + "language_loss": 0.8186093, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84045184, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3468, + "time_per_iteration": 2.5275001525878906 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.02677095, + "balance_loss_mlp": 1.05024171, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.9832892060266627, + "language_loss": 0.90227246, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92421412, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9765625, + "step": 3469, + "time_per_iteration": 3.999607563018799 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.03329682, + "balance_loss_mlp": 1.0530771, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.764094275638393, + "language_loss": 0.7643016, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78628922, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3470, + "time_per_iteration": 4.186570405960083 + }, + { + "auxiliary_loss_clip": 0.0114555, + "auxiliary_loss_mlp": 0.01048445, + "balance_loss_clip": 1.03039646, + "balance_loss_mlp": 1.05154145, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.7254586081909284, + "language_loss": 0.7592454, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78118539, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3471, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03488564, + "balance_loss_mlp": 1.04796982, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.907022336492936, + "language_loss": 0.75515926, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77719313, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3472, + "time_per_iteration": 2.555927038192749 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_clip": 1.02790928, + "balance_loss_mlp": 1.05051231, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.9877478939715982, + "language_loss": 0.84168947, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86358976, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3473, + "time_per_iteration": 2.5261759757995605 + }, + { + "auxiliary_loss_clip": 0.01043725, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00027776, + "balance_loss_mlp": 1.01290703, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8792852781400284, + "language_loss": 0.63631999, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65678006, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30859375, + "step": 3474, + "time_per_iteration": 3.025831460952759 + }, + { + "auxiliary_loss_clip": 0.01146356, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.02334285, + "balance_loss_mlp": 1.04993105, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.882119897934913, + "language_loss": 0.69867098, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72054696, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3475, + "time_per_iteration": 2.751676559448242 + }, + { + "auxiliary_loss_clip": 0.01146508, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02961278, + "balance_loss_mlp": 1.05162299, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4951270147360183, + "language_loss": 0.70032048, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72226411, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3476, + "time_per_iteration": 2.5493083000183105 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.02850533, + "balance_loss_mlp": 1.05099094, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0857679152031716, + "language_loss": 0.89590299, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91780925, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3477, + "time_per_iteration": 2.506962537765503 + }, + { + "auxiliary_loss_clip": 0.01142894, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.03005815, + "balance_loss_mlp": 1.04896593, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 4.245750786990739, + "language_loss": 0.67988396, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70179135, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9375, + "step": 3478, + "time_per_iteration": 2.57366681098938 + }, + { + "auxiliary_loss_clip": 0.01143008, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.02334023, + "balance_loss_mlp": 1.04826832, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.137628491911851, + "language_loss": 0.85035646, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87220371, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94921875, + "step": 3479, + "time_per_iteration": 2.4716267585754395 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.03156328, + "balance_loss_mlp": 1.04972577, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.2225866030569175, + "language_loss": 0.73807257, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76003599, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3480, + "time_per_iteration": 2.4856386184692383 + }, + { + "auxiliary_loss_clip": 0.01141126, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_clip": 1.03113592, + "balance_loss_mlp": 1.04844785, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.023418551380918, + "language_loss": 0.75601453, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77789831, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3481, + "time_per_iteration": 2.4812443256378174 + }, + { + "auxiliary_loss_clip": 0.01145872, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.03051996, + "balance_loss_mlp": 1.05047393, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 3.5251666716598273, + "language_loss": 0.85337639, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87531281, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3482, + "time_per_iteration": 2.521284580230713 + }, + { + "auxiliary_loss_clip": 0.01145664, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_clip": 1.02940559, + "balance_loss_mlp": 1.05097377, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.8936854891166743, + "language_loss": 0.70626152, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3483, + "time_per_iteration": 2.5876524448394775 + }, + { + "auxiliary_loss_clip": 0.01146142, + "auxiliary_loss_mlp": 0.01060474, + "balance_loss_clip": 1.04193723, + "balance_loss_mlp": 1.04891169, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8606830424584557, + "language_loss": 0.74988431, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77195048, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3484, + "time_per_iteration": 2.49701189994812 + }, + { + "auxiliary_loss_clip": 0.01143763, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.02991378, + "balance_loss_mlp": 1.05028141, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8378150509428508, + "language_loss": 0.70690203, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7288202, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3485, + "time_per_iteration": 2.5692059993743896 + }, + { + "auxiliary_loss_clip": 0.01146857, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.02712297, + "balance_loss_mlp": 1.05028093, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.9069158447471781, + "language_loss": 0.82965356, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85157764, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3486, + "time_per_iteration": 2.569308042526245 + }, + { + "auxiliary_loss_clip": 0.0114472, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.03309095, + "balance_loss_mlp": 1.04790449, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 3.843984040964354, + "language_loss": 0.8699702, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89192313, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3487, + "time_per_iteration": 2.608441114425659 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.0287739, + "balance_loss_mlp": 1.04695904, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.4377115915778713, + "language_loss": 0.72369969, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74558127, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94140625, + "step": 3488, + "time_per_iteration": 2.529233694076538 + }, + { + "auxiliary_loss_clip": 0.01144055, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.02812946, + "balance_loss_mlp": 1.04897618, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.6657941113460764, + "language_loss": 0.80726898, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82916641, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3489, + "time_per_iteration": 2.4847962856292725 + }, + { + "auxiliary_loss_clip": 0.01142088, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.0253495, + "balance_loss_mlp": 1.04718399, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7888402521564877, + "language_loss": 0.72827011, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75011659, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3490, + "time_per_iteration": 2.543064594268799 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.03437209, + "balance_loss_mlp": 1.04955435, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 5.073894522138561, + "language_loss": 0.70159817, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72350967, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3491, + "time_per_iteration": 2.4785172939300537 + }, + { + "auxiliary_loss_clip": 0.01142629, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.04678369, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 3.7459720995568557, + "language_loss": 0.7931999, + "learning_rate": 3.669817442854444e-06, + "loss": 0.8150776, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3492, + "time_per_iteration": 2.5213027000427246 + }, + { + "auxiliary_loss_clip": 0.01144565, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.04977345, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9629392465329358, + "language_loss": 0.86883962, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89069605, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3493, + "time_per_iteration": 2.499797821044922 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.04791212, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.8525794886403055, + "language_loss": 0.68810928, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70991009, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3494, + "time_per_iteration": 2.5374889373779297 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.02054656, + "balance_loss_mlp": 1.05010796, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7465496854212388, + "language_loss": 0.78900456, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81085044, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96484375, + "step": 3495, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02505755, + "balance_loss_mlp": 1.04696178, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7459726457298623, + "language_loss": 0.77192879, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79377842, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3496, + "time_per_iteration": 2.552386522293091 + }, + { + "auxiliary_loss_clip": 0.01145605, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.03297126, + "balance_loss_mlp": 1.04933989, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.0396086665216777, + "language_loss": 0.82009852, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84206975, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3497, + "time_per_iteration": 2.498359441757202 + }, + { + "auxiliary_loss_clip": 0.01146873, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_clip": 1.03091133, + "balance_loss_mlp": 1.04979134, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 2.5223195218779577, + "language_loss": 0.67314029, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69509119, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96875, + "step": 3498, + "time_per_iteration": 2.540766716003418 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.02679563, + "balance_loss_mlp": 1.04782224, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 2.2477271783909414, + "language_loss": 0.80623376, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82813752, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 3499, + "time_per_iteration": 2.5283098220825195 + }, + { + "auxiliary_loss_clip": 0.0114621, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.02612233, + "balance_loss_mlp": 1.05201602, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.776862664007905, + "language_loss": 0.78366566, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80555797, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3500, + "time_per_iteration": 2.5419158935546875 + }, + { + "auxiliary_loss_clip": 0.01142389, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02524185, + "balance_loss_mlp": 1.0480907, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.628727093990466, + "language_loss": 0.73989725, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76174867, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3501, + "time_per_iteration": 2.535419464111328 + }, + { + "auxiliary_loss_clip": 0.01140428, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02429342, + "balance_loss_mlp": 1.04671168, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6206913905571714, + "language_loss": 0.75292969, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77475226, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3502, + "time_per_iteration": 2.508277654647827 + }, + { + "auxiliary_loss_clip": 0.01141546, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.02796102, + "balance_loss_mlp": 1.0475595, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.9145063235338367, + "language_loss": 0.77090263, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7927739, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.94140625, + "step": 3503, + "time_per_iteration": 2.5607948303222656 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.02813029, + "balance_loss_mlp": 1.048738, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.3817148130730144, + "language_loss": 0.77991742, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80189341, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.9921875, + "step": 3504, + "time_per_iteration": 2.495028018951416 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.02952361, + "balance_loss_mlp": 1.0473187, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5529728217373517, + "language_loss": 0.77045631, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79238534, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 3505, + "time_per_iteration": 2.5408663749694824 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.02931666, + "balance_loss_mlp": 1.04786968, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9911708078552777, + "language_loss": 0.63704473, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65889871, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91015625, + "step": 3506, + "time_per_iteration": 2.564246892929077 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.02904439, + "balance_loss_mlp": 1.04773796, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.8633964271687153, + "language_loss": 0.81863034, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84050006, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3507, + "time_per_iteration": 2.6049435138702393 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.0299232, + "balance_loss_mlp": 1.04645514, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 2.0263301336255135, + "language_loss": 0.75496012, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77683949, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.93359375, + "step": 3508, + "time_per_iteration": 2.5366437435150146 + }, + { + "auxiliary_loss_clip": 0.01144539, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02395463, + "balance_loss_mlp": 1.04809749, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.1922875924351115, + "language_loss": 0.85395098, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87581778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3509, + "time_per_iteration": 2.4895167350769043 + }, + { + "auxiliary_loss_clip": 0.01146568, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.02503562, + "balance_loss_mlp": 1.04908204, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.5522473876542349, + "language_loss": 0.67803288, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69993746, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3510, + "time_per_iteration": 4.065294027328491 + }, + { + "auxiliary_loss_clip": 0.01143018, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02381003, + "balance_loss_mlp": 1.04653811, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.9784941086490475, + "language_loss": 0.7240749, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74591982, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96484375, + "step": 3511, + "time_per_iteration": 2.5701003074645996 + }, + { + "auxiliary_loss_clip": 0.01148402, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.05022192, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.3544542512902322, + "language_loss": 0.69737375, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71925306, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3512, + "time_per_iteration": 3.9019229412078857 + }, + { + "auxiliary_loss_clip": 0.01143526, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.04680824, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.6756724017558497, + "language_loss": 0.73159289, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.7535044, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.96875, + "step": 3513, + "time_per_iteration": 2.5643980503082275 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.02212906, + "balance_loss_mlp": 1.04916954, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.635076517146385, + "language_loss": 0.74235332, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76414299, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3514, + "time_per_iteration": 2.5240070819854736 + }, + { + "auxiliary_loss_clip": 0.01144119, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.02347541, + "balance_loss_mlp": 1.0482856, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7928371848293583, + "language_loss": 0.76707381, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78892195, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3515, + "time_per_iteration": 2.526527166366577 + }, + { + "auxiliary_loss_clip": 0.0114362, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.02517664, + "balance_loss_mlp": 1.04956555, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.8516547188762509, + "language_loss": 0.68242604, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70428967, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3516, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.01145197, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.02883935, + "balance_loss_mlp": 1.04901481, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.653683865815189, + "language_loss": 0.85012519, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87204921, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3517, + "time_per_iteration": 2.5080301761627197 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01965201, + "balance_loss_mlp": 1.04722667, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.26725319642869, + "language_loss": 0.62925792, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65104288, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3518, + "time_per_iteration": 2.5949900150299072 + }, + { + "auxiliary_loss_clip": 0.01142565, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04891765, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8284325952385483, + "language_loss": 0.88772321, + "learning_rate": 3.664006799041303e-06, + "loss": 0.90964293, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3519, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.03184235, + "balance_loss_mlp": 1.04866135, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.5988506078375424, + "language_loss": 0.81066215, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83259952, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3520, + "time_per_iteration": 2.5069239139556885 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02926779, + "balance_loss_mlp": 1.0469681, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.592359744312873, + "language_loss": 0.76163614, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78347969, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3521, + "time_per_iteration": 2.560037851333618 + }, + { + "auxiliary_loss_clip": 0.0113934, + "auxiliary_loss_mlp": 0.0104393, + "balance_loss_clip": 1.02842069, + "balance_loss_mlp": 1.04592443, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0717596449561024, + "language_loss": 0.75950933, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78134197, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.93359375, + "step": 3522, + "time_per_iteration": 2.4758715629577637 + }, + { + "auxiliary_loss_clip": 0.01141462, + "auxiliary_loss_mlp": 0.01049727, + "balance_loss_clip": 1.03176177, + "balance_loss_mlp": 1.04737353, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.026497436525855, + "language_loss": 0.70436251, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72627443, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3523, + "time_per_iteration": 2.5368640422821045 + }, + { + "auxiliary_loss_clip": 0.01140964, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.02786803, + "balance_loss_mlp": 1.04820895, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.216886450348082, + "language_loss": 0.76683456, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.7886939, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3524, + "time_per_iteration": 2.5932695865631104 + }, + { + "auxiliary_loss_clip": 0.01139634, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.04276347, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.020092904399728, + "language_loss": 0.81433582, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83615232, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3525, + "time_per_iteration": 2.5425641536712646 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04668331, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 2.1031950889850655, + "language_loss": 0.75104785, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77285308, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3526, + "time_per_iteration": 2.533210515975952 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.02466083, + "balance_loss_mlp": 1.04663801, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.9135764326712537, + "language_loss": 0.77385598, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79569542, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3527, + "time_per_iteration": 2.53898286819458 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.02797842, + "balance_loss_mlp": 1.0461328, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.981008674330079, + "language_loss": 0.78037727, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80223083, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3528, + "time_per_iteration": 2.5360231399536133 + }, + { + "auxiliary_loss_clip": 0.01138776, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.04611731, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7275367809487383, + "language_loss": 0.8170321, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83889693, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3529, + "time_per_iteration": 2.531228542327881 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.028234, + "balance_loss_mlp": 1.04647708, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.1603106904513547, + "language_loss": 0.76616383, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78802443, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3530, + "time_per_iteration": 2.5361740589141846 + }, + { + "auxiliary_loss_clip": 0.01136983, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.02593338, + "balance_loss_mlp": 1.0451746, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.3391242970409873, + "language_loss": 0.82978404, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85157299, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3531, + "time_per_iteration": 2.571411609649658 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04744506, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.416019676502894, + "language_loss": 0.73473567, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75654608, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.94140625, + "step": 3532, + "time_per_iteration": 2.473006248474121 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02485681, + "balance_loss_mlp": 1.04561734, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.7353898898315339, + "language_loss": 0.73855233, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76036394, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.92578125, + "step": 3533, + "time_per_iteration": 2.526780366897583 + }, + { + "auxiliary_loss_clip": 0.01140469, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.02833724, + "balance_loss_mlp": 1.04576015, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.8944995629732337, + "language_loss": 0.7098999, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73175949, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3534, + "time_per_iteration": 2.6947309970855713 + }, + { + "auxiliary_loss_clip": 0.01141409, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.02032161, + "balance_loss_mlp": 1.04669714, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9387778569542722, + "language_loss": 0.71567297, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73746949, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3535, + "time_per_iteration": 2.6022329330444336 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.0299238, + "balance_loss_mlp": 1.04549336, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 1.8756666540330442, + "language_loss": 0.7040931, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72592747, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 3536, + "time_per_iteration": 2.6005256175994873 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.02706444, + "balance_loss_mlp": 1.04512393, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9573194210103453, + "language_loss": 0.88217437, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90402472, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3537, + "time_per_iteration": 2.5565810203552246 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02425885, + "balance_loss_mlp": 1.0437026, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.957058885696691, + "language_loss": 0.80129743, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82304639, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3538, + "time_per_iteration": 2.5501785278320312 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.025653, + "balance_loss_mlp": 1.0446775, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.587715235485788, + "language_loss": 0.87131894, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89308405, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.90625, + "step": 3539, + "time_per_iteration": 2.5751259326934814 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02914476, + "balance_loss_mlp": 1.04718518, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.3040839486156184, + "language_loss": 0.57464051, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59648788, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3540, + "time_per_iteration": 2.4746458530426025 + }, + { + "auxiliary_loss_clip": 0.01140156, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.02330637, + "balance_loss_mlp": 1.04658604, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 5.8376417218282874, + "language_loss": 0.76062799, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78243208, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3541, + "time_per_iteration": 2.5111818313598633 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02517211, + "balance_loss_mlp": 1.04530454, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.9190227230034667, + "language_loss": 0.69458514, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71635908, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3542, + "time_per_iteration": 2.556300401687622 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01945567, + "balance_loss_mlp": 1.04443789, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.8172219669397587, + "language_loss": 0.75591409, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77760351, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 3543, + "time_per_iteration": 2.54424786567688 + }, + { + "auxiliary_loss_clip": 0.01138428, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04843175, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 2.1531603349332915, + "language_loss": 0.66787028, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68964195, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3544, + "time_per_iteration": 2.516359329223633 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.0245831, + "balance_loss_mlp": 1.04379654, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.9827170900636153, + "language_loss": 0.71089172, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73265821, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.92578125, + "step": 3545, + "time_per_iteration": 2.5377357006073 + }, + { + "auxiliary_loss_clip": 0.01138848, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_clip": 1.03095567, + "balance_loss_mlp": 1.04571509, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.730364240275379, + "language_loss": 0.72334421, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74519908, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9296875, + "step": 3546, + "time_per_iteration": 2.5640652179718018 + }, + { + "auxiliary_loss_clip": 0.0113929, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02695596, + "balance_loss_mlp": 1.0467453, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.889324350950523, + "language_loss": 0.80698627, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82881093, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3547, + "time_per_iteration": 2.527398109436035 + }, + { + "auxiliary_loss_clip": 0.01140759, + "auxiliary_loss_mlp": 0.0104395, + "balance_loss_clip": 1.02702212, + "balance_loss_mlp": 1.04538703, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 3.232228952830713, + "language_loss": 0.74496448, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76681155, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3548, + "time_per_iteration": 2.5493834018707275 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.02719641, + "balance_loss_mlp": 1.04663396, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.0441969792992265, + "language_loss": 0.74135804, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76323086, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3549, + "time_per_iteration": 2.514817476272583 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.03063631, + "balance_loss_mlp": 1.04963064, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6981522694050752, + "language_loss": 0.80653727, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82842982, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3550, + "time_per_iteration": 2.541501045227051 + }, + { + "auxiliary_loss_clip": 0.01136887, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.02883255, + "balance_loss_mlp": 1.04706621, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.615115943492657, + "language_loss": 0.88341218, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90522182, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8984375, + "step": 3551, + "time_per_iteration": 2.5310463905334473 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.02887464, + "balance_loss_mlp": 1.04430258, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.1215125327645152, + "language_loss": 0.83415043, + "learning_rate": 3.656842449140983e-06, + "loss": 0.8559624, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3552, + "time_per_iteration": 3.974120616912842 + }, + { + "auxiliary_loss_clip": 0.0113546, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_clip": 1.03164101, + "balance_loss_mlp": 1.04522753, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7556537525349103, + "language_loss": 0.76692683, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78876388, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 3553, + "time_per_iteration": 3.964289903640747 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.02520156, + "balance_loss_mlp": 1.04556942, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.6502841430946371, + "language_loss": 0.72946119, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75122207, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 3554, + "time_per_iteration": 2.5141818523406982 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02121508, + "balance_loss_mlp": 1.04672861, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9371755733444218, + "language_loss": 0.6745261, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69627374, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.91015625, + "step": 3555, + "time_per_iteration": 2.6116089820861816 + }, + { + "auxiliary_loss_clip": 0.01138406, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04564714, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.2550763051095752, + "language_loss": 0.64778429, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66956222, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3556, + "time_per_iteration": 2.553746223449707 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02893996, + "balance_loss_mlp": 1.04656768, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.6295299556205536, + "language_loss": 0.72333252, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74518251, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3557, + "time_per_iteration": 2.6562533378601074 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.02189136, + "balance_loss_mlp": 1.04716706, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6722734443717013, + "language_loss": 0.67139357, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6932168, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3558, + "time_per_iteration": 2.5435290336608887 + }, + { + "auxiliary_loss_clip": 0.01142773, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.01943386, + "balance_loss_mlp": 1.04542494, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8839208997443517, + "language_loss": 0.79702216, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81881285, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3559, + "time_per_iteration": 2.5535330772399902 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04436731, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6653874224583467, + "language_loss": 0.67549068, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69730377, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9296875, + "step": 3560, + "time_per_iteration": 2.5241451263427734 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.04846883, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8721878156787213, + "language_loss": 0.72995424, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75179613, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3561, + "time_per_iteration": 2.5678720474243164 + }, + { + "auxiliary_loss_clip": 0.01142897, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02817965, + "balance_loss_mlp": 1.04897678, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.2783713689110243, + "language_loss": 0.77110738, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79298586, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3562, + "time_per_iteration": 2.4598803520202637 + }, + { + "auxiliary_loss_clip": 0.01140561, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.02160454, + "balance_loss_mlp": 1.04795694, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5929440625910447, + "language_loss": 0.84534913, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.867136, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.92578125, + "step": 3563, + "time_per_iteration": 2.5654757022857666 + }, + { + "auxiliary_loss_clip": 0.0114087, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.02227342, + "balance_loss_mlp": 1.04757166, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.6134338415520206, + "language_loss": 0.76727796, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78907001, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.93359375, + "step": 3564, + "time_per_iteration": 2.591064214706421 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.02560401, + "balance_loss_mlp": 1.0467248, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.880454163642384, + "language_loss": 0.88260084, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90440416, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3565, + "time_per_iteration": 2.571242094039917 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01009282, + "balance_loss_clip": 1.00739813, + "balance_loss_mlp": 1.0192101, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8403524328969202, + "language_loss": 0.52300179, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54360026, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3125, + "step": 3566, + "time_per_iteration": 3.055588722229004 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02093637, + "balance_loss_mlp": 1.04677701, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.91490691342046, + "language_loss": 0.67412555, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69585192, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3567, + "time_per_iteration": 2.5511529445648193 + }, + { + "auxiliary_loss_clip": 0.01135888, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.02630615, + "balance_loss_mlp": 1.04691041, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6974661731729381, + "language_loss": 0.74437779, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.7661534, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 3568, + "time_per_iteration": 2.613090753555298 + }, + { + "auxiliary_loss_clip": 0.01137867, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.02710819, + "balance_loss_mlp": 1.04578757, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.7479940521784256, + "language_loss": 0.77864397, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80045569, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3569, + "time_per_iteration": 2.567439317703247 + }, + { + "auxiliary_loss_clip": 0.01147794, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.01981413, + "balance_loss_mlp": 1.05039883, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.3364918832975317, + "language_loss": 0.69533777, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.71719933, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3570, + "time_per_iteration": 2.489550828933716 + }, + { + "auxiliary_loss_clip": 0.01144243, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.02703631, + "balance_loss_mlp": 1.0480299, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.465398793786977, + "language_loss": 0.78108835, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80296826, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3571, + "time_per_iteration": 2.527509927749634 + }, + { + "auxiliary_loss_clip": 0.01143428, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.02556705, + "balance_loss_mlp": 1.0501976, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.5347995603010767, + "language_loss": 0.82851684, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85038722, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3572, + "time_per_iteration": 2.491955280303955 + }, + { + "auxiliary_loss_clip": 0.01144597, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.02248025, + "balance_loss_mlp": 1.04700291, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.35018592277076, + "language_loss": 0.64916813, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67100847, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3573, + "time_per_iteration": 2.5238969326019287 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.02481413, + "balance_loss_mlp": 1.04417133, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.2164535787006705, + "language_loss": 0.75577438, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77751815, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3574, + "time_per_iteration": 2.5497734546661377 + }, + { + "auxiliary_loss_clip": 0.01137499, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.01752853, + "balance_loss_mlp": 1.04568887, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.959683075701339, + "language_loss": 0.72380054, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74552631, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91796875, + "step": 3575, + "time_per_iteration": 2.539255142211914 + }, + { + "auxiliary_loss_clip": 0.01141362, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.02097976, + "balance_loss_mlp": 1.04890776, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.6473570004326006, + "language_loss": 0.68102455, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70280713, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3576, + "time_per_iteration": 2.515245199203491 + }, + { + "auxiliary_loss_clip": 0.01144679, + "auxiliary_loss_mlp": 0.01041063, + "balance_loss_clip": 1.02347922, + "balance_loss_mlp": 1.04820943, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.1450103743023936, + "language_loss": 0.88840854, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91026592, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3577, + "time_per_iteration": 2.4426753520965576 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01006787, + "balance_loss_clip": 1.00466526, + "balance_loss_mlp": 1.02252448, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8177210285410575, + "language_loss": 0.56242883, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.5830456, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32421875, + "step": 3578, + "time_per_iteration": 3.0434820652008057 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.02345788, + "balance_loss_mlp": 1.04957211, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6812319537870581, + "language_loss": 0.88500881, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90683413, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3579, + "time_per_iteration": 2.4646458625793457 + }, + { + "auxiliary_loss_clip": 0.01140846, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.04618824, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.7668055337606152, + "language_loss": 0.78238297, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80421615, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3580, + "time_per_iteration": 2.5029854774475098 + }, + { + "auxiliary_loss_clip": 0.01138764, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.02557576, + "balance_loss_mlp": 1.04757452, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.7955176576656944, + "language_loss": 0.73129165, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75310302, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3581, + "time_per_iteration": 2.503103733062744 + }, + { + "auxiliary_loss_clip": 0.01137091, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.02723205, + "balance_loss_mlp": 1.04665411, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.610409860459302, + "language_loss": 0.70739609, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72922659, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.90234375, + "step": 3582, + "time_per_iteration": 2.4840197563171387 + }, + { + "auxiliary_loss_clip": 0.01137402, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.04602027, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.8570718584923633, + "language_loss": 0.84140432, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86319172, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3583, + "time_per_iteration": 2.4435312747955322 + }, + { + "auxiliary_loss_clip": 0.01143933, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.02834046, + "balance_loss_mlp": 1.04859185, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 3.180305067245919, + "language_loss": 0.83226246, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.8541553, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3584, + "time_per_iteration": 2.521476984024048 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.02816272, + "balance_loss_mlp": 1.04518461, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.0358477693345667, + "language_loss": 0.90233314, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92416549, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.91796875, + "step": 3585, + "time_per_iteration": 2.464745283126831 + }, + { + "auxiliary_loss_clip": 0.01140925, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04832685, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.8296186032289348, + "language_loss": 0.74414444, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76597619, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3586, + "time_per_iteration": 2.5062146186828613 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.02668393, + "balance_loss_mlp": 1.04796743, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.1680236591426416, + "language_loss": 0.83055526, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85239077, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3587, + "time_per_iteration": 2.4784295558929443 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.02438986, + "balance_loss_mlp": 1.04664946, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.8176747371086701, + "language_loss": 0.75756669, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77937388, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3588, + "time_per_iteration": 2.5896053314208984 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02294254, + "balance_loss_mlp": 1.04534698, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8272464683057401, + "language_loss": 0.81006658, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83183837, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3589, + "time_per_iteration": 2.540090799331665 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.02341199, + "balance_loss_mlp": 1.04792953, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.6129530472479154, + "language_loss": 0.72591126, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74772674, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.94140625, + "step": 3590, + "time_per_iteration": 2.5113861560821533 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.02769351, + "balance_loss_mlp": 1.04830956, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.0133132975130477, + "language_loss": 0.83914638, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86106646, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96875, + "step": 3591, + "time_per_iteration": 2.488309621810913 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03167534, + "balance_loss_mlp": 1.04884136, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.271326779903827, + "language_loss": 0.69294131, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71490723, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3592, + "time_per_iteration": 2.571373462677002 + }, + { + "auxiliary_loss_clip": 0.01143219, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.03150403, + "balance_loss_mlp": 1.04881072, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.3999192225546677, + "language_loss": 0.84150124, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86343014, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3593, + "time_per_iteration": 2.4590611457824707 + }, + { + "auxiliary_loss_clip": 0.01144804, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03210783, + "balance_loss_mlp": 1.04839182, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.850380650061706, + "language_loss": 0.75163305, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77357584, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3594, + "time_per_iteration": 3.9338901042938232 + }, + { + "auxiliary_loss_clip": 0.01139476, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.02666509, + "balance_loss_mlp": 1.04763508, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 2.0680180645872057, + "language_loss": 0.80541027, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82724094, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3595, + "time_per_iteration": 3.9857921600341797 + }, + { + "auxiliary_loss_clip": 0.01146272, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.0259887, + "balance_loss_mlp": 1.04883027, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.3330392864683347, + "language_loss": 0.78089929, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80279487, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.97265625, + "step": 3596, + "time_per_iteration": 2.4515480995178223 + }, + { + "auxiliary_loss_clip": 0.01138472, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.03006268, + "balance_loss_mlp": 1.04786897, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.9545740457841054, + "language_loss": 0.83011472, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85196126, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3597, + "time_per_iteration": 2.504703998565674 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.02756798, + "balance_loss_mlp": 1.05029655, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5849845027976412, + "language_loss": 0.80171728, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82361513, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3598, + "time_per_iteration": 2.487013101577759 + }, + { + "auxiliary_loss_clip": 0.0114385, + "auxiliary_loss_mlp": 0.01045551, + "balance_loss_clip": 1.02745485, + "balance_loss_mlp": 1.0476619, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.8175927270691912, + "language_loss": 0.82054996, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.842444, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3599, + "time_per_iteration": 2.5515315532684326 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.0253613, + "balance_loss_mlp": 1.04831243, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 3.186477441139726, + "language_loss": 0.7654863, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78729272, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3600, + "time_per_iteration": 2.5067033767700195 + }, + { + "auxiliary_loss_clip": 0.01139528, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.02842712, + "balance_loss_mlp": 1.04657555, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.9514188507385115, + "language_loss": 0.80026001, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82209218, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.9296875, + "step": 3601, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.01142747, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.0306437, + "balance_loss_mlp": 1.04938436, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.8096424478422806, + "language_loss": 0.83358335, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85548466, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3602, + "time_per_iteration": 2.525151491165161 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.03065276, + "balance_loss_mlp": 1.04670155, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.6489882186888527, + "language_loss": 0.74271673, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76460266, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3603, + "time_per_iteration": 2.5083842277526855 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.02723289, + "balance_loss_mlp": 1.05022252, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.5478742891076147, + "language_loss": 0.73956323, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76139832, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3604, + "time_per_iteration": 2.5100204944610596 + }, + { + "auxiliary_loss_clip": 0.01137594, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.02598965, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 2.2268823896980376, + "language_loss": 0.80375803, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82556069, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.890625, + "step": 3605, + "time_per_iteration": 2.5182228088378906 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01010449, + "balance_loss_clip": 1.0086962, + "balance_loss_mlp": 1.02975249, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6948121220218867, + "language_loss": 0.58376318, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60450989, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.34375, + "step": 3606, + "time_per_iteration": 3.1655373573303223 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.02921534, + "balance_loss_mlp": 1.04939568, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.6754398361548613, + "language_loss": 0.73210037, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75402147, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3607, + "time_per_iteration": 2.508920431137085 + }, + { + "auxiliary_loss_clip": 0.01146221, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_clip": 1.02995718, + "balance_loss_mlp": 1.04935443, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.5718647894236053, + "language_loss": 0.76626337, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78820717, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3608, + "time_per_iteration": 2.440258502960205 + }, + { + "auxiliary_loss_clip": 0.01144868, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.02985787, + "balance_loss_mlp": 1.04866827, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.796333172920123, + "language_loss": 0.74395084, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76586002, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3609, + "time_per_iteration": 2.5326688289642334 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.04871368, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.781486329059154, + "language_loss": 0.88848329, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91040266, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3610, + "time_per_iteration": 2.4611029624938965 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.05045652, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.036787917991119, + "language_loss": 0.77587712, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79770797, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3611, + "time_per_iteration": 2.5187723636627197 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.02829766, + "balance_loss_mlp": 1.04609489, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.067133307741882, + "language_loss": 0.63197911, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65378946, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3612, + "time_per_iteration": 2.4585959911346436 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.02504194, + "balance_loss_mlp": 1.04799449, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.9312736490377453, + "language_loss": 0.75120652, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77304518, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9375, + "step": 3613, + "time_per_iteration": 2.4866983890533447 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.02900767, + "balance_loss_mlp": 1.04560208, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 3.0184875495721, + "language_loss": 0.70767504, + "learning_rate": 3.643197365185261e-06, + "loss": 0.72950327, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 3614, + "time_per_iteration": 2.4454689025878906 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_clip": 1.0288837, + "balance_loss_mlp": 1.0491401, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.8064523730299737, + "language_loss": 0.7314586, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75334036, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.92578125, + "step": 3615, + "time_per_iteration": 2.488711357116699 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02832997, + "balance_loss_mlp": 1.04751146, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.7876016160510377, + "language_loss": 0.90045536, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92239082, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3616, + "time_per_iteration": 2.4552054405212402 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02356279, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.4503731233397383, + "language_loss": 0.8111589, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83300173, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3617, + "time_per_iteration": 2.465254068374634 + }, + { + "auxiliary_loss_clip": 0.01143954, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.02928162, + "balance_loss_mlp": 1.04851139, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.7784831572545423, + "language_loss": 0.75509727, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77699506, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3618, + "time_per_iteration": 2.5263705253601074 + }, + { + "auxiliary_loss_clip": 0.0114255, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.03103614, + "balance_loss_mlp": 1.04738426, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.9247647214638754, + "language_loss": 0.69221723, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71413535, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3619, + "time_per_iteration": 2.4615654945373535 + }, + { + "auxiliary_loss_clip": 0.01145954, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.02723491, + "balance_loss_mlp": 1.04906762, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.7662634429670958, + "language_loss": 0.78337491, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80528164, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3620, + "time_per_iteration": 2.4954700469970703 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.02116966, + "balance_loss_mlp": 1.04363799, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.0129000326388695, + "language_loss": 0.79769373, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81940717, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3621, + "time_per_iteration": 2.490427255630493 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04595852, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.7548460288059653, + "language_loss": 0.87967801, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90146828, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3622, + "time_per_iteration": 2.484462022781372 + }, + { + "auxiliary_loss_clip": 0.01142961, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.02067459, + "balance_loss_mlp": 1.04766297, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 4.811459611972859, + "language_loss": 0.76945633, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79128814, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.94921875, + "step": 3623, + "time_per_iteration": 2.4476547241210938 + }, + { + "auxiliary_loss_clip": 0.0114403, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.02992439, + "balance_loss_mlp": 1.04891419, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.1152987510548615, + "language_loss": 0.84886312, + "learning_rate": 3.640974061218741e-06, + "loss": 0.8707844, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3624, + "time_per_iteration": 2.444913387298584 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.010571, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.0487287, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.345969751242133, + "language_loss": 0.77035248, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79236794, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3625, + "time_per_iteration": 2.4511115550994873 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01000008, + "balance_loss_clip": 0.99836272, + "balance_loss_mlp": 1.02361774, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8233389824181596, + "language_loss": 0.60720766, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62780088, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.35546875, + "step": 3626, + "time_per_iteration": 3.21004319190979 + }, + { + "auxiliary_loss_clip": 0.0114194, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.02858984, + "balance_loss_mlp": 1.04572678, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8978511257882154, + "language_loss": 0.90608853, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92797917, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3627, + "time_per_iteration": 2.4744250774383545 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.02346826, + "balance_loss_mlp": 1.04541492, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8495097769686537, + "language_loss": 0.73612916, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75792623, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3628, + "time_per_iteration": 2.4595446586608887 + }, + { + "auxiliary_loss_clip": 0.01137064, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02232444, + "balance_loss_mlp": 1.04432046, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.99633175048199, + "language_loss": 0.76800162, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.78976429, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3629, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01140004, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.02514172, + "balance_loss_mlp": 1.04701388, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5547294213075904, + "language_loss": 0.71320152, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73501503, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3630, + "time_per_iteration": 2.608846426010132 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.0277338, + "balance_loss_mlp": 1.04635286, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8110131954886999, + "language_loss": 0.76331747, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78508776, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3631, + "time_per_iteration": 2.53765869140625 + }, + { + "auxiliary_loss_clip": 0.01138964, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.0237397, + "balance_loss_mlp": 1.0455693, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.0710075205659906, + "language_loss": 0.74879777, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77058685, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3632, + "time_per_iteration": 2.484896421432495 + }, + { + "auxiliary_loss_clip": 0.01136054, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.04511309, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.966664682342333, + "language_loss": 0.83337629, + "learning_rate": 3.638967767095249e-06, + "loss": 0.8550964, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.91015625, + "step": 3633, + "time_per_iteration": 2.4721779823303223 + }, + { + "auxiliary_loss_clip": 0.01136294, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.03228879, + "balance_loss_mlp": 1.04592657, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.8655293845238095, + "language_loss": 0.81782126, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83966839, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3634, + "time_per_iteration": 2.5514795780181885 + }, + { + "auxiliary_loss_clip": 0.01144011, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.02468133, + "balance_loss_mlp": 1.04863131, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.010090632845536, + "language_loss": 0.75077927, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77262932, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.953125, + "step": 3635, + "time_per_iteration": 4.07889199256897 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01053987, + "balance_loss_clip": 1.03775024, + "balance_loss_mlp": 1.04744601, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.2167396678675155, + "language_loss": 0.87881035, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90072685, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3636, + "time_per_iteration": 3.9134533405303955 + }, + { + "auxiliary_loss_clip": 0.01138959, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.03406608, + "balance_loss_mlp": 1.0456109, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 1.9800006249435054, + "language_loss": 0.75948632, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78138912, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3637, + "time_per_iteration": 2.5531604290008545 + }, + { + "auxiliary_loss_clip": 0.01143812, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.02445328, + "balance_loss_mlp": 1.04728055, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 4.376345077988984, + "language_loss": 0.89677018, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91863406, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3638, + "time_per_iteration": 2.435544967651367 + }, + { + "auxiliary_loss_clip": 0.01140476, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.03377736, + "balance_loss_mlp": 1.04854274, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.382131601644944, + "language_loss": 0.89958721, + "learning_rate": 3.637627440557275e-06, + "loss": 0.9214983, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3639, + "time_per_iteration": 2.448150634765625 + }, + { + "auxiliary_loss_clip": 0.01138473, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.02972686, + "balance_loss_mlp": 1.04632282, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7796744672676124, + "language_loss": 0.79038727, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81222755, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3640, + "time_per_iteration": 2.544577121734619 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01051502, + "balance_loss_clip": 1.03291786, + "balance_loss_mlp": 1.05100346, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.046383525913898, + "language_loss": 0.72049212, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74243474, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.91796875, + "step": 3641, + "time_per_iteration": 2.465439558029175 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.05203855, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.4771917366671, + "language_loss": 0.80913448, + "learning_rate": 3.63695643883745e-06, + "loss": 0.8309828, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3642, + "time_per_iteration": 2.4598801136016846 + }, + { + "auxiliary_loss_clip": 0.01144439, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.02319944, + "balance_loss_mlp": 1.05089164, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.0352379603627684, + "language_loss": 0.71573192, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73758006, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3643, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01144262, + "auxiliary_loss_mlp": 0.01048332, + "balance_loss_clip": 1.03115392, + "balance_loss_mlp": 1.05041492, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.9224514767679763, + "language_loss": 0.68172711, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70365304, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3644, + "time_per_iteration": 2.721107244491577 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.0252583, + "balance_loss_mlp": 1.04905653, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.1869112310362504, + "language_loss": 0.77744782, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79931343, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9609375, + "step": 3645, + "time_per_iteration": 2.4838709831237793 + }, + { + "auxiliary_loss_clip": 0.01140139, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.03275371, + "balance_loss_mlp": 1.04988873, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.575077237748942, + "language_loss": 0.82405865, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84594363, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90234375, + "step": 3646, + "time_per_iteration": 2.467958927154541 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02050591, + "balance_loss_mlp": 1.04901123, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7225223193128734, + "language_loss": 0.83016759, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85191214, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3647, + "time_per_iteration": 2.4670159816741943 + }, + { + "auxiliary_loss_clip": 0.01137396, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.02991438, + "balance_loss_mlp": 1.04734278, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.5879018059409027, + "language_loss": 0.72555232, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74738657, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3648, + "time_per_iteration": 2.5572352409362793 + }, + { + "auxiliary_loss_clip": 0.01140287, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04563618, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3666125536095612, + "language_loss": 0.74363017, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76548404, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3649, + "time_per_iteration": 2.4465692043304443 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.02869856, + "balance_loss_mlp": 1.04609215, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.0558746559562953, + "language_loss": 0.86408567, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88586134, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3650, + "time_per_iteration": 2.4408226013183594 + }, + { + "auxiliary_loss_clip": 0.01137285, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.02747929, + "balance_loss_mlp": 1.04549015, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.0425834927064934, + "language_loss": 0.83693743, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85874897, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3651, + "time_per_iteration": 2.502694845199585 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02941298, + "balance_loss_mlp": 1.04595184, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8702009414404626, + "language_loss": 0.74629313, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76812911, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3652, + "time_per_iteration": 2.4422640800476074 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01011234, + "balance_loss_clip": 1.00946999, + "balance_loss_mlp": 1.0194056, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7344385056765022, + "language_loss": 0.51548386, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53612262, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.33203125, + "step": 3653, + "time_per_iteration": 3.0743935108184814 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.03215361, + "balance_loss_mlp": 1.05115473, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.781801507589209, + "language_loss": 0.75256276, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77447224, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3654, + "time_per_iteration": 2.4826300144195557 + }, + { + "auxiliary_loss_clip": 0.01143131, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.02499056, + "balance_loss_mlp": 1.04988194, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.9986760770887892, + "language_loss": 0.72757828, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74942386, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3655, + "time_per_iteration": 2.494662284851074 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.02860177, + "balance_loss_mlp": 1.04802227, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6963533722566047, + "language_loss": 0.80971813, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83156729, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3656, + "time_per_iteration": 2.465020179748535 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.02267933, + "balance_loss_mlp": 1.05085039, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.205234752003223, + "language_loss": 0.84668207, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86849183, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3657, + "time_per_iteration": 2.4626548290252686 + }, + { + "auxiliary_loss_clip": 0.01138622, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.02126312, + "balance_loss_mlp": 1.0460434, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.714181577212399, + "language_loss": 0.80485702, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.8266257, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3658, + "time_per_iteration": 2.492835521697998 + }, + { + "auxiliary_loss_clip": 0.01053481, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.00065756, + "balance_loss_mlp": 1.02029002, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.8995084923077876, + "language_loss": 0.58224851, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60280788, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.33203125, + "step": 3659, + "time_per_iteration": 3.1709213256835938 + }, + { + "auxiliary_loss_clip": 0.01140235, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.04958415, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.4575828715719177, + "language_loss": 0.74535513, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76715136, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3660, + "time_per_iteration": 2.474397897720337 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.02336597, + "balance_loss_mlp": 1.04723859, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 2.0332694306983723, + "language_loss": 0.81225419, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83404779, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91796875, + "step": 3661, + "time_per_iteration": 2.4926669597625732 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.02639949, + "balance_loss_mlp": 1.04773009, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.8682139743879211, + "language_loss": 0.73236209, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75417411, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3662, + "time_per_iteration": 2.5111234188079834 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.03333092, + "balance_loss_mlp": 1.05132473, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6440107639340105, + "language_loss": 0.77800119, + "learning_rate": 3.632243797111929e-06, + "loss": 0.79989552, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3663, + "time_per_iteration": 2.485520601272583 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.02581656, + "balance_loss_mlp": 1.05125535, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 3.566897500342904, + "language_loss": 0.80484056, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8267172, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3664, + "time_per_iteration": 2.4827098846435547 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01042617, + "balance_loss_clip": 1.02354348, + "balance_loss_mlp": 1.04959095, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.530665000734818, + "language_loss": 0.76296824, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78485775, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.96875, + "step": 3665, + "time_per_iteration": 2.5118229389190674 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_clip": 1.0282042, + "balance_loss_mlp": 1.04779172, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.7337119989610468, + "language_loss": 0.97959125, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00143182, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3666, + "time_per_iteration": 2.4461512565612793 + }, + { + "auxiliary_loss_clip": 0.01136729, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.02241421, + "balance_loss_mlp": 1.04582953, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 2.115803047817727, + "language_loss": 0.80494016, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82670087, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3667, + "time_per_iteration": 2.65198016166687 + }, + { + "auxiliary_loss_clip": 0.01144733, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.02776945, + "balance_loss_mlp": 1.04882097, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.916720089378095, + "language_loss": 0.77463895, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79655218, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3668, + "time_per_iteration": 2.459141254425049 + }, + { + "auxiliary_loss_clip": 0.0114207, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02568614, + "balance_loss_mlp": 1.05058837, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.730318389149699, + "language_loss": 0.71514869, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73699689, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3669, + "time_per_iteration": 2.550732135772705 + }, + { + "auxiliary_loss_clip": 0.01139227, + "auxiliary_loss_mlp": 0.01037839, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.04615474, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.0994504177928826, + "language_loss": 0.85294032, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87471098, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3670, + "time_per_iteration": 2.4727606773376465 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.02360499, + "balance_loss_mlp": 1.05130565, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.775856591734502, + "language_loss": 0.76796275, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.789846, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3671, + "time_per_iteration": 2.613104820251465 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.01758265, + "balance_loss_mlp": 1.0487864, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.8820912362302202, + "language_loss": 0.80472648, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82648075, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3672, + "time_per_iteration": 2.4365992546081543 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.03129566, + "balance_loss_mlp": 1.05145025, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8912849075471436, + "language_loss": 0.736193, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75811654, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3673, + "time_per_iteration": 2.4908931255340576 + }, + { + "auxiliary_loss_clip": 0.01145514, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02064395, + "balance_loss_mlp": 1.05221379, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9375944290288487, + "language_loss": 0.76505005, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78688282, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3674, + "time_per_iteration": 2.569312572479248 + }, + { + "auxiliary_loss_clip": 0.01142786, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.02518344, + "balance_loss_mlp": 1.05025005, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.0287396146216055, + "language_loss": 0.74786556, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76972854, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.92578125, + "step": 3675, + "time_per_iteration": 2.4762706756591797 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.02395034, + "balance_loss_mlp": 1.0473659, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.7527405009289938, + "language_loss": 0.80050498, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82232398, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3676, + "time_per_iteration": 2.5846786499023438 + }, + { + "auxiliary_loss_clip": 0.0114147, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.03106666, + "balance_loss_mlp": 1.0474596, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.974355382670518, + "language_loss": 0.75501895, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77690685, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3677, + "time_per_iteration": 4.02753758430481 + }, + { + "auxiliary_loss_clip": 0.01135837, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.02455878, + "balance_loss_mlp": 1.0449332, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 2.0397766719275494, + "language_loss": 0.83412457, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85589325, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3678, + "time_per_iteration": 3.9455032348632812 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.02903211, + "balance_loss_mlp": 1.04866314, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.7724652071984504, + "language_loss": 0.89272189, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91459215, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3679, + "time_per_iteration": 2.548166036605835 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_clip": 1.03517246, + "balance_loss_mlp": 1.04887235, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.4577897330130773, + "language_loss": 0.86718571, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88914388, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3680, + "time_per_iteration": 2.468712329864502 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.02809739, + "balance_loss_mlp": 1.05175805, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.0752123015423556, + "language_loss": 0.81897914, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84083802, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3681, + "time_per_iteration": 2.532210350036621 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.02076972, + "balance_loss_mlp": 1.04784071, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.44274183004677, + "language_loss": 0.79908317, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82081306, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 3682, + "time_per_iteration": 2.491135358810425 + }, + { + "auxiliary_loss_clip": 0.01140313, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04739022, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 2.2064811404605376, + "language_loss": 0.77283889, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79466248, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 3683, + "time_per_iteration": 2.503041982650757 + }, + { + "auxiliary_loss_clip": 0.01141417, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.02824235, + "balance_loss_mlp": 1.04623342, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.114071962716483, + "language_loss": 0.72779894, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74966961, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3684, + "time_per_iteration": 2.521495819091797 + }, + { + "auxiliary_loss_clip": 0.01142849, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.02598643, + "balance_loss_mlp": 1.05060613, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.9389187138945425, + "language_loss": 0.80108052, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82294679, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3685, + "time_per_iteration": 2.436958074569702 + }, + { + "auxiliary_loss_clip": 0.01135153, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.02504683, + "balance_loss_mlp": 1.04634571, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.5568750132404718, + "language_loss": 0.87128556, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89303845, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 3686, + "time_per_iteration": 2.5519070625305176 + }, + { + "auxiliary_loss_clip": 0.01138026, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.02552581, + "balance_loss_mlp": 1.04762685, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.942015126167962, + "language_loss": 0.77953136, + "learning_rate": 3.626824502298707e-06, + "loss": 0.8013379, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3687, + "time_per_iteration": 2.495084285736084 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01048705, + "balance_loss_clip": 1.03085971, + "balance_loss_mlp": 1.05057812, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8313314390802422, + "language_loss": 0.84722549, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86917698, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3688, + "time_per_iteration": 2.5029165744781494 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.02897787, + "balance_loss_mlp": 1.05005932, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.7913489877281905, + "language_loss": 0.81395769, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83588976, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3689, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.0114216, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.02758622, + "balance_loss_mlp": 1.04985952, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 2.5504206662352082, + "language_loss": 0.70040542, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72227693, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3690, + "time_per_iteration": 2.5005807876586914 + }, + { + "auxiliary_loss_clip": 0.01145048, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.04890513, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.7318147752747124, + "language_loss": 0.72394359, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74577713, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3691, + "time_per_iteration": 2.4835989475250244 + }, + { + "auxiliary_loss_clip": 0.01145815, + "auxiliary_loss_mlp": 0.01049746, + "balance_loss_clip": 1.03169739, + "balance_loss_mlp": 1.05317688, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.1843836481793057, + "language_loss": 0.71611524, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73807085, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.92578125, + "step": 3692, + "time_per_iteration": 2.515230655670166 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01044658, + "balance_loss_clip": 1.02750337, + "balance_loss_mlp": 1.05008483, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 2.7650002202849113, + "language_loss": 0.87580657, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89772147, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.96875, + "step": 3693, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.02086258, + "balance_loss_mlp": 1.04947054, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 3.031177285152565, + "language_loss": 0.85307622, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87482512, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.890625, + "step": 3694, + "time_per_iteration": 2.4828481674194336 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9517253418741858, + "language_loss": 0.69055748, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71244752, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 3695, + "time_per_iteration": 2.49957537651062 + }, + { + "auxiliary_loss_clip": 0.01141491, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02438951, + "balance_loss_mlp": 1.05095696, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4867456423055678, + "language_loss": 0.71710318, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73891842, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 3696, + "time_per_iteration": 2.5991299152374268 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.02387977, + "balance_loss_mlp": 1.0483942, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.901791440824732, + "language_loss": 0.87694812, + "learning_rate": 3.624555968803217e-06, + "loss": 0.8987658, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3697, + "time_per_iteration": 2.524841547012329 + }, + { + "auxiliary_loss_clip": 0.01134138, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.03020072, + "balance_loss_mlp": 1.04646909, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.985465494359005, + "language_loss": 0.66109681, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68289793, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3698, + "time_per_iteration": 2.6806552410125732 + }, + { + "auxiliary_loss_clip": 0.01143188, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.049245, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9701476357110561, + "language_loss": 0.82699466, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84881532, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9375, + "step": 3699, + "time_per_iteration": 2.620795965194702 + }, + { + "auxiliary_loss_clip": 0.01141189, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02789021, + "balance_loss_mlp": 1.04960978, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6593732889446324, + "language_loss": 0.79488564, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81674713, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3700, + "time_per_iteration": 2.4886739253997803 + }, + { + "auxiliary_loss_clip": 0.01148421, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_clip": 1.02639139, + "balance_loss_mlp": 1.05154204, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 7.082418544009014, + "language_loss": 0.72063768, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74257213, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96875, + "step": 3701, + "time_per_iteration": 2.7293899059295654 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.02520323, + "balance_loss_mlp": 1.04706395, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.9269634413479926, + "language_loss": 0.79704928, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81886196, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3702, + "time_per_iteration": 2.5527849197387695 + }, + { + "auxiliary_loss_clip": 0.01132932, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02487028, + "balance_loss_mlp": 1.04518211, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 2.7410709876553447, + "language_loss": 0.78632712, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80807453, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 3703, + "time_per_iteration": 2.4955005645751953 + }, + { + "auxiliary_loss_clip": 0.01140692, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.01810527, + "balance_loss_mlp": 1.0468421, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.8479834568020117, + "language_loss": 0.74212444, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7639066, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9375, + "step": 3704, + "time_per_iteration": 2.5000903606414795 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.02618146, + "balance_loss_mlp": 1.05030012, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.7361108874663713, + "language_loss": 0.64372134, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66553271, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3705, + "time_per_iteration": 2.6993744373321533 + }, + { + "auxiliary_loss_clip": 0.01064369, + "auxiliary_loss_mlp": 0.01006302, + "balance_loss_clip": 1.00454926, + "balance_loss_mlp": 1.03098035, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.353184132187748, + "language_loss": 0.65301311, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67371976, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.33398438, + "step": 3706, + "time_per_iteration": 2.9832844734191895 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02076256, + "balance_loss_mlp": 1.0461061, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 3.09427451037038, + "language_loss": 0.80608439, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82783049, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91796875, + "step": 3707, + "time_per_iteration": 2.5236454010009766 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.02184916, + "balance_loss_mlp": 1.04706407, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.0318896185848057, + "language_loss": 0.78124011, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80301505, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3708, + "time_per_iteration": 2.5254104137420654 + }, + { + "auxiliary_loss_clip": 0.01142891, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.02291107, + "balance_loss_mlp": 1.04897153, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.913582269302705, + "language_loss": 0.79989487, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82172012, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3709, + "time_per_iteration": 2.5528371334075928 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.02913201, + "balance_loss_mlp": 1.04580092, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.062693768306912, + "language_loss": 0.68752408, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70937693, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3710, + "time_per_iteration": 2.511275053024292 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.0221858, + "balance_loss_mlp": 1.04812646, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.3083581079415216, + "language_loss": 0.90696692, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92880082, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3711, + "time_per_iteration": 2.4757487773895264 + }, + { + "auxiliary_loss_clip": 0.01138091, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.03406, + "balance_loss_mlp": 1.04603434, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.758927620438821, + "language_loss": 0.89628232, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91818309, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.921875, + "step": 3712, + "time_per_iteration": 2.3870105743408203 + }, + { + "auxiliary_loss_clip": 0.01139482, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.03356993, + "balance_loss_mlp": 1.04956841, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.039950461935961, + "language_loss": 0.74859631, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77050602, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.8984375, + "step": 3713, + "time_per_iteration": 2.4336304664611816 + }, + { + "auxiliary_loss_clip": 0.01138793, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.02543497, + "balance_loss_mlp": 1.048329, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.8221921578975473, + "language_loss": 0.62592143, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64772761, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3714, + "time_per_iteration": 2.6230995655059814 + }, + { + "auxiliary_loss_clip": 0.01139199, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.04734552, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.9329837891440178, + "language_loss": 0.79052407, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81228578, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3715, + "time_per_iteration": 2.510436534881592 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03372216, + "balance_loss_mlp": 1.05021942, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.6633570096565886, + "language_loss": 0.77182817, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79375589, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3716, + "time_per_iteration": 2.4398605823516846 + }, + { + "auxiliary_loss_clip": 0.01141179, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.03046429, + "balance_loss_mlp": 1.04845762, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.028714583879474, + "language_loss": 0.79209757, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81397963, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3717, + "time_per_iteration": 2.456042766571045 + }, + { + "auxiliary_loss_clip": 0.01143546, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.04934192, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.2103373086531115, + "language_loss": 0.68029571, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70214242, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3718, + "time_per_iteration": 2.4818973541259766 + }, + { + "auxiliary_loss_clip": 0.01142458, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.02067208, + "balance_loss_mlp": 1.04784536, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.9912565029374794, + "language_loss": 0.80194163, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8237524, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9453125, + "step": 3719, + "time_per_iteration": 3.985903263092041 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01052597, + "balance_loss_clip": 1.03396416, + "balance_loss_mlp": 1.04785836, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.0930960597239707, + "language_loss": 0.86421579, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88619983, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3720, + "time_per_iteration": 3.914626359939575 + }, + { + "auxiliary_loss_clip": 0.0114136, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.01682639, + "balance_loss_mlp": 1.05105066, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.6398614781610892, + "language_loss": 0.74860299, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77035284, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 3721, + "time_per_iteration": 2.485271453857422 + }, + { + "auxiliary_loss_clip": 0.011451, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.0256865, + "balance_loss_mlp": 1.0494988, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.928465692067959, + "language_loss": 0.78943181, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81131673, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3722, + "time_per_iteration": 2.471928834915161 + }, + { + "auxiliary_loss_clip": 0.01140042, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02551126, + "balance_loss_mlp": 1.05004597, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2482737248582247, + "language_loss": 0.82315016, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84496701, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3723, + "time_per_iteration": 2.4540791511535645 + }, + { + "auxiliary_loss_clip": 0.01144828, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.02729177, + "balance_loss_mlp": 1.05062389, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.154682666342997, + "language_loss": 0.84433442, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86622941, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3724, + "time_per_iteration": 2.526204824447632 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.04889762, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.178002887638817, + "language_loss": 0.79036546, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81216478, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9140625, + "step": 3725, + "time_per_iteration": 2.513136625289917 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02271581, + "balance_loss_mlp": 1.04898071, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.6732241790302085, + "language_loss": 0.77158499, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79337394, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90625, + "step": 3726, + "time_per_iteration": 2.5645246505737305 + }, + { + "auxiliary_loss_clip": 0.01150164, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.02677917, + "balance_loss_mlp": 1.05054045, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.7042555627132296, + "language_loss": 0.72376108, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74571931, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 3727, + "time_per_iteration": 2.4437429904937744 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02140689, + "balance_loss_mlp": 1.04682648, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.2876633759350327, + "language_loss": 0.86584771, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88769633, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3728, + "time_per_iteration": 2.496020793914795 + }, + { + "auxiliary_loss_clip": 0.01143576, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02838981, + "balance_loss_mlp": 1.05045211, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.0817566504616734, + "language_loss": 0.80479026, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82670236, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9296875, + "step": 3729, + "time_per_iteration": 2.4733448028564453 + }, + { + "auxiliary_loss_clip": 0.01136706, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_clip": 1.03019357, + "balance_loss_mlp": 1.04672551, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 2.3054621640206205, + "language_loss": 0.86468041, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88651037, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3730, + "time_per_iteration": 2.5348362922668457 + }, + { + "auxiliary_loss_clip": 0.01136756, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.01917958, + "balance_loss_mlp": 1.04737782, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.75673058423422, + "language_loss": 0.73293322, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75465709, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 3731, + "time_per_iteration": 2.4397478103637695 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.02370882, + "balance_loss_mlp": 1.04893279, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.4044438539905575, + "language_loss": 0.75237334, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77418989, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3732, + "time_per_iteration": 2.476630926132202 + }, + { + "auxiliary_loss_clip": 0.01141784, + "auxiliary_loss_mlp": 0.01058138, + "balance_loss_clip": 1.04106712, + "balance_loss_mlp": 1.0494858, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.8584104659795708, + "language_loss": 0.88037199, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90237123, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3733, + "time_per_iteration": 2.4723222255706787 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.02473271, + "balance_loss_mlp": 1.04564941, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6767676579772364, + "language_loss": 0.84200239, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86380494, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3734, + "time_per_iteration": 2.5214619636535645 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03170574, + "balance_loss_mlp": 1.0513525, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6368426378189131, + "language_loss": 0.76838279, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79030693, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3735, + "time_per_iteration": 2.5025858879089355 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.032215, + "balance_loss_mlp": 1.04791164, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 3.6998773026048046, + "language_loss": 0.84505916, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86688507, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 3736, + "time_per_iteration": 2.581409454345703 + }, + { + "auxiliary_loss_clip": 0.0114079, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.02306545, + "balance_loss_mlp": 1.04848719, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 2.2208030259376192, + "language_loss": 0.86398852, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88579136, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3737, + "time_per_iteration": 2.4498212337493896 + }, + { + "auxiliary_loss_clip": 0.01141365, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.02917397, + "balance_loss_mlp": 1.0476644, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 2.434824168439142, + "language_loss": 0.79145718, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81334245, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3738, + "time_per_iteration": 2.5505504608154297 + }, + { + "auxiliary_loss_clip": 0.01140019, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.0471611, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 2.2711438439691314, + "language_loss": 0.75895345, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78076756, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3739, + "time_per_iteration": 2.458307981491089 + }, + { + "auxiliary_loss_clip": 0.01137257, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.04610491, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.9782758832921432, + "language_loss": 0.74705702, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76885068, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3740, + "time_per_iteration": 2.5424981117248535 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.04691672, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.174963459036685, + "language_loss": 0.76083958, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78261012, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3741, + "time_per_iteration": 2.4539613723754883 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02263021, + "balance_loss_mlp": 1.05022252, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.917686629559915, + "language_loss": 0.87458241, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89636862, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3742, + "time_per_iteration": 2.483146905899048 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.03403831, + "balance_loss_mlp": 1.04824293, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.0726823880461116, + "language_loss": 0.81939828, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84128648, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3743, + "time_per_iteration": 2.4786789417266846 + }, + { + "auxiliary_loss_clip": 0.01140562, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.02241504, + "balance_loss_mlp": 1.04843307, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 3.9980575521347697, + "language_loss": 0.63616955, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65796053, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.921875, + "step": 3744, + "time_per_iteration": 2.4746344089508057 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.02634597, + "balance_loss_mlp": 1.04524422, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 3.3106228370485806, + "language_loss": 0.75711048, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77891332, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3745, + "time_per_iteration": 2.4295878410339355 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02142549, + "balance_loss_mlp": 1.04637384, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.8117958881819525, + "language_loss": 0.80839783, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83013999, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3746, + "time_per_iteration": 2.4423928260803223 + }, + { + "auxiliary_loss_clip": 0.01138701, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.02903056, + "balance_loss_mlp": 1.04503584, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.508960709641407, + "language_loss": 0.86067426, + "learning_rate": 3.613121069229862e-06, + "loss": 0.8825202, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3747, + "time_per_iteration": 2.471223831176758 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.01789808, + "balance_loss_mlp": 1.04515314, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.812236682782158, + "language_loss": 0.76358509, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78529495, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.91796875, + "step": 3748, + "time_per_iteration": 2.525108575820923 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.0274291, + "balance_loss_mlp": 1.04882264, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.7339876982656162, + "language_loss": 0.79497123, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81683606, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3749, + "time_per_iteration": 2.4881162643432617 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.04609084, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6101192523185979, + "language_loss": 0.8009423, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82267606, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8828125, + "step": 3750, + "time_per_iteration": 2.4656643867492676 + }, + { + "auxiliary_loss_clip": 0.01140861, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.02733183, + "balance_loss_mlp": 1.04821157, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.418289881699729, + "language_loss": 0.81336129, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83521116, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3751, + "time_per_iteration": 2.4960029125213623 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.02242589, + "balance_loss_mlp": 1.04915667, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.757449596716865, + "language_loss": 0.83989275, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86169416, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3752, + "time_per_iteration": 2.4668636322021484 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.02375996, + "balance_loss_mlp": 1.04671109, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.7780915453784651, + "language_loss": 0.78616595, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80792689, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.90234375, + "step": 3753, + "time_per_iteration": 2.4305062294006348 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.02113724, + "balance_loss_mlp": 1.04717183, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.990408742554116, + "language_loss": 0.78284466, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80460101, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3754, + "time_per_iteration": 2.584170341491699 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.0243969, + "balance_loss_mlp": 1.04882884, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.915767444367904, + "language_loss": 0.70267534, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72444952, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 3755, + "time_per_iteration": 2.458731174468994 + }, + { + "auxiliary_loss_clip": 0.01145193, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.031744, + "balance_loss_mlp": 1.0502069, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.7446757969812783, + "language_loss": 0.77373838, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79567063, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3756, + "time_per_iteration": 2.5073161125183105 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_clip": 1.02498841, + "balance_loss_mlp": 1.05014277, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.8909279955578986, + "language_loss": 0.82552433, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.847399, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3757, + "time_per_iteration": 2.471353054046631 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.02258492, + "balance_loss_mlp": 1.04810619, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8410990661161322, + "language_loss": 0.73181808, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.7536208, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3758, + "time_per_iteration": 2.5376477241516113 + }, + { + "auxiliary_loss_clip": 0.01144551, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.02985883, + "balance_loss_mlp": 1.04991663, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.0967514749881015, + "language_loss": 0.77208662, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79399836, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3759, + "time_per_iteration": 2.447608709335327 + }, + { + "auxiliary_loss_clip": 0.01141959, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02643597, + "balance_loss_mlp": 1.04806697, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.9036057015372598, + "language_loss": 0.78638428, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80824387, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3760, + "time_per_iteration": 4.231990098953247 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01007925, + "balance_loss_clip": 1.00607765, + "balance_loss_mlp": 1.02028942, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9344871733021222, + "language_loss": 0.60090166, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62152445, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.33984375, + "step": 3761, + "time_per_iteration": 4.482504367828369 + }, + { + "auxiliary_loss_clip": 0.0114253, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.02678633, + "balance_loss_mlp": 1.0478611, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.386395888426225, + "language_loss": 0.77400732, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79587454, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3762, + "time_per_iteration": 2.5162198543548584 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02395821, + "balance_loss_mlp": 1.05073345, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.10132066013886, + "language_loss": 0.78800118, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.80984461, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3763, + "time_per_iteration": 2.4578778743743896 + }, + { + "auxiliary_loss_clip": 0.01145794, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.03583384, + "balance_loss_mlp": 1.05000031, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.8659674868358982, + "language_loss": 0.91363662, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93563628, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.95703125, + "step": 3764, + "time_per_iteration": 2.536231517791748 + }, + { + "auxiliary_loss_clip": 0.01138186, + "auxiliary_loss_mlp": 0.01054666, + "balance_loss_clip": 1.03740454, + "balance_loss_mlp": 1.04773271, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.6188972360392109, + "language_loss": 0.75211406, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77404261, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 3765, + "time_per_iteration": 2.516646146774292 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.02868426, + "balance_loss_mlp": 1.04855943, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.9315012383394614, + "language_loss": 0.89618981, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91804343, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3766, + "time_per_iteration": 2.4829306602478027 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.02568591, + "balance_loss_mlp": 1.04891181, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6662033714223943, + "language_loss": 0.74710411, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76891464, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 3767, + "time_per_iteration": 2.4989218711853027 + }, + { + "auxiliary_loss_clip": 0.011397, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.02664912, + "balance_loss_mlp": 1.04619229, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4804117361030718, + "language_loss": 0.7156831, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73752159, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3768, + "time_per_iteration": 2.5078160762786865 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.05247319, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.80046116612075, + "language_loss": 0.78268003, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80466181, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3769, + "time_per_iteration": 2.5122978687286377 + }, + { + "auxiliary_loss_clip": 0.01142038, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.02465522, + "balance_loss_mlp": 1.0467639, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.7393050758681738, + "language_loss": 0.68427956, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70612001, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3770, + "time_per_iteration": 2.557098150253296 + }, + { + "auxiliary_loss_clip": 0.01141766, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02537811, + "balance_loss_mlp": 1.04682195, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6251414008252867, + "language_loss": 0.80370939, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82554382, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3771, + "time_per_iteration": 2.5156240463256836 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.0290848, + "balance_loss_mlp": 1.04606724, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.567346312954514, + "language_loss": 0.78844583, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81025243, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 3772, + "time_per_iteration": 2.539632558822632 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01005348, + "balance_loss_clip": 1.00351191, + "balance_loss_mlp": 1.02012253, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6518085485856671, + "language_loss": 0.54334348, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56392735, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.33007812, + "step": 3773, + "time_per_iteration": 3.1463003158569336 + }, + { + "auxiliary_loss_clip": 0.01136639, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02222633, + "balance_loss_mlp": 1.04712117, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.9230264173849037, + "language_loss": 0.70101082, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72276813, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3774, + "time_per_iteration": 2.5099127292633057 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.02232277, + "balance_loss_mlp": 1.04480648, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.4369678263863057, + "language_loss": 0.74585366, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76758826, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 3775, + "time_per_iteration": 2.4441745281219482 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.02311933, + "balance_loss_mlp": 1.04534245, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.175545430509675, + "language_loss": 0.8256253, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8473829, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3776, + "time_per_iteration": 2.4418301582336426 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.02125907, + "balance_loss_mlp": 1.04619908, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.75835757539417, + "language_loss": 0.83031607, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85209382, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3777, + "time_per_iteration": 2.5585062503814697 + }, + { + "auxiliary_loss_clip": 0.01137385, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.02232909, + "balance_loss_mlp": 1.04596353, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.6678368583827288, + "language_loss": 0.72658038, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74834561, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3778, + "time_per_iteration": 2.5019333362579346 + }, + { + "auxiliary_loss_clip": 0.0113896, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.02386749, + "balance_loss_mlp": 1.04576886, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.229609453971581, + "language_loss": 0.6414392, + "learning_rate": 3.605722410602591e-06, + "loss": 0.663234, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3779, + "time_per_iteration": 2.5082859992980957 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.02794909, + "balance_loss_mlp": 1.04837573, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.9715072832436495, + "language_loss": 0.70546824, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72728658, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3780, + "time_per_iteration": 2.4703643321990967 + }, + { + "auxiliary_loss_clip": 0.01140054, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_clip": 1.02689338, + "balance_loss_mlp": 1.0489254, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 2.5454366084291133, + "language_loss": 0.89717996, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91902977, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 3781, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.0113992, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.02436364, + "balance_loss_mlp": 1.04648304, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.4601522898780805, + "language_loss": 0.7434786, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76529634, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3782, + "time_per_iteration": 2.4665582180023193 + }, + { + "auxiliary_loss_clip": 0.01136804, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02587914, + "balance_loss_mlp": 1.04467201, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.6148985015615094, + "language_loss": 0.82393098, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84571576, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3783, + "time_per_iteration": 2.4820034503936768 + }, + { + "auxiliary_loss_clip": 0.01137013, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.02310586, + "balance_loss_mlp": 1.04418266, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 2.4165791890347714, + "language_loss": 0.75874048, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78051311, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3784, + "time_per_iteration": 2.5087246894836426 + }, + { + "auxiliary_loss_clip": 0.01134704, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02303135, + "balance_loss_mlp": 1.04345798, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6490497895559066, + "language_loss": 0.70716858, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72891551, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3785, + "time_per_iteration": 2.4733574390411377 + }, + { + "auxiliary_loss_clip": 0.01051525, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00252521, + "balance_loss_mlp": 1.01740241, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8187947911361427, + "language_loss": 0.61915314, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63971269, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.34179688, + "step": 3786, + "time_per_iteration": 3.0474631786346436 + }, + { + "auxiliary_loss_clip": 0.01143523, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02488649, + "balance_loss_mlp": 1.04777002, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.6740153696427247, + "language_loss": 0.86285794, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88471758, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3787, + "time_per_iteration": 2.4331281185150146 + }, + { + "auxiliary_loss_clip": 0.01137002, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04612255, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.2844293081892826, + "language_loss": 0.72555876, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74733031, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 3788, + "time_per_iteration": 2.5378167629241943 + }, + { + "auxiliary_loss_clip": 0.01136486, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.01929688, + "balance_loss_mlp": 1.04552293, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.4737623033533587, + "language_loss": 0.67524469, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69697154, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3789, + "time_per_iteration": 2.412086248397827 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.02416384, + "balance_loss_mlp": 1.04507327, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.1501364843402335, + "language_loss": 0.76075745, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78253406, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 3790, + "time_per_iteration": 2.503600835800171 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02174211, + "balance_loss_mlp": 1.04253387, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.0794940610838397, + "language_loss": 0.90613973, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92787266, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3791, + "time_per_iteration": 2.4503557682037354 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02000308, + "balance_loss_mlp": 1.04407096, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.8390004860332834, + "language_loss": 0.82869208, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85044241, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3792, + "time_per_iteration": 2.5451550483703613 + }, + { + "auxiliary_loss_clip": 0.01045824, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.01015747, + "balance_loss_mlp": 1.01168287, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1436128607221614, + "language_loss": 0.65615487, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67673355, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.34179688, + "step": 3793, + "time_per_iteration": 2.7929015159606934 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01050177, + "balance_loss_clip": 1.03241456, + "balance_loss_mlp": 1.04557967, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.282271850248546, + "language_loss": 0.77100229, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79292452, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 3794, + "time_per_iteration": 2.4882023334503174 + }, + { + "auxiliary_loss_clip": 0.01139112, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02146518, + "balance_loss_mlp": 1.04517698, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.1931228295055716, + "language_loss": 0.80724937, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82902336, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3795, + "time_per_iteration": 2.475311279296875 + }, + { + "auxiliary_loss_clip": 0.0113575, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02747297, + "balance_loss_mlp": 1.04336488, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.8416311408581074, + "language_loss": 0.77002209, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79182816, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3796, + "time_per_iteration": 2.4734761714935303 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.02236056, + "balance_loss_mlp": 1.04312813, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.506500245398156, + "language_loss": 0.9594354, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98118514, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3797, + "time_per_iteration": 2.4146203994750977 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.04537892, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6428427275001165, + "language_loss": 0.81446218, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83624852, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3798, + "time_per_iteration": 2.490849733352661 + }, + { + "auxiliary_loss_clip": 0.01137089, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.02080309, + "balance_loss_mlp": 1.04262519, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.3515161945239833, + "language_loss": 0.78744864, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.80920684, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3799, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.0332408, + "balance_loss_mlp": 1.04381084, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.655995083326211, + "language_loss": 0.75234401, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77421868, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3800, + "time_per_iteration": 2.510788917541504 + }, + { + "auxiliary_loss_clip": 0.01137174, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.04583156, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.661997570582357, + "language_loss": 0.63433349, + "learning_rate": 3.600599647297484e-06, + "loss": 0.6560958, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3801, + "time_per_iteration": 2.503643035888672 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.02027762, + "balance_loss_mlp": 1.04721296, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7846583359688928, + "language_loss": 0.81602335, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83774745, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3802, + "time_per_iteration": 4.002788782119751 + }, + { + "auxiliary_loss_clip": 0.01138233, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04454207, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7683413549342115, + "language_loss": 0.78830242, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81015933, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3803, + "time_per_iteration": 3.9494168758392334 + }, + { + "auxiliary_loss_clip": 0.01135958, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.04115725, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.6939241338011581, + "language_loss": 0.85561395, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87740004, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3804, + "time_per_iteration": 2.4504544734954834 + }, + { + "auxiliary_loss_clip": 0.01139159, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.02744436, + "balance_loss_mlp": 1.04339862, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.1651494765134736, + "language_loss": 0.76485813, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3805, + "time_per_iteration": 2.4578893184661865 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.02560234, + "balance_loss_mlp": 1.04387915, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.4014048134005628, + "language_loss": 0.79309744, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81492996, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3806, + "time_per_iteration": 2.415726900100708 + }, + { + "auxiliary_loss_clip": 0.01139425, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.02717948, + "balance_loss_mlp": 1.04547703, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.230394288716221, + "language_loss": 0.69194484, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71377647, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3807, + "time_per_iteration": 2.6051764488220215 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.03484392, + "balance_loss_mlp": 1.04811931, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.5207266425605668, + "language_loss": 0.65717816, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67915517, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3808, + "time_per_iteration": 2.463885545730591 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.02564931, + "balance_loss_mlp": 1.04470515, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8002654314964242, + "language_loss": 0.74498177, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76677001, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3809, + "time_per_iteration": 2.4587652683258057 + }, + { + "auxiliary_loss_clip": 0.01138179, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.02695227, + "balance_loss_mlp": 1.04707646, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.6413135962032894, + "language_loss": 0.81699908, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83881009, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3810, + "time_per_iteration": 2.454545736312866 + }, + { + "auxiliary_loss_clip": 0.01135521, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.02448893, + "balance_loss_mlp": 1.04428005, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.1876822434942245, + "language_loss": 0.78671384, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8084712, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9140625, + "step": 3811, + "time_per_iteration": 2.4564197063446045 + }, + { + "auxiliary_loss_clip": 0.01135961, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.0296042, + "balance_loss_mlp": 1.04317403, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.8120535445273127, + "language_loss": 0.82811391, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84994221, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3812, + "time_per_iteration": 2.4357566833496094 + }, + { + "auxiliary_loss_clip": 0.01144518, + "auxiliary_loss_mlp": 0.01051465, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04750013, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.041111828111396, + "language_loss": 0.82337058, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84533036, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.96875, + "step": 3813, + "time_per_iteration": 2.4521987438201904 + }, + { + "auxiliary_loss_clip": 0.01139715, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.04595184, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 3.1740680187078896, + "language_loss": 0.69927102, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72113466, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3814, + "time_per_iteration": 2.5528602600097656 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.02773738, + "balance_loss_mlp": 1.04310775, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.479981906508555, + "language_loss": 0.67106915, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69285899, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3815, + "time_per_iteration": 2.4768760204315186 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01052238, + "balance_loss_clip": 1.03593004, + "balance_loss_mlp": 1.04644942, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8467960453518941, + "language_loss": 0.83103681, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85295475, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3816, + "time_per_iteration": 2.507967710494995 + }, + { + "auxiliary_loss_clip": 0.0113842, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.04643357, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.1171855882825636, + "language_loss": 0.86756372, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8893379, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3817, + "time_per_iteration": 2.4445815086364746 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03278852, + "balance_loss_mlp": 1.04829407, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.403232678237585, + "language_loss": 0.75039381, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77230936, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3818, + "time_per_iteration": 2.508527994155884 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.02426732, + "balance_loss_mlp": 1.04769611, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6537639427714739, + "language_loss": 0.74597251, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76779795, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3819, + "time_per_iteration": 2.5009493827819824 + }, + { + "auxiliary_loss_clip": 0.01138376, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 1.04632187, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.815385500594849, + "language_loss": 0.80775046, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.8295821, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3820, + "time_per_iteration": 2.5374531745910645 + }, + { + "auxiliary_loss_clip": 0.01142613, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.04725921, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.0886359367899763, + "language_loss": 0.69226766, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71411359, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3821, + "time_per_iteration": 2.4539082050323486 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.01912975, + "balance_loss_mlp": 1.0466336, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.623620301878745, + "language_loss": 0.82655883, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.84831905, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3822, + "time_per_iteration": 2.5025360584259033 + }, + { + "auxiliary_loss_clip": 0.01137437, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.02264285, + "balance_loss_mlp": 1.04520607, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.581563173789708, + "language_loss": 0.66093826, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68272227, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.921875, + "step": 3823, + "time_per_iteration": 2.500643253326416 + }, + { + "auxiliary_loss_clip": 0.0104753, + "auxiliary_loss_mlp": 0.01001124, + "balance_loss_clip": 0.99913329, + "balance_loss_mlp": 1.01448655, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8191682875264555, + "language_loss": 0.56770015, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58818674, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33203125, + "step": 3824, + "time_per_iteration": 3.1365485191345215 + }, + { + "auxiliary_loss_clip": 0.01135805, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04575276, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.487273324074565, + "language_loss": 0.72840559, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75015926, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3825, + "time_per_iteration": 2.444730758666992 + }, + { + "auxiliary_loss_clip": 0.01143286, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.02810407, + "balance_loss_mlp": 1.04978526, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8892090994393747, + "language_loss": 0.87760615, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89949936, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3826, + "time_per_iteration": 2.492682456970215 + }, + { + "auxiliary_loss_clip": 0.01142911, + "auxiliary_loss_mlp": 0.01043844, + "balance_loss_clip": 1.0258677, + "balance_loss_mlp": 1.04683542, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.6663888482282623, + "language_loss": 0.81568289, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8375504, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3827, + "time_per_iteration": 2.488593578338623 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.02750623, + "balance_loss_mlp": 1.04553437, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.8456206141648608, + "language_loss": 0.86791205, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88970977, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3828, + "time_per_iteration": 2.4386606216430664 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01049169, + "balance_loss_clip": 1.03147864, + "balance_loss_mlp": 1.04512644, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.106420485404446, + "language_loss": 0.70638877, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72826439, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.93359375, + "step": 3829, + "time_per_iteration": 2.475399971008301 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02241933, + "balance_loss_mlp": 1.05011487, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5719627508253273, + "language_loss": 0.84045994, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86223942, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3830, + "time_per_iteration": 2.4943718910217285 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.03172636, + "balance_loss_mlp": 1.04637957, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.733206127117623, + "language_loss": 0.66863495, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69051576, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3831, + "time_per_iteration": 2.6513662338256836 + }, + { + "auxiliary_loss_clip": 0.01141151, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.04735637, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.238850649877041, + "language_loss": 0.75253022, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77436894, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3832, + "time_per_iteration": 2.4889180660247803 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.02515745, + "balance_loss_mlp": 1.04709673, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.8583815246829203, + "language_loss": 0.87474239, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89656878, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.92578125, + "step": 3833, + "time_per_iteration": 2.46744966506958 + }, + { + "auxiliary_loss_clip": 0.01140821, + "auxiliary_loss_mlp": 0.01047215, + "balance_loss_clip": 1.02950096, + "balance_loss_mlp": 1.0478369, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 3.2120713643012206, + "language_loss": 0.74875945, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77063978, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3834, + "time_per_iteration": 2.485504627227783 + }, + { + "auxiliary_loss_clip": 0.0113943, + "auxiliary_loss_mlp": 0.01051682, + "balance_loss_clip": 1.03408706, + "balance_loss_mlp": 1.0484879, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.820281268490984, + "language_loss": 0.85338157, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87529278, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3835, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.01146651, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_clip": 1.03142083, + "balance_loss_mlp": 1.04814029, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.799799470431086, + "language_loss": 0.81974924, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84170854, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.984375, + "step": 3836, + "time_per_iteration": 2.464657783508301 + }, + { + "auxiliary_loss_clip": 0.0114557, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.02726793, + "balance_loss_mlp": 1.05202293, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.7793450137018207, + "language_loss": 0.79603267, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81792545, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3837, + "time_per_iteration": 2.4715559482574463 + }, + { + "auxiliary_loss_clip": 0.01055276, + "auxiliary_loss_mlp": 0.01017826, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.02046371, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9409846751082755, + "language_loss": 0.65487945, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67561042, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.34765625, + "step": 3838, + "time_per_iteration": 2.9852375984191895 + }, + { + "auxiliary_loss_clip": 0.01139351, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.03131008, + "balance_loss_mlp": 1.04721856, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.6310373190732648, + "language_loss": 0.7527796, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77465028, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3839, + "time_per_iteration": 2.4290778636932373 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.02486694, + "balance_loss_mlp": 1.0510757, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 4.016837458595543, + "language_loss": 0.68691337, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70878816, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3840, + "time_per_iteration": 2.456422805786133 + }, + { + "auxiliary_loss_clip": 0.01143425, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.04936612, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.1574295618121426, + "language_loss": 0.79412574, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81595719, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9375, + "step": 3841, + "time_per_iteration": 2.4762818813323975 + }, + { + "auxiliary_loss_clip": 0.01141641, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02557695, + "balance_loss_mlp": 1.04872346, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 5.070488540070664, + "language_loss": 0.83171731, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85354722, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3842, + "time_per_iteration": 2.4908032417297363 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.0255568, + "balance_loss_mlp": 1.04567111, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.6842769818445011, + "language_loss": 0.66523731, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68706262, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3843, + "time_per_iteration": 2.6503937244415283 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.02642977, + "balance_loss_mlp": 1.04793119, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.8910129932977493, + "language_loss": 0.77445257, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79627681, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 3844, + "time_per_iteration": 5.4645676612854 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.04744804, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6615026518232119, + "language_loss": 0.77974623, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80158317, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3845, + "time_per_iteration": 2.467289686203003 + }, + { + "auxiliary_loss_clip": 0.01133475, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.01687717, + "balance_loss_mlp": 1.04577661, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.1438137502119425, + "language_loss": 0.76064527, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78230006, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 3846, + "time_per_iteration": 2.4985382556915283 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.02638626, + "balance_loss_mlp": 1.04725194, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.4609763976845556, + "language_loss": 0.69493651, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71677887, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3847, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.01048129, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.00074661, + "balance_loss_mlp": 1.01598144, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7927409416341922, + "language_loss": 0.61051595, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63102281, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3203125, + "step": 3848, + "time_per_iteration": 2.981518030166626 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.04593349, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.568019101440284, + "language_loss": 0.7746805, + "learning_rate": 3.589320871234923e-06, + "loss": 0.79651785, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3849, + "time_per_iteration": 2.450693130493164 + }, + { + "auxiliary_loss_clip": 0.01139635, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04533124, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.9223002445017061, + "language_loss": 0.71673942, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73856068, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3850, + "time_per_iteration": 2.589395761489868 + }, + { + "auxiliary_loss_clip": 0.01137166, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.01870215, + "balance_loss_mlp": 1.04362154, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 3.8422038584857665, + "language_loss": 0.75846308, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78018856, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3851, + "time_per_iteration": 2.495729446411133 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.0234046, + "balance_loss_mlp": 1.04747272, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.914141324585442, + "language_loss": 0.69797802, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71977121, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3852, + "time_per_iteration": 2.478408098220825 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.0206399, + "balance_loss_mlp": 1.04643583, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.1861380100726144, + "language_loss": 0.67030561, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69208378, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94140625, + "step": 3853, + "time_per_iteration": 2.4445838928222656 + }, + { + "auxiliary_loss_clip": 0.01141194, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.01815248, + "balance_loss_mlp": 1.04680121, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.6671703506367506, + "language_loss": 0.79851103, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82027847, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3854, + "time_per_iteration": 2.5455782413482666 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04726899, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 3.8560715318244556, + "language_loss": 0.64987147, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67176461, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 3855, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01140829, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.02753139, + "balance_loss_mlp": 1.04570055, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.1096123404526623, + "language_loss": 0.70711654, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.72896564, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3856, + "time_per_iteration": 2.5024092197418213 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02062488, + "balance_loss_mlp": 1.0464257, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 6.089384897844753, + "language_loss": 0.76997125, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79170084, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.91015625, + "step": 3857, + "time_per_iteration": 2.5962576866149902 + }, + { + "auxiliary_loss_clip": 0.01143962, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.03018308, + "balance_loss_mlp": 1.0477798, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 3.478057752262005, + "language_loss": 0.91006696, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93199098, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.96484375, + "step": 3858, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01136894, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.04679012, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 2.1437168922033747, + "language_loss": 0.75995493, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78175128, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 3859, + "time_per_iteration": 2.485426187515259 + }, + { + "auxiliary_loss_clip": 0.01136619, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.01962614, + "balance_loss_mlp": 1.04423487, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.9055462071213993, + "language_loss": 0.84061682, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86234951, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3860, + "time_per_iteration": 2.4607324600219727 + }, + { + "auxiliary_loss_clip": 0.01137991, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.02317619, + "balance_loss_mlp": 1.04656291, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.1337823805291047, + "language_loss": 0.82972974, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85151279, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3861, + "time_per_iteration": 2.451805591583252 + }, + { + "auxiliary_loss_clip": 0.01142125, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.02702415, + "balance_loss_mlp": 1.04800034, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.8456518711772996, + "language_loss": 0.85918242, + "learning_rate": 3.586242265438576e-06, + "loss": 0.8810426, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3862, + "time_per_iteration": 2.4582395553588867 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.0468179, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.3833481647146872, + "language_loss": 0.7492758, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7710824, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8984375, + "step": 3863, + "time_per_iteration": 2.496985912322998 + }, + { + "auxiliary_loss_clip": 0.01139904, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.02723408, + "balance_loss_mlp": 1.05037498, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 2.003739732436234, + "language_loss": 0.74640852, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76823521, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 3864, + "time_per_iteration": 2.440204381942749 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.01952517, + "balance_loss_mlp": 1.0468204, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 3.940820538439298, + "language_loss": 0.70690906, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72865754, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3865, + "time_per_iteration": 2.598194122314453 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01049623, + "balance_loss_clip": 1.03091884, + "balance_loss_mlp": 1.04987264, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.9658537667403149, + "language_loss": 0.94853866, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97052193, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3866, + "time_per_iteration": 2.496276617050171 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.04817796, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.6667540210019123, + "language_loss": 0.72528732, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.74707949, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 3867, + "time_per_iteration": 2.4933414459228516 + }, + { + "auxiliary_loss_clip": 0.01140693, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.02625418, + "balance_loss_mlp": 1.04734945, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8421111702540602, + "language_loss": 0.82411921, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84596509, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.93359375, + "step": 3868, + "time_per_iteration": 2.4994540214538574 + }, + { + "auxiliary_loss_clip": 0.01136829, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.02433491, + "balance_loss_mlp": 1.04700828, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.815886356300666, + "language_loss": 0.73335075, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75512362, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3869, + "time_per_iteration": 2.4486095905303955 + }, + { + "auxiliary_loss_clip": 0.01139645, + "auxiliary_loss_mlp": 0.01047185, + "balance_loss_clip": 1.03108525, + "balance_loss_mlp": 1.04929376, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.6948965109205438, + "language_loss": 0.79564929, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81751764, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3870, + "time_per_iteration": 2.506114959716797 + }, + { + "auxiliary_loss_clip": 0.01143066, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.02574801, + "balance_loss_mlp": 1.04845953, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 3.2368167151878797, + "language_loss": 0.70599115, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72785389, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3871, + "time_per_iteration": 2.455266237258911 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.01055983, + "balance_loss_clip": 1.03674293, + "balance_loss_mlp": 1.05011845, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.2694181422477313, + "language_loss": 0.69087327, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71289003, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.95703125, + "step": 3872, + "time_per_iteration": 2.482089042663574 + }, + { + "auxiliary_loss_clip": 0.01147162, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.0271188, + "balance_loss_mlp": 1.04984593, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.4965805681858408, + "language_loss": 0.78046703, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80239034, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 3873, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.01009657, + "balance_loss_clip": 1.00777328, + "balance_loss_mlp": 1.02347898, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.841863213022928, + "language_loss": 0.60519493, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6258297, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3046875, + "step": 3874, + "time_per_iteration": 2.955524444580078 + }, + { + "auxiliary_loss_clip": 0.01142096, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.02695727, + "balance_loss_mlp": 1.04998708, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.0817330720741287, + "language_loss": 0.8082279, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83009154, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3875, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01141065, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.04931068, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.6586054731564495, + "language_loss": 0.60997009, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63178027, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.91796875, + "step": 3876, + "time_per_iteration": 2.5234174728393555 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.0104992, + "balance_loss_clip": 1.0319072, + "balance_loss_mlp": 1.05151403, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.9912662806979935, + "language_loss": 0.70357525, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72551912, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3877, + "time_per_iteration": 2.5117876529693604 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_clip": 1.02984059, + "balance_loss_mlp": 1.04846656, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.20617127152986, + "language_loss": 0.81169856, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83360565, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3878, + "time_per_iteration": 2.418745517730713 + }, + { + "auxiliary_loss_clip": 0.01145943, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.04905999, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 2.449565501872003, + "language_loss": 0.74765849, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76959032, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3879, + "time_per_iteration": 2.627453088760376 + }, + { + "auxiliary_loss_clip": 0.0113984, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02597582, + "balance_loss_mlp": 1.04611635, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.3281305870509685, + "language_loss": 0.89896512, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92079782, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3880, + "time_per_iteration": 2.529181957244873 + }, + { + "auxiliary_loss_clip": 0.01144521, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05019975, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.7300006336865508, + "language_loss": 0.72026277, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74217331, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3881, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.01138796, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02798867, + "balance_loss_mlp": 1.04610527, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5765664683306326, + "language_loss": 0.67988127, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70171714, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3882, + "time_per_iteration": 2.5134541988372803 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04020119, + "balance_loss_mlp": 1.0481658, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 3.2831975264627116, + "language_loss": 0.76596051, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78797704, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3883, + "time_per_iteration": 2.5556836128234863 + }, + { + "auxiliary_loss_clip": 0.01046918, + "auxiliary_loss_mlp": 0.01002528, + "balance_loss_clip": 1.00059688, + "balance_loss_mlp": 1.01619315, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7953130928556094, + "language_loss": 0.59102494, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6115194, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3884, + "time_per_iteration": 3.210090398788452 + }, + { + "auxiliary_loss_clip": 0.01139917, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.0196687, + "balance_loss_mlp": 1.04723644, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 3.4795297654408617, + "language_loss": 0.80128157, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82303953, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3885, + "time_per_iteration": 4.129857301712036 + }, + { + "auxiliary_loss_clip": 0.01139579, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04763317, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.392049069504846, + "language_loss": 0.88482237, + "learning_rate": 3.580531993380261e-06, + "loss": 0.9066177, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3886, + "time_per_iteration": 4.002579689025879 + }, + { + "auxiliary_loss_clip": 0.01143892, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02452922, + "balance_loss_mlp": 1.04953825, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 2.2740188667520815, + "language_loss": 0.73199034, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75384426, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3887, + "time_per_iteration": 2.5730721950531006 + }, + { + "auxiliary_loss_clip": 0.0114256, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02709508, + "balance_loss_mlp": 1.04827881, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8689872769958875, + "language_loss": 0.84098816, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86285174, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.94140625, + "step": 3888, + "time_per_iteration": 2.526090145111084 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01051088, + "balance_loss_clip": 1.03400528, + "balance_loss_mlp": 1.04775357, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 5.34722340994348, + "language_loss": 0.87174153, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89365447, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3889, + "time_per_iteration": 2.465535879135132 + }, + { + "auxiliary_loss_clip": 0.01143335, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.02659607, + "balance_loss_mlp": 1.04914057, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 4.26980733686294, + "language_loss": 0.7660414, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78790414, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.94140625, + "step": 3890, + "time_per_iteration": 2.4164645671844482 + }, + { + "auxiliary_loss_clip": 0.01144006, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.02940536, + "balance_loss_mlp": 1.05018783, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 3.12388753004446, + "language_loss": 0.73396742, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75587177, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3891, + "time_per_iteration": 2.692251443862915 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04672241, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.6638493558493535, + "language_loss": 0.82791233, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84968084, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8984375, + "step": 3892, + "time_per_iteration": 2.4657654762268066 + }, + { + "auxiliary_loss_clip": 0.01143467, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_clip": 1.02550626, + "balance_loss_mlp": 1.04892194, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 2.124834647136637, + "language_loss": 0.64928782, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67114866, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3893, + "time_per_iteration": 2.6640076637268066 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02056575, + "balance_loss_mlp": 1.04930127, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.3013698222001753, + "language_loss": 0.79011095, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81188488, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 3894, + "time_per_iteration": 2.4596238136291504 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02772284, + "balance_loss_mlp": 1.0473485, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4729608662155413, + "language_loss": 0.81608742, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83793032, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3895, + "time_per_iteration": 2.5229499340057373 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.03210139, + "balance_loss_mlp": 1.04895353, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.370345363223057, + "language_loss": 0.79861861, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82052004, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3896, + "time_per_iteration": 2.4219553470611572 + }, + { + "auxiliary_loss_clip": 0.01142956, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02507555, + "balance_loss_mlp": 1.04863656, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6083647422684384, + "language_loss": 0.83279634, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85465348, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3897, + "time_per_iteration": 2.497347593307495 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.03093636, + "balance_loss_mlp": 1.04880857, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 2.0551194275294784, + "language_loss": 0.79281437, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8147409, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3898, + "time_per_iteration": 2.4275295734405518 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.0316844, + "balance_loss_mlp": 1.05034626, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 3.329769754331659, + "language_loss": 0.73955798, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76142585, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 3899, + "time_per_iteration": 2.5017077922821045 + }, + { + "auxiliary_loss_clip": 0.01141437, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.03412604, + "balance_loss_mlp": 1.04896975, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.8374782290855665, + "language_loss": 0.75695914, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77888358, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3900, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.03224266, + "balance_loss_mlp": 1.04685295, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.9641187800197561, + "language_loss": 0.66949147, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69135845, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3901, + "time_per_iteration": 2.5052907466888428 + }, + { + "auxiliary_loss_clip": 0.01050259, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01646185, + "balance_loss_mlp": 1.01950026, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7670843237762338, + "language_loss": 0.58209252, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6027782, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.30859375, + "step": 3902, + "time_per_iteration": 3.0522701740264893 + }, + { + "auxiliary_loss_clip": 0.01140756, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.02684176, + "balance_loss_mlp": 1.04932666, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.9913375770157136, + "language_loss": 0.80411339, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82596278, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 3903, + "time_per_iteration": 2.515796184539795 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.02274299, + "balance_loss_mlp": 1.04670942, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 3.712536549247666, + "language_loss": 0.82183945, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84362817, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3904, + "time_per_iteration": 2.48119854927063 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.0104346, + "balance_loss_clip": 1.02642536, + "balance_loss_mlp": 1.05013537, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9990680719867946, + "language_loss": 0.7137326, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7355758, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3905, + "time_per_iteration": 2.494558811187744 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.04811251, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.865256832649412, + "language_loss": 0.70834756, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73007655, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3906, + "time_per_iteration": 2.5057764053344727 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.04728532, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.129912307166789, + "language_loss": 0.73542202, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75724012, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3907, + "time_per_iteration": 2.5734074115753174 + }, + { + "auxiliary_loss_clip": 0.01141507, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.03004074, + "balance_loss_mlp": 1.04927719, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.7646530569469054, + "language_loss": 0.72807813, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74996883, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3908, + "time_per_iteration": 2.438422441482544 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01002133, + "balance_loss_clip": 1.00030959, + "balance_loss_mlp": 1.01835775, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0194055540826834, + "language_loss": 0.73271406, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75322783, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.30859375, + "step": 3909, + "time_per_iteration": 2.8451788425445557 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02607155, + "balance_loss_mlp": 1.04842734, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5487453833335116, + "language_loss": 0.87906706, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9008913, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3910, + "time_per_iteration": 2.4648385047912598 + }, + { + "auxiliary_loss_clip": 0.01141916, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02370465, + "balance_loss_mlp": 1.04950166, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.1910966534760297, + "language_loss": 0.75809109, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.7799111, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3911, + "time_per_iteration": 2.4715898036956787 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02729297, + "balance_loss_mlp": 1.04807627, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.9083148186883727, + "language_loss": 0.81775904, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83955097, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 3912, + "time_per_iteration": 2.4627628326416016 + }, + { + "auxiliary_loss_clip": 0.01139949, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04939759, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.7554989092460516, + "language_loss": 0.71664345, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73854995, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 3913, + "time_per_iteration": 2.5080020427703857 + }, + { + "auxiliary_loss_clip": 0.01147528, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.02691996, + "balance_loss_mlp": 1.05220175, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7040704955860875, + "language_loss": 0.75903499, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78096056, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3914, + "time_per_iteration": 2.487429618835449 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02307451, + "balance_loss_mlp": 1.05093837, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.554647654086476, + "language_loss": 0.89353001, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9153496, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 3915, + "time_per_iteration": 2.500753402709961 + }, + { + "auxiliary_loss_clip": 0.01044736, + "auxiliary_loss_mlp": 0.01003661, + "balance_loss_clip": 1.00158656, + "balance_loss_mlp": 1.0141747, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8049654288159457, + "language_loss": 0.5935356, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61401957, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.3046875, + "step": 3916, + "time_per_iteration": 2.9926259517669678 + }, + { + "auxiliary_loss_clip": 0.01042644, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00075865, + "balance_loss_mlp": 1.01226258, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7742950949727582, + "language_loss": 0.49486533, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51532036, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.3046875, + "step": 3917, + "time_per_iteration": 3.085294723510742 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.03615093, + "balance_loss_mlp": 1.04923129, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.000752484300541, + "language_loss": 0.76012552, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78207517, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 3918, + "time_per_iteration": 2.4883201122283936 + }, + { + "auxiliary_loss_clip": 0.01145359, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.02805305, + "balance_loss_mlp": 1.04997587, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.209135495431813, + "language_loss": 0.68728662, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.709185, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 3919, + "time_per_iteration": 2.4489476680755615 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.02216101, + "balance_loss_mlp": 1.04864836, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.8210843900818243, + "language_loss": 0.70324695, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72501087, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 3920, + "time_per_iteration": 2.6011908054351807 + }, + { + "auxiliary_loss_clip": 0.01141332, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.03128195, + "balance_loss_mlp": 1.05122209, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6333300745229378, + "language_loss": 0.77596343, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79784632, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3921, + "time_per_iteration": 2.498924732208252 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.02615058, + "balance_loss_mlp": 1.05108023, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.5438781918161375, + "language_loss": 0.7561245, + "learning_rate": 3.571901895946612e-06, + "loss": 0.7779727, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3922, + "time_per_iteration": 2.467103958129883 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02583599, + "balance_loss_mlp": 1.0489881, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.3317912313524625, + "language_loss": 0.80016744, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82196772, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3923, + "time_per_iteration": 2.5075273513793945 + }, + { + "auxiliary_loss_clip": 0.01138213, + "auxiliary_loss_mlp": 0.01046068, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 1.04845715, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.9692150152538963, + "language_loss": 0.74753797, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76938081, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3924, + "time_per_iteration": 2.442448377609253 + }, + { + "auxiliary_loss_clip": 0.01141205, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.03013766, + "balance_loss_mlp": 1.04995513, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 2.1681544357284093, + "language_loss": 0.82770467, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84957814, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3925, + "time_per_iteration": 2.44718337059021 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01047882, + "balance_loss_clip": 1.03100252, + "balance_loss_mlp": 1.04645014, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.8844556004317345, + "language_loss": 0.59408414, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61594486, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91796875, + "step": 3926, + "time_per_iteration": 2.4840757846832275 + }, + { + "auxiliary_loss_clip": 0.01135063, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.02977526, + "balance_loss_mlp": 1.04721665, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.967091588265342, + "language_loss": 0.71317631, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73498082, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 3927, + "time_per_iteration": 4.117234945297241 + }, + { + "auxiliary_loss_clip": 0.01137568, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_clip": 1.0295651, + "balance_loss_mlp": 1.04787612, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8263460078369782, + "language_loss": 0.75102496, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77284467, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8984375, + "step": 3928, + "time_per_iteration": 3.9637200832366943 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01048389, + "balance_loss_clip": 1.03086567, + "balance_loss_mlp": 1.04693556, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.885999758146942, + "language_loss": 0.81520462, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83706343, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3929, + "time_per_iteration": 2.499310255050659 + }, + { + "auxiliary_loss_clip": 0.01146116, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.02998328, + "balance_loss_mlp": 1.04974854, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 4.669381706210694, + "language_loss": 0.7194528, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74139249, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3930, + "time_per_iteration": 2.4964945316314697 + }, + { + "auxiliary_loss_clip": 0.01137432, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.02245224, + "balance_loss_mlp": 1.046561, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.489267518834959, + "language_loss": 0.73764896, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7594136, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3931, + "time_per_iteration": 2.6283528804779053 + }, + { + "auxiliary_loss_clip": 0.01140852, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.02245522, + "balance_loss_mlp": 1.04971111, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.06419219579993, + "language_loss": 0.8026945, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82450092, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3932, + "time_per_iteration": 2.4901018142700195 + }, + { + "auxiliary_loss_clip": 0.01138855, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02483916, + "balance_loss_mlp": 1.05032694, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5491195596348342, + "language_loss": 0.85760093, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87938541, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8828125, + "step": 3933, + "time_per_iteration": 2.5625483989715576 + }, + { + "auxiliary_loss_clip": 0.01146232, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.02008784, + "balance_loss_mlp": 1.0532943, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0322099534023685, + "language_loss": 0.8277775, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84961879, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3934, + "time_per_iteration": 2.512068748474121 + }, + { + "auxiliary_loss_clip": 0.01141394, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.04977798, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.774494675769988, + "language_loss": 0.7864846, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80827636, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 3935, + "time_per_iteration": 2.4996352195739746 + }, + { + "auxiliary_loss_clip": 0.01138141, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.01913905, + "balance_loss_mlp": 1.04973102, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7164724890649055, + "language_loss": 0.79656923, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81830108, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3936, + "time_per_iteration": 2.4868710041046143 + }, + { + "auxiliary_loss_clip": 0.01138439, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.0192436, + "balance_loss_mlp": 1.04798818, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4334555797897097, + "language_loss": 0.78783411, + "learning_rate": 3.568283198083826e-06, + "loss": 0.80958092, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3937, + "time_per_iteration": 2.499565362930298 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02244997, + "balance_loss_mlp": 1.04970455, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.078138882715826, + "language_loss": 0.85105085, + "learning_rate": 3.568041475462147e-06, + "loss": 0.8727901, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 3938, + "time_per_iteration": 2.449214220046997 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.0285933, + "balance_loss_mlp": 1.04824734, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.4851234695326423, + "language_loss": 0.93872499, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96052349, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3939, + "time_per_iteration": 2.415891647338867 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02598429, + "balance_loss_mlp": 1.04769599, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6764835140151866, + "language_loss": 0.8238095, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84565216, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3940, + "time_per_iteration": 2.47468900680542 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01042824, + "balance_loss_clip": 1.02592003, + "balance_loss_mlp": 1.04990602, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2107440191497054, + "language_loss": 0.88986713, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91174555, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3941, + "time_per_iteration": 2.455631971359253 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.02580976, + "balance_loss_mlp": 1.04538155, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.1526885300024072, + "language_loss": 0.84676927, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86856836, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3942, + "time_per_iteration": 2.43743634223938 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.04840159, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8547641010298248, + "language_loss": 0.80905575, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83091086, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.921875, + "step": 3943, + "time_per_iteration": 2.5058658123016357 + }, + { + "auxiliary_loss_clip": 0.01143585, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.02084267, + "balance_loss_mlp": 1.04731488, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.308079684052438, + "language_loss": 0.67493033, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69675827, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3944, + "time_per_iteration": 2.4276273250579834 + }, + { + "auxiliary_loss_clip": 0.01144217, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.02116549, + "balance_loss_mlp": 1.05084419, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 2.061169456768298, + "language_loss": 0.75421506, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77604151, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3945, + "time_per_iteration": 2.474323272705078 + }, + { + "auxiliary_loss_clip": 0.01137318, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02304149, + "balance_loss_mlp": 1.0469377, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.6081639136691026, + "language_loss": 0.63469779, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65646303, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3946, + "time_per_iteration": 2.5087931156158447 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.02186346, + "balance_loss_mlp": 1.04692435, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.27613511663784, + "language_loss": 0.77508283, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79684764, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 3947, + "time_per_iteration": 2.4716949462890625 + }, + { + "auxiliary_loss_clip": 0.01141281, + "auxiliary_loss_mlp": 0.0104192, + "balance_loss_clip": 1.02496827, + "balance_loss_mlp": 1.05008841, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.6255497375782806, + "language_loss": 0.80575311, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8275851, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3948, + "time_per_iteration": 2.5750784873962402 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.02715611, + "balance_loss_mlp": 1.04736018, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.0638215262656696, + "language_loss": 0.80578661, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82761467, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3949, + "time_per_iteration": 2.512665271759033 + }, + { + "auxiliary_loss_clip": 0.01138607, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.02382135, + "balance_loss_mlp": 1.04584646, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.8976071400358168, + "language_loss": 0.73124689, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75303924, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3950, + "time_per_iteration": 2.4842302799224854 + }, + { + "auxiliary_loss_clip": 0.01135058, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.01944709, + "balance_loss_mlp": 1.04712903, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.0688047231241247, + "language_loss": 0.73064256, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75233537, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3951, + "time_per_iteration": 2.5215439796447754 + }, + { + "auxiliary_loss_clip": 0.01144126, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.0507673, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7591828710207016, + "language_loss": 0.73658371, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75842535, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3952, + "time_per_iteration": 2.550182342529297 + }, + { + "auxiliary_loss_clip": 0.0113686, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.02213275, + "balance_loss_mlp": 1.04537988, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.6791264380286672, + "language_loss": 0.71064484, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73239112, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9140625, + "step": 3953, + "time_per_iteration": 2.530381202697754 + }, + { + "auxiliary_loss_clip": 0.01140701, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.02522552, + "balance_loss_mlp": 1.04806364, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9696108021357461, + "language_loss": 0.81686246, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83869451, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3954, + "time_per_iteration": 2.491629123687744 + }, + { + "auxiliary_loss_clip": 0.01141999, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_clip": 1.02915251, + "balance_loss_mlp": 1.04870319, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.155968963382196, + "language_loss": 0.65756261, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.67945445, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3955, + "time_per_iteration": 2.4659719467163086 + }, + { + "auxiliary_loss_clip": 0.01138242, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.04739583, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.3846492045019327, + "language_loss": 0.83788121, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85979581, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3956, + "time_per_iteration": 2.48734712600708 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.02471578, + "balance_loss_mlp": 1.04647636, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.1805686912335656, + "language_loss": 0.85228634, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.8740322, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3957, + "time_per_iteration": 2.50199294090271 + }, + { + "auxiliary_loss_clip": 0.01139099, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.02686596, + "balance_loss_mlp": 1.04807806, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0218180107915757, + "language_loss": 0.70133704, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72314632, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.91015625, + "step": 3958, + "time_per_iteration": 2.4798173904418945 + }, + { + "auxiliary_loss_clip": 0.01136893, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.02530742, + "balance_loss_mlp": 1.04581285, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 3.373562251556634, + "language_loss": 0.65834582, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68014508, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 3959, + "time_per_iteration": 2.4558637142181396 + }, + { + "auxiliary_loss_clip": 0.01138452, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.02565885, + "balance_loss_mlp": 1.04832602, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7230243338870097, + "language_loss": 0.72128749, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74308968, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3960, + "time_per_iteration": 2.4831748008728027 + }, + { + "auxiliary_loss_clip": 0.01139565, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.0249052, + "balance_loss_mlp": 1.04867244, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.8711627571775973, + "language_loss": 0.74181205, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7636202, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.91015625, + "step": 3961, + "time_per_iteration": 2.5167927742004395 + }, + { + "auxiliary_loss_clip": 0.01138898, + "auxiliary_loss_mlp": 0.0105126, + "balance_loss_clip": 1.03318763, + "balance_loss_mlp": 1.04605162, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.643011810367893, + "language_loss": 0.66067994, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68258154, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 3962, + "time_per_iteration": 2.4900338649749756 + }, + { + "auxiliary_loss_clip": 0.01138484, + "auxiliary_loss_mlp": 0.01050468, + "balance_loss_clip": 1.03387976, + "balance_loss_mlp": 1.04738379, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.7740384877146562, + "language_loss": 0.74581182, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76770139, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3963, + "time_per_iteration": 2.5409018993377686 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.02899039, + "balance_loss_mlp": 1.0498383, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.0190521185084753, + "language_loss": 0.76898873, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79087293, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3964, + "time_per_iteration": 2.492861270904541 + }, + { + "auxiliary_loss_clip": 0.01137102, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.02911341, + "balance_loss_mlp": 1.04792333, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.0459212281672956, + "language_loss": 0.71593058, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73775077, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 3965, + "time_per_iteration": 2.5120911598205566 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.03158259, + "balance_loss_mlp": 1.04674065, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.8902557347099018, + "language_loss": 0.78008091, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80190015, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3966, + "time_per_iteration": 2.4576594829559326 + }, + { + "auxiliary_loss_clip": 0.01135801, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.02656794, + "balance_loss_mlp": 1.04652119, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.8460709531404, + "language_loss": 0.68860286, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71038377, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.89453125, + "step": 3967, + "time_per_iteration": 2.484840154647827 + }, + { + "auxiliary_loss_clip": 0.01137019, + "auxiliary_loss_mlp": 0.01053581, + "balance_loss_clip": 1.03739274, + "balance_loss_mlp": 1.04645443, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 2.11266161128335, + "language_loss": 0.67849773, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70040375, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3968, + "time_per_iteration": 2.441445827484131 + }, + { + "auxiliary_loss_clip": 0.01134651, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.02318573, + "balance_loss_mlp": 1.0451827, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.8948052650888014, + "language_loss": 0.76742399, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78916001, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.89453125, + "step": 3969, + "time_per_iteration": 5.413191318511963 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.04734492, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.7243772241637263, + "language_loss": 0.76300085, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78475308, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3970, + "time_per_iteration": 2.4792258739471436 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02545094, + "balance_loss_mlp": 1.04645324, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 3.3207921386663584, + "language_loss": 0.85399735, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87580258, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3971, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01017218, + "balance_loss_clip": 1.01547742, + "balance_loss_mlp": 1.02590835, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7461637295582213, + "language_loss": 0.62814003, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64887029, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.29882812, + "step": 3972, + "time_per_iteration": 3.173640012741089 + }, + { + "auxiliary_loss_clip": 0.0113938, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.02119696, + "balance_loss_mlp": 1.04922092, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.9456864585596687, + "language_loss": 0.8170895, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.8388539, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90234375, + "step": 3973, + "time_per_iteration": 2.4529452323913574 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.02849591, + "balance_loss_mlp": 1.04869485, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.6994626560625323, + "language_loss": 0.79299271, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81481481, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 3974, + "time_per_iteration": 2.5395772457122803 + }, + { + "auxiliary_loss_clip": 0.01139215, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.04858148, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.8925619228877844, + "language_loss": 0.84428573, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86606121, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 3975, + "time_per_iteration": 2.430361032485962 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.02215612, + "balance_loss_mlp": 1.0471369, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.06825719132721, + "language_loss": 0.8375293, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85925817, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87890625, + "step": 3976, + "time_per_iteration": 2.480534791946411 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01798213, + "balance_loss_mlp": 1.04606938, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.747752931490835, + "language_loss": 0.74532628, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76697731, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8671875, + "step": 3977, + "time_per_iteration": 2.4641239643096924 + }, + { + "auxiliary_loss_clip": 0.01138905, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.02818775, + "balance_loss_mlp": 1.04930067, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6638092474338306, + "language_loss": 0.72395146, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74579227, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 3978, + "time_per_iteration": 2.5007903575897217 + }, + { + "auxiliary_loss_clip": 0.01143288, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.05204654, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.0169903221822683, + "language_loss": 0.78654587, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80840027, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3979, + "time_per_iteration": 2.5006349086761475 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.02769148, + "balance_loss_mlp": 1.04762173, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6987462202935262, + "language_loss": 0.81945407, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84125668, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 3980, + "time_per_iteration": 2.5287020206451416 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.01974702, + "balance_loss_mlp": 1.04967999, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.749461413213386, + "language_loss": 0.8401112, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86183953, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 3981, + "time_per_iteration": 2.466660261154175 + }, + { + "auxiliary_loss_clip": 0.01141626, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04951072, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 3.6241006318049864, + "language_loss": 0.76872683, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79059052, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 3982, + "time_per_iteration": 2.558145046234131 + }, + { + "auxiliary_loss_clip": 0.01135351, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.02811027, + "balance_loss_mlp": 1.04844236, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 6.059829142106342, + "language_loss": 0.77878481, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80057788, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 3983, + "time_per_iteration": 2.4443132877349854 + }, + { + "auxiliary_loss_clip": 0.01136897, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.02446592, + "balance_loss_mlp": 1.04759789, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9981470653963032, + "language_loss": 0.73163629, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75341582, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3984, + "time_per_iteration": 2.491344690322876 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.02860713, + "balance_loss_mlp": 1.04674625, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.064924146489818, + "language_loss": 0.79049474, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81232572, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3985, + "time_per_iteration": 2.4587738513946533 + }, + { + "auxiliary_loss_clip": 0.01139616, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04980683, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.0182764415160563, + "language_loss": 0.73312742, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7549386, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 3986, + "time_per_iteration": 2.5608811378479004 + }, + { + "auxiliary_loss_clip": 0.0114189, + "auxiliary_loss_mlp": 0.01051013, + "balance_loss_clip": 1.03495562, + "balance_loss_mlp": 1.04923773, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.2624046500679333, + "language_loss": 0.87836051, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90028954, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.92578125, + "step": 3987, + "time_per_iteration": 2.461778402328491 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.02161288, + "balance_loss_mlp": 1.04831004, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.3750633167266306, + "language_loss": 0.8308624, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85254467, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 3988, + "time_per_iteration": 2.4527788162231445 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.04686844, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.649806875732991, + "language_loss": 0.85145879, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87320346, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 3989, + "time_per_iteration": 2.43949031829834 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02249837, + "balance_loss_mlp": 1.04763699, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.0784071273800944, + "language_loss": 0.84493041, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86665809, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 3990, + "time_per_iteration": 2.4476051330566406 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.02507186, + "balance_loss_mlp": 1.0463922, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 3.585202907729512, + "language_loss": 0.75312221, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77485824, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 3991, + "time_per_iteration": 2.502324104309082 + }, + { + "auxiliary_loss_clip": 0.01050073, + "auxiliary_loss_mlp": 0.01009423, + "balance_loss_clip": 1.00774217, + "balance_loss_mlp": 1.02049088, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8894590829003932, + "language_loss": 0.63734841, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65794337, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.296875, + "step": 3992, + "time_per_iteration": 3.0017786026000977 + }, + { + "auxiliary_loss_clip": 0.01050397, + "auxiliary_loss_mlp": 0.01010168, + "balance_loss_clip": 1.00857067, + "balance_loss_mlp": 1.02071452, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7530514643625366, + "language_loss": 0.62963343, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65023899, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.296875, + "step": 3993, + "time_per_iteration": 3.176184892654419 + }, + { + "auxiliary_loss_clip": 0.01140668, + "auxiliary_loss_mlp": 0.01047015, + "balance_loss_clip": 1.03085065, + "balance_loss_mlp": 1.05099177, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.6383486345725178, + "language_loss": 0.76938868, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79126549, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3994, + "time_per_iteration": 2.4940826892852783 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.0305258, + "balance_loss_mlp": 1.04680216, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.7751147523393542, + "language_loss": 0.78457522, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80641341, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.890625, + "step": 3995, + "time_per_iteration": 2.5075032711029053 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01003894, + "balance_loss_clip": 1.00232053, + "balance_loss_mlp": 1.01837659, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8913570860108078, + "language_loss": 0.63479292, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65530908, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.29296875, + "step": 3996, + "time_per_iteration": 3.1365764141082764 + }, + { + "auxiliary_loss_clip": 0.01137569, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.02916384, + "balance_loss_mlp": 1.04678392, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.906997418482602, + "language_loss": 0.7009505, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72278345, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3997, + "time_per_iteration": 2.464714765548706 + }, + { + "auxiliary_loss_clip": 0.01134848, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.04642928, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.9831176119326495, + "language_loss": 0.87292743, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89470112, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3998, + "time_per_iteration": 2.4639480113983154 + }, + { + "auxiliary_loss_clip": 0.01134933, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02306032, + "balance_loss_mlp": 1.04208946, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.9745565965944727, + "language_loss": 0.75798607, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77972972, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3999, + "time_per_iteration": 2.4753127098083496 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02317488, + "balance_loss_mlp": 1.04545271, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.9306579449884984, + "language_loss": 0.72642016, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74812865, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.875, + "step": 4000, + "time_per_iteration": 2.5172412395477295 + }, + { + "auxiliary_loss_clip": 0.01140243, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02025795, + "balance_loss_mlp": 1.04728866, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.4587541869300824, + "language_loss": 0.65991902, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68169051, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4001, + "time_per_iteration": 2.511198043823242 + }, + { + "auxiliary_loss_clip": 0.01131233, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.02330589, + "balance_loss_mlp": 1.0427444, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.6796652593661903, + "language_loss": 0.82567388, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84739041, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4002, + "time_per_iteration": 2.5147531032562256 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.02382851, + "balance_loss_mlp": 1.04682446, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.8902513751119636, + "language_loss": 0.82875729, + "learning_rate": 3.552202383898897e-06, + "loss": 0.8505069, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4003, + "time_per_iteration": 2.508004665374756 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0214386, + "balance_loss_mlp": 1.04608846, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0497424292602835, + "language_loss": 0.87504768, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89677334, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4004, + "time_per_iteration": 2.4581985473632812 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.04228568, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9722136456468877, + "language_loss": 0.77630293, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79812533, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4005, + "time_per_iteration": 2.556365728378296 + }, + { + "auxiliary_loss_clip": 0.01130485, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02141535, + "balance_loss_mlp": 1.04398429, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.7295620858093623, + "language_loss": 0.78973985, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81141412, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4006, + "time_per_iteration": 2.460961103439331 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.02072108, + "balance_loss_mlp": 1.04375279, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 2.2017624810959346, + "language_loss": 0.71201313, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73377299, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 4007, + "time_per_iteration": 2.5169517993927 + }, + { + "auxiliary_loss_clip": 0.01131131, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.04453456, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.805972702734942, + "language_loss": 0.75857127, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7802788, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 4008, + "time_per_iteration": 2.4489922523498535 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01940203, + "balance_loss_mlp": 1.04296207, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.356516377050019, + "language_loss": 0.73922294, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76088601, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4009, + "time_per_iteration": 2.4701087474823 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.02664948, + "balance_loss_mlp": 1.04632092, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.636895821506206, + "language_loss": 0.79938453, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82113993, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4010, + "time_per_iteration": 3.9670608043670654 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01047202, + "balance_loss_clip": 1.02923679, + "balance_loss_mlp": 1.04108143, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.295886994366384, + "language_loss": 0.70799017, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72977829, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4011, + "time_per_iteration": 3.9544472694396973 + }, + { + "auxiliary_loss_clip": 0.01131445, + "auxiliary_loss_mlp": 0.01039733, + "balance_loss_clip": 1.02429593, + "balance_loss_mlp": 1.04258561, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6166610897431488, + "language_loss": 0.69062299, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71233475, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4012, + "time_per_iteration": 2.501347303390503 + }, + { + "auxiliary_loss_clip": 0.01133874, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02145839, + "balance_loss_mlp": 1.04454589, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.0861437601678303, + "language_loss": 0.73424822, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75598073, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4013, + "time_per_iteration": 2.6360883712768555 + }, + { + "auxiliary_loss_clip": 0.01133872, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.02110672, + "balance_loss_mlp": 1.04450822, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8416541794010313, + "language_loss": 0.88554955, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9072544, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4014, + "time_per_iteration": 2.4663264751434326 + }, + { + "auxiliary_loss_clip": 0.01137985, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.02643979, + "balance_loss_mlp": 1.04453659, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.755357499792604, + "language_loss": 0.94270647, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96452308, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 4015, + "time_per_iteration": 2.470952033996582 + }, + { + "auxiliary_loss_clip": 0.01133849, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.02174377, + "balance_loss_mlp": 1.04334664, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.8402084517778015, + "language_loss": 0.82513833, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84685838, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4016, + "time_per_iteration": 2.4922966957092285 + }, + { + "auxiliary_loss_clip": 0.01127395, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02423143, + "balance_loss_mlp": 1.04197156, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 3.4212830828584386, + "language_loss": 0.69553781, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71721268, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4017, + "time_per_iteration": 2.596977710723877 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02959788, + "balance_loss_mlp": 1.04421043, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 2.0038503347112084, + "language_loss": 0.85114455, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87296432, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 4018, + "time_per_iteration": 2.440749406814575 + }, + { + "auxiliary_loss_clip": 0.01046553, + "auxiliary_loss_mlp": 0.01012788, + "balance_loss_clip": 1.0109762, + "balance_loss_mlp": 1.01676679, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8182663934779763, + "language_loss": 0.60620981, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62680322, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.296875, + "step": 4019, + "time_per_iteration": 3.112665891647339 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.04433608, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.880182475838635, + "language_loss": 0.73690915, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75863391, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4020, + "time_per_iteration": 2.5049281120300293 + }, + { + "auxiliary_loss_clip": 0.01134711, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.02199471, + "balance_loss_mlp": 1.04660118, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.9671591580269927, + "language_loss": 0.82012737, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84185052, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4021, + "time_per_iteration": 2.464092493057251 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.03036344, + "balance_loss_mlp": 1.04551053, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.9434993168468309, + "language_loss": 0.76464498, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78650689, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.921875, + "step": 4022, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01140564, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.04610109, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.893594506248005, + "language_loss": 0.75172901, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77355558, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 4023, + "time_per_iteration": 2.442469358444214 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.0288136, + "balance_loss_mlp": 1.04636168, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.7406117596406352, + "language_loss": 0.81464303, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.83643848, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4024, + "time_per_iteration": 2.45035719871521 + }, + { + "auxiliary_loss_clip": 0.01134068, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.02951503, + "balance_loss_mlp": 1.0462923, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.8550338864746303, + "language_loss": 0.85851878, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88031757, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4025, + "time_per_iteration": 2.4191699028015137 + }, + { + "auxiliary_loss_clip": 0.01136643, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.03154194, + "balance_loss_mlp": 1.04397535, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.9498897834730646, + "language_loss": 0.71243072, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73428357, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 4026, + "time_per_iteration": 2.476792812347412 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.0259757, + "balance_loss_mlp": 1.04589748, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.8853181761927913, + "language_loss": 0.64215046, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66389644, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4027, + "time_per_iteration": 2.443652868270874 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.04601741, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.5479611354975007, + "language_loss": 0.70294374, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72468793, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.91015625, + "step": 4028, + "time_per_iteration": 2.48252534866333 + }, + { + "auxiliary_loss_clip": 0.01044866, + "auxiliary_loss_mlp": 0.01007457, + "balance_loss_clip": 1.00585961, + "balance_loss_mlp": 1.01464319, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8570499142131055, + "language_loss": 0.55407649, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57459968, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.30078125, + "step": 4029, + "time_per_iteration": 3.094402551651001 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.02567101, + "balance_loss_mlp": 1.04526591, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.019101437715354, + "language_loss": 0.73829788, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76008832, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90234375, + "step": 4030, + "time_per_iteration": 2.5176522731781006 + }, + { + "auxiliary_loss_clip": 0.01135714, + "auxiliary_loss_mlp": 0.01053146, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.04541922, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.062987020241499, + "language_loss": 0.76440287, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 4031, + "time_per_iteration": 2.4774179458618164 + }, + { + "auxiliary_loss_clip": 0.01140068, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_clip": 1.02974856, + "balance_loss_mlp": 1.0464952, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 7.078640241023749, + "language_loss": 0.65947008, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68133402, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 4032, + "time_per_iteration": 2.500488519668579 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.04175007, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 3.1167913511387995, + "language_loss": 0.81353086, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83530146, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4033, + "time_per_iteration": 2.434652805328369 + }, + { + "auxiliary_loss_clip": 0.0113019, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.02310205, + "balance_loss_mlp": 1.04302979, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.0372289343003023, + "language_loss": 0.69200158, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71369547, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4034, + "time_per_iteration": 2.583693027496338 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.01868999, + "balance_loss_mlp": 1.04278564, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.4913709616978554, + "language_loss": 0.95772272, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.97941571, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4035, + "time_per_iteration": 2.4757437705993652 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01047233, + "balance_loss_clip": 1.03220701, + "balance_loss_mlp": 1.04172754, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.0212510419571794, + "language_loss": 0.77875686, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80049908, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4036, + "time_per_iteration": 2.5642547607421875 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.01043471, + "balance_loss_clip": 1.02642441, + "balance_loss_mlp": 1.04447269, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.648393445666421, + "language_loss": 0.74427915, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76606166, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4037, + "time_per_iteration": 2.4529507160186768 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.02497733, + "balance_loss_mlp": 1.04398596, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.7681997598872656, + "language_loss": 0.76223898, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78399336, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4038, + "time_per_iteration": 2.4618003368377686 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_clip": 1.02742147, + "balance_loss_mlp": 1.04415751, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.0050890767905645, + "language_loss": 0.72632921, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74812889, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 4039, + "time_per_iteration": 2.4261560440063477 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.02631593, + "balance_loss_mlp": 1.04608393, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6810247735848671, + "language_loss": 0.78330719, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80509198, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4040, + "time_per_iteration": 2.4808037281036377 + }, + { + "auxiliary_loss_clip": 0.01128006, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.01912999, + "balance_loss_mlp": 1.04237986, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8145876332629047, + "language_loss": 0.80390251, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82552278, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4041, + "time_per_iteration": 2.482576847076416 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.02769041, + "balance_loss_mlp": 1.04653025, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 4.455498217071982, + "language_loss": 0.76670969, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78848314, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4042, + "time_per_iteration": 2.4944398403167725 + }, + { + "auxiliary_loss_clip": 0.01130826, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.01815128, + "balance_loss_mlp": 1.04393744, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7591863299055037, + "language_loss": 0.8139993, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83563864, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 4043, + "time_per_iteration": 2.4965035915374756 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.02053475, + "balance_loss_mlp": 1.04298007, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.448799092011911, + "language_loss": 0.73345625, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75519013, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 4044, + "time_per_iteration": 2.42809796333313 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.0252496, + "balance_loss_mlp": 1.04730773, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.780616714891853, + "language_loss": 0.83562207, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85740674, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4045, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.02944136, + "balance_loss_mlp": 1.04542089, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 2.1598753545738663, + "language_loss": 0.86787856, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88967973, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4046, + "time_per_iteration": 2.5126357078552246 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.02526259, + "balance_loss_mlp": 1.04252553, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.9885516182116696, + "language_loss": 0.7281425, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4047, + "time_per_iteration": 2.4886271953582764 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.02243662, + "balance_loss_mlp": 1.0435816, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.411807088840578, + "language_loss": 0.72845596, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75018883, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4048, + "time_per_iteration": 2.522012710571289 + }, + { + "auxiliary_loss_clip": 0.01132229, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.02720952, + "balance_loss_mlp": 1.04504991, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 4.923738678144707, + "language_loss": 0.72984087, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75158751, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.875, + "step": 4049, + "time_per_iteration": 2.4399380683898926 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01041944, + "balance_loss_clip": 1.02654243, + "balance_loss_mlp": 1.04297137, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 6.058583880667159, + "language_loss": 0.7388249, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.760535, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4050, + "time_per_iteration": 2.4589998722076416 + }, + { + "auxiliary_loss_clip": 0.01128476, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02925062, + "balance_loss_mlp": 1.04373455, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.083460080669968, + "language_loss": 0.74948591, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77121294, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4051, + "time_per_iteration": 2.4284183979034424 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02251768, + "balance_loss_mlp": 1.04273975, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.420510968298769, + "language_loss": 0.70638204, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72805327, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4052, + "time_per_iteration": 5.468756675720215 + }, + { + "auxiliary_loss_clip": 0.01131368, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.03215313, + "balance_loss_mlp": 1.04370522, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.61331134721481, + "language_loss": 0.81265736, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83445215, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.875, + "step": 4053, + "time_per_iteration": 2.5280394554138184 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01895714, + "balance_loss_mlp": 1.04522192, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5461481286352234, + "language_loss": 0.77842951, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80013186, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4054, + "time_per_iteration": 2.424604892730713 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.02990091, + "balance_loss_mlp": 1.04097724, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.616998838355979, + "language_loss": 0.83784473, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85957456, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4055, + "time_per_iteration": 2.4814612865448 + }, + { + "auxiliary_loss_clip": 0.0113426, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02194405, + "balance_loss_mlp": 1.04221749, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.407480313131798, + "language_loss": 0.55291057, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57463974, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 4056, + "time_per_iteration": 2.5356216430664062 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02626896, + "balance_loss_mlp": 1.04361272, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.24663888381965, + "language_loss": 0.79832959, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82009363, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4057, + "time_per_iteration": 2.4915707111358643 + }, + { + "auxiliary_loss_clip": 0.01128391, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.02195764, + "balance_loss_mlp": 1.04218984, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.7432058239394113, + "language_loss": 0.78817719, + "learning_rate": 3.538605738554673e-06, + "loss": 0.80983889, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4058, + "time_per_iteration": 2.426687002182007 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.02366126, + "balance_loss_mlp": 1.04273307, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.688831116872718, + "language_loss": 0.85133582, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 4059, + "time_per_iteration": 2.499464511871338 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.04288411, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6494662829711617, + "language_loss": 0.73770267, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.75933278, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4060, + "time_per_iteration": 2.4955050945281982 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.03262711, + "balance_loss_mlp": 1.04506934, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.8597953216817902, + "language_loss": 0.73587501, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75775993, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.92578125, + "step": 4061, + "time_per_iteration": 2.5002825260162354 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.02248669, + "balance_loss_mlp": 1.04437923, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6775055914479682, + "language_loss": 0.76006806, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78173012, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8515625, + "step": 4062, + "time_per_iteration": 2.478625535964966 + }, + { + "auxiliary_loss_clip": 0.01126984, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.04376316, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.7282475931571, + "language_loss": 0.85710216, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87872803, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4063, + "time_per_iteration": 2.5161943435668945 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.02603722, + "balance_loss_mlp": 1.04589176, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 6.32752237165424, + "language_loss": 0.68127096, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70305437, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4064, + "time_per_iteration": 2.4434523582458496 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02083397, + "balance_loss_mlp": 1.04318714, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5178524812834733, + "language_loss": 0.7003206, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72204536, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4065, + "time_per_iteration": 2.513827085494995 + }, + { + "auxiliary_loss_clip": 0.01136726, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.02960134, + "balance_loss_mlp": 1.04461718, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.0517728790430048, + "language_loss": 0.83912247, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86096847, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4066, + "time_per_iteration": 2.4601314067840576 + }, + { + "auxiliary_loss_clip": 0.01053849, + "auxiliary_loss_mlp": 0.01006665, + "balance_loss_clip": 1.00455475, + "balance_loss_mlp": 1.02389407, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7387464995159381, + "language_loss": 0.52291965, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54352474, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.29882812, + "step": 4067, + "time_per_iteration": 2.9973862171173096 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.0242008, + "balance_loss_mlp": 1.04483843, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.022186633601072, + "language_loss": 0.71927387, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74101913, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4068, + "time_per_iteration": 2.4484708309173584 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.02457666, + "balance_loss_mlp": 1.04505873, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.494083672668599, + "language_loss": 0.77513826, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79687262, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4069, + "time_per_iteration": 2.5724000930786133 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.03101087, + "balance_loss_mlp": 1.04646873, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.9755919994455295, + "language_loss": 0.80163878, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82344782, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4070, + "time_per_iteration": 2.4932186603546143 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.03018379, + "balance_loss_mlp": 1.04351497, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6271146290001441, + "language_loss": 0.8410303, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86279482, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.875, + "step": 4071, + "time_per_iteration": 2.5299296379089355 + }, + { + "auxiliary_loss_clip": 0.0113627, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.02795792, + "balance_loss_mlp": 1.04406631, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.720640728536457, + "language_loss": 0.79751229, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81932867, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4072, + "time_per_iteration": 2.470327854156494 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.02906084, + "balance_loss_mlp": 1.04437995, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.615929332251483, + "language_loss": 0.70322561, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7249524, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4073, + "time_per_iteration": 2.4951980113983154 + }, + { + "auxiliary_loss_clip": 0.01129351, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.02662683, + "balance_loss_mlp": 1.04456043, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.5968867848691133, + "language_loss": 0.67692697, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69863164, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4074, + "time_per_iteration": 2.4697325229644775 + }, + { + "auxiliary_loss_clip": 0.01052266, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00504053, + "balance_loss_mlp": 1.0222578, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.896032421619399, + "language_loss": 0.68665123, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70724261, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.30078125, + "step": 4075, + "time_per_iteration": 3.1993846893310547 + }, + { + "auxiliary_loss_clip": 0.01131428, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.02548659, + "balance_loss_mlp": 1.04603517, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.243483207404797, + "language_loss": 0.79306483, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81478369, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4076, + "time_per_iteration": 2.542245388031006 + }, + { + "auxiliary_loss_clip": 0.01134594, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.02749884, + "balance_loss_mlp": 1.04342794, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.0630196459837618, + "language_loss": 0.82211018, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84390688, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 4077, + "time_per_iteration": 2.5165140628814697 + }, + { + "auxiliary_loss_clip": 0.01132098, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02695799, + "balance_loss_mlp": 1.04380083, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 12.782264679420269, + "language_loss": 0.61930454, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64107114, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4078, + "time_per_iteration": 2.5202372074127197 + }, + { + "auxiliary_loss_clip": 0.01129452, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02035594, + "balance_loss_mlp": 1.04474652, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.7044874550491866, + "language_loss": 0.75514519, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77679932, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4079, + "time_per_iteration": 2.483339309692383 + }, + { + "auxiliary_loss_clip": 0.01129188, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.04370368, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8257477744529516, + "language_loss": 0.74925131, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77097261, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 4080, + "time_per_iteration": 2.4843389987945557 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.04129529, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 2.211780780293779, + "language_loss": 0.82807517, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84972572, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4081, + "time_per_iteration": 2.4753835201263428 + }, + { + "auxiliary_loss_clip": 0.01128982, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.04313576, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 4.1574914526272515, + "language_loss": 0.73153239, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75321424, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4082, + "time_per_iteration": 2.5975396633148193 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02837873, + "balance_loss_mlp": 1.04274178, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.543838453785988, + "language_loss": 0.71628594, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73798621, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84375, + "step": 4083, + "time_per_iteration": 2.471519947052002 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.02285206, + "balance_loss_mlp": 1.04234004, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.1941070650453094, + "language_loss": 0.74700832, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76872808, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4084, + "time_per_iteration": 2.4286506175994873 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01986289, + "balance_loss_mlp": 1.04189909, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.042106499003273, + "language_loss": 0.85206825, + "learning_rate": 3.531866337826471e-06, + "loss": 0.8736847, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4085, + "time_per_iteration": 2.4283318519592285 + }, + { + "auxiliary_loss_clip": 0.01130256, + "auxiliary_loss_mlp": 0.01048422, + "balance_loss_clip": 1.03209007, + "balance_loss_mlp": 1.04266381, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.8090063737063005, + "language_loss": 0.7876097, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.80939639, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4086, + "time_per_iteration": 2.478954792022705 + }, + { + "auxiliary_loss_clip": 0.01126651, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02669752, + "balance_loss_mlp": 1.04330873, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.6669278195562474, + "language_loss": 0.75269985, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77438211, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4087, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.01132319, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.02364135, + "balance_loss_mlp": 1.04574418, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.5789657141026, + "language_loss": 0.79284519, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81457937, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8671875, + "step": 4088, + "time_per_iteration": 2.479841709136963 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.0196631, + "balance_loss_mlp": 1.04091823, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6187757849670203, + "language_loss": 0.7736612, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79523408, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.828125, + "step": 4089, + "time_per_iteration": 2.483436346054077 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02440071, + "balance_loss_mlp": 1.04232669, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 3.8690522662716416, + "language_loss": 0.81463957, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83634108, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4090, + "time_per_iteration": 2.657944917678833 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03265369, + "balance_loss_mlp": 1.04411578, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.9398667366019489, + "language_loss": 0.72874928, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75057453, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.88671875, + "step": 4091, + "time_per_iteration": 2.448307991027832 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.02166772, + "balance_loss_mlp": 1.04811478, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9209724672120978, + "language_loss": 0.76486623, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78656, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4092, + "time_per_iteration": 2.510815143585205 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.04404068, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.544549098738024, + "language_loss": 0.80905128, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83075017, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4093, + "time_per_iteration": 2.4658117294311523 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.04285693, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.9793331271335382, + "language_loss": 0.87355959, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89532292, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4094, + "time_per_iteration": 5.436578035354614 + }, + { + "auxiliary_loss_clip": 0.01055645, + "auxiliary_loss_mlp": 0.01004731, + "balance_loss_clip": 1.00285995, + "balance_loss_mlp": 1.02449679, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7591937233735362, + "language_loss": 0.57501638, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59562016, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.3125, + "step": 4095, + "time_per_iteration": 3.1966967582702637 + }, + { + "auxiliary_loss_clip": 0.01055105, + "auxiliary_loss_mlp": 0.01001708, + "balance_loss_clip": 0.99987203, + "balance_loss_mlp": 1.02336812, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.643968481445629, + "language_loss": 0.56195372, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58252186, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.31640625, + "step": 4096, + "time_per_iteration": 3.187084436416626 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.04697204, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 2.0390556104017907, + "language_loss": 0.77674699, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79844701, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4097, + "time_per_iteration": 2.5585436820983887 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.02608228, + "balance_loss_mlp": 1.04491377, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.135816170269485, + "language_loss": 0.76393569, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78572309, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.890625, + "step": 4098, + "time_per_iteration": 2.478665828704834 + }, + { + "auxiliary_loss_clip": 0.01133268, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.02568507, + "balance_loss_mlp": 1.04479909, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.152719854213413, + "language_loss": 0.68733507, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70907569, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 4099, + "time_per_iteration": 2.515821933746338 + }, + { + "auxiliary_loss_clip": 0.01124761, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02621734, + "balance_loss_mlp": 1.04163074, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.459538616056665, + "language_loss": 0.65975124, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68141258, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4100, + "time_per_iteration": 2.562962532043457 + }, + { + "auxiliary_loss_clip": 0.01051305, + "auxiliary_loss_mlp": 0.01002462, + "balance_loss_clip": 1.00055432, + "balance_loss_mlp": 1.02057505, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7078763540659354, + "language_loss": 0.61549371, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63603139, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.30859375, + "step": 4101, + "time_per_iteration": 3.1617352962493896 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.01893687, + "balance_loss_mlp": 1.04385781, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7154022892986804, + "language_loss": 0.73020113, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75183737, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4102, + "time_per_iteration": 2.5522637367248535 + }, + { + "auxiliary_loss_clip": 0.01132375, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02463281, + "balance_loss_mlp": 1.04294777, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.2979425011191528, + "language_loss": 0.75574934, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.7774744, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4103, + "time_per_iteration": 2.5117204189300537 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.02647424, + "balance_loss_mlp": 1.04096079, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.002646106823912, + "language_loss": 0.78701174, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80874026, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4104, + "time_per_iteration": 2.5791869163513184 + }, + { + "auxiliary_loss_clip": 0.011264, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02333593, + "balance_loss_mlp": 1.0411272, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.7283937272898544, + "language_loss": 0.83567655, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85735631, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.8515625, + "step": 4105, + "time_per_iteration": 2.447399854660034 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02768457, + "balance_loss_mlp": 1.04806173, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7373974977996043, + "language_loss": 0.7646578, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78643101, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4106, + "time_per_iteration": 2.519059658050537 + }, + { + "auxiliary_loss_clip": 0.01127139, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.02539492, + "balance_loss_mlp": 1.04087114, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.1665884513414513, + "language_loss": 0.72764528, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74933887, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4107, + "time_per_iteration": 2.4489266872406006 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.0454886, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.3712774609847274, + "language_loss": 0.65420353, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67600369, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4108, + "time_per_iteration": 2.5401792526245117 + }, + { + "auxiliary_loss_clip": 0.01131766, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.02961504, + "balance_loss_mlp": 1.04324555, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.174268382145969, + "language_loss": 0.72611141, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74788952, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4109, + "time_per_iteration": 2.593358278274536 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02788687, + "balance_loss_mlp": 1.04414606, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.7026194733932167, + "language_loss": 0.79302657, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81480682, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4110, + "time_per_iteration": 2.4776864051818848 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01618171, + "balance_loss_mlp": 1.04541993, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.5002063230568545, + "language_loss": 0.80653715, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.82819968, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4111, + "time_per_iteration": 2.4957237243652344 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.02517819, + "balance_loss_mlp": 1.04273677, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 2.4547784256207663, + "language_loss": 0.75205207, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77375102, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4112, + "time_per_iteration": 2.481778860092163 + }, + { + "auxiliary_loss_clip": 0.01130648, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02868617, + "balance_loss_mlp": 1.04366612, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9927491285660106, + "language_loss": 0.82454932, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.8462984, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4113, + "time_per_iteration": 2.4658617973327637 + }, + { + "auxiliary_loss_clip": 0.01129834, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.0205375, + "balance_loss_mlp": 1.0423646, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.834925175676511, + "language_loss": 0.87073094, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89239764, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4114, + "time_per_iteration": 2.4575555324554443 + }, + { + "auxiliary_loss_clip": 0.01130204, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.02274156, + "balance_loss_mlp": 1.04354906, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.804779626044085, + "language_loss": 0.753479, + "learning_rate": 3.524328457352734e-06, + "loss": 0.7751627, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4115, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01002125, + "balance_loss_clip": 1.00016963, + "balance_loss_mlp": 1.02261877, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6664049604648837, + "language_loss": 0.58203655, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60258663, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30273438, + "step": 4116, + "time_per_iteration": 3.172032117843628 + }, + { + "auxiliary_loss_clip": 0.01130845, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.01992679, + "balance_loss_mlp": 1.04510772, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.6806447251481575, + "language_loss": 0.83616889, + "learning_rate": 3.523824079451235e-06, + "loss": 0.8578285, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.85546875, + "step": 4117, + "time_per_iteration": 2.5228748321533203 + }, + { + "auxiliary_loss_clip": 0.01053619, + "auxiliary_loss_mlp": 0.0100274, + "balance_loss_clip": 1.00073707, + "balance_loss_mlp": 1.02337885, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9069522642789956, + "language_loss": 0.63507527, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65563887, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30078125, + "step": 4118, + "time_per_iteration": 2.9459333419799805 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02470684, + "balance_loss_mlp": 1.04544902, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5050779056214143, + "language_loss": 0.79252797, + "learning_rate": 3.523319470415491e-06, + "loss": 0.8142485, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4119, + "time_per_iteration": 2.438519239425659 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.02359676, + "balance_loss_mlp": 1.04430819, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.9430586352888408, + "language_loss": 0.73955107, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76124215, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4120, + "time_per_iteration": 2.4728164672851562 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.03010893, + "balance_loss_mlp": 1.0446558, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 3.4886461941998563, + "language_loss": 0.88028777, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90208006, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4121, + "time_per_iteration": 2.4117653369903564 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.02102745, + "balance_loss_mlp": 1.04516518, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.7360865086006285, + "language_loss": 0.69088298, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71260709, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4122, + "time_per_iteration": 2.484830617904663 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.02224231, + "balance_loss_mlp": 1.04380226, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.016808492688271, + "language_loss": 0.80196065, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82369387, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.890625, + "step": 4123, + "time_per_iteration": 2.43839955329895 + }, + { + "auxiliary_loss_clip": 0.01130784, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.02620113, + "balance_loss_mlp": 1.04464054, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.3250466211888745, + "language_loss": 0.74919629, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77091914, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 4124, + "time_per_iteration": 2.4909141063690186 + }, + { + "auxiliary_loss_clip": 0.01127616, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.01922846, + "balance_loss_mlp": 1.0432241, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.6909299882519486, + "language_loss": 0.73759794, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75921559, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4125, + "time_per_iteration": 2.6068458557128906 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.02914929, + "balance_loss_mlp": 1.04383993, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.376019449241759, + "language_loss": 0.69416726, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71598125, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4126, + "time_per_iteration": 2.4516806602478027 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.01047803, + "balance_loss_clip": 1.03112614, + "balance_loss_mlp": 1.04299593, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.081795572279456, + "language_loss": 0.81602275, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83780402, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4127, + "time_per_iteration": 2.482492446899414 + }, + { + "auxiliary_loss_clip": 0.01134053, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04527378, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 5.2721581441441465, + "language_loss": 0.84604752, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86784381, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.88671875, + "step": 4128, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02779055, + "balance_loss_mlp": 1.04397762, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 3.598051635390234, + "language_loss": 0.65576231, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67752188, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4129, + "time_per_iteration": 2.498321294784546 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.04308498, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 2.23477186449736, + "language_loss": 0.75251818, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77425677, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4130, + "time_per_iteration": 2.534014940261841 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.03198647, + "balance_loss_mlp": 1.04404271, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.282827015603824, + "language_loss": 0.77323985, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79505157, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4131, + "time_per_iteration": 2.3971383571624756 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02241063, + "balance_loss_mlp": 1.0428257, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5452946340590639, + "language_loss": 0.83932686, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86097032, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.84375, + "step": 4132, + "time_per_iteration": 2.552804470062256 + }, + { + "auxiliary_loss_clip": 0.01129759, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02601552, + "balance_loss_mlp": 1.04280567, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.4707160060639857, + "language_loss": 0.71077073, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73249108, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4133, + "time_per_iteration": 2.40258526802063 + }, + { + "auxiliary_loss_clip": 0.01138495, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.02486503, + "balance_loss_mlp": 1.0454644, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.206352055564895, + "language_loss": 0.61492884, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63675898, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9296875, + "step": 4134, + "time_per_iteration": 2.476027250289917 + }, + { + "auxiliary_loss_clip": 0.01133349, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.02063298, + "balance_loss_mlp": 1.04393268, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.276340033899988, + "language_loss": 0.78899026, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81069505, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4135, + "time_per_iteration": 3.9668710231781006 + }, + { + "auxiliary_loss_clip": 0.01136879, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 1.04908156, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.12923907223803, + "language_loss": 0.82729924, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84898853, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.87890625, + "step": 4136, + "time_per_iteration": 3.8651821613311768 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02919221, + "balance_loss_mlp": 1.04593039, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7063584090687087, + "language_loss": 0.70454097, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72635514, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4137, + "time_per_iteration": 2.581270456314087 + }, + { + "auxiliary_loss_clip": 0.01135031, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.0193553, + "balance_loss_mlp": 1.04428291, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.0340803052703236, + "language_loss": 0.66840076, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69010115, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4138, + "time_per_iteration": 2.438858985900879 + }, + { + "auxiliary_loss_clip": 0.01130089, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01977062, + "balance_loss_mlp": 1.0451256, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.467393625239628, + "language_loss": 0.83937073, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86102176, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4139, + "time_per_iteration": 2.4858012199401855 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.04416132, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.5320149755260415, + "language_loss": 0.7864905, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80825365, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4140, + "time_per_iteration": 2.4608240127563477 + }, + { + "auxiliary_loss_clip": 0.01058216, + "auxiliary_loss_mlp": 0.01013447, + "balance_loss_clip": 1.01150382, + "balance_loss_mlp": 1.02780879, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8230161703115366, + "language_loss": 0.60980695, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63052356, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.3046875, + "step": 4141, + "time_per_iteration": 3.1306700706481934 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02537298, + "balance_loss_mlp": 1.04692519, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.804889663143828, + "language_loss": 0.72997624, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75176597, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 4142, + "time_per_iteration": 2.60341215133667 + }, + { + "auxiliary_loss_clip": 0.011336, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02335465, + "balance_loss_mlp": 1.04601634, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 2.0852522280017873, + "language_loss": 0.80985868, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83158958, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4143, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.04291701, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.8417531415701045, + "language_loss": 0.5884496, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61008459, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4144, + "time_per_iteration": 2.5253236293792725 + }, + { + "auxiliary_loss_clip": 0.0113091, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.02135301, + "balance_loss_mlp": 1.04400194, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.2350400575734146, + "language_loss": 0.78882402, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81050527, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4145, + "time_per_iteration": 2.500868797302246 + }, + { + "auxiliary_loss_clip": 0.01141282, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.02981293, + "balance_loss_mlp": 1.04593182, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.0986803435557415, + "language_loss": 0.65651333, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.678424, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.953125, + "step": 4146, + "time_per_iteration": 2.482405424118042 + }, + { + "auxiliary_loss_clip": 0.01048172, + "auxiliary_loss_mlp": 0.01006681, + "balance_loss_clip": 1.00482178, + "balance_loss_mlp": 1.01849687, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 3.0854856510049458, + "language_loss": 0.67327654, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69382501, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.296875, + "step": 4147, + "time_per_iteration": 3.1769258975982666 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.02930617, + "balance_loss_mlp": 1.04857254, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 2.0368820911017025, + "language_loss": 0.8893261, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91115361, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4148, + "time_per_iteration": 2.5202085971832275 + }, + { + "auxiliary_loss_clip": 0.0113885, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02649307, + "balance_loss_mlp": 1.04754162, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8605307211390085, + "language_loss": 0.68053228, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70237827, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9140625, + "step": 4149, + "time_per_iteration": 2.455733060836792 + }, + { + "auxiliary_loss_clip": 0.01133288, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02291596, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 2.99652773874907, + "language_loss": 0.71235985, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73408163, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4150, + "time_per_iteration": 2.514190196990967 + }, + { + "auxiliary_loss_clip": 0.01134014, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.03256035, + "balance_loss_mlp": 1.04471052, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.862035570914478, + "language_loss": 0.72954226, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75137556, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4151, + "time_per_iteration": 2.4198975563049316 + }, + { + "auxiliary_loss_clip": 0.01141172, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.03213382, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 4.099427504771762, + "language_loss": 0.62436807, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64627266, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94140625, + "step": 4152, + "time_per_iteration": 2.563032865524292 + }, + { + "auxiliary_loss_clip": 0.01131413, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.02188039, + "balance_loss_mlp": 1.04631066, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.3735561607913596, + "language_loss": 0.77219248, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79388708, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4153, + "time_per_iteration": 2.5059967041015625 + }, + { + "auxiliary_loss_clip": 0.01132512, + "auxiliary_loss_mlp": 0.01040076, + "balance_loss_clip": 1.0242573, + "balance_loss_mlp": 1.04642224, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.164577963489155, + "language_loss": 0.76443702, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78616285, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4154, + "time_per_iteration": 2.48317551612854 + }, + { + "auxiliary_loss_clip": 0.01138697, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.02586532, + "balance_loss_mlp": 1.04451203, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.2000943153895722, + "language_loss": 0.70740849, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72924054, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 4155, + "time_per_iteration": 2.498227834701538 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.03379464, + "balance_loss_mlp": 1.04736114, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.8252469259439843, + "language_loss": 0.7499637, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77184427, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4156, + "time_per_iteration": 2.473536729812622 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.0244987, + "balance_loss_mlp": 1.04498601, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.1247768054564333, + "language_loss": 0.76757634, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78929752, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4157, + "time_per_iteration": 2.476402759552002 + }, + { + "auxiliary_loss_clip": 0.01135567, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02821517, + "balance_loss_mlp": 1.04551077, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6430173172536622, + "language_loss": 0.81497854, + "learning_rate": 3.513433506130942e-06, + "loss": 0.8367821, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4158, + "time_per_iteration": 2.4706146717071533 + }, + { + "auxiliary_loss_clip": 0.01134661, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01533163, + "balance_loss_mlp": 1.04511046, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.425058111765743, + "language_loss": 0.75573325, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77739644, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.89453125, + "step": 4159, + "time_per_iteration": 2.447530746459961 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.01928759, + "balance_loss_mlp": 1.04643881, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.3851333770237044, + "language_loss": 0.71434534, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73608989, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 4160, + "time_per_iteration": 2.4909448623657227 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01008394, + "balance_loss_clip": 1.0062604, + "balance_loss_mlp": 1.01615632, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7574731626167057, + "language_loss": 0.56755257, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58809221, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.29492188, + "step": 4161, + "time_per_iteration": 3.1169064044952393 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04854345, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.822598728260487, + "language_loss": 0.8071059, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82899845, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 4162, + "time_per_iteration": 2.4679477214813232 + }, + { + "auxiliary_loss_clip": 0.01136921, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02212739, + "balance_loss_mlp": 1.04364812, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.543272880301035, + "language_loss": 0.87439299, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89615595, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 4163, + "time_per_iteration": 2.411324977874756 + }, + { + "auxiliary_loss_clip": 0.01135069, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.02013874, + "balance_loss_mlp": 1.04609334, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.8835095650007205, + "language_loss": 0.83242726, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85414505, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4164, + "time_per_iteration": 2.4910058975219727 + }, + { + "auxiliary_loss_clip": 0.01130392, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_clip": 1.03235698, + "balance_loss_mlp": 1.04616356, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.7333709529875627, + "language_loss": 0.74548686, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76726139, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 4165, + "time_per_iteration": 2.4566714763641357 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.02819216, + "balance_loss_mlp": 1.04689348, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 5.301488379412456, + "language_loss": 0.74214685, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76400197, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4166, + "time_per_iteration": 2.462092161178589 + }, + { + "auxiliary_loss_clip": 0.01134276, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.02898526, + "balance_loss_mlp": 1.04551435, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.9752225074857819, + "language_loss": 0.82011521, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84191239, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4167, + "time_per_iteration": 2.482534885406494 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.0280689, + "balance_loss_mlp": 1.04616201, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.42679689243218, + "language_loss": 0.79602242, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81781083, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4168, + "time_per_iteration": 2.463700532913208 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.02353752, + "balance_loss_mlp": 1.04523754, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.966293758738445, + "language_loss": 0.70029891, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72211224, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9375, + "step": 4169, + "time_per_iteration": 2.6148693561553955 + }, + { + "auxiliary_loss_clip": 0.01131562, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.02926338, + "balance_loss_mlp": 1.0446701, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8138505316100015, + "language_loss": 0.77564663, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79741603, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4170, + "time_per_iteration": 2.522921562194824 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.02731323, + "balance_loss_mlp": 1.04796529, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.4512078878938404, + "language_loss": 0.76246989, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78427839, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8984375, + "step": 4171, + "time_per_iteration": 2.4322195053100586 + }, + { + "auxiliary_loss_clip": 0.01046694, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.01739454, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8497756598481241, + "language_loss": 0.60047227, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62115091, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29296875, + "step": 4172, + "time_per_iteration": 3.1110994815826416 + }, + { + "auxiliary_loss_clip": 0.01137052, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.04652381, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4442293166181488, + "language_loss": 0.78647727, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80827463, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 4173, + "time_per_iteration": 2.481062889099121 + }, + { + "auxiliary_loss_clip": 0.01140203, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.0300796, + "balance_loss_mlp": 1.05017626, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.4202296115923883, + "language_loss": 0.83543748, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85730493, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4174, + "time_per_iteration": 2.4566147327423096 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02299595, + "balance_loss_mlp": 1.04786515, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.0903096624482624, + "language_loss": 0.71291864, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73470795, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90234375, + "step": 4175, + "time_per_iteration": 2.4616360664367676 + }, + { + "auxiliary_loss_clip": 0.01138348, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.01866269, + "balance_loss_mlp": 1.0460453, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.247188920587568, + "language_loss": 0.80564427, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82739055, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4176, + "time_per_iteration": 2.525686740875244 + }, + { + "auxiliary_loss_clip": 0.01138723, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.02553427, + "balance_loss_mlp": 1.04782593, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.362252442770041, + "language_loss": 0.83099151, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8528198, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.90625, + "step": 4177, + "time_per_iteration": 5.424759387969971 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02675891, + "balance_loss_mlp": 1.04777622, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.9753996759374846, + "language_loss": 0.8209883, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84278357, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87890625, + "step": 4178, + "time_per_iteration": 2.451418161392212 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04445124, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.6671564243834505, + "language_loss": 0.75406277, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77579463, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4179, + "time_per_iteration": 2.4710347652435303 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.03050375, + "balance_loss_mlp": 1.04526711, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.189208999533023, + "language_loss": 0.70452499, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72636557, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.90625, + "step": 4180, + "time_per_iteration": 2.433922290802002 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02485168, + "balance_loss_mlp": 1.04449701, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 2.0603947372587244, + "language_loss": 0.85379761, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.875539, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4181, + "time_per_iteration": 2.4513771533966064 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01042648, + "balance_loss_clip": 1.02608991, + "balance_loss_mlp": 1.0464716, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.9568163341605829, + "language_loss": 0.67662674, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69840884, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4182, + "time_per_iteration": 2.588513135910034 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.01008874, + "balance_loss_clip": 1.00675201, + "balance_loss_mlp": 1.01742792, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8484678873575391, + "language_loss": 0.70098495, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72154456, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.296875, + "step": 4183, + "time_per_iteration": 3.0990090370178223 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02088118, + "balance_loss_mlp": 1.04070854, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.7162399200173233, + "language_loss": 0.7452544, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76694012, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4184, + "time_per_iteration": 2.4367544651031494 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.02888608, + "balance_loss_mlp": 1.04825735, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.9130230292696613, + "language_loss": 0.82872695, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85055834, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4185, + "time_per_iteration": 2.4604692459106445 + }, + { + "auxiliary_loss_clip": 0.01047588, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.01820421, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7885291752286397, + "language_loss": 0.61534387, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63585937, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.29296875, + "step": 4186, + "time_per_iteration": 2.9629924297332764 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02320647, + "balance_loss_mlp": 1.04432559, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.1070381215060308, + "language_loss": 0.79260957, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81435084, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4187, + "time_per_iteration": 2.454988479614258 + }, + { + "auxiliary_loss_clip": 0.01136483, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.03235006, + "balance_loss_mlp": 1.04733062, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5254881034867085, + "language_loss": 0.79854965, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82040906, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4188, + "time_per_iteration": 2.4807493686676025 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_clip": 1.03022218, + "balance_loss_mlp": 1.04635882, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.727912733373243, + "language_loss": 0.74509478, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76691031, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4189, + "time_per_iteration": 2.4887545108795166 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.02914619, + "balance_loss_mlp": 1.04616165, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.957544272457229, + "language_loss": 0.84454727, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86630988, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4190, + "time_per_iteration": 2.4629735946655273 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.02633452, + "balance_loss_mlp": 1.04529381, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.9468541382775664, + "language_loss": 0.75593925, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77772641, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.88671875, + "step": 4191, + "time_per_iteration": 2.451493263244629 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.0101771, + "balance_loss_clip": 1.01577878, + "balance_loss_mlp": 1.01320672, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7165761170014687, + "language_loss": 0.57155997, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59216374, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29492188, + "step": 4192, + "time_per_iteration": 3.1455304622650146 + }, + { + "auxiliary_loss_clip": 0.01132992, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.02593958, + "balance_loss_mlp": 1.04640245, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.0419031963399434, + "language_loss": 0.76306844, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78481936, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4193, + "time_per_iteration": 2.46201491355896 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.03115189, + "balance_loss_mlp": 1.04506373, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.1192679618590007, + "language_loss": 0.84261906, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86446548, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4194, + "time_per_iteration": 2.4525146484375 + }, + { + "auxiliary_loss_clip": 0.01138489, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.03404951, + "balance_loss_mlp": 1.04636192, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.488794247862028, + "language_loss": 0.88176262, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90364158, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.921875, + "step": 4195, + "time_per_iteration": 2.507788896560669 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.02323329, + "balance_loss_mlp": 1.04540074, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.8940350432545787, + "language_loss": 0.85288155, + "learning_rate": 3.503717062883053e-06, + "loss": 0.87466824, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.90625, + "step": 4196, + "time_per_iteration": 2.4843344688415527 + }, + { + "auxiliary_loss_clip": 0.01135455, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02644312, + "balance_loss_mlp": 1.0454607, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6596186150335415, + "language_loss": 0.83368516, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85546911, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4197, + "time_per_iteration": 2.480834484100342 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.02840698, + "balance_loss_mlp": 1.04775643, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.7573342641631093, + "language_loss": 0.72406292, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74593097, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9140625, + "step": 4198, + "time_per_iteration": 2.6081368923187256 + }, + { + "auxiliary_loss_clip": 0.01139571, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.03469038, + "balance_loss_mlp": 1.0462662, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9511850390779815, + "language_loss": 0.76798427, + "learning_rate": 3.50294646148888e-06, + "loss": 0.7899096, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.93359375, + "step": 4199, + "time_per_iteration": 2.463322162628174 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.02334595, + "balance_loss_mlp": 1.04600453, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.6881838085079777, + "language_loss": 0.727651, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74941385, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 4200, + "time_per_iteration": 2.586298942565918 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.03206062, + "balance_loss_mlp": 1.04300654, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.7166145531144803, + "language_loss": 0.82271791, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84454548, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.91796875, + "step": 4201, + "time_per_iteration": 2.6430721282958984 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.02960861, + "balance_loss_mlp": 1.04680324, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8945534984036327, + "language_loss": 0.74844849, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77029681, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4202, + "time_per_iteration": 2.477376699447632 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.02545786, + "balance_loss_mlp": 1.04550529, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.8769942277842264, + "language_loss": 0.73058856, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75234556, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 4203, + "time_per_iteration": 2.4526968002319336 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.02403569, + "balance_loss_mlp": 1.04434335, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.7217444479200419, + "language_loss": 0.77377844, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79553127, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90234375, + "step": 4204, + "time_per_iteration": 2.540573835372925 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04443574, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 3.2226665017353655, + "language_loss": 0.72443974, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74631095, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4205, + "time_per_iteration": 2.405823230743408 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.02304697, + "balance_loss_mlp": 1.04673433, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4419344159614245, + "language_loss": 0.75674903, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77844942, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4206, + "time_per_iteration": 2.7117254734039307 + }, + { + "auxiliary_loss_clip": 0.01134608, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02502584, + "balance_loss_mlp": 1.04381466, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.8459801280493204, + "language_loss": 0.79013956, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81190026, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4207, + "time_per_iteration": 2.4338433742523193 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.02688169, + "balance_loss_mlp": 1.04521704, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5263501886522268, + "language_loss": 0.76010746, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78184819, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4208, + "time_per_iteration": 2.4712774753570557 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.02559781, + "balance_loss_mlp": 1.04407811, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.8494822470113228, + "language_loss": 0.6965062, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.71824062, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87109375, + "step": 4209, + "time_per_iteration": 2.4723262786865234 + }, + { + "auxiliary_loss_clip": 0.01046036, + "auxiliary_loss_mlp": 0.00999993, + "balance_loss_clip": 0.99819291, + "balance_loss_mlp": 1.01643014, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7581785291884388, + "language_loss": 0.55080217, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57126248, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.296875, + "step": 4210, + "time_per_iteration": 3.141958236694336 + }, + { + "auxiliary_loss_clip": 0.0113523, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.04541481, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 2.0581011511690606, + "language_loss": 0.8021341, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82383299, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4211, + "time_per_iteration": 2.4423909187316895 + }, + { + "auxiliary_loss_clip": 0.01128499, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02402079, + "balance_loss_mlp": 1.04284227, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.6375033978461933, + "language_loss": 0.78310406, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80478293, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4212, + "time_per_iteration": 2.535416841506958 + }, + { + "auxiliary_loss_clip": 0.01131331, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.04314673, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.0206536972721088, + "language_loss": 0.53393918, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55565375, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4213, + "time_per_iteration": 2.488844871520996 + }, + { + "auxiliary_loss_clip": 0.01132972, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02566385, + "balance_loss_mlp": 1.04508567, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.6682600080383816, + "language_loss": 0.65329081, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67504859, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4214, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01043385, + "auxiliary_loss_mlp": 0.0100812, + "balance_loss_clip": 1.00630808, + "balance_loss_mlp": 1.0142169, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8479929036578698, + "language_loss": 0.58049941, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60101438, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29296875, + "step": 4215, + "time_per_iteration": 2.824084997177124 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.02636075, + "balance_loss_mlp": 1.04583967, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7693463876532338, + "language_loss": 0.83949232, + "learning_rate": 3.498570039373066e-06, + "loss": 0.86126143, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.875, + "step": 4216, + "time_per_iteration": 2.650329828262329 + }, + { + "auxiliary_loss_clip": 0.01134428, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02504706, + "balance_loss_mlp": 1.04571652, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.7652170119003572, + "language_loss": 0.80028123, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82204342, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4217, + "time_per_iteration": 2.49381160736084 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.02422011, + "balance_loss_mlp": 1.04193234, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.1701414828965464, + "language_loss": 0.75014293, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.7718327, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87890625, + "step": 4218, + "time_per_iteration": 2.4794864654541016 + }, + { + "auxiliary_loss_clip": 0.01135591, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.02462721, + "balance_loss_mlp": 1.04470503, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.8718582993796022, + "language_loss": 0.74483025, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76660055, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4219, + "time_per_iteration": 5.428370952606201 + }, + { + "auxiliary_loss_clip": 0.01137942, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.04695058, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 2.1507448030921057, + "language_loss": 0.81194967, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83385527, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4220, + "time_per_iteration": 2.454045534133911 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.03007603, + "balance_loss_mlp": 1.04596126, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.058400170489012, + "language_loss": 0.70873475, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73056173, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4221, + "time_per_iteration": 2.4728429317474365 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.02309537, + "balance_loss_mlp": 1.0444454, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 2.3290205392002847, + "language_loss": 0.62039649, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64213717, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4222, + "time_per_iteration": 2.4465436935424805 + }, + { + "auxiliary_loss_clip": 0.01137839, + "auxiliary_loss_mlp": 0.01052178, + "balance_loss_clip": 1.0352385, + "balance_loss_mlp": 1.04635429, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.6514367228652884, + "language_loss": 0.74686599, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76876616, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4223, + "time_per_iteration": 2.449887275695801 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.02001095, + "balance_loss_mlp": 1.04763556, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.7274606282993847, + "language_loss": 0.79782087, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81952935, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4224, + "time_per_iteration": 2.4809348583221436 + }, + { + "auxiliary_loss_clip": 0.01129812, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.02668035, + "balance_loss_mlp": 1.04306865, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.7388314634599362, + "language_loss": 0.77813148, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79986417, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4225, + "time_per_iteration": 2.4813735485076904 + }, + { + "auxiliary_loss_clip": 0.01135622, + "auxiliary_loss_mlp": 0.01054065, + "balance_loss_clip": 1.03661263, + "balance_loss_mlp": 1.04603362, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6070040517314534, + "language_loss": 0.84763634, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86953318, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.89453125, + "step": 4226, + "time_per_iteration": 2.4583990573883057 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.0277667, + "balance_loss_mlp": 1.04317141, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 2.4872704745527168, + "language_loss": 0.70759654, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.72934765, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8671875, + "step": 4227, + "time_per_iteration": 2.532057762145996 + }, + { + "auxiliary_loss_clip": 0.01041509, + "auxiliary_loss_mlp": 0.01000975, + "balance_loss_clip": 0.9989962, + "balance_loss_mlp": 1.01186037, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9701035361715339, + "language_loss": 0.61865914, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63908398, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.296875, + "step": 4228, + "time_per_iteration": 2.9040682315826416 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.02026105, + "balance_loss_mlp": 1.04564357, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 4.885618231754604, + "language_loss": 0.86024547, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88198459, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 4229, + "time_per_iteration": 2.404157876968384 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.02696753, + "balance_loss_mlp": 1.0466435, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.8862111092995248, + "language_loss": 0.77280557, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79459918, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4230, + "time_per_iteration": 2.4956207275390625 + }, + { + "auxiliary_loss_clip": 0.01133757, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.02980483, + "balance_loss_mlp": 1.04598594, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.9381647251913205, + "language_loss": 0.75116754, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77297449, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4231, + "time_per_iteration": 2.4570302963256836 + }, + { + "auxiliary_loss_clip": 0.0113225, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.02253127, + "balance_loss_mlp": 1.04484463, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.3236339630790916, + "language_loss": 0.74055511, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76226532, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4232, + "time_per_iteration": 2.4537932872772217 + }, + { + "auxiliary_loss_clip": 0.01134838, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.04658151, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.8521853851823955, + "language_loss": 0.86557174, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88733703, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4233, + "time_per_iteration": 2.4943323135375977 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.02570057, + "balance_loss_mlp": 1.04215169, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.5280608213400515, + "language_loss": 0.74841732, + "learning_rate": 3.493918281539737e-06, + "loss": 0.7700814, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 4234, + "time_per_iteration": 2.541349172592163 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.02661681, + "balance_loss_mlp": 1.04286838, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.542232814469661, + "language_loss": 0.7489568, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77071816, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.90625, + "step": 4235, + "time_per_iteration": 2.5059099197387695 + }, + { + "auxiliary_loss_clip": 0.01141785, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02570069, + "balance_loss_mlp": 1.04655004, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0015253194085645, + "language_loss": 0.64487904, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6667403, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 4236, + "time_per_iteration": 2.512286424636841 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.01757693, + "balance_loss_mlp": 1.04509079, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5430935122242522, + "language_loss": 0.67046815, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69211423, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 4237, + "time_per_iteration": 2.455911636352539 + }, + { + "auxiliary_loss_clip": 0.01134325, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.02916634, + "balance_loss_mlp": 1.04509199, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9754127990153556, + "language_loss": 0.74863333, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77043563, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4238, + "time_per_iteration": 2.4770114421844482 + }, + { + "auxiliary_loss_clip": 0.01136693, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.02848125, + "balance_loss_mlp": 1.04734778, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8397193389954023, + "language_loss": 0.8033936, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82522523, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4239, + "time_per_iteration": 2.5087499618530273 + }, + { + "auxiliary_loss_clip": 0.01131893, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.02457762, + "balance_loss_mlp": 1.04512548, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.749971041952711, + "language_loss": 0.77208781, + "learning_rate": 3.492363614004407e-06, + "loss": 0.7938236, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4240, + "time_per_iteration": 2.4757072925567627 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.02463925, + "balance_loss_mlp": 1.04773092, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 2.0511352101670126, + "language_loss": 0.83254647, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85438156, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.93359375, + "step": 4241, + "time_per_iteration": 2.5062708854675293 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.02761221, + "balance_loss_mlp": 1.0463624, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6663950411566644, + "language_loss": 0.73410285, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75590432, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4242, + "time_per_iteration": 2.5570173263549805 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02249646, + "balance_loss_mlp": 1.04695976, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.4092613771466453, + "language_loss": 0.72371018, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74545956, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4243, + "time_per_iteration": 2.440492868423462 + }, + { + "auxiliary_loss_clip": 0.01136318, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02507675, + "balance_loss_mlp": 1.04668963, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 2.3937572910440847, + "language_loss": 0.81865323, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84043133, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4244, + "time_per_iteration": 2.4728784561157227 + }, + { + "auxiliary_loss_clip": 0.01044231, + "auxiliary_loss_mlp": 0.01002536, + "balance_loss_clip": 1.00084293, + "balance_loss_mlp": 1.01474202, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7400094393930867, + "language_loss": 0.5777986, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5982663, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.29492188, + "step": 4245, + "time_per_iteration": 3.155487537384033 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.03248656, + "balance_loss_mlp": 1.04526567, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 1.9776048921576397, + "language_loss": 0.65246034, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67430878, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90234375, + "step": 4246, + "time_per_iteration": 2.4889461994171143 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04366493, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.748925776992144, + "language_loss": 0.81467927, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83637214, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4247, + "time_per_iteration": 2.4680213928222656 + }, + { + "auxiliary_loss_clip": 0.0114026, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.02718902, + "balance_loss_mlp": 1.04570985, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.9702547035135165, + "language_loss": 0.83062297, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85248411, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9453125, + "step": 4248, + "time_per_iteration": 2.446810245513916 + }, + { + "auxiliary_loss_clip": 0.01136577, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.02793586, + "balance_loss_mlp": 1.04672599, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.21885342952208, + "language_loss": 0.84529531, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86711109, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4249, + "time_per_iteration": 2.4372382164001465 + }, + { + "auxiliary_loss_clip": 0.01044447, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.01503897, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7531523874953217, + "language_loss": 0.56312215, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58360648, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29492188, + "step": 4250, + "time_per_iteration": 3.047654628753662 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.02139914, + "balance_loss_mlp": 1.04434705, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.1374171101673243, + "language_loss": 0.80306417, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82478344, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4251, + "time_per_iteration": 2.4866387844085693 + }, + { + "auxiliary_loss_clip": 0.01042955, + "auxiliary_loss_mlp": 0.01004928, + "balance_loss_clip": 1.00307989, + "balance_loss_mlp": 1.01383376, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7958061962206047, + "language_loss": 0.66077995, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6812588, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.29296875, + "step": 4252, + "time_per_iteration": 3.117496967315674 + }, + { + "auxiliary_loss_clip": 0.011309, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.04373813, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.169743717969613, + "language_loss": 0.73382849, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75550812, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4253, + "time_per_iteration": 2.5709948539733887 + }, + { + "auxiliary_loss_clip": 0.01134729, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.02873516, + "balance_loss_mlp": 1.04698956, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9741012093631007, + "language_loss": 0.72927308, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75106484, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4254, + "time_per_iteration": 2.509932518005371 + }, + { + "auxiliary_loss_clip": 0.01133463, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.02636361, + "balance_loss_mlp": 1.04452896, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7290530974650873, + "language_loss": 0.80863065, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.8304013, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4255, + "time_per_iteration": 2.4473092555999756 + }, + { + "auxiliary_loss_clip": 0.01133499, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.02681875, + "balance_loss_mlp": 1.04673088, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.512169748685899, + "language_loss": 0.85572308, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87749302, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4256, + "time_per_iteration": 2.500788927078247 + }, + { + "auxiliary_loss_clip": 0.01136428, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.04482555, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 4.026866255210063, + "language_loss": 0.74821836, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77006626, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4257, + "time_per_iteration": 2.4511358737945557 + }, + { + "auxiliary_loss_clip": 0.01040508, + "auxiliary_loss_mlp": 0.01009541, + "balance_loss_clip": 1.00763345, + "balance_loss_mlp": 1.01154876, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8061088541165783, + "language_loss": 0.65227318, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67277366, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.2890625, + "step": 4258, + "time_per_iteration": 2.9953789710998535 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.04548264, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.622828615893818, + "language_loss": 0.7647177, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78641111, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.859375, + "step": 4259, + "time_per_iteration": 2.5079360008239746 + }, + { + "auxiliary_loss_clip": 0.01038142, + "auxiliary_loss_mlp": 0.01004188, + "balance_loss_clip": 1.00237584, + "balance_loss_mlp": 1.0093925, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7946947905759578, + "language_loss": 0.58501768, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60544097, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.28710938, + "step": 4260, + "time_per_iteration": 4.636982202529907 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.02759969, + "balance_loss_mlp": 1.04300261, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8728817118968701, + "language_loss": 0.76659095, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.7883479, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4261, + "time_per_iteration": 3.974956750869751 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.02280843, + "balance_loss_mlp": 1.04460573, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.6516780840688012, + "language_loss": 0.8323037, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85399115, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4262, + "time_per_iteration": 2.5251948833465576 + }, + { + "auxiliary_loss_clip": 0.01136997, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.03123951, + "balance_loss_mlp": 1.04404712, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.7380780768968016, + "language_loss": 0.74153852, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76339698, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 4263, + "time_per_iteration": 2.42657208442688 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01051758, + "balance_loss_clip": 1.03533101, + "balance_loss_mlp": 1.04720163, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.7828084139599185, + "language_loss": 0.82793939, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84979165, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4264, + "time_per_iteration": 2.534097194671631 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.02506804, + "balance_loss_mlp": 1.04660988, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.7080317762970965, + "language_loss": 0.7443161, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76608008, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 4265, + "time_per_iteration": 2.51088809967041 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01679027, + "balance_loss_mlp": 1.0425024, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.644190377842657, + "language_loss": 0.8153013, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83692515, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4266, + "time_per_iteration": 2.4706335067749023 + }, + { + "auxiliary_loss_clip": 0.01128054, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.01909137, + "balance_loss_mlp": 1.04252076, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6333370834261398, + "language_loss": 0.79287028, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81450343, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4267, + "time_per_iteration": 2.4819366931915283 + }, + { + "auxiliary_loss_clip": 0.01127366, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.02480555, + "balance_loss_mlp": 1.04406714, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.7559000109968124, + "language_loss": 0.78708017, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.80876482, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4268, + "time_per_iteration": 2.4778378009796143 + }, + { + "auxiliary_loss_clip": 0.0113239, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.02198434, + "balance_loss_mlp": 1.04507172, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 2.2514359992660204, + "language_loss": 0.68120348, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70290613, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4269, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01134604, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.0249877, + "balance_loss_mlp": 1.04593778, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 4.018282830570473, + "language_loss": 0.78496158, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80672824, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4270, + "time_per_iteration": 2.418912172317505 + }, + { + "auxiliary_loss_clip": 0.01139603, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.04711556, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.0502449379686256, + "language_loss": 0.68136632, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70314038, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4271, + "time_per_iteration": 2.5410749912261963 + }, + { + "auxiliary_loss_clip": 0.01137314, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.0325973, + "balance_loss_mlp": 1.04592848, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 4.518410893879739, + "language_loss": 0.8741951, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8960675, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4272, + "time_per_iteration": 2.5022568702697754 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02734506, + "balance_loss_mlp": 1.04770613, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.953603621991432, + "language_loss": 0.81442308, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83624303, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4273, + "time_per_iteration": 2.453834295272827 + }, + { + "auxiliary_loss_clip": 0.01131691, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.02492929, + "balance_loss_mlp": 1.04724693, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.682161023261006, + "language_loss": 0.77215779, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79389334, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4274, + "time_per_iteration": 2.486238956451416 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02061856, + "balance_loss_mlp": 1.04450369, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.8548211040661395, + "language_loss": 0.8401829, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86185247, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4275, + "time_per_iteration": 2.5145719051361084 + }, + { + "auxiliary_loss_clip": 0.01133209, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.02078438, + "balance_loss_mlp": 1.04492021, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 3.0116628321367678, + "language_loss": 0.78124094, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80294812, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4276, + "time_per_iteration": 2.533989906311035 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.02723646, + "balance_loss_mlp": 1.04575086, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.750550841347414, + "language_loss": 0.79439288, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81616199, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4277, + "time_per_iteration": 2.5131442546844482 + }, + { + "auxiliary_loss_clip": 0.01134263, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.01981688, + "balance_loss_mlp": 1.04671657, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.0431628844466543, + "language_loss": 0.78804862, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80975372, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4278, + "time_per_iteration": 2.4813432693481445 + }, + { + "auxiliary_loss_clip": 0.01137794, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.02299643, + "balance_loss_mlp": 1.04657972, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.020871128069371, + "language_loss": 0.74624676, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76802039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4279, + "time_per_iteration": 2.4989213943481445 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.03472984, + "balance_loss_mlp": 1.04528475, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.295268067844067, + "language_loss": 0.85406947, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87595296, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4280, + "time_per_iteration": 2.479163408279419 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.02362585, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.2211313624852447, + "language_loss": 0.78780186, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80957377, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4281, + "time_per_iteration": 2.463003158569336 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02277303, + "balance_loss_mlp": 1.0472312, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.9444978312753, + "language_loss": 0.87356091, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89530122, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4282, + "time_per_iteration": 2.5049889087677 + }, + { + "auxiliary_loss_clip": 0.01137104, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04648709, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5754049466604292, + "language_loss": 0.70172656, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72352946, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 4283, + "time_per_iteration": 2.520315408706665 + }, + { + "auxiliary_loss_clip": 0.01132284, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.02698922, + "balance_loss_mlp": 1.04772711, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 2.712350413324169, + "language_loss": 0.80323613, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82498109, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 4284, + "time_per_iteration": 2.483292579650879 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.02538466, + "balance_loss_mlp": 1.04674387, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.1742402973432893, + "language_loss": 0.70485193, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72659695, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4285, + "time_per_iteration": 2.564211130142212 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.02282071, + "balance_loss_mlp": 1.04953337, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.328286971317511, + "language_loss": 0.58380014, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60555518, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87890625, + "step": 4286, + "time_per_iteration": 2.4425430297851562 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02702272, + "balance_loss_mlp": 1.04858327, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.6452331987585218, + "language_loss": 0.64191288, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66374773, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 4287, + "time_per_iteration": 2.470015287399292 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04739881, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.0830358142366148, + "language_loss": 0.72029591, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74209672, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4288, + "time_per_iteration": 2.4983417987823486 + }, + { + "auxiliary_loss_clip": 0.01135736, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.02263355, + "balance_loss_mlp": 1.04882312, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.9870049696680936, + "language_loss": 0.76965904, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79140055, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4289, + "time_per_iteration": 2.4997475147247314 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02311635, + "balance_loss_mlp": 1.04562807, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.946897603323323, + "language_loss": 0.85123539, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87298238, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4290, + "time_per_iteration": 2.454871416091919 + }, + { + "auxiliary_loss_clip": 0.01140117, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.03159952, + "balance_loss_mlp": 1.04959655, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.195715426849753, + "language_loss": 0.72170424, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74361074, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4291, + "time_per_iteration": 2.4512693881988525 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02436781, + "balance_loss_mlp": 1.05002344, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4805881311796423, + "language_loss": 0.80718195, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82901633, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4292, + "time_per_iteration": 2.469034433364868 + }, + { + "auxiliary_loss_clip": 0.01141659, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02767503, + "balance_loss_mlp": 1.05171072, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 7.501455001056755, + "language_loss": 0.67646754, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69833219, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4293, + "time_per_iteration": 2.5785787105560303 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.02660704, + "balance_loss_mlp": 1.04503, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.9136357435420137, + "language_loss": 0.75409257, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77581787, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4294, + "time_per_iteration": 2.5044636726379395 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.02974749, + "balance_loss_mlp": 1.04808116, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.168244565891273, + "language_loss": 0.81049722, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83233249, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4295, + "time_per_iteration": 2.5875558853149414 + }, + { + "auxiliary_loss_clip": 0.01140472, + "auxiliary_loss_mlp": 0.01046123, + "balance_loss_clip": 1.02797985, + "balance_loss_mlp": 1.04796624, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.1973562505628026, + "language_loss": 0.72515166, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74701762, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.92578125, + "step": 4296, + "time_per_iteration": 2.535693407058716 + }, + { + "auxiliary_loss_clip": 0.01138613, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.02322531, + "balance_loss_mlp": 1.04918242, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.8330269406357795, + "language_loss": 0.86766148, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88944662, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4297, + "time_per_iteration": 2.5001306533813477 + }, + { + "auxiliary_loss_clip": 0.01137068, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.03208232, + "balance_loss_mlp": 1.04755223, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 2.2622150737063955, + "language_loss": 0.84706259, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86891592, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4298, + "time_per_iteration": 2.489917278289795 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02069676, + "balance_loss_mlp": 1.04739285, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.0676974538336266, + "language_loss": 0.83596241, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85770899, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4299, + "time_per_iteration": 2.4274845123291016 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.01985788, + "balance_loss_mlp": 1.04795814, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.477231855960524, + "language_loss": 0.82685435, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84856081, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4300, + "time_per_iteration": 2.4730846881866455 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.02492332, + "balance_loss_mlp": 1.04620934, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.2046546957653077, + "language_loss": 0.67186987, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69365752, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 4301, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.02191997, + "balance_loss_mlp": 1.04805672, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.459016606739088, + "language_loss": 0.80929118, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83110034, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 4302, + "time_per_iteration": 5.438407897949219 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.02535129, + "balance_loss_mlp": 1.04789591, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 2.9925401825996545, + "language_loss": 0.92246419, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94426608, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4303, + "time_per_iteration": 2.514573574066162 + }, + { + "auxiliary_loss_clip": 0.01139372, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.020046, + "balance_loss_mlp": 1.04932761, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8070234866344623, + "language_loss": 0.67034984, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69210964, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4304, + "time_per_iteration": 2.540682315826416 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03355694, + "balance_loss_mlp": 1.04595923, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.73594521825367, + "language_loss": 0.72829735, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75018799, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4305, + "time_per_iteration": 2.580801248550415 + }, + { + "auxiliary_loss_clip": 0.01138565, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 1.04731607, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.196623082948333, + "language_loss": 0.75595653, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77775478, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4306, + "time_per_iteration": 2.44267201423645 + }, + { + "auxiliary_loss_clip": 0.01045399, + "auxiliary_loss_mlp": 0.01003539, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.01567113, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8506593293873899, + "language_loss": 0.5717386, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59222794, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.296875, + "step": 4307, + "time_per_iteration": 3.0457189083099365 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.02389181, + "balance_loss_mlp": 1.04729199, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.7695447826328226, + "language_loss": 0.71543598, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73719311, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4308, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.0113812, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.02439809, + "balance_loss_mlp": 1.04625905, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 2.097007373458932, + "language_loss": 0.84195936, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86375141, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4309, + "time_per_iteration": 2.458937883377075 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02609062, + "balance_loss_mlp": 1.04893243, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.520786669442297, + "language_loss": 0.8451637, + "learning_rate": 3.474075855228966e-06, + "loss": 0.8669641, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4310, + "time_per_iteration": 2.453946828842163 + }, + { + "auxiliary_loss_clip": 0.0113925, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.02706194, + "balance_loss_mlp": 1.04705715, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.3904067628525305, + "language_loss": 0.77478111, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79660702, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 4311, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.0248189, + "balance_loss_mlp": 1.04691362, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 3.1447136536803852, + "language_loss": 0.72220832, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74400491, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 4312, + "time_per_iteration": 2.5275332927703857 + }, + { + "auxiliary_loss_clip": 0.01134993, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.02434921, + "balance_loss_mlp": 1.04480851, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 2.2264539824076683, + "language_loss": 0.69908661, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72084355, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4313, + "time_per_iteration": 2.479011058807373 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02800131, + "balance_loss_mlp": 1.04467726, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.7186396349483555, + "language_loss": 0.80486274, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82663202, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4314, + "time_per_iteration": 2.443934679031372 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.03030992, + "balance_loss_mlp": 1.04506671, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.0498851814527863, + "language_loss": 0.6687156, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69057429, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 4315, + "time_per_iteration": 2.5375983715057373 + }, + { + "auxiliary_loss_clip": 0.01132586, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.04426146, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5303062780919283, + "language_loss": 0.7911852, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81291974, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4316, + "time_per_iteration": 2.448997735977173 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.02333546, + "balance_loss_mlp": 1.0446136, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.687308210321376, + "language_loss": 0.77601087, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79777247, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4317, + "time_per_iteration": 2.5545339584350586 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.03160882, + "balance_loss_mlp": 1.04599953, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.5535432929686883, + "language_loss": 0.77773315, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79958701, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4318, + "time_per_iteration": 2.450573682785034 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02717471, + "balance_loss_mlp": 1.04450393, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.801084946435003, + "language_loss": 0.76197278, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78376144, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4319, + "time_per_iteration": 2.5243709087371826 + }, + { + "auxiliary_loss_clip": 0.01131874, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04500592, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8511829127720039, + "language_loss": 0.76338619, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78507876, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4320, + "time_per_iteration": 2.4792070388793945 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.02205038, + "balance_loss_mlp": 1.04641151, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.7592602092397844, + "language_loss": 0.71143925, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73317981, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4321, + "time_per_iteration": 2.5381112098693848 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.02813125, + "balance_loss_mlp": 1.04517424, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.037757848326605, + "language_loss": 0.74483943, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76666641, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4322, + "time_per_iteration": 2.4379777908325195 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.03059244, + "balance_loss_mlp": 1.04368353, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.9467125010752846, + "language_loss": 0.73674595, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75856531, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4323, + "time_per_iteration": 2.517399549484253 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.0263952, + "balance_loss_mlp": 1.04524922, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.197207179409235, + "language_loss": 0.6710211, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69287789, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 4324, + "time_per_iteration": 2.478419303894043 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.02839708, + "balance_loss_mlp": 1.04456055, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.3342631450552838, + "language_loss": 0.70809424, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72985667, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8828125, + "step": 4325, + "time_per_iteration": 2.5444648265838623 + }, + { + "auxiliary_loss_clip": 0.01133012, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02139568, + "balance_loss_mlp": 1.04295206, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 2.476658211689484, + "language_loss": 0.73041123, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7521174, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4326, + "time_per_iteration": 2.5281147956848145 + }, + { + "auxiliary_loss_clip": 0.01127256, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.02424729, + "balance_loss_mlp": 1.04237306, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.820673081097861, + "language_loss": 0.8661378, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88779688, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 4327, + "time_per_iteration": 2.4929087162017822 + }, + { + "auxiliary_loss_clip": 0.01138344, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.03121042, + "balance_loss_mlp": 1.04679346, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 2.002075266566112, + "language_loss": 0.80111909, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82299662, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 4328, + "time_per_iteration": 2.451131582260132 + }, + { + "auxiliary_loss_clip": 0.0112995, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04219353, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.514483384647774, + "language_loss": 0.87428784, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89598739, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4329, + "time_per_iteration": 2.522368907928467 + }, + { + "auxiliary_loss_clip": 0.01132983, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.04585731, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.096665977126354, + "language_loss": 0.77746803, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79917884, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4330, + "time_per_iteration": 2.4771482944488525 + }, + { + "auxiliary_loss_clip": 0.01134796, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.04525268, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 2.4595446714184654, + "language_loss": 0.75248575, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77430975, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4331, + "time_per_iteration": 2.5284199714660645 + }, + { + "auxiliary_loss_clip": 0.01137533, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02158558, + "balance_loss_mlp": 1.05026567, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3491085383994963, + "language_loss": 0.69003588, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71178281, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4332, + "time_per_iteration": 2.476125478744507 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.02453184, + "balance_loss_mlp": 1.04542089, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.3270567941112854, + "language_loss": 0.79674375, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81851673, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91015625, + "step": 4333, + "time_per_iteration": 2.5234756469726562 + }, + { + "auxiliary_loss_clip": 0.01129676, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.02791548, + "balance_loss_mlp": 1.04336357, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.7608965931322442, + "language_loss": 0.80725265, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82898307, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4334, + "time_per_iteration": 2.4361026287078857 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02128482, + "balance_loss_mlp": 1.04452491, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.8337144126432974, + "language_loss": 0.80039275, + "learning_rate": 3.46747795800024e-06, + "loss": 0.822101, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4335, + "time_per_iteration": 2.5246174335479736 + }, + { + "auxiliary_loss_clip": 0.01043695, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.02024579, + "balance_loss_mlp": 1.01431763, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.849908687169067, + "language_loss": 0.60851145, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62916911, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.29296875, + "step": 4336, + "time_per_iteration": 3.0349080562591553 + }, + { + "auxiliary_loss_clip": 0.01136323, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_clip": 1.03172541, + "balance_loss_mlp": 1.04599738, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 6.860825703537795, + "language_loss": 0.77407634, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79591858, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 4337, + "time_per_iteration": 2.4549763202667236 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02755404, + "balance_loss_mlp": 1.04531193, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1553767319060646, + "language_loss": 0.74116468, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76296723, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4338, + "time_per_iteration": 2.4109654426574707 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.02495456, + "balance_loss_mlp": 1.0451895, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.414973208379154, + "language_loss": 0.80645537, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82825273, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 4339, + "time_per_iteration": 2.4671595096588135 + }, + { + "auxiliary_loss_clip": 0.01133141, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.02287948, + "balance_loss_mlp": 1.04559159, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5844023841754464, + "language_loss": 0.76694596, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78865802, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4340, + "time_per_iteration": 2.4803388118743896 + }, + { + "auxiliary_loss_clip": 0.01137352, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02185678, + "balance_loss_mlp": 1.04666209, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.5290989424491332, + "language_loss": 0.82436979, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84612167, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90625, + "step": 4341, + "time_per_iteration": 2.5263681411743164 + }, + { + "auxiliary_loss_clip": 0.01134552, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.04563117, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.4125290221035773, + "language_loss": 0.76542389, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78716314, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4342, + "time_per_iteration": 2.5043585300445557 + }, + { + "auxiliary_loss_clip": 0.01132446, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.01570523, + "balance_loss_mlp": 1.04324019, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8018778201456855, + "language_loss": 0.66747689, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68912935, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 4343, + "time_per_iteration": 2.6470234394073486 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02467799, + "balance_loss_mlp": 1.04494977, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0339901471708646, + "language_loss": 0.73817015, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75994843, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4344, + "time_per_iteration": 5.431513071060181 + }, + { + "auxiliary_loss_clip": 0.0113578, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.02257776, + "balance_loss_mlp": 1.04692459, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 3.7636245605224072, + "language_loss": 0.86394477, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88568532, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 4345, + "time_per_iteration": 2.4908552169799805 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02422452, + "balance_loss_mlp": 1.04427588, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.88977116996907, + "language_loss": 0.7612443, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78293997, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.85546875, + "step": 4346, + "time_per_iteration": 2.4966983795166016 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02547407, + "balance_loss_mlp": 1.04483962, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5621162347417301, + "language_loss": 0.75868237, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78042835, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4347, + "time_per_iteration": 2.5392181873321533 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04549503, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4125954345922265, + "language_loss": 0.73354399, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75522006, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4348, + "time_per_iteration": 2.5206878185272217 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.02286005, + "balance_loss_mlp": 1.04503882, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.8182616406273437, + "language_loss": 0.91063923, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93238091, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4349, + "time_per_iteration": 2.526134967803955 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.02663279, + "balance_loss_mlp": 1.0461632, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7312169360414529, + "language_loss": 0.79879099, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82054067, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4350, + "time_per_iteration": 2.4420506954193115 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.02388072, + "balance_loss_mlp": 1.04430401, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8647374515536046, + "language_loss": 0.62139511, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64308536, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4351, + "time_per_iteration": 2.4613640308380127 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02241123, + "balance_loss_mlp": 1.04469466, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.09308554357217, + "language_loss": 0.83596927, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85769767, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4352, + "time_per_iteration": 2.4712979793548584 + }, + { + "auxiliary_loss_clip": 0.01045226, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 0.9986006, + "balance_loss_mlp": 1.01526213, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8010954727993301, + "language_loss": 0.70645392, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72690976, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.29882812, + "step": 4353, + "time_per_iteration": 3.026418447494507 + }, + { + "auxiliary_loss_clip": 0.01132608, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.03045464, + "balance_loss_mlp": 1.04494369, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.7700850953213416, + "language_loss": 0.77393121, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79573292, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4354, + "time_per_iteration": 2.535482883453369 + }, + { + "auxiliary_loss_clip": 0.01138552, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.02826262, + "balance_loss_mlp": 1.04513574, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.1625978203859826, + "language_loss": 0.68280292, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70463413, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 4355, + "time_per_iteration": 2.5276527404785156 + }, + { + "auxiliary_loss_clip": 0.01130838, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.02207148, + "balance_loss_mlp": 1.04375613, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9702640724114775, + "language_loss": 0.67509294, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69679523, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4356, + "time_per_iteration": 2.454436779022217 + }, + { + "auxiliary_loss_clip": 0.01043638, + "auxiliary_loss_mlp": 0.01003266, + "balance_loss_clip": 1.00139415, + "balance_loss_mlp": 1.01376009, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6781381277043278, + "language_loss": 0.53156137, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55203032, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.29882812, + "step": 4357, + "time_per_iteration": 2.99239444732666 + }, + { + "auxiliary_loss_clip": 0.01138081, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02955151, + "balance_loss_mlp": 1.04608119, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.843205511563007, + "language_loss": 0.84329486, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86513096, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.921875, + "step": 4358, + "time_per_iteration": 2.511441707611084 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02476251, + "balance_loss_mlp": 1.0450834, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.1805365254718367, + "language_loss": 0.67303276, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69484085, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4359, + "time_per_iteration": 2.5318756103515625 + }, + { + "auxiliary_loss_clip": 0.0113089, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02725601, + "balance_loss_mlp": 1.04242957, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.947910834650985, + "language_loss": 0.78673261, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80846429, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4360, + "time_per_iteration": 2.4551331996917725 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04250073, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.9921513845886445, + "language_loss": 0.68169516, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70338809, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4361, + "time_per_iteration": 2.57106351852417 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.04022598, + "balance_loss_mlp": 1.04400647, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.9312179198305752, + "language_loss": 0.84310883, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86503732, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4362, + "time_per_iteration": 2.430020570755005 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.02282345, + "balance_loss_mlp": 1.04637551, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.792780117353334, + "language_loss": 0.65294504, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67468411, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4363, + "time_per_iteration": 2.546393632888794 + }, + { + "auxiliary_loss_clip": 0.01042076, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.00252998, + "balance_loss_mlp": 1.0123173, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8867533167936222, + "language_loss": 0.61098528, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63144922, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.296875, + "step": 4364, + "time_per_iteration": 3.150812864303589 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.03358722, + "balance_loss_mlp": 1.0468297, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.424942653514092, + "language_loss": 0.71549827, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73739558, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4365, + "time_per_iteration": 2.493540048599243 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01045115, + "balance_loss_clip": 1.02917075, + "balance_loss_mlp": 1.04654169, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.8316261966241354, + "language_loss": 0.76925993, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79106045, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4366, + "time_per_iteration": 2.536853313446045 + }, + { + "auxiliary_loss_clip": 0.01135829, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.02684951, + "balance_loss_mlp": 1.04666197, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 2.2091260788228975, + "language_loss": 0.75838757, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78017008, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.890625, + "step": 4367, + "time_per_iteration": 2.4576163291931152 + }, + { + "auxiliary_loss_clip": 0.01131307, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.02454233, + "balance_loss_mlp": 1.04452682, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.1913456464974392, + "language_loss": 0.69633925, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71805596, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4368, + "time_per_iteration": 2.4301586151123047 + }, + { + "auxiliary_loss_clip": 0.01130278, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.02970243, + "balance_loss_mlp": 1.04319167, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.7035150195415922, + "language_loss": 0.78589904, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80766863, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8671875, + "step": 4369, + "time_per_iteration": 2.489316701889038 + }, + { + "auxiliary_loss_clip": 0.01132105, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.02653408, + "balance_loss_mlp": 1.04431546, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 2.0413446884893047, + "language_loss": 0.83486217, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85661036, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4370, + "time_per_iteration": 2.4422430992126465 + }, + { + "auxiliary_loss_clip": 0.01136913, + "auxiliary_loss_mlp": 0.01050187, + "balance_loss_clip": 1.03060055, + "balance_loss_mlp": 1.04530215, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 2.3340239620956287, + "language_loss": 0.70963454, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73150551, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9140625, + "step": 4371, + "time_per_iteration": 2.5099778175354004 + }, + { + "auxiliary_loss_clip": 0.01043374, + "auxiliary_loss_mlp": 0.00999769, + "balance_loss_clip": 0.99784929, + "balance_loss_mlp": 1.01338005, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7657034729714577, + "language_loss": 0.56477904, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58521044, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.30078125, + "step": 4372, + "time_per_iteration": 3.244558572769165 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.02283084, + "balance_loss_mlp": 1.04335582, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.7597219251079876, + "language_loss": 0.77415234, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79583991, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4373, + "time_per_iteration": 2.517784833908081 + }, + { + "auxiliary_loss_clip": 0.01131814, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.04454422, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 4.0873872332994905, + "language_loss": 0.71538949, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73712265, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4374, + "time_per_iteration": 2.442124605178833 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_clip": 1.02435732, + "balance_loss_mlp": 1.0458709, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 2.271567992891854, + "language_loss": 0.80945283, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83121061, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4375, + "time_per_iteration": 2.4889678955078125 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.0265336, + "balance_loss_mlp": 1.04366982, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.3689389683703, + "language_loss": 0.65721256, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.67893362, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4376, + "time_per_iteration": 2.563701629638672 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.0104592, + "balance_loss_clip": 1.02940989, + "balance_loss_mlp": 1.04445267, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.8646607453842572, + "language_loss": 0.69517326, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71697748, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4377, + "time_per_iteration": 2.486117124557495 + }, + { + "auxiliary_loss_clip": 0.01134243, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.02786613, + "balance_loss_mlp": 1.04500914, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.711844873276418, + "language_loss": 0.7866202, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.80840576, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4378, + "time_per_iteration": 2.7608227729797363 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.03000844, + "balance_loss_mlp": 1.04554546, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.6216377344963004, + "language_loss": 0.76320505, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78498781, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4379, + "time_per_iteration": 2.4329168796539307 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.04633284, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.3003567904549156, + "language_loss": 0.78237861, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.8041752, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.91796875, + "step": 4380, + "time_per_iteration": 2.5423548221588135 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02419698, + "balance_loss_mlp": 1.0444113, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.288842357619654, + "language_loss": 0.63811409, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65987766, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4381, + "time_per_iteration": 2.5096213817596436 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.02148831, + "balance_loss_mlp": 1.04359913, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8729093449566216, + "language_loss": 0.82822418, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84991652, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4382, + "time_per_iteration": 2.4691555500030518 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.02902842, + "balance_loss_mlp": 1.04550982, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.126733729537993, + "language_loss": 0.69686437, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71871686, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 4383, + "time_per_iteration": 2.5923891067504883 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.02732468, + "balance_loss_mlp": 1.04591441, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.929045699346076, + "language_loss": 0.69191134, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71369672, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 4384, + "time_per_iteration": 2.5067081451416016 + }, + { + "auxiliary_loss_clip": 0.01132133, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.02319217, + "balance_loss_mlp": 1.04400492, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.1647401570854075, + "language_loss": 0.6994158, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72113448, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4385, + "time_per_iteration": 4.062510251998901 + }, + { + "auxiliary_loss_clip": 0.01138578, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02551472, + "balance_loss_mlp": 1.04978371, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 2.0926426044309543, + "language_loss": 0.85188037, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87369245, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4386, + "time_per_iteration": 3.9604547023773193 + }, + { + "auxiliary_loss_clip": 0.0113699, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02487707, + "balance_loss_mlp": 1.04755282, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.2248904155103637, + "language_loss": 0.77169371, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79347688, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4387, + "time_per_iteration": 2.472367286682129 + }, + { + "auxiliary_loss_clip": 0.01137279, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0305903, + "balance_loss_mlp": 1.04989982, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 3.996041212149396, + "language_loss": 0.76269597, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78453362, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4388, + "time_per_iteration": 2.4858386516571045 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01924086, + "balance_loss_mlp": 1.04387724, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.9510825560869567, + "language_loss": 0.86210662, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88379163, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4389, + "time_per_iteration": 2.508162260055542 + }, + { + "auxiliary_loss_clip": 0.0104392, + "auxiliary_loss_mlp": 0.01009323, + "balance_loss_clip": 1.00736833, + "balance_loss_mlp": 1.01341343, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8096176904924934, + "language_loss": 0.60333931, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6238718, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3046875, + "step": 4390, + "time_per_iteration": 3.0593924522399902 + }, + { + "auxiliary_loss_clip": 0.01135834, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.02213633, + "balance_loss_mlp": 1.04522729, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.7836890720002585, + "language_loss": 0.77702433, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79876828, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4391, + "time_per_iteration": 2.5331051349639893 + }, + { + "auxiliary_loss_clip": 0.0104332, + "auxiliary_loss_mlp": 0.01003932, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.01322889, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9020745061185262, + "language_loss": 0.58752227, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60799479, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.30078125, + "step": 4392, + "time_per_iteration": 3.047438144683838 + }, + { + "auxiliary_loss_clip": 0.01140884, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.03039694, + "balance_loss_mlp": 1.04925656, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 2.5811541881681697, + "language_loss": 0.68459845, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70647496, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 4393, + "time_per_iteration": 2.5537288188934326 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.04662204, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.8702197697463565, + "language_loss": 0.83116519, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85297221, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.93359375, + "step": 4394, + "time_per_iteration": 2.421211004257202 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.02357125, + "balance_loss_mlp": 1.04951847, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.049654769643576, + "language_loss": 0.70211649, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72397399, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9375, + "step": 4395, + "time_per_iteration": 2.522111654281616 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.04784906, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.822626622734132, + "language_loss": 0.86866504, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89038229, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4396, + "time_per_iteration": 2.4450392723083496 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.02226114, + "balance_loss_mlp": 1.01312816, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7917805441344085, + "language_loss": 0.54999918, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57066846, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4397, + "time_per_iteration": 2.8438708782196045 + }, + { + "auxiliary_loss_clip": 0.01134821, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03567195, + "balance_loss_mlp": 1.04701614, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0493441687219724, + "language_loss": 0.77840483, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80027676, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4398, + "time_per_iteration": 2.562499523162842 + }, + { + "auxiliary_loss_clip": 0.01141073, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.02262306, + "balance_loss_mlp": 1.05005002, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 2.041566803030235, + "language_loss": 0.67037976, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69219166, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4399, + "time_per_iteration": 2.487778663635254 + }, + { + "auxiliary_loss_clip": 0.01128661, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02288818, + "balance_loss_mlp": 1.04565811, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 2.1160884119586303, + "language_loss": 0.86152196, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88318777, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4400, + "time_per_iteration": 2.4837841987609863 + }, + { + "auxiliary_loss_clip": 0.01138875, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.02188635, + "balance_loss_mlp": 1.04813862, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.751022626956878, + "language_loss": 0.75779396, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77957898, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4401, + "time_per_iteration": 2.548297166824341 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.03236771, + "balance_loss_mlp": 1.04606974, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.9215434150559794, + "language_loss": 0.88267732, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90456831, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4402, + "time_per_iteration": 2.4422647953033447 + }, + { + "auxiliary_loss_clip": 0.01135603, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.03151679, + "balance_loss_mlp": 1.04594266, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8196807161845878, + "language_loss": 0.78123331, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80306977, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4403, + "time_per_iteration": 2.587623357772827 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.04440784, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9946669841411302, + "language_loss": 0.87767446, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.89943182, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 4404, + "time_per_iteration": 2.492913246154785 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.04683399, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7395093434050468, + "language_loss": 0.7593658, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78111804, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 4405, + "time_per_iteration": 2.508970260620117 + }, + { + "auxiliary_loss_clip": 0.01138042, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02166891, + "balance_loss_mlp": 1.04870844, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9681610481113616, + "language_loss": 0.69979274, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72156149, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4406, + "time_per_iteration": 2.4548041820526123 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.03255999, + "balance_loss_mlp": 1.04781294, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7455123192469384, + "language_loss": 0.83764267, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85946929, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4407, + "time_per_iteration": 2.5359292030334473 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01566172, + "balance_loss_mlp": 1.04678226, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.7942044569518307, + "language_loss": 0.76068008, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78235412, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4408, + "time_per_iteration": 2.6124041080474854 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.04918611, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8724720588087471, + "language_loss": 0.70920485, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73091388, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4409, + "time_per_iteration": 2.6539366245269775 + }, + { + "auxiliary_loss_clip": 0.01136441, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.04666233, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.7884535623295956, + "language_loss": 0.73085511, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75258988, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 4410, + "time_per_iteration": 2.545083999633789 + }, + { + "auxiliary_loss_clip": 0.01139704, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.03218508, + "balance_loss_mlp": 1.04741001, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.9280641145018393, + "language_loss": 0.73272175, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75461018, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4411, + "time_per_iteration": 2.4818248748779297 + }, + { + "auxiliary_loss_clip": 0.01137094, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.04815316, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.073752901007566, + "language_loss": 0.82294202, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84474051, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.88671875, + "step": 4412, + "time_per_iteration": 2.56634521484375 + }, + { + "auxiliary_loss_clip": 0.01134293, + "auxiliary_loss_mlp": 0.01047936, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.04541004, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.721718037322793, + "language_loss": 0.74245501, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76427728, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4413, + "time_per_iteration": 2.4994029998779297 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.01659799, + "balance_loss_mlp": 1.0160358, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8825812455559224, + "language_loss": 0.56986731, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59051728, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.30664062, + "step": 4414, + "time_per_iteration": 2.9884986877441406 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02605712, + "balance_loss_mlp": 1.04307461, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8727128035200367, + "language_loss": 0.74535894, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76705366, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4415, + "time_per_iteration": 2.5531253814697266 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.04656732, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.3504707987247917, + "language_loss": 0.86662048, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88844568, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4416, + "time_per_iteration": 2.4751384258270264 + }, + { + "auxiliary_loss_clip": 0.0113975, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02439594, + "balance_loss_mlp": 1.0492208, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6281293305848954, + "language_loss": 0.76152384, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78334266, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4417, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01135215, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.02167702, + "balance_loss_mlp": 1.04778051, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.7397383944852411, + "language_loss": 0.79984045, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82159042, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4418, + "time_per_iteration": 2.539454460144043 + }, + { + "auxiliary_loss_clip": 0.01138688, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.02983057, + "balance_loss_mlp": 1.04861307, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.7780034581995965, + "language_loss": 0.67397833, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69583082, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 4419, + "time_per_iteration": 2.461444616317749 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_clip": 1.02739358, + "balance_loss_mlp": 1.04920876, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.097903587873874, + "language_loss": 0.79365611, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81550193, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8984375, + "step": 4420, + "time_per_iteration": 2.5908427238464355 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01049212, + "balance_loss_clip": 1.02990031, + "balance_loss_mlp": 1.0493983, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.1223383047232933, + "language_loss": 0.81612432, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83803296, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.921875, + "step": 4421, + "time_per_iteration": 2.4869320392608643 + }, + { + "auxiliary_loss_clip": 0.01134642, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02027202, + "balance_loss_mlp": 1.04734015, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5724937400793966, + "language_loss": 0.65278006, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67449689, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4422, + "time_per_iteration": 2.7370638847351074 + }, + { + "auxiliary_loss_clip": 0.01138513, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02965808, + "balance_loss_mlp": 1.04750621, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.411979213410041, + "language_loss": 0.73841226, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76025832, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 4423, + "time_per_iteration": 2.5510191917419434 + }, + { + "auxiliary_loss_clip": 0.01136367, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.03017163, + "balance_loss_mlp": 1.04504442, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6317340067044743, + "language_loss": 0.77703154, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79886127, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4424, + "time_per_iteration": 2.809495449066162 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.02675951, + "balance_loss_mlp": 1.04695249, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.064218808714238, + "language_loss": 0.79345673, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 4425, + "time_per_iteration": 2.48149037361145 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_clip": 1.03138816, + "balance_loss_mlp": 1.04685736, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.774406296589384, + "language_loss": 0.80463314, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82643557, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4426, + "time_per_iteration": 2.5968613624572754 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.03180957, + "balance_loss_mlp": 1.04982209, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.8207507571493768, + "language_loss": 0.77337295, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79524601, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4427, + "time_per_iteration": 4.045380353927612 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01760316, + "balance_loss_mlp": 1.04737306, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.8400253790543033, + "language_loss": 0.76800078, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78966737, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4428, + "time_per_iteration": 4.018831491470337 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.01858354, + "balance_loss_mlp": 1.04529297, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.9075878866801723, + "language_loss": 0.83010298, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8517977, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4429, + "time_per_iteration": 2.576535940170288 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.02644563, + "balance_loss_mlp": 1.04664719, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 3.2197583620662082, + "language_loss": 0.72143924, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74320537, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87109375, + "step": 4430, + "time_per_iteration": 2.5262365341186523 + }, + { + "auxiliary_loss_clip": 0.01136153, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.04667306, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.382555523964676, + "language_loss": 0.81635833, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83814788, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4431, + "time_per_iteration": 2.5135624408721924 + }, + { + "auxiliary_loss_clip": 0.01142285, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03360736, + "balance_loss_mlp": 1.04865289, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.34486467491615, + "language_loss": 0.76153386, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78346616, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 4432, + "time_per_iteration": 2.469515562057495 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.02711606, + "balance_loss_mlp": 1.04703665, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.148919041496035, + "language_loss": 0.82521772, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84703225, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4433, + "time_per_iteration": 2.540174961090088 + }, + { + "auxiliary_loss_clip": 0.01138849, + "auxiliary_loss_mlp": 0.01048222, + "balance_loss_clip": 1.03065097, + "balance_loss_mlp": 1.04955435, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.091984027516481, + "language_loss": 0.76638913, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78825986, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4434, + "time_per_iteration": 2.549769878387451 + }, + { + "auxiliary_loss_clip": 0.01133542, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.02770376, + "balance_loss_mlp": 1.04645348, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.251252650424801, + "language_loss": 0.82632279, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84808934, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4435, + "time_per_iteration": 2.5329744815826416 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.0105698, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.04742312, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.2611652281579397, + "language_loss": 0.87278962, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89476645, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9296875, + "step": 4436, + "time_per_iteration": 2.5375254154205322 + }, + { + "auxiliary_loss_clip": 0.01136328, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.02670658, + "balance_loss_mlp": 1.04566383, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4304916595737875, + "language_loss": 0.78941, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81120378, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4437, + "time_per_iteration": 2.591017007827759 + }, + { + "auxiliary_loss_clip": 0.01134502, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.02533066, + "balance_loss_mlp": 1.04595256, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.0731379310987412, + "language_loss": 0.63412011, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65588087, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4438, + "time_per_iteration": 2.6429452896118164 + }, + { + "auxiliary_loss_clip": 0.01137556, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.02125144, + "balance_loss_mlp": 1.04869223, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8133794638407341, + "language_loss": 0.75628942, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77803481, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4439, + "time_per_iteration": 2.5296032428741455 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02052629, + "balance_loss_mlp": 1.04913759, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.7792140134846064, + "language_loss": 0.71444011, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.7362318, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9140625, + "step": 4440, + "time_per_iteration": 2.5714335441589355 + }, + { + "auxiliary_loss_clip": 0.01139576, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.02757502, + "balance_loss_mlp": 1.04816949, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.8363906583736056, + "language_loss": 0.66291904, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 4441, + "time_per_iteration": 2.522589683532715 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.02654862, + "balance_loss_mlp": 1.04803538, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5597318548365904, + "language_loss": 0.76451373, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78633213, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.89453125, + "step": 4442, + "time_per_iteration": 2.5659492015838623 + }, + { + "auxiliary_loss_clip": 0.01060214, + "auxiliary_loss_mlp": 0.0100059, + "balance_loss_clip": 0.99855101, + "balance_loss_mlp": 1.02895594, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.912864167592289, + "language_loss": 0.61270142, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63330936, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.3125, + "step": 4443, + "time_per_iteration": 3.0256776809692383 + }, + { + "auxiliary_loss_clip": 0.01140806, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01968026, + "balance_loss_mlp": 1.0495882, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.5525166591100914, + "language_loss": 0.76200545, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78377306, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91015625, + "step": 4444, + "time_per_iteration": 2.7414674758911133 + }, + { + "auxiliary_loss_clip": 0.0114013, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.04932773, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 3.16165776963455, + "language_loss": 0.80212528, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82393491, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4445, + "time_per_iteration": 2.5349111557006836 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.02232134, + "balance_loss_mlp": 1.04797101, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.3952290716593825, + "language_loss": 0.89144397, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 4446, + "time_per_iteration": 2.5512521266937256 + }, + { + "auxiliary_loss_clip": 0.01140462, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.0311892, + "balance_loss_mlp": 1.04977763, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.831363923725005, + "language_loss": 0.68259656, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70447719, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4447, + "time_per_iteration": 2.5752837657928467 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.04972827, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9862084341014827, + "language_loss": 0.82976532, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85157394, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4448, + "time_per_iteration": 2.6524059772491455 + }, + { + "auxiliary_loss_clip": 0.01137667, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.03110301, + "balance_loss_mlp": 1.04973495, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 2.185461436072074, + "language_loss": 0.84288895, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86475068, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87890625, + "step": 4449, + "time_per_iteration": 2.5167598724365234 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02892506, + "balance_loss_mlp": 1.05114913, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.9936425417360089, + "language_loss": 0.84260273, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86456501, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.97265625, + "step": 4450, + "time_per_iteration": 2.555941343307495 + }, + { + "auxiliary_loss_clip": 0.01133946, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.04674196, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.900524277018137, + "language_loss": 0.81065774, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83240664, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4451, + "time_per_iteration": 2.5289859771728516 + }, + { + "auxiliary_loss_clip": 0.01140947, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.02593148, + "balance_loss_mlp": 1.05186319, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8040621200757803, + "language_loss": 0.86401796, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88584578, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4452, + "time_per_iteration": 2.617918014526367 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.02055311, + "balance_loss_mlp": 1.05132198, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.9731948573099198, + "language_loss": 0.83129871, + "learning_rate": 3.435869031622194e-06, + "loss": 0.8531099, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4453, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.0113897, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.02936745, + "balance_loss_mlp": 1.04995108, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.62656613015929, + "language_loss": 0.79744816, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81930768, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4454, + "time_per_iteration": 2.537853717803955 + }, + { + "auxiliary_loss_clip": 0.01141821, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04989707, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.7640316216704761, + "language_loss": 0.7215519, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74339664, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4455, + "time_per_iteration": 2.5023562908172607 + }, + { + "auxiliary_loss_clip": 0.01137457, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02406991, + "balance_loss_mlp": 1.05066276, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5496021720121687, + "language_loss": 0.74044335, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76221603, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4456, + "time_per_iteration": 2.487581729888916 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.02121687, + "balance_loss_mlp": 1.04937947, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.2089309948453697, + "language_loss": 0.70965469, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73145425, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4457, + "time_per_iteration": 2.4584691524505615 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.05237103, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.29797460876898, + "language_loss": 0.79029202, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81216174, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 4458, + "time_per_iteration": 2.6079578399658203 + }, + { + "auxiliary_loss_clip": 0.01052787, + "auxiliary_loss_mlp": 0.01006207, + "balance_loss_clip": 1.00439513, + "balance_loss_mlp": 1.02259135, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8640508796264214, + "language_loss": 0.58716619, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60775614, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.30078125, + "step": 4459, + "time_per_iteration": 3.0725412368774414 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.02444053, + "balance_loss_mlp": 1.04671741, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.0778557825519055, + "language_loss": 0.85224575, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87398744, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4460, + "time_per_iteration": 2.483299732208252 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.02653205, + "balance_loss_mlp": 1.04752469, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.805871571962145, + "language_loss": 0.68256581, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70435691, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 4461, + "time_per_iteration": 2.439304828643799 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_clip": 1.03198409, + "balance_loss_mlp": 1.0470686, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5557483279788171, + "language_loss": 0.67342007, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69526774, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4462, + "time_per_iteration": 2.5081140995025635 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.02413619, + "balance_loss_mlp": 1.04865909, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.8707784514564991, + "language_loss": 0.6927141, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71449935, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4463, + "time_per_iteration": 2.5280556678771973 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02414584, + "balance_loss_mlp": 1.04812574, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4976114648735304, + "language_loss": 0.77389008, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79570508, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4464, + "time_per_iteration": 2.469650983810425 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.02215123, + "balance_loss_mlp": 1.0458076, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.844577670487785, + "language_loss": 0.70796561, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72970498, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4465, + "time_per_iteration": 2.5685060024261475 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.0100238, + "balance_loss_clip": 1.00067484, + "balance_loss_mlp": 1.01956284, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6800540965400289, + "language_loss": 0.53096056, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55148005, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.30078125, + "step": 4466, + "time_per_iteration": 3.2327654361724854 + }, + { + "auxiliary_loss_clip": 0.01133624, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.02219653, + "balance_loss_mlp": 1.04600596, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0764143418179213, + "language_loss": 0.7343837, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75611472, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4467, + "time_per_iteration": 2.5052013397216797 + }, + { + "auxiliary_loss_clip": 0.01138792, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.0278548, + "balance_loss_mlp": 1.04801464, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.5834152956256555, + "language_loss": 0.80703115, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82887346, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4468, + "time_per_iteration": 2.4547622203826904 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.01005617, + "balance_loss_clip": 1.00407946, + "balance_loss_mlp": 1.01768315, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8449159500606429, + "language_loss": 0.59532088, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61585438, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.30078125, + "step": 4469, + "time_per_iteration": 4.6310715675354 + }, + { + "auxiliary_loss_clip": 0.01137988, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.02687383, + "balance_loss_mlp": 1.04844749, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.3316897890333954, + "language_loss": 0.81785607, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83968771, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4470, + "time_per_iteration": 2.5501935482025146 + }, + { + "auxiliary_loss_clip": 0.01129268, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.02091098, + "balance_loss_mlp": 1.04484963, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6194658793917844, + "language_loss": 0.82648492, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84815365, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 4471, + "time_per_iteration": 2.559220552444458 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02042806, + "balance_loss_mlp": 1.04853129, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 8.458966217412893, + "language_loss": 0.69382554, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71553975, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 4472, + "time_per_iteration": 2.561326742172241 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02045035, + "balance_loss_mlp": 1.04783702, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.596928542569954, + "language_loss": 0.67870784, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70042771, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4473, + "time_per_iteration": 2.5437636375427246 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.02844238, + "balance_loss_mlp": 1.04768729, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.8504576821316179, + "language_loss": 0.82971931, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85149777, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4474, + "time_per_iteration": 2.474095582962036 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01046818, + "balance_loss_clip": 1.03042698, + "balance_loss_mlp": 1.04697323, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.0689967373005977, + "language_loss": 0.70303237, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72482622, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.85546875, + "step": 4475, + "time_per_iteration": 2.4865996837615967 + }, + { + "auxiliary_loss_clip": 0.01135068, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.02335167, + "balance_loss_mlp": 1.04614162, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7721029234489851, + "language_loss": 0.73711979, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75887156, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.890625, + "step": 4476, + "time_per_iteration": 2.477308988571167 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.02456927, + "balance_loss_mlp": 1.04561102, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.720914514753409, + "language_loss": 0.80110955, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8228178, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4477, + "time_per_iteration": 2.497809648513794 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.04442573, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.9038830637231319, + "language_loss": 0.64580482, + "learning_rate": 3.429074332770984e-06, + "loss": 0.66756433, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4478, + "time_per_iteration": 2.6485564708709717 + }, + { + "auxiliary_loss_clip": 0.01130767, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.02876592, + "balance_loss_mlp": 1.04380882, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8571100614964546, + "language_loss": 0.80653036, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82828909, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4479, + "time_per_iteration": 2.4851014614105225 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01043964, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04611528, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.4630797167742458, + "language_loss": 0.80834484, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83014214, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4480, + "time_per_iteration": 2.490147590637207 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.02066684, + "balance_loss_mlp": 1.04153395, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.7677898796301312, + "language_loss": 0.77612787, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79773796, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 4481, + "time_per_iteration": 2.4699158668518066 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01044694, + "balance_loss_clip": 1.02737296, + "balance_loss_mlp": 1.04591584, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 2.5981026313468525, + "language_loss": 0.74701524, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76880491, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4482, + "time_per_iteration": 2.556087017059326 + }, + { + "auxiliary_loss_clip": 0.01135034, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02198792, + "balance_loss_mlp": 1.04693186, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.852738059166697, + "language_loss": 0.72176206, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74350333, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4483, + "time_per_iteration": 2.4762344360351562 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.02635717, + "balance_loss_mlp": 1.04290676, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.626283812761087, + "language_loss": 0.87107188, + "learning_rate": 3.427438559239605e-06, + "loss": 0.8928411, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4484, + "time_per_iteration": 2.486185073852539 + }, + { + "auxiliary_loss_clip": 0.01131969, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02447212, + "balance_loss_mlp": 1.04373026, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.901905407661022, + "language_loss": 0.66389644, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68561947, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4485, + "time_per_iteration": 2.5674586296081543 + }, + { + "auxiliary_loss_clip": 0.01133447, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02371132, + "balance_loss_mlp": 1.0445261, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.8933932068842783, + "language_loss": 0.72378826, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74552536, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4486, + "time_per_iteration": 2.471036434173584 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.02257311, + "balance_loss_mlp": 1.04809284, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.8546648123058087, + "language_loss": 0.83810318, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.85986561, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 4487, + "time_per_iteration": 2.4867916107177734 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02749646, + "balance_loss_mlp": 1.0477773, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.2079504028023598, + "language_loss": 0.71220767, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73403245, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4488, + "time_per_iteration": 2.5174567699432373 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.02851868, + "balance_loss_mlp": 1.04792523, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6338784898376273, + "language_loss": 0.83736706, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85919023, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4489, + "time_per_iteration": 2.5314295291900635 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_clip": 1.03696203, + "balance_loss_mlp": 1.04693484, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.5551945574509176, + "language_loss": 0.89805245, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.91996753, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4490, + "time_per_iteration": 2.4975826740264893 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02741122, + "balance_loss_mlp": 1.04349554, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.8455290723250308, + "language_loss": 0.73354411, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75525427, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4491, + "time_per_iteration": 2.6303470134735107 + }, + { + "auxiliary_loss_clip": 0.01138617, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.02613568, + "balance_loss_mlp": 1.04974079, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 3.089516252272487, + "language_loss": 0.74379975, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7656163, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4492, + "time_per_iteration": 2.5124619007110596 + }, + { + "auxiliary_loss_clip": 0.01133231, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_clip": 1.0241406, + "balance_loss_mlp": 1.04671812, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.896651323252439, + "language_loss": 0.88740528, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.90913987, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4493, + "time_per_iteration": 2.480473756790161 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.02564538, + "balance_loss_mlp": 1.04676843, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.468971775969503, + "language_loss": 0.70976114, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73151839, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4494, + "time_per_iteration": 2.5703446865081787 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01920152, + "balance_loss_mlp": 1.04545951, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.0322990364449325, + "language_loss": 0.86294192, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88457918, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4495, + "time_per_iteration": 2.5428457260131836 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.02968764, + "balance_loss_mlp": 1.04731214, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.8698467905293557, + "language_loss": 0.76562083, + "learning_rate": 3.424161168522959e-06, + "loss": 0.7874167, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4496, + "time_per_iteration": 2.5074446201324463 + }, + { + "auxiliary_loss_clip": 0.01048323, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.01042128, + "balance_loss_mlp": 1.01925802, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7221920911850954, + "language_loss": 0.50221699, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52282125, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2890625, + "step": 4497, + "time_per_iteration": 3.110724687576294 + }, + { + "auxiliary_loss_clip": 0.01137008, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.03011322, + "balance_loss_mlp": 1.05020094, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.6519561002314052, + "language_loss": 0.72420043, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74602675, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4498, + "time_per_iteration": 2.522507429122925 + }, + { + "auxiliary_loss_clip": 0.01047265, + "auxiliary_loss_mlp": 0.0100549, + "balance_loss_clip": 1.0038569, + "balance_loss_mlp": 1.0182879, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7584910907853958, + "language_loss": 0.59222841, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61275595, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2890625, + "step": 4499, + "time_per_iteration": 3.1193060874938965 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02085209, + "balance_loss_mlp": 1.04637063, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.0468109740969576, + "language_loss": 0.7361812, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75787735, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4500, + "time_per_iteration": 2.5073533058166504 + }, + { + "auxiliary_loss_clip": 0.01130893, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04379177, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.2528800155878765, + "language_loss": 0.80392325, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82567519, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4501, + "time_per_iteration": 2.4665989875793457 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0278666, + "balance_loss_mlp": 1.04683352, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.9148884605164396, + "language_loss": 0.72832727, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75011796, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4502, + "time_per_iteration": 2.511070489883423 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0209558, + "balance_loss_mlp": 1.04282784, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.724044037192685, + "language_loss": 0.68474984, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70647895, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 4503, + "time_per_iteration": 2.6554527282714844 + }, + { + "auxiliary_loss_clip": 0.01133759, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.02425468, + "balance_loss_mlp": 1.04659927, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.0245220791315655, + "language_loss": 0.68488902, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.7066294, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4504, + "time_per_iteration": 2.4813036918640137 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.02811766, + "balance_loss_mlp": 1.05043292, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.7616188880043606, + "language_loss": 0.75553012, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77731931, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4505, + "time_per_iteration": 2.482228994369507 + }, + { + "auxiliary_loss_clip": 0.01138199, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.03271127, + "balance_loss_mlp": 1.047171, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 1.8888030992954683, + "language_loss": 0.73508286, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4506, + "time_per_iteration": 2.493534803390503 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.02390218, + "balance_loss_mlp": 1.04818904, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.012438120988393, + "language_loss": 0.80958861, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83136857, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4507, + "time_per_iteration": 2.488477945327759 + }, + { + "auxiliary_loss_clip": 0.01046128, + "auxiliary_loss_mlp": 0.01011944, + "balance_loss_clip": 1.0102514, + "balance_loss_mlp": 1.01738429, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7384209784394716, + "language_loss": 0.50892401, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52950472, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.28710938, + "step": 4508, + "time_per_iteration": 3.005894660949707 + }, + { + "auxiliary_loss_clip": 0.01129132, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.02413416, + "balance_loss_mlp": 1.04509401, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 4.914093534195162, + "language_loss": 0.74373507, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76542306, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4509, + "time_per_iteration": 2.555645227432251 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.02418542, + "balance_loss_mlp": 1.04368544, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.7859895301291084, + "language_loss": 0.71706283, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73872381, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4510, + "time_per_iteration": 2.469756841659546 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01921451, + "balance_loss_mlp": 1.04728365, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 4.171230322312489, + "language_loss": 0.70698422, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72866517, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 4511, + "time_per_iteration": 3.9261832237243652 + }, + { + "auxiliary_loss_clip": 0.01133865, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.02660656, + "balance_loss_mlp": 1.04600286, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.0859148079323564, + "language_loss": 0.80823237, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83000243, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4512, + "time_per_iteration": 2.5112404823303223 + }, + { + "auxiliary_loss_clip": 0.01128916, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02108788, + "balance_loss_mlp": 1.04543233, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.880665339674376, + "language_loss": 0.80508482, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82672697, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8359375, + "step": 4513, + "time_per_iteration": 2.5550525188446045 + }, + { + "auxiliary_loss_clip": 0.01132709, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02668297, + "balance_loss_mlp": 1.04505134, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8883190176483522, + "language_loss": 0.88062817, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90237576, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4514, + "time_per_iteration": 2.4411823749542236 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.02475166, + "balance_loss_mlp": 1.04799736, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 2.468440108941068, + "language_loss": 0.92064375, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94239157, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4515, + "time_per_iteration": 2.507073402404785 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.03202391, + "balance_loss_mlp": 1.04952395, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.5869205534481017, + "language_loss": 0.73691195, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75882661, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9140625, + "step": 4516, + "time_per_iteration": 2.4427852630615234 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.02352417, + "balance_loss_mlp": 1.0466857, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 6.588152355110397, + "language_loss": 0.76239699, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78414017, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4517, + "time_per_iteration": 2.4891836643218994 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02201402, + "balance_loss_mlp": 1.0473218, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2012309941627066, + "language_loss": 0.76785064, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78957808, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4518, + "time_per_iteration": 2.503117561340332 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0241586, + "balance_loss_mlp": 1.04699707, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.6415373198141725, + "language_loss": 0.68314338, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7048738, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4519, + "time_per_iteration": 2.573230028152466 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04631245, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.6734918677628685, + "language_loss": 0.755759, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7774297, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4520, + "time_per_iteration": 2.535546064376831 + }, + { + "auxiliary_loss_clip": 0.01138716, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.02628946, + "balance_loss_mlp": 1.0501039, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.5201661256644523, + "language_loss": 0.76219606, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78402621, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4521, + "time_per_iteration": 2.491654396057129 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_clip": 1.03102481, + "balance_loss_mlp": 1.04803133, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.3970894391693967, + "language_loss": 0.75911158, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78095901, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4522, + "time_per_iteration": 2.471673011779785 + }, + { + "auxiliary_loss_clip": 0.01137707, + "auxiliary_loss_mlp": 0.0103702, + "balance_loss_clip": 1.02042627, + "balance_loss_mlp": 1.04873872, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 5.0666434109354075, + "language_loss": 0.72895801, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75070536, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4523, + "time_per_iteration": 2.5152363777160645 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02208352, + "balance_loss_mlp": 1.04448104, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.5338044020439772, + "language_loss": 0.74324989, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76492846, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 4524, + "time_per_iteration": 2.495253562927246 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02380621, + "balance_loss_mlp": 1.04772878, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 2.881398237919427, + "language_loss": 0.76651889, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78826964, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4525, + "time_per_iteration": 2.511634111404419 + }, + { + "auxiliary_loss_clip": 0.01131035, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_clip": 1.0334518, + "balance_loss_mlp": 1.04626358, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8599028556429251, + "language_loss": 0.81914634, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84094906, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4526, + "time_per_iteration": 2.495011568069458 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02770483, + "balance_loss_mlp": 1.0466783, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 3.313629745591453, + "language_loss": 0.77007318, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79190063, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4527, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.04637635, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1845797146290784, + "language_loss": 0.81825048, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84000921, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4528, + "time_per_iteration": 2.469916582107544 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.0273608, + "balance_loss_mlp": 1.04669189, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.6672454466706952, + "language_loss": 0.77123594, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79297841, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4529, + "time_per_iteration": 2.5379140377044678 + }, + { + "auxiliary_loss_clip": 0.01133862, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_clip": 1.02900243, + "balance_loss_mlp": 1.04580855, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.4153957329893228, + "language_loss": 0.8195889, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84136933, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4530, + "time_per_iteration": 2.5363659858703613 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.02010226, + "balance_loss_mlp": 1.04630172, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.1797176655983432, + "language_loss": 0.91650689, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93820047, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4531, + "time_per_iteration": 2.508429765701294 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.03159511, + "balance_loss_mlp": 1.04611766, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.532443443519077, + "language_loss": 0.76107466, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78290069, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.88671875, + "step": 4532, + "time_per_iteration": 2.499457359313965 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.01861846, + "balance_loss_mlp": 1.04643464, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 3.1928401528407746, + "language_loss": 0.89197671, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91362166, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4533, + "time_per_iteration": 2.508202075958252 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.02118278, + "balance_loss_mlp": 1.04587626, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.096334750916122, + "language_loss": 0.7125262, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73419642, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4534, + "time_per_iteration": 2.5111024379730225 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04651427, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.9613498766130548, + "language_loss": 0.91064882, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93239939, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4535, + "time_per_iteration": 2.5509371757507324 + }, + { + "auxiliary_loss_clip": 0.01138846, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.02262712, + "balance_loss_mlp": 1.05108571, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.5906078149456282, + "language_loss": 0.72618866, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74796963, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4536, + "time_per_iteration": 2.5106241703033447 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.02302337, + "balance_loss_mlp": 1.04617631, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.839444357786457, + "language_loss": 0.7144469, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73617887, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4537, + "time_per_iteration": 2.588439464569092 + }, + { + "auxiliary_loss_clip": 0.01132537, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.02363503, + "balance_loss_mlp": 1.04501796, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.431092364938405, + "language_loss": 0.78177559, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80350113, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4538, + "time_per_iteration": 2.438603639602661 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02609527, + "balance_loss_mlp": 1.04698634, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4794812227008705, + "language_loss": 0.90038705, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92214489, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4539, + "time_per_iteration": 2.5052709579467773 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_clip": 1.02414095, + "balance_loss_mlp": 1.04627967, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.0712338481270884, + "language_loss": 0.88711655, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90885842, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.859375, + "step": 4540, + "time_per_iteration": 2.457939624786377 + }, + { + "auxiliary_loss_clip": 0.01133918, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01771343, + "balance_loss_mlp": 1.04666936, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9363402300433894, + "language_loss": 0.81993663, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84161294, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4541, + "time_per_iteration": 2.461517333984375 + }, + { + "auxiliary_loss_clip": 0.01133224, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.02484596, + "balance_loss_mlp": 1.04623377, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8882731025231656, + "language_loss": 0.7925449, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81429487, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4542, + "time_per_iteration": 2.487905979156494 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.02441418, + "balance_loss_mlp": 1.04965162, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.197105758262293, + "language_loss": 0.89471424, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91648328, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4543, + "time_per_iteration": 2.4903039932250977 + }, + { + "auxiliary_loss_clip": 0.01137887, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.02955735, + "balance_loss_mlp": 1.04841042, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.084938235366164, + "language_loss": 0.63666493, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65851355, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.89453125, + "step": 4544, + "time_per_iteration": 2.4529080390930176 + }, + { + "auxiliary_loss_clip": 0.01137894, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.03043687, + "balance_loss_mlp": 1.05032265, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.5170655618085727, + "language_loss": 0.6996637, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72151983, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4545, + "time_per_iteration": 2.6089117527008057 + }, + { + "auxiliary_loss_clip": 0.01048793, + "auxiliary_loss_mlp": 0.01019944, + "balance_loss_clip": 1.01828671, + "balance_loss_mlp": 1.01938868, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7273987605446792, + "language_loss": 0.61571473, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63640207, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.29296875, + "step": 4546, + "time_per_iteration": 3.1125431060791016 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.03199649, + "balance_loss_mlp": 1.05012798, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.9369682323358774, + "language_loss": 0.64982706, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67167711, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4547, + "time_per_iteration": 2.497563600540161 + }, + { + "auxiliary_loss_clip": 0.01132998, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.0262835, + "balance_loss_mlp": 1.04765081, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.2377196076559183, + "language_loss": 0.77178854, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.7935344, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4548, + "time_per_iteration": 2.536813259124756 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.02854848, + "balance_loss_mlp": 1.04827595, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8894391736419274, + "language_loss": 0.82382214, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84559321, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 4549, + "time_per_iteration": 2.5156633853912354 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.02744722, + "balance_loss_mlp": 1.04482448, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.2209993145005793, + "language_loss": 0.70675868, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.72853404, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4550, + "time_per_iteration": 2.4510462284088135 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.03272784, + "balance_loss_mlp": 1.04789186, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.43111621366583, + "language_loss": 0.78738058, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80917984, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8515625, + "step": 4551, + "time_per_iteration": 2.470520496368408 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01051474, + "balance_loss_clip": 1.03548765, + "balance_loss_mlp": 1.04601097, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.681171335598487, + "language_loss": 0.70585275, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72769368, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4552, + "time_per_iteration": 3.9179859161376953 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02936506, + "balance_loss_mlp": 1.04864776, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.3865688662341005, + "language_loss": 0.71857619, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7403903, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 4553, + "time_per_iteration": 4.032766342163086 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.02853942, + "balance_loss_mlp": 1.04585433, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5870570208244068, + "language_loss": 0.59154749, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61331522, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4554, + "time_per_iteration": 2.549534320831299 + }, + { + "auxiliary_loss_clip": 0.01138763, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.04893517, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7727518382715788, + "language_loss": 0.73820007, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76000404, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4555, + "time_per_iteration": 2.5162432193756104 + }, + { + "auxiliary_loss_clip": 0.01136837, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02348125, + "balance_loss_mlp": 1.04923606, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.956724452661134, + "language_loss": 0.7785511, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80031419, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4556, + "time_per_iteration": 2.5205135345458984 + }, + { + "auxiliary_loss_clip": 0.01145391, + "auxiliary_loss_mlp": 0.0105386, + "balance_loss_clip": 1.03640783, + "balance_loss_mlp": 1.04952264, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.7956202604517526, + "language_loss": 0.82272434, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84471685, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9609375, + "step": 4557, + "time_per_iteration": 2.486485719680786 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.02691972, + "balance_loss_mlp": 1.04657316, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7971714372597054, + "language_loss": 0.72697943, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74873614, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4558, + "time_per_iteration": 2.5272727012634277 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.04504418, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.1318143008079686, + "language_loss": 0.6804775, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70228577, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4559, + "time_per_iteration": 2.4787509441375732 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02750015, + "balance_loss_mlp": 1.04517901, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 3.5500966853689673, + "language_loss": 0.71847737, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74022651, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4560, + "time_per_iteration": 2.490152359008789 + }, + { + "auxiliary_loss_clip": 0.0113572, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.02642488, + "balance_loss_mlp": 1.04779601, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.7948619898284635, + "language_loss": 0.80998009, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83175689, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 4561, + "time_per_iteration": 2.554872512817383 + }, + { + "auxiliary_loss_clip": 0.01136406, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.02600157, + "balance_loss_mlp": 1.04711854, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.7370289005889625, + "language_loss": 0.7531321, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77491164, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.890625, + "step": 4562, + "time_per_iteration": 2.4925429821014404 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02173424, + "balance_loss_mlp": 1.04701662, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.598166418515773, + "language_loss": 0.74503827, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76674795, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4563, + "time_per_iteration": 2.5514259338378906 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.02798915, + "balance_loss_mlp": 1.04708612, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.8271759108968861, + "language_loss": 0.62526429, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64710456, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4564, + "time_per_iteration": 2.479156494140625 + }, + { + "auxiliary_loss_clip": 0.01136574, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.02513587, + "balance_loss_mlp": 1.04808652, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.9245884320117708, + "language_loss": 0.78135669, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80314934, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4565, + "time_per_iteration": 2.714069366455078 + }, + { + "auxiliary_loss_clip": 0.01133378, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04669619, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.3377831889988547, + "language_loss": 0.68350124, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70523381, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4566, + "time_per_iteration": 2.469357967376709 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.03211665, + "balance_loss_mlp": 1.04901338, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.7938914020631171, + "language_loss": 0.60886472, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63066101, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.84375, + "step": 4567, + "time_per_iteration": 2.5856754779815674 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.02597237, + "balance_loss_mlp": 1.04754972, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.7650663548751138, + "language_loss": 0.82787997, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84965092, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.86328125, + "step": 4568, + "time_per_iteration": 2.476353168487549 + }, + { + "auxiliary_loss_clip": 0.0113839, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.01868141, + "balance_loss_mlp": 1.05012584, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.0155686346894415, + "language_loss": 0.68656778, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7082985, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4569, + "time_per_iteration": 2.5027451515197754 + }, + { + "auxiliary_loss_clip": 0.01133852, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.01947594, + "balance_loss_mlp": 1.0464673, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.247407128453888, + "language_loss": 0.71138883, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73308867, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4570, + "time_per_iteration": 2.466845750808716 + }, + { + "auxiliary_loss_clip": 0.0104735, + "auxiliary_loss_mlp": 0.01010434, + "balance_loss_clip": 1.00881279, + "balance_loss_mlp": 1.01781416, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7344992896847644, + "language_loss": 0.55774754, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57832539, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.296875, + "step": 4571, + "time_per_iteration": 3.192523241043091 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.02805328, + "balance_loss_mlp": 1.05039406, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 3.6883594473706482, + "language_loss": 0.77785081, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79969662, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 4572, + "time_per_iteration": 2.4755914211273193 + }, + { + "auxiliary_loss_clip": 0.01129408, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.02200866, + "balance_loss_mlp": 1.04679561, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.7042315716847805, + "language_loss": 0.81357443, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83523262, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4573, + "time_per_iteration": 2.540905237197876 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02038062, + "balance_loss_mlp": 1.04580402, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7927939239771835, + "language_loss": 0.79077196, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81243324, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83984375, + "step": 4574, + "time_per_iteration": 2.451016664505005 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.02889121, + "balance_loss_mlp": 1.04886127, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.232643844604772, + "language_loss": 0.74191976, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76372731, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4575, + "time_per_iteration": 2.5744149684906006 + }, + { + "auxiliary_loss_clip": 0.01131901, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.02353263, + "balance_loss_mlp": 1.04711711, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.8105072672356382, + "language_loss": 0.71877766, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7404812, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4576, + "time_per_iteration": 2.634305715560913 + }, + { + "auxiliary_loss_clip": 0.01134924, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.04823232, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.7690392048384511, + "language_loss": 0.73200434, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75377214, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4577, + "time_per_iteration": 2.5365946292877197 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02182376, + "balance_loss_mlp": 1.04931974, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 5.099060573221768, + "language_loss": 0.75943893, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78119946, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4578, + "time_per_iteration": 2.5121536254882812 + }, + { + "auxiliary_loss_clip": 0.01135832, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 1.0475626, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 2.3614458833507603, + "language_loss": 0.66299897, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68482184, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.8828125, + "step": 4579, + "time_per_iteration": 2.5445947647094727 + }, + { + "auxiliary_loss_clip": 0.01137742, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_clip": 1.03841197, + "balance_loss_mlp": 1.04862928, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.9384727438162337, + "language_loss": 0.8013078, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82324862, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4580, + "time_per_iteration": 2.4895741939544678 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.02581632, + "balance_loss_mlp": 1.05140579, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4702192551629332, + "language_loss": 0.67702103, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.698852, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.87109375, + "step": 4581, + "time_per_iteration": 2.5905539989471436 + }, + { + "auxiliary_loss_clip": 0.01137135, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.03024602, + "balance_loss_mlp": 1.04847145, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.8568978026073784, + "language_loss": 0.78120708, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80303848, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.88671875, + "step": 4582, + "time_per_iteration": 2.467210531234741 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.02537727, + "balance_loss_mlp": 1.04905152, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 2.5358708072067406, + "language_loss": 0.84527528, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86701977, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4583, + "time_per_iteration": 2.511457920074463 + }, + { + "auxiliary_loss_clip": 0.01138165, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.04905808, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 2.037294788318467, + "language_loss": 0.67308438, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69487947, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 4584, + "time_per_iteration": 2.5193254947662354 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.02645802, + "balance_loss_mlp": 1.04761386, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.737999785464117, + "language_loss": 0.77330101, + "learning_rate": 3.399612333050327e-06, + "loss": 0.7950455, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4585, + "time_per_iteration": 2.5393707752227783 + }, + { + "auxiliary_loss_clip": 0.0114213, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.02227354, + "balance_loss_mlp": 1.0530591, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.654604836009794, + "language_loss": 0.71854031, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74035466, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4586, + "time_per_iteration": 2.534979820251465 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.01996541, + "balance_loss_mlp": 1.04988265, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5248017982775213, + "language_loss": 0.80546939, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82719147, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4587, + "time_per_iteration": 2.5424065589904785 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.02356219, + "balance_loss_mlp": 1.04939508, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 2.136921841599078, + "language_loss": 0.82694119, + "learning_rate": 3.398777478523316e-06, + "loss": 0.8486715, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4588, + "time_per_iteration": 2.467923879623413 + }, + { + "auxiliary_loss_clip": 0.01132148, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.0228622, + "balance_loss_mlp": 1.04754925, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3980423175693042, + "language_loss": 0.75352502, + "learning_rate": 3.398499087583342e-06, + "loss": 0.775231, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4589, + "time_per_iteration": 2.535837173461914 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.04686022, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7720046877472317, + "language_loss": 0.88438141, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90612471, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8515625, + "step": 4590, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.01135164, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.02946877, + "balance_loss_mlp": 1.04789972, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6299691755620427, + "language_loss": 0.7129395, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73474467, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4591, + "time_per_iteration": 2.6112425327301025 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.03268862, + "balance_loss_mlp": 1.04847574, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8477043284936983, + "language_loss": 0.80190659, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82375979, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4592, + "time_per_iteration": 2.483894109725952 + }, + { + "auxiliary_loss_clip": 0.01048363, + "auxiliary_loss_mlp": 0.01005872, + "balance_loss_clip": 1.00416684, + "balance_loss_mlp": 1.0189774, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7716758671018623, + "language_loss": 0.61627746, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63681984, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.29296875, + "step": 4593, + "time_per_iteration": 3.0616326332092285 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01045597, + "balance_loss_clip": 1.02965856, + "balance_loss_mlp": 1.04938328, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.8877557773606983, + "language_loss": 0.77589142, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79769808, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4594, + "time_per_iteration": 4.043708086013794 + }, + { + "auxiliary_loss_clip": 0.01134807, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.01769793, + "balance_loss_mlp": 1.04991734, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.7681451067423914, + "language_loss": 0.91645586, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93813777, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4595, + "time_per_iteration": 3.973101854324341 + }, + { + "auxiliary_loss_clip": 0.01138485, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.03034675, + "balance_loss_mlp": 1.05122674, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7288059110569738, + "language_loss": 0.69101036, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71286798, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4596, + "time_per_iteration": 2.509199380874634 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.0249939, + "balance_loss_mlp": 1.04883707, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.01522187594791, + "language_loss": 0.63536406, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65717971, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9140625, + "step": 4597, + "time_per_iteration": 2.5944221019744873 + }, + { + "auxiliary_loss_clip": 0.01133967, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02629232, + "balance_loss_mlp": 1.05002272, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.1842552390134586, + "language_loss": 0.86612505, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88788456, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 4598, + "time_per_iteration": 2.4870996475219727 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02586544, + "balance_loss_mlp": 1.04847229, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.0694668215518996, + "language_loss": 0.79822165, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82000202, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4599, + "time_per_iteration": 2.4923834800720215 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.02965581, + "balance_loss_mlp": 1.04958415, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.9049018096400723, + "language_loss": 0.78357869, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80543864, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 4600, + "time_per_iteration": 2.496173620223999 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01046211, + "balance_loss_clip": 1.03007007, + "balance_loss_mlp": 1.04887986, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.9474431855639402, + "language_loss": 0.73361742, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75546992, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4601, + "time_per_iteration": 2.475919246673584 + }, + { + "auxiliary_loss_clip": 0.01135661, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02883482, + "balance_loss_mlp": 1.04879355, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.9134344988482315, + "language_loss": 0.79341739, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81522876, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4602, + "time_per_iteration": 2.511716842651367 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_clip": 1.03349614, + "balance_loss_mlp": 1.04920423, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.260382216699142, + "language_loss": 0.76887643, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79079276, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4603, + "time_per_iteration": 2.4667811393737793 + }, + { + "auxiliary_loss_clip": 0.0112975, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.0252831, + "balance_loss_mlp": 1.04736543, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.7288101924316703, + "language_loss": 0.81411278, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83581114, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 4604, + "time_per_iteration": 2.4586222171783447 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01516712, + "balance_loss_mlp": 1.04756212, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.7513688477785454, + "language_loss": 0.69912565, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72079831, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4605, + "time_per_iteration": 2.5138533115386963 + }, + { + "auxiliary_loss_clip": 0.01045677, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.00033224, + "balance_loss_mlp": 1.01580858, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7252635192802935, + "language_loss": 0.57151282, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59198874, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.296875, + "step": 4606, + "time_per_iteration": 3.184955596923828 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.03234947, + "balance_loss_mlp": 1.0481658, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.0717297663627825, + "language_loss": 0.69666946, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71853042, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4607, + "time_per_iteration": 2.5373001098632812 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.04721832, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 3.332085537790215, + "language_loss": 0.6982615, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71991682, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4608, + "time_per_iteration": 2.5396809577941895 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.02185202, + "balance_loss_mlp": 1.04715931, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.8242818121189563, + "language_loss": 0.72541273, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74715054, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 4609, + "time_per_iteration": 2.5383543968200684 + }, + { + "auxiliary_loss_clip": 0.01134274, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.03226149, + "balance_loss_mlp": 1.04623055, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.2576811985082967, + "language_loss": 0.84010947, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86194062, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4610, + "time_per_iteration": 2.4456827640533447 + }, + { + "auxiliary_loss_clip": 0.01141086, + "auxiliary_loss_mlp": 0.01051097, + "balance_loss_clip": 1.03344178, + "balance_loss_mlp": 1.04996872, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.570198611472629, + "language_loss": 0.68948054, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71140236, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9140625, + "step": 4611, + "time_per_iteration": 2.5342319011688232 + }, + { + "auxiliary_loss_clip": 0.01130823, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.0212357, + "balance_loss_mlp": 1.04892015, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.354058548299899, + "language_loss": 0.73450744, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75618565, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 4612, + "time_per_iteration": 2.472200632095337 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03378379, + "balance_loss_mlp": 1.04807258, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.166254073057622, + "language_loss": 0.66736221, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68924516, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4613, + "time_per_iteration": 2.5313632488250732 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.02721334, + "balance_loss_mlp": 1.04604864, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.8826548789840187, + "language_loss": 0.79452634, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4614, + "time_per_iteration": 2.4869751930236816 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.02684534, + "balance_loss_mlp": 1.0477469, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.573597172535304, + "language_loss": 0.80251336, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.8243044, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4615, + "time_per_iteration": 2.521615505218506 + }, + { + "auxiliary_loss_clip": 0.01135416, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.04627132, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.403593727320557, + "language_loss": 0.63926548, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66105354, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4616, + "time_per_iteration": 2.439410448074341 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02593398, + "balance_loss_mlp": 1.04661143, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.8467628074440183, + "language_loss": 0.82283223, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84458935, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4617, + "time_per_iteration": 2.49495792388916 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02587175, + "balance_loss_mlp": 1.04613662, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.1015666973838942, + "language_loss": 0.76835418, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79010552, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4618, + "time_per_iteration": 2.4882123470306396 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02501273, + "balance_loss_mlp": 1.0495801, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.6700061931983001, + "language_loss": 0.84698343, + "learning_rate": 3.390122747388459e-06, + "loss": 0.868756, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4619, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.02340662, + "balance_loss_mlp": 1.04523671, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4068177028172657, + "language_loss": 0.76720011, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78886724, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 4620, + "time_per_iteration": 2.4851698875427246 + }, + { + "auxiliary_loss_clip": 0.01130943, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02126586, + "balance_loss_mlp": 1.04728413, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.4956264272783084, + "language_loss": 0.78746819, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80914462, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4621, + "time_per_iteration": 2.543513774871826 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.04871762, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 1.9988562622182164, + "language_loss": 0.87520665, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89702857, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4622, + "time_per_iteration": 2.4818174839019775 + }, + { + "auxiliary_loss_clip": 0.01133366, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 1.04635906, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.9062066208333321, + "language_loss": 0.81094646, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83274019, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4623, + "time_per_iteration": 2.509218692779541 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.02357817, + "balance_loss_mlp": 1.04981863, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 1.93503772017796, + "language_loss": 0.81099498, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83275431, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 4624, + "time_per_iteration": 2.470041513442993 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.05091214, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 3.184384520938543, + "language_loss": 0.76514304, + "learning_rate": 3.388441777121191e-06, + "loss": 0.7869125, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84765625, + "step": 4625, + "time_per_iteration": 2.4965567588806152 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02439606, + "balance_loss_mlp": 1.04835677, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 2.5511238477154095, + "language_loss": 0.70091927, + "learning_rate": 3.388161431073511e-06, + "loss": 0.7226674, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 4626, + "time_per_iteration": 2.462007522583008 + }, + { + "auxiliary_loss_clip": 0.01142353, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.05177855, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.1576082410571704, + "language_loss": 0.92738312, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94917607, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4627, + "time_per_iteration": 2.5731146335601807 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.02640903, + "balance_loss_mlp": 1.04856014, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 4.44086075484182, + "language_loss": 0.85802954, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87982047, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4628, + "time_per_iteration": 2.502816915512085 + }, + { + "auxiliary_loss_clip": 0.01136721, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.02358079, + "balance_loss_mlp": 1.05035257, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.4685731198996637, + "language_loss": 0.79003006, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81178927, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4629, + "time_per_iteration": 2.544255256652832 + }, + { + "auxiliary_loss_clip": 0.01132865, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02419138, + "balance_loss_mlp": 1.05083036, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.4531737557023054, + "language_loss": 0.84322643, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86494124, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4630, + "time_per_iteration": 2.514413833618164 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02343392, + "balance_loss_mlp": 1.04834175, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.1800575167200997, + "language_loss": 0.80845618, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83021843, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4631, + "time_per_iteration": 2.530393123626709 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.05319762, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.154319840219951, + "language_loss": 0.71817827, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74009514, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4632, + "time_per_iteration": 2.504826307296753 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.02571952, + "balance_loss_mlp": 1.05240536, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8401586776799086, + "language_loss": 0.82518554, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84694839, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4633, + "time_per_iteration": 2.484894037246704 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.02622163, + "balance_loss_mlp": 1.05006409, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.740894494158558, + "language_loss": 0.87933433, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90116417, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4634, + "time_per_iteration": 2.465115785598755 + }, + { + "auxiliary_loss_clip": 0.01139508, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02182591, + "balance_loss_mlp": 1.05175185, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5848956099548452, + "language_loss": 0.77060932, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79239166, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4635, + "time_per_iteration": 2.5032925605773926 + }, + { + "auxiliary_loss_clip": 0.01137724, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.02843595, + "balance_loss_mlp": 1.04919934, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.7277393232375848, + "language_loss": 0.65047133, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67230225, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4636, + "time_per_iteration": 4.078390121459961 + }, + { + "auxiliary_loss_clip": 0.01137292, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02249098, + "balance_loss_mlp": 1.04898095, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.3949865449269034, + "language_loss": 0.84131932, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86309206, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8828125, + "step": 4637, + "time_per_iteration": 3.9023706912994385 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.02468669, + "balance_loss_mlp": 1.04683113, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.9572077756422592, + "language_loss": 0.75880706, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78052455, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4638, + "time_per_iteration": 2.5291664600372314 + }, + { + "auxiliary_loss_clip": 0.01137756, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_clip": 1.0281812, + "balance_loss_mlp": 1.04918075, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.663593201704466, + "language_loss": 0.71469444, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73651695, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4639, + "time_per_iteration": 2.4396321773529053 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01814222, + "balance_loss_mlp": 1.0477488, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.020838508390905, + "language_loss": 0.65634811, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67805016, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4640, + "time_per_iteration": 2.524146556854248 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.02278829, + "balance_loss_mlp": 1.04838169, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.8663182251903623, + "language_loss": 0.71682954, + "learning_rate": 3.383949929609804e-06, + "loss": 0.738572, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4641, + "time_per_iteration": 2.45416522026062 + }, + { + "auxiliary_loss_clip": 0.01137426, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.02620697, + "balance_loss_mlp": 1.04805887, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.721157258136314, + "language_loss": 0.74843872, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77024734, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4642, + "time_per_iteration": 2.498901128768921 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.0241071, + "balance_loss_mlp": 1.04755557, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7771181879405247, + "language_loss": 0.85500491, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87677723, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4643, + "time_per_iteration": 2.4678151607513428 + }, + { + "auxiliary_loss_clip": 0.01135774, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04914284, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.8372365182177028, + "language_loss": 0.8320173, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85382092, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4644, + "time_per_iteration": 2.4989511966705322 + }, + { + "auxiliary_loss_clip": 0.01137034, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.02340162, + "balance_loss_mlp": 1.04927874, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.1578284197730246, + "language_loss": 0.7905547, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81232202, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4645, + "time_per_iteration": 2.444539785385132 + }, + { + "auxiliary_loss_clip": 0.01045698, + "auxiliary_loss_mlp": 0.01013694, + "balance_loss_clip": 1.01202476, + "balance_loss_mlp": 1.01603949, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7789852310638867, + "language_loss": 0.62276232, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64335632, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4646, + "time_per_iteration": 3.0487425327301025 + }, + { + "auxiliary_loss_clip": 0.01130687, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.02039671, + "balance_loss_mlp": 1.04760003, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6043045349905556, + "language_loss": 0.89379698, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91545647, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83203125, + "step": 4647, + "time_per_iteration": 2.537818193435669 + }, + { + "auxiliary_loss_clip": 0.01137315, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.02559125, + "balance_loss_mlp": 1.04848313, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6404696751402497, + "language_loss": 0.87119055, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89298457, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4648, + "time_per_iteration": 2.490755081176758 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.01993406, + "balance_loss_mlp": 1.04894495, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 4.859667262510518, + "language_loss": 0.72424746, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74599725, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4649, + "time_per_iteration": 2.551149368286133 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02275741, + "balance_loss_mlp": 1.04667568, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.198213539311656, + "language_loss": 0.80241156, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8241663, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 4650, + "time_per_iteration": 2.495481252670288 + }, + { + "auxiliary_loss_clip": 0.01043234, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.00015628, + "balance_loss_mlp": 1.01336908, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 1.2001935939690993, + "language_loss": 0.58821332, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60866392, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4651, + "time_per_iteration": 3.089278221130371 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.0239383, + "balance_loss_mlp": 1.04576242, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.6305345142383205, + "language_loss": 0.74335963, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76514173, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4652, + "time_per_iteration": 2.5034215450286865 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.02826357, + "balance_loss_mlp": 1.05137777, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.1744902530470527, + "language_loss": 0.79703641, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81889254, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4653, + "time_per_iteration": 2.654989242553711 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.02851391, + "balance_loss_mlp": 1.04782343, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.2131663157599597, + "language_loss": 0.79123974, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81304365, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4654, + "time_per_iteration": 2.4707679748535156 + }, + { + "auxiliary_loss_clip": 0.01139148, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.02370811, + "balance_loss_mlp": 1.04861951, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.2002818233708497, + "language_loss": 0.80829996, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83010256, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4655, + "time_per_iteration": 2.513359546661377 + }, + { + "auxiliary_loss_clip": 0.01135255, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0232811, + "balance_loss_mlp": 1.04709148, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5763016498426998, + "language_loss": 0.8125751, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8343333, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4656, + "time_per_iteration": 2.519552707672119 + }, + { + "auxiliary_loss_clip": 0.01135028, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.02292323, + "balance_loss_mlp": 1.04802632, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6475258015019663, + "language_loss": 0.83235347, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85410285, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4657, + "time_per_iteration": 2.533052444458008 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.0267477, + "balance_loss_mlp": 1.04885554, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.9420207304275756, + "language_loss": 0.63918132, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66097504, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4658, + "time_per_iteration": 2.577223777770996 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.03177238, + "balance_loss_mlp": 1.04906631, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.71469006603513, + "language_loss": 0.78447223, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80633128, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4659, + "time_per_iteration": 2.5102882385253906 + }, + { + "auxiliary_loss_clip": 0.01141159, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.03565836, + "balance_loss_mlp": 1.05118299, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.8275002529569282, + "language_loss": 0.79481149, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81674838, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4660, + "time_per_iteration": 2.478348731994629 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.02289653, + "balance_loss_mlp": 1.04855609, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.7763153734220711, + "language_loss": 0.80286032, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82459545, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4661, + "time_per_iteration": 2.514369249343872 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.01055451, + "balance_loss_clip": 1.03888094, + "balance_loss_mlp": 1.05259752, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5344085017366311, + "language_loss": 0.78856266, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.8105247, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4662, + "time_per_iteration": 2.6068239212036133 + }, + { + "auxiliary_loss_clip": 0.01142079, + "auxiliary_loss_mlp": 0.01052002, + "balance_loss_clip": 1.03345299, + "balance_loss_mlp": 1.04998207, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 2.3559784459233923, + "language_loss": 0.70354843, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72548926, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4663, + "time_per_iteration": 2.530852794647217 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01053723, + "balance_loss_clip": 1.03522193, + "balance_loss_mlp": 1.05016875, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.7070620658846938, + "language_loss": 0.77552772, + "learning_rate": 3.377469372935791e-06, + "loss": 0.7974633, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.8984375, + "step": 4664, + "time_per_iteration": 2.5026586055755615 + }, + { + "auxiliary_loss_clip": 0.01132144, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02688766, + "balance_loss_mlp": 1.04697514, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.9676420802042491, + "language_loss": 0.79575229, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81750983, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8515625, + "step": 4665, + "time_per_iteration": 2.496948003768921 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04934978, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 8.778135585709748, + "language_loss": 0.80523062, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82701844, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4666, + "time_per_iteration": 2.4551992416381836 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.03710806, + "balance_loss_mlp": 1.05058241, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.0519370530418493, + "language_loss": 0.84514672, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86708617, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4667, + "time_per_iteration": 2.503024101257324 + }, + { + "auxiliary_loss_clip": 0.01141868, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.05165899, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.59556786146991, + "language_loss": 0.79110259, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81296772, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90234375, + "step": 4668, + "time_per_iteration": 2.5109217166900635 + }, + { + "auxiliary_loss_clip": 0.01137821, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783513, + "balance_loss_mlp": 1.0472095, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 5.202292388628492, + "language_loss": 0.7594949, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78132337, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4669, + "time_per_iteration": 2.5443029403686523 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.03276944, + "balance_loss_mlp": 1.05060363, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.249572842905479, + "language_loss": 0.78818107, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81007588, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8828125, + "step": 4670, + "time_per_iteration": 2.4583303928375244 + }, + { + "auxiliary_loss_clip": 0.01142576, + "auxiliary_loss_mlp": 0.010505, + "balance_loss_clip": 1.03272545, + "balance_loss_mlp": 1.05169237, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.1344815005037323, + "language_loss": 0.78915119, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81108201, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4671, + "time_per_iteration": 2.576904296875 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02419102, + "balance_loss_mlp": 1.05212355, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.655300005604084, + "language_loss": 0.74891758, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77067947, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4672, + "time_per_iteration": 2.5101001262664795 + }, + { + "auxiliary_loss_clip": 0.01139664, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.0297612, + "balance_loss_mlp": 1.05017138, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.377632390973165, + "language_loss": 0.7485683, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77045226, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.89453125, + "step": 4673, + "time_per_iteration": 2.5559215545654297 + }, + { + "auxiliary_loss_clip": 0.0113758, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_clip": 1.02367294, + "balance_loss_mlp": 1.04911065, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.162495737742732, + "language_loss": 0.72274792, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74453062, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4674, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.01142202, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.02487254, + "balance_loss_mlp": 1.05152214, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.8501022214838438, + "language_loss": 0.77636325, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79821539, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.90625, + "step": 4675, + "time_per_iteration": 2.5076191425323486 + }, + { + "auxiliary_loss_clip": 0.011417, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.03059506, + "balance_loss_mlp": 1.05080581, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 4.743769816525981, + "language_loss": 0.7033428, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72524506, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4676, + "time_per_iteration": 2.4664652347564697 + }, + { + "auxiliary_loss_clip": 0.01136213, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.02637279, + "balance_loss_mlp": 1.05219054, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6504598517134752, + "language_loss": 0.70294476, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7247287, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 4677, + "time_per_iteration": 3.9926962852478027 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.02779067, + "balance_loss_mlp": 1.05172849, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.7155329144241396, + "language_loss": 0.63506716, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65694547, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.890625, + "step": 4678, + "time_per_iteration": 5.452545642852783 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.031039, + "balance_loss_mlp": 1.05193949, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.4644682748892532, + "language_loss": 0.70249045, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7243771, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4679, + "time_per_iteration": 2.557156801223755 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.02276742, + "balance_loss_mlp": 1.05024076, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.8307759218313573, + "language_loss": 0.74600148, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76779038, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4680, + "time_per_iteration": 2.478760004043579 + }, + { + "auxiliary_loss_clip": 0.01140599, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.0268507, + "balance_loss_mlp": 1.0514679, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.8069902018568411, + "language_loss": 0.77090317, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79274386, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4681, + "time_per_iteration": 2.5532946586608887 + }, + { + "auxiliary_loss_clip": 0.01142988, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02353942, + "balance_loss_mlp": 1.05301392, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 4.33574203258507, + "language_loss": 0.74047244, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76231277, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8984375, + "step": 4682, + "time_per_iteration": 2.450707197189331 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 1.04989302, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4103030378304897, + "language_loss": 0.80830532, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.8301093, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4683, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.01142223, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.02984428, + "balance_loss_mlp": 1.05146146, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.6936052100643573, + "language_loss": 0.76107442, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78297454, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4684, + "time_per_iteration": 2.4734883308410645 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.02196348, + "balance_loss_mlp": 1.04849601, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9675146174992446, + "language_loss": 0.7601878, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.7819227, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4685, + "time_per_iteration": 2.521883010864258 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.05083728, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.003036282603561, + "language_loss": 0.7616905, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78348768, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4686, + "time_per_iteration": 2.5261688232421875 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01051506, + "balance_loss_clip": 1.03319538, + "balance_loss_mlp": 1.04916072, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.230965321609006, + "language_loss": 0.63345516, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65537149, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.90625, + "step": 4687, + "time_per_iteration": 2.473508834838867 + }, + { + "auxiliary_loss_clip": 0.0114172, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05180609, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 1.9761865692880811, + "language_loss": 0.76504958, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.7869947, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4688, + "time_per_iteration": 2.4815330505371094 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.02364409, + "balance_loss_mlp": 1.04902148, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.291650314126009, + "language_loss": 0.78333032, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80508631, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4689, + "time_per_iteration": 2.464221239089966 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02566934, + "balance_loss_mlp": 1.04886627, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 2.2251394110426896, + "language_loss": 0.77819848, + "learning_rate": 3.37011026022934e-06, + "loss": 0.79999155, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87890625, + "step": 4690, + "time_per_iteration": 2.4802086353302 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.02809191, + "balance_loss_mlp": 1.04984617, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.762007121853784, + "language_loss": 0.8775022, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89933336, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.890625, + "step": 4691, + "time_per_iteration": 2.5098307132720947 + }, + { + "auxiliary_loss_clip": 0.01144357, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02714467, + "balance_loss_mlp": 1.0519383, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.144178457094415, + "language_loss": 0.81952238, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84140503, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 4692, + "time_per_iteration": 2.501150131225586 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.02284956, + "balance_loss_mlp": 1.04852128, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.7100054669520195, + "language_loss": 0.74535745, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7671268, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4693, + "time_per_iteration": 2.581108808517456 + }, + { + "auxiliary_loss_clip": 0.01139239, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04924035, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6174705324311944, + "language_loss": 0.7761777, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79793274, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4694, + "time_per_iteration": 2.479616403579712 + }, + { + "auxiliary_loss_clip": 0.01136707, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.05057073, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.0658621313481604, + "language_loss": 0.66812259, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68987906, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4695, + "time_per_iteration": 2.560234546661377 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_clip": 1.02859259, + "balance_loss_mlp": 1.05084562, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.206840044366299, + "language_loss": 0.75868189, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78057176, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4696, + "time_per_iteration": 2.484731674194336 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.05234432, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 4.801168729119655, + "language_loss": 0.62373543, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64565253, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4697, + "time_per_iteration": 2.6771903038024902 + }, + { + "auxiliary_loss_clip": 0.01131406, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02003598, + "balance_loss_mlp": 1.0468322, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.6839402690923742, + "language_loss": 0.73317522, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75484592, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4698, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.029091, + "balance_loss_mlp": 1.0463903, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 2.1160143892835275, + "language_loss": 0.74896884, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77072334, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4699, + "time_per_iteration": 2.5613014698028564 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.02369165, + "balance_loss_mlp": 1.05032122, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.187545417707515, + "language_loss": 0.80256712, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8243804, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4700, + "time_per_iteration": 2.4355719089508057 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.03461456, + "balance_loss_mlp": 1.05022645, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.7483881606912919, + "language_loss": 0.81309319, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.8349061, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 4701, + "time_per_iteration": 2.590824842453003 + }, + { + "auxiliary_loss_clip": 0.0113653, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02301347, + "balance_loss_mlp": 1.05007911, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.214271940066586, + "language_loss": 0.73758674, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75934035, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4702, + "time_per_iteration": 2.496689796447754 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.02251232, + "balance_loss_mlp": 1.05127287, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.7981890053968508, + "language_loss": 0.78189409, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.8036449, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4703, + "time_per_iteration": 2.5225300788879395 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.02923465, + "balance_loss_mlp": 1.0484302, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.6026897384097336, + "language_loss": 0.6944623, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71628278, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 4704, + "time_per_iteration": 2.5721168518066406 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.02685118, + "balance_loss_mlp": 1.05374229, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.9868129767490792, + "language_loss": 0.69884789, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.7206769, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.85546875, + "step": 4705, + "time_per_iteration": 2.532034397125244 + }, + { + "auxiliary_loss_clip": 0.01057982, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.02761459, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7396595768854823, + "language_loss": 0.59243953, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61305463, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.3046875, + "step": 4706, + "time_per_iteration": 3.1149942874908447 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.02586842, + "balance_loss_mlp": 1.05135274, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.3972451569930537, + "language_loss": 0.82227451, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84403402, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4707, + "time_per_iteration": 2.5387215614318848 + }, + { + "auxiliary_loss_clip": 0.01137999, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.02199709, + "balance_loss_mlp": 1.04914331, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.4509576382878049, + "language_loss": 0.80561262, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82739007, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4708, + "time_per_iteration": 2.5140204429626465 + }, + { + "auxiliary_loss_clip": 0.0105521, + "auxiliary_loss_mlp": 0.01000508, + "balance_loss_clip": 0.99875605, + "balance_loss_mlp": 1.02517498, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.9117312370003612, + "language_loss": 0.62801576, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64857292, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.30078125, + "step": 4709, + "time_per_iteration": 2.936171054840088 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02320743, + "balance_loss_mlp": 1.04888415, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3738384560226649, + "language_loss": 0.73850632, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76022816, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4710, + "time_per_iteration": 2.4954519271850586 + }, + { + "auxiliary_loss_clip": 0.01137863, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.03191566, + "balance_loss_mlp": 1.04925823, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.9168276099157815, + "language_loss": 0.79272872, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81460476, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.88671875, + "step": 4711, + "time_per_iteration": 2.4867448806762695 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02646244, + "balance_loss_mlp": 1.04965401, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.0504814559042064, + "language_loss": 0.71246219, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73428476, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.88671875, + "step": 4712, + "time_per_iteration": 2.575636863708496 + }, + { + "auxiliary_loss_clip": 0.01138441, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.03291881, + "balance_loss_mlp": 1.05000687, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.8055678270358249, + "language_loss": 0.82008445, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84196651, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4713, + "time_per_iteration": 2.493767499923706 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.02401519, + "balance_loss_mlp": 1.05028057, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7485744544400377, + "language_loss": 0.75356781, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77534491, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4714, + "time_per_iteration": 2.505153179168701 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02967894, + "balance_loss_mlp": 1.04942465, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4087892826571713, + "language_loss": 0.78411347, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80593348, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4715, + "time_per_iteration": 2.554814100265503 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02042472, + "balance_loss_mlp": 1.04960322, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.6801208741854476, + "language_loss": 0.73694074, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.758663, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4716, + "time_per_iteration": 2.5286571979522705 + }, + { + "auxiliary_loss_clip": 0.01139786, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.02437401, + "balance_loss_mlp": 1.04774714, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.328876822443367, + "language_loss": 0.74648547, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76830298, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4717, + "time_per_iteration": 2.46952223777771 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.02846563, + "balance_loss_mlp": 1.04963374, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.4913957575980352, + "language_loss": 0.669999, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69183862, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4718, + "time_per_iteration": 2.4831228256225586 + }, + { + "auxiliary_loss_clip": 0.01137489, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.03000975, + "balance_loss_mlp": 1.04782009, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.8756812569885382, + "language_loss": 0.72633672, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74818015, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4719, + "time_per_iteration": 4.022828102111816 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02554655, + "balance_loss_mlp": 1.04928601, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.5135010931827333, + "language_loss": 0.80621493, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82798427, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4720, + "time_per_iteration": 5.367753505706787 + }, + { + "auxiliary_loss_clip": 0.0113932, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.05115819, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.7029911565101727, + "language_loss": 0.79467577, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81651098, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4721, + "time_per_iteration": 2.50327730178833 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01929736, + "balance_loss_mlp": 1.04810679, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.0644081658079343, + "language_loss": 0.82823032, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84991974, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4722, + "time_per_iteration": 2.4968478679656982 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02364612, + "balance_loss_mlp": 1.05073261, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.6187910677092856, + "language_loss": 0.70086461, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72264171, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4723, + "time_per_iteration": 2.4899258613586426 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.02747679, + "balance_loss_mlp": 1.04938078, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.736224288784384, + "language_loss": 0.78556609, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8073647, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.859375, + "step": 4724, + "time_per_iteration": 2.496594190597534 + }, + { + "auxiliary_loss_clip": 0.01139767, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.02660346, + "balance_loss_mlp": 1.05093193, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6232572980988387, + "language_loss": 0.92404163, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94587529, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4725, + "time_per_iteration": 2.511526584625244 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.05020452, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 2.0539060112221645, + "language_loss": 0.88626051, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90809256, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4726, + "time_per_iteration": 2.5431292057037354 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.03090727, + "balance_loss_mlp": 1.05034256, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.15176079657567, + "language_loss": 0.78793001, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80981243, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.89453125, + "step": 4727, + "time_per_iteration": 2.7037220001220703 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.04985464, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.258515630996078, + "language_loss": 0.66358554, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68529654, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4728, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02941179, + "balance_loss_mlp": 1.04727221, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.756924339447767, + "language_loss": 0.75958216, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78138363, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4729, + "time_per_iteration": 2.4989402294158936 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_clip": 1.03383398, + "balance_loss_mlp": 1.05095756, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.9682162336594704, + "language_loss": 0.66691023, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68882596, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4730, + "time_per_iteration": 2.509514570236206 + }, + { + "auxiliary_loss_clip": 0.01138579, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02232122, + "balance_loss_mlp": 1.05049443, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.7814838549320247, + "language_loss": 0.74382442, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76560116, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4731, + "time_per_iteration": 2.547813653945923 + }, + { + "auxiliary_loss_clip": 0.01136629, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.04890573, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.530013147894791, + "language_loss": 0.83553517, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85723549, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 4732, + "time_per_iteration": 2.5120863914489746 + }, + { + "auxiliary_loss_clip": 0.01136161, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.03301716, + "balance_loss_mlp": 1.04855001, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.9723104549008028, + "language_loss": 0.79331958, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81518835, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4733, + "time_per_iteration": 2.5007243156433105 + }, + { + "auxiliary_loss_clip": 0.01142281, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.02999151, + "balance_loss_mlp": 1.05076027, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.3591023601535834, + "language_loss": 0.71619761, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73809481, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 4734, + "time_per_iteration": 2.482696771621704 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02036047, + "balance_loss_mlp": 1.04631829, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8927344989841068, + "language_loss": 0.73762977, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.75930858, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 4735, + "time_per_iteration": 2.4837005138397217 + }, + { + "auxiliary_loss_clip": 0.011344, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.04755783, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.630230460143418, + "language_loss": 0.79573876, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81754053, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4736, + "time_per_iteration": 2.4434666633605957 + }, + { + "auxiliary_loss_clip": 0.01139538, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.02740479, + "balance_loss_mlp": 1.05133057, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.7860738328288637, + "language_loss": 0.59551513, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61735177, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4737, + "time_per_iteration": 2.580573558807373 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.02334046, + "balance_loss_mlp": 1.04766428, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7923236486738074, + "language_loss": 0.86353856, + "learning_rate": 3.356432075047052e-06, + "loss": 0.8852607, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4738, + "time_per_iteration": 2.483482837677002 + }, + { + "auxiliary_loss_clip": 0.0113957, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_clip": 1.02778435, + "balance_loss_mlp": 1.04864287, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.438418234236932, + "language_loss": 0.89730442, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91915256, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4739, + "time_per_iteration": 2.4746406078338623 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.02450418, + "balance_loss_mlp": 1.05253863, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3849266219761887, + "language_loss": 0.7207197, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74250996, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4740, + "time_per_iteration": 2.49682879447937 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0209707, + "balance_loss_mlp": 1.04970956, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.6055473402712246, + "language_loss": 0.77937335, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80109143, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4741, + "time_per_iteration": 2.51096248626709 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.02828324, + "balance_loss_mlp": 1.04566443, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6279093143019605, + "language_loss": 0.76295173, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78477085, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4742, + "time_per_iteration": 2.462972402572632 + }, + { + "auxiliary_loss_clip": 0.01139125, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.03039074, + "balance_loss_mlp": 1.04792476, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.8587468959738758, + "language_loss": 0.5772593, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59914023, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 4743, + "time_per_iteration": 2.511903762817383 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01054233, + "balance_loss_clip": 1.03724515, + "balance_loss_mlp": 1.05195451, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 2.12515026406258, + "language_loss": 0.74454999, + "learning_rate": 3.354713944700797e-06, + "loss": 0.7665062, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 4744, + "time_per_iteration": 2.48883318901062 + }, + { + "auxiliary_loss_clip": 0.01135189, + "auxiliary_loss_mlp": 0.01043767, + "balance_loss_clip": 1.02801967, + "balance_loss_mlp": 1.04948175, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.362002737479584, + "language_loss": 0.77483714, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79662669, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 4745, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01130558, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02363896, + "balance_loss_mlp": 1.04884791, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.753549870597739, + "language_loss": 0.83101368, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85271305, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 4746, + "time_per_iteration": 2.4236245155334473 + }, + { + "auxiliary_loss_clip": 0.01138419, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.02243769, + "balance_loss_mlp": 1.04718721, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.6977094615171933, + "language_loss": 0.79818654, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81996572, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4747, + "time_per_iteration": 2.47261118888855 + }, + { + "auxiliary_loss_clip": 0.01044617, + "auxiliary_loss_mlp": 0.01004042, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.01364255, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7754058718106229, + "language_loss": 0.60505557, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62554216, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30859375, + "step": 4748, + "time_per_iteration": 3.087096691131592 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02596188, + "balance_loss_mlp": 1.04764485, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.177788697298361, + "language_loss": 0.80300528, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82477033, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4749, + "time_per_iteration": 2.4132721424102783 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.022488, + "balance_loss_mlp": 1.04882109, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.910787577049047, + "language_loss": 0.7067076, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72844481, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.86328125, + "step": 4750, + "time_per_iteration": 2.5576114654541016 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.02121782, + "balance_loss_mlp": 1.04961181, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.569446011166348, + "language_loss": 0.81798106, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.83968079, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.83203125, + "step": 4751, + "time_per_iteration": 2.5805511474609375 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.02847314, + "balance_loss_mlp": 1.04876757, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.8824724995030706, + "language_loss": 0.80753136, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82931828, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4752, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.02258289, + "balance_loss_mlp": 1.04778147, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.8943096426553439, + "language_loss": 0.78827929, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81001288, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4753, + "time_per_iteration": 2.4775567054748535 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.02559114, + "balance_loss_mlp": 1.05078959, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.205371578508451, + "language_loss": 0.89809895, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91994447, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.8984375, + "step": 4754, + "time_per_iteration": 2.486128091812134 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02616322, + "balance_loss_mlp": 1.04897058, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.932227485650823, + "language_loss": 0.8234359, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84519303, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4755, + "time_per_iteration": 2.491184711456299 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.02915466, + "balance_loss_mlp": 1.04667544, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4908389000148254, + "language_loss": 0.83846784, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86025268, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4756, + "time_per_iteration": 2.5934014320373535 + }, + { + "auxiliary_loss_clip": 0.01048134, + "auxiliary_loss_mlp": 0.01008558, + "balance_loss_clip": 1.0067457, + "balance_loss_mlp": 1.01677859, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8659269702666513, + "language_loss": 0.61012161, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63068855, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3125, + "step": 4757, + "time_per_iteration": 3.2122225761413574 + }, + { + "auxiliary_loss_clip": 0.01137202, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.0173862, + "balance_loss_mlp": 1.05204773, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.9457322051707677, + "language_loss": 0.65794766, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67965055, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4758, + "time_per_iteration": 2.60023832321167 + }, + { + "auxiliary_loss_clip": 0.01134399, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.02027392, + "balance_loss_mlp": 1.04756904, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.560843999265526, + "language_loss": 0.62950313, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65121412, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4759, + "time_per_iteration": 2.6352102756500244 + }, + { + "auxiliary_loss_clip": 0.0113658, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.05098844, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.76909488275169, + "language_loss": 0.7385608, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76035368, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4760, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.04949427, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.9401243114633073, + "language_loss": 0.72422945, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74593776, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4761, + "time_per_iteration": 4.029369592666626 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.0303421, + "balance_loss_mlp": 1.04875946, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.026540334724573, + "language_loss": 0.74605787, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76787788, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4762, + "time_per_iteration": 3.9056994915008545 + }, + { + "auxiliary_loss_clip": 0.01134836, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.02963901, + "balance_loss_mlp": 1.05027771, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.79451974437327, + "language_loss": 0.76088154, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78268445, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4763, + "time_per_iteration": 2.521223545074463 + }, + { + "auxiliary_loss_clip": 0.01133105, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.04712808, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.9430054907967222, + "language_loss": 0.76937616, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79106188, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4764, + "time_per_iteration": 2.4924814701080322 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.02354538, + "balance_loss_mlp": 1.04996395, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8973954036904035, + "language_loss": 0.71061826, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73240352, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.87109375, + "step": 4765, + "time_per_iteration": 2.509204387664795 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04705501, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5129940587619137, + "language_loss": 0.75756145, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77925038, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4766, + "time_per_iteration": 2.562422513961792 + }, + { + "auxiliary_loss_clip": 0.01135318, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01960635, + "balance_loss_mlp": 1.05073392, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.5780141248071407, + "language_loss": 0.77556801, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79727697, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 4767, + "time_per_iteration": 2.5476057529449463 + }, + { + "auxiliary_loss_clip": 0.01133832, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.04878676, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.169490874338027, + "language_loss": 0.6494413, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67119616, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4768, + "time_per_iteration": 2.4961044788360596 + }, + { + "auxiliary_loss_clip": 0.01137611, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.02807736, + "balance_loss_mlp": 1.04944301, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5253191671074575, + "language_loss": 0.70345664, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72527587, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4769, + "time_per_iteration": 2.5243568420410156 + }, + { + "auxiliary_loss_clip": 0.01136117, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.01992261, + "balance_loss_mlp": 1.04866219, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.7483868508562144, + "language_loss": 0.75552189, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77723145, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.875, + "step": 4770, + "time_per_iteration": 2.468655586242676 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02745509, + "balance_loss_mlp": 1.0500282, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 3.1666126901900107, + "language_loss": 0.6730839, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69490194, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4771, + "time_per_iteration": 2.5334818363189697 + }, + { + "auxiliary_loss_clip": 0.01046415, + "auxiliary_loss_mlp": 0.01005401, + "balance_loss_clip": 1.00367248, + "balance_loss_mlp": 1.01655006, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7694277286160668, + "language_loss": 0.56883639, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58935452, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.29882812, + "step": 4772, + "time_per_iteration": 3.0373501777648926 + }, + { + "auxiliary_loss_clip": 0.01136901, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.02567768, + "balance_loss_mlp": 1.05014777, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.6517872983988844, + "language_loss": 0.83356023, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85534406, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4773, + "time_per_iteration": 2.477537155151367 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.02159762, + "balance_loss_mlp": 1.04630029, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.6367186533355356, + "language_loss": 0.77910906, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80083102, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4774, + "time_per_iteration": 2.514545440673828 + }, + { + "auxiliary_loss_clip": 0.01136368, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.05010271, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.7122435327393783, + "language_loss": 0.73488462, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75662589, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4775, + "time_per_iteration": 2.4526851177215576 + }, + { + "auxiliary_loss_clip": 0.0113744, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.05033445, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.655187901014976, + "language_loss": 0.88345891, + "learning_rate": 3.34551940668778e-06, + "loss": 0.905213, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4776, + "time_per_iteration": 2.5487112998962402 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02170587, + "balance_loss_mlp": 1.05060029, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7920640817181568, + "language_loss": 0.74046421, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76219237, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4777, + "time_per_iteration": 2.4858744144439697 + }, + { + "auxiliary_loss_clip": 0.01143681, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.02763224, + "balance_loss_mlp": 1.05306673, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9679293284940167, + "language_loss": 0.80052459, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82240558, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4778, + "time_per_iteration": 2.536553382873535 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.03032279, + "balance_loss_mlp": 1.05058503, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.6265242751714746, + "language_loss": 0.73940611, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76121908, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4779, + "time_per_iteration": 2.5068604946136475 + }, + { + "auxiliary_loss_clip": 0.01139025, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.02791739, + "balance_loss_mlp": 1.05089593, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5791887497798731, + "language_loss": 0.76378506, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78561842, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4780, + "time_per_iteration": 2.6357336044311523 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.02789187, + "balance_loss_mlp": 1.04874134, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.8554557560955622, + "language_loss": 0.81367111, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83542168, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 4781, + "time_per_iteration": 2.484217405319214 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.02688909, + "balance_loss_mlp": 1.0511862, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.9124031057386872, + "language_loss": 0.86249948, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88433063, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4782, + "time_per_iteration": 2.4822945594787598 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.02842641, + "balance_loss_mlp": 1.05222881, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.5584901619772236, + "language_loss": 0.71195668, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4783, + "time_per_iteration": 2.4959099292755127 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.05179179, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 3.6731562407195932, + "language_loss": 0.77011871, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79189384, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4784, + "time_per_iteration": 2.55037784576416 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.04896331, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.5223386635016902, + "language_loss": 0.75859249, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.7803328, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4785, + "time_per_iteration": 2.526587724685669 + }, + { + "auxiliary_loss_clip": 0.01135192, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.02724528, + "balance_loss_mlp": 1.04946601, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.9982438427344784, + "language_loss": 0.83033895, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85211748, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4786, + "time_per_iteration": 2.5786821842193604 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01899481, + "balance_loss_mlp": 1.04868317, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.95457297040312, + "language_loss": 0.80007184, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82174993, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 4787, + "time_per_iteration": 2.4734396934509277 + }, + { + "auxiliary_loss_clip": 0.01136278, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.02397585, + "balance_loss_mlp": 1.04906642, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 2.6671828195015044, + "language_loss": 0.83666658, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85842675, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4788, + "time_per_iteration": 2.5388548374176025 + }, + { + "auxiliary_loss_clip": 0.01137234, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.05051816, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.8168797658695668, + "language_loss": 0.73769903, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75953662, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4789, + "time_per_iteration": 2.5259692668914795 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.02819657, + "balance_loss_mlp": 1.0466274, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7572733449240283, + "language_loss": 0.83982229, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86155128, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4790, + "time_per_iteration": 2.5347094535827637 + }, + { + "auxiliary_loss_clip": 0.01136016, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02267063, + "balance_loss_mlp": 1.05011547, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.8137236403798864, + "language_loss": 0.77924603, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80099815, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4791, + "time_per_iteration": 2.475328207015991 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01854002, + "balance_loss_mlp": 1.04824567, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.933659829708973, + "language_loss": 0.70760292, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72931719, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.88671875, + "step": 4792, + "time_per_iteration": 2.4705538749694824 + }, + { + "auxiliary_loss_clip": 0.01135222, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.02065361, + "balance_loss_mlp": 1.04968917, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.08648870526395, + "language_loss": 0.79392564, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81563771, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4793, + "time_per_iteration": 2.509697914123535 + }, + { + "auxiliary_loss_clip": 0.01131221, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04920101, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.6269924793239006, + "language_loss": 0.77731872, + "learning_rate": 3.340324496161797e-06, + "loss": 0.7990309, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 4794, + "time_per_iteration": 2.6943047046661377 + }, + { + "auxiliary_loss_clip": 0.01134923, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.02819395, + "balance_loss_mlp": 1.04913807, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.663854929830155, + "language_loss": 0.8254813, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84727538, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 4795, + "time_per_iteration": 2.4633255004882812 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02387166, + "balance_loss_mlp": 1.04899204, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.661730786650402, + "language_loss": 0.74650323, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76819038, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80859375, + "step": 4796, + "time_per_iteration": 2.5179266929626465 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.02334583, + "balance_loss_mlp": 1.04789257, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.8865626242662115, + "language_loss": 0.72797763, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74977362, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4797, + "time_per_iteration": 2.4910430908203125 + }, + { + "auxiliary_loss_clip": 0.01135339, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02475667, + "balance_loss_mlp": 1.04989898, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.109884297899412, + "language_loss": 0.74219149, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76395118, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4798, + "time_per_iteration": 2.472590923309326 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.02631509, + "balance_loss_mlp": 1.04689598, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7660889265500996, + "language_loss": 0.64920753, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67099464, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.87890625, + "step": 4799, + "time_per_iteration": 2.4816339015960693 + }, + { + "auxiliary_loss_clip": 0.01136164, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.02784538, + "balance_loss_mlp": 1.04912758, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 2.0794132014970272, + "language_loss": 0.82202137, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84382272, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4800, + "time_per_iteration": 2.5249674320220947 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.02312899, + "balance_loss_mlp": 1.04702258, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.639042715490093, + "language_loss": 0.90946537, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93113768, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4801, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.0221796, + "balance_loss_mlp": 1.04792547, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.176318344562637, + "language_loss": 0.73644328, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75816047, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4802, + "time_per_iteration": 4.080524444580078 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.01006047, + "balance_loss_clip": 1.00423479, + "balance_loss_mlp": 1.01114249, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7820100192493779, + "language_loss": 0.63009298, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65055525, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.2890625, + "step": 4803, + "time_per_iteration": 4.464243412017822 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.03008461, + "balance_loss_mlp": 1.04523563, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7581002683255658, + "language_loss": 0.70800668, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72975886, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4804, + "time_per_iteration": 2.4655730724334717 + }, + { + "auxiliary_loss_clip": 0.01134858, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.02464128, + "balance_loss_mlp": 1.04650438, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.8916446417141755, + "language_loss": 0.68253011, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70430195, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 4805, + "time_per_iteration": 2.53932523727417 + }, + { + "auxiliary_loss_clip": 0.01133301, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.0262022, + "balance_loss_mlp": 1.04706144, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.968490446816616, + "language_loss": 0.69469118, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71644211, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4806, + "time_per_iteration": 2.558811902999878 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.031461, + "balance_loss_mlp": 1.04788303, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.428284074184194, + "language_loss": 0.71372461, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73549926, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4807, + "time_per_iteration": 2.5614373683929443 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04677331, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.7487230864068215, + "language_loss": 0.81519878, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83695877, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4808, + "time_per_iteration": 2.4744319915771484 + }, + { + "auxiliary_loss_clip": 0.01128992, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.03418779, + "balance_loss_mlp": 1.04669142, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.636259514454852, + "language_loss": 0.78387201, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80566621, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 4809, + "time_per_iteration": 2.4998364448547363 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.02908349, + "balance_loss_mlp": 1.04490733, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.6563631129995537, + "language_loss": 0.78611737, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80792195, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4810, + "time_per_iteration": 2.4702351093292236 + }, + { + "auxiliary_loss_clip": 0.01129985, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.02966762, + "balance_loss_mlp": 1.04653728, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 2.008599276638055, + "language_loss": 0.77134252, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79309338, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4811, + "time_per_iteration": 2.502671718597412 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.03385544, + "balance_loss_mlp": 1.0460732, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3273574459957262, + "language_loss": 0.76748705, + "learning_rate": 3.335113118275117e-06, + "loss": 0.78930271, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4812, + "time_per_iteration": 2.5386435985565186 + }, + { + "auxiliary_loss_clip": 0.01038211, + "auxiliary_loss_mlp": 0.01023073, + "balance_loss_clip": 1.02121317, + "balance_loss_mlp": 1.00933552, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8452992206378583, + "language_loss": 0.60239071, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62300354, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2890625, + "step": 4813, + "time_per_iteration": 3.227616548538208 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02727079, + "balance_loss_mlp": 1.04549837, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.8826759768804342, + "language_loss": 0.81616402, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.83789915, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4814, + "time_per_iteration": 2.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_clip": 1.0297873, + "balance_loss_mlp": 1.04464495, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6532361717230013, + "language_loss": 0.72615647, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74794197, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4815, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.01129383, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.03274667, + "balance_loss_mlp": 1.04815507, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.520143184033477, + "language_loss": 0.70801306, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72978652, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4816, + "time_per_iteration": 2.5287740230560303 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.02590585, + "balance_loss_mlp": 1.04615664, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 3.3715101323822174, + "language_loss": 0.74736607, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76915157, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 4817, + "time_per_iteration": 2.4828009605407715 + }, + { + "auxiliary_loss_clip": 0.01134031, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.03637469, + "balance_loss_mlp": 1.0465169, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.754631597755812, + "language_loss": 0.76169789, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78357232, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.875, + "step": 4818, + "time_per_iteration": 2.5453133583068848 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.04606366, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8604375380991018, + "language_loss": 0.79827082, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81994408, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4819, + "time_per_iteration": 2.4516472816467285 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.02553141, + "balance_loss_mlp": 1.04452121, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.6026789889191464, + "language_loss": 0.78726941, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80905426, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.90625, + "step": 4820, + "time_per_iteration": 2.512927770614624 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.0202527, + "balance_loss_mlp": 1.04560018, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.352701358428358, + "language_loss": 0.73083222, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75253224, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4821, + "time_per_iteration": 2.4575939178466797 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.01979387, + "balance_loss_mlp": 1.04503322, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.843174914976853, + "language_loss": 0.72629523, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74796605, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.85546875, + "step": 4822, + "time_per_iteration": 2.4981486797332764 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_clip": 1.03044343, + "balance_loss_mlp": 1.04679179, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.7581642571514904, + "language_loss": 0.66571164, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68751729, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.86328125, + "step": 4823, + "time_per_iteration": 2.4363584518432617 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.02018452, + "balance_loss_mlp": 1.04382014, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 3.6840420234688684, + "language_loss": 0.80786806, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82951754, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 4824, + "time_per_iteration": 2.4978654384613037 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.02547669, + "balance_loss_mlp": 1.04512334, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.8817460080316075, + "language_loss": 0.72507697, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74683976, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4825, + "time_per_iteration": 2.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.01136872, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.01760566, + "balance_loss_mlp": 1.04886889, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.3450778905142813, + "language_loss": 0.73504382, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75676298, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4826, + "time_per_iteration": 2.4689221382141113 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.02041411, + "balance_loss_mlp": 1.04524112, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 3.139827505949132, + "language_loss": 0.68472409, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70640838, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4827, + "time_per_iteration": 2.5236809253692627 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.04921937, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8651963869616242, + "language_loss": 0.80072737, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82245356, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.87109375, + "step": 4828, + "time_per_iteration": 2.491584300994873 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.026335, + "balance_loss_mlp": 1.0482254, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 2.2252387209358666, + "language_loss": 0.80475402, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82650864, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4829, + "time_per_iteration": 2.473210334777832 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.02482176, + "balance_loss_mlp": 1.04794419, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.544892870636461, + "language_loss": 0.82288766, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84461534, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4830, + "time_per_iteration": 2.52874755859375 + }, + { + "auxiliary_loss_clip": 0.01136051, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.02992344, + "balance_loss_mlp": 1.04847991, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.366175746199002, + "language_loss": 0.78858435, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81041145, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4831, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.02356744, + "balance_loss_mlp": 1.045138, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.8105888440812088, + "language_loss": 0.74415791, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76582563, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4832, + "time_per_iteration": 2.6398987770080566 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01806784, + "balance_loss_mlp": 1.04516697, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6051950803449415, + "language_loss": 0.75986588, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78149348, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 4833, + "time_per_iteration": 2.4772675037384033 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.02108264, + "balance_loss_mlp": 1.04542434, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.807689816327527, + "language_loss": 0.64523911, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6669057, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 4834, + "time_per_iteration": 2.4944729804992676 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.01674771, + "balance_loss_mlp": 1.04650283, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5516449013863105, + "language_loss": 0.71436119, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73597211, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4835, + "time_per_iteration": 2.5122785568237305 + }, + { + "auxiliary_loss_clip": 0.01129192, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.02430248, + "balance_loss_mlp": 1.04510283, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.123413568873549, + "language_loss": 0.79669547, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81837618, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4836, + "time_per_iteration": 2.533221483230591 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.02338338, + "balance_loss_mlp": 1.04589558, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6671781935549963, + "language_loss": 0.80777872, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82946539, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4837, + "time_per_iteration": 2.4579083919525146 + }, + { + "auxiliary_loss_clip": 0.01131777, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.02170265, + "balance_loss_mlp": 1.04491532, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8624538054458508, + "language_loss": 0.67733121, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69902468, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4838, + "time_per_iteration": 2.613682270050049 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.02485621, + "balance_loss_mlp": 1.04893696, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.6135989987029238, + "language_loss": 0.71288264, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73466504, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4839, + "time_per_iteration": 2.506908416748047 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.02341795, + "balance_loss_mlp": 1.04433274, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.876317037835641, + "language_loss": 0.75619674, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77787805, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4840, + "time_per_iteration": 2.6259472370147705 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.02688372, + "balance_loss_mlp": 1.0469749, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.9955793585576265, + "language_loss": 0.60459495, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62632966, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4841, + "time_per_iteration": 2.5497686862945557 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.0215385, + "balance_loss_mlp": 1.0483892, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.7266193979009703, + "language_loss": 0.71366, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73538262, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 4842, + "time_per_iteration": 2.5817017555236816 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02188897, + "balance_loss_mlp": 1.04632473, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5806493177236067, + "language_loss": 0.72846174, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.7501446, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.85546875, + "step": 4843, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.0223223, + "balance_loss_mlp": 1.04598284, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.0237546438656393, + "language_loss": 0.5840022, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60570586, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4844, + "time_per_iteration": 3.9377825260162354 + }, + { + "auxiliary_loss_clip": 0.01136792, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.02518439, + "balance_loss_mlp": 1.04942751, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.1502970284536493, + "language_loss": 0.86360186, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88539243, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4845, + "time_per_iteration": 5.415091276168823 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.04779911, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7275133095664568, + "language_loss": 0.66684157, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.68870938, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4846, + "time_per_iteration": 2.495901584625244 + }, + { + "auxiliary_loss_clip": 0.01131044, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02700055, + "balance_loss_mlp": 1.04691291, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.7117272730106567, + "language_loss": 0.70501876, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72675455, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4847, + "time_per_iteration": 2.50537109375 + }, + { + "auxiliary_loss_clip": 0.01131589, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.01918232, + "balance_loss_mlp": 1.04682243, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 2.14972579950547, + "language_loss": 0.73494464, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75661629, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 4848, + "time_per_iteration": 2.506683111190796 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.02569222, + "balance_loss_mlp": 1.04670119, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.106691725132959, + "language_loss": 0.76689458, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.78864431, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4849, + "time_per_iteration": 2.475512742996216 + }, + { + "auxiliary_loss_clip": 0.01134647, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.02732718, + "balance_loss_mlp": 1.04683709, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7698868684834754, + "language_loss": 0.78437513, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80615485, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4850, + "time_per_iteration": 2.4774062633514404 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02231026, + "balance_loss_mlp": 1.04620552, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.7416717517415665, + "language_loss": 0.75775445, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77944064, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4851, + "time_per_iteration": 2.4973719120025635 + }, + { + "auxiliary_loss_clip": 0.01130818, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.03071558, + "balance_loss_mlp": 1.04819655, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.378687766604426, + "language_loss": 0.77111661, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79287988, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 4852, + "time_per_iteration": 2.5339767932891846 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.0296402, + "balance_loss_mlp": 1.04735672, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5345579183576068, + "language_loss": 0.78385615, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80563664, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4853, + "time_per_iteration": 2.511125087738037 + }, + { + "auxiliary_loss_clip": 0.0113401, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.04668474, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.984154109703724, + "language_loss": 0.87946999, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90119541, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4854, + "time_per_iteration": 2.4654700756073 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.03352284, + "balance_loss_mlp": 1.04678071, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.8357290509449282, + "language_loss": 0.86585724, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88770819, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.85546875, + "step": 4855, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01043355, + "auxiliary_loss_mlp": 0.01004722, + "balance_loss_clip": 1.00283837, + "balance_loss_mlp": 1.01374364, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8090362112321295, + "language_loss": 0.60199535, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6224761, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4856, + "time_per_iteration": 3.164905309677124 + }, + { + "auxiliary_loss_clip": 0.01130578, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.04626632, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 2.394144218040463, + "language_loss": 0.67995465, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70166028, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4857, + "time_per_iteration": 2.4615678787231445 + }, + { + "auxiliary_loss_clip": 0.01129998, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.0210768, + "balance_loss_mlp": 1.04613733, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 2.1807634638236566, + "language_loss": 0.83958411, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86124158, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4858, + "time_per_iteration": 2.561347723007202 + }, + { + "auxiliary_loss_clip": 0.01131346, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.02478647, + "balance_loss_mlp": 1.04746854, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0714117361066298, + "language_loss": 0.77547097, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79718083, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4859, + "time_per_iteration": 2.4801361560821533 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02764332, + "balance_loss_mlp": 1.04424477, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.0548529873010564, + "language_loss": 0.68948561, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71125209, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4860, + "time_per_iteration": 2.531022071838379 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.0267868, + "balance_loss_mlp": 1.04821134, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.771004145303475, + "language_loss": 0.75952631, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78123146, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.81640625, + "step": 4861, + "time_per_iteration": 2.619257926940918 + }, + { + "auxiliary_loss_clip": 0.01129568, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.03350759, + "balance_loss_mlp": 1.04631817, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.7230129115334698, + "language_loss": 0.91648388, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93826073, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4862, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.02336144, + "balance_loss_mlp": 1.04544663, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.061794510539927, + "language_loss": 0.73736131, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75904131, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 4863, + "time_per_iteration": 2.4478728771209717 + }, + { + "auxiliary_loss_clip": 0.01125934, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.02474487, + "balance_loss_mlp": 1.04584527, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.6779515608592832, + "language_loss": 0.78057373, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80222106, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 4864, + "time_per_iteration": 2.487544059753418 + }, + { + "auxiliary_loss_clip": 0.0113348, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.02927482, + "balance_loss_mlp": 1.04763806, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 2.699456605470703, + "language_loss": 0.81919956, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8409909, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4865, + "time_per_iteration": 2.486553192138672 + }, + { + "auxiliary_loss_clip": 0.01130825, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02523327, + "balance_loss_mlp": 1.04592669, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.8393536761495908, + "language_loss": 0.85281575, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87453377, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4866, + "time_per_iteration": 2.4981276988983154 + }, + { + "auxiliary_loss_clip": 0.01124877, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.02263868, + "balance_loss_mlp": 1.04323506, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.627734535935432, + "language_loss": 0.755858, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77747923, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 4867, + "time_per_iteration": 2.5813703536987305 + }, + { + "auxiliary_loss_clip": 0.01129928, + "auxiliary_loss_mlp": 0.01049325, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.04375887, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 4.179606236398783, + "language_loss": 0.73403615, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75582874, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4868, + "time_per_iteration": 2.48374342918396 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.01857829, + "balance_loss_mlp": 1.04520726, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3015957921166281, + "language_loss": 0.74555755, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76717293, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4869, + "time_per_iteration": 2.458434820175171 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.02390289, + "balance_loss_mlp": 1.04639244, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.465584897312906, + "language_loss": 0.76539874, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78709823, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4870, + "time_per_iteration": 2.5194873809814453 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.03040564, + "balance_loss_mlp": 1.04584765, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.259080578005736, + "language_loss": 0.67315602, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69495422, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4871, + "time_per_iteration": 2.4556169509887695 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.04283524, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.8081222369362746, + "language_loss": 0.76924586, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79089642, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4872, + "time_per_iteration": 2.575421094894409 + }, + { + "auxiliary_loss_clip": 0.01131072, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.0271883, + "balance_loss_mlp": 1.04527128, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.2968152323379347, + "language_loss": 0.72835052, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75009787, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4873, + "time_per_iteration": 2.4370815753936768 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02518392, + "balance_loss_mlp": 1.04519463, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8384173868300016, + "language_loss": 0.77871835, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80046785, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4874, + "time_per_iteration": 2.512613534927368 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.02962041, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.084283832751276, + "language_loss": 0.77047002, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79228717, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4875, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.04909277, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6806867883636405, + "language_loss": 0.69183826, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71354383, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4876, + "time_per_iteration": 2.4764888286590576 + }, + { + "auxiliary_loss_clip": 0.01128897, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.03027201, + "balance_loss_mlp": 1.04482532, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.3621737524413913, + "language_loss": 0.8195532, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84129333, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4877, + "time_per_iteration": 2.4738340377807617 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02765405, + "balance_loss_mlp": 1.04704273, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.8654341954981455, + "language_loss": 0.67843962, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70020854, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 4878, + "time_per_iteration": 2.4606332778930664 + }, + { + "auxiliary_loss_clip": 0.01130502, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02624929, + "balance_loss_mlp": 1.04562759, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8286598598322423, + "language_loss": 0.7351383, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.7568571, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 4879, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.02881706, + "balance_loss_mlp": 1.0484302, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.0641755158914634, + "language_loss": 0.65864384, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68047822, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4880, + "time_per_iteration": 2.5785939693450928 + }, + { + "auxiliary_loss_clip": 0.01130839, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.02962136, + "balance_loss_mlp": 1.04453218, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.157512175932489, + "language_loss": 0.70518327, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72694737, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4881, + "time_per_iteration": 2.4913742542266846 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.02336192, + "balance_loss_mlp": 1.0471015, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 2.112776228996839, + "language_loss": 0.83907056, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86079299, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4882, + "time_per_iteration": 2.4955010414123535 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.02495086, + "balance_loss_mlp": 1.0470233, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.198383771985309, + "language_loss": 0.71811014, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73988116, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4883, + "time_per_iteration": 2.474574089050293 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02225327, + "balance_loss_mlp": 1.04580843, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 3.497082861184858, + "language_loss": 0.92629534, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94800568, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4884, + "time_per_iteration": 2.4947426319122314 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.03032374, + "balance_loss_mlp": 1.05094171, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.2315982417854876, + "language_loss": 0.73729408, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75913155, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4885, + "time_per_iteration": 2.5076494216918945 + }, + { + "auxiliary_loss_clip": 0.01132864, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.03185511, + "balance_loss_mlp": 1.0468272, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.1964333946604135, + "language_loss": 0.85011208, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87192315, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4886, + "time_per_iteration": 3.911407232284546 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.02624702, + "balance_loss_mlp": 1.04678059, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.1393217933297657, + "language_loss": 0.77027792, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79204369, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.88671875, + "step": 4887, + "time_per_iteration": 3.906132936477661 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.04697633, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1952396364021536, + "language_loss": 0.79558414, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.8174094, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 4888, + "time_per_iteration": 2.4338221549987793 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.0173831, + "balance_loss_mlp": 1.04529762, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.4299668586503376, + "language_loss": 0.55301261, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57464457, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4889, + "time_per_iteration": 2.637645959854126 + }, + { + "auxiliary_loss_clip": 0.01134449, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.02314413, + "balance_loss_mlp": 1.0465076, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.9477461279926194, + "language_loss": 0.84309214, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86483455, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4890, + "time_per_iteration": 2.445218801498413 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.04780436, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.9951401673219091, + "language_loss": 0.72357798, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74532759, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4891, + "time_per_iteration": 2.434298515319824 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.02910721, + "balance_loss_mlp": 1.04683042, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.9834299238301316, + "language_loss": 0.77230573, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79410005, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4892, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01130172, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.04514182, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.7053650125053033, + "language_loss": 0.7846024, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80626166, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4893, + "time_per_iteration": 2.505946159362793 + }, + { + "auxiliary_loss_clip": 0.01132333, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.03252435, + "balance_loss_mlp": 1.04651928, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8389301673785101, + "language_loss": 0.85052156, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87232608, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4894, + "time_per_iteration": 2.5221872329711914 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.02120304, + "balance_loss_mlp": 1.04568195, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.595597690193387, + "language_loss": 0.9027828, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92447418, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4895, + "time_per_iteration": 2.4466798305511475 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.02434874, + "balance_loss_mlp": 1.04720199, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 3.001231056574592, + "language_loss": 0.86597103, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88773751, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4896, + "time_per_iteration": 2.459611654281616 + }, + { + "auxiliary_loss_clip": 0.01134294, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.03402412, + "balance_loss_mlp": 1.04802299, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 2.652800133974417, + "language_loss": 0.73196733, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75381136, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4897, + "time_per_iteration": 2.4981348514556885 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02546394, + "balance_loss_mlp": 1.0458895, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.867954953207583, + "language_loss": 0.73798919, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75977707, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4898, + "time_per_iteration": 2.439952850341797 + }, + { + "auxiliary_loss_clip": 0.01142949, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.02471972, + "balance_loss_mlp": 1.05136585, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.6754375338801477, + "language_loss": 0.70309317, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72495157, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9140625, + "step": 4899, + "time_per_iteration": 2.4757347106933594 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.02458405, + "balance_loss_mlp": 1.0484879, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.9063479453414416, + "language_loss": 0.79007781, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.8118515, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4900, + "time_per_iteration": 2.50555419921875 + }, + { + "auxiliary_loss_clip": 0.01131673, + "auxiliary_loss_mlp": 0.01042831, + "balance_loss_clip": 1.02720261, + "balance_loss_mlp": 1.04425764, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.709443882500664, + "language_loss": 0.80718857, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.8289336, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4901, + "time_per_iteration": 2.481768846511841 + }, + { + "auxiliary_loss_clip": 0.01127885, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.02062666, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 1.9567596526300628, + "language_loss": 0.57923675, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60086584, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4902, + "time_per_iteration": 2.491337299346924 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.02416682, + "balance_loss_mlp": 1.045946, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6713771638909152, + "language_loss": 0.75298065, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77468932, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4903, + "time_per_iteration": 2.4884400367736816 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.02560806, + "balance_loss_mlp": 1.04630995, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.8012466742437707, + "language_loss": 0.6254617, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64721614, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4904, + "time_per_iteration": 2.5288941860198975 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.02252424, + "balance_loss_mlp": 1.04603219, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.5173763027357385, + "language_loss": 0.7301079, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75183994, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 4905, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02577305, + "balance_loss_mlp": 1.0456897, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6701950888056076, + "language_loss": 0.81584871, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.8375839, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4906, + "time_per_iteration": 2.473604202270508 + }, + { + "auxiliary_loss_clip": 0.01128251, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.04443395, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9494272179492087, + "language_loss": 0.87158448, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89320892, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4907, + "time_per_iteration": 2.490842819213867 + }, + { + "auxiliary_loss_clip": 0.01135464, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.02623653, + "balance_loss_mlp": 1.04758191, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.3387997458884833, + "language_loss": 0.81563503, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83741152, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4908, + "time_per_iteration": 2.4586410522460938 + }, + { + "auxiliary_loss_clip": 0.01054339, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.03970814, + "balance_loss_mlp": 1.0157342, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7811313355607663, + "language_loss": 0.57214808, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59311211, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.38671875, + "step": 4909, + "time_per_iteration": 2.9739394187927246 + }, + { + "auxiliary_loss_clip": 0.01136234, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.02340245, + "balance_loss_mlp": 1.05156505, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.44395719574742, + "language_loss": 0.86585498, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88760138, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4910, + "time_per_iteration": 2.4779117107391357 + }, + { + "auxiliary_loss_clip": 0.01126914, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02551222, + "balance_loss_mlp": 1.04549575, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8630755123750513, + "language_loss": 0.72632295, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.74799585, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 4911, + "time_per_iteration": 2.4959700107574463 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02239108, + "balance_loss_mlp": 1.04823601, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.774615067737937, + "language_loss": 0.8988539, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92053854, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4912, + "time_per_iteration": 2.4532997608184814 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.03022075, + "balance_loss_mlp": 1.04712319, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.458226475428025, + "language_loss": 0.83448595, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85626793, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 4913, + "time_per_iteration": 2.515580654144287 + }, + { + "auxiliary_loss_clip": 0.01129704, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.02385521, + "balance_loss_mlp": 1.0438993, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.6602062940724112, + "language_loss": 0.77029538, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79198408, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4914, + "time_per_iteration": 2.457158088684082 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.02302349, + "balance_loss_mlp": 1.04553497, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.9027466376674422, + "language_loss": 0.81550008, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83717597, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 4915, + "time_per_iteration": 2.6669511795043945 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.02215445, + "balance_loss_mlp": 1.0477066, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.9544787473030132, + "language_loss": 0.84415555, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8658756, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4916, + "time_per_iteration": 2.473867416381836 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.01874673, + "balance_loss_mlp": 1.04477537, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 3.5737730841451225, + "language_loss": 0.69611692, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71773368, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4917, + "time_per_iteration": 2.5078670978546143 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.02550471, + "balance_loss_mlp": 1.04932523, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1750223310256507, + "language_loss": 0.90840054, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93015605, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 4918, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.02504885, + "balance_loss_mlp": 1.04929781, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9164121886210477, + "language_loss": 0.72399461, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74574864, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4919, + "time_per_iteration": 2.5533134937286377 + }, + { + "auxiliary_loss_clip": 0.01130751, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.02661633, + "balance_loss_mlp": 1.04704165, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7148380002351797, + "language_loss": 0.75758076, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77931356, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4920, + "time_per_iteration": 2.4288933277130127 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_clip": 1.02953875, + "balance_loss_mlp": 1.05214858, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.2591712667141075, + "language_loss": 0.68327153, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.7051155, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4921, + "time_per_iteration": 2.5978074073791504 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02580023, + "balance_loss_mlp": 1.04953861, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8781945072150448, + "language_loss": 0.74265885, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76444781, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4922, + "time_per_iteration": 2.4518954753875732 + }, + { + "auxiliary_loss_clip": 0.0113841, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.01932716, + "balance_loss_mlp": 1.04900336, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 2.178664992776949, + "language_loss": 0.76679426, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78853875, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4923, + "time_per_iteration": 2.5565848350524902 + }, + { + "auxiliary_loss_clip": 0.0113218, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.02359807, + "balance_loss_mlp": 1.04730439, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.616043641477794, + "language_loss": 0.86307567, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88479245, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4924, + "time_per_iteration": 2.5081374645233154 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.04767513, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.3983202546472309, + "language_loss": 0.8180936, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.83976275, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4925, + "time_per_iteration": 2.5473146438598633 + }, + { + "auxiliary_loss_clip": 0.01132696, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02175128, + "balance_loss_mlp": 1.04893184, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5479827750219735, + "language_loss": 0.85168374, + "learning_rate": 3.301729463727452e-06, + "loss": 0.87337816, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4926, + "time_per_iteration": 2.4603803157806396 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.02391791, + "balance_loss_mlp": 1.04658842, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.1014080951069913, + "language_loss": 0.85908806, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88081133, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4927, + "time_per_iteration": 2.4724504947662354 + }, + { + "auxiliary_loss_clip": 0.01129564, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.02434492, + "balance_loss_mlp": 1.04636681, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.8730507383843338, + "language_loss": 0.80967462, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83136487, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4928, + "time_per_iteration": 5.46146297454834 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.023211, + "balance_loss_mlp": 1.04749835, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.002605920988437, + "language_loss": 0.72472513, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7465297, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.91015625, + "step": 4929, + "time_per_iteration": 2.4938502311706543 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02631676, + "balance_loss_mlp": 1.04823208, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.429634231323073, + "language_loss": 0.72424346, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74603939, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8828125, + "step": 4930, + "time_per_iteration": 2.486492156982422 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01002568, + "balance_loss_clip": 1.00047004, + "balance_loss_mlp": 1.0186131, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8134562784526058, + "language_loss": 0.60710716, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.627729, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.41015625, + "step": 4931, + "time_per_iteration": 3.002444267272949 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 0.99926931, + "balance_loss_mlp": 1.01823413, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7413672345708404, + "language_loss": 0.52383232, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54443383, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.40625, + "step": 4932, + "time_per_iteration": 2.974777936935425 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 1.04449248, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 3.155895790893495, + "language_loss": 0.81622797, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83789599, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4933, + "time_per_iteration": 2.518906593322754 + }, + { + "auxiliary_loss_clip": 0.0112788, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.01854897, + "balance_loss_mlp": 1.04651821, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.671865304120784, + "language_loss": 0.75257647, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77419287, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4934, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.02837849, + "balance_loss_mlp": 1.04699588, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.752558919138232, + "language_loss": 0.62510157, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64690268, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4935, + "time_per_iteration": 2.462982654571533 + }, + { + "auxiliary_loss_clip": 0.01129673, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04613912, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4993711353436514, + "language_loss": 0.79789758, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81961262, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 4936, + "time_per_iteration": 2.5267326831817627 + }, + { + "auxiliary_loss_clip": 0.01132719, + "auxiliary_loss_mlp": 0.01045272, + "balance_loss_clip": 1.02854681, + "balance_loss_mlp": 1.04649782, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8807271027259396, + "language_loss": 0.74074632, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76252627, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4937, + "time_per_iteration": 2.607790946960449 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.0224793, + "balance_loss_mlp": 1.04839468, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.629632810423829, + "language_loss": 0.7804476, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80216354, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 4938, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01134705, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.02543736, + "balance_loss_mlp": 1.04814208, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.041677851061636, + "language_loss": 0.77017808, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79193771, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4939, + "time_per_iteration": 2.453615427017212 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.04958081, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5588161926919628, + "language_loss": 0.78206903, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80380619, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4940, + "time_per_iteration": 2.5125393867492676 + }, + { + "auxiliary_loss_clip": 0.01133351, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.0233798, + "balance_loss_mlp": 1.04633832, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 3.9307439231373884, + "language_loss": 0.75487554, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77661633, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4941, + "time_per_iteration": 2.5308516025543213 + }, + { + "auxiliary_loss_clip": 0.0113684, + "auxiliary_loss_mlp": 0.01050296, + "balance_loss_clip": 1.03295147, + "balance_loss_mlp": 1.04803753, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.557458362521145, + "language_loss": 0.73998737, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7618587, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.890625, + "step": 4942, + "time_per_iteration": 2.6214303970336914 + }, + { + "auxiliary_loss_clip": 0.0113696, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.03017855, + "balance_loss_mlp": 1.04778039, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.997792424787015, + "language_loss": 0.70484138, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72668344, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4943, + "time_per_iteration": 2.533313751220703 + }, + { + "auxiliary_loss_clip": 0.01137748, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.02657795, + "balance_loss_mlp": 1.04838014, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 1.9523342898428475, + "language_loss": 0.80111414, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82292169, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 4944, + "time_per_iteration": 2.464364528656006 + }, + { + "auxiliary_loss_clip": 0.01129992, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.02979231, + "balance_loss_mlp": 1.04640603, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.1633352367153105, + "language_loss": 0.83451837, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85626531, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4945, + "time_per_iteration": 2.4981510639190674 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.02353168, + "balance_loss_mlp": 1.04738569, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.2158088930062747, + "language_loss": 0.66624904, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68795776, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4946, + "time_per_iteration": 2.526228666305542 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.01880383, + "balance_loss_mlp": 1.0509392, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.7941079108563611, + "language_loss": 0.73766255, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75938767, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4947, + "time_per_iteration": 2.5380265712738037 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_clip": 1.02774215, + "balance_loss_mlp": 1.04653597, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.40735653244717, + "language_loss": 0.7330308, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75483221, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4948, + "time_per_iteration": 2.5096492767333984 + }, + { + "auxiliary_loss_clip": 0.01129361, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01824629, + "balance_loss_mlp": 1.04442465, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0973131899278825, + "language_loss": 0.84031421, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86194396, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4949, + "time_per_iteration": 2.4650402069091797 + }, + { + "auxiliary_loss_clip": 0.01129505, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02019429, + "balance_loss_mlp": 1.04509461, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.77267818675948, + "language_loss": 0.71322602, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4950, + "time_per_iteration": 2.491163969039917 + }, + { + "auxiliary_loss_clip": 0.01127031, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.02792883, + "balance_loss_mlp": 1.04543924, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 1.7996518465212372, + "language_loss": 0.82192945, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84363329, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4951, + "time_per_iteration": 2.5001299381256104 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01936841, + "balance_loss_mlp": 1.04211378, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.6816702718299763, + "language_loss": 0.73421168, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.75584191, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 4952, + "time_per_iteration": 2.4888715744018555 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.04677546, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.7548041314188605, + "language_loss": 0.83702904, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85876799, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4953, + "time_per_iteration": 2.486267566680908 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02330005, + "balance_loss_mlp": 1.04566419, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.078619348093555, + "language_loss": 0.74560732, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.7673102, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4954, + "time_per_iteration": 2.454066276550293 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.0450201, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9786208165821892, + "language_loss": 0.75643009, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77808911, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4955, + "time_per_iteration": 2.487297773361206 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.01980329, + "balance_loss_mlp": 1.04604173, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 3.347495877937089, + "language_loss": 0.72235912, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74404275, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4956, + "time_per_iteration": 2.453639507293701 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.02737164, + "balance_loss_mlp": 1.04482651, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6786835957024704, + "language_loss": 0.79504669, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81683344, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4957, + "time_per_iteration": 2.4680192470550537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.04692602, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5505958112028584, + "language_loss": 0.70515305, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.7268889, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4958, + "time_per_iteration": 2.463550090789795 + }, + { + "auxiliary_loss_clip": 0.01130665, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.02685153, + "balance_loss_mlp": 1.04660892, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.6483091075690746, + "language_loss": 0.78709656, + "learning_rate": 3.291945317082743e-06, + "loss": 0.8088336, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4959, + "time_per_iteration": 2.4896273612976074 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.03010738, + "balance_loss_mlp": 1.04477429, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.8058675414038505, + "language_loss": 0.79814601, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81990159, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4960, + "time_per_iteration": 2.4524307250976562 + }, + { + "auxiliary_loss_clip": 0.01133353, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04504156, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.8105894923901418, + "language_loss": 0.73709917, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75890362, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4961, + "time_per_iteration": 2.463160991668701 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.0466218, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.946317435202559, + "language_loss": 0.62041843, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64212298, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 4962, + "time_per_iteration": 2.4734280109405518 + }, + { + "auxiliary_loss_clip": 0.0113099, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02143037, + "balance_loss_mlp": 1.04580986, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.708438122809617, + "language_loss": 0.83075964, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85244966, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 4963, + "time_per_iteration": 2.4676647186279297 + }, + { + "auxiliary_loss_clip": 0.01132139, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.02108073, + "balance_loss_mlp": 1.04811728, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.8539744131594924, + "language_loss": 0.66537225, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68706906, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 4964, + "time_per_iteration": 2.425261974334717 + }, + { + "auxiliary_loss_clip": 0.01128116, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.01994288, + "balance_loss_mlp": 1.04498291, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.6142193033036512, + "language_loss": 0.70836121, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.72998774, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83203125, + "step": 4965, + "time_per_iteration": 2.468221664428711 + }, + { + "auxiliary_loss_clip": 0.01137695, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_clip": 1.0309124, + "balance_loss_mlp": 1.05098724, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.501073720290292, + "language_loss": 0.66185117, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68369937, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4966, + "time_per_iteration": 2.479327440261841 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.02595615, + "balance_loss_mlp": 1.04869342, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.7651343279829215, + "language_loss": 0.74186444, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76362395, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4967, + "time_per_iteration": 2.4752163887023926 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02035177, + "balance_loss_mlp": 1.04422212, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9700123684688966, + "language_loss": 0.71222222, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73386747, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8515625, + "step": 4968, + "time_per_iteration": 2.448028564453125 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.01884651, + "balance_loss_mlp": 1.04596853, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.0898000655075752, + "language_loss": 0.77127141, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79292667, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 4969, + "time_per_iteration": 2.5737853050231934 + }, + { + "auxiliary_loss_clip": 0.01131698, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.022578, + "balance_loss_mlp": 1.04641569, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.5683816051841135, + "language_loss": 0.69798505, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.71967924, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4970, + "time_per_iteration": 5.428143501281738 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04582572, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.0403310419369314, + "language_loss": 0.85269564, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.8745082, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4971, + "time_per_iteration": 2.4557158946990967 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.02905178, + "balance_loss_mlp": 1.0487361, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.8300460221108372, + "language_loss": 0.79116535, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81292605, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4972, + "time_per_iteration": 2.492119550704956 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.02457476, + "balance_loss_mlp": 1.0491786, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.9080397703774756, + "language_loss": 0.85019803, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87194014, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4973, + "time_per_iteration": 2.4409923553466797 + }, + { + "auxiliary_loss_clip": 0.01128243, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.04866779, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5302170897903997, + "language_loss": 0.77397263, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79562438, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 4974, + "time_per_iteration": 2.4786176681518555 + }, + { + "auxiliary_loss_clip": 0.01135129, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02723432, + "balance_loss_mlp": 1.04905188, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 2.0911748108299015, + "language_loss": 0.72264957, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74442089, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 4975, + "time_per_iteration": 2.5267655849456787 + }, + { + "auxiliary_loss_clip": 0.01133427, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02204621, + "balance_loss_mlp": 1.0501368, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 4.957635138610608, + "language_loss": 0.76028466, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78199953, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 4976, + "time_per_iteration": 2.46476149559021 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01039669, + "balance_loss_clip": 1.02464914, + "balance_loss_mlp": 1.04786563, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.141179611311424, + "language_loss": 0.86060619, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88231456, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 4977, + "time_per_iteration": 2.4342682361602783 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.02605033, + "balance_loss_mlp": 1.0510987, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6147948075287948, + "language_loss": 0.68286109, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7046386, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4978, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.01138133, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.053123, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.9781984123500023, + "language_loss": 0.7654568, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78720796, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4979, + "time_per_iteration": 2.4865188598632812 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.02265859, + "balance_loss_mlp": 1.04520524, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7578947600277828, + "language_loss": 0.68300819, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70469534, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4980, + "time_per_iteration": 2.6137757301330566 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02396405, + "balance_loss_mlp": 1.05068171, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.9242198828448243, + "language_loss": 0.73239923, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75411171, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 4981, + "time_per_iteration": 2.5342931747436523 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.02227712, + "balance_loss_mlp": 1.04691803, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.470312251429405, + "language_loss": 0.86429024, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8860175, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4982, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.01138194, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04975057, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.2481661066872904, + "language_loss": 0.86378068, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88557541, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4983, + "time_per_iteration": 2.4477322101593018 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.02577138, + "balance_loss_mlp": 1.0483377, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.8474343514891325, + "language_loss": 0.78286207, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80460417, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4984, + "time_per_iteration": 2.490079402923584 + }, + { + "auxiliary_loss_clip": 0.01136807, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.05052662, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.555514289558953, + "language_loss": 0.78418988, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80601943, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4985, + "time_per_iteration": 2.5188379287719727 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.02115583, + "balance_loss_mlp": 1.05010915, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 3.8074401298215905, + "language_loss": 0.72157449, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74333715, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4986, + "time_per_iteration": 2.7730660438537598 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.02523577, + "balance_loss_mlp": 1.04813981, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.7357810931981628, + "language_loss": 0.73332191, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75509989, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4987, + "time_per_iteration": 2.4857406616210938 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02162147, + "balance_loss_mlp": 1.04787469, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 2.6184059112472817, + "language_loss": 0.80173379, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82341629, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4988, + "time_per_iteration": 2.477614641189575 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.02874756, + "balance_loss_mlp": 1.04897678, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.615528223125509, + "language_loss": 0.70302641, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72481132, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4989, + "time_per_iteration": 2.4942874908447266 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02382946, + "balance_loss_mlp": 1.05045295, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 2.0547136882256654, + "language_loss": 0.85636222, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87814367, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87109375, + "step": 4990, + "time_per_iteration": 2.455134391784668 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.04822564, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.6641511475566748, + "language_loss": 0.67125142, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69296378, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4991, + "time_per_iteration": 2.4928019046783447 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.02165437, + "balance_loss_mlp": 1.0479908, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.5868946812173, + "language_loss": 0.78707612, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80883896, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4992, + "time_per_iteration": 2.5030534267425537 + }, + { + "auxiliary_loss_clip": 0.01135049, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04976213, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.8035914694742925, + "language_loss": 0.824085, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84579718, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4993, + "time_per_iteration": 2.475588083267212 + }, + { + "auxiliary_loss_clip": 0.01138101, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.02522802, + "balance_loss_mlp": 1.04808736, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.0505124462232898, + "language_loss": 0.85850489, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88031358, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4994, + "time_per_iteration": 2.47881817817688 + }, + { + "auxiliary_loss_clip": 0.0113641, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.05017769, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5183999234373478, + "language_loss": 0.8111707, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83289921, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4995, + "time_per_iteration": 2.5481183528900146 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.05089867, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7074459415862762, + "language_loss": 0.67098773, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69274354, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 4996, + "time_per_iteration": 2.6810193061828613 + }, + { + "auxiliary_loss_clip": 0.01134671, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.02392912, + "balance_loss_mlp": 1.04883564, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7509046873587113, + "language_loss": 0.75304276, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77479029, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4997, + "time_per_iteration": 2.472226858139038 + }, + { + "auxiliary_loss_clip": 0.01132042, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.02372646, + "balance_loss_mlp": 1.04816282, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.9401125864941864, + "language_loss": 0.77664721, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79835731, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83984375, + "step": 4998, + "time_per_iteration": 2.495087146759033 + }, + { + "auxiliary_loss_clip": 0.01129805, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02469468, + "balance_loss_mlp": 1.04812598, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5996751316274151, + "language_loss": 0.73429006, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75598228, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 4999, + "time_per_iteration": 2.491774082183838 + }, + { + "auxiliary_loss_clip": 0.01134839, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.02250576, + "balance_loss_mlp": 1.0498935, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6017930279588588, + "language_loss": 0.756015, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77774298, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5000, + "time_per_iteration": 2.572003126144409 + }, + { + "auxiliary_loss_clip": 0.01131295, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.02329731, + "balance_loss_mlp": 1.04886353, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.977226227337592, + "language_loss": 0.81681275, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83849311, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.82421875, + "step": 5001, + "time_per_iteration": 2.4240355491638184 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02538288, + "balance_loss_mlp": 1.05103087, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5846802536013025, + "language_loss": 0.8056432, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82741892, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 5002, + "time_per_iteration": 2.5848264694213867 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.02240372, + "balance_loss_mlp": 1.04907179, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.6918091030667293, + "language_loss": 0.71209854, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73384899, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 5003, + "time_per_iteration": 2.4672186374664307 + }, + { + "auxiliary_loss_clip": 0.01136595, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.05050564, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.8725932973877313, + "language_loss": 0.70613277, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72788501, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5004, + "time_per_iteration": 2.579941511154175 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.04977477, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.8634075898885767, + "language_loss": 0.81359464, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83539397, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5005, + "time_per_iteration": 2.4043233394622803 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04792035, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.9232502202927266, + "language_loss": 0.74906754, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77080745, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5006, + "time_per_iteration": 2.5169718265533447 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02712977, + "balance_loss_mlp": 1.04745531, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.157802275476472, + "language_loss": 0.70810544, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72982514, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5007, + "time_per_iteration": 2.500135898590088 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.02453065, + "balance_loss_mlp": 1.04947257, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.301214894203853, + "language_loss": 0.76435697, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78609765, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5008, + "time_per_iteration": 2.5071120262145996 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.02464485, + "balance_loss_mlp": 1.04823518, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.7973688674758703, + "language_loss": 0.84830707, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87002409, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5009, + "time_per_iteration": 2.531024694442749 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.0211432, + "balance_loss_mlp": 1.04830122, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9976209282841157, + "language_loss": 0.83813334, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85986781, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 5010, + "time_per_iteration": 2.4690375328063965 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02003646, + "balance_loss_mlp": 1.04724431, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 1.9597018241269177, + "language_loss": 0.85013181, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87178147, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5011, + "time_per_iteration": 2.501708745956421 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.02335644, + "balance_loss_mlp": 1.04754543, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.0524404295798013, + "language_loss": 0.71966654, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74139971, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5012, + "time_per_iteration": 3.979128360748291 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.0229032, + "balance_loss_mlp": 1.04721081, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 1.9997819947408795, + "language_loss": 0.87396109, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89565563, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 5013, + "time_per_iteration": 2.467177629470825 + }, + { + "auxiliary_loss_clip": 0.01136565, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.0200367, + "balance_loss_mlp": 1.04842985, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 3.4702040063697313, + "language_loss": 0.83367115, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85540557, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 5014, + "time_per_iteration": 2.4654901027679443 + }, + { + "auxiliary_loss_clip": 0.01128425, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.01896727, + "balance_loss_mlp": 1.0471499, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6346146355602116, + "language_loss": 0.68218327, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70380276, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5015, + "time_per_iteration": 2.4994328022003174 + }, + { + "auxiliary_loss_clip": 0.01132371, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.04864407, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.7110353723362635, + "language_loss": 0.74712509, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76883423, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5016, + "time_per_iteration": 2.5168755054473877 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 1.0498333, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.6963436015958502, + "language_loss": 0.65179884, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67351693, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5017, + "time_per_iteration": 2.543577194213867 + }, + { + "auxiliary_loss_clip": 0.01134511, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.02860379, + "balance_loss_mlp": 1.05030179, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.078433105892768, + "language_loss": 0.69045079, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71223348, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5018, + "time_per_iteration": 2.498060464859009 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.02772546, + "balance_loss_mlp": 1.04842138, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9198297669603306, + "language_loss": 0.78841144, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81011814, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5019, + "time_per_iteration": 2.4873573780059814 + }, + { + "auxiliary_loss_clip": 0.01134625, + "auxiliary_loss_mlp": 0.01041878, + "balance_loss_clip": 1.02695298, + "balance_loss_mlp": 1.05073094, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.24109756344656, + "language_loss": 0.69867152, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72043651, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5020, + "time_per_iteration": 2.493370532989502 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.04941368, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9013759847828555, + "language_loss": 0.78134364, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80310869, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8671875, + "step": 5021, + "time_per_iteration": 2.4670474529266357 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.01527357, + "balance_loss_mlp": 1.04964936, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.3821225807179696, + "language_loss": 0.76075405, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78238434, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5022, + "time_per_iteration": 2.4737884998321533 + }, + { + "auxiliary_loss_clip": 0.01133657, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.02791631, + "balance_loss_mlp": 1.04880631, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.7684005868111572, + "language_loss": 0.69896525, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72073108, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5023, + "time_per_iteration": 2.4453155994415283 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02673686, + "balance_loss_mlp": 1.04927671, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.0912728997662127, + "language_loss": 0.71588898, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73766768, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5024, + "time_per_iteration": 2.4998810291290283 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.04858792, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.6483742353836974, + "language_loss": 0.73955721, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76133543, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5025, + "time_per_iteration": 2.5167019367218018 + }, + { + "auxiliary_loss_clip": 0.0113244, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.02800322, + "balance_loss_mlp": 1.0518502, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.4799709397217862, + "language_loss": 0.67022824, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.6919747, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5026, + "time_per_iteration": 2.5326507091522217 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.02799106, + "balance_loss_mlp": 1.05083036, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.6876842646939136, + "language_loss": 0.85252607, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87429863, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5027, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01129327, + "auxiliary_loss_mlp": 0.01045705, + "balance_loss_clip": 1.03187656, + "balance_loss_mlp": 1.04739702, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.665552114762065, + "language_loss": 0.78757018, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80932051, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5028, + "time_per_iteration": 2.5677576065063477 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.0295043, + "balance_loss_mlp": 1.04922223, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 2.0260385179345346, + "language_loss": 0.76721144, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78898472, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.83203125, + "step": 5029, + "time_per_iteration": 2.611917734146118 + }, + { + "auxiliary_loss_clip": 0.01133028, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.02642775, + "balance_loss_mlp": 1.04855132, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.944959289407135, + "language_loss": 0.81868339, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84044701, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.84375, + "step": 5030, + "time_per_iteration": 2.605531930923462 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01045193, + "balance_loss_clip": 1.02982664, + "balance_loss_mlp": 1.04754734, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.748277903644489, + "language_loss": 0.69869608, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72047728, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 5031, + "time_per_iteration": 2.496833086013794 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.02103615, + "balance_loss_mlp": 1.04892659, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.8467264077922103, + "language_loss": 0.82302773, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84471118, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5032, + "time_per_iteration": 2.5062966346740723 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.02991903, + "balance_loss_mlp": 1.05332685, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.10117653020426, + "language_loss": 0.73383862, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75570583, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5033, + "time_per_iteration": 2.561467170715332 + }, + { + "auxiliary_loss_clip": 0.01130223, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03000593, + "balance_loss_mlp": 1.04782772, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.437497934350084, + "language_loss": 0.74057245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76232684, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5034, + "time_per_iteration": 2.511861801147461 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.04825819, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9672144407329994, + "language_loss": 0.71617639, + "learning_rate": 3.269209883493352e-06, + "loss": 0.73783064, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5035, + "time_per_iteration": 2.545917272567749 + }, + { + "auxiliary_loss_clip": 0.0113067, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01835537, + "balance_loss_mlp": 1.04876685, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.774174351542542, + "language_loss": 0.87232339, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89395267, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5036, + "time_per_iteration": 2.5197184085845947 + }, + { + "auxiliary_loss_clip": 0.01131426, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.02679288, + "balance_loss_mlp": 1.04866219, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.2121077897300134, + "language_loss": 0.77760899, + "learning_rate": 3.268607806688536e-06, + "loss": 0.7993536, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5037, + "time_per_iteration": 2.5372917652130127 + }, + { + "auxiliary_loss_clip": 0.01133533, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02603984, + "balance_loss_mlp": 1.04973745, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.4260021818478634, + "language_loss": 0.77920854, + "learning_rate": 3.268306696121816e-06, + "loss": 0.80095863, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5038, + "time_per_iteration": 2.4360761642456055 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.02073669, + "balance_loss_mlp": 1.04859674, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.8428508909689656, + "language_loss": 0.74134624, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76298141, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5039, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01129815, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.02426052, + "balance_loss_mlp": 1.05003977, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.8268154911840482, + "language_loss": 0.80263746, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82431436, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5040, + "time_per_iteration": 2.469822406768799 + }, + { + "auxiliary_loss_clip": 0.01131744, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.02227795, + "balance_loss_mlp": 1.05101466, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.5747579863116856, + "language_loss": 0.81914759, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8408277, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5041, + "time_per_iteration": 2.5240108966827393 + }, + { + "auxiliary_loss_clip": 0.01062494, + "auxiliary_loss_mlp": 0.01003022, + "balance_loss_clip": 1.00106716, + "balance_loss_mlp": 1.02890241, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7678965945904674, + "language_loss": 0.59521127, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61586642, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3359375, + "step": 5042, + "time_per_iteration": 3.169004440307617 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02271986, + "balance_loss_mlp": 1.05006266, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.6113397759888244, + "language_loss": 0.71136838, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73308468, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5043, + "time_per_iteration": 2.5217440128326416 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.04824769, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.6644669890018773, + "language_loss": 0.69351244, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71521056, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5044, + "time_per_iteration": 2.4741897583007812 + }, + { + "auxiliary_loss_clip": 0.01129908, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.02252388, + "balance_loss_mlp": 1.04823565, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3748845619029404, + "language_loss": 0.77210236, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79377484, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5045, + "time_per_iteration": 2.5023043155670166 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.02240646, + "balance_loss_mlp": 1.04892182, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.538768377317596, + "language_loss": 0.72444695, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74615347, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5046, + "time_per_iteration": 2.5163753032684326 + }, + { + "auxiliary_loss_clip": 0.01134062, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.04859519, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 3.2419373644374176, + "language_loss": 0.80737638, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82914352, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5047, + "time_per_iteration": 2.547245979309082 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02569556, + "balance_loss_mlp": 1.04871237, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.9357354539113198, + "language_loss": 0.72334075, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74505508, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5048, + "time_per_iteration": 2.494016170501709 + }, + { + "auxiliary_loss_clip": 0.01129755, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.02093613, + "balance_loss_mlp": 1.04574537, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.7731178616486785, + "language_loss": 0.75098324, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.7726388, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5049, + "time_per_iteration": 2.502979040145874 + }, + { + "auxiliary_loss_clip": 0.01133123, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.0226109, + "balance_loss_mlp": 1.04864645, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6762363098185904, + "language_loss": 0.8194561, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84116459, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5050, + "time_per_iteration": 2.5254666805267334 + }, + { + "auxiliary_loss_clip": 0.01132852, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.02299261, + "balance_loss_mlp": 1.04868484, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.8996577335854625, + "language_loss": 0.73712784, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7588439, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 5051, + "time_per_iteration": 2.511455774307251 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.01967764, + "balance_loss_mlp": 1.04650712, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.5939626777548828, + "language_loss": 0.76463652, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78628969, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5052, + "time_per_iteration": 2.478046417236328 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.04609728, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.8043694132732864, + "language_loss": 0.82780337, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84952009, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5053, + "time_per_iteration": 3.983353614807129 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.02661896, + "balance_loss_mlp": 1.04685903, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5364375285570075, + "language_loss": 0.70702368, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.72875059, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5054, + "time_per_iteration": 2.4379446506500244 + }, + { + "auxiliary_loss_clip": 0.01132155, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01768088, + "balance_loss_mlp": 1.04817367, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.8280069054430388, + "language_loss": 0.69543922, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71709108, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5055, + "time_per_iteration": 2.5247206687927246 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04682207, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.038005952710024, + "language_loss": 0.67502165, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69670427, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5056, + "time_per_iteration": 2.4767425060272217 + }, + { + "auxiliary_loss_clip": 0.01130078, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02434373, + "balance_loss_mlp": 1.04886115, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.5579435169669187, + "language_loss": 0.82500231, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84669387, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5057, + "time_per_iteration": 2.499105453491211 + }, + { + "auxiliary_loss_clip": 0.01129487, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02171588, + "balance_loss_mlp": 1.04686213, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 3.274565054245196, + "language_loss": 0.89040101, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91205966, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5058, + "time_per_iteration": 2.4966368675231934 + }, + { + "auxiliary_loss_clip": 0.01131903, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.04829955, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.2189779437975274, + "language_loss": 0.71709251, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73883629, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5059, + "time_per_iteration": 2.5429141521453857 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.0233928, + "balance_loss_mlp": 1.04720807, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 10.158939103063299, + "language_loss": 0.73069966, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75238669, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5060, + "time_per_iteration": 2.529862403869629 + }, + { + "auxiliary_loss_clip": 0.01132671, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.01979291, + "balance_loss_mlp": 1.04885316, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.8510962431794071, + "language_loss": 0.76926744, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79093957, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5061, + "time_per_iteration": 2.496739149093628 + }, + { + "auxiliary_loss_clip": 0.01138048, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.02312136, + "balance_loss_mlp": 1.0527482, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.264413063412747, + "language_loss": 0.82064837, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84242392, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5062, + "time_per_iteration": 2.476290702819824 + }, + { + "auxiliary_loss_clip": 0.01128914, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.04721808, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.7072945635391377, + "language_loss": 0.74737656, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76899219, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5063, + "time_per_iteration": 2.5384082794189453 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.0214901, + "balance_loss_mlp": 1.04908288, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.8176932093217915, + "language_loss": 0.84120226, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86290407, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83203125, + "step": 5064, + "time_per_iteration": 2.5108115673065186 + }, + { + "auxiliary_loss_clip": 0.01131651, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.01888871, + "balance_loss_mlp": 1.04751444, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.7759562417820063, + "language_loss": 0.75990027, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78156507, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 5065, + "time_per_iteration": 2.5061376094818115 + }, + { + "auxiliary_loss_clip": 0.01133071, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02470005, + "balance_loss_mlp": 1.04716659, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 2.0133457948817406, + "language_loss": 0.62271762, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64445394, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5066, + "time_per_iteration": 2.6000661849975586 + }, + { + "auxiliary_loss_clip": 0.01140413, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_clip": 1.03385913, + "balance_loss_mlp": 1.05344141, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.7828452375691122, + "language_loss": 0.82887459, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85077155, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5067, + "time_per_iteration": 2.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02829516, + "balance_loss_mlp": 1.04839194, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.0779895110277535, + "language_loss": 0.62978256, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65152222, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5068, + "time_per_iteration": 2.4957847595214844 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01633286, + "balance_loss_mlp": 1.04544926, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.6700683770947133, + "language_loss": 0.75058538, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77217996, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5069, + "time_per_iteration": 2.487473964691162 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01043362, + "balance_loss_clip": 1.02798414, + "balance_loss_mlp": 1.04746199, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.839652658151057, + "language_loss": 0.75732648, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7790432, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5070, + "time_per_iteration": 2.500335216522217 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.04640067, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.7318177446844936, + "language_loss": 0.81738281, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83910567, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 5071, + "time_per_iteration": 2.5726318359375 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.02507651, + "balance_loss_mlp": 1.04737437, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.5942809817556516, + "language_loss": 0.76252651, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78428996, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5072, + "time_per_iteration": 2.5147287845611572 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.03059769, + "balance_loss_mlp": 1.04904687, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.176920469303851, + "language_loss": 0.71318722, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73496878, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83203125, + "step": 5073, + "time_per_iteration": 2.4736156463623047 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_clip": 1.02974713, + "balance_loss_mlp": 1.04842663, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.146618897096623, + "language_loss": 0.7663309, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78814638, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5074, + "time_per_iteration": 2.4547433853149414 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.02015638, + "balance_loss_mlp": 1.04879379, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8636036931869358, + "language_loss": 0.73939347, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76106244, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5075, + "time_per_iteration": 2.4922661781311035 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.04769778, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.14961805392919, + "language_loss": 0.75488788, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77666509, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5076, + "time_per_iteration": 2.471381187438965 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02407038, + "balance_loss_mlp": 1.05137944, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.505999917432091, + "language_loss": 0.79183954, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81357688, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5077, + "time_per_iteration": 2.5000534057617188 + }, + { + "auxiliary_loss_clip": 0.01127394, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01739252, + "balance_loss_mlp": 1.0478642, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.9961733055656423, + "language_loss": 0.74662113, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76820433, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5078, + "time_per_iteration": 2.4746944904327393 + }, + { + "auxiliary_loss_clip": 0.01130678, + "auxiliary_loss_mlp": 0.01047379, + "balance_loss_clip": 1.03203678, + "balance_loss_mlp": 1.04787958, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.113994612729099, + "language_loss": 0.67216343, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69394398, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5079, + "time_per_iteration": 2.4575493335723877 + }, + { + "auxiliary_loss_clip": 0.01130366, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.02034521, + "balance_loss_mlp": 1.04758203, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.7438542216491464, + "language_loss": 0.80291754, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82457113, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5080, + "time_per_iteration": 2.490842342376709 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.02475858, + "balance_loss_mlp": 1.04612935, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.2926909410882903, + "language_loss": 0.80971938, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83141345, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5081, + "time_per_iteration": 2.5298712253570557 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.01952672, + "balance_loss_mlp": 1.04690182, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.691443128795128, + "language_loss": 0.71810889, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73975313, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5082, + "time_per_iteration": 2.5567750930786133 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.0468955, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9955003311475592, + "language_loss": 0.73615241, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75787055, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 5083, + "time_per_iteration": 2.5083980560302734 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.02184248, + "balance_loss_mlp": 1.04441404, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 3.7957379738132517, + "language_loss": 0.70895267, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73062611, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 5084, + "time_per_iteration": 2.477665424346924 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.01631355, + "balance_loss_mlp": 1.04818797, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.0055460894973933, + "language_loss": 0.78791595, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80958885, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5085, + "time_per_iteration": 2.475783586502075 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.02187788, + "balance_loss_mlp": 1.04529142, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.5510153728860234, + "language_loss": 0.77846372, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80010617, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5086, + "time_per_iteration": 2.514472007751465 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.04930758, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.7256556540888637, + "language_loss": 0.77121228, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79295856, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 5087, + "time_per_iteration": 2.4817616939544678 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.02346563, + "balance_loss_mlp": 1.04716742, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 2.0600622883478517, + "language_loss": 0.72582048, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74754953, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 5088, + "time_per_iteration": 2.538318395614624 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02300477, + "balance_loss_mlp": 1.04673004, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.417480227404851, + "language_loss": 0.7889666, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81070858, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 5089, + "time_per_iteration": 2.4561989307403564 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.02328289, + "balance_loss_mlp": 1.04813027, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 2.044405318996134, + "language_loss": 0.77061844, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79231811, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5090, + "time_per_iteration": 2.5215258598327637 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.02999353, + "balance_loss_mlp": 1.04693675, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.7474050348479595, + "language_loss": 0.76481628, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78662336, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5091, + "time_per_iteration": 2.535468578338623 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.01955616, + "balance_loss_mlp": 1.04671383, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.638842582319787, + "language_loss": 0.71933579, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.7410261, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 5092, + "time_per_iteration": 2.512096405029297 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.04765177, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.9362192703697652, + "language_loss": 0.8216877, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84335721, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5093, + "time_per_iteration": 2.464477300643921 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02239954, + "balance_loss_mlp": 1.04639721, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6957020618246583, + "language_loss": 0.75365555, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77531368, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5094, + "time_per_iteration": 2.5149855613708496 + }, + { + "auxiliary_loss_clip": 0.01128293, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02464378, + "balance_loss_mlp": 1.04530072, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.922814039194465, + "language_loss": 0.76033115, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78201067, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5095, + "time_per_iteration": 5.438723802566528 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02325845, + "balance_loss_mlp": 1.04581833, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7577098515851188, + "language_loss": 0.8050971, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82675582, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.81640625, + "step": 5096, + "time_per_iteration": 2.4706614017486572 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.02052069, + "balance_loss_mlp": 1.04556763, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.0672553061960586, + "language_loss": 0.8209089, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84256178, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5097, + "time_per_iteration": 2.457242250442505 + }, + { + "auxiliary_loss_clip": 0.0112984, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02227616, + "balance_loss_mlp": 1.04537082, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.9081721986815667, + "language_loss": 0.77858478, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80027401, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5098, + "time_per_iteration": 2.4709839820861816 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02057385, + "balance_loss_mlp": 1.0466584, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.9501450681008343, + "language_loss": 0.83948421, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86113107, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5099, + "time_per_iteration": 2.537771224975586 + }, + { + "auxiliary_loss_clip": 0.01130145, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 1.04364753, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 2.2273819247618376, + "language_loss": 0.85744429, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87916839, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5100, + "time_per_iteration": 2.5103259086608887 + }, + { + "auxiliary_loss_clip": 0.01129277, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.02259541, + "balance_loss_mlp": 1.04542243, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8863659276771934, + "language_loss": 0.79225194, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81392968, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5101, + "time_per_iteration": 2.4733920097351074 + }, + { + "auxiliary_loss_clip": 0.01131914, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.02869534, + "balance_loss_mlp": 1.04708326, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7393564952665503, + "language_loss": 0.79405224, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.81583011, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5102, + "time_per_iteration": 2.4608778953552246 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.02135825, + "balance_loss_mlp": 1.04940438, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7201607461659805, + "language_loss": 0.88999605, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.9117263, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.84765625, + "step": 5103, + "time_per_iteration": 2.5295228958129883 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.02549076, + "balance_loss_mlp": 1.04700959, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.6453097169103326, + "language_loss": 0.74079049, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76251674, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5104, + "time_per_iteration": 2.4923107624053955 + }, + { + "auxiliary_loss_clip": 0.01132054, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.02690291, + "balance_loss_mlp": 1.04555643, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8308515164246026, + "language_loss": 0.73333633, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75508481, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 5105, + "time_per_iteration": 2.542391777038574 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_clip": 1.03058875, + "balance_loss_mlp": 1.04582942, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 5.5167708582846515, + "language_loss": 0.8714695, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89325809, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 5106, + "time_per_iteration": 2.5054032802581787 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04750919, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.142568748510771, + "language_loss": 0.71183497, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73367596, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 5107, + "time_per_iteration": 2.4980053901672363 + }, + { + "auxiliary_loss_clip": 0.01125715, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.03372955, + "balance_loss_mlp": 1.04304433, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 1.7923615416213727, + "language_loss": 0.72302651, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74478543, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 5108, + "time_per_iteration": 2.4588091373443604 + }, + { + "auxiliary_loss_clip": 0.01129796, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.02435362, + "balance_loss_mlp": 1.04538584, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5361542639570684, + "language_loss": 0.85768104, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87937832, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5109, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.02176476, + "balance_loss_mlp": 1.04534364, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.6710196569280569, + "language_loss": 0.67220587, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69386709, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5110, + "time_per_iteration": 2.5019631385803223 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.0220511, + "balance_loss_mlp": 1.04472136, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5071731281437177, + "language_loss": 0.76981276, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79144323, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5111, + "time_per_iteration": 2.544111490249634 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.02486551, + "balance_loss_mlp": 1.04580235, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.9077726149637915, + "language_loss": 0.67174292, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69344485, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5112, + "time_per_iteration": 2.5171637535095215 + }, + { + "auxiliary_loss_clip": 0.01136791, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.04846382, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.8925702151041777, + "language_loss": 0.798181, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81996036, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 5113, + "time_per_iteration": 2.55889892578125 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.04549623, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.951625458848465, + "language_loss": 0.77243912, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79416221, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5114, + "time_per_iteration": 2.4328107833862305 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.02079093, + "balance_loss_mlp": 1.04755759, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.8985095809631356, + "language_loss": 0.62356925, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64527011, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5115, + "time_per_iteration": 2.480536699295044 + }, + { + "auxiliary_loss_clip": 0.01132859, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.02033865, + "balance_loss_mlp": 1.04663444, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 3.0190652682973176, + "language_loss": 0.82743216, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84912288, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5116, + "time_per_iteration": 2.5121662616729736 + }, + { + "auxiliary_loss_clip": 0.01131907, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.02322841, + "balance_loss_mlp": 1.04825926, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.8681947014951163, + "language_loss": 0.75772393, + "learning_rate": 3.244367924446952e-06, + "loss": 0.77942991, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5117, + "time_per_iteration": 2.48750376701355 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.05018401, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.225887232792708, + "language_loss": 0.71873093, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74044484, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5118, + "time_per_iteration": 2.4745492935180664 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.02442479, + "balance_loss_mlp": 1.04630661, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.5789952404099556, + "language_loss": 0.74312431, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76483381, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5119, + "time_per_iteration": 2.5185489654541016 + }, + { + "auxiliary_loss_clip": 0.01136122, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.04891181, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.733023320063412, + "language_loss": 0.80267692, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82455289, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 5120, + "time_per_iteration": 2.5592849254608154 + }, + { + "auxiliary_loss_clip": 0.01127219, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.03299093, + "balance_loss_mlp": 1.04384947, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.564134517039273, + "language_loss": 0.80110037, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82285464, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5121, + "time_per_iteration": 2.440516948699951 + }, + { + "auxiliary_loss_clip": 0.0113076, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01981497, + "balance_loss_mlp": 1.0480212, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5001896125792977, + "language_loss": 0.82594395, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84760171, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5122, + "time_per_iteration": 2.510576009750366 + }, + { + "auxiliary_loss_clip": 0.01050329, + "auxiliary_loss_mlp": 0.01017411, + "balance_loss_clip": 1.01562333, + "balance_loss_mlp": 1.01982307, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7473381596642288, + "language_loss": 0.58639288, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60707027, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.3046875, + "step": 5123, + "time_per_iteration": 3.2167654037475586 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.01917315, + "balance_loss_mlp": 1.04640436, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5767520801619384, + "language_loss": 0.83622873, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85793942, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.87890625, + "step": 5124, + "time_per_iteration": 2.474625587463379 + }, + { + "auxiliary_loss_clip": 0.01135515, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.04945302, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8429802725909379, + "language_loss": 0.78703862, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.80879092, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.859375, + "step": 5125, + "time_per_iteration": 2.5806493759155273 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.02050948, + "balance_loss_mlp": 1.04717779, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8928574451074776, + "language_loss": 0.6450479, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66676342, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5126, + "time_per_iteration": 2.467099666595459 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.02367234, + "balance_loss_mlp": 1.04831636, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5538294270453243, + "language_loss": 0.86619091, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88788408, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.81640625, + "step": 5127, + "time_per_iteration": 2.543095111846924 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02439952, + "balance_loss_mlp": 1.04648781, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 2.186420023793508, + "language_loss": 0.68816996, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70987189, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 5128, + "time_per_iteration": 2.525390863418579 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.02571476, + "balance_loss_mlp": 1.04763198, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.801256837086347, + "language_loss": 0.71226776, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7340306, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5129, + "time_per_iteration": 2.5417068004608154 + }, + { + "auxiliary_loss_clip": 0.01045915, + "auxiliary_loss_mlp": 0.01008464, + "balance_loss_clip": 1.00633001, + "balance_loss_mlp": 1.01580441, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.9000157132793972, + "language_loss": 0.59171313, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61225688, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.30078125, + "step": 5130, + "time_per_iteration": 3.024799108505249 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02233696, + "balance_loss_mlp": 1.0485276, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.1422150520884773, + "language_loss": 0.72951442, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75124997, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 5131, + "time_per_iteration": 2.5145480632781982 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.02222049, + "balance_loss_mlp": 1.04737425, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.759562546324366, + "language_loss": 0.71208251, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73375452, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5132, + "time_per_iteration": 2.4997506141662598 + }, + { + "auxiliary_loss_clip": 0.01128489, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.02580929, + "balance_loss_mlp": 1.04823279, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7072095629792627, + "language_loss": 0.8999784, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92166698, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5133, + "time_per_iteration": 2.4972143173217773 + }, + { + "auxiliary_loss_clip": 0.01136466, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.03186607, + "balance_loss_mlp": 1.04911399, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.8506383958840185, + "language_loss": 0.67226613, + "learning_rate": 3.239177844626102e-06, + "loss": 0.6941101, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5134, + "time_per_iteration": 2.5700669288635254 + }, + { + "auxiliary_loss_clip": 0.0113384, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.0317775, + "balance_loss_mlp": 1.04718161, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.423009332179396, + "language_loss": 0.82865155, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85046244, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5135, + "time_per_iteration": 2.4712367057800293 + }, + { + "auxiliary_loss_clip": 0.0104583, + "auxiliary_loss_mlp": 0.01004049, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.015975, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7120747448350507, + "language_loss": 0.55243868, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57293749, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.29882812, + "step": 5136, + "time_per_iteration": 3.1432137489318848 + }, + { + "auxiliary_loss_clip": 0.01132561, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04724097, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.9824711220984585, + "language_loss": 0.76057774, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78230941, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5137, + "time_per_iteration": 5.764686822891235 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.02310133, + "balance_loss_mlp": 1.04696631, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 2.0179579208290264, + "language_loss": 0.79909992, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8207891, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.84765625, + "step": 5138, + "time_per_iteration": 2.45621657371521 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.02666378, + "balance_loss_mlp": 1.04560494, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4272945699581137, + "language_loss": 0.81220984, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83396351, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 5139, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.01137198, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.1565991279061736, + "language_loss": 0.77528149, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79707557, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 5140, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01044008, + "balance_loss_clip": 1.02920234, + "balance_loss_mlp": 1.04757929, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.2023621297160156, + "language_loss": 0.78595555, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80768663, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5141, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01047349, + "balance_loss_clip": 1.03046894, + "balance_loss_mlp": 1.04716825, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.127714885761315, + "language_loss": 0.87142885, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89324611, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 5142, + "time_per_iteration": 2.4362974166870117 + }, + { + "auxiliary_loss_clip": 0.01131531, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.03220749, + "balance_loss_mlp": 1.04556274, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7972015737501748, + "language_loss": 0.7877624, + "learning_rate": 3.23642465389567e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 5143, + "time_per_iteration": 2.459317445755005 + }, + { + "auxiliary_loss_clip": 0.01130331, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.02742219, + "balance_loss_mlp": 1.04593444, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.9461458902951219, + "language_loss": 0.72098875, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74272639, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5144, + "time_per_iteration": 2.4872243404388428 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.02418947, + "balance_loss_mlp": 1.04587483, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.7305751805857612, + "language_loss": 0.74054307, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76227629, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5145, + "time_per_iteration": 2.524683952331543 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.02951622, + "balance_loss_mlp": 1.04737079, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6607552662218326, + "language_loss": 0.76461762, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78640091, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 5146, + "time_per_iteration": 2.4848198890686035 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.02213407, + "balance_loss_mlp": 1.04672074, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.385312171088194, + "language_loss": 0.66755533, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68922937, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5147, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.01135751, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02517319, + "balance_loss_mlp": 1.04931486, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0402709532397205, + "language_loss": 0.75148058, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77323824, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5148, + "time_per_iteration": 2.505180597305298 + }, + { + "auxiliary_loss_clip": 0.01139245, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02815318, + "balance_loss_mlp": 1.04876494, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.1288750992632677, + "language_loss": 0.72576058, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74759942, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 5149, + "time_per_iteration": 2.4605252742767334 + }, + { + "auxiliary_loss_clip": 0.01133233, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.02442312, + "balance_loss_mlp": 1.0457058, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.112154456836484, + "language_loss": 0.84981489, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87155974, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.875, + "step": 5150, + "time_per_iteration": 2.4866578578948975 + }, + { + "auxiliary_loss_clip": 0.01131574, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.02515531, + "balance_loss_mlp": 1.04593086, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.9529089609254688, + "language_loss": 0.79053164, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81226349, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5151, + "time_per_iteration": 2.4936540126800537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.02580595, + "balance_loss_mlp": 1.0471015, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 3.1311630498810774, + "language_loss": 0.67020154, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69196552, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5152, + "time_per_iteration": 2.429640054702759 + }, + { + "auxiliary_loss_clip": 0.01132623, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.0275166, + "balance_loss_mlp": 1.04688787, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 12.57465651148819, + "language_loss": 0.82058132, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84234464, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5153, + "time_per_iteration": 2.578856945037842 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02788973, + "balance_loss_mlp": 1.04822588, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.7956706783057126, + "language_loss": 0.73902357, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76079118, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5154, + "time_per_iteration": 2.5063655376434326 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02287924, + "balance_loss_mlp": 1.04747653, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.516871287947693, + "language_loss": 0.76051688, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78224009, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5155, + "time_per_iteration": 2.4838123321533203 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.04871869, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7492301646526522, + "language_loss": 0.7883296, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81011862, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 5156, + "time_per_iteration": 2.4420597553253174 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02894902, + "balance_loss_mlp": 1.04688191, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 3.007667649484548, + "language_loss": 0.75094402, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5157, + "time_per_iteration": 2.4922094345092773 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.04701662, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.583276716554569, + "language_loss": 0.69391131, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71560085, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5158, + "time_per_iteration": 2.5119874477386475 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.0260725, + "balance_loss_mlp": 1.04802489, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.8674515495135584, + "language_loss": 0.84731698, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86904848, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5159, + "time_per_iteration": 2.5553479194641113 + }, + { + "auxiliary_loss_clip": 0.01130577, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.0215224, + "balance_loss_mlp": 1.04617286, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.6286624468626467, + "language_loss": 0.85222661, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87390554, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5160, + "time_per_iteration": 2.4521608352661133 + }, + { + "auxiliary_loss_clip": 0.01131067, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.02385354, + "balance_loss_mlp": 1.04720986, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.1323719792042404, + "language_loss": 0.76438844, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78609127, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5161, + "time_per_iteration": 2.4705073833465576 + }, + { + "auxiliary_loss_clip": 0.01133183, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.04661226, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9681741891595628, + "language_loss": 0.81644946, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83818257, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5162, + "time_per_iteration": 2.4359090328216553 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.0231998, + "balance_loss_mlp": 1.04580498, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.6668116654420786, + "language_loss": 0.82879269, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85046029, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8359375, + "step": 5163, + "time_per_iteration": 2.536198854446411 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.02670264, + "balance_loss_mlp": 1.04848182, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.61479678935284, + "language_loss": 0.76103258, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78280413, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5164, + "time_per_iteration": 2.4736320972442627 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02492666, + "balance_loss_mlp": 1.04932189, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.73414256762253, + "language_loss": 0.74515426, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76691169, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 5165, + "time_per_iteration": 2.4788122177124023 + }, + { + "auxiliary_loss_clip": 0.01132367, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02627063, + "balance_loss_mlp": 1.0472759, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 2.461614607097325, + "language_loss": 0.75987816, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78162187, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5166, + "time_per_iteration": 2.4461371898651123 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.02939892, + "balance_loss_mlp": 1.04844868, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.4324780660218557, + "language_loss": 0.73424876, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75604147, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 5167, + "time_per_iteration": 2.4301631450653076 + }, + { + "auxiliary_loss_clip": 0.01047334, + "auxiliary_loss_mlp": 0.01006703, + "balance_loss_clip": 1.00467682, + "balance_loss_mlp": 1.01844001, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.725291341239906, + "language_loss": 0.53031516, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55085552, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.2890625, + "step": 5168, + "time_per_iteration": 3.1146020889282227 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.0269258, + "balance_loss_mlp": 1.0465318, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.782356602828545, + "language_loss": 0.78745592, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80922985, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5169, + "time_per_iteration": 2.4755852222442627 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02337217, + "balance_loss_mlp": 1.04640126, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.536235209485244, + "language_loss": 0.6414057, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66312397, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5170, + "time_per_iteration": 2.5690839290618896 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.02635252, + "balance_loss_mlp": 1.04721069, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.41080559035864, + "language_loss": 0.77698815, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79874456, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 5171, + "time_per_iteration": 2.558258295059204 + }, + { + "auxiliary_loss_clip": 0.01132946, + "auxiliary_loss_mlp": 0.01053954, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.04645526, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.9319520361735263, + "language_loss": 0.83802366, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85989261, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5172, + "time_per_iteration": 2.4601597785949707 + }, + { + "auxiliary_loss_clip": 0.01133186, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.02948654, + "balance_loss_mlp": 1.0467186, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.9586589765002733, + "language_loss": 0.84225619, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86404574, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 5173, + "time_per_iteration": 2.501840591430664 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02038455, + "balance_loss_mlp": 1.04595959, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.622637298809784, + "language_loss": 0.83323705, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85486829, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5174, + "time_per_iteration": 2.507127285003662 + }, + { + "auxiliary_loss_clip": 0.01131648, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.03155434, + "balance_loss_mlp": 1.04670012, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.3340025504670003, + "language_loss": 0.84681082, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.86859798, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5175, + "time_per_iteration": 2.4853246212005615 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_clip": 1.03029919, + "balance_loss_mlp": 1.04996502, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.6466695594130172, + "language_loss": 0.83448446, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85629338, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8359375, + "step": 5176, + "time_per_iteration": 2.4759509563446045 + }, + { + "auxiliary_loss_clip": 0.01128858, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.04442942, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.7899579393784935, + "language_loss": 0.80820966, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8299427, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5177, + "time_per_iteration": 2.5106611251831055 + }, + { + "auxiliary_loss_clip": 0.0113295, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.048877, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.9871899212943351, + "language_loss": 0.80703342, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82878101, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5178, + "time_per_iteration": 4.0482330322265625 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03310347, + "balance_loss_mlp": 1.04518402, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.8347450184704097, + "language_loss": 0.81340981, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83520925, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5179, + "time_per_iteration": 3.82991886138916 + }, + { + "auxiliary_loss_clip": 0.01132507, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0338006, + "balance_loss_mlp": 1.04824936, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.599561013411363, + "language_loss": 0.78199375, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.8038168, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5180, + "time_per_iteration": 2.4656291007995605 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.01915836, + "balance_loss_mlp": 1.04672408, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.6380256774064115, + "language_loss": 0.83046079, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85212088, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5181, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.01128181, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.02903986, + "balance_loss_mlp": 1.0464232, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.700886032828765, + "language_loss": 0.74084079, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76255929, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5182, + "time_per_iteration": 2.5913209915161133 + }, + { + "auxiliary_loss_clip": 0.01136348, + "auxiliary_loss_mlp": 0.01050649, + "balance_loss_clip": 1.03479409, + "balance_loss_mlp": 1.04858768, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8010906920491343, + "language_loss": 0.70658493, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72845489, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 5183, + "time_per_iteration": 2.4991438388824463 + }, + { + "auxiliary_loss_clip": 0.01045533, + "auxiliary_loss_mlp": 0.01014757, + "balance_loss_clip": 1.01301634, + "balance_loss_mlp": 1.01690507, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9414003998762589, + "language_loss": 0.59602594, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61662877, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.28515625, + "step": 5184, + "time_per_iteration": 3.0754520893096924 + }, + { + "auxiliary_loss_clip": 0.01130364, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.03264058, + "balance_loss_mlp": 1.04596519, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.811836993883612, + "language_loss": 0.69750082, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71927822, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5185, + "time_per_iteration": 2.435033082962036 + }, + { + "auxiliary_loss_clip": 0.01136749, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.0323875, + "balance_loss_mlp": 1.05073345, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.346024133586612, + "language_loss": 0.63920057, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66104954, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5186, + "time_per_iteration": 2.463900327682495 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.01049347, + "balance_loss_clip": 1.03219295, + "balance_loss_mlp": 1.04886758, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.108066194391345, + "language_loss": 0.86249322, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88435853, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5187, + "time_per_iteration": 2.4854979515075684 + }, + { + "auxiliary_loss_clip": 0.01129847, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.03048384, + "balance_loss_mlp": 1.0451926, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.7445298378798078, + "language_loss": 0.62983185, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.6515975, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5188, + "time_per_iteration": 2.6161019802093506 + }, + { + "auxiliary_loss_clip": 0.01135744, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_clip": 1.02961564, + "balance_loss_mlp": 1.05116081, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1633857437120256, + "language_loss": 0.8347863, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85659939, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5189, + "time_per_iteration": 2.4360432624816895 + }, + { + "auxiliary_loss_clip": 0.01129905, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.0213753, + "balance_loss_mlp": 1.04657507, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.6712014044776404, + "language_loss": 0.7916308, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81329739, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83203125, + "step": 5190, + "time_per_iteration": 2.472668170928955 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01986194, + "balance_loss_mlp": 1.04946673, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.4545499288259176, + "language_loss": 0.75318813, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77487987, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5191, + "time_per_iteration": 2.486673355102539 + }, + { + "auxiliary_loss_clip": 0.01049091, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.00025892, + "balance_loss_mlp": 1.02067924, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8451593954944295, + "language_loss": 0.63957787, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66009092, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.28515625, + "step": 5192, + "time_per_iteration": 3.1464638710021973 + }, + { + "auxiliary_loss_clip": 0.01134311, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04795599, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.6164756923867671, + "language_loss": 0.80154347, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82329667, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.86328125, + "step": 5193, + "time_per_iteration": 2.5156989097595215 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.02610445, + "balance_loss_mlp": 1.045856, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8140889441731107, + "language_loss": 0.72050476, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74224722, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.85546875, + "step": 5194, + "time_per_iteration": 2.519972801208496 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02471924, + "balance_loss_mlp": 1.04870749, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.3544515008303952, + "language_loss": 0.76475823, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78648859, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5195, + "time_per_iteration": 2.512247323989868 + }, + { + "auxiliary_loss_clip": 0.01131656, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.02718091, + "balance_loss_mlp": 1.0449183, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.3526234536893298, + "language_loss": 0.7817502, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80349314, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5196, + "time_per_iteration": 2.528002977371216 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99860841, + "balance_loss_mlp": 1.01643729, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7752479618797538, + "language_loss": 0.54834789, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56879622, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.27929688, + "step": 5197, + "time_per_iteration": 3.0728254318237305 + }, + { + "auxiliary_loss_clip": 0.01130689, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.02328372, + "balance_loss_mlp": 1.0477525, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6543672060788046, + "language_loss": 0.66300559, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68469381, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5198, + "time_per_iteration": 2.4312028884887695 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.02156413, + "balance_loss_mlp": 1.0472604, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.083859755504136, + "language_loss": 0.69763082, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71935886, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5199, + "time_per_iteration": 2.454464912414551 + }, + { + "auxiliary_loss_clip": 0.01131797, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.0336132, + "balance_loss_mlp": 1.04692471, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8982997112015956, + "language_loss": 0.79004937, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81186306, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 5200, + "time_per_iteration": 2.4382827281951904 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.02187347, + "balance_loss_mlp": 1.04621911, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.042457973745699, + "language_loss": 0.83946276, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86110914, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5201, + "time_per_iteration": 2.475511074066162 + }, + { + "auxiliary_loss_clip": 0.01134303, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.02990484, + "balance_loss_mlp": 1.04985881, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.37604325800411, + "language_loss": 0.69560832, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71741533, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84375, + "step": 5202, + "time_per_iteration": 2.4265501499176025 + }, + { + "auxiliary_loss_clip": 0.01133329, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.02737963, + "balance_loss_mlp": 1.04759419, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.800546738819683, + "language_loss": 0.84001613, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86176282, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.85546875, + "step": 5203, + "time_per_iteration": 2.480233907699585 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.03176749, + "balance_loss_mlp": 1.04697657, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.9129021624211417, + "language_loss": 0.60623944, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62803102, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5204, + "time_per_iteration": 2.50688099861145 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.02672338, + "balance_loss_mlp": 1.04707503, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.6006708998064776, + "language_loss": 0.65964866, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68135834, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5205, + "time_per_iteration": 2.4824163913726807 + }, + { + "auxiliary_loss_clip": 0.01132532, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.0476222, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.9498647702732133, + "language_loss": 0.76618874, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78794622, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84765625, + "step": 5206, + "time_per_iteration": 2.4947307109832764 + }, + { + "auxiliary_loss_clip": 0.0112786, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.02416039, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 3.088705810465425, + "language_loss": 0.83287984, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85455215, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5207, + "time_per_iteration": 2.4767825603485107 + }, + { + "auxiliary_loss_clip": 0.01128039, + "auxiliary_loss_mlp": 0.01041894, + "balance_loss_clip": 1.02784562, + "balance_loss_mlp": 1.04694057, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5219202808663073, + "language_loss": 0.71293664, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73463601, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5208, + "time_per_iteration": 2.4853296279907227 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.02534437, + "balance_loss_mlp": 1.04957032, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.8332946649412374, + "language_loss": 0.74547577, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76721835, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5209, + "time_per_iteration": 2.5162742137908936 + }, + { + "auxiliary_loss_clip": 0.0113008, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02695489, + "balance_loss_mlp": 1.04557538, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.818845882779476, + "language_loss": 0.77656835, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79827774, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84375, + "step": 5210, + "time_per_iteration": 2.4701180458068848 + }, + { + "auxiliary_loss_clip": 0.01125909, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02443743, + "balance_loss_mlp": 1.04593706, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8627745841798442, + "language_loss": 0.79177994, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81343371, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 5211, + "time_per_iteration": 2.482102870941162 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02448201, + "balance_loss_mlp": 1.04849112, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.64859412039223, + "language_loss": 0.79837513, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82005984, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5212, + "time_per_iteration": 2.460986852645874 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.03395939, + "balance_loss_mlp": 1.04740417, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.096287390218497, + "language_loss": 0.71467483, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73650539, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5213, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.01135204, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02660799, + "balance_loss_mlp": 1.05014026, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 5.183832853627301, + "language_loss": 0.77595121, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79771841, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5214, + "time_per_iteration": 2.453228712081909 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.04599309, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6576138068605464, + "language_loss": 0.82562625, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84724051, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5215, + "time_per_iteration": 2.544684886932373 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.02242613, + "balance_loss_mlp": 1.04732776, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.510877303679677, + "language_loss": 0.79557931, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81727695, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5216, + "time_per_iteration": 2.4559943675994873 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.03042984, + "balance_loss_mlp": 1.04632115, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 2.0079960226100293, + "language_loss": 0.68489361, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70668793, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.859375, + "step": 5217, + "time_per_iteration": 2.524624824523926 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02095652, + "balance_loss_mlp": 1.04952598, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.8597778329644077, + "language_loss": 0.80357039, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82527065, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5218, + "time_per_iteration": 2.437819480895996 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.025931, + "balance_loss_mlp": 1.04692423, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.311414379590861, + "language_loss": 0.68608415, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70780772, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5219, + "time_per_iteration": 2.4811697006225586 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.02585125, + "balance_loss_mlp": 1.05002093, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 1.886141735907444, + "language_loss": 0.7973401, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81906897, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.828125, + "step": 5220, + "time_per_iteration": 5.5014426708221436 + }, + { + "auxiliary_loss_clip": 0.01129795, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02036917, + "balance_loss_mlp": 1.0470016, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.696615671785811, + "language_loss": 0.72865409, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75029969, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5221, + "time_per_iteration": 2.4286248683929443 + }, + { + "auxiliary_loss_clip": 0.01129062, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.02370405, + "balance_loss_mlp": 1.0478735, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.5798649053475948, + "language_loss": 0.8195132, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84118003, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8125, + "step": 5222, + "time_per_iteration": 2.453622817993164 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.02744806, + "balance_loss_mlp": 1.04833627, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.948806511089887, + "language_loss": 0.70150459, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.723288, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5223, + "time_per_iteration": 2.442513942718506 + }, + { + "auxiliary_loss_clip": 0.01130042, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.01980042, + "balance_loss_mlp": 1.04643512, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6111281957709347, + "language_loss": 0.80361176, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82525527, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5224, + "time_per_iteration": 2.5533599853515625 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02615976, + "balance_loss_mlp": 1.05134106, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 1.9819108050216143, + "language_loss": 0.58416283, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60598099, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 5225, + "time_per_iteration": 2.493633508682251 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.01826406, + "balance_loss_mlp": 1.04575014, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.9016989590060558, + "language_loss": 0.81870753, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84028322, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5226, + "time_per_iteration": 2.455474376678467 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.0245285, + "balance_loss_mlp": 1.04804921, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 3.2929472014065864, + "language_loss": 0.73947561, + "learning_rate": 3.210546210126141e-06, + "loss": 0.7611953, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5227, + "time_per_iteration": 2.4582889080047607 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02783334, + "balance_loss_mlp": 1.04827404, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.9061545786481, + "language_loss": 0.67636049, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69811898, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5228, + "time_per_iteration": 2.572122573852539 + }, + { + "auxiliary_loss_clip": 0.01130676, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.02567399, + "balance_loss_mlp": 1.04645872, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.857425256773369, + "language_loss": 0.79938543, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82109284, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5229, + "time_per_iteration": 2.4785192012786865 + }, + { + "auxiliary_loss_clip": 0.01129346, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.02474797, + "balance_loss_mlp": 1.04716849, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8246409730399047, + "language_loss": 0.70264775, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72434002, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5230, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.02805161, + "balance_loss_mlp": 1.04486191, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.680902640440715, + "language_loss": 0.79707456, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81880474, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5231, + "time_per_iteration": 2.535352945327759 + }, + { + "auxiliary_loss_clip": 0.01129002, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.02675736, + "balance_loss_mlp": 1.04756021, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0146998384070254, + "language_loss": 0.8507638, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87248111, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5232, + "time_per_iteration": 2.5626280307769775 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01049783, + "balance_loss_clip": 1.03439283, + "balance_loss_mlp": 1.0461762, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5681064196444345, + "language_loss": 0.7984041, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82017469, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5233, + "time_per_iteration": 2.4478254318237305 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.0233047, + "balance_loss_mlp": 1.04861724, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.628646597563271, + "language_loss": 0.70788991, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72960073, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5234, + "time_per_iteration": 2.775871992111206 + }, + { + "auxiliary_loss_clip": 0.01131513, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.0205102, + "balance_loss_mlp": 1.04739237, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8519873535555593, + "language_loss": 0.72068667, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74236101, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5235, + "time_per_iteration": 2.515869617462158 + }, + { + "auxiliary_loss_clip": 0.01126993, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.02204823, + "balance_loss_mlp": 1.04428434, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.06424580772138, + "language_loss": 0.7832365, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80487001, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5236, + "time_per_iteration": 2.5591800212860107 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02609372, + "balance_loss_mlp": 1.04730821, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.44778330648976, + "language_loss": 0.75856584, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78033078, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 5237, + "time_per_iteration": 2.5414791107177734 + }, + { + "auxiliary_loss_clip": 0.01125329, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.02424169, + "balance_loss_mlp": 1.04500508, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.1889759499940813, + "language_loss": 0.79916662, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82079864, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8046875, + "step": 5238, + "time_per_iteration": 2.484102725982666 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.0100711, + "balance_loss_clip": 1.0053103, + "balance_loss_mlp": 1.01739836, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8333107882681854, + "language_loss": 0.67920464, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69972724, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.27734375, + "step": 5239, + "time_per_iteration": 3.0362496376037598 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.02197254, + "balance_loss_mlp": 1.04535258, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.0536997136778847, + "language_loss": 0.82329869, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84499264, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5240, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01039014, + "balance_loss_clip": 1.02451253, + "balance_loss_mlp": 1.04874361, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.2630790499207962, + "language_loss": 0.80981195, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83150375, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5241, + "time_per_iteration": 2.5001909732818604 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04834199, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.5804052674973608, + "language_loss": 0.74575627, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76740676, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5242, + "time_per_iteration": 2.530768871307373 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0189085, + "balance_loss_mlp": 1.04601228, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9335835713568477, + "language_loss": 0.74171245, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.7633546, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 5243, + "time_per_iteration": 2.495138168334961 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.02221215, + "balance_loss_mlp": 1.04677868, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 3.400707627247709, + "language_loss": 0.64608908, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66775823, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83203125, + "step": 5244, + "time_per_iteration": 2.4930343627929688 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.02088022, + "balance_loss_mlp": 1.04716229, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.1590647535644965, + "language_loss": 0.91464043, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93632007, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5245, + "time_per_iteration": 2.4007837772369385 + }, + { + "auxiliary_loss_clip": 0.0113079, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.02636433, + "balance_loss_mlp": 1.04643655, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 9.888646015204756, + "language_loss": 0.75272042, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77444315, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5246, + "time_per_iteration": 2.4886202812194824 + }, + { + "auxiliary_loss_clip": 0.01131208, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.02524352, + "balance_loss_mlp": 1.04602718, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.4670109155165818, + "language_loss": 0.6160199, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63773286, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5247, + "time_per_iteration": 2.567185640335083 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.02283072, + "balance_loss_mlp": 1.04756081, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.2084660310503526, + "language_loss": 0.82410538, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84581077, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5248, + "time_per_iteration": 2.52426815032959 + }, + { + "auxiliary_loss_clip": 0.01129578, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.04662156, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8083364563285407, + "language_loss": 0.85017586, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87197179, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5249, + "time_per_iteration": 2.4549005031585693 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.02245772, + "balance_loss_mlp": 1.04802227, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.8090626711780673, + "language_loss": 0.85569501, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87739837, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5250, + "time_per_iteration": 2.502629041671753 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.04532385, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 4.215523946509053, + "language_loss": 0.68559456, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70730722, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5251, + "time_per_iteration": 2.4467368125915527 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02487266, + "balance_loss_mlp": 1.04848695, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7890606859490685, + "language_loss": 0.78783, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80953479, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5252, + "time_per_iteration": 2.5056369304656982 + }, + { + "auxiliary_loss_clip": 0.01129131, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.02635264, + "balance_loss_mlp": 1.04820085, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.7467438086499925, + "language_loss": 0.74374568, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76544189, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5253, + "time_per_iteration": 2.485865592956543 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.04530692, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.6622002067810395, + "language_loss": 0.73305148, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75473285, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5254, + "time_per_iteration": 2.5044641494750977 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.02293146, + "balance_loss_mlp": 1.04714012, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9319514966089122, + "language_loss": 0.78156364, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80326211, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5255, + "time_per_iteration": 2.4380881786346436 + }, + { + "auxiliary_loss_clip": 0.01130732, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02565229, + "balance_loss_mlp": 1.04770398, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.551434599641695, + "language_loss": 0.78019011, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80192077, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.828125, + "step": 5256, + "time_per_iteration": 2.517211437225342 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.02159786, + "balance_loss_mlp": 1.04710865, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.6136648036258991, + "language_loss": 0.71117795, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73278391, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 5257, + "time_per_iteration": 2.4690449237823486 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.04662931, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.9672329013590102, + "language_loss": 0.77098101, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79265225, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5258, + "time_per_iteration": 2.4586384296417236 + }, + { + "auxiliary_loss_clip": 0.01130533, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.02291536, + "balance_loss_mlp": 1.04706669, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 4.102208009404704, + "language_loss": 0.72829109, + "learning_rate": 3.200602180731467e-06, + "loss": 0.7499727, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5259, + "time_per_iteration": 2.463867425918579 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.03382003, + "balance_loss_mlp": 1.04840684, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.940451679167918, + "language_loss": 0.66212165, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68394214, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.85546875, + "step": 5260, + "time_per_iteration": 2.498173475265503 + }, + { + "auxiliary_loss_clip": 0.01125905, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.01806808, + "balance_loss_mlp": 1.04255199, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.9564366458132632, + "language_loss": 0.72557104, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74715853, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5261, + "time_per_iteration": 4.0577170848846436 + }, + { + "auxiliary_loss_clip": 0.01040968, + "auxiliary_loss_mlp": 0.01005761, + "balance_loss_clip": 1.00365114, + "balance_loss_mlp": 1.01333809, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7495327099187281, + "language_loss": 0.50639355, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52686083, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5262, + "time_per_iteration": 5.9139063358306885 + }, + { + "auxiliary_loss_clip": 0.01133191, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.02408338, + "balance_loss_mlp": 1.04845881, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4936033884005069, + "language_loss": 0.85241222, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87412858, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.84765625, + "step": 5263, + "time_per_iteration": 2.4966084957122803 + }, + { + "auxiliary_loss_clip": 0.01127359, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.03201818, + "balance_loss_mlp": 1.04657304, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4671140059184749, + "language_loss": 0.81675243, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83848464, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5264, + "time_per_iteration": 2.5126495361328125 + }, + { + "auxiliary_loss_clip": 0.01133844, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.02107441, + "balance_loss_mlp": 1.0484283, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.6829803459821215, + "language_loss": 0.79974926, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82145512, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5265, + "time_per_iteration": 2.444263219833374 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.02361572, + "balance_loss_mlp": 1.04815876, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5672890574859826, + "language_loss": 0.74875605, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77048463, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5266, + "time_per_iteration": 2.5323407649993896 + }, + { + "auxiliary_loss_clip": 0.01131974, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02851653, + "balance_loss_mlp": 1.04640543, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.021043754719528, + "language_loss": 0.78872609, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81047654, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 5267, + "time_per_iteration": 2.4591164588928223 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01004279, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.01493907, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7322532755123746, + "language_loss": 0.57800645, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59847558, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5268, + "time_per_iteration": 3.061121702194214 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.02291262, + "balance_loss_mlp": 1.04683709, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 1.8728828385616285, + "language_loss": 0.72881675, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75051844, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5269, + "time_per_iteration": 2.4871747493743896 + }, + { + "auxiliary_loss_clip": 0.0113037, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04689598, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.0592855460289394, + "language_loss": 0.79914796, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82084477, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5270, + "time_per_iteration": 2.502607822418213 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.02582264, + "balance_loss_mlp": 1.04792333, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 1.9728362515560998, + "language_loss": 0.79207718, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.8138411, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5271, + "time_per_iteration": 2.4412505626678467 + }, + { + "auxiliary_loss_clip": 0.0113132, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02440262, + "balance_loss_mlp": 1.04685235, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.769221166791082, + "language_loss": 0.73264146, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75436121, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5272, + "time_per_iteration": 2.4992945194244385 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.02262676, + "balance_loss_mlp": 1.04613161, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 1.9537759660060814, + "language_loss": 0.69159341, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71332633, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 5273, + "time_per_iteration": 2.6510114669799805 + }, + { + "auxiliary_loss_clip": 0.01128979, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02110016, + "balance_loss_mlp": 1.04609132, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.4826309074588198, + "language_loss": 0.67691469, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69856858, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5274, + "time_per_iteration": 2.5467329025268555 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02092862, + "balance_loss_mlp": 1.04432762, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5251182195487059, + "language_loss": 0.80846918, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83006656, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5275, + "time_per_iteration": 2.511544704437256 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.0286448, + "balance_loss_mlp": 1.04539275, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.952892513614063, + "language_loss": 0.72608984, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.7478506, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5276, + "time_per_iteration": 2.5273983478546143 + }, + { + "auxiliary_loss_clip": 0.01124489, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 1.04455817, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.3590988237701342, + "language_loss": 0.77843654, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80003512, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5277, + "time_per_iteration": 2.51247501373291 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.04444003, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.8256288285105424, + "language_loss": 0.78756094, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80919981, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5278, + "time_per_iteration": 2.5376405715942383 + }, + { + "auxiliary_loss_clip": 0.01037546, + "auxiliary_loss_mlp": 0.01002993, + "balance_loss_clip": 1.0011332, + "balance_loss_mlp": 1.00972891, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8755672893463982, + "language_loss": 0.62821174, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64861709, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.27734375, + "step": 5279, + "time_per_iteration": 2.823489189147949 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.0242753, + "balance_loss_mlp": 1.04568505, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.6672726712999033, + "language_loss": 0.8099947, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83173573, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 5280, + "time_per_iteration": 2.490154981613159 + }, + { + "auxiliary_loss_clip": 0.01130309, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.02963543, + "balance_loss_mlp": 1.04713202, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.444928497123541, + "language_loss": 0.77968711, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5281, + "time_per_iteration": 2.590106248855591 + }, + { + "auxiliary_loss_clip": 0.01129621, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.02001119, + "balance_loss_mlp": 1.0464325, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6441690082428626, + "language_loss": 0.78319824, + "learning_rate": 3.193426091467179e-06, + "loss": 0.8048507, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 5282, + "time_per_iteration": 2.4879021644592285 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.02429008, + "balance_loss_mlp": 1.04685783, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.066002014025373, + "language_loss": 0.66989815, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69162953, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 5283, + "time_per_iteration": 2.4914467334747314 + }, + { + "auxiliary_loss_clip": 0.01037416, + "auxiliary_loss_mlp": 0.01002537, + "balance_loss_clip": 1.00047421, + "balance_loss_mlp": 1.00956297, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7287723120729913, + "language_loss": 0.52796859, + "learning_rate": 3.192800950261958e-06, + "loss": 0.5483681, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.27734375, + "step": 5284, + "time_per_iteration": 3.0077779293060303 + }, + { + "auxiliary_loss_clip": 0.01137201, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02314341, + "balance_loss_mlp": 1.04976773, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.732541053937659, + "language_loss": 0.7061168, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72786701, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 5285, + "time_per_iteration": 2.4796152114868164 + }, + { + "auxiliary_loss_clip": 0.0103775, + "auxiliary_loss_mlp": 0.01003604, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00987303, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8184329386673247, + "language_loss": 0.60497808, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.6253916, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27929688, + "step": 5286, + "time_per_iteration": 3.060959815979004 + }, + { + "auxiliary_loss_clip": 0.01131379, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.02701449, + "balance_loss_mlp": 1.04520202, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.8142745455991967, + "language_loss": 0.72112805, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74286544, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 5287, + "time_per_iteration": 2.480926752090454 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03083003, + "balance_loss_mlp": 1.04454064, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.8467549942081902, + "language_loss": 0.75335222, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77514231, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 5288, + "time_per_iteration": 2.4506337642669678 + }, + { + "auxiliary_loss_clip": 0.01123463, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.02344155, + "balance_loss_mlp": 1.04175711, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 2.214262263159222, + "language_loss": 0.87642509, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89802694, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8203125, + "step": 5289, + "time_per_iteration": 2.4887404441833496 + }, + { + "auxiliary_loss_clip": 0.01127988, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.04635859, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.8563377401537928, + "language_loss": 0.67677546, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69844842, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5290, + "time_per_iteration": 2.4699981212615967 + }, + { + "auxiliary_loss_clip": 0.01130209, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.02983999, + "balance_loss_mlp": 1.04348135, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.9889060202243536, + "language_loss": 0.79926544, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82102132, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 5291, + "time_per_iteration": 2.5350663661956787 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.02160883, + "balance_loss_mlp": 1.04684091, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.2851564798864694, + "language_loss": 0.79887748, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82058293, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5292, + "time_per_iteration": 2.4561853408813477 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.02173245, + "balance_loss_mlp": 1.04506028, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6321803022225574, + "language_loss": 0.74406421, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.76565492, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5293, + "time_per_iteration": 2.562264919281006 + }, + { + "auxiliary_loss_clip": 0.01127349, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02864981, + "balance_loss_mlp": 1.04655647, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.669926034583184, + "language_loss": 0.74003655, + "learning_rate": 3.189672532265379e-06, + "loss": 0.7617321, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.80859375, + "step": 5294, + "time_per_iteration": 2.511491537094116 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04616928, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.856323864882145, + "language_loss": 0.76211727, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78377414, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5295, + "time_per_iteration": 2.482302665710449 + }, + { + "auxiliary_loss_clip": 0.01134404, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.02765322, + "balance_loss_mlp": 1.04831004, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6316405915506296, + "language_loss": 0.69476807, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71653676, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5296, + "time_per_iteration": 2.4972259998321533 + }, + { + "auxiliary_loss_clip": 0.01129364, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04513788, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 2.3772504575271367, + "language_loss": 0.77559733, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79728031, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5297, + "time_per_iteration": 2.5681862831115723 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01533866, + "balance_loss_mlp": 1.04480934, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.9869765921291695, + "language_loss": 0.79451257, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81608367, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5298, + "time_per_iteration": 2.4990038871765137 + }, + { + "auxiliary_loss_clip": 0.01132136, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.04609096, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 2.132815699592654, + "language_loss": 0.7431671, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.7648803, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 5299, + "time_per_iteration": 2.4902234077453613 + }, + { + "auxiliary_loss_clip": 0.01130922, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02775824, + "balance_loss_mlp": 1.04395795, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 5.1444082132017925, + "language_loss": 0.7834971, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80523366, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5300, + "time_per_iteration": 2.476113796234131 + }, + { + "auxiliary_loss_clip": 0.01127423, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.02245879, + "balance_loss_mlp": 1.04332328, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 4.220537638442504, + "language_loss": 0.8416568, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86331153, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5301, + "time_per_iteration": 2.4672341346740723 + }, + { + "auxiliary_loss_clip": 0.01132761, + "auxiliary_loss_mlp": 0.01045513, + "balance_loss_clip": 1.0299325, + "balance_loss_mlp": 1.05064154, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4555807672502277, + "language_loss": 0.77689236, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79867512, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5302, + "time_per_iteration": 2.4480254650115967 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02197289, + "balance_loss_mlp": 1.0458461, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.6601771821563076, + "language_loss": 0.79729378, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81892729, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8046875, + "step": 5303, + "time_per_iteration": 5.451193809509277 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.04810047, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.065727829234295, + "language_loss": 0.72734123, + "learning_rate": 3.186539603020047e-06, + "loss": 0.74916923, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 5304, + "time_per_iteration": 3.835230588912964 + }, + { + "auxiliary_loss_clip": 0.01126733, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.02546668, + "balance_loss_mlp": 1.04595399, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.8866410100018438, + "language_loss": 0.71773344, + "learning_rate": 3.186226062434068e-06, + "loss": 0.73939252, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80859375, + "step": 5305, + "time_per_iteration": 2.5330212116241455 + }, + { + "auxiliary_loss_clip": 0.01129402, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.02209806, + "balance_loss_mlp": 1.0472002, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.6861128411196662, + "language_loss": 0.64708328, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66873765, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5306, + "time_per_iteration": 2.4788570404052734 + }, + { + "auxiliary_loss_clip": 0.01135221, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.02714205, + "balance_loss_mlp": 1.05026746, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.161280639112344, + "language_loss": 0.79625881, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81803662, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5307, + "time_per_iteration": 2.5614371299743652 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02412832, + "balance_loss_mlp": 1.04311657, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.727529620646192, + "language_loss": 0.77898794, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.80062222, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 5308, + "time_per_iteration": 2.4443254470825195 + }, + { + "auxiliary_loss_clip": 0.01142678, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_clip": 1.03182518, + "balance_loss_mlp": 1.05046844, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 5.1649453810283426, + "language_loss": 0.74302876, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76494527, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 5309, + "time_per_iteration": 2.494800090789795 + }, + { + "auxiliary_loss_clip": 0.0112957, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.01998436, + "balance_loss_mlp": 1.04589248, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.754429841361115, + "language_loss": 0.82606339, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84770352, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5310, + "time_per_iteration": 2.4630603790283203 + }, + { + "auxiliary_loss_clip": 0.01129012, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.02762246, + "balance_loss_mlp": 1.04536486, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4405475768569584, + "language_loss": 0.78319013, + "learning_rate": 3.184343874716412e-06, + "loss": 0.8048929, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8359375, + "step": 5311, + "time_per_iteration": 2.5892724990844727 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01040389, + "balance_loss_clip": 1.02419996, + "balance_loss_mlp": 1.04695129, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.475613964939968, + "language_loss": 0.84316272, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86487615, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 5312, + "time_per_iteration": 2.4625802040100098 + }, + { + "auxiliary_loss_clip": 0.01137215, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.02808809, + "balance_loss_mlp": 1.0480628, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.3910939905221302, + "language_loss": 0.78584075, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 5313, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.02133918, + "balance_loss_mlp": 1.04814112, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.1643333364087582, + "language_loss": 0.85868084, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88036746, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5314, + "time_per_iteration": 2.4721946716308594 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02502346, + "balance_loss_mlp": 1.04725409, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7188296838329389, + "language_loss": 0.79836512, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82008839, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5315, + "time_per_iteration": 2.512554407119751 + }, + { + "auxiliary_loss_clip": 0.01135172, + "auxiliary_loss_mlp": 0.01049715, + "balance_loss_clip": 1.03331804, + "balance_loss_mlp": 1.0493269, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 6.566744634036759, + "language_loss": 0.67652613, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69837505, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5316, + "time_per_iteration": 2.4364819526672363 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.02105474, + "balance_loss_mlp": 1.04888916, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4751284993654519, + "language_loss": 0.69336772, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71505511, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84765625, + "step": 5317, + "time_per_iteration": 2.6055562496185303 + }, + { + "auxiliary_loss_clip": 0.01043016, + "auxiliary_loss_mlp": 0.0100349, + "balance_loss_clip": 1.00143993, + "balance_loss_mlp": 1.01474404, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7259742625655435, + "language_loss": 0.53048342, + "learning_rate": 3.182145945801628e-06, + "loss": 0.5509485, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.28320312, + "step": 5318, + "time_per_iteration": 3.200087308883667 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.02311563, + "balance_loss_mlp": 1.04900801, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.839211184718713, + "language_loss": 0.83865941, + "learning_rate": 3.181831776553012e-06, + "loss": 0.8603549, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5319, + "time_per_iteration": 2.471498966217041 + }, + { + "auxiliary_loss_clip": 0.01131434, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.0279578, + "balance_loss_mlp": 1.04728413, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.3959306603032393, + "language_loss": 0.63542199, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65716517, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5320, + "time_per_iteration": 2.5526087284088135 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.02528036, + "balance_loss_mlp": 1.04970324, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.9943779690432752, + "language_loss": 0.70519614, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 5321, + "time_per_iteration": 2.5262763500213623 + }, + { + "auxiliary_loss_clip": 0.01141108, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.04030156, + "balance_loss_mlp": 1.05110431, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.2234904552907238, + "language_loss": 0.86543447, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88741434, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 5322, + "time_per_iteration": 2.4432008266448975 + }, + { + "auxiliary_loss_clip": 0.01132235, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02119207, + "balance_loss_mlp": 1.04827893, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7854648356549414, + "language_loss": 0.82820231, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.84988427, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5323, + "time_per_iteration": 2.554539680480957 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02157664, + "balance_loss_mlp": 1.04700553, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8735349940723531, + "language_loss": 0.77858555, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.8002646, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5324, + "time_per_iteration": 2.452894687652588 + }, + { + "auxiliary_loss_clip": 0.0113163, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.04770339, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.8150910160625646, + "language_loss": 0.80162597, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82328951, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5325, + "time_per_iteration": 2.5261802673339844 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02690446, + "balance_loss_mlp": 1.04872847, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.8959189814779316, + "language_loss": 0.75171864, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77346826, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5326, + "time_per_iteration": 2.5300135612487793 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02285552, + "balance_loss_mlp": 1.04836321, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4421847054475023, + "language_loss": 0.80826092, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82993662, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5327, + "time_per_iteration": 2.5393614768981934 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04888535, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5386676468863185, + "language_loss": 0.77926928, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80099857, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5328, + "time_per_iteration": 2.471806287765503 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 1.04632294, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 2.9951100938200765, + "language_loss": 0.73971635, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76145625, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 5329, + "time_per_iteration": 2.52327561378479 + }, + { + "auxiliary_loss_clip": 0.01127399, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02012336, + "balance_loss_mlp": 1.04675198, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 2.060461898980319, + "language_loss": 0.71036464, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73197591, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8046875, + "step": 5330, + "time_per_iteration": 2.4405477046966553 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01049965, + "balance_loss_clip": 1.03343058, + "balance_loss_mlp": 1.0474323, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.7909305839918348, + "language_loss": 0.80022657, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82208663, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 5331, + "time_per_iteration": 2.5934245586395264 + }, + { + "auxiliary_loss_clip": 0.01037799, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.01001608, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8366333048595008, + "language_loss": 0.57806182, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59848487, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.27734375, + "step": 5332, + "time_per_iteration": 2.9984278678894043 + }, + { + "auxiliary_loss_clip": 0.01134361, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02450585, + "balance_loss_mlp": 1.04747975, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7943987990453594, + "language_loss": 0.73309821, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75483477, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.87109375, + "step": 5333, + "time_per_iteration": 2.554401159286499 + }, + { + "auxiliary_loss_clip": 0.01133668, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.02686942, + "balance_loss_mlp": 1.04836345, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.5896288664703238, + "language_loss": 0.71050882, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.73227012, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5334, + "time_per_iteration": 2.468472957611084 + }, + { + "auxiliary_loss_clip": 0.01132404, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.02658951, + "balance_loss_mlp": 1.04644001, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9528247502362917, + "language_loss": 0.77601135, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.797755, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5335, + "time_per_iteration": 2.524211883544922 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02519548, + "balance_loss_mlp": 1.04687452, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.5197552931214375, + "language_loss": 0.68353152, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70525241, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 5336, + "time_per_iteration": 2.5674326419830322 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01045646, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.04688144, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7063748564330914, + "language_loss": 0.7895453, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81131858, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5337, + "time_per_iteration": 2.5010595321655273 + }, + { + "auxiliary_loss_clip": 0.01124535, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.02194548, + "balance_loss_mlp": 1.04505002, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.7193225847880926, + "language_loss": 0.73997593, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76157737, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5338, + "time_per_iteration": 2.4961647987365723 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.02633142, + "balance_loss_mlp": 1.04477298, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.8336519924948942, + "language_loss": 0.63149244, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65323097, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5339, + "time_per_iteration": 2.5218987464904785 + }, + { + "auxiliary_loss_clip": 0.01130495, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.02409506, + "balance_loss_mlp": 1.04546928, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 1.814332726776551, + "language_loss": 0.81917858, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84087962, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5340, + "time_per_iteration": 2.427483558654785 + }, + { + "auxiliary_loss_clip": 0.0113181, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.02847123, + "balance_loss_mlp": 1.04696941, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.7172536004624983, + "language_loss": 0.7620244, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78377569, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 5341, + "time_per_iteration": 2.4785468578338623 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02154231, + "balance_loss_mlp": 1.04897809, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.9213308470980235, + "language_loss": 0.78627086, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.80794168, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5342, + "time_per_iteration": 2.4524106979370117 + }, + { + "auxiliary_loss_clip": 0.01133398, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02290499, + "balance_loss_mlp": 1.04772902, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.762302479650767, + "language_loss": 0.74934483, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77106899, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5343, + "time_per_iteration": 2.4744415283203125 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.02040279, + "balance_loss_mlp": 1.04623377, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.6135516142824962, + "language_loss": 0.82859504, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85026079, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5344, + "time_per_iteration": 2.47578763961792 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.02407503, + "balance_loss_mlp": 1.04474425, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.07297685310976, + "language_loss": 0.79812628, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81983352, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5345, + "time_per_iteration": 5.33889365196228 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.02149296, + "balance_loss_mlp": 1.04473424, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.8810220564208493, + "language_loss": 0.83404821, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85571885, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.859375, + "step": 5346, + "time_per_iteration": 2.500577688217163 + }, + { + "auxiliary_loss_clip": 0.01131977, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02362108, + "balance_loss_mlp": 1.04492784, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4095386913443633, + "language_loss": 0.81571388, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83742809, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 5347, + "time_per_iteration": 2.4491653442382812 + }, + { + "auxiliary_loss_clip": 0.01130206, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.04715562, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.9965712334987884, + "language_loss": 0.79898697, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82067955, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5348, + "time_per_iteration": 2.471261501312256 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.04691792, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 1.9690807455187813, + "language_loss": 0.8506968, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87250197, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5349, + "time_per_iteration": 2.4376416206359863 + }, + { + "auxiliary_loss_clip": 0.01130553, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.02215409, + "balance_loss_mlp": 1.04589188, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.7092259574450879, + "language_loss": 0.80862331, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83030069, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5350, + "time_per_iteration": 2.463998794555664 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.04548049, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.7709203173786705, + "language_loss": 0.79856229, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82025862, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 5351, + "time_per_iteration": 2.5017340183258057 + }, + { + "auxiliary_loss_clip": 0.01129171, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.02396047, + "balance_loss_mlp": 1.04505897, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.701097630272038, + "language_loss": 0.75491166, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77660662, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5352, + "time_per_iteration": 2.4916653633117676 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02179837, + "balance_loss_mlp": 1.0472436, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.8428416092094815, + "language_loss": 0.8174473, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83915108, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5353, + "time_per_iteration": 2.4554946422576904 + }, + { + "auxiliary_loss_clip": 0.01127699, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.02147865, + "balance_loss_mlp": 1.04577875, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.533417142425662, + "language_loss": 0.73054826, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75219929, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5354, + "time_per_iteration": 2.521679639816284 + }, + { + "auxiliary_loss_clip": 0.01129194, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.01830053, + "balance_loss_mlp": 1.04482782, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5056594732405602, + "language_loss": 0.8349731, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.8565954, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5355, + "time_per_iteration": 2.4590871334075928 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.0299834, + "balance_loss_mlp": 1.04840243, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.2450583198173737, + "language_loss": 0.71577442, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73757267, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 5356, + "time_per_iteration": 2.4499382972717285 + }, + { + "auxiliary_loss_clip": 0.01137452, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.0196538, + "balance_loss_mlp": 1.04720378, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5072162620412968, + "language_loss": 0.68480343, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70654052, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 5357, + "time_per_iteration": 2.449125289916992 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01002103, + "balance_loss_clip": 1.00029111, + "balance_loss_mlp": 1.01435876, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7023861387911429, + "language_loss": 0.58256829, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60301042, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.27734375, + "step": 5358, + "time_per_iteration": 3.1561930179595947 + }, + { + "auxiliary_loss_clip": 0.01130123, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.02506542, + "balance_loss_mlp": 1.04423356, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 5.918956850418863, + "language_loss": 0.83524048, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85695517, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5359, + "time_per_iteration": 2.4850337505340576 + }, + { + "auxiliary_loss_clip": 0.01132117, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.019122, + "balance_loss_mlp": 1.04514802, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.5557598040672038, + "language_loss": 0.79817981, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5360, + "time_per_iteration": 2.476698637008667 + }, + { + "auxiliary_loss_clip": 0.01040711, + "auxiliary_loss_mlp": 0.00999439, + "balance_loss_clip": 0.99754351, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.750004294413456, + "language_loss": 0.5697335, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59013498, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27539062, + "step": 5361, + "time_per_iteration": 2.933368444442749 + }, + { + "auxiliary_loss_clip": 0.01129938, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.02452111, + "balance_loss_mlp": 1.04625082, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.730134050345621, + "language_loss": 0.71349204, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73518884, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5362, + "time_per_iteration": 2.508444309234619 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.04685211, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6566995758494631, + "language_loss": 0.74008292, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76178837, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8359375, + "step": 5363, + "time_per_iteration": 2.530428409576416 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.02481735, + "balance_loss_mlp": 1.04535139, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.5112112412179624, + "language_loss": 0.77012563, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79187649, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 5364, + "time_per_iteration": 2.475532054901123 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.02747917, + "balance_loss_mlp": 1.04455853, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.1198351151285992, + "language_loss": 0.77043676, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79215652, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5365, + "time_per_iteration": 2.4466004371643066 + }, + { + "auxiliary_loss_clip": 0.01133051, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03444982, + "balance_loss_mlp": 1.04861832, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5183743876703555, + "language_loss": 0.76853883, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79036558, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5366, + "time_per_iteration": 2.4716286659240723 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04463363, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.6325357922005805, + "language_loss": 0.7200039, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74173188, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5367, + "time_per_iteration": 2.4936037063598633 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.02759588, + "balance_loss_mlp": 1.04335558, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.8801069032327764, + "language_loss": 0.7456941, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76737112, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5368, + "time_per_iteration": 2.436897039413452 + }, + { + "auxiliary_loss_clip": 0.01125271, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.02592432, + "balance_loss_mlp": 1.04390144, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.5502047591083525, + "language_loss": 0.79212499, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81378186, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5369, + "time_per_iteration": 2.516191244125366 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02042747, + "balance_loss_mlp": 1.04432988, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8370527927944635, + "language_loss": 0.83173579, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85333049, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5370, + "time_per_iteration": 2.423494338989258 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.02367377, + "balance_loss_mlp": 1.04524064, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 1.743608915284185, + "language_loss": 0.83372939, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85539752, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5371, + "time_per_iteration": 2.481677532196045 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01048903, + "balance_loss_clip": 1.0323211, + "balance_loss_mlp": 1.04514813, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.043238736788368, + "language_loss": 0.88539696, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90720367, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5372, + "time_per_iteration": 2.434785842895508 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.03011537, + "balance_loss_mlp": 1.04532862, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9701661898720624, + "language_loss": 0.73064935, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75238496, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5373, + "time_per_iteration": 2.509288787841797 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.0217371, + "balance_loss_mlp": 1.04496944, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.118108535598075, + "language_loss": 0.81306481, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83469176, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5374, + "time_per_iteration": 2.43719744682312 + }, + { + "auxiliary_loss_clip": 0.01122361, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.02135515, + "balance_loss_mlp": 1.04158425, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.0253542373007223, + "language_loss": 0.87507123, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89665556, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80859375, + "step": 5375, + "time_per_iteration": 2.5192272663116455 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04312396, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8491566525281582, + "language_loss": 0.75873786, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78040886, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5376, + "time_per_iteration": 2.463103771209717 + }, + { + "auxiliary_loss_clip": 0.01123814, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.04269242, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.5890241026671568, + "language_loss": 0.67173672, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69330645, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5377, + "time_per_iteration": 2.5341343879699707 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02922571, + "balance_loss_mlp": 1.04433763, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.5071806558198568, + "language_loss": 0.7231617, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74489522, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5378, + "time_per_iteration": 2.4838621616363525 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.0174818, + "balance_loss_mlp": 1.04056036, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9527598104570445, + "language_loss": 0.82083338, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84239388, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5379, + "time_per_iteration": 2.5433154106140137 + }, + { + "auxiliary_loss_clip": 0.01127314, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.04230165, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.9705325619840932, + "language_loss": 0.78379917, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80539739, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 5380, + "time_per_iteration": 2.5306878089904785 + }, + { + "auxiliary_loss_clip": 0.0112988, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.02207887, + "balance_loss_mlp": 1.04637241, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.5992937517204726, + "language_loss": 0.76871669, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79037952, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5381, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02228761, + "balance_loss_mlp": 1.04212475, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.912812068704809, + "language_loss": 0.71864545, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74021101, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5382, + "time_per_iteration": 2.488344430923462 + }, + { + "auxiliary_loss_clip": 0.01127382, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.0424943, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.8562908675977754, + "language_loss": 0.70752692, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72914088, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5383, + "time_per_iteration": 2.5236711502075195 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0259378, + "balance_loss_mlp": 1.0442363, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.094388352971362, + "language_loss": 0.78742963, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80905938, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 5384, + "time_per_iteration": 2.4685723781585693 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.0222249, + "balance_loss_mlp": 1.04443073, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.069351852322995, + "language_loss": 0.74553645, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76720881, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 5385, + "time_per_iteration": 2.46968936920166 + }, + { + "auxiliary_loss_clip": 0.01127931, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02101183, + "balance_loss_mlp": 1.04604125, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.8196037573439483, + "language_loss": 0.72068852, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74232352, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5386, + "time_per_iteration": 2.559480667114258 + }, + { + "auxiliary_loss_clip": 0.01128094, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.02119136, + "balance_loss_mlp": 1.04176617, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.8525904099951498, + "language_loss": 0.94343817, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96508765, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 5387, + "time_per_iteration": 5.378048896789551 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.02227962, + "balance_loss_mlp": 1.04373097, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.7647642243142747, + "language_loss": 0.77544433, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79709506, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5388, + "time_per_iteration": 2.4804563522338867 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01554048, + "balance_loss_mlp": 1.04277194, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.092216766577811, + "language_loss": 0.71867704, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74025786, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5389, + "time_per_iteration": 2.5753331184387207 + }, + { + "auxiliary_loss_clip": 0.01128194, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.0233078, + "balance_loss_mlp": 1.04672205, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 2.0374979548818497, + "language_loss": 0.80883735, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83050573, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 5390, + "time_per_iteration": 2.479557991027832 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.02911294, + "balance_loss_mlp": 1.04798484, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.0682587448682384, + "language_loss": 0.72983515, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75158268, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5391, + "time_per_iteration": 2.4689247608184814 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.02728176, + "balance_loss_mlp": 1.04465139, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.6356435132494873, + "language_loss": 0.77357036, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79523861, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5392, + "time_per_iteration": 2.4942643642425537 + }, + { + "auxiliary_loss_clip": 0.01129141, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04454243, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.035025217222515, + "language_loss": 0.62445068, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64614469, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5393, + "time_per_iteration": 2.5294058322906494 + }, + { + "auxiliary_loss_clip": 0.01127178, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.02292883, + "balance_loss_mlp": 1.0455395, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.541011228274946, + "language_loss": 0.8250984, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84674609, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5394, + "time_per_iteration": 2.5204803943634033 + }, + { + "auxiliary_loss_clip": 0.01125244, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.03089094, + "balance_loss_mlp": 1.04596353, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.8431569167236632, + "language_loss": 0.81585443, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83754981, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.79296875, + "step": 5395, + "time_per_iteration": 2.481722116470337 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.02844906, + "balance_loss_mlp": 1.04834461, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.644291671680186, + "language_loss": 0.83163011, + "learning_rate": 3.157507073287417e-06, + "loss": 0.8533138, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5396, + "time_per_iteration": 2.5014734268188477 + }, + { + "auxiliary_loss_clip": 0.01133358, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.02392137, + "balance_loss_mlp": 1.04687238, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.8637158339296453, + "language_loss": 0.75718713, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77891421, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5397, + "time_per_iteration": 2.475958824157715 + }, + { + "auxiliary_loss_clip": 0.01125578, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.01953566, + "balance_loss_mlp": 1.04540443, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.571224523552484, + "language_loss": 0.66835862, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68995398, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5398, + "time_per_iteration": 2.447065830230713 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.0183022, + "balance_loss_mlp": 1.04326463, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4279244162742584, + "language_loss": 0.73232102, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75389397, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8203125, + "step": 5399, + "time_per_iteration": 2.466137409210205 + }, + { + "auxiliary_loss_clip": 0.01129831, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.02016079, + "balance_loss_mlp": 1.04749155, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.110147681467196, + "language_loss": 0.71391356, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73556215, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5400, + "time_per_iteration": 2.484243631362915 + }, + { + "auxiliary_loss_clip": 0.01128373, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.04439175, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 3.048924003265154, + "language_loss": 0.79583031, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81746894, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5401, + "time_per_iteration": 2.5695505142211914 + }, + { + "auxiliary_loss_clip": 0.01130508, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.04700303, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4209306386542333, + "language_loss": 0.87675726, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89848959, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 5402, + "time_per_iteration": 2.4811201095581055 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.04369164, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.934597728175988, + "language_loss": 0.84513289, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86672628, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5403, + "time_per_iteration": 2.418501377105713 + }, + { + "auxiliary_loss_clip": 0.01129275, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.02852631, + "balance_loss_mlp": 1.05024314, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.0001546098828955, + "language_loss": 0.87642342, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89813483, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5404, + "time_per_iteration": 2.5094971656799316 + }, + { + "auxiliary_loss_clip": 0.01125879, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.02413273, + "balance_loss_mlp": 1.04579973, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6336968005079966, + "language_loss": 0.72491479, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74656296, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5405, + "time_per_iteration": 2.4927978515625 + }, + { + "auxiliary_loss_clip": 0.01125757, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.02147698, + "balance_loss_mlp": 1.04514825, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8637721662214948, + "language_loss": 0.83356953, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85518444, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80859375, + "step": 5406, + "time_per_iteration": 2.534508228302002 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.02241969, + "balance_loss_mlp": 1.0469048, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.836635199790601, + "language_loss": 0.8826412, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90428072, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5407, + "time_per_iteration": 2.4199326038360596 + }, + { + "auxiliary_loss_clip": 0.01127405, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.04602861, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.5140887230520799, + "language_loss": 0.69643426, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71806979, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5408, + "time_per_iteration": 2.5646731853485107 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.04438102, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.6429750268405912, + "language_loss": 0.77442145, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79608637, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 5409, + "time_per_iteration": 2.450200080871582 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.03142262, + "balance_loss_mlp": 1.04331136, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.3862040562488716, + "language_loss": 0.83582234, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85758531, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5410, + "time_per_iteration": 2.5161662101745605 + }, + { + "auxiliary_loss_clip": 0.01121858, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02089429, + "balance_loss_mlp": 1.04224813, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.5577179591930796, + "language_loss": 0.71270931, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73427641, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5411, + "time_per_iteration": 2.4465057849884033 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02544606, + "balance_loss_mlp": 1.04381669, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6024997274503978, + "language_loss": 0.83103073, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85267961, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.81640625, + "step": 5412, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.01129762, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.01963782, + "balance_loss_mlp": 1.04417348, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 2.3149031646834577, + "language_loss": 0.80794364, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82959628, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5413, + "time_per_iteration": 2.483309030532837 + }, + { + "auxiliary_loss_clip": 0.01128818, + "auxiliary_loss_mlp": 0.01042957, + "balance_loss_clip": 1.02723312, + "balance_loss_mlp": 1.04606462, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.5892127721025033, + "language_loss": 0.76887989, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79059768, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5414, + "time_per_iteration": 2.4696640968322754 + }, + { + "auxiliary_loss_clip": 0.01039619, + "auxiliary_loss_mlp": 0.01008091, + "balance_loss_clip": 1.00601661, + "balance_loss_mlp": 1.01271892, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9084647328862615, + "language_loss": 0.64009887, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66057593, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.26953125, + "step": 5415, + "time_per_iteration": 2.982389450073242 + }, + { + "auxiliary_loss_clip": 0.01124624, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.02214265, + "balance_loss_mlp": 1.04286838, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 2.942597496869342, + "language_loss": 0.74265057, + "learning_rate": 3.151146171224075e-06, + "loss": 0.764265, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5416, + "time_per_iteration": 2.526956558227539 + }, + { + "auxiliary_loss_clip": 0.01039656, + "auxiliary_loss_mlp": 0.01005548, + "balance_loss_clip": 1.00335431, + "balance_loss_mlp": 1.01254702, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7736939008633222, + "language_loss": 0.57947183, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59992385, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.26953125, + "step": 5417, + "time_per_iteration": 3.1500296592712402 + }, + { + "auxiliary_loss_clip": 0.01038219, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.0002141, + "balance_loss_mlp": 1.01140058, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.9133944403169288, + "language_loss": 0.63476181, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65516579, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.26953125, + "step": 5418, + "time_per_iteration": 3.1724026203155518 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.02739, + "balance_loss_mlp": 1.0441196, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 3.240595355482155, + "language_loss": 0.69061959, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71229619, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5419, + "time_per_iteration": 2.4643847942352295 + }, + { + "auxiliary_loss_clip": 0.01125895, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.01951957, + "balance_loss_mlp": 1.04326844, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 2.1209544014848443, + "language_loss": 0.77064359, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79225302, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5420, + "time_per_iteration": 2.5241270065307617 + }, + { + "auxiliary_loss_clip": 0.01128645, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.04400003, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.4823274263144444, + "language_loss": 0.80134791, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82298517, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5421, + "time_per_iteration": 2.5376439094543457 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02359045, + "balance_loss_mlp": 1.04254711, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5045024534641303, + "language_loss": 0.75446749, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77606434, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5422, + "time_per_iteration": 2.5713820457458496 + }, + { + "auxiliary_loss_clip": 0.01128336, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02324986, + "balance_loss_mlp": 1.04553628, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.780294141224906, + "language_loss": 0.62795889, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64963388, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5423, + "time_per_iteration": 2.4667959213256836 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.04085255, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 4.488088575635961, + "language_loss": 0.74664211, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76814055, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 5424, + "time_per_iteration": 2.488187313079834 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.02231038, + "balance_loss_mlp": 1.04298568, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6359586167011877, + "language_loss": 0.76958472, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79116821, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5425, + "time_per_iteration": 2.5025157928466797 + }, + { + "auxiliary_loss_clip": 0.01127865, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.03051138, + "balance_loss_mlp": 1.04193544, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 4.663874352034687, + "language_loss": 0.78857136, + "learning_rate": 3.147959166423428e-06, + "loss": 0.8103227, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5426, + "time_per_iteration": 2.484064817428589 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.02116871, + "balance_loss_mlp": 1.04324198, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.7688447582142532, + "language_loss": 0.74363142, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76525187, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.81640625, + "step": 5427, + "time_per_iteration": 2.4785962104797363 + }, + { + "auxiliary_loss_clip": 0.0112706, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_clip": 1.02742934, + "balance_loss_mlp": 1.04290414, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.911492416062928, + "language_loss": 0.79305124, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8147524, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83984375, + "step": 5428, + "time_per_iteration": 3.9864413738250732 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.02597678, + "balance_loss_mlp": 1.04084587, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7222830625250152, + "language_loss": 0.71369523, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73534036, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5429, + "time_per_iteration": 3.8856096267700195 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.02448976, + "balance_loss_mlp": 1.04308093, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.889570703315701, + "language_loss": 0.78612322, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80775696, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5430, + "time_per_iteration": 2.4374818801879883 + }, + { + "auxiliary_loss_clip": 0.01128219, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02077675, + "balance_loss_mlp": 1.04359281, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.8594684871120744, + "language_loss": 0.83897448, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86063492, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84765625, + "step": 5431, + "time_per_iteration": 2.4513139724731445 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.02431297, + "balance_loss_mlp": 1.04116321, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.7565110160676718, + "language_loss": 0.70459324, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72619462, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5432, + "time_per_iteration": 2.529365301132202 + }, + { + "auxiliary_loss_clip": 0.01123519, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.02182746, + "balance_loss_mlp": 1.04076195, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4205622330102, + "language_loss": 0.84161848, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86321318, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5433, + "time_per_iteration": 2.4302597045898438 + }, + { + "auxiliary_loss_clip": 0.01123612, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.02132881, + "balance_loss_mlp": 1.0439055, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.4699213962063424, + "language_loss": 0.85906386, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88065541, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 5434, + "time_per_iteration": 2.496676445007324 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.02083361, + "balance_loss_mlp": 1.04468119, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.8331918492971015, + "language_loss": 0.87817061, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89981961, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5435, + "time_per_iteration": 2.51159405708313 + }, + { + "auxiliary_loss_clip": 0.0112533, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.02140474, + "balance_loss_mlp": 1.04326773, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.5496215899058443, + "language_loss": 0.76460963, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78622043, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5436, + "time_per_iteration": 2.43637752532959 + }, + { + "auxiliary_loss_clip": 0.01125315, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02040625, + "balance_loss_mlp": 1.04435849, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.5905557916714361, + "language_loss": 0.72127515, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74287689, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5437, + "time_per_iteration": 2.493673086166382 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.02236819, + "balance_loss_mlp": 1.04143524, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.6336098458574233, + "language_loss": 0.64049256, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66214842, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 5438, + "time_per_iteration": 2.5062596797943115 + }, + { + "auxiliary_loss_clip": 0.01126505, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01943088, + "balance_loss_mlp": 1.04510128, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.5452802319075516, + "language_loss": 0.74544024, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76704717, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5439, + "time_per_iteration": 2.501279830932617 + }, + { + "auxiliary_loss_clip": 0.01126727, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.02985907, + "balance_loss_mlp": 1.04374349, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.6196339079167323, + "language_loss": 0.75183308, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77355272, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5440, + "time_per_iteration": 2.507341146469116 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.03317571, + "balance_loss_mlp": 1.04308057, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9066250681455874, + "language_loss": 0.84613734, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86785924, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5441, + "time_per_iteration": 2.4737346172332764 + }, + { + "auxiliary_loss_clip": 0.01126255, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.02743292, + "balance_loss_mlp": 1.04209113, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.9602585650153952, + "language_loss": 0.8673979, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88908899, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5442, + "time_per_iteration": 2.4779980182647705 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02193677, + "balance_loss_mlp": 1.04526424, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.8849886885636646, + "language_loss": 0.77500421, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79669178, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8515625, + "step": 5443, + "time_per_iteration": 2.5263850688934326 + }, + { + "auxiliary_loss_clip": 0.01126577, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02428412, + "balance_loss_mlp": 1.04207098, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.0180593262473487, + "language_loss": 0.81630802, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83796823, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5444, + "time_per_iteration": 2.447061061859131 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.02335095, + "balance_loss_mlp": 1.04356718, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.9587875585664523, + "language_loss": 0.59421074, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61585242, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5445, + "time_per_iteration": 2.4542667865753174 + }, + { + "auxiliary_loss_clip": 0.01128674, + "auxiliary_loss_mlp": 0.01039464, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.04482532, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.043321690225375, + "language_loss": 0.88286638, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90454781, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8359375, + "step": 5446, + "time_per_iteration": 2.4518625736236572 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.02577102, + "balance_loss_mlp": 1.04609275, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.9059445881205361, + "language_loss": 0.78455317, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80631441, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87109375, + "step": 5447, + "time_per_iteration": 2.488555669784546 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.02621138, + "balance_loss_mlp": 1.04297531, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.7948266966340543, + "language_loss": 0.73349774, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75515163, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.82421875, + "step": 5448, + "time_per_iteration": 2.460759162902832 + }, + { + "auxiliary_loss_clip": 0.01125074, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.02788281, + "balance_loss_mlp": 1.04221821, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.3797343272994427, + "language_loss": 0.66896623, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69065142, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5449, + "time_per_iteration": 2.5101547241210938 + }, + { + "auxiliary_loss_clip": 0.01125182, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.02111173, + "balance_loss_mlp": 1.04373384, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.3889431777217922, + "language_loss": 0.65617704, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67778659, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5450, + "time_per_iteration": 2.4815587997436523 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.02644145, + "balance_loss_mlp": 1.04330397, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5376267502191867, + "language_loss": 0.77276003, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.7944392, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5451, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.0112906, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02348745, + "balance_loss_mlp": 1.04470944, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.4373215337565015, + "language_loss": 0.7011131, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72279859, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5452, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01121729, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01944947, + "balance_loss_mlp": 1.04188132, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.7019757848824575, + "language_loss": 0.78734571, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80890715, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5453, + "time_per_iteration": 2.493701219558716 + }, + { + "auxiliary_loss_clip": 0.01126073, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.01610184, + "balance_loss_mlp": 1.04306984, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.2894918901687333, + "language_loss": 0.75428879, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77585566, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5454, + "time_per_iteration": 2.5295286178588867 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02382326, + "balance_loss_mlp": 1.04198301, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.0725507665811826, + "language_loss": 0.77059573, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79217887, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5455, + "time_per_iteration": 2.426988124847412 + }, + { + "auxiliary_loss_clip": 0.0112837, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02304173, + "balance_loss_mlp": 1.04281068, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.669914346129418, + "language_loss": 0.74029738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76197511, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.85546875, + "step": 5456, + "time_per_iteration": 2.512131929397583 + }, + { + "auxiliary_loss_clip": 0.01126084, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03444123, + "balance_loss_mlp": 1.04250574, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.518027485126158, + "language_loss": 0.78283882, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80459797, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5457, + "time_per_iteration": 2.4819135665893555 + }, + { + "auxiliary_loss_clip": 0.0112739, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.02432334, + "balance_loss_mlp": 1.04155684, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.199350012619834, + "language_loss": 0.79332864, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81499034, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5458, + "time_per_iteration": 2.4749457836151123 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.01988721, + "balance_loss_mlp": 1.04204702, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 4.694290331797846, + "language_loss": 0.72896576, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75055289, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5459, + "time_per_iteration": 2.4506032466888428 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.02303815, + "balance_loss_mlp": 1.04444695, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8402325574836436, + "language_loss": 0.84511495, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86677814, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5460, + "time_per_iteration": 2.521491527557373 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02176023, + "balance_loss_mlp": 1.0420599, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7736363390075318, + "language_loss": 0.76822042, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78982782, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.83203125, + "step": 5461, + "time_per_iteration": 2.4919962882995605 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02015376, + "balance_loss_mlp": 1.04589903, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.6989905310418616, + "language_loss": 0.62835252, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65001822, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 5462, + "time_per_iteration": 2.6128923892974854 + }, + { + "auxiliary_loss_clip": 0.0112585, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.04426169, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.8014296603715538, + "language_loss": 0.78155506, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80315304, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5463, + "time_per_iteration": 2.5255165100097656 + }, + { + "auxiliary_loss_clip": 0.0112647, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.02001238, + "balance_loss_mlp": 1.04409099, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.049558292675733, + "language_loss": 0.7029627, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72457188, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5464, + "time_per_iteration": 2.460951089859009 + }, + { + "auxiliary_loss_clip": 0.01127719, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02505457, + "balance_loss_mlp": 1.04683673, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6142145677103121, + "language_loss": 0.72746348, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74913716, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5465, + "time_per_iteration": 2.4767887592315674 + }, + { + "auxiliary_loss_clip": 0.01128882, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.02208447, + "balance_loss_mlp": 1.04690027, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.6282981827525145, + "language_loss": 0.82756901, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.84922415, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5466, + "time_per_iteration": 2.463127613067627 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.02343404, + "balance_loss_mlp": 1.04421949, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.6977355395672606, + "language_loss": 0.79485095, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81649983, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5467, + "time_per_iteration": 2.482245683670044 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.02011502, + "balance_loss_mlp": 1.0452255, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5356074654715184, + "language_loss": 0.74795353, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76958692, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5468, + "time_per_iteration": 2.4828743934631348 + }, + { + "auxiliary_loss_clip": 0.01136832, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.02467322, + "balance_loss_mlp": 1.04996455, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.8525214053644714, + "language_loss": 0.78469932, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8064791, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5469, + "time_per_iteration": 2.455672264099121 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.02008545, + "balance_loss_mlp": 1.04602098, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.646072726718358, + "language_loss": 0.82014406, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84178579, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5470, + "time_per_iteration": 5.531651020050049 + }, + { + "auxiliary_loss_clip": 0.0112936, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.02315605, + "balance_loss_mlp": 1.04359245, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.806312825179731, + "language_loss": 0.67675972, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69843686, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5471, + "time_per_iteration": 2.7400858402252197 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.04856122, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.6357076803377442, + "language_loss": 0.65059721, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67237478, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5472, + "time_per_iteration": 2.530604124069214 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.0271014, + "balance_loss_mlp": 1.04821706, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6631612231063349, + "language_loss": 0.88497955, + "learning_rate": 3.13292213457912e-06, + "loss": 0.9067443, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 5473, + "time_per_iteration": 2.521026611328125 + }, + { + "auxiliary_loss_clip": 0.01133162, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.02669442, + "balance_loss_mlp": 1.0483191, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 2.3087074790673423, + "language_loss": 0.78349268, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80525613, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 5474, + "time_per_iteration": 2.4769628047943115 + }, + { + "auxiliary_loss_clip": 0.01047146, + "auxiliary_loss_mlp": 0.00999487, + "balance_loss_clip": 0.99740046, + "balance_loss_mlp": 1.02056372, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.888273800575083, + "language_loss": 0.60237771, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62284404, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.265625, + "step": 5475, + "time_per_iteration": 3.039971351623535 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01051514, + "balance_loss_clip": 1.03437138, + "balance_loss_mlp": 1.04512429, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5350164106808766, + "language_loss": 0.76634103, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78818846, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5476, + "time_per_iteration": 2.488698959350586 + }, + { + "auxiliary_loss_clip": 0.01131587, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.02640307, + "balance_loss_mlp": 1.04819024, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.8435246505513339, + "language_loss": 0.74520677, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76693243, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5477, + "time_per_iteration": 2.533641815185547 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.02036786, + "balance_loss_mlp": 1.04507232, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 1.9138938380730264, + "language_loss": 0.75581098, + "learning_rate": 3.131316843357713e-06, + "loss": 0.7773999, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5478, + "time_per_iteration": 2.4541866779327393 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.02218664, + "balance_loss_mlp": 1.04736805, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.6780134795902322, + "language_loss": 0.80241555, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82407916, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5479, + "time_per_iteration": 2.5348050594329834 + }, + { + "auxiliary_loss_clip": 0.01046129, + "auxiliary_loss_mlp": 0.01003977, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.01921439, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7411588561506779, + "language_loss": 0.56543052, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58593154, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.26953125, + "step": 5480, + "time_per_iteration": 3.121812343597412 + }, + { + "auxiliary_loss_clip": 0.01128951, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02871847, + "balance_loss_mlp": 1.04606879, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.656023636160042, + "language_loss": 0.77029848, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79203057, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5481, + "time_per_iteration": 2.4819936752319336 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02028024, + "balance_loss_mlp": 1.04622722, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.8057287203311059, + "language_loss": 0.78732938, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80897224, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5482, + "time_per_iteration": 2.501615285873413 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02601135, + "balance_loss_mlp": 1.04573894, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.6414395423474737, + "language_loss": 0.74055123, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76226085, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5483, + "time_per_iteration": 2.5213518142700195 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0187676, + "balance_loss_mlp": 1.04614615, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8373674608308554, + "language_loss": 0.75627816, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77788723, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5484, + "time_per_iteration": 2.543795108795166 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04699099, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.1329507570753243, + "language_loss": 0.7209897, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74267334, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5485, + "time_per_iteration": 2.4598846435546875 + }, + { + "auxiliary_loss_clip": 0.01124565, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.04448354, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.7963509228415293, + "language_loss": 0.80416954, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8258158, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5486, + "time_per_iteration": 2.5368754863739014 + }, + { + "auxiliary_loss_clip": 0.011236, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.02264309, + "balance_loss_mlp": 1.04300976, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3473245188806056, + "language_loss": 0.84351611, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86512625, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5487, + "time_per_iteration": 2.5140841007232666 + }, + { + "auxiliary_loss_clip": 0.01131842, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.02440929, + "balance_loss_mlp": 1.04636502, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.289610395509379, + "language_loss": 0.74163198, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76335323, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5488, + "time_per_iteration": 2.4159257411956787 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.02519917, + "balance_loss_mlp": 1.04548192, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.3379517114480004, + "language_loss": 0.72564352, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74732298, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5489, + "time_per_iteration": 2.4810056686401367 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.04076719, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.5348585918072235, + "language_loss": 0.88752508, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90908241, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5490, + "time_per_iteration": 2.448437452316284 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.02022719, + "balance_loss_mlp": 1.0403626, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.9493471797358817, + "language_loss": 0.83395195, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85551059, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5491, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.01126063, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.02492499, + "balance_loss_mlp": 1.04421842, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.715750342336911, + "language_loss": 0.77514994, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79680943, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5492, + "time_per_iteration": 2.4870479106903076 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.0269649, + "balance_loss_mlp": 1.04629827, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.2776411561569265, + "language_loss": 0.7450884, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76683223, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5493, + "time_per_iteration": 2.4506607055664062 + }, + { + "auxiliary_loss_clip": 0.01045286, + "auxiliary_loss_mlp": 0.01012729, + "balance_loss_clip": 1.01074982, + "balance_loss_mlp": 1.01881337, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7955029917088393, + "language_loss": 0.53910893, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55968904, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.265625, + "step": 5494, + "time_per_iteration": 3.0042550563812256 + }, + { + "auxiliary_loss_clip": 0.01124159, + "auxiliary_loss_mlp": 0.01037133, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.04378355, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.6073630563578136, + "language_loss": 0.87087989, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89249277, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5495, + "time_per_iteration": 2.4716837406158447 + }, + { + "auxiliary_loss_clip": 0.01128875, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.03133559, + "balance_loss_mlp": 1.04508138, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 3.5655917637781784, + "language_loss": 0.73526418, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75703049, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8359375, + "step": 5496, + "time_per_iteration": 2.531670570373535 + }, + { + "auxiliary_loss_clip": 0.01124295, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.01509058, + "balance_loss_mlp": 1.04384971, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.1703031984353514, + "language_loss": 0.72764325, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74917477, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5497, + "time_per_iteration": 2.5148839950561523 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.04340625, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.5654673530164307, + "language_loss": 0.80193126, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82350206, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5498, + "time_per_iteration": 2.517765522003174 + }, + { + "auxiliary_loss_clip": 0.01123393, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02397776, + "balance_loss_mlp": 1.03977811, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1435474357237405, + "language_loss": 0.76491725, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78653955, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5499, + "time_per_iteration": 2.5006067752838135 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 1.04131985, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.506886865759599, + "language_loss": 0.79332948, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81487471, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5500, + "time_per_iteration": 2.4859495162963867 + }, + { + "auxiliary_loss_clip": 0.01129022, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01607347, + "balance_loss_mlp": 1.04564214, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.164639953437845, + "language_loss": 0.66065335, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68225485, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 5501, + "time_per_iteration": 2.6189892292022705 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.04285216, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.260615362067107, + "language_loss": 0.77580702, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79748642, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5502, + "time_per_iteration": 2.4086782932281494 + }, + { + "auxiliary_loss_clip": 0.01130061, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.04632545, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 2.045089737815956, + "language_loss": 0.72346115, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74515176, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8359375, + "step": 5503, + "time_per_iteration": 2.5176749229431152 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01865053, + "balance_loss_mlp": 1.04248357, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.5402224202893484, + "language_loss": 0.75216055, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77374506, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5504, + "time_per_iteration": 2.530212879180908 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02438283, + "balance_loss_mlp": 1.04382253, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.6148817370045387, + "language_loss": 0.70049053, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72214913, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5505, + "time_per_iteration": 2.5212292671203613 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.02720845, + "balance_loss_mlp": 1.04601455, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.586520967819923, + "language_loss": 0.81541443, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83709103, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5506, + "time_per_iteration": 2.5494561195373535 + }, + { + "auxiliary_loss_clip": 0.01128621, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.02277398, + "balance_loss_mlp": 1.04704857, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.929478423939084, + "language_loss": 0.79097712, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81264055, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5507, + "time_per_iteration": 2.498659610748291 + }, + { + "auxiliary_loss_clip": 0.01123401, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.02594829, + "balance_loss_mlp": 1.04136062, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.6667627205960738, + "language_loss": 0.71733725, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73897743, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5508, + "time_per_iteration": 2.478593111038208 + }, + { + "auxiliary_loss_clip": 0.01124563, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01566064, + "balance_loss_mlp": 1.04539418, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.030813517097255, + "language_loss": 0.72023594, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74177837, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5509, + "time_per_iteration": 2.539806842803955 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.01975, + "balance_loss_mlp": 1.04503942, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.5191607241878, + "language_loss": 0.73049426, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75209701, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5510, + "time_per_iteration": 2.536083698272705 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.0429213, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.1286159820346984, + "language_loss": 0.87371129, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89530391, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5511, + "time_per_iteration": 2.4380695819854736 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.01986468, + "balance_loss_mlp": 1.04396749, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6025966363766477, + "language_loss": 0.72926772, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7507937, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5512, + "time_per_iteration": 5.464786767959595 + }, + { + "auxiliary_loss_clip": 0.01124343, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.02759719, + "balance_loss_mlp": 1.04466701, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.8365879467062751, + "language_loss": 0.72230887, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.7439692, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5513, + "time_per_iteration": 2.6175873279571533 + }, + { + "auxiliary_loss_clip": 0.01128264, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.01972222, + "balance_loss_mlp": 1.04398656, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.8557947519919487, + "language_loss": 0.68629253, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70792234, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5514, + "time_per_iteration": 2.4340810775756836 + }, + { + "auxiliary_loss_clip": 0.01127749, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.0251019, + "balance_loss_mlp": 1.04505849, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.411486097564539, + "language_loss": 0.66439879, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.6860956, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5515, + "time_per_iteration": 2.4983339309692383 + }, + { + "auxiliary_loss_clip": 0.01124572, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01960468, + "balance_loss_mlp": 1.04258537, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.4970111675637168, + "language_loss": 0.69111156, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71270084, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5516, + "time_per_iteration": 2.515367031097412 + }, + { + "auxiliary_loss_clip": 0.0112502, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.02154398, + "balance_loss_mlp": 1.04021645, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.365933570102145, + "language_loss": 0.80287617, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82448685, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 5517, + "time_per_iteration": 2.5149497985839844 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.04258931, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 2.188422581245926, + "language_loss": 0.74551105, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76709294, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5518, + "time_per_iteration": 2.450188159942627 + }, + { + "auxiliary_loss_clip": 0.01048984, + "auxiliary_loss_mlp": 0.01008888, + "balance_loss_clip": 1.00682592, + "balance_loss_mlp": 1.02244139, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6172932492598038, + "language_loss": 0.54346693, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56404567, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.265625, + "step": 5519, + "time_per_iteration": 3.167750358581543 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.0239042, + "balance_loss_mlp": 1.0434345, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 3.8105825888408855, + "language_loss": 0.78854358, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81018245, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5520, + "time_per_iteration": 2.451781988143921 + }, + { + "auxiliary_loss_clip": 0.01121269, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01835227, + "balance_loss_mlp": 1.04244733, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.656623957411012, + "language_loss": 0.76576293, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78729689, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7890625, + "step": 5521, + "time_per_iteration": 2.525865077972412 + }, + { + "auxiliary_loss_clip": 0.01126792, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.02932894, + "balance_loss_mlp": 1.04259682, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 3.3004720611075964, + "language_loss": 0.70353854, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72525376, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5522, + "time_per_iteration": 2.472001791000366 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.01739514, + "balance_loss_mlp": 1.04362595, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7154852702320889, + "language_loss": 0.74052203, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76206541, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5523, + "time_per_iteration": 2.4924776554107666 + }, + { + "auxiliary_loss_clip": 0.01122263, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.0211792, + "balance_loss_mlp": 1.04308188, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.6905303226226114, + "language_loss": 0.82272083, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84430826, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 5524, + "time_per_iteration": 2.439711332321167 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.0251627, + "balance_loss_mlp": 1.04402184, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6540586406432352, + "language_loss": 0.8307848, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85240501, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.79296875, + "step": 5525, + "time_per_iteration": 2.4927310943603516 + }, + { + "auxiliary_loss_clip": 0.01044531, + "auxiliary_loss_mlp": 0.01006175, + "balance_loss_clip": 1.00405347, + "balance_loss_mlp": 1.01804781, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7716933739699889, + "language_loss": 0.5260945, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54660153, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.265625, + "step": 5526, + "time_per_iteration": 3.0598835945129395 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.02724671, + "balance_loss_mlp": 1.04371929, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.1037159361855737, + "language_loss": 0.77490491, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79659784, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 5527, + "time_per_iteration": 2.4878480434417725 + }, + { + "auxiliary_loss_clip": 0.01126946, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.03025246, + "balance_loss_mlp": 1.04651201, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 2.9813221594214494, + "language_loss": 0.72143763, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74314719, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5528, + "time_per_iteration": 2.4562795162200928 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02256346, + "balance_loss_mlp": 1.04463542, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.7054310511699202, + "language_loss": 0.82638806, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5529, + "time_per_iteration": 2.474243640899658 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.02223659, + "balance_loss_mlp": 1.04554248, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.9738718949190572, + "language_loss": 0.69718957, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71884924, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83984375, + "step": 5530, + "time_per_iteration": 2.471686840057373 + }, + { + "auxiliary_loss_clip": 0.01127236, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02996182, + "balance_loss_mlp": 1.04500127, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.4616968900166643, + "language_loss": 0.7616601, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78338665, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5531, + "time_per_iteration": 2.473328113555908 + }, + { + "auxiliary_loss_clip": 0.01128043, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.04481292, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7553607817915955, + "language_loss": 0.73413068, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75578588, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5532, + "time_per_iteration": 2.4864931106567383 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.01321709, + "balance_loss_mlp": 1.04721618, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.2280638741168057, + "language_loss": 0.65813714, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67969465, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8203125, + "step": 5533, + "time_per_iteration": 2.5232229232788086 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.02541876, + "balance_loss_mlp": 1.04451632, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.9248590192503388, + "language_loss": 0.70790148, + "learning_rate": 3.113264663362451e-06, + "loss": 0.72957367, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5534, + "time_per_iteration": 2.418875217437744 + }, + { + "auxiliary_loss_clip": 0.01125629, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01890588, + "balance_loss_mlp": 1.04565191, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.8142926842561948, + "language_loss": 0.6684956, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69008601, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5535, + "time_per_iteration": 2.5031726360321045 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02450383, + "balance_loss_mlp": 1.04416704, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.1308907042960525, + "language_loss": 0.72915065, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75080466, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5536, + "time_per_iteration": 2.494007110595703 + }, + { + "auxiliary_loss_clip": 0.01126771, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02474046, + "balance_loss_mlp": 1.0450089, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.6653416647198893, + "language_loss": 0.81801486, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83966869, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5537, + "time_per_iteration": 2.611788272857666 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.02805638, + "balance_loss_mlp": 1.04771638, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.938500745409862, + "language_loss": 0.71606827, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73780894, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83984375, + "step": 5538, + "time_per_iteration": 2.538574695587158 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01642489, + "balance_loss_mlp": 1.04461074, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.0173985756025417, + "language_loss": 0.7442342, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76578778, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8046875, + "step": 5539, + "time_per_iteration": 2.539393424987793 + }, + { + "auxiliary_loss_clip": 0.01132315, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.03062367, + "balance_loss_mlp": 1.04543138, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.8798801752229715, + "language_loss": 0.70726681, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.72904468, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5540, + "time_per_iteration": 2.460745096206665 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.02156138, + "balance_loss_mlp": 1.04151917, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.212860979219503, + "language_loss": 0.60678709, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62837738, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5541, + "time_per_iteration": 2.643308162689209 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.0256207, + "balance_loss_mlp": 1.04428339, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.7250198470895146, + "language_loss": 0.68636936, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70806885, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 5542, + "time_per_iteration": 2.472029209136963 + }, + { + "auxiliary_loss_clip": 0.0112742, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02362895, + "balance_loss_mlp": 1.04488277, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6472310915335262, + "language_loss": 0.75526464, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77691472, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5543, + "time_per_iteration": 2.453550100326538 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02342415, + "balance_loss_mlp": 1.04834402, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.6694578175563026, + "language_loss": 0.75282717, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77452493, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5544, + "time_per_iteration": 2.486992835998535 + }, + { + "auxiliary_loss_clip": 0.01124934, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01910329, + "balance_loss_mlp": 1.04350412, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.4864809930890506, + "language_loss": 0.70886022, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73044181, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5545, + "time_per_iteration": 2.5813279151916504 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.02333164, + "balance_loss_mlp": 1.04530168, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.7150542013191912, + "language_loss": 0.69300294, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7146256, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5546, + "time_per_iteration": 2.4564788341522217 + }, + { + "auxiliary_loss_clip": 0.01129119, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04343665, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.6632006519185205, + "language_loss": 0.64804697, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66971648, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5547, + "time_per_iteration": 2.554959774017334 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.01782918, + "balance_loss_mlp": 1.0467664, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.454082693277369, + "language_loss": 0.856148, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87773478, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.8125, + "step": 5548, + "time_per_iteration": 2.451032876968384 + }, + { + "auxiliary_loss_clip": 0.01129139, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.02100003, + "balance_loss_mlp": 1.04508662, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.024965729715467, + "language_loss": 0.74754196, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76919919, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 5549, + "time_per_iteration": 2.6875991821289062 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.02362955, + "balance_loss_mlp": 1.04486775, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.8150391856089545, + "language_loss": 0.68361247, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70528769, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83984375, + "step": 5550, + "time_per_iteration": 2.640758752822876 + }, + { + "auxiliary_loss_clip": 0.0112866, + "auxiliary_loss_mlp": 0.01039899, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.04545677, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.742869766825136, + "language_loss": 0.60666394, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62834954, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.83203125, + "step": 5551, + "time_per_iteration": 2.454871654510498 + }, + { + "auxiliary_loss_clip": 0.01127389, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.0459497, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6119589143573256, + "language_loss": 0.70450759, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72618788, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5552, + "time_per_iteration": 2.4226949214935303 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02297902, + "balance_loss_mlp": 1.04462051, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.0022942324560145, + "language_loss": 0.8289907, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85063589, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.828125, + "step": 5553, + "time_per_iteration": 3.8951358795166016 + }, + { + "auxiliary_loss_clip": 0.01128647, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.04528964, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.095475541363027, + "language_loss": 0.81220448, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83385921, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83203125, + "step": 5554, + "time_per_iteration": 3.8097896575927734 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.02811968, + "balance_loss_mlp": 1.0457983, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4459560856203526, + "language_loss": 0.81277251, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83448291, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5555, + "time_per_iteration": 2.51686954498291 + }, + { + "auxiliary_loss_clip": 0.01126865, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02469552, + "balance_loss_mlp": 1.04441357, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.713035899616047, + "language_loss": 0.74563497, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76728898, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.82421875, + "step": 5556, + "time_per_iteration": 2.550630807876587 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.02459431, + "balance_loss_mlp": 1.04586554, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.4096864083862861, + "language_loss": 0.82588691, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84755093, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5557, + "time_per_iteration": 2.498108148574829 + }, + { + "auxiliary_loss_clip": 0.01129625, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.03055513, + "balance_loss_mlp": 1.04486346, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.7414701325609587, + "language_loss": 0.80056083, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82230997, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84765625, + "step": 5558, + "time_per_iteration": 2.5519607067108154 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02159929, + "balance_loss_mlp": 1.04537535, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.595273660638049, + "language_loss": 0.81953323, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84117764, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.84375, + "step": 5559, + "time_per_iteration": 2.5202248096466064 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02384293, + "balance_loss_mlp": 1.04450536, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.784570608011319, + "language_loss": 0.72027284, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74191785, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5560, + "time_per_iteration": 2.453016757965088 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03118193, + "balance_loss_mlp": 1.04679513, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 2.584817000325422, + "language_loss": 0.74888778, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77068788, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5561, + "time_per_iteration": 2.526980400085449 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02410603, + "balance_loss_mlp": 1.04610825, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.2689753945529176, + "language_loss": 0.69638503, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71806127, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5562, + "time_per_iteration": 2.483530282974243 + }, + { + "auxiliary_loss_clip": 0.01127212, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.02821374, + "balance_loss_mlp": 1.04549575, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.5595683236821118, + "language_loss": 0.65407914, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67576528, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8203125, + "step": 5563, + "time_per_iteration": 2.489734649658203 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.027843, + "balance_loss_mlp": 1.0464654, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 3.650208894964183, + "language_loss": 0.74457055, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5564, + "time_per_iteration": 2.7312686443328857 + }, + { + "auxiliary_loss_clip": 0.01049511, + "auxiliary_loss_mlp": 0.00999253, + "balance_loss_clip": 0.99735802, + "balance_loss_mlp": 1.02280784, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7800603717209338, + "language_loss": 0.55489159, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57537925, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.265625, + "step": 5565, + "time_per_iteration": 3.0266246795654297 + }, + { + "auxiliary_loss_clip": 0.01126829, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02271366, + "balance_loss_mlp": 1.04589689, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7346222757402157, + "language_loss": 0.64754677, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66918564, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80859375, + "step": 5566, + "time_per_iteration": 2.5819363594055176 + }, + { + "auxiliary_loss_clip": 0.01128946, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0259037, + "balance_loss_mlp": 1.04706717, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.73011072762743, + "language_loss": 0.77735972, + "learning_rate": 3.102564641030016e-06, + "loss": 0.7990548, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5567, + "time_per_iteration": 2.508108377456665 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.02480745, + "balance_loss_mlp": 1.04583585, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.719738804733239, + "language_loss": 0.76512182, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78683186, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5568, + "time_per_iteration": 2.4344217777252197 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02973104, + "balance_loss_mlp": 1.04528308, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.265483767853782, + "language_loss": 0.71277773, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73452842, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5569, + "time_per_iteration": 2.462592840194702 + }, + { + "auxiliary_loss_clip": 0.0112772, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.04275155, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.859999754882374, + "language_loss": 0.90291858, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9245472, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5570, + "time_per_iteration": 2.432124614715576 + }, + { + "auxiliary_loss_clip": 0.0112712, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01634383, + "balance_loss_mlp": 1.04461455, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.7333982724081918, + "language_loss": 0.80038494, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82196403, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5571, + "time_per_iteration": 2.52752947807312 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.00998336, + "balance_loss_clip": 0.99651235, + "balance_loss_mlp": 1.01880455, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.9063074837999179, + "language_loss": 0.55948162, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5799194, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5572, + "time_per_iteration": 3.0247979164123535 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.03212237, + "balance_loss_mlp": 1.04797339, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.028320341949736, + "language_loss": 0.78112698, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80290151, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5573, + "time_per_iteration": 2.5152878761291504 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_clip": 1.03143215, + "balance_loss_mlp": 1.04525197, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1279768530108503, + "language_loss": 0.72473001, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7465024, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5574, + "time_per_iteration": 2.543531656265259 + }, + { + "auxiliary_loss_clip": 0.01125319, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.017308, + "balance_loss_mlp": 1.04292774, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 2.78085640379241, + "language_loss": 0.87911499, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90068293, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.82421875, + "step": 5575, + "time_per_iteration": 2.546952724456787 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02899647, + "balance_loss_mlp": 1.04479516, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.569353520757799, + "language_loss": 0.82441479, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84619927, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5576, + "time_per_iteration": 2.414294958114624 + }, + { + "auxiliary_loss_clip": 0.01129312, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.0286808, + "balance_loss_mlp": 1.043697, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 3.008815557703919, + "language_loss": 0.73384887, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75559115, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 5577, + "time_per_iteration": 2.50136399269104 + }, + { + "auxiliary_loss_clip": 0.01131921, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.02667177, + "balance_loss_mlp": 1.04811549, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.7225109171896533, + "language_loss": 0.81555498, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8372944, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5578, + "time_per_iteration": 2.431365728378296 + }, + { + "auxiliary_loss_clip": 0.01125183, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.02277184, + "balance_loss_mlp": 1.04578936, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.8947087551065327, + "language_loss": 0.71785814, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73948246, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 5579, + "time_per_iteration": 2.4519495964050293 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.02191353, + "balance_loss_mlp": 1.0456152, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.0306401350469225, + "language_loss": 0.81084043, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83252287, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5580, + "time_per_iteration": 2.427481174468994 + }, + { + "auxiliary_loss_clip": 0.01130056, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.04496789, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.8687829543354073, + "language_loss": 0.77912092, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80078757, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5581, + "time_per_iteration": 2.5320229530334473 + }, + { + "auxiliary_loss_clip": 0.01132086, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.02735782, + "balance_loss_mlp": 1.04367673, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 5.02896087449, + "language_loss": 0.74623251, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76800376, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 5582, + "time_per_iteration": 2.421482801437378 + }, + { + "auxiliary_loss_clip": 0.0113015, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.02528524, + "balance_loss_mlp": 1.04456937, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.790512330860928, + "language_loss": 0.82143587, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84315073, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 5583, + "time_per_iteration": 2.4543566703796387 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01044573, + "balance_loss_clip": 1.02991009, + "balance_loss_mlp": 1.04491317, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9267692381394996, + "language_loss": 0.7779209, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79964256, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5584, + "time_per_iteration": 2.6100947856903076 + }, + { + "auxiliary_loss_clip": 0.01129164, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02343249, + "balance_loss_mlp": 1.04359186, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.4758908421399493, + "language_loss": 0.75978506, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78145868, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.85546875, + "step": 5585, + "time_per_iteration": 2.4898715019226074 + }, + { + "auxiliary_loss_clip": 0.01121936, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02170694, + "balance_loss_mlp": 1.04066801, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.4987207146888684, + "language_loss": 0.77731383, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79890364, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5586, + "time_per_iteration": 2.4825005531311035 + }, + { + "auxiliary_loss_clip": 0.01136236, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.03070199, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6235624689574053, + "language_loss": 0.81027555, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83212399, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8828125, + "step": 5587, + "time_per_iteration": 2.486459493637085 + }, + { + "auxiliary_loss_clip": 0.01125436, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.0270915, + "balance_loss_mlp": 1.04548144, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7952449023594161, + "language_loss": 0.67014575, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69180894, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 5588, + "time_per_iteration": 2.435070753097534 + }, + { + "auxiliary_loss_clip": 0.01130516, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02784824, + "balance_loss_mlp": 1.04568088, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.6839710852868943, + "language_loss": 0.69882601, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72057241, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5589, + "time_per_iteration": 2.548051118850708 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.02709961, + "balance_loss_mlp": 1.04461861, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 2.1328325025080987, + "language_loss": 0.66886735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69060349, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 5590, + "time_per_iteration": 2.4735047817230225 + }, + { + "auxiliary_loss_clip": 0.01126204, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.02993059, + "balance_loss_mlp": 1.04570127, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.8322479695472769, + "language_loss": 0.73409903, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75581712, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 5591, + "time_per_iteration": 2.4736244678497314 + }, + { + "auxiliary_loss_clip": 0.01127166, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.04408562, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.9183925576882788, + "language_loss": 0.69446647, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.71615982, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5592, + "time_per_iteration": 2.4232676029205322 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.02366149, + "balance_loss_mlp": 1.0442183, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.4700576130478367, + "language_loss": 0.76281321, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78444564, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5593, + "time_per_iteration": 2.4856812953948975 + }, + { + "auxiliary_loss_clip": 0.01128845, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.02703261, + "balance_loss_mlp": 1.04333365, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.2267028217655516, + "language_loss": 0.71435678, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73609149, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8515625, + "step": 5594, + "time_per_iteration": 2.437554359436035 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.02985501, + "balance_loss_mlp": 1.04690135, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.637052204404589, + "language_loss": 0.80350173, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82528448, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5595, + "time_per_iteration": 5.51651668548584 + }, + { + "auxiliary_loss_clip": 0.0112868, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.01964831, + "balance_loss_mlp": 1.04542542, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.8244163047079407, + "language_loss": 0.81611145, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83773112, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5596, + "time_per_iteration": 2.4959781169891357 + }, + { + "auxiliary_loss_clip": 0.01128091, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.02508509, + "balance_loss_mlp": 1.04461718, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.7014468319312177, + "language_loss": 0.76001227, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78168839, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5597, + "time_per_iteration": 2.4965333938598633 + }, + { + "auxiliary_loss_clip": 0.01126223, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.0233258, + "balance_loss_mlp": 1.04597533, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.8007239192940239, + "language_loss": 0.78937811, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81101304, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 5598, + "time_per_iteration": 2.587813377380371 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02036011, + "balance_loss_mlp": 1.04606342, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4664560154247552, + "language_loss": 0.64197004, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66366023, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 5599, + "time_per_iteration": 2.647618293762207 + }, + { + "auxiliary_loss_clip": 0.0113527, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.02685726, + "balance_loss_mlp": 1.0468514, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.652004853610392, + "language_loss": 0.8172245, + "learning_rate": 3.091819088459249e-06, + "loss": 0.83900994, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 5600, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01050016, + "balance_loss_clip": 1.03369582, + "balance_loss_mlp": 1.04399288, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 3.359102963412802, + "language_loss": 0.82717538, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84898043, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 5601, + "time_per_iteration": 2.4369428157806396 + }, + { + "auxiliary_loss_clip": 0.01127768, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.01911497, + "balance_loss_mlp": 1.04890418, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.6511579237160083, + "language_loss": 0.82726496, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.84887075, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5602, + "time_per_iteration": 2.463291645050049 + }, + { + "auxiliary_loss_clip": 0.01130933, + "auxiliary_loss_mlp": 0.01055384, + "balance_loss_clip": 1.04039955, + "balance_loss_mlp": 1.04712546, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.700541242008466, + "language_loss": 0.70208776, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72395098, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5603, + "time_per_iteration": 2.4309756755828857 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.02392292, + "balance_loss_mlp": 1.04724145, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.625433979180813, + "language_loss": 0.82925308, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8509745, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.86328125, + "step": 5604, + "time_per_iteration": 2.4980010986328125 + }, + { + "auxiliary_loss_clip": 0.01129789, + "auxiliary_loss_mlp": 0.01042861, + "balance_loss_clip": 1.02782226, + "balance_loss_mlp": 1.0447166, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 3.2518642032613654, + "language_loss": 0.73756403, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75929046, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 5605, + "time_per_iteration": 2.4563212394714355 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02520752, + "balance_loss_mlp": 1.04604197, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.772980532366942, + "language_loss": 0.83487791, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85660958, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 5606, + "time_per_iteration": 2.456441640853882 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02549469, + "balance_loss_mlp": 1.0414753, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.7790448991820722, + "language_loss": 0.67335433, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69499022, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5607, + "time_per_iteration": 2.4964821338653564 + }, + { + "auxiliary_loss_clip": 0.01130916, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.02694631, + "balance_loss_mlp": 1.04507923, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.032375572186737, + "language_loss": 0.71093041, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 5608, + "time_per_iteration": 2.5247933864593506 + }, + { + "auxiliary_loss_clip": 0.01128549, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0236311, + "balance_loss_mlp": 1.0446682, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.8968208773724307, + "language_loss": 0.79062563, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83984375, + "step": 5609, + "time_per_iteration": 2.439502477645874 + }, + { + "auxiliary_loss_clip": 0.01129667, + "auxiliary_loss_mlp": 0.01042877, + "balance_loss_clip": 1.02706969, + "balance_loss_mlp": 1.04544735, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 2.0456898754189354, + "language_loss": 0.82218611, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84391159, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5610, + "time_per_iteration": 2.502028226852417 + }, + { + "auxiliary_loss_clip": 0.01123686, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.04264688, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8264685829582996, + "language_loss": 0.81998217, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84162486, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5611, + "time_per_iteration": 2.4255177974700928 + }, + { + "auxiliary_loss_clip": 0.01130986, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02728975, + "balance_loss_mlp": 1.04550552, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.5753494383615703, + "language_loss": 0.79407716, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81583023, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5612, + "time_per_iteration": 2.537048578262329 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.02212596, + "balance_loss_mlp": 1.04021907, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.519050824799004, + "language_loss": 0.70024467, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72185683, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5613, + "time_per_iteration": 2.570373773574829 + }, + { + "auxiliary_loss_clip": 0.01129945, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.02203548, + "balance_loss_mlp": 1.04490113, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.6646408753448763, + "language_loss": 0.79615057, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81782216, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5614, + "time_per_iteration": 2.4379053115844727 + }, + { + "auxiliary_loss_clip": 0.01126744, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02331161, + "balance_loss_mlp": 1.04260945, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.8534958586083128, + "language_loss": 0.90879035, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93045861, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5615, + "time_per_iteration": 2.4876632690429688 + }, + { + "auxiliary_loss_clip": 0.0112252, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.04105914, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6970154369052728, + "language_loss": 0.80636102, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82798827, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5616, + "time_per_iteration": 2.476569175720215 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04379678, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.5053489219363754, + "language_loss": 0.84079826, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86255258, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 5617, + "time_per_iteration": 2.4204065799713135 + }, + { + "auxiliary_loss_clip": 0.01125211, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.02190411, + "balance_loss_mlp": 1.04171932, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.648273719366553, + "language_loss": 0.80173457, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82335079, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5618, + "time_per_iteration": 2.4789302349090576 + }, + { + "auxiliary_loss_clip": 0.01128326, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.01895535, + "balance_loss_mlp": 1.04367077, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9548255306646998, + "language_loss": 0.70458674, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72621119, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5619, + "time_per_iteration": 2.4674489498138428 + }, + { + "auxiliary_loss_clip": 0.01127452, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_clip": 1.0322814, + "balance_loss_mlp": 1.04403424, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 5.009208052913787, + "language_loss": 0.69223797, + "learning_rate": 3.085284660993821e-06, + "loss": 0.7139833, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5620, + "time_per_iteration": 2.475889205932617 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01046185, + "balance_loss_clip": 1.03159392, + "balance_loss_mlp": 1.04497766, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.0914960236262075, + "language_loss": 0.67498147, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69671446, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5621, + "time_per_iteration": 2.4732306003570557 + }, + { + "auxiliary_loss_clip": 0.01124388, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.02258897, + "balance_loss_mlp": 1.04336381, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.811430245584347, + "language_loss": 0.82714671, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84875631, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 5622, + "time_per_iteration": 2.5028531551361084 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.0279355, + "balance_loss_mlp": 1.04111528, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4271980952069887, + "language_loss": 0.73785996, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75950313, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5623, + "time_per_iteration": 2.483354091644287 + }, + { + "auxiliary_loss_clip": 0.01044412, + "auxiliary_loss_mlp": 0.01001556, + "balance_loss_clip": 0.99976796, + "balance_loss_mlp": 1.01787817, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7308868621653948, + "language_loss": 0.54898107, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56944072, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.265625, + "step": 5624, + "time_per_iteration": 3.2154293060302734 + }, + { + "auxiliary_loss_clip": 0.01128701, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.02536166, + "balance_loss_mlp": 1.04464245, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.114382300094, + "language_loss": 0.73013008, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75182486, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5625, + "time_per_iteration": 2.4632089138031006 + }, + { + "auxiliary_loss_clip": 0.01129587, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02315879, + "balance_loss_mlp": 1.04408085, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.7442247016960708, + "language_loss": 0.70501375, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72669238, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5626, + "time_per_iteration": 2.4782652854919434 + }, + { + "auxiliary_loss_clip": 0.01123049, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.04265583, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.496721640957227, + "language_loss": 0.81184483, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83341312, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5627, + "time_per_iteration": 2.48683762550354 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.02332532, + "balance_loss_mlp": 1.04643917, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.112092075284961, + "language_loss": 0.80725849, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82897604, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5628, + "time_per_iteration": 2.485978841781616 + }, + { + "auxiliary_loss_clip": 0.01125942, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.01716328, + "balance_loss_mlp": 1.04272234, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9378827683544937, + "language_loss": 0.77360773, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79518872, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 5629, + "time_per_iteration": 2.459749937057495 + }, + { + "auxiliary_loss_clip": 0.0112767, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02604353, + "balance_loss_mlp": 1.0426172, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.750727836719773, + "language_loss": 0.84873146, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87043452, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.84765625, + "step": 5630, + "time_per_iteration": 2.502168655395508 + }, + { + "auxiliary_loss_clip": 0.01128287, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02593017, + "balance_loss_mlp": 1.04496086, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.44277401951878, + "language_loss": 0.71778762, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73946661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5631, + "time_per_iteration": 2.4541988372802734 + }, + { + "auxiliary_loss_clip": 0.01044995, + "auxiliary_loss_mlp": 0.01006836, + "balance_loss_clip": 1.0050118, + "balance_loss_mlp": 1.01844144, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.84858361279948, + "language_loss": 0.56171906, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58223736, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5632, + "time_per_iteration": 3.130112409591675 + }, + { + "auxiliary_loss_clip": 0.01126092, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01925933, + "balance_loss_mlp": 1.04301071, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.4746675536042473, + "language_loss": 0.80288029, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82448882, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5633, + "time_per_iteration": 2.4772210121154785 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01671278, + "balance_loss_mlp": 1.04355168, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.3860801146544692, + "language_loss": 0.59222949, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61380345, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5634, + "time_per_iteration": 2.490783214569092 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.01930678, + "balance_loss_mlp": 1.04328096, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.758176339753219, + "language_loss": 0.92591304, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94749641, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5635, + "time_per_iteration": 2.4895272254943848 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.01594758, + "balance_loss_mlp": 1.04428411, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7397877385381144, + "language_loss": 0.74791968, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.76945299, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5636, + "time_per_iteration": 2.4868686199188232 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.02264357, + "balance_loss_mlp": 1.04291928, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.533650755617547, + "language_loss": 0.83216572, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85377115, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5637, + "time_per_iteration": 5.43249249458313 + }, + { + "auxiliary_loss_clip": 0.0112926, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.02837586, + "balance_loss_mlp": 1.04624391, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.6200031021198193, + "language_loss": 0.70037901, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72211778, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5638, + "time_per_iteration": 2.430814504623413 + }, + { + "auxiliary_loss_clip": 0.01128885, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.02993131, + "balance_loss_mlp": 1.0461942, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.644027939558444, + "language_loss": 0.80699074, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82872897, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5639, + "time_per_iteration": 2.5219810009002686 + }, + { + "auxiliary_loss_clip": 0.01129363, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.02714002, + "balance_loss_mlp": 1.044734, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.006873412015597, + "language_loss": 0.67907631, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70079535, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5640, + "time_per_iteration": 2.4252562522888184 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.02229738, + "balance_loss_mlp": 1.0432744, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.561334672972187, + "language_loss": 0.70158339, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72319156, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5641, + "time_per_iteration": 2.4703073501586914 + }, + { + "auxiliary_loss_clip": 0.01129782, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.02881122, + "balance_loss_mlp": 1.04692698, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7323035027878293, + "language_loss": 0.87336594, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89509839, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5642, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01119376, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.04361117, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.899951429632433, + "language_loss": 0.83783317, + "learning_rate": 3.077749724868924e-06, + "loss": 0.85933256, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 5643, + "time_per_iteration": 2.454176902770996 + }, + { + "auxiliary_loss_clip": 0.01122874, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02779329, + "balance_loss_mlp": 1.04303253, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6286036888414737, + "language_loss": 0.76940101, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79104799, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5644, + "time_per_iteration": 2.46893048286438 + }, + { + "auxiliary_loss_clip": 0.01124612, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.02898121, + "balance_loss_mlp": 1.04242706, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.638882451456986, + "language_loss": 0.62893367, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65061837, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5645, + "time_per_iteration": 2.4539859294891357 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02033067, + "balance_loss_mlp": 1.04122853, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.1237754414429637, + "language_loss": 0.76276195, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78429914, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5646, + "time_per_iteration": 2.4913554191589355 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.04360342, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 1.9547585113359744, + "language_loss": 0.79175937, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81348741, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.85546875, + "step": 5647, + "time_per_iteration": 2.521603584289551 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.02541864, + "balance_loss_mlp": 1.04706085, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.87789373580567, + "language_loss": 0.77358377, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79527068, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 5648, + "time_per_iteration": 2.4812231063842773 + }, + { + "auxiliary_loss_clip": 0.0104448, + "auxiliary_loss_mlp": 0.01001624, + "balance_loss_clip": 0.99964541, + "balance_loss_mlp": 1.01817107, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7825270224300925, + "language_loss": 0.56261832, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5830794, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.26171875, + "step": 5649, + "time_per_iteration": 3.1050350666046143 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.02804756, + "balance_loss_mlp": 1.0422622, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.5021179324123226, + "language_loss": 0.85269898, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87436557, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5650, + "time_per_iteration": 2.5013816356658936 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.01540327, + "balance_loss_mlp": 1.04317355, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.6954461839420942, + "language_loss": 0.70868433, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73020875, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5651, + "time_per_iteration": 2.579455852508545 + }, + { + "auxiliary_loss_clip": 0.01123721, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.02354813, + "balance_loss_mlp": 1.04347372, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.7042541017727943, + "language_loss": 0.81267643, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83428693, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5652, + "time_per_iteration": 2.4690871238708496 + }, + { + "auxiliary_loss_clip": 0.01128696, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02670693, + "balance_loss_mlp": 1.04464078, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.8642865553854127, + "language_loss": 0.77315342, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79485226, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5653, + "time_per_iteration": 2.4836156368255615 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.02225959, + "balance_loss_mlp": 1.04310441, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 4.3033812467068895, + "language_loss": 0.85072839, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87232912, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5654, + "time_per_iteration": 2.4139702320098877 + }, + { + "auxiliary_loss_clip": 0.01122836, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.02853489, + "balance_loss_mlp": 1.04074049, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.132089356193866, + "language_loss": 0.65128249, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67293918, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5655, + "time_per_iteration": 2.475292444229126 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.02781832, + "balance_loss_mlp": 1.04365194, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4436453355930483, + "language_loss": 0.76766688, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78933358, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5656, + "time_per_iteration": 2.550999879837036 + }, + { + "auxiliary_loss_clip": 0.01130894, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.04413342, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.5863892165941962, + "language_loss": 0.82438695, + "learning_rate": 3.073152647447525e-06, + "loss": 0.84608912, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5657, + "time_per_iteration": 2.4573473930358887 + }, + { + "auxiliary_loss_clip": 0.01122831, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.02675629, + "balance_loss_mlp": 1.04342616, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.6511746791476316, + "language_loss": 0.85153604, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87317222, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 5658, + "time_per_iteration": 2.505319833755493 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.0001955, + "balance_loss_mlp": 1.01611352, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8147477326465351, + "language_loss": 0.60012162, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62056863, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.26171875, + "step": 5659, + "time_per_iteration": 3.024125814437866 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.02190423, + "balance_loss_mlp": 1.04398155, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.936270792227836, + "language_loss": 0.67855251, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70013559, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 5660, + "time_per_iteration": 2.5009706020355225 + }, + { + "auxiliary_loss_clip": 0.01125634, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.0307138, + "balance_loss_mlp": 1.04558277, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.6106101267942714, + "language_loss": 0.67213613, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69384885, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80078125, + "step": 5661, + "time_per_iteration": 2.501034736633301 + }, + { + "auxiliary_loss_clip": 0.01123137, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.0241766, + "balance_loss_mlp": 1.04442382, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.9145784194305409, + "language_loss": 0.78845918, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81006938, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5662, + "time_per_iteration": 2.4689018726348877 + }, + { + "auxiliary_loss_clip": 0.01123734, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.02395773, + "balance_loss_mlp": 1.04277706, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.9415115692891318, + "language_loss": 0.73675144, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75838, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5663, + "time_per_iteration": 2.4802587032318115 + }, + { + "auxiliary_loss_clip": 0.01121343, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.02101541, + "balance_loss_mlp": 1.04342198, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 2.0753473798431608, + "language_loss": 0.85900557, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88056058, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.77734375, + "step": 5664, + "time_per_iteration": 2.46343731880188 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.0459125, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.782528704092853, + "language_loss": 0.69047546, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71208799, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.81640625, + "step": 5665, + "time_per_iteration": 2.4448721408843994 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.02006817, + "balance_loss_mlp": 1.04218054, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.296518315240935, + "language_loss": 0.72806692, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.74966413, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8203125, + "step": 5666, + "time_per_iteration": 2.4749717712402344 + }, + { + "auxiliary_loss_clip": 0.01126484, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.02236485, + "balance_loss_mlp": 1.04428983, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5083890198292058, + "language_loss": 0.73306108, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75469005, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5667, + "time_per_iteration": 2.467684030532837 + }, + { + "auxiliary_loss_clip": 0.0104148, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01518095, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8424548288565059, + "language_loss": 0.6331358, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65357018, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.26367188, + "step": 5668, + "time_per_iteration": 3.233991861343384 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.02460372, + "balance_loss_mlp": 1.04407477, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.1457172939364892, + "language_loss": 0.72030753, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74194676, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 5669, + "time_per_iteration": 2.4226186275482178 + }, + { + "auxiliary_loss_clip": 0.01127607, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02078128, + "balance_loss_mlp": 1.04468203, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9050671295461388, + "language_loss": 0.80285168, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82448041, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5670, + "time_per_iteration": 2.4354984760284424 + }, + { + "auxiliary_loss_clip": 0.01127772, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02122176, + "balance_loss_mlp": 1.04374027, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.5994061750955757, + "language_loss": 0.76886785, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79050225, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5671, + "time_per_iteration": 2.4775397777557373 + }, + { + "auxiliary_loss_clip": 0.01125342, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.02513266, + "balance_loss_mlp": 1.04437792, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.9602332848552635, + "language_loss": 0.74416959, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7658239, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5672, + "time_per_iteration": 2.5027272701263428 + }, + { + "auxiliary_loss_clip": 0.01127201, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.04523087, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.991076139860355, + "language_loss": 0.73781157, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75940639, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.8203125, + "step": 5673, + "time_per_iteration": 2.424955368041992 + }, + { + "auxiliary_loss_clip": 0.01123926, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.02243853, + "balance_loss_mlp": 1.04432535, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.774655206888726, + "language_loss": 0.79900169, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8206054, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5674, + "time_per_iteration": 2.490407705307007 + }, + { + "auxiliary_loss_clip": 0.01041345, + "auxiliary_loss_mlp": 0.01001058, + "balance_loss_clip": 0.99942493, + "balance_loss_mlp": 1.01517344, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7963469989165133, + "language_loss": 0.56096685, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58139086, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 5675, + "time_per_iteration": 3.223119020462036 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.0270282, + "balance_loss_mlp": 1.04428756, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6179892480447855, + "language_loss": 0.79029286, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81193566, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5676, + "time_per_iteration": 2.4798848628997803 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01836538, + "balance_loss_mlp": 1.0424788, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8072554320592242, + "language_loss": 0.85598934, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87755597, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5677, + "time_per_iteration": 2.4501733779907227 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.009404852791833, + "language_loss": 0.79283166, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81447315, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5678, + "time_per_iteration": 4.054651260375977 + }, + { + "auxiliary_loss_clip": 0.01123013, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.01983547, + "balance_loss_mlp": 1.04135132, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.8818653655236122, + "language_loss": 0.74546856, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76703185, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.81640625, + "step": 5679, + "time_per_iteration": 3.9024462699890137 + }, + { + "auxiliary_loss_clip": 0.01042201, + "auxiliary_loss_mlp": 0.01001255, + "balance_loss_clip": 0.99970549, + "balance_loss_mlp": 1.01624846, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7519133883291979, + "language_loss": 0.59481025, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61524487, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.25976562, + "step": 5680, + "time_per_iteration": 3.152480125427246 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.01953864, + "balance_loss_mlp": 1.04320455, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 2.208026502208574, + "language_loss": 0.7233687, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74491525, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5681, + "time_per_iteration": 2.4450337886810303 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.02798879, + "balance_loss_mlp": 1.04110432, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.0075854608407058, + "language_loss": 0.7144351, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7360431, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5682, + "time_per_iteration": 2.53000807762146 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.02446055, + "balance_loss_mlp": 1.04079127, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.4570201559150766, + "language_loss": 0.8396616, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86125666, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5683, + "time_per_iteration": 2.511646270751953 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.03189898, + "balance_loss_mlp": 1.04384482, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.5567263249521965, + "language_loss": 0.70622635, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72796011, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.82421875, + "step": 5684, + "time_per_iteration": 2.58811616897583 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02119195, + "balance_loss_mlp": 1.0428822, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.480860615854928, + "language_loss": 0.75386423, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77541268, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.78125, + "step": 5685, + "time_per_iteration": 2.485405445098877 + }, + { + "auxiliary_loss_clip": 0.01120925, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.02399325, + "balance_loss_mlp": 1.04268134, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.6707381387615057, + "language_loss": 0.70186603, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72344351, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.78125, + "step": 5686, + "time_per_iteration": 2.5536224842071533 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.02681327, + "balance_loss_mlp": 1.04087019, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.6880234800017844, + "language_loss": 0.77629769, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79793721, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5687, + "time_per_iteration": 2.4526383876800537 + }, + { + "auxiliary_loss_clip": 0.01122013, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.01947296, + "balance_loss_mlp": 1.04425466, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.7522626505921908, + "language_loss": 0.86505169, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88661563, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 5688, + "time_per_iteration": 2.457873821258545 + }, + { + "auxiliary_loss_clip": 0.01129554, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.02274323, + "balance_loss_mlp": 1.04438853, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6277101200549902, + "language_loss": 0.79875666, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82043588, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5689, + "time_per_iteration": 2.4494895935058594 + }, + { + "auxiliary_loss_clip": 0.01124588, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.04300821, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.0745412821804057, + "language_loss": 0.7351048, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75673485, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5690, + "time_per_iteration": 2.448133945465088 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02270865, + "balance_loss_mlp": 1.03998768, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.433761635396741, + "language_loss": 0.7631194, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78468573, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8046875, + "step": 5691, + "time_per_iteration": 2.479569435119629 + }, + { + "auxiliary_loss_clip": 0.01120907, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.02782106, + "balance_loss_mlp": 1.0415988, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5387604656502187, + "language_loss": 0.68159282, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70321631, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 5692, + "time_per_iteration": 2.490466356277466 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01042741, + "balance_loss_clip": 1.02682638, + "balance_loss_mlp": 1.04275179, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.6924087388900606, + "language_loss": 0.72292894, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74460298, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5693, + "time_per_iteration": 2.451026439666748 + }, + { + "auxiliary_loss_clip": 0.01122133, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.02408743, + "balance_loss_mlp": 1.0417974, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7157866574439644, + "language_loss": 0.75877678, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.78037089, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8046875, + "step": 5694, + "time_per_iteration": 2.499997615814209 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.02320051, + "balance_loss_mlp": 1.04253125, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.9697512050835562, + "language_loss": 0.79815507, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81972229, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 5695, + "time_per_iteration": 2.4279983043670654 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.0248661, + "balance_loss_mlp": 1.04168487, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.7314755849975545, + "language_loss": 0.73487073, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75648957, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5696, + "time_per_iteration": 2.507782459259033 + }, + { + "auxiliary_loss_clip": 0.01122963, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.03301835, + "balance_loss_mlp": 1.0419805, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6676891559017708, + "language_loss": 0.70874155, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73044771, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5697, + "time_per_iteration": 2.4868175983428955 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01949954, + "balance_loss_mlp": 1.04456246, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.6712097888676536, + "language_loss": 0.81875223, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84031999, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 5698, + "time_per_iteration": 2.500499725341797 + }, + { + "auxiliary_loss_clip": 0.01121288, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02712393, + "balance_loss_mlp": 1.03982306, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 1.9988541020523172, + "language_loss": 0.69163442, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71328437, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8125, + "step": 5699, + "time_per_iteration": 2.4522063732147217 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.0424068, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.0139701241951196, + "language_loss": 0.72246462, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74404591, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5700, + "time_per_iteration": 2.4942879676818848 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02018046, + "balance_loss_mlp": 1.04403377, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.026861038115517, + "language_loss": 0.81818259, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83976114, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5701, + "time_per_iteration": 2.4650135040283203 + }, + { + "auxiliary_loss_clip": 0.01124816, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.01886129, + "balance_loss_mlp": 1.04328442, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.616013756330385, + "language_loss": 0.71818215, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73975766, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5702, + "time_per_iteration": 2.446018695831299 + }, + { + "auxiliary_loss_clip": 0.01038258, + "auxiliary_loss_mlp": 0.01007974, + "balance_loss_clip": 1.00623345, + "balance_loss_mlp": 1.01261425, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.78067456401119, + "language_loss": 0.57387871, + "learning_rate": 3.057991990435309e-06, + "loss": 0.5943411, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.2578125, + "step": 5703, + "time_per_iteration": 2.9596943855285645 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.04436553, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.8868866692845514, + "language_loss": 0.74849427, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77017069, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5704, + "time_per_iteration": 2.475206136703491 + }, + { + "auxiliary_loss_clip": 0.01122188, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.0218513, + "balance_loss_mlp": 1.0432725, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.4058395538044572, + "language_loss": 0.73303944, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75461364, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5705, + "time_per_iteration": 2.435140609741211 + }, + { + "auxiliary_loss_clip": 0.0112299, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.0204711, + "balance_loss_mlp": 1.04320812, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 3.54760070735666, + "language_loss": 0.79599071, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81757367, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5706, + "time_per_iteration": 2.4922068119049072 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.04497337, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 1.9921713202453553, + "language_loss": 0.83170593, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85330999, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5707, + "time_per_iteration": 2.441812753677368 + }, + { + "auxiliary_loss_clip": 0.01126551, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.01919019, + "balance_loss_mlp": 1.04623604, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5424527465289883, + "language_loss": 0.75429368, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77589571, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5708, + "time_per_iteration": 2.448415756225586 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02046943, + "balance_loss_mlp": 1.04284358, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6552343197625845, + "language_loss": 0.81159383, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83314145, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 5709, + "time_per_iteration": 2.488879919052124 + }, + { + "auxiliary_loss_clip": 0.01125291, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.0235213, + "balance_loss_mlp": 1.04413152, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.1306910299424677, + "language_loss": 0.79152101, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81316978, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5710, + "time_per_iteration": 2.487224817276001 + }, + { + "auxiliary_loss_clip": 0.01124884, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0209142, + "balance_loss_mlp": 1.04181814, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.821164645381994, + "language_loss": 0.69994622, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72155762, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5711, + "time_per_iteration": 2.471989631652832 + }, + { + "auxiliary_loss_clip": 0.01123068, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02455926, + "balance_loss_mlp": 1.04235482, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.7360043656013842, + "language_loss": 0.68002397, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70164913, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 5712, + "time_per_iteration": 2.440960168838501 + }, + { + "auxiliary_loss_clip": 0.01036814, + "auxiliary_loss_mlp": 0.01007067, + "balance_loss_clip": 1.00550556, + "balance_loss_mlp": 1.011006, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8415582534154722, + "language_loss": 0.58101094, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60144973, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.2578125, + "step": 5713, + "time_per_iteration": 3.018573045730591 + }, + { + "auxiliary_loss_clip": 0.01122962, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.04283524, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6636797952259372, + "language_loss": 0.80745685, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82911092, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5714, + "time_per_iteration": 2.4916322231292725 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02519822, + "balance_loss_mlp": 1.04508591, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.759201097406795, + "language_loss": 0.71844554, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7401129, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5715, + "time_per_iteration": 2.468292474746704 + }, + { + "auxiliary_loss_clip": 0.01036063, + "auxiliary_loss_mlp": 0.01006756, + "balance_loss_clip": 1.00499201, + "balance_loss_mlp": 1.01020741, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8941035310387452, + "language_loss": 0.65942305, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67985129, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 5716, + "time_per_iteration": 3.101933717727661 + }, + { + "auxiliary_loss_clip": 0.0112152, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.02705014, + "balance_loss_mlp": 1.04254961, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.0405702698755657, + "language_loss": 0.74612904, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76775646, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5717, + "time_per_iteration": 2.426793098449707 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01042851, + "balance_loss_clip": 1.02894473, + "balance_loss_mlp": 1.0413748, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.6999619338826393, + "language_loss": 0.7507081, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77236706, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5718, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02563679, + "balance_loss_mlp": 1.04245746, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9991347741656986, + "language_loss": 0.63971305, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66137218, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5719, + "time_per_iteration": 2.5236892700195312 + }, + { + "auxiliary_loss_clip": 0.01124826, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.0418756, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.111950804429908, + "language_loss": 0.73612356, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75775748, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 5720, + "time_per_iteration": 5.3536376953125 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02520978, + "balance_loss_mlp": 1.04300022, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.805745396214866, + "language_loss": 0.74198145, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76362252, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5721, + "time_per_iteration": 2.4301607608795166 + }, + { + "auxiliary_loss_clip": 0.01126876, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.0286088, + "balance_loss_mlp": 1.04481733, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 3.5063882769532313, + "language_loss": 0.80132651, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82303661, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5722, + "time_per_iteration": 2.411731243133545 + }, + { + "auxiliary_loss_clip": 0.01122709, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01809728, + "balance_loss_mlp": 1.04312289, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5863267197766868, + "language_loss": 0.8194539, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84100199, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5723, + "time_per_iteration": 2.476672410964966 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.02559686, + "balance_loss_mlp": 1.0428493, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.852885568649272, + "language_loss": 0.8147676, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83640903, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5724, + "time_per_iteration": 2.4115889072418213 + }, + { + "auxiliary_loss_clip": 0.01125316, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.0210526, + "balance_loss_mlp": 1.04397368, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.759268883551978, + "language_loss": 0.6919744, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71358848, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5725, + "time_per_iteration": 2.589571714401245 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.02578139, + "balance_loss_mlp": 1.04464412, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4578739764018875, + "language_loss": 0.69519544, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71692783, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5726, + "time_per_iteration": 2.4600956439971924 + }, + { + "auxiliary_loss_clip": 0.01123936, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.02222002, + "balance_loss_mlp": 1.0427928, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.656148044371735, + "language_loss": 0.73426235, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.7558654, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5727, + "time_per_iteration": 2.5102531909942627 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02636731, + "balance_loss_mlp": 1.04398954, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.8280399137078096, + "language_loss": 0.87897557, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90064341, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5728, + "time_per_iteration": 2.4304542541503906 + }, + { + "auxiliary_loss_clip": 0.01122947, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.03106284, + "balance_loss_mlp": 1.04264569, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.0505664478102426, + "language_loss": 0.70451075, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72619152, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5729, + "time_per_iteration": 2.4979374408721924 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01952362, + "balance_loss_mlp": 1.0427525, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.7284434335955414, + "language_loss": 0.73995942, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7615242, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5730, + "time_per_iteration": 2.4471776485443115 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.02693152, + "balance_loss_mlp": 1.04263377, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.104777326243209, + "language_loss": 0.80005515, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82170659, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5731, + "time_per_iteration": 2.454735279083252 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.02953923, + "balance_loss_mlp": 1.04394484, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.039149215632527, + "language_loss": 0.78837991, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81006193, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 5732, + "time_per_iteration": 2.4177064895629883 + }, + { + "auxiliary_loss_clip": 0.01043649, + "auxiliary_loss_mlp": 0.01003776, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.01788378, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7440231134556253, + "language_loss": 0.53498071, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55545497, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.2578125, + "step": 5733, + "time_per_iteration": 3.0976667404174805 + }, + { + "auxiliary_loss_clip": 0.0112691, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.02806389, + "balance_loss_mlp": 1.04630947, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.6025085195413686, + "language_loss": 0.83345532, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85515279, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5734, + "time_per_iteration": 2.462327718734741 + }, + { + "auxiliary_loss_clip": 0.01125766, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.04382658, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.7233898634254525, + "language_loss": 0.9245038, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94610149, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5735, + "time_per_iteration": 2.600933790206909 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.02607846, + "balance_loss_mlp": 1.04662871, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.628548106881684, + "language_loss": 0.76666284, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78837371, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5736, + "time_per_iteration": 2.4607973098754883 + }, + { + "auxiliary_loss_clip": 0.0113014, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.02948046, + "balance_loss_mlp": 1.04773998, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.59823002014571, + "language_loss": 0.78745639, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80919576, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5737, + "time_per_iteration": 2.5059142112731934 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.04445243, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.0456946138928767, + "language_loss": 0.71714234, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73884267, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5738, + "time_per_iteration": 2.4374310970306396 + }, + { + "auxiliary_loss_clip": 0.01129235, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.02727044, + "balance_loss_mlp": 1.04496205, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.8999072115309161, + "language_loss": 0.81518626, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83690214, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5739, + "time_per_iteration": 2.559990406036377 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.04620492, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.6856273454827275, + "language_loss": 0.8322401, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85397214, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5740, + "time_per_iteration": 2.4684722423553467 + }, + { + "auxiliary_loss_clip": 0.01127563, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04611385, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.03424253553345, + "language_loss": 0.77135098, + "learning_rate": 3.045403886269181e-06, + "loss": 0.7930122, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8125, + "step": 5741, + "time_per_iteration": 2.48624587059021 + }, + { + "auxiliary_loss_clip": 0.01125981, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.02226019, + "balance_loss_mlp": 1.04276562, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.4993687582247586, + "language_loss": 0.77224493, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79387349, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 5742, + "time_per_iteration": 2.5046300888061523 + }, + { + "auxiliary_loss_clip": 0.01123657, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04310095, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.823337430242114, + "language_loss": 0.76346177, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78509557, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5743, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.01124183, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.02278566, + "balance_loss_mlp": 1.04435802, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.5691807126711539, + "language_loss": 0.70255435, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72416371, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5744, + "time_per_iteration": 2.497314929962158 + }, + { + "auxiliary_loss_clip": 0.01121947, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.02205133, + "balance_loss_mlp": 1.04318309, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.629619176768893, + "language_loss": 0.79692256, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81850678, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 5745, + "time_per_iteration": 2.5154099464416504 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.04556072, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.7858540966841563, + "language_loss": 0.88775939, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9094578, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5746, + "time_per_iteration": 2.436028003692627 + }, + { + "auxiliary_loss_clip": 0.01129654, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.02438855, + "balance_loss_mlp": 1.04509354, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8755596522528313, + "language_loss": 0.64010286, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66179693, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 5747, + "time_per_iteration": 2.465817451477051 + }, + { + "auxiliary_loss_clip": 0.0112633, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02193761, + "balance_loss_mlp": 1.04486203, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5413680181151455, + "language_loss": 0.72813559, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74975884, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5748, + "time_per_iteration": 2.566849946975708 + }, + { + "auxiliary_loss_clip": 0.01123147, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.020715, + "balance_loss_mlp": 1.04517043, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6451707518978071, + "language_loss": 0.75697249, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77854693, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.77734375, + "step": 5749, + "time_per_iteration": 2.5068271160125732 + }, + { + "auxiliary_loss_clip": 0.01036655, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 1.00293088, + "balance_loss_mlp": 1.01066136, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8931526891439046, + "language_loss": 0.62754983, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64796478, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.25976562, + "step": 5750, + "time_per_iteration": 2.930236577987671 + }, + { + "auxiliary_loss_clip": 0.01119501, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.01933062, + "balance_loss_mlp": 1.04268134, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.1199041216122314, + "language_loss": 0.80762947, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82915652, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5751, + "time_per_iteration": 2.4710936546325684 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.03268027, + "balance_loss_mlp": 1.04408574, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 3.882107217624466, + "language_loss": 0.83630323, + "learning_rate": 3.041749247409439e-06, + "loss": 0.85798407, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 5752, + "time_per_iteration": 2.421095132827759 + }, + { + "auxiliary_loss_clip": 0.01036836, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_clip": 1.00014234, + "balance_loss_mlp": 1.01131189, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7425573992046552, + "language_loss": 0.63106978, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.6514585, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.25585938, + "step": 5753, + "time_per_iteration": 2.960430383682251 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.0252701, + "balance_loss_mlp": 1.0433172, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7337780765213762, + "language_loss": 0.70964289, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73127007, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5754, + "time_per_iteration": 2.473090171813965 + }, + { + "auxiliary_loss_clip": 0.01126645, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01889825, + "balance_loss_mlp": 1.04436386, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 3.1958037374869357, + "language_loss": 0.72880316, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75040269, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5755, + "time_per_iteration": 2.486187219619751 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.01985335, + "balance_loss_mlp": 1.04448533, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6620890991055186, + "language_loss": 0.72366977, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74523616, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5756, + "time_per_iteration": 2.6883044242858887 + }, + { + "auxiliary_loss_clip": 0.01036738, + "auxiliary_loss_mlp": 0.01004698, + "balance_loss_clip": 1.00295758, + "balance_loss_mlp": 1.01152658, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7127234008063932, + "language_loss": 0.62522227, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64563662, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25195312, + "step": 5757, + "time_per_iteration": 3.0644619464874268 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01006374, + "balance_loss_clip": 1.00465703, + "balance_loss_mlp": 1.01123941, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8513650993905141, + "language_loss": 0.59153563, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61196613, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.25390625, + "step": 5758, + "time_per_iteration": 3.0601916313171387 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02576697, + "balance_loss_mlp": 1.04562724, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8469236817688628, + "language_loss": 0.71498728, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73664641, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5759, + "time_per_iteration": 2.4722588062286377 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.03079295, + "balance_loss_mlp": 1.04248834, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.8727439754442439, + "language_loss": 0.83008277, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85175675, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 5760, + "time_per_iteration": 2.5002012252807617 + }, + { + "auxiliary_loss_clip": 0.01035648, + "auxiliary_loss_mlp": 0.01005512, + "balance_loss_clip": 1.00358045, + "balance_loss_mlp": 1.01033783, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8745886359800412, + "language_loss": 0.5653646, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58577621, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.25390625, + "step": 5761, + "time_per_iteration": 3.0950896739959717 + }, + { + "auxiliary_loss_clip": 0.01120096, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.04127657, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.0018538772922883, + "language_loss": 0.95053494, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97212291, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 5762, + "time_per_iteration": 5.290884256362915 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.01935804, + "balance_loss_mlp": 1.0417943, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 2.194288284173203, + "language_loss": 0.69335818, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71493888, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5763, + "time_per_iteration": 2.5411787033081055 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.02862906, + "balance_loss_mlp": 1.0458554, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.0099592928074497, + "language_loss": 0.83589876, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85765183, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5764, + "time_per_iteration": 2.48040771484375 + }, + { + "auxiliary_loss_clip": 0.01123556, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04343057, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.159805793212971, + "language_loss": 0.67403859, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69560707, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5765, + "time_per_iteration": 2.502297878265381 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.02291703, + "balance_loss_mlp": 1.04937232, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 2.083918060213648, + "language_loss": 0.77861524, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80028939, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5766, + "time_per_iteration": 2.465325355529785 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.0215292, + "balance_loss_mlp": 1.04335451, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5377908130541305, + "language_loss": 0.73529994, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75687665, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5767, + "time_per_iteration": 2.4656143188476562 + }, + { + "auxiliary_loss_clip": 0.01127128, + "auxiliary_loss_mlp": 0.01040956, + "balance_loss_clip": 1.02538764, + "balance_loss_mlp": 1.04720497, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.233359981487989, + "language_loss": 0.77795279, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79963356, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.796875, + "step": 5768, + "time_per_iteration": 2.4951131343841553 + }, + { + "auxiliary_loss_clip": 0.0103542, + "auxiliary_loss_mlp": 0.01008769, + "balance_loss_clip": 1.00693345, + "balance_loss_mlp": 1.01015306, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7739728920865777, + "language_loss": 0.57404095, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59448284, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.25195312, + "step": 5769, + "time_per_iteration": 3.0867085456848145 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_clip": 1.02577174, + "balance_loss_mlp": 1.04723847, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.3808887206764244, + "language_loss": 0.85625517, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87804437, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 5770, + "time_per_iteration": 2.4296391010284424 + }, + { + "auxiliary_loss_clip": 0.0103532, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00152194, + "balance_loss_mlp": 1.01001954, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7779481231658855, + "language_loss": 0.59827816, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61866474, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 5771, + "time_per_iteration": 2.858952522277832 + }, + { + "auxiliary_loss_clip": 0.0112466, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.04478061, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 2.6949016474557475, + "language_loss": 0.71790159, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73961502, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5772, + "time_per_iteration": 2.629441976547241 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04398608, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4939658014033708, + "language_loss": 0.76165307, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78332114, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5773, + "time_per_iteration": 2.5281848907470703 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_clip": 1.02811444, + "balance_loss_mlp": 1.04447389, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.0748415381607717, + "language_loss": 0.70428938, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72599673, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5774, + "time_per_iteration": 2.4930198192596436 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.02630043, + "balance_loss_mlp": 1.04615033, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.6801460468757594, + "language_loss": 0.76410925, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78576738, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5775, + "time_per_iteration": 2.501793622970581 + }, + { + "auxiliary_loss_clip": 0.01129926, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.02778447, + "balance_loss_mlp": 1.04408336, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.2786937073337956, + "language_loss": 0.78098702, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.8027252, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5776, + "time_per_iteration": 2.547508716583252 + }, + { + "auxiliary_loss_clip": 0.01034004, + "auxiliary_loss_mlp": 0.01012403, + "balance_loss_clip": 1.01073408, + "balance_loss_mlp": 1.00864577, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8366551978688649, + "language_loss": 0.63353252, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65399659, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.25390625, + "step": 5777, + "time_per_iteration": 3.118314743041992 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.03253984, + "balance_loss_mlp": 1.04198289, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.1982821508403956, + "language_loss": 0.64399695, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66572136, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5778, + "time_per_iteration": 2.5438621044158936 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.03947175, + "balance_loss_mlp": 1.04425573, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.7264375706792277, + "language_loss": 0.71190178, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73372632, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5779, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.01128331, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04354596, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.874853063849031, + "language_loss": 0.62552947, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64729369, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5780, + "time_per_iteration": 2.5024712085723877 + }, + { + "auxiliary_loss_clip": 0.01124027, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_clip": 1.0278883, + "balance_loss_mlp": 1.04260445, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.604616792806945, + "language_loss": 0.72373253, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74539268, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5781, + "time_per_iteration": 2.471235513687134 + }, + { + "auxiliary_loss_clip": 0.01125801, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.03416181, + "balance_loss_mlp": 1.04316914, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.0942988164582266, + "language_loss": 0.76741016, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78917271, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.828125, + "step": 5782, + "time_per_iteration": 2.4831414222717285 + }, + { + "auxiliary_loss_clip": 0.01123989, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.02140737, + "balance_loss_mlp": 1.04221606, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9917493867858045, + "language_loss": 0.62131268, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64291537, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5783, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.01119293, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.01833832, + "balance_loss_mlp": 1.0410347, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.6546414102961637, + "language_loss": 0.88575971, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90727258, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 5784, + "time_per_iteration": 2.5281262397766113 + }, + { + "auxiliary_loss_clip": 0.01121731, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.02219379, + "balance_loss_mlp": 1.04283547, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.7834042756277195, + "language_loss": 0.81664282, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83822948, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 5785, + "time_per_iteration": 2.444279432296753 + }, + { + "auxiliary_loss_clip": 0.01126224, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.04558039, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.6236713309130966, + "language_loss": 0.80679643, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82843316, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5786, + "time_per_iteration": 2.506639242172241 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.01860058, + "balance_loss_mlp": 1.0443275, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5789553434659291, + "language_loss": 0.74868137, + "learning_rate": 3.030089132216836e-06, + "loss": 0.77025199, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5787, + "time_per_iteration": 2.4305543899536133 + }, + { + "auxiliary_loss_clip": 0.01122978, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.02276862, + "balance_loss_mlp": 1.04133916, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.685205733624188, + "language_loss": 0.81207466, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83367729, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.81640625, + "step": 5788, + "time_per_iteration": 2.58461332321167 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.02333927, + "balance_loss_mlp": 1.04716599, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.7599288417752579, + "language_loss": 0.85399663, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87569183, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5789, + "time_per_iteration": 2.4460527896881104 + }, + { + "auxiliary_loss_clip": 0.01127788, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.03517616, + "balance_loss_mlp": 1.04420161, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9217222904205502, + "language_loss": 0.84973574, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87151778, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5790, + "time_per_iteration": 2.4690423011779785 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.0295074, + "balance_loss_mlp": 1.04403305, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.0218239222922785, + "language_loss": 0.82098949, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8426879, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5791, + "time_per_iteration": 2.4949092864990234 + }, + { + "auxiliary_loss_clip": 0.01124824, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02386248, + "balance_loss_mlp": 1.04235744, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.7691925727921667, + "language_loss": 0.77531552, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79695195, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5792, + "time_per_iteration": 2.5464468002319336 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.04100966, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5041206153246893, + "language_loss": 0.81592953, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83745086, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5793, + "time_per_iteration": 2.454220771789551 + }, + { + "auxiliary_loss_clip": 0.01126572, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.04426205, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7524057524538565, + "language_loss": 0.76222527, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78395712, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5794, + "time_per_iteration": 2.485077142715454 + }, + { + "auxiliary_loss_clip": 0.01121136, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02165866, + "balance_loss_mlp": 1.04168189, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.2347385462744165, + "language_loss": 0.56926, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59083712, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5795, + "time_per_iteration": 2.4378490447998047 + }, + { + "auxiliary_loss_clip": 0.01121205, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02250659, + "balance_loss_mlp": 1.04285967, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.137832792929428, + "language_loss": 0.82437253, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84595084, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 5796, + "time_per_iteration": 2.5187671184539795 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.0153811, + "balance_loss_mlp": 1.043782, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.7817355656860259, + "language_loss": 0.83580989, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85730731, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5797, + "time_per_iteration": 2.518832206726074 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02017224, + "balance_loss_mlp": 1.04206371, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7199370679887815, + "language_loss": 0.73215538, + "learning_rate": 3.026414616539167e-06, + "loss": 0.7537021, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5798, + "time_per_iteration": 2.499967575073242 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.02660251, + "balance_loss_mlp": 1.04203498, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.0872044860332597, + "language_loss": 0.75936413, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78101552, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5799, + "time_per_iteration": 2.4452474117279053 + }, + { + "auxiliary_loss_clip": 0.01121272, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.0248909, + "balance_loss_mlp": 1.04197407, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7461935027983841, + "language_loss": 0.75557071, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7771703, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.79296875, + "step": 5800, + "time_per_iteration": 2.4526796340942383 + }, + { + "auxiliary_loss_clip": 0.01129939, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.02854276, + "balance_loss_mlp": 1.04578733, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.3150001070935127, + "language_loss": 0.67645729, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69820327, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5801, + "time_per_iteration": 2.644601821899414 + }, + { + "auxiliary_loss_clip": 0.01122812, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04446411, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.9000140831486088, + "language_loss": 0.76785576, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78948951, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78515625, + "step": 5802, + "time_per_iteration": 2.46921968460083 + }, + { + "auxiliary_loss_clip": 0.01118956, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.01905692, + "balance_loss_mlp": 1.04294538, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.750768588632487, + "language_loss": 0.78868455, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81021172, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 5803, + "time_per_iteration": 3.979863405227661 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02266085, + "balance_loss_mlp": 1.0410372, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.9657380954946277, + "language_loss": 0.67745399, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69905275, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8203125, + "step": 5804, + "time_per_iteration": 3.8562989234924316 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_clip": 1.03001559, + "balance_loss_mlp": 1.0454638, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 2.669385195944029, + "language_loss": 0.76021814, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78187871, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 5805, + "time_per_iteration": 2.458235263824463 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.02199244, + "balance_loss_mlp": 1.0451802, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 3.0752866237359884, + "language_loss": 0.67804134, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69965458, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5806, + "time_per_iteration": 2.4840877056121826 + }, + { + "auxiliary_loss_clip": 0.01126527, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.02523851, + "balance_loss_mlp": 1.04571056, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.4876164360326454, + "language_loss": 0.71957624, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74124348, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5807, + "time_per_iteration": 2.542815685272217 + }, + { + "auxiliary_loss_clip": 0.01123687, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02295971, + "balance_loss_mlp": 1.04158592, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.7054576034597768, + "language_loss": 0.74218416, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.7638061, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5808, + "time_per_iteration": 2.503438949584961 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 1.04479396, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.5095416937429198, + "language_loss": 0.84245461, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86416149, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5809, + "time_per_iteration": 2.4860358238220215 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.02354026, + "balance_loss_mlp": 1.04322374, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.8434153763939258, + "language_loss": 0.80251479, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82407832, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 5810, + "time_per_iteration": 2.481653928756714 + }, + { + "auxiliary_loss_clip": 0.01124044, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.04406404, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.967526444092296, + "language_loss": 0.75335366, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77499199, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5811, + "time_per_iteration": 2.534524440765381 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02543986, + "balance_loss_mlp": 1.04616523, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4977831051483896, + "language_loss": 0.80070162, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82238293, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5812, + "time_per_iteration": 2.503074884414673 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02746272, + "balance_loss_mlp": 1.04195547, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.9471141693502576, + "language_loss": 0.6923517, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71401167, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5813, + "time_per_iteration": 2.4503591060638428 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.4036318537481334, + "language_loss": 0.77007949, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79169858, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 5814, + "time_per_iteration": 2.4173405170440674 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.02692485, + "balance_loss_mlp": 1.04406822, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5090517849605465, + "language_loss": 0.84283173, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86451852, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5815, + "time_per_iteration": 2.5173141956329346 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.04368711, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.123091285603595, + "language_loss": 0.77423191, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79580915, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80859375, + "step": 5816, + "time_per_iteration": 2.413438558578491 + }, + { + "auxiliary_loss_clip": 0.01128865, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.02761126, + "balance_loss_mlp": 1.0468061, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.144763996717865, + "language_loss": 0.58441401, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60612863, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8203125, + "step": 5817, + "time_per_iteration": 2.5161447525024414 + }, + { + "auxiliary_loss_clip": 0.01042618, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_clip": 1.00957632, + "balance_loss_mlp": 1.01738954, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8658844915790124, + "language_loss": 0.59855008, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61908889, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25195312, + "step": 5818, + "time_per_iteration": 3.105595111846924 + }, + { + "auxiliary_loss_clip": 0.01123632, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02226782, + "balance_loss_mlp": 1.04561055, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 3.0068929936640103, + "language_loss": 0.83458424, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85618806, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5819, + "time_per_iteration": 2.47537899017334 + }, + { + "auxiliary_loss_clip": 0.01123279, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.01655149, + "balance_loss_mlp": 1.04359841, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 3.6330435008795483, + "language_loss": 0.70765841, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.7291975, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5820, + "time_per_iteration": 2.4817428588867188 + }, + { + "auxiliary_loss_clip": 0.01125706, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04544306, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 2.1579309336976547, + "language_loss": 0.70112801, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7227428, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.80078125, + "step": 5821, + "time_per_iteration": 2.578753709793091 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.02328706, + "balance_loss_mlp": 1.04798198, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.9634934958204076, + "language_loss": 0.73591399, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75762403, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 5822, + "time_per_iteration": 2.469041109085083 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.02150989, + "balance_loss_mlp": 1.0447278, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5203539526389718, + "language_loss": 0.78104019, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80268037, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5823, + "time_per_iteration": 2.4932196140289307 + }, + { + "auxiliary_loss_clip": 0.01038228, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 0.99964237, + "balance_loss_mlp": 1.01332808, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 1.4438996436497689, + "language_loss": 0.59237444, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61277008, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.24902344, + "step": 5824, + "time_per_iteration": 3.109966278076172 + }, + { + "auxiliary_loss_clip": 0.01125511, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.0213685, + "balance_loss_mlp": 1.04462993, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.8425293735622459, + "language_loss": 0.84740114, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86902225, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5825, + "time_per_iteration": 2.4780030250549316 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.02292657, + "balance_loss_mlp": 1.04522121, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.24584207136959, + "language_loss": 0.82778502, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84941804, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5826, + "time_per_iteration": 2.4147045612335205 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.02587962, + "balance_loss_mlp": 1.04480314, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5075773428374344, + "language_loss": 0.80714649, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.8288269, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5827, + "time_per_iteration": 2.4650330543518066 + }, + { + "auxiliary_loss_clip": 0.01123347, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.0220902, + "balance_loss_mlp": 1.04475152, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.7582821019631836, + "language_loss": 0.70936024, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73095214, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 5828, + "time_per_iteration": 2.4710564613342285 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.02666616, + "balance_loss_mlp": 1.04788435, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.628373483521701, + "language_loss": 0.79397106, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81571716, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.828125, + "step": 5829, + "time_per_iteration": 2.5081264972686768 + }, + { + "auxiliary_loss_clip": 0.01129997, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.02900994, + "balance_loss_mlp": 1.04607642, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.7135270810407168, + "language_loss": 0.72111332, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74286962, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 5830, + "time_per_iteration": 2.507263422012329 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.04352021, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.0188022258715996, + "language_loss": 0.88740343, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90896189, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5831, + "time_per_iteration": 2.4769816398620605 + }, + { + "auxiliary_loss_clip": 0.01122435, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.02560508, + "balance_loss_mlp": 1.04128802, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9377344606434141, + "language_loss": 0.78478962, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80642164, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8125, + "step": 5832, + "time_per_iteration": 2.458019971847534 + }, + { + "auxiliary_loss_clip": 0.01125835, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.02744734, + "balance_loss_mlp": 1.04360127, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.8976688118149017, + "language_loss": 0.70859557, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73029065, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 5833, + "time_per_iteration": 2.494739055633545 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.04384482, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3472514068868482, + "language_loss": 0.80878949, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83030844, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5834, + "time_per_iteration": 2.521343231201172 + }, + { + "auxiliary_loss_clip": 0.01124914, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.02823853, + "balance_loss_mlp": 1.04525888, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.219662071096021, + "language_loss": 0.83629, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.8579731, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 5835, + "time_per_iteration": 2.53587007522583 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.02932119, + "balance_loss_mlp": 1.04351568, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.120648036265282, + "language_loss": 0.76607329, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78774178, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 5836, + "time_per_iteration": 2.54390549659729 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02382731, + "balance_loss_mlp": 1.04872775, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.2292749531356986, + "language_loss": 0.77354801, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79521459, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5837, + "time_per_iteration": 2.4478273391723633 + }, + { + "auxiliary_loss_clip": 0.01123898, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02998376, + "balance_loss_mlp": 1.04441822, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.6098451794116821, + "language_loss": 0.68129408, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70298064, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5838, + "time_per_iteration": 2.505833864212036 + }, + { + "auxiliary_loss_clip": 0.01122037, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01945019, + "balance_loss_mlp": 1.04240978, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0937603738721173, + "language_loss": 0.83561182, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85717571, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5839, + "time_per_iteration": 2.4378576278686523 + }, + { + "auxiliary_loss_clip": 0.01126069, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.02775335, + "balance_loss_mlp": 1.04351032, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.6277808139419232, + "language_loss": 0.58590645, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60759622, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.828125, + "step": 5840, + "time_per_iteration": 2.4883387088775635 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02015984, + "balance_loss_mlp": 1.04445219, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.7790843018814058, + "language_loss": 0.87061596, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89222413, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5841, + "time_per_iteration": 2.5035836696624756 + }, + { + "auxiliary_loss_clip": 0.01128185, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02551615, + "balance_loss_mlp": 1.0455035, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.6842451001577108, + "language_loss": 0.74924648, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77094764, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.828125, + "step": 5842, + "time_per_iteration": 2.4677891731262207 + }, + { + "auxiliary_loss_clip": 0.01125535, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02552199, + "balance_loss_mlp": 1.04403496, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 3.45436030057014, + "language_loss": 0.68184745, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70351034, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5843, + "time_per_iteration": 2.4356935024261475 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_clip": 1.02734041, + "balance_loss_mlp": 1.04418659, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 3.71115813366519, + "language_loss": 0.65957326, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68123138, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5844, + "time_per_iteration": 2.4961743354797363 + }, + { + "auxiliary_loss_clip": 0.01124887, + "auxiliary_loss_mlp": 0.01040447, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.04466677, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.036064641334285, + "language_loss": 0.75629944, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77795279, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5845, + "time_per_iteration": 5.325402498245239 + }, + { + "auxiliary_loss_clip": 0.01123091, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.04537153, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.494167784966283, + "language_loss": 0.73075795, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75238299, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 5846, + "time_per_iteration": 2.4515323638916016 + }, + { + "auxiliary_loss_clip": 0.01127959, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.02587426, + "balance_loss_mlp": 1.04755926, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6229430725765215, + "language_loss": 0.75876832, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78045619, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5847, + "time_per_iteration": 2.4869656562805176 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02067161, + "balance_loss_mlp": 1.04212832, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.14189752244475, + "language_loss": 0.72070903, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74227905, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5848, + "time_per_iteration": 2.5580503940582275 + }, + { + "auxiliary_loss_clip": 0.01127957, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.02953017, + "balance_loss_mlp": 1.04648554, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.252970750126207, + "language_loss": 0.89321303, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91493851, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5849, + "time_per_iteration": 2.4167070388793945 + }, + { + "auxiliary_loss_clip": 0.01123705, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.04373825, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.8040734708025026, + "language_loss": 0.74810916, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76967371, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5850, + "time_per_iteration": 2.457970142364502 + }, + { + "auxiliary_loss_clip": 0.0112382, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.04618788, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.5003899492593988, + "language_loss": 0.7563765, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77794087, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 5851, + "time_per_iteration": 2.48270845413208 + }, + { + "auxiliary_loss_clip": 0.01126446, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.0219928, + "balance_loss_mlp": 1.04683256, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.074837490144385, + "language_loss": 0.87552518, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89715755, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5852, + "time_per_iteration": 2.4690029621124268 + }, + { + "auxiliary_loss_clip": 0.01122074, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04361391, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.0973347969099048, + "language_loss": 0.67880064, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70038116, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5853, + "time_per_iteration": 2.4953458309173584 + }, + { + "auxiliary_loss_clip": 0.01125495, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.0195092, + "balance_loss_mlp": 1.04545975, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.6680659623481517, + "language_loss": 0.8122859, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83388329, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5854, + "time_per_iteration": 2.4702916145324707 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.04566765, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.6003148952985655, + "language_loss": 0.73131359, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75284624, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 5855, + "time_per_iteration": 2.4895823001861572 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.02549887, + "balance_loss_mlp": 1.04334307, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 3.701560840262617, + "language_loss": 0.70894778, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73054588, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5856, + "time_per_iteration": 2.5133585929870605 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.02881038, + "balance_loss_mlp": 1.0456897, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.8086114170356375, + "language_loss": 0.60915685, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63086259, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80859375, + "step": 5857, + "time_per_iteration": 2.723238468170166 + }, + { + "auxiliary_loss_clip": 0.01123346, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.02434421, + "balance_loss_mlp": 1.04425693, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.754440516271971, + "language_loss": 0.73341751, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75504428, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5858, + "time_per_iteration": 2.509556293487549 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.02720821, + "balance_loss_mlp": 1.04428148, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.4710047028379252, + "language_loss": 0.76090813, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7825768, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5859, + "time_per_iteration": 2.584312677383423 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.0230875, + "balance_loss_mlp": 1.04828274, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.6944630123418771, + "language_loss": 0.71475387, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73646474, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5860, + "time_per_iteration": 2.5120623111724854 + }, + { + "auxiliary_loss_clip": 0.01125655, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.02506578, + "balance_loss_mlp": 1.04208136, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.10777684168558, + "language_loss": 0.6624974, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68416381, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5861, + "time_per_iteration": 2.4927096366882324 + }, + { + "auxiliary_loss_clip": 0.01123555, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.04497313, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.1064993181157843, + "language_loss": 0.66780227, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68938088, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5862, + "time_per_iteration": 2.4275379180908203 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.02132034, + "balance_loss_mlp": 1.04420304, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.0193315360348842, + "language_loss": 0.77049166, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79211187, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5863, + "time_per_iteration": 2.504391670227051 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02027822, + "balance_loss_mlp": 1.04449666, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.7341123556359297, + "language_loss": 0.75018549, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77178371, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5864, + "time_per_iteration": 2.4962751865386963 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.03110838, + "balance_loss_mlp": 1.04376507, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 1.9972182581193567, + "language_loss": 0.79051632, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81220651, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5865, + "time_per_iteration": 2.5369789600372314 + }, + { + "auxiliary_loss_clip": 0.01130515, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.02901387, + "balance_loss_mlp": 1.04835618, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 1.8375125007543296, + "language_loss": 0.81622374, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.8379811, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 5866, + "time_per_iteration": 2.497587203979492 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.02081871, + "balance_loss_mlp": 1.04493296, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.1796505180833696, + "language_loss": 0.84552217, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.867208, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.86328125, + "step": 5867, + "time_per_iteration": 2.5673649311065674 + }, + { + "auxiliary_loss_clip": 0.01126594, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02764452, + "balance_loss_mlp": 1.04441357, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.2018810166756873, + "language_loss": 0.74618357, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76788092, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5868, + "time_per_iteration": 2.4571762084960938 + }, + { + "auxiliary_loss_clip": 0.01127392, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 1.04489541, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.0366485396940615, + "language_loss": 0.61648643, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63815421, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5869, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.04286385, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.290977208251557, + "language_loss": 0.74328029, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76495212, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5870, + "time_per_iteration": 2.4636306762695312 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04412317, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.44010977521146, + "language_loss": 0.71498513, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73659372, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5871, + "time_per_iteration": 2.629002094268799 + }, + { + "auxiliary_loss_clip": 0.01120822, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01999545, + "balance_loss_mlp": 1.04340768, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.215441176085892, + "language_loss": 0.74219513, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76374042, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5872, + "time_per_iteration": 2.4672691822052 + }, + { + "auxiliary_loss_clip": 0.01121667, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02291727, + "balance_loss_mlp": 1.04295182, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.6120105579455812, + "language_loss": 0.82492435, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84651101, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5873, + "time_per_iteration": 2.549706220626831 + }, + { + "auxiliary_loss_clip": 0.01128213, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.02800715, + "balance_loss_mlp": 1.04399252, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.8495868157058504, + "language_loss": 0.6583339, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68006265, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 5874, + "time_per_iteration": 2.4949634075164795 + }, + { + "auxiliary_loss_clip": 0.01044147, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00510025, + "balance_loss_mlp": 1.01915693, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.771003921858337, + "language_loss": 0.61583531, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63634658, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.25, + "step": 5875, + "time_per_iteration": 2.9931485652923584 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.02995443, + "balance_loss_mlp": 1.04544568, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.6836782364007539, + "language_loss": 0.800933, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82261944, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5876, + "time_per_iteration": 2.443178415298462 + }, + { + "auxiliary_loss_clip": 0.01041911, + "auxiliary_loss_mlp": 0.01006634, + "balance_loss_clip": 1.00477409, + "balance_loss_mlp": 1.01663578, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 1.6287450036197537, + "language_loss": 0.5674026, + "learning_rate": 2.999887569990088e-06, + "loss": 0.587888, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.25195312, + "step": 5877, + "time_per_iteration": 3.1782116889953613 + }, + { + "auxiliary_loss_clip": 0.01124291, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.04401922, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.5579095187110108, + "language_loss": 0.71649593, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73804337, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5878, + "time_per_iteration": 2.4984474182128906 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04198527, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9384917614544617, + "language_loss": 0.78492844, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80655217, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5879, + "time_per_iteration": 2.5369913578033447 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.02870536, + "balance_loss_mlp": 1.04373121, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0656781659104917, + "language_loss": 0.63695049, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65867293, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83203125, + "step": 5880, + "time_per_iteration": 2.457787036895752 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02078009, + "balance_loss_mlp": 1.04375386, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 3.125568384757795, + "language_loss": 0.65818816, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67980647, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5881, + "time_per_iteration": 2.5198867321014404 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04197288, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3882047203281038, + "language_loss": 0.75280428, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77431458, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5882, + "time_per_iteration": 2.4526872634887695 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.02266037, + "balance_loss_mlp": 1.04543018, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.123888211837838, + "language_loss": 0.70349854, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72514224, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5883, + "time_per_iteration": 2.538865566253662 + }, + { + "auxiliary_loss_clip": 0.01129519, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0228982, + "balance_loss_mlp": 1.04584253, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.009195754637657, + "language_loss": 0.78500903, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80668598, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5884, + "time_per_iteration": 2.4410510063171387 + }, + { + "auxiliary_loss_clip": 0.0112138, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.01901007, + "balance_loss_mlp": 1.04336667, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.8922441591552446, + "language_loss": 0.75478536, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77632499, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5885, + "time_per_iteration": 2.555816650390625 + }, + { + "auxiliary_loss_clip": 0.01127447, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.04478371, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.2081606315958635, + "language_loss": 0.82679224, + "learning_rate": 2.996850368809606e-06, + "loss": 0.84848893, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.82421875, + "step": 5886, + "time_per_iteration": 2.482151985168457 + }, + { + "auxiliary_loss_clip": 0.01124743, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01717782, + "balance_loss_mlp": 1.04533887, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.4580910750403775, + "language_loss": 0.78723359, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80880398, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 5887, + "time_per_iteration": 5.388309001922607 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.0269978, + "balance_loss_mlp": 1.04226518, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 3.1093010737907867, + "language_loss": 0.65404654, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67568314, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5888, + "time_per_iteration": 2.4438626766204834 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02248812, + "balance_loss_mlp": 1.04373193, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.6702882106954304, + "language_loss": 0.76662588, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.78821993, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5889, + "time_per_iteration": 2.503023624420166 + }, + { + "auxiliary_loss_clip": 0.01125083, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.02329397, + "balance_loss_mlp": 1.0469135, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.7418080185903937, + "language_loss": 0.80142188, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82305038, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5890, + "time_per_iteration": 2.4669902324676514 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02204418, + "balance_loss_mlp": 1.04123974, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.4765808553545194, + "language_loss": 0.79590207, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81743479, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7734375, + "step": 5891, + "time_per_iteration": 2.491048812866211 + }, + { + "auxiliary_loss_clip": 0.01123501, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.02260685, + "balance_loss_mlp": 1.04425383, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.0747162768055616, + "language_loss": 0.73339593, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.7550028, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5892, + "time_per_iteration": 2.497422695159912 + }, + { + "auxiliary_loss_clip": 0.01124613, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.04473233, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 1.9338165898472526, + "language_loss": 0.66916019, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69079423, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5893, + "time_per_iteration": 2.4516420364379883 + }, + { + "auxiliary_loss_clip": 0.01123079, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04405212, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.878049090913109, + "language_loss": 0.69472313, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71633029, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5894, + "time_per_iteration": 2.479174852371216 + }, + { + "auxiliary_loss_clip": 0.01123499, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.04524636, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6954645527360779, + "language_loss": 0.74891931, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77048504, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 5895, + "time_per_iteration": 2.4786908626556396 + }, + { + "auxiliary_loss_clip": 0.01122907, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.04388869, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.0548310630504854, + "language_loss": 0.83688253, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85848963, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5896, + "time_per_iteration": 2.4765214920043945 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.03348279, + "balance_loss_mlp": 1.0444181, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.6634726813042469, + "language_loss": 0.70031154, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7220217, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 5897, + "time_per_iteration": 2.5142548084259033 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.02430916, + "balance_loss_mlp": 1.04337025, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.7331024671064506, + "language_loss": 0.82091749, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84251857, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5898, + "time_per_iteration": 2.4900712966918945 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0234853, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4876974136883365, + "language_loss": 0.73901182, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76058269, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 5899, + "time_per_iteration": 2.498659133911133 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_clip": 1.02825308, + "balance_loss_mlp": 1.04316258, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.69682390123668, + "language_loss": 0.79345262, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81510079, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 5900, + "time_per_iteration": 2.548612594604492 + }, + { + "auxiliary_loss_clip": 0.01123598, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.02556252, + "balance_loss_mlp": 1.04530048, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.7758743329418227, + "language_loss": 0.81637204, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83801091, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 5901, + "time_per_iteration": 2.6031999588012695 + }, + { + "auxiliary_loss_clip": 0.01127681, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02063251, + "balance_loss_mlp": 1.04535294, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.92677562296577, + "language_loss": 0.75667071, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77829683, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5902, + "time_per_iteration": 2.528026819229126 + }, + { + "auxiliary_loss_clip": 0.0112195, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02359962, + "balance_loss_mlp": 1.04320014, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.7304108811682997, + "language_loss": 0.70582771, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72741467, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7890625, + "step": 5903, + "time_per_iteration": 2.423454999923706 + }, + { + "auxiliary_loss_clip": 0.01124223, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.0205555, + "balance_loss_mlp": 1.04234982, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.52210089781831, + "language_loss": 0.74574983, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76734024, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5904, + "time_per_iteration": 2.462024688720703 + }, + { + "auxiliary_loss_clip": 0.0112423, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.04362941, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.0389703534000443, + "language_loss": 0.78855121, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81020248, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8046875, + "step": 5905, + "time_per_iteration": 2.418665885925293 + }, + { + "auxiliary_loss_clip": 0.0111773, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.0190388, + "balance_loss_mlp": 1.04383469, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 2.1398902938273547, + "language_loss": 0.72515827, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74664938, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 5906, + "time_per_iteration": 2.441795825958252 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.02187109, + "balance_loss_mlp": 1.04545534, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.0230910533888107, + "language_loss": 0.74762344, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.7692821, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5907, + "time_per_iteration": 2.4404122829437256 + }, + { + "auxiliary_loss_clip": 0.01123497, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01939988, + "balance_loss_mlp": 1.04492426, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7742327577799557, + "language_loss": 0.75751841, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77909136, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5908, + "time_per_iteration": 2.5631895065307617 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01961696, + "balance_loss_mlp": 1.04734707, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.7057235578436956, + "language_loss": 0.68026733, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70187092, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5909, + "time_per_iteration": 2.480511426925659 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.01937413, + "balance_loss_mlp": 1.04523396, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 3.5777269988287297, + "language_loss": 0.78628188, + "learning_rate": 2.988736221969144e-06, + "loss": 0.8078106, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 5910, + "time_per_iteration": 2.4763131141662598 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.02545595, + "balance_loss_mlp": 1.04625309, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.525011794663279, + "language_loss": 0.70639479, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72808856, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5911, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01119575, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.04294884, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 1.9668748220600272, + "language_loss": 0.87014282, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89169508, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 5912, + "time_per_iteration": 2.461251735687256 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.018556, + "balance_loss_mlp": 1.04507196, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.7619620740638822, + "language_loss": 0.7701745, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79172838, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5913, + "time_per_iteration": 2.4517738819122314 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02001143, + "balance_loss_mlp": 1.04793298, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3300117090522248, + "language_loss": 0.82507938, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84666395, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 5914, + "time_per_iteration": 2.4964141845703125 + }, + { + "auxiliary_loss_clip": 0.01124534, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.01771307, + "balance_loss_mlp": 1.04573739, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.1657623831524604, + "language_loss": 0.70703268, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72859794, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 5915, + "time_per_iteration": 2.5425658226013184 + }, + { + "auxiliary_loss_clip": 0.01120767, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.0249182, + "balance_loss_mlp": 1.04248476, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.7489130528457595, + "language_loss": 0.76365829, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78525031, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 5916, + "time_per_iteration": 2.49629545211792 + }, + { + "auxiliary_loss_clip": 0.01128234, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02360404, + "balance_loss_mlp": 1.04853928, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.96232440030472, + "language_loss": 0.88380635, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90545923, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.796875, + "step": 5917, + "time_per_iteration": 2.4549498558044434 + }, + { + "auxiliary_loss_clip": 0.01123557, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01144493, + "balance_loss_mlp": 1.04562521, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.0473051476373048, + "language_loss": 0.74389327, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76538098, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5918, + "time_per_iteration": 2.448164701461792 + }, + { + "auxiliary_loss_clip": 0.01039303, + "auxiliary_loss_mlp": 0.01015071, + "balance_loss_clip": 1.01344931, + "balance_loss_mlp": 1.01430607, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 1.0267040132589962, + "language_loss": 0.63732457, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65786839, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.25, + "step": 5919, + "time_per_iteration": 2.837815999984741 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01805615, + "balance_loss_mlp": 1.04376245, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.8747663216478503, + "language_loss": 0.73868048, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76025695, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5920, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02015972, + "balance_loss_mlp": 1.04353166, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.659561193633535, + "language_loss": 0.77124226, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79279101, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5921, + "time_per_iteration": 2.461014986038208 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.01968277, + "balance_loss_mlp": 1.04409981, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 3.1644779785561563, + "language_loss": 0.67710596, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69866371, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5922, + "time_per_iteration": 2.495504140853882 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01567268, + "balance_loss_mlp": 1.04373431, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 1.9745978513449503, + "language_loss": 0.79269004, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81421471, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5923, + "time_per_iteration": 2.4515416622161865 + }, + { + "auxiliary_loss_clip": 0.01124058, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02631271, + "balance_loss_mlp": 1.04502511, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7698063934253627, + "language_loss": 0.85475516, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87638795, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7890625, + "step": 5924, + "time_per_iteration": 2.4790685176849365 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.02344394, + "balance_loss_mlp": 1.04368067, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 1.844353158814239, + "language_loss": 0.77513188, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79672253, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 5925, + "time_per_iteration": 2.5064613819122314 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.0301789, + "balance_loss_mlp": 1.04067063, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.7016119178915972, + "language_loss": 0.75874609, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78037679, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5926, + "time_per_iteration": 2.451852798461914 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02194762, + "balance_loss_mlp": 1.04408717, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0486133546267737, + "language_loss": 0.69321811, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71483439, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5927, + "time_per_iteration": 2.4770915508270264 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.0237366, + "balance_loss_mlp": 1.0428226, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.8762651107969224, + "language_loss": 0.79633021, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81789798, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 5928, + "time_per_iteration": 4.019433259963989 + }, + { + "auxiliary_loss_clip": 0.01120965, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.02581263, + "balance_loss_mlp": 1.04338682, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4128421638180557, + "language_loss": 0.81568098, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83728826, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5929, + "time_per_iteration": 3.869184970855713 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02172661, + "balance_loss_mlp": 1.0402571, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7650523310611956, + "language_loss": 0.69981778, + "learning_rate": 2.981957928520201e-06, + "loss": 0.7213279, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5930, + "time_per_iteration": 2.418992519378662 + }, + { + "auxiliary_loss_clip": 0.01123929, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.02858853, + "balance_loss_mlp": 1.04340863, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 1.9164187115059894, + "language_loss": 0.67766178, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69933271, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5931, + "time_per_iteration": 2.4688074588775635 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.0203712, + "balance_loss_mlp": 1.04403675, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.736290109138699, + "language_loss": 0.67451715, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69607264, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5932, + "time_per_iteration": 2.4908299446105957 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.04304647, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.602576254435761, + "language_loss": 0.7878592, + "learning_rate": 2.980939897348969e-06, + "loss": 0.8093667, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 5933, + "time_per_iteration": 2.442464590072632 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.02893806, + "balance_loss_mlp": 1.04176354, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4946029259135472, + "language_loss": 0.69271672, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71436697, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5934, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02636409, + "balance_loss_mlp": 1.04726946, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.881720756405168, + "language_loss": 0.71268845, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73441839, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5935, + "time_per_iteration": 2.460548162460327 + }, + { + "auxiliary_loss_clip": 0.01124043, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.02476776, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.474293421119334, + "language_loss": 0.78293073, + "learning_rate": 2.979921531401692e-06, + "loss": 0.8045634, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5936, + "time_per_iteration": 2.4517645835876465 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.02472031, + "balance_loss_mlp": 1.04367638, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4518862241402966, + "language_loss": 0.64218014, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66379213, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 5937, + "time_per_iteration": 2.5837321281433105 + }, + { + "auxiliary_loss_clip": 0.01124449, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.04442978, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5143509931773553, + "language_loss": 0.77877963, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80041015, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5938, + "time_per_iteration": 2.4190945625305176 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02890944, + "balance_loss_mlp": 1.04582071, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.8770011073758637, + "language_loss": 0.80256367, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82424247, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5939, + "time_per_iteration": 2.5029094219207764 + }, + { + "auxiliary_loss_clip": 0.01126611, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.0213412, + "balance_loss_mlp": 1.04299128, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.6875415435298406, + "language_loss": 0.79203522, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81365997, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5940, + "time_per_iteration": 2.526545524597168 + }, + { + "auxiliary_loss_clip": 0.01124522, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.01704049, + "balance_loss_mlp": 1.0441246, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.480743427796476, + "language_loss": 0.72739166, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74895537, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5941, + "time_per_iteration": 2.4599413871765137 + }, + { + "auxiliary_loss_clip": 0.01123947, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.02546012, + "balance_loss_mlp": 1.04480743, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.979069530543237, + "language_loss": 0.64202702, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66367018, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 5942, + "time_per_iteration": 2.5174636840820312 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.04385567, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.2469009256176053, + "language_loss": 0.74055374, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76215225, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5943, + "time_per_iteration": 2.5392913818359375 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01008275, + "balance_loss_clip": 1.00640345, + "balance_loss_mlp": 1.01455188, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7872915284740177, + "language_loss": 0.60689372, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62737316, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25, + "step": 5944, + "time_per_iteration": 3.17051100730896 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02003157, + "balance_loss_mlp": 1.04313469, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.033108996495456, + "language_loss": 0.72646821, + "learning_rate": 2.976864428379655e-06, + "loss": 0.7480244, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5945, + "time_per_iteration": 2.444373846054077 + }, + { + "auxiliary_loss_clip": 0.01121962, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04313612, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7423109631574678, + "language_loss": 0.81255424, + "learning_rate": 2.976524564880326e-06, + "loss": 0.8341651, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 5946, + "time_per_iteration": 2.470513343811035 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.02808666, + "balance_loss_mlp": 1.04524601, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.9099881709146462, + "language_loss": 0.68893784, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71061212, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5947, + "time_per_iteration": 2.4653477668762207 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04441905, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.655085874443405, + "language_loss": 0.75428057, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77588153, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 5948, + "time_per_iteration": 2.4385483264923096 + }, + { + "auxiliary_loss_clip": 0.01119692, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.02650094, + "balance_loss_mlp": 1.04049134, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.354345427402619, + "language_loss": 0.70556438, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72717237, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5949, + "time_per_iteration": 2.4992663860321045 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02567744, + "balance_loss_mlp": 1.04348552, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.8941983472442732, + "language_loss": 0.77248389, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79408723, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 5950, + "time_per_iteration": 2.4295101165771484 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.02394795, + "balance_loss_mlp": 1.04274225, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5707876816938207, + "language_loss": 0.72766685, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74928057, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5951, + "time_per_iteration": 2.444349765777588 + }, + { + "auxiliary_loss_clip": 0.0112562, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.04390478, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 1.9955959935597258, + "language_loss": 0.69730532, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71895468, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5952, + "time_per_iteration": 2.49656081199646 + }, + { + "auxiliary_loss_clip": 0.01120518, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.02497923, + "balance_loss_mlp": 1.04271066, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.0583657570083416, + "language_loss": 0.69432503, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71591723, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5953, + "time_per_iteration": 2.6221721172332764 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01641417, + "balance_loss_mlp": 1.04322994, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.5429391611916807, + "language_loss": 0.66673422, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68824828, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 5954, + "time_per_iteration": 2.465116262435913 + }, + { + "auxiliary_loss_clip": 0.01123263, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02383566, + "balance_loss_mlp": 1.04475307, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.7040470297828096, + "language_loss": 0.74838006, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76998997, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 5955, + "time_per_iteration": 2.4968783855438232 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.0219382, + "balance_loss_mlp": 1.04289603, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.6820855707774873, + "language_loss": 0.76043999, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78197372, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 5956, + "time_per_iteration": 2.498699903488159 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.0214982, + "balance_loss_mlp": 1.04263568, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.7390523407913014, + "language_loss": 0.73059452, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75211895, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 5957, + "time_per_iteration": 2.4503817558288574 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.02197433, + "balance_loss_mlp": 1.04503369, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.990259024529503, + "language_loss": 0.70640051, + "learning_rate": 2.972443318242726e-06, + "loss": 0.7279774, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5958, + "time_per_iteration": 2.4611945152282715 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.0165484, + "balance_loss_mlp": 1.0413444, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.7206269565580243, + "language_loss": 0.88610697, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90757084, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 5959, + "time_per_iteration": 2.5129401683807373 + }, + { + "auxiliary_loss_clip": 0.01121057, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.02281785, + "balance_loss_mlp": 1.04400599, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.483187088646708, + "language_loss": 0.58103061, + "learning_rate": 2.971762593615679e-06, + "loss": 0.6026091, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 5960, + "time_per_iteration": 2.5110409259796143 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02201176, + "balance_loss_mlp": 1.04267251, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 1.9323395592862886, + "language_loss": 0.76102602, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78260595, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 5961, + "time_per_iteration": 2.46943736076355 + }, + { + "auxiliary_loss_clip": 0.01121367, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.01862621, + "balance_loss_mlp": 1.04458857, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8327349140058107, + "language_loss": 0.69974017, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72128505, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 5962, + "time_per_iteration": 2.5654361248016357 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0210433, + "balance_loss_mlp": 1.04321802, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5613001239774846, + "language_loss": 0.74749398, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76901346, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.75390625, + "step": 5963, + "time_per_iteration": 2.5135319232940674 + }, + { + "auxiliary_loss_clip": 0.01122172, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02149796, + "balance_loss_mlp": 1.04597044, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5825069258384938, + "language_loss": 0.78811383, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80968547, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 5964, + "time_per_iteration": 2.493169069290161 + }, + { + "auxiliary_loss_clip": 0.01124119, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.01870322, + "balance_loss_mlp": 1.04482806, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.8296471859577264, + "language_loss": 0.66694742, + "learning_rate": 2.970060137410626e-06, + "loss": 0.6885215, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5965, + "time_per_iteration": 2.4995884895324707 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.04270399, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 4.210402322068537, + "language_loss": 0.79008359, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81161171, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5966, + "time_per_iteration": 2.485438346862793 + }, + { + "auxiliary_loss_clip": 0.01121545, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02238536, + "balance_loss_mlp": 1.04341781, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 5.107721360348662, + "language_loss": 0.90911728, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93070352, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 5967, + "time_per_iteration": 2.547287702560425 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02648592, + "balance_loss_mlp": 1.04528475, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.7620117516801617, + "language_loss": 0.79739827, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.81907177, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 5968, + "time_per_iteration": 2.4543471336364746 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01052904, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.04604244, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.0044885906540424, + "language_loss": 0.83642054, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85822409, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5969, + "time_per_iteration": 2.502815008163452 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04245603, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.876228198696561, + "language_loss": 0.72377515, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74528718, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5970, + "time_per_iteration": 4.051819086074829 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.02178049, + "balance_loss_mlp": 1.0424037, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6844020581036279, + "language_loss": 0.79522693, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81676805, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5971, + "time_per_iteration": 3.8910434246063232 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.02427924, + "balance_loss_mlp": 1.0402174, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.924864359347905, + "language_loss": 0.78594625, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80753887, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5972, + "time_per_iteration": 2.4272611141204834 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.02378309, + "balance_loss_mlp": 1.04185021, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 3.2741380987368327, + "language_loss": 0.81252539, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83410573, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5973, + "time_per_iteration": 2.469438314437866 + }, + { + "auxiliary_loss_clip": 0.0103695, + "auxiliary_loss_mlp": 0.01001955, + "balance_loss_clip": 0.9999882, + "balance_loss_mlp": 1.01160312, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9181567019376142, + "language_loss": 0.56828684, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58867586, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.25390625, + "step": 5974, + "time_per_iteration": 2.918166399002075 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02548242, + "balance_loss_mlp": 1.04407859, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6252506462115286, + "language_loss": 0.68750453, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7091189, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78515625, + "step": 5975, + "time_per_iteration": 2.4578702449798584 + }, + { + "auxiliary_loss_clip": 0.01119888, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.02132642, + "balance_loss_mlp": 1.04269934, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.7542310571392548, + "language_loss": 0.79961413, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82115752, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 5976, + "time_per_iteration": 2.494723081588745 + }, + { + "auxiliary_loss_clip": 0.01119534, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.04172039, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.7409485188517788, + "language_loss": 0.79081398, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81242788, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 5977, + "time_per_iteration": 2.4949100017547607 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02151847, + "balance_loss_mlp": 1.04029524, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7920092294573908, + "language_loss": 0.80654621, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82805401, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 5978, + "time_per_iteration": 2.445866584777832 + }, + { + "auxiliary_loss_clip": 0.01122409, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.04394007, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5382295990908517, + "language_loss": 0.67741489, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69898772, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5979, + "time_per_iteration": 2.538585662841797 + }, + { + "auxiliary_loss_clip": 0.01119324, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.02478838, + "balance_loss_mlp": 1.04136634, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.3207911240165697, + "language_loss": 0.67176729, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69334549, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5980, + "time_per_iteration": 2.4896938800811768 + }, + { + "auxiliary_loss_clip": 0.01123377, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.02377748, + "balance_loss_mlp": 1.0416832, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8107777091561479, + "language_loss": 0.71148199, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73310816, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.81640625, + "step": 5981, + "time_per_iteration": 2.49064302444458 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.02387476, + "balance_loss_mlp": 1.0432086, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.7933500913622242, + "language_loss": 0.71331298, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73492229, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5982, + "time_per_iteration": 2.5167934894561768 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02361536, + "balance_loss_mlp": 1.0427959, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6761533335073455, + "language_loss": 0.75808942, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77962971, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 5983, + "time_per_iteration": 2.4915101528167725 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.02771819, + "balance_loss_mlp": 1.04474413, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.1804669018597043, + "language_loss": 0.76302433, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78472364, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5984, + "time_per_iteration": 2.436640501022339 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.02356207, + "balance_loss_mlp": 1.0420785, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.837904559260202, + "language_loss": 0.86617446, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88773406, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 5985, + "time_per_iteration": 2.476853609085083 + }, + { + "auxiliary_loss_clip": 0.0111809, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02130079, + "balance_loss_mlp": 1.04078126, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.416236209566339, + "language_loss": 0.72801065, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74955392, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 5986, + "time_per_iteration": 2.443871021270752 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02354908, + "balance_loss_mlp": 1.04230642, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.880079313238184, + "language_loss": 0.73711401, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75873649, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5987, + "time_per_iteration": 2.517045736312866 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.04161966, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 1.8583263097896845, + "language_loss": 0.69824201, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71982217, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5988, + "time_per_iteration": 2.484654426574707 + }, + { + "auxiliary_loss_clip": 0.01125207, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.02675915, + "balance_loss_mlp": 1.04297233, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.851186734533378, + "language_loss": 0.72918314, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75084746, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5989, + "time_per_iteration": 2.464378833770752 + }, + { + "auxiliary_loss_clip": 0.01120868, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.0194943, + "balance_loss_mlp": 1.04283333, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.8425061302669492, + "language_loss": 0.79664916, + "learning_rate": 2.961534094403931e-06, + "loss": 0.81818593, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.78125, + "step": 5990, + "time_per_iteration": 2.4947755336761475 + }, + { + "auxiliary_loss_clip": 0.01121243, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.04281235, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.9352260247419832, + "language_loss": 0.84225297, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86375415, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 5991, + "time_per_iteration": 2.4728991985321045 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.02490079, + "balance_loss_mlp": 1.04197788, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9640325518662143, + "language_loss": 0.75616056, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77778924, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.81640625, + "step": 5992, + "time_per_iteration": 2.4422738552093506 + }, + { + "auxiliary_loss_clip": 0.01119253, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02022302, + "balance_loss_mlp": 1.04177451, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 6.32582004359923, + "language_loss": 0.77500135, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79654288, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 5993, + "time_per_iteration": 2.4513776302337646 + }, + { + "auxiliary_loss_clip": 0.01124951, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.02281737, + "balance_loss_mlp": 1.04405534, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.9096274983436938, + "language_loss": 0.74686468, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7684797, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80859375, + "step": 5994, + "time_per_iteration": 2.4278860092163086 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.02506554, + "balance_loss_mlp": 1.04320991, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8397117218597796, + "language_loss": 0.68890274, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71053243, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5995, + "time_per_iteration": 2.462557554244995 + }, + { + "auxiliary_loss_clip": 0.01124519, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.02548289, + "balance_loss_mlp": 1.04238582, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7352965040741237, + "language_loss": 0.82057822, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84222531, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8203125, + "step": 5996, + "time_per_iteration": 2.4284703731536865 + }, + { + "auxiliary_loss_clip": 0.01119849, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.04242694, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.56212250683249, + "language_loss": 0.73570979, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75725353, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5997, + "time_per_iteration": 2.4418485164642334 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.04307055, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.1655767572067637, + "language_loss": 0.68651283, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.70807832, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5998, + "time_per_iteration": 2.435884475708008 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.01983321, + "balance_loss_mlp": 1.04494119, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.6750874406601914, + "language_loss": 0.77190387, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79348445, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5999, + "time_per_iteration": 2.415649175643921 + }, + { + "auxiliary_loss_clip": 0.01123679, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.02381015, + "balance_loss_mlp": 1.04481769, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 2.719833162653021, + "language_loss": 0.78307509, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80469108, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 6000, + "time_per_iteration": 2.450085401535034 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.02061474, + "balance_loss_mlp": 1.04283905, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6917067376727954, + "language_loss": 0.78621352, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80777717, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6001, + "time_per_iteration": 2.4247405529022217 + }, + { + "auxiliary_loss_clip": 0.01119251, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.01761651, + "balance_loss_mlp": 1.04341698, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.9017223481518102, + "language_loss": 0.83743405, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85893983, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6002, + "time_per_iteration": 2.4587790966033936 + }, + { + "auxiliary_loss_clip": 0.01117677, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.01753855, + "balance_loss_mlp": 1.04298413, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.101850625944426, + "language_loss": 0.90627617, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92775667, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6003, + "time_per_iteration": 2.450408697128296 + }, + { + "auxiliary_loss_clip": 0.01040628, + "auxiliary_loss_mlp": 0.01013073, + "balance_loss_clip": 1.01102221, + "balance_loss_mlp": 1.01496768, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8843653445723816, + "language_loss": 0.53374904, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55428606, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.25585938, + "step": 6004, + "time_per_iteration": 3.005659341812134 + }, + { + "auxiliary_loss_clip": 0.01121195, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02152824, + "balance_loss_mlp": 1.04164577, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7248099575523852, + "language_loss": 0.77609527, + "learning_rate": 2.956407517225883e-06, + "loss": 0.7976777, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.796875, + "step": 6005, + "time_per_iteration": 2.4916067123413086 + }, + { + "auxiliary_loss_clip": 0.01124405, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.04700613, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.24467290311728, + "language_loss": 0.79267776, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81428248, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 6006, + "time_per_iteration": 2.4366166591644287 + }, + { + "auxiliary_loss_clip": 0.01124848, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02309775, + "balance_loss_mlp": 1.04587984, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.7888636143213261, + "language_loss": 0.84360719, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86524487, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.7890625, + "step": 6007, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.04622328, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.0771979180574425, + "language_loss": 0.72564125, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74731576, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 6008, + "time_per_iteration": 2.4473018646240234 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02066362, + "balance_loss_mlp": 1.04255283, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.9836274680059969, + "language_loss": 0.8284781, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85002339, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 6009, + "time_per_iteration": 2.470031261444092 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.02945232, + "balance_loss_mlp": 1.04598057, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.714442270200285, + "language_loss": 0.76139152, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78308332, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6010, + "time_per_iteration": 2.446833848953247 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0236752, + "balance_loss_mlp": 1.04619896, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.905716478313633, + "language_loss": 0.82946253, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85107422, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6011, + "time_per_iteration": 2.508147716522217 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.0264287, + "balance_loss_mlp": 1.0491302, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8484903271380355, + "language_loss": 0.62762833, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64936543, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 6012, + "time_per_iteration": 5.36588454246521 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02112007, + "balance_loss_mlp": 1.04337454, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.8689670235824563, + "language_loss": 0.84111822, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86265635, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6013, + "time_per_iteration": 2.494051933288574 + }, + { + "auxiliary_loss_clip": 0.01124804, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02690601, + "balance_loss_mlp": 1.04570448, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.7351999387675028, + "language_loss": 0.91496456, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93662584, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6014, + "time_per_iteration": 2.4356749057769775 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_clip": 1.03078914, + "balance_loss_mlp": 1.04549718, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.727703603585928, + "language_loss": 0.73830914, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75999045, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6015, + "time_per_iteration": 2.4990644454956055 + }, + { + "auxiliary_loss_clip": 0.01125644, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02394915, + "balance_loss_mlp": 1.04633307, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7277224025907603, + "language_loss": 0.65316677, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67480516, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6016, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.01126131, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02522171, + "balance_loss_mlp": 1.04727304, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.744160138264151, + "language_loss": 0.72101283, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74268931, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6017, + "time_per_iteration": 2.638683795928955 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.02413559, + "balance_loss_mlp": 1.04454577, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.9120538903838002, + "language_loss": 0.73590356, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75755334, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 6018, + "time_per_iteration": 2.4477858543395996 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02013361, + "balance_loss_mlp": 1.04458487, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.754547200149591, + "language_loss": 0.69080901, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71234632, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6019, + "time_per_iteration": 2.519831657409668 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.01980555, + "balance_loss_mlp": 1.0443728, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.5467952079219929, + "language_loss": 0.76299942, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78459549, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6020, + "time_per_iteration": 2.4692177772521973 + }, + { + "auxiliary_loss_clip": 0.01125932, + "auxiliary_loss_mlp": 0.01043324, + "balance_loss_clip": 1.02814841, + "balance_loss_mlp": 1.04721653, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.938447153390643, + "language_loss": 0.73921824, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76091087, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6021, + "time_per_iteration": 2.5069808959960938 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.02059376, + "balance_loss_mlp": 1.04596186, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.8648032073369731, + "language_loss": 0.80978441, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83135605, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 6022, + "time_per_iteration": 2.4620115756988525 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.02212477, + "balance_loss_mlp": 1.04778302, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6799220656127192, + "language_loss": 0.81351119, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83508855, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6023, + "time_per_iteration": 2.4969308376312256 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01975274, + "balance_loss_mlp": 1.04494548, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8793265875700644, + "language_loss": 0.79767907, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81927156, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6024, + "time_per_iteration": 2.468369245529175 + }, + { + "auxiliary_loss_clip": 0.01119855, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.01629043, + "balance_loss_mlp": 1.04456711, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.7897574616215441, + "language_loss": 0.74720407, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7687006, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6025, + "time_per_iteration": 2.4410412311553955 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01849484, + "balance_loss_mlp": 1.04340899, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.5522426900619628, + "language_loss": 0.72055018, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74207234, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6026, + "time_per_iteration": 2.4997596740722656 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.02645707, + "balance_loss_mlp": 1.04604256, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.401846993246305, + "language_loss": 0.79332775, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81502712, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 6027, + "time_per_iteration": 2.5326383113861084 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.02576041, + "balance_loss_mlp": 1.04399586, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7548337209278033, + "language_loss": 0.67809385, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69973445, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6028, + "time_per_iteration": 2.548088788986206 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.01922584, + "balance_loss_mlp": 1.04415894, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.63067637662311, + "language_loss": 0.85700679, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.8785423, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 6029, + "time_per_iteration": 2.429720878601074 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.02412939, + "balance_loss_mlp": 1.04442835, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.6511023563359555, + "language_loss": 0.72693753, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74851942, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6030, + "time_per_iteration": 2.4299302101135254 + }, + { + "auxiliary_loss_clip": 0.01123199, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.04264557, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.02536170930057, + "language_loss": 0.73986644, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76151514, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8046875, + "step": 6031, + "time_per_iteration": 2.4376232624053955 + }, + { + "auxiliary_loss_clip": 0.01120355, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0177424, + "balance_loss_mlp": 1.04309845, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.8475328889194098, + "language_loss": 0.73286617, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75438625, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6032, + "time_per_iteration": 2.4811155796051025 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.02412748, + "balance_loss_mlp": 1.0427382, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.684246043345259, + "language_loss": 0.77953577, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80113035, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 6033, + "time_per_iteration": 2.4283456802368164 + }, + { + "auxiliary_loss_clip": 0.01040416, + "auxiliary_loss_mlp": 0.01019079, + "balance_loss_clip": 1.01733828, + "balance_loss_mlp": 1.01487339, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.786107382559835, + "language_loss": 0.64822888, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66882384, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25585938, + "step": 6034, + "time_per_iteration": 3.1253511905670166 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.02139246, + "balance_loss_mlp": 1.04131126, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.4985312456135769, + "language_loss": 0.90059769, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92213392, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6035, + "time_per_iteration": 2.4888923168182373 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.04239392, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7493433732375512, + "language_loss": 0.73526931, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7568388, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6036, + "time_per_iteration": 2.445058822631836 + }, + { + "auxiliary_loss_clip": 0.01124436, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01500189, + "balance_loss_mlp": 1.04274487, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.901551926176817, + "language_loss": 0.75938255, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78091925, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.81640625, + "step": 6037, + "time_per_iteration": 2.422229766845703 + }, + { + "auxiliary_loss_clip": 0.0111661, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0238812, + "balance_loss_mlp": 1.04227912, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6899683541385933, + "language_loss": 0.78120697, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80275297, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6038, + "time_per_iteration": 2.4582855701446533 + }, + { + "auxiliary_loss_clip": 0.0103994, + "auxiliary_loss_mlp": 0.01006466, + "balance_loss_clip": 1.00467765, + "balance_loss_mlp": 1.01452303, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8286249809211084, + "language_loss": 0.63413143, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65459549, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.25390625, + "step": 6039, + "time_per_iteration": 3.1417860984802246 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04391789, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 1.9215128015710738, + "language_loss": 0.70857447, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73013067, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6040, + "time_per_iteration": 2.505627155303955 + }, + { + "auxiliary_loss_clip": 0.0112497, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.01827383, + "balance_loss_mlp": 1.04445744, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.327350689124367, + "language_loss": 0.81322253, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83479762, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6041, + "time_per_iteration": 2.4475231170654297 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.01526928, + "balance_loss_mlp": 1.04150891, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.252727008735842, + "language_loss": 0.83721769, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85872102, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6042, + "time_per_iteration": 2.461111545562744 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.02583623, + "balance_loss_mlp": 1.04390788, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6644116234057968, + "language_loss": 0.78122932, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80283511, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6043, + "time_per_iteration": 2.477030038833618 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.02017403, + "balance_loss_mlp": 1.04266226, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 3.8032713581650515, + "language_loss": 0.65792918, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67945337, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.74609375, + "step": 6044, + "time_per_iteration": 2.471221446990967 + }, + { + "auxiliary_loss_clip": 0.01118191, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01706135, + "balance_loss_mlp": 1.04186332, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.74112377533005, + "language_loss": 0.80978471, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83127558, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6045, + "time_per_iteration": 2.482147693634033 + }, + { + "auxiliary_loss_clip": 0.01122131, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.04342091, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.7414472049280392, + "language_loss": 0.64214617, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66375309, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6046, + "time_per_iteration": 2.593209743499756 + }, + { + "auxiliary_loss_clip": 0.01119542, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02142787, + "balance_loss_mlp": 1.04214859, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.623453692259123, + "language_loss": 0.77366132, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.7952106, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6047, + "time_per_iteration": 2.4650797843933105 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.02609777, + "balance_loss_mlp": 1.04148006, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.508802610673932, + "language_loss": 0.79679012, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81846434, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8359375, + "step": 6048, + "time_per_iteration": 2.5329999923706055 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99839348, + "balance_loss_mlp": 1.0124383, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7564639677567045, + "language_loss": 0.52584642, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54622656, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25390625, + "step": 6049, + "time_per_iteration": 3.1051762104034424 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.02172136, + "balance_loss_mlp": 1.04254675, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0453292842004833, + "language_loss": 0.86365628, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88522977, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6050, + "time_per_iteration": 2.469092845916748 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04309154, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7649295268136813, + "language_loss": 0.7855531, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80711287, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6051, + "time_per_iteration": 2.425166368484497 + }, + { + "auxiliary_loss_clip": 0.0111821, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02044129, + "balance_loss_mlp": 1.04047346, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 2.0280679706971423, + "language_loss": 0.83024764, + "learning_rate": 2.940291602812822e-06, + "loss": 0.8517735, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6052, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02146947, + "balance_loss_mlp": 1.03992438, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 3.055248278017369, + "language_loss": 0.72156489, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74305683, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6053, + "time_per_iteration": 4.030078887939453 + }, + { + "auxiliary_loss_clip": 0.01038499, + "auxiliary_loss_mlp": 0.01000787, + "balance_loss_clip": 0.99893934, + "balance_loss_mlp": 1.01315093, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7695228081579073, + "language_loss": 0.61234874, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63274157, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.25390625, + "step": 6054, + "time_per_iteration": 4.498634576797485 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.0425837, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.9647165397438333, + "language_loss": 0.75846946, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78007108, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6055, + "time_per_iteration": 2.46478271484375 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.0240891, + "balance_loss_mlp": 1.04369521, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.6567803669377452, + "language_loss": 0.75263339, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7742365, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6056, + "time_per_iteration": 2.4739041328430176 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.04331231, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.0844054878938607, + "language_loss": 0.80676425, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82835501, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6057, + "time_per_iteration": 2.4778594970703125 + }, + { + "auxiliary_loss_clip": 0.01119344, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02457452, + "balance_loss_mlp": 1.04333091, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8744131952209395, + "language_loss": 0.79986346, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82144856, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6058, + "time_per_iteration": 2.5267081260681152 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02142191, + "balance_loss_mlp": 1.04207647, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.8448855765347556, + "language_loss": 0.8485254, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87007678, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6059, + "time_per_iteration": 2.4876210689544678 + }, + { + "auxiliary_loss_clip": 0.01123355, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.02527666, + "balance_loss_mlp": 1.04397857, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.4958849024653313, + "language_loss": 0.8783946, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90002865, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6060, + "time_per_iteration": 2.516439199447632 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02583957, + "balance_loss_mlp": 1.04366183, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.6600271028380824, + "language_loss": 0.67965293, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70130551, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6061, + "time_per_iteration": 2.4436440467834473 + }, + { + "auxiliary_loss_clip": 0.01127668, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.02444267, + "balance_loss_mlp": 1.04622734, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.142951671935031, + "language_loss": 0.75072217, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77239573, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 6062, + "time_per_iteration": 2.4325368404388428 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01910567, + "balance_loss_mlp": 1.04460645, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.6782897381106048, + "language_loss": 0.72632384, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74789596, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6063, + "time_per_iteration": 2.498168468475342 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.0249579, + "balance_loss_mlp": 1.04365671, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8702732296649918, + "language_loss": 0.68128121, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70288265, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6064, + "time_per_iteration": 2.4951584339141846 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01046801, + "balance_loss_clip": 1.03205502, + "balance_loss_mlp": 1.04549003, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.679838788119498, + "language_loss": 0.74604851, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76777375, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6065, + "time_per_iteration": 2.4980344772338867 + }, + { + "auxiliary_loss_clip": 0.01125488, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.02544403, + "balance_loss_mlp": 1.04464209, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.8520658730284223, + "language_loss": 0.75248677, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77415788, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6066, + "time_per_iteration": 2.5525264739990234 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02101445, + "balance_loss_mlp": 1.04115653, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.55479391525507, + "language_loss": 0.76988614, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79139876, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6067, + "time_per_iteration": 2.440595865249634 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02422583, + "balance_loss_mlp": 1.04442596, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.1532465459722574, + "language_loss": 0.70826519, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72984099, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6068, + "time_per_iteration": 2.4555468559265137 + }, + { + "auxiliary_loss_clip": 0.01123082, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.04301953, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8428063971352102, + "language_loss": 0.73987395, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76148373, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 6069, + "time_per_iteration": 2.4380593299865723 + }, + { + "auxiliary_loss_clip": 0.01124432, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.0203104, + "balance_loss_mlp": 1.04434299, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.740540431199334, + "language_loss": 0.66149801, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68309319, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 6070, + "time_per_iteration": 2.4852278232574463 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.02225685, + "balance_loss_mlp": 1.04412127, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.5531027619052142, + "language_loss": 0.74474913, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76631367, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6071, + "time_per_iteration": 2.483961820602417 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.04232538, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 2.0347636440980277, + "language_loss": 0.88132894, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90287089, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6072, + "time_per_iteration": 2.4083876609802246 + }, + { + "auxiliary_loss_clip": 0.01121735, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.02184379, + "balance_loss_mlp": 1.04389739, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.230203116909298, + "language_loss": 0.72432441, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74589849, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6073, + "time_per_iteration": 2.4769015312194824 + }, + { + "auxiliary_loss_clip": 0.01123465, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.02520275, + "balance_loss_mlp": 1.04425395, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.8811318432297164, + "language_loss": 0.66584921, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68747932, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6074, + "time_per_iteration": 2.4474194049835205 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01891208, + "balance_loss_mlp": 1.04079318, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.5068114870819531, + "language_loss": 0.72946787, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75097322, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6075, + "time_per_iteration": 2.5063765048980713 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.04484594, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.7314154698808113, + "language_loss": 0.8938573, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91555977, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 6076, + "time_per_iteration": 2.4518303871154785 + }, + { + "auxiliary_loss_clip": 0.01121617, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02438378, + "balance_loss_mlp": 1.04457617, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.2164690925931976, + "language_loss": 0.69506466, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71667087, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6077, + "time_per_iteration": 2.447659730911255 + }, + { + "auxiliary_loss_clip": 0.01043202, + "auxiliary_loss_mlp": 0.01009421, + "balance_loss_clip": 1.00758541, + "balance_loss_mlp": 1.01693892, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7520139059893192, + "language_loss": 0.61798048, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63850671, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.26171875, + "step": 6078, + "time_per_iteration": 3.1669509410858154 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02238369, + "balance_loss_mlp": 1.04217839, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.8851740765331422, + "language_loss": 0.78088033, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80244297, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6079, + "time_per_iteration": 2.4570510387420654 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02517176, + "balance_loss_mlp": 1.04497504, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.129422570654268, + "language_loss": 0.62885886, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65051121, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6080, + "time_per_iteration": 2.65580415725708 + }, + { + "auxiliary_loss_clip": 0.01122781, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.04280567, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.4061972925673385, + "language_loss": 0.67665905, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69823289, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6081, + "time_per_iteration": 2.4747202396392822 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.02287912, + "balance_loss_mlp": 1.04305673, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.9082106177767983, + "language_loss": 0.74747473, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76910245, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 6082, + "time_per_iteration": 2.5238633155822754 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01576877, + "balance_loss_mlp": 1.04598689, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.8091692998669453, + "language_loss": 0.82823056, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84978318, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8046875, + "step": 6083, + "time_per_iteration": 2.517704963684082 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.00026309, + "balance_loss_mlp": 1.01621974, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8152809684063654, + "language_loss": 0.59372437, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61416495, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25585938, + "step": 6084, + "time_per_iteration": 3.126275062561035 + }, + { + "auxiliary_loss_clip": 0.01121734, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02273536, + "balance_loss_mlp": 1.04410744, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.719357970509058, + "language_loss": 0.73096633, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75255334, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6085, + "time_per_iteration": 2.436722755432129 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01485801, + "balance_loss_mlp": 1.0447793, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 4.360512376704014, + "language_loss": 0.7831111, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80462652, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 6086, + "time_per_iteration": 2.557521104812622 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0150919, + "balance_loss_mlp": 1.0403074, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.7974113126538098, + "language_loss": 0.77105325, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79248881, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6087, + "time_per_iteration": 2.544868230819702 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01587856, + "balance_loss_mlp": 1.04190612, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 5.741725291334025, + "language_loss": 0.70710862, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72863311, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6088, + "time_per_iteration": 2.491933822631836 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 1.04569137, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.6695945607154594, + "language_loss": 0.79878473, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82043338, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 6089, + "time_per_iteration": 2.666814088821411 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01816463, + "balance_loss_mlp": 1.04267049, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.7190941707632215, + "language_loss": 0.71335226, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73486418, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 6090, + "time_per_iteration": 2.5138063430786133 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02757084, + "balance_loss_mlp": 1.04391527, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.9784029627642763, + "language_loss": 0.74276829, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76437145, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6091, + "time_per_iteration": 2.437126636505127 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.04396391, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.8707748404117035, + "language_loss": 0.72492194, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74652249, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6092, + "time_per_iteration": 2.5038540363311768 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03072441, + "balance_loss_mlp": 1.04359424, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9548617375197639, + "language_loss": 0.78251863, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.8041966, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6093, + "time_per_iteration": 2.453854560852051 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.02226686, + "balance_loss_mlp": 1.04095936, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.7535936892187265, + "language_loss": 0.74123377, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76279384, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 6094, + "time_per_iteration": 2.5953075885772705 + }, + { + "auxiliary_loss_clip": 0.01125058, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.0381875, + "balance_loss_mlp": 1.04492939, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.5564182913572622, + "language_loss": 0.79226458, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81404281, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80078125, + "step": 6095, + "time_per_iteration": 5.4338037967681885 + }, + { + "auxiliary_loss_clip": 0.01125087, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.02055264, + "balance_loss_mlp": 1.04422212, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.287741364035224, + "language_loss": 0.73586392, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75747252, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 6096, + "time_per_iteration": 3.923590660095215 + }, + { + "auxiliary_loss_clip": 0.0112257, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.0195781, + "balance_loss_mlp": 1.04206252, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.2038030169597875, + "language_loss": 0.67285162, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69441259, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6097, + "time_per_iteration": 2.4843504428863525 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.0249629, + "balance_loss_mlp": 1.04401898, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.6404590263223953, + "language_loss": 0.77676886, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79839253, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 6098, + "time_per_iteration": 2.5663979053497314 + }, + { + "auxiliary_loss_clip": 0.0111895, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.02467644, + "balance_loss_mlp": 1.04334557, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.7512654587161538, + "language_loss": 0.73807114, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.7596488, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6099, + "time_per_iteration": 2.442549705505371 + }, + { + "auxiliary_loss_clip": 0.01116483, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.03380322, + "balance_loss_mlp": 1.04073739, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.739052204204903, + "language_loss": 0.84383607, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86547315, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6100, + "time_per_iteration": 2.4783878326416016 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02046633, + "balance_loss_mlp": 1.04215789, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.450199870045222, + "language_loss": 0.70504647, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72663701, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 6101, + "time_per_iteration": 2.4591257572174072 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.02629983, + "balance_loss_mlp": 1.04228854, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0513606804107543, + "language_loss": 0.76049435, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78212953, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.79296875, + "step": 6102, + "time_per_iteration": 2.491046190261841 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04445052, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.6383228145690705, + "language_loss": 0.69930172, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72093487, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 6103, + "time_per_iteration": 2.676790952682495 + }, + { + "auxiliary_loss_clip": 0.01121704, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02213061, + "balance_loss_mlp": 1.0423454, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8701272650505458, + "language_loss": 0.71414149, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73572791, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6104, + "time_per_iteration": 2.438197374343872 + }, + { + "auxiliary_loss_clip": 0.01121263, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.04288161, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.0275913231037923, + "language_loss": 0.81653488, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83807302, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6105, + "time_per_iteration": 2.437201976776123 + }, + { + "auxiliary_loss_clip": 0.0112675, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.02255476, + "balance_loss_mlp": 1.0441767, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7477833912391936, + "language_loss": 0.81079835, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83243787, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 6106, + "time_per_iteration": 2.5447771549224854 + }, + { + "auxiliary_loss_clip": 0.01041229, + "auxiliary_loss_mlp": 0.0100622, + "balance_loss_clip": 1.00440836, + "balance_loss_mlp": 1.01511836, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6829750500510474, + "language_loss": 0.59212124, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.6125958, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.26171875, + "step": 6107, + "time_per_iteration": 3.0983083248138428 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.01756859, + "balance_loss_mlp": 1.04195333, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.5524752326282045, + "language_loss": 0.74417794, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7656877, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6108, + "time_per_iteration": 2.5146114826202393 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02218354, + "balance_loss_mlp": 1.04104972, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.0732100862766294, + "language_loss": 0.73141801, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7529856, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 6109, + "time_per_iteration": 2.4597368240356445 + }, + { + "auxiliary_loss_clip": 0.01118669, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02015734, + "balance_loss_mlp": 1.0407654, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.8280489650426288, + "language_loss": 0.53282952, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55435723, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6110, + "time_per_iteration": 2.5454814434051514 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.01949728, + "balance_loss_mlp": 1.04360104, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4515242715586747, + "language_loss": 0.8026799, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82422882, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76953125, + "step": 6111, + "time_per_iteration": 2.4838016033172607 + }, + { + "auxiliary_loss_clip": 0.01119124, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02637434, + "balance_loss_mlp": 1.04195952, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7574831080907656, + "language_loss": 0.72220403, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74380273, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6112, + "time_per_iteration": 2.590109348297119 + }, + { + "auxiliary_loss_clip": 0.01120572, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.04220295, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6166739673118746, + "language_loss": 0.85398543, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87558413, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6113, + "time_per_iteration": 2.4480674266815186 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.02228022, + "balance_loss_mlp": 1.04214144, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.8814317352542869, + "language_loss": 0.78741604, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80901164, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 6114, + "time_per_iteration": 2.4870779514312744 + }, + { + "auxiliary_loss_clip": 0.01125295, + "auxiliary_loss_mlp": 0.01044195, + "balance_loss_clip": 1.0278033, + "balance_loss_mlp": 1.04344988, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.5830307408310422, + "language_loss": 0.66854429, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69023919, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 6115, + "time_per_iteration": 2.4361841678619385 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.01953745, + "balance_loss_mlp": 1.03984118, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.406761648754093, + "language_loss": 0.76663208, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78811574, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6116, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01119646, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.04111099, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.9705222106020779, + "language_loss": 0.62811542, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64971662, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 6117, + "time_per_iteration": 2.443798065185547 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.019261, + "balance_loss_mlp": 1.04137671, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9131647495504847, + "language_loss": 0.72974634, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75126612, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6118, + "time_per_iteration": 2.531804084777832 + }, + { + "auxiliary_loss_clip": 0.01123956, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.02694678, + "balance_loss_mlp": 1.04156733, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.002097677722335, + "language_loss": 0.72413695, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7457996, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 6119, + "time_per_iteration": 2.4641144275665283 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.0179317, + "balance_loss_mlp": 1.04397964, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.84976209385018, + "language_loss": 0.79848421, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82002181, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6120, + "time_per_iteration": 2.487030029296875 + }, + { + "auxiliary_loss_clip": 0.01117761, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.02612031, + "balance_loss_mlp": 1.04084468, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8961465807450149, + "language_loss": 0.63855267, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66013169, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6121, + "time_per_iteration": 2.4573564529418945 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.02755642, + "balance_loss_mlp": 1.0431416, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.8845840511442051, + "language_loss": 0.71209222, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73374552, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6122, + "time_per_iteration": 2.5197854042053223 + }, + { + "auxiliary_loss_clip": 0.01116909, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02091694, + "balance_loss_mlp": 1.04319501, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8566190114316727, + "language_loss": 0.69493115, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71644878, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6123, + "time_per_iteration": 2.5585381984710693 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.04312396, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.995367064863914, + "language_loss": 0.73392212, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.7555719, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6124, + "time_per_iteration": 2.56925368309021 + }, + { + "auxiliary_loss_clip": 0.01121929, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02465916, + "balance_loss_mlp": 1.04337013, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.997016319446362, + "language_loss": 0.74426562, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76589334, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.7890625, + "step": 6125, + "time_per_iteration": 2.493232488632202 + }, + { + "auxiliary_loss_clip": 0.01124729, + "auxiliary_loss_mlp": 0.01046169, + "balance_loss_clip": 1.03009367, + "balance_loss_mlp": 1.04400194, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.8135805598812564, + "language_loss": 0.78254056, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80424947, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6126, + "time_per_iteration": 2.4767327308654785 + }, + { + "auxiliary_loss_clip": 0.01123227, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.02125943, + "balance_loss_mlp": 1.04164457, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.275366104968191, + "language_loss": 0.66100526, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68261528, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.81640625, + "step": 6127, + "time_per_iteration": 2.4442801475524902 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02783585, + "balance_loss_mlp": 1.04527378, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.809419798014635, + "language_loss": 0.70553637, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72722864, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6128, + "time_per_iteration": 2.6163570880889893 + }, + { + "auxiliary_loss_clip": 0.01121361, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.0266788, + "balance_loss_mlp": 1.04374862, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.366686546837111, + "language_loss": 0.75425905, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77588773, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6129, + "time_per_iteration": 2.418318510055542 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.02652466, + "balance_loss_mlp": 1.0419023, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.583632674026135, + "language_loss": 0.84801334, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86962497, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6130, + "time_per_iteration": 2.4933249950408936 + }, + { + "auxiliary_loss_clip": 0.01041681, + "auxiliary_loss_mlp": 0.010081, + "balance_loss_clip": 1.00623989, + "balance_loss_mlp": 1.01602125, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8093683158704721, + "language_loss": 0.60352623, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62402403, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2578125, + "step": 6131, + "time_per_iteration": 3.1686718463897705 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01657534, + "balance_loss_mlp": 1.04083943, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5510352980860918, + "language_loss": 0.72903317, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75052321, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6132, + "time_per_iteration": 2.54154109954834 + }, + { + "auxiliary_loss_clip": 0.01124361, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02221942, + "balance_loss_mlp": 1.04263651, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.665822939326855, + "language_loss": 0.74255228, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76417446, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.81640625, + "step": 6133, + "time_per_iteration": 2.501119375228882 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.02420318, + "balance_loss_mlp": 1.04308438, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.60564703390979, + "language_loss": 0.71415824, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73572183, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6134, + "time_per_iteration": 2.472978353500366 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.0202322, + "balance_loss_mlp": 1.04333591, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.501197032587339, + "language_loss": 0.74985242, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77141684, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.77734375, + "step": 6135, + "time_per_iteration": 2.458523750305176 + }, + { + "auxiliary_loss_clip": 0.01043215, + "auxiliary_loss_mlp": 0.01004045, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.01762199, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8063752733434837, + "language_loss": 0.5878793, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60835183, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.25585938, + "step": 6136, + "time_per_iteration": 2.9917385578155518 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.02095878, + "balance_loss_mlp": 1.04477668, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8816926848284692, + "language_loss": 0.78812146, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80970407, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6137, + "time_per_iteration": 6.900243520736694 + }, + { + "auxiliary_loss_clip": 0.01122666, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.02594304, + "balance_loss_mlp": 1.04392326, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0278297083458345, + "language_loss": 0.74142605, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6138, + "time_per_iteration": 2.5056889057159424 + }, + { + "auxiliary_loss_clip": 0.01127012, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.02365959, + "balance_loss_mlp": 1.04482222, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.957735157830462, + "language_loss": 0.64818108, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66984075, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6139, + "time_per_iteration": 2.5345380306243896 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04279661, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0918485574433734, + "language_loss": 0.71384197, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73543906, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6140, + "time_per_iteration": 2.4318323135375977 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.00999596, + "balance_loss_clip": 0.99771231, + "balance_loss_mlp": 1.01712704, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7479140823872853, + "language_loss": 0.59281325, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61323869, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.2578125, + "step": 6141, + "time_per_iteration": 3.1505937576293945 + }, + { + "auxiliary_loss_clip": 0.01122987, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02499199, + "balance_loss_mlp": 1.04369187, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.768624510630746, + "language_loss": 0.7473368, + "learning_rate": 2.909212678216192e-06, + "loss": 0.76896417, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6142, + "time_per_iteration": 2.4768457412719727 + }, + { + "auxiliary_loss_clip": 0.01119694, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.02291358, + "balance_loss_mlp": 1.04270506, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.5385068391341603, + "language_loss": 0.76985848, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79142308, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6143, + "time_per_iteration": 2.4604313373565674 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02289248, + "balance_loss_mlp": 1.04277074, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4994482416842545, + "language_loss": 0.81616801, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.83771598, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6144, + "time_per_iteration": 2.529298782348633 + }, + { + "auxiliary_loss_clip": 0.0112261, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.02425694, + "balance_loss_mlp": 1.04323006, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 1.9122738225408384, + "language_loss": 0.77019674, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79180729, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.79296875, + "step": 6145, + "time_per_iteration": 2.4642515182495117 + }, + { + "auxiliary_loss_clip": 0.01123051, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.01760387, + "balance_loss_mlp": 1.04384804, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7518336089815172, + "language_loss": 0.76903462, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79058653, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.79296875, + "step": 6146, + "time_per_iteration": 2.49208927154541 + }, + { + "auxiliary_loss_clip": 0.01125412, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02746034, + "balance_loss_mlp": 1.04481673, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7861503855196468, + "language_loss": 0.80794239, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82962638, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6147, + "time_per_iteration": 2.417968988418579 + }, + { + "auxiliary_loss_clip": 0.01120028, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.0238626, + "balance_loss_mlp": 1.04083371, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7356953572419536, + "language_loss": 0.83196342, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85353833, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.79296875, + "step": 6148, + "time_per_iteration": 2.4493086338043213 + }, + { + "auxiliary_loss_clip": 0.01118838, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.02424645, + "balance_loss_mlp": 1.04304922, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.337121678381176, + "language_loss": 0.74373478, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76530743, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6149, + "time_per_iteration": 2.4594686031341553 + }, + { + "auxiliary_loss_clip": 0.01124701, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02210915, + "balance_loss_mlp": 1.04449439, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.7562888589836316, + "language_loss": 0.70538592, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72701365, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6150, + "time_per_iteration": 2.5232975482940674 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04390609, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.6469943204532072, + "language_loss": 0.82023048, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84183264, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6151, + "time_per_iteration": 2.448066473007202 + }, + { + "auxiliary_loss_clip": 0.01036606, + "auxiliary_loss_mlp": 0.01001329, + "balance_loss_clip": 0.99951726, + "balance_loss_mlp": 1.01119328, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.838014312453704, + "language_loss": 0.63083476, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65121406, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 6152, + "time_per_iteration": 3.170707941055298 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.02347398, + "balance_loss_mlp": 1.0429337, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 1.8166659348284784, + "language_loss": 0.70360208, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72515202, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6153, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01123537, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02420688, + "balance_loss_mlp": 1.04319179, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.0600031325492107, + "language_loss": 0.72201782, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74364597, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6154, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0111958, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01624274, + "balance_loss_mlp": 1.04201758, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.8383479148193087, + "language_loss": 0.67877179, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70026708, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6155, + "time_per_iteration": 2.454582929611206 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.01518905, + "balance_loss_mlp": 1.0420723, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.7213710867444976, + "language_loss": 0.67835188, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.6998316, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6156, + "time_per_iteration": 2.456244707107544 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.02461255, + "balance_loss_mlp": 1.04180884, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7871024658649661, + "language_loss": 0.82324016, + "learning_rate": 2.904005448099916e-06, + "loss": 0.8447994, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6157, + "time_per_iteration": 2.467258930206299 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.04224074, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.319348977212497, + "language_loss": 0.76519799, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78679597, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6158, + "time_per_iteration": 2.4462850093841553 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.02276468, + "balance_loss_mlp": 1.04128695, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.3237426114128903, + "language_loss": 0.6888833, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71047246, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 6159, + "time_per_iteration": 2.444615364074707 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.02298164, + "balance_loss_mlp": 1.04054952, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7829911261722147, + "language_loss": 0.7101602, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73170245, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 6160, + "time_per_iteration": 2.4807472229003906 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01808465, + "balance_loss_mlp": 1.04033566, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.5671410195286926, + "language_loss": 0.79049259, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81194532, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6161, + "time_per_iteration": 2.445615768432617 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.02266204, + "balance_loss_mlp": 1.04217172, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.6578530571842398, + "language_loss": 0.7961942, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81776464, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6162, + "time_per_iteration": 2.474179267883301 + }, + { + "auxiliary_loss_clip": 0.01118518, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.02122831, + "balance_loss_mlp": 1.04136944, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.751569507310971, + "language_loss": 0.79592955, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81746811, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6163, + "time_per_iteration": 2.429410696029663 + }, + { + "auxiliary_loss_clip": 0.01121642, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.01815772, + "balance_loss_mlp": 1.04239571, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6995697719291154, + "language_loss": 0.68002689, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70157188, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6164, + "time_per_iteration": 2.4500439167022705 + }, + { + "auxiliary_loss_clip": 0.01125233, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.0206207, + "balance_loss_mlp": 1.04507017, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.4697759057606197, + "language_loss": 0.82807398, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.84968388, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6165, + "time_per_iteration": 2.4863715171813965 + }, + { + "auxiliary_loss_clip": 0.01125688, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02060854, + "balance_loss_mlp": 1.04388845, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.8224972170046692, + "language_loss": 0.69500774, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71663356, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.81640625, + "step": 6166, + "time_per_iteration": 2.560605049133301 + }, + { + "auxiliary_loss_clip": 0.01038031, + "auxiliary_loss_mlp": 0.01006399, + "balance_loss_clip": 1.00471771, + "balance_loss_mlp": 1.01302195, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.8093247029889314, + "language_loss": 0.56892115, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58936548, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6167, + "time_per_iteration": 2.922917127609253 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01992154, + "balance_loss_mlp": 1.04288507, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.945139483069219, + "language_loss": 0.75539452, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77691436, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6168, + "time_per_iteration": 2.4489872455596924 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.02202857, + "balance_loss_mlp": 1.04180634, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.5586684776543853, + "language_loss": 0.7432459, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76480508, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6169, + "time_per_iteration": 2.4537463188171387 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01891184, + "balance_loss_mlp": 1.04480267, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.3706540261028175, + "language_loss": 0.79311681, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81465161, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6170, + "time_per_iteration": 2.4723992347717285 + }, + { + "auxiliary_loss_clip": 0.01122845, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.02390242, + "balance_loss_mlp": 1.04451621, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.6235616399590074, + "language_loss": 0.76385272, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78546989, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6171, + "time_per_iteration": 2.5364768505096436 + }, + { + "auxiliary_loss_clip": 0.01123724, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.01663446, + "balance_loss_mlp": 1.04594254, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9768297571305458, + "language_loss": 0.80696416, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82852054, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6172, + "time_per_iteration": 2.451099395751953 + }, + { + "auxiliary_loss_clip": 0.01124197, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.0219543, + "balance_loss_mlp": 1.04385138, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.2157067962534875, + "language_loss": 0.59447742, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61609542, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 6173, + "time_per_iteration": 2.5750677585601807 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.02122533, + "balance_loss_mlp": 1.04391754, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.9248503394254857, + "language_loss": 0.81157243, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83315188, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6174, + "time_per_iteration": 2.421182155609131 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.02035165, + "balance_loss_mlp": 1.04281855, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.8542839121663495, + "language_loss": 0.79834068, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81985891, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6175, + "time_per_iteration": 2.533447027206421 + }, + { + "auxiliary_loss_clip": 0.01124428, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.03194535, + "balance_loss_mlp": 1.04644537, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.6734071315129293, + "language_loss": 0.88764346, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90935433, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6176, + "time_per_iteration": 2.486224412918091 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02174938, + "balance_loss_mlp": 1.04402244, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5809846817738957, + "language_loss": 0.73293233, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75451624, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6177, + "time_per_iteration": 2.492033004760742 + }, + { + "auxiliary_loss_clip": 0.01119881, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04359818, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8832415058442271, + "language_loss": 0.75425023, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77584344, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6178, + "time_per_iteration": 4.005537748336792 + }, + { + "auxiliary_loss_clip": 0.01123036, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.01896191, + "balance_loss_mlp": 1.04618645, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.761738877644596, + "language_loss": 0.7228415, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74440265, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6179, + "time_per_iteration": 5.333393812179565 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.01987052, + "balance_loss_mlp": 1.04356897, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.1666258639633518, + "language_loss": 0.69705212, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71862751, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6180, + "time_per_iteration": 2.4896974563598633 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.01695561, + "balance_loss_mlp": 1.04157031, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7756296340851163, + "language_loss": 0.77702844, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79851079, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6181, + "time_per_iteration": 2.4324231147766113 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.0202775, + "balance_loss_mlp": 1.04225945, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.8526172549307973, + "language_loss": 0.78767365, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80920726, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6182, + "time_per_iteration": 2.47566819190979 + }, + { + "auxiliary_loss_clip": 0.01036072, + "auxiliary_loss_mlp": 0.01008449, + "balance_loss_clip": 1.00650644, + "balance_loss_mlp": 1.01082778, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7841437663574693, + "language_loss": 0.5748502, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59529543, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25195312, + "step": 6183, + "time_per_iteration": 3.0538721084594727 + }, + { + "auxiliary_loss_clip": 0.01124733, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02772832, + "balance_loss_mlp": 1.04238844, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.1996761862640715, + "language_loss": 0.76940209, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79108441, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.82421875, + "step": 6184, + "time_per_iteration": 2.4653987884521484 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.02268612, + "balance_loss_mlp": 1.04353404, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 13.965274526936179, + "language_loss": 0.72047049, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74203539, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6185, + "time_per_iteration": 2.458340644836426 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.01282895, + "balance_loss_mlp": 1.04169369, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.55661462109525, + "language_loss": 0.7702297, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79167652, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6186, + "time_per_iteration": 2.4665393829345703 + }, + { + "auxiliary_loss_clip": 0.01125099, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.0436089, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8483894715485976, + "language_loss": 0.83475709, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85642433, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8125, + "step": 6187, + "time_per_iteration": 2.520294427871704 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02228546, + "balance_loss_mlp": 1.0421021, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 2.555128723697134, + "language_loss": 0.84544367, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86700106, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6188, + "time_per_iteration": 2.4926793575286865 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01966548, + "balance_loss_mlp": 1.04392672, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6829112555225307, + "language_loss": 0.65646267, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67802715, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7734375, + "step": 6189, + "time_per_iteration": 2.447175979614258 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.02514815, + "balance_loss_mlp": 1.04456878, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.6073714147883162, + "language_loss": 0.83948457, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.8611058, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6190, + "time_per_iteration": 2.4410126209259033 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.01949084, + "balance_loss_mlp": 1.04337156, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.3404623023220643, + "language_loss": 0.88506198, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90665835, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 6191, + "time_per_iteration": 2.452972650527954 + }, + { + "auxiliary_loss_clip": 0.01123549, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.01648057, + "balance_loss_mlp": 1.04218102, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.570395080331924, + "language_loss": 0.74228191, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76384884, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8125, + "step": 6192, + "time_per_iteration": 2.6486353874206543 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02246475, + "balance_loss_mlp": 1.0427109, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.4820365699908944, + "language_loss": 0.79760754, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81916732, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6193, + "time_per_iteration": 2.525973081588745 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.02436423, + "balance_loss_mlp": 1.043504, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7895472081978328, + "language_loss": 0.84495157, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86657262, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6194, + "time_per_iteration": 2.419099807739258 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02080166, + "balance_loss_mlp": 1.04037666, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.9207659578016463, + "language_loss": 0.77555239, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79708451, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 6195, + "time_per_iteration": 2.3995044231414795 + }, + { + "auxiliary_loss_clip": 0.01119279, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01814234, + "balance_loss_mlp": 1.0428412, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.677102671463593, + "language_loss": 0.79111922, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81263697, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 6196, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.02235723, + "balance_loss_mlp": 1.04315817, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.8393036550873767, + "language_loss": 0.8332746, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85483867, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6197, + "time_per_iteration": 2.392005443572998 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0202899, + "balance_loss_mlp": 1.04070568, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 2.267147370646453, + "language_loss": 0.64613056, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66764355, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 6198, + "time_per_iteration": 2.4624876976013184 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.02989507, + "balance_loss_mlp": 1.04129016, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 2.4815957641530084, + "language_loss": 0.7439245, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76552987, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6199, + "time_per_iteration": 2.454932689666748 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.04112601, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.569210214205425, + "language_loss": 0.80711329, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82861221, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 6200, + "time_per_iteration": 2.853854179382324 + }, + { + "auxiliary_loss_clip": 0.01118801, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.02588272, + "balance_loss_mlp": 1.04248428, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 2.046105641958108, + "language_loss": 0.60723466, + "learning_rate": 2.88868657651991e-06, + "loss": 0.6288271, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6201, + "time_per_iteration": 2.58642315864563 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01813745, + "balance_loss_mlp": 1.04334736, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.5967185311646992, + "language_loss": 0.72980845, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75135767, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6202, + "time_per_iteration": 2.461116075515747 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.04372942, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.8761852736669793, + "language_loss": 0.739654, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76120287, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6203, + "time_per_iteration": 2.4199976921081543 + }, + { + "auxiliary_loss_clip": 0.01113815, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.03933048, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6894031212763305, + "language_loss": 0.81359541, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83506644, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 6204, + "time_per_iteration": 2.527442693710327 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.02538753, + "balance_loss_mlp": 1.04287875, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.5818895271767701, + "language_loss": 0.75028086, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77190769, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6205, + "time_per_iteration": 2.515028953552246 + }, + { + "auxiliary_loss_clip": 0.01118084, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02002704, + "balance_loss_mlp": 1.04183412, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8699710225203796, + "language_loss": 0.78044879, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80197906, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.76171875, + "step": 6206, + "time_per_iteration": 2.433136224746704 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.04182768, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.1358392378140487, + "language_loss": 0.93595111, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95747221, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6207, + "time_per_iteration": 2.422592878341675 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01774943, + "balance_loss_mlp": 1.04154027, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.238385364236049, + "language_loss": 0.82666922, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84819084, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6208, + "time_per_iteration": 2.5171287059783936 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01911068, + "balance_loss_mlp": 1.04320371, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 1.7601988102738153, + "language_loss": 0.73197794, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75355148, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6209, + "time_per_iteration": 2.480943202972412 + }, + { + "auxiliary_loss_clip": 0.01120081, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02178252, + "balance_loss_mlp": 1.0430553, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.4781766070975684, + "language_loss": 0.69951272, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72108591, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6210, + "time_per_iteration": 2.5063016414642334 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01533842, + "balance_loss_mlp": 1.04171228, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.960293983782413, + "language_loss": 0.77729124, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79881245, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6211, + "time_per_iteration": 2.4845266342163086 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.0240593, + "balance_loss_mlp": 1.04219186, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.9911666037414828, + "language_loss": 0.73026669, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75187218, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6212, + "time_per_iteration": 2.615323066711426 + }, + { + "auxiliary_loss_clip": 0.01130473, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.03231955, + "balance_loss_mlp": 1.04560018, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 4.00760557025762, + "language_loss": 0.81895888, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84074175, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84765625, + "step": 6213, + "time_per_iteration": 2.4621500968933105 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.04143643, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.2631910468903014, + "language_loss": 0.7890203, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81060612, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6214, + "time_per_iteration": 2.5582997798919678 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.02859902, + "balance_loss_mlp": 1.04069364, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.7789401165216012, + "language_loss": 0.84881294, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87041962, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6215, + "time_per_iteration": 2.6216535568237305 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.02595592, + "balance_loss_mlp": 1.04088581, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 5.614431195109344, + "language_loss": 0.67669535, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69832802, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80859375, + "step": 6216, + "time_per_iteration": 2.4592814445495605 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.02500176, + "balance_loss_mlp": 1.04252148, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.041107256757408, + "language_loss": 0.65695626, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67857617, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6217, + "time_per_iteration": 2.50801420211792 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.02378845, + "balance_loss_mlp": 1.04290843, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 3.2488334570714725, + "language_loss": 0.80776107, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82938731, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80078125, + "step": 6218, + "time_per_iteration": 2.469524383544922 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.01851249, + "balance_loss_mlp": 1.04241216, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.3682227753048604, + "language_loss": 0.78710622, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80860579, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.74609375, + "step": 6219, + "time_per_iteration": 2.595862627029419 + }, + { + "auxiliary_loss_clip": 0.01119648, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.02776265, + "balance_loss_mlp": 1.0430454, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 2.1916352692915217, + "language_loss": 0.76985866, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79148126, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6220, + "time_per_iteration": 6.68864631652832 + }, + { + "auxiliary_loss_clip": 0.01120187, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02635062, + "balance_loss_mlp": 1.04149485, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.921342744454882, + "language_loss": 0.82958305, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85120487, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6221, + "time_per_iteration": 3.9474618434906006 + }, + { + "auxiliary_loss_clip": 0.0111979, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.02852452, + "balance_loss_mlp": 1.04195023, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6461952088047174, + "language_loss": 0.75817096, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.7797966, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6222, + "time_per_iteration": 2.43192720413208 + }, + { + "auxiliary_loss_clip": 0.01121141, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.0191592, + "balance_loss_mlp": 1.04333961, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.6728060456550218, + "language_loss": 0.70215583, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72370636, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.77734375, + "step": 6223, + "time_per_iteration": 2.4719529151916504 + }, + { + "auxiliary_loss_clip": 0.01123096, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.04556298, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.209456781749309, + "language_loss": 0.69100869, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71258163, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6224, + "time_per_iteration": 2.6382336616516113 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.01942348, + "balance_loss_mlp": 1.04488885, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.8205395187863704, + "language_loss": 0.69828689, + "learning_rate": 2.880303258086228e-06, + "loss": 0.71983123, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6225, + "time_per_iteration": 2.501041889190674 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.02376127, + "balance_loss_mlp": 1.04357982, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.305559014636685, + "language_loss": 0.79056358, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81214118, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 6226, + "time_per_iteration": 2.485196113586426 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.02517128, + "balance_loss_mlp": 1.04342556, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.1155280603994546, + "language_loss": 0.68059194, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70221007, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6227, + "time_per_iteration": 2.553396463394165 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02044404, + "balance_loss_mlp": 1.04391932, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.719573737271176, + "language_loss": 0.82955533, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85109973, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6228, + "time_per_iteration": 2.449979305267334 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.0244565, + "balance_loss_mlp": 1.0452075, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.610770216359874, + "language_loss": 0.74802738, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76962447, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6229, + "time_per_iteration": 2.4768621921539307 + }, + { + "auxiliary_loss_clip": 0.01121137, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.02384853, + "balance_loss_mlp": 1.04209936, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.8233250091751425, + "language_loss": 0.83350682, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85510933, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6230, + "time_per_iteration": 2.4503889083862305 + }, + { + "auxiliary_loss_clip": 0.01125186, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.02383518, + "balance_loss_mlp": 1.04665947, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8327028169227884, + "language_loss": 0.73589134, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75753438, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6231, + "time_per_iteration": 2.5793888568878174 + }, + { + "auxiliary_loss_clip": 0.01126351, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.02754807, + "balance_loss_mlp": 1.04669595, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.0748427868287536, + "language_loss": 0.72982037, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75151008, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6232, + "time_per_iteration": 2.5400028228759766 + }, + { + "auxiliary_loss_clip": 0.01120736, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.01927304, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.7557793199484253, + "language_loss": 0.77042818, + "learning_rate": 2.877504536769561e-06, + "loss": 0.791982, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6233, + "time_per_iteration": 2.6110641956329346 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.02521205, + "balance_loss_mlp": 1.04520559, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.733253645903673, + "language_loss": 0.68936831, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71100628, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6234, + "time_per_iteration": 2.4476797580718994 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.02685833, + "balance_loss_mlp": 1.04514599, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.8436539021155727, + "language_loss": 0.82329285, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84491062, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 6235, + "time_per_iteration": 2.4766016006469727 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.01823175, + "balance_loss_mlp": 1.04744995, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8082481713782126, + "language_loss": 0.77776909, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79937214, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6236, + "time_per_iteration": 2.440678596496582 + }, + { + "auxiliary_loss_clip": 0.01124108, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.03139293, + "balance_loss_mlp": 1.04308259, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0063576687211704, + "language_loss": 0.73203218, + "learning_rate": 2.876104377085234e-06, + "loss": 0.7537601, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.80859375, + "step": 6237, + "time_per_iteration": 2.5782086849212646 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02257562, + "balance_loss_mlp": 1.04084682, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.2861902523152935, + "language_loss": 0.93017888, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.9517675, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6238, + "time_per_iteration": 2.514997720718384 + }, + { + "auxiliary_loss_clip": 0.01121834, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.01821709, + "balance_loss_mlp": 1.04316592, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9811721217026943, + "language_loss": 0.71066076, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73221493, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6239, + "time_per_iteration": 2.5054962635040283 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01659262, + "balance_loss_mlp": 1.04635918, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.6550300124553972, + "language_loss": 0.6566934, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67827761, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6240, + "time_per_iteration": 2.5776519775390625 + }, + { + "auxiliary_loss_clip": 0.01124905, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01703799, + "balance_loss_mlp": 1.04560649, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.0148493018475877, + "language_loss": 0.75634778, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77791047, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 6241, + "time_per_iteration": 2.503861904144287 + }, + { + "auxiliary_loss_clip": 0.01123464, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.04321361, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.5579725641576876, + "language_loss": 0.83610159, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85773861, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.80078125, + "step": 6242, + "time_per_iteration": 2.4933042526245117 + }, + { + "auxiliary_loss_clip": 0.01122935, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.02435803, + "balance_loss_mlp": 1.04265308, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.190530656574709, + "language_loss": 0.67888391, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70049673, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6243, + "time_per_iteration": 2.543820381164551 + }, + { + "auxiliary_loss_clip": 0.01121963, + "auxiliary_loss_mlp": 0.01038078, + "balance_loss_clip": 1.02241397, + "balance_loss_mlp": 1.04404676, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7974063962239055, + "language_loss": 0.84275806, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.86435848, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6244, + "time_per_iteration": 2.4710450172424316 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02229667, + "balance_loss_mlp": 1.0436101, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.387588700969948, + "language_loss": 0.83019805, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85175467, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6245, + "time_per_iteration": 2.4594197273254395 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.02207565, + "balance_loss_mlp": 1.04337263, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 1.94802763897559, + "language_loss": 0.64043313, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66203153, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6246, + "time_per_iteration": 2.4522809982299805 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0231837, + "balance_loss_mlp": 1.04382014, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7195896287931138, + "language_loss": 0.75146973, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77310807, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6247, + "time_per_iteration": 2.4527103900909424 + }, + { + "auxiliary_loss_clip": 0.01122539, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02281308, + "balance_loss_mlp": 1.04276609, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.472354315090956, + "language_loss": 0.55157161, + "learning_rate": 2.872251199697598e-06, + "loss": 0.5731746, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6248, + "time_per_iteration": 2.4399521350860596 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.02334976, + "balance_loss_mlp": 1.04241502, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.875026035710993, + "language_loss": 0.84247208, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86404997, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6249, + "time_per_iteration": 2.529763698577881 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.018713, + "balance_loss_mlp": 1.0427655, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.7253468577749267, + "language_loss": 0.68124413, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70278323, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6250, + "time_per_iteration": 2.572439193725586 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.02791047, + "balance_loss_mlp": 1.04538727, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.0419035804756716, + "language_loss": 0.77633286, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79799771, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6251, + "time_per_iteration": 2.58437442779541 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.01910138, + "balance_loss_mlp": 1.04232824, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.137051103462404, + "language_loss": 0.58463252, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60616934, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6252, + "time_per_iteration": 2.6117262840270996 + }, + { + "auxiliary_loss_clip": 0.01124494, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.02730918, + "balance_loss_mlp": 1.04393482, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.9959533965383836, + "language_loss": 0.89689183, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91856694, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 6253, + "time_per_iteration": 2.5241925716400146 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.0227623, + "balance_loss_mlp": 1.04618073, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9568868773694639, + "language_loss": 0.76368916, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78528988, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6254, + "time_per_iteration": 2.44631028175354 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.02906847, + "balance_loss_mlp": 1.04640615, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.019237604940679, + "language_loss": 0.61830014, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6400153, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6255, + "time_per_iteration": 2.474303960800171 + }, + { + "auxiliary_loss_clip": 0.01125813, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.02204537, + "balance_loss_mlp": 1.0434109, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.4357923747979675, + "language_loss": 0.74234015, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76397753, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.82421875, + "step": 6256, + "time_per_iteration": 2.4332830905914307 + }, + { + "auxiliary_loss_clip": 0.01129168, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.03029239, + "balance_loss_mlp": 1.04842019, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.807318668329893, + "language_loss": 0.70297635, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72472662, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80859375, + "step": 6257, + "time_per_iteration": 2.600249767303467 + }, + { + "auxiliary_loss_clip": 0.01123849, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01998889, + "balance_loss_mlp": 1.04582894, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.8628634379537026, + "language_loss": 0.84647095, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86805254, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6258, + "time_per_iteration": 2.443833351135254 + }, + { + "auxiliary_loss_clip": 0.01122949, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.02936888, + "balance_loss_mlp": 1.04430962, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.514941849696829, + "language_loss": 0.81009686, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83176237, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6259, + "time_per_iteration": 2.5727832317352295 + }, + { + "auxiliary_loss_clip": 0.01130377, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02660608, + "balance_loss_mlp": 1.04775453, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.8915772167347047, + "language_loss": 0.71919596, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.74092221, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 6260, + "time_per_iteration": 2.5225539207458496 + }, + { + "auxiliary_loss_clip": 0.0112693, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.02061951, + "balance_loss_mlp": 1.04538989, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.725193491542272, + "language_loss": 0.78423822, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80586827, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 6261, + "time_per_iteration": 2.4926671981811523 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01043226, + "balance_loss_clip": 1.02784848, + "balance_loss_mlp": 1.04861188, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.7544905551461754, + "language_loss": 0.80327791, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82503211, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 6262, + "time_per_iteration": 6.861605167388916 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.01796031, + "balance_loss_mlp": 1.04471791, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.7128267856657793, + "language_loss": 0.80543715, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82698023, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6263, + "time_per_iteration": 2.6574654579162598 + }, + { + "auxiliary_loss_clip": 0.01128017, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.02705324, + "balance_loss_mlp": 1.04757583, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.7435231382382033, + "language_loss": 0.80158919, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82328904, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6264, + "time_per_iteration": 2.4326720237731934 + }, + { + "auxiliary_loss_clip": 0.01122852, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03120613, + "balance_loss_mlp": 1.04323912, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.2579254623504585, + "language_loss": 0.73604524, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75773823, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6265, + "time_per_iteration": 2.481248617172241 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.02525079, + "balance_loss_mlp": 1.04878664, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.6798839148056366, + "language_loss": 0.68685853, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70850861, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6266, + "time_per_iteration": 2.517972946166992 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.02746832, + "balance_loss_mlp": 1.04570127, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 21.71943634627446, + "language_loss": 0.6330213, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65474188, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 6267, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.01049589, + "auxiliary_loss_mlp": 0.01002617, + "balance_loss_clip": 1.00076914, + "balance_loss_mlp": 1.02342653, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7181832227527338, + "language_loss": 0.58946306, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60998511, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.26171875, + "step": 6268, + "time_per_iteration": 3.168419361114502 + }, + { + "auxiliary_loss_clip": 0.011283, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.02268982, + "balance_loss_mlp": 1.04734302, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4797604992869704, + "language_loss": 0.65026355, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67193449, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8125, + "step": 6269, + "time_per_iteration": 2.5472333431243896 + }, + { + "auxiliary_loss_clip": 0.01127949, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.02263296, + "balance_loss_mlp": 1.05022144, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.46875421159053, + "language_loss": 0.70592397, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72758961, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6270, + "time_per_iteration": 2.4763948917388916 + }, + { + "auxiliary_loss_clip": 0.01045864, + "auxiliary_loss_mlp": 0.0100198, + "balance_loss_clip": 1.00021577, + "balance_loss_mlp": 1.02014744, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7024360778923162, + "language_loss": 0.56136239, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58184087, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 6271, + "time_per_iteration": 3.0738816261291504 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.02326441, + "balance_loss_mlp": 1.04638743, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.066611127756055, + "language_loss": 0.79340166, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.81503969, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.78125, + "step": 6272, + "time_per_iteration": 2.4686055183410645 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01802933, + "balance_loss_mlp": 1.04578209, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.4641670728096365, + "language_loss": 0.74172843, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76326972, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6273, + "time_per_iteration": 2.5079009532928467 + }, + { + "auxiliary_loss_clip": 0.01124789, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.03112721, + "balance_loss_mlp": 1.04621577, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4163029825487425, + "language_loss": 0.71801323, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73972082, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6274, + "time_per_iteration": 2.460338592529297 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.02277732, + "balance_loss_mlp": 1.04794264, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.663376044288712, + "language_loss": 0.83692443, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.85857534, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6275, + "time_per_iteration": 2.48319149017334 + }, + { + "auxiliary_loss_clip": 0.01121629, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.01949656, + "balance_loss_mlp": 1.04532933, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4340123311349162, + "language_loss": 0.75342453, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77496612, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6276, + "time_per_iteration": 2.5773236751556396 + }, + { + "auxiliary_loss_clip": 0.01127758, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.02724338, + "balance_loss_mlp": 1.04667568, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.858122502551201, + "language_loss": 0.85519129, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87689614, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6277, + "time_per_iteration": 2.5827369689941406 + }, + { + "auxiliary_loss_clip": 0.01123645, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.01833546, + "balance_loss_mlp": 1.04713118, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.807350675061797, + "language_loss": 0.78055024, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80210936, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6278, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01128448, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_clip": 1.02795196, + "balance_loss_mlp": 1.04698181, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.933979010172509, + "language_loss": 0.82702643, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.84875309, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6279, + "time_per_iteration": 2.538426160812378 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.02310467, + "balance_loss_mlp": 1.04578614, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.0225623598483358, + "language_loss": 0.74985826, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77148765, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 6280, + "time_per_iteration": 2.5161032676696777 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02283478, + "balance_loss_mlp": 1.04662085, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4438938373085308, + "language_loss": 0.76017272, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78177071, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6281, + "time_per_iteration": 2.504711151123047 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.02049732, + "balance_loss_mlp": 1.04368496, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.7476205657776698, + "language_loss": 0.8391279, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86070192, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6282, + "time_per_iteration": 2.4668593406677246 + }, + { + "auxiliary_loss_clip": 0.01120742, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.02337587, + "balance_loss_mlp": 1.04434681, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.8037618077250128, + "language_loss": 0.70150751, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72309422, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6283, + "time_per_iteration": 2.481948137283325 + }, + { + "auxiliary_loss_clip": 0.0112321, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.04516089, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.804590454145544, + "language_loss": 0.76529062, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78697532, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6284, + "time_per_iteration": 2.462968349456787 + }, + { + "auxiliary_loss_clip": 0.01130082, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.0466392, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.0529722445272167, + "language_loss": 0.85851312, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88015962, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 6285, + "time_per_iteration": 2.4435150623321533 + }, + { + "auxiliary_loss_clip": 0.01125611, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.0240438, + "balance_loss_mlp": 1.04457164, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.9682053367320125, + "language_loss": 0.83967972, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86133921, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6286, + "time_per_iteration": 2.4270951747894287 + }, + { + "auxiliary_loss_clip": 0.01123272, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.0268203, + "balance_loss_mlp": 1.04474115, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.20319687907872, + "language_loss": 0.81550682, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83715904, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6287, + "time_per_iteration": 2.4504740238189697 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.02559495, + "balance_loss_mlp": 1.04340911, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.428511311582982, + "language_loss": 0.73038173, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75200516, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6288, + "time_per_iteration": 2.4988601207733154 + }, + { + "auxiliary_loss_clip": 0.01126071, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.02103162, + "balance_loss_mlp": 1.04705048, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.726028925404572, + "language_loss": 0.75453335, + "learning_rate": 2.857854239668352e-06, + "loss": 0.7761566, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6289, + "time_per_iteration": 2.5323870182037354 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.04395676, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.9121243331279245, + "language_loss": 0.7341041, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75570655, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6290, + "time_per_iteration": 2.4703667163848877 + }, + { + "auxiliary_loss_clip": 0.01126076, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02388752, + "balance_loss_mlp": 1.0441103, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.4130424762969502, + "language_loss": 0.79729307, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81895649, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8203125, + "step": 6291, + "time_per_iteration": 2.590517520904541 + }, + { + "auxiliary_loss_clip": 0.01124797, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.02038157, + "balance_loss_mlp": 1.04347014, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7851511943573266, + "language_loss": 0.76090503, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78251249, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8125, + "step": 6292, + "time_per_iteration": 2.486375570297241 + }, + { + "auxiliary_loss_clip": 0.0112214, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_clip": 1.02708387, + "balance_loss_mlp": 1.04380596, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.8744506208430416, + "language_loss": 0.69510674, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71675801, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6293, + "time_per_iteration": 2.477025032043457 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.02140629, + "balance_loss_mlp": 1.04180205, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.812028848861632, + "language_loss": 0.71631789, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73788714, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6294, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.01128463, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.02050054, + "balance_loss_mlp": 1.04522586, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0852903309957815, + "language_loss": 0.8254326, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84707516, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 6295, + "time_per_iteration": 2.4684417247772217 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.01870751, + "balance_loss_mlp": 1.04352689, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.687128097470698, + "language_loss": 0.71806532, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73963046, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6296, + "time_per_iteration": 2.515676975250244 + }, + { + "auxiliary_loss_clip": 0.01119269, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02523708, + "balance_loss_mlp": 1.04370534, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.741193546240543, + "language_loss": 0.77094543, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79253769, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6297, + "time_per_iteration": 2.4617502689361572 + }, + { + "auxiliary_loss_clip": 0.01123428, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.02042699, + "balance_loss_mlp": 1.04360187, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.034703790395703, + "language_loss": 0.79179847, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81338429, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6298, + "time_per_iteration": 2.4516994953155518 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02001977, + "balance_loss_mlp": 1.04453242, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 2.0947541210526466, + "language_loss": 0.84758198, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86914611, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6299, + "time_per_iteration": 2.4814558029174805 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02153504, + "balance_loss_mlp": 1.04462421, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.218392777517032, + "language_loss": 0.7657811, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78737932, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 6300, + "time_per_iteration": 2.4615044593811035 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.02135265, + "balance_loss_mlp": 1.04486537, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.28104869272164, + "language_loss": 0.82490808, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84657955, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.828125, + "step": 6301, + "time_per_iteration": 2.4864752292633057 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.02967012, + "balance_loss_mlp": 1.04097867, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8461206090891127, + "language_loss": 0.67669666, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69833434, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6302, + "time_per_iteration": 2.501873016357422 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02617788, + "balance_loss_mlp": 1.04561174, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9271400579859064, + "language_loss": 0.68487787, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.7064997, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6303, + "time_per_iteration": 4.003960371017456 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02055335, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.8915662489351535, + "language_loss": 0.77611423, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79765135, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6304, + "time_per_iteration": 5.393261432647705 + }, + { + "auxiliary_loss_clip": 0.01127431, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.04611588, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.1278904960845724, + "language_loss": 0.80447114, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82612252, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6305, + "time_per_iteration": 2.471761703491211 + }, + { + "auxiliary_loss_clip": 0.01041012, + "auxiliary_loss_mlp": 0.0101182, + "balance_loss_clip": 1.01011562, + "balance_loss_mlp": 1.01491702, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9794242329238577, + "language_loss": 0.64524716, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66577548, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.26171875, + "step": 6306, + "time_per_iteration": 2.9702882766723633 + }, + { + "auxiliary_loss_clip": 0.01126961, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.03371215, + "balance_loss_mlp": 1.04693508, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.6253037153644523, + "language_loss": 0.73722827, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75898677, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6307, + "time_per_iteration": 2.508127450942993 + }, + { + "auxiliary_loss_clip": 0.01124488, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.02550268, + "balance_loss_mlp": 1.04390907, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.494726737463818, + "language_loss": 0.78469551, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80634576, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6308, + "time_per_iteration": 2.453012466430664 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.04146767, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.8302348181917263, + "language_loss": 0.73083341, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75244319, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6309, + "time_per_iteration": 2.495020866394043 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.02509165, + "balance_loss_mlp": 1.04503894, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4661467923449947, + "language_loss": 0.78449893, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80611867, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6310, + "time_per_iteration": 2.466533899307251 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.02562881, + "balance_loss_mlp": 1.04319441, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.894743489836823, + "language_loss": 0.76103079, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7826463, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6311, + "time_per_iteration": 2.4859142303466797 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.01906657, + "balance_loss_mlp": 1.04379332, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4829862533126659, + "language_loss": 0.71025705, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73180288, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6312, + "time_per_iteration": 2.4632480144500732 + }, + { + "auxiliary_loss_clip": 0.01041554, + "auxiliary_loss_mlp": 0.01005886, + "balance_loss_clip": 1.00425243, + "balance_loss_mlp": 1.01538157, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7762054489660294, + "language_loss": 0.56084001, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58131444, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 6313, + "time_per_iteration": 3.0646302700042725 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.02876949, + "balance_loss_mlp": 1.04362202, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 4.480184070608776, + "language_loss": 0.7158128, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73747301, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6314, + "time_per_iteration": 2.5263309478759766 + }, + { + "auxiliary_loss_clip": 0.01126357, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02331841, + "balance_loss_mlp": 1.04427075, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.7655759267809688, + "language_loss": 0.73132306, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75297308, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6315, + "time_per_iteration": 2.4753336906433105 + }, + { + "auxiliary_loss_clip": 0.0111862, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.02454782, + "balance_loss_mlp": 1.04206967, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0286726324195477, + "language_loss": 0.71049547, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73207021, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6316, + "time_per_iteration": 2.636176824569702 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.04524136, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8086467732489355, + "language_loss": 0.65270519, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67431247, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6317, + "time_per_iteration": 2.595952033996582 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.02174878, + "balance_loss_mlp": 1.04161143, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0501625369641867, + "language_loss": 0.85361171, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87515211, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6318, + "time_per_iteration": 2.4805264472961426 + }, + { + "auxiliary_loss_clip": 0.01124758, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04483223, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.489676718863087, + "language_loss": 0.76274204, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.784392, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6319, + "time_per_iteration": 2.4780025482177734 + }, + { + "auxiliary_loss_clip": 0.01123743, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02961218, + "balance_loss_mlp": 1.04587555, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6998661229427972, + "language_loss": 0.63923568, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66091597, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6320, + "time_per_iteration": 2.4700872898101807 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02375042, + "balance_loss_mlp": 1.04365289, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.883216130529445, + "language_loss": 0.7112022, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73279351, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6321, + "time_per_iteration": 2.5686967372894287 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02372098, + "balance_loss_mlp": 1.04298186, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.720302384597662, + "language_loss": 0.74730933, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76892447, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6322, + "time_per_iteration": 2.5368685722351074 + }, + { + "auxiliary_loss_clip": 0.01121658, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.02155948, + "balance_loss_mlp": 1.04405749, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.6715016816856787, + "language_loss": 0.84910119, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87068772, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 6323, + "time_per_iteration": 2.483771562576294 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.04395103, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.3955157937634586, + "language_loss": 0.73466647, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75625694, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.79296875, + "step": 6324, + "time_per_iteration": 2.4709885120391846 + }, + { + "auxiliary_loss_clip": 0.0112564, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02020979, + "balance_loss_mlp": 1.045573, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.6580896914625747, + "language_loss": 0.84147018, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86308414, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6325, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.01122273, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01833439, + "balance_loss_mlp": 1.04476464, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.7291759572194114, + "language_loss": 0.79642469, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81796801, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6326, + "time_per_iteration": 2.4206631183624268 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.04261708, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.8040593924859922, + "language_loss": 0.72696453, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74854851, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6327, + "time_per_iteration": 2.5964794158935547 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.02165246, + "balance_loss_mlp": 1.04614949, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6287717027141382, + "language_loss": 0.83090091, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85249579, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6328, + "time_per_iteration": 2.4602181911468506 + }, + { + "auxiliary_loss_clip": 0.01120102, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.01746464, + "balance_loss_mlp": 1.04347932, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.31755328246291, + "language_loss": 0.61384171, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63536435, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6329, + "time_per_iteration": 2.5268959999084473 + }, + { + "auxiliary_loss_clip": 0.01124125, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04603863, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.7232754549878644, + "language_loss": 0.5586049, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58026338, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6330, + "time_per_iteration": 2.450221061706543 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.02255917, + "balance_loss_mlp": 1.04540074, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.7778053530951745, + "language_loss": 0.65694439, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67849582, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6331, + "time_per_iteration": 2.544187545776367 + }, + { + "auxiliary_loss_clip": 0.01126283, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.02635133, + "balance_loss_mlp": 1.04744291, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.725296368277029, + "language_loss": 0.75737906, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77905744, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6332, + "time_per_iteration": 2.443654775619507 + }, + { + "auxiliary_loss_clip": 0.01123212, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.02546334, + "balance_loss_mlp": 1.04323936, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.2212054448627425, + "language_loss": 0.81889552, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84053469, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6333, + "time_per_iteration": 2.467007637023926 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.0190227, + "balance_loss_mlp": 1.04437923, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.583221243495577, + "language_loss": 0.86192155, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88346696, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6334, + "time_per_iteration": 2.521341323852539 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.02130485, + "balance_loss_mlp": 1.04498506, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.2115670432842847, + "language_loss": 0.79179001, + "learning_rate": 2.841636505323321e-06, + "loss": 0.8133806, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6335, + "time_per_iteration": 2.4648449420928955 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02027273, + "balance_loss_mlp": 1.04485524, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.872233235491229, + "language_loss": 0.72775364, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74935251, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6336, + "time_per_iteration": 2.443255662918091 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.0180763, + "balance_loss_mlp": 1.0430727, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.9910419737037044, + "language_loss": 0.69146657, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71297657, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6337, + "time_per_iteration": 2.4838876724243164 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02016187, + "balance_loss_mlp": 1.04606009, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9033617326941272, + "language_loss": 0.63247615, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65407151, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6338, + "time_per_iteration": 2.5538294315338135 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.02492189, + "balance_loss_mlp": 1.04498446, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8718033662194862, + "language_loss": 0.69288802, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71452975, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7890625, + "step": 6339, + "time_per_iteration": 2.490813970565796 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.0256902, + "balance_loss_mlp": 1.0461787, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.5980221539464914, + "language_loss": 0.68312418, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70477575, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6340, + "time_per_iteration": 2.4576282501220703 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02026618, + "balance_loss_mlp": 1.04393721, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.141170258916756, + "language_loss": 0.89404309, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91565144, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80859375, + "step": 6341, + "time_per_iteration": 2.4688920974731445 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.02477455, + "balance_loss_mlp": 1.04559851, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.5516456894508346, + "language_loss": 0.74665564, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76832652, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6342, + "time_per_iteration": 2.4610931873321533 + }, + { + "auxiliary_loss_clip": 0.0112203, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.04325199, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.6121504127073445, + "language_loss": 0.83334327, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85490513, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6343, + "time_per_iteration": 2.490952730178833 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.04305577, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.5912858717665679, + "language_loss": 0.76965082, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79125255, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6344, + "time_per_iteration": 2.458669424057007 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.02413464, + "balance_loss_mlp": 1.04601693, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.369132092535199, + "language_loss": 0.72790027, + "learning_rate": 2.838101929752593e-06, + "loss": 0.7495544, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6345, + "time_per_iteration": 5.361874341964722 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.02172494, + "balance_loss_mlp": 1.04348969, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.723509048793367, + "language_loss": 0.69687438, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71844268, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6346, + "time_per_iteration": 3.8780832290649414 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02351856, + "balance_loss_mlp": 1.04639161, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8691929226070287, + "language_loss": 0.75860906, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78024441, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6347, + "time_per_iteration": 2.4724838733673096 + }, + { + "auxiliary_loss_clip": 0.01121549, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.04272556, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.5494744961647557, + "language_loss": 0.74775678, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76933861, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6348, + "time_per_iteration": 2.4360201358795166 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.04346061, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.012782025185047, + "language_loss": 0.86987114, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89142847, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6349, + "time_per_iteration": 2.4653983116149902 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.02576792, + "balance_loss_mlp": 1.04300261, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 3.1419886249283624, + "language_loss": 0.76335979, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78497744, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6350, + "time_per_iteration": 2.4111151695251465 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01693547, + "balance_loss_mlp": 1.04389453, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.0441694615934325, + "language_loss": 0.76182568, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78337657, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.78125, + "step": 6351, + "time_per_iteration": 2.449831485748291 + }, + { + "auxiliary_loss_clip": 0.0112512, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.0235939, + "balance_loss_mlp": 1.04464602, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6974231581634962, + "language_loss": 0.74360836, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76525676, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6352, + "time_per_iteration": 2.5342295169830322 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02103508, + "balance_loss_mlp": 1.04153097, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.834359776939538, + "language_loss": 0.64362574, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66514015, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6353, + "time_per_iteration": 2.434100866317749 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02020061, + "balance_loss_mlp": 1.04363215, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.6268216674771125, + "language_loss": 0.83035302, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85189331, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6354, + "time_per_iteration": 2.4903476238250732 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02154744, + "balance_loss_mlp": 1.04571426, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.7360324347242302, + "language_loss": 0.8071996, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82876635, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6355, + "time_per_iteration": 2.5086817741394043 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.04464841, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.7080815693685156, + "language_loss": 0.75032043, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77187097, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6356, + "time_per_iteration": 2.471919298171997 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.02251887, + "balance_loss_mlp": 1.04420352, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8091380313160346, + "language_loss": 0.81251574, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83409309, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6357, + "time_per_iteration": 2.5302257537841797 + }, + { + "auxiliary_loss_clip": 0.01127375, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02465415, + "balance_loss_mlp": 1.04773057, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 3.08273691075534, + "language_loss": 0.77903318, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80071545, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.796875, + "step": 6358, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02576303, + "balance_loss_mlp": 1.0432725, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.070211767582473, + "language_loss": 0.78700459, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80863374, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6359, + "time_per_iteration": 2.4555094242095947 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02698255, + "balance_loss_mlp": 1.04290545, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.6399902686671113, + "language_loss": 0.69392359, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.7155236, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6360, + "time_per_iteration": 2.736069440841675 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.01632452, + "balance_loss_mlp": 1.04197633, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.9168722583294633, + "language_loss": 0.78836095, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80986238, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6361, + "time_per_iteration": 2.511254072189331 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.02274048, + "balance_loss_mlp": 1.04114652, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4566170801765106, + "language_loss": 0.65315771, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67468172, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6362, + "time_per_iteration": 2.632784128189087 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.04175615, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.8527291741217293, + "language_loss": 0.82063204, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84214544, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 6363, + "time_per_iteration": 2.4478373527526855 + }, + { + "auxiliary_loss_clip": 0.01119064, + "auxiliary_loss_mlp": 0.01042512, + "balance_loss_clip": 1.02837944, + "balance_loss_mlp": 1.0446111, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.811422380776527, + "language_loss": 0.58428323, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60589898, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6364, + "time_per_iteration": 2.655128002166748 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.02496374, + "balance_loss_mlp": 1.04423463, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.1451175401130893, + "language_loss": 0.68881112, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71043533, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6365, + "time_per_iteration": 2.51526141166687 + }, + { + "auxiliary_loss_clip": 0.01121408, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02065301, + "balance_loss_mlp": 1.04057527, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 4.555943608034253, + "language_loss": 0.73442698, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75600111, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8046875, + "step": 6366, + "time_per_iteration": 2.448585033416748 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02026677, + "balance_loss_mlp": 1.04226327, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.0234001922769327, + "language_loss": 0.68829554, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70985115, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6367, + "time_per_iteration": 2.569301128387451 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.04202485, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 4.344593393004367, + "language_loss": 0.6481666, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66967463, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 6368, + "time_per_iteration": 2.4531960487365723 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02053833, + "balance_loss_mlp": 1.04277039, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.315785818077373, + "language_loss": 0.68389189, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70544434, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6369, + "time_per_iteration": 2.5403318405151367 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.02774167, + "balance_loss_mlp": 1.04172897, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7184057003296296, + "language_loss": 0.78214431, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80374157, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 6370, + "time_per_iteration": 2.4397096633911133 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02569818, + "balance_loss_mlp": 1.04368424, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.8055794910549525, + "language_loss": 0.64556968, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66716546, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6371, + "time_per_iteration": 2.5470147132873535 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.0221653, + "balance_loss_mlp": 1.04452634, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.8238449128176952, + "language_loss": 0.72682339, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7484479, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6372, + "time_per_iteration": 2.47695255279541 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.02325058, + "balance_loss_mlp": 1.04308939, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.5970403518130607, + "language_loss": 0.84758627, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.86918551, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6373, + "time_per_iteration": 2.514571189880371 + }, + { + "auxiliary_loss_clip": 0.01124014, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02784181, + "balance_loss_mlp": 1.04392529, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 4.718004058381721, + "language_loss": 0.74721354, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76888537, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6374, + "time_per_iteration": 2.5505032539367676 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.04414058, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.8586580554057472, + "language_loss": 0.75701195, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77867097, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 6375, + "time_per_iteration": 2.467555522918701 + }, + { + "auxiliary_loss_clip": 0.01122331, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02300918, + "balance_loss_mlp": 1.04375613, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.257221103761015, + "language_loss": 0.72827101, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.7498709, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6376, + "time_per_iteration": 2.4082555770874023 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.04245007, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5879949283042905, + "language_loss": 0.67586625, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69745058, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.76171875, + "step": 6377, + "time_per_iteration": 2.54896879196167 + }, + { + "auxiliary_loss_clip": 0.01124961, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.02448511, + "balance_loss_mlp": 1.04608607, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.1973025079181117, + "language_loss": 0.72991705, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75156534, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6378, + "time_per_iteration": 2.4442975521087646 + }, + { + "auxiliary_loss_clip": 0.01121801, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02755964, + "balance_loss_mlp": 1.04327178, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.6808845830991803, + "language_loss": 0.69162869, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71326876, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6379, + "time_per_iteration": 2.529088258743286 + }, + { + "auxiliary_loss_clip": 0.01121458, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02344704, + "balance_loss_mlp": 1.04552865, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.6321901167852362, + "language_loss": 0.82979369, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85139024, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6380, + "time_per_iteration": 2.4336190223693848 + }, + { + "auxiliary_loss_clip": 0.01120843, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02090406, + "balance_loss_mlp": 1.04595208, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4297951270127425, + "language_loss": 0.81347466, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83503115, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6381, + "time_per_iteration": 2.5029306411743164 + }, + { + "auxiliary_loss_clip": 0.0104681, + "auxiliary_loss_mlp": 0.01005882, + "balance_loss_clip": 1.00420141, + "balance_loss_mlp": 1.02098966, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.796129115027233, + "language_loss": 0.60459685, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.6251238, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2578125, + "step": 6382, + "time_per_iteration": 3.0525829792022705 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.02186477, + "balance_loss_mlp": 1.04358447, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.302869327575685, + "language_loss": 0.66052485, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68212986, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6383, + "time_per_iteration": 2.5166289806365967 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.01944149, + "balance_loss_mlp": 1.04657924, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 2.2385812040155932, + "language_loss": 0.74811673, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76970243, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6384, + "time_per_iteration": 2.4451465606689453 + }, + { + "auxiliary_loss_clip": 0.01120418, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02219081, + "balance_loss_mlp": 1.04429483, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4336247312181014, + "language_loss": 0.75883526, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78040409, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6385, + "time_per_iteration": 2.4994513988494873 + }, + { + "auxiliary_loss_clip": 0.01046845, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 1.0002346, + "balance_loss_mlp": 1.02044809, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9433326566144719, + "language_loss": 0.67094183, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69143105, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.265625, + "step": 6386, + "time_per_iteration": 2.938122272491455 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.01933384, + "balance_loss_mlp": 1.0465281, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7796918810721745, + "language_loss": 0.72464442, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74619704, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6387, + "time_per_iteration": 5.465053081512451 + }, + { + "auxiliary_loss_clip": 0.01120429, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.0451014, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.6321565887315352, + "language_loss": 0.81181073, + "learning_rate": 2.822867208702932e-06, + "loss": 0.8334049, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6388, + "time_per_iteration": 3.940337657928467 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.02183485, + "balance_loss_mlp": 1.04249692, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6383752800672902, + "language_loss": 0.76158738, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78311884, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6389, + "time_per_iteration": 2.4720914363861084 + }, + { + "auxiliary_loss_clip": 0.01125023, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.04541564, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5616719605863645, + "language_loss": 0.76284117, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78453434, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6390, + "time_per_iteration": 2.4576520919799805 + }, + { + "auxiliary_loss_clip": 0.01124413, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.03295112, + "balance_loss_mlp": 1.04433882, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6285452565530243, + "language_loss": 0.70119178, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72292501, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6391, + "time_per_iteration": 2.5657877922058105 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.01978421, + "balance_loss_mlp": 1.04267848, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.938766253942268, + "language_loss": 0.84100312, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86256641, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6392, + "time_per_iteration": 2.4366884231567383 + }, + { + "auxiliary_loss_clip": 0.01120524, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.0215621, + "balance_loss_mlp": 1.04348612, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.11211623143903, + "language_loss": 0.61170864, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63326931, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6393, + "time_per_iteration": 2.428238868713379 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.04589796, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 2.3555579295861775, + "language_loss": 0.71295553, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73459029, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 6394, + "time_per_iteration": 2.483506679534912 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01760483, + "balance_loss_mlp": 1.04732203, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.3366242235467047, + "language_loss": 0.81172824, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83336329, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 6395, + "time_per_iteration": 2.471301317214966 + }, + { + "auxiliary_loss_clip": 0.01126851, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.02880275, + "balance_loss_mlp": 1.04770553, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 3.9526859148826707, + "language_loss": 0.70642132, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72812212, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6396, + "time_per_iteration": 2.4121108055114746 + }, + { + "auxiliary_loss_clip": 0.01046507, + "auxiliary_loss_mlp": 0.00999241, + "balance_loss_clip": 0.99745274, + "balance_loss_mlp": 1.01972008, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8889613923167966, + "language_loss": 0.59708536, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61754286, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.26757812, + "step": 6397, + "time_per_iteration": 3.1453351974487305 + }, + { + "auxiliary_loss_clip": 0.01123309, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.0459342, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.8498202803423767, + "language_loss": 0.84868926, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87023783, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6398, + "time_per_iteration": 2.488083839416504 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.0444839, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.942979036208199, + "language_loss": 0.79634017, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81787992, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6399, + "time_per_iteration": 2.4537224769592285 + }, + { + "auxiliary_loss_clip": 0.01124087, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.02149892, + "balance_loss_mlp": 1.04439902, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8928366067789952, + "language_loss": 0.67337728, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69498605, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.796875, + "step": 6400, + "time_per_iteration": 2.434598207473755 + }, + { + "auxiliary_loss_clip": 0.0112665, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_clip": 1.030936, + "balance_loss_mlp": 1.04645705, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.6542190438860391, + "language_loss": 0.73004973, + "learning_rate": 2.81824995589303e-06, + "loss": 0.7517767, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6401, + "time_per_iteration": 2.4963061809539795 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.02329874, + "balance_loss_mlp": 1.045017, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.9430058457885813, + "language_loss": 0.71920168, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74082762, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6402, + "time_per_iteration": 2.426349639892578 + }, + { + "auxiliary_loss_clip": 0.01118079, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.02007246, + "balance_loss_mlp": 1.04232907, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.7846208976590752, + "language_loss": 0.82449806, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84602368, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6403, + "time_per_iteration": 2.4700570106506348 + }, + { + "auxiliary_loss_clip": 0.0112163, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04500651, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.8891944292176732, + "language_loss": 0.82468271, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84628773, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.765625, + "step": 6404, + "time_per_iteration": 2.481968402862549 + }, + { + "auxiliary_loss_clip": 0.01122268, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.02136576, + "balance_loss_mlp": 1.04299283, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6334992055527433, + "language_loss": 0.69588619, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71746749, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6405, + "time_per_iteration": 2.5947635173797607 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.02475476, + "balance_loss_mlp": 1.04411674, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.9268009005119906, + "language_loss": 0.79068285, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81226277, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6406, + "time_per_iteration": 2.4195396900177 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02247298, + "balance_loss_mlp": 1.04682863, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.277779532957622, + "language_loss": 0.8438794, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86551487, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 6407, + "time_per_iteration": 2.4518916606903076 + }, + { + "auxiliary_loss_clip": 0.01043854, + "auxiliary_loss_mlp": 0.01007721, + "balance_loss_clip": 1.00623727, + "balance_loss_mlp": 1.01778841, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8214817017046727, + "language_loss": 0.64868087, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66919661, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.25976562, + "step": 6408, + "time_per_iteration": 3.090940475463867 + }, + { + "auxiliary_loss_clip": 0.01123062, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.02676785, + "balance_loss_mlp": 1.04405272, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.5501960898767924, + "language_loss": 0.73628408, + "learning_rate": 2.8154059613008e-06, + "loss": 0.7579453, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6409, + "time_per_iteration": 2.4831972122192383 + }, + { + "auxiliary_loss_clip": 0.01129844, + "auxiliary_loss_mlp": 0.01049195, + "balance_loss_clip": 1.03255367, + "balance_loss_mlp": 1.04574656, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.0394333066705874, + "language_loss": 0.70208335, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72387373, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 6410, + "time_per_iteration": 2.430617332458496 + }, + { + "auxiliary_loss_clip": 0.01043682, + "auxiliary_loss_mlp": 0.01003736, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.01802111, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6794214350275563, + "language_loss": 0.60311568, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62358987, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.2578125, + "step": 6411, + "time_per_iteration": 3.1681244373321533 + }, + { + "auxiliary_loss_clip": 0.01118542, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.04146707, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9543275921913768, + "language_loss": 0.7770192, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79849613, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6412, + "time_per_iteration": 2.4670822620391846 + }, + { + "auxiliary_loss_clip": 0.01124348, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.02093506, + "balance_loss_mlp": 1.0437274, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7609162802618283, + "language_loss": 0.78148544, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80310041, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6413, + "time_per_iteration": 2.4506192207336426 + }, + { + "auxiliary_loss_clip": 0.01040458, + "auxiliary_loss_mlp": 0.01006495, + "balance_loss_clip": 1.00485027, + "balance_loss_mlp": 1.01477003, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8068957555662655, + "language_loss": 0.61344963, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63391918, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.2578125, + "step": 6414, + "time_per_iteration": 2.897420883178711 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.02712834, + "balance_loss_mlp": 1.04452538, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.3808373048749543, + "language_loss": 0.77121973, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79288626, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6415, + "time_per_iteration": 2.455246686935425 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.01916933, + "balance_loss_mlp": 1.04303658, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6468091717833364, + "language_loss": 0.79597795, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81745458, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6416, + "time_per_iteration": 2.5162863731384277 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02100587, + "balance_loss_mlp": 1.04190922, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.6816352340920986, + "language_loss": 0.7957328, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81726366, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76953125, + "step": 6417, + "time_per_iteration": 2.462679862976074 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02369118, + "balance_loss_mlp": 1.03945839, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 9.924006648688666, + "language_loss": 0.80246758, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82400978, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6418, + "time_per_iteration": 2.4485208988189697 + }, + { + "auxiliary_loss_clip": 0.01114184, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.01992905, + "balance_loss_mlp": 1.03939319, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.9958339666442106, + "language_loss": 0.79694712, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81842011, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6419, + "time_per_iteration": 2.4360008239746094 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.01972449, + "balance_loss_mlp": 1.04120576, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0553625572614678, + "language_loss": 0.67804086, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69954103, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.73828125, + "step": 6420, + "time_per_iteration": 2.489661931991577 + }, + { + "auxiliary_loss_clip": 0.01116038, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.02286029, + "balance_loss_mlp": 1.04163957, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.4512212791744576, + "language_loss": 0.81831443, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83983916, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6421, + "time_per_iteration": 2.4278934001922607 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01888454, + "balance_loss_mlp": 1.04031229, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2431145476637266, + "language_loss": 0.72079587, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74231195, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6422, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02482104, + "balance_loss_mlp": 1.0425638, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6611822537555545, + "language_loss": 0.65814191, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.6796822, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6423, + "time_per_iteration": 2.4211878776550293 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.0439117, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.965242475874499, + "language_loss": 0.68746173, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70906854, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6424, + "time_per_iteration": 2.5804436206817627 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.04261661, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3808875353222407, + "language_loss": 0.72237349, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74393135, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 6425, + "time_per_iteration": 2.4568634033203125 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.02413344, + "balance_loss_mlp": 1.0424571, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.566599175889616, + "language_loss": 0.80062914, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82223159, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6426, + "time_per_iteration": 2.5236575603485107 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.0222559, + "balance_loss_mlp": 1.04582727, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.32293087490025, + "language_loss": 0.74624443, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7678405, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6427, + "time_per_iteration": 2.467555046081543 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02530599, + "balance_loss_mlp": 1.04256904, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 1.6951631816528543, + "language_loss": 0.69630527, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71788281, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6428, + "time_per_iteration": 2.4336817264556885 + }, + { + "auxiliary_loss_clip": 0.01120968, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.02787971, + "balance_loss_mlp": 1.0427897, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.175868568260599, + "language_loss": 0.84272587, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86435586, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6429, + "time_per_iteration": 5.324048757553101 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.02358222, + "balance_loss_mlp": 1.04458523, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.0434704200334726, + "language_loss": 0.808312, + "learning_rate": 2.807931078076015e-06, + "loss": 0.82989526, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6430, + "time_per_iteration": 3.8362674713134766 + }, + { + "auxiliary_loss_clip": 0.01037896, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00019002, + "balance_loss_mlp": 1.01247668, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7147232834997996, + "language_loss": 0.58793551, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60833132, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.25390625, + "step": 6431, + "time_per_iteration": 3.1054275035858154 + }, + { + "auxiliary_loss_clip": 0.01123522, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.0213275, + "balance_loss_mlp": 1.04425848, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8418420222570902, + "language_loss": 0.78914982, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81074637, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6432, + "time_per_iteration": 2.441103458404541 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.04033065, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 3.1335187433073006, + "language_loss": 0.80734611, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82898408, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6433, + "time_per_iteration": 2.4334840774536133 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.02167201, + "balance_loss_mlp": 1.04427695, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.9920607209076013, + "language_loss": 0.70712543, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72871572, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6434, + "time_per_iteration": 2.4485912322998047 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.02259684, + "balance_loss_mlp": 1.04096544, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 3.1146547904297615, + "language_loss": 0.77674437, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79833651, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 6435, + "time_per_iteration": 2.4734902381896973 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.01877558, + "balance_loss_mlp": 1.04157901, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6626735995393465, + "language_loss": 0.79557228, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81706917, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 6436, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01115966, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01937521, + "balance_loss_mlp": 1.04099202, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7213495950653388, + "language_loss": 0.77057981, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79206884, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6437, + "time_per_iteration": 2.506342649459839 + }, + { + "auxiliary_loss_clip": 0.01118581, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.02759838, + "balance_loss_mlp": 1.0425818, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0991099349261013, + "language_loss": 0.8199805, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84157896, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6438, + "time_per_iteration": 2.4236960411071777 + }, + { + "auxiliary_loss_clip": 0.01119447, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.02413225, + "balance_loss_mlp": 1.04198575, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.4416179830694351, + "language_loss": 0.75274503, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77432954, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6439, + "time_per_iteration": 2.4746499061584473 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.01804042, + "balance_loss_mlp": 1.04231787, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4591961315755648, + "language_loss": 0.74029297, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76176178, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6440, + "time_per_iteration": 2.470442056655884 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02755535, + "balance_loss_mlp": 1.04172719, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.882594032026591, + "language_loss": 0.82420492, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84582806, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6441, + "time_per_iteration": 2.4857184886932373 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.03132594, + "balance_loss_mlp": 1.04210794, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.099147848905264, + "language_loss": 0.81835496, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83998901, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6442, + "time_per_iteration": 2.4149296283721924 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04025602, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.5694674536603201, + "language_loss": 0.83847654, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85999727, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6443, + "time_per_iteration": 2.4515957832336426 + }, + { + "auxiliary_loss_clip": 0.01039021, + "auxiliary_loss_mlp": 0.01007024, + "balance_loss_clip": 1.00551593, + "balance_loss_mlp": 1.0140909, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7719544775144753, + "language_loss": 0.50268674, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52314723, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24902344, + "step": 6444, + "time_per_iteration": 3.092834711074829 + }, + { + "auxiliary_loss_clip": 0.01115245, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02502251, + "balance_loss_mlp": 1.04225266, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.537835026490341, + "language_loss": 0.78736365, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80889541, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6445, + "time_per_iteration": 2.435347557067871 + }, + { + "auxiliary_loss_clip": 0.01115913, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02055407, + "balance_loss_mlp": 1.04211605, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.672895701432963, + "language_loss": 0.81121695, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83271456, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6446, + "time_per_iteration": 2.469536781311035 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02434087, + "balance_loss_mlp": 1.03933239, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.877585125713849, + "language_loss": 0.77093089, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79244608, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6447, + "time_per_iteration": 2.428525447845459 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.04256356, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5240627220637166, + "language_loss": 0.75767821, + "learning_rate": 2.801513277056671e-06, + "loss": 0.7791642, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6448, + "time_per_iteration": 2.4325876235961914 + }, + { + "auxiliary_loss_clip": 0.01115196, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.02023029, + "balance_loss_mlp": 1.04179466, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.6442003276819328, + "language_loss": 0.75754648, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.77903593, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6449, + "time_per_iteration": 2.435208320617676 + }, + { + "auxiliary_loss_clip": 0.0111808, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.0194999, + "balance_loss_mlp": 1.03956699, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.5394171504545016, + "language_loss": 0.78183508, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80335045, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6450, + "time_per_iteration": 2.467933177947998 + }, + { + "auxiliary_loss_clip": 0.0112145, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.02190948, + "balance_loss_mlp": 1.04104686, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.1284571270947263, + "language_loss": 0.77706474, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79863995, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6451, + "time_per_iteration": 2.513192892074585 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01337111, + "balance_loss_mlp": 1.03988457, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.5965207120841256, + "language_loss": 0.7642619, + "learning_rate": 2.800085758962812e-06, + "loss": 0.7856546, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6452, + "time_per_iteration": 2.453756809234619 + }, + { + "auxiliary_loss_clip": 0.01118677, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02721739, + "balance_loss_mlp": 1.04313231, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5417712426283914, + "language_loss": 0.79843581, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82002515, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6453, + "time_per_iteration": 2.434788465499878 + }, + { + "auxiliary_loss_clip": 0.01126032, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02257931, + "balance_loss_mlp": 1.0456028, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.779502658436086, + "language_loss": 0.71759796, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73922884, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6454, + "time_per_iteration": 2.456637382507324 + }, + { + "auxiliary_loss_clip": 0.01122488, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.02489531, + "balance_loss_mlp": 1.04253364, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.1246626443539216, + "language_loss": 0.77918947, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80081153, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6455, + "time_per_iteration": 2.4589757919311523 + }, + { + "auxiliary_loss_clip": 0.01118002, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.02040577, + "balance_loss_mlp": 1.04232621, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.6339807395025958, + "language_loss": 0.75865024, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78017759, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6456, + "time_per_iteration": 2.4390318393707275 + }, + { + "auxiliary_loss_clip": 0.01121145, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.01944995, + "balance_loss_mlp": 1.04276633, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.085241252102015, + "language_loss": 0.60518527, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62672919, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 6457, + "time_per_iteration": 2.459535837173462 + }, + { + "auxiliary_loss_clip": 0.01121291, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.02167547, + "balance_loss_mlp": 1.04195237, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.1234505206368475, + "language_loss": 0.80247247, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82405996, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6458, + "time_per_iteration": 2.425049066543579 + }, + { + "auxiliary_loss_clip": 0.01120771, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.02072167, + "balance_loss_mlp": 1.04291797, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8371533851039183, + "language_loss": 0.81683058, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83838403, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6459, + "time_per_iteration": 2.5234129428863525 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.04261899, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 3.3845315312390643, + "language_loss": 0.61609662, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63761353, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6460, + "time_per_iteration": 2.4271440505981445 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04498553, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.999840896697599, + "language_loss": 0.85928953, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88084352, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.76171875, + "step": 6461, + "time_per_iteration": 2.4874932765960693 + }, + { + "auxiliary_loss_clip": 0.01121067, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.02712059, + "balance_loss_mlp": 1.04198229, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 5.6194775515218085, + "language_loss": 0.71397054, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73559368, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6462, + "time_per_iteration": 2.4839894771575928 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02274752, + "balance_loss_mlp": 1.04190457, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.13487298932128, + "language_loss": 0.7582581, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77982807, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6463, + "time_per_iteration": 2.4897215366363525 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.0305022, + "balance_loss_mlp": 1.04482341, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9442764767857983, + "language_loss": 0.70078236, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72249663, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6464, + "time_per_iteration": 2.4519219398498535 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02258134, + "balance_loss_mlp": 1.04280329, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8350923871455525, + "language_loss": 0.69608724, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.717641, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6465, + "time_per_iteration": 2.524698495864868 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02772307, + "balance_loss_mlp": 1.04204226, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.02186972310505, + "language_loss": 0.77957165, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80120802, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6466, + "time_per_iteration": 2.4420318603515625 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.04476476, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.578436157089315, + "language_loss": 0.69438803, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71602929, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6467, + "time_per_iteration": 2.526315212249756 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.04374123, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.7189933074164316, + "language_loss": 0.83444071, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85615414, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 6468, + "time_per_iteration": 2.433612108230591 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.02720666, + "balance_loss_mlp": 1.04250181, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 5.890128393718138, + "language_loss": 0.84300733, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86460519, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6469, + "time_per_iteration": 2.501368284225464 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02871704, + "balance_loss_mlp": 1.0433706, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.6566744770772097, + "language_loss": 0.74790764, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76954335, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6470, + "time_per_iteration": 5.350924015045166 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.02632678, + "balance_loss_mlp": 1.04234362, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5254918915202156, + "language_loss": 0.74916464, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77078122, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6471, + "time_per_iteration": 5.323298215866089 + }, + { + "auxiliary_loss_clip": 0.01121653, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.0310601, + "balance_loss_mlp": 1.04548645, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.9258613787227117, + "language_loss": 0.68053186, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70220202, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6472, + "time_per_iteration": 2.453610420227051 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01046672, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.04305148, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.6233097762345425, + "language_loss": 0.76542008, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.7870928, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6473, + "time_per_iteration": 2.487966775894165 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01046447, + "balance_loss_clip": 1.03157008, + "balance_loss_mlp": 1.04532015, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.8986671727726652, + "language_loss": 0.70897496, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73067403, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6474, + "time_per_iteration": 2.4192309379577637 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.02607441, + "balance_loss_mlp": 1.04441047, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.400231739949646, + "language_loss": 0.68822956, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70984024, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 6475, + "time_per_iteration": 2.508747100830078 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.0104873, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.04747105, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 2.0025883037810055, + "language_loss": 0.76052523, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78231013, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 6476, + "time_per_iteration": 2.4432644844055176 + }, + { + "auxiliary_loss_clip": 0.01040957, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.01581097, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7803986728659921, + "language_loss": 0.58254546, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60299176, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6477, + "time_per_iteration": 3.0704691410064697 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.02279997, + "balance_loss_mlp": 1.04507279, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.75333723767605, + "language_loss": 0.77916539, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80078721, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6478, + "time_per_iteration": 2.488922357559204 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.0212301, + "balance_loss_mlp": 1.04128957, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 1.928920480761015, + "language_loss": 0.82250136, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8440311, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 6479, + "time_per_iteration": 2.4171228408813477 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.02262461, + "balance_loss_mlp": 1.04175949, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7024032073041733, + "language_loss": 0.80111545, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82266629, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6480, + "time_per_iteration": 2.4750797748565674 + }, + { + "auxiliary_loss_clip": 0.01117569, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01428056, + "balance_loss_mlp": 1.04215932, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.550121095479633, + "language_loss": 0.83083898, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85229063, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6481, + "time_per_iteration": 2.4715166091918945 + }, + { + "auxiliary_loss_clip": 0.01117656, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02437592, + "balance_loss_mlp": 1.04459131, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.557560720892756, + "language_loss": 0.75559932, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77715063, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6482, + "time_per_iteration": 2.4623568058013916 + }, + { + "auxiliary_loss_clip": 0.01119557, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01972985, + "balance_loss_mlp": 1.04252028, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 3.29893715214875, + "language_loss": 0.79150903, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81303906, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6483, + "time_per_iteration": 2.4530816078186035 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.0135119, + "balance_loss_mlp": 1.04091668, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4423872752445677, + "language_loss": 0.79842782, + "learning_rate": 2.788648211572067e-06, + "loss": 0.81989002, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6484, + "time_per_iteration": 2.511016845703125 + }, + { + "auxiliary_loss_clip": 0.01121595, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.02905726, + "balance_loss_mlp": 1.04556251, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.7756536915325172, + "language_loss": 0.78321344, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80487472, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6485, + "time_per_iteration": 2.443439245223999 + }, + { + "auxiliary_loss_clip": 0.01121432, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.02298832, + "balance_loss_mlp": 1.0427072, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.7221954850945425, + "language_loss": 0.85305119, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87464917, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6486, + "time_per_iteration": 2.5056657791137695 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.01942706, + "balance_loss_mlp": 1.04115701, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.7551040773297495, + "language_loss": 0.85345674, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87499964, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 6487, + "time_per_iteration": 2.577178478240967 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01737833, + "balance_loss_mlp": 1.04198551, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5246902220393208, + "language_loss": 0.73225224, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75375092, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.75390625, + "step": 6488, + "time_per_iteration": 2.523616075515747 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.04519773, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.5708303691917815, + "language_loss": 0.68585873, + "learning_rate": 2.786858317231779e-06, + "loss": 0.7074241, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6489, + "time_per_iteration": 2.478531837463379 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02680993, + "balance_loss_mlp": 1.04124475, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.801271673710844, + "language_loss": 0.81112868, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83269042, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 6490, + "time_per_iteration": 2.511854887008667 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.0193367, + "balance_loss_mlp": 1.04286718, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.9146492238240407, + "language_loss": 0.89305747, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91461056, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6491, + "time_per_iteration": 2.460026264190674 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02187026, + "balance_loss_mlp": 1.04215312, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8200320241713732, + "language_loss": 0.78811067, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80968064, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 6492, + "time_per_iteration": 2.529750108718872 + }, + { + "auxiliary_loss_clip": 0.01122151, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.02316093, + "balance_loss_mlp": 1.04309416, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.613220074099035, + "language_loss": 0.74635601, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76794928, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6493, + "time_per_iteration": 2.506000280380249 + }, + { + "auxiliary_loss_clip": 0.01123496, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.02160168, + "balance_loss_mlp": 1.04215276, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.9992899078543964, + "language_loss": 0.76100057, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78260159, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 6494, + "time_per_iteration": 2.4696662425994873 + }, + { + "auxiliary_loss_clip": 0.01128232, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03233767, + "balance_loss_mlp": 1.04337156, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.027559897328472, + "language_loss": 0.74284697, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76461446, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 6495, + "time_per_iteration": 2.4156551361083984 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_clip": 1.02683187, + "balance_loss_mlp": 1.04346669, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.725682312794404, + "language_loss": 0.67885542, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70049238, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6496, + "time_per_iteration": 2.530111789703369 + }, + { + "auxiliary_loss_clip": 0.01038749, + "auxiliary_loss_mlp": 0.01000219, + "balance_loss_clip": 0.99871743, + "balance_loss_mlp": 1.01313972, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6624336186281815, + "language_loss": 0.53998011, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56036979, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.25585938, + "step": 6497, + "time_per_iteration": 3.140427589416504 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.02404737, + "balance_loss_mlp": 1.04236674, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.818865741362812, + "language_loss": 0.68966502, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71124697, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6498, + "time_per_iteration": 2.4631001949310303 + }, + { + "auxiliary_loss_clip": 0.01037794, + "auxiliary_loss_mlp": 0.01003613, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.0124712, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 1.032001330091421, + "language_loss": 0.51830518, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5387193, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.25390625, + "step": 6499, + "time_per_iteration": 3.1206116676330566 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.0266552, + "balance_loss_mlp": 1.04158521, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.8695650437594764, + "language_loss": 0.73693466, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75859112, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.80859375, + "step": 6500, + "time_per_iteration": 2.5413036346435547 + }, + { + "auxiliary_loss_clip": 0.01125544, + "auxiliary_loss_mlp": 0.01041341, + "balance_loss_clip": 1.02720869, + "balance_loss_mlp": 1.04501247, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.5451317073491353, + "language_loss": 0.68355215, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70522094, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6501, + "time_per_iteration": 2.4725823402404785 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6766627212042646, + "language_loss": 0.79162323, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81320089, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6502, + "time_per_iteration": 2.4758012294769287 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02378237, + "balance_loss_mlp": 1.0435648, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.24722484247342, + "language_loss": 0.79379106, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81534874, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6503, + "time_per_iteration": 2.510356903076172 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02076924, + "balance_loss_mlp": 1.03882694, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8991979162106922, + "language_loss": 0.71695077, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73842514, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6504, + "time_per_iteration": 2.474257230758667 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02404082, + "balance_loss_mlp": 1.03938556, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4403698273396093, + "language_loss": 0.83054864, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85209668, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6505, + "time_per_iteration": 2.4917776584625244 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.02379465, + "balance_loss_mlp": 1.04268944, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9728617659661118, + "language_loss": 0.71202552, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73360288, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7578125, + "step": 6506, + "time_per_iteration": 2.4846489429473877 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.04129732, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.0442674369719547, + "language_loss": 0.74914789, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77068931, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6507, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01036092, + "auxiliary_loss_mlp": 0.01010532, + "balance_loss_clip": 1.00900638, + "balance_loss_mlp": 1.01097417, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7697412763639314, + "language_loss": 0.56554615, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58601236, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.25195312, + "step": 6508, + "time_per_iteration": 3.222599744796753 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.0256958, + "balance_loss_mlp": 1.04224479, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8903485988869968, + "language_loss": 0.7639432, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78552431, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6509, + "time_per_iteration": 2.4504122734069824 + }, + { + "auxiliary_loss_clip": 0.01119308, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02919126, + "balance_loss_mlp": 1.04120517, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.5320410479027284, + "language_loss": 0.82538676, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84704286, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.78125, + "step": 6510, + "time_per_iteration": 2.4280829429626465 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02202439, + "balance_loss_mlp": 1.04137504, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.9726874536239134, + "language_loss": 0.76478642, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78633761, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6511, + "time_per_iteration": 2.438093662261963 + }, + { + "auxiliary_loss_clip": 0.01035954, + "auxiliary_loss_mlp": 0.01004811, + "balance_loss_clip": 1.0033921, + "balance_loss_mlp": 1.01070499, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7278620231464888, + "language_loss": 0.57780313, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59821081, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.25390625, + "step": 6512, + "time_per_iteration": 6.094903230667114 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.02039289, + "balance_loss_mlp": 1.04215658, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6857291908308145, + "language_loss": 0.69891763, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72048545, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6513, + "time_per_iteration": 3.8939309120178223 + }, + { + "auxiliary_loss_clip": 0.01122702, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.02474439, + "balance_loss_mlp": 1.04184556, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.2930968868818606, + "language_loss": 0.76267236, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7842921, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 6514, + "time_per_iteration": 2.4622693061828613 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02168727, + "balance_loss_mlp": 1.04042864, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.7838082674219136, + "language_loss": 0.77452338, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79606491, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6515, + "time_per_iteration": 2.4336462020874023 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02564025, + "balance_loss_mlp": 1.03940558, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4542421972503212, + "language_loss": 0.79846406, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81998634, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 6516, + "time_per_iteration": 2.500826597213745 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01891923, + "balance_loss_mlp": 1.04082477, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 2.228742695866407, + "language_loss": 0.70205939, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72357762, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6517, + "time_per_iteration": 2.425739288330078 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.0218817, + "balance_loss_mlp": 1.03986263, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.595983335780194, + "language_loss": 0.72092575, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74247015, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6518, + "time_per_iteration": 2.559140205383301 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.0179677, + "balance_loss_mlp": 1.04041731, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.563160017416143, + "language_loss": 0.61668754, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63819885, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6519, + "time_per_iteration": 2.5673322677612305 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.02800775, + "balance_loss_mlp": 1.04341698, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.4564373100444232, + "language_loss": 0.6693083, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6909942, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6520, + "time_per_iteration": 2.487650156021118 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.01958799, + "balance_loss_mlp": 1.03966665, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.7599889377917473, + "language_loss": 0.78522319, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80671263, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6521, + "time_per_iteration": 2.418458938598633 + }, + { + "auxiliary_loss_clip": 0.0112345, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.01853049, + "balance_loss_mlp": 1.04218912, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.4256865138527353, + "language_loss": 0.70340407, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7250011, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8125, + "step": 6522, + "time_per_iteration": 2.435802936553955 + }, + { + "auxiliary_loss_clip": 0.01120666, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.02013338, + "balance_loss_mlp": 1.04137838, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.8374103087918643, + "language_loss": 0.76740485, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78895748, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6523, + "time_per_iteration": 2.4279329776763916 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.04124415, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.339335808739943, + "language_loss": 0.61661494, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63821173, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6524, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.02181363, + "balance_loss_mlp": 1.03898454, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6728206813409823, + "language_loss": 0.73940414, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76095104, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6525, + "time_per_iteration": 2.4897830486297607 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.02224112, + "balance_loss_mlp": 1.03882146, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0942212479104363, + "language_loss": 0.81385779, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83539373, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 6526, + "time_per_iteration": 2.442091226577759 + }, + { + "auxiliary_loss_clip": 0.01115953, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.02265131, + "balance_loss_mlp": 1.03931344, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.56527231709598, + "language_loss": 0.69802964, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.71955633, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6527, + "time_per_iteration": 2.465498924255371 + }, + { + "auxiliary_loss_clip": 0.01116064, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.0154264, + "balance_loss_mlp": 1.04067612, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.4439619967755983, + "language_loss": 0.82215756, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84361446, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6528, + "time_per_iteration": 2.488581418991089 + }, + { + "auxiliary_loss_clip": 0.01114295, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02282465, + "balance_loss_mlp": 1.04024255, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.4897772961790412, + "language_loss": 0.68726033, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70877492, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 6529, + "time_per_iteration": 2.5409562587738037 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.02561271, + "balance_loss_mlp": 1.04070282, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.9003920421281926, + "language_loss": 0.79728955, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.81887889, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6530, + "time_per_iteration": 2.514547109603882 + }, + { + "auxiliary_loss_clip": 0.01112608, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02267241, + "balance_loss_mlp": 1.03750181, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6221630004730245, + "language_loss": 0.75564003, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77713549, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6531, + "time_per_iteration": 2.4572982788085938 + }, + { + "auxiliary_loss_clip": 0.01038893, + "auxiliary_loss_mlp": 0.0100286, + "balance_loss_clip": 1.00127435, + "balance_loss_mlp": 1.01370025, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8170127744653651, + "language_loss": 0.60378772, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62420523, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.25195312, + "step": 6532, + "time_per_iteration": 2.929732084274292 + }, + { + "auxiliary_loss_clip": 0.01036987, + "auxiliary_loss_mlp": 0.01003862, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7837299971611431, + "language_loss": 0.55545104, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57585955, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.25195312, + "step": 6533, + "time_per_iteration": 3.1820483207702637 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.02316761, + "balance_loss_mlp": 1.04170942, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.9313522305780093, + "language_loss": 0.75972468, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78130615, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6534, + "time_per_iteration": 2.5650813579559326 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.02376163, + "balance_loss_mlp": 1.04177046, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.213634574223379, + "language_loss": 0.78067005, + "learning_rate": 2.770356507494851e-06, + "loss": 0.802279, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 6535, + "time_per_iteration": 2.447950839996338 + }, + { + "auxiliary_loss_clip": 0.01113628, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.01950026, + "balance_loss_mlp": 1.03985262, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.091132286884177, + "language_loss": 0.68613565, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70759845, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 6536, + "time_per_iteration": 2.4873242378234863 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.03908086, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.7105256577096235, + "language_loss": 0.69052541, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71199811, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 6537, + "time_per_iteration": 2.5867457389831543 + }, + { + "auxiliary_loss_clip": 0.01117392, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.02420986, + "balance_loss_mlp": 1.04011965, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6628056753547982, + "language_loss": 0.79044384, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81200254, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6538, + "time_per_iteration": 2.437757968902588 + }, + { + "auxiliary_loss_clip": 0.01034351, + "auxiliary_loss_mlp": 0.01009828, + "balance_loss_clip": 1.0084635, + "balance_loss_mlp": 1.00972295, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8042725449961473, + "language_loss": 0.61871827, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63916004, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24609375, + "step": 6539, + "time_per_iteration": 2.9012601375579834 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.02417326, + "balance_loss_mlp": 1.03897023, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.1025744829352306, + "language_loss": 0.68334043, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70487964, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6540, + "time_per_iteration": 2.617544412612915 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.02167249, + "balance_loss_mlp": 1.0387044, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.7155589252050778, + "language_loss": 0.72714561, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74864328, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6541, + "time_per_iteration": 2.5576202869415283 + }, + { + "auxiliary_loss_clip": 0.01034882, + "auxiliary_loss_mlp": 0.01010056, + "balance_loss_clip": 1.00863171, + "balance_loss_mlp": 1.0103662, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8254504926360222, + "language_loss": 0.60302341, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62347269, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24511719, + "step": 6542, + "time_per_iteration": 2.921311378479004 + }, + { + "auxiliary_loss_clip": 0.01115263, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.02204013, + "balance_loss_mlp": 1.03968477, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.9294145782355336, + "language_loss": 0.82255107, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84406084, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6543, + "time_per_iteration": 2.5267767906188965 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.03692436, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.6066266241550669, + "language_loss": 0.69336796, + "learning_rate": 2.767120621015908e-06, + "loss": 0.7148419, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6544, + "time_per_iteration": 2.5192980766296387 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.02729011, + "balance_loss_mlp": 1.03997457, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.880723151689185, + "language_loss": 0.75104976, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77266246, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6545, + "time_per_iteration": 2.5483953952789307 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.04072022, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4191511939867936, + "language_loss": 0.74600172, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76748097, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 6546, + "time_per_iteration": 2.435189962387085 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02256405, + "balance_loss_mlp": 1.03998446, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.8050093889996326, + "language_loss": 0.81520575, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83677876, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 6547, + "time_per_iteration": 2.5359435081481934 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01444387, + "balance_loss_mlp": 1.03795588, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 2.282095961224954, + "language_loss": 0.84300089, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86442673, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6548, + "time_per_iteration": 2.430497407913208 + }, + { + "auxiliary_loss_clip": 0.01110548, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01784039, + "balance_loss_mlp": 1.0382576, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5261467823901598, + "language_loss": 0.72481942, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74623168, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6549, + "time_per_iteration": 2.484938383102417 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02025914, + "balance_loss_mlp": 1.04114747, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.525417369659451, + "language_loss": 0.77678335, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79829538, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6550, + "time_per_iteration": 2.4533822536468506 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01695979, + "balance_loss_mlp": 1.03770638, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6825180459961226, + "language_loss": 0.81065381, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83207965, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6551, + "time_per_iteration": 2.4740419387817383 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.02310574, + "balance_loss_mlp": 1.03833413, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.2350138021364003, + "language_loss": 0.80241704, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82394373, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6552, + "time_per_iteration": 2.4066245555877686 + }, + { + "auxiliary_loss_clip": 0.01118032, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.02449059, + "balance_loss_mlp": 1.04108357, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.2028177738118884, + "language_loss": 0.71154666, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73311305, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 6553, + "time_per_iteration": 2.454035997390747 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.0409205, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.9276274050376605, + "language_loss": 0.63445336, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65595293, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6554, + "time_per_iteration": 5.467530250549316 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.02620983, + "balance_loss_mlp": 1.041237, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.7325305725381703, + "language_loss": 0.79567587, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81722915, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 6555, + "time_per_iteration": 3.9707608222961426 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.0209887, + "balance_loss_mlp": 1.04194546, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.8303237809157376, + "language_loss": 0.71571302, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73726678, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6556, + "time_per_iteration": 2.5013363361358643 + }, + { + "auxiliary_loss_clip": 0.01115996, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03954887, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 2.056709462434603, + "language_loss": 0.83915412, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86063957, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6557, + "time_per_iteration": 2.7162060737609863 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02195859, + "balance_loss_mlp": 1.04014397, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 3.2694171829217953, + "language_loss": 0.80285048, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8243624, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6558, + "time_per_iteration": 2.466904401779175 + }, + { + "auxiliary_loss_clip": 0.01115408, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02037418, + "balance_loss_mlp": 1.04165912, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.7254990423790144, + "language_loss": 0.71022832, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73171461, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6559, + "time_per_iteration": 2.474142551422119 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02111125, + "balance_loss_mlp": 1.04030299, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.8853849407225942, + "language_loss": 0.80391413, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82548964, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6560, + "time_per_iteration": 2.4220218658447266 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.022223, + "balance_loss_mlp": 1.04395843, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 3.2514761912447283, + "language_loss": 0.83440554, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85599601, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 6561, + "time_per_iteration": 2.458305835723877 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.02458477, + "balance_loss_mlp": 1.04098439, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.862241713271481, + "language_loss": 0.79548055, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81703943, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6562, + "time_per_iteration": 2.4390974044799805 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02301359, + "balance_loss_mlp": 1.04043949, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.512260767998718, + "language_loss": 0.81355608, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83506453, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 6563, + "time_per_iteration": 2.518843650817871 + }, + { + "auxiliary_loss_clip": 0.0111742, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.041682, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9438463538262531, + "language_loss": 0.69416577, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71574247, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6564, + "time_per_iteration": 2.446140766143799 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02079892, + "balance_loss_mlp": 1.04157352, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 3.234298893133154, + "language_loss": 0.83141822, + "learning_rate": 2.759561073299676e-06, + "loss": 0.8529489, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6565, + "time_per_iteration": 2.474611520767212 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.02002859, + "balance_loss_mlp": 1.04039359, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.7678460287206497, + "language_loss": 0.82917452, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85065943, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6566, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.02493143, + "balance_loss_mlp": 1.04225016, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.357536272997057, + "language_loss": 0.7778033, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79942119, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6567, + "time_per_iteration": 2.5020110607147217 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.02689242, + "balance_loss_mlp": 1.04026425, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 2.0625384967809546, + "language_loss": 0.80381507, + "learning_rate": 2.758480098067182e-06, + "loss": 0.8253268, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 6568, + "time_per_iteration": 2.464186429977417 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.02282655, + "balance_loss_mlp": 1.04130197, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.6625556258765348, + "language_loss": 0.84206939, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86359489, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 6569, + "time_per_iteration": 2.4947829246520996 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.02418959, + "balance_loss_mlp": 1.04450357, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.920459843417803, + "language_loss": 0.74973899, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77130127, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6570, + "time_per_iteration": 2.50211763381958 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04104555, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.649568183340291, + "language_loss": 0.79813123, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81967843, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6571, + "time_per_iteration": 2.477740526199341 + }, + { + "auxiliary_loss_clip": 0.01116017, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.02846146, + "balance_loss_mlp": 1.04203689, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.628324795196944, + "language_loss": 0.77873337, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80031145, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6572, + "time_per_iteration": 2.4463839530944824 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02636874, + "balance_loss_mlp": 1.0404911, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.6456702645470058, + "language_loss": 0.7506038, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77218664, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6573, + "time_per_iteration": 2.501692295074463 + }, + { + "auxiliary_loss_clip": 0.01114036, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02165031, + "balance_loss_mlp": 1.04046559, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.4003162240803297, + "language_loss": 0.67956495, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70104533, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 6574, + "time_per_iteration": 2.6566920280456543 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01978612, + "balance_loss_mlp": 1.04216623, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.170019312223073, + "language_loss": 0.71719187, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73873657, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6575, + "time_per_iteration": 2.463792085647583 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02859664, + "balance_loss_mlp": 1.04105997, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.0080051897694324, + "language_loss": 0.73535955, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75698036, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6576, + "time_per_iteration": 2.409817934036255 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.03979337, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.36733568983198, + "language_loss": 0.83294857, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.8544715, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7578125, + "step": 6577, + "time_per_iteration": 2.4421181678771973 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.01860428, + "balance_loss_mlp": 1.04138541, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.8530294325048984, + "language_loss": 0.89916354, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92063785, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6578, + "time_per_iteration": 2.470369577407837 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04030561, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.7408596896151103, + "language_loss": 0.77871025, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80027139, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6579, + "time_per_iteration": 2.4619040489196777 + }, + { + "auxiliary_loss_clip": 0.01119633, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01760054, + "balance_loss_mlp": 1.0407021, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.037188254408411, + "language_loss": 0.68324131, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70475388, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6580, + "time_per_iteration": 2.4363577365875244 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01499939, + "balance_loss_mlp": 1.04099488, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.613777567548473, + "language_loss": 0.58620721, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60764229, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6581, + "time_per_iteration": 2.5704734325408936 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.02479148, + "balance_loss_mlp": 1.04165769, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.015576445189345, + "language_loss": 0.698632, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72021002, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6582, + "time_per_iteration": 2.4640939235687256 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.02167404, + "balance_loss_mlp": 1.0415566, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.285451965985758, + "language_loss": 0.76454568, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78608364, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6583, + "time_per_iteration": 2.437396287918091 + }, + { + "auxiliary_loss_clip": 0.01118401, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.01708043, + "balance_loss_mlp": 1.04192805, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5685917359515968, + "language_loss": 0.65989023, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68138266, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6584, + "time_per_iteration": 2.4562485218048096 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.02262115, + "balance_loss_mlp": 1.04122627, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.6735523944320136, + "language_loss": 0.72423065, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 6585, + "time_per_iteration": 2.517333984375 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01811159, + "balance_loss_mlp": 1.04010367, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.986310622320223, + "language_loss": 0.73430967, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75579244, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6586, + "time_per_iteration": 2.513847827911377 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01738322, + "balance_loss_mlp": 1.04139459, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.2420315368265915, + "language_loss": 0.71627617, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73776209, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6587, + "time_per_iteration": 2.498534917831421 + }, + { + "auxiliary_loss_clip": 0.01038457, + "auxiliary_loss_mlp": 0.01003592, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.01416993, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9067384171744824, + "language_loss": 0.61162889, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63204944, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.01531982, + "router_z_loss_mlp": 0.2421875, + "step": 6588, + "time_per_iteration": 2.9129557609558105 + }, + { + "auxiliary_loss_clip": 0.01117429, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.0215075, + "balance_loss_mlp": 1.04087436, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.9745840784771536, + "language_loss": 0.81579673, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83732545, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6589, + "time_per_iteration": 2.487581253051758 + }, + { + "auxiliary_loss_clip": 0.01118186, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01858449, + "balance_loss_mlp": 1.04102254, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.0157149751951606, + "language_loss": 0.70171028, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72322464, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6590, + "time_per_iteration": 2.4837629795074463 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.02517259, + "balance_loss_mlp": 1.04276454, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.6568331410473631, + "language_loss": 0.76061213, + "learning_rate": 2.750184048805956e-06, + "loss": 0.7821902, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6591, + "time_per_iteration": 2.574401617050171 + }, + { + "auxiliary_loss_clip": 0.01119076, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.02803326, + "balance_loss_mlp": 1.04253912, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.7800794685008139, + "language_loss": 0.79121935, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81283081, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6592, + "time_per_iteration": 2.5065057277679443 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.01945305, + "balance_loss_mlp": 1.04020298, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.6584377020479992, + "language_loss": 0.69372392, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71518123, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6593, + "time_per_iteration": 2.691351890563965 + }, + { + "auxiliary_loss_clip": 0.01119923, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.02389932, + "balance_loss_mlp": 1.04100418, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.6545825162449217, + "language_loss": 0.77913815, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80072421, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6594, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.01038921, + "auxiliary_loss_mlp": 0.01002091, + "balance_loss_clip": 1.0005945, + "balance_loss_mlp": 1.0146898, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9454940833877284, + "language_loss": 0.63038307, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65079319, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.2421875, + "step": 6595, + "time_per_iteration": 6.018520355224609 + }, + { + "auxiliary_loss_clip": 0.01121925, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.02908421, + "balance_loss_mlp": 1.04294038, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.072222886004575, + "language_loss": 0.6329869, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65464759, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6596, + "time_per_iteration": 5.302752494812012 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02110457, + "balance_loss_mlp": 1.04157937, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0492451282774273, + "language_loss": 0.78553772, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80704355, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6597, + "time_per_iteration": 2.457028388977051 + }, + { + "auxiliary_loss_clip": 0.01121814, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.02558672, + "balance_loss_mlp": 1.04262114, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.95592503590265, + "language_loss": 0.67559552, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69721651, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6598, + "time_per_iteration": 2.4448981285095215 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02153933, + "balance_loss_mlp": 1.0411458, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.3323846151329235, + "language_loss": 0.78922117, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81074429, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6599, + "time_per_iteration": 2.4799394607543945 + }, + { + "auxiliary_loss_clip": 0.01117884, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.04196167, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.67964508136209, + "language_loss": 0.72716624, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74866593, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7578125, + "step": 6600, + "time_per_iteration": 2.4940543174743652 + }, + { + "auxiliary_loss_clip": 0.01115602, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01864827, + "balance_loss_mlp": 1.03997052, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 1.9442093512958227, + "language_loss": 0.85773253, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87920988, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6601, + "time_per_iteration": 2.4826369285583496 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02191091, + "balance_loss_mlp": 1.04298782, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.3202277168625054, + "language_loss": 0.70015699, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72178292, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6602, + "time_per_iteration": 2.4452199935913086 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02458847, + "balance_loss_mlp": 1.04225206, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.564497124514123, + "language_loss": 0.83408487, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85566461, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6603, + "time_per_iteration": 2.50046968460083 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.04076076, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.4733286794124776, + "language_loss": 0.72804213, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.74952281, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6604, + "time_per_iteration": 2.435645580291748 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.02366602, + "balance_loss_mlp": 1.0427258, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.694386771997249, + "language_loss": 0.82919562, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85070789, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 6605, + "time_per_iteration": 2.538792371749878 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.02053654, + "balance_loss_mlp": 1.04017544, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.515379376113219, + "language_loss": 0.73755872, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75901884, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 6606, + "time_per_iteration": 2.4766290187835693 + }, + { + "auxiliary_loss_clip": 0.0111968, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.0207423, + "balance_loss_mlp": 1.04279184, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.9669838489657716, + "language_loss": 0.73925817, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76079941, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6607, + "time_per_iteration": 2.550140380859375 + }, + { + "auxiliary_loss_clip": 0.01121372, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02045417, + "balance_loss_mlp": 1.04417753, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5241940789626238, + "language_loss": 0.67978024, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70133507, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6608, + "time_per_iteration": 2.70333194732666 + }, + { + "auxiliary_loss_clip": 0.01120221, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.0263803, + "balance_loss_mlp": 1.04247403, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 2.3779993769587486, + "language_loss": 0.74649572, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76812196, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6609, + "time_per_iteration": 2.4810678958892822 + }, + { + "auxiliary_loss_clip": 0.01119236, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.01909387, + "balance_loss_mlp": 1.04284418, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 4.182923272039756, + "language_loss": 0.71530509, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73682511, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6610, + "time_per_iteration": 2.483358860015869 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01881909, + "balance_loss_mlp": 1.03868747, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6591621928280806, + "language_loss": 0.7848928, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80632162, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 6611, + "time_per_iteration": 2.4707412719726562 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.0234127, + "balance_loss_mlp": 1.04496026, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7910222988347433, + "language_loss": 0.78681552, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.80838501, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6612, + "time_per_iteration": 2.552384614944458 + }, + { + "auxiliary_loss_clip": 0.01042423, + "auxiliary_loss_mlp": 0.01023175, + "balance_loss_clip": 1.02180374, + "balance_loss_mlp": 1.01794136, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8703127674216669, + "language_loss": 0.64956641, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6702224, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.24414062, + "step": 6613, + "time_per_iteration": 2.978494882583618 + }, + { + "auxiliary_loss_clip": 0.01116625, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01683927, + "balance_loss_mlp": 1.04148316, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.0550022834902797, + "language_loss": 0.71538055, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73685759, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6614, + "time_per_iteration": 2.4898061752319336 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01644266, + "balance_loss_mlp": 1.04124689, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8540793086422767, + "language_loss": 0.81317735, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83464336, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6615, + "time_per_iteration": 2.4708592891693115 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.02074313, + "balance_loss_mlp": 1.04221725, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.466828000769562, + "language_loss": 0.67015827, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69165838, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 6616, + "time_per_iteration": 2.4453790187835693 + }, + { + "auxiliary_loss_clip": 0.01120268, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02032995, + "balance_loss_mlp": 1.04185963, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.097035382924748, + "language_loss": 0.83857769, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86012185, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6617, + "time_per_iteration": 2.4740309715270996 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04305041, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6139116519566428, + "language_loss": 0.72253633, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74403095, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 6618, + "time_per_iteration": 2.451362371444702 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.04263783, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.9091502235972209, + "language_loss": 0.65847683, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6800065, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.74609375, + "step": 6619, + "time_per_iteration": 2.5479021072387695 + }, + { + "auxiliary_loss_clip": 0.01041684, + "auxiliary_loss_mlp": 0.00999907, + "balance_loss_clip": 0.99843466, + "balance_loss_mlp": 1.0170114, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7720250582246381, + "language_loss": 0.58222711, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60264301, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.24609375, + "step": 6620, + "time_per_iteration": 3.0502688884735107 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.02383971, + "balance_loss_mlp": 1.04254556, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5861085047038441, + "language_loss": 0.79551339, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81703556, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 6621, + "time_per_iteration": 2.4595162868499756 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01709294, + "balance_loss_mlp": 1.04198873, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.9955210259775171, + "language_loss": 0.78070045, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80215347, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 6622, + "time_per_iteration": 2.487805128097534 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.01685607, + "balance_loss_mlp": 1.04132223, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.5290489885204759, + "language_loss": 0.75010175, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77156758, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6623, + "time_per_iteration": 2.464571714401245 + }, + { + "auxiliary_loss_clip": 0.01116211, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.02278805, + "balance_loss_mlp": 1.04220378, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.7278538768787957, + "language_loss": 0.79535556, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81688213, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6624, + "time_per_iteration": 2.4550037384033203 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.02627707, + "balance_loss_mlp": 1.04234707, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.035642441182755, + "language_loss": 0.83558613, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85720372, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6625, + "time_per_iteration": 2.456171989440918 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.02839124, + "balance_loss_mlp": 1.04085207, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.051687002705142, + "language_loss": 0.86593187, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88750064, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6626, + "time_per_iteration": 2.4335460662841797 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.02238643, + "balance_loss_mlp": 1.04094946, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3900066005878386, + "language_loss": 0.83897698, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86049473, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7421875, + "step": 6627, + "time_per_iteration": 2.4269766807556152 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.03955984, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4398151096773946, + "language_loss": 0.82760668, + "learning_rate": 2.736806725217998e-06, + "loss": 0.8491019, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6628, + "time_per_iteration": 2.529315948486328 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.027421, + "balance_loss_mlp": 1.04130399, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.8256672588255014, + "language_loss": 0.70683473, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.72839677, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6629, + "time_per_iteration": 2.5025413036346436 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.02155161, + "balance_loss_mlp": 1.04309297, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 4.278612279497538, + "language_loss": 0.80683714, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82833099, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 6630, + "time_per_iteration": 2.4792280197143555 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01485634, + "balance_loss_mlp": 1.04143131, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.8749880656247468, + "language_loss": 0.75354141, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.7749849, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6631, + "time_per_iteration": 2.417546272277832 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.0218699, + "balance_loss_mlp": 1.04246461, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.3246230169523194, + "language_loss": 0.7156167, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73713982, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 6632, + "time_per_iteration": 2.446089744567871 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.02449358, + "balance_loss_mlp": 1.03939462, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.8450465759001686, + "language_loss": 0.74742806, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76891041, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6633, + "time_per_iteration": 2.431104898452759 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01455402, + "balance_loss_mlp": 1.03961205, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.781985159362602, + "language_loss": 0.808864, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83027852, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 6634, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.01856947, + "balance_loss_mlp": 1.04252565, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.7295196741572958, + "language_loss": 0.74605262, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.7675429, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6635, + "time_per_iteration": 2.4630682468414307 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.02930093, + "balance_loss_mlp": 1.04096711, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.9670463450002986, + "language_loss": 0.66429746, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68594521, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6636, + "time_per_iteration": 2.454789876937866 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02131867, + "balance_loss_mlp": 1.0403626, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.695302941119513, + "language_loss": 0.81410646, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83558261, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6637, + "time_per_iteration": 5.387745380401611 + }, + { + "auxiliary_loss_clip": 0.01040567, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99893045, + "balance_loss_mlp": 1.0159328, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7139106827959352, + "language_loss": 0.53211641, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55252659, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.01513672, + "router_z_loss_mlp": 0.24609375, + "step": 6638, + "time_per_iteration": 4.465191125869751 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.02260959, + "balance_loss_mlp": 1.04064405, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.9403504228046689, + "language_loss": 0.75377512, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77527201, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6639, + "time_per_iteration": 2.4947104454040527 + }, + { + "auxiliary_loss_clip": 0.01112086, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.01788926, + "balance_loss_mlp": 1.04078937, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.632879790681491, + "language_loss": 0.76217377, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78360093, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 6640, + "time_per_iteration": 2.524815320968628 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.03855717, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.5962495804033794, + "language_loss": 0.82264209, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84414506, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6641, + "time_per_iteration": 2.4753921031951904 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01891971, + "balance_loss_mlp": 1.04188418, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.015070946619467, + "language_loss": 0.7685014, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78999245, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6642, + "time_per_iteration": 2.431239604949951 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.019642, + "balance_loss_mlp": 1.03963089, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.2960488262105145, + "language_loss": 0.7247656, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74624097, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6643, + "time_per_iteration": 2.4759740829467773 + }, + { + "auxiliary_loss_clip": 0.01115242, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.0214113, + "balance_loss_mlp": 1.04014993, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.5171926718970592, + "language_loss": 0.65988386, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68139005, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6644, + "time_per_iteration": 2.437404155731201 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.0235281, + "balance_loss_mlp": 1.0386616, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.956427678643188, + "language_loss": 0.78470129, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80620331, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6645, + "time_per_iteration": 2.44826078414917 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02034187, + "balance_loss_mlp": 1.04042077, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.409098570486763, + "language_loss": 0.69889182, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72038329, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6646, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.03869605, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7478077072518943, + "language_loss": 0.72165501, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74314553, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6647, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.03874063, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4937426139380796, + "language_loss": 0.74371958, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76518434, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 6648, + "time_per_iteration": 2.4970345497131348 + }, + { + "auxiliary_loss_clip": 0.01115329, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.02300286, + "balance_loss_mlp": 1.04061389, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.209642859907432, + "language_loss": 0.66124469, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68276298, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6649, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.02280378, + "balance_loss_mlp": 1.0420115, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.57860522688022, + "language_loss": 0.75273359, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77425814, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6650, + "time_per_iteration": 2.5091254711151123 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.03905869, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.4583647344722164, + "language_loss": 0.71954048, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74104279, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 6651, + "time_per_iteration": 2.4820897579193115 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02220726, + "balance_loss_mlp": 1.03815126, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.787132664616244, + "language_loss": 0.72906494, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75055599, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6652, + "time_per_iteration": 2.4568119049072266 + }, + { + "auxiliary_loss_clip": 0.01039541, + "auxiliary_loss_mlp": 0.00999581, + "balance_loss_clip": 0.99819815, + "balance_loss_mlp": 1.01483345, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8299860195083637, + "language_loss": 0.61066198, + "learning_rate": 2.727746297241862e-06, + "loss": 0.63105321, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24707031, + "step": 6653, + "time_per_iteration": 3.0071723461151123 + }, + { + "auxiliary_loss_clip": 0.01113323, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.02607179, + "balance_loss_mlp": 1.04303741, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.127427836980077, + "language_loss": 0.67038172, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.6919046, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 6654, + "time_per_iteration": 2.442049026489258 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.02741051, + "balance_loss_mlp": 1.03887355, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.299433298478917, + "language_loss": 0.89737195, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91889656, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.74609375, + "step": 6655, + "time_per_iteration": 2.4836323261260986 + }, + { + "auxiliary_loss_clip": 0.01110377, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.01865685, + "balance_loss_mlp": 1.04077053, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5855954082229138, + "language_loss": 0.73497427, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75637716, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6953125, + "step": 6656, + "time_per_iteration": 2.5071847438812256 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.03248513, + "balance_loss_mlp": 1.04179835, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.4675228136273628, + "language_loss": 0.7344414, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75607085, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6657, + "time_per_iteration": 2.575587034225464 + }, + { + "auxiliary_loss_clip": 0.01116565, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.02432823, + "balance_loss_mlp": 1.04162562, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4527474123065993, + "language_loss": 0.79588759, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81743878, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6658, + "time_per_iteration": 2.7093567848205566 + }, + { + "auxiliary_loss_clip": 0.01115311, + "auxiliary_loss_mlp": 0.01041911, + "balance_loss_clip": 1.02917993, + "balance_loss_mlp": 1.0406971, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.8904694620172307, + "language_loss": 0.77345288, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79502499, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6659, + "time_per_iteration": 2.5323445796966553 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.01713443, + "balance_loss_mlp": 1.03853416, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.7715585064718242, + "language_loss": 0.72642064, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.7477653, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 6660, + "time_per_iteration": 2.4459004402160645 + }, + { + "auxiliary_loss_clip": 0.01113964, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.02753496, + "balance_loss_mlp": 1.04069686, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.7053131194953803, + "language_loss": 0.70897067, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73050702, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.73046875, + "step": 6661, + "time_per_iteration": 2.5339720249176025 + }, + { + "auxiliary_loss_clip": 0.011183, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02808094, + "balance_loss_mlp": 1.04304504, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.7756888608898216, + "language_loss": 0.75688839, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77848476, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6662, + "time_per_iteration": 2.4546353816986084 + }, + { + "auxiliary_loss_clip": 0.01115994, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.01979184, + "balance_loss_mlp": 1.03956914, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.0032115325237076, + "language_loss": 0.66019243, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68168688, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6663, + "time_per_iteration": 2.4437708854675293 + }, + { + "auxiliary_loss_clip": 0.01115313, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.02409601, + "balance_loss_mlp": 1.0406127, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.5671112933527542, + "language_loss": 0.85808247, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87960517, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6664, + "time_per_iteration": 2.423644781112671 + }, + { + "auxiliary_loss_clip": 0.01116399, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.02062321, + "balance_loss_mlp": 1.04155052, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 1.9940684324093096, + "language_loss": 0.84890211, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87040305, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6665, + "time_per_iteration": 2.4386377334594727 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.04381645, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.7199178144884215, + "language_loss": 0.78264785, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.8041926, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6666, + "time_per_iteration": 2.434093952178955 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02121019, + "balance_loss_mlp": 1.04240537, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.6354204552723763, + "language_loss": 0.73558462, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75712276, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6667, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.01117838, + "auxiliary_loss_mlp": 0.0104414, + "balance_loss_clip": 1.02944148, + "balance_loss_mlp": 1.04147649, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.370510933760038, + "language_loss": 0.75832677, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77994657, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6668, + "time_per_iteration": 2.475261688232422 + }, + { + "auxiliary_loss_clip": 0.0111899, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04511833, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.7348003262037657, + "language_loss": 0.82309943, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84467208, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73828125, + "step": 6669, + "time_per_iteration": 2.530458927154541 + }, + { + "auxiliary_loss_clip": 0.0103961, + "auxiliary_loss_mlp": 0.01001267, + "balance_loss_clip": 0.99993151, + "balance_loss_mlp": 1.01565075, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.698912500879513, + "language_loss": 0.53386176, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55427051, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 6670, + "time_per_iteration": 3.247837781906128 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02476037, + "balance_loss_mlp": 1.0415678, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.8543411810419943, + "language_loss": 0.88405877, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.9055897, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6671, + "time_per_iteration": 2.5657830238342285 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.02088797, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.813982967664466, + "language_loss": 0.78926146, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81076294, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 6672, + "time_per_iteration": 2.444209337234497 + }, + { + "auxiliary_loss_clip": 0.01110996, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01829374, + "balance_loss_mlp": 1.03889108, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 1.9086088279717175, + "language_loss": 0.63218224, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65360266, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 6673, + "time_per_iteration": 2.577171564102173 + }, + { + "auxiliary_loss_clip": 0.01114754, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.02027059, + "balance_loss_mlp": 1.0399313, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4529148407259798, + "language_loss": 0.80390126, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82538271, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6674, + "time_per_iteration": 2.5402464866638184 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04199886, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.6082453610380574, + "language_loss": 0.82641548, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84791422, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6675, + "time_per_iteration": 2.4605085849761963 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.0189873, + "balance_loss_mlp": 1.04002738, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.7135878896985557, + "language_loss": 0.93308246, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95454895, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6676, + "time_per_iteration": 2.496168851852417 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.02516031, + "balance_loss_mlp": 1.04100275, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 4.942241320167032, + "language_loss": 0.79622304, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81783295, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 6677, + "time_per_iteration": 2.4565844535827637 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.01977801, + "balance_loss_mlp": 1.0429368, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 3.7422980142657374, + "language_loss": 0.83766311, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85915917, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 6678, + "time_per_iteration": 4.12173318862915 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.02480578, + "balance_loss_mlp": 1.04150224, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.988298740497095, + "language_loss": 0.63948399, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66100478, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6679, + "time_per_iteration": 5.297976016998291 + }, + { + "auxiliary_loss_clip": 0.01112719, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01653934, + "balance_loss_mlp": 1.04000115, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.456514191681199, + "language_loss": 0.78654617, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80796885, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6680, + "time_per_iteration": 2.467042922973633 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.04160023, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.6886011670643926, + "language_loss": 0.75628668, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77793747, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6681, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01727676, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.058228157074571, + "language_loss": 0.64001781, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66150093, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 6682, + "time_per_iteration": 2.4423694610595703 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01953304, + "balance_loss_mlp": 1.03868985, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.6867457181896433, + "language_loss": 0.73334014, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75481766, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6683, + "time_per_iteration": 2.5543196201324463 + }, + { + "auxiliary_loss_clip": 0.01113172, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.02971554, + "balance_loss_mlp": 1.03814459, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.6157462356379846, + "language_loss": 0.73054385, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75210762, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6684, + "time_per_iteration": 2.584984302520752 + }, + { + "auxiliary_loss_clip": 0.01036703, + "auxiliary_loss_mlp": 0.01002873, + "balance_loss_clip": 1.0016098, + "balance_loss_mlp": 1.01262808, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8051502477983452, + "language_loss": 0.60442972, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62482548, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.24023438, + "step": 6685, + "time_per_iteration": 3.2001583576202393 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.0410161, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 2.1343445795660956, + "language_loss": 0.69979215, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72130144, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6686, + "time_per_iteration": 2.486487627029419 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01784527, + "balance_loss_mlp": 1.03917289, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4076322562781298, + "language_loss": 0.74622524, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76766562, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6687, + "time_per_iteration": 2.4854915142059326 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02219915, + "balance_loss_mlp": 1.04146934, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.852699339351418, + "language_loss": 0.70648831, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72802114, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 6688, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01117224, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.02558923, + "balance_loss_mlp": 1.0390867, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.7360862235805987, + "language_loss": 0.64509618, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.6666646, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6689, + "time_per_iteration": 2.5217337608337402 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.02059698, + "balance_loss_mlp": 1.03956485, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.322807889185569, + "language_loss": 0.7306338, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75214565, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6690, + "time_per_iteration": 2.421478509902954 + }, + { + "auxiliary_loss_clip": 0.01114039, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.02357256, + "balance_loss_mlp": 1.03967643, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.4867559931284213, + "language_loss": 0.74789405, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76940262, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6691, + "time_per_iteration": 2.5322606563568115 + }, + { + "auxiliary_loss_clip": 0.01119421, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.02530634, + "balance_loss_mlp": 1.04281604, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5836527032457117, + "language_loss": 0.72676492, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74834728, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6692, + "time_per_iteration": 2.486466407775879 + }, + { + "auxiliary_loss_clip": 0.01115579, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.02444792, + "balance_loss_mlp": 1.039814, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 1.7516389520719526, + "language_loss": 0.83851349, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86004555, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6693, + "time_per_iteration": 2.5068037509918213 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.02325296, + "balance_loss_mlp": 1.04313457, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.699829604816944, + "language_loss": 0.71295136, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73450321, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6694, + "time_per_iteration": 2.5704145431518555 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.03981924, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.0155422945498223, + "language_loss": 0.67754763, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69907242, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 6695, + "time_per_iteration": 2.4664762020111084 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01736653, + "balance_loss_mlp": 1.03826809, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.459399840574827, + "language_loss": 0.79355788, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81499356, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6696, + "time_per_iteration": 2.883577346801758 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.02291059, + "balance_loss_mlp": 1.04224885, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.6846278858215487, + "language_loss": 0.70899725, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73055387, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6697, + "time_per_iteration": 2.4922237396240234 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.02555108, + "balance_loss_mlp": 1.04018331, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.4926240162149162, + "language_loss": 0.61456931, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63608658, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 6698, + "time_per_iteration": 2.4892961978912354 + }, + { + "auxiliary_loss_clip": 0.01115982, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.01840675, + "balance_loss_mlp": 1.03997493, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.8414423865451628, + "language_loss": 0.76245844, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78393662, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6699, + "time_per_iteration": 2.4576990604400635 + }, + { + "auxiliary_loss_clip": 0.01110513, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.0171113, + "balance_loss_mlp": 1.03855538, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.5844300780087603, + "language_loss": 0.80345184, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82486057, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 6700, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02886689, + "balance_loss_mlp": 1.03814912, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 2.2662820598104227, + "language_loss": 0.74967611, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77129138, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6701, + "time_per_iteration": 2.5474703311920166 + }, + { + "auxiliary_loss_clip": 0.01112492, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.02598631, + "balance_loss_mlp": 1.03800225, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5176135502188826, + "language_loss": 0.65989178, + "learning_rate": 2.709938026276208e-06, + "loss": 0.6814059, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6702, + "time_per_iteration": 2.5158073902130127 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.02409053, + "balance_loss_mlp": 1.03949153, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.577366316976287, + "language_loss": 0.66134161, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68289495, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6703, + "time_per_iteration": 2.4974560737609863 + }, + { + "auxiliary_loss_clip": 0.01119665, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04285431, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.6870156282512245, + "language_loss": 0.82005399, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84160155, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6704, + "time_per_iteration": 2.5040299892425537 + }, + { + "auxiliary_loss_clip": 0.01117271, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.02110291, + "balance_loss_mlp": 1.03974569, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 2.5805971030690578, + "language_loss": 0.73468685, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75620878, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6705, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.02208292, + "balance_loss_mlp": 1.03979278, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.712587367637223, + "language_loss": 0.66288096, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68436766, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 6706, + "time_per_iteration": 2.4254331588745117 + }, + { + "auxiliary_loss_clip": 0.01114724, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.02619088, + "balance_loss_mlp": 1.03957605, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.3675174561755612, + "language_loss": 0.71328777, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73483431, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6707, + "time_per_iteration": 2.5285422801971436 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02169156, + "balance_loss_mlp": 1.03867698, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.4937460074112463, + "language_loss": 0.80080485, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82226288, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 6708, + "time_per_iteration": 2.4664134979248047 + }, + { + "auxiliary_loss_clip": 0.01117266, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.02170968, + "balance_loss_mlp": 1.03778601, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.013607365016592, + "language_loss": 0.82944471, + "learning_rate": 2.70738867321606e-06, + "loss": 0.8509779, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 6709, + "time_per_iteration": 2.461277723312378 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02454066, + "balance_loss_mlp": 1.04260051, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.4165591336273893, + "language_loss": 0.71036613, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73194492, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6710, + "time_per_iteration": 2.5579922199249268 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.02461195, + "balance_loss_mlp": 1.04049003, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 1.9864485278108117, + "language_loss": 0.85366702, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87521464, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6711, + "time_per_iteration": 2.511082410812378 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.02038825, + "balance_loss_mlp": 1.04072142, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.9069456024701996, + "language_loss": 0.76074743, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78225803, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6712, + "time_per_iteration": 2.419672727584839 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.02364349, + "balance_loss_mlp": 1.04200089, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 2.1216019240756765, + "language_loss": 0.78926992, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81082511, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6713, + "time_per_iteration": 2.520109176635742 + }, + { + "auxiliary_loss_clip": 0.01113814, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01912916, + "balance_loss_mlp": 1.03721881, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8945946455640421, + "language_loss": 0.88507473, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6714, + "time_per_iteration": 2.4076859951019287 + }, + { + "auxiliary_loss_clip": 0.01115997, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.02688611, + "balance_loss_mlp": 1.04049468, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 2.116493132238348, + "language_loss": 0.69099832, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71256685, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6715, + "time_per_iteration": 2.4805076122283936 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.01785374, + "balance_loss_mlp": 1.03944981, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 7.495764991407429, + "language_loss": 0.76919901, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79069078, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6716, + "time_per_iteration": 2.4244720935821533 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01037832, + "balance_loss_clip": 1.02485037, + "balance_loss_mlp": 1.03992844, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.8407988101654404, + "language_loss": 0.76272923, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78423738, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6717, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01036903, + "auxiliary_loss_mlp": 0.01007011, + "balance_loss_clip": 1.00571179, + "balance_loss_mlp": 1.01217222, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.940083561343906, + "language_loss": 0.60735488, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62779397, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24707031, + "step": 6718, + "time_per_iteration": 2.9391937255859375 + }, + { + "auxiliary_loss_clip": 0.01120577, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.02140856, + "balance_loss_mlp": 1.04066229, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.1744660134680776, + "language_loss": 0.74794078, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76950943, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6719, + "time_per_iteration": 2.4630534648895264 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01043036, + "balance_loss_clip": 1.02762175, + "balance_loss_mlp": 1.0402683, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.5217598497166422, + "language_loss": 0.81235194, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83395278, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6720, + "time_per_iteration": 6.786137104034424 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.01644325, + "balance_loss_mlp": 1.0376296, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.933287838521713, + "language_loss": 0.7720241, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79346573, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6721, + "time_per_iteration": 3.9910030364990234 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01460266, + "balance_loss_mlp": 1.04090941, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 2.3110658804222566, + "language_loss": 0.7264756, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74787009, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6722, + "time_per_iteration": 2.5377390384674072 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.02270842, + "balance_loss_mlp": 1.03896952, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.7096890061042316, + "language_loss": 0.65681767, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67831796, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6723, + "time_per_iteration": 2.429657220840454 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.02834117, + "balance_loss_mlp": 1.04056454, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.4515559648574707, + "language_loss": 0.74074364, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76235622, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6724, + "time_per_iteration": 2.485166072845459 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03799534, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.783988932028688, + "language_loss": 0.74764013, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76908118, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 6725, + "time_per_iteration": 2.5141966342926025 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.02024531, + "balance_loss_mlp": 1.03874183, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.8781247850607437, + "language_loss": 0.76928914, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79076171, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6726, + "time_per_iteration": 2.685609817504883 + }, + { + "auxiliary_loss_clip": 0.01115432, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.02004611, + "balance_loss_mlp": 1.03858769, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.1780936913008646, + "language_loss": 0.81682861, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83832943, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6727, + "time_per_iteration": 2.4221317768096924 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02088916, + "balance_loss_mlp": 1.0411514, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.0089286405461246, + "language_loss": 0.85300338, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87451458, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6728, + "time_per_iteration": 2.4719340801239014 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02059257, + "balance_loss_mlp": 1.04241705, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.6690883830899332, + "language_loss": 0.81804991, + "learning_rate": 2.700097580951786e-06, + "loss": 0.8395654, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6729, + "time_per_iteration": 2.4482905864715576 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.02092838, + "balance_loss_mlp": 1.04034996, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 1.841339511320202, + "language_loss": 0.72582501, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.74731869, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6730, + "time_per_iteration": 2.537121295928955 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.0216732, + "balance_loss_mlp": 1.04037821, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6090983176176454, + "language_loss": 0.67394918, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69545048, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6731, + "time_per_iteration": 2.645958423614502 + }, + { + "auxiliary_loss_clip": 0.01114135, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.02196193, + "balance_loss_mlp": 1.03986645, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.6078062973222544, + "language_loss": 0.74067897, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76216894, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6732, + "time_per_iteration": 2.5182886123657227 + }, + { + "auxiliary_loss_clip": 0.01114756, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01650739, + "balance_loss_mlp": 1.04178488, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.830865433765548, + "language_loss": 0.7690779, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79051435, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 6733, + "time_per_iteration": 2.430748701095581 + }, + { + "auxiliary_loss_clip": 0.01120623, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.04164028, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.8916182343646197, + "language_loss": 0.7649287, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78652358, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6734, + "time_per_iteration": 2.507070541381836 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.01729572, + "balance_loss_mlp": 1.04258728, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.227264135735927, + "language_loss": 0.65026176, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67178231, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7890625, + "step": 6735, + "time_per_iteration": 2.4677040576934814 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.02154267, + "balance_loss_mlp": 1.04025424, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.9551652085107198, + "language_loss": 0.83177966, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85325354, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 6736, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.01121161, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.02344942, + "balance_loss_mlp": 1.04291666, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.699075737504615, + "language_loss": 0.7520684, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77365613, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6737, + "time_per_iteration": 2.510906457901001 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.02408242, + "balance_loss_mlp": 1.04335642, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.288492776548484, + "language_loss": 0.71790028, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73945308, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6738, + "time_per_iteration": 2.514575481414795 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.01845288, + "balance_loss_mlp": 1.04022241, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.003378473366394, + "language_loss": 0.75169361, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77315164, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6739, + "time_per_iteration": 2.4737000465393066 + }, + { + "auxiliary_loss_clip": 0.01119431, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02107763, + "balance_loss_mlp": 1.04296541, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.7865413260400147, + "language_loss": 0.73943472, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76097751, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6740, + "time_per_iteration": 2.5434296131134033 + }, + { + "auxiliary_loss_clip": 0.0111643, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0190562, + "balance_loss_mlp": 1.04310441, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5350516452213203, + "language_loss": 0.77179801, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79328907, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6741, + "time_per_iteration": 2.473451852798462 + }, + { + "auxiliary_loss_clip": 0.01120883, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.04359269, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8990417013226273, + "language_loss": 0.70827335, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.72983992, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 6742, + "time_per_iteration": 2.4797537326812744 + }, + { + "auxiliary_loss_clip": 0.01121445, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.01517677, + "balance_loss_mlp": 1.04446578, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.180199258846301, + "language_loss": 0.72242743, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74393857, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6743, + "time_per_iteration": 2.409444808959961 + }, + { + "auxiliary_loss_clip": 0.0112179, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02128196, + "balance_loss_mlp": 1.04374886, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 3.287949139408167, + "language_loss": 0.70554733, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72712195, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6744, + "time_per_iteration": 2.475775957107544 + }, + { + "auxiliary_loss_clip": 0.01116341, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.02154207, + "balance_loss_mlp": 1.04163671, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.534678646828984, + "language_loss": 0.79982138, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82133317, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6745, + "time_per_iteration": 2.492379903793335 + }, + { + "auxiliary_loss_clip": 0.01120523, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.0463028, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.8557240822638386, + "language_loss": 0.66450787, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68604791, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6746, + "time_per_iteration": 2.4547531604766846 + }, + { + "auxiliary_loss_clip": 0.01118105, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01573384, + "balance_loss_mlp": 1.04319298, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.5006534813974708, + "language_loss": 0.5713616, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59283465, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6747, + "time_per_iteration": 2.627912998199463 + }, + { + "auxiliary_loss_clip": 0.01119274, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.04399908, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.605109327396707, + "language_loss": 0.8454957, + "learning_rate": 2.693161205655089e-06, + "loss": 0.8671056, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75390625, + "step": 6748, + "time_per_iteration": 2.5783345699310303 + }, + { + "auxiliary_loss_clip": 0.01120452, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02210689, + "balance_loss_mlp": 1.04356313, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.1468645636667705, + "language_loss": 0.81288636, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83444953, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6749, + "time_per_iteration": 2.433042049407959 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.02395105, + "balance_loss_mlp": 1.04512405, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.6093122324869749, + "language_loss": 0.75051296, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77209336, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6750, + "time_per_iteration": 2.500444173812866 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.01919341, + "balance_loss_mlp": 1.04114318, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.1309201825140662, + "language_loss": 0.73826647, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75980842, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6751, + "time_per_iteration": 2.4808826446533203 + }, + { + "auxiliary_loss_clip": 0.01120759, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.04471755, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 5.559089751596236, + "language_loss": 0.6666553, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68818188, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6752, + "time_per_iteration": 2.568223714828491 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.02189183, + "balance_loss_mlp": 1.04458666, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.70284971706228, + "language_loss": 0.70600617, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72761416, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 6753, + "time_per_iteration": 2.696746587753296 + }, + { + "auxiliary_loss_clip": 0.01119466, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.02145791, + "balance_loss_mlp": 1.04105067, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.1929566205477804, + "language_loss": 0.71584499, + "learning_rate": 2.690968795494699e-06, + "loss": 0.73739791, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 6754, + "time_per_iteration": 2.49405837059021 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02568889, + "balance_loss_mlp": 1.04273617, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7112877357577985, + "language_loss": 0.82864529, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85024333, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6755, + "time_per_iteration": 2.4666147232055664 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.02599001, + "balance_loss_mlp": 1.04292035, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.484337354822898, + "language_loss": 0.70812732, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72976315, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6756, + "time_per_iteration": 2.539236307144165 + }, + { + "auxiliary_loss_clip": 0.01120038, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02724361, + "balance_loss_mlp": 1.04106975, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.6617053894159006, + "language_loss": 0.79047221, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81209117, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6757, + "time_per_iteration": 2.4614784717559814 + }, + { + "auxiliary_loss_clip": 0.01121935, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.01865852, + "balance_loss_mlp": 1.04454553, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.5700268222495364, + "language_loss": 0.7851724, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.806723, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6758, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.01692557, + "balance_loss_mlp": 1.04113591, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.1344538838988454, + "language_loss": 0.88668954, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90817189, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 6759, + "time_per_iteration": 2.410628318786621 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.01986527, + "balance_loss_mlp": 1.04366982, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.0728742760332546, + "language_loss": 0.63888443, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66042268, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6760, + "time_per_iteration": 2.553819417953491 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.01796103, + "balance_loss_mlp": 1.0422858, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4242582463540345, + "language_loss": 0.75060493, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77212334, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6761, + "time_per_iteration": 2.520904302597046 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02029395, + "balance_loss_mlp": 1.04054725, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4265975037167853, + "language_loss": 0.70109248, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72254199, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6762, + "time_per_iteration": 6.884980916976929 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.01893568, + "balance_loss_mlp": 1.04316521, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 2.223786523351799, + "language_loss": 0.73175049, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75325227, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6763, + "time_per_iteration": 3.8783130645751953 + }, + { + "auxiliary_loss_clip": 0.01119915, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02200174, + "balance_loss_mlp": 1.04246914, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.725584811158307, + "language_loss": 0.6908524, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71241343, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6764, + "time_per_iteration": 2.4408676624298096 + }, + { + "auxiliary_loss_clip": 0.01123793, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02481747, + "balance_loss_mlp": 1.04485261, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.20566464671706, + "language_loss": 0.91570717, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93734777, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6765, + "time_per_iteration": 2.4904191493988037 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.02672434, + "balance_loss_mlp": 1.04374599, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.279622168201086, + "language_loss": 0.78459442, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80623996, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6766, + "time_per_iteration": 2.4594480991363525 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02457929, + "balance_loss_mlp": 1.04144108, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.9487336600068845, + "language_loss": 0.76438922, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78597391, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6767, + "time_per_iteration": 2.4127700328826904 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.0442543, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.7431301492707811, + "language_loss": 0.77572781, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79728222, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6768, + "time_per_iteration": 2.5312347412109375 + }, + { + "auxiliary_loss_clip": 0.01118014, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.01861525, + "balance_loss_mlp": 1.04248428, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.7094466648077935, + "language_loss": 0.87585759, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89735663, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6769, + "time_per_iteration": 2.434276580810547 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.02028155, + "balance_loss_mlp": 1.04659963, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.8989360481904207, + "language_loss": 0.80883789, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83036822, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 6770, + "time_per_iteration": 2.4768316745758057 + }, + { + "auxiliary_loss_clip": 0.01121746, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01699948, + "balance_loss_mlp": 1.04308331, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6240016049823844, + "language_loss": 0.80161405, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82315195, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78515625, + "step": 6771, + "time_per_iteration": 2.4864251613616943 + }, + { + "auxiliary_loss_clip": 0.01116481, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.02476382, + "balance_loss_mlp": 1.04181063, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.5515756087522081, + "language_loss": 0.76267636, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.7842294, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 6772, + "time_per_iteration": 2.5570874214172363 + }, + { + "auxiliary_loss_clip": 0.01116059, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.0259316, + "balance_loss_mlp": 1.04014397, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.6577007729475706, + "language_loss": 0.81418705, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83575237, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6773, + "time_per_iteration": 2.4311835765838623 + }, + { + "auxiliary_loss_clip": 0.01040526, + "auxiliary_loss_mlp": 0.01005684, + "balance_loss_clip": 1.00416398, + "balance_loss_mlp": 1.01639521, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8363890316728796, + "language_loss": 0.6434871, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66394925, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.24121094, + "step": 6774, + "time_per_iteration": 2.987610340118408 + }, + { + "auxiliary_loss_clip": 0.01119504, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02136981, + "balance_loss_mlp": 1.04115796, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 13.875946104557459, + "language_loss": 0.72097111, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74252421, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6775, + "time_per_iteration": 2.5014185905456543 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.02736115, + "balance_loss_mlp": 1.04123604, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.3741783359801052, + "language_loss": 0.77956975, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80116785, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6776, + "time_per_iteration": 2.484910488128662 + }, + { + "auxiliary_loss_clip": 0.0112306, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.0240761, + "balance_loss_mlp": 1.04408884, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.6337418369090404, + "language_loss": 0.79015827, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81177437, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6777, + "time_per_iteration": 2.4701852798461914 + }, + { + "auxiliary_loss_clip": 0.01039569, + "auxiliary_loss_mlp": 0.01005822, + "balance_loss_clip": 1.00424814, + "balance_loss_mlp": 1.01542926, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6828077953919364, + "language_loss": 0.5320037, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55245763, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.2421875, + "step": 6778, + "time_per_iteration": 3.117647647857666 + }, + { + "auxiliary_loss_clip": 0.01119188, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.02658224, + "balance_loss_mlp": 1.04310179, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.2984205071258272, + "language_loss": 0.82367444, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84527671, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 6779, + "time_per_iteration": 2.4653449058532715 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.02289438, + "balance_loss_mlp": 1.0422008, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.7439910283418456, + "language_loss": 0.7628178, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78437853, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 6780, + "time_per_iteration": 2.5031514167785645 + }, + { + "auxiliary_loss_clip": 0.01114202, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.04146945, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 2.107375049179959, + "language_loss": 0.65990937, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68135262, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 6781, + "time_per_iteration": 2.431759834289551 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.02233076, + "balance_loss_mlp": 1.04050446, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 2.315782733130647, + "language_loss": 0.71046883, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73201013, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6782, + "time_per_iteration": 2.567138433456421 + }, + { + "auxiliary_loss_clip": 0.01117461, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.01789367, + "balance_loss_mlp": 1.04120076, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.7193598407967954, + "language_loss": 0.82066965, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84215903, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 6783, + "time_per_iteration": 2.46891188621521 + }, + { + "auxiliary_loss_clip": 0.01116877, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.02065194, + "balance_loss_mlp": 1.04063141, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6682285001774693, + "language_loss": 0.80728561, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82880187, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6784, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01122913, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.0258038, + "balance_loss_mlp": 1.04271793, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.7628578717327703, + "language_loss": 0.65640736, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67805004, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 6785, + "time_per_iteration": 2.46173357963562 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01650357, + "balance_loss_mlp": 1.0397855, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 1.9756209352263352, + "language_loss": 0.79518569, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81664044, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6786, + "time_per_iteration": 2.430769205093384 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.02349889, + "balance_loss_mlp": 1.04094195, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.5131366331092475, + "language_loss": 0.81249726, + "learning_rate": 2.678893759192982e-06, + "loss": 0.8340168, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6787, + "time_per_iteration": 2.4589040279388428 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.01907516, + "balance_loss_mlp": 1.04059005, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9559544882723985, + "language_loss": 0.67917293, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70066231, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75390625, + "step": 6788, + "time_per_iteration": 2.4450576305389404 + }, + { + "auxiliary_loss_clip": 0.01116018, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02644002, + "balance_loss_mlp": 1.03975677, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 2.2689407766698584, + "language_loss": 0.6605472, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68211812, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6789, + "time_per_iteration": 2.6358134746551514 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02133691, + "balance_loss_mlp": 1.0408318, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.683929923970831, + "language_loss": 0.60006517, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62159079, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6790, + "time_per_iteration": 2.4339373111724854 + }, + { + "auxiliary_loss_clip": 0.01117331, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.04277873, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 3.0836688581186538, + "language_loss": 0.69763649, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71922624, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 6791, + "time_per_iteration": 2.3970839977264404 + }, + { + "auxiliary_loss_clip": 0.01036371, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 0.99973643, + "balance_loss_mlp": 1.01245427, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7479961411193888, + "language_loss": 0.59600538, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61638063, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.23828125, + "step": 6792, + "time_per_iteration": 3.0660579204559326 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.03205419, + "balance_loss_mlp": 1.04478419, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 2.1865523890186975, + "language_loss": 0.8017205, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82341087, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6793, + "time_per_iteration": 2.4930570125579834 + }, + { + "auxiliary_loss_clip": 0.01117695, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.0203917, + "balance_loss_mlp": 1.04145718, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.7948567342085118, + "language_loss": 0.85040581, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87193215, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6794, + "time_per_iteration": 2.500248670578003 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.02316737, + "balance_loss_mlp": 1.04290628, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.6403079662436217, + "language_loss": 0.79991007, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82147229, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6795, + "time_per_iteration": 2.4969587326049805 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.01903319, + "balance_loss_mlp": 1.0415107, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 3.0496031094407767, + "language_loss": 0.69604456, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7176007, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6796, + "time_per_iteration": 2.415790319442749 + }, + { + "auxiliary_loss_clip": 0.01115637, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.02436018, + "balance_loss_mlp": 1.04028058, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6154855191434097, + "language_loss": 0.77814329, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.799676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6797, + "time_per_iteration": 2.4960498809814453 + }, + { + "auxiliary_loss_clip": 0.01114842, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_clip": 1.03020835, + "balance_loss_mlp": 1.03869152, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 2.268592052790042, + "language_loss": 0.85668063, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87826383, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 6798, + "time_per_iteration": 2.4271299839019775 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.02341557, + "balance_loss_mlp": 1.04205322, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.4625848333242037, + "language_loss": 0.8396889, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86120105, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 6799, + "time_per_iteration": 2.5059525966644287 + }, + { + "auxiliary_loss_clip": 0.01118535, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_clip": 1.02861547, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.2336787226224453, + "language_loss": 0.83352369, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85514188, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6800, + "time_per_iteration": 2.441771984100342 + }, + { + "auxiliary_loss_clip": 0.01118367, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.02665734, + "balance_loss_mlp": 1.04080248, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.815509221734431, + "language_loss": 0.74838769, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76998335, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 6801, + "time_per_iteration": 2.4573957920074463 + }, + { + "auxiliary_loss_clip": 0.01118841, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.02418303, + "balance_loss_mlp": 1.04115379, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 3.5876275394170682, + "language_loss": 0.79983771, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8214165, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6802, + "time_per_iteration": 2.4583706855773926 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04269981, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 1.9920926766799116, + "language_loss": 0.75564265, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77726078, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7890625, + "step": 6803, + "time_per_iteration": 3.8293817043304443 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.02296996, + "balance_loss_mlp": 1.04163659, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.8273723177462575, + "language_loss": 0.78676009, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80831397, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6804, + "time_per_iteration": 5.276589393615723 + }, + { + "auxiliary_loss_clip": 0.01121753, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.02486575, + "balance_loss_mlp": 1.04170704, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.824409853433396, + "language_loss": 0.74958569, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77119195, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6805, + "time_per_iteration": 2.4856061935424805 + }, + { + "auxiliary_loss_clip": 0.01121334, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.02701998, + "balance_loss_mlp": 1.04323924, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.6270528279533119, + "language_loss": 0.79471934, + "learning_rate": 2.671928716175804e-06, + "loss": 0.816342, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6806, + "time_per_iteration": 2.4999823570251465 + }, + { + "auxiliary_loss_clip": 0.01120343, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.02002871, + "balance_loss_mlp": 1.04182625, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.8904572172377134, + "language_loss": 0.72131455, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74286067, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6807, + "time_per_iteration": 2.4900894165039062 + }, + { + "auxiliary_loss_clip": 0.01035827, + "auxiliary_loss_mlp": 0.01000695, + "balance_loss_clip": 0.99931204, + "balance_loss_mlp": 1.01169431, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8333385820049739, + "language_loss": 0.58798856, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60835379, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24121094, + "step": 6808, + "time_per_iteration": 3.1670446395874023 + }, + { + "auxiliary_loss_clip": 0.0111783, + "auxiliary_loss_mlp": 0.01047199, + "balance_loss_clip": 1.03397894, + "balance_loss_mlp": 1.04200959, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.6310291749342813, + "language_loss": 0.54454345, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56619376, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6809, + "time_per_iteration": 2.445084571838379 + }, + { + "auxiliary_loss_clip": 0.01117961, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0176785, + "balance_loss_mlp": 1.0413785, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.8964783600080724, + "language_loss": 0.83296275, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85445428, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6810, + "time_per_iteration": 2.507234573364258 + }, + { + "auxiliary_loss_clip": 0.01121577, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.02736187, + "balance_loss_mlp": 1.04350328, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.219108175656967, + "language_loss": 0.77739668, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79903591, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6811, + "time_per_iteration": 2.4652421474456787 + }, + { + "auxiliary_loss_clip": 0.01118877, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.01804352, + "balance_loss_mlp": 1.04151464, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.8555113442690365, + "language_loss": 0.69810557, + "learning_rate": 2.669727313417857e-06, + "loss": 0.7196303, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7734375, + "step": 6812, + "time_per_iteration": 2.4447555541992188 + }, + { + "auxiliary_loss_clip": 0.0111498, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.02644539, + "balance_loss_mlp": 1.03930998, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.4849650877087106, + "language_loss": 0.66131341, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68286985, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6813, + "time_per_iteration": 2.461461067199707 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.02209592, + "balance_loss_mlp": 1.04076779, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.8347983960230858, + "language_loss": 0.73899138, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76051652, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6814, + "time_per_iteration": 2.5444507598876953 + }, + { + "auxiliary_loss_clip": 0.01120309, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02035785, + "balance_loss_mlp": 1.04147315, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.162963447393967, + "language_loss": 0.65966797, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68121737, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6815, + "time_per_iteration": 2.4877898693084717 + }, + { + "auxiliary_loss_clip": 0.01116543, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.02371955, + "balance_loss_mlp": 1.04337275, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.6370882031659308, + "language_loss": 0.76553667, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78707623, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 6816, + "time_per_iteration": 2.5013954639434814 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02404261, + "balance_loss_mlp": 1.04302227, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.8457932880819463, + "language_loss": 0.81718624, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8387655, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6817, + "time_per_iteration": 2.407566785812378 + }, + { + "auxiliary_loss_clip": 0.01121536, + "auxiliary_loss_mlp": 0.01037881, + "balance_loss_clip": 1.02179992, + "balance_loss_mlp": 1.04166436, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7366839484469832, + "language_loss": 0.79938078, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82097495, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6818, + "time_per_iteration": 2.49364972114563 + }, + { + "auxiliary_loss_clip": 0.01114596, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02288651, + "balance_loss_mlp": 1.03982878, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.4683684500872527, + "language_loss": 0.65939564, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68090701, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6819, + "time_per_iteration": 2.5122451782226562 + }, + { + "auxiliary_loss_clip": 0.01123256, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.02191615, + "balance_loss_mlp": 1.04243147, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.4566856211473176, + "language_loss": 0.85411352, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87571383, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 6820, + "time_per_iteration": 2.4924051761627197 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.0189811, + "balance_loss_mlp": 1.04211807, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.9363068637508836, + "language_loss": 0.71033639, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73183382, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6821, + "time_per_iteration": 2.5236756801605225 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.03997672, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 2.2789873913326404, + "language_loss": 0.74732232, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76880419, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6822, + "time_per_iteration": 2.485173225402832 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02226305, + "balance_loss_mlp": 1.04145467, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.8990120981529888, + "language_loss": 0.7503438, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77188146, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6823, + "time_per_iteration": 2.46115779876709 + }, + { + "auxiliary_loss_clip": 0.01124707, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.02359605, + "balance_loss_mlp": 1.04229724, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 2.6227876605231986, + "language_loss": 0.73347652, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75511503, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6824, + "time_per_iteration": 2.504561185836792 + }, + { + "auxiliary_loss_clip": 0.01120752, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.01985359, + "balance_loss_mlp": 1.04105759, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 2.228764168551681, + "language_loss": 0.71601099, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.73756915, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6825, + "time_per_iteration": 2.476551055908203 + }, + { + "auxiliary_loss_clip": 0.01117579, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.02359533, + "balance_loss_mlp": 1.04292464, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9864880407367733, + "language_loss": 0.84743512, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86897874, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 6826, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.01117058, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.02066422, + "balance_loss_mlp": 1.0431006, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.962634793360081, + "language_loss": 0.66582263, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68734574, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 6827, + "time_per_iteration": 2.4629759788513184 + }, + { + "auxiliary_loss_clip": 0.01113749, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.01799607, + "balance_loss_mlp": 1.03989482, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3616881749334155, + "language_loss": 0.72346127, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74491906, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6828, + "time_per_iteration": 2.4807186126708984 + }, + { + "auxiliary_loss_clip": 0.01120586, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02777803, + "balance_loss_mlp": 1.0410856, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.900432401993592, + "language_loss": 0.83422399, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85585773, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6829, + "time_per_iteration": 2.4298055171966553 + }, + { + "auxiliary_loss_clip": 0.01114334, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.01940441, + "balance_loss_mlp": 1.03960419, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.5044787550344432, + "language_loss": 0.9002744, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92174798, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6830, + "time_per_iteration": 2.4607503414154053 + }, + { + "auxiliary_loss_clip": 0.01115903, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01268673, + "balance_loss_mlp": 1.04088628, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 2.455330668305064, + "language_loss": 0.65950698, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68092537, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6831, + "time_per_iteration": 2.4402008056640625 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02233577, + "balance_loss_mlp": 1.04019713, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.7503077174044546, + "language_loss": 0.69414657, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71564817, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6832, + "time_per_iteration": 2.4985976219177246 + }, + { + "auxiliary_loss_clip": 0.0111274, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02265322, + "balance_loss_mlp": 1.03861785, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.137055635154832, + "language_loss": 0.73675501, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75824058, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6833, + "time_per_iteration": 2.423802375793457 + }, + { + "auxiliary_loss_clip": 0.01120262, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.02648401, + "balance_loss_mlp": 1.04171228, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.6404428787043481, + "language_loss": 0.72538, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74699682, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6834, + "time_per_iteration": 2.5415680408477783 + }, + { + "auxiliary_loss_clip": 0.01119029, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.04038835, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.0754355899076717, + "language_loss": 0.71026015, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7318927, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6835, + "time_per_iteration": 2.4709722995758057 + }, + { + "auxiliary_loss_clip": 0.01118649, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.02521181, + "balance_loss_mlp": 1.04203069, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.9290870315127813, + "language_loss": 0.86998641, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89157486, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6836, + "time_per_iteration": 2.4478323459625244 + }, + { + "auxiliary_loss_clip": 0.01114601, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.02464342, + "balance_loss_mlp": 1.040609, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.7219230799083993, + "language_loss": 0.69017011, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71170223, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6837, + "time_per_iteration": 2.4600830078125 + }, + { + "auxiliary_loss_clip": 0.01116898, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.02394605, + "balance_loss_mlp": 1.04047167, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.7295939332860302, + "language_loss": 0.75087547, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77242649, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6838, + "time_per_iteration": 2.460449695587158 + }, + { + "auxiliary_loss_clip": 0.01115474, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02075219, + "balance_loss_mlp": 1.04058707, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.8679563507274572, + "language_loss": 0.82247162, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84398103, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 6839, + "time_per_iteration": 2.4339215755462646 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.01875496, + "balance_loss_mlp": 1.03766727, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.9294791670505813, + "language_loss": 0.80338049, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82482588, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6840, + "time_per_iteration": 2.464096784591675 + }, + { + "auxiliary_loss_clip": 0.01111724, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02119339, + "balance_loss_mlp": 1.03856099, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.7525143939260106, + "language_loss": 0.67515284, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.6966151, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 6841, + "time_per_iteration": 2.412872314453125 + }, + { + "auxiliary_loss_clip": 0.01035921, + "auxiliary_loss_mlp": 0.01010132, + "balance_loss_clip": 1.00880933, + "balance_loss_mlp": 1.01203704, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7700890610990695, + "language_loss": 0.5963515, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61681211, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23828125, + "step": 6842, + "time_per_iteration": 3.167282819747925 + }, + { + "auxiliary_loss_clip": 0.01111896, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.02044773, + "balance_loss_mlp": 1.04057288, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.121884132790859, + "language_loss": 0.69212461, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71357656, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 6843, + "time_per_iteration": 2.4664626121520996 + }, + { + "auxiliary_loss_clip": 0.01035393, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00222576, + "balance_loss_mlp": 1.01154804, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7178401469554447, + "language_loss": 0.53669417, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55708587, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23828125, + "step": 6844, + "time_per_iteration": 3.0998694896698 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.02213013, + "balance_loss_mlp": 1.03937054, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.6545259135728443, + "language_loss": 0.66114587, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68261302, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 6845, + "time_per_iteration": 6.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.0221113, + "balance_loss_mlp": 1.04133987, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.8380761864561301, + "language_loss": 0.70359266, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72507298, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 6846, + "time_per_iteration": 3.941171646118164 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02163363, + "balance_loss_mlp": 1.03892803, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.507800360258476, + "language_loss": 0.64964008, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67111951, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 6847, + "time_per_iteration": 2.5782458782196045 + }, + { + "auxiliary_loss_clip": 0.01112352, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02921724, + "balance_loss_mlp": 1.03790998, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.3239337291849294, + "language_loss": 0.70368952, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72524321, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6848, + "time_per_iteration": 2.552729606628418 + }, + { + "auxiliary_loss_clip": 0.01035603, + "auxiliary_loss_mlp": 0.00998835, + "balance_loss_clip": 0.99738103, + "balance_loss_mlp": 1.01178169, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8862972606407307, + "language_loss": 0.56235039, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58269477, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.23828125, + "step": 6849, + "time_per_iteration": 3.144639730453491 + }, + { + "auxiliary_loss_clip": 0.01112679, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.02255821, + "balance_loss_mlp": 1.04060721, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.58670522574793, + "language_loss": 0.76169646, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78318465, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 6850, + "time_per_iteration": 2.5668234825134277 + }, + { + "auxiliary_loss_clip": 0.01111269, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.02075291, + "balance_loss_mlp": 1.03937149, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.4904377439692653, + "language_loss": 0.67717403, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.69862366, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6851, + "time_per_iteration": 2.588646650314331 + }, + { + "auxiliary_loss_clip": 0.01116771, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.02792835, + "balance_loss_mlp": 1.03957748, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.5339755397297776, + "language_loss": 0.79547226, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81707215, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76953125, + "step": 6852, + "time_per_iteration": 2.4342472553253174 + }, + { + "auxiliary_loss_clip": 0.01120035, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.01818419, + "balance_loss_mlp": 1.04227197, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 3.302073757908878, + "language_loss": 0.78002989, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.80156463, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6853, + "time_per_iteration": 2.536959409713745 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.022156, + "balance_loss_mlp": 1.04021645, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.636675456410819, + "language_loss": 0.65871978, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68027961, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6854, + "time_per_iteration": 2.587641477584839 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.01973081, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.819965675297277, + "language_loss": 0.83530807, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85676759, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 6855, + "time_per_iteration": 2.5173020362854004 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.02333903, + "balance_loss_mlp": 1.03945315, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.701531451547931, + "language_loss": 0.7926302, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81411433, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 6856, + "time_per_iteration": 2.4496660232543945 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.021119, + "balance_loss_mlp": 1.04115629, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 6.346447490864035, + "language_loss": 0.79253089, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81403255, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6857, + "time_per_iteration": 2.454458236694336 + }, + { + "auxiliary_loss_clip": 0.01114343, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02169538, + "balance_loss_mlp": 1.03821683, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.6045712878894351, + "language_loss": 0.70696247, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72846603, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6858, + "time_per_iteration": 2.453808069229126 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.02162266, + "balance_loss_mlp": 1.04016411, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4836752505963042, + "language_loss": 0.59489501, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61638969, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6859, + "time_per_iteration": 2.6645431518554688 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.02283669, + "balance_loss_mlp": 1.03694463, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.017738864380765, + "language_loss": 0.73062313, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75211227, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6860, + "time_per_iteration": 2.4230310916900635 + }, + { + "auxiliary_loss_clip": 0.01111098, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02037311, + "balance_loss_mlp": 1.03779876, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.603033952512427, + "language_loss": 0.74057221, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76201528, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6861, + "time_per_iteration": 2.466261863708496 + }, + { + "auxiliary_loss_clip": 0.01111959, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.0190326, + "balance_loss_mlp": 1.04026282, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.017273954904035, + "language_loss": 0.79431915, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81575066, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6862, + "time_per_iteration": 2.4272851943969727 + }, + { + "auxiliary_loss_clip": 0.01115421, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.01946771, + "balance_loss_mlp": 1.04104841, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7023318630513873, + "language_loss": 0.76025152, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78173077, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6863, + "time_per_iteration": 2.491703987121582 + }, + { + "auxiliary_loss_clip": 0.01034073, + "auxiliary_loss_mlp": 0.01002883, + "balance_loss_clip": 1.00128579, + "balance_loss_mlp": 1.01038253, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.6998724627349664, + "language_loss": 0.52726007, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54762965, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.23632812, + "step": 6864, + "time_per_iteration": 3.05096173286438 + }, + { + "auxiliary_loss_clip": 0.01117449, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04010963, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.8277977271365335, + "language_loss": 0.72328234, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74480128, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6865, + "time_per_iteration": 2.5138418674468994 + }, + { + "auxiliary_loss_clip": 0.0103371, + "auxiliary_loss_mlp": 0.01003681, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00997901, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9175964026476935, + "language_loss": 0.66545808, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68583202, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.23730469, + "step": 6866, + "time_per_iteration": 2.965301513671875 + }, + { + "auxiliary_loss_clip": 0.01111664, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02442443, + "balance_loss_mlp": 1.03779757, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 1.9494269702964535, + "language_loss": 0.80854523, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8300401, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6867, + "time_per_iteration": 2.4153382778167725 + }, + { + "auxiliary_loss_clip": 0.01118424, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.02335191, + "balance_loss_mlp": 1.04141474, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.031901046820099, + "language_loss": 0.77580094, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.7973603, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 6868, + "time_per_iteration": 2.535595178604126 + }, + { + "auxiliary_loss_clip": 0.01033303, + "auxiliary_loss_mlp": 0.00999485, + "balance_loss_clip": 0.99789923, + "balance_loss_mlp": 1.0095768, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8413704541135547, + "language_loss": 0.5779494, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59827721, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23730469, + "step": 6869, + "time_per_iteration": 2.8164174556732178 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02101028, + "balance_loss_mlp": 1.03904057, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.6360017889096097, + "language_loss": 0.74995548, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77143168, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6870, + "time_per_iteration": 2.5370054244995117 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02757072, + "balance_loss_mlp": 1.03925085, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.504144022647526, + "language_loss": 0.83272427, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85428846, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6871, + "time_per_iteration": 2.596686601638794 + }, + { + "auxiliary_loss_clip": 0.01117357, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.02738237, + "balance_loss_mlp": 1.04108167, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 5.838045745285431, + "language_loss": 0.68951505, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71110535, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6872, + "time_per_iteration": 2.6045477390289307 + }, + { + "auxiliary_loss_clip": 0.01115693, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.02197695, + "balance_loss_mlp": 1.04050374, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.864312912622832, + "language_loss": 0.75716275, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.7786743, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6873, + "time_per_iteration": 2.4200570583343506 + }, + { + "auxiliary_loss_clip": 0.01117091, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.01943827, + "balance_loss_mlp": 1.04055679, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.671510122752512, + "language_loss": 0.82721817, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.84872413, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6874, + "time_per_iteration": 2.4689133167266846 + }, + { + "auxiliary_loss_clip": 0.01116401, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.0397824, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 2.003609916019722, + "language_loss": 0.71075761, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73227131, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6875, + "time_per_iteration": 2.4145123958587646 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02686, + "balance_loss_mlp": 1.04001451, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.617534223510663, + "language_loss": 0.82538921, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84689927, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6876, + "time_per_iteration": 2.49533748626709 + }, + { + "auxiliary_loss_clip": 0.01118483, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.03916812, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.858959916779265, + "language_loss": 0.65397477, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.6755476, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.79296875, + "step": 6877, + "time_per_iteration": 2.4231626987457275 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.01795483, + "balance_loss_mlp": 1.04000914, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 2.013643508242888, + "language_loss": 0.76686853, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78831995, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6878, + "time_per_iteration": 2.492220640182495 + }, + { + "auxiliary_loss_clip": 0.01114835, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02589822, + "balance_loss_mlp": 1.040084, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8674435899066546, + "language_loss": 0.80248523, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82403314, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 6879, + "time_per_iteration": 2.458623170852661 + }, + { + "auxiliary_loss_clip": 0.01114903, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02160931, + "balance_loss_mlp": 1.03960526, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.9200458523415633, + "language_loss": 0.84693611, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86844546, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75390625, + "step": 6880, + "time_per_iteration": 2.605189323425293 + }, + { + "auxiliary_loss_clip": 0.01116516, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.01478863, + "balance_loss_mlp": 1.04023683, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.672120688006926, + "language_loss": 0.70195448, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72340417, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6881, + "time_per_iteration": 2.4585211277008057 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.02450848, + "balance_loss_mlp": 1.04145753, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.702675342664879, + "language_loss": 0.81404376, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83554971, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6882, + "time_per_iteration": 2.451544761657715 + }, + { + "auxiliary_loss_clip": 0.01121461, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_clip": 1.0311873, + "balance_loss_mlp": 1.04304028, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 1.9410860498070561, + "language_loss": 0.69296026, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71465033, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.78515625, + "step": 6883, + "time_per_iteration": 2.4320569038391113 + }, + { + "auxiliary_loss_clip": 0.01115479, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04087615, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.7677749997866015, + "language_loss": 0.75449616, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77600354, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6884, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04013455, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.3782226444678463, + "language_loss": 0.75763476, + "learning_rate": 2.642871247413523e-06, + "loss": 0.7791822, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6885, + "time_per_iteration": 2.513087511062622 + }, + { + "auxiliary_loss_clip": 0.01117144, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.0245266, + "balance_loss_mlp": 1.0402348, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.8637223642679819, + "language_loss": 0.69820571, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71976513, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6886, + "time_per_iteration": 2.49245285987854 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.02326965, + "balance_loss_mlp": 1.04143095, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.5567308495418615, + "language_loss": 0.7542249, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77578151, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6887, + "time_per_iteration": 6.723928451538086 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.03900433, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.8847126889252832, + "language_loss": 0.69881892, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72027779, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6888, + "time_per_iteration": 3.9012765884399414 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02070749, + "balance_loss_mlp": 1.03661156, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 1.8064637161795956, + "language_loss": 0.75730169, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.7787562, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6889, + "time_per_iteration": 2.4043526649475098 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.020468, + "balance_loss_mlp": 1.04220176, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.5362774650785178, + "language_loss": 0.80159467, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82309097, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 6890, + "time_per_iteration": 2.515199661254883 + }, + { + "auxiliary_loss_clip": 0.01113118, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.02752495, + "balance_loss_mlp": 1.04047, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.56935265602887, + "language_loss": 0.74256909, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76412225, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6891, + "time_per_iteration": 2.4265213012695312 + }, + { + "auxiliary_loss_clip": 0.01120303, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.02144444, + "balance_loss_mlp": 1.04260397, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.5959140747346865, + "language_loss": 0.84173661, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86331344, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6892, + "time_per_iteration": 2.4921038150787354 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.02204704, + "balance_loss_mlp": 1.04263163, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.6122583846612435, + "language_loss": 0.70197237, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72349209, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6893, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01115822, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01573586, + "balance_loss_mlp": 1.04117119, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.3754181360448814, + "language_loss": 0.72850323, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74995577, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6894, + "time_per_iteration": 2.521559715270996 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.04199624, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.672622146105704, + "language_loss": 0.6200121, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64152598, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6895, + "time_per_iteration": 2.3899357318878174 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.02091241, + "balance_loss_mlp": 1.03973091, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.6922649240649819, + "language_loss": 0.70685059, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72832596, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 6896, + "time_per_iteration": 2.5296781063079834 + }, + { + "auxiliary_loss_clip": 0.01115349, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.02541351, + "balance_loss_mlp": 1.03898549, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.6224007586570597, + "language_loss": 0.72848749, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.7500447, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.765625, + "step": 6897, + "time_per_iteration": 2.481219530105591 + }, + { + "auxiliary_loss_clip": 0.01115287, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.02870619, + "balance_loss_mlp": 1.04093742, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 4.403783878749548, + "language_loss": 0.84646589, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86805081, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6898, + "time_per_iteration": 2.5150201320648193 + }, + { + "auxiliary_loss_clip": 0.01112871, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.01719725, + "balance_loss_mlp": 1.03681874, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.644475487803214, + "language_loss": 0.74555075, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76699305, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6899, + "time_per_iteration": 2.4348104000091553 + }, + { + "auxiliary_loss_clip": 0.0112093, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.717830619902866, + "language_loss": 0.75609112, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77767438, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6900, + "time_per_iteration": 2.5260136127471924 + }, + { + "auxiliary_loss_clip": 0.01116235, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 1.04113388, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.5866137476185087, + "language_loss": 0.80409849, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82566535, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.75, + "step": 6901, + "time_per_iteration": 2.4218883514404297 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.02178299, + "balance_loss_mlp": 1.03989518, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.8085429941764752, + "language_loss": 0.69120753, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71271133, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6902, + "time_per_iteration": 2.525836944580078 + }, + { + "auxiliary_loss_clip": 0.0111323, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04042315, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.1056004636318817, + "language_loss": 0.83287692, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85438645, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6903, + "time_per_iteration": 2.402722120285034 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02232492, + "balance_loss_mlp": 1.0413456, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.8768082111891207, + "language_loss": 0.67704409, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69863188, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.79296875, + "step": 6904, + "time_per_iteration": 2.5442733764648438 + }, + { + "auxiliary_loss_clip": 0.01119512, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.02082872, + "balance_loss_mlp": 1.04166162, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.5140892492412166, + "language_loss": 0.77502626, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79657316, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 6905, + "time_per_iteration": 2.471850633621216 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.0168426, + "balance_loss_mlp": 1.04261923, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 2.8616602480779427, + "language_loss": 0.68461335, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70611238, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6906, + "time_per_iteration": 2.501025676727295 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.02097726, + "balance_loss_mlp": 1.041152, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 3.9013632738704347, + "language_loss": 0.67466414, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69615829, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6907, + "time_per_iteration": 2.467179298400879 + }, + { + "auxiliary_loss_clip": 0.01118262, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.02107513, + "balance_loss_mlp": 1.04266894, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8641722195673653, + "language_loss": 0.77219629, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79371971, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 6908, + "time_per_iteration": 2.5124471187591553 + }, + { + "auxiliary_loss_clip": 0.01043525, + "auxiliary_loss_mlp": 0.01011962, + "balance_loss_clip": 1.01046562, + "balance_loss_mlp": 1.01946032, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7844742119516283, + "language_loss": 0.64862758, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66918248, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.24023438, + "step": 6909, + "time_per_iteration": 3.0118794441223145 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04182351, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.432390678759805, + "language_loss": 0.87292743, + "learning_rate": 2.633643828093996e-06, + "loss": 0.8944639, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6910, + "time_per_iteration": 2.4972214698791504 + }, + { + "auxiliary_loss_clip": 0.01041579, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.00033653, + "balance_loss_mlp": 1.01748466, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.808989444092677, + "language_loss": 0.6214478, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64188129, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24023438, + "step": 6911, + "time_per_iteration": 3.040469169616699 + }, + { + "auxiliary_loss_clip": 0.01126363, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.02386987, + "balance_loss_mlp": 1.04570675, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.7143139070983313, + "language_loss": 0.87920213, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90084887, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6912, + "time_per_iteration": 2.449566602706909 + }, + { + "auxiliary_loss_clip": 0.01119018, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01842034, + "balance_loss_mlp": 1.04461241, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 3.208266477782979, + "language_loss": 0.62984204, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65134311, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 6913, + "time_per_iteration": 2.4690184593200684 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.01793909, + "balance_loss_mlp": 1.04389513, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.933222600231973, + "language_loss": 0.75131822, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77279633, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 6914, + "time_per_iteration": 2.483322858810425 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04198337, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.8027192281548683, + "language_loss": 0.87621439, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89775658, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6915, + "time_per_iteration": 2.448347806930542 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.02461123, + "balance_loss_mlp": 1.0447371, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.7843871284315007, + "language_loss": 0.71427178, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.7358641, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6916, + "time_per_iteration": 2.490709066390991 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.018736, + "balance_loss_mlp": 1.04548466, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.511699121237688, + "language_loss": 0.71604288, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73761451, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6917, + "time_per_iteration": 2.471165895462036 + }, + { + "auxiliary_loss_clip": 0.01118269, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.0205791, + "balance_loss_mlp": 1.04267478, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.6845020116344738, + "language_loss": 0.80811357, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.82963598, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6918, + "time_per_iteration": 2.526092767715454 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.02276754, + "balance_loss_mlp": 1.04565763, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.4136427424617275, + "language_loss": 0.70455492, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72615993, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6919, + "time_per_iteration": 2.6142234802246094 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.02063489, + "balance_loss_mlp": 1.04595828, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 3.306135174045704, + "language_loss": 0.80995989, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83153164, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6920, + "time_per_iteration": 2.4816763401031494 + }, + { + "auxiliary_loss_clip": 0.01123811, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.02265263, + "balance_loss_mlp": 1.04559636, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 1.8850349699187139, + "language_loss": 0.66103178, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.68264937, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6921, + "time_per_iteration": 2.4444103240966797 + }, + { + "auxiliary_loss_clip": 0.01122422, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02613473, + "balance_loss_mlp": 1.04591656, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.004797667242706, + "language_loss": 0.80354667, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82517087, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6922, + "time_per_iteration": 2.4668424129486084 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.01878977, + "balance_loss_mlp": 1.04562068, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.7750243686484017, + "language_loss": 0.67461836, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69618553, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 6923, + "time_per_iteration": 2.500643014907837 + }, + { + "auxiliary_loss_clip": 0.0112335, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04540539, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.7543246434734396, + "language_loss": 0.75878662, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78045189, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6924, + "time_per_iteration": 2.5196292400360107 + }, + { + "auxiliary_loss_clip": 0.01120451, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.02382445, + "balance_loss_mlp": 1.04238617, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.7266126934206025, + "language_loss": 0.72481495, + "learning_rate": 2.62810015415423e-06, + "loss": 0.74639702, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6925, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02069342, + "balance_loss_mlp": 1.0413928, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 1.8465053152696829, + "language_loss": 0.83475816, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85628033, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6926, + "time_per_iteration": 2.5088613033294678 + }, + { + "auxiliary_loss_clip": 0.01115859, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02305889, + "balance_loss_mlp": 1.04325294, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6423809052501923, + "language_loss": 0.86620545, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88771755, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 6927, + "time_per_iteration": 2.534503936767578 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.0272553, + "balance_loss_mlp": 1.04246414, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 1.9802013979545179, + "language_loss": 0.72300684, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74461025, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6928, + "time_per_iteration": 3.88004732131958 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02459407, + "balance_loss_mlp": 1.04092073, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 1.862862690513255, + "language_loss": 0.78142846, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80298579, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6929, + "time_per_iteration": 5.323524713516235 + }, + { + "auxiliary_loss_clip": 0.01119115, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.02201128, + "balance_loss_mlp": 1.0432961, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.7470362991732848, + "language_loss": 0.71024638, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73178667, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6930, + "time_per_iteration": 2.4636495113372803 + }, + { + "auxiliary_loss_clip": 0.01116513, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.04026747, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7271533589437842, + "language_loss": 0.80665648, + "learning_rate": 2.625881181419007e-06, + "loss": 0.82817304, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6931, + "time_per_iteration": 2.4350993633270264 + }, + { + "auxiliary_loss_clip": 0.01115154, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02392721, + "balance_loss_mlp": 1.04003608, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8450466812598405, + "language_loss": 0.79109526, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81262779, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6932, + "time_per_iteration": 2.499152660369873 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01039084, + "balance_loss_clip": 1.02421236, + "balance_loss_mlp": 1.04105997, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 2.265953381144445, + "language_loss": 0.81735384, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83892173, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6933, + "time_per_iteration": 2.5096874237060547 + }, + { + "auxiliary_loss_clip": 0.01119747, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.02184963, + "balance_loss_mlp": 1.04056907, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.1923639109766144, + "language_loss": 0.76769817, + "learning_rate": 2.624771374460121e-06, + "loss": 0.78926861, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 6934, + "time_per_iteration": 2.4590814113616943 + }, + { + "auxiliary_loss_clip": 0.01120428, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02003551, + "balance_loss_mlp": 1.04396558, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.774753965654226, + "language_loss": 0.67036676, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69191271, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6935, + "time_per_iteration": 2.4111990928649902 + }, + { + "auxiliary_loss_clip": 0.01120243, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.0266757, + "balance_loss_mlp": 1.04329324, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.7357101171275504, + "language_loss": 0.73245633, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75406778, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 6936, + "time_per_iteration": 2.452911376953125 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.02102733, + "balance_loss_mlp": 1.0418582, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.8471548990860345, + "language_loss": 0.73746514, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75898361, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6937, + "time_per_iteration": 2.426177978515625 + }, + { + "auxiliary_loss_clip": 0.01116111, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.02194881, + "balance_loss_mlp": 1.04150152, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.512143650526939, + "language_loss": 0.8406328, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.8621484, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6938, + "time_per_iteration": 2.543088436126709 + }, + { + "auxiliary_loss_clip": 0.0112279, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.01959753, + "balance_loss_mlp": 1.04346168, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.0225615339435183, + "language_loss": 0.74319148, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76476645, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6939, + "time_per_iteration": 2.5119175910949707 + }, + { + "auxiliary_loss_clip": 0.01120397, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.01864552, + "balance_loss_mlp": 1.04396725, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.7048101001333908, + "language_loss": 0.7502594, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77179623, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6940, + "time_per_iteration": 2.505476474761963 + }, + { + "auxiliary_loss_clip": 0.01118418, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02621651, + "balance_loss_mlp": 1.04277742, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.6601557953990327, + "language_loss": 0.71575844, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73733509, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6941, + "time_per_iteration": 2.4826831817626953 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.02307487, + "balance_loss_mlp": 1.04215777, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 1.8824806717934597, + "language_loss": 0.73884863, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76043111, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6942, + "time_per_iteration": 2.510179281234741 + }, + { + "auxiliary_loss_clip": 0.01124355, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.02190626, + "balance_loss_mlp": 1.04450595, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.1000096782313644, + "language_loss": 0.72619486, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74780977, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6943, + "time_per_iteration": 2.437713861465454 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.01844811, + "balance_loss_mlp": 1.04391849, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.5914405962225948, + "language_loss": 0.63451827, + "learning_rate": 2.621070480118111e-06, + "loss": 0.6560756, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6944, + "time_per_iteration": 2.5866405963897705 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01747799, + "balance_loss_mlp": 1.04272938, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.6963739292171327, + "language_loss": 0.7014094, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72291017, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6945, + "time_per_iteration": 2.4984183311462402 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.02556372, + "balance_loss_mlp": 1.04024088, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.623733928455925, + "language_loss": 0.80850792, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83008766, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.76171875, + "step": 6946, + "time_per_iteration": 2.5301356315612793 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.02281451, + "balance_loss_mlp": 1.04321134, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.2176705837507784, + "language_loss": 0.77525783, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79680669, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6947, + "time_per_iteration": 2.432767391204834 + }, + { + "auxiliary_loss_clip": 0.01119017, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.0204227, + "balance_loss_mlp": 1.04268038, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 2.207686964264854, + "language_loss": 0.71242738, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73396862, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6948, + "time_per_iteration": 2.565560817718506 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.03894424, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.5189916920378803, + "language_loss": 0.77142775, + "learning_rate": 2.619219148905362e-06, + "loss": 0.7928437, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 6949, + "time_per_iteration": 2.459484338760376 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.02156091, + "balance_loss_mlp": 1.04367769, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.5094834159772865, + "language_loss": 0.81985492, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84143925, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6950, + "time_per_iteration": 2.5348877906799316 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01804328, + "balance_loss_mlp": 1.04157758, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.3221945547908684, + "language_loss": 0.76189649, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78333664, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 6951, + "time_per_iteration": 2.5055410861968994 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.01894784, + "balance_loss_mlp": 1.04247046, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.7645474682355455, + "language_loss": 0.72922826, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75078857, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6952, + "time_per_iteration": 2.499979019165039 + }, + { + "auxiliary_loss_clip": 0.01117763, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.01826596, + "balance_loss_mlp": 1.04266691, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 3.0061867681934795, + "language_loss": 0.7182008, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73970026, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6953, + "time_per_iteration": 2.4045305252075195 + }, + { + "auxiliary_loss_clip": 0.01116286, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02106667, + "balance_loss_mlp": 1.04293513, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.696123367289706, + "language_loss": 0.76163101, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78315222, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.734375, + "step": 6954, + "time_per_iteration": 2.5208778381347656 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.02243114, + "balance_loss_mlp": 1.0407306, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.123626835554744, + "language_loss": 0.84569108, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86724097, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.765625, + "step": 6955, + "time_per_iteration": 2.4470770359039307 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.02206469, + "balance_loss_mlp": 1.04131222, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.7424753883235222, + "language_loss": 0.83219767, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85370708, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6956, + "time_per_iteration": 2.53238582611084 + }, + { + "auxiliary_loss_clip": 0.01120034, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.02661586, + "balance_loss_mlp": 1.04286742, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.117667338273699, + "language_loss": 0.71621263, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73782784, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6957, + "time_per_iteration": 2.4127233028411865 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02450657, + "balance_loss_mlp": 1.0416609, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.020066118448717, + "language_loss": 0.75841641, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77995586, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 6958, + "time_per_iteration": 2.621243476867676 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.01718402, + "balance_loss_mlp": 1.04121447, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.5992923753241641, + "language_loss": 0.76712382, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78860307, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6959, + "time_per_iteration": 2.4936535358428955 + }, + { + "auxiliary_loss_clip": 0.01117896, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04106176, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.629552094504433, + "language_loss": 0.76652783, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78807288, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6960, + "time_per_iteration": 2.513699769973755 + }, + { + "auxiliary_loss_clip": 0.01111464, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01822484, + "balance_loss_mlp": 1.04088879, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.8359587043053753, + "language_loss": 0.75856298, + "learning_rate": 2.614773562290835e-06, + "loss": 0.7799859, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 6961, + "time_per_iteration": 2.4798686504364014 + }, + { + "auxiliary_loss_clip": 0.01040549, + "auxiliary_loss_mlp": 0.010007, + "balance_loss_clip": 0.99909067, + "balance_loss_mlp": 1.01660466, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7788999280449799, + "language_loss": 0.5466665, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56707895, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23925781, + "step": 6962, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.01119412, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02266252, + "balance_loss_mlp": 1.04263735, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8026406871934313, + "language_loss": 0.85487044, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87642694, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6963, + "time_per_iteration": 2.4352054595947266 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02210093, + "balance_loss_mlp": 1.04331315, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.6053381131745172, + "language_loss": 0.70357138, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72510606, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6964, + "time_per_iteration": 2.50482439994812 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.02268004, + "balance_loss_mlp": 1.04087543, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.8351593031507138, + "language_loss": 0.70862091, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73011076, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6965, + "time_per_iteration": 2.6057491302490234 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.02033257, + "balance_loss_mlp": 1.0413456, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.4950689447506187, + "language_loss": 0.7175675, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.73902673, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6966, + "time_per_iteration": 2.4892048835754395 + }, + { + "auxiliary_loss_clip": 0.01120204, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.02016091, + "balance_loss_mlp": 1.0421617, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.333720493500319, + "language_loss": 0.71266413, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73421323, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 6967, + "time_per_iteration": 2.604076862335205 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01005246, + "balance_loss_clip": 1.00366104, + "balance_loss_mlp": 1.01515508, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6722087248044618, + "language_loss": 0.46224236, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48268497, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23828125, + "step": 6968, + "time_per_iteration": 3.0401268005371094 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.02345359, + "balance_loss_mlp": 1.03981948, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.6180807795397785, + "language_loss": 0.74930859, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77086943, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6969, + "time_per_iteration": 2.5126969814300537 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.0382787, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 2.2016737043444903, + "language_loss": 0.80248457, + "learning_rate": 2.611437167992705e-06, + "loss": 0.8239547, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6970, + "time_per_iteration": 5.640556573867798 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01774538, + "balance_loss_mlp": 1.04030848, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.9623449568843938, + "language_loss": 0.82789886, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.84934866, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6971, + "time_per_iteration": 3.8554296493530273 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.02057588, + "balance_loss_mlp": 1.04049933, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.6158786040890867, + "language_loss": 0.7468822, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76836711, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6972, + "time_per_iteration": 2.474414587020874 + }, + { + "auxiliary_loss_clip": 0.01113414, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.02039838, + "balance_loss_mlp": 1.0393647, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4614195470734719, + "language_loss": 0.72808421, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74955231, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6973, + "time_per_iteration": 2.5945606231689453 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.02769673, + "balance_loss_mlp": 1.04242992, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.1718837857164464, + "language_loss": 0.74863386, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77027869, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6974, + "time_per_iteration": 2.4790663719177246 + }, + { + "auxiliary_loss_clip": 0.01112575, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01669776, + "balance_loss_mlp": 1.03879452, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 2.8466202693933265, + "language_loss": 0.72836936, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74979532, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6975, + "time_per_iteration": 2.4560608863830566 + }, + { + "auxiliary_loss_clip": 0.01114785, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.01961374, + "balance_loss_mlp": 1.04139054, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.6070899494887878, + "language_loss": 0.80725533, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82873446, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6976, + "time_per_iteration": 2.5148777961730957 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.01025549, + "balance_loss_clip": 1.0124954, + "balance_loss_mlp": 1.03755522, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.297468657248195, + "language_loss": 0.67767072, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.6990521, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6977, + "time_per_iteration": 2.4294896125793457 + }, + { + "auxiliary_loss_clip": 0.01116519, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.02193213, + "balance_loss_mlp": 1.04046345, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.6461140984259304, + "language_loss": 0.80869353, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83020747, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6978, + "time_per_iteration": 2.4688472747802734 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.02207506, + "balance_loss_mlp": 1.0377202, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.725404980402679, + "language_loss": 0.82583737, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84734344, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6979, + "time_per_iteration": 2.4702186584472656 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.0194428, + "balance_loss_mlp": 1.0388211, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.8637978278873943, + "language_loss": 0.83381826, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85528231, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6980, + "time_per_iteration": 2.5195069313049316 + }, + { + "auxiliary_loss_clip": 0.01116413, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.01975989, + "balance_loss_mlp": 1.03946161, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.9241676519266004, + "language_loss": 0.79068786, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81218135, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6981, + "time_per_iteration": 2.4457991123199463 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.02078593, + "balance_loss_mlp": 1.03806782, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.6203222993930824, + "language_loss": 0.84426481, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86570823, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 6982, + "time_per_iteration": 2.483635425567627 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.02191043, + "balance_loss_mlp": 1.03910255, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 1.9325593989695682, + "language_loss": 0.56615967, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58769286, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6983, + "time_per_iteration": 2.4729864597320557 + }, + { + "auxiliary_loss_clip": 0.01114232, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.02243733, + "balance_loss_mlp": 1.0403446, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.639890794043824, + "language_loss": 0.82404107, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84553468, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6984, + "time_per_iteration": 2.4610702991485596 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01515102, + "balance_loss_mlp": 1.03938794, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.6654879970317658, + "language_loss": 0.78883481, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81025428, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6985, + "time_per_iteration": 2.4739370346069336 + }, + { + "auxiliary_loss_clip": 0.01118591, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.02042401, + "balance_loss_mlp": 1.03950381, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 3.375844113891133, + "language_loss": 0.77833611, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.79986858, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6986, + "time_per_iteration": 2.5488531589508057 + }, + { + "auxiliary_loss_clip": 0.01111943, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01886213, + "balance_loss_mlp": 1.03984082, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.5789932508621725, + "language_loss": 0.72640669, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74783587, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 6987, + "time_per_iteration": 2.522143840789795 + }, + { + "auxiliary_loss_clip": 0.01117787, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02886939, + "balance_loss_mlp": 1.04176915, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4538200585449164, + "language_loss": 0.75399673, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77560198, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6988, + "time_per_iteration": 2.57265305519104 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02287364, + "balance_loss_mlp": 1.04034519, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.6383736622893421, + "language_loss": 0.74155712, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76311487, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6989, + "time_per_iteration": 2.4846689701080322 + }, + { + "auxiliary_loss_clip": 0.01118468, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02213192, + "balance_loss_mlp": 1.041116, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.649933968591077, + "language_loss": 0.70989478, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73144102, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6990, + "time_per_iteration": 2.5092554092407227 + }, + { + "auxiliary_loss_clip": 0.01038945, + "auxiliary_loss_mlp": 0.01004482, + "balance_loss_clip": 1.00287271, + "balance_loss_mlp": 1.01510215, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8281033043630844, + "language_loss": 0.60529578, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62573004, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23828125, + "step": 6991, + "time_per_iteration": 2.921936511993408 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04332614, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5069916983433078, + "language_loss": 0.83222365, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85379601, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6992, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.01039195, + "auxiliary_loss_mlp": 0.01003357, + "balance_loss_clip": 1.00179517, + "balance_loss_mlp": 1.01546574, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8165124973650228, + "language_loss": 0.65523541, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67566097, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.23730469, + "step": 6993, + "time_per_iteration": 3.078948736190796 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.01845777, + "balance_loss_mlp": 1.04213274, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 2.0847143106579806, + "language_loss": 0.83213866, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85370958, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6994, + "time_per_iteration": 2.42958402633667 + }, + { + "auxiliary_loss_clip": 0.01115372, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.02112424, + "balance_loss_mlp": 1.04195786, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.6590785995391892, + "language_loss": 0.78497195, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.8064667, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6995, + "time_per_iteration": 2.4311602115631104 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.01849341, + "balance_loss_mlp": 1.04043221, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.5317093362831764, + "language_loss": 0.79829741, + "learning_rate": 2.60178818232786e-06, + "loss": 0.81974673, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6996, + "time_per_iteration": 2.5032711029052734 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01837945, + "balance_loss_mlp": 1.04208779, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.3208366966184837, + "language_loss": 0.7522642, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77376509, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.765625, + "step": 6997, + "time_per_iteration": 2.4281609058380127 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.02492523, + "balance_loss_mlp": 1.03965962, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.9779533128263025, + "language_loss": 0.76193553, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78349566, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6998, + "time_per_iteration": 2.4484825134277344 + }, + { + "auxiliary_loss_clip": 0.01121567, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.02950823, + "balance_loss_mlp": 1.04302716, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.545568275541188, + "language_loss": 0.76295245, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78460807, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6999, + "time_per_iteration": 2.5371389389038086 + }, + { + "auxiliary_loss_clip": 0.01116809, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.02512717, + "balance_loss_mlp": 1.04221511, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.7925226690493865, + "language_loss": 0.64549243, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66705179, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7000, + "time_per_iteration": 2.492664337158203 + }, + { + "auxiliary_loss_clip": 0.01117436, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02347827, + "balance_loss_mlp": 1.04157186, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.6489015448559594, + "language_loss": 0.76201057, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.7835623, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7001, + "time_per_iteration": 2.4374375343322754 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.02163076, + "balance_loss_mlp": 1.04236293, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.558613926183474, + "language_loss": 0.86427414, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88578713, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7002, + "time_per_iteration": 2.4840235710144043 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.02331328, + "balance_loss_mlp": 1.04153061, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 2.8393435321353713, + "language_loss": 0.67447579, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69599748, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7003, + "time_per_iteration": 2.452779531478882 + }, + { + "auxiliary_loss_clip": 0.01120595, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02727461, + "balance_loss_mlp": 1.04151964, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 2.097012731379119, + "language_loss": 0.76887131, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79049993, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 7004, + "time_per_iteration": 2.4988765716552734 + }, + { + "auxiliary_loss_clip": 0.0111532, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02649117, + "balance_loss_mlp": 1.04101729, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.5948979245136696, + "language_loss": 0.68152726, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70309174, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7421875, + "step": 7005, + "time_per_iteration": 2.4434568881988525 + }, + { + "auxiliary_loss_clip": 0.01118015, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.02340817, + "balance_loss_mlp": 1.04088581, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.9728430752981747, + "language_loss": 0.72047079, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74202257, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7006, + "time_per_iteration": 2.4487879276275635 + }, + { + "auxiliary_loss_clip": 0.01117712, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.01978087, + "balance_loss_mlp": 1.04068065, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.688876483049264, + "language_loss": 0.70708871, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.72860485, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7007, + "time_per_iteration": 2.437270164489746 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.02416158, + "balance_loss_mlp": 1.04059708, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7353334268618703, + "language_loss": 0.82159567, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84313881, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7008, + "time_per_iteration": 2.460923194885254 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.03877473, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 2.1040552452231505, + "language_loss": 0.71574211, + "learning_rate": 2.596957889196831e-06, + "loss": 0.7372905, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7009, + "time_per_iteration": 2.501915693283081 + }, + { + "auxiliary_loss_clip": 0.01116238, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01875222, + "balance_loss_mlp": 1.03954792, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.7512785082136952, + "language_loss": 0.66407478, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68556547, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7010, + "time_per_iteration": 2.5036494731903076 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.02353597, + "balance_loss_mlp": 1.03993797, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.553770179625671, + "language_loss": 0.7243132, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74583495, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7011, + "time_per_iteration": 2.471482276916504 + }, + { + "auxiliary_loss_clip": 0.01036961, + "auxiliary_loss_mlp": 0.01009192, + "balance_loss_clip": 1.00765407, + "balance_loss_mlp": 1.01291788, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.789677431109339, + "language_loss": 0.54321265, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56367421, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.24023438, + "step": 7012, + "time_per_iteration": 7.156486511230469 + }, + { + "auxiliary_loss_clip": 0.01118573, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.01982975, + "balance_loss_mlp": 1.04137254, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3072085820070551, + "language_loss": 0.78510618, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80663049, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7013, + "time_per_iteration": 2.4873650074005127 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01945186, + "balance_loss_mlp": 1.0393039, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.8972197450653994, + "language_loss": 0.8102268, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83173645, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 7014, + "time_per_iteration": 2.4698970317840576 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.01885617, + "balance_loss_mlp": 1.039801, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.584816158328088, + "language_loss": 0.7775718, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79906625, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7015, + "time_per_iteration": 2.48061203956604 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02382123, + "balance_loss_mlp": 1.04211378, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.4014002437510662, + "language_loss": 0.82126868, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84285378, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7016, + "time_per_iteration": 2.4971818923950195 + }, + { + "auxiliary_loss_clip": 0.01119768, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.01964498, + "balance_loss_mlp": 1.04142356, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.18227993050423, + "language_loss": 0.68093193, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70246613, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7017, + "time_per_iteration": 2.4511165618896484 + }, + { + "auxiliary_loss_clip": 0.01038936, + "auxiliary_loss_mlp": 0.00999099, + "balance_loss_clip": 0.9975912, + "balance_loss_mlp": 1.01494193, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6893654540123721, + "language_loss": 0.59420347, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61458385, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24023438, + "step": 7018, + "time_per_iteration": 3.1184492111206055 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03985381, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.7697613946295114, + "language_loss": 0.75391936, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77544749, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7019, + "time_per_iteration": 2.415177822113037 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01955771, + "balance_loss_mlp": 1.04044795, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 2.151945399878188, + "language_loss": 0.69014722, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71166205, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 7020, + "time_per_iteration": 2.502906560897827 + }, + { + "auxiliary_loss_clip": 0.01115881, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.01976776, + "balance_loss_mlp": 1.04312158, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.807686142219978, + "language_loss": 0.80839896, + "learning_rate": 2.592495760867347e-06, + "loss": 0.82988656, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7021, + "time_per_iteration": 2.4480793476104736 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.04118109, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.7624230978889854, + "language_loss": 0.70018518, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.721668, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7022, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.01110409, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01816332, + "balance_loss_mlp": 1.03993058, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.4995673529455043, + "language_loss": 0.66985959, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69126534, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7023, + "time_per_iteration": 2.518887996673584 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.01872325, + "balance_loss_mlp": 1.04102015, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.5242794814383198, + "language_loss": 0.69374228, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71520281, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7024, + "time_per_iteration": 2.47479510307312 + }, + { + "auxiliary_loss_clip": 0.01115853, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.02272165, + "balance_loss_mlp": 1.0406878, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.4987089123245305, + "language_loss": 0.76659822, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78812057, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7025, + "time_per_iteration": 2.459552526473999 + }, + { + "auxiliary_loss_clip": 0.01111611, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01950371, + "balance_loss_mlp": 1.03944087, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.7650754883430373, + "language_loss": 0.79574716, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81719071, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7026, + "time_per_iteration": 2.4876604080200195 + }, + { + "auxiliary_loss_clip": 0.01039298, + "auxiliary_loss_mlp": 0.00998847, + "balance_loss_clip": 0.99741668, + "balance_loss_mlp": 1.01518142, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7186593098349721, + "language_loss": 0.6191169, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63949835, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.24121094, + "step": 7027, + "time_per_iteration": 3.1553335189819336 + }, + { + "auxiliary_loss_clip": 0.01115441, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02524519, + "balance_loss_mlp": 1.04096365, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 4.428318649676281, + "language_loss": 0.70515895, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.72670174, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7028, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.02488303, + "balance_loss_mlp": 1.04104543, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.8463743475085548, + "language_loss": 0.82555425, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84711367, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7029, + "time_per_iteration": 2.5120980739593506 + }, + { + "auxiliary_loss_clip": 0.0112087, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_clip": 1.02790523, + "balance_loss_mlp": 1.04274035, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 2.3903311172404, + "language_loss": 0.75230241, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77393407, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7030, + "time_per_iteration": 2.5118141174316406 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.03835046, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 1.9474535697331137, + "language_loss": 0.86421049, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88573444, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7031, + "time_per_iteration": 2.500140905380249 + }, + { + "auxiliary_loss_clip": 0.01115501, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02328062, + "balance_loss_mlp": 1.04060841, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.1339679402128717, + "language_loss": 0.72855937, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75009298, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 7032, + "time_per_iteration": 2.477363109588623 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.02711725, + "balance_loss_mlp": 1.0390861, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.7148750065903648, + "language_loss": 0.699175, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72074443, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7033, + "time_per_iteration": 2.5661494731903076 + }, + { + "auxiliary_loss_clip": 0.01115751, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.01895976, + "balance_loss_mlp": 1.03992891, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.8649473631938416, + "language_loss": 0.90448046, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92596424, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7034, + "time_per_iteration": 2.4802892208099365 + }, + { + "auxiliary_loss_clip": 0.01112625, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.02369416, + "balance_loss_mlp": 1.03800857, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.6052176008605175, + "language_loss": 0.77130729, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79280239, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7035, + "time_per_iteration": 2.5044498443603516 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.02682912, + "balance_loss_mlp": 1.0414331, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.9123378440021823, + "language_loss": 0.82216996, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84375703, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7036, + "time_per_iteration": 2.4178695678710938 + }, + { + "auxiliary_loss_clip": 0.01112842, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.0403924, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.6417488866700152, + "language_loss": 0.70871484, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73021322, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7037, + "time_per_iteration": 2.485499858856201 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.02303314, + "balance_loss_mlp": 1.03976059, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.5138937767155718, + "language_loss": 0.77942061, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80094922, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7038, + "time_per_iteration": 2.4569690227508545 + }, + { + "auxiliary_loss_clip": 0.01120787, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02919412, + "balance_loss_mlp": 1.04072356, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.366884859254005, + "language_loss": 0.66797423, + "learning_rate": 2.585796509770259e-06, + "loss": 0.6896261, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 7039, + "time_per_iteration": 2.441373825073242 + }, + { + "auxiliary_loss_clip": 0.01119114, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.02372193, + "balance_loss_mlp": 1.04042578, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.6082175120791662, + "language_loss": 0.75897467, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78054452, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 7040, + "time_per_iteration": 2.471653938293457 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.02067101, + "balance_loss_mlp": 1.03962982, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.477939672492119, + "language_loss": 0.65098798, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67250896, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7041, + "time_per_iteration": 2.502443313598633 + }, + { + "auxiliary_loss_clip": 0.01118281, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.02233624, + "balance_loss_mlp": 1.04045236, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.7627160436135367, + "language_loss": 0.73621082, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75775892, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7042, + "time_per_iteration": 2.6498820781707764 + }, + { + "auxiliary_loss_clip": 0.01112749, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.02020609, + "balance_loss_mlp": 1.03977966, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3177903064215164, + "language_loss": 0.82185107, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84331036, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7043, + "time_per_iteration": 2.528604745864868 + }, + { + "auxiliary_loss_clip": 0.0111836, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.02608395, + "balance_loss_mlp": 1.04329216, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.3747778329738742, + "language_loss": 0.65231359, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67390943, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 7044, + "time_per_iteration": 2.4399802684783936 + }, + { + "auxiliary_loss_clip": 0.01121384, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.03126323, + "balance_loss_mlp": 1.04322433, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.7497316034691441, + "language_loss": 0.7502315, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77191073, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 7045, + "time_per_iteration": 2.612898588180542 + }, + { + "auxiliary_loss_clip": 0.01116302, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.02141845, + "balance_loss_mlp": 1.04219389, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.1011396794876385, + "language_loss": 0.80564952, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82716572, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7046, + "time_per_iteration": 2.4105727672576904 + }, + { + "auxiliary_loss_clip": 0.01119082, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.021685, + "balance_loss_mlp": 1.04078197, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.59844067944401, + "language_loss": 0.76846749, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.7900188, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7047, + "time_per_iteration": 2.486297130584717 + }, + { + "auxiliary_loss_clip": 0.01116569, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.02102375, + "balance_loss_mlp": 1.04264975, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.8697996227798281, + "language_loss": 0.67980373, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70131224, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7048, + "time_per_iteration": 2.5031991004943848 + }, + { + "auxiliary_loss_clip": 0.01119136, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.02371609, + "balance_loss_mlp": 1.04227185, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.7311423758965327, + "language_loss": 0.7829181, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80449331, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 7049, + "time_per_iteration": 2.549767255783081 + }, + { + "auxiliary_loss_clip": 0.01121261, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02570868, + "balance_loss_mlp": 1.0433383, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.7774881318176563, + "language_loss": 0.82656097, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84816945, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 7050, + "time_per_iteration": 2.498494863510132 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.02233815, + "balance_loss_mlp": 1.0382762, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.0169322630318844, + "language_loss": 0.73429018, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75579983, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7051, + "time_per_iteration": 2.441920042037964 + }, + { + "auxiliary_loss_clip": 0.01118227, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.01983249, + "balance_loss_mlp": 1.04219055, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.4713561275118965, + "language_loss": 0.86205333, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.8835662, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7052, + "time_per_iteration": 2.511756658554077 + }, + { + "auxiliary_loss_clip": 0.01116616, + "auxiliary_loss_mlp": 0.01041035, + "balance_loss_clip": 1.02657533, + "balance_loss_mlp": 1.03951788, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4100722391624452, + "language_loss": 0.7240659, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74564236, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7053, + "time_per_iteration": 3.9099857807159424 + }, + { + "auxiliary_loss_clip": 0.01116742, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.0190872, + "balance_loss_mlp": 1.04233611, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.5741365926511655, + "language_loss": 0.82153803, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84303784, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 7054, + "time_per_iteration": 5.327679634094238 + }, + { + "auxiliary_loss_clip": 0.01040448, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01700425, + "balance_loss_mlp": 1.01674867, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7840713570529064, + "language_loss": 0.60388172, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62446928, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23632812, + "step": 7055, + "time_per_iteration": 3.0450727939605713 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.02651238, + "balance_loss_mlp": 1.04204714, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.951771931203088, + "language_loss": 0.76762712, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.78923267, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7056, + "time_per_iteration": 2.442148447036743 + }, + { + "auxiliary_loss_clip": 0.01121258, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.02375674, + "balance_loss_mlp": 1.04127979, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 2.7846662247260388, + "language_loss": 0.84346795, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86507463, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 7057, + "time_per_iteration": 2.474519968032837 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.02272737, + "balance_loss_mlp": 1.04053187, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 3.1820547358610605, + "language_loss": 0.82999814, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85157061, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 7058, + "time_per_iteration": 2.473520517349243 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01533902, + "balance_loss_mlp": 1.04417813, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.7435131696457398, + "language_loss": 0.80453449, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82600558, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7059, + "time_per_iteration": 2.4719533920288086 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01860404, + "balance_loss_mlp": 1.041839, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9429107045123646, + "language_loss": 0.70341688, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72495657, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 7060, + "time_per_iteration": 2.4377660751342773 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.01914454, + "balance_loss_mlp": 1.04378521, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.5364996273974925, + "language_loss": 0.76182258, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78335667, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7061, + "time_per_iteration": 2.486786365509033 + }, + { + "auxiliary_loss_clip": 0.01122599, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.02319098, + "balance_loss_mlp": 1.04407752, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 3.328289037638814, + "language_loss": 0.729635, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75124645, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 7062, + "time_per_iteration": 2.474193572998047 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.03964305, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.701854582957673, + "language_loss": 0.66343361, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68500221, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7063, + "time_per_iteration": 2.458003520965576 + }, + { + "auxiliary_loss_clip": 0.01115284, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.0181458, + "balance_loss_mlp": 1.04179168, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4878317325171677, + "language_loss": 0.78371775, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80518377, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7064, + "time_per_iteration": 2.5735623836517334 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02636456, + "balance_loss_mlp": 1.04172683, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.8409826195637737, + "language_loss": 0.74893892, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.7705363, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 7065, + "time_per_iteration": 2.4962844848632812 + }, + { + "auxiliary_loss_clip": 0.01119456, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.0206933, + "balance_loss_mlp": 1.04322076, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.415711347923808, + "language_loss": 0.72713453, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74868619, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7066, + "time_per_iteration": 2.551297426223755 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.02307224, + "balance_loss_mlp": 1.04031396, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.9392042625935109, + "language_loss": 0.79517603, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81675112, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 7067, + "time_per_iteration": 2.4871444702148438 + }, + { + "auxiliary_loss_clip": 0.010394, + "auxiliary_loss_mlp": 0.01005215, + "balance_loss_clip": 1.00373113, + "balance_loss_mlp": 1.01538539, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.919528911316311, + "language_loss": 0.63477993, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65522605, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.24023438, + "step": 7068, + "time_per_iteration": 3.0116004943847656 + }, + { + "auxiliary_loss_clip": 0.01119716, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02073288, + "balance_loss_mlp": 1.04235375, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.681037886347605, + "language_loss": 0.72381866, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74537772, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7734375, + "step": 7069, + "time_per_iteration": 2.5046679973602295 + }, + { + "auxiliary_loss_clip": 0.01122307, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.04424644, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 3.2712432047864852, + "language_loss": 0.79297352, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81454653, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.78125, + "step": 7070, + "time_per_iteration": 2.43115496635437 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.01863861, + "balance_loss_mlp": 1.04104066, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.8101520547589562, + "language_loss": 0.70179212, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7233097, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7071, + "time_per_iteration": 2.5141680240631104 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.01942205, + "balance_loss_mlp": 1.04123151, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.3450864635540825, + "language_loss": 0.71075511, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73227149, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7072, + "time_per_iteration": 2.489187002182007 + }, + { + "auxiliary_loss_clip": 0.01122118, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.04270983, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5399076436438217, + "language_loss": 0.81655496, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83810043, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.79296875, + "step": 7073, + "time_per_iteration": 2.5192041397094727 + }, + { + "auxiliary_loss_clip": 0.0111768, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.02097535, + "balance_loss_mlp": 1.04180706, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.1264240253054227, + "language_loss": 0.90777069, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92929167, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7074, + "time_per_iteration": 2.418611526489258 + }, + { + "auxiliary_loss_clip": 0.01122809, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.0234437, + "balance_loss_mlp": 1.04282892, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.5751331844442036, + "language_loss": 0.63971686, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66133678, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.80078125, + "step": 7075, + "time_per_iteration": 2.5064475536346436 + }, + { + "auxiliary_loss_clip": 0.01114521, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01708984, + "balance_loss_mlp": 1.04121399, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.5599863464934922, + "language_loss": 0.73547149, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75691831, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7076, + "time_per_iteration": 2.487424850463867 + }, + { + "auxiliary_loss_clip": 0.01122674, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.02565181, + "balance_loss_mlp": 1.04370356, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 1.8221025125090708, + "language_loss": 0.78215933, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80378938, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7077, + "time_per_iteration": 2.4964394569396973 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.0221417, + "balance_loss_mlp": 1.04220366, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6016827264272244, + "language_loss": 0.73013902, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75163293, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7078, + "time_per_iteration": 2.47660756111145 + }, + { + "auxiliary_loss_clip": 0.01120871, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02880406, + "balance_loss_mlp": 1.0461756, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.731645410920913, + "language_loss": 0.79469633, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81632668, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7079, + "time_per_iteration": 2.499232769012451 + }, + { + "auxiliary_loss_clip": 0.0112172, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.0181613, + "balance_loss_mlp": 1.04761243, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4705007316204746, + "language_loss": 0.72263241, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74416137, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7080, + "time_per_iteration": 2.732074499130249 + }, + { + "auxiliary_loss_clip": 0.0111869, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.02206242, + "balance_loss_mlp": 1.04246545, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.328741773172896, + "language_loss": 0.80405676, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82559955, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7081, + "time_per_iteration": 2.6035380363464355 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753235, + "balance_loss_mlp": 1.04303384, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.7894721227922463, + "language_loss": 0.81618208, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.8376382, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 7082, + "time_per_iteration": 2.444728374481201 + }, + { + "auxiliary_loss_clip": 0.01119852, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02286816, + "balance_loss_mlp": 1.04368842, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.6988843094625508, + "language_loss": 0.69388473, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71545386, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7083, + "time_per_iteration": 2.5369133949279785 + }, + { + "auxiliary_loss_clip": 0.01040302, + "auxiliary_loss_mlp": 0.00999977, + "balance_loss_clip": 0.99864787, + "balance_loss_mlp": 1.01655924, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8706759407802692, + "language_loss": 0.67112887, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69153166, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.23828125, + "step": 7084, + "time_per_iteration": 3.1631839275360107 + }, + { + "auxiliary_loss_clip": 0.01118847, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02887428, + "balance_loss_mlp": 1.04295874, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.6244995349856595, + "language_loss": 0.78095287, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80258334, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7578125, + "step": 7085, + "time_per_iteration": 2.493157148361206 + }, + { + "auxiliary_loss_clip": 0.01129017, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.03102934, + "balance_loss_mlp": 1.04819477, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.071277468695464, + "language_loss": 0.75757217, + "learning_rate": 2.568270298414995e-06, + "loss": 0.77932662, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 7086, + "time_per_iteration": 2.426295280456543 + }, + { + "auxiliary_loss_clip": 0.01119794, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.0433557, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 2.1734108107028147, + "language_loss": 0.8001647, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82173336, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7087, + "time_per_iteration": 2.46087384223938 + }, + { + "auxiliary_loss_clip": 0.01123365, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04632342, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.8444426655441133, + "language_loss": 0.6603114, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68187302, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7088, + "time_per_iteration": 2.481919527053833 + }, + { + "auxiliary_loss_clip": 0.01123249, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.02472591, + "balance_loss_mlp": 1.0449152, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 1.8812259313043718, + "language_loss": 0.68482029, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70644343, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7089, + "time_per_iteration": 2.523918628692627 + }, + { + "auxiliary_loss_clip": 0.01121302, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.02473295, + "balance_loss_mlp": 1.0418849, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.8669230196035027, + "language_loss": 0.72897398, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75057483, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7090, + "time_per_iteration": 2.4340648651123047 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.03208125, + "balance_loss_mlp": 1.04372311, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.7953532910276222, + "language_loss": 0.75347531, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77517974, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7091, + "time_per_iteration": 2.5973541736602783 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.02188134, + "balance_loss_mlp": 1.04312468, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.6821401092021848, + "language_loss": 0.82308388, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84458697, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7092, + "time_per_iteration": 2.453181266784668 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.03138983, + "balance_loss_mlp": 1.0454514, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6505279256890275, + "language_loss": 0.73916072, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76087701, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 7093, + "time_per_iteration": 2.5176479816436768 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.02482259, + "balance_loss_mlp": 1.04376769, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.5315083588078555, + "language_loss": 0.69390249, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71550524, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 7094, + "time_per_iteration": 2.489561080932617 + }, + { + "auxiliary_loss_clip": 0.01122789, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.02379799, + "balance_loss_mlp": 1.04475617, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.6055215896501054, + "language_loss": 0.81466055, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83627033, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7095, + "time_per_iteration": 6.829655647277832 + }, + { + "auxiliary_loss_clip": 0.01122192, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02972341, + "balance_loss_mlp": 1.04453826, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.7098780852895776, + "language_loss": 0.80283463, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82450223, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7096, + "time_per_iteration": 3.894577980041504 + }, + { + "auxiliary_loss_clip": 0.01125109, + "auxiliary_loss_mlp": 0.01041794, + "balance_loss_clip": 1.02754259, + "balance_loss_mlp": 1.04520798, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.947200367016257, + "language_loss": 0.65628326, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67795235, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7097, + "time_per_iteration": 2.5192034244537354 + }, + { + "auxiliary_loss_clip": 0.01117089, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.04297018, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.8194330831870058, + "language_loss": 0.74512994, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76660931, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7098, + "time_per_iteration": 2.498380661010742 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.01969302, + "balance_loss_mlp": 1.04259086, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7218259388529535, + "language_loss": 0.75169343, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77320623, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7099, + "time_per_iteration": 2.4900684356689453 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.02458942, + "balance_loss_mlp": 1.0437479, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 1.9952935228943551, + "language_loss": 0.83543229, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85704881, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7100, + "time_per_iteration": 2.467902183532715 + }, + { + "auxiliary_loss_clip": 0.01121229, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.04333866, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3501788659102136, + "language_loss": 0.82243335, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84399146, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7101, + "time_per_iteration": 2.5363035202026367 + }, + { + "auxiliary_loss_clip": 0.01124462, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.01972795, + "balance_loss_mlp": 1.04426765, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8760573998828747, + "language_loss": 0.7243284, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74591374, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7102, + "time_per_iteration": 2.443894624710083 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02012062, + "balance_loss_mlp": 1.04262853, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.799822548331586, + "language_loss": 0.82910782, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85061657, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7103, + "time_per_iteration": 2.4751625061035156 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.02205122, + "balance_loss_mlp": 1.04319, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.0452515416159227, + "language_loss": 0.73823762, + "learning_rate": 2.561545446271294e-06, + "loss": 0.759835, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 7104, + "time_per_iteration": 2.433727264404297 + }, + { + "auxiliary_loss_clip": 0.01120598, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01842821, + "balance_loss_mlp": 1.04307532, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.0713947006575713, + "language_loss": 0.75097072, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77249593, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 7105, + "time_per_iteration": 2.553220748901367 + }, + { + "auxiliary_loss_clip": 0.01123627, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02499223, + "balance_loss_mlp": 1.04497468, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.944135826622959, + "language_loss": 0.7652669, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78688908, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7106, + "time_per_iteration": 2.4320499897003174 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02062345, + "balance_loss_mlp": 1.04073668, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7002032775641, + "language_loss": 0.79748225, + "learning_rate": 2.560423964592229e-06, + "loss": 0.81899506, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7107, + "time_per_iteration": 2.5138087272644043 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.04365969, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.5777370161888564, + "language_loss": 0.67986816, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70138133, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7108, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01118179, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02417231, + "balance_loss_mlp": 1.04141963, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.697941372596268, + "language_loss": 0.71379381, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73535079, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7109, + "time_per_iteration": 2.514293909072876 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.01668775, + "balance_loss_mlp": 1.04248762, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 1.808555345827523, + "language_loss": 0.64390564, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66543221, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7734375, + "step": 7110, + "time_per_iteration": 2.507896661758423 + }, + { + "auxiliary_loss_clip": 0.01121216, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01876426, + "balance_loss_mlp": 1.04310989, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6911252843933642, + "language_loss": 0.76596475, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78751141, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7111, + "time_per_iteration": 2.5065102577209473 + }, + { + "auxiliary_loss_clip": 0.01122655, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.02051497, + "balance_loss_mlp": 1.04446638, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.6101339491766522, + "language_loss": 0.73021042, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75178432, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7112, + "time_per_iteration": 2.462275266647339 + }, + { + "auxiliary_loss_clip": 0.0111568, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02527392, + "balance_loss_mlp": 1.04112434, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.5100904202471843, + "language_loss": 0.71723974, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.7387839, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7113, + "time_per_iteration": 2.517184019088745 + }, + { + "auxiliary_loss_clip": 0.01122905, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04463625, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 4.019227207544938, + "language_loss": 0.62055492, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64222896, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7114, + "time_per_iteration": 2.4808969497680664 + }, + { + "auxiliary_loss_clip": 0.01127351, + "auxiliary_loss_mlp": 0.01044357, + "balance_loss_clip": 1.02779305, + "balance_loss_mlp": 1.045439, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.7285817614937915, + "language_loss": 0.64558339, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66730046, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8203125, + "step": 7115, + "time_per_iteration": 2.4979755878448486 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02321672, + "balance_loss_mlp": 1.04225278, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.5459011503250888, + "language_loss": 0.7331425, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75469118, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7116, + "time_per_iteration": 2.4514083862304688 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02976263, + "balance_loss_mlp": 1.04102111, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.5398002166428786, + "language_loss": 0.69214165, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.7137208, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7117, + "time_per_iteration": 2.522881269454956 + }, + { + "auxiliary_loss_clip": 0.01122059, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.02657676, + "balance_loss_mlp": 1.04530859, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.268053258549222, + "language_loss": 0.69909632, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72072423, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7118, + "time_per_iteration": 2.3870341777801514 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.02948511, + "balance_loss_mlp": 1.04353809, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.1225989928468803, + "language_loss": 0.74740356, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76902699, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7119, + "time_per_iteration": 2.5487277507781982 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.02040291, + "balance_loss_mlp": 1.04196107, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.8104905013477006, + "language_loss": 0.74987411, + "learning_rate": 2.555562005426573e-06, + "loss": 0.7713939, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7120, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01120406, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.02321029, + "balance_loss_mlp": 1.04422045, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.6187265972443616, + "language_loss": 0.77002251, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.7915923, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7121, + "time_per_iteration": 2.4686522483825684 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.02299464, + "balance_loss_mlp": 1.04225755, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 1.8413618192799084, + "language_loss": 0.85525274, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87678635, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7122, + "time_per_iteration": 2.4149863719940186 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02088189, + "balance_loss_mlp": 1.04111362, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.6495062264118223, + "language_loss": 0.81354666, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83504236, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7123, + "time_per_iteration": 2.4846510887145996 + }, + { + "auxiliary_loss_clip": 0.01116497, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.04286349, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.6842679543274752, + "language_loss": 0.81069416, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83221602, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 7124, + "time_per_iteration": 2.477781057357788 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02485132, + "balance_loss_mlp": 1.04072952, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 7.024350858631177, + "language_loss": 0.80178392, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82334554, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 7125, + "time_per_iteration": 2.466099262237549 + }, + { + "auxiliary_loss_clip": 0.01117521, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.0230993, + "balance_loss_mlp": 1.04386544, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.7536027507395449, + "language_loss": 0.74772543, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76925719, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7126, + "time_per_iteration": 2.4476282596588135 + }, + { + "auxiliary_loss_clip": 0.01118141, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.02313387, + "balance_loss_mlp": 1.04261374, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 2.2527301233175496, + "language_loss": 0.81376731, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83532357, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7127, + "time_per_iteration": 2.50627064704895 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.02554011, + "balance_loss_mlp": 1.04140556, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.7148593982179101, + "language_loss": 0.76451397, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78608435, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7128, + "time_per_iteration": 2.4261910915374756 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.02018988, + "balance_loss_mlp": 1.04154372, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.979642374109765, + "language_loss": 0.74111116, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76265121, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7129, + "time_per_iteration": 2.4977691173553467 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02102661, + "balance_loss_mlp": 1.04335773, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.7995906720856931, + "language_loss": 0.77753568, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79907238, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7130, + "time_per_iteration": 2.4983179569244385 + }, + { + "auxiliary_loss_clip": 0.01123055, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04523921, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 1.8571755273934152, + "language_loss": 0.7349695, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75659359, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7131, + "time_per_iteration": 2.5469563007354736 + }, + { + "auxiliary_loss_clip": 0.01120536, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04343748, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.0596069487020268, + "language_loss": 0.76299751, + "learning_rate": 2.551070882366973e-06, + "loss": 0.78456992, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7132, + "time_per_iteration": 2.432889223098755 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.02821565, + "balance_loss_mlp": 1.04352558, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.5221162096651724, + "language_loss": 0.78525162, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80687612, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 7133, + "time_per_iteration": 2.544379472732544 + }, + { + "auxiliary_loss_clip": 0.01120837, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02484238, + "balance_loss_mlp": 1.04305482, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.8479371259746051, + "language_loss": 0.75017452, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77177012, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7134, + "time_per_iteration": 2.416792154312134 + }, + { + "auxiliary_loss_clip": 0.01114501, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01937902, + "balance_loss_mlp": 1.04046178, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 2.2902258120670975, + "language_loss": 0.84066433, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86212909, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7135, + "time_per_iteration": 2.4513847827911377 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.02250218, + "balance_loss_mlp": 1.04050052, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.9123929145525593, + "language_loss": 0.74716437, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76866877, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7136, + "time_per_iteration": 2.5260956287384033 + }, + { + "auxiliary_loss_clip": 0.01117454, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.0243752, + "balance_loss_mlp": 1.04027987, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.9374198184766858, + "language_loss": 0.78982937, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81138408, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7137, + "time_per_iteration": 6.664285898208618 + }, + { + "auxiliary_loss_clip": 0.01123569, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.02053773, + "balance_loss_mlp": 1.04498768, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.8145904182691066, + "language_loss": 0.76599205, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78757715, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7138, + "time_per_iteration": 2.4640390872955322 + }, + { + "auxiliary_loss_clip": 0.01043511, + "auxiliary_loss_mlp": 0.01001663, + "balance_loss_clip": 1.00025678, + "balance_loss_mlp": 1.02006102, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7743592729173089, + "language_loss": 0.56193811, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58238983, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.234375, + "step": 7139, + "time_per_iteration": 2.938645362854004 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.02957499, + "balance_loss_mlp": 1.04185057, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6343660010586272, + "language_loss": 0.81107223, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83262885, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7140, + "time_per_iteration": 2.4621551036834717 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01819944, + "balance_loss_mlp": 1.04155135, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.7453668118354997, + "language_loss": 0.81973499, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84123254, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7141, + "time_per_iteration": 2.4552011489868164 + }, + { + "auxiliary_loss_clip": 0.011238, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.04469872, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.6365702711839187, + "language_loss": 0.86302745, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88466609, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7142, + "time_per_iteration": 2.466599464416504 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.0205493, + "balance_loss_mlp": 1.04692888, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.8779834210446977, + "language_loss": 0.78367496, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80520082, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7143, + "time_per_iteration": 2.528383493423462 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.02731538, + "balance_loss_mlp": 1.04566526, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.185103050312315, + "language_loss": 0.76671416, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78832245, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7144, + "time_per_iteration": 2.4433047771453857 + }, + { + "auxiliary_loss_clip": 0.01119183, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.01801622, + "balance_loss_mlp": 1.043118, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 2.969999234773645, + "language_loss": 0.73481476, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75632453, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7145, + "time_per_iteration": 2.5330073833465576 + }, + { + "auxiliary_loss_clip": 0.01120569, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.02447844, + "balance_loss_mlp": 1.04405165, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.854643653820381, + "language_loss": 0.78928959, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81088066, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7146, + "time_per_iteration": 2.4481821060180664 + }, + { + "auxiliary_loss_clip": 0.01116396, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.01590514, + "balance_loss_mlp": 1.04295409, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 1.9767254736067894, + "language_loss": 0.83134973, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85279846, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7147, + "time_per_iteration": 2.500633478164673 + }, + { + "auxiliary_loss_clip": 0.01124897, + "auxiliary_loss_mlp": 0.01039853, + "balance_loss_clip": 1.02473724, + "balance_loss_mlp": 1.04802537, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.8398177405042841, + "language_loss": 0.86894512, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89059258, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 7148, + "time_per_iteration": 2.481743097305298 + }, + { + "auxiliary_loss_clip": 0.01120854, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02405918, + "balance_loss_mlp": 1.04469061, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5258683369520107, + "language_loss": 0.77855921, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80013508, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7149, + "time_per_iteration": 2.6060431003570557 + }, + { + "auxiliary_loss_clip": 0.01118454, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02247977, + "balance_loss_mlp": 1.04456902, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.7047076849986806, + "language_loss": 0.79828095, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81983018, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7150, + "time_per_iteration": 2.4652955532073975 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02763176, + "balance_loss_mlp": 1.0467031, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.7972230644563891, + "language_loss": 0.74738395, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.76907349, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 7151, + "time_per_iteration": 2.5019421577453613 + }, + { + "auxiliary_loss_clip": 0.0112419, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02284098, + "balance_loss_mlp": 1.0458225, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.924911798883302, + "language_loss": 0.70084447, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72245419, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 7152, + "time_per_iteration": 2.456465482711792 + }, + { + "auxiliary_loss_clip": 0.01117938, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.02268243, + "balance_loss_mlp": 1.04186821, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.5367633238023177, + "language_loss": 0.71064591, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73218524, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7153, + "time_per_iteration": 2.6120920181274414 + }, + { + "auxiliary_loss_clip": 0.01120146, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01941729, + "balance_loss_mlp": 1.04342091, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.8794751780958798, + "language_loss": 0.79155993, + "learning_rate": 2.542829359113276e-06, + "loss": 0.81309307, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7154, + "time_per_iteration": 2.4222962856292725 + }, + { + "auxiliary_loss_clip": 0.0111738, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01818347, + "balance_loss_mlp": 1.04361236, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.4801057977091479, + "language_loss": 0.78793395, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80941343, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 7155, + "time_per_iteration": 2.4554193019866943 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01582694, + "balance_loss_mlp": 1.04335082, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.7176839192841982, + "language_loss": 0.88779187, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90925157, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7156, + "time_per_iteration": 2.446831464767456 + }, + { + "auxiliary_loss_clip": 0.01120931, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.01953602, + "balance_loss_mlp": 1.04361558, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.9517774058288286, + "language_loss": 0.82738447, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84893334, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7157, + "time_per_iteration": 2.5298144817352295 + }, + { + "auxiliary_loss_clip": 0.01124397, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04532623, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.8458285726729726, + "language_loss": 0.72177351, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74337494, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7158, + "time_per_iteration": 2.4691712856292725 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01796317, + "balance_loss_mlp": 1.04215837, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.077812294320108, + "language_loss": 0.82865965, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85015261, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7159, + "time_per_iteration": 2.4462857246398926 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.01887655, + "balance_loss_mlp": 1.04236865, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.094804075931644, + "language_loss": 0.83043528, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85193908, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7160, + "time_per_iteration": 2.587928533554077 + }, + { + "auxiliary_loss_clip": 0.01123066, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.02249885, + "balance_loss_mlp": 1.04402685, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 3.027641474238522, + "language_loss": 0.77379316, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79540545, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.79296875, + "step": 7161, + "time_per_iteration": 2.502628803253174 + }, + { + "auxiliary_loss_clip": 0.01119327, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.04304039, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.05136398687674, + "language_loss": 0.73137891, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75291681, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7162, + "time_per_iteration": 2.439053773880005 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01005692, + "balance_loss_clip": 1.00411832, + "balance_loss_mlp": 1.01966858, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7926335078551056, + "language_loss": 0.59016478, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61064959, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.23046875, + "step": 7163, + "time_per_iteration": 2.9588072299957275 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.02299142, + "balance_loss_mlp": 1.04035139, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6277980092745115, + "language_loss": 0.79140532, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81291205, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7164, + "time_per_iteration": 2.484001398086548 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04078794, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.8180486110770353, + "language_loss": 0.67282438, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69442934, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 7165, + "time_per_iteration": 2.533599376678467 + }, + { + "auxiliary_loss_clip": 0.01119036, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.03069651, + "balance_loss_mlp": 1.04327762, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.850302447549428, + "language_loss": 0.75248688, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77411151, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7578125, + "step": 7166, + "time_per_iteration": 2.439861536026001 + }, + { + "auxiliary_loss_clip": 0.01117069, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.0239172, + "balance_loss_mlp": 1.04362941, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.612504951400803, + "language_loss": 0.71537554, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73690969, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7167, + "time_per_iteration": 2.499190092086792 + }, + { + "auxiliary_loss_clip": 0.01111616, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.02232647, + "balance_loss_mlp": 1.03984129, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.6022700342177734, + "language_loss": 0.78459173, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80605787, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7168, + "time_per_iteration": 2.4372310638427734 + }, + { + "auxiliary_loss_clip": 0.01118326, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.02733326, + "balance_loss_mlp": 1.04224193, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.3121674941994383, + "language_loss": 0.82260263, + "learning_rate": 2.537204417416387e-06, + "loss": 0.8441996, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7169, + "time_per_iteration": 2.4545183181762695 + }, + { + "auxiliary_loss_clip": 0.01038578, + "auxiliary_loss_mlp": 0.01010207, + "balance_loss_clip": 1.00865698, + "balance_loss_mlp": 1.0153358, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6800543146405372, + "language_loss": 0.60812157, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62860942, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23242188, + "step": 7170, + "time_per_iteration": 3.2204582691192627 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02039874, + "balance_loss_mlp": 1.04148889, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 2.0659828341911615, + "language_loss": 0.76225841, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78375715, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7171, + "time_per_iteration": 2.465665817260742 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.02172232, + "balance_loss_mlp": 1.04197574, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.6834410044967325, + "language_loss": 0.77283418, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.7943542, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7172, + "time_per_iteration": 2.4916739463806152 + }, + { + "auxiliary_loss_clip": 0.0111787, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.02544653, + "balance_loss_mlp": 1.04015696, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.7953579135961333, + "language_loss": 0.76852405, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79010069, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7173, + "time_per_iteration": 2.4764745235443115 + }, + { + "auxiliary_loss_clip": 0.01115542, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01571512, + "balance_loss_mlp": 1.04070854, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.4568106417702447, + "language_loss": 0.77103329, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79247946, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7174, + "time_per_iteration": 2.4860222339630127 + }, + { + "auxiliary_loss_clip": 0.01119703, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.01815498, + "balance_loss_mlp": 1.04199743, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.4198827217143106, + "language_loss": 0.82505399, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84658062, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7175, + "time_per_iteration": 2.462977647781372 + }, + { + "auxiliary_loss_clip": 0.01121086, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.0223192, + "balance_loss_mlp": 1.04153752, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.338333143716513, + "language_loss": 0.74985862, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77143705, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 7176, + "time_per_iteration": 2.4185218811035156 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.020643, + "balance_loss_mlp": 1.03969014, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.6024853029290826, + "language_loss": 0.73364419, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75516164, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7177, + "time_per_iteration": 2.487114667892456 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.0264492, + "balance_loss_mlp": 1.04060507, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.878519248272382, + "language_loss": 0.81681836, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83848649, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 7178, + "time_per_iteration": 2.443887948989868 + }, + { + "auxiliary_loss_clip": 0.01113093, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.01988733, + "balance_loss_mlp": 1.04052329, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4541906286028654, + "language_loss": 0.83824348, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.8597073, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7179, + "time_per_iteration": 5.329441547393799 + }, + { + "auxiliary_loss_clip": 0.01114931, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.02040303, + "balance_loss_mlp": 1.03945267, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 2.045990303945265, + "language_loss": 0.75710779, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77859473, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7180, + "time_per_iteration": 2.5520315170288086 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.0240128, + "balance_loss_mlp": 1.03862667, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.7639080321754919, + "language_loss": 0.81907403, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84062529, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7181, + "time_per_iteration": 2.4059271812438965 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.02357888, + "balance_loss_mlp": 1.04020417, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.5777864051255721, + "language_loss": 0.88434547, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90591776, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7182, + "time_per_iteration": 2.463463306427002 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.01873803, + "balance_loss_mlp": 1.03918862, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.70694658333996, + "language_loss": 0.75826657, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77972138, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7183, + "time_per_iteration": 2.4562740325927734 + }, + { + "auxiliary_loss_clip": 0.01116225, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.01819921, + "balance_loss_mlp": 1.03917336, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 2.311500131527462, + "language_loss": 0.77666485, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79814982, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7184, + "time_per_iteration": 2.5283145904541016 + }, + { + "auxiliary_loss_clip": 0.01110208, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02065074, + "balance_loss_mlp": 1.03938413, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.5490664406704935, + "language_loss": 0.73325193, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75469285, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 7185, + "time_per_iteration": 2.520885467529297 + }, + { + "auxiliary_loss_clip": 0.01118704, + "auxiliary_loss_mlp": 0.01037072, + "balance_loss_clip": 1.02283251, + "balance_loss_mlp": 1.03961062, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.5540588454326, + "language_loss": 0.75974178, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78129953, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 7186, + "time_per_iteration": 2.5005605220794678 + }, + { + "auxiliary_loss_clip": 0.01116031, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.02164185, + "balance_loss_mlp": 1.03987479, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 5.067701176656461, + "language_loss": 0.76043296, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78194571, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7187, + "time_per_iteration": 2.4769227504730225 + }, + { + "auxiliary_loss_clip": 0.0103801, + "auxiliary_loss_mlp": 0.00999247, + "balance_loss_clip": 0.99792367, + "balance_loss_mlp": 1.0145607, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8526585096921939, + "language_loss": 0.68180382, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70217645, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.234375, + "step": 7188, + "time_per_iteration": 3.095301389694214 + }, + { + "auxiliary_loss_clip": 0.01112959, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03992498, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.742468102969242, + "language_loss": 0.7809816, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80246753, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7189, + "time_per_iteration": 2.4332470893859863 + }, + { + "auxiliary_loss_clip": 0.01118752, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.0263027, + "balance_loss_mlp": 1.03817415, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.8713383629003246, + "language_loss": 0.7119785, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73357898, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 7190, + "time_per_iteration": 2.494537115097046 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02275729, + "balance_loss_mlp": 1.0380528, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.5245278530879214, + "language_loss": 0.79833174, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81982064, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7191, + "time_per_iteration": 2.478376865386963 + }, + { + "auxiliary_loss_clip": 0.01113503, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.020944, + "balance_loss_mlp": 1.03872573, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.7647822638177795, + "language_loss": 0.74647141, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76794595, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7192, + "time_per_iteration": 2.4613609313964844 + }, + { + "auxiliary_loss_clip": 0.011176, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.02696347, + "balance_loss_mlp": 1.04183233, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 2.014554632256561, + "language_loss": 0.78898597, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81057584, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7193, + "time_per_iteration": 2.4220309257507324 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.02609015, + "balance_loss_mlp": 1.0394038, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.7200377707292065, + "language_loss": 0.75406849, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77562475, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7194, + "time_per_iteration": 2.466512441635132 + }, + { + "auxiliary_loss_clip": 0.01117198, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02435398, + "balance_loss_mlp": 1.04108119, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 5.005212308773382, + "language_loss": 0.60044503, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62199533, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7195, + "time_per_iteration": 2.4522454738616943 + }, + { + "auxiliary_loss_clip": 0.0111962, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02713561, + "balance_loss_mlp": 1.04041934, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.2806268233026628, + "language_loss": 0.64930809, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67092311, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 7196, + "time_per_iteration": 2.4453718662261963 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.02024436, + "balance_loss_mlp": 1.04024911, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 4.696072713783665, + "language_loss": 0.72759318, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74914396, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 7197, + "time_per_iteration": 2.500256061553955 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.03106129, + "balance_loss_mlp": 1.04246271, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.598666024351184, + "language_loss": 0.72644413, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.7480582, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 7198, + "time_per_iteration": 2.567762613296509 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03913903, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3766106050597056, + "language_loss": 0.81292808, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83442813, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7199, + "time_per_iteration": 2.4782636165618896 + }, + { + "auxiliary_loss_clip": 0.01118715, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02829099, + "balance_loss_mlp": 1.04219055, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.2182298419994346, + "language_loss": 0.68883061, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71043384, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 7200, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.02809453, + "balance_loss_mlp": 1.04055738, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 2.134839210265846, + "language_loss": 0.87135142, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89296097, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7201, + "time_per_iteration": 2.47463321685791 + }, + { + "auxiliary_loss_clip": 0.01125345, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_clip": 1.02696979, + "balance_loss_mlp": 1.04488945, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 2.16649852661544, + "language_loss": 0.64551014, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66718936, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 7202, + "time_per_iteration": 2.520963668823242 + }, + { + "auxiliary_loss_clip": 0.0111734, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01931047, + "balance_loss_mlp": 1.04092193, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.7838197935762699, + "language_loss": 0.81707418, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83856463, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.765625, + "step": 7203, + "time_per_iteration": 2.474724531173706 + }, + { + "auxiliary_loss_clip": 0.01121178, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.03284955, + "balance_loss_mlp": 1.04118741, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.864866510083204, + "language_loss": 0.81476939, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83645213, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 7204, + "time_per_iteration": 2.527064323425293 + }, + { + "auxiliary_loss_clip": 0.01117221, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02226007, + "balance_loss_mlp": 1.04050207, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.78968083236078, + "language_loss": 0.73432428, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75585294, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7205, + "time_per_iteration": 2.406350612640381 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.02414668, + "balance_loss_mlp": 1.04308569, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.6284714357196102, + "language_loss": 0.75110108, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77264041, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7206, + "time_per_iteration": 2.527343511581421 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02247548, + "balance_loss_mlp": 1.03899562, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 2.1762520186821854, + "language_loss": 0.78700626, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80852419, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7207, + "time_per_iteration": 2.4470536708831787 + }, + { + "auxiliary_loss_clip": 0.01121794, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02026772, + "balance_loss_mlp": 1.04215813, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.6893238531796995, + "language_loss": 0.81100202, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83257544, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 7208, + "time_per_iteration": 2.4634876251220703 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.04337454, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.072374936090108, + "language_loss": 0.70074689, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72228324, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7209, + "time_per_iteration": 2.4699575901031494 + }, + { + "auxiliary_loss_clip": 0.01113916, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.02102125, + "balance_loss_mlp": 1.0392952, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.533200118487429, + "language_loss": 0.81202382, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83351159, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7210, + "time_per_iteration": 2.5462334156036377 + }, + { + "auxiliary_loss_clip": 0.01114494, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.02280319, + "balance_loss_mlp": 1.03895545, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7483210767520514, + "language_loss": 0.81570554, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.83721387, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7211, + "time_per_iteration": 2.4835634231567383 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.02150583, + "balance_loss_mlp": 1.03778863, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 2.083548110229539, + "language_loss": 0.74785221, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76932836, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7212, + "time_per_iteration": 2.492600917816162 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.01990747, + "balance_loss_mlp": 1.04162407, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.6640529640233686, + "language_loss": 0.76755834, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78905809, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7213, + "time_per_iteration": 2.4060752391815186 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02070832, + "balance_loss_mlp": 1.03933454, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.5718517519296942, + "language_loss": 0.64949977, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67098659, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7214, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.01110495, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.02173781, + "balance_loss_mlp": 1.03869057, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.5916808794412316, + "language_loss": 0.71483207, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73628008, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7215, + "time_per_iteration": 2.5099971294403076 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02224684, + "balance_loss_mlp": 1.04080701, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 2.1029551712935692, + "language_loss": 0.7531544, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77464819, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7216, + "time_per_iteration": 2.496631383895874 + }, + { + "auxiliary_loss_clip": 0.01124083, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.02873421, + "balance_loss_mlp": 1.04232287, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.4885665438006086, + "language_loss": 0.75943911, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78112465, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 7217, + "time_per_iteration": 2.4563424587249756 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.02048075, + "balance_loss_mlp": 1.04149795, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 2.0012841708103677, + "language_loss": 0.73723286, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.7587418, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7218, + "time_per_iteration": 2.5055034160614014 + }, + { + "auxiliary_loss_clip": 0.01119586, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.01706386, + "balance_loss_mlp": 1.0420804, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.7121326309499156, + "language_loss": 0.68759704, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.7090981, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7219, + "time_per_iteration": 2.4480419158935547 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.02088046, + "balance_loss_mlp": 1.0424881, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.5876624694807844, + "language_loss": 0.77227521, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79378843, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7220, + "time_per_iteration": 6.918288230895996 + }, + { + "auxiliary_loss_clip": 0.01116062, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02162933, + "balance_loss_mlp": 1.04022503, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.9118836764348202, + "language_loss": 0.69684327, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71835566, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7221, + "time_per_iteration": 2.470270872116089 + }, + { + "auxiliary_loss_clip": 0.0111827, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04102325, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 2.3043912227088206, + "language_loss": 0.64915985, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67071712, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7222, + "time_per_iteration": 2.553450584411621 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.01851892, + "balance_loss_mlp": 1.0400629, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 1.98015103861908, + "language_loss": 0.73039752, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75188196, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 7223, + "time_per_iteration": 2.4311954975128174 + }, + { + "auxiliary_loss_clip": 0.01117336, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.01639247, + "balance_loss_mlp": 1.03914881, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.7516175042559776, + "language_loss": 0.93677819, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95825702, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 7224, + "time_per_iteration": 2.5507140159606934 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.02230883, + "balance_loss_mlp": 1.04143298, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.4456333860398556, + "language_loss": 0.61234355, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63386333, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7225, + "time_per_iteration": 2.4982893466949463 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02594388, + "balance_loss_mlp": 1.04326594, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.8262630970377216, + "language_loss": 0.77771807, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79930449, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75390625, + "step": 7226, + "time_per_iteration": 2.5427355766296387 + }, + { + "auxiliary_loss_clip": 0.0111488, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.01968753, + "balance_loss_mlp": 1.04169869, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.6421213218207402, + "language_loss": 0.84485722, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8663345, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7227, + "time_per_iteration": 2.450390577316284 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04135513, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.0443971193166735, + "language_loss": 0.76866895, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79018396, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7228, + "time_per_iteration": 2.5690906047821045 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02229989, + "balance_loss_mlp": 1.04278994, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.4832672479414948, + "language_loss": 0.80732882, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82887214, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7229, + "time_per_iteration": 2.552069902420044 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.02682161, + "balance_loss_mlp": 1.04213512, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 2.091517296377785, + "language_loss": 0.81964421, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84123534, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7230, + "time_per_iteration": 2.5944671630859375 + }, + { + "auxiliary_loss_clip": 0.01123399, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.03034186, + "balance_loss_mlp": 1.0445168, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.146338977702966, + "language_loss": 0.77091062, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79258955, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 7231, + "time_per_iteration": 2.460886001586914 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.02217722, + "balance_loss_mlp": 1.0421958, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.391615561962781, + "language_loss": 0.6858201, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70733297, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7232, + "time_per_iteration": 2.614415407180786 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02036333, + "balance_loss_mlp": 1.04160166, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6911603415584286, + "language_loss": 0.7200706, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74161285, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7233, + "time_per_iteration": 2.5665411949157715 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.02490747, + "balance_loss_mlp": 1.04198027, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.536262058034198, + "language_loss": 0.746382, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.7679894, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 7234, + "time_per_iteration": 2.577014207839966 + }, + { + "auxiliary_loss_clip": 0.01123093, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.02523136, + "balance_loss_mlp": 1.04223037, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 1.829117772001415, + "language_loss": 0.58860987, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61023784, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 7235, + "time_per_iteration": 2.4759557247161865 + }, + { + "auxiliary_loss_clip": 0.01116416, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01931453, + "balance_loss_mlp": 1.04053211, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.4942606531447196, + "language_loss": 0.7751596, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79664838, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7236, + "time_per_iteration": 2.6113193035125732 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.01958489, + "balance_loss_mlp": 1.04001045, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.713978383195529, + "language_loss": 0.8155449, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83701491, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7237, + "time_per_iteration": 2.4341909885406494 + }, + { + "auxiliary_loss_clip": 0.01116801, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02109957, + "balance_loss_mlp": 1.04103971, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 3.0219595130639156, + "language_loss": 0.62897265, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65049648, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7238, + "time_per_iteration": 2.5014469623565674 + }, + { + "auxiliary_loss_clip": 0.01111642, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.02848005, + "balance_loss_mlp": 1.03874493, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.5839613956475427, + "language_loss": 0.85889554, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88042951, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7239, + "time_per_iteration": 2.4976143836975098 + }, + { + "auxiliary_loss_clip": 0.01118679, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.01896167, + "balance_loss_mlp": 1.0434041, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.4715329043565741, + "language_loss": 0.7269268, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74843925, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7240, + "time_per_iteration": 2.5350124835968018 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.01897597, + "balance_loss_mlp": 1.04185855, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.6878068305061695, + "language_loss": 0.81562793, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83716333, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7241, + "time_per_iteration": 2.4924368858337402 + }, + { + "auxiliary_loss_clip": 0.01125084, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02636945, + "balance_loss_mlp": 1.04387474, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 3.067853888150903, + "language_loss": 0.79639387, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81806338, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 7242, + "time_per_iteration": 2.4884228706359863 + }, + { + "auxiliary_loss_clip": 0.01118288, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.02146733, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.2924190339180135, + "language_loss": 0.6872946, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70883644, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 7243, + "time_per_iteration": 2.428065538406372 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.01814318, + "balance_loss_mlp": 1.04141152, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6975937608840317, + "language_loss": 0.8125546, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83400726, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7244, + "time_per_iteration": 2.4931905269622803 + }, + { + "auxiliary_loss_clip": 0.01118248, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01785159, + "balance_loss_mlp": 1.0428431, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.7229772693729426, + "language_loss": 0.74017537, + "learning_rate": 2.508635271753234e-06, + "loss": 0.7616663, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7245, + "time_per_iteration": 2.4678800106048584 + }, + { + "auxiliary_loss_clip": 0.01116663, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.0248003, + "balance_loss_mlp": 1.041008, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.577710817669204, + "language_loss": 0.7671771, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78872424, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 7246, + "time_per_iteration": 2.5109541416168213 + }, + { + "auxiliary_loss_clip": 0.01118541, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02348995, + "balance_loss_mlp": 1.04209638, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.7904357433283469, + "language_loss": 0.85364228, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87520564, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7247, + "time_per_iteration": 2.4546074867248535 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.02600694, + "balance_loss_mlp": 1.0420599, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.5214849587217785, + "language_loss": 0.72576565, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74732977, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7248, + "time_per_iteration": 2.5288567543029785 + }, + { + "auxiliary_loss_clip": 0.01117005, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.02280688, + "balance_loss_mlp": 1.04225719, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.6049303411594007, + "language_loss": 0.87276042, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.8942951, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7249, + "time_per_iteration": 2.497281312942505 + }, + { + "auxiliary_loss_clip": 0.0111866, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.03053117, + "balance_loss_mlp": 1.04112244, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.835450546624213, + "language_loss": 0.81989753, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84152383, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7250, + "time_per_iteration": 2.5563321113586426 + }, + { + "auxiliary_loss_clip": 0.01124846, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.02484369, + "balance_loss_mlp": 1.04729581, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.737362510880261, + "language_loss": 0.84760177, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86923766, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7251, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01113729, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02469254, + "balance_loss_mlp": 1.03979266, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.5112002334274994, + "language_loss": 0.69018251, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71170568, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7252, + "time_per_iteration": 2.5041210651397705 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.02233779, + "balance_loss_mlp": 1.04257536, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.7846888638519947, + "language_loss": 0.83733922, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85886061, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7253, + "time_per_iteration": 2.434375047683716 + }, + { + "auxiliary_loss_clip": 0.01116361, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.02386165, + "balance_loss_mlp": 1.04254532, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4489781171091827, + "language_loss": 0.70361209, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72515762, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.73828125, + "step": 7254, + "time_per_iteration": 2.5304319858551025 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02503693, + "balance_loss_mlp": 1.04295266, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.261189856456705, + "language_loss": 0.80833256, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.82988203, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7255, + "time_per_iteration": 2.4619336128234863 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.0402987, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.6623402785544918, + "language_loss": 0.77301329, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79454327, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7256, + "time_per_iteration": 2.502201557159424 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02568853, + "balance_loss_mlp": 1.04400241, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.8521029690454978, + "language_loss": 0.76273203, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78429782, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7257, + "time_per_iteration": 2.4721548557281494 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02553642, + "balance_loss_mlp": 1.04027009, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.675034420512285, + "language_loss": 0.73065001, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75219941, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7258, + "time_per_iteration": 2.5251166820526123 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.02083004, + "balance_loss_mlp": 1.04163384, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 2.491243867162561, + "language_loss": 0.76496607, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78648162, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7259, + "time_per_iteration": 2.4948387145996094 + }, + { + "auxiliary_loss_clip": 0.01038123, + "auxiliary_loss_mlp": 0.01006149, + "balance_loss_clip": 1.00483215, + "balance_loss_mlp": 1.01505399, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7446610885032177, + "language_loss": 0.570382, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59082472, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.23144531, + "step": 7260, + "time_per_iteration": 3.023712396621704 + }, + { + "auxiliary_loss_clip": 0.01119405, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.03061068, + "balance_loss_mlp": 1.0423255, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.013500079504657, + "language_loss": 0.71356845, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.7352109, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 7261, + "time_per_iteration": 2.559830665588379 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.04076374, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 1.767533570577482, + "language_loss": 0.69423878, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71586561, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7262, + "time_per_iteration": 5.4921791553497314 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01699996, + "balance_loss_mlp": 1.04062569, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.7128833789230435, + "language_loss": 0.80033064, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82173395, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7263, + "time_per_iteration": 2.5026779174804688 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.03210425, + "balance_loss_mlp": 1.043383, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.8087965620474522, + "language_loss": 0.75092399, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77256304, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7264, + "time_per_iteration": 2.487065553665161 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02143478, + "balance_loss_mlp": 1.04089546, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 1.8571442110240568, + "language_loss": 0.61855227, + "learning_rate": 2.501098303852298e-06, + "loss": 0.6400522, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 7265, + "time_per_iteration": 2.5982677936553955 + }, + { + "auxiliary_loss_clip": 0.01112809, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01859391, + "balance_loss_mlp": 1.04026711, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.1628188735926845, + "language_loss": 0.72982574, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75126845, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7266, + "time_per_iteration": 2.4690847396850586 + }, + { + "auxiliary_loss_clip": 0.0111929, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.0240345, + "balance_loss_mlp": 1.04332638, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.2896909207829954, + "language_loss": 0.81570059, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.83727205, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7267, + "time_per_iteration": 2.463283061981201 + }, + { + "auxiliary_loss_clip": 0.01113248, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.01803577, + "balance_loss_mlp": 1.04085267, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.9116109849221483, + "language_loss": 0.74723095, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76866794, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 7268, + "time_per_iteration": 2.516263723373413 + }, + { + "auxiliary_loss_clip": 0.01121105, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04315591, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 1.9119374296408282, + "language_loss": 0.7954827, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81711417, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7269, + "time_per_iteration": 2.4647111892700195 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.02404499, + "balance_loss_mlp": 1.04315174, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 2.072373926876921, + "language_loss": 0.75031221, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77185863, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7270, + "time_per_iteration": 2.4963974952697754 + }, + { + "auxiliary_loss_clip": 0.01116927, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.01639485, + "balance_loss_mlp": 1.04269087, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.906091328168401, + "language_loss": 0.79437554, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81584334, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7271, + "time_per_iteration": 2.487238645553589 + }, + { + "auxiliary_loss_clip": 0.01039832, + "auxiliary_loss_mlp": 0.01001038, + "balance_loss_clip": 0.99965489, + "balance_loss_mlp": 1.01678514, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6948313241096988, + "language_loss": 0.54902828, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56943697, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.23046875, + "step": 7272, + "time_per_iteration": 3.1392502784729004 + }, + { + "auxiliary_loss_clip": 0.011197, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0286535, + "balance_loss_mlp": 1.04332781, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 2.967819772960297, + "language_loss": 0.70136559, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72299063, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7273, + "time_per_iteration": 2.468592643737793 + }, + { + "auxiliary_loss_clip": 0.01119234, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02515411, + "balance_loss_mlp": 1.04280722, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.832145479464728, + "language_loss": 0.75091398, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77249801, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7274, + "time_per_iteration": 2.669516086578369 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01741123, + "balance_loss_mlp": 1.04002881, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.8126381729021082, + "language_loss": 0.80507416, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82649636, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7265625, + "step": 7275, + "time_per_iteration": 2.455235481262207 + }, + { + "auxiliary_loss_clip": 0.01118348, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.02357626, + "balance_loss_mlp": 1.04496706, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.065941875742038, + "language_loss": 0.80955482, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83110607, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7276, + "time_per_iteration": 2.543306827545166 + }, + { + "auxiliary_loss_clip": 0.01122471, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 1.04409111, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.794283698167311, + "language_loss": 0.73373604, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75530994, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7277, + "time_per_iteration": 2.5931403636932373 + }, + { + "auxiliary_loss_clip": 0.01117806, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.0222764, + "balance_loss_mlp": 1.04351854, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.8969119275678887, + "language_loss": 0.72953606, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75107086, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7278, + "time_per_iteration": 2.576266288757324 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.04212785, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.6273415021791042, + "language_loss": 0.65815622, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.6796481, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7279, + "time_per_iteration": 2.4717864990234375 + }, + { + "auxiliary_loss_clip": 0.01122391, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02098393, + "balance_loss_mlp": 1.04393768, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.838486718423984, + "language_loss": 0.82088757, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84245551, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7280, + "time_per_iteration": 2.5370771884918213 + }, + { + "auxiliary_loss_clip": 0.01114089, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.0209589, + "balance_loss_mlp": 1.04176164, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.430381072646336, + "language_loss": 0.76786566, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.78934562, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 7281, + "time_per_iteration": 2.5260467529296875 + }, + { + "auxiliary_loss_clip": 0.01116043, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02582431, + "balance_loss_mlp": 1.04211211, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.8435972134321474, + "language_loss": 0.7572853, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77883214, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 7282, + "time_per_iteration": 2.5332953929901123 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01039188, + "balance_loss_clip": 1.02581239, + "balance_loss_mlp": 1.04421043, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.8874106414487752, + "language_loss": 0.8494271, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87100983, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7283, + "time_per_iteration": 2.458500623703003 + }, + { + "auxiliary_loss_clip": 0.01119709, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.02060771, + "balance_loss_mlp": 1.04216719, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.9095323636494845, + "language_loss": 0.8005324, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82207501, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7284, + "time_per_iteration": 2.5258796215057373 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01040827, + "balance_loss_clip": 1.02851903, + "balance_loss_mlp": 1.04236269, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.535068058496724, + "language_loss": 0.8028115, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82437444, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7285, + "time_per_iteration": 2.4441394805908203 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.01576853, + "balance_loss_mlp": 1.04086363, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.9937836479025883, + "language_loss": 0.75031531, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77174133, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7286, + "time_per_iteration": 2.539954423904419 + }, + { + "auxiliary_loss_clip": 0.01118753, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01642346, + "balance_loss_mlp": 1.04179096, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.7090844157721894, + "language_loss": 0.73834682, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75983447, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7287, + "time_per_iteration": 2.5056257247924805 + }, + { + "auxiliary_loss_clip": 0.01117808, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.02887869, + "balance_loss_mlp": 1.04187727, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.8325493621162303, + "language_loss": 0.82288051, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84448457, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7288, + "time_per_iteration": 2.4812850952148438 + }, + { + "auxiliary_loss_clip": 0.01117047, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02008474, + "balance_loss_mlp": 1.03895211, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.476666560822241, + "language_loss": 0.84346598, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86497366, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7289, + "time_per_iteration": 2.482379674911499 + }, + { + "auxiliary_loss_clip": 0.01119976, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.0235244, + "balance_loss_mlp": 1.04139173, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.4352131560569001, + "language_loss": 0.78107727, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80264366, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.78515625, + "step": 7290, + "time_per_iteration": 2.5521459579467773 + }, + { + "auxiliary_loss_clip": 0.01115969, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.02761197, + "balance_loss_mlp": 1.04235792, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 3.384239132873348, + "language_loss": 0.77987993, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80144495, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7291, + "time_per_iteration": 2.512519121170044 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.01792359, + "balance_loss_mlp": 1.04297888, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.5428221976657872, + "language_loss": 0.65224636, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67373765, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7292, + "time_per_iteration": 2.597714424133301 + }, + { + "auxiliary_loss_clip": 0.0111598, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.03967905, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.884679810356821, + "language_loss": 0.74216962, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76363981, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.76171875, + "step": 7293, + "time_per_iteration": 2.4943923950195312 + }, + { + "auxiliary_loss_clip": 0.01119197, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.02732337, + "balance_loss_mlp": 1.04433274, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.4110491255972684, + "language_loss": 0.78757977, + "learning_rate": 2.490156230192516e-06, + "loss": 0.8091805, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7294, + "time_per_iteration": 2.495358467102051 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.01041568, + "balance_loss_clip": 1.02864015, + "balance_loss_mlp": 1.04313052, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.7229696907351246, + "language_loss": 0.73184276, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.7534548, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7295, + "time_per_iteration": 2.4645302295684814 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.02573109, + "balance_loss_mlp": 1.042575, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.059865438640582, + "language_loss": 0.75337231, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77495956, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 7296, + "time_per_iteration": 2.46444034576416 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.01976418, + "balance_loss_mlp": 1.04255402, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.6034841999072227, + "language_loss": 0.69515687, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71666169, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7297, + "time_per_iteration": 2.4995949268341064 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.01913857, + "balance_loss_mlp": 1.04173827, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.494373898338378, + "language_loss": 0.70457232, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72604382, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7298, + "time_per_iteration": 2.574982166290283 + }, + { + "auxiliary_loss_clip": 0.01117164, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.01672888, + "balance_loss_mlp": 1.04384279, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.5912334767066174, + "language_loss": 0.7241621, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74562919, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7299, + "time_per_iteration": 2.539013385772705 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.03278041, + "balance_loss_mlp": 1.043944, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.8082969607549542, + "language_loss": 0.77112591, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79280752, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76953125, + "step": 7300, + "time_per_iteration": 2.567291259765625 + }, + { + "auxiliary_loss_clip": 0.0111673, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.04064155, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6241879676388415, + "language_loss": 0.70685148, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72839439, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7301, + "time_per_iteration": 2.497025489807129 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04512143, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.911748384222125, + "language_loss": 0.70491576, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72647995, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 7302, + "time_per_iteration": 2.5212793350219727 + }, + { + "auxiliary_loss_clip": 0.011184, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.02676439, + "balance_loss_mlp": 1.04383337, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.741042450815644, + "language_loss": 0.82304549, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84462643, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7303, + "time_per_iteration": 2.5407814979553223 + }, + { + "auxiliary_loss_clip": 0.01123737, + "auxiliary_loss_mlp": 0.01042372, + "balance_loss_clip": 1.02753651, + "balance_loss_mlp": 1.04429436, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.4492152950747412, + "language_loss": 0.68408841, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70574951, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 7304, + "time_per_iteration": 4.099287509918213 + }, + { + "auxiliary_loss_clip": 0.01116014, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.02631354, + "balance_loss_mlp": 1.04335666, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.4059546174528585, + "language_loss": 0.78115439, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80270815, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7305, + "time_per_iteration": 2.6079509258270264 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02376556, + "balance_loss_mlp": 1.04186165, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.688110038500655, + "language_loss": 0.68754542, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70908302, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7306, + "time_per_iteration": 2.4539954662323 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.0214076, + "balance_loss_mlp": 1.04369712, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.4603628541776523, + "language_loss": 0.6270709, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64862138, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 7307, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02001536, + "balance_loss_mlp": 1.04338455, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.9032558944481925, + "language_loss": 0.72409779, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74563944, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 7308, + "time_per_iteration": 2.4319982528686523 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02347863, + "balance_loss_mlp": 1.04077995, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.6404677903158766, + "language_loss": 0.76631165, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78788805, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7309, + "time_per_iteration": 2.5045857429504395 + }, + { + "auxiliary_loss_clip": 0.01115088, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.01943445, + "balance_loss_mlp": 1.04314303, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.788496009330223, + "language_loss": 0.70666951, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72814304, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7310, + "time_per_iteration": 2.4732789993286133 + }, + { + "auxiliary_loss_clip": 0.01120896, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.02293789, + "balance_loss_mlp": 1.04397106, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.1134854859852505, + "language_loss": 0.75800377, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77957869, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7311, + "time_per_iteration": 2.5372462272644043 + }, + { + "auxiliary_loss_clip": 0.01119727, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02171779, + "balance_loss_mlp": 1.04376173, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.9313159099964634, + "language_loss": 0.8127231, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83426595, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7312, + "time_per_iteration": 2.4858858585357666 + }, + { + "auxiliary_loss_clip": 0.01115776, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.02404332, + "balance_loss_mlp": 1.04030704, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.2005104401689177, + "language_loss": 0.85444236, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87597632, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7313, + "time_per_iteration": 2.493032932281494 + }, + { + "auxiliary_loss_clip": 0.01119815, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.02176809, + "balance_loss_mlp": 1.04182911, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 3.8587100296686145, + "language_loss": 0.67464912, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69619775, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7314, + "time_per_iteration": 2.4542195796966553 + }, + { + "auxiliary_loss_clip": 0.01121265, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02063417, + "balance_loss_mlp": 1.04389846, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.8025616803524547, + "language_loss": 0.76954508, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79110146, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7315, + "time_per_iteration": 2.4988253116607666 + }, + { + "auxiliary_loss_clip": 0.01117641, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01938725, + "balance_loss_mlp": 1.04280567, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.4575060004131895, + "language_loss": 0.74807358, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76957744, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7316, + "time_per_iteration": 2.530104398727417 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02928746, + "balance_loss_mlp": 1.04640257, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.8405076524150568, + "language_loss": 0.65180635, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67343318, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7317, + "time_per_iteration": 2.5233771800994873 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02082634, + "balance_loss_mlp": 1.04455566, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.7710834755986071, + "language_loss": 0.7968365, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.8183977, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7318, + "time_per_iteration": 2.4618961811065674 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.04423118, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.4932738321413537, + "language_loss": 0.79472506, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81632113, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7319, + "time_per_iteration": 2.5342819690704346 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02637124, + "balance_loss_mlp": 1.04102063, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.641668171652613, + "language_loss": 0.80221331, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82378966, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7320, + "time_per_iteration": 2.520888566970825 + }, + { + "auxiliary_loss_clip": 0.01116164, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02323556, + "balance_loss_mlp": 1.04136741, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.6986497736973376, + "language_loss": 0.69795078, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71947479, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7321, + "time_per_iteration": 2.5457892417907715 + }, + { + "auxiliary_loss_clip": 0.01039878, + "auxiliary_loss_mlp": 0.01008287, + "balance_loss_clip": 1.00702953, + "balance_loss_mlp": 1.01681685, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8741267032944617, + "language_loss": 0.56908953, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58957124, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.23046875, + "step": 7322, + "time_per_iteration": 3.164207935333252 + }, + { + "auxiliary_loss_clip": 0.01117179, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01813388, + "balance_loss_mlp": 1.04277694, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.4567737767029483, + "language_loss": 0.76075542, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78222406, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.74609375, + "step": 7323, + "time_per_iteration": 2.5279085636138916 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.02369034, + "balance_loss_mlp": 1.0409224, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.5548582319563429, + "language_loss": 0.8034448, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82499135, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7324, + "time_per_iteration": 2.4854304790496826 + }, + { + "auxiliary_loss_clip": 0.01039688, + "auxiliary_loss_mlp": 0.01006776, + "balance_loss_clip": 1.00549471, + "balance_loss_mlp": 1.01659369, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.66599266679982, + "language_loss": 0.54557002, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56603467, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.23144531, + "step": 7325, + "time_per_iteration": 3.081268787384033 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01792467, + "balance_loss_mlp": 1.04348588, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.5427042359768692, + "language_loss": 0.69823551, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71969926, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7326, + "time_per_iteration": 2.489088535308838 + }, + { + "auxiliary_loss_clip": 0.01112531, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.01634765, + "balance_loss_mlp": 1.03926969, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.4106900729498488, + "language_loss": 0.76410896, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78552604, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7327, + "time_per_iteration": 2.5099427700042725 + }, + { + "auxiliary_loss_clip": 0.01112963, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.04029953, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.92290278058118, + "language_loss": 0.83856362, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86001813, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7328, + "time_per_iteration": 2.453078269958496 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.01505983, + "balance_loss_mlp": 1.04100752, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.489103584507488, + "language_loss": 0.77842677, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79985875, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7329, + "time_per_iteration": 2.4908933639526367 + }, + { + "auxiliary_loss_clip": 0.01118689, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02069247, + "balance_loss_mlp": 1.04125428, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.7085588184823939, + "language_loss": 0.73343551, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75496078, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7734375, + "step": 7330, + "time_per_iteration": 2.463330030441284 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.01974368, + "balance_loss_mlp": 1.04176283, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.7732063146110093, + "language_loss": 0.74867487, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77015924, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7331, + "time_per_iteration": 2.5421340465545654 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797271, + "balance_loss_mlp": 1.03957462, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4577784912363292, + "language_loss": 0.76381409, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78523266, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.72265625, + "step": 7332, + "time_per_iteration": 2.5218567848205566 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02724767, + "balance_loss_mlp": 1.03985786, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.6787739774558346, + "language_loss": 0.7317301, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75326777, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.74609375, + "step": 7333, + "time_per_iteration": 2.4611384868621826 + }, + { + "auxiliary_loss_clip": 0.01113948, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.0218792, + "balance_loss_mlp": 1.04222834, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.7946296457229314, + "language_loss": 0.79795265, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81943679, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7334, + "time_per_iteration": 2.4846577644348145 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.04168534, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.170087212124324, + "language_loss": 0.7549156, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77655965, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 7335, + "time_per_iteration": 2.5086324214935303 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.02661777, + "balance_loss_mlp": 1.04259086, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.91450979477167, + "language_loss": 0.72583538, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74741697, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7336, + "time_per_iteration": 2.436680555343628 + }, + { + "auxiliary_loss_clip": 0.01118765, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.0251267, + "balance_loss_mlp": 1.04040349, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 9.267090991138677, + "language_loss": 0.62665188, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64823085, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7337, + "time_per_iteration": 2.484269618988037 + }, + { + "auxiliary_loss_clip": 0.01039049, + "auxiliary_loss_mlp": 0.0100578, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01618195, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7410103266773326, + "language_loss": 0.52670205, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54715037, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7338, + "time_per_iteration": 3.104921340942383 + }, + { + "auxiliary_loss_clip": 0.01120745, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.03303015, + "balance_loss_mlp": 1.04076958, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.7777015345810536, + "language_loss": 0.70687723, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7285586, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7339, + "time_per_iteration": 2.5172934532165527 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.02603626, + "balance_loss_mlp": 1.04237795, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.6032661325040427, + "language_loss": 0.69992614, + "learning_rate": 2.472767915429105e-06, + "loss": 0.7214449, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 7340, + "time_per_iteration": 2.4677066802978516 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01002043, + "balance_loss_clip": 1.00078511, + "balance_loss_mlp": 1.01463652, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8913600985584349, + "language_loss": 0.64017105, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66056681, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7341, + "time_per_iteration": 2.87821888923645 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.02473783, + "balance_loss_mlp": 1.04029536, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.415120536593597, + "language_loss": 0.73162079, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75314075, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7342, + "time_per_iteration": 2.6009373664855957 + }, + { + "auxiliary_loss_clip": 0.01114735, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.02390742, + "balance_loss_mlp": 1.03866804, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.834134484008718, + "language_loss": 0.7961756, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81770158, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7343, + "time_per_iteration": 2.5102362632751465 + }, + { + "auxiliary_loss_clip": 0.01113089, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.03901291, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.5507634652992637, + "language_loss": 0.76845753, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.789895, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7344, + "time_per_iteration": 2.517014741897583 + }, + { + "auxiliary_loss_clip": 0.01036094, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99887604, + "balance_loss_mlp": 1.01319945, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7920555871551813, + "language_loss": 0.63752162, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65788519, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.22949219, + "step": 7345, + "time_per_iteration": 7.267446517944336 + }, + { + "auxiliary_loss_clip": 0.01121083, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.02610314, + "balance_loss_mlp": 1.04385495, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 2.1109182100548596, + "language_loss": 0.86316586, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88477224, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7346, + "time_per_iteration": 2.5508806705474854 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.02691066, + "balance_loss_mlp": 1.04238844, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.947149735733886, + "language_loss": 0.8050105, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82659858, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7347, + "time_per_iteration": 2.474933624267578 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02367926, + "balance_loss_mlp": 1.04158723, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.6941368254206504, + "language_loss": 0.82639945, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84794509, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7348, + "time_per_iteration": 2.4525363445281982 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.02335548, + "balance_loss_mlp": 1.04179621, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.5736626646923677, + "language_loss": 0.7025882, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72415352, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7349, + "time_per_iteration": 2.511890172958374 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.02426577, + "balance_loss_mlp": 1.03973794, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6465526230005572, + "language_loss": 0.74427998, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76583976, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7350, + "time_per_iteration": 2.496570110321045 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02167273, + "balance_loss_mlp": 1.0410589, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 1.9521663807923895, + "language_loss": 0.80709779, + "learning_rate": 2.468604167463827e-06, + "loss": 0.8286112, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 7351, + "time_per_iteration": 2.432551860809326 + }, + { + "auxiliary_loss_clip": 0.01111348, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.02401161, + "balance_loss_mlp": 1.03947091, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.5082806208548023, + "language_loss": 0.73055673, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75202954, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 7352, + "time_per_iteration": 2.515235424041748 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.0184238, + "balance_loss_mlp": 1.04159904, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.8470037483547026, + "language_loss": 0.87457407, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89606094, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7353, + "time_per_iteration": 2.4880294799804688 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.02288198, + "balance_loss_mlp": 1.04091954, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0344010928875567, + "language_loss": 0.75522006, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.77674222, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7578125, + "step": 7354, + "time_per_iteration": 2.454554319381714 + }, + { + "auxiliary_loss_clip": 0.01112104, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.0235672, + "balance_loss_mlp": 1.03940272, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.7346650465528282, + "language_loss": 0.64754039, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66901928, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7355, + "time_per_iteration": 2.711973190307617 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.04187799, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.914030541413853, + "language_loss": 0.78126168, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80283082, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7356, + "time_per_iteration": 2.470214366912842 + }, + { + "auxiliary_loss_clip": 0.01118926, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.02317214, + "balance_loss_mlp": 1.0414896, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5192892311950144, + "language_loss": 0.7712661, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79282331, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7357, + "time_per_iteration": 2.461174249649048 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.02454567, + "balance_loss_mlp": 1.0424664, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.4937655647898813, + "language_loss": 0.73591524, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75747615, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7358, + "time_per_iteration": 2.556330919265747 + }, + { + "auxiliary_loss_clip": 0.01117067, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01871514, + "balance_loss_mlp": 1.0415349, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.6567493539100802, + "language_loss": 0.75616974, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77764809, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.75390625, + "step": 7359, + "time_per_iteration": 2.50827693939209 + }, + { + "auxiliary_loss_clip": 0.01116785, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01920068, + "balance_loss_mlp": 1.04107249, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.743382279224751, + "language_loss": 0.7001307, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72162896, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7360, + "time_per_iteration": 2.4941389560699463 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.0205518, + "balance_loss_mlp": 1.04113221, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.0593935576965996, + "language_loss": 0.69252694, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71403772, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7361, + "time_per_iteration": 2.4985222816467285 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.01840568, + "balance_loss_mlp": 1.04025078, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 3.464971296188532, + "language_loss": 0.82380062, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84528339, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7362, + "time_per_iteration": 2.5396664142608643 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.0216198, + "balance_loss_mlp": 1.0414443, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.6248096382426125, + "language_loss": 0.74421227, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76578033, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 7363, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01036428, + "auxiliary_loss_mlp": 0.01011165, + "balance_loss_clip": 1.00969243, + "balance_loss_mlp": 1.0129478, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6750552451063064, + "language_loss": 0.55668789, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57716382, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.234375, + "step": 7364, + "time_per_iteration": 3.1631510257720947 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01869917, + "balance_loss_mlp": 1.0388242, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.5647849634077904, + "language_loss": 0.74008644, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76153356, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7365, + "time_per_iteration": 2.5025317668914795 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.02544355, + "balance_loss_mlp": 1.0385282, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5168930353966135, + "language_loss": 0.74242592, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76394439, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7366, + "time_per_iteration": 2.4882071018218994 + }, + { + "auxiliary_loss_clip": 0.01117127, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.01911306, + "balance_loss_mlp": 1.04244351, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 1.7268166919008578, + "language_loss": 0.73934573, + "learning_rate": 2.46254397374245e-06, + "loss": 0.7608456, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 7367, + "time_per_iteration": 2.494215250015259 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.02484, + "balance_loss_mlp": 1.04093957, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.708386000191459, + "language_loss": 0.7409333, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76247275, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7368, + "time_per_iteration": 2.5647008419036865 + }, + { + "auxiliary_loss_clip": 0.01114523, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.01905274, + "balance_loss_mlp": 1.04091215, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.8689444780395545, + "language_loss": 0.79986328, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82132554, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7369, + "time_per_iteration": 2.4666872024536133 + }, + { + "auxiliary_loss_clip": 0.01112296, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01627517, + "balance_loss_mlp": 1.04060125, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.7167890006148945, + "language_loss": 0.72231519, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74372596, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7370, + "time_per_iteration": 2.5508570671081543 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.02021682, + "balance_loss_mlp": 1.03883541, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.7515847136682843, + "language_loss": 0.70318949, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72465694, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7371, + "time_per_iteration": 2.4617960453033447 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03891456, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.199744355071377, + "language_loss": 0.68163198, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70304221, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.72265625, + "step": 7372, + "time_per_iteration": 2.4743239879608154 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01702499, + "balance_loss_mlp": 1.03971767, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.696523180994532, + "language_loss": 0.83959508, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.86105639, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7373, + "time_per_iteration": 2.44077467918396 + }, + { + "auxiliary_loss_clip": 0.01038641, + "auxiliary_loss_mlp": 0.01004854, + "balance_loss_clip": 1.00331616, + "balance_loss_mlp": 1.01527071, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.8140024563186875, + "language_loss": 0.55299437, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57342935, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.234375, + "step": 7374, + "time_per_iteration": 3.1360692977905273 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.02858198, + "balance_loss_mlp": 1.04092741, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.2551701608050636, + "language_loss": 0.82651508, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84807646, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 7375, + "time_per_iteration": 2.4277329444885254 + }, + { + "auxiliary_loss_clip": 0.01116501, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.04118764, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.7856786314152562, + "language_loss": 0.83470213, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.85615796, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7376, + "time_per_iteration": 2.481781482696533 + }, + { + "auxiliary_loss_clip": 0.01114604, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.02043331, + "balance_loss_mlp": 1.04121447, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7657537697851593, + "language_loss": 0.77321744, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79469293, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7377, + "time_per_iteration": 2.4599812030792236 + }, + { + "auxiliary_loss_clip": 0.01112621, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.01692927, + "balance_loss_mlp": 1.04132032, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.8620341755948002, + "language_loss": 0.75641978, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77784032, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7378, + "time_per_iteration": 2.5178849697113037 + }, + { + "auxiliary_loss_clip": 0.01114317, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.02302647, + "balance_loss_mlp": 1.04010391, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.670150777415059, + "language_loss": 0.69005907, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71155864, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7379, + "time_per_iteration": 2.460470199584961 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.04134107, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.5614200394729, + "language_loss": 0.73110741, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75256622, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7380, + "time_per_iteration": 2.5134148597717285 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.0408597, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.5217984285789272, + "language_loss": 0.6470772, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66853309, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7381, + "time_per_iteration": 2.5547850131988525 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.02020693, + "balance_loss_mlp": 1.04110599, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.3862697145357394, + "language_loss": 0.8018291, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82332134, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 7382, + "time_per_iteration": 2.575241804122925 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.02631903, + "balance_loss_mlp": 1.04359293, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.657830016653087, + "language_loss": 0.65369737, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67527372, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7383, + "time_per_iteration": 2.530205726623535 + }, + { + "auxiliary_loss_clip": 0.01118822, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.01928902, + "balance_loss_mlp": 1.04226518, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 3.0329093562680023, + "language_loss": 0.75660288, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77811974, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7384, + "time_per_iteration": 2.5266385078430176 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02092242, + "balance_loss_mlp": 1.04284334, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.5666997146068944, + "language_loss": 0.81029254, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83182013, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7385, + "time_per_iteration": 2.4479992389678955 + }, + { + "auxiliary_loss_clip": 0.01116566, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02111292, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.6468061831775258, + "language_loss": 0.82127023, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84278667, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7386, + "time_per_iteration": 2.48417067527771 + }, + { + "auxiliary_loss_clip": 0.01120079, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.02320611, + "balance_loss_mlp": 1.04189587, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.953099317045194, + "language_loss": 0.69732893, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71890771, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7387, + "time_per_iteration": 5.494876146316528 + }, + { + "auxiliary_loss_clip": 0.01114673, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.02546382, + "balance_loss_mlp": 1.03957582, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.035383956259629, + "language_loss": 0.7170803, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73861378, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7388, + "time_per_iteration": 2.4271323680877686 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.01776195, + "balance_loss_mlp": 1.04137266, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.4848855642281624, + "language_loss": 0.6881609, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.70965117, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7389, + "time_per_iteration": 2.4847142696380615 + }, + { + "auxiliary_loss_clip": 0.01115516, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.04167664, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.0051609497188587, + "language_loss": 0.74621141, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76768672, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7390, + "time_per_iteration": 2.594834804534912 + }, + { + "auxiliary_loss_clip": 0.01116041, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.04296565, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.702415761973985, + "language_loss": 0.811364, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83285546, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7391, + "time_per_iteration": 2.4757862091064453 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02070808, + "balance_loss_mlp": 1.04341137, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.6224407429556025, + "language_loss": 0.73400211, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75551033, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7392, + "time_per_iteration": 2.423929214477539 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01705766, + "balance_loss_mlp": 1.03988051, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.5529830220947678, + "language_loss": 0.79523122, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81666124, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7393, + "time_per_iteration": 2.5162272453308105 + }, + { + "auxiliary_loss_clip": 0.01119885, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.02125716, + "balance_loss_mlp": 1.04248941, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.9165659224437794, + "language_loss": 0.8090415, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83058566, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7394, + "time_per_iteration": 2.5386714935302734 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.02772927, + "balance_loss_mlp": 1.04228508, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 3.6807348725160502, + "language_loss": 0.79471326, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81626076, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7395, + "time_per_iteration": 2.4668092727661133 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02162027, + "balance_loss_mlp": 1.04132056, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.800276006342892, + "language_loss": 0.68493867, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70642376, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7396, + "time_per_iteration": 2.463660717010498 + }, + { + "auxiliary_loss_clip": 0.01117407, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.01831245, + "balance_loss_mlp": 1.0412426, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.8246827609425533, + "language_loss": 0.81007254, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83155811, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.76171875, + "step": 7397, + "time_per_iteration": 2.4812188148498535 + }, + { + "auxiliary_loss_clip": 0.01116158, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.02078366, + "balance_loss_mlp": 1.04323518, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.7006854584246183, + "language_loss": 0.67145807, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69295466, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7398, + "time_per_iteration": 2.5075526237487793 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.02443874, + "balance_loss_mlp": 1.04204428, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.9000444103330927, + "language_loss": 0.69551516, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71702719, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 7399, + "time_per_iteration": 2.522737741470337 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.02465105, + "balance_loss_mlp": 1.0408442, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.713461165054691, + "language_loss": 0.7287724, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.75027299, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7400, + "time_per_iteration": 2.4633662700653076 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.01755965, + "balance_loss_mlp": 1.04038024, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.5216060200654076, + "language_loss": 0.85054708, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87198627, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7401, + "time_per_iteration": 2.5034339427948 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02213049, + "balance_loss_mlp": 1.04065824, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.696028331559664, + "language_loss": 0.83296156, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85441685, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7109375, + "step": 7402, + "time_per_iteration": 2.501981258392334 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01979768, + "balance_loss_mlp": 1.0420711, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.736524647333069, + "language_loss": 0.76953578, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.7910167, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7403, + "time_per_iteration": 2.4778058528900146 + }, + { + "auxiliary_loss_clip": 0.01038113, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00046086, + "balance_loss_mlp": 1.014925, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7475420058163609, + "language_loss": 0.60081208, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62121159, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.23242188, + "step": 7404, + "time_per_iteration": 3.0548532009124756 + }, + { + "auxiliary_loss_clip": 0.01118666, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.02225208, + "balance_loss_mlp": 1.04285121, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.6312624429793499, + "language_loss": 0.81696916, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.83850771, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7405, + "time_per_iteration": 2.474632978439331 + }, + { + "auxiliary_loss_clip": 0.0111153, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.01864958, + "balance_loss_mlp": 1.03843176, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.4258557139975254, + "language_loss": 0.74869186, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77011788, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 7406, + "time_per_iteration": 2.4767563343048096 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01837921, + "balance_loss_mlp": 1.03819203, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.5627122296340765, + "language_loss": 0.65510803, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67650282, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 7407, + "time_per_iteration": 2.5395827293395996 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.01691461, + "balance_loss_mlp": 1.04306138, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5061477696527659, + "language_loss": 0.67724633, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69872296, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75390625, + "step": 7408, + "time_per_iteration": 2.462306261062622 + }, + { + "auxiliary_loss_clip": 0.0111265, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01891828, + "balance_loss_mlp": 1.0386107, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4978343447976226, + "language_loss": 0.71923941, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74068785, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7409, + "time_per_iteration": 2.674224615097046 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.02100968, + "balance_loss_mlp": 1.03980279, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.031581575195052, + "language_loss": 0.64823419, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.66972494, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 7410, + "time_per_iteration": 2.524874687194824 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.0200448, + "balance_loss_mlp": 1.04309118, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.015615502497161, + "language_loss": 0.74042189, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76196671, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7411, + "time_per_iteration": 2.512510061264038 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01900446, + "balance_loss_mlp": 1.04189968, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.869475782048451, + "language_loss": 0.79242551, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81386662, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7412, + "time_per_iteration": 2.472858190536499 + }, + { + "auxiliary_loss_clip": 0.01114909, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02299762, + "balance_loss_mlp": 1.03920937, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 3.400478569187806, + "language_loss": 0.798675, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82017869, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7413, + "time_per_iteration": 2.4117238521575928 + }, + { + "auxiliary_loss_clip": 0.01112114, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.01601171, + "balance_loss_mlp": 1.04039168, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.7210919700182319, + "language_loss": 0.76510686, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7865088, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 7414, + "time_per_iteration": 2.460224151611328 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02064216, + "balance_loss_mlp": 1.04047227, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.4395051245379855, + "language_loss": 0.83344847, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85491699, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7415, + "time_per_iteration": 2.487433910369873 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02675223, + "balance_loss_mlp": 1.03786182, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.5295363489819147, + "language_loss": 0.84025514, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86175931, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7416, + "time_per_iteration": 2.4827380180358887 + }, + { + "auxiliary_loss_clip": 0.0111513, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01872349, + "balance_loss_mlp": 1.03937066, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.5840815969934987, + "language_loss": 0.8099134, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83138216, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7417, + "time_per_iteration": 2.48150897026062 + }, + { + "auxiliary_loss_clip": 0.01115498, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.02679276, + "balance_loss_mlp": 1.04055572, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.9543176040955477, + "language_loss": 0.81078619, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83233768, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7418, + "time_per_iteration": 2.489847421646118 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01647544, + "balance_loss_mlp": 1.04015303, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.586204851514133, + "language_loss": 0.77404898, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79548573, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7419, + "time_per_iteration": 2.497434377670288 + }, + { + "auxiliary_loss_clip": 0.01112333, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.02190208, + "balance_loss_mlp": 1.03983605, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.7862585645473121, + "language_loss": 0.72408056, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74554545, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 7420, + "time_per_iteration": 2.459458351135254 + }, + { + "auxiliary_loss_clip": 0.01113499, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.0166688, + "balance_loss_mlp": 1.0416131, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.6779849239209732, + "language_loss": 0.75009704, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77153254, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 7421, + "time_per_iteration": 2.51987624168396 + }, + { + "auxiliary_loss_clip": 0.01110345, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04095602, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.9054244397804427, + "language_loss": 0.76410532, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78553158, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 7422, + "time_per_iteration": 2.4755024909973145 + }, + { + "auxiliary_loss_clip": 0.0111206, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.01975894, + "balance_loss_mlp": 1.03931499, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.4448000656244153, + "language_loss": 0.65126681, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6727066, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7423, + "time_per_iteration": 2.4828243255615234 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.01719534, + "balance_loss_mlp": 1.04027271, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.510185037273786, + "language_loss": 0.78842837, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.80981761, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 7424, + "time_per_iteration": 2.4399938583374023 + }, + { + "auxiliary_loss_clip": 0.01111318, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.02010214, + "balance_loss_mlp": 1.04070699, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.3563203456934205, + "language_loss": 0.80225039, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82368374, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 7425, + "time_per_iteration": 2.5406088829040527 + }, + { + "auxiliary_loss_clip": 0.01111697, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.0190568, + "balance_loss_mlp": 1.04027843, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 2.6114514678489895, + "language_loss": 0.77294517, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79437709, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7426, + "time_per_iteration": 2.4845876693725586 + }, + { + "auxiliary_loss_clip": 0.01112123, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.01636636, + "balance_loss_mlp": 1.03881311, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.552934875151276, + "language_loss": 0.64668226, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66808361, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.734375, + "step": 7427, + "time_per_iteration": 2.540630340576172 + }, + { + "auxiliary_loss_clip": 0.01116984, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.02009046, + "balance_loss_mlp": 1.04497719, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.583763048167789, + "language_loss": 0.75103819, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77252889, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7428, + "time_per_iteration": 3.8718421459198 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.03955674, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.8476152433667956, + "language_loss": 0.77595931, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79740107, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7429, + "time_per_iteration": 5.381062984466553 + }, + { + "auxiliary_loss_clip": 0.01114674, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.01935029, + "balance_loss_mlp": 1.04038692, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.5332211966047418, + "language_loss": 0.91229695, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93376398, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7430, + "time_per_iteration": 2.4677700996398926 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02439737, + "balance_loss_mlp": 1.04052413, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.5443417480404311, + "language_loss": 0.79630744, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81785798, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7431, + "time_per_iteration": 2.567082405090332 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02438116, + "balance_loss_mlp": 1.04187393, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 2.0676923701008807, + "language_loss": 0.80376756, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82531446, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7432, + "time_per_iteration": 2.4359145164489746 + }, + { + "auxiliary_loss_clip": 0.01115042, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.02394009, + "balance_loss_mlp": 1.03957176, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.022128912320156, + "language_loss": 0.76601076, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78752482, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.75390625, + "step": 7433, + "time_per_iteration": 2.48732852935791 + }, + { + "auxiliary_loss_clip": 0.0110862, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.0233326, + "balance_loss_mlp": 1.03873658, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.6660023236153727, + "language_loss": 0.7773807, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79880381, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.69921875, + "step": 7434, + "time_per_iteration": 2.501410961151123 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.02160966, + "balance_loss_mlp": 1.04261708, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6324454169441744, + "language_loss": 0.64255738, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66406941, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7435, + "time_per_iteration": 2.506918430328369 + }, + { + "auxiliary_loss_clip": 0.01116438, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.01706398, + "balance_loss_mlp": 1.04181314, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.4705490989927619, + "language_loss": 0.83558768, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.8570472, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7436, + "time_per_iteration": 2.482273817062378 + }, + { + "auxiliary_loss_clip": 0.01114793, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02067423, + "balance_loss_mlp": 1.0400939, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.6782401052542175, + "language_loss": 0.79564971, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81713653, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7437, + "time_per_iteration": 2.519118309020996 + }, + { + "auxiliary_loss_clip": 0.01114275, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.03965664, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.5877629147247494, + "language_loss": 0.71921134, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74067998, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7438, + "time_per_iteration": 2.4918689727783203 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02114832, + "balance_loss_mlp": 1.03908634, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.558408845854645, + "language_loss": 0.67469549, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.6961813, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7439, + "time_per_iteration": 2.549445390701294 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04164815, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.6525243551580215, + "language_loss": 0.73600596, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.7575227, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7440, + "time_per_iteration": 2.487545967102051 + }, + { + "auxiliary_loss_clip": 0.01112285, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02215195, + "balance_loss_mlp": 1.03937638, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.5916362290459067, + "language_loss": 0.74376386, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76522732, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73046875, + "step": 7441, + "time_per_iteration": 2.537848472595215 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04112506, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 2.062950208020596, + "language_loss": 0.74780977, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.769364, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7442, + "time_per_iteration": 2.45829701423645 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.03977489, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.7358505546612006, + "language_loss": 0.7456758, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76718801, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7443, + "time_per_iteration": 2.604759931564331 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.01500916, + "balance_loss_mlp": 1.0379262, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.8898561004653542, + "language_loss": 0.77591091, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79730821, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7444, + "time_per_iteration": 2.5373945236206055 + }, + { + "auxiliary_loss_clip": 0.01110179, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.01813924, + "balance_loss_mlp": 1.03841698, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.3020631966175893, + "language_loss": 0.85495317, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87636125, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7445, + "time_per_iteration": 2.4707260131835938 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01741982, + "balance_loss_mlp": 1.04191256, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 3.672789877680737, + "language_loss": 0.64349431, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66496813, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7446, + "time_per_iteration": 2.4802255630493164 + }, + { + "auxiliary_loss_clip": 0.0103814, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.00128329, + "balance_loss_mlp": 1.01421368, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7477421339074387, + "language_loss": 0.50242257, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52283025, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.24023438, + "step": 7447, + "time_per_iteration": 2.9262073040008545 + }, + { + "auxiliary_loss_clip": 0.01037975, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 1.00088537, + "balance_loss_mlp": 1.01407075, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7583700928831021, + "language_loss": 0.59290731, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61330867, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.23925781, + "step": 7448, + "time_per_iteration": 3.2298059463500977 + }, + { + "auxiliary_loss_clip": 0.01112419, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.02082074, + "balance_loss_mlp": 1.03913987, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.4697324100578784, + "language_loss": 0.59226847, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61372101, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7449, + "time_per_iteration": 2.667651891708374 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01638436, + "balance_loss_mlp": 1.04082561, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.6912833904949394, + "language_loss": 0.79799938, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8194316, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 7450, + "time_per_iteration": 2.488041400909424 + }, + { + "auxiliary_loss_clip": 0.01112446, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.01900911, + "balance_loss_mlp": 1.03948057, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.443005371711525, + "language_loss": 0.79474008, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81618094, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 7451, + "time_per_iteration": 2.4184346199035645 + }, + { + "auxiliary_loss_clip": 0.01037194, + "auxiliary_loss_mlp": 0.01000693, + "balance_loss_clip": 0.99944174, + "balance_loss_mlp": 1.01323009, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 2.1611139577707608, + "language_loss": 0.62848771, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64886659, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24023438, + "step": 7452, + "time_per_iteration": 3.1637966632843018 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04087877, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.7752989444397396, + "language_loss": 0.62657529, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64809442, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7453, + "time_per_iteration": 2.4473493099212646 + }, + { + "auxiliary_loss_clip": 0.01036714, + "auxiliary_loss_mlp": 0.01000711, + "balance_loss_clip": 0.99935836, + "balance_loss_mlp": 1.01265335, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7532005340797263, + "language_loss": 0.57028639, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59066069, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.0135498, + "router_z_loss_mlp": 0.24023438, + "step": 7454, + "time_per_iteration": 2.9524526596069336 + }, + { + "auxiliary_loss_clip": 0.01111502, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.01563811, + "balance_loss_mlp": 1.03850055, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.2509965352428334, + "language_loss": 0.75078607, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7721771, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7455, + "time_per_iteration": 2.4103891849517822 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.01761508, + "balance_loss_mlp": 1.03976846, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.6579032105665654, + "language_loss": 0.76428723, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78571379, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.734375, + "step": 7456, + "time_per_iteration": 2.5631935596466064 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.02398849, + "balance_loss_mlp": 1.04312015, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.9831255862845865, + "language_loss": 0.76475745, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78626615, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.71875, + "step": 7457, + "time_per_iteration": 2.464808702468872 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01611805, + "balance_loss_mlp": 1.03910387, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9767465188311044, + "language_loss": 0.67705971, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.69848609, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7458, + "time_per_iteration": 2.4457101821899414 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.04051626, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.5619796593676711, + "language_loss": 0.72202468, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74350572, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7459, + "time_per_iteration": 2.433029890060425 + }, + { + "auxiliary_loss_clip": 0.0110945, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01676071, + "balance_loss_mlp": 1.03716815, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.8000530949283695, + "language_loss": 0.69520539, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71659082, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7460, + "time_per_iteration": 2.4872210025787354 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02144051, + "balance_loss_mlp": 1.03848231, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9864484577730697, + "language_loss": 0.77204525, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79350454, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7461, + "time_per_iteration": 2.455543279647827 + }, + { + "auxiliary_loss_clip": 0.01111999, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0180943, + "balance_loss_mlp": 1.03780031, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7106561387980361, + "language_loss": 0.67983574, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70125341, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7421875, + "step": 7462, + "time_per_iteration": 2.5366299152374268 + }, + { + "auxiliary_loss_clip": 0.01034999, + "auxiliary_loss_mlp": 0.01002003, + "balance_loss_clip": 1.00071561, + "balance_loss_mlp": 1.01134682, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7463947253576576, + "language_loss": 0.54503644, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56540644, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.23632812, + "step": 7463, + "time_per_iteration": 3.0639255046844482 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.01699638, + "balance_loss_mlp": 1.03847826, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.9527582175804243, + "language_loss": 0.75866246, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78006899, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7464, + "time_per_iteration": 2.5135347843170166 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02225554, + "balance_loss_mlp": 1.03903246, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.8117694427226085, + "language_loss": 0.73671377, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75814927, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 7465, + "time_per_iteration": 2.433394432067871 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04127038, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 1.824586312100338, + "language_loss": 0.7996276, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82117152, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7466, + "time_per_iteration": 2.5013458728790283 + }, + { + "auxiliary_loss_clip": 0.01114545, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.01967633, + "balance_loss_mlp": 1.04118383, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.612382799524426, + "language_loss": 0.80522013, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82668447, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7467, + "time_per_iteration": 2.4517929553985596 + }, + { + "auxiliary_loss_clip": 0.01109457, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03988719, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.7208509955346651, + "language_loss": 0.75153285, + "learning_rate": 2.424187775642129e-06, + "loss": 0.7729429, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7468, + "time_per_iteration": 2.4585771560668945 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01025298, + "balance_loss_clip": 1.01422918, + "balance_loss_mlp": 1.04034877, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.8721286685005696, + "language_loss": 0.7099303, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73130596, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.71875, + "step": 7469, + "time_per_iteration": 2.420208692550659 + }, + { + "auxiliary_loss_clip": 0.01114048, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.02298415, + "balance_loss_mlp": 1.04046845, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7828692415308351, + "language_loss": 0.71891844, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74041635, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7470, + "time_per_iteration": 5.381145477294922 + }, + { + "auxiliary_loss_clip": 0.01112344, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.01940536, + "balance_loss_mlp": 1.03871441, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.1026178485463274, + "language_loss": 0.76912111, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79056853, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7471, + "time_per_iteration": 3.925541400909424 + }, + { + "auxiliary_loss_clip": 0.01113353, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.02128363, + "balance_loss_mlp": 1.04100883, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.8719894830330126, + "language_loss": 0.70339048, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72485489, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7472, + "time_per_iteration": 2.5138602256774902 + }, + { + "auxiliary_loss_clip": 0.01038244, + "auxiliary_loss_mlp": 0.01015151, + "balance_loss_clip": 1.01388156, + "balance_loss_mlp": 1.01404762, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7429949026472541, + "language_loss": 0.61734539, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63787931, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2421875, + "step": 7473, + "time_per_iteration": 3.0049262046813965 + }, + { + "auxiliary_loss_clip": 0.01114767, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.02495253, + "balance_loss_mlp": 1.04087818, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.4001000632965828, + "language_loss": 0.78185022, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80337679, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7474, + "time_per_iteration": 2.4396324157714844 + }, + { + "auxiliary_loss_clip": 0.01110455, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.0225265, + "balance_loss_mlp": 1.04009926, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.704620828516005, + "language_loss": 0.72103465, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74248827, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7475, + "time_per_iteration": 2.464167356491089 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01847768, + "balance_loss_mlp": 1.03917694, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.7869016250475191, + "language_loss": 0.76343799, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.7848621, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.73046875, + "step": 7476, + "time_per_iteration": 2.529374837875366 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.02194357, + "balance_loss_mlp": 1.04036331, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.3312494175836034, + "language_loss": 0.71774453, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73927242, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7477, + "time_per_iteration": 2.4914534091949463 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01757061, + "balance_loss_mlp": 1.04089749, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.2338487326584073, + "language_loss": 0.68136394, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70283794, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7478, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.01112091, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04130244, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8288012816153718, + "language_loss": 0.89528286, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91673213, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 7479, + "time_per_iteration": 2.4738242626190186 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01970804, + "balance_loss_mlp": 1.0423162, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.1133613410879155, + "language_loss": 0.75824946, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77972436, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7480, + "time_per_iteration": 2.536190986633301 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.01906347, + "balance_loss_mlp": 1.04211199, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.1813635775429794, + "language_loss": 0.80066407, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82214987, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7481, + "time_per_iteration": 2.4618031978607178 + }, + { + "auxiliary_loss_clip": 0.01110042, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01809597, + "balance_loss_mlp": 1.04028749, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.5995355023246276, + "language_loss": 0.68636084, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70776993, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 7482, + "time_per_iteration": 2.5711851119995117 + }, + { + "auxiliary_loss_clip": 0.0111451, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.02260911, + "balance_loss_mlp": 1.04059076, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0339843826279504, + "language_loss": 0.84802616, + "learning_rate": 2.418476956872571e-06, + "loss": 0.86952293, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7483, + "time_per_iteration": 2.4510746002197266 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02177286, + "balance_loss_mlp": 1.04386485, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8187080510096723, + "language_loss": 0.80409968, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82564819, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.765625, + "step": 7484, + "time_per_iteration": 2.539834976196289 + }, + { + "auxiliary_loss_clip": 0.01118118, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01529098, + "balance_loss_mlp": 1.03992271, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.310143901315373, + "language_loss": 0.75594473, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77741385, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 7485, + "time_per_iteration": 2.408979892730713 + }, + { + "auxiliary_loss_clip": 0.01041505, + "auxiliary_loss_mlp": 0.01002218, + "balance_loss_clip": 1.00065601, + "balance_loss_mlp": 1.0170331, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7895891566174408, + "language_loss": 0.5867179, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60715508, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.24511719, + "step": 7486, + "time_per_iteration": 3.09049654006958 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.0184797, + "balance_loss_mlp": 1.04104531, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.266854053846726, + "language_loss": 0.83153397, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85298264, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7487, + "time_per_iteration": 2.431209087371826 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.01626313, + "balance_loss_mlp": 1.04103804, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5035728003068896, + "language_loss": 0.77055335, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79197478, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7488, + "time_per_iteration": 2.5085837841033936 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02446008, + "balance_loss_mlp": 1.04378915, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.6401168824150574, + "language_loss": 0.71564645, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73724437, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7489, + "time_per_iteration": 2.5106120109558105 + }, + { + "auxiliary_loss_clip": 0.01119744, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.01856422, + "balance_loss_mlp": 1.04424906, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.1685657644370853, + "language_loss": 0.6962117, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71773469, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 7490, + "time_per_iteration": 2.4383597373962402 + }, + { + "auxiliary_loss_clip": 0.01038961, + "auxiliary_loss_mlp": 0.01000463, + "balance_loss_clip": 0.99907476, + "balance_loss_mlp": 1.01472032, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 1.805652104877531, + "language_loss": 0.56691748, + "learning_rate": 2.415429723843495e-06, + "loss": 0.5873118, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.2421875, + "step": 7491, + "time_per_iteration": 3.0662994384765625 + }, + { + "auxiliary_loss_clip": 0.01111025, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.01719177, + "balance_loss_mlp": 1.03987265, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.5869212574214921, + "language_loss": 0.79462028, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81602901, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7492, + "time_per_iteration": 2.497849464416504 + }, + { + "auxiliary_loss_clip": 0.01119638, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.02474022, + "balance_loss_mlp": 1.04271042, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.074371460837293, + "language_loss": 0.92560953, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.9471873, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7493, + "time_per_iteration": 2.4717981815338135 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01000993, + "balance_loss_clip": 0.99946707, + "balance_loss_mlp": 1.01443267, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.8118074327791402, + "language_loss": 0.62908041, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64948046, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.24609375, + "step": 7494, + "time_per_iteration": 3.1021509170532227 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02063334, + "balance_loss_mlp": 1.04122376, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4599772474200656, + "language_loss": 0.81980979, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.8412739, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 7495, + "time_per_iteration": 2.528707981109619 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.01793659, + "balance_loss_mlp": 1.04069221, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.6718702145442927, + "language_loss": 0.85639864, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87785244, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7496, + "time_per_iteration": 2.5862984657287598 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.01913798, + "balance_loss_mlp": 1.04234052, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.117680053603533, + "language_loss": 0.76342994, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78490651, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7497, + "time_per_iteration": 2.4831669330596924 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01798773, + "balance_loss_mlp": 1.03939152, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.971687057549937, + "language_loss": 0.75124824, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77270365, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7498, + "time_per_iteration": 2.4243438243865967 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02853799, + "balance_loss_mlp": 1.04190993, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.8265166276024245, + "language_loss": 0.70487583, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72645926, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7499, + "time_per_iteration": 2.496595859527588 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.02090549, + "balance_loss_mlp": 1.04258835, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.819855114084185, + "language_loss": 0.76870257, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79022616, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7500, + "time_per_iteration": 2.4659407138824463 + }, + { + "auxiliary_loss_clip": 0.01114886, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.01943755, + "balance_loss_mlp": 1.04146719, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.7705256698152247, + "language_loss": 0.62966442, + "learning_rate": 2.411619265641992e-06, + "loss": 0.6511355, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7501, + "time_per_iteration": 2.474149703979492 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.02093208, + "balance_loss_mlp": 1.04161, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9049764473951474, + "language_loss": 0.84758866, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86910677, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7502, + "time_per_iteration": 2.419093370437622 + }, + { + "auxiliary_loss_clip": 0.01111337, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.02135682, + "balance_loss_mlp": 1.04026246, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.4187712379612754, + "language_loss": 0.79906255, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8205111, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 7503, + "time_per_iteration": 2.536954164505005 + }, + { + "auxiliary_loss_clip": 0.01112743, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02458, + "balance_loss_mlp": 1.04287815, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 3.706114905397956, + "language_loss": 0.80931562, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83081251, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 7504, + "time_per_iteration": 2.4356000423431396 + }, + { + "auxiliary_loss_clip": 0.01112245, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.02284479, + "balance_loss_mlp": 1.04033744, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 5.269565558405545, + "language_loss": 0.63377774, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.6552459, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71875, + "step": 7505, + "time_per_iteration": 2.4934160709381104 + }, + { + "auxiliary_loss_clip": 0.01036723, + "auxiliary_loss_mlp": 0.0101133, + "balance_loss_clip": 1.00969648, + "balance_loss_mlp": 1.01246166, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8504866778221882, + "language_loss": 0.5887711, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60925162, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2421875, + "step": 7506, + "time_per_iteration": 3.1150898933410645 + }, + { + "auxiliary_loss_clip": 0.01112738, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.02087879, + "balance_loss_mlp": 1.04194486, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.6347442617822043, + "language_loss": 0.79238498, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81385183, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7507, + "time_per_iteration": 2.484036684036255 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.019871, + "balance_loss_mlp": 1.04084098, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5108356171854629, + "language_loss": 0.7397756, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76126289, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7508, + "time_per_iteration": 2.4958505630493164 + }, + { + "auxiliary_loss_clip": 0.01112961, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.02423549, + "balance_loss_mlp": 1.04263186, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.9053667394121476, + "language_loss": 0.78955048, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81104517, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 7509, + "time_per_iteration": 2.4640209674835205 + }, + { + "auxiliary_loss_clip": 0.01114289, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01759398, + "balance_loss_mlp": 1.0420239, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.8944319049742213, + "language_loss": 0.73495883, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75640076, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7510, + "time_per_iteration": 2.462289810180664 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01856017, + "balance_loss_mlp": 1.04091644, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.9974195471898801, + "language_loss": 0.77053016, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79200888, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7511, + "time_per_iteration": 2.5831305980682373 + }, + { + "auxiliary_loss_clip": 0.01114808, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.01937711, + "balance_loss_mlp": 1.04086745, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.734048899080759, + "language_loss": 0.79124206, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.81271791, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7512, + "time_per_iteration": 6.862476587295532 + }, + { + "auxiliary_loss_clip": 0.01118735, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.02435863, + "balance_loss_mlp": 1.04064548, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 1.9681233127218394, + "language_loss": 0.87461096, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89617801, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7513, + "time_per_iteration": 2.5551092624664307 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01893246, + "balance_loss_mlp": 1.0379355, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.6638824980939535, + "language_loss": 0.67135286, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69271272, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 7514, + "time_per_iteration": 2.4804775714874268 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.01448536, + "balance_loss_mlp": 1.04221404, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.644844833078513, + "language_loss": 0.69455916, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71601617, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.734375, + "step": 7515, + "time_per_iteration": 2.530089855194092 + }, + { + "auxiliary_loss_clip": 0.01117096, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.02060795, + "balance_loss_mlp": 1.04084945, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.154684023631233, + "language_loss": 0.81658673, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83810514, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7516, + "time_per_iteration": 2.405810832977295 + }, + { + "auxiliary_loss_clip": 0.01111826, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.01940227, + "balance_loss_mlp": 1.04099917, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.5513632113186169, + "language_loss": 0.65810448, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.6795482, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 7517, + "time_per_iteration": 2.487539768218994 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.04066491, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.7604175245242084, + "language_loss": 0.63401121, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65539253, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 7518, + "time_per_iteration": 2.4280178546905518 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.02124858, + "balance_loss_mlp": 1.04022479, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4125127095428567, + "language_loss": 0.59552354, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61698353, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7519, + "time_per_iteration": 2.706774950027466 + }, + { + "auxiliary_loss_clip": 0.01114162, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.02404702, + "balance_loss_mlp": 1.04053855, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.3128892020538214, + "language_loss": 0.72288704, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74439663, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7520, + "time_per_iteration": 2.4802541732788086 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.0250659, + "balance_loss_mlp": 1.04033482, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.8726393810843218, + "language_loss": 0.75520414, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77671039, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7521, + "time_per_iteration": 2.4384777545928955 + }, + { + "auxiliary_loss_clip": 0.0111833, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.022416, + "balance_loss_mlp": 1.04222465, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.6736116772601735, + "language_loss": 0.67521721, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69675779, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7522, + "time_per_iteration": 2.4317188262939453 + }, + { + "auxiliary_loss_clip": 0.01109922, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.03857231, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.5002177443666298, + "language_loss": 0.60627949, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62771761, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 7523, + "time_per_iteration": 2.5312907695770264 + }, + { + "auxiliary_loss_clip": 0.01116524, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02556825, + "balance_loss_mlp": 1.0399549, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.0404967948828796, + "language_loss": 0.78325248, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80481124, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7524, + "time_per_iteration": 2.4078996181488037 + }, + { + "auxiliary_loss_clip": 0.01111336, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02216387, + "balance_loss_mlp": 1.03912878, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.9789251534337415, + "language_loss": 0.63518596, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65664744, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 7525, + "time_per_iteration": 2.503176212310791 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02262783, + "balance_loss_mlp": 1.04040241, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5288172547930599, + "language_loss": 0.79163349, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8131057, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7526, + "time_per_iteration": 2.4558780193328857 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.6413449131819307, + "language_loss": 0.80729342, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82871962, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 7527, + "time_per_iteration": 2.470186948776245 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.01566291, + "balance_loss_mlp": 1.039428, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.450835161887395, + "language_loss": 0.65505683, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67645425, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7528, + "time_per_iteration": 2.541700601577759 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02031136, + "balance_loss_mlp": 1.03976476, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.6649436204324595, + "language_loss": 0.7542727, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.7757026, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7529, + "time_per_iteration": 2.5726876258850098 + }, + { + "auxiliary_loss_clip": 0.01112607, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.01853299, + "balance_loss_mlp": 1.03971684, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.7825780716691442, + "language_loss": 0.73193467, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75336862, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73046875, + "step": 7530, + "time_per_iteration": 2.4584052562713623 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02193975, + "balance_loss_mlp": 1.04003453, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6012488985464985, + "language_loss": 0.75947326, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78094089, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.734375, + "step": 7531, + "time_per_iteration": 2.484959363937378 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01034859, + "balance_loss_clip": 1.02182305, + "balance_loss_mlp": 1.0382148, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4359815558452909, + "language_loss": 0.66874713, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69017947, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7532, + "time_per_iteration": 2.486598253250122 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.02322936, + "balance_loss_mlp": 1.04091084, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.0450394734969874, + "language_loss": 0.78902352, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81049943, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 7533, + "time_per_iteration": 2.4407958984375 + }, + { + "auxiliary_loss_clip": 0.01115719, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.02032459, + "balance_loss_mlp": 1.03807485, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.646532255034537, + "language_loss": 0.83279264, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85429263, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7534, + "time_per_iteration": 2.430670976638794 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.02148068, + "balance_loss_mlp": 1.03927064, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.4654832124358697, + "language_loss": 0.76578003, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78726631, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7535, + "time_per_iteration": 2.4744579792022705 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01957679, + "balance_loss_mlp": 1.03883696, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5977579258117844, + "language_loss": 0.80234635, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82375443, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 7536, + "time_per_iteration": 2.4481444358825684 + }, + { + "auxiliary_loss_clip": 0.01111518, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.0173198, + "balance_loss_mlp": 1.03711987, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.0610118763249536, + "language_loss": 0.75895774, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78037184, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7537, + "time_per_iteration": 2.430119276046753 + }, + { + "auxiliary_loss_clip": 0.01115071, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.02058339, + "balance_loss_mlp": 1.04172075, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.095176663386117, + "language_loss": 0.76420474, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78567952, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.734375, + "step": 7538, + "time_per_iteration": 2.4675159454345703 + }, + { + "auxiliary_loss_clip": 0.01041439, + "auxiliary_loss_mlp": 0.0100041, + "balance_loss_clip": 0.99908096, + "balance_loss_mlp": 1.01700771, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7965700347609973, + "language_loss": 0.62345123, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64386964, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24414062, + "step": 7539, + "time_per_iteration": 3.0961101055145264 + }, + { + "auxiliary_loss_clip": 0.01112571, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.02466285, + "balance_loss_mlp": 1.04064226, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.8102149318529874, + "language_loss": 0.65997463, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68146718, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7540, + "time_per_iteration": 2.418170928955078 + }, + { + "auxiliary_loss_clip": 0.01118532, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.02721667, + "balance_loss_mlp": 1.04177594, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.6922846601909878, + "language_loss": 0.84666622, + "learning_rate": 2.396361968778424e-06, + "loss": 0.86825818, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7541, + "time_per_iteration": 2.4960954189300537 + }, + { + "auxiliary_loss_clip": 0.01113117, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01888943, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7180151747286094, + "language_loss": 0.76435781, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78580016, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7542, + "time_per_iteration": 2.574286937713623 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.01687872, + "balance_loss_mlp": 1.04101157, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.4680148354813627, + "language_loss": 0.80267954, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82412398, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7543, + "time_per_iteration": 2.5228359699249268 + }, + { + "auxiliary_loss_clip": 0.01115681, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.02517343, + "balance_loss_mlp": 1.04107285, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.6471991367559184, + "language_loss": 0.75933033, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78086591, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7544, + "time_per_iteration": 2.4976110458374023 + }, + { + "auxiliary_loss_clip": 0.01117877, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02154779, + "balance_loss_mlp": 1.04304671, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.8438932042246456, + "language_loss": 0.75447458, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77599108, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75, + "step": 7545, + "time_per_iteration": 2.5022737979888916 + }, + { + "auxiliary_loss_clip": 0.01114305, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.01697659, + "balance_loss_mlp": 1.04100811, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5497429650402368, + "language_loss": 0.7210325, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74247307, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7546, + "time_per_iteration": 2.5246150493621826 + }, + { + "auxiliary_loss_clip": 0.01118375, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.01857507, + "balance_loss_mlp": 1.04212511, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.558937793954525, + "language_loss": 0.7557559, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77726084, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7547, + "time_per_iteration": 2.4949920177459717 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02041912, + "balance_loss_mlp": 1.04200041, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.0285954992459865, + "language_loss": 0.69878972, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72029251, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7548, + "time_per_iteration": 2.4486818313598633 + }, + { + "auxiliary_loss_clip": 0.01114191, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.04018688, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.0627316040888117, + "language_loss": 0.72691673, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74846196, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7549, + "time_per_iteration": 2.509470224380493 + }, + { + "auxiliary_loss_clip": 0.01112378, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01698172, + "balance_loss_mlp": 1.04035378, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.7136809619022837, + "language_loss": 0.65253317, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67394793, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7550, + "time_per_iteration": 2.5133440494537354 + }, + { + "auxiliary_loss_clip": 0.01113494, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.0250591, + "balance_loss_mlp": 1.04179323, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6025854653449239, + "language_loss": 0.68823695, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70974535, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 7551, + "time_per_iteration": 2.5188024044036865 + }, + { + "auxiliary_loss_clip": 0.01113711, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.02022541, + "balance_loss_mlp": 1.03923821, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.6542843637965088, + "language_loss": 0.79214859, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81361675, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7552, + "time_per_iteration": 2.4087183475494385 + }, + { + "auxiliary_loss_clip": 0.01039804, + "auxiliary_loss_mlp": 0.01010172, + "balance_loss_clip": 1.00893259, + "balance_loss_mlp": 1.01586497, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8232859688183145, + "language_loss": 0.57765305, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59815282, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.24023438, + "step": 7553, + "time_per_iteration": 4.437517881393433 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02388608, + "balance_loss_mlp": 1.03907371, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3573100009257986, + "language_loss": 0.76541936, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78688413, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71875, + "step": 7554, + "time_per_iteration": 5.404860258102417 + }, + { + "auxiliary_loss_clip": 0.01116899, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.01859498, + "balance_loss_mlp": 1.04073453, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.6663912268828156, + "language_loss": 0.77148789, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79297936, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 7555, + "time_per_iteration": 2.5254242420196533 + }, + { + "auxiliary_loss_clip": 0.01111282, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.268885764239303, + "language_loss": 0.72658741, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74803221, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7556, + "time_per_iteration": 2.5096001625061035 + }, + { + "auxiliary_loss_clip": 0.01117527, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01987422, + "balance_loss_mlp": 1.0412432, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.9256457801142723, + "language_loss": 0.63244998, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65395546, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 7557, + "time_per_iteration": 2.489269495010376 + }, + { + "auxiliary_loss_clip": 0.010384, + "auxiliary_loss_mlp": 0.01000398, + "balance_loss_clip": 0.99909872, + "balance_loss_mlp": 1.01432419, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6891763329400619, + "language_loss": 0.57655525, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59694326, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24023438, + "step": 7558, + "time_per_iteration": 2.9631850719451904 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.02106977, + "balance_loss_mlp": 1.04180217, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.9054431891281847, + "language_loss": 0.56152129, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58304584, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7559, + "time_per_iteration": 2.4718172550201416 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.02320707, + "balance_loss_mlp": 1.04311991, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 2.1225715432080863, + "language_loss": 0.72038132, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74190605, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7560, + "time_per_iteration": 2.4289052486419678 + }, + { + "auxiliary_loss_clip": 0.01118313, + "auxiliary_loss_mlp": 0.01032424, + "balance_loss_clip": 1.01870942, + "balance_loss_mlp": 1.04184937, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 1.8567895139214563, + "language_loss": 0.68786752, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70937485, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7561, + "time_per_iteration": 2.483013153076172 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.01933646, + "balance_loss_mlp": 1.04098606, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.6472040447099916, + "language_loss": 0.84813452, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.86956006, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7562, + "time_per_iteration": 2.435842752456665 + }, + { + "auxiliary_loss_clip": 0.0111239, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02132261, + "balance_loss_mlp": 1.0416292, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.8588056575997567, + "language_loss": 0.89808047, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91954148, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7563, + "time_per_iteration": 2.4962618350982666 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.02425742, + "balance_loss_mlp": 1.03999305, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.9222778596605532, + "language_loss": 0.71644425, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73795712, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7564, + "time_per_iteration": 2.4343371391296387 + }, + { + "auxiliary_loss_clip": 0.01115348, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.01898563, + "balance_loss_mlp": 1.04060352, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.0985180699884496, + "language_loss": 0.67973971, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70120943, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7565, + "time_per_iteration": 2.5114333629608154 + }, + { + "auxiliary_loss_clip": 0.0111081, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01651037, + "balance_loss_mlp": 1.03948641, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.555148092913002, + "language_loss": 0.80112624, + "learning_rate": 2.386813887534922e-06, + "loss": 0.8225264, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7566, + "time_per_iteration": 2.4678473472595215 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.01451695, + "balance_loss_mlp": 1.04058981, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.5438575571986708, + "language_loss": 0.73526263, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75669444, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7567, + "time_per_iteration": 2.4749765396118164 + }, + { + "auxiliary_loss_clip": 0.01117694, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.01924706, + "balance_loss_mlp": 1.04315984, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.4420173241258303, + "language_loss": 0.80870211, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83019841, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7568, + "time_per_iteration": 2.5098068714141846 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.02927494, + "balance_loss_mlp": 1.04110444, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.9046518074434846, + "language_loss": 0.79472029, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81635177, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7569, + "time_per_iteration": 2.5105931758880615 + }, + { + "auxiliary_loss_clip": 0.0111814, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01811135, + "balance_loss_mlp": 1.04233003, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3375300297611126, + "language_loss": 0.74826288, + "learning_rate": 2.385285337909412e-06, + "loss": 0.76976812, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7570, + "time_per_iteration": 2.5360968112945557 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01037907, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04281187, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.5540611030471656, + "language_loss": 0.74696088, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76847816, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7571, + "time_per_iteration": 2.5796499252319336 + }, + { + "auxiliary_loss_clip": 0.01110782, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.01708317, + "balance_loss_mlp": 1.04096079, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.522963408290285, + "language_loss": 0.81392241, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83532542, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 7572, + "time_per_iteration": 2.452230215072632 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02052081, + "balance_loss_mlp": 1.04266822, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.158291075293226, + "language_loss": 0.72932756, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75086331, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7573, + "time_per_iteration": 2.547351598739624 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02106202, + "balance_loss_mlp": 1.04362583, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.8799787689923733, + "language_loss": 0.74544156, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76700127, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7574, + "time_per_iteration": 2.512343406677246 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.01870358, + "balance_loss_mlp": 1.0413028, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.8832109226527793, + "language_loss": 0.7161721, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73765397, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7575, + "time_per_iteration": 2.516036033630371 + }, + { + "auxiliary_loss_clip": 0.01114571, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0174526, + "balance_loss_mlp": 1.04138458, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.7001526143902996, + "language_loss": 0.73163939, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75308996, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7576, + "time_per_iteration": 2.446596145629883 + }, + { + "auxiliary_loss_clip": 0.01114194, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.04252386, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.8829162969496007, + "language_loss": 0.66556787, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68706656, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 7577, + "time_per_iteration": 2.496425151824951 + }, + { + "auxiliary_loss_clip": 0.01119433, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.02655983, + "balance_loss_mlp": 1.04481244, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.905892479596231, + "language_loss": 0.74408162, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76568818, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.74609375, + "step": 7578, + "time_per_iteration": 2.4517569541931152 + }, + { + "auxiliary_loss_clip": 0.01117156, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.0432775, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.9332037742405612, + "language_loss": 0.70189863, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72338867, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7579, + "time_per_iteration": 2.5487825870513916 + }, + { + "auxiliary_loss_clip": 0.0111145, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02031469, + "balance_loss_mlp": 1.03969145, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.6152122780510265, + "language_loss": 0.78727221, + "learning_rate": 2.381462943170627e-06, + "loss": 0.8087157, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7580, + "time_per_iteration": 2.465355157852173 + }, + { + "auxiliary_loss_clip": 0.01115593, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.01463163, + "balance_loss_mlp": 1.04341292, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.4438503581091628, + "language_loss": 0.68864352, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71007979, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 7581, + "time_per_iteration": 2.6738851070404053 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01742125, + "balance_loss_mlp": 1.03975797, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.5604567804249607, + "language_loss": 0.73416924, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75558978, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7582, + "time_per_iteration": 2.5402657985687256 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01045735, + "balance_loss_clip": 1.03065467, + "balance_loss_mlp": 1.04215884, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.7600515256353326, + "language_loss": 0.72337949, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74501801, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7578125, + "step": 7583, + "time_per_iteration": 2.51399564743042 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.02300692, + "balance_loss_mlp": 1.04282498, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.711799016610791, + "language_loss": 0.72402817, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74558389, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 7584, + "time_per_iteration": 2.4907238483428955 + }, + { + "auxiliary_loss_clip": 0.01116974, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01922798, + "balance_loss_mlp": 1.04356861, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.4921764730017937, + "language_loss": 0.68272889, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70422149, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7585, + "time_per_iteration": 2.5741868019104004 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01928306, + "balance_loss_mlp": 1.04099321, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.3206982799231843, + "language_loss": 0.76102924, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78248823, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7586, + "time_per_iteration": 2.466991662979126 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01651812, + "balance_loss_mlp": 1.0406158, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.9114474136682882, + "language_loss": 0.77912259, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80052596, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71484375, + "step": 7587, + "time_per_iteration": 2.534231185913086 + }, + { + "auxiliary_loss_clip": 0.01118125, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.02616787, + "balance_loss_mlp": 1.03976679, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.2451216970422068, + "language_loss": 0.69211191, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71368635, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.78125, + "step": 7588, + "time_per_iteration": 2.4104104042053223 + }, + { + "auxiliary_loss_clip": 0.011124, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.02178395, + "balance_loss_mlp": 1.0401839, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.610626761932897, + "language_loss": 0.79335272, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81481898, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 7589, + "time_per_iteration": 2.465728998184204 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.01955092, + "balance_loss_mlp": 1.04108429, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.193606067712595, + "language_loss": 0.6227479, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64421678, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7590, + "time_per_iteration": 2.509962558746338 + }, + { + "auxiliary_loss_clip": 0.0111218, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02830625, + "balance_loss_mlp": 1.03874183, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 5.263909382371274, + "language_loss": 0.72727275, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74880284, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7591, + "time_per_iteration": 2.529491424560547 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.02413523, + "balance_loss_mlp": 1.04252648, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.0725698163141058, + "language_loss": 0.76985544, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79140294, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7592, + "time_per_iteration": 2.4446723461151123 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01604521, + "balance_loss_mlp": 1.04070461, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 1.9266503814961675, + "language_loss": 0.69611561, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71753979, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7593, + "time_per_iteration": 2.4879302978515625 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03803527, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.17790627040614, + "language_loss": 0.84199911, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86338425, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 7594, + "time_per_iteration": 2.464733362197876 + }, + { + "auxiliary_loss_clip": 0.01035796, + "auxiliary_loss_mlp": 0.00998737, + "balance_loss_clip": 0.99745506, + "balance_loss_mlp": 1.01167154, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7964417819777524, + "language_loss": 0.52721512, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54756045, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.2421875, + "step": 7595, + "time_per_iteration": 6.0974061489105225 + }, + { + "auxiliary_loss_clip": 0.01117501, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01843739, + "balance_loss_mlp": 1.04165292, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.1595430840247714, + "language_loss": 0.87448329, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89597577, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7596, + "time_per_iteration": 3.862628936767578 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.02698088, + "balance_loss_mlp": 1.03993344, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 2.2425847761174196, + "language_loss": 0.77131474, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79284477, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7597, + "time_per_iteration": 2.4821672439575195 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02122104, + "balance_loss_mlp": 1.04004443, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.7340388440754042, + "language_loss": 0.78560513, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80708742, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7598, + "time_per_iteration": 2.4350392818450928 + }, + { + "auxiliary_loss_clip": 0.01113148, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01735115, + "balance_loss_mlp": 1.04057133, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.435026889485133, + "language_loss": 0.71715307, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73857641, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7599, + "time_per_iteration": 2.5838844776153564 + }, + { + "auxiliary_loss_clip": 0.01108114, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.02238345, + "balance_loss_mlp": 1.03702497, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.734840239500452, + "language_loss": 0.69377261, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71520597, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7600, + "time_per_iteration": 2.4499921798706055 + }, + { + "auxiliary_loss_clip": 0.01112216, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.02584386, + "balance_loss_mlp": 1.03979039, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.9688741418230387, + "language_loss": 0.78654951, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80805302, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7601, + "time_per_iteration": 2.555522918701172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02500272, + "balance_loss_mlp": 1.04013097, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.706657696767707, + "language_loss": 0.71609282, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73760259, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73828125, + "step": 7602, + "time_per_iteration": 2.6383092403411865 + }, + { + "auxiliary_loss_clip": 0.01112609, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.01735842, + "balance_loss_mlp": 1.03901231, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.778856324344474, + "language_loss": 0.72776276, + "learning_rate": 2.372665969608729e-06, + "loss": 0.7492069, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7603, + "time_per_iteration": 2.566542387008667 + }, + { + "auxiliary_loss_clip": 0.01113258, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.02284837, + "balance_loss_mlp": 1.03945732, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.783042546573846, + "language_loss": 0.83495164, + "learning_rate": 2.372283321642383e-06, + "loss": 0.8564586, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 7604, + "time_per_iteration": 2.4322941303253174 + }, + { + "auxiliary_loss_clip": 0.0112315, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04472041, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.742561007105776, + "language_loss": 0.85827744, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87986767, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 7605, + "time_per_iteration": 2.495654582977295 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.04045463, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.9150435252301277, + "language_loss": 0.73814523, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75966263, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7606, + "time_per_iteration": 2.472698926925659 + }, + { + "auxiliary_loss_clip": 0.01115234, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02525389, + "balance_loss_mlp": 1.03985333, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 4.395321075422478, + "language_loss": 0.7975688, + "learning_rate": 2.371135293099262e-06, + "loss": 0.81911278, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7607, + "time_per_iteration": 2.500666618347168 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.0436604, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 2.5876510188713437, + "language_loss": 0.80827034, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.82987565, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7608, + "time_per_iteration": 2.454738140106201 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.02624631, + "balance_loss_mlp": 1.03830588, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.6879461416077837, + "language_loss": 0.68500757, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70654094, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7609, + "time_per_iteration": 2.567387580871582 + }, + { + "auxiliary_loss_clip": 0.01113281, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02609158, + "balance_loss_mlp": 1.03981042, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.861126687806453, + "language_loss": 0.80749559, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82902324, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7610, + "time_per_iteration": 2.5181450843811035 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.02122259, + "balance_loss_mlp": 1.04017019, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 1.991436967054915, + "language_loss": 0.82063943, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84214383, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7611, + "time_per_iteration": 2.5181667804718018 + }, + { + "auxiliary_loss_clip": 0.01117824, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.01684475, + "balance_loss_mlp": 1.04256463, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.7999257820591783, + "language_loss": 0.74032104, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76180184, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7612, + "time_per_iteration": 2.573192596435547 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.01775634, + "balance_loss_mlp": 1.03739977, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.4998899682115554, + "language_loss": 0.84958243, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87100732, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7613, + "time_per_iteration": 2.519374132156372 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01548696, + "balance_loss_mlp": 1.04007339, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.119092433129462, + "language_loss": 0.75686407, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77829111, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7614, + "time_per_iteration": 2.435258388519287 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01870763, + "balance_loss_mlp": 1.03973377, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.4729553038511707, + "language_loss": 0.74797261, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76940382, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7615, + "time_per_iteration": 2.4776275157928467 + }, + { + "auxiliary_loss_clip": 0.01037994, + "auxiliary_loss_mlp": 0.00999141, + "balance_loss_clip": 0.99766314, + "balance_loss_mlp": 1.01355577, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7822572530544061, + "language_loss": 0.57660586, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59697717, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.24414062, + "step": 7616, + "time_per_iteration": 2.9986298084259033 + }, + { + "auxiliary_loss_clip": 0.01111756, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0251503, + "balance_loss_mlp": 1.03939307, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.5412759634284317, + "language_loss": 0.70953274, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73103696, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 7617, + "time_per_iteration": 2.514575958251953 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.04211044, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.1003257335678245, + "language_loss": 0.76458549, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78607446, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7618, + "time_per_iteration": 2.431196689605713 + }, + { + "auxiliary_loss_clip": 0.01118549, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.0323689, + "balance_loss_mlp": 1.0429455, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.7069120237831286, + "language_loss": 0.76705682, + "learning_rate": 2.366541916231585e-06, + "loss": 0.788692, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7619, + "time_per_iteration": 2.491133213043213 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02378964, + "balance_loss_mlp": 1.04174709, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.9887034550999254, + "language_loss": 0.7175532, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73904121, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7620, + "time_per_iteration": 2.429659366607666 + }, + { + "auxiliary_loss_clip": 0.0111009, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.01643384, + "balance_loss_mlp": 1.03828478, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 2.3637648648526035, + "language_loss": 0.78374821, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80513632, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 7621, + "time_per_iteration": 2.69990611076355 + }, + { + "auxiliary_loss_clip": 0.01037733, + "auxiliary_loss_mlp": 0.01001998, + "balance_loss_clip": 1.00071096, + "balance_loss_mlp": 1.01315987, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7958411378428579, + "language_loss": 0.6499809, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67037821, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.24609375, + "step": 7622, + "time_per_iteration": 3.0476205348968506 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01667762, + "balance_loss_mlp": 1.04142582, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.9256202714320767, + "language_loss": 0.79611146, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81755722, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7623, + "time_per_iteration": 2.547234535217285 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.02382421, + "balance_loss_mlp": 1.04050457, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.996922752989922, + "language_loss": 0.70809233, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72962081, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75390625, + "step": 7624, + "time_per_iteration": 2.442575693130493 + }, + { + "auxiliary_loss_clip": 0.01113872, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.01944637, + "balance_loss_mlp": 1.0383656, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.876738245253823, + "language_loss": 0.7299192, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75138104, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7625, + "time_per_iteration": 2.53002667427063 + }, + { + "auxiliary_loss_clip": 0.01116016, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.02058113, + "balance_loss_mlp": 1.04226136, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 3.1470354950748716, + "language_loss": 0.78132713, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80281818, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7626, + "time_per_iteration": 2.4544708728790283 + }, + { + "auxiliary_loss_clip": 0.01117004, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02270126, + "balance_loss_mlp": 1.04142714, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.604401840334718, + "language_loss": 0.85191864, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87344688, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7627, + "time_per_iteration": 2.478867769241333 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.02293992, + "balance_loss_mlp": 1.04074025, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.506714204397822, + "language_loss": 0.69413865, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71568, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7628, + "time_per_iteration": 2.5127782821655273 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01654255, + "balance_loss_mlp": 1.04060626, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.5379008002675938, + "language_loss": 0.78294545, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.8043794, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7629, + "time_per_iteration": 2.4944000244140625 + }, + { + "auxiliary_loss_clip": 0.0111907, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.04031289, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.0009780664883223, + "language_loss": 0.79405141, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81563896, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 7630, + "time_per_iteration": 2.443598747253418 + }, + { + "auxiliary_loss_clip": 0.0111732, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.02108812, + "balance_loss_mlp": 1.03952336, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.67887072973593, + "language_loss": 0.71819407, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.73971653, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7631, + "time_per_iteration": 2.613935708999634 + }, + { + "auxiliary_loss_clip": 0.01118321, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02248299, + "balance_loss_mlp": 1.04306722, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.655938907200588, + "language_loss": 0.71337265, + "learning_rate": 2.361563500108531e-06, + "loss": 0.7349205, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7632, + "time_per_iteration": 2.4854414463043213 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.0190748, + "balance_loss_mlp": 1.04055059, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 15.51679170955813, + "language_loss": 0.69212449, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71364582, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7633, + "time_per_iteration": 2.488741874694824 + }, + { + "auxiliary_loss_clip": 0.01115341, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02366996, + "balance_loss_mlp": 1.04068875, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.4724338826500494, + "language_loss": 0.80777454, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82929468, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.74609375, + "step": 7634, + "time_per_iteration": 2.4676551818847656 + }, + { + "auxiliary_loss_clip": 0.01118954, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.0188632, + "balance_loss_mlp": 1.04032791, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.9575518559569576, + "language_loss": 0.81853092, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84005594, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 7635, + "time_per_iteration": 2.513383150100708 + }, + { + "auxiliary_loss_clip": 0.01112964, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.02696204, + "balance_loss_mlp": 1.04045606, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.4265799385965707, + "language_loss": 0.64948833, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67101824, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7636, + "time_per_iteration": 4.062237501144409 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01690328, + "balance_loss_mlp": 1.04186797, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4568741521374282, + "language_loss": 0.80726147, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82869971, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7637, + "time_per_iteration": 4.017204999923706 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.02297974, + "balance_loss_mlp": 1.0438447, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.56098785708404, + "language_loss": 0.75311542, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77469212, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7638, + "time_per_iteration": 2.4801623821258545 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.02054262, + "balance_loss_mlp": 1.04093051, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.6757486640396035, + "language_loss": 0.74225289, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76372278, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7639, + "time_per_iteration": 2.457977294921875 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.02073193, + "balance_loss_mlp": 1.0410862, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 2.7996676169839856, + "language_loss": 0.68441081, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70591819, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7640, + "time_per_iteration": 2.4815306663513184 + }, + { + "auxiliary_loss_clip": 0.01116242, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.02532363, + "balance_loss_mlp": 1.03950286, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 4.694339799219563, + "language_loss": 0.75290608, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77446091, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7641, + "time_per_iteration": 2.4738545417785645 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.0180217, + "balance_loss_mlp": 1.041008, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7266679695779108, + "language_loss": 0.74649787, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76798791, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7642, + "time_per_iteration": 2.474160671234131 + }, + { + "auxiliary_loss_clip": 0.01036998, + "auxiliary_loss_mlp": 0.00999788, + "balance_loss_clip": 0.99855977, + "balance_loss_mlp": 1.01273584, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8383581259748949, + "language_loss": 0.58191991, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60228777, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.2421875, + "step": 7643, + "time_per_iteration": 2.810622453689575 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.02267814, + "balance_loss_mlp": 1.03810704, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5583198955297553, + "language_loss": 0.92945647, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95100462, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 7644, + "time_per_iteration": 2.4740004539489746 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.02458835, + "balance_loss_mlp": 1.04016256, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 1.923875093759249, + "language_loss": 0.8283661, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.8499139, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7645, + "time_per_iteration": 2.459575891494751 + }, + { + "auxiliary_loss_clip": 0.01035246, + "auxiliary_loss_mlp": 0.00999372, + "balance_loss_clip": 0.99805516, + "balance_loss_mlp": 1.0108279, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7553504929083139, + "language_loss": 0.59931064, + "learning_rate": 2.356199538526593e-06, + "loss": 0.6196568, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.24414062, + "step": 7646, + "time_per_iteration": 3.0040318965911865 + }, + { + "auxiliary_loss_clip": 0.01116678, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.01953018, + "balance_loss_mlp": 1.04043436, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.6094604606837348, + "language_loss": 0.72804034, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74953508, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7647, + "time_per_iteration": 2.539550304412842 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02135134, + "balance_loss_mlp": 1.03845108, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.5906503149252664, + "language_loss": 0.66864169, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69013917, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7648, + "time_per_iteration": 2.538694143295288 + }, + { + "auxiliary_loss_clip": 0.01112764, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.01969171, + "balance_loss_mlp": 1.03751159, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4797855079557312, + "language_loss": 0.78785735, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80931914, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 7649, + "time_per_iteration": 2.5164248943328857 + }, + { + "auxiliary_loss_clip": 0.01113076, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.02412558, + "balance_loss_mlp": 1.03840113, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 3.1550947466117303, + "language_loss": 0.69324255, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.7147451, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7650, + "time_per_iteration": 2.5182442665100098 + }, + { + "auxiliary_loss_clip": 0.01118739, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.0245893, + "balance_loss_mlp": 1.03925538, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 1.968615763904363, + "language_loss": 0.83896518, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86054754, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 7651, + "time_per_iteration": 2.4545249938964844 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.01685548, + "balance_loss_mlp": 1.04122114, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.1703456469435944, + "language_loss": 0.75375223, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77519977, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7652, + "time_per_iteration": 2.4435648918151855 + }, + { + "auxiliary_loss_clip": 0.01113746, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01611495, + "balance_loss_mlp": 1.03735042, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.8091521205399639, + "language_loss": 0.75805604, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.77949333, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7653, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.02338028, + "balance_loss_mlp": 1.04202819, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.3598469293633584, + "language_loss": 0.6584686, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68007028, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 7654, + "time_per_iteration": 2.3942883014678955 + }, + { + "auxiliary_loss_clip": 0.01112793, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.02026534, + "balance_loss_mlp": 1.0375098, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.647085409720671, + "language_loss": 0.79088843, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81235307, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7655, + "time_per_iteration": 2.5213396549224854 + }, + { + "auxiliary_loss_clip": 0.01110004, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01755643, + "balance_loss_mlp": 1.03802609, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 2.0582079675710134, + "language_loss": 0.67502171, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69642866, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7656, + "time_per_iteration": 2.4714531898498535 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01965153, + "balance_loss_mlp": 1.03784871, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.7896797448491664, + "language_loss": 0.81050038, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83195299, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7657, + "time_per_iteration": 2.549114227294922 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.01767325, + "balance_loss_mlp": 1.03843951, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.212167065380131, + "language_loss": 0.70071685, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72216856, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7658, + "time_per_iteration": 2.4548964500427246 + }, + { + "auxiliary_loss_clip": 0.0103337, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00924027, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9542906494873047, + "language_loss": 0.62159562, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64195925, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.2421875, + "step": 7659, + "time_per_iteration": 3.194460153579712 + }, + { + "auxiliary_loss_clip": 0.01114248, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.01774633, + "balance_loss_mlp": 1.04089022, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 2.0710979138047123, + "language_loss": 0.68395913, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70541239, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 7660, + "time_per_iteration": 2.5212934017181396 + }, + { + "auxiliary_loss_clip": 0.01112449, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.02767086, + "balance_loss_mlp": 1.03826356, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7599753910943126, + "language_loss": 0.76785183, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.78939056, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7661, + "time_per_iteration": 2.504199981689453 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02109385, + "balance_loss_mlp": 1.03997183, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.743819837097498, + "language_loss": 0.74565995, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.76712227, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7662, + "time_per_iteration": 2.479710817337036 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.01835489, + "balance_loss_mlp": 1.03899062, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 2.744789888238294, + "language_loss": 0.78880358, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81031454, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7663, + "time_per_iteration": 2.433105230331421 + }, + { + "auxiliary_loss_clip": 0.01110139, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.02286935, + "balance_loss_mlp": 1.03860092, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.8568277173945746, + "language_loss": 0.73164225, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75310248, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 7664, + "time_per_iteration": 2.4182069301605225 + }, + { + "auxiliary_loss_clip": 0.01114696, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02064347, + "balance_loss_mlp": 1.040645, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.6231584574242337, + "language_loss": 0.72039741, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74187809, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7665, + "time_per_iteration": 2.4458460807800293 + }, + { + "auxiliary_loss_clip": 0.01115054, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01965141, + "balance_loss_mlp": 1.03982568, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.8683756247621939, + "language_loss": 0.78134775, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80282086, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7666, + "time_per_iteration": 2.4217963218688965 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02005553, + "balance_loss_mlp": 1.03926802, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.2927592404362929, + "language_loss": 0.73972279, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76118922, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73046875, + "step": 7667, + "time_per_iteration": 2.586657762527466 + }, + { + "auxiliary_loss_clip": 0.0111122, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01803541, + "balance_loss_mlp": 1.03743756, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.3923437909363505, + "language_loss": 0.75857067, + "learning_rate": 2.347765122572676e-06, + "loss": 0.77998888, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7668, + "time_per_iteration": 2.456688642501831 + }, + { + "auxiliary_loss_clip": 0.01112338, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01699305, + "balance_loss_mlp": 1.04143405, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 2.015120719246451, + "language_loss": 0.77794099, + "learning_rate": 2.347381587204975e-06, + "loss": 0.79935884, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 7669, + "time_per_iteration": 2.503912925720215 + }, + { + "auxiliary_loss_clip": 0.01112792, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01688588, + "balance_loss_mlp": 1.03798747, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.8162494299938103, + "language_loss": 0.82330608, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84473014, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7670, + "time_per_iteration": 2.481456995010376 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01781416, + "balance_loss_mlp": 1.03845906, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6076372414606255, + "language_loss": 0.63204038, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6534636, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7671, + "time_per_iteration": 2.4743082523345947 + }, + { + "auxiliary_loss_clip": 0.01034608, + "auxiliary_loss_mlp": 0.01007042, + "balance_loss_clip": 1.00571287, + "balance_loss_mlp": 1.01008546, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6877278401983052, + "language_loss": 0.55879581, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57921231, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24609375, + "step": 7672, + "time_per_iteration": 3.15800142288208 + }, + { + "auxiliary_loss_clip": 0.0111558, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.02255249, + "balance_loss_mlp": 1.04003441, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8329231831015789, + "language_loss": 0.70920408, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73071891, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7673, + "time_per_iteration": 2.4639430046081543 + }, + { + "auxiliary_loss_clip": 0.01112366, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.02145457, + "balance_loss_mlp": 1.04083312, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.6780898708072003, + "language_loss": 0.70402145, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72548711, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7674, + "time_per_iteration": 2.5660369396209717 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02384138, + "balance_loss_mlp": 1.03684926, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.5790047103218752, + "language_loss": 0.65408182, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67557311, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7675, + "time_per_iteration": 2.616771697998047 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.010075, + "balance_loss_clip": 1.00611675, + "balance_loss_mlp": 1.01053035, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7425701763607123, + "language_loss": 0.58600932, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60643393, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24511719, + "step": 7676, + "time_per_iteration": 3.09281325340271 + }, + { + "auxiliary_loss_clip": 0.01034023, + "auxiliary_loss_mlp": 0.01002968, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00993788, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7891273111868267, + "language_loss": 0.62684548, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64721537, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.24121094, + "step": 7677, + "time_per_iteration": 2.9087297916412354 + }, + { + "auxiliary_loss_clip": 0.01112185, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.018502, + "balance_loss_mlp": 1.03929043, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.8566258545012464, + "language_loss": 0.76442772, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78586149, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7678, + "time_per_iteration": 3.80979061126709 + }, + { + "auxiliary_loss_clip": 0.01115647, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.02148438, + "balance_loss_mlp": 1.04122365, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.9875640695173902, + "language_loss": 0.66738796, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68889523, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 7679, + "time_per_iteration": 5.473088502883911 + }, + { + "auxiliary_loss_clip": 0.01112323, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.03913581, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.9247599304086902, + "language_loss": 0.69658661, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71809065, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 7680, + "time_per_iteration": 2.435971736907959 + }, + { + "auxiliary_loss_clip": 0.01121586, + "auxiliary_loss_mlp": 0.01041647, + "balance_loss_clip": 1.02805138, + "balance_loss_mlp": 1.04467559, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 3.979685754880411, + "language_loss": 0.63813865, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65977097, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7681, + "time_per_iteration": 2.486614942550659 + }, + { + "auxiliary_loss_clip": 0.01111536, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.01790738, + "balance_loss_mlp": 1.03925776, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.518283771877835, + "language_loss": 0.66871607, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69013125, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7682, + "time_per_iteration": 2.434720516204834 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02353811, + "balance_loss_mlp": 1.03967464, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.2113144827233397, + "language_loss": 0.74337292, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76488769, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7683, + "time_per_iteration": 2.532867908477783 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02046251, + "balance_loss_mlp": 1.04082799, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7237723920320163, + "language_loss": 0.76637614, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78784502, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7684, + "time_per_iteration": 2.4763615131378174 + }, + { + "auxiliary_loss_clip": 0.01121747, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.02557588, + "balance_loss_mlp": 1.04270399, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 2.012138726469413, + "language_loss": 0.80012244, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82173628, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7685, + "time_per_iteration": 2.467780113220215 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.02434742, + "balance_loss_mlp": 1.04206526, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 2.0493507584177424, + "language_loss": 0.66546774, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68698829, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 7686, + "time_per_iteration": 2.5675110816955566 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.01730859, + "balance_loss_mlp": 1.03924084, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.0396518023333243, + "language_loss": 0.73831183, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75978148, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7687, + "time_per_iteration": 2.5077569484710693 + }, + { + "auxiliary_loss_clip": 0.01113947, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01686668, + "balance_loss_mlp": 1.04119587, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1950912061668784, + "language_loss": 0.74758142, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76902628, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 7688, + "time_per_iteration": 2.4487764835357666 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.0214963, + "balance_loss_mlp": 1.03912246, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.6667608580722473, + "language_loss": 0.78718561, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80867392, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 7689, + "time_per_iteration": 2.504210948944092 + }, + { + "auxiliary_loss_clip": 0.01118414, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.02561891, + "balance_loss_mlp": 1.04086494, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 3.5840156670541448, + "language_loss": 0.56649667, + "learning_rate": 2.339324323980964e-06, + "loss": 0.58808374, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7690, + "time_per_iteration": 2.4970550537109375 + }, + { + "auxiliary_loss_clip": 0.01113577, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02076888, + "balance_loss_mlp": 1.03844917, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.2671044925643202, + "language_loss": 0.82513797, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84662223, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7691, + "time_per_iteration": 2.4712584018707275 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01502824, + "balance_loss_mlp": 1.04124403, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.513473472081282, + "language_loss": 0.75326777, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77470076, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7692, + "time_per_iteration": 2.462574005126953 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.0225668, + "balance_loss_mlp": 1.04110909, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 4.10345040195295, + "language_loss": 0.74055338, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76209086, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7693, + "time_per_iteration": 2.578394889831543 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02170396, + "balance_loss_mlp": 1.04132485, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5049695528407014, + "language_loss": 0.85576218, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87726343, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7694, + "time_per_iteration": 2.447938919067383 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02379656, + "balance_loss_mlp": 1.04131126, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.103971064334481, + "language_loss": 0.78631961, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80785489, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7695, + "time_per_iteration": 2.510712146759033 + }, + { + "auxiliary_loss_clip": 0.01110008, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.021294, + "balance_loss_mlp": 1.0382899, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7164209999926379, + "language_loss": 0.72215033, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74359202, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7696, + "time_per_iteration": 2.427879571914673 + }, + { + "auxiliary_loss_clip": 0.01116967, + "auxiliary_loss_mlp": 0.01044874, + "balance_loss_clip": 1.03040195, + "balance_loss_mlp": 1.04200339, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.7618442658513396, + "language_loss": 0.69068033, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71229875, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75, + "step": 7697, + "time_per_iteration": 2.4759252071380615 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.0421176, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.7059169761391482, + "language_loss": 0.84603721, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.8674916, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7698, + "time_per_iteration": 2.4416439533233643 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02094316, + "balance_loss_mlp": 1.04008198, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 2.2131790671554894, + "language_loss": 0.71495068, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73643124, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7699, + "time_per_iteration": 2.477674722671509 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.02183843, + "balance_loss_mlp": 1.03854418, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.667240614809052, + "language_loss": 0.7189334, + "learning_rate": 2.335485529281996e-06, + "loss": 0.7404505, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7700, + "time_per_iteration": 2.4664909839630127 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.0229491, + "balance_loss_mlp": 1.04012191, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 1.9820544405348388, + "language_loss": 0.7245025, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74600095, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7701, + "time_per_iteration": 2.4769680500030518 + }, + { + "auxiliary_loss_clip": 0.01117689, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.02129054, + "balance_loss_mlp": 1.04037929, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 1.837243395087381, + "language_loss": 0.64583158, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.66735995, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7702, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753259, + "balance_loss_mlp": 1.03832746, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.912512853345874, + "language_loss": 0.73265111, + "learning_rate": 2.33433364213785e-06, + "loss": 0.7540592, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7703, + "time_per_iteration": 2.482374429702759 + }, + { + "auxiliary_loss_clip": 0.01119217, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01882708, + "balance_loss_mlp": 1.04163849, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.555397834218836, + "language_loss": 0.68780202, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70932484, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7704, + "time_per_iteration": 2.4661428928375244 + }, + { + "auxiliary_loss_clip": 0.01118717, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.01313281, + "balance_loss_mlp": 1.04138649, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 4.360671756910266, + "language_loss": 0.80963224, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83109009, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7705, + "time_per_iteration": 2.5129587650299072 + }, + { + "auxiliary_loss_clip": 0.01116357, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.01863885, + "balance_loss_mlp": 1.03983259, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.6860050062378817, + "language_loss": 0.77783883, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79932249, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7706, + "time_per_iteration": 2.4212512969970703 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01676846, + "balance_loss_mlp": 1.03858304, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.9896841653009631, + "language_loss": 0.69805431, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.71944684, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7707, + "time_per_iteration": 2.452716112136841 + }, + { + "auxiliary_loss_clip": 0.0111828, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.02268386, + "balance_loss_mlp": 1.03958869, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.9384057680294333, + "language_loss": 0.61103344, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63259125, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 7708, + "time_per_iteration": 2.567363739013672 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 1.0407182, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 1.9580912850569934, + "language_loss": 0.77165091, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.7931354, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7709, + "time_per_iteration": 2.532893657684326 + }, + { + "auxiliary_loss_clip": 0.01120131, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.02199614, + "balance_loss_mlp": 1.04260027, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.8889269845152723, + "language_loss": 0.76972783, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79129058, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7710, + "time_per_iteration": 2.4608266353607178 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.01812005, + "balance_loss_mlp": 1.04201198, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 8.865430766980356, + "language_loss": 0.73548961, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75701332, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 7711, + "time_per_iteration": 2.4964261054992676 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02818859, + "balance_loss_mlp": 1.04039836, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.6554647385393604, + "language_loss": 0.71667624, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73825449, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.74609375, + "step": 7712, + "time_per_iteration": 2.46760630607605 + }, + { + "auxiliary_loss_clip": 0.01121722, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02325535, + "balance_loss_mlp": 1.04231286, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 3.3767356374822053, + "language_loss": 0.72924775, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.7508505, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 7713, + "time_per_iteration": 2.501405954360962 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.0192256, + "balance_loss_mlp": 1.0397234, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.980318346106041, + "language_loss": 0.58787149, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60938716, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7714, + "time_per_iteration": 2.495403528213501 + }, + { + "auxiliary_loss_clip": 0.01113059, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.01873016, + "balance_loss_mlp": 1.03932118, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.071541116221401, + "language_loss": 0.70241058, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72386181, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7715, + "time_per_iteration": 2.4438905715942383 + }, + { + "auxiliary_loss_clip": 0.01120226, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.0211767, + "balance_loss_mlp": 1.04094183, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 2.6792778299233775, + "language_loss": 0.67974752, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70129347, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 7716, + "time_per_iteration": 2.4544179439544678 + }, + { + "auxiliary_loss_clip": 0.01119502, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.01913667, + "balance_loss_mlp": 1.04161263, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7705358267642153, + "language_loss": 0.81100738, + "learning_rate": 2.328956666474691e-06, + "loss": 0.8325364, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7717, + "time_per_iteration": 2.491530179977417 + }, + { + "auxiliary_loss_clip": 0.0111535, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.01868117, + "balance_loss_mlp": 1.04001844, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.8289041555667496, + "language_loss": 0.73165905, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75313652, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7718, + "time_per_iteration": 2.4480137825012207 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.02355695, + "balance_loss_mlp": 1.03966463, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.5484606356008148, + "language_loss": 0.70390046, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72542012, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7719, + "time_per_iteration": 2.565831422805786 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.02523875, + "balance_loss_mlp": 1.0433172, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.6620583446293502, + "language_loss": 0.86685133, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88845801, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7720, + "time_per_iteration": 5.243311166763306 + }, + { + "auxiliary_loss_clip": 0.01036993, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 0.99992698, + "balance_loss_mlp": 1.01241243, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7219170830729655, + "language_loss": 0.55086505, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57124853, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24609375, + "step": 7721, + "time_per_iteration": 4.553914785385132 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.02322233, + "balance_loss_mlp": 1.041767, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.566766868002949, + "language_loss": 0.79665279, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81818902, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7722, + "time_per_iteration": 2.445401430130005 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.01957417, + "balance_loss_mlp": 1.04163325, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5891837623192666, + "language_loss": 0.77772748, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79924428, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7723, + "time_per_iteration": 2.4992403984069824 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01748788, + "balance_loss_mlp": 1.03973961, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.5026814907271808, + "language_loss": 0.68433344, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70576787, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7724, + "time_per_iteration": 2.496286630630493 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.02032912, + "balance_loss_mlp": 1.03761983, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.246547977212262, + "language_loss": 0.67335129, + "learning_rate": 2.325883008671415e-06, + "loss": 0.6948117, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7725, + "time_per_iteration": 2.471104621887207 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.02523649, + "balance_loss_mlp": 1.03763461, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.6153664866621378, + "language_loss": 0.64700842, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.66846681, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7726, + "time_per_iteration": 2.5408668518066406 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.01916456, + "balance_loss_mlp": 1.04313767, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.8244750339479887, + "language_loss": 0.74908936, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77058876, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7727, + "time_per_iteration": 2.4853005409240723 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.03968906, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.0019169498028657, + "language_loss": 0.78683269, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80834055, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7728, + "time_per_iteration": 2.5397188663482666 + }, + { + "auxiliary_loss_clip": 0.0111559, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.02303171, + "balance_loss_mlp": 1.0405283, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 2.3286376832796343, + "language_loss": 0.76053888, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78206384, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7729, + "time_per_iteration": 2.4818129539489746 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.02543473, + "balance_loss_mlp": 1.04205704, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.578112141950269, + "language_loss": 0.79568058, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81722724, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7730, + "time_per_iteration": 2.5124597549438477 + }, + { + "auxiliary_loss_clip": 0.01113512, + "auxiliary_loss_mlp": 0.01037643, + "balance_loss_clip": 1.0245595, + "balance_loss_mlp": 1.03948402, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.5075999703309564, + "language_loss": 0.76621842, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.78772998, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7731, + "time_per_iteration": 2.4976460933685303 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.01798737, + "balance_loss_mlp": 1.0393635, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.7163179847514425, + "language_loss": 0.65824252, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67968166, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 7732, + "time_per_iteration": 2.5720393657684326 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02186668, + "balance_loss_mlp": 1.03906608, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.6101927282287454, + "language_loss": 0.72711408, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74864757, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7733, + "time_per_iteration": 2.4926271438598633 + }, + { + "auxiliary_loss_clip": 0.01036248, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.00131154, + "balance_loss_mlp": 1.01211238, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2459739814545432, + "language_loss": 0.51962316, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54001307, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2421875, + "step": 7734, + "time_per_iteration": 3.0107176303863525 + }, + { + "auxiliary_loss_clip": 0.01113986, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02194381, + "balance_loss_mlp": 1.04043412, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.036607770310226, + "language_loss": 0.75633866, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77783275, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7735, + "time_per_iteration": 2.487781286239624 + }, + { + "auxiliary_loss_clip": 0.01111506, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.02682567, + "balance_loss_mlp": 1.03985715, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 2.402877095125316, + "language_loss": 0.70207214, + "learning_rate": 2.321655439354519e-06, + "loss": 0.7235899, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7736, + "time_per_iteration": 2.4449374675750732 + }, + { + "auxiliary_loss_clip": 0.0111302, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.0199604, + "balance_loss_mlp": 1.04052627, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6375102922586726, + "language_loss": 0.72185129, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74330497, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7737, + "time_per_iteration": 2.494582176208496 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02450085, + "balance_loss_mlp": 1.04341006, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.6166748549663605, + "language_loss": 0.83362406, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85520089, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7738, + "time_per_iteration": 2.427828550338745 + }, + { + "auxiliary_loss_clip": 0.01037214, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.0132978, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7680630195464891, + "language_loss": 0.57788324, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59828281, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.24023438, + "step": 7739, + "time_per_iteration": 3.133042335510254 + }, + { + "auxiliary_loss_clip": 0.01113786, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.02543104, + "balance_loss_mlp": 1.03974605, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.823827375035505, + "language_loss": 0.8481009, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.86962008, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7740, + "time_per_iteration": 2.4921228885650635 + }, + { + "auxiliary_loss_clip": 0.0111501, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.04139423, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.5033977780241194, + "language_loss": 0.76110768, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.7826345, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 7741, + "time_per_iteration": 2.4922451972961426 + }, + { + "auxiliary_loss_clip": 0.01117905, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01915359, + "balance_loss_mlp": 1.0404247, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.7276921705055903, + "language_loss": 0.80555934, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82706094, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7742, + "time_per_iteration": 2.4906904697418213 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.02518523, + "balance_loss_mlp": 1.04049921, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.9912151117228205, + "language_loss": 0.72541988, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74698091, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7743, + "time_per_iteration": 2.4746901988983154 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01745892, + "balance_loss_mlp": 1.0409807, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.076205829431248, + "language_loss": 0.71137214, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73282433, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7744, + "time_per_iteration": 2.4928057193756104 + }, + { + "auxiliary_loss_clip": 0.01112536, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.02108455, + "balance_loss_mlp": 1.04053736, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5849641227794893, + "language_loss": 0.85084593, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87230361, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7745, + "time_per_iteration": 2.574612617492676 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02561605, + "balance_loss_mlp": 1.04127502, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.35434162506916, + "language_loss": 0.73171556, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75323439, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71484375, + "step": 7746, + "time_per_iteration": 2.5375149250030518 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02271867, + "balance_loss_mlp": 1.04081106, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.5285629366651527, + "language_loss": 0.6993416, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72082222, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7747, + "time_per_iteration": 2.792043685913086 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.01872873, + "balance_loss_mlp": 1.03958046, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.4175797777041124, + "language_loss": 0.67509431, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69653738, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7748, + "time_per_iteration": 2.625060796737671 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.02341771, + "balance_loss_mlp": 1.04018533, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.247229042591788, + "language_loss": 0.63667625, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.65823585, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 7749, + "time_per_iteration": 2.4132370948791504 + }, + { + "auxiliary_loss_clip": 0.01117494, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01702619, + "balance_loss_mlp": 1.04231274, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.928439488128299, + "language_loss": 0.74594498, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76742983, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7750, + "time_per_iteration": 2.494771718978882 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01701522, + "balance_loss_mlp": 1.0404911, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.044073047720548, + "language_loss": 0.7496438, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.77110994, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7751, + "time_per_iteration": 2.5510993003845215 + }, + { + "auxiliary_loss_clip": 0.01118875, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.01816297, + "balance_loss_mlp": 1.04188776, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.8775850665267624, + "language_loss": 0.73678327, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.7582916, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7752, + "time_per_iteration": 2.5834901332855225 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02401483, + "balance_loss_mlp": 1.041453, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.485236836866318, + "language_loss": 0.69320381, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71476793, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 7753, + "time_per_iteration": 2.522881507873535 + }, + { + "auxiliary_loss_clip": 0.01111836, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.01862359, + "balance_loss_mlp": 1.04056942, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8174540980864333, + "language_loss": 0.72607052, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74750698, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7754, + "time_per_iteration": 2.5403332710266113 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01410365, + "balance_loss_mlp": 1.04032147, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.501284890447191, + "language_loss": 0.78961611, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81104231, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7755, + "time_per_iteration": 2.4917664527893066 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.01675534, + "balance_loss_mlp": 1.03968203, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.6390600579035761, + "language_loss": 0.72281897, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74421859, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7756, + "time_per_iteration": 2.549678325653076 + }, + { + "auxiliary_loss_clip": 0.01111703, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01770794, + "balance_loss_mlp": 1.03845477, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.8004000990726714, + "language_loss": 0.78193069, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80335552, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7757, + "time_per_iteration": 2.483161687850952 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01855707, + "balance_loss_mlp": 1.04131472, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 2.024129481036371, + "language_loss": 0.66473371, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68618673, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.734375, + "step": 7758, + "time_per_iteration": 2.5083394050598145 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.0243423, + "balance_loss_mlp": 1.04062152, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.603488256474455, + "language_loss": 0.74207008, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76358092, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7759, + "time_per_iteration": 2.424461841583252 + }, + { + "auxiliary_loss_clip": 0.01113311, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.04054224, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.4805046968385447, + "language_loss": 0.77701056, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79848123, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7760, + "time_per_iteration": 2.5147666931152344 + }, + { + "auxiliary_loss_clip": 0.01109461, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.01549125, + "balance_loss_mlp": 1.03895068, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6623756387577715, + "language_loss": 0.74081796, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76219893, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7761, + "time_per_iteration": 3.816096305847168 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.01743007, + "balance_loss_mlp": 1.040905, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.9521312394592187, + "language_loss": 0.78150368, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80299413, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 7762, + "time_per_iteration": 5.593664169311523 + }, + { + "auxiliary_loss_clip": 0.01036542, + "auxiliary_loss_mlp": 0.01002344, + "balance_loss_clip": 1.00103235, + "balance_loss_mlp": 1.0128268, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7996147947039336, + "language_loss": 0.59759605, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61798495, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.23828125, + "step": 7763, + "time_per_iteration": 4.692638874053955 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.02139115, + "balance_loss_mlp": 1.03950739, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.0939196550691075, + "language_loss": 0.78502893, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80654544, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 7764, + "time_per_iteration": 2.437487840652466 + }, + { + "auxiliary_loss_clip": 0.01113145, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.02385855, + "balance_loss_mlp": 1.04100394, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.8134732296760265, + "language_loss": 0.72272134, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74421084, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7765, + "time_per_iteration": 2.4413938522338867 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.02123809, + "balance_loss_mlp": 1.03898025, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.045608669049209, + "language_loss": 0.77604026, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.79752916, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7766, + "time_per_iteration": 2.4388277530670166 + }, + { + "auxiliary_loss_clip": 0.01112932, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.02094162, + "balance_loss_mlp": 1.03921056, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 1.9270773145684021, + "language_loss": 0.65106744, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67253554, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7767, + "time_per_iteration": 2.4259531497955322 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01036202, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04137385, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.832674622819915, + "language_loss": 0.74584204, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76735973, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7768, + "time_per_iteration": 2.5001304149627686 + }, + { + "auxiliary_loss_clip": 0.01114611, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01907098, + "balance_loss_mlp": 1.04069757, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.7275432453698176, + "language_loss": 0.70713127, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72859579, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7769, + "time_per_iteration": 2.466909408569336 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02159858, + "balance_loss_mlp": 1.03928077, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.9729575937492385, + "language_loss": 0.8121224, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83360064, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.74609375, + "step": 7770, + "time_per_iteration": 2.458648204803467 + }, + { + "auxiliary_loss_clip": 0.01036054, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00070572, + "balance_loss_mlp": 1.01253605, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7993613034211892, + "language_loss": 0.5567323, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57711124, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7771, + "time_per_iteration": 3.0888803005218506 + }, + { + "auxiliary_loss_clip": 0.01111082, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.02332425, + "balance_loss_mlp": 1.03920853, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.068311261086289, + "language_loss": 0.65702665, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67849845, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7772, + "time_per_iteration": 2.5242044925689697 + }, + { + "auxiliary_loss_clip": 0.01112309, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.0201087, + "balance_loss_mlp": 1.04012156, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8148576314480773, + "language_loss": 0.63699466, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65844226, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7773, + "time_per_iteration": 2.5828921794891357 + }, + { + "auxiliary_loss_clip": 0.01114763, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02112508, + "balance_loss_mlp": 1.04050922, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 1.942265734861076, + "language_loss": 0.79793948, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.81943017, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7774, + "time_per_iteration": 2.448124647140503 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.01732183, + "balance_loss_mlp": 1.04113531, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.627446474145158, + "language_loss": 0.77884328, + "learning_rate": 2.306655024915726e-06, + "loss": 0.80031127, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7578125, + "step": 7775, + "time_per_iteration": 2.527324676513672 + }, + { + "auxiliary_loss_clip": 0.01111153, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.01650286, + "balance_loss_mlp": 1.03931999, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.8679682194131426, + "language_loss": 0.69634461, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71774852, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7776, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01113537, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02123928, + "balance_loss_mlp": 1.04122162, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.3721760360464321, + "language_loss": 0.73558104, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75704277, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.72265625, + "step": 7777, + "time_per_iteration": 2.520732879638672 + }, + { + "auxiliary_loss_clip": 0.01113463, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.0170207, + "balance_loss_mlp": 1.04067683, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.1302386072463717, + "language_loss": 0.69626892, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71770251, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7778, + "time_per_iteration": 2.514420509338379 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.04059839, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.560538067350171, + "language_loss": 0.73252767, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75406492, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7779, + "time_per_iteration": 2.5243053436279297 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02767682, + "balance_loss_mlp": 1.04009414, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5361358548392845, + "language_loss": 0.72206026, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74357915, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7780, + "time_per_iteration": 2.462562322616577 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.02259159, + "balance_loss_mlp": 1.03972697, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.377229275085917, + "language_loss": 0.73864317, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76017153, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7781, + "time_per_iteration": 2.502406358718872 + }, + { + "auxiliary_loss_clip": 0.01117462, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.04165602, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.718665338253189, + "language_loss": 0.62727809, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.64880699, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7782, + "time_per_iteration": 2.5425686836242676 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.02660906, + "balance_loss_mlp": 1.0408988, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.7203724678454408, + "language_loss": 0.62933487, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65089834, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7783, + "time_per_iteration": 2.5380141735076904 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.02271223, + "balance_loss_mlp": 1.04462993, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.164400906730855, + "language_loss": 0.67745304, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69904399, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 7784, + "time_per_iteration": 2.4520463943481445 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03855705, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.8603472350259396, + "language_loss": 0.84720063, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.8686232, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.72265625, + "step": 7785, + "time_per_iteration": 2.459446907043457 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01639259, + "balance_loss_mlp": 1.04066038, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.0359259581468154, + "language_loss": 0.77018952, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79163527, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 7786, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01110671, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01558685, + "balance_loss_mlp": 1.0400672, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.023612965965443, + "language_loss": 0.73795342, + "learning_rate": 2.302035914315856e-06, + "loss": 0.75933665, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7787, + "time_per_iteration": 2.5224268436431885 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04109263, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.7002718084162438, + "language_loss": 0.65639925, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67792457, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7788, + "time_per_iteration": 2.534850835800171 + }, + { + "auxiliary_loss_clip": 0.01110419, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.03911507, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.9511727744147118, + "language_loss": 0.63813901, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.65954381, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.71484375, + "step": 7789, + "time_per_iteration": 2.5479812622070312 + }, + { + "auxiliary_loss_clip": 0.01036451, + "auxiliary_loss_mlp": 0.01005013, + "balance_loss_clip": 1.00388098, + "balance_loss_mlp": 1.01292431, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7071467356489777, + "language_loss": 0.61922455, + "learning_rate": 2.300880877982825e-06, + "loss": 0.6396392, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7790, + "time_per_iteration": 3.1510462760925293 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01836109, + "balance_loss_mlp": 1.04223442, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5995715197713376, + "language_loss": 0.79338831, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81482148, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7791, + "time_per_iteration": 2.5008740425109863 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.02297759, + "balance_loss_mlp": 1.04113936, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.651557239680421, + "language_loss": 0.7484895, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.76998532, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7792, + "time_per_iteration": 2.4964823722839355 + }, + { + "auxiliary_loss_clip": 0.01108357, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01953983, + "balance_loss_mlp": 1.03747678, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.7412725365893262, + "language_loss": 0.6822598, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70365626, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 7793, + "time_per_iteration": 2.5480096340179443 + }, + { + "auxiliary_loss_clip": 0.01112468, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01638055, + "balance_loss_mlp": 1.04102671, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.577590367357015, + "language_loss": 0.73983628, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76124084, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.71484375, + "step": 7794, + "time_per_iteration": 2.453190803527832 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.0227052, + "balance_loss_mlp": 1.04182243, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5518603627769951, + "language_loss": 0.63617218, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65767258, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7795, + "time_per_iteration": 2.5087008476257324 + }, + { + "auxiliary_loss_clip": 0.01108593, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.01140058, + "balance_loss_mlp": 1.03883195, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.6379638897021238, + "language_loss": 0.68002474, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70134962, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 7796, + "time_per_iteration": 2.6073970794677734 + }, + { + "auxiliary_loss_clip": 0.01110063, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01876903, + "balance_loss_mlp": 1.03811777, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.6469110962479863, + "language_loss": 0.70039898, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72181356, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7797, + "time_per_iteration": 2.5202813148498535 + }, + { + "auxiliary_loss_clip": 0.01114247, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.01626134, + "balance_loss_mlp": 1.04066193, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 5.424608495577661, + "language_loss": 0.67517138, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69661522, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7798, + "time_per_iteration": 2.425443649291992 + }, + { + "auxiliary_loss_clip": 0.01033599, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99840373, + "balance_loss_mlp": 1.00991392, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9386412030406017, + "language_loss": 0.64531696, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66565025, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23730469, + "step": 7799, + "time_per_iteration": 3.2528939247131348 + }, + { + "auxiliary_loss_clip": 0.01108747, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.01308465, + "balance_loss_mlp": 1.03731787, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4163336480228355, + "language_loss": 0.72242683, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74376553, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71484375, + "step": 7800, + "time_per_iteration": 2.481309175491333 + }, + { + "auxiliary_loss_clip": 0.01109702, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.01937377, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 2.26920520557406, + "language_loss": 0.72428536, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74568903, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6953125, + "step": 7801, + "time_per_iteration": 2.491105079650879 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.02352786, + "balance_loss_mlp": 1.04097068, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.06336431229611, + "language_loss": 0.62303418, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64457649, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7802, + "time_per_iteration": 2.419229030609131 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.02101874, + "balance_loss_mlp": 1.03946614, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.7578029510137774, + "language_loss": 0.73409998, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75556695, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73828125, + "step": 7803, + "time_per_iteration": 3.984971523284912 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.02289057, + "balance_loss_mlp": 1.0363642, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 2.1225810300999384, + "language_loss": 0.77638352, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79780972, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 7804, + "time_per_iteration": 5.432345390319824 + }, + { + "auxiliary_loss_clip": 0.01108405, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01417041, + "balance_loss_mlp": 1.03702545, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8629622532391696, + "language_loss": 0.77384996, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79520065, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7805, + "time_per_iteration": 3.873565196990967 + }, + { + "auxiliary_loss_clip": 0.01119773, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02423811, + "balance_loss_mlp": 1.04193878, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.5711850680288217, + "language_loss": 0.82902926, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85060221, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7806, + "time_per_iteration": 2.554081439971924 + }, + { + "auxiliary_loss_clip": 0.0111231, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02150404, + "balance_loss_mlp": 1.03812897, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.7011762555096541, + "language_loss": 0.77454185, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79601264, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7807, + "time_per_iteration": 2.5786170959472656 + }, + { + "auxiliary_loss_clip": 0.01112504, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.01985693, + "balance_loss_mlp": 1.03987944, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.9089254292763438, + "language_loss": 0.51788038, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53933609, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7808, + "time_per_iteration": 2.4730944633483887 + }, + { + "auxiliary_loss_clip": 0.01034297, + "auxiliary_loss_mlp": 0.01010423, + "balance_loss_clip": 1.00899839, + "balance_loss_mlp": 1.01039815, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.782722095319277, + "language_loss": 0.57725239, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59769958, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.23925781, + "step": 7809, + "time_per_iteration": 2.9356954097747803 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.0237031, + "balance_loss_mlp": 1.04176784, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.074581573353579, + "language_loss": 0.72116458, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.74269235, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.74609375, + "step": 7810, + "time_per_iteration": 2.493408679962158 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.02402329, + "balance_loss_mlp": 1.040115, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 2.1541938985336992, + "language_loss": 0.8075912, + "learning_rate": 2.29279277055369e-06, + "loss": 0.82910025, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7811, + "time_per_iteration": 2.4555575847625732 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02146435, + "balance_loss_mlp": 1.04074228, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.576643907851126, + "language_loss": 0.8039701, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82546234, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7812, + "time_per_iteration": 2.4640350341796875 + }, + { + "auxiliary_loss_clip": 0.01109494, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.0166117, + "balance_loss_mlp": 1.03902435, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.5853543039664872, + "language_loss": 0.73764664, + "learning_rate": 2.292022217117477e-06, + "loss": 0.75903195, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7813, + "time_per_iteration": 2.4320507049560547 + }, + { + "auxiliary_loss_clip": 0.01110282, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01483905, + "balance_loss_mlp": 1.03869295, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.2861298905980756, + "language_loss": 0.84540617, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86679196, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7814, + "time_per_iteration": 2.4274749755859375 + }, + { + "auxiliary_loss_clip": 0.01107762, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.02381229, + "balance_loss_mlp": 1.03796697, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.8672463737050276, + "language_loss": 0.81747186, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83891666, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 7815, + "time_per_iteration": 2.4163284301757812 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.02026868, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.4869249923010917, + "language_loss": 0.77289331, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79436171, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7816, + "time_per_iteration": 2.4678542613983154 + }, + { + "auxiliary_loss_clip": 0.01033373, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 0.9998135, + "balance_loss_mlp": 1.00933015, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8340649958424211, + "language_loss": 0.5901494, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61049724, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.24023438, + "step": 7817, + "time_per_iteration": 3.0594780445098877 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01726496, + "balance_loss_mlp": 1.03904927, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7036287613919965, + "language_loss": 0.79255462, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81393164, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7818, + "time_per_iteration": 2.5072269439697266 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.02011776, + "balance_loss_mlp": 1.03705192, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.8212678437549825, + "language_loss": 0.83521211, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85663581, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7819, + "time_per_iteration": 2.4294557571411133 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.02144313, + "balance_loss_mlp": 1.0395112, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.0332467146742457, + "language_loss": 0.75860727, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78010511, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7820, + "time_per_iteration": 2.446664333343506 + }, + { + "auxiliary_loss_clip": 0.0111083, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02034664, + "balance_loss_mlp": 1.04058981, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.889014789758207, + "language_loss": 0.73767376, + "learning_rate": 2.288939561601039e-06, + "loss": 0.75911528, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 7821, + "time_per_iteration": 2.4138526916503906 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.02792668, + "balance_loss_mlp": 1.04042852, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.6752111617055698, + "language_loss": 0.88782346, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.9093343, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 7822, + "time_per_iteration": 2.5215280055999756 + }, + { + "auxiliary_loss_clip": 0.01110261, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01693165, + "balance_loss_mlp": 1.03927922, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.5082152139738452, + "language_loss": 0.79467583, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.8160727, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7823, + "time_per_iteration": 2.4513280391693115 + }, + { + "auxiliary_loss_clip": 0.01034267, + "auxiliary_loss_mlp": 0.01003747, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.01028728, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6886986665104876, + "language_loss": 0.56664526, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.5870254, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.24023438, + "step": 7824, + "time_per_iteration": 3.1640188694000244 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.02209568, + "balance_loss_mlp": 1.03935504, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.7687808389256934, + "language_loss": 0.81284839, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83433783, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7825, + "time_per_iteration": 2.4225590229034424 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01788807, + "balance_loss_mlp": 1.04160166, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.5897626143629002, + "language_loss": 0.66397595, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68542683, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7826, + "time_per_iteration": 2.512421131134033 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02366889, + "balance_loss_mlp": 1.03788161, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 2.2414984964582354, + "language_loss": 0.83768737, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.85917771, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7827, + "time_per_iteration": 2.449002504348755 + }, + { + "auxiliary_loss_clip": 0.01034449, + "auxiliary_loss_mlp": 0.01000576, + "balance_loss_clip": 0.99914598, + "balance_loss_mlp": 1.01066613, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.821565097847141, + "language_loss": 0.55694902, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57729936, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.23828125, + "step": 7828, + "time_per_iteration": 3.0819802284240723 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01605594, + "balance_loss_mlp": 1.03884375, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.9071991460911069, + "language_loss": 0.81054831, + "learning_rate": 2.285856204861245e-06, + "loss": 0.8319242, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7829, + "time_per_iteration": 2.415055513381958 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02058768, + "balance_loss_mlp": 1.04020715, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3327561380149306, + "language_loss": 0.7576915, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.77912241, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7830, + "time_per_iteration": 2.5643560886383057 + }, + { + "auxiliary_loss_clip": 0.0111195, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.01860535, + "balance_loss_mlp": 1.04144919, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 1.972485160119179, + "language_loss": 0.78818381, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80962437, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 7831, + "time_per_iteration": 2.4193694591522217 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.02204037, + "balance_loss_mlp": 1.03843021, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.7552368254682797, + "language_loss": 0.76044565, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78195733, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 7832, + "time_per_iteration": 2.5059313774108887 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.0164752, + "balance_loss_mlp": 1.03971505, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.221217846393107, + "language_loss": 0.74499595, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76638055, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7833, + "time_per_iteration": 2.473198652267456 + }, + { + "auxiliary_loss_clip": 0.01111984, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.0222286, + "balance_loss_mlp": 1.04079628, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.540147977988576, + "language_loss": 0.7563647, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77783847, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 7834, + "time_per_iteration": 2.4742865562438965 + }, + { + "auxiliary_loss_clip": 0.01110721, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.02601528, + "balance_loss_mlp": 1.04030991, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.3686611384111311, + "language_loss": 0.66174978, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68324244, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7835, + "time_per_iteration": 2.631727933883667 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01005422, + "balance_loss_clip": 1.00418234, + "balance_loss_mlp": 1.01069164, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8728088219103824, + "language_loss": 0.62162638, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64202893, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.2421875, + "step": 7836, + "time_per_iteration": 3.0448570251464844 + }, + { + "auxiliary_loss_clip": 0.01113991, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.02254474, + "balance_loss_mlp": 1.03829992, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.5467691894783375, + "language_loss": 0.69550622, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71700549, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7837, + "time_per_iteration": 2.480307102203369 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.02703786, + "balance_loss_mlp": 1.03986847, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.8364060529940534, + "language_loss": 0.66015977, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68170524, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 7838, + "time_per_iteration": 2.461975336074829 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.01755917, + "balance_loss_mlp": 1.03790629, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.9120341376079564, + "language_loss": 0.77139461, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79282629, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7839, + "time_per_iteration": 2.4788944721221924 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.0168395, + "balance_loss_mlp": 1.03794789, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 1.9130481219619113, + "language_loss": 0.72918046, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75054491, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 7840, + "time_per_iteration": 2.495239019393921 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01623714, + "balance_loss_mlp": 1.03712356, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.5808172060169028, + "language_loss": 0.74886942, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77024251, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7841, + "time_per_iteration": 2.454484224319458 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02002275, + "balance_loss_mlp": 1.03838921, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.602853925212418, + "language_loss": 0.70333457, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72475922, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 7842, + "time_per_iteration": 2.4781782627105713 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01659727, + "balance_loss_mlp": 1.04060411, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.29732654226483, + "language_loss": 0.78893888, + "learning_rate": 2.280458665756177e-06, + "loss": 0.81034797, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7843, + "time_per_iteration": 2.4125685691833496 + }, + { + "auxiliary_loss_clip": 0.01110204, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01920795, + "balance_loss_mlp": 1.03860044, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6968163407172614, + "language_loss": 0.74375969, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76517189, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 7844, + "time_per_iteration": 3.915900230407715 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.0281688, + "balance_loss_mlp": 1.03888059, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.5835392600478553, + "language_loss": 0.78286111, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80438167, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7845, + "time_per_iteration": 3.8502118587493896 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.02098632, + "balance_loss_mlp": 1.03725934, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.4155938367608039, + "language_loss": 0.7311433, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75253546, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 7846, + "time_per_iteration": 5.374008655548096 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.03715074, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.2885600176299252, + "language_loss": 0.74075842, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76212096, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7847, + "time_per_iteration": 2.5333058834075928 + }, + { + "auxiliary_loss_clip": 0.01110234, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02229548, + "balance_loss_mlp": 1.03908157, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.6263943719256755, + "language_loss": 0.80717957, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82862496, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 7848, + "time_per_iteration": 2.408688545227051 + }, + { + "auxiliary_loss_clip": 0.01115584, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01844501, + "balance_loss_mlp": 1.04345632, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.7499376956487047, + "language_loss": 0.70086265, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72232985, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7849, + "time_per_iteration": 2.453542709350586 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02144289, + "balance_loss_mlp": 1.03961349, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.1591296324254095, + "language_loss": 0.69831544, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71981823, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 7850, + "time_per_iteration": 2.421095371246338 + }, + { + "auxiliary_loss_clip": 0.01115823, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.0175221, + "balance_loss_mlp": 1.04188704, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.815710496912415, + "language_loss": 0.75220203, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.7736643, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7851, + "time_per_iteration": 2.4666483402252197 + }, + { + "auxiliary_loss_clip": 0.01112485, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02262962, + "balance_loss_mlp": 1.03831601, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.7847776856215107, + "language_loss": 0.76165771, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78314561, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7852, + "time_per_iteration": 2.415109395980835 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.01553345, + "balance_loss_mlp": 1.04077876, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.4478461916623044, + "language_loss": 0.68933171, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71073586, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 7853, + "time_per_iteration": 2.4654150009155273 + }, + { + "auxiliary_loss_clip": 0.01033922, + "auxiliary_loss_mlp": 0.00998653, + "balance_loss_clip": 0.99743122, + "balance_loss_mlp": 1.01008511, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.6983660788322832, + "language_loss": 0.50161922, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52194494, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 7854, + "time_per_iteration": 3.190991163253784 + }, + { + "auxiliary_loss_clip": 0.0111395, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.0189656, + "balance_loss_mlp": 1.04039025, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.7794050652620443, + "language_loss": 0.63844812, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.65991443, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 7855, + "time_per_iteration": 2.503614664077759 + }, + { + "auxiliary_loss_clip": 0.01111503, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.02274394, + "balance_loss_mlp": 1.0393486, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.8062233622492851, + "language_loss": 0.75802517, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7794944, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7856, + "time_per_iteration": 2.499197244644165 + }, + { + "auxiliary_loss_clip": 0.01108332, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02270377, + "balance_loss_mlp": 1.03774405, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.914023874649731, + "language_loss": 0.7484442, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76987731, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 7857, + "time_per_iteration": 2.5192370414733887 + }, + { + "auxiliary_loss_clip": 0.01109783, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.03118157, + "balance_loss_mlp": 1.03967714, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4716352183066603, + "language_loss": 0.6482265, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66975653, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7858, + "time_per_iteration": 2.5169341564178467 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02124774, + "balance_loss_mlp": 1.03680444, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.569061056560701, + "language_loss": 0.70402861, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72544539, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 7859, + "time_per_iteration": 2.4850962162017822 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.0245533, + "balance_loss_mlp": 1.03993118, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.957216681544069, + "language_loss": 0.62261212, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64414442, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7860, + "time_per_iteration": 2.435559034347534 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01039582, + "balance_loss_clip": 1.02632594, + "balance_loss_mlp": 1.03998029, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.1159962326169097, + "language_loss": 0.71988773, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.7414242, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7861, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.02440643, + "balance_loss_mlp": 1.03970647, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8695032169355525, + "language_loss": 0.85058391, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87208509, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7862, + "time_per_iteration": 2.439347505569458 + }, + { + "auxiliary_loss_clip": 0.01111085, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.03786755, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.736958967740828, + "language_loss": 0.8456251, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86704469, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7863, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01112215, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.02422917, + "balance_loss_mlp": 1.04029155, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.8450896018132297, + "language_loss": 0.65939879, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68089092, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7864, + "time_per_iteration": 2.430302381515503 + }, + { + "auxiliary_loss_clip": 0.01111041, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03911948, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.898956112201793, + "language_loss": 0.65435767, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67577726, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 7865, + "time_per_iteration": 2.4585866928100586 + }, + { + "auxiliary_loss_clip": 0.01107492, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.02195215, + "balance_loss_mlp": 1.0378449, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 2.8918998215840244, + "language_loss": 0.74357843, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76499236, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 7866, + "time_per_iteration": 2.4264490604400635 + }, + { + "auxiliary_loss_clip": 0.01111501, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01718307, + "balance_loss_mlp": 1.03777552, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 3.2754467592530476, + "language_loss": 0.8285951, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85000992, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7867, + "time_per_iteration": 2.4925811290740967 + }, + { + "auxiliary_loss_clip": 0.011073, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.01655674, + "balance_loss_mlp": 1.03702307, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.5927913973026295, + "language_loss": 0.79137915, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81274265, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7868, + "time_per_iteration": 2.454094171524048 + }, + { + "auxiliary_loss_clip": 0.01114352, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01608682, + "balance_loss_mlp": 1.03858244, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 2.558281214251347, + "language_loss": 0.74588537, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76732659, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7869, + "time_per_iteration": 2.4809184074401855 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.02767277, + "balance_loss_mlp": 1.04122782, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.571794234452096, + "language_loss": 0.73950672, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76107442, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7870, + "time_per_iteration": 2.4553706645965576 + }, + { + "auxiliary_loss_clip": 0.01117025, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04082036, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.9039581815830153, + "language_loss": 0.81513011, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83668333, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7871, + "time_per_iteration": 2.5156424045562744 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.02067888, + "balance_loss_mlp": 1.03990555, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.6438263319482285, + "language_loss": 0.75679815, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77825779, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7872, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.01110565, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02088451, + "balance_loss_mlp": 1.03784847, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.7923349992019921, + "language_loss": 0.67857021, + "learning_rate": 2.268885542903428e-06, + "loss": 0.700019, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7873, + "time_per_iteration": 2.6532957553863525 + }, + { + "auxiliary_loss_clip": 0.01113022, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.01881886, + "balance_loss_mlp": 1.04162037, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6289748569468698, + "language_loss": 0.72085869, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74230838, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 7874, + "time_per_iteration": 2.474073648452759 + }, + { + "auxiliary_loss_clip": 0.01113429, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.02190506, + "balance_loss_mlp": 1.03987253, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.446593699000123, + "language_loss": 0.65108937, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67257631, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7875, + "time_per_iteration": 2.4433648586273193 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.01718342, + "balance_loss_mlp": 1.04219341, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.56524610984038, + "language_loss": 0.81091076, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83237696, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7876, + "time_per_iteration": 2.540485143661499 + }, + { + "auxiliary_loss_clip": 0.01108757, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02232385, + "balance_loss_mlp": 1.0358628, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7859307736041579, + "language_loss": 0.7925123, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81395495, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7877, + "time_per_iteration": 2.627589225769043 + }, + { + "auxiliary_loss_clip": 0.01110689, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01751554, + "balance_loss_mlp": 1.03852785, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.8692095295200843, + "language_loss": 0.70723194, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72864318, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 7878, + "time_per_iteration": 2.535684108734131 + }, + { + "auxiliary_loss_clip": 0.01108668, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.01792121, + "balance_loss_mlp": 1.03918552, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.811278524460759, + "language_loss": 0.75030494, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77169836, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7879, + "time_per_iteration": 2.518188953399658 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.01000904, + "balance_loss_clip": 0.99943775, + "balance_loss_mlp": 1.01098931, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7286317750961989, + "language_loss": 0.6135056, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63386428, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.01464844, + "router_z_loss_mlp": 0.24023438, + "step": 7880, + "time_per_iteration": 3.0518951416015625 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.01922059, + "balance_loss_mlp": 1.03901792, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5146846775966347, + "language_loss": 0.6795128, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70095479, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 7881, + "time_per_iteration": 2.5058367252349854 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01614857, + "balance_loss_mlp": 1.03997886, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.916106799054198, + "language_loss": 0.77455914, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79595923, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7882, + "time_per_iteration": 2.475503921508789 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01675582, + "balance_loss_mlp": 1.03993428, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.505527482540033, + "language_loss": 0.7617712, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78320408, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7883, + "time_per_iteration": 2.5051398277282715 + }, + { + "auxiliary_loss_clip": 0.01111273, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01724708, + "balance_loss_mlp": 1.03893495, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7576670192685107, + "language_loss": 0.71994746, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74135715, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 7884, + "time_per_iteration": 2.4406635761260986 + }, + { + "auxiliary_loss_clip": 0.01117273, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02024651, + "balance_loss_mlp": 1.04002821, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.026641651540024, + "language_loss": 0.82025737, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84177154, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7885, + "time_per_iteration": 2.463895797729492 + }, + { + "auxiliary_loss_clip": 0.01115601, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04353762, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.728500395905687, + "language_loss": 0.73431885, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75582302, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7886, + "time_per_iteration": 3.8351001739501953 + }, + { + "auxiliary_loss_clip": 0.01116571, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03938007, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.1265145819393667, + "language_loss": 0.73465097, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75616348, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7887, + "time_per_iteration": 5.437266111373901 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01562774, + "balance_loss_mlp": 1.03885245, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.895223723891788, + "language_loss": 0.77138984, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79275852, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7888, + "time_per_iteration": 3.8908259868621826 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02004313, + "balance_loss_mlp": 1.04045427, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.663584432705133, + "language_loss": 0.72822642, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74968517, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7889, + "time_per_iteration": 2.5004560947418213 + }, + { + "auxiliary_loss_clip": 0.01036118, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.00301266, + "balance_loss_mlp": 1.0120219, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 1.138520548555467, + "language_loss": 0.5608511, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58125609, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24121094, + "step": 7890, + "time_per_iteration": 3.116922378540039 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02009606, + "balance_loss_mlp": 1.04115105, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.185015538438359, + "language_loss": 0.6552254, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67672396, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.734375, + "step": 7891, + "time_per_iteration": 2.475003242492676 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04182184, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.136023484028619, + "language_loss": 0.70221758, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72375906, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7892, + "time_per_iteration": 2.45662260055542 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01003564, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.01211762, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8097608885887184, + "language_loss": 0.5861572, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60655481, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24121094, + "step": 7893, + "time_per_iteration": 3.1652448177337646 + }, + { + "auxiliary_loss_clip": 0.01114001, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04149461, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.8991850536849317, + "language_loss": 0.77645361, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79793239, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7894, + "time_per_iteration": 2.4849085807800293 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.01912403, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.6188047164673534, + "language_loss": 0.74456996, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76601076, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7895, + "time_per_iteration": 2.456735372543335 + }, + { + "auxiliary_loss_clip": 0.01110765, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01553416, + "balance_loss_mlp": 1.03990245, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.9073077974003343, + "language_loss": 0.82539713, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84679627, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 7896, + "time_per_iteration": 2.5201456546783447 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.0161047, + "balance_loss_mlp": 1.03953171, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.922550471395919, + "language_loss": 0.75487721, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77628207, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 7897, + "time_per_iteration": 2.42526912689209 + }, + { + "auxiliary_loss_clip": 0.01116598, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.02316415, + "balance_loss_mlp": 1.04003334, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.1696415620255145, + "language_loss": 0.63682836, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65835488, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7898, + "time_per_iteration": 2.443390369415283 + }, + { + "auxiliary_loss_clip": 0.01111767, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02263284, + "balance_loss_mlp": 1.03901982, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.0733269605967997, + "language_loss": 0.6999402, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72141939, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7899, + "time_per_iteration": 2.5906245708465576 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02314603, + "balance_loss_mlp": 1.04054523, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.8534573860401393, + "language_loss": 0.68523431, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70675093, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7900, + "time_per_iteration": 2.5417144298553467 + }, + { + "auxiliary_loss_clip": 0.01112761, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02096641, + "balance_loss_mlp": 1.03979492, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9751823447072345, + "language_loss": 0.70783907, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72930533, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7901, + "time_per_iteration": 2.5215682983398438 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02833235, + "balance_loss_mlp": 1.0420568, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.7245601487210742, + "language_loss": 0.73674953, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75831395, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7902, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.02169394, + "balance_loss_mlp": 1.03990698, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.6802974507725348, + "language_loss": 0.68601072, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70743585, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 7903, + "time_per_iteration": 2.44101619720459 + }, + { + "auxiliary_loss_clip": 0.01108361, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03901863, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4630263980427167, + "language_loss": 0.7225582, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74396306, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7904, + "time_per_iteration": 2.469230890274048 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.01894665, + "balance_loss_mlp": 1.03912354, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.669936371268517, + "language_loss": 0.86257637, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88397133, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7905, + "time_per_iteration": 2.442215919494629 + }, + { + "auxiliary_loss_clip": 0.0110692, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01750422, + "balance_loss_mlp": 1.03796053, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.6116801799731275, + "language_loss": 0.82223809, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84360093, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 7906, + "time_per_iteration": 2.503708600997925 + }, + { + "auxiliary_loss_clip": 0.01033043, + "auxiliary_loss_mlp": 0.01004824, + "balance_loss_clip": 1.00345886, + "balance_loss_mlp": 1.00910616, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6702574149317626, + "language_loss": 0.59028685, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61066544, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.23925781, + "step": 7907, + "time_per_iteration": 3.156270980834961 + }, + { + "auxiliary_loss_clip": 0.01112242, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.02349377, + "balance_loss_mlp": 1.04145598, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.9115330257313565, + "language_loss": 0.81044137, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83192551, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7908, + "time_per_iteration": 2.4719884395599365 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.01944923, + "balance_loss_mlp": 1.04349983, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.7275790068018955, + "language_loss": 0.73515987, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75665224, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 7909, + "time_per_iteration": 2.4672436714172363 + }, + { + "auxiliary_loss_clip": 0.01110088, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.01887441, + "balance_loss_mlp": 1.03941047, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.618978075546398, + "language_loss": 0.75284743, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77426249, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 7910, + "time_per_iteration": 2.498745918273926 + }, + { + "auxiliary_loss_clip": 0.0110873, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03872323, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.8146975429148502, + "language_loss": 0.78950047, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81085479, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7911, + "time_per_iteration": 2.4530739784240723 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01709199, + "balance_loss_mlp": 1.03904319, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.5788116451196046, + "language_loss": 0.75611186, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77754539, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7912, + "time_per_iteration": 2.468348741531372 + }, + { + "auxiliary_loss_clip": 0.01107815, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.02094245, + "balance_loss_mlp": 1.03746927, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.4305595105203048, + "language_loss": 0.74305665, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76446521, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7913, + "time_per_iteration": 2.4857094287872314 + }, + { + "auxiliary_loss_clip": 0.01111637, + "auxiliary_loss_mlp": 0.01026142, + "balance_loss_clip": 1.01336265, + "balance_loss_mlp": 1.04057527, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 1.9652679728787295, + "language_loss": 0.72320372, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74458152, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7914, + "time_per_iteration": 2.4559848308563232 + }, + { + "auxiliary_loss_clip": 0.01114052, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.0191946, + "balance_loss_mlp": 1.04203475, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 1.960460869956429, + "language_loss": 0.64513958, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66659272, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7915, + "time_per_iteration": 2.4528729915618896 + }, + { + "auxiliary_loss_clip": 0.01106319, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.020257, + "balance_loss_mlp": 1.03847694, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.6765568872542898, + "language_loss": 0.76760435, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.7889936, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 7916, + "time_per_iteration": 2.4544637203216553 + }, + { + "auxiliary_loss_clip": 0.01109831, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01720083, + "balance_loss_mlp": 1.038872, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.7964770898598468, + "language_loss": 0.64513361, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66652668, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7917, + "time_per_iteration": 2.4966535568237305 + }, + { + "auxiliary_loss_clip": 0.01033431, + "auxiliary_loss_mlp": 0.01003778, + "balance_loss_clip": 1.00258541, + "balance_loss_mlp": 1.00975943, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8336021747517385, + "language_loss": 0.6568867, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67725885, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.23632812, + "step": 7918, + "time_per_iteration": 3.0902352333068848 + }, + { + "auxiliary_loss_clip": 0.01111138, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.01937342, + "balance_loss_mlp": 1.03909731, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.7210259476746916, + "language_loss": 0.6884234, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70985305, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7919, + "time_per_iteration": 2.451730728149414 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02188087, + "balance_loss_mlp": 1.03897047, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.5380536315740185, + "language_loss": 0.74750632, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7689606, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7920, + "time_per_iteration": 2.5365359783172607 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.01854539, + "balance_loss_mlp": 1.04174948, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4190261222987137, + "language_loss": 0.77478063, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79626137, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7921, + "time_per_iteration": 2.4841856956481934 + }, + { + "auxiliary_loss_clip": 0.01112061, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02113843, + "balance_loss_mlp": 1.03917885, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.531083685843196, + "language_loss": 0.78213, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80360126, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7922, + "time_per_iteration": 2.537930965423584 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02360809, + "balance_loss_mlp": 1.04113102, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.7101716924021442, + "language_loss": 0.72932559, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75085688, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7923, + "time_per_iteration": 2.4527640342712402 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03808331, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 2.125534979901623, + "language_loss": 0.81915551, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84058034, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 7924, + "time_per_iteration": 2.480109930038452 + }, + { + "auxiliary_loss_clip": 0.01120558, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.0246644, + "balance_loss_mlp": 1.04359889, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.7710398873833821, + "language_loss": 0.80079067, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82237971, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7925, + "time_per_iteration": 2.4877142906188965 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.02343702, + "balance_loss_mlp": 1.03800642, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 2.066985409764694, + "language_loss": 0.72263825, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74410343, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7926, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.01623797, + "balance_loss_mlp": 1.04205072, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 8.404578303652414, + "language_loss": 0.68589562, + "learning_rate": 2.248031062546432e-06, + "loss": 0.7073611, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7927, + "time_per_iteration": 2.4860117435455322 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.0138253, + "balance_loss_mlp": 1.04121518, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.5906069345122125, + "language_loss": 0.68003678, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70140767, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 7928, + "time_per_iteration": 3.917212724685669 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.0160315, + "balance_loss_mlp": 1.04099739, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.0359036820122762, + "language_loss": 0.79055941, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81197274, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7929, + "time_per_iteration": 5.38159441947937 + }, + { + "auxiliary_loss_clip": 0.01109888, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02098787, + "balance_loss_mlp": 1.04033756, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.8427147864954625, + "language_loss": 0.6634798, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68490613, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 7930, + "time_per_iteration": 4.1562559604644775 + }, + { + "auxiliary_loss_clip": 0.01110022, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03929853, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.7695493738399266, + "language_loss": 0.80279613, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82420039, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7931, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.01112785, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01667237, + "balance_loss_mlp": 1.04009867, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.714860616709588, + "language_loss": 0.75956833, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78099489, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7932, + "time_per_iteration": 2.4789490699768066 + }, + { + "auxiliary_loss_clip": 0.0111028, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02110386, + "balance_loss_mlp": 1.04108882, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 2.3368480026304748, + "language_loss": 0.79639196, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81783438, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7933, + "time_per_iteration": 2.4574432373046875 + }, + { + "auxiliary_loss_clip": 0.01116858, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02131057, + "balance_loss_mlp": 1.04114437, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.7879612820388389, + "language_loss": 0.73776019, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.759287, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 7934, + "time_per_iteration": 2.4703593254089355 + }, + { + "auxiliary_loss_clip": 0.0111259, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.01611567, + "balance_loss_mlp": 1.03858674, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.719427707895152, + "language_loss": 0.7973842, + "learning_rate": 2.244939121664211e-06, + "loss": 0.81879967, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 7935, + "time_per_iteration": 2.459326982498169 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02566767, + "balance_loss_mlp": 1.04244995, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.7712234775739364, + "language_loss": 0.71105671, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73264545, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 7936, + "time_per_iteration": 2.599914312362671 + }, + { + "auxiliary_loss_clip": 0.01112402, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.01529551, + "balance_loss_mlp": 1.03864932, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.8731818732430927, + "language_loss": 0.68026948, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.7016744, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7937, + "time_per_iteration": 2.4884297847747803 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.00999711, + "balance_loss_clip": 0.99838793, + "balance_loss_mlp": 1.01120472, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7133873095384958, + "language_loss": 0.56401992, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58437109, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2421875, + "step": 7938, + "time_per_iteration": 3.27707576751709 + }, + { + "auxiliary_loss_clip": 0.01113753, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.01889467, + "balance_loss_mlp": 1.04162848, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.6305385471502185, + "language_loss": 0.88721037, + "learning_rate": 2.243392927839317e-06, + "loss": 0.9086687, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7939, + "time_per_iteration": 2.503838300704956 + }, + { + "auxiliary_loss_clip": 0.01110311, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02037096, + "balance_loss_mlp": 1.03832293, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 2.146362570276984, + "language_loss": 0.76661658, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.78804338, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7940, + "time_per_iteration": 2.4230127334594727 + }, + { + "auxiliary_loss_clip": 0.01109098, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.0194304, + "balance_loss_mlp": 1.03975916, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.568994035010224, + "language_loss": 0.84892023, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87032247, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 7941, + "time_per_iteration": 2.4640510082244873 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.02263689, + "balance_loss_mlp": 1.04307771, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 2.0154740266117104, + "language_loss": 0.75996536, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78149283, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7942, + "time_per_iteration": 2.4304351806640625 + }, + { + "auxiliary_loss_clip": 0.01113984, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.02646661, + "balance_loss_mlp": 1.0415473, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.8198127192389717, + "language_loss": 0.64578187, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66730648, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7943, + "time_per_iteration": 2.469884157180786 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.02318239, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.6437441778624493, + "language_loss": 0.73638076, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75789517, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7944, + "time_per_iteration": 2.462620258331299 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02092481, + "balance_loss_mlp": 1.04105759, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.2015870606275785, + "language_loss": 0.67936689, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70085418, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7945, + "time_per_iteration": 2.498506784439087 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02483487, + "balance_loss_mlp": 1.03805077, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.8282867356700874, + "language_loss": 0.75330615, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77477872, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7946, + "time_per_iteration": 2.5168514251708984 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.02759135, + "balance_loss_mlp": 1.04283607, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7491504350819331, + "language_loss": 0.79312646, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81470287, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 7947, + "time_per_iteration": 2.5980498790740967 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02433419, + "balance_loss_mlp": 1.0381552, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.7633094448758173, + "language_loss": 0.73717982, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75862265, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 7948, + "time_per_iteration": 2.446190357208252 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02050555, + "balance_loss_mlp": 1.04240656, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.5048270934573464, + "language_loss": 0.77945703, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80094588, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7949, + "time_per_iteration": 2.4999916553497314 + }, + { + "auxiliary_loss_clip": 0.01107805, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.02077556, + "balance_loss_mlp": 1.0387454, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.112378262987889, + "language_loss": 0.74019569, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.7616021, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 7950, + "time_per_iteration": 2.4387645721435547 + }, + { + "auxiliary_loss_clip": 0.01110159, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02609253, + "balance_loss_mlp": 1.03978574, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.7104198942075015, + "language_loss": 0.74135828, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76285648, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.703125, + "step": 7951, + "time_per_iteration": 2.579258680343628 + }, + { + "auxiliary_loss_clip": 0.01112662, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03915167, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 1.8112920130665326, + "language_loss": 0.79960251, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82105488, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7952, + "time_per_iteration": 2.5007214546203613 + }, + { + "auxiliary_loss_clip": 0.01114258, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.02687836, + "balance_loss_mlp": 1.04040217, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.7026148138194093, + "language_loss": 0.78196061, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80350602, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7953, + "time_per_iteration": 2.4699995517730713 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.02362204, + "balance_loss_mlp": 1.0405128, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.2363441879819224, + "language_loss": 0.84142399, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86293399, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7954, + "time_per_iteration": 2.41294527053833 + }, + { + "auxiliary_loss_clip": 0.01109876, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02254462, + "balance_loss_mlp": 1.03839588, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.442835840236476, + "language_loss": 0.70588672, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72734004, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 7955, + "time_per_iteration": 2.4867892265319824 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.02507281, + "balance_loss_mlp": 1.03925073, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5835230785797205, + "language_loss": 0.817267, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83875084, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7956, + "time_per_iteration": 2.4756619930267334 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.04097366, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.8961411498697718, + "language_loss": 0.84901869, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87047327, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 7957, + "time_per_iteration": 2.4848859310150146 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.02065289, + "balance_loss_mlp": 1.0396328, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.5799276625975138, + "language_loss": 0.79682672, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81826073, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7958, + "time_per_iteration": 2.439040422439575 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02074742, + "balance_loss_mlp": 1.03806448, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 2.0401185124291406, + "language_loss": 0.82728368, + "learning_rate": 2.235659762404047e-06, + "loss": 0.8487246, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 7959, + "time_per_iteration": 2.500182867050171 + }, + { + "auxiliary_loss_clip": 0.01108176, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.0219152, + "balance_loss_mlp": 1.04054058, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.3853858164000292, + "language_loss": 0.7333414, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75475383, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 7960, + "time_per_iteration": 2.4852850437164307 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03937268, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.8739024393884087, + "language_loss": 0.77067018, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79211915, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 7961, + "time_per_iteration": 2.482361316680908 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01817513, + "balance_loss_mlp": 1.03838158, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.629700477315198, + "language_loss": 0.77528512, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.7966699, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7962, + "time_per_iteration": 2.427537679672241 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01034413, + "balance_loss_clip": 1.02196801, + "balance_loss_mlp": 1.04174328, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.5913499246445781, + "language_loss": 0.64895082, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67042321, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7963, + "time_per_iteration": 2.51082181930542 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.01748848, + "balance_loss_mlp": 1.03972077, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.658229101322456, + "language_loss": 0.77974397, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80115253, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7964, + "time_per_iteration": 2.6512372493743896 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01617479, + "balance_loss_mlp": 1.03944111, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.7558149312417117, + "language_loss": 0.76227248, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78371561, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7965, + "time_per_iteration": 2.4919536113739014 + }, + { + "auxiliary_loss_clip": 0.01108501, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.01957512, + "balance_loss_mlp": 1.0382036, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 2.251400870531799, + "language_loss": 0.74590349, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76730978, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 7966, + "time_per_iteration": 2.4254770278930664 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.0169003, + "balance_loss_mlp": 1.03785586, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.521959054408531, + "language_loss": 0.72728515, + "learning_rate": 2.232565488801655e-06, + "loss": 0.74866927, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7967, + "time_per_iteration": 2.522883892059326 + }, + { + "auxiliary_loss_clip": 0.01103831, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01433849, + "balance_loss_mlp": 1.0371958, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 2.344774601020355, + "language_loss": 0.79174602, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81304824, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 7968, + "time_per_iteration": 2.4777579307556152 + }, + { + "auxiliary_loss_clip": 0.01035385, + "auxiliary_loss_mlp": 0.01007575, + "balance_loss_clip": 1.00622833, + "balance_loss_mlp": 1.0118711, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7636022901302345, + "language_loss": 0.62258303, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64301264, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.23535156, + "step": 7969, + "time_per_iteration": 4.618057012557983 + }, + { + "auxiliary_loss_clip": 0.01107101, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01555026, + "balance_loss_mlp": 1.04000521, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.5307915717866403, + "language_loss": 0.77086926, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79221207, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 7970, + "time_per_iteration": 2.469363212585449 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.01962733, + "balance_loss_mlp": 1.03676999, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.595425961628827, + "language_loss": 0.70320344, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72459716, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 7971, + "time_per_iteration": 5.436426401138306 + }, + { + "auxiliary_loss_clip": 0.01107204, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01228452, + "balance_loss_mlp": 1.03725302, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.2757793979028687, + "language_loss": 0.79909688, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82042515, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 7972, + "time_per_iteration": 2.4788928031921387 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01220274, + "balance_loss_mlp": 1.03801394, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.154896563362021, + "language_loss": 0.69762838, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71897495, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 7973, + "time_per_iteration": 2.462674140930176 + }, + { + "auxiliary_loss_clip": 0.01108438, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.01943266, + "balance_loss_mlp": 1.0401777, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.7300676969557445, + "language_loss": 0.78652924, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80792892, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 7974, + "time_per_iteration": 2.523935079574585 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.01004075, + "balance_loss_clip": 1.00275135, + "balance_loss_mlp": 1.01174331, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7575595850509929, + "language_loss": 0.54076326, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56115806, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23632812, + "step": 7975, + "time_per_iteration": 3.120290756225586 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.01946688, + "balance_loss_mlp": 1.03872228, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.0952625936259226, + "language_loss": 0.90246761, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92393565, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7976, + "time_per_iteration": 2.4177215099334717 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.02070153, + "balance_loss_mlp": 1.03989267, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.1692733838107148, + "language_loss": 0.73631197, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75779295, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7977, + "time_per_iteration": 2.478994846343994 + }, + { + "auxiliary_loss_clip": 0.01105095, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.02095962, + "balance_loss_mlp": 1.03737617, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5189317692466735, + "language_loss": 0.78386033, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80524224, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 7978, + "time_per_iteration": 2.441770315170288 + }, + { + "auxiliary_loss_clip": 0.01110092, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03895688, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.9080949377976553, + "language_loss": 0.89561266, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91704339, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7979, + "time_per_iteration": 2.5005874633789062 + }, + { + "auxiliary_loss_clip": 0.01111373, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03977728, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.512941625260848, + "language_loss": 0.77104276, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79248011, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 7980, + "time_per_iteration": 2.514702320098877 + }, + { + "auxiliary_loss_clip": 0.01112304, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.0177083, + "balance_loss_mlp": 1.03812611, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.6709892763913308, + "language_loss": 0.71718562, + "learning_rate": 2.227149156404295e-06, + "loss": 0.738626, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7981, + "time_per_iteration": 2.606919050216675 + }, + { + "auxiliary_loss_clip": 0.01107255, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01743317, + "balance_loss_mlp": 1.03878653, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.7550369517172573, + "language_loss": 0.70141387, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72278404, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 7982, + "time_per_iteration": 2.4303736686706543 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.01558483, + "balance_loss_mlp": 1.03694749, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 2.256566494766253, + "language_loss": 0.70977259, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73106241, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 7983, + "time_per_iteration": 2.520749092102051 + }, + { + "auxiliary_loss_clip": 0.01032541, + "auxiliary_loss_mlp": 0.01011047, + "balance_loss_clip": 1.00992036, + "balance_loss_mlp": 1.00916195, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8049867321392653, + "language_loss": 0.59458363, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.6150195, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.234375, + "step": 7984, + "time_per_iteration": 3.0019614696502686 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.01912713, + "balance_loss_mlp": 1.0376364, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.5803111762139084, + "language_loss": 0.66603255, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68742514, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7985, + "time_per_iteration": 2.459381341934204 + }, + { + "auxiliary_loss_clip": 0.01108889, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02279973, + "balance_loss_mlp": 1.03655791, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.8105960725352928, + "language_loss": 0.70750952, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72895944, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7986, + "time_per_iteration": 2.412890911102295 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.02515244, + "balance_loss_mlp": 1.03964305, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.571002176109277, + "language_loss": 0.78704774, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.80857182, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7987, + "time_per_iteration": 2.464531898498535 + }, + { + "auxiliary_loss_clip": 0.01107017, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02410507, + "balance_loss_mlp": 1.03615475, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.8312114483143844, + "language_loss": 0.75309592, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77453303, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7988, + "time_per_iteration": 2.4185469150543213 + }, + { + "auxiliary_loss_clip": 0.01113071, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01667249, + "balance_loss_mlp": 1.04115009, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.9770525324174564, + "language_loss": 0.78992975, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81135416, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7989, + "time_per_iteration": 2.4614450931549072 + }, + { + "auxiliary_loss_clip": 0.01113161, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.02273488, + "balance_loss_mlp": 1.03810394, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.6525338075260034, + "language_loss": 0.73414218, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75564027, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7990, + "time_per_iteration": 2.562366008758545 + }, + { + "auxiliary_loss_clip": 0.01032695, + "auxiliary_loss_mlp": 0.0100018, + "balance_loss_clip": 0.99901813, + "balance_loss_mlp": 1.00915992, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 1.0595345338831614, + "language_loss": 0.59085703, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61118573, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 7991, + "time_per_iteration": 3.1877033710479736 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.01626837, + "balance_loss_mlp": 1.03751063, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.8662124275999659, + "language_loss": 0.67495418, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69632453, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7992, + "time_per_iteration": 2.5135016441345215 + }, + { + "auxiliary_loss_clip": 0.01109706, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01969361, + "balance_loss_mlp": 1.03664112, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6211148746347477, + "language_loss": 0.76493919, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78636301, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7993, + "time_per_iteration": 2.5075619220733643 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.02267301, + "balance_loss_mlp": 1.03899574, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.5028541481112037, + "language_loss": 0.78277898, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80421537, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7994, + "time_per_iteration": 2.4792723655700684 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.02010691, + "balance_loss_mlp": 1.03752637, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.8792905950371066, + "language_loss": 0.79627287, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81768769, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7995, + "time_per_iteration": 2.4605226516723633 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.01676297, + "balance_loss_mlp": 1.03693795, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.8681673839648991, + "language_loss": 0.8255161, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84688872, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7996, + "time_per_iteration": 2.4627599716186523 + }, + { + "auxiliary_loss_clip": 0.01108595, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01562989, + "balance_loss_mlp": 1.03879523, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.619215200240117, + "language_loss": 0.80642337, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82779169, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7997, + "time_per_iteration": 2.450486660003662 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02174699, + "balance_loss_mlp": 1.03695917, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.8605056175819474, + "language_loss": 0.72481054, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74624306, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7998, + "time_per_iteration": 2.484501361846924 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.01893795, + "balance_loss_mlp": 1.03890526, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.7021894106986095, + "language_loss": 0.71182632, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73325378, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7999, + "time_per_iteration": 2.5011837482452393 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02758801, + "balance_loss_mlp": 1.03862715, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 2.087936802810397, + "language_loss": 0.71136171, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73289621, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 8000, + "time_per_iteration": 2.473083019256592 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02291203, + "balance_loss_mlp": 1.03987443, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.2945806687832948, + "language_loss": 0.75104553, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77252746, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8001, + "time_per_iteration": 2.6078953742980957 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.02701581, + "balance_loss_mlp": 1.03889596, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.8640621993165467, + "language_loss": 0.81407833, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83560812, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8002, + "time_per_iteration": 2.4381091594696045 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.02415216, + "balance_loss_mlp": 1.04037309, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7407260367663493, + "language_loss": 0.71673185, + "learning_rate": 2.218634381467819e-06, + "loss": 0.7382561, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8003, + "time_per_iteration": 2.5028979778289795 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.04041362, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.9713418243952783, + "language_loss": 0.82751715, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84901035, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8004, + "time_per_iteration": 2.4438235759735107 + }, + { + "auxiliary_loss_clip": 0.0112055, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.02951062, + "balance_loss_mlp": 1.04235947, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.0081127141146964, + "language_loss": 0.77780354, + "learning_rate": 2.217860109695239e-06, + "loss": 0.7994566, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 8005, + "time_per_iteration": 2.4440789222717285 + }, + { + "auxiliary_loss_clip": 0.01109918, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0218395, + "balance_loss_mlp": 1.03705537, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 3.988142696329101, + "language_loss": 0.70656502, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72801799, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 8006, + "time_per_iteration": 2.4627490043640137 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.02357328, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.9148811651735764, + "language_loss": 0.70463514, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72611892, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8007, + "time_per_iteration": 2.4923551082611084 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02225244, + "balance_loss_mlp": 1.03924334, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.0099977087556202, + "language_loss": 0.71720552, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7386902, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8008, + "time_per_iteration": 2.443068742752075 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01046807, + "balance_loss_clip": 1.0317508, + "balance_loss_mlp": 1.03984571, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.7155117192574523, + "language_loss": 0.60448718, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62610233, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.74609375, + "step": 8009, + "time_per_iteration": 2.4860730171203613 + }, + { + "auxiliary_loss_clip": 0.01041953, + "auxiliary_loss_mlp": 0.01003034, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.01788867, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.861211973736155, + "language_loss": 0.61329502, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.6337449, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.24121094, + "step": 8010, + "time_per_iteration": 3.073617935180664 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01045892, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.04191947, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 2.200850795507016, + "language_loss": 0.73003197, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75164282, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.734375, + "step": 8011, + "time_per_iteration": 3.875464677810669 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.02398205, + "balance_loss_mlp": 1.03922546, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.7669872730797296, + "language_loss": 0.79906964, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82052571, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8012, + "time_per_iteration": 5.410374164581299 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01047469, + "balance_loss_clip": 1.03336632, + "balance_loss_mlp": 1.04086518, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.5982967759080098, + "language_loss": 0.73816693, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75978434, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 8013, + "time_per_iteration": 4.00807785987854 + }, + { + "auxiliary_loss_clip": 0.01112131, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.02043676, + "balance_loss_mlp": 1.04102039, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 4.768803838152643, + "language_loss": 0.90554619, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92699754, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8014, + "time_per_iteration": 2.4615042209625244 + }, + { + "auxiliary_loss_clip": 0.01116604, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.02456379, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 3.0531094865391073, + "language_loss": 0.74407947, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76562929, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 8015, + "time_per_iteration": 2.434838056564331 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.02588332, + "balance_loss_mlp": 1.04072225, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.017951331310383, + "language_loss": 0.8059243, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82750583, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 8016, + "time_per_iteration": 2.513319492340088 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.02206254, + "balance_loss_mlp": 1.04101717, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 2.4127244097624847, + "language_loss": 0.76781118, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.78925556, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 8017, + "time_per_iteration": 2.4602606296539307 + }, + { + "auxiliary_loss_clip": 0.011107, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.04151559, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.9887798442379552, + "language_loss": 0.80156118, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82299387, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 8018, + "time_per_iteration": 2.5529282093048096 + }, + { + "auxiliary_loss_clip": 0.01113443, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.01667559, + "balance_loss_mlp": 1.04109669, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.7653706812529009, + "language_loss": 0.75843483, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.77986348, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8019, + "time_per_iteration": 2.4978489875793457 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.02286255, + "balance_loss_mlp": 1.03955722, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.7828460534537498, + "language_loss": 0.78554976, + "learning_rate": 2.212052026199701e-06, + "loss": 0.80704254, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 8020, + "time_per_iteration": 2.503870725631714 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.02043533, + "balance_loss_mlp": 1.04134321, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.4275685595470207, + "language_loss": 0.69718045, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71865243, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8021, + "time_per_iteration": 2.4298038482666016 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.01797438, + "balance_loss_mlp": 1.0407902, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.6547112313669838, + "language_loss": 0.62773043, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.64921963, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8022, + "time_per_iteration": 2.4862682819366455 + }, + { + "auxiliary_loss_clip": 0.01109497, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01683092, + "balance_loss_mlp": 1.03976464, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.257171661165274, + "language_loss": 0.66345549, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68484527, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8023, + "time_per_iteration": 2.4498074054718018 + }, + { + "auxiliary_loss_clip": 0.01109691, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.02181077, + "balance_loss_mlp": 1.0379076, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 2.6609441563285485, + "language_loss": 0.76680458, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78825533, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8024, + "time_per_iteration": 2.5641326904296875 + }, + { + "auxiliary_loss_clip": 0.01111982, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.01926339, + "balance_loss_mlp": 1.03856826, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.4456982310337658, + "language_loss": 0.75299227, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77445179, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8025, + "time_per_iteration": 2.4700748920440674 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.02159774, + "balance_loss_mlp": 1.04015994, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.85740453148256, + "language_loss": 0.71010149, + "learning_rate": 2.209728283441112e-06, + "loss": 0.7315712, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8026, + "time_per_iteration": 2.451942205429077 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02739012, + "balance_loss_mlp": 1.04088664, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.002376238963681, + "language_loss": 0.74738306, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76897156, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.75, + "step": 8027, + "time_per_iteration": 2.511625051498413 + }, + { + "auxiliary_loss_clip": 0.01116324, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.02260458, + "balance_loss_mlp": 1.0418303, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.8015680699639052, + "language_loss": 0.6744982, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69602323, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 8028, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01114464, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.02490783, + "balance_loss_mlp": 1.04192257, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.8869203156454395, + "language_loss": 0.73063505, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75217235, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7265625, + "step": 8029, + "time_per_iteration": 2.4256598949432373 + }, + { + "auxiliary_loss_clip": 0.01114009, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.01691651, + "balance_loss_mlp": 1.03949094, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 1.9568889088417416, + "language_loss": 0.85374999, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87520409, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8030, + "time_per_iteration": 2.4838480949401855 + }, + { + "auxiliary_loss_clip": 0.01111314, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.01917291, + "balance_loss_mlp": 1.03858352, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.946134860300181, + "language_loss": 0.74173188, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76316977, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8031, + "time_per_iteration": 2.475564956665039 + }, + { + "auxiliary_loss_clip": 0.01118074, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.03023958, + "balance_loss_mlp": 1.04181576, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.8194651882134072, + "language_loss": 0.71833324, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73996472, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 8032, + "time_per_iteration": 2.5389230251312256 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.02326274, + "balance_loss_mlp": 1.03896618, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.5190699612157064, + "language_loss": 0.74008, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76156777, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8033, + "time_per_iteration": 2.497344493865967 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.02273428, + "balance_loss_mlp": 1.04200494, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.7070178882470917, + "language_loss": 0.82929307, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85084462, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 8034, + "time_per_iteration": 2.504986524581909 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.04048431, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 2.2841237596844493, + "language_loss": 0.79519325, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81662393, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 8035, + "time_per_iteration": 2.497851610183716 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.02656746, + "balance_loss_mlp": 1.04139149, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.7925521800027493, + "language_loss": 0.69359076, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71516669, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7421875, + "step": 8036, + "time_per_iteration": 2.6260759830474854 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.0211308, + "balance_loss_mlp": 1.03983057, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 2.034912964838748, + "language_loss": 0.72518653, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74665534, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8037, + "time_per_iteration": 2.4452965259552 + }, + { + "auxiliary_loss_clip": 0.01120439, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.02053404, + "balance_loss_mlp": 1.04226792, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.369475157435804, + "language_loss": 0.69122416, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71278501, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 8038, + "time_per_iteration": 2.4694747924804688 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.02201188, + "balance_loss_mlp": 1.04133189, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.4952565926757524, + "language_loss": 0.78972542, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8112368, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 8039, + "time_per_iteration": 2.5778839588165283 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.01874638, + "balance_loss_mlp": 1.04335415, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.6799663014860025, + "language_loss": 0.76981616, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79131073, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8040, + "time_per_iteration": 2.4846322536468506 + }, + { + "auxiliary_loss_clip": 0.01116146, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.02283335, + "balance_loss_mlp": 1.04120946, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.5584368035119462, + "language_loss": 0.75443131, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77597177, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 8041, + "time_per_iteration": 2.5853140354156494 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.02178383, + "balance_loss_mlp": 1.04486728, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.8135207231669344, + "language_loss": 0.66745925, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68897855, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 8042, + "time_per_iteration": 2.5322182178497314 + }, + { + "auxiliary_loss_clip": 0.01040325, + "auxiliary_loss_mlp": 0.00998367, + "balance_loss_clip": 0.99714488, + "balance_loss_mlp": 1.0165081, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.687656922942032, + "language_loss": 0.58557642, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60596335, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 8043, + "time_per_iteration": 3.1435444355010986 + }, + { + "auxiliary_loss_clip": 0.01115264, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.01982713, + "balance_loss_mlp": 1.04060805, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 1.8614249809437893, + "language_loss": 0.71973354, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.7412324, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8044, + "time_per_iteration": 2.4688329696655273 + }, + { + "auxiliary_loss_clip": 0.01113296, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.01702118, + "balance_loss_mlp": 1.04181921, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.358705165779184, + "language_loss": 0.75938857, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78084195, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.71484375, + "step": 8045, + "time_per_iteration": 2.455991506576538 + }, + { + "auxiliary_loss_clip": 0.01117445, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.04251719, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.8505124624812508, + "language_loss": 0.69661564, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71819568, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 8046, + "time_per_iteration": 2.480437994003296 + }, + { + "auxiliary_loss_clip": 0.01113741, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.04073739, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 3.209923694390607, + "language_loss": 0.819103, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84060085, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8047, + "time_per_iteration": 2.4875996112823486 + }, + { + "auxiliary_loss_clip": 0.01111465, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.01802719, + "balance_loss_mlp": 1.04047942, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.602624612336977, + "language_loss": 0.80215144, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82358307, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7109375, + "step": 8048, + "time_per_iteration": 2.5097532272338867 + }, + { + "auxiliary_loss_clip": 0.0111735, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.02204585, + "balance_loss_mlp": 1.0415504, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.5504815305200743, + "language_loss": 0.81360143, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83514082, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8049, + "time_per_iteration": 2.5025296211242676 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.02170801, + "balance_loss_mlp": 1.04200411, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.7765572151997517, + "language_loss": 0.72636938, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74782485, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8050, + "time_per_iteration": 2.4983279705047607 + }, + { + "auxiliary_loss_clip": 0.01039152, + "auxiliary_loss_mlp": 0.01005399, + "balance_loss_clip": 1.00414741, + "balance_loss_mlp": 1.01505625, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7015070380534334, + "language_loss": 0.56459856, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58504415, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24121094, + "step": 8051, + "time_per_iteration": 3.1124837398529053 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.0198456, + "balance_loss_mlp": 1.04258502, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.416646260203107, + "language_loss": 0.7510823, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77258313, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8052, + "time_per_iteration": 3.970653772354126 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.01953709, + "balance_loss_mlp": 1.04124272, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 3.0848333967382855, + "language_loss": 0.65859687, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68007052, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8053, + "time_per_iteration": 2.489314079284668 + }, + { + "auxiliary_loss_clip": 0.01108306, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.01981306, + "balance_loss_mlp": 1.03776336, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.8753990029707186, + "language_loss": 0.6933912, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71480489, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8054, + "time_per_iteration": 4.118170976638794 + }, + { + "auxiliary_loss_clip": 0.01110556, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.02187181, + "balance_loss_mlp": 1.03860784, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.7081803235265158, + "language_loss": 0.69577026, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.7172299, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8055, + "time_per_iteration": 3.932403326034546 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.0229013, + "balance_loss_mlp": 1.04260492, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.9345474086324397, + "language_loss": 0.631603, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65312105, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8056, + "time_per_iteration": 2.4628608226776123 + }, + { + "auxiliary_loss_clip": 0.01110953, + "auxiliary_loss_mlp": 0.0103397, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03856075, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.6727278675155979, + "language_loss": 0.67380416, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69525343, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 8057, + "time_per_iteration": 2.5488758087158203 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.02286661, + "balance_loss_mlp": 1.03944063, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.62294394814829, + "language_loss": 0.81633735, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83780485, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8058, + "time_per_iteration": 2.4864389896392822 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.02458692, + "balance_loss_mlp": 1.04142284, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.5675258134335472, + "language_loss": 0.79917222, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82072222, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8059, + "time_per_iteration": 2.4964730739593506 + }, + { + "auxiliary_loss_clip": 0.01117834, + "auxiliary_loss_mlp": 0.0104156, + "balance_loss_clip": 1.02709424, + "balance_loss_mlp": 1.04217446, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.4233986338774347, + "language_loss": 0.66882968, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69042355, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8060, + "time_per_iteration": 2.6209259033203125 + }, + { + "auxiliary_loss_clip": 0.01116591, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.04357326, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.8494683744964096, + "language_loss": 0.67328548, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69485319, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8061, + "time_per_iteration": 2.460986614227295 + }, + { + "auxiliary_loss_clip": 0.01116735, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.0274322, + "balance_loss_mlp": 1.04356933, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 2.133282380017761, + "language_loss": 0.82559311, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84717953, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8062, + "time_per_iteration": 2.453993320465088 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.022277, + "balance_loss_mlp": 1.04087675, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.7643008090816974, + "language_loss": 0.7443378, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76581317, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8063, + "time_per_iteration": 2.4603588581085205 + }, + { + "auxiliary_loss_clip": 0.01113086, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.01978183, + "balance_loss_mlp": 1.04069591, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.6491790763512546, + "language_loss": 0.78826106, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80972517, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8064, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.02426863, + "balance_loss_mlp": 1.04178667, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.866783501124255, + "language_loss": 0.79383814, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81530446, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8065, + "time_per_iteration": 2.445235013961792 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03714252, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 2.505071872189949, + "language_loss": 0.76120496, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78258789, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 8066, + "time_per_iteration": 2.484790325164795 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.02022457, + "balance_loss_mlp": 1.04121971, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.8377201756800503, + "language_loss": 0.7205655, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74201524, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8067, + "time_per_iteration": 2.4876203536987305 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02231634, + "balance_loss_mlp": 1.04024172, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.0010459311949393, + "language_loss": 0.79434109, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81582052, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8068, + "time_per_iteration": 2.4537808895111084 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02171147, + "balance_loss_mlp": 1.0385673, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4177927500996443, + "language_loss": 0.8413924, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86282146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8069, + "time_per_iteration": 2.4553275108337402 + }, + { + "auxiliary_loss_clip": 0.0110935, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02042496, + "balance_loss_mlp": 1.03913558, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.6522403411207847, + "language_loss": 0.77863526, + "learning_rate": 2.192678959687493e-06, + "loss": 0.8000586, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8070, + "time_per_iteration": 2.5032036304473877 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01985812, + "balance_loss_mlp": 1.0400399, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.1929202067055993, + "language_loss": 0.78031409, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80175334, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8071, + "time_per_iteration": 2.4315407276153564 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.01822925, + "balance_loss_mlp": 1.03733289, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.7778798626181176, + "language_loss": 0.72204757, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74345779, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71875, + "step": 8072, + "time_per_iteration": 2.510474920272827 + }, + { + "auxiliary_loss_clip": 0.01116993, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.02580357, + "balance_loss_mlp": 1.04254019, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.999761551965867, + "language_loss": 0.8779549, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89952314, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8073, + "time_per_iteration": 2.4295654296875 + }, + { + "auxiliary_loss_clip": 0.01106811, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.01805508, + "balance_loss_mlp": 1.03857493, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.702758380167849, + "language_loss": 0.60793108, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62931222, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8074, + "time_per_iteration": 2.641831636428833 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02540481, + "balance_loss_mlp": 1.03871894, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.6649133015556126, + "language_loss": 0.73151296, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75302958, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8075, + "time_per_iteration": 2.4624290466308594 + }, + { + "auxiliary_loss_clip": 0.01108632, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.02328563, + "balance_loss_mlp": 1.04028702, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.6285965401893183, + "language_loss": 0.82012558, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84156799, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 8076, + "time_per_iteration": 2.902468681335449 + }, + { + "auxiliary_loss_clip": 0.01112144, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.01899099, + "balance_loss_mlp": 1.0407958, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.793912725367087, + "language_loss": 0.86204815, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88350475, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7109375, + "step": 8077, + "time_per_iteration": 2.4470572471618652 + }, + { + "auxiliary_loss_clip": 0.01035955, + "auxiliary_loss_mlp": 0.01003512, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.01168394, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9017192941717106, + "language_loss": 0.58489066, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60528529, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.24316406, + "step": 8078, + "time_per_iteration": 3.061302661895752 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.01750946, + "balance_loss_mlp": 1.04146993, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.8290534457206422, + "language_loss": 0.72197151, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.7434293, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8079, + "time_per_iteration": 2.545018434524536 + }, + { + "auxiliary_loss_clip": 0.0111477, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01643038, + "balance_loss_mlp": 1.04235518, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.180592453343409, + "language_loss": 0.79515052, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81659681, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8080, + "time_per_iteration": 2.4793026447296143 + }, + { + "auxiliary_loss_clip": 0.01111199, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.01659858, + "balance_loss_mlp": 1.03938115, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.102088815710231, + "language_loss": 0.83866465, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86007756, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8081, + "time_per_iteration": 2.4615542888641357 + }, + { + "auxiliary_loss_clip": 0.0110941, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01766098, + "balance_loss_mlp": 1.03858256, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.4514708090647532, + "language_loss": 0.83281112, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85422719, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.70703125, + "step": 8082, + "time_per_iteration": 2.506359100341797 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01930749, + "balance_loss_mlp": 1.04239488, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.0513098734750153, + "language_loss": 0.87210095, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89353603, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8083, + "time_per_iteration": 2.4269142150878906 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.0264957, + "balance_loss_mlp": 1.03958535, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6599209376706838, + "language_loss": 0.8107174, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83220273, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 8084, + "time_per_iteration": 2.451949119567871 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.02148795, + "balance_loss_mlp": 1.04034543, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 2.346430029405153, + "language_loss": 0.68347323, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70495236, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8085, + "time_per_iteration": 2.499215841293335 + }, + { + "auxiliary_loss_clip": 0.0111142, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.0276444, + "balance_loss_mlp": 1.04064536, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.46412171762657, + "language_loss": 0.77375883, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79528093, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8086, + "time_per_iteration": 2.541616678237915 + }, + { + "auxiliary_loss_clip": 0.01111956, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.01655173, + "balance_loss_mlp": 1.04059958, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 1.9494281519542558, + "language_loss": 0.69733107, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71874988, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8087, + "time_per_iteration": 2.5694613456726074 + }, + { + "auxiliary_loss_clip": 0.01115057, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.02278614, + "balance_loss_mlp": 1.03913963, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.610275852133116, + "language_loss": 0.72411895, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.7456407, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8088, + "time_per_iteration": 2.5770511627197266 + }, + { + "auxiliary_loss_clip": 0.01111259, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04033983, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.6468852838011347, + "language_loss": 0.7557345, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77722251, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8089, + "time_per_iteration": 2.4625489711761475 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.0165205, + "balance_loss_mlp": 1.04078937, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.5811587339913937, + "language_loss": 0.83939755, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86083972, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8090, + "time_per_iteration": 2.500293731689453 + }, + { + "auxiliary_loss_clip": 0.01107626, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.02016521, + "balance_loss_mlp": 1.03945088, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.6075799019512609, + "language_loss": 0.76256877, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78398097, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 8091, + "time_per_iteration": 2.465998411178589 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.01714182, + "balance_loss_mlp": 1.03904068, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.4690121920213544, + "language_loss": 0.80391169, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82532316, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8092, + "time_per_iteration": 2.509016513824463 + }, + { + "auxiliary_loss_clip": 0.01113066, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.01679361, + "balance_loss_mlp": 1.040061, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4222056252501818, + "language_loss": 0.71696734, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73839879, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8093, + "time_per_iteration": 2.47951078414917 + }, + { + "auxiliary_loss_clip": 0.01109125, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03917289, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.5524869827771763, + "language_loss": 0.67529863, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69671166, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8094, + "time_per_iteration": 3.9874253273010254 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.01933527, + "balance_loss_mlp": 1.04218793, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 1.8480915023468016, + "language_loss": 0.66936231, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.69086242, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8095, + "time_per_iteration": 2.477593183517456 + }, + { + "auxiliary_loss_clip": 0.01112855, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.01847899, + "balance_loss_mlp": 1.04048705, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.265808316415622, + "language_loss": 0.78996563, + "learning_rate": 2.182597630229345e-06, + "loss": 0.8114239, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.72265625, + "step": 8096, + "time_per_iteration": 5.404834985733032 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01872253, + "balance_loss_mlp": 1.03737998, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.7396987354687747, + "language_loss": 0.67313123, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69453126, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8097, + "time_per_iteration": 2.450967788696289 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.02071154, + "balance_loss_mlp": 1.03922939, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.4534902730904964, + "language_loss": 0.71347374, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73490155, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8098, + "time_per_iteration": 2.4994144439697266 + }, + { + "auxiliary_loss_clip": 0.01116904, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02274871, + "balance_loss_mlp": 1.04109979, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.7962943745015671, + "language_loss": 0.66037756, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68191803, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8099, + "time_per_iteration": 2.624321222305298 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03698707, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.6079322443898665, + "language_loss": 0.66464651, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68605012, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8100, + "time_per_iteration": 2.52364182472229 + }, + { + "auxiliary_loss_clip": 0.01108299, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.01802635, + "balance_loss_mlp": 1.03990841, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3375285332360751, + "language_loss": 0.76606798, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78745818, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 8101, + "time_per_iteration": 2.5515174865722656 + }, + { + "auxiliary_loss_clip": 0.01037344, + "auxiliary_loss_mlp": 0.01004126, + "balance_loss_clip": 1.00279069, + "balance_loss_mlp": 1.01343942, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6857117323737989, + "language_loss": 0.52317238, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54358709, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 8102, + "time_per_iteration": 3.2370035648345947 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02098215, + "balance_loss_mlp": 1.03864419, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.066543814817077, + "language_loss": 0.73703957, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75847828, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8103, + "time_per_iteration": 2.401146650314331 + }, + { + "auxiliary_loss_clip": 0.01113681, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02845609, + "balance_loss_mlp": 1.04083562, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 2.0729106414348686, + "language_loss": 0.62816393, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.64972341, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8104, + "time_per_iteration": 2.489887237548828 + }, + { + "auxiliary_loss_clip": 0.01111014, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.01661348, + "balance_loss_mlp": 1.04093325, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 2.098514623938467, + "language_loss": 0.68962336, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71102965, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8105, + "time_per_iteration": 2.521994113922119 + }, + { + "auxiliary_loss_clip": 0.01106075, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.0371716, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.8440715600711883, + "language_loss": 0.73333305, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75468934, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8106, + "time_per_iteration": 2.471409797668457 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.01994157, + "balance_loss_mlp": 1.04300117, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.861183691551934, + "language_loss": 0.77122629, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.79273301, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 8107, + "time_per_iteration": 2.4802913665771484 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.01563621, + "balance_loss_mlp": 1.04061639, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.543990493512169, + "language_loss": 0.75148052, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77284884, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8108, + "time_per_iteration": 2.4680538177490234 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01803327, + "balance_loss_mlp": 1.04023099, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.75674444511609, + "language_loss": 0.73340857, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75479364, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8109, + "time_per_iteration": 2.4528889656066895 + }, + { + "auxiliary_loss_clip": 0.01108152, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0175966, + "balance_loss_mlp": 1.0391928, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.7970671112238439, + "language_loss": 0.78590822, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80729276, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8110, + "time_per_iteration": 2.4653971195220947 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.02730024, + "balance_loss_mlp": 1.04083896, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.8027530171186463, + "language_loss": 0.72216076, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74368215, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8111, + "time_per_iteration": 2.4242806434631348 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.02137125, + "balance_loss_mlp": 1.04143023, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5451794032223725, + "language_loss": 0.75719351, + "learning_rate": 2.17639139220597e-06, + "loss": 0.77864289, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 8112, + "time_per_iteration": 2.4681711196899414 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.04125154, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.5422638957013077, + "language_loss": 0.75012642, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77164471, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8113, + "time_per_iteration": 2.458070993423462 + }, + { + "auxiliary_loss_clip": 0.0103493, + "auxiliary_loss_mlp": 0.00999333, + "balance_loss_clip": 0.99799174, + "balance_loss_mlp": 1.01145339, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.779968435998717, + "language_loss": 0.48876739, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50911003, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.23535156, + "step": 8114, + "time_per_iteration": 2.964735507965088 + }, + { + "auxiliary_loss_clip": 0.01112827, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.02769804, + "balance_loss_mlp": 1.04015875, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.346675786458265, + "language_loss": 0.76713175, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78867507, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8115, + "time_per_iteration": 2.5008208751678467 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.02220368, + "balance_loss_mlp": 1.0430454, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.9741706409780697, + "language_loss": 0.72150338, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8116, + "time_per_iteration": 2.471170425415039 + }, + { + "auxiliary_loss_clip": 0.01108955, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.02506459, + "balance_loss_mlp": 1.03951752, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.626628974836948, + "language_loss": 0.63457322, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65604323, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 8117, + "time_per_iteration": 2.4408295154571533 + }, + { + "auxiliary_loss_clip": 0.01106242, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.02082098, + "balance_loss_mlp": 1.03648984, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.7937040821955612, + "language_loss": 0.79223609, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81363392, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8118, + "time_per_iteration": 2.4724843502044678 + }, + { + "auxiliary_loss_clip": 0.01111434, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.02359247, + "balance_loss_mlp": 1.03926289, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 2.8027989615224427, + "language_loss": 0.63472134, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65620571, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8119, + "time_per_iteration": 2.478968381881714 + }, + { + "auxiliary_loss_clip": 0.01111182, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.02006578, + "balance_loss_mlp": 1.04054463, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.9034604660173908, + "language_loss": 0.72397757, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74541688, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8120, + "time_per_iteration": 2.5204596519470215 + }, + { + "auxiliary_loss_clip": 0.01109957, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02122211, + "balance_loss_mlp": 1.03855026, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.5930525886491658, + "language_loss": 0.63636339, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65780938, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 8121, + "time_per_iteration": 2.5647690296173096 + }, + { + "auxiliary_loss_clip": 0.01113983, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02521062, + "balance_loss_mlp": 1.04131413, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.870740841609923, + "language_loss": 0.82433021, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84585893, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8122, + "time_per_iteration": 2.4753966331481934 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.02019167, + "balance_loss_mlp": 1.04063094, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 2.206764356510625, + "language_loss": 0.85308874, + "learning_rate": 2.172123606640866e-06, + "loss": 0.8745693, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 8123, + "time_per_iteration": 2.5124545097351074 + }, + { + "auxiliary_loss_clip": 0.01111875, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.02075016, + "balance_loss_mlp": 1.03892267, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 2.940858316224804, + "language_loss": 0.85766631, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87911713, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 8124, + "time_per_iteration": 2.5632708072662354 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02157593, + "balance_loss_mlp": 1.04022837, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.663608167377633, + "language_loss": 0.79223049, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81370318, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8125, + "time_per_iteration": 2.4487855434417725 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.03887916, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.7973571608225063, + "language_loss": 0.72273839, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74416542, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8126, + "time_per_iteration": 2.437833309173584 + }, + { + "auxiliary_loss_clip": 0.01111271, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.0383321, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6636646152839605, + "language_loss": 0.68598747, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70743197, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8127, + "time_per_iteration": 2.593252420425415 + }, + { + "auxiliary_loss_clip": 0.01111716, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.02271378, + "balance_loss_mlp": 1.03772545, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 2.237259843406747, + "language_loss": 0.76160932, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78308904, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 8128, + "time_per_iteration": 2.4540648460388184 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.02170467, + "balance_loss_mlp": 1.03979826, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.8007841393953645, + "language_loss": 0.75974828, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78120208, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 8129, + "time_per_iteration": 2.4460771083831787 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01829541, + "balance_loss_mlp": 1.03739452, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.2474332482435684, + "language_loss": 0.64869368, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67009449, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8130, + "time_per_iteration": 2.4403305053710938 + }, + { + "auxiliary_loss_clip": 0.01104742, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.03528643, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 2.48357292354413, + "language_loss": 0.71885133, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74023575, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8131, + "time_per_iteration": 2.4774324893951416 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.02447748, + "balance_loss_mlp": 1.04011512, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.6326145167913504, + "language_loss": 0.69524658, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.7167576, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8132, + "time_per_iteration": 2.5888383388519287 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.03793633, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.374551885233197, + "language_loss": 0.70177239, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72313869, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8133, + "time_per_iteration": 2.5105628967285156 + }, + { + "auxiliary_loss_clip": 0.01108745, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086735, + "balance_loss_mlp": 1.03843439, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.701581568458854, + "language_loss": 0.70707083, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72849363, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8134, + "time_per_iteration": 2.4894602298736572 + }, + { + "auxiliary_loss_clip": 0.01114154, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.02214789, + "balance_loss_mlp": 1.04088461, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.0967568848691105, + "language_loss": 0.80384946, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82534719, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8135, + "time_per_iteration": 2.453099489212036 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.01895332, + "balance_loss_mlp": 1.03636014, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.7196560423786724, + "language_loss": 0.74302435, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.7643888, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8136, + "time_per_iteration": 3.877336025238037 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01817274, + "balance_loss_mlp": 1.03903699, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 2.212302237726986, + "language_loss": 0.73165262, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8137, + "time_per_iteration": 5.387110471725464 + }, + { + "auxiliary_loss_clip": 0.01109302, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01846206, + "balance_loss_mlp": 1.03721762, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 1.8416541749331667, + "language_loss": 0.74448442, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.76589316, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8138, + "time_per_iteration": 3.9045798778533936 + }, + { + "auxiliary_loss_clip": 0.01108399, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.02114367, + "balance_loss_mlp": 1.039101, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.5284975125240874, + "language_loss": 0.74403191, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76545048, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8139, + "time_per_iteration": 2.4808132648468018 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.03792441, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.7092479760411836, + "language_loss": 0.61867124, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64010978, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8140, + "time_per_iteration": 2.4676973819732666 + }, + { + "auxiliary_loss_clip": 0.01110437, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.02696478, + "balance_loss_mlp": 1.03864169, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 11.553990271771063, + "language_loss": 0.82090259, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84241331, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8141, + "time_per_iteration": 2.4469456672668457 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.02006459, + "balance_loss_mlp": 1.04014516, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.575169950356119, + "language_loss": 0.72470534, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74617255, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8142, + "time_per_iteration": 2.5793039798736572 + }, + { + "auxiliary_loss_clip": 0.01105636, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.02019358, + "balance_loss_mlp": 1.03645492, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.7422772510583273, + "language_loss": 0.66720849, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.68858832, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 8143, + "time_per_iteration": 2.529869556427002 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01746714, + "balance_loss_mlp": 1.03620982, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.6744857165672533, + "language_loss": 0.75076014, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77209973, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 8144, + "time_per_iteration": 2.5917482376098633 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.0158155, + "balance_loss_mlp": 1.0373745, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.7401505251342857, + "language_loss": 0.75606745, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77742517, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8145, + "time_per_iteration": 2.4766342639923096 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.0224849, + "balance_loss_mlp": 1.03849018, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.7624340526507305, + "language_loss": 0.79901314, + "learning_rate": 2.163197525984761e-06, + "loss": 0.820476, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8146, + "time_per_iteration": 2.461480140686035 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.01866233, + "balance_loss_mlp": 1.03510666, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.6218674355963285, + "language_loss": 0.74327677, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76462203, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8147, + "time_per_iteration": 2.4981865882873535 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01614654, + "balance_loss_mlp": 1.0397613, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.4473724892456126, + "language_loss": 0.83147472, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8528533, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8148, + "time_per_iteration": 2.4251036643981934 + }, + { + "auxiliary_loss_clip": 0.01104505, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.01701021, + "balance_loss_mlp": 1.03808641, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.6244569398372493, + "language_loss": 0.73749536, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75881934, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 8149, + "time_per_iteration": 2.4356369972229004 + }, + { + "auxiliary_loss_clip": 0.01112401, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01667118, + "balance_loss_mlp": 1.03944612, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 2.7362049095417516, + "language_loss": 0.75515091, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.77657855, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8150, + "time_per_iteration": 2.4834423065185547 + }, + { + "auxiliary_loss_clip": 0.01111432, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.0169735, + "balance_loss_mlp": 1.04018414, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.027803048960678, + "language_loss": 0.72891176, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75032675, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8151, + "time_per_iteration": 2.448648691177368 + }, + { + "auxiliary_loss_clip": 0.01033992, + "auxiliary_loss_mlp": 0.01002772, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.01003349, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8338756787223442, + "language_loss": 0.54366148, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.5640291, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.24023438, + "step": 8152, + "time_per_iteration": 3.0414862632751465 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.03726649, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8071588573161568, + "language_loss": 0.61403525, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.6354419, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8153, + "time_per_iteration": 2.6923155784606934 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02371526, + "balance_loss_mlp": 1.03589535, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.4691031789751592, + "language_loss": 0.76673591, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78815919, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 8154, + "time_per_iteration": 2.490353584289551 + }, + { + "auxiliary_loss_clip": 0.01034079, + "auxiliary_loss_mlp": 0.01008709, + "balance_loss_clip": 1.00767767, + "balance_loss_mlp": 1.01043367, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9669855284605297, + "language_loss": 0.67019808, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69062597, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.23632812, + "step": 8155, + "time_per_iteration": 3.1443841457366943 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01594031, + "balance_loss_mlp": 1.03842843, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 2.3165784732113965, + "language_loss": 0.76883155, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79019058, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 8156, + "time_per_iteration": 2.4431064128875732 + }, + { + "auxiliary_loss_clip": 0.01107345, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01737309, + "balance_loss_mlp": 1.03692055, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.1340841853754084, + "language_loss": 0.83395588, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85532445, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 8157, + "time_per_iteration": 2.478027582168579 + }, + { + "auxiliary_loss_clip": 0.01108499, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.01971316, + "balance_loss_mlp": 1.03797531, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.799550006100146, + "language_loss": 0.79893947, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8203451, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8158, + "time_per_iteration": 2.453590154647827 + }, + { + "auxiliary_loss_clip": 0.0111001, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.01731563, + "balance_loss_mlp": 1.03768444, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 2.6065217447562015, + "language_loss": 0.69529265, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71669614, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 8159, + "time_per_iteration": 2.531371593475342 + }, + { + "auxiliary_loss_clip": 0.01106025, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.01975548, + "balance_loss_mlp": 1.03706563, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 1.8340548446534848, + "language_loss": 0.73084885, + "learning_rate": 2.157762645250854e-06, + "loss": 0.7522344, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8160, + "time_per_iteration": 2.4504506587982178 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.02510881, + "balance_loss_mlp": 1.03650105, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.9580885379656197, + "language_loss": 0.71372044, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73520112, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8161, + "time_per_iteration": 2.4428305625915527 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03813958, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.8633116916333885, + "language_loss": 0.67950338, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70090652, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8162, + "time_per_iteration": 2.478804349899292 + }, + { + "auxiliary_loss_clip": 0.01110496, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.01964319, + "balance_loss_mlp": 1.03701675, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.7117590070355053, + "language_loss": 0.63264233, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65408272, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8163, + "time_per_iteration": 2.474439859390259 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.03680897, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 5.481003364843308, + "language_loss": 0.76853907, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.78988826, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 8164, + "time_per_iteration": 2.4202303886413574 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.01487494, + "balance_loss_mlp": 1.03511751, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.943812351193686, + "language_loss": 0.76509839, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78644335, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8165, + "time_per_iteration": 2.4495608806610107 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.01949036, + "balance_loss_mlp": 1.03724587, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5511500992998777, + "language_loss": 0.77538848, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79677534, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8166, + "time_per_iteration": 2.431838274002075 + }, + { + "auxiliary_loss_clip": 0.01035489, + "auxiliary_loss_mlp": 0.00999269, + "balance_loss_clip": 0.99796408, + "balance_loss_mlp": 1.01166928, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7997768420675069, + "language_loss": 0.54261303, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56296062, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23828125, + "step": 8167, + "time_per_iteration": 3.1150460243225098 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.0184176, + "balance_loss_mlp": 1.03619838, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.5337625100343173, + "language_loss": 0.85566431, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.8770228, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8168, + "time_per_iteration": 2.4139063358306885 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01706409, + "balance_loss_mlp": 1.03805184, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6015963996367162, + "language_loss": 0.73052484, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75186759, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8169, + "time_per_iteration": 2.45638370513916 + }, + { + "auxiliary_loss_clip": 0.01104357, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01673138, + "balance_loss_mlp": 1.03472865, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.6971136818289634, + "language_loss": 0.78070778, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80203593, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8170, + "time_per_iteration": 2.4314279556274414 + }, + { + "auxiliary_loss_clip": 0.01108102, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.02121162, + "balance_loss_mlp": 1.03809822, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 3.6606474387116363, + "language_loss": 0.75769788, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.77911079, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8171, + "time_per_iteration": 2.4608027935028076 + }, + { + "auxiliary_loss_clip": 0.01109941, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.03800821, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 2.121204048765929, + "language_loss": 0.81676465, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83820748, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 8172, + "time_per_iteration": 2.44052791595459 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.00996712, + "balance_loss_clip": 0.99551356, + "balance_loss_mlp": 1.0111028, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6914312886696967, + "language_loss": 0.53323382, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55354571, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.234375, + "step": 8173, + "time_per_iteration": 3.0708565711975098 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.01985621, + "balance_loss_mlp": 1.0374558, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.811286975884668, + "language_loss": 0.62879664, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65021324, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8174, + "time_per_iteration": 2.4336249828338623 + }, + { + "auxiliary_loss_clip": 0.01106845, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.02208483, + "balance_loss_mlp": 1.03750002, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.779537870111139, + "language_loss": 0.69111979, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71253598, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8175, + "time_per_iteration": 2.4554460048675537 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.03808653, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.5246237839161791, + "language_loss": 0.74398279, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76537168, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8176, + "time_per_iteration": 2.4888904094696045 + }, + { + "auxiliary_loss_clip": 0.01107276, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.0297358, + "balance_loss_mlp": 1.03694725, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.7568126082203932, + "language_loss": 0.69846892, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.71996421, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8177, + "time_per_iteration": 3.8634564876556396 + }, + { + "auxiliary_loss_clip": 0.01035127, + "auxiliary_loss_mlp": 0.00999453, + "balance_loss_clip": 0.99834442, + "balance_loss_mlp": 1.01137829, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6749706589091774, + "language_loss": 0.46188164, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48222741, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23828125, + "step": 8178, + "time_per_iteration": 3.0891001224517822 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02311015, + "balance_loss_mlp": 1.03835034, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.813634772504209, + "language_loss": 0.66008747, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68155658, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8179, + "time_per_iteration": 5.296982049942017 + }, + { + "auxiliary_loss_clip": 0.01111217, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.021873, + "balance_loss_mlp": 1.03712761, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8426949121819989, + "language_loss": 0.70288503, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72435522, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 8180, + "time_per_iteration": 3.9257376194000244 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.02467656, + "balance_loss_mlp": 1.03577447, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.68068912028803, + "language_loss": 0.83982801, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86125004, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8181, + "time_per_iteration": 2.464665174484253 + }, + { + "auxiliary_loss_clip": 0.01104535, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.03746653, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.0240623883749724, + "language_loss": 0.72286201, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74421656, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8182, + "time_per_iteration": 2.5358242988586426 + }, + { + "auxiliary_loss_clip": 0.01108049, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.01904118, + "balance_loss_mlp": 1.03814411, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.2040850478726357, + "language_loss": 0.72828728, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74968582, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8183, + "time_per_iteration": 2.484051465988159 + }, + { + "auxiliary_loss_clip": 0.01110545, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.0178628, + "balance_loss_mlp": 1.03733599, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 1.6157316160481727, + "language_loss": 0.77338606, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79480493, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8184, + "time_per_iteration": 2.4630794525146484 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.03868532, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.7266312313882144, + "language_loss": 0.71020061, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73163593, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8185, + "time_per_iteration": 2.4472904205322266 + }, + { + "auxiliary_loss_clip": 0.01109756, + "auxiliary_loss_mlp": 0.01037838, + "balance_loss_clip": 1.02348495, + "balance_loss_mlp": 1.03818357, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 2.357724154899576, + "language_loss": 0.75007719, + "learning_rate": 2.147666215108831e-06, + "loss": 0.7715531, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 8186, + "time_per_iteration": 2.497887372970581 + }, + { + "auxiliary_loss_clip": 0.01108113, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.0240649, + "balance_loss_mlp": 1.03769946, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.2731376810200947, + "language_loss": 0.67426246, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69571328, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8187, + "time_per_iteration": 2.4402377605438232 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.02205503, + "balance_loss_mlp": 1.03659558, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.3838016666023416, + "language_loss": 0.66984355, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69125152, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8188, + "time_per_iteration": 2.4889986515045166 + }, + { + "auxiliary_loss_clip": 0.01108628, + "auxiliary_loss_mlp": 0.01027775, + "balance_loss_clip": 1.01627779, + "balance_loss_mlp": 1.03854966, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.5428848144341532, + "language_loss": 0.7457763, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76714027, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8189, + "time_per_iteration": 2.4837827682495117 + }, + { + "auxiliary_loss_clip": 0.011062, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.01975584, + "balance_loss_mlp": 1.03744173, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5888967888129601, + "language_loss": 0.64360684, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66499019, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8190, + "time_per_iteration": 2.606388807296753 + }, + { + "auxiliary_loss_clip": 0.01107034, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.01528418, + "balance_loss_mlp": 1.0383538, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.9368790872615624, + "language_loss": 0.71231604, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73366261, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8191, + "time_per_iteration": 2.4383578300476074 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.02162957, + "balance_loss_mlp": 1.03718042, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5667911589112589, + "language_loss": 0.71698356, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.7383846, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8192, + "time_per_iteration": 2.6127231121063232 + }, + { + "auxiliary_loss_clip": 0.01033253, + "auxiliary_loss_mlp": 0.01011533, + "balance_loss_clip": 1.01047826, + "balance_loss_mlp": 1.00980878, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7610920789142134, + "language_loss": 0.52138889, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54183674, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.234375, + "step": 8193, + "time_per_iteration": 3.1151235103607178 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.02470672, + "balance_loss_mlp": 1.03862, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.5012892842908303, + "language_loss": 0.77071059, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79214686, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 8194, + "time_per_iteration": 2.4766407012939453 + }, + { + "auxiliary_loss_clip": 0.01104661, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.01783228, + "balance_loss_mlp": 1.03554666, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.9786600447906189, + "language_loss": 0.70556259, + "learning_rate": 2.144170401915341e-06, + "loss": 0.7269032, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 8195, + "time_per_iteration": 2.489412784576416 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01537156, + "balance_loss_mlp": 1.0380609, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.8494849345903903, + "language_loss": 0.81095743, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83231419, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8196, + "time_per_iteration": 2.5489988327026367 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.02033019, + "balance_loss_mlp": 1.03709757, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.848981865854384, + "language_loss": 0.7100687, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.73149174, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8197, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03815627, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.7362069513061655, + "language_loss": 0.84122622, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86259645, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8198, + "time_per_iteration": 2.4596786499023438 + }, + { + "auxiliary_loss_clip": 0.01110423, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.02139831, + "balance_loss_mlp": 1.03913713, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.767623263247313, + "language_loss": 0.76214266, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78359395, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8199, + "time_per_iteration": 2.413482189178467 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.02215028, + "balance_loss_mlp": 1.03712904, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.555242231339172, + "language_loss": 0.59918249, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62063873, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8200, + "time_per_iteration": 2.515371561050415 + }, + { + "auxiliary_loss_clip": 0.01101467, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02128601, + "balance_loss_mlp": 1.03560054, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4972351372180894, + "language_loss": 0.78781515, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.80916464, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 8201, + "time_per_iteration": 2.4688665866851807 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.01858091, + "balance_loss_mlp": 1.03761029, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.1515546014570766, + "language_loss": 0.67352241, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69496673, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8202, + "time_per_iteration": 2.6021947860717773 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01781058, + "balance_loss_mlp": 1.03682148, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 3.4273755266911845, + "language_loss": 0.75192142, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77328843, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 8203, + "time_per_iteration": 2.501173496246338 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.0214237, + "balance_loss_mlp": 1.03780818, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.0656815740777152, + "language_loss": 0.80908394, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.83049649, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8204, + "time_per_iteration": 2.481666088104248 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.02180493, + "balance_loss_mlp": 1.03788805, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 2.2280647806743183, + "language_loss": 0.65550953, + "learning_rate": 2.140285646139455e-06, + "loss": 0.67689598, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 8205, + "time_per_iteration": 2.439408302307129 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.02083468, + "balance_loss_mlp": 1.03837705, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 1.7727903919462147, + "language_loss": 0.67009246, + "learning_rate": 2.139897141060744e-06, + "loss": 0.69156778, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 8206, + "time_per_iteration": 2.4607954025268555 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.01473176, + "balance_loss_mlp": 1.03630567, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.822649710507408, + "language_loss": 0.76363301, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78496289, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 8207, + "time_per_iteration": 2.508553981781006 + }, + { + "auxiliary_loss_clip": 0.01109244, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.03869963, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.308112072386131, + "language_loss": 0.59984541, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62126362, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8208, + "time_per_iteration": 2.505990982055664 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.01622033, + "balance_loss_mlp": 1.03816974, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 2.3772506823576407, + "language_loss": 0.7851491, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80653256, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8209, + "time_per_iteration": 2.4622652530670166 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.03630066, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.7984719462813816, + "language_loss": 0.78806269, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80942488, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8210, + "time_per_iteration": 2.4884698390960693 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.01888382, + "balance_loss_mlp": 1.0381912, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.2650712316686903, + "language_loss": 0.81229484, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83373135, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8211, + "time_per_iteration": 2.4839043617248535 + }, + { + "auxiliary_loss_clip": 0.01109974, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.03911519, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.6136684102444665, + "language_loss": 0.91496241, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93642217, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8212, + "time_per_iteration": 2.5103862285614014 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.01925647, + "balance_loss_mlp": 1.036484, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.7787072133843917, + "language_loss": 0.64901662, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.670403, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8213, + "time_per_iteration": 2.460123300552368 + }, + { + "auxiliary_loss_clip": 0.01106125, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.01356125, + "balance_loss_mlp": 1.03668904, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.9389339120527038, + "language_loss": 0.75199962, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77333331, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69140625, + "step": 8214, + "time_per_iteration": 2.5719900131225586 + }, + { + "auxiliary_loss_clip": 0.01109359, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.02212512, + "balance_loss_mlp": 1.03959298, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.828808325177945, + "language_loss": 0.84395385, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86540014, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8215, + "time_per_iteration": 2.468804121017456 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.01656199, + "balance_loss_mlp": 1.03478694, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.6051587100805058, + "language_loss": 0.82859147, + "learning_rate": 2.136011800934292e-06, + "loss": 0.84988439, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 8216, + "time_per_iteration": 2.5819287300109863 + }, + { + "auxiliary_loss_clip": 0.01107134, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.01918006, + "balance_loss_mlp": 1.03821325, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.4383830441547378, + "language_loss": 0.74774921, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76913321, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8217, + "time_per_iteration": 2.4628379344940186 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.01928544, + "balance_loss_mlp": 1.03777707, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.733886360732455, + "language_loss": 0.78829861, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80966723, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 8218, + "time_per_iteration": 2.4809412956237793 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.02269292, + "balance_loss_mlp": 1.03510332, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.0240627965271187, + "language_loss": 0.76301086, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78438151, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 8219, + "time_per_iteration": 3.8202009201049805 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.03764367, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6690505128843895, + "language_loss": 0.6190055, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64042592, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8220, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01106287, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.01656425, + "balance_loss_mlp": 1.03672814, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.7319378421104112, + "language_loss": 0.72381485, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74517179, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8221, + "time_per_iteration": 5.506774187088013 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.02521193, + "balance_loss_mlp": 1.04006767, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.681203667545881, + "language_loss": 0.79131603, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81275266, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 8222, + "time_per_iteration": 2.491175889968872 + }, + { + "auxiliary_loss_clip": 0.01108448, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.02147555, + "balance_loss_mlp": 1.03941715, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.3506903054927015, + "language_loss": 0.73205507, + "learning_rate": 2.133291755093088e-06, + "loss": 0.75348878, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69140625, + "step": 8223, + "time_per_iteration": 2.4359662532806396 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.02264762, + "balance_loss_mlp": 1.03850269, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.7533498543998463, + "language_loss": 0.75144434, + "learning_rate": 2.132903156780144e-06, + "loss": 0.7729032, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 8224, + "time_per_iteration": 2.5716288089752197 + }, + { + "auxiliary_loss_clip": 0.01111376, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01807868, + "balance_loss_mlp": 1.04080439, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.086998261136206, + "language_loss": 0.63982892, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66124696, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8225, + "time_per_iteration": 2.524048089981079 + }, + { + "auxiliary_loss_clip": 0.01107484, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.0206579, + "balance_loss_mlp": 1.03766608, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 1.839126557537864, + "language_loss": 0.76359057, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78499651, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8226, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02578914, + "balance_loss_mlp": 1.03735518, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.6377261486682646, + "language_loss": 0.71156305, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73305476, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8227, + "time_per_iteration": 2.4763920307159424 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.03914213, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.614424212368193, + "language_loss": 0.71484196, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73631173, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8228, + "time_per_iteration": 2.550083637237549 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.01829386, + "balance_loss_mlp": 1.03837276, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.6200219454444607, + "language_loss": 0.83788311, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85925281, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8229, + "time_per_iteration": 2.474684238433838 + }, + { + "auxiliary_loss_clip": 0.01108289, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.02103567, + "balance_loss_mlp": 1.03685689, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.055489394198818, + "language_loss": 0.75105131, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.77248526, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8230, + "time_per_iteration": 2.506950616836548 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.01497638, + "balance_loss_mlp": 1.03868175, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.703005059233118, + "language_loss": 0.79713035, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.8184967, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8231, + "time_per_iteration": 2.4176137447357178 + }, + { + "auxiliary_loss_clip": 0.01035427, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.01191425, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7419788553124401, + "language_loss": 0.60237485, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62275773, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 8232, + "time_per_iteration": 3.183783531188965 + }, + { + "auxiliary_loss_clip": 0.0111307, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.02119923, + "balance_loss_mlp": 1.03889871, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.7147216218758814, + "language_loss": 0.69257128, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71405244, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 8233, + "time_per_iteration": 2.477755546569824 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.02111769, + "balance_loss_mlp": 1.03714275, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 3.246275947254348, + "language_loss": 0.6678468, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68926585, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.6953125, + "step": 8234, + "time_per_iteration": 2.5594117641448975 + }, + { + "auxiliary_loss_clip": 0.0103478, + "auxiliary_loss_mlp": 0.01003988, + "balance_loss_clip": 1.00288522, + "balance_loss_mlp": 1.01140106, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8288840425421409, + "language_loss": 0.57987183, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60025948, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.234375, + "step": 8235, + "time_per_iteration": 3.0041370391845703 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02345991, + "balance_loss_mlp": 1.03770208, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.4917768542550827, + "language_loss": 0.76824737, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.78971112, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8236, + "time_per_iteration": 2.498105525970459 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.01948881, + "balance_loss_mlp": 1.03860247, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.8006519774313887, + "language_loss": 0.72554326, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74694312, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8237, + "time_per_iteration": 2.487849473953247 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.02024627, + "balance_loss_mlp": 1.03722131, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.8061825502363815, + "language_loss": 0.75687563, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77825987, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8238, + "time_per_iteration": 2.4926116466522217 + }, + { + "auxiliary_loss_clip": 0.01110283, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02057362, + "balance_loss_mlp": 1.03765702, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.197202607879525, + "language_loss": 0.73434591, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.75579149, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8239, + "time_per_iteration": 2.4181203842163086 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.02266932, + "balance_loss_mlp": 1.03704619, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.4131176994917936, + "language_loss": 0.78344893, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80492562, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.73046875, + "step": 8240, + "time_per_iteration": 2.479642391204834 + }, + { + "auxiliary_loss_clip": 0.01104608, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.02865601, + "balance_loss_mlp": 1.03746533, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.0234307188816993, + "language_loss": 0.85579056, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87724495, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8241, + "time_per_iteration": 2.4081263542175293 + }, + { + "auxiliary_loss_clip": 0.01106442, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02337933, + "balance_loss_mlp": 1.03813624, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.761079127200854, + "language_loss": 0.77041149, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79183173, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8242, + "time_per_iteration": 2.4439215660095215 + }, + { + "auxiliary_loss_clip": 0.01106589, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821673, + "balance_loss_mlp": 1.03676701, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.7216813067847012, + "language_loss": 0.67493725, + "learning_rate": 2.125518848090833e-06, + "loss": 0.6963132, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8243, + "time_per_iteration": 2.4888081550598145 + }, + { + "auxiliary_loss_clip": 0.01107757, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01805878, + "balance_loss_mlp": 1.03910422, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.8355775234908949, + "language_loss": 0.68218768, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70357001, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8244, + "time_per_iteration": 2.481220245361328 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02201915, + "balance_loss_mlp": 1.03828287, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8414695050792438, + "language_loss": 0.74998277, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77143466, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8245, + "time_per_iteration": 2.459244728088379 + }, + { + "auxiliary_loss_clip": 0.01105994, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01628923, + "balance_loss_mlp": 1.03797877, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 3.047248940663427, + "language_loss": 0.81496358, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83631527, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 8246, + "time_per_iteration": 2.54664945602417 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03858495, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.7095262667552558, + "language_loss": 0.83750397, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85899985, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8247, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.01897812, + "balance_loss_mlp": 1.04011726, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.0177325188605018, + "language_loss": 0.83758432, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85900903, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 8248, + "time_per_iteration": 2.490619659423828 + }, + { + "auxiliary_loss_clip": 0.01109734, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.01518941, + "balance_loss_mlp": 1.03800774, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.055191909263014, + "language_loss": 0.73715985, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75853992, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8249, + "time_per_iteration": 2.5232534408569336 + }, + { + "auxiliary_loss_clip": 0.0111234, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.02504992, + "balance_loss_mlp": 1.04018188, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.9063816639589337, + "language_loss": 0.76176995, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78327698, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8250, + "time_per_iteration": 2.5368192195892334 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.02170718, + "balance_loss_mlp": 1.03792036, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.615677709430237, + "language_loss": 0.69986647, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72129565, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8251, + "time_per_iteration": 2.4543070793151855 + }, + { + "auxiliary_loss_clip": 0.01108023, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.01586699, + "balance_loss_mlp": 1.03890181, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.8749041446582064, + "language_loss": 0.79864365, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82000297, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8252, + "time_per_iteration": 2.4386792182922363 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01384854, + "balance_loss_mlp": 1.03821409, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7863838823967775, + "language_loss": 0.80688357, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.82825357, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.70703125, + "step": 8253, + "time_per_iteration": 2.440727710723877 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01863384, + "balance_loss_mlp": 1.03654194, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.548882190492268, + "language_loss": 0.67088544, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69224173, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 8254, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.011067, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.02257323, + "balance_loss_mlp": 1.03522658, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.5646536445016186, + "language_loss": 0.73859739, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76002055, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8255, + "time_per_iteration": 2.478703498840332 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.02208281, + "balance_loss_mlp": 1.0362165, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.8563521426834817, + "language_loss": 0.81378329, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.8351903, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8256, + "time_per_iteration": 2.4312291145324707 + }, + { + "auxiliary_loss_clip": 0.01105024, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.01246178, + "balance_loss_mlp": 1.03679466, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.8572652078491616, + "language_loss": 0.80710369, + "learning_rate": 2.120076673368901e-06, + "loss": 0.82840347, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8257, + "time_per_iteration": 2.4589884281158447 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.02173841, + "balance_loss_mlp": 1.03759003, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 2.788575980623821, + "language_loss": 0.66533971, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68681228, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 8258, + "time_per_iteration": 2.477653741836548 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01674283, + "balance_loss_mlp": 1.03566313, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 2.207120440649978, + "language_loss": 0.77672231, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79804647, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8259, + "time_per_iteration": 2.482516050338745 + }, + { + "auxiliary_loss_clip": 0.01107983, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.01802468, + "balance_loss_mlp": 1.03903294, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.536511866358609, + "language_loss": 0.78612608, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80751413, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8260, + "time_per_iteration": 4.0255560874938965 + }, + { + "auxiliary_loss_clip": 0.0110786, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01785684, + "balance_loss_mlp": 1.03662324, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 4.674193904345997, + "language_loss": 0.76227403, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78365964, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8261, + "time_per_iteration": 2.537996530532837 + }, + { + "auxiliary_loss_clip": 0.01104467, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01622844, + "balance_loss_mlp": 1.03667367, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.9998040798137362, + "language_loss": 0.89328134, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91460943, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8262, + "time_per_iteration": 5.405071020126343 + }, + { + "auxiliary_loss_clip": 0.01104636, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.0172143, + "balance_loss_mlp": 1.03765512, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4087924984120455, + "language_loss": 0.73918653, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76052761, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8263, + "time_per_iteration": 3.9610228538513184 + }, + { + "auxiliary_loss_clip": 0.01112691, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.0196991, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.641620630884259, + "language_loss": 0.69445115, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71591461, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71875, + "step": 8264, + "time_per_iteration": 2.4799907207489014 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01585722, + "balance_loss_mlp": 1.03470981, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.3808235907294704, + "language_loss": 0.64915001, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67049909, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8265, + "time_per_iteration": 2.491708517074585 + }, + { + "auxiliary_loss_clip": 0.01034788, + "auxiliary_loss_mlp": 0.01001781, + "balance_loss_clip": 1.00064886, + "balance_loss_mlp": 1.01169205, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8684712318419048, + "language_loss": 0.53446817, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55483389, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23144531, + "step": 8266, + "time_per_iteration": 3.1343002319335938 + }, + { + "auxiliary_loss_clip": 0.01104137, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01567531, + "balance_loss_mlp": 1.03706813, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 3.469499482915289, + "language_loss": 0.79616332, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81748462, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8267, + "time_per_iteration": 2.5316126346588135 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.01856148, + "balance_loss_mlp": 1.03869104, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.5132671844419434, + "language_loss": 0.74805677, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76947474, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8268, + "time_per_iteration": 2.5102896690368652 + }, + { + "auxiliary_loss_clip": 0.0110689, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.01998329, + "balance_loss_mlp": 1.0366255, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.9572065929893177, + "language_loss": 0.67818397, + "learning_rate": 2.115411240328073e-06, + "loss": 0.6995914, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8269, + "time_per_iteration": 2.7194817066192627 + }, + { + "auxiliary_loss_clip": 0.0110431, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01968217, + "balance_loss_mlp": 1.03744197, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.6139896668987463, + "language_loss": 0.85450721, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87587237, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 8270, + "time_per_iteration": 2.4423561096191406 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01834702, + "balance_loss_mlp": 1.03857064, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.6811398863814482, + "language_loss": 0.71087623, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73225504, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.69921875, + "step": 8271, + "time_per_iteration": 2.54892635345459 + }, + { + "auxiliary_loss_clip": 0.01109407, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.03880143, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.4557340389451365, + "language_loss": 0.7848624, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80625331, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8272, + "time_per_iteration": 2.462470054626465 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.02275074, + "balance_loss_mlp": 1.03950167, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.5057831430835686, + "language_loss": 0.66278791, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68423879, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8273, + "time_per_iteration": 2.6735026836395264 + }, + { + "auxiliary_loss_clip": 0.01109746, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.01891851, + "balance_loss_mlp": 1.03968048, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.871691944459235, + "language_loss": 0.77977264, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80118477, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8274, + "time_per_iteration": 2.462465763092041 + }, + { + "auxiliary_loss_clip": 0.01110748, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.01864374, + "balance_loss_mlp": 1.03865933, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 2.0388244744713724, + "language_loss": 0.75829184, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77971983, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 8275, + "time_per_iteration": 2.6034398078918457 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01974416, + "balance_loss_mlp": 1.03761268, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9341151140441402, + "language_loss": 0.8392635, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.86071479, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8276, + "time_per_iteration": 2.435999870300293 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01664937, + "balance_loss_mlp": 1.03633988, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3535075156355831, + "language_loss": 0.70188868, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72319949, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 8277, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.01952052, + "balance_loss_mlp": 1.03669858, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.773647946812319, + "language_loss": 0.82609779, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84747648, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8278, + "time_per_iteration": 2.4459898471832275 + }, + { + "auxiliary_loss_clip": 0.01108155, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.01992559, + "balance_loss_mlp": 1.03671384, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 1.8017237706358624, + "language_loss": 0.6784246, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69983023, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8279, + "time_per_iteration": 2.4793283939361572 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.0204277, + "balance_loss_mlp": 1.03561902, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 1.9740212049853438, + "language_loss": 0.70469928, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72610998, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8280, + "time_per_iteration": 2.427778482437134 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.02028, + "balance_loss_mlp": 1.03475237, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.6232736941666084, + "language_loss": 0.64461923, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66599762, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8281, + "time_per_iteration": 2.511054515838623 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.02175605, + "balance_loss_mlp": 1.03830338, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 1.82873470978674, + "language_loss": 0.72714734, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.74859279, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8282, + "time_per_iteration": 2.417059898376465 + }, + { + "auxiliary_loss_clip": 0.01103243, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.01804423, + "balance_loss_mlp": 1.03591275, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.6753255120783885, + "language_loss": 0.73373008, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75505757, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 8283, + "time_per_iteration": 2.531747341156006 + }, + { + "auxiliary_loss_clip": 0.01106082, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.02446926, + "balance_loss_mlp": 1.03696167, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.6101503544989328, + "language_loss": 0.78866243, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81009555, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8284, + "time_per_iteration": 2.4609432220458984 + }, + { + "auxiliary_loss_clip": 0.01113439, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.02398884, + "balance_loss_mlp": 1.0390476, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.8191212695174297, + "language_loss": 0.73705399, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75856948, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 8285, + "time_per_iteration": 2.5364696979522705 + }, + { + "auxiliary_loss_clip": 0.01112037, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.01872683, + "balance_loss_mlp": 1.0420599, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.6445235471758528, + "language_loss": 0.74477649, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76621962, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 8286, + "time_per_iteration": 2.4888620376586914 + }, + { + "auxiliary_loss_clip": 0.01112849, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02713549, + "balance_loss_mlp": 1.04156506, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7365216069979077, + "language_loss": 0.85467643, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87620533, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8287, + "time_per_iteration": 2.5058188438415527 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.01267338, + "balance_loss_mlp": 1.03729916, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.6348463305948138, + "language_loss": 0.72363204, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74496502, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8288, + "time_per_iteration": 2.528475046157837 + }, + { + "auxiliary_loss_clip": 0.0111456, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.02319193, + "balance_loss_mlp": 1.04041409, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.900373689725773, + "language_loss": 0.80002087, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82154852, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7421875, + "step": 8289, + "time_per_iteration": 2.4667603969573975 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0199995, + "balance_loss_mlp": 1.03680038, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.15669041751919, + "language_loss": 0.73524791, + "learning_rate": 2.107245231409784e-06, + "loss": 0.7566489, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8290, + "time_per_iteration": 2.4318900108337402 + }, + { + "auxiliary_loss_clip": 0.01112096, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.02232039, + "balance_loss_mlp": 1.04070783, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.4681011524205945, + "language_loss": 0.84016359, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86165774, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7109375, + "step": 8291, + "time_per_iteration": 2.502545118331909 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02307272, + "balance_loss_mlp": 1.04216146, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.910804847598398, + "language_loss": 0.67084122, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69238442, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 8292, + "time_per_iteration": 2.4527781009674072 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.01933742, + "balance_loss_mlp": 1.03864646, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.7642237687107358, + "language_loss": 0.67300534, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69440567, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8293, + "time_per_iteration": 2.4598476886749268 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.01615214, + "balance_loss_mlp": 1.03958893, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.987515516196069, + "language_loss": 0.8202461, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84163427, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 8294, + "time_per_iteration": 2.4827442169189453 + }, + { + "auxiliary_loss_clip": 0.01110277, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02032816, + "balance_loss_mlp": 1.03937042, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.7471179574646651, + "language_loss": 0.73073918, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.7521857, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8295, + "time_per_iteration": 2.4712820053100586 + }, + { + "auxiliary_loss_clip": 0.01108254, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.02435029, + "balance_loss_mlp": 1.03895998, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.9200384732673381, + "language_loss": 0.673262, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69471127, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 8296, + "time_per_iteration": 2.45139479637146 + }, + { + "auxiliary_loss_clip": 0.01111689, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.03996015, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.713618634115876, + "language_loss": 0.64634776, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66780269, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8297, + "time_per_iteration": 2.5514614582061768 + }, + { + "auxiliary_loss_clip": 0.0110753, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.02121472, + "balance_loss_mlp": 1.03931689, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.9440676372274848, + "language_loss": 0.69621831, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71762383, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 8298, + "time_per_iteration": 2.4699370861053467 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.02421331, + "balance_loss_mlp": 1.03804398, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.087380746796303, + "language_loss": 0.84278095, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86422026, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8299, + "time_per_iteration": 2.4820563793182373 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02327859, + "balance_loss_mlp": 1.03978848, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 5.591354549929027, + "language_loss": 0.69272447, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71423382, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8300, + "time_per_iteration": 2.473634719848633 + }, + { + "auxiliary_loss_clip": 0.01037164, + "auxiliary_loss_mlp": 0.01003582, + "balance_loss_clip": 1.00239551, + "balance_loss_mlp": 1.01397431, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7592353305728455, + "language_loss": 0.51136976, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.5317772, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23242188, + "step": 8301, + "time_per_iteration": 3.1719589233398438 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.02670741, + "balance_loss_mlp": 1.03841138, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.9297901828770159, + "language_loss": 0.84423494, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86569905, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6796875, + "step": 8302, + "time_per_iteration": 3.8624472618103027 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.0157299, + "balance_loss_mlp": 1.03963566, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.7245012471823244, + "language_loss": 0.68831706, + "learning_rate": 2.102189175590024e-06, + "loss": 0.70967424, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8303, + "time_per_iteration": 2.4496121406555176 + }, + { + "auxiliary_loss_clip": 0.01111721, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01871395, + "balance_loss_mlp": 1.0395093, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.8500063703376581, + "language_loss": 0.72523201, + "learning_rate": 2.101800220681144e-06, + "loss": 0.7466675, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8304, + "time_per_iteration": 5.351519346237183 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.02633858, + "balance_loss_mlp": 1.03971672, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.113610055263332, + "language_loss": 0.81011766, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83160275, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8305, + "time_per_iteration": 3.9764394760131836 + }, + { + "auxiliary_loss_clip": 0.0103618, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.00061762, + "balance_loss_mlp": 1.01301277, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7225706425993785, + "language_loss": 0.56916559, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58954537, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23144531, + "step": 8306, + "time_per_iteration": 3.1952388286590576 + }, + { + "auxiliary_loss_clip": 0.01114208, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.02323711, + "balance_loss_mlp": 1.04268515, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.791967653711514, + "language_loss": 0.82407033, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84558392, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8307, + "time_per_iteration": 2.4501423835754395 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01987445, + "balance_loss_mlp": 1.03845966, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 2.0869484891217973, + "language_loss": 0.60544026, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.62686026, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8308, + "time_per_iteration": 2.5023903846740723 + }, + { + "auxiliary_loss_clip": 0.01106463, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0179193, + "balance_loss_mlp": 1.03760242, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5917355796130328, + "language_loss": 0.74632615, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76769423, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8309, + "time_per_iteration": 2.473018169403076 + }, + { + "auxiliary_loss_clip": 0.01109782, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02136922, + "balance_loss_mlp": 1.03926158, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.147167346860859, + "language_loss": 0.80117911, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82262021, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8310, + "time_per_iteration": 2.4172844886779785 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.02339089, + "balance_loss_mlp": 1.04019213, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.6036366291386785, + "language_loss": 0.70938641, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73086882, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 8311, + "time_per_iteration": 2.4804234504699707 + }, + { + "auxiliary_loss_clip": 0.01111462, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02229297, + "balance_loss_mlp": 1.04154408, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.923283457940722, + "language_loss": 0.77138013, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79283684, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8312, + "time_per_iteration": 2.4233593940734863 + }, + { + "auxiliary_loss_clip": 0.01111451, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.04093099, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7466795572602452, + "language_loss": 0.84205925, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86349666, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8313, + "time_per_iteration": 2.509953260421753 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.01633728, + "balance_loss_mlp": 1.03987491, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 2.119225345296983, + "language_loss": 0.80887723, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83028746, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8314, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01041835, + "balance_loss_clip": 1.02763736, + "balance_loss_mlp": 1.0418222, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 2.4967995028767778, + "language_loss": 0.79017889, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81173497, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8315, + "time_per_iteration": 2.4926230907440186 + }, + { + "auxiliary_loss_clip": 0.01110205, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.02123618, + "balance_loss_mlp": 1.04051793, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 2.5792388666411274, + "language_loss": 0.73983908, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76128173, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8316, + "time_per_iteration": 2.692228317260742 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.02058125, + "balance_loss_mlp": 1.04118443, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.4190232020266644, + "language_loss": 0.81204319, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83346593, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 8317, + "time_per_iteration": 2.4997825622558594 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04001343, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.649167878849496, + "language_loss": 0.83189869, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85339868, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8318, + "time_per_iteration": 2.516118049621582 + }, + { + "auxiliary_loss_clip": 0.01111509, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.04068375, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.8062739344487506, + "language_loss": 0.81684446, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83826375, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 8319, + "time_per_iteration": 2.4977705478668213 + }, + { + "auxiliary_loss_clip": 0.01112348, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.01469707, + "balance_loss_mlp": 1.04046464, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.7611824883833367, + "language_loss": 0.71951354, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74090493, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8320, + "time_per_iteration": 2.5664663314819336 + }, + { + "auxiliary_loss_clip": 0.01116964, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.02903366, + "balance_loss_mlp": 1.03925049, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 3.538267489088781, + "language_loss": 0.76840645, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79001242, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 8321, + "time_per_iteration": 2.5154004096984863 + }, + { + "auxiliary_loss_clip": 0.01113289, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.0255599, + "balance_loss_mlp": 1.04125774, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.9154758393965534, + "language_loss": 0.82959068, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85111117, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8322, + "time_per_iteration": 2.4235384464263916 + }, + { + "auxiliary_loss_clip": 0.01114951, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.02180934, + "balance_loss_mlp": 1.04190695, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.1453827228353166, + "language_loss": 0.73670769, + "learning_rate": 2.094409360775228e-06, + "loss": 0.7582072, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8323, + "time_per_iteration": 2.495490312576294 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.02152205, + "balance_loss_mlp": 1.04043198, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.517177144462768, + "language_loss": 0.69255745, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71402115, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8324, + "time_per_iteration": 2.534043550491333 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03958941, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 1.9198571129878061, + "language_loss": 0.72153628, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.7429831, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8325, + "time_per_iteration": 2.4783544540405273 + }, + { + "auxiliary_loss_clip": 0.01114311, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.0237087, + "balance_loss_mlp": 1.04212904, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.5620326365302057, + "language_loss": 0.73494631, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7564733, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.72265625, + "step": 8326, + "time_per_iteration": 2.4836461544036865 + }, + { + "auxiliary_loss_clip": 0.01110122, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.0189389, + "balance_loss_mlp": 1.03965449, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5385455876451686, + "language_loss": 0.78168696, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80310273, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8327, + "time_per_iteration": 2.477095127105713 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02261126, + "balance_loss_mlp": 1.04402947, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.31963767631444, + "language_loss": 0.88008773, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90161747, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8328, + "time_per_iteration": 2.479931116104126 + }, + { + "auxiliary_loss_clip": 0.01116123, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.02290463, + "balance_loss_mlp": 1.0408715, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.0106246059801482, + "language_loss": 0.74407351, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76559395, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 8329, + "time_per_iteration": 2.480037212371826 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.04276633, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.2897047741072063, + "language_loss": 0.79602063, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81747818, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 8330, + "time_per_iteration": 2.529446601867676 + }, + { + "auxiliary_loss_clip": 0.0103803, + "auxiliary_loss_mlp": 0.01000333, + "balance_loss_clip": 0.99922389, + "balance_loss_mlp": 1.01505685, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7317803530986337, + "language_loss": 0.56073356, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58111727, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23046875, + "step": 8331, + "time_per_iteration": 2.89511775970459 + }, + { + "auxiliary_loss_clip": 0.01110931, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01504326, + "balance_loss_mlp": 1.041206, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.865515028785386, + "language_loss": 0.65518546, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67656446, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8332, + "time_per_iteration": 2.497129201889038 + }, + { + "auxiliary_loss_clip": 0.01109356, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.02261496, + "balance_loss_mlp": 1.0400846, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.477043934406584, + "language_loss": 0.74687374, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.76831466, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8333, + "time_per_iteration": 2.506769895553589 + }, + { + "auxiliary_loss_clip": 0.01114084, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.02242804, + "balance_loss_mlp": 1.04128885, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 3.419508092200526, + "language_loss": 0.80619013, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82768065, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 8334, + "time_per_iteration": 2.4492759704589844 + }, + { + "auxiliary_loss_clip": 0.01038411, + "auxiliary_loss_mlp": 0.00996695, + "balance_loss_clip": 0.99557459, + "balance_loss_mlp": 1.01541471, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8938151962133672, + "language_loss": 0.62658346, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64693451, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.23046875, + "step": 8335, + "time_per_iteration": 3.044527530670166 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.01548398, + "balance_loss_mlp": 1.03883338, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.39366543335018, + "language_loss": 0.79443586, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81579578, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8336, + "time_per_iteration": 2.5133562088012695 + }, + { + "auxiliary_loss_clip": 0.01111717, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01791, + "balance_loss_mlp": 1.0402261, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.7464580749308463, + "language_loss": 0.80139911, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82282722, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8337, + "time_per_iteration": 2.4671413898468018 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.0201329, + "balance_loss_mlp": 1.03992128, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.859552309481282, + "language_loss": 0.79314995, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.8146314, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8338, + "time_per_iteration": 2.4763965606689453 + }, + { + "auxiliary_loss_clip": 0.01112164, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.0178982, + "balance_loss_mlp": 1.0390203, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6104717001039177, + "language_loss": 0.85006964, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87150526, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8339, + "time_per_iteration": 2.507951259613037 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.02476954, + "balance_loss_mlp": 1.03943646, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.484784321746097, + "language_loss": 0.70492387, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72641325, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 8340, + "time_per_iteration": 2.5271620750427246 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02178025, + "balance_loss_mlp": 1.04153883, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.9114275861555547, + "language_loss": 0.77793235, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79945439, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 8341, + "time_per_iteration": 2.467557430267334 + }, + { + "auxiliary_loss_clip": 0.01116354, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.02543771, + "balance_loss_mlp": 1.04048502, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.478803711535475, + "language_loss": 0.8961392, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91769934, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 8342, + "time_per_iteration": 2.454822063446045 + }, + { + "auxiliary_loss_clip": 0.01110124, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02272439, + "balance_loss_mlp": 1.03894877, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 3.1772216639919906, + "language_loss": 0.76625615, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.7877177, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8343, + "time_per_iteration": 2.485499143600464 + }, + { + "auxiliary_loss_clip": 0.0110844, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.03967083, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 2.1220779506727574, + "language_loss": 0.67086864, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69223046, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8344, + "time_per_iteration": 3.88729190826416 + }, + { + "auxiliary_loss_clip": 0.01111927, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.02143502, + "balance_loss_mlp": 1.03998613, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.9395231632627998, + "language_loss": 0.75212955, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77359062, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8345, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.01769578, + "balance_loss_mlp": 1.04121828, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 1.95370753247372, + "language_loss": 0.78477418, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80621803, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71484375, + "step": 8346, + "time_per_iteration": 5.420297861099243 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02408957, + "balance_loss_mlp": 1.03860831, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.6533044146295508, + "language_loss": 0.69167304, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71313995, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8347, + "time_per_iteration": 2.5022430419921875 + }, + { + "auxiliary_loss_clip": 0.01112834, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.02136123, + "balance_loss_mlp": 1.03990984, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.8545802319259819, + "language_loss": 0.71527761, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73674989, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8348, + "time_per_iteration": 2.491255760192871 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02089548, + "balance_loss_mlp": 1.04003596, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.6664488621380107, + "language_loss": 0.73957872, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76099503, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8349, + "time_per_iteration": 2.478173017501831 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.01726353, + "balance_loss_mlp": 1.03897953, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.0979883436616915, + "language_loss": 0.63680947, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65822613, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8350, + "time_per_iteration": 2.407949686050415 + }, + { + "auxiliary_loss_clip": 0.01035777, + "auxiliary_loss_mlp": 0.01011664, + "balance_loss_clip": 1.01064515, + "balance_loss_mlp": 1.01269341, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 1.0786206787107346, + "language_loss": 0.59814817, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6186226, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.23144531, + "step": 8351, + "time_per_iteration": 3.199061393737793 + }, + { + "auxiliary_loss_clip": 0.01111613, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.0395788, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 2.3062568387149365, + "language_loss": 0.75367033, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77513033, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8352, + "time_per_iteration": 2.506408214569092 + }, + { + "auxiliary_loss_clip": 0.01113074, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.04205072, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6126052392954302, + "language_loss": 0.71743786, + "learning_rate": 2.082736990429464e-06, + "loss": 0.73889434, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8353, + "time_per_iteration": 2.469383478164673 + }, + { + "auxiliary_loss_clip": 0.01115894, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.02279735, + "balance_loss_mlp": 1.04492378, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 3.986170886248432, + "language_loss": 0.73818904, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75971609, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8354, + "time_per_iteration": 2.510967254638672 + }, + { + "auxiliary_loss_clip": 0.01111051, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.02324271, + "balance_loss_mlp": 1.04122615, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.6375075569861386, + "language_loss": 0.72198367, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74346024, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 8355, + "time_per_iteration": 2.5355918407440186 + }, + { + "auxiliary_loss_clip": 0.01113009, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.0234164, + "balance_loss_mlp": 1.04037476, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.5634548911110102, + "language_loss": 0.81171584, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83321553, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8356, + "time_per_iteration": 2.5366694927215576 + }, + { + "auxiliary_loss_clip": 0.01115056, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0234828, + "balance_loss_mlp": 1.03943825, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.216032444638608, + "language_loss": 0.76043326, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78196621, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7578125, + "step": 8357, + "time_per_iteration": 2.4454803466796875 + }, + { + "auxiliary_loss_clip": 0.01112875, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.0196929, + "balance_loss_mlp": 1.04054666, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.6874014883711121, + "language_loss": 0.75969183, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78116012, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 8358, + "time_per_iteration": 2.4932358264923096 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02163708, + "balance_loss_mlp": 1.04097748, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.322067399050787, + "language_loss": 0.72372258, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74518377, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8359, + "time_per_iteration": 2.500152826309204 + }, + { + "auxiliary_loss_clip": 0.01111655, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.02361679, + "balance_loss_mlp": 1.04144287, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6242275025336705, + "language_loss": 0.77095789, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79243731, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8360, + "time_per_iteration": 2.5194928646087646 + }, + { + "auxiliary_loss_clip": 0.01111322, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.04179871, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.6325944972725464, + "language_loss": 0.76545495, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78689528, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8361, + "time_per_iteration": 2.4667415618896484 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.01973319, + "balance_loss_mlp": 1.03841019, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.6123805658340187, + "language_loss": 0.84681976, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.86826181, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8362, + "time_per_iteration": 2.5463051795959473 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02232695, + "balance_loss_mlp": 1.03756952, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.4590070504225026, + "language_loss": 0.78211838, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80355728, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8363, + "time_per_iteration": 2.5163207054138184 + }, + { + "auxiliary_loss_clip": 0.0110737, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.04016399, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 3.0044110074814627, + "language_loss": 0.75747573, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77885795, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 8364, + "time_per_iteration": 2.490145444869995 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01749849, + "balance_loss_mlp": 1.03816295, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5639014752994398, + "language_loss": 0.69354087, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.7149018, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8365, + "time_per_iteration": 2.473787307739258 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.03982782, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.442330503817835, + "language_loss": 0.73213601, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75362265, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 8366, + "time_per_iteration": 2.549877405166626 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.02064812, + "balance_loss_mlp": 1.04103982, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4509464249778803, + "language_loss": 0.78301162, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80443466, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 8367, + "time_per_iteration": 2.495147705078125 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.0180459, + "balance_loss_mlp": 1.03853226, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.842981496070619, + "language_loss": 0.69923592, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72062624, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8368, + "time_per_iteration": 2.4830057621002197 + }, + { + "auxiliary_loss_clip": 0.01035945, + "auxiliary_loss_mlp": 0.01007176, + "balance_loss_clip": 1.00621665, + "balance_loss_mlp": 1.01321661, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8570502115037558, + "language_loss": 0.63344997, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65388119, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.22851562, + "step": 8369, + "time_per_iteration": 3.0224173069000244 + }, + { + "auxiliary_loss_clip": 0.0110829, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01973677, + "balance_loss_mlp": 1.03877878, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.153532760870157, + "language_loss": 0.60134995, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62274879, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8370, + "time_per_iteration": 2.570244073867798 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02094817, + "balance_loss_mlp": 1.03846478, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.5686803599666441, + "language_loss": 0.68485558, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.7063123, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8371, + "time_per_iteration": 2.5606741905212402 + }, + { + "auxiliary_loss_clip": 0.01110798, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.01710284, + "balance_loss_mlp": 1.04021561, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 2.6972353884187776, + "language_loss": 0.67238319, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.6937995, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8372, + "time_per_iteration": 2.5703678131103516 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 2.7198935997293683, + "language_loss": 0.66590893, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68735898, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8373, + "time_per_iteration": 2.526221513748169 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01558208, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.6286907446961802, + "language_loss": 0.74674404, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76809293, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8374, + "time_per_iteration": 2.488349199295044 + }, + { + "auxiliary_loss_clip": 0.01111709, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.5485355079726564, + "language_loss": 0.67947745, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70096987, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8375, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.02008343, + "balance_loss_mlp": 1.04047072, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.8481066708574578, + "language_loss": 0.78526819, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.8067522, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 8376, + "time_per_iteration": 2.468104124069214 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01704502, + "balance_loss_mlp": 1.03864694, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.8611372201727234, + "language_loss": 0.59723544, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61867571, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8377, + "time_per_iteration": 2.5277962684631348 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.02114892, + "balance_loss_mlp": 1.03836918, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9462161897860946, + "language_loss": 0.76360452, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78503865, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8378, + "time_per_iteration": 2.448323965072632 + }, + { + "auxiliary_loss_clip": 0.01109358, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.02211046, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.6531450393233522, + "language_loss": 0.74565625, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.7670989, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8379, + "time_per_iteration": 2.5036356449127197 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.01952767, + "balance_loss_mlp": 1.04144955, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 5.059413081923233, + "language_loss": 0.6692574, + "learning_rate": 2.072229431544548e-06, + "loss": 0.6906693, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8380, + "time_per_iteration": 2.524144411087036 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01879573, + "balance_loss_mlp": 1.03999329, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.7991215942112995, + "language_loss": 0.63869506, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66009307, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8381, + "time_per_iteration": 2.5605592727661133 + }, + { + "auxiliary_loss_clip": 0.01108854, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.02296555, + "balance_loss_mlp": 1.04009557, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.6170974847944384, + "language_loss": 0.67252153, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69396263, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8382, + "time_per_iteration": 2.5227982997894287 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.02614903, + "balance_loss_mlp": 1.04075313, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 2.0398701191748, + "language_loss": 0.62190729, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64346862, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8383, + "time_per_iteration": 2.43418288230896 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02198625, + "balance_loss_mlp": 1.03885436, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 3.355380782185913, + "language_loss": 0.67041314, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69182235, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 8384, + "time_per_iteration": 2.450605630874634 + }, + { + "auxiliary_loss_clip": 0.01112035, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.0412066, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.6534299501213623, + "language_loss": 0.70829523, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.72977579, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 8385, + "time_per_iteration": 3.9600095748901367 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01027134, + "balance_loss_clip": 1.0147717, + "balance_loss_mlp": 1.03961098, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 2.2280411323646687, + "language_loss": 0.83021009, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85154909, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8386, + "time_per_iteration": 2.5137035846710205 + }, + { + "auxiliary_loss_clip": 0.01109584, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03921139, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.4630184477724049, + "language_loss": 0.66776884, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.6892125, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8387, + "time_per_iteration": 5.38523268699646 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01029754, + "balance_loss_clip": 1.01780963, + "balance_loss_mlp": 1.04077113, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.3874005116173278, + "language_loss": 0.80059648, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82199681, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8388, + "time_per_iteration": 3.938295364379883 + }, + { + "auxiliary_loss_clip": 0.01109371, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.0181793, + "balance_loss_mlp": 1.03903794, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.6549702991910453, + "language_loss": 0.69832838, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.71972561, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8389, + "time_per_iteration": 2.514204978942871 + }, + { + "auxiliary_loss_clip": 0.01110176, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02127957, + "balance_loss_mlp": 1.03844476, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.5923484046165255, + "language_loss": 0.69297862, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71441251, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8390, + "time_per_iteration": 2.517423152923584 + }, + { + "auxiliary_loss_clip": 0.01034589, + "auxiliary_loss_mlp": 0.01005008, + "balance_loss_clip": 1.00389957, + "balance_loss_mlp": 1.0117954, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8182221752596884, + "language_loss": 0.52977288, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55016881, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22851562, + "step": 8391, + "time_per_iteration": 2.8990061283111572 + }, + { + "auxiliary_loss_clip": 0.01034773, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 0.99989092, + "balance_loss_mlp": 1.01217151, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8813101083301623, + "language_loss": 0.60678625, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62714356, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.2265625, + "step": 8392, + "time_per_iteration": 2.91495680809021 + }, + { + "auxiliary_loss_clip": 0.01106534, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.03893185, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.5806327501196855, + "language_loss": 0.84691715, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86831182, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8393, + "time_per_iteration": 2.5033814907073975 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.01680708, + "balance_loss_mlp": 1.04046786, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.96195836984414, + "language_loss": 0.50628948, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.52768016, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8394, + "time_per_iteration": 2.492766857147217 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01906633, + "balance_loss_mlp": 1.03773594, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.6061893361767445, + "language_loss": 0.75181741, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7732237, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8395, + "time_per_iteration": 2.4661927223205566 + }, + { + "auxiliary_loss_clip": 0.01107947, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.01859236, + "balance_loss_mlp": 1.03834832, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.243385214175979, + "language_loss": 0.67677552, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69816345, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8396, + "time_per_iteration": 2.416499376296997 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.01623356, + "balance_loss_mlp": 1.0404129, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.7915756184866887, + "language_loss": 0.79064161, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81201625, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 8397, + "time_per_iteration": 2.5530309677124023 + }, + { + "auxiliary_loss_clip": 0.01107401, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.03848135, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.652903699623706, + "language_loss": 0.66017222, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68154037, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8398, + "time_per_iteration": 2.4544124603271484 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.017313, + "balance_loss_mlp": 1.0395267, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.639047703672107, + "language_loss": 0.71633506, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73772013, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8399, + "time_per_iteration": 2.5301358699798584 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02533388, + "balance_loss_mlp": 1.03947675, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.6970917460172408, + "language_loss": 0.81506133, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83655393, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8400, + "time_per_iteration": 2.4705498218536377 + }, + { + "auxiliary_loss_clip": 0.01109099, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01716256, + "balance_loss_mlp": 1.03942847, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.8569234799708698, + "language_loss": 0.79040837, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81179667, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8401, + "time_per_iteration": 2.4791224002838135 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03751659, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.5775455049866824, + "language_loss": 0.69999743, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72139227, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8402, + "time_per_iteration": 2.5591325759887695 + }, + { + "auxiliary_loss_clip": 0.01105942, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.01515996, + "balance_loss_mlp": 1.03572834, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.822367858534602, + "language_loss": 0.68917859, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71050715, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 8403, + "time_per_iteration": 2.5292510986328125 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.02275133, + "balance_loss_mlp": 1.03929162, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.4593040849849852, + "language_loss": 0.85396838, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87537992, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8404, + "time_per_iteration": 2.4852187633514404 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.01565218, + "balance_loss_mlp": 1.03806567, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.5717367434630007, + "language_loss": 0.75364089, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77499014, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8405, + "time_per_iteration": 2.4850387573242188 + }, + { + "auxiliary_loss_clip": 0.01109835, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.01514542, + "balance_loss_mlp": 1.0388459, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.5541955318463554, + "language_loss": 0.72983336, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75121522, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8406, + "time_per_iteration": 2.59979510307312 + }, + { + "auxiliary_loss_clip": 0.01102813, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03577971, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.7094740961502104, + "language_loss": 0.76863986, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.7899577, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 8407, + "time_per_iteration": 2.527543067932129 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.01535106, + "balance_loss_mlp": 1.03706717, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.6525886874932982, + "language_loss": 0.63115776, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65251827, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8408, + "time_per_iteration": 2.53218150138855 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01871967, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.695436010833495, + "language_loss": 0.63705122, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65843707, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8409, + "time_per_iteration": 2.4916255474090576 + }, + { + "auxiliary_loss_clip": 0.01105638, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01410186, + "balance_loss_mlp": 1.03845859, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3247049855298083, + "language_loss": 0.70876539, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73007584, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 8410, + "time_per_iteration": 2.527935266494751 + }, + { + "auxiliary_loss_clip": 0.01107655, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02100372, + "balance_loss_mlp": 1.03812361, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5323244298402565, + "language_loss": 0.79243749, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81385016, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8411, + "time_per_iteration": 2.4926035404205322 + }, + { + "auxiliary_loss_clip": 0.01107995, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02357578, + "balance_loss_mlp": 1.03764153, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.7118743762511017, + "language_loss": 0.81584603, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83729643, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8412, + "time_per_iteration": 2.4696593284606934 + }, + { + "auxiliary_loss_clip": 0.0110966, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.02057767, + "balance_loss_mlp": 1.04071307, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 2.1036912411500555, + "language_loss": 0.80586725, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82728952, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8413, + "time_per_iteration": 2.4840738773345947 + }, + { + "auxiliary_loss_clip": 0.01111974, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.01959252, + "balance_loss_mlp": 1.04003644, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.7598991939758672, + "language_loss": 0.80167186, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82311857, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8414, + "time_per_iteration": 2.4437410831451416 + }, + { + "auxiliary_loss_clip": 0.01106268, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.0174123, + "balance_loss_mlp": 1.03536403, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.1880801569958486, + "language_loss": 0.62188816, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64324927, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8415, + "time_per_iteration": 2.617699384689331 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03840709, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.5996951654726725, + "language_loss": 0.81836188, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.8397311, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8416, + "time_per_iteration": 2.484717607498169 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.02253819, + "balance_loss_mlp": 1.04098511, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.7782267995500585, + "language_loss": 0.79110944, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81252885, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 8417, + "time_per_iteration": 2.544739246368408 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01789641, + "balance_loss_mlp": 1.03713858, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.8205649281423022, + "language_loss": 0.62930262, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65063727, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 8418, + "time_per_iteration": 2.4795963764190674 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02232397, + "balance_loss_mlp": 1.03859878, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.1933090002480182, + "language_loss": 0.77840686, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79984379, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8419, + "time_per_iteration": 2.491931915283203 + }, + { + "auxiliary_loss_clip": 0.0110836, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.01950645, + "balance_loss_mlp": 1.0373354, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.7154546366730201, + "language_loss": 0.77258635, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79399723, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8420, + "time_per_iteration": 2.5963363647460938 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.03782094, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.679092087125118, + "language_loss": 0.77511621, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79658306, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8421, + "time_per_iteration": 2.4954135417938232 + }, + { + "auxiliary_loss_clip": 0.01105449, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03668654, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4641430762434493, + "language_loss": 0.66987717, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69122434, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8422, + "time_per_iteration": 2.4802937507629395 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.01998544, + "balance_loss_mlp": 1.04081178, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.8050040320885787, + "language_loss": 0.81599188, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83741009, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8423, + "time_per_iteration": 2.591792345046997 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.01859319, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.8632464802837558, + "language_loss": 0.74227667, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76368636, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8424, + "time_per_iteration": 2.5076076984405518 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.02120495, + "balance_loss_mlp": 1.03742146, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.6339612294396895, + "language_loss": 0.71546394, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73685586, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8425, + "time_per_iteration": 2.570103406906128 + }, + { + "auxiliary_loss_clip": 0.01108568, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.02325118, + "balance_loss_mlp": 1.0379858, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.6987499343502257, + "language_loss": 0.78614688, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80758357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8426, + "time_per_iteration": 2.4616403579711914 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.02312577, + "balance_loss_mlp": 1.03994358, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.818748758654822, + "language_loss": 0.77855921, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80002636, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8427, + "time_per_iteration": 3.908625364303589 + }, + { + "auxiliary_loss_clip": 0.0110433, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.01382565, + "balance_loss_mlp": 1.03709817, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.8142719003609429, + "language_loss": 0.71444368, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73574793, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8428, + "time_per_iteration": 2.4540021419525146 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.01983786, + "balance_loss_mlp": 1.03622389, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6344761677930288, + "language_loss": 0.82693905, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84830469, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 8429, + "time_per_iteration": 3.977104902267456 + }, + { + "auxiliary_loss_clip": 0.01113682, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02267253, + "balance_loss_mlp": 1.04074979, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 2.1730745276419485, + "language_loss": 0.73167485, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75317407, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8430, + "time_per_iteration": 4.066487073898315 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 1.03904748, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.7614160050819483, + "language_loss": 0.76304209, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78445041, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8431, + "time_per_iteration": 2.459061861038208 + }, + { + "auxiliary_loss_clip": 0.01107362, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.0388869, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4179396940955034, + "language_loss": 0.72168291, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74307233, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8432, + "time_per_iteration": 2.4937191009521484 + }, + { + "auxiliary_loss_clip": 0.01040308, + "auxiliary_loss_mlp": 0.0100546, + "balance_loss_clip": 1.00428617, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7612043046384747, + "language_loss": 0.63704848, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65750623, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22753906, + "step": 8433, + "time_per_iteration": 3.10312819480896 + }, + { + "auxiliary_loss_clip": 0.01109071, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02276051, + "balance_loss_mlp": 1.0391171, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 1.7667352609332163, + "language_loss": 0.77104461, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79249096, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8434, + "time_per_iteration": 2.4761765003204346 + }, + { + "auxiliary_loss_clip": 0.01110101, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01997447, + "balance_loss_mlp": 1.03937244, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.7167508969307774, + "language_loss": 0.71062863, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73205119, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8435, + "time_per_iteration": 2.476259231567383 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.04086459, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 2.1519666669040407, + "language_loss": 0.71635526, + "learning_rate": 2.050429942372112e-06, + "loss": 0.73781812, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8436, + "time_per_iteration": 2.4717278480529785 + }, + { + "auxiliary_loss_clip": 0.0111073, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01621652, + "balance_loss_mlp": 1.04132712, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5051036444651287, + "language_loss": 0.8370682, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85846984, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8437, + "time_per_iteration": 2.51187801361084 + }, + { + "auxiliary_loss_clip": 0.01107572, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.01340485, + "balance_loss_mlp": 1.03941774, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.8339895444539178, + "language_loss": 0.80925703, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83058763, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8438, + "time_per_iteration": 2.5101053714752197 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.023947, + "balance_loss_mlp": 1.04053128, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.458277190934999, + "language_loss": 0.79797888, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81948024, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8439, + "time_per_iteration": 2.5196681022644043 + }, + { + "auxiliary_loss_clip": 0.01107511, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02176344, + "balance_loss_mlp": 1.03948164, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5054968059802218, + "language_loss": 0.7129699, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73437822, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8440, + "time_per_iteration": 2.482475757598877 + }, + { + "auxiliary_loss_clip": 0.01110635, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.01877761, + "balance_loss_mlp": 1.03933895, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6937518353915977, + "language_loss": 0.70555139, + "learning_rate": 2.048483229511158e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8441, + "time_per_iteration": 2.5299065113067627 + }, + { + "auxiliary_loss_clip": 0.01113885, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.0219456, + "balance_loss_mlp": 1.04142308, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.8980066327338418, + "language_loss": 0.63670987, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65819889, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8442, + "time_per_iteration": 2.4623775482177734 + }, + { + "auxiliary_loss_clip": 0.01108296, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.016011, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.5153774279484464, + "language_loss": 0.7150898, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73644972, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 8443, + "time_per_iteration": 2.586273670196533 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02046299, + "balance_loss_mlp": 1.03887248, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.223488951652841, + "language_loss": 0.61766541, + "learning_rate": 2.047315179614607e-06, + "loss": 0.63911152, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8444, + "time_per_iteration": 2.5941321849823 + }, + { + "auxiliary_loss_clip": 0.01107921, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02056909, + "balance_loss_mlp": 1.0380075, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.7476957798256931, + "language_loss": 0.6370405, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65844774, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 8445, + "time_per_iteration": 2.622295379638672 + }, + { + "auxiliary_loss_clip": 0.01042597, + "auxiliary_loss_mlp": 0.01005213, + "balance_loss_clip": 1.00411069, + "balance_loss_mlp": 1.019732, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8272934825203048, + "language_loss": 0.61873507, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.6392132, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.22851562, + "step": 8446, + "time_per_iteration": 3.106067180633545 + }, + { + "auxiliary_loss_clip": 0.01107421, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01656127, + "balance_loss_mlp": 1.03849411, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.6783761303243148, + "language_loss": 0.80458808, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82595056, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8447, + "time_per_iteration": 2.483449935913086 + }, + { + "auxiliary_loss_clip": 0.01109683, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.01955903, + "balance_loss_mlp": 1.04166472, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.6097524760484219, + "language_loss": 0.70526159, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72667593, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 8448, + "time_per_iteration": 2.5377211570739746 + }, + { + "auxiliary_loss_clip": 0.01108561, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.01906157, + "balance_loss_mlp": 1.04054332, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.775058362169557, + "language_loss": 0.72186208, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74325454, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8449, + "time_per_iteration": 2.6247637271881104 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01862371, + "balance_loss_mlp": 1.0373019, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4717194557779922, + "language_loss": 0.72751403, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74887294, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8450, + "time_per_iteration": 2.5097148418426514 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02104533, + "balance_loss_mlp": 1.04217696, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.631370100986613, + "language_loss": 0.7704621, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.7919184, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8451, + "time_per_iteration": 2.5109496116638184 + }, + { + "auxiliary_loss_clip": 0.01109885, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.02502477, + "balance_loss_mlp": 1.03928411, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.7784899256909827, + "language_loss": 0.8518312, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8732987, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8452, + "time_per_iteration": 2.4603476524353027 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.04209125, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.2856093940760274, + "language_loss": 0.78046912, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80198884, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8453, + "time_per_iteration": 2.450873613357544 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02200532, + "balance_loss_mlp": 1.03973246, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.6556718901191125, + "language_loss": 0.7626555, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78406799, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8454, + "time_per_iteration": 2.4831783771514893 + }, + { + "auxiliary_loss_clip": 0.01110162, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03985167, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.7440679508015728, + "language_loss": 0.89345592, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91488367, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8455, + "time_per_iteration": 2.48486590385437 + }, + { + "auxiliary_loss_clip": 0.01116133, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.02230144, + "balance_loss_mlp": 1.04198599, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 2.029385394187206, + "language_loss": 0.62613618, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64765751, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8456, + "time_per_iteration": 2.476060390472412 + }, + { + "auxiliary_loss_clip": 0.01038842, + "auxiliary_loss_mlp": 0.00998694, + "balance_loss_clip": 0.99766272, + "balance_loss_mlp": 1.01592362, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.816065361839575, + "language_loss": 0.62538505, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64576042, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.22949219, + "step": 8457, + "time_per_iteration": 2.9627416133880615 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.02069306, + "balance_loss_mlp": 1.04062462, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.5574868486202833, + "language_loss": 0.67412502, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69556904, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8458, + "time_per_iteration": 2.4851465225219727 + }, + { + "auxiliary_loss_clip": 0.01109854, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01840341, + "balance_loss_mlp": 1.03811622, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.6253676139168076, + "language_loss": 0.77861875, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80003208, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8459, + "time_per_iteration": 2.5043020248413086 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.04386926, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.213093169353168, + "language_loss": 0.81109118, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83262426, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8460, + "time_per_iteration": 2.4239838123321533 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.02565289, + "balance_loss_mlp": 1.03999329, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.5640945155523684, + "language_loss": 0.6866132, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70810497, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8461, + "time_per_iteration": 2.469954490661621 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03908265, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.5611830538381608, + "language_loss": 0.76059598, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.7819975, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8462, + "time_per_iteration": 2.4907591342926025 + }, + { + "auxiliary_loss_clip": 0.01111001, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.02376187, + "balance_loss_mlp": 1.04031515, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 1.977849325123916, + "language_loss": 0.8121528, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83362508, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 8463, + "time_per_iteration": 2.460604190826416 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.02527571, + "balance_loss_mlp": 1.03999758, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7045720874408852, + "language_loss": 0.7630803, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78454363, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8464, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01037487, + "auxiliary_loss_mlp": 0.01005228, + "balance_loss_clip": 1.00426793, + "balance_loss_mlp": 1.01476121, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.687733273493157, + "language_loss": 0.59352195, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61394918, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.2265625, + "step": 8465, + "time_per_iteration": 3.1989307403564453 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.02263045, + "balance_loss_mlp": 1.03822207, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.7579634525926484, + "language_loss": 0.79857922, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81999815, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8466, + "time_per_iteration": 2.472186326980591 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.01950181, + "balance_loss_mlp": 1.03679371, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.5999387152583837, + "language_loss": 0.78222281, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80359334, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8467, + "time_per_iteration": 2.4692180156707764 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.02041364, + "balance_loss_mlp": 1.03994191, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.7540939283261232, + "language_loss": 0.7467652, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76815927, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8468, + "time_per_iteration": 3.8722333908081055 + }, + { + "auxiliary_loss_clip": 0.01107691, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01877546, + "balance_loss_mlp": 1.03856027, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.7320149470681812, + "language_loss": 0.77835757, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79974556, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8469, + "time_per_iteration": 2.4514496326446533 + }, + { + "auxiliary_loss_clip": 0.01112445, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.0221895, + "balance_loss_mlp": 1.04265046, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5013208791161945, + "language_loss": 0.69422746, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71570665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8470, + "time_per_iteration": 2.5658817291259766 + }, + { + "auxiliary_loss_clip": 0.01112957, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01997817, + "balance_loss_mlp": 1.04058552, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.018231732442679, + "language_loss": 0.73409355, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75555384, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8471, + "time_per_iteration": 5.355906009674072 + }, + { + "auxiliary_loss_clip": 0.01036047, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00242805, + "balance_loss_mlp": 1.01322865, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7572542385247485, + "language_loss": 0.58153868, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60193354, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22851562, + "step": 8472, + "time_per_iteration": 3.0752861499786377 + }, + { + "auxiliary_loss_clip": 0.01111139, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.02100456, + "balance_loss_mlp": 1.04138827, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.783541878810952, + "language_loss": 0.69200397, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71344012, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 8473, + "time_per_iteration": 2.4832053184509277 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.02144074, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 2.2073606957030143, + "language_loss": 0.85564739, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87707734, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8474, + "time_per_iteration": 2.5068845748901367 + }, + { + "auxiliary_loss_clip": 0.01110669, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01944494, + "balance_loss_mlp": 1.03983307, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.014074019348489, + "language_loss": 0.64659619, + "learning_rate": 2.035244457765222e-06, + "loss": 0.66802263, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8475, + "time_per_iteration": 2.4363739490509033 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.02557325, + "balance_loss_mlp": 1.04094887, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 4.024838672705198, + "language_loss": 0.81962836, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84116852, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 8476, + "time_per_iteration": 2.448249578475952 + }, + { + "auxiliary_loss_clip": 0.01111186, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.02019382, + "balance_loss_mlp": 1.03794646, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.9611523426566915, + "language_loss": 0.81148994, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83295757, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.734375, + "step": 8477, + "time_per_iteration": 2.470248222351074 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.01584899, + "balance_loss_mlp": 1.03962493, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.8342280591951767, + "language_loss": 0.61682522, + "learning_rate": 2.034076248204082e-06, + "loss": 0.6382364, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8478, + "time_per_iteration": 2.4439172744750977 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01037275, + "balance_loss_clip": 1.02540779, + "balance_loss_mlp": 1.03930426, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.4883331760724325, + "language_loss": 0.65860271, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.6800639, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 8479, + "time_per_iteration": 2.4965710639953613 + }, + { + "auxiliary_loss_clip": 0.01107177, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01974487, + "balance_loss_mlp": 1.0389936, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.620468938265791, + "language_loss": 0.69455707, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71594626, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 8480, + "time_per_iteration": 2.4500057697296143 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.01871157, + "balance_loss_mlp": 1.03733814, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.6808533459383284, + "language_loss": 0.79027826, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81168693, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8481, + "time_per_iteration": 2.507157564163208 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.02170324, + "balance_loss_mlp": 1.03702283, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.5080021873745288, + "language_loss": 0.83429766, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85568231, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 8482, + "time_per_iteration": 2.4544076919555664 + }, + { + "auxiliary_loss_clip": 0.0111291, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.01925349, + "balance_loss_mlp": 1.03990221, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.7853243252822575, + "language_loss": 0.85625446, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87771249, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 8483, + "time_per_iteration": 2.519747734069824 + }, + { + "auxiliary_loss_clip": 0.01107969, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03712344, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.7164607290812173, + "language_loss": 0.83208412, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85348231, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8484, + "time_per_iteration": 2.4549949169158936 + }, + { + "auxiliary_loss_clip": 0.01109177, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0154798, + "balance_loss_mlp": 1.03849459, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.0216137506651983, + "language_loss": 0.81388122, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83525884, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8485, + "time_per_iteration": 2.4612390995025635 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02036929, + "balance_loss_mlp": 1.03675199, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.1191716083834025, + "language_loss": 0.73653662, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.7578969, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 8486, + "time_per_iteration": 2.426042318344116 + }, + { + "auxiliary_loss_clip": 0.01112031, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.01824152, + "balance_loss_mlp": 1.03990436, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4808929350883289, + "language_loss": 0.69956315, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72099566, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 8487, + "time_per_iteration": 2.5032570362091064 + }, + { + "auxiliary_loss_clip": 0.01108669, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.01987231, + "balance_loss_mlp": 1.04012084, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 1.9552461936614123, + "language_loss": 0.72984374, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75126404, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 8488, + "time_per_iteration": 2.454589605331421 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02002835, + "balance_loss_mlp": 1.03795087, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.814097723080907, + "language_loss": 0.69584548, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71725714, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8489, + "time_per_iteration": 2.4295358657836914 + }, + { + "auxiliary_loss_clip": 0.01108544, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01800227, + "balance_loss_mlp": 1.03788161, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.8877500438207433, + "language_loss": 0.72447532, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.7458632, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8490, + "time_per_iteration": 2.484398603439331 + }, + { + "auxiliary_loss_clip": 0.01105533, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.01649261, + "balance_loss_mlp": 1.03803921, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.594832362291185, + "language_loss": 0.80287743, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82421523, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 8491, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.0155549, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.311833139697555, + "language_loss": 0.79033649, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81164801, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 8492, + "time_per_iteration": 2.4697651863098145 + }, + { + "auxiliary_loss_clip": 0.01114847, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.02560329, + "balance_loss_mlp": 1.04234147, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 2.1680982451379607, + "language_loss": 0.77821648, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79974937, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 8493, + "time_per_iteration": 2.490349054336548 + }, + { + "auxiliary_loss_clip": 0.01109447, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.01608634, + "balance_loss_mlp": 1.03989387, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.213061013784994, + "language_loss": 0.83690828, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85829687, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8494, + "time_per_iteration": 2.4604976177215576 + }, + { + "auxiliary_loss_clip": 0.01112511, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.02189648, + "balance_loss_mlp": 1.04180336, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.8678450133518327, + "language_loss": 0.79117751, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81263626, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.70703125, + "step": 8495, + "time_per_iteration": 2.5202648639678955 + }, + { + "auxiliary_loss_clip": 0.01109453, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02276945, + "balance_loss_mlp": 1.04033172, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.5685043948688704, + "language_loss": 0.78221929, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80366194, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8496, + "time_per_iteration": 2.499793767929077 + }, + { + "auxiliary_loss_clip": 0.01105005, + "auxiliary_loss_mlp": 0.01026512, + "balance_loss_clip": 1.01508582, + "balance_loss_mlp": 1.03803635, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.9336450862291243, + "language_loss": 0.7876817, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8089968, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 8497, + "time_per_iteration": 2.450246572494507 + }, + { + "auxiliary_loss_clip": 0.01106851, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.0203619, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.6296784083005205, + "language_loss": 0.8186121, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84000313, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8498, + "time_per_iteration": 2.4860284328460693 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01559711, + "balance_loss_mlp": 1.03989053, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.9511970266493632, + "language_loss": 0.71084464, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73219806, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 8499, + "time_per_iteration": 2.488870859146118 + }, + { + "auxiliary_loss_clip": 0.01108699, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01899862, + "balance_loss_mlp": 1.03962827, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.470448999091522, + "language_loss": 0.72600758, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74740595, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8500, + "time_per_iteration": 2.554612874984741 + }, + { + "auxiliary_loss_clip": 0.01113166, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.01870334, + "balance_loss_mlp": 1.03988254, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.631045408977224, + "language_loss": 0.63011086, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65156412, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8501, + "time_per_iteration": 2.4470977783203125 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.03708565, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7479031643347964, + "language_loss": 0.8759163, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89734155, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8502, + "time_per_iteration": 2.4252443313598633 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.0349071, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.3950925269756227, + "language_loss": 0.82526219, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84663093, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8503, + "time_per_iteration": 2.5170319080352783 + }, + { + "auxiliary_loss_clip": 0.01038121, + "auxiliary_loss_mlp": 0.01001996, + "balance_loss_clip": 1.00103021, + "balance_loss_mlp": 1.01512361, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8658208518316733, + "language_loss": 0.63857049, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65897167, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.23046875, + "step": 8504, + "time_per_iteration": 3.098529577255249 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.01815391, + "balance_loss_mlp": 1.03960776, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 3.195489539056655, + "language_loss": 0.84326482, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86465514, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 8505, + "time_per_iteration": 2.5145134925842285 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0399797, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.9725783043316722, + "language_loss": 0.75117159, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77251446, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 8506, + "time_per_iteration": 2.529463052749634 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.01951551, + "balance_loss_mlp": 1.03808045, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.6477689192158658, + "language_loss": 0.58288801, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60429621, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8507, + "time_per_iteration": 2.515449047088623 + }, + { + "auxiliary_loss_clip": 0.01111035, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02690697, + "balance_loss_mlp": 1.04132211, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.6046089096743523, + "language_loss": 0.85276306, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87427151, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8508, + "time_per_iteration": 2.4760663509368896 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.02051985, + "balance_loss_mlp": 1.03969765, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.6014168180464263, + "language_loss": 0.72123772, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74267876, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8509, + "time_per_iteration": 2.5354809761047363 + }, + { + "auxiliary_loss_clip": 0.01107381, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.02060962, + "balance_loss_mlp": 1.03980041, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.6675565589278303, + "language_loss": 0.75862014, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78001392, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8510, + "time_per_iteration": 3.945136785507202 + }, + { + "auxiliary_loss_clip": 0.01108162, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.02163482, + "balance_loss_mlp": 1.04065561, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.6646040073598372, + "language_loss": 0.71192694, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73334503, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 8511, + "time_per_iteration": 2.541703701019287 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01763797, + "balance_loss_mlp": 1.03958058, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.060947746528677, + "language_loss": 0.66430634, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68565977, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 8512, + "time_per_iteration": 5.427145481109619 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.02326632, + "balance_loss_mlp": 1.03883505, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 2.433145093070313, + "language_loss": 0.66578728, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.6872499, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8513, + "time_per_iteration": 3.935227870941162 + }, + { + "auxiliary_loss_clip": 0.01106032, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02099788, + "balance_loss_mlp": 1.03927946, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.0509279474405115, + "language_loss": 0.69136906, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71276104, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 8514, + "time_per_iteration": 2.5390119552612305 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.01599109, + "balance_loss_mlp": 1.03685427, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.6362442678403473, + "language_loss": 0.66014814, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68144739, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 8515, + "time_per_iteration": 2.492664098739624 + }, + { + "auxiliary_loss_clip": 0.01103893, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.02031612, + "balance_loss_mlp": 1.03691602, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.89314496105325, + "language_loss": 0.74966168, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77101815, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8516, + "time_per_iteration": 2.5428519248962402 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02181602, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.7790403014833382, + "language_loss": 0.77862155, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80007005, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8517, + "time_per_iteration": 2.4259724617004395 + }, + { + "auxiliary_loss_clip": 0.01110887, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01897407, + "balance_loss_mlp": 1.03983212, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7905284866787141, + "language_loss": 0.73672384, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75814688, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8518, + "time_per_iteration": 2.5707037448883057 + }, + { + "auxiliary_loss_clip": 0.01107458, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.02557039, + "balance_loss_mlp": 1.03892565, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.6752140453085944, + "language_loss": 0.78055197, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80200136, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8519, + "time_per_iteration": 2.417372226715088 + }, + { + "auxiliary_loss_clip": 0.01109296, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02049518, + "balance_loss_mlp": 1.04082775, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.573776111474748, + "language_loss": 0.79204106, + "learning_rate": 2.017720274652497e-06, + "loss": 0.8134582, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8520, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.01112541, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.02623105, + "balance_loss_mlp": 1.03924751, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.6319482307550086, + "language_loss": 0.81403995, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83556241, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8521, + "time_per_iteration": 2.4723713397979736 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03599286, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.90297827684807, + "language_loss": 0.68368387, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70504206, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8522, + "time_per_iteration": 2.516411066055298 + }, + { + "auxiliary_loss_clip": 0.01115928, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02506292, + "balance_loss_mlp": 1.04201221, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 2.718510344621862, + "language_loss": 0.6155864, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63715655, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.73828125, + "step": 8523, + "time_per_iteration": 2.524775266647339 + }, + { + "auxiliary_loss_clip": 0.01110788, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.0238173, + "balance_loss_mlp": 1.04113579, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.0609816781673884, + "language_loss": 0.78066456, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80212736, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 8524, + "time_per_iteration": 2.526226043701172 + }, + { + "auxiliary_loss_clip": 0.01109029, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02350545, + "balance_loss_mlp": 1.0413003, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.8496964430325211, + "language_loss": 0.75055063, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77199042, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 8525, + "time_per_iteration": 2.432555913925171 + }, + { + "auxiliary_loss_clip": 0.01112941, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.02385902, + "balance_loss_mlp": 1.04111516, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.559913373859493, + "language_loss": 0.74452645, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76602304, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8526, + "time_per_iteration": 2.6282670497894287 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02799845, + "balance_loss_mlp": 1.04028583, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.490779495017149, + "language_loss": 0.65322489, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67473614, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8527, + "time_per_iteration": 2.467350482940674 + }, + { + "auxiliary_loss_clip": 0.01108518, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02852428, + "balance_loss_mlp": 1.04277444, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.5603597457219889, + "language_loss": 0.74514449, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76662612, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 8528, + "time_per_iteration": 2.513795852661133 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02236843, + "balance_loss_mlp": 1.03608227, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.756255656529514, + "language_loss": 0.83061087, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85200721, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8529, + "time_per_iteration": 2.4574379920959473 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.02045822, + "balance_loss_mlp": 1.03895748, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.6787234743344808, + "language_loss": 0.73559862, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75699604, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8530, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01039899, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.00278807, + "balance_loss_mlp": 1.01703906, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7465649329198393, + "language_loss": 0.60806251, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.6285013, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.22851562, + "step": 8531, + "time_per_iteration": 3.1615967750549316 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.01732779, + "balance_loss_mlp": 1.04014051, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6561974446519532, + "language_loss": 0.76540768, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78680408, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8532, + "time_per_iteration": 2.4836883544921875 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.01894033, + "balance_loss_mlp": 1.03866601, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.847315245703251, + "language_loss": 0.67183244, + "learning_rate": 2.012657420152597e-06, + "loss": 0.6932264, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8533, + "time_per_iteration": 2.6025052070617676 + }, + { + "auxiliary_loss_clip": 0.01112515, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.02333999, + "balance_loss_mlp": 1.04080868, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.8363553974693196, + "language_loss": 0.81724054, + "learning_rate": 2.01226796603315e-06, + "loss": 0.83873212, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8534, + "time_per_iteration": 2.465374231338501 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02167177, + "balance_loss_mlp": 1.0399549, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.5787063577136407, + "language_loss": 0.63588178, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65734923, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 8535, + "time_per_iteration": 2.50287127494812 + }, + { + "auxiliary_loss_clip": 0.01111823, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01434922, + "balance_loss_mlp": 1.04166365, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.5428442042942097, + "language_loss": 0.69746888, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71885574, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8536, + "time_per_iteration": 2.459897041320801 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.04082823, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.3299626101952784, + "language_loss": 0.71215963, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73363328, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8537, + "time_per_iteration": 2.4840991497039795 + }, + { + "auxiliary_loss_clip": 0.01111456, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.02013016, + "balance_loss_mlp": 1.03927016, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 6.302946358508802, + "language_loss": 0.80441952, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82586539, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8538, + "time_per_iteration": 2.4378812313079834 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.01880276, + "balance_loss_mlp": 1.03764546, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.8808034234185624, + "language_loss": 0.78517324, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80656898, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8539, + "time_per_iteration": 2.5144600868225098 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02025044, + "balance_loss_mlp": 1.04009342, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.5130664168284647, + "language_loss": 0.75880563, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78025699, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8540, + "time_per_iteration": 2.55734920501709 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.04176068, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.8219986700547555, + "language_loss": 0.74552548, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76700193, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73046875, + "step": 8541, + "time_per_iteration": 2.432055711746216 + }, + { + "auxiliary_loss_clip": 0.01110326, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02005482, + "balance_loss_mlp": 1.03941679, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.945278300015613, + "language_loss": 0.70215029, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72358692, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8542, + "time_per_iteration": 2.5227723121643066 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.01403403, + "balance_loss_mlp": 1.04146171, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.83289507202946, + "language_loss": 0.78898811, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.8103835, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8543, + "time_per_iteration": 2.4559075832366943 + }, + { + "auxiliary_loss_clip": 0.0111214, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02426672, + "balance_loss_mlp": 1.04161441, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.9171309591761885, + "language_loss": 0.68051696, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70201409, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8544, + "time_per_iteration": 2.5344274044036865 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02671301, + "balance_loss_mlp": 1.04096842, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.2205990317105395, + "language_loss": 0.7225253, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74405491, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8545, + "time_per_iteration": 2.4303176403045654 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02431881, + "balance_loss_mlp": 1.03957486, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.967971348268394, + "language_loss": 0.81898367, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84048629, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8546, + "time_per_iteration": 2.4504597187042236 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02099776, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.6545588723955058, + "language_loss": 0.73301136, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75446492, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8547, + "time_per_iteration": 2.4682819843292236 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_clip": 1.03010488, + "balance_loss_mlp": 1.03783822, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.620202866362127, + "language_loss": 0.73577881, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75729811, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8548, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.01110019, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02117133, + "balance_loss_mlp": 1.03852081, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.506476906057379, + "language_loss": 0.82239324, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84383494, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8549, + "time_per_iteration": 2.433605194091797 + }, + { + "auxiliary_loss_clip": 0.01110043, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.02292621, + "balance_loss_mlp": 1.04096317, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.8131541317091766, + "language_loss": 0.72331119, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.7447629, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8550, + "time_per_iteration": 2.4659972190856934 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02173352, + "balance_loss_mlp": 1.0404501, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.6035097357113468, + "language_loss": 0.75497758, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77646863, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 8551, + "time_per_iteration": 2.453734874725342 + }, + { + "auxiliary_loss_clip": 0.01108366, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.01805425, + "balance_loss_mlp": 1.04017091, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6015349884444547, + "language_loss": 0.69001007, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71140003, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8552, + "time_per_iteration": 3.9047505855560303 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01927972, + "balance_loss_mlp": 1.03868091, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.7916575293353634, + "language_loss": 0.74736363, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76878798, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8553, + "time_per_iteration": 2.5039455890655518 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.0397613, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.707634664835445, + "language_loss": 0.68126231, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70271206, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8554, + "time_per_iteration": 5.488779544830322 + }, + { + "auxiliary_loss_clip": 0.01112685, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.02173042, + "balance_loss_mlp": 1.03879559, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 2.3217393931515846, + "language_loss": 0.73303884, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75452876, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.73828125, + "step": 8555, + "time_per_iteration": 3.866107940673828 + }, + { + "auxiliary_loss_clip": 0.01111396, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.02278817, + "balance_loss_mlp": 1.04023397, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.3509367679077124, + "language_loss": 0.74724478, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76871467, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 8556, + "time_per_iteration": 2.423941135406494 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.03695798, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7510489074761373, + "language_loss": 0.86147487, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88286483, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8557, + "time_per_iteration": 2.4232289791107178 + }, + { + "auxiliary_loss_clip": 0.01105513, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03741109, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.4648111070630687, + "language_loss": 0.89026904, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91165608, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8558, + "time_per_iteration": 2.4937002658843994 + }, + { + "auxiliary_loss_clip": 0.01106843, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.03844643, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.959206520418211, + "language_loss": 0.65027267, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67166239, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8559, + "time_per_iteration": 2.4625425338745117 + }, + { + "auxiliary_loss_clip": 0.01109462, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02255082, + "balance_loss_mlp": 1.04041696, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.5416961138531182, + "language_loss": 0.62973124, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65117842, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8560, + "time_per_iteration": 2.509413719177246 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.01798463, + "balance_loss_mlp": 1.03850913, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.5387222778191898, + "language_loss": 0.69879884, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72017759, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 8561, + "time_per_iteration": 2.4802825450897217 + }, + { + "auxiliary_loss_clip": 0.01108154, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01894569, + "balance_loss_mlp": 1.03752971, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5731273846161422, + "language_loss": 0.66646934, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68785918, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.70703125, + "step": 8562, + "time_per_iteration": 2.505180835723877 + }, + { + "auxiliary_loss_clip": 0.01110444, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.01824713, + "balance_loss_mlp": 1.03924227, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.6680045222139546, + "language_loss": 0.77707577, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79848886, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8563, + "time_per_iteration": 2.4935452938079834 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.01932585, + "balance_loss_mlp": 1.03827047, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.1629374301288284, + "language_loss": 0.82324845, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84471083, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 8564, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01112048, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.0223794, + "balance_loss_mlp": 1.03859615, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.7207643570499924, + "language_loss": 0.73255235, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75402838, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8565, + "time_per_iteration": 2.471970558166504 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.03977931, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.8782058792026062, + "language_loss": 0.683079, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70455092, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 8566, + "time_per_iteration": 2.4981720447540283 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03583431, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 2.0482874573832177, + "language_loss": 0.78111541, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80249971, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 8567, + "time_per_iteration": 2.490272045135498 + }, + { + "auxiliary_loss_clip": 0.01113521, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.02054214, + "balance_loss_mlp": 1.04046249, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.0737995601061274, + "language_loss": 0.790721, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81219578, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8568, + "time_per_iteration": 2.602315902709961 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01885664, + "balance_loss_mlp": 1.03637588, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.0499636702484945, + "language_loss": 0.90935498, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93073106, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8569, + "time_per_iteration": 2.430600643157959 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.03865302, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.6639049645433037, + "language_loss": 0.76229095, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78369409, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8570, + "time_per_iteration": 2.48988676071167 + }, + { + "auxiliary_loss_clip": 0.01108277, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.02357769, + "balance_loss_mlp": 1.03741157, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.5896565556148876, + "language_loss": 0.7375021, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75895989, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8571, + "time_per_iteration": 2.618849754333496 + }, + { + "auxiliary_loss_clip": 0.01035305, + "auxiliary_loss_mlp": 0.00998776, + "balance_loss_clip": 0.99780464, + "balance_loss_mlp": 1.0127461, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7780004501915253, + "language_loss": 0.52940249, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54974329, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.22558594, + "step": 8572, + "time_per_iteration": 3.1418654918670654 + }, + { + "auxiliary_loss_clip": 0.01108043, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.02087331, + "balance_loss_mlp": 1.04004169, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.7275406058075027, + "language_loss": 0.76217729, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78358561, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8573, + "time_per_iteration": 2.4757239818573 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01627111, + "balance_loss_mlp": 1.03679562, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.9279490614808483, + "language_loss": 0.77039665, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79174697, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8574, + "time_per_iteration": 2.478935718536377 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.01714277, + "balance_loss_mlp": 1.03757906, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.6824577114627284, + "language_loss": 0.85421538, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87558043, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 8575, + "time_per_iteration": 2.4811151027679443 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01971185, + "balance_loss_mlp": 1.03703451, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.6692718685381052, + "language_loss": 0.76704675, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78844833, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8576, + "time_per_iteration": 2.490389108657837 + }, + { + "auxiliary_loss_clip": 0.011134, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01836872, + "balance_loss_mlp": 1.03960061, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 3.052053268886893, + "language_loss": 0.75463682, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77608645, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8577, + "time_per_iteration": 2.416757583618164 + }, + { + "auxiliary_loss_clip": 0.0111005, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02441728, + "balance_loss_mlp": 1.0376997, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.834882992604573, + "language_loss": 0.80803275, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.82950842, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8578, + "time_per_iteration": 2.517292022705078 + }, + { + "auxiliary_loss_clip": 0.01104508, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02046442, + "balance_loss_mlp": 1.0357188, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.7011032882300805, + "language_loss": 0.76299787, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78436846, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8579, + "time_per_iteration": 2.4907805919647217 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.01890254, + "balance_loss_mlp": 1.03864014, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.5884760036798964, + "language_loss": 0.79018867, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81159854, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8580, + "time_per_iteration": 2.490298271179199 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.0237354, + "balance_loss_mlp": 1.03874159, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.2420547036898277, + "language_loss": 0.72657341, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74805963, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8581, + "time_per_iteration": 2.419196367263794 + }, + { + "auxiliary_loss_clip": 0.01107618, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01844954, + "balance_loss_mlp": 1.03848028, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.260602789840083, + "language_loss": 0.74468267, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76606196, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8582, + "time_per_iteration": 2.4235429763793945 + }, + { + "auxiliary_loss_clip": 0.01107491, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.02201486, + "balance_loss_mlp": 1.03820109, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 3.661326019284234, + "language_loss": 0.66308093, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68449032, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 8583, + "time_per_iteration": 2.483489990234375 + }, + { + "auxiliary_loss_clip": 0.0111088, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.02315259, + "balance_loss_mlp": 1.04015112, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4772972874821377, + "language_loss": 0.75878769, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78025782, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8584, + "time_per_iteration": 2.469770908355713 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.026057, + "balance_loss_mlp": 1.03763115, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 1.908038470800245, + "language_loss": 0.78773153, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80921382, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 8585, + "time_per_iteration": 2.4765405654907227 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01976502, + "balance_loss_mlp": 1.03624129, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.394419079152278, + "language_loss": 0.81022364, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83157325, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 8586, + "time_per_iteration": 2.45131254196167 + }, + { + "auxiliary_loss_clip": 0.01107797, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.02106369, + "balance_loss_mlp": 1.03754663, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 2.0375667228771572, + "language_loss": 0.71716821, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73858047, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.703125, + "step": 8587, + "time_per_iteration": 2.464603900909424 + }, + { + "auxiliary_loss_clip": 0.0103385, + "auxiliary_loss_mlp": 0.01011507, + "balance_loss_clip": 1.01052976, + "balance_loss_mlp": 1.01128352, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7317367951541988, + "language_loss": 0.57798368, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59843719, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.22558594, + "step": 8588, + "time_per_iteration": 3.0708353519439697 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.02560759, + "balance_loss_mlp": 1.03631115, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.9433685436573729, + "language_loss": 0.7553345, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77678907, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8589, + "time_per_iteration": 2.4392945766448975 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02110088, + "balance_loss_mlp": 1.03822279, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 2.018268520776434, + "language_loss": 0.67597556, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69738752, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8590, + "time_per_iteration": 2.480978012084961 + }, + { + "auxiliary_loss_clip": 0.01034536, + "auxiliary_loss_mlp": 0.01003309, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.01181984, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.7844517010344912, + "language_loss": 0.5593977, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57977605, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.2265625, + "step": 8591, + "time_per_iteration": 3.0380799770355225 + }, + { + "auxiliary_loss_clip": 0.01101472, + "auxiliary_loss_mlp": 0.01023222, + "balance_loss_clip": 1.01192665, + "balance_loss_mlp": 1.03659964, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5513724058155185, + "language_loss": 0.81425416, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83550113, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 8592, + "time_per_iteration": 2.4280107021331787 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01023209, + "balance_loss_clip": 1.01141334, + "balance_loss_mlp": 1.04046106, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.8100942034895195, + "language_loss": 0.83394146, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85524404, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 8593, + "time_per_iteration": 3.9351704120635986 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.02004552, + "balance_loss_mlp": 1.04028952, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.576203753972958, + "language_loss": 0.68724298, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.70866162, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8594, + "time_per_iteration": 2.547206163406372 + }, + { + "auxiliary_loss_clip": 0.01105211, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.03660214, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.9981153431236998, + "language_loss": 0.77706152, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79838419, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8595, + "time_per_iteration": 2.5214362144470215 + }, + { + "auxiliary_loss_clip": 0.01107198, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.02278233, + "balance_loss_mlp": 1.03896379, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.5236872991766963, + "language_loss": 0.64860648, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67003053, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8596, + "time_per_iteration": 5.460975885391235 + }, + { + "auxiliary_loss_clip": 0.01109553, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.01603329, + "balance_loss_mlp": 1.04030609, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.6129264208414336, + "language_loss": 0.75417203, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77556598, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.6953125, + "step": 8597, + "time_per_iteration": 2.477386236190796 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01356828, + "balance_loss_mlp": 1.03728151, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.684107970499364, + "language_loss": 0.80853873, + "learning_rate": 1.987342579847403e-06, + "loss": 0.82987666, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8598, + "time_per_iteration": 2.5056118965148926 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.02550411, + "balance_loss_mlp": 1.03853858, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.5161151475530301, + "language_loss": 0.75315893, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77462423, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8599, + "time_per_iteration": 2.4907233715057373 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03874612, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 5.031269669902368, + "language_loss": 0.72193408, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74333239, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8600, + "time_per_iteration": 2.4958672523498535 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01924503, + "balance_loss_mlp": 1.03902841, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.5543027238719596, + "language_loss": 0.74527812, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76667523, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8601, + "time_per_iteration": 2.4545562267303467 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.02540207, + "balance_loss_mlp": 1.03855383, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 1.930843678841908, + "language_loss": 0.83770829, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85918051, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 8602, + "time_per_iteration": 2.478315591812134 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.01727891, + "balance_loss_mlp": 1.03919971, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.739467426965746, + "language_loss": 0.74487793, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76627421, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8603, + "time_per_iteration": 2.541987180709839 + }, + { + "auxiliary_loss_clip": 0.01110457, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02172458, + "balance_loss_mlp": 1.04043818, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.0493295845447435, + "language_loss": 0.72732627, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74876976, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8604, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01113997, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03878832, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.890584135418456, + "language_loss": 0.85098851, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87245226, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8605, + "time_per_iteration": 2.469414472579956 + }, + { + "auxiliary_loss_clip": 0.01107307, + "auxiliary_loss_mlp": 0.01024655, + "balance_loss_clip": 1.01271009, + "balance_loss_mlp": 1.03827572, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4962077074735805, + "language_loss": 0.64887142, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67019105, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8606, + "time_per_iteration": 2.5391039848327637 + }, + { + "auxiliary_loss_clip": 0.01108829, + "auxiliary_loss_mlp": 0.01027754, + "balance_loss_clip": 1.0153147, + "balance_loss_mlp": 1.04041243, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.6359731326945595, + "language_loss": 0.77811146, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79947728, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8607, + "time_per_iteration": 2.4382975101470947 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02053022, + "balance_loss_mlp": 1.0399344, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 3.5447610791610638, + "language_loss": 0.72232366, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74377209, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8608, + "time_per_iteration": 2.511740207672119 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.01927149, + "balance_loss_mlp": 1.04073501, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8799970026389359, + "language_loss": 0.86513162, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88661158, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 8609, + "time_per_iteration": 2.453684091567993 + }, + { + "auxiliary_loss_clip": 0.01108892, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888371, + "balance_loss_mlp": 1.03858495, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.565375500859336, + "language_loss": 0.73396695, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75536072, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 8610, + "time_per_iteration": 2.5529308319091797 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01813269, + "balance_loss_mlp": 1.04202247, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.8297114771569651, + "language_loss": 0.67358816, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69506592, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 8611, + "time_per_iteration": 2.4198501110076904 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.01808488, + "balance_loss_mlp": 1.0382731, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.316941620789411, + "language_loss": 0.77502143, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79641283, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 8612, + "time_per_iteration": 2.4943206310272217 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.02186632, + "balance_loss_mlp": 1.03938198, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.9039649692993772, + "language_loss": 0.8192755, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84072244, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 8613, + "time_per_iteration": 2.434479236602783 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02059531, + "balance_loss_mlp": 1.04346251, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.1674567731422987, + "language_loss": 0.66747862, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68896699, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8614, + "time_per_iteration": 2.4598941802978516 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.01736188, + "balance_loss_mlp": 1.04048586, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.9388641649707037, + "language_loss": 0.86660814, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88803345, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8615, + "time_per_iteration": 2.434614419937134 + }, + { + "auxiliary_loss_clip": 0.01110692, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02877903, + "balance_loss_mlp": 1.04087663, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.572223272426788, + "language_loss": 0.80601507, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82753074, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8616, + "time_per_iteration": 2.489898920059204 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.02928019, + "balance_loss_mlp": 1.04558134, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.6322050900799092, + "language_loss": 0.7524333, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77405852, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8617, + "time_per_iteration": 2.4741597175598145 + }, + { + "auxiliary_loss_clip": 0.0111036, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.01977718, + "balance_loss_mlp": 1.03946304, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.8314484463575909, + "language_loss": 0.70137858, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72280991, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8618, + "time_per_iteration": 2.4596426486968994 + }, + { + "auxiliary_loss_clip": 0.01036764, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01408625, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9556911586994957, + "language_loss": 0.67222798, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69260818, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8619, + "time_per_iteration": 3.0123016834259033 + }, + { + "auxiliary_loss_clip": 0.01107081, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.02055597, + "balance_loss_mlp": 1.03924203, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.197431442121674, + "language_loss": 0.79314506, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81454414, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 8620, + "time_per_iteration": 2.445173740386963 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.02625203, + "balance_loss_mlp": 1.03989077, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.0514402600561765, + "language_loss": 0.81893396, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84040135, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8621, + "time_per_iteration": 2.4382779598236084 + }, + { + "auxiliary_loss_clip": 0.01109273, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.02073121, + "balance_loss_mlp": 1.0391438, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.9740999547408657, + "language_loss": 0.65540636, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67682284, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 8622, + "time_per_iteration": 2.494173288345337 + }, + { + "auxiliary_loss_clip": 0.01114132, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.02528644, + "balance_loss_mlp": 1.04077148, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.975231537474399, + "language_loss": 0.60350323, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62503201, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8623, + "time_per_iteration": 2.427819013595581 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.01995301, + "balance_loss_mlp": 1.03832614, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7021073046505133, + "language_loss": 0.76074666, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78215921, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8624, + "time_per_iteration": 2.4636356830596924 + }, + { + "auxiliary_loss_clip": 0.01109665, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.02441311, + "balance_loss_mlp": 1.03890038, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.7326139645058456, + "language_loss": 0.71175325, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73321491, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8625, + "time_per_iteration": 2.4977569580078125 + }, + { + "auxiliary_loss_clip": 0.01110816, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.02223408, + "balance_loss_mlp": 1.03980732, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.8950159086376122, + "language_loss": 0.67929721, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70074677, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 8626, + "time_per_iteration": 2.4934957027435303 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01889873, + "balance_loss_mlp": 1.03984976, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.322377605069906, + "language_loss": 0.70487207, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72627008, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8627, + "time_per_iteration": 2.445827007293701 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02539158, + "balance_loss_mlp": 1.04147446, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 1.9255563847501656, + "language_loss": 0.73209083, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75361323, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 8628, + "time_per_iteration": 2.500955581665039 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.02039731, + "balance_loss_mlp": 1.04147768, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 3.3927220028721994, + "language_loss": 0.77245331, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79388249, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8629, + "time_per_iteration": 2.4560301303863525 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.0179081, + "balance_loss_mlp": 1.04206562, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.1928775386787187, + "language_loss": 0.74820137, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76964092, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8630, + "time_per_iteration": 2.496370792388916 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02210796, + "balance_loss_mlp": 1.03882229, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.6137116253106134, + "language_loss": 0.80663669, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82809031, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8631, + "time_per_iteration": 2.4534530639648438 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.01893413, + "balance_loss_mlp": 1.04085588, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5022963557810187, + "language_loss": 0.74575752, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76720965, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8632, + "time_per_iteration": 2.5295352935791016 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01810372, + "balance_loss_mlp": 1.03738809, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.784064079335437, + "language_loss": 0.78812337, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80948019, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8633, + "time_per_iteration": 2.4241905212402344 + }, + { + "auxiliary_loss_clip": 0.01109914, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.01873302, + "balance_loss_mlp": 1.03893745, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.7026702061892323, + "language_loss": 0.80149853, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82290852, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8634, + "time_per_iteration": 2.4851884841918945 + }, + { + "auxiliary_loss_clip": 0.01108415, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.02305627, + "balance_loss_mlp": 1.04024315, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.4600796720036056, + "language_loss": 0.68628252, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70771807, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8635, + "time_per_iteration": 3.921346426010132 + }, + { + "auxiliary_loss_clip": 0.01113121, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.02083683, + "balance_loss_mlp": 1.04083443, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.6781612563386181, + "language_loss": 0.7704699, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79193652, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8636, + "time_per_iteration": 2.45908260345459 + }, + { + "auxiliary_loss_clip": 0.01112314, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.01861811, + "balance_loss_mlp": 1.04090476, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 1.9891179602637588, + "language_loss": 0.71459377, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73602873, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8637, + "time_per_iteration": 5.353722810745239 + }, + { + "auxiliary_loss_clip": 0.01108688, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.01964426, + "balance_loss_mlp": 1.0394423, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 3.7284266214304576, + "language_loss": 0.75943041, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78084332, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8638, + "time_per_iteration": 3.902477741241455 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.03863966, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 2.006346025426826, + "language_loss": 0.74846971, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76985711, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8639, + "time_per_iteration": 2.453634738922119 + }, + { + "auxiliary_loss_clip": 0.01109964, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01604497, + "balance_loss_mlp": 1.04051375, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6163455561126134, + "language_loss": 0.77538067, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79676771, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8640, + "time_per_iteration": 2.482334613800049 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01883161, + "balance_loss_mlp": 1.04175985, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.623082815057782, + "language_loss": 0.65734208, + "learning_rate": 1.97059670234927e-06, + "loss": 0.67874962, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 8641, + "time_per_iteration": 2.4567995071411133 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.02142978, + "balance_loss_mlp": 1.04105425, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.8491224599980307, + "language_loss": 0.76197445, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78340614, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8642, + "time_per_iteration": 2.5128276348114014 + }, + { + "auxiliary_loss_clip": 0.01109094, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02001774, + "balance_loss_mlp": 1.04037452, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.4733024685255247, + "language_loss": 0.83179498, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85320538, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8643, + "time_per_iteration": 2.5094587802886963 + }, + { + "auxiliary_loss_clip": 0.0111188, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.02856052, + "balance_loss_mlp": 1.03983521, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.5341454697133152, + "language_loss": 0.70307451, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72461337, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8644, + "time_per_iteration": 2.5111963748931885 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.01967788, + "balance_loss_mlp": 1.03966331, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.8635414079348847, + "language_loss": 0.80144334, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82286364, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 8645, + "time_per_iteration": 2.529616117477417 + }, + { + "auxiliary_loss_clip": 0.01109035, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01647151, + "balance_loss_mlp": 1.03836131, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.899493861617854, + "language_loss": 0.78147799, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80286086, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8646, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01112803, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02101934, + "balance_loss_mlp": 1.04184628, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.8109153766187511, + "language_loss": 0.66239858, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68386012, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8647, + "time_per_iteration": 2.4503657817840576 + }, + { + "auxiliary_loss_clip": 0.01113411, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01834917, + "balance_loss_mlp": 1.04010677, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 4.112424605735972, + "language_loss": 0.71817285, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73963439, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 8648, + "time_per_iteration": 2.49595308303833 + }, + { + "auxiliary_loss_clip": 0.01112873, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.01714706, + "balance_loss_mlp": 1.0411458, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.6438613988660609, + "language_loss": 0.64412069, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66555232, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8649, + "time_per_iteration": 2.4781436920166016 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.02025771, + "balance_loss_mlp": 1.04224229, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.8268985026448872, + "language_loss": 0.70691884, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72843516, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8650, + "time_per_iteration": 2.4350762367248535 + }, + { + "auxiliary_loss_clip": 0.01108729, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.01559973, + "balance_loss_mlp": 1.03854239, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.6557672224542628, + "language_loss": 0.7709741, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79234493, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8651, + "time_per_iteration": 2.4439852237701416 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.02171111, + "balance_loss_mlp": 1.04384518, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.7772284952150523, + "language_loss": 0.78304142, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80455399, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8652, + "time_per_iteration": 2.4581267833709717 + }, + { + "auxiliary_loss_clip": 0.01114617, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01683807, + "balance_loss_mlp": 1.04281044, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.0186078989624017, + "language_loss": 0.7027083, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72416592, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8653, + "time_per_iteration": 2.4945242404937744 + }, + { + "auxiliary_loss_clip": 0.01114383, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02703571, + "balance_loss_mlp": 1.04092932, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.6276924489714153, + "language_loss": 0.78420818, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80575949, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8654, + "time_per_iteration": 2.4857122898101807 + }, + { + "auxiliary_loss_clip": 0.01117815, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.02450645, + "balance_loss_mlp": 1.04275405, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 2.316843494652732, + "language_loss": 0.8424964, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86405897, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 8655, + "time_per_iteration": 2.48307728767395 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01860702, + "balance_loss_mlp": 1.04225183, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 3.712191764961765, + "language_loss": 0.65503991, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67645752, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8656, + "time_per_iteration": 2.442760705947876 + }, + { + "auxiliary_loss_clip": 0.01114044, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.04263127, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 2.4919467158509385, + "language_loss": 0.73240453, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.753842, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 8657, + "time_per_iteration": 2.5198535919189453 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.02064037, + "balance_loss_mlp": 1.042382, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.757060291742625, + "language_loss": 0.71675289, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73821175, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6953125, + "step": 8658, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.0111093, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.01868176, + "balance_loss_mlp": 1.0400281, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.6795003925123537, + "language_loss": 0.83473611, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85616386, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8659, + "time_per_iteration": 2.462956428527832 + }, + { + "auxiliary_loss_clip": 0.01119845, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04351366, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9135176980647008, + "language_loss": 0.75763941, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77923, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 8660, + "time_per_iteration": 2.4544646739959717 + }, + { + "auxiliary_loss_clip": 0.01111893, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.02199721, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.7715737398241405, + "language_loss": 0.78001404, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80147564, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8661, + "time_per_iteration": 2.4456324577331543 + }, + { + "auxiliary_loss_clip": 0.01113873, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01818848, + "balance_loss_mlp": 1.0404228, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.805356331270093, + "language_loss": 0.70643514, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72788274, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8662, + "time_per_iteration": 2.5272181034088135 + }, + { + "auxiliary_loss_clip": 0.01110335, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.01835203, + "balance_loss_mlp": 1.04033709, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.669754729528693, + "language_loss": 0.6935755, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71500456, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.69921875, + "step": 8663, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.01113011, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.04173064, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 2.618720199838109, + "language_loss": 0.76771712, + "learning_rate": 1.961640376626072e-06, + "loss": 0.7891587, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8664, + "time_per_iteration": 2.519645929336548 + }, + { + "auxiliary_loss_clip": 0.01111987, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.02387905, + "balance_loss_mlp": 1.04057467, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 1.987870026093088, + "language_loss": 0.76193488, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78342199, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8665, + "time_per_iteration": 2.4501259326934814 + }, + { + "auxiliary_loss_clip": 0.01111359, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02113414, + "balance_loss_mlp": 1.04135728, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.609030555811117, + "language_loss": 0.71689177, + "learning_rate": 1.960861599474586e-06, + "loss": 0.73833793, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8666, + "time_per_iteration": 2.4961183071136475 + }, + { + "auxiliary_loss_clip": 0.01119663, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.04257357, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.081998488723945, + "language_loss": 0.68599117, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.7075423, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 8667, + "time_per_iteration": 2.4216842651367188 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02247858, + "balance_loss_mlp": 1.03913903, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.3811752682570164, + "language_loss": 0.81006289, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83148932, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8668, + "time_per_iteration": 2.5712640285491943 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.0184648, + "balance_loss_mlp": 1.0413909, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.7130530435254507, + "language_loss": 0.63821161, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65964901, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8669, + "time_per_iteration": 2.485560894012451 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.02779722, + "balance_loss_mlp": 1.0434041, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5472632399176471, + "language_loss": 0.66420943, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68575811, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8670, + "time_per_iteration": 2.5161590576171875 + }, + { + "auxiliary_loss_clip": 0.01107902, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02204931, + "balance_loss_mlp": 1.04005504, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.0274420083477436, + "language_loss": 0.7666502, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78807229, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 8671, + "time_per_iteration": 2.4505884647369385 + }, + { + "auxiliary_loss_clip": 0.01117202, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.022246, + "balance_loss_mlp": 1.0442729, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 6.168212064153821, + "language_loss": 0.78184325, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80337209, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8672, + "time_per_iteration": 2.501171350479126 + }, + { + "auxiliary_loss_clip": 0.0110814, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.01958418, + "balance_loss_mlp": 1.03945541, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8428028532242804, + "language_loss": 0.72013724, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74153554, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8673, + "time_per_iteration": 2.4188430309295654 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01467764, + "balance_loss_mlp": 1.04007983, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.5425888836045836, + "language_loss": 0.75258517, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77397001, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8674, + "time_per_iteration": 2.4615721702575684 + }, + { + "auxiliary_loss_clip": 0.01112251, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02111149, + "balance_loss_mlp": 1.03926849, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.4005630002003198, + "language_loss": 0.86177206, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88324457, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8675, + "time_per_iteration": 2.4192757606506348 + }, + { + "auxiliary_loss_clip": 0.01036097, + "auxiliary_loss_mlp": 0.00999914, + "balance_loss_clip": 0.99874002, + "balance_loss_mlp": 1.01361609, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8810836824461878, + "language_loss": 0.6315189, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65187901, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22460938, + "step": 8676, + "time_per_iteration": 4.428101062774658 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.0180341, + "balance_loss_mlp": 1.04064405, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.671918865817182, + "language_loss": 0.68830431, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70970994, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8677, + "time_per_iteration": 2.54658579826355 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.01739907, + "balance_loss_mlp": 1.03994, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.5408434392952677, + "language_loss": 0.65516353, + "learning_rate": 1.956189065367086e-06, + "loss": 0.6765672, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8678, + "time_per_iteration": 2.4848899841308594 + }, + { + "auxiliary_loss_clip": 0.01115921, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02607715, + "balance_loss_mlp": 1.04188991, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.860112109233836, + "language_loss": 0.69020754, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71176565, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8679, + "time_per_iteration": 5.267160654067993 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02467108, + "balance_loss_mlp": 1.04272938, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.7057222009225053, + "language_loss": 0.66956079, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69107741, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8680, + "time_per_iteration": 3.938239574432373 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02248955, + "balance_loss_mlp": 1.04123902, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.8837479968625288, + "language_loss": 0.83069575, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85217923, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8681, + "time_per_iteration": 2.475834369659424 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.03964293, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.7236617199536146, + "language_loss": 0.77448237, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79592931, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8682, + "time_per_iteration": 2.484111785888672 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.03124917, + "balance_loss_mlp": 1.041852, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.4820765209382558, + "language_loss": 0.68982363, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71137834, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8683, + "time_per_iteration": 2.579467535018921 + }, + { + "auxiliary_loss_clip": 0.01112188, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.02706265, + "balance_loss_mlp": 1.04016137, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.598693343235541, + "language_loss": 0.7622329, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78375584, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8684, + "time_per_iteration": 2.4642298221588135 + }, + { + "auxiliary_loss_clip": 0.01107617, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.01886606, + "balance_loss_mlp": 1.03845632, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.6077803987399797, + "language_loss": 0.75887376, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.7802639, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8685, + "time_per_iteration": 2.4533908367156982 + }, + { + "auxiliary_loss_clip": 0.01113803, + "auxiliary_loss_mlp": 0.01038269, + "balance_loss_clip": 1.02562094, + "balance_loss_mlp": 1.0427258, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.88354393014551, + "language_loss": 0.80851054, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83003128, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8686, + "time_per_iteration": 2.430154323577881 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02474344, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.664143868034185, + "language_loss": 0.70208037, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72351515, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8687, + "time_per_iteration": 2.510512590408325 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02171767, + "balance_loss_mlp": 1.03840709, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0206883326938407, + "language_loss": 0.82963884, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85104954, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 8688, + "time_per_iteration": 2.4092836380004883 + }, + { + "auxiliary_loss_clip": 0.0110979, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.02107966, + "balance_loss_mlp": 1.04007506, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.711188417076446, + "language_loss": 0.73736638, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75880128, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8689, + "time_per_iteration": 2.4741477966308594 + }, + { + "auxiliary_loss_clip": 0.01109408, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.02147067, + "balance_loss_mlp": 1.04056704, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8604688899774438, + "language_loss": 0.82882619, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85025889, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8690, + "time_per_iteration": 2.4194648265838623 + }, + { + "auxiliary_loss_clip": 0.01110389, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02476192, + "balance_loss_mlp": 1.03937626, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.3332187959772246, + "language_loss": 0.79397631, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81546217, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8691, + "time_per_iteration": 2.52500319480896 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.02794003, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 1.8556717943569576, + "language_loss": 0.7679857, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78953838, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8692, + "time_per_iteration": 2.4420764446258545 + }, + { + "auxiliary_loss_clip": 0.0110865, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.02139628, + "balance_loss_mlp": 1.04145277, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.6990103355094375, + "language_loss": 0.72441196, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74582422, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 8693, + "time_per_iteration": 2.551316261291504 + }, + { + "auxiliary_loss_clip": 0.01114591, + "auxiliary_loss_mlp": 0.01042549, + "balance_loss_clip": 1.02841115, + "balance_loss_mlp": 1.04073966, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.780524663497215, + "language_loss": 0.81990045, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84147185, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 8694, + "time_per_iteration": 2.4666013717651367 + }, + { + "auxiliary_loss_clip": 0.01036217, + "auxiliary_loss_mlp": 0.01006918, + "balance_loss_clip": 1.00584531, + "balance_loss_mlp": 1.01379716, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.771665075265138, + "language_loss": 0.55743444, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57786584, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8695, + "time_per_iteration": 3.116420269012451 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02121711, + "balance_loss_mlp": 1.04176521, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.693403101851131, + "language_loss": 0.7333045, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75476253, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8696, + "time_per_iteration": 2.437974452972412 + }, + { + "auxiliary_loss_clip": 0.01112043, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.01840782, + "balance_loss_mlp": 1.04123831, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.6647399718358808, + "language_loss": 0.7097398, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73116946, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8697, + "time_per_iteration": 2.5316948890686035 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.0171392, + "balance_loss_mlp": 1.04016519, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.6518576838111187, + "language_loss": 0.80392116, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82528424, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8698, + "time_per_iteration": 2.4515864849090576 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.0218327, + "balance_loss_mlp": 1.04055512, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.702568194733703, + "language_loss": 0.74550211, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76695091, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8699, + "time_per_iteration": 2.508180856704712 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.01700819, + "balance_loss_mlp": 1.04079318, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.4994824070372519, + "language_loss": 0.73465139, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75609958, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 8700, + "time_per_iteration": 2.455620765686035 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.02276719, + "balance_loss_mlp": 1.0418222, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.82733314477648, + "language_loss": 0.66863132, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69013548, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8701, + "time_per_iteration": 2.5278706550598145 + }, + { + "auxiliary_loss_clip": 0.01107483, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03844106, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.0326391886622686, + "language_loss": 0.66616488, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68758386, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8702, + "time_per_iteration": 2.474238872528076 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.02040434, + "balance_loss_mlp": 1.04128182, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.9248840397651374, + "language_loss": 0.7671175, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78856003, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8703, + "time_per_iteration": 2.466836929321289 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.02648616, + "balance_loss_mlp": 1.04065156, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.7352924521395576, + "language_loss": 0.76380461, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78537536, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75390625, + "step": 8704, + "time_per_iteration": 2.566021680831909 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.0272727, + "balance_loss_mlp": 1.04157901, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.6527680542100833, + "language_loss": 0.7804389, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80193096, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8705, + "time_per_iteration": 2.4414021968841553 + }, + { + "auxiliary_loss_clip": 0.01113477, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.01545918, + "balance_loss_mlp": 1.04121351, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 1.9173845394592544, + "language_loss": 0.69808084, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.7195006, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8706, + "time_per_iteration": 2.4252305030822754 + }, + { + "auxiliary_loss_clip": 0.01033927, + "auxiliary_loss_mlp": 0.00999849, + "balance_loss_clip": 0.99876386, + "balance_loss_mlp": 1.01179016, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6804801593959132, + "language_loss": 0.52532774, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.5456655, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.22167969, + "step": 8707, + "time_per_iteration": 3.142758369445801 + }, + { + "auxiliary_loss_clip": 0.01109991, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02262449, + "balance_loss_mlp": 1.03904724, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.7383881327323734, + "language_loss": 0.74716955, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76862097, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8708, + "time_per_iteration": 2.4591562747955322 + }, + { + "auxiliary_loss_clip": 0.01109127, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.01402545, + "balance_loss_mlp": 1.04014444, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.691977522935515, + "language_loss": 0.77432841, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79568058, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8709, + "time_per_iteration": 2.480982780456543 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.01927257, + "balance_loss_mlp": 1.03814077, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.9878514646446084, + "language_loss": 0.8357569, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85724527, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 8710, + "time_per_iteration": 2.4901626110076904 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.01440704, + "balance_loss_mlp": 1.03936791, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.6699101384293633, + "language_loss": 0.69427162, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71561891, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8711, + "time_per_iteration": 2.476573944091797 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01879799, + "balance_loss_mlp": 1.03732038, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.8448951706521464, + "language_loss": 0.83195686, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85335994, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8712, + "time_per_iteration": 2.4485836029052734 + }, + { + "auxiliary_loss_clip": 0.01111097, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.02286506, + "balance_loss_mlp": 1.03859973, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.7709353735200277, + "language_loss": 0.69517416, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71665198, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8713, + "time_per_iteration": 2.496649980545044 + }, + { + "auxiliary_loss_clip": 0.01112233, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.01934421, + "balance_loss_mlp": 1.03752589, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.61615049353435, + "language_loss": 0.76978022, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79123831, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 8714, + "time_per_iteration": 2.42134428024292 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.01703143, + "balance_loss_mlp": 1.04200637, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.883747352805191, + "language_loss": 0.75953126, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78097725, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8715, + "time_per_iteration": 2.453313112258911 + }, + { + "auxiliary_loss_clip": 0.01106451, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01807356, + "balance_loss_mlp": 1.0377413, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.4951701207047352, + "language_loss": 0.7078892, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.72926366, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8716, + "time_per_iteration": 2.536285638809204 + }, + { + "auxiliary_loss_clip": 0.01107976, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.02394176, + "balance_loss_mlp": 1.03838778, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.055978260271784, + "language_loss": 0.86706465, + "learning_rate": 1.941005113841926e-06, + "loss": 0.88849956, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 8717, + "time_per_iteration": 2.5015134811401367 + }, + { + "auxiliary_loss_clip": 0.01108796, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.01737654, + "balance_loss_mlp": 1.03882921, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.8178940063432978, + "language_loss": 0.60516441, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.6265465, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 8718, + "time_per_iteration": 4.028836488723755 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.01830447, + "balance_loss_mlp": 1.04012215, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.7437517815053911, + "language_loss": 0.71897364, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74041677, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8719, + "time_per_iteration": 2.455796003341675 + }, + { + "auxiliary_loss_clip": 0.01106409, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0163976, + "balance_loss_mlp": 1.03797865, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.705660803101178, + "language_loss": 0.72716737, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.74851096, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 8720, + "time_per_iteration": 2.445131301879883 + }, + { + "auxiliary_loss_clip": 0.01110289, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.01948094, + "balance_loss_mlp": 1.04000795, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6022030744217663, + "language_loss": 0.70251679, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72394347, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8721, + "time_per_iteration": 5.4637322425842285 + }, + { + "auxiliary_loss_clip": 0.01106478, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.01883805, + "balance_loss_mlp": 1.03700781, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.710812698690052, + "language_loss": 0.86623824, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88761353, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8722, + "time_per_iteration": 2.4582130908966064 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.01850319, + "balance_loss_mlp": 1.03929901, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.6752601944842513, + "language_loss": 0.79654807, + "learning_rate": 1.938669401384247e-06, + "loss": 0.8179481, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.68359375, + "step": 8723, + "time_per_iteration": 2.4436798095703125 + }, + { + "auxiliary_loss_clip": 0.0111223, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02165055, + "balance_loss_mlp": 1.04074168, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.2643940307400054, + "language_loss": 0.74980783, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77128434, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8724, + "time_per_iteration": 2.4523351192474365 + }, + { + "auxiliary_loss_clip": 0.01114812, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03920281, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.7907307804166401, + "language_loss": 0.70031178, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72179961, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8725, + "time_per_iteration": 2.548102617263794 + }, + { + "auxiliary_loss_clip": 0.01033499, + "auxiliary_loss_mlp": 0.00998708, + "balance_loss_clip": 0.99755734, + "balance_loss_mlp": 1.01092362, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7538969042021075, + "language_loss": 0.55637997, + "learning_rate": 1.937501576352568e-06, + "loss": 0.576702, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2265625, + "step": 8726, + "time_per_iteration": 3.055438995361328 + }, + { + "auxiliary_loss_clip": 0.01033831, + "auxiliary_loss_mlp": 0.00998072, + "balance_loss_clip": 0.99698144, + "balance_loss_mlp": 1.01147294, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.8042859023243575, + "language_loss": 0.58400142, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60432053, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.22460938, + "step": 8727, + "time_per_iteration": 3.071913719177246 + }, + { + "auxiliary_loss_clip": 0.0111222, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.01701272, + "balance_loss_mlp": 1.03976107, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3114988788354258, + "language_loss": 0.70559728, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72702408, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8728, + "time_per_iteration": -0.15050816535949707 + }, + { + "auxiliary_loss_clip": 0.01108011, + "auxiliary_loss_mlp": 0.01026221, + "balance_loss_clip": 1.01421666, + "balance_loss_mlp": 1.03783965, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.5256282262341387, + "language_loss": 0.6966821, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71802437, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8729, + "time_per_iteration": 2.470921039581299 + }, + { + "auxiliary_loss_clip": 0.0111289, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.04002178, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.7430499295764175, + "language_loss": 0.83498538, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85642672, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8730, + "time_per_iteration": 2.447209358215332 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.02034974, + "balance_loss_mlp": 1.03944659, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.372255604306618, + "language_loss": 0.79440451, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81583822, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8731, + "time_per_iteration": 2.4764487743377686 + }, + { + "auxiliary_loss_clip": 0.01104468, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02373886, + "balance_loss_mlp": 1.03691411, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.577877427677953, + "language_loss": 0.83057785, + "learning_rate": 1.935165990676312e-06, + "loss": 0.8519851, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8732, + "time_per_iteration": 2.4856929779052734 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.02020669, + "balance_loss_mlp": 1.03737712, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.6308728168221684, + "language_loss": 0.77874607, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80013925, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8733, + "time_per_iteration": 2.440887212753296 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.02521539, + "balance_loss_mlp": 1.04069221, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.8154235824744323, + "language_loss": 0.81740808, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83892411, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 8734, + "time_per_iteration": 2.4394965171813965 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01828647, + "balance_loss_mlp": 1.03909111, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3786944232239873, + "language_loss": 0.76792759, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78930354, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8735, + "time_per_iteration": 2.5392351150512695 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.03907919, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.5767625018953106, + "language_loss": 0.80153042, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.8229425, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 8736, + "time_per_iteration": 2.470860242843628 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.01979208, + "balance_loss_mlp": 1.04068267, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.2098484474485716, + "language_loss": 0.69838667, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71982265, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8737, + "time_per_iteration": 2.5947840213775635 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.01885569, + "balance_loss_mlp": 1.0369395, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4975240773091183, + "language_loss": 0.77464664, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79602897, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8738, + "time_per_iteration": 2.4910526275634766 + }, + { + "auxiliary_loss_clip": 0.01034294, + "auxiliary_loss_mlp": 0.01014673, + "balance_loss_clip": 1.01349294, + "balance_loss_mlp": 1.01161027, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7501251002484244, + "language_loss": 0.54472572, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56521541, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8739, + "time_per_iteration": 3.0936102867126465 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.02058792, + "balance_loss_mlp": 1.03920436, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 4.076584700627864, + "language_loss": 0.847902, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86931044, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8740, + "time_per_iteration": 2.5510640144348145 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.02204442, + "balance_loss_mlp": 1.0391773, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.9479054855450806, + "language_loss": 0.69464219, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71606612, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8741, + "time_per_iteration": 2.4474291801452637 + }, + { + "auxiliary_loss_clip": 0.01112521, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01657915, + "balance_loss_mlp": 1.04100168, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.7696604002482594, + "language_loss": 0.6591152, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68053448, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 8742, + "time_per_iteration": 2.4151360988616943 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.02191377, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.337521906395912, + "language_loss": 0.63094312, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65242094, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 8743, + "time_per_iteration": 2.4369444847106934 + }, + { + "auxiliary_loss_clip": 0.01033192, + "auxiliary_loss_mlp": 0.01006558, + "balance_loss_clip": 1.00549126, + "balance_loss_mlp": 1.01085198, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7737212884291378, + "language_loss": 0.54199207, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56238955, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.22363281, + "step": 8744, + "time_per_iteration": 3.1759095191955566 + }, + { + "auxiliary_loss_clip": 0.01114357, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.01773655, + "balance_loss_mlp": 1.04095125, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.20739797588364, + "language_loss": 0.75574982, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77720833, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8745, + "time_per_iteration": 2.447798728942871 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02291942, + "balance_loss_mlp": 1.03964972, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.9635902719056224, + "language_loss": 0.80408484, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.82552993, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8746, + "time_per_iteration": 2.4415667057037354 + }, + { + "auxiliary_loss_clip": 0.01107231, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.02002132, + "balance_loss_mlp": 1.03842771, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.8094795225841998, + "language_loss": 0.75289273, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77429175, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8747, + "time_per_iteration": 2.4909451007843018 + }, + { + "auxiliary_loss_clip": 0.01103122, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03701103, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.3964471896172554, + "language_loss": 0.82515085, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84647602, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 8748, + "time_per_iteration": 2.4266607761383057 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.01948202, + "balance_loss_mlp": 1.03713202, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9711847853488498, + "language_loss": 0.80562335, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82703364, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8749, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01108885, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.02076626, + "balance_loss_mlp": 1.04021406, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.712765899743528, + "language_loss": 0.72119522, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74262118, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8750, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.01105706, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.0166955, + "balance_loss_mlp": 1.03688407, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3484208983844765, + "language_loss": 0.76440692, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78575456, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8751, + "time_per_iteration": 2.49141788482666 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.01973987, + "balance_loss_mlp": 1.03969383, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.3930828226372818, + "language_loss": 0.75950229, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78088653, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 8752, + "time_per_iteration": 2.4891488552093506 + }, + { + "auxiliary_loss_clip": 0.01110452, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.01418078, + "balance_loss_mlp": 1.03927755, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.4497375157025647, + "language_loss": 0.6776315, + "learning_rate": 1.926992158720058e-06, + "loss": 0.69901145, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8753, + "time_per_iteration": 2.5364086627960205 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.02072024, + "balance_loss_mlp": 1.04052699, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4822261150811287, + "language_loss": 0.83834231, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.85975981, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 8754, + "time_per_iteration": 2.4782354831695557 + }, + { + "auxiliary_loss_clip": 0.01108303, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821733, + "balance_loss_mlp": 1.03804278, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.116384687985529, + "language_loss": 0.8708753, + "learning_rate": 1.926213760058522e-06, + "loss": 0.8922683, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8755, + "time_per_iteration": 2.426422357559204 + }, + { + "auxiliary_loss_clip": 0.01031717, + "auxiliary_loss_mlp": 0.01000414, + "balance_loss_clip": 0.99934119, + "balance_loss_mlp": 1.0092082, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7185760813251492, + "language_loss": 0.58853483, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60885608, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8756, + "time_per_iteration": 3.1429710388183594 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02052546, + "balance_loss_mlp": 1.03787899, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 4.297833550953773, + "language_loss": 0.70166421, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72309285, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8757, + "time_per_iteration": 2.4352152347564697 + }, + { + "auxiliary_loss_clip": 0.01108207, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01828289, + "balance_loss_mlp": 1.03741014, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.637312529409449, + "language_loss": 0.8773526, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89874113, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8758, + "time_per_iteration": 2.4447832107543945 + }, + { + "auxiliary_loss_clip": 0.0110992, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02048147, + "balance_loss_mlp": 1.03790975, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.3883962898678874, + "language_loss": 0.76014191, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78157705, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8759, + "time_per_iteration": 2.4818501472473145 + }, + { + "auxiliary_loss_clip": 0.01102801, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.01460838, + "balance_loss_mlp": 1.0357269, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.9978294175433113, + "language_loss": 0.71896535, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.74025965, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8760, + "time_per_iteration": 3.8544509410858154 + }, + { + "auxiliary_loss_clip": 0.01113013, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.02161074, + "balance_loss_mlp": 1.03947306, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 1.9164441807727424, + "language_loss": 0.76221085, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78368914, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8761, + "time_per_iteration": 2.43031907081604 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.01879597, + "balance_loss_mlp": 1.03958154, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.712095639698782, + "language_loss": 0.70643085, + "learning_rate": 1.923489453654373e-06, + "loss": 0.7278201, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8762, + "time_per_iteration": 5.321688652038574 + }, + { + "auxiliary_loss_clip": 0.01031212, + "auxiliary_loss_mlp": 0.0100382, + "balance_loss_clip": 1.00266957, + "balance_loss_mlp": 1.00896931, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9468786857883086, + "language_loss": 0.65414345, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67449379, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22265625, + "step": 8763, + "time_per_iteration": 4.360533237457275 + }, + { + "auxiliary_loss_clip": 0.0110798, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03798556, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.6073395480000416, + "language_loss": 0.70771408, + "learning_rate": 1.922711106286265e-06, + "loss": 0.72911114, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 8764, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.0110759, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.01640153, + "balance_loss_mlp": 1.03704798, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.6766716538329436, + "language_loss": 0.74135405, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76272404, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8765, + "time_per_iteration": 2.4344265460968018 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.0194571, + "balance_loss_mlp": 1.03650451, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4935943977467754, + "language_loss": 0.85193348, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87336564, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8766, + "time_per_iteration": 2.52951979637146 + }, + { + "auxiliary_loss_clip": 0.0111099, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0204674, + "balance_loss_mlp": 1.03980124, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7377061989269131, + "language_loss": 0.79036993, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8118161, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8767, + "time_per_iteration": 2.4478976726531982 + }, + { + "auxiliary_loss_clip": 0.0110965, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.01897943, + "balance_loss_mlp": 1.03842282, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.871676480421452, + "language_loss": 0.73691523, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75833523, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8768, + "time_per_iteration": 2.464952230453491 + }, + { + "auxiliary_loss_clip": 0.01106727, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.02222896, + "balance_loss_mlp": 1.03777611, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 3.4895191769574354, + "language_loss": 0.74093413, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76233703, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8769, + "time_per_iteration": 2.4464261531829834 + }, + { + "auxiliary_loss_clip": 0.01108124, + "auxiliary_loss_mlp": 0.01036366, + "balance_loss_clip": 1.02372384, + "balance_loss_mlp": 1.03890908, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.6831893733690892, + "language_loss": 0.7382611, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75970602, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8770, + "time_per_iteration": 2.4870028495788574 + }, + { + "auxiliary_loss_clip": 0.01110065, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01798415, + "balance_loss_mlp": 1.03966439, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.642757388746556, + "language_loss": 0.68108106, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70248735, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8771, + "time_per_iteration": 2.5180561542510986 + }, + { + "auxiliary_loss_clip": 0.01106371, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.02130556, + "balance_loss_mlp": 1.03755426, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.8518077177131755, + "language_loss": 0.76476532, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78617108, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8772, + "time_per_iteration": 2.491196870803833 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.0271337, + "balance_loss_mlp": 1.03862512, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.8756798124264933, + "language_loss": 0.65986812, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68137372, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8773, + "time_per_iteration": 2.464393138885498 + }, + { + "auxiliary_loss_clip": 0.01109322, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.02802014, + "balance_loss_mlp": 1.03791332, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.5758079694219151, + "language_loss": 0.86029238, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88178039, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.71484375, + "step": 8774, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.01105827, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01860261, + "balance_loss_mlp": 1.03663182, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.5254562165137588, + "language_loss": 0.79877412, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82013589, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8775, + "time_per_iteration": 2.454387664794922 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.02507758, + "balance_loss_mlp": 1.03681672, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7390352493983339, + "language_loss": 0.83807105, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85949761, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8776, + "time_per_iteration": 2.5026144981384277 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03759074, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.6658876230443522, + "language_loss": 0.68375832, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8777, + "time_per_iteration": 2.417186975479126 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.0258069, + "balance_loss_mlp": 1.04009652, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 2.132165937202497, + "language_loss": 0.82494706, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84640491, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8778, + "time_per_iteration": 2.487772226333618 + }, + { + "auxiliary_loss_clip": 0.01110776, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02037513, + "balance_loss_mlp": 1.04014647, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 2.126071455139116, + "language_loss": 0.79359961, + "learning_rate": 1.916873882856013e-06, + "loss": 0.8150422, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8779, + "time_per_iteration": 2.4676833152770996 + }, + { + "auxiliary_loss_clip": 0.01102313, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01942825, + "balance_loss_mlp": 1.03535295, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.916693496001438, + "language_loss": 0.7667526, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78808951, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 8780, + "time_per_iteration": 2.489880323410034 + }, + { + "auxiliary_loss_clip": 0.01113237, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01724982, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.5814481661794648, + "language_loss": 0.69506466, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71650016, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8781, + "time_per_iteration": 2.570308208465576 + }, + { + "auxiliary_loss_clip": 0.01105161, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02130246, + "balance_loss_mlp": 1.03748012, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.5392288400315197, + "language_loss": 0.72434068, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74571753, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 8782, + "time_per_iteration": 2.4902799129486084 + }, + { + "auxiliary_loss_clip": 0.01104346, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01557565, + "balance_loss_mlp": 1.03629112, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.9147695733655095, + "language_loss": 0.68684381, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70816237, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8783, + "time_per_iteration": 2.4489378929138184 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.02257824, + "balance_loss_mlp": 1.04052663, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.8253305439767769, + "language_loss": 0.69502926, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71655798, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 8784, + "time_per_iteration": 2.55877947807312 + }, + { + "auxiliary_loss_clip": 0.0111041, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01936626, + "balance_loss_mlp": 1.03718495, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.137542562274274, + "language_loss": 0.75317723, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77460963, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8785, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01858115, + "balance_loss_mlp": 1.03923512, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.5356836172740989, + "language_loss": 0.8301636, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85157377, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8786, + "time_per_iteration": 2.530207872390747 + }, + { + "auxiliary_loss_clip": 0.01102608, + "auxiliary_loss_mlp": 0.01023798, + "balance_loss_clip": 1.01268828, + "balance_loss_mlp": 1.03662145, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 6.419117505425037, + "language_loss": 0.8292653, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85052931, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 8787, + "time_per_iteration": 2.450303792953491 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.01601219, + "balance_loss_mlp": 1.03739762, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.657610649379585, + "language_loss": 0.83385652, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85517776, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8788, + "time_per_iteration": 2.4752538204193115 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.02353776, + "balance_loss_mlp": 1.04022217, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.6616469699693164, + "language_loss": 0.7467941, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.76823682, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.671875, + "step": 8789, + "time_per_iteration": 2.5324580669403076 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.01999021, + "balance_loss_mlp": 1.03898668, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.4692396487834778, + "language_loss": 0.69505095, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71647108, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8790, + "time_per_iteration": 2.51625919342041 + }, + { + "auxiliary_loss_clip": 0.01104373, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01602292, + "balance_loss_mlp": 1.03740895, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5973748463846205, + "language_loss": 0.78992987, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81125033, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 8791, + "time_per_iteration": 2.4552273750305176 + }, + { + "auxiliary_loss_clip": 0.01108186, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01800871, + "balance_loss_mlp": 1.04050541, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.8738977568036352, + "language_loss": 0.66256213, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68394351, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8792, + "time_per_iteration": 2.485501527786255 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.02362621, + "balance_loss_mlp": 1.03610563, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 2.0158719758485226, + "language_loss": 0.79919344, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82057893, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8793, + "time_per_iteration": 2.4918789863586426 + }, + { + "auxiliary_loss_clip": 0.01108596, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02393866, + "balance_loss_mlp": 1.03883982, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 1.8030848585204593, + "language_loss": 0.84791529, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86936802, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8794, + "time_per_iteration": 2.451828718185425 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01894772, + "balance_loss_mlp": 1.03798628, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 1.927550813134725, + "language_loss": 0.67570889, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69714004, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8795, + "time_per_iteration": 2.4460599422454834 + }, + { + "auxiliary_loss_clip": 0.01107843, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03754616, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.883468232968509, + "language_loss": 0.80662012, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82799256, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8796, + "time_per_iteration": 2.4592626094818115 + }, + { + "auxiliary_loss_clip": 0.01112299, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.01978111, + "balance_loss_mlp": 1.04186153, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.9732503530858911, + "language_loss": 0.69071984, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71216959, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8797, + "time_per_iteration": 2.4451231956481934 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02128911, + "balance_loss_mlp": 1.03739119, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.7017381786261847, + "language_loss": 0.82339096, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84474969, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 8798, + "time_per_iteration": 2.4694111347198486 + }, + { + "auxiliary_loss_clip": 0.01111092, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.02102065, + "balance_loss_mlp": 1.03840899, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 2.0619187329461575, + "language_loss": 0.70591879, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72737336, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8799, + "time_per_iteration": 2.456692695617676 + }, + { + "auxiliary_loss_clip": 0.01104599, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.02132988, + "balance_loss_mlp": 1.03975451, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.8240531153484045, + "language_loss": 0.69601536, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71738708, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 8800, + "time_per_iteration": 2.490417242050171 + }, + { + "auxiliary_loss_clip": 0.01036269, + "auxiliary_loss_mlp": 0.01012691, + "balance_loss_clip": 1.01148117, + "balance_loss_mlp": 1.01404071, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.998441198923784, + "language_loss": 0.57013941, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59062898, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.22265625, + "step": 8801, + "time_per_iteration": 4.385375022888184 + }, + { + "auxiliary_loss_clip": 0.01109021, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.02168214, + "balance_loss_mlp": 1.03874719, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5128121202389628, + "language_loss": 0.63942313, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66085106, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 8802, + "time_per_iteration": 2.5486578941345215 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01684964, + "balance_loss_mlp": 1.03677487, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.7172902320691381, + "language_loss": 0.68250531, + "learning_rate": 1.907535821289003e-06, + "loss": 0.70384604, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8803, + "time_per_iteration": 2.576460361480713 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02233613, + "balance_loss_mlp": 1.03654003, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6769492859989101, + "language_loss": 0.76551962, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78689635, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 8804, + "time_per_iteration": 4.018502473831177 + }, + { + "auxiliary_loss_clip": 0.0103564, + "auxiliary_loss_mlp": 0.01005394, + "balance_loss_clip": 1.00417256, + "balance_loss_mlp": 1.01327515, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.749734320345171, + "language_loss": 0.53018034, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55059063, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.22363281, + "step": 8805, + "time_per_iteration": 4.599541902542114 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01001215, + "balance_loss_clip": 0.99995738, + "balance_loss_mlp": 1.0124402, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7381494507925852, + "language_loss": 0.63778675, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65814722, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22460938, + "step": 8806, + "time_per_iteration": 3.067852735519409 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03770947, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.9894097123133165, + "language_loss": 0.72397399, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74542046, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 8807, + "time_per_iteration": 2.4303808212280273 + }, + { + "auxiliary_loss_clip": 0.0110442, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03735805, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 4.619049711580288, + "language_loss": 0.69640231, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71773779, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8808, + "time_per_iteration": 2.418649435043335 + }, + { + "auxiliary_loss_clip": 0.01105928, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.01817942, + "balance_loss_mlp": 1.03796387, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.7756221154666856, + "language_loss": 0.8668943, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88825089, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8809, + "time_per_iteration": 2.413883686065674 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.0209322, + "balance_loss_mlp": 1.03908372, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.608353260814621, + "language_loss": 0.64362073, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66509026, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8810, + "time_per_iteration": 2.6121585369110107 + }, + { + "auxiliary_loss_clip": 0.0110573, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.02372456, + "balance_loss_mlp": 1.03820479, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.5055977388002117, + "language_loss": 0.68083066, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70224369, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8811, + "time_per_iteration": 2.4806406497955322 + }, + { + "auxiliary_loss_clip": 0.010328, + "auxiliary_loss_mlp": 0.00998698, + "balance_loss_clip": 0.99739295, + "balance_loss_mlp": 1.01059103, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6652461754552681, + "language_loss": 0.53400505, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.5543201, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22265625, + "step": 8812, + "time_per_iteration": 3.175478458404541 + }, + { + "auxiliary_loss_clip": 0.01032825, + "auxiliary_loss_mlp": 0.01000267, + "balance_loss_clip": 0.99906272, + "balance_loss_mlp": 1.01074851, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7207460213448722, + "language_loss": 0.56372511, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58405602, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.22070312, + "step": 8813, + "time_per_iteration": 3.1315269470214844 + }, + { + "auxiliary_loss_clip": 0.01102589, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.01751852, + "balance_loss_mlp": 1.03824615, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.5478508872520975, + "language_loss": 0.81618506, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.8375001, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 8814, + "time_per_iteration": 2.431269884109497 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04241931, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.5843849623618003, + "language_loss": 0.84997016, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.8713944, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8815, + "time_per_iteration": 2.531074285507202 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01968288, + "balance_loss_mlp": 1.03940964, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.126267576495584, + "language_loss": 0.66768968, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68905437, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8816, + "time_per_iteration": 2.525468111038208 + }, + { + "auxiliary_loss_clip": 0.01107527, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.0210259, + "balance_loss_mlp": 1.03860188, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.7854125043951103, + "language_loss": 0.72206688, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74347246, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8817, + "time_per_iteration": 2.6937406063079834 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.01921499, + "balance_loss_mlp": 1.03620088, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6863401200151742, + "language_loss": 0.6522249, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67360961, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 8818, + "time_per_iteration": 2.509539842605591 + }, + { + "auxiliary_loss_clip": 0.0110849, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.01462412, + "balance_loss_mlp": 1.0393914, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.999877555758676, + "language_loss": 0.75154972, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77290833, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8819, + "time_per_iteration": 2.473130702972412 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.0214107, + "balance_loss_mlp": 1.03858495, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.27674417450437, + "language_loss": 0.82333302, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84477413, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 8820, + "time_per_iteration": 2.4328434467315674 + }, + { + "auxiliary_loss_clip": 0.01106236, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.02030122, + "balance_loss_mlp": 1.03725612, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 2.049749716635941, + "language_loss": 0.72593045, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74730772, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 8821, + "time_per_iteration": 2.508608102798462 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.0363605, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.3923419148404492, + "language_loss": 0.73939008, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76070547, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8822, + "time_per_iteration": 2.4427592754364014 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.02312553, + "balance_loss_mlp": 1.03773904, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6902308577802683, + "language_loss": 0.67477053, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69620097, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8823, + "time_per_iteration": 2.5047175884246826 + }, + { + "auxiliary_loss_clip": 0.0110955, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.02113247, + "balance_loss_mlp": 1.03756142, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.5189625554392572, + "language_loss": 0.69347805, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71491873, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8824, + "time_per_iteration": 2.4358925819396973 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01879227, + "balance_loss_mlp": 1.03755724, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 2.2315847136946956, + "language_loss": 0.75412273, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77547044, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 8825, + "time_per_iteration": 2.480656385421753 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.02433622, + "balance_loss_mlp": 1.03730893, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.0577399670241125, + "language_loss": 0.85668242, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87810326, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8826, + "time_per_iteration": 2.422227621078491 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01760268, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.3501660325975628, + "language_loss": 0.64042354, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66176176, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 8827, + "time_per_iteration": 2.461434841156006 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.03835428, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.5699076783392119, + "language_loss": 0.60028976, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62176144, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8828, + "time_per_iteration": 2.621673107147217 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03909802, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.7449235888895405, + "language_loss": 0.81386358, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83527148, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8829, + "time_per_iteration": 2.472055673599243 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.02196574, + "balance_loss_mlp": 1.03871477, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.483207387046285, + "language_loss": 0.78292549, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80433053, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 8830, + "time_per_iteration": 2.4544272422790527 + }, + { + "auxiliary_loss_clip": 0.01106311, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01877975, + "balance_loss_mlp": 1.03778768, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.0257257472461525, + "language_loss": 0.80643964, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82781464, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8831, + "time_per_iteration": 2.4307594299316406 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.03561974, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 2.026603228036347, + "language_loss": 0.73146117, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75278628, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8832, + "time_per_iteration": 2.429567813873291 + }, + { + "auxiliary_loss_clip": 0.01111675, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02162194, + "balance_loss_mlp": 1.04065752, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.9229428073701915, + "language_loss": 0.75382435, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77528179, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8833, + "time_per_iteration": 2.4731011390686035 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.01733804, + "balance_loss_mlp": 1.03697777, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.9718581367947616, + "language_loss": 0.73314357, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75452387, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8834, + "time_per_iteration": 2.476289987564087 + }, + { + "auxiliary_loss_clip": 0.01113252, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.01980758, + "balance_loss_mlp": 1.03958392, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 2.0084943443028975, + "language_loss": 0.77603996, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79750997, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 8835, + "time_per_iteration": 2.512998104095459 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.02318025, + "balance_loss_mlp": 1.03647518, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.8374817013403106, + "language_loss": 0.72753531, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74896735, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8836, + "time_per_iteration": 2.4509310722351074 + }, + { + "auxiliary_loss_clip": 0.01108843, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03784788, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 2.66525227198108, + "language_loss": 0.80936503, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83078802, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8837, + "time_per_iteration": 2.471662759780884 + }, + { + "auxiliary_loss_clip": 0.0110708, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01615477, + "balance_loss_mlp": 1.03874159, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.8452061032611426, + "language_loss": 0.85926068, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88061881, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8838, + "time_per_iteration": 2.4360713958740234 + }, + { + "auxiliary_loss_clip": 0.011058, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.01779366, + "balance_loss_mlp": 1.03785229, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.8224224127823847, + "language_loss": 0.7208544, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74220788, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8839, + "time_per_iteration": 2.4806606769561768 + }, + { + "auxiliary_loss_clip": 0.01106476, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02167034, + "balance_loss_mlp": 1.03606987, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.889571361745381, + "language_loss": 0.76674354, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78814822, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8840, + "time_per_iteration": 2.47389817237854 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.01798964, + "balance_loss_mlp": 1.03678751, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 1.9758748106511805, + "language_loss": 0.77377498, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79517406, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8841, + "time_per_iteration": 2.4235799312591553 + }, + { + "auxiliary_loss_clip": 0.0103176, + "auxiliary_loss_mlp": 0.01011801, + "balance_loss_clip": 1.01060319, + "balance_loss_mlp": 1.00937963, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6971901974616477, + "language_loss": 0.56793272, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.5883683, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.22363281, + "step": 8842, + "time_per_iteration": 3.1749658584594727 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.03839254, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7048374639197847, + "language_loss": 0.73877072, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76025677, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8843, + "time_per_iteration": 3.7764668464660645 + }, + { + "auxiliary_loss_clip": 0.01031369, + "auxiliary_loss_mlp": 0.01005783, + "balance_loss_clip": 1.00454903, + "balance_loss_mlp": 1.0092088, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8754586803272454, + "language_loss": 0.61063367, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63100517, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.22265625, + "step": 8844, + "time_per_iteration": 3.1397178173065186 + }, + { + "auxiliary_loss_clip": 0.01031644, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 0.9997676, + "balance_loss_mlp": 1.00950778, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.9433503667086528, + "language_loss": 0.62195891, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64228451, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22167969, + "step": 8845, + "time_per_iteration": 3.0431036949157715 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.01908851, + "balance_loss_mlp": 1.0369339, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.021195915673457, + "language_loss": 0.7583214, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77972758, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 8846, + "time_per_iteration": 5.309458017349243 + }, + { + "auxiliary_loss_clip": 0.01106825, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01994586, + "balance_loss_mlp": 1.03744686, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5634287795910362, + "language_loss": 0.75384724, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.775231, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 8847, + "time_per_iteration": 2.4939441680908203 + }, + { + "auxiliary_loss_clip": 0.01104626, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.01720405, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.798053797011527, + "language_loss": 0.87663037, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89797276, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8848, + "time_per_iteration": 2.417572498321533 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03765666, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.6565378723095834, + "language_loss": 0.74641025, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76784182, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.703125, + "step": 8849, + "time_per_iteration": 2.4509243965148926 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01655602, + "balance_loss_mlp": 1.03593016, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 2.164126567755358, + "language_loss": 0.79812169, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81949031, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8850, + "time_per_iteration": 2.45766544342041 + }, + { + "auxiliary_loss_clip": 0.01104904, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.01615, + "balance_loss_mlp": 1.03538489, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4483393548737078, + "language_loss": 0.54913849, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57047582, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8851, + "time_per_iteration": 2.607548713684082 + }, + { + "auxiliary_loss_clip": 0.01110841, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.01821876, + "balance_loss_mlp": 1.03916895, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.7052679387317837, + "language_loss": 0.68385565, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70526993, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 8852, + "time_per_iteration": 2.4444568157196045 + }, + { + "auxiliary_loss_clip": 0.01031832, + "auxiliary_loss_mlp": 0.00999979, + "balance_loss_clip": 0.9987337, + "balance_loss_mlp": 1.00941014, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8061011864926959, + "language_loss": 0.62881088, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64912903, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.22460938, + "step": 8853, + "time_per_iteration": 3.0409493446350098 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01691902, + "balance_loss_mlp": 1.03633368, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.2642894326377196, + "language_loss": 0.79002404, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81140411, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8854, + "time_per_iteration": 2.4175822734832764 + }, + { + "auxiliary_loss_clip": 0.01103338, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01717186, + "balance_loss_mlp": 1.03635907, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.6616394070358602, + "language_loss": 0.73815715, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75947511, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 8855, + "time_per_iteration": 2.5298781394958496 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.01806259, + "balance_loss_mlp": 1.03597307, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.9409120124024815, + "language_loss": 0.64495003, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66629064, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8856, + "time_per_iteration": 2.483076333999634 + }, + { + "auxiliary_loss_clip": 0.01111855, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02166665, + "balance_loss_mlp": 1.03986931, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.6437419686120303, + "language_loss": 0.77630389, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79777324, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8857, + "time_per_iteration": 2.534383773803711 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.03602767, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.590488147317335, + "language_loss": 0.71136224, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73275089, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8858, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.02001405, + "balance_loss_mlp": 1.03818965, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.5574852735183802, + "language_loss": 0.69423437, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71565467, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8859, + "time_per_iteration": 2.5293610095977783 + }, + { + "auxiliary_loss_clip": 0.01104952, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.01622272, + "balance_loss_mlp": 1.03947163, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.5500879507245162, + "language_loss": 0.69682205, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71814591, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 8860, + "time_per_iteration": 2.453315019607544 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.02027583, + "balance_loss_mlp": 1.03789401, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.830505462704671, + "language_loss": 0.78035998, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80173862, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8861, + "time_per_iteration": 2.4910025596618652 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.02470744, + "balance_loss_mlp": 1.03778684, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 3.045684614472066, + "language_loss": 0.85532111, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87677932, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8862, + "time_per_iteration": 2.4594204425811768 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02181435, + "balance_loss_mlp": 1.03708994, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 2.155580167277434, + "language_loss": 0.61776686, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63920593, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 8863, + "time_per_iteration": 2.431844472885132 + }, + { + "auxiliary_loss_clip": 0.01107834, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.02161896, + "balance_loss_mlp": 1.03979647, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.808986842092349, + "language_loss": 0.73174077, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7531504, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8864, + "time_per_iteration": 2.481052875518799 + }, + { + "auxiliary_loss_clip": 0.01106149, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.02406275, + "balance_loss_mlp": 1.03704095, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.770075213018519, + "language_loss": 0.64782691, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66924965, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8865, + "time_per_iteration": 2.5422523021698 + }, + { + "auxiliary_loss_clip": 0.01106424, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.0374155, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.6788966461131323, + "language_loss": 0.78194928, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80331147, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8866, + "time_per_iteration": 2.4783847332000732 + }, + { + "auxiliary_loss_clip": 0.01106298, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02127266, + "balance_loss_mlp": 1.03756702, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 2.4645319902700136, + "language_loss": 0.73618174, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75758052, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8867, + "time_per_iteration": 2.4607431888580322 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.03957081, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.616723113347984, + "language_loss": 0.72235525, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7438435, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8868, + "time_per_iteration": 2.6005828380584717 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02044368, + "balance_loss_mlp": 1.03801441, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.8848687711222403, + "language_loss": 0.78688312, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80828476, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8869, + "time_per_iteration": 2.527679681777954 + }, + { + "auxiliary_loss_clip": 0.01112421, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.02396262, + "balance_loss_mlp": 1.03942657, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.8336580730917733, + "language_loss": 0.75656843, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.7780599, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8870, + "time_per_iteration": 2.408651113510132 + }, + { + "auxiliary_loss_clip": 0.01112864, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.024266, + "balance_loss_mlp": 1.04069293, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.8439379115111716, + "language_loss": 0.75255805, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77406549, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8871, + "time_per_iteration": 2.501173257827759 + }, + { + "auxiliary_loss_clip": 0.01109454, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02014637, + "balance_loss_mlp": 1.03973055, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.7881983016452072, + "language_loss": 0.72249746, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74391973, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8872, + "time_per_iteration": 2.4058215618133545 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04132104, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.7441588702127815, + "language_loss": 0.65051317, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67199636, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8873, + "time_per_iteration": 2.4598374366760254 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.02549779, + "balance_loss_mlp": 1.03951979, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 7.037025883542546, + "language_loss": 0.80012232, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82158732, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8874, + "time_per_iteration": 2.43198299407959 + }, + { + "auxiliary_loss_clip": 0.0110808, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.01874542, + "balance_loss_mlp": 1.03897262, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 2.558835697133273, + "language_loss": 0.70077014, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72216594, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8875, + "time_per_iteration": 2.4746885299682617 + }, + { + "auxiliary_loss_clip": 0.01033299, + "auxiliary_loss_mlp": 0.01014121, + "balance_loss_clip": 1.01290536, + "balance_loss_mlp": 1.01076412, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7274620052615154, + "language_loss": 0.59653223, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61700642, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.22460938, + "step": 8876, + "time_per_iteration": 3.1654725074768066 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.01971292, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.894052458703423, + "language_loss": 0.74833322, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76972401, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8877, + "time_per_iteration": 2.4836068153381348 + }, + { + "auxiliary_loss_clip": 0.01032923, + "auxiliary_loss_mlp": 0.0100501, + "balance_loss_clip": 1.00374663, + "balance_loss_mlp": 1.01051378, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7537185456157387, + "language_loss": 0.57229304, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59267235, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22460938, + "step": 8878, + "time_per_iteration": 2.9712772369384766 + }, + { + "auxiliary_loss_clip": 0.01111898, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02161908, + "balance_loss_mlp": 1.04023981, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.4246995459674998, + "language_loss": 0.72007561, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74154353, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8879, + "time_per_iteration": 2.5073280334472656 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.01706791, + "balance_loss_mlp": 1.03980017, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.331544880776984, + "language_loss": 0.8328526, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85425603, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8880, + "time_per_iteration": 2.4154322147369385 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.02200413, + "balance_loss_mlp": 1.03857374, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.3819058164028981, + "language_loss": 0.79567689, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81707799, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8881, + "time_per_iteration": 2.4748446941375732 + }, + { + "auxiliary_loss_clip": 0.01032611, + "auxiliary_loss_mlp": 0.00999583, + "balance_loss_clip": 0.99825948, + "balance_loss_mlp": 1.01026177, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7951386121617492, + "language_loss": 0.59243226, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61275423, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.22363281, + "step": 8882, + "time_per_iteration": 3.0554563999176025 + }, + { + "auxiliary_loss_clip": 0.01032284, + "auxiliary_loss_mlp": 0.01002778, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.00965989, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8657705918333868, + "language_loss": 0.63714904, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65749967, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2265625, + "step": 8883, + "time_per_iteration": 2.8666210174560547 + }, + { + "auxiliary_loss_clip": 0.01112111, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01528406, + "balance_loss_mlp": 1.04020667, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.5638154038033334, + "language_loss": 0.82000816, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84141463, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8884, + "time_per_iteration": 3.910738706588745 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0165143, + "balance_loss_mlp": 1.03923178, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.647799538914853, + "language_loss": 0.7224586, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74379575, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 8885, + "time_per_iteration": 2.4330668449401855 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01816654, + "balance_loss_mlp": 1.03816104, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 1.9571098005847307, + "language_loss": 0.78834218, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80977666, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8886, + "time_per_iteration": 2.4285924434661865 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01453424, + "balance_loss_mlp": 1.03859282, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.4869737557636773, + "language_loss": 0.74745071, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76882267, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 8887, + "time_per_iteration": 5.458622932434082 + }, + { + "auxiliary_loss_clip": 0.01106415, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01351762, + "balance_loss_mlp": 1.03839684, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 1.9580001729257437, + "language_loss": 0.68680072, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.70812452, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8888, + "time_per_iteration": 3.871016263961792 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.02426004, + "balance_loss_mlp": 1.03957748, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.039365083298093, + "language_loss": 0.77427757, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79580867, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 8889, + "time_per_iteration": 2.4321072101593018 + }, + { + "auxiliary_loss_clip": 0.01107574, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.02382183, + "balance_loss_mlp": 1.03896809, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.7896399215033527, + "language_loss": 0.68882942, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71026921, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 8890, + "time_per_iteration": 2.4512557983398438 + }, + { + "auxiliary_loss_clip": 0.01112757, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.02532864, + "balance_loss_mlp": 1.03882933, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 3.075420511300943, + "language_loss": 0.77339637, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79491955, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73828125, + "step": 8891, + "time_per_iteration": 2.4134135246276855 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01904237, + "balance_loss_mlp": 1.03873932, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5298342127178157, + "language_loss": 0.73841035, + "learning_rate": 1.872926414425699e-06, + "loss": 0.75977939, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 8892, + "time_per_iteration": 2.4843709468841553 + }, + { + "auxiliary_loss_clip": 0.0110608, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01874948, + "balance_loss_mlp": 1.03663301, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.5614617741562322, + "language_loss": 0.88069522, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90206861, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8893, + "time_per_iteration": 2.445389747619629 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.01968718, + "balance_loss_mlp": 1.03617978, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.5898186397759002, + "language_loss": 0.72623652, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74758679, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8894, + "time_per_iteration": 2.475914239883423 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01774812, + "balance_loss_mlp": 1.03794241, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 2.053516557339631, + "language_loss": 0.74730217, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.7686727, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 8895, + "time_per_iteration": 2.4524707794189453 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01558769, + "balance_loss_mlp": 1.03688455, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.7004701648033584, + "language_loss": 0.76999986, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79134524, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8896, + "time_per_iteration": 2.4727749824523926 + }, + { + "auxiliary_loss_clip": 0.01105321, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.01200807, + "balance_loss_mlp": 1.03771544, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.674513516034323, + "language_loss": 0.78698516, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80828726, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8897, + "time_per_iteration": 2.437924861907959 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.015589, + "balance_loss_mlp": 1.04029751, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.8516386867396797, + "language_loss": 0.75758165, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77897102, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8898, + "time_per_iteration": 2.4490232467651367 + }, + { + "auxiliary_loss_clip": 0.0103315, + "auxiliary_loss_mlp": 0.00997269, + "balance_loss_clip": 0.99616033, + "balance_loss_mlp": 1.01073837, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8534656988697606, + "language_loss": 0.58027738, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60058159, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22460938, + "step": 8899, + "time_per_iteration": 3.2222988605499268 + }, + { + "auxiliary_loss_clip": 0.01105996, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.01625824, + "balance_loss_mlp": 1.03779793, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.754025350675293, + "language_loss": 0.69734174, + "learning_rate": 1.869817171696868e-06, + "loss": 0.7186892, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8900, + "time_per_iteration": 2.5348854064941406 + }, + { + "auxiliary_loss_clip": 0.01109931, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.01857448, + "balance_loss_mlp": 1.03874683, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.712056344952118, + "language_loss": 0.71436262, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73577476, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8901, + "time_per_iteration": 2.486694097518921 + }, + { + "auxiliary_loss_clip": 0.01108252, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01632655, + "balance_loss_mlp": 1.03779531, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.0243685582186477, + "language_loss": 0.77403963, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79541337, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8902, + "time_per_iteration": 2.4521291255950928 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.02103007, + "balance_loss_mlp": 1.03727639, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.5596437382067054, + "language_loss": 0.69763452, + "learning_rate": 1.868651286721281e-06, + "loss": 0.71899128, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 8903, + "time_per_iteration": 2.4639296531677246 + }, + { + "auxiliary_loss_clip": 0.01111291, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.02613187, + "balance_loss_mlp": 1.03885889, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.4813880450748405, + "language_loss": 0.71867597, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74017799, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 8904, + "time_per_iteration": 2.518556833267212 + }, + { + "auxiliary_loss_clip": 0.01109721, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.0223856, + "balance_loss_mlp": 1.03955388, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.7385404274740348, + "language_loss": 0.73125184, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75270438, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8905, + "time_per_iteration": 2.481398582458496 + }, + { + "auxiliary_loss_clip": 0.01103053, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02244139, + "balance_loss_mlp": 1.03704035, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4036286343955833, + "language_loss": 0.83569062, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85705423, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 8906, + "time_per_iteration": 2.4822022914886475 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.02053666, + "balance_loss_mlp": 1.03906655, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 3.1110381495397688, + "language_loss": 0.74120319, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76264668, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8907, + "time_per_iteration": 2.4488067626953125 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.01639736, + "balance_loss_mlp": 1.03933167, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.8326240405987804, + "language_loss": 0.77272546, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79410505, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8908, + "time_per_iteration": 2.5009818077087402 + }, + { + "auxiliary_loss_clip": 0.01111027, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.02252579, + "balance_loss_mlp": 1.039222, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 9.969716540759343, + "language_loss": 0.7407465, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.7622152, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8909, + "time_per_iteration": 2.4272916316986084 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02267265, + "balance_loss_mlp": 1.04071856, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 1.9518435489791055, + "language_loss": 0.841941, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86339062, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8910, + "time_per_iteration": 2.4678404331207275 + }, + { + "auxiliary_loss_clip": 0.01109272, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.0175302, + "balance_loss_mlp": 1.03802073, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.5065365564315203, + "language_loss": 0.81728303, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83868158, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8911, + "time_per_iteration": 2.482515335083008 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02217102, + "balance_loss_mlp": 1.03894281, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.8795354415042287, + "language_loss": 0.6902765, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71169335, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 8912, + "time_per_iteration": 2.489625930786133 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02269232, + "balance_loss_mlp": 1.04099894, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.778457710383864, + "language_loss": 0.71355128, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73501396, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8913, + "time_per_iteration": 2.4120781421661377 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.02322149, + "balance_loss_mlp": 1.04114628, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.8082872891744106, + "language_loss": 0.72335684, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7448622, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8914, + "time_per_iteration": 2.466946840286255 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.02528632, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.2402764225711915, + "language_loss": 0.70448041, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72602755, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8915, + "time_per_iteration": 2.5281713008880615 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.02335119, + "balance_loss_mlp": 1.03934813, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 4.884439280571106, + "language_loss": 0.75188339, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77333617, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 8916, + "time_per_iteration": 2.4901540279388428 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.01685333, + "balance_loss_mlp": 1.03908181, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.001008974250462, + "language_loss": 0.72230595, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74370885, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8917, + "time_per_iteration": 2.5355899333953857 + }, + { + "auxiliary_loss_clip": 0.01109638, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02460611, + "balance_loss_mlp": 1.04033589, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.185479233449534, + "language_loss": 0.71158117, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73305333, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8918, + "time_per_iteration": 2.497854709625244 + }, + { + "auxiliary_loss_clip": 0.011106, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.02387154, + "balance_loss_mlp": 1.04111099, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.4281907235735687, + "language_loss": 0.75156265, + "learning_rate": 1.862434000299067e-06, + "loss": 0.7730338, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8919, + "time_per_iteration": 2.4522061347961426 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.0192163, + "balance_loss_mlp": 1.0374527, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.9146697385716565, + "language_loss": 0.71194351, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73334807, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8920, + "time_per_iteration": 2.4363694190979004 + }, + { + "auxiliary_loss_clip": 0.01106889, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01886892, + "balance_loss_mlp": 1.03738046, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.417495166440162, + "language_loss": 0.68572164, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.7071088, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8921, + "time_per_iteration": 2.659815788269043 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.02357066, + "balance_loss_mlp": 1.04096341, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.806007791508249, + "language_loss": 0.81778204, + "learning_rate": 1.86126840594594e-06, + "loss": 0.83925164, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8922, + "time_per_iteration": 2.4896881580352783 + }, + { + "auxiliary_loss_clip": 0.01109712, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01601934, + "balance_loss_mlp": 1.03847456, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9048762186543056, + "language_loss": 0.76640022, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78778023, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 8923, + "time_per_iteration": 2.46250319480896 + }, + { + "auxiliary_loss_clip": 0.01112498, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02061963, + "balance_loss_mlp": 1.04007745, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.372230243923659, + "language_loss": 0.70459902, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72606242, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8924, + "time_per_iteration": 2.5744879245758057 + }, + { + "auxiliary_loss_clip": 0.0111402, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.02251387, + "balance_loss_mlp": 1.04109585, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.82023886715655, + "language_loss": 0.86756319, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.88906515, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8925, + "time_per_iteration": 2.4910149574279785 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.01847553, + "balance_loss_mlp": 1.03855276, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.7557992545857284, + "language_loss": 0.77842706, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.79984611, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8926, + "time_per_iteration": 3.935426950454712 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.02207375, + "balance_loss_mlp": 1.04045248, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.9312965019913735, + "language_loss": 0.66655087, + "learning_rate": 1.85932585410148e-06, + "loss": 0.68797243, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8927, + "time_per_iteration": 2.547527313232422 + }, + { + "auxiliary_loss_clip": 0.01109886, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.01575708, + "balance_loss_mlp": 1.03839135, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.6954569855299475, + "language_loss": 0.73241496, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75379729, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8928, + "time_per_iteration": 2.432772636413574 + }, + { + "auxiliary_loss_clip": 0.01109785, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01908278, + "balance_loss_mlp": 1.03883481, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.7056756537874223, + "language_loss": 0.62998128, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65139198, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 8929, + "time_per_iteration": 5.517207145690918 + }, + { + "auxiliary_loss_clip": 0.01109689, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.03864491, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7096435666181475, + "language_loss": 0.65986609, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68127799, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8930, + "time_per_iteration": 4.042668581008911 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01673484, + "balance_loss_mlp": 1.03648782, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4058068619041801, + "language_loss": 0.66875708, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69010699, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8931, + "time_per_iteration": 2.4965057373046875 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04157209, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.7390938861026815, + "language_loss": 0.75847304, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77991474, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 8932, + "time_per_iteration": 2.4885287284851074 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.01999855, + "balance_loss_mlp": 1.04103768, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.8276755120836934, + "language_loss": 0.66255939, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68400073, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8933, + "time_per_iteration": 2.545335292816162 + }, + { + "auxiliary_loss_clip": 0.01106255, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.02077079, + "balance_loss_mlp": 1.03900647, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.6337429593741761, + "language_loss": 0.82865143, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85004205, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8934, + "time_per_iteration": 2.503974437713623 + }, + { + "auxiliary_loss_clip": 0.0110502, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02293336, + "balance_loss_mlp": 1.03738618, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.7935675007471827, + "language_loss": 0.79473621, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81614518, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 8935, + "time_per_iteration": 2.4432904720306396 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.02450609, + "balance_loss_mlp": 1.03854251, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.6092738011459846, + "language_loss": 0.83558774, + "learning_rate": 1.855829598084659e-06, + "loss": 0.857054, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8936, + "time_per_iteration": 2.5320403575897217 + }, + { + "auxiliary_loss_clip": 0.01106939, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.0173173, + "balance_loss_mlp": 1.03860474, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2642552304862777, + "language_loss": 0.72749949, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74886072, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 8937, + "time_per_iteration": 2.6381869316101074 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.01589561, + "balance_loss_mlp": 1.03737688, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.79948851304012, + "language_loss": 0.81773913, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83913368, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 8938, + "time_per_iteration": 2.4865500926971436 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.0231539, + "balance_loss_mlp": 1.04058433, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.3721010649860403, + "language_loss": 0.80348092, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82498878, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 8939, + "time_per_iteration": 2.4440550804138184 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.01002274, + "balance_loss_clip": 1.00117719, + "balance_loss_mlp": 1.01246023, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7105496368182959, + "language_loss": 0.52484262, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54521012, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.22070312, + "step": 8940, + "time_per_iteration": 3.091242790222168 + }, + { + "auxiliary_loss_clip": 0.01107473, + "auxiliary_loss_mlp": 0.01029266, + "balance_loss_clip": 1.01732159, + "balance_loss_mlp": 1.03880298, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 1.7538523818266185, + "language_loss": 0.71252179, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73388922, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8941, + "time_per_iteration": 2.497748613357544 + }, + { + "auxiliary_loss_clip": 0.01106467, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01807404, + "balance_loss_mlp": 1.03906739, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.7257322220940274, + "language_loss": 0.7928313, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81420016, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8942, + "time_per_iteration": 2.5012340545654297 + }, + { + "auxiliary_loss_clip": 0.01113441, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.02305436, + "balance_loss_mlp": 1.04004788, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.6646036710876846, + "language_loss": 0.69918364, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72068322, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8943, + "time_per_iteration": 2.5815587043762207 + }, + { + "auxiliary_loss_clip": 0.01032313, + "auxiliary_loss_mlp": 0.0099905, + "balance_loss_clip": 0.99804258, + "balance_loss_mlp": 1.01022053, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8193486791235207, + "language_loss": 0.59579939, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61611301, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22070312, + "step": 8944, + "time_per_iteration": 3.0560412406921387 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.02264094, + "balance_loss_mlp": 1.0415678, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.048508714437824, + "language_loss": 0.77503264, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79658085, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 8945, + "time_per_iteration": 2.4893672466278076 + }, + { + "auxiliary_loss_clip": 0.01109506, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03820658, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.7269314210534699, + "language_loss": 0.68465722, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70609617, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8946, + "time_per_iteration": 2.4605491161346436 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.03090715, + "balance_loss_mlp": 1.03953493, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.7416668567009066, + "language_loss": 0.76750016, + "learning_rate": 1.851556998731498e-06, + "loss": 0.78902936, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8947, + "time_per_iteration": 2.547470808029175 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01731312, + "balance_loss_mlp": 1.03834343, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.559080956726188, + "language_loss": 0.60268521, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62406987, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8948, + "time_per_iteration": 2.486721992492676 + }, + { + "auxiliary_loss_clip": 0.01112593, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0221529, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.6883046071040144, + "language_loss": 0.7951721, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.816643, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8949, + "time_per_iteration": 2.504025936126709 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.01955473, + "balance_loss_mlp": 1.03890014, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.5394027339965872, + "language_loss": 0.77871096, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80011374, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8950, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.04001009, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.7709921726317892, + "language_loss": 0.72630781, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74770463, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8951, + "time_per_iteration": 2.5027382373809814 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.01503229, + "balance_loss_mlp": 1.03817379, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.739294207658579, + "language_loss": 0.75148916, + "learning_rate": 1.849615132097085e-06, + "loss": 0.7728591, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8952, + "time_per_iteration": 2.423635244369507 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01504064, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.5972619646266322, + "language_loss": 0.79724902, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81862247, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8953, + "time_per_iteration": 2.532107353210449 + }, + { + "auxiliary_loss_clip": 0.01106301, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01993775, + "balance_loss_mlp": 1.03857923, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 2.0280242140271336, + "language_loss": 0.80724108, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82863653, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.67578125, + "step": 8954, + "time_per_iteration": 2.404942512512207 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.04119825, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.327007095214437, + "language_loss": 0.76461661, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78604227, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8955, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02414417, + "balance_loss_mlp": 1.04121125, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.5710344626373696, + "language_loss": 0.7823422, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80381584, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 8956, + "time_per_iteration": 2.484722375869751 + }, + { + "auxiliary_loss_clip": 0.0103322, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.01120663, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8559223539778376, + "language_loss": 0.63550651, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65586865, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.22070312, + "step": 8957, + "time_per_iteration": 3.065546751022339 + }, + { + "auxiliary_loss_clip": 0.01032349, + "auxiliary_loss_mlp": 0.01008296, + "balance_loss_clip": 1.00706863, + "balance_loss_mlp": 1.01029825, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7038941855074313, + "language_loss": 0.5158186, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53622508, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8958, + "time_per_iteration": 3.0705761909484863 + }, + { + "auxiliary_loss_clip": 0.01115886, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01678383, + "balance_loss_mlp": 1.04319501, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.5948521762422991, + "language_loss": 0.77216792, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79363346, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8959, + "time_per_iteration": 2.4907429218292236 + }, + { + "auxiliary_loss_clip": 0.01109786, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0190165, + "balance_loss_mlp": 1.03810203, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.0946376118717493, + "language_loss": 0.83630693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85772204, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8960, + "time_per_iteration": 2.4251809120178223 + }, + { + "auxiliary_loss_clip": 0.01112347, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01780403, + "balance_loss_mlp": 1.0417726, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.575363596920687, + "language_loss": 0.78489578, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80632377, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8961, + "time_per_iteration": 2.5358235836029053 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01921666, + "balance_loss_mlp": 1.04004741, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.7764783659945997, + "language_loss": 0.84602159, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86745036, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8962, + "time_per_iteration": 2.462369918823242 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.00998189, + "balance_loss_clip": 0.99696141, + "balance_loss_mlp": 1.01020229, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7323858189394533, + "language_loss": 0.54189092, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56219494, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8963, + "time_per_iteration": 3.000844717025757 + }, + { + "auxiliary_loss_clip": 0.01031141, + "auxiliary_loss_mlp": 0.00998281, + "balance_loss_clip": 0.99717277, + "balance_loss_mlp": 1.00911307, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8055122078658323, + "language_loss": 0.63433194, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65462613, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22070312, + "step": 8964, + "time_per_iteration": 3.241182565689087 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01782298, + "balance_loss_mlp": 1.03918004, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.532843563745025, + "language_loss": 0.69958258, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72101814, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 8965, + "time_per_iteration": 2.524223804473877 + }, + { + "auxiliary_loss_clip": 0.01114315, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.04133582, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.362623955664157, + "language_loss": 0.81848061, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83997512, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 8966, + "time_per_iteration": 2.477625608444214 + }, + { + "auxiliary_loss_clip": 0.01110928, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.01913619, + "balance_loss_mlp": 1.04063606, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.8348280049509287, + "language_loss": 0.72713602, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74856687, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8967, + "time_per_iteration": 2.419088125228882 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.01884913, + "balance_loss_mlp": 1.03676677, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.8042691798262989, + "language_loss": 0.81596529, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83735478, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8968, + "time_per_iteration": 3.8650004863739014 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0168612, + "balance_loss_mlp": 1.0391978, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.5993373110169542, + "language_loss": 0.73938435, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76080179, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8969, + "time_per_iteration": 2.485146999359131 + }, + { + "auxiliary_loss_clip": 0.01111919, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.01886833, + "balance_loss_mlp": 1.03785658, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.3553854013154907, + "language_loss": 0.82165599, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84310412, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8970, + "time_per_iteration": 2.4504613876342773 + }, + { + "auxiliary_loss_clip": 0.01109668, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02005112, + "balance_loss_mlp": 1.03989851, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.5328161731771237, + "language_loss": 0.75619417, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77762067, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8971, + "time_per_iteration": 5.434189558029175 + }, + { + "auxiliary_loss_clip": 0.01030677, + "auxiliary_loss_mlp": 0.00999826, + "balance_loss_clip": 0.99856228, + "balance_loss_mlp": 1.00854254, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8757990223887638, + "language_loss": 0.60310632, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62341136, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22167969, + "step": 8972, + "time_per_iteration": 3.070239782333374 + }, + { + "auxiliary_loss_clip": 0.01109336, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.03085351, + "balance_loss_mlp": 1.0389235, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.4916710753135305, + "language_loss": 0.78427428, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80580956, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8973, + "time_per_iteration": 2.4841833114624023 + }, + { + "auxiliary_loss_clip": 0.01116334, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02265668, + "balance_loss_mlp": 1.03959453, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.2712479958365304, + "language_loss": 0.73893452, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76046824, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 8974, + "time_per_iteration": 2.5056395530700684 + }, + { + "auxiliary_loss_clip": 0.01029707, + "auxiliary_loss_mlp": 0.01005081, + "balance_loss_clip": 1.00388896, + "balance_loss_mlp": 1.00777423, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7339193766969773, + "language_loss": 0.51197326, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53232116, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21972656, + "step": 8975, + "time_per_iteration": 3.0552287101745605 + }, + { + "auxiliary_loss_clip": 0.01110098, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.02723336, + "balance_loss_mlp": 1.03983927, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.5397959415241314, + "language_loss": 0.71919322, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74069834, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8976, + "time_per_iteration": 2.5368118286132812 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.02300107, + "balance_loss_mlp": 1.03994215, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 2.148603673983975, + "language_loss": 0.70274073, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.72421718, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 8977, + "time_per_iteration": 2.4685816764831543 + }, + { + "auxiliary_loss_clip": 0.01113255, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.02102757, + "balance_loss_mlp": 1.04169548, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.656094242871676, + "language_loss": 0.7241326, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.7456063, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8978, + "time_per_iteration": 2.4495601654052734 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.01778078, + "balance_loss_mlp": 1.04137743, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.582100330429111, + "language_loss": 0.73947239, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76098353, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 8979, + "time_per_iteration": 2.467693328857422 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.03538978, + "balance_loss_mlp": 1.04216337, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.0456901795615656, + "language_loss": 0.76959479, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79127216, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 8980, + "time_per_iteration": 2.5299665927886963 + }, + { + "auxiliary_loss_clip": 0.01111255, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.02332902, + "balance_loss_mlp": 1.0388093, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.6658662418671077, + "language_loss": 0.81773221, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.83920264, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 8981, + "time_per_iteration": 2.593594789505005 + }, + { + "auxiliary_loss_clip": 0.01113866, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03922904, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.7978808319720327, + "language_loss": 0.66842318, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68988544, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.74609375, + "step": 8982, + "time_per_iteration": 2.5118813514709473 + }, + { + "auxiliary_loss_clip": 0.01110986, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.02715898, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.4560866330096367, + "language_loss": 0.82442951, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84593606, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8983, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02204108, + "balance_loss_mlp": 1.03799057, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7289170608138429, + "language_loss": 0.7078771, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72933447, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8984, + "time_per_iteration": 2.4523980617523193 + }, + { + "auxiliary_loss_clip": 0.01115801, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.04127955, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.7555929792269789, + "language_loss": 0.80110276, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82260621, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7421875, + "step": 8985, + "time_per_iteration": 2.446753740310669 + }, + { + "auxiliary_loss_clip": 0.01104654, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.01560616, + "balance_loss_mlp": 1.03796721, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 2.3719765019392844, + "language_loss": 0.78840292, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80973768, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.66796875, + "step": 8986, + "time_per_iteration": 2.5318102836608887 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01879263, + "balance_loss_mlp": 1.03847885, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.6989773263518806, + "language_loss": 0.77060419, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79201555, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8987, + "time_per_iteration": 2.524240732192993 + }, + { + "auxiliary_loss_clip": 0.01109666, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.0178858, + "balance_loss_mlp": 1.03889561, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 2.580263640738581, + "language_loss": 0.71292162, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73433048, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8988, + "time_per_iteration": 2.4638671875 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.0228405, + "balance_loss_mlp": 1.03822088, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.2630612952232827, + "language_loss": 0.67666376, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69813585, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.71875, + "step": 8989, + "time_per_iteration": 2.508855104446411 + }, + { + "auxiliary_loss_clip": 0.01111455, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02386594, + "balance_loss_mlp": 1.03881633, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5798861838358007, + "language_loss": 0.77628905, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79777759, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8990, + "time_per_iteration": 2.489483118057251 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.01594377, + "balance_loss_mlp": 1.03673029, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.5931818725193578, + "language_loss": 0.69039345, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71173859, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8991, + "time_per_iteration": 2.4418294429779053 + }, + { + "auxiliary_loss_clip": 0.01109673, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.02171683, + "balance_loss_mlp": 1.03739381, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.7624988623501092, + "language_loss": 0.7614572, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78290933, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8992, + "time_per_iteration": 2.4845540523529053 + }, + { + "auxiliary_loss_clip": 0.01110684, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.03731656, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.6314606707027304, + "language_loss": 0.76393229, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78536171, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8993, + "time_per_iteration": 2.4074175357818604 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.02129519, + "balance_loss_mlp": 1.03785443, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6731423627794038, + "language_loss": 0.70444834, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72585857, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8994, + "time_per_iteration": 2.5207760334014893 + }, + { + "auxiliary_loss_clip": 0.01110631, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.01729715, + "balance_loss_mlp": 1.03817177, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.7966588085871025, + "language_loss": 0.74846065, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.76987815, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8995, + "time_per_iteration": 2.468820095062256 + }, + { + "auxiliary_loss_clip": 0.01107091, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02315211, + "balance_loss_mlp": 1.0381844, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.845320286189123, + "language_loss": 0.73867524, + "learning_rate": 1.832533059471282e-06, + "loss": 0.7600975, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8996, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.01105028, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02183414, + "balance_loss_mlp": 1.03760076, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.7779086932858201, + "language_loss": 0.73281908, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75420916, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8997, + "time_per_iteration": 2.433438301086426 + }, + { + "auxiliary_loss_clip": 0.01109644, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02052116, + "balance_loss_mlp": 1.03904319, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.01233035965423, + "language_loss": 0.71775877, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73919159, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8998, + "time_per_iteration": 2.4791901111602783 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.03724909, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.596226887866337, + "language_loss": 0.70601052, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72746068, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 8999, + "time_per_iteration": 2.6774816513061523 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.0222373, + "balance_loss_mlp": 1.03789854, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.5727427903087716, + "language_loss": 0.80433559, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.8257547, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 9000, + "time_per_iteration": 2.4608795642852783 + }, + { + "auxiliary_loss_clip": 0.0110639, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.01646805, + "balance_loss_mlp": 1.03770971, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.4688376580267075, + "language_loss": 0.72885478, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75021398, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 9001, + "time_per_iteration": 2.469433069229126 + }, + { + "auxiliary_loss_clip": 0.01112566, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.0213902, + "balance_loss_mlp": 1.03844023, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.257759724972284, + "language_loss": 0.85127461, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87275422, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 9002, + "time_per_iteration": 2.4405739307403564 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.02020574, + "balance_loss_mlp": 1.0384078, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7125809204353786, + "language_loss": 0.77755821, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.79894257, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 9003, + "time_per_iteration": 2.451507806777954 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.01557827, + "balance_loss_mlp": 1.03640234, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 2.168361582224207, + "language_loss": 0.69784325, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71918762, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 9004, + "time_per_iteration": 2.613961935043335 + }, + { + "auxiliary_loss_clip": 0.01028073, + "auxiliary_loss_mlp": 0.01010119, + "balance_loss_clip": 1.00899816, + "balance_loss_mlp": 1.00624812, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9677352946959291, + "language_loss": 0.59124619, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61162812, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21875, + "step": 9005, + "time_per_iteration": 3.175964832305908 + }, + { + "auxiliary_loss_clip": 0.01110665, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02183771, + "balance_loss_mlp": 1.03938627, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 1.6968329328942213, + "language_loss": 0.77685302, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79829788, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 9006, + "time_per_iteration": 2.455742359161377 + }, + { + "auxiliary_loss_clip": 0.01104494, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.02205062, + "balance_loss_mlp": 1.03625751, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.624690870596759, + "language_loss": 0.82998371, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.8513571, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.68359375, + "step": 9007, + "time_per_iteration": 2.4356093406677246 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03761423, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.1377427178959434, + "language_loss": 0.67209023, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69347185, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9008, + "time_per_iteration": 2.5489509105682373 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.01810145, + "balance_loss_mlp": 1.03802204, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.189253604566193, + "language_loss": 0.74129766, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76273382, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 9009, + "time_per_iteration": 3.8252077102661133 + }, + { + "auxiliary_loss_clip": 0.01110449, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.02109861, + "balance_loss_mlp": 1.03791738, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.141173328238238, + "language_loss": 0.87482637, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89627492, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 9010, + "time_per_iteration": 2.4628190994262695 + }, + { + "auxiliary_loss_clip": 0.01106778, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02007151, + "balance_loss_mlp": 1.03684556, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.9800903494769417, + "language_loss": 0.64830345, + "learning_rate": 1.826712372694122e-06, + "loss": 0.66969872, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9011, + "time_per_iteration": 2.530303955078125 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02463341, + "balance_loss_mlp": 1.03945065, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 3.61342010762258, + "language_loss": 0.79000378, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81146884, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9012, + "time_per_iteration": 5.477705240249634 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01720369, + "balance_loss_mlp": 1.0364089, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.7419259634167055, + "language_loss": 0.74031919, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76168299, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9013, + "time_per_iteration": 3.8720171451568604 + }, + { + "auxiliary_loss_clip": 0.01109547, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.01657844, + "balance_loss_mlp": 1.0367403, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.040050456437719, + "language_loss": 0.72289932, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74429148, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 9014, + "time_per_iteration": 2.436251640319824 + }, + { + "auxiliary_loss_clip": 0.01108382, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01889062, + "balance_loss_mlp": 1.03802454, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.601636110073364, + "language_loss": 0.80585766, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82725561, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9015, + "time_per_iteration": 2.4523091316223145 + }, + { + "auxiliary_loss_clip": 0.01112438, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.02298188, + "balance_loss_mlp": 1.03929543, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 3.6814275573944717, + "language_loss": 0.81413746, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83562374, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9016, + "time_per_iteration": 2.4310686588287354 + }, + { + "auxiliary_loss_clip": 0.01107219, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.03753281, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 2.1017981350927646, + "language_loss": 0.81103092, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83240461, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9017, + "time_per_iteration": 2.427536725997925 + }, + { + "auxiliary_loss_clip": 0.01104389, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.01719928, + "balance_loss_mlp": 1.03666961, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.7397815948262747, + "language_loss": 0.77372575, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79506552, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9018, + "time_per_iteration": 2.4533066749572754 + }, + { + "auxiliary_loss_clip": 0.01107196, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.02436996, + "balance_loss_mlp": 1.03481603, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.448924926163926, + "language_loss": 0.66352963, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68497658, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9019, + "time_per_iteration": 2.6830832958221436 + }, + { + "auxiliary_loss_clip": 0.01103655, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.0157038, + "balance_loss_mlp": 1.03604794, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5485094933207573, + "language_loss": 0.69635725, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71766162, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9020, + "time_per_iteration": 2.5516250133514404 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.02415812, + "balance_loss_mlp": 1.03544152, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.4647880942088878, + "language_loss": 0.80443847, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.825822, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 9021, + "time_per_iteration": 2.52411150932312 + }, + { + "auxiliary_loss_clip": 0.01107355, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.02107835, + "balance_loss_mlp": 1.03812504, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.483970922248673, + "language_loss": 0.78272343, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80413187, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9022, + "time_per_iteration": 2.4745841026306152 + }, + { + "auxiliary_loss_clip": 0.01104936, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02170706, + "balance_loss_mlp": 1.03559494, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.6624827413591161, + "language_loss": 0.82107073, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84245884, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9023, + "time_per_iteration": 2.4953298568725586 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01670718, + "balance_loss_mlp": 1.03815961, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.8210142178846183, + "language_loss": 0.71515894, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73651719, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9024, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01107389, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01878023, + "balance_loss_mlp": 1.03640127, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.659326462636006, + "language_loss": 0.64976329, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67114621, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9025, + "time_per_iteration": 2.512734889984131 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01776159, + "balance_loss_mlp": 1.0378685, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 6.402510966233504, + "language_loss": 0.74099922, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76238489, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9026, + "time_per_iteration": 2.42434024810791 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01040251, + "balance_loss_clip": 1.02587438, + "balance_loss_mlp": 1.03585124, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.637995325273745, + "language_loss": 0.78638506, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80786121, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 9027, + "time_per_iteration": 2.488490104675293 + }, + { + "auxiliary_loss_clip": 0.01029187, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.00479341, + "balance_loss_mlp": 1.00745916, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7366554152868067, + "language_loss": 0.56548405, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58583641, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21679688, + "step": 9028, + "time_per_iteration": 3.0799479484558105 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03760409, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.289578054979344, + "language_loss": 0.7793408, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80073547, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 9029, + "time_per_iteration": 2.454566478729248 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03734791, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.5369423730734595, + "language_loss": 0.83306921, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85440123, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 9030, + "time_per_iteration": 2.4675095081329346 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.01676893, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5422544284751551, + "language_loss": 0.74720484, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.76854396, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9031, + "time_per_iteration": 2.4871413707733154 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01883435, + "balance_loss_mlp": 1.03710687, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.9031998711979703, + "language_loss": 0.85544586, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87678427, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9032, + "time_per_iteration": 2.492750406265259 + }, + { + "auxiliary_loss_clip": 0.01110136, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.01670289, + "balance_loss_mlp": 1.03757548, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.71218946587007, + "language_loss": 0.73568988, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.75708508, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9033, + "time_per_iteration": 2.458281993865967 + }, + { + "auxiliary_loss_clip": 0.01106249, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02057564, + "balance_loss_mlp": 1.03709424, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6976408638259588, + "language_loss": 0.75797909, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77937472, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9034, + "time_per_iteration": 2.491690158843994 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.01932836, + "balance_loss_mlp": 1.03710067, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.7098309272106547, + "language_loss": 0.84488094, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86624634, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9035, + "time_per_iteration": 2.4352262020111084 + }, + { + "auxiliary_loss_clip": 0.01028064, + "auxiliary_loss_mlp": 0.01007827, + "balance_loss_clip": 1.00669503, + "balance_loss_mlp": 1.00628209, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7231810753813949, + "language_loss": 0.55908412, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57944304, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.21777344, + "step": 9036, + "time_per_iteration": 3.041694402694702 + }, + { + "auxiliary_loss_clip": 0.01108199, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02147961, + "balance_loss_mlp": 1.03686309, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.5099374695532384, + "language_loss": 0.75264686, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77407253, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9037, + "time_per_iteration": 2.4950051307678223 + }, + { + "auxiliary_loss_clip": 0.01106194, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.02301288, + "balance_loss_mlp": 1.03557479, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.5216693219084618, + "language_loss": 0.66438931, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68580532, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9038, + "time_per_iteration": 2.559807777404785 + }, + { + "auxiliary_loss_clip": 0.01103453, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.0184598, + "balance_loss_mlp": 1.03513312, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.8787316560909988, + "language_loss": 0.78100199, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80233729, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9039, + "time_per_iteration": 2.4654388427734375 + }, + { + "auxiliary_loss_clip": 0.01106931, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02066386, + "balance_loss_mlp": 1.03744531, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.8309305249268624, + "language_loss": 0.76449573, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78589433, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9040, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.0102829, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.00074422, + "balance_loss_mlp": 1.0065496, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6649082596858222, + "language_loss": 0.52501261, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54531443, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.21777344, + "step": 9041, + "time_per_iteration": 3.0513055324554443 + }, + { + "auxiliary_loss_clip": 0.01105303, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.02383089, + "balance_loss_mlp": 1.03610432, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.5670483715805776, + "language_loss": 0.76206207, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78348053, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9042, + "time_per_iteration": 2.4679293632507324 + }, + { + "auxiliary_loss_clip": 0.01102475, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.02001429, + "balance_loss_mlp": 1.03483939, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.637929025007711, + "language_loss": 0.67479855, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69613945, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 9043, + "time_per_iteration": 2.469393730163574 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.019876, + "balance_loss_mlp": 1.03556848, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.6229792564391676, + "language_loss": 0.8417449, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86307919, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9044, + "time_per_iteration": 2.4827311038970947 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.01375592, + "balance_loss_mlp": 1.03744245, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 4.385221285903045, + "language_loss": 0.6211096, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.6424917, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 9045, + "time_per_iteration": 2.5340473651885986 + }, + { + "auxiliary_loss_clip": 0.01108322, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01814771, + "balance_loss_mlp": 1.03780746, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.4286240482824728, + "language_loss": 0.69942701, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72081935, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9046, + "time_per_iteration": 2.4620296955108643 + }, + { + "auxiliary_loss_clip": 0.01104565, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01592338, + "balance_loss_mlp": 1.03681147, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 2.1944623143587667, + "language_loss": 0.77171725, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79304034, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9047, + "time_per_iteration": 2.4618160724639893 + }, + { + "auxiliary_loss_clip": 0.01107988, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0232358, + "balance_loss_mlp": 1.03817999, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7709524835714412, + "language_loss": 0.72530591, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74673903, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9048, + "time_per_iteration": 2.43306827545166 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.01999831, + "balance_loss_mlp": 1.03979266, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.1212679973875805, + "language_loss": 0.93380594, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95521486, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 9049, + "time_per_iteration": 2.4344465732574463 + }, + { + "auxiliary_loss_clip": 0.01102747, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01810968, + "balance_loss_mlp": 1.0347991, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.8375314287256255, + "language_loss": 0.73678643, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.75810736, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9050, + "time_per_iteration": 2.491992473602295 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.01630008, + "balance_loss_mlp": 1.0390985, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 1.7129729573051025, + "language_loss": 0.67238903, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69376987, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9051, + "time_per_iteration": 3.862109661102295 + }, + { + "auxiliary_loss_clip": 0.01104183, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.0174253, + "balance_loss_mlp": 1.03553367, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.6461015999412698, + "language_loss": 0.67748392, + "learning_rate": 1.810810185460011e-06, + "loss": 0.6988188, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9052, + "time_per_iteration": 2.5398967266082764 + }, + { + "auxiliary_loss_clip": 0.01108274, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.01914227, + "balance_loss_mlp": 1.03725493, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.7506645402052365, + "language_loss": 0.92625535, + "learning_rate": 1.810422473773436e-06, + "loss": 0.94765407, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9053, + "time_per_iteration": 2.4675142765045166 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02233112, + "balance_loss_mlp": 1.03685415, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.7890591975918206, + "language_loss": 0.83447516, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85590339, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9054, + "time_per_iteration": 5.314599275588989 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02034652, + "balance_loss_mlp": 1.04010189, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.3459133888285564, + "language_loss": 0.68981498, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.71124029, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9055, + "time_per_iteration": 3.926511287689209 + }, + { + "auxiliary_loss_clip": 0.01028465, + "auxiliary_loss_mlp": 0.00999723, + "balance_loss_clip": 0.99868602, + "balance_loss_mlp": 1.00688159, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7309752042107527, + "language_loss": 0.57659, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59687185, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.21582031, + "step": 9056, + "time_per_iteration": 3.0622963905334473 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01665783, + "balance_loss_mlp": 1.03565168, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.7313106745452744, + "language_loss": 0.69337952, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71474266, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9057, + "time_per_iteration": 2.4510855674743652 + }, + { + "auxiliary_loss_clip": 0.01106022, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.02320337, + "balance_loss_mlp": 1.03730392, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.1714933584662615, + "language_loss": 0.7508406, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77226055, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9058, + "time_per_iteration": 2.526362419128418 + }, + { + "auxiliary_loss_clip": 0.01028725, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00100732, + "balance_loss_mlp": 1.00713301, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7971345769694276, + "language_loss": 0.62662959, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64693761, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.21582031, + "step": 9059, + "time_per_iteration": 3.1505026817321777 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.01862383, + "balance_loss_mlp": 1.03710485, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.9373576881408119, + "language_loss": 0.791785, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81314969, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9060, + "time_per_iteration": 2.4754552841186523 + }, + { + "auxiliary_loss_clip": 0.01106659, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.02058554, + "balance_loss_mlp": 1.03625464, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.604299719110434, + "language_loss": 0.7939564, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81535506, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9061, + "time_per_iteration": 2.556467056274414 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.0152173, + "balance_loss_mlp": 1.03701198, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.7809339372629867, + "language_loss": 0.87091219, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89222574, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9062, + "time_per_iteration": 2.4758143424987793 + }, + { + "auxiliary_loss_clip": 0.01111266, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.01959074, + "balance_loss_mlp": 1.03804517, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.9589069040824287, + "language_loss": 0.82366961, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84511185, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 9063, + "time_per_iteration": 2.4351277351379395 + }, + { + "auxiliary_loss_clip": 0.01106592, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01580811, + "balance_loss_mlp": 1.0372479, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.809751627458355, + "language_loss": 0.63477433, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65612566, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9064, + "time_per_iteration": 2.5002574920654297 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01844406, + "balance_loss_mlp": 1.0378474, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.5950372697964212, + "language_loss": 0.79787326, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81927347, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9065, + "time_per_iteration": 2.485886335372925 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.01893747, + "balance_loss_mlp": 1.03695667, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 1.9866274876050938, + "language_loss": 0.78143919, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80277526, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9066, + "time_per_iteration": 2.4608097076416016 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.0161345, + "balance_loss_mlp": 1.03510523, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.7709941680506742, + "language_loss": 0.75842655, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.7797848, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9067, + "time_per_iteration": 2.4940598011016846 + }, + { + "auxiliary_loss_clip": 0.01114286, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02152777, + "balance_loss_mlp": 1.0393995, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.2574843156274, + "language_loss": 0.63637972, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65788054, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 9068, + "time_per_iteration": 2.570791244506836 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.03860283, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.608624941379858, + "language_loss": 0.7232843, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74469984, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9069, + "time_per_iteration": 2.49194073677063 + }, + { + "auxiliary_loss_clip": 0.01105915, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.01882815, + "balance_loss_mlp": 1.03988457, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.7038570560603954, + "language_loss": 0.74060583, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76196355, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9070, + "time_per_iteration": 2.4085381031036377 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02016675, + "balance_loss_mlp": 1.035869, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.9518916968876514, + "language_loss": 0.60487843, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62623858, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9071, + "time_per_iteration": 2.4736368656158447 + }, + { + "auxiliary_loss_clip": 0.01029891, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.00851762, + "balance_loss_mlp": 1.00855255, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.702361481728272, + "language_loss": 0.57095647, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59135079, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21386719, + "step": 9072, + "time_per_iteration": 3.1778738498687744 + }, + { + "auxiliary_loss_clip": 0.01104044, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.01834023, + "balance_loss_mlp": 1.03754437, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.6497532443668452, + "language_loss": 0.69947577, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72081387, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 9073, + "time_per_iteration": 2.414483070373535 + }, + { + "auxiliary_loss_clip": 0.01102116, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.02133226, + "balance_loss_mlp": 1.03575385, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.7860657423568516, + "language_loss": 0.71207851, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73342335, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 9074, + "time_per_iteration": 2.5126519203186035 + }, + { + "auxiliary_loss_clip": 0.01105462, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02541733, + "balance_loss_mlp": 1.03713095, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.7043380827263428, + "language_loss": 0.68845975, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70988691, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 9075, + "time_per_iteration": 2.4271233081817627 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.03828716, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 2.0277857780736155, + "language_loss": 0.804497, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.82584435, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66796875, + "step": 9076, + "time_per_iteration": 2.5117785930633545 + }, + { + "auxiliary_loss_clip": 0.01105415, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01892447, + "balance_loss_mlp": 1.03663969, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.583996751680831, + "language_loss": 0.80426413, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82562208, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9077, + "time_per_iteration": 2.4544837474823 + }, + { + "auxiliary_loss_clip": 0.0110649, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01880729, + "balance_loss_mlp": 1.03688538, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.9788210228225505, + "language_loss": 0.67737269, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69873917, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9078, + "time_per_iteration": 2.5323657989501953 + }, + { + "auxiliary_loss_clip": 0.01107395, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02072752, + "balance_loss_mlp": 1.03703523, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.8696943679753917, + "language_loss": 0.80740905, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82881159, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9079, + "time_per_iteration": 2.458158493041992 + }, + { + "auxiliary_loss_clip": 0.01110307, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01966298, + "balance_loss_mlp": 1.03775454, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.6840905516778153, + "language_loss": 0.75812018, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77954829, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9080, + "time_per_iteration": 2.4955971240997314 + }, + { + "auxiliary_loss_clip": 0.01108419, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.02007222, + "balance_loss_mlp": 1.03805685, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.4851521305720627, + "language_loss": 0.83080792, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85222232, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9081, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01722193, + "balance_loss_mlp": 1.03842843, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.5408403844848193, + "language_loss": 0.69658768, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.71798551, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9082, + "time_per_iteration": 2.472858428955078 + }, + { + "auxiliary_loss_clip": 0.01102277, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.01434886, + "balance_loss_mlp": 1.03546321, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.7415454834760362, + "language_loss": 0.66599333, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68727982, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 9083, + "time_per_iteration": 2.5756945610046387 + }, + { + "auxiliary_loss_clip": 0.01104147, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.01772594, + "balance_loss_mlp": 1.03678334, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.6516896910486423, + "language_loss": 0.78909004, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81042337, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 9084, + "time_per_iteration": 2.5361523628234863 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02093244, + "balance_loss_mlp": 1.03781819, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 2.0163372032767826, + "language_loss": 0.74970639, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77111256, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9085, + "time_per_iteration": 2.461916208267212 + }, + { + "auxiliary_loss_clip": 0.01107723, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01702785, + "balance_loss_mlp": 1.03705621, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 1.6682732441654566, + "language_loss": 0.74792248, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76929021, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9086, + "time_per_iteration": 2.530505657196045 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01707602, + "balance_loss_mlp": 1.03592753, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.5861549378759865, + "language_loss": 0.76987553, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79121786, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9087, + "time_per_iteration": 2.4786858558654785 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.03895903, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.736831801992395, + "language_loss": 0.77471095, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79616833, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9088, + "time_per_iteration": 2.450409173965454 + }, + { + "auxiliary_loss_clip": 0.01030156, + "auxiliary_loss_mlp": 0.01001999, + "balance_loss_clip": 1.0009743, + "balance_loss_mlp": 1.0086112, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7273835392783513, + "language_loss": 0.57771385, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59803545, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21484375, + "step": 9089, + "time_per_iteration": 3.1002800464630127 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.01875257, + "balance_loss_mlp": 1.03710759, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.6935215277859987, + "language_loss": 0.76448178, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78586286, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9090, + "time_per_iteration": 2.5178091526031494 + }, + { + "auxiliary_loss_clip": 0.0110913, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.0362854, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 2.128546091443876, + "language_loss": 0.73422724, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75567162, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9091, + "time_per_iteration": 2.4523463249206543 + }, + { + "auxiliary_loss_clip": 0.0110893, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02313828, + "balance_loss_mlp": 1.03835773, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.850730557544026, + "language_loss": 0.77855682, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.79999787, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9092, + "time_per_iteration": 2.463998556137085 + }, + { + "auxiliary_loss_clip": 0.01110185, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.01975393, + "balance_loss_mlp": 1.03879404, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 1.992080116269468, + "language_loss": 0.74526983, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76669919, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 9093, + "time_per_iteration": 3.8121659755706787 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.03643596, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 1.8684331289519012, + "language_loss": 0.69012475, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71159303, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9094, + "time_per_iteration": 2.406708240509033 + }, + { + "auxiliary_loss_clip": 0.0110964, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.02463508, + "balance_loss_mlp": 1.0408746, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.1943674750228426, + "language_loss": 0.68355155, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70500696, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6875, + "step": 9095, + "time_per_iteration": 2.4663615226745605 + }, + { + "auxiliary_loss_clip": 0.0110876, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.0244838, + "balance_loss_mlp": 1.04013026, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.544968347193232, + "language_loss": 0.66645032, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6878978, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9096, + "time_per_iteration": 5.378362417221069 + }, + { + "auxiliary_loss_clip": 0.01030132, + "auxiliary_loss_mlp": 0.00998409, + "balance_loss_clip": 0.99731266, + "balance_loss_mlp": 1.00865221, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7389922300516351, + "language_loss": 0.57573926, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59602463, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.21484375, + "step": 9097, + "time_per_iteration": 3.168614387512207 + }, + { + "auxiliary_loss_clip": 0.01030189, + "auxiliary_loss_mlp": 0.01002061, + "balance_loss_clip": 1.00105369, + "balance_loss_mlp": 1.00863671, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9052213801384115, + "language_loss": 0.64790761, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66823018, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.21484375, + "step": 9098, + "time_per_iteration": 3.01711106300354 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02399004, + "balance_loss_mlp": 1.03762555, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.9907442514686344, + "language_loss": 0.73179287, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75324905, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9099, + "time_per_iteration": 2.50752592086792 + }, + { + "auxiliary_loss_clip": 0.01105594, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03749669, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9036037415187144, + "language_loss": 0.72414565, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74548817, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 9100, + "time_per_iteration": 2.5455925464630127 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01381898, + "balance_loss_mlp": 1.03679228, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.608228209483335, + "language_loss": 0.67675304, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69807637, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9101, + "time_per_iteration": 2.638460397720337 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01515532, + "balance_loss_mlp": 1.03816807, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.6461027740418694, + "language_loss": 0.78004694, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80140156, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9102, + "time_per_iteration": 2.515669822692871 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.02434635, + "balance_loss_mlp": 1.04091179, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.641023318874669, + "language_loss": 0.72358656, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74505031, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9103, + "time_per_iteration": 2.516160249710083 + }, + { + "auxiliary_loss_clip": 0.0110583, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01571035, + "balance_loss_mlp": 1.03704, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3192542299458547, + "language_loss": 0.65333968, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.674676, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9104, + "time_per_iteration": 2.8076846599578857 + }, + { + "auxiliary_loss_clip": 0.01110613, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.0188477, + "balance_loss_mlp": 1.03879666, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.7582225342351636, + "language_loss": 0.81346989, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83489728, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9105, + "time_per_iteration": 2.4436333179473877 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.01836777, + "balance_loss_mlp": 1.03727031, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.5498107295674015, + "language_loss": 0.80534816, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82670921, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9106, + "time_per_iteration": 2.5293564796447754 + }, + { + "auxiliary_loss_clip": 0.01108965, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.02120996, + "balance_loss_mlp": 1.03986609, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7454593746340303, + "language_loss": 0.69378364, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71519959, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9107, + "time_per_iteration": 2.423023223876953 + }, + { + "auxiliary_loss_clip": 0.01110146, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.01680255, + "balance_loss_mlp": 1.03831339, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.6483473327352183, + "language_loss": 0.63088882, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65228057, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 9108, + "time_per_iteration": 2.4629247188568115 + }, + { + "auxiliary_loss_clip": 0.01106827, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.01526904, + "balance_loss_mlp": 1.03832912, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6809972098624877, + "language_loss": 0.74894333, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77027702, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 9109, + "time_per_iteration": 2.445711851119995 + }, + { + "auxiliary_loss_clip": 0.01105646, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.02130747, + "balance_loss_mlp": 1.03783536, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.9460400321268034, + "language_loss": 0.77668434, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79807919, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 9110, + "time_per_iteration": 2.4724810123443604 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.02193475, + "balance_loss_mlp": 1.03772378, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.7745449116751173, + "language_loss": 0.71189445, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73327577, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9111, + "time_per_iteration": 2.5220110416412354 + }, + { + "auxiliary_loss_clip": 0.01108238, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.02525544, + "balance_loss_mlp": 1.03890049, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5754245119869974, + "language_loss": 0.71029758, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73174989, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 9112, + "time_per_iteration": 2.4876022338867188 + }, + { + "auxiliary_loss_clip": 0.01108992, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.0176518, + "balance_loss_mlp": 1.03795052, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.4321144529101946, + "language_loss": 0.88027447, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90165925, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 9113, + "time_per_iteration": 2.4495129585266113 + }, + { + "auxiliary_loss_clip": 0.01110892, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.01863575, + "balance_loss_mlp": 1.04015231, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.4380357531145453, + "language_loss": 0.73040199, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75182521, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9114, + "time_per_iteration": 2.49124813079834 + }, + { + "auxiliary_loss_clip": 0.0110468, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03658402, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.7175878836105734, + "language_loss": 0.72105908, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74242127, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9115, + "time_per_iteration": 2.4818665981292725 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.01915491, + "balance_loss_mlp": 1.03801298, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.8153830213846445, + "language_loss": 0.7222048, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74362183, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9116, + "time_per_iteration": 2.4857382774353027 + }, + { + "auxiliary_loss_clip": 0.01108168, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.02634406, + "balance_loss_mlp": 1.03931904, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 2.1442712779415025, + "language_loss": 0.76391387, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78538126, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9117, + "time_per_iteration": 2.481539726257324 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.02069592, + "balance_loss_mlp": 1.03559899, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.6184993035700161, + "language_loss": 0.62667149, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64801455, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9118, + "time_per_iteration": 2.582087516784668 + }, + { + "auxiliary_loss_clip": 0.01112715, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01705241, + "balance_loss_mlp": 1.04148602, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 2.080656601028848, + "language_loss": 0.79054701, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81197661, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9119, + "time_per_iteration": 2.431641101837158 + }, + { + "auxiliary_loss_clip": 0.01106769, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.02143443, + "balance_loss_mlp": 1.0393101, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.6818671426073972, + "language_loss": 0.82585561, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84724402, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9120, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02538443, + "balance_loss_mlp": 1.03979588, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.7397757233914666, + "language_loss": 0.80841327, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82989895, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9121, + "time_per_iteration": 2.449951171875 + }, + { + "auxiliary_loss_clip": 0.01108531, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02164376, + "balance_loss_mlp": 1.03663361, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 2.0253856212842662, + "language_loss": 0.61077833, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63220894, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9122, + "time_per_iteration": 2.4943363666534424 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.02135706, + "balance_loss_mlp": 1.03908038, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7986157880414966, + "language_loss": 0.71862841, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74002087, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.68359375, + "step": 9123, + "time_per_iteration": 2.4815285205841064 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.01702476, + "balance_loss_mlp": 1.03875828, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 1.9471016807647592, + "language_loss": 0.83393133, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.8552959, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9124, + "time_per_iteration": 2.442490816116333 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02254486, + "balance_loss_mlp": 1.04040182, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.9388864941150135, + "language_loss": 0.79954362, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82098156, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9125, + "time_per_iteration": 2.5117273330688477 + }, + { + "auxiliary_loss_clip": 0.01109363, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.01844716, + "balance_loss_mlp": 1.03870225, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.35248102892353, + "language_loss": 0.74499249, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76639402, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9126, + "time_per_iteration": 2.481576442718506 + }, + { + "auxiliary_loss_clip": 0.01110687, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.01794803, + "balance_loss_mlp": 1.03789783, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.4816786154583212, + "language_loss": 0.66715956, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68857968, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9127, + "time_per_iteration": 2.462186574935913 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02183664, + "balance_loss_mlp": 1.03737557, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.7392555793748137, + "language_loss": 0.83598024, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85740006, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9128, + "time_per_iteration": 2.4559218883514404 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01948333, + "balance_loss_mlp": 1.03735828, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.8252742071628254, + "language_loss": 0.74370325, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76509559, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 9129, + "time_per_iteration": 2.443394422531128 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.0391345, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.8843985474075557, + "language_loss": 0.6325981, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65401739, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9130, + "time_per_iteration": 2.424933433532715 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02319455, + "balance_loss_mlp": 1.03782725, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 2.1259011139704804, + "language_loss": 0.62936115, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65081537, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9131, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.01109907, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01925564, + "balance_loss_mlp": 1.03880227, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.7299030045344002, + "language_loss": 0.74452615, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76594955, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9132, + "time_per_iteration": 2.456127166748047 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.01656091, + "balance_loss_mlp": 1.03589082, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.6111198761107228, + "language_loss": 0.8129831, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83432209, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.703125, + "step": 9133, + "time_per_iteration": 2.490236759185791 + }, + { + "auxiliary_loss_clip": 0.01106997, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02131128, + "balance_loss_mlp": 1.03802598, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.7268592344479874, + "language_loss": 0.70094633, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72235036, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9134, + "time_per_iteration": 3.827064275741577 + }, + { + "auxiliary_loss_clip": 0.01110087, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02014494, + "balance_loss_mlp": 1.03806603, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 1.744868024388231, + "language_loss": 0.61109304, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63251662, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 9135, + "time_per_iteration": 2.730273723602295 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02089787, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.7368953039767876, + "language_loss": 0.72582811, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74728173, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9136, + "time_per_iteration": 2.483704090118408 + }, + { + "auxiliary_loss_clip": 0.01111013, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.02256799, + "balance_loss_mlp": 1.03636873, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 3.852349726597511, + "language_loss": 0.68771708, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70918733, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 9137, + "time_per_iteration": 5.456461191177368 + }, + { + "auxiliary_loss_clip": 0.01031834, + "auxiliary_loss_mlp": 0.01007044, + "balance_loss_clip": 1.00602436, + "balance_loss_mlp": 1.01015878, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.9040496486989937, + "language_loss": 0.6527245, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.21679688, + "step": 9138, + "time_per_iteration": 4.559895753860474 + }, + { + "auxiliary_loss_clip": 0.01109871, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.01828778, + "balance_loss_mlp": 1.03911173, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.6793798945838962, + "language_loss": 0.74981934, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7712279, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9139, + "time_per_iteration": 2.4897236824035645 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03827238, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.7331605634368676, + "language_loss": 0.71274745, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73412126, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9140, + "time_per_iteration": 2.416760206222534 + }, + { + "auxiliary_loss_clip": 0.01105846, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.02035391, + "balance_loss_mlp": 1.03625703, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.6373657351429003, + "language_loss": 0.76304853, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78444046, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9141, + "time_per_iteration": 2.495957612991333 + }, + { + "auxiliary_loss_clip": 0.01104653, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01651192, + "balance_loss_mlp": 1.03816998, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.8000642859490852, + "language_loss": 0.74711812, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.76845098, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 9142, + "time_per_iteration": 2.476701259613037 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.01920366, + "balance_loss_mlp": 1.04044414, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 3.087747357168804, + "language_loss": 0.76516807, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78662473, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 9143, + "time_per_iteration": 2.4777820110321045 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.0188787, + "balance_loss_mlp": 1.03639066, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 4.124964872446098, + "language_loss": 0.79934669, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82070994, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 9144, + "time_per_iteration": 2.470946788787842 + }, + { + "auxiliary_loss_clip": 0.01109215, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.0163275, + "balance_loss_mlp": 1.03886819, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 2.259125962742438, + "language_loss": 0.71273595, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73411608, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9145, + "time_per_iteration": 2.5155293941497803 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03797007, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.760392083970442, + "language_loss": 0.70398986, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72534567, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 9146, + "time_per_iteration": 2.5837745666503906 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.01897275, + "balance_loss_mlp": 1.03747129, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.7328002119898687, + "language_loss": 0.6403445, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66168791, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 9147, + "time_per_iteration": 2.5004754066467285 + }, + { + "auxiliary_loss_clip": 0.01110115, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01714349, + "balance_loss_mlp": 1.04033351, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.3129813772985854, + "language_loss": 0.80632472, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82771873, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9148, + "time_per_iteration": 2.4941914081573486 + }, + { + "auxiliary_loss_clip": 0.01109987, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.0224669, + "balance_loss_mlp": 1.04013515, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.5952381042001647, + "language_loss": 0.78739786, + "learning_rate": 1.773237789559453e-06, + "loss": 0.80884099, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69921875, + "step": 9149, + "time_per_iteration": 2.5276949405670166 + }, + { + "auxiliary_loss_clip": 0.01108964, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.01695323, + "balance_loss_mlp": 1.03880644, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.0296810240639847, + "language_loss": 0.72119236, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74257326, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9150, + "time_per_iteration": 2.4646284580230713 + }, + { + "auxiliary_loss_clip": 0.01110946, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01450694, + "balance_loss_mlp": 1.03812099, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.6901514106805953, + "language_loss": 0.74800563, + "learning_rate": 1.772463906245477e-06, + "loss": 0.76939499, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9151, + "time_per_iteration": 2.4528467655181885 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.01572907, + "balance_loss_mlp": 1.03945291, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.835684303690663, + "language_loss": 0.76049578, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78186262, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9152, + "time_per_iteration": 2.4587628841400146 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.01691318, + "balance_loss_mlp": 1.03700173, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.7890824738540096, + "language_loss": 0.82162666, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84296966, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 9153, + "time_per_iteration": 2.490391492843628 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.02004111, + "balance_loss_mlp": 1.03787208, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7732052023343188, + "language_loss": 0.74143934, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76283687, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9154, + "time_per_iteration": 2.5304152965545654 + }, + { + "auxiliary_loss_clip": 0.01113689, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02184761, + "balance_loss_mlp": 1.04016376, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.4983591953206352, + "language_loss": 0.7257731, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74726045, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9155, + "time_per_iteration": 2.4642586708068848 + }, + { + "auxiliary_loss_clip": 0.01033812, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 1.00080609, + "balance_loss_mlp": 1.01202416, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7480439065154532, + "language_loss": 0.55414248, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57449913, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.21777344, + "step": 9156, + "time_per_iteration": 3.184554100036621 + }, + { + "auxiliary_loss_clip": 0.0110658, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01741338, + "balance_loss_mlp": 1.0373919, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.690497670143624, + "language_loss": 0.82608092, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84744143, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9157, + "time_per_iteration": 2.4718377590179443 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01764512, + "balance_loss_mlp": 1.04140961, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.5846917450647138, + "language_loss": 0.75262648, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77409017, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9158, + "time_per_iteration": 2.483400583267212 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01863742, + "balance_loss_mlp": 1.0392096, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.6248211907364027, + "language_loss": 0.69624805, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71761608, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 9159, + "time_per_iteration": 2.5159049034118652 + }, + { + "auxiliary_loss_clip": 0.01110817, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01913416, + "balance_loss_mlp": 1.03923249, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.7392637683079002, + "language_loss": 0.67766821, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.69909644, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9160, + "time_per_iteration": 2.5915122032165527 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.02106166, + "balance_loss_mlp": 1.03855252, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.9414097965551829, + "language_loss": 0.71404171, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7354309, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9161, + "time_per_iteration": 2.4698691368103027 + }, + { + "auxiliary_loss_clip": 0.0110819, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.02365494, + "balance_loss_mlp": 1.03864145, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 2.0077015754602985, + "language_loss": 0.69346386, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71490568, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9162, + "time_per_iteration": 2.514615297317505 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.02538323, + "balance_loss_mlp": 1.03850245, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6272332912595904, + "language_loss": 0.8531208, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87457901, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 9163, + "time_per_iteration": 2.55450439453125 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02065635, + "balance_loss_mlp": 1.0394969, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.5452929110279412, + "language_loss": 0.8063103, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.8277117, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9164, + "time_per_iteration": 2.477283239364624 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0152092, + "balance_loss_mlp": 1.04160368, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.8276675469309818, + "language_loss": 0.73409986, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75550359, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9165, + "time_per_iteration": 2.4870002269744873 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01700497, + "balance_loss_mlp": 1.03732443, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.8849650051461906, + "language_loss": 0.79019225, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81153595, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9166, + "time_per_iteration": 2.435049295425415 + }, + { + "auxiliary_loss_clip": 0.01108748, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.01596665, + "balance_loss_mlp": 1.03822398, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.033929506473001, + "language_loss": 0.76165509, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78302646, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9167, + "time_per_iteration": 2.474677562713623 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01631832, + "balance_loss_mlp": 1.03744709, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.261050601267758, + "language_loss": 0.79845661, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.81980425, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9168, + "time_per_iteration": 2.484435796737671 + }, + { + "auxiliary_loss_clip": 0.01110227, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.03901529, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.641322965099804, + "language_loss": 0.68934894, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71080542, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9169, + "time_per_iteration": 2.5206069946289062 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.014714, + "balance_loss_mlp": 1.03545678, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.0185216192280553, + "language_loss": 0.85350084, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87478477, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 9170, + "time_per_iteration": 2.4762823581695557 + }, + { + "auxiliary_loss_clip": 0.01031617, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.00143194, + "balance_loss_mlp": 1.00984073, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7807167648980764, + "language_loss": 0.5990442, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61938488, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21777344, + "step": 9171, + "time_per_iteration": 3.0934739112854004 + }, + { + "auxiliary_loss_clip": 0.01106302, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.02024603, + "balance_loss_mlp": 1.03768301, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.4242208217777272, + "language_loss": 0.701002, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72239029, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9172, + "time_per_iteration": 2.482672929763794 + }, + { + "auxiliary_loss_clip": 0.01104259, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.0176115, + "balance_loss_mlp": 1.03602123, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.708440744181033, + "language_loss": 0.75790203, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77924281, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9173, + "time_per_iteration": 2.476710557937622 + }, + { + "auxiliary_loss_clip": 0.01104019, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.02019644, + "balance_loss_mlp": 1.0371182, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.5740431144983165, + "language_loss": 0.74457419, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.76594019, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 9174, + "time_per_iteration": 2.4599406719207764 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.0173409, + "balance_loss_mlp": 1.03827941, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.784111045924148, + "language_loss": 0.72615731, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74753261, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9175, + "time_per_iteration": 2.5028982162475586 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02403331, + "balance_loss_mlp": 1.0378927, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8209397746213287, + "language_loss": 0.69452918, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71596849, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9176, + "time_per_iteration": 3.852022171020508 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01612973, + "balance_loss_mlp": 1.03734601, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.7630507090786165, + "language_loss": 0.70797551, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.7293011, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9177, + "time_per_iteration": 2.507990837097168 + }, + { + "auxiliary_loss_clip": 0.01109377, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01644588, + "balance_loss_mlp": 1.03980064, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.556329351454275, + "language_loss": 0.80197215, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82334423, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 9178, + "time_per_iteration": 2.4645802974700928 + }, + { + "auxiliary_loss_clip": 0.01110368, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.02744687, + "balance_loss_mlp": 1.03942454, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.5358645892565401, + "language_loss": 0.74621391, + "learning_rate": 1.761633217089826e-06, + "loss": 0.7677213, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9179, + "time_per_iteration": 4.023995399475098 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02213681, + "balance_loss_mlp": 1.0385108, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.8924336027697886, + "language_loss": 0.70433038, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72574437, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 9180, + "time_per_iteration": 4.060170650482178 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.02197158, + "balance_loss_mlp": 1.03808069, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.9150410275355574, + "language_loss": 0.66870642, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69012666, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9181, + "time_per_iteration": 2.4741644859313965 + }, + { + "auxiliary_loss_clip": 0.01109873, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01514149, + "balance_loss_mlp": 1.03774214, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9118124234638791, + "language_loss": 0.79398257, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81536245, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9182, + "time_per_iteration": 2.4744672775268555 + }, + { + "auxiliary_loss_clip": 0.01107607, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.0145787, + "balance_loss_mlp": 1.03817368, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.7815316362256517, + "language_loss": 0.82710314, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.84845054, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9183, + "time_per_iteration": 2.4999542236328125 + }, + { + "auxiliary_loss_clip": 0.01106614, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01474106, + "balance_loss_mlp": 1.03841662, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.3300741669264389, + "language_loss": 0.67200708, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69333941, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9184, + "time_per_iteration": 2.4747231006622314 + }, + { + "auxiliary_loss_clip": 0.01107758, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.01336932, + "balance_loss_mlp": 1.03818047, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.521307728440283, + "language_loss": 0.76197934, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78331435, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9185, + "time_per_iteration": 2.534573793411255 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.02194285, + "balance_loss_mlp": 1.0396924, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6519250451143856, + "language_loss": 0.7376985, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75913298, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9186, + "time_per_iteration": 2.5148305892944336 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.02137351, + "balance_loss_mlp": 1.04041481, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.3297788732806275, + "language_loss": 0.6611231, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68256783, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.7109375, + "step": 9187, + "time_per_iteration": 2.4953529834747314 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01550388, + "balance_loss_mlp": 1.0389905, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.699111440652827, + "language_loss": 0.77629888, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79766524, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9188, + "time_per_iteration": 2.4593770503997803 + }, + { + "auxiliary_loss_clip": 0.01105648, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.01829576, + "balance_loss_mlp": 1.03729725, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.837373875573988, + "language_loss": 0.81666493, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83803099, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 9189, + "time_per_iteration": 2.514223098754883 + }, + { + "auxiliary_loss_clip": 0.01105635, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.0153625, + "balance_loss_mlp": 1.03796136, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.3687672594772107, + "language_loss": 0.76419669, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78553367, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 9190, + "time_per_iteration": 2.4991939067840576 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.01837981, + "balance_loss_mlp": 1.03823757, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 3.1168017297152484, + "language_loss": 0.78959441, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81102753, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9191, + "time_per_iteration": 2.447239875793457 + }, + { + "auxiliary_loss_clip": 0.01106392, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01799178, + "balance_loss_mlp": 1.03781414, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.1697062429363427, + "language_loss": 0.68734175, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70870626, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 9192, + "time_per_iteration": 2.424194812774658 + }, + { + "auxiliary_loss_clip": 0.01104657, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.0210079, + "balance_loss_mlp": 1.03741503, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.580245881596358, + "language_loss": 0.77429307, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79565763, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 9193, + "time_per_iteration": 2.486544370651245 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02199149, + "balance_loss_mlp": 1.03775311, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.6936547327162281, + "language_loss": 0.78554469, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80694956, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9194, + "time_per_iteration": 2.446010112762451 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01032697, + "balance_loss_clip": 1.01982856, + "balance_loss_mlp": 1.03737998, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.6547854303314034, + "language_loss": 0.69580936, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71725023, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 9195, + "time_per_iteration": 2.633622407913208 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.01669717, + "balance_loss_mlp": 1.0401336, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.085899367605988, + "language_loss": 0.73877811, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76022422, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 9196, + "time_per_iteration": 2.4477953910827637 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.0202986, + "balance_loss_mlp": 1.03845131, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5760086547957552, + "language_loss": 0.76767844, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78907609, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9197, + "time_per_iteration": 2.4946064949035645 + }, + { + "auxiliary_loss_clip": 0.01104392, + "auxiliary_loss_mlp": 0.0102516, + "balance_loss_clip": 1.01429963, + "balance_loss_mlp": 1.03566051, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.6045583807501234, + "language_loss": 0.76419538, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78549087, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6875, + "step": 9198, + "time_per_iteration": 2.7027511596679688 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.0140028, + "balance_loss_mlp": 1.03652, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.7911524754161214, + "language_loss": 0.79089695, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81220573, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 9199, + "time_per_iteration": 2.5071682929992676 + }, + { + "auxiliary_loss_clip": 0.01106031, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03667951, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.789754163992573, + "language_loss": 0.64116317, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66252816, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9200, + "time_per_iteration": 2.453810214996338 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.01444292, + "balance_loss_mlp": 1.03949916, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.54627322023295, + "language_loss": 0.66172588, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.6831286, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9201, + "time_per_iteration": 2.5050048828125 + }, + { + "auxiliary_loss_clip": 0.01110041, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.02000964, + "balance_loss_mlp": 1.04039264, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.1300156031813624, + "language_loss": 0.60931027, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63073778, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9202, + "time_per_iteration": 2.454374074935913 + }, + { + "auxiliary_loss_clip": 0.01105546, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.01677179, + "balance_loss_mlp": 1.0374378, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.6333926311503897, + "language_loss": 0.64007318, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66141224, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9203, + "time_per_iteration": 2.520813226699829 + }, + { + "auxiliary_loss_clip": 0.01106796, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03710103, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5876710884236471, + "language_loss": 0.63839149, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65975416, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9204, + "time_per_iteration": 2.519796371459961 + }, + { + "auxiliary_loss_clip": 0.01103569, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01617515, + "balance_loss_mlp": 1.0357914, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.7042490030554438, + "language_loss": 0.77431834, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79562324, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 9205, + "time_per_iteration": 2.5149800777435303 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01799703, + "balance_loss_mlp": 1.03753543, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.5447277527142993, + "language_loss": 0.72338134, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74473095, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9206, + "time_per_iteration": 2.6088132858276367 + }, + { + "auxiliary_loss_clip": 0.01107088, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01891243, + "balance_loss_mlp": 1.03847539, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.9679878300179545, + "language_loss": 0.75601065, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77738333, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 9207, + "time_per_iteration": 2.4550647735595703 + }, + { + "auxiliary_loss_clip": 0.01112139, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.01861894, + "balance_loss_mlp": 1.03909707, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.4900859433120055, + "language_loss": 0.61790574, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6393466, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 9208, + "time_per_iteration": 2.4474070072174072 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01908827, + "balance_loss_mlp": 1.03917742, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.138498398763569, + "language_loss": 0.64059991, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66200066, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9209, + "time_per_iteration": 2.49118709564209 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02030945, + "balance_loss_mlp": 1.03779769, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.9091325066097349, + "language_loss": 0.8244276, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84582424, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9210, + "time_per_iteration": 2.479508876800537 + }, + { + "auxiliary_loss_clip": 0.01104462, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.01554608, + "balance_loss_mlp": 1.03640354, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.9903415105614328, + "language_loss": 0.72810864, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74942476, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9211, + "time_per_iteration": 2.56174635887146 + }, + { + "auxiliary_loss_clip": 0.01109862, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.02101886, + "balance_loss_mlp": 1.03795481, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.687820261734967, + "language_loss": 0.66492426, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68636549, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9212, + "time_per_iteration": 2.4493961334228516 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.01846039, + "balance_loss_mlp": 1.03564453, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.478127311181698, + "language_loss": 0.51676697, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53816813, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 9213, + "time_per_iteration": 2.5872037410736084 + }, + { + "auxiliary_loss_clip": 0.01111386, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.01868105, + "balance_loss_mlp": 1.03979373, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.9151587743929102, + "language_loss": 0.8548407, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87626791, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9214, + "time_per_iteration": 2.4696502685546875 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.01570582, + "balance_loss_mlp": 1.03970075, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.700191688942819, + "language_loss": 0.70016778, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72152174, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9215, + "time_per_iteration": 2.50022029876709 + }, + { + "auxiliary_loss_clip": 0.01109258, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.01724386, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.5266679061001223, + "language_loss": 0.73124695, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75263906, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9216, + "time_per_iteration": 2.4683403968811035 + }, + { + "auxiliary_loss_clip": 0.01105693, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.01916385, + "balance_loss_mlp": 1.03830385, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.9596921442179602, + "language_loss": 0.71501839, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73638952, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 9217, + "time_per_iteration": 2.542431592941284 + }, + { + "auxiliary_loss_clip": 0.01103432, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.01331282, + "balance_loss_mlp": 1.03553486, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.8113809838055568, + "language_loss": 0.7838676, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80515093, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9218, + "time_per_iteration": 3.8476054668426514 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.02095163, + "balance_loss_mlp": 1.03540277, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 2.0355993872839675, + "language_loss": 0.72591358, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74734467, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9219, + "time_per_iteration": 2.4924545288085938 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01039554, + "balance_loss_clip": 1.02701962, + "balance_loss_mlp": 1.03986812, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.546677051774663, + "language_loss": 0.71403503, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73554587, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 9220, + "time_per_iteration": 2.4362480640411377 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.01424217, + "balance_loss_mlp": 1.03777957, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6357699921116782, + "language_loss": 0.79294407, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81426674, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9221, + "time_per_iteration": 5.3692920207977295 + }, + { + "auxiliary_loss_clip": 0.01107012, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.01745725, + "balance_loss_mlp": 1.03750253, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.7434924477802918, + "language_loss": 0.83865321, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86002505, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9222, + "time_per_iteration": 2.5054023265838623 + }, + { + "auxiliary_loss_clip": 0.0111308, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.02092493, + "balance_loss_mlp": 1.04003119, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.7723513069494143, + "language_loss": 0.75498754, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77646095, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9223, + "time_per_iteration": 2.5140554904937744 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.0155921, + "balance_loss_mlp": 1.03917074, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.798104527740367, + "language_loss": 0.81975842, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84112704, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9224, + "time_per_iteration": 2.5273303985595703 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01040562, + "balance_loss_clip": 1.02769315, + "balance_loss_mlp": 1.0393647, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.764116317399656, + "language_loss": 0.5700891, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59160185, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9225, + "time_per_iteration": 2.4379100799560547 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.02342129, + "balance_loss_mlp": 1.03836024, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5085866030732613, + "language_loss": 0.67495418, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69641924, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9226, + "time_per_iteration": 2.4891088008880615 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.01961827, + "balance_loss_mlp": 1.03644681, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4051697234065024, + "language_loss": 0.74315172, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76454705, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9227, + "time_per_iteration": 2.4678173065185547 + }, + { + "auxiliary_loss_clip": 0.01114145, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01826084, + "balance_loss_mlp": 1.04228091, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 2.5448731753452405, + "language_loss": 0.73452151, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75597215, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9228, + "time_per_iteration": 2.4851813316345215 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01803231, + "balance_loss_mlp": 1.03902888, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 2.153919283771507, + "language_loss": 0.76069826, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.7821005, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9229, + "time_per_iteration": 2.4682509899139404 + }, + { + "auxiliary_loss_clip": 0.01110192, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.02623343, + "balance_loss_mlp": 1.03956127, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.3529022003633056, + "language_loss": 0.68695533, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70845366, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 9230, + "time_per_iteration": 2.4558916091918945 + }, + { + "auxiliary_loss_clip": 0.01110086, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.02054107, + "balance_loss_mlp": 1.03759503, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.0513203800368327, + "language_loss": 0.67574155, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69717568, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9231, + "time_per_iteration": 2.4816246032714844 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.02160072, + "balance_loss_mlp": 1.03941798, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.5458592279354035, + "language_loss": 0.77953124, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80101693, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9232, + "time_per_iteration": 2.5161256790161133 + }, + { + "auxiliary_loss_clip": 0.01106102, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02622199, + "balance_loss_mlp": 1.03777027, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.5305081634070101, + "language_loss": 0.82585824, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84729433, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 9233, + "time_per_iteration": 2.513498306274414 + }, + { + "auxiliary_loss_clip": 0.01112184, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.02717805, + "balance_loss_mlp": 1.03902006, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.1768956460608053, + "language_loss": 0.75171268, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77322543, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 9234, + "time_per_iteration": 2.4618585109710693 + }, + { + "auxiliary_loss_clip": 0.01105123, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.01663136, + "balance_loss_mlp": 1.03685272, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.1362991517660146, + "language_loss": 0.64992738, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.6712625, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9235, + "time_per_iteration": 2.4449851512908936 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.01977587, + "balance_loss_mlp": 1.03794515, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.8479272776295672, + "language_loss": 0.67863953, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70005023, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9236, + "time_per_iteration": 2.4798662662506104 + }, + { + "auxiliary_loss_clip": 0.01104311, + "auxiliary_loss_mlp": 0.01024908, + "balance_loss_clip": 1.0127244, + "balance_loss_mlp": 1.03731084, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 3.129052058582791, + "language_loss": 0.86174095, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88303316, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9237, + "time_per_iteration": 2.4789483547210693 + }, + { + "auxiliary_loss_clip": 0.01104495, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.02062178, + "balance_loss_mlp": 1.03669763, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.712591160520522, + "language_loss": 0.73281908, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75419307, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9238, + "time_per_iteration": 2.4812166690826416 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.01997221, + "balance_loss_mlp": 1.03750467, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.5735650405734192, + "language_loss": 0.78268331, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80410492, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9239, + "time_per_iteration": 2.6846883296966553 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.01580429, + "balance_loss_mlp": 1.03730011, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.4802036052022307, + "language_loss": 0.79760826, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.81896698, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9240, + "time_per_iteration": 2.4733242988586426 + }, + { + "auxiliary_loss_clip": 0.01107185, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02035236, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.5810234034759716, + "language_loss": 0.6520583, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67345387, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9241, + "time_per_iteration": 2.4733994007110596 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.03843307, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.9354963557050642, + "language_loss": 0.72742647, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74883944, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9242, + "time_per_iteration": 2.439195394515991 + }, + { + "auxiliary_loss_clip": 0.01109113, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02429366, + "balance_loss_mlp": 1.03737354, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.6615305539564786, + "language_loss": 0.63989079, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66135651, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9243, + "time_per_iteration": 2.5009653568267822 + }, + { + "auxiliary_loss_clip": 0.01109943, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01894689, + "balance_loss_mlp": 1.03998828, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.8112849174534187, + "language_loss": 0.75149089, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77290273, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9244, + "time_per_iteration": 2.475520610809326 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.01693511, + "balance_loss_mlp": 1.03605533, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.1432873648263473, + "language_loss": 0.74578094, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76708734, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9245, + "time_per_iteration": 2.45875883102417 + }, + { + "auxiliary_loss_clip": 0.01111156, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.01832068, + "balance_loss_mlp": 1.03885865, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.0585608296199, + "language_loss": 0.79468071, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81610441, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 9246, + "time_per_iteration": 2.5065393447875977 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03894639, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.99088564820557, + "language_loss": 0.73864704, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76005793, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9247, + "time_per_iteration": 2.535578489303589 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.02047944, + "balance_loss_mlp": 1.03822637, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.9448346084731214, + "language_loss": 0.76161623, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78303373, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9248, + "time_per_iteration": 2.4247324466705322 + }, + { + "auxiliary_loss_clip": 0.01031453, + "auxiliary_loss_mlp": 0.01002871, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00995636, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8418132845618771, + "language_loss": 0.59482312, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61516631, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21484375, + "step": 9249, + "time_per_iteration": 3.1760778427124023 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.01514411, + "balance_loss_mlp": 1.03505003, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8510226601540976, + "language_loss": 0.79942709, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82074124, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9250, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01107715, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.01952708, + "balance_loss_mlp": 1.03654897, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.627943398678235, + "language_loss": 0.68456143, + "learning_rate": 1.733816187358836e-06, + "loss": 0.70596004, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9251, + "time_per_iteration": 2.4627792835235596 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01753998, + "balance_loss_mlp": 1.03680301, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.9270315036455492, + "language_loss": 0.75472188, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77608371, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9252, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02020955, + "balance_loss_mlp": 1.0379473, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.5243167641625328, + "language_loss": 0.72841972, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74984354, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9253, + "time_per_iteration": 2.545469045639038 + }, + { + "auxiliary_loss_clip": 0.01108615, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01835763, + "balance_loss_mlp": 1.03873754, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.7630844010149394, + "language_loss": 0.8319999, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85338461, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 9254, + "time_per_iteration": 2.4762439727783203 + }, + { + "auxiliary_loss_clip": 0.01028463, + "auxiliary_loss_mlp": 0.00998119, + "balance_loss_clip": 0.99699229, + "balance_loss_mlp": 1.00661826, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.880020971367601, + "language_loss": 0.64831799, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66858381, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21875, + "step": 9255, + "time_per_iteration": 2.894592523574829 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.02330816, + "balance_loss_mlp": 1.04103208, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.9305562864951415, + "language_loss": 0.69224131, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71367919, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9256, + "time_per_iteration": 2.489379644393921 + }, + { + "auxiliary_loss_clip": 0.01102517, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01805019, + "balance_loss_mlp": 1.03555584, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.531147439374393, + "language_loss": 0.75793779, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77925408, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 9257, + "time_per_iteration": 2.484574794769287 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02477169, + "balance_loss_mlp": 1.03559875, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 4.5210433992726635, + "language_loss": 0.61403644, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63546175, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 9258, + "time_per_iteration": 2.4358863830566406 + }, + { + "auxiliary_loss_clip": 0.0110731, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01988161, + "balance_loss_mlp": 1.0372082, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.630618195357818, + "language_loss": 0.79231477, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81371492, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9259, + "time_per_iteration": 3.931269884109497 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.01859045, + "balance_loss_mlp": 1.03665948, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.9981692343252953, + "language_loss": 0.81332636, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83469915, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9260, + "time_per_iteration": 2.5092766284942627 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02450144, + "balance_loss_mlp": 1.03862071, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4782542821591422, + "language_loss": 0.68771613, + "learning_rate": 1.729956725348256e-06, + "loss": 0.70917082, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 9261, + "time_per_iteration": 2.5739381313323975 + }, + { + "auxiliary_loss_clip": 0.01027391, + "auxiliary_loss_mlp": 0.01004087, + "balance_loss_clip": 1.00296021, + "balance_loss_mlp": 1.00587916, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7282105219345391, + "language_loss": 0.61132908, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63164389, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21484375, + "step": 9262, + "time_per_iteration": 5.870652675628662 + }, + { + "auxiliary_loss_clip": 0.01108355, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.02145159, + "balance_loss_mlp": 1.0379622, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.6754840031905727, + "language_loss": 0.64504874, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66646421, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9263, + "time_per_iteration": 3.9533426761627197 + }, + { + "auxiliary_loss_clip": 0.01107431, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02047563, + "balance_loss_mlp": 1.03795195, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.058460487271679, + "language_loss": 0.73137188, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75277007, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9264, + "time_per_iteration": 2.493511199951172 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.02008092, + "balance_loss_mlp": 1.04015422, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.9025948017547305, + "language_loss": 0.75953865, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78095955, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9265, + "time_per_iteration": 2.4533309936523438 + }, + { + "auxiliary_loss_clip": 0.01103692, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.01909113, + "balance_loss_mlp": 1.03774786, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.366142740242795, + "language_loss": 0.7096293, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73096645, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 9266, + "time_per_iteration": 2.5045597553253174 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.02204871, + "balance_loss_mlp": 1.03720617, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7291111077620351, + "language_loss": 0.681355, + "learning_rate": 1.727641538728533e-06, + "loss": 0.7027576, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9267, + "time_per_iteration": 2.5197811126708984 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02367473, + "balance_loss_mlp": 1.03763127, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.9159467095237732, + "language_loss": 0.74278724, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76417124, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 9268, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01105844, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.0188365, + "balance_loss_mlp": 1.03773642, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 2.490438410193009, + "language_loss": 0.7539283, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77528816, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 9269, + "time_per_iteration": 2.5165016651153564 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01847041, + "balance_loss_mlp": 1.0366416, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.5593232015543566, + "language_loss": 0.82527506, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84663379, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 9270, + "time_per_iteration": 2.495546579360962 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.01927948, + "balance_loss_mlp": 1.03695226, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.4402155421947485, + "language_loss": 0.79217434, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81356287, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9271, + "time_per_iteration": 2.5050055980682373 + }, + { + "auxiliary_loss_clip": 0.01107417, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.01938963, + "balance_loss_mlp": 1.03778744, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.994384891359262, + "language_loss": 0.90424085, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92562819, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9272, + "time_per_iteration": 2.455949068069458 + }, + { + "auxiliary_loss_clip": 0.0110516, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01864076, + "balance_loss_mlp": 1.03754234, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.979276269767202, + "language_loss": 0.83862162, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85997909, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9273, + "time_per_iteration": 2.4802021980285645 + }, + { + "auxiliary_loss_clip": 0.01108902, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02752197, + "balance_loss_mlp": 1.03908944, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.0454885443684905, + "language_loss": 0.73996758, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76146781, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 9274, + "time_per_iteration": 2.4761173725128174 + }, + { + "auxiliary_loss_clip": 0.01116526, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.02121544, + "balance_loss_mlp": 1.04015088, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.9773966002159824, + "language_loss": 0.78126067, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.8027705, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 9275, + "time_per_iteration": 2.4496877193450928 + }, + { + "auxiliary_loss_clip": 0.01106389, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.03767419, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.6885485925360224, + "language_loss": 0.74829316, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76965177, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9276, + "time_per_iteration": 2.413726806640625 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.03508329, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.7672131346084554, + "language_loss": 0.75013113, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77147532, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9277, + "time_per_iteration": 2.4982142448425293 + }, + { + "auxiliary_loss_clip": 0.01102538, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02155328, + "balance_loss_mlp": 1.03504467, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.8714980055762023, + "language_loss": 0.71817064, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73952222, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9278, + "time_per_iteration": 2.4389007091522217 + }, + { + "auxiliary_loss_clip": 0.01109043, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.02185118, + "balance_loss_mlp": 1.0372287, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.6538282955120047, + "language_loss": 0.75750679, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77894545, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 9279, + "time_per_iteration": 2.5255484580993652 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01767397, + "balance_loss_mlp": 1.03544426, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 2.2545627368714034, + "language_loss": 0.67431748, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69566512, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9280, + "time_per_iteration": 2.5258350372314453 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02595139, + "balance_loss_mlp": 1.03626418, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.676674952402485, + "language_loss": 0.72964156, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75109941, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 9281, + "time_per_iteration": 2.505610466003418 + }, + { + "auxiliary_loss_clip": 0.01106676, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.01922011, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 2.9649443100281627, + "language_loss": 0.75254506, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77392066, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9282, + "time_per_iteration": 2.444455623626709 + }, + { + "auxiliary_loss_clip": 0.01104903, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.01507115, + "balance_loss_mlp": 1.03695285, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.6849195839549764, + "language_loss": 0.66588777, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68720585, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9283, + "time_per_iteration": 2.4668378829956055 + }, + { + "auxiliary_loss_clip": 0.01105958, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.01500154, + "balance_loss_mlp": 1.03703356, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 2.7565054625366305, + "language_loss": 0.8290503, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85036725, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9284, + "time_per_iteration": 2.430774688720703 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.01653099, + "balance_loss_mlp": 1.03554368, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.3933521300057836, + "language_loss": 0.85047686, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87182522, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9285, + "time_per_iteration": 2.4788479804992676 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03531575, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 3.198131799092361, + "language_loss": 0.73653531, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75790572, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9286, + "time_per_iteration": 2.439715623855591 + }, + { + "auxiliary_loss_clip": 0.01107925, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01946068, + "balance_loss_mlp": 1.03781044, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.684452503968906, + "language_loss": 0.74169838, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76308966, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 9287, + "time_per_iteration": 2.534813642501831 + }, + { + "auxiliary_loss_clip": 0.01112227, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.01780486, + "balance_loss_mlp": 1.03982437, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 2.339953652318452, + "language_loss": 0.75018406, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77161086, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9288, + "time_per_iteration": 2.470242977142334 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.0241785, + "balance_loss_mlp": 1.03922033, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 1.8804248151935914, + "language_loss": 0.77241838, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79388785, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9289, + "time_per_iteration": 2.5357422828674316 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01918483, + "balance_loss_mlp": 1.03802335, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.7341259817318901, + "language_loss": 0.61310709, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63455033, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.74609375, + "step": 9290, + "time_per_iteration": 2.479149580001831 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01678467, + "balance_loss_mlp": 1.03602409, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 1.9512495779204855, + "language_loss": 0.67988908, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70124876, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9291, + "time_per_iteration": 2.4684019088745117 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.0363071, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 2.2522167745355524, + "language_loss": 0.83802187, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.85943532, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9292, + "time_per_iteration": 2.550994873046875 + }, + { + "auxiliary_loss_clip": 0.01104675, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.02180171, + "balance_loss_mlp": 1.03674221, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 1.8368239448999808, + "language_loss": 0.73363894, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75502205, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9293, + "time_per_iteration": 2.5334718227386475 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.02023864, + "balance_loss_mlp": 1.03715324, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.6770372644425844, + "language_loss": 0.7251429, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.7465046, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9294, + "time_per_iteration": 2.4782567024230957 + }, + { + "auxiliary_loss_clip": 0.01107679, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01867914, + "balance_loss_mlp": 1.03769052, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.2895769976939437, + "language_loss": 0.68138099, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70276403, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 9295, + "time_per_iteration": 2.433671474456787 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.01949131, + "balance_loss_mlp": 1.03873825, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.7275865639530346, + "language_loss": 0.80619705, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82760113, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 9296, + "time_per_iteration": 2.4831361770629883 + }, + { + "auxiliary_loss_clip": 0.01106832, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.0200243, + "balance_loss_mlp": 1.03788233, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.8948732644892212, + "language_loss": 0.65465128, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67603648, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9297, + "time_per_iteration": 2.4711036682128906 + }, + { + "auxiliary_loss_clip": 0.01109853, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02202153, + "balance_loss_mlp": 1.03785491, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.6800872146948855, + "language_loss": 0.7513994, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77284867, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9298, + "time_per_iteration": 2.421066999435425 + }, + { + "auxiliary_loss_clip": 0.01031879, + "auxiliary_loss_mlp": 0.01018081, + "balance_loss_clip": 1.01695406, + "balance_loss_mlp": 1.01014686, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6830476030131911, + "language_loss": 0.52463478, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54513437, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21777344, + "step": 9299, + "time_per_iteration": 3.096731424331665 + }, + { + "auxiliary_loss_clip": 0.0110307, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01799822, + "balance_loss_mlp": 1.03608131, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.8758260689947703, + "language_loss": 0.68378884, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70511478, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9300, + "time_per_iteration": 2.5355281829833984 + }, + { + "auxiliary_loss_clip": 0.01107824, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.02798903, + "balance_loss_mlp": 1.0372839, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 1.868740801794004, + "language_loss": 0.81233132, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.83381754, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9301, + "time_per_iteration": 3.9131312370300293 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01026194, + "balance_loss_clip": 1.01370668, + "balance_loss_mlp": 1.03488898, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.564037719481304, + "language_loss": 0.67297423, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69427967, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9302, + "time_per_iteration": 2.484609365463257 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.01249897, + "balance_loss_mlp": 1.03721702, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 2.803806869845176, + "language_loss": 0.70999819, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73134387, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9303, + "time_per_iteration": 2.442859649658203 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.01791978, + "balance_loss_mlp": 1.03930676, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.540239070281283, + "language_loss": 0.72772652, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74906886, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 9304, + "time_per_iteration": 5.429321765899658 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01613426, + "balance_loss_mlp": 1.03523278, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.8535856395803625, + "language_loss": 0.77888674, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80021197, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9305, + "time_per_iteration": 3.8705790042877197 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.01390815, + "balance_loss_mlp": 1.03657615, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.7045399129758072, + "language_loss": 0.69334519, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7146163, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 9306, + "time_per_iteration": 2.4669442176818848 + }, + { + "auxiliary_loss_clip": 0.01030152, + "auxiliary_loss_mlp": 0.01003605, + "balance_loss_clip": 1.0025028, + "balance_loss_mlp": 1.00838459, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9104128938879268, + "language_loss": 0.60324359, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62358117, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9307, + "time_per_iteration": 3.167161703109741 + }, + { + "auxiliary_loss_clip": 0.01105033, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01972127, + "balance_loss_mlp": 1.03697395, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.9188877301503315, + "language_loss": 0.73981357, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76117194, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 9308, + "time_per_iteration": 2.544931650161743 + }, + { + "auxiliary_loss_clip": 0.01107282, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01833069, + "balance_loss_mlp": 1.03571653, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.8987333438245737, + "language_loss": 0.69393057, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71531588, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9309, + "time_per_iteration": 2.5008022785186768 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.01787376, + "balance_loss_mlp": 1.03872681, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 2.0715816525821458, + "language_loss": 0.75254035, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77394807, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.703125, + "step": 9310, + "time_per_iteration": 2.5096590518951416 + }, + { + "auxiliary_loss_clip": 0.01111521, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01953197, + "balance_loss_mlp": 1.03922331, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 4.006602699764322, + "language_loss": 0.69449794, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71593851, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9311, + "time_per_iteration": 2.5238418579101562 + }, + { + "auxiliary_loss_clip": 0.01103209, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01616526, + "balance_loss_mlp": 1.03474474, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8631623558730779, + "language_loss": 0.72497612, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74628901, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9312, + "time_per_iteration": 2.4980969429016113 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01871908, + "balance_loss_mlp": 1.03834271, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.9916809517025356, + "language_loss": 0.89106059, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91243219, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9313, + "time_per_iteration": 2.43849515914917 + }, + { + "auxiliary_loss_clip": 0.01107396, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.03886163, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.848557040479868, + "language_loss": 0.77809632, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79951894, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 9314, + "time_per_iteration": 2.4745004177093506 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01027591, + "balance_loss_clip": 1.0153954, + "balance_loss_mlp": 1.03497362, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.6135281246099127, + "language_loss": 0.7005592, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72187185, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9315, + "time_per_iteration": 2.523815631866455 + }, + { + "auxiliary_loss_clip": 0.0110827, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.02225423, + "balance_loss_mlp": 1.03666615, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 2.163442884097896, + "language_loss": 0.66467899, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68611002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9316, + "time_per_iteration": 2.530667304992676 + }, + { + "auxiliary_loss_clip": 0.01106878, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.01796496, + "balance_loss_mlp": 1.03770351, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.3805446029838624, + "language_loss": 0.86762506, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88900781, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69140625, + "step": 9317, + "time_per_iteration": 2.469134569168091 + }, + { + "auxiliary_loss_clip": 0.01108894, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03657329, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7151693589962669, + "language_loss": 0.77363193, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79507434, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9318, + "time_per_iteration": 2.4952752590179443 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02351773, + "balance_loss_mlp": 1.03302336, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.698102214619228, + "language_loss": 0.75956237, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.7809301, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9319, + "time_per_iteration": 2.479919910430908 + }, + { + "auxiliary_loss_clip": 0.01104648, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03689611, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.554434910389292, + "language_loss": 0.85508537, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9320, + "time_per_iteration": 2.511880874633789 + }, + { + "auxiliary_loss_clip": 0.01030962, + "auxiliary_loss_mlp": 0.00999706, + "balance_loss_clip": 0.99860352, + "balance_loss_mlp": 1.00918674, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7458732992694707, + "language_loss": 0.52630556, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54661226, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9321, + "time_per_iteration": 2.8576598167419434 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.01938033, + "balance_loss_mlp": 1.03744757, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.4865751697326912, + "language_loss": 0.74422431, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76557928, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9322, + "time_per_iteration": 2.480198383331299 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01982379, + "balance_loss_mlp": 1.03641856, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.8343710411867171, + "language_loss": 0.73661906, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.75800848, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9323, + "time_per_iteration": 2.5517938137054443 + }, + { + "auxiliary_loss_clip": 0.01109096, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.01386333, + "balance_loss_mlp": 1.03797293, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.5108510359489868, + "language_loss": 0.61287946, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63423753, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9324, + "time_per_iteration": 2.4675137996673584 + }, + { + "auxiliary_loss_clip": 0.01106981, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.01650345, + "balance_loss_mlp": 1.03693414, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 2.2169286979326768, + "language_loss": 0.87785721, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89921808, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9325, + "time_per_iteration": 2.4160819053649902 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01805806, + "balance_loss_mlp": 1.03765607, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6383695475184654, + "language_loss": 0.74048722, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76188105, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9326, + "time_per_iteration": 2.463094711303711 + }, + { + "auxiliary_loss_clip": 0.01112046, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.01620328, + "balance_loss_mlp": 1.0386548, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 3.3443611641012674, + "language_loss": 0.78365433, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80506855, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9327, + "time_per_iteration": 2.445756673812866 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.01842213, + "balance_loss_mlp": 1.03914046, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 2.5559440694427478, + "language_loss": 0.78508025, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80648255, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 9328, + "time_per_iteration": 2.5156970024108887 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01594675, + "balance_loss_mlp": 1.03623605, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.528557811702872, + "language_loss": 0.73765361, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.7589978, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9329, + "time_per_iteration": 2.4843335151672363 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.02325511, + "balance_loss_mlp": 1.03798938, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.6466003553704387, + "language_loss": 0.83545572, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85692906, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9330, + "time_per_iteration": 2.482752561569214 + }, + { + "auxiliary_loss_clip": 0.01031116, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.00147378, + "balance_loss_mlp": 1.0092634, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7161961657295335, + "language_loss": 0.57873559, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59907156, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21875, + "step": 9331, + "time_per_iteration": 3.063901662826538 + }, + { + "auxiliary_loss_clip": 0.01108686, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03850377, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.694841283599879, + "language_loss": 0.82141155, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84282017, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9332, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01113328, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02233052, + "balance_loss_mlp": 1.03915834, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7394490434662164, + "language_loss": 0.8172127, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83871055, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9333, + "time_per_iteration": 2.4251558780670166 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.01541877, + "balance_loss_mlp": 1.03641915, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.5456564302164297, + "language_loss": 0.73111224, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.7524507, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9334, + "time_per_iteration": 2.5241355895996094 + }, + { + "auxiliary_loss_clip": 0.01108924, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.02525675, + "balance_loss_mlp": 1.03886223, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7664531017043277, + "language_loss": 0.71317977, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73464751, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9335, + "time_per_iteration": 2.4215545654296875 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01659274, + "balance_loss_mlp": 1.0381881, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7059405915097856, + "language_loss": 0.76673937, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78812212, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9336, + "time_per_iteration": 2.456911087036133 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.0203793, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.253598480453168, + "language_loss": 0.644315, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66570294, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9337, + "time_per_iteration": 2.4435572624206543 + }, + { + "auxiliary_loss_clip": 0.01029918, + "auxiliary_loss_mlp": 0.01004848, + "balance_loss_clip": 1.00367343, + "balance_loss_mlp": 1.00804543, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9905116764848269, + "language_loss": 0.62572861, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64607626, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21875, + "step": 9338, + "time_per_iteration": 3.039401054382324 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01862049, + "balance_loss_mlp": 1.03832674, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.7660421922814409, + "language_loss": 0.65246809, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67388076, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9339, + "time_per_iteration": 2.5356857776641846 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.0203191, + "balance_loss_mlp": 1.03761101, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 3.5768294087083317, + "language_loss": 0.69863123, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72002614, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9340, + "time_per_iteration": 2.4699902534484863 + }, + { + "auxiliary_loss_clip": 0.01105107, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01596177, + "balance_loss_mlp": 1.03900409, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.8075752300654697, + "language_loss": 0.77621818, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.7975471, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9341, + "time_per_iteration": 2.456268072128296 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01759195, + "balance_loss_mlp": 1.03572893, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.9728763199974049, + "language_loss": 0.79315615, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81452906, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9342, + "time_per_iteration": 2.4534597396850586 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01735401, + "balance_loss_mlp": 1.03851485, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.593835689079262, + "language_loss": 0.76322573, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78464609, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9343, + "time_per_iteration": 3.8814024925231934 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.0245204, + "balance_loss_mlp": 1.03978682, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.5945215839270617, + "language_loss": 0.68185151, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70332778, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9344, + "time_per_iteration": 2.4659440517425537 + }, + { + "auxiliary_loss_clip": 0.01109593, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.02364254, + "balance_loss_mlp": 1.0381155, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.2863999357797202, + "language_loss": 0.66754413, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68900704, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9345, + "time_per_iteration": 2.5232093334198 + }, + { + "auxiliary_loss_clip": 0.01109525, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01317143, + "balance_loss_mlp": 1.03883803, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.8616054032141576, + "language_loss": 0.87347126, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89481902, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.70703125, + "step": 9346, + "time_per_iteration": 3.9651877880096436 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.02200019, + "balance_loss_mlp": 1.03657687, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.36966351637476, + "language_loss": 0.59370089, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61511469, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9347, + "time_per_iteration": 3.9802420139312744 + }, + { + "auxiliary_loss_clip": 0.01108812, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.02319539, + "balance_loss_mlp": 1.03742838, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.4273405009541107, + "language_loss": 0.68972194, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71118426, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7109375, + "step": 9348, + "time_per_iteration": 2.4413368701934814 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.01292634, + "balance_loss_mlp": 1.03695107, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9093659081457641, + "language_loss": 0.79040921, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81179428, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9349, + "time_per_iteration": 2.4354894161224365 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01527977, + "balance_loss_mlp": 1.03902698, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.4504118343525207, + "language_loss": 0.67282045, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69420421, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9350, + "time_per_iteration": 2.548351287841797 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02408016, + "balance_loss_mlp": 1.03830576, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.1113714103165884, + "language_loss": 0.78716242, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80863774, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9351, + "time_per_iteration": 2.4350974559783936 + }, + { + "auxiliary_loss_clip": 0.01112089, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01989186, + "balance_loss_mlp": 1.03818786, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.498970106789848, + "language_loss": 0.58875829, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.6102035, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 9352, + "time_per_iteration": 2.4637343883514404 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01988828, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2149782460758531, + "language_loss": 0.71828997, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73964, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 9353, + "time_per_iteration": 2.4747259616851807 + }, + { + "auxiliary_loss_clip": 0.01110024, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01584542, + "balance_loss_mlp": 1.03763878, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 5.092816610198626, + "language_loss": 0.75717902, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77856535, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9354, + "time_per_iteration": 2.412938356399536 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.02066851, + "balance_loss_mlp": 1.03783214, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 2.4650169046981434, + "language_loss": 0.72549778, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74694556, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 9355, + "time_per_iteration": 2.484099864959717 + }, + { + "auxiliary_loss_clip": 0.01108801, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.01700521, + "balance_loss_mlp": 1.03818929, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.8617046290731056, + "language_loss": 0.73371327, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75510186, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9356, + "time_per_iteration": 2.465129852294922 + }, + { + "auxiliary_loss_clip": 0.0110695, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.02280545, + "balance_loss_mlp": 1.03822494, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.9991704999969526, + "language_loss": 0.82985485, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85127592, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9357, + "time_per_iteration": 2.41115665435791 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797318, + "balance_loss_mlp": 1.03697777, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 1.9946457873090748, + "language_loss": 0.720213, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9358, + "time_per_iteration": 2.4276978969573975 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.03677905, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.1174896987661755, + "language_loss": 0.77650487, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79795527, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9359, + "time_per_iteration": 2.5595555305480957 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.0177722, + "balance_loss_mlp": 1.03723145, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.6788321894876823, + "language_loss": 0.70193481, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.7233184, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9360, + "time_per_iteration": 2.485053062438965 + }, + { + "auxiliary_loss_clip": 0.01030911, + "auxiliary_loss_mlp": 0.01001933, + "balance_loss_clip": 1.0007472, + "balance_loss_mlp": 1.00916827, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7762895856423075, + "language_loss": 0.55579072, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57611912, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21679688, + "step": 9361, + "time_per_iteration": 3.025913953781128 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.02153039, + "balance_loss_mlp": 1.03833425, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.3888397041491727, + "language_loss": 0.8183462, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83973688, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9362, + "time_per_iteration": 2.5037269592285156 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02248108, + "balance_loss_mlp": 1.03707612, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5280416781125297, + "language_loss": 0.74536633, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.7667737, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9363, + "time_per_iteration": 2.617192268371582 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.01944757, + "balance_loss_mlp": 1.03815794, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.6569550766143035, + "language_loss": 0.83350259, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85492432, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9364, + "time_per_iteration": 2.5304059982299805 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.0222224, + "balance_loss_mlp": 1.03869832, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.211298310091642, + "language_loss": 0.64659059, + "learning_rate": 1.689881739637642e-06, + "loss": 0.66800475, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9365, + "time_per_iteration": 2.4514007568359375 + }, + { + "auxiliary_loss_clip": 0.01114055, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.03817499, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 3.047674915648226, + "language_loss": 0.81461316, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83611768, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 9366, + "time_per_iteration": 2.4486207962036133 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.03850698, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4263654905382444, + "language_loss": 0.73047578, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75187254, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 9367, + "time_per_iteration": 2.4800310134887695 + }, + { + "auxiliary_loss_clip": 0.01030227, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.00840044, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6249011108272925, + "language_loss": 0.5348472, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55517572, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21875, + "step": 9368, + "time_per_iteration": 3.1797282695770264 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02317202, + "balance_loss_mlp": 1.03974152, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.7643271699947485, + "language_loss": 0.69015235, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71159542, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9369, + "time_per_iteration": 2.4736390113830566 + }, + { + "auxiliary_loss_clip": 0.01105862, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.03527367, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.7859826045223857, + "language_loss": 0.7540313, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77542865, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9370, + "time_per_iteration": 2.5553858280181885 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.01930332, + "balance_loss_mlp": 1.0373863, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 3.078957924920332, + "language_loss": 0.75699127, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77842218, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9371, + "time_per_iteration": 2.4327011108398438 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02386189, + "balance_loss_mlp": 1.03729022, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 2.3308389897051702, + "language_loss": 0.76292467, + "learning_rate": 1.687188770067285e-06, + "loss": 0.7843473, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9372, + "time_per_iteration": 2.447720766067505 + }, + { + "auxiliary_loss_clip": 0.01106021, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.02006888, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.0572116747420224, + "language_loss": 0.72010261, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.74148726, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9373, + "time_per_iteration": 2.4268109798431396 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01839924, + "balance_loss_mlp": 1.03994441, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3770492627250617, + "language_loss": 0.82499874, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84642255, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9374, + "time_per_iteration": 2.49582576751709 + }, + { + "auxiliary_loss_clip": 0.0110343, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.0145762, + "balance_loss_mlp": 1.03463507, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.5156995265370945, + "language_loss": 0.66020733, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68151033, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9375, + "time_per_iteration": 2.516523599624634 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.02434063, + "balance_loss_mlp": 1.03792977, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 5.168267369431286, + "language_loss": 0.80860347, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83006191, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 9376, + "time_per_iteration": 2.4961087703704834 + }, + { + "auxiliary_loss_clip": 0.01110113, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03650188, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.331404975713729, + "language_loss": 0.69354665, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71498632, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9377, + "time_per_iteration": 2.6732125282287598 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01828349, + "balance_loss_mlp": 1.03818166, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3430474289029712, + "language_loss": 0.74622703, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76757109, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9378, + "time_per_iteration": 2.4836812019348145 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03538918, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.4002466182561366, + "language_loss": 0.81976169, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84122968, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9379, + "time_per_iteration": 2.4185829162597656 + }, + { + "auxiliary_loss_clip": 0.01106862, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.01691699, + "balance_loss_mlp": 1.03549135, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.697413775835763, + "language_loss": 0.71534967, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73671436, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9380, + "time_per_iteration": 2.5077950954437256 + }, + { + "auxiliary_loss_clip": 0.01110271, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.02068686, + "balance_loss_mlp": 1.03794408, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 3.2105212283898905, + "language_loss": 0.74216485, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.7636112, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9381, + "time_per_iteration": 2.4029319286346436 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01963091, + "balance_loss_mlp": 1.03806376, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 3.316310717009383, + "language_loss": 0.72300208, + "learning_rate": 1.683342680176499e-06, + "loss": 0.7444247, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 9382, + "time_per_iteration": 2.501958131790161 + }, + { + "auxiliary_loss_clip": 0.01028829, + "auxiliary_loss_mlp": 0.00999503, + "balance_loss_clip": 0.99848998, + "balance_loss_mlp": 1.00756264, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7363360341332579, + "language_loss": 0.54461426, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5648976, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21289062, + "step": 9383, + "time_per_iteration": 3.2148938179016113 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01627028, + "balance_loss_mlp": 1.03699017, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.8140556963544339, + "language_loss": 0.71018171, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73159087, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 9384, + "time_per_iteration": 2.442484140396118 + }, + { + "auxiliary_loss_clip": 0.0110745, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.01751578, + "balance_loss_mlp": 1.03652072, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 7.95557819766849, + "language_loss": 0.76225626, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78363794, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9385, + "time_per_iteration": 3.928744316101074 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02080739, + "balance_loss_mlp": 1.0359602, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.157193633028955, + "language_loss": 0.82184142, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84322798, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9386, + "time_per_iteration": 2.397623062133789 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.0220114, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.006582014999343, + "language_loss": 0.6989364, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72041589, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 9387, + "time_per_iteration": 5.281404733657837 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.02143192, + "balance_loss_mlp": 1.03790522, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.551891117692425, + "language_loss": 0.74553275, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76697552, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9388, + "time_per_iteration": 4.091272830963135 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.01970327, + "balance_loss_mlp": 1.03551602, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.6063296237871756, + "language_loss": 0.82072294, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.8420645, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 9389, + "time_per_iteration": 2.4588046073913574 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.01995528, + "balance_loss_mlp": 1.03775918, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 1.8781979731175902, + "language_loss": 0.64145517, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66289902, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9390, + "time_per_iteration": 2.4152185916900635 + }, + { + "auxiliary_loss_clip": 0.01108689, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.01928711, + "balance_loss_mlp": 1.0396266, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.6485981004433565, + "language_loss": 0.91899133, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94038832, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9391, + "time_per_iteration": 2.4316937923431396 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.03941607, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 1.8545056387285421, + "language_loss": 0.60528994, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62679285, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9392, + "time_per_iteration": 2.524616003036499 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.01412547, + "balance_loss_mlp": 1.03683674, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 1.8891326454378248, + "language_loss": 0.81002814, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83136976, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 9393, + "time_per_iteration": 2.5394442081451416 + }, + { + "auxiliary_loss_clip": 0.01109875, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.0175252, + "balance_loss_mlp": 1.03945863, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6361233529041357, + "language_loss": 0.87129962, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89269751, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9394, + "time_per_iteration": 2.4735207557678223 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.04019666, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 2.1407868955990232, + "language_loss": 0.84850395, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.8699013, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 9395, + "time_per_iteration": 2.457840919494629 + }, + { + "auxiliary_loss_clip": 0.01029319, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 0.99876004, + "balance_loss_mlp": 1.00789344, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.857023745969297, + "language_loss": 0.58308172, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60337436, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21484375, + "step": 9396, + "time_per_iteration": 3.073537588119507 + }, + { + "auxiliary_loss_clip": 0.01110535, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.248812637940723, + "language_loss": 0.70105237, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72246206, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.72265625, + "step": 9397, + "time_per_iteration": 2.4962973594665527 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02008653, + "balance_loss_mlp": 1.03723562, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.751232513493423, + "language_loss": 0.66376907, + "learning_rate": 1.67719144001275e-06, + "loss": 0.68519312, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9398, + "time_per_iteration": 2.4747612476348877 + }, + { + "auxiliary_loss_clip": 0.01027927, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 0.99962217, + "balance_loss_mlp": 1.00642622, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.8050196413226386, + "language_loss": 0.58135325, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60164046, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9399, + "time_per_iteration": 3.043860912322998 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.01959336, + "balance_loss_mlp": 1.03663015, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8022721102148394, + "language_loss": 0.72654182, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.74797827, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 9400, + "time_per_iteration": 2.46345853805542 + }, + { + "auxiliary_loss_clip": 0.01112209, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02223074, + "balance_loss_mlp": 1.03858781, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.2275961694321254, + "language_loss": 0.61034292, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63182896, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 9401, + "time_per_iteration": 2.4518327713012695 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.01590967, + "balance_loss_mlp": 1.03578329, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8211208041554372, + "language_loss": 0.81334603, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.8346827, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9402, + "time_per_iteration": 2.4201457500457764 + }, + { + "auxiliary_loss_clip": 0.0110456, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.02154684, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.4814077209882908, + "language_loss": 0.77969164, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80106944, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9403, + "time_per_iteration": 2.5353829860687256 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01558244, + "balance_loss_mlp": 1.03666544, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.6092170779922605, + "language_loss": 0.68699729, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70834613, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9404, + "time_per_iteration": 2.4321181774139404 + }, + { + "auxiliary_loss_clip": 0.01102774, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01951897, + "balance_loss_mlp": 1.03503776, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.484491546437136, + "language_loss": 0.66842878, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.68976498, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 9405, + "time_per_iteration": 2.440232992172241 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.02001476, + "balance_loss_mlp": 1.03823268, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.9824391842040467, + "language_loss": 0.74238181, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76374286, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9406, + "time_per_iteration": 2.4748172760009766 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02006197, + "balance_loss_mlp": 1.03640151, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.7875183280919196, + "language_loss": 0.79345733, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81487745, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9407, + "time_per_iteration": 2.507815361022949 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.0192194, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.520930632215419, + "language_loss": 0.70626116, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.7276209, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 9408, + "time_per_iteration": 2.62674880027771 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.02018738, + "balance_loss_mlp": 1.03758848, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.0177540820880377, + "language_loss": 0.81701803, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83840877, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9409, + "time_per_iteration": 2.4532053470611572 + }, + { + "auxiliary_loss_clip": 0.01105936, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01446199, + "balance_loss_mlp": 1.03632855, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.7583452820695855, + "language_loss": 0.77886415, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80018914, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9410, + "time_per_iteration": 2.441938877105713 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.02188444, + "balance_loss_mlp": 1.0371294, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.4716186369957405, + "language_loss": 0.83512276, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85653877, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9411, + "time_per_iteration": 2.4718945026397705 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01870787, + "balance_loss_mlp": 1.03809881, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.235812012909735, + "language_loss": 0.67052126, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69195151, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9412, + "time_per_iteration": 2.4114651679992676 + }, + { + "auxiliary_loss_clip": 0.01102875, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03637409, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.4642683426161254, + "language_loss": 0.58723432, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60854244, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6640625, + "step": 9413, + "time_per_iteration": 2.5274460315704346 + }, + { + "auxiliary_loss_clip": 0.01102994, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03515315, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.4689493119012975, + "language_loss": 0.69065028, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71196759, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9414, + "time_per_iteration": 2.4249722957611084 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.0162462, + "balance_loss_mlp": 1.03464198, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 2.330719071721026, + "language_loss": 0.78351963, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80479658, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 9415, + "time_per_iteration": 2.4853508472442627 + }, + { + "auxiliary_loss_clip": 0.01027693, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.00167274, + "balance_loss_mlp": 1.00642896, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.686572948711127, + "language_loss": 0.49232727, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51263154, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.21289062, + "step": 9416, + "time_per_iteration": 3.1817550659179688 + }, + { + "auxiliary_loss_clip": 0.01106414, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.02036452, + "balance_loss_mlp": 1.03713977, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.6874553076405654, + "language_loss": 0.62577593, + "learning_rate": 1.6698909172706e-06, + "loss": 0.6471678, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9417, + "time_per_iteration": 2.5856666564941406 + }, + { + "auxiliary_loss_clip": 0.01107822, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03606224, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.797784660701456, + "language_loss": 0.68931323, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71070051, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9418, + "time_per_iteration": 2.4920060634613037 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.01873779, + "balance_loss_mlp": 1.035465, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.9782803688051387, + "language_loss": 0.64613676, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66751719, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9419, + "time_per_iteration": 2.5130629539489746 + }, + { + "auxiliary_loss_clip": 0.01028877, + "auxiliary_loss_mlp": 0.01005663, + "balance_loss_clip": 1.00455463, + "balance_loss_mlp": 1.00721812, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7373486000439856, + "language_loss": 0.59778821, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61813354, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.21679688, + "step": 9420, + "time_per_iteration": 3.1712303161621094 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.01874661, + "balance_loss_mlp": 1.03477347, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.7745364781392496, + "language_loss": 0.74103463, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76235008, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 9421, + "time_per_iteration": 2.4926223754882812 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.02292371, + "balance_loss_mlp": 1.03705812, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.8540803425049197, + "language_loss": 0.72345394, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74490201, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9422, + "time_per_iteration": 2.4081509113311768 + }, + { + "auxiliary_loss_clip": 0.01103997, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.02378821, + "balance_loss_mlp": 1.03694618, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.7305682094853587, + "language_loss": 0.81321973, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83460754, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 9423, + "time_per_iteration": 2.4871041774749756 + }, + { + "auxiliary_loss_clip": 0.01102932, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.0210824, + "balance_loss_mlp": 1.0354147, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.656660590859511, + "language_loss": 0.8069616, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82832569, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 9424, + "time_per_iteration": 2.4634275436401367 + }, + { + "auxiliary_loss_clip": 0.01111676, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.01844072, + "balance_loss_mlp": 1.03887486, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.8161233698436283, + "language_loss": 0.78745866, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80889738, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9425, + "time_per_iteration": 2.5064780712127686 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.01658988, + "balance_loss_mlp": 1.03674626, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.8642193992685885, + "language_loss": 0.5897873, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61113673, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 9426, + "time_per_iteration": 2.4720263481140137 + }, + { + "auxiliary_loss_clip": 0.01110856, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.01804113, + "balance_loss_mlp": 1.03823078, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 2.0557394177022768, + "language_loss": 0.81685758, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83826721, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 9427, + "time_per_iteration": 3.872758388519287 + }, + { + "auxiliary_loss_clip": 0.01104828, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02394485, + "balance_loss_mlp": 1.03744185, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.8776390907485432, + "language_loss": 0.86198628, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88339949, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9428, + "time_per_iteration": 2.4911303520202637 + }, + { + "auxiliary_loss_clip": 0.01112998, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.02427602, + "balance_loss_mlp": 1.04080331, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.1518083513194552, + "language_loss": 0.74125421, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.7627511, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9429, + "time_per_iteration": 3.9635231494903564 + }, + { + "auxiliary_loss_clip": 0.01109434, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.02127612, + "balance_loss_mlp": 1.03756118, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.7976574461964, + "language_loss": 0.7496838, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77112365, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9430, + "time_per_iteration": 3.8817877769470215 + }, + { + "auxiliary_loss_clip": 0.01106735, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02152157, + "balance_loss_mlp": 1.03621042, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 2.3751678803775285, + "language_loss": 0.7272107, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74861568, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9431, + "time_per_iteration": 2.51401948928833 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01810944, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.9291254540879526, + "language_loss": 0.73248518, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75378448, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 9432, + "time_per_iteration": 2.4319839477539062 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.02087343, + "balance_loss_mlp": 1.03681755, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.5888571716641233, + "language_loss": 0.77957594, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80097634, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 9433, + "time_per_iteration": 2.5169765949249268 + }, + { + "auxiliary_loss_clip": 0.01111851, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01645875, + "balance_loss_mlp": 1.03870261, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.7704673621088174, + "language_loss": 0.63839334, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65982234, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 9434, + "time_per_iteration": 2.4372098445892334 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.01495695, + "balance_loss_mlp": 1.03529072, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.879777953851778, + "language_loss": 0.66724491, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68854052, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9435, + "time_per_iteration": 2.5156021118164062 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.02028716, + "balance_loss_mlp": 1.03599691, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.3893571871291595, + "language_loss": 0.71398699, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73535293, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9436, + "time_per_iteration": 2.4815714359283447 + }, + { + "auxiliary_loss_clip": 0.01109121, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.01723647, + "balance_loss_mlp": 1.03756368, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.6654091498260946, + "language_loss": 0.73988926, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76127845, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9437, + "time_per_iteration": 2.5965943336486816 + }, + { + "auxiliary_loss_clip": 0.01112439, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04159832, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 2.439390833366172, + "language_loss": 0.60905057, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63049889, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9438, + "time_per_iteration": 2.512578010559082 + }, + { + "auxiliary_loss_clip": 0.01105416, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.01714146, + "balance_loss_mlp": 1.03543329, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.6600048607148805, + "language_loss": 0.75087392, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77221704, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9439, + "time_per_iteration": 2.531489133834839 + }, + { + "auxiliary_loss_clip": 0.01107772, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0159471, + "balance_loss_mlp": 1.03828883, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8930047517001285, + "language_loss": 0.8361944, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.857566, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 9440, + "time_per_iteration": 2.4386231899261475 + }, + { + "auxiliary_loss_clip": 0.01111147, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.02123022, + "balance_loss_mlp": 1.03704751, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.0023123091206467, + "language_loss": 0.7550447, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77650005, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9441, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.01108262, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02022874, + "balance_loss_mlp": 1.0381217, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 2.003106565766755, + "language_loss": 0.83199525, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85340512, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9442, + "time_per_iteration": 2.4066359996795654 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.0388906, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 2.099488848818881, + "language_loss": 0.74606907, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76741344, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 9443, + "time_per_iteration": 2.4699020385742188 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.02184737, + "balance_loss_mlp": 1.03892851, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 1.9353911334921245, + "language_loss": 0.77443373, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79587436, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9444, + "time_per_iteration": 2.418164014816284 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.02498603, + "balance_loss_mlp": 1.03886068, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6369546772732781, + "language_loss": 0.80673003, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82821453, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9445, + "time_per_iteration": 2.4474682807922363 + }, + { + "auxiliary_loss_clip": 0.01105393, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.0146122, + "balance_loss_mlp": 1.03579414, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.310891415120181, + "language_loss": 0.70843911, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72976023, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9446, + "time_per_iteration": 2.5338428020477295 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.018489, + "balance_loss_mlp": 1.03815663, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.8305308972685952, + "language_loss": 0.7354359, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75686181, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9447, + "time_per_iteration": 2.5152740478515625 + }, + { + "auxiliary_loss_clip": 0.01110587, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.0382061, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 2.262443693729548, + "language_loss": 0.74931812, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77074468, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9448, + "time_per_iteration": 2.468688488006592 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.0227201, + "balance_loss_mlp": 1.04175234, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.1518179799978356, + "language_loss": 0.76137841, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78289551, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9449, + "time_per_iteration": 2.510693311691284 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02046347, + "balance_loss_mlp": 1.03867984, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.6592475910366993, + "language_loss": 0.74742198, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76886022, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9450, + "time_per_iteration": 2.5034866333007812 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02404845, + "balance_loss_mlp": 1.04081213, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 3.8340234675809017, + "language_loss": 0.67216206, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69364059, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.71875, + "step": 9451, + "time_per_iteration": 2.503805637359619 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.01743114, + "balance_loss_mlp": 1.03788531, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 1.8009184427821863, + "language_loss": 0.71697223, + "learning_rate": 1.656454488573026e-06, + "loss": 0.7384392, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 9452, + "time_per_iteration": 2.4519643783569336 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01947021, + "balance_loss_mlp": 1.03679395, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.6525298490216664, + "language_loss": 0.70272237, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72409141, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9453, + "time_per_iteration": 2.5260796546936035 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.02190745, + "balance_loss_mlp": 1.03889799, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 2.2860746429720833, + "language_loss": 0.69546616, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71690989, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9454, + "time_per_iteration": 2.457736015319824 + }, + { + "auxiliary_loss_clip": 0.01103936, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.01837158, + "balance_loss_mlp": 1.03616297, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.8998375571155763, + "language_loss": 0.60430771, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.6256448, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9455, + "time_per_iteration": 2.506091594696045 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01808953, + "balance_loss_mlp": 1.0424788, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 2.102932497256003, + "language_loss": 0.72914851, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75062263, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9456, + "time_per_iteration": 2.439221143722534 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.02125049, + "balance_loss_mlp": 1.03915823, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.5692423529190727, + "language_loss": 0.76402628, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78543633, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.69140625, + "step": 9457, + "time_per_iteration": 2.475327491760254 + }, + { + "auxiliary_loss_clip": 0.01110625, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02338171, + "balance_loss_mlp": 1.03828931, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.8808926225586853, + "language_loss": 0.66305089, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68452305, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9458, + "time_per_iteration": 2.5271642208099365 + }, + { + "auxiliary_loss_clip": 0.01111416, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01424456, + "balance_loss_mlp": 1.03845215, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.21557799175144, + "language_loss": 0.67912495, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70051199, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 9459, + "time_per_iteration": 2.534374237060547 + }, + { + "auxiliary_loss_clip": 0.0111268, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.01942194, + "balance_loss_mlp": 1.04046702, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 3.4353012744759335, + "language_loss": 0.77999187, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.8014406, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9460, + "time_per_iteration": 2.434570789337158 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.02221131, + "balance_loss_mlp": 1.03767824, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7026913094631195, + "language_loss": 0.71950358, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74095166, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9461, + "time_per_iteration": 2.5527231693267822 + }, + { + "auxiliary_loss_clip": 0.01109281, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.03814745, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.8717094069028617, + "language_loss": 0.72976351, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75116074, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9462, + "time_per_iteration": 2.422624111175537 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01590848, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8933127595424433, + "language_loss": 0.7326529, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75395983, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 9463, + "time_per_iteration": 2.466491460800171 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.01810765, + "balance_loss_mlp": 1.03583968, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.7491308846328846, + "language_loss": 0.74368691, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76505989, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9464, + "time_per_iteration": 2.406031370162964 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.02382255, + "balance_loss_mlp": 1.03892159, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.714079864723851, + "language_loss": 0.84333247, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86480176, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9465, + "time_per_iteration": 2.514777183532715 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.01757169, + "balance_loss_mlp": 1.03546405, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.8589721720108319, + "language_loss": 0.7226572, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74398845, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9466, + "time_per_iteration": 2.475188732147217 + }, + { + "auxiliary_loss_clip": 0.01029497, + "auxiliary_loss_mlp": 0.01005385, + "balance_loss_clip": 1.00427043, + "balance_loss_mlp": 1.0077517, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7081654133828948, + "language_loss": 0.55354679, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57389557, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21777344, + "step": 9467, + "time_per_iteration": 3.185729742050171 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01861966, + "balance_loss_mlp": 1.03861189, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.2495356407271854, + "language_loss": 0.63680357, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65824717, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9468, + "time_per_iteration": 2.4373323917388916 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01587772, + "balance_loss_mlp": 1.03801632, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.8525378978069993, + "language_loss": 0.79367, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81503832, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9469, + "time_per_iteration": 3.8166728019714355 + }, + { + "auxiliary_loss_clip": 0.0111246, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.02810884, + "balance_loss_mlp": 1.03860152, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 1.9923541987272968, + "language_loss": 0.69606256, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71760333, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9470, + "time_per_iteration": 2.4572556018829346 + }, + { + "auxiliary_loss_clip": 0.01107845, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01882184, + "balance_loss_mlp": 1.03729832, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.5518202279497855, + "language_loss": 0.74791551, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76931024, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9471, + "time_per_iteration": 3.926091432571411 + }, + { + "auxiliary_loss_clip": 0.01108882, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02006447, + "balance_loss_mlp": 1.03928542, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9616270612820847, + "language_loss": 0.57270539, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59412026, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9472, + "time_per_iteration": 3.8452813625335693 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01886702, + "balance_loss_mlp": 1.03722382, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.803122156723958, + "language_loss": 0.73615265, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75750041, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 9473, + "time_per_iteration": 2.4637346267700195 + }, + { + "auxiliary_loss_clip": 0.01028797, + "auxiliary_loss_mlp": 0.01004803, + "balance_loss_clip": 1.00359905, + "balance_loss_mlp": 1.00722575, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6476817486149063, + "language_loss": 0.57596511, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59630114, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21679688, + "step": 9474, + "time_per_iteration": 3.09342622756958 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02095199, + "balance_loss_mlp": 1.03955841, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.7127367690076127, + "language_loss": 0.53624213, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55767071, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6875, + "step": 9475, + "time_per_iteration": 2.6103556156158447 + }, + { + "auxiliary_loss_clip": 0.01110194, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.01969719, + "balance_loss_mlp": 1.03914022, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5220537573313933, + "language_loss": 0.79891974, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82034773, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9476, + "time_per_iteration": 2.5519871711730957 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.02248454, + "balance_loss_mlp": 1.04071283, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 2.93922823935367, + "language_loss": 0.66361278, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68509227, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7265625, + "step": 9477, + "time_per_iteration": 2.556461811065674 + }, + { + "auxiliary_loss_clip": 0.01107946, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.01737881, + "balance_loss_mlp": 1.03697014, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.8188873629652118, + "language_loss": 0.70921832, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73060012, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9478, + "time_per_iteration": 2.5022385120391846 + }, + { + "auxiliary_loss_clip": 0.01104521, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.01615286, + "balance_loss_mlp": 1.03824937, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.5933810632151244, + "language_loss": 0.69647413, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71779716, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9479, + "time_per_iteration": 2.544422149658203 + }, + { + "auxiliary_loss_clip": 0.01103959, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01729965, + "balance_loss_mlp": 1.03753138, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4338626650619826, + "language_loss": 0.71364439, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.7349726, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9480, + "time_per_iteration": 2.5680878162384033 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.01615977, + "balance_loss_mlp": 1.03689599, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.894404055389402, + "language_loss": 0.71927261, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74063098, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9481, + "time_per_iteration": 2.4576737880706787 + }, + { + "auxiliary_loss_clip": 0.01108109, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.03819919, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6819252466037764, + "language_loss": 0.78134334, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80273211, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9482, + "time_per_iteration": 2.4244532585144043 + }, + { + "auxiliary_loss_clip": 0.01107032, + "auxiliary_loss_mlp": 0.01026772, + "balance_loss_clip": 1.01462436, + "balance_loss_mlp": 1.0372206, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.1918431398286686, + "language_loss": 0.77641654, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79775453, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9483, + "time_per_iteration": 2.4840755462646484 + }, + { + "auxiliary_loss_clip": 0.01106594, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.0217644, + "balance_loss_mlp": 1.037377, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 2.4281256207615702, + "language_loss": 0.8098467, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.8312493, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9484, + "time_per_iteration": 2.4726784229278564 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01798165, + "balance_loss_mlp": 1.03656316, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 9.175896769478262, + "language_loss": 0.60516417, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62655002, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9485, + "time_per_iteration": 2.5423014163970947 + }, + { + "auxiliary_loss_clip": 0.01107039, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03619039, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.6367482229195742, + "language_loss": 0.65350515, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67491084, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9486, + "time_per_iteration": 2.4597506523132324 + }, + { + "auxiliary_loss_clip": 0.01028731, + "auxiliary_loss_mlp": 0.01001408, + "balance_loss_clip": 1.00001299, + "balance_loss_mlp": 1.0072422, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6639559744347447, + "language_loss": 0.48005819, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50035954, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21484375, + "step": 9487, + "time_per_iteration": 3.139495849609375 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.02199435, + "balance_loss_mlp": 1.03726935, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 3.049670437576873, + "language_loss": 0.86058694, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88200867, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9488, + "time_per_iteration": 2.474616289138794 + }, + { + "auxiliary_loss_clip": 0.01111409, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.02065694, + "balance_loss_mlp": 1.03814459, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.4447763000600118, + "language_loss": 0.79057854, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81202483, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 9489, + "time_per_iteration": 2.5065059661865234 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01978409, + "balance_loss_mlp": 1.03869939, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7186115243718623, + "language_loss": 0.69906354, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72046351, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.703125, + "step": 9490, + "time_per_iteration": 2.431102752685547 + }, + { + "auxiliary_loss_clip": 0.01107746, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03836775, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.5472180668734579, + "language_loss": 0.76222062, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78358686, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9491, + "time_per_iteration": 2.4962759017944336 + }, + { + "auxiliary_loss_clip": 0.01028502, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.00276494, + "balance_loss_mlp": 1.00699997, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7944597612251223, + "language_loss": 0.57379556, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59412122, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.21484375, + "step": 9492, + "time_per_iteration": 3.0417838096618652 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02009118, + "balance_loss_mlp": 1.03813028, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.7217254573804663, + "language_loss": 0.71475661, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73617041, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 9493, + "time_per_iteration": 2.4304161071777344 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01807487, + "balance_loss_mlp": 1.0384568, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5364295350921338, + "language_loss": 0.77778745, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.7992059, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 9494, + "time_per_iteration": 2.495940923690796 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.02015769, + "balance_loss_mlp": 1.03685784, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.275602748234112, + "language_loss": 0.80153453, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82298625, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 9495, + "time_per_iteration": 2.464423418045044 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.02393782, + "balance_loss_mlp": 1.04061937, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 3.463558707959815, + "language_loss": 0.66745138, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68901181, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 9496, + "time_per_iteration": 2.460413694381714 + }, + { + "auxiliary_loss_clip": 0.01113845, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.02361047, + "balance_loss_mlp": 1.03911281, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.3847499053839067, + "language_loss": 0.6960094, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71752012, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 9497, + "time_per_iteration": 2.4051928520202637 + }, + { + "auxiliary_loss_clip": 0.01111626, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01712704, + "balance_loss_mlp": 1.03815341, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.8796088723274103, + "language_loss": 0.81200778, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83343083, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 9498, + "time_per_iteration": 2.4764246940612793 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02085817, + "balance_loss_mlp": 1.03874803, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.7968018947144153, + "language_loss": 0.66237068, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68383479, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9499, + "time_per_iteration": 2.4842209815979004 + }, + { + "auxiliary_loss_clip": 0.01109681, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01994455, + "balance_loss_mlp": 1.03881264, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 2.341189176641991, + "language_loss": 0.71659786, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73802078, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9500, + "time_per_iteration": 2.474968671798706 + }, + { + "auxiliary_loss_clip": 0.01113264, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.01911259, + "balance_loss_mlp": 1.03896177, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.7510176581013566, + "language_loss": 0.76148939, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78294659, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 9501, + "time_per_iteration": 2.481982707977295 + }, + { + "auxiliary_loss_clip": 0.01112022, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.03827071, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.6683693962706503, + "language_loss": 0.75252867, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.7739566, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.73828125, + "step": 9502, + "time_per_iteration": 2.4645891189575195 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.01413548, + "balance_loss_mlp": 1.03776038, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 3.8399261830524076, + "language_loss": 0.82397389, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84531981, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9503, + "time_per_iteration": 2.4945871829986572 + }, + { + "auxiliary_loss_clip": 0.01108893, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.02267456, + "balance_loss_mlp": 1.03824139, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.09557851646671, + "language_loss": 0.85872537, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.8801657, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9504, + "time_per_iteration": 2.3861567974090576 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01232708, + "balance_loss_mlp": 1.03753018, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.9315555303189194, + "language_loss": 0.75182885, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.7731415, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9505, + "time_per_iteration": 2.462536573410034 + }, + { + "auxiliary_loss_clip": 0.01108197, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.01970994, + "balance_loss_mlp": 1.03717351, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.6115496885789637, + "language_loss": 0.81918782, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84058261, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.7109375, + "step": 9506, + "time_per_iteration": 2.467022180557251 + }, + { + "auxiliary_loss_clip": 0.01106598, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01546264, + "balance_loss_mlp": 1.03684521, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6660041805363315, + "language_loss": 0.77144134, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79278708, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9507, + "time_per_iteration": 2.4672694206237793 + }, + { + "auxiliary_loss_clip": 0.01110344, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.02186108, + "balance_loss_mlp": 1.03726792, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 2.45367934924197, + "language_loss": 0.68435538, + "learning_rate": 1.63498965540751e-06, + "loss": 0.7058183, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9508, + "time_per_iteration": 2.464097261428833 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01629877, + "balance_loss_mlp": 1.03722239, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.0052906721639836, + "language_loss": 0.79419613, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81559134, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 9509, + "time_per_iteration": 2.504023551940918 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.03664279, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.839099502620817, + "language_loss": 0.7265448, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74793911, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9510, + "time_per_iteration": 3.815577507019043 + }, + { + "auxiliary_loss_clip": 0.01107423, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01810002, + "balance_loss_mlp": 1.03668678, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.3819155223826083, + "language_loss": 0.69395494, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71533018, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9511, + "time_per_iteration": 2.5445902347564697 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02426028, + "balance_loss_mlp": 1.03781009, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.8672218842214499, + "language_loss": 0.61565816, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63710779, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9512, + "time_per_iteration": 3.8341665267944336 + }, + { + "auxiliary_loss_clip": 0.01106641, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.0164628, + "balance_loss_mlp": 1.03667331, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 4.170405845803043, + "language_loss": 0.7586627, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78001529, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9513, + "time_per_iteration": 5.299789667129517 + }, + { + "auxiliary_loss_clip": 0.01028017, + "auxiliary_loss_mlp": 0.00999308, + "balance_loss_clip": 0.99809855, + "balance_loss_mlp": 1.00645494, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8876641821203675, + "language_loss": 0.6684342, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68870747, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21582031, + "step": 9514, + "time_per_iteration": 3.0201942920684814 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.0104014, + "balance_loss_clip": 1.02696776, + "balance_loss_mlp": 1.04034257, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.046774799271973, + "language_loss": 0.81059563, + "learning_rate": 1.63230955093099e-06, + "loss": 0.8321448, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 9515, + "time_per_iteration": 2.440838575363159 + }, + { + "auxiliary_loss_clip": 0.01104804, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.01469994, + "balance_loss_mlp": 1.03602076, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.8601231206296425, + "language_loss": 0.86125237, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88257068, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9516, + "time_per_iteration": 2.477764368057251 + }, + { + "auxiliary_loss_clip": 0.01104974, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01654696, + "balance_loss_mlp": 1.03561044, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.8026555789133811, + "language_loss": 0.87531322, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89665627, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9517, + "time_per_iteration": 2.425889253616333 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01804841, + "balance_loss_mlp": 1.03662252, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.765867586501473, + "language_loss": 0.8479656, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.86934435, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9518, + "time_per_iteration": 2.515908718109131 + }, + { + "auxiliary_loss_clip": 0.01102718, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01909649, + "balance_loss_mlp": 1.03518391, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.8620909672026127, + "language_loss": 0.7880826, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80942279, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 9519, + "time_per_iteration": 2.400693893432617 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.03658307, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.5438950427184228, + "language_loss": 0.82970679, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85106778, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9520, + "time_per_iteration": 2.5011074542999268 + }, + { + "auxiliary_loss_clip": 0.01110791, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.0268625, + "balance_loss_mlp": 1.03927732, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.123220131944119, + "language_loss": 0.71853209, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74003959, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9521, + "time_per_iteration": 2.4156429767608643 + }, + { + "auxiliary_loss_clip": 0.01106899, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.0362848, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.6921576366095024, + "language_loss": 0.77830148, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.79970586, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9522, + "time_per_iteration": 2.5682153701782227 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.02081728, + "balance_loss_mlp": 1.03628266, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.565759699688635, + "language_loss": 0.71671265, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73805845, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9523, + "time_per_iteration": 2.402622938156128 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01511574, + "balance_loss_mlp": 1.03639328, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.6537237547017787, + "language_loss": 0.70046443, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72177982, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9524, + "time_per_iteration": 2.478745698928833 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.01944458, + "balance_loss_mlp": 1.03783476, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.431879051430598, + "language_loss": 0.65079439, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67217362, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9525, + "time_per_iteration": 2.5722320079803467 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.01825702, + "balance_loss_mlp": 1.03385937, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.7621674355193322, + "language_loss": 0.72353703, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74485326, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 9526, + "time_per_iteration": 2.5182504653930664 + }, + { + "auxiliary_loss_clip": 0.01102827, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03617597, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6243804380597333, + "language_loss": 0.80131519, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.8226589, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9527, + "time_per_iteration": 2.556168556213379 + }, + { + "auxiliary_loss_clip": 0.01104789, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.02399302, + "balance_loss_mlp": 1.03633451, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.8731920412295517, + "language_loss": 0.71818352, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.7395997, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 9528, + "time_per_iteration": 2.502045154571533 + }, + { + "auxiliary_loss_clip": 0.01107269, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02113199, + "balance_loss_mlp": 1.03742957, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.9532280974694858, + "language_loss": 0.853854, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.87526155, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9529, + "time_per_iteration": 2.468146324157715 + }, + { + "auxiliary_loss_clip": 0.01028852, + "auxiliary_loss_mlp": 0.0100185, + "balance_loss_clip": 1.0006398, + "balance_loss_mlp": 1.00712085, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7632636876236247, + "language_loss": 0.56091511, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58122212, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21777344, + "step": 9530, + "time_per_iteration": 2.955796003341675 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.01241684, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.605800582107851, + "language_loss": 0.66667211, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68801141, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.71875, + "step": 9531, + "time_per_iteration": 2.4874041080474854 + }, + { + "auxiliary_loss_clip": 0.01107074, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.0189929, + "balance_loss_mlp": 1.0362972, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.577990064326961, + "language_loss": 0.75677073, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77815616, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9532, + "time_per_iteration": 2.653745651245117 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01779997, + "balance_loss_mlp": 1.03636467, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 3.4857041080787696, + "language_loss": 0.78726482, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80862474, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9533, + "time_per_iteration": 2.5444183349609375 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02156746, + "balance_loss_mlp": 1.03771889, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 2.5155449858561036, + "language_loss": 0.8564285, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87781423, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9534, + "time_per_iteration": 2.611769199371338 + }, + { + "auxiliary_loss_clip": 0.01108602, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02241731, + "balance_loss_mlp": 1.03833961, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.7913378128419626, + "language_loss": 0.74880809, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.7702536, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9535, + "time_per_iteration": 2.5294063091278076 + }, + { + "auxiliary_loss_clip": 0.01113223, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.01927602, + "balance_loss_mlp": 1.04021287, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.60935564318513, + "language_loss": 0.70712042, + "learning_rate": 1.624273356614346e-06, + "loss": 0.72857165, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73046875, + "step": 9536, + "time_per_iteration": 2.5115044116973877 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.01741457, + "balance_loss_mlp": 1.03604972, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.9605571924010112, + "language_loss": 0.69843078, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71977001, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 9537, + "time_per_iteration": 2.485203266143799 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.02179384, + "balance_loss_mlp": 1.03693986, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.9885156073739136, + "language_loss": 0.6257112, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64711761, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9538, + "time_per_iteration": 2.5242531299591064 + }, + { + "auxiliary_loss_clip": 0.01106895, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02131939, + "balance_loss_mlp": 1.03750122, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.847251631174476, + "language_loss": 0.83067656, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85208571, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9539, + "time_per_iteration": 2.4557297229766846 + }, + { + "auxiliary_loss_clip": 0.01108422, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02155805, + "balance_loss_mlp": 1.03672779, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 1.9303873756935568, + "language_loss": 0.73266071, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75408518, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9540, + "time_per_iteration": 2.449195384979248 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.01486361, + "balance_loss_mlp": 1.03386962, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.7719156274309316, + "language_loss": 0.80036277, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82164454, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 9541, + "time_per_iteration": 2.4807605743408203 + }, + { + "auxiliary_loss_clip": 0.01109647, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02247286, + "balance_loss_mlp": 1.03748846, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.3537030152809817, + "language_loss": 0.64358872, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66503674, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9542, + "time_per_iteration": 2.417178153991699 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01681268, + "balance_loss_mlp": 1.03586972, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.222303069950764, + "language_loss": 0.82983625, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.85118151, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9543, + "time_per_iteration": 2.4162886142730713 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01729405, + "balance_loss_mlp": 1.0378089, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.297441344794182, + "language_loss": 0.73850191, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75992632, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9544, + "time_per_iteration": 2.4531123638153076 + }, + { + "auxiliary_loss_clip": 0.01110237, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.01687646, + "balance_loss_mlp": 1.03741252, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.106910148542404, + "language_loss": 0.75869375, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78009301, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 9545, + "time_per_iteration": 2.446340799331665 + }, + { + "auxiliary_loss_clip": 0.01109663, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02086437, + "balance_loss_mlp": 1.03903508, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.6841481616941998, + "language_loss": 0.56267381, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58410275, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9546, + "time_per_iteration": 2.5431458950042725 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02027345, + "balance_loss_mlp": 1.0375458, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.2354008467729236, + "language_loss": 0.76396316, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78538299, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9547, + "time_per_iteration": 2.399355173110962 + }, + { + "auxiliary_loss_clip": 0.01108464, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01847458, + "balance_loss_mlp": 1.03692102, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 3.5736288481687457, + "language_loss": 0.74030554, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76169997, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9548, + "time_per_iteration": 2.438188314437866 + }, + { + "auxiliary_loss_clip": 0.01107619, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.02162778, + "balance_loss_mlp": 1.03630018, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.070673757769185, + "language_loss": 0.6898725, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71128839, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 9549, + "time_per_iteration": 2.4443182945251465 + }, + { + "auxiliary_loss_clip": 0.01109324, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.01756024, + "balance_loss_mlp": 1.0398941, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.5143454441571018, + "language_loss": 0.79360747, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81499219, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 9550, + "time_per_iteration": 2.570117473602295 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01844966, + "balance_loss_mlp": 1.03862011, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 1.8121895379081407, + "language_loss": 0.67906272, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70047116, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71484375, + "step": 9551, + "time_per_iteration": 2.423403024673462 + }, + { + "auxiliary_loss_clip": 0.01109924, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.02370107, + "balance_loss_mlp": 1.03743887, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 1.628701607162486, + "language_loss": 0.71362531, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.73508722, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9552, + "time_per_iteration": 3.886622428894043 + }, + { + "auxiliary_loss_clip": 0.01109635, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.01972914, + "balance_loss_mlp": 1.03975332, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.7228318188262413, + "language_loss": 0.79922652, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82063985, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9553, + "time_per_iteration": 2.431051731109619 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01900589, + "balance_loss_mlp": 1.03611398, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.015136287210995, + "language_loss": 0.83396381, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85536349, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.73046875, + "step": 9554, + "time_per_iteration": 3.823064088821411 + }, + { + "auxiliary_loss_clip": 0.0111382, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.02347982, + "balance_loss_mlp": 1.04021072, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4846822756962552, + "language_loss": 0.70777845, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72928381, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9555, + "time_per_iteration": 5.333508491516113 + }, + { + "auxiliary_loss_clip": 0.01109263, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03861225, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.115239569910986, + "language_loss": 0.72206348, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7434299, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9556, + "time_per_iteration": 2.4479689598083496 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02174449, + "balance_loss_mlp": 1.03852749, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.5580789907924004, + "language_loss": 0.73779786, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.75923818, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9557, + "time_per_iteration": 2.53330397605896 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.02114749, + "balance_loss_mlp": 1.03805625, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.551535187819687, + "language_loss": 0.67825913, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69967735, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9558, + "time_per_iteration": 2.4345078468322754 + }, + { + "auxiliary_loss_clip": 0.01115654, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.02509618, + "balance_loss_mlp": 1.03993464, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.018077791857229, + "language_loss": 0.71494532, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73649883, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 9559, + "time_per_iteration": 2.4112660884857178 + }, + { + "auxiliary_loss_clip": 0.01109449, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01856709, + "balance_loss_mlp": 1.03951454, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.8277860809166269, + "language_loss": 0.79002881, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81141782, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.69921875, + "step": 9560, + "time_per_iteration": 2.461737871170044 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.01649547, + "balance_loss_mlp": 1.03796887, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.312922307701609, + "language_loss": 0.64114952, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66254199, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9561, + "time_per_iteration": 2.4589121341705322 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.02403021, + "balance_loss_mlp": 1.04126084, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.619271715020599, + "language_loss": 0.71404445, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73558629, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 9562, + "time_per_iteration": 2.4472360610961914 + }, + { + "auxiliary_loss_clip": 0.01108014, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.0256269, + "balance_loss_mlp": 1.03870499, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.47664891140277, + "language_loss": 0.84212148, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86357129, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9563, + "time_per_iteration": 2.448540449142456 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.02251637, + "balance_loss_mlp": 1.03915787, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 2.1518785584706266, + "language_loss": 0.57469738, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59618628, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9564, + "time_per_iteration": 2.455137252807617 + }, + { + "auxiliary_loss_clip": 0.01104038, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.01645529, + "balance_loss_mlp": 1.03663075, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.7205024550895016, + "language_loss": 0.75828826, + "learning_rate": 1.613186112465078e-06, + "loss": 0.7796101, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9565, + "time_per_iteration": 2.4293572902679443 + }, + { + "auxiliary_loss_clip": 0.01030195, + "auxiliary_loss_mlp": 0.01000571, + "balance_loss_clip": 0.9991762, + "balance_loss_mlp": 1.00864065, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7426631899706556, + "language_loss": 0.60724127, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62754893, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21582031, + "step": 9566, + "time_per_iteration": 3.156651496887207 + }, + { + "auxiliary_loss_clip": 0.01109259, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.02127385, + "balance_loss_mlp": 1.03952003, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.8230299531471923, + "language_loss": 0.7537874, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77521175, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9567, + "time_per_iteration": 2.414881706237793 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01692748, + "balance_loss_mlp": 1.03808224, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.5717614086198337, + "language_loss": 0.74559051, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76697284, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9568, + "time_per_iteration": 2.458827495574951 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.0394876, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.7630953099139652, + "language_loss": 0.70951653, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73092568, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 9569, + "time_per_iteration": 2.4545505046844482 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.02746797, + "balance_loss_mlp": 1.04058015, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 1.9393871177420576, + "language_loss": 0.55699342, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57852268, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9570, + "time_per_iteration": 2.478793144226074 + }, + { + "auxiliary_loss_clip": 0.01106131, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.02648425, + "balance_loss_mlp": 1.03744042, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6217673569741213, + "language_loss": 0.64154774, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.6629895, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9571, + "time_per_iteration": 2.4446957111358643 + }, + { + "auxiliary_loss_clip": 0.01108191, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.02213967, + "balance_loss_mlp": 1.03693449, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5404037339802243, + "language_loss": 0.67144608, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69287848, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9572, + "time_per_iteration": 2.739871025085449 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.01920414, + "balance_loss_mlp": 1.03968024, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 2.3042557910685897, + "language_loss": 0.72336781, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74477673, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9573, + "time_per_iteration": 2.446484088897705 + }, + { + "auxiliary_loss_clip": 0.01105342, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01808691, + "balance_loss_mlp": 1.03999066, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.9447567655956284, + "language_loss": 0.76657987, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78792316, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 9574, + "time_per_iteration": 2.620338201522827 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01712155, + "balance_loss_mlp": 1.03815711, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.450005891087765, + "language_loss": 0.66523874, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.6866771, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 9575, + "time_per_iteration": 2.4487204551696777 + }, + { + "auxiliary_loss_clip": 0.01106224, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02086747, + "balance_loss_mlp": 1.03883016, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5135571903226765, + "language_loss": 0.79637057, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81775701, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 9576, + "time_per_iteration": 2.499525547027588 + }, + { + "auxiliary_loss_clip": 0.01106499, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.0181545, + "balance_loss_mlp": 1.038414, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.624550594516776, + "language_loss": 0.69612324, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71748459, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 9577, + "time_per_iteration": 2.4342739582061768 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.02076983, + "balance_loss_mlp": 1.03889465, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.7262479676640925, + "language_loss": 0.66394711, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68538755, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7265625, + "step": 9578, + "time_per_iteration": 2.452836513519287 + }, + { + "auxiliary_loss_clip": 0.01104785, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.01713598, + "balance_loss_mlp": 1.03683639, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.5955641210398863, + "language_loss": 0.72130096, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74263626, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9579, + "time_per_iteration": 2.4709668159484863 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01880956, + "balance_loss_mlp": 1.03966045, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.099656741464949, + "language_loss": 0.64655066, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66800898, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9580, + "time_per_iteration": 2.5071680545806885 + }, + { + "auxiliary_loss_clip": 0.01108728, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02024293, + "balance_loss_mlp": 1.03776896, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.9172914104456789, + "language_loss": 0.8563143, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87773246, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9581, + "time_per_iteration": 2.459761142730713 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.02792597, + "balance_loss_mlp": 1.04308629, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.0860755056974627, + "language_loss": 0.67691463, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69849521, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9582, + "time_per_iteration": 2.461245536804199 + }, + { + "auxiliary_loss_clip": 0.01030428, + "auxiliary_loss_mlp": 0.01000716, + "balance_loss_clip": 0.99950552, + "balance_loss_mlp": 1.00893497, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6389163922736963, + "language_loss": 0.57233906, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59265041, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21484375, + "step": 9583, + "time_per_iteration": 3.212454080581665 + }, + { + "auxiliary_loss_clip": 0.01108245, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01740384, + "balance_loss_mlp": 1.0381434, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.8641226876424317, + "language_loss": 0.82294947, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84432399, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 9584, + "time_per_iteration": 2.445197582244873 + }, + { + "auxiliary_loss_clip": 0.0102928, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 0.99972469, + "balance_loss_mlp": 1.00788319, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6211358186522926, + "language_loss": 0.49536344, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51566589, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9585, + "time_per_iteration": 3.1135380268096924 + }, + { + "auxiliary_loss_clip": 0.01103387, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.01523662, + "balance_loss_mlp": 1.0356468, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 2.0469276219055037, + "language_loss": 0.84745687, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86875856, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9586, + "time_per_iteration": 2.4322049617767334 + }, + { + "auxiliary_loss_clip": 0.01107042, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.02099502, + "balance_loss_mlp": 1.0371176, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.708349469848261, + "language_loss": 0.79935288, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82076108, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9587, + "time_per_iteration": 2.420388698577881 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.01772523, + "balance_loss_mlp": 1.03791797, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.476870264659234, + "language_loss": 0.65978181, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68115664, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9588, + "time_per_iteration": 2.470181941986084 + }, + { + "auxiliary_loss_clip": 0.0110785, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.02095485, + "balance_loss_mlp": 1.03747165, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.7939970430826904, + "language_loss": 0.78344554, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80486423, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9589, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01023642, + "balance_loss_clip": 1.01255536, + "balance_loss_mlp": 1.03513849, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.899286870644745, + "language_loss": 0.79484087, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81609809, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 9590, + "time_per_iteration": 2.4738223552703857 + }, + { + "auxiliary_loss_clip": 0.01107337, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.01693606, + "balance_loss_mlp": 1.03926742, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.6468651932641252, + "language_loss": 0.63016611, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65151715, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9591, + "time_per_iteration": 2.4630722999572754 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02349293, + "balance_loss_mlp": 1.03998208, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6611744555405081, + "language_loss": 0.77684325, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.7982983, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9592, + "time_per_iteration": 2.4990251064300537 + }, + { + "auxiliary_loss_clip": 0.01029258, + "auxiliary_loss_mlp": 0.01002299, + "balance_loss_clip": 1.00114298, + "balance_loss_mlp": 1.00790858, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7302836874791289, + "language_loss": 0.59611464, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61643022, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21386719, + "step": 9593, + "time_per_iteration": 3.1885087490081787 + }, + { + "auxiliary_loss_clip": 0.01110729, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.0265801, + "balance_loss_mlp": 1.03883052, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 2.3535875138052806, + "language_loss": 0.7131753, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73468006, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9594, + "time_per_iteration": 3.89677357673645 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.02087677, + "balance_loss_mlp": 1.03755784, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.9084853230861274, + "language_loss": 0.71146429, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73285961, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.69921875, + "step": 9595, + "time_per_iteration": 2.438798666000366 + }, + { + "auxiliary_loss_clip": 0.01106901, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.01726389, + "balance_loss_mlp": 1.03756046, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.7843520689138646, + "language_loss": 0.69750065, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71886092, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9596, + "time_per_iteration": 3.8589518070220947 + }, + { + "auxiliary_loss_clip": 0.01111865, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.02235997, + "balance_loss_mlp": 1.03845882, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.3208716765708974, + "language_loss": 0.67437601, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69585705, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 9597, + "time_per_iteration": 4.025861501693726 + }, + { + "auxiliary_loss_clip": 0.01107063, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.02220011, + "balance_loss_mlp": 1.03775668, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 2.263151487781109, + "language_loss": 0.81492549, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83633393, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 9598, + "time_per_iteration": 2.4457364082336426 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01787877, + "balance_loss_mlp": 1.03758776, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.482456402920166, + "language_loss": 0.72767603, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74906087, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9599, + "time_per_iteration": 2.440633773803711 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.0178082, + "balance_loss_mlp": 1.03569376, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 1.8193310631715605, + "language_loss": 0.77990794, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80122316, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9600, + "time_per_iteration": 2.4627256393432617 + }, + { + "auxiliary_loss_clip": 0.01108817, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.0210135, + "balance_loss_mlp": 1.03849137, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5552976085447456, + "language_loss": 0.72505343, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74647534, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9601, + "time_per_iteration": 2.5040857791900635 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.02177262, + "balance_loss_mlp": 1.03861833, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.6061208919603027, + "language_loss": 0.68449026, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7058996, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 9602, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03666043, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4427131087039327, + "language_loss": 0.72969544, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75102556, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9603, + "time_per_iteration": 2.4821383953094482 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01787269, + "balance_loss_mlp": 1.03815305, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.760798848795816, + "language_loss": 0.76811421, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78947246, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9604, + "time_per_iteration": 2.4963274002075195 + }, + { + "auxiliary_loss_clip": 0.01108714, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.01986611, + "balance_loss_mlp": 1.03805828, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.8255502953236893, + "language_loss": 0.83589303, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85731399, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9605, + "time_per_iteration": 2.420722484588623 + }, + { + "auxiliary_loss_clip": 0.01115788, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.01826096, + "balance_loss_mlp": 1.041394, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.6448412923605056, + "language_loss": 0.78043878, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80192173, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 9606, + "time_per_iteration": 2.47755765914917 + }, + { + "auxiliary_loss_clip": 0.01105815, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.02192283, + "balance_loss_mlp": 1.03780627, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.6466821062116115, + "language_loss": 0.74067813, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76207221, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9607, + "time_per_iteration": 2.473158597946167 + }, + { + "auxiliary_loss_clip": 0.01107935, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01862538, + "balance_loss_mlp": 1.03809416, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.6703318324983303, + "language_loss": 0.69666326, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71804941, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9608, + "time_per_iteration": 2.457597255706787 + }, + { + "auxiliary_loss_clip": 0.01109603, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01841307, + "balance_loss_mlp": 1.03859639, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.7239529426914375, + "language_loss": 0.76340568, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78481352, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9609, + "time_per_iteration": 2.478379964828491 + }, + { + "auxiliary_loss_clip": 0.01106636, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01672637, + "balance_loss_mlp": 1.03600001, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.8185868001057917, + "language_loss": 0.77262604, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79397655, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.70703125, + "step": 9610, + "time_per_iteration": 2.4817564487457275 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01526141, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.0354514470011327, + "language_loss": 0.68514067, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70650387, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9611, + "time_per_iteration": 2.401411771774292 + }, + { + "auxiliary_loss_clip": 0.01104847, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01431727, + "balance_loss_mlp": 1.03594267, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.8201815228945446, + "language_loss": 0.82796168, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.84927702, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9612, + "time_per_iteration": 2.4473085403442383 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01915455, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.6350469107350603, + "language_loss": 0.79244345, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81382918, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9613, + "time_per_iteration": 2.427710771560669 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03584552, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.8237036529741348, + "language_loss": 0.77103758, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79242271, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 9614, + "time_per_iteration": 2.44856595993042 + }, + { + "auxiliary_loss_clip": 0.01108473, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01954651, + "balance_loss_mlp": 1.03704453, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.4290592896418093, + "language_loss": 0.8083241, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.829723, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71484375, + "step": 9615, + "time_per_iteration": 2.387230396270752 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01979423, + "balance_loss_mlp": 1.0372864, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.467111790124014, + "language_loss": 0.67172909, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69313097, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9616, + "time_per_iteration": 2.5091681480407715 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.01983774, + "balance_loss_mlp": 1.03701568, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.7373937933185963, + "language_loss": 0.77820861, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79956603, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9617, + "time_per_iteration": 2.434692144393921 + }, + { + "auxiliary_loss_clip": 0.01105528, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01911426, + "balance_loss_mlp": 1.03798401, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.4913926039582375, + "language_loss": 0.75064909, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77201837, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9618, + "time_per_iteration": 2.5143377780914307 + }, + { + "auxiliary_loss_clip": 0.01103572, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01855421, + "balance_loss_mlp": 1.03614712, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5244275331123438, + "language_loss": 0.81895173, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84028757, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9619, + "time_per_iteration": 2.436741828918457 + }, + { + "auxiliary_loss_clip": 0.01106581, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.02319741, + "balance_loss_mlp": 1.03689742, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 2.8855702259785874, + "language_loss": 0.7266885, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.7481066, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9620, + "time_per_iteration": 2.508638858795166 + }, + { + "auxiliary_loss_clip": 0.01106937, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.01994872, + "balance_loss_mlp": 1.0379591, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.4901469929607327, + "language_loss": 0.77143538, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79282016, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9621, + "time_per_iteration": 2.4620673656463623 + }, + { + "auxiliary_loss_clip": 0.01106096, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01862931, + "balance_loss_mlp": 1.03550279, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.5105026325174375, + "language_loss": 0.70597667, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72735131, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9622, + "time_per_iteration": 2.509505033493042 + }, + { + "auxiliary_loss_clip": 0.01028849, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.00028539, + "balance_loss_mlp": 1.007653, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7726155153830789, + "language_loss": 0.55941814, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57972187, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21191406, + "step": 9623, + "time_per_iteration": 3.0823814868927 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.02344918, + "balance_loss_mlp": 1.03692317, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.2221143081246373, + "language_loss": 0.71056175, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73201978, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9624, + "time_per_iteration": 2.5265705585479736 + }, + { + "auxiliary_loss_clip": 0.01107503, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.02552414, + "balance_loss_mlp": 1.03862953, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.222167937534436, + "language_loss": 0.82642812, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84788311, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9625, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.02198672, + "balance_loss_mlp": 1.03701115, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.455235974234194, + "language_loss": 0.69956779, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72096288, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9626, + "time_per_iteration": 2.4975287914276123 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02480352, + "balance_loss_mlp": 1.03568482, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.93553238886208, + "language_loss": 0.71862161, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.7400226, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9627, + "time_per_iteration": 2.5138702392578125 + }, + { + "auxiliary_loss_clip": 0.01104177, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.01737726, + "balance_loss_mlp": 1.03599048, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.727007676436273, + "language_loss": 0.8414377, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86276901, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9628, + "time_per_iteration": 2.4851796627044678 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.01394033, + "balance_loss_mlp": 1.03516388, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.6873428245402236, + "language_loss": 0.71942705, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74071914, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9629, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01108734, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02181387, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 1.9628574132847711, + "language_loss": 0.74576336, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76719439, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9630, + "time_per_iteration": 2.454810380935669 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.01682508, + "balance_loss_mlp": 1.03553247, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.6371763310429226, + "language_loss": 0.79325604, + "learning_rate": 1.587999618060523e-06, + "loss": 0.814556, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9631, + "time_per_iteration": 2.440864324569702 + }, + { + "auxiliary_loss_clip": 0.01104911, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01800966, + "balance_loss_mlp": 1.03596497, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.6037309933130668, + "language_loss": 0.75137591, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77272546, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9632, + "time_per_iteration": 2.4771668910980225 + }, + { + "auxiliary_loss_clip": 0.01106006, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.01517081, + "balance_loss_mlp": 1.03731871, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 2.4626986888140716, + "language_loss": 0.79077435, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81211185, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9633, + "time_per_iteration": 2.448436737060547 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.02447283, + "balance_loss_mlp": 1.04036343, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7086543878642706, + "language_loss": 0.77430606, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79583752, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75390625, + "step": 9634, + "time_per_iteration": 2.4811017513275146 + }, + { + "auxiliary_loss_clip": 0.01108474, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.1301414361920843, + "language_loss": 0.63183784, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65330267, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9635, + "time_per_iteration": 3.8360743522644043 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.02431154, + "balance_loss_mlp": 1.03854156, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.5921207664968484, + "language_loss": 0.76923883, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79065627, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 9636, + "time_per_iteration": 2.4524970054626465 + }, + { + "auxiliary_loss_clip": 0.01101976, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.03643167, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.6428369167222547, + "language_loss": 0.68367255, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70501596, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 9637, + "time_per_iteration": 3.9001073837280273 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.0181725, + "balance_loss_mlp": 1.03622174, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.3860817889930326, + "language_loss": 0.72291076, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74429405, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9638, + "time_per_iteration": 3.8099658489227295 + }, + { + "auxiliary_loss_clip": 0.01106068, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.01981568, + "balance_loss_mlp": 1.03809261, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 2.0300843650533387, + "language_loss": 0.72111142, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.7424823, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9639, + "time_per_iteration": 3.9071426391601562 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02291572, + "balance_loss_mlp": 1.03860509, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.0103274032155163, + "language_loss": 0.69715077, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71857667, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9640, + "time_per_iteration": 2.433104991912842 + }, + { + "auxiliary_loss_clip": 0.01113005, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.02583635, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.7872404958031884, + "language_loss": 0.77623034, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79774475, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 9641, + "time_per_iteration": 2.4301722049713135 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01930058, + "balance_loss_mlp": 1.0378499, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8500908876117999, + "language_loss": 0.73673463, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75810528, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 9642, + "time_per_iteration": 2.49660325050354 + }, + { + "auxiliary_loss_clip": 0.01104964, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.01718903, + "balance_loss_mlp": 1.03625488, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.696347443177098, + "language_loss": 0.73574042, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75708383, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9643, + "time_per_iteration": 2.485637903213501 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.03768921, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.9990943096580656, + "language_loss": 0.67527819, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69667518, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9644, + "time_per_iteration": 2.487901449203491 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01664853, + "balance_loss_mlp": 1.03988528, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.232135453826953, + "language_loss": 0.85353506, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87495703, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9645, + "time_per_iteration": 2.4591071605682373 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.01782739, + "balance_loss_mlp": 1.03902066, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.87513340954769, + "language_loss": 0.7528075, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77419043, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9646, + "time_per_iteration": 2.5096170902252197 + }, + { + "auxiliary_loss_clip": 0.01111341, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02068663, + "balance_loss_mlp": 1.03949249, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.666102030467492, + "language_loss": 0.5938943, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61534685, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9647, + "time_per_iteration": 2.5928401947021484 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.0388217, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.5329184941218248, + "language_loss": 0.84261942, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86405849, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9648, + "time_per_iteration": 2.460245132446289 + }, + { + "auxiliary_loss_clip": 0.01027507, + "auxiliary_loss_mlp": 0.00998956, + "balance_loss_clip": 0.99765694, + "balance_loss_mlp": 1.00610447, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8404119708733213, + "language_loss": 0.62959844, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64986312, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9649, + "time_per_iteration": 3.1300153732299805 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.02015436, + "balance_loss_mlp": 1.03649998, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 2.3310983541006434, + "language_loss": 0.82039601, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84174502, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9650, + "time_per_iteration": 2.4216153621673584 + }, + { + "auxiliary_loss_clip": 0.0111056, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.02018833, + "balance_loss_mlp": 1.0376749, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.3176650701334442, + "language_loss": 0.77372313, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79515636, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9651, + "time_per_iteration": 2.4731314182281494 + }, + { + "auxiliary_loss_clip": 0.01110796, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.02309239, + "balance_loss_mlp": 1.03978133, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 2.0034024707617575, + "language_loss": 0.74143803, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76290905, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9652, + "time_per_iteration": 2.426095485687256 + }, + { + "auxiliary_loss_clip": 0.01106661, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.01887703, + "balance_loss_mlp": 1.03536129, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 1.9100146686462136, + "language_loss": 0.76669693, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78807956, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9653, + "time_per_iteration": 2.479843854904175 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01702428, + "balance_loss_mlp": 1.03746295, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.3557465918911578, + "language_loss": 0.74466497, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76604843, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.69921875, + "step": 9654, + "time_per_iteration": 2.4389872550964355 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01033574, + "balance_loss_clip": 1.02185535, + "balance_loss_mlp": 1.0386194, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.67229579578488, + "language_loss": 0.70335853, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72474813, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9655, + "time_per_iteration": 2.4667346477508545 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.025087, + "balance_loss_mlp": 1.03787553, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 3.1924669760277666, + "language_loss": 0.69441068, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71591568, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9656, + "time_per_iteration": 2.47267746925354 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.02040219, + "balance_loss_mlp": 1.0376507, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.8802574367017126, + "language_loss": 0.71315479, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73451304, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9657, + "time_per_iteration": 2.411862850189209 + }, + { + "auxiliary_loss_clip": 0.01110384, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.03748548, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.139189937245848, + "language_loss": 0.70763719, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72907501, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9658, + "time_per_iteration": 2.4618098735809326 + }, + { + "auxiliary_loss_clip": 0.01029117, + "auxiliary_loss_mlp": 0.01007613, + "balance_loss_clip": 1.00649261, + "balance_loss_mlp": 1.00762427, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6568503671216013, + "language_loss": 0.53557444, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.5559417, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21484375, + "step": 9659, + "time_per_iteration": 3.081292152404785 + }, + { + "auxiliary_loss_clip": 0.01109597, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02564979, + "balance_loss_mlp": 1.0386076, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.325531986819307, + "language_loss": 0.62134814, + "learning_rate": 1.576954100136366e-06, + "loss": 0.6428259, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 9660, + "time_per_iteration": 2.5101215839385986 + }, + { + "auxiliary_loss_clip": 0.01107552, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02121592, + "balance_loss_mlp": 1.03510964, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.644077336412447, + "language_loss": 0.65339613, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67481142, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9661, + "time_per_iteration": 2.495326042175293 + }, + { + "auxiliary_loss_clip": 0.01101624, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.01473665, + "balance_loss_mlp": 1.03630924, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.4453410326473544, + "language_loss": 0.74667752, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76795, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 9662, + "time_per_iteration": 2.4072024822235107 + }, + { + "auxiliary_loss_clip": 0.0102818, + "auxiliary_loss_mlp": 0.01003249, + "balance_loss_clip": 1.0020808, + "balance_loss_mlp": 1.00680053, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8844058515803096, + "language_loss": 0.58421201, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60452628, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9663, + "time_per_iteration": 3.128176689147949 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.02058566, + "balance_loss_mlp": 1.03855336, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2307426037080558, + "language_loss": 0.82198572, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84339249, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 9664, + "time_per_iteration": 2.4268438816070557 + }, + { + "auxiliary_loss_clip": 0.01106716, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.0119977, + "balance_loss_mlp": 1.03471017, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.6499573770914204, + "language_loss": 0.81283242, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.8341471, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9665, + "time_per_iteration": 2.539750337600708 + }, + { + "auxiliary_loss_clip": 0.01113083, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.0180558, + "balance_loss_mlp": 1.03968716, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.6493862237198238, + "language_loss": 0.81106472, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83251882, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 9666, + "time_per_iteration": 2.4637341499328613 + }, + { + "auxiliary_loss_clip": 0.01105376, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.02047861, + "balance_loss_mlp": 1.03734851, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.772076851837157, + "language_loss": 0.79902422, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82039976, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9667, + "time_per_iteration": 2.4630167484283447 + }, + { + "auxiliary_loss_clip": 0.01113135, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01935887, + "balance_loss_mlp": 1.03786182, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.5126376316707284, + "language_loss": 0.78524494, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80671084, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 9668, + "time_per_iteration": 2.4933431148529053 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.01976347, + "balance_loss_mlp": 1.03882718, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.2917193824708395, + "language_loss": 0.6405921, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66200924, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9669, + "time_per_iteration": 2.711413860321045 + }, + { + "auxiliary_loss_clip": 0.01109059, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.02185786, + "balance_loss_mlp": 1.03847837, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.7201818199144705, + "language_loss": 0.73401237, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75544822, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9670, + "time_per_iteration": 2.481351375579834 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02327895, + "balance_loss_mlp": 1.03665125, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.1547601144280693, + "language_loss": 0.79159272, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81303054, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 9671, + "time_per_iteration": 2.481765031814575 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02253985, + "balance_loss_mlp": 1.04052281, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.8667318330129747, + "language_loss": 0.60387075, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62536901, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9672, + "time_per_iteration": 2.4585747718811035 + }, + { + "auxiliary_loss_clip": 0.01106042, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.01597953, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.9986212138203583, + "language_loss": 0.81078732, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83212423, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9673, + "time_per_iteration": 2.4950785636901855 + }, + { + "auxiliary_loss_clip": 0.01113708, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.02148843, + "balance_loss_mlp": 1.03956604, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.7057299891387632, + "language_loss": 0.87750065, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.89898866, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 9674, + "time_per_iteration": 2.440136432647705 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02156651, + "balance_loss_mlp": 1.03789747, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.5021502044615473, + "language_loss": 0.78512001, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80653995, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.7109375, + "step": 9675, + "time_per_iteration": 2.474719285964966 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.0162822, + "balance_loss_mlp": 1.03912115, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.1292944862371486, + "language_loss": 0.70189106, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72326887, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9676, + "time_per_iteration": 2.435563325881958 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.01728702, + "balance_loss_mlp": 1.03810883, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.2453262518267216, + "language_loss": 0.63408953, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65548283, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9677, + "time_per_iteration": 3.852684736251831 + }, + { + "auxiliary_loss_clip": 0.01029913, + "auxiliary_loss_mlp": 0.01005476, + "balance_loss_clip": 1.00418842, + "balance_loss_mlp": 1.00840485, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8082693819649737, + "language_loss": 0.54284507, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56319892, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21484375, + "step": 9678, + "time_per_iteration": 3.1727702617645264 + }, + { + "auxiliary_loss_clip": 0.01029364, + "auxiliary_loss_mlp": 0.01, + "balance_loss_clip": 0.99879593, + "balance_loss_mlp": 1.00781882, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7323225743115229, + "language_loss": 0.56212348, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58241719, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21484375, + "step": 9679, + "time_per_iteration": 4.407592296600342 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01719165, + "balance_loss_mlp": 1.03636777, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.5677269140843855, + "language_loss": 0.65393043, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67527747, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9680, + "time_per_iteration": 3.854875087738037 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01755846, + "balance_loss_mlp": 1.0371331, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.7974099210270778, + "language_loss": 0.83398807, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85535139, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9681, + "time_per_iteration": 3.906952381134033 + }, + { + "auxiliary_loss_clip": 0.01107734, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01704884, + "balance_loss_mlp": 1.03765953, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.7009206287297167, + "language_loss": 0.75691867, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77829218, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9682, + "time_per_iteration": 2.4177029132843018 + }, + { + "auxiliary_loss_clip": 0.01109999, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.01696062, + "balance_loss_mlp": 1.03816795, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.1225270667604, + "language_loss": 0.75228214, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77367556, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9683, + "time_per_iteration": 2.483633279800415 + }, + { + "auxiliary_loss_clip": 0.01110877, + "auxiliary_loss_mlp": 0.01029498, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03809631, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7411447986789845, + "language_loss": 0.74026191, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76166564, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9684, + "time_per_iteration": 2.445389986038208 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.02327847, + "balance_loss_mlp": 1.03914332, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.480778861643935, + "language_loss": 0.77930081, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80076301, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9685, + "time_per_iteration": 2.4822564125061035 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.02525425, + "balance_loss_mlp": 1.0376507, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.6531366373498986, + "language_loss": 0.75214118, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77360046, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9686, + "time_per_iteration": 2.441162109375 + }, + { + "auxiliary_loss_clip": 0.0102947, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_clip": 1.00720644, + "balance_loss_mlp": 1.00800455, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8335448804232356, + "language_loss": 0.57427585, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59465551, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9687, + "time_per_iteration": 2.887495279312134 + }, + { + "auxiliary_loss_clip": 0.01106071, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01534319, + "balance_loss_mlp": 1.03597438, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.808127013520305, + "language_loss": 0.69851446, + "learning_rate": 1.566302259738727e-06, + "loss": 0.7198627, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 9688, + "time_per_iteration": 2.475397825241089 + }, + { + "auxiliary_loss_clip": 0.01108083, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.03770781, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.8185672100752224, + "language_loss": 0.65197223, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67338014, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9689, + "time_per_iteration": 2.461808204650879 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0186801, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.5648827403998262, + "language_loss": 0.73213816, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75352979, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 9690, + "time_per_iteration": 2.459392786026001 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01889586, + "balance_loss_mlp": 1.03849721, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.9110650477929338, + "language_loss": 0.76118016, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78261399, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9691, + "time_per_iteration": 2.454533338546753 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02115119, + "balance_loss_mlp": 1.03619492, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.7126808977143095, + "language_loss": 0.80746913, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82888305, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 9692, + "time_per_iteration": 2.501497268676758 + }, + { + "auxiliary_loss_clip": 0.01027994, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00103843, + "balance_loss_mlp": 1.0067246, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7602984909294345, + "language_loss": 0.56910902, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.5894115, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.21289062, + "step": 9693, + "time_per_iteration": 3.0237975120544434 + }, + { + "auxiliary_loss_clip": 0.01106474, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.02250707, + "balance_loss_mlp": 1.03660345, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 2.266427213008104, + "language_loss": 0.79537672, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81677842, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69921875, + "step": 9694, + "time_per_iteration": 2.4761908054351807 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.02250743, + "balance_loss_mlp": 1.03815889, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.3946621855299897, + "language_loss": 0.75905991, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.7804361, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9695, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.01028568, + "auxiliary_loss_mlp": 0.01000024, + "balance_loss_clip": 0.99865955, + "balance_loss_mlp": 1.00692177, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7688369043614423, + "language_loss": 0.54971713, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57000303, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.21679688, + "step": 9696, + "time_per_iteration": 3.1397409439086914 + }, + { + "auxiliary_loss_clip": 0.01108342, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.01588464, + "balance_loss_mlp": 1.03907263, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.461981122956424, + "language_loss": 0.7641257, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78548938, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9697, + "time_per_iteration": 2.4391984939575195 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.01940477, + "balance_loss_mlp": 1.03790259, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.5880971870479619, + "language_loss": 0.77744102, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79886687, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9698, + "time_per_iteration": 2.5576770305633545 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.02412117, + "balance_loss_mlp": 1.03847361, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.8122014087406897, + "language_loss": 0.83381891, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85527027, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9699, + "time_per_iteration": 2.5637032985687256 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.01663673, + "balance_loss_mlp": 1.03769052, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.315377539273772, + "language_loss": 0.66859722, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68997276, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 9700, + "time_per_iteration": 2.471012592315674 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.0199858, + "balance_loss_mlp": 1.03591251, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5256356872175616, + "language_loss": 0.713889, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73527479, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9701, + "time_per_iteration": 2.4697649478912354 + }, + { + "auxiliary_loss_clip": 0.01104917, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01984477, + "balance_loss_mlp": 1.03625238, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.810379708827147, + "language_loss": 0.85387969, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87524706, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9702, + "time_per_iteration": 2.481027841567993 + }, + { + "auxiliary_loss_clip": 0.01100783, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.02206278, + "balance_loss_mlp": 1.0346241, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4628982512923412, + "language_loss": 0.77776694, + "learning_rate": 1.560601200301392e-06, + "loss": 0.79911131, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9703, + "time_per_iteration": 2.435124397277832 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01736951, + "balance_loss_mlp": 1.03907001, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.7159930715569567, + "language_loss": 0.71405482, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73546076, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9704, + "time_per_iteration": 2.4737584590911865 + }, + { + "auxiliary_loss_clip": 0.01107118, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02001369, + "balance_loss_mlp": 1.03844225, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 2.155391395554278, + "language_loss": 0.814731, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83611786, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9705, + "time_per_iteration": 2.456681966781616 + }, + { + "auxiliary_loss_clip": 0.01105829, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.03706515, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 2.7067870421451805, + "language_loss": 0.80659604, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82794762, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9706, + "time_per_iteration": 2.497509717941284 + }, + { + "auxiliary_loss_clip": 0.01104424, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.01860809, + "balance_loss_mlp": 1.03667164, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.0481497339382084, + "language_loss": 0.74599034, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.7673524, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6796875, + "step": 9707, + "time_per_iteration": 2.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.01103427, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.01876628, + "balance_loss_mlp": 1.03624749, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.608372812838098, + "language_loss": 0.81249726, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83383656, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9708, + "time_per_iteration": 2.492741584777832 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01569307, + "balance_loss_mlp": 1.03903604, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.7521527331614153, + "language_loss": 0.78249604, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80383801, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9709, + "time_per_iteration": 2.476956844329834 + }, + { + "auxiliary_loss_clip": 0.01027997, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00083506, + "balance_loss_mlp": 1.0065155, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7691792257321526, + "language_loss": 0.56582153, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58612299, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.21484375, + "step": 9710, + "time_per_iteration": 3.0814101696014404 + }, + { + "auxiliary_loss_clip": 0.0110345, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.0211035, + "balance_loss_mlp": 1.03715682, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.5515305439757483, + "language_loss": 0.65762496, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67898679, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9711, + "time_per_iteration": 2.4872825145721436 + }, + { + "auxiliary_loss_clip": 0.01112071, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02147532, + "balance_loss_mlp": 1.03822017, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.6429842517443687, + "language_loss": 0.78599298, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80746061, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9712, + "time_per_iteration": 2.442077398300171 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01025498, + "balance_loss_clip": 1.01343966, + "balance_loss_mlp": 1.03646183, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.7240347174541215, + "language_loss": 0.73268932, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.7539975, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9713, + "time_per_iteration": 2.459120750427246 + }, + { + "auxiliary_loss_clip": 0.01110691, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01647365, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.8470967199163717, + "language_loss": 0.69391453, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71531737, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9714, + "time_per_iteration": 2.4558205604553223 + }, + { + "auxiliary_loss_clip": 0.01106219, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01737761, + "balance_loss_mlp": 1.03574395, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.7342681115417722, + "language_loss": 0.79977894, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82114303, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9715, + "time_per_iteration": 2.426506757736206 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01666617, + "balance_loss_mlp": 1.037099, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 4.9488403812071535, + "language_loss": 0.72778314, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74913943, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9716, + "time_per_iteration": 2.44687819480896 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.01595879, + "balance_loss_mlp": 1.03716838, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.6127648254863816, + "language_loss": 0.74810076, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76941431, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9717, + "time_per_iteration": 2.460857629776001 + }, + { + "auxiliary_loss_clip": 0.01105902, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02414012, + "balance_loss_mlp": 1.03733993, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.202005488151785, + "language_loss": 0.7997486, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82117224, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9718, + "time_per_iteration": 2.4178881645202637 + }, + { + "auxiliary_loss_clip": 0.01106549, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.01891065, + "balance_loss_mlp": 1.03846669, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.4800218219438264, + "language_loss": 0.67422116, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69560701, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 9719, + "time_per_iteration": 3.8449153900146484 + }, + { + "auxiliary_loss_clip": 0.01107677, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.01631081, + "balance_loss_mlp": 1.03717732, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.1638863024999484, + "language_loss": 0.75937355, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78073382, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9720, + "time_per_iteration": 2.521005630493164 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02579594, + "balance_loss_mlp": 1.03623533, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.5774446570210707, + "language_loss": 0.83079016, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85223192, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9721, + "time_per_iteration": 3.9998085498809814 + }, + { + "auxiliary_loss_clip": 0.01030301, + "auxiliary_loss_mlp": 0.01007637, + "balance_loss_clip": 1.00636697, + "balance_loss_mlp": 1.00867438, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9369686939257119, + "language_loss": 0.71297473, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73335409, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21679688, + "step": 9722, + "time_per_iteration": 4.55988335609436 + }, + { + "auxiliary_loss_clip": 0.01104254, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.0219183, + "balance_loss_mlp": 1.03621197, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.3592007880272097, + "language_loss": 0.89236099, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91374058, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9723, + "time_per_iteration": 3.8671655654907227 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.02042723, + "balance_loss_mlp": 1.03859067, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4227647539631216, + "language_loss": 0.68610382, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70749187, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 9724, + "time_per_iteration": 2.428325653076172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.0221715, + "balance_loss_mlp": 1.04199743, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.8750713541003288, + "language_loss": 0.86348903, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88497603, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9725, + "time_per_iteration": 2.4113223552703857 + }, + { + "auxiliary_loss_clip": 0.01112675, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.02615535, + "balance_loss_mlp": 1.04008734, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.9888550356442254, + "language_loss": 0.82856494, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85007912, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9726, + "time_per_iteration": 2.4277760982513428 + }, + { + "auxiliary_loss_clip": 0.01108752, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02550149, + "balance_loss_mlp": 1.03925705, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8720162128796731, + "language_loss": 0.66911906, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69057649, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 9727, + "time_per_iteration": 2.4941296577453613 + }, + { + "auxiliary_loss_clip": 0.011109, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.02927577, + "balance_loss_mlp": 1.04078412, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.755089310778911, + "language_loss": 0.81880605, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84033632, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9728, + "time_per_iteration": 2.504457950592041 + }, + { + "auxiliary_loss_clip": 0.01105423, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.03857303, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.9458365932895556, + "language_loss": 0.78459418, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80601627, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 9729, + "time_per_iteration": 2.4906978607177734 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.01629853, + "balance_loss_mlp": 1.03705525, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 2.2265789157985205, + "language_loss": 0.70611644, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72748184, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9730, + "time_per_iteration": 2.5273194313049316 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.02182508, + "balance_loss_mlp": 1.04057014, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 2.222037907468424, + "language_loss": 0.78473902, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80622888, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9731, + "time_per_iteration": 2.4710583686828613 + }, + { + "auxiliary_loss_clip": 0.0110815, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03908777, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 1.7845208257427057, + "language_loss": 0.69966131, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72106874, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9732, + "time_per_iteration": 2.4975006580352783 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.01913929, + "balance_loss_mlp": 1.04045916, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.73190032828597, + "language_loss": 0.52698147, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54844654, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 9733, + "time_per_iteration": 2.485399007797241 + }, + { + "auxiliary_loss_clip": 0.01111159, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.04071164, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 6.263677136925273, + "language_loss": 0.87694037, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89835489, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9734, + "time_per_iteration": 2.472288131713867 + }, + { + "auxiliary_loss_clip": 0.01103403, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02119839, + "balance_loss_mlp": 1.03833449, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.513447931139509, + "language_loss": 0.72063559, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.7419939, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 9735, + "time_per_iteration": 2.4491236209869385 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.02809381, + "balance_loss_mlp": 1.04022026, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.443961120173282, + "language_loss": 0.74189854, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76345086, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9736, + "time_per_iteration": 2.419142961502075 + }, + { + "auxiliary_loss_clip": 0.01106138, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.0379591, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 2.2236691167379083, + "language_loss": 0.70181298, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72319508, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9737, + "time_per_iteration": 2.6583194732666016 + }, + { + "auxiliary_loss_clip": 0.01106196, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.02428091, + "balance_loss_mlp": 1.03835154, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7203982017599655, + "language_loss": 0.82579291, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84721613, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 9738, + "time_per_iteration": 2.4531257152557373 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02323246, + "balance_loss_mlp": 1.04034615, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.7945048569600959, + "language_loss": 0.68588519, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70738328, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9739, + "time_per_iteration": 2.456914186477661 + }, + { + "auxiliary_loss_clip": 0.01108939, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.01416099, + "balance_loss_mlp": 1.03718436, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 3.661868392990544, + "language_loss": 0.58782631, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60918605, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9740, + "time_per_iteration": 2.4507863521575928 + }, + { + "auxiliary_loss_clip": 0.01106066, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01462674, + "balance_loss_mlp": 1.03621328, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 2.5503677599504138, + "language_loss": 0.74937272, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77070647, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9741, + "time_per_iteration": 2.4589905738830566 + }, + { + "auxiliary_loss_clip": 0.01110252, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01856065, + "balance_loss_mlp": 1.04028082, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 5.17192355324585, + "language_loss": 0.75760782, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77901655, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9742, + "time_per_iteration": 2.4604122638702393 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03765917, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.843175426453247, + "language_loss": 0.74955082, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77090788, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9743, + "time_per_iteration": 2.4604763984680176 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.0171833, + "balance_loss_mlp": 1.03878045, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.7092789137699793, + "language_loss": 0.81049299, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83184063, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9744, + "time_per_iteration": 2.516517162322998 + }, + { + "auxiliary_loss_clip": 0.0111328, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01756728, + "balance_loss_mlp": 1.04009771, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.7947324983718902, + "language_loss": 0.71260583, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73403156, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.73046875, + "step": 9745, + "time_per_iteration": 2.5095736980438232 + }, + { + "auxiliary_loss_clip": 0.01029472, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 0.99980211, + "balance_loss_mlp": 1.00798225, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7288291603374486, + "language_loss": 0.5328598, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55316496, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9746, + "time_per_iteration": 3.1588006019592285 + }, + { + "auxiliary_loss_clip": 0.01111789, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.01581621, + "balance_loss_mlp": 1.04034877, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.1076565833563743, + "language_loss": 0.73041242, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75181913, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9747, + "time_per_iteration": 2.529571533203125 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.02153921, + "balance_loss_mlp": 1.03954244, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 2.1114805581962934, + "language_loss": 0.81232262, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83376622, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9748, + "time_per_iteration": 2.4205257892608643 + }, + { + "auxiliary_loss_clip": 0.01108981, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.01904488, + "balance_loss_mlp": 1.03803837, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.6594717662282998, + "language_loss": 0.71928638, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74069047, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9749, + "time_per_iteration": 2.4881033897399902 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01869583, + "balance_loss_mlp": 1.04076529, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.0326510096801056, + "language_loss": 0.7436285, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.76502097, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9750, + "time_per_iteration": 2.414621353149414 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02279603, + "balance_loss_mlp": 1.03921914, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.743949260258008, + "language_loss": 0.71048808, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73192453, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9751, + "time_per_iteration": 2.4829182624816895 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.02050161, + "balance_loss_mlp": 1.04000425, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.8642101544605258, + "language_loss": 0.74632239, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76778823, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9752, + "time_per_iteration": 2.4715142250061035 + }, + { + "auxiliary_loss_clip": 0.01107296, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.01895666, + "balance_loss_mlp": 1.0386945, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7856678678755609, + "language_loss": 0.77179754, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79318273, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9753, + "time_per_iteration": 2.443422794342041 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01026457, + "balance_loss_clip": 1.01546574, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.9587413882718219, + "language_loss": 0.70530736, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.72662538, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9754, + "time_per_iteration": 2.409973621368408 + }, + { + "auxiliary_loss_clip": 0.01107928, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03827429, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.747136336565704, + "language_loss": 0.72055626, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74194646, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9755, + "time_per_iteration": 2.4600584506988525 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00899053, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7394274912640315, + "language_loss": 0.5697751, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59011161, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21484375, + "step": 9756, + "time_per_iteration": 3.0282156467437744 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114117, + "balance_loss_mlp": 1.03774321, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.7702895540430315, + "language_loss": 0.76155764, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78293204, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 9757, + "time_per_iteration": 2.5391111373901367 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01004494, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00899124, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8655305518018972, + "language_loss": 0.60531819, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62566704, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21484375, + "step": 9758, + "time_per_iteration": 3.0623366832733154 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01773787, + "balance_loss_mlp": 1.03982747, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.3357598656034897, + "language_loss": 0.71766979, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73910952, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 9759, + "time_per_iteration": 2.474400043487549 + }, + { + "auxiliary_loss_clip": 0.0111074, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.0208931, + "balance_loss_mlp": 1.04039979, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5007272591007914, + "language_loss": 0.73244017, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.7538712, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9760, + "time_per_iteration": 4.081261396408081 + }, + { + "auxiliary_loss_clip": 0.01106401, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01556969, + "balance_loss_mlp": 1.03715563, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.8805423527385174, + "language_loss": 0.72491598, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74625897, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9761, + "time_per_iteration": 2.42621111869812 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.0167706, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.837534804487864, + "language_loss": 0.74821299, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.76959074, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9762, + "time_per_iteration": 3.899322032928467 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01706839, + "balance_loss_mlp": 1.03918064, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.367882310541282, + "language_loss": 0.72223246, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74359524, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9763, + "time_per_iteration": 4.356280326843262 + }, + { + "auxiliary_loss_clip": 0.01105096, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01809907, + "balance_loss_mlp": 1.03675938, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.4976833867772195, + "language_loss": 0.79729784, + "learning_rate": 1.53745602625755e-06, + "loss": 0.81864572, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9764, + "time_per_iteration": 3.9194676876068115 + }, + { + "auxiliary_loss_clip": 0.01108839, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.0202508, + "balance_loss_mlp": 1.03856993, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 2.0111563944475908, + "language_loss": 0.78612924, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.80754542, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9765, + "time_per_iteration": 2.53273344039917 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03891206, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8843759319265088, + "language_loss": 0.83718032, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.8585422, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 9766, + "time_per_iteration": 2.467556953430176 + }, + { + "auxiliary_loss_clip": 0.01110103, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.02024257, + "balance_loss_mlp": 1.03847504, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 2.6418409503909674, + "language_loss": 0.69825381, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71967709, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.71875, + "step": 9767, + "time_per_iteration": 2.514695405960083 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02162123, + "balance_loss_mlp": 1.03721809, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.7100990150928812, + "language_loss": 0.6345011, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65592575, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9768, + "time_per_iteration": 2.510586738586426 + }, + { + "auxiliary_loss_clip": 0.01029111, + "auxiliary_loss_mlp": 0.00999867, + "balance_loss_clip": 0.9985916, + "balance_loss_mlp": 1.00760961, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7128870586180143, + "language_loss": 0.53924322, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.559533, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21484375, + "step": 9769, + "time_per_iteration": 3.0710904598236084 + }, + { + "auxiliary_loss_clip": 0.01104834, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.02148712, + "balance_loss_mlp": 1.03672135, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4641633186547043, + "language_loss": 0.70532131, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.7267043, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9770, + "time_per_iteration": 2.516707420349121 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.0193336, + "balance_loss_mlp": 1.03782773, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 3.691664094278214, + "language_loss": 0.67488074, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69626534, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9771, + "time_per_iteration": 2.4816172122955322 + }, + { + "auxiliary_loss_clip": 0.01107891, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02041984, + "balance_loss_mlp": 1.03628254, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.6051808895674682, + "language_loss": 0.65752995, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.67894471, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9772, + "time_per_iteration": 2.5371270179748535 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.02478576, + "balance_loss_mlp": 1.03915095, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.7393863773768459, + "language_loss": 0.74272907, + "learning_rate": 1.534046611017519e-06, + "loss": 0.7642293, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9773, + "time_per_iteration": 2.4879984855651855 + }, + { + "auxiliary_loss_clip": 0.01108784, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.02513528, + "balance_loss_mlp": 1.03829455, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.707979121748391, + "language_loss": 0.53293657, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55440396, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9774, + "time_per_iteration": 2.5072500705718994 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.02257991, + "balance_loss_mlp": 1.03880942, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.48971225310605, + "language_loss": 0.65312964, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.6745823, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9775, + "time_per_iteration": 2.5655953884124756 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.02168775, + "balance_loss_mlp": 1.03676975, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.785458151895031, + "language_loss": 0.73554152, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.7569468, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9776, + "time_per_iteration": 2.54707932472229 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02136469, + "balance_loss_mlp": 1.0373795, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.328878154900185, + "language_loss": 0.74400878, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76541013, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9777, + "time_per_iteration": 2.5013017654418945 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03745651, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.542678345734907, + "language_loss": 0.74238187, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76371026, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 9778, + "time_per_iteration": 2.548445224761963 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.01765513, + "balance_loss_mlp": 1.03588045, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8670942886874708, + "language_loss": 0.70107329, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72240877, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9779, + "time_per_iteration": 2.440385341644287 + }, + { + "auxiliary_loss_clip": 0.01109422, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.02029324, + "balance_loss_mlp": 1.03690886, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8860885981569304, + "language_loss": 0.67181754, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69323874, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 9780, + "time_per_iteration": 2.5105738639831543 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02308023, + "balance_loss_mlp": 1.03872418, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 3.148071574180809, + "language_loss": 0.72608495, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74752629, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9781, + "time_per_iteration": 2.4174652099609375 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.01946235, + "balance_loss_mlp": 1.03562713, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.4505377017032317, + "language_loss": 0.70405555, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72542155, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9782, + "time_per_iteration": 2.4488813877105713 + }, + { + "auxiliary_loss_clip": 0.01108141, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.02064204, + "balance_loss_mlp": 1.03547001, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 3.528130932430564, + "language_loss": 0.70414114, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72555161, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 9783, + "time_per_iteration": 2.411940813064575 + }, + { + "auxiliary_loss_clip": 0.01109132, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.0186553, + "balance_loss_mlp": 1.03764033, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 2.8122189742296952, + "language_loss": 0.6903708, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71178293, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 9784, + "time_per_iteration": 2.4809060096740723 + }, + { + "auxiliary_loss_clip": 0.01107726, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.02227104, + "balance_loss_mlp": 1.03585327, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.976987554101205, + "language_loss": 0.69485259, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71627975, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9785, + "time_per_iteration": 2.5458383560180664 + }, + { + "auxiliary_loss_clip": 0.0110444, + "auxiliary_loss_mlp": 0.01026297, + "balance_loss_clip": 1.01477504, + "balance_loss_mlp": 1.03624511, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.0068567513814375, + "language_loss": 0.77542102, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79672837, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 9786, + "time_per_iteration": 2.4269275665283203 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02173829, + "balance_loss_mlp": 1.03681958, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.4388452349288328, + "language_loss": 0.79175329, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81317246, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 9787, + "time_per_iteration": 2.441265344619751 + }, + { + "auxiliary_loss_clip": 0.01106621, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.03677058, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.596428038291934, + "language_loss": 0.66514194, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68651974, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9788, + "time_per_iteration": 2.4632344245910645 + }, + { + "auxiliary_loss_clip": 0.01106001, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01877332, + "balance_loss_mlp": 1.03787911, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.066265402471891, + "language_loss": 0.79951847, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82088816, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9789, + "time_per_iteration": 2.4486775398254395 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.01579237, + "balance_loss_mlp": 1.03722358, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.510117689081276, + "language_loss": 0.70817208, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72950107, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9790, + "time_per_iteration": 2.474634885787964 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02120566, + "balance_loss_mlp": 1.0374043, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.9043586619327855, + "language_loss": 0.83184004, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85322857, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9791, + "time_per_iteration": 2.4930591583251953 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.02189827, + "balance_loss_mlp": 1.04020619, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5964011084944127, + "language_loss": 0.76287472, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78432798, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9792, + "time_per_iteration": 2.572164297103882 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.0383172, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 1.954465265842666, + "language_loss": 0.69085598, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71227252, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.72265625, + "step": 9793, + "time_per_iteration": 2.440532684326172 + }, + { + "auxiliary_loss_clip": 0.01105715, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02001381, + "balance_loss_mlp": 1.03754866, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 2.2945820531528547, + "language_loss": 0.60200524, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.6233902, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9794, + "time_per_iteration": 2.4281349182128906 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02265191, + "balance_loss_mlp": 1.03800488, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.8105141483242522, + "language_loss": 0.65209466, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67352962, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9795, + "time_per_iteration": 2.4471983909606934 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02589679, + "balance_loss_mlp": 1.03751063, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.6472816848345888, + "language_loss": 0.74171197, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76313925, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9796, + "time_per_iteration": 2.4404211044311523 + }, + { + "auxiliary_loss_clip": 0.0110878, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01761484, + "balance_loss_mlp": 1.04002237, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.4898681844876358, + "language_loss": 0.83064574, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85202533, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9797, + "time_per_iteration": 2.487971544265747 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03718829, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.804693100831568, + "language_loss": 0.78741366, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.80876774, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 9798, + "time_per_iteration": 2.4391119480133057 + }, + { + "auxiliary_loss_clip": 0.01104678, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.01994312, + "balance_loss_mlp": 1.03718722, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.097614269824193, + "language_loss": 0.74100447, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76236397, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9799, + "time_per_iteration": 2.444185972213745 + }, + { + "auxiliary_loss_clip": 0.01110656, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01852775, + "balance_loss_mlp": 1.03889656, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.9705578864506654, + "language_loss": 0.76078779, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78221321, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 9800, + "time_per_iteration": 2.4564571380615234 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.02804899, + "balance_loss_mlp": 1.03771484, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.9698106702703237, + "language_loss": 0.78824806, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8097297, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9801, + "time_per_iteration": 2.439195156097412 + }, + { + "auxiliary_loss_clip": 0.01109337, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.01721966, + "balance_loss_mlp": 1.03768528, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.596016426383407, + "language_loss": 0.65912932, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68051648, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 9802, + "time_per_iteration": 3.8562896251678467 + }, + { + "auxiliary_loss_clip": 0.01106914, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.01552522, + "balance_loss_mlp": 1.03888416, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.5756682227023782, + "language_loss": 0.78167737, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.8030206, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9803, + "time_per_iteration": 2.4531607627868652 + }, + { + "auxiliary_loss_clip": 0.01108754, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.02300191, + "balance_loss_mlp": 1.03849792, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.5070835087317231, + "language_loss": 0.7292577, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75070107, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9804, + "time_per_iteration": 3.909280776977539 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03996158, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.9252543926260512, + "language_loss": 0.7480545, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76943576, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9805, + "time_per_iteration": 3.92484712600708 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01807201, + "balance_loss_mlp": 1.03880858, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 2.2161041024358736, + "language_loss": 0.7798723, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8013162, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9806, + "time_per_iteration": 3.958747625350952 + }, + { + "auxiliary_loss_clip": 0.01106773, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.03678048, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 2.028844636014754, + "language_loss": 0.77013928, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79151416, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9807, + "time_per_iteration": 2.437091827392578 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01834261, + "balance_loss_mlp": 1.040411, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 2.123691808114849, + "language_loss": 0.74406278, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76549083, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9808, + "time_per_iteration": 2.4456939697265625 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.01794887, + "balance_loss_mlp": 1.03927052, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 1.9040797268830973, + "language_loss": 0.71715617, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.73858464, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 9809, + "time_per_iteration": 2.4555907249450684 + }, + { + "auxiliary_loss_clip": 0.01111034, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.02066374, + "balance_loss_mlp": 1.03881156, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 2.6575599068105262, + "language_loss": 0.81872356, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84017026, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9810, + "time_per_iteration": 2.546018600463867 + }, + { + "auxiliary_loss_clip": 0.01105843, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.01815283, + "balance_loss_mlp": 1.03850091, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.679981614097192, + "language_loss": 0.8076582, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8290174, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 9811, + "time_per_iteration": 2.432685613632202 + }, + { + "auxiliary_loss_clip": 0.01110453, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01449776, + "balance_loss_mlp": 1.03924918, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.903117615206719, + "language_loss": 0.76666933, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78804982, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9812, + "time_per_iteration": 2.45906138420105 + }, + { + "auxiliary_loss_clip": 0.01107232, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01794147, + "balance_loss_mlp": 1.03932881, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 3.543593991514859, + "language_loss": 0.70407474, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72543478, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9813, + "time_per_iteration": 2.417073965072632 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01969576, + "balance_loss_mlp": 1.03846037, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.496524946754694, + "language_loss": 0.72230315, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74370211, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9814, + "time_per_iteration": 2.527130365371704 + }, + { + "auxiliary_loss_clip": 0.0110797, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.01768374, + "balance_loss_mlp": 1.03837872, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.8734717265521494, + "language_loss": 0.78583348, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80721277, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9815, + "time_per_iteration": 2.4397730827331543 + }, + { + "auxiliary_loss_clip": 0.01114156, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02197468, + "balance_loss_mlp": 1.03963876, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 2.0868241481245415, + "language_loss": 0.7557171, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.7772131, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9816, + "time_per_iteration": 2.443861484527588 + }, + { + "auxiliary_loss_clip": 0.01110119, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.02063835, + "balance_loss_mlp": 1.04108596, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.234392841889587, + "language_loss": 0.81303239, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83446503, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9817, + "time_per_iteration": 2.4248719215393066 + }, + { + "auxiliary_loss_clip": 0.01109425, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.02112818, + "balance_loss_mlp": 1.03941548, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.5738429375950187, + "language_loss": 0.76401961, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78544545, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9818, + "time_per_iteration": 2.445507526397705 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.01861358, + "balance_loss_mlp": 1.03634679, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.8418500679377416, + "language_loss": 0.66351467, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68486011, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9819, + "time_per_iteration": 2.4585890769958496 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01972878, + "balance_loss_mlp": 1.03955185, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.5583203498776486, + "language_loss": 0.77830237, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.79972136, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9820, + "time_per_iteration": 2.452444314956665 + }, + { + "auxiliary_loss_clip": 0.01033068, + "auxiliary_loss_mlp": 0.01003995, + "balance_loss_clip": 1.00268924, + "balance_loss_mlp": 1.01099396, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9230258023741272, + "language_loss": 0.65167463, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67204523, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22070312, + "step": 9821, + "time_per_iteration": 3.0410289764404297 + }, + { + "auxiliary_loss_clip": 0.01106857, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.0159936, + "balance_loss_mlp": 1.03887093, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.8405567429237777, + "language_loss": 0.61040848, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63175792, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9822, + "time_per_iteration": 2.4597485065460205 + }, + { + "auxiliary_loss_clip": 0.01110158, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.02598214, + "balance_loss_mlp": 1.03878164, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.8163106241475082, + "language_loss": 0.82910824, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.850595, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9823, + "time_per_iteration": 2.4342074394226074 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.02025676, + "balance_loss_mlp": 1.03823602, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 1.9061097186750977, + "language_loss": 0.73051912, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75191379, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9824, + "time_per_iteration": 2.474583387374878 + }, + { + "auxiliary_loss_clip": 0.01115754, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.02118862, + "balance_loss_mlp": 1.03907609, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.117093757339989, + "language_loss": 0.82486725, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84637952, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 9825, + "time_per_iteration": 2.4499030113220215 + }, + { + "auxiliary_loss_clip": 0.01104731, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01781034, + "balance_loss_mlp": 1.03812122, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7674632389005596, + "language_loss": 0.77194965, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79328513, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9826, + "time_per_iteration": 2.490628480911255 + }, + { + "auxiliary_loss_clip": 0.01106346, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.01523519, + "balance_loss_mlp": 1.03757071, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.8211120400501501, + "language_loss": 0.72350824, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74483871, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9827, + "time_per_iteration": 2.496574640274048 + }, + { + "auxiliary_loss_clip": 0.01107742, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01782858, + "balance_loss_mlp": 1.0374589, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.7932913826709562, + "language_loss": 0.79741728, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81878424, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.703125, + "step": 9828, + "time_per_iteration": 2.51045298576355 + }, + { + "auxiliary_loss_clip": 0.01111624, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02125204, + "balance_loss_mlp": 1.03958178, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.271428998540672, + "language_loss": 0.88056707, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90202534, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9829, + "time_per_iteration": 2.4169514179229736 + }, + { + "auxiliary_loss_clip": 0.01031439, + "auxiliary_loss_mlp": 0.00999905, + "balance_loss_clip": 0.99870729, + "balance_loss_mlp": 1.00956726, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7537251091943264, + "language_loss": 0.57855141, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59886479, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.21875, + "step": 9830, + "time_per_iteration": 2.996295928955078 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.01852536, + "balance_loss_mlp": 1.04140687, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.0665850759749813, + "language_loss": 0.76163888, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78313708, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9831, + "time_per_iteration": 2.461068868637085 + }, + { + "auxiliary_loss_clip": 0.01105452, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01565278, + "balance_loss_mlp": 1.03923118, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.602158251769988, + "language_loss": 0.7790612, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80039072, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 9832, + "time_per_iteration": 2.4806432723999023 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.01554728, + "balance_loss_mlp": 1.03533232, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.7748958571682212, + "language_loss": 0.83552635, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85685176, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9833, + "time_per_iteration": 2.436558961868286 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03818607, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.512608687160236, + "language_loss": 0.74505258, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76645797, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9834, + "time_per_iteration": 2.497488260269165 + }, + { + "auxiliary_loss_clip": 0.01107604, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.01680338, + "balance_loss_mlp": 1.03707302, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.15246332260658, + "language_loss": 0.78111219, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.8024776, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9835, + "time_per_iteration": 2.428570032119751 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.02080131, + "balance_loss_mlp": 1.03782153, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.790579015547894, + "language_loss": 0.74016017, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76157123, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9836, + "time_per_iteration": 2.4571895599365234 + }, + { + "auxiliary_loss_clip": 0.01108454, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01332974, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.0887710674316335, + "language_loss": 0.81834614, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.83968431, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9837, + "time_per_iteration": 2.425869941711426 + }, + { + "auxiliary_loss_clip": 0.01109463, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.01760364, + "balance_loss_mlp": 1.03828216, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.6633412669476784, + "language_loss": 0.79169023, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81309044, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9838, + "time_per_iteration": 2.480945348739624 + }, + { + "auxiliary_loss_clip": 0.01109443, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03814876, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.9639883281700399, + "language_loss": 0.6955409, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.7169646, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9839, + "time_per_iteration": 2.445032835006714 + }, + { + "auxiliary_loss_clip": 0.01109116, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02804112, + "balance_loss_mlp": 1.03763521, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.156057098485451, + "language_loss": 0.65970773, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68120515, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9840, + "time_per_iteration": 2.4208333492279053 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.01920867, + "balance_loss_mlp": 1.03765261, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.6889823147578333, + "language_loss": 0.81775278, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83916378, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9841, + "time_per_iteration": 2.485783576965332 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.0175122, + "balance_loss_mlp": 1.03609967, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5545668932192243, + "language_loss": 0.68891448, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71024531, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9842, + "time_per_iteration": 2.4090652465820312 + }, + { + "auxiliary_loss_clip": 0.01108304, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.03697038, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.8995177421561278, + "language_loss": 0.8258518, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84725767, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9843, + "time_per_iteration": 2.456085443496704 + }, + { + "auxiliary_loss_clip": 0.01107968, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01810205, + "balance_loss_mlp": 1.03701758, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.3414678440212953, + "language_loss": 0.81883448, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84022528, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9844, + "time_per_iteration": 3.834216833114624 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01842773, + "balance_loss_mlp": 1.04004455, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.8185302816606077, + "language_loss": 0.74449736, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76591957, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9845, + "time_per_iteration": 2.409029960632324 + }, + { + "auxiliary_loss_clip": 0.01108139, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01324248, + "balance_loss_mlp": 1.03682494, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 2.2228008907542027, + "language_loss": 0.63848257, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65982717, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9846, + "time_per_iteration": 3.994704246520996 + }, + { + "auxiliary_loss_clip": 0.01100388, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.01529002, + "balance_loss_mlp": 1.03501678, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.7549171077463366, + "language_loss": 0.76315683, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78442299, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9847, + "time_per_iteration": 3.815723419189453 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01892579, + "balance_loss_mlp": 1.03723776, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.7391013556086516, + "language_loss": 0.6229955, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.6443814, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9848, + "time_per_iteration": 3.9868550300598145 + }, + { + "auxiliary_loss_clip": 0.01108795, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02532125, + "balance_loss_mlp": 1.03819513, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.784596822173483, + "language_loss": 0.75762534, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77908659, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9849, + "time_per_iteration": 2.4613027572631836 + }, + { + "auxiliary_loss_clip": 0.01108412, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02006471, + "balance_loss_mlp": 1.0370928, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.6562680086624124, + "language_loss": 0.75594199, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77735424, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9850, + "time_per_iteration": 2.5371382236480713 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01676035, + "balance_loss_mlp": 1.03672051, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7347218503083297, + "language_loss": 0.7573396, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.7786963, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9851, + "time_per_iteration": 2.4500503540039062 + }, + { + "auxiliary_loss_clip": 0.01106705, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.0179739, + "balance_loss_mlp": 1.03609896, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.818113501506117, + "language_loss": 0.70232719, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72369695, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 9852, + "time_per_iteration": 2.50327205657959 + }, + { + "auxiliary_loss_clip": 0.01112321, + "auxiliary_loss_mlp": 0.01040222, + "balance_loss_clip": 1.02710271, + "balance_loss_mlp": 1.03861785, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 38.24844963287624, + "language_loss": 0.8025564, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82408178, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 9853, + "time_per_iteration": 2.443661689758301 + }, + { + "auxiliary_loss_clip": 0.01103448, + "auxiliary_loss_mlp": 0.01025904, + "balance_loss_clip": 1.01480556, + "balance_loss_mlp": 1.03603673, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.502563314800498, + "language_loss": 0.67641807, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69771153, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9854, + "time_per_iteration": 2.5323755741119385 + }, + { + "auxiliary_loss_clip": 0.01105063, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01371408, + "balance_loss_mlp": 1.03610444, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.6522001385368033, + "language_loss": 0.88777542, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90908301, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9855, + "time_per_iteration": 2.4309167861938477 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01936722, + "balance_loss_mlp": 1.03590918, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.7115668008760792, + "language_loss": 0.86635554, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88768005, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 9856, + "time_per_iteration": 2.464066743850708 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02256656, + "balance_loss_mlp": 1.03562045, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 2.1473398743532153, + "language_loss": 0.77584958, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79724526, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 9857, + "time_per_iteration": 2.4102070331573486 + }, + { + "auxiliary_loss_clip": 0.01109396, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03954232, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 1.9751188115052367, + "language_loss": 0.64351666, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66494453, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9858, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01884377, + "balance_loss_mlp": 1.03700852, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.9049315760209506, + "language_loss": 0.77045393, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79180634, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 9859, + "time_per_iteration": 2.478782892227173 + }, + { + "auxiliary_loss_clip": 0.01110235, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.02082372, + "balance_loss_mlp": 1.03882456, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.1565186381803194, + "language_loss": 0.75153667, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77297652, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9860, + "time_per_iteration": 2.4513912200927734 + }, + { + "auxiliary_loss_clip": 0.01106266, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02277529, + "balance_loss_mlp": 1.03840578, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6305970530500205, + "language_loss": 0.76227921, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78367937, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 9861, + "time_per_iteration": 2.474095344543457 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.0131923, + "balance_loss_mlp": 1.03595328, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.8413108938997076, + "language_loss": 0.70368218, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72496319, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 9862, + "time_per_iteration": 2.539903402328491 + }, + { + "auxiliary_loss_clip": 0.0110657, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01868796, + "balance_loss_mlp": 1.03706694, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.8355876983877193, + "language_loss": 0.77771485, + "learning_rate": 1.500032899685832e-06, + "loss": 0.7990849, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9863, + "time_per_iteration": 2.4712796211242676 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.02583861, + "balance_loss_mlp": 1.03730559, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.8648903136261632, + "language_loss": 0.70763469, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72908574, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9864, + "time_per_iteration": 2.52478289604187 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.02201343, + "balance_loss_mlp": 1.03711009, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 2.2141122969684655, + "language_loss": 0.67234761, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69376296, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 9865, + "time_per_iteration": 2.4957449436187744 + }, + { + "auxiliary_loss_clip": 0.0110929, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01892328, + "balance_loss_mlp": 1.03758049, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 1.8936144812420768, + "language_loss": 0.78334385, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.8047536, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 9866, + "time_per_iteration": 2.4394681453704834 + }, + { + "auxiliary_loss_clip": 0.01105609, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.01867127, + "balance_loss_mlp": 1.03786838, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 1.98454003485575, + "language_loss": 0.72037029, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.7417264, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9867, + "time_per_iteration": 2.5107383728027344 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01795018, + "balance_loss_mlp": 1.03910947, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.538584883762445, + "language_loss": 0.66726553, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68864822, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9868, + "time_per_iteration": 2.5143752098083496 + }, + { + "auxiliary_loss_clip": 0.01107645, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03726101, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5720110660148519, + "language_loss": 0.75083476, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77224427, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9869, + "time_per_iteration": 2.4784231185913086 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.0103956, + "balance_loss_clip": 1.02690041, + "balance_loss_mlp": 1.0391326, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.6442009630814416, + "language_loss": 0.74131197, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76281238, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9870, + "time_per_iteration": 2.8396053314208984 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.04010868, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 1.9765481299651093, + "language_loss": 0.71421361, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7355839, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9871, + "time_per_iteration": 2.460695505142212 + }, + { + "auxiliary_loss_clip": 0.01110046, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.03879905, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 1.9723601672672642, + "language_loss": 0.74131697, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76274526, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9872, + "time_per_iteration": 2.4877848625183105 + }, + { + "auxiliary_loss_clip": 0.01111497, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02359688, + "balance_loss_mlp": 1.03926826, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.142318153174813, + "language_loss": 0.78675568, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.80824012, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 9873, + "time_per_iteration": 2.4480934143066406 + }, + { + "auxiliary_loss_clip": 0.01109102, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02197838, + "balance_loss_mlp": 1.03843832, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5306423792742176, + "language_loss": 0.85011673, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87155473, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9874, + "time_per_iteration": 2.5098774433135986 + }, + { + "auxiliary_loss_clip": 0.01030749, + "auxiliary_loss_mlp": 0.01001619, + "balance_loss_clip": 1.00044489, + "balance_loss_mlp": 1.00908446, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.6973173617166174, + "language_loss": 0.60004687, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62037057, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21679688, + "step": 9875, + "time_per_iteration": 3.1099135875701904 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.01980412, + "balance_loss_mlp": 1.0373013, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.0699471238582943, + "language_loss": 0.77501059, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.7964499, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9876, + "time_per_iteration": 2.466031551361084 + }, + { + "auxiliary_loss_clip": 0.01103172, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01652348, + "balance_loss_mlp": 1.03654408, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.5589386174362272, + "language_loss": 0.75830436, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77961862, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9877, + "time_per_iteration": 2.4772722721099854 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.01892304, + "balance_loss_mlp": 1.03673589, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 4.77912842405454, + "language_loss": 0.81212896, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83353043, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9878, + "time_per_iteration": 2.511408805847168 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.02462077, + "balance_loss_mlp": 1.0378468, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7027842827521733, + "language_loss": 0.71123505, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73268974, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9879, + "time_per_iteration": 2.6537530422210693 + }, + { + "auxiliary_loss_clip": 0.01107077, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01973653, + "balance_loss_mlp": 1.03814936, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.4837097454893722, + "language_loss": 0.5739696, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59536058, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9880, + "time_per_iteration": 2.487082004547119 + }, + { + "auxiliary_loss_clip": 0.01107055, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.02435601, + "balance_loss_mlp": 1.03724837, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.7845732450958962, + "language_loss": 0.76980609, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79124796, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9881, + "time_per_iteration": 2.5019240379333496 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01437569, + "balance_loss_mlp": 1.03524506, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.214394269583833, + "language_loss": 0.82820934, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84953332, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9882, + "time_per_iteration": 2.4258036613464355 + }, + { + "auxiliary_loss_clip": 0.01106542, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.02321863, + "balance_loss_mlp": 1.03781402, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.5324902309588855, + "language_loss": 0.79348171, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81489801, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9883, + "time_per_iteration": 2.4191815853118896 + }, + { + "auxiliary_loss_clip": 0.01112982, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0214777, + "balance_loss_mlp": 1.03999424, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.7967272432241739, + "language_loss": 0.74134135, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.7628206, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9884, + "time_per_iteration": 2.4599032402038574 + }, + { + "auxiliary_loss_clip": 0.01112156, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.0181067, + "balance_loss_mlp": 1.04232001, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 3.4474311080183964, + "language_loss": 0.6639331, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68535531, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 9885, + "time_per_iteration": 3.940159797668457 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02392602, + "balance_loss_mlp": 1.03891098, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.562196250157405, + "language_loss": 0.77456462, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79600191, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9886, + "time_per_iteration": 2.4958837032318115 + }, + { + "auxiliary_loss_clip": 0.01029578, + "auxiliary_loss_mlp": 0.009997, + "balance_loss_clip": 0.99845427, + "balance_loss_mlp": 1.00789237, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8479500751523403, + "language_loss": 0.64580774, + "learning_rate": 1.490988081420423e-06, + "loss": 0.6661005, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21679688, + "step": 9887, + "time_per_iteration": 4.312393426895142 + }, + { + "auxiliary_loss_clip": 0.01106228, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01911473, + "balance_loss_mlp": 1.03743696, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.9767325567336362, + "language_loss": 0.69172513, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71310121, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9888, + "time_per_iteration": 3.8631362915039062 + }, + { + "auxiliary_loss_clip": 0.01108213, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5956528037649322, + "language_loss": 0.79466522, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81605208, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9889, + "time_per_iteration": 4.0321431159973145 + }, + { + "auxiliary_loss_clip": 0.01105021, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03607225, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.529319229595301, + "language_loss": 0.70732993, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72869068, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9890, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01110328, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.01994216, + "balance_loss_mlp": 1.03921902, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 2.2570879506032933, + "language_loss": 0.69334114, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71477234, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9891, + "time_per_iteration": 2.4280505180358887 + }, + { + "auxiliary_loss_clip": 0.01106776, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02376795, + "balance_loss_mlp": 1.03809762, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.752140694177181, + "language_loss": 0.53531826, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55674696, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9892, + "time_per_iteration": 2.4815757274627686 + }, + { + "auxiliary_loss_clip": 0.01030384, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00095928, + "balance_loss_mlp": 1.00874603, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6588951163028871, + "language_loss": 0.54535234, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56567693, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21679688, + "step": 9893, + "time_per_iteration": 3.1101529598236084 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.01892543, + "balance_loss_mlp": 1.03811431, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.588107459430707, + "language_loss": 0.74231315, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76368201, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9894, + "time_per_iteration": 2.4519400596618652 + }, + { + "auxiliary_loss_clip": 0.01106074, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.02005649, + "balance_loss_mlp": 1.03685939, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.6911288792838162, + "language_loss": 0.77848423, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79986584, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9895, + "time_per_iteration": 2.524150848388672 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.01923883, + "balance_loss_mlp": 1.0391717, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6627914614590094, + "language_loss": 0.79355633, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.814978, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9896, + "time_per_iteration": 2.450514078140259 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.0379312, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.56691412182982, + "language_loss": 0.83697438, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.8583793, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9897, + "time_per_iteration": 2.499427556991577 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02043986, + "balance_loss_mlp": 1.04021525, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.7628400615055348, + "language_loss": 0.70908117, + "learning_rate": 1.486846243389939e-06, + "loss": 0.7305249, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9898, + "time_per_iteration": 2.450711488723755 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.02481782, + "balance_loss_mlp": 1.03905582, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.840239375448059, + "language_loss": 0.64112437, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66265255, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 9899, + "time_per_iteration": 2.5394744873046875 + }, + { + "auxiliary_loss_clip": 0.01109128, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01592183, + "balance_loss_mlp": 1.04008675, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.879978941191363, + "language_loss": 0.71715653, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.73851436, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9900, + "time_per_iteration": 2.4623067378997803 + }, + { + "auxiliary_loss_clip": 0.01107194, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.01654577, + "balance_loss_mlp": 1.03926349, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.9859766918367532, + "language_loss": 0.84489024, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86625552, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9901, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.01028301, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.00249529, + "balance_loss_mlp": 1.00672269, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8098587011957621, + "language_loss": 0.58273184, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60305208, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.21582031, + "step": 9902, + "time_per_iteration": 2.9000015258789062 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01876402, + "balance_loss_mlp": 1.03859127, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 3.08671627053405, + "language_loss": 0.77136552, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79277885, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9903, + "time_per_iteration": 2.5076375007629395 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.04097402, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.7111155417857251, + "language_loss": 0.77616894, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79764313, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9904, + "time_per_iteration": 2.5716845989227295 + }, + { + "auxiliary_loss_clip": 0.01110151, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02405953, + "balance_loss_mlp": 1.03790653, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 2.2036474032145192, + "language_loss": 0.72382712, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74529308, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9905, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01789069, + "balance_loss_mlp": 1.03853083, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.6203597758298474, + "language_loss": 0.69817066, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71957242, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9906, + "time_per_iteration": 2.4373247623443604 + }, + { + "auxiliary_loss_clip": 0.01110789, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.02259731, + "balance_loss_mlp": 1.03987217, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.782354761153575, + "language_loss": 0.7491982, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77065903, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9907, + "time_per_iteration": 2.5548195838928223 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.02353668, + "balance_loss_mlp": 1.0388813, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.601142913290667, + "language_loss": 0.67155874, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69300842, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9908, + "time_per_iteration": 2.5727956295013428 + }, + { + "auxiliary_loss_clip": 0.01108392, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.01753211, + "balance_loss_mlp": 1.03904438, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 5.1100613292928365, + "language_loss": 0.76492268, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78630114, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9909, + "time_per_iteration": 2.459608554840088 + }, + { + "auxiliary_loss_clip": 0.01028544, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_clip": 1.00416493, + "balance_loss_mlp": 1.00715542, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9275868367088792, + "language_loss": 0.73427647, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75461495, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21386719, + "step": 9910, + "time_per_iteration": 3.1051745414733887 + }, + { + "auxiliary_loss_clip": 0.01108818, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.01981187, + "balance_loss_mlp": 1.03741884, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.6458105124951614, + "language_loss": 0.69844317, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71986043, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9911, + "time_per_iteration": 2.4647021293640137 + }, + { + "auxiliary_loss_clip": 0.01116428, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.02031708, + "balance_loss_mlp": 1.04145718, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.8630263408862686, + "language_loss": 0.65476716, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.6762681, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 9912, + "time_per_iteration": 2.4077272415161133 + }, + { + "auxiliary_loss_clip": 0.01108551, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02385449, + "balance_loss_mlp": 1.03806984, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.0476871057930772, + "language_loss": 0.73610109, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75755352, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9913, + "time_per_iteration": 2.5155045986175537 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.0178144, + "balance_loss_mlp": 1.03791463, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.0765652786465885, + "language_loss": 0.79696703, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81841141, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 9914, + "time_per_iteration": 2.4950027465820312 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.0189786, + "balance_loss_mlp": 1.03856075, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.9745402695948293, + "language_loss": 0.67218065, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69356596, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9915, + "time_per_iteration": 2.43723726272583 + }, + { + "auxiliary_loss_clip": 0.01107786, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.02319074, + "balance_loss_mlp": 1.03634763, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.613453800947639, + "language_loss": 0.78928566, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81071782, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9916, + "time_per_iteration": 2.456350088119507 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.01933253, + "balance_loss_mlp": 1.03744936, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.7690461818627004, + "language_loss": 0.82394695, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84535682, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9917, + "time_per_iteration": 2.469238758087158 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.02383804, + "balance_loss_mlp": 1.03837276, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.817824058021054, + "language_loss": 0.77982944, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.8012656, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9918, + "time_per_iteration": 2.4436004161834717 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02182257, + "balance_loss_mlp": 1.0390811, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.422582146168897, + "language_loss": 0.78566158, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80710077, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9919, + "time_per_iteration": 2.5787289142608643 + }, + { + "auxiliary_loss_clip": 0.01107781, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.02094173, + "balance_loss_mlp": 1.0381664, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.9239790966111896, + "language_loss": 0.77425951, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79567927, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 9920, + "time_per_iteration": 2.4440083503723145 + }, + { + "auxiliary_loss_clip": 0.01113744, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.01843953, + "balance_loss_mlp": 1.04212332, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.2435260632361733, + "language_loss": 0.82452321, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84596634, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9921, + "time_per_iteration": 2.456138849258423 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.02021408, + "balance_loss_mlp": 1.0373764, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.9967408520895134, + "language_loss": 0.80682462, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82823324, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9922, + "time_per_iteration": 2.4144599437713623 + }, + { + "auxiliary_loss_clip": 0.0110795, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01693821, + "balance_loss_mlp": 1.03790641, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.7485306495183626, + "language_loss": 0.77080536, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79218084, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9923, + "time_per_iteration": 2.489145517349243 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02174497, + "balance_loss_mlp": 1.04084301, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.7680593419575392, + "language_loss": 0.75725371, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.77876449, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9924, + "time_per_iteration": 2.4216740131378174 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.0190742, + "balance_loss_mlp": 1.03832626, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 3.198852886281723, + "language_loss": 0.6646719, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68604732, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 9925, + "time_per_iteration": 2.4475882053375244 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.01986837, + "balance_loss_mlp": 1.04157531, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.375187864026988, + "language_loss": 0.71979719, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74121475, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9926, + "time_per_iteration": 2.4132394790649414 + }, + { + "auxiliary_loss_clip": 0.01111749, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.02014971, + "balance_loss_mlp": 1.03978753, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.812838696961727, + "language_loss": 0.70522958, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.7266798, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9927, + "time_per_iteration": 4.071920156478882 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.01779175, + "balance_loss_mlp": 1.03788543, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.756068652476383, + "language_loss": 0.63428164, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65571564, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 9928, + "time_per_iteration": 2.616556406021118 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.01774395, + "balance_loss_mlp": 1.0362494, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.5985801618436777, + "language_loss": 0.69484866, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71619892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9929, + "time_per_iteration": 3.929401397705078 + }, + { + "auxiliary_loss_clip": 0.01106506, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.02314126, + "balance_loss_mlp": 1.0390749, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.8723634053132125, + "language_loss": 0.7651577, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78656977, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9930, + "time_per_iteration": 3.9201314449310303 + }, + { + "auxiliary_loss_clip": 0.01114181, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01845503, + "balance_loss_mlp": 1.04086351, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.7493285690141849, + "language_loss": 0.69032001, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71178329, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9931, + "time_per_iteration": 3.879492998123169 + }, + { + "auxiliary_loss_clip": 0.01027027, + "auxiliary_loss_mlp": 0.00997139, + "balance_loss_clip": 0.99597675, + "balance_loss_mlp": 1.00581264, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8633082810339764, + "language_loss": 0.64247859, + "learning_rate": 1.474059168257065e-06, + "loss": 0.6627202, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21289062, + "step": 9932, + "time_per_iteration": 2.985929489135742 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01604247, + "balance_loss_mlp": 1.03876853, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.8784919283093424, + "language_loss": 0.74257267, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76396132, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9933, + "time_per_iteration": 2.4789366722106934 + }, + { + "auxiliary_loss_clip": 0.01027236, + "auxiliary_loss_mlp": 0.00998624, + "balance_loss_clip": 0.997509, + "balance_loss_mlp": 1.00592136, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6667374312128803, + "language_loss": 0.51967168, + "learning_rate": 1.473307699867203e-06, + "loss": 0.53993034, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21289062, + "step": 9934, + "time_per_iteration": 3.181849956512451 + }, + { + "auxiliary_loss_clip": 0.01027661, + "auxiliary_loss_mlp": 0.00997349, + "balance_loss_clip": 0.99616891, + "balance_loss_mlp": 1.00641167, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8444164965298677, + "language_loss": 0.54164159, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56189167, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.21289062, + "step": 9935, + "time_per_iteration": 2.997821807861328 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.02102828, + "balance_loss_mlp": 1.03731823, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5699606989571269, + "language_loss": 0.65541828, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67684245, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9936, + "time_per_iteration": 2.533317804336548 + }, + { + "auxiliary_loss_clip": 0.01110253, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.02026367, + "balance_loss_mlp": 1.03937888, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.0123537966767797, + "language_loss": 0.67731905, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69874215, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9937, + "time_per_iteration": 2.4379465579986572 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.02101064, + "balance_loss_mlp": 1.03899479, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 3.133342754143776, + "language_loss": 0.77174151, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79320574, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9938, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.01110044, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01708603, + "balance_loss_mlp": 1.03813004, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.6192850653818303, + "language_loss": 0.75987661, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78127742, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9939, + "time_per_iteration": 2.477731227874756 + }, + { + "auxiliary_loss_clip": 0.01113496, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01565337, + "balance_loss_mlp": 1.03811717, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.2637964874634124, + "language_loss": 0.6840167, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70545495, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 9940, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.02150035, + "balance_loss_mlp": 1.03630126, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3031499437689418, + "language_loss": 0.70227146, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72364092, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9941, + "time_per_iteration": 2.644956111907959 + }, + { + "auxiliary_loss_clip": 0.01103617, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02042711, + "balance_loss_mlp": 1.0345757, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 2.0310172288776456, + "language_loss": 0.77255404, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79392433, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 9942, + "time_per_iteration": 2.4575772285461426 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.02815676, + "balance_loss_mlp": 1.03664815, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.8744137632140625, + "language_loss": 0.7585178, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78000808, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9943, + "time_per_iteration": 2.4413061141967773 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.03699136, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.7396443017276344, + "language_loss": 0.61821425, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.63956803, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 9944, + "time_per_iteration": 2.569403886795044 + }, + { + "auxiliary_loss_clip": 0.01110079, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0391618, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6935047887113677, + "language_loss": 0.72621685, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74766988, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9945, + "time_per_iteration": 2.5811283588409424 + }, + { + "auxiliary_loss_clip": 0.0110514, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02017426, + "balance_loss_mlp": 1.03536916, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 2.0883326121528443, + "language_loss": 0.67156124, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69294119, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9946, + "time_per_iteration": 2.513643503189087 + }, + { + "auxiliary_loss_clip": 0.01111839, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.02280676, + "balance_loss_mlp": 1.03886974, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.0799446912413386, + "language_loss": 0.88996196, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91144222, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9947, + "time_per_iteration": 2.4069466590881348 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.02501893, + "balance_loss_mlp": 1.03634834, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.664735448435926, + "language_loss": 0.72050726, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74189186, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 9948, + "time_per_iteration": 2.474961280822754 + }, + { + "auxiliary_loss_clip": 0.01107668, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01749587, + "balance_loss_mlp": 1.03676891, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8018456141940389, + "language_loss": 0.89439249, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91578257, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 9949, + "time_per_iteration": 2.455151319503784 + }, + { + "auxiliary_loss_clip": 0.0110613, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.01458669, + "balance_loss_mlp": 1.03746963, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 1.9594093526491967, + "language_loss": 0.70425475, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72557819, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9950, + "time_per_iteration": 2.479177474975586 + }, + { + "auxiliary_loss_clip": 0.01106992, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.01816094, + "balance_loss_mlp": 1.03653646, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.7839667170115563, + "language_loss": 0.78153586, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.8029145, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9951, + "time_per_iteration": 2.4318583011627197 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.02885103, + "balance_loss_mlp": 1.03666139, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 6.7296631151691235, + "language_loss": 0.73816681, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.75967014, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9952, + "time_per_iteration": 2.4669008255004883 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01705003, + "balance_loss_mlp": 1.03699803, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.1100044837404264, + "language_loss": 0.78595901, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8073597, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9953, + "time_per_iteration": 2.432607650756836 + }, + { + "auxiliary_loss_clip": 0.01106295, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.01883411, + "balance_loss_mlp": 1.03698087, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.6558066102502929, + "language_loss": 0.69747621, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71884394, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 9954, + "time_per_iteration": 2.5316383838653564 + }, + { + "auxiliary_loss_clip": 0.01106341, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.01923835, + "balance_loss_mlp": 1.03664923, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.7741106098423227, + "language_loss": 0.73212743, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75350201, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9955, + "time_per_iteration": 2.457697629928589 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.01816237, + "balance_loss_mlp": 1.03694773, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.8276717412391432, + "language_loss": 0.68681955, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70819867, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9956, + "time_per_iteration": 2.5265135765075684 + }, + { + "auxiliary_loss_clip": 0.01109542, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0191431, + "balance_loss_mlp": 1.03873038, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.224432093074028, + "language_loss": 0.73662853, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75803757, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9957, + "time_per_iteration": 2.4384164810180664 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.01433289, + "balance_loss_mlp": 1.03838789, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.875022862600817, + "language_loss": 0.84732842, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86864293, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9958, + "time_per_iteration": 2.501417636871338 + }, + { + "auxiliary_loss_clip": 0.01109112, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02396715, + "balance_loss_mlp": 1.03740525, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 2.024494152709453, + "language_loss": 0.66685295, + "learning_rate": 1.463921122471864e-06, + "loss": 0.6883148, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9959, + "time_per_iteration": 2.471848726272583 + }, + { + "auxiliary_loss_clip": 0.01108718, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.0389334, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6260957561310903, + "language_loss": 0.83360457, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85498953, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9960, + "time_per_iteration": 2.4651761054992676 + }, + { + "auxiliary_loss_clip": 0.01106018, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01716197, + "balance_loss_mlp": 1.03686321, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.466008615140069, + "language_loss": 0.79505813, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81641018, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9961, + "time_per_iteration": 2.475454568862915 + }, + { + "auxiliary_loss_clip": 0.01106184, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.01849759, + "balance_loss_mlp": 1.03730237, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.756927001005791, + "language_loss": 0.67329001, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69465899, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9962, + "time_per_iteration": 2.489084005355835 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02303684, + "balance_loss_mlp": 1.03722596, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.365980621399165, + "language_loss": 0.74311382, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76453781, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9963, + "time_per_iteration": 2.4947874546051025 + }, + { + "auxiliary_loss_clip": 0.01105091, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.02042198, + "balance_loss_mlp": 1.03652799, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 2.111691032145124, + "language_loss": 0.68214118, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70352018, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 9964, + "time_per_iteration": 2.595745086669922 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01635242, + "balance_loss_mlp": 1.03745115, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.9069133835925212, + "language_loss": 0.77044344, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79177749, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9965, + "time_per_iteration": 2.447580337524414 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.01833928, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.8284726106569544, + "language_loss": 0.77189291, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79326117, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9966, + "time_per_iteration": 2.450202226638794 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.01775706, + "balance_loss_mlp": 1.03827262, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.4816211966309663, + "language_loss": 0.73338163, + "learning_rate": 1.460920090376422e-06, + "loss": 0.7547425, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9967, + "time_per_iteration": 2.5361080169677734 + }, + { + "auxiliary_loss_clip": 0.01113043, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.03907526, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 1.98552880835617, + "language_loss": 0.68667233, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70816314, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9968, + "time_per_iteration": 2.4201669692993164 + }, + { + "auxiliary_loss_clip": 0.01107383, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03702521, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.5069000727815525, + "language_loss": 0.79169899, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.8131187, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9969, + "time_per_iteration": 3.9278953075408936 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.01999974, + "balance_loss_mlp": 1.03598189, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 2.0663897132059588, + "language_loss": 0.81023246, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83162344, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9970, + "time_per_iteration": 2.4416465759277344 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.02511787, + "balance_loss_mlp": 1.0377593, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.8664927797599988, + "language_loss": 0.62176776, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64326209, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.71875, + "step": 9971, + "time_per_iteration": 3.8846518993377686 + }, + { + "auxiliary_loss_clip": 0.01102408, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01776624, + "balance_loss_mlp": 1.03571367, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.8563043542024344, + "language_loss": 0.79314888, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81446773, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9972, + "time_per_iteration": 3.901256561279297 + }, + { + "auxiliary_loss_clip": 0.01112588, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.03817391, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.1896098539024176, + "language_loss": 0.76205128, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78352362, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9973, + "time_per_iteration": 3.9424259662628174 + }, + { + "auxiliary_loss_clip": 0.01106987, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.02021337, + "balance_loss_mlp": 1.0362227, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.3034108647788933, + "language_loss": 0.64969486, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67109704, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9974, + "time_per_iteration": 2.4875805377960205 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.02270579, + "balance_loss_mlp": 1.03728855, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4500461001521425, + "language_loss": 0.74434048, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76576418, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9975, + "time_per_iteration": 2.4895670413970947 + }, + { + "auxiliary_loss_clip": 0.01107892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01572204, + "balance_loss_mlp": 1.03760493, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.7933529759094704, + "language_loss": 0.76735765, + "learning_rate": 1.457545493441611e-06, + "loss": 0.78872299, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9976, + "time_per_iteration": 2.5056304931640625 + }, + { + "auxiliary_loss_clip": 0.01107614, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02620029, + "balance_loss_mlp": 1.03780508, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.4460752586196857, + "language_loss": 0.74817264, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76964188, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9977, + "time_per_iteration": 2.496149778366089 + }, + { + "auxiliary_loss_clip": 0.01107436, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01962721, + "balance_loss_mlp": 1.03684258, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.6882301956293941, + "language_loss": 0.68553925, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70693398, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9978, + "time_per_iteration": 2.483567714691162 + }, + { + "auxiliary_loss_clip": 0.01113427, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.02087975, + "balance_loss_mlp": 1.04072738, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.78777966355448, + "language_loss": 0.81153774, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83300972, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9979, + "time_per_iteration": 2.413935899734497 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.02026439, + "balance_loss_mlp": 1.03630424, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.7401896529481804, + "language_loss": 0.6957618, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71711338, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 9980, + "time_per_iteration": 2.4312682151794434 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.01947856, + "balance_loss_mlp": 1.03764093, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 3.8237519537086238, + "language_loss": 0.68642873, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70786041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9981, + "time_per_iteration": 2.4180452823638916 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.02439916, + "balance_loss_mlp": 1.03752363, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 3.017374403618408, + "language_loss": 0.78579712, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80722106, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 9982, + "time_per_iteration": 2.5378241539001465 + }, + { + "auxiliary_loss_clip": 0.01107415, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02182055, + "balance_loss_mlp": 1.03862381, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.4959053225865697, + "language_loss": 0.72973263, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7511524, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9983, + "time_per_iteration": 2.4516336917877197 + }, + { + "auxiliary_loss_clip": 0.01105736, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01893497, + "balance_loss_mlp": 1.03546536, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.0437339372279775, + "language_loss": 0.77803969, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79941273, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9984, + "time_per_iteration": 2.4639358520507812 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 215057825, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.38416130140799e+17, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/training_args.bin b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b9a73eb97a1ef37776f0d97a0590d802e6f8d5a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a0c59c7a64d6e018f6d41a91f3e718772a260e91597586a7ce64cd9f7d3d0c6 +size 7992 diff --git a/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/zero_to_fp32.py b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/zero_to_fp32.py new file mode 100644 index 0000000000000000000000000000000000000000..24cc342e78d1a006c782b3a4cd68d9ce786d8fd8 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/checkpoint-9984/zero_to_fp32.py @@ -0,0 +1,604 @@ +#!/usr/bin/env python + +# Copyright (c) Microsoft Corporation. +# SPDX-License-Identifier: Apache-2.0 + +# DeepSpeed Team + +# This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets +# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in +# the future. Once extracted, the weights don't require DeepSpeed and can be used in any +# application. +# +# example: python zero_to_fp32.py . pytorch_model.bin + +import argparse +import torch +import glob +import math +import os +import re +from collections import OrderedDict +from dataclasses import dataclass + +# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with +# DeepSpeed data structures it has to be available in the current python environment. +from deepspeed.utils import logger +from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS, + FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES, + FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS) + + +@dataclass +class zero_model_state: + buffers: dict() + param_shapes: dict() + shared_params: list + ds_version: int + frozen_param_shapes: dict() + frozen_param_fragments: dict() + + +debug = 0 + +# load to cpu +device = torch.device('cpu') + + +def atoi(text): + return int(text) if text.isdigit() else text + + +def natural_keys(text): + ''' + alist.sort(key=natural_keys) sorts in human order + http://nedbatchelder.com/blog/200712/human_sorting.html + (See Toothy's implementation in the comments) + ''' + return [atoi(c) for c in re.split(r'(\d+)', text)] + + +def get_model_state_file(checkpoint_dir, zero_stage): + if not os.path.isdir(checkpoint_dir): + raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist") + + # there should be only one file + if zero_stage <= 2: + file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt") + elif zero_stage == 3: + file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt") + + if not os.path.exists(file): + raise FileNotFoundError(f"can't find model states file at '{file}'") + + return file + + +def get_checkpoint_files(checkpoint_dir, glob_pattern): + # XXX: need to test that this simple glob rule works for multi-node setup too + ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys) + + if len(ckpt_files) == 0: + raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'") + + return ckpt_files + + +def get_optim_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt") + + +def get_model_state_files(checkpoint_dir): + return get_checkpoint_files(checkpoint_dir, "*_model_states.pt") + + +def parse_model_states(files): + zero_model_states = [] + for file in files: + state_dict = torch.load(file, map_location=device) + + if BUFFER_NAMES not in state_dict: + raise ValueError(f"{file} is not a model state checkpoint") + buffer_names = state_dict[BUFFER_NAMES] + if debug: + print("Found buffers:", buffer_names) + + # recover just the buffers while restoring them to fp32 if they were saved in fp16 + buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names} + param_shapes = state_dict[PARAM_SHAPES] + + # collect parameters that are included in param_shapes + param_names = [] + for s in param_shapes: + for name in s.keys(): + param_names.append(name) + + # update with frozen parameters + frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None) + if frozen_param_shapes is not None: + if debug: + print(f"Found frozen_param_shapes: {frozen_param_shapes}") + param_names += list(frozen_param_shapes.keys()) + + # handle shared params + shared_params = [[k, v] for k, v in state_dict["shared_params"].items()] + + ds_version = state_dict.get(DS_VERSION, None) + + frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None) + + z_model_state = zero_model_state(buffers=buffers, + param_shapes=param_shapes, + shared_params=shared_params, + ds_version=ds_version, + frozen_param_shapes=frozen_param_shapes, + frozen_param_fragments=frozen_param_fragments) + zero_model_states.append(z_model_state) + + return zero_model_states + + +def parse_optim_states(files, ds_checkpoint_dir): + + total_files = len(files) + state_dicts = [] + for f in files: + state_dict = torch.load(f, map_location=device) + # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights + # and also handle the case where it was already removed by another helper script + state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None) + state_dicts.append(state_dict) + + if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]: + raise ValueError(f"{files[0]} is not a zero checkpoint") + zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE] + world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT] + + # For ZeRO-2 each param group can have different partition_count as data parallelism for expert + # parameters can be different from data parallelism for non-expert parameters. So we can just + # use the max of the partition_count to get the dp world_size. + + if type(world_size) is list: + world_size = max(world_size) + + if world_size != total_files: + raise ValueError( + f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. " + "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes." + ) + + # the groups are named differently in each stage + if zero_stage <= 2: + fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS + elif zero_stage == 3: + fp32_groups_key = FP32_FLAT_GROUPS + else: + raise ValueError(f"unknown zero stage {zero_stage}") + + if zero_stage <= 2: + fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))] + elif zero_stage == 3: + # if there is more than one param group, there will be multiple flattened tensors - one + # flattened tensor per group - for simplicity merge them into a single tensor + # + # XXX: could make the script more memory efficient for when there are multiple groups - it + # will require matching the sub-lists of param_shapes for each param group flattened tensor + + fp32_flat_groups = [ + torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts)) + ] + + return zero_stage, world_size, fp32_flat_groups + + +def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters): + """ + Returns fp32 state_dict reconstructed from ds checkpoint + + Args: + - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are) + + """ + print(f"Processing zero checkpoint '{ds_checkpoint_dir}'") + + optim_files = get_optim_files(ds_checkpoint_dir) + zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir) + print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}") + + model_files = get_model_state_files(ds_checkpoint_dir) + + zero_model_states = parse_model_states(model_files) + print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}') + + if zero_stage <= 2: + return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + elif zero_stage == 3: + return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters) + + +def _zero2_merge_frozen_params(state_dict, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + frozen_param_fragments = zero_model_states[0].frozen_param_fragments + + if debug: + num_elem = sum(s.numel() for s in frozen_param_shapes.values()) + print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in frozen_param_fragments.values()]) + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + state_dict[name] = frozen_param_fragments[name] + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _has_callable(obj, fn): + attr = getattr(obj, fn, None) + return callable(attr) + + +def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + + # Reconstruction protocol: + # + # XXX: document this + + if debug: + for i in range(world_size): + for j in range(len(fp32_flat_groups[0])): + print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}") + + # XXX: memory usage doubles here (zero2) + num_param_groups = len(fp32_flat_groups[0]) + merged_single_partition_of_fp32_groups = [] + for i in range(num_param_groups): + merged_partitions = [sd[i] for sd in fp32_flat_groups] + full_single_fp32_vector = torch.cat(merged_partitions, 0) + merged_single_partition_of_fp32_groups.append(full_single_fp32_vector) + avail_numel = sum( + [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups]) + + if debug: + wanted_params = sum([len(shapes) for shapes in param_shapes]) + wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes]) + # not asserting if there is a mismatch due to possible padding + print(f"Have {avail_numel} numels to process.") + print(f"Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + total_numel = 0 + total_params = 0 + for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups): + offset = 0 + avail_numel = full_single_fp32_vector.numel() + for name, shape in shapes.items(): + + unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape) + total_numel += unpartitioned_numel + total_params += 1 + + if debug: + print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ") + state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape) + offset += unpartitioned_numel + + # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and + # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex + # paddings performed in the code it's almost impossible to predict the exact numbers w/o the + # live optimizer object, so we are checking that the numbers are within the right range + align_to = 2 * world_size + + def zero2_align(x): + return align_to * math.ceil(x / align_to) + + if debug: + print(f"original offset={offset}, avail_numel={avail_numel}") + + offset = zero2_align(offset) + avail_numel = zero2_align(avail_numel) + + if debug: + print(f"aligned offset={offset}, avail_numel={avail_numel}") + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero2_merge_frozen_params(state_dict, zero_model_states) + + _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def zero3_partitioned_param_info(unpartitioned_numel, world_size): + remainder = unpartitioned_numel % world_size + padding_numel = (world_size - remainder) if remainder else 0 + partitioned_numel = math.ceil(unpartitioned_numel / world_size) + return partitioned_numel, padding_numel + + +def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states): + if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0: + return + + if debug: + for i in range(world_size): + num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values()) + print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}') + + frozen_param_shapes = zero_model_states[0].frozen_param_shapes + wanted_params = len(frozen_param_shapes) + wanted_numel = sum(s.numel() for s in frozen_param_shapes.values()) + avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size + print(f'Frozen params: Have {avail_numel} numels to process.') + print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params') + + total_params = 0 + total_numel = 0 + for name, shape in zero_model_states[0].frozen_param_shapes.items(): + total_params += 1 + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + + param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states) + state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape) + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements") + + +def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states): + param_shapes = zero_model_states[0].param_shapes + avail_numel = fp32_flat_groups[0].numel() * world_size + # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each + # param, re-consolidating each param, while dealing with padding if any + + # merge list of dicts, preserving order + param_shapes = {k: v for d in param_shapes for k, v in d.items()} + + if debug: + for i in range(world_size): + print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}") + + wanted_params = len(param_shapes) + wanted_numel = sum(shape.numel() for shape in param_shapes.values()) + # not asserting if there is a mismatch due to possible padding + avail_numel = fp32_flat_groups[0].numel() * world_size + print(f"Trainable params: Have {avail_numel} numels to process.") + print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.") + + # params + # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support + # out-of-core computing solution + offset = 0 + total_numel = 0 + total_params = 0 + for name, shape in param_shapes.items(): + + unpartitioned_numel = shape.numel() + total_numel += unpartitioned_numel + total_params += 1 + + partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size) + + if debug: + print( + f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}" + ) + + # XXX: memory usage doubles here + state_dict[name] = torch.cat( + tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)), + 0).narrow(0, 0, unpartitioned_numel).view(shape) + offset += partitioned_numel + + offset *= world_size + + # Sanity check + if offset != avail_numel: + raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong") + + print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements") + + +def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states, + exclude_frozen_parameters): + state_dict = OrderedDict() + + # buffers + buffers = zero_model_states[0].buffers + state_dict.update(buffers) + if debug: + print(f"added {len(buffers)} buffers") + + if not exclude_frozen_parameters: + _zero3_merge_frozen_params(state_dict, world_size, zero_model_states) + + _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states) + + # recover shared parameters + for pair in zero_model_states[0].shared_params: + if pair[1] in state_dict: + state_dict[pair[0]] = state_dict[pair[1]] + + return state_dict + + +def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with + ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example + via a model hub. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + + Returns: + - pytorch ``state_dict`` + + Note: this approach may not work if your application doesn't have sufficient free CPU memory and + you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with + the checkpoint. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint + # do the training and checkpoint saving + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu + model = model.cpu() # move to cpu + model.load_state_dict(state_dict) + # submit to model hub or save the model to share with others + + In this example the ``model`` will no longer be usable in the deepspeed context of the same + application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead. + + """ + if tag is None: + latest_path = os.path.join(checkpoint_dir, 'latest') + if os.path.isfile(latest_path): + with open(latest_path, 'r') as fd: + tag = fd.read().strip() + else: + raise ValueError(f"Unable to find 'latest' file at {latest_path}") + + ds_checkpoint_dir = os.path.join(checkpoint_dir, tag) + + if not os.path.isdir(ds_checkpoint_dir): + raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist") + + return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters) + + +def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None, exclude_frozen_parameters=False): + """ + Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be + loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed. + + Args: + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + - ``exclude_frozen_parameters``: exclude frozen parameters + """ + + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag, exclude_frozen_parameters) + print(f"Saving fp32 state dict to {output_file}") + torch.save(state_dict, output_file) + + +def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None): + """ + 1. Put the provided model to cpu + 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` + 3. Load it into the provided model + + Args: + - ``model``: the model object to update + - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``) + - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14`` + + Returns: + - ``model`: modified model + + Make sure you have plenty of CPU memory available before you call this function. If you don't + have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it + conveniently placed for you in the checkpoint folder. + + A typical usage might be :: + + from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint + model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir) + # submit to model hub or save the model to share with others + + Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context + of the same application. i.e. you will need to re-initialize the deepspeed engine, since + ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it. + + """ + logger.info(f"Extracting fp32 weights") + state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag) + + logger.info(f"Overwriting model with fp32 weights") + model = model.cpu() + model.load_state_dict(state_dict, strict=False) + + return model + + +if __name__ == "__main__": + + parser = argparse.ArgumentParser() + parser.add_argument("checkpoint_dir", + type=str, + help="path to the desired checkpoint folder, e.g., path/checkpoint-12") + parser.add_argument( + "output_file", + type=str, + help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)") + parser.add_argument("-t", + "--tag", + type=str, + default=None, + help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1") + parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters") + parser.add_argument("-d", "--debug", action='store_true', help="enable debug") + args = parser.parse_args() + + debug = args.debug + + convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, + args.output_file, + tag=args.tag, + exclude_frozen_parameters=args.exclude_frozen_parameters) diff --git a/sft/Full_new_smoe_sigmoidgating/config.json b/sft/Full_new_smoe_sigmoidgating/config.json new file mode 100644 index 0000000000000000000000000000000000000000..bec9e28b38d59da156504ed7ad3bbf443af0ccd9 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/config.json @@ -0,0 +1,199 @@ +{ + "_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "architectures": [ + "LlavaPhiForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_phi3.Phi3Config", + "AutoModelForCausalLM": "modeling_phi3.Phi3ForCausalLM" + }, + "balance_loss_coef": 0.01, + "bos_token_id": 1, + "clip_smoe": true, + "diversity_loss_coef": 0.01, + "dropout": false, + "e_loss_coef": 0.001, + "embd_pdrop": 0.0, + "entropy_advance_loss": false, + "eos_token_id": 32000, + "freeze_backbone": false, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 3072, + "hybrid": false, + "image_aspect_ratio": "pad", + "init_weight": true, + "initializer_range": 0.02, + "intermediate_size": 8192, + "is_cosine": false, + "is_norm_weight": false, + "local_rank": 0, + "loss1": "balanceloss", + "loss2": "zloss", + "luna": false, + "max_compete_in_iter": 3, + "max_position_embeddings": 131072, + "mlp_smoe": true, + "mm_hidden_size": 1152, + "mm_patch_merge_type": "flat", + "mm_projector_lr": null, + "mm_projector_type": "moe", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-224", + "model_name_or_path": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft", + "model_type": "llava_phi", + "moe_name": "smoe_sigmoidgating", + "norm_softmax": false, + "normalization": true, + "num_attention_heads": 32, + "num_experts": 4, + "num_hidden_layers": 32, + "num_key_value_heads": 32, + "num_layers": 3, + "num_selected": 2, + "number_of_previous_tokens": 2, + "original_max_position_embeddings": 4096, + "pad_token_id": 32000, + "pretrain_mm_mlp_adapter": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/mm_projector.bin", + "rate_compete": 0.2, + "rate_flip": 0.05, + "resid_pdrop": 0.0, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "long_factor": [ + 1.0800000429153442, + 1.1100000143051147, + 1.1399999856948853, + 1.340000033378601, + 1.5899999141693115, + 1.600000023841858, + 1.6200000047683716, + 2.620000123977661, + 3.2300000190734863, + 3.2300000190734863, + 4.789999961853027, + 7.400000095367432, + 7.700000286102295, + 9.09000015258789, + 12.199999809265137, + 17.670000076293945, + 24.46000099182129, + 28.57000160217285, + 30.420001983642578, + 30.840002059936523, + 32.590003967285156, + 32.93000411987305, + 42.320003509521484, + 44.96000289916992, + 50.340003967285156, + 50.45000457763672, + 57.55000305175781, + 57.93000411987305, + 58.21000289916992, + 60.1400032043457, + 62.61000442504883, + 62.62000274658203, + 62.71000289916992, + 63.1400032043457, + 63.1400032043457, + 63.77000427246094, + 63.93000411987305, + 63.96000289916992, + 63.970001220703125, + 64.02999877929688, + 64.06999969482422, + 64.08000183105469, + 64.12000274658203, + 64.41000366210938, + 64.4800033569336, + 64.51000213623047, + 64.52999877929688, + 64.83999633789062 + ], + "short_factor": [ + 1.0, + 1.0199999809265137, + 1.0299999713897705, + 1.0299999713897705, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0499999523162842, + 1.0699999332427979, + 1.0999999046325684, + 1.1099998950958252, + 1.1599998474121094, + 1.1599998474121094, + 1.1699998378753662, + 1.2899998426437378, + 1.339999794960022, + 1.679999828338623, + 1.7899998426437378, + 1.8199998140335083, + 1.8499997854232788, + 1.8799997568130493, + 1.9099997282028198, + 1.9399996995925903, + 1.9899996519088745, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0199997425079346, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0299997329711914, + 2.0799996852874756, + 2.0899996757507324, + 2.189999580383301, + 2.2199995517730713, + 2.5899994373321533, + 2.729999542236328, + 2.749999523162842, + 2.8399994373321533 + ], + "type": "longrope" + }, + "rope_theta": 10000.0, + "router_loss_coef": 0.01, + "router_theta": 0.1, + "router_z_loss_coef": 0.001, + "scales": [ + 1, + 3 + ], + "sliding_window": 262144, + "sparse_upcycling": true, + "strategy_train": "base", + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "topk_max": 2, + "topk_min": 1, + "torch_dtype": "bfloat16", + "training": true, + "transformers_version": "4.43.0", + "tune_mm_mlp_adapter": false, + "unit_test": true, + "use_cache": true, + "use_mm_proj": true, + "use_old": false, + "version": "phi35", + "vision_tower": "google/siglip-so400m-patch14-224", + "vision_tower_dir": "/cm/archive/namnv78/checkpoints/phi35-siglip224/pft/clip.bin", + "vocab_size": 32064, + "warm_up": 0.05 +} diff --git a/sft/Full_new_smoe_sigmoidgating/generation_config.json b/sft/Full_new_smoe_sigmoidgating/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..dad5c4578f0dc5969b38755d095fc30c368bb54a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/generation_config.json @@ -0,0 +1,12 @@ +{ + "_from_model_config": true, + "bos_token_id": 1, + "do_sample": true, + "eos_token_id": [ + 32007, + 32001, + 32000 + ], + "pad_token_id": 32000, + "transformers_version": "4.43.0" +} diff --git a/sft/Full_new_smoe_sigmoidgating/model-00001-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/model-00001-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..f5b7321e207c9353462ae136d4ede44f47689416 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/model-00001-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7722796541d52a39377cd2531b8fc433ef6a093a1f99987883d2012a4e94dc7 +size 4972489328 diff --git a/sft/Full_new_smoe_sigmoidgating/model-00002-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/model-00002-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..bdaa4fc2ab26723da186e277a0048acfef73ad4b --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/model-00002-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:616b9978b385c3409b052f6f7f74f22fa04dce5c37b3688f820876c5b2c4f4ba +size 4985529648 diff --git a/sft/Full_new_smoe_sigmoidgating/model-00003-of-00003.safetensors b/sft/Full_new_smoe_sigmoidgating/model-00003-of-00003.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..c65459b5ca40d134a874af93f90b967622398fc9 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/model-00003-of-00003.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921925205fa91bb7815f7d11a90db2d2a7b7e9c4e710f02241163cf3cb7e133d +size 248943552 diff --git a/sft/Full_new_smoe_sigmoidgating/model.safetensors.index.json b/sft/Full_new_smoe_sigmoidgating/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..aa54419fc0a3eab502aa7c4ad974dca52ed10803 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/model.safetensors.index.json @@ -0,0 +1,1005 @@ +{ + "metadata": { + "total_size": 10206819456 + }, + "weight_map": { + "lm_head.weight": "model-00003-of-00003.safetensors", + "model.embed_tokens.weight": "model-00001-of-00003.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.0.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.1.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.10.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.11.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.12.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.13.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.14.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.15.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.16.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.17.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.18.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.19.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.2.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.20.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.24.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.25.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.26.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.27.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.28.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.29.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.3.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.30.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.30.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.input_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.mlp.gate_up_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.31.self_attn.qkv_proj.weight": "model-00002-of-00003.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.4.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.5.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.6.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.7.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.8.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.mlp.gate_up_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00003.safetensors", + "model.layers.9.self_attn.qkv_proj.weight": "model-00001-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.0.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.1.2.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.bias": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.0.weight": "model-00002-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.2.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.0.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.bias": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.experts.3.2.weight": "model-00003-of-00003.safetensors", + "model.mm_projector.moelayer.gate.weight": "model-00003-of-00003.safetensors", + "model.norm.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.patch_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.embeddings.position_embedding.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.0.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.1.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.10.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.11.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.12.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.13.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.14.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.15.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.16.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.17.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.18.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.19.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.2.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.20.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.21.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.22.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.23.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.24.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.25.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.26.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.3.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.4.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.5.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.6.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.7.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.8.self_attn.v_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.layer_norm2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.0.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.1.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.2.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc1.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.experts.3.fc2.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.moelayer.gate.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.k_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.out_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.q_proj.weight": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.bias": "model-00002-of-00003.safetensors", + "model.vision_tower.vision_model.encoder.layers.9.self_attn.v_proj.weight": "model-00002-of-00003.safetensors" + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/special_tokens_map.json b/sft/Full_new_smoe_sigmoidgating/special_tokens_map.json new file mode 100644 index 0000000000000000000000000000000000000000..3e4d5a5bc1cb51753cc9ae0305ece0da60052b10 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/special_tokens_map.json @@ -0,0 +1,24 @@ +{ + "bos_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "pad_token": "", + "unk_token": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + } +} diff --git a/sft/Full_new_smoe_sigmoidgating/tokenizer.model b/sft/Full_new_smoe_sigmoidgating/tokenizer.model new file mode 100644 index 0000000000000000000000000000000000000000..6c00c742ce03c627d6cd5b795984876fa49fa899 --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/tokenizer.model @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347 +size 499723 diff --git a/sft/Full_new_smoe_sigmoidgating/tokenizer_config.json b/sft/Full_new_smoe_sigmoidgating/tokenizer_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d579bb0b91b24b214ea3c2e487e27a65017cdc4a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/tokenizer_config.json @@ -0,0 +1,132 @@ +{ + "add_bos_token": false, + "add_eos_token": false, + "add_prefix_space": true, + "added_tokens_decoder": { + "0": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "1": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "2": { + "content": "", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": false + }, + "32000": { + "content": "<|endoftext|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "32001": { + "content": "<|assistant|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32002": { + "content": "<|placeholder1|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32003": { + "content": "<|placeholder2|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32004": { + "content": "<|placeholder3|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32005": { + "content": "<|placeholder4|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32006": { + "content": "<|system|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32007": { + "content": "<|end|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32008": { + "content": "<|placeholder5|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32009": { + "content": "<|placeholder6|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + }, + "32010": { + "content": "<|user|>", + "lstrip": false, + "normalized": false, + "rstrip": true, + "single_word": false, + "special": true + } + }, + "bos_token": "", + "chat_template": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}", + "clean_up_tokenization_spaces": false, + "eos_token": "<|endoftext|>", + "legacy": false, + "model_max_length": 2048, + "pad_token": "", + "padding_side": "right", + "sp_model_kwargs": {}, + "spaces_between_special_tokens": false, + "tokenizer_class": "LlamaTokenizer", + "unk_token": "", + "use_default_system_prompt": false +} diff --git a/sft/Full_new_smoe_sigmoidgating/trainer_state.json b/sft/Full_new_smoe_sigmoidgating/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..97ddca848d7f4d25cc57d2b116ff33fa49e3258c --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/trainer_state.json @@ -0,0 +1,282787 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.999969938373666, + "eval_steps": 500, + "global_step": 16632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "auxiliary_loss_clip": 0.05015663, + "auxiliary_loss_mlp": 0.02215404, + "balance_loss_clip": 1.76946592, + "balance_loss_mlp": 2.42247009, + "epoch": 6.012325266796934e-05, + "flos": 24456507091200.0, + "grad_norm": 54.31846269900138, + "language_loss": 2.84849024, + "learning_rate": 0.0, + "loss": 1.94356799, + "num_input_tokens_seen": 19155, + "router_z_loss_clip": 4.4375, + "router_z_loss_mlp": 26.0, + "step": 1, + "time_per_iteration": 14.062297821044922 + }, + { + "auxiliary_loss_clip": 0.03371575, + "auxiliary_loss_mlp": 0.01459085, + "balance_loss_clip": 1.18919563, + "balance_loss_mlp": 1.61943495, + "epoch": 0.00012024650533593868, + "flos": 20225931246720.0, + "grad_norm": 34.71678092445231, + "language_loss": 1.82690942, + "learning_rate": 4.4628432569317594e-07, + "loss": 1.87521601, + "num_input_tokens_seen": 36175, + "router_z_loss_clip": 2.703125, + "router_z_loss_mlp": 17.5, + "step": 2, + "time_per_iteration": 2.4504079818725586 + }, + { + "auxiliary_loss_clip": 0.03311525, + "auxiliary_loss_mlp": 0.014397, + "balance_loss_clip": 1.18697679, + "balance_loss_mlp": 1.61685562, + "epoch": 0.000180369758003908, + "flos": 22309935454080.0, + "grad_norm": 34.59102075188436, + "language_loss": 1.57529902, + "learning_rate": 7.073439208833112e-07, + "loss": 1.62281132, + "num_input_tokens_seen": 54870, + "router_z_loss_clip": 2.53125, + "router_z_loss_mlp": 17.0, + "step": 3, + "time_per_iteration": 2.4145541191101074 + }, + { + "auxiliary_loss_clip": 0.03353861, + "auxiliary_loss_mlp": 0.01449549, + "balance_loss_clip": 1.15390992, + "balance_loss_mlp": 1.61571431, + "epoch": 0.00024049301067187735, + "flos": 22414650577920.0, + "grad_norm": 51.728740512395206, + "language_loss": 1.67595887, + "learning_rate": 8.925686513863519e-07, + "loss": 1.72399294, + "num_input_tokens_seen": 74575, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.375, + "step": 4, + "time_per_iteration": 2.466392993927002 + }, + { + "auxiliary_loss_clip": 0.03393634, + "auxiliary_loss_mlp": 0.01505687, + "balance_loss_clip": 1.21710527, + "balance_loss_mlp": 1.61638641, + "epoch": 0.0003006162633398467, + "flos": 21396978449280.0, + "grad_norm": 56.74196654651921, + "language_loss": 1.90851176, + "learning_rate": 1.0362401141348472e-06, + "loss": 1.95750499, + "num_input_tokens_seen": 92580, + "router_z_loss_clip": 2.890625, + "router_z_loss_mlp": 17.75, + "step": 5, + "time_per_iteration": 2.6828246116638184 + }, + { + "auxiliary_loss_clip": 0.03361898, + "auxiliary_loss_mlp": 0.01518906, + "balance_loss_clip": 1.22441149, + "balance_loss_mlp": 1.60614848, + "epoch": 0.000360739516007816, + "flos": 21652375127040.0, + "grad_norm": 33.32400799743486, + "language_loss": 1.6094954, + "learning_rate": 1.153628246576487e-06, + "loss": 1.6583035, + "num_input_tokens_seen": 109705, + "router_z_loss_clip": 2.953125, + "router_z_loss_mlp": 17.5, + "step": 6, + "time_per_iteration": 2.660855770111084 + }, + { + "auxiliary_loss_clip": 0.03345758, + "auxiliary_loss_mlp": 0.01485904, + "balance_loss_clip": 1.20209074, + "balance_loss_mlp": 1.60783124, + "epoch": 0.0004208627686757854, + "flos": 27159742897920.0, + "grad_norm": 26.76365346454933, + "language_loss": 1.53346825, + "learning_rate": 1.2528784983718962e-06, + "loss": 1.58178496, + "num_input_tokens_seen": 129425, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.375, + "step": 7, + "time_per_iteration": 2.718822956085205 + }, + { + "auxiliary_loss_clip": 0.03312894, + "auxiliary_loss_mlp": 0.01444018, + "balance_loss_clip": 1.16630852, + "balance_loss_mlp": 1.60320723, + "epoch": 0.0004809860213437547, + "flos": 31319096135040.0, + "grad_norm": 31.923588970831496, + "language_loss": 1.43687642, + "learning_rate": 1.338852977079528e-06, + "loss": 1.48444545, + "num_input_tokens_seen": 149210, + "router_z_loss_clip": 2.78125, + "router_z_loss_mlp": 17.0, + "step": 8, + "time_per_iteration": 2.779961109161377 + }, + { + "auxiliary_loss_clip": 0.03360351, + "auxiliary_loss_mlp": 0.01496215, + "balance_loss_clip": 1.21144783, + "balance_loss_mlp": 1.60258842, + "epoch": 0.000541109274011724, + "flos": 32160411463680.0, + "grad_norm": 28.084887526361417, + "language_loss": 1.49955618, + "learning_rate": 1.4146878417666224e-06, + "loss": 1.54812181, + "num_input_tokens_seen": 169055, + "router_z_loss_clip": 2.84375, + "router_z_loss_mlp": 17.5, + "step": 9, + "time_per_iteration": 2.799635887145996 + }, + { + "auxiliary_loss_clip": 0.03302188, + "auxiliary_loss_mlp": 0.01477479, + "balance_loss_clip": 1.20797062, + "balance_loss_mlp": 1.6070832, + "epoch": 0.0006012325266796934, + "flos": 18916808163840.0, + "grad_norm": 23.45187310710616, + "language_loss": 1.44727731, + "learning_rate": 1.4825244398280232e-06, + "loss": 1.49507403, + "num_input_tokens_seen": 188045, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 17.0, + "step": 10, + "time_per_iteration": 2.6989152431488037 + }, + { + "auxiliary_loss_clip": 0.03356835, + "auxiliary_loss_mlp": 0.01493566, + "balance_loss_clip": 1.21928966, + "balance_loss_mlp": 1.61121845, + "epoch": 0.0006613557793476627, + "flos": 20774861867520.0, + "grad_norm": 18.63867113279811, + "language_loss": 1.45021069, + "learning_rate": 1.5438901072051983e-06, + "loss": 1.4987148, + "num_input_tokens_seen": 207035, + "router_z_loss_clip": 2.75, + "router_z_loss_mlp": 17.5, + "step": 11, + "time_per_iteration": 2.6820693016052246 + }, + { + "auxiliary_loss_clip": 0.0328584, + "auxiliary_loss_mlp": 0.01449969, + "balance_loss_clip": 1.17378449, + "balance_loss_mlp": 1.59900761, + "epoch": 0.000721479032015632, + "flos": 16581680997120.0, + "grad_norm": 16.861449854609447, + "language_loss": 1.45122719, + "learning_rate": 1.5999125722696629e-06, + "loss": 1.49858522, + "num_input_tokens_seen": 223225, + "router_z_loss_clip": 2.765625, + "router_z_loss_mlp": 16.875, + "step": 12, + "time_per_iteration": 2.631218910217285 + }, + { + "auxiliary_loss_clip": 0.03313605, + "auxiliary_loss_mlp": 0.01404342, + "balance_loss_clip": 1.14589679, + "balance_loss_mlp": 1.60898232, + "epoch": 0.0007816022846836014, + "flos": 23805471144960.0, + "grad_norm": 11.176593153687291, + "language_loss": 1.24100113, + "learning_rate": 1.6514482443788434e-06, + "loss": 1.28818083, + "num_input_tokens_seen": 242570, + "router_z_loss_clip": 2.578125, + "router_z_loss_mlp": 17.125, + "step": 13, + "time_per_iteration": 2.6961779594421387 + }, + { + "auxiliary_loss_clip": 0.03282163, + "auxiliary_loss_mlp": 0.01472629, + "balance_loss_clip": 1.20464635, + "balance_loss_mlp": 1.60534358, + "epoch": 0.0008417255373515708, + "flos": 19172204841600.0, + "grad_norm": 5.7580183597057975, + "language_loss": 1.20611417, + "learning_rate": 1.6991628240650723e-06, + "loss": 1.25366211, + "num_input_tokens_seen": 261215, + "router_z_loss_clip": 2.6875, + "router_z_loss_mlp": 16.75, + "step": 14, + "time_per_iteration": 2.6555092334747314 + }, + { + "auxiliary_loss_clip": 0.0326835, + "auxiliary_loss_mlp": 0.01431945, + "balance_loss_clip": 1.16815877, + "balance_loss_mlp": 1.6104542, + "epoch": 0.00090184879001954, + "flos": 26395564026240.0, + "grad_norm": 6.4839782289009085, + "language_loss": 1.12832427, + "learning_rate": 1.7435840350181584e-06, + "loss": 1.1753273, + "num_input_tokens_seen": 280035, + "router_z_loss_clip": 2.625, + "router_z_loss_mlp": 16.5, + "step": 15, + "time_per_iteration": 2.717512607574463 + }, + { + "auxiliary_loss_clip": 0.03231722, + "auxiliary_loss_mlp": 0.01412441, + "balance_loss_clip": 1.16257811, + "balance_loss_mlp": 1.59521294, + "epoch": 0.0009619720426875094, + "flos": 24679500785280.0, + "grad_norm": 4.584872954405151, + "language_loss": 1.1119349, + "learning_rate": 1.7851373027727038e-06, + "loss": 1.15837646, + "num_input_tokens_seen": 300265, + "router_z_loss_clip": 2.5, + "router_z_loss_mlp": 16.375, + "step": 16, + "time_per_iteration": 2.7170701026916504 + }, + { + "auxiliary_loss_clip": 0.03220058, + "auxiliary_loss_mlp": 0.0141779, + "balance_loss_clip": 1.17784595, + "balance_loss_mlp": 1.60289145, + "epoch": 0.0010220952953554788, + "flos": 18624531196800.0, + "grad_norm": 5.285773165398426, + "language_loss": 1.1253047, + "learning_rate": 1.8241705979033208e-06, + "loss": 1.17168307, + "num_input_tokens_seen": 317375, + "router_z_loss_clip": 2.40625, + "router_z_loss_mlp": 16.125, + "step": 17, + "time_per_iteration": 2.6125564575195312 + }, + { + "auxiliary_loss_clip": 0.0315575, + "auxiliary_loss_mlp": 0.01378857, + "balance_loss_clip": 1.14730477, + "balance_loss_mlp": 1.60051179, + "epoch": 0.001082218548023448, + "flos": 26142537646080.0, + "grad_norm": 3.8094646515897193, + "language_loss": 1.08149433, + "learning_rate": 1.860972167459798e-06, + "loss": 1.12684035, + "num_input_tokens_seen": 337975, + "router_z_loss_clip": 2.3125, + "router_z_loss_mlp": 15.5625, + "step": 18, + "time_per_iteration": 5.593315362930298 + }, + { + "auxiliary_loss_clip": 0.03181327, + "auxiliary_loss_mlp": 0.01400224, + "balance_loss_clip": 1.13548398, + "balance_loss_mlp": 1.59901524, + "epoch": 0.0011423418006914173, + "flos": 19609776322560.0, + "grad_norm": 4.551402579460018, + "language_loss": 1.02296436, + "learning_rate": 1.89578346593066e-06, + "loss": 1.06877995, + "num_input_tokens_seen": 356635, + "router_z_loss_clip": 2.65625, + "router_z_loss_mlp": 15.8125, + "step": 19, + "time_per_iteration": 2.6462903022766113 + }, + { + "auxiliary_loss_clip": 0.0312444, + "auxiliary_loss_mlp": 0.01341166, + "balance_loss_clip": 1.12096262, + "balance_loss_mlp": 1.60122275, + "epoch": 0.0012024650533593868, + "flos": 17895365107200.0, + "grad_norm": 4.049985155187145, + "language_loss": 1.16660511, + "learning_rate": 1.928808765521199e-06, + "loss": 1.21126115, + "num_input_tokens_seen": 375625, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 15.25, + "step": 20, + "time_per_iteration": 2.687962293624878 + }, + { + "auxiliary_loss_clip": 0.03111088, + "auxiliary_loss_mlp": 0.01380381, + "balance_loss_clip": 1.13109064, + "balance_loss_mlp": 1.58184814, + "epoch": 0.001262588306027356, + "flos": 21252043071360.0, + "grad_norm": 8.855966691950416, + "language_loss": 1.06044388, + "learning_rate": 1.9602224192552076e-06, + "loss": 1.1053586, + "num_input_tokens_seen": 394350, + "router_z_loss_clip": 2.484375, + "router_z_loss_mlp": 15.3125, + "step": 21, + "time_per_iteration": 2.705784320831299 + }, + { + "auxiliary_loss_clip": 0.03006166, + "auxiliary_loss_mlp": 0.0138104, + "balance_loss_clip": 1.14758062, + "balance_loss_mlp": 1.56386232, + "epoch": 0.0013227115586953253, + "flos": 26104077158400.0, + "grad_norm": 3.503731577984969, + "language_loss": 1.05752254, + "learning_rate": 1.9901744328983746e-06, + "loss": 1.10139465, + "num_input_tokens_seen": 413255, + "router_z_loss_clip": 2.34375, + "router_z_loss_mlp": 14.4375, + "step": 22, + "time_per_iteration": 2.714902400970459 + }, + { + "auxiliary_loss_clip": 0.02958535, + "auxiliary_loss_mlp": 0.01337723, + "balance_loss_clip": 1.12743819, + "balance_loss_mlp": 1.56545472, + "epoch": 0.0013828348113632948, + "flos": 23951376190080.0, + "grad_norm": 2.8887485842740657, + "language_loss": 0.91820848, + "learning_rate": 2.018794797290208e-06, + "loss": 0.96117103, + "num_input_tokens_seen": 433065, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 13.9375, + "step": 23, + "time_per_iteration": 2.6802501678466797 + }, + { + "auxiliary_loss_clip": 0.02925568, + "auxiliary_loss_mlp": 0.0136327, + "balance_loss_clip": 1.14306688, + "balance_loss_mlp": 1.55789983, + "epoch": 0.001442958064031264, + "flos": 15959851724160.0, + "grad_norm": 2.888412626700388, + "language_loss": 1.08090949, + "learning_rate": 2.046196897962839e-06, + "loss": 1.12379789, + "num_input_tokens_seen": 451175, + "router_z_loss_clip": 2.203125, + "router_z_loss_mlp": 13.6875, + "step": 24, + "time_per_iteration": 2.6134862899780273 + }, + { + "auxiliary_loss_clip": 0.02818042, + "auxiliary_loss_mlp": 0.01329399, + "balance_loss_clip": 1.11892343, + "balance_loss_mlp": 1.55278993, + "epoch": 0.0015030813166992333, + "flos": 18108350801280.0, + "grad_norm": 3.5526652768314877, + "language_loss": 1.01197755, + "learning_rate": 2.0724802282696944e-06, + "loss": 1.05345201, + "num_input_tokens_seen": 468775, + "router_z_loss_clip": 2.109375, + "router_z_loss_mlp": 12.6875, + "step": 25, + "time_per_iteration": 2.6801955699920654 + }, + { + "auxiliary_loss_clip": 0.02811065, + "auxiliary_loss_mlp": 0.01310914, + "balance_loss_clip": 1.10196424, + "balance_loss_mlp": 1.55557573, + "epoch": 0.0015632045693672028, + "flos": 22234558763520.0, + "grad_norm": 2.8866965715457127, + "language_loss": 1.0650332, + "learning_rate": 2.0977325700720194e-06, + "loss": 1.10625291, + "num_input_tokens_seen": 488530, + "router_z_loss_clip": 2.09375, + "router_z_loss_mlp": 12.5625, + "step": 26, + "time_per_iteration": 2.6561954021453857 + }, + { + "auxiliary_loss_clip": 0.02754337, + "auxiliary_loss_mlp": 0.01325989, + "balance_loss_clip": 1.12600398, + "balance_loss_mlp": 1.54593086, + "epoch": 0.001623327822035172, + "flos": 23991955580160.0, + "grad_norm": 8.480879524297928, + "language_loss": 0.95465469, + "learning_rate": 2.122031762649933e-06, + "loss": 0.99545801, + "num_input_tokens_seen": 510495, + "router_z_loss_clip": 2.0, + "router_z_loss_mlp": 12.0625, + "step": 27, + "time_per_iteration": 2.717332363128662 + }, + { + "auxiliary_loss_clip": 0.02732017, + "auxiliary_loss_mlp": 0.0131313, + "balance_loss_clip": 1.13174081, + "balance_loss_mlp": 1.55085063, + "epoch": 0.0016834510747031415, + "flos": 19677647070720.0, + "grad_norm": 2.7582152185230338, + "language_loss": 1.06276608, + "learning_rate": 2.1454471497582483e-06, + "loss": 1.1032176, + "num_input_tokens_seen": 528605, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 11.8125, + "step": 28, + "time_per_iteration": 2.6645846366882324 + }, + { + "auxiliary_loss_clip": 0.02698877, + "auxiliary_loss_mlp": 0.01319704, + "balance_loss_clip": 1.1339283, + "balance_loss_mlp": 1.5357703, + "epoch": 0.0017435743273711108, + "flos": 20923819568640.0, + "grad_norm": 2.703793609192777, + "language_loss": 1.02653611, + "learning_rate": 2.1680407726407727e-06, + "loss": 1.06672192, + "num_input_tokens_seen": 548515, + "router_z_loss_clip": 1.859375, + "router_z_loss_mlp": 11.625, + "step": 29, + "time_per_iteration": 2.6647088527679443 + }, + { + "auxiliary_loss_clip": 0.02692806, + "auxiliary_loss_mlp": 0.01313595, + "balance_loss_clip": 1.12667465, + "balance_loss_mlp": 1.53252506, + "epoch": 0.00180369758003908, + "flos": 19528976678400.0, + "grad_norm": 3.824163422844594, + "language_loss": 1.1929419, + "learning_rate": 2.189868360711334e-06, + "loss": 1.233006, + "num_input_tokens_seen": 564025, + "router_z_loss_clip": 1.875, + "router_z_loss_mlp": 11.625, + "step": 30, + "time_per_iteration": 2.6305816173553467 + }, + { + "auxiliary_loss_clip": 0.02610821, + "auxiliary_loss_mlp": 0.01338782, + "balance_loss_clip": 1.15748882, + "balance_loss_mlp": 1.51829374, + "epoch": 0.0018638208327070496, + "flos": 27453169100160.0, + "grad_norm": 4.55861683808779, + "language_loss": 1.02499342, + "learning_rate": 2.2109801597326265e-06, + "loss": 1.06448936, + "num_input_tokens_seen": 583345, + "router_z_loss_clip": 1.8125, + "router_z_loss_mlp": 10.9375, + "step": 31, + "time_per_iteration": 2.71045184135437 + }, + { + "auxiliary_loss_clip": 0.02583705, + "auxiliary_loss_mlp": 0.01332414, + "balance_loss_clip": 1.15245557, + "balance_loss_mlp": 1.52035046, + "epoch": 0.0019239440853750188, + "flos": 13589460380160.0, + "grad_norm": 2.526137445187824, + "language_loss": 0.95697796, + "learning_rate": 2.2314216284658796e-06, + "loss": 0.99613917, + "num_input_tokens_seen": 600010, + "router_z_loss_clip": 1.796875, + "router_z_loss_mlp": 10.625, + "step": 32, + "time_per_iteration": 2.626783847808838 + }, + { + "auxiliary_loss_clip": 0.02566919, + "auxiliary_loss_mlp": 0.01304168, + "balance_loss_clip": 1.13670313, + "balance_loss_mlp": 1.51655078, + "epoch": 0.001984067338042988, + "flos": 11253866336640.0, + "grad_norm": 3.344933729659458, + "language_loss": 0.95465255, + "learning_rate": 2.2512340280885094e-06, + "loss": 0.99336338, + "num_input_tokens_seen": 616295, + "router_z_loss_clip": 1.671875, + "router_z_loss_mlp": 10.5, + "step": 33, + "time_per_iteration": 2.645725727081299 + }, + { + "auxiliary_loss_clip": 0.02433039, + "auxiliary_loss_mlp": 0.013041, + "balance_loss_clip": 1.14569449, + "balance_loss_mlp": 1.48877192, + "epoch": 0.0020441905907109576, + "flos": 22386245898240.0, + "grad_norm": 4.808068329548225, + "language_loss": 0.91556877, + "learning_rate": 2.270454923596497e-06, + "loss": 0.95294011, + "num_input_tokens_seen": 637640, + "router_z_loss_clip": 1.5859375, + "router_z_loss_mlp": 9.4375, + "step": 34, + "time_per_iteration": 2.7327146530151367 + }, + { + "auxiliary_loss_clip": 0.02385913, + "auxiliary_loss_mlp": 0.0127366, + "balance_loss_clip": 1.1172576, + "balance_loss_mlp": 1.45172572, + "epoch": 0.0021043138433789266, + "flos": 49778580337920.0, + "grad_norm": 2.948252640490764, + "language_loss": 0.76639408, + "learning_rate": 2.2891186125067434e-06, + "loss": 0.80298984, + "num_input_tokens_seen": 659710, + "router_z_loss_clip": 1.5625, + "router_z_loss_mlp": 9.375, + "step": 35, + "time_per_iteration": 2.940739870071411 + }, + { + "auxiliary_loss_clip": 0.02360979, + "auxiliary_loss_mlp": 0.0127456, + "balance_loss_clip": 1.12769413, + "balance_loss_mlp": 1.46427846, + "epoch": 0.002164437096046896, + "flos": 20557961591040.0, + "grad_norm": 2.1659182072135064, + "language_loss": 0.89043307, + "learning_rate": 2.307256493152974e-06, + "loss": 0.92678845, + "num_input_tokens_seen": 679670, + "router_z_loss_clip": 1.46875, + "router_z_loss_mlp": 8.9375, + "step": 36, + "time_per_iteration": 2.693335771560669 + }, + { + "auxiliary_loss_clip": 0.02305413, + "auxiliary_loss_mlp": 0.01335093, + "balance_loss_clip": 1.18574798, + "balance_loss_mlp": 1.45221901, + "epoch": 0.0022245603487148656, + "flos": 26542295084160.0, + "grad_norm": 3.3248653771669416, + "language_loss": 0.93231332, + "learning_rate": 2.3248973825097614e-06, + "loss": 0.96871841, + "num_input_tokens_seen": 700170, + "router_z_loss_clip": 1.4921875, + "router_z_loss_mlp": 8.5, + "step": 37, + "time_per_iteration": 2.70194673538208 + }, + { + "auxiliary_loss_clip": 0.02264412, + "auxiliary_loss_mlp": 0.01277806, + "balance_loss_clip": 1.15373349, + "balance_loss_mlp": 1.44697845, + "epoch": 0.0022846836013828346, + "flos": 20338188226560.0, + "grad_norm": 2.1191864106647906, + "language_loss": 1.04275775, + "learning_rate": 2.3420677916238357e-06, + "loss": 1.07817996, + "num_input_tokens_seen": 718545, + "router_z_loss_clip": 1.2421875, + "router_z_loss_mlp": 8.1875, + "step": 38, + "time_per_iteration": 2.674187183380127 + }, + { + "auxiliary_loss_clip": 0.02234117, + "auxiliary_loss_mlp": 0.01257339, + "balance_loss_clip": 1.13164425, + "balance_loss_mlp": 1.44101977, + "epoch": 0.002344806854050804, + "flos": 26247575992320.0, + "grad_norm": 2.2707505194681685, + "language_loss": 0.85635245, + "learning_rate": 2.358792165262154e-06, + "loss": 0.891267, + "num_input_tokens_seen": 739865, + "router_z_loss_clip": 1.2578125, + "router_z_loss_mlp": 7.9375, + "step": 39, + "time_per_iteration": 2.716417074203491 + }, + { + "auxiliary_loss_clip": 0.02209554, + "auxiliary_loss_mlp": 0.01248677, + "balance_loss_clip": 1.1173557, + "balance_loss_mlp": 1.43176007, + "epoch": 0.0024049301067187736, + "flos": 11801539981440.0, + "grad_norm": 2.874633531970748, + "language_loss": 0.90416026, + "learning_rate": 2.3750930912143747e-06, + "loss": 0.93874258, + "num_input_tokens_seen": 755770, + "router_z_loss_clip": 1.3125, + "router_z_loss_mlp": 7.78125, + "step": 40, + "time_per_iteration": 2.621108055114746 + }, + { + "auxiliary_loss_clip": 0.02158681, + "auxiliary_loss_mlp": 0.01271709, + "balance_loss_clip": 1.15626693, + "balance_loss_mlp": 1.42207694, + "epoch": 0.0024650533593867426, + "flos": 20631506688000.0, + "grad_norm": 3.842521317695652, + "language_loss": 0.93497038, + "learning_rate": 2.3909914837471044e-06, + "loss": 0.96927428, + "num_input_tokens_seen": 773440, + "router_z_loss_clip": 1.15625, + "router_z_loss_mlp": 7.375, + "step": 41, + "time_per_iteration": 2.66089129447937 + }, + { + "auxiliary_loss_clip": 0.0212207, + "auxiliary_loss_mlp": 0.0125263, + "balance_loss_clip": 1.14720106, + "balance_loss_mlp": 1.41368401, + "epoch": 0.002525176612054712, + "flos": 18406122549120.0, + "grad_norm": 4.5963223670672635, + "language_loss": 0.97454929, + "learning_rate": 2.4065067449483835e-06, + "loss": 1.00829637, + "num_input_tokens_seen": 790455, + "router_z_loss_clip": 1.0546875, + "router_z_loss_mlp": 7.09375, + "step": 42, + "time_per_iteration": 2.63149094581604 + }, + { + "auxiliary_loss_clip": 0.02082851, + "auxiliary_loss_mlp": 0.01298258, + "balance_loss_clip": 1.18939614, + "balance_loss_mlp": 1.41430426, + "epoch": 0.0025852998647226816, + "flos": 28184023128960.0, + "grad_norm": 2.9545418034556814, + "language_loss": 0.97656071, + "learning_rate": 2.4216569070848724e-06, + "loss": 1.01037169, + "num_input_tokens_seen": 810645, + "router_z_loss_clip": 1.09375, + "router_z_loss_mlp": 6.6875, + "step": 43, + "time_per_iteration": 2.7244436740875244 + }, + { + "auxiliary_loss_clip": 0.02102024, + "auxiliary_loss_mlp": 0.01311792, + "balance_loss_clip": 1.19706488, + "balance_loss_mlp": 1.4130851, + "epoch": 0.0026454231173906506, + "flos": 14283110897280.0, + "grad_norm": 2.0531245010632473, + "language_loss": 0.93701768, + "learning_rate": 2.4364587585915504e-06, + "loss": 0.97115582, + "num_input_tokens_seen": 827470, + "router_z_loss_clip": 1.1484375, + "router_z_loss_mlp": 6.875, + "step": 44, + "time_per_iteration": 2.6628317832946777 + }, + { + "auxiliary_loss_clip": 0.02065563, + "auxiliary_loss_mlp": 0.01272457, + "balance_loss_clip": 1.17236853, + "balance_loss_mlp": 1.41084957, + "epoch": 0.00270554637005862, + "flos": 22419211605120.0, + "grad_norm": 9.3374631511207, + "language_loss": 0.98937047, + "learning_rate": 2.450927955901469e-06, + "loss": 1.02275062, + "num_input_tokens_seen": 847285, + "router_z_loss_clip": 1.0, + "router_z_loss_mlp": 6.5625, + "step": 45, + "time_per_iteration": 2.7355775833129883 + }, + { + "auxiliary_loss_clip": 0.02040064, + "auxiliary_loss_mlp": 0.01227769, + "balance_loss_clip": 1.13831401, + "balance_loss_mlp": 1.39673805, + "epoch": 0.0027656696227265896, + "flos": 23985778440960.0, + "grad_norm": 1.8055823424878037, + "language_loss": 1.02792716, + "learning_rate": 2.465079122983384e-06, + "loss": 1.06060553, + "num_input_tokens_seen": 867545, + "router_z_loss_clip": 0.8984375, + "router_z_loss_mlp": 6.4375, + "step": 46, + "time_per_iteration": 2.7488839626312256 + }, + { + "auxiliary_loss_clip": 0.02002379, + "auxiliary_loss_mlp": 0.01270193, + "balance_loss_clip": 1.17773402, + "balance_loss_mlp": 1.38648152, + "epoch": 0.0028257928753945586, + "flos": 37669503087360.0, + "grad_norm": 2.971366079361506, + "language_loss": 0.88043427, + "learning_rate": 2.4789259401737868e-06, + "loss": 0.91315997, + "num_input_tokens_seen": 889915, + "router_z_loss_clip": 0.921875, + "router_z_loss_mlp": 6.15625, + "step": 47, + "time_per_iteration": 2.845005512237549 + }, + { + "auxiliary_loss_clip": 0.01963914, + "auxiliary_loss_mlp": 0.01252908, + "balance_loss_clip": 1.16493094, + "balance_loss_mlp": 1.37624073, + "epoch": 0.002885916128062528, + "flos": 22454547609600.0, + "grad_norm": 2.070099145794898, + "language_loss": 0.87949276, + "learning_rate": 2.492481223656015e-06, + "loss": 0.91166103, + "num_input_tokens_seen": 908975, + "router_z_loss_clip": 0.87890625, + "router_z_loss_mlp": 5.875, + "step": 48, + "time_per_iteration": 2.7514398097991943 + }, + { + "auxiliary_loss_clip": 0.01962956, + "auxiliary_loss_mlp": 0.01244481, + "balance_loss_clip": 1.15078259, + "balance_loss_mlp": 1.36602139, + "epoch": 0.0029460393807304976, + "flos": 27012796358400.0, + "grad_norm": 2.366138839739612, + "language_loss": 0.89877701, + "learning_rate": 2.5057569967437924e-06, + "loss": 0.93085134, + "num_input_tokens_seen": 929810, + "router_z_loss_clip": 0.9375, + "router_z_loss_mlp": 6.0, + "step": 49, + "time_per_iteration": 2.743236541748047 + }, + { + "auxiliary_loss_clip": 0.01955947, + "auxiliary_loss_mlp": 0.01232227, + "balance_loss_clip": 1.14534748, + "balance_loss_mlp": 1.36045313, + "epoch": 0.0030061626333984666, + "flos": 15851832549120.0, + "grad_norm": 2.8158483763506914, + "language_loss": 0.91078663, + "learning_rate": 2.51876455396287e-06, + "loss": 0.94266832, + "num_input_tokens_seen": 948650, + "router_z_loss_clip": 0.8671875, + "router_z_loss_mlp": 5.9375, + "step": 50, + "time_per_iteration": 2.6860456466674805 + }, + { + "auxiliary_loss_clip": 0.01953364, + "auxiliary_loss_mlp": 0.01201227, + "balance_loss_clip": 1.11778045, + "balance_loss_mlp": 1.36547732, + "epoch": 0.003066285886066436, + "flos": 31827052316160.0, + "grad_norm": 3.5299735782100026, + "language_loss": 0.87144494, + "learning_rate": 2.5315145187866316e-06, + "loss": 0.90299082, + "num_input_tokens_seen": 966455, + "router_z_loss_clip": 0.83203125, + "router_z_loss_mlp": 5.875, + "step": 51, + "time_per_iteration": 2.7481534481048584 + }, + { + "auxiliary_loss_clip": 0.01909154, + "auxiliary_loss_mlp": 0.01207037, + "balance_loss_clip": 1.12707186, + "balance_loss_mlp": 1.35597348, + "epoch": 0.0031264091387344056, + "flos": 41427482774400.0, + "grad_norm": 2.0262044932375836, + "language_loss": 0.95253396, + "learning_rate": 2.5440168957651953e-06, + "loss": 0.98369586, + "num_input_tokens_seen": 988110, + "router_z_loss_clip": 0.796875, + "router_z_loss_mlp": 5.53125, + "step": 52, + "time_per_iteration": 2.8958797454833984 + }, + { + "auxiliary_loss_clip": 0.01904814, + "auxiliary_loss_mlp": 0.01243661, + "balance_loss_clip": 1.16274214, + "balance_loss_mlp": 1.35173535, + "epoch": 0.0031865323914023747, + "flos": 23440941970560.0, + "grad_norm": 3.3193539013945546, + "language_loss": 0.92261833, + "learning_rate": 2.5562811176888872e-06, + "loss": 0.95410311, + "num_input_tokens_seen": 1008550, + "router_z_loss_clip": 0.80859375, + "router_z_loss_mlp": 5.53125, + "step": 53, + "time_per_iteration": 2.7579286098480225 + }, + { + "auxiliary_loss_clip": 0.01893968, + "auxiliary_loss_mlp": 0.01196907, + "balance_loss_clip": 1.11489081, + "balance_loss_mlp": 1.35535884, + "epoch": 0.003246655644070344, + "flos": 14429195510400.0, + "grad_norm": 2.2021865200163, + "language_loss": 0.82945669, + "learning_rate": 2.5683160883431093e-06, + "loss": 0.86036545, + "num_input_tokens_seen": 1026840, + "router_z_loss_clip": 0.8203125, + "router_z_loss_mlp": 5.375, + "step": 54, + "time_per_iteration": 2.684718132019043 + }, + { + "auxiliary_loss_clip": 0.01889572, + "auxiliary_loss_mlp": 0.01211293, + "balance_loss_clip": 1.13113666, + "balance_loss_mlp": 1.34359026, + "epoch": 0.0033067788967383136, + "flos": 35918247496320.0, + "grad_norm": 2.4060188817442487, + "language_loss": 0.81305432, + "learning_rate": 2.580130221340046e-06, + "loss": 0.84406298, + "num_input_tokens_seen": 1048875, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.4375, + "step": 55, + "time_per_iteration": 2.7722246646881104 + }, + { + "auxiliary_loss_clip": 0.01879346, + "auxiliary_loss_mlp": 0.01199903, + "balance_loss_clip": 1.11926973, + "balance_loss_mlp": 1.33773279, + "epoch": 0.003366902149406283, + "flos": 22958732862720.0, + "grad_norm": 2.497299649397407, + "language_loss": 0.87261844, + "learning_rate": 2.5917314754514246e-06, + "loss": 0.90341091, + "num_input_tokens_seen": 1066435, + "router_z_loss_clip": 0.8046875, + "router_z_loss_mlp": 5.40625, + "step": 56, + "time_per_iteration": 2.7031195163726807 + }, + { + "auxiliary_loss_clip": 0.01879922, + "auxiliary_loss_mlp": 0.01161266, + "balance_loss_clip": 1.0864507, + "balance_loss_mlp": 1.33024335, + "epoch": 0.003427025402074252, + "flos": 26582838560640.0, + "grad_norm": 2.4089458733946882, + "language_loss": 0.92949611, + "learning_rate": 2.6031273868139713e-06, + "loss": 0.95990801, + "num_input_tokens_seen": 1090330, + "router_z_loss_clip": 0.75, + "router_z_loss_mlp": 5.5, + "step": 57, + "time_per_iteration": 2.8580281734466553 + }, + { + "auxiliary_loss_clip": 0.01843074, + "auxiliary_loss_mlp": 0.01217004, + "balance_loss_clip": 1.14395308, + "balance_loss_mlp": 1.33453596, + "epoch": 0.0034871486547422216, + "flos": 23951196622080.0, + "grad_norm": 2.105168727735643, + "language_loss": 0.99725533, + "learning_rate": 2.614325098333948e-06, + "loss": 1.02785611, + "num_input_tokens_seen": 1109840, + "router_z_loss_clip": 0.73046875, + "router_z_loss_mlp": 5.09375, + "step": 58, + "time_per_iteration": 2.687504529953003 + }, + { + "auxiliary_loss_clip": 0.01822907, + "auxiliary_loss_mlp": 0.01195384, + "balance_loss_clip": 1.12319088, + "balance_loss_mlp": 1.32094967, + "epoch": 0.003547271907410191, + "flos": 21214983214080.0, + "grad_norm": 2.1328304194940855, + "language_loss": 0.8821373, + "learning_rate": 2.625331386578098e-06, + "loss": 0.9123202, + "num_input_tokens_seen": 1128415, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 5.03125, + "step": 59, + "time_per_iteration": 6.997380495071411 + }, + { + "auxiliary_loss_clip": 0.01844896, + "auxiliary_loss_mlp": 0.01162144, + "balance_loss_clip": 1.08885431, + "balance_loss_mlp": 1.32932925, + "epoch": 0.00360739516007816, + "flos": 16504903676160.0, + "grad_norm": 2.097582115586327, + "language_loss": 0.93430054, + "learning_rate": 2.63615268640451e-06, + "loss": 0.96437097, + "num_input_tokens_seen": 1146515, + "router_z_loss_clip": 0.734375, + "router_z_loss_mlp": 5.15625, + "step": 60, + "time_per_iteration": 2.67743182182312 + }, + { + "auxiliary_loss_clip": 0.0182307, + "auxiliary_loss_mlp": 0.01172385, + "balance_loss_clip": 1.10376787, + "balance_loss_mlp": 1.31307459, + "epoch": 0.0036675184127461296, + "flos": 19464805031040.0, + "grad_norm": 4.241258673484683, + "language_loss": 0.90090871, + "learning_rate": 2.6467951135575943e-06, + "loss": 0.93086326, + "num_input_tokens_seen": 1166330, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 5.09375, + "step": 61, + "time_per_iteration": 2.707247257232666 + }, + { + "auxiliary_loss_clip": 0.01806801, + "auxiliary_loss_mlp": 0.01142561, + "balance_loss_clip": 1.07475519, + "balance_loss_mlp": 1.31002319, + "epoch": 0.003727641665414099, + "flos": 20957323979520.0, + "grad_norm": 3.0487456468745586, + "language_loss": 0.88434047, + "learning_rate": 2.657264485425803e-06, + "loss": 0.9138341, + "num_input_tokens_seen": 1186010, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.96875, + "step": 62, + "time_per_iteration": 2.736107587814331 + }, + { + "auxiliary_loss_clip": 0.01787131, + "auxiliary_loss_mlp": 0.01161947, + "balance_loss_clip": 1.09132755, + "balance_loss_mlp": 1.30018497, + "epoch": 0.003787764918082068, + "flos": 18406050721920.0, + "grad_norm": 2.6509198595432406, + "language_loss": 0.96265876, + "learning_rate": 2.6675663401385186e-06, + "loss": 0.99214947, + "num_input_tokens_seen": 1204985, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.875, + "step": 63, + "time_per_iteration": 2.6760194301605225 + }, + { + "auxiliary_loss_clip": 0.01795174, + "auxiliary_loss_mlp": 0.01169703, + "balance_loss_clip": 1.10284996, + "balance_loss_mlp": 1.30725491, + "epoch": 0.0038478881707500376, + "flos": 12459243962880.0, + "grad_norm": 2.677484479433752, + "language_loss": 0.99141657, + "learning_rate": 2.677705954159056e-06, + "loss": 1.02106524, + "num_input_tokens_seen": 1223545, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.875, + "step": 64, + "time_per_iteration": 2.675295114517212 + }, + { + "auxiliary_loss_clip": 0.01802087, + "auxiliary_loss_mlp": 0.01149441, + "balance_loss_clip": 1.08134842, + "balance_loss_mlp": 1.30652797, + "epoch": 0.003908011423418007, + "flos": 13553334276480.0, + "grad_norm": 2.45939593962701, + "language_loss": 0.85358196, + "learning_rate": 2.6876883585136904e-06, + "loss": 0.88309723, + "num_input_tokens_seen": 1241175, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.9375, + "step": 65, + "time_per_iteration": 2.647696018218994 + }, + { + "auxiliary_loss_clip": 0.01779034, + "auxiliary_loss_mlp": 0.01156784, + "balance_loss_clip": 1.0886445, + "balance_loss_mlp": 1.29322505, + "epoch": 0.003968134676085976, + "flos": 18333475292160.0, + "grad_norm": 2.8561979494145033, + "language_loss": 0.85224223, + "learning_rate": 2.697518353781685e-06, + "loss": 0.88160038, + "num_input_tokens_seen": 1259315, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.875, + "step": 66, + "time_per_iteration": 2.617143392562866 + }, + { + "auxiliary_loss_clip": 0.01782156, + "auxiliary_loss_mlp": 0.01152634, + "balance_loss_clip": 1.07648349, + "balance_loss_mlp": 1.29168975, + "epoch": 0.004028257928753946, + "flos": 20485242506880.0, + "grad_norm": 2.246759082278279, + "language_loss": 0.96454394, + "learning_rate": 2.7072005239581103e-06, + "loss": 0.99389184, + "num_input_tokens_seen": 1277055, + "router_z_loss_clip": 0.76171875, + "router_z_loss_mlp": 4.90625, + "step": 67, + "time_per_iteration": 2.6343421936035156 + }, + { + "auxiliary_loss_clip": 0.01753238, + "auxiliary_loss_mlp": 0.01155696, + "balance_loss_clip": 1.08340704, + "balance_loss_mlp": 1.28524387, + "epoch": 0.004088381181421915, + "flos": 18843837684480.0, + "grad_norm": 2.549207131743101, + "language_loss": 0.94534445, + "learning_rate": 2.7167392492896727e-06, + "loss": 0.97443378, + "num_input_tokens_seen": 1294355, + "router_z_loss_clip": 0.72265625, + "router_z_loss_mlp": 4.6875, + "step": 68, + "time_per_iteration": 2.614696741104126 + }, + { + "auxiliary_loss_clip": 0.01748377, + "auxiliary_loss_mlp": 0.01156697, + "balance_loss_clip": 1.08717394, + "balance_loss_mlp": 1.28268003, + "epoch": 0.004148504434089885, + "flos": 19427817000960.0, + "grad_norm": 1.9922029239060344, + "language_loss": 0.95657748, + "learning_rate": 2.7261387181735195e-06, + "loss": 0.98562825, + "num_input_tokens_seen": 1313525, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.65625, + "step": 69, + "time_per_iteration": 2.6637492179870605 + }, + { + "auxiliary_loss_clip": 0.01742428, + "auxiliary_loss_mlp": 0.01160645, + "balance_loss_clip": 1.09598637, + "balance_loss_mlp": 1.2855866, + "epoch": 0.004208627686757853, + "flos": 20811023884800.0, + "grad_norm": 2.4176731159017075, + "language_loss": 0.98073572, + "learning_rate": 2.7354029381999196e-06, + "loss": 1.00976658, + "num_input_tokens_seen": 1330505, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 4.5625, + "step": 70, + "time_per_iteration": 2.6395556926727295 + }, + { + "auxiliary_loss_clip": 0.01748999, + "auxiliary_loss_mlp": 0.01146397, + "balance_loss_clip": 1.07673144, + "balance_loss_mlp": 1.2760632, + "epoch": 0.004268750939425823, + "flos": 19098623831040.0, + "grad_norm": 2.71386904393857, + "language_loss": 0.93927777, + "learning_rate": 2.7445357464116983e-06, + "loss": 0.96823174, + "num_input_tokens_seen": 1349615, + "router_z_loss_clip": 0.69921875, + "router_z_loss_mlp": 4.75, + "step": 71, + "time_per_iteration": 2.628272294998169 + }, + { + "auxiliary_loss_clip": 0.01838762, + "auxiliary_loss_mlp": 0.01327632, + "balance_loss_clip": 1.28967619, + "balance_loss_mlp": 1.43997037, + "epoch": 0.004328874192093792, + "flos": 52439635514880.0, + "grad_norm": 2.4194543250518663, + "language_loss": 0.65655279, + "learning_rate": 2.75354081884615e-06, + "loss": 0.68821681, + "num_input_tokens_seen": 1410275, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 4.0, + "step": 72, + "time_per_iteration": 3.104635000228882 + }, + { + "auxiliary_loss_clip": 0.01820285, + "auxiliary_loss_mlp": 0.01295248, + "balance_loss_clip": 1.25824571, + "balance_loss_mlp": 1.43420911, + "epoch": 0.004388997444761762, + "flos": 66473239564800.0, + "grad_norm": 2.2482458517722455, + "language_loss": 0.63711512, + "learning_rate": 2.7624216794188286e-06, + "loss": 0.66827047, + "num_input_tokens_seen": 1473020, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 3.859375, + "step": 73, + "time_per_iteration": 3.208836793899536 + }, + { + "auxiliary_loss_clip": 0.01723308, + "auxiliary_loss_mlp": 0.01141966, + "balance_loss_clip": 1.07382631, + "balance_loss_mlp": 1.26790953, + "epoch": 0.004449120697429731, + "flos": 18952970181120.0, + "grad_norm": 2.4515337577309424, + "language_loss": 0.85899854, + "learning_rate": 2.771181708202938e-06, + "loss": 0.88765126, + "num_input_tokens_seen": 1490385, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 4.5625, + "step": 74, + "time_per_iteration": 2.6287550926208496 + }, + { + "auxiliary_loss_clip": 0.01725734, + "auxiliary_loss_mlp": 0.01165418, + "balance_loss_clip": 1.09584761, + "balance_loss_mlp": 1.26750898, + "epoch": 0.004509243950097701, + "flos": 21105491581440.0, + "grad_norm": 2.110493434952054, + "language_loss": 0.9716984, + "learning_rate": 2.779824149153005e-06, + "loss": 1.00060987, + "num_input_tokens_seen": 1509725, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.5625, + "step": 75, + "time_per_iteration": 2.635618209838867 + }, + { + "auxiliary_loss_clip": 0.01704277, + "auxiliary_loss_mlp": 0.01144942, + "balance_loss_clip": 1.07875705, + "balance_loss_mlp": 1.26302838, + "epoch": 0.004569367202765669, + "flos": 20698730991360.0, + "grad_norm": 2.60583579179481, + "language_loss": 0.87675405, + "learning_rate": 2.788352117317012e-06, + "loss": 0.9052462, + "num_input_tokens_seen": 1527245, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.4375, + "step": 76, + "time_per_iteration": 2.6379826068878174 + }, + { + "auxiliary_loss_clip": 0.01705571, + "auxiliary_loss_mlp": 0.0114831, + "balance_loss_clip": 1.07845366, + "balance_loss_mlp": 1.26138341, + "epoch": 0.004629490455433639, + "flos": 28658474899200.0, + "grad_norm": 1.9080158042054207, + "language_loss": 0.91751724, + "learning_rate": 2.796768605577095e-06, + "loss": 0.94605613, + "num_input_tokens_seen": 1548930, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.4375, + "step": 77, + "time_per_iteration": 2.6596872806549072 + }, + { + "auxiliary_loss_clip": 0.01694222, + "auxiliary_loss_mlp": 0.01165235, + "balance_loss_clip": 1.09494948, + "balance_loss_mlp": 1.26167083, + "epoch": 0.004689613708101608, + "flos": 11072409805440.0, + "grad_norm": 2.1229280552318803, + "language_loss": 0.92189825, + "learning_rate": 2.80507649095533e-06, + "loss": 0.95049286, + "num_input_tokens_seen": 1565695, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.3125, + "step": 78, + "time_per_iteration": 2.598590850830078 + }, + { + "auxiliary_loss_clip": 0.01690635, + "auxiliary_loss_mlp": 0.01155594, + "balance_loss_clip": 1.08735824, + "balance_loss_mlp": 1.25696921, + "epoch": 0.004749736960769578, + "flos": 21799106184960.0, + "grad_norm": 2.280813483182965, + "language_loss": 0.82480371, + "learning_rate": 2.813278540517843e-06, + "loss": 0.85326606, + "num_input_tokens_seen": 1582625, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 4.34375, + "step": 79, + "time_per_iteration": 2.6215708255767822 + }, + { + "auxiliary_loss_clip": 0.01705122, + "auxiliary_loss_mlp": 0.01133248, + "balance_loss_clip": 1.06315339, + "balance_loss_mlp": 1.26029253, + "epoch": 0.004809860213437547, + "flos": 19792597570560.0, + "grad_norm": 2.4809717100134616, + "language_loss": 0.91311121, + "learning_rate": 2.8213774169075505e-06, + "loss": 0.94149494, + "num_input_tokens_seen": 1601725, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.4375, + "step": 80, + "time_per_iteration": 2.639841079711914 + }, + { + "auxiliary_loss_clip": 0.01674552, + "auxiliary_loss_mlp": 0.01142875, + "balance_loss_clip": 1.07254159, + "balance_loss_mlp": 1.25350285, + "epoch": 0.004869983466105517, + "flos": 26574327037440.0, + "grad_norm": 2.165091554789383, + "language_loss": 0.94981706, + "learning_rate": 2.829375683533245e-06, + "loss": 0.97799134, + "num_input_tokens_seen": 1622420, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.21875, + "step": 81, + "time_per_iteration": 2.6689717769622803 + }, + { + "auxiliary_loss_clip": 0.01688803, + "auxiliary_loss_mlp": 0.01148831, + "balance_loss_clip": 1.08269382, + "balance_loss_mlp": 1.25745821, + "epoch": 0.004930106718773485, + "flos": 12823378087680.0, + "grad_norm": 2.9914678747629226, + "language_loss": 0.96341741, + "learning_rate": 2.8372758094402803e-06, + "loss": 0.99179375, + "num_input_tokens_seen": 1640715, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 4.3125, + "step": 82, + "time_per_iteration": 2.629596471786499 + }, + { + "auxiliary_loss_clip": 0.01671229, + "auxiliary_loss_mlp": 0.01159801, + "balance_loss_clip": 1.09013557, + "balance_loss_mlp": 1.24528587, + "epoch": 0.004990229971441455, + "flos": 25774919902080.0, + "grad_norm": 2.533591741594043, + "language_loss": 0.8664127, + "learning_rate": 2.84508017388607e-06, + "loss": 0.894723, + "num_input_tokens_seen": 1662210, + "router_z_loss_clip": 0.6953125, + "router_z_loss_mlp": 4.25, + "step": 83, + "time_per_iteration": 2.7277162075042725 + }, + { + "auxiliary_loss_clip": 0.01664198, + "auxiliary_loss_mlp": 0.01156919, + "balance_loss_clip": 1.08663368, + "balance_loss_mlp": 1.24647975, + "epoch": 0.005050353224109424, + "flos": 17457254922240.0, + "grad_norm": 3.373799694341511, + "language_loss": 0.91779828, + "learning_rate": 2.852791070641559e-06, + "loss": 0.94600952, + "num_input_tokens_seen": 1681070, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 4.1875, + "step": 84, + "time_per_iteration": 2.62187123298645 + }, + { + "auxiliary_loss_clip": 0.01645783, + "auxiliary_loss_mlp": 0.01205663, + "balance_loss_clip": 1.17075825, + "balance_loss_mlp": 1.34984684, + "epoch": 0.005110476476777394, + "flos": 69805460367360.0, + "grad_norm": 1.4266053341540552, + "language_loss": 0.62504542, + "learning_rate": 2.8604107120381682e-06, + "loss": 0.65355992, + "num_input_tokens_seen": 1747140, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.96875, + "step": 85, + "time_per_iteration": 3.190223217010498 + }, + { + "auxiliary_loss_clip": 0.0165122, + "auxiliary_loss_mlp": 0.01127154, + "balance_loss_clip": 1.05648708, + "balance_loss_mlp": 1.23674285, + "epoch": 0.005170599729445363, + "flos": 24790105739520.0, + "grad_norm": 1.7428139018461835, + "language_loss": 0.90836501, + "learning_rate": 2.8679412327780482e-06, + "loss": 0.93614876, + "num_input_tokens_seen": 1767475, + "router_z_loss_clip": 0.70703125, + "router_z_loss_mlp": 4.15625, + "step": 86, + "time_per_iteration": 2.66162109375 + }, + { + "auxiliary_loss_clip": 0.01655877, + "auxiliary_loss_mlp": 0.01161945, + "balance_loss_clip": 1.09065783, + "balance_loss_mlp": 1.24282312, + "epoch": 0.005230722982113333, + "flos": 23258048895360.0, + "grad_norm": 2.38275425723773, + "language_loss": 0.8209877, + "learning_rate": 2.8753846935240833e-06, + "loss": 0.84916592, + "num_input_tokens_seen": 1784980, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.125, + "step": 87, + "time_per_iteration": 2.618906021118164 + }, + { + "auxiliary_loss_clip": 0.01644726, + "auxiliary_loss_mlp": 0.01154792, + "balance_loss_clip": 1.08617568, + "balance_loss_mlp": 1.24127626, + "epoch": 0.005290846234781301, + "flos": 16727909264640.0, + "grad_norm": 1.8918921085406437, + "language_loss": 0.95630223, + "learning_rate": 2.8827430842847267e-06, + "loss": 0.98429739, + "num_input_tokens_seen": 1803030, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 4.03125, + "step": 88, + "time_per_iteration": 2.5916671752929688 + }, + { + "auxiliary_loss_clip": 0.01661198, + "auxiliary_loss_mlp": 0.0114963, + "balance_loss_clip": 1.08230066, + "balance_loss_mlp": 1.24101663, + "epoch": 0.005350969487449271, + "flos": 20886077352960.0, + "grad_norm": 1.9438908009999392, + "language_loss": 0.85920149, + "learning_rate": 2.8900183276075957e-06, + "loss": 0.88730979, + "num_input_tokens_seen": 1822865, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 4.1875, + "step": 89, + "time_per_iteration": 2.6486849784851074 + }, + { + "auxiliary_loss_clip": 0.01648909, + "auxiliary_loss_mlp": 0.01132231, + "balance_loss_clip": 1.06547391, + "balance_loss_mlp": 1.23491406, + "epoch": 0.00541109274011724, + "flos": 26209977431040.0, + "grad_norm": 4.519706664825811, + "language_loss": 0.91517568, + "learning_rate": 2.8972122815946455e-06, + "loss": 0.94298708, + "num_input_tokens_seen": 1842435, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 4.125, + "step": 90, + "time_per_iteration": 2.658997058868408 + }, + { + "auxiliary_loss_clip": 0.01630542, + "auxiliary_loss_mlp": 0.0113282, + "balance_loss_clip": 1.06496572, + "balance_loss_mlp": 1.23102689, + "epoch": 0.00547121599278521, + "flos": 21178569801600.0, + "grad_norm": 2.2090932400382486, + "language_loss": 0.8587057, + "learning_rate": 2.90432674275074e-06, + "loss": 0.88633931, + "num_input_tokens_seen": 1860065, + "router_z_loss_clip": 0.6796875, + "router_z_loss_mlp": 3.984375, + "step": 91, + "time_per_iteration": 2.619231939315796 + }, + { + "auxiliary_loss_clip": 0.01629785, + "auxiliary_loss_mlp": 0.01140917, + "balance_loss_clip": 1.07458866, + "balance_loss_mlp": 1.22673059, + "epoch": 0.005531339245453179, + "flos": 19718801078400.0, + "grad_norm": 2.769705373909222, + "language_loss": 0.86930025, + "learning_rate": 2.91136344867656e-06, + "loss": 0.89700729, + "num_input_tokens_seen": 1878135, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 4.03125, + "step": 92, + "time_per_iteration": 2.646968126296997 + }, + { + "auxiliary_loss_clip": 0.01621216, + "auxiliary_loss_mlp": 0.01179948, + "balance_loss_clip": 1.1122849, + "balance_loss_mlp": 1.21872091, + "epoch": 0.005591462498121149, + "flos": 17636089760640.0, + "grad_norm": 2.5030178409929, + "language_loss": 0.92042911, + "learning_rate": 2.918324080615938e-06, + "loss": 0.94844079, + "num_input_tokens_seen": 1894895, + "router_z_loss_clip": 0.67578125, + "router_z_loss_mlp": 4.03125, + "step": 93, + "time_per_iteration": 2.59853196144104 + }, + { + "auxiliary_loss_clip": 0.016342, + "auxiliary_loss_mlp": 0.01152159, + "balance_loss_clip": 1.08120561, + "balance_loss_mlp": 1.22512126, + "epoch": 0.005651585750789117, + "flos": 20011221699840.0, + "grad_norm": 2.2071592078672198, + "language_loss": 0.87372428, + "learning_rate": 2.925210265866963e-06, + "loss": 0.90158784, + "num_input_tokens_seen": 1913220, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 4.09375, + "step": 94, + "time_per_iteration": 2.587707281112671 + }, + { + "auxiliary_loss_clip": 0.01562532, + "auxiliary_loss_mlp": 0.01067909, + "balance_loss_clip": 1.03243279, + "balance_loss_mlp": 1.30452466, + "epoch": 0.005711709003457087, + "flos": 59812957981440.0, + "grad_norm": 1.3851210442303683, + "language_loss": 0.6813519, + "learning_rate": 2.932023580065507e-06, + "loss": 0.70765626, + "num_input_tokens_seen": 1970970, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.578125, + "step": 95, + "time_per_iteration": 3.067047595977783 + }, + { + "auxiliary_loss_clip": 0.01611383, + "auxiliary_loss_mlp": 0.01154317, + "balance_loss_clip": 1.08693981, + "balance_loss_mlp": 1.21303511, + "epoch": 0.005771832256125056, + "flos": 15559591495680.0, + "grad_norm": 2.5109536438971976, + "language_loss": 0.89978027, + "learning_rate": 2.9387655493491906e-06, + "loss": 0.92743719, + "num_input_tokens_seen": 1988930, + "router_z_loss_clip": 0.671875, + "router_z_loss_mlp": 3.984375, + "step": 96, + "time_per_iteration": 2.590522289276123 + }, + { + "auxiliary_loss_clip": 0.01603776, + "auxiliary_loss_mlp": 0.01143264, + "balance_loss_clip": 1.08108413, + "balance_loss_mlp": 1.21597803, + "epoch": 0.005831955508793026, + "flos": 22528380015360.0, + "grad_norm": 2.825781473558237, + "language_loss": 0.89798892, + "learning_rate": 2.9454376524092147e-06, + "loss": 0.92545933, + "num_input_tokens_seen": 2006285, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.875, + "step": 97, + "time_per_iteration": 2.630364179611206 + }, + { + "auxiliary_loss_clip": 0.0158997, + "auxiliary_loss_mlp": 0.01139649, + "balance_loss_clip": 1.07103181, + "balance_loss_mlp": 1.20754981, + "epoch": 0.005892078761460995, + "flos": 22049834094720.0, + "grad_norm": 2.1954130163748573, + "language_loss": 0.76553786, + "learning_rate": 2.952041322436969e-06, + "loss": 0.79283404, + "num_input_tokens_seen": 2024905, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.8125, + "step": 98, + "time_per_iteration": 2.6088852882385254 + }, + { + "auxiliary_loss_clip": 0.01531856, + "auxiliary_loss_mlp": 0.01047217, + "balance_loss_clip": 1.01250362, + "balance_loss_mlp": 1.28449416, + "epoch": 0.005952202014128965, + "flos": 68539143317760.0, + "grad_norm": 1.0389188302362988, + "language_loss": 0.65464473, + "learning_rate": 2.9585779489718204e-06, + "loss": 0.68043554, + "num_input_tokens_seen": 2086220, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.46875, + "step": 99, + "time_per_iteration": 3.196779251098633 + }, + { + "auxiliary_loss_clip": 0.0159215, + "auxiliary_loss_mlp": 0.01143603, + "balance_loss_clip": 1.07312632, + "balance_loss_mlp": 1.20754516, + "epoch": 0.006012325266796933, + "flos": 22960887678720.0, + "grad_norm": 2.02393591458392, + "language_loss": 0.90861535, + "learning_rate": 2.9650488796560464e-06, + "loss": 0.93597281, + "num_input_tokens_seen": 2103365, + "router_z_loss_clip": 0.703125, + "router_z_loss_mlp": 3.84375, + "step": 100, + "time_per_iteration": 2.659716844558716 + }, + { + "auxiliary_loss_clip": 0.01602583, + "auxiliary_loss_mlp": 0.01150362, + "balance_loss_clip": 1.08360529, + "balance_loss_mlp": 1.21008992, + "epoch": 0.006072448519464903, + "flos": 17347942857600.0, + "grad_norm": 9.149928686451464, + "language_loss": 0.91165614, + "learning_rate": 2.971455421902446e-06, + "loss": 0.93918556, + "num_input_tokens_seen": 2121995, + "router_z_loss_clip": 0.66796875, + "router_z_loss_mlp": 3.921875, + "step": 101, + "time_per_iteration": 5.522722959518433 + }, + { + "auxiliary_loss_clip": 0.01592164, + "auxiliary_loss_mlp": 0.01153598, + "balance_loss_clip": 1.08273995, + "balance_loss_mlp": 1.21078956, + "epoch": 0.006132571772132872, + "flos": 24681116897280.0, + "grad_norm": 2.149611483260168, + "language_loss": 0.90634245, + "learning_rate": 2.9777988444798075e-06, + "loss": 0.9338001, + "num_input_tokens_seen": 2141815, + "router_z_loss_clip": 0.7109375, + "router_z_loss_mlp": 3.8125, + "step": 102, + "time_per_iteration": 2.7264201641082764 + }, + { + "auxiliary_loss_clip": 0.01586171, + "auxiliary_loss_mlp": 0.01134806, + "balance_loss_clip": 1.06986046, + "balance_loss_mlp": 1.20794034, + "epoch": 0.006192695024800842, + "flos": 21465675210240.0, + "grad_norm": 2.4455555336324135, + "language_loss": 0.87990314, + "learning_rate": 2.9840803790210285e-06, + "loss": 0.9071129, + "num_input_tokens_seen": 2161125, + "router_z_loss_clip": 0.6484375, + "router_z_loss_mlp": 3.78125, + "step": 103, + "time_per_iteration": 2.6332345008850098 + }, + { + "auxiliary_loss_clip": 0.01586169, + "auxiliary_loss_mlp": 0.01136721, + "balance_loss_clip": 1.07015502, + "balance_loss_mlp": 1.2100153, + "epoch": 0.006252818277468811, + "flos": 17420410546560.0, + "grad_norm": 1.9653003456434248, + "language_loss": 0.93796182, + "learning_rate": 2.990301221458371e-06, + "loss": 0.96519077, + "num_input_tokens_seen": 2179510, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.765625, + "step": 104, + "time_per_iteration": 2.5763180255889893 + }, + { + "auxiliary_loss_clip": 0.01576682, + "auxiliary_loss_mlp": 0.01148107, + "balance_loss_clip": 1.08382916, + "balance_loss_mlp": 1.20004964, + "epoch": 0.006312941530136781, + "flos": 19099557584640.0, + "grad_norm": 2.978383813748495, + "language_loss": 0.96302718, + "learning_rate": 2.9964625333900544e-06, + "loss": 0.99027503, + "num_input_tokens_seen": 2197870, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.765625, + "step": 105, + "time_per_iteration": 2.598074197769165 + }, + { + "auxiliary_loss_clip": 0.01576054, + "auxiliary_loss_mlp": 0.01157995, + "balance_loss_clip": 1.08618331, + "balance_loss_mlp": 1.20040035, + "epoch": 0.006373064782804749, + "flos": 24060831909120.0, + "grad_norm": 2.254409296180574, + "language_loss": 0.86981636, + "learning_rate": 3.002565443382063e-06, + "loss": 0.89715683, + "num_input_tokens_seen": 2217495, + "router_z_loss_clip": 0.71875, + "router_z_loss_mlp": 3.75, + "step": 106, + "time_per_iteration": 2.620400905609131 + }, + { + "auxiliary_loss_clip": 0.01558878, + "auxiliary_loss_mlp": 0.01142953, + "balance_loss_clip": 1.07462192, + "balance_loss_mlp": 1.18650925, + "epoch": 0.006433188035472719, + "flos": 18332433797760.0, + "grad_norm": 2.299900982703377, + "language_loss": 0.8342824, + "learning_rate": 3.008611048208843e-06, + "loss": 0.86130083, + "num_input_tokens_seen": 2236520, + "router_z_loss_clip": 0.68359375, + "router_z_loss_mlp": 3.71875, + "step": 107, + "time_per_iteration": 2.6031439304351807 + }, + { + "auxiliary_loss_clip": 0.01473949, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.01294351, + "balance_loss_mlp": 1.24969411, + "epoch": 0.006493311288140688, + "flos": 62562387594240.0, + "grad_norm": 0.9921074222226888, + "language_loss": 0.64829654, + "learning_rate": 3.014600414036285e-06, + "loss": 0.67348593, + "num_input_tokens_seen": 2300140, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.25, + "step": 108, + "time_per_iteration": 3.1797876358032227 + }, + { + "auxiliary_loss_clip": 0.01549803, + "auxiliary_loss_mlp": 0.0113223, + "balance_loss_clip": 1.0634706, + "balance_loss_mlp": 1.18794155, + "epoch": 0.006553434540808658, + "flos": 19500141035520.0, + "grad_norm": 3.0292528917398895, + "language_loss": 0.97705221, + "learning_rate": 3.0205345775501937e-06, + "loss": 1.00387263, + "num_input_tokens_seen": 2317320, + "router_z_loss_clip": 0.6875, + "router_z_loss_mlp": 3.625, + "step": 109, + "time_per_iteration": 2.587251663208008 + }, + { + "auxiliary_loss_clip": 0.01548304, + "auxiliary_loss_mlp": 0.01143686, + "balance_loss_clip": 1.07759643, + "balance_loss_mlp": 1.18955791, + "epoch": 0.006613557793476627, + "flos": 21105132445440.0, + "grad_norm": 1.7037490209774204, + "language_loss": 0.84119976, + "learning_rate": 3.0264145470332218e-06, + "loss": 0.86811972, + "num_input_tokens_seen": 2337820, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 110, + "time_per_iteration": 2.612900495529175 + }, + { + "auxiliary_loss_clip": 0.01543027, + "auxiliary_loss_mlp": 0.01148771, + "balance_loss_clip": 1.08287191, + "balance_loss_mlp": 1.18348098, + "epoch": 0.006673681046144597, + "flos": 26030747543040.0, + "grad_norm": 2.0686651571732186, + "language_loss": 0.83053756, + "learning_rate": 3.032241303393073e-06, + "loss": 0.85745549, + "num_input_tokens_seen": 2358560, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.59375, + "step": 111, + "time_per_iteration": 2.648775815963745 + }, + { + "auxiliary_loss_clip": 0.01543945, + "auxiliary_loss_mlp": 0.01132291, + "balance_loss_clip": 1.06906247, + "balance_loss_mlp": 1.18600404, + "epoch": 0.006733804298812566, + "flos": 23147767163520.0, + "grad_norm": 1.9360906695559799, + "language_loss": 0.94064176, + "learning_rate": 3.0380158011446e-06, + "loss": 0.96740413, + "num_input_tokens_seen": 2379005, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.59375, + "step": 112, + "time_per_iteration": 2.5952305793762207 + }, + { + "auxiliary_loss_clip": 0.01547241, + "auxiliary_loss_mlp": 0.0113746, + "balance_loss_clip": 1.07342076, + "balance_loss_mlp": 1.18214464, + "epoch": 0.006793927551480535, + "flos": 11764444210560.0, + "grad_norm": 2.4119047199233594, + "language_loss": 0.79298341, + "learning_rate": 3.0437389693482466e-06, + "loss": 0.81983036, + "num_input_tokens_seen": 2395610, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.65625, + "step": 113, + "time_per_iteration": 2.524744987487793 + }, + { + "auxiliary_loss_clip": 0.01535171, + "auxiliary_loss_mlp": 0.01130123, + "balance_loss_clip": 1.06460583, + "balance_loss_mlp": 1.1784718, + "epoch": 0.006854050804148504, + "flos": 19171953446400.0, + "grad_norm": 2.1108584765070924, + "language_loss": 0.93168736, + "learning_rate": 3.0494117125071475e-06, + "loss": 0.95834035, + "num_input_tokens_seen": 2415005, + "router_z_loss_clip": 0.65625, + "router_z_loss_mlp": 3.5625, + "step": 114, + "time_per_iteration": 2.6716785430908203 + }, + { + "auxiliary_loss_clip": 0.01541748, + "auxiliary_loss_mlp": 0.01138267, + "balance_loss_clip": 1.07828045, + "balance_loss_mlp": 1.17785645, + "epoch": 0.006914174056816474, + "flos": 21981891519360.0, + "grad_norm": 2.266348661789013, + "language_loss": 0.94440514, + "learning_rate": 3.055034911425055e-06, + "loss": 0.97120523, + "num_input_tokens_seen": 2433965, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.640625, + "step": 115, + "time_per_iteration": 2.6136229038238525 + }, + { + "auxiliary_loss_clip": 0.01536673, + "auxiliary_loss_mlp": 0.0111845, + "balance_loss_clip": 1.052122, + "balance_loss_mlp": 1.1758287, + "epoch": 0.006974297309484443, + "flos": 16289152634880.0, + "grad_norm": 12.665326776351556, + "language_loss": 0.81903678, + "learning_rate": 3.0606094240271244e-06, + "loss": 0.84558797, + "num_input_tokens_seen": 2451605, + "router_z_loss_clip": 0.6640625, + "router_z_loss_mlp": 3.609375, + "step": 116, + "time_per_iteration": 2.577003240585327 + }, + { + "auxiliary_loss_clip": 0.01526673, + "auxiliary_loss_mlp": 0.01127935, + "balance_loss_clip": 1.06375241, + "balance_loss_mlp": 1.17504787, + "epoch": 0.007034420562152413, + "flos": 26104005331200.0, + "grad_norm": 2.0071741256932794, + "language_loss": 0.88063896, + "learning_rate": 3.0661360861454656e-06, + "loss": 0.90718508, + "num_input_tokens_seen": 2472035, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.515625, + "step": 117, + "time_per_iteration": 2.611503839492798 + }, + { + "auxiliary_loss_clip": 0.01525448, + "auxiliary_loss_mlp": 0.01143736, + "balance_loss_clip": 1.07840896, + "balance_loss_mlp": 1.17308259, + "epoch": 0.007094543814820382, + "flos": 14204609723520.0, + "grad_norm": 2.5473368597875594, + "language_loss": 0.84470415, + "learning_rate": 3.071615712271274e-06, + "loss": 0.87139601, + "num_input_tokens_seen": 2489285, + "router_z_loss_clip": 0.65234375, + "router_z_loss_mlp": 3.53125, + "step": 118, + "time_per_iteration": 2.577461004257202 + }, + { + "auxiliary_loss_clip": 0.01536798, + "auxiliary_loss_mlp": 0.01163532, + "balance_loss_clip": 1.09930205, + "balance_loss_mlp": 1.1748507, + "epoch": 0.007154667067488351, + "flos": 14976007228800.0, + "grad_norm": 2.057592918726277, + "language_loss": 0.99470234, + "learning_rate": 3.0770490962752172e-06, + "loss": 1.02170563, + "num_input_tokens_seen": 2506460, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.625, + "step": 119, + "time_per_iteration": 2.549661636352539 + }, + { + "auxiliary_loss_clip": 0.01537395, + "auxiliary_loss_mlp": 0.0111939, + "balance_loss_clip": 1.05701971, + "balance_loss_mlp": 1.16968298, + "epoch": 0.00721479032015632, + "flos": 20193288762240.0, + "grad_norm": 2.410205702357196, + "language_loss": 0.89085704, + "learning_rate": 3.082437012097686e-06, + "loss": 0.91742492, + "num_input_tokens_seen": 2525565, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.6875, + "step": 120, + "time_per_iteration": 2.583630084991455 + }, + { + "auxiliary_loss_clip": 0.01524337, + "auxiliary_loss_mlp": 0.01130091, + "balance_loss_clip": 1.06667209, + "balance_loss_mlp": 1.17169607, + "epoch": 0.00727491357282429, + "flos": 23147228459520.0, + "grad_norm": 1.904240324338801, + "language_loss": 0.93491054, + "learning_rate": 3.0877802144103967e-06, + "loss": 0.96145487, + "num_input_tokens_seen": 2546605, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.53125, + "step": 121, + "time_per_iteration": 2.6146788597106934 + }, + { + "auxiliary_loss_clip": 0.01523412, + "auxiliary_loss_mlp": 0.0114605, + "balance_loss_clip": 1.08382273, + "balance_loss_mlp": 1.17073464, + "epoch": 0.007335036825492259, + "flos": 15521669712000.0, + "grad_norm": 3.352658173167552, + "language_loss": 0.90176952, + "learning_rate": 3.09307943925077e-06, + "loss": 0.92846411, + "num_input_tokens_seen": 2560730, + "router_z_loss_clip": 0.62109375, + "router_z_loss_mlp": 3.53125, + "step": 122, + "time_per_iteration": 2.566470146179199 + }, + { + "auxiliary_loss_clip": 0.01520578, + "auxiliary_loss_mlp": 0.01142532, + "balance_loss_clip": 1.07634664, + "balance_loss_mlp": 1.16606736, + "epoch": 0.007395160078160229, + "flos": 24243365848320.0, + "grad_norm": 2.7249964127160764, + "language_loss": 0.92516506, + "learning_rate": 3.0983354046304154e-06, + "loss": 0.95179617, + "num_input_tokens_seen": 2579550, + "router_z_loss_clip": 0.66015625, + "router_z_loss_mlp": 3.546875, + "step": 123, + "time_per_iteration": 2.6002941131591797 + }, + { + "auxiliary_loss_clip": 0.01517776, + "auxiliary_loss_mlp": 0.01125795, + "balance_loss_clip": 1.06433022, + "balance_loss_mlp": 1.1609534, + "epoch": 0.007455283330828198, + "flos": 31759792099200.0, + "grad_norm": 7.583203404073904, + "language_loss": 0.71128142, + "learning_rate": 3.103548811118979e-06, + "loss": 0.73771715, + "num_input_tokens_seen": 2600390, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.5625, + "step": 124, + "time_per_iteration": 2.79618763923645 + }, + { + "auxiliary_loss_clip": 0.01505473, + "auxiliary_loss_mlp": 0.01124615, + "balance_loss_clip": 1.06157708, + "balance_loss_mlp": 1.16223335, + "epoch": 0.007515406583496167, + "flos": 26615157822720.0, + "grad_norm": 2.4227692366027855, + "language_loss": 0.88482195, + "learning_rate": 3.108720342404542e-06, + "loss": 0.9111228, + "num_input_tokens_seen": 2620770, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.4375, + "step": 125, + "time_per_iteration": 2.6131536960601807 + }, + { + "auxiliary_loss_clip": 0.0152071, + "auxiliary_loss_mlp": 0.01140137, + "balance_loss_clip": 1.07762396, + "balance_loss_mlp": 1.16211164, + "epoch": 0.007575529836164136, + "flos": 18223696350720.0, + "grad_norm": 2.993097477973623, + "language_loss": 0.82384819, + "learning_rate": 3.1138506658316945e-06, + "loss": 0.8504566, + "num_input_tokens_seen": 2639900, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.59375, + "step": 126, + "time_per_iteration": 2.595423936843872 + }, + { + "auxiliary_loss_clip": 0.01514354, + "auxiliary_loss_mlp": 0.01139255, + "balance_loss_clip": 1.077981, + "balance_loss_mlp": 1.16128385, + "epoch": 0.007635653088832106, + "flos": 21580410228480.0, + "grad_norm": 3.7264016399601534, + "language_loss": 0.67276633, + "learning_rate": 3.1189404329183404e-06, + "loss": 0.69930243, + "num_input_tokens_seen": 2657450, + "router_z_loss_clip": 0.61328125, + "router_z_loss_mlp": 3.53125, + "step": 127, + "time_per_iteration": 2.620950937271118 + }, + { + "auxiliary_loss_clip": 0.01504536, + "auxiliary_loss_mlp": 0.01128822, + "balance_loss_clip": 1.06640375, + "balance_loss_mlp": 1.16422939, + "epoch": 0.007695776341500075, + "flos": 25375054723200.0, + "grad_norm": 3.6226937306152496, + "language_loss": 0.8815757, + "learning_rate": 3.1239902798522317e-06, + "loss": 0.90790927, + "num_input_tokens_seen": 2678150, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 128, + "time_per_iteration": 2.6476521492004395 + }, + { + "auxiliary_loss_clip": 0.01505804, + "auxiliary_loss_mlp": 0.01141266, + "balance_loss_clip": 1.07870471, + "balance_loss_mlp": 1.15920687, + "epoch": 0.007755899594168045, + "flos": 22343906741760.0, + "grad_norm": 1.875185485357673, + "language_loss": 0.84581351, + "learning_rate": 3.129000827968184e-06, + "loss": 0.87228423, + "num_input_tokens_seen": 2698290, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.46875, + "step": 129, + "time_per_iteration": 2.611762285232544 + }, + { + "auxiliary_loss_clip": 0.01499869, + "auxiliary_loss_mlp": 0.01133647, + "balance_loss_clip": 1.07122934, + "balance_loss_mlp": 1.1588279, + "epoch": 0.007816022846836013, + "flos": 22638230784000.0, + "grad_norm": 2.023668494136832, + "language_loss": 0.9742806, + "learning_rate": 3.133972684206866e-06, + "loss": 1.00061572, + "num_input_tokens_seen": 2717630, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.40625, + "step": 130, + "time_per_iteration": 2.599639415740967 + }, + { + "auxiliary_loss_clip": 0.01493155, + "auxiliary_loss_mlp": 0.0113499, + "balance_loss_clip": 1.07109392, + "balance_loss_mlp": 1.15518749, + "epoch": 0.007876146099503984, + "flos": 18182901479040.0, + "grad_norm": 2.1876581172480285, + "language_loss": 0.82624269, + "learning_rate": 3.138906441556014e-06, + "loss": 0.85252404, + "num_input_tokens_seen": 2735835, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 131, + "time_per_iteration": 2.6086065769195557 + }, + { + "auxiliary_loss_clip": 0.01502593, + "auxiliary_loss_mlp": 0.01127672, + "balance_loss_clip": 1.06759024, + "balance_loss_mlp": 1.15800536, + "epoch": 0.007936269352171952, + "flos": 27119486730240.0, + "grad_norm": 2.4868851395581677, + "language_loss": 0.82762384, + "learning_rate": 3.143802679474861e-06, + "loss": 0.85392648, + "num_input_tokens_seen": 2756335, + "router_z_loss_clip": 0.6015625, + "router_z_loss_mlp": 3.4375, + "step": 132, + "time_per_iteration": 2.673790454864502 + }, + { + "auxiliary_loss_clip": 0.01493849, + "auxiliary_loss_mlp": 0.01128197, + "balance_loss_clip": 1.06716144, + "balance_loss_mlp": 1.15264463, + "epoch": 0.007996392604839923, + "flos": 19026335710080.0, + "grad_norm": 2.7432419346617443, + "language_loss": 0.95486552, + "learning_rate": 3.1486619643025565e-06, + "loss": 0.98108596, + "num_input_tokens_seen": 2775090, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.40625, + "step": 133, + "time_per_iteration": 2.6287872791290283 + }, + { + "auxiliary_loss_clip": 0.01490198, + "auxiliary_loss_mlp": 0.01125526, + "balance_loss_clip": 1.06725681, + "balance_loss_mlp": 1.16143155, + "epoch": 0.008056515857507891, + "flos": 25484151306240.0, + "grad_norm": 1.7764051426707919, + "language_loss": 0.73316634, + "learning_rate": 3.153484849651286e-06, + "loss": 0.7593236, + "num_input_tokens_seen": 2795320, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.296875, + "step": 134, + "time_per_iteration": 2.6728081703186035 + }, + { + "auxiliary_loss_clip": 0.01486213, + "auxiliary_loss_mlp": 0.01130543, + "balance_loss_clip": 1.06707644, + "balance_loss_mlp": 1.14955854, + "epoch": 0.00811663911017586, + "flos": 20557566541440.0, + "grad_norm": 3.090234736760587, + "language_loss": 0.88808328, + "learning_rate": 3.1582718767847806e-06, + "loss": 0.91425079, + "num_input_tokens_seen": 2812815, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.375, + "step": 135, + "time_per_iteration": 2.6380510330200195 + }, + { + "auxiliary_loss_clip": 0.01489108, + "auxiliary_loss_mlp": 0.01131555, + "balance_loss_clip": 1.06789732, + "balance_loss_mlp": 1.15456343, + "epoch": 0.00817676236284383, + "flos": 18799738761600.0, + "grad_norm": 2.008171494368998, + "language_loss": 0.89123899, + "learning_rate": 3.1630235749828485e-06, + "loss": 0.9174456, + "num_input_tokens_seen": 2830445, + "router_z_loss_clip": 0.63671875, + "router_z_loss_mlp": 3.34375, + "step": 136, + "time_per_iteration": 2.555936813354492 + }, + { + "auxiliary_loss_clip": 0.01486639, + "auxiliary_loss_mlp": 0.01108223, + "balance_loss_clip": 1.04962027, + "balance_loss_mlp": 1.14870429, + "epoch": 0.008236885615511799, + "flos": 23873593288320.0, + "grad_norm": 5.8712537379963345, + "language_loss": 0.8400104, + "learning_rate": 3.1677404618925676e-06, + "loss": 0.86595905, + "num_input_tokens_seen": 2846965, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.375, + "step": 137, + "time_per_iteration": 2.6225337982177734 + }, + { + "auxiliary_loss_clip": 0.01482624, + "auxiliary_loss_mlp": 0.01116377, + "balance_loss_clip": 1.05796409, + "balance_loss_mlp": 1.14842129, + "epoch": 0.00829700886817977, + "flos": 24643626076800.0, + "grad_norm": 1.6861384534946333, + "language_loss": 0.90170664, + "learning_rate": 3.1724230438666953e-06, + "loss": 0.9276967, + "num_input_tokens_seen": 2867520, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.34375, + "step": 138, + "time_per_iteration": 2.653205156326294 + }, + { + "auxiliary_loss_clip": 0.01472312, + "auxiliary_loss_mlp": 0.01119929, + "balance_loss_clip": 1.0568912, + "balance_loss_mlp": 1.1478796, + "epoch": 0.008357132120847738, + "flos": 25262007644160.0, + "grad_norm": 2.679342832062188, + "language_loss": 0.91253459, + "learning_rate": 3.177071816289865e-06, + "loss": 0.93845713, + "num_input_tokens_seen": 2885675, + "router_z_loss_clip": 0.6328125, + "router_z_loss_mlp": 3.234375, + "step": 139, + "time_per_iteration": 2.6182503700256348 + }, + { + "auxiliary_loss_clip": 0.01489087, + "auxiliary_loss_mlp": 0.01123997, + "balance_loss_clip": 1.06229401, + "balance_loss_mlp": 1.154405, + "epoch": 0.008417255373515706, + "flos": 27344898529920.0, + "grad_norm": 2.5553770836970675, + "language_loss": 0.85446793, + "learning_rate": 3.181687263893095e-06, + "loss": 0.88059878, + "num_input_tokens_seen": 2905960, + "router_z_loss_clip": 0.6171875, + "router_z_loss_mlp": 3.34375, + "step": 140, + "time_per_iteration": 2.649454116821289 + }, + { + "auxiliary_loss_clip": 0.01476267, + "auxiliary_loss_mlp": 0.0111889, + "balance_loss_clip": 1.0594281, + "balance_loss_mlp": 1.14865911, + "epoch": 0.008477378626183677, + "flos": 17639070589440.0, + "grad_norm": 2.379593217845822, + "language_loss": 0.84156519, + "learning_rate": 3.186269861057098e-06, + "loss": 0.86751676, + "num_input_tokens_seen": 2922780, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.28125, + "step": 141, + "time_per_iteration": 2.608603000640869 + }, + { + "auxiliary_loss_clip": 0.01480312, + "auxiliary_loss_mlp": 0.01134333, + "balance_loss_clip": 1.07320273, + "balance_loss_mlp": 1.14624739, + "epoch": 0.008537501878851645, + "flos": 13881342297600.0, + "grad_norm": 2.3283494467369965, + "language_loss": 0.81387591, + "learning_rate": 3.1908200721048745e-06, + "loss": 0.84002233, + "num_input_tokens_seen": 2938765, + "router_z_loss_clip": 0.609375, + "router_z_loss_mlp": 3.34375, + "step": 142, + "time_per_iteration": 4.023308753967285 + }, + { + "auxiliary_loss_clip": 0.01378722, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.00621629, + "balance_loss_mlp": 1.1918689, + "epoch": 0.008597625131519616, + "flos": 71248101281280.0, + "grad_norm": 1.0451783350372967, + "language_loss": 0.66831523, + "learning_rate": 3.195338351584042e-06, + "loss": 0.69242978, + "num_input_tokens_seen": 3006665, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.8671875, + "step": 143, + "time_per_iteration": 4.718023777008057 + }, + { + "auxiliary_loss_clip": 0.01472184, + "auxiliary_loss_mlp": 0.0112263, + "balance_loss_clip": 1.06283474, + "balance_loss_mlp": 1.14625573, + "epoch": 0.008657748384187584, + "flos": 17602836744960.0, + "grad_norm": 2.2608538764922295, + "language_loss": 0.83954072, + "learning_rate": 3.1998251445393258e-06, + "loss": 0.86548889, + "num_input_tokens_seen": 3024335, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.25, + "step": 144, + "time_per_iteration": 2.5878453254699707 + }, + { + "auxiliary_loss_clip": 0.01457808, + "auxiliary_loss_mlp": 0.01111605, + "balance_loss_clip": 1.04890084, + "balance_loss_mlp": 1.13930941, + "epoch": 0.008717871636855555, + "flos": 19715317459200.0, + "grad_norm": 2.241812154138119, + "language_loss": 0.88511693, + "learning_rate": 3.204280886775619e-06, + "loss": 0.91081107, + "num_input_tokens_seen": 3043300, + "router_z_loss_clip": 0.62890625, + "router_z_loss_mlp": 3.1875, + "step": 145, + "time_per_iteration": 2.586512565612793 + }, + { + "auxiliary_loss_clip": 0.01475641, + "auxiliary_loss_mlp": 0.01124002, + "balance_loss_clip": 1.06153631, + "balance_loss_mlp": 1.14211285, + "epoch": 0.008777994889523523, + "flos": 24717422568960.0, + "grad_norm": 1.792984011276012, + "language_loss": 0.85949898, + "learning_rate": 3.208706005112005e-06, + "loss": 0.88549542, + "num_input_tokens_seen": 3064610, + "router_z_loss_clip": 0.625, + "router_z_loss_mlp": 3.34375, + "step": 146, + "time_per_iteration": 2.6258151531219482 + }, + { + "auxiliary_loss_clip": 0.01359324, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.00082254, + "balance_loss_mlp": 1.17825258, + "epoch": 0.008838118142191492, + "flos": 70132067758080.0, + "grad_norm": 0.8557738136673508, + "language_loss": 0.60047674, + "learning_rate": 3.213100917627104e-06, + "loss": 0.62433958, + "num_input_tokens_seen": 3130385, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.8125, + "step": 147, + "time_per_iteration": 3.2522764205932617 + }, + { + "auxiliary_loss_clip": 0.01465546, + "auxiliary_loss_mlp": 0.01124118, + "balance_loss_clip": 1.06670642, + "balance_loss_mlp": 1.14550173, + "epoch": 0.008898241394859462, + "flos": 20044797937920.0, + "grad_norm": 1.8343461268862185, + "language_loss": 0.8454501, + "learning_rate": 3.2174660338961135e-06, + "loss": 0.87134671, + "num_input_tokens_seen": 3149760, + "router_z_loss_clip": 0.57421875, + "router_z_loss_mlp": 3.203125, + "step": 148, + "time_per_iteration": 2.635499954223633 + }, + { + "auxiliary_loss_clip": 0.0147086, + "auxiliary_loss_mlp": 0.01143141, + "balance_loss_clip": 1.07914925, + "balance_loss_mlp": 1.14693797, + "epoch": 0.008958364647527431, + "flos": 10743611685120.0, + "grad_norm": 2.2581185064103404, + "language_loss": 0.88802874, + "learning_rate": 3.2218017552198588e-06, + "loss": 0.91416872, + "num_input_tokens_seen": 3164500, + "router_z_loss_clip": 0.640625, + "router_z_loss_mlp": 3.234375, + "step": 149, + "time_per_iteration": 2.5458836555480957 + }, + { + "auxiliary_loss_clip": 0.01466862, + "auxiliary_loss_mlp": 0.01112061, + "balance_loss_clip": 1.05445874, + "balance_loss_mlp": 1.14131117, + "epoch": 0.009018487900195401, + "flos": 29127467802240.0, + "grad_norm": 2.7760320197047097, + "language_loss": 0.93054724, + "learning_rate": 3.226108474846181e-06, + "loss": 0.95633656, + "num_input_tokens_seen": 3182455, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 150, + "time_per_iteration": 2.648111343383789 + }, + { + "auxiliary_loss_clip": 0.01454371, + "auxiliary_loss_mlp": 0.01109463, + "balance_loss_clip": 1.05391192, + "balance_loss_mlp": 1.13663483, + "epoch": 0.00907861115286337, + "flos": 32963661354240.0, + "grad_norm": 1.9005080345968057, + "language_loss": 0.74303263, + "learning_rate": 3.2303865781839817e-06, + "loss": 0.76867104, + "num_input_tokens_seen": 3203995, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.171875, + "step": 151, + "time_per_iteration": 2.6953165531158447 + }, + { + "auxiliary_loss_clip": 0.014664, + "auxiliary_loss_mlp": 0.01125146, + "balance_loss_clip": 1.06735289, + "balance_loss_mlp": 1.14143276, + "epoch": 0.009138734405531338, + "flos": 21762441377280.0, + "grad_norm": 2.6241423805649298, + "language_loss": 0.88251799, + "learning_rate": 3.234636443010188e-06, + "loss": 0.90843344, + "num_input_tokens_seen": 3222575, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.25, + "step": 152, + "time_per_iteration": 2.6034231185913086 + }, + { + "auxiliary_loss_clip": 0.01466383, + "auxiliary_loss_mlp": 0.01121244, + "balance_loss_clip": 1.0628314, + "balance_loss_mlp": 1.14757276, + "epoch": 0.009198857658199309, + "flos": 20842517134080.0, + "grad_norm": 3.4062301864690196, + "language_loss": 0.83957756, + "learning_rate": 3.238858439669943e-06, + "loss": 0.86545384, + "num_input_tokens_seen": 3240180, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 153, + "time_per_iteration": 2.6023271083831787 + }, + { + "auxiliary_loss_clip": 0.01456394, + "auxiliary_loss_mlp": 0.01136316, + "balance_loss_clip": 1.0765202, + "balance_loss_mlp": 1.13805962, + "epoch": 0.009258980910867277, + "flos": 24827381078400.0, + "grad_norm": 1.9441527650945287, + "language_loss": 0.89881843, + "learning_rate": 3.2430529312702712e-06, + "loss": 0.92474556, + "num_input_tokens_seen": 3259800, + "router_z_loss_clip": 0.59765625, + "router_z_loss_mlp": 3.1875, + "step": 154, + "time_per_iteration": 2.646308183670044 + }, + { + "auxiliary_loss_clip": 0.01460439, + "auxiliary_loss_mlp": 0.01154617, + "balance_loss_clip": 1.09577537, + "balance_loss_mlp": 1.14094579, + "epoch": 0.009319104163535248, + "flos": 28767786963840.0, + "grad_norm": 2.0692323216259187, + "language_loss": 0.89471745, + "learning_rate": 3.2472202738674737e-06, + "loss": 0.92086804, + "num_input_tokens_seen": 3280400, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.1875, + "step": 155, + "time_per_iteration": 2.6336286067962646 + }, + { + "auxiliary_loss_clip": 0.01463585, + "auxiliary_loss_mlp": 0.01116238, + "balance_loss_clip": 1.05894589, + "balance_loss_mlp": 1.13895822, + "epoch": 0.009379227416203216, + "flos": 16582004219520.0, + "grad_norm": 3.3077298720636255, + "language_loss": 0.86882627, + "learning_rate": 3.2513608166485063e-06, + "loss": 0.89462447, + "num_input_tokens_seen": 3297600, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.25, + "step": 156, + "time_per_iteration": 2.5539867877960205 + }, + { + "auxiliary_loss_clip": 0.01462083, + "auxiliary_loss_mlp": 0.01121969, + "balance_loss_clip": 1.06408143, + "balance_loss_mlp": 1.14298415, + "epoch": 0.009439350668871187, + "flos": 18329919845760.0, + "grad_norm": 2.4916444524903527, + "language_loss": 0.99553013, + "learning_rate": 3.2554749021065498e-06, + "loss": 1.02137065, + "num_input_tokens_seen": 3313635, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.1875, + "step": 157, + "time_per_iteration": 2.5249693393707275 + }, + { + "auxiliary_loss_clip": 0.01445636, + "auxiliary_loss_mlp": 0.01139016, + "balance_loss_clip": 1.08146214, + "balance_loss_mlp": 1.1366899, + "epoch": 0.009499473921539155, + "flos": 24349912565760.0, + "grad_norm": 2.0302475566757225, + "language_loss": 0.8847568, + "learning_rate": 3.2595628662110186e-06, + "loss": 0.91060334, + "num_input_tokens_seen": 3333735, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.09375, + "step": 158, + "time_per_iteration": 2.6009252071380615 + }, + { + "auxiliary_loss_clip": 0.01452439, + "auxiliary_loss_mlp": 0.01124253, + "balance_loss_clip": 1.06555486, + "balance_loss_mlp": 1.13677907, + "epoch": 0.009559597174207124, + "flos": 16399326625920.0, + "grad_norm": 4.310723443959545, + "language_loss": 0.86534697, + "learning_rate": 3.2636250385721982e-06, + "loss": 0.89111388, + "num_input_tokens_seen": 3348800, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.15625, + "step": 159, + "time_per_iteration": 2.6107394695281982 + }, + { + "auxiliary_loss_clip": 0.01442093, + "auxiliary_loss_mlp": 0.01132817, + "balance_loss_clip": 1.07340288, + "balance_loss_mlp": 1.13145089, + "epoch": 0.009619720426875094, + "flos": 22856890826880.0, + "grad_norm": 1.790220267572532, + "language_loss": 0.86825597, + "learning_rate": 3.2676617426007263e-06, + "loss": 0.89400506, + "num_input_tokens_seen": 3368595, + "router_z_loss_clip": 0.59375, + "router_z_loss_mlp": 3.109375, + "step": 160, + "time_per_iteration": 2.574252128601074 + }, + { + "auxiliary_loss_clip": 0.01449537, + "auxiliary_loss_mlp": 0.01117828, + "balance_loss_clip": 1.06318271, + "balance_loss_mlp": 1.13704872, + "epoch": 0.009679843679543063, + "flos": 19135001329920.0, + "grad_norm": 2.6107931748588893, + "language_loss": 0.91542315, + "learning_rate": 3.2716732956621042e-06, + "loss": 0.94109678, + "num_input_tokens_seen": 3384975, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.125, + "step": 161, + "time_per_iteration": 2.550865650177002 + }, + { + "auxiliary_loss_clip": 0.01454094, + "auxiliary_loss_mlp": 0.01109765, + "balance_loss_clip": 1.05488133, + "balance_loss_mlp": 1.13759339, + "epoch": 0.009739966932211033, + "flos": 20302995876480.0, + "grad_norm": 2.2107920101940994, + "language_loss": 0.91690832, + "learning_rate": 3.2756600092264203e-06, + "loss": 0.94254684, + "num_input_tokens_seen": 3404755, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.15625, + "step": 162, + "time_per_iteration": 2.5527970790863037 + }, + { + "auxiliary_loss_clip": 0.01312712, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.00331306, + "balance_loss_mlp": 1.14560354, + "epoch": 0.009800090184879002, + "flos": 67034234177280.0, + "grad_norm": 1.2615279464106541, + "language_loss": 0.72354776, + "learning_rate": 3.279622189013474e-06, + "loss": 0.74694741, + "num_input_tokens_seen": 3467210, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.671875, + "step": 163, + "time_per_iteration": 3.143763542175293 + }, + { + "auxiliary_loss_clip": 0.01440764, + "auxiliary_loss_mlp": 0.01113881, + "balance_loss_clip": 1.05804312, + "balance_loss_mlp": 1.13505006, + "epoch": 0.00986021343754697, + "flos": 17164690646400.0, + "grad_norm": 2.1923315312730374, + "language_loss": 0.8427155, + "learning_rate": 3.283560135133457e-06, + "loss": 0.86826193, + "num_input_tokens_seen": 3483220, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0625, + "step": 164, + "time_per_iteration": 2.5536584854125977 + }, + { + "auxiliary_loss_clip": 0.01429878, + "auxiliary_loss_mlp": 0.01100497, + "balance_loss_clip": 1.04585135, + "balance_loss_mlp": 1.12637794, + "epoch": 0.00992033669021494, + "flos": 17749424148480.0, + "grad_norm": 2.006756380443377, + "language_loss": 0.89215541, + "learning_rate": 3.2874741422233565e-06, + "loss": 0.91745919, + "num_input_tokens_seen": 3501465, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 3.03125, + "step": 165, + "time_per_iteration": 2.5313210487365723 + }, + { + "auxiliary_loss_clip": 0.01434156, + "auxiliary_loss_mlp": 0.01127756, + "balance_loss_clip": 1.0692482, + "balance_loss_mlp": 1.12764359, + "epoch": 0.00998045994288291, + "flos": 25297164080640.0, + "grad_norm": 6.432940691763592, + "language_loss": 0.80138129, + "learning_rate": 3.2913644995792465e-06, + "loss": 0.82700044, + "num_input_tokens_seen": 3520480, + "router_z_loss_clip": 0.5859375, + "router_z_loss_mlp": 3.0625, + "step": 166, + "time_per_iteration": 2.6461095809936523 + }, + { + "auxiliary_loss_clip": 0.01438531, + "auxiliary_loss_mlp": 0.01125189, + "balance_loss_clip": 1.06749213, + "balance_loss_mlp": 1.13121533, + "epoch": 0.01004058319555088, + "flos": 32298954220800.0, + "grad_norm": 2.334124726802297, + "language_loss": 0.9190954, + "learning_rate": 3.2952314912845914e-06, + "loss": 0.94473255, + "num_input_tokens_seen": 3539570, + "router_z_loss_clip": 0.578125, + "router_z_loss_mlp": 3.078125, + "step": 167, + "time_per_iteration": 2.655597448348999 + }, + { + "auxiliary_loss_clip": 0.01430369, + "auxiliary_loss_mlp": 0.01135101, + "balance_loss_clip": 1.07997894, + "balance_loss_mlp": 1.12960708, + "epoch": 0.010100706448218848, + "flos": 11319941404800.0, + "grad_norm": 3.1870046541457873, + "language_loss": 0.90852308, + "learning_rate": 3.299075396334735e-06, + "loss": 0.93417776, + "num_input_tokens_seen": 3555465, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 3.0, + "step": 168, + "time_per_iteration": 2.5387983322143555 + }, + { + "auxiliary_loss_clip": 0.01424973, + "auxiliary_loss_mlp": 0.01106848, + "balance_loss_clip": 1.05072391, + "balance_loss_mlp": 1.12456727, + "epoch": 0.010160829700886819, + "flos": 29719491765120.0, + "grad_norm": 2.0495813916191077, + "language_loss": 0.87094414, + "learning_rate": 3.3028964887576868e-06, + "loss": 0.89626241, + "num_input_tokens_seen": 3578970, + "router_z_loss_clip": 0.5625, + "router_z_loss_mlp": 3.0, + "step": 169, + "time_per_iteration": 2.6448419094085693 + }, + { + "auxiliary_loss_clip": 0.01426284, + "auxiliary_loss_mlp": 0.01111393, + "balance_loss_clip": 1.05548358, + "balance_loss_mlp": 1.12704372, + "epoch": 0.010220952953554787, + "flos": 20412343854720.0, + "grad_norm": 3.0203817486241973, + "language_loss": 0.84758192, + "learning_rate": 3.306695037731344e-06, + "loss": 0.87295866, + "num_input_tokens_seen": 3597275, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 170, + "time_per_iteration": 2.5596489906311035 + }, + { + "auxiliary_loss_clip": 0.01435879, + "auxiliary_loss_mlp": 0.01136565, + "balance_loss_clip": 1.07963061, + "balance_loss_mlp": 1.12765205, + "epoch": 0.010281076206222756, + "flos": 31285124847360.0, + "grad_norm": 2.124400250788896, + "language_loss": 0.89896494, + "learning_rate": 3.3104713076972827e-06, + "loss": 0.92468935, + "num_input_tokens_seen": 3618905, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.078125, + "step": 171, + "time_per_iteration": 2.71183180809021 + }, + { + "auxiliary_loss_clip": 0.01429687, + "auxiliary_loss_mlp": 0.01108406, + "balance_loss_clip": 1.05421364, + "balance_loss_mlp": 1.1300813, + "epoch": 0.010341199458890726, + "flos": 21982286568960.0, + "grad_norm": 2.015577645060998, + "language_loss": 0.88978243, + "learning_rate": 3.314225558471224e-06, + "loss": 0.91516334, + "num_input_tokens_seen": 3639610, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.0, + "step": 172, + "time_per_iteration": 2.6193771362304688 + }, + { + "auxiliary_loss_clip": 0.01419105, + "auxiliary_loss_mlp": 0.01124801, + "balance_loss_clip": 1.06986928, + "balance_loss_mlp": 1.12354624, + "epoch": 0.010401322711558695, + "flos": 30810529422720.0, + "grad_norm": 1.6868779107262128, + "language_loss": 0.81148165, + "learning_rate": 3.317958045350308e-06, + "loss": 0.83692074, + "num_input_tokens_seen": 3664030, + "router_z_loss_clip": 0.55078125, + "router_z_loss_mlp": 2.953125, + "step": 173, + "time_per_iteration": 2.656935691833496 + }, + { + "auxiliary_loss_clip": 0.01430653, + "auxiliary_loss_mlp": 0.0110718, + "balance_loss_clip": 1.05496693, + "balance_loss_mlp": 1.12733519, + "epoch": 0.010461445964226665, + "flos": 24715124098560.0, + "grad_norm": 2.1134597687554244, + "language_loss": 0.82498932, + "learning_rate": 3.3216690192172596e-06, + "loss": 0.85036767, + "num_input_tokens_seen": 3683615, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 3.03125, + "step": 174, + "time_per_iteration": 2.6050753593444824 + }, + { + "auxiliary_loss_clip": 0.01425822, + "auxiliary_loss_mlp": 0.01124156, + "balance_loss_clip": 1.06984437, + "balance_loss_mlp": 1.12589645, + "epoch": 0.010521569216894634, + "flos": 27710361457920.0, + "grad_norm": 2.6035215697191965, + "language_loss": 0.72699076, + "learning_rate": 3.325358726641591e-06, + "loss": 0.75249052, + "num_input_tokens_seen": 3704540, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 3.0, + "step": 175, + "time_per_iteration": 2.6859946250915527 + }, + { + "auxiliary_loss_clip": 0.01427679, + "auxiliary_loss_mlp": 0.0113274, + "balance_loss_clip": 1.07571054, + "balance_loss_mlp": 1.12603855, + "epoch": 0.010581692469562603, + "flos": 12458346122880.0, + "grad_norm": 2.402827576481816, + "language_loss": 0.98082507, + "learning_rate": 3.329027409977902e-06, + "loss": 1.00642931, + "num_input_tokens_seen": 3721320, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 176, + "time_per_iteration": 2.5405664443969727 + }, + { + "auxiliary_loss_clip": 0.01412838, + "auxiliary_loss_mlp": 0.01132631, + "balance_loss_clip": 1.08005941, + "balance_loss_mlp": 1.12270594, + "epoch": 0.010641815722230573, + "flos": 19427601519360.0, + "grad_norm": 2.3427037211777115, + "language_loss": 0.76749414, + "learning_rate": 3.3326753074614087e-06, + "loss": 0.79294884, + "num_input_tokens_seen": 3739385, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 177, + "time_per_iteration": 2.555553674697876 + }, + { + "auxiliary_loss_clip": 0.01423246, + "auxiliary_loss_mlp": 0.01104615, + "balance_loss_clip": 1.0507797, + "balance_loss_mlp": 1.12089574, + "epoch": 0.010701938974898541, + "flos": 18332577452160.0, + "grad_norm": 2.4108248963401464, + "language_loss": 0.76824659, + "learning_rate": 3.3363026533007716e-06, + "loss": 0.79352522, + "num_input_tokens_seen": 3756360, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 3.015625, + "step": 178, + "time_per_iteration": 2.5799388885498047 + }, + { + "auxiliary_loss_clip": 0.01429506, + "auxiliary_loss_mlp": 0.01108132, + "balance_loss_clip": 1.05224717, + "balance_loss_mlp": 1.12586653, + "epoch": 0.010762062227566512, + "flos": 19203985399680.0, + "grad_norm": 2.1918052506036174, + "language_loss": 0.84004253, + "learning_rate": 3.3399096777683303e-06, + "loss": 0.86541891, + "num_input_tokens_seen": 3773930, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.03125, + "step": 179, + "time_per_iteration": 2.5387184619903564 + }, + { + "auxiliary_loss_clip": 0.01420983, + "auxiliary_loss_mlp": 0.01112539, + "balance_loss_clip": 1.05677247, + "balance_loss_mlp": 1.12062979, + "epoch": 0.01082218548023448, + "flos": 31425427370880.0, + "grad_norm": 1.90488055395076, + "language_loss": 0.83719397, + "learning_rate": 3.3434966072878213e-06, + "loss": 0.86252916, + "num_input_tokens_seen": 3793630, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 3.0, + "step": 180, + "time_per_iteration": 2.6149253845214844 + }, + { + "auxiliary_loss_clip": 0.01421575, + "auxiliary_loss_mlp": 0.01120367, + "balance_loss_clip": 1.06503046, + "balance_loss_mlp": 1.1226536, + "epoch": 0.01088230873290245, + "flos": 25046436170880.0, + "grad_norm": 3.784573507260413, + "language_loss": 0.7774682, + "learning_rate": 3.3470636645196674e-06, + "loss": 0.80288756, + "num_input_tokens_seen": 3813610, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 3.0, + "step": 181, + "time_per_iteration": 2.5769712924957275 + }, + { + "auxiliary_loss_clip": 0.01417045, + "auxiliary_loss_mlp": 0.01131731, + "balance_loss_clip": 1.07732356, + "balance_loss_mlp": 1.11938787, + "epoch": 0.01094243198557042, + "flos": 22893411980160.0, + "grad_norm": 3.1835165271024377, + "language_loss": 0.76440376, + "learning_rate": 3.3506110684439156e-06, + "loss": 0.78989148, + "num_input_tokens_seen": 3831390, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.96875, + "step": 182, + "time_per_iteration": 2.5641353130340576 + }, + { + "auxiliary_loss_clip": 0.01412704, + "auxiliary_loss_mlp": 0.01127012, + "balance_loss_clip": 1.07122183, + "balance_loss_mlp": 1.11758399, + "epoch": 0.011002555238238388, + "flos": 17165049782400.0, + "grad_norm": 2.172025067133121, + "language_loss": 0.87377435, + "learning_rate": 3.3541390344409054e-06, + "loss": 0.89917147, + "num_input_tokens_seen": 3849705, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.953125, + "step": 183, + "time_per_iteration": 2.567457914352417 + }, + { + "auxiliary_loss_clip": 0.01415124, + "auxiliary_loss_mlp": 0.01114516, + "balance_loss_clip": 1.06397092, + "balance_loss_mlp": 1.1209594, + "epoch": 0.011062678490906358, + "flos": 22310150935680.0, + "grad_norm": 2.2669267607504255, + "language_loss": 0.86875558, + "learning_rate": 3.357647774369736e-06, + "loss": 0.89405191, + "num_input_tokens_seen": 3869230, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.953125, + "step": 184, + "time_per_iteration": 5.380701780319214 + }, + { + "auxiliary_loss_clip": 0.01411555, + "auxiliary_loss_mlp": 0.01107942, + "balance_loss_clip": 1.05308247, + "balance_loss_mlp": 1.12176847, + "epoch": 0.011122801743574327, + "flos": 24388373053440.0, + "grad_norm": 1.8448371257401488, + "language_loss": 0.83683228, + "learning_rate": 3.3611374966446085e-06, + "loss": 0.86202729, + "num_input_tokens_seen": 3889735, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.90625, + "step": 185, + "time_per_iteration": 2.5522208213806152 + }, + { + "auxiliary_loss_clip": 0.01420908, + "auxiliary_loss_mlp": 0.01109712, + "balance_loss_clip": 1.05253971, + "balance_loss_mlp": 1.11964798, + "epoch": 0.011182924996242297, + "flos": 18150258994560.0, + "grad_norm": 2.4162416092451475, + "language_loss": 0.71111757, + "learning_rate": 3.3646084063091142e-06, + "loss": 0.73642373, + "num_input_tokens_seen": 3908855, + "router_z_loss_clip": 0.5703125, + "router_z_loss_mlp": 3.015625, + "step": 186, + "time_per_iteration": 2.536498546600342 + }, + { + "auxiliary_loss_clip": 0.01416319, + "auxiliary_loss_mlp": 0.01107204, + "balance_loss_clip": 1.0558964, + "balance_loss_mlp": 1.11923158, + "epoch": 0.011243048248910266, + "flos": 15486800584320.0, + "grad_norm": 3.342492581434835, + "language_loss": 1.02028871, + "learning_rate": 3.3680607051085194e-06, + "loss": 1.04552388, + "num_input_tokens_seen": 3923865, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.96875, + "step": 187, + "time_per_iteration": 2.5189080238342285 + }, + { + "auxiliary_loss_clip": 0.01405552, + "auxiliary_loss_mlp": 0.01110459, + "balance_loss_clip": 1.05597997, + "balance_loss_mlp": 1.11834478, + "epoch": 0.011303171501578235, + "flos": 40916868986880.0, + "grad_norm": 1.6787333311747052, + "language_loss": 0.75107503, + "learning_rate": 3.371494591560139e-06, + "loss": 0.7762351, + "num_input_tokens_seen": 3946870, + "router_z_loss_clip": 0.546875, + "router_z_loss_mlp": 2.875, + "step": 188, + "time_per_iteration": 2.73420786857605 + }, + { + "auxiliary_loss_clip": 0.01292523, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.01273942, + "balance_loss_mlp": 1.13387585, + "epoch": 0.011363294754246205, + "flos": 66302697790080.0, + "grad_norm": 0.7700467396195164, + "language_loss": 0.56216431, + "learning_rate": 3.3749102610218297e-06, + "loss": 0.5854305, + "num_input_tokens_seen": 4010005, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.5859375, + "step": 189, + "time_per_iteration": 3.176280975341797 + }, + { + "auxiliary_loss_clip": 0.01402635, + "auxiliary_loss_mlp": 0.01121834, + "balance_loss_clip": 1.06742704, + "balance_loss_mlp": 1.1134795, + "epoch": 0.011423418006914174, + "flos": 24900279730560.0, + "grad_norm": 2.292403028528975, + "language_loss": 0.94771594, + "learning_rate": 3.3783079057586833e-06, + "loss": 0.97296059, + "num_input_tokens_seen": 4029035, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.90625, + "step": 190, + "time_per_iteration": 2.604132652282715 + }, + { + "auxiliary_loss_clip": 0.01403317, + "auxiliary_loss_mlp": 0.01101291, + "balance_loss_clip": 1.04964972, + "balance_loss_mlp": 1.11493886, + "epoch": 0.011483541259582144, + "flos": 19791879298560.0, + "grad_norm": 2.993049163405909, + "language_loss": 0.84462845, + "learning_rate": 3.3816877150079665e-06, + "loss": 0.8696745, + "num_input_tokens_seen": 4046995, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.875, + "step": 191, + "time_per_iteration": 2.569664716720581 + }, + { + "auxiliary_loss_clip": 0.01402316, + "auxiliary_loss_mlp": 0.01121031, + "balance_loss_clip": 1.0698905, + "balance_loss_mlp": 1.11087692, + "epoch": 0.011543664512250112, + "flos": 26176939896960.0, + "grad_norm": 2.0097697123850593, + "language_loss": 0.91439575, + "learning_rate": 3.385049875042367e-06, + "loss": 0.93962914, + "num_input_tokens_seen": 4065865, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 192, + "time_per_iteration": 2.6416900157928467 + }, + { + "auxiliary_loss_clip": 0.0139743, + "auxiliary_loss_mlp": 0.01113461, + "balance_loss_clip": 1.05776596, + "balance_loss_mlp": 1.11231375, + "epoch": 0.011603787764918083, + "flos": 23768985905280.0, + "grad_norm": 2.095754720056515, + "language_loss": 0.86849445, + "learning_rate": 3.3883945692315938e-06, + "loss": 0.89360332, + "num_input_tokens_seen": 4085305, + "router_z_loss_clip": 0.5546875, + "router_z_loss_mlp": 2.84375, + "step": 193, + "time_per_iteration": 2.569899797439575 + }, + { + "auxiliary_loss_clip": 0.01399232, + "auxiliary_loss_mlp": 0.01095137, + "balance_loss_clip": 1.04409146, + "balance_loss_mlp": 1.10937476, + "epoch": 0.011663911017586051, + "flos": 25954688494080.0, + "grad_norm": 2.446553756436178, + "language_loss": 0.92399615, + "learning_rate": 3.3917219781023906e-06, + "loss": 0.9489398, + "num_input_tokens_seen": 4105185, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.90625, + "step": 194, + "time_per_iteration": 2.6078743934631348 + }, + { + "auxiliary_loss_clip": 0.01405837, + "auxiliary_loss_mlp": 0.0110398, + "balance_loss_clip": 1.05188549, + "balance_loss_mlp": 1.11522019, + "epoch": 0.01172403427025402, + "flos": 17895149625600.0, + "grad_norm": 3.1413620570060052, + "language_loss": 0.89698559, + "learning_rate": 3.3950322793970014e-06, + "loss": 0.92208374, + "num_input_tokens_seen": 4123160, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.90625, + "step": 195, + "time_per_iteration": 2.5785820484161377 + }, + { + "auxiliary_loss_clip": 0.01400897, + "auxiliary_loss_mlp": 0.01117652, + "balance_loss_clip": 1.06345916, + "balance_loss_mlp": 1.11416054, + "epoch": 0.01178415752292199, + "flos": 17894539094400.0, + "grad_norm": 3.0173579296668813, + "language_loss": 0.8577168, + "learning_rate": 3.3983256481301445e-06, + "loss": 0.88290232, + "num_input_tokens_seen": 4140425, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.875, + "step": 196, + "time_per_iteration": 2.5492773056030273 + }, + { + "auxiliary_loss_clip": 0.01397107, + "auxiliary_loss_mlp": 0.01106206, + "balance_loss_clip": 1.05299139, + "balance_loss_mlp": 1.10991478, + "epoch": 0.011844280775589959, + "flos": 22893555634560.0, + "grad_norm": 2.86264810097015, + "language_loss": 0.93367243, + "learning_rate": 3.4016022566445335e-06, + "loss": 0.95870566, + "num_input_tokens_seen": 4159555, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.875, + "step": 197, + "time_per_iteration": 2.5488431453704834 + }, + { + "auxiliary_loss_clip": 0.01394686, + "auxiliary_loss_mlp": 0.0110986, + "balance_loss_clip": 1.05781317, + "balance_loss_mlp": 1.1120131, + "epoch": 0.01190440402825793, + "flos": 26980333441920.0, + "grad_norm": 2.1872318454948045, + "language_loss": 0.79184073, + "learning_rate": 3.4048622746649966e-06, + "loss": 0.81688625, + "num_input_tokens_seen": 4180480, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.828125, + "step": 198, + "time_per_iteration": 2.6208834648132324 + }, + { + "auxiliary_loss_clip": 0.01390401, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06545901, + "balance_loss_mlp": 1.11265802, + "epoch": 0.011964527280925898, + "flos": 20521584092160.0, + "grad_norm": 3.3720724842630663, + "language_loss": 0.88065112, + "learning_rate": 3.4081058693512278e-06, + "loss": 0.90571868, + "num_input_tokens_seen": 4198835, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.765625, + "step": 199, + "time_per_iteration": 2.5257043838500977 + }, + { + "auxiliary_loss_clip": 0.01403414, + "auxiliary_loss_mlp": 0.01121968, + "balance_loss_clip": 1.0658679, + "balance_loss_mlp": 1.11557496, + "epoch": 0.012024650533593867, + "flos": 27745984771200.0, + "grad_norm": 1.8432610551497841, + "language_loss": 0.81327617, + "learning_rate": 3.411333205349222e-06, + "loss": 0.83853, + "num_input_tokens_seen": 4219335, + "router_z_loss_clip": 0.55859375, + "router_z_loss_mlp": 2.875, + "step": 200, + "time_per_iteration": 2.593231201171875 + }, + { + "auxiliary_loss_clip": 0.01400536, + "auxiliary_loss_mlp": 0.01101092, + "balance_loss_clip": 1.04792464, + "balance_loss_mlp": 1.11138511, + "epoch": 0.012084773786261837, + "flos": 10452017076480.0, + "grad_norm": 2.758923223370522, + "language_loss": 0.87688923, + "learning_rate": 3.4145444448414217e-06, + "loss": 0.90190548, + "num_input_tokens_seen": 4236940, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.90625, + "step": 201, + "time_per_iteration": 2.5057122707366943 + }, + { + "auxiliary_loss_clip": 0.01401128, + "auxiliary_loss_mlp": 0.01110995, + "balance_loss_clip": 1.05751753, + "balance_loss_mlp": 1.1152513, + "epoch": 0.012144897038929806, + "flos": 23105751229440.0, + "grad_norm": 3.7927516715708736, + "language_loss": 0.84123611, + "learning_rate": 3.4177397475956223e-06, + "loss": 0.86635733, + "num_input_tokens_seen": 4256755, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.859375, + "step": 202, + "time_per_iteration": 2.555680751800537 + }, + { + "auxiliary_loss_clip": 0.01388205, + "auxiliary_loss_mlp": 0.01109065, + "balance_loss_clip": 1.05639839, + "balance_loss_mlp": 1.10674798, + "epoch": 0.012205020291597776, + "flos": 21033203460480.0, + "grad_norm": 1.9040504717952067, + "language_loss": 0.90116632, + "learning_rate": 3.4209192710126685e-06, + "loss": 0.926139, + "num_input_tokens_seen": 4276505, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.8125, + "step": 203, + "time_per_iteration": 2.526937484741211 + }, + { + "auxiliary_loss_clip": 0.01281494, + "auxiliary_loss_mlp": 0.01053133, + "balance_loss_clip": 1.03138971, + "balance_loss_mlp": 1.12054539, + "epoch": 0.012265143544265745, + "flos": 68447785075200.0, + "grad_norm": 1.0150955472927095, + "language_loss": 0.61259121, + "learning_rate": 3.4240831701729837e-06, + "loss": 0.63593745, + "num_input_tokens_seen": 4330965, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.609375, + "step": 204, + "time_per_iteration": 3.051469326019287 + }, + { + "auxiliary_loss_clip": 0.01398264, + "auxiliary_loss_mlp": 0.01111819, + "balance_loss_clip": 1.0593431, + "balance_loss_mlp": 1.11035323, + "epoch": 0.012325266796933715, + "flos": 17019252478080.0, + "grad_norm": 2.269022633654934, + "language_loss": 0.91206741, + "learning_rate": 3.4272315978819516e-06, + "loss": 0.93716824, + "num_input_tokens_seen": 4348200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.875, + "step": 205, + "time_per_iteration": 2.5105605125427246 + }, + { + "auxiliary_loss_clip": 0.01406073, + "auxiliary_loss_mlp": 0.01120568, + "balance_loss_clip": 1.06675649, + "balance_loss_mlp": 1.11524296, + "epoch": 0.012385390049601683, + "flos": 20190056538240.0, + "grad_norm": 2.2813283317886497, + "language_loss": 0.89215505, + "learning_rate": 3.4303647047142043e-06, + "loss": 0.91742146, + "num_input_tokens_seen": 4365460, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.90625, + "step": 206, + "time_per_iteration": 2.524327039718628 + }, + { + "auxiliary_loss_clip": 0.01394865, + "auxiliary_loss_mlp": 0.01101938, + "balance_loss_clip": 1.05039215, + "balance_loss_mlp": 1.10848641, + "epoch": 0.012445513302269652, + "flos": 16253134272000.0, + "grad_norm": 2.502758142715096, + "language_loss": 0.95368809, + "learning_rate": 3.43348263905683e-06, + "loss": 0.97865611, + "num_input_tokens_seen": 4383650, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.859375, + "step": 207, + "time_per_iteration": 2.5147407054901123 + }, + { + "auxiliary_loss_clip": 0.01393931, + "auxiliary_loss_mlp": 0.01116307, + "balance_loss_clip": 1.06416512, + "balance_loss_mlp": 1.11335945, + "epoch": 0.012505636554937622, + "flos": 23769380954880.0, + "grad_norm": 2.4565104125033232, + "language_loss": 0.75770479, + "learning_rate": 3.436585547151547e-06, + "loss": 0.78280723, + "num_input_tokens_seen": 4403765, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.8125, + "step": 208, + "time_per_iteration": 2.5426721572875977 + }, + { + "auxiliary_loss_clip": 0.01382601, + "auxiliary_loss_mlp": 0.01107359, + "balance_loss_clip": 1.05497861, + "balance_loss_mlp": 1.10796773, + "epoch": 0.012565759807605591, + "flos": 30591546157440.0, + "grad_norm": 2.79364384939249, + "language_loss": 0.98718858, + "learning_rate": 3.4396735731358586e-06, + "loss": 1.01208818, + "num_input_tokens_seen": 4421935, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 209, + "time_per_iteration": 2.607238292694092 + }, + { + "auxiliary_loss_clip": 0.01389293, + "auxiliary_loss_mlp": 0.01112212, + "balance_loss_clip": 1.05971253, + "balance_loss_mlp": 1.11020541, + "epoch": 0.012625883060273561, + "flos": 40113511355520.0, + "grad_norm": 7.039976369418198, + "language_loss": 0.85444254, + "learning_rate": 3.4427468590832302e-06, + "loss": 0.87945753, + "num_input_tokens_seen": 4441470, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.78125, + "step": 210, + "time_per_iteration": 2.67632794380188 + }, + { + "auxiliary_loss_clip": 0.01385349, + "auxiliary_loss_mlp": 0.01120301, + "balance_loss_clip": 1.07042408, + "balance_loss_mlp": 1.1073029, + "epoch": 0.01268600631294153, + "flos": 27089178629760.0, + "grad_norm": 2.2334441604414783, + "language_loss": 0.97016168, + "learning_rate": 3.445805545042314e-06, + "loss": 0.99521822, + "num_input_tokens_seen": 4459950, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.78125, + "step": 211, + "time_per_iteration": 2.5733633041381836 + }, + { + "auxiliary_loss_clip": 0.01394963, + "auxiliary_loss_mlp": 0.01114691, + "balance_loss_clip": 1.0616188, + "balance_loss_mlp": 1.11342549, + "epoch": 0.012746129565609499, + "flos": 16982767238400.0, + "grad_norm": 3.6563211355425453, + "language_loss": 0.95188707, + "learning_rate": 3.448849769075239e-06, + "loss": 0.97698367, + "num_input_tokens_seen": 4478390, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.8125, + "step": 212, + "time_per_iteration": 2.5224313735961914 + }, + { + "auxiliary_loss_clip": 0.01383511, + "auxiliary_loss_mlp": 0.01116361, + "balance_loss_clip": 1.06376541, + "balance_loss_mlp": 1.10996664, + "epoch": 0.012806252818277469, + "flos": 46533476995200.0, + "grad_norm": 2.0395830195466504, + "language_loss": 0.76049221, + "learning_rate": 3.4518796672950093e-06, + "loss": 0.78549099, + "num_input_tokens_seen": 4501665, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.734375, + "step": 213, + "time_per_iteration": 2.76625919342041 + }, + { + "auxiliary_loss_clip": 0.0138732, + "auxiliary_loss_mlp": 0.01103154, + "balance_loss_clip": 1.052037, + "balance_loss_mlp": 1.10833097, + "epoch": 0.012866376070945438, + "flos": 14388616120320.0, + "grad_norm": 8.414558483522654, + "language_loss": 0.86754733, + "learning_rate": 3.4548953739020187e-06, + "loss": 0.89245206, + "num_input_tokens_seen": 4519055, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.78125, + "step": 214, + "time_per_iteration": 2.500417470932007 + }, + { + "auxiliary_loss_clip": 0.0138682, + "auxiliary_loss_mlp": 0.01127788, + "balance_loss_clip": 1.07397687, + "balance_loss_mlp": 1.11549139, + "epoch": 0.012926499323613408, + "flos": 26140813793280.0, + "grad_norm": 2.3854037050744057, + "language_loss": 0.77357471, + "learning_rate": 3.4578970212197196e-06, + "loss": 0.79872084, + "num_input_tokens_seen": 4540870, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 215, + "time_per_iteration": 2.6116256713867188 + }, + { + "auxiliary_loss_clip": 0.01394912, + "auxiliary_loss_mlp": 0.01111048, + "balance_loss_clip": 1.06002641, + "balance_loss_mlp": 1.11393261, + "epoch": 0.012986622576281377, + "flos": 30117202128000.0, + "grad_norm": 2.44498430810385, + "language_loss": 0.90545797, + "learning_rate": 3.460884739729461e-06, + "loss": 0.93051755, + "num_input_tokens_seen": 4560395, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.8125, + "step": 216, + "time_per_iteration": 2.5903706550598145 + }, + { + "auxiliary_loss_clip": 0.0138678, + "auxiliary_loss_mlp": 0.01107632, + "balance_loss_clip": 1.05622888, + "balance_loss_mlp": 1.10772836, + "epoch": 0.013046745828949347, + "flos": 13954025468160.0, + "grad_norm": 2.630220300857062, + "language_loss": 0.93660516, + "learning_rate": 3.463858658104523e-06, + "loss": 0.96154928, + "num_input_tokens_seen": 4575785, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.78125, + "step": 217, + "time_per_iteration": 2.5109100341796875 + }, + { + "auxiliary_loss_clip": 0.01381618, + "auxiliary_loss_mlp": 0.01107152, + "balance_loss_clip": 1.05360317, + "balance_loss_mlp": 1.10700643, + "epoch": 0.013106869081617315, + "flos": 17347835116800.0, + "grad_norm": 1.9165712032980975, + "language_loss": 0.93656206, + "learning_rate": 3.4668189032433696e-06, + "loss": 0.96144974, + "num_input_tokens_seen": 4594985, + "router_z_loss_clip": 0.53515625, + "router_z_loss_mlp": 2.75, + "step": 218, + "time_per_iteration": 2.6586077213287354 + }, + { + "auxiliary_loss_clip": 0.01376505, + "auxiliary_loss_mlp": 0.01108753, + "balance_loss_clip": 1.05820787, + "balance_loss_mlp": 1.10663593, + "epoch": 0.013166992334285284, + "flos": 25884914325120.0, + "grad_norm": 1.916363531530835, + "language_loss": 0.86148179, + "learning_rate": 3.46976560030214e-06, + "loss": 0.88633436, + "num_input_tokens_seen": 4616125, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.703125, + "step": 219, + "time_per_iteration": 2.584040880203247 + }, + { + "auxiliary_loss_clip": 0.01383955, + "auxiliary_loss_mlp": 0.01101272, + "balance_loss_clip": 1.05056047, + "balance_loss_mlp": 1.110309, + "epoch": 0.013227115586953254, + "flos": 31175956437120.0, + "grad_norm": 1.7731463199764816, + "language_loss": 0.87598741, + "learning_rate": 3.4726988727263976e-06, + "loss": 0.90083969, + "num_input_tokens_seen": 4637795, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.75, + "step": 220, + "time_per_iteration": 2.6294186115264893 + }, + { + "auxiliary_loss_clip": 0.01373821, + "auxiliary_loss_mlp": 0.01103316, + "balance_loss_clip": 1.05663311, + "balance_loss_mlp": 1.10389161, + "epoch": 0.013287238839621223, + "flos": 20409470766720.0, + "grad_norm": 1.991547522293572, + "language_loss": 0.86413074, + "learning_rate": 3.475618842282164e-06, + "loss": 0.88890207, + "num_input_tokens_seen": 4656835, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.6875, + "step": 221, + "time_per_iteration": 2.606137990951538 + }, + { + "auxiliary_loss_clip": 0.0137878, + "auxiliary_loss_mlp": 0.01109834, + "balance_loss_clip": 1.05800176, + "balance_loss_mlp": 1.10240269, + "epoch": 0.013347362092289193, + "flos": 14137134024960.0, + "grad_norm": 2.017045003530743, + "language_loss": 0.92153138, + "learning_rate": 3.4785256290862486e-06, + "loss": 0.94641757, + "num_input_tokens_seen": 4673015, + "router_z_loss_clip": 0.51953125, + "router_z_loss_mlp": 2.765625, + "step": 222, + "time_per_iteration": 2.6237566471099854 + }, + { + "auxiliary_loss_clip": 0.01377393, + "auxiliary_loss_mlp": 0.01105441, + "balance_loss_clip": 1.05129576, + "balance_loss_mlp": 1.10672021, + "epoch": 0.013407485344957162, + "flos": 21797705554560.0, + "grad_norm": 2.7127164790698606, + "language_loss": 0.95539695, + "learning_rate": 3.481419351635897e-06, + "loss": 0.98022527, + "num_input_tokens_seen": 4692355, + "router_z_loss_clip": 0.54296875, + "router_z_loss_mlp": 2.71875, + "step": 223, + "time_per_iteration": 2.679387092590332 + }, + { + "auxiliary_loss_clip": 0.01377947, + "auxiliary_loss_mlp": 0.01106927, + "balance_loss_clip": 1.05612004, + "balance_loss_mlp": 1.10671806, + "epoch": 0.013467608597625132, + "flos": 18621622195200.0, + "grad_norm": 2.5543531214735586, + "language_loss": 0.88022512, + "learning_rate": 3.484300126837776e-06, + "loss": 0.90507382, + "num_input_tokens_seen": 4710080, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.71875, + "step": 224, + "time_per_iteration": 2.6327528953552246 + }, + { + "auxiliary_loss_clip": 0.0137715, + "auxiliary_loss_mlp": 0.01102713, + "balance_loss_clip": 1.04873466, + "balance_loss_mlp": 1.10632586, + "epoch": 0.013527731850293101, + "flos": 18552314903040.0, + "grad_norm": 2.0812591886363183, + "language_loss": 0.89642018, + "learning_rate": 3.487168070036317e-06, + "loss": 0.92121875, + "num_input_tokens_seen": 4728980, + "router_z_loss_clip": 0.5390625, + "router_z_loss_mlp": 2.703125, + "step": 225, + "time_per_iteration": 2.511749505996704 + }, + { + "auxiliary_loss_clip": 0.01374075, + "auxiliary_loss_mlp": 0.01115854, + "balance_loss_clip": 1.06273401, + "balance_loss_mlp": 1.10547256, + "epoch": 0.01358785510296107, + "flos": 19165381257600.0, + "grad_norm": 2.1555099546542142, + "language_loss": 0.99022663, + "learning_rate": 3.4900232950414224e-06, + "loss": 1.01512599, + "num_input_tokens_seen": 4747020, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.6875, + "step": 226, + "time_per_iteration": 5.38438868522644 + }, + { + "auxiliary_loss_clip": 0.0137773, + "auxiliary_loss_mlp": 0.01111487, + "balance_loss_clip": 1.0584867, + "balance_loss_mlp": 1.10696185, + "epoch": 0.01364797835562904, + "flos": 23329941966720.0, + "grad_norm": 15.523681056640678, + "language_loss": 0.91210413, + "learning_rate": 3.4928659141555727e-06, + "loss": 0.93699628, + "num_input_tokens_seen": 4765000, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 227, + "time_per_iteration": 2.5391762256622314 + }, + { + "auxiliary_loss_clip": 0.01252818, + "auxiliary_loss_mlp": 0.01025357, + "balance_loss_clip": 1.00666487, + "balance_loss_mlp": 1.10911703, + "epoch": 0.013708101608297009, + "flos": 70993746097920.0, + "grad_norm": 0.99230217192713, + "language_loss": 0.57680154, + "learning_rate": 3.4956960382003234e-06, + "loss": 0.59958327, + "num_input_tokens_seen": 4833210, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.4375, + "step": 228, + "time_per_iteration": 3.1981163024902344 + }, + { + "auxiliary_loss_clip": 0.0136686, + "auxiliary_loss_mlp": 0.01110654, + "balance_loss_clip": 1.06127763, + "balance_loss_mlp": 1.10228515, + "epoch": 0.013768224860964979, + "flos": 16325170997760.0, + "grad_norm": 2.2779006264878374, + "language_loss": 0.8759563, + "learning_rate": 3.4985137765422354e-06, + "loss": 0.90073144, + "num_input_tokens_seen": 4850120, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 229, + "time_per_iteration": 2.49130916595459 + }, + { + "auxiliary_loss_clip": 0.01377631, + "auxiliary_loss_mlp": 0.01101911, + "balance_loss_clip": 1.05212951, + "balance_loss_mlp": 1.10486007, + "epoch": 0.013828348113632948, + "flos": 20193037367040.0, + "grad_norm": 4.280679608747667, + "language_loss": 0.84247303, + "learning_rate": 3.501319237118231e-06, + "loss": 0.8672685, + "num_input_tokens_seen": 4866215, + "router_z_loss_clip": 0.49804688, + "router_z_loss_mlp": 2.734375, + "step": 230, + "time_per_iteration": 2.501218557357788 + }, + { + "auxiliary_loss_clip": 0.01375417, + "auxiliary_loss_mlp": 0.0111628, + "balance_loss_clip": 1.06671298, + "balance_loss_mlp": 1.10600948, + "epoch": 0.013888471366300916, + "flos": 20741070147840.0, + "grad_norm": 1.78964280876859, + "language_loss": 0.90378422, + "learning_rate": 3.5041125264604056e-06, + "loss": 0.92870116, + "num_input_tokens_seen": 4885630, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.6875, + "step": 231, + "time_per_iteration": 2.541137456893921 + }, + { + "auxiliary_loss_clip": 0.01377441, + "auxiliary_loss_mlp": 0.01108629, + "balance_loss_clip": 1.05941916, + "balance_loss_mlp": 1.10821056, + "epoch": 0.013948594618968886, + "flos": 22090628966400.0, + "grad_norm": 2.031489983297281, + "language_loss": 0.83706695, + "learning_rate": 3.5068937497203002e-06, + "loss": 0.86192763, + "num_input_tokens_seen": 4905570, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.6875, + "step": 232, + "time_per_iteration": 2.5444753170013428 + }, + { + "auxiliary_loss_clip": 0.0137977, + "auxiliary_loss_mlp": 0.01092372, + "balance_loss_clip": 1.04125488, + "balance_loss_mlp": 1.10017753, + "epoch": 0.014008717871636855, + "flos": 19063108258560.0, + "grad_norm": 2.928489064169697, + "language_loss": 0.74033689, + "learning_rate": 3.509663010692652e-06, + "loss": 0.76505834, + "num_input_tokens_seen": 4923535, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.796875, + "step": 233, + "time_per_iteration": 2.5364952087402344 + }, + { + "auxiliary_loss_clip": 0.01382965, + "auxiliary_loss_mlp": 0.0112384, + "balance_loss_clip": 1.07141209, + "balance_loss_mlp": 1.10741055, + "epoch": 0.014068841124304825, + "flos": 14530822064640.0, + "grad_norm": 2.287774019631123, + "language_loss": 0.85867143, + "learning_rate": 3.512420411838642e-06, + "loss": 0.88373953, + "num_input_tokens_seen": 4939200, + "router_z_loss_clip": 0.5234375, + "router_z_loss_mlp": 2.75, + "step": 234, + "time_per_iteration": 2.532949209213257 + }, + { + "auxiliary_loss_clip": 0.01375298, + "auxiliary_loss_mlp": 0.01106064, + "balance_loss_clip": 1.05683041, + "balance_loss_mlp": 1.10759592, + "epoch": 0.014128964376972794, + "flos": 18077396256000.0, + "grad_norm": 2.6527993685177154, + "language_loss": 0.89144391, + "learning_rate": 3.515166054308634e-06, + "loss": 0.9162575, + "num_input_tokens_seen": 4956620, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.671875, + "step": 235, + "time_per_iteration": 2.509592294692993 + }, + { + "auxiliary_loss_clip": 0.0137416, + "auxiliary_loss_mlp": 0.01119384, + "balance_loss_clip": 1.06874382, + "balance_loss_mlp": 1.10830367, + "epoch": 0.014189087629640764, + "flos": 25334331678720.0, + "grad_norm": 4.054998173736759, + "language_loss": 0.85780042, + "learning_rate": 3.5179000379644498e-06, + "loss": 0.88273585, + "num_input_tokens_seen": 4975650, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.65625, + "step": 236, + "time_per_iteration": 2.744925022125244 + }, + { + "auxiliary_loss_clip": 0.0137118, + "auxiliary_loss_mlp": 0.01099258, + "balance_loss_clip": 1.04871392, + "balance_loss_mlp": 1.10178149, + "epoch": 0.014249210882308733, + "flos": 36139744713600.0, + "grad_norm": 2.128422813257453, + "language_loss": 0.82452404, + "learning_rate": 3.520622461401154e-06, + "loss": 0.84922838, + "num_input_tokens_seen": 4997415, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.6875, + "step": 237, + "time_per_iteration": 2.67307710647583 + }, + { + "auxiliary_loss_clip": 0.01369116, + "auxiliary_loss_mlp": 0.01116968, + "balance_loss_clip": 1.0643487, + "balance_loss_mlp": 1.10451889, + "epoch": 0.014309334134976702, + "flos": 12932977461120.0, + "grad_norm": 3.103781307849977, + "language_loss": 0.77321362, + "learning_rate": 3.5233334219683935e-06, + "loss": 0.79807448, + "num_input_tokens_seen": 5013905, + "router_z_loss_clip": 0.52734375, + "router_z_loss_mlp": 2.65625, + "step": 238, + "time_per_iteration": 2.4973809719085693 + }, + { + "auxiliary_loss_clip": 0.01368178, + "auxiliary_loss_mlp": 0.01112367, + "balance_loss_clip": 1.06566119, + "balance_loss_mlp": 1.10654771, + "epoch": 0.014369457387644672, + "flos": 20777519473920.0, + "grad_norm": 1.992064896075991, + "language_loss": 0.87370872, + "learning_rate": 3.526033015791284e-06, + "loss": 0.89851415, + "num_input_tokens_seen": 5033645, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.609375, + "step": 239, + "time_per_iteration": 2.554222583770752 + }, + { + "auxiliary_loss_clip": 0.01352979, + "auxiliary_loss_mlp": 0.01100535, + "balance_loss_clip": 1.05330408, + "balance_loss_mlp": 1.09776592, + "epoch": 0.01442958064031264, + "flos": 25848536826240.0, + "grad_norm": 2.2433371609956283, + "language_loss": 0.93297911, + "learning_rate": 3.528721337790862e-06, + "loss": 0.95751429, + "num_input_tokens_seen": 5052875, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.5625, + "step": 240, + "time_per_iteration": 2.588529348373413 + }, + { + "auxiliary_loss_clip": 0.01362634, + "auxiliary_loss_mlp": 0.01104045, + "balance_loss_clip": 1.05736244, + "balance_loss_mlp": 1.10324717, + "epoch": 0.014489703892980611, + "flos": 28219718269440.0, + "grad_norm": 2.299780828803648, + "language_loss": 0.85129881, + "learning_rate": 3.531398481704111e-06, + "loss": 0.8759656, + "num_input_tokens_seen": 5075005, + "router_z_loss_clip": 0.46679688, + "router_z_loss_mlp": 2.59375, + "step": 241, + "time_per_iteration": 2.607272148132324 + }, + { + "auxiliary_loss_clip": 0.01360778, + "auxiliary_loss_mlp": 0.01116022, + "balance_loss_clip": 1.06695509, + "balance_loss_mlp": 1.10865557, + "epoch": 0.01454982714564858, + "flos": 22490925108480.0, + "grad_norm": 1.927287768398498, + "language_loss": 0.88410223, + "learning_rate": 3.534064540103573e-06, + "loss": 0.90887022, + "num_input_tokens_seen": 5091875, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.53125, + "step": 242, + "time_per_iteration": 2.522657632827759 + }, + { + "auxiliary_loss_clip": 0.013595, + "auxiliary_loss_mlp": 0.0109979, + "balance_loss_clip": 1.04981756, + "balance_loss_mlp": 1.10147619, + "epoch": 0.014609950398316548, + "flos": 21653201139840.0, + "grad_norm": 2.6384412969740922, + "language_loss": 0.86817086, + "learning_rate": 3.536719604416555e-06, + "loss": 0.89276373, + "num_input_tokens_seen": 5111290, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.578125, + "step": 243, + "time_per_iteration": 2.5738751888275146 + }, + { + "auxiliary_loss_clip": 0.01366378, + "auxiliary_loss_mlp": 0.01105289, + "balance_loss_clip": 1.05574584, + "balance_loss_mlp": 1.10421979, + "epoch": 0.014670073650984519, + "flos": 21869993675520.0, + "grad_norm": 1.576084931358892, + "language_loss": 0.84271425, + "learning_rate": 3.5393637649439464e-06, + "loss": 0.86743093, + "num_input_tokens_seen": 5132265, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 244, + "time_per_iteration": 2.51370906829834 + }, + { + "auxiliary_loss_clip": 0.01374385, + "auxiliary_loss_mlp": 0.01115077, + "balance_loss_clip": 1.06403196, + "balance_loss_mlp": 1.10701251, + "epoch": 0.014730196903652487, + "flos": 23183713699200.0, + "grad_norm": 2.2775099056278916, + "language_loss": 0.78689361, + "learning_rate": 3.54199711087864e-06, + "loss": 0.8117882, + "num_input_tokens_seen": 5148575, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.671875, + "step": 245, + "time_per_iteration": 2.5579745769500732 + }, + { + "auxiliary_loss_clip": 0.01372772, + "auxiliary_loss_mlp": 0.0110276, + "balance_loss_clip": 1.04961681, + "balance_loss_mlp": 1.10232484, + "epoch": 0.014790320156320457, + "flos": 23222605150080.0, + "grad_norm": 2.2330220282190685, + "language_loss": 0.84241545, + "learning_rate": 3.5446197303235913e-06, + "loss": 0.86717069, + "num_input_tokens_seen": 5170415, + "router_z_loss_clip": 0.53125, + "router_z_loss_mlp": 2.703125, + "step": 246, + "time_per_iteration": 2.565614700317383 + }, + { + "auxiliary_loss_clip": 0.01367419, + "auxiliary_loss_mlp": 0.01097455, + "balance_loss_clip": 1.04722059, + "balance_loss_mlp": 1.10181057, + "epoch": 0.014850443408988426, + "flos": 15815490963840.0, + "grad_norm": 1.9335653980079095, + "language_loss": 0.9014703, + "learning_rate": 3.5472317103095034e-06, + "loss": 0.92611909, + "num_input_tokens_seen": 5188565, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 247, + "time_per_iteration": 2.5572896003723145 + }, + { + "auxiliary_loss_clip": 0.01365881, + "auxiliary_loss_mlp": 0.01097755, + "balance_loss_clip": 1.04952252, + "balance_loss_mlp": 1.09689593, + "epoch": 0.014910566661656396, + "flos": 22781657790720.0, + "grad_norm": 2.1205098484246734, + "language_loss": 0.78058362, + "learning_rate": 3.549833136812155e-06, + "loss": 0.80521989, + "num_input_tokens_seen": 5207810, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.6875, + "step": 248, + "time_per_iteration": 2.5365517139434814 + }, + { + "auxiliary_loss_clip": 0.0136687, + "auxiliary_loss_mlp": 0.01105288, + "balance_loss_clip": 1.05552983, + "balance_loss_mlp": 1.10545397, + "epoch": 0.014970689914324365, + "flos": 26865023806080.0, + "grad_norm": 2.1747011613954177, + "language_loss": 0.83849227, + "learning_rate": 3.552424094769381e-06, + "loss": 0.86321384, + "num_input_tokens_seen": 5226210, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.609375, + "step": 249, + "time_per_iteration": 2.6142020225524902 + }, + { + "auxiliary_loss_clip": 0.01360073, + "auxiliary_loss_mlp": 0.01106063, + "balance_loss_clip": 1.05806887, + "balance_loss_mlp": 1.09971058, + "epoch": 0.015030813166992334, + "flos": 13985662371840.0, + "grad_norm": 2.2137591284686455, + "language_loss": 0.93476778, + "learning_rate": 3.5550046680977174e-06, + "loss": 0.95942914, + "num_input_tokens_seen": 5241660, + "router_z_loss_clip": 0.47851562, + "router_z_loss_mlp": 2.609375, + "step": 250, + "time_per_iteration": 2.485686779022217 + }, + { + "auxiliary_loss_clip": 0.01369254, + "auxiliary_loss_mlp": 0.01114661, + "balance_loss_clip": 1.06351972, + "balance_loss_mlp": 1.10460913, + "epoch": 0.015090936419660304, + "flos": 24717817618560.0, + "grad_norm": 2.2612141068319622, + "language_loss": 0.97030997, + "learning_rate": 3.5575749397087034e-06, + "loss": 0.99514914, + "num_input_tokens_seen": 5261090, + "router_z_loss_clip": 0.51171875, + "router_z_loss_mlp": 2.640625, + "step": 251, + "time_per_iteration": 2.5887296199798584 + }, + { + "auxiliary_loss_clip": 0.01362288, + "auxiliary_loss_mlp": 0.01105325, + "balance_loss_clip": 1.05723596, + "balance_loss_mlp": 1.09872079, + "epoch": 0.015151059672328273, + "flos": 25738793798400.0, + "grad_norm": 2.0465178965121136, + "language_loss": 0.8428089, + "learning_rate": 3.5601349915248707e-06, + "loss": 0.86748511, + "num_input_tokens_seen": 5279175, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.640625, + "step": 252, + "time_per_iteration": 2.5749199390411377 + }, + { + "auxiliary_loss_clip": 0.01357969, + "auxiliary_loss_mlp": 0.01114738, + "balance_loss_clip": 1.06569552, + "balance_loss_mlp": 1.10169089, + "epoch": 0.015211182924996243, + "flos": 21871214737920.0, + "grad_norm": 2.482990993198259, + "language_loss": 0.98208833, + "learning_rate": 3.5626849044954064e-06, + "loss": 1.00681543, + "num_input_tokens_seen": 5296975, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.5625, + "step": 253, + "time_per_iteration": 2.5639333724975586 + }, + { + "auxiliary_loss_clip": 0.01233728, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.00855541, + "balance_loss_mlp": 1.09965372, + "epoch": 0.015271306177664212, + "flos": 66895080888960.0, + "grad_norm": 0.8505459641429172, + "language_loss": 0.55672622, + "learning_rate": 3.5652247586115167e-06, + "loss": 0.57933319, + "num_input_tokens_seen": 5358375, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.34375, + "step": 254, + "time_per_iteration": 3.1063449382781982 + }, + { + "auxiliary_loss_clip": 0.01362079, + "auxiliary_loss_mlp": 0.01116704, + "balance_loss_clip": 1.06687438, + "balance_loss_mlp": 1.09652638, + "epoch": 0.01533142943033218, + "flos": 26834069260800.0, + "grad_norm": 2.4360968938917065, + "language_loss": 0.90453845, + "learning_rate": 3.567754632921479e-06, + "loss": 0.9293263, + "num_input_tokens_seen": 5377255, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.65625, + "step": 255, + "time_per_iteration": 2.5746912956237793 + }, + { + "auxiliary_loss_clip": 0.01358909, + "auxiliary_loss_mlp": 0.01125654, + "balance_loss_clip": 1.07568169, + "balance_loss_mlp": 1.09931397, + "epoch": 0.01539155268300015, + "flos": 20813753318400.0, + "grad_norm": 2.2666703391376903, + "language_loss": 0.8562001, + "learning_rate": 3.5702746055454075e-06, + "loss": 0.8810457, + "num_input_tokens_seen": 5395320, + "router_z_loss_clip": 0.5, + "router_z_loss_mlp": 2.59375, + "step": 256, + "time_per_iteration": 2.6095149517059326 + }, + { + "auxiliary_loss_clip": 0.01366413, + "auxiliary_loss_mlp": 0.01112529, + "balance_loss_clip": 1.06305718, + "balance_loss_mlp": 1.09961021, + "epoch": 0.01545167593566812, + "flos": 15961862885760.0, + "grad_norm": 2.7442871984488386, + "language_loss": 0.71504897, + "learning_rate": 3.5727847536897254e-06, + "loss": 0.73983842, + "num_input_tokens_seen": 5411970, + "router_z_loss_clip": 0.49414062, + "router_z_loss_mlp": 2.65625, + "step": 257, + "time_per_iteration": 2.5939691066741943 + }, + { + "auxiliary_loss_clip": 0.01357007, + "auxiliary_loss_mlp": 0.01100177, + "balance_loss_clip": 1.05087197, + "balance_loss_mlp": 1.09875202, + "epoch": 0.01551179918833609, + "flos": 22601745544320.0, + "grad_norm": 1.9522192109187282, + "language_loss": 0.94659579, + "learning_rate": 3.5752851536613596e-06, + "loss": 0.97116768, + "num_input_tokens_seen": 5430245, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.578125, + "step": 258, + "time_per_iteration": 2.7119739055633545 + }, + { + "auxiliary_loss_clip": 0.01356701, + "auxiliary_loss_mlp": 0.01104272, + "balance_loss_clip": 1.05615926, + "balance_loss_mlp": 1.09608126, + "epoch": 0.015571922441004058, + "flos": 22816706486400.0, + "grad_norm": 3.167214789879638, + "language_loss": 0.93174207, + "learning_rate": 3.577775880881658e-06, + "loss": 0.95635182, + "num_input_tokens_seen": 5448905, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.59375, + "step": 259, + "time_per_iteration": 2.6776607036590576 + }, + { + "auxiliary_loss_clip": 0.01351639, + "auxiliary_loss_mlp": 0.010988, + "balance_loss_clip": 1.05297637, + "balance_loss_mlp": 1.10035825, + "epoch": 0.015632045693672027, + "flos": 18947439486720.0, + "grad_norm": 2.1226725879970605, + "language_loss": 0.97360909, + "learning_rate": 3.5802570099000424e-06, + "loss": 0.99811351, + "num_input_tokens_seen": 5466405, + "router_z_loss_clip": 0.45898438, + "router_z_loss_mlp": 2.515625, + "step": 260, + "time_per_iteration": 2.520759105682373 + }, + { + "auxiliary_loss_clip": 0.01365989, + "auxiliary_loss_mlp": 0.01110082, + "balance_loss_clip": 1.06282747, + "balance_loss_mlp": 1.10060608, + "epoch": 0.015692168946339995, + "flos": 29971728046080.0, + "grad_norm": 2.3569711169381, + "language_loss": 0.87644511, + "learning_rate": 3.5827286144073947e-06, + "loss": 0.90120584, + "num_input_tokens_seen": 5487055, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.65625, + "step": 261, + "time_per_iteration": 2.5837602615356445 + }, + { + "auxiliary_loss_clip": 0.0135711, + "auxiliary_loss_mlp": 0.01105229, + "balance_loss_clip": 1.05613816, + "balance_loss_mlp": 1.09709311, + "epoch": 0.015752292199007967, + "flos": 19392085946880.0, + "grad_norm": 1.9926513495738176, + "language_loss": 0.67226446, + "learning_rate": 3.5851907672491904e-06, + "loss": 0.69688779, + "num_input_tokens_seen": 5506600, + "router_z_loss_clip": 0.49023438, + "router_z_loss_mlp": 2.59375, + "step": 262, + "time_per_iteration": 2.5490784645080566 + }, + { + "auxiliary_loss_clip": 0.01354995, + "auxiliary_loss_mlp": 0.01121613, + "balance_loss_clip": 1.07145, + "balance_loss_mlp": 1.0984714, + "epoch": 0.015812415451675936, + "flos": 20339804338560.0, + "grad_norm": 2.3019763169045637, + "language_loss": 0.68570435, + "learning_rate": 3.587643540438383e-06, + "loss": 0.71047044, + "num_input_tokens_seen": 5524350, + "router_z_loss_clip": 0.50390625, + "router_z_loss_mlp": 2.5625, + "step": 263, + "time_per_iteration": 2.5207104682922363 + }, + { + "auxiliary_loss_clip": 0.01355963, + "auxiliary_loss_mlp": 0.01105396, + "balance_loss_clip": 1.055686, + "balance_loss_mlp": 1.09446979, + "epoch": 0.015872538704343905, + "flos": 17525412979200.0, + "grad_norm": 3.705792502973735, + "language_loss": 0.85120308, + "learning_rate": 3.590087005168037e-06, + "loss": 0.87581658, + "num_input_tokens_seen": 5542145, + "router_z_loss_clip": 0.49609375, + "router_z_loss_mlp": 2.625, + "step": 264, + "time_per_iteration": 2.559406280517578 + }, + { + "auxiliary_loss_clip": 0.01361439, + "auxiliary_loss_mlp": 0.01092909, + "balance_loss_clip": 1.04663229, + "balance_loss_mlp": 1.10003614, + "epoch": 0.015932661957011873, + "flos": 15260490944640.0, + "grad_norm": 4.651007312001026, + "language_loss": 1.04371059, + "learning_rate": 3.5925212318237344e-06, + "loss": 1.06825411, + "num_input_tokens_seen": 5557920, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.625, + "step": 265, + "time_per_iteration": 2.5076427459716797 + }, + { + "auxiliary_loss_clip": 0.01364923, + "auxiliary_loss_mlp": 0.01114141, + "balance_loss_clip": 1.06266677, + "balance_loss_mlp": 1.10278761, + "epoch": 0.015992785209679845, + "flos": 20302528999680.0, + "grad_norm": 2.2797174203272705, + "language_loss": 0.75153112, + "learning_rate": 3.5949462899957323e-06, + "loss": 0.77632177, + "num_input_tokens_seen": 5576290, + "router_z_loss_clip": 0.515625, + "router_z_loss_mlp": 2.625, + "step": 266, + "time_per_iteration": 2.52923583984375 + }, + { + "auxiliary_loss_clip": 0.01351984, + "auxiliary_loss_mlp": 0.01101922, + "balance_loss_clip": 1.05321336, + "balance_loss_mlp": 1.10004377, + "epoch": 0.016052908462347814, + "flos": 23362368969600.0, + "grad_norm": 1.7047265515665009, + "language_loss": 0.90568709, + "learning_rate": 3.5973622484909068e-06, + "loss": 0.93022615, + "num_input_tokens_seen": 5595205, + "router_z_loss_clip": 0.48632812, + "router_z_loss_mlp": 2.515625, + "step": 267, + "time_per_iteration": 4.033226251602173 + }, + { + "auxiliary_loss_clip": 0.01359316, + "auxiliary_loss_mlp": 0.01118854, + "balance_loss_clip": 1.07143235, + "balance_loss_mlp": 1.09878063, + "epoch": 0.016113031715015783, + "flos": 21286588976640.0, + "grad_norm": 2.258126572730018, + "language_loss": 0.86044276, + "learning_rate": 3.599769175344462e-06, + "loss": 0.88522446, + "num_input_tokens_seen": 5612645, + "router_z_loss_clip": 0.47460938, + "router_z_loss_mlp": 2.609375, + "step": 268, + "time_per_iteration": 3.9120936393737793 + }, + { + "auxiliary_loss_clip": 0.01352601, + "auxiliary_loss_mlp": 0.01098281, + "balance_loss_clip": 1.05186045, + "balance_loss_mlp": 1.10092831, + "epoch": 0.01617315496768375, + "flos": 18914689261440.0, + "grad_norm": 3.4793793476816335, + "language_loss": 0.88284534, + "learning_rate": 3.602167137831432e-06, + "loss": 0.90735412, + "num_input_tokens_seen": 5628345, + "router_z_loss_clip": 0.46484375, + "router_z_loss_mlp": 2.515625, + "step": 269, + "time_per_iteration": 2.5170347690582275 + }, + { + "auxiliary_loss_clip": 0.01357286, + "auxiliary_loss_mlp": 0.01099969, + "balance_loss_clip": 1.04901874, + "balance_loss_mlp": 1.09723783, + "epoch": 0.01623327822035172, + "flos": 16546488647040.0, + "grad_norm": 2.082153756456244, + "language_loss": 0.97073388, + "learning_rate": 3.6045562024779565e-06, + "loss": 0.99530637, + "num_input_tokens_seen": 5645940, + "router_z_loss_clip": 0.5078125, + "router_z_loss_mlp": 2.59375, + "step": 270, + "time_per_iteration": 2.4856350421905518 + }, + { + "auxiliary_loss_clip": 0.01357366, + "auxiliary_loss_mlp": 0.01117767, + "balance_loss_clip": 1.07001138, + "balance_loss_mlp": 1.10259032, + "epoch": 0.016293401473019692, + "flos": 23513481486720.0, + "grad_norm": 2.1071719511680755, + "language_loss": 0.85919821, + "learning_rate": 3.606936435072361e-06, + "loss": 0.88394946, + "num_input_tokens_seen": 5665690, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.546875, + "step": 271, + "time_per_iteration": 2.55047345161438 + }, + { + "auxiliary_loss_clip": 0.01355041, + "auxiliary_loss_mlp": 0.0109977, + "balance_loss_clip": 1.05201519, + "balance_loss_mlp": 1.09418058, + "epoch": 0.01635352472568766, + "flos": 29016072748800.0, + "grad_norm": 3.6330072162998523, + "language_loss": 0.81509304, + "learning_rate": 3.609307900676025e-06, + "loss": 0.83964115, + "num_input_tokens_seen": 5683190, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.609375, + "step": 272, + "time_per_iteration": 2.563840389251709 + }, + { + "auxiliary_loss_clip": 0.01348825, + "auxiliary_loss_mlp": 0.01118044, + "balance_loss_clip": 1.07229137, + "balance_loss_mlp": 1.09649634, + "epoch": 0.01641364797835563, + "flos": 13370513028480.0, + "grad_norm": 2.4112371858801436, + "language_loss": 0.81101978, + "learning_rate": 3.611670663634051e-06, + "loss": 0.83568847, + "num_input_tokens_seen": 5699780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.515625, + "step": 273, + "time_per_iteration": 2.504791736602783 + }, + { + "auxiliary_loss_clip": 0.01348205, + "auxiliary_loss_mlp": 0.01105988, + "balance_loss_clip": 1.05825627, + "balance_loss_mlp": 1.0930239, + "epoch": 0.016473771231023598, + "flos": 18878239935360.0, + "grad_norm": 2.3125197915452387, + "language_loss": 0.91599321, + "learning_rate": 3.614024787585744e-06, + "loss": 0.94053519, + "num_input_tokens_seen": 5716980, + "router_z_loss_clip": 0.4765625, + "router_z_loss_mlp": 2.5625, + "step": 274, + "time_per_iteration": 2.530883312225342 + }, + { + "auxiliary_loss_clip": 0.01346841, + "auxiliary_loss_mlp": 0.01110058, + "balance_loss_clip": 1.06154013, + "balance_loss_mlp": 1.09588742, + "epoch": 0.016533894483691566, + "flos": 22601637803520.0, + "grad_norm": 1.8828740595481548, + "language_loss": 0.87952697, + "learning_rate": 3.6163703354748927e-06, + "loss": 0.90409595, + "num_input_tokens_seen": 5737780, + "router_z_loss_clip": 0.484375, + "router_z_loss_mlp": 2.515625, + "step": 275, + "time_per_iteration": 2.6067841053009033 + }, + { + "auxiliary_loss_clip": 0.01349399, + "auxiliary_loss_mlp": 0.01103927, + "balance_loss_clip": 1.05481219, + "balance_loss_mlp": 1.09579742, + "epoch": 0.01659401773635954, + "flos": 21507188353920.0, + "grad_norm": 1.8814357547622875, + "language_loss": 0.80717576, + "learning_rate": 3.6187073695598707e-06, + "loss": 0.83170903, + "num_input_tokens_seen": 5758330, + "router_z_loss_clip": 0.4921875, + "router_z_loss_mlp": 2.53125, + "step": 276, + "time_per_iteration": 2.5251641273498535 + }, + { + "auxiliary_loss_clip": 0.01340258, + "auxiliary_loss_mlp": 0.01100275, + "balance_loss_clip": 1.0561676, + "balance_loss_mlp": 1.0946306, + "epoch": 0.016654140989027507, + "flos": 32850973411200.0, + "grad_norm": 1.7238418569970533, + "language_loss": 0.81033546, + "learning_rate": 3.621035951423551e-06, + "loss": 0.83474076, + "num_input_tokens_seen": 5778340, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.46875, + "step": 277, + "time_per_iteration": 2.6796398162841797 + }, + { + "auxiliary_loss_clip": 0.01338755, + "auxiliary_loss_mlp": 0.01095233, + "balance_loss_clip": 1.04828835, + "balance_loss_mlp": 1.08789539, + "epoch": 0.016714264241695476, + "flos": 12306228024960.0, + "grad_norm": 2.810922211495867, + "language_loss": 0.80307728, + "learning_rate": 3.623356141983041e-06, + "loss": 0.82741719, + "num_input_tokens_seen": 5794295, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.515625, + "step": 278, + "time_per_iteration": 2.4939208030700684 + }, + { + "auxiliary_loss_clip": 0.01343866, + "auxiliary_loss_mlp": 0.01101481, + "balance_loss_clip": 1.05634809, + "balance_loss_mlp": 1.09381282, + "epoch": 0.016774387494363444, + "flos": 27123796362240.0, + "grad_norm": 1.7778988036026468, + "language_loss": 0.90482658, + "learning_rate": 3.6256680014992486e-06, + "loss": 0.92928004, + "num_input_tokens_seen": 5814405, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 279, + "time_per_iteration": 2.571439504623413 + }, + { + "auxiliary_loss_clip": 0.01348727, + "auxiliary_loss_mlp": 0.01116075, + "balance_loss_clip": 1.06872559, + "balance_loss_mlp": 1.09391451, + "epoch": 0.016834510747031413, + "flos": 20191493082240.0, + "grad_norm": 3.0477743200742387, + "language_loss": 0.94153798, + "learning_rate": 3.6279715895862713e-06, + "loss": 0.96618605, + "num_input_tokens_seen": 5832795, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.546875, + "step": 280, + "time_per_iteration": 2.5161728858947754 + }, + { + "auxiliary_loss_clip": 0.0134865, + "auxiliary_loss_mlp": 0.0110692, + "balance_loss_clip": 1.05864, + "balance_loss_mlp": 1.09245062, + "epoch": 0.016894633999699385, + "flos": 27274262434560.0, + "grad_norm": 3.578687135351882, + "language_loss": 0.73929775, + "learning_rate": 3.6302669652206183e-06, + "loss": 0.76385343, + "num_input_tokens_seen": 5855750, + "router_z_loss_clip": 0.48242188, + "router_z_loss_mlp": 2.5625, + "step": 281, + "time_per_iteration": 2.616241931915283 + }, + { + "auxiliary_loss_clip": 0.01343434, + "auxiliary_loss_mlp": 0.0111488, + "balance_loss_clip": 1.06977129, + "balance_loss_mlp": 1.09390783, + "epoch": 0.016954757252367354, + "flos": 14902964922240.0, + "grad_norm": 2.679798242609796, + "language_loss": 0.80207133, + "learning_rate": 3.632554186750274e-06, + "loss": 0.82665443, + "num_input_tokens_seen": 5872610, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.5, + "step": 282, + "time_per_iteration": 2.5421135425567627 + }, + { + "auxiliary_loss_clip": 0.01348806, + "auxiliary_loss_mlp": 0.01117348, + "balance_loss_clip": 1.0704273, + "balance_loss_mlp": 1.09599137, + "epoch": 0.017014880505035322, + "flos": 21358805270400.0, + "grad_norm": 2.1184562475367916, + "language_loss": 0.77788174, + "learning_rate": 3.6348333119035937e-06, + "loss": 0.80254328, + "num_input_tokens_seen": 5892985, + "router_z_loss_clip": 0.46875, + "router_z_loss_mlp": 2.53125, + "step": 283, + "time_per_iteration": 2.516474485397339 + }, + { + "auxiliary_loss_clip": 0.01349252, + "auxiliary_loss_mlp": 0.01091995, + "balance_loss_clip": 1.04788804, + "balance_loss_mlp": 1.09700751, + "epoch": 0.01707500375770329, + "flos": 35333154858240.0, + "grad_norm": 2.1009174504018544, + "language_loss": 0.84172702, + "learning_rate": 3.6371043977980503e-06, + "loss": 0.86613953, + "num_input_tokens_seen": 5914060, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.515625, + "step": 284, + "time_per_iteration": 2.646301031112671 + }, + { + "auxiliary_loss_clip": 0.01339164, + "auxiliary_loss_mlp": 0.010994, + "balance_loss_clip": 1.05216956, + "balance_loss_mlp": 1.09148788, + "epoch": 0.01713512701037126, + "flos": 23582070506880.0, + "grad_norm": 3.014395623363928, + "language_loss": 0.96993905, + "learning_rate": 3.639367500948819e-06, + "loss": 0.99432468, + "num_input_tokens_seen": 5932860, + "router_z_loss_clip": 0.47265625, + "router_z_loss_mlp": 2.46875, + "step": 285, + "time_per_iteration": 2.5412731170654297 + }, + { + "auxiliary_loss_clip": 0.01342544, + "auxiliary_loss_mlp": 0.01093983, + "balance_loss_clip": 1.05025744, + "balance_loss_mlp": 1.09407294, + "epoch": 0.01719525026303923, + "flos": 27634661544960.0, + "grad_norm": 2.2067050643741433, + "language_loss": 0.93951917, + "learning_rate": 3.6416226772772178e-06, + "loss": 0.96388453, + "num_input_tokens_seen": 5952725, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.484375, + "step": 286, + "time_per_iteration": 2.5895566940307617 + }, + { + "auxiliary_loss_clip": 0.0133546, + "auxiliary_loss_mlp": 0.01090331, + "balance_loss_clip": 1.04503167, + "balance_loss_mlp": 1.08924019, + "epoch": 0.0172553735157072, + "flos": 26979722910720.0, + "grad_norm": 1.8729510510678706, + "language_loss": 0.92157722, + "learning_rate": 3.643869982119001e-06, + "loss": 0.94583511, + "num_input_tokens_seen": 5970560, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 287, + "time_per_iteration": 2.6144802570343018 + }, + { + "auxiliary_loss_clip": 0.01338793, + "auxiliary_loss_mlp": 0.01089685, + "balance_loss_clip": 1.04462433, + "balance_loss_mlp": 1.08859432, + "epoch": 0.01731549676837517, + "flos": 14056621689600.0, + "grad_norm": 3.2271144452092564, + "language_loss": 1.02026963, + "learning_rate": 3.646109470232502e-06, + "loss": 1.04455447, + "num_input_tokens_seen": 5982980, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.5, + "step": 288, + "time_per_iteration": 2.488274097442627 + }, + { + "auxiliary_loss_clip": 0.01222501, + "auxiliary_loss_mlp": 0.01036501, + "balance_loss_clip": 1.02000237, + "balance_loss_mlp": 1.09325862, + "epoch": 0.017375620021043137, + "flos": 66510694471680.0, + "grad_norm": 0.9131614435254132, + "language_loss": 0.63915455, + "learning_rate": 3.6483411958066417e-06, + "loss": 0.66174459, + "num_input_tokens_seen": 6049445, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 1.296875, + "step": 289, + "time_per_iteration": 3.222426652908325 + }, + { + "auxiliary_loss_clip": 0.01341104, + "auxiliary_loss_mlp": 0.01107523, + "balance_loss_clip": 1.06379664, + "balance_loss_mlp": 1.09403992, + "epoch": 0.01743574327371111, + "flos": 15225154940160.0, + "grad_norm": 2.4014361624695173, + "language_loss": 0.88569438, + "learning_rate": 3.6505652124687957e-06, + "loss": 0.91018069, + "num_input_tokens_seen": 6064150, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 290, + "time_per_iteration": 2.49294114112854 + }, + { + "auxiliary_loss_clip": 0.01338257, + "auxiliary_loss_mlp": 0.01091523, + "balance_loss_clip": 1.04631877, + "balance_loss_mlp": 1.09248078, + "epoch": 0.017495866526379078, + "flos": 25373869574400.0, + "grad_norm": 2.156562479490788, + "language_loss": 0.84578067, + "learning_rate": 3.6527815732925258e-06, + "loss": 0.87007844, + "num_input_tokens_seen": 6083920, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.453125, + "step": 291, + "time_per_iteration": 2.5356485843658447 + }, + { + "auxiliary_loss_clip": 0.01345108, + "auxiliary_loss_mlp": 0.01106973, + "balance_loss_clip": 1.05897939, + "balance_loss_mlp": 1.10042334, + "epoch": 0.017555989779047047, + "flos": 26359473836160.0, + "grad_norm": 1.6617628708439536, + "language_loss": 0.72766221, + "learning_rate": 3.6549903308051806e-06, + "loss": 0.75218308, + "num_input_tokens_seen": 6105460, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.453125, + "step": 292, + "time_per_iteration": 2.6524176597595215 + }, + { + "auxiliary_loss_clip": 0.01333825, + "auxiliary_loss_mlp": 0.01101528, + "balance_loss_clip": 1.05625248, + "balance_loss_mlp": 1.09236324, + "epoch": 0.017616113031715015, + "flos": 22338807010560.0, + "grad_norm": 2.2014441192179866, + "language_loss": 0.8726995, + "learning_rate": 3.6571915369953646e-06, + "loss": 0.89705306, + "num_input_tokens_seen": 6122890, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.40625, + "step": 293, + "time_per_iteration": 2.531580686569214 + }, + { + "auxiliary_loss_clip": 0.01334314, + "auxiliary_loss_mlp": 0.0110389, + "balance_loss_clip": 1.05959213, + "balance_loss_mlp": 1.09177744, + "epoch": 0.017676236284382984, + "flos": 20156911263360.0, + "grad_norm": 2.3120260424061367, + "language_loss": 0.81276119, + "learning_rate": 3.6593852433202797e-06, + "loss": 0.83714324, + "num_input_tokens_seen": 6142890, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.4375, + "step": 294, + "time_per_iteration": 2.568784236907959 + }, + { + "auxiliary_loss_clip": 0.01334452, + "auxiliary_loss_mlp": 0.01107857, + "balance_loss_clip": 1.06274807, + "balance_loss_mlp": 1.08824301, + "epoch": 0.017736359537050956, + "flos": 25223331674880.0, + "grad_norm": 2.9227055740425705, + "language_loss": 0.83710909, + "learning_rate": 3.6615715007129453e-06, + "loss": 0.86153215, + "num_input_tokens_seen": 6162030, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.46875, + "step": 295, + "time_per_iteration": 2.5799450874328613 + }, + { + "auxiliary_loss_clip": 0.01339817, + "auxiliary_loss_mlp": 0.01110731, + "balance_loss_clip": 1.06559837, + "balance_loss_mlp": 1.09874845, + "epoch": 0.017796482789718925, + "flos": 20338798757760.0, + "grad_norm": 2.5339269047951727, + "language_loss": 0.84620988, + "learning_rate": 3.6637503595892897e-06, + "loss": 0.87071538, + "num_input_tokens_seen": 6180540, + "router_z_loss_clip": 0.45117188, + "router_z_loss_mlp": 2.40625, + "step": 296, + "time_per_iteration": 2.5243051052093506 + }, + { + "auxiliary_loss_clip": 0.01338756, + "auxiliary_loss_mlp": 0.01097832, + "balance_loss_clip": 1.05417752, + "balance_loss_mlp": 1.09317493, + "epoch": 0.017856606042386893, + "flos": 22379206832640.0, + "grad_norm": 2.123858619871597, + "language_loss": 0.87729871, + "learning_rate": 3.665921869855132e-06, + "loss": 0.90166461, + "num_input_tokens_seen": 6199425, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.453125, + "step": 297, + "time_per_iteration": 2.5186710357666016 + }, + { + "auxiliary_loss_clip": 0.01337139, + "auxiliary_loss_mlp": 0.01100837, + "balance_loss_clip": 1.05713463, + "balance_loss_mlp": 1.09108877, + "epoch": 0.017916729295054862, + "flos": 20230061310720.0, + "grad_norm": 2.170328911832355, + "language_loss": 0.88528925, + "learning_rate": 3.6680860809130346e-06, + "loss": 0.90966904, + "num_input_tokens_seen": 6219170, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.46875, + "step": 298, + "time_per_iteration": 2.5320143699645996 + }, + { + "auxiliary_loss_clip": 0.0133273, + "auxiliary_loss_mlp": 0.01118432, + "balance_loss_clip": 1.07234538, + "balance_loss_mlp": 1.09249902, + "epoch": 0.01797685254772283, + "flos": 19390972625280.0, + "grad_norm": 1.8938405886263965, + "language_loss": 0.88666737, + "learning_rate": 3.6702430416690516e-06, + "loss": 0.91117901, + "num_input_tokens_seen": 6237930, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.40625, + "step": 299, + "time_per_iteration": 2.588275671005249 + }, + { + "auxiliary_loss_clip": 0.01340105, + "auxiliary_loss_mlp": 0.01105829, + "balance_loss_clip": 1.06055307, + "balance_loss_mlp": 1.09275746, + "epoch": 0.018036975800390802, + "flos": 24426007528320.0, + "grad_norm": 3.2936483356677253, + "language_loss": 0.64349103, + "learning_rate": 3.672392800539357e-06, + "loss": 0.66795039, + "num_input_tokens_seen": 6257170, + "router_z_loss_clip": 0.453125, + "router_z_loss_mlp": 2.46875, + "step": 300, + "time_per_iteration": 2.592313289642334 + }, + { + "auxiliary_loss_clip": 0.01338706, + "auxiliary_loss_mlp": 0.01105447, + "balance_loss_clip": 1.05986142, + "balance_loss_mlp": 1.09540462, + "epoch": 0.01809709905305877, + "flos": 15778933896960.0, + "grad_norm": 2.310898752337597, + "language_loss": 0.88330823, + "learning_rate": 3.6745354054567686e-06, + "loss": 0.90774977, + "num_input_tokens_seen": 6274780, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.4375, + "step": 301, + "time_per_iteration": 2.499481439590454 + }, + { + "auxiliary_loss_clip": 0.01214573, + "auxiliary_loss_mlp": 0.01024582, + "balance_loss_clip": 1.00932336, + "balance_loss_mlp": 1.08753991, + "epoch": 0.01815722230572674, + "flos": 67348382526720.0, + "grad_norm": 0.8370211186232274, + "language_loss": 0.62198341, + "learning_rate": 3.676670903877158e-06, + "loss": 0.64437497, + "num_input_tokens_seen": 6340435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 1.265625, + "step": 302, + "time_per_iteration": 3.259997844696045 + }, + { + "auxiliary_loss_clip": 0.01329895, + "auxiliary_loss_mlp": 0.01107479, + "balance_loss_clip": 1.06198907, + "balance_loss_mlp": 1.08938098, + "epoch": 0.01821734555839471, + "flos": 15485615435520.0, + "grad_norm": 2.491293816938874, + "language_loss": 0.89617372, + "learning_rate": 3.6787993427857567e-06, + "loss": 0.92054749, + "num_input_tokens_seen": 6358160, + "router_z_loss_clip": 0.45507812, + "router_z_loss_mlp": 2.40625, + "step": 303, + "time_per_iteration": 2.536773920059204 + }, + { + "auxiliary_loss_clip": 0.01336859, + "auxiliary_loss_mlp": 0.01114111, + "balance_loss_clip": 1.06778669, + "balance_loss_mlp": 1.09363747, + "epoch": 0.018277468811062677, + "flos": 24097424889600.0, + "grad_norm": 4.887297609803561, + "language_loss": 0.80314684, + "learning_rate": 3.680920768703364e-06, + "loss": 0.82765651, + "num_input_tokens_seen": 6378485, + "router_z_loss_clip": 0.46289062, + "router_z_loss_mlp": 2.4375, + "step": 304, + "time_per_iteration": 2.563828945159912 + }, + { + "auxiliary_loss_clip": 0.01331614, + "auxiliary_loss_mlp": 0.01094816, + "balance_loss_clip": 1.05144823, + "balance_loss_mlp": 1.09657788, + "epoch": 0.01833759206373065, + "flos": 20959335141120.0, + "grad_norm": 1.8235558005033383, + "language_loss": 0.82894015, + "learning_rate": 3.6830352276924415e-06, + "loss": 0.85320443, + "num_input_tokens_seen": 6397845, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.34375, + "step": 305, + "time_per_iteration": 2.5195910930633545 + }, + { + "auxiliary_loss_clip": 0.01332168, + "auxiliary_loss_mlp": 0.01093114, + "balance_loss_clip": 1.04993677, + "balance_loss_mlp": 1.08868921, + "epoch": 0.018397715316398618, + "flos": 19390757143680.0, + "grad_norm": 1.9087210074301977, + "language_loss": 0.90843809, + "learning_rate": 3.685142765363119e-06, + "loss": 0.93269092, + "num_input_tokens_seen": 6416475, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 306, + "time_per_iteration": 2.501276969909668 + }, + { + "auxiliary_loss_clip": 0.01324982, + "auxiliary_loss_mlp": 0.01090544, + "balance_loss_clip": 1.04815364, + "balance_loss_mlp": 1.08638549, + "epoch": 0.018457838569066586, + "flos": 29132531619840.0, + "grad_norm": 2.1762826783898586, + "language_loss": 0.86435306, + "learning_rate": 3.687243426879095e-06, + "loss": 0.88850832, + "num_input_tokens_seen": 6437520, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.390625, + "step": 307, + "time_per_iteration": 2.6048038005828857 + }, + { + "auxiliary_loss_clip": 0.01325097, + "auxiliary_loss_mlp": 0.01106166, + "balance_loss_clip": 1.05817199, + "balance_loss_mlp": 1.09046888, + "epoch": 0.018517961821734555, + "flos": 19208654167680.0, + "grad_norm": 2.221444292833677, + "language_loss": 0.71723771, + "learning_rate": 3.6893372569634466e-06, + "loss": 0.74155033, + "num_input_tokens_seen": 6455680, + "router_z_loss_clip": 0.48046875, + "router_z_loss_mlp": 2.34375, + "step": 308, + "time_per_iteration": 2.513774871826172 + }, + { + "auxiliary_loss_clip": 0.01331987, + "auxiliary_loss_mlp": 0.01102938, + "balance_loss_clip": 1.05904555, + "balance_loss_mlp": 1.08861351, + "epoch": 0.018578085074402523, + "flos": 19863018184320.0, + "grad_norm": 2.2254161740825293, + "language_loss": 0.91952753, + "learning_rate": 3.6914242999043395e-06, + "loss": 0.94387674, + "num_input_tokens_seen": 6474880, + "router_z_loss_clip": 0.43945312, + "router_z_loss_mlp": 2.4375, + "step": 309, + "time_per_iteration": 5.224750280380249 + }, + { + "auxiliary_loss_clip": 0.01338325, + "auxiliary_loss_mlp": 0.01104953, + "balance_loss_clip": 1.05896235, + "balance_loss_mlp": 1.08840334, + "epoch": 0.018638208327070496, + "flos": 29606947476480.0, + "grad_norm": 2.8056803187702135, + "language_loss": 0.72399509, + "learning_rate": 3.69350459956065e-06, + "loss": 0.74842793, + "num_input_tokens_seen": 6495945, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 310, + "time_per_iteration": 2.584730863571167 + }, + { + "auxiliary_loss_clip": 0.01330325, + "auxiliary_loss_mlp": 0.01112154, + "balance_loss_clip": 1.06790328, + "balance_loss_mlp": 1.09306264, + "epoch": 0.018698331579738464, + "flos": 45731555907840.0, + "grad_norm": 12.392698164772181, + "language_loss": 0.74104297, + "learning_rate": 3.695578199367497e-06, + "loss": 0.76546776, + "num_input_tokens_seen": 6519930, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.375, + "step": 311, + "time_per_iteration": 2.734072208404541 + }, + { + "auxiliary_loss_clip": 0.01337963, + "auxiliary_loss_mlp": 0.0110935, + "balance_loss_clip": 1.06619668, + "balance_loss_mlp": 1.09045064, + "epoch": 0.018758454832406433, + "flos": 20483662308480.0, + "grad_norm": 2.2753160661232603, + "language_loss": 0.91518372, + "learning_rate": 3.6976451423416825e-06, + "loss": 0.93965685, + "num_input_tokens_seen": 6535070, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.46875, + "step": 312, + "time_per_iteration": 2.5117411613464355 + }, + { + "auxiliary_loss_clip": 0.01336169, + "auxiliary_loss_mlp": 0.01112089, + "balance_loss_clip": 1.06609774, + "balance_loss_mlp": 1.09088099, + "epoch": 0.0188185780850744, + "flos": 15777784661760.0, + "grad_norm": 2.320247917383294, + "language_loss": 0.89746982, + "learning_rate": 3.699705471087043e-06, + "loss": 0.92195237, + "num_input_tokens_seen": 6554135, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.453125, + "step": 313, + "time_per_iteration": 2.4761838912963867 + }, + { + "auxiliary_loss_clip": 0.01340305, + "auxiliary_loss_mlp": 0.01098393, + "balance_loss_clip": 1.05230689, + "balance_loss_mlp": 1.09061432, + "epoch": 0.018878701337742373, + "flos": 22455732758400.0, + "grad_norm": 2.3404867001555236, + "language_loss": 0.73099983, + "learning_rate": 3.7017592277997256e-06, + "loss": 0.75538683, + "num_input_tokens_seen": 6572275, + "router_z_loss_clip": 0.4609375, + "router_z_loss_mlp": 2.5, + "step": 314, + "time_per_iteration": 2.5488638877868652 + }, + { + "auxiliary_loss_clip": 0.01326469, + "auxiliary_loss_mlp": 0.01103837, + "balance_loss_clip": 1.06101751, + "balance_loss_mlp": 1.08694446, + "epoch": 0.018938824590410342, + "flos": 30993530238720.0, + "grad_norm": 2.192553769026804, + "language_loss": 0.89887041, + "learning_rate": 3.7038064542733654e-06, + "loss": 0.92317349, + "num_input_tokens_seen": 6594520, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 315, + "time_per_iteration": 2.5857741832733154 + }, + { + "auxiliary_loss_clip": 0.01329672, + "auxiliary_loss_mlp": 0.01096027, + "balance_loss_clip": 1.05170512, + "balance_loss_mlp": 1.08870411, + "epoch": 0.01899894784307831, + "flos": 23258910821760.0, + "grad_norm": 1.8364758613144732, + "language_loss": 0.80796063, + "learning_rate": 3.7058471919041945e-06, + "loss": 0.83221763, + "num_input_tokens_seen": 6614245, + "router_z_loss_clip": 0.44335938, + "router_z_loss_mlp": 2.40625, + "step": 316, + "time_per_iteration": 2.5222342014312744 + }, + { + "auxiliary_loss_clip": 0.01324399, + "auxiliary_loss_mlp": 0.01095063, + "balance_loss_clip": 1.05131364, + "balance_loss_mlp": 1.08633423, + "epoch": 0.01905907109574628, + "flos": 17457901367040.0, + "grad_norm": 2.1363686538021236, + "language_loss": 0.90357143, + "learning_rate": 3.7078814816960605e-06, + "loss": 0.92776608, + "num_input_tokens_seen": 6632015, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.375, + "step": 317, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01319895, + "auxiliary_loss_mlp": 0.01095564, + "balance_loss_clip": 1.0515281, + "balance_loss_mlp": 1.0845592, + "epoch": 0.019119194348414248, + "flos": 14970225139200.0, + "grad_norm": 2.5260192321083794, + "language_loss": 0.90939772, + "learning_rate": 3.709909364265374e-06, + "loss": 0.93355227, + "num_input_tokens_seen": 6649015, + "router_z_loss_clip": 0.44140625, + "router_z_loss_mlp": 2.34375, + "step": 318, + "time_per_iteration": 2.488128185272217 + }, + { + "auxiliary_loss_clip": 0.01324457, + "auxiliary_loss_mlp": 0.01088861, + "balance_loss_clip": 1.04706657, + "balance_loss_mlp": 1.08574772, + "epoch": 0.01917931760108222, + "flos": 25482822503040.0, + "grad_norm": 2.626221841877022, + "language_loss": 0.93980259, + "learning_rate": 3.7119308798459706e-06, + "loss": 0.96393579, + "num_input_tokens_seen": 6669225, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 319, + "time_per_iteration": 2.5184502601623535 + }, + { + "auxiliary_loss_clip": 0.01205117, + "auxiliary_loss_mlp": 0.01080363, + "balance_loss_clip": 1.06586683, + "balance_loss_mlp": 1.07482553, + "epoch": 0.01923944085375019, + "flos": 71556967353600.0, + "grad_norm": 0.9345393611259016, + "language_loss": 0.59860981, + "learning_rate": 3.7139460682939026e-06, + "loss": 0.62146461, + "num_input_tokens_seen": 6725775, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 1.296875, + "step": 320, + "time_per_iteration": 3.0250258445739746 + }, + { + "auxiliary_loss_clip": 0.01320993, + "auxiliary_loss_mlp": 0.0110086, + "balance_loss_clip": 1.05827808, + "balance_loss_mlp": 1.08425927, + "epoch": 0.019299564106418157, + "flos": 19682495406720.0, + "grad_norm": 3.0799113353921572, + "language_loss": 0.89622325, + "learning_rate": 3.715954969092154e-06, + "loss": 0.92044175, + "num_input_tokens_seen": 6744170, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.375, + "step": 321, + "time_per_iteration": 2.476439952850342 + }, + { + "auxiliary_loss_clip": 0.01332068, + "auxiliary_loss_mlp": 0.0112077, + "balance_loss_clip": 1.07620978, + "balance_loss_mlp": 1.08993089, + "epoch": 0.019359687359086126, + "flos": 24387151991040.0, + "grad_norm": 2.068543890023447, + "language_loss": 0.82884163, + "learning_rate": 3.7179576213552805e-06, + "loss": 0.85337007, + "num_input_tokens_seen": 6764565, + "router_z_loss_clip": 0.4453125, + "router_z_loss_mlp": 2.421875, + "step": 322, + "time_per_iteration": 2.556302309036255 + }, + { + "auxiliary_loss_clip": 0.01332156, + "auxiliary_loss_mlp": 0.01090343, + "balance_loss_clip": 1.04828596, + "balance_loss_mlp": 1.08754158, + "epoch": 0.019419810611754094, + "flos": 23951376190080.0, + "grad_norm": 2.2506232399398245, + "language_loss": 0.72734368, + "learning_rate": 3.719954063833981e-06, + "loss": 0.75156873, + "num_input_tokens_seen": 6785310, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.453125, + "step": 323, + "time_per_iteration": 2.5033397674560547 + }, + { + "auxiliary_loss_clip": 0.01318896, + "auxiliary_loss_mlp": 0.01090622, + "balance_loss_clip": 1.04763484, + "balance_loss_mlp": 1.08184087, + "epoch": 0.019479933864422067, + "flos": 22160223567360.0, + "grad_norm": 2.023515622890843, + "language_loss": 0.92639947, + "learning_rate": 3.721944334919596e-06, + "loss": 0.95049465, + "num_input_tokens_seen": 6803290, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.375, + "step": 324, + "time_per_iteration": 2.5194544792175293 + }, + { + "auxiliary_loss_clip": 0.01328869, + "auxiliary_loss_mlp": 0.01089838, + "balance_loss_clip": 1.04935479, + "balance_loss_mlp": 1.08943164, + "epoch": 0.019540057117090035, + "flos": 22236821320320.0, + "grad_norm": 4.018466874717804, + "language_loss": 0.65336061, + "learning_rate": 3.7239284726485375e-06, + "loss": 0.67754775, + "num_input_tokens_seen": 6822570, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.390625, + "step": 325, + "time_per_iteration": 2.5107386112213135 + }, + { + "auxiliary_loss_clip": 0.0132709, + "auxiliary_loss_mlp": 0.01101196, + "balance_loss_clip": 1.05799484, + "balance_loss_mlp": 1.093485, + "epoch": 0.019600180369758004, + "flos": 23076771932160.0, + "grad_norm": 1.921455060851243, + "language_loss": 0.76449442, + "learning_rate": 3.72590651470665e-06, + "loss": 0.78877723, + "num_input_tokens_seen": 6841910, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.34375, + "step": 326, + "time_per_iteration": 2.5080325603485107 + }, + { + "auxiliary_loss_clip": 0.01320399, + "auxiliary_loss_mlp": 0.01103572, + "balance_loss_clip": 1.06015599, + "balance_loss_mlp": 1.08845115, + "epoch": 0.019660303622425972, + "flos": 25410857604480.0, + "grad_norm": 2.1551163890972123, + "language_loss": 0.79176939, + "learning_rate": 3.727878498433505e-06, + "loss": 0.8160091, + "num_input_tokens_seen": 6862480, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 327, + "time_per_iteration": 2.5449633598327637 + }, + { + "auxiliary_loss_clip": 0.01326802, + "auxiliary_loss_mlp": 0.01111954, + "balance_loss_clip": 1.06984949, + "balance_loss_mlp": 1.08873606, + "epoch": 0.01972042687509394, + "flos": 23657519024640.0, + "grad_norm": 2.1574079642063246, + "language_loss": 0.80725288, + "learning_rate": 3.7298444608266328e-06, + "loss": 0.83164048, + "num_input_tokens_seen": 6882015, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.390625, + "step": 328, + "time_per_iteration": 2.5418970584869385 + }, + { + "auxiliary_loss_clip": 0.01325663, + "auxiliary_loss_mlp": 0.01096681, + "balance_loss_clip": 1.05278802, + "balance_loss_mlp": 1.08396721, + "epoch": 0.019780550127761913, + "flos": 18223480869120.0, + "grad_norm": 2.245263087715646, + "language_loss": 0.93704766, + "learning_rate": 3.731804438545683e-06, + "loss": 0.96127105, + "num_input_tokens_seen": 6899785, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.40625, + "step": 329, + "time_per_iteration": 2.4910004138946533 + }, + { + "auxiliary_loss_clip": 0.01332781, + "auxiliary_loss_mlp": 0.01105781, + "balance_loss_clip": 1.06253231, + "balance_loss_mlp": 1.08930123, + "epoch": 0.01984067338042988, + "flos": 22418780641920.0, + "grad_norm": 2.9776357674257365, + "language_loss": 0.74277973, + "learning_rate": 3.7337584679165324e-06, + "loss": 0.7671653, + "num_input_tokens_seen": 6918575, + "router_z_loss_clip": 0.43164062, + "router_z_loss_mlp": 2.4375, + "step": 330, + "time_per_iteration": 2.51430082321167 + }, + { + "auxiliary_loss_clip": 0.01328701, + "auxiliary_loss_mlp": 0.01120913, + "balance_loss_clip": 1.07814097, + "balance_loss_mlp": 1.08762872, + "epoch": 0.01990079663309785, + "flos": 17055199013760.0, + "grad_norm": 2.972763157156593, + "language_loss": 0.93870068, + "learning_rate": 3.7357065849353186e-06, + "loss": 0.96319681, + "num_input_tokens_seen": 6936965, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.40625, + "step": 331, + "time_per_iteration": 2.4759159088134766 + }, + { + "auxiliary_loss_clip": 0.01316192, + "auxiliary_loss_mlp": 0.01089699, + "balance_loss_clip": 1.04938233, + "balance_loss_mlp": 1.0853951, + "epoch": 0.01996091988576582, + "flos": 15961791058560.0, + "grad_norm": 2.6958694906457836, + "language_loss": 0.92730892, + "learning_rate": 3.737648825272422e-06, + "loss": 0.95136791, + "num_input_tokens_seen": 6953475, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 332, + "time_per_iteration": 2.49817156791687 + }, + { + "auxiliary_loss_clip": 0.01325132, + "auxiliary_loss_mlp": 0.01092519, + "balance_loss_clip": 1.04903162, + "balance_loss_mlp": 1.09081161, + "epoch": 0.02002104313843379, + "flos": 23586451966080.0, + "grad_norm": 2.6289067025313777, + "language_loss": 0.75589794, + "learning_rate": 3.739585224276384e-06, + "loss": 0.78007442, + "num_input_tokens_seen": 6971630, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.34375, + "step": 333, + "time_per_iteration": 2.5180609226226807 + }, + { + "auxiliary_loss_clip": 0.01323371, + "auxiliary_loss_mlp": 0.01087623, + "balance_loss_clip": 1.04597139, + "balance_loss_mlp": 1.08625877, + "epoch": 0.02008116639110176, + "flos": 34094883352320.0, + "grad_norm": 2.1766901409232426, + "language_loss": 0.78768885, + "learning_rate": 3.7415158169777673e-06, + "loss": 0.81179881, + "num_input_tokens_seen": 6992775, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.375, + "step": 334, + "time_per_iteration": 2.614708423614502 + }, + { + "auxiliary_loss_clip": 0.01324397, + "auxiliary_loss_mlp": 0.01094838, + "balance_loss_clip": 1.05015838, + "balance_loss_mlp": 1.08276975, + "epoch": 0.020141289643769728, + "flos": 19683716469120.0, + "grad_norm": 2.4059127888346916, + "language_loss": 0.83083838, + "learning_rate": 3.7434406380929575e-06, + "loss": 0.85503072, + "num_input_tokens_seen": 7011425, + "router_z_loss_clip": 0.44726562, + "router_z_loss_mlp": 2.421875, + "step": 335, + "time_per_iteration": 2.495260000228882 + }, + { + "auxiliary_loss_clip": 0.01320649, + "auxiliary_loss_mlp": 0.01090782, + "balance_loss_clip": 1.04934454, + "balance_loss_mlp": 1.08585882, + "epoch": 0.020201412896437697, + "flos": 20740567357440.0, + "grad_norm": 2.166489879958422, + "language_loss": 0.92639577, + "learning_rate": 3.745359722027911e-06, + "loss": 0.95051014, + "num_input_tokens_seen": 7029450, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.34375, + "step": 336, + "time_per_iteration": 2.526906967163086 + }, + { + "auxiliary_loss_clip": 0.01321744, + "auxiliary_loss_mlp": 0.01083167, + "balance_loss_clip": 1.04139614, + "balance_loss_mlp": 1.08352447, + "epoch": 0.020261536149105665, + "flos": 20266510636800.0, + "grad_norm": 1.825762702383362, + "language_loss": 0.88474333, + "learning_rate": 3.7472731028818428e-06, + "loss": 0.90879244, + "num_input_tokens_seen": 7047555, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.390625, + "step": 337, + "time_per_iteration": 2.5151607990264893 + }, + { + "auxiliary_loss_clip": 0.01310297, + "auxiliary_loss_mlp": 0.01101804, + "balance_loss_clip": 1.05836427, + "balance_loss_mlp": 1.08001363, + "epoch": 0.020321659401773638, + "flos": 25848752307840.0, + "grad_norm": 1.5415234153999902, + "language_loss": 0.89914495, + "learning_rate": 3.7491808144508626e-06, + "loss": 0.92326593, + "num_input_tokens_seen": 7068185, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.3125, + "step": 338, + "time_per_iteration": 2.5795979499816895 + }, + { + "auxiliary_loss_clip": 0.01324391, + "auxiliary_loss_mlp": 0.01099602, + "balance_loss_clip": 1.05742574, + "balance_loss_mlp": 1.08479571, + "epoch": 0.020381782654441606, + "flos": 17495033051520.0, + "grad_norm": 2.047046576054304, + "language_loss": 0.84801471, + "learning_rate": 3.7510828902315576e-06, + "loss": 0.87225461, + "num_input_tokens_seen": 7085955, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.40625, + "step": 339, + "time_per_iteration": 2.4558403491973877 + }, + { + "auxiliary_loss_clip": 0.01326609, + "auxiliary_loss_mlp": 0.01093427, + "balance_loss_clip": 1.05001152, + "balance_loss_mlp": 1.08709431, + "epoch": 0.020441905907109575, + "flos": 24243940465920.0, + "grad_norm": 1.7544231793273473, + "language_loss": 0.88913274, + "learning_rate": 3.75297936342452e-06, + "loss": 0.91333312, + "num_input_tokens_seen": 7106345, + "router_z_loss_clip": 0.43359375, + "router_z_loss_mlp": 2.40625, + "step": 340, + "time_per_iteration": 2.5330188274383545 + }, + { + "auxiliary_loss_clip": 0.01323557, + "auxiliary_loss_mlp": 0.01086176, + "balance_loss_clip": 1.04135346, + "balance_loss_mlp": 1.0859195, + "epoch": 0.020502029159777543, + "flos": 22233301787520.0, + "grad_norm": 2.2340783182785975, + "language_loss": 0.88071406, + "learning_rate": 3.7548702669378253e-06, + "loss": 0.90481138, + "num_input_tokens_seen": 7125070, + "router_z_loss_clip": 0.44921875, + "router_z_loss_mlp": 2.375, + "step": 341, + "time_per_iteration": 2.502161979675293 + }, + { + "auxiliary_loss_clip": 0.01325847, + "auxiliary_loss_mlp": 0.01099304, + "balance_loss_clip": 1.05643678, + "balance_loss_mlp": 1.08389783, + "epoch": 0.020562152412445512, + "flos": 23987861429760.0, + "grad_norm": 3.2005009235922572, + "language_loss": 0.80293322, + "learning_rate": 3.756755633390458e-06, + "loss": 0.82718468, + "num_input_tokens_seen": 7144675, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.421875, + "step": 342, + "time_per_iteration": 2.5315535068511963 + }, + { + "auxiliary_loss_clip": 0.0131301, + "auxiliary_loss_mlp": 0.01098615, + "balance_loss_clip": 1.05293417, + "balance_loss_mlp": 1.08132875, + "epoch": 0.020622275665113484, + "flos": 26975305537920.0, + "grad_norm": 2.399130254204822, + "language_loss": 0.89451253, + "learning_rate": 3.7586354951156886e-06, + "loss": 0.91862881, + "num_input_tokens_seen": 7165505, + "router_z_loss_clip": 0.45703125, + "router_z_loss_mlp": 2.3125, + "step": 343, + "time_per_iteration": 2.554255485534668 + }, + { + "auxiliary_loss_clip": 0.01325104, + "auxiliary_loss_mlp": 0.01094315, + "balance_loss_clip": 1.05342627, + "balance_loss_mlp": 1.08973229, + "epoch": 0.020682398917781453, + "flos": 22600704049920.0, + "grad_norm": 2.3234219523507296, + "language_loss": 0.78252918, + "learning_rate": 3.7605098841644e-06, + "loss": 0.80672336, + "num_input_tokens_seen": 7184605, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.359375, + "step": 344, + "time_per_iteration": 2.514665365219116 + }, + { + "auxiliary_loss_clip": 0.01309596, + "auxiliary_loss_mlp": 0.0110098, + "balance_loss_clip": 1.05730188, + "balance_loss_mlp": 1.08079529, + "epoch": 0.02074252217044942, + "flos": 15013605790080.0, + "grad_norm": 1.8371023099908983, + "language_loss": 0.75138956, + "learning_rate": 3.7623788323083666e-06, + "loss": 0.77549529, + "num_input_tokens_seen": 7203065, + "router_z_loss_clip": 0.4375, + "router_z_loss_mlp": 2.28125, + "step": 345, + "time_per_iteration": 2.513394594192505 + }, + { + "auxiliary_loss_clip": 0.01318525, + "auxiliary_loss_mlp": 0.01101003, + "balance_loss_clip": 1.05806339, + "balance_loss_mlp": 1.08789146, + "epoch": 0.02080264542311739, + "flos": 25337958952320.0, + "grad_norm": 2.0741733748571565, + "language_loss": 0.90269232, + "learning_rate": 3.7642423710434837e-06, + "loss": 0.92688763, + "num_input_tokens_seen": 7222995, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.3125, + "step": 346, + "time_per_iteration": 2.5487060546875 + }, + { + "auxiliary_loss_clip": 0.01314255, + "auxiliary_loss_mlp": 0.01095048, + "balance_loss_clip": 1.05527973, + "balance_loss_mlp": 1.08358788, + "epoch": 0.02086276867578536, + "flos": 24388804016640.0, + "grad_norm": 2.0766581400667, + "language_loss": 0.78869188, + "learning_rate": 3.7661005315929563e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 7244625, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.3125, + "step": 347, + "time_per_iteration": 2.516402006149292 + }, + { + "auxiliary_loss_clip": 0.01317315, + "auxiliary_loss_mlp": 0.01096912, + "balance_loss_clip": 1.05335259, + "balance_loss_mlp": 1.08719826, + "epoch": 0.02092289192845333, + "flos": 24462205459200.0, + "grad_norm": 2.4234628631287927, + "language_loss": 0.71424043, + "learning_rate": 3.7679533449104354e-06, + "loss": 0.7383827, + "num_input_tokens_seen": 7263255, + "router_z_loss_clip": 0.43554688, + "router_z_loss_mlp": 2.3125, + "step": 348, + "time_per_iteration": 2.5407540798187256 + }, + { + "auxiliary_loss_clip": 0.01319638, + "auxiliary_loss_mlp": 0.01101899, + "balance_loss_clip": 1.0595324, + "balance_loss_mlp": 1.08435416, + "epoch": 0.0209830151811213, + "flos": 17451185523840.0, + "grad_norm": 4.002924557181807, + "language_loss": 0.76819432, + "learning_rate": 3.7698008416831116e-06, + "loss": 0.79240972, + "num_input_tokens_seen": 7279275, + "router_z_loss_clip": 0.42382812, + "router_z_loss_mlp": 2.34375, + "step": 349, + "time_per_iteration": 2.4884049892425537 + }, + { + "auxiliary_loss_clip": 0.0130292, + "auxiliary_loss_mlp": 0.0109884, + "balance_loss_clip": 1.05792725, + "balance_loss_mlp": 1.08141851, + "epoch": 0.021043138433789268, + "flos": 24573995562240.0, + "grad_norm": 1.9115672624672835, + "language_loss": 0.85271406, + "learning_rate": 3.7716430523347664e-06, + "loss": 0.87673163, + "num_input_tokens_seen": 7300180, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 350, + "time_per_iteration": 2.559812307357788 + }, + { + "auxiliary_loss_clip": 0.01311162, + "auxiliary_loss_mlp": 0.01089483, + "balance_loss_clip": 1.05083585, + "balance_loss_mlp": 1.08571863, + "epoch": 0.021103261686457236, + "flos": 24454053072000.0, + "grad_norm": 2.3355222976898764, + "language_loss": 0.80104828, + "learning_rate": 3.773480007028776e-06, + "loss": 0.82505476, + "num_input_tokens_seen": 7317430, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.25, + "step": 351, + "time_per_iteration": 5.468780517578125 + }, + { + "auxiliary_loss_clip": 0.01318524, + "auxiliary_loss_mlp": 0.01103443, + "balance_loss_clip": 1.06048024, + "balance_loss_mlp": 1.08623564, + "epoch": 0.021163384939125205, + "flos": 14683083816960.0, + "grad_norm": 3.8473493260702125, + "language_loss": 0.87258279, + "learning_rate": 3.775311735671078e-06, + "loss": 0.89680254, + "num_input_tokens_seen": 7334875, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 352, + "time_per_iteration": 2.4787278175354004 + }, + { + "auxiliary_loss_clip": 0.01312545, + "auxiliary_loss_mlp": 0.01105111, + "balance_loss_clip": 1.06248152, + "balance_loss_mlp": 1.08574009, + "epoch": 0.021223508191793177, + "flos": 24493195918080.0, + "grad_norm": 1.8920106465676412, + "language_loss": 0.82386625, + "learning_rate": 3.7771382679130878e-06, + "loss": 0.84804279, + "num_input_tokens_seen": 7355185, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.265625, + "step": 353, + "time_per_iteration": 2.5428433418273926 + }, + { + "auxiliary_loss_clip": 0.01307832, + "auxiliary_loss_mlp": 0.01091814, + "balance_loss_clip": 1.05133069, + "balance_loss_mlp": 1.08353949, + "epoch": 0.021283631444461146, + "flos": 24126978804480.0, + "grad_norm": 2.0636001035279694, + "language_loss": 0.8102631, + "learning_rate": 3.7789596331545845e-06, + "loss": 0.83425963, + "num_input_tokens_seen": 7374425, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.25, + "step": 354, + "time_per_iteration": 2.5261166095733643 + }, + { + "auxiliary_loss_clip": 0.01315043, + "auxiliary_loss_mlp": 0.01092413, + "balance_loss_clip": 1.04935455, + "balance_loss_mlp": 1.08190715, + "epoch": 0.021343754697129114, + "flos": 25192233475200.0, + "grad_norm": 2.8065821662627575, + "language_loss": 0.80764574, + "learning_rate": 3.780775860546545e-06, + "loss": 0.83172029, + "num_input_tokens_seen": 7394175, + "router_z_loss_clip": 0.4296875, + "router_z_loss_mlp": 2.328125, + "step": 355, + "time_per_iteration": 2.56968355178833 + }, + { + "auxiliary_loss_clip": 0.01310125, + "auxiliary_loss_mlp": 0.01086869, + "balance_loss_clip": 1.0454793, + "balance_loss_mlp": 1.08140039, + "epoch": 0.021403877949797083, + "flos": 17274182279040.0, + "grad_norm": 2.2488803729957, + "language_loss": 0.89553398, + "learning_rate": 3.7825869789939474e-06, + "loss": 0.91950381, + "num_input_tokens_seen": 7412645, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 356, + "time_per_iteration": 2.5510213375091553 + }, + { + "auxiliary_loss_clip": 0.01308646, + "auxiliary_loss_mlp": 0.01083372, + "balance_loss_clip": 1.04117227, + "balance_loss_mlp": 1.08451605, + "epoch": 0.021464001202465055, + "flos": 30917435276160.0, + "grad_norm": 2.7055681522526522, + "language_loss": 0.80032516, + "learning_rate": 3.784393017158528e-06, + "loss": 0.82424533, + "num_input_tokens_seen": 7432275, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.234375, + "step": 357, + "time_per_iteration": 2.5834848880767822 + }, + { + "auxiliary_loss_clip": 0.01311386, + "auxiliary_loss_mlp": 0.0108216, + "balance_loss_clip": 1.04336917, + "balance_loss_mlp": 1.08195996, + "epoch": 0.021524124455133024, + "flos": 18186385098240.0, + "grad_norm": 2.3810225918991827, + "language_loss": 0.7661376, + "learning_rate": 3.786194003461506e-06, + "loss": 0.7900731, + "num_input_tokens_seen": 7450245, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.296875, + "step": 358, + "time_per_iteration": 2.4937031269073486 + }, + { + "auxiliary_loss_clip": 0.01308618, + "auxiliary_loss_mlp": 0.01088514, + "balance_loss_clip": 1.04574156, + "balance_loss_mlp": 1.08024073, + "epoch": 0.021584247707800992, + "flos": 13805786039040.0, + "grad_norm": 3.004949550769694, + "language_loss": 0.88491321, + "learning_rate": 3.787989966086264e-06, + "loss": 0.90888453, + "num_input_tokens_seen": 7466845, + "router_z_loss_clip": 0.42773438, + "router_z_loss_mlp": 2.28125, + "step": 359, + "time_per_iteration": 2.452698230743408 + }, + { + "auxiliary_loss_clip": 0.01316066, + "auxiliary_loss_mlp": 0.01089057, + "balance_loss_clip": 1.05000377, + "balance_loss_mlp": 1.08438587, + "epoch": 0.02164437096046896, + "flos": 23294713703040.0, + "grad_norm": 2.789884231725057, + "language_loss": 0.76007903, + "learning_rate": 3.789780932980997e-06, + "loss": 0.78413033, + "num_input_tokens_seen": 7485450, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.3125, + "step": 360, + "time_per_iteration": 2.490006685256958 + }, + { + "auxiliary_loss_clip": 0.01189834, + "auxiliary_loss_mlp": 0.010797, + "balance_loss_clip": 1.06634831, + "balance_loss_mlp": 1.06162107, + "epoch": 0.02170449421313693, + "flos": 68899578341760.0, + "grad_norm": 0.8685264055585812, + "language_loss": 0.64943242, + "learning_rate": 3.79156693186132e-06, + "loss": 0.67212784, + "num_input_tokens_seen": 7553780, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 1.28125, + "step": 361, + "time_per_iteration": 3.1978280544281006 + }, + { + "auxiliary_loss_clip": 0.01307066, + "auxiliary_loss_mlp": 0.01088482, + "balance_loss_clip": 1.04826093, + "balance_loss_mlp": 1.0776422, + "epoch": 0.0217646174658049, + "flos": 25228539146880.0, + "grad_norm": 2.6839093883440213, + "language_loss": 0.78157276, + "learning_rate": 3.7933479902128433e-06, + "loss": 0.80552828, + "num_input_tokens_seen": 7574155, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.296875, + "step": 362, + "time_per_iteration": 2.5401153564453125 + }, + { + "auxiliary_loss_clip": 0.0131339, + "auxiliary_loss_mlp": 0.01092034, + "balance_loss_clip": 1.05171776, + "balance_loss_mlp": 1.08265781, + "epoch": 0.02182474071847287, + "flos": 22893124671360.0, + "grad_norm": 2.163466714708112, + "language_loss": 0.92508751, + "learning_rate": 3.7951241352937077e-06, + "loss": 0.94914174, + "num_input_tokens_seen": 7592320, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.3125, + "step": 363, + "time_per_iteration": 2.4868171215057373 + }, + { + "auxiliary_loss_clip": 0.01307593, + "auxiliary_loss_mlp": 0.01102168, + "balance_loss_clip": 1.06270981, + "balance_loss_mlp": 1.08121252, + "epoch": 0.02188486397114084, + "flos": 23658991482240.0, + "grad_norm": 2.137373361500905, + "language_loss": 0.89611077, + "learning_rate": 3.7968953941370915e-06, + "loss": 0.92020839, + "num_input_tokens_seen": 7611185, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 364, + "time_per_iteration": 2.5251975059509277 + }, + { + "auxiliary_loss_clip": 0.01313873, + "auxiliary_loss_mlp": 0.01094072, + "balance_loss_clip": 1.05232477, + "balance_loss_mlp": 1.08512843, + "epoch": 0.021944987223808807, + "flos": 21543637680000.0, + "grad_norm": 2.0040846596101867, + "language_loss": 0.79597497, + "learning_rate": 3.798661793553676e-06, + "loss": 0.82005441, + "num_input_tokens_seen": 7631970, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.28125, + "step": 365, + "time_per_iteration": 2.5358779430389404 + }, + { + "auxiliary_loss_clip": 0.01307321, + "auxiliary_loss_mlp": 0.01094001, + "balance_loss_clip": 1.05218291, + "balance_loss_mlp": 1.08262253, + "epoch": 0.022005110476476776, + "flos": 16070887641600.0, + "grad_norm": 2.4198695758814126, + "language_loss": 0.84312123, + "learning_rate": 3.8004233601340808e-06, + "loss": 0.86713445, + "num_input_tokens_seen": 7649745, + "router_z_loss_clip": 0.41796875, + "router_z_loss_mlp": 2.25, + "step": 366, + "time_per_iteration": 2.4834306240081787 + }, + { + "auxiliary_loss_clip": 0.01314411, + "auxiliary_loss_mlp": 0.01089093, + "balance_loss_clip": 1.05008757, + "balance_loss_mlp": 1.08409071, + "epoch": 0.022065233729144748, + "flos": 21433715084160.0, + "grad_norm": 2.4790438398014114, + "language_loss": 0.87009263, + "learning_rate": 3.8021801202512694e-06, + "loss": 0.89412761, + "num_input_tokens_seen": 7668830, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.296875, + "step": 367, + "time_per_iteration": 2.486476421356201 + }, + { + "auxiliary_loss_clip": 0.01315695, + "auxiliary_loss_mlp": 0.01094559, + "balance_loss_clip": 1.05247772, + "balance_loss_mlp": 1.08183074, + "epoch": 0.022125356981812717, + "flos": 21543709507200.0, + "grad_norm": 3.1787846704720906, + "language_loss": 0.84725291, + "learning_rate": 3.803932100062912e-06, + "loss": 0.87135541, + "num_input_tokens_seen": 7687240, + "router_z_loss_clip": 0.421875, + "router_z_loss_mlp": 2.34375, + "step": 368, + "time_per_iteration": 2.522035837173462 + }, + { + "auxiliary_loss_clip": 0.01314671, + "auxiliary_loss_mlp": 0.01085486, + "balance_loss_clip": 1.04559815, + "balance_loss_mlp": 1.07997978, + "epoch": 0.022185480234480685, + "flos": 20704153944960.0, + "grad_norm": 3.205334425353566, + "language_loss": 0.75328851, + "learning_rate": 3.8056793255137264e-06, + "loss": 0.77728999, + "num_input_tokens_seen": 7704440, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.34375, + "step": 369, + "time_per_iteration": 2.5247385501861572 + }, + { + "auxiliary_loss_clip": 0.01309465, + "auxiliary_loss_mlp": 0.01102826, + "balance_loss_clip": 1.06241453, + "balance_loss_mlp": 1.08204889, + "epoch": 0.022245603487148654, + "flos": 25193203142400.0, + "grad_norm": 2.195001895084689, + "language_loss": 0.82444763, + "learning_rate": 3.8074218223377844e-06, + "loss": 0.84857059, + "num_input_tokens_seen": 7727160, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.28125, + "step": 370, + "time_per_iteration": 2.556654453277588 + }, + { + "auxiliary_loss_clip": 0.01308477, + "auxiliary_loss_mlp": 0.01102256, + "balance_loss_clip": 1.06186807, + "balance_loss_mlp": 1.08148122, + "epoch": 0.022305726739816623, + "flos": 21395936954880.0, + "grad_norm": 1.701167396379405, + "language_loss": 0.81576145, + "learning_rate": 3.8091596160607834e-06, + "loss": 0.83986878, + "num_input_tokens_seen": 7747730, + "router_z_loss_clip": 0.40429688, + "router_z_loss_mlp": 2.265625, + "step": 371, + "time_per_iteration": 2.5303707122802734 + }, + { + "auxiliary_loss_clip": 0.01313813, + "auxiliary_loss_mlp": 0.01097647, + "balance_loss_clip": 1.05611479, + "balance_loss_mlp": 1.08685589, + "epoch": 0.022365849992484595, + "flos": 22492146170880.0, + "grad_norm": 2.421527930745161, + "language_loss": 0.83273733, + "learning_rate": 3.8108927320022896e-06, + "loss": 0.85685182, + "num_input_tokens_seen": 7766765, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.28125, + "step": 372, + "time_per_iteration": 2.528141975402832 + }, + { + "auxiliary_loss_clip": 0.01304775, + "auxiliary_loss_mlp": 0.01093239, + "balance_loss_clip": 1.05170679, + "balance_loss_mlp": 1.08068216, + "epoch": 0.022425973245152563, + "flos": 17856581397120.0, + "grad_norm": 3.9515576064335742, + "language_loss": 0.78448784, + "learning_rate": 3.8126211952779548e-06, + "loss": 0.80846798, + "num_input_tokens_seen": 7784010, + "router_z_loss_clip": 0.41601562, + "router_z_loss_mlp": 2.234375, + "step": 373, + "time_per_iteration": 2.4879236221313477 + }, + { + "auxiliary_loss_clip": 0.01310159, + "auxiliary_loss_mlp": 0.01088775, + "balance_loss_clip": 1.04681301, + "balance_loss_mlp": 1.08387947, + "epoch": 0.022486096497820532, + "flos": 15483029656320.0, + "grad_norm": 2.577150517784044, + "language_loss": 0.77507353, + "learning_rate": 3.8143450308016952e-06, + "loss": 0.79906291, + "num_input_tokens_seen": 7801305, + "router_z_loss_clip": 0.41992188, + "router_z_loss_mlp": 2.265625, + "step": 374, + "time_per_iteration": 2.467660665512085 + }, + { + "auxiliary_loss_clip": 0.01300907, + "auxiliary_loss_mlp": 0.01075524, + "balance_loss_clip": 1.03415811, + "balance_loss_mlp": 1.07458413, + "epoch": 0.0225462197504885, + "flos": 27784157950080.0, + "grad_norm": 2.1361288872426187, + "language_loss": 0.85989249, + "learning_rate": 3.8160642632878525e-06, + "loss": 0.8836568, + "num_input_tokens_seen": 7823965, + "router_z_loss_clip": 0.4140625, + "router_z_loss_mlp": 2.265625, + "step": 375, + "time_per_iteration": 2.555748224258423 + }, + { + "auxiliary_loss_clip": 0.01307901, + "auxiliary_loss_mlp": 0.01100092, + "balance_loss_clip": 1.05767775, + "balance_loss_mlp": 1.08341241, + "epoch": 0.02260634300315647, + "flos": 19975490645760.0, + "grad_norm": 5.5735447387306785, + "language_loss": 0.89170349, + "learning_rate": 3.817778917253314e-06, + "loss": 0.91578341, + "num_input_tokens_seen": 7842115, + "router_z_loss_clip": 0.42578125, + "router_z_loss_mlp": 2.25, + "step": 376, + "time_per_iteration": 2.53151798248291 + }, + { + "auxiliary_loss_clip": 0.01309113, + "auxiliary_loss_mlp": 0.01087831, + "balance_loss_clip": 1.04908752, + "balance_loss_mlp": 1.07899499, + "epoch": 0.02266646625582444, + "flos": 16028189349120.0, + "grad_norm": 4.261190841992283, + "language_loss": 0.74947262, + "learning_rate": 3.8194890170196155e-06, + "loss": 0.77344215, + "num_input_tokens_seen": 7857830, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.3125, + "step": 377, + "time_per_iteration": 2.463115692138672 + }, + { + "auxiliary_loss_clip": 0.0129987, + "auxiliary_loss_mlp": 0.01091273, + "balance_loss_clip": 1.05009794, + "balance_loss_mlp": 1.08131123, + "epoch": 0.02272658950849241, + "flos": 20404622430720.0, + "grad_norm": 9.398931100052017, + "language_loss": 0.99195766, + "learning_rate": 3.8211945867150055e-06, + "loss": 1.01586914, + "num_input_tokens_seen": 7875840, + "router_z_loss_clip": 0.41210938, + "router_z_loss_mlp": 2.1875, + "step": 378, + "time_per_iteration": 2.4765851497650146 + }, + { + "auxiliary_loss_clip": 0.01180245, + "auxiliary_loss_mlp": 0.0112236, + "balance_loss_clip": 1.10910404, + "balance_loss_mlp": 1.06006432, + "epoch": 0.02278671276116038, + "flos": 69847332647040.0, + "grad_norm": 0.9843357397114052, + "language_loss": 0.75457036, + "learning_rate": 3.822895650276492e-06, + "loss": 0.77759647, + "num_input_tokens_seen": 7940190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.203125, + "step": 379, + "time_per_iteration": 3.113067388534546 + }, + { + "auxiliary_loss_clip": 0.01308809, + "auxiliary_loss_mlp": 0.01083458, + "balance_loss_clip": 1.0448581, + "balance_loss_mlp": 1.07811105, + "epoch": 0.022846836013828347, + "flos": 38508771340800.0, + "grad_norm": 4.195302770466088, + "language_loss": 0.78423429, + "learning_rate": 3.824592231451859e-06, + "loss": 0.80815697, + "num_input_tokens_seen": 7960840, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.3125, + "step": 380, + "time_per_iteration": 2.6457204818725586 + }, + { + "auxiliary_loss_clip": 0.01302565, + "auxiliary_loss_mlp": 0.01083038, + "balance_loss_clip": 1.04527259, + "balance_loss_mlp": 1.08019924, + "epoch": 0.02290695926649632, + "flos": 20959478795520.0, + "grad_norm": 2.272240555091753, + "language_loss": 0.9679752, + "learning_rate": 3.826284353801652e-06, + "loss": 0.99183118, + "num_input_tokens_seen": 7975500, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.21875, + "step": 381, + "time_per_iteration": 2.485316038131714 + }, + { + "auxiliary_loss_clip": 0.01312325, + "auxiliary_loss_mlp": 0.01084569, + "balance_loss_clip": 1.04501581, + "balance_loss_mlp": 1.08177519, + "epoch": 0.022967082519164288, + "flos": 24022407335040.0, + "grad_norm": 2.322972014312181, + "language_loss": 0.88035834, + "learning_rate": 3.827972040701142e-06, + "loss": 0.90432727, + "num_input_tokens_seen": 7993880, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.3125, + "step": 382, + "time_per_iteration": 2.5361156463623047 + }, + { + "auxiliary_loss_clip": 0.01306631, + "auxiliary_loss_mlp": 0.01099641, + "balance_loss_clip": 1.06080246, + "balance_loss_mlp": 1.08242524, + "epoch": 0.023027205771832256, + "flos": 20997149184000.0, + "grad_norm": 2.197151340607638, + "language_loss": 0.84830511, + "learning_rate": 3.829655315342268e-06, + "loss": 0.87236774, + "num_input_tokens_seen": 8012730, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.25, + "step": 383, + "time_per_iteration": 2.479097843170166 + }, + { + "auxiliary_loss_clip": 0.01303681, + "auxiliary_loss_mlp": 0.01106386, + "balance_loss_clip": 1.06673658, + "balance_loss_mlp": 1.08259249, + "epoch": 0.023087329024500225, + "flos": 21360816432000.0, + "grad_norm": 2.2992198386883116, + "language_loss": 0.83199835, + "learning_rate": 3.831334200735543e-06, + "loss": 0.85609907, + "num_input_tokens_seen": 8031275, + "router_z_loss_clip": 0.39648438, + "router_z_loss_mlp": 2.203125, + "step": 384, + "time_per_iteration": 2.5008413791656494 + }, + { + "auxiliary_loss_clip": 0.01303616, + "auxiliary_loss_mlp": 0.0109643, + "balance_loss_clip": 1.06030965, + "balance_loss_mlp": 1.08539534, + "epoch": 0.023147452277168194, + "flos": 21872435800320.0, + "grad_norm": 1.8570399395654076, + "language_loss": 0.89240694, + "learning_rate": 3.8330087197119426e-06, + "loss": 0.91640741, + "num_input_tokens_seen": 8051600, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.1875, + "step": 385, + "time_per_iteration": 2.4913859367370605 + }, + { + "auxiliary_loss_clip": 0.01306859, + "auxiliary_loss_mlp": 0.01121647, + "balance_loss_clip": 1.08397639, + "balance_loss_mlp": 1.0826149, + "epoch": 0.023207575529836166, + "flos": 18916700423040.0, + "grad_norm": 2.2576284783670357, + "language_loss": 0.70096415, + "learning_rate": 3.83467889492477e-06, + "loss": 0.72524917, + "num_input_tokens_seen": 8070600, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.234375, + "step": 386, + "time_per_iteration": 2.5017154216766357 + }, + { + "auxiliary_loss_clip": 0.01308067, + "auxiliary_loss_mlp": 0.01098351, + "balance_loss_clip": 1.06072879, + "balance_loss_mlp": 1.08460176, + "epoch": 0.023267698782504134, + "flos": 25046005207680.0, + "grad_norm": 1.9470877788533054, + "language_loss": 0.87909782, + "learning_rate": 3.836344748851495e-06, + "loss": 0.90316188, + "num_input_tokens_seen": 8090680, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.234375, + "step": 387, + "time_per_iteration": 2.5142157077789307 + }, + { + "auxiliary_loss_clip": 0.01308318, + "auxiliary_loss_mlp": 0.01085815, + "balance_loss_clip": 1.04666662, + "balance_loss_mlp": 1.08291698, + "epoch": 0.023327822035172103, + "flos": 28879217930880.0, + "grad_norm": 2.441105853176172, + "language_loss": 0.83429295, + "learning_rate": 3.838006303795566e-06, + "loss": 0.85823429, + "num_input_tokens_seen": 8114610, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.25, + "step": 388, + "time_per_iteration": 2.591242790222168 + }, + { + "auxiliary_loss_clip": 0.01305661, + "auxiliary_loss_mlp": 0.01094305, + "balance_loss_clip": 1.05754054, + "balance_loss_mlp": 1.08271885, + "epoch": 0.02338794528784007, + "flos": 27121533805440.0, + "grad_norm": 3.2646980282386644, + "language_loss": 0.93823689, + "learning_rate": 3.839663581888206e-06, + "loss": 0.96223652, + "num_input_tokens_seen": 8133975, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.21875, + "step": 389, + "time_per_iteration": 2.5427236557006836 + }, + { + "auxiliary_loss_clip": 0.01299094, + "auxiliary_loss_mlp": 0.01087693, + "balance_loss_clip": 1.04954624, + "balance_loss_mlp": 1.08334351, + "epoch": 0.02344806854050804, + "flos": 21322355944320.0, + "grad_norm": 2.08298220488583, + "language_loss": 0.87901413, + "learning_rate": 3.841316605090178e-06, + "loss": 0.90288198, + "num_input_tokens_seen": 8153570, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.15625, + "step": 390, + "time_per_iteration": 2.53519606590271 + }, + { + "auxiliary_loss_clip": 0.01304239, + "auxiliary_loss_mlp": 0.01095828, + "balance_loss_clip": 1.05927861, + "balance_loss_mlp": 1.08334053, + "epoch": 0.023508191793176012, + "flos": 24789997998720.0, + "grad_norm": 2.2293869448662362, + "language_loss": 0.89346433, + "learning_rate": 3.842965395193529e-06, + "loss": 0.91746497, + "num_input_tokens_seen": 8170075, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.203125, + "step": 391, + "time_per_iteration": 2.5662949085235596 + }, + { + "auxiliary_loss_clip": 0.01302453, + "auxiliary_loss_mlp": 0.0107275, + "balance_loss_clip": 1.03560483, + "balance_loss_mlp": 1.08116579, + "epoch": 0.02356831504584398, + "flos": 25995375624960.0, + "grad_norm": 2.022763227206087, + "language_loss": 0.86065882, + "learning_rate": 3.84460997382332e-06, + "loss": 0.88441086, + "num_input_tokens_seen": 8190420, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.21875, + "step": 392, + "time_per_iteration": 4.050429105758667 + }, + { + "auxiliary_loss_clip": 0.01297975, + "auxiliary_loss_mlp": 0.01086863, + "balance_loss_clip": 1.04990816, + "balance_loss_mlp": 1.08006191, + "epoch": 0.02362843829851195, + "flos": 19062461813760.0, + "grad_norm": 2.9628480690926318, + "language_loss": 0.88900077, + "learning_rate": 3.8462503624393256e-06, + "loss": 0.91284919, + "num_input_tokens_seen": 8208790, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.1875, + "step": 393, + "time_per_iteration": 3.9293932914733887 + }, + { + "auxiliary_loss_clip": 0.01309989, + "auxiliary_loss_mlp": 0.01103908, + "balance_loss_clip": 1.06449771, + "balance_loss_mlp": 1.087502, + "epoch": 0.023688561551179918, + "flos": 16071031296000.0, + "grad_norm": 2.0531375516435943, + "language_loss": 0.81400156, + "learning_rate": 3.84788658233771e-06, + "loss": 0.83814055, + "num_input_tokens_seen": 8226885, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.21875, + "step": 394, + "time_per_iteration": 2.552100658416748 + }, + { + "auxiliary_loss_clip": 0.01299653, + "auxiliary_loss_mlp": 0.01084647, + "balance_loss_clip": 1.04611897, + "balance_loss_mlp": 1.08043575, + "epoch": 0.023748684803847887, + "flos": 21724375939200.0, + "grad_norm": 2.0447414784698092, + "language_loss": 0.86189264, + "learning_rate": 3.84951865465269e-06, + "loss": 0.88573563, + "num_input_tokens_seen": 8246825, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.1875, + "step": 395, + "time_per_iteration": 2.536823272705078 + }, + { + "auxiliary_loss_clip": 0.01174527, + "auxiliary_loss_mlp": 0.01044608, + "balance_loss_clip": 1.03135228, + "balance_loss_mlp": 1.0590049, + "epoch": 0.02380880805651586, + "flos": 61926192881280.0, + "grad_norm": 0.9487784547172928, + "language_loss": 0.63808912, + "learning_rate": 3.851146600358172e-06, + "loss": 0.66028047, + "num_input_tokens_seen": 8302835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 1.15625, + "step": 396, + "time_per_iteration": 2.935506582260132 + }, + { + "auxiliary_loss_clip": 0.01296295, + "auxiliary_loss_mlp": 0.01069502, + "balance_loss_clip": 1.03252339, + "balance_loss_mlp": 1.07895613, + "epoch": 0.023868931309183827, + "flos": 20266331068800.0, + "grad_norm": 2.6168641306315172, + "language_loss": 0.83744055, + "learning_rate": 3.852770440269372e-06, + "loss": 0.86109853, + "num_input_tokens_seen": 8320745, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.171875, + "step": 397, + "time_per_iteration": 2.5051028728485107 + }, + { + "auxiliary_loss_clip": 0.01302535, + "auxiliary_loss_mlp": 0.01091033, + "balance_loss_clip": 1.05288601, + "balance_loss_mlp": 1.08300877, + "epoch": 0.023929054561851796, + "flos": 21139103733120.0, + "grad_norm": 2.535145802301163, + "language_loss": 0.84050488, + "learning_rate": 3.854390195044404e-06, + "loss": 0.86444056, + "num_input_tokens_seen": 8339540, + "router_z_loss_clip": 0.38085938, + "router_z_loss_mlp": 2.1875, + "step": 398, + "time_per_iteration": 2.5234646797180176 + }, + { + "auxiliary_loss_clip": 0.01300466, + "auxiliary_loss_mlp": 0.0108273, + "balance_loss_clip": 1.04427278, + "balance_loss_mlp": 1.07864475, + "epoch": 0.023989177814519765, + "flos": 13698521049600.0, + "grad_norm": 2.904470095612531, + "language_loss": 0.85865271, + "learning_rate": 3.856005885185868e-06, + "loss": 0.88248467, + "num_input_tokens_seen": 8354890, + "router_z_loss_clip": 0.38476562, + "router_z_loss_mlp": 2.21875, + "step": 399, + "time_per_iteration": 2.4674201011657715 + }, + { + "auxiliary_loss_clip": 0.01295496, + "auxiliary_loss_mlp": 0.01093118, + "balance_loss_clip": 1.05566239, + "balance_loss_mlp": 1.08021355, + "epoch": 0.024049301067187733, + "flos": 26322018929280.0, + "grad_norm": 2.016759933832732, + "language_loss": 0.86157769, + "learning_rate": 3.857617531042398e-06, + "loss": 0.88546383, + "num_input_tokens_seen": 8375845, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.15625, + "step": 400, + "time_per_iteration": 2.554075241088867 + }, + { + "auxiliary_loss_clip": 0.01303599, + "auxiliary_loss_mlp": 0.01083552, + "balance_loss_clip": 1.04652512, + "balance_loss_mlp": 1.0848943, + "epoch": 0.024109424319855705, + "flos": 24425432910720.0, + "grad_norm": 3.068890951588493, + "language_loss": 0.79142016, + "learning_rate": 3.8592251528102065e-06, + "loss": 0.8152917, + "num_input_tokens_seen": 8395240, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.1875, + "step": 401, + "time_per_iteration": 2.543750286102295 + }, + { + "auxiliary_loss_clip": 0.01297911, + "auxiliary_loss_mlp": 0.01096359, + "balance_loss_clip": 1.05968988, + "balance_loss_mlp": 1.07987046, + "epoch": 0.024169547572523674, + "flos": 29604397610880.0, + "grad_norm": 2.2009554384450154, + "language_loss": 0.78456193, + "learning_rate": 3.8608287705345976e-06, + "loss": 0.80850464, + "num_input_tokens_seen": 8416950, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.1875, + "step": 402, + "time_per_iteration": 2.5531415939331055 + }, + { + "auxiliary_loss_clip": 0.0130167, + "auxiliary_loss_mlp": 0.01084273, + "balance_loss_clip": 1.04529142, + "balance_loss_mlp": 1.07989287, + "epoch": 0.024229670825191642, + "flos": 22601458235520.0, + "grad_norm": 2.7198213535828923, + "language_loss": 0.94637424, + "learning_rate": 3.86242840411147e-06, + "loss": 0.97023368, + "num_input_tokens_seen": 8433660, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 403, + "time_per_iteration": 2.4873671531677246 + }, + { + "auxiliary_loss_clip": 0.01306025, + "auxiliary_loss_mlp": 0.01095616, + "balance_loss_clip": 1.05620587, + "balance_loss_mlp": 1.07952547, + "epoch": 0.02428979407785961, + "flos": 18150258994560.0, + "grad_norm": 2.3706875621243246, + "language_loss": 0.99751151, + "learning_rate": 3.864024073288798e-06, + "loss": 1.02152789, + "num_input_tokens_seen": 8450180, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.265625, + "step": 404, + "time_per_iteration": 2.5400550365448 + }, + { + "auxiliary_loss_clip": 0.01304501, + "auxiliary_loss_mlp": 0.01104455, + "balance_loss_clip": 1.06716657, + "balance_loss_mlp": 1.08213115, + "epoch": 0.024349917330527583, + "flos": 15304984917120.0, + "grad_norm": 2.480197457162756, + "language_loss": 0.87603909, + "learning_rate": 3.865615797668091e-06, + "loss": 0.90012866, + "num_input_tokens_seen": 8467775, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.21875, + "step": 405, + "time_per_iteration": 2.4698479175567627 + }, + { + "auxiliary_loss_clip": 0.01314075, + "auxiliary_loss_mlp": 0.01107285, + "balance_loss_clip": 1.06835127, + "balance_loss_mlp": 1.08775485, + "epoch": 0.024410040583195552, + "flos": 20773892200320.0, + "grad_norm": 3.242686201363518, + "language_loss": 0.93258083, + "learning_rate": 3.867203596705844e-06, + "loss": 0.9567945, + "num_input_tokens_seen": 8486765, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.265625, + "step": 406, + "time_per_iteration": 2.505598306655884 + }, + { + "auxiliary_loss_clip": 0.01305046, + "auxiliary_loss_mlp": 0.01092168, + "balance_loss_clip": 1.05330622, + "balance_loss_mlp": 1.08378315, + "epoch": 0.02447016383586352, + "flos": 21798854789760.0, + "grad_norm": 2.059728688773918, + "language_loss": 0.87446553, + "learning_rate": 3.86878748971496e-06, + "loss": 0.89843762, + "num_input_tokens_seen": 8506515, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.21875, + "step": 407, + "time_per_iteration": 2.5017173290252686 + }, + { + "auxiliary_loss_clip": 0.01301523, + "auxiliary_loss_mlp": 0.01085362, + "balance_loss_clip": 1.04814506, + "balance_loss_mlp": 1.08445001, + "epoch": 0.02453028708853149, + "flos": 33948116380800.0, + "grad_norm": 2.439524495250932, + "language_loss": 0.7404871, + "learning_rate": 3.8703674958661596e-06, + "loss": 0.76435596, + "num_input_tokens_seen": 8528035, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.171875, + "step": 408, + "time_per_iteration": 2.6097092628479004 + }, + { + "auxiliary_loss_clip": 0.013061, + "auxiliary_loss_mlp": 0.01096961, + "balance_loss_clip": 1.05771768, + "balance_loss_mlp": 1.08381224, + "epoch": 0.024590410341199458, + "flos": 21793000872960.0, + "grad_norm": 2.750776221383638, + "language_loss": 0.92393035, + "learning_rate": 3.871943634189376e-06, + "loss": 0.94796097, + "num_input_tokens_seen": 8546455, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.21875, + "step": 409, + "time_per_iteration": 2.5198304653167725 + }, + { + "auxiliary_loss_clip": 0.01304769, + "auxiliary_loss_mlp": 0.01080478, + "balance_loss_clip": 1.04488206, + "balance_loss_mlp": 1.0854609, + "epoch": 0.02465053359386743, + "flos": 35114782124160.0, + "grad_norm": 1.9763435283924244, + "language_loss": 0.82926536, + "learning_rate": 3.873515923575128e-06, + "loss": 0.85311788, + "num_input_tokens_seen": 8568450, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.1875, + "step": 410, + "time_per_iteration": 2.624333143234253 + }, + { + "auxiliary_loss_clip": 0.01307118, + "auxiliary_loss_mlp": 0.01089288, + "balance_loss_clip": 1.05164146, + "balance_loss_mlp": 1.08556843, + "epoch": 0.0247106568465354, + "flos": 27451409333760.0, + "grad_norm": 4.176812441051998, + "language_loss": 0.77715993, + "learning_rate": 3.875084382775879e-06, + "loss": 0.80112404, + "num_input_tokens_seen": 8589340, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.21875, + "step": 411, + "time_per_iteration": 2.571401596069336 + }, + { + "auxiliary_loss_clip": 0.01303549, + "auxiliary_loss_mlp": 0.01102238, + "balance_loss_clip": 1.06311393, + "balance_loss_mlp": 1.08078265, + "epoch": 0.024770780099203367, + "flos": 20703794808960.0, + "grad_norm": 2.1103060729449883, + "language_loss": 0.86276567, + "learning_rate": 3.87664903040738e-06, + "loss": 0.88682353, + "num_input_tokens_seen": 8607150, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.21875, + "step": 412, + "time_per_iteration": 2.4968833923339844 + }, + { + "auxiliary_loss_clip": 0.01168305, + "auxiliary_loss_mlp": 0.01068817, + "balance_loss_clip": 1.05632353, + "balance_loss_mlp": 1.05478358, + "epoch": 0.024830903351871336, + "flos": 69551859369600.0, + "grad_norm": 0.8568818905087673, + "language_loss": 0.58512402, + "learning_rate": 3.878209884949994e-06, + "loss": 0.60749531, + "num_input_tokens_seen": 8669865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 1.1328125, + "step": 413, + "time_per_iteration": 3.1763217449188232 + }, + { + "auxiliary_loss_clip": 0.01296528, + "auxiliary_loss_mlp": 0.01092205, + "balance_loss_clip": 1.05145931, + "balance_loss_mlp": 1.07941055, + "epoch": 0.024891026604539304, + "flos": 32270477713920.0, + "grad_norm": 1.7554792190049524, + "language_loss": 0.80704832, + "learning_rate": 3.879766964750006e-06, + "loss": 0.83093566, + "num_input_tokens_seen": 8690235, + "router_z_loss_clip": 0.40625, + "router_z_loss_mlp": 2.171875, + "step": 414, + "time_per_iteration": 2.5954627990722656 + }, + { + "auxiliary_loss_clip": 0.01292737, + "auxiliary_loss_mlp": 0.01093441, + "balance_loss_clip": 1.05660486, + "balance_loss_mlp": 1.07739186, + "epoch": 0.024951149857207276, + "flos": 18840282238080.0, + "grad_norm": 2.3796689224247904, + "language_loss": 0.80473328, + "learning_rate": 3.881320288020917e-06, + "loss": 0.82859504, + "num_input_tokens_seen": 8706295, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.15625, + "step": 415, + "time_per_iteration": 2.471665620803833 + }, + { + "auxiliary_loss_clip": 0.0131185, + "auxiliary_loss_mlp": 0.01085672, + "balance_loss_clip": 1.0481931, + "balance_loss_mlp": 1.08601356, + "epoch": 0.025011273109875245, + "flos": 15377201210880.0, + "grad_norm": 5.333540620494007, + "language_loss": 0.96179891, + "learning_rate": 3.882869872844723e-06, + "loss": 0.98577416, + "num_input_tokens_seen": 8724200, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.25, + "step": 416, + "time_per_iteration": 2.5133068561553955 + }, + { + "auxiliary_loss_clip": 0.01300197, + "auxiliary_loss_mlp": 0.01076153, + "balance_loss_clip": 1.03702867, + "balance_loss_mlp": 1.0806849, + "epoch": 0.025071396362543213, + "flos": 18915515274240.0, + "grad_norm": 2.409464042642492, + "language_loss": 0.77541196, + "learning_rate": 3.884415737173176e-06, + "loss": 0.79917544, + "num_input_tokens_seen": 8744170, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 417, + "time_per_iteration": 2.5206987857818604 + }, + { + "auxiliary_loss_clip": 0.01297091, + "auxiliary_loss_mlp": 0.01092626, + "balance_loss_clip": 1.05512297, + "balance_loss_mlp": 1.08281994, + "epoch": 0.025131519615211182, + "flos": 25337958952320.0, + "grad_norm": 1.6345521849457858, + "language_loss": 0.7689445, + "learning_rate": 3.8859578988290344e-06, + "loss": 0.79284167, + "num_input_tokens_seen": 8765120, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.140625, + "step": 418, + "time_per_iteration": 2.6002862453460693 + }, + { + "auxiliary_loss_clip": 0.01304842, + "auxiliary_loss_mlp": 0.01075451, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.08383846, + "epoch": 0.02519164286787915, + "flos": 18953149749120.0, + "grad_norm": 2.548681745998596, + "language_loss": 0.81088459, + "learning_rate": 3.887496375507294e-06, + "loss": 0.83468759, + "num_input_tokens_seen": 8783500, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.203125, + "step": 419, + "time_per_iteration": 2.5097553730010986 + }, + { + "auxiliary_loss_clip": 0.01298642, + "auxiliary_loss_mlp": 0.01085388, + "balance_loss_clip": 1.0453577, + "balance_loss_mlp": 1.08236253, + "epoch": 0.025251766120547123, + "flos": 17421092904960.0, + "grad_norm": 1.9166879875817555, + "language_loss": 0.73812175, + "learning_rate": 3.8890311847764065e-06, + "loss": 0.761962, + "num_input_tokens_seen": 8801175, + "router_z_loss_clip": 0.40039062, + "router_z_loss_mlp": 2.15625, + "step": 420, + "time_per_iteration": 2.480468511581421 + }, + { + "auxiliary_loss_clip": 0.01298409, + "auxiliary_loss_mlp": 0.01098321, + "balance_loss_clip": 1.06086528, + "balance_loss_mlp": 1.0791508, + "epoch": 0.02531188937321509, + "flos": 25045430590080.0, + "grad_norm": 1.7246544027149788, + "language_loss": 0.78928417, + "learning_rate": 3.890562344079484e-06, + "loss": 0.8132515, + "num_input_tokens_seen": 8820215, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.1875, + "step": 421, + "time_per_iteration": 2.583979845046997 + }, + { + "auxiliary_loss_clip": 0.01300301, + "auxiliary_loss_mlp": 0.01095113, + "balance_loss_clip": 1.05589294, + "balance_loss_mlp": 1.08374381, + "epoch": 0.02537201262588306, + "flos": 30592228515840.0, + "grad_norm": 2.879256315405443, + "language_loss": 0.81915486, + "learning_rate": 3.89208987073549e-06, + "loss": 0.84310895, + "num_input_tokens_seen": 8839660, + "router_z_loss_clip": 0.39257812, + "router_z_loss_mlp": 2.171875, + "step": 422, + "time_per_iteration": 2.5834591388702393 + }, + { + "auxiliary_loss_clip": 0.01299282, + "auxiliary_loss_mlp": 0.01079788, + "balance_loss_clip": 1.0445497, + "balance_loss_mlp": 1.07925105, + "epoch": 0.02543213587855103, + "flos": 26065365275520.0, + "grad_norm": 1.9426129656279463, + "language_loss": 0.83468062, + "learning_rate": 3.893613781940409e-06, + "loss": 0.85847133, + "num_input_tokens_seen": 8859280, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.203125, + "step": 423, + "time_per_iteration": 2.5526318550109863 + }, + { + "auxiliary_loss_clip": 0.01293361, + "auxiliary_loss_mlp": 0.01086715, + "balance_loss_clip": 1.04978371, + "balance_loss_mlp": 1.07668817, + "epoch": 0.025492259131218997, + "flos": 36022818965760.0, + "grad_norm": 2.7010989411926367, + "language_loss": 0.74435121, + "learning_rate": 3.895134094768415e-06, + "loss": 0.768152, + "num_input_tokens_seen": 8880560, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.171875, + "step": 424, + "time_per_iteration": 2.606895923614502 + }, + { + "auxiliary_loss_clip": 0.01303473, + "auxiliary_loss_mlp": 0.01097188, + "balance_loss_clip": 1.06113958, + "balance_loss_mlp": 1.08349586, + "epoch": 0.02555238238388697, + "flos": 18588045957120.0, + "grad_norm": 2.227147445366898, + "language_loss": 0.83008313, + "learning_rate": 3.896650826173015e-06, + "loss": 0.85408974, + "num_input_tokens_seen": 8899155, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.203125, + "step": 425, + "time_per_iteration": 2.522517442703247 + }, + { + "auxiliary_loss_clip": 0.01299491, + "auxiliary_loss_mlp": 0.01096328, + "balance_loss_clip": 1.05691719, + "balance_loss_mlp": 1.07528758, + "epoch": 0.025612505636554938, + "flos": 24243186280320.0, + "grad_norm": 2.394258070540652, + "language_loss": 0.85481966, + "learning_rate": 3.898163992988186e-06, + "loss": 0.87877786, + "num_input_tokens_seen": 8917890, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.25, + "step": 426, + "time_per_iteration": 2.5039095878601074 + }, + { + "auxiliary_loss_clip": 0.01160068, + "auxiliary_loss_mlp": 0.0104803, + "balance_loss_clip": 1.03663349, + "balance_loss_mlp": 1.04526472, + "epoch": 0.025672628889222907, + "flos": 60586941265920.0, + "grad_norm": 0.8962322500302954, + "language_loss": 0.57186544, + "learning_rate": 3.899673611929491e-06, + "loss": 0.5939464, + "num_input_tokens_seen": 8978260, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 1.1484375, + "step": 427, + "time_per_iteration": 3.2289342880249023 + }, + { + "auxiliary_loss_clip": 0.01297452, + "auxiliary_loss_mlp": 0.01095521, + "balance_loss_clip": 1.05849457, + "balance_loss_mlp": 1.0838623, + "epoch": 0.025732752141890875, + "flos": 19573255169280.0, + "grad_norm": 2.6536896946259816, + "language_loss": 0.88190198, + "learning_rate": 3.901179699595194e-06, + "loss": 0.90583158, + "num_input_tokens_seen": 8994460, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.125, + "step": 428, + "time_per_iteration": 2.500389814376831 + }, + { + "auxiliary_loss_clip": 0.01290417, + "auxiliary_loss_mlp": 0.01078869, + "balance_loss_clip": 1.03972101, + "balance_loss_mlp": 1.07718623, + "epoch": 0.025792875394558847, + "flos": 31284262920960.0, + "grad_norm": 1.6692033855414803, + "language_loss": 0.85672665, + "learning_rate": 3.902682272467353e-06, + "loss": 0.88041949, + "num_input_tokens_seen": 9016670, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.125, + "step": 429, + "time_per_iteration": 2.605687379837036 + }, + { + "auxiliary_loss_clip": 0.01297427, + "auxiliary_loss_mlp": 0.01082502, + "balance_loss_clip": 1.04373491, + "balance_loss_mlp": 1.07673144, + "epoch": 0.025852998647226816, + "flos": 32379610210560.0, + "grad_norm": 2.5023850128037672, + "language_loss": 0.88384748, + "learning_rate": 3.904181346912895e-06, + "loss": 0.90764678, + "num_input_tokens_seen": 9039720, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.203125, + "step": 430, + "time_per_iteration": 2.593492269515991 + }, + { + "auxiliary_loss_clip": 0.01298542, + "auxiliary_loss_mlp": 0.01083596, + "balance_loss_clip": 1.04799962, + "balance_loss_mlp": 1.08428442, + "epoch": 0.025913121899894784, + "flos": 20193288762240.0, + "grad_norm": 1.9811912271744876, + "language_loss": 0.84202254, + "learning_rate": 3.905676939184698e-06, + "loss": 0.86584389, + "num_input_tokens_seen": 9059850, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.140625, + "step": 431, + "time_per_iteration": 2.5326902866363525 + }, + { + "auxiliary_loss_clip": 0.01291302, + "auxiliary_loss_mlp": 0.01073914, + "balance_loss_clip": 1.03886628, + "balance_loss_mlp": 1.0772872, + "epoch": 0.025973245152562753, + "flos": 14720430983040.0, + "grad_norm": 2.686150654607635, + "language_loss": 0.86775959, + "learning_rate": 3.907169065422638e-06, + "loss": 0.89141178, + "num_input_tokens_seen": 9077590, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.140625, + "step": 432, + "time_per_iteration": 2.4793269634246826 + }, + { + "auxiliary_loss_clip": 0.01296964, + "auxiliary_loss_mlp": 0.01080084, + "balance_loss_clip": 1.04491723, + "balance_loss_mlp": 1.08109105, + "epoch": 0.02603336840523072, + "flos": 30992991534720.0, + "grad_norm": 2.6953453355349684, + "language_loss": 0.76074433, + "learning_rate": 3.908657741654636e-06, + "loss": 0.78451484, + "num_input_tokens_seen": 9099880, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.15625, + "step": 433, + "time_per_iteration": 2.6125545501708984 + }, + { + "auxiliary_loss_clip": 0.01296292, + "auxiliary_loss_mlp": 0.0109282, + "balance_loss_clip": 1.05312383, + "balance_loss_mlp": 1.07772529, + "epoch": 0.026093491657898694, + "flos": 17674262939520.0, + "grad_norm": 2.2540618473103247, + "language_loss": 0.89764363, + "learning_rate": 3.910142983797699e-06, + "loss": 0.92153478, + "num_input_tokens_seen": 9118620, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.1875, + "step": 434, + "time_per_iteration": 5.3097922801971436 + }, + { + "auxiliary_loss_clip": 0.01297376, + "auxiliary_loss_mlp": 0.01102904, + "balance_loss_clip": 1.06404209, + "balance_loss_mlp": 1.08362865, + "epoch": 0.026153614910566662, + "flos": 17857874286720.0, + "grad_norm": 6.328317132251919, + "language_loss": 0.7985189, + "learning_rate": 3.9116248076589305e-06, + "loss": 0.82252169, + "num_input_tokens_seen": 9135655, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 435, + "time_per_iteration": 3.9629530906677246 + }, + { + "auxiliary_loss_clip": 0.01291104, + "auxiliary_loss_mlp": 0.01091144, + "balance_loss_clip": 1.05316401, + "balance_loss_mlp": 1.0750463, + "epoch": 0.02621373816323463, + "flos": 20011113959040.0, + "grad_norm": 3.559504815450524, + "language_loss": 0.86357677, + "learning_rate": 3.913103228936546e-06, + "loss": 0.88739926, + "num_input_tokens_seen": 9153520, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.15625, + "step": 436, + "time_per_iteration": 2.479033946990967 + }, + { + "auxiliary_loss_clip": 0.01296325, + "auxiliary_loss_mlp": 0.01099771, + "balance_loss_clip": 1.06214869, + "balance_loss_mlp": 1.07964039, + "epoch": 0.0262738614159026, + "flos": 19281193683840.0, + "grad_norm": 2.6168892141891944, + "language_loss": 0.75002837, + "learning_rate": 3.914578263220868e-06, + "loss": 0.77398932, + "num_input_tokens_seen": 9170750, + "router_z_loss_clip": 0.37695312, + "router_z_loss_mlp": 2.171875, + "step": 437, + "time_per_iteration": 2.508769989013672 + }, + { + "auxiliary_loss_clip": 0.01293849, + "auxiliary_loss_mlp": 0.01104049, + "balance_loss_clip": 1.06380415, + "balance_loss_mlp": 1.08015561, + "epoch": 0.026333984668570568, + "flos": 18807208790400.0, + "grad_norm": 2.3031145987765758, + "language_loss": 0.91467845, + "learning_rate": 3.916049925995316e-06, + "loss": 0.93865746, + "num_input_tokens_seen": 9188430, + "router_z_loss_clip": 0.40234375, + "router_z_loss_mlp": 2.140625, + "step": 438, + "time_per_iteration": 2.4693844318389893 + }, + { + "auxiliary_loss_clip": 0.01155458, + "auxiliary_loss_mlp": 0.01064255, + "balance_loss_clip": 1.05276346, + "balance_loss_mlp": 1.0448494, + "epoch": 0.02639410792123854, + "flos": 64572020691840.0, + "grad_norm": 0.877669139368542, + "language_loss": 0.62577796, + "learning_rate": 3.917518232637377e-06, + "loss": 0.64797509, + "num_input_tokens_seen": 9255835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 1.109375, + "step": 439, + "time_per_iteration": 3.162259101867676 + }, + { + "auxiliary_loss_clip": 0.01303989, + "auxiliary_loss_mlp": 0.01098096, + "balance_loss_clip": 1.05873275, + "balance_loss_mlp": 1.08440769, + "epoch": 0.02645423117390651, + "flos": 28473462921600.0, + "grad_norm": 2.1384369611317493, + "language_loss": 0.75629139, + "learning_rate": 3.918983198419573e-06, + "loss": 0.78031218, + "num_input_tokens_seen": 9276835, + "router_z_loss_clip": 0.39453125, + "router_z_loss_mlp": 2.203125, + "step": 440, + "time_per_iteration": 2.5541677474975586 + }, + { + "auxiliary_loss_clip": 0.01294139, + "auxiliary_loss_mlp": 0.01082398, + "balance_loss_clip": 1.04408443, + "balance_loss_mlp": 1.08003163, + "epoch": 0.026514354426574478, + "flos": 18551237495040.0, + "grad_norm": 1.9583565981573345, + "language_loss": 0.83186466, + "learning_rate": 3.920444838510415e-06, + "loss": 0.85563004, + "num_input_tokens_seen": 9295075, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 441, + "time_per_iteration": 2.453705072402954 + }, + { + "auxiliary_loss_clip": 0.01298235, + "auxiliary_loss_mlp": 0.01092726, + "balance_loss_clip": 1.05286217, + "balance_loss_mlp": 1.07855892, + "epoch": 0.026574477679242446, + "flos": 20667812359680.0, + "grad_norm": 2.035076381127293, + "language_loss": 0.7850582, + "learning_rate": 3.92190316797534e-06, + "loss": 0.80896777, + "num_input_tokens_seen": 9314205, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.203125, + "step": 442, + "time_per_iteration": 2.477555990219116 + }, + { + "auxiliary_loss_clip": 0.01145517, + "auxiliary_loss_mlp": 0.01012445, + "balance_loss_clip": 1.00185919, + "balance_loss_mlp": 1.04045749, + "epoch": 0.026634600931910415, + "flos": 57956125340160.0, + "grad_norm": 0.9584767110468104, + "language_loss": 0.64475185, + "learning_rate": 3.92335820177765e-06, + "loss": 0.66633147, + "num_input_tokens_seen": 9367395, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 1.046875, + "step": 443, + "time_per_iteration": 2.9838714599609375 + }, + { + "auxiliary_loss_clip": 0.01297944, + "auxiliary_loss_mlp": 0.01087685, + "balance_loss_clip": 1.04941845, + "balance_loss_mlp": 1.08318424, + "epoch": 0.026694724184578387, + "flos": 15815131827840.0, + "grad_norm": 2.4335650573352483, + "language_loss": 0.82707053, + "learning_rate": 3.924809954779425e-06, + "loss": 0.85092688, + "num_input_tokens_seen": 9385185, + "router_z_loss_clip": 0.3828125, + "router_z_loss_mlp": 2.140625, + "step": 444, + "time_per_iteration": 2.4520323276519775 + }, + { + "auxiliary_loss_clip": 0.0130195, + "auxiliary_loss_mlp": 0.0108515, + "balance_loss_clip": 1.0440464, + "balance_loss_mlp": 1.08103406, + "epoch": 0.026754847437246355, + "flos": 23440259612160.0, + "grad_norm": 2.6903851096875733, + "language_loss": 0.95400113, + "learning_rate": 3.9262584417424425e-06, + "loss": 0.97787213, + "num_input_tokens_seen": 9403225, + "router_z_loss_clip": 0.41015625, + "router_z_loss_mlp": 2.21875, + "step": 445, + "time_per_iteration": 2.5113518238067627 + }, + { + "auxiliary_loss_clip": 0.01296406, + "auxiliary_loss_mlp": 0.01096489, + "balance_loss_clip": 1.05657816, + "balance_loss_mlp": 1.08177555, + "epoch": 0.026814970689914324, + "flos": 17341801632000.0, + "grad_norm": 2.416617421630428, + "language_loss": 0.91790259, + "learning_rate": 3.9277036773290725e-06, + "loss": 0.94183153, + "num_input_tokens_seen": 9420540, + "router_z_loss_clip": 0.3984375, + "router_z_loss_mlp": 2.15625, + "step": 446, + "time_per_iteration": 2.4585111141204834 + }, + { + "auxiliary_loss_clip": 0.01293099, + "auxiliary_loss_mlp": 0.01085762, + "balance_loss_clip": 1.04718637, + "balance_loss_mlp": 1.08102632, + "epoch": 0.026875093942582293, + "flos": 17894718662400.0, + "grad_norm": 2.3983095061811635, + "language_loss": 0.80024058, + "learning_rate": 3.92914567610317e-06, + "loss": 0.82402921, + "num_input_tokens_seen": 9438840, + "router_z_loss_clip": 0.38671875, + "router_z_loss_mlp": 2.125, + "step": 447, + "time_per_iteration": 2.509643316268921 + }, + { + "auxiliary_loss_clip": 0.01292768, + "auxiliary_loss_mlp": 0.01072511, + "balance_loss_clip": 1.03658175, + "balance_loss_mlp": 1.07935369, + "epoch": 0.026935217195250265, + "flos": 21723980889600.0, + "grad_norm": 2.4579217038825423, + "language_loss": 0.86773896, + "learning_rate": 3.930584452530952e-06, + "loss": 0.89139175, + "num_input_tokens_seen": 9457215, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 448, + "time_per_iteration": 2.477384328842163 + }, + { + "auxiliary_loss_clip": 0.01287268, + "auxiliary_loss_mlp": 0.01093327, + "balance_loss_clip": 1.0583508, + "balance_loss_mlp": 1.07870793, + "epoch": 0.026995340447918233, + "flos": 23622685810560.0, + "grad_norm": 2.1426472419274503, + "language_loss": 0.88779259, + "learning_rate": 3.9320200209818755e-06, + "loss": 0.91159856, + "num_input_tokens_seen": 9475615, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.078125, + "step": 449, + "time_per_iteration": 2.50108003616333 + }, + { + "auxiliary_loss_clip": 0.01298718, + "auxiliary_loss_mlp": 0.01087936, + "balance_loss_clip": 1.04897857, + "balance_loss_mlp": 1.08056545, + "epoch": 0.027055463700586202, + "flos": 17931275729280.0, + "grad_norm": 1.9975703664508544, + "language_loss": 0.80516291, + "learning_rate": 3.933452395729493e-06, + "loss": 0.82902944, + "num_input_tokens_seen": 9493975, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.1875, + "step": 450, + "time_per_iteration": 2.470536947250366 + }, + { + "auxiliary_loss_clip": 0.01291132, + "auxiliary_loss_mlp": 0.01077476, + "balance_loss_clip": 1.03973413, + "balance_loss_mlp": 1.08217299, + "epoch": 0.02711558695325417, + "flos": 25118903859840.0, + "grad_norm": 2.7768383062811637, + "language_loss": 0.81500483, + "learning_rate": 3.934881590952304e-06, + "loss": 0.83869088, + "num_input_tokens_seen": 9514810, + "router_z_loss_clip": 0.37890625, + "router_z_loss_mlp": 2.09375, + "step": 451, + "time_per_iteration": 2.530539035797119 + }, + { + "auxiliary_loss_clip": 0.01289442, + "auxiliary_loss_mlp": 0.0109125, + "balance_loss_clip": 1.0524354, + "balance_loss_mlp": 1.08151317, + "epoch": 0.02717571020592214, + "flos": 24239559006720.0, + "grad_norm": 1.5925691418309382, + "language_loss": 0.76994318, + "learning_rate": 3.936307620734599e-06, + "loss": 0.79375011, + "num_input_tokens_seen": 9533635, + "router_z_loss_clip": 0.38867188, + "router_z_loss_mlp": 2.078125, + "step": 452, + "time_per_iteration": 2.5138871669769287 + }, + { + "auxiliary_loss_clip": 0.01292925, + "auxiliary_loss_mlp": 0.01088314, + "balance_loss_clip": 1.0507158, + "balance_loss_mlp": 1.08201516, + "epoch": 0.02723583345859011, + "flos": 25118939773440.0, + "grad_norm": 1.9334646917545748, + "language_loss": 0.73053265, + "learning_rate": 3.937730499067294e-06, + "loss": 0.754345, + "num_input_tokens_seen": 9555420, + "router_z_loss_clip": 0.375, + "router_z_loss_mlp": 2.109375, + "step": 453, + "time_per_iteration": 2.5271401405334473 + }, + { + "auxiliary_loss_clip": 0.01288113, + "auxiliary_loss_mlp": 0.01086026, + "balance_loss_clip": 1.04952383, + "balance_loss_mlp": 1.08018303, + "epoch": 0.02729595671125808, + "flos": 42741597847680.0, + "grad_norm": 1.845498968311748, + "language_loss": 0.82439983, + "learning_rate": 3.939150239848748e-06, + "loss": 0.84814119, + "num_input_tokens_seen": 9578950, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 454, + "time_per_iteration": 2.6724069118499756 + }, + { + "auxiliary_loss_clip": 0.01290287, + "auxiliary_loss_mlp": 0.01078957, + "balance_loss_clip": 1.04491115, + "balance_loss_mlp": 1.0808264, + "epoch": 0.02735607996392605, + "flos": 21430985650560.0, + "grad_norm": 2.1414002490484005, + "language_loss": 0.75815403, + "learning_rate": 3.9405668568855866e-06, + "loss": 0.78184646, + "num_input_tokens_seen": 9598160, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 2.09375, + "step": 455, + "time_per_iteration": 2.496913194656372 + }, + { + "auxiliary_loss_clip": 0.01290624, + "auxiliary_loss_mlp": 0.01097119, + "balance_loss_clip": 1.06114161, + "balance_loss_mlp": 1.07846022, + "epoch": 0.027416203216594017, + "flos": 20851280052480.0, + "grad_norm": 2.102028743174525, + "language_loss": 0.80576169, + "learning_rate": 3.941980363893499e-06, + "loss": 0.82963914, + "num_input_tokens_seen": 9616010, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.125, + "step": 456, + "time_per_iteration": 2.4748263359069824 + }, + { + "auxiliary_loss_clip": 0.01286184, + "auxiliary_loss_mlp": 0.01078793, + "balance_loss_clip": 1.04152811, + "balance_loss_mlp": 1.07863176, + "epoch": 0.027476326469261986, + "flos": 13224500242560.0, + "grad_norm": 2.479828414472028, + "language_loss": 0.81621009, + "learning_rate": 3.9433907744980384e-06, + "loss": 0.83985978, + "num_input_tokens_seen": 9634000, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 457, + "time_per_iteration": 2.5122945308685303 + }, + { + "auxiliary_loss_clip": 0.01289671, + "auxiliary_loss_mlp": 0.01084101, + "balance_loss_clip": 1.04728937, + "balance_loss_mlp": 1.07828617, + "epoch": 0.027536449721929958, + "flos": 24024526237440.0, + "grad_norm": 2.0492464691581476, + "language_loss": 0.94062889, + "learning_rate": 3.944798102235412e-06, + "loss": 0.96436661, + "num_input_tokens_seen": 9653455, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.109375, + "step": 458, + "time_per_iteration": 2.542919874191284 + }, + { + "auxiliary_loss_clip": 0.01287914, + "auxiliary_loss_mlp": 0.01093849, + "balance_loss_clip": 1.05872989, + "balance_loss_mlp": 1.07926297, + "epoch": 0.027596572974597926, + "flos": 13006055681280.0, + "grad_norm": 2.4293190258203774, + "language_loss": 0.79353511, + "learning_rate": 3.9462023605532545e-06, + "loss": 0.81735277, + "num_input_tokens_seen": 9669650, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 2.09375, + "step": 459, + "time_per_iteration": 2.472830295562744 + }, + { + "auxiliary_loss_clip": 0.01293203, + "auxiliary_loss_mlp": 0.01082653, + "balance_loss_clip": 1.04360008, + "balance_loss_mlp": 1.08543491, + "epoch": 0.027656696227265895, + "flos": 26143076350080.0, + "grad_norm": 1.8472887331493792, + "language_loss": 0.83103061, + "learning_rate": 3.947603562811407e-06, + "loss": 0.85478914, + "num_input_tokens_seen": 9691415, + "router_z_loss_clip": 0.390625, + "router_z_loss_mlp": 2.078125, + "step": 460, + "time_per_iteration": 2.5376338958740234 + }, + { + "auxiliary_loss_clip": 0.01140517, + "auxiliary_loss_mlp": 0.01043703, + "balance_loss_clip": 1.03488147, + "balance_loss_mlp": 1.03798664, + "epoch": 0.027716819479933864, + "flos": 60697222997760.0, + "grad_norm": 1.5738760379538346, + "language_loss": 0.73565412, + "learning_rate": 3.949001722282675e-06, + "loss": 0.7574963, + "num_input_tokens_seen": 9755605, + "router_z_loss_clip": 0.08837891, + "router_z_loss_mlp": 1.0234375, + "step": 461, + "time_per_iteration": 3.0358285903930664 + }, + { + "auxiliary_loss_clip": 0.01289208, + "auxiliary_loss_mlp": 0.01081781, + "balance_loss_clip": 1.04735351, + "balance_loss_mlp": 1.086905, + "epoch": 0.027776942732601832, + "flos": 31211938886400.0, + "grad_norm": 2.85425781388422, + "language_loss": 0.81291741, + "learning_rate": 3.950396852153582e-06, + "loss": 0.83662736, + "num_input_tokens_seen": 9776270, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.015625, + "step": 462, + "time_per_iteration": 2.6079564094543457 + }, + { + "auxiliary_loss_clip": 0.01287586, + "auxiliary_loss_mlp": 0.01073577, + "balance_loss_clip": 1.04096127, + "balance_loss_mlp": 1.08167982, + "epoch": 0.027837065985269804, + "flos": 22674644196480.0, + "grad_norm": 2.2822341634579195, + "language_loss": 0.90235889, + "learning_rate": 3.951788965525118e-06, + "loss": 0.92597055, + "num_input_tokens_seen": 9794465, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0625, + "step": 463, + "time_per_iteration": 2.4881155490875244 + }, + { + "auxiliary_loss_clip": 0.01137482, + "auxiliary_loss_mlp": 0.01014393, + "balance_loss_clip": 1.00561893, + "balance_loss_mlp": 1.03824747, + "epoch": 0.027897189237937773, + "flos": 62182487399040.0, + "grad_norm": 0.8835585057209928, + "language_loss": 0.59031862, + "learning_rate": 3.953178075413476e-06, + "loss": 0.61183739, + "num_input_tokens_seen": 9849685, + "router_z_loss_clip": 0.08789062, + "router_z_loss_mlp": 0.9921875, + "step": 464, + "time_per_iteration": 3.039377212524414 + }, + { + "auxiliary_loss_clip": 0.01299905, + "auxiliary_loss_mlp": 0.01097461, + "balance_loss_clip": 1.06081581, + "balance_loss_mlp": 1.08716702, + "epoch": 0.02795731249060574, + "flos": 24493160004480.0, + "grad_norm": 2.8663863440598525, + "language_loss": 0.81203198, + "learning_rate": 3.954564194750784e-06, + "loss": 0.83600569, + "num_input_tokens_seen": 9869505, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.125, + "step": 465, + "time_per_iteration": 2.5197718143463135 + }, + { + "auxiliary_loss_clip": 0.01286546, + "auxiliary_loss_mlp": 0.01082829, + "balance_loss_clip": 1.04708982, + "balance_loss_mlp": 1.08028877, + "epoch": 0.02801743574327371, + "flos": 23733003456000.0, + "grad_norm": 2.004656273762408, + "language_loss": 0.78560221, + "learning_rate": 3.955947336385828e-06, + "loss": 0.80929601, + "num_input_tokens_seen": 9890950, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.0625, + "step": 466, + "time_per_iteration": 2.5151565074920654 + }, + { + "auxiliary_loss_clip": 0.01285777, + "auxiliary_loss_mlp": 0.01085472, + "balance_loss_clip": 1.05075812, + "balance_loss_mlp": 1.0816046, + "epoch": 0.02807755899594168, + "flos": 20629100476800.0, + "grad_norm": 2.05931728393333, + "language_loss": 0.87548482, + "learning_rate": 3.957327513084761e-06, + "loss": 0.89919734, + "num_input_tokens_seen": 9911265, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.03125, + "step": 467, + "time_per_iteration": 2.4994542598724365 + }, + { + "auxiliary_loss_clip": 0.01289137, + "auxiliary_loss_mlp": 0.01106554, + "balance_loss_clip": 1.06969416, + "balance_loss_mlp": 1.08202362, + "epoch": 0.02813768224860965, + "flos": 19244564789760.0, + "grad_norm": 2.728881931821799, + "language_loss": 0.86217642, + "learning_rate": 3.958704737531818e-06, + "loss": 0.88613331, + "num_input_tokens_seen": 9929025, + "router_z_loss_clip": 0.36914062, + "router_z_loss_mlp": 2.0625, + "step": 468, + "time_per_iteration": 2.482377767562866 + }, + { + "auxiliary_loss_clip": 0.01287545, + "auxiliary_loss_mlp": 0.01081999, + "balance_loss_clip": 1.0447104, + "balance_loss_mlp": 1.07984936, + "epoch": 0.02819780550127762, + "flos": 20813968800000.0, + "grad_norm": 3.6924571591440762, + "language_loss": 0.91605878, + "learning_rate": 3.9600790223300065e-06, + "loss": 0.93975413, + "num_input_tokens_seen": 9945190, + "router_z_loss_clip": 0.37304688, + "router_z_loss_mlp": 2.078125, + "step": 469, + "time_per_iteration": 2.471510648727417 + }, + { + "auxiliary_loss_clip": 0.01286876, + "auxiliary_loss_mlp": 0.01096778, + "balance_loss_clip": 1.06106234, + "balance_loss_mlp": 1.08290672, + "epoch": 0.028257928753945588, + "flos": 19974125928960.0, + "grad_norm": 8.38112094971343, + "language_loss": 0.81587195, + "learning_rate": 3.96145038000181e-06, + "loss": 0.83970851, + "num_input_tokens_seen": 9962820, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 470, + "time_per_iteration": 2.5398614406585693 + }, + { + "auxiliary_loss_clip": 0.01286572, + "auxiliary_loss_mlp": 0.01085498, + "balance_loss_clip": 1.04868627, + "balance_loss_mlp": 1.07859015, + "epoch": 0.028318052006613557, + "flos": 20484488321280.0, + "grad_norm": 1.8437898933227894, + "language_loss": 0.93147206, + "learning_rate": 3.962818822989861e-06, + "loss": 0.9551928, + "num_input_tokens_seen": 9982595, + "router_z_loss_clip": 0.3671875, + "router_z_loss_mlp": 2.078125, + "step": 471, + "time_per_iteration": 2.5005030632019043 + }, + { + "auxiliary_loss_clip": 0.0128173, + "auxiliary_loss_mlp": 0.01094713, + "balance_loss_clip": 1.05885458, + "balance_loss_mlp": 1.07808042, + "epoch": 0.02837817525928153, + "flos": 28514832410880.0, + "grad_norm": 1.89303735573371, + "language_loss": 0.757568, + "learning_rate": 3.964184363657625e-06, + "loss": 0.78133243, + "num_input_tokens_seen": 10004645, + "router_z_loss_clip": 0.35742188, + "router_z_loss_mlp": 2.03125, + "step": 472, + "time_per_iteration": 2.597637176513672 + }, + { + "auxiliary_loss_clip": 0.0128882, + "auxiliary_loss_mlp": 0.01078393, + "balance_loss_clip": 1.04479945, + "balance_loss_mlp": 1.07699013, + "epoch": 0.028438298511949497, + "flos": 18551668458240.0, + "grad_norm": 3.986951446490631, + "language_loss": 0.93354845, + "learning_rate": 3.965547014290071e-06, + "loss": 0.95722055, + "num_input_tokens_seen": 10022555, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.125, + "step": 473, + "time_per_iteration": 2.4882545471191406 + }, + { + "auxiliary_loss_clip": 0.01293922, + "auxiliary_loss_mlp": 0.01115319, + "balance_loss_clip": 1.08134401, + "balance_loss_mlp": 1.08149064, + "epoch": 0.028498421764617466, + "flos": 16910227722240.0, + "grad_norm": 4.845992674029067, + "language_loss": 0.88586211, + "learning_rate": 3.96690678709433e-06, + "loss": 0.90995455, + "num_input_tokens_seen": 10041025, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.125, + "step": 474, + "time_per_iteration": 2.483210563659668 + }, + { + "auxiliary_loss_clip": 0.01284496, + "auxiliary_loss_mlp": 0.01091761, + "balance_loss_clip": 1.05559278, + "balance_loss_mlp": 1.07983565, + "epoch": 0.028558545017285435, + "flos": 27778699082880.0, + "grad_norm": 2.474550917046853, + "language_loss": 0.78771299, + "learning_rate": 3.968263694200355e-06, + "loss": 0.81147563, + "num_input_tokens_seen": 10060775, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.046875, + "step": 475, + "time_per_iteration": 2.5462486743927 + }, + { + "auxiliary_loss_clip": 0.01139312, + "auxiliary_loss_mlp": 0.0107539, + "balance_loss_clip": 1.06647348, + "balance_loss_mlp": 1.03907108, + "epoch": 0.028618668269953403, + "flos": 65654367258240.0, + "grad_norm": 0.9304884927077405, + "language_loss": 0.66880804, + "learning_rate": 3.969617747661569e-06, + "loss": 0.6909551, + "num_input_tokens_seen": 10120225, + "router_z_loss_clip": 0.08935547, + "router_z_loss_mlp": 1.0, + "step": 476, + "time_per_iteration": 5.8287513256073 + }, + { + "auxiliary_loss_clip": 0.01286666, + "auxiliary_loss_mlp": 0.01081774, + "balance_loss_clip": 1.04527175, + "balance_loss_mlp": 1.0796659, + "epoch": 0.028678791522621375, + "flos": 21937074324480.0, + "grad_norm": 2.9569520931335775, + "language_loss": 0.83852398, + "learning_rate": 3.970968959455509e-06, + "loss": 0.86220837, + "num_input_tokens_seen": 10137880, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.078125, + "step": 477, + "time_per_iteration": 2.5179195404052734 + }, + { + "auxiliary_loss_clip": 0.01293161, + "auxiliary_loss_mlp": 0.01088101, + "balance_loss_clip": 1.05164671, + "balance_loss_mlp": 1.08298135, + "epoch": 0.028738914775289344, + "flos": 24572128055040.0, + "grad_norm": 2.2048636254017504, + "language_loss": 0.82267237, + "learning_rate": 3.97231734148446e-06, + "loss": 0.84648502, + "num_input_tokens_seen": 10156930, + "router_z_loss_clip": 0.36523438, + "router_z_loss_mlp": 2.09375, + "step": 478, + "time_per_iteration": 2.495760679244995 + }, + { + "auxiliary_loss_clip": 0.01283274, + "auxiliary_loss_mlp": 0.01076252, + "balance_loss_clip": 1.0409658, + "balance_loss_mlp": 1.07707858, + "epoch": 0.028799038027957313, + "flos": 23257977068160.0, + "grad_norm": 2.28603697529264, + "language_loss": 0.81010443, + "learning_rate": 3.973662905576082e-06, + "loss": 0.8336997, + "num_input_tokens_seen": 10176295, + "router_z_loss_clip": 0.35351562, + "router_z_loss_mlp": 2.0625, + "step": 479, + "time_per_iteration": 2.491910934448242 + }, + { + "auxiliary_loss_clip": 0.01281719, + "auxiliary_loss_mlp": 0.01080307, + "balance_loss_clip": 1.04323328, + "balance_loss_mlp": 1.07729793, + "epoch": 0.02885916128062528, + "flos": 22164102236160.0, + "grad_norm": 2.2385690137770715, + "language_loss": 0.73465097, + "learning_rate": 3.975005663484038e-06, + "loss": 0.75827128, + "num_input_tokens_seen": 10195790, + "router_z_loss_clip": 0.37109375, + "router_z_loss_mlp": 2.03125, + "step": 480, + "time_per_iteration": 2.4959068298339844 + }, + { + "auxiliary_loss_clip": 0.01280408, + "auxiliary_loss_mlp": 0.01071287, + "balance_loss_clip": 1.03945768, + "balance_loss_mlp": 1.07837129, + "epoch": 0.02891928453329325, + "flos": 22932842135040.0, + "grad_norm": 1.6612342828976938, + "language_loss": 0.87719476, + "learning_rate": 3.976345626888605e-06, + "loss": 0.90071172, + "num_input_tokens_seen": 10218405, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 2.03125, + "step": 481, + "time_per_iteration": 2.534792184829712 + }, + { + "auxiliary_loss_clip": 0.0113967, + "auxiliary_loss_mlp": 0.01022688, + "balance_loss_clip": 1.01367593, + "balance_loss_mlp": 1.03470159, + "epoch": 0.028979407785961222, + "flos": 57432941792640.0, + "grad_norm": 0.8259666239631118, + "language_loss": 0.66064727, + "learning_rate": 3.9776828073972864e-06, + "loss": 0.68227088, + "num_input_tokens_seen": 10271005, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 1.046875, + "step": 482, + "time_per_iteration": 2.8219997882843018 + }, + { + "auxiliary_loss_clip": 0.01295379, + "auxiliary_loss_mlp": 0.01073835, + "balance_loss_clip": 1.04014635, + "balance_loss_mlp": 1.08159328, + "epoch": 0.02903953103862919, + "flos": 16722737706240.0, + "grad_norm": 2.373570732629757, + "language_loss": 0.78743541, + "learning_rate": 3.979017216545415e-06, + "loss": 0.81112754, + "num_input_tokens_seen": 10288405, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.140625, + "step": 483, + "time_per_iteration": 2.4733006954193115 + }, + { + "auxiliary_loss_clip": 0.01293434, + "auxiliary_loss_mlp": 0.01090935, + "balance_loss_clip": 1.0548625, + "balance_loss_mlp": 1.08311069, + "epoch": 0.02909965429129716, + "flos": 16763640318720.0, + "grad_norm": 2.520023812901894, + "language_loss": 0.75405324, + "learning_rate": 3.980348865796749e-06, + "loss": 0.77789688, + "num_input_tokens_seen": 10306875, + "router_z_loss_clip": 0.36132812, + "router_z_loss_mlp": 2.109375, + "step": 484, + "time_per_iteration": 2.466634750366211 + }, + { + "auxiliary_loss_clip": 0.01288089, + "auxiliary_loss_mlp": 0.01078618, + "balance_loss_clip": 1.04459584, + "balance_loss_mlp": 1.08002305, + "epoch": 0.029159777543965128, + "flos": 19785343023360.0, + "grad_norm": 2.0323982063196153, + "language_loss": 0.84021544, + "learning_rate": 3.9816777665440615e-06, + "loss": 0.86388254, + "num_input_tokens_seen": 10323965, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.078125, + "step": 485, + "time_per_iteration": 2.511415719985962 + }, + { + "auxiliary_loss_clip": 0.01293039, + "auxiliary_loss_mlp": 0.01081906, + "balance_loss_clip": 1.04740667, + "balance_loss_mlp": 1.08659554, + "epoch": 0.029219900796633096, + "flos": 19642670202240.0, + "grad_norm": 1.9066132168030567, + "language_loss": 0.84465218, + "learning_rate": 3.983003930109732e-06, + "loss": 0.86840165, + "num_input_tokens_seen": 10342620, + "router_z_loss_clip": 0.34570312, + "router_z_loss_mlp": 2.0625, + "step": 486, + "time_per_iteration": 2.453583002090454 + }, + { + "auxiliary_loss_clip": 0.01284719, + "auxiliary_loss_mlp": 0.01083872, + "balance_loss_clip": 1.04841876, + "balance_loss_mlp": 1.07841349, + "epoch": 0.02928002404930107, + "flos": 25885704424320.0, + "grad_norm": 1.9228432408219163, + "language_loss": 0.8891986, + "learning_rate": 3.984327367746315e-06, + "loss": 0.91288453, + "num_input_tokens_seen": 10364610, + "router_z_loss_clip": 0.35546875, + "router_z_loss_mlp": 2.0625, + "step": 487, + "time_per_iteration": 2.5558598041534424 + }, + { + "auxiliary_loss_clip": 0.0128758, + "auxiliary_loss_mlp": 0.01070867, + "balance_loss_clip": 1.03806067, + "balance_loss_mlp": 1.08095598, + "epoch": 0.029340147301969037, + "flos": 20660234590080.0, + "grad_norm": 2.5260996981700456, + "language_loss": 0.87981069, + "learning_rate": 3.985648090637122e-06, + "loss": 0.90339512, + "num_input_tokens_seen": 10380910, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0625, + "step": 488, + "time_per_iteration": 2.5299952030181885 + }, + { + "auxiliary_loss_clip": 0.01283325, + "auxiliary_loss_mlp": 0.01079627, + "balance_loss_clip": 1.0449605, + "balance_loss_mlp": 1.07794333, + "epoch": 0.029400270554637006, + "flos": 24428018689920.0, + "grad_norm": 2.1862911790042543, + "language_loss": 0.88956475, + "learning_rate": 3.986966109896785e-06, + "loss": 0.9131943, + "num_input_tokens_seen": 10400665, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 2.046875, + "step": 489, + "time_per_iteration": 2.545240879058838 + }, + { + "auxiliary_loss_clip": 0.0127768, + "auxiliary_loss_mlp": 0.01078157, + "balance_loss_clip": 1.04322839, + "balance_loss_mlp": 1.07402337, + "epoch": 0.029460393807304974, + "flos": 20120892900480.0, + "grad_norm": 2.0397830948196756, + "language_loss": 0.88539088, + "learning_rate": 3.988281436571815e-06, + "loss": 0.90894926, + "num_input_tokens_seen": 10420150, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 2.03125, + "step": 490, + "time_per_iteration": 2.4727838039398193 + }, + { + "auxiliary_loss_clip": 0.01284238, + "auxiliary_loss_mlp": 0.01081508, + "balance_loss_clip": 1.04774833, + "balance_loss_mlp": 1.07731342, + "epoch": 0.029520517059972943, + "flos": 17675914965120.0, + "grad_norm": 2.230679327742206, + "language_loss": 0.91299963, + "learning_rate": 3.989594081641164e-06, + "loss": 0.93665713, + "num_input_tokens_seen": 10438210, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 2.0625, + "step": 491, + "time_per_iteration": 2.4900829792022705 + }, + { + "auxiliary_loss_clip": 0.01274874, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.03804421, + "balance_loss_mlp": 1.0749476, + "epoch": 0.029580640312640915, + "flos": 18953185662720.0, + "grad_norm": 2.419480988494796, + "language_loss": 0.85232413, + "learning_rate": 3.9909040560167675e-06, + "loss": 0.87577969, + "num_input_tokens_seen": 10455125, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.0, + "step": 492, + "time_per_iteration": 2.457188844680786 + }, + { + "auxiliary_loss_clip": 0.0128558, + "auxiliary_loss_mlp": 0.01093772, + "balance_loss_clip": 1.05939209, + "balance_loss_mlp": 1.08082771, + "epoch": 0.029640763565308884, + "flos": 18726121837440.0, + "grad_norm": 2.826333733481051, + "language_loss": 0.83989829, + "learning_rate": 3.992211370544093e-06, + "loss": 0.86369187, + "num_input_tokens_seen": 10470990, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 2.046875, + "step": 493, + "time_per_iteration": 2.4821553230285645 + }, + { + "auxiliary_loss_clip": 0.01280126, + "auxiliary_loss_mlp": 0.01079048, + "balance_loss_clip": 1.04586005, + "balance_loss_mlp": 1.07578444, + "epoch": 0.029700886817976852, + "flos": 20595308757120.0, + "grad_norm": 1.8259196989393787, + "language_loss": 0.86575663, + "learning_rate": 3.99351603600268e-06, + "loss": 0.88934839, + "num_input_tokens_seen": 10490685, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 494, + "time_per_iteration": 2.507068395614624 + }, + { + "auxiliary_loss_clip": 0.01286409, + "auxiliary_loss_mlp": 0.01082408, + "balance_loss_clip": 1.05084157, + "balance_loss_mlp": 1.07973599, + "epoch": 0.02976101007064482, + "flos": 22236857233920.0, + "grad_norm": 4.414490317498679, + "language_loss": 0.86250752, + "learning_rate": 3.994818063106668e-06, + "loss": 0.88619578, + "num_input_tokens_seen": 10509435, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.0625, + "step": 495, + "time_per_iteration": 2.498401165008545 + }, + { + "auxiliary_loss_clip": 0.01274095, + "auxiliary_loss_mlp": 0.0107342, + "balance_loss_clip": 1.04144859, + "balance_loss_mlp": 1.07653904, + "epoch": 0.029821133323312793, + "flos": 23732644320000.0, + "grad_norm": 1.893732744603442, + "language_loss": 0.6230706, + "learning_rate": 3.99611746250533e-06, + "loss": 0.64654577, + "num_input_tokens_seen": 10530050, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9765625, + "step": 496, + "time_per_iteration": 2.499669313430786 + }, + { + "auxiliary_loss_clip": 0.01276388, + "auxiliary_loss_mlp": 0.01085353, + "balance_loss_clip": 1.05314219, + "balance_loss_mlp": 1.07830799, + "epoch": 0.02988125657598076, + "flos": 22419498913920.0, + "grad_norm": 1.8423417765009742, + "language_loss": 0.88582325, + "learning_rate": 3.997414244783595e-06, + "loss": 0.90944064, + "num_input_tokens_seen": 10551370, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.984375, + "step": 497, + "time_per_iteration": 2.5570924282073975 + }, + { + "auxiliary_loss_clip": 0.01282787, + "auxiliary_loss_mlp": 0.0108035, + "balance_loss_clip": 1.04711461, + "balance_loss_mlp": 1.07822609, + "epoch": 0.02994137982864873, + "flos": 13845108453120.0, + "grad_norm": 3.4064142479622377, + "language_loss": 0.85174376, + "learning_rate": 3.998708420462557e-06, + "loss": 0.87537515, + "num_input_tokens_seen": 10569225, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.046875, + "step": 498, + "time_per_iteration": 2.513601541519165 + }, + { + "auxiliary_loss_clip": 0.01281177, + "auxiliary_loss_mlp": 0.01082811, + "balance_loss_clip": 1.05052912, + "balance_loss_mlp": 1.07829463, + "epoch": 0.0300015030813167, + "flos": 23908354675200.0, + "grad_norm": 37.23719619981942, + "language_loss": 0.78152531, + "learning_rate": 4e-06, + "loss": 0.80516517, + "num_input_tokens_seen": 10586170, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 499, + "time_per_iteration": 2.4924824237823486 + }, + { + "auxiliary_loss_clip": 0.01282354, + "auxiliary_loss_mlp": 0.01080564, + "balance_loss_clip": 1.04818654, + "balance_loss_mlp": 1.08037949, + "epoch": 0.030061626333984667, + "flos": 22016796560640.0, + "grad_norm": 3.687829420060643, + "language_loss": 0.8271451, + "learning_rate": 3.9999999620799e-06, + "loss": 0.85077423, + "num_input_tokens_seen": 10606205, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.015625, + "step": 500, + "time_per_iteration": 2.494333028793335 + }, + { + "auxiliary_loss_clip": 0.01274571, + "auxiliary_loss_mlp": 0.01084389, + "balance_loss_clip": 1.04924583, + "balance_loss_mlp": 1.07541978, + "epoch": 0.03012174958665264, + "flos": 23039747988480.0, + "grad_norm": 2.6096117253121447, + "language_loss": 0.88464928, + "learning_rate": 3.9999998483196e-06, + "loss": 0.90823889, + "num_input_tokens_seen": 10625995, + "router_z_loss_clip": 0.3515625, + "router_z_loss_mlp": 1.9921875, + "step": 501, + "time_per_iteration": 2.494575262069702 + }, + { + "auxiliary_loss_clip": 0.01283018, + "auxiliary_loss_mlp": 0.01073076, + "balance_loss_clip": 1.04158127, + "balance_loss_mlp": 1.07912767, + "epoch": 0.030181872839320608, + "flos": 18953257489920.0, + "grad_norm": 2.304054979465899, + "language_loss": 0.86586684, + "learning_rate": 3.9999996587191065e-06, + "loss": 0.88942778, + "num_input_tokens_seen": 10644105, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 502, + "time_per_iteration": 2.4574413299560547 + }, + { + "auxiliary_loss_clip": 0.01278734, + "auxiliary_loss_mlp": 0.01077839, + "balance_loss_clip": 1.0444839, + "balance_loss_mlp": 1.07952762, + "epoch": 0.030241996091988577, + "flos": 16728017005440.0, + "grad_norm": 2.6244890775354976, + "language_loss": 0.84661186, + "learning_rate": 3.999999393278425e-06, + "loss": 0.87017757, + "num_input_tokens_seen": 10661090, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9921875, + "step": 503, + "time_per_iteration": 2.4406938552856445 + }, + { + "auxiliary_loss_clip": 0.0127278, + "auxiliary_loss_mlp": 0.01082796, + "balance_loss_clip": 1.05008519, + "balance_loss_mlp": 1.07727659, + "epoch": 0.030302119344656545, + "flos": 28621271387520.0, + "grad_norm": 1.6755724800263092, + "language_loss": 0.88215417, + "learning_rate": 3.999999051997567e-06, + "loss": 0.90570992, + "num_input_tokens_seen": 10682380, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 504, + "time_per_iteration": 2.5319011211395264 + }, + { + "auxiliary_loss_clip": 0.01274883, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05556226, + "balance_loss_mlp": 1.07692564, + "epoch": 0.030362242597324514, + "flos": 15669334523520.0, + "grad_norm": 2.2080583468347, + "language_loss": 0.78446162, + "learning_rate": 3.9999986348765425e-06, + "loss": 0.80808175, + "num_input_tokens_seen": 10699925, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9765625, + "step": 505, + "time_per_iteration": 2.4724690914154053 + }, + { + "auxiliary_loss_clip": 0.01135682, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.00927854, + "balance_loss_mlp": 1.04092085, + "epoch": 0.030422365849992486, + "flos": 72125973676800.0, + "grad_norm": 0.8461866637376847, + "language_loss": 0.55057126, + "learning_rate": 3.999998141915371e-06, + "loss": 0.57211095, + "num_input_tokens_seen": 10766525, + "router_z_loss_clip": 0.09033203, + "router_z_loss_mlp": 0.9453125, + "step": 506, + "time_per_iteration": 3.2490124702453613 + }, + { + "auxiliary_loss_clip": 0.01274292, + "auxiliary_loss_mlp": 0.01087138, + "balance_loss_clip": 1.05418897, + "balance_loss_mlp": 1.0756762, + "epoch": 0.030482489102660455, + "flos": 19427817000960.0, + "grad_norm": 1.9034614277572226, + "language_loss": 0.83767861, + "learning_rate": 3.999997573114069e-06, + "loss": 0.8612929, + "num_input_tokens_seen": 10786725, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 507, + "time_per_iteration": 2.48811674118042 + }, + { + "auxiliary_loss_clip": 0.01280318, + "auxiliary_loss_mlp": 0.01080114, + "balance_loss_clip": 1.04778421, + "balance_loss_mlp": 1.07709789, + "epoch": 0.030542612355328423, + "flos": 20375822701440.0, + "grad_norm": 2.5950154193771526, + "language_loss": 0.88689649, + "learning_rate": 3.999996928472659e-06, + "loss": 0.91050076, + "num_input_tokens_seen": 10805390, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 2.03125, + "step": 508, + "time_per_iteration": 2.4966533184051514 + }, + { + "auxiliary_loss_clip": 0.01281637, + "auxiliary_loss_mlp": 0.01063766, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.07728887, + "epoch": 0.030602735607996392, + "flos": 34677354297600.0, + "grad_norm": 2.2339008285543227, + "language_loss": 0.71499902, + "learning_rate": 3.999996207991165e-06, + "loss": 0.73845309, + "num_input_tokens_seen": 10828030, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 509, + "time_per_iteration": 2.5966317653656006 + }, + { + "auxiliary_loss_clip": 0.01274736, + "auxiliary_loss_mlp": 0.01072718, + "balance_loss_clip": 1.04229546, + "balance_loss_mlp": 1.07770133, + "epoch": 0.03066285886066436, + "flos": 23658668259840.0, + "grad_norm": 2.064360756351981, + "language_loss": 0.82369828, + "learning_rate": 3.999995411669614e-06, + "loss": 0.8471728, + "num_input_tokens_seen": 10845240, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9765625, + "step": 510, + "time_per_iteration": 2.5276355743408203 + }, + { + "auxiliary_loss_clip": 0.01280977, + "auxiliary_loss_mlp": 0.01082699, + "balance_loss_clip": 1.04984498, + "balance_loss_mlp": 1.08235979, + "epoch": 0.030722982113332332, + "flos": 23002975440000.0, + "grad_norm": 2.1614325499153693, + "language_loss": 0.83621502, + "learning_rate": 3.999994539508036e-06, + "loss": 0.85985172, + "num_input_tokens_seen": 10864325, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.984375, + "step": 511, + "time_per_iteration": 2.503779888153076 + }, + { + "auxiliary_loss_clip": 0.01278507, + "auxiliary_loss_mlp": 0.01077898, + "balance_loss_clip": 1.04633093, + "balance_loss_mlp": 1.07648492, + "epoch": 0.0307831053660003, + "flos": 24750855152640.0, + "grad_norm": 2.1059740170821515, + "language_loss": 0.82234836, + "learning_rate": 3.9999935915064655e-06, + "loss": 0.8459124, + "num_input_tokens_seen": 10883860, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 512, + "time_per_iteration": 2.5306975841522217 + }, + { + "auxiliary_loss_clip": 0.01276149, + "auxiliary_loss_mlp": 0.01077944, + "balance_loss_clip": 1.04499435, + "balance_loss_mlp": 1.0769974, + "epoch": 0.03084322861866827, + "flos": 26140885620480.0, + "grad_norm": 1.9256325141107502, + "language_loss": 0.87030005, + "learning_rate": 3.9999925676649374e-06, + "loss": 0.89384103, + "num_input_tokens_seen": 10904555, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 1.9921875, + "step": 513, + "time_per_iteration": 2.507490634918213 + }, + { + "auxiliary_loss_clip": 0.01281572, + "auxiliary_loss_mlp": 0.01080973, + "balance_loss_clip": 1.04840553, + "balance_loss_mlp": 1.07869625, + "epoch": 0.03090335187133624, + "flos": 18771298168320.0, + "grad_norm": 3.202753983864072, + "language_loss": 0.79141152, + "learning_rate": 3.999991467983491e-06, + "loss": 0.81503695, + "num_input_tokens_seen": 10923700, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 514, + "time_per_iteration": 2.515496015548706 + }, + { + "auxiliary_loss_clip": 0.01276725, + "auxiliary_loss_mlp": 0.01063014, + "balance_loss_clip": 1.03218651, + "balance_loss_mlp": 1.07966864, + "epoch": 0.030963475124004207, + "flos": 23221886878080.0, + "grad_norm": 2.5461002634459216, + "language_loss": 0.77459693, + "learning_rate": 3.999990292462167e-06, + "loss": 0.79799432, + "num_input_tokens_seen": 10942730, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 515, + "time_per_iteration": 2.481903553009033 + }, + { + "auxiliary_loss_clip": 0.01272098, + "auxiliary_loss_mlp": 0.0106896, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.07318711, + "epoch": 0.03102359837667218, + "flos": 42525595411200.0, + "grad_norm": 1.901518391780262, + "language_loss": 0.82729101, + "learning_rate": 3.999989041101011e-06, + "loss": 0.85070157, + "num_input_tokens_seen": 10967120, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9921875, + "step": 516, + "time_per_iteration": 2.699577808380127 + }, + { + "auxiliary_loss_clip": 0.01272185, + "auxiliary_loss_mlp": 0.01070291, + "balance_loss_clip": 1.03760433, + "balance_loss_mlp": 1.07659435, + "epoch": 0.031083721629340148, + "flos": 21176953689600.0, + "grad_norm": 2.071844032637654, + "language_loss": 0.79009813, + "learning_rate": 3.999987713900071e-06, + "loss": 0.81352293, + "num_input_tokens_seen": 10986775, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.953125, + "step": 517, + "time_per_iteration": 4.0190205574035645 + }, + { + "auxiliary_loss_clip": 0.01269009, + "auxiliary_loss_mlp": 0.01072314, + "balance_loss_clip": 1.04069996, + "balance_loss_mlp": 1.07610774, + "epoch": 0.031143844882008116, + "flos": 29716187713920.0, + "grad_norm": 1.58218863781409, + "language_loss": 0.90778029, + "learning_rate": 3.999986310859396e-06, + "loss": 0.93119347, + "num_input_tokens_seen": 11011360, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.9296875, + "step": 518, + "time_per_iteration": 4.080751657485962 + }, + { + "auxiliary_loss_clip": 0.0128372, + "auxiliary_loss_mlp": 0.01093666, + "balance_loss_clip": 1.05883288, + "balance_loss_mlp": 1.08518016, + "epoch": 0.031203968134676085, + "flos": 23112467072640.0, + "grad_norm": 3.008779144342936, + "language_loss": 0.86396456, + "learning_rate": 3.999984831979039e-06, + "loss": 0.88773847, + "num_input_tokens_seen": 11030150, + "router_z_loss_clip": 0.34765625, + "router_z_loss_mlp": 1.984375, + "step": 519, + "time_per_iteration": 2.510267734527588 + }, + { + "auxiliary_loss_clip": 0.01278708, + "auxiliary_loss_mlp": 0.01092513, + "balance_loss_clip": 1.06092215, + "balance_loss_mlp": 1.07567024, + "epoch": 0.03126409138734405, + "flos": 20954379064320.0, + "grad_norm": 2.0313723427087216, + "language_loss": 0.87156898, + "learning_rate": 3.999983277259057e-06, + "loss": 0.8952812, + "num_input_tokens_seen": 11049145, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 2.03125, + "step": 520, + "time_per_iteration": 2.4891066551208496 + }, + { + "auxiliary_loss_clip": 0.01281744, + "auxiliary_loss_mlp": 0.01089643, + "balance_loss_clip": 1.05633557, + "balance_loss_mlp": 1.07832289, + "epoch": 0.031324214640012026, + "flos": 21650112570240.0, + "grad_norm": 1.6802829394342778, + "language_loss": 0.89362079, + "learning_rate": 3.999981646699509e-06, + "loss": 0.91733468, + "num_input_tokens_seen": 11068835, + "router_z_loss_clip": 0.33203125, + "router_z_loss_mlp": 2.03125, + "step": 521, + "time_per_iteration": 2.508524179458618 + }, + { + "auxiliary_loss_clip": 0.01274208, + "auxiliary_loss_mlp": 0.010832, + "balance_loss_clip": 1.04889154, + "balance_loss_mlp": 1.07795191, + "epoch": 0.03138433789267999, + "flos": 23441337020160.0, + "grad_norm": 2.273639697525746, + "language_loss": 0.71327078, + "learning_rate": 3.999979940300456e-06, + "loss": 0.73684484, + "num_input_tokens_seen": 11088980, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9609375, + "step": 522, + "time_per_iteration": 2.49629282951355 + }, + { + "auxiliary_loss_clip": 0.01278501, + "auxiliary_loss_mlp": 0.01083501, + "balance_loss_clip": 1.05150533, + "balance_loss_mlp": 1.07655358, + "epoch": 0.03144446114534796, + "flos": 18982164960000.0, + "grad_norm": 3.1208656196394706, + "language_loss": 0.84886295, + "learning_rate": 3.999978158061963e-06, + "loss": 0.87248302, + "num_input_tokens_seen": 11104300, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 2.015625, + "step": 523, + "time_per_iteration": 2.4674315452575684 + }, + { + "auxiliary_loss_clip": 0.01280597, + "auxiliary_loss_mlp": 0.01075539, + "balance_loss_clip": 1.04249442, + "balance_loss_mlp": 1.07655168, + "epoch": 0.031504584398015935, + "flos": 22637692080000.0, + "grad_norm": 1.9693639011355857, + "language_loss": 0.90419745, + "learning_rate": 3.999976299984099e-06, + "loss": 0.92775881, + "num_input_tokens_seen": 11123335, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.046875, + "step": 524, + "time_per_iteration": 2.480764627456665 + }, + { + "auxiliary_loss_clip": 0.01285248, + "auxiliary_loss_mlp": 0.01083988, + "balance_loss_clip": 1.05034757, + "balance_loss_mlp": 1.08102393, + "epoch": 0.0315647076506839, + "flos": 25297056339840.0, + "grad_norm": 2.4392367222760276, + "language_loss": 0.80040443, + "learning_rate": 3.999974366066933e-06, + "loss": 0.8240968, + "num_input_tokens_seen": 11140880, + "router_z_loss_clip": 0.3359375, + "router_z_loss_mlp": 2.046875, + "step": 525, + "time_per_iteration": 2.5409629344940186 + }, + { + "auxiliary_loss_clip": 0.01277675, + "auxiliary_loss_mlp": 0.01082993, + "balance_loss_clip": 1.05025804, + "balance_loss_mlp": 1.07571197, + "epoch": 0.03162483090335187, + "flos": 16982839065600.0, + "grad_norm": 2.8378410017413658, + "language_loss": 0.80693865, + "learning_rate": 3.999972356310538e-06, + "loss": 0.83054531, + "num_input_tokens_seen": 11158710, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.03125, + "step": 526, + "time_per_iteration": 2.4509081840515137 + }, + { + "auxiliary_loss_clip": 0.01285808, + "auxiliary_loss_mlp": 0.01072361, + "balance_loss_clip": 1.03655052, + "balance_loss_mlp": 1.08127069, + "epoch": 0.03168495415601984, + "flos": 18734489706240.0, + "grad_norm": 2.27970800213601, + "language_loss": 0.81417823, + "learning_rate": 3.999970270714991e-06, + "loss": 0.83775997, + "num_input_tokens_seen": 11177550, + "router_z_loss_clip": 0.359375, + "router_z_loss_mlp": 2.046875, + "step": 527, + "time_per_iteration": 2.4760756492614746 + }, + { + "auxiliary_loss_clip": 0.01273782, + "auxiliary_loss_mlp": 0.01080634, + "balance_loss_clip": 1.04651666, + "balance_loss_mlp": 1.07408452, + "epoch": 0.03174507740868781, + "flos": 21214875473280.0, + "grad_norm": 2.59751390244888, + "language_loss": 0.93932182, + "learning_rate": 3.999968109280371e-06, + "loss": 0.96286595, + "num_input_tokens_seen": 11196230, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.0, + "step": 528, + "time_per_iteration": 2.4721155166625977 + }, + { + "auxiliary_loss_clip": 0.01273884, + "auxiliary_loss_mlp": 0.01073354, + "balance_loss_clip": 1.04083371, + "balance_loss_mlp": 1.07427406, + "epoch": 0.03180520066135578, + "flos": 24787663614720.0, + "grad_norm": 1.8844039207994492, + "language_loss": 0.84143054, + "learning_rate": 3.99996587200676e-06, + "loss": 0.86490291, + "num_input_tokens_seen": 11214935, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 529, + "time_per_iteration": 2.5173239707946777 + }, + { + "auxiliary_loss_clip": 0.01278919, + "auxiliary_loss_mlp": 0.01087129, + "balance_loss_clip": 1.05530047, + "balance_loss_mlp": 1.08254409, + "epoch": 0.03186532391402375, + "flos": 24864261367680.0, + "grad_norm": 2.130233453276154, + "language_loss": 0.90547037, + "learning_rate": 3.999963558894243e-06, + "loss": 0.92913085, + "num_input_tokens_seen": 11235310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.96875, + "step": 530, + "time_per_iteration": 2.5096359252929688 + }, + { + "auxiliary_loss_clip": 0.0127291, + "auxiliary_loss_mlp": 0.01073181, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07199419, + "epoch": 0.03192544716669172, + "flos": 21215055041280.0, + "grad_norm": 2.12169085676626, + "language_loss": 0.76197046, + "learning_rate": 3.999961169942907e-06, + "loss": 0.78543139, + "num_input_tokens_seen": 11254425, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 2.015625, + "step": 531, + "time_per_iteration": 2.503265142440796 + }, + { + "auxiliary_loss_clip": 0.01272973, + "auxiliary_loss_mlp": 0.01064442, + "balance_loss_clip": 1.03030038, + "balance_loss_mlp": 1.07424712, + "epoch": 0.03198557041935969, + "flos": 24353216616960.0, + "grad_norm": 2.621085079916904, + "language_loss": 0.9073056, + "learning_rate": 3.999958705152843e-06, + "loss": 0.9306798, + "num_input_tokens_seen": 11274595, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 532, + "time_per_iteration": 2.506220817565918 + }, + { + "auxiliary_loss_clip": 0.01137355, + "auxiliary_loss_mlp": 0.01010615, + "balance_loss_clip": 1.00198412, + "balance_loss_mlp": 1.0428524, + "epoch": 0.032045693672027656, + "flos": 61827367587840.0, + "grad_norm": 0.7306749876416057, + "language_loss": 0.57931173, + "learning_rate": 3.9999561645241445e-06, + "loss": 0.60079145, + "num_input_tokens_seen": 11336705, + "router_z_loss_clip": 0.08642578, + "router_z_loss_mlp": 0.9453125, + "step": 533, + "time_per_iteration": 3.154953956604004 + }, + { + "auxiliary_loss_clip": 0.01271016, + "auxiliary_loss_mlp": 0.01084756, + "balance_loss_clip": 1.05209231, + "balance_loss_mlp": 1.07378936, + "epoch": 0.03210581692469563, + "flos": 28401174800640.0, + "grad_norm": 1.8972625930530718, + "language_loss": 0.86725944, + "learning_rate": 3.999953548056907e-06, + "loss": 0.89081717, + "num_input_tokens_seen": 11356820, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.96875, + "step": 534, + "time_per_iteration": 2.5384750366210938 + }, + { + "auxiliary_loss_clip": 0.01271847, + "auxiliary_loss_mlp": 0.01066511, + "balance_loss_clip": 1.03468204, + "balance_loss_mlp": 1.07573223, + "epoch": 0.03216594017736359, + "flos": 24717709877760.0, + "grad_norm": 2.118212102173022, + "language_loss": 0.77352351, + "learning_rate": 3.999950855751232e-06, + "loss": 0.79690707, + "num_input_tokens_seen": 11376645, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.9609375, + "step": 535, + "time_per_iteration": 2.517940044403076 + }, + { + "auxiliary_loss_clip": 0.01274503, + "auxiliary_loss_mlp": 0.01083227, + "balance_loss_clip": 1.05151725, + "balance_loss_mlp": 1.07644773, + "epoch": 0.032226063430031565, + "flos": 31175453646720.0, + "grad_norm": 2.176836888233088, + "language_loss": 0.8074764, + "learning_rate": 3.999948087607219e-06, + "loss": 0.83105373, + "num_input_tokens_seen": 11397310, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.984375, + "step": 536, + "time_per_iteration": 2.546128034591675 + }, + { + "auxiliary_loss_clip": 0.01275643, + "auxiliary_loss_mlp": 0.01077633, + "balance_loss_clip": 1.04361033, + "balance_loss_mlp": 1.07698941, + "epoch": 0.03228618668269954, + "flos": 32198225506560.0, + "grad_norm": 2.3353202427960627, + "language_loss": 0.70118421, + "learning_rate": 3.999945243624975e-06, + "loss": 0.72471696, + "num_input_tokens_seen": 11418475, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9921875, + "step": 537, + "time_per_iteration": 2.578101634979248 + }, + { + "auxiliary_loss_clip": 0.01274556, + "auxiliary_loss_mlp": 0.01081628, + "balance_loss_clip": 1.04877353, + "balance_loss_mlp": 1.08040798, + "epoch": 0.0323463099353675, + "flos": 22670154996480.0, + "grad_norm": 2.1000918694055044, + "language_loss": 0.8250435, + "learning_rate": 3.999942323804607e-06, + "loss": 0.84860539, + "num_input_tokens_seen": 11436630, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9375, + "step": 538, + "time_per_iteration": 2.4822683334350586 + }, + { + "auxiliary_loss_clip": 0.01280793, + "auxiliary_loss_mlp": 0.01078587, + "balance_loss_clip": 1.0458765, + "balance_loss_mlp": 1.0775007, + "epoch": 0.032406433188035474, + "flos": 26905172232960.0, + "grad_norm": 1.8128048759039839, + "language_loss": 0.78999949, + "learning_rate": 3.999939328146225e-06, + "loss": 0.81359327, + "num_input_tokens_seen": 11457275, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 2.03125, + "step": 539, + "time_per_iteration": 2.5495705604553223 + }, + { + "auxiliary_loss_clip": 0.01274183, + "auxiliary_loss_mlp": 0.01066988, + "balance_loss_clip": 1.03284597, + "balance_loss_mlp": 1.0766232, + "epoch": 0.03246655644070344, + "flos": 31503928544640.0, + "grad_norm": 2.6651388031929835, + "language_loss": 0.77802742, + "learning_rate": 3.999936256649943e-06, + "loss": 0.80143911, + "num_input_tokens_seen": 11476925, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.9765625, + "step": 540, + "time_per_iteration": 2.5547144412994385 + }, + { + "auxiliary_loss_clip": 0.01282159, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.03755546, + "balance_loss_mlp": 1.08122253, + "epoch": 0.03252667969337141, + "flos": 23218331431680.0, + "grad_norm": 2.2422114385304845, + "language_loss": 0.85410464, + "learning_rate": 3.999933109315878e-06, + "loss": 0.8776263, + "num_input_tokens_seen": 11496830, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 2.0, + "step": 541, + "time_per_iteration": 2.517545700073242 + }, + { + "auxiliary_loss_clip": 0.01271503, + "auxiliary_loss_mlp": 0.01083563, + "balance_loss_clip": 1.04906392, + "balance_loss_mlp": 1.07759655, + "epoch": 0.032586802946039384, + "flos": 14757454926720.0, + "grad_norm": 2.210152212848466, + "language_loss": 0.89072484, + "learning_rate": 3.9999298861441496e-06, + "loss": 0.91427547, + "num_input_tokens_seen": 11515605, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9375, + "step": 542, + "time_per_iteration": 2.437566041946411 + }, + { + "auxiliary_loss_clip": 0.01272694, + "auxiliary_loss_mlp": 0.01075801, + "balance_loss_clip": 1.04289961, + "balance_loss_mlp": 1.07649362, + "epoch": 0.03264692619870735, + "flos": 24280677100800.0, + "grad_norm": 2.3494598042187236, + "language_loss": 0.71096039, + "learning_rate": 3.999926587134879e-06, + "loss": 0.73444533, + "num_input_tokens_seen": 11536230, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9609375, + "step": 543, + "time_per_iteration": 2.5121288299560547 + }, + { + "auxiliary_loss_clip": 0.0127171, + "auxiliary_loss_mlp": 0.01086873, + "balance_loss_clip": 1.05411386, + "balance_loss_mlp": 1.07139826, + "epoch": 0.03270704945137532, + "flos": 22893160584960.0, + "grad_norm": 2.6617228213889375, + "language_loss": 0.91273057, + "learning_rate": 3.999923212288192e-06, + "loss": 0.93631637, + "num_input_tokens_seen": 11554715, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 2.0, + "step": 544, + "time_per_iteration": 2.529536008834839 + }, + { + "auxiliary_loss_clip": 0.01274727, + "auxiliary_loss_mlp": 0.01082721, + "balance_loss_clip": 1.05186987, + "balance_loss_mlp": 1.07790041, + "epoch": 0.032767172704043286, + "flos": 18041018757120.0, + "grad_norm": 3.144073602630947, + "language_loss": 0.6640051, + "learning_rate": 3.999919761604216e-06, + "loss": 0.68757957, + "num_input_tokens_seen": 11571370, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.96875, + "step": 545, + "time_per_iteration": 2.487250328063965 + }, + { + "auxiliary_loss_clip": 0.01272187, + "auxiliary_loss_mlp": 0.01069604, + "balance_loss_clip": 1.03715563, + "balance_loss_mlp": 1.07393909, + "epoch": 0.03282729595671126, + "flos": 22528739151360.0, + "grad_norm": 2.6288964335615805, + "language_loss": 0.91857421, + "learning_rate": 3.999916235083083e-06, + "loss": 0.94199216, + "num_input_tokens_seen": 11588560, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.984375, + "step": 546, + "time_per_iteration": 2.4893922805786133 + }, + { + "auxiliary_loss_clip": 0.0126813, + "auxiliary_loss_mlp": 0.01071134, + "balance_loss_clip": 1.03723049, + "balance_loss_mlp": 1.07095337, + "epoch": 0.03288741920937923, + "flos": 20410620001920.0, + "grad_norm": 2.4455611041839127, + "language_loss": 0.82002354, + "learning_rate": 3.999912632724925e-06, + "loss": 0.84341609, + "num_input_tokens_seen": 11605685, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 547, + "time_per_iteration": 2.4879701137542725 + }, + { + "auxiliary_loss_clip": 0.01271545, + "auxiliary_loss_mlp": 0.01070995, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.07550538, + "epoch": 0.032947542462047195, + "flos": 20777986350720.0, + "grad_norm": 3.015836198351779, + "language_loss": 0.80919325, + "learning_rate": 3.999908954529881e-06, + "loss": 0.83261865, + "num_input_tokens_seen": 11626290, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9609375, + "step": 548, + "time_per_iteration": 2.501983404159546 + }, + { + "auxiliary_loss_clip": 0.01270889, + "auxiliary_loss_mlp": 0.01079421, + "balance_loss_clip": 1.04499304, + "balance_loss_mlp": 1.07411838, + "epoch": 0.03300766571471517, + "flos": 19901263190400.0, + "grad_norm": 3.9904289991591217, + "language_loss": 0.67330974, + "learning_rate": 3.999905200498087e-06, + "loss": 0.69681287, + "num_input_tokens_seen": 11643950, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 549, + "time_per_iteration": 2.479069948196411 + }, + { + "auxiliary_loss_clip": 0.01265753, + "auxiliary_loss_mlp": 0.01075673, + "balance_loss_clip": 1.04286647, + "balance_loss_mlp": 1.07537639, + "epoch": 0.03306778896738313, + "flos": 17967760968960.0, + "grad_norm": 2.081726350608672, + "language_loss": 0.86137938, + "learning_rate": 3.999901370629689e-06, + "loss": 0.88479364, + "num_input_tokens_seen": 11662560, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.90625, + "step": 550, + "time_per_iteration": 2.435030221939087 + }, + { + "auxiliary_loss_clip": 0.01271779, + "auxiliary_loss_mlp": 0.01089379, + "balance_loss_clip": 1.05712056, + "balance_loss_mlp": 1.07876444, + "epoch": 0.033127912220051105, + "flos": 21653380707840.0, + "grad_norm": 2.0024940554917534, + "language_loss": 0.81302834, + "learning_rate": 3.99989746492483e-06, + "loss": 0.83663994, + "num_input_tokens_seen": 11682265, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9296875, + "step": 551, + "time_per_iteration": 2.474317789077759 + }, + { + "auxiliary_loss_clip": 0.01278525, + "auxiliary_loss_mlp": 0.01080037, + "balance_loss_clip": 1.0469687, + "balance_loss_mlp": 1.0786469, + "epoch": 0.03318803547271908, + "flos": 30188376927360.0, + "grad_norm": 2.5540153370218697, + "language_loss": 0.85907811, + "learning_rate": 3.999893483383658e-06, + "loss": 0.88266373, + "num_input_tokens_seen": 11699300, + "router_z_loss_clip": 0.33007812, + "router_z_loss_mlp": 2.0, + "step": 552, + "time_per_iteration": 2.5352437496185303 + }, + { + "auxiliary_loss_clip": 0.01276099, + "auxiliary_loss_mlp": 0.01077197, + "balance_loss_clip": 1.0428648, + "balance_loss_mlp": 1.07894135, + "epoch": 0.03324815872538704, + "flos": 20376038183040.0, + "grad_norm": 2.3148388677976253, + "language_loss": 0.928128, + "learning_rate": 3.999889426006326e-06, + "loss": 0.95166099, + "num_input_tokens_seen": 11716955, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.96875, + "step": 553, + "time_per_iteration": 2.4860291481018066 + }, + { + "auxiliary_loss_clip": 0.01270959, + "auxiliary_loss_mlp": 0.01072703, + "balance_loss_clip": 1.03858554, + "balance_loss_mlp": 1.0755136, + "epoch": 0.033308281978055014, + "flos": 24494560634880.0, + "grad_norm": 2.234190064541142, + "language_loss": 0.78874755, + "learning_rate": 3.999885292792986e-06, + "loss": 0.81218415, + "num_input_tokens_seen": 11736130, + "router_z_loss_clip": 0.34179688, + "router_z_loss_mlp": 1.953125, + "step": 554, + "time_per_iteration": 2.4878416061401367 + }, + { + "auxiliary_loss_clip": 0.0126611, + "auxiliary_loss_mlp": 0.01082645, + "balance_loss_clip": 1.04838455, + "balance_loss_mlp": 1.07417822, + "epoch": 0.03336840523072298, + "flos": 23400326666880.0, + "grad_norm": 2.1365458646452424, + "language_loss": 0.82297659, + "learning_rate": 3.999881083743795e-06, + "loss": 0.84646416, + "num_input_tokens_seen": 11754425, + "router_z_loss_clip": 0.34375, + "router_z_loss_mlp": 1.9140625, + "step": 555, + "time_per_iteration": 2.4846394062042236 + }, + { + "auxiliary_loss_clip": 0.01270081, + "auxiliary_loss_mlp": 0.01075464, + "balance_loss_clip": 1.04156113, + "balance_loss_mlp": 1.07390678, + "epoch": 0.03342852848339095, + "flos": 30550571717760.0, + "grad_norm": 2.781828445596944, + "language_loss": 0.88624835, + "learning_rate": 3.999876798858914e-06, + "loss": 0.90970379, + "num_input_tokens_seen": 11772845, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.96875, + "step": 556, + "time_per_iteration": 2.5788674354553223 + }, + { + "auxiliary_loss_clip": 0.01269545, + "auxiliary_loss_mlp": 0.01079314, + "balance_loss_clip": 1.04531527, + "balance_loss_mlp": 1.07534254, + "epoch": 0.03348865173605892, + "flos": 22893304239360.0, + "grad_norm": 2.0860752820949586, + "language_loss": 0.83492053, + "learning_rate": 3.999872438138503e-06, + "loss": 0.85840911, + "num_input_tokens_seen": 11792850, + "router_z_loss_clip": 0.33984375, + "router_z_loss_mlp": 1.9375, + "step": 557, + "time_per_iteration": 2.5352954864501953 + }, + { + "auxiliary_loss_clip": 0.01275093, + "auxiliary_loss_mlp": 0.0106652, + "balance_loss_clip": 1.03495288, + "balance_loss_mlp": 1.07979858, + "epoch": 0.03354877498872689, + "flos": 17676022705920.0, + "grad_norm": 9.145612151583265, + "language_loss": 0.94169575, + "learning_rate": 3.999868001582729e-06, + "loss": 0.96511185, + "num_input_tokens_seen": 11809670, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.953125, + "step": 558, + "time_per_iteration": 2.4541964530944824 + }, + { + "auxiliary_loss_clip": 0.01265501, + "auxiliary_loss_mlp": 0.01073065, + "balance_loss_clip": 1.0406878, + "balance_loss_mlp": 1.07178497, + "epoch": 0.03360889824139486, + "flos": 21652985658240.0, + "grad_norm": 2.48174106566098, + "language_loss": 0.7735827, + "learning_rate": 3.99986348919176e-06, + "loss": 0.7969684, + "num_input_tokens_seen": 11829665, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.9375, + "step": 559, + "time_per_iteration": 5.362890005111694 + }, + { + "auxiliary_loss_clip": 0.01268387, + "auxiliary_loss_mlp": 0.01078962, + "balance_loss_clip": 1.04818201, + "balance_loss_mlp": 1.07386613, + "epoch": 0.033669021494062826, + "flos": 21795730306560.0, + "grad_norm": 2.071149038386511, + "language_loss": 0.87681198, + "learning_rate": 3.9998589009657675e-06, + "loss": 0.90028548, + "num_input_tokens_seen": 11848190, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.953125, + "step": 560, + "time_per_iteration": 3.9536426067352295 + }, + { + "auxiliary_loss_clip": 0.01264547, + "auxiliary_loss_mlp": 0.01067998, + "balance_loss_clip": 1.0375762, + "balance_loss_mlp": 1.07323277, + "epoch": 0.0337291447467308, + "flos": 21866222747520.0, + "grad_norm": 2.2284071587683463, + "language_loss": 0.81380183, + "learning_rate": 3.999854236904925e-06, + "loss": 0.83712727, + "num_input_tokens_seen": 11864795, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.9140625, + "step": 561, + "time_per_iteration": 2.49826717376709 + }, + { + "auxiliary_loss_clip": 0.01263917, + "auxiliary_loss_mlp": 0.01071053, + "balance_loss_clip": 1.04029727, + "balance_loss_mlp": 1.07403696, + "epoch": 0.03378926799939877, + "flos": 24245951627520.0, + "grad_norm": 1.7768341081574646, + "language_loss": 0.82018232, + "learning_rate": 3.999849497009409e-06, + "loss": 0.84353203, + "num_input_tokens_seen": 11885275, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.90625, + "step": 562, + "time_per_iteration": 2.503990888595581 + }, + { + "auxiliary_loss_clip": 0.01269896, + "auxiliary_loss_mlp": 0.01075498, + "balance_loss_clip": 1.04352641, + "balance_loss_mlp": 1.07592142, + "epoch": 0.033849391252066735, + "flos": 16507812677760.0, + "grad_norm": 1.966221896086353, + "language_loss": 0.84028983, + "learning_rate": 3.999844681279401e-06, + "loss": 0.86374378, + "num_input_tokens_seen": 11903595, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.9375, + "step": 563, + "time_per_iteration": 2.464571952819824 + }, + { + "auxiliary_loss_clip": 0.01268432, + "auxiliary_loss_mlp": 0.01079949, + "balance_loss_clip": 1.04866886, + "balance_loss_mlp": 1.07648492, + "epoch": 0.03390951450473471, + "flos": 15669298609920.0, + "grad_norm": 2.359913311978066, + "language_loss": 0.94194812, + "learning_rate": 3.99983978971508e-06, + "loss": 0.96543193, + "num_input_tokens_seen": 11917815, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.921875, + "step": 564, + "time_per_iteration": 2.423762798309326 + }, + { + "auxiliary_loss_clip": 0.01267204, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.03745687, + "balance_loss_mlp": 1.07225537, + "epoch": 0.03396963775740267, + "flos": 22674787850880.0, + "grad_norm": 3.7666153248687277, + "language_loss": 0.94089758, + "learning_rate": 3.999834822316635e-06, + "loss": 0.96426964, + "num_input_tokens_seen": 11936305, + "router_z_loss_clip": 0.32617188, + "router_z_loss_mlp": 1.953125, + "step": 565, + "time_per_iteration": 2.499417543411255 + }, + { + "auxiliary_loss_clip": 0.01140331, + "auxiliary_loss_mlp": 0.0102506, + "balance_loss_clip": 1.01714468, + "balance_loss_mlp": 1.04934859, + "epoch": 0.034029761010070644, + "flos": 64392683063040.0, + "grad_norm": 1.1198796781785882, + "language_loss": 0.54823005, + "learning_rate": 3.9998297790842535e-06, + "loss": 0.569884, + "num_input_tokens_seen": 11998940, + "router_z_loss_clip": 0.07910156, + "router_z_loss_mlp": 0.91015625, + "step": 566, + "time_per_iteration": 3.1322038173675537 + }, + { + "auxiliary_loss_clip": 0.01270043, + "auxiliary_loss_mlp": 0.01072204, + "balance_loss_clip": 1.03837276, + "balance_loss_mlp": 1.0753262, + "epoch": 0.034089884262738616, + "flos": 25004204755200.0, + "grad_norm": 2.6603630269915683, + "language_loss": 0.76780868, + "learning_rate": 3.999824660018126e-06, + "loss": 0.79123116, + "num_input_tokens_seen": 12018860, + "router_z_loss_clip": 0.33789062, + "router_z_loss_mlp": 1.9453125, + "step": 567, + "time_per_iteration": 2.5351951122283936 + }, + { + "auxiliary_loss_clip": 0.01261299, + "auxiliary_loss_mlp": 0.01077897, + "balance_loss_clip": 1.04809463, + "balance_loss_mlp": 1.07400167, + "epoch": 0.03415000751540658, + "flos": 28439096584320.0, + "grad_norm": 4.563520524929296, + "language_loss": 0.80796623, + "learning_rate": 3.999819465118447e-06, + "loss": 0.83135819, + "num_input_tokens_seen": 12039675, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.875, + "step": 568, + "time_per_iteration": 2.558093309402466 + }, + { + "auxiliary_loss_clip": 0.01263323, + "auxiliary_loss_mlp": 0.01079335, + "balance_loss_clip": 1.04836476, + "balance_loss_mlp": 1.07628214, + "epoch": 0.034210130768074554, + "flos": 21468727866240.0, + "grad_norm": 1.809578126153619, + "language_loss": 0.86777622, + "learning_rate": 3.999814194385413e-06, + "loss": 0.89120281, + "num_input_tokens_seen": 12057680, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.875, + "step": 569, + "time_per_iteration": 2.500319719314575 + }, + { + "auxiliary_loss_clip": 0.01264002, + "auxiliary_loss_mlp": 0.01073079, + "balance_loss_clip": 1.04227519, + "balance_loss_mlp": 1.07425416, + "epoch": 0.03427025402074252, + "flos": 18697501676160.0, + "grad_norm": 1.8164454228173497, + "language_loss": 0.95802778, + "learning_rate": 3.9998088478192255e-06, + "loss": 0.98139858, + "num_input_tokens_seen": 12076135, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.8984375, + "step": 570, + "time_per_iteration": 2.473808526992798 + }, + { + "auxiliary_loss_clip": 0.01264689, + "auxiliary_loss_mlp": 0.01080759, + "balance_loss_clip": 1.04733253, + "balance_loss_mlp": 1.07053721, + "epoch": 0.03433037727341049, + "flos": 20849987162880.0, + "grad_norm": 2.217921822086313, + "language_loss": 0.79522127, + "learning_rate": 3.9998034254200846e-06, + "loss": 0.81867576, + "num_input_tokens_seen": 12094785, + "router_z_loss_clip": 0.33398438, + "router_z_loss_mlp": 1.9375, + "step": 571, + "time_per_iteration": 2.48317813873291 + }, + { + "auxiliary_loss_clip": 0.01265335, + "auxiliary_loss_mlp": 0.01076969, + "balance_loss_clip": 1.04490221, + "balance_loss_mlp": 1.07593679, + "epoch": 0.03439050052607846, + "flos": 25410282986880.0, + "grad_norm": 2.3471183659940555, + "language_loss": 0.79962778, + "learning_rate": 3.999797927188199e-06, + "loss": 0.82305074, + "num_input_tokens_seen": 12114590, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.890625, + "step": 572, + "time_per_iteration": 2.5112764835357666 + }, + { + "auxiliary_loss_clip": 0.01270326, + "auxiliary_loss_mlp": 0.0106947, + "balance_loss_clip": 1.03871393, + "balance_loss_mlp": 1.07574439, + "epoch": 0.03445062377874643, + "flos": 17640147997440.0, + "grad_norm": 1.9544136074887903, + "language_loss": 0.84374899, + "learning_rate": 3.999792353123774e-06, + "loss": 0.86714697, + "num_input_tokens_seen": 12132390, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.9453125, + "step": 573, + "time_per_iteration": 2.474212408065796 + }, + { + "auxiliary_loss_clip": 0.01266726, + "auxiliary_loss_mlp": 0.01064214, + "balance_loss_clip": 1.03460276, + "balance_loss_mlp": 1.07282329, + "epoch": 0.0345107470314144, + "flos": 16764502245120.0, + "grad_norm": 3.553507560277694, + "language_loss": 0.76376265, + "learning_rate": 3.999786703227023e-06, + "loss": 0.78707206, + "num_input_tokens_seen": 12149035, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 574, + "time_per_iteration": 2.4510116577148438 + }, + { + "auxiliary_loss_clip": 0.01264596, + "auxiliary_loss_mlp": 0.01064423, + "balance_loss_clip": 1.03531194, + "balance_loss_mlp": 1.0731982, + "epoch": 0.03457087028408237, + "flos": 14684448533760.0, + "grad_norm": 2.5278817664157343, + "language_loss": 0.83801597, + "learning_rate": 3.9997809774981606e-06, + "loss": 0.86130619, + "num_input_tokens_seen": 12167530, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.9140625, + "step": 575, + "time_per_iteration": 2.459693193435669 + }, + { + "auxiliary_loss_clip": 0.01260171, + "auxiliary_loss_mlp": 0.01067742, + "balance_loss_clip": 1.03830886, + "balance_loss_mlp": 1.07501364, + "epoch": 0.03463099353675034, + "flos": 20011293527040.0, + "grad_norm": 2.241383472398266, + "language_loss": 0.83726245, + "learning_rate": 3.9997751759374025e-06, + "loss": 0.86054158, + "num_input_tokens_seen": 12186340, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 576, + "time_per_iteration": 2.47292423248291 + }, + { + "auxiliary_loss_clip": 0.01267718, + "auxiliary_loss_mlp": 0.01074956, + "balance_loss_clip": 1.04582155, + "balance_loss_mlp": 1.08247435, + "epoch": 0.03469111678941831, + "flos": 25301150490240.0, + "grad_norm": 2.0876645490308334, + "language_loss": 0.8640908, + "learning_rate": 3.99976929854497e-06, + "loss": 0.88751757, + "num_input_tokens_seen": 12204090, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 577, + "time_per_iteration": 2.529500961303711 + }, + { + "auxiliary_loss_clip": 0.01262371, + "auxiliary_loss_mlp": 0.01069797, + "balance_loss_clip": 1.04028082, + "balance_loss_mlp": 1.0769875, + "epoch": 0.034751240042086275, + "flos": 23259413612160.0, + "grad_norm": 3.2017547958107784, + "language_loss": 0.72333407, + "learning_rate": 3.9997633453210845e-06, + "loss": 0.74665576, + "num_input_tokens_seen": 12224850, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.859375, + "step": 578, + "time_per_iteration": 2.4868762493133545 + }, + { + "auxiliary_loss_clip": 0.01263036, + "auxiliary_loss_mlp": 0.01071071, + "balance_loss_clip": 1.04050565, + "balance_loss_mlp": 1.07441878, + "epoch": 0.03481136329475425, + "flos": 23769237300480.0, + "grad_norm": 1.8544904120227406, + "language_loss": 0.77664137, + "learning_rate": 3.999757316265973e-06, + "loss": 0.79998243, + "num_input_tokens_seen": 12244935, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.8828125, + "step": 579, + "time_per_iteration": 2.50669002532959 + }, + { + "auxiliary_loss_clip": 0.01260844, + "auxiliary_loss_mlp": 0.01077558, + "balance_loss_clip": 1.04634845, + "balance_loss_mlp": 1.07355189, + "epoch": 0.03487148654742222, + "flos": 20157521794560.0, + "grad_norm": 2.5351053977844136, + "language_loss": 0.86927247, + "learning_rate": 3.999751211379863e-06, + "loss": 0.89265645, + "num_input_tokens_seen": 12262140, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.875, + "step": 580, + "time_per_iteration": 2.505908966064453 + }, + { + "auxiliary_loss_clip": 0.01266331, + "auxiliary_loss_mlp": 0.01063953, + "balance_loss_clip": 1.03536677, + "balance_loss_mlp": 1.07510614, + "epoch": 0.034931609800090184, + "flos": 15669585918720.0, + "grad_norm": 4.565959491833327, + "language_loss": 0.82161844, + "learning_rate": 3.999745030662987e-06, + "loss": 0.84492135, + "num_input_tokens_seen": 12280930, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.9140625, + "step": 581, + "time_per_iteration": 2.4735610485076904 + }, + { + "auxiliary_loss_clip": 0.01263493, + "auxiliary_loss_mlp": 0.01067149, + "balance_loss_clip": 1.03832436, + "balance_loss_mlp": 1.07712197, + "epoch": 0.034991733052758156, + "flos": 16362374509440.0, + "grad_norm": 2.2699668532214377, + "language_loss": 0.77498174, + "learning_rate": 3.99973877411558e-06, + "loss": 0.79828823, + "num_input_tokens_seen": 12299125, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8671875, + "step": 582, + "time_per_iteration": 2.4596173763275146 + }, + { + "auxiliary_loss_clip": 0.01261728, + "auxiliary_loss_mlp": 0.01075668, + "balance_loss_clip": 1.04467332, + "balance_loss_mlp": 1.07715631, + "epoch": 0.03505185630542612, + "flos": 19387309438080.0, + "grad_norm": 2.0991939318744692, + "language_loss": 0.87632537, + "learning_rate": 3.999732441737877e-06, + "loss": 0.89969933, + "num_input_tokens_seen": 12316905, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 583, + "time_per_iteration": 2.46062970161438 + }, + { + "auxiliary_loss_clip": 0.01268555, + "auxiliary_loss_mlp": 0.01082553, + "balance_loss_clip": 1.05167794, + "balance_loss_mlp": 1.07587278, + "epoch": 0.03511197955809409, + "flos": 21323828401920.0, + "grad_norm": 2.3581841085942004, + "language_loss": 0.80997103, + "learning_rate": 3.99972603353012e-06, + "loss": 0.83348215, + "num_input_tokens_seen": 12335070, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.921875, + "step": 584, + "time_per_iteration": 2.4776926040649414 + }, + { + "auxiliary_loss_clip": 0.01262257, + "auxiliary_loss_mlp": 0.01063212, + "balance_loss_clip": 1.03326654, + "balance_loss_mlp": 1.0725317, + "epoch": 0.035172102810762065, + "flos": 14136595320960.0, + "grad_norm": 2.6245680316153743, + "language_loss": 0.92654932, + "learning_rate": 3.999719549492551e-06, + "loss": 0.94980395, + "num_input_tokens_seen": 12350315, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.8984375, + "step": 585, + "time_per_iteration": 2.486678123474121 + }, + { + "auxiliary_loss_clip": 0.01262479, + "auxiliary_loss_mlp": 0.01070226, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.07368612, + "epoch": 0.03523222606343003, + "flos": 20296890564480.0, + "grad_norm": 2.4855014647160245, + "language_loss": 0.87484592, + "learning_rate": 3.9997129896254165e-06, + "loss": 0.89817297, + "num_input_tokens_seen": 12366030, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.890625, + "step": 586, + "time_per_iteration": 2.457772970199585 + }, + { + "auxiliary_loss_clip": 0.01269677, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04137754, + "balance_loss_mlp": 1.07875896, + "epoch": 0.035292349316098, + "flos": 20375822701440.0, + "grad_norm": 1.7854143394247532, + "language_loss": 0.76574278, + "learning_rate": 3.999706353928965e-06, + "loss": 0.78915149, + "num_input_tokens_seen": 12384895, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.90625, + "step": 587, + "time_per_iteration": 2.4794015884399414 + }, + { + "auxiliary_loss_clip": 0.01269924, + "auxiliary_loss_mlp": 0.01061103, + "balance_loss_clip": 1.02991772, + "balance_loss_mlp": 1.07701528, + "epoch": 0.03535247256876597, + "flos": 21468871520640.0, + "grad_norm": 1.6805414217886456, + "language_loss": 0.78441286, + "learning_rate": 3.999699642403449e-06, + "loss": 0.80772316, + "num_input_tokens_seen": 12404980, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.9296875, + "step": 588, + "time_per_iteration": 2.4755733013153076 + }, + { + "auxiliary_loss_clip": 0.01267146, + "auxiliary_loss_mlp": 0.01071411, + "balance_loss_clip": 1.03850961, + "balance_loss_mlp": 1.07600832, + "epoch": 0.03541259582143394, + "flos": 23623044946560.0, + "grad_norm": 2.6477303031273185, + "language_loss": 0.94003904, + "learning_rate": 3.99969285504912e-06, + "loss": 0.96342462, + "num_input_tokens_seen": 12423835, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.9140625, + "step": 589, + "time_per_iteration": 2.515296459197998 + }, + { + "auxiliary_loss_clip": 0.01269747, + "auxiliary_loss_mlp": 0.01067695, + "balance_loss_clip": 1.03803611, + "balance_loss_mlp": 1.07632184, + "epoch": 0.03547271907410191, + "flos": 33726367768320.0, + "grad_norm": 2.4870139863099157, + "language_loss": 0.84060037, + "learning_rate": 3.99968599186624e-06, + "loss": 0.86397475, + "num_input_tokens_seen": 12443135, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.9375, + "step": 590, + "time_per_iteration": 2.583080291748047 + }, + { + "auxiliary_loss_clip": 0.01259593, + "auxiliary_loss_mlp": 0.01062628, + "balance_loss_clip": 1.0342319, + "balance_loss_mlp": 1.07476449, + "epoch": 0.03553284232676988, + "flos": 21142695093120.0, + "grad_norm": 2.031404841890899, + "language_loss": 0.86889851, + "learning_rate": 3.999679052855065e-06, + "loss": 0.89212072, + "num_input_tokens_seen": 12462895, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 591, + "time_per_iteration": 2.497912883758545 + }, + { + "auxiliary_loss_clip": 0.01264593, + "auxiliary_loss_mlp": 0.01070221, + "balance_loss_clip": 1.03917849, + "balance_loss_mlp": 1.07271862, + "epoch": 0.03559296557943785, + "flos": 20046593617920.0, + "grad_norm": 3.1144902928375586, + "language_loss": 0.82980722, + "learning_rate": 3.999672038015861e-06, + "loss": 0.85315537, + "num_input_tokens_seen": 12481515, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.921875, + "step": 592, + "time_per_iteration": 2.463977813720703 + }, + { + "auxiliary_loss_clip": 0.01128612, + "auxiliary_loss_mlp": 0.01022486, + "balance_loss_clip": 1.01590526, + "balance_loss_mlp": 1.03881407, + "epoch": 0.035653088832105814, + "flos": 60334597244160.0, + "grad_norm": 0.8806680605255408, + "language_loss": 0.59741807, + "learning_rate": 3.999664947348893e-06, + "loss": 0.61892909, + "num_input_tokens_seen": 12548220, + "router_z_loss_clip": 0.06591797, + "router_z_loss_mlp": 0.8984375, + "step": 593, + "time_per_iteration": 3.1275696754455566 + }, + { + "auxiliary_loss_clip": 0.01262803, + "auxiliary_loss_mlp": 0.01070928, + "balance_loss_clip": 1.03988624, + "balance_loss_mlp": 1.07810974, + "epoch": 0.035713212084773786, + "flos": 20113135562880.0, + "grad_norm": 1.8853114596204945, + "language_loss": 0.87042278, + "learning_rate": 3.999657780854429e-06, + "loss": 0.89376009, + "num_input_tokens_seen": 12566105, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.84375, + "step": 594, + "time_per_iteration": 2.522805690765381 + }, + { + "auxiliary_loss_clip": 0.01262874, + "auxiliary_loss_mlp": 0.01064868, + "balance_loss_clip": 1.03539896, + "balance_loss_mlp": 1.07309461, + "epoch": 0.03577333533744176, + "flos": 26285785084800.0, + "grad_norm": 2.3431313884364395, + "language_loss": 0.83481348, + "learning_rate": 3.999650538532742e-06, + "loss": 0.85809088, + "num_input_tokens_seen": 12586680, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8984375, + "step": 595, + "time_per_iteration": 2.565220832824707 + }, + { + "auxiliary_loss_clip": 0.01261367, + "auxiliary_loss_mlp": 0.01072242, + "balance_loss_clip": 1.04216576, + "balance_loss_mlp": 1.07610273, + "epoch": 0.035833458590109724, + "flos": 10889732211840.0, + "grad_norm": 3.1278930526147426, + "language_loss": 0.96185803, + "learning_rate": 3.999643220384106e-06, + "loss": 0.98519421, + "num_input_tokens_seen": 12601605, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.859375, + "step": 596, + "time_per_iteration": 2.460515260696411 + }, + { + "auxiliary_loss_clip": 0.0126361, + "auxiliary_loss_mlp": 0.0107037, + "balance_loss_clip": 1.04185498, + "balance_loss_mlp": 1.07627654, + "epoch": 0.035893581842777696, + "flos": 22090198003200.0, + "grad_norm": 2.2167421176017204, + "language_loss": 0.82718551, + "learning_rate": 3.999635826408799e-06, + "loss": 0.85052526, + "num_input_tokens_seen": 12620365, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.875, + "step": 597, + "time_per_iteration": 2.5076701641082764 + }, + { + "auxiliary_loss_clip": 0.01261023, + "auxiliary_loss_mlp": 0.01069081, + "balance_loss_clip": 1.03956461, + "balance_loss_mlp": 1.0784421, + "epoch": 0.03595370509544566, + "flos": 23038347358080.0, + "grad_norm": 2.168981908539252, + "language_loss": 0.81386817, + "learning_rate": 3.999628356607101e-06, + "loss": 0.83716923, + "num_input_tokens_seen": 12641140, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.828125, + "step": 598, + "time_per_iteration": 2.531188726425171 + }, + { + "auxiliary_loss_clip": 0.01254264, + "auxiliary_loss_mlp": 0.0106961, + "balance_loss_clip": 1.03894937, + "balance_loss_mlp": 1.07570839, + "epoch": 0.03601382834811363, + "flos": 20777734955520.0, + "grad_norm": 1.9075541218278638, + "language_loss": 0.81387949, + "learning_rate": 3.999620810979295e-06, + "loss": 0.83711827, + "num_input_tokens_seen": 12661080, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.7890625, + "step": 599, + "time_per_iteration": 2.511871576309204 + }, + { + "auxiliary_loss_clip": 0.01262476, + "auxiliary_loss_mlp": 0.01074253, + "balance_loss_clip": 1.04557085, + "balance_loss_mlp": 1.07350755, + "epoch": 0.036073951600781605, + "flos": 23951627585280.0, + "grad_norm": 2.1528215266255604, + "language_loss": 0.86115932, + "learning_rate": 3.999613189525668e-06, + "loss": 0.88452661, + "num_input_tokens_seen": 12678270, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.890625, + "step": 600, + "time_per_iteration": 2.50054669380188 + }, + { + "auxiliary_loss_clip": 0.01254617, + "auxiliary_loss_mlp": 0.01080731, + "balance_loss_clip": 1.05133438, + "balance_loss_mlp": 1.06909621, + "epoch": 0.03613407485344957, + "flos": 18912283050240.0, + "grad_norm": 3.928737875146519, + "language_loss": 0.82175761, + "learning_rate": 3.999605492246508e-06, + "loss": 0.84511113, + "num_input_tokens_seen": 12697295, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8515625, + "step": 601, + "time_per_iteration": 6.795202255249023 + }, + { + "auxiliary_loss_clip": 0.01253245, + "auxiliary_loss_mlp": 0.01056304, + "balance_loss_clip": 1.02666831, + "balance_loss_mlp": 1.07096183, + "epoch": 0.03619419810611754, + "flos": 23038526926080.0, + "grad_norm": 2.2629653513719252, + "language_loss": 0.75467926, + "learning_rate": 3.999597719142107e-06, + "loss": 0.77777481, + "num_input_tokens_seen": 12716165, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8203125, + "step": 602, + "time_per_iteration": 2.503629446029663 + }, + { + "auxiliary_loss_clip": 0.01252806, + "auxiliary_loss_mlp": 0.01057069, + "balance_loss_clip": 1.02833962, + "balance_loss_mlp": 1.07078326, + "epoch": 0.03625432135878551, + "flos": 29457774293760.0, + "grad_norm": 1.9962737747137984, + "language_loss": 0.80078572, + "learning_rate": 3.999589870212761e-06, + "loss": 0.82388449, + "num_input_tokens_seen": 12735475, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 603, + "time_per_iteration": 2.568368911743164 + }, + { + "auxiliary_loss_clip": 0.01258325, + "auxiliary_loss_mlp": 0.01061531, + "balance_loss_clip": 1.03320646, + "balance_loss_mlp": 1.07597041, + "epoch": 0.03631444461145348, + "flos": 23508525409920.0, + "grad_norm": 1.9836566776981934, + "language_loss": 0.86801207, + "learning_rate": 3.9995819454587664e-06, + "loss": 0.89121068, + "num_input_tokens_seen": 12754540, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8203125, + "step": 604, + "time_per_iteration": 2.496415376663208 + }, + { + "auxiliary_loss_clip": 0.01260423, + "auxiliary_loss_mlp": 0.01065702, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.07688427, + "epoch": 0.03637456786412145, + "flos": 16618130323200.0, + "grad_norm": 3.252638522711271, + "language_loss": 0.81078291, + "learning_rate": 3.999573944880424e-06, + "loss": 0.83404416, + "num_input_tokens_seen": 12773050, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 605, + "time_per_iteration": 2.46071457862854 + }, + { + "auxiliary_loss_clip": 0.01255946, + "auxiliary_loss_mlp": 0.01067125, + "balance_loss_clip": 1.04012406, + "balance_loss_mlp": 1.07317901, + "epoch": 0.03643469111678942, + "flos": 15851832549120.0, + "grad_norm": 2.2162807408147964, + "language_loss": 0.85624671, + "learning_rate": 3.9995658684780375e-06, + "loss": 0.87947738, + "num_input_tokens_seen": 12791240, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.828125, + "step": 606, + "time_per_iteration": 2.450775623321533 + }, + { + "auxiliary_loss_clip": 0.01262483, + "auxiliary_loss_mlp": 0.01072166, + "balance_loss_clip": 1.04279351, + "balance_loss_mlp": 1.07551849, + "epoch": 0.03649481436945739, + "flos": 23620387340160.0, + "grad_norm": 2.1498788116147125, + "language_loss": 0.82370651, + "learning_rate": 3.999557716251912e-06, + "loss": 0.84705305, + "num_input_tokens_seen": 12812245, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 607, + "time_per_iteration": 2.4969747066497803 + }, + { + "auxiliary_loss_clip": 0.01255757, + "auxiliary_loss_mlp": 0.01063348, + "balance_loss_clip": 1.035954, + "balance_loss_mlp": 1.07488835, + "epoch": 0.036554937622125354, + "flos": 21755581879680.0, + "grad_norm": 3.329641026295442, + "language_loss": 0.8315016, + "learning_rate": 3.999549488202358e-06, + "loss": 0.8546927, + "num_input_tokens_seen": 12831085, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.8046875, + "step": 608, + "time_per_iteration": 2.4648640155792236 + }, + { + "auxiliary_loss_clip": 0.01260127, + "auxiliary_loss_mlp": 0.0106578, + "balance_loss_clip": 1.03533435, + "balance_loss_mlp": 1.0769459, + "epoch": 0.036615060874793326, + "flos": 17819772935040.0, + "grad_norm": 2.072924568315734, + "language_loss": 0.82258713, + "learning_rate": 3.999541184329688e-06, + "loss": 0.84584618, + "num_input_tokens_seen": 12849115, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.828125, + "step": 609, + "time_per_iteration": 2.4761714935302734 + }, + { + "auxiliary_loss_clip": 0.01266536, + "auxiliary_loss_mlp": 0.01080333, + "balance_loss_clip": 1.05247378, + "balance_loss_mlp": 1.08229148, + "epoch": 0.0366751841274613, + "flos": 26753808320640.0, + "grad_norm": 2.279075715646142, + "language_loss": 0.7924515, + "learning_rate": 3.999532804634215e-06, + "loss": 0.81592017, + "num_input_tokens_seen": 12868005, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.84375, + "step": 610, + "time_per_iteration": 2.512223958969116 + }, + { + "auxiliary_loss_clip": 0.01265179, + "auxiliary_loss_mlp": 0.01076881, + "balance_loss_clip": 1.04767442, + "balance_loss_mlp": 1.07819688, + "epoch": 0.03673530738012926, + "flos": 22196960202240.0, + "grad_norm": 2.108980449215705, + "language_loss": 0.87263799, + "learning_rate": 3.9995243491162575e-06, + "loss": 0.89605856, + "num_input_tokens_seen": 12886890, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8671875, + "step": 611, + "time_per_iteration": 2.488800525665283 + }, + { + "auxiliary_loss_clip": 0.01257304, + "auxiliary_loss_mlp": 0.01084406, + "balance_loss_clip": 1.05577183, + "balance_loss_mlp": 1.0769043, + "epoch": 0.036795430632797235, + "flos": 24681655601280.0, + "grad_norm": 2.0539399448943145, + "language_loss": 0.72783852, + "learning_rate": 3.999515817776136e-06, + "loss": 0.75125557, + "num_input_tokens_seen": 12906130, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8046875, + "step": 612, + "time_per_iteration": 2.4950740337371826 + }, + { + "auxiliary_loss_clip": 0.01258776, + "auxiliary_loss_mlp": 0.01069045, + "balance_loss_clip": 1.03999329, + "balance_loss_mlp": 1.07377708, + "epoch": 0.0368555538854652, + "flos": 17748921358080.0, + "grad_norm": 2.903841869182041, + "language_loss": 0.7909385, + "learning_rate": 3.999507210614175e-06, + "loss": 0.81421661, + "num_input_tokens_seen": 12925260, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8515625, + "step": 613, + "time_per_iteration": 2.4849369525909424 + }, + { + "auxiliary_loss_clip": 0.01253943, + "auxiliary_loss_mlp": 0.01079095, + "balance_loss_clip": 1.05141413, + "balance_loss_mlp": 1.07326341, + "epoch": 0.03691567713813317, + "flos": 20594554571520.0, + "grad_norm": 2.273957434397869, + "language_loss": 0.93266213, + "learning_rate": 3.9994985276307e-06, + "loss": 0.95599246, + "num_input_tokens_seen": 12944590, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8125, + "step": 614, + "time_per_iteration": 2.4639992713928223 + }, + { + "auxiliary_loss_clip": 0.01263574, + "auxiliary_loss_mlp": 0.01075313, + "balance_loss_clip": 1.04415178, + "balance_loss_mlp": 1.07938302, + "epoch": 0.036975800390801145, + "flos": 33650380546560.0, + "grad_norm": 2.901964177226116, + "language_loss": 0.72534943, + "learning_rate": 3.999489768826041e-06, + "loss": 0.74873829, + "num_input_tokens_seen": 12964785, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.84375, + "step": 615, + "time_per_iteration": 2.601372480392456 + }, + { + "auxiliary_loss_clip": 0.01258092, + "auxiliary_loss_mlp": 0.01071353, + "balance_loss_clip": 1.04299331, + "balance_loss_mlp": 1.07278967, + "epoch": 0.03703592364346911, + "flos": 28293694329600.0, + "grad_norm": 2.023635364571096, + "language_loss": 0.81449711, + "learning_rate": 3.999480934200528e-06, + "loss": 0.83779156, + "num_input_tokens_seen": 12986705, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8515625, + "step": 616, + "time_per_iteration": 2.5325467586517334 + }, + { + "auxiliary_loss_clip": 0.01256707, + "auxiliary_loss_mlp": 0.01063142, + "balance_loss_clip": 1.03643894, + "balance_loss_mlp": 1.07431316, + "epoch": 0.03709604689613708, + "flos": 31504215853440.0, + "grad_norm": 1.9753277492127743, + "language_loss": 0.67868775, + "learning_rate": 3.999472023754499e-06, + "loss": 0.7018863, + "num_input_tokens_seen": 13010560, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.828125, + "step": 617, + "time_per_iteration": 2.5784177780151367 + }, + { + "auxiliary_loss_clip": 0.01263095, + "auxiliary_loss_mlp": 0.01064585, + "balance_loss_clip": 1.0349381, + "balance_loss_mlp": 1.07892454, + "epoch": 0.03715617014880505, + "flos": 19609381272960.0, + "grad_norm": 3.556814357499394, + "language_loss": 0.80340034, + "learning_rate": 3.99946303748829e-06, + "loss": 0.8266772, + "num_input_tokens_seen": 13028935, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.84375, + "step": 618, + "time_per_iteration": 2.4876792430877686 + }, + { + "auxiliary_loss_clip": 0.01261829, + "auxiliary_loss_mlp": 0.01070874, + "balance_loss_clip": 1.04059458, + "balance_loss_mlp": 1.07458091, + "epoch": 0.03721629340147302, + "flos": 15924192497280.0, + "grad_norm": 2.355648226269084, + "language_loss": 0.91115171, + "learning_rate": 3.999453975402242e-06, + "loss": 0.93447876, + "num_input_tokens_seen": 13046000, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.875, + "step": 619, + "time_per_iteration": 2.4804162979125977 + }, + { + "auxiliary_loss_clip": 0.01259898, + "auxiliary_loss_mlp": 0.01077134, + "balance_loss_clip": 1.04871452, + "balance_loss_mlp": 1.07845378, + "epoch": 0.03727641665414099, + "flos": 21104090951040.0, + "grad_norm": 2.218621959424752, + "language_loss": 0.94397002, + "learning_rate": 3.9994448374967e-06, + "loss": 0.96734041, + "num_input_tokens_seen": 13062995, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8125, + "step": 620, + "time_per_iteration": 2.4592232704162598 + }, + { + "auxiliary_loss_clip": 0.01257463, + "auxiliary_loss_mlp": 0.01077616, + "balance_loss_clip": 1.04750419, + "balance_loss_mlp": 1.07455909, + "epoch": 0.037336539906808956, + "flos": 24131683486080.0, + "grad_norm": 1.8159025601621845, + "language_loss": 0.77105826, + "learning_rate": 3.999435623772008e-06, + "loss": 0.7944091, + "num_input_tokens_seen": 13084120, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.828125, + "step": 621, + "time_per_iteration": 2.53365159034729 + }, + { + "auxiliary_loss_clip": 0.01255819, + "auxiliary_loss_mlp": 0.01059811, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.07761526, + "epoch": 0.03739666315947693, + "flos": 22346384780160.0, + "grad_norm": 2.793013868715132, + "language_loss": 0.86895752, + "learning_rate": 3.999426334228518e-06, + "loss": 0.89211386, + "num_input_tokens_seen": 13100035, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 622, + "time_per_iteration": 2.472726583480835 + }, + { + "auxiliary_loss_clip": 0.01258428, + "auxiliary_loss_mlp": 0.01064577, + "balance_loss_clip": 1.03591871, + "balance_loss_mlp": 1.07622766, + "epoch": 0.0374567864121449, + "flos": 20449511452800.0, + "grad_norm": 2.261361439009279, + "language_loss": 0.90376818, + "learning_rate": 3.999416968866581e-06, + "loss": 0.9269982, + "num_input_tokens_seen": 13118070, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.8203125, + "step": 623, + "time_per_iteration": 2.486699104309082 + }, + { + "auxiliary_loss_clip": 0.0125978, + "auxiliary_loss_mlp": 0.01075147, + "balance_loss_clip": 1.04626298, + "balance_loss_mlp": 1.07841158, + "epoch": 0.037516909664812866, + "flos": 19208043636480.0, + "grad_norm": 1.9910669563462169, + "language_loss": 0.84149444, + "learning_rate": 3.999407527686551e-06, + "loss": 0.86484373, + "num_input_tokens_seen": 13136355, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.8125, + "step": 624, + "time_per_iteration": 2.4514520168304443 + }, + { + "auxiliary_loss_clip": 0.01261437, + "auxiliary_loss_mlp": 0.01067743, + "balance_loss_clip": 1.03867936, + "balance_loss_mlp": 1.0750618, + "epoch": 0.03757703291748084, + "flos": 35005218664320.0, + "grad_norm": 2.4867963928692554, + "language_loss": 0.66228586, + "learning_rate": 3.999398010688788e-06, + "loss": 0.68557763, + "num_input_tokens_seen": 13155435, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8671875, + "step": 625, + "time_per_iteration": 2.5765273571014404 + }, + { + "auxiliary_loss_clip": 0.01253583, + "auxiliary_loss_mlp": 0.0106714, + "balance_loss_clip": 1.03697979, + "balance_loss_mlp": 1.07435441, + "epoch": 0.0376371561701488, + "flos": 25483899911040.0, + "grad_norm": 2.071255255654034, + "language_loss": 0.77375329, + "learning_rate": 3.999388417873652e-06, + "loss": 0.79696059, + "num_input_tokens_seen": 13174295, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7890625, + "step": 626, + "time_per_iteration": 2.5022406578063965 + }, + { + "auxiliary_loss_clip": 0.01258684, + "auxiliary_loss_mlp": 0.01074389, + "balance_loss_clip": 1.04499173, + "balance_loss_mlp": 1.07735705, + "epoch": 0.037697279422816775, + "flos": 18185630912640.0, + "grad_norm": 2.2077512286027288, + "language_loss": 0.81357861, + "learning_rate": 3.999378749241506e-06, + "loss": 0.83690929, + "num_input_tokens_seen": 13192500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 627, + "time_per_iteration": 2.4750607013702393 + }, + { + "auxiliary_loss_clip": 0.01261632, + "auxiliary_loss_mlp": 0.01076941, + "balance_loss_clip": 1.04768682, + "balance_loss_mlp": 1.07859111, + "epoch": 0.03775740267548475, + "flos": 24644272521600.0, + "grad_norm": 3.546199216596373, + "language_loss": 0.88572276, + "learning_rate": 3.999369004792719e-06, + "loss": 0.90910852, + "num_input_tokens_seen": 13213470, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 628, + "time_per_iteration": 2.571899890899658 + }, + { + "auxiliary_loss_clip": 0.01253553, + "auxiliary_loss_mlp": 0.01067038, + "balance_loss_clip": 1.03864217, + "balance_loss_mlp": 1.07086658, + "epoch": 0.03781752592815271, + "flos": 21288205088640.0, + "grad_norm": 2.488861546346732, + "language_loss": 0.79683006, + "learning_rate": 3.999359184527658e-06, + "loss": 0.82003593, + "num_input_tokens_seen": 13232365, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.828125, + "step": 629, + "time_per_iteration": 2.486675262451172 + }, + { + "auxiliary_loss_clip": 0.01258011, + "auxiliary_loss_mlp": 0.01067816, + "balance_loss_clip": 1.03977799, + "balance_loss_mlp": 1.07458425, + "epoch": 0.037877649180820684, + "flos": 22089623385600.0, + "grad_norm": 1.7117761504495859, + "language_loss": 0.76808703, + "learning_rate": 3.999349288446696e-06, + "loss": 0.79134536, + "num_input_tokens_seen": 13251920, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.8359375, + "step": 630, + "time_per_iteration": 2.494297742843628 + }, + { + "auxiliary_loss_clip": 0.01262746, + "auxiliary_loss_mlp": 0.01070638, + "balance_loss_clip": 1.04250503, + "balance_loss_mlp": 1.07651484, + "epoch": 0.03793777243348865, + "flos": 14501339976960.0, + "grad_norm": 2.6765452133705403, + "language_loss": 0.91492796, + "learning_rate": 3.99933931655021e-06, + "loss": 0.93826187, + "num_input_tokens_seen": 13267440, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.859375, + "step": 631, + "time_per_iteration": 2.4605348110198975 + }, + { + "auxiliary_loss_clip": 0.01252436, + "auxiliary_loss_mlp": 0.01076716, + "balance_loss_clip": 1.04560196, + "balance_loss_mlp": 1.07244229, + "epoch": 0.03799789568615662, + "flos": 21908418249600.0, + "grad_norm": 1.669704350294595, + "language_loss": 0.9207651, + "learning_rate": 3.999329268838575e-06, + "loss": 0.94405663, + "num_input_tokens_seen": 13287850, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.796875, + "step": 632, + "time_per_iteration": 2.518498659133911 + }, + { + "auxiliary_loss_clip": 0.01255106, + "auxiliary_loss_mlp": 0.01058467, + "balance_loss_clip": 1.03069162, + "balance_loss_mlp": 1.07462335, + "epoch": 0.03805801893882459, + "flos": 24827021942400.0, + "grad_norm": 2.0828864645498872, + "language_loss": 0.8341018, + "learning_rate": 3.999319145312175e-06, + "loss": 0.85723758, + "num_input_tokens_seen": 13307760, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.8046875, + "step": 633, + "time_per_iteration": 2.5217537879943848 + }, + { + "auxiliary_loss_clip": 0.01258224, + "auxiliary_loss_mlp": 0.01071025, + "balance_loss_clip": 1.04153264, + "balance_loss_mlp": 1.07408428, + "epoch": 0.03811814219149256, + "flos": 30482952364800.0, + "grad_norm": 1.6987522649376106, + "language_loss": 0.69638437, + "learning_rate": 3.999308945971392e-06, + "loss": 0.71967685, + "num_input_tokens_seen": 13331230, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.84375, + "step": 634, + "time_per_iteration": 2.5694239139556885 + }, + { + "auxiliary_loss_clip": 0.01127675, + "auxiliary_loss_mlp": 0.01017483, + "balance_loss_clip": 1.0106163, + "balance_loss_mlp": 1.04225707, + "epoch": 0.03817826544416053, + "flos": 66992577379200.0, + "grad_norm": 0.8852243261294688, + "language_loss": 0.61585373, + "learning_rate": 3.999298670816614e-06, + "loss": 0.63730532, + "num_input_tokens_seen": 13394760, + "router_z_loss_clip": 0.06884766, + "router_z_loss_mlp": 0.8515625, + "step": 635, + "time_per_iteration": 3.1059212684631348 + }, + { + "auxiliary_loss_clip": 0.01253433, + "auxiliary_loss_mlp": 0.01068627, + "balance_loss_clip": 1.03930187, + "balance_loss_mlp": 1.07354546, + "epoch": 0.038238388696828496, + "flos": 20485350247680.0, + "grad_norm": 2.2313569204055246, + "language_loss": 0.83721048, + "learning_rate": 3.9992883198482294e-06, + "loss": 0.86043108, + "num_input_tokens_seen": 13412775, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.796875, + "step": 636, + "time_per_iteration": 2.4975383281707764 + }, + { + "auxiliary_loss_clip": 0.01258014, + "auxiliary_loss_mlp": 0.01077997, + "balance_loss_clip": 1.04852867, + "balance_loss_mlp": 1.07623935, + "epoch": 0.03829851194949647, + "flos": 17965893461760.0, + "grad_norm": 2.4018992949787847, + "language_loss": 0.79327047, + "learning_rate": 3.999277893066632e-06, + "loss": 0.8166306, + "num_input_tokens_seen": 13427835, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8203125, + "step": 637, + "time_per_iteration": 2.4560744762420654 + }, + { + "auxiliary_loss_clip": 0.01258084, + "auxiliary_loss_mlp": 0.01073075, + "balance_loss_clip": 1.04342771, + "balance_loss_mlp": 1.07309079, + "epoch": 0.03835863520216444, + "flos": 22456522857600.0, + "grad_norm": 2.8779285506389924, + "language_loss": 0.8410306, + "learning_rate": 3.999267390472215e-06, + "loss": 0.86434221, + "num_input_tokens_seen": 13447295, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 638, + "time_per_iteration": 2.504343271255493 + }, + { + "auxiliary_loss_clip": 0.01263348, + "auxiliary_loss_mlp": 0.01067898, + "balance_loss_clip": 1.03717756, + "balance_loss_mlp": 1.07495832, + "epoch": 0.038418758454832405, + "flos": 22164425458560.0, + "grad_norm": 2.5416523890288976, + "language_loss": 0.70099992, + "learning_rate": 3.999256812065381e-06, + "loss": 0.72431237, + "num_input_tokens_seen": 13468455, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.890625, + "step": 639, + "time_per_iteration": 2.52817964553833 + }, + { + "auxiliary_loss_clip": 0.01259266, + "auxiliary_loss_mlp": 0.01075603, + "balance_loss_clip": 1.04463232, + "balance_loss_mlp": 1.07514286, + "epoch": 0.03847888170750038, + "flos": 22747435107840.0, + "grad_norm": 2.42201861797838, + "language_loss": 0.85030365, + "learning_rate": 3.999246157846526e-06, + "loss": 0.8736524, + "num_input_tokens_seen": 13489085, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 640, + "time_per_iteration": 2.503262758255005 + }, + { + "auxiliary_loss_clip": 0.0126167, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04725742, + "balance_loss_mlp": 1.07574821, + "epoch": 0.03853900496016834, + "flos": 22711201263360.0, + "grad_norm": 2.3722848939528953, + "language_loss": 0.82117289, + "learning_rate": 3.9992354278160574e-06, + "loss": 0.84458065, + "num_input_tokens_seen": 13509120, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.859375, + "step": 641, + "time_per_iteration": 2.51052188873291 + }, + { + "auxiliary_loss_clip": 0.01118992, + "auxiliary_loss_mlp": 0.01008303, + "balance_loss_clip": 1.00205636, + "balance_loss_mlp": 1.03414774, + "epoch": 0.038599128212836314, + "flos": 70399136355840.0, + "grad_norm": 0.9008353353488252, + "language_loss": 0.6540072, + "learning_rate": 3.999224621974381e-06, + "loss": 0.67528021, + "num_input_tokens_seen": 13562005, + "router_z_loss_clip": 0.06225586, + "router_z_loss_mlp": 0.8515625, + "step": 642, + "time_per_iteration": 4.430839538574219 + }, + { + "auxiliary_loss_clip": 0.01256856, + "auxiliary_loss_mlp": 0.01062608, + "balance_loss_clip": 1.03433132, + "balance_loss_mlp": 1.07364345, + "epoch": 0.03865925146550429, + "flos": 23295144666240.0, + "grad_norm": 1.9870813050305103, + "language_loss": 0.79512584, + "learning_rate": 3.999213740321906e-06, + "loss": 0.81832051, + "num_input_tokens_seen": 13582185, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.8359375, + "step": 643, + "time_per_iteration": 5.386199951171875 + }, + { + "auxiliary_loss_clip": 0.01255871, + "auxiliary_loss_mlp": 0.01074414, + "balance_loss_clip": 1.0456841, + "balance_loss_mlp": 1.07266629, + "epoch": 0.03871937471817225, + "flos": 21430446946560.0, + "grad_norm": 2.074949815918338, + "language_loss": 0.82926929, + "learning_rate": 3.999202782859046e-06, + "loss": 0.85257208, + "num_input_tokens_seen": 13599555, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.828125, + "step": 644, + "time_per_iteration": 2.45499587059021 + }, + { + "auxiliary_loss_clip": 0.01260265, + "auxiliary_loss_mlp": 0.0106622, + "balance_loss_clip": 1.03503489, + "balance_loss_mlp": 1.07482159, + "epoch": 0.038779497970840224, + "flos": 34277309550720.0, + "grad_norm": 2.258008571643512, + "language_loss": 0.82131916, + "learning_rate": 3.9991917495862165e-06, + "loss": 0.84458399, + "num_input_tokens_seen": 13621160, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.859375, + "step": 645, + "time_per_iteration": 2.610435724258423 + }, + { + "auxiliary_loss_clip": 0.0126099, + "auxiliary_loss_mlp": 0.01070847, + "balance_loss_clip": 1.04121125, + "balance_loss_mlp": 1.07544899, + "epoch": 0.03883962122350819, + "flos": 22748189293440.0, + "grad_norm": 2.4729923618605554, + "language_loss": 0.82006776, + "learning_rate": 3.9991806405038345e-06, + "loss": 0.84338611, + "num_input_tokens_seen": 13641915, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8515625, + "step": 646, + "time_per_iteration": 2.4771342277526855 + }, + { + "auxiliary_loss_clip": 0.01260575, + "auxiliary_loss_mlp": 0.01080585, + "balance_loss_clip": 1.05123544, + "balance_loss_mlp": 1.07928514, + "epoch": 0.03889974447617616, + "flos": 21945837242880.0, + "grad_norm": 1.8327945326632593, + "language_loss": 0.81973422, + "learning_rate": 3.999169455612323e-06, + "loss": 0.84314579, + "num_input_tokens_seen": 13661410, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.8125, + "step": 647, + "time_per_iteration": 2.522347927093506 + }, + { + "auxiliary_loss_clip": 0.01260388, + "auxiliary_loss_mlp": 0.01069051, + "balance_loss_clip": 1.03965366, + "balance_loss_mlp": 1.07776546, + "epoch": 0.03895986772884413, + "flos": 31504826384640.0, + "grad_norm": 1.9222642653000834, + "language_loss": 0.84699827, + "learning_rate": 3.999158194912106e-06, + "loss": 0.87029266, + "num_input_tokens_seen": 13681705, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.828125, + "step": 648, + "time_per_iteration": 2.561929941177368 + }, + { + "auxiliary_loss_clip": 0.01258218, + "auxiliary_loss_mlp": 0.01071465, + "balance_loss_clip": 1.041448, + "balance_loss_mlp": 1.07636404, + "epoch": 0.0390199909815121, + "flos": 19901011795200.0, + "grad_norm": 3.7283662397985053, + "language_loss": 0.84446943, + "learning_rate": 3.9991468584036086e-06, + "loss": 0.86776626, + "num_input_tokens_seen": 13700400, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8203125, + "step": 649, + "time_per_iteration": 2.477027416229248 + }, + { + "auxiliary_loss_clip": 0.01259496, + "auxiliary_loss_mlp": 0.01070031, + "balance_loss_clip": 1.03977561, + "balance_loss_mlp": 1.07551885, + "epoch": 0.03908011423418007, + "flos": 21612478095360.0, + "grad_norm": 1.8508721849532739, + "language_loss": 0.79670662, + "learning_rate": 3.999135446087263e-06, + "loss": 0.8200019, + "num_input_tokens_seen": 13720145, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.84375, + "step": 650, + "time_per_iteration": 2.482440948486328 + }, + { + "auxiliary_loss_clip": 0.0125375, + "auxiliary_loss_mlp": 0.01072982, + "balance_loss_clip": 1.04314423, + "balance_loss_mlp": 1.07259929, + "epoch": 0.039140237486848035, + "flos": 18661411486080.0, + "grad_norm": 2.708739352564946, + "language_loss": 0.78509629, + "learning_rate": 3.9991239579635e-06, + "loss": 0.80836356, + "num_input_tokens_seen": 13737500, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 651, + "time_per_iteration": 2.4757516384124756 + }, + { + "auxiliary_loss_clip": 0.01255418, + "auxiliary_loss_mlp": 0.01082545, + "balance_loss_clip": 1.05004883, + "balance_loss_mlp": 1.0719974, + "epoch": 0.03920036073951601, + "flos": 18661124177280.0, + "grad_norm": 2.7896665115169244, + "language_loss": 0.88031149, + "learning_rate": 3.999112394032757e-06, + "loss": 0.90369117, + "num_input_tokens_seen": 13754750, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 652, + "time_per_iteration": 2.4425668716430664 + }, + { + "auxiliary_loss_clip": 0.01249027, + "auxiliary_loss_mlp": 0.01069663, + "balance_loss_clip": 1.0411005, + "balance_loss_mlp": 1.07108784, + "epoch": 0.03926048399218398, + "flos": 31354468053120.0, + "grad_norm": 3.185528651545475, + "language_loss": 0.79044777, + "learning_rate": 3.999100754295471e-06, + "loss": 0.81363463, + "num_input_tokens_seen": 13771990, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.78125, + "step": 653, + "time_per_iteration": 2.5651934146881104 + }, + { + "auxiliary_loss_clip": 0.01264568, + "auxiliary_loss_mlp": 0.01070462, + "balance_loss_clip": 1.03996825, + "balance_loss_mlp": 1.07603264, + "epoch": 0.039320607244851945, + "flos": 29603499770880.0, + "grad_norm": 2.207303268368246, + "language_loss": 0.86304128, + "learning_rate": 3.999089038752085e-06, + "loss": 0.88639158, + "num_input_tokens_seen": 13792750, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8828125, + "step": 654, + "time_per_iteration": 2.533297061920166 + }, + { + "auxiliary_loss_clip": 0.01115043, + "auxiliary_loss_mlp": 0.01012751, + "balance_loss_clip": 1.00710094, + "balance_loss_mlp": 1.03246427, + "epoch": 0.03938073049751992, + "flos": 66534609951360.0, + "grad_norm": 0.7205066186016396, + "language_loss": 0.49900642, + "learning_rate": 3.999077247403041e-06, + "loss": 0.5202843, + "num_input_tokens_seen": 13858570, + "router_z_loss_clip": 0.05639648, + "router_z_loss_mlp": 0.82421875, + "step": 655, + "time_per_iteration": 3.1399919986724854 + }, + { + "auxiliary_loss_clip": 0.01251012, + "auxiliary_loss_mlp": 0.01066863, + "balance_loss_clip": 1.03866971, + "balance_loss_mlp": 1.07330465, + "epoch": 0.03944085375018788, + "flos": 23367827836800.0, + "grad_norm": 2.4228021909793918, + "language_loss": 0.80845964, + "learning_rate": 3.9990653802487886e-06, + "loss": 0.83163846, + "num_input_tokens_seen": 13876335, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.78125, + "step": 656, + "time_per_iteration": 2.5063297748565674 + }, + { + "auxiliary_loss_clip": 0.01264211, + "auxiliary_loss_mlp": 0.0109165, + "balance_loss_clip": 1.0566026, + "balance_loss_mlp": 1.07672703, + "epoch": 0.039500977002855854, + "flos": 18548292579840.0, + "grad_norm": 2.8602268717749526, + "language_loss": 0.76602596, + "learning_rate": 3.999053437289776e-06, + "loss": 0.78958458, + "num_input_tokens_seen": 13892640, + "router_z_loss_clip": 0.34960938, + "router_z_loss_mlp": 1.875, + "step": 657, + "time_per_iteration": 2.4405555725097656 + }, + { + "auxiliary_loss_clip": 0.01258331, + "auxiliary_loss_mlp": 0.01071967, + "balance_loss_clip": 1.04192615, + "balance_loss_mlp": 1.07452726, + "epoch": 0.039561100255523826, + "flos": 25338174433920.0, + "grad_norm": 2.1526815744488945, + "language_loss": 0.81690443, + "learning_rate": 3.999041418526457e-06, + "loss": 0.84020746, + "num_input_tokens_seen": 13910085, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.84375, + "step": 658, + "time_per_iteration": 2.5383949279785156 + }, + { + "auxiliary_loss_clip": 0.01252051, + "auxiliary_loss_mlp": 0.01072669, + "balance_loss_clip": 1.04091132, + "balance_loss_mlp": 1.07283425, + "epoch": 0.03962122350819179, + "flos": 18219889509120.0, + "grad_norm": 2.2075021313123777, + "language_loss": 0.91331315, + "learning_rate": 3.999029323959287e-06, + "loss": 0.93656039, + "num_input_tokens_seen": 13928800, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.796875, + "step": 659, + "time_per_iteration": 2.4678854942321777 + }, + { + "auxiliary_loss_clip": 0.01259034, + "auxiliary_loss_mlp": 0.01066414, + "balance_loss_clip": 1.03699267, + "balance_loss_mlp": 1.07427669, + "epoch": 0.03968134676085976, + "flos": 20522230536960.0, + "grad_norm": 2.5412719342676215, + "language_loss": 0.79241848, + "learning_rate": 3.999017153588724e-06, + "loss": 0.81567293, + "num_input_tokens_seen": 13948325, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.8515625, + "step": 660, + "time_per_iteration": 2.5135834217071533 + }, + { + "auxiliary_loss_clip": 0.01255641, + "auxiliary_loss_mlp": 0.01070807, + "balance_loss_clip": 1.04017007, + "balance_loss_mlp": 1.07534087, + "epoch": 0.03974147001352773, + "flos": 22422587483520.0, + "grad_norm": 1.6909533460123631, + "language_loss": 0.81942898, + "learning_rate": 3.999004907415231e-06, + "loss": 0.84269351, + "num_input_tokens_seen": 13969090, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.796875, + "step": 661, + "time_per_iteration": 2.513702154159546 + }, + { + "auxiliary_loss_clip": 0.01112947, + "auxiliary_loss_mlp": 0.01010967, + "balance_loss_clip": 1.00519753, + "balance_loss_mlp": 1.03039932, + "epoch": 0.0398015932661957, + "flos": 71128769322240.0, + "grad_norm": 0.9113020435813882, + "language_loss": 0.69376045, + "learning_rate": 3.998992585439272e-06, + "loss": 0.7149995, + "num_input_tokens_seen": 14037555, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.82421875, + "step": 662, + "time_per_iteration": 3.2435107231140137 + }, + { + "auxiliary_loss_clip": 0.01260063, + "auxiliary_loss_mlp": 0.01071537, + "balance_loss_clip": 1.04113865, + "balance_loss_mlp": 1.0779382, + "epoch": 0.03986171651886367, + "flos": 16800951571200.0, + "grad_norm": 2.025040011333182, + "language_loss": 0.83253002, + "learning_rate": 3.998980187661314e-06, + "loss": 0.85584599, + "num_input_tokens_seen": 14055765, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.8125, + "step": 663, + "time_per_iteration": 2.5213887691497803 + }, + { + "auxiliary_loss_clip": 0.01261822, + "auxiliary_loss_mlp": 0.0106269, + "balance_loss_clip": 1.032125, + "balance_loss_mlp": 1.07768416, + "epoch": 0.03992183977153164, + "flos": 24535068197760.0, + "grad_norm": 2.8595031628608143, + "language_loss": 0.87538105, + "learning_rate": 3.998967714081826e-06, + "loss": 0.89862621, + "num_input_tokens_seen": 14074195, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.84375, + "step": 664, + "time_per_iteration": 2.516810655593872 + }, + { + "auxiliary_loss_clip": 0.0125116, + "auxiliary_loss_mlp": 0.01060628, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.07347679, + "epoch": 0.03998196302419961, + "flos": 15595897167360.0, + "grad_norm": 2.3519362819230625, + "language_loss": 0.84738994, + "learning_rate": 3.998955164701281e-06, + "loss": 0.87050784, + "num_input_tokens_seen": 14090215, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.7734375, + "step": 665, + "time_per_iteration": 2.4348978996276855 + }, + { + "auxiliary_loss_clip": 0.01263346, + "auxiliary_loss_mlp": 0.01087391, + "balance_loss_clip": 1.05525231, + "balance_loss_mlp": 1.07680821, + "epoch": 0.04004208627686758, + "flos": 25305065072640.0, + "grad_norm": 2.1279588772882687, + "language_loss": 0.81491798, + "learning_rate": 3.998942539520158e-06, + "loss": 0.83842534, + "num_input_tokens_seen": 14112150, + "router_z_loss_clip": 0.32226562, + "router_z_loss_mlp": 1.8671875, + "step": 666, + "time_per_iteration": 2.564187526702881 + }, + { + "auxiliary_loss_clip": 0.01252779, + "auxiliary_loss_mlp": 0.01074876, + "balance_loss_clip": 1.04276049, + "balance_loss_mlp": 1.07225358, + "epoch": 0.04010220952953555, + "flos": 23475847011840.0, + "grad_norm": 2.9939634291419526, + "language_loss": 0.87121451, + "learning_rate": 3.998929838538932e-06, + "loss": 0.89449108, + "num_input_tokens_seen": 14131475, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.8046875, + "step": 667, + "time_per_iteration": 2.547909736633301 + }, + { + "auxiliary_loss_clip": 0.0125258, + "auxiliary_loss_mlp": 0.01066007, + "balance_loss_clip": 1.03661036, + "balance_loss_mlp": 1.07692444, + "epoch": 0.04016233278220352, + "flos": 18617025254400.0, + "grad_norm": 2.627098567014159, + "language_loss": 0.80619991, + "learning_rate": 3.998917061758087e-06, + "loss": 0.82938576, + "num_input_tokens_seen": 14146165, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7578125, + "step": 668, + "time_per_iteration": 2.441667079925537 + }, + { + "auxiliary_loss_clip": 0.01110895, + "auxiliary_loss_mlp": 0.01011229, + "balance_loss_clip": 1.0053643, + "balance_loss_mlp": 1.02968836, + "epoch": 0.040222456034871484, + "flos": 70906194696960.0, + "grad_norm": 0.7872457900726799, + "language_loss": 0.60042131, + "learning_rate": 3.998904209178107e-06, + "loss": 0.62164247, + "num_input_tokens_seen": 14215005, + "router_z_loss_clip": 0.05859375, + "router_z_loss_mlp": 0.8125, + "step": 669, + "time_per_iteration": 3.200874090194702 + }, + { + "auxiliary_loss_clip": 0.01253738, + "auxiliary_loss_mlp": 0.0107276, + "balance_loss_clip": 1.0431962, + "balance_loss_mlp": 1.07228541, + "epoch": 0.040282579287539456, + "flos": 23764712186880.0, + "grad_norm": 1.7415828974469272, + "language_loss": 0.86405391, + "learning_rate": 3.9988912807994785e-06, + "loss": 0.88731897, + "num_input_tokens_seen": 14235510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 670, + "time_per_iteration": 2.5169434547424316 + }, + { + "auxiliary_loss_clip": 0.0124964, + "auxiliary_loss_mlp": 0.01070621, + "balance_loss_clip": 1.0414381, + "balance_loss_mlp": 1.07305872, + "epoch": 0.04034270254020743, + "flos": 18478518410880.0, + "grad_norm": 1.9261739939324196, + "language_loss": 0.752123, + "learning_rate": 3.998878276622692e-06, + "loss": 0.7753256, + "num_input_tokens_seen": 14254565, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.765625, + "step": 671, + "time_per_iteration": 2.514566421508789 + }, + { + "auxiliary_loss_clip": 0.01259516, + "auxiliary_loss_mlp": 0.01075144, + "balance_loss_clip": 1.04472136, + "balance_loss_mlp": 1.0774349, + "epoch": 0.040402825792875394, + "flos": 17201858244480.0, + "grad_norm": 2.0846907245314688, + "language_loss": 0.92279977, + "learning_rate": 3.998865196648242e-06, + "loss": 0.94614637, + "num_input_tokens_seen": 14271885, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8203125, + "step": 672, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01253491, + "auxiliary_loss_mlp": 0.01071171, + "balance_loss_clip": 1.03921115, + "balance_loss_mlp": 1.07329202, + "epoch": 0.040462949045543366, + "flos": 19172168928000.0, + "grad_norm": 1.816355722874097, + "language_loss": 0.90220857, + "learning_rate": 3.998852040876622e-06, + "loss": 0.92545515, + "num_input_tokens_seen": 14289670, + "router_z_loss_clip": 0.3203125, + "router_z_loss_mlp": 1.796875, + "step": 673, + "time_per_iteration": 2.450547456741333 + }, + { + "auxiliary_loss_clip": 0.01249229, + "auxiliary_loss_mlp": 0.01077482, + "balance_loss_clip": 1.0463202, + "balance_loss_mlp": 1.07150948, + "epoch": 0.04052307229821133, + "flos": 24019821555840.0, + "grad_norm": 2.117589951798075, + "language_loss": 0.74881005, + "learning_rate": 3.998838809308334e-06, + "loss": 0.77207714, + "num_input_tokens_seen": 14309285, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.78125, + "step": 674, + "time_per_iteration": 2.5444436073303223 + }, + { + "auxiliary_loss_clip": 0.01260981, + "auxiliary_loss_mlp": 0.01061202, + "balance_loss_clip": 1.03036261, + "balance_loss_mlp": 1.07609737, + "epoch": 0.0405831955508793, + "flos": 16436601964800.0, + "grad_norm": 2.2422867770418797, + "language_loss": 0.78305578, + "learning_rate": 3.9988255019438766e-06, + "loss": 0.80627763, + "num_input_tokens_seen": 14328300, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.84375, + "step": 675, + "time_per_iteration": 2.4525954723358154 + }, + { + "auxiliary_loss_clip": 0.01252319, + "auxiliary_loss_mlp": 0.01078615, + "balance_loss_clip": 1.04578447, + "balance_loss_mlp": 1.07254028, + "epoch": 0.040643318803547275, + "flos": 24279922915200.0, + "grad_norm": 1.7072695919905723, + "language_loss": 0.76650077, + "learning_rate": 3.998812118783757e-06, + "loss": 0.78981006, + "num_input_tokens_seen": 14346395, + "router_z_loss_clip": 0.328125, + "router_z_loss_mlp": 1.796875, + "step": 676, + "time_per_iteration": 2.530043840408325 + }, + { + "auxiliary_loss_clip": 0.01258388, + "auxiliary_loss_mlp": 0.01076398, + "balance_loss_clip": 1.04564214, + "balance_loss_mlp": 1.0750767, + "epoch": 0.04070344205621524, + "flos": 17712076982400.0, + "grad_norm": 2.3168648577819138, + "language_loss": 0.85182011, + "learning_rate": 3.9987986598284804e-06, + "loss": 0.87516803, + "num_input_tokens_seen": 14364605, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.828125, + "step": 677, + "time_per_iteration": 2.4390082359313965 + }, + { + "auxiliary_loss_clip": 0.01249568, + "auxiliary_loss_mlp": 0.01068372, + "balance_loss_clip": 1.03804517, + "balance_loss_mlp": 1.071486, + "epoch": 0.04076356530888321, + "flos": 26177658168960.0, + "grad_norm": 1.7808730288109123, + "language_loss": 0.76348364, + "learning_rate": 3.998785125078559e-06, + "loss": 0.78666306, + "num_input_tokens_seen": 14385265, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.78125, + "step": 678, + "time_per_iteration": 2.5151596069335938 + }, + { + "auxiliary_loss_clip": 0.01250603, + "auxiliary_loss_mlp": 0.01066495, + "balance_loss_clip": 1.03807509, + "balance_loss_mlp": 1.07162285, + "epoch": 0.04082368856155118, + "flos": 35773455772800.0, + "grad_norm": 1.9938089142752387, + "language_loss": 0.82114184, + "learning_rate": 3.998771514534505e-06, + "loss": 0.84431279, + "num_input_tokens_seen": 14406090, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7890625, + "step": 679, + "time_per_iteration": 2.5701568126678467 + }, + { + "auxiliary_loss_clip": 0.01255726, + "auxiliary_loss_mlp": 0.01057721, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.07693028, + "epoch": 0.04088381181421915, + "flos": 28146640049280.0, + "grad_norm": 1.893911305727382, + "language_loss": 0.76349533, + "learning_rate": 3.998757828196835e-06, + "loss": 0.7866298, + "num_input_tokens_seen": 14425130, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7890625, + "step": 680, + "time_per_iteration": 2.5289864540100098 + }, + { + "auxiliary_loss_clip": 0.01255007, + "auxiliary_loss_mlp": 0.01065478, + "balance_loss_clip": 1.03305268, + "balance_loss_mlp": 1.07167506, + "epoch": 0.04094393506688712, + "flos": 27597673514880.0, + "grad_norm": 1.7999776318515568, + "language_loss": 0.83315849, + "learning_rate": 3.9987440660660685e-06, + "loss": 0.8563633, + "num_input_tokens_seen": 14447355, + "router_z_loss_clip": 0.32421875, + "router_z_loss_mlp": 1.8359375, + "step": 681, + "time_per_iteration": 2.5313305854797363 + }, + { + "auxiliary_loss_clip": 0.01253144, + "auxiliary_loss_mlp": 0.01064685, + "balance_loss_clip": 1.03302324, + "balance_loss_mlp": 1.07082057, + "epoch": 0.04100405831955509, + "flos": 23112036109440.0, + "grad_norm": 1.6690976928218293, + "language_loss": 0.71312869, + "learning_rate": 3.998730228142726e-06, + "loss": 0.73630697, + "num_input_tokens_seen": 14466790, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.828125, + "step": 682, + "time_per_iteration": 2.5190017223358154 + }, + { + "auxiliary_loss_clip": 0.01251459, + "auxiliary_loss_mlp": 0.01068202, + "balance_loss_clip": 1.03911471, + "balance_loss_mlp": 1.07090235, + "epoch": 0.04106418157222306, + "flos": 20156731695360.0, + "grad_norm": 1.7744847161326498, + "language_loss": 0.72373003, + "learning_rate": 3.998716314427333e-06, + "loss": 0.74692667, + "num_input_tokens_seen": 14485195, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.8046875, + "step": 683, + "time_per_iteration": 2.473156690597534 + }, + { + "auxiliary_loss_clip": 0.01250706, + "auxiliary_loss_mlp": 0.01075324, + "balance_loss_clip": 1.04540253, + "balance_loss_mlp": 1.07707, + "epoch": 0.041124304824891024, + "flos": 17420697855360.0, + "grad_norm": 2.316908811268422, + "language_loss": 0.81263745, + "learning_rate": 3.998702324920417e-06, + "loss": 0.83589774, + "num_input_tokens_seen": 14503370, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 684, + "time_per_iteration": 5.34027099609375 + }, + { + "auxiliary_loss_clip": 0.01251905, + "auxiliary_loss_mlp": 0.01072266, + "balance_loss_clip": 1.04053211, + "balance_loss_mlp": 1.07572865, + "epoch": 0.041184428077558996, + "flos": 25780163287680.0, + "grad_norm": 1.5327144156887007, + "language_loss": 0.90501672, + "learning_rate": 3.9986882596225085e-06, + "loss": 0.92825842, + "num_input_tokens_seen": 14526415, + "router_z_loss_clip": 0.31835938, + "router_z_loss_mlp": 1.765625, + "step": 685, + "time_per_iteration": 3.918776750564575 + }, + { + "auxiliary_loss_clip": 0.01253389, + "auxiliary_loss_mlp": 0.010703, + "balance_loss_clip": 1.04002118, + "balance_loss_mlp": 1.07458997, + "epoch": 0.04124455133022697, + "flos": 22964766347520.0, + "grad_norm": 2.0402082016953234, + "language_loss": 0.87871253, + "learning_rate": 3.998674118534141e-06, + "loss": 0.90194941, + "num_input_tokens_seen": 14546595, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.7890625, + "step": 686, + "time_per_iteration": 2.481177806854248 + }, + { + "auxiliary_loss_clip": 0.01258153, + "auxiliary_loss_mlp": 0.01071669, + "balance_loss_clip": 1.04158103, + "balance_loss_mlp": 1.07474661, + "epoch": 0.04130467458289493, + "flos": 21289067015040.0, + "grad_norm": 1.7716861202834375, + "language_loss": 0.71645427, + "learning_rate": 3.998659901655851e-06, + "loss": 0.73975253, + "num_input_tokens_seen": 14566590, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.8359375, + "step": 687, + "time_per_iteration": 2.4720261096954346 + }, + { + "auxiliary_loss_clip": 0.01252382, + "auxiliary_loss_mlp": 0.01070684, + "balance_loss_clip": 1.04262209, + "balance_loss_mlp": 1.07918715, + "epoch": 0.041364797835562905, + "flos": 19974233669760.0, + "grad_norm": 2.117746024922212, + "language_loss": 0.8642537, + "learning_rate": 3.998645608988177e-06, + "loss": 0.88748431, + "num_input_tokens_seen": 14585965, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.734375, + "step": 688, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01249454, + "auxiliary_loss_mlp": 0.01083042, + "balance_loss_clip": 1.05338287, + "balance_loss_mlp": 1.07534754, + "epoch": 0.04142492108823087, + "flos": 21906227520000.0, + "grad_norm": 2.6487514234328304, + "language_loss": 0.83326006, + "learning_rate": 3.998631240531661e-06, + "loss": 0.85658503, + "num_input_tokens_seen": 14606015, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7421875, + "step": 689, + "time_per_iteration": 2.4689462184906006 + }, + { + "auxiliary_loss_clip": 0.01248134, + "auxiliary_loss_mlp": 0.01077255, + "balance_loss_clip": 1.04847789, + "balance_loss_mlp": 1.07176828, + "epoch": 0.04148504434089884, + "flos": 27639617621760.0, + "grad_norm": 1.7821885346326607, + "language_loss": 0.68391848, + "learning_rate": 3.998616796286848e-06, + "loss": 0.70717239, + "num_input_tokens_seen": 14629955, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.765625, + "step": 690, + "time_per_iteration": 2.5583128929138184 + }, + { + "auxiliary_loss_clip": 0.012458, + "auxiliary_loss_mlp": 0.01071299, + "balance_loss_clip": 1.04197323, + "balance_loss_mlp": 1.07094526, + "epoch": 0.041545167593566815, + "flos": 20518387781760.0, + "grad_norm": 1.747700039366933, + "language_loss": 0.74933273, + "learning_rate": 3.998602276254286e-06, + "loss": 0.77250373, + "num_input_tokens_seen": 14648000, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 691, + "time_per_iteration": 2.4566729068756104 + }, + { + "auxiliary_loss_clip": 0.01246178, + "auxiliary_loss_mlp": 0.0107911, + "balance_loss_clip": 1.04890203, + "balance_loss_mlp": 1.07268727, + "epoch": 0.04160529084623478, + "flos": 11868907939200.0, + "grad_norm": 2.450885846250815, + "language_loss": 0.84518701, + "learning_rate": 3.998587680434526e-06, + "loss": 0.86843991, + "num_input_tokens_seen": 14662235, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.734375, + "step": 692, + "time_per_iteration": 2.4667932987213135 + }, + { + "auxiliary_loss_clip": 0.01252043, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.04124784, + "balance_loss_mlp": 1.07099986, + "epoch": 0.04166541409890275, + "flos": 14828306503680.0, + "grad_norm": 9.166238009589804, + "language_loss": 0.89107299, + "learning_rate": 3.99857300882812e-06, + "loss": 0.9143213, + "num_input_tokens_seen": 14676065, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.8125, + "step": 693, + "time_per_iteration": 2.4823052883148193 + }, + { + "auxiliary_loss_clip": 0.01254961, + "auxiliary_loss_mlp": 0.01065864, + "balance_loss_clip": 1.03637171, + "balance_loss_mlp": 1.07755136, + "epoch": 0.04172553735157072, + "flos": 25808137004160.0, + "grad_norm": 2.1462970179067646, + "language_loss": 0.82179356, + "learning_rate": 3.998558261435626e-06, + "loss": 0.84500182, + "num_input_tokens_seen": 14694955, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 694, + "time_per_iteration": 2.564098834991455 + }, + { + "auxiliary_loss_clip": 0.01253069, + "auxiliary_loss_mlp": 0.01067691, + "balance_loss_clip": 1.03791225, + "balance_loss_mlp": 1.07214785, + "epoch": 0.04178566060423869, + "flos": 24279815174400.0, + "grad_norm": 2.057768586122239, + "language_loss": 0.83656573, + "learning_rate": 3.9985434382576015e-06, + "loss": 0.85977334, + "num_input_tokens_seen": 14715510, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.8125, + "step": 695, + "time_per_iteration": 2.5122969150543213 + }, + { + "auxiliary_loss_clip": 0.01249861, + "auxiliary_loss_mlp": 0.01073319, + "balance_loss_clip": 1.04270577, + "balance_loss_mlp": 1.07313716, + "epoch": 0.04184578385690666, + "flos": 18222008411520.0, + "grad_norm": 2.138642052855673, + "language_loss": 0.8441087, + "learning_rate": 3.99852853929461e-06, + "loss": 0.86734056, + "num_input_tokens_seen": 14731755, + "router_z_loss_clip": 0.30664062, + "router_z_loss_mlp": 1.765625, + "step": 696, + "time_per_iteration": 2.462756872177124 + }, + { + "auxiliary_loss_clip": 0.01247863, + "auxiliary_loss_mlp": 0.01073791, + "balance_loss_clip": 1.04253471, + "balance_loss_mlp": 1.07146811, + "epoch": 0.041905907109574626, + "flos": 22776342577920.0, + "grad_norm": 2.042298821772003, + "language_loss": 0.93134123, + "learning_rate": 3.998513564547216e-06, + "loss": 0.95455778, + "num_input_tokens_seen": 14750810, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.765625, + "step": 697, + "time_per_iteration": 2.5189502239227295 + }, + { + "auxiliary_loss_clip": 0.0124398, + "auxiliary_loss_mlp": 0.01069004, + "balance_loss_clip": 1.04048967, + "balance_loss_mlp": 1.07146859, + "epoch": 0.0419660303622426, + "flos": 20156947176960.0, + "grad_norm": 2.2837511795811207, + "language_loss": 0.83989406, + "learning_rate": 3.998498514015987e-06, + "loss": 0.86302388, + "num_input_tokens_seen": 14768435, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.71875, + "step": 698, + "time_per_iteration": 2.5080907344818115 + }, + { + "auxiliary_loss_clip": 0.01247569, + "auxiliary_loss_mlp": 0.01086724, + "balance_loss_clip": 1.05551505, + "balance_loss_mlp": 1.0711751, + "epoch": 0.042026153614910564, + "flos": 23076376882560.0, + "grad_norm": 1.9405760650289445, + "language_loss": 0.91369909, + "learning_rate": 3.998483387701495e-06, + "loss": 0.93704206, + "num_input_tokens_seen": 14786690, + "router_z_loss_clip": 0.3125, + "router_z_loss_mlp": 1.7578125, + "step": 699, + "time_per_iteration": 2.4667766094207764 + }, + { + "auxiliary_loss_clip": 0.01113685, + "auxiliary_loss_mlp": 0.01024099, + "balance_loss_clip": 1.01842487, + "balance_loss_mlp": 1.03384757, + "epoch": 0.042086276867578536, + "flos": 64495243370880.0, + "grad_norm": 0.8964375713204716, + "language_loss": 0.67850006, + "learning_rate": 3.998468185604312e-06, + "loss": 0.69987792, + "num_input_tokens_seen": 14853840, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.796875, + "step": 700, + "time_per_iteration": 3.1214911937713623 + }, + { + "auxiliary_loss_clip": 0.01254452, + "auxiliary_loss_mlp": 0.01078478, + "balance_loss_clip": 1.04695964, + "balance_loss_mlp": 1.07502532, + "epoch": 0.04214640012024651, + "flos": 15487016065920.0, + "grad_norm": 2.6789371965697524, + "language_loss": 0.89020562, + "learning_rate": 3.998452907725016e-06, + "loss": 0.913535, + "num_input_tokens_seen": 14869580, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 701, + "time_per_iteration": 2.46085524559021 + }, + { + "auxiliary_loss_clip": 0.01250018, + "auxiliary_loss_mlp": 0.0107128, + "balance_loss_clip": 1.04085803, + "balance_loss_mlp": 1.07681179, + "epoch": 0.04220652337291447, + "flos": 23877040993920.0, + "grad_norm": 2.2592774096130794, + "language_loss": 0.67494118, + "learning_rate": 3.998437554064184e-06, + "loss": 0.69815421, + "num_input_tokens_seen": 14891065, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 702, + "time_per_iteration": 2.5170979499816895 + }, + { + "auxiliary_loss_clip": 0.01112958, + "auxiliary_loss_mlp": 0.01006834, + "balance_loss_clip": 1.00161314, + "balance_loss_mlp": 1.03296542, + "epoch": 0.042266646625582445, + "flos": 63795451628160.0, + "grad_norm": 0.8426087453226233, + "language_loss": 0.60777819, + "learning_rate": 3.9984221246224006e-06, + "loss": 0.62897617, + "num_input_tokens_seen": 14954815, + "router_z_loss_clip": 0.05224609, + "router_z_loss_mlp": 0.80078125, + "step": 703, + "time_per_iteration": 3.155794143676758 + }, + { + "auxiliary_loss_clip": 0.01112196, + "auxiliary_loss_mlp": 0.01010352, + "balance_loss_clip": 1.0050354, + "balance_loss_mlp": 1.03251982, + "epoch": 0.04232676987825041, + "flos": 50018863345920.0, + "grad_norm": 1.0167549333074237, + "language_loss": 0.5776214, + "learning_rate": 3.9984066194002494e-06, + "loss": 0.59884691, + "num_input_tokens_seen": 15003050, + "router_z_loss_clip": 0.05322266, + "router_z_loss_mlp": 0.796875, + "step": 704, + "time_per_iteration": 2.95633602142334 + }, + { + "auxiliary_loss_clip": 0.01252148, + "auxiliary_loss_mlp": 0.01070665, + "balance_loss_clip": 1.0397656, + "balance_loss_mlp": 1.07432342, + "epoch": 0.04238689313091838, + "flos": 21616105368960.0, + "grad_norm": 2.1970745802550624, + "language_loss": 0.87708455, + "learning_rate": 3.998391038398319e-06, + "loss": 0.90031266, + "num_input_tokens_seen": 15021990, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 705, + "time_per_iteration": 2.51802921295166 + }, + { + "auxiliary_loss_clip": 0.01238458, + "auxiliary_loss_mlp": 0.01062417, + "balance_loss_clip": 1.03498721, + "balance_loss_mlp": 1.06876624, + "epoch": 0.042447016383586354, + "flos": 19135109070720.0, + "grad_norm": 1.7054575923778923, + "language_loss": 0.71612352, + "learning_rate": 3.998375381617201e-06, + "loss": 0.73913229, + "num_input_tokens_seen": 15040700, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 706, + "time_per_iteration": 2.464270830154419 + }, + { + "auxiliary_loss_clip": 0.01243119, + "auxiliary_loss_mlp": 0.01068207, + "balance_loss_clip": 1.03816676, + "balance_loss_mlp": 1.07029784, + "epoch": 0.04250713963625432, + "flos": 24426007528320.0, + "grad_norm": 2.0927829932503714, + "language_loss": 0.93480223, + "learning_rate": 3.9983596490574875e-06, + "loss": 0.95791554, + "num_input_tokens_seen": 15056725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 707, + "time_per_iteration": 2.5087966918945312 + }, + { + "auxiliary_loss_clip": 0.01245928, + "auxiliary_loss_mlp": 0.01065311, + "balance_loss_clip": 1.03441203, + "balance_loss_mlp": 1.0676806, + "epoch": 0.04256726288892229, + "flos": 30367391333760.0, + "grad_norm": 2.3244890877745883, + "language_loss": 0.81275034, + "learning_rate": 3.998343840719776e-06, + "loss": 0.83586276, + "num_input_tokens_seen": 15077550, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.78125, + "step": 708, + "time_per_iteration": 2.557119607925415 + }, + { + "auxiliary_loss_clip": 0.01251091, + "auxiliary_loss_mlp": 0.01073266, + "balance_loss_clip": 1.04239082, + "balance_loss_mlp": 1.07195199, + "epoch": 0.04262738614159026, + "flos": 16362661818240.0, + "grad_norm": 2.2553269788690224, + "language_loss": 0.82229173, + "learning_rate": 3.998327956604666e-06, + "loss": 0.84553528, + "num_input_tokens_seen": 15094955, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.796875, + "step": 709, + "time_per_iteration": 2.4828600883483887 + }, + { + "auxiliary_loss_clip": 0.01256006, + "auxiliary_loss_mlp": 0.01064315, + "balance_loss_clip": 1.03389335, + "balance_loss_mlp": 1.07517564, + "epoch": 0.04268750939425823, + "flos": 20412379768320.0, + "grad_norm": 2.534138916450152, + "language_loss": 0.85063422, + "learning_rate": 3.99831199671276e-06, + "loss": 0.87383747, + "num_input_tokens_seen": 15113395, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.8125, + "step": 710, + "time_per_iteration": 2.453641653060913 + }, + { + "auxiliary_loss_clip": 0.01254724, + "auxiliary_loss_mlp": 0.01070713, + "balance_loss_clip": 1.04114938, + "balance_loss_mlp": 1.07757199, + "epoch": 0.0427476326469262, + "flos": 20302959962880.0, + "grad_norm": 3.316207411440496, + "language_loss": 0.84996349, + "learning_rate": 3.998295961044662e-06, + "loss": 0.87321782, + "num_input_tokens_seen": 15132920, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.7734375, + "step": 711, + "time_per_iteration": 2.4685802459716797 + }, + { + "auxiliary_loss_clip": 0.01246695, + "auxiliary_loss_mlp": 0.01069917, + "balance_loss_clip": 1.03827882, + "balance_loss_mlp": 1.07044697, + "epoch": 0.042807755899594166, + "flos": 21650794928640.0, + "grad_norm": 2.000925777751644, + "language_loss": 0.85439169, + "learning_rate": 3.9982798496009804e-06, + "loss": 0.87755781, + "num_input_tokens_seen": 15153115, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.765625, + "step": 712, + "time_per_iteration": 2.5002598762512207 + }, + { + "auxiliary_loss_clip": 0.01252579, + "auxiliary_loss_mlp": 0.0107294, + "balance_loss_clip": 1.0445205, + "balance_loss_mlp": 1.0701685, + "epoch": 0.04286787915226214, + "flos": 21435007973760.0, + "grad_norm": 3.2453781921901728, + "language_loss": 0.90829903, + "learning_rate": 3.998263662382328e-06, + "loss": 0.9315542, + "num_input_tokens_seen": 15172770, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.8203125, + "step": 713, + "time_per_iteration": 2.4908998012542725 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01017546, + "balance_loss_clip": 1.01187158, + "balance_loss_mlp": 1.0288384, + "epoch": 0.04292800240493011, + "flos": 66397970615040.0, + "grad_norm": 0.8777811618173876, + "language_loss": 0.63746506, + "learning_rate": 3.9982473993893165e-06, + "loss": 0.65872955, + "num_input_tokens_seen": 15240055, + "router_z_loss_clip": 0.05664062, + "router_z_loss_mlp": 0.80078125, + "step": 714, + "time_per_iteration": 3.158921480178833 + }, + { + "auxiliary_loss_clip": 0.01249012, + "auxiliary_loss_mlp": 0.01080593, + "balance_loss_clip": 1.05076694, + "balance_loss_mlp": 1.07545531, + "epoch": 0.042988125657598075, + "flos": 31650264552960.0, + "grad_norm": 2.1622955343434382, + "language_loss": 0.74528754, + "learning_rate": 3.998231060622563e-06, + "loss": 0.76858354, + "num_input_tokens_seen": 15261585, + "router_z_loss_clip": 0.29882812, + "router_z_loss_mlp": 1.734375, + "step": 715, + "time_per_iteration": 2.5759642124176025 + }, + { + "auxiliary_loss_clip": 0.01250142, + "auxiliary_loss_mlp": 0.01077038, + "balance_loss_clip": 1.04534006, + "balance_loss_mlp": 1.07450986, + "epoch": 0.04304824891026605, + "flos": 33248468292480.0, + "grad_norm": 2.2108029839954213, + "language_loss": 0.72630137, + "learning_rate": 3.998214646082688e-06, + "loss": 0.74957311, + "num_input_tokens_seen": 15281160, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7578125, + "step": 716, + "time_per_iteration": 2.5973668098449707 + }, + { + "auxiliary_loss_clip": 0.01104967, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00091577, + "balance_loss_mlp": 1.02687418, + "epoch": 0.04310837216293401, + "flos": 64064782782720.0, + "grad_norm": 0.9052113850529176, + "language_loss": 0.65557301, + "learning_rate": 3.998198155770314e-06, + "loss": 0.67669141, + "num_input_tokens_seen": 15344505, + "router_z_loss_clip": 0.05957031, + "router_z_loss_mlp": 0.78125, + "step": 717, + "time_per_iteration": 3.114957571029663 + }, + { + "auxiliary_loss_clip": 0.01104969, + "auxiliary_loss_mlp": 0.01003955, + "balance_loss_clip": 0.99780369, + "balance_loss_mlp": 1.02667391, + "epoch": 0.043168495415601985, + "flos": 61343757849600.0, + "grad_norm": 0.9880116621267147, + "language_loss": 0.58762264, + "learning_rate": 3.998181589686065e-06, + "loss": 0.60871184, + "num_input_tokens_seen": 15404050, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.78125, + "step": 718, + "time_per_iteration": 2.910278797149658 + }, + { + "auxiliary_loss_clip": 0.01248398, + "auxiliary_loss_mlp": 0.01074809, + "balance_loss_clip": 1.04314709, + "balance_loss_mlp": 1.0758605, + "epoch": 0.04322861866826996, + "flos": 20704261685760.0, + "grad_norm": 1.8513004644505335, + "language_loss": 0.91198725, + "learning_rate": 3.99816494783057e-06, + "loss": 0.93521935, + "num_input_tokens_seen": 15424190, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.7265625, + "step": 719, + "time_per_iteration": 2.492509126663208 + }, + { + "auxiliary_loss_clip": 0.01244347, + "auxiliary_loss_mlp": 0.0107141, + "balance_loss_clip": 1.04208493, + "balance_loss_mlp": 1.06931555, + "epoch": 0.04328874192093792, + "flos": 30373352991360.0, + "grad_norm": 1.803377327315558, + "language_loss": 0.66468138, + "learning_rate": 3.99814823020446e-06, + "loss": 0.68783891, + "num_input_tokens_seen": 15446500, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.75, + "step": 720, + "time_per_iteration": 2.6061203479766846 + }, + { + "auxiliary_loss_clip": 0.01244682, + "auxiliary_loss_mlp": 0.01079523, + "balance_loss_clip": 1.04895782, + "balance_loss_mlp": 1.07152998, + "epoch": 0.043348865173605894, + "flos": 21944795748480.0, + "grad_norm": 1.8832143461121282, + "language_loss": 0.77743989, + "learning_rate": 3.9981314368083684e-06, + "loss": 0.80068195, + "num_input_tokens_seen": 15465830, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.734375, + "step": 721, + "time_per_iteration": 2.5255632400512695 + }, + { + "auxiliary_loss_clip": 0.01251204, + "auxiliary_loss_mlp": 0.0108774, + "balance_loss_clip": 1.05879569, + "balance_loss_mlp": 1.07584524, + "epoch": 0.04340898842627386, + "flos": 15264225959040.0, + "grad_norm": 3.027898330451403, + "language_loss": 0.87873065, + "learning_rate": 3.998114567642933e-06, + "loss": 0.90212011, + "num_input_tokens_seen": 15479985, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.75, + "step": 722, + "time_per_iteration": 2.536283493041992 + }, + { + "auxiliary_loss_clip": 0.0125365, + "auxiliary_loss_mlp": 0.01075404, + "balance_loss_clip": 1.04660296, + "balance_loss_mlp": 1.0758208, + "epoch": 0.04346911167894183, + "flos": 27965434913280.0, + "grad_norm": 30.376200688873947, + "language_loss": 0.84770942, + "learning_rate": 3.998097622708792e-06, + "loss": 0.87099999, + "num_input_tokens_seen": 15501545, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.78125, + "step": 723, + "time_per_iteration": 2.5167360305786133 + }, + { + "auxiliary_loss_clip": 0.01256754, + "auxiliary_loss_mlp": 0.01076494, + "balance_loss_clip": 1.04638171, + "balance_loss_mlp": 1.07828176, + "epoch": 0.0435292349316098, + "flos": 29242202820480.0, + "grad_norm": 1.9203333396820472, + "language_loss": 0.82793808, + "learning_rate": 3.99808060200659e-06, + "loss": 0.85127056, + "num_input_tokens_seen": 15521725, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.78125, + "step": 724, + "time_per_iteration": 2.5561347007751465 + }, + { + "auxiliary_loss_clip": 0.0125067, + "auxiliary_loss_mlp": 0.01090321, + "balance_loss_clip": 1.05975556, + "balance_loss_mlp": 1.07561088, + "epoch": 0.04358935818427777, + "flos": 20558356640640.0, + "grad_norm": 1.8200683460759586, + "language_loss": 0.79530561, + "learning_rate": 3.998063505536971e-06, + "loss": 0.81871551, + "num_input_tokens_seen": 15540910, + "router_z_loss_clip": 0.3046875, + "router_z_loss_mlp": 1.75, + "step": 725, + "time_per_iteration": 2.4551918506622314 + }, + { + "auxiliary_loss_clip": 0.0126067, + "auxiliary_loss_mlp": 0.01076358, + "balance_loss_clip": 1.04529178, + "balance_loss_mlp": 1.07715642, + "epoch": 0.04364948143694574, + "flos": 14464926564480.0, + "grad_norm": 2.8106150104808485, + "language_loss": 0.87100697, + "learning_rate": 3.998046333300584e-06, + "loss": 0.89437729, + "num_input_tokens_seen": 15558640, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.8359375, + "step": 726, + "time_per_iteration": 5.350574731826782 + }, + { + "auxiliary_loss_clip": 0.01106916, + "auxiliary_loss_mlp": 0.01011263, + "balance_loss_clip": 1.00542223, + "balance_loss_mlp": 1.02866364, + "epoch": 0.043709604689613706, + "flos": 50067268922880.0, + "grad_norm": 0.9088619113669424, + "language_loss": 0.5587045, + "learning_rate": 3.998029085298079e-06, + "loss": 0.57988632, + "num_input_tokens_seen": 15612975, + "router_z_loss_clip": 0.05834961, + "router_z_loss_mlp": 0.78125, + "step": 727, + "time_per_iteration": 3.1539440155029297 + }, + { + "auxiliary_loss_clip": 0.01251236, + "auxiliary_loss_mlp": 0.01076851, + "balance_loss_clip": 1.04676282, + "balance_loss_mlp": 1.07453549, + "epoch": 0.04376972794228168, + "flos": 13991588115840.0, + "grad_norm": 2.397861957488019, + "language_loss": 0.82248902, + "learning_rate": 3.998011761530112e-06, + "loss": 0.84576982, + "num_input_tokens_seen": 15631070, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.765625, + "step": 728, + "time_per_iteration": 2.4445250034332275 + }, + { + "auxiliary_loss_clip": 0.01244631, + "auxiliary_loss_mlp": 0.01068516, + "balance_loss_clip": 1.0395956, + "balance_loss_mlp": 1.07265663, + "epoch": 0.04382985119494965, + "flos": 22009901149440.0, + "grad_norm": 2.2715062050859745, + "language_loss": 0.77187145, + "learning_rate": 3.997994361997338e-06, + "loss": 0.79500294, + "num_input_tokens_seen": 15647825, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.71875, + "step": 729, + "time_per_iteration": 2.5091514587402344 + }, + { + "auxiliary_loss_clip": 0.0125233, + "auxiliary_loss_mlp": 0.01074681, + "balance_loss_clip": 1.04502177, + "balance_loss_mlp": 1.07452357, + "epoch": 0.043889974447617615, + "flos": 24206521472640.0, + "grad_norm": 2.258754879989397, + "language_loss": 0.9515503, + "learning_rate": 3.997976886700417e-06, + "loss": 0.97482038, + "num_input_tokens_seen": 15668260, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.78125, + "step": 730, + "time_per_iteration": 2.4795522689819336 + }, + { + "auxiliary_loss_clip": 0.0124716, + "auxiliary_loss_mlp": 0.01065838, + "balance_loss_clip": 1.03496313, + "balance_loss_mlp": 1.07000017, + "epoch": 0.04395009770028559, + "flos": 17274541415040.0, + "grad_norm": 2.2097226025839483, + "language_loss": 0.88016784, + "learning_rate": 3.997959335640013e-06, + "loss": 0.90329784, + "num_input_tokens_seen": 15685630, + "router_z_loss_clip": 0.30859375, + "router_z_loss_mlp": 1.7734375, + "step": 731, + "time_per_iteration": 2.4678709506988525 + }, + { + "auxiliary_loss_clip": 0.01251191, + "auxiliary_loss_mlp": 0.01073318, + "balance_loss_clip": 1.04589999, + "balance_loss_mlp": 1.07521737, + "epoch": 0.04401022095295355, + "flos": 12310286261760.0, + "grad_norm": 3.3707184473936587, + "language_loss": 0.88656235, + "learning_rate": 3.997941708816791e-06, + "loss": 0.90980744, + "num_input_tokens_seen": 15698645, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7578125, + "step": 732, + "time_per_iteration": 2.4135851860046387 + }, + { + "auxiliary_loss_clip": 0.01251086, + "auxiliary_loss_mlp": 0.01073165, + "balance_loss_clip": 1.04288554, + "balance_loss_mlp": 1.07443762, + "epoch": 0.044070344205621524, + "flos": 20959658363520.0, + "grad_norm": 2.131822645051773, + "language_loss": 0.86010063, + "learning_rate": 3.997924006231419e-06, + "loss": 0.88334322, + "num_input_tokens_seen": 15716775, + "router_z_loss_clip": 0.30273438, + "router_z_loss_mlp": 1.765625, + "step": 733, + "time_per_iteration": 2.491278648376465 + }, + { + "auxiliary_loss_clip": 0.01256254, + "auxiliary_loss_mlp": 0.01078649, + "balance_loss_clip": 1.04715347, + "balance_loss_mlp": 1.07624841, + "epoch": 0.044130467458289496, + "flos": 13845288021120.0, + "grad_norm": 2.0564057381838885, + "language_loss": 0.91515708, + "learning_rate": 3.9979062278845685e-06, + "loss": 0.93850613, + "num_input_tokens_seen": 15733320, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.796875, + "step": 734, + "time_per_iteration": 2.451258897781372 + }, + { + "auxiliary_loss_clip": 0.01247796, + "auxiliary_loss_mlp": 0.01064289, + "balance_loss_clip": 1.03696656, + "balance_loss_mlp": 1.07613921, + "epoch": 0.04419059071095746, + "flos": 28655063107200.0, + "grad_norm": 1.8863467898976456, + "language_loss": 0.77831066, + "learning_rate": 3.9978883737769125e-06, + "loss": 0.8014316, + "num_input_tokens_seen": 15752705, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.71875, + "step": 735, + "time_per_iteration": 2.558958053588867 + }, + { + "auxiliary_loss_clip": 0.01240634, + "auxiliary_loss_mlp": 0.01063468, + "balance_loss_clip": 1.03526342, + "balance_loss_mlp": 1.06886315, + "epoch": 0.04425071396362543, + "flos": 28183304856960.0, + "grad_norm": 2.1337917025346074, + "language_loss": 0.88456166, + "learning_rate": 3.9978704439091305e-06, + "loss": 0.90760267, + "num_input_tokens_seen": 15772800, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 736, + "time_per_iteration": 2.5100033283233643 + }, + { + "auxiliary_loss_clip": 0.01242163, + "auxiliary_loss_mlp": 0.01067088, + "balance_loss_clip": 1.03995562, + "balance_loss_mlp": 1.07473993, + "epoch": 0.0443108372162934, + "flos": 23658452778240.0, + "grad_norm": 1.954630170969084, + "language_loss": 0.84155536, + "learning_rate": 3.997852438281901e-06, + "loss": 0.86464787, + "num_input_tokens_seen": 15793665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 737, + "time_per_iteration": 2.5015766620635986 + }, + { + "auxiliary_loss_clip": 0.01251899, + "auxiliary_loss_mlp": 0.01072468, + "balance_loss_clip": 1.04077065, + "balance_loss_mlp": 1.07667851, + "epoch": 0.04437096046896137, + "flos": 33979861025280.0, + "grad_norm": 2.0376910697928947, + "language_loss": 0.8518666, + "learning_rate": 3.997834356895906e-06, + "loss": 0.87511027, + "num_input_tokens_seen": 15813175, + "router_z_loss_clip": 0.31640625, + "router_z_loss_mlp": 1.75, + "step": 738, + "time_per_iteration": 2.5576610565185547 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01046041, + "balance_loss_clip": 1.04048622, + "balance_loss_mlp": 1.03298163, + "epoch": 0.04443108372162934, + "flos": 67397506375680.0, + "grad_norm": 0.8684121686227821, + "language_loss": 0.59110028, + "learning_rate": 3.9978161997518324e-06, + "loss": 0.61268163, + "num_input_tokens_seen": 15872050, + "router_z_loss_clip": 0.05566406, + "router_z_loss_mlp": 0.7890625, + "step": 739, + "time_per_iteration": 3.0643718242645264 + }, + { + "auxiliary_loss_clip": 0.0124678, + "auxiliary_loss_mlp": 0.01070548, + "balance_loss_clip": 1.04220033, + "balance_loss_mlp": 1.07513726, + "epoch": 0.04449120697429731, + "flos": 29752672953600.0, + "grad_norm": 2.1860888775648695, + "language_loss": 0.91622591, + "learning_rate": 3.997797966850369e-06, + "loss": 0.93939924, + "num_input_tokens_seen": 15891085, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.71875, + "step": 740, + "time_per_iteration": 2.5448389053344727 + }, + { + "auxiliary_loss_clip": 0.01252276, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03693473, + "balance_loss_mlp": 1.07766986, + "epoch": 0.04455133022696528, + "flos": 36502119072000.0, + "grad_norm": 2.01644947055736, + "language_loss": 0.71842492, + "learning_rate": 3.997779658192205e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 15914225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.7421875, + "step": 741, + "time_per_iteration": 2.5979790687561035 + }, + { + "auxiliary_loss_clip": 0.01240373, + "auxiliary_loss_mlp": 0.01073056, + "balance_loss_clip": 1.04532838, + "balance_loss_mlp": 1.07044411, + "epoch": 0.044611453479633245, + "flos": 28803661672320.0, + "grad_norm": 1.722907957661965, + "language_loss": 0.88555831, + "learning_rate": 3.997761273778037e-06, + "loss": 0.9086926, + "num_input_tokens_seen": 15934540, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.703125, + "step": 742, + "time_per_iteration": 2.6367549896240234 + }, + { + "auxiliary_loss_clip": 0.0124233, + "auxiliary_loss_mlp": 0.01061826, + "balance_loss_clip": 1.03253651, + "balance_loss_mlp": 1.07209873, + "epoch": 0.04467157673230122, + "flos": 20010970304640.0, + "grad_norm": 2.0306401320231693, + "language_loss": 0.83823264, + "learning_rate": 3.997742813608561e-06, + "loss": 0.86127412, + "num_input_tokens_seen": 15952560, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.703125, + "step": 743, + "time_per_iteration": 2.516587972640991 + }, + { + "auxiliary_loss_clip": 0.01249271, + "auxiliary_loss_mlp": 0.01068722, + "balance_loss_clip": 1.04161429, + "balance_loss_mlp": 1.07474804, + "epoch": 0.04473169998496919, + "flos": 18004964480640.0, + "grad_norm": 3.0889105946672704, + "language_loss": 0.79948521, + "learning_rate": 3.997724277684479e-06, + "loss": 0.8226651, + "num_input_tokens_seen": 15970620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.75, + "step": 744, + "time_per_iteration": 2.44805645942688 + }, + { + "auxiliary_loss_clip": 0.01243449, + "auxiliary_loss_mlp": 0.01067337, + "balance_loss_clip": 1.04037201, + "balance_loss_mlp": 1.07279778, + "epoch": 0.044791823237637154, + "flos": 20631722169600.0, + "grad_norm": 2.388036535067576, + "language_loss": 0.85400093, + "learning_rate": 3.99770566600649e-06, + "loss": 0.87710881, + "num_input_tokens_seen": 15987325, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.703125, + "step": 745, + "time_per_iteration": 2.4790430068969727 + }, + { + "auxiliary_loss_clip": 0.01242131, + "auxiliary_loss_mlp": 0.01064523, + "balance_loss_clip": 1.03569877, + "balance_loss_mlp": 1.0714339, + "epoch": 0.04485194649030513, + "flos": 31176171918720.0, + "grad_norm": 2.1215702602167688, + "language_loss": 0.6866799, + "learning_rate": 3.997686978575302e-06, + "loss": 0.70974648, + "num_input_tokens_seen": 16008310, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.7109375, + "step": 746, + "time_per_iteration": 2.5645759105682373 + }, + { + "auxiliary_loss_clip": 0.01250748, + "auxiliary_loss_mlp": 0.010776, + "balance_loss_clip": 1.04748797, + "balance_loss_mlp": 1.0783143, + "epoch": 0.04491206974297309, + "flos": 26143291831680.0, + "grad_norm": 2.1376273799467547, + "language_loss": 0.68823957, + "learning_rate": 3.997668215391625e-06, + "loss": 0.71152306, + "num_input_tokens_seen": 16029620, + "router_z_loss_clip": 0.30078125, + "router_z_loss_mlp": 1.7265625, + "step": 747, + "time_per_iteration": 2.5267317295074463 + }, + { + "auxiliary_loss_clip": 0.01248685, + "auxiliary_loss_mlp": 0.01079454, + "balance_loss_clip": 1.04984236, + "balance_loss_mlp": 1.07314527, + "epoch": 0.044972192995641064, + "flos": 20667668705280.0, + "grad_norm": 1.9669744064389407, + "language_loss": 0.66721869, + "learning_rate": 3.997649376456168e-06, + "loss": 0.69050002, + "num_input_tokens_seen": 16049065, + "router_z_loss_clip": 0.296875, + "router_z_loss_mlp": 1.75, + "step": 748, + "time_per_iteration": 2.4818925857543945 + }, + { + "auxiliary_loss_clip": 0.01250197, + "auxiliary_loss_mlp": 0.01082391, + "balance_loss_clip": 1.05320835, + "balance_loss_mlp": 1.07779491, + "epoch": 0.045032316248309036, + "flos": 16106834177280.0, + "grad_norm": 2.650057046326624, + "language_loss": 0.76540357, + "learning_rate": 3.997630461769647e-06, + "loss": 0.78872949, + "num_input_tokens_seen": 16066765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.7265625, + "step": 749, + "time_per_iteration": 2.4454426765441895 + }, + { + "auxiliary_loss_clip": 0.01251335, + "auxiliary_loss_mlp": 0.01077492, + "balance_loss_clip": 1.04883409, + "balance_loss_mlp": 1.0770005, + "epoch": 0.045092439500977, + "flos": 17858843953920.0, + "grad_norm": 2.0345099055640317, + "language_loss": 0.88970172, + "learning_rate": 3.997611471332778e-06, + "loss": 0.91298997, + "num_input_tokens_seen": 16085980, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.7421875, + "step": 750, + "time_per_iteration": 2.458716630935669 + }, + { + "auxiliary_loss_clip": 0.01247033, + "auxiliary_loss_mlp": 0.01074335, + "balance_loss_clip": 1.04295921, + "balance_loss_mlp": 1.07139015, + "epoch": 0.04515256275364497, + "flos": 24462815990400.0, + "grad_norm": 2.3716924268159367, + "language_loss": 0.74869245, + "learning_rate": 3.9975924051462825e-06, + "loss": 0.77190608, + "num_input_tokens_seen": 16106260, + "router_z_loss_clip": 0.31445312, + "router_z_loss_mlp": 1.7578125, + "step": 751, + "time_per_iteration": 2.5231218338012695 + }, + { + "auxiliary_loss_clip": 0.01243504, + "auxiliary_loss_mlp": 0.01073697, + "balance_loss_clip": 1.04573071, + "balance_loss_mlp": 1.07175446, + "epoch": 0.04521268600631294, + "flos": 20916385453440.0, + "grad_norm": 2.2224468826240975, + "language_loss": 0.69360238, + "learning_rate": 3.997573263210883e-06, + "loss": 0.7167744, + "num_input_tokens_seen": 16123475, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 752, + "time_per_iteration": 2.4620048999786377 + }, + { + "auxiliary_loss_clip": 0.01244736, + "auxiliary_loss_mlp": 0.01057192, + "balance_loss_clip": 1.02927327, + "balance_loss_mlp": 1.07154715, + "epoch": 0.04527280925898091, + "flos": 13371374954880.0, + "grad_norm": 2.984649176219999, + "language_loss": 0.91634125, + "learning_rate": 3.997554045527305e-06, + "loss": 0.9393605, + "num_input_tokens_seen": 16138335, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.734375, + "step": 753, + "time_per_iteration": 2.4722437858581543 + }, + { + "auxiliary_loss_clip": 0.01249124, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04728031, + "balance_loss_mlp": 1.07501864, + "epoch": 0.04533293251164888, + "flos": 23254565276160.0, + "grad_norm": 2.2056938633592975, + "language_loss": 0.91197902, + "learning_rate": 3.997534752096277e-06, + "loss": 0.93522525, + "num_input_tokens_seen": 16157110, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.7421875, + "step": 754, + "time_per_iteration": 2.472975492477417 + }, + { + "auxiliary_loss_clip": 0.01238249, + "auxiliary_loss_mlp": 0.0107062, + "balance_loss_clip": 1.04144955, + "balance_loss_mlp": 1.07163191, + "epoch": 0.04539305576431685, + "flos": 12422004537600.0, + "grad_norm": 2.234660546964849, + "language_loss": 0.78528345, + "learning_rate": 3.997515382918531e-06, + "loss": 0.80837214, + "num_input_tokens_seen": 16174155, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.671875, + "step": 755, + "time_per_iteration": 2.4704174995422363 + }, + { + "auxiliary_loss_clip": 0.01248815, + "auxiliary_loss_mlp": 0.0107981, + "balance_loss_clip": 1.05100918, + "balance_loss_mlp": 1.07416105, + "epoch": 0.04545317901698482, + "flos": 16070995382400.0, + "grad_norm": 1.9667934561660614, + "language_loss": 0.78451371, + "learning_rate": 3.9974959379948015e-06, + "loss": 0.80779994, + "num_input_tokens_seen": 16192240, + "router_z_loss_clip": 0.28710938, + "router_z_loss_mlp": 1.75, + "step": 756, + "time_per_iteration": 2.4873547554016113 + }, + { + "auxiliary_loss_clip": 0.01118229, + "auxiliary_loss_mlp": 0.010118, + "balance_loss_clip": 1.00600612, + "balance_loss_mlp": 1.03558636, + "epoch": 0.045513302269652785, + "flos": 66396139021440.0, + "grad_norm": 0.8118987787253854, + "language_loss": 0.62730747, + "learning_rate": 3.997476417325827e-06, + "loss": 0.64860779, + "num_input_tokens_seen": 16255775, + "router_z_loss_clip": 0.05786133, + "router_z_loss_mlp": 0.828125, + "step": 757, + "time_per_iteration": 3.1292941570281982 + }, + { + "auxiliary_loss_clip": 0.01242797, + "auxiliary_loss_mlp": 0.01069674, + "balance_loss_clip": 1.04220784, + "balance_loss_mlp": 1.0731318, + "epoch": 0.04557342552232076, + "flos": 21471169991040.0, + "grad_norm": 1.5194495460848947, + "language_loss": 0.84329176, + "learning_rate": 3.997456820912346e-06, + "loss": 0.86641645, + "num_input_tokens_seen": 16277015, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6953125, + "step": 758, + "time_per_iteration": 2.498905658721924 + }, + { + "auxiliary_loss_clip": 0.01237511, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.0405376, + "balance_loss_mlp": 1.06733441, + "epoch": 0.04563354877498873, + "flos": 23732680233600.0, + "grad_norm": 2.0933163310434963, + "language_loss": 0.88315606, + "learning_rate": 3.997437148755101e-06, + "loss": 0.90620202, + "num_input_tokens_seen": 16296005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 759, + "time_per_iteration": 2.5122711658477783 + }, + { + "auxiliary_loss_clip": 0.01248241, + "auxiliary_loss_mlp": 0.01075804, + "balance_loss_clip": 1.04644299, + "balance_loss_mlp": 1.075526, + "epoch": 0.045693672027656694, + "flos": 25735741142400.0, + "grad_norm": 2.170817451496144, + "language_loss": 0.73644727, + "learning_rate": 3.9974174008548405e-06, + "loss": 0.75968778, + "num_input_tokens_seen": 16315300, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7265625, + "step": 760, + "time_per_iteration": 2.511322021484375 + }, + { + "auxiliary_loss_clip": 0.01244913, + "auxiliary_loss_mlp": 0.01073409, + "balance_loss_clip": 1.04630077, + "balance_loss_mlp": 1.07509935, + "epoch": 0.045753795280324666, + "flos": 19719016560000.0, + "grad_norm": 2.192184725657734, + "language_loss": 0.82177126, + "learning_rate": 3.9973975772123105e-06, + "loss": 0.84495443, + "num_input_tokens_seen": 16333820, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6953125, + "step": 761, + "time_per_iteration": 2.4831535816192627 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.0107061, + "balance_loss_clip": 1.04245305, + "balance_loss_mlp": 1.06961203, + "epoch": 0.04581391853299264, + "flos": 23255786338560.0, + "grad_norm": 1.7986428347309282, + "language_loss": 0.79732436, + "learning_rate": 3.997377677828266e-06, + "loss": 0.82041955, + "num_input_tokens_seen": 16355290, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6953125, + "step": 762, + "time_per_iteration": 2.52453875541687 + }, + { + "auxiliary_loss_clip": 0.01117014, + "auxiliary_loss_mlp": 0.01036706, + "balance_loss_clip": 1.03112733, + "balance_loss_mlp": 1.03455913, + "epoch": 0.0458740417856606, + "flos": 64231155601920.0, + "grad_norm": 1.008821564963746, + "language_loss": 0.58659625, + "learning_rate": 3.9973577027034585e-06, + "loss": 0.60813344, + "num_input_tokens_seen": 16415995, + "router_z_loss_clip": 0.0559082, + "router_z_loss_mlp": 0.82421875, + "step": 763, + "time_per_iteration": 3.1429429054260254 + }, + { + "auxiliary_loss_clip": 0.01245459, + "auxiliary_loss_mlp": 0.01081866, + "balance_loss_clip": 1.05381632, + "balance_loss_mlp": 1.07288039, + "epoch": 0.045934165038328575, + "flos": 20770121272320.0, + "grad_norm": 2.8717486924500517, + "language_loss": 0.87752867, + "learning_rate": 3.9973376518386475e-06, + "loss": 0.9008019, + "num_input_tokens_seen": 16433120, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.7265625, + "step": 764, + "time_per_iteration": 2.4727554321289062 + }, + { + "auxiliary_loss_clip": 0.01248006, + "auxiliary_loss_mlp": 0.01079864, + "balance_loss_clip": 1.05192137, + "balance_loss_mlp": 1.07565248, + "epoch": 0.04599428829099654, + "flos": 30262891691520.0, + "grad_norm": 1.9426139778845304, + "language_loss": 0.86118066, + "learning_rate": 3.997317525234592e-06, + "loss": 0.88445938, + "num_input_tokens_seen": 16453360, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.71875, + "step": 765, + "time_per_iteration": 2.5370731353759766 + }, + { + "auxiliary_loss_clip": 0.01248646, + "auxiliary_loss_mlp": 0.01070241, + "balance_loss_clip": 1.03912735, + "balance_loss_mlp": 1.07336497, + "epoch": 0.04605441154366451, + "flos": 23038921975680.0, + "grad_norm": 3.0624701923152453, + "language_loss": 0.87846982, + "learning_rate": 3.997297322892056e-06, + "loss": 0.90165865, + "num_input_tokens_seen": 16471160, + "router_z_loss_clip": 0.31054688, + "router_z_loss_mlp": 1.75, + "step": 766, + "time_per_iteration": 2.475677013397217 + }, + { + "auxiliary_loss_clip": 0.01239894, + "auxiliary_loss_mlp": 0.01067957, + "balance_loss_clip": 1.03979921, + "balance_loss_mlp": 1.06896472, + "epoch": 0.046114534796332485, + "flos": 22017407091840.0, + "grad_norm": 2.616885530601855, + "language_loss": 0.84314167, + "learning_rate": 3.997277044811806e-06, + "loss": 0.86622024, + "num_input_tokens_seen": 16488940, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 767, + "time_per_iteration": 2.465449810028076 + }, + { + "auxiliary_loss_clip": 0.01245421, + "auxiliary_loss_mlp": 0.01060911, + "balance_loss_clip": 1.03249097, + "balance_loss_mlp": 1.07569289, + "epoch": 0.04617465804900045, + "flos": 29862380067840.0, + "grad_norm": 2.056931367891973, + "language_loss": 0.87013769, + "learning_rate": 3.99725669099461e-06, + "loss": 0.89320099, + "num_input_tokens_seen": 16509505, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.703125, + "step": 768, + "time_per_iteration": 5.441957235336304 + }, + { + "auxiliary_loss_clip": 0.01238542, + "auxiliary_loss_mlp": 0.01069073, + "balance_loss_clip": 1.04184508, + "balance_loss_mlp": 1.06768477, + "epoch": 0.04623478130166842, + "flos": 25630056351360.0, + "grad_norm": 2.1199205591749033, + "language_loss": 0.75022334, + "learning_rate": 3.9972362614412395e-06, + "loss": 0.77329946, + "num_input_tokens_seen": 16528840, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.703125, + "step": 769, + "time_per_iteration": 2.5294675827026367 + }, + { + "auxiliary_loss_clip": 0.01238179, + "auxiliary_loss_mlp": 0.01063477, + "balance_loss_clip": 1.03734684, + "balance_loss_mlp": 1.07084632, + "epoch": 0.04629490455433639, + "flos": 20449080489600.0, + "grad_norm": 1.886534334963383, + "language_loss": 0.86162585, + "learning_rate": 3.997215756152471e-06, + "loss": 0.88464236, + "num_input_tokens_seen": 16548335, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.671875, + "step": 770, + "time_per_iteration": 2.4646449089050293 + }, + { + "auxiliary_loss_clip": 0.01248004, + "auxiliary_loss_mlp": 0.01066015, + "balance_loss_clip": 1.0385015, + "balance_loss_mlp": 1.07160687, + "epoch": 0.04635502780700436, + "flos": 23148736830720.0, + "grad_norm": 2.8625416592988477, + "language_loss": 0.87259042, + "learning_rate": 3.99719517512908e-06, + "loss": 0.89573061, + "num_input_tokens_seen": 16567725, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.765625, + "step": 771, + "time_per_iteration": 2.512622117996216 + }, + { + "auxiliary_loss_clip": 0.01246333, + "auxiliary_loss_mlp": 0.01076832, + "balance_loss_clip": 1.04726815, + "balance_loss_mlp": 1.06911707, + "epoch": 0.04641515105967233, + "flos": 23292020183040.0, + "grad_norm": 2.3640102097360587, + "language_loss": 0.83736801, + "learning_rate": 3.997174518371848e-06, + "loss": 0.86059964, + "num_input_tokens_seen": 16588175, + "router_z_loss_clip": 0.29492188, + "router_z_loss_mlp": 1.7734375, + "step": 772, + "time_per_iteration": 2.509572982788086 + }, + { + "auxiliary_loss_clip": 0.01243608, + "auxiliary_loss_mlp": 0.01064058, + "balance_loss_clip": 1.03721189, + "balance_loss_mlp": 1.07392263, + "epoch": 0.046475274312340296, + "flos": 25115204759040.0, + "grad_norm": 2.3097217333215694, + "language_loss": 0.73399591, + "learning_rate": 3.997153785881557e-06, + "loss": 0.75707257, + "num_input_tokens_seen": 16607735, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 773, + "time_per_iteration": 2.5539331436157227 + }, + { + "auxiliary_loss_clip": 0.01240234, + "auxiliary_loss_mlp": 0.01065536, + "balance_loss_clip": 1.03624654, + "balance_loss_mlp": 1.07288945, + "epoch": 0.04653539756500827, + "flos": 25264916645760.0, + "grad_norm": 2.066531290075925, + "language_loss": 0.78523052, + "learning_rate": 3.997132977658996e-06, + "loss": 0.80828828, + "num_input_tokens_seen": 16627225, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.671875, + "step": 774, + "time_per_iteration": 2.5350210666656494 + }, + { + "auxiliary_loss_clip": 0.01239038, + "auxiliary_loss_mlp": 0.01065848, + "balance_loss_clip": 1.03955007, + "balance_loss_mlp": 1.07101154, + "epoch": 0.046595520817676234, + "flos": 35404150089600.0, + "grad_norm": 2.187480231527322, + "language_loss": 0.73357666, + "learning_rate": 3.997112093704952e-06, + "loss": 0.75662553, + "num_input_tokens_seen": 16647785, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6796875, + "step": 775, + "time_per_iteration": 2.6102981567382812 + }, + { + "auxiliary_loss_clip": 0.01240703, + "auxiliary_loss_mlp": 0.01059246, + "balance_loss_clip": 1.03096998, + "balance_loss_mlp": 1.06996655, + "epoch": 0.046655644070344206, + "flos": 18112516778880.0, + "grad_norm": 1.5904648869830247, + "language_loss": 0.77037287, + "learning_rate": 3.997091134020217e-06, + "loss": 0.79337239, + "num_input_tokens_seen": 16667555, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.703125, + "step": 776, + "time_per_iteration": 2.4713642597198486 + }, + { + "auxiliary_loss_clip": 0.01236202, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03790653, + "balance_loss_mlp": 1.06914115, + "epoch": 0.04671576732301218, + "flos": 29205286617600.0, + "grad_norm": 1.9751950676431418, + "language_loss": 0.70967531, + "learning_rate": 3.997070098605585e-06, + "loss": 0.73267508, + "num_input_tokens_seen": 16686875, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.671875, + "step": 777, + "time_per_iteration": 2.540151596069336 + }, + { + "auxiliary_loss_clip": 0.01242182, + "auxiliary_loss_mlp": 0.01078178, + "balance_loss_clip": 1.04999709, + "balance_loss_mlp": 1.07221043, + "epoch": 0.04677589057568014, + "flos": 30478319510400.0, + "grad_norm": 1.9852588200641685, + "language_loss": 0.76756501, + "learning_rate": 3.997048987461856e-06, + "loss": 0.79076868, + "num_input_tokens_seen": 16706420, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.703125, + "step": 778, + "time_per_iteration": 2.5299642086029053 + }, + { + "auxiliary_loss_clip": 0.01236882, + "auxiliary_loss_mlp": 0.01068399, + "balance_loss_clip": 1.04049253, + "balance_loss_mlp": 1.06948996, + "epoch": 0.046836013828348115, + "flos": 20557674282240.0, + "grad_norm": 2.9364819041983576, + "language_loss": 0.78900939, + "learning_rate": 3.997027800589829e-06, + "loss": 0.81206226, + "num_input_tokens_seen": 16726390, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.671875, + "step": 779, + "time_per_iteration": 2.4999477863311768 + }, + { + "auxiliary_loss_clip": 0.01230899, + "auxiliary_loss_mlp": 0.01065999, + "balance_loss_clip": 1.03997588, + "balance_loss_mlp": 1.06776333, + "epoch": 0.04689613708101608, + "flos": 25447378757760.0, + "grad_norm": 1.7037291099106273, + "language_loss": 0.77051055, + "learning_rate": 3.997006537990308e-06, + "loss": 0.7934795, + "num_input_tokens_seen": 16748965, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 780, + "time_per_iteration": 2.54770565032959 + }, + { + "auxiliary_loss_clip": 0.01235667, + "auxiliary_loss_mlp": 0.01067194, + "balance_loss_clip": 1.04187369, + "balance_loss_mlp": 1.07070863, + "epoch": 0.04695626033368405, + "flos": 23001395241600.0, + "grad_norm": 2.6789342331958745, + "language_loss": 0.76432645, + "learning_rate": 3.996985199664099e-06, + "loss": 0.78735507, + "num_input_tokens_seen": 16768620, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6484375, + "step": 781, + "time_per_iteration": 2.5040361881256104 + }, + { + "auxiliary_loss_clip": 0.01245917, + "auxiliary_loss_mlp": 0.01072818, + "balance_loss_clip": 1.04468417, + "balance_loss_mlp": 1.07423282, + "epoch": 0.047016383586352024, + "flos": 29133357632640.0, + "grad_norm": 2.2171800145032736, + "language_loss": 0.74027473, + "learning_rate": 3.99696378561201e-06, + "loss": 0.76346207, + "num_input_tokens_seen": 16789755, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.71875, + "step": 782, + "time_per_iteration": 2.528890371322632 + }, + { + "auxiliary_loss_clip": 0.01241991, + "auxiliary_loss_mlp": 0.01060851, + "balance_loss_clip": 1.03549504, + "balance_loss_mlp": 1.07483578, + "epoch": 0.04707650683901999, + "flos": 14976330451200.0, + "grad_norm": 6.219089205177081, + "language_loss": 0.8032757, + "learning_rate": 3.996942295834855e-06, + "loss": 0.82630414, + "num_input_tokens_seen": 16807585, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.671875, + "step": 783, + "time_per_iteration": 2.4866061210632324 + }, + { + "auxiliary_loss_clip": 0.01232605, + "auxiliary_loss_mlp": 0.01059533, + "balance_loss_clip": 1.03417742, + "balance_loss_mlp": 1.07062817, + "epoch": 0.04713663009168796, + "flos": 21651118151040.0, + "grad_norm": 2.0172272756643816, + "language_loss": 0.81289953, + "learning_rate": 3.996920730333448e-06, + "loss": 0.83582091, + "num_input_tokens_seen": 16827220, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 784, + "time_per_iteration": 2.476659059524536 + }, + { + "auxiliary_loss_clip": 0.01238913, + "auxiliary_loss_mlp": 0.01072248, + "balance_loss_clip": 1.04597473, + "balance_loss_mlp": 1.0683856, + "epoch": 0.04719675334435593, + "flos": 21325408600320.0, + "grad_norm": 2.171254656371271, + "language_loss": 0.8076694, + "learning_rate": 3.996899089108607e-06, + "loss": 0.83078098, + "num_input_tokens_seen": 16846230, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 785, + "time_per_iteration": 2.493598461151123 + }, + { + "auxiliary_loss_clip": 0.01241548, + "auxiliary_loss_mlp": 0.01061941, + "balance_loss_clip": 1.03752661, + "balance_loss_mlp": 1.0762614, + "epoch": 0.0472568765970239, + "flos": 17931383470080.0, + "grad_norm": 2.444819858404617, + "language_loss": 0.89981294, + "learning_rate": 3.996877372161152e-06, + "loss": 0.92284781, + "num_input_tokens_seen": 16865325, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.65625, + "step": 786, + "time_per_iteration": 2.4573311805725098 + }, + { + "auxiliary_loss_clip": 0.012413, + "auxiliary_loss_mlp": 0.01070001, + "balance_loss_clip": 1.04055619, + "balance_loss_mlp": 1.06742501, + "epoch": 0.04731699984969187, + "flos": 18077324428800.0, + "grad_norm": 3.379381752409287, + "language_loss": 0.76639462, + "learning_rate": 3.9968555794919065e-06, + "loss": 0.78950763, + "num_input_tokens_seen": 16882930, + "router_z_loss_clip": 0.29296875, + "router_z_loss_mlp": 1.734375, + "step": 787, + "time_per_iteration": 2.447611093521118 + }, + { + "auxiliary_loss_clip": 0.01247236, + "auxiliary_loss_mlp": 0.01071736, + "balance_loss_clip": 1.04431772, + "balance_loss_mlp": 1.0765723, + "epoch": 0.047377123102359836, + "flos": 23185078416000.0, + "grad_norm": 2.4642209511959403, + "language_loss": 0.80851126, + "learning_rate": 3.996833711101698e-06, + "loss": 0.83170098, + "num_input_tokens_seen": 16900710, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.7109375, + "step": 788, + "time_per_iteration": 2.4679956436157227 + }, + { + "auxiliary_loss_clip": 0.01236983, + "auxiliary_loss_mlp": 0.01074337, + "balance_loss_clip": 1.04551244, + "balance_loss_mlp": 1.07285857, + "epoch": 0.04743724635502781, + "flos": 22747794243840.0, + "grad_norm": 2.2318634793178127, + "language_loss": 0.84819949, + "learning_rate": 3.996811766991355e-06, + "loss": 0.87131274, + "num_input_tokens_seen": 16919210, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.640625, + "step": 789, + "time_per_iteration": 2.4982516765594482 + }, + { + "auxiliary_loss_clip": 0.01242053, + "auxiliary_loss_mlp": 0.01066276, + "balance_loss_clip": 1.04006219, + "balance_loss_mlp": 1.07367456, + "epoch": 0.04749736960769577, + "flos": 17238702620160.0, + "grad_norm": 1.948517450129577, + "language_loss": 0.82196069, + "learning_rate": 3.996789747161709e-06, + "loss": 0.84504396, + "num_input_tokens_seen": 16937125, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6875, + "step": 790, + "time_per_iteration": 2.4380602836608887 + }, + { + "auxiliary_loss_clip": 0.01236299, + "auxiliary_loss_mlp": 0.01062348, + "balance_loss_clip": 1.03524029, + "balance_loss_mlp": 1.06857598, + "epoch": 0.047557492860363745, + "flos": 40479261592320.0, + "grad_norm": 2.8806939749630054, + "language_loss": 0.88245451, + "learning_rate": 3.996767651613597e-06, + "loss": 0.90544093, + "num_input_tokens_seen": 16958610, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 791, + "time_per_iteration": 2.6723573207855225 + }, + { + "auxiliary_loss_clip": 0.01239952, + "auxiliary_loss_mlp": 0.010655, + "balance_loss_clip": 1.03826034, + "balance_loss_mlp": 1.07212687, + "epoch": 0.04761761611303172, + "flos": 18698004466560.0, + "grad_norm": 2.2584516419561464, + "language_loss": 0.90245461, + "learning_rate": 3.996745480347854e-06, + "loss": 0.92550921, + "num_input_tokens_seen": 16977300, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 792, + "time_per_iteration": 2.4627771377563477 + }, + { + "auxiliary_loss_clip": 0.01241845, + "auxiliary_loss_mlp": 0.01074856, + "balance_loss_clip": 1.04874945, + "balance_loss_mlp": 1.07157969, + "epoch": 0.04767773936569968, + "flos": 20921987975040.0, + "grad_norm": 1.9386484459236437, + "language_loss": 0.7310667, + "learning_rate": 3.996723233365324e-06, + "loss": 0.75423372, + "num_input_tokens_seen": 16994950, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.703125, + "step": 793, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0124198, + "auxiliary_loss_mlp": 0.01067209, + "balance_loss_clip": 1.03969526, + "balance_loss_mlp": 1.07207203, + "epoch": 0.047737862618367655, + "flos": 23732680233600.0, + "grad_norm": 2.0117940746735123, + "language_loss": 0.86102074, + "learning_rate": 3.996700910666847e-06, + "loss": 0.88411266, + "num_input_tokens_seen": 17014760, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.703125, + "step": 794, + "time_per_iteration": 2.510611057281494 + }, + { + "auxiliary_loss_clip": 0.0123999, + "auxiliary_loss_mlp": 0.01074174, + "balance_loss_clip": 1.04701805, + "balance_loss_mlp": 1.06925917, + "epoch": 0.04779798587103562, + "flos": 23695764030720.0, + "grad_norm": 3.4118642482115384, + "language_loss": 0.69812739, + "learning_rate": 3.996678512253272e-06, + "loss": 0.72126907, + "num_input_tokens_seen": 17032715, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.7109375, + "step": 795, + "time_per_iteration": 2.500420093536377 + }, + { + "auxiliary_loss_clip": 0.01236981, + "auxiliary_loss_mlp": 0.01070364, + "balance_loss_clip": 1.0432204, + "balance_loss_mlp": 1.06999111, + "epoch": 0.04785810912370359, + "flos": 23183641872000.0, + "grad_norm": 2.0479238599532135, + "language_loss": 0.81053579, + "learning_rate": 3.996656038125449e-06, + "loss": 0.83360916, + "num_input_tokens_seen": 17052215, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.671875, + "step": 796, + "time_per_iteration": 2.4838409423828125 + }, + { + "auxiliary_loss_clip": 0.0124002, + "auxiliary_loss_mlp": 0.01058331, + "balance_loss_clip": 1.03129458, + "balance_loss_mlp": 1.07190371, + "epoch": 0.047918232376371564, + "flos": 18040623707520.0, + "grad_norm": 2.3456590334750858, + "language_loss": 0.81249642, + "learning_rate": 3.996633488284228e-06, + "loss": 0.83547997, + "num_input_tokens_seen": 17069225, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6796875, + "step": 797, + "time_per_iteration": 2.466343402862549 + }, + { + "auxiliary_loss_clip": 0.01122032, + "auxiliary_loss_mlp": 0.0100279, + "balance_loss_clip": 0.9972828, + "balance_loss_mlp": 1.03672731, + "epoch": 0.04797835562903953, + "flos": 62442588758400.0, + "grad_norm": 0.9120921080635288, + "language_loss": 0.64447635, + "learning_rate": 3.996610862730465e-06, + "loss": 0.66572458, + "num_input_tokens_seen": 17126680, + "router_z_loss_clip": 0.05517578, + "router_z_loss_mlp": 0.8515625, + "step": 798, + "time_per_iteration": 3.0081863403320312 + }, + { + "auxiliary_loss_clip": 0.01243937, + "auxiliary_loss_mlp": 0.01070197, + "balance_loss_clip": 1.04285014, + "balance_loss_mlp": 1.06894708, + "epoch": 0.0480384788817075, + "flos": 21507296094720.0, + "grad_norm": 7.0153313624744005, + "language_loss": 0.90794134, + "learning_rate": 3.996588161465018e-06, + "loss": 0.93108267, + "num_input_tokens_seen": 17144835, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.75, + "step": 799, + "time_per_iteration": 2.4872424602508545 + }, + { + "auxiliary_loss_clip": 0.01242621, + "auxiliary_loss_mlp": 0.01069655, + "balance_loss_clip": 1.04220068, + "balance_loss_mlp": 1.07567, + "epoch": 0.048098602134375466, + "flos": 21726710323200.0, + "grad_norm": 2.1467314479540818, + "language_loss": 0.86701, + "learning_rate": 3.996565384488748e-06, + "loss": 0.89013278, + "num_input_tokens_seen": 17165030, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.671875, + "step": 800, + "time_per_iteration": 2.477720022201538 + }, + { + "auxiliary_loss_clip": 0.01243518, + "auxiliary_loss_mlp": 0.0106979, + "balance_loss_clip": 1.04362369, + "balance_loss_mlp": 1.07207572, + "epoch": 0.04815872538704344, + "flos": 22931082368640.0, + "grad_norm": 7.517902152046504, + "language_loss": 0.84513009, + "learning_rate": 3.996542531802518e-06, + "loss": 0.86826313, + "num_input_tokens_seen": 17184895, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.71875, + "step": 801, + "time_per_iteration": 2.487889528274536 + }, + { + "auxiliary_loss_clip": 0.01242116, + "auxiliary_loss_mlp": 0.01071409, + "balance_loss_clip": 1.04470587, + "balance_loss_mlp": 1.07289147, + "epoch": 0.04821884863971141, + "flos": 43174716042240.0, + "grad_norm": 1.97564705550146, + "language_loss": 0.79967415, + "learning_rate": 3.996519603407196e-06, + "loss": 0.82280934, + "num_input_tokens_seen": 17208225, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6953125, + "step": 802, + "time_per_iteration": 2.6496224403381348 + }, + { + "auxiliary_loss_clip": 0.01238875, + "auxiliary_loss_mlp": 0.01065547, + "balance_loss_clip": 1.03963101, + "balance_loss_mlp": 1.07270598, + "epoch": 0.048278971892379376, + "flos": 18620006083200.0, + "grad_norm": 2.8331626885697725, + "language_loss": 0.86420751, + "learning_rate": 3.996496599303649e-06, + "loss": 0.88725173, + "num_input_tokens_seen": 17226305, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 803, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01238315, + "auxiliary_loss_mlp": 0.01061166, + "balance_loss_clip": 1.0346303, + "balance_loss_mlp": 1.07398677, + "epoch": 0.04833909514504735, + "flos": 20230061310720.0, + "grad_norm": 2.229653749186784, + "language_loss": 0.85436332, + "learning_rate": 3.996473519492753e-06, + "loss": 0.87735808, + "num_input_tokens_seen": 17244545, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 804, + "time_per_iteration": 2.458303213119507 + }, + { + "auxiliary_loss_clip": 0.01239413, + "auxiliary_loss_mlp": 0.01066878, + "balance_loss_clip": 1.04099822, + "balance_loss_mlp": 1.07286024, + "epoch": 0.04839921839771532, + "flos": 24645170361600.0, + "grad_norm": 2.2509331098011645, + "language_loss": 0.86119306, + "learning_rate": 3.99645036397538e-06, + "loss": 0.88425595, + "num_input_tokens_seen": 17265730, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6640625, + "step": 805, + "time_per_iteration": 2.5021419525146484 + }, + { + "auxiliary_loss_clip": 0.01235031, + "auxiliary_loss_mlp": 0.01067273, + "balance_loss_clip": 1.04115391, + "balance_loss_mlp": 1.06942892, + "epoch": 0.048459341650383285, + "flos": 24827452905600.0, + "grad_norm": 1.8866019303880346, + "language_loss": 0.68034315, + "learning_rate": 3.9964271327524085e-06, + "loss": 0.70336622, + "num_input_tokens_seen": 17284820, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.65625, + "step": 806, + "time_per_iteration": 2.4904568195343018 + }, + { + "auxiliary_loss_clip": 0.01235579, + "auxiliary_loss_mlp": 0.01064526, + "balance_loss_clip": 1.03847933, + "balance_loss_mlp": 1.07208037, + "epoch": 0.04851946490305126, + "flos": 22163204396160.0, + "grad_norm": 2.221107161276338, + "language_loss": 0.7716608, + "learning_rate": 3.9964038258247214e-06, + "loss": 0.79466188, + "num_input_tokens_seen": 17305085, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 807, + "time_per_iteration": 2.498624563217163 + }, + { + "auxiliary_loss_clip": 0.01232532, + "auxiliary_loss_mlp": 0.01071643, + "balance_loss_clip": 1.04567873, + "balance_loss_mlp": 1.06831741, + "epoch": 0.04857958815571922, + "flos": 19792022952960.0, + "grad_norm": 2.844770488216335, + "language_loss": 0.86509991, + "learning_rate": 3.9963804431932005e-06, + "loss": 0.88814163, + "num_input_tokens_seen": 17322715, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.640625, + "step": 808, + "time_per_iteration": 2.444673538208008 + }, + { + "auxiliary_loss_clip": 0.01242847, + "auxiliary_loss_mlp": 0.01070908, + "balance_loss_clip": 1.04441929, + "balance_loss_mlp": 1.07261682, + "epoch": 0.048639711408387194, + "flos": 18697968552960.0, + "grad_norm": 1.9428867449931826, + "language_loss": 0.90154302, + "learning_rate": 3.996356984858732e-06, + "loss": 0.92468053, + "num_input_tokens_seen": 17341455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.703125, + "step": 809, + "time_per_iteration": 5.353702545166016 + }, + { + "auxiliary_loss_clip": 0.01242102, + "auxiliary_loss_mlp": 0.01070004, + "balance_loss_clip": 1.0432415, + "balance_loss_mlp": 1.07577538, + "epoch": 0.048699834661055166, + "flos": 24863507182080.0, + "grad_norm": 2.12821080633451, + "language_loss": 0.84360719, + "learning_rate": 3.996333450822208e-06, + "loss": 0.86672825, + "num_input_tokens_seen": 17360765, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 810, + "time_per_iteration": 3.8935022354125977 + }, + { + "auxiliary_loss_clip": 0.01240735, + "auxiliary_loss_mlp": 0.01066961, + "balance_loss_clip": 1.04049659, + "balance_loss_mlp": 1.07189715, + "epoch": 0.04875995791372313, + "flos": 20704010290560.0, + "grad_norm": 1.7610993085905569, + "language_loss": 0.80875039, + "learning_rate": 3.99630984108452e-06, + "loss": 0.8318274, + "num_input_tokens_seen": 17380625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6875, + "step": 811, + "time_per_iteration": 2.5000643730163574 + }, + { + "auxiliary_loss_clip": 0.01232044, + "auxiliary_loss_mlp": 0.0107527, + "balance_loss_clip": 1.04991412, + "balance_loss_mlp": 1.06997907, + "epoch": 0.048820081166391104, + "flos": 18588297352320.0, + "grad_norm": 2.0417171226218715, + "language_loss": 0.74768531, + "learning_rate": 3.9962861556465615e-06, + "loss": 0.77075845, + "num_input_tokens_seen": 17399355, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.625, + "step": 812, + "time_per_iteration": 2.4853179454803467 + }, + { + "auxiliary_loss_clip": 0.01233917, + "auxiliary_loss_mlp": 0.01074517, + "balance_loss_clip": 1.04924428, + "balance_loss_mlp": 1.07263327, + "epoch": 0.04888020441905907, + "flos": 22707322594560.0, + "grad_norm": 1.8904091966919716, + "language_loss": 0.89845109, + "learning_rate": 3.996262394509233e-06, + "loss": 0.92153537, + "num_input_tokens_seen": 17418240, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 813, + "time_per_iteration": 2.6731016635894775 + }, + { + "auxiliary_loss_clip": 0.01232344, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03429866, + "balance_loss_mlp": 1.07083082, + "epoch": 0.04894032767172704, + "flos": 22784351310720.0, + "grad_norm": 2.028357820963791, + "language_loss": 0.74551463, + "learning_rate": 3.9962385576734335e-06, + "loss": 0.76842451, + "num_input_tokens_seen": 17436250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.6171875, + "step": 814, + "time_per_iteration": 2.509963035583496 + }, + { + "auxiliary_loss_clip": 0.01235531, + "auxiliary_loss_mlp": 0.01067085, + "balance_loss_clip": 1.04074001, + "balance_loss_mlp": 1.07073569, + "epoch": 0.04900045092439501, + "flos": 25516147345920.0, + "grad_norm": 2.3605733083261464, + "language_loss": 0.83740532, + "learning_rate": 3.9962146451400675e-06, + "loss": 0.86043149, + "num_input_tokens_seen": 17455750, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6484375, + "step": 815, + "time_per_iteration": 2.5490894317626953 + }, + { + "auxiliary_loss_clip": 0.01239776, + "auxiliary_loss_mlp": 0.0106033, + "balance_loss_clip": 1.03396082, + "balance_loss_mlp": 1.07326484, + "epoch": 0.04906057417706298, + "flos": 25958136199680.0, + "grad_norm": 2.271155414035229, + "language_loss": 0.90803105, + "learning_rate": 3.996190656910043e-06, + "loss": 0.93103218, + "num_input_tokens_seen": 17474995, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6640625, + "step": 816, + "time_per_iteration": 2.5273053646087646 + }, + { + "auxiliary_loss_clip": 0.01240454, + "auxiliary_loss_mlp": 0.01060699, + "balance_loss_clip": 1.03410304, + "balance_loss_mlp": 1.0732162, + "epoch": 0.04912069742973095, + "flos": 18624638937600.0, + "grad_norm": 3.2321750342473603, + "language_loss": 0.79924619, + "learning_rate": 3.996166592984268e-06, + "loss": 0.82225776, + "num_input_tokens_seen": 17493395, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.671875, + "step": 817, + "time_per_iteration": 2.5095019340515137 + }, + { + "auxiliary_loss_clip": 0.0123455, + "auxiliary_loss_mlp": 0.01074727, + "balance_loss_clip": 1.04864395, + "balance_loss_mlp": 1.07184172, + "epoch": 0.049180820682398915, + "flos": 23699786353920.0, + "grad_norm": 1.8264850687392937, + "language_loss": 0.84520394, + "learning_rate": 3.996142453363656e-06, + "loss": 0.86829674, + "num_input_tokens_seen": 17514565, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 818, + "time_per_iteration": 2.5476157665252686 + }, + { + "auxiliary_loss_clip": 0.01243386, + "auxiliary_loss_mlp": 0.01067455, + "balance_loss_clip": 1.04041791, + "balance_loss_mlp": 1.07401037, + "epoch": 0.04924094393506689, + "flos": 22420396753920.0, + "grad_norm": 2.779535734169796, + "language_loss": 0.75307131, + "learning_rate": 3.996118238049124e-06, + "loss": 0.77617967, + "num_input_tokens_seen": 17534590, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6953125, + "step": 819, + "time_per_iteration": 2.5486624240875244 + }, + { + "auxiliary_loss_clip": 0.01239669, + "auxiliary_loss_mlp": 0.01061583, + "balance_loss_clip": 1.03858793, + "balance_loss_mlp": 1.07577193, + "epoch": 0.04930106718773486, + "flos": 15738246766080.0, + "grad_norm": 2.1475545017813853, + "language_loss": 0.85166955, + "learning_rate": 3.996093947041586e-06, + "loss": 0.87468207, + "num_input_tokens_seen": 17551900, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.640625, + "step": 820, + "time_per_iteration": 2.4565298557281494 + }, + { + "auxiliary_loss_clip": 0.0123627, + "auxiliary_loss_mlp": 0.01061843, + "balance_loss_clip": 1.03602266, + "balance_loss_mlp": 1.07061315, + "epoch": 0.049361190440402825, + "flos": 26250628648320.0, + "grad_norm": 1.902695357085614, + "language_loss": 0.9041872, + "learning_rate": 3.996069580341966e-06, + "loss": 0.92716837, + "num_input_tokens_seen": 17571485, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.65625, + "step": 821, + "time_per_iteration": 2.5412514209747314 + }, + { + "auxiliary_loss_clip": 0.01233424, + "auxiliary_loss_mlp": 0.01073041, + "balance_loss_clip": 1.04773307, + "balance_loss_mlp": 1.06951392, + "epoch": 0.0494213136930708, + "flos": 21252366293760.0, + "grad_norm": 2.0531707528144274, + "language_loss": 0.8941884, + "learning_rate": 3.996045137951188e-06, + "loss": 0.91725308, + "num_input_tokens_seen": 17591410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.640625, + "step": 822, + "time_per_iteration": 2.5171031951904297 + }, + { + "auxiliary_loss_clip": 0.01237258, + "auxiliary_loss_mlp": 0.01059943, + "balance_loss_clip": 1.03295374, + "balance_loss_mlp": 1.0742538, + "epoch": 0.04948143694573876, + "flos": 27965506740480.0, + "grad_norm": 2.060390808888412, + "language_loss": 0.67537785, + "learning_rate": 3.996020619870178e-06, + "loss": 0.69834983, + "num_input_tokens_seen": 17612010, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 823, + "time_per_iteration": 2.5744235515594482 + }, + { + "auxiliary_loss_clip": 0.01120581, + "auxiliary_loss_mlp": 0.01008389, + "balance_loss_clip": 1.00323892, + "balance_loss_mlp": 1.04174662, + "epoch": 0.049541560198406734, + "flos": 66180995533440.0, + "grad_norm": 1.3777513990451415, + "language_loss": 0.62206292, + "learning_rate": 3.995996026099866e-06, + "loss": 0.64335263, + "num_input_tokens_seen": 17673430, + "router_z_loss_clip": 0.05151367, + "router_z_loss_mlp": 0.7890625, + "step": 824, + "time_per_iteration": 3.13708758354187 + }, + { + "auxiliary_loss_clip": 0.01240025, + "auxiliary_loss_mlp": 0.01070032, + "balance_loss_clip": 1.0431149, + "balance_loss_mlp": 1.07293963, + "epoch": 0.049601683451074706, + "flos": 22892693708160.0, + "grad_norm": 2.021638376413324, + "language_loss": 0.90364408, + "learning_rate": 3.995971356641185e-06, + "loss": 0.92674464, + "num_input_tokens_seen": 17689545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.671875, + "step": 825, + "time_per_iteration": 2.519487142562866 + }, + { + "auxiliary_loss_clip": 0.01237141, + "auxiliary_loss_mlp": 0.01064311, + "balance_loss_clip": 1.03678548, + "balance_loss_mlp": 1.0713625, + "epoch": 0.04966180670374267, + "flos": 21433643256960.0, + "grad_norm": 23.06748840114486, + "language_loss": 0.66790086, + "learning_rate": 3.9959466114950695e-06, + "loss": 0.69091535, + "num_input_tokens_seen": 17705965, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.65625, + "step": 826, + "time_per_iteration": 2.486837387084961 + }, + { + "auxiliary_loss_clip": 0.01236344, + "auxiliary_loss_mlp": 0.01062091, + "balance_loss_clip": 1.0362581, + "balance_loss_mlp": 1.07166433, + "epoch": 0.04972192995641064, + "flos": 23107367341440.0, + "grad_norm": 5.4656671498779845, + "language_loss": 0.78386623, + "learning_rate": 3.995921790662459e-06, + "loss": 0.80685055, + "num_input_tokens_seen": 17724580, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.640625, + "step": 827, + "time_per_iteration": 2.517092704772949 + }, + { + "auxiliary_loss_clip": 0.0124052, + "auxiliary_loss_mlp": 0.01072288, + "balance_loss_clip": 1.04553795, + "balance_loss_mlp": 1.07333767, + "epoch": 0.04978205320907861, + "flos": 40406147458560.0, + "grad_norm": 2.8940457048653916, + "language_loss": 0.78592682, + "learning_rate": 3.995896894144294e-06, + "loss": 0.80905491, + "num_input_tokens_seen": 17747755, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.671875, + "step": 828, + "time_per_iteration": 2.6536450386047363 + }, + { + "auxiliary_loss_clip": 0.01227721, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03428292, + "balance_loss_mlp": 1.06777728, + "epoch": 0.04984217646174658, + "flos": 25228539146880.0, + "grad_norm": 2.330577425067274, + "language_loss": 0.83493364, + "learning_rate": 3.995871921941519e-06, + "loss": 0.85779881, + "num_input_tokens_seen": 17768550, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 829, + "time_per_iteration": 2.5744268894195557 + }, + { + "auxiliary_loss_clip": 0.01235678, + "auxiliary_loss_mlp": 0.01073434, + "balance_loss_clip": 1.04433525, + "balance_loss_mlp": 1.07021666, + "epoch": 0.04990229971441455, + "flos": 15959636242560.0, + "grad_norm": 2.2375926111489743, + "language_loss": 0.75055873, + "learning_rate": 3.99584687405508e-06, + "loss": 0.77364987, + "num_input_tokens_seen": 17786080, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.65625, + "step": 830, + "time_per_iteration": 2.5045461654663086 + }, + { + "auxiliary_loss_clip": 0.01233457, + "auxiliary_loss_mlp": 0.01065961, + "balance_loss_clip": 1.03935385, + "balance_loss_mlp": 1.06966341, + "epoch": 0.04996242296708252, + "flos": 18405116968320.0, + "grad_norm": 1.962979792887244, + "language_loss": 0.79379636, + "learning_rate": 3.995821750485929e-06, + "loss": 0.81679052, + "num_input_tokens_seen": 17803635, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.640625, + "step": 831, + "time_per_iteration": 2.5924267768859863 + }, + { + "auxiliary_loss_clip": 0.01237676, + "auxiliary_loss_mlp": 0.01070014, + "balance_loss_clip": 1.04487276, + "balance_loss_mlp": 1.07213569, + "epoch": 0.05002254621975049, + "flos": 17858053854720.0, + "grad_norm": 2.758266217871517, + "language_loss": 0.91538632, + "learning_rate": 3.995796551235016e-06, + "loss": 0.93846321, + "num_input_tokens_seen": 17822190, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.65625, + "step": 832, + "time_per_iteration": 2.653150796890259 + }, + { + "auxiliary_loss_clip": 0.01230534, + "auxiliary_loss_mlp": 0.01081981, + "balance_loss_clip": 1.05747163, + "balance_loss_mlp": 1.07053018, + "epoch": 0.050082669472418455, + "flos": 45660273367680.0, + "grad_norm": 1.9700093948003867, + "language_loss": 0.83139837, + "learning_rate": 3.9957712763032974e-06, + "loss": 0.85452354, + "num_input_tokens_seen": 17846915, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.6015625, + "step": 833, + "time_per_iteration": 2.73848819732666 + }, + { + "auxiliary_loss_clip": 0.0123523, + "auxiliary_loss_mlp": 0.01058285, + "balance_loss_clip": 1.0318923, + "balance_loss_mlp": 1.06913459, + "epoch": 0.05014279272508643, + "flos": 37962067363200.0, + "grad_norm": 2.433665596415918, + "language_loss": 0.8254565, + "learning_rate": 3.995745925691733e-06, + "loss": 0.84839165, + "num_input_tokens_seen": 17867270, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.65625, + "step": 834, + "time_per_iteration": 2.6406352519989014 + }, + { + "auxiliary_loss_clip": 0.01236789, + "auxiliary_loss_mlp": 0.01063828, + "balance_loss_clip": 1.03710127, + "balance_loss_mlp": 1.07138014, + "epoch": 0.0502029159777544, + "flos": 20996179516800.0, + "grad_norm": 2.099554255469436, + "language_loss": 0.91758966, + "learning_rate": 3.995720499401282e-06, + "loss": 0.94059587, + "num_input_tokens_seen": 17884880, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.65625, + "step": 835, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.0123437, + "auxiliary_loss_mlp": 0.01071713, + "balance_loss_clip": 1.04372287, + "balance_loss_mlp": 1.06699944, + "epoch": 0.050263039230422364, + "flos": 15888066393600.0, + "grad_norm": 2.4903656252358735, + "language_loss": 0.76346481, + "learning_rate": 3.995694997432911e-06, + "loss": 0.78652561, + "num_input_tokens_seen": 17903695, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.671875, + "step": 836, + "time_per_iteration": 2.4839258193969727 + }, + { + "auxiliary_loss_clip": 0.01229978, + "auxiliary_loss_mlp": 0.01072782, + "balance_loss_clip": 1.04696083, + "balance_loss_mlp": 1.07100809, + "epoch": 0.050323162483090336, + "flos": 23732752060800.0, + "grad_norm": 2.1380784235063066, + "language_loss": 0.8360337, + "learning_rate": 3.9956694197875855e-06, + "loss": 0.85906136, + "num_input_tokens_seen": 17920745, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5859375, + "step": 837, + "time_per_iteration": 2.5140485763549805 + }, + { + "auxiliary_loss_clip": 0.01233502, + "auxiliary_loss_mlp": 0.01065592, + "balance_loss_clip": 1.0403192, + "balance_loss_mlp": 1.07245386, + "epoch": 0.0503832857357583, + "flos": 20266223328000.0, + "grad_norm": 2.225982034212064, + "language_loss": 0.73137468, + "learning_rate": 3.995643766466275e-06, + "loss": 0.75436556, + "num_input_tokens_seen": 17938220, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 838, + "time_per_iteration": 2.5128419399261475 + }, + { + "auxiliary_loss_clip": 0.01229023, + "auxiliary_loss_mlp": 0.0106788, + "balance_loss_clip": 1.04195237, + "balance_loss_mlp": 1.06636167, + "epoch": 0.05044340898842627, + "flos": 17785011548160.0, + "grad_norm": 1.886796600099776, + "language_loss": 0.83328462, + "learning_rate": 3.995618037469953e-06, + "loss": 0.85625362, + "num_input_tokens_seen": 17957325, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 839, + "time_per_iteration": 2.499415874481201 + }, + { + "auxiliary_loss_clip": 0.01228207, + "auxiliary_loss_mlp": 0.01066651, + "balance_loss_clip": 1.04128349, + "balance_loss_mlp": 1.06866539, + "epoch": 0.050503532241094246, + "flos": 22966526113920.0, + "grad_norm": 2.2056506497336765, + "language_loss": 0.85777193, + "learning_rate": 3.995592232799595e-06, + "loss": 0.8807205, + "num_input_tokens_seen": 17975875, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 840, + "time_per_iteration": 2.522038698196411 + }, + { + "auxiliary_loss_clip": 0.01235877, + "auxiliary_loss_mlp": 0.01063775, + "balance_loss_clip": 1.03691697, + "balance_loss_mlp": 1.07246661, + "epoch": 0.05056365549376221, + "flos": 22776989022720.0, + "grad_norm": 2.034102412822674, + "language_loss": 0.94658732, + "learning_rate": 3.99556635245618e-06, + "loss": 0.96958393, + "num_input_tokens_seen": 17994340, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6328125, + "step": 841, + "time_per_iteration": 2.4996211528778076 + }, + { + "auxiliary_loss_clip": 0.01234454, + "auxiliary_loss_mlp": 0.01066453, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.07130527, + "epoch": 0.05062377874643018, + "flos": 30916968399360.0, + "grad_norm": 2.030819255438432, + "language_loss": 0.77387047, + "learning_rate": 3.995540396440688e-06, + "loss": 0.79687953, + "num_input_tokens_seen": 18015260, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.6328125, + "step": 842, + "time_per_iteration": 2.6253628730773926 + }, + { + "auxiliary_loss_clip": 0.01238804, + "auxiliary_loss_mlp": 0.01067813, + "balance_loss_clip": 1.041659, + "balance_loss_mlp": 1.07278991, + "epoch": 0.05068390199909815, + "flos": 19647159402240.0, + "grad_norm": 2.283727909175907, + "language_loss": 0.78014457, + "learning_rate": 3.995514364754105e-06, + "loss": 0.80321074, + "num_input_tokens_seen": 18033960, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.6640625, + "step": 843, + "time_per_iteration": 2.5158324241638184 + }, + { + "auxiliary_loss_clip": 0.01237695, + "auxiliary_loss_mlp": 0.01061566, + "balance_loss_clip": 1.036461, + "balance_loss_mlp": 1.07266212, + "epoch": 0.05074402525176612, + "flos": 37962103276800.0, + "grad_norm": 2.249210505837228, + "language_loss": 0.82952344, + "learning_rate": 3.995488257397417e-06, + "loss": 0.85251611, + "num_input_tokens_seen": 18056700, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.6484375, + "step": 844, + "time_per_iteration": 2.6476500034332275 + }, + { + "auxiliary_loss_clip": 0.01229818, + "auxiliary_loss_mlp": 0.0106479, + "balance_loss_clip": 1.03935087, + "balance_loss_mlp": 1.06871867, + "epoch": 0.05080414850443409, + "flos": 22054610603520.0, + "grad_norm": 2.3236550986537368, + "language_loss": 0.76042783, + "learning_rate": 3.995462074371614e-06, + "loss": 0.78337395, + "num_input_tokens_seen": 18075815, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.609375, + "step": 845, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01229682, + "auxiliary_loss_mlp": 0.01075672, + "balance_loss_clip": 1.04924285, + "balance_loss_mlp": 1.06694174, + "epoch": 0.05086427175710206, + "flos": 20225787592320.0, + "grad_norm": 2.2528566199281905, + "language_loss": 0.87468004, + "learning_rate": 3.99543581567769e-06, + "loss": 0.89773357, + "num_input_tokens_seen": 18095095, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 846, + "time_per_iteration": 2.5271859169006348 + }, + { + "auxiliary_loss_clip": 0.01230653, + "auxiliary_loss_mlp": 0.01070334, + "balance_loss_clip": 1.04521692, + "balance_loss_mlp": 1.06982791, + "epoch": 0.05092439500977003, + "flos": 15159223526400.0, + "grad_norm": 1.95159927266484, + "language_loss": 0.87571466, + "learning_rate": 3.9954094813166394e-06, + "loss": 0.89872456, + "num_input_tokens_seen": 18112675, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.609375, + "step": 847, + "time_per_iteration": 2.4566030502319336 + }, + { + "auxiliary_loss_clip": 0.01226009, + "auxiliary_loss_mlp": 0.01071018, + "balance_loss_clip": 1.04489946, + "balance_loss_mlp": 1.06883907, + "epoch": 0.050984518262437994, + "flos": 22055149307520.0, + "grad_norm": 2.141846591022022, + "language_loss": 0.81706643, + "learning_rate": 3.995383071289462e-06, + "loss": 0.84003675, + "num_input_tokens_seen": 18130745, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5703125, + "step": 848, + "time_per_iteration": 2.4695050716400146 + }, + { + "auxiliary_loss_clip": 0.0123182, + "auxiliary_loss_mlp": 0.01077851, + "balance_loss_clip": 1.0522449, + "balance_loss_mlp": 1.07167053, + "epoch": 0.05104464151510597, + "flos": 30225329043840.0, + "grad_norm": 1.898868752622741, + "language_loss": 0.87266076, + "learning_rate": 3.995356585597158e-06, + "loss": 0.89575738, + "num_input_tokens_seen": 18152410, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 849, + "time_per_iteration": 2.5472936630249023 + }, + { + "auxiliary_loss_clip": 0.0122487, + "auxiliary_loss_mlp": 0.01062562, + "balance_loss_clip": 1.03743291, + "balance_loss_mlp": 1.06569946, + "epoch": 0.05110476476777394, + "flos": 18332900674560.0, + "grad_norm": 1.8637209623848903, + "language_loss": 0.83340889, + "learning_rate": 3.995330024240732e-06, + "loss": 0.85628319, + "num_input_tokens_seen": 18170870, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.59375, + "step": 850, + "time_per_iteration": 2.493814468383789 + }, + { + "auxiliary_loss_clip": 0.01229016, + "auxiliary_loss_mlp": 0.01064236, + "balance_loss_clip": 1.03847528, + "balance_loss_mlp": 1.06816506, + "epoch": 0.051164888020441904, + "flos": 37998732170880.0, + "grad_norm": 2.1400408414194154, + "language_loss": 0.6501807, + "learning_rate": 3.995303387221192e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 18191555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.609375, + "step": 851, + "time_per_iteration": 5.443026065826416 + }, + { + "auxiliary_loss_clip": 0.01228781, + "auxiliary_loss_mlp": 0.01071453, + "balance_loss_clip": 1.04424942, + "balance_loss_mlp": 1.0674876, + "epoch": 0.051225011273109876, + "flos": 23038634666880.0, + "grad_norm": 2.2562645326336686, + "language_loss": 0.8376134, + "learning_rate": 3.995276674539547e-06, + "loss": 0.86061573, + "num_input_tokens_seen": 18208620, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 852, + "time_per_iteration": 2.4753623008728027 + }, + { + "auxiliary_loss_clip": 0.01231223, + "auxiliary_loss_mlp": 0.01068594, + "balance_loss_clip": 1.04190326, + "balance_loss_mlp": 1.06879044, + "epoch": 0.05128513452577785, + "flos": 18259822454400.0, + "grad_norm": 1.9405819970113303, + "language_loss": 0.80252314, + "learning_rate": 3.995249886196811e-06, + "loss": 0.82552135, + "num_input_tokens_seen": 18226370, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.625, + "step": 853, + "time_per_iteration": 2.5048112869262695 + }, + { + "auxiliary_loss_clip": 0.01226539, + "auxiliary_loss_mlp": 0.01060743, + "balance_loss_clip": 1.03432584, + "balance_loss_mlp": 1.06710184, + "epoch": 0.05134525777844581, + "flos": 27198957571200.0, + "grad_norm": 1.8237562231360178, + "language_loss": 0.75846469, + "learning_rate": 3.995223022193999e-06, + "loss": 0.7813375, + "num_input_tokens_seen": 18247075, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 854, + "time_per_iteration": 2.53165602684021 + }, + { + "auxiliary_loss_clip": 0.01233418, + "auxiliary_loss_mlp": 0.0106357, + "balance_loss_clip": 1.03678393, + "balance_loss_mlp": 1.07139039, + "epoch": 0.051405381031113785, + "flos": 28362247436160.0, + "grad_norm": 2.718422527893707, + "language_loss": 0.81173462, + "learning_rate": 3.99519608253213e-06, + "loss": 0.83470446, + "num_input_tokens_seen": 18265680, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 855, + "time_per_iteration": 2.5610744953155518 + }, + { + "auxiliary_loss_clip": 0.01113278, + "auxiliary_loss_mlp": 0.01020682, + "balance_loss_clip": 1.01534104, + "balance_loss_mlp": 1.03902698, + "epoch": 0.05146550428378175, + "flos": 65618169327360.0, + "grad_norm": 1.0071030268205712, + "language_loss": 0.65609074, + "learning_rate": 3.995169067212227e-06, + "loss": 0.67743033, + "num_input_tokens_seen": 18327015, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.7421875, + "step": 856, + "time_per_iteration": 3.0546581745147705 + }, + { + "auxiliary_loss_clip": 0.01224884, + "auxiliary_loss_mlp": 0.01053813, + "balance_loss_clip": 1.02823043, + "balance_loss_mlp": 1.06811357, + "epoch": 0.05152562753644972, + "flos": 22054861998720.0, + "grad_norm": 1.8111088050205955, + "language_loss": 0.76996124, + "learning_rate": 3.9951419762353116e-06, + "loss": 0.79274821, + "num_input_tokens_seen": 18345235, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5625, + "step": 857, + "time_per_iteration": 2.6051554679870605 + }, + { + "auxiliary_loss_clip": 0.01229705, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03130889, + "balance_loss_mlp": 1.06846082, + "epoch": 0.051585750789117694, + "flos": 18509544783360.0, + "grad_norm": 3.7937823779894377, + "language_loss": 0.88893878, + "learning_rate": 3.995114809602412e-06, + "loss": 0.91181171, + "num_input_tokens_seen": 18362350, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.6171875, + "step": 858, + "time_per_iteration": 2.4517769813537598 + }, + { + "auxiliary_loss_clip": 0.01228685, + "auxiliary_loss_mlp": 0.01056497, + "balance_loss_clip": 1.03000832, + "balance_loss_mlp": 1.06902003, + "epoch": 0.05164587404178566, + "flos": 23730238108800.0, + "grad_norm": 1.9531750101692102, + "language_loss": 0.75199753, + "learning_rate": 3.9950875673145605e-06, + "loss": 0.77484941, + "num_input_tokens_seen": 18383390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.59375, + "step": 859, + "time_per_iteration": 2.5090014934539795 + }, + { + "auxiliary_loss_clip": 0.01237239, + "auxiliary_loss_mlp": 0.01070917, + "balance_loss_clip": 1.04280758, + "balance_loss_mlp": 1.06980002, + "epoch": 0.05170599729445363, + "flos": 16252882876800.0, + "grad_norm": 2.092452223155828, + "language_loss": 0.90812773, + "learning_rate": 3.995060249372788e-06, + "loss": 0.93120927, + "num_input_tokens_seen": 18399220, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.671875, + "step": 860, + "time_per_iteration": 2.437220335006714 + }, + { + "auxiliary_loss_clip": 0.01231057, + "auxiliary_loss_mlp": 0.01060698, + "balance_loss_clip": 1.03568769, + "balance_loss_mlp": 1.0717634, + "epoch": 0.0517661205471216, + "flos": 23985922095360.0, + "grad_norm": 1.9189860758016508, + "language_loss": 0.82252973, + "learning_rate": 3.99503285577813e-06, + "loss": 0.8454473, + "num_input_tokens_seen": 18419005, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.59375, + "step": 861, + "time_per_iteration": 2.50883412361145 + }, + { + "auxiliary_loss_clip": 0.01233216, + "auxiliary_loss_mlp": 0.01057472, + "balance_loss_clip": 1.03177071, + "balance_loss_mlp": 1.0704143, + "epoch": 0.05182624379978957, + "flos": 29277718392960.0, + "grad_norm": 2.0352629197197762, + "language_loss": 0.78607392, + "learning_rate": 3.995005386531627e-06, + "loss": 0.80898082, + "num_input_tokens_seen": 18440550, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.625, + "step": 862, + "time_per_iteration": 2.548757314682007 + }, + { + "auxiliary_loss_clip": 0.01229413, + "auxiliary_loss_mlp": 0.01068334, + "balance_loss_clip": 1.04402709, + "balance_loss_mlp": 1.07291067, + "epoch": 0.05188636705245754, + "flos": 24170826332160.0, + "grad_norm": 1.9841587361763113, + "language_loss": 0.88999134, + "learning_rate": 3.9949778416343195e-06, + "loss": 0.91296881, + "num_input_tokens_seen": 18461950, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5625, + "step": 863, + "time_per_iteration": 2.506289005279541 + }, + { + "auxiliary_loss_clip": 0.01238268, + "auxiliary_loss_mlp": 0.01064282, + "balance_loss_clip": 1.03712606, + "balance_loss_mlp": 1.07635331, + "epoch": 0.051946490305125506, + "flos": 26760703731840.0, + "grad_norm": 2.003999649515418, + "language_loss": 0.7575798, + "learning_rate": 3.9949502210872525e-06, + "loss": 0.78060532, + "num_input_tokens_seen": 18480555, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.625, + "step": 864, + "time_per_iteration": 2.515944480895996 + }, + { + "auxiliary_loss_clip": 0.01236545, + "auxiliary_loss_mlp": 0.01069508, + "balance_loss_clip": 1.04228067, + "balance_loss_mlp": 1.07355332, + "epoch": 0.05200661355779348, + "flos": 21502519585920.0, + "grad_norm": 1.9298630836237705, + "language_loss": 0.7919569, + "learning_rate": 3.994922524891474e-06, + "loss": 0.81501746, + "num_input_tokens_seen": 18499645, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.6328125, + "step": 865, + "time_per_iteration": 2.485499620437622 + }, + { + "auxiliary_loss_clip": 0.0123268, + "auxiliary_loss_mlp": 0.0106684, + "balance_loss_clip": 1.04144871, + "balance_loss_mlp": 1.07079291, + "epoch": 0.05206673681046144, + "flos": 18114492026880.0, + "grad_norm": 2.366131428952597, + "language_loss": 0.85700798, + "learning_rate": 3.994894753048032e-06, + "loss": 0.88000321, + "num_input_tokens_seen": 18516810, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6171875, + "step": 866, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01242589, + "auxiliary_loss_mlp": 0.01065926, + "balance_loss_clip": 1.03910398, + "balance_loss_mlp": 1.0804987, + "epoch": 0.052126860063129415, + "flos": 17524191916800.0, + "grad_norm": 2.535209572965093, + "language_loss": 0.8680315, + "learning_rate": 3.9948669055579815e-06, + "loss": 0.89111662, + "num_input_tokens_seen": 18532510, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.625, + "step": 867, + "time_per_iteration": 2.4644970893859863 + }, + { + "auxiliary_loss_clip": 0.01231644, + "auxiliary_loss_mlp": 0.01073847, + "balance_loss_clip": 1.05021977, + "balance_loss_mlp": 1.07513499, + "epoch": 0.05218698331579739, + "flos": 32598054771840.0, + "grad_norm": 1.64188364663517, + "language_loss": 0.63562089, + "learning_rate": 3.9948389824223785e-06, + "loss": 0.65867579, + "num_input_tokens_seen": 18557380, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.5625, + "step": 868, + "time_per_iteration": 2.567958354949951 + }, + { + "auxiliary_loss_clip": 0.01236968, + "auxiliary_loss_mlp": 0.01065922, + "balance_loss_clip": 1.03753829, + "balance_loss_mlp": 1.07263327, + "epoch": 0.05224710656846535, + "flos": 22127293774080.0, + "grad_norm": 2.1448269109564198, + "language_loss": 0.83076257, + "learning_rate": 3.994810983642281e-06, + "loss": 0.85379148, + "num_input_tokens_seen": 18575720, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.6484375, + "step": 869, + "time_per_iteration": 2.5021841526031494 + }, + { + "auxiliary_loss_clip": 0.01237154, + "auxiliary_loss_mlp": 0.01057742, + "balance_loss_clip": 1.03201652, + "balance_loss_mlp": 1.07245827, + "epoch": 0.052307229821133325, + "flos": 11145092976000.0, + "grad_norm": 2.352948725027126, + "language_loss": 0.87544227, + "learning_rate": 3.994782909218751e-06, + "loss": 0.89839119, + "num_input_tokens_seen": 18592185, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6484375, + "step": 870, + "time_per_iteration": 2.459662437438965 + }, + { + "auxiliary_loss_clip": 0.01238457, + "auxiliary_loss_mlp": 0.01067184, + "balance_loss_clip": 1.04135191, + "balance_loss_mlp": 1.07536197, + "epoch": 0.05236735307380129, + "flos": 19128070005120.0, + "grad_norm": 1.9212028950510787, + "language_loss": 0.80554998, + "learning_rate": 3.994754759152854e-06, + "loss": 0.82860637, + "num_input_tokens_seen": 18609560, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6328125, + "step": 871, + "time_per_iteration": 2.4701170921325684 + }, + { + "auxiliary_loss_clip": 0.01234905, + "auxiliary_loss_mlp": 0.01064695, + "balance_loss_clip": 1.04009032, + "balance_loss_mlp": 1.07576704, + "epoch": 0.05242747632646926, + "flos": 20960663944320.0, + "grad_norm": 1.5975290841395262, + "language_loss": 0.81374049, + "learning_rate": 3.994726533445656e-06, + "loss": 0.8367365, + "num_input_tokens_seen": 18629405, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.59375, + "step": 872, + "time_per_iteration": 2.4886369705200195 + }, + { + "auxiliary_loss_clip": 0.0111147, + "auxiliary_loss_mlp": 0.0101489, + "balance_loss_clip": 1.00952566, + "balance_loss_mlp": 1.03955865, + "epoch": 0.052487599579137234, + "flos": 65020542842880.0, + "grad_norm": 0.8879269166117758, + "language_loss": 0.61579192, + "learning_rate": 3.9946982320982274e-06, + "loss": 0.63705552, + "num_input_tokens_seen": 18681480, + "router_z_loss_clip": 0.05371094, + "router_z_loss_mlp": 0.71875, + "step": 873, + "time_per_iteration": 2.9913430213928223 + }, + { + "auxiliary_loss_clip": 0.01231663, + "auxiliary_loss_mlp": 0.01058247, + "balance_loss_clip": 1.03245032, + "balance_loss_mlp": 1.07107997, + "epoch": 0.0525477228318052, + "flos": 23288859786240.0, + "grad_norm": 1.8426182555123698, + "language_loss": 0.88426232, + "learning_rate": 3.994669855111643e-06, + "loss": 0.90716141, + "num_input_tokens_seen": 18700390, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.6015625, + "step": 874, + "time_per_iteration": 2.4794461727142334 + }, + { + "auxiliary_loss_clip": 0.0123222, + "auxiliary_loss_mlp": 0.01062298, + "balance_loss_clip": 1.03626251, + "balance_loss_mlp": 1.06908488, + "epoch": 0.05260784608447317, + "flos": 32230221546240.0, + "grad_norm": 2.2494767595307628, + "language_loss": 0.74779439, + "learning_rate": 3.994641402486977e-06, + "loss": 0.77073956, + "num_input_tokens_seen": 18721280, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.6328125, + "step": 875, + "time_per_iteration": 2.5768113136291504 + }, + { + "auxiliary_loss_clip": 0.01228414, + "auxiliary_loss_mlp": 0.01061086, + "balance_loss_clip": 1.03412056, + "balance_loss_mlp": 1.06905699, + "epoch": 0.052667969337141136, + "flos": 24463211040000.0, + "grad_norm": 2.052141253618648, + "language_loss": 0.92836702, + "learning_rate": 3.99461287422531e-06, + "loss": 0.951262, + "num_input_tokens_seen": 18741545, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.59375, + "step": 876, + "time_per_iteration": 2.535587787628174 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01009036, + "balance_loss_clip": 1.00379074, + "balance_loss_mlp": 1.03698087, + "epoch": 0.05272809258980911, + "flos": 57784329567360.0, + "grad_norm": 0.854570032578524, + "language_loss": 0.62934959, + "learning_rate": 3.994584270327722e-06, + "loss": 0.6505053, + "num_input_tokens_seen": 18801400, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.6953125, + "step": 877, + "time_per_iteration": 3.094581127166748 + }, + { + "auxiliary_loss_clip": 0.01231545, + "auxiliary_loss_mlp": 0.01069359, + "balance_loss_clip": 1.04174972, + "balance_loss_mlp": 1.06975055, + "epoch": 0.05278821584247708, + "flos": 17420805596160.0, + "grad_norm": 2.154366240232031, + "language_loss": 0.85691291, + "learning_rate": 3.994555590795299e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 18819670, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 878, + "time_per_iteration": 2.5052285194396973 + }, + { + "auxiliary_loss_clip": 0.01232133, + "auxiliary_loss_mlp": 0.01063559, + "balance_loss_clip": 1.03754723, + "balance_loss_mlp": 1.06974411, + "epoch": 0.052848339095145046, + "flos": 26137258346880.0, + "grad_norm": 2.0833089409086942, + "language_loss": 0.82790506, + "learning_rate": 3.9945268356291275e-06, + "loss": 0.85086197, + "num_input_tokens_seen": 18840580, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.625, + "step": 879, + "time_per_iteration": 2.564312219619751 + }, + { + "auxiliary_loss_clip": 0.01227867, + "auxiliary_loss_mlp": 0.01066877, + "balance_loss_clip": 1.04011488, + "balance_loss_mlp": 1.06966615, + "epoch": 0.05290846234781302, + "flos": 16472081623680.0, + "grad_norm": 4.271066320440391, + "language_loss": 0.84404933, + "learning_rate": 3.9944980048302985e-06, + "loss": 0.86699677, + "num_input_tokens_seen": 18859295, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 880, + "time_per_iteration": 2.4854133129119873 + }, + { + "auxiliary_loss_clip": 0.01233797, + "auxiliary_loss_mlp": 0.01069821, + "balance_loss_clip": 1.04360688, + "balance_loss_mlp": 1.07206059, + "epoch": 0.05296858560048098, + "flos": 19865173000320.0, + "grad_norm": 3.515636761469604, + "language_loss": 0.87156737, + "learning_rate": 3.994469098399906e-06, + "loss": 0.89460361, + "num_input_tokens_seen": 18877485, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.625, + "step": 881, + "time_per_iteration": 2.476846933364868 + }, + { + "auxiliary_loss_clip": 0.01228751, + "auxiliary_loss_mlp": 0.01065941, + "balance_loss_clip": 1.03789103, + "balance_loss_mlp": 1.06813371, + "epoch": 0.053028708853148955, + "flos": 24388588535040.0, + "grad_norm": 1.9345214626214409, + "language_loss": 0.87682849, + "learning_rate": 3.994440116339046e-06, + "loss": 0.89977539, + "num_input_tokens_seen": 18898275, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.609375, + "step": 882, + "time_per_iteration": 2.6449031829833984 + }, + { + "auxiliary_loss_clip": 0.01233714, + "auxiliary_loss_mlp": 0.01065669, + "balance_loss_clip": 1.03825057, + "balance_loss_mlp": 1.07030129, + "epoch": 0.05308883210581693, + "flos": 36393166143360.0, + "grad_norm": 2.7245054008776814, + "language_loss": 0.68869275, + "learning_rate": 3.994411058648816e-06, + "loss": 0.71168661, + "num_input_tokens_seen": 18920665, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6328125, + "step": 883, + "time_per_iteration": 2.620363235473633 + }, + { + "auxiliary_loss_clip": 0.01225388, + "auxiliary_loss_mlp": 0.01060527, + "balance_loss_clip": 1.03461075, + "balance_loss_mlp": 1.06937146, + "epoch": 0.05314895535848489, + "flos": 22855095146880.0, + "grad_norm": 1.9628498458506696, + "language_loss": 0.75887203, + "learning_rate": 3.994381925330319e-06, + "loss": 0.78173113, + "num_input_tokens_seen": 18939835, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5546875, + "step": 884, + "time_per_iteration": 2.4948067665100098 + }, + { + "auxiliary_loss_clip": 0.01225729, + "auxiliary_loss_mlp": 0.01063879, + "balance_loss_clip": 1.03870201, + "balance_loss_mlp": 1.06921601, + "epoch": 0.053209078611152864, + "flos": 12860330204160.0, + "grad_norm": 2.00306560312032, + "language_loss": 0.85323638, + "learning_rate": 3.994352716384659e-06, + "loss": 0.87613249, + "num_input_tokens_seen": 18958405, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 885, + "time_per_iteration": 2.5159530639648438 + }, + { + "auxiliary_loss_clip": 0.01228523, + "auxiliary_loss_mlp": 0.01068973, + "balance_loss_clip": 1.04205549, + "balance_loss_mlp": 1.06673646, + "epoch": 0.05326920186382083, + "flos": 12164596698240.0, + "grad_norm": 2.6316893825734344, + "language_loss": 0.85726082, + "learning_rate": 3.994323431812945e-06, + "loss": 0.88023585, + "num_input_tokens_seen": 18975445, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6171875, + "step": 886, + "time_per_iteration": 2.4650700092315674 + }, + { + "auxiliary_loss_clip": 0.01226585, + "auxiliary_loss_mlp": 0.01066459, + "balance_loss_clip": 1.03908896, + "balance_loss_mlp": 1.06944001, + "epoch": 0.0533293251164888, + "flos": 22704485420160.0, + "grad_norm": 2.1517488326805214, + "language_loss": 0.89229804, + "learning_rate": 3.994294071616286e-06, + "loss": 0.91522843, + "num_input_tokens_seen": 18991930, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5703125, + "step": 887, + "time_per_iteration": 2.5020337104797363 + }, + { + "auxiliary_loss_clip": 0.01227687, + "auxiliary_loss_mlp": 0.01070962, + "balance_loss_clip": 1.04270935, + "balance_loss_mlp": 1.06604195, + "epoch": 0.053389448369156774, + "flos": 26940939200640.0, + "grad_norm": 2.2836036404275593, + "language_loss": 0.75076836, + "learning_rate": 3.994264635795796e-06, + "loss": 0.77375484, + "num_input_tokens_seen": 19009790, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.6171875, + "step": 888, + "time_per_iteration": 2.5055694580078125 + }, + { + "auxiliary_loss_clip": 0.0123028, + "auxiliary_loss_mlp": 0.0107639, + "balance_loss_clip": 1.0480895, + "balance_loss_mlp": 1.07113457, + "epoch": 0.05344957162182474, + "flos": 25556331686400.0, + "grad_norm": 2.032914331295681, + "language_loss": 0.88330352, + "learning_rate": 3.994235124352592e-06, + "loss": 0.90637028, + "num_input_tokens_seen": 19030170, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.59375, + "step": 889, + "time_per_iteration": 2.5147650241851807 + }, + { + "auxiliary_loss_clip": 0.01222875, + "auxiliary_loss_mlp": 0.01053174, + "balance_loss_clip": 1.02748489, + "balance_loss_mlp": 1.06732821, + "epoch": 0.05350969487449271, + "flos": 19719591177600.0, + "grad_norm": 1.9726085703824752, + "language_loss": 0.88269985, + "learning_rate": 3.994205537287791e-06, + "loss": 0.90546036, + "num_input_tokens_seen": 19048075, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5546875, + "step": 890, + "time_per_iteration": 2.490300416946411 + }, + { + "auxiliary_loss_clip": 0.01225662, + "auxiliary_loss_mlp": 0.0107145, + "balance_loss_clip": 1.04612982, + "balance_loss_mlp": 1.06690812, + "epoch": 0.053569818127160676, + "flos": 27016351804800.0, + "grad_norm": 2.320271972022273, + "language_loss": 0.93251556, + "learning_rate": 3.994175874602517e-06, + "loss": 0.95548671, + "num_input_tokens_seen": 19067465, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.59375, + "step": 891, + "time_per_iteration": 2.5133957862854004 + }, + { + "auxiliary_loss_clip": 0.01225404, + "auxiliary_loss_mlp": 0.01062212, + "balance_loss_clip": 1.03506804, + "balance_loss_mlp": 1.06682086, + "epoch": 0.05362994137982865, + "flos": 13188338225280.0, + "grad_norm": 2.238230674372026, + "language_loss": 0.71759057, + "learning_rate": 3.994146136297893e-06, + "loss": 0.74046671, + "num_input_tokens_seen": 19085505, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5859375, + "step": 892, + "time_per_iteration": 2.5544779300689697 + }, + { + "auxiliary_loss_clip": 0.01229119, + "auxiliary_loss_mlp": 0.01067529, + "balance_loss_clip": 1.0421617, + "balance_loss_mlp": 1.06946719, + "epoch": 0.05369006463249662, + "flos": 28658008022400.0, + "grad_norm": 2.3204520758070037, + "language_loss": 0.82304287, + "learning_rate": 3.994116322375049e-06, + "loss": 0.84600937, + "num_input_tokens_seen": 19104360, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.6015625, + "step": 893, + "time_per_iteration": 5.3903117179870605 + }, + { + "auxiliary_loss_clip": 0.0122945, + "auxiliary_loss_mlp": 0.01070342, + "balance_loss_clip": 1.04430699, + "balance_loss_mlp": 1.0679965, + "epoch": 0.053750187885164585, + "flos": 28913153304960.0, + "grad_norm": 2.3808217776212937, + "language_loss": 0.81695569, + "learning_rate": 3.994086432835114e-06, + "loss": 0.83995366, + "num_input_tokens_seen": 19124680, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.609375, + "step": 894, + "time_per_iteration": 2.52809476852417 + }, + { + "auxiliary_loss_clip": 0.01227471, + "auxiliary_loss_mlp": 0.01065449, + "balance_loss_clip": 1.03915119, + "balance_loss_mlp": 1.06881404, + "epoch": 0.05381031113783256, + "flos": 15158828476800.0, + "grad_norm": 2.5337894710206093, + "language_loss": 0.76043701, + "learning_rate": 3.994056467679221e-06, + "loss": 0.7833662, + "num_input_tokens_seen": 19142895, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 895, + "time_per_iteration": 2.4810688495635986 + }, + { + "auxiliary_loss_clip": 0.01238307, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03022122, + "balance_loss_mlp": 1.07260597, + "epoch": 0.05387043439050053, + "flos": 21835232288640.0, + "grad_norm": 2.2065839001211156, + "language_loss": 0.86456096, + "learning_rate": 3.9940264269085065e-06, + "loss": 0.88751751, + "num_input_tokens_seen": 19163125, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.65625, + "step": 896, + "time_per_iteration": 2.522254467010498 + }, + { + "auxiliary_loss_clip": 0.01231325, + "auxiliary_loss_mlp": 0.01062675, + "balance_loss_clip": 1.03495908, + "balance_loss_mlp": 1.06809413, + "epoch": 0.053930557643168495, + "flos": 17310308382720.0, + "grad_norm": 2.1680285530564274, + "language_loss": 0.87949234, + "learning_rate": 3.9939963105241115e-06, + "loss": 0.90243232, + "num_input_tokens_seen": 19179385, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6328125, + "step": 897, + "time_per_iteration": 2.457918167114258 + }, + { + "auxiliary_loss_clip": 0.0122574, + "auxiliary_loss_mlp": 0.01063765, + "balance_loss_clip": 1.03570318, + "balance_loss_mlp": 1.06723523, + "epoch": 0.05399068089583647, + "flos": 17348481561600.0, + "grad_norm": 1.7359050724031848, + "language_loss": 0.9035244, + "learning_rate": 3.993966118527175e-06, + "loss": 0.9264195, + "num_input_tokens_seen": 19198725, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.5859375, + "step": 898, + "time_per_iteration": 2.4593143463134766 + }, + { + "auxiliary_loss_clip": 0.01234899, + "auxiliary_loss_mlp": 0.01084595, + "balance_loss_clip": 1.05808282, + "balance_loss_mlp": 1.07024622, + "epoch": 0.05405080414850443, + "flos": 17486952491520.0, + "grad_norm": 3.958355519485596, + "language_loss": 0.91756964, + "learning_rate": 3.993935850918845e-06, + "loss": 0.94076455, + "num_input_tokens_seen": 19212380, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.6484375, + "step": 899, + "time_per_iteration": 2.4461729526519775 + }, + { + "auxiliary_loss_clip": 0.01225208, + "auxiliary_loss_mlp": 0.01065344, + "balance_loss_clip": 1.03964233, + "balance_loss_mlp": 1.06601286, + "epoch": 0.054110927401172404, + "flos": 24496787278080.0, + "grad_norm": 2.6493739136310643, + "language_loss": 0.75594276, + "learning_rate": 3.9939055077002665e-06, + "loss": 0.77884829, + "num_input_tokens_seen": 19232235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 900, + "time_per_iteration": 2.5180957317352295 + }, + { + "auxiliary_loss_clip": 0.01231903, + "auxiliary_loss_mlp": 0.01059763, + "balance_loss_clip": 1.03413296, + "balance_loss_mlp": 1.06860638, + "epoch": 0.054171050653840376, + "flos": 22930040874240.0, + "grad_norm": 2.2496787705299908, + "language_loss": 0.7377668, + "learning_rate": 3.993875088872592e-06, + "loss": 0.76068342, + "num_input_tokens_seen": 19251460, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.6328125, + "step": 901, + "time_per_iteration": 2.49638032913208 + }, + { + "auxiliary_loss_clip": 0.01221671, + "auxiliary_loss_mlp": 0.01074944, + "balance_loss_clip": 1.04982698, + "balance_loss_mlp": 1.06662059, + "epoch": 0.05423117390650834, + "flos": 12933192942720.0, + "grad_norm": 2.0553503619333586, + "language_loss": 0.85004938, + "learning_rate": 3.9938445944369745e-06, + "loss": 0.87301552, + "num_input_tokens_seen": 19269060, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 902, + "time_per_iteration": 2.5067105293273926 + }, + { + "auxiliary_loss_clip": 0.01226177, + "auxiliary_loss_mlp": 0.0106299, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.06769705, + "epoch": 0.05429129715917631, + "flos": 19901335017600.0, + "grad_norm": 2.0002475654879195, + "language_loss": 0.8655951, + "learning_rate": 3.993814024394569e-06, + "loss": 0.8884868, + "num_input_tokens_seen": 19288620, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 903, + "time_per_iteration": 2.522193670272827 + }, + { + "auxiliary_loss_clip": 0.01227512, + "auxiliary_loss_mlp": 0.01062198, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.06904316, + "epoch": 0.05435142041184428, + "flos": 16908611610240.0, + "grad_norm": 2.4298091072226855, + "language_loss": 0.74835998, + "learning_rate": 3.993783378746537e-06, + "loss": 0.77125704, + "num_input_tokens_seen": 19306615, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.578125, + "step": 904, + "time_per_iteration": 2.456969976425171 + }, + { + "auxiliary_loss_clip": 0.0123038, + "auxiliary_loss_mlp": 0.01073252, + "balance_loss_clip": 1.04685879, + "balance_loss_mlp": 1.06905615, + "epoch": 0.05441154366451225, + "flos": 23948323534080.0, + "grad_norm": 2.0843949675352356, + "language_loss": 0.85750329, + "learning_rate": 3.993752657494039e-06, + "loss": 0.8805396, + "num_input_tokens_seen": 19321680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.609375, + "step": 905, + "time_per_iteration": 2.5358731746673584 + }, + { + "auxiliary_loss_clip": 0.01227222, + "auxiliary_loss_mlp": 0.01078235, + "balance_loss_clip": 1.05317712, + "balance_loss_mlp": 1.07247257, + "epoch": 0.05447166691718022, + "flos": 19975382904960.0, + "grad_norm": 1.7937911991915148, + "language_loss": 0.74028552, + "learning_rate": 3.993721860638241e-06, + "loss": 0.76334012, + "num_input_tokens_seen": 19339760, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 906, + "time_per_iteration": 2.468331813812256 + }, + { + "auxiliary_loss_clip": 0.01228766, + "auxiliary_loss_mlp": 0.01065896, + "balance_loss_clip": 1.03909731, + "balance_loss_mlp": 1.06858826, + "epoch": 0.05453179016984819, + "flos": 24936513575040.0, + "grad_norm": 2.220044948377472, + "language_loss": 0.87410975, + "learning_rate": 3.993690988180309e-06, + "loss": 0.89705634, + "num_input_tokens_seen": 19359585, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.6015625, + "step": 907, + "time_per_iteration": 2.5177390575408936 + }, + { + "auxiliary_loss_clip": 0.01227557, + "auxiliary_loss_mlp": 0.01071851, + "balance_loss_clip": 1.04521942, + "balance_loss_mlp": 1.07002556, + "epoch": 0.05459191342251616, + "flos": 18115102558080.0, + "grad_norm": 1.8689281211501179, + "language_loss": 0.86915505, + "learning_rate": 3.9936600401214165e-06, + "loss": 0.89214909, + "num_input_tokens_seen": 19378590, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.578125, + "step": 908, + "time_per_iteration": 2.45135498046875 + }, + { + "auxiliary_loss_clip": 0.01225417, + "auxiliary_loss_mlp": 0.01068459, + "balance_loss_clip": 1.04073071, + "balance_loss_mlp": 1.06842148, + "epoch": 0.054652036675184125, + "flos": 19208295031680.0, + "grad_norm": 2.409525813232516, + "language_loss": 0.89454836, + "learning_rate": 3.9936290164627345e-06, + "loss": 0.91748714, + "num_input_tokens_seen": 19397910, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 909, + "time_per_iteration": 2.4702625274658203 + }, + { + "auxiliary_loss_clip": 0.01231345, + "auxiliary_loss_mlp": 0.01075786, + "balance_loss_clip": 1.04773629, + "balance_loss_mlp": 1.06930447, + "epoch": 0.0547121599278521, + "flos": 16325745615360.0, + "grad_norm": 2.4022545211155593, + "language_loss": 0.70942473, + "learning_rate": 3.99359791720544e-06, + "loss": 0.73249602, + "num_input_tokens_seen": 19415950, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.625, + "step": 910, + "time_per_iteration": 2.4530797004699707 + }, + { + "auxiliary_loss_clip": 0.01224757, + "auxiliary_loss_mlp": 0.01055797, + "balance_loss_clip": 1.03002357, + "balance_loss_mlp": 1.06815219, + "epoch": 0.05477228318052007, + "flos": 20339014239360.0, + "grad_norm": 2.0100188286094745, + "language_loss": 0.8349818, + "learning_rate": 3.993566742350714e-06, + "loss": 0.85778737, + "num_input_tokens_seen": 19435275, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5625, + "step": 911, + "time_per_iteration": 2.4792025089263916 + }, + { + "auxiliary_loss_clip": 0.01224017, + "auxiliary_loss_mlp": 0.01072081, + "balance_loss_clip": 1.04524732, + "balance_loss_mlp": 1.06649613, + "epoch": 0.054832406433188034, + "flos": 21973092687360.0, + "grad_norm": 2.746196883211308, + "language_loss": 0.76096344, + "learning_rate": 3.993535491899736e-06, + "loss": 0.7839244, + "num_input_tokens_seen": 19452090, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.578125, + "step": 912, + "time_per_iteration": 2.4651522636413574 + }, + { + "auxiliary_loss_clip": 0.01219912, + "auxiliary_loss_mlp": 0.01052416, + "balance_loss_clip": 1.02733433, + "balance_loss_mlp": 1.06664968, + "epoch": 0.054892529685856006, + "flos": 16398931576320.0, + "grad_norm": 2.385296939765248, + "language_loss": 0.82667339, + "learning_rate": 3.993504165853694e-06, + "loss": 0.84939671, + "num_input_tokens_seen": 19470865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 913, + "time_per_iteration": 2.475384473800659 + }, + { + "auxiliary_loss_clip": 0.01224168, + "auxiliary_loss_mlp": 0.01061883, + "balance_loss_clip": 1.03633678, + "balance_loss_mlp": 1.07065797, + "epoch": 0.05495265293852397, + "flos": 23912341084800.0, + "grad_norm": 2.227172084037845, + "language_loss": 0.83470452, + "learning_rate": 3.993472764213772e-06, + "loss": 0.85756505, + "num_input_tokens_seen": 19492145, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 914, + "time_per_iteration": 2.5741806030273438 + }, + { + "auxiliary_loss_clip": 0.01229195, + "auxiliary_loss_mlp": 0.01057782, + "balance_loss_clip": 1.03324902, + "balance_loss_mlp": 1.07264161, + "epoch": 0.055012776191191944, + "flos": 23586954756480.0, + "grad_norm": 2.897688985464872, + "language_loss": 0.9010309, + "learning_rate": 3.9934412869811655e-06, + "loss": 0.92390066, + "num_input_tokens_seen": 19511015, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5625, + "step": 915, + "time_per_iteration": 2.492981433868408 + }, + { + "auxiliary_loss_clip": 0.01225584, + "auxiliary_loss_mlp": 0.01055475, + "balance_loss_clip": 1.03046489, + "balance_loss_mlp": 1.0708915, + "epoch": 0.055072899443859916, + "flos": 17528501548800.0, + "grad_norm": 1.870109983937874, + "language_loss": 0.89555848, + "learning_rate": 3.993409734157064e-06, + "loss": 0.91836905, + "num_input_tokens_seen": 19529040, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.546875, + "step": 916, + "time_per_iteration": 2.4621188640594482 + }, + { + "auxiliary_loss_clip": 0.01228011, + "auxiliary_loss_mlp": 0.01072271, + "balance_loss_clip": 1.04593801, + "balance_loss_mlp": 1.06942379, + "epoch": 0.05513302269652788, + "flos": 21687172427520.0, + "grad_norm": 1.7933741103180343, + "language_loss": 0.80085957, + "learning_rate": 3.993378105742666e-06, + "loss": 0.82386243, + "num_input_tokens_seen": 19549540, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5859375, + "step": 917, + "time_per_iteration": 2.49455189704895 + }, + { + "auxiliary_loss_clip": 0.01225592, + "auxiliary_loss_mlp": 0.01058516, + "balance_loss_clip": 1.03270769, + "balance_loss_mlp": 1.06678224, + "epoch": 0.05519314594919585, + "flos": 21613340021760.0, + "grad_norm": 1.9216560267302982, + "language_loss": 0.79673612, + "learning_rate": 3.9933464017391705e-06, + "loss": 0.81957722, + "num_input_tokens_seen": 19567570, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.59375, + "step": 918, + "time_per_iteration": 2.504734516143799 + }, + { + "auxiliary_loss_clip": 0.01223712, + "auxiliary_loss_mlp": 0.01059794, + "balance_loss_clip": 1.03414011, + "balance_loss_mlp": 1.06658053, + "epoch": 0.05525326920186382, + "flos": 21798567480960.0, + "grad_norm": 1.9394116717498289, + "language_loss": 0.89132315, + "learning_rate": 3.99331462214778e-06, + "loss": 0.91415823, + "num_input_tokens_seen": 19585330, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5703125, + "step": 919, + "time_per_iteration": 2.5087900161743164 + }, + { + "auxiliary_loss_clip": 0.01219042, + "auxiliary_loss_mlp": 0.01068553, + "balance_loss_clip": 1.0427916, + "balance_loss_mlp": 1.06515777, + "epoch": 0.05531339245453179, + "flos": 28439635288320.0, + "grad_norm": 2.688355226699252, + "language_loss": 0.87421197, + "learning_rate": 3.993282766969699e-06, + "loss": 0.89708793, + "num_input_tokens_seen": 19604970, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 920, + "time_per_iteration": 2.536914348602295 + }, + { + "auxiliary_loss_clip": 0.01223828, + "auxiliary_loss_mlp": 0.01063036, + "balance_loss_clip": 1.03733468, + "balance_loss_mlp": 1.06937671, + "epoch": 0.05537351570719976, + "flos": 37375143131520.0, + "grad_norm": 2.1255302161497704, + "language_loss": 0.65921712, + "learning_rate": 3.993250836206136e-06, + "loss": 0.68208569, + "num_input_tokens_seen": 19626235, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.546875, + "step": 921, + "time_per_iteration": 2.643416166305542 + }, + { + "auxiliary_loss_clip": 0.01229793, + "auxiliary_loss_mlp": 0.0106877, + "balance_loss_clip": 1.03969455, + "balance_loss_mlp": 1.0698204, + "epoch": 0.05543363895986773, + "flos": 20084479488000.0, + "grad_norm": 2.143682946402907, + "language_loss": 0.71841472, + "learning_rate": 3.993218829858301e-06, + "loss": 0.74140036, + "num_input_tokens_seen": 19644305, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.6015625, + "step": 922, + "time_per_iteration": 2.4544074535369873 + }, + { + "auxiliary_loss_clip": 0.0122536, + "auxiliary_loss_mlp": 0.01070183, + "balance_loss_clip": 1.04346824, + "balance_loss_mlp": 1.0669136, + "epoch": 0.0554937622125357, + "flos": 24533200690560.0, + "grad_norm": 2.766492717488127, + "language_loss": 0.82548857, + "learning_rate": 3.993186747927408e-06, + "loss": 0.84844404, + "num_input_tokens_seen": 19662130, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5859375, + "step": 923, + "time_per_iteration": 2.490915536880493 + }, + { + "auxiliary_loss_clip": 0.01221243, + "auxiliary_loss_mlp": 0.01068053, + "balance_loss_clip": 1.04194593, + "balance_loss_mlp": 1.06429458, + "epoch": 0.055553885465203665, + "flos": 14320063013760.0, + "grad_norm": 2.2095756655687397, + "language_loss": 0.78808558, + "learning_rate": 3.993154590414675e-06, + "loss": 0.81097853, + "num_input_tokens_seen": 19680715, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5625, + "step": 924, + "time_per_iteration": 2.45884108543396 + }, + { + "auxiliary_loss_clip": 0.0121918, + "auxiliary_loss_mlp": 0.01059373, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.06480467, + "epoch": 0.05561400871787164, + "flos": 27381132374400.0, + "grad_norm": 1.9513803878946447, + "language_loss": 1.02250028, + "learning_rate": 3.993122357321319e-06, + "loss": 1.04528582, + "num_input_tokens_seen": 19700535, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 925, + "time_per_iteration": 2.5296268463134766 + }, + { + "auxiliary_loss_clip": 0.01220429, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.02975261, + "balance_loss_mlp": 1.0634799, + "epoch": 0.05567413197053961, + "flos": 23221096778880.0, + "grad_norm": 2.3756260245044687, + "language_loss": 0.80808276, + "learning_rate": 3.993090048648564e-06, + "loss": 0.83084333, + "num_input_tokens_seen": 19718825, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 926, + "time_per_iteration": 2.496299982070923 + }, + { + "auxiliary_loss_clip": 0.01229405, + "auxiliary_loss_mlp": 0.01068259, + "balance_loss_clip": 1.04049563, + "balance_loss_mlp": 1.06743848, + "epoch": 0.055734255223207574, + "flos": 25264952559360.0, + "grad_norm": 2.4713559623940924, + "language_loss": 0.73378903, + "learning_rate": 3.993057664397634e-06, + "loss": 0.75676566, + "num_input_tokens_seen": 19739080, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.6171875, + "step": 927, + "time_per_iteration": 2.5607478618621826 + }, + { + "auxiliary_loss_clip": 0.01103967, + "auxiliary_loss_mlp": 0.01014529, + "balance_loss_clip": 1.00837731, + "balance_loss_mlp": 1.03639269, + "epoch": 0.055794378475875546, + "flos": 66503116702080.0, + "grad_norm": 0.7814837823676635, + "language_loss": 0.5989722, + "learning_rate": 3.9930252045697585e-06, + "loss": 0.62015712, + "num_input_tokens_seen": 19802960, + "router_z_loss_clip": 0.0612793, + "router_z_loss_mlp": 0.67578125, + "step": 928, + "time_per_iteration": 3.0945305824279785 + }, + { + "auxiliary_loss_clip": 0.01223562, + "auxiliary_loss_mlp": 0.01066756, + "balance_loss_clip": 1.04035151, + "balance_loss_mlp": 1.06729245, + "epoch": 0.05585450172854351, + "flos": 25337635729920.0, + "grad_norm": 2.3037954576101587, + "language_loss": 0.95011377, + "learning_rate": 3.992992669166168e-06, + "loss": 0.97301698, + "num_input_tokens_seen": 19822765, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5625, + "step": 929, + "time_per_iteration": 2.527270555496216 + }, + { + "auxiliary_loss_clip": 0.01221186, + "auxiliary_loss_mlp": 0.01067668, + "balance_loss_clip": 1.03924894, + "balance_loss_mlp": 1.06494856, + "epoch": 0.05591462498121148, + "flos": 33911738881920.0, + "grad_norm": 2.1540114832188553, + "language_loss": 0.71827871, + "learning_rate": 3.992960058188094e-06, + "loss": 0.74116725, + "num_input_tokens_seen": 19843590, + "router_z_loss_clip": 0.28320312, + "router_z_loss_mlp": 1.5625, + "step": 930, + "time_per_iteration": 2.57513689994812 + }, + { + "auxiliary_loss_clip": 0.01227654, + "auxiliary_loss_mlp": 0.01062398, + "balance_loss_clip": 1.0355165, + "balance_loss_mlp": 1.06905401, + "epoch": 0.055974748233879455, + "flos": 17930880679680.0, + "grad_norm": 2.336481182624628, + "language_loss": 0.85333288, + "learning_rate": 3.992927371636776e-06, + "loss": 0.87623346, + "num_input_tokens_seen": 19860230, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5859375, + "step": 931, + "time_per_iteration": 2.459167957305908 + }, + { + "auxiliary_loss_clip": 0.01224553, + "auxiliary_loss_mlp": 0.01072004, + "balance_loss_clip": 1.0448482, + "balance_loss_mlp": 1.06556344, + "epoch": 0.05603487148654742, + "flos": 24021976371840.0, + "grad_norm": 1.9723738142749898, + "language_loss": 0.83577204, + "learning_rate": 3.9928946095134525e-06, + "loss": 0.85873753, + "num_input_tokens_seen": 19880795, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.59375, + "step": 932, + "time_per_iteration": 2.4930593967437744 + }, + { + "auxiliary_loss_clip": 0.01223225, + "auxiliary_loss_mlp": 0.01067756, + "balance_loss_clip": 1.04012322, + "balance_loss_mlp": 1.06712675, + "epoch": 0.05609499473921539, + "flos": 17307758517120.0, + "grad_norm": 2.411257667891357, + "language_loss": 0.73405433, + "learning_rate": 3.992861771819365e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 19897960, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5625, + "step": 933, + "time_per_iteration": 2.526521682739258 + }, + { + "auxiliary_loss_clip": 0.01219811, + "auxiliary_loss_mlp": 0.01070368, + "balance_loss_clip": 1.04328358, + "balance_loss_mlp": 1.06432819, + "epoch": 0.05615511799188336, + "flos": 20994742972800.0, + "grad_norm": 2.577929883809357, + "language_loss": 0.86850882, + "learning_rate": 3.99282885855576e-06, + "loss": 0.89141059, + "num_input_tokens_seen": 19913315, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.5546875, + "step": 934, + "time_per_iteration": 5.338034391403198 + }, + { + "auxiliary_loss_clip": 0.01220003, + "auxiliary_loss_mlp": 0.01069693, + "balance_loss_clip": 1.04495692, + "balance_loss_mlp": 1.06842983, + "epoch": 0.05621524124455133, + "flos": 17273535834240.0, + "grad_norm": 2.2060919587088965, + "language_loss": 0.80243224, + "learning_rate": 3.992795869723885e-06, + "loss": 0.82532918, + "num_input_tokens_seen": 19928790, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 935, + "time_per_iteration": 3.8198087215423584 + }, + { + "auxiliary_loss_clip": 0.01094527, + "auxiliary_loss_mlp": 0.01008984, + "balance_loss_clip": 1.00321388, + "balance_loss_mlp": 1.02876139, + "epoch": 0.0562753644972193, + "flos": 58719370458240.0, + "grad_norm": 0.8225714537835027, + "language_loss": 0.69179416, + "learning_rate": 3.99276280532499e-06, + "loss": 0.71282923, + "num_input_tokens_seen": 19988785, + "router_z_loss_clip": 0.05761719, + "router_z_loss_mlp": 0.65625, + "step": 936, + "time_per_iteration": 2.9585764408111572 + }, + { + "auxiliary_loss_clip": 0.01220636, + "auxiliary_loss_mlp": 0.01067113, + "balance_loss_clip": 1.04123259, + "balance_loss_mlp": 1.06387568, + "epoch": 0.05633548774988727, + "flos": 17457039440640.0, + "grad_norm": 2.5168182860703237, + "language_loss": 0.75900578, + "learning_rate": 3.992729665360331e-06, + "loss": 0.78188324, + "num_input_tokens_seen": 20007685, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.5703125, + "step": 937, + "time_per_iteration": 2.4891855716705322 + }, + { + "auxiliary_loss_clip": 0.01091743, + "auxiliary_loss_mlp": 0.01002728, + "balance_loss_clip": 0.99738711, + "balance_loss_mlp": 1.02642298, + "epoch": 0.05639561100255524, + "flos": 70654928083200.0, + "grad_norm": 0.8631606334327763, + "language_loss": 0.64287508, + "learning_rate": 3.992696449831162e-06, + "loss": 0.66381979, + "num_input_tokens_seen": 20072750, + "router_z_loss_clip": 0.0534668, + "router_z_loss_mlp": 0.65625, + "step": 938, + "time_per_iteration": 3.0239782333374023 + }, + { + "auxiliary_loss_clip": 0.01226335, + "auxiliary_loss_mlp": 0.0107197, + "balance_loss_clip": 1.04487348, + "balance_loss_mlp": 1.06571174, + "epoch": 0.056455734255223204, + "flos": 20485996692480.0, + "grad_norm": 4.570077538128457, + "language_loss": 0.7903074, + "learning_rate": 3.992663158738745e-06, + "loss": 0.81329048, + "num_input_tokens_seen": 20089070, + "router_z_loss_clip": 0.27148438, + "router_z_loss_mlp": 1.609375, + "step": 939, + "time_per_iteration": 2.494706630706787 + }, + { + "auxiliary_loss_clip": 0.012214, + "auxiliary_loss_mlp": 0.01063948, + "balance_loss_clip": 1.03868759, + "balance_loss_mlp": 1.0669229, + "epoch": 0.056515857507891176, + "flos": 22053569109120.0, + "grad_norm": 1.950609958048397, + "language_loss": 0.73893893, + "learning_rate": 3.992629792084341e-06, + "loss": 0.76179242, + "num_input_tokens_seen": 20108790, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 940, + "time_per_iteration": 2.5279061794281006 + }, + { + "auxiliary_loss_clip": 0.01220257, + "auxiliary_loss_mlp": 0.01064421, + "balance_loss_clip": 1.03776574, + "balance_loss_mlp": 1.06722569, + "epoch": 0.05657598076055915, + "flos": 24025316336640.0, + "grad_norm": 1.9142676693922898, + "language_loss": 0.70475829, + "learning_rate": 3.992596349869216e-06, + "loss": 0.72760499, + "num_input_tokens_seen": 20128455, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 941, + "time_per_iteration": 2.551604747772217 + }, + { + "auxiliary_loss_clip": 0.01218348, + "auxiliary_loss_mlp": 0.01058281, + "balance_loss_clip": 1.03229308, + "balance_loss_mlp": 1.06624675, + "epoch": 0.05663610401322711, + "flos": 20480609652480.0, + "grad_norm": 2.3045436850665917, + "language_loss": 0.80928791, + "learning_rate": 3.992562832094637e-06, + "loss": 0.83205426, + "num_input_tokens_seen": 20145775, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.515625, + "step": 942, + "time_per_iteration": 2.515646457672119 + }, + { + "auxiliary_loss_clip": 0.01214197, + "auxiliary_loss_mlp": 0.01057859, + "balance_loss_clip": 1.03245521, + "balance_loss_mlp": 1.062042, + "epoch": 0.056696227265895086, + "flos": 21069042255360.0, + "grad_norm": 2.7900678467193205, + "language_loss": 0.88067353, + "learning_rate": 3.9925292387618755e-06, + "loss": 0.9033941, + "num_input_tokens_seen": 20164315, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 943, + "time_per_iteration": 2.674614191055298 + }, + { + "auxiliary_loss_clip": 0.01220399, + "auxiliary_loss_mlp": 0.01056577, + "balance_loss_clip": 1.03182912, + "balance_loss_mlp": 1.06757212, + "epoch": 0.05675635051856306, + "flos": 17821317219840.0, + "grad_norm": 2.6837069047913924, + "language_loss": 0.75092185, + "learning_rate": 3.992495569872206e-06, + "loss": 0.77369165, + "num_input_tokens_seen": 20182760, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5234375, + "step": 944, + "time_per_iteration": 2.5806639194488525 + }, + { + "auxiliary_loss_clip": 0.01215674, + "auxiliary_loss_mlp": 0.01062669, + "balance_loss_clip": 1.0385294, + "balance_loss_mlp": 1.06267428, + "epoch": 0.05681647377123102, + "flos": 23114945111040.0, + "grad_norm": 1.7462690351912153, + "language_loss": 0.79321784, + "learning_rate": 3.992461825426906e-06, + "loss": 0.8160013, + "num_input_tokens_seen": 20203830, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 945, + "time_per_iteration": 2.695613384246826 + }, + { + "auxiliary_loss_clip": 0.01218347, + "auxiliary_loss_mlp": 0.01061935, + "balance_loss_clip": 1.03628159, + "balance_loss_mlp": 1.06407309, + "epoch": 0.056876597023898995, + "flos": 16070528505600.0, + "grad_norm": 2.1794845223078556, + "language_loss": 0.82465631, + "learning_rate": 3.992428005427252e-06, + "loss": 0.84745914, + "num_input_tokens_seen": 20220365, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 946, + "time_per_iteration": 2.6081790924072266 + }, + { + "auxiliary_loss_clip": 0.01223255, + "auxiliary_loss_mlp": 0.01059618, + "balance_loss_clip": 1.03258097, + "balance_loss_mlp": 1.06615055, + "epoch": 0.05693672027656696, + "flos": 16835641130880.0, + "grad_norm": 2.7693395657309297, + "language_loss": 0.7904911, + "learning_rate": 3.992394109874529e-06, + "loss": 0.8133198, + "num_input_tokens_seen": 20238640, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5703125, + "step": 947, + "time_per_iteration": 2.460472822189331 + }, + { + "auxiliary_loss_clip": 0.01227462, + "auxiliary_loss_mlp": 0.01065027, + "balance_loss_clip": 1.03890848, + "balance_loss_mlp": 1.06883287, + "epoch": 0.05699684352923493, + "flos": 21389113370880.0, + "grad_norm": 7.046260534289203, + "language_loss": 0.85772789, + "learning_rate": 3.9923601387700225e-06, + "loss": 0.88065279, + "num_input_tokens_seen": 20251025, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 948, + "time_per_iteration": 2.4560892581939697 + }, + { + "auxiliary_loss_clip": 0.01217019, + "auxiliary_loss_mlp": 0.01060985, + "balance_loss_clip": 1.03374553, + "balance_loss_mlp": 1.06329989, + "epoch": 0.057056966781902904, + "flos": 15560309767680.0, + "grad_norm": 1.8055084405958775, + "language_loss": 0.87044799, + "learning_rate": 3.992326092115019e-06, + "loss": 0.89322805, + "num_input_tokens_seen": 20269775, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5390625, + "step": 949, + "time_per_iteration": 2.4843316078186035 + }, + { + "auxiliary_loss_clip": 0.01212611, + "auxiliary_loss_mlp": 0.01066298, + "balance_loss_clip": 1.04170561, + "balance_loss_mlp": 1.06284809, + "epoch": 0.05711709003457087, + "flos": 19937856170880.0, + "grad_norm": 2.230679935648155, + "language_loss": 0.79035759, + "learning_rate": 3.992291969910811e-06, + "loss": 0.81314665, + "num_input_tokens_seen": 20287715, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4921875, + "step": 950, + "time_per_iteration": 2.468172311782837 + }, + { + "auxiliary_loss_clip": 0.01221984, + "auxiliary_loss_mlp": 0.01068601, + "balance_loss_clip": 1.04365039, + "balance_loss_mlp": 1.06574106, + "epoch": 0.05717721328723884, + "flos": 30332701774080.0, + "grad_norm": 2.0871877141587682, + "language_loss": 0.8244521, + "learning_rate": 3.992257772158691e-06, + "loss": 0.84735799, + "num_input_tokens_seen": 20307070, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 951, + "time_per_iteration": 2.5418505668640137 + }, + { + "auxiliary_loss_clip": 0.01215404, + "auxiliary_loss_mlp": 0.01062639, + "balance_loss_clip": 1.03568625, + "balance_loss_mlp": 1.06129527, + "epoch": 0.05723733653990681, + "flos": 23654358627840.0, + "grad_norm": 2.5400916768099426, + "language_loss": 0.86685216, + "learning_rate": 3.992223498859958e-06, + "loss": 0.88963258, + "num_input_tokens_seen": 20324945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5390625, + "step": 952, + "time_per_iteration": 2.513356924057007 + }, + { + "auxiliary_loss_clip": 0.0122001, + "auxiliary_loss_mlp": 0.01062958, + "balance_loss_clip": 1.03415656, + "balance_loss_mlp": 1.06145215, + "epoch": 0.05729745979257478, + "flos": 22055759838720.0, + "grad_norm": 1.725154467975805, + "language_loss": 0.79043579, + "learning_rate": 3.9921891500159084e-06, + "loss": 0.81326544, + "num_input_tokens_seen": 20346135, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5859375, + "step": 953, + "time_per_iteration": 2.490940570831299 + }, + { + "auxiliary_loss_clip": 0.01223554, + "auxiliary_loss_mlp": 0.01063244, + "balance_loss_clip": 1.03592086, + "balance_loss_mlp": 1.06757712, + "epoch": 0.05735758304524275, + "flos": 19604353368960.0, + "grad_norm": 2.2937199779067106, + "language_loss": 0.87086606, + "learning_rate": 3.992154725627848e-06, + "loss": 0.89373398, + "num_input_tokens_seen": 20364450, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.5625, + "step": 954, + "time_per_iteration": 2.495039701461792 + }, + { + "auxiliary_loss_clip": 0.01221375, + "auxiliary_loss_mlp": 0.01062344, + "balance_loss_clip": 1.03707159, + "balance_loss_mlp": 1.06446028, + "epoch": 0.057417706297910716, + "flos": 19099018880640.0, + "grad_norm": 2.3514674671771933, + "language_loss": 0.87789929, + "learning_rate": 3.9921202256970804e-06, + "loss": 0.90073651, + "num_input_tokens_seen": 20383500, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5625, + "step": 955, + "time_per_iteration": 2.5018017292022705 + }, + { + "auxiliary_loss_clip": 0.01214985, + "auxiliary_loss_mlp": 0.01065732, + "balance_loss_clip": 1.04000711, + "balance_loss_mlp": 1.06217909, + "epoch": 0.05747782955057869, + "flos": 16654507822080.0, + "grad_norm": 3.7193659196918576, + "language_loss": 0.89682388, + "learning_rate": 3.992085650224914e-06, + "loss": 0.919631, + "num_input_tokens_seen": 20400295, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 956, + "time_per_iteration": 2.43306565284729 + }, + { + "auxiliary_loss_clip": 0.01212174, + "auxiliary_loss_mlp": 0.0105844, + "balance_loss_clip": 1.03232098, + "balance_loss_mlp": 1.06344521, + "epoch": 0.05753795280324665, + "flos": 14502058248960.0, + "grad_norm": 1.7667772588634594, + "language_loss": 0.75335747, + "learning_rate": 3.99205099921266e-06, + "loss": 0.77606356, + "num_input_tokens_seen": 20419085, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.484375, + "step": 957, + "time_per_iteration": 2.469240665435791 + }, + { + "auxiliary_loss_clip": 0.01218166, + "auxiliary_loss_mlp": 0.01075955, + "balance_loss_clip": 1.04713011, + "balance_loss_mlp": 1.06214452, + "epoch": 0.057598076055914625, + "flos": 18076318848000.0, + "grad_norm": 1.8974624224625587, + "language_loss": 0.79871029, + "learning_rate": 3.992016272661633e-06, + "loss": 0.82165146, + "num_input_tokens_seen": 20437465, + "router_z_loss_clip": 0.2890625, + "router_z_loss_mlp": 1.5625, + "step": 958, + "time_per_iteration": 2.5016849040985107 + }, + { + "auxiliary_loss_clip": 0.01214009, + "auxiliary_loss_mlp": 0.01062008, + "balance_loss_clip": 1.03780818, + "balance_loss_mlp": 1.06024444, + "epoch": 0.0576581993085826, + "flos": 22124600254080.0, + "grad_norm": 2.5702669091422234, + "language_loss": 0.88410264, + "learning_rate": 3.99198147057315e-06, + "loss": 0.90686285, + "num_input_tokens_seen": 20456235, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.5390625, + "step": 959, + "time_per_iteration": 2.4830191135406494 + }, + { + "auxiliary_loss_clip": 0.01211651, + "auxiliary_loss_mlp": 0.01063458, + "balance_loss_clip": 1.03832912, + "balance_loss_mlp": 1.0626018, + "epoch": 0.05771832256125056, + "flos": 33181746779520.0, + "grad_norm": 2.6997220185951347, + "language_loss": 0.78556621, + "learning_rate": 3.991946592948529e-06, + "loss": 0.8083173, + "num_input_tokens_seen": 20476825, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4921875, + "step": 960, + "time_per_iteration": 2.569218397140503 + }, + { + "auxiliary_loss_clip": 0.01217172, + "auxiliary_loss_mlp": 0.01063539, + "balance_loss_clip": 1.03685999, + "balance_loss_mlp": 1.06168103, + "epoch": 0.057778445813918534, + "flos": 24170143973760.0, + "grad_norm": 4.159271492638429, + "language_loss": 0.932491, + "learning_rate": 3.991911639789094e-06, + "loss": 0.95529813, + "num_input_tokens_seen": 20496965, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5546875, + "step": 961, + "time_per_iteration": 2.511765480041504 + }, + { + "auxiliary_loss_clip": 0.01215042, + "auxiliary_loss_mlp": 0.01070899, + "balance_loss_clip": 1.04411268, + "balance_loss_mlp": 1.06039667, + "epoch": 0.0578385690665865, + "flos": 29643037666560.0, + "grad_norm": 2.532017623976099, + "language_loss": 0.6822986, + "learning_rate": 3.991876611096169e-06, + "loss": 0.70515805, + "num_input_tokens_seen": 20518035, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.546875, + "step": 962, + "time_per_iteration": 2.544498920440674 + }, + { + "auxiliary_loss_clip": 0.01214012, + "auxiliary_loss_mlp": 0.01068596, + "balance_loss_clip": 1.04461062, + "balance_loss_mlp": 1.06268489, + "epoch": 0.05789869231925447, + "flos": 20885430908160.0, + "grad_norm": 2.445305128304827, + "language_loss": 0.88187808, + "learning_rate": 3.991841506871084e-06, + "loss": 0.90470415, + "num_input_tokens_seen": 20534740, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.515625, + "step": 963, + "time_per_iteration": 2.459487199783325 + }, + { + "auxiliary_loss_clip": 0.01222623, + "auxiliary_loss_mlp": 0.01058866, + "balance_loss_clip": 1.03337944, + "balance_loss_mlp": 1.06633568, + "epoch": 0.057958815571922444, + "flos": 26031106679040.0, + "grad_norm": 2.5656796350524473, + "language_loss": 0.84858835, + "learning_rate": 3.99180632711517e-06, + "loss": 0.87140322, + "num_input_tokens_seen": 20553485, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 964, + "time_per_iteration": 2.5268235206604004 + }, + { + "auxiliary_loss_clip": 0.01216658, + "auxiliary_loss_mlp": 0.01067828, + "balance_loss_clip": 1.04157782, + "balance_loss_mlp": 1.06309247, + "epoch": 0.05801893882459041, + "flos": 18077683564800.0, + "grad_norm": 2.846103019544017, + "language_loss": 0.77748007, + "learning_rate": 3.99177107182976e-06, + "loss": 0.80032492, + "num_input_tokens_seen": 20572155, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5390625, + "step": 965, + "time_per_iteration": 2.4572315216064453 + }, + { + "auxiliary_loss_clip": 0.01211478, + "auxiliary_loss_mlp": 0.01068539, + "balance_loss_clip": 1.04424393, + "balance_loss_mlp": 1.0614084, + "epoch": 0.05807906207725838, + "flos": 17748885444480.0, + "grad_norm": 2.4479010977704463, + "language_loss": 0.80922461, + "learning_rate": 3.99173574101619e-06, + "loss": 0.83202475, + "num_input_tokens_seen": 20590395, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.5, + "step": 966, + "time_per_iteration": 2.4682776927948 + }, + { + "auxiliary_loss_clip": 0.01212307, + "auxiliary_loss_mlp": 0.01061872, + "balance_loss_clip": 1.03730273, + "balance_loss_mlp": 1.06173599, + "epoch": 0.058139185329926346, + "flos": 18040372312320.0, + "grad_norm": 1.8643875206872442, + "language_loss": 0.76291096, + "learning_rate": 3.9917003346758035e-06, + "loss": 0.78565276, + "num_input_tokens_seen": 20608435, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.5, + "step": 967, + "time_per_iteration": 2.453474521636963 + }, + { + "auxiliary_loss_clip": 0.01084806, + "auxiliary_loss_mlp": 0.01040579, + "balance_loss_clip": 1.03547657, + "balance_loss_mlp": 1.02152586, + "epoch": 0.05819930858259432, + "flos": 62363297485440.0, + "grad_norm": 0.7926144837125159, + "language_loss": 0.57362092, + "learning_rate": 3.991664852809939e-06, + "loss": 0.59487474, + "num_input_tokens_seen": 20668575, + "router_z_loss_clip": 0.05102539, + "router_z_loss_mlp": 0.6328125, + "step": 968, + "time_per_iteration": 2.994419574737549 + }, + { + "auxiliary_loss_clip": 0.01218807, + "auxiliary_loss_mlp": 0.01055354, + "balance_loss_clip": 1.02865148, + "balance_loss_mlp": 1.06574845, + "epoch": 0.05825943183526229, + "flos": 19135360465920.0, + "grad_norm": 2.057389892616485, + "language_loss": 0.82289147, + "learning_rate": 3.991629295419945e-06, + "loss": 0.84563303, + "num_input_tokens_seen": 20687355, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.53125, + "step": 969, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01217673, + "auxiliary_loss_mlp": 0.01056826, + "balance_loss_clip": 1.03105259, + "balance_loss_mlp": 1.06392384, + "epoch": 0.058319555087930255, + "flos": 29022465369600.0, + "grad_norm": 2.1897875503845725, + "language_loss": 0.780442, + "learning_rate": 3.991593662507167e-06, + "loss": 0.80318701, + "num_input_tokens_seen": 20705710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 970, + "time_per_iteration": 2.5632171630859375 + }, + { + "auxiliary_loss_clip": 0.01216631, + "auxiliary_loss_mlp": 0.01054997, + "balance_loss_clip": 1.02809155, + "balance_loss_mlp": 1.06188202, + "epoch": 0.05837967834059823, + "flos": 18879999701760.0, + "grad_norm": 2.6802242915962, + "language_loss": 0.92492616, + "learning_rate": 3.991557954072958e-06, + "loss": 0.94764245, + "num_input_tokens_seen": 20722405, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 971, + "time_per_iteration": 2.4642531871795654 + }, + { + "auxiliary_loss_clip": 0.01210603, + "auxiliary_loss_mlp": 0.010597, + "balance_loss_clip": 1.03439212, + "balance_loss_mlp": 1.05865097, + "epoch": 0.05843980159326619, + "flos": 25703062744320.0, + "grad_norm": 3.0470884327064276, + "language_loss": 0.86133701, + "learning_rate": 3.991522170118673e-06, + "loss": 0.88404, + "num_input_tokens_seen": 20741480, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 972, + "time_per_iteration": 2.5298526287078857 + }, + { + "auxiliary_loss_clip": 0.01212752, + "auxiliary_loss_mlp": 0.01064681, + "balance_loss_clip": 1.04038596, + "balance_loss_mlp": 1.0636549, + "epoch": 0.058499924845934165, + "flos": 25552129795200.0, + "grad_norm": 2.0754734138997906, + "language_loss": 0.87340444, + "learning_rate": 3.991486310645667e-06, + "loss": 0.89617872, + "num_input_tokens_seen": 20759685, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4921875, + "step": 973, + "time_per_iteration": 2.5198311805725098 + }, + { + "auxiliary_loss_clip": 0.01213937, + "auxiliary_loss_mlp": 0.01067264, + "balance_loss_clip": 1.04070425, + "balance_loss_mlp": 1.06140256, + "epoch": 0.05856004809860214, + "flos": 16436171001600.0, + "grad_norm": 3.2539468590332707, + "language_loss": 0.74868345, + "learning_rate": 3.991450375655301e-06, + "loss": 0.77149546, + "num_input_tokens_seen": 20778180, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5234375, + "step": 974, + "time_per_iteration": 2.465268850326538 + }, + { + "auxiliary_loss_clip": 0.0121359, + "auxiliary_loss_mlp": 0.01059075, + "balance_loss_clip": 1.03308713, + "balance_loss_mlp": 1.06260133, + "epoch": 0.0586201713512701, + "flos": 39458824116480.0, + "grad_norm": 1.7891188847385684, + "language_loss": 0.76707923, + "learning_rate": 3.991414365148936e-06, + "loss": 0.78980577, + "num_input_tokens_seen": 20802705, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 975, + "time_per_iteration": 2.633850336074829 + }, + { + "auxiliary_loss_clip": 0.01216778, + "auxiliary_loss_mlp": 0.01068456, + "balance_loss_clip": 1.04332697, + "balance_loss_mlp": 1.0621978, + "epoch": 0.058680294603938074, + "flos": 23365170230400.0, + "grad_norm": 2.0981769673049326, + "language_loss": 0.76878488, + "learning_rate": 3.99137827912794e-06, + "loss": 0.79163718, + "num_input_tokens_seen": 20822540, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.546875, + "step": 976, + "time_per_iteration": 6.8309245109558105 + }, + { + "auxiliary_loss_clip": 0.01210296, + "auxiliary_loss_mlp": 0.01061517, + "balance_loss_clip": 1.03606534, + "balance_loss_mlp": 1.0585494, + "epoch": 0.05874041785660604, + "flos": 32232017226240.0, + "grad_norm": 1.8109666318996334, + "language_loss": 0.87465948, + "learning_rate": 3.991342117593679e-06, + "loss": 0.89737761, + "num_input_tokens_seen": 20844175, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.515625, + "step": 977, + "time_per_iteration": 2.5693395137786865 + }, + { + "auxiliary_loss_clip": 0.01213396, + "auxiliary_loss_mlp": 0.01064383, + "balance_loss_clip": 1.0380497, + "balance_loss_mlp": 1.06246471, + "epoch": 0.05880054110927401, + "flos": 22310043194880.0, + "grad_norm": 1.7886661734827753, + "language_loss": 0.79517525, + "learning_rate": 3.991305880547527e-06, + "loss": 0.81795299, + "num_input_tokens_seen": 20864730, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5078125, + "step": 978, + "time_per_iteration": 2.51609206199646 + }, + { + "auxiliary_loss_clip": 0.01218239, + "auxiliary_loss_mlp": 0.01069938, + "balance_loss_clip": 1.04339027, + "balance_loss_mlp": 1.06304932, + "epoch": 0.05886066436194198, + "flos": 27380450016000.0, + "grad_norm": 2.6270410794651102, + "language_loss": 0.80902123, + "learning_rate": 3.991269567990855e-06, + "loss": 0.83190298, + "num_input_tokens_seen": 20885200, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.546875, + "step": 979, + "time_per_iteration": 2.527127504348755 + }, + { + "auxiliary_loss_clip": 0.01085971, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.0044651, + "balance_loss_mlp": 1.02304745, + "epoch": 0.05892078761460995, + "flos": 59584493525760.0, + "grad_norm": 0.94528472512207, + "language_loss": 0.59059429, + "learning_rate": 3.9912331799250415e-06, + "loss": 0.61154944, + "num_input_tokens_seen": 20940325, + "router_z_loss_clip": 0.05078125, + "router_z_loss_mlp": 0.62890625, + "step": 980, + "time_per_iteration": 2.9545915126800537 + }, + { + "auxiliary_loss_clip": 0.01210703, + "auxiliary_loss_mlp": 0.0106402, + "balance_loss_clip": 1.03747201, + "balance_loss_mlp": 1.0622623, + "epoch": 0.05898091086727792, + "flos": 15414081500160.0, + "grad_norm": 2.3915266710240917, + "language_loss": 0.86397457, + "learning_rate": 3.9911967163514665e-06, + "loss": 0.88672185, + "num_input_tokens_seen": 20958220, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.484375, + "step": 981, + "time_per_iteration": 2.4726293087005615 + }, + { + "auxiliary_loss_clip": 0.01212695, + "auxiliary_loss_mlp": 0.01057503, + "balance_loss_clip": 1.03423381, + "balance_loss_mlp": 1.06214404, + "epoch": 0.059041034119945886, + "flos": 23655328295040.0, + "grad_norm": 1.9485203495729437, + "language_loss": 0.79623365, + "learning_rate": 3.991160177271513e-06, + "loss": 0.81893563, + "num_input_tokens_seen": 20978920, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.5078125, + "step": 982, + "time_per_iteration": 2.5271458625793457 + }, + { + "auxiliary_loss_clip": 0.01219179, + "auxiliary_loss_mlp": 0.01060762, + "balance_loss_clip": 1.03571582, + "balance_loss_mlp": 1.06248748, + "epoch": 0.05910115737261386, + "flos": 24754087376640.0, + "grad_norm": 2.5320957946125437, + "language_loss": 0.84376037, + "learning_rate": 3.9911235626865654e-06, + "loss": 0.86655974, + "num_input_tokens_seen": 20999490, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 983, + "time_per_iteration": 2.526364803314209 + }, + { + "auxiliary_loss_clip": 0.01212847, + "auxiliary_loss_mlp": 0.01067453, + "balance_loss_clip": 1.04361129, + "balance_loss_mlp": 1.06317604, + "epoch": 0.05916128062528183, + "flos": 11728749070080.0, + "grad_norm": 1.8446015864025267, + "language_loss": 0.84607553, + "learning_rate": 3.9910868725980125e-06, + "loss": 0.86887848, + "num_input_tokens_seen": 21017865, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.4921875, + "step": 984, + "time_per_iteration": 2.456803321838379 + }, + { + "auxiliary_loss_clip": 0.01211466, + "auxiliary_loss_mlp": 0.01059154, + "balance_loss_clip": 1.03551483, + "balance_loss_mlp": 1.06338882, + "epoch": 0.059221403877949795, + "flos": 21902995296000.0, + "grad_norm": 2.3276500524021495, + "language_loss": 0.77875566, + "learning_rate": 3.9910501070072465e-06, + "loss": 0.80146182, + "num_input_tokens_seen": 21035900, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.484375, + "step": 985, + "time_per_iteration": 2.504096508026123 + }, + { + "auxiliary_loss_clip": 0.01215785, + "auxiliary_loss_mlp": 0.01061307, + "balance_loss_clip": 1.03661919, + "balance_loss_mlp": 1.06191659, + "epoch": 0.05928152713061777, + "flos": 20514580940160.0, + "grad_norm": 2.294716701848832, + "language_loss": 0.90598249, + "learning_rate": 3.991013265915661e-06, + "loss": 0.92875338, + "num_input_tokens_seen": 21053235, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.5390625, + "step": 986, + "time_per_iteration": 2.4882049560546875 + }, + { + "auxiliary_loss_clip": 0.01215421, + "auxiliary_loss_mlp": 0.01062373, + "balance_loss_clip": 1.03534794, + "balance_loss_mlp": 1.06017947, + "epoch": 0.05934165038328574, + "flos": 24495135252480.0, + "grad_norm": 3.8181645576894256, + "language_loss": 0.7589798, + "learning_rate": 3.9909763493246525e-06, + "loss": 0.78175771, + "num_input_tokens_seen": 21073090, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.546875, + "step": 987, + "time_per_iteration": 2.492412805557251 + }, + { + "auxiliary_loss_clip": 0.01216653, + "auxiliary_loss_mlp": 0.01059941, + "balance_loss_clip": 1.03491902, + "balance_loss_mlp": 1.06059265, + "epoch": 0.059401773635953704, + "flos": 38728041914880.0, + "grad_norm": 2.1447391932017843, + "language_loss": 0.71525705, + "learning_rate": 3.990939357235621e-06, + "loss": 0.73802304, + "num_input_tokens_seen": 21094895, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5625, + "step": 988, + "time_per_iteration": 2.6386756896972656 + }, + { + "auxiliary_loss_clip": 0.01081383, + "auxiliary_loss_mlp": 0.01012054, + "balance_loss_clip": 1.00680876, + "balance_loss_mlp": 1.01888978, + "epoch": 0.059461896888621676, + "flos": 58023565125120.0, + "grad_norm": 0.9344259157338769, + "language_loss": 0.71159971, + "learning_rate": 3.99090228964997e-06, + "loss": 0.73253405, + "num_input_tokens_seen": 21147555, + "router_z_loss_clip": 0.05249023, + "router_z_loss_mlp": 0.625, + "step": 989, + "time_per_iteration": 2.903996706008911 + }, + { + "auxiliary_loss_clip": 0.01219656, + "auxiliary_loss_mlp": 0.01067443, + "balance_loss_clip": 1.0404067, + "balance_loss_mlp": 1.06221163, + "epoch": 0.05952202014128964, + "flos": 22127760650880.0, + "grad_norm": 1.89069901477269, + "language_loss": 0.78102934, + "learning_rate": 3.990865146569105e-06, + "loss": 0.80390036, + "num_input_tokens_seen": 21167845, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.578125, + "step": 990, + "time_per_iteration": 2.6252431869506836 + }, + { + "auxiliary_loss_clip": 0.01208224, + "auxiliary_loss_mlp": 0.0105602, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05700588, + "epoch": 0.059582143393957614, + "flos": 20445776438400.0, + "grad_norm": 2.077710223302236, + "language_loss": 0.86406755, + "learning_rate": 3.990827927994434e-06, + "loss": 0.88671005, + "num_input_tokens_seen": 21185085, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.515625, + "step": 991, + "time_per_iteration": 2.483853340148926 + }, + { + "auxiliary_loss_clip": 0.01216429, + "auxiliary_loss_mlp": 0.01065185, + "balance_loss_clip": 1.04030573, + "balance_loss_mlp": 1.06190968, + "epoch": 0.059642266646625586, + "flos": 20594877793920.0, + "grad_norm": 2.866628977756486, + "language_loss": 0.76876801, + "learning_rate": 3.9907906339273674e-06, + "loss": 0.79158413, + "num_input_tokens_seen": 21204230, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 992, + "time_per_iteration": 2.5149648189544678 + }, + { + "auxiliary_loss_clip": 0.01214781, + "auxiliary_loss_mlp": 0.0106048, + "balance_loss_clip": 1.03701937, + "balance_loss_mlp": 1.06251192, + "epoch": 0.05970238989929355, + "flos": 19352655792000.0, + "grad_norm": 2.726921793738851, + "language_loss": 0.74594641, + "learning_rate": 3.9907532643693215e-06, + "loss": 0.76869899, + "num_input_tokens_seen": 21222655, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.5234375, + "step": 993, + "time_per_iteration": 2.4739816188812256 + }, + { + "auxiliary_loss_clip": 0.01214249, + "auxiliary_loss_mlp": 0.01071365, + "balance_loss_clip": 1.04560351, + "balance_loss_mlp": 1.06326771, + "epoch": 0.05976251315196152, + "flos": 30264040926720.0, + "grad_norm": 3.2517233877247396, + "language_loss": 0.78911841, + "learning_rate": 3.990715819321712e-06, + "loss": 0.81197453, + "num_input_tokens_seen": 21242310, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 994, + "time_per_iteration": 2.5408835411071777 + }, + { + "auxiliary_loss_clip": 0.01214677, + "auxiliary_loss_mlp": 0.01082728, + "balance_loss_clip": 1.05768251, + "balance_loss_mlp": 1.06170893, + "epoch": 0.05982263640462949, + "flos": 23185150243200.0, + "grad_norm": 2.42517884603863, + "language_loss": 0.79639304, + "learning_rate": 3.99067829878596e-06, + "loss": 0.81936711, + "num_input_tokens_seen": 21261410, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 995, + "time_per_iteration": 2.5062758922576904 + }, + { + "auxiliary_loss_clip": 0.01212085, + "auxiliary_loss_mlp": 0.01065995, + "balance_loss_clip": 1.04022169, + "balance_loss_mlp": 1.05969059, + "epoch": 0.05988275965729746, + "flos": 27850879463040.0, + "grad_norm": 2.536496545288829, + "language_loss": 0.86939722, + "learning_rate": 3.990640702763487e-06, + "loss": 0.89217806, + "num_input_tokens_seen": 21280080, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 996, + "time_per_iteration": 2.5236001014709473 + }, + { + "auxiliary_loss_clip": 0.01217352, + "auxiliary_loss_mlp": 0.01070048, + "balance_loss_clip": 1.04098463, + "balance_loss_mlp": 1.06309104, + "epoch": 0.05994288290996543, + "flos": 24680003575680.0, + "grad_norm": 4.013698471354103, + "language_loss": 0.88192105, + "learning_rate": 3.990603031255718e-06, + "loss": 0.90479505, + "num_input_tokens_seen": 21296765, + "router_z_loss_clip": 0.29101562, + "router_z_loss_mlp": 1.546875, + "step": 997, + "time_per_iteration": 2.483116626739502 + }, + { + "auxiliary_loss_clip": 0.01079761, + "auxiliary_loss_mlp": 0.01004407, + "balance_loss_clip": 0.9993524, + "balance_loss_mlp": 1.01837301, + "epoch": 0.0600030061626334, + "flos": 69929568835200.0, + "grad_norm": 1.020759515587473, + "language_loss": 0.75442117, + "learning_rate": 3.990565284264083e-06, + "loss": 0.77526283, + "num_input_tokens_seen": 21363345, + "router_z_loss_clip": 0.05053711, + "router_z_loss_mlp": 0.6171875, + "step": 998, + "time_per_iteration": 3.152331590652466 + }, + { + "auxiliary_loss_clip": 0.01213812, + "auxiliary_loss_mlp": 0.01067708, + "balance_loss_clip": 1.04179215, + "balance_loss_mlp": 1.0626508, + "epoch": 0.06006312941530137, + "flos": 26540140268160.0, + "grad_norm": 1.8375420281697645, + "language_loss": 0.75796127, + "learning_rate": 3.990527461790013e-06, + "loss": 0.7807765, + "num_input_tokens_seen": 21385290, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.515625, + "step": 999, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01212853, + "auxiliary_loss_mlp": 0.0106227, + "balance_loss_clip": 1.03575778, + "balance_loss_mlp": 1.05894446, + "epoch": 0.060123252667969335, + "flos": 27344000689920.0, + "grad_norm": 1.9091686508511199, + "language_loss": 0.82658899, + "learning_rate": 3.990489563834943e-06, + "loss": 0.8493402, + "num_input_tokens_seen": 21407625, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5390625, + "step": 1000, + "time_per_iteration": 2.5369935035705566 + }, + { + "auxiliary_loss_clip": 0.01215386, + "auxiliary_loss_mlp": 0.01057348, + "balance_loss_clip": 1.03282714, + "balance_loss_mlp": 1.06143069, + "epoch": 0.06018337592063731, + "flos": 27016710940800.0, + "grad_norm": 3.4065508827059783, + "language_loss": 0.85644853, + "learning_rate": 3.990451590400309e-06, + "loss": 0.8791759, + "num_input_tokens_seen": 21426835, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.5390625, + "step": 1001, + "time_per_iteration": 2.4972190856933594 + }, + { + "auxiliary_loss_clip": 0.0120879, + "auxiliary_loss_mlp": 0.01063055, + "balance_loss_clip": 1.0376749, + "balance_loss_mlp": 1.0587517, + "epoch": 0.06024349917330528, + "flos": 25592960580480.0, + "grad_norm": 2.156321640703371, + "language_loss": 0.74386394, + "learning_rate": 3.990413541487551e-06, + "loss": 0.76658237, + "num_input_tokens_seen": 21444920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5, + "step": 1002, + "time_per_iteration": 2.531172275543213 + }, + { + "auxiliary_loss_clip": 0.01211576, + "auxiliary_loss_mlp": 0.01065904, + "balance_loss_clip": 1.04019034, + "balance_loss_mlp": 1.06015134, + "epoch": 0.060303622425973244, + "flos": 26133271937280.0, + "grad_norm": 3.1165374575777145, + "language_loss": 0.75346643, + "learning_rate": 3.990375417098112e-06, + "loss": 0.77624118, + "num_input_tokens_seen": 21463555, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1003, + "time_per_iteration": 2.508817434310913 + }, + { + "auxiliary_loss_clip": 0.01219434, + "auxiliary_loss_mlp": 0.01065938, + "balance_loss_clip": 1.04047489, + "balance_loss_mlp": 1.06255794, + "epoch": 0.060363745678641216, + "flos": 20377187418240.0, + "grad_norm": 2.2578292515807603, + "language_loss": 0.70071733, + "learning_rate": 3.990337217233437e-06, + "loss": 0.723571, + "num_input_tokens_seen": 21481990, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5625, + "step": 1004, + "time_per_iteration": 2.480116844177246 + }, + { + "auxiliary_loss_clip": 0.01218526, + "auxiliary_loss_mlp": 0.01073584, + "balance_loss_clip": 1.04810917, + "balance_loss_mlp": 1.06360686, + "epoch": 0.06042386893130918, + "flos": 17749172753280.0, + "grad_norm": 2.248554137518493, + "language_loss": 0.83246684, + "learning_rate": 3.990298941894976e-06, + "loss": 0.85538793, + "num_input_tokens_seen": 21500385, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.546875, + "step": 1005, + "time_per_iteration": 2.449733018875122 + }, + { + "auxiliary_loss_clip": 0.01077543, + "auxiliary_loss_mlp": 0.01007523, + "balance_loss_clip": 1.00306416, + "balance_loss_mlp": 1.0157814, + "epoch": 0.06048399218397715, + "flos": 68538496872960.0, + "grad_norm": 0.8959746990508154, + "language_loss": 0.59000289, + "learning_rate": 3.9902605910841794e-06, + "loss": 0.61085355, + "num_input_tokens_seen": 21561040, + "router_z_loss_clip": 0.04467773, + "router_z_loss_mlp": 0.6171875, + "step": 1006, + "time_per_iteration": 3.1583423614501953 + }, + { + "auxiliary_loss_clip": 0.01209886, + "auxiliary_loss_mlp": 0.0105727, + "balance_loss_clip": 1.03203392, + "balance_loss_mlp": 1.05658197, + "epoch": 0.060544115436645125, + "flos": 23258515772160.0, + "grad_norm": 2.271524805944984, + "language_loss": 0.7428897, + "learning_rate": 3.990222164802503e-06, + "loss": 0.76556122, + "num_input_tokens_seen": 21580655, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.53125, + "step": 1007, + "time_per_iteration": 2.49139666557312 + }, + { + "auxiliary_loss_clip": 0.01212867, + "auxiliary_loss_mlp": 0.01055047, + "balance_loss_clip": 1.02930975, + "balance_loss_mlp": 1.05897522, + "epoch": 0.06060423868931309, + "flos": 23878441624320.0, + "grad_norm": 1.8583948299039934, + "language_loss": 0.80739897, + "learning_rate": 3.9901836630514006e-06, + "loss": 0.83007812, + "num_input_tokens_seen": 21599650, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5390625, + "step": 1008, + "time_per_iteration": 2.4990036487579346 + }, + { + "auxiliary_loss_clip": 0.01213893, + "auxiliary_loss_mlp": 0.01055804, + "balance_loss_clip": 1.03082955, + "balance_loss_mlp": 1.06254637, + "epoch": 0.06066436194198106, + "flos": 18728061171840.0, + "grad_norm": 1.935763632111394, + "language_loss": 0.77840835, + "learning_rate": 3.990145085832335e-06, + "loss": 0.80110532, + "num_input_tokens_seen": 21617550, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.515625, + "step": 1009, + "time_per_iteration": 2.4785048961639404 + }, + { + "auxiliary_loss_clip": 0.01210213, + "auxiliary_loss_mlp": 0.01059495, + "balance_loss_clip": 1.03537917, + "balance_loss_mlp": 1.06082368, + "epoch": 0.06072448519464903, + "flos": 24640465680000.0, + "grad_norm": 2.1058592784097567, + "language_loss": 0.93059653, + "learning_rate": 3.990106433146769e-06, + "loss": 0.95329368, + "num_input_tokens_seen": 21635865, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4921875, + "step": 1010, + "time_per_iteration": 2.507596015930176 + }, + { + "auxiliary_loss_clip": 0.01219036, + "auxiliary_loss_mlp": 0.01066438, + "balance_loss_clip": 1.0390203, + "balance_loss_mlp": 1.05885124, + "epoch": 0.060784608447317, + "flos": 17378825575680.0, + "grad_norm": 3.1716667034247843, + "language_loss": 0.71846473, + "learning_rate": 3.9900677049961665e-06, + "loss": 0.74131954, + "num_input_tokens_seen": 21653945, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.6015625, + "step": 1011, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01214432, + "auxiliary_loss_mlp": 0.01070485, + "balance_loss_clip": 1.04388905, + "balance_loss_mlp": 1.05902421, + "epoch": 0.06084473169998497, + "flos": 23692208584320.0, + "grad_norm": 2.5871469840663535, + "language_loss": 0.87542284, + "learning_rate": 3.990028901381999e-06, + "loss": 0.89827204, + "num_input_tokens_seen": 21671230, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5546875, + "step": 1012, + "time_per_iteration": 2.4876151084899902 + }, + { + "auxiliary_loss_clip": 0.01206171, + "auxiliary_loss_mlp": 0.01060353, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05505085, + "epoch": 0.06090485495265294, + "flos": 23546339452800.0, + "grad_norm": 1.8956263482043672, + "language_loss": 0.76679665, + "learning_rate": 3.989990022305734e-06, + "loss": 0.78946191, + "num_input_tokens_seen": 21691155, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.515625, + "step": 1013, + "time_per_iteration": 2.4874446392059326 + }, + { + "auxiliary_loss_clip": 0.01215089, + "auxiliary_loss_mlp": 0.01067055, + "balance_loss_clip": 1.03946972, + "balance_loss_mlp": 1.05924904, + "epoch": 0.06096497820532091, + "flos": 20339301548160.0, + "grad_norm": 2.654718290448769, + "language_loss": 0.85651302, + "learning_rate": 3.98995106776885e-06, + "loss": 0.87933445, + "num_input_tokens_seen": 21707405, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5546875, + "step": 1014, + "time_per_iteration": 2.483774423599243 + }, + { + "auxiliary_loss_clip": 0.0122011, + "auxiliary_loss_mlp": 0.01067578, + "balance_loss_clip": 1.03996944, + "balance_loss_mlp": 1.06207335, + "epoch": 0.061025101457988874, + "flos": 26939035779840.0, + "grad_norm": 2.4287988001966028, + "language_loss": 0.72807163, + "learning_rate": 3.98991203777282e-06, + "loss": 0.75094855, + "num_input_tokens_seen": 21728090, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.578125, + "step": 1015, + "time_per_iteration": 2.520026206970215 + }, + { + "auxiliary_loss_clip": 0.01207162, + "auxiliary_loss_mlp": 0.01068406, + "balance_loss_clip": 1.04290748, + "balance_loss_mlp": 1.0576005, + "epoch": 0.061085224710656846, + "flos": 25375054723200.0, + "grad_norm": 1.6555956389633335, + "language_loss": 0.79197502, + "learning_rate": 3.9898729323191275e-06, + "loss": 0.8147307, + "num_input_tokens_seen": 21747950, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4921875, + "step": 1016, + "time_per_iteration": 2.5177054405212402 + }, + { + "auxiliary_loss_clip": 0.01210352, + "auxiliary_loss_mlp": 0.01060413, + "balance_loss_clip": 1.03609443, + "balance_loss_mlp": 1.0571332, + "epoch": 0.06114534796332482, + "flos": 24824759385600.0, + "grad_norm": 1.934405213560846, + "language_loss": 0.76170123, + "learning_rate": 3.989833751409254e-06, + "loss": 0.78440881, + "num_input_tokens_seen": 21767900, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.53125, + "step": 1017, + "time_per_iteration": 2.517730951309204 + }, + { + "auxiliary_loss_clip": 0.01220983, + "auxiliary_loss_mlp": 0.01069505, + "balance_loss_clip": 1.04331422, + "balance_loss_mlp": 1.06240773, + "epoch": 0.061205471215992784, + "flos": 20631434860800.0, + "grad_norm": 1.873264658326973, + "language_loss": 0.86145842, + "learning_rate": 3.989794495044685e-06, + "loss": 0.88436329, + "num_input_tokens_seen": 21787375, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5859375, + "step": 1018, + "time_per_iteration": 5.324457883834839 + }, + { + "auxiliary_loss_clip": 0.01206709, + "auxiliary_loss_mlp": 0.01071464, + "balance_loss_clip": 1.045012, + "balance_loss_mlp": 1.05659163, + "epoch": 0.061265594468660756, + "flos": 16508351381760.0, + "grad_norm": 2.696758126666256, + "language_loss": 0.77535981, + "learning_rate": 3.989755163226909e-06, + "loss": 0.79814154, + "num_input_tokens_seen": 21806275, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5, + "step": 1019, + "time_per_iteration": 2.453768253326416 + }, + { + "auxiliary_loss_clip": 0.01210848, + "auxiliary_loss_mlp": 0.01061489, + "balance_loss_clip": 1.03559661, + "balance_loss_mlp": 1.05749679, + "epoch": 0.06132571772132872, + "flos": 26246211275520.0, + "grad_norm": 1.8458417378275351, + "language_loss": 0.84254557, + "learning_rate": 3.989715755957418e-06, + "loss": 0.86526895, + "num_input_tokens_seen": 21826430, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.53125, + "step": 1020, + "time_per_iteration": 2.5126123428344727 + }, + { + "auxiliary_loss_clip": 0.01217116, + "auxiliary_loss_mlp": 0.01060663, + "balance_loss_clip": 1.0352596, + "balance_loss_mlp": 1.06234074, + "epoch": 0.06138584097399669, + "flos": 37414788768000.0, + "grad_norm": 2.186416819505148, + "language_loss": 0.79234397, + "learning_rate": 3.989676273237705e-06, + "loss": 0.81512177, + "num_input_tokens_seen": 21847800, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1021, + "time_per_iteration": 2.583918333053589 + }, + { + "auxiliary_loss_clip": 0.01207219, + "auxiliary_loss_mlp": 0.01064403, + "balance_loss_clip": 1.04207504, + "balance_loss_mlp": 1.05748677, + "epoch": 0.061445964226664665, + "flos": 17420661941760.0, + "grad_norm": 2.2026341390443434, + "language_loss": 0.87493509, + "learning_rate": 3.9896367150692705e-06, + "loss": 0.89765131, + "num_input_tokens_seen": 21863385, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.5, + "step": 1022, + "time_per_iteration": 2.441298007965088 + }, + { + "auxiliary_loss_clip": 0.01213359, + "auxiliary_loss_mlp": 0.0106856, + "balance_loss_clip": 1.04353857, + "balance_loss_mlp": 1.06052542, + "epoch": 0.06150608747933263, + "flos": 22600021691520.0, + "grad_norm": 1.752710779550117, + "language_loss": 0.82776564, + "learning_rate": 3.989597081453611e-06, + "loss": 0.85058486, + "num_input_tokens_seen": 21881880, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.53125, + "step": 1023, + "time_per_iteration": 2.5027952194213867 + }, + { + "auxiliary_loss_clip": 0.01079025, + "auxiliary_loss_mlp": 0.01003997, + "balance_loss_clip": 0.99944335, + "balance_loss_mlp": 1.01796818, + "epoch": 0.0615662107320006, + "flos": 56741482005120.0, + "grad_norm": 0.8999264202466762, + "language_loss": 0.65078986, + "learning_rate": 3.989557372392231e-06, + "loss": 0.67162001, + "num_input_tokens_seen": 21940550, + "router_z_loss_clip": 0.0456543, + "router_z_loss_mlp": 0.609375, + "step": 1024, + "time_per_iteration": 3.0969655513763428 + }, + { + "auxiliary_loss_clip": 0.01212272, + "auxiliary_loss_mlp": 0.01066841, + "balance_loss_clip": 1.04123473, + "balance_loss_mlp": 1.05936897, + "epoch": 0.06162633398466857, + "flos": 22564793427840.0, + "grad_norm": 1.9303372998519377, + "language_loss": 0.88293028, + "learning_rate": 3.989517587886636e-06, + "loss": 0.90572149, + "num_input_tokens_seen": 21958390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.53125, + "step": 1025, + "time_per_iteration": 2.5229876041412354 + }, + { + "auxiliary_loss_clip": 0.01212316, + "auxiliary_loss_mlp": 0.01059432, + "balance_loss_clip": 1.03513718, + "balance_loss_mlp": 1.05916524, + "epoch": 0.06168645723733654, + "flos": 25593104234880.0, + "grad_norm": 1.519276165786755, + "language_loss": 0.84567487, + "learning_rate": 3.989477727938335e-06, + "loss": 0.86839235, + "num_input_tokens_seen": 21978625, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.53125, + "step": 1026, + "time_per_iteration": 2.5304806232452393 + }, + { + "auxiliary_loss_clip": 0.01212365, + "auxiliary_loss_mlp": 0.01071013, + "balance_loss_clip": 1.04614556, + "balance_loss_mlp": 1.05798197, + "epoch": 0.06174658049000451, + "flos": 15997917162240.0, + "grad_norm": 1.9431802827698534, + "language_loss": 0.82320756, + "learning_rate": 3.989437792548839e-06, + "loss": 0.84604132, + "num_input_tokens_seen": 21996035, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.546875, + "step": 1027, + "time_per_iteration": 2.4508368968963623 + }, + { + "auxiliary_loss_clip": 0.01209611, + "auxiliary_loss_mlp": 0.01058329, + "balance_loss_clip": 1.03343821, + "balance_loss_mlp": 1.05799866, + "epoch": 0.06180670374267248, + "flos": 11285970117120.0, + "grad_norm": 2.262386050001272, + "language_loss": 0.84232426, + "learning_rate": 3.989397781719663e-06, + "loss": 0.86500365, + "num_input_tokens_seen": 22011625, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1028, + "time_per_iteration": 2.4485137462615967 + }, + { + "auxiliary_loss_clip": 0.01077664, + "auxiliary_loss_mlp": 0.01009618, + "balance_loss_clip": 1.00544536, + "balance_loss_mlp": 1.01686025, + "epoch": 0.06186682699534045, + "flos": 65130142216320.0, + "grad_norm": 0.9476883841381922, + "language_loss": 0.60497737, + "learning_rate": 3.989357695452323e-06, + "loss": 0.6258502, + "num_input_tokens_seen": 22066035, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.609375, + "step": 1029, + "time_per_iteration": 2.8714137077331543 + }, + { + "auxiliary_loss_clip": 0.0120304, + "auxiliary_loss_mlp": 0.01066238, + "balance_loss_clip": 1.0419786, + "balance_loss_mlp": 1.05338669, + "epoch": 0.061926950248008414, + "flos": 21105742976640.0, + "grad_norm": 2.297452518318954, + "language_loss": 0.82309926, + "learning_rate": 3.98931753374834e-06, + "loss": 0.84579194, + "num_input_tokens_seen": 22085015, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4921875, + "step": 1030, + "time_per_iteration": 2.4705348014831543 + }, + { + "auxiliary_loss_clip": 0.01214194, + "auxiliary_loss_mlp": 0.01071397, + "balance_loss_clip": 1.04586279, + "balance_loss_mlp": 1.06025672, + "epoch": 0.061987073500676386, + "flos": 17748454481280.0, + "grad_norm": 2.391039807046215, + "language_loss": 0.80262065, + "learning_rate": 3.989277296609237e-06, + "loss": 0.82547653, + "num_input_tokens_seen": 22102775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1031, + "time_per_iteration": 2.447964906692505 + }, + { + "auxiliary_loss_clip": 0.0121101, + "auxiliary_loss_mlp": 0.01075497, + "balance_loss_clip": 1.04919958, + "balance_loss_mlp": 1.05865717, + "epoch": 0.06204719675334436, + "flos": 21836237869440.0, + "grad_norm": 1.6245278130098144, + "language_loss": 0.77141201, + "learning_rate": 3.98923698403654e-06, + "loss": 0.79427713, + "num_input_tokens_seen": 22121680, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.5234375, + "step": 1032, + "time_per_iteration": 2.475891590118408 + }, + { + "auxiliary_loss_clip": 0.01205906, + "auxiliary_loss_mlp": 0.01069412, + "balance_loss_clip": 1.04350805, + "balance_loss_mlp": 1.05307126, + "epoch": 0.06210732000601232, + "flos": 19353697286400.0, + "grad_norm": 3.949793190746779, + "language_loss": 0.89276892, + "learning_rate": 3.989196596031776e-06, + "loss": 0.91552204, + "num_input_tokens_seen": 22138155, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1033, + "time_per_iteration": 2.5332658290863037 + }, + { + "auxiliary_loss_clip": 0.01212647, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03437293, + "balance_loss_mlp": 1.05739737, + "epoch": 0.062167443258680295, + "flos": 24749382695040.0, + "grad_norm": 2.160025730572359, + "language_loss": 0.84795135, + "learning_rate": 3.989156132596479e-06, + "loss": 0.87066996, + "num_input_tokens_seen": 22157420, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.5546875, + "step": 1034, + "time_per_iteration": 2.507636785507202 + }, + { + "auxiliary_loss_clip": 0.01202421, + "auxiliary_loss_mlp": 0.01060051, + "balance_loss_clip": 1.03399241, + "balance_loss_mlp": 1.05694687, + "epoch": 0.06222756651134827, + "flos": 34458478773120.0, + "grad_norm": 3.176440156188905, + "language_loss": 0.81156218, + "learning_rate": 3.989115593732182e-06, + "loss": 0.83418697, + "num_input_tokens_seen": 22178620, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.453125, + "step": 1035, + "time_per_iteration": 2.624635696411133 + }, + { + "auxiliary_loss_clip": 0.01212161, + "auxiliary_loss_mlp": 0.01068413, + "balance_loss_clip": 1.04051828, + "balance_loss_mlp": 1.06080353, + "epoch": 0.06228768976401623, + "flos": 25666469763840.0, + "grad_norm": 2.252599829484163, + "language_loss": 0.78701359, + "learning_rate": 3.989074979440421e-06, + "loss": 0.80981934, + "num_input_tokens_seen": 22197125, + "router_z_loss_clip": 0.27929688, + "router_z_loss_mlp": 1.515625, + "step": 1036, + "time_per_iteration": 2.53027081489563 + }, + { + "auxiliary_loss_clip": 0.01204167, + "auxiliary_loss_mlp": 0.01068533, + "balance_loss_clip": 1.04334402, + "balance_loss_mlp": 1.05620134, + "epoch": 0.062347813016684205, + "flos": 25295619795840.0, + "grad_norm": 1.670767972712633, + "language_loss": 0.86802149, + "learning_rate": 3.989034289722739e-06, + "loss": 0.8907485, + "num_input_tokens_seen": 22217575, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1037, + "time_per_iteration": 2.506011724472046 + }, + { + "auxiliary_loss_clip": 0.01206019, + "auxiliary_loss_mlp": 0.01057504, + "balance_loss_clip": 1.02990723, + "balance_loss_mlp": 1.05728471, + "epoch": 0.06240793626935217, + "flos": 26907039740160.0, + "grad_norm": 2.1914513209480933, + "language_loss": 0.81051469, + "learning_rate": 3.988993524580676e-06, + "loss": 0.83314991, + "num_input_tokens_seen": 22236840, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1038, + "time_per_iteration": 2.486758232116699 + }, + { + "auxiliary_loss_clip": 0.01205947, + "auxiliary_loss_mlp": 0.01072566, + "balance_loss_clip": 1.04587555, + "balance_loss_mlp": 1.05856836, + "epoch": 0.06246805952202014, + "flos": 21615782146560.0, + "grad_norm": 2.3663261426095965, + "language_loss": 0.85336804, + "learning_rate": 3.98895268401578e-06, + "loss": 0.87615323, + "num_input_tokens_seen": 22256465, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1039, + "time_per_iteration": 2.489241123199463 + }, + { + "auxiliary_loss_clip": 0.01207559, + "auxiliary_loss_mlp": 0.01067632, + "balance_loss_clip": 1.0417757, + "balance_loss_mlp": 1.05744672, + "epoch": 0.0625281827746881, + "flos": 19311896833920.0, + "grad_norm": 1.9774289629637263, + "language_loss": 0.80853289, + "learning_rate": 3.9889117680296e-06, + "loss": 0.83128488, + "num_input_tokens_seen": 22274025, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.5, + "step": 1040, + "time_per_iteration": 2.480022668838501 + }, + { + "auxiliary_loss_clip": 0.01213203, + "auxiliary_loss_mlp": 0.01067746, + "balance_loss_clip": 1.04155588, + "balance_loss_mlp": 1.06227219, + "epoch": 0.06258830602735609, + "flos": 27745769289600.0, + "grad_norm": 2.535271913081881, + "language_loss": 0.69440711, + "learning_rate": 3.988870776623685e-06, + "loss": 0.71721661, + "num_input_tokens_seen": 22292245, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.5078125, + "step": 1041, + "time_per_iteration": 2.5417978763580322 + }, + { + "auxiliary_loss_clip": 0.01210541, + "auxiliary_loss_mlp": 0.0106006, + "balance_loss_clip": 1.03360724, + "balance_loss_mlp": 1.05743289, + "epoch": 0.06264842928002405, + "flos": 23222605150080.0, + "grad_norm": 1.9564735382917973, + "language_loss": 0.80983013, + "learning_rate": 3.9888297097995905e-06, + "loss": 0.83253616, + "num_input_tokens_seen": 22311455, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.53125, + "step": 1042, + "time_per_iteration": 2.478926181793213 + }, + { + "auxiliary_loss_clip": 0.01210242, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03352678, + "balance_loss_mlp": 1.05925727, + "epoch": 0.06270855253269202, + "flos": 38399495189760.0, + "grad_norm": 1.9466384226705415, + "language_loss": 0.76463902, + "learning_rate": 3.988788567558874e-06, + "loss": 0.78732038, + "num_input_tokens_seen": 22333750, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.515625, + "step": 1043, + "time_per_iteration": 2.6262781620025635 + }, + { + "auxiliary_loss_clip": 0.01203702, + "auxiliary_loss_mlp": 0.01066445, + "balance_loss_clip": 1.04174471, + "balance_loss_mlp": 1.05835676, + "epoch": 0.06276867578535998, + "flos": 22453542028800.0, + "grad_norm": 1.8860277298285366, + "language_loss": 0.92454541, + "learning_rate": 3.988747349903097e-06, + "loss": 0.94724691, + "num_input_tokens_seen": 22351940, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1044, + "time_per_iteration": 2.4886953830718994 + }, + { + "auxiliary_loss_clip": 0.01204359, + "auxiliary_loss_mlp": 0.01073486, + "balance_loss_clip": 1.04824948, + "balance_loss_mlp": 1.05475259, + "epoch": 0.06282879903802796, + "flos": 22930435923840.0, + "grad_norm": 1.9539908597303346, + "language_loss": 0.8581354, + "learning_rate": 3.988706056833821e-06, + "loss": 0.88091385, + "num_input_tokens_seen": 22372085, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.5, + "step": 1045, + "time_per_iteration": 2.5382347106933594 + }, + { + "auxiliary_loss_clip": 0.01203094, + "auxiliary_loss_mlp": 0.01073753, + "balance_loss_clip": 1.04900479, + "balance_loss_mlp": 1.05618775, + "epoch": 0.06288892229069593, + "flos": 34819237019520.0, + "grad_norm": 2.0798822187092094, + "language_loss": 0.77675486, + "learning_rate": 3.9886646883526125e-06, + "loss": 0.79952335, + "num_input_tokens_seen": 22392020, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.46875, + "step": 1046, + "time_per_iteration": 2.548157215118408 + }, + { + "auxiliary_loss_clip": 0.01206605, + "auxiliary_loss_mlp": 0.01074859, + "balance_loss_clip": 1.04981279, + "balance_loss_mlp": 1.05837655, + "epoch": 0.06294904554336389, + "flos": 19427134642560.0, + "grad_norm": 2.197016946040243, + "language_loss": 0.77317166, + "learning_rate": 3.988623244461039e-06, + "loss": 0.79598629, + "num_input_tokens_seen": 22411180, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4765625, + "step": 1047, + "time_per_iteration": 2.467973232269287 + }, + { + "auxiliary_loss_clip": 0.0121283, + "auxiliary_loss_mlp": 0.01061299, + "balance_loss_clip": 1.03584743, + "balance_loss_mlp": 1.05874014, + "epoch": 0.06300916879603187, + "flos": 40661867358720.0, + "grad_norm": 2.3103480986625753, + "language_loss": 0.7696203, + "learning_rate": 3.988581725160672e-06, + "loss": 0.79236162, + "num_input_tokens_seen": 22435105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.546875, + "step": 1048, + "time_per_iteration": 2.636072874069214 + }, + { + "auxiliary_loss_clip": 0.01209565, + "auxiliary_loss_mlp": 0.01072791, + "balance_loss_clip": 1.0470655, + "balance_loss_mlp": 1.0583266, + "epoch": 0.06306929204869983, + "flos": 23804142341760.0, + "grad_norm": 2.2069714466600656, + "language_loss": 0.77757037, + "learning_rate": 3.988540130453087e-06, + "loss": 0.80039394, + "num_input_tokens_seen": 22452710, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1049, + "time_per_iteration": 2.5173420906066895 + }, + { + "auxiliary_loss_clip": 0.01207985, + "auxiliary_loss_mlp": 0.01065489, + "balance_loss_clip": 1.04047871, + "balance_loss_mlp": 1.05734015, + "epoch": 0.0631294153013678, + "flos": 18915802583040.0, + "grad_norm": 2.316298014027776, + "language_loss": 0.83165503, + "learning_rate": 3.988498460339862e-06, + "loss": 0.85438979, + "num_input_tokens_seen": 22470175, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.5078125, + "step": 1050, + "time_per_iteration": 2.4742541313171387 + }, + { + "auxiliary_loss_clip": 0.01204381, + "auxiliary_loss_mlp": 0.01062607, + "balance_loss_clip": 1.03852665, + "balance_loss_mlp": 1.05776763, + "epoch": 0.06318953855403578, + "flos": 24280174310400.0, + "grad_norm": 2.1475970013183563, + "language_loss": 0.76909173, + "learning_rate": 3.988456714822575e-06, + "loss": 0.79176152, + "num_input_tokens_seen": 22490020, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1051, + "time_per_iteration": 2.4629740715026855 + }, + { + "auxiliary_loss_clip": 0.01207556, + "auxiliary_loss_mlp": 0.01070398, + "balance_loss_clip": 1.04487562, + "balance_loss_mlp": 1.05788827, + "epoch": 0.06324966180670374, + "flos": 22528918719360.0, + "grad_norm": 2.090947022989376, + "language_loss": 0.80053556, + "learning_rate": 3.98841489390281e-06, + "loss": 0.82331514, + "num_input_tokens_seen": 22509685, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4921875, + "step": 1052, + "time_per_iteration": 2.4729230403900146 + }, + { + "auxiliary_loss_clip": 0.01209047, + "auxiliary_loss_mlp": 0.01064567, + "balance_loss_clip": 1.03911567, + "balance_loss_mlp": 1.05839717, + "epoch": 0.06330978505937171, + "flos": 15778107884160.0, + "grad_norm": 2.21177767113968, + "language_loss": 0.78088665, + "learning_rate": 3.988372997582155e-06, + "loss": 0.80362272, + "num_input_tokens_seen": 22527905, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5078125, + "step": 1053, + "time_per_iteration": 2.433969736099243 + }, + { + "auxiliary_loss_clip": 0.01209123, + "auxiliary_loss_mlp": 0.01055135, + "balance_loss_clip": 1.03094769, + "balance_loss_mlp": 1.0578481, + "epoch": 0.06336990831203967, + "flos": 21471098163840.0, + "grad_norm": 1.8421697124920164, + "language_loss": 0.84737611, + "learning_rate": 3.988331025862195e-06, + "loss": 0.8700186, + "num_input_tokens_seen": 22546335, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.515625, + "step": 1054, + "time_per_iteration": 2.4986183643341064 + }, + { + "auxiliary_loss_clip": 0.01205973, + "auxiliary_loss_mlp": 0.01065192, + "balance_loss_clip": 1.04051518, + "balance_loss_mlp": 1.05870843, + "epoch": 0.06343003156470765, + "flos": 18478877546880.0, + "grad_norm": 1.9255333357469135, + "language_loss": 0.8566432, + "learning_rate": 3.9882889787445225e-06, + "loss": 0.87935483, + "num_input_tokens_seen": 22563885, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4765625, + "step": 1055, + "time_per_iteration": 2.5098037719726562 + }, + { + "auxiliary_loss_clip": 0.0121179, + "auxiliary_loss_mlp": 0.01071097, + "balance_loss_clip": 1.0451932, + "balance_loss_mlp": 1.05891657, + "epoch": 0.06349015481737562, + "flos": 25154886309120.0, + "grad_norm": 2.390503126540762, + "language_loss": 0.80966836, + "learning_rate": 3.988246856230734e-06, + "loss": 0.83249724, + "num_input_tokens_seen": 22583035, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.53125, + "step": 1056, + "time_per_iteration": 2.4944088459014893 + }, + { + "auxiliary_loss_clip": 0.01211818, + "auxiliary_loss_mlp": 0.01065832, + "balance_loss_clip": 1.03816319, + "balance_loss_mlp": 1.05503476, + "epoch": 0.06355027807004358, + "flos": 26871775562880.0, + "grad_norm": 2.70684555522199, + "language_loss": 0.81153649, + "learning_rate": 3.988204658322426e-06, + "loss": 0.83431304, + "num_input_tokens_seen": 22605055, + "router_z_loss_clip": 0.27734375, + "router_z_loss_mlp": 1.5703125, + "step": 1057, + "time_per_iteration": 2.5327882766723633 + }, + { + "auxiliary_loss_clip": 0.0119703, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03401923, + "balance_loss_mlp": 1.054492, + "epoch": 0.06361040132271156, + "flos": 21396691140480.0, + "grad_norm": 2.2830641052403826, + "language_loss": 0.8369416, + "learning_rate": 3.988162385021196e-06, + "loss": 0.85947585, + "num_input_tokens_seen": 22623760, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.421875, + "step": 1058, + "time_per_iteration": 2.4742424488067627 + }, + { + "auxiliary_loss_clip": 0.01208572, + "auxiliary_loss_mlp": 0.01067718, + "balance_loss_clip": 1.03934646, + "balance_loss_mlp": 1.05714464, + "epoch": 0.06367052457537953, + "flos": 25733765894400.0, + "grad_norm": 1.9712110015930453, + "language_loss": 0.87264961, + "learning_rate": 3.988120036328651e-06, + "loss": 0.8954125, + "num_input_tokens_seen": 22643000, + "router_z_loss_clip": 0.28515625, + "router_z_loss_mlp": 1.515625, + "step": 1059, + "time_per_iteration": 5.514882564544678 + }, + { + "auxiliary_loss_clip": 0.01213823, + "auxiliary_loss_mlp": 0.01068179, + "balance_loss_clip": 1.04273927, + "balance_loss_mlp": 1.06130195, + "epoch": 0.0637306478280475, + "flos": 17631420992640.0, + "grad_norm": 2.227642611819728, + "language_loss": 0.9117676, + "learning_rate": 3.988077612246394e-06, + "loss": 0.9345876, + "num_input_tokens_seen": 22660460, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.5234375, + "step": 1060, + "time_per_iteration": 3.8977622985839844 + }, + { + "auxiliary_loss_clip": 0.01204952, + "auxiliary_loss_mlp": 0.01062848, + "balance_loss_clip": 1.03691983, + "balance_loss_mlp": 1.05582809, + "epoch": 0.06379077108071547, + "flos": 13662610427520.0, + "grad_norm": 1.9159755464944204, + "language_loss": 0.87713706, + "learning_rate": 3.988035112776035e-06, + "loss": 0.89981508, + "num_input_tokens_seen": 22679270, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4921875, + "step": 1061, + "time_per_iteration": 2.4825663566589355 + }, + { + "auxiliary_loss_clip": 0.01213048, + "auxiliary_loss_mlp": 0.01066139, + "balance_loss_clip": 1.03862584, + "balance_loss_mlp": 1.05683804, + "epoch": 0.06385089433338344, + "flos": 28478849961600.0, + "grad_norm": 2.167309005799961, + "language_loss": 0.771905, + "learning_rate": 3.987992537919185e-06, + "loss": 0.79469687, + "num_input_tokens_seen": 22699330, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.5625, + "step": 1062, + "time_per_iteration": 2.5576398372650146 + }, + { + "auxiliary_loss_clip": 0.01206834, + "auxiliary_loss_mlp": 0.01063844, + "balance_loss_clip": 1.03896523, + "balance_loss_mlp": 1.05504322, + "epoch": 0.0639110175860514, + "flos": 24311057028480.0, + "grad_norm": 2.0414192004570872, + "language_loss": 0.86835265, + "learning_rate": 3.987949887677459e-06, + "loss": 0.89105946, + "num_input_tokens_seen": 22717945, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.515625, + "step": 1063, + "time_per_iteration": 2.472473382949829 + }, + { + "auxiliary_loss_clip": 0.01206458, + "auxiliary_loss_mlp": 0.0106328, + "balance_loss_clip": 1.03747082, + "balance_loss_mlp": 1.05539751, + "epoch": 0.06397114083871938, + "flos": 22090772620800.0, + "grad_norm": 2.0150359019026185, + "language_loss": 0.8051579, + "learning_rate": 3.9879071620524744e-06, + "loss": 0.82785529, + "num_input_tokens_seen": 22736790, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.515625, + "step": 1064, + "time_per_iteration": 2.478205919265747 + }, + { + "auxiliary_loss_clip": 0.01207278, + "auxiliary_loss_mlp": 0.01070567, + "balance_loss_clip": 1.04409075, + "balance_loss_mlp": 1.05682254, + "epoch": 0.06403126409138735, + "flos": 19572824206080.0, + "grad_norm": 2.254194289767691, + "language_loss": 0.84650666, + "learning_rate": 3.987864361045851e-06, + "loss": 0.86928511, + "num_input_tokens_seen": 22754745, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1065, + "time_per_iteration": 2.456456184387207 + }, + { + "auxiliary_loss_clip": 0.01207067, + "auxiliary_loss_mlp": 0.01055171, + "balance_loss_clip": 1.03099585, + "balance_loss_mlp": 1.05966115, + "epoch": 0.06409138734405531, + "flos": 40807413267840.0, + "grad_norm": 1.66169186591579, + "language_loss": 0.68201709, + "learning_rate": 3.987821484659211e-06, + "loss": 0.70463943, + "num_input_tokens_seen": 22776780, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.46875, + "step": 1066, + "time_per_iteration": 2.6294829845428467 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01076738, + "balance_loss_clip": 1.05003476, + "balance_loss_mlp": 1.05877519, + "epoch": 0.06415151059672328, + "flos": 20441610460800.0, + "grad_norm": 3.704601442813356, + "language_loss": 0.90345579, + "learning_rate": 3.987778532894181e-06, + "loss": 0.9262861, + "num_input_tokens_seen": 22793915, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1067, + "time_per_iteration": 2.459721565246582 + }, + { + "auxiliary_loss_clip": 0.01207052, + "auxiliary_loss_mlp": 0.01068129, + "balance_loss_clip": 1.04364336, + "balance_loss_mlp": 1.05625772, + "epoch": 0.06421163384939126, + "flos": 18072045129600.0, + "grad_norm": 2.8684947664405436, + "language_loss": 0.8343029, + "learning_rate": 3.987735505752391e-06, + "loss": 0.85705471, + "num_input_tokens_seen": 22812670, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.515625, + "step": 1068, + "time_per_iteration": 2.4611129760742188 + }, + { + "auxiliary_loss_clip": 0.01205753, + "auxiliary_loss_mlp": 0.01064379, + "balance_loss_clip": 1.03963113, + "balance_loss_mlp": 1.05991328, + "epoch": 0.06427175710205922, + "flos": 25119442563840.0, + "grad_norm": 2.4683216708617053, + "language_loss": 0.89402264, + "learning_rate": 3.987692403235471e-06, + "loss": 0.91672397, + "num_input_tokens_seen": 22832440, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.453125, + "step": 1069, + "time_per_iteration": 2.486241340637207 + }, + { + "auxiliary_loss_clip": 0.01206711, + "auxiliary_loss_mlp": 0.01082225, + "balance_loss_clip": 1.05555749, + "balance_loss_mlp": 1.05718124, + "epoch": 0.06433188035472719, + "flos": 17380549428480.0, + "grad_norm": 2.6076700233042396, + "language_loss": 0.95764256, + "learning_rate": 3.987649225345056e-06, + "loss": 0.98053193, + "num_input_tokens_seen": 22845495, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5, + "step": 1070, + "time_per_iteration": 2.413357734680176 + }, + { + "auxiliary_loss_clip": 0.01209924, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.0309608, + "balance_loss_mlp": 1.05859673, + "epoch": 0.06439200360739517, + "flos": 23546267625600.0, + "grad_norm": 1.8004745601001504, + "language_loss": 0.8819589, + "learning_rate": 3.987605972082782e-06, + "loss": 0.90463126, + "num_input_tokens_seen": 22865390, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.515625, + "step": 1071, + "time_per_iteration": 2.4717295169830322 + }, + { + "auxiliary_loss_clip": 0.01204044, + "auxiliary_loss_mlp": 0.01052011, + "balance_loss_clip": 1.02799058, + "balance_loss_mlp": 1.056633, + "epoch": 0.06445212686006313, + "flos": 21979772616960.0, + "grad_norm": 1.6498592642907823, + "language_loss": 0.75996184, + "learning_rate": 3.987562643450292e-06, + "loss": 0.78252238, + "num_input_tokens_seen": 22885495, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.4765625, + "step": 1072, + "time_per_iteration": 2.486936330795288 + }, + { + "auxiliary_loss_clip": 0.01207782, + "auxiliary_loss_mlp": 0.010661, + "balance_loss_clip": 1.03951669, + "balance_loss_mlp": 1.05679154, + "epoch": 0.0645122501127311, + "flos": 25921291824000.0, + "grad_norm": 1.95165590675185, + "language_loss": 0.80415034, + "learning_rate": 3.987519239449226e-06, + "loss": 0.82688916, + "num_input_tokens_seen": 22904845, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.5078125, + "step": 1073, + "time_per_iteration": 2.476189613342285 + }, + { + "auxiliary_loss_clip": 0.01200054, + "auxiliary_loss_mlp": 0.01059954, + "balance_loss_clip": 1.03563547, + "balance_loss_mlp": 1.05634785, + "epoch": 0.06457237336539907, + "flos": 25626034028160.0, + "grad_norm": 1.7105520573330508, + "language_loss": 0.80205524, + "learning_rate": 3.987475760081233e-06, + "loss": 0.82465529, + "num_input_tokens_seen": 22925940, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4375, + "step": 1074, + "time_per_iteration": 2.499133586883545 + }, + { + "auxiliary_loss_clip": 0.01204265, + "auxiliary_loss_mlp": 0.01060595, + "balance_loss_clip": 1.03469074, + "balance_loss_mlp": 1.05560029, + "epoch": 0.06463249661806704, + "flos": 19463979018240.0, + "grad_norm": 2.398999995550556, + "language_loss": 0.79203326, + "learning_rate": 3.987432205347958e-06, + "loss": 0.81468183, + "num_input_tokens_seen": 22944375, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1075, + "time_per_iteration": 2.46777606010437 + }, + { + "auxiliary_loss_clip": 0.01207545, + "auxiliary_loss_mlp": 0.01064646, + "balance_loss_clip": 1.04086363, + "balance_loss_mlp": 1.05960226, + "epoch": 0.064692619870735, + "flos": 24498044254080.0, + "grad_norm": 2.7671348430420712, + "language_loss": 0.87819242, + "learning_rate": 3.987388575251055e-06, + "loss": 0.90091443, + "num_input_tokens_seen": 22959145, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.484375, + "step": 1076, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01199028, + "auxiliary_loss_mlp": 0.01053729, + "balance_loss_clip": 1.02918351, + "balance_loss_mlp": 1.05429745, + "epoch": 0.06475274312340297, + "flos": 17018677860480.0, + "grad_norm": 2.1388407300528534, + "language_loss": 0.80692923, + "learning_rate": 3.98734486979218e-06, + "loss": 0.82945681, + "num_input_tokens_seen": 22978100, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1077, + "time_per_iteration": 2.4290995597839355 + }, + { + "auxiliary_loss_clip": 0.01211867, + "auxiliary_loss_mlp": 0.01071702, + "balance_loss_clip": 1.04566646, + "balance_loss_mlp": 1.05862093, + "epoch": 0.06481286637607095, + "flos": 24572379450240.0, + "grad_norm": 2.618517400605346, + "language_loss": 0.91640681, + "learning_rate": 3.987301088972986e-06, + "loss": 0.93924248, + "num_input_tokens_seen": 22997285, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.53125, + "step": 1078, + "time_per_iteration": 2.500995635986328 + }, + { + "auxiliary_loss_clip": 0.01212712, + "auxiliary_loss_mlp": 0.01062475, + "balance_loss_clip": 1.03698754, + "balance_loss_mlp": 1.05874825, + "epoch": 0.06487298962873891, + "flos": 21105635235840.0, + "grad_norm": 2.106125999672554, + "language_loss": 0.78772497, + "learning_rate": 3.987257232795137e-06, + "loss": 0.81047684, + "num_input_tokens_seen": 23016285, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.5390625, + "step": 1079, + "time_per_iteration": 2.4510841369628906 + }, + { + "auxiliary_loss_clip": 0.01204732, + "auxiliary_loss_mlp": 0.01061369, + "balance_loss_clip": 1.03619218, + "balance_loss_mlp": 1.05602205, + "epoch": 0.06493311288140688, + "flos": 24608182331520.0, + "grad_norm": 2.051955253501364, + "language_loss": 0.69555283, + "learning_rate": 3.987213301260294e-06, + "loss": 0.7182138, + "num_input_tokens_seen": 23036420, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1080, + "time_per_iteration": 2.5024302005767822 + }, + { + "auxiliary_loss_clip": 0.01204586, + "auxiliary_loss_mlp": 0.01063302, + "balance_loss_clip": 1.03649211, + "balance_loss_mlp": 1.05477285, + "epoch": 0.06499323613407486, + "flos": 25337994865920.0, + "grad_norm": 1.85895294752556, + "language_loss": 0.72094852, + "learning_rate": 3.987169294370123e-06, + "loss": 0.74362737, + "num_input_tokens_seen": 23056945, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5, + "step": 1081, + "time_per_iteration": 2.5032622814178467 + }, + { + "auxiliary_loss_clip": 0.01201777, + "auxiliary_loss_mlp": 0.01064533, + "balance_loss_clip": 1.03867674, + "balance_loss_mlp": 1.0554111, + "epoch": 0.06505335938674282, + "flos": 20375714960640.0, + "grad_norm": 2.6422342029105863, + "language_loss": 0.84621316, + "learning_rate": 3.987125212126294e-06, + "loss": 0.86887628, + "num_input_tokens_seen": 23074940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.46875, + "step": 1082, + "time_per_iteration": 2.459564447402954 + }, + { + "auxiliary_loss_clip": 0.01214386, + "auxiliary_loss_mlp": 0.01067955, + "balance_loss_clip": 1.04106104, + "balance_loss_mlp": 1.05817008, + "epoch": 0.06511348263941079, + "flos": 25337923038720.0, + "grad_norm": 2.177850298461163, + "language_loss": 0.8303026, + "learning_rate": 3.987081054530478e-06, + "loss": 0.85312605, + "num_input_tokens_seen": 23093420, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.5625, + "step": 1083, + "time_per_iteration": 2.504584550857544 + }, + { + "auxiliary_loss_clip": 0.01206291, + "auxiliary_loss_mlp": 0.01065303, + "balance_loss_clip": 1.03852844, + "balance_loss_mlp": 1.05794787, + "epoch": 0.06517360589207877, + "flos": 20332801186560.0, + "grad_norm": 2.6002614807121227, + "language_loss": 0.79689312, + "learning_rate": 3.987036821584348e-06, + "loss": 0.81960905, + "num_input_tokens_seen": 23111550, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.484375, + "step": 1084, + "time_per_iteration": 2.4530820846557617 + }, + { + "auxiliary_loss_clip": 0.01204762, + "auxiliary_loss_mlp": 0.01060872, + "balance_loss_clip": 1.03489637, + "balance_loss_mlp": 1.05634058, + "epoch": 0.06523372914474673, + "flos": 31681650061440.0, + "grad_norm": 2.1191367521188074, + "language_loss": 0.66211331, + "learning_rate": 3.986992513289584e-06, + "loss": 0.68476963, + "num_input_tokens_seen": 23130335, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.484375, + "step": 1085, + "time_per_iteration": 2.5733256340026855 + }, + { + "auxiliary_loss_clip": 0.01198609, + "auxiliary_loss_mlp": 0.01069188, + "balance_loss_clip": 1.04436827, + "balance_loss_mlp": 1.05400848, + "epoch": 0.0652938523974147, + "flos": 20778165918720.0, + "grad_norm": 1.9997547556569089, + "language_loss": 0.76998973, + "learning_rate": 3.9869481296478645e-06, + "loss": 0.79266769, + "num_input_tokens_seen": 23152380, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1086, + "time_per_iteration": 2.4958763122558594 + }, + { + "auxiliary_loss_clip": 0.01199669, + "auxiliary_loss_mlp": 0.01063299, + "balance_loss_clip": 1.03763306, + "balance_loss_mlp": 1.05291176, + "epoch": 0.06535397565008266, + "flos": 16690993061760.0, + "grad_norm": 2.1546414392836977, + "language_loss": 0.85154319, + "learning_rate": 3.986903670660872e-06, + "loss": 0.87417287, + "num_input_tokens_seen": 23171630, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1087, + "time_per_iteration": 2.4456934928894043 + }, + { + "auxiliary_loss_clip": 0.01204231, + "auxiliary_loss_mlp": 0.01061167, + "balance_loss_clip": 1.03609776, + "balance_loss_mlp": 1.05594206, + "epoch": 0.06541409890275064, + "flos": 26868220116480.0, + "grad_norm": 1.7775330808837086, + "language_loss": 0.77970594, + "learning_rate": 3.9868591363302945e-06, + "loss": 0.80235994, + "num_input_tokens_seen": 23192520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.484375, + "step": 1088, + "time_per_iteration": 2.53505277633667 + }, + { + "auxiliary_loss_clip": 0.01204134, + "auxiliary_loss_mlp": 0.01066637, + "balance_loss_clip": 1.04329574, + "balance_loss_mlp": 1.05602646, + "epoch": 0.06547422215541861, + "flos": 20521620005760.0, + "grad_norm": 1.9036978890371752, + "language_loss": 0.71191919, + "learning_rate": 3.9868145266578186e-06, + "loss": 0.73462689, + "num_input_tokens_seen": 23210710, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.484375, + "step": 1089, + "time_per_iteration": 2.4569168090820312 + }, + { + "auxiliary_loss_clip": 0.01204151, + "auxiliary_loss_mlp": 0.0105997, + "balance_loss_clip": 1.03566289, + "balance_loss_mlp": 1.05729651, + "epoch": 0.06553434540808657, + "flos": 22016616992640.0, + "grad_norm": 1.7924808842614686, + "language_loss": 0.85504186, + "learning_rate": 3.9867698416451366e-06, + "loss": 0.8776831, + "num_input_tokens_seen": 23230305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1090, + "time_per_iteration": 2.4624812602996826 + }, + { + "auxiliary_loss_clip": 0.01204567, + "auxiliary_loss_mlp": 0.01062106, + "balance_loss_clip": 1.0365001, + "balance_loss_mlp": 1.05594897, + "epoch": 0.06559446866075455, + "flos": 24608649208320.0, + "grad_norm": 2.2382380061135945, + "language_loss": 0.72027361, + "learning_rate": 3.9867250812939434e-06, + "loss": 0.74294031, + "num_input_tokens_seen": 23249015, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.484375, + "step": 1091, + "time_per_iteration": 2.4911999702453613 + }, + { + "auxiliary_loss_clip": 0.01201014, + "auxiliary_loss_mlp": 0.0106187, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05507159, + "epoch": 0.06565459191342252, + "flos": 24274679529600.0, + "grad_norm": 2.7948943762047525, + "language_loss": 0.82525271, + "learning_rate": 3.986680245605936e-06, + "loss": 0.8478815, + "num_input_tokens_seen": 23265105, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4609375, + "step": 1092, + "time_per_iteration": 2.510835886001587 + }, + { + "auxiliary_loss_clip": 0.01205888, + "auxiliary_loss_mlp": 0.01064535, + "balance_loss_clip": 1.03716493, + "balance_loss_mlp": 1.05484402, + "epoch": 0.06571471516609048, + "flos": 24787124910720.0, + "grad_norm": 4.994634192306823, + "language_loss": 0.71286589, + "learning_rate": 3.986635334582814e-06, + "loss": 0.73557013, + "num_input_tokens_seen": 23283950, + "router_z_loss_clip": 0.2734375, + "router_z_loss_mlp": 1.515625, + "step": 1093, + "time_per_iteration": 2.528994560241699 + }, + { + "auxiliary_loss_clip": 0.01204526, + "auxiliary_loss_mlp": 0.01063177, + "balance_loss_clip": 1.03668869, + "balance_loss_mlp": 1.05701041, + "epoch": 0.06577483841875846, + "flos": 26214071581440.0, + "grad_norm": 1.8259988866114194, + "language_loss": 0.87971264, + "learning_rate": 3.986590348226282e-06, + "loss": 0.90238965, + "num_input_tokens_seen": 23305005, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4765625, + "step": 1094, + "time_per_iteration": 2.50201678276062 + }, + { + "auxiliary_loss_clip": 0.01205803, + "auxiliary_loss_mlp": 0.0106202, + "balance_loss_clip": 1.0350548, + "balance_loss_mlp": 1.0575459, + "epoch": 0.06583496167142643, + "flos": 25080802508160.0, + "grad_norm": 1.6349502946236962, + "language_loss": 0.81364405, + "learning_rate": 3.986545286538044e-06, + "loss": 0.83632231, + "num_input_tokens_seen": 23323220, + "router_z_loss_clip": 0.26953125, + "router_z_loss_mlp": 1.484375, + "step": 1095, + "time_per_iteration": 2.4947729110717773 + }, + { + "auxiliary_loss_clip": 0.01200923, + "auxiliary_loss_mlp": 0.01057353, + "balance_loss_clip": 1.03414297, + "balance_loss_mlp": 1.05544913, + "epoch": 0.06589508492409439, + "flos": 25629804956160.0, + "grad_norm": 2.4379029944224215, + "language_loss": 0.69712919, + "learning_rate": 3.986500149519811e-06, + "loss": 0.7197119, + "num_input_tokens_seen": 23342235, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.453125, + "step": 1096, + "time_per_iteration": 2.5135879516601562 + }, + { + "auxiliary_loss_clip": 0.01206873, + "auxiliary_loss_mlp": 0.01069815, + "balance_loss_clip": 1.04451883, + "balance_loss_mlp": 1.0592947, + "epoch": 0.06595520817676236, + "flos": 23621249266560.0, + "grad_norm": 1.7715259730160258, + "language_loss": 0.77498722, + "learning_rate": 3.986454937173292e-06, + "loss": 0.79775411, + "num_input_tokens_seen": 23363680, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4765625, + "step": 1097, + "time_per_iteration": 2.4872820377349854 + }, + { + "auxiliary_loss_clip": 0.0120653, + "auxiliary_loss_mlp": 0.01063548, + "balance_loss_clip": 1.03814423, + "balance_loss_mlp": 1.05785179, + "epoch": 0.06601533142943034, + "flos": 33801708545280.0, + "grad_norm": 1.7376479388989727, + "language_loss": 0.77846545, + "learning_rate": 3.986409649500203e-06, + "loss": 0.80116618, + "num_input_tokens_seen": 23385590, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.484375, + "step": 1098, + "time_per_iteration": 2.583075761795044 + }, + { + "auxiliary_loss_clip": 0.01204454, + "auxiliary_loss_mlp": 0.01071542, + "balance_loss_clip": 1.04483891, + "balance_loss_mlp": 1.05739522, + "epoch": 0.0660754546820983, + "flos": 20259184262400.0, + "grad_norm": 1.9398633669636132, + "language_loss": 0.81675154, + "learning_rate": 3.986364286502261e-06, + "loss": 0.83951151, + "num_input_tokens_seen": 23402945, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1099, + "time_per_iteration": 2.446969985961914 + }, + { + "auxiliary_loss_clip": 0.01195976, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.02801692, + "balance_loss_mlp": 1.0519135, + "epoch": 0.06613557793476627, + "flos": 19354164163200.0, + "grad_norm": 2.0018625732470245, + "language_loss": 0.82619941, + "learning_rate": 3.986318848181186e-06, + "loss": 0.84868616, + "num_input_tokens_seen": 23421410, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.4375, + "step": 1100, + "time_per_iteration": 2.4545743465423584 + }, + { + "auxiliary_loss_clip": 0.01204382, + "auxiliary_loss_mlp": 0.01060672, + "balance_loss_clip": 1.03630555, + "balance_loss_mlp": 1.05827951, + "epoch": 0.06619570118743424, + "flos": 13772568936960.0, + "grad_norm": 2.362466383483127, + "language_loss": 0.73439336, + "learning_rate": 3.986273334538702e-06, + "loss": 0.7570439, + "num_input_tokens_seen": 23438870, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4609375, + "step": 1101, + "time_per_iteration": 6.740786790847778 + }, + { + "auxiliary_loss_clip": 0.0119874, + "auxiliary_loss_mlp": 0.01062411, + "balance_loss_clip": 1.03829539, + "balance_loss_mlp": 1.05373132, + "epoch": 0.06625582444010221, + "flos": 17857874286720.0, + "grad_norm": 2.46656505058328, + "language_loss": 0.86047602, + "learning_rate": 3.986227745576533e-06, + "loss": 0.88308758, + "num_input_tokens_seen": 23456975, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1102, + "time_per_iteration": 2.4480903148651123 + }, + { + "auxiliary_loss_clip": 0.01200394, + "auxiliary_loss_mlp": 0.01057443, + "balance_loss_clip": 1.0322063, + "balance_loss_mlp": 1.05588222, + "epoch": 0.06631594769277017, + "flos": 11838707579520.0, + "grad_norm": 2.0494810685505995, + "language_loss": 0.81707513, + "learning_rate": 3.98618208129641e-06, + "loss": 0.83965349, + "num_input_tokens_seen": 23473440, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1103, + "time_per_iteration": 2.4419338703155518 + }, + { + "auxiliary_loss_clip": 0.01203538, + "auxiliary_loss_mlp": 0.01063441, + "balance_loss_clip": 1.04029047, + "balance_loss_mlp": 1.05891824, + "epoch": 0.06637607094543815, + "flos": 19793351756160.0, + "grad_norm": 1.7865556655629211, + "language_loss": 0.82059169, + "learning_rate": 3.986136341700063e-06, + "loss": 0.84326148, + "num_input_tokens_seen": 23493880, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.4453125, + "step": 1104, + "time_per_iteration": 2.506230115890503 + }, + { + "auxiliary_loss_clip": 0.01195268, + "auxiliary_loss_mlp": 0.01047754, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.05232382, + "epoch": 0.06643619419810612, + "flos": 25485659677440.0, + "grad_norm": 1.6089454783719872, + "language_loss": 0.80542791, + "learning_rate": 3.986090526789227e-06, + "loss": 0.82785821, + "num_input_tokens_seen": 23514920, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1105, + "time_per_iteration": 2.524385929107666 + }, + { + "auxiliary_loss_clip": 0.01197193, + "auxiliary_loss_mlp": 0.01060252, + "balance_loss_clip": 1.03720832, + "balance_loss_mlp": 1.05697632, + "epoch": 0.06649631745077408, + "flos": 16946533393920.0, + "grad_norm": 1.8452117827451007, + "language_loss": 0.96738935, + "learning_rate": 3.986044636565639e-06, + "loss": 0.98996383, + "num_input_tokens_seen": 23531635, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.40625, + "step": 1106, + "time_per_iteration": 2.455122470855713 + }, + { + "auxiliary_loss_clip": 0.01204143, + "auxiliary_loss_mlp": 0.01060087, + "balance_loss_clip": 1.03436136, + "balance_loss_mlp": 1.05509543, + "epoch": 0.06655644070344206, + "flos": 17858592558720.0, + "grad_norm": 1.9568581550144768, + "language_loss": 0.82766026, + "learning_rate": 3.985998671031039e-06, + "loss": 0.85030258, + "num_input_tokens_seen": 23551020, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4921875, + "step": 1107, + "time_per_iteration": 2.4554357528686523 + }, + { + "auxiliary_loss_clip": 0.01077187, + "auxiliary_loss_mlp": 0.01010186, + "balance_loss_clip": 1.0061568, + "balance_loss_mlp": 1.01696265, + "epoch": 0.06661656395611003, + "flos": 61419350021760.0, + "grad_norm": 0.8235952583150978, + "language_loss": 0.56729984, + "learning_rate": 3.9859526301871705e-06, + "loss": 0.58817357, + "num_input_tokens_seen": 23610675, + "router_z_loss_clip": 0.0402832, + "router_z_loss_mlp": 0.6015625, + "step": 1108, + "time_per_iteration": 3.0248770713806152 + }, + { + "auxiliary_loss_clip": 0.01200435, + "auxiliary_loss_mlp": 0.01065514, + "balance_loss_clip": 1.04034865, + "balance_loss_mlp": 1.05397463, + "epoch": 0.066676687208778, + "flos": 20662856282880.0, + "grad_norm": 2.4203653272420693, + "language_loss": 0.72493321, + "learning_rate": 3.9859065140357795e-06, + "loss": 0.74759269, + "num_input_tokens_seen": 23628710, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1109, + "time_per_iteration": 2.4559717178344727 + }, + { + "auxiliary_loss_clip": 0.01197389, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.03759217, + "balance_loss_mlp": 1.05389571, + "epoch": 0.06673681046144596, + "flos": 20923280864640.0, + "grad_norm": 3.084593088047962, + "language_loss": 0.78256035, + "learning_rate": 3.985860322578614e-06, + "loss": 0.80516529, + "num_input_tokens_seen": 23649160, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1110, + "time_per_iteration": 2.4989912509918213 + }, + { + "auxiliary_loss_clip": 0.01201522, + "auxiliary_loss_mlp": 0.01057407, + "balance_loss_clip": 1.0334934, + "balance_loss_mlp": 1.05598152, + "epoch": 0.06679693371411394, + "flos": 31065818359680.0, + "grad_norm": 2.197430378352105, + "language_loss": 0.71290207, + "learning_rate": 3.985814055817427e-06, + "loss": 0.73549128, + "num_input_tokens_seen": 23671995, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1111, + "time_per_iteration": 2.5445287227630615 + }, + { + "auxiliary_loss_clip": 0.0120524, + "auxiliary_loss_mlp": 0.01062473, + "balance_loss_clip": 1.03833365, + "balance_loss_mlp": 1.05788755, + "epoch": 0.0668570569667819, + "flos": 21726135705600.0, + "grad_norm": 1.8078370838130353, + "language_loss": 0.78315711, + "learning_rate": 3.985767713753971e-06, + "loss": 0.80583429, + "num_input_tokens_seen": 23690705, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4765625, + "step": 1112, + "time_per_iteration": 2.4791040420532227 + }, + { + "auxiliary_loss_clip": 0.01203172, + "auxiliary_loss_mlp": 0.01058254, + "balance_loss_clip": 1.03426933, + "balance_loss_mlp": 1.05794001, + "epoch": 0.06691718021944987, + "flos": 22747255539840.0, + "grad_norm": 2.0430507180103943, + "language_loss": 0.78819263, + "learning_rate": 3.985721296390005e-06, + "loss": 0.81080687, + "num_input_tokens_seen": 23709990, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.453125, + "step": 1113, + "time_per_iteration": 2.4637296199798584 + }, + { + "auxiliary_loss_clip": 0.01195153, + "auxiliary_loss_mlp": 0.01053406, + "balance_loss_clip": 1.03056598, + "balance_loss_mlp": 1.05255365, + "epoch": 0.06697730347211785, + "flos": 16545626720640.0, + "grad_norm": 2.035611213247421, + "language_loss": 0.82393003, + "learning_rate": 3.985674803727289e-06, + "loss": 0.84641558, + "num_input_tokens_seen": 23728485, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.4296875, + "step": 1114, + "time_per_iteration": 2.434006452560425 + }, + { + "auxiliary_loss_clip": 0.01076, + "auxiliary_loss_mlp": 0.01008619, + "balance_loss_clip": 1.00463712, + "balance_loss_mlp": 1.0165143, + "epoch": 0.06703742672478581, + "flos": 59782326658560.0, + "grad_norm": 0.8339607525511222, + "language_loss": 0.58126414, + "learning_rate": 3.985628235767584e-06, + "loss": 0.60211033, + "num_input_tokens_seen": 23786650, + "router_z_loss_clip": 0.03979492, + "router_z_loss_mlp": 0.59375, + "step": 1115, + "time_per_iteration": 3.020782709121704 + }, + { + "auxiliary_loss_clip": 0.01200335, + "auxiliary_loss_mlp": 0.01059737, + "balance_loss_clip": 1.03427422, + "balance_loss_mlp": 1.05479646, + "epoch": 0.06709754997745378, + "flos": 16800197385600.0, + "grad_norm": 2.8263674595854464, + "language_loss": 0.91123891, + "learning_rate": 3.985581592512658e-06, + "loss": 0.93383968, + "num_input_tokens_seen": 23802555, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1116, + "time_per_iteration": 2.446439504623413 + }, + { + "auxiliary_loss_clip": 0.01209259, + "auxiliary_loss_mlp": 0.01067721, + "balance_loss_clip": 1.04323506, + "balance_loss_mlp": 1.06065357, + "epoch": 0.06715767323012176, + "flos": 22123917895680.0, + "grad_norm": 2.019283248682947, + "language_loss": 0.8709814, + "learning_rate": 3.985534873964279e-06, + "loss": 0.89375114, + "num_input_tokens_seen": 23822945, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.484375, + "step": 1117, + "time_per_iteration": 2.486212968826294 + }, + { + "auxiliary_loss_clip": 0.01074244, + "auxiliary_loss_mlp": 0.0100646, + "balance_loss_clip": 1.00250196, + "balance_loss_mlp": 1.01550937, + "epoch": 0.06721779648278972, + "flos": 66618100137600.0, + "grad_norm": 0.9454776991467404, + "language_loss": 0.59798217, + "learning_rate": 3.985488080124218e-06, + "loss": 0.6187892, + "num_input_tokens_seen": 23874075, + "router_z_loss_clip": 0.03955078, + "router_z_loss_mlp": 0.5859375, + "step": 1118, + "time_per_iteration": 3.0197594165802 + }, + { + "auxiliary_loss_clip": 0.01201284, + "auxiliary_loss_mlp": 0.01056466, + "balance_loss_clip": 1.03255224, + "balance_loss_mlp": 1.05418777, + "epoch": 0.06727791973545769, + "flos": 22382474970240.0, + "grad_norm": 3.7568577616727468, + "language_loss": 0.83498162, + "learning_rate": 3.985441210994251e-06, + "loss": 0.85755914, + "num_input_tokens_seen": 23889720, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4765625, + "step": 1119, + "time_per_iteration": 2.4535257816314697 + }, + { + "auxiliary_loss_clip": 0.01199216, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.03732562, + "balance_loss_mlp": 1.0562222, + "epoch": 0.06733804298812565, + "flos": 24280210224000.0, + "grad_norm": 1.8165724331790314, + "language_loss": 0.8480413, + "learning_rate": 3.9853942665761545e-06, + "loss": 0.87062794, + "num_input_tokens_seen": 23909385, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.4296875, + "step": 1120, + "time_per_iteration": 2.533182382583618 + }, + { + "auxiliary_loss_clip": 0.01208718, + "auxiliary_loss_mlp": 0.01067102, + "balance_loss_clip": 1.04269981, + "balance_loss_mlp": 1.0602659, + "epoch": 0.06739816624079363, + "flos": 15918230839680.0, + "grad_norm": 2.032922437281707, + "language_loss": 0.78959441, + "learning_rate": 3.985347246871708e-06, + "loss": 0.81235266, + "num_input_tokens_seen": 23926830, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.484375, + "step": 1121, + "time_per_iteration": 2.4523215293884277 + }, + { + "auxiliary_loss_clip": 0.01072018, + "auxiliary_loss_mlp": 0.01004747, + "balance_loss_clip": 1.00033593, + "balance_loss_mlp": 1.0132587, + "epoch": 0.0674582894934616, + "flos": 71398567353600.0, + "grad_norm": 0.7615352754050735, + "language_loss": 0.58346939, + "learning_rate": 3.985300151882694e-06, + "loss": 0.60423702, + "num_input_tokens_seen": 23992640, + "router_z_loss_clip": 0.04418945, + "router_z_loss_mlp": 0.5859375, + "step": 1122, + "time_per_iteration": 3.2087855339050293 + }, + { + "auxiliary_loss_clip": 0.0120309, + "auxiliary_loss_mlp": 0.01065913, + "balance_loss_clip": 1.04245234, + "balance_loss_mlp": 1.0584271, + "epoch": 0.06751841274612956, + "flos": 25264952559360.0, + "grad_norm": 2.0430211727412098, + "language_loss": 0.71546745, + "learning_rate": 3.985252981610901e-06, + "loss": 0.73815745, + "num_input_tokens_seen": 24011135, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4453125, + "step": 1123, + "time_per_iteration": 2.5017640590667725 + }, + { + "auxiliary_loss_clip": 0.01201701, + "auxiliary_loss_mlp": 0.01057362, + "balance_loss_clip": 1.03216124, + "balance_loss_mlp": 1.05484593, + "epoch": 0.06757853599879754, + "flos": 23802741711360.0, + "grad_norm": 1.8376842720828679, + "language_loss": 0.79288971, + "learning_rate": 3.985205736058114e-06, + "loss": 0.81548035, + "num_input_tokens_seen": 24030695, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1124, + "time_per_iteration": 2.4980688095092773 + }, + { + "auxiliary_loss_clip": 0.01196564, + "auxiliary_loss_mlp": 0.01054377, + "balance_loss_clip": 1.03204954, + "balance_loss_mlp": 1.05469489, + "epoch": 0.0676386592514655, + "flos": 21033742164480.0, + "grad_norm": 2.0983993205372253, + "language_loss": 0.71198726, + "learning_rate": 3.985158415226128e-06, + "loss": 0.73449671, + "num_input_tokens_seen": 24050680, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.421875, + "step": 1125, + "time_per_iteration": 2.4704325199127197 + }, + { + "auxiliary_loss_clip": 0.01198895, + "auxiliary_loss_mlp": 0.01068522, + "balance_loss_clip": 1.04247451, + "balance_loss_mlp": 1.05620742, + "epoch": 0.06769878250413347, + "flos": 25556331686400.0, + "grad_norm": 2.9171204901367243, + "language_loss": 0.80814254, + "learning_rate": 3.985111019116736e-06, + "loss": 0.83081663, + "num_input_tokens_seen": 24067205, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.421875, + "step": 1126, + "time_per_iteration": 2.5046803951263428 + }, + { + "auxiliary_loss_clip": 0.01070877, + "auxiliary_loss_mlp": 0.01002736, + "balance_loss_clip": 0.9986586, + "balance_loss_mlp": 1.01286924, + "epoch": 0.06775890575680145, + "flos": 70655251305600.0, + "grad_norm": 0.7804116507992601, + "language_loss": 0.59733766, + "learning_rate": 3.985063547731735e-06, + "loss": 0.61807376, + "num_input_tokens_seen": 24131320, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.578125, + "step": 1127, + "time_per_iteration": 3.0877249240875244 + }, + { + "auxiliary_loss_clip": 0.01199514, + "auxiliary_loss_mlp": 0.01056848, + "balance_loss_clip": 1.03376949, + "balance_loss_mlp": 1.05723238, + "epoch": 0.06781902900946941, + "flos": 24235500769920.0, + "grad_norm": 2.13286114653412, + "language_loss": 0.81392133, + "learning_rate": 3.985016001072925e-06, + "loss": 0.83648497, + "num_input_tokens_seen": 24149930, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.421875, + "step": 1128, + "time_per_iteration": 2.5406885147094727 + }, + { + "auxiliary_loss_clip": 0.01208088, + "auxiliary_loss_mlp": 0.0105195, + "balance_loss_clip": 1.02692807, + "balance_loss_mlp": 1.0598706, + "epoch": 0.06787915226213738, + "flos": 22417523665920.0, + "grad_norm": 3.047918834731733, + "language_loss": 0.76034033, + "learning_rate": 3.984968379142109e-06, + "loss": 0.78294069, + "num_input_tokens_seen": 24169590, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.484375, + "step": 1129, + "time_per_iteration": 2.486829996109009 + }, + { + "auxiliary_loss_clip": 0.01201584, + "auxiliary_loss_mlp": 0.01061333, + "balance_loss_clip": 1.03721654, + "balance_loss_mlp": 1.05536139, + "epoch": 0.06793927551480534, + "flos": 37706922080640.0, + "grad_norm": 1.8621491947103987, + "language_loss": 0.72340226, + "learning_rate": 3.984920681941094e-06, + "loss": 0.74603146, + "num_input_tokens_seen": 24189965, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4609375, + "step": 1130, + "time_per_iteration": 2.6195991039276123 + }, + { + "auxiliary_loss_clip": 0.01197626, + "auxiliary_loss_mlp": 0.01063599, + "balance_loss_clip": 1.03957844, + "balance_loss_mlp": 1.05584192, + "epoch": 0.06799939876747332, + "flos": 20631398947200.0, + "grad_norm": 2.3479224842049917, + "language_loss": 0.80624223, + "learning_rate": 3.984872909471688e-06, + "loss": 0.82885444, + "num_input_tokens_seen": 24208045, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.421875, + "step": 1131, + "time_per_iteration": 2.4917030334472656 + }, + { + "auxiliary_loss_clip": 0.01196301, + "auxiliary_loss_mlp": 0.0106802, + "balance_loss_clip": 1.04398775, + "balance_loss_mlp": 1.05550814, + "epoch": 0.06805952202014129, + "flos": 14864755829760.0, + "grad_norm": 2.1673533627141652, + "language_loss": 0.8104949, + "learning_rate": 3.984825061735701e-06, + "loss": 0.83313811, + "num_input_tokens_seen": 24223805, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.40625, + "step": 1132, + "time_per_iteration": 2.4325902462005615 + }, + { + "auxiliary_loss_clip": 0.01199688, + "auxiliary_loss_mlp": 0.01069367, + "balance_loss_clip": 1.04525137, + "balance_loss_mlp": 1.05629563, + "epoch": 0.06811964527280925, + "flos": 48909434947200.0, + "grad_norm": 1.450417149602266, + "language_loss": 0.63629937, + "learning_rate": 3.9847771387349495e-06, + "loss": 0.65898991, + "num_input_tokens_seen": 24249475, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4296875, + "step": 1133, + "time_per_iteration": 2.7164230346679688 + }, + { + "auxiliary_loss_clip": 0.01203203, + "auxiliary_loss_mlp": 0.01059397, + "balance_loss_clip": 1.03194308, + "balance_loss_mlp": 1.05427325, + "epoch": 0.06817976852547723, + "flos": 15377273038080.0, + "grad_norm": 2.5027083277203963, + "language_loss": 0.74811196, + "learning_rate": 3.9847291404712506e-06, + "loss": 0.77073789, + "num_input_tokens_seen": 24267980, + "router_z_loss_clip": 0.27539062, + "router_z_loss_mlp": 1.484375, + "step": 1134, + "time_per_iteration": 2.420506000518799 + }, + { + "auxiliary_loss_clip": 0.01201452, + "auxiliary_loss_mlp": 0.01064371, + "balance_loss_clip": 1.04088652, + "balance_loss_mlp": 1.05952573, + "epoch": 0.0682398917781452, + "flos": 20155690200960.0, + "grad_norm": 2.0759609389962037, + "language_loss": 0.87245119, + "learning_rate": 3.984681066946423e-06, + "loss": 0.89510942, + "num_input_tokens_seen": 24286805, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.421875, + "step": 1135, + "time_per_iteration": 2.464738607406616 + }, + { + "auxiliary_loss_clip": 0.01200809, + "auxiliary_loss_mlp": 0.01055494, + "balance_loss_clip": 1.03010249, + "balance_loss_mlp": 1.05388534, + "epoch": 0.06830001503081316, + "flos": 23440618748160.0, + "grad_norm": 2.383261313924855, + "language_loss": 0.78335494, + "learning_rate": 3.984632918162291e-06, + "loss": 0.80591798, + "num_input_tokens_seen": 24305855, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.46875, + "step": 1136, + "time_per_iteration": 2.4486002922058105 + }, + { + "auxiliary_loss_clip": 0.01206211, + "auxiliary_loss_mlp": 0.01073979, + "balance_loss_clip": 1.04906416, + "balance_loss_mlp": 1.06089664, + "epoch": 0.06836013828348114, + "flos": 34349813153280.0, + "grad_norm": 3.2008110915617207, + "language_loss": 0.83941948, + "learning_rate": 3.984584694120679e-06, + "loss": 0.86222148, + "num_input_tokens_seen": 24326535, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.453125, + "step": 1137, + "time_per_iteration": 2.5714635848999023 + }, + { + "auxiliary_loss_clip": 0.01199575, + "auxiliary_loss_mlp": 0.01061827, + "balance_loss_clip": 1.03806889, + "balance_loss_mlp": 1.05628538, + "epoch": 0.06842026153614911, + "flos": 23148844571520.0, + "grad_norm": 2.067587662099544, + "language_loss": 0.78669268, + "learning_rate": 3.984536394823418e-06, + "loss": 0.80930662, + "num_input_tokens_seen": 24345810, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1138, + "time_per_iteration": 2.459437370300293 + }, + { + "auxiliary_loss_clip": 0.01202271, + "auxiliary_loss_mlp": 0.01059469, + "balance_loss_clip": 1.03480506, + "balance_loss_mlp": 1.05729747, + "epoch": 0.06848038478881707, + "flos": 24608972430720.0, + "grad_norm": 2.606905885529735, + "language_loss": 0.85683703, + "learning_rate": 3.984488020272336e-06, + "loss": 0.87945449, + "num_input_tokens_seen": 24366095, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.453125, + "step": 1139, + "time_per_iteration": 2.5198936462402344 + }, + { + "auxiliary_loss_clip": 0.01201061, + "auxiliary_loss_mlp": 0.01057605, + "balance_loss_clip": 1.03297663, + "balance_loss_mlp": 1.05803108, + "epoch": 0.06854050804148504, + "flos": 40880994278400.0, + "grad_norm": 1.7528507300348692, + "language_loss": 0.74826896, + "learning_rate": 3.984439570469271e-06, + "loss": 0.77085567, + "num_input_tokens_seen": 24388665, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4296875, + "step": 1140, + "time_per_iteration": 2.6609106063842773 + }, + { + "auxiliary_loss_clip": 0.01198151, + "auxiliary_loss_mlp": 0.01062313, + "balance_loss_clip": 1.03698146, + "balance_loss_mlp": 1.05620885, + "epoch": 0.06860063129415302, + "flos": 31686354743040.0, + "grad_norm": 2.210262717529583, + "language_loss": 0.68083167, + "learning_rate": 3.9843910454160574e-06, + "loss": 0.70343632, + "num_input_tokens_seen": 24407705, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.421875, + "step": 1141, + "time_per_iteration": 2.5661122798919678 + }, + { + "auxiliary_loss_clip": 0.01205913, + "auxiliary_loss_mlp": 0.0106664, + "balance_loss_clip": 1.04098654, + "balance_loss_mlp": 1.05848837, + "epoch": 0.06866075454682098, + "flos": 26542007775360.0, + "grad_norm": 1.82433360121009, + "language_loss": 0.79399014, + "learning_rate": 3.984342445114538e-06, + "loss": 0.8167156, + "num_input_tokens_seen": 24428390, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.46875, + "step": 1142, + "time_per_iteration": 2.5499107837677 + }, + { + "auxiliary_loss_clip": 0.0120232, + "auxiliary_loss_mlp": 0.01061074, + "balance_loss_clip": 1.03650475, + "balance_loss_mlp": 1.05730164, + "epoch": 0.06872087779948895, + "flos": 29789768724480.0, + "grad_norm": 1.6821535193321122, + "language_loss": 0.68701231, + "learning_rate": 3.984293769566553e-06, + "loss": 0.70964622, + "num_input_tokens_seen": 24450810, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4453125, + "step": 1143, + "time_per_iteration": 5.380373239517212 + }, + { + "auxiliary_loss_clip": 0.01196375, + "auxiliary_loss_mlp": 0.01059216, + "balance_loss_clip": 1.03670955, + "balance_loss_mlp": 1.05885804, + "epoch": 0.06878100105215693, + "flos": 26941118768640.0, + "grad_norm": 1.8434796401844256, + "language_loss": 0.74694496, + "learning_rate": 3.98424501877395e-06, + "loss": 0.76950091, + "num_input_tokens_seen": 24469965, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.375, + "step": 1144, + "time_per_iteration": 2.536839485168457 + }, + { + "auxiliary_loss_clip": 0.01204332, + "auxiliary_loss_mlp": 0.01064223, + "balance_loss_clip": 1.03893876, + "balance_loss_mlp": 1.05654943, + "epoch": 0.06884112430482489, + "flos": 10670748946560.0, + "grad_norm": 2.296493270147659, + "language_loss": 0.91720247, + "learning_rate": 3.984196192738577e-06, + "loss": 0.93988806, + "num_input_tokens_seen": 24486370, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4765625, + "step": 1145, + "time_per_iteration": 2.44307017326355 + }, + { + "auxiliary_loss_clip": 0.01206887, + "auxiliary_loss_mlp": 0.01067692, + "balance_loss_clip": 1.04160893, + "balance_loss_mlp": 1.05779576, + "epoch": 0.06890124755749286, + "flos": 20193647898240.0, + "grad_norm": 2.4650333910918865, + "language_loss": 0.82189268, + "learning_rate": 3.984147291462285e-06, + "loss": 0.84463847, + "num_input_tokens_seen": 24503780, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.4921875, + "step": 1146, + "time_per_iteration": 2.4743804931640625 + }, + { + "auxiliary_loss_clip": 0.01198651, + "auxiliary_loss_mlp": 0.01061891, + "balance_loss_clip": 1.03869271, + "balance_loss_mlp": 1.05755806, + "epoch": 0.06896137081016084, + "flos": 20449224144000.0, + "grad_norm": 2.5935722439127744, + "language_loss": 0.85150343, + "learning_rate": 3.98409831494693e-06, + "loss": 0.87410891, + "num_input_tokens_seen": 24522320, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.4140625, + "step": 1147, + "time_per_iteration": 2.48410701751709 + }, + { + "auxiliary_loss_clip": 0.01201275, + "auxiliary_loss_mlp": 0.01064743, + "balance_loss_clip": 1.03988767, + "balance_loss_mlp": 1.05699074, + "epoch": 0.0690214940628288, + "flos": 18368703555840.0, + "grad_norm": 2.3932988353276645, + "language_loss": 0.86235052, + "learning_rate": 3.984049263194367e-06, + "loss": 0.88501072, + "num_input_tokens_seen": 24540445, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4453125, + "step": 1148, + "time_per_iteration": 2.455441951751709 + }, + { + "auxiliary_loss_clip": 0.01199305, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.0337863, + "balance_loss_mlp": 1.05560231, + "epoch": 0.06908161731549677, + "flos": 20558033418240.0, + "grad_norm": 2.070658514783469, + "language_loss": 0.69185412, + "learning_rate": 3.9840001362064575e-06, + "loss": 0.71442747, + "num_input_tokens_seen": 24557105, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.4375, + "step": 1149, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01203307, + "auxiliary_loss_mlp": 0.0105502, + "balance_loss_clip": 1.0289495, + "balance_loss_mlp": 1.05679548, + "epoch": 0.06914174056816474, + "flos": 27563666313600.0, + "grad_norm": 2.828663566846353, + "language_loss": 0.84069788, + "learning_rate": 3.983950933985064e-06, + "loss": 0.86328113, + "num_input_tokens_seen": 24578240, + "router_z_loss_clip": 0.25976562, + "router_z_loss_mlp": 1.4609375, + "step": 1150, + "time_per_iteration": 2.509122371673584 + }, + { + "auxiliary_loss_clip": 0.01206199, + "auxiliary_loss_mlp": 0.01058671, + "balance_loss_clip": 1.03453135, + "balance_loss_mlp": 1.06116164, + "epoch": 0.06920186382083271, + "flos": 15304015249920.0, + "grad_norm": 3.57752822218259, + "language_loss": 0.82044697, + "learning_rate": 3.983901656532052e-06, + "loss": 0.84309566, + "num_input_tokens_seen": 24593585, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.453125, + "step": 1151, + "time_per_iteration": 2.420128345489502 + }, + { + "auxiliary_loss_clip": 0.01201904, + "auxiliary_loss_mlp": 0.01062248, + "balance_loss_clip": 1.03883505, + "balance_loss_mlp": 1.06011868, + "epoch": 0.06926198707350067, + "flos": 25191227894400.0, + "grad_norm": 1.8279979065740934, + "language_loss": 0.85587418, + "learning_rate": 3.983852303849291e-06, + "loss": 0.87851566, + "num_input_tokens_seen": 24613110, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.4140625, + "step": 1152, + "time_per_iteration": 2.498180866241455 + }, + { + "auxiliary_loss_clip": 0.01198565, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03747797, + "balance_loss_mlp": 1.05767703, + "epoch": 0.06932211032616864, + "flos": 13256137146240.0, + "grad_norm": 2.1251557516582995, + "language_loss": 0.90536988, + "learning_rate": 3.983802875938651e-06, + "loss": 0.92796487, + "num_input_tokens_seen": 24628795, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1153, + "time_per_iteration": 2.422480821609497 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01054623, + "balance_loss_clip": 1.03035152, + "balance_loss_mlp": 1.05790865, + "epoch": 0.06938223357883662, + "flos": 24827381078400.0, + "grad_norm": 2.190017778582164, + "language_loss": 0.81363368, + "learning_rate": 3.983753372802008e-06, + "loss": 0.83618748, + "num_input_tokens_seen": 24645480, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.4296875, + "step": 1154, + "time_per_iteration": 2.528118848800659 + }, + { + "auxiliary_loss_clip": 0.01202754, + "auxiliary_loss_mlp": 0.01068044, + "balance_loss_clip": 1.04476249, + "balance_loss_mlp": 1.06078768, + "epoch": 0.06944235683150458, + "flos": 27267977554560.0, + "grad_norm": 32.79102955334026, + "language_loss": 0.7560131, + "learning_rate": 3.983703794441237e-06, + "loss": 0.77872109, + "num_input_tokens_seen": 24664630, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.421875, + "step": 1155, + "time_per_iteration": 2.5010287761688232 + }, + { + "auxiliary_loss_clip": 0.01196108, + "auxiliary_loss_mlp": 0.01059268, + "balance_loss_clip": 1.03595114, + "balance_loss_mlp": 1.05511975, + "epoch": 0.06950248008417255, + "flos": 25808065176960.0, + "grad_norm": 1.6800097473238784, + "language_loss": 0.71119213, + "learning_rate": 3.98365414085822e-06, + "loss": 0.73374593, + "num_input_tokens_seen": 24684210, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1156, + "time_per_iteration": 2.5034549236297607 + }, + { + "auxiliary_loss_clip": 0.01199728, + "auxiliary_loss_mlp": 0.01069841, + "balance_loss_clip": 1.04437828, + "balance_loss_mlp": 1.05711889, + "epoch": 0.06956260333684053, + "flos": 22271546793600.0, + "grad_norm": 2.0301788984863918, + "language_loss": 0.75299567, + "learning_rate": 3.98360441205484e-06, + "loss": 0.77569139, + "num_input_tokens_seen": 24702490, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1157, + "time_per_iteration": 2.4654574394226074 + }, + { + "auxiliary_loss_clip": 0.0119867, + "auxiliary_loss_mlp": 0.0105715, + "balance_loss_clip": 1.03240204, + "balance_loss_mlp": 1.0551796, + "epoch": 0.0696227265895085, + "flos": 29681390413440.0, + "grad_norm": 1.6687264459000366, + "language_loss": 0.71895158, + "learning_rate": 3.983554608032982e-06, + "loss": 0.7415098, + "num_input_tokens_seen": 24724340, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.4375, + "step": 1158, + "time_per_iteration": 2.53495454788208 + }, + { + "auxiliary_loss_clip": 0.01202231, + "auxiliary_loss_mlp": 0.01063046, + "balance_loss_clip": 1.03764284, + "balance_loss_mlp": 1.05718327, + "epoch": 0.06968284984217646, + "flos": 25523545547520.0, + "grad_norm": 1.9777890540291267, + "language_loss": 0.79796576, + "learning_rate": 3.983504728794533e-06, + "loss": 0.82061857, + "num_input_tokens_seen": 24745550, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.453125, + "step": 1159, + "time_per_iteration": 2.511402130126953 + }, + { + "auxiliary_loss_clip": 0.01205534, + "auxiliary_loss_mlp": 0.0106614, + "balance_loss_clip": 1.03938961, + "balance_loss_mlp": 1.05860782, + "epoch": 0.06974297309484444, + "flos": 20698192287360.0, + "grad_norm": 5.094070474761981, + "language_loss": 0.810929, + "learning_rate": 3.983454774341387e-06, + "loss": 0.83364576, + "num_input_tokens_seen": 24762575, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.46875, + "step": 1160, + "time_per_iteration": 2.4580883979797363 + }, + { + "auxiliary_loss_clip": 0.01197544, + "auxiliary_loss_mlp": 0.01059119, + "balance_loss_clip": 1.03373909, + "balance_loss_mlp": 1.05382752, + "epoch": 0.0698030963475124, + "flos": 26505199313280.0, + "grad_norm": 1.8746427931419856, + "language_loss": 0.75958532, + "learning_rate": 3.983404744675437e-06, + "loss": 0.78215194, + "num_input_tokens_seen": 24782605, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1161, + "time_per_iteration": 2.5046370029449463 + }, + { + "auxiliary_loss_clip": 0.01195466, + "auxiliary_loss_mlp": 0.01062077, + "balance_loss_clip": 1.03642368, + "balance_loss_mlp": 1.05299318, + "epoch": 0.06986321960018037, + "flos": 23040430346880.0, + "grad_norm": 1.806880077375887, + "language_loss": 0.8285073, + "learning_rate": 3.9833546397985794e-06, + "loss": 0.85108274, + "num_input_tokens_seen": 24802910, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1162, + "time_per_iteration": 2.4779040813446045 + }, + { + "auxiliary_loss_clip": 0.01193968, + "auxiliary_loss_mlp": 0.01055987, + "balance_loss_clip": 1.03172803, + "balance_loss_mlp": 1.05355024, + "epoch": 0.06992334285284833, + "flos": 28584822061440.0, + "grad_norm": 1.8779282806609423, + "language_loss": 0.79095101, + "learning_rate": 3.983304459712716e-06, + "loss": 0.81345057, + "num_input_tokens_seen": 24823305, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1163, + "time_per_iteration": 2.515899181365967 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.0106386, + "balance_loss_clip": 1.03728819, + "balance_loss_mlp": 1.05438375, + "epoch": 0.06998346610551631, + "flos": 20595344670720.0, + "grad_norm": 2.1142628107327233, + "language_loss": 0.79552305, + "learning_rate": 3.983254204419749e-06, + "loss": 0.81814498, + "num_input_tokens_seen": 24842155, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4375, + "step": 1164, + "time_per_iteration": 2.476428747177124 + }, + { + "auxiliary_loss_clip": 0.0119937, + "auxiliary_loss_mlp": 0.0106576, + "balance_loss_clip": 1.0401659, + "balance_loss_mlp": 1.05587661, + "epoch": 0.07004358935818428, + "flos": 22528810978560.0, + "grad_norm": 1.4863162511761774, + "language_loss": 0.73198837, + "learning_rate": 3.983203873921583e-06, + "loss": 0.75463963, + "num_input_tokens_seen": 24862080, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.4375, + "step": 1165, + "time_per_iteration": 2.5053012371063232 + }, + { + "auxiliary_loss_clip": 0.01196916, + "auxiliary_loss_mlp": 0.01056731, + "balance_loss_clip": 1.03225732, + "balance_loss_mlp": 1.05550849, + "epoch": 0.07010371261085224, + "flos": 28949997680640.0, + "grad_norm": 1.690867173089168, + "language_loss": 0.81019437, + "learning_rate": 3.983153468220128e-06, + "loss": 0.83273077, + "num_input_tokens_seen": 24886165, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.4140625, + "step": 1166, + "time_per_iteration": 2.5378963947296143 + }, + { + "auxiliary_loss_clip": 0.01194011, + "auxiliary_loss_mlp": 0.0104974, + "balance_loss_clip": 1.02452731, + "balance_loss_mlp": 1.0534389, + "epoch": 0.07016383586352022, + "flos": 23659171050240.0, + "grad_norm": 4.886682439277329, + "language_loss": 0.84443307, + "learning_rate": 3.983102987317295e-06, + "loss": 0.86687052, + "num_input_tokens_seen": 24905775, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1167, + "time_per_iteration": 2.5244622230529785 + }, + { + "auxiliary_loss_clip": 0.01201364, + "auxiliary_loss_mlp": 0.01057063, + "balance_loss_clip": 1.03188586, + "balance_loss_mlp": 1.05693448, + "epoch": 0.07022395911618819, + "flos": 19792130693760.0, + "grad_norm": 3.687845484368313, + "language_loss": 0.89423364, + "learning_rate": 3.983052431214997e-06, + "loss": 0.9168179, + "num_input_tokens_seen": 24924295, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4453125, + "step": 1168, + "time_per_iteration": 2.49411678314209 + }, + { + "auxiliary_loss_clip": 0.01203973, + "auxiliary_loss_mlp": 0.01068913, + "balance_loss_clip": 1.04078007, + "balance_loss_mlp": 1.05737031, + "epoch": 0.07028408236885615, + "flos": 21689147675520.0, + "grad_norm": 2.629371766417224, + "language_loss": 0.88661098, + "learning_rate": 3.983001799915153e-06, + "loss": 0.9093399, + "num_input_tokens_seen": 24943210, + "router_z_loss_clip": 0.28125, + "router_z_loss_mlp": 1.46875, + "step": 1169, + "time_per_iteration": 2.4795143604278564 + }, + { + "auxiliary_loss_clip": 0.01203226, + "auxiliary_loss_mlp": 0.01069625, + "balance_loss_clip": 1.04397118, + "balance_loss_mlp": 1.05864179, + "epoch": 0.07034420562152413, + "flos": 25630271832960.0, + "grad_norm": 2.0154006947860705, + "language_loss": 0.84000075, + "learning_rate": 3.982951093419681e-06, + "loss": 0.86272925, + "num_input_tokens_seen": 24960360, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4375, + "step": 1170, + "time_per_iteration": 2.501016616821289 + }, + { + "auxiliary_loss_clip": 0.01199625, + "auxiliary_loss_mlp": 0.01064997, + "balance_loss_clip": 1.03860402, + "balance_loss_mlp": 1.05753505, + "epoch": 0.0704043288741921, + "flos": 20810449267200.0, + "grad_norm": 1.945268169582358, + "language_loss": 0.75220597, + "learning_rate": 3.982900311730506e-06, + "loss": 0.77485222, + "num_input_tokens_seen": 24978290, + "router_z_loss_clip": 0.26367188, + "router_z_loss_mlp": 1.421875, + "step": 1171, + "time_per_iteration": 2.4456748962402344 + }, + { + "auxiliary_loss_clip": 0.01199689, + "auxiliary_loss_mlp": 0.01058158, + "balance_loss_clip": 1.03393483, + "balance_loss_mlp": 1.05765915, + "epoch": 0.07046445212686006, + "flos": 25593176062080.0, + "grad_norm": 3.2481396571627923, + "language_loss": 0.88848841, + "learning_rate": 3.9828494548495514e-06, + "loss": 0.91106689, + "num_input_tokens_seen": 24997055, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1172, + "time_per_iteration": 2.4970321655273438 + }, + { + "auxiliary_loss_clip": 0.01202846, + "auxiliary_loss_mlp": 0.01053059, + "balance_loss_clip": 1.02776241, + "balance_loss_mlp": 1.05584753, + "epoch": 0.07052457537952803, + "flos": 25556978131200.0, + "grad_norm": 1.6229718682058278, + "language_loss": 0.8212136, + "learning_rate": 3.982798522778748e-06, + "loss": 0.84377271, + "num_input_tokens_seen": 25017490, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.46875, + "step": 1173, + "time_per_iteration": 2.485822916030884 + }, + { + "auxiliary_loss_clip": 0.01200818, + "auxiliary_loss_mlp": 0.01061183, + "balance_loss_clip": 1.03574347, + "balance_loss_mlp": 1.05786848, + "epoch": 0.070584698632196, + "flos": 17968515154560.0, + "grad_norm": 2.056745883983527, + "language_loss": 0.81825697, + "learning_rate": 3.9827475155200245e-06, + "loss": 0.840877, + "num_input_tokens_seen": 25035660, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4296875, + "step": 1174, + "time_per_iteration": 2.4564759731292725 + }, + { + "auxiliary_loss_clip": 0.01199791, + "auxiliary_loss_mlp": 0.01060254, + "balance_loss_clip": 1.03538728, + "balance_loss_mlp": 1.0569849, + "epoch": 0.07064482188486397, + "flos": 25370888745600.0, + "grad_norm": 1.925446476900023, + "language_loss": 0.8511939, + "learning_rate": 3.982696433075317e-06, + "loss": 0.87379438, + "num_input_tokens_seen": 25054785, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.421875, + "step": 1175, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.0120243, + "auxiliary_loss_mlp": 0.01067762, + "balance_loss_clip": 1.04362202, + "balance_loss_mlp": 1.05922508, + "epoch": 0.07070494513753194, + "flos": 24899848767360.0, + "grad_norm": 1.9716433558257507, + "language_loss": 0.8303746, + "learning_rate": 3.982645275446563e-06, + "loss": 0.85307658, + "num_input_tokens_seen": 25075180, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.4375, + "step": 1176, + "time_per_iteration": 2.511456251144409 + }, + { + "auxiliary_loss_clip": 0.01197689, + "auxiliary_loss_mlp": 0.01061097, + "balance_loss_clip": 1.03528786, + "balance_loss_mlp": 1.05717707, + "epoch": 0.07076506839019991, + "flos": 22338447874560.0, + "grad_norm": 2.3318965992312, + "language_loss": 0.74563694, + "learning_rate": 3.982594042635701e-06, + "loss": 0.76822478, + "num_input_tokens_seen": 25093035, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.40625, + "step": 1177, + "time_per_iteration": 2.460102081298828 + }, + { + "auxiliary_loss_clip": 0.01207406, + "auxiliary_loss_mlp": 0.01058724, + "balance_loss_clip": 1.033476, + "balance_loss_mlp": 1.06167924, + "epoch": 0.07082519164286788, + "flos": 18660800954880.0, + "grad_norm": 2.2206541819979995, + "language_loss": 0.86031914, + "learning_rate": 3.982542734644673e-06, + "loss": 0.88298053, + "num_input_tokens_seen": 25112520, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4609375, + "step": 1178, + "time_per_iteration": 2.4605627059936523 + }, + { + "auxiliary_loss_clip": 0.01087089, + "auxiliary_loss_mlp": 0.01007975, + "balance_loss_clip": 1.00349271, + "balance_loss_mlp": 1.02766943, + "epoch": 0.07088531489553584, + "flos": 63654107610240.0, + "grad_norm": 0.8386980392448491, + "language_loss": 0.63242435, + "learning_rate": 3.982491351475427e-06, + "loss": 0.65337497, + "num_input_tokens_seen": 25177760, + "router_z_loss_clip": 0.04492188, + "router_z_loss_mlp": 0.59375, + "step": 1179, + "time_per_iteration": 3.156688690185547 + }, + { + "auxiliary_loss_clip": 0.01207076, + "auxiliary_loss_mlp": 0.01062734, + "balance_loss_clip": 1.03886819, + "balance_loss_mlp": 1.06038809, + "epoch": 0.07094543814820382, + "flos": 21572688804480.0, + "grad_norm": 2.3853497849810945, + "language_loss": 0.83326972, + "learning_rate": 3.98243989312991e-06, + "loss": 0.85596782, + "num_input_tokens_seen": 25195260, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.46875, + "step": 1180, + "time_per_iteration": 2.4823896884918213 + }, + { + "auxiliary_loss_clip": 0.01200915, + "auxiliary_loss_mlp": 0.01065839, + "balance_loss_clip": 1.04087663, + "balance_loss_mlp": 1.05910683, + "epoch": 0.07100556140087179, + "flos": 22089946608000.0, + "grad_norm": 2.1921067510196446, + "language_loss": 0.88595563, + "learning_rate": 3.982388359610074e-06, + "loss": 0.90862316, + "num_input_tokens_seen": 25212740, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.421875, + "step": 1181, + "time_per_iteration": 2.505908727645874 + }, + { + "auxiliary_loss_clip": 0.01200757, + "auxiliary_loss_mlp": 0.01060636, + "balance_loss_clip": 1.03607869, + "balance_loss_mlp": 1.05944347, + "epoch": 0.07106568465353975, + "flos": 47922286400640.0, + "grad_norm": 2.2303634282095257, + "language_loss": 0.83314365, + "learning_rate": 3.9823367509178725e-06, + "loss": 0.85575759, + "num_input_tokens_seen": 25236420, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.4140625, + "step": 1182, + "time_per_iteration": 2.7283623218536377 + }, + { + "auxiliary_loss_clip": 0.01199287, + "auxiliary_loss_mlp": 0.01065564, + "balance_loss_clip": 1.04006529, + "balance_loss_mlp": 1.06100821, + "epoch": 0.07112580790620772, + "flos": 23440798316160.0, + "grad_norm": 2.671395976555463, + "language_loss": 0.7925818, + "learning_rate": 3.982285067055262e-06, + "loss": 0.81523037, + "num_input_tokens_seen": 25255120, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3828125, + "step": 1183, + "time_per_iteration": 2.5057172775268555 + }, + { + "auxiliary_loss_clip": 0.01201972, + "auxiliary_loss_mlp": 0.01059167, + "balance_loss_clip": 1.03441906, + "balance_loss_mlp": 1.05550563, + "epoch": 0.0711859311588757, + "flos": 31868888682240.0, + "grad_norm": 2.6492838430830963, + "language_loss": 0.78910172, + "learning_rate": 3.982233308024204e-06, + "loss": 0.8117131, + "num_input_tokens_seen": 25275150, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.46875, + "step": 1184, + "time_per_iteration": 5.494150638580322 + }, + { + "auxiliary_loss_clip": 0.01196982, + "auxiliary_loss_mlp": 0.01057128, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.05884266, + "epoch": 0.07124605441154366, + "flos": 19610315026560.0, + "grad_norm": 2.546293211356889, + "language_loss": 0.7696892, + "learning_rate": 3.98218147382666e-06, + "loss": 0.79223031, + "num_input_tokens_seen": 25293680, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.375, + "step": 1185, + "time_per_iteration": 3.8873486518859863 + }, + { + "auxiliary_loss_clip": 0.01200052, + "auxiliary_loss_mlp": 0.01065088, + "balance_loss_clip": 1.0408771, + "balance_loss_mlp": 1.05808377, + "epoch": 0.07130617766421163, + "flos": 14684448533760.0, + "grad_norm": 2.519913974657541, + "language_loss": 0.65896261, + "learning_rate": 3.982129564464596e-06, + "loss": 0.68161404, + "num_input_tokens_seen": 25310050, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.421875, + "step": 1186, + "time_per_iteration": 2.44986891746521 + }, + { + "auxiliary_loss_clip": 0.01198722, + "auxiliary_loss_mlp": 0.01056267, + "balance_loss_clip": 1.03234124, + "balance_loss_mlp": 1.05906928, + "epoch": 0.07136630091687961, + "flos": 26067915141120.0, + "grad_norm": 2.0047668871213205, + "language_loss": 0.69673246, + "learning_rate": 3.98207757993998e-06, + "loss": 0.71928233, + "num_input_tokens_seen": 25331020, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3984375, + "step": 1187, + "time_per_iteration": 2.517432451248169 + }, + { + "auxiliary_loss_clip": 0.01194056, + "auxiliary_loss_mlp": 0.01059861, + "balance_loss_clip": 1.03713942, + "balance_loss_mlp": 1.05690861, + "epoch": 0.07142642416954757, + "flos": 15669190869120.0, + "grad_norm": 2.6848541171122307, + "language_loss": 0.78598166, + "learning_rate": 3.9820255202547845e-06, + "loss": 0.80852079, + "num_input_tokens_seen": 25347875, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.375, + "step": 1188, + "time_per_iteration": 2.4682350158691406 + }, + { + "auxiliary_loss_clip": 0.01197809, + "auxiliary_loss_mlp": 0.01056931, + "balance_loss_clip": 1.03282666, + "balance_loss_mlp": 1.0588758, + "epoch": 0.07148654742221554, + "flos": 19755322231680.0, + "grad_norm": 2.0343008635273834, + "language_loss": 0.84854662, + "learning_rate": 3.981973385410981e-06, + "loss": 0.87109399, + "num_input_tokens_seen": 25366715, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.390625, + "step": 1189, + "time_per_iteration": 2.451464891433716 + }, + { + "auxiliary_loss_clip": 0.01193617, + "auxiliary_loss_mlp": 0.01062112, + "balance_loss_clip": 1.03707767, + "balance_loss_mlp": 1.05589187, + "epoch": 0.07154667067488352, + "flos": 23471824688640.0, + "grad_norm": 1.7193907035784557, + "language_loss": 0.77021295, + "learning_rate": 3.9819211754105494e-06, + "loss": 0.79277021, + "num_input_tokens_seen": 25385450, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.375, + "step": 1190, + "time_per_iteration": 2.5028254985809326 + }, + { + "auxiliary_loss_clip": 0.01200514, + "auxiliary_loss_mlp": 0.01065982, + "balance_loss_clip": 1.04018509, + "balance_loss_mlp": 1.0585537, + "epoch": 0.07160679392755148, + "flos": 18332936588160.0, + "grad_norm": 2.3385605637591302, + "language_loss": 0.75145626, + "learning_rate": 3.981868890255468e-06, + "loss": 0.77412122, + "num_input_tokens_seen": 25403940, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1191, + "time_per_iteration": 2.461045980453491 + }, + { + "auxiliary_loss_clip": 0.0119767, + "auxiliary_loss_mlp": 0.01056581, + "balance_loss_clip": 1.03147578, + "balance_loss_mlp": 1.05730891, + "epoch": 0.07166691718021945, + "flos": 17747017937280.0, + "grad_norm": 3.3332115059632583, + "language_loss": 0.7360636, + "learning_rate": 3.981816529947719e-06, + "loss": 0.75860614, + "num_input_tokens_seen": 25420410, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1192, + "time_per_iteration": 2.4944753646850586 + }, + { + "auxiliary_loss_clip": 0.01194068, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02884293, + "balance_loss_mlp": 1.05358601, + "epoch": 0.07172704043288743, + "flos": 22451925916800.0, + "grad_norm": 2.1652973689026176, + "language_loss": 0.7830255, + "learning_rate": 3.9817640944892896e-06, + "loss": 0.80548704, + "num_input_tokens_seen": 25439415, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.40625, + "step": 1193, + "time_per_iteration": 2.487025737762451 + }, + { + "auxiliary_loss_clip": 0.01202609, + "auxiliary_loss_mlp": 0.01053593, + "balance_loss_clip": 1.02786815, + "balance_loss_mlp": 1.06034899, + "epoch": 0.07178716368555539, + "flos": 23222210100480.0, + "grad_norm": 1.9678931818636167, + "language_loss": 0.85748619, + "learning_rate": 3.981711583882166e-06, + "loss": 0.88004816, + "num_input_tokens_seen": 25458715, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.421875, + "step": 1194, + "time_per_iteration": 2.493823766708374 + }, + { + "auxiliary_loss_clip": 0.01197363, + "auxiliary_loss_mlp": 0.0106262, + "balance_loss_clip": 1.03886151, + "balance_loss_mlp": 1.05782473, + "epoch": 0.07184728693822336, + "flos": 25150828072320.0, + "grad_norm": 1.9701258602591958, + "language_loss": 0.81425989, + "learning_rate": 3.981658998128341e-06, + "loss": 0.83685976, + "num_input_tokens_seen": 25477985, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3984375, + "step": 1195, + "time_per_iteration": 2.5168802738189697 + }, + { + "auxiliary_loss_clip": 0.01195742, + "auxiliary_loss_mlp": 0.01051594, + "balance_loss_clip": 1.02979064, + "balance_loss_mlp": 1.05720496, + "epoch": 0.07190741019089132, + "flos": 22711237176960.0, + "grad_norm": 1.9269272748189905, + "language_loss": 0.79917538, + "learning_rate": 3.981606337229808e-06, + "loss": 0.82164884, + "num_input_tokens_seen": 25497110, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3828125, + "step": 1196, + "time_per_iteration": 2.4749536514282227 + }, + { + "auxiliary_loss_clip": 0.01193553, + "auxiliary_loss_mlp": 0.01069477, + "balance_loss_clip": 1.04418063, + "balance_loss_mlp": 1.05655897, + "epoch": 0.0719675334435593, + "flos": 29349791032320.0, + "grad_norm": 8.862292558474625, + "language_loss": 0.71015084, + "learning_rate": 3.9815536011885655e-06, + "loss": 0.73278111, + "num_input_tokens_seen": 25516555, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3671875, + "step": 1197, + "time_per_iteration": 2.520514726638794 + }, + { + "auxiliary_loss_clip": 0.01192449, + "auxiliary_loss_mlp": 0.01052157, + "balance_loss_clip": 1.02845871, + "balance_loss_mlp": 1.05429292, + "epoch": 0.07202765669622727, + "flos": 17639788861440.0, + "grad_norm": 2.0584524946763767, + "language_loss": 0.86034989, + "learning_rate": 3.98150079000661e-06, + "loss": 0.88279593, + "num_input_tokens_seen": 25533895, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3828125, + "step": 1198, + "time_per_iteration": 2.441458225250244 + }, + { + "auxiliary_loss_clip": 0.01194875, + "auxiliary_loss_mlp": 0.01061206, + "balance_loss_clip": 1.03724504, + "balance_loss_mlp": 1.05664325, + "epoch": 0.07208777994889523, + "flos": 21434038306560.0, + "grad_norm": 2.7240513490380307, + "language_loss": 0.83822477, + "learning_rate": 3.981447903685947e-06, + "loss": 0.8607856, + "num_input_tokens_seen": 25554195, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3828125, + "step": 1199, + "time_per_iteration": 2.462790012359619 + }, + { + "auxiliary_loss_clip": 0.01201627, + "auxiliary_loss_mlp": 0.01055923, + "balance_loss_clip": 1.03351128, + "balance_loss_mlp": 1.06159616, + "epoch": 0.07214790320156321, + "flos": 26940867373440.0, + "grad_norm": 2.0725431151836453, + "language_loss": 0.76464498, + "learning_rate": 3.981394942228581e-06, + "loss": 0.78722042, + "num_input_tokens_seen": 25574155, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3984375, + "step": 1200, + "time_per_iteration": 2.5007636547088623 + }, + { + "auxiliary_loss_clip": 0.01196382, + "auxiliary_loss_mlp": 0.01061794, + "balance_loss_clip": 1.0376662, + "balance_loss_mlp": 1.05783701, + "epoch": 0.07220802645423118, + "flos": 23879949995520.0, + "grad_norm": 1.959995672067427, + "language_loss": 0.82965535, + "learning_rate": 3.98134190563652e-06, + "loss": 0.85223711, + "num_input_tokens_seen": 25592735, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.390625, + "step": 1201, + "time_per_iteration": 2.4968512058258057 + }, + { + "auxiliary_loss_clip": 0.01198607, + "auxiliary_loss_mlp": 0.01059493, + "balance_loss_clip": 1.03372014, + "balance_loss_mlp": 1.05568862, + "epoch": 0.07226814970689914, + "flos": 19243631036160.0, + "grad_norm": 2.411287508312223, + "language_loss": 0.69041032, + "learning_rate": 3.981288793911775e-06, + "loss": 0.71299136, + "num_input_tokens_seen": 25611510, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.4296875, + "step": 1202, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01196785, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03804839, + "balance_loss_mlp": 1.05721354, + "epoch": 0.07232827295956712, + "flos": 19172025273600.0, + "grad_norm": 1.9411904343348254, + "language_loss": 0.87723774, + "learning_rate": 3.98123560705636e-06, + "loss": 0.89984161, + "num_input_tokens_seen": 25629560, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3984375, + "step": 1203, + "time_per_iteration": 2.449903964996338 + }, + { + "auxiliary_loss_clip": 0.01198028, + "auxiliary_loss_mlp": 0.01061987, + "balance_loss_clip": 1.03803837, + "balance_loss_mlp": 1.0546416, + "epoch": 0.07238839621223508, + "flos": 17639752947840.0, + "grad_norm": 1.819852916387131, + "language_loss": 0.7844671, + "learning_rate": 3.981182345072293e-06, + "loss": 0.80706728, + "num_input_tokens_seen": 25648330, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.4375, + "step": 1204, + "time_per_iteration": 2.449265480041504 + }, + { + "auxiliary_loss_clip": 0.01194984, + "auxiliary_loss_mlp": 0.01062187, + "balance_loss_clip": 1.0388217, + "balance_loss_mlp": 1.05605316, + "epoch": 0.07244851946490305, + "flos": 28292401440000.0, + "grad_norm": 1.8514893306986777, + "language_loss": 0.81960398, + "learning_rate": 3.981129007961593e-06, + "loss": 0.84217566, + "num_input_tokens_seen": 25669470, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.390625, + "step": 1205, + "time_per_iteration": 2.517423629760742 + }, + { + "auxiliary_loss_clip": 0.01199989, + "auxiliary_loss_mlp": 0.01067422, + "balance_loss_clip": 1.04250705, + "balance_loss_mlp": 1.05852747, + "epoch": 0.07250864271757101, + "flos": 22564829341440.0, + "grad_norm": 2.0830735488163254, + "language_loss": 0.76702261, + "learning_rate": 3.981075595726283e-06, + "loss": 0.78969669, + "num_input_tokens_seen": 25690470, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4140625, + "step": 1206, + "time_per_iteration": 2.489978313446045 + }, + { + "auxiliary_loss_clip": 0.01193529, + "auxiliary_loss_mlp": 0.01055273, + "balance_loss_clip": 1.03071594, + "balance_loss_mlp": 1.05481935, + "epoch": 0.072568765970239, + "flos": 21762405463680.0, + "grad_norm": 1.8430962541821914, + "language_loss": 0.77246201, + "learning_rate": 3.981022108368387e-06, + "loss": 0.79495007, + "num_input_tokens_seen": 25709205, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3828125, + "step": 1207, + "time_per_iteration": 2.4895267486572266 + }, + { + "auxiliary_loss_clip": 0.01194673, + "auxiliary_loss_mlp": 0.01049476, + "balance_loss_clip": 1.02816105, + "balance_loss_mlp": 1.05703962, + "epoch": 0.07262888922290696, + "flos": 25519702792320.0, + "grad_norm": 5.768853045708734, + "language_loss": 0.79723513, + "learning_rate": 3.9809685458899345e-06, + "loss": 0.81967664, + "num_input_tokens_seen": 25728485, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1208, + "time_per_iteration": 2.509073495864868 + }, + { + "auxiliary_loss_clip": 0.0119292, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03204679, + "balance_loss_mlp": 1.05551386, + "epoch": 0.07268901247557492, + "flos": 21246548290560.0, + "grad_norm": 3.6873449148768063, + "language_loss": 0.78595626, + "learning_rate": 3.980914908292955e-06, + "loss": 0.80843151, + "num_input_tokens_seen": 25747730, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.375, + "step": 1209, + "time_per_iteration": 2.506157398223877 + }, + { + "auxiliary_loss_clip": 0.01194158, + "auxiliary_loss_mlp": 0.01056209, + "balance_loss_clip": 1.03409529, + "balance_loss_mlp": 1.05510461, + "epoch": 0.0727491357282429, + "flos": 25479302970240.0, + "grad_norm": 2.6193169355932104, + "language_loss": 0.81117678, + "learning_rate": 3.980861195579486e-06, + "loss": 0.83368045, + "num_input_tokens_seen": 25768050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.390625, + "step": 1210, + "time_per_iteration": 2.4985666275024414 + }, + { + "auxiliary_loss_clip": 0.01192388, + "auxiliary_loss_mlp": 0.01061033, + "balance_loss_clip": 1.03688109, + "balance_loss_mlp": 1.0565064, + "epoch": 0.07280925898091087, + "flos": 24462169545600.0, + "grad_norm": 2.2378435782703834, + "language_loss": 0.84350932, + "learning_rate": 3.98080740775156e-06, + "loss": 0.86604351, + "num_input_tokens_seen": 25787985, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.359375, + "step": 1211, + "time_per_iteration": 2.4971728324890137 + }, + { + "auxiliary_loss_clip": 0.01189207, + "auxiliary_loss_mlp": 0.01051238, + "balance_loss_clip": 1.02931547, + "balance_loss_mlp": 1.05233216, + "epoch": 0.07286938223357883, + "flos": 18288191220480.0, + "grad_norm": 2.2910402501943516, + "language_loss": 0.90813953, + "learning_rate": 3.98075354481122e-06, + "loss": 0.9305439, + "num_input_tokens_seen": 25803620, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.375, + "step": 1212, + "time_per_iteration": 2.424874782562256 + }, + { + "auxiliary_loss_clip": 0.01191621, + "auxiliary_loss_mlp": 0.01051304, + "balance_loss_clip": 1.0286777, + "balance_loss_mlp": 1.05457211, + "epoch": 0.07292950548624681, + "flos": 21214803646080.0, + "grad_norm": 2.346480404505952, + "language_loss": 0.7238096, + "learning_rate": 3.9806996067605055e-06, + "loss": 0.74623883, + "num_input_tokens_seen": 25823315, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1213, + "time_per_iteration": 2.443542003631592 + }, + { + "auxiliary_loss_clip": 0.0119423, + "auxiliary_loss_mlp": 0.01051601, + "balance_loss_clip": 1.02848625, + "balance_loss_mlp": 1.05338192, + "epoch": 0.07298962873891478, + "flos": 24642009964800.0, + "grad_norm": 1.9141465843449694, + "language_loss": 0.84441102, + "learning_rate": 3.980645593601465e-06, + "loss": 0.86686933, + "num_input_tokens_seen": 25842605, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1214, + "time_per_iteration": 2.500112295150757 + }, + { + "auxiliary_loss_clip": 0.01197246, + "auxiliary_loss_mlp": 0.0105819, + "balance_loss_clip": 1.03468192, + "balance_loss_mlp": 1.05678558, + "epoch": 0.07304975199158274, + "flos": 27052765217280.0, + "grad_norm": 2.82775499028919, + "language_loss": 0.83929181, + "learning_rate": 3.980591505336144e-06, + "loss": 0.86184609, + "num_input_tokens_seen": 25863030, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.40625, + "step": 1215, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01194493, + "auxiliary_loss_mlp": 0.01061008, + "balance_loss_clip": 1.03711891, + "balance_loss_mlp": 1.05474758, + "epoch": 0.07310987524425071, + "flos": 33549544091520.0, + "grad_norm": 1.8082751516232567, + "language_loss": 0.80984753, + "learning_rate": 3.980537341966595e-06, + "loss": 0.83240259, + "num_input_tokens_seen": 25888015, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1216, + "time_per_iteration": 2.6036598682403564 + }, + { + "auxiliary_loss_clip": 0.01196444, + "auxiliary_loss_mlp": 0.01050548, + "balance_loss_clip": 1.02863717, + "balance_loss_mlp": 1.05746269, + "epoch": 0.07316999849691869, + "flos": 28110944908800.0, + "grad_norm": 2.8100743600713276, + "language_loss": 0.76112509, + "learning_rate": 3.980483103494872e-06, + "loss": 0.78359497, + "num_input_tokens_seen": 25908660, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1217, + "time_per_iteration": 2.513061046600342 + }, + { + "auxiliary_loss_clip": 0.01192952, + "auxiliary_loss_mlp": 0.01055183, + "balance_loss_clip": 1.0347029, + "balance_loss_mlp": 1.05546904, + "epoch": 0.07323012174958665, + "flos": 14392602529920.0, + "grad_norm": 2.0751842608938142, + "language_loss": 0.86442709, + "learning_rate": 3.98042878992303e-06, + "loss": 0.88690841, + "num_input_tokens_seen": 25927215, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.375, + "step": 1218, + "time_per_iteration": 2.4514572620391846 + }, + { + "auxiliary_loss_clip": 0.01193593, + "auxiliary_loss_mlp": 0.01062446, + "balance_loss_clip": 1.03989124, + "balance_loss_mlp": 1.05405331, + "epoch": 0.07329024500225462, + "flos": 21616428591360.0, + "grad_norm": 1.9036635750322874, + "language_loss": 0.86757988, + "learning_rate": 3.9803744012531305e-06, + "loss": 0.8901403, + "num_input_tokens_seen": 25945500, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.3984375, + "step": 1219, + "time_per_iteration": 2.4501893520355225 + }, + { + "auxiliary_loss_clip": 0.01190573, + "auxiliary_loss_mlp": 0.01058106, + "balance_loss_clip": 1.03654075, + "balance_loss_mlp": 1.05260015, + "epoch": 0.0733503682549226, + "flos": 13224141106560.0, + "grad_norm": 2.320539289810395, + "language_loss": 0.84721315, + "learning_rate": 3.980319937487235e-06, + "loss": 0.86969984, + "num_input_tokens_seen": 25963105, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.375, + "step": 1220, + "time_per_iteration": 2.4651544094085693 + }, + { + "auxiliary_loss_clip": 0.01193314, + "auxiliary_loss_mlp": 0.01062531, + "balance_loss_clip": 1.04015541, + "balance_loss_mlp": 1.05455709, + "epoch": 0.07341049150759056, + "flos": 20886975192960.0, + "grad_norm": 2.803787378453645, + "language_loss": 0.76840538, + "learning_rate": 3.98026539862741e-06, + "loss": 0.79096377, + "num_input_tokens_seen": 25981690, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.390625, + "step": 1221, + "time_per_iteration": 2.4643850326538086 + }, + { + "auxiliary_loss_clip": 0.01195957, + "auxiliary_loss_mlp": 0.01059407, + "balance_loss_clip": 1.0369482, + "balance_loss_mlp": 1.05698907, + "epoch": 0.07347061476025853, + "flos": 15413614623360.0, + "grad_norm": 4.111967976062365, + "language_loss": 0.92201889, + "learning_rate": 3.980210784675722e-06, + "loss": 0.94457251, + "num_input_tokens_seen": 25999890, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.390625, + "step": 1222, + "time_per_iteration": 2.461393117904663 + }, + { + "auxiliary_loss_clip": 0.01197113, + "auxiliary_loss_mlp": 0.01056347, + "balance_loss_clip": 1.03440046, + "balance_loss_mlp": 1.05795276, + "epoch": 0.0735307380129265, + "flos": 11108859131520.0, + "grad_norm": 2.739326433562924, + "language_loss": 0.91106719, + "learning_rate": 3.980156095634242e-06, + "loss": 0.9336018, + "num_input_tokens_seen": 26016445, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.390625, + "step": 1223, + "time_per_iteration": 2.4616212844848633 + }, + { + "auxiliary_loss_clip": 0.01194512, + "auxiliary_loss_mlp": 0.01071192, + "balance_loss_clip": 1.04895926, + "balance_loss_mlp": 1.05628467, + "epoch": 0.07359086126559447, + "flos": 23732392924800.0, + "grad_norm": 2.5538951271380395, + "language_loss": 0.81946027, + "learning_rate": 3.980101331505045e-06, + "loss": 0.84211743, + "num_input_tokens_seen": 26036080, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3828125, + "step": 1224, + "time_per_iteration": 2.555060386657715 + }, + { + "auxiliary_loss_clip": 0.01191919, + "auxiliary_loss_mlp": 0.01053854, + "balance_loss_clip": 1.02938056, + "balance_loss_mlp": 1.05385065, + "epoch": 0.07365098451826244, + "flos": 20993270515200.0, + "grad_norm": 2.209826315991058, + "language_loss": 0.83313572, + "learning_rate": 3.9800464922902076e-06, + "loss": 0.8555935, + "num_input_tokens_seen": 26055805, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.375, + "step": 1225, + "time_per_iteration": 2.5317656993865967 + }, + { + "auxiliary_loss_clip": 0.01194191, + "auxiliary_loss_mlp": 0.0105208, + "balance_loss_clip": 1.0300144, + "balance_loss_mlp": 1.05566537, + "epoch": 0.0737111077709304, + "flos": 19933582452480.0, + "grad_norm": 2.0864455990649144, + "language_loss": 0.9037565, + "learning_rate": 3.979991577991808e-06, + "loss": 0.92621917, + "num_input_tokens_seen": 26073905, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3828125, + "step": 1226, + "time_per_iteration": 5.374137878417969 + }, + { + "auxiliary_loss_clip": 0.01201048, + "auxiliary_loss_mlp": 0.01048748, + "balance_loss_clip": 1.02451301, + "balance_loss_mlp": 1.05401981, + "epoch": 0.07377123102359838, + "flos": 16581537342720.0, + "grad_norm": 2.8833434676543, + "language_loss": 0.76944947, + "learning_rate": 3.97993658861193e-06, + "loss": 0.79194742, + "num_input_tokens_seen": 26091700, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.46875, + "step": 1227, + "time_per_iteration": 2.4528942108154297 + }, + { + "auxiliary_loss_clip": 0.01192324, + "auxiliary_loss_mlp": 0.01049537, + "balance_loss_clip": 1.02720916, + "balance_loss_mlp": 1.05810142, + "epoch": 0.07383135427626634, + "flos": 28328563457280.0, + "grad_norm": 1.6041059240123434, + "language_loss": 0.85634637, + "learning_rate": 3.9798815241526575e-06, + "loss": 0.87876499, + "num_input_tokens_seen": 26114105, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.34375, + "step": 1228, + "time_per_iteration": 2.5452229976654053 + }, + { + "auxiliary_loss_clip": 0.01194537, + "auxiliary_loss_mlp": 0.01061009, + "balance_loss_clip": 1.0383954, + "balance_loss_mlp": 1.05448794, + "epoch": 0.07389147752893431, + "flos": 20047168235520.0, + "grad_norm": 4.251776538682485, + "language_loss": 0.79688829, + "learning_rate": 3.97982638461608e-06, + "loss": 0.81944382, + "num_input_tokens_seen": 26131165, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3984375, + "step": 1229, + "time_per_iteration": 2.501086711883545 + }, + { + "auxiliary_loss_clip": 0.01196019, + "auxiliary_loss_mlp": 0.01061374, + "balance_loss_clip": 1.03777039, + "balance_loss_mlp": 1.05632436, + "epoch": 0.07395160078160229, + "flos": 18114132890880.0, + "grad_norm": 2.028375336194412, + "language_loss": 0.78218549, + "learning_rate": 3.979771170004287e-06, + "loss": 0.8047595, + "num_input_tokens_seen": 26150040, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3984375, + "step": 1230, + "time_per_iteration": 2.4474098682403564 + }, + { + "auxiliary_loss_clip": 0.01193092, + "auxiliary_loss_mlp": 0.01048754, + "balance_loss_clip": 1.02554393, + "balance_loss_mlp": 1.05599403, + "epoch": 0.07401172403427025, + "flos": 23586918842880.0, + "grad_norm": 1.924374124094053, + "language_loss": 0.81301343, + "learning_rate": 3.979715880319372e-06, + "loss": 0.83543187, + "num_input_tokens_seen": 26169380, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1231, + "time_per_iteration": 2.4861042499542236 + }, + { + "auxiliary_loss_clip": 0.01198041, + "auxiliary_loss_mlp": 0.01066474, + "balance_loss_clip": 1.04277539, + "balance_loss_mlp": 1.05443811, + "epoch": 0.07407184728693822, + "flos": 26359904799360.0, + "grad_norm": 2.4882746298902343, + "language_loss": 0.95111585, + "learning_rate": 3.979660515563434e-06, + "loss": 0.97376096, + "num_input_tokens_seen": 26189420, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.4375, + "step": 1232, + "time_per_iteration": 2.5074143409729004 + }, + { + "auxiliary_loss_clip": 0.01194092, + "auxiliary_loss_mlp": 0.01060623, + "balance_loss_clip": 1.03938031, + "balance_loss_mlp": 1.05667329, + "epoch": 0.0741319705396062, + "flos": 22200443821440.0, + "grad_norm": 2.246534337547551, + "language_loss": 0.80640733, + "learning_rate": 3.979605075738569e-06, + "loss": 0.82895458, + "num_input_tokens_seen": 26209300, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.375, + "step": 1233, + "time_per_iteration": 2.490816831588745 + }, + { + "auxiliary_loss_clip": 0.01198611, + "auxiliary_loss_mlp": 0.01060349, + "balance_loss_clip": 1.03488624, + "balance_loss_mlp": 1.05483365, + "epoch": 0.07419209379227416, + "flos": 39200482523520.0, + "grad_norm": 2.357402762223285, + "language_loss": 0.70458734, + "learning_rate": 3.979549560846883e-06, + "loss": 0.72717696, + "num_input_tokens_seen": 26228110, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.4375, + "step": 1234, + "time_per_iteration": 2.605139970779419 + }, + { + "auxiliary_loss_clip": 0.01195848, + "auxiliary_loss_mlp": 0.01059615, + "balance_loss_clip": 1.03665543, + "balance_loss_mlp": 1.05792761, + "epoch": 0.07425221704494213, + "flos": 22781657790720.0, + "grad_norm": 2.1034220776692765, + "language_loss": 0.77058101, + "learning_rate": 3.979493970890478e-06, + "loss": 0.79313564, + "num_input_tokens_seen": 26247020, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3828125, + "step": 1235, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01189622, + "auxiliary_loss_mlp": 0.01053872, + "balance_loss_clip": 1.03123438, + "balance_loss_mlp": 1.05414248, + "epoch": 0.0743123402976101, + "flos": 22272983337600.0, + "grad_norm": 5.584514149172867, + "language_loss": 0.82648033, + "learning_rate": 3.979438305871464e-06, + "loss": 0.84891528, + "num_input_tokens_seen": 26265750, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1236, + "time_per_iteration": 2.462069511413574 + }, + { + "auxiliary_loss_clip": 0.0119681, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03385794, + "balance_loss_mlp": 1.05572712, + "epoch": 0.07437246355027807, + "flos": 29315029645440.0, + "grad_norm": 2.2536643652174724, + "language_loss": 0.75702679, + "learning_rate": 3.979382565791951e-06, + "loss": 0.77956861, + "num_input_tokens_seen": 26287905, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1237, + "time_per_iteration": 2.5572054386138916 + }, + { + "auxiliary_loss_clip": 0.01192925, + "auxiliary_loss_mlp": 0.01060344, + "balance_loss_clip": 1.03817141, + "balance_loss_mlp": 1.05427146, + "epoch": 0.07443258680294604, + "flos": 31944732249600.0, + "grad_norm": 1.878495773650564, + "language_loss": 0.7740556, + "learning_rate": 3.979326750654053e-06, + "loss": 0.7965883, + "num_input_tokens_seen": 26311795, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.3828125, + "step": 1238, + "time_per_iteration": 2.5915493965148926 + }, + { + "auxiliary_loss_clip": 0.01198337, + "auxiliary_loss_mlp": 0.01055743, + "balance_loss_clip": 1.03222322, + "balance_loss_mlp": 1.05435395, + "epoch": 0.074492710055614, + "flos": 22675290641280.0, + "grad_norm": 2.0695087378138455, + "language_loss": 0.86322856, + "learning_rate": 3.9792708604598854e-06, + "loss": 0.88576937, + "num_input_tokens_seen": 26330330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.4375, + "step": 1239, + "time_per_iteration": 2.4961507320404053 + }, + { + "auxiliary_loss_clip": 0.01194884, + "auxiliary_loss_mlp": 0.01049072, + "balance_loss_clip": 1.02401412, + "balance_loss_mlp": 1.05433989, + "epoch": 0.07455283330828198, + "flos": 21284901037440.0, + "grad_norm": 2.179426429753772, + "language_loss": 0.89070082, + "learning_rate": 3.979214895211569e-06, + "loss": 0.91314042, + "num_input_tokens_seen": 26348865, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.40625, + "step": 1240, + "time_per_iteration": 2.456801176071167 + }, + { + "auxiliary_loss_clip": 0.01197473, + "auxiliary_loss_mlp": 0.01058447, + "balance_loss_clip": 1.03325772, + "balance_loss_mlp": 1.05600643, + "epoch": 0.07461295656094995, + "flos": 24388408967040.0, + "grad_norm": 2.2624482063672513, + "language_loss": 0.88586551, + "learning_rate": 3.979158854911225e-06, + "loss": 0.90842468, + "num_input_tokens_seen": 26368210, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.4140625, + "step": 1241, + "time_per_iteration": 2.5667178630828857 + }, + { + "auxiliary_loss_clip": 0.01080695, + "auxiliary_loss_mlp": 0.01022083, + "balance_loss_clip": 1.018507, + "balance_loss_mlp": 1.02113318, + "epoch": 0.07467307981361791, + "flos": 62109660574080.0, + "grad_norm": 0.9233978594431768, + "language_loss": 0.63032585, + "learning_rate": 3.979102739560979e-06, + "loss": 0.65135366, + "num_input_tokens_seen": 26424890, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.59375, + "step": 1242, + "time_per_iteration": 3.1321358680725098 + }, + { + "auxiliary_loss_clip": 0.012088, + "auxiliary_loss_mlp": 0.01059736, + "balance_loss_clip": 1.03305697, + "balance_loss_mlp": 1.05792046, + "epoch": 0.07473320306628589, + "flos": 24863148046080.0, + "grad_norm": 2.8956100556858004, + "language_loss": 0.62917286, + "learning_rate": 3.9790465491629595e-06, + "loss": 0.65185821, + "num_input_tokens_seen": 26446405, + "router_z_loss_clip": 0.26757812, + "router_z_loss_mlp": 1.5078125, + "step": 1243, + "time_per_iteration": 2.5571463108062744 + }, + { + "auxiliary_loss_clip": 0.01196196, + "auxiliary_loss_mlp": 0.01052045, + "balance_loss_clip": 1.0280956, + "balance_loss_mlp": 1.05710852, + "epoch": 0.07479332631895386, + "flos": 24897442556160.0, + "grad_norm": 2.504235331520048, + "language_loss": 0.76465732, + "learning_rate": 3.978990283719296e-06, + "loss": 0.78713971, + "num_input_tokens_seen": 26466070, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.390625, + "step": 1244, + "time_per_iteration": 2.501621723175049 + }, + { + "auxiliary_loss_clip": 0.01197755, + "auxiliary_loss_mlp": 0.01058762, + "balance_loss_clip": 1.03462183, + "balance_loss_mlp": 1.05684423, + "epoch": 0.07485344957162182, + "flos": 17815247821440.0, + "grad_norm": 2.8968513367461495, + "language_loss": 0.69149882, + "learning_rate": 3.978933943232123e-06, + "loss": 0.714064, + "num_input_tokens_seen": 26479350, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.40625, + "step": 1245, + "time_per_iteration": 2.417921781539917 + }, + { + "auxiliary_loss_clip": 0.01196347, + "auxiliary_loss_mlp": 0.01052065, + "balance_loss_clip": 1.02768707, + "balance_loss_mlp": 1.05663347, + "epoch": 0.0749135728242898, + "flos": 25010202326400.0, + "grad_norm": 1.9272496045423029, + "language_loss": 0.88344061, + "learning_rate": 3.978877527703576e-06, + "loss": 0.90592474, + "num_input_tokens_seen": 26498255, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1246, + "time_per_iteration": 2.5631723403930664 + }, + { + "auxiliary_loss_clip": 0.01205457, + "auxiliary_loss_mlp": 0.01067222, + "balance_loss_clip": 1.04055524, + "balance_loss_mlp": 1.05656838, + "epoch": 0.07497369607695777, + "flos": 17822071405440.0, + "grad_norm": 2.4755370190447064, + "language_loss": 0.87921643, + "learning_rate": 3.9788210371357945e-06, + "loss": 0.90194321, + "num_input_tokens_seen": 26515375, + "router_z_loss_clip": 0.265625, + "router_z_loss_mlp": 1.4921875, + "step": 1247, + "time_per_iteration": 2.4602389335632324 + }, + { + "auxiliary_loss_clip": 0.01194073, + "auxiliary_loss_mlp": 0.01060013, + "balance_loss_clip": 1.03502667, + "balance_loss_mlp": 1.05565107, + "epoch": 0.07503381932962573, + "flos": 15121086261120.0, + "grad_norm": 2.2039165223770194, + "language_loss": 0.6477375, + "learning_rate": 3.978764471530921e-06, + "loss": 0.67027843, + "num_input_tokens_seen": 26533595, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3828125, + "step": 1248, + "time_per_iteration": 2.4408388137817383 + }, + { + "auxiliary_loss_clip": 0.01192958, + "auxiliary_loss_mlp": 0.0106246, + "balance_loss_clip": 1.04016805, + "balance_loss_mlp": 1.0575254, + "epoch": 0.0750939425822937, + "flos": 12816734071680.0, + "grad_norm": 2.0641418493429713, + "language_loss": 0.73964334, + "learning_rate": 3.978707830891102e-06, + "loss": 0.76219749, + "num_input_tokens_seen": 26549405, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1249, + "time_per_iteration": 2.443767547607422 + }, + { + "auxiliary_loss_clip": 0.01201286, + "auxiliary_loss_mlp": 0.01068388, + "balance_loss_clip": 1.0433774, + "balance_loss_mlp": 1.05842972, + "epoch": 0.07515406583496168, + "flos": 24206844695040.0, + "grad_norm": 2.607815988938315, + "language_loss": 0.81845009, + "learning_rate": 3.978651115218482e-06, + "loss": 0.84114683, + "num_input_tokens_seen": 26567200, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.4296875, + "step": 1250, + "time_per_iteration": 2.491236448287964 + }, + { + "auxiliary_loss_clip": 0.01197565, + "auxiliary_loss_mlp": 0.01061004, + "balance_loss_clip": 1.03703094, + "balance_loss_mlp": 1.05932856, + "epoch": 0.07521418908762964, + "flos": 26688164215680.0, + "grad_norm": 2.308634463940828, + "language_loss": 0.66713893, + "learning_rate": 3.978594324515215e-06, + "loss": 0.68972456, + "num_input_tokens_seen": 26586190, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1251, + "time_per_iteration": 2.5437874794006348 + }, + { + "auxiliary_loss_clip": 0.0107681, + "auxiliary_loss_mlp": 0.01002851, + "balance_loss_clip": 0.99946529, + "balance_loss_mlp": 1.02021933, + "epoch": 0.0752743123402976, + "flos": 59095140589440.0, + "grad_norm": 0.8978558428983584, + "language_loss": 0.70356798, + "learning_rate": 3.9785374587834515e-06, + "loss": 0.72436458, + "num_input_tokens_seen": 26650710, + "router_z_loss_clip": 0.03393555, + "router_z_loss_mlp": 0.56640625, + "step": 1252, + "time_per_iteration": 3.1170923709869385 + }, + { + "auxiliary_loss_clip": 0.01194007, + "auxiliary_loss_mlp": 0.01061281, + "balance_loss_clip": 1.03698599, + "balance_loss_mlp": 1.05419612, + "epoch": 0.07533443559296558, + "flos": 23477032160640.0, + "grad_norm": 2.9290655276351045, + "language_loss": 0.79516673, + "learning_rate": 3.97848051802535e-06, + "loss": 0.81771958, + "num_input_tokens_seen": 26669000, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3984375, + "step": 1253, + "time_per_iteration": 2.4821414947509766 + }, + { + "auxiliary_loss_clip": 0.01199953, + "auxiliary_loss_mlp": 0.01065033, + "balance_loss_clip": 1.04125071, + "balance_loss_mlp": 1.05829906, + "epoch": 0.07539455884563355, + "flos": 20879110114560.0, + "grad_norm": 2.5751371148477995, + "language_loss": 0.93441045, + "learning_rate": 3.978423502243069e-06, + "loss": 0.95706034, + "num_input_tokens_seen": 26683075, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.421875, + "step": 1254, + "time_per_iteration": 2.4245519638061523 + }, + { + "auxiliary_loss_clip": 0.01191058, + "auxiliary_loss_mlp": 0.01062028, + "balance_loss_clip": 1.03849554, + "balance_loss_mlp": 1.05566263, + "epoch": 0.07545468209830151, + "flos": 27672906551040.0, + "grad_norm": 1.866823394820361, + "language_loss": 0.88030314, + "learning_rate": 3.97836641143877e-06, + "loss": 0.902834, + "num_input_tokens_seen": 26701875, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1255, + "time_per_iteration": 2.5579185485839844 + }, + { + "auxiliary_loss_clip": 0.01192242, + "auxiliary_loss_mlp": 0.01064619, + "balance_loss_clip": 1.04009795, + "balance_loss_mlp": 1.05518413, + "epoch": 0.0755148053509695, + "flos": 14136990370560.0, + "grad_norm": 1.7574194703288544, + "language_loss": 0.79516619, + "learning_rate": 3.978309245614618e-06, + "loss": 0.81773484, + "num_input_tokens_seen": 26719050, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.3671875, + "step": 1256, + "time_per_iteration": 2.4203784465789795 + }, + { + "auxiliary_loss_clip": 0.01074137, + "auxiliary_loss_mlp": 0.01007102, + "balance_loss_clip": 1.00378788, + "balance_loss_mlp": 1.01769829, + "epoch": 0.07557492860363746, + "flos": 58235257929600.0, + "grad_norm": 0.8283025846018472, + "language_loss": 0.58016127, + "learning_rate": 3.9782520047727825e-06, + "loss": 0.60097361, + "num_input_tokens_seen": 26780650, + "router_z_loss_clip": 0.03320312, + "router_z_loss_mlp": 0.5625, + "step": 1257, + "time_per_iteration": 3.1732118129730225 + }, + { + "auxiliary_loss_clip": 0.0119581, + "auxiliary_loss_mlp": 0.01056297, + "balance_loss_clip": 1.03272927, + "balance_loss_mlp": 1.05982757, + "epoch": 0.07563505185630542, + "flos": 24644380262400.0, + "grad_norm": 3.1336739114125107, + "language_loss": 0.89859951, + "learning_rate": 3.978194688915432e-06, + "loss": 0.92112058, + "num_input_tokens_seen": 26798725, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.359375, + "step": 1258, + "time_per_iteration": 2.516925811767578 + }, + { + "auxiliary_loss_clip": 0.01192364, + "auxiliary_loss_mlp": 0.01054502, + "balance_loss_clip": 1.03181624, + "balance_loss_mlp": 1.05663717, + "epoch": 0.07569517510897339, + "flos": 15522998515200.0, + "grad_norm": 3.28312942247731, + "language_loss": 0.81211507, + "learning_rate": 3.978137298044741e-06, + "loss": 0.83458376, + "num_input_tokens_seen": 26817005, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.359375, + "step": 1259, + "time_per_iteration": 2.449533224105835 + }, + { + "auxiliary_loss_clip": 0.01193912, + "auxiliary_loss_mlp": 0.01058573, + "balance_loss_clip": 1.03593481, + "balance_loss_mlp": 1.05662787, + "epoch": 0.07575529836164137, + "flos": 22928532503040.0, + "grad_norm": 1.9172803769558988, + "language_loss": 0.75733984, + "learning_rate": 3.978079832162885e-06, + "loss": 0.77986467, + "num_input_tokens_seen": 26836655, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.375, + "step": 1260, + "time_per_iteration": 2.5003559589385986 + }, + { + "auxiliary_loss_clip": 0.01192246, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03550828, + "balance_loss_mlp": 1.0552032, + "epoch": 0.07581542161430933, + "flos": 19500428344320.0, + "grad_norm": 1.8260195606442358, + "language_loss": 0.84695768, + "learning_rate": 3.978022291272044e-06, + "loss": 0.86947775, + "num_input_tokens_seen": 26854925, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1261, + "time_per_iteration": 2.4633476734161377 + }, + { + "auxiliary_loss_clip": 0.01200376, + "auxiliary_loss_mlp": 0.01060967, + "balance_loss_clip": 1.03828108, + "balance_loss_mlp": 1.05969536, + "epoch": 0.0758755448669773, + "flos": 24973465691520.0, + "grad_norm": 2.3160282321136334, + "language_loss": 0.8266682, + "learning_rate": 3.977964675374399e-06, + "loss": 0.84928167, + "num_input_tokens_seen": 26876170, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.40625, + "step": 1262, + "time_per_iteration": 2.5256471633911133 + }, + { + "auxiliary_loss_clip": 0.01192085, + "auxiliary_loss_mlp": 0.01061195, + "balance_loss_clip": 1.03703153, + "balance_loss_mlp": 1.0540688, + "epoch": 0.07593566811964528, + "flos": 22747973811840.0, + "grad_norm": 2.4581964181262776, + "language_loss": 0.8255769, + "learning_rate": 3.977906984472136e-06, + "loss": 0.84810972, + "num_input_tokens_seen": 26895005, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3828125, + "step": 1263, + "time_per_iteration": 2.470656633377075 + }, + { + "auxiliary_loss_clip": 0.01195735, + "auxiliary_loss_mlp": 0.01056704, + "balance_loss_clip": 1.03381538, + "balance_loss_mlp": 1.05504882, + "epoch": 0.07599579137231324, + "flos": 23112395245440.0, + "grad_norm": 2.324943057092889, + "language_loss": 0.7591399, + "learning_rate": 3.977849218567442e-06, + "loss": 0.78166431, + "num_input_tokens_seen": 26913930, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.40625, + "step": 1264, + "time_per_iteration": 2.4715359210968018 + }, + { + "auxiliary_loss_clip": 0.0119596, + "auxiliary_loss_mlp": 0.01062168, + "balance_loss_clip": 1.03832579, + "balance_loss_mlp": 1.05711412, + "epoch": 0.07605591462498121, + "flos": 14502058248960.0, + "grad_norm": 2.1997185871944356, + "language_loss": 0.81106204, + "learning_rate": 3.977791377662507e-06, + "loss": 0.83364332, + "num_input_tokens_seen": 26931485, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.390625, + "step": 1265, + "time_per_iteration": 2.440000295639038 + }, + { + "auxiliary_loss_clip": 0.01195477, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03408241, + "balance_loss_mlp": 1.05631864, + "epoch": 0.07611603787764919, + "flos": 23514199758720.0, + "grad_norm": 2.141616369936441, + "language_loss": 0.64935738, + "learning_rate": 3.977733461759524e-06, + "loss": 0.67187923, + "num_input_tokens_seen": 26951670, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.390625, + "step": 1266, + "time_per_iteration": 2.495001792907715 + }, + { + "auxiliary_loss_clip": 0.01194799, + "auxiliary_loss_mlp": 0.01060988, + "balance_loss_clip": 1.03752804, + "balance_loss_mlp": 1.05550349, + "epoch": 0.07617616113031715, + "flos": 21507188353920.0, + "grad_norm": 2.2514277899416606, + "language_loss": 0.79527593, + "learning_rate": 3.977675470860691e-06, + "loss": 0.81783378, + "num_input_tokens_seen": 26970335, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.390625, + "step": 1267, + "time_per_iteration": 2.4763970375061035 + }, + { + "auxiliary_loss_clip": 0.01194511, + "auxiliary_loss_mlp": 0.01051729, + "balance_loss_clip": 1.02975869, + "balance_loss_mlp": 1.05526185, + "epoch": 0.07623628438298512, + "flos": 14573161221120.0, + "grad_norm": 2.2740159695832682, + "language_loss": 0.7253381, + "learning_rate": 3.977617404968205e-06, + "loss": 0.74780059, + "num_input_tokens_seen": 26986025, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.390625, + "step": 1268, + "time_per_iteration": 3.8910977840423584 + }, + { + "auxiliary_loss_clip": 0.01192554, + "auxiliary_loss_mlp": 0.01057239, + "balance_loss_clip": 1.03447044, + "balance_loss_mlp": 1.05342031, + "epoch": 0.07629640763565308, + "flos": 14720395069440.0, + "grad_norm": 2.163449384012833, + "language_loss": 0.81891817, + "learning_rate": 3.977559264084269e-06, + "loss": 0.84141612, + "num_input_tokens_seen": 27004045, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.390625, + "step": 1269, + "time_per_iteration": 3.8643741607666016 + }, + { + "auxiliary_loss_clip": 0.01192657, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03120267, + "balance_loss_mlp": 1.05559695, + "epoch": 0.07635653088832106, + "flos": 14902929008640.0, + "grad_norm": 3.2383492700687078, + "language_loss": 0.88135087, + "learning_rate": 3.977501048211088e-06, + "loss": 0.90382218, + "num_input_tokens_seen": 27022070, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1270, + "time_per_iteration": 2.4746575355529785 + }, + { + "auxiliary_loss_clip": 0.01198155, + "auxiliary_loss_mlp": 0.0105921, + "balance_loss_clip": 1.03559494, + "balance_loss_mlp": 1.05707884, + "epoch": 0.07641665414098903, + "flos": 26651571235200.0, + "grad_norm": 2.188682914143081, + "language_loss": 0.71113384, + "learning_rate": 3.977442757350869e-06, + "loss": 0.73370755, + "num_input_tokens_seen": 27041755, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.4140625, + "step": 1271, + "time_per_iteration": 2.529632091522217 + }, + { + "auxiliary_loss_clip": 0.01189637, + "auxiliary_loss_mlp": 0.01066344, + "balance_loss_clip": 1.04351556, + "balance_loss_mlp": 1.05675423, + "epoch": 0.07647677739365699, + "flos": 25192808092800.0, + "grad_norm": 1.9018984880968814, + "language_loss": 0.82745486, + "learning_rate": 3.977384391505823e-06, + "loss": 0.85001469, + "num_input_tokens_seen": 27061540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1272, + "time_per_iteration": 2.4950368404388428 + }, + { + "auxiliary_loss_clip": 0.01191491, + "auxiliary_loss_mlp": 0.01061838, + "balance_loss_clip": 1.03867579, + "balance_loss_mlp": 1.05351079, + "epoch": 0.07653690064632497, + "flos": 20558141159040.0, + "grad_norm": 2.0211474255264643, + "language_loss": 0.79951203, + "learning_rate": 3.977325950678162e-06, + "loss": 0.82204533, + "num_input_tokens_seen": 27081395, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3828125, + "step": 1273, + "time_per_iteration": 2.490281105041504 + }, + { + "auxiliary_loss_clip": 0.01194744, + "auxiliary_loss_mlp": 0.01062211, + "balance_loss_clip": 1.03858376, + "balance_loss_mlp": 1.05600715, + "epoch": 0.07659702389899294, + "flos": 22269320150400.0, + "grad_norm": 1.848359088284866, + "language_loss": 0.81545758, + "learning_rate": 3.977267434870103e-06, + "loss": 0.83802712, + "num_input_tokens_seen": 27101175, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1274, + "time_per_iteration": 2.499799966812134 + }, + { + "auxiliary_loss_clip": 0.01191932, + "auxiliary_loss_mlp": 0.01068548, + "balance_loss_clip": 1.04430115, + "balance_loss_mlp": 1.05469346, + "epoch": 0.0766571471516609, + "flos": 32636120209920.0, + "grad_norm": 1.991418246716423, + "language_loss": 0.73099387, + "learning_rate": 3.977208844083865e-06, + "loss": 0.75359869, + "num_input_tokens_seen": 27124505, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.375, + "step": 1275, + "time_per_iteration": 2.557973623275757 + }, + { + "auxiliary_loss_clip": 0.011939, + "auxiliary_loss_mlp": 0.01061514, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.05536842, + "epoch": 0.07671727040432888, + "flos": 15267386355840.0, + "grad_norm": 2.1093684912214545, + "language_loss": 0.79584897, + "learning_rate": 3.9771501783216685e-06, + "loss": 0.81840312, + "num_input_tokens_seen": 27140960, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.390625, + "step": 1276, + "time_per_iteration": 2.4329752922058105 + }, + { + "auxiliary_loss_clip": 0.01196395, + "auxiliary_loss_mlp": 0.01051332, + "balance_loss_clip": 1.02838457, + "balance_loss_mlp": 1.05656397, + "epoch": 0.07677739365699685, + "flos": 28184094956160.0, + "grad_norm": 2.623540269613024, + "language_loss": 0.59020305, + "learning_rate": 3.97709143758574e-06, + "loss": 0.61268032, + "num_input_tokens_seen": 27160985, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.3984375, + "step": 1277, + "time_per_iteration": 2.5318989753723145 + }, + { + "auxiliary_loss_clip": 0.01200985, + "auxiliary_loss_mlp": 0.01057464, + "balance_loss_clip": 1.03433728, + "balance_loss_mlp": 1.05805659, + "epoch": 0.07683751690966481, + "flos": 18296128126080.0, + "grad_norm": 2.2944749333347096, + "language_loss": 0.74846482, + "learning_rate": 3.977032621878305e-06, + "loss": 0.77104926, + "num_input_tokens_seen": 27178390, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.4296875, + "step": 1278, + "time_per_iteration": 2.448615789413452 + }, + { + "auxiliary_loss_clip": 0.01190146, + "auxiliary_loss_mlp": 0.01051712, + "balance_loss_clip": 1.02943182, + "balance_loss_mlp": 1.05475163, + "epoch": 0.07689764016233278, + "flos": 21981101420160.0, + "grad_norm": 4.0999470067777075, + "language_loss": 0.88656616, + "learning_rate": 3.976973731201596e-06, + "loss": 0.90898478, + "num_input_tokens_seen": 27197505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1279, + "time_per_iteration": 2.4883790016174316 + }, + { + "auxiliary_loss_clip": 0.01189256, + "auxiliary_loss_mlp": 0.01062556, + "balance_loss_clip": 1.03973901, + "balance_loss_mlp": 1.05507362, + "epoch": 0.07695776341500075, + "flos": 22235995307520.0, + "grad_norm": 3.4596954186847393, + "language_loss": 0.82899994, + "learning_rate": 3.976914765557845e-06, + "loss": 0.85151803, + "num_input_tokens_seen": 27214260, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1280, + "time_per_iteration": 2.459294319152832 + }, + { + "auxiliary_loss_clip": 0.01188755, + "auxiliary_loss_mlp": 0.01061811, + "balance_loss_clip": 1.03874409, + "balance_loss_mlp": 1.05492759, + "epoch": 0.07701788666766872, + "flos": 16143750380160.0, + "grad_norm": 1.9224222656998016, + "language_loss": 0.76059222, + "learning_rate": 3.9768557249492875e-06, + "loss": 0.78309786, + "num_input_tokens_seen": 27232525, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3359375, + "step": 1281, + "time_per_iteration": 2.453183650970459 + }, + { + "auxiliary_loss_clip": 0.0119548, + "auxiliary_loss_mlp": 0.01054802, + "balance_loss_clip": 1.03128171, + "balance_loss_mlp": 1.05448353, + "epoch": 0.07707800992033668, + "flos": 19463045264640.0, + "grad_norm": 1.8937081587754587, + "language_loss": 0.75307631, + "learning_rate": 3.9767966093781634e-06, + "loss": 0.77557921, + "num_input_tokens_seen": 27249800, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.40625, + "step": 1282, + "time_per_iteration": 2.4526116847991943 + }, + { + "auxiliary_loss_clip": 0.01190337, + "auxiliary_loss_mlp": 0.01070616, + "balance_loss_clip": 1.04734671, + "balance_loss_mlp": 1.054286, + "epoch": 0.07713813317300466, + "flos": 18990281433600.0, + "grad_norm": 2.0304459145795963, + "language_loss": 0.8428033, + "learning_rate": 3.976737418846713e-06, + "loss": 0.86541283, + "num_input_tokens_seen": 27268895, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1283, + "time_per_iteration": 2.468101739883423 + }, + { + "auxiliary_loss_clip": 0.01192768, + "auxiliary_loss_mlp": 0.01062747, + "balance_loss_clip": 1.0375464, + "balance_loss_mlp": 1.05560803, + "epoch": 0.07719825642567263, + "flos": 18113953322880.0, + "grad_norm": 2.622403612740989, + "language_loss": 0.75031364, + "learning_rate": 3.976678153357181e-06, + "loss": 0.77286887, + "num_input_tokens_seen": 27288180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1284, + "time_per_iteration": 2.451749801635742 + }, + { + "auxiliary_loss_clip": 0.01188745, + "auxiliary_loss_mlp": 0.0106155, + "balance_loss_clip": 1.03947222, + "balance_loss_mlp": 1.05330253, + "epoch": 0.0772583796783406, + "flos": 42194426993280.0, + "grad_norm": 1.6448065546510353, + "language_loss": 0.75934827, + "learning_rate": 3.976618812911817e-06, + "loss": 0.78185129, + "num_input_tokens_seen": 27311815, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1285, + "time_per_iteration": 2.664769411087036 + }, + { + "auxiliary_loss_clip": 0.01196484, + "auxiliary_loss_mlp": 0.01062869, + "balance_loss_clip": 1.0406251, + "balance_loss_mlp": 1.05862105, + "epoch": 0.07731850293100857, + "flos": 24753692327040.0, + "grad_norm": 1.8165785508620624, + "language_loss": 0.84204662, + "learning_rate": 3.9765593975128685e-06, + "loss": 0.86464012, + "num_input_tokens_seen": 27331890, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.375, + "step": 1286, + "time_per_iteration": 2.550670862197876 + }, + { + "auxiliary_loss_clip": 0.01196192, + "auxiliary_loss_mlp": 0.01055874, + "balance_loss_clip": 1.03271151, + "balance_loss_mlp": 1.05582845, + "epoch": 0.07737862618367654, + "flos": 17565884628480.0, + "grad_norm": 4.521300853065514, + "language_loss": 0.76725763, + "learning_rate": 3.97649990716259e-06, + "loss": 0.78977823, + "num_input_tokens_seen": 27348320, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.40625, + "step": 1287, + "time_per_iteration": 2.455627918243408 + }, + { + "auxiliary_loss_clip": 0.01190346, + "auxiliary_loss_mlp": 0.01058612, + "balance_loss_clip": 1.03636777, + "balance_loss_mlp": 1.05476642, + "epoch": 0.0774387494363445, + "flos": 25627147349760.0, + "grad_norm": 1.6785000972571258, + "language_loss": 0.84509134, + "learning_rate": 3.976440341863237e-06, + "loss": 0.86758095, + "num_input_tokens_seen": 27367670, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.359375, + "step": 1288, + "time_per_iteration": 2.500218629837036 + }, + { + "auxiliary_loss_clip": 0.01192387, + "auxiliary_loss_mlp": 0.01056799, + "balance_loss_clip": 1.0350318, + "balance_loss_mlp": 1.05364347, + "epoch": 0.07749887268901248, + "flos": 12239865648000.0, + "grad_norm": 2.192533837519805, + "language_loss": 0.85769016, + "learning_rate": 3.976380701617068e-06, + "loss": 0.88018203, + "num_input_tokens_seen": 27385485, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.390625, + "step": 1289, + "time_per_iteration": 2.4759440422058105 + }, + { + "auxiliary_loss_clip": 0.01189023, + "auxiliary_loss_mlp": 0.01047658, + "balance_loss_clip": 1.02563989, + "balance_loss_mlp": 1.05300641, + "epoch": 0.07755899594168045, + "flos": 25081736261760.0, + "grad_norm": 1.8877463184856607, + "language_loss": 0.85053366, + "learning_rate": 3.976320986426344e-06, + "loss": 0.87290049, + "num_input_tokens_seen": 27405110, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.359375, + "step": 1290, + "time_per_iteration": 2.5266401767730713 + }, + { + "auxiliary_loss_clip": 0.01185369, + "auxiliary_loss_mlp": 0.01059291, + "balance_loss_clip": 1.03541303, + "balance_loss_mlp": 1.05397463, + "epoch": 0.07761911919434841, + "flos": 14246410176000.0, + "grad_norm": 2.3980248629455834, + "language_loss": 0.90562832, + "learning_rate": 3.9762611962933315e-06, + "loss": 0.92807496, + "num_input_tokens_seen": 27422855, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3125, + "step": 1291, + "time_per_iteration": 2.4760262966156006 + }, + { + "auxiliary_loss_clip": 0.01071114, + "auxiliary_loss_mlp": 0.01008288, + "balance_loss_clip": 1.00456893, + "balance_loss_mlp": 1.01656318, + "epoch": 0.07767924244701638, + "flos": 67237202954880.0, + "grad_norm": 0.9429671936579762, + "language_loss": 0.64993972, + "learning_rate": 3.9762013312202955e-06, + "loss": 0.67073375, + "num_input_tokens_seen": 27487190, + "router_z_loss_clip": 0.03710938, + "router_z_loss_mlp": 0.546875, + "step": 1292, + "time_per_iteration": 3.1508371829986572 + }, + { + "auxiliary_loss_clip": 0.0118873, + "auxiliary_loss_mlp": 0.01059018, + "balance_loss_clip": 1.03716707, + "balance_loss_mlp": 1.05293965, + "epoch": 0.07773936569968436, + "flos": 28550635292160.0, + "grad_norm": 1.7960778456946043, + "language_loss": 0.87610948, + "learning_rate": 3.9761413912095075e-06, + "loss": 0.89858699, + "num_input_tokens_seen": 27510465, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1293, + "time_per_iteration": 2.6359729766845703 + }, + { + "auxiliary_loss_clip": 0.01193413, + "auxiliary_loss_mlp": 0.01062236, + "balance_loss_clip": 1.03789377, + "balance_loss_mlp": 1.05659533, + "epoch": 0.07779948895235232, + "flos": 27490264871040.0, + "grad_norm": 2.312065886688882, + "language_loss": 0.85111046, + "learning_rate": 3.976081376263239e-06, + "loss": 0.873667, + "num_input_tokens_seen": 27528645, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3671875, + "step": 1294, + "time_per_iteration": 2.5151314735412598 + }, + { + "auxiliary_loss_clip": 0.01193943, + "auxiliary_loss_mlp": 0.01054926, + "balance_loss_clip": 1.03188252, + "balance_loss_mlp": 1.05702615, + "epoch": 0.07785961220502029, + "flos": 18223301301120.0, + "grad_norm": 2.728225366024782, + "language_loss": 0.79202414, + "learning_rate": 3.976021286383768e-06, + "loss": 0.81451285, + "num_input_tokens_seen": 27546165, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.3671875, + "step": 1295, + "time_per_iteration": 2.459510326385498 + }, + { + "auxiliary_loss_clip": 0.01188808, + "auxiliary_loss_mlp": 0.01052849, + "balance_loss_clip": 1.02966261, + "balance_loss_mlp": 1.05383039, + "epoch": 0.07791973545768827, + "flos": 24608218245120.0, + "grad_norm": 2.8222308711400834, + "language_loss": 0.88216382, + "learning_rate": 3.975961121573371e-06, + "loss": 0.90458035, + "num_input_tokens_seen": 27566520, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1296, + "time_per_iteration": 2.492892026901245 + }, + { + "auxiliary_loss_clip": 0.01192946, + "auxiliary_loss_mlp": 0.01058016, + "balance_loss_clip": 1.03410244, + "balance_loss_mlp": 1.05591464, + "epoch": 0.07797985871035623, + "flos": 14282069402880.0, + "grad_norm": 3.2140473454082086, + "language_loss": 0.96160841, + "learning_rate": 3.9759008818343305e-06, + "loss": 0.98411804, + "num_input_tokens_seen": 27581960, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1297, + "time_per_iteration": 2.4668915271759033 + }, + { + "auxiliary_loss_clip": 0.01189875, + "auxiliary_loss_mlp": 0.01054366, + "balance_loss_clip": 1.032372, + "balance_loss_mlp": 1.05289149, + "epoch": 0.0780399819630242, + "flos": 26610453141120.0, + "grad_norm": 2.460261972702069, + "language_loss": 0.76087165, + "learning_rate": 3.97584056716893e-06, + "loss": 0.78331399, + "num_input_tokens_seen": 27601415, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3671875, + "step": 1298, + "time_per_iteration": 2.5059781074523926 + }, + { + "auxiliary_loss_clip": 0.01192131, + "auxiliary_loss_mlp": 0.01061793, + "balance_loss_clip": 1.04039502, + "balance_loss_mlp": 1.05696058, + "epoch": 0.07810010521569218, + "flos": 21834514016640.0, + "grad_norm": 1.8752674736144914, + "language_loss": 0.80755305, + "learning_rate": 3.9757801775794575e-06, + "loss": 0.83009231, + "num_input_tokens_seen": 27621490, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3515625, + "step": 1299, + "time_per_iteration": 2.5036020278930664 + }, + { + "auxiliary_loss_clip": 0.01183493, + "auxiliary_loss_mlp": 0.01056623, + "balance_loss_clip": 1.03402138, + "balance_loss_mlp": 1.05226159, + "epoch": 0.07816022846836014, + "flos": 25081233471360.0, + "grad_norm": 2.1903498852009813, + "language_loss": 0.86459941, + "learning_rate": 3.975719713068202e-06, + "loss": 0.88700056, + "num_input_tokens_seen": 27640600, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1300, + "time_per_iteration": 2.4866278171539307 + }, + { + "auxiliary_loss_clip": 0.0118988, + "auxiliary_loss_mlp": 0.01052064, + "balance_loss_clip": 1.0284245, + "balance_loss_mlp": 1.05393028, + "epoch": 0.0782203517210281, + "flos": 40917515431680.0, + "grad_norm": 1.909902293479526, + "language_loss": 0.71778899, + "learning_rate": 3.975659173637458e-06, + "loss": 0.74020839, + "num_input_tokens_seen": 27663070, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.359375, + "step": 1301, + "time_per_iteration": 2.6491336822509766 + }, + { + "auxiliary_loss_clip": 0.01196178, + "auxiliary_loss_mlp": 0.0106414, + "balance_loss_clip": 1.04106081, + "balance_loss_mlp": 1.0586772, + "epoch": 0.07828047497369607, + "flos": 41172014269440.0, + "grad_norm": 1.5624281437346959, + "language_loss": 0.70860815, + "learning_rate": 3.97559855928952e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 27686425, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1302, + "time_per_iteration": 2.635430335998535 + }, + { + "auxiliary_loss_clip": 0.01188946, + "auxiliary_loss_mlp": 0.01060723, + "balance_loss_clip": 1.03702378, + "balance_loss_mlp": 1.05438161, + "epoch": 0.07834059822636405, + "flos": 23508130360320.0, + "grad_norm": 2.152945758623263, + "language_loss": 0.8192755, + "learning_rate": 3.9755378700266864e-06, + "loss": 0.84177226, + "num_input_tokens_seen": 27704900, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.34375, + "step": 1303, + "time_per_iteration": 2.4861090183258057 + }, + { + "auxiliary_loss_clip": 0.01188578, + "auxiliary_loss_mlp": 0.01061933, + "balance_loss_clip": 1.03879452, + "balance_loss_mlp": 1.05351233, + "epoch": 0.07840072147903202, + "flos": 20193899293440.0, + "grad_norm": 1.8425530042965788, + "language_loss": 0.7497822, + "learning_rate": 3.9754771058512585e-06, + "loss": 0.77228731, + "num_input_tokens_seen": 27724890, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3515625, + "step": 1304, + "time_per_iteration": 2.464087963104248 + }, + { + "auxiliary_loss_clip": 0.01191658, + "auxiliary_loss_mlp": 0.0106237, + "balance_loss_clip": 1.03857565, + "balance_loss_mlp": 1.05645108, + "epoch": 0.07846084473169998, + "flos": 21360816432000.0, + "grad_norm": 1.696211405930565, + "language_loss": 0.76397038, + "learning_rate": 3.975416266765542e-06, + "loss": 0.78651059, + "num_input_tokens_seen": 27743115, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.359375, + "step": 1305, + "time_per_iteration": 2.486093521118164 + }, + { + "auxiliary_loss_clip": 0.01192283, + "auxiliary_loss_mlp": 0.01064575, + "balance_loss_clip": 1.04087615, + "balance_loss_mlp": 1.05527782, + "epoch": 0.07852096798436796, + "flos": 25410965345280.0, + "grad_norm": 2.2926357932273866, + "language_loss": 0.85035503, + "learning_rate": 3.975355352771841e-06, + "loss": 0.87292361, + "num_input_tokens_seen": 27763570, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1306, + "time_per_iteration": 2.496265172958374 + }, + { + "auxiliary_loss_clip": 0.0119039, + "auxiliary_loss_mlp": 0.0104404, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.05652416, + "epoch": 0.07858109123703592, + "flos": 24571481610240.0, + "grad_norm": 3.0575778567802976, + "language_loss": 0.90087706, + "learning_rate": 3.975294363872468e-06, + "loss": 0.92322135, + "num_input_tokens_seen": 27780030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.34375, + "step": 1307, + "time_per_iteration": 2.5122623443603516 + }, + { + "auxiliary_loss_clip": 0.01189263, + "auxiliary_loss_mlp": 0.01057091, + "balance_loss_clip": 1.03295124, + "balance_loss_mlp": 1.05417371, + "epoch": 0.07864121448970389, + "flos": 20698874645760.0, + "grad_norm": 1.8540925974151201, + "language_loss": 0.83408689, + "learning_rate": 3.975233300069735e-06, + "loss": 0.85655046, + "num_input_tokens_seen": 27796225, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3515625, + "step": 1308, + "time_per_iteration": 2.4686944484710693 + }, + { + "auxiliary_loss_clip": 0.01186004, + "auxiliary_loss_mlp": 0.01053628, + "balance_loss_clip": 1.03177738, + "balance_loss_mlp": 1.05289674, + "epoch": 0.07870133774237187, + "flos": 22966526113920.0, + "grad_norm": 1.6283340971904061, + "language_loss": 0.77841777, + "learning_rate": 3.975172161365958e-06, + "loss": 0.80081415, + "num_input_tokens_seen": 27815975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.328125, + "step": 1309, + "time_per_iteration": 5.444388151168823 + }, + { + "auxiliary_loss_clip": 0.0119416, + "auxiliary_loss_mlp": 0.01062294, + "balance_loss_clip": 1.0380106, + "balance_loss_mlp": 1.05386913, + "epoch": 0.07876146099503983, + "flos": 18842832103680.0, + "grad_norm": 1.9656388899868151, + "language_loss": 0.80146122, + "learning_rate": 3.975110947763453e-06, + "loss": 0.82402575, + "num_input_tokens_seen": 27832255, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.40625, + "step": 1310, + "time_per_iteration": 3.8553466796875 + }, + { + "auxiliary_loss_clip": 0.01185305, + "auxiliary_loss_mlp": 0.01052602, + "balance_loss_clip": 1.03067899, + "balance_loss_mlp": 1.05544043, + "epoch": 0.0788215842477078, + "flos": 23805794367360.0, + "grad_norm": 1.7115323272474947, + "language_loss": 0.73069102, + "learning_rate": 3.9750496592645435e-06, + "loss": 0.75307012, + "num_input_tokens_seen": 27852180, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1311, + "time_per_iteration": 2.5299458503723145 + }, + { + "auxiliary_loss_clip": 0.01190682, + "auxiliary_loss_mlp": 0.01072627, + "balance_loss_clip": 1.04861844, + "balance_loss_mlp": 1.05650353, + "epoch": 0.07888170750037576, + "flos": 21579907438080.0, + "grad_norm": 1.9161215374898264, + "language_loss": 0.85871482, + "learning_rate": 3.974988295871553e-06, + "loss": 0.88134789, + "num_input_tokens_seen": 27871435, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1312, + "time_per_iteration": 2.5490031242370605 + }, + { + "auxiliary_loss_clip": 0.01186476, + "auxiliary_loss_mlp": 0.01059916, + "balance_loss_clip": 1.03811264, + "balance_loss_mlp": 1.0555284, + "epoch": 0.07894183075304374, + "flos": 19864849777920.0, + "grad_norm": 1.7542323177910393, + "language_loss": 0.81968379, + "learning_rate": 3.9749268575868085e-06, + "loss": 0.84214771, + "num_input_tokens_seen": 27890625, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.3125, + "step": 1313, + "time_per_iteration": 2.507046699523926 + }, + { + "auxiliary_loss_clip": 0.0119261, + "auxiliary_loss_mlp": 0.0105996, + "balance_loss_clip": 1.03528404, + "balance_loss_mlp": 1.05271506, + "epoch": 0.07900195400571171, + "flos": 16143463071360.0, + "grad_norm": 3.109477065223649, + "language_loss": 0.73372161, + "learning_rate": 3.97486534441264e-06, + "loss": 0.7562474, + "num_input_tokens_seen": 27906530, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3984375, + "step": 1314, + "time_per_iteration": 2.4396395683288574 + }, + { + "auxiliary_loss_clip": 0.01185115, + "auxiliary_loss_mlp": 0.01058505, + "balance_loss_clip": 1.03678489, + "balance_loss_mlp": 1.05120206, + "epoch": 0.07906207725837967, + "flos": 23730417676800.0, + "grad_norm": 1.579996187361532, + "language_loss": 0.79460657, + "learning_rate": 3.974803756351379e-06, + "loss": 0.81704271, + "num_input_tokens_seen": 27926725, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.34375, + "step": 1315, + "time_per_iteration": 2.493365526199341 + }, + { + "auxiliary_loss_clip": 0.011877, + "auxiliary_loss_mlp": 0.01060931, + "balance_loss_clip": 1.03592062, + "balance_loss_mlp": 1.05232, + "epoch": 0.07912220051104765, + "flos": 24315905364480.0, + "grad_norm": 1.9411836832725016, + "language_loss": 0.73614991, + "learning_rate": 3.974742093405362e-06, + "loss": 0.75863618, + "num_input_tokens_seen": 27947875, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1316, + "time_per_iteration": 2.4696316719055176 + }, + { + "auxiliary_loss_clip": 0.01193023, + "auxiliary_loss_mlp": 0.01063129, + "balance_loss_clip": 1.03940618, + "balance_loss_mlp": 1.05415511, + "epoch": 0.07918232376371562, + "flos": 18880035615360.0, + "grad_norm": 2.862910173072837, + "language_loss": 0.65148681, + "learning_rate": 3.974680355576927e-06, + "loss": 0.67404836, + "num_input_tokens_seen": 27965040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.390625, + "step": 1317, + "time_per_iteration": 2.447847843170166 + }, + { + "auxiliary_loss_clip": 0.01197561, + "auxiliary_loss_mlp": 0.01063488, + "balance_loss_clip": 1.03899026, + "balance_loss_mlp": 1.05774999, + "epoch": 0.07924244701638358, + "flos": 27376284038400.0, + "grad_norm": 2.3478172138868967, + "language_loss": 0.7324174, + "learning_rate": 3.974618542868415e-06, + "loss": 0.75502789, + "num_input_tokens_seen": 27985330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3984375, + "step": 1318, + "time_per_iteration": 2.497406482696533 + }, + { + "auxiliary_loss_clip": 0.01188329, + "auxiliary_loss_mlp": 0.01057875, + "balance_loss_clip": 1.03557122, + "balance_loss_mlp": 1.05335736, + "epoch": 0.07930257026905156, + "flos": 25120340403840.0, + "grad_norm": 1.92969491679129, + "language_loss": 0.90610284, + "learning_rate": 3.97455665528217e-06, + "loss": 0.92856491, + "num_input_tokens_seen": 28007615, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3515625, + "step": 1319, + "time_per_iteration": 2.5007200241088867 + }, + { + "auxiliary_loss_clip": 0.01188786, + "auxiliary_loss_mlp": 0.01054126, + "balance_loss_clip": 1.03086793, + "balance_loss_mlp": 1.05155873, + "epoch": 0.07936269352171953, + "flos": 21834478103040.0, + "grad_norm": 2.95797867210378, + "language_loss": 0.79765761, + "learning_rate": 3.974494692820539e-06, + "loss": 0.82008684, + "num_input_tokens_seen": 28027765, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.375, + "step": 1320, + "time_per_iteration": 2.4683783054351807 + }, + { + "auxiliary_loss_clip": 0.01190918, + "auxiliary_loss_mlp": 0.01057044, + "balance_loss_clip": 1.03448987, + "balance_loss_mlp": 1.05700457, + "epoch": 0.07942281677438749, + "flos": 16939889377920.0, + "grad_norm": 2.6163787894008363, + "language_loss": 0.69574934, + "learning_rate": 3.974432655485872e-06, + "loss": 0.71822894, + "num_input_tokens_seen": 28044225, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.34375, + "step": 1321, + "time_per_iteration": 2.466911554336548 + }, + { + "auxiliary_loss_clip": 0.01184231, + "auxiliary_loss_mlp": 0.01055954, + "balance_loss_clip": 1.03313756, + "balance_loss_mlp": 1.05313718, + "epoch": 0.07948294002705546, + "flos": 18986941468800.0, + "grad_norm": 1.926313653502779, + "language_loss": 0.83559513, + "learning_rate": 3.9743705432805195e-06, + "loss": 0.857997, + "num_input_tokens_seen": 28062915, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1322, + "time_per_iteration": 2.465885639190674 + }, + { + "auxiliary_loss_clip": 0.01188233, + "auxiliary_loss_mlp": 0.01058763, + "balance_loss_clip": 1.03544521, + "balance_loss_mlp": 1.05104756, + "epoch": 0.07954306327972344, + "flos": 21653452535040.0, + "grad_norm": 1.8863777031262867, + "language_loss": 0.90437615, + "learning_rate": 3.974308356206838e-06, + "loss": 0.92684615, + "num_input_tokens_seen": 28082175, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.375, + "step": 1323, + "time_per_iteration": 2.465841293334961 + }, + { + "auxiliary_loss_clip": 0.0118735, + "auxiliary_loss_mlp": 0.01057162, + "balance_loss_clip": 1.03438115, + "balance_loss_mlp": 1.05414796, + "epoch": 0.0796031865323914, + "flos": 23220270766080.0, + "grad_norm": 1.6454981938510795, + "language_loss": 0.82583225, + "learning_rate": 3.974246094267187e-06, + "loss": 0.84827733, + "num_input_tokens_seen": 28102645, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.328125, + "step": 1324, + "time_per_iteration": 2.475486993789673 + }, + { + "auxiliary_loss_clip": 0.01188398, + "auxiliary_loss_mlp": 0.01049438, + "balance_loss_clip": 1.0255841, + "balance_loss_mlp": 1.05264676, + "epoch": 0.07966330978505937, + "flos": 23294534135040.0, + "grad_norm": 2.416918252865386, + "language_loss": 0.79654729, + "learning_rate": 3.974183757463925e-06, + "loss": 0.81892562, + "num_input_tokens_seen": 28122805, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.359375, + "step": 1325, + "time_per_iteration": 2.482555389404297 + }, + { + "auxiliary_loss_clip": 0.01190127, + "auxiliary_loss_mlp": 0.01064919, + "balance_loss_clip": 1.03989661, + "balance_loss_mlp": 1.05474687, + "epoch": 0.07972343303772735, + "flos": 18363783392640.0, + "grad_norm": 2.170521767048619, + "language_loss": 0.8812806, + "learning_rate": 3.974121345799418e-06, + "loss": 0.90383106, + "num_input_tokens_seen": 28140530, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3515625, + "step": 1326, + "time_per_iteration": 2.466742753982544 + }, + { + "auxiliary_loss_clip": 0.01182901, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.02823424, + "balance_loss_mlp": 1.05014396, + "epoch": 0.07978355629039531, + "flos": 21762513204480.0, + "grad_norm": 2.3992518634606164, + "language_loss": 0.83013594, + "learning_rate": 3.974058859276032e-06, + "loss": 0.8524875, + "num_input_tokens_seen": 28159640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.328125, + "step": 1327, + "time_per_iteration": 2.4989237785339355 + }, + { + "auxiliary_loss_clip": 0.0119143, + "auxiliary_loss_mlp": 0.0105424, + "balance_loss_clip": 1.03013575, + "balance_loss_mlp": 1.05436027, + "epoch": 0.07984367954306328, + "flos": 18551309322240.0, + "grad_norm": 2.1664091533416587, + "language_loss": 0.78452092, + "learning_rate": 3.9739962978961354e-06, + "loss": 0.80697763, + "num_input_tokens_seen": 28177050, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.375, + "step": 1328, + "time_per_iteration": 2.4402852058410645 + }, + { + "auxiliary_loss_clip": 0.01191637, + "auxiliary_loss_mlp": 0.01053331, + "balance_loss_clip": 1.02969217, + "balance_loss_mlp": 1.05460131, + "epoch": 0.07990380279573125, + "flos": 16904050583040.0, + "grad_norm": 2.484533735051083, + "language_loss": 0.74277186, + "learning_rate": 3.973933661662101e-06, + "loss": 0.76522154, + "num_input_tokens_seen": 28193245, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.375, + "step": 1329, + "time_per_iteration": 2.425388813018799 + }, + { + "auxiliary_loss_clip": 0.01185759, + "auxiliary_loss_mlp": 0.01060058, + "balance_loss_clip": 1.03731298, + "balance_loss_mlp": 1.05096054, + "epoch": 0.07996392604839922, + "flos": 24098358643200.0, + "grad_norm": 1.5753219993175995, + "language_loss": 0.81090498, + "learning_rate": 3.973870950576305e-06, + "loss": 0.83336312, + "num_input_tokens_seen": 28213570, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3515625, + "step": 1330, + "time_per_iteration": 2.4831247329711914 + }, + { + "auxiliary_loss_clip": 0.01190834, + "auxiliary_loss_mlp": 0.01062422, + "balance_loss_clip": 1.03924823, + "balance_loss_mlp": 1.05348384, + "epoch": 0.08002404930106718, + "flos": 14278729438080.0, + "grad_norm": 2.322034822225311, + "language_loss": 0.88790143, + "learning_rate": 3.9738081646411255e-06, + "loss": 0.91043401, + "num_input_tokens_seen": 28229980, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.375, + "step": 1331, + "time_per_iteration": 2.4410722255706787 + }, + { + "auxiliary_loss_clip": 0.01193336, + "auxiliary_loss_mlp": 0.01058252, + "balance_loss_clip": 1.03414834, + "balance_loss_mlp": 1.05288279, + "epoch": 0.08008417255373516, + "flos": 40406219285760.0, + "grad_norm": 2.577873328737783, + "language_loss": 0.73332524, + "learning_rate": 3.973745303858942e-06, + "loss": 0.75584114, + "num_input_tokens_seen": 28253840, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.40625, + "step": 1332, + "time_per_iteration": 2.6054465770721436 + }, + { + "auxiliary_loss_clip": 0.01186558, + "auxiliary_loss_mlp": 0.01050644, + "balance_loss_clip": 1.02820885, + "balance_loss_mlp": 1.05179858, + "epoch": 0.08014429580640313, + "flos": 18478913460480.0, + "grad_norm": 1.9568005204239032, + "language_loss": 0.82994795, + "learning_rate": 3.973682368232138e-06, + "loss": 0.85232008, + "num_input_tokens_seen": 28271675, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1333, + "time_per_iteration": 2.453457832336426 + }, + { + "auxiliary_loss_clip": 0.01187412, + "auxiliary_loss_mlp": 0.01055323, + "balance_loss_clip": 1.03272128, + "balance_loss_mlp": 1.05115032, + "epoch": 0.0802044190590711, + "flos": 22053461368320.0, + "grad_norm": 2.7771179443818466, + "language_loss": 0.74698973, + "learning_rate": 3.9736193577631015e-06, + "loss": 0.76941711, + "num_input_tokens_seen": 28291850, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.359375, + "step": 1334, + "time_per_iteration": 2.5768256187438965 + }, + { + "auxiliary_loss_clip": 0.01187182, + "auxiliary_loss_mlp": 0.01060862, + "balance_loss_clip": 1.03831935, + "balance_loss_mlp": 1.05457497, + "epoch": 0.08026454231173906, + "flos": 24572128055040.0, + "grad_norm": 2.0216765528325635, + "language_loss": 0.80279201, + "learning_rate": 3.973556272454221e-06, + "loss": 0.82527244, + "num_input_tokens_seen": 28310780, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1335, + "time_per_iteration": 2.538670301437378 + }, + { + "auxiliary_loss_clip": 0.01078994, + "auxiliary_loss_mlp": 0.01011272, + "balance_loss_clip": 1.00802934, + "balance_loss_mlp": 1.02308655, + "epoch": 0.08032466556440704, + "flos": 52581841459200.0, + "grad_norm": 0.7427722697577622, + "language_loss": 0.56020629, + "learning_rate": 3.973493112307889e-06, + "loss": 0.58110893, + "num_input_tokens_seen": 28369985, + "router_z_loss_clip": 0.0324707, + "router_z_loss_mlp": 0.5625, + "step": 1336, + "time_per_iteration": 3.125026226043701 + }, + { + "auxiliary_loss_clip": 0.01188939, + "auxiliary_loss_mlp": 0.01054834, + "balance_loss_clip": 1.0331738, + "balance_loss_mlp": 1.05371606, + "epoch": 0.080384788817075, + "flos": 23842602829440.0, + "grad_norm": 2.050916847484745, + "language_loss": 0.67764497, + "learning_rate": 3.9734298773265005e-06, + "loss": 0.70008272, + "num_input_tokens_seen": 28388670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3515625, + "step": 1337, + "time_per_iteration": 2.506103038787842 + }, + { + "auxiliary_loss_clip": 0.01188826, + "auxiliary_loss_mlp": 0.01065102, + "balance_loss_clip": 1.04313135, + "balance_loss_mlp": 1.05480385, + "epoch": 0.08044491206974297, + "flos": 25300719527040.0, + "grad_norm": 1.8692893317328456, + "language_loss": 0.86701488, + "learning_rate": 3.973366567512453e-06, + "loss": 0.88955414, + "num_input_tokens_seen": 28411845, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1338, + "time_per_iteration": 2.5451908111572266 + }, + { + "auxiliary_loss_clip": 0.01188004, + "auxiliary_loss_mlp": 0.01060185, + "balance_loss_clip": 1.0368793, + "balance_loss_mlp": 1.05142283, + "epoch": 0.08050503532241095, + "flos": 22376549226240.0, + "grad_norm": 2.6265473040924725, + "language_loss": 0.87246621, + "learning_rate": 3.973303182868147e-06, + "loss": 0.89494807, + "num_input_tokens_seen": 28427875, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.359375, + "step": 1339, + "time_per_iteration": 2.450932502746582 + }, + { + "auxiliary_loss_clip": 0.01181336, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02660179, + "balance_loss_mlp": 1.05106449, + "epoch": 0.08056515857507891, + "flos": 18369421827840.0, + "grad_norm": 2.428441908593999, + "language_loss": 0.88819683, + "learning_rate": 3.973239723395988e-06, + "loss": 0.91048771, + "num_input_tokens_seen": 28446615, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1340, + "time_per_iteration": 2.4539895057678223 + }, + { + "auxiliary_loss_clip": 0.01072684, + "auxiliary_loss_mlp": 0.01003041, + "balance_loss_clip": 0.99951285, + "balance_loss_mlp": 1.01727247, + "epoch": 0.08062528182774688, + "flos": 51348130980480.0, + "grad_norm": 0.8886760882983712, + "language_loss": 0.64806795, + "learning_rate": 3.97317618909838e-06, + "loss": 0.66882515, + "num_input_tokens_seen": 28505290, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.5546875, + "step": 1341, + "time_per_iteration": 3.0034360885620117 + }, + { + "auxiliary_loss_clip": 0.01193907, + "auxiliary_loss_mlp": 0.01060938, + "balance_loss_clip": 1.03577328, + "balance_loss_mlp": 1.05301166, + "epoch": 0.08068540508041486, + "flos": 17599712261760.0, + "grad_norm": 2.817345215565239, + "language_loss": 0.89616883, + "learning_rate": 3.973112579977733e-06, + "loss": 0.91871732, + "num_input_tokens_seen": 28522735, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.40625, + "step": 1342, + "time_per_iteration": 2.479701042175293 + }, + { + "auxiliary_loss_clip": 0.01194936, + "auxiliary_loss_mlp": 0.0105815, + "balance_loss_clip": 1.03334308, + "balance_loss_mlp": 1.05721259, + "epoch": 0.08074552833308282, + "flos": 10561185486720.0, + "grad_norm": 2.7453135307928216, + "language_loss": 0.76378155, + "learning_rate": 3.973048896036459e-06, + "loss": 0.78631246, + "num_input_tokens_seen": 28539460, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.375, + "step": 1343, + "time_per_iteration": 2.4969120025634766 + }, + { + "auxiliary_loss_clip": 0.01072196, + "auxiliary_loss_mlp": 0.01008036, + "balance_loss_clip": 1.00446022, + "balance_loss_mlp": 1.01791215, + "epoch": 0.08080565158575079, + "flos": 60840254954880.0, + "grad_norm": 0.8963318804352591, + "language_loss": 0.57395822, + "learning_rate": 3.972985137276974e-06, + "loss": 0.59476054, + "num_input_tokens_seen": 28599855, + "router_z_loss_clip": 0.03564453, + "router_z_loss_mlp": 0.54296875, + "step": 1344, + "time_per_iteration": 2.9917871952056885 + }, + { + "auxiliary_loss_clip": 0.01190985, + "auxiliary_loss_mlp": 0.0105771, + "balance_loss_clip": 1.03452373, + "balance_loss_mlp": 1.05523396, + "epoch": 0.08086577483841875, + "flos": 18332361970560.0, + "grad_norm": 2.677643541218582, + "language_loss": 0.86665964, + "learning_rate": 3.972921303701695e-06, + "loss": 0.88914657, + "num_input_tokens_seen": 28617585, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.359375, + "step": 1345, + "time_per_iteration": 2.4601447582244873 + }, + { + "auxiliary_loss_clip": 0.01187459, + "auxiliary_loss_mlp": 0.01054913, + "balance_loss_clip": 1.03289497, + "balance_loss_mlp": 1.05403256, + "epoch": 0.08092589809108673, + "flos": 21543601766400.0, + "grad_norm": 1.7098835991166323, + "language_loss": 0.87242532, + "learning_rate": 3.972857395313042e-06, + "loss": 0.894849, + "num_input_tokens_seen": 28636355, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3359375, + "step": 1346, + "time_per_iteration": 2.4809892177581787 + }, + { + "auxiliary_loss_clip": 0.01185898, + "auxiliary_loss_mlp": 0.01054973, + "balance_loss_clip": 1.03256202, + "balance_loss_mlp": 1.05219567, + "epoch": 0.0809860213437547, + "flos": 22128012046080.0, + "grad_norm": 1.6659805361601863, + "language_loss": 0.92606491, + "learning_rate": 3.972793412113439e-06, + "loss": 0.94847363, + "num_input_tokens_seen": 28656260, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3359375, + "step": 1347, + "time_per_iteration": 2.4802379608154297 + }, + { + "auxiliary_loss_clip": 0.0118757, + "auxiliary_loss_mlp": 0.01057822, + "balance_loss_clip": 1.03318167, + "balance_loss_mlp": 1.05471659, + "epoch": 0.08104614459642266, + "flos": 21725489260800.0, + "grad_norm": 9.453605004454174, + "language_loss": 0.89181751, + "learning_rate": 3.972729354105312e-06, + "loss": 0.91427147, + "num_input_tokens_seen": 28675865, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.328125, + "step": 1348, + "time_per_iteration": 2.4610300064086914 + }, + { + "auxiliary_loss_clip": 0.01185296, + "auxiliary_loss_mlp": 0.01056008, + "balance_loss_clip": 1.03420484, + "balance_loss_mlp": 1.05543983, + "epoch": 0.08110626784909064, + "flos": 23951878980480.0, + "grad_norm": 2.4916215003739355, + "language_loss": 0.76796132, + "learning_rate": 3.97266522129109e-06, + "loss": 0.7903744, + "num_input_tokens_seen": 28696255, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.296875, + "step": 1349, + "time_per_iteration": 2.4789178371429443 + }, + { + "auxiliary_loss_clip": 0.01187103, + "auxiliary_loss_mlp": 0.0105974, + "balance_loss_clip": 1.03669679, + "balance_loss_mlp": 1.05236626, + "epoch": 0.0811663911017586, + "flos": 19025689265280.0, + "grad_norm": 2.126949034470324, + "language_loss": 0.88571703, + "learning_rate": 3.972601013673205e-06, + "loss": 0.90818548, + "num_input_tokens_seen": 28713905, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.34375, + "step": 1350, + "time_per_iteration": 2.43094539642334 + }, + { + "auxiliary_loss_clip": 0.01184059, + "auxiliary_loss_mlp": 0.01061052, + "balance_loss_clip": 1.03773451, + "balance_loss_mlp": 1.05228257, + "epoch": 0.08122651435442657, + "flos": 15341290588800.0, + "grad_norm": 2.044220866897066, + "language_loss": 0.82058489, + "learning_rate": 3.972536731254092e-06, + "loss": 0.843036, + "num_input_tokens_seen": 28732075, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1351, + "time_per_iteration": 6.688653469085693 + }, + { + "auxiliary_loss_clip": 0.01184193, + "auxiliary_loss_mlp": 0.01053712, + "balance_loss_clip": 1.02917862, + "balance_loss_mlp": 1.04863417, + "epoch": 0.08128663760709455, + "flos": 23221563655680.0, + "grad_norm": 1.9894600711485977, + "language_loss": 0.75347674, + "learning_rate": 3.972472374036189e-06, + "loss": 0.77585584, + "num_input_tokens_seen": 28751150, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.359375, + "step": 1352, + "time_per_iteration": 2.4888412952423096 + }, + { + "auxiliary_loss_clip": 0.01192461, + "auxiliary_loss_mlp": 0.01055559, + "balance_loss_clip": 1.03163338, + "balance_loss_mlp": 1.05483341, + "epoch": 0.08134676085976252, + "flos": 22965628273920.0, + "grad_norm": 1.7603053493114211, + "language_loss": 0.82833469, + "learning_rate": 3.972407942021935e-06, + "loss": 0.85081488, + "num_input_tokens_seen": 28773360, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.375, + "step": 1353, + "time_per_iteration": 2.522960901260376 + }, + { + "auxiliary_loss_clip": 0.01073388, + "auxiliary_loss_mlp": 0.01010471, + "balance_loss_clip": 1.00694275, + "balance_loss_mlp": 1.01996851, + "epoch": 0.08140688411243048, + "flos": 64322115816960.0, + "grad_norm": 0.8931676068679675, + "language_loss": 0.5970993, + "learning_rate": 3.972343435213775e-06, + "loss": 0.61793786, + "num_input_tokens_seen": 28833390, + "router_z_loss_clip": 0.03540039, + "router_z_loss_mlp": 0.53125, + "step": 1354, + "time_per_iteration": 3.0639474391937256 + }, + { + "auxiliary_loss_clip": 0.0118665, + "auxiliary_loss_mlp": 0.01060844, + "balance_loss_clip": 1.03764629, + "balance_loss_mlp": 1.05431724, + "epoch": 0.08146700736509845, + "flos": 22491858862080.0, + "grad_norm": 1.7981329827127455, + "language_loss": 0.82785606, + "learning_rate": 3.972278853614154e-06, + "loss": 0.85033101, + "num_input_tokens_seen": 28852430, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1355, + "time_per_iteration": 2.4664132595062256 + }, + { + "auxiliary_loss_clip": 0.01186535, + "auxiliary_loss_mlp": 0.01062061, + "balance_loss_clip": 1.03619206, + "balance_loss_mlp": 1.05146575, + "epoch": 0.08152713061776642, + "flos": 20447823513600.0, + "grad_norm": 1.9123465925299232, + "language_loss": 0.70799643, + "learning_rate": 3.972214197225521e-06, + "loss": 0.73048234, + "num_input_tokens_seen": 28870685, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3515625, + "step": 1356, + "time_per_iteration": 2.509061813354492 + }, + { + "auxiliary_loss_clip": 0.01188256, + "auxiliary_loss_mlp": 0.01055944, + "balance_loss_clip": 1.03169644, + "balance_loss_mlp": 1.05148005, + "epoch": 0.08158725387043439, + "flos": 23550218121600.0, + "grad_norm": 2.53580294551395, + "language_loss": 0.70255458, + "learning_rate": 3.972149466050329e-06, + "loss": 0.72499657, + "num_input_tokens_seen": 28889860, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.3671875, + "step": 1357, + "time_per_iteration": 2.476951837539673 + }, + { + "auxiliary_loss_clip": 0.01191615, + "auxiliary_loss_mlp": 0.01053374, + "balance_loss_clip": 1.03067684, + "balance_loss_mlp": 1.05488217, + "epoch": 0.08164737712310235, + "flos": 22017335264640.0, + "grad_norm": 2.6163823683714953, + "language_loss": 0.84186697, + "learning_rate": 3.97208466009103e-06, + "loss": 0.86431682, + "num_input_tokens_seen": 28905865, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3671875, + "step": 1358, + "time_per_iteration": 2.457376480102539 + }, + { + "auxiliary_loss_clip": 0.01190093, + "auxiliary_loss_mlp": 0.01056216, + "balance_loss_clip": 1.0310626, + "balance_loss_mlp": 1.05484545, + "epoch": 0.08170750037577033, + "flos": 23367827836800.0, + "grad_norm": 1.9894839389786314, + "language_loss": 1.02294087, + "learning_rate": 3.972019779350084e-06, + "loss": 1.04540396, + "num_input_tokens_seen": 28925250, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.3515625, + "step": 1359, + "time_per_iteration": 2.4723212718963623 + }, + { + "auxiliary_loss_clip": 0.01185855, + "auxiliary_loss_mlp": 0.01057366, + "balance_loss_clip": 1.03344035, + "balance_loss_mlp": 1.0511415, + "epoch": 0.0817676236284383, + "flos": 28397978490240.0, + "grad_norm": 2.0666688933075963, + "language_loss": 0.82969773, + "learning_rate": 3.971954823829951e-06, + "loss": 0.85212988, + "num_input_tokens_seen": 28943445, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1360, + "time_per_iteration": 2.5143508911132812 + }, + { + "auxiliary_loss_clip": 0.01190184, + "auxiliary_loss_mlp": 0.01062181, + "balance_loss_clip": 1.03820777, + "balance_loss_mlp": 1.05335808, + "epoch": 0.08182774688110626, + "flos": 19208905562880.0, + "grad_norm": 2.14797754608813, + "language_loss": 0.72352278, + "learning_rate": 3.971889793533093e-06, + "loss": 0.74604642, + "num_input_tokens_seen": 28962695, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3671875, + "step": 1361, + "time_per_iteration": 2.458034038543701 + }, + { + "auxiliary_loss_clip": 0.01179057, + "auxiliary_loss_mlp": 0.01057287, + "balance_loss_clip": 1.03249121, + "balance_loss_mlp": 1.04741335, + "epoch": 0.08188787013377424, + "flos": 22784099915520.0, + "grad_norm": 5.8589819193374515, + "language_loss": 0.76781029, + "learning_rate": 3.971824688461976e-06, + "loss": 0.79017377, + "num_input_tokens_seen": 28982120, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.3125, + "step": 1362, + "time_per_iteration": 2.472259759902954 + }, + { + "auxiliary_loss_clip": 0.01187551, + "auxiliary_loss_mlp": 0.01052018, + "balance_loss_clip": 1.0291419, + "balance_loss_mlp": 1.05449164, + "epoch": 0.08194799338644221, + "flos": 16468095214080.0, + "grad_norm": 2.631594675791475, + "language_loss": 0.72409523, + "learning_rate": 3.971759508619069e-06, + "loss": 0.74649096, + "num_input_tokens_seen": 28998100, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1363, + "time_per_iteration": 2.4447264671325684 + }, + { + "auxiliary_loss_clip": 0.01189235, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.03603828, + "balance_loss_mlp": 1.05607057, + "epoch": 0.08200811663911017, + "flos": 23913633974400.0, + "grad_norm": 3.9166951523525464, + "language_loss": 0.77459586, + "learning_rate": 3.971694254006844e-06, + "loss": 0.79710352, + "num_input_tokens_seen": 29017095, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.328125, + "step": 1364, + "time_per_iteration": 2.475141763687134 + }, + { + "auxiliary_loss_clip": 0.01190144, + "auxiliary_loss_mlp": 0.01061328, + "balance_loss_clip": 1.03745019, + "balance_loss_mlp": 1.05500793, + "epoch": 0.08206823989177814, + "flos": 17896550256000.0, + "grad_norm": 2.6241179536013033, + "language_loss": 0.82025397, + "learning_rate": 3.971628924627776e-06, + "loss": 0.84276867, + "num_input_tokens_seen": 29037240, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1365, + "time_per_iteration": 2.493732452392578 + }, + { + "auxiliary_loss_clip": 0.0118713, + "auxiliary_loss_mlp": 0.0105741, + "balance_loss_clip": 1.03406882, + "balance_loss_mlp": 1.05614781, + "epoch": 0.08212836314444612, + "flos": 22088186841600.0, + "grad_norm": 3.3261283913074884, + "language_loss": 0.82173789, + "learning_rate": 3.97156352048434e-06, + "loss": 0.84418333, + "num_input_tokens_seen": 29056250, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3125, + "step": 1366, + "time_per_iteration": 2.4809322357177734 + }, + { + "auxiliary_loss_clip": 0.01186928, + "auxiliary_loss_mlp": 0.0105891, + "balance_loss_clip": 1.03703475, + "balance_loss_mlp": 1.05126381, + "epoch": 0.08218848639711408, + "flos": 17597485618560.0, + "grad_norm": 2.8403828718649033, + "language_loss": 0.81534755, + "learning_rate": 3.97149804157902e-06, + "loss": 0.83780599, + "num_input_tokens_seen": 29073380, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.359375, + "step": 1367, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01192402, + "auxiliary_loss_mlp": 0.01060775, + "balance_loss_clip": 1.03724277, + "balance_loss_mlp": 1.05413651, + "epoch": 0.08224860964978205, + "flos": 17857838373120.0, + "grad_norm": 2.3540874203263358, + "language_loss": 0.83644414, + "learning_rate": 3.9714324879142946e-06, + "loss": 0.85897589, + "num_input_tokens_seen": 29091330, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3828125, + "step": 1368, + "time_per_iteration": 2.453547716140747 + }, + { + "auxiliary_loss_clip": 0.01181645, + "auxiliary_loss_mlp": 0.01049123, + "balance_loss_clip": 1.02694988, + "balance_loss_mlp": 1.05349994, + "epoch": 0.08230873290245003, + "flos": 25227533566080.0, + "grad_norm": 1.7360129433802456, + "language_loss": 0.81245828, + "learning_rate": 3.971366859492653e-06, + "loss": 0.83476603, + "num_input_tokens_seen": 29110375, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.28125, + "step": 1369, + "time_per_iteration": 2.527573585510254 + }, + { + "auxiliary_loss_clip": 0.01185735, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.02979898, + "balance_loss_mlp": 1.05528903, + "epoch": 0.08236885615511799, + "flos": 31759935753600.0, + "grad_norm": 2.240857135161324, + "language_loss": 0.74790901, + "learning_rate": 3.971301156316582e-06, + "loss": 0.77027786, + "num_input_tokens_seen": 29129395, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3046875, + "step": 1370, + "time_per_iteration": 2.5205185413360596 + }, + { + "auxiliary_loss_clip": 0.01189372, + "auxiliary_loss_mlp": 0.01061396, + "balance_loss_clip": 1.03697038, + "balance_loss_mlp": 1.05480862, + "epoch": 0.08242897940778596, + "flos": 23185832601600.0, + "grad_norm": 1.6313231263601415, + "language_loss": 0.74633086, + "learning_rate": 3.971235378388573e-06, + "loss": 0.76883852, + "num_input_tokens_seen": 29148650, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1371, + "time_per_iteration": 2.4995803833007812 + }, + { + "auxiliary_loss_clip": 0.01188254, + "auxiliary_loss_mlp": 0.01061601, + "balance_loss_clip": 1.03769946, + "balance_loss_mlp": 1.05410123, + "epoch": 0.08248910266045394, + "flos": 34491480393600.0, + "grad_norm": 2.0830704741847423, + "language_loss": 0.71080554, + "learning_rate": 3.971169525711122e-06, + "loss": 0.73330408, + "num_input_tokens_seen": 29170785, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.34375, + "step": 1372, + "time_per_iteration": 2.574457883834839 + }, + { + "auxiliary_loss_clip": 0.0118845, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.02750254, + "balance_loss_mlp": 1.05397415, + "epoch": 0.0825492259131219, + "flos": 13436228960640.0, + "grad_norm": 3.137320584176607, + "language_loss": 0.88010907, + "learning_rate": 3.9711035982867246e-06, + "loss": 0.90251154, + "num_input_tokens_seen": 29185210, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.34375, + "step": 1373, + "time_per_iteration": 2.485727310180664 + }, + { + "auxiliary_loss_clip": 0.01186594, + "auxiliary_loss_mlp": 0.01058909, + "balance_loss_clip": 1.03575897, + "balance_loss_mlp": 1.05331743, + "epoch": 0.08260934916578987, + "flos": 25812446636160.0, + "grad_norm": 1.7727067520163604, + "language_loss": 0.82349706, + "learning_rate": 3.971037596117882e-06, + "loss": 0.84595209, + "num_input_tokens_seen": 29205210, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.328125, + "step": 1374, + "time_per_iteration": 2.5223724842071533 + }, + { + "auxiliary_loss_clip": 0.01076427, + "auxiliary_loss_mlp": 0.01007461, + "balance_loss_clip": 1.0032891, + "balance_loss_mlp": 1.02371156, + "epoch": 0.08266947241845783, + "flos": 63460009491840.0, + "grad_norm": 0.8248734910296001, + "language_loss": 0.60630989, + "learning_rate": 3.970971519207095e-06, + "loss": 0.62714875, + "num_input_tokens_seen": 29265350, + "router_z_loss_clip": 0.04174805, + "router_z_loss_mlp": 0.5234375, + "step": 1375, + "time_per_iteration": 3.0909183025360107 + }, + { + "auxiliary_loss_clip": 0.01074233, + "auxiliary_loss_mlp": 0.01006319, + "balance_loss_clip": 1.00221813, + "balance_loss_mlp": 1.02162504, + "epoch": 0.08272959567112581, + "flos": 69993704568960.0, + "grad_norm": 0.9071425511101782, + "language_loss": 0.62149519, + "learning_rate": 3.970905367556871e-06, + "loss": 0.64230067, + "num_input_tokens_seen": 29321475, + "router_z_loss_clip": 0.04101562, + "router_z_loss_mlp": 0.52734375, + "step": 1376, + "time_per_iteration": 2.991158962249756 + }, + { + "auxiliary_loss_clip": 0.01195866, + "auxiliary_loss_mlp": 0.01069408, + "balance_loss_clip": 1.04624534, + "balance_loss_mlp": 1.05995989, + "epoch": 0.08278971892379378, + "flos": 20413205781120.0, + "grad_norm": 1.9826192893196872, + "language_loss": 0.82601643, + "learning_rate": 3.970839141169718e-06, + "loss": 0.84866917, + "num_input_tokens_seen": 29341405, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.359375, + "step": 1377, + "time_per_iteration": 2.5851728916168213 + }, + { + "auxiliary_loss_clip": 0.01188463, + "auxiliary_loss_mlp": 0.01057538, + "balance_loss_clip": 1.0342443, + "balance_loss_mlp": 1.05601847, + "epoch": 0.08284984217646174, + "flos": 26250233598720.0, + "grad_norm": 1.8760965133588865, + "language_loss": 0.84516692, + "learning_rate": 3.970772840048147e-06, + "loss": 0.86762691, + "num_input_tokens_seen": 29361955, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3203125, + "step": 1378, + "time_per_iteration": 2.500251054763794 + }, + { + "auxiliary_loss_clip": 0.01190024, + "auxiliary_loss_mlp": 0.01062419, + "balance_loss_clip": 1.0383265, + "balance_loss_mlp": 1.05516553, + "epoch": 0.08290996542912972, + "flos": 27194683852800.0, + "grad_norm": 1.9551783234852504, + "language_loss": 0.87725681, + "learning_rate": 3.970706464194672e-06, + "loss": 0.89978123, + "num_input_tokens_seen": 29382395, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3515625, + "step": 1379, + "time_per_iteration": 2.5428385734558105 + }, + { + "auxiliary_loss_clip": 0.01189534, + "auxiliary_loss_mlp": 0.01056049, + "balance_loss_clip": 1.03336358, + "balance_loss_mlp": 1.05776525, + "epoch": 0.08297008868179769, + "flos": 38618191146240.0, + "grad_norm": 1.7573789229703745, + "language_loss": 0.78658688, + "learning_rate": 3.970640013611812e-06, + "loss": 0.80904275, + "num_input_tokens_seen": 29404460, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3125, + "step": 1380, + "time_per_iteration": 2.6350595951080322 + }, + { + "auxiliary_loss_clip": 0.01190411, + "auxiliary_loss_mlp": 0.01061393, + "balance_loss_clip": 1.03666866, + "balance_loss_mlp": 1.05878401, + "epoch": 0.08303021193446565, + "flos": 19974736460160.0, + "grad_norm": 2.2395713763978002, + "language_loss": 0.86146504, + "learning_rate": 3.970573488302083e-06, + "loss": 0.88398302, + "num_input_tokens_seen": 29422675, + "router_z_loss_clip": 0.24707031, + "router_z_loss_mlp": 1.3125, + "step": 1381, + "time_per_iteration": 2.470153331756592 + }, + { + "auxiliary_loss_clip": 0.0119877, + "auxiliary_loss_mlp": 0.01060106, + "balance_loss_clip": 1.03604937, + "balance_loss_mlp": 1.06063581, + "epoch": 0.08309033518713363, + "flos": 13662646341120.0, + "grad_norm": 3.795546136319442, + "language_loss": 0.8817445, + "learning_rate": 3.970506888268011e-06, + "loss": 0.90433335, + "num_input_tokens_seen": 29439840, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3828125, + "step": 1382, + "time_per_iteration": 2.4352822303771973 + }, + { + "auxiliary_loss_clip": 0.01190764, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03728414, + "balance_loss_mlp": 1.0569818, + "epoch": 0.0831504584398016, + "flos": 17968551068160.0, + "grad_norm": 2.6234570747150734, + "language_loss": 0.77606535, + "learning_rate": 3.970440213512121e-06, + "loss": 0.79856908, + "num_input_tokens_seen": 29457360, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.34375, + "step": 1383, + "time_per_iteration": 2.45939040184021 + }, + { + "auxiliary_loss_clip": 0.01194291, + "auxiliary_loss_mlp": 0.01056287, + "balance_loss_clip": 1.03254008, + "balance_loss_mlp": 1.05730414, + "epoch": 0.08321058169246956, + "flos": 22601386408320.0, + "grad_norm": 2.1508484512905945, + "language_loss": 0.8293128, + "learning_rate": 3.97037346403694e-06, + "loss": 0.85181862, + "num_input_tokens_seen": 29477040, + "router_z_loss_clip": 0.23730469, + "router_z_loss_mlp": 1.3671875, + "step": 1384, + "time_per_iteration": 2.4773356914520264 + }, + { + "auxiliary_loss_clip": 0.01198678, + "auxiliary_loss_mlp": 0.01055169, + "balance_loss_clip": 1.02937245, + "balance_loss_mlp": 1.05890989, + "epoch": 0.08327070494513754, + "flos": 22850426378880.0, + "grad_norm": 2.4890613364481893, + "language_loss": 0.84828049, + "learning_rate": 3.970306639845e-06, + "loss": 0.87081897, + "num_input_tokens_seen": 29492010, + "router_z_loss_clip": 0.2578125, + "router_z_loss_mlp": 1.3984375, + "step": 1385, + "time_per_iteration": 2.5084009170532227 + }, + { + "auxiliary_loss_clip": 0.01194904, + "auxiliary_loss_mlp": 0.01066074, + "balance_loss_clip": 1.04257774, + "balance_loss_mlp": 1.05825758, + "epoch": 0.0833308281978055, + "flos": 22782986593920.0, + "grad_norm": 2.123672194513448, + "language_loss": 0.68744183, + "learning_rate": 3.970239740938835e-06, + "loss": 0.7100516, + "num_input_tokens_seen": 29511850, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3671875, + "step": 1386, + "time_per_iteration": 2.477592945098877 + }, + { + "auxiliary_loss_clip": 0.01191265, + "auxiliary_loss_mlp": 0.0105612, + "balance_loss_clip": 1.03186047, + "balance_loss_mlp": 1.05579662, + "epoch": 0.08339095145047347, + "flos": 20812604083200.0, + "grad_norm": 1.7726596290820096, + "language_loss": 0.82067239, + "learning_rate": 3.97017276732098e-06, + "loss": 0.84314626, + "num_input_tokens_seen": 29531415, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.359375, + "step": 1387, + "time_per_iteration": 2.529261350631714 + }, + { + "auxiliary_loss_clip": 0.01196512, + "auxiliary_loss_mlp": 0.0107016, + "balance_loss_clip": 1.04474461, + "balance_loss_mlp": 1.05739772, + "epoch": 0.08345107470314143, + "flos": 18515326872960.0, + "grad_norm": 2.385304875072474, + "language_loss": 0.77194649, + "learning_rate": 3.970105718993978e-06, + "loss": 0.79461324, + "num_input_tokens_seen": 29549525, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.390625, + "step": 1388, + "time_per_iteration": 2.4517693519592285 + }, + { + "auxiliary_loss_clip": 0.01187734, + "auxiliary_loss_mlp": 0.01059717, + "balance_loss_clip": 1.0351125, + "balance_loss_mlp": 1.0574429, + "epoch": 0.08351119795580941, + "flos": 18807567926400.0, + "grad_norm": 2.246368739161805, + "language_loss": 0.79078835, + "learning_rate": 3.970038595960369e-06, + "loss": 0.81326282, + "num_input_tokens_seen": 29568705, + "router_z_loss_clip": 0.24609375, + "router_z_loss_mlp": 1.3046875, + "step": 1389, + "time_per_iteration": 2.4999983310699463 + }, + { + "auxiliary_loss_clip": 0.01194109, + "auxiliary_loss_mlp": 0.01056803, + "balance_loss_clip": 1.03368866, + "balance_loss_mlp": 1.05773938, + "epoch": 0.08357132120847738, + "flos": 18441817689600.0, + "grad_norm": 4.533904477221136, + "language_loss": 0.87495124, + "learning_rate": 3.969971398222699e-06, + "loss": 0.89746046, + "num_input_tokens_seen": 29585855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.359375, + "step": 1390, + "time_per_iteration": 2.438126802444458 + }, + { + "auxiliary_loss_clip": 0.01190886, + "auxiliary_loss_mlp": 0.01063167, + "balance_loss_clip": 1.03902745, + "balance_loss_mlp": 1.05621624, + "epoch": 0.08363144446114534, + "flos": 25922333318400.0, + "grad_norm": 1.6928828016377326, + "language_loss": 0.86753631, + "learning_rate": 3.969904125783517e-06, + "loss": 0.89007682, + "num_input_tokens_seen": 29607280, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.34375, + "step": 1391, + "time_per_iteration": 2.5615429878234863 + }, + { + "auxiliary_loss_clip": 0.01198327, + "auxiliary_loss_mlp": 0.01071606, + "balance_loss_clip": 1.0480268, + "balance_loss_mlp": 1.05904424, + "epoch": 0.08369156771381332, + "flos": 18041306065920.0, + "grad_norm": 4.090701354718017, + "language_loss": 0.87550449, + "learning_rate": 3.969836778645371e-06, + "loss": 0.89820385, + "num_input_tokens_seen": 29624130, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.390625, + "step": 1392, + "time_per_iteration": 2.4343698024749756 + }, + { + "auxiliary_loss_clip": 0.01190277, + "auxiliary_loss_mlp": 0.01060815, + "balance_loss_clip": 1.03682983, + "balance_loss_mlp": 1.05556941, + "epoch": 0.08375169096648129, + "flos": 22675111073280.0, + "grad_norm": 2.9857894096842457, + "language_loss": 0.80519998, + "learning_rate": 3.969769356810819e-06, + "loss": 0.82771087, + "num_input_tokens_seen": 29643210, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1393, + "time_per_iteration": 3.9978342056274414 + }, + { + "auxiliary_loss_clip": 0.01191931, + "auxiliary_loss_mlp": 0.01054176, + "balance_loss_clip": 1.03098941, + "balance_loss_mlp": 1.05832088, + "epoch": 0.08381181421914925, + "flos": 26103215232000.0, + "grad_norm": 1.8413427873168604, + "language_loss": 0.84738398, + "learning_rate": 3.969701860282415e-06, + "loss": 0.86984503, + "num_input_tokens_seen": 29663920, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3359375, + "step": 1394, + "time_per_iteration": 3.995389461517334 + }, + { + "auxiliary_loss_clip": 0.01193271, + "auxiliary_loss_mlp": 0.010537, + "balance_loss_clip": 1.0296433, + "balance_loss_mlp": 1.05856824, + "epoch": 0.08387193747181723, + "flos": 20629782835200.0, + "grad_norm": 1.7688902284368797, + "language_loss": 0.82957625, + "learning_rate": 3.969634289062719e-06, + "loss": 0.85204601, + "num_input_tokens_seen": 29683825, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.34375, + "step": 1395, + "time_per_iteration": 2.5080416202545166 + }, + { + "auxiliary_loss_clip": 0.01194811, + "auxiliary_loss_mlp": 0.01062467, + "balance_loss_clip": 1.03683722, + "balance_loss_mlp": 1.05833054, + "epoch": 0.0839320607244852, + "flos": 13443196199040.0, + "grad_norm": 1.9626395114639965, + "language_loss": 0.82492781, + "learning_rate": 3.969566643154293e-06, + "loss": 0.84750068, + "num_input_tokens_seen": 29698775, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3671875, + "step": 1396, + "time_per_iteration": 2.51763653755188 + }, + { + "auxiliary_loss_clip": 0.01191589, + "auxiliary_loss_mlp": 0.01058769, + "balance_loss_clip": 1.03253114, + "balance_loss_mlp": 1.05944824, + "epoch": 0.08399218397715316, + "flos": 23477247642240.0, + "grad_norm": 2.3756879295671367, + "language_loss": 0.7702114, + "learning_rate": 3.969498922559703e-06, + "loss": 0.79271495, + "num_input_tokens_seen": 29719430, + "router_z_loss_clip": 0.26171875, + "router_z_loss_mlp": 1.3203125, + "step": 1397, + "time_per_iteration": 2.522019624710083 + }, + { + "auxiliary_loss_clip": 0.01191257, + "auxiliary_loss_mlp": 0.01050826, + "balance_loss_clip": 1.02635193, + "balance_loss_mlp": 1.05688787, + "epoch": 0.08405230722982113, + "flos": 25920717206400.0, + "grad_norm": 2.1333990758799795, + "language_loss": 0.77589226, + "learning_rate": 3.969431127281516e-06, + "loss": 0.79831308, + "num_input_tokens_seen": 29739685, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.34375, + "step": 1398, + "time_per_iteration": 2.499405860900879 + }, + { + "auxiliary_loss_clip": 0.01187104, + "auxiliary_loss_mlp": 0.01057261, + "balance_loss_clip": 1.03366995, + "balance_loss_mlp": 1.05604136, + "epoch": 0.0841124304824891, + "flos": 17967437746560.0, + "grad_norm": 6.547707007931562, + "language_loss": 0.94411373, + "learning_rate": 3.969363257322304e-06, + "loss": 0.96655744, + "num_input_tokens_seen": 29756165, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.3125, + "step": 1399, + "time_per_iteration": 2.458564043045044 + }, + { + "auxiliary_loss_clip": 0.01192876, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03585184, + "balance_loss_mlp": 1.05564523, + "epoch": 0.08417255373515707, + "flos": 25629661301760.0, + "grad_norm": 2.3313569082148637, + "language_loss": 0.82052553, + "learning_rate": 3.96929531268464e-06, + "loss": 0.84306407, + "num_input_tokens_seen": 29776425, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1400, + "time_per_iteration": 2.511075258255005 + }, + { + "auxiliary_loss_clip": 0.01191821, + "auxiliary_loss_mlp": 0.01061122, + "balance_loss_clip": 1.03713727, + "balance_loss_mlp": 1.05681479, + "epoch": 0.08423267698782504, + "flos": 26249730808320.0, + "grad_norm": 3.6029570836648723, + "language_loss": 0.86615682, + "learning_rate": 3.969227293371099e-06, + "loss": 0.8886863, + "num_input_tokens_seen": 29796440, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3515625, + "step": 1401, + "time_per_iteration": 2.5328855514526367 + }, + { + "auxiliary_loss_clip": 0.01190636, + "auxiliary_loss_mlp": 0.01063749, + "balance_loss_clip": 1.0383811, + "balance_loss_mlp": 1.05496573, + "epoch": 0.08429280024049302, + "flos": 20119707751680.0, + "grad_norm": 2.2778357332658543, + "language_loss": 0.87128234, + "learning_rate": 3.969159199384263e-06, + "loss": 0.89382625, + "num_input_tokens_seen": 29814755, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1402, + "time_per_iteration": 2.4695520401000977 + }, + { + "auxiliary_loss_clip": 0.0118725, + "auxiliary_loss_mlp": 0.01056626, + "balance_loss_clip": 1.03340352, + "balance_loss_mlp": 1.0542388, + "epoch": 0.08435292349316098, + "flos": 42924526836480.0, + "grad_norm": 2.954964391273458, + "language_loss": 0.88680542, + "learning_rate": 3.9690910307267125e-06, + "loss": 0.90924418, + "num_input_tokens_seen": 29834785, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.328125, + "step": 1403, + "time_per_iteration": 2.6655161380767822 + }, + { + "auxiliary_loss_clip": 0.01189559, + "auxiliary_loss_mlp": 0.01056388, + "balance_loss_clip": 1.03105569, + "balance_loss_mlp": 1.05429792, + "epoch": 0.08441304674582895, + "flos": 22857285876480.0, + "grad_norm": 1.9645692036725415, + "language_loss": 0.80325729, + "learning_rate": 3.969022787401033e-06, + "loss": 0.82571673, + "num_input_tokens_seen": 29854695, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.359375, + "step": 1404, + "time_per_iteration": 2.5011603832244873 + }, + { + "auxiliary_loss_clip": 0.01195719, + "auxiliary_loss_mlp": 0.01066072, + "balance_loss_clip": 1.04089534, + "balance_loss_mlp": 1.05798006, + "epoch": 0.08447316999849692, + "flos": 18697501676160.0, + "grad_norm": 2.1059643070764027, + "language_loss": 0.83845061, + "learning_rate": 3.968954469409811e-06, + "loss": 0.86106849, + "num_input_tokens_seen": 29872180, + "router_z_loss_clip": 0.25195312, + "router_z_loss_mlp": 1.375, + "step": 1405, + "time_per_iteration": 2.4612858295440674 + }, + { + "auxiliary_loss_clip": 0.01188265, + "auxiliary_loss_mlp": 0.01056168, + "balance_loss_clip": 1.03314888, + "balance_loss_mlp": 1.05381966, + "epoch": 0.08453329325116489, + "flos": 25483971738240.0, + "grad_norm": 1.7581309060245893, + "language_loss": 0.80343008, + "learning_rate": 3.968886076755639e-06, + "loss": 0.82587439, + "num_input_tokens_seen": 29893205, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.34375, + "step": 1406, + "time_per_iteration": 2.496676206588745 + }, + { + "auxiliary_loss_clip": 0.01192497, + "auxiliary_loss_mlp": 0.01065969, + "balance_loss_clip": 1.0421989, + "balance_loss_mlp": 1.05858994, + "epoch": 0.08459341650383286, + "flos": 20920048640640.0, + "grad_norm": 1.8241253914082192, + "language_loss": 0.79411483, + "learning_rate": 3.96881760944111e-06, + "loss": 0.8166995, + "num_input_tokens_seen": 29911970, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3359375, + "step": 1407, + "time_per_iteration": 2.491055727005005 + }, + { + "auxiliary_loss_clip": 0.01188371, + "auxiliary_loss_mlp": 0.01055807, + "balance_loss_clip": 1.03234673, + "balance_loss_mlp": 1.05521655, + "epoch": 0.08465353975650082, + "flos": 13043079624960.0, + "grad_norm": 4.541456574357825, + "language_loss": 0.91929626, + "learning_rate": 3.968749067468819e-06, + "loss": 0.94173807, + "num_input_tokens_seen": 29929925, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.328125, + "step": 1408, + "time_per_iteration": 2.44599986076355 + }, + { + "auxiliary_loss_clip": 0.01074059, + "auxiliary_loss_mlp": 0.01025014, + "balance_loss_clip": 1.02074611, + "balance_loss_mlp": 1.02193737, + "epoch": 0.0847136630091688, + "flos": 60877422552960.0, + "grad_norm": 0.8980094129226197, + "language_loss": 0.61861706, + "learning_rate": 3.968680450841368e-06, + "loss": 0.63960779, + "num_input_tokens_seen": 29985950, + "router_z_loss_clip": 0.04272461, + "router_z_loss_mlp": 0.5234375, + "step": 1409, + "time_per_iteration": 3.1084799766540527 + }, + { + "auxiliary_loss_clip": 0.01180993, + "auxiliary_loss_mlp": 0.01060196, + "balance_loss_clip": 1.03784466, + "balance_loss_mlp": 1.05419254, + "epoch": 0.08477378626183676, + "flos": 22046530043520.0, + "grad_norm": 2.25814404402445, + "language_loss": 0.86819237, + "learning_rate": 3.968611759561355e-06, + "loss": 0.89060426, + "num_input_tokens_seen": 30004330, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.265625, + "step": 1410, + "time_per_iteration": 2.4854791164398193 + }, + { + "auxiliary_loss_clip": 0.01188551, + "auxiliary_loss_mlp": 0.01056537, + "balance_loss_clip": 1.0309782, + "balance_loss_mlp": 1.05453801, + "epoch": 0.08483390951450473, + "flos": 16690059308160.0, + "grad_norm": 2.048224684561652, + "language_loss": 0.74138093, + "learning_rate": 3.968542993631388e-06, + "loss": 0.76383173, + "num_input_tokens_seen": 30022555, + "router_z_loss_clip": 0.25585938, + "router_z_loss_mlp": 1.3359375, + "step": 1411, + "time_per_iteration": 2.484879970550537 + }, + { + "auxiliary_loss_clip": 0.01068033, + "auxiliary_loss_mlp": 0.01005767, + "balance_loss_clip": 1.00169039, + "balance_loss_mlp": 1.01640451, + "epoch": 0.08489403276717271, + "flos": 51584640082560.0, + "grad_norm": 0.9041737870208939, + "language_loss": 0.56723791, + "learning_rate": 3.968474153054073e-06, + "loss": 0.58797586, + "num_input_tokens_seen": 30077220, + "router_z_loss_clip": 0.04077148, + "router_z_loss_mlp": 0.515625, + "step": 1412, + "time_per_iteration": 3.003227949142456 + }, + { + "auxiliary_loss_clip": 0.01183878, + "auxiliary_loss_mlp": 0.01062107, + "balance_loss_clip": 1.03855133, + "balance_loss_mlp": 1.05354273, + "epoch": 0.08495415601984067, + "flos": 17092330698240.0, + "grad_norm": 2.0338814511208883, + "language_loss": 0.89084172, + "learning_rate": 3.96840523783202e-06, + "loss": 0.91330159, + "num_input_tokens_seen": 30094600, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.3046875, + "step": 1413, + "time_per_iteration": 2.4545698165893555 + }, + { + "auxiliary_loss_clip": 0.01186591, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.03019929, + "balance_loss_mlp": 1.0562067, + "epoch": 0.08501427927250864, + "flos": 23148413608320.0, + "grad_norm": 2.1859301398641415, + "language_loss": 0.8807795, + "learning_rate": 3.968336247967844e-06, + "loss": 0.90319026, + "num_input_tokens_seen": 30114475, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3046875, + "step": 1414, + "time_per_iteration": 2.4803147315979004 + }, + { + "auxiliary_loss_clip": 0.01185784, + "auxiliary_loss_mlp": 0.01056984, + "balance_loss_clip": 1.03497767, + "balance_loss_mlp": 1.0540117, + "epoch": 0.08507440252517662, + "flos": 19063467394560.0, + "grad_norm": 1.82577143383273, + "language_loss": 0.77434587, + "learning_rate": 3.96826718346416e-06, + "loss": 0.79677355, + "num_input_tokens_seen": 30133350, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.3125, + "step": 1415, + "time_per_iteration": 2.510671615600586 + }, + { + "auxiliary_loss_clip": 0.01185616, + "auxiliary_loss_mlp": 0.010657, + "balance_loss_clip": 1.0441227, + "balance_loss_mlp": 1.05612898, + "epoch": 0.08513452577784458, + "flos": 60182296600320.0, + "grad_norm": 1.848223104879299, + "language_loss": 0.70859981, + "learning_rate": 3.968198044323587e-06, + "loss": 0.73111296, + "num_input_tokens_seen": 30159005, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.296875, + "step": 1416, + "time_per_iteration": 2.827016592025757 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.0106124, + "balance_loss_clip": 1.03587198, + "balance_loss_mlp": 1.05693281, + "epoch": 0.08519464903051255, + "flos": 27308485117440.0, + "grad_norm": 1.9370001986884609, + "language_loss": 0.74855268, + "learning_rate": 3.968128830548748e-06, + "loss": 0.77108514, + "num_input_tokens_seen": 30179450, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1417, + "time_per_iteration": 2.51518177986145 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.01055419, + "balance_loss_clip": 1.03157723, + "balance_loss_mlp": 1.05394006, + "epoch": 0.08525477228318051, + "flos": 20266438809600.0, + "grad_norm": 2.566029486363868, + "language_loss": 0.82460356, + "learning_rate": 3.968059542142265e-06, + "loss": 0.84700227, + "num_input_tokens_seen": 30197235, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.3046875, + "step": 1418, + "time_per_iteration": 2.4632515907287598 + }, + { + "auxiliary_loss_clip": 0.01078096, + "auxiliary_loss_mlp": 0.01026146, + "balance_loss_clip": 1.02221191, + "balance_loss_mlp": 1.0269177, + "epoch": 0.08531489553584849, + "flos": 67615017183360.0, + "grad_norm": 0.8662062784105238, + "language_loss": 0.56616145, + "learning_rate": 3.9679901791067685e-06, + "loss": 0.58720386, + "num_input_tokens_seen": 30257410, + "router_z_loss_clip": 0.03930664, + "router_z_loss_mlp": 0.51171875, + "step": 1419, + "time_per_iteration": 3.0262646675109863 + }, + { + "auxiliary_loss_clip": 0.01185611, + "auxiliary_loss_mlp": 0.01062944, + "balance_loss_clip": 1.03858972, + "balance_loss_mlp": 1.05284262, + "epoch": 0.08537501878851646, + "flos": 27526965592320.0, + "grad_norm": 2.301787344693911, + "language_loss": 0.69764268, + "learning_rate": 3.967920741444886e-06, + "loss": 0.72012818, + "num_input_tokens_seen": 30277865, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.328125, + "step": 1420, + "time_per_iteration": 2.5173370838165283 + }, + { + "auxiliary_loss_clip": 0.01182824, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02912498, + "balance_loss_mlp": 1.05232763, + "epoch": 0.08543514204118442, + "flos": 22784243569920.0, + "grad_norm": 1.56579546013663, + "language_loss": 0.87886292, + "learning_rate": 3.967851229159252e-06, + "loss": 0.90121067, + "num_input_tokens_seen": 30298545, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1421, + "time_per_iteration": 2.498198986053467 + }, + { + "auxiliary_loss_clip": 0.01069987, + "auxiliary_loss_mlp": 0.01004015, + "balance_loss_clip": 1.00034332, + "balance_loss_mlp": 1.01909983, + "epoch": 0.0854952652938524, + "flos": 60990721027200.0, + "grad_norm": 0.7935144939089421, + "language_loss": 0.63490081, + "learning_rate": 3.967781642252502e-06, + "loss": 0.65564084, + "num_input_tokens_seen": 30361725, + "router_z_loss_clip": 0.03662109, + "router_z_loss_mlp": 0.5078125, + "step": 1422, + "time_per_iteration": 3.050874948501587 + }, + { + "auxiliary_loss_clip": 0.01182797, + "auxiliary_loss_mlp": 0.01065038, + "balance_loss_clip": 1.04182768, + "balance_loss_mlp": 1.05538559, + "epoch": 0.08555538854652037, + "flos": 28038046256640.0, + "grad_norm": 2.040119561169685, + "language_loss": 0.83427018, + "learning_rate": 3.967711980727276e-06, + "loss": 0.85674852, + "num_input_tokens_seen": 30382180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1423, + "time_per_iteration": 2.525075674057007 + }, + { + "auxiliary_loss_clip": 0.01190455, + "auxiliary_loss_mlp": 0.01059451, + "balance_loss_clip": 1.0365268, + "balance_loss_mlp": 1.05613029, + "epoch": 0.08561551179918833, + "flos": 23509279595520.0, + "grad_norm": 1.7627385415604107, + "language_loss": 0.74945033, + "learning_rate": 3.967642244586213e-06, + "loss": 0.77194929, + "num_input_tokens_seen": 30402980, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.34375, + "step": 1424, + "time_per_iteration": 2.523231029510498 + }, + { + "auxiliary_loss_clip": 0.01185893, + "auxiliary_loss_mlp": 0.01056266, + "balance_loss_clip": 1.03307986, + "balance_loss_mlp": 1.05510807, + "epoch": 0.08567563505185631, + "flos": 17926930183680.0, + "grad_norm": 1.9395290082560723, + "language_loss": 0.7574805, + "learning_rate": 3.96757243383196e-06, + "loss": 0.7799021, + "num_input_tokens_seen": 30420800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1425, + "time_per_iteration": 2.441420793533325 + }, + { + "auxiliary_loss_clip": 0.01183386, + "auxiliary_loss_mlp": 0.01053965, + "balance_loss_clip": 1.03092194, + "balance_loss_mlp": 1.05407834, + "epoch": 0.08573575830452428, + "flos": 19719519350400.0, + "grad_norm": 2.579491371045568, + "language_loss": 0.93504989, + "learning_rate": 3.9675025484671624e-06, + "loss": 0.95742333, + "num_input_tokens_seen": 30439620, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1426, + "time_per_iteration": 2.4703657627105713 + }, + { + "auxiliary_loss_clip": 0.0119154, + "auxiliary_loss_mlp": 0.01066598, + "balance_loss_clip": 1.04115915, + "balance_loss_mlp": 1.05764198, + "epoch": 0.08579588155719224, + "flos": 17931563038080.0, + "grad_norm": 2.235647808517122, + "language_loss": 0.75003266, + "learning_rate": 3.967432588494471e-06, + "loss": 0.772614, + "num_input_tokens_seen": 30457300, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.34375, + "step": 1427, + "time_per_iteration": 2.4430549144744873 + }, + { + "auxiliary_loss_clip": 0.01182417, + "auxiliary_loss_mlp": 0.01061112, + "balance_loss_clip": 1.03907049, + "balance_loss_mlp": 1.05315089, + "epoch": 0.08585600480986022, + "flos": 16033324993920.0, + "grad_norm": 2.3372587699614726, + "language_loss": 0.81915152, + "learning_rate": 3.96736255391654e-06, + "loss": 0.84158677, + "num_input_tokens_seen": 30471580, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1428, + "time_per_iteration": 2.454441785812378 + }, + { + "auxiliary_loss_clip": 0.01189987, + "auxiliary_loss_mlp": 0.01066735, + "balance_loss_clip": 1.04258347, + "balance_loss_mlp": 1.05586076, + "epoch": 0.08591612806252819, + "flos": 28657433404800.0, + "grad_norm": 2.395570851050941, + "language_loss": 0.79697371, + "learning_rate": 3.967292444736023e-06, + "loss": 0.81954098, + "num_input_tokens_seen": 30492720, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.34375, + "step": 1429, + "time_per_iteration": 2.5411579608917236 + }, + { + "auxiliary_loss_clip": 0.0119024, + "auxiliary_loss_mlp": 0.01062326, + "balance_loss_clip": 1.03952122, + "balance_loss_mlp": 1.05773449, + "epoch": 0.08597625131519615, + "flos": 20959119659520.0, + "grad_norm": 2.301464625204156, + "language_loss": 0.88055587, + "learning_rate": 3.967222260955578e-06, + "loss": 0.90308148, + "num_input_tokens_seen": 30509535, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.328125, + "step": 1430, + "time_per_iteration": 2.497523546218872 + }, + { + "auxiliary_loss_clip": 0.01184535, + "auxiliary_loss_mlp": 0.01072949, + "balance_loss_clip": 1.04995334, + "balance_loss_mlp": 1.05712664, + "epoch": 0.08603637456786412, + "flos": 23256360956160.0, + "grad_norm": 1.7504719201320615, + "language_loss": 0.81914723, + "learning_rate": 3.96715200257787e-06, + "loss": 0.84172201, + "num_input_tokens_seen": 30529490, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2734375, + "step": 1431, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01184756, + "auxiliary_loss_mlp": 0.01056491, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.05376828, + "epoch": 0.0860964978205321, + "flos": 28694170039680.0, + "grad_norm": 1.9949655353101803, + "language_loss": 0.77759397, + "learning_rate": 3.967081669605559e-06, + "loss": 0.80000651, + "num_input_tokens_seen": 30550205, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1432, + "time_per_iteration": 2.5344104766845703 + }, + { + "auxiliary_loss_clip": 0.0118072, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.03497803, + "balance_loss_mlp": 1.05027151, + "epoch": 0.08615662107320006, + "flos": 19318397195520.0, + "grad_norm": 2.2873036973179603, + "language_loss": 0.73330259, + "learning_rate": 3.967011262041315e-06, + "loss": 0.75570011, + "num_input_tokens_seen": 30568830, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3046875, + "step": 1433, + "time_per_iteration": 2.4787938594818115 + }, + { + "auxiliary_loss_clip": 0.01188497, + "auxiliary_loss_mlp": 0.01058804, + "balance_loss_clip": 1.03375793, + "balance_loss_mlp": 1.05464733, + "epoch": 0.08621674432586802, + "flos": 15851688894720.0, + "grad_norm": 2.615593579271415, + "language_loss": 0.85741955, + "learning_rate": 3.9669407798878065e-06, + "loss": 0.87989259, + "num_input_tokens_seen": 30585730, + "router_z_loss_clip": 0.25, + "router_z_loss_mlp": 1.3359375, + "step": 1434, + "time_per_iteration": 5.500946998596191 + }, + { + "auxiliary_loss_clip": 0.01182383, + "auxiliary_loss_mlp": 0.01054521, + "balance_loss_clip": 1.03139436, + "balance_loss_mlp": 1.05177212, + "epoch": 0.086276867578536, + "flos": 14100648785280.0, + "grad_norm": 3.0513138823403825, + "language_loss": 0.78913063, + "learning_rate": 3.966870223147707e-06, + "loss": 0.81149966, + "num_input_tokens_seen": 30603180, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1435, + "time_per_iteration": 3.899777412414551 + }, + { + "auxiliary_loss_clip": 0.01070575, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.02428555, + "balance_loss_mlp": 1.02010655, + "epoch": 0.08633699083120397, + "flos": 70184857772160.0, + "grad_norm": 0.8910926846424677, + "language_loss": 0.57930011, + "learning_rate": 3.96679959182369e-06, + "loss": 0.60028332, + "num_input_tokens_seen": 30668895, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.5078125, + "step": 1436, + "time_per_iteration": 3.179255247116089 + }, + { + "auxiliary_loss_clip": 0.01186059, + "auxiliary_loss_mlp": 0.01049386, + "balance_loss_clip": 1.02633083, + "balance_loss_mlp": 1.05314159, + "epoch": 0.08639711408387193, + "flos": 30298874140800.0, + "grad_norm": 2.429993259280604, + "language_loss": 0.68775386, + "learning_rate": 3.966728885918437e-06, + "loss": 0.71010828, + "num_input_tokens_seen": 30688955, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.328125, + "step": 1437, + "time_per_iteration": 2.529339551925659 + }, + { + "auxiliary_loss_clip": 0.01185365, + "auxiliary_loss_mlp": 0.01050306, + "balance_loss_clip": 1.02806163, + "balance_loss_mlp": 1.05388093, + "epoch": 0.08645723733653991, + "flos": 20297680663680.0, + "grad_norm": 2.5641138848438163, + "language_loss": 0.7274068, + "learning_rate": 3.966658105434627e-06, + "loss": 0.74976349, + "num_input_tokens_seen": 30706095, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.3125, + "step": 1438, + "time_per_iteration": 2.4840176105499268 + }, + { + "auxiliary_loss_clip": 0.01183596, + "auxiliary_loss_mlp": 0.01049035, + "balance_loss_clip": 1.02594447, + "balance_loss_mlp": 1.05472374, + "epoch": 0.08651736058920788, + "flos": 32890583134080.0, + "grad_norm": 1.681614476681305, + "language_loss": 0.64628494, + "learning_rate": 3.966587250374945e-06, + "loss": 0.66861117, + "num_input_tokens_seen": 30729025, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2890625, + "step": 1439, + "time_per_iteration": 2.61686372756958 + }, + { + "auxiliary_loss_clip": 0.01187197, + "auxiliary_loss_mlp": 0.01055218, + "balance_loss_clip": 1.03131676, + "balance_loss_mlp": 1.05638909, + "epoch": 0.08657748384187584, + "flos": 22637368857600.0, + "grad_norm": 2.062065757985673, + "language_loss": 0.87748063, + "learning_rate": 3.966516320742077e-06, + "loss": 0.89990479, + "num_input_tokens_seen": 30746155, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.3125, + "step": 1440, + "time_per_iteration": 2.5116493701934814 + }, + { + "auxiliary_loss_clip": 0.01188419, + "auxiliary_loss_mlp": 0.01059749, + "balance_loss_clip": 1.03538251, + "balance_loss_mlp": 1.0540843, + "epoch": 0.08663760709454381, + "flos": 23658380951040.0, + "grad_norm": 2.4102507257620363, + "language_loss": 0.83243793, + "learning_rate": 3.9664453165387124e-06, + "loss": 0.85491961, + "num_input_tokens_seen": 30761410, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.34375, + "step": 1441, + "time_per_iteration": 2.5058300495147705 + }, + { + "auxiliary_loss_clip": 0.01070024, + "auxiliary_loss_mlp": 0.01009256, + "balance_loss_clip": 1.00525022, + "balance_loss_mlp": 1.01939523, + "epoch": 0.08669773034721179, + "flos": 62686564911360.0, + "grad_norm": 0.8461220926791603, + "language_loss": 0.60426581, + "learning_rate": 3.966374237767545e-06, + "loss": 0.62505859, + "num_input_tokens_seen": 30823010, + "router_z_loss_clip": 0.04003906, + "router_z_loss_mlp": 0.5078125, + "step": 1442, + "time_per_iteration": 3.1946628093719482 + }, + { + "auxiliary_loss_clip": 0.01192002, + "auxiliary_loss_mlp": 0.01057232, + "balance_loss_clip": 1.03379524, + "balance_loss_mlp": 1.05709028, + "epoch": 0.08675785359987975, + "flos": 20667489137280.0, + "grad_norm": 3.2809405592870835, + "language_loss": 0.79264277, + "learning_rate": 3.96630308443127e-06, + "loss": 0.81513512, + "num_input_tokens_seen": 30841980, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.3515625, + "step": 1443, + "time_per_iteration": 2.477691411972046 + }, + { + "auxiliary_loss_clip": 0.01185255, + "auxiliary_loss_mlp": 0.01054103, + "balance_loss_clip": 1.03170311, + "balance_loss_mlp": 1.05261874, + "epoch": 0.08681797685254772, + "flos": 26941118768640.0, + "grad_norm": 1.764762918327591, + "language_loss": 0.82248437, + "learning_rate": 3.966231856532584e-06, + "loss": 0.8448779, + "num_input_tokens_seen": 30863280, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1444, + "time_per_iteration": 2.584773063659668 + }, + { + "auxiliary_loss_clip": 0.01189581, + "auxiliary_loss_mlp": 0.01049918, + "balance_loss_clip": 1.02745867, + "balance_loss_mlp": 1.05537939, + "epoch": 0.0868781001052157, + "flos": 17712831168000.0, + "grad_norm": 1.945627197742621, + "language_loss": 0.86856627, + "learning_rate": 3.966160554074189e-06, + "loss": 0.89096129, + "num_input_tokens_seen": 30881710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.34375, + "step": 1445, + "time_per_iteration": 2.506258964538574 + }, + { + "auxiliary_loss_clip": 0.01189413, + "auxiliary_loss_mlp": 0.01054326, + "balance_loss_clip": 1.03303528, + "balance_loss_mlp": 1.05808067, + "epoch": 0.08693822335788366, + "flos": 19896522595200.0, + "grad_norm": 1.9763924186655837, + "language_loss": 0.81639445, + "learning_rate": 3.96608917705879e-06, + "loss": 0.8388319, + "num_input_tokens_seen": 30900225, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.3125, + "step": 1446, + "time_per_iteration": 2.531313180923462 + }, + { + "auxiliary_loss_clip": 0.010647, + "auxiliary_loss_mlp": 0.01005416, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.0147202, + "epoch": 0.08699834661055163, + "flos": 67023747406080.0, + "grad_norm": 0.728477241136595, + "language_loss": 0.54725462, + "learning_rate": 3.966017725489091e-06, + "loss": 0.56795579, + "num_input_tokens_seen": 30959580, + "router_z_loss_clip": 0.03369141, + "router_z_loss_mlp": 0.5, + "step": 1447, + "time_per_iteration": 3.1009976863861084 + }, + { + "auxiliary_loss_clip": 0.01178637, + "auxiliary_loss_mlp": 0.01052877, + "balance_loss_clip": 1.03104973, + "balance_loss_mlp": 1.05198455, + "epoch": 0.0870584698632196, + "flos": 13480507451520.0, + "grad_norm": 2.2332818090387243, + "language_loss": 0.84593046, + "learning_rate": 3.965946199367804e-06, + "loss": 0.8682456, + "num_input_tokens_seen": 30976775, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1448, + "time_per_iteration": 2.483792543411255 + }, + { + "auxiliary_loss_clip": 0.01185215, + "auxiliary_loss_mlp": 0.01056358, + "balance_loss_clip": 1.03386295, + "balance_loss_mlp": 1.0524509, + "epoch": 0.08711859311588757, + "flos": 16107013745280.0, + "grad_norm": 3.099884448391289, + "language_loss": 0.80688727, + "learning_rate": 3.965874598697638e-06, + "loss": 0.82930297, + "num_input_tokens_seen": 30990495, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.328125, + "step": 1449, + "time_per_iteration": 2.4637081623077393 + }, + { + "auxiliary_loss_clip": 0.01182046, + "auxiliary_loss_mlp": 0.01050023, + "balance_loss_clip": 1.02862501, + "balance_loss_mlp": 1.05370414, + "epoch": 0.08717871636855554, + "flos": 38472357928320.0, + "grad_norm": 4.183651889411507, + "language_loss": 0.71012592, + "learning_rate": 3.965802923481313e-06, + "loss": 0.73244655, + "num_input_tokens_seen": 31014080, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1450, + "time_per_iteration": 2.6521542072296143 + }, + { + "auxiliary_loss_clip": 0.0118314, + "auxiliary_loss_mlp": 0.01053244, + "balance_loss_clip": 1.03057098, + "balance_loss_mlp": 1.05502534, + "epoch": 0.0872388396212235, + "flos": 17600574188160.0, + "grad_norm": 1.8266796466048172, + "language_loss": 0.83492875, + "learning_rate": 3.965731173721542e-06, + "loss": 0.85729253, + "num_input_tokens_seen": 31031210, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1451, + "time_per_iteration": 2.4866271018981934 + }, + { + "auxiliary_loss_clip": 0.01181156, + "auxiliary_loss_mlp": 0.01057138, + "balance_loss_clip": 1.03538203, + "balance_loss_mlp": 1.05371869, + "epoch": 0.08729896287389148, + "flos": 25259385951360.0, + "grad_norm": 1.850339391564711, + "language_loss": 0.74351519, + "learning_rate": 3.965659349421049e-06, + "loss": 0.76589811, + "num_input_tokens_seen": 31049710, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2734375, + "step": 1452, + "time_per_iteration": 2.5450925827026367 + }, + { + "auxiliary_loss_clip": 0.01182798, + "auxiliary_loss_mlp": 0.01061481, + "balance_loss_clip": 1.03840256, + "balance_loss_mlp": 1.05121017, + "epoch": 0.08735908612655945, + "flos": 15632454234240.0, + "grad_norm": 3.3421371051734474, + "language_loss": 0.79840016, + "learning_rate": 3.965587450582556e-06, + "loss": 0.82084292, + "num_input_tokens_seen": 31066160, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1453, + "time_per_iteration": 2.49350643157959 + }, + { + "auxiliary_loss_clip": 0.01182604, + "auxiliary_loss_mlp": 0.0106489, + "balance_loss_clip": 1.04213262, + "balance_loss_mlp": 1.0545752, + "epoch": 0.08741920937922741, + "flos": 20339660684160.0, + "grad_norm": 1.982640213979625, + "language_loss": 0.71298045, + "learning_rate": 3.96551547720879e-06, + "loss": 0.73545539, + "num_input_tokens_seen": 31085270, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.28125, + "step": 1454, + "time_per_iteration": 2.534457206726074 + }, + { + "auxiliary_loss_clip": 0.01070391, + "auxiliary_loss_mlp": 0.01029091, + "balance_loss_clip": 1.02608728, + "balance_loss_mlp": 1.02026677, + "epoch": 0.08747933263189539, + "flos": 62819795433600.0, + "grad_norm": 0.7993884765543664, + "language_loss": 0.58655661, + "learning_rate": 3.96544342930248e-06, + "loss": 0.6075514, + "num_input_tokens_seen": 31148445, + "router_z_loss_clip": 0.0300293, + "router_z_loss_mlp": 0.5, + "step": 1455, + "time_per_iteration": 3.088113307952881 + }, + { + "auxiliary_loss_clip": 0.01182632, + "auxiliary_loss_mlp": 0.01058901, + "balance_loss_clip": 1.03638279, + "balance_loss_mlp": 1.05210626, + "epoch": 0.08753945588456336, + "flos": 33035877648000.0, + "grad_norm": 1.5590098662562957, + "language_loss": 0.77404714, + "learning_rate": 3.965371306866359e-06, + "loss": 0.79646254, + "num_input_tokens_seen": 31168770, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3046875, + "step": 1456, + "time_per_iteration": 2.6145191192626953 + }, + { + "auxiliary_loss_clip": 0.01182283, + "auxiliary_loss_mlp": 0.01051585, + "balance_loss_clip": 1.02888715, + "balance_loss_mlp": 1.05235434, + "epoch": 0.08759957913723132, + "flos": 35547182046720.0, + "grad_norm": 2.3657198267749777, + "language_loss": 0.72391665, + "learning_rate": 3.96529910990316e-06, + "loss": 0.74625528, + "num_input_tokens_seen": 31189270, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1457, + "time_per_iteration": 2.6438605785369873 + }, + { + "auxiliary_loss_clip": 0.01179054, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.02623844, + "balance_loss_mlp": 1.05207849, + "epoch": 0.0876597023898993, + "flos": 23911120022400.0, + "grad_norm": 1.5929331180335078, + "language_loss": 0.86215973, + "learning_rate": 3.965226838415622e-06, + "loss": 0.88442671, + "num_input_tokens_seen": 31210385, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1458, + "time_per_iteration": 2.539658546447754 + }, + { + "auxiliary_loss_clip": 0.01189161, + "auxiliary_loss_mlp": 0.01059801, + "balance_loss_clip": 1.03694844, + "balance_loss_mlp": 1.05887103, + "epoch": 0.08771982564256726, + "flos": 18114025150080.0, + "grad_norm": 1.660016084678777, + "language_loss": 0.80662763, + "learning_rate": 3.965154492406486e-06, + "loss": 0.8291173, + "num_input_tokens_seen": 31229745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3046875, + "step": 1459, + "time_per_iteration": 2.4880902767181396 + }, + { + "auxiliary_loss_clip": 0.01187526, + "auxiliary_loss_mlp": 0.01054149, + "balance_loss_clip": 1.03057003, + "balance_loss_mlp": 1.05512893, + "epoch": 0.08777994889523523, + "flos": 17712005155200.0, + "grad_norm": 2.474003232718447, + "language_loss": 0.84058738, + "learning_rate": 3.9650820718784945e-06, + "loss": 0.86300415, + "num_input_tokens_seen": 31248280, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.328125, + "step": 1460, + "time_per_iteration": 2.4644060134887695 + }, + { + "auxiliary_loss_clip": 0.01178547, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03287745, + "balance_loss_mlp": 1.05051732, + "epoch": 0.0878400721479032, + "flos": 12819930382080.0, + "grad_norm": 2.696872821623283, + "language_loss": 0.81030595, + "learning_rate": 3.965009576834394e-06, + "loss": 0.83263445, + "num_input_tokens_seen": 31262190, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.28125, + "step": 1461, + "time_per_iteration": 2.4456100463867188 + }, + { + "auxiliary_loss_clip": 0.01187345, + "auxiliary_loss_mlp": 0.01059901, + "balance_loss_clip": 1.03795433, + "balance_loss_mlp": 1.05579305, + "epoch": 0.08790019540057117, + "flos": 26392690938240.0, + "grad_norm": 1.656505593412751, + "language_loss": 0.76405656, + "learning_rate": 3.964937007276932e-06, + "loss": 0.786529, + "num_input_tokens_seen": 31283690, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.3125, + "step": 1462, + "time_per_iteration": 2.546812057495117 + }, + { + "auxiliary_loss_clip": 0.01190578, + "auxiliary_loss_mlp": 0.01058183, + "balance_loss_clip": 1.03431702, + "balance_loss_mlp": 1.05753493, + "epoch": 0.08796031865323914, + "flos": 19134031662720.0, + "grad_norm": 2.4277854967530663, + "language_loss": 0.74615479, + "learning_rate": 3.9648643632088634e-06, + "loss": 0.76864231, + "num_input_tokens_seen": 31302505, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.328125, + "step": 1463, + "time_per_iteration": 2.46510648727417 + }, + { + "auxiliary_loss_clip": 0.01189177, + "auxiliary_loss_mlp": 0.0106376, + "balance_loss_clip": 1.03929877, + "balance_loss_mlp": 1.05380559, + "epoch": 0.0880204419059071, + "flos": 26064287867520.0, + "grad_norm": 2.09054267836168, + "language_loss": 0.83423382, + "learning_rate": 3.964791644632941e-06, + "loss": 0.85676318, + "num_input_tokens_seen": 31323070, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.3515625, + "step": 1464, + "time_per_iteration": 2.5343735218048096 + }, + { + "auxiliary_loss_clip": 0.01183588, + "auxiliary_loss_mlp": 0.01069008, + "balance_loss_clip": 1.04659677, + "balance_loss_mlp": 1.05336595, + "epoch": 0.08808056515857508, + "flos": 22377842115840.0, + "grad_norm": 4.267071209901202, + "language_loss": 0.78351951, + "learning_rate": 3.964718851551923e-06, + "loss": 0.80604541, + "num_input_tokens_seen": 31341880, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.296875, + "step": 1465, + "time_per_iteration": 2.4745209217071533 + }, + { + "auxiliary_loss_clip": 0.01190864, + "auxiliary_loss_mlp": 0.01059186, + "balance_loss_clip": 1.0371089, + "balance_loss_mlp": 1.05628061, + "epoch": 0.08814068841124305, + "flos": 23185293897600.0, + "grad_norm": 1.8950228405880263, + "language_loss": 0.84698099, + "learning_rate": 3.9646459839685675e-06, + "loss": 0.86948144, + "num_input_tokens_seen": 31361995, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.34375, + "step": 1466, + "time_per_iteration": 2.4920802116394043 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.0105874, + "balance_loss_clip": 1.03556609, + "balance_loss_mlp": 1.05407715, + "epoch": 0.08820081166391101, + "flos": 25155281358720.0, + "grad_norm": 3.8136580791310783, + "language_loss": 0.84233636, + "learning_rate": 3.964573041885641e-06, + "loss": 0.86477506, + "num_input_tokens_seen": 31381515, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3125, + "step": 1467, + "time_per_iteration": 2.5413413047790527 + }, + { + "auxiliary_loss_clip": 0.01183856, + "auxiliary_loss_mlp": 0.01056021, + "balance_loss_clip": 1.03381276, + "balance_loss_mlp": 1.05462813, + "epoch": 0.08826093491657899, + "flos": 22231685675520.0, + "grad_norm": 1.7481416698073104, + "language_loss": 0.75517243, + "learning_rate": 3.964500025305907e-06, + "loss": 0.7775712, + "num_input_tokens_seen": 31400345, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1468, + "time_per_iteration": 2.496363878250122 + }, + { + "auxiliary_loss_clip": 0.01184448, + "auxiliary_loss_mlp": 0.0105718, + "balance_loss_clip": 1.03623509, + "balance_loss_mlp": 1.05570245, + "epoch": 0.08832105816924696, + "flos": 22126826897280.0, + "grad_norm": 1.7579385887345491, + "language_loss": 0.80601043, + "learning_rate": 3.9644269342321355e-06, + "loss": 0.82842672, + "num_input_tokens_seen": 31419620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2890625, + "step": 1469, + "time_per_iteration": 2.5486512184143066 + }, + { + "auxiliary_loss_clip": 0.01187777, + "auxiliary_loss_mlp": 0.01054482, + "balance_loss_clip": 1.0321182, + "balance_loss_mlp": 1.05454695, + "epoch": 0.08838118142191492, + "flos": 17566495159680.0, + "grad_norm": 3.202810753535508, + "language_loss": 0.77607989, + "learning_rate": 3.9643537686670974e-06, + "loss": 0.7985025, + "num_input_tokens_seen": 31437970, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.3359375, + "step": 1470, + "time_per_iteration": 2.6632297039031982 + }, + { + "auxiliary_loss_clip": 0.01182287, + "auxiliary_loss_mlp": 0.0106647, + "balance_loss_clip": 1.04266429, + "balance_loss_mlp": 1.05412459, + "epoch": 0.0884413046745829, + "flos": 20777196251520.0, + "grad_norm": 1.774803600242038, + "language_loss": 0.84233272, + "learning_rate": 3.964280528613569e-06, + "loss": 0.86482024, + "num_input_tokens_seen": 31457040, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.28125, + "step": 1471, + "time_per_iteration": 2.5040950775146484 + }, + { + "auxiliary_loss_clip": 0.01178062, + "auxiliary_loss_mlp": 0.01052705, + "balance_loss_clip": 1.03247499, + "balance_loss_mlp": 1.05459309, + "epoch": 0.08850142792725087, + "flos": 22125462180480.0, + "grad_norm": 1.6761790638208889, + "language_loss": 0.83481324, + "learning_rate": 3.964207214074324e-06, + "loss": 0.85712093, + "num_input_tokens_seen": 31477520, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.234375, + "step": 1472, + "time_per_iteration": 2.5079073905944824 + }, + { + "auxiliary_loss_clip": 0.01185739, + "auxiliary_loss_mlp": 0.0105882, + "balance_loss_clip": 1.03597999, + "balance_loss_mlp": 1.05491877, + "epoch": 0.08856155117991883, + "flos": 22418744728320.0, + "grad_norm": 2.396127276436556, + "language_loss": 0.828246, + "learning_rate": 3.964133825052146e-06, + "loss": 0.85069156, + "num_input_tokens_seen": 31495575, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.3125, + "step": 1473, + "time_per_iteration": 2.4919679164886475 + }, + { + "auxiliary_loss_clip": 0.01183368, + "auxiliary_loss_mlp": 0.01061525, + "balance_loss_clip": 1.04040098, + "balance_loss_mlp": 1.05414963, + "epoch": 0.0886216744325868, + "flos": 29937002572800.0, + "grad_norm": 1.8346488607114506, + "language_loss": 0.78871369, + "learning_rate": 3.964060361549816e-06, + "loss": 0.81116265, + "num_input_tokens_seen": 31520020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.296875, + "step": 1474, + "time_per_iteration": 2.5806753635406494 + }, + { + "auxiliary_loss_clip": 0.01181812, + "auxiliary_loss_mlp": 0.01057333, + "balance_loss_clip": 1.03413475, + "balance_loss_mlp": 1.05450511, + "epoch": 0.08868179768525478, + "flos": 23982833525760.0, + "grad_norm": 1.918961213895669, + "language_loss": 0.79045832, + "learning_rate": 3.963986823570121e-06, + "loss": 0.81284976, + "num_input_tokens_seen": 31539265, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2734375, + "step": 1475, + "time_per_iteration": 2.495753765106201 + }, + { + "auxiliary_loss_clip": 0.01184034, + "auxiliary_loss_mlp": 0.01048109, + "balance_loss_clip": 1.0258882, + "balance_loss_mlp": 1.05443335, + "epoch": 0.08874192093792274, + "flos": 43177553216640.0, + "grad_norm": 1.6510632676992876, + "language_loss": 0.73973525, + "learning_rate": 3.963913211115848e-06, + "loss": 0.76205671, + "num_input_tokens_seen": 31563425, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.296875, + "step": 1476, + "time_per_iteration": 6.925957679748535 + }, + { + "auxiliary_loss_clip": 0.0118493, + "auxiliary_loss_mlp": 0.01060562, + "balance_loss_clip": 1.03723264, + "balance_loss_mlp": 1.05454326, + "epoch": 0.0888020441905907, + "flos": 32852445868800.0, + "grad_norm": 1.527991814504802, + "language_loss": 0.74644423, + "learning_rate": 3.9638395241897895e-06, + "loss": 0.76889908, + "num_input_tokens_seen": 31584525, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.3046875, + "step": 1477, + "time_per_iteration": 2.6033589839935303 + }, + { + "auxiliary_loss_clip": 0.01181345, + "auxiliary_loss_mlp": 0.01048179, + "balance_loss_clip": 1.02571976, + "balance_loss_mlp": 1.05315852, + "epoch": 0.08886216744325869, + "flos": 23149347361920.0, + "grad_norm": 2.4237564416671002, + "language_loss": 0.86488914, + "learning_rate": 3.963765762794739e-06, + "loss": 0.88718438, + "num_input_tokens_seen": 31603325, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1478, + "time_per_iteration": 2.5188398361206055 + }, + { + "auxiliary_loss_clip": 0.01182629, + "auxiliary_loss_mlp": 0.01057749, + "balance_loss_clip": 1.03599334, + "balance_loss_mlp": 1.05417609, + "epoch": 0.08892229069592665, + "flos": 23331593992320.0, + "grad_norm": 7.715019285918926, + "language_loss": 0.77988106, + "learning_rate": 3.963691926933495e-06, + "loss": 0.80228484, + "num_input_tokens_seen": 31624820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.28125, + "step": 1479, + "time_per_iteration": 2.50730562210083 + }, + { + "auxiliary_loss_clip": 0.01180801, + "auxiliary_loss_mlp": 0.01053517, + "balance_loss_clip": 1.02986622, + "balance_loss_mlp": 1.05275774, + "epoch": 0.08898241394859462, + "flos": 26213784272640.0, + "grad_norm": 2.3628139464189815, + "language_loss": 0.78267598, + "learning_rate": 3.9636180166088555e-06, + "loss": 0.80501914, + "num_input_tokens_seen": 31646080, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1480, + "time_per_iteration": 2.512730360031128 + }, + { + "auxiliary_loss_clip": 0.01185027, + "auxiliary_loss_mlp": 0.01063075, + "balance_loss_clip": 1.03901875, + "balance_loss_mlp": 1.05357075, + "epoch": 0.0890425372012626, + "flos": 23550613171200.0, + "grad_norm": 3.1949876590170825, + "language_loss": 0.66627192, + "learning_rate": 3.963544031823624e-06, + "loss": 0.68875289, + "num_input_tokens_seen": 31665770, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.3125, + "step": 1481, + "time_per_iteration": 2.4874138832092285 + }, + { + "auxiliary_loss_clip": 0.0118244, + "auxiliary_loss_mlp": 0.01051994, + "balance_loss_clip": 1.03040504, + "balance_loss_mlp": 1.05519605, + "epoch": 0.08910266045393056, + "flos": 23002795872000.0, + "grad_norm": 1.9560930463008703, + "language_loss": 0.9644348, + "learning_rate": 3.9634699725806065e-06, + "loss": 0.98677909, + "num_input_tokens_seen": 31683805, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.2734375, + "step": 1482, + "time_per_iteration": 2.484274387359619 + }, + { + "auxiliary_loss_clip": 0.01190541, + "auxiliary_loss_mlp": 0.01055727, + "balance_loss_clip": 1.03306508, + "balance_loss_mlp": 1.0577234, + "epoch": 0.08916278370659853, + "flos": 31936508035200.0, + "grad_norm": 2.358614174414972, + "language_loss": 0.78436875, + "learning_rate": 3.96339583888261e-06, + "loss": 0.80683142, + "num_input_tokens_seen": 31704630, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.328125, + "step": 1483, + "time_per_iteration": 2.566199779510498 + }, + { + "auxiliary_loss_clip": 0.01183147, + "auxiliary_loss_mlp": 0.01072522, + "balance_loss_clip": 1.04891825, + "balance_loss_mlp": 1.05463076, + "epoch": 0.08922290695926649, + "flos": 17530404969600.0, + "grad_norm": 2.232834813834399, + "language_loss": 0.86091626, + "learning_rate": 3.963321630732448e-06, + "loss": 0.88347292, + "num_input_tokens_seen": 31723255, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1484, + "time_per_iteration": 2.4742467403411865 + }, + { + "auxiliary_loss_clip": 0.01190947, + "auxiliary_loss_mlp": 0.01064822, + "balance_loss_clip": 1.04152799, + "balance_loss_mlp": 1.0570302, + "epoch": 0.08928303021193447, + "flos": 32125075459200.0, + "grad_norm": 1.7135103732453094, + "language_loss": 0.80460989, + "learning_rate": 3.963247348132932e-06, + "loss": 0.82716757, + "num_input_tokens_seen": 31747045, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.34375, + "step": 1485, + "time_per_iteration": 2.5808591842651367 + }, + { + "auxiliary_loss_clip": 0.01182644, + "auxiliary_loss_mlp": 0.01059654, + "balance_loss_clip": 1.03663421, + "balance_loss_mlp": 1.05256486, + "epoch": 0.08934315346460243, + "flos": 22125210785280.0, + "grad_norm": 2.0833446931013144, + "language_loss": 0.8295821, + "learning_rate": 3.96317299108688e-06, + "loss": 0.852005, + "num_input_tokens_seen": 31766615, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.296875, + "step": 1486, + "time_per_iteration": 2.5060923099517822 + }, + { + "auxiliary_loss_clip": 0.01184012, + "auxiliary_loss_mlp": 0.01060171, + "balance_loss_clip": 1.03749752, + "balance_loss_mlp": 1.05506349, + "epoch": 0.0894032767172704, + "flos": 22565583527040.0, + "grad_norm": 1.6673763915473876, + "language_loss": 0.76653707, + "learning_rate": 3.963098559597111e-06, + "loss": 0.78897893, + "num_input_tokens_seen": 31785855, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.296875, + "step": 1487, + "time_per_iteration": 2.4968059062957764 + }, + { + "auxiliary_loss_clip": 0.01181982, + "auxiliary_loss_mlp": 0.01063322, + "balance_loss_clip": 1.03908658, + "balance_loss_mlp": 1.05203557, + "epoch": 0.08946339996993838, + "flos": 20193396503040.0, + "grad_norm": 2.360836711926668, + "language_loss": 0.83246535, + "learning_rate": 3.963024053666449e-06, + "loss": 0.85491836, + "num_input_tokens_seen": 31804210, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.296875, + "step": 1488, + "time_per_iteration": 2.48189377784729 + }, + { + "auxiliary_loss_clip": 0.01180173, + "auxiliary_loss_mlp": 0.01051663, + "balance_loss_clip": 1.03020549, + "balance_loss_mlp": 1.05375743, + "epoch": 0.08952352322260634, + "flos": 48360181104000.0, + "grad_norm": 1.9508187836998312, + "language_loss": 0.71647823, + "learning_rate": 3.962949473297718e-06, + "loss": 0.73879659, + "num_input_tokens_seen": 31826150, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.265625, + "step": 1489, + "time_per_iteration": 2.701035737991333 + }, + { + "auxiliary_loss_clip": 0.01178824, + "auxiliary_loss_mlp": 0.01053682, + "balance_loss_clip": 1.03087783, + "balance_loss_mlp": 1.05088401, + "epoch": 0.08958364647527431, + "flos": 31793081028480.0, + "grad_norm": 1.8144641128553483, + "language_loss": 0.89490288, + "learning_rate": 3.962874818493745e-06, + "loss": 0.91722786, + "num_input_tokens_seen": 31848060, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1490, + "time_per_iteration": 2.676098108291626 + }, + { + "auxiliary_loss_clip": 0.01187914, + "auxiliary_loss_mlp": 0.01064376, + "balance_loss_clip": 1.0416671, + "balance_loss_mlp": 1.05264366, + "epoch": 0.08964376972794229, + "flos": 23368186972800.0, + "grad_norm": 2.165908760559946, + "language_loss": 0.73276365, + "learning_rate": 3.9628000892573635e-06, + "loss": 0.75528657, + "num_input_tokens_seen": 31870040, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.3515625, + "step": 1491, + "time_per_iteration": 2.5531163215637207 + }, + { + "auxiliary_loss_clip": 0.01181575, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.02984166, + "balance_loss_mlp": 1.05362582, + "epoch": 0.08970389298061025, + "flos": 23294785530240.0, + "grad_norm": 1.6884120279290091, + "language_loss": 0.77121007, + "learning_rate": 3.9627252855914055e-06, + "loss": 0.79353207, + "num_input_tokens_seen": 31890400, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.28125, + "step": 1492, + "time_per_iteration": 2.485531806945801 + }, + { + "auxiliary_loss_clip": 0.01180742, + "auxiliary_loss_mlp": 0.01055458, + "balance_loss_clip": 1.03324914, + "balance_loss_mlp": 1.05471706, + "epoch": 0.08976401623327822, + "flos": 33761703772800.0, + "grad_norm": 2.0059524225222414, + "language_loss": 0.71168351, + "learning_rate": 3.962650407498707e-06, + "loss": 0.73404551, + "num_input_tokens_seen": 31913435, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2578125, + "step": 1493, + "time_per_iteration": 2.5819149017333984 + }, + { + "auxiliary_loss_clip": 0.01184961, + "auxiliary_loss_mlp": 0.01056172, + "balance_loss_clip": 1.03304577, + "balance_loss_mlp": 1.05477107, + "epoch": 0.08982413948594618, + "flos": 23911335504000.0, + "grad_norm": 1.7443337417031568, + "language_loss": 0.86910093, + "learning_rate": 3.962575454982109e-06, + "loss": 0.89151227, + "num_input_tokens_seen": 31932435, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.3046875, + "step": 1494, + "time_per_iteration": 2.491126775741577 + }, + { + "auxiliary_loss_clip": 0.01180854, + "auxiliary_loss_mlp": 0.01064445, + "balance_loss_clip": 1.04080594, + "balance_loss_mlp": 1.05289626, + "epoch": 0.08988426273861416, + "flos": 16837544551680.0, + "grad_norm": 1.7176751495851263, + "language_loss": 0.83065581, + "learning_rate": 3.962500428044454e-06, + "loss": 0.85310876, + "num_input_tokens_seen": 31950125, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.28125, + "step": 1495, + "time_per_iteration": 2.463747501373291 + }, + { + "auxiliary_loss_clip": 0.01187726, + "auxiliary_loss_mlp": 0.01056179, + "balance_loss_clip": 1.03410196, + "balance_loss_mlp": 1.05825078, + "epoch": 0.08994438599128213, + "flos": 14793365548800.0, + "grad_norm": 1.861203767183833, + "language_loss": 0.69813877, + "learning_rate": 3.962425326688585e-06, + "loss": 0.72057784, + "num_input_tokens_seen": 31968050, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.296875, + "step": 1496, + "time_per_iteration": 2.4409985542297363 + }, + { + "auxiliary_loss_clip": 0.01180533, + "auxiliary_loss_mlp": 0.01051241, + "balance_loss_clip": 1.03035557, + "balance_loss_mlp": 1.05325341, + "epoch": 0.09000450924395009, + "flos": 17384320356480.0, + "grad_norm": 1.6091347390483586, + "language_loss": 0.79913563, + "learning_rate": 3.962350150917351e-06, + "loss": 0.82145333, + "num_input_tokens_seen": 31985675, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2734375, + "step": 1497, + "time_per_iteration": 2.492732048034668 + }, + { + "auxiliary_loss_clip": 0.01186533, + "auxiliary_loss_mlp": 0.0105809, + "balance_loss_clip": 1.03484416, + "balance_loss_mlp": 1.05299318, + "epoch": 0.09006463249661807, + "flos": 24280317964800.0, + "grad_norm": 2.3611651581227915, + "language_loss": 0.8262192, + "learning_rate": 3.9622749007336035e-06, + "loss": 0.84866548, + "num_input_tokens_seen": 32005180, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.3359375, + "step": 1498, + "time_per_iteration": 2.492124080657959 + }, + { + "auxiliary_loss_clip": 0.01188542, + "auxiliary_loss_mlp": 0.01061597, + "balance_loss_clip": 1.0402112, + "balance_loss_mlp": 1.05628157, + "epoch": 0.09012475574928604, + "flos": 13661928069120.0, + "grad_norm": 2.316244908481527, + "language_loss": 0.7849865, + "learning_rate": 3.962199576140195e-06, + "loss": 0.80748791, + "num_input_tokens_seen": 32022970, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.3203125, + "step": 1499, + "time_per_iteration": 2.455986738204956 + }, + { + "auxiliary_loss_clip": 0.0117942, + "auxiliary_loss_mlp": 0.01055125, + "balance_loss_clip": 1.03348815, + "balance_loss_mlp": 1.05351877, + "epoch": 0.090184879001954, + "flos": 23327751237120.0, + "grad_norm": 1.652937184766999, + "language_loss": 0.93453979, + "learning_rate": 3.962124177139981e-06, + "loss": 0.95688522, + "num_input_tokens_seen": 32043055, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1500, + "time_per_iteration": 2.481450080871582 + }, + { + "auxiliary_loss_clip": 0.01182931, + "auxiliary_loss_mlp": 0.01050934, + "balance_loss_clip": 1.0268302, + "balance_loss_mlp": 1.05170345, + "epoch": 0.09024500225462198, + "flos": 23002688131200.0, + "grad_norm": 2.9257189866461966, + "language_loss": 0.74465239, + "learning_rate": 3.962048703735822e-06, + "loss": 0.76699102, + "num_input_tokens_seen": 32061900, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3125, + "step": 1501, + "time_per_iteration": 2.4806344509124756 + }, + { + "auxiliary_loss_clip": 0.01077215, + "auxiliary_loss_mlp": 0.01003378, + "balance_loss_clip": 0.99992049, + "balance_loss_mlp": 1.02834833, + "epoch": 0.09030512550728995, + "flos": 62189203242240.0, + "grad_norm": 0.7322723529864947, + "language_loss": 0.58304042, + "learning_rate": 3.96197315593058e-06, + "loss": 0.60384637, + "num_input_tokens_seen": 32122745, + "router_z_loss_clip": 0.03466797, + "router_z_loss_mlp": 0.48828125, + "step": 1502, + "time_per_iteration": 3.066755771636963 + }, + { + "auxiliary_loss_clip": 0.01178455, + "auxiliary_loss_mlp": 0.01047829, + "balance_loss_clip": 1.02655029, + "balance_loss_mlp": 1.05134845, + "epoch": 0.09036524875995791, + "flos": 38800689171840.0, + "grad_norm": 2.407651446444188, + "language_loss": 0.69502187, + "learning_rate": 3.961897533727119e-06, + "loss": 0.71728474, + "num_input_tokens_seen": 32145125, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2734375, + "step": 1503, + "time_per_iteration": 2.608006000518799 + }, + { + "auxiliary_loss_clip": 0.01180913, + "auxiliary_loss_mlp": 0.01054911, + "balance_loss_clip": 1.03346539, + "balance_loss_mlp": 1.0508244, + "epoch": 0.09042537201262588, + "flos": 21690081429120.0, + "grad_norm": 2.015182939383952, + "language_loss": 0.86142361, + "learning_rate": 3.961821837128306e-06, + "loss": 0.88378185, + "num_input_tokens_seen": 32166255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.3046875, + "step": 1504, + "time_per_iteration": 2.489906072616577 + }, + { + "auxiliary_loss_clip": 0.01188306, + "auxiliary_loss_mlp": 0.01064134, + "balance_loss_clip": 1.03871906, + "balance_loss_mlp": 1.05330658, + "epoch": 0.09048549526529386, + "flos": 22267021680000.0, + "grad_norm": 1.9466916160800904, + "language_loss": 0.72267938, + "learning_rate": 3.961746066137014e-06, + "loss": 0.74520379, + "num_input_tokens_seen": 32184010, + "router_z_loss_clip": 0.25390625, + "router_z_loss_mlp": 1.3515625, + "step": 1505, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01179818, + "auxiliary_loss_mlp": 0.01054589, + "balance_loss_clip": 1.03203487, + "balance_loss_mlp": 1.05332816, + "epoch": 0.09054561851796182, + "flos": 14610939350400.0, + "grad_norm": 2.3726339000283447, + "language_loss": 0.80946511, + "learning_rate": 3.961670220756114e-06, + "loss": 0.83180916, + "num_input_tokens_seen": 32201635, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.265625, + "step": 1506, + "time_per_iteration": 2.4512932300567627 + }, + { + "auxiliary_loss_clip": 0.01179114, + "auxiliary_loss_mlp": 0.01049611, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.0531404, + "epoch": 0.09060574177062979, + "flos": 27636169916160.0, + "grad_norm": 2.1533698580433254, + "language_loss": 0.76043189, + "learning_rate": 3.961594300988482e-06, + "loss": 0.78271914, + "num_input_tokens_seen": 32221940, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.2578125, + "step": 1507, + "time_per_iteration": 2.5277152061462402 + }, + { + "auxiliary_loss_clip": 0.01067186, + "auxiliary_loss_mlp": 0.01009923, + "balance_loss_clip": 1.00679994, + "balance_loss_mlp": 1.01922798, + "epoch": 0.09066586502329776, + "flos": 66085797513600.0, + "grad_norm": 0.7312512202665958, + "language_loss": 0.57670546, + "learning_rate": 3.961518306836998e-06, + "loss": 0.59747648, + "num_input_tokens_seen": 32276495, + "router_z_loss_clip": 0.03112793, + "router_z_loss_mlp": 0.48046875, + "step": 1508, + "time_per_iteration": 2.9330992698669434 + }, + { + "auxiliary_loss_clip": 0.01182207, + "auxiliary_loss_mlp": 0.01052694, + "balance_loss_clip": 1.0313319, + "balance_loss_mlp": 1.05309892, + "epoch": 0.09072598827596573, + "flos": 18916449027840.0, + "grad_norm": 2.072562238387217, + "language_loss": 0.85046542, + "learning_rate": 3.961442238304543e-06, + "loss": 0.87281442, + "num_input_tokens_seen": 32294130, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2890625, + "step": 1509, + "time_per_iteration": 2.475606918334961 + }, + { + "auxiliary_loss_clip": 0.01189974, + "auxiliary_loss_mlp": 0.01065674, + "balance_loss_clip": 1.04158139, + "balance_loss_mlp": 1.05606115, + "epoch": 0.0907861115286337, + "flos": 24821742643200.0, + "grad_norm": 2.413703760690829, + "language_loss": 0.84302551, + "learning_rate": 3.961366095394002e-06, + "loss": 0.86558187, + "num_input_tokens_seen": 32313555, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.3359375, + "step": 1510, + "time_per_iteration": 2.576070785522461 + }, + { + "auxiliary_loss_clip": 0.01184002, + "auxiliary_loss_mlp": 0.01055545, + "balance_loss_clip": 1.0335387, + "balance_loss_mlp": 1.05408144, + "epoch": 0.09084623478130167, + "flos": 21652842003840.0, + "grad_norm": 1.9204492801986277, + "language_loss": 0.85558611, + "learning_rate": 3.961289878108262e-06, + "loss": 0.8779816, + "num_input_tokens_seen": 32331430, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.296875, + "step": 1511, + "time_per_iteration": 2.5085484981536865 + }, + { + "auxiliary_loss_clip": 0.01181957, + "auxiliary_loss_mlp": 0.01048579, + "balance_loss_clip": 1.02690685, + "balance_loss_mlp": 1.05469918, + "epoch": 0.09090635803396964, + "flos": 27639258485760.0, + "grad_norm": 1.5775523407684693, + "language_loss": 0.84897017, + "learning_rate": 3.9612135864502135e-06, + "loss": 0.87127548, + "num_input_tokens_seen": 32353705, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2734375, + "step": 1512, + "time_per_iteration": 2.515565872192383 + }, + { + "auxiliary_loss_clip": 0.01175178, + "auxiliary_loss_mlp": 0.01049482, + "balance_loss_clip": 1.02888274, + "balance_loss_mlp": 1.05033123, + "epoch": 0.0909664812866376, + "flos": 17669127294720.0, + "grad_norm": 2.9006324958480167, + "language_loss": 0.86704344, + "learning_rate": 3.961137220422749e-06, + "loss": 0.88929009, + "num_input_tokens_seen": 32370520, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.25, + "step": 1513, + "time_per_iteration": 2.475271701812744 + }, + { + "auxiliary_loss_clip": 0.01180699, + "auxiliary_loss_mlp": 0.01051925, + "balance_loss_clip": 1.03170729, + "balance_loss_mlp": 1.0536902, + "epoch": 0.09102660453930557, + "flos": 23951448017280.0, + "grad_norm": 1.6716164971548293, + "language_loss": 0.86379707, + "learning_rate": 3.961060780028764e-06, + "loss": 0.8861233, + "num_input_tokens_seen": 32389105, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.2734375, + "step": 1514, + "time_per_iteration": 2.5317347049713135 + }, + { + "auxiliary_loss_clip": 0.0118192, + "auxiliary_loss_mlp": 0.01060131, + "balance_loss_clip": 1.03991365, + "balance_loss_mlp": 1.05550981, + "epoch": 0.09108672779197355, + "flos": 25812949426560.0, + "grad_norm": 1.9279276264910965, + "language_loss": 0.89882755, + "learning_rate": 3.960984265271159e-06, + "loss": 0.92124808, + "num_input_tokens_seen": 32408065, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.265625, + "step": 1515, + "time_per_iteration": 2.5507757663726807 + }, + { + "auxiliary_loss_clip": 0.011822, + "auxiliary_loss_mlp": 0.0105444, + "balance_loss_clip": 1.03174293, + "balance_loss_mlp": 1.05321527, + "epoch": 0.09114685104464151, + "flos": 29639482220160.0, + "grad_norm": 2.0145121179505905, + "language_loss": 0.85567206, + "learning_rate": 3.9609076761528335e-06, + "loss": 0.87803847, + "num_input_tokens_seen": 32427225, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.28125, + "step": 1516, + "time_per_iteration": 2.524787425994873 + }, + { + "auxiliary_loss_clip": 0.01182997, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03130913, + "balance_loss_mlp": 1.05217946, + "epoch": 0.09120697429730948, + "flos": 33729635905920.0, + "grad_norm": 1.5232376391767188, + "language_loss": 0.81104374, + "learning_rate": 3.960831012676692e-06, + "loss": 0.83340514, + "num_input_tokens_seen": 32450510, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.3125, + "step": 1517, + "time_per_iteration": 2.5781173706054688 + }, + { + "auxiliary_loss_clip": 0.01185126, + "auxiliary_loss_mlp": 0.01068952, + "balance_loss_clip": 1.04729199, + "balance_loss_mlp": 1.05378699, + "epoch": 0.09126709754997746, + "flos": 18401381953920.0, + "grad_norm": 1.6026665805728266, + "language_loss": 0.78008473, + "learning_rate": 3.960754274845642e-06, + "loss": 0.80262554, + "num_input_tokens_seen": 32468425, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.3125, + "step": 1518, + "time_per_iteration": 4.000938653945923 + }, + { + "auxiliary_loss_clip": 0.01179619, + "auxiliary_loss_mlp": 0.01060053, + "balance_loss_clip": 1.03851235, + "balance_loss_mlp": 1.05189955, + "epoch": 0.09132722080264542, + "flos": 22091957769600.0, + "grad_norm": 1.883609624415087, + "language_loss": 0.86375809, + "learning_rate": 3.960677462662594e-06, + "loss": 0.88615477, + "num_input_tokens_seen": 32487510, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.28125, + "step": 1519, + "time_per_iteration": 3.945183277130127 + }, + { + "auxiliary_loss_clip": 0.01180521, + "auxiliary_loss_mlp": 0.01053198, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.05196333, + "epoch": 0.09138734405531339, + "flos": 21033131633280.0, + "grad_norm": 2.4149150298084425, + "language_loss": 0.73425877, + "learning_rate": 3.96060057613046e-06, + "loss": 0.75659597, + "num_input_tokens_seen": 32507250, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.28125, + "step": 1520, + "time_per_iteration": 2.470977306365967 + }, + { + "auxiliary_loss_clip": 0.01181506, + "auxiliary_loss_mlp": 0.01055081, + "balance_loss_clip": 1.03299177, + "balance_loss_mlp": 1.0525614, + "epoch": 0.09144746730798137, + "flos": 20083940784000.0, + "grad_norm": 2.6960755220153825, + "language_loss": 0.85296613, + "learning_rate": 3.960523615252156e-06, + "loss": 0.87533194, + "num_input_tokens_seen": 32526045, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2890625, + "step": 1521, + "time_per_iteration": 2.478440761566162 + }, + { + "auxiliary_loss_clip": 0.01183058, + "auxiliary_loss_mlp": 0.01057495, + "balance_loss_clip": 1.034917, + "balance_loss_mlp": 1.05319118, + "epoch": 0.09150759056064933, + "flos": 22778210085120.0, + "grad_norm": 2.1543470058122876, + "language_loss": 0.83979875, + "learning_rate": 3.960446580030599e-06, + "loss": 0.86220425, + "num_input_tokens_seen": 32546575, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.296875, + "step": 1522, + "time_per_iteration": 2.4761834144592285 + }, + { + "auxiliary_loss_clip": 0.01174804, + "auxiliary_loss_mlp": 0.01057417, + "balance_loss_clip": 1.03500533, + "balance_loss_mlp": 1.05125594, + "epoch": 0.0915677138133173, + "flos": 27564205017600.0, + "grad_norm": 2.174137545904809, + "language_loss": 0.810691, + "learning_rate": 3.960369470468711e-06, + "loss": 0.83301324, + "num_input_tokens_seen": 32568795, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.234375, + "step": 1523, + "time_per_iteration": 2.525385618209839 + }, + { + "auxiliary_loss_clip": 0.01182998, + "auxiliary_loss_mlp": 0.01063543, + "balance_loss_clip": 1.0426811, + "balance_loss_mlp": 1.05365944, + "epoch": 0.09162783706598528, + "flos": 17674765729920.0, + "grad_norm": 2.529065997296093, + "language_loss": 0.74591744, + "learning_rate": 3.960292286569418e-06, + "loss": 0.76838291, + "num_input_tokens_seen": 32587010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.296875, + "step": 1524, + "time_per_iteration": 2.4293112754821777 + }, + { + "auxiliary_loss_clip": 0.01181121, + "auxiliary_loss_mlp": 0.01060116, + "balance_loss_clip": 1.03822935, + "balance_loss_mlp": 1.05373263, + "epoch": 0.09168796031865324, + "flos": 18478195188480.0, + "grad_norm": 2.0870290485059586, + "language_loss": 0.861516, + "learning_rate": 3.960215028335644e-06, + "loss": 0.88392842, + "num_input_tokens_seen": 32602375, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1525, + "time_per_iteration": 2.449774980545044 + }, + { + "auxiliary_loss_clip": 0.01181752, + "auxiliary_loss_mlp": 0.01047765, + "balance_loss_clip": 1.02577078, + "balance_loss_mlp": 1.05424511, + "epoch": 0.0917480835713212, + "flos": 29387605075200.0, + "grad_norm": 2.3600448138049597, + "language_loss": 0.74690467, + "learning_rate": 3.96013769577032e-06, + "loss": 0.76919985, + "num_input_tokens_seen": 32621460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1526, + "time_per_iteration": 2.5295088291168213 + }, + { + "auxiliary_loss_clip": 0.01177679, + "auxiliary_loss_mlp": 0.01052164, + "balance_loss_clip": 1.03058743, + "balance_loss_mlp": 1.05291057, + "epoch": 0.09180820682398917, + "flos": 19829262378240.0, + "grad_norm": 1.970734062299861, + "language_loss": 0.7736311, + "learning_rate": 3.960060288876378e-06, + "loss": 0.79592943, + "num_input_tokens_seen": 32640440, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1527, + "time_per_iteration": 2.465484142303467 + }, + { + "auxiliary_loss_clip": 0.01179355, + "auxiliary_loss_mlp": 0.01053495, + "balance_loss_clip": 1.03064227, + "balance_loss_mlp": 1.05090261, + "epoch": 0.09186833007665715, + "flos": 23841848643840.0, + "grad_norm": 1.9755082573034908, + "language_loss": 0.78465801, + "learning_rate": 3.959982807656753e-06, + "loss": 0.80698651, + "num_input_tokens_seen": 32660020, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.28125, + "step": 1528, + "time_per_iteration": 2.5257718563079834 + }, + { + "auxiliary_loss_clip": 0.01177926, + "auxiliary_loss_mlp": 0.01048723, + "balance_loss_clip": 1.0276351, + "balance_loss_mlp": 1.05085492, + "epoch": 0.09192845332932512, + "flos": 12932726065920.0, + "grad_norm": 2.6736868569465813, + "language_loss": 0.76880527, + "learning_rate": 3.959905252114384e-06, + "loss": 0.79107177, + "num_input_tokens_seen": 32678170, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2734375, + "step": 1529, + "time_per_iteration": 2.4417288303375244 + }, + { + "auxiliary_loss_clip": 0.01180418, + "auxiliary_loss_mlp": 0.01053431, + "balance_loss_clip": 1.0306139, + "balance_loss_mlp": 1.05037212, + "epoch": 0.09198857658199308, + "flos": 24568177559040.0, + "grad_norm": 1.767002219307874, + "language_loss": 0.83118784, + "learning_rate": 3.959827622252211e-06, + "loss": 0.85352623, + "num_input_tokens_seen": 32697540, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.296875, + "step": 1530, + "time_per_iteration": 2.53367018699646 + }, + { + "auxiliary_loss_clip": 0.01173477, + "auxiliary_loss_mlp": 0.01059229, + "balance_loss_clip": 1.03723454, + "balance_loss_mlp": 1.05024123, + "epoch": 0.09204869983466106, + "flos": 20266941600000.0, + "grad_norm": 2.058190265763826, + "language_loss": 0.8408612, + "learning_rate": 3.959749918073179e-06, + "loss": 0.86318833, + "num_input_tokens_seen": 32716805, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1531, + "time_per_iteration": 2.4784743785858154 + }, + { + "auxiliary_loss_clip": 0.01177383, + "auxiliary_loss_mlp": 0.01048961, + "balance_loss_clip": 1.02728868, + "balance_loss_mlp": 1.05083799, + "epoch": 0.09210882308732903, + "flos": 20885646389760.0, + "grad_norm": 1.8347699676368683, + "language_loss": 0.81135088, + "learning_rate": 3.959672139580233e-06, + "loss": 0.83361435, + "num_input_tokens_seen": 32736385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1532, + "time_per_iteration": 2.506875991821289 + }, + { + "auxiliary_loss_clip": 0.01179012, + "auxiliary_loss_mlp": 0.01052948, + "balance_loss_clip": 1.03044105, + "balance_loss_mlp": 1.05169332, + "epoch": 0.09216894633999699, + "flos": 30956326727040.0, + "grad_norm": 1.8650949584676202, + "language_loss": 0.83489287, + "learning_rate": 3.9595942867763235e-06, + "loss": 0.85721242, + "num_input_tokens_seen": 32757140, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2734375, + "step": 1533, + "time_per_iteration": 2.5279369354248047 + }, + { + "auxiliary_loss_clip": 0.01181754, + "auxiliary_loss_mlp": 0.01048559, + "balance_loss_clip": 1.02662432, + "balance_loss_mlp": 1.05468941, + "epoch": 0.09222906959266497, + "flos": 13151565676800.0, + "grad_norm": 1.8226281566677605, + "language_loss": 0.89789164, + "learning_rate": 3.959516359664402e-06, + "loss": 0.92019475, + "num_input_tokens_seen": 32774860, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1534, + "time_per_iteration": 2.498732089996338 + }, + { + "auxiliary_loss_clip": 0.01178154, + "auxiliary_loss_mlp": 0.01064045, + "balance_loss_clip": 1.03958321, + "balance_loss_mlp": 1.04994035, + "epoch": 0.09228919284533293, + "flos": 25994477784960.0, + "grad_norm": 2.6410414613778777, + "language_loss": 0.75911283, + "learning_rate": 3.959438358247424e-06, + "loss": 0.78153479, + "num_input_tokens_seen": 32795250, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.28125, + "step": 1535, + "time_per_iteration": 2.5389468669891357 + }, + { + "auxiliary_loss_clip": 0.01170543, + "auxiliary_loss_mlp": 0.01043965, + "balance_loss_clip": 1.02404571, + "balance_loss_mlp": 1.04907823, + "epoch": 0.0923493160980009, + "flos": 18660800954880.0, + "grad_norm": 1.8388387816947327, + "language_loss": 0.81344318, + "learning_rate": 3.959360282528346e-06, + "loss": 0.83558822, + "num_input_tokens_seen": 32813805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1536, + "time_per_iteration": 2.5075631141662598 + }, + { + "auxiliary_loss_clip": 0.01173873, + "auxiliary_loss_mlp": 0.01051939, + "balance_loss_clip": 1.0312202, + "balance_loss_mlp": 1.04995418, + "epoch": 0.09240943935066886, + "flos": 21140576190720.0, + "grad_norm": 2.109198419692537, + "language_loss": 0.8921392, + "learning_rate": 3.959282132510131e-06, + "loss": 0.91439736, + "num_input_tokens_seen": 32830960, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1537, + "time_per_iteration": 2.4454562664031982 + }, + { + "auxiliary_loss_clip": 0.01177438, + "auxiliary_loss_mlp": 0.01059104, + "balance_loss_clip": 1.03638315, + "balance_loss_mlp": 1.05164456, + "epoch": 0.09246956260333684, + "flos": 20592435669120.0, + "grad_norm": 2.1959440535625285, + "language_loss": 0.8072964, + "learning_rate": 3.959203908195741e-06, + "loss": 0.82966185, + "num_input_tokens_seen": 32848275, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2578125, + "step": 1538, + "time_per_iteration": 2.50838303565979 + }, + { + "auxiliary_loss_clip": 0.01066028, + "auxiliary_loss_mlp": 0.0101212, + "balance_loss_clip": 1.0091517, + "balance_loss_mlp": 1.01794529, + "epoch": 0.09252968585600481, + "flos": 67558710614400.0, + "grad_norm": 0.74443800558722, + "language_loss": 0.57375526, + "learning_rate": 3.959125609588142e-06, + "loss": 0.59453678, + "num_input_tokens_seen": 32917730, + "router_z_loss_clip": 0.02966309, + "router_z_loss_mlp": 0.48046875, + "step": 1539, + "time_per_iteration": 3.16038179397583 + }, + { + "auxiliary_loss_clip": 0.01179737, + "auxiliary_loss_mlp": 0.01051392, + "balance_loss_clip": 1.02958906, + "balance_loss_mlp": 1.05291581, + "epoch": 0.09258980910867277, + "flos": 17383853479680.0, + "grad_norm": 2.903908071477431, + "language_loss": 0.67164814, + "learning_rate": 3.959047236690304e-06, + "loss": 0.69395947, + "num_input_tokens_seen": 32934910, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.265625, + "step": 1540, + "time_per_iteration": 2.488809585571289 + }, + { + "auxiliary_loss_clip": 0.01178592, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02154827, + "balance_loss_mlp": 1.05285096, + "epoch": 0.09264993236134075, + "flos": 19865927185920.0, + "grad_norm": 1.797248436862791, + "language_loss": 0.83666921, + "learning_rate": 3.958968789505198e-06, + "loss": 0.85888791, + "num_input_tokens_seen": 32953840, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1541, + "time_per_iteration": 2.5406758785247803 + }, + { + "auxiliary_loss_clip": 0.01060695, + "auxiliary_loss_mlp": 0.01009011, + "balance_loss_clip": 1.0061146, + "balance_loss_mlp": 1.01339245, + "epoch": 0.09271005561400872, + "flos": 62284401262080.0, + "grad_norm": 0.8904869203130611, + "language_loss": 0.6196329, + "learning_rate": 3.9588902680358e-06, + "loss": 0.64032996, + "num_input_tokens_seen": 33011410, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.47265625, + "step": 1542, + "time_per_iteration": 3.0973262786865234 + }, + { + "auxiliary_loss_clip": 0.01178215, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03486192, + "balance_loss_mlp": 1.05283189, + "epoch": 0.09277017886667668, + "flos": 23329870139520.0, + "grad_norm": 1.711071573157868, + "language_loss": 0.82672381, + "learning_rate": 3.958811672285086e-06, + "loss": 0.84905624, + "num_input_tokens_seen": 33031675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.25, + "step": 1543, + "time_per_iteration": 2.489415168762207 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01055984, + "balance_loss_clip": 1.03462195, + "balance_loss_mlp": 1.05128777, + "epoch": 0.09283030211934466, + "flos": 54745169875200.0, + "grad_norm": 1.6169278883375504, + "language_loss": 0.72058821, + "learning_rate": 3.958733002256038e-06, + "loss": 0.74287981, + "num_input_tokens_seen": 33056355, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1544, + "time_per_iteration": 2.7986748218536377 + }, + { + "auxiliary_loss_clip": 0.01177062, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0257864, + "balance_loss_mlp": 1.05111873, + "epoch": 0.09289042537201263, + "flos": 30334784762880.0, + "grad_norm": 1.7012123784712243, + "language_loss": 0.77617419, + "learning_rate": 3.958654257951637e-06, + "loss": 0.79842126, + "num_input_tokens_seen": 33079520, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1545, + "time_per_iteration": 2.5673069953918457 + }, + { + "auxiliary_loss_clip": 0.01173726, + "auxiliary_loss_mlp": 0.01050414, + "balance_loss_clip": 1.029338, + "balance_loss_mlp": 1.0525856, + "epoch": 0.09295054862468059, + "flos": 17746838369280.0, + "grad_norm": 2.736353511607615, + "language_loss": 0.74531418, + "learning_rate": 3.9585754393748706e-06, + "loss": 0.76755565, + "num_input_tokens_seen": 33096135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1546, + "time_per_iteration": 2.456806182861328 + }, + { + "auxiliary_loss_clip": 0.01180806, + "auxiliary_loss_mlp": 0.01051708, + "balance_loss_clip": 1.02968979, + "balance_loss_mlp": 1.05292201, + "epoch": 0.09301067187734856, + "flos": 23658021815040.0, + "grad_norm": 2.1086065935537284, + "language_loss": 0.84392273, + "learning_rate": 3.9584965465287275e-06, + "loss": 0.86624783, + "num_input_tokens_seen": 33115245, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1547, + "time_per_iteration": 2.5041439533233643 + }, + { + "auxiliary_loss_clip": 0.01177454, + "auxiliary_loss_mlp": 0.01053084, + "balance_loss_clip": 1.03136444, + "balance_loss_mlp": 1.05125856, + "epoch": 0.09307079513001654, + "flos": 27527719777920.0, + "grad_norm": 7.120670718523448, + "language_loss": 0.67616034, + "learning_rate": 3.958417579416199e-06, + "loss": 0.6984657, + "num_input_tokens_seen": 33136640, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2578125, + "step": 1548, + "time_per_iteration": 2.513141393661499 + }, + { + "auxiliary_loss_clip": 0.01178735, + "auxiliary_loss_mlp": 0.01053, + "balance_loss_clip": 1.03083944, + "balance_loss_mlp": 1.05175209, + "epoch": 0.0931309183826845, + "flos": 20627340710400.0, + "grad_norm": 2.761700755369037, + "language_loss": 0.83445251, + "learning_rate": 3.9583385380402795e-06, + "loss": 0.85676992, + "num_input_tokens_seen": 33155060, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.265625, + "step": 1549, + "time_per_iteration": 2.4822285175323486 + }, + { + "auxiliary_loss_clip": 0.01181659, + "auxiliary_loss_mlp": 0.01043887, + "balance_loss_clip": 1.02312112, + "balance_loss_mlp": 1.05560291, + "epoch": 0.09319104163535247, + "flos": 29020921084800.0, + "grad_norm": 1.7822943519837542, + "language_loss": 0.75744081, + "learning_rate": 3.958259422403966e-06, + "loss": 0.77969635, + "num_input_tokens_seen": 33175420, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2578125, + "step": 1550, + "time_per_iteration": 2.5503265857696533 + }, + { + "auxiliary_loss_clip": 0.01179426, + "auxiliary_loss_mlp": 0.01069184, + "balance_loss_clip": 1.04579496, + "balance_loss_mlp": 1.05118561, + "epoch": 0.09325116488802045, + "flos": 25301545539840.0, + "grad_norm": 2.0184762942100876, + "language_loss": 0.83272278, + "learning_rate": 3.95818023251026e-06, + "loss": 0.85520893, + "num_input_tokens_seen": 33194120, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.28125, + "step": 1551, + "time_per_iteration": 2.4962081909179688 + }, + { + "auxiliary_loss_clip": 0.01060634, + "auxiliary_loss_mlp": 0.01007794, + "balance_loss_clip": 1.0051949, + "balance_loss_mlp": 1.01350796, + "epoch": 0.09331128814068841, + "flos": 61536203942400.0, + "grad_norm": 0.7800746873014213, + "language_loss": 0.6182366, + "learning_rate": 3.958100968362163e-06, + "loss": 0.6389209, + "num_input_tokens_seen": 33261080, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.47070312, + "step": 1552, + "time_per_iteration": 3.2178378105163574 + }, + { + "auxiliary_loss_clip": 0.01059462, + "auxiliary_loss_mlp": 0.01003668, + "balance_loss_clip": 1.00099754, + "balance_loss_mlp": 1.01257896, + "epoch": 0.09337141139335638, + "flos": 53293700171520.0, + "grad_norm": 0.8330449834122059, + "language_loss": 0.5895977, + "learning_rate": 3.958021629962681e-06, + "loss": 0.61022902, + "num_input_tokens_seen": 33330235, + "router_z_loss_clip": 0.0267334, + "router_z_loss_mlp": 0.46875, + "step": 1553, + "time_per_iteration": 3.220923900604248 + }, + { + "auxiliary_loss_clip": 0.01178223, + "auxiliary_loss_mlp": 0.01058803, + "balance_loss_clip": 1.0369525, + "balance_loss_mlp": 1.05040002, + "epoch": 0.09343153464602436, + "flos": 23476852592640.0, + "grad_norm": 2.0753391269624797, + "language_loss": 0.87452686, + "learning_rate": 3.957942217314823e-06, + "loss": 0.89689714, + "num_input_tokens_seen": 33349035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.28125, + "step": 1554, + "time_per_iteration": 2.5448763370513916 + }, + { + "auxiliary_loss_clip": 0.01174828, + "auxiliary_loss_mlp": 0.01052934, + "balance_loss_clip": 1.0310595, + "balance_loss_mlp": 1.05265594, + "epoch": 0.09349165789869232, + "flos": 19353481804800.0, + "grad_norm": 2.2438919833216913, + "language_loss": 0.81355709, + "learning_rate": 3.957862730421599e-06, + "loss": 0.83583468, + "num_input_tokens_seen": 33368060, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1555, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01058772, + "auxiliary_loss_mlp": 0.01008478, + "balance_loss_clip": 1.00581956, + "balance_loss_mlp": 1.01259685, + "epoch": 0.09355178115136029, + "flos": 67502580635520.0, + "grad_norm": 0.8701907042199977, + "language_loss": 0.59583747, + "learning_rate": 3.957783169286024e-06, + "loss": 0.61651003, + "num_input_tokens_seen": 33430825, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4609375, + "step": 1556, + "time_per_iteration": 3.0923824310302734 + }, + { + "auxiliary_loss_clip": 0.01177126, + "auxiliary_loss_mlp": 0.01056269, + "balance_loss_clip": 1.03518105, + "balance_loss_mlp": 1.05278862, + "epoch": 0.09361190440402825, + "flos": 37341638720640.0, + "grad_norm": 1.5891177576034032, + "language_loss": 0.84455961, + "learning_rate": 3.9577035339111155e-06, + "loss": 0.86689359, + "num_input_tokens_seen": 33454855, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1557, + "time_per_iteration": 2.5973968505859375 + }, + { + "auxiliary_loss_clip": 0.01175988, + "auxiliary_loss_mlp": 0.01061513, + "balance_loss_clip": 1.03799307, + "balance_loss_mlp": 1.05065048, + "epoch": 0.09367202765669623, + "flos": 24899705112960.0, + "grad_norm": 1.787574567308206, + "language_loss": 0.77987397, + "learning_rate": 3.957623824299893e-06, + "loss": 0.80224895, + "num_input_tokens_seen": 33476000, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.25, + "step": 1558, + "time_per_iteration": 2.5390286445617676 + }, + { + "auxiliary_loss_clip": 0.01178258, + "auxiliary_loss_mlp": 0.01054751, + "balance_loss_clip": 1.03268576, + "balance_loss_mlp": 1.05035424, + "epoch": 0.0937321509093642, + "flos": 15705568368000.0, + "grad_norm": 2.0310113035260873, + "language_loss": 0.7998119, + "learning_rate": 3.957544040455379e-06, + "loss": 0.822142, + "num_input_tokens_seen": 33493845, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.28125, + "step": 1559, + "time_per_iteration": 5.3233802318573 + }, + { + "auxiliary_loss_clip": 0.01172855, + "auxiliary_loss_mlp": 0.01063353, + "balance_loss_clip": 1.04146647, + "balance_loss_mlp": 1.05015147, + "epoch": 0.09379227416203216, + "flos": 20483698222080.0, + "grad_norm": 1.9877315441152976, + "language_loss": 0.76720232, + "learning_rate": 3.957464182380599e-06, + "loss": 0.78956437, + "num_input_tokens_seen": 33510850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1560, + "time_per_iteration": 3.863935947418213 + }, + { + "auxiliary_loss_clip": 0.01180546, + "auxiliary_loss_mlp": 0.01059772, + "balance_loss_clip": 1.03793311, + "balance_loss_mlp": 1.05101645, + "epoch": 0.09385239741470014, + "flos": 24352498344960.0, + "grad_norm": 1.6628394684514, + "language_loss": 0.81219828, + "learning_rate": 3.95738425007858e-06, + "loss": 0.83460152, + "num_input_tokens_seen": 33530430, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.296875, + "step": 1561, + "time_per_iteration": 2.5050160884857178 + }, + { + "auxiliary_loss_clip": 0.01175131, + "auxiliary_loss_mlp": 0.01048338, + "balance_loss_clip": 1.02641547, + "balance_loss_mlp": 1.04764926, + "epoch": 0.0939125206673681, + "flos": 33291489807360.0, + "grad_norm": 2.307547697406205, + "language_loss": 0.61553764, + "learning_rate": 3.957304243552354e-06, + "loss": 0.63777232, + "num_input_tokens_seen": 33551975, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2734375, + "step": 1562, + "time_per_iteration": 2.5884838104248047 + }, + { + "auxiliary_loss_clip": 0.01177686, + "auxiliary_loss_mlp": 0.01059886, + "balance_loss_clip": 1.03920364, + "balance_loss_mlp": 1.0552876, + "epoch": 0.09397264392003607, + "flos": 19244923925760.0, + "grad_norm": 2.5948914783661468, + "language_loss": 0.84981585, + "learning_rate": 3.957224162804956e-06, + "loss": 0.87219155, + "num_input_tokens_seen": 33569850, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1563, + "time_per_iteration": 2.427928924560547 + }, + { + "auxiliary_loss_clip": 0.01172512, + "auxiliary_loss_mlp": 0.01048044, + "balance_loss_clip": 1.02767134, + "balance_loss_mlp": 1.05013323, + "epoch": 0.09403276717270405, + "flos": 19317930318720.0, + "grad_norm": 1.8141046481233785, + "language_loss": 0.76106739, + "learning_rate": 3.9571440078394205e-06, + "loss": 0.78327298, + "num_input_tokens_seen": 33590510, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.21875, + "step": 1564, + "time_per_iteration": 2.4996325969696045 + }, + { + "auxiliary_loss_clip": 0.01177295, + "auxiliary_loss_mlp": 0.01055133, + "balance_loss_clip": 1.03415227, + "balance_loss_mlp": 1.05290008, + "epoch": 0.09409289042537201, + "flos": 23583471137280.0, + "grad_norm": 2.0134268414891388, + "language_loss": 0.7971766, + "learning_rate": 3.9570637786587895e-06, + "loss": 0.81950086, + "num_input_tokens_seen": 33608810, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.25, + "step": 1565, + "time_per_iteration": 2.470870018005371 + }, + { + "auxiliary_loss_clip": 0.01175133, + "auxiliary_loss_mlp": 0.01069432, + "balance_loss_clip": 1.0479629, + "balance_loss_mlp": 1.0497129, + "epoch": 0.09415301367803998, + "flos": 20078446003200.0, + "grad_norm": 1.8353632925340597, + "language_loss": 0.75241816, + "learning_rate": 3.956983475266103e-06, + "loss": 0.77486378, + "num_input_tokens_seen": 33627265, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1566, + "time_per_iteration": 2.4962053298950195 + }, + { + "auxiliary_loss_clip": 0.0117411, + "auxiliary_loss_mlp": 0.01059014, + "balance_loss_clip": 1.03746092, + "balance_loss_mlp": 1.04822683, + "epoch": 0.09421313693070796, + "flos": 21062075016960.0, + "grad_norm": 2.55149440594841, + "language_loss": 0.77724433, + "learning_rate": 3.956903097664407e-06, + "loss": 0.79957557, + "num_input_tokens_seen": 33644810, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1567, + "time_per_iteration": 2.448511838912964 + }, + { + "auxiliary_loss_clip": 0.01178494, + "auxiliary_loss_mlp": 0.01054706, + "balance_loss_clip": 1.03504825, + "balance_loss_mlp": 1.05183101, + "epoch": 0.09427326018337592, + "flos": 24316156759680.0, + "grad_norm": 2.293964487000622, + "language_loss": 0.82571244, + "learning_rate": 3.956822645856749e-06, + "loss": 0.8480444, + "num_input_tokens_seen": 33665665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.265625, + "step": 1568, + "time_per_iteration": 2.5221774578094482 + }, + { + "auxiliary_loss_clip": 0.01179838, + "auxiliary_loss_mlp": 0.01048346, + "balance_loss_clip": 1.02527881, + "balance_loss_mlp": 1.05191278, + "epoch": 0.09433338343604389, + "flos": 20263888944000.0, + "grad_norm": 4.3822924949764515, + "language_loss": 0.7658236, + "learning_rate": 3.9567421198461814e-06, + "loss": 0.78810549, + "num_input_tokens_seen": 33684760, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.28125, + "step": 1569, + "time_per_iteration": 2.464019775390625 + }, + { + "auxiliary_loss_clip": 0.01171203, + "auxiliary_loss_mlp": 0.01052053, + "balance_loss_clip": 1.03004718, + "balance_loss_mlp": 1.04984534, + "epoch": 0.09439350668871185, + "flos": 12742973493120.0, + "grad_norm": 2.11394347406088, + "language_loss": 0.86315012, + "learning_rate": 3.956661519635756e-06, + "loss": 0.88538271, + "num_input_tokens_seen": 33700750, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1570, + "time_per_iteration": 2.479001998901367 + }, + { + "auxiliary_loss_clip": 0.01177967, + "auxiliary_loss_mlp": 0.01049184, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.05340183, + "epoch": 0.09445362994137983, + "flos": 25962266263680.0, + "grad_norm": 1.6480791038221163, + "language_loss": 0.76531005, + "learning_rate": 3.95658084522853e-06, + "loss": 0.78758156, + "num_input_tokens_seen": 33724430, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.25, + "step": 1571, + "time_per_iteration": 2.5270462036132812 + }, + { + "auxiliary_loss_clip": 0.01169263, + "auxiliary_loss_mlp": 0.01049685, + "balance_loss_clip": 1.02848995, + "balance_loss_mlp": 1.0496099, + "epoch": 0.0945137531940478, + "flos": 19715353372800.0, + "grad_norm": 1.780883866775424, + "language_loss": 0.79518712, + "learning_rate": 3.956500096627561e-06, + "loss": 0.81737661, + "num_input_tokens_seen": 33743455, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1572, + "time_per_iteration": 2.477403163909912 + }, + { + "auxiliary_loss_clip": 0.01172702, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03288805, + "balance_loss_mlp": 1.05036175, + "epoch": 0.09457387644671576, + "flos": 23617047375360.0, + "grad_norm": 1.8458711299535766, + "language_loss": 0.87948155, + "learning_rate": 3.956419273835913e-06, + "loss": 0.90174723, + "num_input_tokens_seen": 33763435, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1573, + "time_per_iteration": 2.5164122581481934 + }, + { + "auxiliary_loss_clip": 0.01177194, + "auxiliary_loss_mlp": 0.01059795, + "balance_loss_clip": 1.03533316, + "balance_loss_mlp": 1.05045378, + "epoch": 0.09463399969938374, + "flos": 26907291135360.0, + "grad_norm": 2.770313323609274, + "language_loss": 0.81827116, + "learning_rate": 3.95633837685665e-06, + "loss": 0.84064102, + "num_input_tokens_seen": 33784325, + "router_z_loss_clip": 0.24511719, + "router_z_loss_mlp": 1.265625, + "step": 1574, + "time_per_iteration": 2.5540831089019775 + }, + { + "auxiliary_loss_clip": 0.01178056, + "auxiliary_loss_mlp": 0.01052269, + "balance_loss_clip": 1.03128815, + "balance_loss_mlp": 1.05359375, + "epoch": 0.0946941229520517, + "flos": 23659566099840.0, + "grad_norm": 2.139236970889498, + "language_loss": 0.80922085, + "learning_rate": 3.95625740569284e-06, + "loss": 0.83152413, + "num_input_tokens_seen": 33802510, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1575, + "time_per_iteration": 2.4874608516693115 + }, + { + "auxiliary_loss_clip": 0.01172567, + "auxiliary_loss_mlp": 0.01063693, + "balance_loss_clip": 1.04184198, + "balance_loss_mlp": 1.05048943, + "epoch": 0.09475424620471967, + "flos": 24134053783680.0, + "grad_norm": 2.1107661515601, + "language_loss": 0.86745369, + "learning_rate": 3.956176360347553e-06, + "loss": 0.88981628, + "num_input_tokens_seen": 33819980, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1576, + "time_per_iteration": 2.514961004257202 + }, + { + "auxiliary_loss_clip": 0.01058351, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.01272786, + "epoch": 0.09481436945738765, + "flos": 68426168065920.0, + "grad_norm": 0.9836929902555142, + "language_loss": 0.65832257, + "learning_rate": 3.956095240823862e-06, + "loss": 0.67916429, + "num_input_tokens_seen": 33878925, + "router_z_loss_clip": 0.03100586, + "router_z_loss_mlp": 0.45703125, + "step": 1577, + "time_per_iteration": 3.042998790740967 + }, + { + "auxiliary_loss_clip": 0.01175806, + "auxiliary_loss_mlp": 0.01045658, + "balance_loss_clip": 1.02504635, + "balance_loss_mlp": 1.05083144, + "epoch": 0.09487449271005562, + "flos": 16654076858880.0, + "grad_norm": 3.158821122445177, + "language_loss": 0.79113019, + "learning_rate": 3.956014047124844e-06, + "loss": 0.81334484, + "num_input_tokens_seen": 33897600, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1578, + "time_per_iteration": 2.492605447769165 + }, + { + "auxiliary_loss_clip": 0.01173104, + "auxiliary_loss_mlp": 0.01056494, + "balance_loss_clip": 1.03446436, + "balance_loss_mlp": 1.04935408, + "epoch": 0.09493461596272358, + "flos": 24275685110400.0, + "grad_norm": 1.6941125689582233, + "language_loss": 0.77994359, + "learning_rate": 3.955932779253578e-06, + "loss": 0.80223954, + "num_input_tokens_seen": 33917365, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1579, + "time_per_iteration": 2.5021350383758545 + }, + { + "auxiliary_loss_clip": 0.01176838, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.0317533, + "balance_loss_mlp": 1.05228639, + "epoch": 0.09499473921539155, + "flos": 21870173243520.0, + "grad_norm": 2.3012950697800747, + "language_loss": 0.73576474, + "learning_rate": 3.955851437213144e-06, + "loss": 0.75807726, + "num_input_tokens_seen": 33936680, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2421875, + "step": 1580, + "time_per_iteration": 2.500426769256592 + }, + { + "auxiliary_loss_clip": 0.01171524, + "auxiliary_loss_mlp": 0.01053034, + "balance_loss_clip": 1.03235102, + "balance_loss_mlp": 1.05162525, + "epoch": 0.09505486246805953, + "flos": 33547137880320.0, + "grad_norm": 2.820694860574998, + "language_loss": 0.77813822, + "learning_rate": 3.955770021006627e-06, + "loss": 0.80038381, + "num_input_tokens_seen": 33960685, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1581, + "time_per_iteration": 2.569086790084839 + }, + { + "auxiliary_loss_clip": 0.01177083, + "auxiliary_loss_mlp": 0.0105881, + "balance_loss_clip": 1.03779316, + "balance_loss_mlp": 1.05315304, + "epoch": 0.09511498572072749, + "flos": 21215342350080.0, + "grad_norm": 2.1718701740895443, + "language_loss": 0.86914808, + "learning_rate": 3.955688530637116e-06, + "loss": 0.89150703, + "num_input_tokens_seen": 33980015, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.234375, + "step": 1582, + "time_per_iteration": 2.476386785507202 + }, + { + "auxiliary_loss_clip": 0.01178411, + "auxiliary_loss_mlp": 0.01057193, + "balance_loss_clip": 1.03394723, + "balance_loss_mlp": 1.05487967, + "epoch": 0.09517510897339546, + "flos": 14611262572800.0, + "grad_norm": 1.7496793522695477, + "language_loss": 0.66838771, + "learning_rate": 3.955606966107699e-06, + "loss": 0.6907438, + "num_input_tokens_seen": 33997705, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.234375, + "step": 1583, + "time_per_iteration": 2.4433302879333496 + }, + { + "auxiliary_loss_clip": 0.01180705, + "auxiliary_loss_mlp": 0.01052141, + "balance_loss_clip": 1.02919281, + "balance_loss_mlp": 1.0555923, + "epoch": 0.09523523222606343, + "flos": 27817339138560.0, + "grad_norm": 1.8272679383640855, + "language_loss": 0.70314872, + "learning_rate": 3.95552532742147e-06, + "loss": 0.7254771, + "num_input_tokens_seen": 34017465, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.25, + "step": 1584, + "time_per_iteration": 2.5352938175201416 + }, + { + "auxiliary_loss_clip": 0.01177345, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.0344646, + "balance_loss_mlp": 1.0527246, + "epoch": 0.0952953554787314, + "flos": 20706272847360.0, + "grad_norm": 1.5429491827095454, + "language_loss": 0.80649364, + "learning_rate": 3.955443614581525e-06, + "loss": 0.82881135, + "num_input_tokens_seen": 34038550, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2421875, + "step": 1585, + "time_per_iteration": 2.5006139278411865 + }, + { + "auxiliary_loss_clip": 0.01179471, + "auxiliary_loss_mlp": 0.01056563, + "balance_loss_clip": 1.03301954, + "balance_loss_mlp": 1.05324364, + "epoch": 0.09535547873139937, + "flos": 24787627701120.0, + "grad_norm": 1.5763794615860258, + "language_loss": 0.7156626, + "learning_rate": 3.955361827590961e-06, + "loss": 0.73802292, + "num_input_tokens_seen": 34058665, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.265625, + "step": 1586, + "time_per_iteration": 2.510941982269287 + }, + { + "auxiliary_loss_clip": 0.01058124, + "auxiliary_loss_mlp": 0.010121, + "balance_loss_clip": 1.00946522, + "balance_loss_mlp": 1.01272035, + "epoch": 0.09541560198406734, + "flos": 71912194905600.0, + "grad_norm": 0.8128409972345002, + "language_loss": 0.55392706, + "learning_rate": 3.955279966452883e-06, + "loss": 0.57462931, + "num_input_tokens_seen": 34109655, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.453125, + "step": 1587, + "time_per_iteration": 2.8747992515563965 + }, + { + "auxiliary_loss_clip": 0.0118109, + "auxiliary_loss_mlp": 0.01056077, + "balance_loss_clip": 1.0345006, + "balance_loss_mlp": 1.0550952, + "epoch": 0.09547572523673531, + "flos": 28982604251520.0, + "grad_norm": 1.813611272618652, + "language_loss": 0.81023234, + "learning_rate": 3.955198031170391e-06, + "loss": 0.83260405, + "num_input_tokens_seen": 34131115, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.265625, + "step": 1588, + "time_per_iteration": 2.5403292179107666 + }, + { + "auxiliary_loss_clip": 0.01178114, + "auxiliary_loss_mlp": 0.01054854, + "balance_loss_clip": 1.03290713, + "balance_loss_mlp": 1.05471849, + "epoch": 0.09553584848940327, + "flos": 24133910129280.0, + "grad_norm": 2.1843830695972835, + "language_loss": 0.81552076, + "learning_rate": 3.955116021746594e-06, + "loss": 0.83785045, + "num_input_tokens_seen": 34151925, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.234375, + "step": 1589, + "time_per_iteration": 2.4995651245117188 + }, + { + "auxiliary_loss_clip": 0.01175474, + "auxiliary_loss_mlp": 0.01051658, + "balance_loss_clip": 1.02901983, + "balance_loss_mlp": 1.05340207, + "epoch": 0.09559597174207124, + "flos": 42851376789120.0, + "grad_norm": 1.4497838373443381, + "language_loss": 0.65005404, + "learning_rate": 3.955033938184601e-06, + "loss": 0.67232537, + "num_input_tokens_seen": 34175395, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1590, + "time_per_iteration": 2.7222375869750977 + }, + { + "auxiliary_loss_clip": 0.01173556, + "auxiliary_loss_mlp": 0.01051921, + "balance_loss_clip": 1.03036785, + "balance_loss_mlp": 1.05178595, + "epoch": 0.09565609499473922, + "flos": 32670845683200.0, + "grad_norm": 1.714913693600035, + "language_loss": 0.83272862, + "learning_rate": 3.954951780487526e-06, + "loss": 0.85498345, + "num_input_tokens_seen": 34197760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1591, + "time_per_iteration": 2.571277379989624 + }, + { + "auxiliary_loss_clip": 0.01179776, + "auxiliary_loss_mlp": 0.01055769, + "balance_loss_clip": 1.03419209, + "balance_loss_mlp": 1.05280709, + "epoch": 0.09571621824740718, + "flos": 18478410670080.0, + "grad_norm": 2.268244689889179, + "language_loss": 0.74068749, + "learning_rate": 3.9548695486584835e-06, + "loss": 0.76304293, + "num_input_tokens_seen": 34215330, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.265625, + "step": 1592, + "time_per_iteration": 2.446272373199463 + }, + { + "auxiliary_loss_clip": 0.01173297, + "auxiliary_loss_mlp": 0.0104948, + "balance_loss_clip": 1.0282129, + "balance_loss_mlp": 1.05028248, + "epoch": 0.09577634150007515, + "flos": 29387497334400.0, + "grad_norm": 1.9287746031752921, + "language_loss": 0.74135411, + "learning_rate": 3.954787242700592e-06, + "loss": 0.76358187, + "num_input_tokens_seen": 34237745, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1593, + "time_per_iteration": 2.518038749694824 + }, + { + "auxiliary_loss_clip": 0.01175652, + "auxiliary_loss_mlp": 0.01051222, + "balance_loss_clip": 1.03061128, + "balance_loss_mlp": 1.05365515, + "epoch": 0.09583646475274313, + "flos": 22747830157440.0, + "grad_norm": 1.8251705146793997, + "language_loss": 0.69907188, + "learning_rate": 3.954704862616971e-06, + "loss": 0.72134066, + "num_input_tokens_seen": 34256565, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.21875, + "step": 1594, + "time_per_iteration": 2.5454983711242676 + }, + { + "auxiliary_loss_clip": 0.01174594, + "auxiliary_loss_mlp": 0.0105111, + "balance_loss_clip": 1.03062999, + "balance_loss_mlp": 1.05023921, + "epoch": 0.0958965880054111, + "flos": 23218367345280.0, + "grad_norm": 2.596137828422853, + "language_loss": 0.82464099, + "learning_rate": 3.954622408410747e-06, + "loss": 0.84689802, + "num_input_tokens_seen": 34275970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1595, + "time_per_iteration": 2.472062826156616 + }, + { + "auxiliary_loss_clip": 0.01176658, + "auxiliary_loss_mlp": 0.01050558, + "balance_loss_clip": 1.02803886, + "balance_loss_mlp": 1.05217803, + "epoch": 0.09595671125807906, + "flos": 21324438933120.0, + "grad_norm": 2.0311987750358953, + "language_loss": 0.84673214, + "learning_rate": 3.954539880085045e-06, + "loss": 0.86900425, + "num_input_tokens_seen": 34295490, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.2421875, + "step": 1596, + "time_per_iteration": 2.4801599979400635 + }, + { + "auxiliary_loss_clip": 0.01181467, + "auxiliary_loss_mlp": 0.01051063, + "balance_loss_clip": 1.02871156, + "balance_loss_mlp": 1.05628884, + "epoch": 0.09601683451074704, + "flos": 39603472185600.0, + "grad_norm": 2.531539932785817, + "language_loss": 0.68993127, + "learning_rate": 3.9544572776429945e-06, + "loss": 0.71225667, + "num_input_tokens_seen": 34319990, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1597, + "time_per_iteration": 2.6195101737976074 + }, + { + "auxiliary_loss_clip": 0.01175632, + "auxiliary_loss_mlp": 0.0104509, + "balance_loss_clip": 1.02370429, + "balance_loss_mlp": 1.04902959, + "epoch": 0.096076957763415, + "flos": 23732716147200.0, + "grad_norm": 2.18946094151333, + "language_loss": 0.74929029, + "learning_rate": 3.954374601087729e-06, + "loss": 0.77149749, + "num_input_tokens_seen": 34339225, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.265625, + "step": 1598, + "time_per_iteration": 2.474071502685547 + }, + { + "auxiliary_loss_clip": 0.01179079, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02574444, + "balance_loss_mlp": 1.05284083, + "epoch": 0.09613708101608297, + "flos": 34678108483200.0, + "grad_norm": 1.6350676424235815, + "language_loss": 0.69002283, + "learning_rate": 3.954291850422382e-06, + "loss": 0.7122978, + "num_input_tokens_seen": 34361020, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1599, + "time_per_iteration": 2.5599992275238037 + }, + { + "auxiliary_loss_clip": 0.01174972, + "auxiliary_loss_mlp": 0.01056578, + "balance_loss_clip": 1.0358355, + "balance_loss_mlp": 1.05169392, + "epoch": 0.09619720426875093, + "flos": 20740028653440.0, + "grad_norm": 2.013538613147854, + "language_loss": 0.840271, + "learning_rate": 3.954209025650093e-06, + "loss": 0.8625865, + "num_input_tokens_seen": 34378630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1600, + "time_per_iteration": 2.4882116317749023 + }, + { + "auxiliary_loss_clip": 0.01174537, + "auxiliary_loss_mlp": 0.01052763, + "balance_loss_clip": 1.03162694, + "balance_loss_mlp": 1.05098653, + "epoch": 0.09625732752141891, + "flos": 13042720488960.0, + "grad_norm": 3.038904015519863, + "language_loss": 0.8034178, + "learning_rate": 3.954126126774001e-06, + "loss": 0.82569081, + "num_input_tokens_seen": 34397110, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.234375, + "step": 1601, + "time_per_iteration": 5.328745365142822 + }, + { + "auxiliary_loss_clip": 0.01178453, + "auxiliary_loss_mlp": 0.01052259, + "balance_loss_clip": 1.03031266, + "balance_loss_mlp": 1.05090928, + "epoch": 0.09631745077408688, + "flos": 22273629782400.0, + "grad_norm": 2.183236390866488, + "language_loss": 0.82405198, + "learning_rate": 3.954043153797251e-06, + "loss": 0.84635913, + "num_input_tokens_seen": 34414165, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.2734375, + "step": 1602, + "time_per_iteration": 2.4609556198120117 + }, + { + "auxiliary_loss_clip": 0.01172805, + "auxiliary_loss_mlp": 0.01051791, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05170703, + "epoch": 0.09637757402675484, + "flos": 24754266944640.0, + "grad_norm": 1.882331764966583, + "language_loss": 0.62527591, + "learning_rate": 3.953960106722989e-06, + "loss": 0.64752185, + "num_input_tokens_seen": 34434445, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1603, + "time_per_iteration": 2.4974379539489746 + }, + { + "auxiliary_loss_clip": 0.01178105, + "auxiliary_loss_mlp": 0.01054363, + "balance_loss_clip": 1.03049707, + "balance_loss_mlp": 1.05224609, + "epoch": 0.09643769727942282, + "flos": 22525758322560.0, + "grad_norm": 2.347327571135852, + "language_loss": 0.71259016, + "learning_rate": 3.953876985554364e-06, + "loss": 0.73491484, + "num_input_tokens_seen": 34453095, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2578125, + "step": 1604, + "time_per_iteration": 2.5012693405151367 + }, + { + "auxiliary_loss_clip": 0.01172586, + "auxiliary_loss_mlp": 0.01056823, + "balance_loss_clip": 1.0368669, + "balance_loss_mlp": 1.05051208, + "epoch": 0.09649782053209079, + "flos": 30921026636160.0, + "grad_norm": 2.129697971326249, + "language_loss": 0.79487669, + "learning_rate": 3.953793790294527e-06, + "loss": 0.8171708, + "num_input_tokens_seen": 34473680, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.21875, + "step": 1605, + "time_per_iteration": 2.5392873287200928 + }, + { + "auxiliary_loss_clip": 0.01176232, + "auxiliary_loss_mlp": 0.01044253, + "balance_loss_clip": 1.02275968, + "balance_loss_mlp": 1.04916394, + "epoch": 0.09655794378475875, + "flos": 25337635729920.0, + "grad_norm": 3.698123586343809, + "language_loss": 0.74810207, + "learning_rate": 3.953710520946634e-06, + "loss": 0.77030694, + "num_input_tokens_seen": 34492610, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2734375, + "step": 1606, + "time_per_iteration": 2.4922726154327393 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02391386, + "balance_loss_mlp": 1.05243278, + "epoch": 0.09661806703742673, + "flos": 22346061557760.0, + "grad_norm": 1.649703340967918, + "language_loss": 0.75382137, + "learning_rate": 3.953627177513843e-06, + "loss": 0.77603066, + "num_input_tokens_seen": 34511855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.234375, + "step": 1607, + "time_per_iteration": 2.4787087440490723 + }, + { + "auxiliary_loss_clip": 0.0117289, + "auxiliary_loss_mlp": 0.01042475, + "balance_loss_clip": 1.02206647, + "balance_loss_mlp": 1.04831934, + "epoch": 0.0966781902900947, + "flos": 17457578144640.0, + "grad_norm": 2.262571531890369, + "language_loss": 0.86648059, + "learning_rate": 3.953543759999312e-06, + "loss": 0.88863426, + "num_input_tokens_seen": 34528905, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.25, + "step": 1608, + "time_per_iteration": 2.435391664505005 + }, + { + "auxiliary_loss_clip": 0.01183391, + "auxiliary_loss_mlp": 0.01056654, + "balance_loss_clip": 1.03513622, + "balance_loss_mlp": 1.05276418, + "epoch": 0.09673831354276266, + "flos": 36903995412480.0, + "grad_norm": 2.2277980990408297, + "language_loss": 0.70968121, + "learning_rate": 3.953460268406207e-06, + "loss": 0.73208165, + "num_input_tokens_seen": 34548480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.3046875, + "step": 1609, + "time_per_iteration": 2.599719762802124 + }, + { + "auxiliary_loss_clip": 0.01173214, + "auxiliary_loss_mlp": 0.01054271, + "balance_loss_clip": 1.03342104, + "balance_loss_mlp": 1.04860282, + "epoch": 0.09679843679543064, + "flos": 20701388597760.0, + "grad_norm": 3.7787270736621674, + "language_loss": 0.84566712, + "learning_rate": 3.953376702737693e-06, + "loss": 0.86794198, + "num_input_tokens_seen": 34565410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1610, + "time_per_iteration": 2.446676254272461 + }, + { + "auxiliary_loss_clip": 0.01177531, + "auxiliary_loss_mlp": 0.01049914, + "balance_loss_clip": 1.02781224, + "balance_loss_mlp": 1.05382621, + "epoch": 0.0968585600480986, + "flos": 23514415240320.0, + "grad_norm": 2.0483419743874682, + "language_loss": 0.67360532, + "learning_rate": 3.953293062996939e-06, + "loss": 0.69587982, + "num_input_tokens_seen": 34584840, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.234375, + "step": 1611, + "time_per_iteration": 2.520211696624756 + }, + { + "auxiliary_loss_clip": 0.01177545, + "auxiliary_loss_mlp": 0.0105068, + "balance_loss_clip": 1.03000879, + "balance_loss_mlp": 1.05313492, + "epoch": 0.09691868330076657, + "flos": 20121072468480.0, + "grad_norm": 1.6625909003061596, + "language_loss": 0.81166416, + "learning_rate": 3.953209349187115e-06, + "loss": 0.83394641, + "num_input_tokens_seen": 34603360, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2421875, + "step": 1612, + "time_per_iteration": 2.449491262435913 + }, + { + "auxiliary_loss_clip": 0.01180036, + "auxiliary_loss_mlp": 0.01061745, + "balance_loss_clip": 1.04027581, + "balance_loss_mlp": 1.05431938, + "epoch": 0.09697880655343454, + "flos": 16544692967040.0, + "grad_norm": 2.509420249413084, + "language_loss": 0.80708754, + "learning_rate": 3.953125561311398e-06, + "loss": 0.82950538, + "num_input_tokens_seen": 34620760, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2578125, + "step": 1613, + "time_per_iteration": 2.4753763675689697 + }, + { + "auxiliary_loss_clip": 0.01173718, + "auxiliary_loss_mlp": 0.01052644, + "balance_loss_clip": 1.03019738, + "balance_loss_mlp": 1.05074048, + "epoch": 0.09703892980610251, + "flos": 26104184899200.0, + "grad_norm": 2.0025313344872484, + "language_loss": 0.84173608, + "learning_rate": 3.953041699372964e-06, + "loss": 0.86399966, + "num_input_tokens_seen": 34640695, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2265625, + "step": 1614, + "time_per_iteration": 2.5492141246795654 + }, + { + "auxiliary_loss_clip": 0.01065917, + "auxiliary_loss_mlp": 0.010187, + "balance_loss_clip": 1.01610088, + "balance_loss_mlp": 1.019063, + "epoch": 0.09709905305877048, + "flos": 60443622000000.0, + "grad_norm": 0.7078098108364695, + "language_loss": 0.54584575, + "learning_rate": 3.952957763374992e-06, + "loss": 0.56669194, + "num_input_tokens_seen": 34702395, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.46875, + "step": 1615, + "time_per_iteration": 3.1041057109832764 + }, + { + "auxiliary_loss_clip": 0.01065912, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00491357, + "balance_loss_mlp": 1.01844954, + "epoch": 0.09715917631143844, + "flos": 57639932893440.0, + "grad_norm": 0.7637649269659756, + "language_loss": 0.5822649, + "learning_rate": 3.952873753320666e-06, + "loss": 0.60299873, + "num_input_tokens_seen": 34768910, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.47460938, + "step": 1616, + "time_per_iteration": 3.215376377105713 + }, + { + "auxiliary_loss_clip": 0.01178513, + "auxiliary_loss_mlp": 0.01055808, + "balance_loss_clip": 1.03275275, + "balance_loss_mlp": 1.05275226, + "epoch": 0.09721929956410642, + "flos": 20558212986240.0, + "grad_norm": 1.690325520565165, + "language_loss": 0.69293094, + "learning_rate": 3.952789669213172e-06, + "loss": 0.71527421, + "num_input_tokens_seen": 34787680, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2578125, + "step": 1617, + "time_per_iteration": 2.458017587661743 + }, + { + "auxiliary_loss_clip": 0.01176727, + "auxiliary_loss_mlp": 0.01055641, + "balance_loss_clip": 1.03116739, + "balance_loss_mlp": 1.05130577, + "epoch": 0.09727942281677439, + "flos": 27344359825920.0, + "grad_norm": 1.7927692696889819, + "language_loss": 0.80748308, + "learning_rate": 3.952705511055698e-06, + "loss": 0.8298068, + "num_input_tokens_seen": 34808330, + "router_z_loss_clip": 0.24414062, + "router_z_loss_mlp": 1.25, + "step": 1618, + "time_per_iteration": 2.5471577644348145 + }, + { + "auxiliary_loss_clip": 0.01169902, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03077149, + "balance_loss_mlp": 1.04996848, + "epoch": 0.09733954606944235, + "flos": 24900028335360.0, + "grad_norm": 1.5831304278494804, + "language_loss": 0.9288674, + "learning_rate": 3.952621278851435e-06, + "loss": 0.9510712, + "num_input_tokens_seen": 34830020, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1953125, + "step": 1619, + "time_per_iteration": 2.515282392501831 + }, + { + "auxiliary_loss_clip": 0.01171299, + "auxiliary_loss_mlp": 0.01052594, + "balance_loss_clip": 1.03150594, + "balance_loss_mlp": 1.05216622, + "epoch": 0.09739966932211033, + "flos": 31503928544640.0, + "grad_norm": 1.7974961209450113, + "language_loss": 0.88785303, + "learning_rate": 3.9525369726035784e-06, + "loss": 0.910092, + "num_input_tokens_seen": 34850330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1620, + "time_per_iteration": 2.556744337081909 + }, + { + "auxiliary_loss_clip": 0.01175309, + "auxiliary_loss_mlp": 0.01056801, + "balance_loss_clip": 1.0339601, + "balance_loss_mlp": 1.05045033, + "epoch": 0.0974597925747783, + "flos": 23878764846720.0, + "grad_norm": 1.90931759761679, + "language_loss": 0.77130795, + "learning_rate": 3.952452592315324e-06, + "loss": 0.79362905, + "num_input_tokens_seen": 34871640, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.25, + "step": 1621, + "time_per_iteration": 2.491441011428833 + }, + { + "auxiliary_loss_clip": 0.01171563, + "auxiliary_loss_mlp": 0.01056002, + "balance_loss_clip": 1.03398418, + "balance_loss_mlp": 1.04859447, + "epoch": 0.09751991582744626, + "flos": 17019575700480.0, + "grad_norm": 1.9170880538391684, + "language_loss": 0.77856946, + "learning_rate": 3.952368137989871e-06, + "loss": 0.80084509, + "num_input_tokens_seen": 34888100, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2265625, + "step": 1622, + "time_per_iteration": 2.4379701614379883 + }, + { + "auxiliary_loss_clip": 0.01177415, + "auxiliary_loss_mlp": 0.01056732, + "balance_loss_clip": 1.0349052, + "balance_loss_mlp": 1.05105746, + "epoch": 0.09758003908011423, + "flos": 28402826826240.0, + "grad_norm": 1.9420709042223125, + "language_loss": 0.85783195, + "learning_rate": 3.9522836096304225e-06, + "loss": 0.88017344, + "num_input_tokens_seen": 34910485, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.265625, + "step": 1623, + "time_per_iteration": 2.51741099357605 + }, + { + "auxiliary_loss_clip": 0.01172696, + "auxiliary_loss_mlp": 0.01056286, + "balance_loss_clip": 1.03498316, + "balance_loss_mlp": 1.05181813, + "epoch": 0.09764016233278221, + "flos": 18144297336960.0, + "grad_norm": 2.2833168401589656, + "language_loss": 0.80328369, + "learning_rate": 3.952199007240184e-06, + "loss": 0.8255735, + "num_input_tokens_seen": 34928615, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1624, + "time_per_iteration": 2.4646618366241455 + }, + { + "auxiliary_loss_clip": 0.01170952, + "auxiliary_loss_mlp": 0.01044517, + "balance_loss_clip": 1.02450192, + "balance_loss_mlp": 1.04799926, + "epoch": 0.09770028558545017, + "flos": 15265842071040.0, + "grad_norm": 2.7577002662180954, + "language_loss": 0.8575626, + "learning_rate": 3.952114330822364e-06, + "loss": 0.87971735, + "num_input_tokens_seen": 34946045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.234375, + "step": 1625, + "time_per_iteration": 2.452615976333618 + }, + { + "auxiliary_loss_clip": 0.01176501, + "auxiliary_loss_mlp": 0.01055325, + "balance_loss_clip": 1.03445125, + "balance_loss_mlp": 1.05226421, + "epoch": 0.09776040883811814, + "flos": 23472435219840.0, + "grad_norm": 3.258883448957912, + "language_loss": 0.8539601, + "learning_rate": 3.952029580380172e-06, + "loss": 0.87627834, + "num_input_tokens_seen": 34962865, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.2421875, + "step": 1626, + "time_per_iteration": 2.4931013584136963 + }, + { + "auxiliary_loss_clip": 0.01181466, + "auxiliary_loss_mlp": 0.0105723, + "balance_loss_clip": 1.03493834, + "balance_loss_mlp": 1.05541551, + "epoch": 0.09782053209078612, + "flos": 24499480798080.0, + "grad_norm": 1.979888643217431, + "language_loss": 0.83329904, + "learning_rate": 3.9519447559168234e-06, + "loss": 0.85568601, + "num_input_tokens_seen": 34983505, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2578125, + "step": 1627, + "time_per_iteration": 2.5056917667388916 + }, + { + "auxiliary_loss_clip": 0.01170161, + "auxiliary_loss_mlp": 0.01050744, + "balance_loss_clip": 1.03065729, + "balance_loss_mlp": 1.0488416, + "epoch": 0.09788065534345408, + "flos": 21580158833280.0, + "grad_norm": 1.7873285490487296, + "language_loss": 0.84291327, + "learning_rate": 3.951859857435534e-06, + "loss": 0.86512232, + "num_input_tokens_seen": 35001825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.2109375, + "step": 1628, + "time_per_iteration": 2.4835076332092285 + }, + { + "auxiliary_loss_clip": 0.01169153, + "auxiliary_loss_mlp": 0.01052825, + "balance_loss_clip": 1.0321064, + "balance_loss_mlp": 1.04880238, + "epoch": 0.09794077859612205, + "flos": 23842459175040.0, + "grad_norm": 1.6092149858605884, + "language_loss": 0.75609362, + "learning_rate": 3.951774884939523e-06, + "loss": 0.77831334, + "num_input_tokens_seen": 35023075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1629, + "time_per_iteration": 2.4959983825683594 + }, + { + "auxiliary_loss_clip": 0.01175285, + "auxiliary_loss_mlp": 0.01046701, + "balance_loss_clip": 1.02412319, + "balance_loss_mlp": 1.0530107, + "epoch": 0.09800090184879003, + "flos": 23659889322240.0, + "grad_norm": 1.5982247062153871, + "language_loss": 0.78224194, + "learning_rate": 3.951689838432013e-06, + "loss": 0.80446172, + "num_input_tokens_seen": 35043480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1630, + "time_per_iteration": 2.5138731002807617 + }, + { + "auxiliary_loss_clip": 0.01177321, + "auxiliary_loss_mlp": 0.01051411, + "balance_loss_clip": 1.0292381, + "balance_loss_mlp": 1.05457997, + "epoch": 0.09806102510145799, + "flos": 17055773631360.0, + "grad_norm": 1.9134334701620013, + "language_loss": 0.86704385, + "learning_rate": 3.951604717916228e-06, + "loss": 0.8893311, + "num_input_tokens_seen": 35061490, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1631, + "time_per_iteration": 2.443878173828125 + }, + { + "auxiliary_loss_clip": 0.01172712, + "auxiliary_loss_mlp": 0.01050929, + "balance_loss_clip": 1.03065109, + "balance_loss_mlp": 1.05258322, + "epoch": 0.09812114835412596, + "flos": 23878477537920.0, + "grad_norm": 2.096430969489036, + "language_loss": 0.83111286, + "learning_rate": 3.9515195233953975e-06, + "loss": 0.85334921, + "num_input_tokens_seen": 35079670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1632, + "time_per_iteration": 2.478825807571411 + }, + { + "auxiliary_loss_clip": 0.01174956, + "auxiliary_loss_mlp": 0.01056552, + "balance_loss_clip": 1.0368464, + "balance_loss_mlp": 1.05281615, + "epoch": 0.09818127160679392, + "flos": 20595488325120.0, + "grad_norm": 1.5107232822128822, + "language_loss": 0.7877655, + "learning_rate": 3.951434254872751e-06, + "loss": 0.81008065, + "num_input_tokens_seen": 35099205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.21875, + "step": 1633, + "time_per_iteration": 2.447930097579956 + }, + { + "auxiliary_loss_clip": 0.01169184, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.02833819, + "balance_loss_mlp": 1.04989707, + "epoch": 0.0982413948594619, + "flos": 15487339288320.0, + "grad_norm": 2.0663591821232865, + "language_loss": 0.73159611, + "learning_rate": 3.951348912351521e-06, + "loss": 0.75378191, + "num_input_tokens_seen": 35115270, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1634, + "time_per_iteration": 2.460265636444092 + }, + { + "auxiliary_loss_clip": 0.01179893, + "auxiliary_loss_mlp": 0.01062758, + "balance_loss_clip": 1.04026294, + "balance_loss_mlp": 1.0516957, + "epoch": 0.09830151811212987, + "flos": 24207958016640.0, + "grad_norm": 2.7516342600991868, + "language_loss": 0.72714394, + "learning_rate": 3.951263495834947e-06, + "loss": 0.74957043, + "num_input_tokens_seen": 35134065, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.28125, + "step": 1635, + "time_per_iteration": 2.4835710525512695 + }, + { + "auxiliary_loss_clip": 0.01177592, + "auxiliary_loss_mlp": 0.01055297, + "balance_loss_clip": 1.03301644, + "balance_loss_mlp": 1.05253148, + "epoch": 0.09836164136479783, + "flos": 20594590485120.0, + "grad_norm": 1.8458745824258636, + "language_loss": 0.7819975, + "learning_rate": 3.951178005326264e-06, + "loss": 0.80432636, + "num_input_tokens_seen": 35154870, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.25, + "step": 1636, + "time_per_iteration": 2.53061842918396 + }, + { + "auxiliary_loss_clip": 0.01173491, + "auxiliary_loss_mlp": 0.01056847, + "balance_loss_clip": 1.03498387, + "balance_loss_mlp": 1.05113721, + "epoch": 0.09842176461746581, + "flos": 19934157070080.0, + "grad_norm": 2.2976115041381386, + "language_loss": 0.70005965, + "learning_rate": 3.951092440828715e-06, + "loss": 0.722363, + "num_input_tokens_seen": 35171850, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1637, + "time_per_iteration": 2.445742130279541 + }, + { + "auxiliary_loss_clip": 0.0117379, + "auxiliary_loss_mlp": 0.01053221, + "balance_loss_clip": 1.03175139, + "balance_loss_mlp": 1.05108416, + "epoch": 0.09848188787013377, + "flos": 21214659991680.0, + "grad_norm": 2.115587702667026, + "language_loss": 0.77395654, + "learning_rate": 3.951006802345545e-06, + "loss": 0.79622668, + "num_input_tokens_seen": 35188795, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2265625, + "step": 1638, + "time_per_iteration": 2.4725139141082764 + }, + { + "auxiliary_loss_clip": 0.01170234, + "auxiliary_loss_mlp": 0.01046096, + "balance_loss_clip": 1.02524579, + "balance_loss_mlp": 1.05077171, + "epoch": 0.09854201112280174, + "flos": 30154226071680.0, + "grad_norm": 1.4162008179950134, + "language_loss": 0.7263118, + "learning_rate": 3.950921089880003e-06, + "loss": 0.74847507, + "num_input_tokens_seen": 35212100, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1953125, + "step": 1639, + "time_per_iteration": 2.5534512996673584 + }, + { + "auxiliary_loss_clip": 0.01173162, + "auxiliary_loss_mlp": 0.01040763, + "balance_loss_clip": 1.01943696, + "balance_loss_mlp": 1.05003214, + "epoch": 0.09860213437546972, + "flos": 21795730306560.0, + "grad_norm": 1.8280373897837945, + "language_loss": 0.88669002, + "learning_rate": 3.950835303435337e-06, + "loss": 0.90882927, + "num_input_tokens_seen": 35230390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.234375, + "step": 1640, + "time_per_iteration": 2.4868786334991455 + }, + { + "auxiliary_loss_clip": 0.01173727, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.01685774, + "balance_loss_mlp": 1.05164635, + "epoch": 0.09866225762813768, + "flos": 21835555511040.0, + "grad_norm": 2.1859335509376527, + "language_loss": 0.8086108, + "learning_rate": 3.950749443014801e-06, + "loss": 0.83072555, + "num_input_tokens_seen": 35250405, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1641, + "time_per_iteration": 2.5081584453582764 + }, + { + "auxiliary_loss_clip": 0.01173536, + "auxiliary_loss_mlp": 0.01054387, + "balance_loss_clip": 1.03130805, + "balance_loss_mlp": 1.05067503, + "epoch": 0.09872238088080565, + "flos": 17599855916160.0, + "grad_norm": 2.4983515693134417, + "language_loss": 0.85826755, + "learning_rate": 3.95066350862165e-06, + "loss": 0.88054669, + "num_input_tokens_seen": 35262820, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1642, + "time_per_iteration": 2.4351255893707275 + }, + { + "auxiliary_loss_clip": 0.01177694, + "auxiliary_loss_mlp": 0.01053725, + "balance_loss_clip": 1.0326128, + "balance_loss_mlp": 1.05365527, + "epoch": 0.09878250413347361, + "flos": 27636134002560.0, + "grad_norm": 1.7421144196917664, + "language_loss": 0.80859929, + "learning_rate": 3.950577500259144e-06, + "loss": 0.83091342, + "num_input_tokens_seen": 35284490, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2421875, + "step": 1643, + "time_per_iteration": 3.9550716876983643 + }, + { + "auxiliary_loss_clip": 0.01170472, + "auxiliary_loss_mlp": 0.01063125, + "balance_loss_clip": 1.04138088, + "balance_loss_mlp": 1.0494256, + "epoch": 0.0988426273861416, + "flos": 16544728880640.0, + "grad_norm": 1.9624417465121429, + "language_loss": 0.8262763, + "learning_rate": 3.950491417930543e-06, + "loss": 0.84861231, + "num_input_tokens_seen": 35302815, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1644, + "time_per_iteration": 3.8253817558288574 + }, + { + "auxiliary_loss_clip": 0.01169448, + "auxiliary_loss_mlp": 0.01048566, + "balance_loss_clip": 1.02733469, + "balance_loss_mlp": 1.05048347, + "epoch": 0.09890275063880956, + "flos": 21215270522880.0, + "grad_norm": 1.7099323885745632, + "language_loss": 0.6819675, + "learning_rate": 3.9504052616391124e-06, + "loss": 0.70414758, + "num_input_tokens_seen": 35321175, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1953125, + "step": 1645, + "time_per_iteration": 2.4549567699432373 + }, + { + "auxiliary_loss_clip": 0.01065531, + "auxiliary_loss_mlp": 0.01023286, + "balance_loss_clip": 1.0206517, + "balance_loss_mlp": 1.01924491, + "epoch": 0.09896287389147752, + "flos": 59379372910080.0, + "grad_norm": 0.9514884974425206, + "language_loss": 0.60854232, + "learning_rate": 3.950319031388119e-06, + "loss": 0.62943053, + "num_input_tokens_seen": 35381740, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.46289062, + "step": 1646, + "time_per_iteration": 2.9953765869140625 + }, + { + "auxiliary_loss_clip": 0.01170253, + "auxiliary_loss_mlp": 0.01049996, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04880357, + "epoch": 0.0990229971441455, + "flos": 29642678530560.0, + "grad_norm": 2.5496486678231425, + "language_loss": 0.73046064, + "learning_rate": 3.950232727180833e-06, + "loss": 0.75266314, + "num_input_tokens_seen": 35403760, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.2109375, + "step": 1647, + "time_per_iteration": 2.5241641998291016 + }, + { + "auxiliary_loss_clip": 0.01171762, + "auxiliary_loss_mlp": 0.0105645, + "balance_loss_clip": 1.03663731, + "balance_loss_mlp": 1.04955053, + "epoch": 0.09908312039681347, + "flos": 21834873152640.0, + "grad_norm": 1.8237647662791463, + "language_loss": 0.84120429, + "learning_rate": 3.950146349020525e-06, + "loss": 0.86348635, + "num_input_tokens_seen": 35424050, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.21875, + "step": 1648, + "time_per_iteration": 2.467717170715332 + }, + { + "auxiliary_loss_clip": 0.01061152, + "auxiliary_loss_mlp": 0.01009658, + "balance_loss_clip": 1.00701165, + "balance_loss_mlp": 1.0159142, + "epoch": 0.09914324364948143, + "flos": 57564304807680.0, + "grad_norm": 0.7437092318732932, + "language_loss": 0.55674303, + "learning_rate": 3.950059896910473e-06, + "loss": 0.57745123, + "num_input_tokens_seen": 35481690, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.453125, + "step": 1649, + "time_per_iteration": 2.99874210357666 + }, + { + "auxiliary_loss_clip": 0.01165781, + "auxiliary_loss_mlp": 0.01046657, + "balance_loss_clip": 1.02598572, + "balance_loss_mlp": 1.04597533, + "epoch": 0.09920336690214941, + "flos": 34123934476800.0, + "grad_norm": 2.284847215884091, + "language_loss": 0.89930248, + "learning_rate": 3.949973370853954e-06, + "loss": 0.92142689, + "num_input_tokens_seen": 35498635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1650, + "time_per_iteration": 2.5542855262756348 + }, + { + "auxiliary_loss_clip": 0.01058202, + "auxiliary_loss_mlp": 0.01003693, + "balance_loss_clip": 1.00105858, + "balance_loss_mlp": 1.01395106, + "epoch": 0.09926349015481738, + "flos": 71216428464000.0, + "grad_norm": 0.8031298543824162, + "language_loss": 0.63733649, + "learning_rate": 3.94988677085425e-06, + "loss": 0.65795547, + "num_input_tokens_seen": 35565720, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.44140625, + "step": 1651, + "time_per_iteration": 3.217806100845337 + }, + { + "auxiliary_loss_clip": 0.01168872, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03318655, + "balance_loss_mlp": 1.04885435, + "epoch": 0.09932361340748534, + "flos": 23148700917120.0, + "grad_norm": 1.9462006377707899, + "language_loss": 0.88288587, + "learning_rate": 3.949800096914643e-06, + "loss": 0.90512443, + "num_input_tokens_seen": 35586000, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1652, + "time_per_iteration": 2.5014448165893555 + }, + { + "auxiliary_loss_clip": 0.01174376, + "auxiliary_loss_mlp": 0.01057611, + "balance_loss_clip": 1.03692842, + "balance_loss_mlp": 1.05190849, + "epoch": 0.09938373666015332, + "flos": 19828651847040.0, + "grad_norm": 1.9500387106757973, + "language_loss": 0.82206833, + "learning_rate": 3.949713349038422e-06, + "loss": 0.84438825, + "num_input_tokens_seen": 35604355, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.2265625, + "step": 1653, + "time_per_iteration": 2.4881839752197266 + }, + { + "auxiliary_loss_clip": 0.01172582, + "auxiliary_loss_mlp": 0.010545, + "balance_loss_clip": 1.03330469, + "balance_loss_mlp": 1.04984093, + "epoch": 0.09944385991282129, + "flos": 22090664880000.0, + "grad_norm": 2.0314065071494136, + "language_loss": 0.79399735, + "learning_rate": 3.949626527228875e-06, + "loss": 0.81626815, + "num_input_tokens_seen": 35625495, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2265625, + "step": 1654, + "time_per_iteration": 2.5269205570220947 + }, + { + "auxiliary_loss_clip": 0.01167439, + "auxiliary_loss_mlp": 0.01055854, + "balance_loss_clip": 1.03700721, + "balance_loss_mlp": 1.05072093, + "epoch": 0.09950398316548925, + "flos": 19828867328640.0, + "grad_norm": 1.5637423809135174, + "language_loss": 0.8088094, + "learning_rate": 3.949539631489295e-06, + "loss": 0.83104229, + "num_input_tokens_seen": 35645030, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.171875, + "step": 1655, + "time_per_iteration": 2.4652602672576904 + }, + { + "auxiliary_loss_clip": 0.01167369, + "auxiliary_loss_mlp": 0.0105576, + "balance_loss_clip": 1.03495777, + "balance_loss_mlp": 1.04891443, + "epoch": 0.09956410641815722, + "flos": 25003701964800.0, + "grad_norm": 1.9082198159511756, + "language_loss": 0.80947387, + "learning_rate": 3.9494526618229765e-06, + "loss": 0.83170521, + "num_input_tokens_seen": 35664305, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1656, + "time_per_iteration": 2.4966416358947754 + }, + { + "auxiliary_loss_clip": 0.01170477, + "auxiliary_loss_mlp": 0.0106116, + "balance_loss_clip": 1.04066813, + "balance_loss_mlp": 1.05147541, + "epoch": 0.0996242296708252, + "flos": 19317714837120.0, + "grad_norm": 1.6268850155063674, + "language_loss": 0.88850212, + "learning_rate": 3.949365618233217e-06, + "loss": 0.91081852, + "num_input_tokens_seen": 35684060, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1875, + "step": 1657, + "time_per_iteration": 2.446124792098999 + }, + { + "auxiliary_loss_clip": 0.01175951, + "auxiliary_loss_mlp": 0.01063236, + "balance_loss_clip": 1.04088378, + "balance_loss_mlp": 1.05091214, + "epoch": 0.09968435292349316, + "flos": 21871609787520.0, + "grad_norm": 2.0057694643168302, + "language_loss": 0.84758937, + "learning_rate": 3.9492785007233195e-06, + "loss": 0.86998123, + "num_input_tokens_seen": 35703250, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.25, + "step": 1658, + "time_per_iteration": 2.457902669906616 + }, + { + "auxiliary_loss_clip": 0.01054631, + "auxiliary_loss_mlp": 0.01077242, + "balance_loss_clip": 1.07460773, + "balance_loss_mlp": 1.0110395, + "epoch": 0.09974447617616113, + "flos": 65384533313280.0, + "grad_norm": 0.9153195332104517, + "language_loss": 0.60843968, + "learning_rate": 3.949191309296585e-06, + "loss": 0.62975848, + "num_input_tokens_seen": 35762165, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1659, + "time_per_iteration": 3.077805519104004 + }, + { + "auxiliary_loss_clip": 0.01170517, + "auxiliary_loss_mlp": 0.01052954, + "balance_loss_clip": 1.03155613, + "balance_loss_mlp": 1.04999721, + "epoch": 0.0998045994288291, + "flos": 23659817495040.0, + "grad_norm": 1.8691655756599186, + "language_loss": 0.85116851, + "learning_rate": 3.949104043956321e-06, + "loss": 0.87340325, + "num_input_tokens_seen": 35781520, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.2109375, + "step": 1660, + "time_per_iteration": 2.49082612991333 + }, + { + "auxiliary_loss_clip": 0.01171003, + "auxiliary_loss_mlp": 0.01056184, + "balance_loss_clip": 1.03393948, + "balance_loss_mlp": 1.05291247, + "epoch": 0.09986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.130922035700174, + "language_loss": 0.80037123, + "learning_rate": 3.949016704705836e-06, + "loss": 0.8226431, + "num_input_tokens_seen": 35799565, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1796875, + "step": 1661, + "time_per_iteration": 2.4412636756896973 + }, + { + "auxiliary_loss_clip": 0.01175671, + "auxiliary_loss_mlp": 0.01050112, + "balance_loss_clip": 1.02801085, + "balance_loss_mlp": 1.05002224, + "epoch": 0.09992484593416504, + "flos": 26213317395840.0, + "grad_norm": 1.8939661728963775, + "language_loss": 0.83592767, + "learning_rate": 3.948929291548443e-06, + "loss": 0.85818553, + "num_input_tokens_seen": 35821085, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2578125, + "step": 1662, + "time_per_iteration": 2.5200328826904297 + }, + { + "auxiliary_loss_clip": 0.01171098, + "auxiliary_loss_mlp": 0.01052384, + "balance_loss_clip": 1.02972281, + "balance_loss_mlp": 1.05104828, + "epoch": 0.09998496918683301, + "flos": 17493632421120.0, + "grad_norm": 2.1063962968477, + "language_loss": 0.88696563, + "learning_rate": 3.9488418044874546e-06, + "loss": 0.90920055, + "num_input_tokens_seen": 35839840, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1663, + "time_per_iteration": 2.42790150642395 + }, + { + "auxiliary_loss_clip": 0.01174901, + "auxiliary_loss_mlp": 0.01052956, + "balance_loss_clip": 1.03084326, + "balance_loss_mlp": 1.05225635, + "epoch": 0.10004509243950098, + "flos": 22784925928320.0, + "grad_norm": 1.6888490247303796, + "language_loss": 0.7034179, + "learning_rate": 3.948754243526191e-06, + "loss": 0.72569644, + "num_input_tokens_seen": 35861545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1664, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01173831, + "auxiliary_loss_mlp": 0.01050685, + "balance_loss_clip": 1.02903676, + "balance_loss_mlp": 1.0535655, + "epoch": 0.10010521569216894, + "flos": 16253385667200.0, + "grad_norm": 2.1773983349048804, + "language_loss": 0.7878316, + "learning_rate": 3.94866660866797e-06, + "loss": 0.81007671, + "num_input_tokens_seen": 35878295, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1665, + "time_per_iteration": 2.4271252155303955 + }, + { + "auxiliary_loss_clip": 0.0117847, + "auxiliary_loss_mlp": 0.01061559, + "balance_loss_clip": 1.0404706, + "balance_loss_mlp": 1.05681181, + "epoch": 0.10016533894483691, + "flos": 23402589223680.0, + "grad_norm": 1.663243771388797, + "language_loss": 0.70152062, + "learning_rate": 3.9485788999161165e-06, + "loss": 0.72392094, + "num_input_tokens_seen": 35898990, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.21875, + "step": 1666, + "time_per_iteration": 2.499131202697754 + }, + { + "auxiliary_loss_clip": 0.01173729, + "auxiliary_loss_mlp": 0.01060981, + "balance_loss_clip": 1.03777063, + "balance_loss_mlp": 1.0506525, + "epoch": 0.10022546219750489, + "flos": 19354164163200.0, + "grad_norm": 1.8121915129470096, + "language_loss": 0.791031, + "learning_rate": 3.948491117273956e-06, + "loss": 0.8133781, + "num_input_tokens_seen": 35916225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.234375, + "step": 1667, + "time_per_iteration": 2.4429264068603516 + }, + { + "auxiliary_loss_clip": 0.0117317, + "auxiliary_loss_mlp": 0.01050651, + "balance_loss_clip": 1.02810836, + "balance_loss_mlp": 1.05261493, + "epoch": 0.10028558545017285, + "flos": 27085766837760.0, + "grad_norm": 2.9507555712476945, + "language_loss": 0.7715596, + "learning_rate": 3.948403260744817e-06, + "loss": 0.79379785, + "num_input_tokens_seen": 35934630, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.203125, + "step": 1668, + "time_per_iteration": 2.5223031044006348 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03434563, + "balance_loss_mlp": 1.05256963, + "epoch": 0.10034570870284082, + "flos": 25847136195840.0, + "grad_norm": 1.9809152554972944, + "language_loss": 0.77852714, + "learning_rate": 3.948315330332031e-06, + "loss": 0.80083561, + "num_input_tokens_seen": 35953855, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2109375, + "step": 1669, + "time_per_iteration": 2.5082881450653076 + }, + { + "auxiliary_loss_clip": 0.01181618, + "auxiliary_loss_mlp": 0.01059972, + "balance_loss_clip": 1.03641593, + "balance_loss_mlp": 1.05464602, + "epoch": 0.1004058319555088, + "flos": 26249587153920.0, + "grad_norm": 2.145889566444559, + "language_loss": 0.85461181, + "learning_rate": 3.948227326038933e-06, + "loss": 0.87702769, + "num_input_tokens_seen": 35974555, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.2734375, + "step": 1670, + "time_per_iteration": 2.5235135555267334 + }, + { + "auxiliary_loss_clip": 0.01166248, + "auxiliary_loss_mlp": 0.01057789, + "balance_loss_clip": 1.03681993, + "balance_loss_mlp": 1.0501771, + "epoch": 0.10046595520817676, + "flos": 25374480105600.0, + "grad_norm": 1.5986093935623644, + "language_loss": 0.76899171, + "learning_rate": 3.9481392478688586e-06, + "loss": 0.79123211, + "num_input_tokens_seen": 35996830, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1671, + "time_per_iteration": 2.505441665649414 + }, + { + "auxiliary_loss_clip": 0.01059926, + "auxiliary_loss_mlp": 0.01022857, + "balance_loss_clip": 1.02019823, + "balance_loss_mlp": 1.01598763, + "epoch": 0.10052607846084473, + "flos": 67461821677440.0, + "grad_norm": 0.7900846916321359, + "language_loss": 0.60719293, + "learning_rate": 3.948051095825149e-06, + "loss": 0.62802076, + "num_input_tokens_seen": 36054465, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.43945312, + "step": 1672, + "time_per_iteration": 3.07255482673645 + }, + { + "auxiliary_loss_clip": 0.01173395, + "auxiliary_loss_mlp": 0.01064348, + "balance_loss_clip": 1.04179382, + "balance_loss_mlp": 1.05045998, + "epoch": 0.10058620171351271, + "flos": 21360493209600.0, + "grad_norm": 2.0407855091156377, + "language_loss": 0.77119517, + "learning_rate": 3.947962869911147e-06, + "loss": 0.79357255, + "num_input_tokens_seen": 36073480, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.234375, + "step": 1673, + "time_per_iteration": 2.4693222045898438 + }, + { + "auxiliary_loss_clip": 0.01171478, + "auxiliary_loss_mlp": 0.01052114, + "balance_loss_clip": 1.03066778, + "balance_loss_mlp": 1.04964709, + "epoch": 0.10064632496618067, + "flos": 16800125558400.0, + "grad_norm": 2.2570599367002835, + "language_loss": 0.72829556, + "learning_rate": 3.947874570130197e-06, + "loss": 0.75053144, + "num_input_tokens_seen": 36091830, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1674, + "time_per_iteration": 2.4534130096435547 + }, + { + "auxiliary_loss_clip": 0.01170516, + "auxiliary_loss_mlp": 0.01051148, + "balance_loss_clip": 1.03047729, + "balance_loss_mlp": 1.04903197, + "epoch": 0.10070644821884864, + "flos": 23624445576960.0, + "grad_norm": 2.043409325490185, + "language_loss": 0.79386973, + "learning_rate": 3.947786196485649e-06, + "loss": 0.81608635, + "num_input_tokens_seen": 36111400, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.21875, + "step": 1675, + "time_per_iteration": 2.496504545211792 + }, + { + "auxiliary_loss_clip": 0.01168157, + "auxiliary_loss_mlp": 0.01064762, + "balance_loss_clip": 1.04449606, + "balance_loss_mlp": 1.04908013, + "epoch": 0.1007665714715166, + "flos": 24462564595200.0, + "grad_norm": 2.0305638084579294, + "language_loss": 0.81565315, + "learning_rate": 3.947697748980853e-06, + "loss": 0.8379823, + "num_input_tokens_seen": 36129345, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1676, + "time_per_iteration": 2.5022919178009033 + }, + { + "auxiliary_loss_clip": 0.01174395, + "auxiliary_loss_mlp": 0.01058603, + "balance_loss_clip": 1.03713369, + "balance_loss_mlp": 1.05283856, + "epoch": 0.10082669472418458, + "flos": 16799119977600.0, + "grad_norm": 2.134524944411931, + "language_loss": 0.86155027, + "learning_rate": 3.947609227619163e-06, + "loss": 0.88388026, + "num_input_tokens_seen": 36146255, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.2109375, + "step": 1677, + "time_per_iteration": 2.44887113571167 + }, + { + "auxiliary_loss_clip": 0.01171962, + "auxiliary_loss_mlp": 0.01055328, + "balance_loss_clip": 1.03452563, + "balance_loss_mlp": 1.05113602, + "epoch": 0.10088681797685255, + "flos": 13553513844480.0, + "grad_norm": 5.349815535910457, + "language_loss": 0.86318195, + "learning_rate": 3.947520632403936e-06, + "loss": 0.88545489, + "num_input_tokens_seen": 36164050, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2109375, + "step": 1678, + "time_per_iteration": 2.4373903274536133 + }, + { + "auxiliary_loss_clip": 0.01172423, + "auxiliary_loss_mlp": 0.01055078, + "balance_loss_clip": 1.03359675, + "balance_loss_mlp": 1.05214512, + "epoch": 0.10094694122952051, + "flos": 25265706744960.0, + "grad_norm": 2.6897314721028867, + "language_loss": 0.89726269, + "learning_rate": 3.947431963338532e-06, + "loss": 0.91953766, + "num_input_tokens_seen": 36183530, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1679, + "time_per_iteration": 2.493959903717041 + }, + { + "auxiliary_loss_clip": 0.01056795, + "auxiliary_loss_mlp": 0.01017317, + "balance_loss_clip": 1.01468229, + "balance_loss_mlp": 1.01327634, + "epoch": 0.10100706448218849, + "flos": 69854299885440.0, + "grad_norm": 0.7831657514235874, + "language_loss": 0.53018153, + "learning_rate": 3.947343220426312e-06, + "loss": 0.55092263, + "num_input_tokens_seen": 36248550, + "router_z_loss_clip": 0.02636719, + "router_z_loss_mlp": 0.43554688, + "step": 1680, + "time_per_iteration": 3.15899658203125 + }, + { + "auxiliary_loss_clip": 0.01168402, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03429151, + "balance_loss_mlp": 1.04983318, + "epoch": 0.10106718773485646, + "flos": 20007163463040.0, + "grad_norm": 1.657625192327098, + "language_loss": 0.76889706, + "learning_rate": 3.947254403670641e-06, + "loss": 0.79113436, + "num_input_tokens_seen": 36266065, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1681, + "time_per_iteration": 2.446937322616577 + }, + { + "auxiliary_loss_clip": 0.01175341, + "auxiliary_loss_mlp": 0.01058478, + "balance_loss_clip": 1.03423131, + "balance_loss_mlp": 1.04937744, + "epoch": 0.10112731098752442, + "flos": 13479825093120.0, + "grad_norm": 2.135292201068385, + "language_loss": 0.93928307, + "learning_rate": 3.947165513074889e-06, + "loss": 0.96162128, + "num_input_tokens_seen": 36280960, + "router_z_loss_clip": 0.2421875, + "router_z_loss_mlp": 1.2578125, + "step": 1682, + "time_per_iteration": 2.4357759952545166 + }, + { + "auxiliary_loss_clip": 0.01172101, + "auxiliary_loss_mlp": 0.01053977, + "balance_loss_clip": 1.03315091, + "balance_loss_mlp": 1.05045152, + "epoch": 0.1011874342401924, + "flos": 18515901490560.0, + "grad_norm": 5.112669241194533, + "language_loss": 0.87866408, + "learning_rate": 3.947076548642425e-06, + "loss": 0.90092492, + "num_input_tokens_seen": 36299010, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1683, + "time_per_iteration": 2.427802562713623 + }, + { + "auxiliary_loss_clip": 0.01169341, + "auxiliary_loss_mlp": 0.01059869, + "balance_loss_clip": 1.03888798, + "balance_loss_mlp": 1.05144525, + "epoch": 0.10124755749286037, + "flos": 20702861055360.0, + "grad_norm": 1.7718228637860187, + "language_loss": 0.74768114, + "learning_rate": 3.946987510376624e-06, + "loss": 0.76997328, + "num_input_tokens_seen": 36318400, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1796875, + "step": 1684, + "time_per_iteration": 5.332470417022705 + }, + { + "auxiliary_loss_clip": 0.01059522, + "auxiliary_loss_mlp": 0.01011499, + "balance_loss_clip": 1.00863802, + "balance_loss_mlp": 1.01624751, + "epoch": 0.10130768074552833, + "flos": 56109456247680.0, + "grad_norm": 0.760003339390084, + "language_loss": 0.61090153, + "learning_rate": 3.9468983982808615e-06, + "loss": 0.6316117, + "num_input_tokens_seen": 36381815, + "router_z_loss_clip": 0.02856445, + "router_z_loss_mlp": 0.43359375, + "step": 1685, + "time_per_iteration": 4.508171081542969 + }, + { + "auxiliary_loss_clip": 0.01169013, + "auxiliary_loss_mlp": 0.01049359, + "balance_loss_clip": 1.02769828, + "balance_loss_mlp": 1.04891801, + "epoch": 0.1013678039981963, + "flos": 33402346156800.0, + "grad_norm": 2.3224629698824075, + "language_loss": 0.61664945, + "learning_rate": 3.946809212358516e-06, + "loss": 0.63883317, + "num_input_tokens_seen": 36404320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1686, + "time_per_iteration": 2.586596965789795 + }, + { + "auxiliary_loss_clip": 0.01173787, + "auxiliary_loss_mlp": 0.01054454, + "balance_loss_clip": 1.03238797, + "balance_loss_mlp": 1.0545882, + "epoch": 0.10142792725086427, + "flos": 31905338008320.0, + "grad_norm": 2.1992592502117443, + "language_loss": 0.81408226, + "learning_rate": 3.946719952612972e-06, + "loss": 0.83636469, + "num_input_tokens_seen": 36427510, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1687, + "time_per_iteration": 2.5495810508728027 + }, + { + "auxiliary_loss_clip": 0.01173812, + "auxiliary_loss_mlp": 0.01051846, + "balance_loss_clip": 1.03055501, + "balance_loss_mlp": 1.0514555, + "epoch": 0.10148805050353224, + "flos": 28475905046400.0, + "grad_norm": 1.783489688966995, + "language_loss": 0.72360015, + "learning_rate": 3.94663061904761e-06, + "loss": 0.74585676, + "num_input_tokens_seen": 36448230, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1688, + "time_per_iteration": 2.5242748260498047 + }, + { + "auxiliary_loss_clip": 0.01169898, + "auxiliary_loss_mlp": 0.01054433, + "balance_loss_clip": 1.03264165, + "balance_loss_mlp": 1.05043888, + "epoch": 0.1015481737562002, + "flos": 25148888737920.0, + "grad_norm": 1.9893327907397977, + "language_loss": 0.86880058, + "learning_rate": 3.94654121166582e-06, + "loss": 0.8910439, + "num_input_tokens_seen": 36464395, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1875, + "step": 1689, + "time_per_iteration": 2.5283408164978027 + }, + { + "auxiliary_loss_clip": 0.01165961, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02585626, + "balance_loss_mlp": 1.04692245, + "epoch": 0.10160829700886818, + "flos": 30882781630080.0, + "grad_norm": 1.8972643802531153, + "language_loss": 0.88054395, + "learning_rate": 3.946451730470993e-06, + "loss": 0.90265882, + "num_input_tokens_seen": 36486475, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1690, + "time_per_iteration": 2.5732247829437256 + }, + { + "auxiliary_loss_clip": 0.01170509, + "auxiliary_loss_mlp": 0.01051598, + "balance_loss_clip": 1.02961624, + "balance_loss_mlp": 1.04965854, + "epoch": 0.10166842026153615, + "flos": 20412020632320.0, + "grad_norm": 1.8841763324380914, + "language_loss": 0.83124495, + "learning_rate": 3.946362175466521e-06, + "loss": 0.85346603, + "num_input_tokens_seen": 36505310, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.203125, + "step": 1691, + "time_per_iteration": 2.453263282775879 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01050561, + "balance_loss_clip": 1.028579, + "balance_loss_mlp": 1.05049825, + "epoch": 0.10172854351420411, + "flos": 33476968661760.0, + "grad_norm": 1.648035623213742, + "language_loss": 0.66938514, + "learning_rate": 3.946272546655801e-06, + "loss": 0.69161713, + "num_input_tokens_seen": 36529820, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1692, + "time_per_iteration": 2.5865867137908936 + }, + { + "auxiliary_loss_clip": 0.01167535, + "auxiliary_loss_mlp": 0.01067279, + "balance_loss_clip": 1.04540372, + "balance_loss_mlp": 1.0471102, + "epoch": 0.1017886667668721, + "flos": 23550325862400.0, + "grad_norm": 1.649284734670808, + "language_loss": 0.75387824, + "learning_rate": 3.94618284404223e-06, + "loss": 0.77622634, + "num_input_tokens_seen": 36549000, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1693, + "time_per_iteration": 2.499476194381714 + }, + { + "auxiliary_loss_clip": 0.01171507, + "auxiliary_loss_mlp": 0.01049517, + "balance_loss_clip": 1.02685595, + "balance_loss_mlp": 1.04984784, + "epoch": 0.10184879001954006, + "flos": 23296078419840.0, + "grad_norm": 1.6930931596653784, + "language_loss": 0.87206519, + "learning_rate": 3.9460930676292105e-06, + "loss": 0.89427543, + "num_input_tokens_seen": 36567515, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.21875, + "step": 1694, + "time_per_iteration": 2.483264923095703 + }, + { + "auxiliary_loss_clip": 0.01177185, + "auxiliary_loss_mlp": 0.01052768, + "balance_loss_clip": 1.03013015, + "balance_loss_mlp": 1.05056214, + "epoch": 0.10190891327220802, + "flos": 18333116156160.0, + "grad_norm": 3.1999162319303274, + "language_loss": 0.79579329, + "learning_rate": 3.946003217420147e-06, + "loss": 0.81809288, + "num_input_tokens_seen": 36586190, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.265625, + "step": 1695, + "time_per_iteration": 2.4574177265167236 + }, + { + "auxiliary_loss_clip": 0.01168528, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03280592, + "balance_loss_mlp": 1.04648614, + "epoch": 0.10196903652487599, + "flos": 26465374108800.0, + "grad_norm": 1.7546035908378184, + "language_loss": 0.86581397, + "learning_rate": 3.945913293418447e-06, + "loss": 0.88805294, + "num_input_tokens_seen": 36607495, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1696, + "time_per_iteration": 2.4986772537231445 + }, + { + "auxiliary_loss_clip": 0.01168623, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03532469, + "balance_loss_mlp": 1.04927731, + "epoch": 0.10202915977754397, + "flos": 21869526798720.0, + "grad_norm": 1.97196247739744, + "language_loss": 0.82034266, + "learning_rate": 3.945823295627519e-06, + "loss": 0.84259629, + "num_input_tokens_seen": 36628555, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1953125, + "step": 1697, + "time_per_iteration": 2.483682155609131 + }, + { + "auxiliary_loss_clip": 0.01170239, + "auxiliary_loss_mlp": 0.0104937, + "balance_loss_clip": 1.02674437, + "balance_loss_mlp": 1.0477041, + "epoch": 0.10208928303021193, + "flos": 22309755886080.0, + "grad_norm": 1.9483747561194416, + "language_loss": 0.80650747, + "learning_rate": 3.9457332240507775e-06, + "loss": 0.82870358, + "num_input_tokens_seen": 36646250, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2265625, + "step": 1698, + "time_per_iteration": 2.4512858390808105 + }, + { + "auxiliary_loss_clip": 0.01172882, + "auxiliary_loss_mlp": 0.01048738, + "balance_loss_clip": 1.02756608, + "balance_loss_mlp": 1.05113077, + "epoch": 0.1021494062828799, + "flos": 22125569921280.0, + "grad_norm": 4.641294823605382, + "language_loss": 0.75680709, + "learning_rate": 3.945643078691637e-06, + "loss": 0.77902329, + "num_input_tokens_seen": 36666675, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1699, + "time_per_iteration": 2.458315849304199 + }, + { + "auxiliary_loss_clip": 0.01171952, + "auxiliary_loss_mlp": 0.01048121, + "balance_loss_clip": 1.02606726, + "balance_loss_mlp": 1.05093145, + "epoch": 0.10220952953554788, + "flos": 19646728439040.0, + "grad_norm": 1.7623204527071121, + "language_loss": 0.79777479, + "learning_rate": 3.945552859553516e-06, + "loss": 0.81997555, + "num_input_tokens_seen": 36685225, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 1700, + "time_per_iteration": 2.4692423343658447 + }, + { + "auxiliary_loss_clip": 0.01170922, + "auxiliary_loss_mlp": 0.01045823, + "balance_loss_clip": 1.02411532, + "balance_loss_mlp": 1.04850125, + "epoch": 0.10226965278821584, + "flos": 29787290686080.0, + "grad_norm": 1.8827887870563835, + "language_loss": 0.76854098, + "learning_rate": 3.945462566639836e-06, + "loss": 0.79070842, + "num_input_tokens_seen": 36705985, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1701, + "time_per_iteration": 2.5015852451324463 + }, + { + "auxiliary_loss_clip": 0.01176415, + "auxiliary_loss_mlp": 0.01048843, + "balance_loss_clip": 1.02708709, + "balance_loss_mlp": 1.05213511, + "epoch": 0.10232977604088381, + "flos": 27016818681600.0, + "grad_norm": 2.1180628790190927, + "language_loss": 0.78123891, + "learning_rate": 3.945372199954019e-06, + "loss": 0.80349147, + "num_input_tokens_seen": 36725815, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2421875, + "step": 1702, + "time_per_iteration": 2.4999852180480957 + }, + { + "auxiliary_loss_clip": 0.01168217, + "auxiliary_loss_mlp": 0.01046251, + "balance_loss_clip": 1.02586651, + "balance_loss_mlp": 1.0487566, + "epoch": 0.10238989929355179, + "flos": 20777519473920.0, + "grad_norm": 2.3091523831758765, + "language_loss": 0.94838184, + "learning_rate": 3.945281759499494e-06, + "loss": 0.97052652, + "num_input_tokens_seen": 36742345, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1953125, + "step": 1703, + "time_per_iteration": 2.4586100578308105 + }, + { + "auxiliary_loss_clip": 0.01058115, + "auxiliary_loss_mlp": 0.01013234, + "balance_loss_clip": 1.01077867, + "balance_loss_mlp": 1.01462317, + "epoch": 0.10245002254621975, + "flos": 57698322451200.0, + "grad_norm": 0.8800585598511617, + "language_loss": 0.55092424, + "learning_rate": 3.94519124527969e-06, + "loss": 0.57163775, + "num_input_tokens_seen": 36798775, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43554688, + "step": 1704, + "time_per_iteration": 2.998384952545166 + }, + { + "auxiliary_loss_clip": 0.01170706, + "auxiliary_loss_mlp": 0.01050153, + "balance_loss_clip": 1.02790844, + "balance_loss_mlp": 1.04962945, + "epoch": 0.10251014579888772, + "flos": 16800125558400.0, + "grad_norm": 3.5257555777633174, + "language_loss": 0.83979154, + "learning_rate": 3.945100657298039e-06, + "loss": 0.86200017, + "num_input_tokens_seen": 36816295, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.2109375, + "step": 1705, + "time_per_iteration": 2.4242281913757324 + }, + { + "auxiliary_loss_clip": 0.01057951, + "auxiliary_loss_mlp": 0.01005039, + "balance_loss_clip": 1.00258374, + "balance_loss_mlp": 1.01514411, + "epoch": 0.1025702690515557, + "flos": 68565500922240.0, + "grad_norm": 0.7733309182053202, + "language_loss": 0.60434854, + "learning_rate": 3.9450099955579765e-06, + "loss": 0.62497854, + "num_input_tokens_seen": 36882030, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.4296875, + "step": 1706, + "time_per_iteration": 3.127495765686035 + }, + { + "auxiliary_loss_clip": 0.01175774, + "auxiliary_loss_mlp": 0.01050349, + "balance_loss_clip": 1.02876019, + "balance_loss_mlp": 1.05214357, + "epoch": 0.10263039230422366, + "flos": 14866623336960.0, + "grad_norm": 2.0444921886168284, + "language_loss": 0.85967243, + "learning_rate": 3.94491926006294e-06, + "loss": 0.88193369, + "num_input_tokens_seen": 36899245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.234375, + "step": 1707, + "time_per_iteration": 2.4486777782440186 + }, + { + "auxiliary_loss_clip": 0.01169845, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.02654099, + "balance_loss_mlp": 1.04891372, + "epoch": 0.10269051555689163, + "flos": 25337599816320.0, + "grad_norm": 1.6368034329364625, + "language_loss": 0.72840983, + "learning_rate": 3.944828450816369e-06, + "loss": 0.75057685, + "num_input_tokens_seen": 36920950, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.2109375, + "step": 1708, + "time_per_iteration": 2.5019850730895996 + }, + { + "auxiliary_loss_clip": 0.01168702, + "auxiliary_loss_mlp": 0.01054619, + "balance_loss_clip": 1.0325532, + "balance_loss_mlp": 1.0493356, + "epoch": 0.10275063880955959, + "flos": 21068826773760.0, + "grad_norm": 1.9016884094819633, + "language_loss": 0.90944314, + "learning_rate": 3.944737567821709e-06, + "loss": 0.93167639, + "num_input_tokens_seen": 36938900, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1953125, + "step": 1709, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01173643, + "auxiliary_loss_mlp": 0.01055032, + "balance_loss_clip": 1.03357422, + "balance_loss_mlp": 1.05296373, + "epoch": 0.10281076206222757, + "flos": 30366780802560.0, + "grad_norm": 3.826538703219267, + "language_loss": 0.8828221, + "learning_rate": 3.944646611082406e-06, + "loss": 0.90510881, + "num_input_tokens_seen": 36957010, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.203125, + "step": 1710, + "time_per_iteration": 2.533165216445923 + }, + { + "auxiliary_loss_clip": 0.01167248, + "auxiliary_loss_mlp": 0.01053637, + "balance_loss_clip": 1.03229809, + "balance_loss_mlp": 1.04937959, + "epoch": 0.10287088531489554, + "flos": 22418313765120.0, + "grad_norm": 1.824520485293549, + "language_loss": 0.79264998, + "learning_rate": 3.944555580601908e-06, + "loss": 0.81485879, + "num_input_tokens_seen": 36977690, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 1711, + "time_per_iteration": 2.4947102069854736 + }, + { + "auxiliary_loss_clip": 0.01171963, + "auxiliary_loss_mlp": 0.01058195, + "balance_loss_clip": 1.03615332, + "balance_loss_mlp": 1.05005431, + "epoch": 0.1029310085675635, + "flos": 25115994858240.0, + "grad_norm": 2.0689984646996016, + "language_loss": 0.73589319, + "learning_rate": 3.944464476383668e-06, + "loss": 0.7581948, + "num_input_tokens_seen": 36997300, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1712, + "time_per_iteration": 2.521899461746216 + }, + { + "auxiliary_loss_clip": 0.01166438, + "auxiliary_loss_mlp": 0.01058704, + "balance_loss_clip": 1.03902221, + "balance_loss_mlp": 1.04961872, + "epoch": 0.10299113182023148, + "flos": 19865639877120.0, + "grad_norm": 1.8460865361447714, + "language_loss": 0.86673403, + "learning_rate": 3.94437329843114e-06, + "loss": 0.8889854, + "num_input_tokens_seen": 37016110, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1640625, + "step": 1713, + "time_per_iteration": 2.467824935913086 + }, + { + "auxiliary_loss_clip": 0.01166539, + "auxiliary_loss_mlp": 0.01059926, + "balance_loss_clip": 1.04019666, + "balance_loss_mlp": 1.04741335, + "epoch": 0.10305125507289944, + "flos": 20447608032000.0, + "grad_norm": 2.6691144860495126, + "language_loss": 0.72610664, + "learning_rate": 3.944282046747782e-06, + "loss": 0.74837124, + "num_input_tokens_seen": 37036405, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1714, + "time_per_iteration": 2.478605031967163 + }, + { + "auxiliary_loss_clip": 0.0117345, + "auxiliary_loss_mlp": 0.01057893, + "balance_loss_clip": 1.03542209, + "balance_loss_mlp": 1.04920006, + "epoch": 0.10311137832556741, + "flos": 26250772302720.0, + "grad_norm": 2.3323118637090605, + "language_loss": 0.91395295, + "learning_rate": 3.944190721337053e-06, + "loss": 0.93626636, + "num_input_tokens_seen": 37057580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2421875, + "step": 1715, + "time_per_iteration": 2.5223729610443115 + }, + { + "auxiliary_loss_clip": 0.01167345, + "auxiliary_loss_mlp": 0.01053609, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04737377, + "epoch": 0.10317150157823539, + "flos": 35298932175360.0, + "grad_norm": 1.9302110224144968, + "language_loss": 0.75736755, + "learning_rate": 3.944099322202418e-06, + "loss": 0.77957708, + "num_input_tokens_seen": 37079120, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1716, + "time_per_iteration": 2.585345506668091 + }, + { + "auxiliary_loss_clip": 0.01171415, + "auxiliary_loss_mlp": 0.01068189, + "balance_loss_clip": 1.04601645, + "balance_loss_mlp": 1.04868793, + "epoch": 0.10323162483090335, + "flos": 25739943033600.0, + "grad_norm": 2.1161503252482747, + "language_loss": 0.85214567, + "learning_rate": 3.944007849347342e-06, + "loss": 0.87454176, + "num_input_tokens_seen": 37099710, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1717, + "time_per_iteration": 2.500964879989624 + }, + { + "auxiliary_loss_clip": 0.01169937, + "auxiliary_loss_mlp": 0.01055982, + "balance_loss_clip": 1.0358479, + "balance_loss_mlp": 1.05102515, + "epoch": 0.10329174808357132, + "flos": 16289870906880.0, + "grad_norm": 2.0308520014155746, + "language_loss": 0.82883167, + "learning_rate": 3.943916302775292e-06, + "loss": 0.85109091, + "num_input_tokens_seen": 37117775, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1875, + "step": 1718, + "time_per_iteration": 2.436836004257202 + }, + { + "auxiliary_loss_clip": 0.01169212, + "auxiliary_loss_mlp": 0.01052655, + "balance_loss_clip": 1.03058898, + "balance_loss_mlp": 1.05092025, + "epoch": 0.10335187133623928, + "flos": 36687166963200.0, + "grad_norm": 1.8725763890619624, + "language_loss": 0.73192763, + "learning_rate": 3.943824682489742e-06, + "loss": 0.75414634, + "num_input_tokens_seen": 37140280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1719, + "time_per_iteration": 2.606293201446533 + }, + { + "auxiliary_loss_clip": 0.01172065, + "auxiliary_loss_mlp": 0.01046316, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05197001, + "epoch": 0.10341199458890726, + "flos": 14975648092800.0, + "grad_norm": 2.356604748076592, + "language_loss": 0.92601806, + "learning_rate": 3.9437329884941665e-06, + "loss": 0.94820189, + "num_input_tokens_seen": 37158350, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.203125, + "step": 1720, + "time_per_iteration": 2.4628992080688477 + }, + { + "auxiliary_loss_clip": 0.01167182, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03239512, + "balance_loss_mlp": 1.04656935, + "epoch": 0.10347211784157523, + "flos": 21031587348480.0, + "grad_norm": 2.8075298743139174, + "language_loss": 0.79416633, + "learning_rate": 3.943641220792039e-06, + "loss": 0.81638062, + "num_input_tokens_seen": 37177120, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2109375, + "step": 1721, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.0117694, + "auxiliary_loss_mlp": 0.01056525, + "balance_loss_clip": 1.03317165, + "balance_loss_mlp": 1.05172479, + "epoch": 0.1035322410942432, + "flos": 19792094780160.0, + "grad_norm": 2.496468299898097, + "language_loss": 0.80755401, + "learning_rate": 3.9435493793868434e-06, + "loss": 0.82988858, + "num_input_tokens_seen": 37195895, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.25, + "step": 1722, + "time_per_iteration": 2.4676520824432373 + }, + { + "auxiliary_loss_clip": 0.01056762, + "auxiliary_loss_mlp": 0.01049921, + "balance_loss_clip": 1.04772782, + "balance_loss_mlp": 1.013726, + "epoch": 0.10359236434691117, + "flos": 52698874947840.0, + "grad_norm": 0.9564367479099696, + "language_loss": 0.67185652, + "learning_rate": 3.943457464282059e-06, + "loss": 0.69292337, + "num_input_tokens_seen": 37247270, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.4296875, + "step": 1723, + "time_per_iteration": 2.8474721908569336 + }, + { + "auxiliary_loss_clip": 0.01170693, + "auxiliary_loss_mlp": 0.01050183, + "balance_loss_clip": 1.02951217, + "balance_loss_mlp": 1.04747462, + "epoch": 0.10365248759957914, + "flos": 18405404277120.0, + "grad_norm": 2.780632359822339, + "language_loss": 0.77922273, + "learning_rate": 3.9433654754811745e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 37265595, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.234375, + "step": 1724, + "time_per_iteration": 2.4311840534210205 + }, + { + "auxiliary_loss_clip": 0.01175556, + "auxiliary_loss_mlp": 0.01052899, + "balance_loss_clip": 1.03233576, + "balance_loss_mlp": 1.05101144, + "epoch": 0.1037126108522471, + "flos": 47553555335040.0, + "grad_norm": 1.8180629527722856, + "language_loss": 0.74894094, + "learning_rate": 3.943273412987676e-06, + "loss": 0.77122545, + "num_input_tokens_seen": 37286660, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.2421875, + "step": 1725, + "time_per_iteration": 2.6802284717559814 + }, + { + "auxiliary_loss_clip": 0.01170353, + "auxiliary_loss_mlp": 0.01049343, + "balance_loss_clip": 1.02852905, + "balance_loss_mlp": 1.05098462, + "epoch": 0.10377273410491508, + "flos": 22816670572800.0, + "grad_norm": 2.4392097975248244, + "language_loss": 0.75290418, + "learning_rate": 3.943181276805054e-06, + "loss": 0.77510113, + "num_input_tokens_seen": 37304915, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1875, + "step": 1726, + "time_per_iteration": 5.461729049682617 + }, + { + "auxiliary_loss_clip": 0.01174745, + "auxiliary_loss_mlp": 0.01059612, + "balance_loss_clip": 1.03765321, + "balance_loss_mlp": 1.0527426, + "epoch": 0.10383285735758305, + "flos": 26138694890880.0, + "grad_norm": 1.8824890959349092, + "language_loss": 0.73943913, + "learning_rate": 3.9430890669368035e-06, + "loss": 0.76178271, + "num_input_tokens_seen": 37325265, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1727, + "time_per_iteration": 3.883134126663208 + }, + { + "auxiliary_loss_clip": 0.01169505, + "auxiliary_loss_mlp": 0.01051483, + "balance_loss_clip": 1.03023946, + "balance_loss_mlp": 1.04815936, + "epoch": 0.10389298061025101, + "flos": 17091791994240.0, + "grad_norm": 2.187385195417556, + "language_loss": 0.84670323, + "learning_rate": 3.942996783386422e-06, + "loss": 0.86891311, + "num_input_tokens_seen": 37341650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.2109375, + "step": 1728, + "time_per_iteration": 2.4405598640441895 + }, + { + "auxiliary_loss_clip": 0.01171168, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.02980709, + "balance_loss_mlp": 1.05098438, + "epoch": 0.10395310386291898, + "flos": 20776513893120.0, + "grad_norm": 2.4528097766615677, + "language_loss": 0.70985407, + "learning_rate": 3.942904426157406e-06, + "loss": 0.73207992, + "num_input_tokens_seen": 37360270, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.203125, + "step": 1729, + "time_per_iteration": 2.465688467025757 + }, + { + "auxiliary_loss_clip": 0.01170997, + "auxiliary_loss_mlp": 0.01059912, + "balance_loss_clip": 1.03679705, + "balance_loss_mlp": 1.05000722, + "epoch": 0.10401322711558696, + "flos": 12820540913280.0, + "grad_norm": 2.5788681057232625, + "language_loss": 0.81288344, + "learning_rate": 3.9428119952532605e-06, + "loss": 0.8351925, + "num_input_tokens_seen": 37375225, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1730, + "time_per_iteration": 2.4582717418670654 + }, + { + "auxiliary_loss_clip": 0.01167657, + "auxiliary_loss_mlp": 0.010515, + "balance_loss_clip": 1.03190255, + "balance_loss_mlp": 1.04836845, + "epoch": 0.10407335036825492, + "flos": 23184683366400.0, + "grad_norm": 2.1021084439253723, + "language_loss": 0.75932384, + "learning_rate": 3.942719490677489e-06, + "loss": 0.78151548, + "num_input_tokens_seen": 37395165, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1953125, + "step": 1731, + "time_per_iteration": 2.4650096893310547 + }, + { + "auxiliary_loss_clip": 0.01164648, + "auxiliary_loss_mlp": 0.01046999, + "balance_loss_clip": 1.02762735, + "balance_loss_mlp": 1.04899907, + "epoch": 0.10413347362092289, + "flos": 26104184899200.0, + "grad_norm": 1.8082651510271561, + "language_loss": 0.82679468, + "learning_rate": 3.9426269124336e-06, + "loss": 0.84891117, + "num_input_tokens_seen": 37414845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1732, + "time_per_iteration": 2.5152552127838135 + }, + { + "auxiliary_loss_clip": 0.01169252, + "auxiliary_loss_mlp": 0.01048286, + "balance_loss_clip": 1.02881873, + "balance_loss_mlp": 1.05052853, + "epoch": 0.10419359687359087, + "flos": 12641059630080.0, + "grad_norm": 2.755876599624297, + "language_loss": 0.82947195, + "learning_rate": 3.942534260525104e-06, + "loss": 0.85164732, + "num_input_tokens_seen": 37432490, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1733, + "time_per_iteration": 2.4426257610321045 + }, + { + "auxiliary_loss_clip": 0.01171007, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03171146, + "balance_loss_mlp": 1.04982805, + "epoch": 0.10425372012625883, + "flos": 12125094716160.0, + "grad_norm": 2.4971959439308336, + "language_loss": 0.76446331, + "learning_rate": 3.942441534955514e-06, + "loss": 0.78669679, + "num_input_tokens_seen": 37449435, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.2109375, + "step": 1734, + "time_per_iteration": 2.4556663036346436 + }, + { + "auxiliary_loss_clip": 0.01165131, + "auxiliary_loss_mlp": 0.01047841, + "balance_loss_clip": 1.02795696, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1043138433789268, + "flos": 25337563902720.0, + "grad_norm": 1.9861442095390862, + "language_loss": 0.74962163, + "learning_rate": 3.9423487357283465e-06, + "loss": 0.7717514, + "num_input_tokens_seen": 37469105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1796875, + "step": 1735, + "time_per_iteration": 2.4961798191070557 + }, + { + "auxiliary_loss_clip": 0.01172587, + "auxiliary_loss_mlp": 0.01048204, + "balance_loss_clip": 1.02724743, + "balance_loss_mlp": 1.05081487, + "epoch": 0.10437396663159478, + "flos": 29167149352320.0, + "grad_norm": 1.9829662552727403, + "language_loss": 0.79049939, + "learning_rate": 3.94225586284712e-06, + "loss": 0.8127073, + "num_input_tokens_seen": 37490540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.21875, + "step": 1736, + "time_per_iteration": 2.530808448791504 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01057245, + "balance_loss_clip": 1.03655005, + "balance_loss_mlp": 1.0491184, + "epoch": 0.10443408988426274, + "flos": 25080946162560.0, + "grad_norm": 1.8105684861006923, + "language_loss": 0.70339012, + "learning_rate": 3.942162916315356e-06, + "loss": 0.72563159, + "num_input_tokens_seen": 37511905, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.171875, + "step": 1737, + "time_per_iteration": 2.4789419174194336 + }, + { + "auxiliary_loss_clip": 0.01170601, + "auxiliary_loss_mlp": 0.01051121, + "balance_loss_clip": 1.02758932, + "balance_loss_mlp": 1.04718471, + "epoch": 0.1044942131369307, + "flos": 26759662237440.0, + "grad_norm": 2.004598680960266, + "language_loss": 0.81483257, + "learning_rate": 3.942069896136581e-06, + "loss": 0.83704984, + "num_input_tokens_seen": 37533635, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.234375, + "step": 1738, + "time_per_iteration": 2.505312442779541 + }, + { + "auxiliary_loss_clip": 0.0116919, + "auxiliary_loss_mlp": 0.01058357, + "balance_loss_clip": 1.0351944, + "balance_loss_mlp": 1.04712963, + "epoch": 0.10455433638959867, + "flos": 18442571875200.0, + "grad_norm": 4.442978598454381, + "language_loss": 0.750579, + "learning_rate": 3.9419768023143196e-06, + "loss": 0.77285445, + "num_input_tokens_seen": 37552035, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1739, + "time_per_iteration": 2.4544031620025635 + }, + { + "auxiliary_loss_clip": 0.01168087, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.0349865, + "balance_loss_mlp": 1.04893625, + "epoch": 0.10461445964226665, + "flos": 23218977876480.0, + "grad_norm": 1.676051388115223, + "language_loss": 0.77279431, + "learning_rate": 3.941883634852104e-06, + "loss": 0.79503429, + "num_input_tokens_seen": 37571540, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1740, + "time_per_iteration": 2.489302635192871 + }, + { + "auxiliary_loss_clip": 0.01169756, + "auxiliary_loss_mlp": 0.01048525, + "balance_loss_clip": 1.02820003, + "balance_loss_mlp": 1.05093944, + "epoch": 0.10467458289493461, + "flos": 24345243797760.0, + "grad_norm": 2.1911967502326775, + "language_loss": 0.85983682, + "learning_rate": 3.941790393753467e-06, + "loss": 0.88201964, + "num_input_tokens_seen": 37588265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1875, + "step": 1741, + "time_per_iteration": 2.4571211338043213 + }, + { + "auxiliary_loss_clip": 0.01171445, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03091609, + "balance_loss_mlp": 1.04901385, + "epoch": 0.10473470614760258, + "flos": 21287953693440.0, + "grad_norm": 4.086245960730198, + "language_loss": 0.74991679, + "learning_rate": 3.941697079021942e-06, + "loss": 0.77216244, + "num_input_tokens_seen": 37606860, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.2265625, + "step": 1742, + "time_per_iteration": 2.4919426441192627 + }, + { + "auxiliary_loss_clip": 0.01171849, + "auxiliary_loss_mlp": 0.01059576, + "balance_loss_clip": 1.03914368, + "balance_loss_mlp": 1.05323386, + "epoch": 0.10479482940027056, + "flos": 21687208341120.0, + "grad_norm": 1.9550995481311175, + "language_loss": 0.87150526, + "learning_rate": 3.94160369066107e-06, + "loss": 0.89381945, + "num_input_tokens_seen": 37625210, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1743, + "time_per_iteration": 2.470841884613037 + }, + { + "auxiliary_loss_clip": 0.01168292, + "auxiliary_loss_mlp": 0.01049872, + "balance_loss_clip": 1.02760363, + "balance_loss_mlp": 1.04964471, + "epoch": 0.10485495265293852, + "flos": 21573694385280.0, + "grad_norm": 2.1176645115958923, + "language_loss": 0.75532508, + "learning_rate": 3.941510228674391e-06, + "loss": 0.77750671, + "num_input_tokens_seen": 37644110, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1744, + "time_per_iteration": 2.4725873470306396 + }, + { + "auxiliary_loss_clip": 0.01171079, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03336394, + "balance_loss_mlp": 1.05184436, + "epoch": 0.10491507590560649, + "flos": 37961923708800.0, + "grad_norm": 2.151699961275852, + "language_loss": 0.79306591, + "learning_rate": 3.941416693065451e-06, + "loss": 0.81530583, + "num_input_tokens_seen": 37665800, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1875, + "step": 1745, + "time_per_iteration": 2.5937912464141846 + }, + { + "auxiliary_loss_clip": 0.01166892, + "auxiliary_loss_mlp": 0.01062835, + "balance_loss_clip": 1.04194999, + "balance_loss_mlp": 1.047683, + "epoch": 0.10497519915827447, + "flos": 26396282298240.0, + "grad_norm": 2.087314316255438, + "language_loss": 0.82382894, + "learning_rate": 3.941323083837794e-06, + "loss": 0.8461262, + "num_input_tokens_seen": 37685095, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1746, + "time_per_iteration": 2.520306348800659 + }, + { + "auxiliary_loss_clip": 0.01170145, + "auxiliary_loss_mlp": 0.01062461, + "balance_loss_clip": 1.04186153, + "balance_loss_mlp": 1.05198646, + "epoch": 0.10503532241094243, + "flos": 40662190581120.0, + "grad_norm": 1.645771273172373, + "language_loss": 0.69951761, + "learning_rate": 3.941229400994971e-06, + "loss": 0.7218436, + "num_input_tokens_seen": 37707445, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1747, + "time_per_iteration": 2.618581771850586 + }, + { + "auxiliary_loss_clip": 0.01176288, + "auxiliary_loss_mlp": 0.01062255, + "balance_loss_clip": 1.04140496, + "balance_loss_mlp": 1.05136323, + "epoch": 0.1050954456636104, + "flos": 29789409588480.0, + "grad_norm": 2.3385484358742192, + "language_loss": 0.84245849, + "learning_rate": 3.941135644540535e-06, + "loss": 0.86484385, + "num_input_tokens_seen": 37728325, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.25, + "step": 1748, + "time_per_iteration": 2.539386034011841 + }, + { + "auxiliary_loss_clip": 0.01165269, + "auxiliary_loss_mlp": 0.01049548, + "balance_loss_clip": 1.02797103, + "balance_loss_mlp": 1.04729426, + "epoch": 0.10515556891627838, + "flos": 23948754497280.0, + "grad_norm": 1.8953667439120294, + "language_loss": 0.71491921, + "learning_rate": 3.941041814478041e-06, + "loss": 0.7370674, + "num_input_tokens_seen": 37748910, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1796875, + "step": 1749, + "time_per_iteration": 2.481700897216797 + }, + { + "auxiliary_loss_clip": 0.01166695, + "auxiliary_loss_mlp": 0.01060715, + "balance_loss_clip": 1.0395906, + "balance_loss_mlp": 1.04953468, + "epoch": 0.10521569216894634, + "flos": 18259606972800.0, + "grad_norm": 1.9760411129591238, + "language_loss": 0.81960011, + "learning_rate": 3.940947910811047e-06, + "loss": 0.84187424, + "num_input_tokens_seen": 37765745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1750, + "time_per_iteration": 2.4454832077026367 + }, + { + "auxiliary_loss_clip": 0.01173904, + "auxiliary_loss_mlp": 0.01060945, + "balance_loss_clip": 1.03946304, + "balance_loss_mlp": 1.05259562, + "epoch": 0.10527581542161431, + "flos": 15630909949440.0, + "grad_norm": 2.3402404294313524, + "language_loss": 0.91871023, + "learning_rate": 3.940853933543114e-06, + "loss": 0.94105875, + "num_input_tokens_seen": 37780520, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1751, + "time_per_iteration": 2.416607141494751 + }, + { + "auxiliary_loss_clip": 0.01166575, + "auxiliary_loss_mlp": 0.0104776, + "balance_loss_clip": 1.02698207, + "balance_loss_mlp": 1.04889047, + "epoch": 0.10533593867428227, + "flos": 18296559089280.0, + "grad_norm": 2.265296057434122, + "language_loss": 0.79560149, + "learning_rate": 3.940759882677805e-06, + "loss": 0.81774485, + "num_input_tokens_seen": 37799515, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1796875, + "step": 1752, + "time_per_iteration": 2.46063494682312 + }, + { + "auxiliary_loss_clip": 0.01167711, + "auxiliary_loss_mlp": 0.01052906, + "balance_loss_clip": 1.03202033, + "balance_loss_mlp": 1.05050862, + "epoch": 0.10539606192695025, + "flos": 29023219555200.0, + "grad_norm": 2.1401152378303867, + "language_loss": 0.75782037, + "learning_rate": 3.940665758218686e-06, + "loss": 0.78002656, + "num_input_tokens_seen": 37818695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1753, + "time_per_iteration": 2.51767635345459 + }, + { + "auxiliary_loss_clip": 0.01172527, + "auxiliary_loss_mlp": 0.01057137, + "balance_loss_clip": 1.03436756, + "balance_loss_mlp": 1.04939532, + "epoch": 0.10545618517961822, + "flos": 19969313506560.0, + "grad_norm": 2.0790136174876546, + "language_loss": 0.84048498, + "learning_rate": 3.940571560169328e-06, + "loss": 0.86278164, + "num_input_tokens_seen": 37837860, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.234375, + "step": 1754, + "time_per_iteration": 2.4683756828308105 + }, + { + "auxiliary_loss_clip": 0.01175207, + "auxiliary_loss_mlp": 0.01053622, + "balance_loss_clip": 1.03044736, + "balance_loss_mlp": 1.05438888, + "epoch": 0.10551630843228618, + "flos": 16143427157760.0, + "grad_norm": 2.8736094439376645, + "language_loss": 0.68956709, + "learning_rate": 3.940477288533302e-06, + "loss": 0.71185535, + "num_input_tokens_seen": 37856260, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.2109375, + "step": 1755, + "time_per_iteration": 2.45597243309021 + }, + { + "auxiliary_loss_clip": 0.01172827, + "auxiliary_loss_mlp": 0.01061763, + "balance_loss_clip": 1.03989983, + "balance_loss_mlp": 1.05102587, + "epoch": 0.10557643168495416, + "flos": 23440115957760.0, + "grad_norm": 5.502613786824721, + "language_loss": 0.76718754, + "learning_rate": 3.940382943314182e-06, + "loss": 0.78953344, + "num_input_tokens_seen": 37876960, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.21875, + "step": 1756, + "time_per_iteration": 2.5239176750183105 + }, + { + "auxiliary_loss_clip": 0.01172125, + "auxiliary_loss_mlp": 0.01058013, + "balance_loss_clip": 1.03712726, + "balance_loss_mlp": 1.04982626, + "epoch": 0.10563655493762213, + "flos": 21799034357760.0, + "grad_norm": 1.7784869470084927, + "language_loss": 0.80162531, + "learning_rate": 3.940288524515547e-06, + "loss": 0.82392669, + "num_input_tokens_seen": 37897070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.21875, + "step": 1757, + "time_per_iteration": 2.4551706314086914 + }, + { + "auxiliary_loss_clip": 0.01171845, + "auxiliary_loss_mlp": 0.01056344, + "balance_loss_clip": 1.03499317, + "balance_loss_mlp": 1.05132246, + "epoch": 0.10569667819029009, + "flos": 53800863275520.0, + "grad_norm": 1.631431596421375, + "language_loss": 0.78800333, + "learning_rate": 3.940194032140976e-06, + "loss": 0.81028521, + "num_input_tokens_seen": 37923635, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1758, + "time_per_iteration": 2.7955896854400635 + }, + { + "auxiliary_loss_clip": 0.01177436, + "auxiliary_loss_mlp": 0.01050523, + "balance_loss_clip": 1.02865982, + "balance_loss_mlp": 1.05364573, + "epoch": 0.10575680144295807, + "flos": 22925515760640.0, + "grad_norm": 2.609159841262955, + "language_loss": 0.9189958, + "learning_rate": 3.940099466194054e-06, + "loss": 0.94127536, + "num_input_tokens_seen": 37942650, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.234375, + "step": 1759, + "time_per_iteration": 2.4853782653808594 + }, + { + "auxiliary_loss_clip": 0.01173064, + "auxiliary_loss_mlp": 0.01055702, + "balance_loss_clip": 1.03276575, + "balance_loss_mlp": 1.04970741, + "epoch": 0.10581692469562604, + "flos": 14136667148160.0, + "grad_norm": 2.498568213886603, + "language_loss": 0.76932353, + "learning_rate": 3.940004826678365e-06, + "loss": 0.79161119, + "num_input_tokens_seen": 37960660, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.234375, + "step": 1760, + "time_per_iteration": 2.470705509185791 + }, + { + "auxiliary_loss_clip": 0.01173982, + "auxiliary_loss_mlp": 0.01061265, + "balance_loss_clip": 1.03825736, + "balance_loss_mlp": 1.05152941, + "epoch": 0.105877047948294, + "flos": 25958674903680.0, + "grad_norm": 2.349800445259612, + "language_loss": 0.89282435, + "learning_rate": 3.939910113597498e-06, + "loss": 0.91517675, + "num_input_tokens_seen": 37978625, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.2265625, + "step": 1761, + "time_per_iteration": 2.491501569747925 + }, + { + "auxiliary_loss_clip": 0.01173015, + "auxiliary_loss_mlp": 0.01060542, + "balance_loss_clip": 1.03944254, + "balance_loss_mlp": 1.0518589, + "epoch": 0.10593717120096197, + "flos": 30664768032000.0, + "grad_norm": 2.4794664397863877, + "language_loss": 0.78304708, + "learning_rate": 3.9398153269550464e-06, + "loss": 0.80538261, + "num_input_tokens_seen": 38000005, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.2109375, + "step": 1762, + "time_per_iteration": 2.5563831329345703 + }, + { + "auxiliary_loss_clip": 0.01062071, + "auxiliary_loss_mlp": 0.01014008, + "balance_loss_clip": 1.0110991, + "balance_loss_mlp": 1.02000487, + "epoch": 0.10599729445362994, + "flos": 66436682497920.0, + "grad_norm": 0.753444103392694, + "language_loss": 0.60481733, + "learning_rate": 3.939720466754602e-06, + "loss": 0.62557811, + "num_input_tokens_seen": 38066165, + "router_z_loss_clip": 0.02905273, + "router_z_loss_mlp": 0.421875, + "step": 1763, + "time_per_iteration": 3.2239294052124023 + }, + { + "auxiliary_loss_clip": 0.01170891, + "auxiliary_loss_mlp": 0.01048971, + "balance_loss_clip": 1.02777529, + "balance_loss_mlp": 1.04924011, + "epoch": 0.10605741770629791, + "flos": 23948179879680.0, + "grad_norm": 2.054980370260194, + "language_loss": 0.8010751, + "learning_rate": 3.939625532999763e-06, + "loss": 0.82327372, + "num_input_tokens_seen": 38086150, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.21875, + "step": 1764, + "time_per_iteration": 2.476325273513794 + }, + { + "auxiliary_loss_clip": 0.01169028, + "auxiliary_loss_mlp": 0.01049345, + "balance_loss_clip": 1.02745855, + "balance_loss_mlp": 1.04961264, + "epoch": 0.10611754095896588, + "flos": 19387524919680.0, + "grad_norm": 1.7621956234955212, + "language_loss": 0.7999962, + "learning_rate": 3.9395305256941314e-06, + "loss": 0.82217997, + "num_input_tokens_seen": 38104205, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1765, + "time_per_iteration": 2.446593999862671 + }, + { + "auxiliary_loss_clip": 0.01167126, + "auxiliary_loss_mlp": 0.01054873, + "balance_loss_clip": 1.03394008, + "balance_loss_mlp": 1.04794002, + "epoch": 0.10617766421163385, + "flos": 22237755073920.0, + "grad_norm": 1.867239621884004, + "language_loss": 0.76693732, + "learning_rate": 3.939435444841306e-06, + "loss": 0.78915727, + "num_input_tokens_seen": 38122005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1875, + "step": 1766, + "time_per_iteration": 2.4462356567382812 + }, + { + "auxiliary_loss_clip": 0.01170332, + "auxiliary_loss_mlp": 0.01059306, + "balance_loss_clip": 1.0366683, + "balance_loss_mlp": 1.05017042, + "epoch": 0.10623778746430182, + "flos": 28404407024640.0, + "grad_norm": 1.6580981789618001, + "language_loss": 0.77319431, + "learning_rate": 3.939340290444895e-06, + "loss": 0.79549068, + "num_input_tokens_seen": 38143365, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1767, + "time_per_iteration": 2.542797088623047 + }, + { + "auxiliary_loss_clip": 0.01060068, + "auxiliary_loss_mlp": 0.01000453, + "balance_loss_clip": 0.99785471, + "balance_loss_mlp": 1.01804066, + "epoch": 0.10629791071696978, + "flos": 64234639221120.0, + "grad_norm": 0.6789245534488961, + "language_loss": 0.57902765, + "learning_rate": 3.939245062508506e-06, + "loss": 0.59963286, + "num_input_tokens_seen": 38210035, + "router_z_loss_clip": 0.02600098, + "router_z_loss_mlp": 0.421875, + "step": 1768, + "time_per_iteration": 6.071596384048462 + }, + { + "auxiliary_loss_clip": 0.01172748, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.0219171, + "balance_loss_mlp": 1.05201912, + "epoch": 0.10635803396963776, + "flos": 22747578762240.0, + "grad_norm": 1.446404125156032, + "language_loss": 0.86796767, + "learning_rate": 3.939149761035749e-06, + "loss": 0.89011335, + "num_input_tokens_seen": 38231230, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.2109375, + "step": 1769, + "time_per_iteration": 2.5106868743896484 + }, + { + "auxiliary_loss_clip": 0.01175908, + "auxiliary_loss_mlp": 0.01056805, + "balance_loss_clip": 1.03496528, + "balance_loss_mlp": 1.05300689, + "epoch": 0.10641815722230573, + "flos": 31395586147200.0, + "grad_norm": 1.766851816283336, + "language_loss": 0.61890501, + "learning_rate": 3.9390543860302395e-06, + "loss": 0.64123213, + "num_input_tokens_seen": 38253890, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.2265625, + "step": 1770, + "time_per_iteration": 2.5770323276519775 + }, + { + "auxiliary_loss_clip": 0.01061292, + "auxiliary_loss_mlp": 0.01003176, + "balance_loss_clip": 1.00058925, + "balance_loss_mlp": 1.01873469, + "epoch": 0.1064782804749737, + "flos": 58552527784320.0, + "grad_norm": 0.8864779346546747, + "language_loss": 0.57095039, + "learning_rate": 3.9389589374955925e-06, + "loss": 0.59159505, + "num_input_tokens_seen": 38304290, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.42578125, + "step": 1771, + "time_per_iteration": 2.957993507385254 + }, + { + "auxiliary_loss_clip": 0.01174087, + "auxiliary_loss_mlp": 0.01063103, + "balance_loss_clip": 1.04187179, + "balance_loss_mlp": 1.05443954, + "epoch": 0.10653840372764166, + "flos": 23987825516160.0, + "grad_norm": 1.6398085638646198, + "language_loss": 0.88530469, + "learning_rate": 3.938863415435429e-06, + "loss": 0.90767658, + "num_input_tokens_seen": 38324725, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1772, + "time_per_iteration": 2.520744562149048 + }, + { + "auxiliary_loss_clip": 0.01176768, + "auxiliary_loss_mlp": 0.01063607, + "balance_loss_clip": 1.03945482, + "balance_loss_mlp": 1.05091381, + "epoch": 0.10659852698030964, + "flos": 18294655668480.0, + "grad_norm": 2.8236986107629094, + "language_loss": 0.76021719, + "learning_rate": 3.93876781985337e-06, + "loss": 0.78262091, + "num_input_tokens_seen": 38340735, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.2578125, + "step": 1773, + "time_per_iteration": 2.4228129386901855 + }, + { + "auxiliary_loss_clip": 0.01171647, + "auxiliary_loss_mlp": 0.01063224, + "balance_loss_clip": 1.04087257, + "balance_loss_mlp": 1.05147731, + "epoch": 0.1066586502329776, + "flos": 32160591031680.0, + "grad_norm": 2.1931291175477177, + "language_loss": 0.83184093, + "learning_rate": 3.938672150753041e-06, + "loss": 0.85418963, + "num_input_tokens_seen": 38361315, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1774, + "time_per_iteration": 2.5613787174224854 + }, + { + "auxiliary_loss_clip": 0.01177598, + "auxiliary_loss_mlp": 0.01054445, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.05220413, + "epoch": 0.10671877348564557, + "flos": 17785155202560.0, + "grad_norm": 2.683505024819064, + "language_loss": 0.76297373, + "learning_rate": 3.9385764081380704e-06, + "loss": 0.78529418, + "num_input_tokens_seen": 38377425, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.25, + "step": 1775, + "time_per_iteration": 2.437676429748535 + }, + { + "auxiliary_loss_clip": 0.01057587, + "auxiliary_loss_mlp": 0.01006639, + "balance_loss_clip": 1.00413537, + "balance_loss_mlp": 1.01520467, + "epoch": 0.10677889673831355, + "flos": 63510177813120.0, + "grad_norm": 0.8253045983972309, + "language_loss": 0.57443953, + "learning_rate": 3.9384805920120876e-06, + "loss": 0.59508181, + "num_input_tokens_seen": 38440275, + "router_z_loss_clip": 0.02502441, + "router_z_loss_mlp": 0.42382812, + "step": 1776, + "time_per_iteration": 3.101378917694092 + }, + { + "auxiliary_loss_clip": 0.01176962, + "auxiliary_loss_mlp": 0.01059775, + "balance_loss_clip": 1.0365653, + "balance_loss_mlp": 1.05411029, + "epoch": 0.10683901999098151, + "flos": 22017694400640.0, + "grad_norm": 1.6481869723516467, + "language_loss": 0.83374244, + "learning_rate": 3.938384702378727e-06, + "loss": 0.8561098, + "num_input_tokens_seen": 38461820, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2265625, + "step": 1777, + "time_per_iteration": 2.5109002590179443 + }, + { + "auxiliary_loss_clip": 0.01170133, + "auxiliary_loss_mlp": 0.01055162, + "balance_loss_clip": 1.03371584, + "balance_loss_mlp": 1.05298579, + "epoch": 0.10689914324364948, + "flos": 25042952551680.0, + "grad_norm": 2.6420984425067013, + "language_loss": 0.87275863, + "learning_rate": 3.938288739241625e-06, + "loss": 0.89501154, + "num_input_tokens_seen": 38482235, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1778, + "time_per_iteration": 2.503103494644165 + }, + { + "auxiliary_loss_clip": 0.01175003, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.032354, + "balance_loss_mlp": 1.05328, + "epoch": 0.10695926649631746, + "flos": 16435129507200.0, + "grad_norm": 2.213225731734914, + "language_loss": 0.83970487, + "learning_rate": 3.938192702604417e-06, + "loss": 0.86199337, + "num_input_tokens_seen": 38500690, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1779, + "time_per_iteration": 2.4574496746063232 + }, + { + "auxiliary_loss_clip": 0.01169562, + "auxiliary_loss_mlp": 0.01052117, + "balance_loss_clip": 1.03086162, + "balance_loss_mlp": 1.04975557, + "epoch": 0.10701938974898542, + "flos": 16979211792000.0, + "grad_norm": 2.4959309518827655, + "language_loss": 0.67064941, + "learning_rate": 3.9380965924707495e-06, + "loss": 0.69286621, + "num_input_tokens_seen": 38518405, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1780, + "time_per_iteration": 2.447756052017212 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01046888, + "balance_loss_clip": 1.02553749, + "balance_loss_mlp": 1.05183458, + "epoch": 0.10707951300165339, + "flos": 15888102307200.0, + "grad_norm": 2.25546613947904, + "language_loss": 0.91667759, + "learning_rate": 3.938000408844265e-06, + "loss": 0.93886495, + "num_input_tokens_seen": 38535060, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1781, + "time_per_iteration": 2.4367144107818604 + }, + { + "auxiliary_loss_clip": 0.01175433, + "auxiliary_loss_mlp": 0.01046071, + "balance_loss_clip": 1.02524495, + "balance_loss_mlp": 1.05302, + "epoch": 0.10713963625432135, + "flos": 14247164361600.0, + "grad_norm": 2.202402738572802, + "language_loss": 0.79505372, + "learning_rate": 3.9379041517286105e-06, + "loss": 0.81726873, + "num_input_tokens_seen": 38552855, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.2265625, + "step": 1782, + "time_per_iteration": 2.4340877532958984 + }, + { + "auxiliary_loss_clip": 0.01175468, + "auxiliary_loss_mlp": 0.01052246, + "balance_loss_clip": 1.03055024, + "balance_loss_mlp": 1.0517509, + "epoch": 0.10719975950698933, + "flos": 16756780821120.0, + "grad_norm": 2.0445491568240994, + "language_loss": 0.78994977, + "learning_rate": 3.937807821127436e-06, + "loss": 0.81222689, + "num_input_tokens_seen": 38570075, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.234375, + "step": 1783, + "time_per_iteration": 2.434527635574341 + }, + { + "auxiliary_loss_clip": 0.01176375, + "auxiliary_loss_mlp": 0.01052212, + "balance_loss_clip": 1.02991986, + "balance_loss_mlp": 1.0529108, + "epoch": 0.1072598827596573, + "flos": 22710626645760.0, + "grad_norm": 1.8050343336808015, + "language_loss": 0.85956216, + "learning_rate": 3.937711417044395e-06, + "loss": 0.88184798, + "num_input_tokens_seen": 38587970, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1784, + "time_per_iteration": 2.4867746829986572 + }, + { + "auxiliary_loss_clip": 0.01174134, + "auxiliary_loss_mlp": 0.01054075, + "balance_loss_clip": 1.03188968, + "balance_loss_mlp": 1.05080986, + "epoch": 0.10732000601232526, + "flos": 23258264376960.0, + "grad_norm": 3.0774406347184806, + "language_loss": 1.00899053, + "learning_rate": 3.937614939483143e-06, + "loss": 1.03127265, + "num_input_tokens_seen": 38605840, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.234375, + "step": 1785, + "time_per_iteration": 2.46663498878479 + }, + { + "auxiliary_loss_clip": 0.01171119, + "auxiliary_loss_mlp": 0.01057254, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05306709, + "epoch": 0.10738012926499324, + "flos": 24207060176640.0, + "grad_norm": 1.4495948735276882, + "language_loss": 0.85070992, + "learning_rate": 3.937518388447339e-06, + "loss": 0.87299371, + "num_input_tokens_seen": 38627070, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1796875, + "step": 1786, + "time_per_iteration": 2.505018949508667 + }, + { + "auxiliary_loss_clip": 0.01170035, + "auxiliary_loss_mlp": 0.01059108, + "balance_loss_clip": 1.035779, + "balance_loss_mlp": 1.04750311, + "epoch": 0.1074402525176612, + "flos": 20923065383040.0, + "grad_norm": 1.8788886178726656, + "language_loss": 0.78817046, + "learning_rate": 3.937421763940642e-06, + "loss": 0.81046188, + "num_input_tokens_seen": 38645840, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1787, + "time_per_iteration": 2.443532705307007 + }, + { + "auxiliary_loss_clip": 0.01176938, + "auxiliary_loss_mlp": 0.01049821, + "balance_loss_clip": 1.02768385, + "balance_loss_mlp": 1.0517112, + "epoch": 0.10750037577032917, + "flos": 16946928443520.0, + "grad_norm": 2.551869220071384, + "language_loss": 0.82557851, + "learning_rate": 3.937325065966719e-06, + "loss": 0.84784609, + "num_input_tokens_seen": 38664770, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.25, + "step": 1788, + "time_per_iteration": 2.4807305335998535 + }, + { + "auxiliary_loss_clip": 0.01170019, + "auxiliary_loss_mlp": 0.0106343, + "balance_loss_clip": 1.04219902, + "balance_loss_mlp": 1.04939878, + "epoch": 0.10756049902299715, + "flos": 20266546550400.0, + "grad_norm": 2.778852512980128, + "language_loss": 0.77794182, + "learning_rate": 3.9372282945292335e-06, + "loss": 0.80027628, + "num_input_tokens_seen": 38683865, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.203125, + "step": 1789, + "time_per_iteration": 2.482330322265625 + }, + { + "auxiliary_loss_clip": 0.01173111, + "auxiliary_loss_mlp": 0.01061244, + "balance_loss_clip": 1.03631723, + "balance_loss_mlp": 1.05133712, + "epoch": 0.10762062227566511, + "flos": 23586523793280.0, + "grad_norm": 2.434124451319009, + "language_loss": 0.74467903, + "learning_rate": 3.937131449631859e-06, + "loss": 0.76702261, + "num_input_tokens_seen": 38702485, + "router_z_loss_clip": 0.24902344, + "router_z_loss_mlp": 1.21875, + "step": 1790, + "time_per_iteration": 2.5921239852905273 + }, + { + "auxiliary_loss_clip": 0.01177807, + "auxiliary_loss_mlp": 0.01072366, + "balance_loss_clip": 1.04766607, + "balance_loss_mlp": 1.05428767, + "epoch": 0.10768074552833308, + "flos": 24310626065280.0, + "grad_norm": 2.5839507236364554, + "language_loss": 0.78495383, + "learning_rate": 3.9370345312782645e-06, + "loss": 0.80745554, + "num_input_tokens_seen": 38722475, + "router_z_loss_clip": 0.24804688, + "router_z_loss_mlp": 1.234375, + "step": 1791, + "time_per_iteration": 2.5242488384246826 + }, + { + "auxiliary_loss_clip": 0.01167341, + "auxiliary_loss_mlp": 0.01053897, + "balance_loss_clip": 1.0330478, + "balance_loss_mlp": 1.05112934, + "epoch": 0.10774086878100106, + "flos": 25299965341440.0, + "grad_norm": 1.8605555947944812, + "language_loss": 0.70855284, + "learning_rate": 3.936937539472126e-06, + "loss": 0.73076522, + "num_input_tokens_seen": 38743285, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1792, + "time_per_iteration": 2.5260751247406006 + }, + { + "auxiliary_loss_clip": 0.01175824, + "auxiliary_loss_mlp": 0.010463, + "balance_loss_clip": 1.02330506, + "balance_loss_mlp": 1.05109024, + "epoch": 0.10780099203366902, + "flos": 22054035985920.0, + "grad_norm": 2.973355145299492, + "language_loss": 0.76029646, + "learning_rate": 3.9368404742171236e-06, + "loss": 0.78251767, + "num_input_tokens_seen": 38763035, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1793, + "time_per_iteration": 2.5037007331848145 + }, + { + "auxiliary_loss_clip": 0.01171847, + "auxiliary_loss_mlp": 0.01060242, + "balance_loss_clip": 1.03793848, + "balance_loss_mlp": 1.0537113, + "epoch": 0.10786111528633699, + "flos": 22747471021440.0, + "grad_norm": 1.7251623627880495, + "language_loss": 0.85158944, + "learning_rate": 3.936743335516936e-06, + "loss": 0.87391031, + "num_input_tokens_seen": 38784900, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1794, + "time_per_iteration": 2.5210132598876953 + }, + { + "auxiliary_loss_clip": 0.01180393, + "auxiliary_loss_mlp": 0.01052991, + "balance_loss_clip": 1.02954292, + "balance_loss_mlp": 1.05342674, + "epoch": 0.10792123853900495, + "flos": 20851064570880.0, + "grad_norm": 1.9245153565321482, + "language_loss": 0.74914879, + "learning_rate": 3.936646123375246e-06, + "loss": 0.77148265, + "num_input_tokens_seen": 38804695, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.265625, + "step": 1795, + "time_per_iteration": 2.486111879348755 + }, + { + "auxiliary_loss_clip": 0.0117609, + "auxiliary_loss_mlp": 0.01060963, + "balance_loss_clip": 1.03863525, + "balance_loss_mlp": 1.05227423, + "epoch": 0.10798136179167293, + "flos": 17748705876480.0, + "grad_norm": 2.917857918230487, + "language_loss": 0.8116014, + "learning_rate": 3.936548837795741e-06, + "loss": 0.83397192, + "num_input_tokens_seen": 38822395, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.234375, + "step": 1796, + "time_per_iteration": 2.4357504844665527 + }, + { + "auxiliary_loss_clip": 0.01177296, + "auxiliary_loss_mlp": 0.01075942, + "balance_loss_clip": 1.05260134, + "balance_loss_mlp": 1.05476594, + "epoch": 0.1080414850443409, + "flos": 13589639948160.0, + "grad_norm": 2.4043777768562293, + "language_loss": 0.73476732, + "learning_rate": 3.936451478782111e-06, + "loss": 0.75729972, + "num_input_tokens_seen": 38839865, + "router_z_loss_clip": 0.23339844, + "router_z_loss_mlp": 1.21875, + "step": 1797, + "time_per_iteration": 2.477867841720581 + }, + { + "auxiliary_loss_clip": 0.01172695, + "auxiliary_loss_mlp": 0.01051138, + "balance_loss_clip": 1.03081274, + "balance_loss_mlp": 1.05260658, + "epoch": 0.10810160829700886, + "flos": 16253421580800.0, + "grad_norm": 3.1892188654982396, + "language_loss": 0.81348622, + "learning_rate": 3.936354046338046e-06, + "loss": 0.83572453, + "num_input_tokens_seen": 38857300, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.203125, + "step": 1798, + "time_per_iteration": 2.5060064792633057 + }, + { + "auxiliary_loss_clip": 0.011719, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03075755, + "balance_loss_mlp": 1.0508821, + "epoch": 0.10816173154967684, + "flos": 15158002464000.0, + "grad_norm": 2.4195393058725623, + "language_loss": 0.85180116, + "learning_rate": 3.936256540467242e-06, + "loss": 0.87405908, + "num_input_tokens_seen": 38874960, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.2109375, + "step": 1799, + "time_per_iteration": 2.4546945095062256 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.01064124, + "balance_loss_clip": 1.04271412, + "balance_loss_mlp": 1.0546999, + "epoch": 0.10822185480234481, + "flos": 17785334770560.0, + "grad_norm": 2.2474252534922265, + "language_loss": 0.77365196, + "learning_rate": 3.9361589611733955e-06, + "loss": 0.79602301, + "num_input_tokens_seen": 38893610, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.1796875, + "step": 1800, + "time_per_iteration": 2.4650769233703613 + }, + { + "auxiliary_loss_clip": 0.01168665, + "auxiliary_loss_mlp": 0.01044543, + "balance_loss_clip": 1.02443254, + "balance_loss_mlp": 1.05136347, + "epoch": 0.10828197805501277, + "flos": 25556654908800.0, + "grad_norm": 2.2954016650766844, + "language_loss": 0.7287963, + "learning_rate": 3.9360613084602075e-06, + "loss": 0.7509284, + "num_input_tokens_seen": 38913485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1801, + "time_per_iteration": 2.5045113563537598 + }, + { + "auxiliary_loss_clip": 0.01177863, + "auxiliary_loss_mlp": 0.01048534, + "balance_loss_clip": 1.02785134, + "balance_loss_mlp": 1.05259442, + "epoch": 0.10834210130768075, + "flos": 28984435845120.0, + "grad_norm": 1.8364602771794378, + "language_loss": 0.66427058, + "learning_rate": 3.935963582331381e-06, + "loss": 0.68653458, + "num_input_tokens_seen": 38935650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.25, + "step": 1802, + "time_per_iteration": 2.5547947883605957 + }, + { + "auxiliary_loss_clip": 0.01170285, + "auxiliary_loss_mlp": 0.01059138, + "balance_loss_clip": 1.03712058, + "balance_loss_mlp": 1.05202222, + "epoch": 0.10840222456034872, + "flos": 20264212166400.0, + "grad_norm": 1.7898565484043845, + "language_loss": 0.8136133, + "learning_rate": 3.935865782790621e-06, + "loss": 0.83590758, + "num_input_tokens_seen": 38954130, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1796875, + "step": 1803, + "time_per_iteration": 2.4758658409118652 + }, + { + "auxiliary_loss_clip": 0.0116949, + "auxiliary_loss_mlp": 0.01052862, + "balance_loss_clip": 1.031106, + "balance_loss_mlp": 1.05126929, + "epoch": 0.10846234781301668, + "flos": 19863054097920.0, + "grad_norm": 2.61974519761109, + "language_loss": 0.9122982, + "learning_rate": 3.9357679098416365e-06, + "loss": 0.93452168, + "num_input_tokens_seen": 38972905, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1875, + "step": 1804, + "time_per_iteration": 2.4858944416046143 + }, + { + "auxiliary_loss_clip": 0.01175328, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03031349, + "balance_loss_mlp": 1.05401301, + "epoch": 0.10852247106568465, + "flos": 26469037296000.0, + "grad_norm": 2.0091269076806078, + "language_loss": 0.7623654, + "learning_rate": 3.935669963488139e-06, + "loss": 0.78464663, + "num_input_tokens_seen": 38993255, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.2109375, + "step": 1805, + "time_per_iteration": 2.5379836559295654 + }, + { + "auxiliary_loss_clip": 0.01172079, + "auxiliary_loss_mlp": 0.01048159, + "balance_loss_clip": 1.02842999, + "balance_loss_mlp": 1.0535754, + "epoch": 0.10858259431835263, + "flos": 30081506987520.0, + "grad_norm": 1.8192828849331855, + "language_loss": 0.860416, + "learning_rate": 3.935571943733843e-06, + "loss": 0.88261837, + "num_input_tokens_seen": 39012610, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1875, + "step": 1806, + "time_per_iteration": 2.5523955821990967 + }, + { + "auxiliary_loss_clip": 0.01170931, + "auxiliary_loss_mlp": 0.01053704, + "balance_loss_clip": 1.03275895, + "balance_loss_mlp": 1.05068612, + "epoch": 0.10864271757102059, + "flos": 19063180085760.0, + "grad_norm": 5.439462316727856, + "language_loss": 0.80572915, + "learning_rate": 3.9354738505824635e-06, + "loss": 0.82797557, + "num_input_tokens_seen": 39030120, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1807, + "time_per_iteration": 2.514390230178833 + }, + { + "auxiliary_loss_clip": 0.01171878, + "auxiliary_loss_mlp": 0.01051305, + "balance_loss_clip": 1.03168321, + "balance_loss_mlp": 1.05415583, + "epoch": 0.10870284082368856, + "flos": 24715052271360.0, + "grad_norm": 1.7684897552837426, + "language_loss": 0.78731525, + "learning_rate": 3.9353756840377225e-06, + "loss": 0.80954707, + "num_input_tokens_seen": 39049875, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.171875, + "step": 1808, + "time_per_iteration": 2.5084331035614014 + }, + { + "auxiliary_loss_clip": 0.01176105, + "auxiliary_loss_mlp": 0.01052005, + "balance_loss_clip": 1.03090501, + "balance_loss_mlp": 1.05633223, + "epoch": 0.10876296407635654, + "flos": 20627663932800.0, + "grad_norm": 1.6609588216066864, + "language_loss": 0.78927523, + "learning_rate": 3.935277444103342e-06, + "loss": 0.81155634, + "num_input_tokens_seen": 39068935, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1953125, + "step": 1809, + "time_per_iteration": 5.368049621582031 + }, + { + "auxiliary_loss_clip": 0.01171492, + "auxiliary_loss_mlp": 0.01053913, + "balance_loss_clip": 1.03318286, + "balance_loss_mlp": 1.05087388, + "epoch": 0.1088230873290245, + "flos": 21579835610880.0, + "grad_norm": 2.0370215842844197, + "language_loss": 0.8468523, + "learning_rate": 3.935179130783046e-06, + "loss": 0.86910635, + "num_input_tokens_seen": 39087370, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.203125, + "step": 1810, + "time_per_iteration": 3.904432535171509 + }, + { + "auxiliary_loss_clip": 0.01180342, + "auxiliary_loss_mlp": 0.01054633, + "balance_loss_clip": 1.03111291, + "balance_loss_mlp": 1.05665135, + "epoch": 0.10888321058169247, + "flos": 26469037296000.0, + "grad_norm": 1.9531179942167565, + "language_loss": 0.63677633, + "learning_rate": 3.935080744080564e-06, + "loss": 0.6591261, + "num_input_tokens_seen": 39106635, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.234375, + "step": 1811, + "time_per_iteration": 2.523650646209717 + }, + { + "auxiliary_loss_clip": 0.01171345, + "auxiliary_loss_mlp": 0.01048747, + "balance_loss_clip": 1.02737319, + "balance_loss_mlp": 1.05139136, + "epoch": 0.10894333383436045, + "flos": 25848608653440.0, + "grad_norm": 3.279966127836369, + "language_loss": 0.74238914, + "learning_rate": 3.934982283999626e-06, + "loss": 0.76459008, + "num_input_tokens_seen": 39126335, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1812, + "time_per_iteration": 2.5579042434692383 + }, + { + "auxiliary_loss_clip": 0.01173457, + "auxiliary_loss_mlp": 0.01047521, + "balance_loss_clip": 1.02587295, + "balance_loss_mlp": 1.05391026, + "epoch": 0.10900345708702841, + "flos": 19537093152000.0, + "grad_norm": 1.9314487748153213, + "language_loss": 0.72647583, + "learning_rate": 3.934883750543966e-06, + "loss": 0.74868566, + "num_input_tokens_seen": 39144820, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1875, + "step": 1813, + "time_per_iteration": 2.488762617111206 + }, + { + "auxiliary_loss_clip": 0.01174675, + "auxiliary_loss_mlp": 0.01051455, + "balance_loss_clip": 1.02999711, + "balance_loss_mlp": 1.05744648, + "epoch": 0.10906358033969638, + "flos": 23623296341760.0, + "grad_norm": 10.097396236718186, + "language_loss": 0.82224226, + "learning_rate": 3.93478514371732e-06, + "loss": 0.84450358, + "num_input_tokens_seen": 39165945, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.171875, + "step": 1814, + "time_per_iteration": 2.495798349380493 + }, + { + "auxiliary_loss_clip": 0.01176897, + "auxiliary_loss_mlp": 0.01057916, + "balance_loss_clip": 1.03670859, + "balance_loss_mlp": 1.05595291, + "epoch": 0.10912370359236434, + "flos": 21214731818880.0, + "grad_norm": 2.3551509805271422, + "language_loss": 0.84218144, + "learning_rate": 3.934686463523429e-06, + "loss": 0.86452949, + "num_input_tokens_seen": 39183520, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.2109375, + "step": 1815, + "time_per_iteration": 2.462663173675537 + }, + { + "auxiliary_loss_clip": 0.01175955, + "auxiliary_loss_mlp": 0.01053131, + "balance_loss_clip": 1.03054035, + "balance_loss_mlp": 1.05833483, + "epoch": 0.10918382684503232, + "flos": 13553190622080.0, + "grad_norm": 2.3954928768695027, + "language_loss": 0.71048725, + "learning_rate": 3.9345877099660315e-06, + "loss": 0.73277813, + "num_input_tokens_seen": 39201190, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.171875, + "step": 1816, + "time_per_iteration": 2.465953826904297 + }, + { + "auxiliary_loss_clip": 0.01178612, + "auxiliary_loss_mlp": 0.01063123, + "balance_loss_clip": 1.04061651, + "balance_loss_mlp": 1.056674, + "epoch": 0.10924395009770028, + "flos": 27964321591680.0, + "grad_norm": 2.0063973144433067, + "language_loss": 0.72811669, + "learning_rate": 3.9344888830488744e-06, + "loss": 0.75053406, + "num_input_tokens_seen": 39221210, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1817, + "time_per_iteration": 2.5323143005371094 + }, + { + "auxiliary_loss_clip": 0.01178871, + "auxiliary_loss_mlp": 0.01054207, + "balance_loss_clip": 1.03167605, + "balance_loss_mlp": 1.05709267, + "epoch": 0.10930407335036825, + "flos": 25593750679680.0, + "grad_norm": 1.767365755633268, + "language_loss": 0.67279243, + "learning_rate": 3.934389982775706e-06, + "loss": 0.6951232, + "num_input_tokens_seen": 39242025, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.21875, + "step": 1818, + "time_per_iteration": 2.5450243949890137 + }, + { + "auxiliary_loss_clip": 0.01177017, + "auxiliary_loss_mlp": 0.01063325, + "balance_loss_clip": 1.04123521, + "balance_loss_mlp": 1.05534315, + "epoch": 0.10936419660303623, + "flos": 18406194376320.0, + "grad_norm": 2.0802139312896744, + "language_loss": 0.72992313, + "learning_rate": 3.934291009150275e-06, + "loss": 0.75232661, + "num_input_tokens_seen": 39259870, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.21875, + "step": 1819, + "time_per_iteration": 2.487644910812378 + }, + { + "auxiliary_loss_clip": 0.01180831, + "auxiliary_loss_mlp": 0.01051168, + "balance_loss_clip": 1.02994883, + "balance_loss_mlp": 1.06090236, + "epoch": 0.1094243198557042, + "flos": 23840052963840.0, + "grad_norm": 7.240077427900601, + "language_loss": 0.73943537, + "learning_rate": 3.934191962176335e-06, + "loss": 0.76175541, + "num_input_tokens_seen": 39278500, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.203125, + "step": 1820, + "time_per_iteration": 2.5064899921417236 + }, + { + "auxiliary_loss_clip": 0.01177081, + "auxiliary_loss_mlp": 0.0105084, + "balance_loss_clip": 1.02765381, + "balance_loss_mlp": 1.05699766, + "epoch": 0.10948444310837216, + "flos": 14643940970880.0, + "grad_norm": 2.1677198782015887, + "language_loss": 0.82586408, + "learning_rate": 3.934092841857642e-06, + "loss": 0.84814322, + "num_input_tokens_seen": 39294800, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 1821, + "time_per_iteration": 2.4487218856811523 + }, + { + "auxiliary_loss_clip": 0.01174491, + "auxiliary_loss_mlp": 0.01049191, + "balance_loss_clip": 1.0280906, + "balance_loss_mlp": 1.05549288, + "epoch": 0.10954456636104014, + "flos": 27818811596160.0, + "grad_norm": 2.4783722356243065, + "language_loss": 0.76171732, + "learning_rate": 3.933993648197955e-06, + "loss": 0.78395414, + "num_input_tokens_seen": 39314625, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1875, + "step": 1822, + "time_per_iteration": 2.5196666717529297 + }, + { + "auxiliary_loss_clip": 0.01175357, + "auxiliary_loss_mlp": 0.01051738, + "balance_loss_clip": 1.03070986, + "balance_loss_mlp": 1.05751145, + "epoch": 0.1096046896137081, + "flos": 33620934372480.0, + "grad_norm": 1.9066217775511896, + "language_loss": 0.79275787, + "learning_rate": 3.933894381201034e-06, + "loss": 0.81502879, + "num_input_tokens_seen": 39336465, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1796875, + "step": 1823, + "time_per_iteration": 2.5665249824523926 + }, + { + "auxiliary_loss_clip": 0.01176588, + "auxiliary_loss_mlp": 0.01047872, + "balance_loss_clip": 1.02583015, + "balance_loss_mlp": 1.05788529, + "epoch": 0.10966481286637607, + "flos": 26980010219520.0, + "grad_norm": 1.7066251744315906, + "language_loss": 0.79424715, + "learning_rate": 3.933795040870645e-06, + "loss": 0.81649172, + "num_input_tokens_seen": 39357930, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1824, + "time_per_iteration": 2.5238118171691895 + }, + { + "auxiliary_loss_clip": 0.01173809, + "auxiliary_loss_mlp": 0.01054255, + "balance_loss_clip": 1.03264284, + "balance_loss_mlp": 1.05610347, + "epoch": 0.10972493611904403, + "flos": 23036551678080.0, + "grad_norm": 2.2183246130345, + "language_loss": 0.87992203, + "learning_rate": 3.933695627210554e-06, + "loss": 0.90220273, + "num_input_tokens_seen": 39376380, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1825, + "time_per_iteration": 2.48294734954834 + }, + { + "auxiliary_loss_clip": 0.01171104, + "auxiliary_loss_mlp": 0.01056568, + "balance_loss_clip": 1.03483629, + "balance_loss_mlp": 1.05362988, + "epoch": 0.10978505937171201, + "flos": 38104632443520.0, + "grad_norm": 1.8404731426595848, + "language_loss": 0.76462233, + "learning_rate": 3.933596140224532e-06, + "loss": 0.78689909, + "num_input_tokens_seen": 39399935, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 1826, + "time_per_iteration": 2.6397035121917725 + }, + { + "auxiliary_loss_clip": 0.01066703, + "auxiliary_loss_mlp": 0.01001412, + "balance_loss_clip": 0.9983961, + "balance_loss_mlp": 1.02257371, + "epoch": 0.10984518262437998, + "flos": 59849694616320.0, + "grad_norm": 0.8361632453995619, + "language_loss": 0.54999328, + "learning_rate": 3.93349657991635e-06, + "loss": 0.57067442, + "num_input_tokens_seen": 39460685, + "router_z_loss_clip": 0.03015137, + "router_z_loss_mlp": 0.44140625, + "step": 1827, + "time_per_iteration": 3.065896511077881 + }, + { + "auxiliary_loss_clip": 0.01064494, + "auxiliary_loss_mlp": 0.01003719, + "balance_loss_clip": 1.00082231, + "balance_loss_mlp": 1.02098036, + "epoch": 0.10990530587704794, + "flos": 66719837410560.0, + "grad_norm": 0.7348311418426204, + "language_loss": 0.55346334, + "learning_rate": 3.933396946289784e-06, + "loss": 0.57414544, + "num_input_tokens_seen": 39524765, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.43359375, + "step": 1828, + "time_per_iteration": 3.0850460529327393 + }, + { + "auxiliary_loss_clip": 0.01180205, + "auxiliary_loss_mlp": 0.01063699, + "balance_loss_clip": 1.03967869, + "balance_loss_mlp": 1.05754089, + "epoch": 0.10996542912971592, + "flos": 25447199189760.0, + "grad_norm": 2.992065013624077, + "language_loss": 0.84191215, + "learning_rate": 3.933297239348612e-06, + "loss": 0.86435115, + "num_input_tokens_seen": 39543640, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2265625, + "step": 1829, + "time_per_iteration": 2.5398643016815186 + }, + { + "auxiliary_loss_clip": 0.01181422, + "auxiliary_loss_mlp": 0.01057367, + "balance_loss_clip": 1.03348923, + "balance_loss_mlp": 1.05845475, + "epoch": 0.11002555238238389, + "flos": 44018186186880.0, + "grad_norm": 2.654516298718269, + "language_loss": 0.8878119, + "learning_rate": 3.933197459096614e-06, + "loss": 0.91019976, + "num_input_tokens_seen": 39567525, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2265625, + "step": 1830, + "time_per_iteration": 2.6912100315093994 + }, + { + "auxiliary_loss_clip": 0.01061018, + "auxiliary_loss_mlp": 0.01017752, + "balance_loss_clip": 1.01497495, + "balance_loss_mlp": 1.01824236, + "epoch": 0.11008567563505185, + "flos": 54065133590400.0, + "grad_norm": 0.6883241829767079, + "language_loss": 0.55492055, + "learning_rate": 3.9330976055375756e-06, + "loss": 0.57570827, + "num_input_tokens_seen": 39628470, + "router_z_loss_clip": 0.02783203, + "router_z_loss_mlp": 0.42773438, + "step": 1831, + "time_per_iteration": 3.075678825378418 + }, + { + "auxiliary_loss_clip": 0.01183643, + "auxiliary_loss_mlp": 0.01072422, + "balance_loss_clip": 1.04829443, + "balance_loss_mlp": 1.05867732, + "epoch": 0.11014579888771983, + "flos": 24243150366720.0, + "grad_norm": 2.054835171188452, + "language_loss": 0.90726995, + "learning_rate": 3.932997678675282e-06, + "loss": 0.92983055, + "num_input_tokens_seen": 39646670, + "router_z_loss_clip": 0.24121094, + "router_z_loss_mlp": 1.25, + "step": 1832, + "time_per_iteration": 2.5084948539733887 + }, + { + "auxiliary_loss_clip": 0.01058943, + "auxiliary_loss_mlp": 0.01015171, + "balance_loss_clip": 1.01245296, + "balance_loss_mlp": 1.01603723, + "epoch": 0.1102059221403878, + "flos": 57743965658880.0, + "grad_norm": 0.7159549093535102, + "language_loss": 0.59889859, + "learning_rate": 3.932897678513523e-06, + "loss": 0.61963969, + "num_input_tokens_seen": 39712915, + "router_z_loss_clip": 0.02722168, + "router_z_loss_mlp": 0.4296875, + "step": 1833, + "time_per_iteration": 3.0748977661132812 + }, + { + "auxiliary_loss_clip": 0.01175273, + "auxiliary_loss_mlp": 0.01050301, + "balance_loss_clip": 1.0277946, + "balance_loss_mlp": 1.05353165, + "epoch": 0.11026604539305576, + "flos": 16795923667200.0, + "grad_norm": 2.6030857455850303, + "language_loss": 0.8095156, + "learning_rate": 3.93279760505609e-06, + "loss": 0.83177137, + "num_input_tokens_seen": 39730650, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1834, + "time_per_iteration": 2.452131509780884 + }, + { + "auxiliary_loss_clip": 0.01179876, + "auxiliary_loss_mlp": 0.0105407, + "balance_loss_clip": 1.0302285, + "balance_loss_mlp": 1.05899858, + "epoch": 0.11032616864572373, + "flos": 23988076911360.0, + "grad_norm": 2.5262438386564807, + "language_loss": 0.90514123, + "learning_rate": 3.932697458306779e-06, + "loss": 0.9274807, + "num_input_tokens_seen": 39751065, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.2109375, + "step": 1835, + "time_per_iteration": 2.5261363983154297 + }, + { + "auxiliary_loss_clip": 0.01177237, + "auxiliary_loss_mlp": 0.0105853, + "balance_loss_clip": 1.03445005, + "balance_loss_mlp": 1.05625033, + "epoch": 0.1103862918983917, + "flos": 19683141851520.0, + "grad_norm": 2.0785934228774003, + "language_loss": 0.63590646, + "learning_rate": 3.932597238269386e-06, + "loss": 0.65826416, + "num_input_tokens_seen": 39769245, + "router_z_loss_clip": 0.24023438, + "router_z_loss_mlp": 1.2109375, + "step": 1836, + "time_per_iteration": 2.502586603164673 + }, + { + "auxiliary_loss_clip": 0.01173672, + "auxiliary_loss_mlp": 0.01057372, + "balance_loss_clip": 1.03547311, + "balance_loss_mlp": 1.05388379, + "epoch": 0.11044641515105967, + "flos": 32160878340480.0, + "grad_norm": 1.9330421575083043, + "language_loss": 0.72814602, + "learning_rate": 3.932496944947711e-06, + "loss": 0.75045645, + "num_input_tokens_seen": 39790830, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1953125, + "step": 1837, + "time_per_iteration": 2.594910144805908 + }, + { + "auxiliary_loss_clip": 0.01179947, + "auxiliary_loss_mlp": 0.0105928, + "balance_loss_clip": 1.03733349, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11050653840372764, + "flos": 16689233295360.0, + "grad_norm": 2.132041599419941, + "language_loss": 0.79049784, + "learning_rate": 3.93239657834556e-06, + "loss": 0.81289005, + "num_input_tokens_seen": 39809475, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1838, + "time_per_iteration": 2.4922690391540527 + }, + { + "auxiliary_loss_clip": 0.01175098, + "auxiliary_loss_mlp": 0.01061476, + "balance_loss_clip": 1.03883791, + "balance_loss_mlp": 1.05623114, + "epoch": 0.11056666165639562, + "flos": 21208877902080.0, + "grad_norm": 4.130442583787946, + "language_loss": 0.71453696, + "learning_rate": 3.932296138466736e-06, + "loss": 0.73690271, + "num_input_tokens_seen": 39826355, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1839, + "time_per_iteration": 2.5151031017303467 + }, + { + "auxiliary_loss_clip": 0.01183988, + "auxiliary_loss_mlp": 0.01053903, + "balance_loss_clip": 1.03082371, + "balance_loss_mlp": 1.05938148, + "epoch": 0.11062678490906358, + "flos": 19165488998400.0, + "grad_norm": 2.064820600929851, + "language_loss": 0.79099703, + "learning_rate": 3.93219562531505e-06, + "loss": 0.81337595, + "num_input_tokens_seen": 39845335, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.25, + "step": 1840, + "time_per_iteration": 2.487116575241089 + }, + { + "auxiliary_loss_clip": 0.01171241, + "auxiliary_loss_mlp": 0.01053863, + "balance_loss_clip": 1.03234553, + "balance_loss_mlp": 1.05329347, + "epoch": 0.11068690816173155, + "flos": 24895287740160.0, + "grad_norm": 2.0204098875762293, + "language_loss": 0.87691998, + "learning_rate": 3.932095038894311e-06, + "loss": 0.89917111, + "num_input_tokens_seen": 39865065, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1796875, + "step": 1841, + "time_per_iteration": 2.5141868591308594 + }, + { + "auxiliary_loss_clip": 0.01170262, + "auxiliary_loss_mlp": 0.01053518, + "balance_loss_clip": 1.03126192, + "balance_loss_mlp": 1.05365491, + "epoch": 0.11074703141439952, + "flos": 16472368932480.0, + "grad_norm": 2.3404569451138535, + "language_loss": 0.90582979, + "learning_rate": 3.931994379208334e-06, + "loss": 0.92806768, + "num_input_tokens_seen": 39882780, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1842, + "time_per_iteration": 2.4583773612976074 + }, + { + "auxiliary_loss_clip": 0.0117179, + "auxiliary_loss_mlp": 0.0105155, + "balance_loss_clip": 1.03080761, + "balance_loss_mlp": 1.05210185, + "epoch": 0.11080715466706749, + "flos": 19172420323200.0, + "grad_norm": 2.171204868901281, + "language_loss": 0.85597986, + "learning_rate": 3.931893646260937e-06, + "loss": 0.87821329, + "num_input_tokens_seen": 39900295, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1953125, + "step": 1843, + "time_per_iteration": 2.4801278114318848 + }, + { + "auxiliary_loss_clip": 0.01174004, + "auxiliary_loss_mlp": 0.01060021, + "balance_loss_clip": 1.03645349, + "balance_loss_mlp": 1.05622911, + "epoch": 0.11086727791973545, + "flos": 27704687109120.0, + "grad_norm": 1.47825888700324, + "language_loss": 0.7494424, + "learning_rate": 3.931792840055941e-06, + "loss": 0.77178264, + "num_input_tokens_seen": 39922075, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1796875, + "step": 1844, + "time_per_iteration": 2.526383876800537 + }, + { + "auxiliary_loss_clip": 0.01173241, + "auxiliary_loss_mlp": 0.01054334, + "balance_loss_clip": 1.0304563, + "balance_loss_mlp": 1.05405343, + "epoch": 0.11092740117240343, + "flos": 18514967736960.0, + "grad_norm": 2.0036363505702433, + "language_loss": 0.75732028, + "learning_rate": 3.931691960597165e-06, + "loss": 0.77959603, + "num_input_tokens_seen": 39940115, + "router_z_loss_clip": 0.23925781, + "router_z_loss_mlp": 1.1953125, + "step": 1845, + "time_per_iteration": 2.463327169418335 + }, + { + "auxiliary_loss_clip": 0.01171011, + "auxiliary_loss_mlp": 0.01054645, + "balance_loss_clip": 1.03341389, + "balance_loss_mlp": 1.05351365, + "epoch": 0.1109875244250714, + "flos": 20522446018560.0, + "grad_norm": 1.6129010657048202, + "language_loss": 0.76336479, + "learning_rate": 3.9315910078884375e-06, + "loss": 0.7856214, + "num_input_tokens_seen": 39959920, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.171875, + "step": 1846, + "time_per_iteration": 2.465045928955078 + }, + { + "auxiliary_loss_clip": 0.01175917, + "auxiliary_loss_mlp": 0.01053852, + "balance_loss_clip": 1.03262115, + "balance_loss_mlp": 1.05392015, + "epoch": 0.11104764767773936, + "flos": 14098601710080.0, + "grad_norm": 2.9965527726637577, + "language_loss": 0.85611343, + "learning_rate": 3.931489981933584e-06, + "loss": 0.87841111, + "num_input_tokens_seen": 39974755, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.21875, + "step": 1847, + "time_per_iteration": 2.4493908882141113 + }, + { + "auxiliary_loss_clip": 0.01174539, + "auxiliary_loss_mlp": 0.01053148, + "balance_loss_clip": 1.03018796, + "balance_loss_mlp": 1.05326366, + "epoch": 0.11110777093040733, + "flos": 20594518657920.0, + "grad_norm": 3.3740806549350086, + "language_loss": 0.76464605, + "learning_rate": 3.931388882736438e-06, + "loss": 0.78692293, + "num_input_tokens_seen": 39993355, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.2109375, + "step": 1848, + "time_per_iteration": 2.4647536277770996 + }, + { + "auxiliary_loss_clip": 0.01173713, + "auxiliary_loss_mlp": 0.01048417, + "balance_loss_clip": 1.02754378, + "balance_loss_mlp": 1.05833888, + "epoch": 0.11116789418307531, + "flos": 21870065502720.0, + "grad_norm": 2.0750561163348173, + "language_loss": 0.77849847, + "learning_rate": 3.931287710300832e-06, + "loss": 0.8007198, + "num_input_tokens_seen": 40012410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1849, + "time_per_iteration": 2.514777660369873 + }, + { + "auxiliary_loss_clip": 0.01176001, + "auxiliary_loss_mlp": 0.01056414, + "balance_loss_clip": 1.03496861, + "balance_loss_mlp": 1.05422294, + "epoch": 0.11122801743574327, + "flos": 15523106256000.0, + "grad_norm": 3.6662643697478066, + "language_loss": 0.71315688, + "learning_rate": 3.931186464630601e-06, + "loss": 0.73548102, + "num_input_tokens_seen": 40029315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1850, + "time_per_iteration": 2.4420053958892822 + }, + { + "auxiliary_loss_clip": 0.01174469, + "auxiliary_loss_mlp": 0.01056777, + "balance_loss_clip": 1.03434181, + "balance_loss_mlp": 1.05444217, + "epoch": 0.11128814068841124, + "flos": 14392279307520.0, + "grad_norm": 2.2721050151861912, + "language_loss": 0.81174368, + "learning_rate": 3.931085145729588e-06, + "loss": 0.83405614, + "num_input_tokens_seen": 40045765, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 1851, + "time_per_iteration": 5.341679811477661 + }, + { + "auxiliary_loss_clip": 0.01173679, + "auxiliary_loss_mlp": 0.01051699, + "balance_loss_clip": 1.03126621, + "balance_loss_mlp": 1.05519962, + "epoch": 0.11134826394107922, + "flos": 16653933204480.0, + "grad_norm": 3.240427658931177, + "language_loss": 0.88860446, + "learning_rate": 3.930983753601631e-06, + "loss": 0.91085827, + "num_input_tokens_seen": 40061660, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1875, + "step": 1852, + "time_per_iteration": 3.8281352519989014 + }, + { + "auxiliary_loss_clip": 0.01176515, + "auxiliary_loss_mlp": 0.01057817, + "balance_loss_clip": 1.03514326, + "balance_loss_mlp": 1.05636191, + "epoch": 0.11140838719374718, + "flos": 16690993061760.0, + "grad_norm": 2.0685366180695848, + "language_loss": 0.72092974, + "learning_rate": 3.930882288250578e-06, + "loss": 0.74327302, + "num_input_tokens_seen": 40080180, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.203125, + "step": 1853, + "time_per_iteration": 2.4896738529205322 + }, + { + "auxiliary_loss_clip": 0.01055744, + "auxiliary_loss_mlp": 0.01002079, + "balance_loss_clip": 0.99923038, + "balance_loss_mlp": 1.0132798, + "epoch": 0.11146851044641515, + "flos": 60976355587200.0, + "grad_norm": 0.7783537669608381, + "language_loss": 0.53647029, + "learning_rate": 3.930780749680273e-06, + "loss": 0.5570485, + "num_input_tokens_seen": 40138910, + "router_z_loss_clip": 0.02844238, + "router_z_loss_mlp": 0.42578125, + "step": 1854, + "time_per_iteration": 3.0189781188964844 + }, + { + "auxiliary_loss_clip": 0.01184355, + "auxiliary_loss_mlp": 0.01052254, + "balance_loss_clip": 1.02937746, + "balance_loss_mlp": 1.057657, + "epoch": 0.11152863369908313, + "flos": 22193835719040.0, + "grad_norm": 2.006296213399466, + "language_loss": 0.8394689, + "learning_rate": 3.9306791378945705e-06, + "loss": 0.861835, + "num_input_tokens_seen": 40157745, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.265625, + "step": 1855, + "time_per_iteration": 2.4908485412597656 + }, + { + "auxiliary_loss_clip": 0.01173641, + "auxiliary_loss_mlp": 0.0106694, + "balance_loss_clip": 1.04588723, + "balance_loss_mlp": 1.05353498, + "epoch": 0.11158875695175109, + "flos": 19537524115200.0, + "grad_norm": 2.2091175797191815, + "language_loss": 0.82098675, + "learning_rate": 3.9305774528973205e-06, + "loss": 0.84339261, + "num_input_tokens_seen": 40175375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.203125, + "step": 1856, + "time_per_iteration": 2.4653480052948 + }, + { + "auxiliary_loss_clip": 0.01173222, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.02631092, + "balance_loss_mlp": 1.05662763, + "epoch": 0.11164888020441906, + "flos": 25442709989760.0, + "grad_norm": 2.9605277294776, + "language_loss": 0.8305279, + "learning_rate": 3.93047569469238e-06, + "loss": 0.85274535, + "num_input_tokens_seen": 40195715, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1640625, + "step": 1857, + "time_per_iteration": 2.5205907821655273 + }, + { + "auxiliary_loss_clip": 0.01173614, + "auxiliary_loss_mlp": 0.01049346, + "balance_loss_clip": 1.0279119, + "balance_loss_mlp": 1.05195725, + "epoch": 0.11170900345708702, + "flos": 15632741543040.0, + "grad_norm": 2.3309612964817923, + "language_loss": 0.83037764, + "learning_rate": 3.930373863283608e-06, + "loss": 0.85260725, + "num_input_tokens_seen": 40213975, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.21875, + "step": 1858, + "time_per_iteration": 2.4567432403564453 + }, + { + "auxiliary_loss_clip": 0.01175678, + "auxiliary_loss_mlp": 0.01062921, + "balance_loss_clip": 1.04205894, + "balance_loss_mlp": 1.05549788, + "epoch": 0.111769126709755, + "flos": 23039424766080.0, + "grad_norm": 2.004830650729854, + "language_loss": 0.91120583, + "learning_rate": 3.930271958674866e-06, + "loss": 0.93359184, + "num_input_tokens_seen": 40233905, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.203125, + "step": 1859, + "time_per_iteration": 2.532048463821411 + }, + { + "auxiliary_loss_clip": 0.01173939, + "auxiliary_loss_mlp": 0.0105127, + "balance_loss_clip": 1.02983618, + "balance_loss_mlp": 1.05344319, + "epoch": 0.11182924996242297, + "flos": 20850705434880.0, + "grad_norm": 2.4768392741235306, + "language_loss": 0.81709313, + "learning_rate": 3.930169980870018e-06, + "loss": 0.83934522, + "num_input_tokens_seen": 40252810, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.203125, + "step": 1860, + "time_per_iteration": 2.4747087955474854 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01056481, + "balance_loss_clip": 1.0361197, + "balance_loss_mlp": 1.05388653, + "epoch": 0.11188937321509093, + "flos": 17455315587840.0, + "grad_norm": 2.1256274007234937, + "language_loss": 0.75203162, + "learning_rate": 3.930067929872931e-06, + "loss": 0.77430284, + "num_input_tokens_seen": 40272000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1861, + "time_per_iteration": 2.4773240089416504 + }, + { + "auxiliary_loss_clip": 0.01169857, + "auxiliary_loss_mlp": 0.01052708, + "balance_loss_clip": 1.03318143, + "balance_loss_mlp": 1.05338371, + "epoch": 0.11194949646775891, + "flos": 24095916518400.0, + "grad_norm": 2.0016824982414776, + "language_loss": 0.88759935, + "learning_rate": 3.929965805687474e-06, + "loss": 0.90982509, + "num_input_tokens_seen": 40290660, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1640625, + "step": 1862, + "time_per_iteration": 2.4750735759735107 + }, + { + "auxiliary_loss_clip": 0.01173358, + "auxiliary_loss_mlp": 0.01059619, + "balance_loss_clip": 1.03880525, + "balance_loss_mlp": 1.05597067, + "epoch": 0.11200961972042688, + "flos": 25153880728320.0, + "grad_norm": 2.1858127473987015, + "language_loss": 0.8707, + "learning_rate": 3.92986360831752e-06, + "loss": 0.89302975, + "num_input_tokens_seen": 40307820, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 1863, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01173984, + "auxiliary_loss_mlp": 0.01051873, + "balance_loss_clip": 1.0283289, + "balance_loss_mlp": 1.05463171, + "epoch": 0.11206974297309484, + "flos": 21288312829440.0, + "grad_norm": 2.0887108243102976, + "language_loss": 0.64630157, + "learning_rate": 3.929761337766945e-06, + "loss": 0.66856015, + "num_input_tokens_seen": 40327430, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.1953125, + "step": 1864, + "time_per_iteration": 2.4843807220458984 + }, + { + "auxiliary_loss_clip": 0.01171142, + "auxiliary_loss_mlp": 0.01051015, + "balance_loss_clip": 1.03169096, + "balance_loss_mlp": 1.05504417, + "epoch": 0.11212986622576282, + "flos": 18915982151040.0, + "grad_norm": 2.0715232833306874, + "language_loss": 0.73895639, + "learning_rate": 3.929658994039627e-06, + "loss": 0.76117796, + "num_input_tokens_seen": 40344545, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.15625, + "step": 1865, + "time_per_iteration": 2.4509596824645996 + }, + { + "auxiliary_loss_clip": 0.01169998, + "auxiliary_loss_mlp": 0.01051954, + "balance_loss_clip": 1.02928007, + "balance_loss_mlp": 1.05253589, + "epoch": 0.11218998947843078, + "flos": 22054754257920.0, + "grad_norm": 2.190736679244475, + "language_loss": 0.84019023, + "learning_rate": 3.929556577139446e-06, + "loss": 0.86240977, + "num_input_tokens_seen": 40362300, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 1866, + "time_per_iteration": 2.473715305328369 + }, + { + "auxiliary_loss_clip": 0.01169711, + "auxiliary_loss_mlp": 0.01048459, + "balance_loss_clip": 1.02737069, + "balance_loss_mlp": 1.05260134, + "epoch": 0.11225011273109875, + "flos": 24571697091840.0, + "grad_norm": 1.5419857436109028, + "language_loss": 0.81424987, + "learning_rate": 3.929454087070286e-06, + "loss": 0.83643156, + "num_input_tokens_seen": 40384720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.171875, + "step": 1867, + "time_per_iteration": 2.5367391109466553 + }, + { + "auxiliary_loss_clip": 0.01172987, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03473496, + "balance_loss_mlp": 1.05594015, + "epoch": 0.11231023598376672, + "flos": 28438665621120.0, + "grad_norm": 2.5308159777425976, + "language_loss": 0.86677599, + "learning_rate": 3.929351523836035e-06, + "loss": 0.88905597, + "num_input_tokens_seen": 40404000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.171875, + "step": 1868, + "time_per_iteration": 2.5044100284576416 + }, + { + "auxiliary_loss_clip": 0.01172172, + "auxiliary_loss_mlp": 0.01049736, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.05724931, + "epoch": 0.1123703592364347, + "flos": 14426466076800.0, + "grad_norm": 2.333499600894065, + "language_loss": 0.68059367, + "learning_rate": 3.9292488874405795e-06, + "loss": 0.70281279, + "num_input_tokens_seen": 40418665, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.1484375, + "step": 1869, + "time_per_iteration": 2.4462318420410156 + }, + { + "auxiliary_loss_clip": 0.01176659, + "auxiliary_loss_mlp": 0.01061629, + "balance_loss_clip": 1.03969407, + "balance_loss_mlp": 1.05456114, + "epoch": 0.11243048248910266, + "flos": 22236282616320.0, + "grad_norm": 2.049754856307833, + "language_loss": 0.7735095, + "learning_rate": 3.929146177887814e-06, + "loss": 0.79589236, + "num_input_tokens_seen": 40437870, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.21875, + "step": 1870, + "time_per_iteration": 2.488471031188965 + }, + { + "auxiliary_loss_clip": 0.01174025, + "auxiliary_loss_mlp": 0.01053264, + "balance_loss_clip": 1.03177094, + "balance_loss_mlp": 1.05264199, + "epoch": 0.11249060574177062, + "flos": 18584167288320.0, + "grad_norm": 1.8085683914823212, + "language_loss": 0.75747174, + "learning_rate": 3.929043395181631e-06, + "loss": 0.77974463, + "num_input_tokens_seen": 40455570, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.21875, + "step": 1871, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01049389, + "balance_loss_clip": 1.02936232, + "balance_loss_mlp": 1.05448031, + "epoch": 0.1125507289944386, + "flos": 22856567604480.0, + "grad_norm": 2.4822417703451265, + "language_loss": 0.81949306, + "learning_rate": 3.928940539325929e-06, + "loss": 0.84170687, + "num_input_tokens_seen": 40473600, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.171875, + "step": 1872, + "time_per_iteration": 2.4984912872314453 + }, + { + "auxiliary_loss_clip": 0.01172929, + "auxiliary_loss_mlp": 0.01052146, + "balance_loss_clip": 1.03183281, + "balance_loss_mlp": 1.05497694, + "epoch": 0.11261085224710657, + "flos": 19676390094720.0, + "grad_norm": 2.7250665555581937, + "language_loss": 0.83564019, + "learning_rate": 3.9288376103246095e-06, + "loss": 0.85789096, + "num_input_tokens_seen": 40490025, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1796875, + "step": 1873, + "time_per_iteration": 2.480172872543335 + }, + { + "auxiliary_loss_clip": 0.01175617, + "auxiliary_loss_mlp": 0.01053305, + "balance_loss_clip": 1.03089404, + "balance_loss_mlp": 1.05352998, + "epoch": 0.11267097549977453, + "flos": 26063246373120.0, + "grad_norm": 2.2103217259008985, + "language_loss": 0.91925669, + "learning_rate": 3.928734608181575e-06, + "loss": 0.9415459, + "num_input_tokens_seen": 40511580, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.21875, + "step": 1874, + "time_per_iteration": 2.5193865299224854 + }, + { + "auxiliary_loss_clip": 0.01168968, + "auxiliary_loss_mlp": 0.01057524, + "balance_loss_clip": 1.0376637, + "balance_loss_mlp": 1.0528394, + "epoch": 0.11273109875244251, + "flos": 21068036674560.0, + "grad_norm": 1.5656160151577971, + "language_loss": 0.7534616, + "learning_rate": 3.928631532900729e-06, + "loss": 0.77572656, + "num_input_tokens_seen": 40530155, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.15625, + "step": 1875, + "time_per_iteration": 2.509000062942505 + }, + { + "auxiliary_loss_clip": 0.01168067, + "auxiliary_loss_mlp": 0.01054767, + "balance_loss_clip": 1.03545499, + "balance_loss_mlp": 1.05498421, + "epoch": 0.11279122200511048, + "flos": 27088999061760.0, + "grad_norm": 1.875753927893446, + "language_loss": 0.71727258, + "learning_rate": 3.928528384485984e-06, + "loss": 0.73950088, + "num_input_tokens_seen": 40549500, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1328125, + "step": 1876, + "time_per_iteration": 2.5222911834716797 + }, + { + "auxiliary_loss_clip": 0.01170022, + "auxiliary_loss_mlp": 0.01051226, + "balance_loss_clip": 1.03036463, + "balance_loss_mlp": 1.05574679, + "epoch": 0.11285134525777844, + "flos": 20187901722240.0, + "grad_norm": 2.408917627715415, + "language_loss": 0.76760256, + "learning_rate": 3.9284251629412475e-06, + "loss": 0.78981495, + "num_input_tokens_seen": 40567475, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 1877, + "time_per_iteration": 2.487870693206787 + }, + { + "auxiliary_loss_clip": 0.01173931, + "auxiliary_loss_mlp": 0.01057623, + "balance_loss_clip": 1.03555715, + "balance_loss_mlp": 1.05530918, + "epoch": 0.11291146851044641, + "flos": 12458453863680.0, + "grad_norm": 2.569804002246691, + "language_loss": 0.88132238, + "learning_rate": 3.928321868270436e-06, + "loss": 0.90363795, + "num_input_tokens_seen": 40583280, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1875, + "step": 1878, + "time_per_iteration": 2.4562089443206787 + }, + { + "auxiliary_loss_clip": 0.01171231, + "auxiliary_loss_mlp": 0.01046868, + "balance_loss_clip": 1.02628088, + "balance_loss_mlp": 1.05382609, + "epoch": 0.11297159176311439, + "flos": 23842315520640.0, + "grad_norm": 2.2792620862185036, + "language_loss": 0.81521666, + "learning_rate": 3.928218500477466e-06, + "loss": 0.83739763, + "num_input_tokens_seen": 40603080, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.171875, + "step": 1879, + "time_per_iteration": 2.515162944793701 + }, + { + "auxiliary_loss_clip": 0.01174903, + "auxiliary_loss_mlp": 0.01056113, + "balance_loss_clip": 1.03513217, + "balance_loss_mlp": 1.05591071, + "epoch": 0.11303171501578235, + "flos": 29930538124800.0, + "grad_norm": 2.9729184409385376, + "language_loss": 0.70101768, + "learning_rate": 3.928115059566259e-06, + "loss": 0.72332788, + "num_input_tokens_seen": 40623255, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1875, + "step": 1880, + "time_per_iteration": 2.5420267581939697 + }, + { + "auxiliary_loss_clip": 0.01169399, + "auxiliary_loss_mlp": 0.01045441, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.05396068, + "epoch": 0.11309183826845032, + "flos": 16180558842240.0, + "grad_norm": 1.7442831242084353, + "language_loss": 0.72337204, + "learning_rate": 3.928011545540734e-06, + "loss": 0.74552047, + "num_input_tokens_seen": 40641570, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1881, + "time_per_iteration": 2.4648680686950684 + }, + { + "auxiliary_loss_clip": 0.01172977, + "auxiliary_loss_mlp": 0.01057236, + "balance_loss_clip": 1.03452694, + "balance_loss_mlp": 1.05385113, + "epoch": 0.1131519615211183, + "flos": 12020702814720.0, + "grad_norm": 2.4452990726029533, + "language_loss": 0.74243963, + "learning_rate": 3.927907958404819e-06, + "loss": 0.76474178, + "num_input_tokens_seen": 40658775, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1953125, + "step": 1882, + "time_per_iteration": 2.459181547164917 + }, + { + "auxiliary_loss_clip": 0.01171271, + "auxiliary_loss_mlp": 0.01052266, + "balance_loss_clip": 1.03045106, + "balance_loss_mlp": 1.05493677, + "epoch": 0.11321208477378626, + "flos": 26250125857920.0, + "grad_norm": 2.8641228673356873, + "language_loss": 0.79328096, + "learning_rate": 3.92780429816244e-06, + "loss": 0.81551635, + "num_input_tokens_seen": 40679555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 1883, + "time_per_iteration": 2.5236945152282715 + }, + { + "auxiliary_loss_clip": 0.01173507, + "auxiliary_loss_mlp": 0.01054538, + "balance_loss_clip": 1.03271067, + "balance_loss_mlp": 1.05288672, + "epoch": 0.11327220802645423, + "flos": 13626376583040.0, + "grad_norm": 3.0524763398538193, + "language_loss": 0.77151698, + "learning_rate": 3.927700564817529e-06, + "loss": 0.79379749, + "num_input_tokens_seen": 40697295, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.203125, + "step": 1884, + "time_per_iteration": 2.4974489212036133 + }, + { + "auxiliary_loss_clip": 0.01063959, + "auxiliary_loss_mlp": 0.01012749, + "balance_loss_clip": 1.0102694, + "balance_loss_mlp": 1.02156711, + "epoch": 0.1133323312791222, + "flos": 57191802814080.0, + "grad_norm": 0.7928734254501784, + "language_loss": 0.55183071, + "learning_rate": 3.927596758374019e-06, + "loss": 0.5725978, + "num_input_tokens_seen": 40758095, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.42382812, + "step": 1885, + "time_per_iteration": 2.9756290912628174 + }, + { + "auxiliary_loss_clip": 0.01166272, + "auxiliary_loss_mlp": 0.01047922, + "balance_loss_clip": 1.02758515, + "balance_loss_mlp": 1.0534817, + "epoch": 0.11339245453179017, + "flos": 24351708245760.0, + "grad_norm": 5.752063942495911, + "language_loss": 0.90240276, + "learning_rate": 3.927492878835848e-06, + "loss": 0.92454469, + "num_input_tokens_seen": 40777140, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 1886, + "time_per_iteration": 2.5031139850616455 + }, + { + "auxiliary_loss_clip": 0.01168969, + "auxiliary_loss_mlp": 0.01051145, + "balance_loss_clip": 1.03018832, + "balance_loss_mlp": 1.05306387, + "epoch": 0.11345257778445814, + "flos": 22670693700480.0, + "grad_norm": 2.0267704425546036, + "language_loss": 0.85101235, + "learning_rate": 3.927388926206953e-06, + "loss": 0.87321353, + "num_input_tokens_seen": 40797505, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1887, + "time_per_iteration": 2.5177412033081055 + }, + { + "auxiliary_loss_clip": 0.01172698, + "auxiliary_loss_mlp": 0.01061982, + "balance_loss_clip": 1.0417881, + "balance_loss_mlp": 1.05554259, + "epoch": 0.11351270103712612, + "flos": 20988242611200.0, + "grad_norm": 5.5783153731033055, + "language_loss": 0.76168925, + "learning_rate": 3.927284900491277e-06, + "loss": 0.78403604, + "num_input_tokens_seen": 40812970, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1888, + "time_per_iteration": 2.4965853691101074 + }, + { + "auxiliary_loss_clip": 0.01177743, + "auxiliary_loss_mlp": 0.01057849, + "balance_loss_clip": 1.03542566, + "balance_loss_mlp": 1.05632472, + "epoch": 0.11357282428979408, + "flos": 37347923600640.0, + "grad_norm": 2.114301103868513, + "language_loss": 0.68039739, + "learning_rate": 3.927180801692764e-06, + "loss": 0.70275331, + "num_input_tokens_seen": 40837745, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.21875, + "step": 1889, + "time_per_iteration": 2.643867015838623 + }, + { + "auxiliary_loss_clip": 0.01172679, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.02611947, + "balance_loss_mlp": 1.05620956, + "epoch": 0.11363294754246205, + "flos": 21757018423680.0, + "grad_norm": 2.158184033346157, + "language_loss": 0.84414917, + "learning_rate": 3.927076629815362e-06, + "loss": 0.86635208, + "num_input_tokens_seen": 40856490, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 1890, + "time_per_iteration": 2.5018270015716553 + }, + { + "auxiliary_loss_clip": 0.01168344, + "auxiliary_loss_mlp": 0.01050115, + "balance_loss_clip": 1.02855039, + "balance_loss_mlp": 1.05288363, + "epoch": 0.11369307079513001, + "flos": 22601637803520.0, + "grad_norm": 2.2859967152973373, + "language_loss": 0.65099049, + "learning_rate": 3.926972384863022e-06, + "loss": 0.67317504, + "num_input_tokens_seen": 40874070, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 1891, + "time_per_iteration": 2.4870762825012207 + }, + { + "auxiliary_loss_clip": 0.01173219, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.05397856, + "epoch": 0.11375319404779799, + "flos": 21944257044480.0, + "grad_norm": 2.358390081637715, + "language_loss": 0.87789619, + "learning_rate": 3.9268680668396956e-06, + "loss": 0.90005904, + "num_input_tokens_seen": 40892425, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1953125, + "step": 1892, + "time_per_iteration": 2.469215154647827 + }, + { + "auxiliary_loss_clip": 0.01173439, + "auxiliary_loss_mlp": 0.01066287, + "balance_loss_clip": 1.04509139, + "balance_loss_mlp": 1.05419993, + "epoch": 0.11381331730046595, + "flos": 26395456285440.0, + "grad_norm": 2.4185703679999775, + "language_loss": 0.72724342, + "learning_rate": 3.926763675749339e-06, + "loss": 0.7496407, + "num_input_tokens_seen": 40912190, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 1893, + "time_per_iteration": 4.021688222885132 + }, + { + "auxiliary_loss_clip": 0.01169367, + "auxiliary_loss_mlp": 0.0105827, + "balance_loss_clip": 1.03531051, + "balance_loss_mlp": 1.05175805, + "epoch": 0.11387344055313392, + "flos": 23804716959360.0, + "grad_norm": 2.254020248775613, + "language_loss": 0.79367435, + "learning_rate": 3.92665921159591e-06, + "loss": 0.81595069, + "num_input_tokens_seen": 40928395, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.171875, + "step": 1894, + "time_per_iteration": 3.9190711975097656 + }, + { + "auxiliary_loss_clip": 0.01176791, + "auxiliary_loss_mlp": 0.01052535, + "balance_loss_clip": 1.03074312, + "balance_loss_mlp": 1.05530715, + "epoch": 0.1139335638058019, + "flos": 34522865902080.0, + "grad_norm": 2.587114905294773, + "language_loss": 0.78868139, + "learning_rate": 3.926554674383371e-06, + "loss": 0.81097472, + "num_input_tokens_seen": 40946555, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.21875, + "step": 1895, + "time_per_iteration": 2.5924861431121826 + }, + { + "auxiliary_loss_clip": 0.0106161, + "auxiliary_loss_mlp": 0.01001633, + "balance_loss_clip": 0.99917758, + "balance_loss_mlp": 1.01840448, + "epoch": 0.11399368705846986, + "flos": 70587811520640.0, + "grad_norm": 0.8005582337036792, + "language_loss": 0.63316774, + "learning_rate": 3.926450064115686e-06, + "loss": 0.65380025, + "num_input_tokens_seen": 41004910, + "router_z_loss_clip": 0.02453613, + "router_z_loss_mlp": 0.43359375, + "step": 1896, + "time_per_iteration": 3.143843412399292 + }, + { + "auxiliary_loss_clip": 0.01170086, + "auxiliary_loss_mlp": 0.01059473, + "balance_loss_clip": 1.03600097, + "balance_loss_mlp": 1.05385494, + "epoch": 0.11405381031113783, + "flos": 21324259365120.0, + "grad_norm": 1.6058527618620146, + "language_loss": 0.84707338, + "learning_rate": 3.926345380796821e-06, + "loss": 0.86936897, + "num_input_tokens_seen": 41026385, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.15625, + "step": 1897, + "time_per_iteration": 2.5120036602020264 + }, + { + "auxiliary_loss_clip": 0.0117262, + "auxiliary_loss_mlp": 0.01053072, + "balance_loss_clip": 1.03159046, + "balance_loss_mlp": 1.05385423, + "epoch": 0.11411393356380581, + "flos": 19719627091200.0, + "grad_norm": 2.3286063431421926, + "language_loss": 0.79776239, + "learning_rate": 3.9262406244307465e-06, + "loss": 0.8200193, + "num_input_tokens_seen": 41045315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1875, + "step": 1898, + "time_per_iteration": 2.5186216831207275 + }, + { + "auxiliary_loss_clip": 0.01174476, + "auxiliary_loss_mlp": 0.01056562, + "balance_loss_clip": 1.03330398, + "balance_loss_mlp": 1.05247831, + "epoch": 0.11417405681647377, + "flos": 17530440883200.0, + "grad_norm": 1.996095488823442, + "language_loss": 0.73049861, + "learning_rate": 3.926135795021435e-06, + "loss": 0.75280899, + "num_input_tokens_seen": 41063390, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.21875, + "step": 1899, + "time_per_iteration": 2.484767198562622 + }, + { + "auxiliary_loss_clip": 0.01059885, + "auxiliary_loss_mlp": 0.01004452, + "balance_loss_clip": 1.00197208, + "balance_loss_mlp": 1.0167762, + "epoch": 0.11423418006914174, + "flos": 59674666619520.0, + "grad_norm": 0.9092154832512579, + "language_loss": 0.63432097, + "learning_rate": 3.92603089257286e-06, + "loss": 0.65496433, + "num_input_tokens_seen": 41124180, + "router_z_loss_clip": 0.02478027, + "router_z_loss_mlp": 0.4296875, + "step": 1900, + "time_per_iteration": 3.0239956378936768 + }, + { + "auxiliary_loss_clip": 0.0117026, + "auxiliary_loss_mlp": 0.01058021, + "balance_loss_clip": 1.03600276, + "balance_loss_mlp": 1.05181098, + "epoch": 0.1142943033218097, + "flos": 22963114321920.0, + "grad_norm": 1.6715138036124124, + "language_loss": 0.78116465, + "learning_rate": 3.925925917089001e-06, + "loss": 0.80344748, + "num_input_tokens_seen": 41143485, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.1875, + "step": 1901, + "time_per_iteration": 2.5007457733154297 + }, + { + "auxiliary_loss_clip": 0.01172182, + "auxiliary_loss_mlp": 0.01059819, + "balance_loss_clip": 1.03894591, + "balance_loss_mlp": 1.05482793, + "epoch": 0.11435442657447768, + "flos": 18256267008000.0, + "grad_norm": 1.9023337273707566, + "language_loss": 0.83676988, + "learning_rate": 3.925820868573839e-06, + "loss": 0.85908997, + "num_input_tokens_seen": 41161695, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.171875, + "step": 1902, + "time_per_iteration": 2.4389002323150635 + }, + { + "auxiliary_loss_clip": 0.0117356, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05356252, + "epoch": 0.11441454982714565, + "flos": 24061191045120.0, + "grad_norm": 1.6958297254772137, + "language_loss": 0.77551281, + "learning_rate": 3.925715747031356e-06, + "loss": 0.79775804, + "num_input_tokens_seen": 41181715, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1903, + "time_per_iteration": 2.503164768218994 + }, + { + "auxiliary_loss_clip": 0.01171838, + "auxiliary_loss_mlp": 0.01045456, + "balance_loss_clip": 1.02651334, + "balance_loss_mlp": 1.05437744, + "epoch": 0.11447467307981361, + "flos": 25337707557120.0, + "grad_norm": 2.553861289811236, + "language_loss": 0.75704938, + "learning_rate": 3.925610552465539e-06, + "loss": 0.77922231, + "num_input_tokens_seen": 41201770, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.171875, + "step": 1904, + "time_per_iteration": 2.5097854137420654 + }, + { + "auxiliary_loss_clip": 0.01171595, + "auxiliary_loss_mlp": 0.01053755, + "balance_loss_clip": 1.03192747, + "balance_loss_mlp": 1.05519056, + "epoch": 0.11453479633248159, + "flos": 21726063878400.0, + "grad_norm": 2.146045336495955, + "language_loss": 0.92476678, + "learning_rate": 3.9255052848803764e-06, + "loss": 0.94702017, + "num_input_tokens_seen": 41220590, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1905, + "time_per_iteration": 2.4905850887298584 + }, + { + "auxiliary_loss_clip": 0.0117632, + "auxiliary_loss_mlp": 0.01050773, + "balance_loss_clip": 1.02755141, + "balance_loss_mlp": 1.0496794, + "epoch": 0.11459491958514956, + "flos": 12969714096000.0, + "grad_norm": 2.457773566764277, + "language_loss": 0.77108872, + "learning_rate": 3.925399944279861e-06, + "loss": 0.7933597, + "num_input_tokens_seen": 41237250, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.265625, + "step": 1906, + "time_per_iteration": 2.4469265937805176 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.01053097, + "balance_loss_clip": 1.03072143, + "balance_loss_mlp": 1.05375302, + "epoch": 0.11465504283781752, + "flos": 22711273090560.0, + "grad_norm": 2.4555636334810593, + "language_loss": 0.81855345, + "learning_rate": 3.925294530667986e-06, + "loss": 0.84082305, + "num_input_tokens_seen": 41256680, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1907, + "time_per_iteration": 2.499772071838379 + }, + { + "auxiliary_loss_clip": 0.01173651, + "auxiliary_loss_mlp": 0.0106593, + "balance_loss_clip": 1.045784, + "balance_loss_mlp": 1.05599511, + "epoch": 0.1147151660904855, + "flos": 23398387332480.0, + "grad_norm": 4.041607412488977, + "language_loss": 0.84798187, + "learning_rate": 3.92518904404875e-06, + "loss": 0.87037772, + "num_input_tokens_seen": 41270955, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1908, + "time_per_iteration": 2.468519687652588 + }, + { + "auxiliary_loss_clip": 0.0105669, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.0018847, + "balance_loss_mlp": 1.01344705, + "epoch": 0.11477528934315347, + "flos": 63011843498880.0, + "grad_norm": 0.9477470057539497, + "language_loss": 0.6100027, + "learning_rate": 3.925083484426153e-06, + "loss": 0.63061339, + "num_input_tokens_seen": 41319180, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.43164062, + "step": 1909, + "time_per_iteration": 2.8313472270965576 + }, + { + "auxiliary_loss_clip": 0.01174173, + "auxiliary_loss_mlp": 0.01052438, + "balance_loss_clip": 1.03223228, + "balance_loss_mlp": 1.05660319, + "epoch": 0.11483541259582143, + "flos": 16325601960960.0, + "grad_norm": 2.135894642259737, + "language_loss": 0.78793955, + "learning_rate": 3.924977851804197e-06, + "loss": 0.8102057, + "num_input_tokens_seen": 41337480, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.171875, + "step": 1910, + "time_per_iteration": 2.4613592624664307 + }, + { + "auxiliary_loss_clip": 0.01178149, + "auxiliary_loss_mlp": 0.01051798, + "balance_loss_clip": 1.03005373, + "balance_loss_mlp": 1.05803406, + "epoch": 0.1148955358484894, + "flos": 21580410228480.0, + "grad_norm": 3.035949872237615, + "language_loss": 0.76787984, + "learning_rate": 3.9248721461868875e-06, + "loss": 0.79017925, + "num_input_tokens_seen": 41354650, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.203125, + "step": 1911, + "time_per_iteration": 2.475069761276245 + }, + { + "auxiliary_loss_clip": 0.01166349, + "auxiliary_loss_mlp": 0.01051416, + "balance_loss_clip": 1.03048277, + "balance_loss_mlp": 1.05284548, + "epoch": 0.11495565910115738, + "flos": 27673696650240.0, + "grad_norm": 2.1144124150337023, + "language_loss": 0.7927531, + "learning_rate": 3.9247663675782336e-06, + "loss": 0.81493074, + "num_input_tokens_seen": 41376935, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 1912, + "time_per_iteration": 2.543607473373413 + }, + { + "auxiliary_loss_clip": 0.01169469, + "auxiliary_loss_mlp": 0.0105862, + "balance_loss_clip": 1.0369364, + "balance_loss_mlp": 1.05352569, + "epoch": 0.11501578235382534, + "flos": 20632368614400.0, + "grad_norm": 1.9322037304643997, + "language_loss": 0.7777245, + "learning_rate": 3.924660515982246e-06, + "loss": 0.80000544, + "num_input_tokens_seen": 41396105, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 1913, + "time_per_iteration": 2.5093326568603516 + }, + { + "auxiliary_loss_clip": 0.01169525, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.02889776, + "balance_loss_mlp": 1.05118954, + "epoch": 0.1150759056064933, + "flos": 19829046896640.0, + "grad_norm": 3.783180746712747, + "language_loss": 0.70389271, + "learning_rate": 3.924554591402939e-06, + "loss": 0.72609949, + "num_input_tokens_seen": 41415600, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1914, + "time_per_iteration": 2.5099785327911377 + }, + { + "auxiliary_loss_clip": 0.01058565, + "auxiliary_loss_mlp": 0.01002053, + "balance_loss_clip": 0.99943084, + "balance_loss_mlp": 1.01452589, + "epoch": 0.11513602885916129, + "flos": 70045776311040.0, + "grad_norm": 0.7556045547130329, + "language_loss": 0.61044526, + "learning_rate": 3.92444859384433e-06, + "loss": 0.63105142, + "num_input_tokens_seen": 41478760, + "router_z_loss_clip": 0.02624512, + "router_z_loss_mlp": 0.44140625, + "step": 1915, + "time_per_iteration": 3.1735148429870605 + }, + { + "auxiliary_loss_clip": 0.01172283, + "auxiliary_loss_mlp": 0.01054758, + "balance_loss_clip": 1.03273964, + "balance_loss_mlp": 1.05674434, + "epoch": 0.11519615211182925, + "flos": 15741730385280.0, + "grad_norm": 2.822924091618307, + "language_loss": 0.9323889, + "learning_rate": 3.924342523310436e-06, + "loss": 0.95465934, + "num_input_tokens_seen": 41495720, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.15625, + "step": 1916, + "time_per_iteration": 2.4806342124938965 + }, + { + "auxiliary_loss_clip": 0.01171086, + "auxiliary_loss_mlp": 0.01061893, + "balance_loss_clip": 1.03845596, + "balance_loss_mlp": 1.05340374, + "epoch": 0.11525627536449722, + "flos": 20667632791680.0, + "grad_norm": 1.8768677942494545, + "language_loss": 0.72286755, + "learning_rate": 3.9242363798052806e-06, + "loss": 0.7451973, + "num_input_tokens_seen": 41513585, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.171875, + "step": 1917, + "time_per_iteration": 2.519758701324463 + }, + { + "auxiliary_loss_clip": 0.01171782, + "auxiliary_loss_mlp": 0.0104867, + "balance_loss_clip": 1.02664053, + "balance_loss_mlp": 1.05521619, + "epoch": 0.1153163986171652, + "flos": 20303283185280.0, + "grad_norm": 2.2984335892825594, + "language_loss": 0.74389827, + "learning_rate": 3.92413016333289e-06, + "loss": 0.76610279, + "num_input_tokens_seen": 41533390, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 1918, + "time_per_iteration": 2.4867136478424072 + }, + { + "auxiliary_loss_clip": 0.01173604, + "auxiliary_loss_mlp": 0.01045744, + "balance_loss_clip": 1.02394044, + "balance_loss_mlp": 1.05273843, + "epoch": 0.11537652186983316, + "flos": 17639321984640.0, + "grad_norm": 2.1981507651696193, + "language_loss": 0.86515707, + "learning_rate": 3.92402387389729e-06, + "loss": 0.88735056, + "num_input_tokens_seen": 41551015, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.2109375, + "step": 1919, + "time_per_iteration": 2.4838428497314453 + }, + { + "auxiliary_loss_clip": 0.01168988, + "auxiliary_loss_mlp": 0.01054432, + "balance_loss_clip": 1.03190136, + "balance_loss_mlp": 1.05291939, + "epoch": 0.11543664512250112, + "flos": 21069401391360.0, + "grad_norm": 2.516832715272094, + "language_loss": 0.86640596, + "learning_rate": 3.923917511502512e-06, + "loss": 0.88864017, + "num_input_tokens_seen": 41568055, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.15625, + "step": 1920, + "time_per_iteration": 2.524017333984375 + }, + { + "auxiliary_loss_clip": 0.01167627, + "auxiliary_loss_mlp": 0.01047596, + "balance_loss_clip": 1.02549434, + "balance_loss_mlp": 1.05360281, + "epoch": 0.11549676837516909, + "flos": 22747542848640.0, + "grad_norm": 2.2143351457696525, + "language_loss": 0.79792106, + "learning_rate": 3.923811076152589e-06, + "loss": 0.82007331, + "num_input_tokens_seen": 41587435, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 1921, + "time_per_iteration": 2.494673252105713 + }, + { + "auxiliary_loss_clip": 0.01174597, + "auxiliary_loss_mlp": 0.01056005, + "balance_loss_clip": 1.03331947, + "balance_loss_mlp": 1.05358851, + "epoch": 0.11555689162783707, + "flos": 19168972617600.0, + "grad_norm": 8.96706495073623, + "language_loss": 0.78418177, + "learning_rate": 3.923704567851557e-06, + "loss": 0.8064878, + "num_input_tokens_seen": 41604975, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.2109375, + "step": 1922, + "time_per_iteration": 2.5293705463409424 + }, + { + "auxiliary_loss_clip": 0.01174074, + "auxiliary_loss_mlp": 0.01060645, + "balance_loss_clip": 1.03910375, + "balance_loss_mlp": 1.05410469, + "epoch": 0.11561701488050503, + "flos": 24572056227840.0, + "grad_norm": 1.8482726295091094, + "language_loss": 0.84187758, + "learning_rate": 3.923597986603456e-06, + "loss": 0.86422473, + "num_input_tokens_seen": 41626155, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.203125, + "step": 1923, + "time_per_iteration": 2.5203118324279785 + }, + { + "auxiliary_loss_clip": 0.01175585, + "auxiliary_loss_mlp": 0.01053498, + "balance_loss_clip": 1.03074098, + "balance_loss_mlp": 1.05742192, + "epoch": 0.115677138133173, + "flos": 17092546179840.0, + "grad_norm": 2.0576366068601666, + "language_loss": 0.80471247, + "learning_rate": 3.9234913324123264e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 41644805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1796875, + "step": 1924, + "time_per_iteration": 2.48531436920166 + }, + { + "auxiliary_loss_clip": 0.01058783, + "auxiliary_loss_mlp": 0.01001491, + "balance_loss_clip": 0.99917841, + "balance_loss_mlp": 1.0154866, + "epoch": 0.11573726138584098, + "flos": 62703875266560.0, + "grad_norm": 0.810907468185892, + "language_loss": 0.6115036, + "learning_rate": 3.923384605282212e-06, + "loss": 0.6321063, + "num_input_tokens_seen": 41709345, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.43359375, + "step": 1925, + "time_per_iteration": 3.112396478652954 + }, + { + "auxiliary_loss_clip": 0.01173159, + "auxiliary_loss_mlp": 0.01076027, + "balance_loss_clip": 1.05304384, + "balance_loss_mlp": 1.05447614, + "epoch": 0.11579738463850894, + "flos": 22601135013120.0, + "grad_norm": 2.806943429185086, + "language_loss": 0.7482335, + "learning_rate": 3.923277805217161e-06, + "loss": 0.77072537, + "num_input_tokens_seen": 41730210, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.1875, + "step": 1926, + "time_per_iteration": 2.4890315532684326 + }, + { + "auxiliary_loss_clip": 0.01174997, + "auxiliary_loss_mlp": 0.0106307, + "balance_loss_clip": 1.03873897, + "balance_loss_mlp": 1.0552361, + "epoch": 0.11585750789117691, + "flos": 21726135705600.0, + "grad_norm": 2.429758451090488, + "language_loss": 0.73112315, + "learning_rate": 3.923170932221222e-06, + "loss": 0.7535038, + "num_input_tokens_seen": 41750270, + "router_z_loss_clip": 0.24316406, + "router_z_loss_mlp": 1.203125, + "step": 1927, + "time_per_iteration": 2.4673402309417725 + }, + { + "auxiliary_loss_clip": 0.0117016, + "auxiliary_loss_mlp": 0.01054777, + "balance_loss_clip": 1.03244913, + "balance_loss_mlp": 1.05291271, + "epoch": 0.11591763114384489, + "flos": 26287544851200.0, + "grad_norm": 2.854021270140142, + "language_loss": 0.86824137, + "learning_rate": 3.92306398629845e-06, + "loss": 0.89049077, + "num_input_tokens_seen": 41772975, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 1928, + "time_per_iteration": 2.530325412750244 + }, + { + "auxiliary_loss_clip": 0.01173569, + "auxiliary_loss_mlp": 0.01055147, + "balance_loss_clip": 1.03289056, + "balance_loss_mlp": 1.05469573, + "epoch": 0.11597775439651285, + "flos": 23000461488000.0, + "grad_norm": 1.71243688867153, + "language_loss": 0.77567977, + "learning_rate": 3.922956967452898e-06, + "loss": 0.79796684, + "num_input_tokens_seen": 41791765, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1875, + "step": 1929, + "time_per_iteration": 2.489664316177368 + }, + { + "auxiliary_loss_clip": 0.01168882, + "auxiliary_loss_mlp": 0.01062437, + "balance_loss_clip": 1.04238629, + "balance_loss_mlp": 1.05385804, + "epoch": 0.11603787764918082, + "flos": 31941715507200.0, + "grad_norm": 1.6293868207273203, + "language_loss": 0.76724243, + "learning_rate": 3.922849875688626e-06, + "loss": 0.78955561, + "num_input_tokens_seen": 41815615, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1484375, + "step": 1930, + "time_per_iteration": 2.5867533683776855 + }, + { + "auxiliary_loss_clip": 0.01169352, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03027928, + "balance_loss_mlp": 1.05313969, + "epoch": 0.1160980009018488, + "flos": 22271654534400.0, + "grad_norm": 1.9270697111110349, + "language_loss": 0.72114342, + "learning_rate": 3.922742711009693e-06, + "loss": 0.74335825, + "num_input_tokens_seen": 41834810, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 1931, + "time_per_iteration": 2.5218429565429688 + }, + { + "auxiliary_loss_clip": 0.01173627, + "auxiliary_loss_mlp": 0.0105412, + "balance_loss_clip": 1.03168511, + "balance_loss_mlp": 1.05528855, + "epoch": 0.11615812415451676, + "flos": 22783633038720.0, + "grad_norm": 1.5295866923660926, + "language_loss": 0.82133794, + "learning_rate": 3.922635473420164e-06, + "loss": 0.84361541, + "num_input_tokens_seen": 41854975, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 1932, + "time_per_iteration": 2.4879212379455566 + }, + { + "auxiliary_loss_clip": 0.01053319, + "auxiliary_loss_mlp": 0.01007659, + "balance_loss_clip": 1.00539386, + "balance_loss_mlp": 1.0111897, + "epoch": 0.11621824740718473, + "flos": 67146096107520.0, + "grad_norm": 0.7701959329661775, + "language_loss": 0.61053753, + "learning_rate": 3.922528162924105e-06, + "loss": 0.63114727, + "num_input_tokens_seen": 41911105, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.421875, + "step": 1933, + "time_per_iteration": 2.960437059402466 + }, + { + "auxiliary_loss_clip": 0.01172297, + "auxiliary_loss_mlp": 0.01054299, + "balance_loss_clip": 1.03248382, + "balance_loss_mlp": 1.05259895, + "epoch": 0.11627837065985269, + "flos": 20375930442240.0, + "grad_norm": 2.2263920275904425, + "language_loss": 0.85587192, + "learning_rate": 3.922420779525586e-06, + "loss": 0.87813795, + "num_input_tokens_seen": 41931750, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1953125, + "step": 1934, + "time_per_iteration": 5.3810875415802 + }, + { + "auxiliary_loss_clip": 0.01178805, + "auxiliary_loss_mlp": 0.01059072, + "balance_loss_clip": 1.03667259, + "balance_loss_mlp": 1.05852652, + "epoch": 0.11633849391252067, + "flos": 21725812483200.0, + "grad_norm": 2.481370623449466, + "language_loss": 0.65555394, + "learning_rate": 3.9223133232286776e-06, + "loss": 0.67793274, + "num_input_tokens_seen": 41949400, + "router_z_loss_clip": 0.22363281, + "router_z_loss_mlp": 1.203125, + "step": 1935, + "time_per_iteration": 2.483814239501953 + }, + { + "auxiliary_loss_clip": 0.01176161, + "auxiliary_loss_mlp": 0.01053675, + "balance_loss_clip": 1.03352857, + "balance_loss_mlp": 1.05533004, + "epoch": 0.11639861716518864, + "flos": 18805341283200.0, + "grad_norm": 1.8046174937009931, + "language_loss": 0.75469184, + "learning_rate": 3.922205794037456e-06, + "loss": 0.77699012, + "num_input_tokens_seen": 41968100, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.2109375, + "step": 1936, + "time_per_iteration": 3.8786003589630127 + }, + { + "auxiliary_loss_clip": 0.01173369, + "auxiliary_loss_mlp": 0.0105617, + "balance_loss_clip": 1.0325551, + "balance_loss_mlp": 1.05320179, + "epoch": 0.1164587404178566, + "flos": 21214983214080.0, + "grad_norm": 1.9600676544166102, + "language_loss": 0.84061754, + "learning_rate": 3.922098191955998e-06, + "loss": 0.86291301, + "num_input_tokens_seen": 41986375, + "router_z_loss_clip": 0.23632812, + "router_z_loss_mlp": 1.1953125, + "step": 1937, + "time_per_iteration": 2.5084798336029053 + }, + { + "auxiliary_loss_clip": 0.01166803, + "auxiliary_loss_mlp": 0.01045843, + "balance_loss_clip": 1.02533889, + "balance_loss_mlp": 1.05254185, + "epoch": 0.11651886367052458, + "flos": 27818632028160.0, + "grad_norm": 2.0067941571917927, + "language_loss": 0.76479459, + "learning_rate": 3.921990516988384e-06, + "loss": 0.78692102, + "num_input_tokens_seen": 42006055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.140625, + "step": 1938, + "time_per_iteration": 2.5770225524902344 + }, + { + "auxiliary_loss_clip": 0.01177239, + "auxiliary_loss_mlp": 0.01051282, + "balance_loss_clip": 1.02963328, + "balance_loss_mlp": 1.05566061, + "epoch": 0.11657898692319255, + "flos": 22889569224960.0, + "grad_norm": 2.0274312317590084, + "language_loss": 0.79127967, + "learning_rate": 3.921882769138696e-06, + "loss": 0.8135649, + "num_input_tokens_seen": 42024995, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.21875, + "step": 1939, + "time_per_iteration": 2.5020864009857178 + }, + { + "auxiliary_loss_clip": 0.01173869, + "auxiliary_loss_mlp": 0.0105151, + "balance_loss_clip": 1.02886081, + "balance_loss_mlp": 1.05530274, + "epoch": 0.11663911017586051, + "flos": 24315905364480.0, + "grad_norm": 3.7077039427391343, + "language_loss": 0.86712289, + "learning_rate": 3.9217749484110215e-06, + "loss": 0.88937664, + "num_input_tokens_seen": 42042640, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1875, + "step": 1940, + "time_per_iteration": 2.484750270843506 + }, + { + "auxiliary_loss_clip": 0.01172427, + "auxiliary_loss_mlp": 0.0105781, + "balance_loss_clip": 1.03699601, + "balance_loss_mlp": 1.05674481, + "epoch": 0.11669923342852849, + "flos": 42340152470400.0, + "grad_norm": 1.4506595925957548, + "language_loss": 0.75750297, + "learning_rate": 3.921667054809449e-06, + "loss": 0.7798053, + "num_input_tokens_seen": 42067005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.15625, + "step": 1941, + "time_per_iteration": 2.7000842094421387 + }, + { + "auxiliary_loss_clip": 0.01167731, + "auxiliary_loss_mlp": 0.01059034, + "balance_loss_clip": 1.0375998, + "balance_loss_mlp": 1.05215478, + "epoch": 0.11675935668119646, + "flos": 14642288945280.0, + "grad_norm": 2.1675787105273256, + "language_loss": 0.8828994, + "learning_rate": 3.921559088338068e-06, + "loss": 0.90516704, + "num_input_tokens_seen": 42082295, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.15625, + "step": 1942, + "time_per_iteration": 2.460014581680298 + }, + { + "auxiliary_loss_clip": 0.01170106, + "auxiliary_loss_mlp": 0.01048326, + "balance_loss_clip": 1.02839422, + "balance_loss_mlp": 1.05465341, + "epoch": 0.11681947993386442, + "flos": 35116470063360.0, + "grad_norm": 1.688985931696262, + "language_loss": 0.67729998, + "learning_rate": 3.921451049000975e-06, + "loss": 0.69948429, + "num_input_tokens_seen": 42105295, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.15625, + "step": 1943, + "time_per_iteration": 2.5899837017059326 + }, + { + "auxiliary_loss_clip": 0.01170349, + "auxiliary_loss_mlp": 0.01046897, + "balance_loss_clip": 1.02586865, + "balance_loss_mlp": 1.05437136, + "epoch": 0.11687960318653239, + "flos": 38983259024640.0, + "grad_norm": 2.2767867948110263, + "language_loss": 0.69852126, + "learning_rate": 3.921342936802265e-06, + "loss": 0.72069371, + "num_input_tokens_seen": 42125520, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1944, + "time_per_iteration": 2.6237125396728516 + }, + { + "auxiliary_loss_clip": 0.01166997, + "auxiliary_loss_mlp": 0.01045496, + "balance_loss_clip": 1.02513456, + "balance_loss_mlp": 1.05112338, + "epoch": 0.11693972643920036, + "flos": 25994980575360.0, + "grad_norm": 2.1059371232711572, + "language_loss": 0.82477605, + "learning_rate": 3.921234751746038e-06, + "loss": 0.84690094, + "num_input_tokens_seen": 42146335, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.15625, + "step": 1945, + "time_per_iteration": 2.519148349761963 + }, + { + "auxiliary_loss_clip": 0.01169071, + "auxiliary_loss_mlp": 0.01058941, + "balance_loss_clip": 1.03805542, + "balance_loss_mlp": 1.05241919, + "epoch": 0.11699984969186833, + "flos": 27272107618560.0, + "grad_norm": 2.378189536328268, + "language_loss": 0.7640717, + "learning_rate": 3.9211264938363975e-06, + "loss": 0.7863518, + "num_input_tokens_seen": 42165320, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 1946, + "time_per_iteration": 2.516782283782959 + }, + { + "auxiliary_loss_clip": 0.01169578, + "auxiliary_loss_mlp": 0.0105231, + "balance_loss_clip": 1.03249717, + "balance_loss_mlp": 1.05597568, + "epoch": 0.1170599729445363, + "flos": 15267853232640.0, + "grad_norm": 2.040115867247402, + "language_loss": 0.68749321, + "learning_rate": 3.921018163077448e-06, + "loss": 0.70971209, + "num_input_tokens_seen": 42182955, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 1947, + "time_per_iteration": 2.443979501724243 + }, + { + "auxiliary_loss_clip": 0.01173266, + "auxiliary_loss_mlp": 0.01062988, + "balance_loss_clip": 1.041924, + "balance_loss_mlp": 1.05761504, + "epoch": 0.11712009619720427, + "flos": 17164439251200.0, + "grad_norm": 1.892409556337103, + "language_loss": 0.84730887, + "learning_rate": 3.920909759473295e-06, + "loss": 0.86967146, + "num_input_tokens_seen": 42200760, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 1948, + "time_per_iteration": 2.456883192062378 + }, + { + "auxiliary_loss_clip": 0.01060706, + "auxiliary_loss_mlp": 0.01000375, + "balance_loss_clip": 0.99784815, + "balance_loss_mlp": 1.01743388, + "epoch": 0.11718021944987224, + "flos": 70940991997440.0, + "grad_norm": 0.8146373030628324, + "language_loss": 0.65102834, + "learning_rate": 3.920801283028054e-06, + "loss": 0.6716392, + "num_input_tokens_seen": 42265745, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.43359375, + "step": 1949, + "time_per_iteration": 3.083716630935669 + }, + { + "auxiliary_loss_clip": 0.01168495, + "auxiliary_loss_mlp": 0.01056708, + "balance_loss_clip": 1.03614426, + "balance_loss_mlp": 1.05524707, + "epoch": 0.1172403427025402, + "flos": 27453456408960.0, + "grad_norm": 1.7265339558443402, + "language_loss": 0.71616268, + "learning_rate": 3.920692733745835e-06, + "loss": 0.73841476, + "num_input_tokens_seen": 42286245, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1328125, + "step": 1950, + "time_per_iteration": 2.5140750408172607 + }, + { + "auxiliary_loss_clip": 0.01174036, + "auxiliary_loss_mlp": 0.0105899, + "balance_loss_clip": 1.03823543, + "balance_loss_mlp": 1.05524027, + "epoch": 0.11730046595520818, + "flos": 15668723992320.0, + "grad_norm": 13.047142281747327, + "language_loss": 0.76811576, + "learning_rate": 3.920584111630755e-06, + "loss": 0.79044604, + "num_input_tokens_seen": 42302710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1875, + "step": 1951, + "time_per_iteration": 2.4511098861694336 + }, + { + "auxiliary_loss_clip": 0.01172385, + "auxiliary_loss_mlp": 0.0106409, + "balance_loss_clip": 1.04351449, + "balance_loss_mlp": 1.05736876, + "epoch": 0.11736058920787615, + "flos": 25630164092160.0, + "grad_norm": 2.4689531190361858, + "language_loss": 0.75770319, + "learning_rate": 3.9204754166869325e-06, + "loss": 0.78006792, + "num_input_tokens_seen": 42324115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1952, + "time_per_iteration": 2.5249404907226562 + }, + { + "auxiliary_loss_clip": 0.01170041, + "auxiliary_loss_mlp": 0.01060486, + "balance_loss_clip": 1.04038692, + "balance_loss_mlp": 1.05350161, + "epoch": 0.11742071246054411, + "flos": 21434289701760.0, + "grad_norm": 1.8929141854364566, + "language_loss": 0.71838403, + "learning_rate": 3.920366648918491e-06, + "loss": 0.74068928, + "num_input_tokens_seen": 42342505, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.171875, + "step": 1953, + "time_per_iteration": 2.5321006774902344 + }, + { + "auxiliary_loss_clip": 0.01178671, + "auxiliary_loss_mlp": 0.01054108, + "balance_loss_clip": 1.03186345, + "balance_loss_mlp": 1.05794597, + "epoch": 0.11748083571321208, + "flos": 15997845335040.0, + "grad_norm": 2.5505654209141317, + "language_loss": 0.7939415, + "learning_rate": 3.920257808329552e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 42360525, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 1954, + "time_per_iteration": 2.477182149887085 + }, + { + "auxiliary_loss_clip": 0.01174109, + "auxiliary_loss_mlp": 0.01060284, + "balance_loss_clip": 1.03859961, + "balance_loss_mlp": 1.05628419, + "epoch": 0.11754095896588006, + "flos": 16180056051840.0, + "grad_norm": 2.1305529461824344, + "language_loss": 0.85609406, + "learning_rate": 3.920148894924246e-06, + "loss": 0.878438, + "num_input_tokens_seen": 42377045, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1796875, + "step": 1955, + "time_per_iteration": 2.4685070514678955 + }, + { + "auxiliary_loss_clip": 0.01172636, + "auxiliary_loss_mlp": 0.01049417, + "balance_loss_clip": 1.02949762, + "balance_loss_mlp": 1.05551839, + "epoch": 0.11760108221854802, + "flos": 13261596013440.0, + "grad_norm": 3.149612339355701, + "language_loss": 0.77626467, + "learning_rate": 3.920039908706701e-06, + "loss": 0.79848516, + "num_input_tokens_seen": 42393960, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.171875, + "step": 1956, + "time_per_iteration": 2.469529151916504 + }, + { + "auxiliary_loss_clip": 0.01169266, + "auxiliary_loss_mlp": 0.01054147, + "balance_loss_clip": 1.03357112, + "balance_loss_mlp": 1.05667603, + "epoch": 0.11766120547121599, + "flos": 24498439303680.0, + "grad_norm": 4.253665449575931, + "language_loss": 0.80333984, + "learning_rate": 3.91993084968105e-06, + "loss": 0.82557392, + "num_input_tokens_seen": 42413160, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 1957, + "time_per_iteration": 2.508272886276245 + }, + { + "auxiliary_loss_clip": 0.01176684, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.03003287, + "balance_loss_mlp": 1.05895627, + "epoch": 0.11772132872388397, + "flos": 17784005967360.0, + "grad_norm": 3.1587185145349737, + "language_loss": 0.77638769, + "learning_rate": 3.919821717851428e-06, + "loss": 0.79865301, + "num_input_tokens_seen": 42432590, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1796875, + "step": 1958, + "time_per_iteration": 2.48563551902771 + }, + { + "auxiliary_loss_clip": 0.01174636, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.02640605, + "balance_loss_mlp": 1.05859971, + "epoch": 0.11778145197655193, + "flos": 13217030213760.0, + "grad_norm": 2.0966272081131985, + "language_loss": 0.76906043, + "learning_rate": 3.919712513221976e-06, + "loss": 0.79128981, + "num_input_tokens_seen": 42450135, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.15625, + "step": 1959, + "time_per_iteration": 2.4826674461364746 + }, + { + "auxiliary_loss_clip": 0.01171719, + "auxiliary_loss_mlp": 0.01050959, + "balance_loss_clip": 1.03128934, + "balance_loss_mlp": 1.05581582, + "epoch": 0.1178415752292199, + "flos": 20230204965120.0, + "grad_norm": 3.13785825532277, + "language_loss": 0.69989765, + "learning_rate": 3.919603235796832e-06, + "loss": 0.72212446, + "num_input_tokens_seen": 42470050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.15625, + "step": 1960, + "time_per_iteration": 2.4965405464172363 + }, + { + "auxiliary_loss_clip": 0.01178622, + "auxiliary_loss_mlp": 0.01058274, + "balance_loss_clip": 1.03704309, + "balance_loss_mlp": 1.05921102, + "epoch": 0.11790169848188788, + "flos": 13040134709760.0, + "grad_norm": 2.5802576751796327, + "language_loss": 0.81135678, + "learning_rate": 3.9194938855801406e-06, + "loss": 0.83372575, + "num_input_tokens_seen": 42484335, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1953125, + "step": 1961, + "time_per_iteration": 2.456537961959839 + }, + { + "auxiliary_loss_clip": 0.01167569, + "auxiliary_loss_mlp": 0.01055573, + "balance_loss_clip": 1.03640413, + "balance_loss_mlp": 1.05682623, + "epoch": 0.11796182173455584, + "flos": 22265728790400.0, + "grad_norm": 3.5009623449342206, + "language_loss": 0.92335653, + "learning_rate": 3.919384462576049e-06, + "loss": 0.94558799, + "num_input_tokens_seen": 42502720, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.109375, + "step": 1962, + "time_per_iteration": 2.4831955432891846 + }, + { + "auxiliary_loss_clip": 0.01175087, + "auxiliary_loss_mlp": 0.01054037, + "balance_loss_clip": 1.03379536, + "balance_loss_mlp": 1.05849361, + "epoch": 0.1180219449872238, + "flos": 10635017892480.0, + "grad_norm": 2.1891263418172353, + "language_loss": 0.87132198, + "learning_rate": 3.919274966788707e-06, + "loss": 0.89361322, + "num_input_tokens_seen": 42519460, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1640625, + "step": 1963, + "time_per_iteration": 2.453864097595215 + }, + { + "auxiliary_loss_clip": 0.01174267, + "auxiliary_loss_mlp": 0.01047313, + "balance_loss_clip": 1.02764392, + "balance_loss_mlp": 1.05800569, + "epoch": 0.11808206823989177, + "flos": 20923532259840.0, + "grad_norm": 2.1122466665000155, + "language_loss": 0.84163988, + "learning_rate": 3.919165398222265e-06, + "loss": 0.86385566, + "num_input_tokens_seen": 42539420, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1640625, + "step": 1964, + "time_per_iteration": 2.496471405029297 + }, + { + "auxiliary_loss_clip": 0.01178376, + "auxiliary_loss_mlp": 0.01056634, + "balance_loss_clip": 1.03699994, + "balance_loss_mlp": 1.06327403, + "epoch": 0.11814219149255975, + "flos": 20777770869120.0, + "grad_norm": 1.965243610427017, + "language_loss": 0.82994169, + "learning_rate": 3.919055756880879e-06, + "loss": 0.85229176, + "num_input_tokens_seen": 42558225, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1484375, + "step": 1965, + "time_per_iteration": 2.46545672416687 + }, + { + "auxiliary_loss_clip": 0.01175057, + "auxiliary_loss_mlp": 0.01049044, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05948591, + "epoch": 0.11820231474522772, + "flos": 48759938542080.0, + "grad_norm": 1.6968751772896917, + "language_loss": 0.74517393, + "learning_rate": 3.918946042768707e-06, + "loss": 0.76741493, + "num_input_tokens_seen": 42580790, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 1966, + "time_per_iteration": 2.730928421020508 + }, + { + "auxiliary_loss_clip": 0.01185811, + "auxiliary_loss_mlp": 0.01055482, + "balance_loss_clip": 1.03552604, + "balance_loss_mlp": 1.0661025, + "epoch": 0.11826243799789568, + "flos": 16690598012160.0, + "grad_norm": 3.573953561090722, + "language_loss": 0.725128, + "learning_rate": 3.918836255889908e-06, + "loss": 0.74754095, + "num_input_tokens_seen": 42597355, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.1953125, + "step": 1967, + "time_per_iteration": 2.459409713745117 + }, + { + "auxiliary_loss_clip": 0.01174728, + "auxiliary_loss_mlp": 0.01044222, + "balance_loss_clip": 1.02409899, + "balance_loss_mlp": 1.0596199, + "epoch": 0.11832256125056366, + "flos": 16909868586240.0, + "grad_norm": 2.07735233424318, + "language_loss": 0.87874025, + "learning_rate": 3.9187263962486456e-06, + "loss": 0.90092969, + "num_input_tokens_seen": 42616060, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1968, + "time_per_iteration": 2.474860191345215 + }, + { + "auxiliary_loss_clip": 0.0117476, + "auxiliary_loss_mlp": 0.01051094, + "balance_loss_clip": 1.03083992, + "balance_loss_mlp": 1.05980873, + "epoch": 0.11838268450323162, + "flos": 22820405587200.0, + "grad_norm": 2.3710109771053904, + "language_loss": 0.66827953, + "learning_rate": 3.918616463849087e-06, + "loss": 0.69053805, + "num_input_tokens_seen": 42636285, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1484375, + "step": 1969, + "time_per_iteration": 2.5025057792663574 + }, + { + "auxiliary_loss_clip": 0.01177024, + "auxiliary_loss_mlp": 0.01052173, + "balance_loss_clip": 1.03172874, + "balance_loss_mlp": 1.06375933, + "epoch": 0.11844280775589959, + "flos": 33545844990720.0, + "grad_norm": 2.0668162562591013, + "language_loss": 0.81199527, + "learning_rate": 3.918506458695399e-06, + "loss": 0.83428723, + "num_input_tokens_seen": 42658320, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 1970, + "time_per_iteration": 2.6005184650421143 + }, + { + "auxiliary_loss_clip": 0.01071753, + "auxiliary_loss_mlp": 0.0102596, + "balance_loss_clip": 1.02306354, + "balance_loss_mlp": 1.02803779, + "epoch": 0.11850293100856757, + "flos": 66350998604160.0, + "grad_norm": 0.8059191438251484, + "language_loss": 0.66145539, + "learning_rate": 3.918396380791754e-06, + "loss": 0.68243253, + "num_input_tokens_seen": 42721500, + "router_z_loss_clip": 0.02893066, + "router_z_loss_mlp": 0.4375, + "step": 1971, + "time_per_iteration": 3.0580737590789795 + }, + { + "auxiliary_loss_clip": 0.01173379, + "auxiliary_loss_mlp": 0.0105069, + "balance_loss_clip": 1.03112769, + "balance_loss_mlp": 1.0578413, + "epoch": 0.11856305426123553, + "flos": 24681045070080.0, + "grad_norm": 1.9720310647047086, + "language_loss": 0.79760695, + "learning_rate": 3.918286230142327e-06, + "loss": 0.81984764, + "num_input_tokens_seen": 42739825, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 1972, + "time_per_iteration": 2.5330677032470703 + }, + { + "auxiliary_loss_clip": 0.01174806, + "auxiliary_loss_mlp": 0.01052417, + "balance_loss_clip": 1.03144813, + "balance_loss_mlp": 1.06013465, + "epoch": 0.1186231775139035, + "flos": 24280102483200.0, + "grad_norm": 2.451560144092476, + "language_loss": 0.72162819, + "learning_rate": 3.918176006751292e-06, + "loss": 0.74390036, + "num_input_tokens_seen": 42758695, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1973, + "time_per_iteration": 2.497079372406006 + }, + { + "auxiliary_loss_clip": 0.0117035, + "auxiliary_loss_mlp": 0.01043803, + "balance_loss_clip": 1.02407408, + "balance_loss_mlp": 1.05802357, + "epoch": 0.11868330076657148, + "flos": 21757413473280.0, + "grad_norm": 2.2680636805256897, + "language_loss": 0.71724641, + "learning_rate": 3.918065710622832e-06, + "loss": 0.73938787, + "num_input_tokens_seen": 42778510, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 1974, + "time_per_iteration": 2.5145771503448486 + }, + { + "auxiliary_loss_clip": 0.01170733, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.01937568, + "balance_loss_mlp": 1.05660915, + "epoch": 0.11874342401923944, + "flos": 17193274894080.0, + "grad_norm": 2.192039880981389, + "language_loss": 0.77186036, + "learning_rate": 3.917955341761128e-06, + "loss": 0.7939533, + "num_input_tokens_seen": 42793995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.140625, + "step": 1975, + "time_per_iteration": 2.4483766555786133 + }, + { + "auxiliary_loss_clip": 0.01172653, + "auxiliary_loss_mlp": 0.01051494, + "balance_loss_clip": 1.03212273, + "balance_loss_mlp": 1.06021976, + "epoch": 0.11880354727190741, + "flos": 15229572312960.0, + "grad_norm": 2.2667330410251596, + "language_loss": 0.7498399, + "learning_rate": 3.917844900170364e-06, + "loss": 0.77208138, + "num_input_tokens_seen": 42809000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.125, + "step": 1976, + "time_per_iteration": 3.9421374797821045 + }, + { + "auxiliary_loss_clip": 0.01172444, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02544367, + "balance_loss_mlp": 1.05979395, + "epoch": 0.11886367052457537, + "flos": 27309706179840.0, + "grad_norm": 1.6192257034176818, + "language_loss": 0.75191766, + "learning_rate": 3.91773438585473e-06, + "loss": 0.77408761, + "num_input_tokens_seen": 42831585, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.125, + "step": 1977, + "time_per_iteration": 3.9506070613861084 + }, + { + "auxiliary_loss_clip": 0.01172952, + "auxiliary_loss_mlp": 0.01053238, + "balance_loss_clip": 1.0338068, + "balance_loss_mlp": 1.05777454, + "epoch": 0.11892379377724335, + "flos": 21798280172160.0, + "grad_norm": 7.387040580957373, + "language_loss": 0.7393533, + "learning_rate": 3.9176237988184165e-06, + "loss": 0.76161528, + "num_input_tokens_seen": 42848420, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.15625, + "step": 1978, + "time_per_iteration": 2.4754912853240967 + }, + { + "auxiliary_loss_clip": 0.01168854, + "auxiliary_loss_mlp": 0.01048253, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.05782461, + "epoch": 0.11898391702991132, + "flos": 13991013498240.0, + "grad_norm": 1.709416576437117, + "language_loss": 0.73273945, + "learning_rate": 3.917513139065616e-06, + "loss": 0.75491059, + "num_input_tokens_seen": 42866645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 1979, + "time_per_iteration": 2.478938579559326 + }, + { + "auxiliary_loss_clip": 0.01172076, + "auxiliary_loss_mlp": 0.01048439, + "balance_loss_clip": 1.0286746, + "balance_loss_mlp": 1.05735934, + "epoch": 0.11904404028257928, + "flos": 32234567091840.0, + "grad_norm": 1.877436937799078, + "language_loss": 0.98387957, + "learning_rate": 3.917402406600525e-06, + "loss": 1.00608468, + "num_input_tokens_seen": 42888515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1484375, + "step": 1980, + "time_per_iteration": 2.5758843421936035 + }, + { + "auxiliary_loss_clip": 0.01173349, + "auxiliary_loss_mlp": 0.01046819, + "balance_loss_clip": 1.02580202, + "balance_loss_mlp": 1.05741775, + "epoch": 0.11910416353524726, + "flos": 23586272398080.0, + "grad_norm": 1.8930015682875676, + "language_loss": 0.85929906, + "learning_rate": 3.917291601427342e-06, + "loss": 0.88150084, + "num_input_tokens_seen": 42909035, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 1981, + "time_per_iteration": 2.505051612854004 + }, + { + "auxiliary_loss_clip": 0.01172656, + "auxiliary_loss_mlp": 0.01057237, + "balance_loss_clip": 1.03601766, + "balance_loss_mlp": 1.057832, + "epoch": 0.11916428678791523, + "flos": 25333038789120.0, + "grad_norm": 1.9242535829958574, + "language_loss": 0.85007018, + "learning_rate": 3.91718072355027e-06, + "loss": 0.87236911, + "num_input_tokens_seen": 42927555, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1982, + "time_per_iteration": 2.513012409210205 + }, + { + "auxiliary_loss_clip": 0.01166906, + "auxiliary_loss_mlp": 0.01046511, + "balance_loss_clip": 1.02667475, + "balance_loss_mlp": 1.05463564, + "epoch": 0.11922441004058319, + "flos": 19788431592960.0, + "grad_norm": 1.926275276354154, + "language_loss": 0.85026526, + "learning_rate": 3.917069772973513e-06, + "loss": 0.87239939, + "num_input_tokens_seen": 42945300, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 1983, + "time_per_iteration": 2.4627623558044434 + }, + { + "auxiliary_loss_clip": 0.01172266, + "auxiliary_loss_mlp": 0.01049845, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05581713, + "epoch": 0.11928453329325117, + "flos": 21536347219200.0, + "grad_norm": 3.2679367356540894, + "language_loss": 0.77020949, + "learning_rate": 3.916958749701277e-06, + "loss": 0.79243064, + "num_input_tokens_seen": 42961295, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 1984, + "time_per_iteration": 2.466224193572998 + }, + { + "auxiliary_loss_clip": 0.01168386, + "auxiliary_loss_mlp": 0.01055095, + "balance_loss_clip": 1.03542554, + "balance_loss_mlp": 1.05464029, + "epoch": 0.11934465654591914, + "flos": 20815010294400.0, + "grad_norm": 1.7272493982968635, + "language_loss": 0.83323789, + "learning_rate": 3.9168476537377745e-06, + "loss": 0.85547268, + "num_input_tokens_seen": 42980330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 1985, + "time_per_iteration": 2.485797882080078 + }, + { + "auxiliary_loss_clip": 0.01162278, + "auxiliary_loss_mlp": 0.01046498, + "balance_loss_clip": 1.02659011, + "balance_loss_mlp": 1.05230284, + "epoch": 0.1194047797985871, + "flos": 19060486565760.0, + "grad_norm": 1.9847962315308523, + "language_loss": 0.7379061, + "learning_rate": 3.916736485087216e-06, + "loss": 0.75999391, + "num_input_tokens_seen": 42996125, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1015625, + "step": 1986, + "time_per_iteration": 2.4477651119232178 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01055872, + "balance_loss_clip": 1.03664303, + "balance_loss_mlp": 1.05418456, + "epoch": 0.11946490305125507, + "flos": 27190805184000.0, + "grad_norm": 2.0940320364759573, + "language_loss": 0.7209813, + "learning_rate": 3.916625243753819e-06, + "loss": 0.74321216, + "num_input_tokens_seen": 43014180, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.125, + "step": 1987, + "time_per_iteration": 2.528564929962158 + }, + { + "auxiliary_loss_clip": 0.01166851, + "auxiliary_loss_mlp": 0.01053632, + "balance_loss_clip": 1.03256774, + "balance_loss_mlp": 1.05243921, + "epoch": 0.11952502630392305, + "flos": 21140791672320.0, + "grad_norm": 2.544292945564917, + "language_loss": 0.72455966, + "learning_rate": 3.916513929741799e-06, + "loss": 0.74676454, + "num_input_tokens_seen": 43032120, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 1988, + "time_per_iteration": 2.482295274734497 + }, + { + "auxiliary_loss_clip": 0.01168039, + "auxiliary_loss_mlp": 0.01063511, + "balance_loss_clip": 1.04195809, + "balance_loss_mlp": 1.05425191, + "epoch": 0.11958514955659101, + "flos": 22124241118080.0, + "grad_norm": 2.3919568417846544, + "language_loss": 0.80848205, + "learning_rate": 3.91640254305538e-06, + "loss": 0.83079755, + "num_input_tokens_seen": 43052215, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 1989, + "time_per_iteration": 2.5321335792541504 + }, + { + "auxiliary_loss_clip": 0.01171171, + "auxiliary_loss_mlp": 0.01051003, + "balance_loss_clip": 1.03040385, + "balance_loss_mlp": 1.05518925, + "epoch": 0.11964527280925898, + "flos": 17421452040960.0, + "grad_norm": 2.7848130249027077, + "language_loss": 0.76000333, + "learning_rate": 3.916291083698784e-06, + "loss": 0.78222507, + "num_input_tokens_seen": 43069720, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 1990, + "time_per_iteration": 2.4608383178710938 + }, + { + "auxiliary_loss_clip": 0.01060104, + "auxiliary_loss_mlp": 0.0101675, + "balance_loss_clip": 1.01392448, + "balance_loss_mlp": 1.01813149, + "epoch": 0.11970539606192696, + "flos": 70679741402880.0, + "grad_norm": 0.8877551125762418, + "language_loss": 0.55219597, + "learning_rate": 3.916179551676238e-06, + "loss": 0.57296449, + "num_input_tokens_seen": 43123130, + "router_z_loss_clip": 0.02819824, + "router_z_loss_mlp": 0.41992188, + "step": 1991, + "time_per_iteration": 3.0575883388519287 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01048947, + "balance_loss_clip": 1.02905095, + "balance_loss_mlp": 1.05472517, + "epoch": 0.11976551931459492, + "flos": 21215019127680.0, + "grad_norm": 2.2244739837006797, + "language_loss": 0.78156978, + "learning_rate": 3.916067946991971e-06, + "loss": 0.8037256, + "num_input_tokens_seen": 43140015, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.1171875, + "step": 1992, + "time_per_iteration": 2.5395517349243164 + }, + { + "auxiliary_loss_clip": 0.01170251, + "auxiliary_loss_mlp": 0.0104925, + "balance_loss_clip": 1.02819777, + "balance_loss_mlp": 1.0534482, + "epoch": 0.11982564256726289, + "flos": 25989306226560.0, + "grad_norm": 1.898510109378507, + "language_loss": 0.78694016, + "learning_rate": 3.915956269650216e-06, + "loss": 0.80913514, + "num_input_tokens_seen": 43160105, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 1993, + "time_per_iteration": 2.5264625549316406 + }, + { + "auxiliary_loss_clip": 0.01165494, + "auxiliary_loss_mlp": 0.01058458, + "balance_loss_clip": 1.03837109, + "balance_loss_mlp": 1.05150676, + "epoch": 0.11988576581993086, + "flos": 21650866755840.0, + "grad_norm": 1.7590613991113047, + "language_loss": 0.82287014, + "learning_rate": 3.915844519655208e-06, + "loss": 0.8451097, + "num_input_tokens_seen": 43179835, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 1994, + "time_per_iteration": 2.4871127605438232 + }, + { + "auxiliary_loss_clip": 0.01166639, + "auxiliary_loss_mlp": 0.01054967, + "balance_loss_clip": 1.03551149, + "balance_loss_mlp": 1.05389762, + "epoch": 0.11994588907259883, + "flos": 17857407409920.0, + "grad_norm": 2.1035856813409786, + "language_loss": 0.87953222, + "learning_rate": 3.915732697011183e-06, + "loss": 0.9017483, + "num_input_tokens_seen": 43197210, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.125, + "step": 1995, + "time_per_iteration": 2.46690034866333 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01057862, + "balance_loss_clip": 1.03692937, + "balance_loss_mlp": 1.05346155, + "epoch": 0.1200060123252668, + "flos": 24462744163200.0, + "grad_norm": 2.783456627489481, + "language_loss": 0.74206698, + "learning_rate": 3.9156208017223825e-06, + "loss": 0.76433849, + "num_input_tokens_seen": 43215050, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.15625, + "step": 1996, + "time_per_iteration": 2.5115768909454346 + }, + { + "auxiliary_loss_clip": 0.01167539, + "auxiliary_loss_mlp": 0.01052549, + "balance_loss_clip": 1.03138888, + "balance_loss_mlp": 1.05337763, + "epoch": 0.12006613557793476, + "flos": 18732191235840.0, + "grad_norm": 1.9342712291191904, + "language_loss": 0.88266122, + "learning_rate": 3.915508833793048e-06, + "loss": 0.90486217, + "num_input_tokens_seen": 43233900, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 1997, + "time_per_iteration": 2.4716532230377197 + }, + { + "auxiliary_loss_clip": 0.01167703, + "auxiliary_loss_mlp": 0.01063842, + "balance_loss_clip": 1.04287314, + "balance_loss_mlp": 1.05315256, + "epoch": 0.12012625883060274, + "flos": 22267739952000.0, + "grad_norm": 3.8633631849497054, + "language_loss": 0.78929418, + "learning_rate": 3.915396793227428e-06, + "loss": 0.81160963, + "num_input_tokens_seen": 43252105, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 1998, + "time_per_iteration": 2.4798996448516846 + }, + { + "auxiliary_loss_clip": 0.01170638, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.027318, + "balance_loss_mlp": 1.05610394, + "epoch": 0.1201863820832707, + "flos": 21758885930880.0, + "grad_norm": 2.053047413592738, + "language_loss": 0.73435485, + "learning_rate": 3.915284680029769e-06, + "loss": 0.75654793, + "num_input_tokens_seen": 43270315, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 1999, + "time_per_iteration": 2.5017611980438232 + }, + { + "auxiliary_loss_clip": 0.01169689, + "auxiliary_loss_mlp": 0.01065385, + "balance_loss_clip": 1.04436839, + "balance_loss_mlp": 1.05347967, + "epoch": 0.12024650533593867, + "flos": 21907987286400.0, + "grad_norm": 3.6093884580795677, + "language_loss": 0.74955112, + "learning_rate": 3.915172494204323e-06, + "loss": 0.77190185, + "num_input_tokens_seen": 43289935, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.15625, + "step": 2000, + "time_per_iteration": 2.5060245990753174 + }, + { + "auxiliary_loss_clip": 0.01170552, + "auxiliary_loss_mlp": 0.01050835, + "balance_loss_clip": 1.02997398, + "balance_loss_mlp": 1.05408299, + "epoch": 0.12030662858860665, + "flos": 21689219502720.0, + "grad_norm": 1.5368563042333518, + "language_loss": 0.84667969, + "learning_rate": 3.915060235755344e-06, + "loss": 0.86889356, + "num_input_tokens_seen": 43309325, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.171875, + "step": 2001, + "time_per_iteration": 2.499922752380371 + }, + { + "auxiliary_loss_clip": 0.01168457, + "auxiliary_loss_mlp": 0.0105136, + "balance_loss_clip": 1.03176236, + "balance_loss_mlp": 1.05330753, + "epoch": 0.12036675184127461, + "flos": 12933228856320.0, + "grad_norm": 2.074842616733997, + "language_loss": 0.73982531, + "learning_rate": 3.91494790468709e-06, + "loss": 0.76202351, + "num_input_tokens_seen": 43327010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.15625, + "step": 2002, + "time_per_iteration": 2.486853837966919 + }, + { + "auxiliary_loss_clip": 0.01175825, + "auxiliary_loss_mlp": 0.01058049, + "balance_loss_clip": 1.03599501, + "balance_loss_mlp": 1.05508709, + "epoch": 0.12042687509394258, + "flos": 20851028657280.0, + "grad_norm": 2.832741043586106, + "language_loss": 0.78091669, + "learning_rate": 3.9148355010038185e-06, + "loss": 0.80325544, + "num_input_tokens_seen": 43345650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.2109375, + "step": 2003, + "time_per_iteration": 2.4740982055664062 + }, + { + "auxiliary_loss_clip": 0.01166397, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02979612, + "balance_loss_mlp": 1.0521121, + "epoch": 0.12048699834661056, + "flos": 23878513451520.0, + "grad_norm": 1.9652989098821625, + "language_loss": 0.72093791, + "learning_rate": 3.914723024709793e-06, + "loss": 0.74310923, + "num_input_tokens_seen": 43365555, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2004, + "time_per_iteration": 2.5126965045928955 + }, + { + "auxiliary_loss_clip": 0.01174991, + "auxiliary_loss_mlp": 0.01061179, + "balance_loss_clip": 1.03877997, + "balance_loss_mlp": 1.0546937, + "epoch": 0.12054712159927852, + "flos": 19756363726080.0, + "grad_norm": 2.2150760255497945, + "language_loss": 0.78260767, + "learning_rate": 3.914610475809279e-06, + "loss": 0.80496937, + "num_input_tokens_seen": 43384990, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.203125, + "step": 2005, + "time_per_iteration": 2.4991190433502197 + }, + { + "auxiliary_loss_clip": 0.01056255, + "auxiliary_loss_mlp": 0.01005501, + "balance_loss_clip": 1.00284314, + "balance_loss_mlp": 1.01496482, + "epoch": 0.12060724485194649, + "flos": 51672763123200.0, + "grad_norm": 0.9233110616682776, + "language_loss": 0.58020771, + "learning_rate": 3.914497854306543e-06, + "loss": 0.60082525, + "num_input_tokens_seen": 43436335, + "router_z_loss_clip": 0.02661133, + "router_z_loss_mlp": 0.4140625, + "step": 2006, + "time_per_iteration": 2.8520798683166504 + }, + { + "auxiliary_loss_clip": 0.01165745, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.02958333, + "balance_loss_mlp": 1.05345094, + "epoch": 0.12066736810461445, + "flos": 18990425088000.0, + "grad_norm": 1.7247761793975513, + "language_loss": 0.76275218, + "learning_rate": 3.9143851602058575e-06, + "loss": 0.78490144, + "num_input_tokens_seen": 43456495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.125, + "step": 2007, + "time_per_iteration": 2.50325083732605 + }, + { + "auxiliary_loss_clip": 0.01170732, + "auxiliary_loss_mlp": 0.01058411, + "balance_loss_clip": 1.03653646, + "balance_loss_mlp": 1.05348623, + "epoch": 0.12072749135728243, + "flos": 16471973882880.0, + "grad_norm": 3.332475401193337, + "language_loss": 0.82973194, + "learning_rate": 3.914272393511494e-06, + "loss": 0.85202336, + "num_input_tokens_seen": 43473085, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2008, + "time_per_iteration": 2.4650609493255615 + }, + { + "auxiliary_loss_clip": 0.0116834, + "auxiliary_loss_mlp": 0.01054228, + "balance_loss_clip": 1.03319979, + "balance_loss_mlp": 1.05225682, + "epoch": 0.1207876146099504, + "flos": 18077108947200.0, + "grad_norm": 2.236244219024357, + "language_loss": 0.84184098, + "learning_rate": 3.91415955422773e-06, + "loss": 0.86406672, + "num_input_tokens_seen": 43491135, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2009, + "time_per_iteration": 2.4602744579315186 + }, + { + "auxiliary_loss_clip": 0.01170159, + "auxiliary_loss_mlp": 0.01053411, + "balance_loss_clip": 1.03083277, + "balance_loss_mlp": 1.0551877, + "epoch": 0.12084773786261836, + "flos": 21871573873920.0, + "grad_norm": 1.7312486930792712, + "language_loss": 0.83945864, + "learning_rate": 3.914046642358844e-06, + "loss": 0.86169434, + "num_input_tokens_seen": 43510440, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.1484375, + "step": 2010, + "time_per_iteration": 2.480238437652588 + }, + { + "auxiliary_loss_clip": 0.01171814, + "auxiliary_loss_mlp": 0.01056176, + "balance_loss_clip": 1.03437304, + "balance_loss_mlp": 1.05634403, + "epoch": 0.12090786111528634, + "flos": 18333044328960.0, + "grad_norm": 1.658807365911602, + "language_loss": 0.84157598, + "learning_rate": 3.9139336579091174e-06, + "loss": 0.8638559, + "num_input_tokens_seen": 43530145, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.15625, + "step": 2011, + "time_per_iteration": 2.454406499862671 + }, + { + "auxiliary_loss_clip": 0.01172165, + "auxiliary_loss_mlp": 0.01055064, + "balance_loss_clip": 1.03386891, + "balance_loss_mlp": 1.055547, + "epoch": 0.1209679843679543, + "flos": 21105850717440.0, + "grad_norm": 1.879921554869875, + "language_loss": 0.96007967, + "learning_rate": 3.913820600882834e-06, + "loss": 0.9823519, + "num_input_tokens_seen": 43549315, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.171875, + "step": 2012, + "time_per_iteration": 2.479583740234375 + }, + { + "auxiliary_loss_clip": 0.01166488, + "auxiliary_loss_mlp": 0.01047788, + "balance_loss_clip": 1.026914, + "balance_loss_mlp": 1.05365777, + "epoch": 0.12102810762062227, + "flos": 29241053585280.0, + "grad_norm": 2.6055417591736036, + "language_loss": 0.80619711, + "learning_rate": 3.913707471284283e-06, + "loss": 0.82833993, + "num_input_tokens_seen": 43569240, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2013, + "time_per_iteration": 2.538651704788208 + }, + { + "auxiliary_loss_clip": 0.01172968, + "auxiliary_loss_mlp": 0.0104686, + "balance_loss_clip": 1.02444816, + "balance_loss_mlp": 1.05412138, + "epoch": 0.12108823087329025, + "flos": 17930701111680.0, + "grad_norm": 3.9791821612033953, + "language_loss": 0.77157021, + "learning_rate": 3.9135942691177515e-06, + "loss": 0.79376847, + "num_input_tokens_seen": 43587710, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.1875, + "step": 2014, + "time_per_iteration": 2.4411396980285645 + }, + { + "auxiliary_loss_clip": 0.01169091, + "auxiliary_loss_mlp": 0.01046827, + "balance_loss_clip": 1.02509499, + "balance_loss_mlp": 1.05448556, + "epoch": 0.12114835412595822, + "flos": 22091850028800.0, + "grad_norm": 2.028780359370303, + "language_loss": 0.86930937, + "learning_rate": 3.913480994387535e-06, + "loss": 0.89146852, + "num_input_tokens_seen": 43606000, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2015, + "time_per_iteration": 2.4546844959259033 + }, + { + "auxiliary_loss_clip": 0.01159471, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.0268662, + "balance_loss_mlp": 1.04779112, + "epoch": 0.12120847737862618, + "flos": 20412343854720.0, + "grad_norm": 2.0866681231001762, + "language_loss": 0.69274801, + "learning_rate": 3.913367647097926e-06, + "loss": 0.71481836, + "num_input_tokens_seen": 43624815, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2016, + "time_per_iteration": 2.469177007675171 + }, + { + "auxiliary_loss_clip": 0.01169041, + "auxiliary_loss_mlp": 0.01043193, + "balance_loss_clip": 1.02042413, + "balance_loss_mlp": 1.05407953, + "epoch": 0.12126860063129415, + "flos": 22309037614080.0, + "grad_norm": 3.095255398319528, + "language_loss": 0.80049825, + "learning_rate": 3.913254227253225e-06, + "loss": 0.82262057, + "num_input_tokens_seen": 43643960, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.15625, + "step": 2017, + "time_per_iteration": 2.459447145462036 + }, + { + "auxiliary_loss_clip": 0.01168347, + "auxiliary_loss_mlp": 0.01051308, + "balance_loss_clip": 1.0292666, + "balance_loss_mlp": 1.05315137, + "epoch": 0.12132872388396213, + "flos": 13699275235200.0, + "grad_norm": 2.364451122732105, + "language_loss": 0.69343489, + "learning_rate": 3.913140734857731e-06, + "loss": 0.71563143, + "num_input_tokens_seen": 43662650, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2018, + "time_per_iteration": 3.919508695602417 + }, + { + "auxiliary_loss_clip": 0.0117102, + "auxiliary_loss_mlp": 0.0105213, + "balance_loss_clip": 1.03226995, + "balance_loss_mlp": 1.05712008, + "epoch": 0.12138884713663009, + "flos": 26466954307200.0, + "grad_norm": 2.162901456551013, + "language_loss": 0.72318506, + "learning_rate": 3.91302716991575e-06, + "loss": 0.74541652, + "num_input_tokens_seen": 43684205, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.140625, + "step": 2019, + "time_per_iteration": 3.910888433456421 + }, + { + "auxiliary_loss_clip": 0.01168573, + "auxiliary_loss_mlp": 0.01057878, + "balance_loss_clip": 1.03615856, + "balance_loss_mlp": 1.05187333, + "epoch": 0.12144897038929806, + "flos": 26141603892480.0, + "grad_norm": 1.8061721544245042, + "language_loss": 0.92484713, + "learning_rate": 3.912913532431586e-06, + "loss": 0.94711161, + "num_input_tokens_seen": 43706320, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2020, + "time_per_iteration": 2.5007998943328857 + }, + { + "auxiliary_loss_clip": 0.01168404, + "auxiliary_loss_mlp": 0.0105088, + "balance_loss_clip": 1.03064966, + "balance_loss_mlp": 1.05388308, + "epoch": 0.12150909364196603, + "flos": 24717530309760.0, + "grad_norm": 1.9478588429028871, + "language_loss": 0.77149868, + "learning_rate": 3.912799822409549e-06, + "loss": 0.79369152, + "num_input_tokens_seen": 43724805, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2021, + "time_per_iteration": 2.522216796875 + }, + { + "auxiliary_loss_clip": 0.01165897, + "auxiliary_loss_mlp": 0.01046456, + "balance_loss_clip": 1.02586901, + "balance_loss_mlp": 1.05312037, + "epoch": 0.121569216894634, + "flos": 25186990089600.0, + "grad_norm": 2.0305604143992944, + "language_loss": 0.80324662, + "learning_rate": 3.912686039853952e-06, + "loss": 0.82537007, + "num_input_tokens_seen": 43742320, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2022, + "time_per_iteration": 2.518737316131592 + }, + { + "auxiliary_loss_clip": 0.01173528, + "auxiliary_loss_mlp": 0.01051897, + "balance_loss_clip": 1.03094029, + "balance_loss_mlp": 1.057019, + "epoch": 0.12162934014730196, + "flos": 13444094039040.0, + "grad_norm": 1.9019957932594662, + "language_loss": 0.8458122, + "learning_rate": 3.912572184769108e-06, + "loss": 0.86806649, + "num_input_tokens_seen": 43760665, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2023, + "time_per_iteration": 2.4534339904785156 + }, + { + "auxiliary_loss_clip": 0.01169339, + "auxiliary_loss_mlp": 0.0104975, + "balance_loss_clip": 1.02916241, + "balance_loss_mlp": 1.05421007, + "epoch": 0.12168946339996994, + "flos": 16946138344320.0, + "grad_norm": 2.2004951084054234, + "language_loss": 0.85155022, + "learning_rate": 3.912458257159335e-06, + "loss": 0.87374109, + "num_input_tokens_seen": 43779020, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.15625, + "step": 2024, + "time_per_iteration": 2.436833143234253 + }, + { + "auxiliary_loss_clip": 0.0116415, + "auxiliary_loss_mlp": 0.010498, + "balance_loss_clip": 1.02974951, + "balance_loss_mlp": 1.04884946, + "epoch": 0.12174958665263791, + "flos": 29821585196160.0, + "grad_norm": 2.043367551334066, + "language_loss": 0.71662712, + "learning_rate": 3.912344257028954e-06, + "loss": 0.73876667, + "num_input_tokens_seen": 43798850, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.15625, + "step": 2025, + "time_per_iteration": 2.541215658187866 + }, + { + "auxiliary_loss_clip": 0.01168343, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.02564383, + "balance_loss_mlp": 1.05309796, + "epoch": 0.12180970990530587, + "flos": 24641902224000.0, + "grad_norm": 2.0848974538483755, + "language_loss": 0.75976777, + "learning_rate": 3.912230184382286e-06, + "loss": 0.7819097, + "num_input_tokens_seen": 43820130, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2026, + "time_per_iteration": 2.529049873352051 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.01045796, + "balance_loss_clip": 1.02570963, + "balance_loss_mlp": 1.05251837, + "epoch": 0.12186983315797385, + "flos": 20521691832960.0, + "grad_norm": 2.6572777094172597, + "language_loss": 0.88875067, + "learning_rate": 3.912116039223659e-06, + "loss": 0.9108817, + "num_input_tokens_seen": 43838485, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2027, + "time_per_iteration": 2.472158432006836 + }, + { + "auxiliary_loss_clip": 0.01165413, + "auxiliary_loss_mlp": 0.01052054, + "balance_loss_clip": 1.03375518, + "balance_loss_mlp": 1.05316114, + "epoch": 0.12192995641064182, + "flos": 27818344719360.0, + "grad_norm": 2.343330799439898, + "language_loss": 0.75515145, + "learning_rate": 3.912001821557399e-06, + "loss": 0.77732611, + "num_input_tokens_seen": 43859080, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.125, + "step": 2028, + "time_per_iteration": 2.5286035537719727 + }, + { + "auxiliary_loss_clip": 0.01164893, + "auxiliary_loss_mlp": 0.010582, + "balance_loss_clip": 1.03758836, + "balance_loss_mlp": 1.05089998, + "epoch": 0.12199007966330978, + "flos": 22017119783040.0, + "grad_norm": 2.270604294931249, + "language_loss": 0.766294, + "learning_rate": 3.911887531387839e-06, + "loss": 0.78852487, + "num_input_tokens_seen": 43879030, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2029, + "time_per_iteration": 2.479799747467041 + }, + { + "auxiliary_loss_clip": 0.0116289, + "auxiliary_loss_mlp": 0.01051159, + "balance_loss_clip": 1.03113246, + "balance_loss_mlp": 1.05001879, + "epoch": 0.12205020291597775, + "flos": 23295216493440.0, + "grad_norm": 2.2290592341985747, + "language_loss": 0.7955277, + "learning_rate": 3.911773168719313e-06, + "loss": 0.81766814, + "num_input_tokens_seen": 43898505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.125, + "step": 2030, + "time_per_iteration": 2.479250431060791 + }, + { + "auxiliary_loss_clip": 0.01164659, + "auxiliary_loss_mlp": 0.01054283, + "balance_loss_clip": 1.03301597, + "balance_loss_mlp": 1.0526309, + "epoch": 0.12211032616864573, + "flos": 26031609469440.0, + "grad_norm": 3.9595633959777694, + "language_loss": 0.74556369, + "learning_rate": 3.911658733556155e-06, + "loss": 0.76775312, + "num_input_tokens_seen": 43917945, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2031, + "time_per_iteration": 2.4966888427734375 + }, + { + "auxiliary_loss_clip": 0.01166064, + "auxiliary_loss_mlp": 0.01045008, + "balance_loss_clip": 1.0269599, + "balance_loss_mlp": 1.05319047, + "epoch": 0.12217044942131369, + "flos": 20410943224320.0, + "grad_norm": 1.9774178696035418, + "language_loss": 0.75045705, + "learning_rate": 3.911544225902707e-06, + "loss": 0.77256775, + "num_input_tokens_seen": 43937385, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.1328125, + "step": 2032, + "time_per_iteration": 2.4545648097991943 + }, + { + "auxiliary_loss_clip": 0.01156748, + "auxiliary_loss_mlp": 0.01043308, + "balance_loss_clip": 1.02398455, + "balance_loss_mlp": 1.04844511, + "epoch": 0.12223057267398166, + "flos": 22857142222080.0, + "grad_norm": 1.6143118682838826, + "language_loss": 0.88853258, + "learning_rate": 3.911429645763311e-06, + "loss": 0.91053319, + "num_input_tokens_seen": 43958130, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0859375, + "step": 2033, + "time_per_iteration": 2.505521535873413 + }, + { + "auxiliary_loss_clip": 0.01170793, + "auxiliary_loss_mlp": 0.01050252, + "balance_loss_clip": 1.03059459, + "balance_loss_mlp": 1.05660009, + "epoch": 0.12229069592664964, + "flos": 20047563285120.0, + "grad_norm": 2.1152048244965096, + "language_loss": 0.65517056, + "learning_rate": 3.911314993142311e-06, + "loss": 0.67738092, + "num_input_tokens_seen": 43976800, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.140625, + "step": 2034, + "time_per_iteration": 2.4647884368896484 + }, + { + "auxiliary_loss_clip": 0.01167041, + "auxiliary_loss_mlp": 0.01055195, + "balance_loss_clip": 1.03425026, + "balance_loss_mlp": 1.05399358, + "epoch": 0.1223508191793176, + "flos": 22274240313600.0, + "grad_norm": 1.59634219760927, + "language_loss": 0.76435542, + "learning_rate": 3.911200268044055e-06, + "loss": 0.78657782, + "num_input_tokens_seen": 43996620, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2035, + "time_per_iteration": 2.483016014099121 + }, + { + "auxiliary_loss_clip": 0.01169828, + "auxiliary_loss_mlp": 0.01051267, + "balance_loss_clip": 1.03104889, + "balance_loss_mlp": 1.0543201, + "epoch": 0.12241094243198557, + "flos": 21285978445440.0, + "grad_norm": 1.8316823187763973, + "language_loss": 0.71407682, + "learning_rate": 3.911085470472892e-06, + "loss": 0.73628777, + "num_input_tokens_seen": 44016175, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.15625, + "step": 2036, + "time_per_iteration": 2.476471185684204 + }, + { + "auxiliary_loss_clip": 0.01168411, + "auxiliary_loss_mlp": 0.01051825, + "balance_loss_clip": 1.0309397, + "balance_loss_mlp": 1.05532706, + "epoch": 0.12247106568465355, + "flos": 17382381022080.0, + "grad_norm": 1.632988910709452, + "language_loss": 0.83352619, + "learning_rate": 3.910970600433178e-06, + "loss": 0.85572863, + "num_input_tokens_seen": 44035060, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2037, + "time_per_iteration": 2.476040840148926 + }, + { + "auxiliary_loss_clip": 0.0117386, + "auxiliary_loss_mlp": 0.01058093, + "balance_loss_clip": 1.03625405, + "balance_loss_mlp": 1.05652785, + "epoch": 0.12253118893732151, + "flos": 27045438842880.0, + "grad_norm": 2.722283338591856, + "language_loss": 0.80255699, + "learning_rate": 3.910855657929267e-06, + "loss": 0.82487655, + "num_input_tokens_seen": 44053330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2038, + "time_per_iteration": 2.5043163299560547 + }, + { + "auxiliary_loss_clip": 0.01058546, + "auxiliary_loss_mlp": 0.01007425, + "balance_loss_clip": 1.0051837, + "balance_loss_mlp": 1.01638949, + "epoch": 0.12259131218998948, + "flos": 53861518368000.0, + "grad_norm": 0.832889593555193, + "language_loss": 0.58671033, + "learning_rate": 3.910740642965518e-06, + "loss": 0.60737002, + "num_input_tokens_seen": 44107575, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.421875, + "step": 2039, + "time_per_iteration": 2.9495608806610107 + }, + { + "auxiliary_loss_clip": 0.01172242, + "auxiliary_loss_mlp": 0.01049387, + "balance_loss_clip": 1.0277977, + "balance_loss_mlp": 1.05559754, + "epoch": 0.12265143544265744, + "flos": 17891917401600.0, + "grad_norm": 2.6229044060505298, + "language_loss": 0.80485016, + "learning_rate": 3.910625555546292e-06, + "loss": 0.82706642, + "num_input_tokens_seen": 44126075, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.171875, + "step": 2040, + "time_per_iteration": 2.4483039379119873 + }, + { + "auxiliary_loss_clip": 0.01166059, + "auxiliary_loss_mlp": 0.01050675, + "balance_loss_clip": 1.02977788, + "balance_loss_mlp": 1.05270815, + "epoch": 0.12271155869532542, + "flos": 21799932197760.0, + "grad_norm": 1.8235003945490114, + "language_loss": 0.82753873, + "learning_rate": 3.910510395675953e-06, + "loss": 0.84970617, + "num_input_tokens_seen": 44145605, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2041, + "time_per_iteration": 2.4804372787475586 + }, + { + "auxiliary_loss_clip": 0.01170766, + "auxiliary_loss_mlp": 0.01049407, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.05399048, + "epoch": 0.12277168194799339, + "flos": 19828759587840.0, + "grad_norm": 1.7522185366152092, + "language_loss": 0.66806722, + "learning_rate": 3.9103951633588694e-06, + "loss": 0.69026893, + "num_input_tokens_seen": 44164770, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2042, + "time_per_iteration": 2.4683480262756348 + }, + { + "auxiliary_loss_clip": 0.01165609, + "auxiliary_loss_mlp": 0.01051247, + "balance_loss_clip": 1.03032589, + "balance_loss_mlp": 1.05184031, + "epoch": 0.12283180520066135, + "flos": 23221024951680.0, + "grad_norm": 1.8478924147346443, + "language_loss": 0.81661081, + "learning_rate": 3.910279858599409e-06, + "loss": 0.83877933, + "num_input_tokens_seen": 44184025, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2043, + "time_per_iteration": 2.5265614986419678 + }, + { + "auxiliary_loss_clip": 0.01166463, + "auxiliary_loss_mlp": 0.01049773, + "balance_loss_clip": 1.02792168, + "balance_loss_mlp": 1.05028844, + "epoch": 0.12289192845332933, + "flos": 18588476920320.0, + "grad_norm": 2.0920421188484095, + "language_loss": 0.8049221, + "learning_rate": 3.910164481401946e-06, + "loss": 0.82708442, + "num_input_tokens_seen": 44202950, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.1640625, + "step": 2044, + "time_per_iteration": 2.45843768119812 + }, + { + "auxiliary_loss_clip": 0.0116264, + "auxiliary_loss_mlp": 0.01046352, + "balance_loss_clip": 1.02577674, + "balance_loss_mlp": 1.05169511, + "epoch": 0.1229520517059973, + "flos": 25769532862080.0, + "grad_norm": 1.7057283877293323, + "language_loss": 0.7796452, + "learning_rate": 3.910049031770853e-06, + "loss": 0.8017351, + "num_input_tokens_seen": 44221115, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.109375, + "step": 2045, + "time_per_iteration": 2.5117220878601074 + }, + { + "auxiliary_loss_clip": 0.01172524, + "auxiliary_loss_mlp": 0.01063382, + "balance_loss_clip": 1.04210341, + "balance_loss_mlp": 1.05461311, + "epoch": 0.12301217495866526, + "flos": 20887154760960.0, + "grad_norm": 2.0659302798736436, + "language_loss": 0.67135215, + "learning_rate": 3.90993350971051e-06, + "loss": 0.69371116, + "num_input_tokens_seen": 44240575, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1796875, + "step": 2046, + "time_per_iteration": 2.466304063796997 + }, + { + "auxiliary_loss_clip": 0.01166597, + "auxiliary_loss_mlp": 0.01058908, + "balance_loss_clip": 1.03793919, + "balance_loss_mlp": 1.05408335, + "epoch": 0.12307229821133324, + "flos": 22378811783040.0, + "grad_norm": 2.3143924335245654, + "language_loss": 0.72491664, + "learning_rate": 3.909817915225297e-06, + "loss": 0.7471717, + "num_input_tokens_seen": 44257145, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.125, + "step": 2047, + "time_per_iteration": 2.4625275135040283 + }, + { + "auxiliary_loss_clip": 0.01163998, + "auxiliary_loss_mlp": 0.0106421, + "balance_loss_clip": 1.04232347, + "balance_loss_mlp": 1.05105257, + "epoch": 0.1231324214640012, + "flos": 23367396873600.0, + "grad_norm": 1.6458989790549132, + "language_loss": 0.76394033, + "learning_rate": 3.909702248319597e-06, + "loss": 0.7862224, + "num_input_tokens_seen": 44278035, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2048, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01048998, + "balance_loss_clip": 1.03061616, + "balance_loss_mlp": 1.05322123, + "epoch": 0.12319254471666917, + "flos": 23767154311680.0, + "grad_norm": 2.118548028298143, + "language_loss": 0.84626836, + "learning_rate": 3.909586508997797e-06, + "loss": 0.86841822, + "num_input_tokens_seen": 44296980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.125, + "step": 2049, + "time_per_iteration": 2.538325071334839 + }, + { + "auxiliary_loss_clip": 0.01164402, + "auxiliary_loss_mlp": 0.01053692, + "balance_loss_clip": 1.0336647, + "balance_loss_mlp": 1.05051267, + "epoch": 0.12325266796933713, + "flos": 23550146294400.0, + "grad_norm": 3.176509780932849, + "language_loss": 0.75351131, + "learning_rate": 3.909470697264285e-06, + "loss": 0.77569222, + "num_input_tokens_seen": 44318005, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.140625, + "step": 2050, + "time_per_iteration": 2.499915599822998 + }, + { + "auxiliary_loss_clip": 0.0116542, + "auxiliary_loss_mlp": 0.01054604, + "balance_loss_clip": 1.03431439, + "balance_loss_mlp": 1.05127048, + "epoch": 0.12331279122200511, + "flos": 24423996366720.0, + "grad_norm": 1.9728027261326873, + "language_loss": 0.80877042, + "learning_rate": 3.909354813123452e-06, + "loss": 0.83097064, + "num_input_tokens_seen": 44335260, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.140625, + "step": 2051, + "time_per_iteration": 2.5018789768218994 + }, + { + "auxiliary_loss_clip": 0.01164273, + "auxiliary_loss_mlp": 0.01053369, + "balance_loss_clip": 1.03338933, + "balance_loss_mlp": 1.05348301, + "epoch": 0.12337291447467308, + "flos": 25484294960640.0, + "grad_norm": 1.7756923294305167, + "language_loss": 0.79991698, + "learning_rate": 3.909238856579693e-06, + "loss": 0.82209337, + "num_input_tokens_seen": 44355315, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.109375, + "step": 2052, + "time_per_iteration": 2.4962196350097656 + }, + { + "auxiliary_loss_clip": 0.01167428, + "auxiliary_loss_mlp": 0.01059063, + "balance_loss_clip": 1.03793955, + "balance_loss_mlp": 1.0515492, + "epoch": 0.12343303772734104, + "flos": 23550002640000.0, + "grad_norm": 2.071130498978609, + "language_loss": 0.73757279, + "learning_rate": 3.909122827637406e-06, + "loss": 0.75983769, + "num_input_tokens_seen": 44373020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2053, + "time_per_iteration": 2.4748997688293457 + }, + { + "auxiliary_loss_clip": 0.01164856, + "auxiliary_loss_mlp": 0.01054483, + "balance_loss_clip": 1.03337085, + "balance_loss_mlp": 1.04912996, + "epoch": 0.12349316098000902, + "flos": 47557074867840.0, + "grad_norm": 2.5139588428492408, + "language_loss": 0.73835206, + "learning_rate": 3.909006726300991e-06, + "loss": 0.76054543, + "num_input_tokens_seen": 44397525, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2054, + "time_per_iteration": 2.7009665966033936 + }, + { + "auxiliary_loss_clip": 0.01161738, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02381933, + "balance_loss_mlp": 1.04980421, + "epoch": 0.12355328423267699, + "flos": 25045969294080.0, + "grad_norm": 2.0020033330801863, + "language_loss": 0.85107529, + "learning_rate": 3.908890552574849e-06, + "loss": 0.87311363, + "num_input_tokens_seen": 44415890, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.1171875, + "step": 2055, + "time_per_iteration": 2.5038392543792725 + }, + { + "auxiliary_loss_clip": 0.01164626, + "auxiliary_loss_mlp": 0.01053304, + "balance_loss_clip": 1.03445673, + "balance_loss_mlp": 1.05093932, + "epoch": 0.12361340748534495, + "flos": 27709140395520.0, + "grad_norm": 1.9818000135561404, + "language_loss": 0.77465194, + "learning_rate": 3.908774306463384e-06, + "loss": 0.79683125, + "num_input_tokens_seen": 44436625, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.140625, + "step": 2056, + "time_per_iteration": 2.5265629291534424 + }, + { + "auxiliary_loss_clip": 0.01162241, + "auxiliary_loss_mlp": 0.01055177, + "balance_loss_clip": 1.03486395, + "balance_loss_mlp": 1.04937708, + "epoch": 0.12367353073801293, + "flos": 26140598311680.0, + "grad_norm": 1.9976131339644834, + "language_loss": 0.83188522, + "learning_rate": 3.908657987971009e-06, + "loss": 0.85405934, + "num_input_tokens_seen": 44455265, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2057, + "time_per_iteration": 2.502987861633301 + }, + { + "auxiliary_loss_clip": 0.0116756, + "auxiliary_loss_mlp": 0.01053922, + "balance_loss_clip": 1.03272629, + "balance_loss_mlp": 1.05169332, + "epoch": 0.1237336539906809, + "flos": 25156035544320.0, + "grad_norm": 1.751792200322901, + "language_loss": 0.78356105, + "learning_rate": 3.90854159710213e-06, + "loss": 0.80577588, + "num_input_tokens_seen": 44475815, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2058, + "time_per_iteration": 2.5236053466796875 + }, + { + "auxiliary_loss_clip": 0.01167574, + "auxiliary_loss_mlp": 0.01052354, + "balance_loss_clip": 1.03086066, + "balance_loss_mlp": 1.05105174, + "epoch": 0.12379377724334886, + "flos": 15304589867520.0, + "grad_norm": 2.1327254817813124, + "language_loss": 0.83191061, + "learning_rate": 3.9084251338611624e-06, + "loss": 0.85410988, + "num_input_tokens_seen": 44494045, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2059, + "time_per_iteration": 5.313246726989746 + }, + { + "auxiliary_loss_clip": 0.01169117, + "auxiliary_loss_mlp": 0.01056711, + "balance_loss_clip": 1.0344671, + "balance_loss_mlp": 1.05206418, + "epoch": 0.12385390049601683, + "flos": 21316717509120.0, + "grad_norm": 2.990324814625926, + "language_loss": 0.81387389, + "learning_rate": 3.908308598252523e-06, + "loss": 0.83613217, + "num_input_tokens_seen": 44509120, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.171875, + "step": 2060, + "time_per_iteration": 3.8617331981658936 + }, + { + "auxiliary_loss_clip": 0.01163462, + "auxiliary_loss_mlp": 0.01050537, + "balance_loss_clip": 1.02928221, + "balance_loss_mlp": 1.04859161, + "epoch": 0.1239140237486848, + "flos": 15116309752320.0, + "grad_norm": 2.0129231677956105, + "language_loss": 0.86278749, + "learning_rate": 3.9081919902806306e-06, + "loss": 0.88492751, + "num_input_tokens_seen": 44525780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2061, + "time_per_iteration": 2.4531033039093018 + }, + { + "auxiliary_loss_clip": 0.01163888, + "auxiliary_loss_mlp": 0.01045306, + "balance_loss_clip": 1.02552915, + "balance_loss_mlp": 1.05163288, + "epoch": 0.12397414700135277, + "flos": 21976791788160.0, + "grad_norm": 2.146204871859891, + "language_loss": 0.84992719, + "learning_rate": 3.908075309949906e-06, + "loss": 0.87201917, + "num_input_tokens_seen": 44543125, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.125, + "step": 2062, + "time_per_iteration": 2.475050449371338 + }, + { + "auxiliary_loss_clip": 0.01167091, + "auxiliary_loss_mlp": 0.01057701, + "balance_loss_clip": 1.03600502, + "balance_loss_mlp": 1.05348217, + "epoch": 0.12403427025402074, + "flos": 13400892956160.0, + "grad_norm": 2.194910982672458, + "language_loss": 0.78651118, + "learning_rate": 3.907958557264774e-06, + "loss": 0.80875909, + "num_input_tokens_seen": 44560275, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2063, + "time_per_iteration": 2.4638655185699463 + }, + { + "auxiliary_loss_clip": 0.01167155, + "auxiliary_loss_mlp": 0.01058063, + "balance_loss_clip": 1.03590226, + "balance_loss_mlp": 1.05330634, + "epoch": 0.12409439350668872, + "flos": 15304374385920.0, + "grad_norm": 2.133219584666701, + "language_loss": 0.79411167, + "learning_rate": 3.907841732229663e-06, + "loss": 0.81636381, + "num_input_tokens_seen": 44577640, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1328125, + "step": 2064, + "time_per_iteration": 2.4441418647766113 + }, + { + "auxiliary_loss_clip": 0.01163006, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03083503, + "balance_loss_mlp": 1.04955256, + "epoch": 0.12415451675935668, + "flos": 25009376313600.0, + "grad_norm": 2.2298036351802533, + "language_loss": 0.92358226, + "learning_rate": 3.907724834849002e-06, + "loss": 0.9457252, + "num_input_tokens_seen": 44594860, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2065, + "time_per_iteration": 2.4909794330596924 + }, + { + "auxiliary_loss_clip": 0.01166011, + "auxiliary_loss_mlp": 0.01050011, + "balance_loss_clip": 1.02880335, + "balance_loss_mlp": 1.05061674, + "epoch": 0.12421464001202465, + "flos": 23659673840640.0, + "grad_norm": 1.7134253508315578, + "language_loss": 0.8042016, + "learning_rate": 3.907607865127225e-06, + "loss": 0.82636184, + "num_input_tokens_seen": 44614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.15625, + "step": 2066, + "time_per_iteration": 2.484276056289673 + }, + { + "auxiliary_loss_clip": 0.01052523, + "auxiliary_loss_mlp": 0.0100337, + "balance_loss_clip": 1.00111723, + "balance_loss_mlp": 1.01144505, + "epoch": 0.12427476326469263, + "flos": 65732904345600.0, + "grad_norm": 0.8687209975293121, + "language_loss": 0.63275361, + "learning_rate": 3.907490823068766e-06, + "loss": 0.65331256, + "num_input_tokens_seen": 44671240, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.41015625, + "step": 2067, + "time_per_iteration": 3.0286524295806885 + }, + { + "auxiliary_loss_clip": 0.01166519, + "auxiliary_loss_mlp": 0.0105175, + "balance_loss_clip": 1.03103137, + "balance_loss_mlp": 1.05087852, + "epoch": 0.12433488651736059, + "flos": 24535427333760.0, + "grad_norm": 1.9774411847970965, + "language_loss": 0.93209147, + "learning_rate": 3.907373708678063e-06, + "loss": 0.95427418, + "num_input_tokens_seen": 44691050, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.15625, + "step": 2068, + "time_per_iteration": 2.4971697330474854 + }, + { + "auxiliary_loss_clip": 0.01167817, + "auxiliary_loss_mlp": 0.01049229, + "balance_loss_clip": 1.03079867, + "balance_loss_mlp": 1.053213, + "epoch": 0.12439500977002856, + "flos": 21031659175680.0, + "grad_norm": 1.9835561743386452, + "language_loss": 0.81277847, + "learning_rate": 3.9072565219595596e-06, + "loss": 0.83494884, + "num_input_tokens_seen": 44709850, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.1484375, + "step": 2069, + "time_per_iteration": 2.4772391319274902 + }, + { + "auxiliary_loss_clip": 0.01166777, + "auxiliary_loss_mlp": 0.01055339, + "balance_loss_clip": 1.03519261, + "balance_loss_mlp": 1.05177176, + "epoch": 0.12445513302269653, + "flos": 26830621555200.0, + "grad_norm": 1.606173275168009, + "language_loss": 0.77390277, + "learning_rate": 3.907139262917696e-06, + "loss": 0.79612398, + "num_input_tokens_seen": 44731475, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1484375, + "step": 2070, + "time_per_iteration": 2.4962410926818848 + }, + { + "auxiliary_loss_clip": 0.01172971, + "auxiliary_loss_mlp": 0.01046497, + "balance_loss_clip": 1.02598071, + "balance_loss_mlp": 1.05637431, + "epoch": 0.1245152562753645, + "flos": 18368919037440.0, + "grad_norm": 2.418044156181854, + "language_loss": 0.80847198, + "learning_rate": 3.907021931556922e-06, + "loss": 0.83066666, + "num_input_tokens_seen": 44749685, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1640625, + "step": 2071, + "time_per_iteration": 2.452148199081421 + }, + { + "auxiliary_loss_clip": 0.01162159, + "auxiliary_loss_mlp": 0.01051578, + "balance_loss_clip": 1.03063262, + "balance_loss_mlp": 1.05134583, + "epoch": 0.12457537952803246, + "flos": 33107986200960.0, + "grad_norm": 1.802846280579791, + "language_loss": 0.77933639, + "learning_rate": 3.906904527881684e-06, + "loss": 0.80147374, + "num_input_tokens_seen": 44772165, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2072, + "time_per_iteration": 2.5763509273529053 + }, + { + "auxiliary_loss_clip": 0.01166298, + "auxiliary_loss_mlp": 0.01054628, + "balance_loss_clip": 1.03480363, + "balance_loss_mlp": 1.05423427, + "epoch": 0.12463550278070043, + "flos": 22270217990400.0, + "grad_norm": 2.6278132513508976, + "language_loss": 0.74839735, + "learning_rate": 3.9067870518964355e-06, + "loss": 0.77060658, + "num_input_tokens_seen": 44790580, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.125, + "step": 2073, + "time_per_iteration": 2.4676945209503174 + }, + { + "auxiliary_loss_clip": 0.01162135, + "auxiliary_loss_mlp": 0.01050014, + "balance_loss_clip": 1.02904546, + "balance_loss_mlp": 1.04915833, + "epoch": 0.12469562603336841, + "flos": 14679025580160.0, + "grad_norm": 1.9457561725453951, + "language_loss": 0.90556443, + "learning_rate": 3.906669503605631e-06, + "loss": 0.92768592, + "num_input_tokens_seen": 44806730, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2074, + "time_per_iteration": 2.4873156547546387 + }, + { + "auxiliary_loss_clip": 0.01168793, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.02843285, + "balance_loss_mlp": 1.05183172, + "epoch": 0.12475574928603637, + "flos": 24644775312000.0, + "grad_norm": 2.3814572559525877, + "language_loss": 0.83753067, + "learning_rate": 3.906551883013728e-06, + "loss": 0.85972917, + "num_input_tokens_seen": 44825550, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.171875, + "step": 2075, + "time_per_iteration": 2.500657320022583 + }, + { + "auxiliary_loss_clip": 0.01164838, + "auxiliary_loss_mlp": 0.01056086, + "balance_loss_clip": 1.0341754, + "balance_loss_mlp": 1.05080831, + "epoch": 0.12481587253870434, + "flos": 21762980081280.0, + "grad_norm": 2.1638910845289567, + "language_loss": 0.73802024, + "learning_rate": 3.9064341901251865e-06, + "loss": 0.76022947, + "num_input_tokens_seen": 44844155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2076, + "time_per_iteration": 2.5686564445495605 + }, + { + "auxiliary_loss_clip": 0.01161577, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.05219531, + "epoch": 0.12487599579137232, + "flos": 21432529935360.0, + "grad_norm": 1.967733683791653, + "language_loss": 0.7551648, + "learning_rate": 3.906316424944469e-06, + "loss": 0.77721083, + "num_input_tokens_seen": 44863780, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.09375, + "step": 2077, + "time_per_iteration": 2.489954710006714 + }, + { + "auxiliary_loss_clip": 0.01163633, + "auxiliary_loss_mlp": 0.0105265, + "balance_loss_clip": 1.03104901, + "balance_loss_mlp": 1.05015802, + "epoch": 0.12493611904404028, + "flos": 16107624276480.0, + "grad_norm": 4.043491061132511, + "language_loss": 0.82077563, + "learning_rate": 3.906198587476043e-06, + "loss": 0.84293842, + "num_input_tokens_seen": 44881480, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1328125, + "step": 2078, + "time_per_iteration": 2.445270299911499 + }, + { + "auxiliary_loss_clip": 0.01168396, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02629507, + "balance_loss_mlp": 1.05372512, + "epoch": 0.12499624229670825, + "flos": 21580266574080.0, + "grad_norm": 2.023726857078381, + "language_loss": 0.75024784, + "learning_rate": 3.906080677724374e-06, + "loss": 0.77240789, + "num_input_tokens_seen": 44900390, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1484375, + "step": 2079, + "time_per_iteration": 2.4694364070892334 + }, + { + "auxiliary_loss_clip": 0.01173002, + "auxiliary_loss_mlp": 0.01056904, + "balance_loss_clip": 1.03578043, + "balance_loss_mlp": 1.05697465, + "epoch": 0.1250563655493762, + "flos": 25699040421120.0, + "grad_norm": 2.9314739831996124, + "language_loss": 0.83961046, + "learning_rate": 3.905962695693935e-06, + "loss": 0.86190951, + "num_input_tokens_seen": 44920375, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2080, + "time_per_iteration": 2.52955961227417 + }, + { + "auxiliary_loss_clip": 0.01166024, + "auxiliary_loss_mlp": 0.0105753, + "balance_loss_clip": 1.0364058, + "balance_loss_mlp": 1.05275226, + "epoch": 0.12511648880204418, + "flos": 16909509450240.0, + "grad_norm": 2.0357346796271307, + "language_loss": 0.84575123, + "learning_rate": 3.9058446413892e-06, + "loss": 0.8679868, + "num_input_tokens_seen": 44938415, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1328125, + "step": 2081, + "time_per_iteration": 2.4380433559417725 + }, + { + "auxiliary_loss_clip": 0.0116507, + "auxiliary_loss_mlp": 0.01044581, + "balance_loss_clip": 1.02430391, + "balance_loss_mlp": 1.05154538, + "epoch": 0.12517661205471217, + "flos": 17567500740480.0, + "grad_norm": 1.660916229819668, + "language_loss": 0.76882648, + "learning_rate": 3.905726514814646e-06, + "loss": 0.790923, + "num_input_tokens_seen": 44957135, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2082, + "time_per_iteration": 2.454939842224121 + }, + { + "auxiliary_loss_clip": 0.01182882, + "auxiliary_loss_mlp": 0.01052846, + "balance_loss_clip": 1.03117347, + "balance_loss_mlp": 1.06035674, + "epoch": 0.12523673530738014, + "flos": 16033791870720.0, + "grad_norm": 2.833832134330164, + "language_loss": 0.78994107, + "learning_rate": 3.9056083159747495e-06, + "loss": 0.81229836, + "num_input_tokens_seen": 44974480, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.2265625, + "step": 2083, + "time_per_iteration": 2.4439167976379395 + }, + { + "auxiliary_loss_clip": 0.01168103, + "auxiliary_loss_mlp": 0.01051445, + "balance_loss_clip": 1.02855682, + "balance_loss_mlp": 1.05132031, + "epoch": 0.1252968585600481, + "flos": 18807747494400.0, + "grad_norm": 2.376124844090109, + "language_loss": 0.89690113, + "learning_rate": 3.9054900448739966e-06, + "loss": 0.91909659, + "num_input_tokens_seen": 44990310, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2084, + "time_per_iteration": 2.492274045944214 + }, + { + "auxiliary_loss_clip": 0.01168755, + "auxiliary_loss_mlp": 0.01049772, + "balance_loss_clip": 1.02876747, + "balance_loss_mlp": 1.05379784, + "epoch": 0.12535698181271607, + "flos": 27271568914560.0, + "grad_norm": 1.9059704425119062, + "language_loss": 0.79718572, + "learning_rate": 3.905371701516869e-06, + "loss": 0.81937099, + "num_input_tokens_seen": 45010720, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1484375, + "step": 2085, + "time_per_iteration": 2.5295538902282715 + }, + { + "auxiliary_loss_clip": 0.0116658, + "auxiliary_loss_mlp": 0.01051649, + "balance_loss_clip": 1.03011954, + "balance_loss_mlp": 1.05235541, + "epoch": 0.12541710506538403, + "flos": 22054107813120.0, + "grad_norm": 1.9580642243137214, + "language_loss": 0.88227898, + "learning_rate": 3.905253285907856e-06, + "loss": 0.90446126, + "num_input_tokens_seen": 45030360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2086, + "time_per_iteration": 2.4508614540100098 + }, + { + "auxiliary_loss_clip": 0.01162238, + "auxiliary_loss_mlp": 0.01045013, + "balance_loss_clip": 1.02541506, + "balance_loss_mlp": 1.05238986, + "epoch": 0.125477228318052, + "flos": 12603173760000.0, + "grad_norm": 2.3707303368435957, + "language_loss": 0.87088495, + "learning_rate": 3.905134798051447e-06, + "loss": 0.89295745, + "num_input_tokens_seen": 45045085, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2087, + "time_per_iteration": 2.4342494010925293 + }, + { + "auxiliary_loss_clip": 0.01166252, + "auxiliary_loss_mlp": 0.01056999, + "balance_loss_clip": 1.03444421, + "balance_loss_mlp": 1.05230761, + "epoch": 0.12553735157071996, + "flos": 23878549365120.0, + "grad_norm": 3.239876707553976, + "language_loss": 0.73480451, + "learning_rate": 3.905016237952136e-06, + "loss": 0.75703704, + "num_input_tokens_seen": 45065145, + "router_z_loss_clip": 0.22558594, + "router_z_loss_mlp": 1.140625, + "step": 2088, + "time_per_iteration": 2.4926228523254395 + }, + { + "auxiliary_loss_clip": 0.01053685, + "auxiliary_loss_mlp": 0.01004858, + "balance_loss_clip": 1.00259304, + "balance_loss_mlp": 1.01231122, + "epoch": 0.12559747482338796, + "flos": 69920841830400.0, + "grad_norm": 0.759594920780347, + "language_loss": 0.61699253, + "learning_rate": 3.904897605614418e-06, + "loss": 0.63757795, + "num_input_tokens_seen": 45126230, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.4140625, + "step": 2089, + "time_per_iteration": 3.0373222827911377 + }, + { + "auxiliary_loss_clip": 0.01165987, + "auxiliary_loss_mlp": 0.01057326, + "balance_loss_clip": 1.03522468, + "balance_loss_mlp": 1.05317736, + "epoch": 0.12565759807605592, + "flos": 24279563779200.0, + "grad_norm": 2.0159960445234746, + "language_loss": 0.78266793, + "learning_rate": 3.904778901042793e-06, + "loss": 0.80490106, + "num_input_tokens_seen": 45145545, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.125, + "step": 2090, + "time_per_iteration": 2.5307860374450684 + }, + { + "auxiliary_loss_clip": 0.01051163, + "auxiliary_loss_mlp": 0.01005786, + "balance_loss_clip": 1.00381935, + "balance_loss_mlp": 1.01062346, + "epoch": 0.12571772132872389, + "flos": 56451180286080.0, + "grad_norm": 0.749206069507312, + "language_loss": 0.59394926, + "learning_rate": 3.90466012424176e-06, + "loss": 0.61451876, + "num_input_tokens_seen": 45206845, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.40625, + "step": 2091, + "time_per_iteration": 2.976081609725952 + }, + { + "auxiliary_loss_clip": 0.01166574, + "auxiliary_loss_mlp": 0.0105012, + "balance_loss_clip": 1.03016472, + "balance_loss_mlp": 1.0538522, + "epoch": 0.12577784458139185, + "flos": 41245846675200.0, + "grad_norm": 1.8692826570762828, + "language_loss": 0.63588953, + "learning_rate": 3.904541275215825e-06, + "loss": 0.6580565, + "num_input_tokens_seen": 45228495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2092, + "time_per_iteration": 2.633054733276367 + }, + { + "auxiliary_loss_clip": 0.01169654, + "auxiliary_loss_mlp": 0.01059319, + "balance_loss_clip": 1.03770638, + "balance_loss_mlp": 1.05095637, + "epoch": 0.12583796783405982, + "flos": 19755501799680.0, + "grad_norm": 3.3800613541528257, + "language_loss": 0.80149096, + "learning_rate": 3.904422353969493e-06, + "loss": 0.82378066, + "num_input_tokens_seen": 45245720, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1875, + "step": 2093, + "time_per_iteration": 2.4769086837768555 + }, + { + "auxiliary_loss_clip": 0.01166637, + "auxiliary_loss_mlp": 0.01065148, + "balance_loss_clip": 1.04385769, + "balance_loss_mlp": 1.05323935, + "epoch": 0.12589809108672778, + "flos": 22602104680320.0, + "grad_norm": 1.7179534274341421, + "language_loss": 0.75928843, + "learning_rate": 3.904303360507276e-06, + "loss": 0.78160632, + "num_input_tokens_seen": 45265650, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2094, + "time_per_iteration": 2.4775569438934326 + }, + { + "auxiliary_loss_clip": 0.01163905, + "auxiliary_loss_mlp": 0.01053098, + "balance_loss_clip": 1.03322637, + "balance_loss_mlp": 1.05116057, + "epoch": 0.12595821433939577, + "flos": 45222845541120.0, + "grad_norm": 1.654740537988477, + "language_loss": 0.76833487, + "learning_rate": 3.9041842948336835e-06, + "loss": 0.79050487, + "num_input_tokens_seen": 45287790, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2095, + "time_per_iteration": 2.669593095779419 + }, + { + "auxiliary_loss_clip": 0.01166425, + "auxiliary_loss_mlp": 0.01064344, + "balance_loss_clip": 1.04330409, + "balance_loss_mlp": 1.05012596, + "epoch": 0.12601833759206374, + "flos": 14319811618560.0, + "grad_norm": 2.7658625824396568, + "language_loss": 0.8312341, + "learning_rate": 3.904065156953232e-06, + "loss": 0.85354173, + "num_input_tokens_seen": 45305720, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2096, + "time_per_iteration": 2.446169853210449 + }, + { + "auxiliary_loss_clip": 0.01167307, + "auxiliary_loss_mlp": 0.0105403, + "balance_loss_clip": 1.03317988, + "balance_loss_mlp": 1.05236387, + "epoch": 0.1260784608447317, + "flos": 21288241002240.0, + "grad_norm": 1.9365429623482773, + "language_loss": 0.7532599, + "learning_rate": 3.903945946870439e-06, + "loss": 0.77547324, + "num_input_tokens_seen": 45325290, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1484375, + "step": 2097, + "time_per_iteration": 2.46520733833313 + }, + { + "auxiliary_loss_clip": 0.0116818, + "auxiliary_loss_mlp": 0.0105919, + "balance_loss_clip": 1.0399375, + "balance_loss_mlp": 1.05366278, + "epoch": 0.12613858409739967, + "flos": 26251311006720.0, + "grad_norm": 2.0415683165998004, + "language_loss": 0.8696878, + "learning_rate": 3.9038266645898246e-06, + "loss": 0.89196146, + "num_input_tokens_seen": 45344465, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1484375, + "step": 2098, + "time_per_iteration": 2.488985061645508 + }, + { + "auxiliary_loss_clip": 0.01171506, + "auxiliary_loss_mlp": 0.01063691, + "balance_loss_clip": 1.03984964, + "balance_loss_mlp": 1.05263424, + "epoch": 0.12619870735006763, + "flos": 21579979265280.0, + "grad_norm": 1.8810788789855342, + "language_loss": 0.69538295, + "learning_rate": 3.903707310115912e-06, + "loss": 0.71773493, + "num_input_tokens_seen": 45362465, + "router_z_loss_clip": 0.23828125, + "router_z_loss_mlp": 1.1875, + "step": 2099, + "time_per_iteration": 2.4791061878204346 + }, + { + "auxiliary_loss_clip": 0.01167442, + "auxiliary_loss_mlp": 0.01058165, + "balance_loss_clip": 1.03538442, + "balance_loss_mlp": 1.05016196, + "epoch": 0.1262588306027356, + "flos": 23367037737600.0, + "grad_norm": 3.489186386071109, + "language_loss": 0.81622505, + "learning_rate": 3.903587883453228e-06, + "loss": 0.83848113, + "num_input_tokens_seen": 45382700, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.171875, + "step": 2100, + "time_per_iteration": 2.4970083236694336 + }, + { + "auxiliary_loss_clip": 0.01170444, + "auxiliary_loss_mlp": 0.01056399, + "balance_loss_clip": 1.03558493, + "balance_loss_mlp": 1.05375385, + "epoch": 0.12631895385540357, + "flos": 23949185460480.0, + "grad_norm": 21.240028764463403, + "language_loss": 0.80653214, + "learning_rate": 3.903468384606302e-06, + "loss": 0.82880062, + "num_input_tokens_seen": 45401005, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.1640625, + "step": 2101, + "time_per_iteration": 5.441275596618652 + }, + { + "auxiliary_loss_clip": 0.01053889, + "auxiliary_loss_mlp": 0.01009667, + "balance_loss_clip": 1.00753367, + "balance_loss_mlp": 1.01423335, + "epoch": 0.12637907710807156, + "flos": 70282138780800.0, + "grad_norm": 0.7055092704674581, + "language_loss": 0.57077372, + "learning_rate": 3.903348813579662e-06, + "loss": 0.59140933, + "num_input_tokens_seen": 45466555, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.39648438, + "step": 2102, + "time_per_iteration": 4.4595959186553955 + }, + { + "auxiliary_loss_clip": 0.01172982, + "auxiliary_loss_mlp": 0.0105633, + "balance_loss_clip": 1.03513408, + "balance_loss_mlp": 1.05443108, + "epoch": 0.12643920036073952, + "flos": 18915084311040.0, + "grad_norm": 1.9163731362545673, + "language_loss": 0.93033105, + "learning_rate": 3.903229170377845e-06, + "loss": 0.9526242, + "num_input_tokens_seen": 45485165, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1875, + "step": 2103, + "time_per_iteration": 2.4612908363342285 + }, + { + "auxiliary_loss_clip": 0.01160763, + "auxiliary_loss_mlp": 0.01039063, + "balance_loss_clip": 1.01929784, + "balance_loss_mlp": 1.05146646, + "epoch": 0.1264993236134075, + "flos": 27782470010880.0, + "grad_norm": 1.70771861982282, + "language_loss": 0.7804687, + "learning_rate": 3.903109455005387e-06, + "loss": 0.80246699, + "num_input_tokens_seen": 45504630, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2104, + "time_per_iteration": 2.556351661682129 + }, + { + "auxiliary_loss_clip": 0.01173017, + "auxiliary_loss_mlp": 0.01056721, + "balance_loss_clip": 1.03659892, + "balance_loss_mlp": 1.05698192, + "epoch": 0.12655944686607545, + "flos": 24754697907840.0, + "grad_norm": 1.9983303318130716, + "language_loss": 0.81274837, + "learning_rate": 3.902989667466828e-06, + "loss": 0.83504581, + "num_input_tokens_seen": 45524885, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.15625, + "step": 2105, + "time_per_iteration": 2.4998059272766113 + }, + { + "auxiliary_loss_clip": 0.01177911, + "auxiliary_loss_mlp": 0.01057389, + "balance_loss_clip": 1.03515697, + "balance_loss_mlp": 1.05756688, + "epoch": 0.12661957011874342, + "flos": 24133048202880.0, + "grad_norm": 2.6618923007939728, + "language_loss": 0.83258855, + "learning_rate": 3.90286980776671e-06, + "loss": 0.85494161, + "num_input_tokens_seen": 45545000, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.203125, + "step": 2106, + "time_per_iteration": 2.4816856384277344 + }, + { + "auxiliary_loss_clip": 0.01170292, + "auxiliary_loss_mlp": 0.01048713, + "balance_loss_clip": 1.02755296, + "balance_loss_mlp": 1.05664992, + "epoch": 0.12667969337141138, + "flos": 24569614103040.0, + "grad_norm": 2.017673348074064, + "language_loss": 0.73717511, + "learning_rate": 3.902749875909578e-06, + "loss": 0.75936514, + "num_input_tokens_seen": 45564210, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2107, + "time_per_iteration": 2.503575325012207 + }, + { + "auxiliary_loss_clip": 0.01166119, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.02683651, + "balance_loss_mlp": 1.05330598, + "epoch": 0.12673981662407935, + "flos": 22961677777920.0, + "grad_norm": 2.8409726657459213, + "language_loss": 0.79492414, + "learning_rate": 3.90262987189998e-06, + "loss": 0.81705213, + "num_input_tokens_seen": 45583030, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2108, + "time_per_iteration": 2.448009967803955 + }, + { + "auxiliary_loss_clip": 0.01168328, + "auxiliary_loss_mlp": 0.01048086, + "balance_loss_clip": 1.02635407, + "balance_loss_mlp": 1.05213785, + "epoch": 0.12679993987674734, + "flos": 17274864637440.0, + "grad_norm": 2.700834997101356, + "language_loss": 0.75458848, + "learning_rate": 3.902509795742467e-06, + "loss": 0.77675259, + "num_input_tokens_seen": 45602265, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2109, + "time_per_iteration": 2.463996171951294 + }, + { + "auxiliary_loss_clip": 0.01165378, + "auxiliary_loss_mlp": 0.01046335, + "balance_loss_clip": 1.02641523, + "balance_loss_mlp": 1.05309939, + "epoch": 0.1268600631294153, + "flos": 17275080119040.0, + "grad_norm": 5.620565406896926, + "language_loss": 0.82876229, + "learning_rate": 3.902389647441592e-06, + "loss": 0.85087943, + "num_input_tokens_seen": 45620595, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2110, + "time_per_iteration": 2.4536476135253906 + }, + { + "auxiliary_loss_clip": 0.01166918, + "auxiliary_loss_mlp": 0.01055332, + "balance_loss_clip": 1.03271818, + "balance_loss_mlp": 1.0524385, + "epoch": 0.12692018638208327, + "flos": 24061047390720.0, + "grad_norm": 1.8108257578185059, + "language_loss": 0.78553301, + "learning_rate": 3.90226942700191e-06, + "loss": 0.80775553, + "num_input_tokens_seen": 45641140, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.140625, + "step": 2111, + "time_per_iteration": 2.4898500442504883 + }, + { + "auxiliary_loss_clip": 0.01178398, + "auxiliary_loss_mlp": 0.01069762, + "balance_loss_clip": 1.04634905, + "balance_loss_mlp": 1.05599511, + "epoch": 0.12698030963475124, + "flos": 31831900652160.0, + "grad_norm": 2.2255287569010567, + "language_loss": 0.76852119, + "learning_rate": 3.902149134427982e-06, + "loss": 0.79100275, + "num_input_tokens_seen": 45662315, + "router_z_loss_clip": 0.234375, + "router_z_loss_mlp": 1.2265625, + "step": 2112, + "time_per_iteration": 2.534062623977661 + }, + { + "auxiliary_loss_clip": 0.0116691, + "auxiliary_loss_mlp": 0.01060346, + "balance_loss_clip": 1.03878117, + "balance_loss_mlp": 1.05138493, + "epoch": 0.1270404328874192, + "flos": 25187744275200.0, + "grad_norm": 1.901101750436338, + "language_loss": 0.85764933, + "learning_rate": 3.902028769724367e-06, + "loss": 0.87992191, + "num_input_tokens_seen": 45680335, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.15625, + "step": 2113, + "time_per_iteration": 2.4980924129486084 + }, + { + "auxiliary_loss_clip": 0.01166421, + "auxiliary_loss_mlp": 0.01057595, + "balance_loss_clip": 1.03581548, + "balance_loss_mlp": 1.05287683, + "epoch": 0.12710055614008717, + "flos": 15997342544640.0, + "grad_norm": 2.270588429793272, + "language_loss": 0.74000478, + "learning_rate": 3.9019083328956315e-06, + "loss": 0.76224494, + "num_input_tokens_seen": 45696240, + "router_z_loss_clip": 0.21777344, + "router_z_loss_mlp": 1.1328125, + "step": 2114, + "time_per_iteration": 2.422631025314331 + }, + { + "auxiliary_loss_clip": 0.01170563, + "auxiliary_loss_mlp": 0.01057942, + "balance_loss_clip": 1.03504217, + "balance_loss_mlp": 1.05601084, + "epoch": 0.12716067939275516, + "flos": 15085642515840.0, + "grad_norm": 1.7902572486589996, + "language_loss": 0.83236456, + "learning_rate": 3.901787823946341e-06, + "loss": 0.85464966, + "num_input_tokens_seen": 45713695, + "router_z_loss_clip": 0.22851562, + "router_z_loss_mlp": 1.1484375, + "step": 2115, + "time_per_iteration": 2.4601340293884277 + }, + { + "auxiliary_loss_clip": 0.01169954, + "auxiliary_loss_mlp": 0.01060607, + "balance_loss_clip": 1.03953075, + "balance_loss_mlp": 1.05397201, + "epoch": 0.12722080264542313, + "flos": 28366736636160.0, + "grad_norm": 1.532692301262898, + "language_loss": 0.86615002, + "learning_rate": 3.901667242881065e-06, + "loss": 0.88845563, + "num_input_tokens_seen": 45736655, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1640625, + "step": 2116, + "time_per_iteration": 2.5315732955932617 + }, + { + "auxiliary_loss_clip": 0.01164638, + "auxiliary_loss_mlp": 0.01050843, + "balance_loss_clip": 1.03062534, + "balance_loss_mlp": 1.05188024, + "epoch": 0.1272809258980911, + "flos": 32379897519360.0, + "grad_norm": 1.8525451323112498, + "language_loss": 0.70492947, + "learning_rate": 3.9015465897043775e-06, + "loss": 0.72708428, + "num_input_tokens_seen": 45758195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2117, + "time_per_iteration": 2.6213905811309814 + }, + { + "auxiliary_loss_clip": 0.01168229, + "auxiliary_loss_mlp": 0.01055103, + "balance_loss_clip": 1.03346658, + "balance_loss_mlp": 1.05461121, + "epoch": 0.12734104915075906, + "flos": 16034402401920.0, + "grad_norm": 2.4058915352959294, + "language_loss": 0.86858076, + "learning_rate": 3.901425864420852e-06, + "loss": 0.89081407, + "num_input_tokens_seen": 45774280, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.140625, + "step": 2118, + "time_per_iteration": 2.4760360717773438 + }, + { + "auxiliary_loss_clip": 0.01164532, + "auxiliary_loss_mlp": 0.01048268, + "balance_loss_clip": 1.0279547, + "balance_loss_mlp": 1.0518508, + "epoch": 0.12740117240342702, + "flos": 18260325244800.0, + "grad_norm": 1.7933295144796901, + "language_loss": 0.87325591, + "learning_rate": 3.901305067035068e-06, + "loss": 0.89538383, + "num_input_tokens_seen": 45792760, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2119, + "time_per_iteration": 2.547213315963745 + }, + { + "auxiliary_loss_clip": 0.01167828, + "auxiliary_loss_mlp": 0.01051741, + "balance_loss_clip": 1.03024805, + "balance_loss_mlp": 1.05369782, + "epoch": 0.127461295656095, + "flos": 12121790664960.0, + "grad_norm": 2.4444945117671018, + "language_loss": 0.8769815, + "learning_rate": 3.901184197551605e-06, + "loss": 0.89917719, + "num_input_tokens_seen": 45804300, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.140625, + "step": 2120, + "time_per_iteration": 2.4568872451782227 + }, + { + "auxiliary_loss_clip": 0.01169401, + "auxiliary_loss_mlp": 0.0104623, + "balance_loss_clip": 1.02553487, + "balance_loss_mlp": 1.05405664, + "epoch": 0.12752141890876295, + "flos": 23149095966720.0, + "grad_norm": 1.8558714180118523, + "language_loss": 0.75193042, + "learning_rate": 3.901063255975046e-06, + "loss": 0.77408671, + "num_input_tokens_seen": 45823780, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2121, + "time_per_iteration": 2.508117437362671 + }, + { + "auxiliary_loss_clip": 0.01167335, + "auxiliary_loss_mlp": 0.01050063, + "balance_loss_clip": 1.02895081, + "balance_loss_mlp": 1.05228865, + "epoch": 0.12758154216143094, + "flos": 21615997628160.0, + "grad_norm": 2.458066848563671, + "language_loss": 0.8294577, + "learning_rate": 3.900942242309978e-06, + "loss": 0.8516317, + "num_input_tokens_seen": 45840495, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2122, + "time_per_iteration": 2.4878990650177 + }, + { + "auxiliary_loss_clip": 0.01168476, + "auxiliary_loss_mlp": 0.01050394, + "balance_loss_clip": 1.02924609, + "balance_loss_mlp": 1.05379128, + "epoch": 0.1276416654140989, + "flos": 15924874855680.0, + "grad_norm": 2.1208761223769375, + "language_loss": 0.79040462, + "learning_rate": 3.90082115656099e-06, + "loss": 0.81259328, + "num_input_tokens_seen": 45857735, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2123, + "time_per_iteration": 2.512085199356079 + }, + { + "auxiliary_loss_clip": 0.0117181, + "auxiliary_loss_mlp": 0.01056255, + "balance_loss_clip": 1.03411841, + "balance_loss_mlp": 1.05565643, + "epoch": 0.12770178866676687, + "flos": 22382690451840.0, + "grad_norm": 1.7846776317234667, + "language_loss": 0.79227948, + "learning_rate": 3.900699998732673e-06, + "loss": 0.81456017, + "num_input_tokens_seen": 45876485, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1640625, + "step": 2124, + "time_per_iteration": 2.4865264892578125 + }, + { + "auxiliary_loss_clip": 0.01168084, + "auxiliary_loss_mlp": 0.01054179, + "balance_loss_clip": 1.03267348, + "balance_loss_mlp": 1.05149364, + "epoch": 0.12776191191943484, + "flos": 21652482867840.0, + "grad_norm": 2.8175561910153215, + "language_loss": 0.75565529, + "learning_rate": 3.900578768829623e-06, + "loss": 0.77787793, + "num_input_tokens_seen": 45894645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.1640625, + "step": 2125, + "time_per_iteration": 2.514455795288086 + }, + { + "auxiliary_loss_clip": 0.01166899, + "auxiliary_loss_mlp": 0.01047376, + "balance_loss_clip": 1.02645469, + "balance_loss_mlp": 1.05262208, + "epoch": 0.1278220351721028, + "flos": 25735561574400.0, + "grad_norm": 2.1990589160087493, + "language_loss": 0.77811432, + "learning_rate": 3.900457466856434e-06, + "loss": 0.80025709, + "num_input_tokens_seen": 45913755, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2126, + "time_per_iteration": 2.556657075881958 + }, + { + "auxiliary_loss_clip": 0.01167875, + "auxiliary_loss_mlp": 0.01050746, + "balance_loss_clip": 1.03124356, + "balance_loss_mlp": 1.05559683, + "epoch": 0.12788215842477077, + "flos": 41243224982400.0, + "grad_norm": 1.702389562623477, + "language_loss": 0.69255161, + "learning_rate": 3.9003360928177085e-06, + "loss": 0.71473777, + "num_input_tokens_seen": 45936095, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2127, + "time_per_iteration": 2.629990339279175 + }, + { + "auxiliary_loss_clip": 0.01050691, + "auxiliary_loss_mlp": 0.01005275, + "balance_loss_clip": 1.00326061, + "balance_loss_mlp": 1.01139402, + "epoch": 0.12794228167743876, + "flos": 70877430881280.0, + "grad_norm": 0.8552720802624753, + "language_loss": 0.62738979, + "learning_rate": 3.900214646718047e-06, + "loss": 0.64794946, + "num_input_tokens_seen": 46004655, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.39257812, + "step": 2128, + "time_per_iteration": 3.1237356662750244 + }, + { + "auxiliary_loss_clip": 0.01168478, + "auxiliary_loss_mlp": 0.01048721, + "balance_loss_clip": 1.02646422, + "balance_loss_mlp": 1.05287039, + "epoch": 0.12800240493010673, + "flos": 16289727252480.0, + "grad_norm": 2.3711218915030368, + "language_loss": 0.77148604, + "learning_rate": 3.900093128562056e-06, + "loss": 0.79365802, + "num_input_tokens_seen": 46023610, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2129, + "time_per_iteration": 2.4499564170837402 + }, + { + "auxiliary_loss_clip": 0.01179121, + "auxiliary_loss_mlp": 0.01052089, + "balance_loss_clip": 1.02902186, + "balance_loss_mlp": 1.05744195, + "epoch": 0.1280625281827747, + "flos": 20631542601600.0, + "grad_norm": 2.273395516882369, + "language_loss": 0.79321349, + "learning_rate": 3.899971538354343e-06, + "loss": 0.81552559, + "num_input_tokens_seen": 46041725, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.21875, + "step": 2130, + "time_per_iteration": 2.4536893367767334 + }, + { + "auxiliary_loss_clip": 0.0116812, + "auxiliary_loss_mlp": 0.01044456, + "balance_loss_clip": 1.02463198, + "balance_loss_mlp": 1.05328345, + "epoch": 0.12812265143544266, + "flos": 22638230784000.0, + "grad_norm": 2.267455405666958, + "language_loss": 0.70879477, + "learning_rate": 3.899849876099518e-06, + "loss": 0.73092055, + "num_input_tokens_seen": 46061095, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1484375, + "step": 2131, + "time_per_iteration": 2.514155149459839 + }, + { + "auxiliary_loss_clip": 0.01166691, + "auxiliary_loss_mlp": 0.01051797, + "balance_loss_clip": 1.03007698, + "balance_loss_mlp": 1.05375445, + "epoch": 0.12818277468811062, + "flos": 34714701463680.0, + "grad_norm": 2.2952793086030376, + "language_loss": 0.72266257, + "learning_rate": 3.899728141802197e-06, + "loss": 0.74484742, + "num_input_tokens_seen": 46082670, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2132, + "time_per_iteration": 2.5662834644317627 + }, + { + "auxiliary_loss_clip": 0.01163765, + "auxiliary_loss_mlp": 0.01054914, + "balance_loss_clip": 1.03396928, + "balance_loss_mlp": 1.05281162, + "epoch": 0.1282428979407786, + "flos": 23112107936640.0, + "grad_norm": 2.1162344308699828, + "language_loss": 0.82306767, + "learning_rate": 3.8996063354669935e-06, + "loss": 0.84525442, + "num_input_tokens_seen": 46102410, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2133, + "time_per_iteration": 2.488302230834961 + }, + { + "auxiliary_loss_clip": 0.01174206, + "auxiliary_loss_mlp": 0.01061813, + "balance_loss_clip": 1.03871, + "balance_loss_mlp": 1.05329132, + "epoch": 0.12830302119344655, + "flos": 20886508316160.0, + "grad_norm": 2.538367341661163, + "language_loss": 0.79631573, + "learning_rate": 3.899484457098528e-06, + "loss": 0.81867594, + "num_input_tokens_seen": 46121145, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.203125, + "step": 2134, + "time_per_iteration": 2.4610936641693115 + }, + { + "auxiliary_loss_clip": 0.01172893, + "auxiliary_loss_mlp": 0.01045118, + "balance_loss_clip": 1.02393413, + "balance_loss_mlp": 1.05650806, + "epoch": 0.12836314444611455, + "flos": 21397768548480.0, + "grad_norm": 2.033800341734765, + "language_loss": 0.83015293, + "learning_rate": 3.899362506701421e-06, + "loss": 0.85233301, + "num_input_tokens_seen": 46140740, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1640625, + "step": 2135, + "time_per_iteration": 2.4743056297302246 + }, + { + "auxiliary_loss_clip": 0.01165668, + "auxiliary_loss_mlp": 0.01061205, + "balance_loss_clip": 1.03842425, + "balance_loss_mlp": 1.05173945, + "epoch": 0.1284232676987825, + "flos": 13662466773120.0, + "grad_norm": 2.9021762622464853, + "language_loss": 0.77293968, + "learning_rate": 3.899240484280298e-06, + "loss": 0.79520839, + "num_input_tokens_seen": 46156805, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.140625, + "step": 2136, + "time_per_iteration": 2.4412362575531006 + }, + { + "auxiliary_loss_clip": 0.01051729, + "auxiliary_loss_mlp": 0.01001869, + "balance_loss_clip": 0.99983084, + "balance_loss_mlp": 1.01248765, + "epoch": 0.12848339095145048, + "flos": 59994737735040.0, + "grad_norm": 0.8943310105061408, + "language_loss": 0.59115362, + "learning_rate": 3.899118389839785e-06, + "loss": 0.61168963, + "num_input_tokens_seen": 46222085, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.39257812, + "step": 2137, + "time_per_iteration": 3.2407264709472656 + }, + { + "auxiliary_loss_clip": 0.01164926, + "auxiliary_loss_mlp": 0.01052629, + "balance_loss_clip": 1.03207743, + "balance_loss_mlp": 1.04970789, + "epoch": 0.12854351420411844, + "flos": 13881378211200.0, + "grad_norm": 2.4694787743163404, + "language_loss": 0.81923193, + "learning_rate": 3.898996223384512e-06, + "loss": 0.84140748, + "num_input_tokens_seen": 46239970, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.15625, + "step": 2138, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01170897, + "auxiliary_loss_mlp": 0.01055556, + "balance_loss_clip": 1.03207207, + "balance_loss_mlp": 1.05353928, + "epoch": 0.1286036374567864, + "flos": 22637943475200.0, + "grad_norm": 2.804990264663657, + "language_loss": 0.79418135, + "learning_rate": 3.898873984919113e-06, + "loss": 0.81644583, + "num_input_tokens_seen": 46257740, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.171875, + "step": 2139, + "time_per_iteration": 2.5321907997131348 + }, + { + "auxiliary_loss_clip": 0.01169458, + "auxiliary_loss_mlp": 0.0104552, + "balance_loss_clip": 1.02488446, + "balance_loss_mlp": 1.05315363, + "epoch": 0.12866376070945437, + "flos": 16324775948160.0, + "grad_norm": 2.1742564972583667, + "language_loss": 0.84761363, + "learning_rate": 3.8987516744482215e-06, + "loss": 0.86976337, + "num_input_tokens_seen": 46275445, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1640625, + "step": 2140, + "time_per_iteration": 2.469543933868408 + }, + { + "auxiliary_loss_clip": 0.01164368, + "auxiliary_loss_mlp": 0.01045521, + "balance_loss_clip": 1.02524316, + "balance_loss_mlp": 1.05079114, + "epoch": 0.12872388396212234, + "flos": 11874546374400.0, + "grad_norm": 2.376703775404894, + "language_loss": 0.85850012, + "learning_rate": 3.898629291976476e-06, + "loss": 0.88059902, + "num_input_tokens_seen": 46291710, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1328125, + "step": 2141, + "time_per_iteration": 2.473205327987671 + }, + { + "auxiliary_loss_clip": 0.0116884, + "auxiliary_loss_mlp": 0.01049008, + "balance_loss_clip": 1.0278126, + "balance_loss_mlp": 1.05059922, + "epoch": 0.12878400721479033, + "flos": 28366700722560.0, + "grad_norm": 3.411777854813752, + "language_loss": 0.68245387, + "learning_rate": 3.898506837508518e-06, + "loss": 0.7046324, + "num_input_tokens_seen": 46311335, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1796875, + "step": 2142, + "time_per_iteration": 2.5327556133270264 + }, + { + "auxiliary_loss_clip": 0.01171992, + "auxiliary_loss_mlp": 0.01048809, + "balance_loss_clip": 1.02702951, + "balance_loss_mlp": 1.05430341, + "epoch": 0.1288441304674583, + "flos": 25885632597120.0, + "grad_norm": 2.0295098459565692, + "language_loss": 0.82883704, + "learning_rate": 3.89838431104899e-06, + "loss": 0.85104507, + "num_input_tokens_seen": 46330985, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2143, + "time_per_iteration": 4.014873743057251 + }, + { + "auxiliary_loss_clip": 0.01171398, + "auxiliary_loss_mlp": 0.01053828, + "balance_loss_clip": 1.03262091, + "balance_loss_mlp": 1.05572712, + "epoch": 0.12890425372012626, + "flos": 20813789232000.0, + "grad_norm": 1.7367706894947552, + "language_loss": 0.81788546, + "learning_rate": 3.898261712602539e-06, + "loss": 0.84013772, + "num_input_tokens_seen": 46351295, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.15625, + "step": 2144, + "time_per_iteration": 4.002255439758301 + }, + { + "auxiliary_loss_clip": 0.0116509, + "auxiliary_loss_mlp": 0.0105384, + "balance_loss_clip": 1.03108335, + "balance_loss_mlp": 1.04864693, + "epoch": 0.12896437697279423, + "flos": 22565870835840.0, + "grad_norm": 3.8817809862500727, + "language_loss": 0.78257203, + "learning_rate": 3.898139042173813e-06, + "loss": 0.80476135, + "num_input_tokens_seen": 46368600, + "router_z_loss_clip": 0.22753906, + "router_z_loss_mlp": 1.1640625, + "step": 2145, + "time_per_iteration": 2.4952287673950195 + }, + { + "auxiliary_loss_clip": 0.01167211, + "auxiliary_loss_mlp": 0.01049919, + "balance_loss_clip": 1.02825832, + "balance_loss_mlp": 1.05031526, + "epoch": 0.1290245002254622, + "flos": 17493776075520.0, + "grad_norm": 2.1659704609946897, + "language_loss": 0.82622325, + "learning_rate": 3.898016299767465e-06, + "loss": 0.84839463, + "num_input_tokens_seen": 46387370, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.171875, + "step": 2146, + "time_per_iteration": 2.4898681640625 + }, + { + "auxiliary_loss_clip": 0.01165601, + "auxiliary_loss_mlp": 0.01051615, + "balance_loss_clip": 1.02959681, + "balance_loss_mlp": 1.05129158, + "epoch": 0.12908462347813016, + "flos": 36315957859200.0, + "grad_norm": 2.717320122986492, + "language_loss": 0.70446974, + "learning_rate": 3.897893485388149e-06, + "loss": 0.72664189, + "num_input_tokens_seen": 46409570, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.140625, + "step": 2147, + "time_per_iteration": 2.5964484214782715 + }, + { + "auxiliary_loss_clip": 0.01165989, + "auxiliary_loss_mlp": 0.01051149, + "balance_loss_clip": 1.03069305, + "balance_loss_mlp": 1.05166912, + "epoch": 0.12914474673079815, + "flos": 22528703237760.0, + "grad_norm": 2.443887417123452, + "language_loss": 0.71685153, + "learning_rate": 3.897770599040521e-06, + "loss": 0.73902297, + "num_input_tokens_seen": 46429320, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.140625, + "step": 2148, + "time_per_iteration": 2.4910573959350586 + }, + { + "auxiliary_loss_clip": 0.01165944, + "auxiliary_loss_mlp": 0.01046939, + "balance_loss_clip": 1.02681684, + "balance_loss_mlp": 1.05413008, + "epoch": 0.12920486998346611, + "flos": 21471888263040.0, + "grad_norm": 1.666574129953403, + "language_loss": 0.79379606, + "learning_rate": 3.897647640729242e-06, + "loss": 0.81592482, + "num_input_tokens_seen": 46450155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1171875, + "step": 2149, + "time_per_iteration": 2.495443820953369 + }, + { + "auxiliary_loss_clip": 0.01167493, + "auxiliary_loss_mlp": 0.01046346, + "balance_loss_clip": 1.02455473, + "balance_loss_mlp": 1.05306077, + "epoch": 0.12926499323613408, + "flos": 27308556944640.0, + "grad_norm": 2.1379132369478313, + "language_loss": 0.76475441, + "learning_rate": 3.897524610458975e-06, + "loss": 0.78689277, + "num_input_tokens_seen": 46470280, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.140625, + "step": 2150, + "time_per_iteration": 2.524395704269409 + }, + { + "auxiliary_loss_clip": 0.01166143, + "auxiliary_loss_mlp": 0.0105244, + "balance_loss_clip": 1.03124499, + "balance_loss_mlp": 1.05094671, + "epoch": 0.12932511648880204, + "flos": 22091131756800.0, + "grad_norm": 2.417935370690141, + "language_loss": 0.70735669, + "learning_rate": 3.8974015082343835e-06, + "loss": 0.72954249, + "num_input_tokens_seen": 46487605, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.1484375, + "step": 2151, + "time_per_iteration": 2.5213184356689453 + }, + { + "auxiliary_loss_clip": 0.01165721, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02502, + "balance_loss_mlp": 1.05457592, + "epoch": 0.12938523974147, + "flos": 20302780394880.0, + "grad_norm": 1.9866869590783298, + "language_loss": 0.84050369, + "learning_rate": 3.897278334060137e-06, + "loss": 0.86260849, + "num_input_tokens_seen": 46505100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2152, + "time_per_iteration": 2.4494428634643555 + }, + { + "auxiliary_loss_clip": 0.01167192, + "auxiliary_loss_mlp": 0.01057934, + "balance_loss_clip": 1.03689384, + "balance_loss_mlp": 1.05128813, + "epoch": 0.12944536299413797, + "flos": 19499961467520.0, + "grad_norm": 2.226463520109079, + "language_loss": 0.78646791, + "learning_rate": 3.897155087940906e-06, + "loss": 0.80871922, + "num_input_tokens_seen": 46524020, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.15625, + "step": 2153, + "time_per_iteration": 2.46975040435791 + }, + { + "auxiliary_loss_clip": 0.01163518, + "auxiliary_loss_mlp": 0.01052865, + "balance_loss_clip": 1.03220654, + "balance_loss_mlp": 1.05069268, + "epoch": 0.12950548624680594, + "flos": 27707919333120.0, + "grad_norm": 2.482522823334948, + "language_loss": 0.80135351, + "learning_rate": 3.897031769881364e-06, + "loss": 0.82351738, + "num_input_tokens_seen": 46544640, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.125, + "step": 2154, + "time_per_iteration": 2.558769941329956 + }, + { + "auxiliary_loss_clip": 0.01170487, + "auxiliary_loss_mlp": 0.01051039, + "balance_loss_clip": 1.02998686, + "balance_loss_mlp": 1.05522227, + "epoch": 0.12956560949947393, + "flos": 17565740974080.0, + "grad_norm": 2.0988715261553774, + "language_loss": 0.83128881, + "learning_rate": 3.896908379886188e-06, + "loss": 0.85350406, + "num_input_tokens_seen": 46561395, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1484375, + "step": 2155, + "time_per_iteration": 2.476299524307251 + }, + { + "auxiliary_loss_clip": 0.01164922, + "auxiliary_loss_mlp": 0.01050282, + "balance_loss_clip": 1.02961075, + "balance_loss_mlp": 1.05010283, + "epoch": 0.1296257327521419, + "flos": 20740711011840.0, + "grad_norm": 2.842594732542889, + "language_loss": 0.76062953, + "learning_rate": 3.896784917960055e-06, + "loss": 0.7827816, + "num_input_tokens_seen": 46579395, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2156, + "time_per_iteration": 2.6024632453918457 + }, + { + "auxiliary_loss_clip": 0.01161875, + "auxiliary_loss_mlp": 0.01051596, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.05121815, + "epoch": 0.12968585600480986, + "flos": 16395735265920.0, + "grad_norm": 1.9934077258859366, + "language_loss": 0.86546719, + "learning_rate": 3.896661384107648e-06, + "loss": 0.88760191, + "num_input_tokens_seen": 46597090, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.109375, + "step": 2157, + "time_per_iteration": 2.452826976776123 + }, + { + "auxiliary_loss_clip": 0.01164359, + "auxiliary_loss_mlp": 0.01059125, + "balance_loss_clip": 1.03745282, + "balance_loss_mlp": 1.04796743, + "epoch": 0.12974597925747783, + "flos": 28329533124480.0, + "grad_norm": 2.339899004847696, + "language_loss": 0.80590808, + "learning_rate": 3.896537778333651e-06, + "loss": 0.82814288, + "num_input_tokens_seen": 46617355, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1640625, + "step": 2158, + "time_per_iteration": 2.5332443714141846 + }, + { + "auxiliary_loss_clip": 0.01169288, + "auxiliary_loss_mlp": 0.01055971, + "balance_loss_clip": 1.03510916, + "balance_loss_mlp": 1.05294585, + "epoch": 0.1298061025101458, + "flos": 9683025782400.0, + "grad_norm": 2.254282600322574, + "language_loss": 0.74603379, + "learning_rate": 3.896414100642752e-06, + "loss": 0.76828635, + "num_input_tokens_seen": 46633130, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1640625, + "step": 2159, + "time_per_iteration": 2.469038963317871 + }, + { + "auxiliary_loss_clip": 0.01158286, + "auxiliary_loss_mlp": 0.0105286, + "balance_loss_clip": 1.0323323, + "balance_loss_mlp": 1.04777908, + "epoch": 0.12986622576281376, + "flos": 27709535445120.0, + "grad_norm": 2.1260113568932746, + "language_loss": 0.8227706, + "learning_rate": 3.89629035103964e-06, + "loss": 0.84488213, + "num_input_tokens_seen": 46650575, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2160, + "time_per_iteration": 2.516723155975342 + }, + { + "auxiliary_loss_clip": 0.01159917, + "auxiliary_loss_mlp": 0.01044215, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.05318654, + "epoch": 0.12992634901548175, + "flos": 18802719590400.0, + "grad_norm": 1.6308358458278915, + "language_loss": 0.81877828, + "learning_rate": 3.896166529529008e-06, + "loss": 0.8408196, + "num_input_tokens_seen": 46668780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2161, + "time_per_iteration": 2.4677131175994873 + }, + { + "auxiliary_loss_clip": 0.01161774, + "auxiliary_loss_mlp": 0.01056265, + "balance_loss_clip": 1.03479493, + "balance_loss_mlp": 1.05035043, + "epoch": 0.12998647226814972, + "flos": 29127575543040.0, + "grad_norm": 2.2782308625037686, + "language_loss": 0.82592809, + "learning_rate": 3.896042636115551e-06, + "loss": 0.84810847, + "num_input_tokens_seen": 46687550, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2162, + "time_per_iteration": 2.5702993869781494 + }, + { + "auxiliary_loss_clip": 0.01164237, + "auxiliary_loss_mlp": 0.01054699, + "balance_loss_clip": 1.03454113, + "balance_loss_mlp": 1.04993796, + "epoch": 0.13004659552081768, + "flos": 19573686132480.0, + "grad_norm": 2.619296712638915, + "language_loss": 0.72762972, + "learning_rate": 3.895918670803968e-06, + "loss": 0.7498191, + "num_input_tokens_seen": 46706730, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.140625, + "step": 2163, + "time_per_iteration": 2.4531478881835938 + }, + { + "auxiliary_loss_clip": 0.0116607, + "auxiliary_loss_mlp": 0.01053845, + "balance_loss_clip": 1.03183889, + "balance_loss_mlp": 1.05107188, + "epoch": 0.13010671877348565, + "flos": 22490709626880.0, + "grad_norm": 2.0773433264348435, + "language_loss": 0.81498116, + "learning_rate": 3.895794633598958e-06, + "loss": 0.83718032, + "num_input_tokens_seen": 46724250, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1484375, + "step": 2164, + "time_per_iteration": 2.497072458267212 + }, + { + "auxiliary_loss_clip": 0.01164255, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02381766, + "balance_loss_mlp": 1.05107093, + "epoch": 0.1301668420261536, + "flos": 23878226142720.0, + "grad_norm": 2.2040156749440523, + "language_loss": 0.72564822, + "learning_rate": 3.8956705245052256e-06, + "loss": 0.7477203, + "num_input_tokens_seen": 46744105, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.1328125, + "step": 2165, + "time_per_iteration": 2.515026807785034 + }, + { + "auxiliary_loss_clip": 0.01167589, + "auxiliary_loss_mlp": 0.01047652, + "balance_loss_clip": 1.02599204, + "balance_loss_mlp": 1.05286038, + "epoch": 0.13022696527882158, + "flos": 23150065633920.0, + "grad_norm": 2.8786436091142913, + "language_loss": 0.74697578, + "learning_rate": 3.8955463435274765e-06, + "loss": 0.76912814, + "num_input_tokens_seen": 46764250, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.1484375, + "step": 2166, + "time_per_iteration": 2.5301709175109863 + }, + { + "auxiliary_loss_clip": 0.01165477, + "auxiliary_loss_mlp": 0.01047606, + "balance_loss_clip": 1.02751899, + "balance_loss_mlp": 1.05156064, + "epoch": 0.13028708853148954, + "flos": 26908548111360.0, + "grad_norm": 1.5708346768068926, + "language_loss": 0.83053899, + "learning_rate": 3.895422090670421e-06, + "loss": 0.85266984, + "num_input_tokens_seen": 46786865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.140625, + "step": 2167, + "time_per_iteration": 2.632035732269287 + }, + { + "auxiliary_loss_clip": 0.01163335, + "auxiliary_loss_mlp": 0.01060394, + "balance_loss_clip": 1.03931761, + "balance_loss_mlp": 1.05201721, + "epoch": 0.13034721178415754, + "flos": 21251468453760.0, + "grad_norm": 1.9158171210349437, + "language_loss": 0.83286303, + "learning_rate": 3.89529776593877e-06, + "loss": 0.85510027, + "num_input_tokens_seen": 46807030, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2168, + "time_per_iteration": 2.4766387939453125 + }, + { + "auxiliary_loss_clip": 0.0116626, + "auxiliary_loss_mlp": 0.01052307, + "balance_loss_clip": 1.03075409, + "balance_loss_mlp": 1.05258656, + "epoch": 0.1304073350368255, + "flos": 18767239931520.0, + "grad_norm": 2.304013454801214, + "language_loss": 0.80027354, + "learning_rate": 3.8951733693372375e-06, + "loss": 0.82245922, + "num_input_tokens_seen": 46826280, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.140625, + "step": 2169, + "time_per_iteration": 2.5185413360595703 + }, + { + "auxiliary_loss_clip": 0.01166949, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02329922, + "balance_loss_mlp": 1.05451608, + "epoch": 0.13046745828949347, + "flos": 28364653647360.0, + "grad_norm": 4.565704621626811, + "language_loss": 0.66456163, + "learning_rate": 3.8950489008705406e-06, + "loss": 0.68668246, + "num_input_tokens_seen": 46846505, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.125, + "step": 2170, + "time_per_iteration": 2.5556788444519043 + }, + { + "auxiliary_loss_clip": 0.01165384, + "auxiliary_loss_mlp": 0.01044241, + "balance_loss_clip": 1.02397573, + "balance_loss_mlp": 1.05294132, + "epoch": 0.13052758154216143, + "flos": 29605044055680.0, + "grad_norm": 1.848772151746763, + "language_loss": 0.66935396, + "learning_rate": 3.8949243605434e-06, + "loss": 0.69145024, + "num_input_tokens_seen": 46867380, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.125, + "step": 2171, + "time_per_iteration": 2.553422451019287 + }, + { + "auxiliary_loss_clip": 0.01164709, + "auxiliary_loss_mlp": 0.01048224, + "balance_loss_clip": 1.02649236, + "balance_loss_mlp": 1.05211711, + "epoch": 0.1305877047948294, + "flos": 19390864884480.0, + "grad_norm": 1.9479804069383955, + "language_loss": 0.71952963, + "learning_rate": 3.894799748360537e-06, + "loss": 0.74165899, + "num_input_tokens_seen": 46886810, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2172, + "time_per_iteration": 2.4801840782165527 + }, + { + "auxiliary_loss_clip": 0.01161466, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02508521, + "balance_loss_mlp": 1.05435848, + "epoch": 0.13064782804749736, + "flos": 16873527000960.0, + "grad_norm": 1.8616776845407013, + "language_loss": 0.75547618, + "learning_rate": 3.894675064326678e-06, + "loss": 0.77752787, + "num_input_tokens_seen": 46905620, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0703125, + "step": 2173, + "time_per_iteration": 2.4639194011688232 + }, + { + "auxiliary_loss_clip": 0.01165867, + "auxiliary_loss_mlp": 0.01055263, + "balance_loss_clip": 1.03406715, + "balance_loss_mlp": 1.05319107, + "epoch": 0.13070795130016533, + "flos": 24499085748480.0, + "grad_norm": 2.777389952877741, + "language_loss": 0.70484382, + "learning_rate": 3.894550308446551e-06, + "loss": 0.72705513, + "num_input_tokens_seen": 46925120, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.125, + "step": 2174, + "time_per_iteration": 2.4914908409118652 + }, + { + "auxiliary_loss_clip": 0.01055025, + "auxiliary_loss_mlp": 0.01011158, + "balance_loss_clip": 1.0087378, + "balance_loss_mlp": 1.0165, + "epoch": 0.13076807455283332, + "flos": 71054505953280.0, + "grad_norm": 0.8022263951171452, + "language_loss": 0.59071571, + "learning_rate": 3.894425480724886e-06, + "loss": 0.61137754, + "num_input_tokens_seen": 46988195, + "router_z_loss_clip": 0.02416992, + "router_z_loss_mlp": 0.38671875, + "step": 2175, + "time_per_iteration": 3.244633913040161 + }, + { + "auxiliary_loss_clip": 0.01164931, + "auxiliary_loss_mlp": 0.01051735, + "balance_loss_clip": 1.03214908, + "balance_loss_mlp": 1.05474329, + "epoch": 0.13082819780550128, + "flos": 20264499475200.0, + "grad_norm": 2.247504257537708, + "language_loss": 0.79946023, + "learning_rate": 3.894300581166417e-06, + "loss": 0.8216269, + "num_input_tokens_seen": 47004720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.1015625, + "step": 2176, + "time_per_iteration": 2.439883232116699 + }, + { + "auxiliary_loss_clip": 0.01163907, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.02806199, + "balance_loss_mlp": 1.05234194, + "epoch": 0.13088832105816925, + "flos": 34203441231360.0, + "grad_norm": 1.8562517641565577, + "language_loss": 0.74595284, + "learning_rate": 3.894175609775881e-06, + "loss": 0.76809454, + "num_input_tokens_seen": 47024255, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2177, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01163674, + "auxiliary_loss_mlp": 0.01046693, + "balance_loss_clip": 1.024472, + "balance_loss_mlp": 1.05222929, + "epoch": 0.13094844431083721, + "flos": 17894970057600.0, + "grad_norm": 2.128567307625778, + "language_loss": 0.81855309, + "learning_rate": 3.894050566558015e-06, + "loss": 0.84065676, + "num_input_tokens_seen": 47042465, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.1171875, + "step": 2178, + "time_per_iteration": 2.458812713623047 + }, + { + "auxiliary_loss_clip": 0.01166111, + "auxiliary_loss_mlp": 0.01047933, + "balance_loss_clip": 1.02695179, + "balance_loss_mlp": 1.05466795, + "epoch": 0.13100856756350518, + "flos": 17311313963520.0, + "grad_norm": 2.66972533149016, + "language_loss": 0.74942935, + "learning_rate": 3.893925451517562e-06, + "loss": 0.77156973, + "num_input_tokens_seen": 47060370, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.109375, + "step": 2179, + "time_per_iteration": 2.4679782390594482 + }, + { + "auxiliary_loss_clip": 0.01161603, + "auxiliary_loss_mlp": 0.0105054, + "balance_loss_clip": 1.03079903, + "balance_loss_mlp": 1.05280709, + "epoch": 0.13106869081617314, + "flos": 22200551562240.0, + "grad_norm": 2.0560779031919636, + "language_loss": 0.84319234, + "learning_rate": 3.893800264659266e-06, + "loss": 0.86531377, + "num_input_tokens_seen": 47081415, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0859375, + "step": 2180, + "time_per_iteration": 2.567873477935791 + }, + { + "auxiliary_loss_clip": 0.01166279, + "auxiliary_loss_mlp": 0.01054601, + "balance_loss_clip": 1.03483582, + "balance_loss_mlp": 1.05700839, + "epoch": 0.13112881406884114, + "flos": 21763123735680.0, + "grad_norm": 2.214126283525484, + "language_loss": 0.8987745, + "learning_rate": 3.8936750059878746e-06, + "loss": 0.92098325, + "num_input_tokens_seen": 47099860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2181, + "time_per_iteration": 2.4802486896514893 + }, + { + "auxiliary_loss_clip": 0.01166281, + "auxiliary_loss_mlp": 0.01043829, + "balance_loss_clip": 1.02438569, + "balance_loss_mlp": 1.0557189, + "epoch": 0.1311889373215091, + "flos": 23331091201920.0, + "grad_norm": 1.8993602522657917, + "language_loss": 0.68657839, + "learning_rate": 3.893549675508137e-06, + "loss": 0.70867944, + "num_input_tokens_seen": 47118540, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.109375, + "step": 2182, + "time_per_iteration": 2.460148572921753 + }, + { + "auxiliary_loss_clip": 0.01167141, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02745128, + "balance_loss_mlp": 1.05504203, + "epoch": 0.13124906057417707, + "flos": 21467363149440.0, + "grad_norm": 2.6442759836393277, + "language_loss": 0.78435183, + "learning_rate": 3.893424273224806e-06, + "loss": 0.80650467, + "num_input_tokens_seen": 47136710, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.125, + "step": 2183, + "time_per_iteration": 2.5462143421173096 + }, + { + "auxiliary_loss_clip": 0.01162472, + "auxiliary_loss_mlp": 0.01043905, + "balance_loss_clip": 1.02375841, + "balance_loss_mlp": 1.05238128, + "epoch": 0.13130918382684503, + "flos": 23255319461760.0, + "grad_norm": 2.788927255894662, + "language_loss": 0.85543215, + "learning_rate": 3.893298799142636e-06, + "loss": 0.87749588, + "num_input_tokens_seen": 47157155, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2184, + "time_per_iteration": 3.8904993534088135 + }, + { + "auxiliary_loss_clip": 0.01165934, + "auxiliary_loss_mlp": 0.01047649, + "balance_loss_clip": 1.0265255, + "balance_loss_mlp": 1.0529201, + "epoch": 0.131369307079513, + "flos": 20850274471680.0, + "grad_norm": 2.505672435211917, + "language_loss": 0.82206696, + "learning_rate": 3.893173253266387e-06, + "loss": 0.84420282, + "num_input_tokens_seen": 47176820, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1328125, + "step": 2185, + "time_per_iteration": 5.3855485916137695 + }, + { + "auxiliary_loss_clip": 0.01168066, + "auxiliary_loss_mlp": 0.0105393, + "balance_loss_clip": 1.03323543, + "balance_loss_mlp": 1.05440092, + "epoch": 0.13142943033218096, + "flos": 17858341163520.0, + "grad_norm": 2.0294565364346235, + "language_loss": 0.73037684, + "learning_rate": 3.893047635600818e-06, + "loss": 0.7525968, + "num_input_tokens_seen": 47195855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1328125, + "step": 2186, + "time_per_iteration": 2.4839119911193848 + }, + { + "auxiliary_loss_clip": 0.01165928, + "auxiliary_loss_mlp": 0.01048235, + "balance_loss_clip": 1.02601433, + "balance_loss_mlp": 1.05449164, + "epoch": 0.13148955358484893, + "flos": 20996035862400.0, + "grad_norm": 2.0525608711513614, + "language_loss": 0.80174023, + "learning_rate": 3.892921946150693e-06, + "loss": 0.82388186, + "num_input_tokens_seen": 47214535, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.109375, + "step": 2187, + "time_per_iteration": 2.463906764984131 + }, + { + "auxiliary_loss_clip": 0.01053378, + "auxiliary_loss_mlp": 0.01005839, + "balance_loss_clip": 1.00344312, + "balance_loss_mlp": 1.01508641, + "epoch": 0.13154967683751692, + "flos": 70172467580160.0, + "grad_norm": 0.8435449169341035, + "language_loss": 0.58977342, + "learning_rate": 3.892796184920778e-06, + "loss": 0.61036563, + "num_input_tokens_seen": 47270300, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.3828125, + "step": 2188, + "time_per_iteration": 3.1052041053771973 + }, + { + "auxiliary_loss_clip": 0.01169813, + "auxiliary_loss_mlp": 0.01050803, + "balance_loss_clip": 1.03037024, + "balance_loss_mlp": 1.05918622, + "epoch": 0.1316098000901849, + "flos": 20376145923840.0, + "grad_norm": 2.1443848583942846, + "language_loss": 0.74199927, + "learning_rate": 3.892670351915842e-06, + "loss": 0.76420546, + "num_input_tokens_seen": 47290720, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2189, + "time_per_iteration": 2.5137264728546143 + }, + { + "auxiliary_loss_clip": 0.01166605, + "auxiliary_loss_mlp": 0.01049022, + "balance_loss_clip": 1.02894759, + "balance_loss_mlp": 1.05678558, + "epoch": 0.13166992334285285, + "flos": 23221132692480.0, + "grad_norm": 1.7642431940848833, + "language_loss": 0.72561657, + "learning_rate": 3.892544447140657e-06, + "loss": 0.74777287, + "num_input_tokens_seen": 47311820, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.1015625, + "step": 2190, + "time_per_iteration": 2.5053412914276123 + }, + { + "auxiliary_loss_clip": 0.01169095, + "auxiliary_loss_mlp": 0.01051343, + "balance_loss_clip": 1.03094649, + "balance_loss_mlp": 1.05706906, + "epoch": 0.13173004659552082, + "flos": 23330947547520.0, + "grad_norm": 8.700182749243472, + "language_loss": 0.74395585, + "learning_rate": 3.892418470599996e-06, + "loss": 0.76616025, + "num_input_tokens_seen": 47331605, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1171875, + "step": 2191, + "time_per_iteration": 2.507687568664551 + }, + { + "auxiliary_loss_clip": 0.01168877, + "auxiliary_loss_mlp": 0.01049305, + "balance_loss_clip": 1.02841949, + "balance_loss_mlp": 1.05689156, + "epoch": 0.13179016984818878, + "flos": 21251504367360.0, + "grad_norm": 2.0250128968483403, + "language_loss": 0.79286075, + "learning_rate": 3.892292422298637e-06, + "loss": 0.8150425, + "num_input_tokens_seen": 47350455, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1171875, + "step": 2192, + "time_per_iteration": 2.5068893432617188 + }, + { + "auxiliary_loss_clip": 0.01168449, + "auxiliary_loss_mlp": 0.01053422, + "balance_loss_clip": 1.03290629, + "balance_loss_mlp": 1.05564141, + "epoch": 0.13185029310085675, + "flos": 17778690754560.0, + "grad_norm": 1.9285179647135495, + "language_loss": 0.84827602, + "learning_rate": 3.892166302241361e-06, + "loss": 0.87049472, + "num_input_tokens_seen": 47368225, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.125, + "step": 2193, + "time_per_iteration": 2.456409215927124 + }, + { + "auxiliary_loss_clip": 0.0105585, + "auxiliary_loss_mlp": 0.01002145, + "balance_loss_clip": 0.99976075, + "balance_loss_mlp": 1.0179081, + "epoch": 0.1319104163535247, + "flos": 69851785933440.0, + "grad_norm": 0.7727203010194038, + "language_loss": 0.54049635, + "learning_rate": 3.8920401104329475e-06, + "loss": 0.56107628, + "num_input_tokens_seen": 47427125, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.37890625, + "step": 2194, + "time_per_iteration": 3.0569794178009033 + }, + { + "auxiliary_loss_clip": 0.01165932, + "auxiliary_loss_mlp": 0.01046302, + "balance_loss_clip": 1.02566671, + "balance_loss_mlp": 1.05514359, + "epoch": 0.1319705396061927, + "flos": 25193095401600.0, + "grad_norm": 1.7688784093808256, + "language_loss": 0.72086227, + "learning_rate": 3.891913846878185e-06, + "loss": 0.74298465, + "num_input_tokens_seen": 47450275, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.109375, + "step": 2195, + "time_per_iteration": 2.527435541152954 + }, + { + "auxiliary_loss_clip": 0.01173804, + "auxiliary_loss_mlp": 0.01045041, + "balance_loss_clip": 1.02310634, + "balance_loss_mlp": 1.05663633, + "epoch": 0.13203066285886067, + "flos": 20740459616640.0, + "grad_norm": 1.7664998702658374, + "language_loss": 0.78195536, + "learning_rate": 3.891787511581859e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 47469155, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.171875, + "step": 2196, + "time_per_iteration": 2.4685165882110596 + }, + { + "auxiliary_loss_clip": 0.01169552, + "auxiliary_loss_mlp": 0.01046979, + "balance_loss_clip": 1.02714252, + "balance_loss_mlp": 1.05638218, + "epoch": 0.13209078611152864, + "flos": 22054395121920.0, + "grad_norm": 2.1663119445052295, + "language_loss": 0.74861938, + "learning_rate": 3.89166110454876e-06, + "loss": 0.77078474, + "num_input_tokens_seen": 47488405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1328125, + "step": 2197, + "time_per_iteration": 2.489504814147949 + }, + { + "auxiliary_loss_clip": 0.01170611, + "auxiliary_loss_mlp": 0.01044966, + "balance_loss_clip": 1.02430725, + "balance_loss_mlp": 1.05543399, + "epoch": 0.1321509093641966, + "flos": 16284950743680.0, + "grad_norm": 2.4378795089069674, + "language_loss": 0.8011694, + "learning_rate": 3.891534625783685e-06, + "loss": 0.82332516, + "num_input_tokens_seen": 47505650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1484375, + "step": 2198, + "time_per_iteration": 2.437718391418457 + }, + { + "auxiliary_loss_clip": 0.0116676, + "auxiliary_loss_mlp": 0.01061419, + "balance_loss_clip": 1.04173732, + "balance_loss_mlp": 1.05483699, + "epoch": 0.13221103261686457, + "flos": 16983018633600.0, + "grad_norm": 2.4514815632850038, + "language_loss": 0.82552117, + "learning_rate": 3.891408075291425e-06, + "loss": 0.847803, + "num_input_tokens_seen": 47521540, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2199, + "time_per_iteration": 2.47356915473938 + }, + { + "auxiliary_loss_clip": 0.01167277, + "auxiliary_loss_mlp": 0.01047633, + "balance_loss_clip": 1.02724838, + "balance_loss_mlp": 1.05458844, + "epoch": 0.13227115586953253, + "flos": 34233605677440.0, + "grad_norm": 2.465688895758548, + "language_loss": 0.68963099, + "learning_rate": 3.8912814530767826e-06, + "loss": 0.71178007, + "num_input_tokens_seen": 47543625, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2200, + "time_per_iteration": 2.5828843116760254 + }, + { + "auxiliary_loss_clip": 0.01166491, + "auxiliary_loss_mlp": 0.01055533, + "balance_loss_clip": 1.03420663, + "balance_loss_mlp": 1.05397916, + "epoch": 0.13233127912220052, + "flos": 20704656735360.0, + "grad_norm": 2.591612522060186, + "language_loss": 0.84600091, + "learning_rate": 3.891154759144557e-06, + "loss": 0.86822116, + "num_input_tokens_seen": 47563740, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.125, + "step": 2201, + "time_per_iteration": 2.5546202659606934 + }, + { + "auxiliary_loss_clip": 0.01168797, + "auxiliary_loss_mlp": 0.01054072, + "balance_loss_clip": 1.03315115, + "balance_loss_mlp": 1.05466592, + "epoch": 0.1323914023748685, + "flos": 25805048434560.0, + "grad_norm": 1.901870031688447, + "language_loss": 0.86978126, + "learning_rate": 3.891027993499554e-06, + "loss": 0.89200991, + "num_input_tokens_seen": 47582655, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.140625, + "step": 2202, + "time_per_iteration": 2.509300470352173 + }, + { + "auxiliary_loss_clip": 0.01164666, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02364576, + "balance_loss_mlp": 1.05389142, + "epoch": 0.13245152562753645, + "flos": 21251540280960.0, + "grad_norm": 2.3614014237187084, + "language_loss": 0.72746712, + "learning_rate": 3.89090115614658e-06, + "loss": 0.74954367, + "num_input_tokens_seen": 47600875, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.109375, + "step": 2203, + "time_per_iteration": 2.5267388820648193 + }, + { + "auxiliary_loss_clip": 0.01167891, + "auxiliary_loss_mlp": 0.0105678, + "balance_loss_clip": 1.03781366, + "balance_loss_mlp": 1.05453348, + "epoch": 0.13251164888020442, + "flos": 26610955931520.0, + "grad_norm": 2.5436302639516, + "language_loss": 0.73248756, + "learning_rate": 3.890774247090444e-06, + "loss": 0.75473428, + "num_input_tokens_seen": 47619250, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1328125, + "step": 2204, + "time_per_iteration": 2.5298051834106445 + }, + { + "auxiliary_loss_clip": 0.01168712, + "auxiliary_loss_mlp": 0.01053403, + "balance_loss_clip": 1.03211212, + "balance_loss_mlp": 1.05558085, + "epoch": 0.13257177213287238, + "flos": 29826541272960.0, + "grad_norm": 1.7540271848273767, + "language_loss": 0.78627133, + "learning_rate": 3.89064726633596e-06, + "loss": 0.80849254, + "num_input_tokens_seen": 47639445, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.1328125, + "step": 2205, + "time_per_iteration": 2.5343189239501953 + }, + { + "auxiliary_loss_clip": 0.01166449, + "auxiliary_loss_mlp": 0.01053788, + "balance_loss_clip": 1.033391, + "balance_loss_mlp": 1.05560231, + "epoch": 0.13263189538554035, + "flos": 21288456483840.0, + "grad_norm": 2.234297854715259, + "language_loss": 0.78748876, + "learning_rate": 3.890520213887941e-06, + "loss": 0.80969107, + "num_input_tokens_seen": 47658740, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.109375, + "step": 2206, + "time_per_iteration": 2.473229169845581 + }, + { + "auxiliary_loss_clip": 0.01170953, + "auxiliary_loss_mlp": 0.01046503, + "balance_loss_clip": 1.02750087, + "balance_loss_mlp": 1.05758011, + "epoch": 0.13269201863820831, + "flos": 16874101618560.0, + "grad_norm": 2.3028539815574494, + "language_loss": 0.73993444, + "learning_rate": 3.890393089751208e-06, + "loss": 0.76210898, + "num_input_tokens_seen": 47676880, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.125, + "step": 2207, + "time_per_iteration": 2.479421854019165 + }, + { + "auxiliary_loss_clip": 0.01160402, + "auxiliary_loss_mlp": 0.0104899, + "balance_loss_clip": 1.02822387, + "balance_loss_mlp": 1.05323017, + "epoch": 0.1327521418908763, + "flos": 23768914078080.0, + "grad_norm": 2.4105539478543454, + "language_loss": 0.84151787, + "learning_rate": 3.890265893930578e-06, + "loss": 0.86361182, + "num_input_tokens_seen": 47696635, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0703125, + "step": 2208, + "time_per_iteration": 2.501969337463379 + }, + { + "auxiliary_loss_clip": 0.01161894, + "auxiliary_loss_mlp": 0.01055633, + "balance_loss_clip": 1.03621435, + "balance_loss_mlp": 1.05553222, + "epoch": 0.13281226514354427, + "flos": 26505594362880.0, + "grad_norm": 1.9362156368998853, + "language_loss": 0.85323346, + "learning_rate": 3.890138626430876e-06, + "loss": 0.87540877, + "num_input_tokens_seen": 47717760, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2209, + "time_per_iteration": 2.509761333465576 + }, + { + "auxiliary_loss_clip": 0.01166975, + "auxiliary_loss_mlp": 0.01049621, + "balance_loss_clip": 1.03039217, + "balance_loss_mlp": 1.05628705, + "epoch": 0.13287238839621224, + "flos": 24498762526080.0, + "grad_norm": 2.055387861012722, + "language_loss": 0.81545013, + "learning_rate": 3.890011287256929e-06, + "loss": 0.83761609, + "num_input_tokens_seen": 47737685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2210, + "time_per_iteration": 2.4920527935028076 + }, + { + "auxiliary_loss_clip": 0.0106048, + "auxiliary_loss_mlp": 0.01003994, + "balance_loss_clip": 1.0016222, + "balance_loss_mlp": 1.02205014, + "epoch": 0.1329325116488802, + "flos": 67694344369920.0, + "grad_norm": 0.7616894664797615, + "language_loss": 0.57984382, + "learning_rate": 3.889883876413563e-06, + "loss": 0.6004886, + "num_input_tokens_seen": 47802415, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3828125, + "step": 2211, + "time_per_iteration": 3.1735260486602783 + }, + { + "auxiliary_loss_clip": 0.01059664, + "auxiliary_loss_mlp": 0.01005439, + "balance_loss_clip": 1.00312614, + "balance_loss_mlp": 1.02081084, + "epoch": 0.13299263490154817, + "flos": 72261894741120.0, + "grad_norm": 0.7970523185699088, + "language_loss": 0.55364317, + "learning_rate": 3.889756393905611e-06, + "loss": 0.57429421, + "num_input_tokens_seen": 47871485, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.38671875, + "step": 2212, + "time_per_iteration": 3.142056465148926 + }, + { + "auxiliary_loss_clip": 0.01170665, + "auxiliary_loss_mlp": 0.01052255, + "balance_loss_clip": 1.03164423, + "balance_loss_mlp": 1.056463, + "epoch": 0.13305275815421613, + "flos": 17931275729280.0, + "grad_norm": 4.2694742121271645, + "language_loss": 0.74779308, + "learning_rate": 3.889628839737908e-06, + "loss": 0.77002227, + "num_input_tokens_seen": 47888315, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.140625, + "step": 2213, + "time_per_iteration": 2.4599013328552246 + }, + { + "auxiliary_loss_clip": 0.0115893, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.02889609, + "balance_loss_mlp": 1.05235839, + "epoch": 0.13311288140688413, + "flos": 22340889999360.0, + "grad_norm": 2.0343460890824927, + "language_loss": 0.79269958, + "learning_rate": 3.889501213915291e-06, + "loss": 0.81476456, + "num_input_tokens_seen": 47906600, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0625, + "step": 2214, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01166328, + "auxiliary_loss_mlp": 0.01051317, + "balance_loss_clip": 1.03062189, + "balance_loss_mlp": 1.05593503, + "epoch": 0.1331730046595521, + "flos": 31868888682240.0, + "grad_norm": 2.0399610331480407, + "language_loss": 0.69410872, + "learning_rate": 3.889373516442597e-06, + "loss": 0.71628523, + "num_input_tokens_seen": 47927630, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2215, + "time_per_iteration": 2.5798754692077637 + }, + { + "auxiliary_loss_clip": 0.01166771, + "auxiliary_loss_mlp": 0.01046808, + "balance_loss_clip": 1.02725816, + "balance_loss_mlp": 1.05576539, + "epoch": 0.13323312791222006, + "flos": 22566589107840.0, + "grad_norm": 2.4518621177772175, + "language_loss": 0.81136751, + "learning_rate": 3.889245747324671e-06, + "loss": 0.83350337, + "num_input_tokens_seen": 47947935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2216, + "time_per_iteration": 2.49601674079895 + }, + { + "auxiliary_loss_clip": 0.01166215, + "auxiliary_loss_mlp": 0.01057297, + "balance_loss_clip": 1.03668606, + "balance_loss_mlp": 1.05610895, + "epoch": 0.13329325116488802, + "flos": 15085319293440.0, + "grad_norm": 3.5729384628186307, + "language_loss": 0.87350845, + "learning_rate": 3.889117906566356e-06, + "loss": 0.89574361, + "num_input_tokens_seen": 47965515, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.1015625, + "step": 2217, + "time_per_iteration": 2.435224771499634 + }, + { + "auxiliary_loss_clip": 0.01165439, + "auxiliary_loss_mlp": 0.01048261, + "balance_loss_clip": 1.02716112, + "balance_loss_mlp": 1.05609739, + "epoch": 0.133353374417556, + "flos": 27453671890560.0, + "grad_norm": 2.6393181601709057, + "language_loss": 0.73460543, + "learning_rate": 3.888989994172501e-06, + "loss": 0.75674248, + "num_input_tokens_seen": 47985675, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2218, + "time_per_iteration": 2.4984188079833984 + }, + { + "auxiliary_loss_clip": 0.01164132, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02401495, + "balance_loss_mlp": 1.05406141, + "epoch": 0.13341349767022395, + "flos": 24094695456000.0, + "grad_norm": 1.803125703936159, + "language_loss": 0.87483871, + "learning_rate": 3.8888620101479565e-06, + "loss": 0.89692807, + "num_input_tokens_seen": 48004985, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2219, + "time_per_iteration": 2.4761111736297607 + }, + { + "auxiliary_loss_clip": 0.01166927, + "auxiliary_loss_mlp": 0.01051114, + "balance_loss_clip": 1.03198123, + "balance_loss_mlp": 1.05804753, + "epoch": 0.13347362092289192, + "flos": 24133335511680.0, + "grad_norm": 1.5604165479120375, + "language_loss": 0.77241862, + "learning_rate": 3.888733954497574e-06, + "loss": 0.79459906, + "num_input_tokens_seen": 48024965, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0859375, + "step": 2220, + "time_per_iteration": 2.5172770023345947 + }, + { + "auxiliary_loss_clip": 0.01158357, + "auxiliary_loss_mlp": 0.01044474, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.05065227, + "epoch": 0.1335337441755599, + "flos": 18436538390400.0, + "grad_norm": 2.752699726256429, + "language_loss": 0.79361391, + "learning_rate": 3.888605827226212e-06, + "loss": 0.81564224, + "num_input_tokens_seen": 48040890, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.078125, + "step": 2221, + "time_per_iteration": 2.4729459285736084 + }, + { + "auxiliary_loss_clip": 0.01056162, + "auxiliary_loss_mlp": 0.01004009, + "balance_loss_clip": 1.00177944, + "balance_loss_mlp": 1.01797867, + "epoch": 0.13359386742822787, + "flos": 50611997652480.0, + "grad_norm": 0.9620212456786271, + "language_loss": 0.6890744, + "learning_rate": 3.8884776283387275e-06, + "loss": 0.70967615, + "num_input_tokens_seen": 48091855, + "router_z_loss_clip": 0.02233887, + "router_z_loss_mlp": 0.3828125, + "step": 2222, + "time_per_iteration": 2.9102694988250732 + }, + { + "auxiliary_loss_clip": 0.011664, + "auxiliary_loss_mlp": 0.01047762, + "balance_loss_clip": 1.02885592, + "balance_loss_mlp": 1.05645049, + "epoch": 0.13365399068089584, + "flos": 22778569221120.0, + "grad_norm": 1.8990549263762904, + "language_loss": 0.66966134, + "learning_rate": 3.888349357839982e-06, + "loss": 0.69180298, + "num_input_tokens_seen": 48111350, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.1015625, + "step": 2223, + "time_per_iteration": 2.4860363006591797 + }, + { + "auxiliary_loss_clip": 0.01162257, + "auxiliary_loss_mlp": 0.01055999, + "balance_loss_clip": 1.03584075, + "balance_loss_mlp": 1.05173874, + "epoch": 0.1337141139335638, + "flos": 12531603911040.0, + "grad_norm": 2.0940561003244738, + "language_loss": 0.82572883, + "learning_rate": 3.88822101573484e-06, + "loss": 0.84791142, + "num_input_tokens_seen": 48129840, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2224, + "time_per_iteration": 2.453310966491699 + }, + { + "auxiliary_loss_clip": 0.01167505, + "auxiliary_loss_mlp": 0.01047104, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.05410361, + "epoch": 0.13377423718623177, + "flos": 23038957889280.0, + "grad_norm": 2.0797940389634624, + "language_loss": 0.66006851, + "learning_rate": 3.888092602028167e-06, + "loss": 0.68221462, + "num_input_tokens_seen": 48149240, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2225, + "time_per_iteration": 2.505760669708252 + }, + { + "auxiliary_loss_clip": 0.01164479, + "auxiliary_loss_mlp": 0.01054978, + "balance_loss_clip": 1.03491461, + "balance_loss_mlp": 1.05366707, + "epoch": 0.13383436043889974, + "flos": 16216397637120.0, + "grad_norm": 2.2490181158076545, + "language_loss": 0.89484501, + "learning_rate": 3.887964116724835e-06, + "loss": 0.91703951, + "num_input_tokens_seen": 48166330, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2226, + "time_per_iteration": 3.827432632446289 + }, + { + "auxiliary_loss_clip": 0.01166768, + "auxiliary_loss_mlp": 0.01050683, + "balance_loss_clip": 1.03132319, + "balance_loss_mlp": 1.05492473, + "epoch": 0.1338944836915677, + "flos": 24279671520000.0, + "grad_norm": 2.0692514385202947, + "language_loss": 0.73874348, + "learning_rate": 3.887835559829712e-06, + "loss": 0.76091796, + "num_input_tokens_seen": 48187600, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.1171875, + "step": 2227, + "time_per_iteration": 5.469221115112305 + }, + { + "auxiliary_loss_clip": 0.01166021, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.02971888, + "balance_loss_mlp": 1.05582607, + "epoch": 0.1339546069442357, + "flos": 17598742594560.0, + "grad_norm": 2.597241668203809, + "language_loss": 0.8519839, + "learning_rate": 3.8877069313476764e-06, + "loss": 0.87414384, + "num_input_tokens_seen": 48204400, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2228, + "time_per_iteration": 2.449289560317993 + }, + { + "auxiliary_loss_clip": 0.01162737, + "auxiliary_loss_mlp": 0.01047632, + "balance_loss_clip": 1.0275687, + "balance_loss_mlp": 1.05501461, + "epoch": 0.13401473019690366, + "flos": 18990065952000.0, + "grad_norm": 2.700498827765594, + "language_loss": 0.8100034, + "learning_rate": 3.8875782312836054e-06, + "loss": 0.83210707, + "num_input_tokens_seen": 48222180, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2229, + "time_per_iteration": 2.454185962677002 + }, + { + "auxiliary_loss_clip": 0.01165405, + "auxiliary_loss_mlp": 0.01055372, + "balance_loss_clip": 1.03528547, + "balance_loss_mlp": 1.05576682, + "epoch": 0.13407485344957162, + "flos": 26943812288640.0, + "grad_norm": 2.350850930683171, + "language_loss": 0.73814881, + "learning_rate": 3.887449459642378e-06, + "loss": 0.76035661, + "num_input_tokens_seen": 48243245, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2230, + "time_per_iteration": 2.538679838180542 + }, + { + "auxiliary_loss_clip": 0.0116587, + "auxiliary_loss_mlp": 0.01055192, + "balance_loss_clip": 1.03551102, + "balance_loss_mlp": 1.0541544, + "epoch": 0.1341349767022396, + "flos": 20339373375360.0, + "grad_norm": 8.27737726970052, + "language_loss": 0.79914325, + "learning_rate": 3.8873206164288785e-06, + "loss": 0.82135391, + "num_input_tokens_seen": 48262600, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.1171875, + "step": 2231, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.0116777, + "auxiliary_loss_mlp": 0.01049092, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05716896, + "epoch": 0.13419509995490755, + "flos": 29862020931840.0, + "grad_norm": 1.9954658779127024, + "language_loss": 0.72341192, + "learning_rate": 3.887191701647992e-06, + "loss": 0.74558049, + "num_input_tokens_seen": 48285075, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.109375, + "step": 2232, + "time_per_iteration": 2.5315330028533936 + }, + { + "auxiliary_loss_clip": 0.01169038, + "auxiliary_loss_mlp": 0.01047761, + "balance_loss_clip": 1.02664888, + "balance_loss_mlp": 1.05505097, + "epoch": 0.13425522320757552, + "flos": 26942986275840.0, + "grad_norm": 2.53729194427275, + "language_loss": 0.65508974, + "learning_rate": 3.8870627153046066e-06, + "loss": 0.67725778, + "num_input_tokens_seen": 48301285, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.140625, + "step": 2233, + "time_per_iteration": 2.480006694793701 + }, + { + "auxiliary_loss_clip": 0.01161612, + "auxiliary_loss_mlp": 0.01047371, + "balance_loss_clip": 1.02687883, + "balance_loss_mlp": 1.05011904, + "epoch": 0.1343153464602435, + "flos": 15777281871360.0, + "grad_norm": 4.541384002557222, + "language_loss": 0.81492066, + "learning_rate": 3.886933657403615e-06, + "loss": 0.8370105, + "num_input_tokens_seen": 48317835, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1171875, + "step": 2234, + "time_per_iteration": 2.431962490081787 + }, + { + "auxiliary_loss_clip": 0.01165653, + "auxiliary_loss_mlp": 0.01054939, + "balance_loss_clip": 1.03466105, + "balance_loss_mlp": 1.05424869, + "epoch": 0.13437546971291148, + "flos": 24314756129280.0, + "grad_norm": 1.9481483268780417, + "language_loss": 0.82361299, + "learning_rate": 3.886804527949909e-06, + "loss": 0.84581894, + "num_input_tokens_seen": 48335670, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1171875, + "step": 2235, + "time_per_iteration": 2.4478979110717773 + }, + { + "auxiliary_loss_clip": 0.0116322, + "auxiliary_loss_mlp": 0.01055841, + "balance_loss_clip": 1.03378713, + "balance_loss_mlp": 1.05170834, + "epoch": 0.13443559296557944, + "flos": 26650673395200.0, + "grad_norm": 1.6568048404288893, + "language_loss": 0.86399209, + "learning_rate": 3.8866753269483864e-06, + "loss": 0.88618279, + "num_input_tokens_seen": 48357805, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2236, + "time_per_iteration": 2.534761428833008 + }, + { + "auxiliary_loss_clip": 0.01166625, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.02712345, + "balance_loss_mlp": 1.05506372, + "epoch": 0.1344957162182474, + "flos": 21796197183360.0, + "grad_norm": 1.5401183277834882, + "language_loss": 0.76936173, + "learning_rate": 3.886546054403946e-06, + "loss": 0.79150563, + "num_input_tokens_seen": 48377845, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2237, + "time_per_iteration": 2.454881191253662 + }, + { + "auxiliary_loss_clip": 0.0116441, + "auxiliary_loss_mlp": 0.01051932, + "balance_loss_clip": 1.02974725, + "balance_loss_mlp": 1.05312407, + "epoch": 0.13455583947091537, + "flos": 19865568049920.0, + "grad_norm": 1.976295310563951, + "language_loss": 0.78737688, + "learning_rate": 3.886416710321491e-06, + "loss": 0.80954033, + "num_input_tokens_seen": 48394735, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.1171875, + "step": 2238, + "time_per_iteration": 2.508364200592041 + }, + { + "auxiliary_loss_clip": 0.01162005, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03057706, + "balance_loss_mlp": 1.0530107, + "epoch": 0.13461596272358334, + "flos": 30846835094400.0, + "grad_norm": 2.3078790626960246, + "language_loss": 0.67977941, + "learning_rate": 3.886287294705924e-06, + "loss": 0.70191795, + "num_input_tokens_seen": 48414200, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.09375, + "step": 2239, + "time_per_iteration": 2.533148765563965 + }, + { + "auxiliary_loss_clip": 0.01165153, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.02888918, + "balance_loss_mlp": 1.05296254, + "epoch": 0.1346760859762513, + "flos": 12494436312960.0, + "grad_norm": 2.7482132203763245, + "language_loss": 0.81085825, + "learning_rate": 3.8861578075621555e-06, + "loss": 0.83300203, + "num_input_tokens_seen": 48431065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.125, + "step": 2240, + "time_per_iteration": 2.458702802658081 + }, + { + "auxiliary_loss_clip": 0.01166075, + "auxiliary_loss_mlp": 0.01050419, + "balance_loss_clip": 1.02958083, + "balance_loss_mlp": 1.05302262, + "epoch": 0.1347362092289193, + "flos": 21836022387840.0, + "grad_norm": 1.775061814751768, + "language_loss": 0.77491653, + "learning_rate": 3.886028248895093e-06, + "loss": 0.79708141, + "num_input_tokens_seen": 48450335, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1328125, + "step": 2241, + "time_per_iteration": 2.4814610481262207 + }, + { + "auxiliary_loss_clip": 0.01160364, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.0265156, + "balance_loss_mlp": 1.05368328, + "epoch": 0.13479633248158726, + "flos": 23509459163520.0, + "grad_norm": 1.708340264075402, + "language_loss": 0.83106101, + "learning_rate": 3.88589861870965e-06, + "loss": 0.85311437, + "num_input_tokens_seen": 48468555, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0625, + "step": 2242, + "time_per_iteration": 2.531010627746582 + }, + { + "auxiliary_loss_clip": 0.01166889, + "auxiliary_loss_mlp": 0.01052169, + "balance_loss_clip": 1.03056788, + "balance_loss_mlp": 1.05465889, + "epoch": 0.13485645573425523, + "flos": 29344332165120.0, + "grad_norm": 3.594763109819468, + "language_loss": 0.64927268, + "learning_rate": 3.885768917010744e-06, + "loss": 0.67146331, + "num_input_tokens_seen": 48488515, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.125, + "step": 2243, + "time_per_iteration": 2.5215864181518555 + }, + { + "auxiliary_loss_clip": 0.01158013, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02573538, + "balance_loss_mlp": 1.05214143, + "epoch": 0.1349165789869232, + "flos": 28037112503040.0, + "grad_norm": 1.6702464572283469, + "language_loss": 0.72275442, + "learning_rate": 3.8856391438032895e-06, + "loss": 0.74479383, + "num_input_tokens_seen": 48510515, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2244, + "time_per_iteration": 2.572275161743164 + }, + { + "auxiliary_loss_clip": 0.01161264, + "auxiliary_loss_mlp": 0.01052624, + "balance_loss_clip": 1.03339577, + "balance_loss_mlp": 1.0510093, + "epoch": 0.13497670223959116, + "flos": 22853730430080.0, + "grad_norm": 1.6251739599249553, + "language_loss": 0.86419517, + "learning_rate": 3.88550929909221e-06, + "loss": 0.886334, + "num_input_tokens_seen": 48529940, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1015625, + "step": 2245, + "time_per_iteration": 2.4847447872161865 + }, + { + "auxiliary_loss_clip": 0.0115964, + "auxiliary_loss_mlp": 0.01049956, + "balance_loss_clip": 1.029953, + "balance_loss_mlp": 1.0534606, + "epoch": 0.13503682549225912, + "flos": 16504580453760.0, + "grad_norm": 1.986035604010071, + "language_loss": 0.79054129, + "learning_rate": 3.88537938288243e-06, + "loss": 0.81263721, + "num_input_tokens_seen": 48548190, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2246, + "time_per_iteration": 2.521500825881958 + }, + { + "auxiliary_loss_clip": 0.01053943, + "auxiliary_loss_mlp": 0.01006436, + "balance_loss_clip": 1.00378919, + "balance_loss_mlp": 1.01705432, + "epoch": 0.1350969487449271, + "flos": 70756303242240.0, + "grad_norm": 0.7498554605470831, + "language_loss": 0.60597092, + "learning_rate": 3.885249395178874e-06, + "loss": 0.6265747, + "num_input_tokens_seen": 48613165, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.3671875, + "step": 2247, + "time_per_iteration": 3.209567070007324 + }, + { + "auxiliary_loss_clip": 0.0117261, + "auxiliary_loss_mlp": 0.01058621, + "balance_loss_clip": 1.03629315, + "balance_loss_mlp": 1.05673957, + "epoch": 0.13515707199759508, + "flos": 23075981832960.0, + "grad_norm": 2.930333372025318, + "language_loss": 0.81250268, + "learning_rate": 3.885119335986473e-06, + "loss": 0.83481503, + "num_input_tokens_seen": 48631705, + "router_z_loss_clip": 0.22265625, + "router_z_loss_mlp": 1.15625, + "step": 2248, + "time_per_iteration": 2.5274717807769775 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.0104321, + "balance_loss_clip": 1.02503014, + "balance_loss_mlp": 1.0515008, + "epoch": 0.13521719525026304, + "flos": 23186371305600.0, + "grad_norm": 2.1598236051462383, + "language_loss": 0.77427459, + "learning_rate": 3.884989205310157e-06, + "loss": 0.79628301, + "num_input_tokens_seen": 48649740, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0625, + "step": 2249, + "time_per_iteration": 2.475325345993042 + }, + { + "auxiliary_loss_clip": 0.01161564, + "auxiliary_loss_mlp": 0.01053869, + "balance_loss_clip": 1.03477216, + "balance_loss_mlp": 1.05408192, + "epoch": 0.135277318502931, + "flos": 24790931752320.0, + "grad_norm": 1.4620260499768896, + "language_loss": 0.84598488, + "learning_rate": 3.884859003154862e-06, + "loss": 0.86813927, + "num_input_tokens_seen": 48671565, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0703125, + "step": 2250, + "time_per_iteration": 2.5579018592834473 + }, + { + "auxiliary_loss_clip": 0.01162926, + "auxiliary_loss_mlp": 0.0104688, + "balance_loss_clip": 1.02586317, + "balance_loss_mlp": 1.05311561, + "epoch": 0.13533744175559898, + "flos": 21908525990400.0, + "grad_norm": 1.9830962049575767, + "language_loss": 0.8213973, + "learning_rate": 3.884728729525524e-06, + "loss": 0.84349537, + "num_input_tokens_seen": 48690425, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.1015625, + "step": 2251, + "time_per_iteration": 2.459254503250122 + }, + { + "auxiliary_loss_clip": 0.01160349, + "auxiliary_loss_mlp": 0.01053163, + "balance_loss_clip": 1.03144348, + "balance_loss_mlp": 1.05075097, + "epoch": 0.13539756500826694, + "flos": 21211643249280.0, + "grad_norm": 1.6927381248236872, + "language_loss": 0.85981321, + "learning_rate": 3.884598384427084e-06, + "loss": 0.88194835, + "num_input_tokens_seen": 48707505, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.09375, + "step": 2252, + "time_per_iteration": 2.508246421813965 + }, + { + "auxiliary_loss_clip": 0.01050697, + "auxiliary_loss_mlp": 0.0100648, + "balance_loss_clip": 1.00398886, + "balance_loss_mlp": 1.01368976, + "epoch": 0.1354576882609349, + "flos": 63242103634560.0, + "grad_norm": 0.7502755191421498, + "language_loss": 0.61736262, + "learning_rate": 3.884467967864485e-06, + "loss": 0.63793439, + "num_input_tokens_seen": 48775895, + "router_z_loss_clip": 0.02490234, + "router_z_loss_mlp": 0.37109375, + "step": 2253, + "time_per_iteration": 3.1357691287994385 + }, + { + "auxiliary_loss_clip": 0.01163708, + "auxiliary_loss_mlp": 0.01055809, + "balance_loss_clip": 1.0357219, + "balance_loss_mlp": 1.05454588, + "epoch": 0.1355178115136029, + "flos": 25483037984640.0, + "grad_norm": 2.033104819567641, + "language_loss": 0.89383745, + "learning_rate": 3.884337479842671e-06, + "loss": 0.91603261, + "num_input_tokens_seen": 48798370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.09375, + "step": 2254, + "time_per_iteration": 2.4983997344970703 + }, + { + "auxiliary_loss_clip": 0.01166229, + "auxiliary_loss_mlp": 0.01050811, + "balance_loss_clip": 1.02786362, + "balance_loss_mlp": 1.05202925, + "epoch": 0.13557793476627086, + "flos": 21616967295360.0, + "grad_norm": 2.0851597725495843, + "language_loss": 0.84461302, + "learning_rate": 3.884206920366591e-06, + "loss": 0.86678338, + "num_input_tokens_seen": 48817955, + "router_z_loss_clip": 0.22949219, + "router_z_loss_mlp": 1.140625, + "step": 2255, + "time_per_iteration": 2.4466094970703125 + }, + { + "auxiliary_loss_clip": 0.01159898, + "auxiliary_loss_mlp": 0.01046769, + "balance_loss_clip": 1.02632451, + "balance_loss_mlp": 1.05059099, + "epoch": 0.13563805801893883, + "flos": 24928253447040.0, + "grad_norm": 2.8290739743459126, + "language_loss": 0.7493006, + "learning_rate": 3.884076289441196e-06, + "loss": 0.77136725, + "num_input_tokens_seen": 48836330, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.09375, + "step": 2256, + "time_per_iteration": 2.49464750289917 + }, + { + "auxiliary_loss_clip": 0.01164681, + "auxiliary_loss_mlp": 0.01049021, + "balance_loss_clip": 1.02806377, + "balance_loss_mlp": 1.05080438, + "epoch": 0.1356981812716068, + "flos": 14750272206720.0, + "grad_norm": 4.107811937736733, + "language_loss": 0.83023381, + "learning_rate": 3.88394558707144e-06, + "loss": 0.85237086, + "num_input_tokens_seen": 48851890, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.140625, + "step": 2257, + "time_per_iteration": 2.4069128036499023 + }, + { + "auxiliary_loss_clip": 0.0116793, + "auxiliary_loss_mlp": 0.01054876, + "balance_loss_clip": 1.03272712, + "balance_loss_mlp": 1.05211377, + "epoch": 0.13575830452427476, + "flos": 11108571822720.0, + "grad_norm": 2.2162023158830655, + "language_loss": 0.82266492, + "learning_rate": 3.883814813262277e-06, + "loss": 0.84489298, + "num_input_tokens_seen": 48865510, + "router_z_loss_clip": 0.22167969, + "router_z_loss_mlp": 1.15625, + "step": 2258, + "time_per_iteration": 2.4187939167022705 + }, + { + "auxiliary_loss_clip": 0.01161942, + "auxiliary_loss_mlp": 0.01051916, + "balance_loss_clip": 1.02890849, + "balance_loss_mlp": 1.05117583, + "epoch": 0.13581842777694272, + "flos": 17960290940160.0, + "grad_norm": 2.3528312033652434, + "language_loss": 0.82556236, + "learning_rate": 3.883683968018669e-06, + "loss": 0.84770095, + "num_input_tokens_seen": 48882360, + "router_z_loss_clip": 0.23046875, + "router_z_loss_mlp": 1.109375, + "step": 2259, + "time_per_iteration": 2.4182498455047607 + }, + { + "auxiliary_loss_clip": 0.01162398, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.0313561, + "balance_loss_mlp": 1.05370188, + "epoch": 0.1358785510296107, + "flos": 22857142222080.0, + "grad_norm": 1.9951846625000045, + "language_loss": 0.73434722, + "learning_rate": 3.8835530513455755e-06, + "loss": 0.75647175, + "num_input_tokens_seen": 48902700, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0859375, + "step": 2260, + "time_per_iteration": 2.5117952823638916 + }, + { + "auxiliary_loss_clip": 0.01160597, + "auxiliary_loss_mlp": 0.01053624, + "balance_loss_clip": 1.03389525, + "balance_loss_mlp": 1.05164778, + "epoch": 0.13593867428227868, + "flos": 25739404329600.0, + "grad_norm": 2.6406640236232826, + "language_loss": 0.75450647, + "learning_rate": 3.883422063247961e-06, + "loss": 0.77664864, + "num_input_tokens_seen": 48922525, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2261, + "time_per_iteration": 2.4773809909820557 + }, + { + "auxiliary_loss_clip": 0.01164897, + "auxiliary_loss_mlp": 0.01048665, + "balance_loss_clip": 1.02887654, + "balance_loss_mlp": 1.05329657, + "epoch": 0.13599879753494665, + "flos": 31249214225280.0, + "grad_norm": 1.9984757312973846, + "language_loss": 0.63141024, + "learning_rate": 3.883291003730794e-06, + "loss": 0.65354586, + "num_input_tokens_seen": 48942510, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.1171875, + "step": 2262, + "time_per_iteration": 2.5423331260681152 + }, + { + "auxiliary_loss_clip": 0.01161423, + "auxiliary_loss_mlp": 0.01043862, + "balance_loss_clip": 1.02458549, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1360589207876146, + "flos": 23915034604800.0, + "grad_norm": 2.598036861128168, + "language_loss": 0.82363462, + "learning_rate": 3.883159872799043e-06, + "loss": 0.84568739, + "num_input_tokens_seen": 48962625, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.109375, + "step": 2263, + "time_per_iteration": 2.472050428390503 + }, + { + "auxiliary_loss_clip": 0.01166147, + "auxiliary_loss_mlp": 0.01061822, + "balance_loss_clip": 1.03914785, + "balance_loss_mlp": 1.05306447, + "epoch": 0.13611904404028258, + "flos": 19974197756160.0, + "grad_norm": 1.7757676532235749, + "language_loss": 0.87984985, + "learning_rate": 3.8830286704576815e-06, + "loss": 0.90212959, + "num_input_tokens_seen": 48982525, + "router_z_loss_clip": 0.2265625, + "router_z_loss_mlp": 1.1328125, + "step": 2264, + "time_per_iteration": 2.4857943058013916 + }, + { + "auxiliary_loss_clip": 0.01163519, + "auxiliary_loss_mlp": 0.01048759, + "balance_loss_clip": 1.02700329, + "balance_loss_mlp": 1.05115557, + "epoch": 0.13617916729295054, + "flos": 15340644144000.0, + "grad_norm": 2.9904691281538693, + "language_loss": 0.7103616, + "learning_rate": 3.882897396711683e-06, + "loss": 0.73248434, + "num_input_tokens_seen": 48997605, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.125, + "step": 2265, + "time_per_iteration": 2.428753614425659 + }, + { + "auxiliary_loss_clip": 0.01160486, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02187812, + "balance_loss_mlp": 1.05258036, + "epoch": 0.1362392905456185, + "flos": 27451445247360.0, + "grad_norm": 2.049615390343222, + "language_loss": 0.66760135, + "learning_rate": 3.882766051566027e-06, + "loss": 0.689623, + "num_input_tokens_seen": 49018535, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2266, + "time_per_iteration": 2.4990508556365967 + }, + { + "auxiliary_loss_clip": 0.01159505, + "auxiliary_loss_mlp": 0.01060297, + "balance_loss_clip": 1.04079425, + "balance_loss_mlp": 1.05220675, + "epoch": 0.1362994137982865, + "flos": 25009017177600.0, + "grad_norm": 1.7538751206895893, + "language_loss": 0.76376909, + "learning_rate": 3.882634635025694e-06, + "loss": 0.78596711, + "num_input_tokens_seen": 49038865, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2267, + "time_per_iteration": 2.485907554626465 + }, + { + "auxiliary_loss_clip": 0.0116058, + "auxiliary_loss_mlp": 0.01046136, + "balance_loss_clip": 1.02639508, + "balance_loss_mlp": 1.05051804, + "epoch": 0.13635953705095447, + "flos": 20303031790080.0, + "grad_norm": 2.002795226804265, + "language_loss": 0.81781995, + "learning_rate": 3.882503147095667e-06, + "loss": 0.83988714, + "num_input_tokens_seen": 49058010, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1015625, + "step": 2268, + "time_per_iteration": 3.890936851501465 + }, + { + "auxiliary_loss_clip": 0.01161581, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02567649, + "balance_loss_mlp": 1.0542717, + "epoch": 0.13641966030362243, + "flos": 31358418549120.0, + "grad_norm": 2.071095479959133, + "language_loss": 0.76078153, + "learning_rate": 3.882371587780931e-06, + "loss": 0.78285825, + "num_input_tokens_seen": 49080330, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2269, + "time_per_iteration": 4.03081202507019 + }, + { + "auxiliary_loss_clip": 0.01165717, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.02612138, + "balance_loss_mlp": 1.05518508, + "epoch": 0.1364797835562904, + "flos": 20478095700480.0, + "grad_norm": 2.039865659244694, + "language_loss": 0.80856502, + "learning_rate": 3.882239957086477e-06, + "loss": 0.83068502, + "num_input_tokens_seen": 49097035, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2270, + "time_per_iteration": 2.431426525115967 + }, + { + "auxiliary_loss_clip": 0.01164608, + "auxiliary_loss_mlp": 0.01055428, + "balance_loss_clip": 1.03463817, + "balance_loss_mlp": 1.05227089, + "epoch": 0.13653990680895836, + "flos": 13078343802240.0, + "grad_norm": 2.715242097566801, + "language_loss": 0.75720018, + "learning_rate": 3.882108255017295e-06, + "loss": 0.77940053, + "num_input_tokens_seen": 49113945, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.125, + "step": 2271, + "time_per_iteration": 2.440701961517334 + }, + { + "auxiliary_loss_clip": 0.01161613, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03149939, + "balance_loss_mlp": 1.05171776, + "epoch": 0.13660003006162633, + "flos": 16946712961920.0, + "grad_norm": 2.2487551674667565, + "language_loss": 0.80084515, + "learning_rate": 3.881976481578379e-06, + "loss": 0.82298499, + "num_input_tokens_seen": 49132855, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.1015625, + "step": 2272, + "time_per_iteration": 2.4305598735809326 + }, + { + "auxiliary_loss_clip": 0.01056084, + "auxiliary_loss_mlp": 0.01011943, + "balance_loss_clip": 1.00937963, + "balance_loss_mlp": 1.01818228, + "epoch": 0.1366601533142943, + "flos": 68682749892480.0, + "grad_norm": 0.7032235049035468, + "language_loss": 0.60682511, + "learning_rate": 3.8818446367747255e-06, + "loss": 0.62750536, + "num_input_tokens_seen": 49198310, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.37890625, + "step": 2273, + "time_per_iteration": 3.1601598262786865 + }, + { + "auxiliary_loss_clip": 0.01158579, + "auxiliary_loss_mlp": 0.01047766, + "balance_loss_clip": 1.02732205, + "balance_loss_mlp": 1.05170178, + "epoch": 0.13672027656696228, + "flos": 19244241567360.0, + "grad_norm": 1.7482195510707834, + "language_loss": 0.77978206, + "learning_rate": 3.881712720611336e-06, + "loss": 0.80184555, + "num_input_tokens_seen": 49217250, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2274, + "time_per_iteration": 2.448374032974243 + }, + { + "auxiliary_loss_clip": 0.01163563, + "auxiliary_loss_mlp": 0.01046845, + "balance_loss_clip": 1.02613878, + "balance_loss_mlp": 1.0536654, + "epoch": 0.13678039981963025, + "flos": 24534924543360.0, + "grad_norm": 2.152740159395537, + "language_loss": 0.78435361, + "learning_rate": 3.881580733093211e-06, + "loss": 0.80645764, + "num_input_tokens_seen": 49236615, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2275, + "time_per_iteration": 2.4761078357696533 + }, + { + "auxiliary_loss_clip": 0.01161418, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02003431, + "balance_loss_mlp": 1.05312562, + "epoch": 0.13684052307229821, + "flos": 15669334523520.0, + "grad_norm": 2.879456622893362, + "language_loss": 0.81436646, + "learning_rate": 3.881448674225356e-06, + "loss": 0.83637059, + "num_input_tokens_seen": 49253935, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0859375, + "step": 2276, + "time_per_iteration": 2.453623056411743 + }, + { + "auxiliary_loss_clip": 0.01169888, + "auxiliary_loss_mlp": 0.01054109, + "balance_loss_clip": 1.03082716, + "balance_loss_mlp": 1.05443549, + "epoch": 0.13690064632496618, + "flos": 28364689560960.0, + "grad_norm": 2.7308629221608576, + "language_loss": 0.69347179, + "learning_rate": 3.881316544012779e-06, + "loss": 0.71571183, + "num_input_tokens_seen": 49273605, + "router_z_loss_clip": 0.23242188, + "router_z_loss_mlp": 1.15625, + "step": 2277, + "time_per_iteration": 2.537464141845703 + }, + { + "auxiliary_loss_clip": 0.01162034, + "auxiliary_loss_mlp": 0.01051118, + "balance_loss_clip": 1.03056657, + "balance_loss_mlp": 1.05136657, + "epoch": 0.13696076957763414, + "flos": 23404779953280.0, + "grad_norm": 2.1796180013972384, + "language_loss": 0.80487186, + "learning_rate": 3.88118434246049e-06, + "loss": 0.82700336, + "num_input_tokens_seen": 49291785, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.109375, + "step": 2278, + "time_per_iteration": 2.478158950805664 + }, + { + "auxiliary_loss_clip": 0.01164216, + "auxiliary_loss_mlp": 0.01047731, + "balance_loss_clip": 1.02788246, + "balance_loss_mlp": 1.05658543, + "epoch": 0.1370208928303021, + "flos": 37196595601920.0, + "grad_norm": 2.2222454745927744, + "language_loss": 0.74863833, + "learning_rate": 3.881052069573502e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 49311405, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2279, + "time_per_iteration": 2.5930991172790527 + }, + { + "auxiliary_loss_clip": 0.01166611, + "auxiliary_loss_mlp": 0.0105256, + "balance_loss_clip": 1.03232992, + "balance_loss_mlp": 1.05331779, + "epoch": 0.13708101608297008, + "flos": 26976311118720.0, + "grad_norm": 2.3437990696634916, + "language_loss": 0.76614088, + "learning_rate": 3.880919725356831e-06, + "loss": 0.78833258, + "num_input_tokens_seen": 49331835, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.1328125, + "step": 2280, + "time_per_iteration": 2.527808666229248 + }, + { + "auxiliary_loss_clip": 0.01156674, + "auxiliary_loss_mlp": 0.01046301, + "balance_loss_clip": 1.0272876, + "balance_loss_mlp": 1.04930711, + "epoch": 0.13714113933563807, + "flos": 32556864850560.0, + "grad_norm": 1.7035700975942816, + "language_loss": 0.79808372, + "learning_rate": 3.880787309815496e-06, + "loss": 0.82011348, + "num_input_tokens_seen": 49352290, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.078125, + "step": 2281, + "time_per_iteration": 2.5486884117126465 + }, + { + "auxiliary_loss_clip": 0.01167882, + "auxiliary_loss_mlp": 0.01055632, + "balance_loss_clip": 1.03618872, + "balance_loss_mlp": 1.05488086, + "epoch": 0.13720126258830603, + "flos": 16101267569280.0, + "grad_norm": 1.697672260024265, + "language_loss": 0.83955061, + "learning_rate": 3.880654822954518e-06, + "loss": 0.86178571, + "num_input_tokens_seen": 49370285, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.125, + "step": 2282, + "time_per_iteration": 2.4731719493865967 + }, + { + "auxiliary_loss_clip": 0.01157557, + "auxiliary_loss_mlp": 0.01055496, + "balance_loss_clip": 1.03664923, + "balance_loss_mlp": 1.05028629, + "epoch": 0.137261385840974, + "flos": 18953544798720.0, + "grad_norm": 1.8152250836173982, + "language_loss": 0.73821312, + "learning_rate": 3.8805222647789195e-06, + "loss": 0.76034367, + "num_input_tokens_seen": 49389610, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0703125, + "step": 2283, + "time_per_iteration": 2.5041310787200928 + }, + { + "auxiliary_loss_clip": 0.01161767, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02991104, + "balance_loss_mlp": 1.05546188, + "epoch": 0.13732150909364196, + "flos": 23295360147840.0, + "grad_norm": 2.845966051455131, + "language_loss": 0.83875519, + "learning_rate": 3.880389635293729e-06, + "loss": 0.86085427, + "num_input_tokens_seen": 49408390, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2284, + "time_per_iteration": 2.489459991455078 + }, + { + "auxiliary_loss_clip": 0.01164156, + "auxiliary_loss_mlp": 0.01049019, + "balance_loss_clip": 1.02784729, + "balance_loss_mlp": 1.05016088, + "epoch": 0.13738163234630993, + "flos": 29351263489920.0, + "grad_norm": 1.9356174938409232, + "language_loss": 0.74778754, + "learning_rate": 3.880256934503974e-06, + "loss": 0.76991928, + "num_input_tokens_seen": 49427725, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.140625, + "step": 2285, + "time_per_iteration": 2.542114734649658 + }, + { + "auxiliary_loss_clip": 0.01158945, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.02680647, + "balance_loss_mlp": 1.05192137, + "epoch": 0.1374417555989779, + "flos": 26651319840000.0, + "grad_norm": 1.7476035379248278, + "language_loss": 0.74461651, + "learning_rate": 3.880124162414689e-06, + "loss": 0.7666595, + "num_input_tokens_seen": 49449000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0703125, + "step": 2286, + "time_per_iteration": 2.52837872505188 + }, + { + "auxiliary_loss_clip": 0.01165905, + "auxiliary_loss_mlp": 0.01045032, + "balance_loss_clip": 1.02407491, + "balance_loss_mlp": 1.05466056, + "epoch": 0.1375018788516459, + "flos": 28403401443840.0, + "grad_norm": 2.4229799840234936, + "language_loss": 0.86074513, + "learning_rate": 3.879991319030908e-06, + "loss": 0.88285446, + "num_input_tokens_seen": 49468360, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.109375, + "step": 2287, + "time_per_iteration": 2.5267093181610107 + }, + { + "auxiliary_loss_clip": 0.01162503, + "auxiliary_loss_mlp": 0.01046382, + "balance_loss_clip": 1.02724862, + "balance_loss_mlp": 1.05281329, + "epoch": 0.13756200210431385, + "flos": 37413783187200.0, + "grad_norm": 2.1686670508464783, + "language_loss": 0.68304116, + "learning_rate": 3.879858404357666e-06, + "loss": 0.70512998, + "num_input_tokens_seen": 49493450, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.09375, + "step": 2288, + "time_per_iteration": 2.6589176654815674 + }, + { + "auxiliary_loss_clip": 0.01162886, + "auxiliary_loss_mlp": 0.01054184, + "balance_loss_clip": 1.03410959, + "balance_loss_mlp": 1.05404294, + "epoch": 0.13762212535698182, + "flos": 22711021695360.0, + "grad_norm": 3.8263362529629896, + "language_loss": 0.87251699, + "learning_rate": 3.879725418400005e-06, + "loss": 0.89468765, + "num_input_tokens_seen": 49511220, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2289, + "time_per_iteration": 2.4834415912628174 + }, + { + "auxiliary_loss_clip": 0.01154414, + "auxiliary_loss_mlp": 0.01045882, + "balance_loss_clip": 1.02735722, + "balance_loss_mlp": 1.0496552, + "epoch": 0.13768224860964978, + "flos": 23952130375680.0, + "grad_norm": 1.801469753111382, + "language_loss": 0.74045157, + "learning_rate": 3.879592361162969e-06, + "loss": 0.76245451, + "num_input_tokens_seen": 49529820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2290, + "time_per_iteration": 2.4901175498962402 + }, + { + "auxiliary_loss_clip": 0.01056657, + "auxiliary_loss_mlp": 0.01003238, + "balance_loss_clip": 1.00099707, + "balance_loss_mlp": 1.01923215, + "epoch": 0.13774237186231775, + "flos": 63590438753280.0, + "grad_norm": 0.7021136788609851, + "language_loss": 0.5160234, + "learning_rate": 3.8794592326516015e-06, + "loss": 0.53662229, + "num_input_tokens_seen": 49595325, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.375, + "step": 2291, + "time_per_iteration": 3.1141176223754883 + }, + { + "auxiliary_loss_clip": 0.01158988, + "auxiliary_loss_mlp": 0.01049009, + "balance_loss_clip": 1.02856493, + "balance_loss_mlp": 1.05007744, + "epoch": 0.1378024951149857, + "flos": 24279456038400.0, + "grad_norm": 2.104305633549435, + "language_loss": 0.7090801, + "learning_rate": 3.879326032870952e-06, + "loss": 0.73116004, + "num_input_tokens_seen": 49615850, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.09375, + "step": 2292, + "time_per_iteration": 2.5535075664520264 + }, + { + "auxiliary_loss_clip": 0.01160381, + "auxiliary_loss_mlp": 0.01044896, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.05272794, + "epoch": 0.13786261836765368, + "flos": 14021537080320.0, + "grad_norm": 2.835181445389694, + "language_loss": 0.79774708, + "learning_rate": 3.879192761826071e-06, + "loss": 0.81979978, + "num_input_tokens_seen": 49631860, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.078125, + "step": 2293, + "time_per_iteration": 2.4434242248535156 + }, + { + "auxiliary_loss_clip": 0.01159833, + "auxiliary_loss_mlp": 0.01050431, + "balance_loss_clip": 1.03065419, + "balance_loss_mlp": 1.0489893, + "epoch": 0.13792274162032167, + "flos": 28878679226880.0, + "grad_norm": 2.8100583587938566, + "language_loss": 0.78455698, + "learning_rate": 3.879059419522011e-06, + "loss": 0.80665964, + "num_input_tokens_seen": 49652145, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.109375, + "step": 2294, + "time_per_iteration": 2.5279018878936768 + }, + { + "auxiliary_loss_clip": 0.01156302, + "auxiliary_loss_mlp": 0.01044594, + "balance_loss_clip": 1.02679634, + "balance_loss_mlp": 1.05053687, + "epoch": 0.13798286487298964, + "flos": 21141150808320.0, + "grad_norm": 2.844605455172751, + "language_loss": 0.80448526, + "learning_rate": 3.878926005963831e-06, + "loss": 0.82649422, + "num_input_tokens_seen": 49669880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0546875, + "step": 2295, + "time_per_iteration": 2.46471905708313 + }, + { + "auxiliary_loss_clip": 0.01158353, + "auxiliary_loss_mlp": 0.01045588, + "balance_loss_clip": 1.02604938, + "balance_loss_mlp": 1.04990947, + "epoch": 0.1380429881256576, + "flos": 22487477402880.0, + "grad_norm": 1.905081494696058, + "language_loss": 0.78027165, + "learning_rate": 3.878792521156588e-06, + "loss": 0.80231106, + "num_input_tokens_seen": 49687255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0859375, + "step": 2296, + "time_per_iteration": 2.489081859588623 + }, + { + "auxiliary_loss_clip": 0.0116031, + "auxiliary_loss_mlp": 0.01052914, + "balance_loss_clip": 1.03356612, + "balance_loss_mlp": 1.05272174, + "epoch": 0.13810311137832557, + "flos": 21393674398080.0, + "grad_norm": 1.8577842545242083, + "language_loss": 0.78632545, + "learning_rate": 3.8786589651053446e-06, + "loss": 0.80845773, + "num_input_tokens_seen": 49706650, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2297, + "time_per_iteration": 2.479617118835449 + }, + { + "auxiliary_loss_clip": 0.01157952, + "auxiliary_loss_mlp": 0.01050362, + "balance_loss_clip": 1.03187263, + "balance_loss_mlp": 1.05133367, + "epoch": 0.13816323463099353, + "flos": 25989844930560.0, + "grad_norm": 2.1383795008624946, + "language_loss": 0.69005466, + "learning_rate": 3.878525337815164e-06, + "loss": 0.71213776, + "num_input_tokens_seen": 49725715, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2298, + "time_per_iteration": 2.4894726276397705 + }, + { + "auxiliary_loss_clip": 0.01163842, + "auxiliary_loss_mlp": 0.0105021, + "balance_loss_clip": 1.03075552, + "balance_loss_mlp": 1.05287397, + "epoch": 0.1382233578836615, + "flos": 19244313394560.0, + "grad_norm": 1.7932718261070644, + "language_loss": 0.86958891, + "learning_rate": 3.878391639291116e-06, + "loss": 0.89172935, + "num_input_tokens_seen": 49744710, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.109375, + "step": 2299, + "time_per_iteration": 2.4343175888061523 + }, + { + "auxiliary_loss_clip": 0.01158457, + "auxiliary_loss_mlp": 0.01052611, + "balance_loss_clip": 1.03221393, + "balance_loss_mlp": 1.05076718, + "epoch": 0.1382834811363295, + "flos": 25666290195840.0, + "grad_norm": 2.6477233854648015, + "language_loss": 0.7542398, + "learning_rate": 3.878257869538267e-06, + "loss": 0.7763505, + "num_input_tokens_seen": 49764300, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.078125, + "step": 2300, + "time_per_iteration": 2.5398943424224854 + }, + { + "auxiliary_loss_clip": 0.01160789, + "auxiliary_loss_mlp": 0.01050356, + "balance_loss_clip": 1.03088915, + "balance_loss_mlp": 1.05409729, + "epoch": 0.13834360438899745, + "flos": 19784193788160.0, + "grad_norm": 2.6084363319634956, + "language_loss": 0.82612532, + "learning_rate": 3.878124028561692e-06, + "loss": 0.8482368, + "num_input_tokens_seen": 49778380, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0625, + "step": 2301, + "time_per_iteration": 2.435732841491699 + }, + { + "auxiliary_loss_clip": 0.01155849, + "auxiliary_loss_mlp": 0.01043651, + "balance_loss_clip": 1.02461374, + "balance_loss_mlp": 1.04986811, + "epoch": 0.13840372764166542, + "flos": 26651858544000.0, + "grad_norm": 2.0886382571109987, + "language_loss": 0.85972583, + "learning_rate": 3.877990116366466e-06, + "loss": 0.8817209, + "num_input_tokens_seen": 49797460, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0625, + "step": 2302, + "time_per_iteration": 2.504011869430542 + }, + { + "auxiliary_loss_clip": 0.01055451, + "auxiliary_loss_mlp": 0.01009124, + "balance_loss_clip": 1.00688314, + "balance_loss_mlp": 1.0189817, + "epoch": 0.13846385089433338, + "flos": 70510998286080.0, + "grad_norm": 0.7554932596602951, + "language_loss": 0.65648526, + "learning_rate": 3.877856132957667e-06, + "loss": 0.677131, + "num_input_tokens_seen": 49868005, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.36328125, + "step": 2303, + "time_per_iteration": 3.2563750743865967 + }, + { + "auxiliary_loss_clip": 0.0115535, + "auxiliary_loss_mlp": 0.01038432, + "balance_loss_clip": 1.01971662, + "balance_loss_mlp": 1.05022073, + "epoch": 0.13852397414700135, + "flos": 17348732956800.0, + "grad_norm": 2.0694955360834912, + "language_loss": 0.78234196, + "learning_rate": 3.877722078340374e-06, + "loss": 0.80427974, + "num_input_tokens_seen": 49885825, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2304, + "time_per_iteration": 2.461975574493408 + }, + { + "auxiliary_loss_clip": 0.01161783, + "auxiliary_loss_mlp": 0.01038842, + "balance_loss_clip": 1.01991165, + "balance_loss_mlp": 1.05225086, + "epoch": 0.13858409739966931, + "flos": 21543781334400.0, + "grad_norm": 1.838077080535218, + "language_loss": 0.77824223, + "learning_rate": 3.877587952519672e-06, + "loss": 0.8002485, + "num_input_tokens_seen": 49905975, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.09375, + "step": 2305, + "time_per_iteration": 2.468254804611206 + }, + { + "auxiliary_loss_clip": 0.0115424, + "auxiliary_loss_mlp": 0.01045678, + "balance_loss_clip": 1.02732027, + "balance_loss_mlp": 1.04923558, + "epoch": 0.13864422065233728, + "flos": 21579907438080.0, + "grad_norm": 3.2063314507866947, + "language_loss": 0.87484217, + "learning_rate": 3.877453755500647e-06, + "loss": 0.89684129, + "num_input_tokens_seen": 49925800, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2306, + "time_per_iteration": 2.4840242862701416 + }, + { + "auxiliary_loss_clip": 0.0105475, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.00995588, + "balance_loss_mlp": 1.01749539, + "epoch": 0.13870434390500527, + "flos": 53371156872960.0, + "grad_norm": 0.8793018572536648, + "language_loss": 0.59049129, + "learning_rate": 3.877319487288387e-06, + "loss": 0.6111598, + "num_input_tokens_seen": 49977620, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.37304688, + "step": 2307, + "time_per_iteration": 3.1098880767822266 + }, + { + "auxiliary_loss_clip": 0.01164649, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.0233345, + "balance_loss_mlp": 1.05279016, + "epoch": 0.13876446715767324, + "flos": 22565906749440.0, + "grad_norm": 1.7539420555734833, + "language_loss": 0.79683769, + "learning_rate": 3.877185147887984e-06, + "loss": 0.81892413, + "num_input_tokens_seen": 49996650, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1171875, + "step": 2308, + "time_per_iteration": 2.5119385719299316 + }, + { + "auxiliary_loss_clip": 0.01159668, + "auxiliary_loss_mlp": 0.01042487, + "balance_loss_clip": 1.02331865, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1388245904103412, + "flos": 20705231352960.0, + "grad_norm": 2.1876242684272342, + "language_loss": 0.78186178, + "learning_rate": 3.877050737304533e-06, + "loss": 0.80388331, + "num_input_tokens_seen": 50015640, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2309, + "time_per_iteration": 3.889902353286743 + }, + { + "auxiliary_loss_clip": 0.01164667, + "auxiliary_loss_mlp": 0.01044971, + "balance_loss_clip": 1.02517033, + "balance_loss_mlp": 1.05319023, + "epoch": 0.13888471366300917, + "flos": 20554729367040.0, + "grad_norm": 1.9671645437439387, + "language_loss": 0.67473733, + "learning_rate": 3.876916255543129e-06, + "loss": 0.69683367, + "num_input_tokens_seen": 50033500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.1171875, + "step": 2310, + "time_per_iteration": 5.331011056900024 + }, + { + "auxiliary_loss_clip": 0.01159907, + "auxiliary_loss_mlp": 0.01051301, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.0511837, + "epoch": 0.13894483691567713, + "flos": 13838033473920.0, + "grad_norm": 1.8339330301012977, + "language_loss": 0.83962393, + "learning_rate": 3.8767817026088725e-06, + "loss": 0.86173606, + "num_input_tokens_seen": 50050075, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0859375, + "step": 2311, + "time_per_iteration": 2.4287211894989014 + }, + { + "auxiliary_loss_clip": 0.01165031, + "auxiliary_loss_mlp": 0.01046165, + "balance_loss_clip": 1.02629256, + "balance_loss_mlp": 1.05262017, + "epoch": 0.1390049601683451, + "flos": 28031186759040.0, + "grad_norm": 2.2677083380951997, + "language_loss": 0.81788063, + "learning_rate": 3.876647078506866e-06, + "loss": 0.83999264, + "num_input_tokens_seen": 50070080, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.125, + "step": 2312, + "time_per_iteration": 2.5261852741241455 + }, + { + "auxiliary_loss_clip": 0.01165344, + "auxiliary_loss_mlp": 0.01045576, + "balance_loss_clip": 1.02634764, + "balance_loss_mlp": 1.05353236, + "epoch": 0.13906508342101306, + "flos": 26756860976640.0, + "grad_norm": 2.1868066623869202, + "language_loss": 0.86641061, + "learning_rate": 3.876512383242215e-06, + "loss": 0.88851982, + "num_input_tokens_seen": 50090040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.1171875, + "step": 2313, + "time_per_iteration": 2.491847515106201 + }, + { + "auxiliary_loss_clip": 0.0116138, + "auxiliary_loss_mlp": 0.01052556, + "balance_loss_clip": 1.03208828, + "balance_loss_mlp": 1.05377281, + "epoch": 0.13912520667368106, + "flos": 24535104111360.0, + "grad_norm": 2.199884337980412, + "language_loss": 0.79629153, + "learning_rate": 3.876377616820024e-06, + "loss": 0.8184309, + "num_input_tokens_seen": 50110595, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2314, + "time_per_iteration": 2.513545036315918 + }, + { + "auxiliary_loss_clip": 0.0116003, + "auxiliary_loss_mlp": 0.0104633, + "balance_loss_clip": 1.02668452, + "balance_loss_mlp": 1.05130863, + "epoch": 0.13918532992634902, + "flos": 19383215287680.0, + "grad_norm": 2.30759926974498, + "language_loss": 0.86246645, + "learning_rate": 3.876242779245409e-06, + "loss": 0.88453007, + "num_input_tokens_seen": 50125430, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0859375, + "step": 2315, + "time_per_iteration": 2.4236056804656982 + }, + { + "auxiliary_loss_clip": 0.01159066, + "auxiliary_loss_mlp": 0.01052564, + "balance_loss_clip": 1.03192866, + "balance_loss_mlp": 1.05146074, + "epoch": 0.139245453179017, + "flos": 21323756574720.0, + "grad_norm": 2.162038852448813, + "language_loss": 0.77074778, + "learning_rate": 3.876107870523477e-06, + "loss": 0.79286408, + "num_input_tokens_seen": 50144120, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.078125, + "step": 2316, + "time_per_iteration": 2.4574813842773438 + }, + { + "auxiliary_loss_clip": 0.01157842, + "auxiliary_loss_mlp": 0.01058721, + "balance_loss_clip": 1.03733492, + "balance_loss_mlp": 1.05045736, + "epoch": 0.13930557643168495, + "flos": 19500607912320.0, + "grad_norm": 1.6719823206156588, + "language_loss": 0.76972795, + "learning_rate": 3.875972890659349e-06, + "loss": 0.7918936, + "num_input_tokens_seen": 50162500, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.078125, + "step": 2317, + "time_per_iteration": 2.448096990585327 + }, + { + "auxiliary_loss_clip": 0.01162372, + "auxiliary_loss_mlp": 0.01049791, + "balance_loss_clip": 1.02993095, + "balance_loss_mlp": 1.05272126, + "epoch": 0.13936569968435292, + "flos": 25410821690880.0, + "grad_norm": 2.004328537884534, + "language_loss": 0.80159998, + "learning_rate": 3.875837839658139e-06, + "loss": 0.82372165, + "num_input_tokens_seen": 50182415, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2318, + "time_per_iteration": 2.5152556896209717 + }, + { + "auxiliary_loss_clip": 0.01050424, + "auxiliary_loss_mlp": 0.0100261, + "balance_loss_clip": 1.00027394, + "balance_loss_mlp": 1.01373565, + "epoch": 0.13942582293702088, + "flos": 70771063731840.0, + "grad_norm": 0.8654041988705774, + "language_loss": 0.59008324, + "learning_rate": 3.87570271752497e-06, + "loss": 0.61061358, + "num_input_tokens_seen": 50245160, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.3671875, + "step": 2319, + "time_per_iteration": 3.101083993911743 + }, + { + "auxiliary_loss_clip": 0.01162526, + "auxiliary_loss_mlp": 0.01053809, + "balance_loss_clip": 1.03365111, + "balance_loss_mlp": 1.05213809, + "epoch": 0.13948594618968888, + "flos": 35590885920000.0, + "grad_norm": 2.2307371496542356, + "language_loss": 0.65372109, + "learning_rate": 3.875567524264967e-06, + "loss": 0.67588449, + "num_input_tokens_seen": 50268215, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.109375, + "step": 2320, + "time_per_iteration": 2.580655336380005 + }, + { + "auxiliary_loss_clip": 0.01157047, + "auxiliary_loss_mlp": 0.01043656, + "balance_loss_clip": 1.02407002, + "balance_loss_mlp": 1.0507009, + "epoch": 0.13954606944235684, + "flos": 21105204272640.0, + "grad_norm": 1.6249908375914148, + "language_loss": 0.70695353, + "learning_rate": 3.875432259883256e-06, + "loss": 0.72896051, + "num_input_tokens_seen": 50288575, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2321, + "time_per_iteration": 2.4594380855560303 + }, + { + "auxiliary_loss_clip": 0.01158572, + "auxiliary_loss_mlp": 0.01055348, + "balance_loss_clip": 1.0345459, + "balance_loss_mlp": 1.04883599, + "epoch": 0.1396061926950248, + "flos": 25044425009280.0, + "grad_norm": 43.01057366099128, + "language_loss": 0.86161166, + "learning_rate": 3.875296924384965e-06, + "loss": 0.88375086, + "num_input_tokens_seen": 50308735, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2322, + "time_per_iteration": 2.4912750720977783 + }, + { + "auxiliary_loss_clip": 0.01152207, + "auxiliary_loss_mlp": 0.01056736, + "balance_loss_clip": 1.0373404, + "balance_loss_mlp": 1.04840016, + "epoch": 0.13966631594769277, + "flos": 37634023428480.0, + "grad_norm": 1.7187096085030618, + "language_loss": 0.6682983, + "learning_rate": 3.875161517775226e-06, + "loss": 0.69038773, + "num_input_tokens_seen": 50331025, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2323, + "time_per_iteration": 2.606084108352661 + }, + { + "auxiliary_loss_clip": 0.0116621, + "auxiliary_loss_mlp": 0.01051125, + "balance_loss_clip": 1.03068066, + "balance_loss_mlp": 1.05250573, + "epoch": 0.13972643920036074, + "flos": 16690993061760.0, + "grad_norm": 2.0268681764850665, + "language_loss": 0.89011461, + "learning_rate": 3.875026040059175e-06, + "loss": 0.91228795, + "num_input_tokens_seen": 50349725, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1328125, + "step": 2324, + "time_per_iteration": 2.458172559738159 + }, + { + "auxiliary_loss_clip": 0.01159494, + "auxiliary_loss_mlp": 0.01056649, + "balance_loss_clip": 1.03626466, + "balance_loss_mlp": 1.04949069, + "epoch": 0.1397865624530287, + "flos": 23331055288320.0, + "grad_norm": 4.4201897818475775, + "language_loss": 0.70700991, + "learning_rate": 3.8748904912419485e-06, + "loss": 0.7291714, + "num_input_tokens_seen": 50367965, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.1015625, + "step": 2325, + "time_per_iteration": 2.4608585834503174 + }, + { + "auxiliary_loss_clip": 0.01161715, + "auxiliary_loss_mlp": 0.01055057, + "balance_loss_clip": 1.03568554, + "balance_loss_mlp": 1.05384755, + "epoch": 0.13984668570569667, + "flos": 22778317825920.0, + "grad_norm": 1.8512202881484865, + "language_loss": 0.81165004, + "learning_rate": 3.874754871328688e-06, + "loss": 0.83381784, + "num_input_tokens_seen": 50385605, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.078125, + "step": 2326, + "time_per_iteration": 2.474729537963867 + }, + { + "auxiliary_loss_clip": 0.01155088, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02880073, + "balance_loss_mlp": 1.05092621, + "epoch": 0.13990680895836466, + "flos": 19464553635840.0, + "grad_norm": 1.806872548679543, + "language_loss": 0.88955671, + "learning_rate": 3.874619180324534e-06, + "loss": 0.9115777, + "num_input_tokens_seen": 50403985, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0390625, + "step": 2327, + "time_per_iteration": 2.4512577056884766 + }, + { + "auxiliary_loss_clip": 0.01155487, + "auxiliary_loss_mlp": 0.0105816, + "balance_loss_clip": 1.03790593, + "balance_loss_mlp": 1.05021226, + "epoch": 0.13996693221103262, + "flos": 20303283185280.0, + "grad_norm": 2.4750320646827992, + "language_loss": 0.85236871, + "learning_rate": 3.874483418234632e-06, + "loss": 0.87450516, + "num_input_tokens_seen": 50421590, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2328, + "time_per_iteration": 2.4724884033203125 + }, + { + "auxiliary_loss_clip": 0.01158673, + "auxiliary_loss_mlp": 0.0104927, + "balance_loss_clip": 1.02926636, + "balance_loss_mlp": 1.05120313, + "epoch": 0.1400270554637006, + "flos": 26617707688320.0, + "grad_norm": 1.653872228613324, + "language_loss": 0.74084997, + "learning_rate": 3.874347585064131e-06, + "loss": 0.76292944, + "num_input_tokens_seen": 50443945, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.078125, + "step": 2329, + "time_per_iteration": 2.5238442420959473 + }, + { + "auxiliary_loss_clip": 0.01156952, + "auxiliary_loss_mlp": 0.01050364, + "balance_loss_clip": 1.03070641, + "balance_loss_mlp": 1.04729962, + "epoch": 0.14008717871636855, + "flos": 19391475415680.0, + "grad_norm": 1.840223813628444, + "language_loss": 0.77969897, + "learning_rate": 3.874211680818183e-06, + "loss": 0.80177212, + "num_input_tokens_seen": 50462065, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.09375, + "step": 2330, + "time_per_iteration": 2.468606948852539 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01046075, + "balance_loss_clip": 1.02738333, + "balance_loss_mlp": 1.0495398, + "epoch": 0.14014730196903652, + "flos": 15304266645120.0, + "grad_norm": 2.6993483396219506, + "language_loss": 0.72030222, + "learning_rate": 3.87407570550194e-06, + "loss": 0.74232423, + "num_input_tokens_seen": 50479565, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0625, + "step": 2331, + "time_per_iteration": 2.504417896270752 + }, + { + "auxiliary_loss_clip": 0.01150975, + "auxiliary_loss_mlp": 0.01053113, + "balance_loss_clip": 1.03333664, + "balance_loss_mlp": 1.05008936, + "epoch": 0.14020742522170448, + "flos": 14939701557120.0, + "grad_norm": 1.585347596838152, + "language_loss": 0.72609055, + "learning_rate": 3.873939659120557e-06, + "loss": 0.74813151, + "num_input_tokens_seen": 50497305, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2332, + "time_per_iteration": 2.4244635105133057 + }, + { + "auxiliary_loss_clip": 0.01047328, + "auxiliary_loss_mlp": 0.01002801, + "balance_loss_clip": 1.00048828, + "balance_loss_mlp": 1.01059568, + "epoch": 0.14026754847437245, + "flos": 48824580044160.0, + "grad_norm": 0.8290843953692559, + "language_loss": 0.56071591, + "learning_rate": 3.873803541679196e-06, + "loss": 0.58121729, + "num_input_tokens_seen": 50549735, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.3671875, + "step": 2333, + "time_per_iteration": 2.8934712409973145 + }, + { + "auxiliary_loss_clip": 0.01155339, + "auxiliary_loss_mlp": 0.01046057, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.05001664, + "epoch": 0.14032767172704044, + "flos": 25773267876480.0, + "grad_norm": 1.7851490004805215, + "language_loss": 0.82529652, + "learning_rate": 3.873667353183016e-06, + "loss": 0.84731042, + "num_input_tokens_seen": 50570100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2334, + "time_per_iteration": 2.495786428451538 + }, + { + "auxiliary_loss_clip": 0.01155545, + "auxiliary_loss_mlp": 0.01048248, + "balance_loss_clip": 1.0293529, + "balance_loss_mlp": 1.05012262, + "epoch": 0.1403877949797084, + "flos": 21216312017280.0, + "grad_norm": 1.8251700419130605, + "language_loss": 0.81237197, + "learning_rate": 3.8735310936371825e-06, + "loss": 0.83440989, + "num_input_tokens_seen": 50589185, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2335, + "time_per_iteration": 2.483055591583252 + }, + { + "auxiliary_loss_clip": 0.01163426, + "auxiliary_loss_mlp": 0.01051429, + "balance_loss_clip": 1.02829087, + "balance_loss_mlp": 1.05328035, + "epoch": 0.14044791823237637, + "flos": 22747973811840.0, + "grad_norm": 1.83822789048078, + "language_loss": 0.82159901, + "learning_rate": 3.873394763046862e-06, + "loss": 0.8437475, + "num_input_tokens_seen": 50609645, + "router_z_loss_clip": 0.23144531, + "router_z_loss_mlp": 1.1015625, + "step": 2336, + "time_per_iteration": 2.4732770919799805 + }, + { + "auxiliary_loss_clip": 0.01157668, + "auxiliary_loss_mlp": 0.01044768, + "balance_loss_clip": 1.02526581, + "balance_loss_mlp": 1.05202782, + "epoch": 0.14050804148504434, + "flos": 22964443125120.0, + "grad_norm": 1.8506426201256954, + "language_loss": 0.80081403, + "learning_rate": 3.873258361417225e-06, + "loss": 0.82283843, + "num_input_tokens_seen": 50628385, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2337, + "time_per_iteration": 2.4599671363830566 + }, + { + "auxiliary_loss_clip": 0.01155582, + "auxiliary_loss_mlp": 0.0104864, + "balance_loss_clip": 1.02911353, + "balance_loss_mlp": 1.04861474, + "epoch": 0.1405681647377123, + "flos": 22200336080640.0, + "grad_norm": 2.2474896580124963, + "language_loss": 0.7927807, + "learning_rate": 3.873121888753442e-06, + "loss": 0.81482291, + "num_input_tokens_seen": 50647260, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0703125, + "step": 2338, + "time_per_iteration": 2.4892208576202393 + }, + { + "auxiliary_loss_clip": 0.01165053, + "auxiliary_loss_mlp": 0.01046329, + "balance_loss_clip": 1.02577746, + "balance_loss_mlp": 1.05685067, + "epoch": 0.14062828799038027, + "flos": 23732787974400.0, + "grad_norm": 2.148660398501072, + "language_loss": 0.79827893, + "learning_rate": 3.87298534506069e-06, + "loss": 0.82039273, + "num_input_tokens_seen": 50666130, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2339, + "time_per_iteration": 2.4672555923461914 + }, + { + "auxiliary_loss_clip": 0.01159986, + "auxiliary_loss_mlp": 0.01055012, + "balance_loss_clip": 1.03506875, + "balance_loss_mlp": 1.0527122, + "epoch": 0.14068841124304826, + "flos": 39202493685120.0, + "grad_norm": 1.7979240482106922, + "language_loss": 0.6582588, + "learning_rate": 3.872848730344146e-06, + "loss": 0.68040884, + "num_input_tokens_seen": 50687440, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2340, + "time_per_iteration": 2.614506483078003 + }, + { + "auxiliary_loss_clip": 0.01155238, + "auxiliary_loss_mlp": 0.01048788, + "balance_loss_clip": 1.02936912, + "balance_loss_mlp": 1.05242825, + "epoch": 0.14074853449571623, + "flos": 20192283181440.0, + "grad_norm": 2.5431372850663334, + "language_loss": 0.78670812, + "learning_rate": 3.87271204460899e-06, + "loss": 0.80874836, + "num_input_tokens_seen": 50704030, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2341, + "time_per_iteration": 2.4420077800750732 + }, + { + "auxiliary_loss_clip": 0.01156345, + "auxiliary_loss_mlp": 0.01050043, + "balance_loss_clip": 1.03058767, + "balance_loss_mlp": 1.05246425, + "epoch": 0.1408086577483842, + "flos": 18405871153920.0, + "grad_norm": 11.570217446637303, + "language_loss": 0.80154169, + "learning_rate": 3.8725752878604066e-06, + "loss": 0.82360554, + "num_input_tokens_seen": 50723305, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2342, + "time_per_iteration": 2.4961190223693848 + }, + { + "auxiliary_loss_clip": 0.01159304, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02486777, + "balance_loss_mlp": 1.05673313, + "epoch": 0.14086878100105216, + "flos": 25264593423360.0, + "grad_norm": 1.9358851833739352, + "language_loss": 0.77974075, + "learning_rate": 3.87243846010358e-06, + "loss": 0.80176884, + "num_input_tokens_seen": 50743270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2343, + "time_per_iteration": 2.479679584503174 + }, + { + "auxiliary_loss_clip": 0.01049361, + "auxiliary_loss_mlp": 0.0100492, + "balance_loss_clip": 1.0025475, + "balance_loss_mlp": 1.01255798, + "epoch": 0.14092890425372012, + "flos": 65978388869760.0, + "grad_norm": 0.8341361150670269, + "language_loss": 0.6155628, + "learning_rate": 3.872301561343699e-06, + "loss": 0.63610566, + "num_input_tokens_seen": 50802710, + "router_z_loss_clip": 0.02368164, + "router_z_loss_mlp": 0.3671875, + "step": 2344, + "time_per_iteration": 3.048691987991333 + }, + { + "auxiliary_loss_clip": 0.01151538, + "auxiliary_loss_mlp": 0.01040868, + "balance_loss_clip": 1.02309346, + "balance_loss_mlp": 1.04911709, + "epoch": 0.1409890275063881, + "flos": 23694973931520.0, + "grad_norm": 1.886714907416039, + "language_loss": 0.64591062, + "learning_rate": 3.872164591585956e-06, + "loss": 0.6678347, + "num_input_tokens_seen": 50822625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0234375, + "step": 2345, + "time_per_iteration": 2.509552240371704 + }, + { + "auxiliary_loss_clip": 0.01162324, + "auxiliary_loss_mlp": 0.01044831, + "balance_loss_clip": 1.023803, + "balance_loss_mlp": 1.05019534, + "epoch": 0.14104915075905605, + "flos": 23623152687360.0, + "grad_norm": 2.502398022219224, + "language_loss": 0.736485, + "learning_rate": 3.8720275508355435e-06, + "loss": 0.7585566, + "num_input_tokens_seen": 50842330, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1171875, + "step": 2346, + "time_per_iteration": 2.4962430000305176 + }, + { + "auxiliary_loss_clip": 0.01160187, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.02600074, + "balance_loss_mlp": 1.05144429, + "epoch": 0.14110927401172405, + "flos": 20595165102720.0, + "grad_norm": 2.4324488814849703, + "language_loss": 0.77868927, + "learning_rate": 3.8718904390976585e-06, + "loss": 0.80075288, + "num_input_tokens_seen": 50861035, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2347, + "time_per_iteration": 2.4663050174713135 + }, + { + "auxiliary_loss_clip": 0.01155281, + "auxiliary_loss_mlp": 0.01046804, + "balance_loss_clip": 1.02852941, + "balance_loss_mlp": 1.04918981, + "epoch": 0.141169397264392, + "flos": 28548049512960.0, + "grad_norm": 1.7514485331985392, + "language_loss": 0.76446569, + "learning_rate": 3.8717532563775e-06, + "loss": 0.78648651, + "num_input_tokens_seen": 50880105, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0625, + "step": 2348, + "time_per_iteration": 2.508002758026123 + }, + { + "auxiliary_loss_clip": 0.0115565, + "auxiliary_loss_mlp": 0.01043027, + "balance_loss_clip": 1.02346444, + "balance_loss_mlp": 1.0508523, + "epoch": 0.14122952051705998, + "flos": 17092258871040.0, + "grad_norm": 1.8350283773112115, + "language_loss": 0.8686446, + "learning_rate": 3.871616002680272e-06, + "loss": 0.89063132, + "num_input_tokens_seen": 50897720, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2349, + "time_per_iteration": 2.4446985721588135 + }, + { + "auxiliary_loss_clip": 0.01156083, + "auxiliary_loss_mlp": 0.0104419, + "balance_loss_clip": 1.02478313, + "balance_loss_mlp": 1.05220377, + "epoch": 0.14128964376972794, + "flos": 28946801370240.0, + "grad_norm": 1.7285118920158233, + "language_loss": 0.8895669, + "learning_rate": 3.871478678011177e-06, + "loss": 0.9115696, + "num_input_tokens_seen": 50918385, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2350, + "time_per_iteration": 2.49725341796875 + }, + { + "auxiliary_loss_clip": 0.0115943, + "auxiliary_loss_mlp": 0.01046195, + "balance_loss_clip": 1.02542889, + "balance_loss_mlp": 1.05281878, + "epoch": 0.1413497670223959, + "flos": 18989778643200.0, + "grad_norm": 1.8656651100546833, + "language_loss": 0.814816, + "learning_rate": 3.871341282375423e-06, + "loss": 0.83687228, + "num_input_tokens_seen": 50938270, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0703125, + "step": 2351, + "time_per_iteration": 3.941416025161743 + }, + { + "auxiliary_loss_clip": 0.01157242, + "auxiliary_loss_mlp": 0.01040097, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.05032706, + "epoch": 0.14140989027506387, + "flos": 29862236413440.0, + "grad_norm": 2.6782915885510286, + "language_loss": 0.82935351, + "learning_rate": 3.871203815778219e-06, + "loss": 0.85132694, + "num_input_tokens_seen": 50958155, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0703125, + "step": 2352, + "time_per_iteration": 5.431722640991211 + }, + { + "auxiliary_loss_clip": 0.01047453, + "auxiliary_loss_mlp": 0.01006216, + "balance_loss_clip": 1.00387907, + "balance_loss_mlp": 1.01053333, + "epoch": 0.14147001352773186, + "flos": 62079532041600.0, + "grad_norm": 0.90864091090638, + "language_loss": 0.61894125, + "learning_rate": 3.87106627822478e-06, + "loss": 0.63947791, + "num_input_tokens_seen": 51020705, + "router_z_loss_clip": 0.02331543, + "router_z_loss_mlp": 0.36914062, + "step": 2353, + "time_per_iteration": 3.0071640014648438 + }, + { + "auxiliary_loss_clip": 0.01154516, + "auxiliary_loss_mlp": 0.01047207, + "balance_loss_clip": 1.02807426, + "balance_loss_mlp": 1.05024958, + "epoch": 0.14153013678039983, + "flos": 22017514832640.0, + "grad_norm": 1.8535903324814498, + "language_loss": 0.87264848, + "learning_rate": 3.8709286697203196e-06, + "loss": 0.89466572, + "num_input_tokens_seen": 51039995, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2354, + "time_per_iteration": 2.4613726139068604 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02607965, + "balance_loss_mlp": 1.04953241, + "epoch": 0.1415902600330678, + "flos": 19720093968000.0, + "grad_norm": 1.9651075901387003, + "language_loss": 0.74872321, + "learning_rate": 3.870790990270057e-06, + "loss": 0.77075779, + "num_input_tokens_seen": 51059075, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.078125, + "step": 2355, + "time_per_iteration": 2.442379951477051 + }, + { + "auxiliary_loss_clip": 0.01047047, + "auxiliary_loss_mlp": 0.01002716, + "balance_loss_clip": 1.00052261, + "balance_loss_mlp": 1.01023293, + "epoch": 0.14165038328573576, + "flos": 65900929190400.0, + "grad_norm": 0.6790475533637321, + "language_loss": 0.5182299, + "learning_rate": 3.870653239879212e-06, + "loss": 0.53872752, + "num_input_tokens_seen": 51120380, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2356, + "time_per_iteration": 2.9892258644104004 + }, + { + "auxiliary_loss_clip": 0.01156071, + "auxiliary_loss_mlp": 0.01053896, + "balance_loss_clip": 1.03495359, + "balance_loss_mlp": 1.05080867, + "epoch": 0.14171050653840372, + "flos": 12130158533760.0, + "grad_norm": 3.0630792396255053, + "language_loss": 0.70576489, + "learning_rate": 3.8705154185530095e-06, + "loss": 0.72786456, + "num_input_tokens_seen": 51136950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0546875, + "step": 2357, + "time_per_iteration": 2.421844005584717 + }, + { + "auxiliary_loss_clip": 0.0116013, + "auxiliary_loss_mlp": 0.01050288, + "balance_loss_clip": 1.03169179, + "balance_loss_mlp": 1.05012453, + "epoch": 0.1417706297910717, + "flos": 20412487509120.0, + "grad_norm": 1.8720076771552743, + "language_loss": 0.82205695, + "learning_rate": 3.870377526296674e-06, + "loss": 0.84416115, + "num_input_tokens_seen": 51155175, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.09375, + "step": 2358, + "time_per_iteration": 2.4519011974334717 + }, + { + "auxiliary_loss_clip": 0.01160902, + "auxiliary_loss_mlp": 0.01047176, + "balance_loss_clip": 1.02663624, + "balance_loss_mlp": 1.051018, + "epoch": 0.14183075304373965, + "flos": 22380607463040.0, + "grad_norm": 6.439592826280342, + "language_loss": 0.7129705, + "learning_rate": 3.870239563115436e-06, + "loss": 0.73505127, + "num_input_tokens_seen": 51174500, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.1015625, + "step": 2359, + "time_per_iteration": 2.4797613620758057 + }, + { + "auxiliary_loss_clip": 0.0115558, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02374041, + "balance_loss_mlp": 1.04988599, + "epoch": 0.14189087629640765, + "flos": 21580913018880.0, + "grad_norm": 5.514404455287625, + "language_loss": 0.76040578, + "learning_rate": 3.870101529014526e-06, + "loss": 0.78239685, + "num_input_tokens_seen": 51194270, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2360, + "time_per_iteration": 2.4538815021514893 + }, + { + "auxiliary_loss_clip": 0.011559, + "auxiliary_loss_mlp": 0.01041926, + "balance_loss_clip": 1.02173233, + "balance_loss_mlp": 1.05221295, + "epoch": 0.1419509995490756, + "flos": 20008564093440.0, + "grad_norm": 2.1535632205539135, + "language_loss": 0.8188749, + "learning_rate": 3.869963423999178e-06, + "loss": 0.84085315, + "num_input_tokens_seen": 51211850, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2361, + "time_per_iteration": 2.4411346912384033 + }, + { + "auxiliary_loss_clip": 0.01152529, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.02826524, + "balance_loss_mlp": 1.04964995, + "epoch": 0.14201112280174358, + "flos": 31941464112000.0, + "grad_norm": 2.775663525053056, + "language_loss": 0.74489617, + "learning_rate": 3.86982524807463e-06, + "loss": 0.76689464, + "num_input_tokens_seen": 51233545, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2362, + "time_per_iteration": 2.530163049697876 + }, + { + "auxiliary_loss_clip": 0.01158195, + "auxiliary_loss_mlp": 0.01046424, + "balance_loss_clip": 1.0265274, + "balance_loss_mlp": 1.05187464, + "epoch": 0.14207124605441154, + "flos": 41464147582080.0, + "grad_norm": 4.478599792998506, + "language_loss": 0.73748112, + "learning_rate": 3.869687001246122e-06, + "loss": 0.75952733, + "num_input_tokens_seen": 51257615, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2363, + "time_per_iteration": 2.646651029586792 + }, + { + "auxiliary_loss_clip": 0.01156109, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.02605534, + "balance_loss_mlp": 1.05005693, + "epoch": 0.1421313693070795, + "flos": 31905086613120.0, + "grad_norm": 1.8353407682080387, + "language_loss": 0.72971261, + "learning_rate": 3.8695486835188946e-06, + "loss": 0.75172973, + "num_input_tokens_seen": 51279645, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2364, + "time_per_iteration": 2.5670576095581055 + }, + { + "auxiliary_loss_clip": 0.01152213, + "auxiliary_loss_mlp": 0.01048707, + "balance_loss_clip": 1.031183, + "balance_loss_mlp": 1.05015445, + "epoch": 0.14219149255974747, + "flos": 26871165031680.0, + "grad_norm": 4.452075303519762, + "language_loss": 0.90230036, + "learning_rate": 3.869410294898195e-06, + "loss": 0.92430955, + "num_input_tokens_seen": 51299775, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 1.015625, + "step": 2365, + "time_per_iteration": 2.5130062103271484 + }, + { + "auxiliary_loss_clip": 0.01155172, + "auxiliary_loss_mlp": 0.0104726, + "balance_loss_clip": 1.02735198, + "balance_loss_mlp": 1.04896259, + "epoch": 0.14225161581241544, + "flos": 27454426076160.0, + "grad_norm": 1.956458588852685, + "language_loss": 0.65377176, + "learning_rate": 3.869271835389268e-06, + "loss": 0.67579615, + "num_input_tokens_seen": 51319430, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2366, + "time_per_iteration": 2.5081095695495605 + }, + { + "auxiliary_loss_clip": 0.01152693, + "auxiliary_loss_mlp": 0.01056429, + "balance_loss_clip": 1.03640223, + "balance_loss_mlp": 1.04979372, + "epoch": 0.14231173906508343, + "flos": 10561436881920.0, + "grad_norm": 2.190613479881076, + "language_loss": 0.80414236, + "learning_rate": 3.8691333049973665e-06, + "loss": 0.82623357, + "num_input_tokens_seen": 51336045, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2367, + "time_per_iteration": 2.4398317337036133 + }, + { + "auxiliary_loss_clip": 0.01158941, + "auxiliary_loss_mlp": 0.01054295, + "balance_loss_clip": 1.0333972, + "balance_loss_mlp": 1.05221498, + "epoch": 0.1423718623177514, + "flos": 28360882719360.0, + "grad_norm": 2.898581267606924, + "language_loss": 0.82619941, + "learning_rate": 3.868994703727742e-06, + "loss": 0.84833181, + "num_input_tokens_seen": 51357030, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2368, + "time_per_iteration": 2.512401580810547 + }, + { + "auxiliary_loss_clip": 0.01157439, + "auxiliary_loss_mlp": 0.01050054, + "balance_loss_clip": 1.0298835, + "balance_loss_mlp": 1.05165803, + "epoch": 0.14243198557041936, + "flos": 19354235990400.0, + "grad_norm": 2.7587049982231675, + "language_loss": 0.86971414, + "learning_rate": 3.868856031585652e-06, + "loss": 0.89178908, + "num_input_tokens_seen": 51374890, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0625, + "step": 2369, + "time_per_iteration": 2.444784164428711 + }, + { + "auxiliary_loss_clip": 0.01158905, + "auxiliary_loss_mlp": 0.01042779, + "balance_loss_clip": 1.02303767, + "balance_loss_mlp": 1.04913163, + "epoch": 0.14249210882308733, + "flos": 28806857982720.0, + "grad_norm": 1.4370193327140612, + "language_loss": 0.75704634, + "learning_rate": 3.868717288576354e-06, + "loss": 0.77906322, + "num_input_tokens_seen": 51398100, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2370, + "time_per_iteration": 2.527740240097046 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01058445, + "balance_loss_clip": 1.0384295, + "balance_loss_mlp": 1.04879546, + "epoch": 0.1425522320757553, + "flos": 21835016807040.0, + "grad_norm": 1.7319048865171518, + "language_loss": 0.82923144, + "learning_rate": 3.868578474705109e-06, + "loss": 0.85136044, + "num_input_tokens_seen": 51418745, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2371, + "time_per_iteration": 2.4644808769226074 + }, + { + "auxiliary_loss_clip": 0.01158835, + "auxiliary_loss_mlp": 0.01051346, + "balance_loss_clip": 1.03171265, + "balance_loss_mlp": 1.05157602, + "epoch": 0.14261235532842326, + "flos": 17311457617920.0, + "grad_norm": 1.956158386855541, + "language_loss": 0.82575452, + "learning_rate": 3.868439589977181e-06, + "loss": 0.84785628, + "num_input_tokens_seen": 51437455, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0703125, + "step": 2372, + "time_per_iteration": 2.42240047454834 + }, + { + "auxiliary_loss_clip": 0.01157732, + "auxiliary_loss_mlp": 0.01051664, + "balance_loss_clip": 1.03175569, + "balance_loss_mlp": 1.05134308, + "epoch": 0.14267247858109125, + "flos": 18806741913600.0, + "grad_norm": 2.19442784605527, + "language_loss": 0.8396256, + "learning_rate": 3.868300634397836e-06, + "loss": 0.86171949, + "num_input_tokens_seen": 51455710, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2373, + "time_per_iteration": 2.444695472717285 + }, + { + "auxiliary_loss_clip": 0.01154816, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03294528, + "balance_loss_mlp": 1.05012143, + "epoch": 0.14273260183375922, + "flos": 11358904682880.0, + "grad_norm": 2.034088541649992, + "language_loss": 0.86271042, + "learning_rate": 3.8681616079723445e-06, + "loss": 0.88476801, + "num_input_tokens_seen": 51471270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.046875, + "step": 2374, + "time_per_iteration": 2.428062915802002 + }, + { + "auxiliary_loss_clip": 0.01161306, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03024197, + "balance_loss_mlp": 1.05125451, + "epoch": 0.14279272508642718, + "flos": 27567688636800.0, + "grad_norm": 4.612229602439842, + "language_loss": 0.7919687, + "learning_rate": 3.868022510705977e-06, + "loss": 0.81409162, + "num_input_tokens_seen": 51492705, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2375, + "time_per_iteration": 2.526838541030884 + }, + { + "auxiliary_loss_clip": 0.01157834, + "auxiliary_loss_mlp": 0.01056869, + "balance_loss_clip": 1.03655601, + "balance_loss_mlp": 1.05240607, + "epoch": 0.14285284833909515, + "flos": 16252559654400.0, + "grad_norm": 2.386247922788535, + "language_loss": 0.76400912, + "learning_rate": 3.867883342604009e-06, + "loss": 0.78615618, + "num_input_tokens_seen": 51510780, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2376, + "time_per_iteration": 2.4554591178894043 + }, + { + "auxiliary_loss_clip": 0.01156552, + "auxiliary_loss_mlp": 0.0104949, + "balance_loss_clip": 1.02995205, + "balance_loss_mlp": 1.05075741, + "epoch": 0.1429129715917631, + "flos": 19755609540480.0, + "grad_norm": 2.9035160782842753, + "language_loss": 0.93037754, + "learning_rate": 3.867744103671717e-06, + "loss": 0.952438, + "num_input_tokens_seen": 51531400, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0625, + "step": 2377, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01157682, + "auxiliary_loss_mlp": 0.01051526, + "balance_loss_clip": 1.02991319, + "balance_loss_mlp": 1.05085003, + "epoch": 0.14297309484443108, + "flos": 21137092571520.0, + "grad_norm": 1.9751577144221115, + "language_loss": 0.91598773, + "learning_rate": 3.867604793914382e-06, + "loss": 0.93807983, + "num_input_tokens_seen": 51548215, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.0703125, + "step": 2378, + "time_per_iteration": 2.558563470840454 + }, + { + "auxiliary_loss_clip": 0.01159674, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02410531, + "balance_loss_mlp": 1.051296, + "epoch": 0.14303321809709904, + "flos": 23586667447680.0, + "grad_norm": 1.745891074970689, + "language_loss": 0.73947102, + "learning_rate": 3.8674654133372864e-06, + "loss": 0.76151079, + "num_input_tokens_seen": 51566820, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0859375, + "step": 2379, + "time_per_iteration": 2.511359214782715 + }, + { + "auxiliary_loss_clip": 0.01156473, + "auxiliary_loss_mlp": 0.01056109, + "balance_loss_clip": 1.03636849, + "balance_loss_mlp": 1.05014992, + "epoch": 0.14309334134976703, + "flos": 15888281875200.0, + "grad_norm": 1.8640465231226504, + "language_loss": 0.79013336, + "learning_rate": 3.867325961945714e-06, + "loss": 0.81225914, + "num_input_tokens_seen": 51585075, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2380, + "time_per_iteration": 2.466219663619995 + }, + { + "auxiliary_loss_clip": 0.01162977, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03124452, + "balance_loss_mlp": 1.05528164, + "epoch": 0.143153464602435, + "flos": 16325601960960.0, + "grad_norm": 2.3244590707621073, + "language_loss": 0.87958229, + "learning_rate": 3.867186439744955e-06, + "loss": 0.90172088, + "num_input_tokens_seen": 51603185, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.078125, + "step": 2381, + "time_per_iteration": 2.4476850032806396 + }, + { + "auxiliary_loss_clip": 0.01156941, + "auxiliary_loss_mlp": 0.01051059, + "balance_loss_clip": 1.03084123, + "balance_loss_mlp": 1.0517571, + "epoch": 0.14321358785510296, + "flos": 17092079303040.0, + "grad_norm": 2.599935932772449, + "language_loss": 0.76852649, + "learning_rate": 3.867046846740299e-06, + "loss": 0.7906065, + "num_input_tokens_seen": 51620880, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2382, + "time_per_iteration": 2.4389045238494873 + }, + { + "auxiliary_loss_clip": 0.01157847, + "auxiliary_loss_mlp": 0.01053474, + "balance_loss_clip": 1.03338671, + "balance_loss_mlp": 1.05068171, + "epoch": 0.14327371110777093, + "flos": 26322916769280.0, + "grad_norm": 2.461149819336849, + "language_loss": 0.76948071, + "learning_rate": 3.866907182937039e-06, + "loss": 0.79159391, + "num_input_tokens_seen": 51640170, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0703125, + "step": 2383, + "time_per_iteration": 2.516038179397583 + }, + { + "auxiliary_loss_clip": 0.01158748, + "auxiliary_loss_mlp": 0.01050878, + "balance_loss_clip": 1.0299803, + "balance_loss_mlp": 1.05114412, + "epoch": 0.1433338343604389, + "flos": 18076462502400.0, + "grad_norm": 2.169581662424978, + "language_loss": 0.88202822, + "learning_rate": 3.866767448340471e-06, + "loss": 0.9041245, + "num_input_tokens_seen": 51656580, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.078125, + "step": 2384, + "time_per_iteration": 2.42138934135437 + }, + { + "auxiliary_loss_clip": 0.01164338, + "auxiliary_loss_mlp": 0.01049242, + "balance_loss_clip": 1.02780819, + "balance_loss_mlp": 1.05382657, + "epoch": 0.14339395761310686, + "flos": 15522783033600.0, + "grad_norm": 4.175812514986151, + "language_loss": 0.79225606, + "learning_rate": 3.866627642955895e-06, + "loss": 0.81439185, + "num_input_tokens_seen": 51674645, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.109375, + "step": 2385, + "time_per_iteration": 2.4439244270324707 + }, + { + "auxiliary_loss_clip": 0.01156029, + "auxiliary_loss_mlp": 0.01046717, + "balance_loss_clip": 1.02692771, + "balance_loss_mlp": 1.04881537, + "epoch": 0.14345408086577485, + "flos": 28548767784960.0, + "grad_norm": 1.9672730758223058, + "language_loss": 0.74989617, + "learning_rate": 3.866487766788612e-06, + "loss": 0.77192366, + "num_input_tokens_seen": 51695770, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2386, + "time_per_iteration": 2.533304214477539 + }, + { + "auxiliary_loss_clip": 0.01159067, + "auxiliary_loss_mlp": 0.01047419, + "balance_loss_clip": 1.02777338, + "balance_loss_mlp": 1.05180025, + "epoch": 0.14351420411844282, + "flos": 20230061310720.0, + "grad_norm": 2.5174427688568626, + "language_loss": 0.78475344, + "learning_rate": 3.866347819843925e-06, + "loss": 0.80681831, + "num_input_tokens_seen": 51714165, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0703125, + "step": 2387, + "time_per_iteration": 2.4568724632263184 + }, + { + "auxiliary_loss_clip": 0.01157837, + "auxiliary_loss_mlp": 0.010548, + "balance_loss_clip": 1.03389072, + "balance_loss_mlp": 1.05092847, + "epoch": 0.14357432737111078, + "flos": 19865029345920.0, + "grad_norm": 2.559937991009886, + "language_loss": 0.82087159, + "learning_rate": 3.866207802127143e-06, + "loss": 0.84299791, + "num_input_tokens_seen": 51734440, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0703125, + "step": 2388, + "time_per_iteration": 2.5136237144470215 + }, + { + "auxiliary_loss_clip": 0.01161514, + "auxiliary_loss_mlp": 0.01044981, + "balance_loss_clip": 1.02633715, + "balance_loss_mlp": 1.05393136, + "epoch": 0.14363445062377875, + "flos": 28256814040320.0, + "grad_norm": 2.471836270672028, + "language_loss": 0.82267237, + "learning_rate": 3.866067713643573e-06, + "loss": 0.84473729, + "num_input_tokens_seen": 51753730, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.078125, + "step": 2389, + "time_per_iteration": 2.576352119445801 + }, + { + "auxiliary_loss_clip": 0.01161426, + "auxiliary_loss_mlp": 0.01051291, + "balance_loss_clip": 1.03020322, + "balance_loss_mlp": 1.05032301, + "epoch": 0.1436945738764467, + "flos": 18186672407040.0, + "grad_norm": 2.165584666776674, + "language_loss": 0.82654548, + "learning_rate": 3.8659275543985285e-06, + "loss": 0.84867263, + "num_input_tokens_seen": 51771195, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.109375, + "step": 2390, + "time_per_iteration": 2.5145435333251953 + }, + { + "auxiliary_loss_clip": 0.01158378, + "auxiliary_loss_mlp": 0.01054186, + "balance_loss_clip": 1.03406334, + "balance_loss_mlp": 1.0510571, + "epoch": 0.14375469712911468, + "flos": 27307910499840.0, + "grad_norm": 3.0575281215329086, + "language_loss": 0.74616158, + "learning_rate": 3.865787324397324e-06, + "loss": 0.76828718, + "num_input_tokens_seen": 51792290, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.078125, + "step": 2391, + "time_per_iteration": 2.5368545055389404 + }, + { + "auxiliary_loss_clip": 0.01050934, + "auxiliary_loss_mlp": 0.01014282, + "balance_loss_clip": 1.0121367, + "balance_loss_mlp": 1.01461065, + "epoch": 0.14381482038178264, + "flos": 56891445287040.0, + "grad_norm": 0.8732258813949081, + "language_loss": 0.61769497, + "learning_rate": 3.865647023645277e-06, + "loss": 0.63834715, + "num_input_tokens_seen": 51843675, + "router_z_loss_clip": 0.02148438, + "router_z_loss_mlp": 0.36328125, + "step": 2392, + "time_per_iteration": 2.9315476417541504 + }, + { + "auxiliary_loss_clip": 0.01161818, + "auxiliary_loss_mlp": 0.01056559, + "balance_loss_clip": 1.03449333, + "balance_loss_mlp": 1.04981267, + "epoch": 0.14387494363445064, + "flos": 14282177143680.0, + "grad_norm": 2.638581894381379, + "language_loss": 0.76172751, + "learning_rate": 3.865506652147709e-06, + "loss": 0.78391123, + "num_input_tokens_seen": 51860285, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.1171875, + "step": 2393, + "time_per_iteration": 3.857799530029297 + }, + { + "auxiliary_loss_clip": 0.01161345, + "auxiliary_loss_mlp": 0.01049065, + "balance_loss_clip": 1.02908611, + "balance_loss_mlp": 1.05249143, + "epoch": 0.1439350668871186, + "flos": 26761493831040.0, + "grad_norm": 1.8778469598095298, + "language_loss": 0.76782668, + "learning_rate": 3.865366209909941e-06, + "loss": 0.78993082, + "num_input_tokens_seen": 51880105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.09375, + "step": 2394, + "time_per_iteration": 3.979130983352661 + }, + { + "auxiliary_loss_clip": 0.01158023, + "auxiliary_loss_mlp": 0.01048603, + "balance_loss_clip": 1.02836156, + "balance_loss_mlp": 1.05062532, + "epoch": 0.14399519013978657, + "flos": 40700040537600.0, + "grad_norm": 1.605706810552395, + "language_loss": 0.85831755, + "learning_rate": 3.8652256969372994e-06, + "loss": 0.88038385, + "num_input_tokens_seen": 51905175, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.078125, + "step": 2395, + "time_per_iteration": 2.652092933654785 + }, + { + "auxiliary_loss_clip": 0.01157831, + "auxiliary_loss_mlp": 0.01049814, + "balance_loss_clip": 1.03040648, + "balance_loss_mlp": 1.05241179, + "epoch": 0.14405531339245453, + "flos": 20557530627840.0, + "grad_norm": 1.5230484666362787, + "language_loss": 0.82984561, + "learning_rate": 3.865085113235113e-06, + "loss": 0.85192204, + "num_input_tokens_seen": 51924490, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0546875, + "step": 2396, + "time_per_iteration": 2.4647467136383057 + }, + { + "auxiliary_loss_clip": 0.01152766, + "auxiliary_loss_mlp": 0.01046059, + "balance_loss_clip": 1.02691364, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1441154366451225, + "flos": 19572931946880.0, + "grad_norm": 2.435366869769497, + "language_loss": 0.82564163, + "learning_rate": 3.864944458808712e-06, + "loss": 0.8476299, + "num_input_tokens_seen": 51940490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2397, + "time_per_iteration": 2.4151055812835693 + }, + { + "auxiliary_loss_clip": 0.01161338, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.02689052, + "balance_loss_mlp": 1.05216622, + "epoch": 0.14417555989779046, + "flos": 18515721922560.0, + "grad_norm": 1.6104109289920625, + "language_loss": 0.79418427, + "learning_rate": 3.86480373366343e-06, + "loss": 0.81627429, + "num_input_tokens_seen": 51957910, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.09375, + "step": 2398, + "time_per_iteration": 2.4470388889312744 + }, + { + "auxiliary_loss_clip": 0.01158929, + "auxiliary_loss_mlp": 0.01052066, + "balance_loss_clip": 1.03246808, + "balance_loss_mlp": 1.05359757, + "epoch": 0.14423568315045843, + "flos": 26031681296640.0, + "grad_norm": 2.7500042291552433, + "language_loss": 0.64847696, + "learning_rate": 3.864662937804603e-06, + "loss": 0.67058688, + "num_input_tokens_seen": 51978010, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2399, + "time_per_iteration": 2.5123891830444336 + }, + { + "auxiliary_loss_clip": 0.01157977, + "auxiliary_loss_mlp": 0.01044487, + "balance_loss_clip": 1.02472198, + "balance_loss_mlp": 1.05306005, + "epoch": 0.14429580640312642, + "flos": 21288743792640.0, + "grad_norm": 1.4896130870957418, + "language_loss": 0.82329226, + "learning_rate": 3.864522071237571e-06, + "loss": 0.84531689, + "num_input_tokens_seen": 51998515, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2400, + "time_per_iteration": 2.4825797080993652 + }, + { + "auxiliary_loss_clip": 0.01162323, + "auxiliary_loss_mlp": 0.01052957, + "balance_loss_clip": 1.03165436, + "balance_loss_mlp": 1.053689, + "epoch": 0.14435592965579438, + "flos": 25627865621760.0, + "grad_norm": 1.540874002782335, + "language_loss": 0.74606794, + "learning_rate": 3.864381133967676e-06, + "loss": 0.76822078, + "num_input_tokens_seen": 52019270, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0859375, + "step": 2401, + "time_per_iteration": 2.507983684539795 + }, + { + "auxiliary_loss_clip": 0.01156636, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.0269084, + "balance_loss_mlp": 1.05109596, + "epoch": 0.14441605290846235, + "flos": 22965053656320.0, + "grad_norm": 1.7568662987329828, + "language_loss": 0.80577219, + "learning_rate": 3.86424012600026e-06, + "loss": 0.82780313, + "num_input_tokens_seen": 52039315, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2402, + "time_per_iteration": 2.4913880825042725 + }, + { + "auxiliary_loss_clip": 0.01156436, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.02880669, + "balance_loss_mlp": 1.05137098, + "epoch": 0.14447617616113032, + "flos": 17347655548800.0, + "grad_norm": 2.1115432529250753, + "language_loss": 0.84918672, + "learning_rate": 3.864099047340673e-06, + "loss": 0.87124002, + "num_input_tokens_seen": 52056555, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.046875, + "step": 2403, + "time_per_iteration": 2.4267525672912598 + }, + { + "auxiliary_loss_clip": 0.01155438, + "auxiliary_loss_mlp": 0.01053748, + "balance_loss_clip": 1.03312445, + "balance_loss_mlp": 1.04934669, + "epoch": 0.14453629941379828, + "flos": 24060185464320.0, + "grad_norm": 3.423742001713465, + "language_loss": 0.70017314, + "learning_rate": 3.863957897994262e-06, + "loss": 0.72226501, + "num_input_tokens_seen": 52075800, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2404, + "time_per_iteration": 2.487827777862549 + }, + { + "auxiliary_loss_clip": 0.01151274, + "auxiliary_loss_mlp": 0.0104872, + "balance_loss_clip": 1.02976513, + "balance_loss_mlp": 1.0473218, + "epoch": 0.14459642266646625, + "flos": 14429554646400.0, + "grad_norm": 2.368746641876408, + "language_loss": 0.72847003, + "learning_rate": 3.863816677966381e-06, + "loss": 0.75046992, + "num_input_tokens_seen": 52092585, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0390625, + "step": 2405, + "time_per_iteration": 2.458444833755493 + }, + { + "auxiliary_loss_clip": 0.01152813, + "auxiliary_loss_mlp": 0.0104761, + "balance_loss_clip": 1.02879858, + "balance_loss_mlp": 1.04891181, + "epoch": 0.14465654591913424, + "flos": 9867032179200.0, + "grad_norm": 2.2064790582144473, + "language_loss": 0.73115766, + "learning_rate": 3.863675387262386e-06, + "loss": 0.75316191, + "num_input_tokens_seen": 52108990, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2406, + "time_per_iteration": 2.4501168727874756 + }, + { + "auxiliary_loss_clip": 0.0115439, + "auxiliary_loss_mlp": 0.01052848, + "balance_loss_clip": 1.03161645, + "balance_loss_mlp": 1.04889357, + "epoch": 0.1447166691718022, + "flos": 24972926987520.0, + "grad_norm": 4.997473868200426, + "language_loss": 0.75399184, + "learning_rate": 3.8635340258876325e-06, + "loss": 0.77606416, + "num_input_tokens_seen": 52125385, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2407, + "time_per_iteration": 2.482008934020996 + }, + { + "auxiliary_loss_clip": 0.01151849, + "auxiliary_loss_mlp": 0.01043439, + "balance_loss_clip": 1.02418649, + "balance_loss_mlp": 1.04607177, + "epoch": 0.14477679242447017, + "flos": 21908023200000.0, + "grad_norm": 1.6082248834480546, + "language_loss": 0.79472804, + "learning_rate": 3.8633925938474826e-06, + "loss": 0.81668091, + "num_input_tokens_seen": 52144985, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0625, + "step": 2408, + "time_per_iteration": 2.4657323360443115 + }, + { + "auxiliary_loss_clip": 0.01155517, + "auxiliary_loss_mlp": 0.01051689, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.05088127, + "epoch": 0.14483691567713813, + "flos": 20740746925440.0, + "grad_norm": 2.1979655558708893, + "language_loss": 0.82594806, + "learning_rate": 3.863251091147299e-06, + "loss": 0.84802014, + "num_input_tokens_seen": 52163885, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.046875, + "step": 2409, + "time_per_iteration": 2.450345039367676 + }, + { + "auxiliary_loss_clip": 0.01156412, + "auxiliary_loss_mlp": 0.01053711, + "balance_loss_clip": 1.03411365, + "balance_loss_mlp": 1.05046105, + "epoch": 0.1448970389298061, + "flos": 35407705536000.0, + "grad_norm": 1.954409921875598, + "language_loss": 0.74561608, + "learning_rate": 3.863109517792446e-06, + "loss": 0.7677173, + "num_input_tokens_seen": 52184325, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0625, + "step": 2410, + "time_per_iteration": 2.5861988067626953 + }, + { + "auxiliary_loss_clip": 0.01154014, + "auxiliary_loss_mlp": 0.01047443, + "balance_loss_clip": 1.02883387, + "balance_loss_mlp": 1.04858971, + "epoch": 0.14495716218247406, + "flos": 15414368808960.0, + "grad_norm": 2.3844352739280597, + "language_loss": 0.81135416, + "learning_rate": 3.8629678737882945e-06, + "loss": 0.83336866, + "num_input_tokens_seen": 52202740, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0546875, + "step": 2411, + "time_per_iteration": 2.4708898067474365 + }, + { + "auxiliary_loss_clip": 0.0115486, + "auxiliary_loss_mlp": 0.01053033, + "balance_loss_clip": 1.03403103, + "balance_loss_mlp": 1.05123138, + "epoch": 0.14501728543514203, + "flos": 33693222493440.0, + "grad_norm": 1.954560524414831, + "language_loss": 0.69816971, + "learning_rate": 3.862826159140214e-06, + "loss": 0.7202487, + "num_input_tokens_seen": 52223100, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2412, + "time_per_iteration": 2.5614776611328125 + }, + { + "auxiliary_loss_clip": 0.0115476, + "auxiliary_loss_mlp": 0.01046078, + "balance_loss_clip": 1.02640891, + "balance_loss_mlp": 1.05100143, + "epoch": 0.14507740868781002, + "flos": 15596112648960.0, + "grad_norm": 2.1541085269745803, + "language_loss": 0.77347231, + "learning_rate": 3.862684373853579e-06, + "loss": 0.79548067, + "num_input_tokens_seen": 52239690, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2413, + "time_per_iteration": 2.4292590618133545 + }, + { + "auxiliary_loss_clip": 0.01049286, + "auxiliary_loss_mlp": 0.0100403, + "balance_loss_clip": 1.00175285, + "balance_loss_mlp": 1.01294982, + "epoch": 0.145137531940478, + "flos": 66675343438080.0, + "grad_norm": 0.9152840666775347, + "language_loss": 0.58887923, + "learning_rate": 3.8625425179337656e-06, + "loss": 0.60941237, + "num_input_tokens_seen": 52296705, + "router_z_loss_clip": 0.02282715, + "router_z_loss_mlp": 0.36328125, + "step": 2414, + "time_per_iteration": 2.9752402305603027 + }, + { + "auxiliary_loss_clip": 0.01048826, + "auxiliary_loss_mlp": 0.01001535, + "balance_loss_clip": 0.99943656, + "balance_loss_mlp": 1.01240802, + "epoch": 0.14519765519314595, + "flos": 67521578929920.0, + "grad_norm": 0.8348908268898737, + "language_loss": 0.6218617, + "learning_rate": 3.862400591386154e-06, + "loss": 0.64236534, + "num_input_tokens_seen": 52361830, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.36328125, + "step": 2415, + "time_per_iteration": 3.039710521697998 + }, + { + "auxiliary_loss_clip": 0.01151709, + "auxiliary_loss_mlp": 0.01046414, + "balance_loss_clip": 1.02637458, + "balance_loss_mlp": 1.04699647, + "epoch": 0.14525777844581392, + "flos": 17198913329280.0, + "grad_norm": 1.8743578134099377, + "language_loss": 0.72001135, + "learning_rate": 3.8622585942161245e-06, + "loss": 0.74199259, + "num_input_tokens_seen": 52379420, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2416, + "time_per_iteration": 2.4283041954040527 + }, + { + "auxiliary_loss_clip": 0.0104556, + "auxiliary_loss_mlp": 0.01005813, + "balance_loss_clip": 1.00379848, + "balance_loss_mlp": 1.01002693, + "epoch": 0.14531790169848188, + "flos": 65404609015680.0, + "grad_norm": 0.711670432605859, + "language_loss": 0.60392165, + "learning_rate": 3.8621165264290635e-06, + "loss": 0.62443542, + "num_input_tokens_seen": 52446290, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.35546875, + "step": 2417, + "time_per_iteration": 3.0824739933013916 + }, + { + "auxiliary_loss_clip": 0.01155799, + "auxiliary_loss_mlp": 0.01055986, + "balance_loss_clip": 1.03639972, + "balance_loss_mlp": 1.04795754, + "epoch": 0.14537802495114985, + "flos": 32562467372160.0, + "grad_norm": 2.9144560714513363, + "language_loss": 0.79237175, + "learning_rate": 3.861974388030356e-06, + "loss": 0.8144896, + "num_input_tokens_seen": 52467295, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2418, + "time_per_iteration": 2.564497947692871 + }, + { + "auxiliary_loss_clip": 0.01150145, + "auxiliary_loss_mlp": 0.01051645, + "balance_loss_clip": 1.03267837, + "balance_loss_mlp": 1.04712582, + "epoch": 0.1454381482038178, + "flos": 20226685432320.0, + "grad_norm": 1.8755047341617508, + "language_loss": 0.72032261, + "learning_rate": 3.861832179025394e-06, + "loss": 0.74234051, + "num_input_tokens_seen": 52487295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2419, + "time_per_iteration": 2.457617998123169 + }, + { + "auxiliary_loss_clip": 0.01153915, + "auxiliary_loss_mlp": 0.01053899, + "balance_loss_clip": 1.0335021, + "balance_loss_mlp": 1.05042267, + "epoch": 0.1454982714564858, + "flos": 22893124671360.0, + "grad_norm": 2.3659429121693525, + "language_loss": 0.90125811, + "learning_rate": 3.861689899419569e-06, + "loss": 0.92333627, + "num_input_tokens_seen": 52504220, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.03125, + "step": 2420, + "time_per_iteration": 2.456087827682495 + }, + { + "auxiliary_loss_clip": 0.01154143, + "auxiliary_loss_mlp": 0.01057012, + "balance_loss_clip": 1.0382725, + "balance_loss_mlp": 1.04868603, + "epoch": 0.14555839470915377, + "flos": 20229845829120.0, + "grad_norm": 2.2940003535379057, + "language_loss": 0.83309549, + "learning_rate": 3.861547549218276e-06, + "loss": 0.85520703, + "num_input_tokens_seen": 52521900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0546875, + "step": 2421, + "time_per_iteration": 2.441432476043701 + }, + { + "auxiliary_loss_clip": 0.01153189, + "auxiliary_loss_mlp": 0.01053683, + "balance_loss_clip": 1.03400183, + "balance_loss_mlp": 1.04684627, + "epoch": 0.14561851796182174, + "flos": 22236282616320.0, + "grad_norm": 1.6167157199382733, + "language_loss": 0.81511533, + "learning_rate": 3.861405128426914e-06, + "loss": 0.83718407, + "num_input_tokens_seen": 52540495, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2422, + "time_per_iteration": 2.473010540008545 + }, + { + "auxiliary_loss_clip": 0.01046424, + "auxiliary_loss_mlp": 0.01017838, + "balance_loss_clip": 1.01558518, + "balance_loss_mlp": 1.01065397, + "epoch": 0.1456786412144897, + "flos": 52636786289280.0, + "grad_norm": 0.9226410759759552, + "language_loss": 0.63245702, + "learning_rate": 3.861262637050883e-06, + "loss": 0.65309966, + "num_input_tokens_seen": 52603305, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.35742188, + "step": 2423, + "time_per_iteration": 3.0516433715820312 + }, + { + "auxiliary_loss_clip": 0.01155109, + "auxiliary_loss_mlp": 0.01045363, + "balance_loss_clip": 1.02756512, + "balance_loss_mlp": 1.05096769, + "epoch": 0.14573876446715767, + "flos": 23221671396480.0, + "grad_norm": 1.7656587875688796, + "language_loss": 0.8267172, + "learning_rate": 3.861120075095585e-06, + "loss": 0.84872198, + "num_input_tokens_seen": 52623435, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.046875, + "step": 2424, + "time_per_iteration": 2.4918792247772217 + }, + { + "auxiliary_loss_clip": 0.01153149, + "auxiliary_loss_mlp": 0.0104962, + "balance_loss_clip": 1.03071296, + "balance_loss_mlp": 1.04970837, + "epoch": 0.14579888771982563, + "flos": 18114384286080.0, + "grad_norm": 2.0603730404595915, + "language_loss": 0.79317909, + "learning_rate": 3.860977442566429e-06, + "loss": 0.81520677, + "num_input_tokens_seen": 52642255, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2425, + "time_per_iteration": 2.4607083797454834 + }, + { + "auxiliary_loss_clip": 0.01155851, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.030007, + "balance_loss_mlp": 1.05136847, + "epoch": 0.14585901097249362, + "flos": 23001107932800.0, + "grad_norm": 2.4026453111661703, + "language_loss": 0.83269531, + "learning_rate": 3.860834739468821e-06, + "loss": 0.85473925, + "num_input_tokens_seen": 52658700, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2426, + "time_per_iteration": 2.4615883827209473 + }, + { + "auxiliary_loss_clip": 0.01153183, + "auxiliary_loss_mlp": 0.0104253, + "balance_loss_clip": 1.02420735, + "balance_loss_mlp": 1.05100346, + "epoch": 0.1459191342251616, + "flos": 21908669644800.0, + "grad_norm": 1.78851961601388, + "language_loss": 0.86878085, + "learning_rate": 3.860691965808173e-06, + "loss": 0.89073801, + "num_input_tokens_seen": 52678140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0234375, + "step": 2427, + "time_per_iteration": 2.46846866607666 + }, + { + "auxiliary_loss_clip": 0.01159617, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0264895, + "balance_loss_mlp": 1.05060291, + "epoch": 0.14597925747782955, + "flos": 14975504438400.0, + "grad_norm": 1.9424277979169204, + "language_loss": 0.66795039, + "learning_rate": 3.8605491215899e-06, + "loss": 0.69001138, + "num_input_tokens_seen": 52696825, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0859375, + "step": 2428, + "time_per_iteration": 2.4277987480163574 + }, + { + "auxiliary_loss_clip": 0.01154279, + "auxiliary_loss_mlp": 0.01048778, + "balance_loss_clip": 1.02870345, + "balance_loss_mlp": 1.05036306, + "epoch": 0.14603938073049752, + "flos": 21068898600960.0, + "grad_norm": 1.7447652065053452, + "language_loss": 0.8363744, + "learning_rate": 3.860406206819417e-06, + "loss": 0.85840499, + "num_input_tokens_seen": 52715125, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2429, + "time_per_iteration": 2.5208661556243896 + }, + { + "auxiliary_loss_clip": 0.01152615, + "auxiliary_loss_mlp": 0.01048492, + "balance_loss_clip": 1.02972817, + "balance_loss_mlp": 1.04804671, + "epoch": 0.14609950398316549, + "flos": 19864777950720.0, + "grad_norm": 1.723947749216575, + "language_loss": 0.78811824, + "learning_rate": 3.860263221502145e-06, + "loss": 0.8101294, + "num_input_tokens_seen": 52734015, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2430, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0115835, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03014231, + "balance_loss_mlp": 1.0529238, + "epoch": 0.14615962723583345, + "flos": 22418852469120.0, + "grad_norm": 2.3723861833809767, + "language_loss": 0.83178174, + "learning_rate": 3.860120165643504e-06, + "loss": 0.85385835, + "num_input_tokens_seen": 52753025, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2431, + "time_per_iteration": 2.468472480773926 + }, + { + "auxiliary_loss_clip": 0.01158923, + "auxiliary_loss_mlp": 0.01053127, + "balance_loss_clip": 1.03244448, + "balance_loss_mlp": 1.05131185, + "epoch": 0.14621975048850142, + "flos": 22346241125760.0, + "grad_norm": 1.7402379411604871, + "language_loss": 0.78777766, + "learning_rate": 3.859977039248921e-06, + "loss": 0.80989814, + "num_input_tokens_seen": 52773420, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.078125, + "step": 2432, + "time_per_iteration": 2.4618513584136963 + }, + { + "auxiliary_loss_clip": 0.01153865, + "auxiliary_loss_mlp": 0.01052087, + "balance_loss_clip": 1.03158331, + "balance_loss_mlp": 1.04917812, + "epoch": 0.1462798737411694, + "flos": 24389163152640.0, + "grad_norm": 1.9105383938395448, + "language_loss": 0.79940903, + "learning_rate": 3.859833842323822e-06, + "loss": 0.82146859, + "num_input_tokens_seen": 52792870, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2433, + "time_per_iteration": 2.4901435375213623 + }, + { + "auxiliary_loss_clip": 0.01152814, + "auxiliary_loss_mlp": 0.01051119, + "balance_loss_clip": 1.03149712, + "balance_loss_mlp": 1.05186844, + "epoch": 0.14633999699383737, + "flos": 19244672530560.0, + "grad_norm": 1.8984055506020234, + "language_loss": 0.78421938, + "learning_rate": 3.859690574873638e-06, + "loss": 0.80625868, + "num_input_tokens_seen": 52811615, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2434, + "time_per_iteration": 3.833007335662842 + }, + { + "auxiliary_loss_clip": 0.01046525, + "auxiliary_loss_mlp": 0.01005945, + "balance_loss_clip": 1.00356054, + "balance_loss_mlp": 1.01038933, + "epoch": 0.14640012024650534, + "flos": 62660638270080.0, + "grad_norm": 0.8674820067375166, + "language_loss": 0.58373666, + "learning_rate": 3.8595472369038e-06, + "loss": 0.60426134, + "num_input_tokens_seen": 52873230, + "router_z_loss_clip": 0.02380371, + "router_z_loss_mlp": 0.36132812, + "step": 2435, + "time_per_iteration": 5.911077499389648 + }, + { + "auxiliary_loss_clip": 0.01147895, + "auxiliary_loss_mlp": 0.01045492, + "balance_loss_clip": 1.02620411, + "balance_loss_mlp": 1.04662895, + "epoch": 0.1464602434991733, + "flos": 12276243146880.0, + "grad_norm": 2.2832294661951753, + "language_loss": 0.88395989, + "learning_rate": 3.859403828419744e-06, + "loss": 0.90589368, + "num_input_tokens_seen": 52889325, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2436, + "time_per_iteration": 2.440303325653076 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01046658, + "balance_loss_clip": 1.02697682, + "balance_loss_mlp": 1.05032742, + "epoch": 0.14652036675184127, + "flos": 20922311197440.0, + "grad_norm": 2.0196076648737, + "language_loss": 0.74832988, + "learning_rate": 3.85926034942691e-06, + "loss": 0.7703594, + "num_input_tokens_seen": 52909705, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2437, + "time_per_iteration": 2.460806369781494 + }, + { + "auxiliary_loss_clip": 0.01153675, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02374196, + "balance_loss_mlp": 1.04798007, + "epoch": 0.14658049000450923, + "flos": 27703681528320.0, + "grad_norm": 2.346268485469047, + "language_loss": 0.73932636, + "learning_rate": 3.859116799930736e-06, + "loss": 0.76131272, + "num_input_tokens_seen": 52930300, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2438, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01154512, + "auxiliary_loss_mlp": 0.01041271, + "balance_loss_clip": 1.02310383, + "balance_loss_mlp": 1.05231857, + "epoch": 0.14664061325717723, + "flos": 24936513575040.0, + "grad_norm": 1.8289443089735578, + "language_loss": 0.74791402, + "learning_rate": 3.858973179936668e-06, + "loss": 0.76987189, + "num_input_tokens_seen": 52949955, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2439, + "time_per_iteration": 2.4596338272094727 + }, + { + "auxiliary_loss_clip": 0.01151843, + "auxiliary_loss_mlp": 0.01047986, + "balance_loss_clip": 1.02872145, + "balance_loss_mlp": 1.04913521, + "epoch": 0.1467007365098452, + "flos": 40297661406720.0, + "grad_norm": 2.106046924266039, + "language_loss": 0.74542844, + "learning_rate": 3.85882948945015e-06, + "loss": 0.76742673, + "num_input_tokens_seen": 52972905, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 2440, + "time_per_iteration": 2.613889217376709 + }, + { + "auxiliary_loss_clip": 0.01146734, + "auxiliary_loss_mlp": 0.01048348, + "balance_loss_clip": 1.02964425, + "balance_loss_mlp": 1.04660702, + "epoch": 0.14676085976251316, + "flos": 26541074021760.0, + "grad_norm": 1.6151911954653986, + "language_loss": 0.83047861, + "learning_rate": 3.85868572847663e-06, + "loss": 0.85242939, + "num_input_tokens_seen": 52994850, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2441, + "time_per_iteration": 2.508570432662964 + }, + { + "auxiliary_loss_clip": 0.01157481, + "auxiliary_loss_mlp": 0.0104725, + "balance_loss_clip": 1.0275681, + "balance_loss_mlp": 1.04952955, + "epoch": 0.14682098301518112, + "flos": 23550110380800.0, + "grad_norm": 3.362343971731744, + "language_loss": 0.71562135, + "learning_rate": 3.858541897021563e-06, + "loss": 0.73766863, + "num_input_tokens_seen": 53014740, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.078125, + "step": 2442, + "time_per_iteration": 2.4903416633605957 + }, + { + "auxiliary_loss_clip": 0.01160717, + "auxiliary_loss_mlp": 0.01042253, + "balance_loss_clip": 1.02257109, + "balance_loss_mlp": 1.0510819, + "epoch": 0.1468811062678491, + "flos": 11651073909120.0, + "grad_norm": 3.2762909335645043, + "language_loss": 0.80804002, + "learning_rate": 3.8583979950904e-06, + "loss": 0.83006966, + "num_input_tokens_seen": 53029780, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.09375, + "step": 2443, + "time_per_iteration": 2.424539089202881 + }, + { + "auxiliary_loss_clip": 0.01154468, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.0504694, + "epoch": 0.14694122952051705, + "flos": 23002616304000.0, + "grad_norm": 2.077049554342068, + "language_loss": 0.8297509, + "learning_rate": 3.858254022688599e-06, + "loss": 0.85179389, + "num_input_tokens_seen": 53048620, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2444, + "time_per_iteration": 2.4937214851379395 + }, + { + "auxiliary_loss_clip": 0.01154781, + "auxiliary_loss_mlp": 0.01048939, + "balance_loss_clip": 1.02961493, + "balance_loss_mlp": 1.05025554, + "epoch": 0.14700135277318502, + "flos": 26502972670080.0, + "grad_norm": 1.763635964291881, + "language_loss": 0.71218902, + "learning_rate": 3.85810997982162e-06, + "loss": 0.73422623, + "num_input_tokens_seen": 53070055, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2445, + "time_per_iteration": 2.491645336151123 + }, + { + "auxiliary_loss_clip": 0.01045345, + "auxiliary_loss_mlp": 0.01028611, + "balance_loss_clip": 1.02659595, + "balance_loss_mlp": 1.00942683, + "epoch": 0.147061476025853, + "flos": 59449434387840.0, + "grad_norm": 0.8232649654452494, + "language_loss": 0.63138294, + "learning_rate": 3.857965866494923e-06, + "loss": 0.6521225, + "num_input_tokens_seen": 53126945, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.359375, + "step": 2446, + "time_per_iteration": 2.9610531330108643 + }, + { + "auxiliary_loss_clip": 0.01158924, + "auxiliary_loss_mlp": 0.0104308, + "balance_loss_clip": 1.02355385, + "balance_loss_mlp": 1.05348802, + "epoch": 0.14712159927852098, + "flos": 28330897841280.0, + "grad_norm": 1.8119571313268434, + "language_loss": 0.74937665, + "learning_rate": 3.857821682713975e-06, + "loss": 0.7713967, + "num_input_tokens_seen": 53149130, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2447, + "time_per_iteration": 2.547112226486206 + }, + { + "auxiliary_loss_clip": 0.0115445, + "auxiliary_loss_mlp": 0.01046965, + "balance_loss_clip": 1.02838051, + "balance_loss_mlp": 1.04998112, + "epoch": 0.14718172253118894, + "flos": 27089825074560.0, + "grad_norm": 2.0554455972062744, + "language_loss": 0.85722244, + "learning_rate": 3.857677428484242e-06, + "loss": 0.87923658, + "num_input_tokens_seen": 53167120, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.046875, + "step": 2448, + "time_per_iteration": 2.519530773162842 + }, + { + "auxiliary_loss_clip": 0.01045412, + "auxiliary_loss_mlp": 0.01010534, + "balance_loss_clip": 1.0081377, + "balance_loss_mlp": 1.00952029, + "epoch": 0.1472418457838569, + "flos": 66706764860160.0, + "grad_norm": 0.7649510042513386, + "language_loss": 0.56836212, + "learning_rate": 3.857533103811195e-06, + "loss": 0.58892155, + "num_input_tokens_seen": 53227945, + "router_z_loss_clip": 0.02392578, + "router_z_loss_mlp": 0.359375, + "step": 2449, + "time_per_iteration": 3.0049068927764893 + }, + { + "auxiliary_loss_clip": 0.01150109, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02462673, + "balance_loss_mlp": 1.04850447, + "epoch": 0.14730196903652487, + "flos": 19573578391680.0, + "grad_norm": 1.900224172693126, + "language_loss": 0.85544562, + "learning_rate": 3.857388708700307e-06, + "loss": 0.87738931, + "num_input_tokens_seen": 53244615, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2450, + "time_per_iteration": 2.5826945304870605 + }, + { + "auxiliary_loss_clip": 0.01155696, + "auxiliary_loss_mlp": 0.01049879, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.05074143, + "epoch": 0.14736209228919284, + "flos": 16071031296000.0, + "grad_norm": 2.029178420182481, + "language_loss": 0.74693608, + "learning_rate": 3.857244243157052e-06, + "loss": 0.76899183, + "num_input_tokens_seen": 53262205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0546875, + "step": 2451, + "time_per_iteration": 2.4345250129699707 + }, + { + "auxiliary_loss_clip": 0.01146898, + "auxiliary_loss_mlp": 0.01039395, + "balance_loss_clip": 1.02092934, + "balance_loss_mlp": 1.04758763, + "epoch": 0.1474222155418608, + "flos": 23039460679680.0, + "grad_norm": 1.6073898366987713, + "language_loss": 0.82240498, + "learning_rate": 3.85709970718691e-06, + "loss": 0.8442679, + "num_input_tokens_seen": 53282445, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2452, + "time_per_iteration": 2.468869924545288 + }, + { + "auxiliary_loss_clip": 0.01154267, + "auxiliary_loss_mlp": 0.01038485, + "balance_loss_clip": 1.02032936, + "balance_loss_mlp": 1.05154371, + "epoch": 0.1474823387945288, + "flos": 17018641946880.0, + "grad_norm": 1.7191329381743174, + "language_loss": 0.74021572, + "learning_rate": 3.856955100795361e-06, + "loss": 0.76214325, + "num_input_tokens_seen": 53299060, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2453, + "time_per_iteration": 2.433424472808838 + }, + { + "auxiliary_loss_clip": 0.01154761, + "auxiliary_loss_mlp": 0.01050025, + "balance_loss_clip": 1.03048682, + "balance_loss_mlp": 1.04918802, + "epoch": 0.14754246204719676, + "flos": 17895041884800.0, + "grad_norm": 2.171465059586897, + "language_loss": 0.76326835, + "learning_rate": 3.856810423987889e-06, + "loss": 0.78531623, + "num_input_tokens_seen": 53315970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0546875, + "step": 2454, + "time_per_iteration": 2.419368028640747 + }, + { + "auxiliary_loss_clip": 0.01155198, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.04922831, + "epoch": 0.14760258529986472, + "flos": 13079097987840.0, + "grad_norm": 2.006370127686132, + "language_loss": 0.8301537, + "learning_rate": 3.856665676769979e-06, + "loss": 0.85209435, + "num_input_tokens_seen": 53332940, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2455, + "time_per_iteration": 2.426819324493408 + }, + { + "auxiliary_loss_clip": 0.01157227, + "auxiliary_loss_mlp": 0.01044033, + "balance_loss_clip": 1.02519834, + "balance_loss_mlp": 1.04846048, + "epoch": 0.1476627085525327, + "flos": 30806399358720.0, + "grad_norm": 2.442844218228049, + "language_loss": 0.83938581, + "learning_rate": 3.85652085914712e-06, + "loss": 0.8613984, + "num_input_tokens_seen": 53353295, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.09375, + "step": 2456, + "time_per_iteration": 2.525296926498413 + }, + { + "auxiliary_loss_clip": 0.01151791, + "auxiliary_loss_mlp": 0.01043419, + "balance_loss_clip": 1.02459574, + "balance_loss_mlp": 1.04980254, + "epoch": 0.14772283180520066, + "flos": 21689434984320.0, + "grad_norm": 1.8839437807359896, + "language_loss": 0.84325618, + "learning_rate": 3.856375971124805e-06, + "loss": 0.86520827, + "num_input_tokens_seen": 53373410, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2457, + "time_per_iteration": 2.471068859100342 + }, + { + "auxiliary_loss_clip": 0.01149123, + "auxiliary_loss_mlp": 0.01041136, + "balance_loss_clip": 1.02237296, + "balance_loss_mlp": 1.04932761, + "epoch": 0.14778295505786862, + "flos": 18770400328320.0, + "grad_norm": 1.9862753985638202, + "language_loss": 0.75645256, + "learning_rate": 3.856231012708527e-06, + "loss": 0.77835512, + "num_input_tokens_seen": 53391430, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2458, + "time_per_iteration": 2.44146466255188 + }, + { + "auxiliary_loss_clip": 0.01160318, + "auxiliary_loss_mlp": 0.01049421, + "balance_loss_clip": 1.0284996, + "balance_loss_mlp": 1.05119324, + "epoch": 0.1478430783105366, + "flos": 22893555634560.0, + "grad_norm": 2.405388225865701, + "language_loss": 0.83817005, + "learning_rate": 3.856085983903782e-06, + "loss": 0.86026746, + "num_input_tokens_seen": 53409960, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2459, + "time_per_iteration": 2.470345973968506 + }, + { + "auxiliary_loss_clip": 0.01149406, + "auxiliary_loss_mlp": 0.01041796, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.0489651, + "epoch": 0.14790320156320458, + "flos": 15085319293440.0, + "grad_norm": 2.6666731923680733, + "language_loss": 0.75856471, + "learning_rate": 3.855940884716071e-06, + "loss": 0.78047681, + "num_input_tokens_seen": 53426160, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2460, + "time_per_iteration": 2.4294657707214355 + }, + { + "auxiliary_loss_clip": 0.01157631, + "auxiliary_loss_mlp": 0.01042027, + "balance_loss_clip": 1.02260733, + "balance_loss_mlp": 1.05102873, + "epoch": 0.14796332481587254, + "flos": 26504768350080.0, + "grad_norm": 1.6904429322803973, + "language_loss": 0.81591463, + "learning_rate": 3.855795715150896e-06, + "loss": 0.83791113, + "num_input_tokens_seen": 53448530, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0703125, + "step": 2461, + "time_per_iteration": 2.4993178844451904 + }, + { + "auxiliary_loss_clip": 0.01159506, + "auxiliary_loss_mlp": 0.01046713, + "balance_loss_clip": 1.02611399, + "balance_loss_mlp": 1.05356562, + "epoch": 0.1480234480685405, + "flos": 17563191108480.0, + "grad_norm": 3.2471604819605036, + "language_loss": 0.65689576, + "learning_rate": 3.855650475213761e-06, + "loss": 0.678958, + "num_input_tokens_seen": 53465915, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2462, + "time_per_iteration": 2.4197235107421875 + }, + { + "auxiliary_loss_clip": 0.0115574, + "auxiliary_loss_mlp": 0.01048819, + "balance_loss_clip": 1.02929282, + "balance_loss_mlp": 1.05148113, + "epoch": 0.14808357132120847, + "flos": 53582203232640.0, + "grad_norm": 1.4717210360784851, + "language_loss": 0.67368174, + "learning_rate": 3.8555051649101745e-06, + "loss": 0.69572735, + "num_input_tokens_seen": 53496055, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0390625, + "step": 2463, + "time_per_iteration": 2.774268865585327 + }, + { + "auxiliary_loss_clip": 0.01154664, + "auxiliary_loss_mlp": 0.01050077, + "balance_loss_clip": 1.03071713, + "balance_loss_mlp": 1.04978383, + "epoch": 0.14814369457387644, + "flos": 19829190551040.0, + "grad_norm": 2.177919724516607, + "language_loss": 0.76567936, + "learning_rate": 3.855359784245646e-06, + "loss": 0.78772676, + "num_input_tokens_seen": 53513790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2464, + "time_per_iteration": 2.4522674083709717 + }, + { + "auxiliary_loss_clip": 0.01152455, + "auxiliary_loss_mlp": 0.01049156, + "balance_loss_clip": 1.03089297, + "balance_loss_mlp": 1.05009413, + "epoch": 0.1482038178265444, + "flos": 23914962777600.0, + "grad_norm": 1.623144605896263, + "language_loss": 0.79623306, + "learning_rate": 3.855214333225688e-06, + "loss": 0.81824923, + "num_input_tokens_seen": 53533410, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0234375, + "step": 2465, + "time_per_iteration": 2.4946794509887695 + }, + { + "auxiliary_loss_clip": 0.01159963, + "auxiliary_loss_mlp": 0.01045929, + "balance_loss_clip": 1.02543747, + "balance_loss_mlp": 1.0522809, + "epoch": 0.1482639410792124, + "flos": 24170503109760.0, + "grad_norm": 2.8838905575360925, + "language_loss": 0.76230991, + "learning_rate": 3.855068811855817e-06, + "loss": 0.78436887, + "num_input_tokens_seen": 53554775, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.078125, + "step": 2466, + "time_per_iteration": 2.4722483158111572 + }, + { + "auxiliary_loss_clip": 0.01052707, + "auxiliary_loss_mlp": 0.01020247, + "balance_loss_clip": 1.01781487, + "balance_loss_mlp": 1.01613474, + "epoch": 0.14832406433188036, + "flos": 66191051341440.0, + "grad_norm": 0.8013334536894682, + "language_loss": 0.60022712, + "learning_rate": 3.854923220141551e-06, + "loss": 0.62095666, + "num_input_tokens_seen": 53609675, + "router_z_loss_clip": 0.02429199, + "router_z_loss_mlp": 0.3671875, + "step": 2467, + "time_per_iteration": 3.0702927112579346 + }, + { + "auxiliary_loss_clip": 0.01154799, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02393889, + "balance_loss_mlp": 1.05059397, + "epoch": 0.14838418758454833, + "flos": 25411252654080.0, + "grad_norm": 2.3345318496369405, + "language_loss": 0.87671721, + "learning_rate": 3.85477755808841e-06, + "loss": 0.89869595, + "num_input_tokens_seen": 53626950, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.046875, + "step": 2468, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.0115781, + "auxiliary_loss_mlp": 0.01052711, + "balance_loss_clip": 1.0322901, + "balance_loss_mlp": 1.05078602, + "epoch": 0.1484443108372163, + "flos": 23289901280640.0, + "grad_norm": 4.884804263226826, + "language_loss": 0.75884396, + "learning_rate": 3.854631825701919e-06, + "loss": 0.78094912, + "num_input_tokens_seen": 53644200, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0703125, + "step": 2469, + "time_per_iteration": 2.4750967025756836 + }, + { + "auxiliary_loss_clip": 0.01153722, + "auxiliary_loss_mlp": 0.01053888, + "balance_loss_clip": 1.03425384, + "balance_loss_mlp": 1.04954958, + "epoch": 0.14850443408988426, + "flos": 14647675985280.0, + "grad_norm": 2.457578452134473, + "language_loss": 0.76183128, + "learning_rate": 3.854486022987603e-06, + "loss": 0.78390741, + "num_input_tokens_seen": 53659650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2470, + "time_per_iteration": 2.4312937259674072 + }, + { + "auxiliary_loss_clip": 0.01152707, + "auxiliary_loss_mlp": 0.01045721, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.05050206, + "epoch": 0.14856455734255222, + "flos": 23548314700800.0, + "grad_norm": 1.9398758609720104, + "language_loss": 0.72121894, + "learning_rate": 3.8543401499509905e-06, + "loss": 0.74320322, + "num_input_tokens_seen": 53680275, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2471, + "time_per_iteration": 2.519866466522217 + }, + { + "auxiliary_loss_clip": 0.01160204, + "auxiliary_loss_mlp": 0.01047639, + "balance_loss_clip": 1.0272181, + "balance_loss_mlp": 1.0499022, + "epoch": 0.1486246805952202, + "flos": 18077288515200.0, + "grad_norm": 2.11598070664324, + "language_loss": 0.89739621, + "learning_rate": 3.854194206597615e-06, + "loss": 0.91947466, + "num_input_tokens_seen": 53698270, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.1015625, + "step": 2472, + "time_per_iteration": 2.4281632900238037 + }, + { + "auxiliary_loss_clip": 0.01155174, + "auxiliary_loss_mlp": 0.01049471, + "balance_loss_clip": 1.030123, + "balance_loss_mlp": 1.05059123, + "epoch": 0.14868480384788818, + "flos": 19353625459200.0, + "grad_norm": 4.013793804030176, + "language_loss": 0.80734539, + "learning_rate": 3.854048192933008e-06, + "loss": 0.82939184, + "num_input_tokens_seen": 53716845, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2473, + "time_per_iteration": 2.4329466819763184 + }, + { + "auxiliary_loss_clip": 0.0115911, + "auxiliary_loss_mlp": 0.01063152, + "balance_loss_clip": 1.04358959, + "balance_loss_mlp": 1.05129409, + "epoch": 0.14874492710055615, + "flos": 22200192426240.0, + "grad_norm": 2.5981192604624526, + "language_loss": 0.77540123, + "learning_rate": 3.853902108962709e-06, + "loss": 0.79762381, + "num_input_tokens_seen": 53734970, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.078125, + "step": 2474, + "time_per_iteration": 2.453432083129883 + }, + { + "auxiliary_loss_clip": 0.01157718, + "auxiliary_loss_mlp": 0.01057886, + "balance_loss_clip": 1.03763211, + "balance_loss_mlp": 1.04955983, + "epoch": 0.1488050503532241, + "flos": 21103444506240.0, + "grad_norm": 1.8103491271764227, + "language_loss": 0.82315612, + "learning_rate": 3.853755954692255e-06, + "loss": 0.84531218, + "num_input_tokens_seen": 53753415, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0859375, + "step": 2475, + "time_per_iteration": 2.4591174125671387 + }, + { + "auxiliary_loss_clip": 0.01157844, + "auxiliary_loss_mlp": 0.01058234, + "balance_loss_clip": 1.03985167, + "balance_loss_mlp": 1.05399168, + "epoch": 0.14886517360589208, + "flos": 12786569625600.0, + "grad_norm": 1.9240192853863896, + "language_loss": 0.80811602, + "learning_rate": 3.85360973012719e-06, + "loss": 0.83027685, + "num_input_tokens_seen": 53770305, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0390625, + "step": 2476, + "time_per_iteration": 3.810553789138794 + }, + { + "auxiliary_loss_clip": 0.01148934, + "auxiliary_loss_mlp": 0.01053022, + "balance_loss_clip": 1.03467607, + "balance_loss_mlp": 1.05016851, + "epoch": 0.14892529685856004, + "flos": 29022860419200.0, + "grad_norm": 1.8396010916090604, + "language_loss": 0.77889222, + "learning_rate": 3.853463435273058e-06, + "loss": 0.80091178, + "num_input_tokens_seen": 53788895, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98828125, + "step": 2477, + "time_per_iteration": 4.031312942504883 + }, + { + "auxiliary_loss_clip": 0.01048753, + "auxiliary_loss_mlp": 0.01018076, + "balance_loss_clip": 1.01581085, + "balance_loss_mlp": 1.01302671, + "epoch": 0.148985420111228, + "flos": 61926121054080.0, + "grad_norm": 0.8050876444063699, + "language_loss": 0.60130364, + "learning_rate": 3.853317070135407e-06, + "loss": 0.62197196, + "num_input_tokens_seen": 53850260, + "router_z_loss_clip": 0.02270508, + "router_z_loss_mlp": 0.35742188, + "step": 2478, + "time_per_iteration": 3.1073787212371826 + }, + { + "auxiliary_loss_clip": 0.01154836, + "auxiliary_loss_mlp": 0.01044957, + "balance_loss_clip": 1.02695656, + "balance_loss_mlp": 1.05078554, + "epoch": 0.149045543363896, + "flos": 23915106432000.0, + "grad_norm": 2.232556799389181, + "language_loss": 0.70951897, + "learning_rate": 3.853170634719787e-06, + "loss": 0.7315169, + "num_input_tokens_seen": 53867520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2479, + "time_per_iteration": 2.475215435028076 + }, + { + "auxiliary_loss_clip": 0.01153193, + "auxiliary_loss_mlp": 0.01050847, + "balance_loss_clip": 1.0313679, + "balance_loss_mlp": 1.04886127, + "epoch": 0.14910566661656396, + "flos": 23654394541440.0, + "grad_norm": 1.5896653051626852, + "language_loss": 0.80748487, + "learning_rate": 3.853024129031751e-06, + "loss": 0.82952535, + "num_input_tokens_seen": 53886620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2480, + "time_per_iteration": 2.4618492126464844 + }, + { + "auxiliary_loss_clip": 0.01156746, + "auxiliary_loss_mlp": 0.01047338, + "balance_loss_clip": 1.02838397, + "balance_loss_mlp": 1.05017209, + "epoch": 0.14916578986923193, + "flos": 20515299212160.0, + "grad_norm": 2.4101793906634894, + "language_loss": 0.84132183, + "learning_rate": 3.852877553076854e-06, + "loss": 0.86336267, + "num_input_tokens_seen": 53902230, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2481, + "time_per_iteration": 2.437391519546509 + }, + { + "auxiliary_loss_clip": 0.01152664, + "auxiliary_loss_mlp": 0.01051193, + "balance_loss_clip": 1.03046227, + "balance_loss_mlp": 1.04808569, + "epoch": 0.1492259131218999, + "flos": 22491822948480.0, + "grad_norm": 3.194199563979109, + "language_loss": 0.77347398, + "learning_rate": 3.8527309068606546e-06, + "loss": 0.79551256, + "num_input_tokens_seen": 53919475, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.046875, + "step": 2482, + "time_per_iteration": 2.4710068702697754 + }, + { + "auxiliary_loss_clip": 0.01161857, + "auxiliary_loss_mlp": 0.01040162, + "balance_loss_clip": 1.01939583, + "balance_loss_mlp": 1.05186439, + "epoch": 0.14928603637456786, + "flos": 23185868515200.0, + "grad_norm": 2.968394626295353, + "language_loss": 0.78719991, + "learning_rate": 3.852584190388713e-06, + "loss": 0.80922014, + "num_input_tokens_seen": 53939150, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.1015625, + "step": 2483, + "time_per_iteration": 2.5075182914733887 + }, + { + "auxiliary_loss_clip": 0.0114759, + "auxiliary_loss_mlp": 0.01040314, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.04774714, + "epoch": 0.14934615962723582, + "flos": 21653237053440.0, + "grad_norm": 1.642113570978582, + "language_loss": 0.70521605, + "learning_rate": 3.852437403666595e-06, + "loss": 0.72709513, + "num_input_tokens_seen": 53958735, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 1.0, + "step": 2484, + "time_per_iteration": 2.4810657501220703 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01041731, + "balance_loss_clip": 1.02049971, + "balance_loss_mlp": 1.04769683, + "epoch": 0.1494062828799038, + "flos": 27010066924800.0, + "grad_norm": 2.5518326423103654, + "language_loss": 0.84396368, + "learning_rate": 3.852290546699863e-06, + "loss": 0.86592442, + "num_input_tokens_seen": 53975065, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0703125, + "step": 2485, + "time_per_iteration": 2.47004771232605 + }, + { + "auxiliary_loss_clip": 0.01155612, + "auxiliary_loss_mlp": 0.01044521, + "balance_loss_clip": 1.02442229, + "balance_loss_mlp": 1.04906201, + "epoch": 0.14946640613257178, + "flos": 21214947300480.0, + "grad_norm": 2.1854599778658663, + "language_loss": 0.84902173, + "learning_rate": 3.8521436194940894e-06, + "loss": 0.87102306, + "num_input_tokens_seen": 53993330, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2486, + "time_per_iteration": 2.4553234577178955 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01038818, + "balance_loss_clip": 1.02208114, + "balance_loss_mlp": 1.04672825, + "epoch": 0.14952652938523975, + "flos": 13370872164480.0, + "grad_norm": 2.4579579723442855, + "language_loss": 0.74329305, + "learning_rate": 3.851996622054842e-06, + "loss": 0.76516318, + "num_input_tokens_seen": 54010515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 1.015625, + "step": 2487, + "time_per_iteration": 2.436316967010498 + }, + { + "auxiliary_loss_clip": 0.01148703, + "auxiliary_loss_mlp": 0.01048053, + "balance_loss_clip": 1.02934861, + "balance_loss_mlp": 1.04707325, + "epoch": 0.1495866526379077, + "flos": 35517699959040.0, + "grad_norm": 2.1423480103066375, + "language_loss": 0.71837348, + "learning_rate": 3.8518495543877e-06, + "loss": 0.74034101, + "num_input_tokens_seen": 54031315, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2488, + "time_per_iteration": 2.649794816970825 + }, + { + "auxiliary_loss_clip": 0.01156424, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02780962, + "balance_loss_mlp": 1.04946375, + "epoch": 0.14964677589057568, + "flos": 17632749795840.0, + "grad_norm": 2.5167610907777513, + "language_loss": 0.70519507, + "learning_rate": 3.851702416498235e-06, + "loss": 0.72722483, + "num_input_tokens_seen": 54045965, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0703125, + "step": 2489, + "time_per_iteration": 2.416708469390869 + }, + { + "auxiliary_loss_clip": 0.01153385, + "auxiliary_loss_mlp": 0.01045512, + "balance_loss_clip": 1.02637911, + "balance_loss_mlp": 1.04785299, + "epoch": 0.14970689914324364, + "flos": 20185280029440.0, + "grad_norm": 6.063777716142612, + "language_loss": 0.81789696, + "learning_rate": 3.8515552083920295e-06, + "loss": 0.83988589, + "num_input_tokens_seen": 54059960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0546875, + "step": 2490, + "time_per_iteration": 2.433284282684326 + }, + { + "auxiliary_loss_clip": 0.0115747, + "auxiliary_loss_mlp": 0.01042151, + "balance_loss_clip": 1.02357852, + "balance_loss_mlp": 1.05097246, + "epoch": 0.1497670223959116, + "flos": 37228699382400.0, + "grad_norm": 1.781748843431282, + "language_loss": 0.79878485, + "learning_rate": 3.851407930074666e-06, + "loss": 0.82078111, + "num_input_tokens_seen": 54079330, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2491, + "time_per_iteration": 2.616642475128174 + }, + { + "auxiliary_loss_clip": 0.0115457, + "auxiliary_loss_mlp": 0.01046038, + "balance_loss_clip": 1.02491403, + "balance_loss_mlp": 1.04683256, + "epoch": 0.1498271456485796, + "flos": 24455848752000.0, + "grad_norm": 2.263792295832721, + "language_loss": 0.90779251, + "learning_rate": 3.851260581551727e-06, + "loss": 0.9297986, + "num_input_tokens_seen": 54097555, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.078125, + "step": 2492, + "time_per_iteration": 2.508188009262085 + }, + { + "auxiliary_loss_clip": 0.01152347, + "auxiliary_loss_mlp": 0.01059815, + "balance_loss_clip": 1.04028893, + "balance_loss_mlp": 1.04883122, + "epoch": 0.14988726890124757, + "flos": 16253601148800.0, + "grad_norm": 2.7210225604175116, + "language_loss": 0.79162109, + "learning_rate": 3.851113162828802e-06, + "loss": 0.8137427, + "num_input_tokens_seen": 54115600, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2493, + "time_per_iteration": 2.4228014945983887 + }, + { + "auxiliary_loss_clip": 0.01150881, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02299631, + "balance_loss_mlp": 1.04643607, + "epoch": 0.14994739215391553, + "flos": 20666555383680.0, + "grad_norm": 2.8095511996528297, + "language_loss": 0.80186284, + "learning_rate": 3.85096567391148e-06, + "loss": 0.82380015, + "num_input_tokens_seen": 54135220, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2494, + "time_per_iteration": 2.4774162769317627 + }, + { + "auxiliary_loss_clip": 0.01149241, + "auxiliary_loss_mlp": 0.01046465, + "balance_loss_clip": 1.02613974, + "balance_loss_mlp": 1.04731214, + "epoch": 0.1500075154065835, + "flos": 70652375239680.0, + "grad_norm": 1.9697458415941205, + "language_loss": 0.65825832, + "learning_rate": 3.850818114805354e-06, + "loss": 0.68021536, + "num_input_tokens_seen": 54161065, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.015625, + "step": 2495, + "time_per_iteration": 2.87758207321167 + }, + { + "auxiliary_loss_clip": 0.01053312, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 0.99992257, + "balance_loss_mlp": 1.01668406, + "epoch": 0.15006763865925146, + "flos": 68011937447040.0, + "grad_norm": 1.1924806916138095, + "language_loss": 0.59488082, + "learning_rate": 3.850670485516019e-06, + "loss": 0.61543506, + "num_input_tokens_seen": 54225095, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.3671875, + "step": 2496, + "time_per_iteration": 3.0807061195373535 + }, + { + "auxiliary_loss_clip": 0.01152427, + "auxiliary_loss_mlp": 0.01054598, + "balance_loss_clip": 1.03467774, + "balance_loss_mlp": 1.0468092, + "epoch": 0.15012776191191943, + "flos": 18916269459840.0, + "grad_norm": 2.296903755979897, + "language_loss": 0.65457296, + "learning_rate": 3.850522786049075e-06, + "loss": 0.67664325, + "num_input_tokens_seen": 54243750, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0546875, + "step": 2497, + "time_per_iteration": 2.4403655529022217 + }, + { + "auxiliary_loss_clip": 0.01155934, + "auxiliary_loss_mlp": 0.01048581, + "balance_loss_clip": 1.03021121, + "balance_loss_mlp": 1.05125117, + "epoch": 0.1501878851645874, + "flos": 23701330638720.0, + "grad_norm": 1.4500790349521295, + "language_loss": 0.75247943, + "learning_rate": 3.850375016410121e-06, + "loss": 0.77452457, + "num_input_tokens_seen": 54266185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2498, + "time_per_iteration": 2.5286927223205566 + }, + { + "auxiliary_loss_clip": 0.01155949, + "auxiliary_loss_mlp": 0.01043093, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04910398, + "epoch": 0.15024800841725539, + "flos": 20412523422720.0, + "grad_norm": 2.1627878003877257, + "language_loss": 0.72073609, + "learning_rate": 3.850227176604761e-06, + "loss": 0.74272656, + "num_input_tokens_seen": 54283940, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0625, + "step": 2499, + "time_per_iteration": 2.4415009021759033 + }, + { + "auxiliary_loss_clip": 0.01153017, + "auxiliary_loss_mlp": 0.01049378, + "balance_loss_clip": 1.03001857, + "balance_loss_mlp": 1.04765654, + "epoch": 0.15030813166992335, + "flos": 31831002812160.0, + "grad_norm": 1.7935878764928508, + "language_loss": 0.7195605, + "learning_rate": 3.850079266638601e-06, + "loss": 0.74158442, + "num_input_tokens_seen": 54304830, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0625, + "step": 2500, + "time_per_iteration": 2.5504300594329834 + }, + { + "auxiliary_loss_clip": 0.01152715, + "auxiliary_loss_mlp": 0.01058033, + "balance_loss_clip": 1.03831601, + "balance_loss_mlp": 1.04960001, + "epoch": 0.15036825492259132, + "flos": 35657822914560.0, + "grad_norm": 2.491284008551419, + "language_loss": 0.64973354, + "learning_rate": 3.849931286517249e-06, + "loss": 0.67184103, + "num_input_tokens_seen": 54325595, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.03125, + "step": 2501, + "time_per_iteration": 2.587292432785034 + }, + { + "auxiliary_loss_clip": 0.01153217, + "auxiliary_loss_mlp": 0.01059755, + "balance_loss_clip": 1.03940582, + "balance_loss_mlp": 1.04861319, + "epoch": 0.15042837817525928, + "flos": 18838163335680.0, + "grad_norm": 2.0240839018319, + "language_loss": 0.83043593, + "learning_rate": 3.849783236246318e-06, + "loss": 0.85256565, + "num_input_tokens_seen": 54342180, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2502, + "time_per_iteration": 2.470350980758667 + }, + { + "auxiliary_loss_clip": 0.01149694, + "auxiliary_loss_mlp": 0.01050766, + "balance_loss_clip": 1.03272963, + "balance_loss_mlp": 1.04702473, + "epoch": 0.15048850142792725, + "flos": 19535548867200.0, + "grad_norm": 2.3174234065433597, + "language_loss": 0.77197748, + "learning_rate": 3.849635115831421e-06, + "loss": 0.79398209, + "num_input_tokens_seen": 54360255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2503, + "time_per_iteration": 2.6598432064056396 + }, + { + "auxiliary_loss_clip": 0.01151836, + "auxiliary_loss_mlp": 0.01043843, + "balance_loss_clip": 1.02585387, + "balance_loss_mlp": 1.04901898, + "epoch": 0.1505486246805952, + "flos": 22017550746240.0, + "grad_norm": 2.1270494317377007, + "language_loss": 0.85432625, + "learning_rate": 3.849486925278176e-06, + "loss": 0.87628305, + "num_input_tokens_seen": 54378260, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2504, + "time_per_iteration": 2.7323355674743652 + }, + { + "auxiliary_loss_clip": 0.01150395, + "auxiliary_loss_mlp": 0.01041023, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04855871, + "epoch": 0.15060874793326318, + "flos": 20743153136640.0, + "grad_norm": 1.6383963769174188, + "language_loss": 0.83226919, + "learning_rate": 3.8493386645922e-06, + "loss": 0.85418344, + "num_input_tokens_seen": 54399745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.015625, + "step": 2505, + "time_per_iteration": 2.4866323471069336 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02851272, + "balance_loss_mlp": 1.04672468, + "epoch": 0.15066887118593117, + "flos": 16471902055680.0, + "grad_norm": 2.268670074130615, + "language_loss": 0.7639147, + "learning_rate": 3.849190333779117e-06, + "loss": 0.78588635, + "num_input_tokens_seen": 54417105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0390625, + "step": 2506, + "time_per_iteration": 2.4266390800476074 + }, + { + "auxiliary_loss_clip": 0.01156061, + "auxiliary_loss_mlp": 0.01043099, + "balance_loss_clip": 1.02452636, + "balance_loss_mlp": 1.04987144, + "epoch": 0.15072899443859913, + "flos": 19859319083520.0, + "grad_norm": 4.189374997051622, + "language_loss": 0.76202261, + "learning_rate": 3.849041932844552e-06, + "loss": 0.78401417, + "num_input_tokens_seen": 54433920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0625, + "step": 2507, + "time_per_iteration": 2.477936029434204 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01043256, + "balance_loss_clip": 1.02519584, + "balance_loss_mlp": 1.04538798, + "epoch": 0.1507891176912671, + "flos": 20776226584320.0, + "grad_norm": 2.4120052182021503, + "language_loss": 0.69041586, + "learning_rate": 3.848893461794131e-06, + "loss": 0.71230054, + "num_input_tokens_seen": 54451540, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2508, + "time_per_iteration": 2.4462738037109375 + }, + { + "auxiliary_loss_clip": 0.01156095, + "auxiliary_loss_mlp": 0.01046654, + "balance_loss_clip": 1.02870142, + "balance_loss_mlp": 1.05190873, + "epoch": 0.15084924094393506, + "flos": 23586631534080.0, + "grad_norm": 1.8904486830015208, + "language_loss": 0.77516425, + "learning_rate": 3.8487449206334845e-06, + "loss": 0.79719174, + "num_input_tokens_seen": 54470800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0390625, + "step": 2509, + "time_per_iteration": 2.47723126411438 + }, + { + "auxiliary_loss_clip": 0.01160822, + "auxiliary_loss_mlp": 0.01052281, + "balance_loss_clip": 1.0307281, + "balance_loss_mlp": 1.05027628, + "epoch": 0.15090936419660303, + "flos": 18911313383040.0, + "grad_norm": 2.607083522867767, + "language_loss": 0.80497003, + "learning_rate": 3.848596309368246e-06, + "loss": 0.82710105, + "num_input_tokens_seen": 54486525, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.1015625, + "step": 2510, + "time_per_iteration": 2.4445176124572754 + }, + { + "auxiliary_loss_clip": 0.0115714, + "auxiliary_loss_mlp": 0.01053415, + "balance_loss_clip": 1.0336144, + "balance_loss_mlp": 1.05078745, + "epoch": 0.150969487449271, + "flos": 17928223073280.0, + "grad_norm": 2.033214689307001, + "language_loss": 0.73913604, + "learning_rate": 3.8484476280040495e-06, + "loss": 0.76124156, + "num_input_tokens_seen": 54503795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0625, + "step": 2511, + "time_per_iteration": 2.4372222423553467 + }, + { + "auxiliary_loss_clip": 0.01152059, + "auxiliary_loss_mlp": 0.01040964, + "balance_loss_clip": 1.02332115, + "balance_loss_mlp": 1.04880548, + "epoch": 0.151029610701939, + "flos": 24243078539520.0, + "grad_norm": 2.077792778828972, + "language_loss": 0.6935091, + "learning_rate": 3.848298876546534e-06, + "loss": 0.71543926, + "num_input_tokens_seen": 54523025, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.03125, + "step": 2512, + "time_per_iteration": 2.5126166343688965 + }, + { + "auxiliary_loss_clip": 0.01154623, + "auxiliary_loss_mlp": 0.01047016, + "balance_loss_clip": 1.02903962, + "balance_loss_mlp": 1.05130434, + "epoch": 0.15108973395460695, + "flos": 30262496641920.0, + "grad_norm": 3.0703205269170364, + "language_loss": 0.73833334, + "learning_rate": 3.84815005500134e-06, + "loss": 0.76034975, + "num_input_tokens_seen": 54545025, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.03125, + "step": 2513, + "time_per_iteration": 2.5560262203216553 + }, + { + "auxiliary_loss_clip": 0.01052097, + "auxiliary_loss_mlp": 0.01001905, + "balance_loss_clip": 0.99995023, + "balance_loss_mlp": 1.01588845, + "epoch": 0.15114985720727492, + "flos": 60437624428800.0, + "grad_norm": 0.8742342414591, + "language_loss": 0.64759278, + "learning_rate": 3.84800116337411e-06, + "loss": 0.6681329, + "num_input_tokens_seen": 54604545, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.36328125, + "step": 2514, + "time_per_iteration": 3.0147135257720947 + }, + { + "auxiliary_loss_clip": 0.01150943, + "auxiliary_loss_mlp": 0.01043819, + "balance_loss_clip": 1.02588964, + "balance_loss_mlp": 1.04910421, + "epoch": 0.15120998045994288, + "flos": 20521691832960.0, + "grad_norm": 2.6951033245551597, + "language_loss": 0.73257691, + "learning_rate": 3.8478522016704916e-06, + "loss": 0.75452447, + "num_input_tokens_seen": 54620590, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0234375, + "step": 2515, + "time_per_iteration": 2.4640309810638428 + }, + { + "auxiliary_loss_clip": 0.01151787, + "auxiliary_loss_mlp": 0.0104255, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04967082, + "epoch": 0.15127010371261085, + "flos": 21178893024000.0, + "grad_norm": 1.8637331039353218, + "language_loss": 0.76990104, + "learning_rate": 3.8477031698961325e-06, + "loss": 0.79184443, + "num_input_tokens_seen": 54640410, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2516, + "time_per_iteration": 2.4672725200653076 + }, + { + "auxiliary_loss_clip": 0.01049641, + "auxiliary_loss_mlp": 0.01003705, + "balance_loss_clip": 1.00177407, + "balance_loss_mlp": 1.01351547, + "epoch": 0.1513302269652788, + "flos": 65320648974720.0, + "grad_norm": 0.745436195681612, + "language_loss": 0.54673135, + "learning_rate": 3.8475540680566835e-06, + "loss": 0.56726485, + "num_input_tokens_seen": 54701430, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36132812, + "step": 2517, + "time_per_iteration": 3.0677855014801025 + }, + { + "auxiliary_loss_clip": 0.01151686, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02126849, + "balance_loss_mlp": 1.04780149, + "epoch": 0.15139035021794678, + "flos": 19135827342720.0, + "grad_norm": 2.2326216563166983, + "language_loss": 0.78515786, + "learning_rate": 3.8474048961577995e-06, + "loss": 0.8070842, + "num_input_tokens_seen": 54720845, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0390625, + "step": 2518, + "time_per_iteration": 3.8305110931396484 + }, + { + "auxiliary_loss_clip": 0.01159011, + "auxiliary_loss_mlp": 0.01048517, + "balance_loss_clip": 1.02851379, + "balance_loss_mlp": 1.05163026, + "epoch": 0.15145047347061477, + "flos": 26578564842240.0, + "grad_norm": 2.1364726943924772, + "language_loss": 0.70153689, + "learning_rate": 3.847255654205137e-06, + "loss": 0.72361219, + "num_input_tokens_seen": 54740495, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0703125, + "step": 2519, + "time_per_iteration": 3.9920616149902344 + }, + { + "auxiliary_loss_clip": 0.01151572, + "auxiliary_loss_mlp": 0.01044317, + "balance_loss_clip": 1.02549386, + "balance_loss_mlp": 1.04812384, + "epoch": 0.15151059672328274, + "flos": 20302959962880.0, + "grad_norm": 1.9802508383478334, + "language_loss": 0.79219216, + "learning_rate": 3.847106342204354e-06, + "loss": 0.81415105, + "num_input_tokens_seen": 54758415, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2520, + "time_per_iteration": 2.4853925704956055 + }, + { + "auxiliary_loss_clip": 0.01155647, + "auxiliary_loss_mlp": 0.01050752, + "balance_loss_clip": 1.03090394, + "balance_loss_mlp": 1.05067897, + "epoch": 0.1515707199759507, + "flos": 27228367831680.0, + "grad_norm": 2.075013959426641, + "language_loss": 0.74324691, + "learning_rate": 3.846956960161114e-06, + "loss": 0.76531088, + "num_input_tokens_seen": 54779355, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2521, + "time_per_iteration": 2.6154706478118896 + }, + { + "auxiliary_loss_clip": 0.01160623, + "auxiliary_loss_mlp": 0.01045817, + "balance_loss_clip": 1.02587366, + "balance_loss_mlp": 1.05273759, + "epoch": 0.15163084322861867, + "flos": 23587349806080.0, + "grad_norm": 2.7623729867934737, + "language_loss": 0.81996739, + "learning_rate": 3.84680750808108e-06, + "loss": 0.84203184, + "num_input_tokens_seen": 54799465, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.078125, + "step": 2522, + "time_per_iteration": 2.4873530864715576 + }, + { + "auxiliary_loss_clip": 0.0104876, + "auxiliary_loss_mlp": 0.01001752, + "balance_loss_clip": 0.99982071, + "balance_loss_mlp": 1.01252866, + "epoch": 0.15169096648128663, + "flos": 66889622021760.0, + "grad_norm": 0.824359498034346, + "language_loss": 0.57915509, + "learning_rate": 3.846657985969922e-06, + "loss": 0.59966022, + "num_input_tokens_seen": 54857665, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.36328125, + "step": 2523, + "time_per_iteration": 2.998990774154663 + }, + { + "auxiliary_loss_clip": 0.01153336, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.03147376, + "balance_loss_mlp": 1.04972816, + "epoch": 0.1517510897339546, + "flos": 29095435848960.0, + "grad_norm": 1.970015434384356, + "language_loss": 0.7485956, + "learning_rate": 3.8465083938333066e-06, + "loss": 0.77063495, + "num_input_tokens_seen": 54879895, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2524, + "time_per_iteration": 2.570068836212158 + }, + { + "auxiliary_loss_clip": 0.0115237, + "auxiliary_loss_mlp": 0.01044934, + "balance_loss_clip": 1.02603889, + "balance_loss_mlp": 1.0488894, + "epoch": 0.1518112129866226, + "flos": 18406553512320.0, + "grad_norm": 1.8388163356316347, + "language_loss": 0.74780655, + "learning_rate": 3.8463587316769085e-06, + "loss": 0.76977956, + "num_input_tokens_seen": 54898245, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2525, + "time_per_iteration": 2.431143283843994 + }, + { + "auxiliary_loss_clip": 0.01157293, + "auxiliary_loss_mlp": 0.01043467, + "balance_loss_clip": 1.02432156, + "balance_loss_mlp": 1.05145812, + "epoch": 0.15187133623929056, + "flos": 19425410789760.0, + "grad_norm": 1.8962457769996104, + "language_loss": 0.79644465, + "learning_rate": 3.846208999506402e-06, + "loss": 0.81845224, + "num_input_tokens_seen": 54917060, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2526, + "time_per_iteration": 2.5167391300201416 + }, + { + "auxiliary_loss_clip": 0.01151222, + "auxiliary_loss_mlp": 0.01044184, + "balance_loss_clip": 1.0271492, + "balance_loss_mlp": 1.05228162, + "epoch": 0.15193145949195852, + "flos": 17566207850880.0, + "grad_norm": 1.8025865198757494, + "language_loss": 0.84928662, + "learning_rate": 3.846059197327466e-06, + "loss": 0.87124068, + "num_input_tokens_seen": 54936365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9921875, + "step": 2527, + "time_per_iteration": 2.4550719261169434 + }, + { + "auxiliary_loss_clip": 0.01151683, + "auxiliary_loss_mlp": 0.01041065, + "balance_loss_clip": 1.02321947, + "balance_loss_mlp": 1.04876995, + "epoch": 0.15199158274462649, + "flos": 36176265866880.0, + "grad_norm": 2.2810224367730156, + "language_loss": 0.69326001, + "learning_rate": 3.845909325145779e-06, + "loss": 0.71518755, + "num_input_tokens_seen": 54961365, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.03125, + "step": 2528, + "time_per_iteration": 2.610042095184326 + }, + { + "auxiliary_loss_clip": 0.01153606, + "auxiliary_loss_mlp": 0.01047581, + "balance_loss_clip": 1.0288415, + "balance_loss_mlp": 1.05137038, + "epoch": 0.15205170599729445, + "flos": 23074042498560.0, + "grad_norm": 2.490892546855648, + "language_loss": 0.86502308, + "learning_rate": 3.845759382967026e-06, + "loss": 0.88703495, + "num_input_tokens_seen": 54980750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2529, + "time_per_iteration": 2.4695634841918945 + }, + { + "auxiliary_loss_clip": 0.01147713, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.04683101, + "epoch": 0.15211182924996242, + "flos": 21908382336000.0, + "grad_norm": 1.8772276619965056, + "language_loss": 0.83002013, + "learning_rate": 3.845609370796893e-06, + "loss": 0.85188091, + "num_input_tokens_seen": 54999675, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2530, + "time_per_iteration": 2.476238489151001 + }, + { + "auxiliary_loss_clip": 0.01153377, + "auxiliary_loss_mlp": 0.01044599, + "balance_loss_clip": 1.02550209, + "balance_loss_mlp": 1.04987955, + "epoch": 0.15217195250263038, + "flos": 13881521865600.0, + "grad_norm": 2.344030506991615, + "language_loss": 0.80540878, + "learning_rate": 3.845459288641066e-06, + "loss": 0.82738853, + "num_input_tokens_seen": 55018295, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2531, + "time_per_iteration": 2.443617105484009 + }, + { + "auxiliary_loss_clip": 0.01149745, + "auxiliary_loss_mlp": 0.01049084, + "balance_loss_clip": 1.03138137, + "balance_loss_mlp": 1.04895151, + "epoch": 0.15223207575529837, + "flos": 24535319592960.0, + "grad_norm": 2.0816362099746017, + "language_loss": 0.79241651, + "learning_rate": 3.8453091365052394e-06, + "loss": 0.81440473, + "num_input_tokens_seen": 55037975, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 1.0078125, + "step": 2532, + "time_per_iteration": 2.5071239471435547 + }, + { + "auxiliary_loss_clip": 0.0115001, + "auxiliary_loss_mlp": 0.01046515, + "balance_loss_clip": 1.02694106, + "balance_loss_mlp": 1.04952455, + "epoch": 0.15229219900796634, + "flos": 25556798563200.0, + "grad_norm": 1.8298502444413876, + "language_loss": 0.87712961, + "learning_rate": 3.845158914395105e-06, + "loss": 0.89909488, + "num_input_tokens_seen": 55057135, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2533, + "time_per_iteration": 2.5262463092803955 + }, + { + "auxiliary_loss_clip": 0.01150696, + "auxiliary_loss_mlp": 0.01047398, + "balance_loss_clip": 1.02932572, + "balance_loss_mlp": 1.04766071, + "epoch": 0.1523523222606343, + "flos": 18217806520320.0, + "grad_norm": 2.2606742211331556, + "language_loss": 0.79057097, + "learning_rate": 3.84500862231636e-06, + "loss": 0.81255192, + "num_input_tokens_seen": 55075525, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.03125, + "step": 2534, + "time_per_iteration": 2.4421815872192383 + }, + { + "auxiliary_loss_clip": 0.01156406, + "auxiliary_loss_mlp": 0.01041573, + "balance_loss_clip": 1.02177238, + "balance_loss_mlp": 1.04847312, + "epoch": 0.15241244551330227, + "flos": 13260087642240.0, + "grad_norm": 2.8989864742133933, + "language_loss": 0.76862979, + "learning_rate": 3.844858260274702e-06, + "loss": 0.7906096, + "num_input_tokens_seen": 55090845, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.078125, + "step": 2535, + "time_per_iteration": 2.4193530082702637 + }, + { + "auxiliary_loss_clip": 0.01156147, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02153718, + "balance_loss_mlp": 1.04885459, + "epoch": 0.15247256876597023, + "flos": 19715568854400.0, + "grad_norm": 2.234687708038525, + "language_loss": 0.78185135, + "learning_rate": 3.844707828275835e-06, + "loss": 0.80381751, + "num_input_tokens_seen": 55108750, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0703125, + "step": 2536, + "time_per_iteration": 2.478066921234131 + }, + { + "auxiliary_loss_clip": 0.01150448, + "auxiliary_loss_mlp": 0.0105096, + "balance_loss_clip": 1.03305459, + "balance_loss_mlp": 1.05067229, + "epoch": 0.1525326920186382, + "flos": 20375858615040.0, + "grad_norm": 2.124557148089124, + "language_loss": 0.74979979, + "learning_rate": 3.844557326325461e-06, + "loss": 0.77181387, + "num_input_tokens_seen": 55126750, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2537, + "time_per_iteration": 2.455779552459717 + }, + { + "auxiliary_loss_clip": 0.01152934, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02545929, + "balance_loss_mlp": 1.04965043, + "epoch": 0.15259281527130616, + "flos": 13589963170560.0, + "grad_norm": 2.005826380833244, + "language_loss": 0.77631724, + "learning_rate": 3.8444067544292896e-06, + "loss": 0.79828459, + "num_input_tokens_seen": 55144690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2538, + "time_per_iteration": 2.527730941772461 + }, + { + "auxiliary_loss_clip": 0.01147714, + "auxiliary_loss_mlp": 0.01040159, + "balance_loss_clip": 1.02308786, + "balance_loss_mlp": 1.04806781, + "epoch": 0.15265293852397416, + "flos": 22860374446080.0, + "grad_norm": 1.6961003069906246, + "language_loss": 0.89707708, + "learning_rate": 3.844256112593029e-06, + "loss": 0.9189558, + "num_input_tokens_seen": 55166055, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.99609375, + "step": 2539, + "time_per_iteration": 2.485410451889038 + }, + { + "auxiliary_loss_clip": 0.01151642, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02491331, + "balance_loss_mlp": 1.05028892, + "epoch": 0.15271306177664212, + "flos": 29238108670080.0, + "grad_norm": 2.1834515010765627, + "language_loss": 0.93514961, + "learning_rate": 3.844105400822391e-06, + "loss": 0.95709753, + "num_input_tokens_seen": 55186285, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.015625, + "step": 2540, + "time_per_iteration": 2.5399627685546875 + }, + { + "auxiliary_loss_clip": 0.01144897, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0266571, + "balance_loss_mlp": 1.04625463, + "epoch": 0.1527731850293101, + "flos": 31246269310080.0, + "grad_norm": 1.9271166035098393, + "language_loss": 0.75039941, + "learning_rate": 3.843954619123092e-06, + "loss": 0.77228808, + "num_input_tokens_seen": 55207915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2541, + "time_per_iteration": 2.516559362411499 + }, + { + "auxiliary_loss_clip": 0.01147451, + "auxiliary_loss_mlp": 0.01048877, + "balance_loss_clip": 1.03025603, + "balance_loss_mlp": 1.04787207, + "epoch": 0.15283330828197805, + "flos": 22382079920640.0, + "grad_norm": 1.7480154890803248, + "language_loss": 0.81308234, + "learning_rate": 3.84380376750085e-06, + "loss": 0.83504558, + "num_input_tokens_seen": 55227860, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.99609375, + "step": 2542, + "time_per_iteration": 2.4681694507598877 + }, + { + "auxiliary_loss_clip": 0.01150381, + "auxiliary_loss_mlp": 0.01050782, + "balance_loss_clip": 1.03213799, + "balance_loss_mlp": 1.04772067, + "epoch": 0.15289343153464602, + "flos": 25520133755520.0, + "grad_norm": 2.009812895323552, + "language_loss": 0.77568293, + "learning_rate": 3.843652845961383e-06, + "loss": 0.79769456, + "num_input_tokens_seen": 55247330, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0234375, + "step": 2543, + "time_per_iteration": 2.4899120330810547 + }, + { + "auxiliary_loss_clip": 0.01147346, + "auxiliary_loss_mlp": 0.01045177, + "balance_loss_clip": 1.0266397, + "balance_loss_mlp": 1.04692626, + "epoch": 0.15295355478731398, + "flos": 22710016114560.0, + "grad_norm": 2.3128696364379935, + "language_loss": 0.86483204, + "learning_rate": 3.843501854510416e-06, + "loss": 0.88675725, + "num_input_tokens_seen": 55266195, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2544, + "time_per_iteration": 2.4774844646453857 + }, + { + "auxiliary_loss_clip": 0.01152485, + "auxiliary_loss_mlp": 0.010531, + "balance_loss_clip": 1.03287029, + "balance_loss_mlp": 1.04675508, + "epoch": 0.15301367803998198, + "flos": 23251907669760.0, + "grad_norm": 2.0966566192890106, + "language_loss": 0.8228749, + "learning_rate": 3.843350793153673e-06, + "loss": 0.84493077, + "num_input_tokens_seen": 55283305, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0546875, + "step": 2545, + "time_per_iteration": 2.4526925086975098 + }, + { + "auxiliary_loss_clip": 0.01149503, + "auxiliary_loss_mlp": 0.01044491, + "balance_loss_clip": 1.02614498, + "balance_loss_mlp": 1.04802954, + "epoch": 0.15307380129264994, + "flos": 25886279041920.0, + "grad_norm": 2.540509049886226, + "language_loss": 0.70711339, + "learning_rate": 3.843199661896884e-06, + "loss": 0.72905338, + "num_input_tokens_seen": 55303035, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2546, + "time_per_iteration": 2.5009732246398926 + }, + { + "auxiliary_loss_clip": 0.01152354, + "auxiliary_loss_mlp": 0.01043417, + "balance_loss_clip": 1.02423596, + "balance_loss_mlp": 1.04967904, + "epoch": 0.1531339245453179, + "flos": 46973239205760.0, + "grad_norm": 1.5770850469719229, + "language_loss": 0.77521312, + "learning_rate": 3.843048460745779e-06, + "loss": 0.79717076, + "num_input_tokens_seen": 55327570, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2547, + "time_per_iteration": 2.6822421550750732 + }, + { + "auxiliary_loss_clip": 0.01152263, + "auxiliary_loss_mlp": 0.01047861, + "balance_loss_clip": 1.02932382, + "balance_loss_mlp": 1.04904902, + "epoch": 0.15319404779798587, + "flos": 35882049565440.0, + "grad_norm": 2.0900989153424976, + "language_loss": 0.73985445, + "learning_rate": 3.842897189706092e-06, + "loss": 0.76185566, + "num_input_tokens_seen": 55351090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2548, + "time_per_iteration": 2.59080171585083 + }, + { + "auxiliary_loss_clip": 0.01150578, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03158915, + "balance_loss_mlp": 1.04806828, + "epoch": 0.15325417105065384, + "flos": 25664638170240.0, + "grad_norm": 1.499185349529517, + "language_loss": 0.80589813, + "learning_rate": 3.842745848783558e-06, + "loss": 0.82791066, + "num_input_tokens_seen": 55371050, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2549, + "time_per_iteration": 2.498096227645874 + }, + { + "auxiliary_loss_clip": 0.01150664, + "auxiliary_loss_mlp": 0.01048572, + "balance_loss_clip": 1.02951026, + "balance_loss_mlp": 1.04750037, + "epoch": 0.1533142943033218, + "flos": 18770831291520.0, + "grad_norm": 1.687491024735964, + "language_loss": 0.74760693, + "learning_rate": 3.842594437983917e-06, + "loss": 0.76959932, + "num_input_tokens_seen": 55390375, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2550, + "time_per_iteration": 2.4740684032440186 + }, + { + "auxiliary_loss_clip": 0.01153822, + "auxiliary_loss_mlp": 0.01039682, + "balance_loss_clip": 1.02035773, + "balance_loss_mlp": 1.04903841, + "epoch": 0.15337441755598977, + "flos": 23107367341440.0, + "grad_norm": 2.205632522725416, + "language_loss": 0.76839805, + "learning_rate": 3.8424429573129115e-06, + "loss": 0.79033309, + "num_input_tokens_seen": 55408890, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.046875, + "step": 2551, + "time_per_iteration": 2.468886375427246 + }, + { + "auxiliary_loss_clip": 0.01045401, + "auxiliary_loss_mlp": 0.01020401, + "balance_loss_clip": 1.01873255, + "balance_loss_mlp": 1.0102303, + "epoch": 0.15343454080865776, + "flos": 59861079227520.0, + "grad_norm": 0.9464853846906186, + "language_loss": 0.56666422, + "learning_rate": 3.842291406776283e-06, + "loss": 0.58732224, + "num_input_tokens_seen": 55463815, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.3515625, + "step": 2552, + "time_per_iteration": 3.0059380531311035 + }, + { + "auxiliary_loss_clip": 0.01152358, + "auxiliary_loss_mlp": 0.010458, + "balance_loss_clip": 1.02684569, + "balance_loss_mlp": 1.04793155, + "epoch": 0.15349466406132573, + "flos": 11910887959680.0, + "grad_norm": 3.2490122092843947, + "language_loss": 0.88505352, + "learning_rate": 3.84213978637978e-06, + "loss": 0.90703511, + "num_input_tokens_seen": 55481050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2553, + "time_per_iteration": 2.4523322582244873 + }, + { + "auxiliary_loss_clip": 0.01153624, + "auxiliary_loss_mlp": 0.01047537, + "balance_loss_clip": 1.02858269, + "balance_loss_mlp": 1.04771137, + "epoch": 0.1535547873139937, + "flos": 24096922099200.0, + "grad_norm": 1.8003580088176259, + "language_loss": 0.78462374, + "learning_rate": 3.841988096129152e-06, + "loss": 0.80663538, + "num_input_tokens_seen": 55500050, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0625, + "step": 2554, + "time_per_iteration": 2.48526668548584 + }, + { + "auxiliary_loss_clip": 0.01154341, + "auxiliary_loss_mlp": 0.01052395, + "balance_loss_clip": 1.03212881, + "balance_loss_mlp": 1.04941773, + "epoch": 0.15361491056666166, + "flos": 17566459246080.0, + "grad_norm": 2.4926146542113763, + "language_loss": 0.78344929, + "learning_rate": 3.841836336030151e-06, + "loss": 0.80551672, + "num_input_tokens_seen": 55518125, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.046875, + "step": 2555, + "time_per_iteration": 2.4687228202819824 + }, + { + "auxiliary_loss_clip": 0.01149124, + "auxiliary_loss_mlp": 0.01053536, + "balance_loss_clip": 1.03543973, + "balance_loss_mlp": 1.04890609, + "epoch": 0.15367503381932962, + "flos": 25046041121280.0, + "grad_norm": 1.6634961059278193, + "language_loss": 0.76901627, + "learning_rate": 3.8416845060885305e-06, + "loss": 0.7910428, + "num_input_tokens_seen": 55540960, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2556, + "time_per_iteration": 2.5006635189056396 + }, + { + "auxiliary_loss_clip": 0.01145988, + "auxiliary_loss_mlp": 0.0104239, + "balance_loss_clip": 1.02362633, + "balance_loss_mlp": 1.04657805, + "epoch": 0.15373515707199759, + "flos": 21507332008320.0, + "grad_norm": 1.8623555031997667, + "language_loss": 0.89489496, + "learning_rate": 3.84153260631005e-06, + "loss": 0.9167788, + "num_input_tokens_seen": 55559210, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2557, + "time_per_iteration": 2.4434657096862793 + }, + { + "auxiliary_loss_clip": 0.01151609, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.0263536, + "balance_loss_mlp": 1.04834831, + "epoch": 0.15379528032466555, + "flos": 25994729180160.0, + "grad_norm": 2.0348980361104587, + "language_loss": 0.7119934, + "learning_rate": 3.841380636700468e-06, + "loss": 0.73397368, + "num_input_tokens_seen": 55578925, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2558, + "time_per_iteration": 2.490226984024048 + }, + { + "auxiliary_loss_clip": 0.01152232, + "auxiliary_loss_mlp": 0.01047681, + "balance_loss_clip": 1.02863097, + "balance_loss_mlp": 1.04888546, + "epoch": 0.15385540357733354, + "flos": 19277315015040.0, + "grad_norm": 2.2935483083292705, + "language_loss": 0.92370701, + "learning_rate": 3.841228597265548e-06, + "loss": 0.94570613, + "num_input_tokens_seen": 55597255, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.03125, + "step": 2559, + "time_per_iteration": 3.885131597518921 + }, + { + "auxiliary_loss_clip": 0.01155373, + "auxiliary_loss_mlp": 0.01053347, + "balance_loss_clip": 1.03331971, + "balance_loss_mlp": 1.05068171, + "epoch": 0.1539155268300015, + "flos": 28549126920960.0, + "grad_norm": 5.140445938018919, + "language_loss": 0.63637704, + "learning_rate": 3.841076488011055e-06, + "loss": 0.65846419, + "num_input_tokens_seen": 55619515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2560, + "time_per_iteration": 5.343889236450195 + }, + { + "auxiliary_loss_clip": 0.01153839, + "auxiliary_loss_mlp": 0.01046849, + "balance_loss_clip": 1.02725124, + "balance_loss_mlp": 1.04950392, + "epoch": 0.15397565008266947, + "flos": 23547883737600.0, + "grad_norm": 1.8613162525264346, + "language_loss": 0.88230681, + "learning_rate": 3.8409243089427574e-06, + "loss": 0.90431374, + "num_input_tokens_seen": 55640050, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.046875, + "step": 2561, + "time_per_iteration": 2.4648611545562744 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.01041909, + "balance_loss_clip": 1.02433765, + "balance_loss_mlp": 1.0477581, + "epoch": 0.15403577333533744, + "flos": 17129821518720.0, + "grad_norm": 1.8458305826175445, + "language_loss": 0.82909077, + "learning_rate": 3.840772060066425e-06, + "loss": 0.85096323, + "num_input_tokens_seen": 55658695, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 2562, + "time_per_iteration": 2.4327874183654785 + }, + { + "auxiliary_loss_clip": 0.01160792, + "auxiliary_loss_mlp": 0.0104767, + "balance_loss_clip": 1.02614117, + "balance_loss_mlp": 1.05274105, + "epoch": 0.1540958965880054, + "flos": 17894503180800.0, + "grad_norm": 1.8513620412223286, + "language_loss": 0.74713194, + "learning_rate": 3.840619741387832e-06, + "loss": 0.7692166, + "num_input_tokens_seen": 55676340, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.078125, + "step": 2563, + "time_per_iteration": 2.4246435165405273 + }, + { + "auxiliary_loss_clip": 0.01152598, + "auxiliary_loss_mlp": 0.01044039, + "balance_loss_clip": 1.02425051, + "balance_loss_mlp": 1.04708791, + "epoch": 0.15415601984067337, + "flos": 32161057908480.0, + "grad_norm": 4.308351588789828, + "language_loss": 0.75896233, + "learning_rate": 3.8404673529127534e-06, + "loss": 0.78092873, + "num_input_tokens_seen": 55698890, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0546875, + "step": 2564, + "time_per_iteration": 2.5528018474578857 + }, + { + "auxiliary_loss_clip": 0.01149402, + "auxiliary_loss_mlp": 0.01050825, + "balance_loss_clip": 1.03233564, + "balance_loss_mlp": 1.04782677, + "epoch": 0.15421614309334136, + "flos": 24024418496640.0, + "grad_norm": 1.9915177170702032, + "language_loss": 0.70825899, + "learning_rate": 3.840314894646969e-06, + "loss": 0.73026133, + "num_input_tokens_seen": 55718535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2565, + "time_per_iteration": 2.5007505416870117 + }, + { + "auxiliary_loss_clip": 0.01149717, + "auxiliary_loss_mlp": 0.01050801, + "balance_loss_clip": 1.0315845, + "balance_loss_mlp": 1.04728019, + "epoch": 0.15427626634600933, + "flos": 24386290064640.0, + "grad_norm": 2.308308002927142, + "language_loss": 0.71535969, + "learning_rate": 3.840162366596259e-06, + "loss": 0.73736489, + "num_input_tokens_seen": 55738970, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0234375, + "step": 2566, + "time_per_iteration": 2.498033285140991 + }, + { + "auxiliary_loss_clip": 0.01143736, + "auxiliary_loss_mlp": 0.01042132, + "balance_loss_clip": 1.02379811, + "balance_loss_mlp": 1.04381752, + "epoch": 0.1543363895986773, + "flos": 23331522165120.0, + "grad_norm": 1.7584763964610812, + "language_loss": 0.85129261, + "learning_rate": 3.840009768766408e-06, + "loss": 0.87315124, + "num_input_tokens_seen": 55759585, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0, + "step": 2567, + "time_per_iteration": 2.46708083152771 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01050725, + "balance_loss_clip": 1.03266454, + "balance_loss_mlp": 1.0491097, + "epoch": 0.15439651285134526, + "flos": 24274284480000.0, + "grad_norm": 2.4904852760766127, + "language_loss": 0.78025472, + "learning_rate": 3.839857101163202e-06, + "loss": 0.80226958, + "num_input_tokens_seen": 55779250, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2568, + "time_per_iteration": 2.476029634475708 + }, + { + "auxiliary_loss_clip": 0.01150703, + "auxiliary_loss_mlp": 0.01039967, + "balance_loss_clip": 1.01974905, + "balance_loss_mlp": 1.04835856, + "epoch": 0.15445663610401322, + "flos": 22456163721600.0, + "grad_norm": 1.967048361077992, + "language_loss": 0.70183134, + "learning_rate": 3.83970436379243e-06, + "loss": 0.72373807, + "num_input_tokens_seen": 55800470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2569, + "time_per_iteration": 2.4566383361816406 + }, + { + "auxiliary_loss_clip": 0.011445, + "auxiliary_loss_mlp": 0.0104299, + "balance_loss_clip": 1.0243814, + "balance_loss_mlp": 1.04563344, + "epoch": 0.1545167593566812, + "flos": 22049510872320.0, + "grad_norm": 1.7954711420319855, + "language_loss": 0.76502788, + "learning_rate": 3.839551556659884e-06, + "loss": 0.78690279, + "num_input_tokens_seen": 55817795, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2570, + "time_per_iteration": 2.4543209075927734 + }, + { + "auxiliary_loss_clip": 0.01149071, + "auxiliary_loss_mlp": 0.01044712, + "balance_loss_clip": 1.02532816, + "balance_loss_mlp": 1.04811645, + "epoch": 0.15457688260934915, + "flos": 19318253541120.0, + "grad_norm": 7.2402617485583525, + "language_loss": 0.77214551, + "learning_rate": 3.839398679771359e-06, + "loss": 0.7940833, + "num_input_tokens_seen": 55836125, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2571, + "time_per_iteration": 2.4532222747802734 + }, + { + "auxiliary_loss_clip": 0.0114992, + "auxiliary_loss_mlp": 0.01049579, + "balance_loss_clip": 1.03086352, + "balance_loss_mlp": 1.04835165, + "epoch": 0.15463700586201715, + "flos": 24133981956480.0, + "grad_norm": 1.949392721600437, + "language_loss": 0.82254899, + "learning_rate": 3.839245733132652e-06, + "loss": 0.84454399, + "num_input_tokens_seen": 55855280, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2572, + "time_per_iteration": 2.4919703006744385 + }, + { + "auxiliary_loss_clip": 0.01152049, + "auxiliary_loss_mlp": 0.01047577, + "balance_loss_clip": 1.02838445, + "balance_loss_mlp": 1.04827368, + "epoch": 0.1546971291146851, + "flos": 22420935457920.0, + "grad_norm": 1.621727953381826, + "language_loss": 0.90506172, + "learning_rate": 3.839092716749563e-06, + "loss": 0.92705798, + "num_input_tokens_seen": 55875695, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2573, + "time_per_iteration": 2.4679911136627197 + }, + { + "auxiliary_loss_clip": 0.01152025, + "auxiliary_loss_mlp": 0.01056653, + "balance_loss_clip": 1.03724563, + "balance_loss_mlp": 1.04919529, + "epoch": 0.15475725236735308, + "flos": 17530225401600.0, + "grad_norm": 1.7899098306423509, + "language_loss": 0.70378339, + "learning_rate": 3.838939630627893e-06, + "loss": 0.72587025, + "num_input_tokens_seen": 55894575, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2574, + "time_per_iteration": 2.448148012161255 + }, + { + "auxiliary_loss_clip": 0.01150284, + "auxiliary_loss_mlp": 0.01048729, + "balance_loss_clip": 1.02798676, + "balance_loss_mlp": 1.04641008, + "epoch": 0.15481737562002104, + "flos": 22561740771840.0, + "grad_norm": 1.761755301023602, + "language_loss": 0.82718939, + "learning_rate": 3.838786474773448e-06, + "loss": 0.84917951, + "num_input_tokens_seen": 55912855, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 2575, + "time_per_iteration": 2.4515788555145264 + }, + { + "auxiliary_loss_clip": 0.011498, + "auxiliary_loss_mlp": 0.01047927, + "balance_loss_clip": 1.02937794, + "balance_loss_mlp": 1.0456214, + "epoch": 0.154877498872689, + "flos": 24900567039360.0, + "grad_norm": 2.21774000772259, + "language_loss": 0.84661531, + "learning_rate": 3.838633249192036e-06, + "loss": 0.86859256, + "num_input_tokens_seen": 55932375, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0390625, + "step": 2576, + "time_per_iteration": 2.5052003860473633 + }, + { + "auxiliary_loss_clip": 0.01149148, + "auxiliary_loss_mlp": 0.01043114, + "balance_loss_clip": 1.02414751, + "balance_loss_mlp": 1.04679108, + "epoch": 0.15493762212535697, + "flos": 28147501975680.0, + "grad_norm": 1.816317520286285, + "language_loss": 0.81942815, + "learning_rate": 3.838479953889465e-06, + "loss": 0.84135079, + "num_input_tokens_seen": 55953970, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2577, + "time_per_iteration": 2.5133895874023438 + }, + { + "auxiliary_loss_clip": 0.01151988, + "auxiliary_loss_mlp": 0.01049852, + "balance_loss_clip": 1.03090954, + "balance_loss_mlp": 1.04980743, + "epoch": 0.15499774537802496, + "flos": 25411073086080.0, + "grad_norm": 2.384736720709717, + "language_loss": 0.76260924, + "learning_rate": 3.8383265888715525e-06, + "loss": 0.78462768, + "num_input_tokens_seen": 55973120, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2578, + "time_per_iteration": 2.5140793323516846 + }, + { + "auxiliary_loss_clip": 0.01151489, + "auxiliary_loss_mlp": 0.01045761, + "balance_loss_clip": 1.02630556, + "balance_loss_mlp": 1.04832911, + "epoch": 0.15505786863069293, + "flos": 22091562720000.0, + "grad_norm": 2.651100693067537, + "language_loss": 0.82420707, + "learning_rate": 3.83817315414411e-06, + "loss": 0.84617954, + "num_input_tokens_seen": 55993260, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 2579, + "time_per_iteration": 2.4410548210144043 + }, + { + "auxiliary_loss_clip": 0.01152359, + "auxiliary_loss_mlp": 0.01049402, + "balance_loss_clip": 1.03056741, + "balance_loss_mlp": 1.05137682, + "epoch": 0.1551179918833609, + "flos": 18917131386240.0, + "grad_norm": 1.6356270056083286, + "language_loss": 0.80460835, + "learning_rate": 3.838019649712958e-06, + "loss": 0.82662606, + "num_input_tokens_seen": 56012130, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2580, + "time_per_iteration": 2.457929849624634 + }, + { + "auxiliary_loss_clip": 0.01050492, + "auxiliary_loss_mlp": 0.01014696, + "balance_loss_clip": 1.0128479, + "balance_loss_mlp": 1.01473403, + "epoch": 0.15517811513602886, + "flos": 66239172587520.0, + "grad_norm": 0.84873853717235, + "language_loss": 0.58840239, + "learning_rate": 3.8378660755839166e-06, + "loss": 0.60905427, + "num_input_tokens_seen": 56079045, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.35742188, + "step": 2581, + "time_per_iteration": 3.1725480556488037 + }, + { + "auxiliary_loss_clip": 0.01152966, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02615237, + "balance_loss_mlp": 1.04869819, + "epoch": 0.15523823838869683, + "flos": 24021078531840.0, + "grad_norm": 1.8637973548327127, + "language_loss": 0.85214508, + "learning_rate": 3.8377124317628095e-06, + "loss": 0.87412429, + "num_input_tokens_seen": 56098745, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2582, + "time_per_iteration": 2.486454963684082 + }, + { + "auxiliary_loss_clip": 0.01150766, + "auxiliary_loss_mlp": 0.01055198, + "balance_loss_clip": 1.03534937, + "balance_loss_mlp": 1.04837251, + "epoch": 0.1552983616413648, + "flos": 20485062938880.0, + "grad_norm": 2.457099081417407, + "language_loss": 0.78432047, + "learning_rate": 3.8375587182554625e-06, + "loss": 0.80638009, + "num_input_tokens_seen": 56117655, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0234375, + "step": 2583, + "time_per_iteration": 2.468686580657959 + }, + { + "auxiliary_loss_clip": 0.01151702, + "auxiliary_loss_mlp": 0.01054386, + "balance_loss_clip": 1.03458571, + "balance_loss_mlp": 1.04853427, + "epoch": 0.15535848489403276, + "flos": 32123710742400.0, + "grad_norm": 1.6727812592242826, + "language_loss": 0.76121294, + "learning_rate": 3.837404935067705e-06, + "loss": 0.78327382, + "num_input_tokens_seen": 56141960, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2584, + "time_per_iteration": 2.5471444129943848 + }, + { + "auxiliary_loss_clip": 0.01149066, + "auxiliary_loss_mlp": 0.01046504, + "balance_loss_clip": 1.02746594, + "balance_loss_mlp": 1.04740906, + "epoch": 0.15541860814670075, + "flos": 19098444263040.0, + "grad_norm": 2.0194610159936324, + "language_loss": 0.75623107, + "learning_rate": 3.837251082205368e-06, + "loss": 0.7781868, + "num_input_tokens_seen": 56161430, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.015625, + "step": 2585, + "time_per_iteration": 2.4448020458221436 + }, + { + "auxiliary_loss_clip": 0.01146182, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03101528, + "balance_loss_mlp": 1.04662418, + "epoch": 0.1554787313993687, + "flos": 19172097100800.0, + "grad_norm": 2.233481730992117, + "language_loss": 0.611651, + "learning_rate": 3.837097159674286e-06, + "loss": 0.63361114, + "num_input_tokens_seen": 56179390, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2586, + "time_per_iteration": 2.4375994205474854 + }, + { + "auxiliary_loss_clip": 0.01150781, + "auxiliary_loss_mlp": 0.01047148, + "balance_loss_clip": 1.02814651, + "balance_loss_mlp": 1.04623449, + "epoch": 0.15553885465203668, + "flos": 16143822207360.0, + "grad_norm": 1.8194244944539537, + "language_loss": 0.8108865, + "learning_rate": 3.836943167480296e-06, + "loss": 0.83286583, + "num_input_tokens_seen": 56198020, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.046875, + "step": 2587, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01155076, + "auxiliary_loss_mlp": 0.01058653, + "balance_loss_clip": 1.03512144, + "balance_loss_mlp": 1.04851258, + "epoch": 0.15559897790470464, + "flos": 25337779384320.0, + "grad_norm": 1.8978014455674168, + "language_loss": 0.88844347, + "learning_rate": 3.836789105629236e-06, + "loss": 0.91058075, + "num_input_tokens_seen": 56218165, + "router_z_loss_clip": 0.23535156, + "router_z_loss_mlp": 1.0625, + "step": 2588, + "time_per_iteration": 2.519864559173584 + }, + { + "auxiliary_loss_clip": 0.01150101, + "auxiliary_loss_mlp": 0.01053957, + "balance_loss_clip": 1.03351235, + "balance_loss_mlp": 1.04859662, + "epoch": 0.1556591011573726, + "flos": 23148772744320.0, + "grad_norm": 2.6765596364055266, + "language_loss": 0.64950025, + "learning_rate": 3.83663497412695e-06, + "loss": 0.6715408, + "num_input_tokens_seen": 56237160, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.015625, + "step": 2589, + "time_per_iteration": 2.5106732845306396 + }, + { + "auxiliary_loss_clip": 0.01150618, + "auxiliary_loss_mlp": 0.01044948, + "balance_loss_clip": 1.02451587, + "balance_loss_mlp": 1.0483036, + "epoch": 0.15571922441004057, + "flos": 25370888745600.0, + "grad_norm": 1.7614316666112095, + "language_loss": 0.82610166, + "learning_rate": 3.836480772979281e-06, + "loss": 0.84805739, + "num_input_tokens_seen": 56257610, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2590, + "time_per_iteration": 2.519573211669922 + }, + { + "auxiliary_loss_clip": 0.01151128, + "auxiliary_loss_mlp": 0.01047405, + "balance_loss_clip": 1.02761662, + "balance_loss_mlp": 1.04740536, + "epoch": 0.15577934766270854, + "flos": 14501375890560.0, + "grad_norm": 2.1478399705358195, + "language_loss": 0.78919029, + "learning_rate": 3.836326502192077e-06, + "loss": 0.81117558, + "num_input_tokens_seen": 56275215, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2591, + "time_per_iteration": 2.446871519088745 + }, + { + "auxiliary_loss_clip": 0.01150021, + "auxiliary_loss_mlp": 0.01051358, + "balance_loss_clip": 1.03271413, + "balance_loss_mlp": 1.04902434, + "epoch": 0.15583947091537653, + "flos": 37414537372800.0, + "grad_norm": 1.9877262596002243, + "language_loss": 0.64780253, + "learning_rate": 3.836172161771189e-06, + "loss": 0.66981632, + "num_input_tokens_seen": 56297130, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2592, + "time_per_iteration": 2.5992095470428467 + }, + { + "auxiliary_loss_clip": 0.01156577, + "auxiliary_loss_mlp": 0.01052338, + "balance_loss_clip": 1.03195322, + "balance_loss_mlp": 1.0518856, + "epoch": 0.1558995941680445, + "flos": 21834729498240.0, + "grad_norm": 2.6077304694487062, + "language_loss": 0.81806099, + "learning_rate": 3.836017751722467e-06, + "loss": 0.84015012, + "num_input_tokens_seen": 56314995, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2593, + "time_per_iteration": 2.4317471981048584 + }, + { + "auxiliary_loss_clip": 0.01148564, + "auxiliary_loss_mlp": 0.01049732, + "balance_loss_clip": 1.02876306, + "balance_loss_mlp": 1.04862404, + "epoch": 0.15595971742071246, + "flos": 19792633484160.0, + "grad_norm": 2.3131099691306445, + "language_loss": 0.72585857, + "learning_rate": 3.8358632720517695e-06, + "loss": 0.7478416, + "num_input_tokens_seen": 56334005, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0, + "step": 2594, + "time_per_iteration": 2.454946994781494 + }, + { + "auxiliary_loss_clip": 0.01145676, + "auxiliary_loss_mlp": 0.01044453, + "balance_loss_clip": 1.02514088, + "balance_loss_mlp": 1.0476191, + "epoch": 0.15601984067338043, + "flos": 26722135503360.0, + "grad_norm": 1.980280068020953, + "language_loss": 0.8170377, + "learning_rate": 3.835708722764952e-06, + "loss": 0.83893895, + "num_input_tokens_seen": 56353795, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 2595, + "time_per_iteration": 2.4859232902526855 + }, + { + "auxiliary_loss_clip": 0.01149214, + "auxiliary_loss_mlp": 0.01047121, + "balance_loss_clip": 1.02761889, + "balance_loss_mlp": 1.04722846, + "epoch": 0.1560799639260484, + "flos": 18369278173440.0, + "grad_norm": 2.3729637830877177, + "language_loss": 0.86587811, + "learning_rate": 3.835554103867876e-06, + "loss": 0.88784146, + "num_input_tokens_seen": 56373195, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2596, + "time_per_iteration": 2.431851387023926 + }, + { + "auxiliary_loss_clip": 0.0114636, + "auxiliary_loss_mlp": 0.01043935, + "balance_loss_clip": 1.02558839, + "balance_loss_mlp": 1.04831815, + "epoch": 0.15614008717871636, + "flos": 22598980197120.0, + "grad_norm": 1.6624104890405602, + "language_loss": 0.68610018, + "learning_rate": 3.835399415366404e-06, + "loss": 0.70800316, + "num_input_tokens_seen": 56391525, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2597, + "time_per_iteration": 2.447265625 + }, + { + "auxiliary_loss_clip": 0.01144111, + "auxiliary_loss_mlp": 0.01040539, + "balance_loss_clip": 1.02210891, + "balance_loss_mlp": 1.04714298, + "epoch": 0.15620021043138435, + "flos": 22746860490240.0, + "grad_norm": 1.638980754682227, + "language_loss": 0.79885375, + "learning_rate": 3.8352446572664035e-06, + "loss": 0.82070029, + "num_input_tokens_seen": 56410715, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2598, + "time_per_iteration": 2.4641571044921875 + }, + { + "auxiliary_loss_clip": 0.01141262, + "auxiliary_loss_mlp": 0.01039052, + "balance_loss_clip": 1.02003777, + "balance_loss_mlp": 1.04484367, + "epoch": 0.15626033368405232, + "flos": 13114936782720.0, + "grad_norm": 2.19687533686526, + "language_loss": 0.82877028, + "learning_rate": 3.8350898295737405e-06, + "loss": 0.85057342, + "num_input_tokens_seen": 56429170, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96484375, + "step": 2599, + "time_per_iteration": 2.419464111328125 + }, + { + "auxiliary_loss_clip": 0.01155461, + "auxiliary_loss_mlp": 0.01052363, + "balance_loss_clip": 1.03115571, + "balance_loss_mlp": 1.04991198, + "epoch": 0.15632045693672028, + "flos": 16472297105280.0, + "grad_norm": 3.412785735027946, + "language_loss": 0.81813747, + "learning_rate": 3.834934932294287e-06, + "loss": 0.84021574, + "num_input_tokens_seen": 56445685, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0546875, + "step": 2600, + "time_per_iteration": 2.408848524093628 + }, + { + "auxiliary_loss_clip": 0.01152936, + "auxiliary_loss_mlp": 0.0104778, + "balance_loss_clip": 1.02813435, + "balance_loss_mlp": 1.05145574, + "epoch": 0.15638058018938825, + "flos": 20850346298880.0, + "grad_norm": 1.8570517134994367, + "language_loss": 0.8869983, + "learning_rate": 3.834779965433917e-06, + "loss": 0.90900552, + "num_input_tokens_seen": 56465900, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2601, + "time_per_iteration": 3.8960022926330566 + }, + { + "auxiliary_loss_clip": 0.01155618, + "auxiliary_loss_mlp": 0.01064496, + "balance_loss_clip": 1.04250216, + "balance_loss_mlp": 1.05294669, + "epoch": 0.1564407034420562, + "flos": 21872220318720.0, + "grad_norm": 1.6572791804428935, + "language_loss": 0.78657669, + "learning_rate": 3.834624928998508e-06, + "loss": 0.80877781, + "num_input_tokens_seen": 56485020, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0234375, + "step": 2602, + "time_per_iteration": 5.330498456954956 + }, + { + "auxiliary_loss_clip": 0.01148351, + "auxiliary_loss_mlp": 0.01041482, + "balance_loss_clip": 1.02178836, + "balance_loss_mlp": 1.04872918, + "epoch": 0.15650082669472418, + "flos": 21834549930240.0, + "grad_norm": 1.9481072701353659, + "language_loss": 0.73668396, + "learning_rate": 3.8344698229939376e-06, + "loss": 0.75858229, + "num_input_tokens_seen": 56505205, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.99609375, + "step": 2603, + "time_per_iteration": 2.4632985591888428 + }, + { + "auxiliary_loss_clip": 0.01152236, + "auxiliary_loss_mlp": 0.01051929, + "balance_loss_clip": 1.03205693, + "balance_loss_mlp": 1.05066442, + "epoch": 0.15656094994739214, + "flos": 13800542653440.0, + "grad_norm": 3.4624008692922583, + "language_loss": 0.87223339, + "learning_rate": 3.8343146474260865e-06, + "loss": 0.89427507, + "num_input_tokens_seen": 56521495, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2604, + "time_per_iteration": 2.449589490890503 + }, + { + "auxiliary_loss_clip": 0.01151636, + "auxiliary_loss_mlp": 0.01043178, + "balance_loss_clip": 1.02404523, + "balance_loss_mlp": 1.04892218, + "epoch": 0.15662107320006013, + "flos": 27308197808640.0, + "grad_norm": 1.883819023069068, + "language_loss": 0.85465723, + "learning_rate": 3.834159402300841e-06, + "loss": 0.87660539, + "num_input_tokens_seen": 56540665, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2605, + "time_per_iteration": 2.4958839416503906 + }, + { + "auxiliary_loss_clip": 0.01153078, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.0274334, + "balance_loss_mlp": 1.04840827, + "epoch": 0.1566811964527281, + "flos": 26685075646080.0, + "grad_norm": 2.4518366617864897, + "language_loss": 0.72954321, + "learning_rate": 3.834004087624087e-06, + "loss": 0.75154853, + "num_input_tokens_seen": 56560805, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2606, + "time_per_iteration": 2.5142898559570312 + }, + { + "auxiliary_loss_clip": 0.01153185, + "auxiliary_loss_mlp": 0.01052184, + "balance_loss_clip": 1.03406429, + "balance_loss_mlp": 1.05257165, + "epoch": 0.15674131970539606, + "flos": 16103422385280.0, + "grad_norm": 1.9820673877795116, + "language_loss": 0.7643044, + "learning_rate": 3.8338487034017145e-06, + "loss": 0.78635812, + "num_input_tokens_seen": 56576335, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2607, + "time_per_iteration": 2.433779239654541 + }, + { + "auxiliary_loss_clip": 0.01150219, + "auxiliary_loss_mlp": 0.01046695, + "balance_loss_clip": 1.0282656, + "balance_loss_mlp": 1.05097091, + "epoch": 0.15680144295806403, + "flos": 19169690889600.0, + "grad_norm": 1.7850270515341367, + "language_loss": 0.8191157, + "learning_rate": 3.833693249639615e-06, + "loss": 0.8410849, + "num_input_tokens_seen": 56595880, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9921875, + "step": 2608, + "time_per_iteration": 2.4599456787109375 + }, + { + "auxiliary_loss_clip": 0.0115477, + "auxiliary_loss_mlp": 0.01051079, + "balance_loss_clip": 1.03001475, + "balance_loss_mlp": 1.05087662, + "epoch": 0.156861566210732, + "flos": 20813430096000.0, + "grad_norm": 1.762197880640894, + "language_loss": 0.72479111, + "learning_rate": 3.833537726343684e-06, + "loss": 0.74684954, + "num_input_tokens_seen": 56615130, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0390625, + "step": 2609, + "time_per_iteration": 2.478262424468994 + }, + { + "auxiliary_loss_clip": 0.0115339, + "auxiliary_loss_mlp": 0.01043612, + "balance_loss_clip": 1.02415729, + "balance_loss_mlp": 1.04881263, + "epoch": 0.15692168946339996, + "flos": 20047922421120.0, + "grad_norm": 1.8833233307981396, + "language_loss": 0.71974212, + "learning_rate": 3.833382133519818e-06, + "loss": 0.74171209, + "num_input_tokens_seen": 56634005, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.046875, + "step": 2610, + "time_per_iteration": 2.468616247177124 + }, + { + "auxiliary_loss_clip": 0.01153055, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03119481, + "balance_loss_mlp": 1.04865789, + "epoch": 0.15698181271606793, + "flos": 21398019943680.0, + "grad_norm": 2.0486839750324117, + "language_loss": 0.72148776, + "learning_rate": 3.833226471173919e-06, + "loss": 0.74354362, + "num_input_tokens_seen": 56653480, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2611, + "time_per_iteration": 2.4812967777252197 + }, + { + "auxiliary_loss_clip": 0.01152967, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02517986, + "balance_loss_mlp": 1.05081797, + "epoch": 0.15704193596873592, + "flos": 20845785271680.0, + "grad_norm": 2.1526303920645153, + "language_loss": 0.70732605, + "learning_rate": 3.833070739311887e-06, + "loss": 0.72930443, + "num_input_tokens_seen": 56672270, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2612, + "time_per_iteration": 2.4659905433654785 + }, + { + "auxiliary_loss_clip": 0.0115345, + "auxiliary_loss_mlp": 0.01053573, + "balance_loss_clip": 1.03448749, + "balance_loss_mlp": 1.05112672, + "epoch": 0.15710205922140388, + "flos": 21762908254080.0, + "grad_norm": 1.98698506128839, + "language_loss": 0.75649011, + "learning_rate": 3.83291493793963e-06, + "loss": 0.77856034, + "num_input_tokens_seen": 56691510, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2613, + "time_per_iteration": 2.5053935050964355 + }, + { + "auxiliary_loss_clip": 0.01150247, + "auxiliary_loss_mlp": 0.01054631, + "balance_loss_clip": 1.03454411, + "balance_loss_mlp": 1.04870725, + "epoch": 0.15716218247407185, + "flos": 25007760201600.0, + "grad_norm": 1.7256548803860323, + "language_loss": 0.6593504, + "learning_rate": 3.832759067063055e-06, + "loss": 0.68139917, + "num_input_tokens_seen": 56712230, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2614, + "time_per_iteration": 2.49568772315979 + }, + { + "auxiliary_loss_clip": 0.01154976, + "auxiliary_loss_mlp": 0.01050381, + "balance_loss_clip": 1.02972233, + "balance_loss_mlp": 1.04979289, + "epoch": 0.1572223057267398, + "flos": 20191780391040.0, + "grad_norm": 2.1509467282749055, + "language_loss": 0.7554003, + "learning_rate": 3.832603126688072e-06, + "loss": 0.7774539, + "num_input_tokens_seen": 56727490, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0546875, + "step": 2615, + "time_per_iteration": 2.529383420944214 + }, + { + "auxiliary_loss_clip": 0.0115204, + "auxiliary_loss_mlp": 0.01052516, + "balance_loss_clip": 1.03374028, + "balance_loss_mlp": 1.05295634, + "epoch": 0.15728242897940778, + "flos": 20959514709120.0, + "grad_norm": 1.616950748432624, + "language_loss": 0.72989607, + "learning_rate": 3.832447116820594e-06, + "loss": 0.75194162, + "num_input_tokens_seen": 56747385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9921875, + "step": 2616, + "time_per_iteration": 2.5096960067749023 + }, + { + "auxiliary_loss_clip": 0.01152584, + "auxiliary_loss_mlp": 0.01055054, + "balance_loss_clip": 1.03453839, + "balance_loss_mlp": 1.04991412, + "epoch": 0.15734255223207574, + "flos": 23038275530880.0, + "grad_norm": 3.5663633553154774, + "language_loss": 0.72316766, + "learning_rate": 3.832291037466539e-06, + "loss": 0.74524403, + "num_input_tokens_seen": 56768055, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0234375, + "step": 2617, + "time_per_iteration": 2.46756911277771 + }, + { + "auxiliary_loss_clip": 0.01151577, + "auxiliary_loss_mlp": 0.01043789, + "balance_loss_clip": 1.02453637, + "balance_loss_mlp": 1.05169988, + "epoch": 0.15740267548474374, + "flos": 20551281661440.0, + "grad_norm": 2.0296559288157563, + "language_loss": 0.74336463, + "learning_rate": 3.8321348886318235e-06, + "loss": 0.76531827, + "num_input_tokens_seen": 56785110, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2618, + "time_per_iteration": 2.4584109783172607 + }, + { + "auxiliary_loss_clip": 0.01156356, + "auxiliary_loss_mlp": 0.01051737, + "balance_loss_clip": 1.02976644, + "balance_loss_mlp": 1.05079079, + "epoch": 0.1574627987374117, + "flos": 22666922772480.0, + "grad_norm": 2.116136233608656, + "language_loss": 0.78624105, + "learning_rate": 3.8319786703223695e-06, + "loss": 0.80832201, + "num_input_tokens_seen": 56804975, + "router_z_loss_clip": 0.21972656, + "router_z_loss_mlp": 1.0546875, + "step": 2619, + "time_per_iteration": 2.481902837753296 + }, + { + "auxiliary_loss_clip": 0.01151953, + "auxiliary_loss_mlp": 0.01052764, + "balance_loss_clip": 1.03373837, + "balance_loss_mlp": 1.05213726, + "epoch": 0.15752292199007967, + "flos": 16800664262400.0, + "grad_norm": 1.705564128099723, + "language_loss": 0.76632881, + "learning_rate": 3.831822382544101e-06, + "loss": 0.78837597, + "num_input_tokens_seen": 56822470, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2620, + "time_per_iteration": 2.432645082473755 + }, + { + "auxiliary_loss_clip": 0.01153614, + "auxiliary_loss_mlp": 0.01050007, + "balance_loss_clip": 1.02901375, + "balance_loss_mlp": 1.05096626, + "epoch": 0.15758304524274763, + "flos": 29826002568960.0, + "grad_norm": 1.7942321132139696, + "language_loss": 0.70836174, + "learning_rate": 3.831666025302944e-06, + "loss": 0.73039794, + "num_input_tokens_seen": 56842100, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2621, + "time_per_iteration": 2.5259244441986084 + }, + { + "auxiliary_loss_clip": 0.01156472, + "auxiliary_loss_mlp": 0.01049198, + "balance_loss_clip": 1.0277524, + "balance_loss_mlp": 1.05222857, + "epoch": 0.1576431684954156, + "flos": 53577426723840.0, + "grad_norm": 2.5825564073202467, + "language_loss": 0.71880406, + "learning_rate": 3.831509598604828e-06, + "loss": 0.74086076, + "num_input_tokens_seen": 56865920, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2622, + "time_per_iteration": 2.738351583480835 + }, + { + "auxiliary_loss_clip": 0.01153726, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02826762, + "balance_loss_mlp": 1.05162704, + "epoch": 0.15770329174808356, + "flos": 20813609664000.0, + "grad_norm": 1.7275011876813262, + "language_loss": 0.87603116, + "learning_rate": 3.831353102455684e-06, + "loss": 0.89804244, + "num_input_tokens_seen": 56885265, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0234375, + "step": 2623, + "time_per_iteration": 2.439276695251465 + }, + { + "auxiliary_loss_clip": 0.01153997, + "auxiliary_loss_mlp": 0.01046147, + "balance_loss_clip": 1.02774143, + "balance_loss_mlp": 1.05301619, + "epoch": 0.15776341500075153, + "flos": 24974004395520.0, + "grad_norm": 1.7488793041913886, + "language_loss": 0.82132548, + "learning_rate": 3.831196536861448e-06, + "loss": 0.84332693, + "num_input_tokens_seen": 56906710, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.0078125, + "step": 2624, + "time_per_iteration": 2.5011823177337646 + }, + { + "auxiliary_loss_clip": 0.01156666, + "auxiliary_loss_mlp": 0.01047764, + "balance_loss_clip": 1.02720022, + "balance_loss_mlp": 1.0518285, + "epoch": 0.15782353825341952, + "flos": 21907915459200.0, + "grad_norm": 2.213311097116894, + "language_loss": 0.79965818, + "learning_rate": 3.831039901828054e-06, + "loss": 0.82170242, + "num_input_tokens_seen": 56924275, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2625, + "time_per_iteration": 2.469705581665039 + }, + { + "auxiliary_loss_clip": 0.01152837, + "auxiliary_loss_mlp": 0.01050956, + "balance_loss_clip": 1.03215635, + "balance_loss_mlp": 1.05189955, + "epoch": 0.15788366150608749, + "flos": 26177191292160.0, + "grad_norm": 2.0497226184185044, + "language_loss": 0.80393386, + "learning_rate": 3.830883197361445e-06, + "loss": 0.82597172, + "num_input_tokens_seen": 56941525, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2626, + "time_per_iteration": 2.4822630882263184 + }, + { + "auxiliary_loss_clip": 0.01157567, + "auxiliary_loss_mlp": 0.01046921, + "balance_loss_clip": 1.02703679, + "balance_loss_mlp": 1.05660009, + "epoch": 0.15794378475875545, + "flos": 27709822753920.0, + "grad_norm": 1.8439314798963051, + "language_loss": 0.73819017, + "learning_rate": 3.830726423467561e-06, + "loss": 0.76023501, + "num_input_tokens_seen": 56962145, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0078125, + "step": 2627, + "time_per_iteration": 2.5146384239196777 + }, + { + "auxiliary_loss_clip": 0.01153645, + "auxiliary_loss_mlp": 0.01055765, + "balance_loss_clip": 1.03515387, + "balance_loss_mlp": 1.05136025, + "epoch": 0.15800390801142342, + "flos": 12130158533760.0, + "grad_norm": 2.581375347872909, + "language_loss": 0.84926289, + "learning_rate": 3.830569580152348e-06, + "loss": 0.87135696, + "num_input_tokens_seen": 56977505, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0234375, + "step": 2628, + "time_per_iteration": 2.476461172103882 + }, + { + "auxiliary_loss_clip": 0.01152526, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.02045107, + "balance_loss_mlp": 1.05181646, + "epoch": 0.15806403126409138, + "flos": 20704728562560.0, + "grad_norm": 1.9330212081502065, + "language_loss": 0.76414472, + "learning_rate": 3.830412667421752e-06, + "loss": 0.78606176, + "num_input_tokens_seen": 56996770, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2629, + "time_per_iteration": 2.4604575634002686 + }, + { + "auxiliary_loss_clip": 0.01157301, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03277516, + "balance_loss_mlp": 1.05376625, + "epoch": 0.15812415451675935, + "flos": 17821712269440.0, + "grad_norm": 2.3335878107949624, + "language_loss": 0.73786485, + "learning_rate": 3.8302556852817245e-06, + "loss": 0.7599746, + "num_input_tokens_seen": 57014970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0390625, + "step": 2630, + "time_per_iteration": 2.4556961059570312 + }, + { + "auxiliary_loss_clip": 0.01159154, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02934527, + "balance_loss_mlp": 1.05278432, + "epoch": 0.15818427776942734, + "flos": 20084048524800.0, + "grad_norm": 3.0799062126580385, + "language_loss": 0.83732498, + "learning_rate": 3.8300986337382184e-06, + "loss": 0.85941184, + "num_input_tokens_seen": 57034045, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0625, + "step": 2631, + "time_per_iteration": 2.46466326713562 + }, + { + "auxiliary_loss_clip": 0.0115417, + "auxiliary_loss_mlp": 0.01047476, + "balance_loss_clip": 1.02800894, + "balance_loss_mlp": 1.05072045, + "epoch": 0.1582444010220953, + "flos": 21214911386880.0, + "grad_norm": 1.8231521117013414, + "language_loss": 0.78509778, + "learning_rate": 3.8299415127971895e-06, + "loss": 0.80711424, + "num_input_tokens_seen": 57053695, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0390625, + "step": 2632, + "time_per_iteration": 2.4678170680999756 + }, + { + "auxiliary_loss_clip": 0.01160199, + "auxiliary_loss_mlp": 0.01058182, + "balance_loss_clip": 1.03766572, + "balance_loss_mlp": 1.05516291, + "epoch": 0.15830452427476327, + "flos": 17858341163520.0, + "grad_norm": 2.1429957658458374, + "language_loss": 0.83250827, + "learning_rate": 3.829784322464594e-06, + "loss": 0.8546921, + "num_input_tokens_seen": 57071290, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.046875, + "step": 2633, + "time_per_iteration": 2.4329495429992676 + }, + { + "auxiliary_loss_clip": 0.01161566, + "auxiliary_loss_mlp": 0.01046458, + "balance_loss_clip": 1.02641928, + "balance_loss_mlp": 1.05591452, + "epoch": 0.15836464752743123, + "flos": 24534960456960.0, + "grad_norm": 1.9651575849984717, + "language_loss": 0.77401066, + "learning_rate": 3.829627062746394e-06, + "loss": 0.79609084, + "num_input_tokens_seen": 57091465, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0625, + "step": 2634, + "time_per_iteration": 2.4989452362060547 + }, + { + "auxiliary_loss_clip": 0.01158347, + "auxiliary_loss_mlp": 0.01049894, + "balance_loss_clip": 1.02961695, + "balance_loss_mlp": 1.05281138, + "epoch": 0.1584247707800992, + "flos": 20120821073280.0, + "grad_norm": 2.178604932363088, + "language_loss": 0.89144027, + "learning_rate": 3.829469733648552e-06, + "loss": 0.91352272, + "num_input_tokens_seen": 57110075, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0546875, + "step": 2635, + "time_per_iteration": 2.45926570892334 + }, + { + "auxiliary_loss_clip": 0.0115666, + "auxiliary_loss_mlp": 0.0105615, + "balance_loss_clip": 1.03518081, + "balance_loss_mlp": 1.05145168, + "epoch": 0.15848489403276717, + "flos": 20375966355840.0, + "grad_norm": 2.07071202721755, + "language_loss": 0.75814605, + "learning_rate": 3.829312335177034e-06, + "loss": 0.78027415, + "num_input_tokens_seen": 57128945, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2636, + "time_per_iteration": 2.4601919651031494 + }, + { + "auxiliary_loss_clip": 0.01159967, + "auxiliary_loss_mlp": 0.01046973, + "balance_loss_clip": 1.0252409, + "balance_loss_mlp": 1.05383635, + "epoch": 0.15854501728543513, + "flos": 39346890359040.0, + "grad_norm": 2.192817266182781, + "language_loss": 0.72065628, + "learning_rate": 3.82915486733781e-06, + "loss": 0.74272561, + "num_input_tokens_seen": 57152385, + "router_z_loss_clip": 0.21679688, + "router_z_loss_mlp": 1.0625, + "step": 2637, + "time_per_iteration": 2.6509416103363037 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02395523, + "balance_loss_mlp": 1.05307317, + "epoch": 0.15860514053810312, + "flos": 24864225454080.0, + "grad_norm": 1.9644709833035638, + "language_loss": 0.77938193, + "learning_rate": 3.82899733013685e-06, + "loss": 0.80135739, + "num_input_tokens_seen": 57172620, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2638, + "time_per_iteration": 2.516597032546997 + }, + { + "auxiliary_loss_clip": 0.01160159, + "auxiliary_loss_mlp": 0.01062214, + "balance_loss_clip": 1.04074407, + "balance_loss_mlp": 1.05348861, + "epoch": 0.1586652637907711, + "flos": 26177694082560.0, + "grad_norm": 1.8473853011869859, + "language_loss": 0.75521988, + "learning_rate": 3.828839723580128e-06, + "loss": 0.77744359, + "num_input_tokens_seen": 57194680, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0625, + "step": 2639, + "time_per_iteration": 2.5517024993896484 + }, + { + "auxiliary_loss_clip": 0.01159513, + "auxiliary_loss_mlp": 0.01061213, + "balance_loss_clip": 1.04115009, + "balance_loss_mlp": 1.0541048, + "epoch": 0.15872538704343905, + "flos": 19792058866560.0, + "grad_norm": 2.7935559917311212, + "language_loss": 0.81487972, + "learning_rate": 3.82868204767362e-06, + "loss": 0.83708692, + "num_input_tokens_seen": 57214675, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0546875, + "step": 2640, + "time_per_iteration": 2.5613112449645996 + }, + { + "auxiliary_loss_clip": 0.01152653, + "auxiliary_loss_mlp": 0.01050922, + "balance_loss_clip": 1.030406, + "balance_loss_mlp": 1.05107331, + "epoch": 0.15878551029610702, + "flos": 28475366342400.0, + "grad_norm": 1.4887809421561018, + "language_loss": 0.67051661, + "learning_rate": 3.828524302423306e-06, + "loss": 0.69255233, + "num_input_tokens_seen": 57235830, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2641, + "time_per_iteration": 2.5603220462799072 + }, + { + "auxiliary_loss_clip": 0.01163302, + "auxiliary_loss_mlp": 0.01057677, + "balance_loss_clip": 1.03670835, + "balance_loss_mlp": 1.05338526, + "epoch": 0.15884563354877498, + "flos": 24206701040640.0, + "grad_norm": 2.894977763056953, + "language_loss": 0.7508198, + "learning_rate": 3.828366487835167e-06, + "loss": 0.77302957, + "num_input_tokens_seen": 57255970, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.09375, + "step": 2642, + "time_per_iteration": 2.4783003330230713 + }, + { + "auxiliary_loss_clip": 0.01154514, + "auxiliary_loss_mlp": 0.01054374, + "balance_loss_clip": 1.0343703, + "balance_loss_mlp": 1.05342579, + "epoch": 0.15890575680144295, + "flos": 23949795991680.0, + "grad_norm": 2.1233146618452046, + "language_loss": 0.70096999, + "learning_rate": 3.828208603915186e-06, + "loss": 0.72305882, + "num_input_tokens_seen": 57274435, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2643, + "time_per_iteration": 3.8417530059814453 + }, + { + "auxiliary_loss_clip": 0.0115474, + "auxiliary_loss_mlp": 0.01046992, + "balance_loss_clip": 1.02801371, + "balance_loss_mlp": 1.05399418, + "epoch": 0.15896588005411091, + "flos": 21215019127680.0, + "grad_norm": 2.266510625665779, + "language_loss": 0.78172421, + "learning_rate": 3.828050650669353e-06, + "loss": 0.80374151, + "num_input_tokens_seen": 57293115, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2644, + "time_per_iteration": 3.918332099914551 + }, + { + "auxiliary_loss_clip": 0.01155626, + "auxiliary_loss_mlp": 0.01054747, + "balance_loss_clip": 1.03432608, + "balance_loss_mlp": 1.05189228, + "epoch": 0.1590260033067789, + "flos": 24352390604160.0, + "grad_norm": 1.8745538844001242, + "language_loss": 0.82203078, + "learning_rate": 3.827892628103657e-06, + "loss": 0.84413457, + "num_input_tokens_seen": 57312565, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2645, + "time_per_iteration": 2.484264373779297 + }, + { + "auxiliary_loss_clip": 0.01156639, + "auxiliary_loss_mlp": 0.01055562, + "balance_loss_clip": 1.0340929, + "balance_loss_mlp": 1.05192447, + "epoch": 0.15908612655944687, + "flos": 32048944583040.0, + "grad_norm": 1.974907168100252, + "language_loss": 0.69778836, + "learning_rate": 3.827734536224087e-06, + "loss": 0.71991032, + "num_input_tokens_seen": 57333360, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.046875, + "step": 2646, + "time_per_iteration": 2.5406665802001953 + }, + { + "auxiliary_loss_clip": 0.01151139, + "auxiliary_loss_mlp": 0.01046289, + "balance_loss_clip": 1.02738249, + "balance_loss_mlp": 1.05206954, + "epoch": 0.15914624981211484, + "flos": 17785370684160.0, + "grad_norm": 2.5066454352116914, + "language_loss": 0.62659109, + "learning_rate": 3.827576375036642e-06, + "loss": 0.64856541, + "num_input_tokens_seen": 57350575, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 2647, + "time_per_iteration": 2.442711353302002 + }, + { + "auxiliary_loss_clip": 0.01155368, + "auxiliary_loss_mlp": 0.01052347, + "balance_loss_clip": 1.03226066, + "balance_loss_mlp": 1.05410099, + "epoch": 0.1592063730647828, + "flos": 17712507945600.0, + "grad_norm": 2.1253745247586204, + "language_loss": 0.8942067, + "learning_rate": 3.827418144547318e-06, + "loss": 0.91628385, + "num_input_tokens_seen": 57367570, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2648, + "time_per_iteration": 2.4649319648742676 + }, + { + "auxiliary_loss_clip": 0.01152722, + "auxiliary_loss_mlp": 0.01049569, + "balance_loss_clip": 1.03141308, + "balance_loss_mlp": 1.05391204, + "epoch": 0.15926649631745077, + "flos": 18803545603200.0, + "grad_norm": 1.8651001097947648, + "language_loss": 0.91716385, + "learning_rate": 3.827259844762114e-06, + "loss": 0.93918669, + "num_input_tokens_seen": 57383980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 2649, + "time_per_iteration": 2.451261520385742 + }, + { + "auxiliary_loss_clip": 0.01163223, + "auxiliary_loss_mlp": 0.01049063, + "balance_loss_clip": 1.02802217, + "balance_loss_mlp": 1.05272281, + "epoch": 0.15932661957011873, + "flos": 17566243764480.0, + "grad_norm": 3.3226984417644028, + "language_loss": 0.71273595, + "learning_rate": 3.827101475687033e-06, + "loss": 0.73485881, + "num_input_tokens_seen": 57400840, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.1015625, + "step": 2650, + "time_per_iteration": 2.43603253364563 + }, + { + "auxiliary_loss_clip": 0.01153823, + "auxiliary_loss_mlp": 0.01044738, + "balance_loss_clip": 1.02695203, + "balance_loss_mlp": 1.05372715, + "epoch": 0.15938674282278673, + "flos": 13334351011200.0, + "grad_norm": 2.4247432930640898, + "language_loss": 0.71116996, + "learning_rate": 3.826943037328082e-06, + "loss": 0.73315561, + "num_input_tokens_seen": 57419230, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 1.0, + "step": 2651, + "time_per_iteration": 2.467451572418213 + }, + { + "auxiliary_loss_clip": 0.01155622, + "auxiliary_loss_mlp": 0.01049144, + "balance_loss_clip": 1.02912855, + "balance_loss_mlp": 1.0513978, + "epoch": 0.1594468660754547, + "flos": 22488842119680.0, + "grad_norm": 1.909821572556346, + "language_loss": 0.7997523, + "learning_rate": 3.8267845296912674e-06, + "loss": 0.82179999, + "num_input_tokens_seen": 57439315, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.046875, + "step": 2652, + "time_per_iteration": 2.519624948501587 + }, + { + "auxiliary_loss_clip": 0.01153837, + "auxiliary_loss_mlp": 0.01045946, + "balance_loss_clip": 1.02665794, + "balance_loss_mlp": 1.05385149, + "epoch": 0.15950698932812266, + "flos": 15007320910080.0, + "grad_norm": 2.695147262103697, + "language_loss": 0.70050812, + "learning_rate": 3.826625952782601e-06, + "loss": 0.72250587, + "num_input_tokens_seen": 57454635, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2653, + "time_per_iteration": 2.439445972442627 + }, + { + "auxiliary_loss_clip": 0.01154814, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02309155, + "balance_loss_mlp": 1.05308652, + "epoch": 0.15956711258079062, + "flos": 30155052084480.0, + "grad_norm": 2.046273350718209, + "language_loss": 0.76509416, + "learning_rate": 3.826467306608095e-06, + "loss": 0.7870729, + "num_input_tokens_seen": 57476805, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2654, + "time_per_iteration": 2.529644012451172 + }, + { + "auxiliary_loss_clip": 0.01154147, + "auxiliary_loss_mlp": 0.01047134, + "balance_loss_clip": 1.02750051, + "balance_loss_mlp": 1.0526185, + "epoch": 0.1596272358334586, + "flos": 21032700670080.0, + "grad_norm": 1.961582700797155, + "language_loss": 0.8208828, + "learning_rate": 3.826308591173765e-06, + "loss": 0.84289569, + "num_input_tokens_seen": 57496400, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2655, + "time_per_iteration": 2.4841158390045166 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01049984, + "balance_loss_clip": 1.03166127, + "balance_loss_mlp": 1.05125904, + "epoch": 0.15968735908612655, + "flos": 15268032800640.0, + "grad_norm": 2.077546195878165, + "language_loss": 0.73565602, + "learning_rate": 3.826149806485631e-06, + "loss": 0.75770259, + "num_input_tokens_seen": 57513700, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.03125, + "step": 2656, + "time_per_iteration": 2.4727072715759277 + }, + { + "auxiliary_loss_clip": 0.01149623, + "auxiliary_loss_mlp": 0.01046235, + "balance_loss_clip": 1.02766216, + "balance_loss_mlp": 1.05170095, + "epoch": 0.15974748233879452, + "flos": 52665726695040.0, + "grad_norm": 1.884771930829773, + "language_loss": 0.77508467, + "learning_rate": 3.825990952549713e-06, + "loss": 0.79704326, + "num_input_tokens_seen": 57536180, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2657, + "time_per_iteration": 2.801560401916504 + }, + { + "auxiliary_loss_clip": 0.01154211, + "auxiliary_loss_mlp": 0.01048143, + "balance_loss_clip": 1.02910495, + "balance_loss_mlp": 1.05459499, + "epoch": 0.1598076055914625, + "flos": 18733232730240.0, + "grad_norm": 1.6493844029380673, + "language_loss": 0.74807733, + "learning_rate": 3.825832029372035e-06, + "loss": 0.77010089, + "num_input_tokens_seen": 57555025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.99609375, + "step": 2658, + "time_per_iteration": 2.4434328079223633 + }, + { + "auxiliary_loss_clip": 0.01155878, + "auxiliary_loss_mlp": 0.01050595, + "balance_loss_clip": 1.02912521, + "balance_loss_mlp": 1.05291355, + "epoch": 0.15986772884413047, + "flos": 34349238535680.0, + "grad_norm": 1.8153435843839463, + "language_loss": 0.75194407, + "learning_rate": 3.825673036958624e-06, + "loss": 0.77400887, + "num_input_tokens_seen": 57577660, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2659, + "time_per_iteration": 2.587700366973877 + }, + { + "auxiliary_loss_clip": 0.01159224, + "auxiliary_loss_mlp": 0.01052946, + "balance_loss_clip": 1.03295422, + "balance_loss_mlp": 1.05531979, + "epoch": 0.15992785209679844, + "flos": 22054969739520.0, + "grad_norm": 2.4521775760186526, + "language_loss": 0.90417045, + "learning_rate": 3.825513975315508e-06, + "loss": 0.92629218, + "num_input_tokens_seen": 57596335, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2660, + "time_per_iteration": 2.45237398147583 + }, + { + "auxiliary_loss_clip": 0.01161295, + "auxiliary_loss_mlp": 0.0105014, + "balance_loss_clip": 1.0300889, + "balance_loss_mlp": 1.05822825, + "epoch": 0.1599879753494664, + "flos": 33066652625280.0, + "grad_norm": 2.0123178843036373, + "language_loss": 0.77552611, + "learning_rate": 3.82535484444872e-06, + "loss": 0.79764044, + "num_input_tokens_seen": 57616830, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2661, + "time_per_iteration": 2.574652910232544 + }, + { + "auxiliary_loss_clip": 0.01158998, + "auxiliary_loss_mlp": 0.01048944, + "balance_loss_clip": 1.02913153, + "balance_loss_mlp": 1.05460262, + "epoch": 0.16004809860213437, + "flos": 28038010343040.0, + "grad_norm": 1.7348749157972516, + "language_loss": 0.74735796, + "learning_rate": 3.825195644364292e-06, + "loss": 0.76943737, + "num_input_tokens_seen": 57635515, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.046875, + "step": 2662, + "time_per_iteration": 2.506974935531616 + }, + { + "auxiliary_loss_clip": 0.01158039, + "auxiliary_loss_mlp": 0.01051532, + "balance_loss_clip": 1.03233898, + "balance_loss_mlp": 1.05416894, + "epoch": 0.16010822185480234, + "flos": 22780113505920.0, + "grad_norm": 2.0770925688556074, + "language_loss": 0.82047677, + "learning_rate": 3.825036375068263e-06, + "loss": 0.84257245, + "num_input_tokens_seen": 57654250, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2663, + "time_per_iteration": 2.459630012512207 + }, + { + "auxiliary_loss_clip": 0.0116012, + "auxiliary_loss_mlp": 0.0104966, + "balance_loss_clip": 1.02978826, + "balance_loss_mlp": 1.05576038, + "epoch": 0.16016834510747033, + "flos": 20084012611200.0, + "grad_norm": 2.5815812177362454, + "language_loss": 0.7910682, + "learning_rate": 3.824877036566672e-06, + "loss": 0.81316602, + "num_input_tokens_seen": 57672645, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2664, + "time_per_iteration": 2.4978790283203125 + }, + { + "auxiliary_loss_clip": 0.01156133, + "auxiliary_loss_mlp": 0.01051164, + "balance_loss_clip": 1.03222167, + "balance_loss_mlp": 1.05318165, + "epoch": 0.1602284683601383, + "flos": 21173829206400.0, + "grad_norm": 1.8148985254226184, + "language_loss": 0.93767202, + "learning_rate": 3.824717628865561e-06, + "loss": 0.95974499, + "num_input_tokens_seen": 57691055, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.03125, + "step": 2665, + "time_per_iteration": 2.467349052429199 + }, + { + "auxiliary_loss_clip": 0.01157965, + "auxiliary_loss_mlp": 0.01047224, + "balance_loss_clip": 1.02750635, + "balance_loss_mlp": 1.05352151, + "epoch": 0.16028859161280626, + "flos": 14647568244480.0, + "grad_norm": 1.9534389472193405, + "language_loss": 0.85255575, + "learning_rate": 3.824558151970974e-06, + "loss": 0.87460762, + "num_input_tokens_seen": 57707235, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.046875, + "step": 2666, + "time_per_iteration": 2.4229867458343506 + }, + { + "auxiliary_loss_clip": 0.01155877, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.02899504, + "balance_loss_mlp": 1.05404496, + "epoch": 0.16034871486547422, + "flos": 20990325600000.0, + "grad_norm": 1.873987360542769, + "language_loss": 0.81461811, + "learning_rate": 3.8243986058889595e-06, + "loss": 0.83665401, + "num_input_tokens_seen": 57724190, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2667, + "time_per_iteration": 2.4989583492279053 + }, + { + "auxiliary_loss_clip": 0.01157612, + "auxiliary_loss_mlp": 0.01050501, + "balance_loss_clip": 1.03104627, + "balance_loss_mlp": 1.05707479, + "epoch": 0.1604088381181422, + "flos": 21397732634880.0, + "grad_norm": 2.676276626789842, + "language_loss": 0.74079859, + "learning_rate": 3.824238990625567e-06, + "loss": 0.76287973, + "num_input_tokens_seen": 57743620, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0078125, + "step": 2668, + "time_per_iteration": 2.463395357131958 + }, + { + "auxiliary_loss_clip": 0.01158531, + "auxiliary_loss_mlp": 0.01051768, + "balance_loss_clip": 1.03175282, + "balance_loss_mlp": 1.05527806, + "epoch": 0.16046896137081015, + "flos": 23877040993920.0, + "grad_norm": 1.6382268793433732, + "language_loss": 0.77214229, + "learning_rate": 3.824079306186848e-06, + "loss": 0.79424524, + "num_input_tokens_seen": 57764810, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.03125, + "step": 2669, + "time_per_iteration": 2.5107781887054443 + }, + { + "auxiliary_loss_clip": 0.01059914, + "auxiliary_loss_mlp": 0.01008943, + "balance_loss_clip": 1.0062964, + "balance_loss_mlp": 1.0249362, + "epoch": 0.16052908462347812, + "flos": 59806709015040.0, + "grad_norm": 0.8072457077707946, + "language_loss": 0.55571371, + "learning_rate": 3.823919552578861e-06, + "loss": 0.57640231, + "num_input_tokens_seen": 57824390, + "router_z_loss_clip": 0.02648926, + "router_z_loss_mlp": 0.34960938, + "step": 2670, + "time_per_iteration": 2.964386463165283 + }, + { + "auxiliary_loss_clip": 0.01157188, + "auxiliary_loss_mlp": 0.01043938, + "balance_loss_clip": 1.02544856, + "balance_loss_mlp": 1.05379438, + "epoch": 0.1605892078761461, + "flos": 18296559089280.0, + "grad_norm": 8.31640977393562, + "language_loss": 0.77088535, + "learning_rate": 3.82375972980766e-06, + "loss": 0.79289663, + "num_input_tokens_seen": 57843665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2671, + "time_per_iteration": 2.4722845554351807 + }, + { + "auxiliary_loss_clip": 0.01159298, + "auxiliary_loss_mlp": 0.01045605, + "balance_loss_clip": 1.02684164, + "balance_loss_mlp": 1.05666459, + "epoch": 0.16064933112881408, + "flos": 32160734686080.0, + "grad_norm": 1.9636142117953166, + "language_loss": 0.64497644, + "learning_rate": 3.8235998378793086e-06, + "loss": 0.66702545, + "num_input_tokens_seen": 57863305, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2672, + "time_per_iteration": 2.5702145099639893 + }, + { + "auxiliary_loss_clip": 0.01157155, + "auxiliary_loss_mlp": 0.01042294, + "balance_loss_clip": 1.02128983, + "balance_loss_mlp": 1.05270457, + "epoch": 0.16070945438148204, + "flos": 19828795501440.0, + "grad_norm": 1.885579538712505, + "language_loss": 0.8533771, + "learning_rate": 3.8234398767998675e-06, + "loss": 0.87537158, + "num_input_tokens_seen": 57883025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2673, + "time_per_iteration": 2.4754209518432617 + }, + { + "auxiliary_loss_clip": 0.01156938, + "auxiliary_loss_mlp": 0.01055602, + "balance_loss_clip": 1.03718424, + "balance_loss_mlp": 1.05537605, + "epoch": 0.16076957763415, + "flos": 18913144976640.0, + "grad_norm": 2.484212796080384, + "language_loss": 0.72797197, + "learning_rate": 3.823279846575403e-06, + "loss": 0.75009739, + "num_input_tokens_seen": 57901430, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2674, + "time_per_iteration": 2.4771230220794678 + }, + { + "auxiliary_loss_clip": 0.01153717, + "auxiliary_loss_mlp": 0.01047616, + "balance_loss_clip": 1.02745771, + "balance_loss_mlp": 1.05242229, + "epoch": 0.16082970088681797, + "flos": 16764358590720.0, + "grad_norm": 2.0917218572710143, + "language_loss": 0.84550452, + "learning_rate": 3.823119747211986e-06, + "loss": 0.86751789, + "num_input_tokens_seen": 57919550, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2675, + "time_per_iteration": 2.4583237171173096 + }, + { + "auxiliary_loss_clip": 0.01158822, + "auxiliary_loss_mlp": 0.0104935, + "balance_loss_clip": 1.02890563, + "balance_loss_mlp": 1.0566349, + "epoch": 0.16088982413948594, + "flos": 35150261783040.0, + "grad_norm": 1.979365293626276, + "language_loss": 0.82605797, + "learning_rate": 3.822959578715685e-06, + "loss": 0.84813964, + "num_input_tokens_seen": 57939890, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0234375, + "step": 2676, + "time_per_iteration": 2.5966403484344482 + }, + { + "auxiliary_loss_clip": 0.01157172, + "auxiliary_loss_mlp": 0.01050632, + "balance_loss_clip": 1.03263116, + "balance_loss_mlp": 1.05701363, + "epoch": 0.1609499473921539, + "flos": 18625105814400.0, + "grad_norm": 1.9372140801278581, + "language_loss": 0.73252106, + "learning_rate": 3.822799341092573e-06, + "loss": 0.75459909, + "num_input_tokens_seen": 57957410, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 2677, + "time_per_iteration": 2.459545135498047 + }, + { + "auxiliary_loss_clip": 0.01153742, + "auxiliary_loss_mlp": 0.01046774, + "balance_loss_clip": 1.02774811, + "balance_loss_mlp": 1.05381799, + "epoch": 0.1610100706448219, + "flos": 33145728416640.0, + "grad_norm": 3.4714871699848, + "language_loss": 0.76175338, + "learning_rate": 3.822639034348728e-06, + "loss": 0.78375852, + "num_input_tokens_seen": 57977900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2678, + "time_per_iteration": 2.6220550537109375 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01048242, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.05157948, + "epoch": 0.16107019389748986, + "flos": 34676707852800.0, + "grad_norm": 1.6939354956764687, + "language_loss": 0.70202518, + "learning_rate": 3.822478658490228e-06, + "loss": 0.72405231, + "num_input_tokens_seen": 57998210, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2679, + "time_per_iteration": 2.580995559692383 + }, + { + "auxiliary_loss_clip": 0.01054747, + "auxiliary_loss_mlp": 0.01023179, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.02026391, + "epoch": 0.16113031715015783, + "flos": 65713403260800.0, + "grad_norm": 0.8161414687228778, + "language_loss": 0.51844025, + "learning_rate": 3.822318213523154e-06, + "loss": 0.5392195, + "num_input_tokens_seen": 58059420, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.34375, + "step": 2680, + "time_per_iteration": 3.105682849884033 + }, + { + "auxiliary_loss_clip": 0.01155604, + "auxiliary_loss_mlp": 0.01047691, + "balance_loss_clip": 1.02750874, + "balance_loss_mlp": 1.05157876, + "epoch": 0.1611904404028258, + "flos": 20810413353600.0, + "grad_norm": 1.8335073832427007, + "language_loss": 0.80319828, + "learning_rate": 3.8221576994535925e-06, + "loss": 0.82523119, + "num_input_tokens_seen": 58078370, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0390625, + "step": 2681, + "time_per_iteration": 2.4695565700531006 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01058971, + "balance_loss_clip": 1.04031444, + "balance_loss_mlp": 1.05258918, + "epoch": 0.16125056365549376, + "flos": 27013335062400.0, + "grad_norm": 1.8021457293712753, + "language_loss": 0.69142133, + "learning_rate": 3.821997116287627e-06, + "loss": 0.71352148, + "num_input_tokens_seen": 58097395, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.984375, + "step": 2682, + "time_per_iteration": 2.5027854442596436 + }, + { + "auxiliary_loss_clip": 0.011576, + "auxiliary_loss_mlp": 0.01048243, + "balance_loss_clip": 1.02800107, + "balance_loss_mlp": 1.0559957, + "epoch": 0.16131068690816172, + "flos": 19276524915840.0, + "grad_norm": 1.8107912193408944, + "language_loss": 0.87568235, + "learning_rate": 3.821836464031348e-06, + "loss": 0.89774084, + "num_input_tokens_seen": 58115630, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2683, + "time_per_iteration": 2.461944341659546 + }, + { + "auxiliary_loss_clip": 0.01156212, + "auxiliary_loss_mlp": 0.0105566, + "balance_loss_clip": 1.03587174, + "balance_loss_mlp": 1.05452991, + "epoch": 0.16137081016082971, + "flos": 35337931367040.0, + "grad_norm": 3.5824209574719035, + "language_loss": 0.74160969, + "learning_rate": 3.821675742690849e-06, + "loss": 0.76372838, + "num_input_tokens_seen": 58138655, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2684, + "time_per_iteration": 4.005981206893921 + }, + { + "auxiliary_loss_clip": 0.01159701, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.02811038, + "balance_loss_mlp": 1.05543995, + "epoch": 0.16143093341349768, + "flos": 34235257703040.0, + "grad_norm": 1.919238603617177, + "language_loss": 0.70244128, + "learning_rate": 3.821514952272223e-06, + "loss": 0.72452366, + "num_input_tokens_seen": 58157440, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.046875, + "step": 2685, + "time_per_iteration": 5.387023448944092 + }, + { + "auxiliary_loss_clip": 0.0115036, + "auxiliary_loss_mlp": 0.01047397, + "balance_loss_clip": 1.0282284, + "balance_loss_mlp": 1.0518229, + "epoch": 0.16149105666616564, + "flos": 27999262546560.0, + "grad_norm": 1.8016019482814314, + "language_loss": 0.71518582, + "learning_rate": 3.821354092781567e-06, + "loss": 0.73716336, + "num_input_tokens_seen": 58176660, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 2686, + "time_per_iteration": 2.5451064109802246 + }, + { + "auxiliary_loss_clip": 0.01157161, + "auxiliary_loss_mlp": 0.01051189, + "balance_loss_clip": 1.03191292, + "balance_loss_mlp": 1.05551481, + "epoch": 0.1615511799188336, + "flos": 19422214479360.0, + "grad_norm": 1.8631629169214377, + "language_loss": 0.81521869, + "learning_rate": 3.821193164224981e-06, + "loss": 0.83730221, + "num_input_tokens_seen": 58195085, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2687, + "time_per_iteration": 2.4542620182037354 + }, + { + "auxiliary_loss_clip": 0.01155843, + "auxiliary_loss_mlp": 0.01044301, + "balance_loss_clip": 1.02327275, + "balance_loss_mlp": 1.04894984, + "epoch": 0.16161130317150157, + "flos": 22854915578880.0, + "grad_norm": 1.8081463969498348, + "language_loss": 0.71823454, + "learning_rate": 3.821032166608568e-06, + "loss": 0.74023592, + "num_input_tokens_seen": 58213540, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.0625, + "step": 2688, + "time_per_iteration": 2.493476152420044 + }, + { + "auxiliary_loss_clip": 0.0115191, + "auxiliary_loss_mlp": 0.01045785, + "balance_loss_clip": 1.02730739, + "balance_loss_mlp": 1.05067098, + "epoch": 0.16167142642416954, + "flos": 26110577520000.0, + "grad_norm": 2.2392978206929555, + "language_loss": 0.76041406, + "learning_rate": 3.8208710999384325e-06, + "loss": 0.78239101, + "num_input_tokens_seen": 58236995, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.015625, + "step": 2689, + "time_per_iteration": 2.5840976238250732 + }, + { + "auxiliary_loss_clip": 0.01155388, + "auxiliary_loss_mlp": 0.01046669, + "balance_loss_clip": 1.02704763, + "balance_loss_mlp": 1.05417943, + "epoch": 0.1617315496768375, + "flos": 22779646629120.0, + "grad_norm": 1.9258973882551216, + "language_loss": 0.87260234, + "learning_rate": 3.820709964220683e-06, + "loss": 0.89462292, + "num_input_tokens_seen": 58257230, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.015625, + "step": 2690, + "time_per_iteration": 2.496943473815918 + }, + { + "auxiliary_loss_clip": 0.01151534, + "auxiliary_loss_mlp": 0.01047712, + "balance_loss_clip": 1.02980638, + "balance_loss_mlp": 1.05211663, + "epoch": 0.1617916729295055, + "flos": 22017299351040.0, + "grad_norm": 1.562024048541713, + "language_loss": 0.87728393, + "learning_rate": 3.8205487594614284e-06, + "loss": 0.89927632, + "num_input_tokens_seen": 58277080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 2691, + "time_per_iteration": 2.510960817337036 + }, + { + "auxiliary_loss_clip": 0.01157097, + "auxiliary_loss_mlp": 0.01049167, + "balance_loss_clip": 1.02764988, + "balance_loss_mlp": 1.05021381, + "epoch": 0.16185179618217346, + "flos": 23438248450560.0, + "grad_norm": 2.082856606872889, + "language_loss": 0.82327259, + "learning_rate": 3.820387485666784e-06, + "loss": 0.84533525, + "num_input_tokens_seen": 58294815, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2692, + "time_per_iteration": 2.481032371520996 + }, + { + "auxiliary_loss_clip": 0.0115716, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02835155, + "balance_loss_mlp": 1.05069244, + "epoch": 0.16191191943484143, + "flos": 25666110627840.0, + "grad_norm": 3.0763505181853454, + "language_loss": 0.80942917, + "learning_rate": 3.820226142842862e-06, + "loss": 0.83149081, + "num_input_tokens_seen": 58313215, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0625, + "step": 2693, + "time_per_iteration": 2.493278980255127 + }, + { + "auxiliary_loss_clip": 0.01150662, + "auxiliary_loss_mlp": 0.01054953, + "balance_loss_clip": 1.03670192, + "balance_loss_mlp": 1.05223358, + "epoch": 0.1619720426875094, + "flos": 23477355383040.0, + "grad_norm": 1.7139740211881158, + "language_loss": 0.83639967, + "learning_rate": 3.820064730995783e-06, + "loss": 0.85845578, + "num_input_tokens_seen": 58333215, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2694, + "time_per_iteration": 2.5051510334014893 + }, + { + "auxiliary_loss_clip": 0.01156309, + "auxiliary_loss_mlp": 0.01048183, + "balance_loss_clip": 1.02764273, + "balance_loss_mlp": 1.0509156, + "epoch": 0.16203216594017736, + "flos": 24133658734080.0, + "grad_norm": 1.9608549080280004, + "language_loss": 0.69125426, + "learning_rate": 3.819903250131667e-06, + "loss": 0.71329916, + "num_input_tokens_seen": 58351160, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0546875, + "step": 2695, + "time_per_iteration": 2.495098352432251 + }, + { + "auxiliary_loss_clip": 0.01159947, + "auxiliary_loss_mlp": 0.01054922, + "balance_loss_clip": 1.03391731, + "balance_loss_mlp": 1.05520689, + "epoch": 0.16209228919284532, + "flos": 22340889999360.0, + "grad_norm": 2.466913217352614, + "language_loss": 0.82403111, + "learning_rate": 3.819741700256637e-06, + "loss": 0.84617984, + "num_input_tokens_seen": 58368505, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.046875, + "step": 2696, + "time_per_iteration": 2.484523296356201 + }, + { + "auxiliary_loss_clip": 0.01161904, + "auxiliary_loss_mlp": 0.01056335, + "balance_loss_clip": 1.03529406, + "balance_loss_mlp": 1.05316591, + "epoch": 0.1621524124455133, + "flos": 15815131827840.0, + "grad_norm": 1.9982919021229957, + "language_loss": 0.8852337, + "learning_rate": 3.8195800813768194e-06, + "loss": 0.90741605, + "num_input_tokens_seen": 58385085, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.09375, + "step": 2697, + "time_per_iteration": 2.4806151390075684 + }, + { + "auxiliary_loss_clip": 0.01147135, + "auxiliary_loss_mlp": 0.01046149, + "balance_loss_clip": 1.02756453, + "balance_loss_mlp": 1.04989469, + "epoch": 0.16221253569818128, + "flos": 30186688988160.0, + "grad_norm": 1.4702975792509376, + "language_loss": 0.80172735, + "learning_rate": 3.819418393498343e-06, + "loss": 0.82366014, + "num_input_tokens_seen": 58406985, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 2698, + "time_per_iteration": 2.532137393951416 + }, + { + "auxiliary_loss_clip": 0.01149805, + "auxiliary_loss_mlp": 0.01049018, + "balance_loss_clip": 1.02957439, + "balance_loss_mlp": 1.05167758, + "epoch": 0.16227265895084925, + "flos": 24605991601920.0, + "grad_norm": 1.5576448961090323, + "language_loss": 0.77258182, + "learning_rate": 3.819256636627339e-06, + "loss": 0.79456997, + "num_input_tokens_seen": 58426205, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 2699, + "time_per_iteration": 2.514084577560425 + }, + { + "auxiliary_loss_clip": 0.01150261, + "auxiliary_loss_mlp": 0.0104371, + "balance_loss_clip": 1.0251497, + "balance_loss_mlp": 1.04891944, + "epoch": 0.1623327822035172, + "flos": 19573326996480.0, + "grad_norm": 2.038036982956784, + "language_loss": 0.85697722, + "learning_rate": 3.81909481076994e-06, + "loss": 0.87891692, + "num_input_tokens_seen": 58443830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2700, + "time_per_iteration": 2.4434289932250977 + }, + { + "auxiliary_loss_clip": 0.01147712, + "auxiliary_loss_mlp": 0.0104585, + "balance_loss_clip": 1.0247376, + "balance_loss_mlp": 1.04878318, + "epoch": 0.16239290545618518, + "flos": 26468462678400.0, + "grad_norm": 1.6982179557795123, + "language_loss": 0.80378878, + "learning_rate": 3.818932915932284e-06, + "loss": 0.82572436, + "num_input_tokens_seen": 58464405, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.98828125, + "step": 2701, + "time_per_iteration": 2.5267322063446045 + }, + { + "auxiliary_loss_clip": 0.01156296, + "auxiliary_loss_mlp": 0.01048895, + "balance_loss_clip": 1.02945244, + "balance_loss_mlp": 1.05514598, + "epoch": 0.16245302870885314, + "flos": 15851940289920.0, + "grad_norm": 1.5999982166608073, + "language_loss": 0.73006868, + "learning_rate": 3.818770952120511e-06, + "loss": 0.75212055, + "num_input_tokens_seen": 58483295, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2702, + "time_per_iteration": 2.44750714302063 + }, + { + "auxiliary_loss_clip": 0.01153204, + "auxiliary_loss_mlp": 0.01050741, + "balance_loss_clip": 1.02986753, + "balance_loss_mlp": 1.05053687, + "epoch": 0.1625131519615211, + "flos": 14756521173120.0, + "grad_norm": 2.5386207662450464, + "language_loss": 0.73164749, + "learning_rate": 3.81860891934076e-06, + "loss": 0.7536869, + "num_input_tokens_seen": 58501205, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.0234375, + "step": 2703, + "time_per_iteration": 2.469242811203003 + }, + { + "auxiliary_loss_clip": 0.01150736, + "auxiliary_loss_mlp": 0.01046914, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.04765964, + "epoch": 0.1625732752141891, + "flos": 28220508368640.0, + "grad_norm": 1.9216464968932823, + "language_loss": 0.70681584, + "learning_rate": 3.818446817599176e-06, + "loss": 0.72879231, + "num_input_tokens_seen": 58522315, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.03125, + "step": 2704, + "time_per_iteration": 2.5236263275146484 + }, + { + "auxiliary_loss_clip": 0.0105028, + "auxiliary_loss_mlp": 0.01003507, + "balance_loss_clip": 1.00091982, + "balance_loss_mlp": 1.01563144, + "epoch": 0.16263339846685707, + "flos": 67327947688320.0, + "grad_norm": 0.7797469934396678, + "language_loss": 0.53369009, + "learning_rate": 3.818284646901907e-06, + "loss": 0.55422795, + "num_input_tokens_seen": 58586695, + "router_z_loss_clip": 0.02587891, + "router_z_loss_mlp": 0.34765625, + "step": 2705, + "time_per_iteration": 3.0887868404388428 + }, + { + "auxiliary_loss_clip": 0.0115608, + "auxiliary_loss_mlp": 0.01048272, + "balance_loss_clip": 1.02873373, + "balance_loss_mlp": 1.05151534, + "epoch": 0.16269352171952503, + "flos": 14319165173760.0, + "grad_norm": 2.4525976943058896, + "language_loss": 0.75060308, + "learning_rate": 3.818122407255102e-06, + "loss": 0.77264655, + "num_input_tokens_seen": 58602435, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.046875, + "step": 2706, + "time_per_iteration": 2.439283847808838 + }, + { + "auxiliary_loss_clip": 0.01154579, + "auxiliary_loss_mlp": 0.01051686, + "balance_loss_clip": 1.03248119, + "balance_loss_mlp": 1.05240536, + "epoch": 0.162753644972193, + "flos": 28361205941760.0, + "grad_norm": 1.9153778871117788, + "language_loss": 0.7234174, + "learning_rate": 3.817960098664914e-06, + "loss": 0.74547994, + "num_input_tokens_seen": 58621275, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2707, + "time_per_iteration": 2.51819109916687 + }, + { + "auxiliary_loss_clip": 0.01155215, + "auxiliary_loss_mlp": 0.01050366, + "balance_loss_clip": 1.03154302, + "balance_loss_mlp": 1.05275822, + "epoch": 0.16281376822486096, + "flos": 19937856170880.0, + "grad_norm": 3.869992791268662, + "language_loss": 0.83790398, + "learning_rate": 3.817797721137495e-06, + "loss": 0.85995972, + "num_input_tokens_seen": 58637550, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.03125, + "step": 2708, + "time_per_iteration": 2.4592010974884033 + }, + { + "auxiliary_loss_clip": 0.0115992, + "auxiliary_loss_mlp": 0.01049095, + "balance_loss_clip": 1.02768469, + "balance_loss_mlp": 1.05268705, + "epoch": 0.16287389147752893, + "flos": 21251719848960.0, + "grad_norm": 2.162290718142945, + "language_loss": 0.86529553, + "learning_rate": 3.817635274679006e-06, + "loss": 0.88738573, + "num_input_tokens_seen": 58654135, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0703125, + "step": 2709, + "time_per_iteration": 2.4745054244995117 + }, + { + "auxiliary_loss_clip": 0.01154974, + "auxiliary_loss_mlp": 0.01054439, + "balance_loss_clip": 1.0353297, + "balance_loss_mlp": 1.05096519, + "epoch": 0.1629340147301969, + "flos": 19244672530560.0, + "grad_norm": 1.6782807127870958, + "language_loss": 0.91449893, + "learning_rate": 3.817472759295605e-06, + "loss": 0.93659306, + "num_input_tokens_seen": 58674320, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2710, + "time_per_iteration": 2.4846651554107666 + }, + { + "auxiliary_loss_clip": 0.0115562, + "auxiliary_loss_mlp": 0.01054818, + "balance_loss_clip": 1.03549433, + "balance_loss_mlp": 1.05447197, + "epoch": 0.16299413798286488, + "flos": 21249816428160.0, + "grad_norm": 1.99410407833921, + "language_loss": 0.8129673, + "learning_rate": 3.817310174993453e-06, + "loss": 0.83507168, + "num_input_tokens_seen": 58691000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2711, + "time_per_iteration": 2.4878618717193604 + }, + { + "auxiliary_loss_clip": 0.01153665, + "auxiliary_loss_mlp": 0.01046499, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04737568, + "epoch": 0.16305426123553285, + "flos": 18770579896320.0, + "grad_norm": 2.7794575527068077, + "language_loss": 0.81605875, + "learning_rate": 3.817147521778719e-06, + "loss": 0.83806038, + "num_input_tokens_seen": 58710230, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0625, + "step": 2712, + "time_per_iteration": 2.4479072093963623 + }, + { + "auxiliary_loss_clip": 0.01158025, + "auxiliary_loss_mlp": 0.01058532, + "balance_loss_clip": 1.03858864, + "balance_loss_mlp": 1.05211174, + "epoch": 0.16311438448820081, + "flos": 22087648137600.0, + "grad_norm": 2.1959953506899774, + "language_loss": 0.76885653, + "learning_rate": 3.816984799657568e-06, + "loss": 0.79102206, + "num_input_tokens_seen": 58728610, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0625, + "step": 2713, + "time_per_iteration": 2.493394374847412 + }, + { + "auxiliary_loss_clip": 0.01155185, + "auxiliary_loss_mlp": 0.01062558, + "balance_loss_clip": 1.04290032, + "balance_loss_mlp": 1.05623782, + "epoch": 0.16317450774086878, + "flos": 16467700164480.0, + "grad_norm": 2.081844956712308, + "language_loss": 0.78926778, + "learning_rate": 3.8168220086361715e-06, + "loss": 0.8114453, + "num_input_tokens_seen": 58744385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 2714, + "time_per_iteration": 2.442214012145996 + }, + { + "auxiliary_loss_clip": 0.01155305, + "auxiliary_loss_mlp": 0.01059199, + "balance_loss_clip": 1.04011369, + "balance_loss_mlp": 1.05286288, + "epoch": 0.16323463099353674, + "flos": 24352929308160.0, + "grad_norm": 2.259619309439112, + "language_loss": 0.78143466, + "learning_rate": 3.816659148720702e-06, + "loss": 0.80357969, + "num_input_tokens_seen": 58763905, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2715, + "time_per_iteration": 2.499178409576416 + }, + { + "auxiliary_loss_clip": 0.01150615, + "auxiliary_loss_mlp": 0.01047807, + "balance_loss_clip": 1.02973497, + "balance_loss_mlp": 1.04868412, + "epoch": 0.1632947542462047, + "flos": 24900782520960.0, + "grad_norm": 2.0916631483814783, + "language_loss": 0.81397748, + "learning_rate": 3.816496219917336e-06, + "loss": 0.8359617, + "num_input_tokens_seen": 58785580, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 2716, + "time_per_iteration": 2.5004689693450928 + }, + { + "auxiliary_loss_clip": 0.01158421, + "auxiliary_loss_mlp": 0.01057354, + "balance_loss_clip": 1.03853106, + "balance_loss_mlp": 1.05482328, + "epoch": 0.1633548774988727, + "flos": 24900279730560.0, + "grad_norm": 1.8793848003912939, + "language_loss": 0.86203027, + "learning_rate": 3.816333222232251e-06, + "loss": 0.88418794, + "num_input_tokens_seen": 58806075, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2717, + "time_per_iteration": 2.5112617015838623 + }, + { + "auxiliary_loss_clip": 0.0115161, + "auxiliary_loss_mlp": 0.01046152, + "balance_loss_clip": 1.02725708, + "balance_loss_mlp": 1.05153894, + "epoch": 0.16341500075154067, + "flos": 30441798357120.0, + "grad_norm": 1.652261986612604, + "language_loss": 0.76514149, + "learning_rate": 3.816170155671629e-06, + "loss": 0.78711915, + "num_input_tokens_seen": 58827405, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0, + "step": 2718, + "time_per_iteration": 2.549654245376587 + }, + { + "auxiliary_loss_clip": 0.01156654, + "auxiliary_loss_mlp": 0.01045361, + "balance_loss_clip": 1.02696729, + "balance_loss_mlp": 1.05180717, + "epoch": 0.16347512400420863, + "flos": 22784530878720.0, + "grad_norm": 2.080955072975882, + "language_loss": 0.73027492, + "learning_rate": 3.816007020241652e-06, + "loss": 0.75229508, + "num_input_tokens_seen": 58847205, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.046875, + "step": 2719, + "time_per_iteration": 2.4911599159240723 + }, + { + "auxiliary_loss_clip": 0.01151759, + "auxiliary_loss_mlp": 0.01049636, + "balance_loss_clip": 1.03084862, + "balance_loss_mlp": 1.0492239, + "epoch": 0.1635352472568766, + "flos": 22633274707200.0, + "grad_norm": 1.6610037254914274, + "language_loss": 0.72384167, + "learning_rate": 3.815843815948507e-06, + "loss": 0.74585563, + "num_input_tokens_seen": 58866865, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0234375, + "step": 2720, + "time_per_iteration": 2.4733760356903076 + }, + { + "auxiliary_loss_clip": 0.01150132, + "auxiliary_loss_mlp": 0.01048266, + "balance_loss_clip": 1.02789283, + "balance_loss_mlp": 1.05076206, + "epoch": 0.16359537050954456, + "flos": 15522998515200.0, + "grad_norm": 2.2797021453727893, + "language_loss": 0.75100243, + "learning_rate": 3.8156805427983824e-06, + "loss": 0.77298641, + "num_input_tokens_seen": 58885200, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9921875, + "step": 2721, + "time_per_iteration": 2.44942569732666 + }, + { + "auxiliary_loss_clip": 0.01155245, + "auxiliary_loss_mlp": 0.01049168, + "balance_loss_clip": 1.02893853, + "balance_loss_mlp": 1.0502317, + "epoch": 0.16365549376221253, + "flos": 22090162089600.0, + "grad_norm": 1.74959220753002, + "language_loss": 0.79254043, + "learning_rate": 3.8155172007974695e-06, + "loss": 0.81458461, + "num_input_tokens_seen": 58906385, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.046875, + "step": 2722, + "time_per_iteration": 2.4775915145874023 + }, + { + "auxiliary_loss_clip": 0.01158964, + "auxiliary_loss_mlp": 0.01049216, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.05248678, + "epoch": 0.1637156170148805, + "flos": 24060400945920.0, + "grad_norm": 2.0539311275727634, + "language_loss": 0.8477816, + "learning_rate": 3.8153537899519624e-06, + "loss": 0.86986339, + "num_input_tokens_seen": 58925040, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0625, + "step": 2723, + "time_per_iteration": 2.5084922313690186 + }, + { + "auxiliary_loss_clip": 0.01146914, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.0177772, + "balance_loss_mlp": 1.04940808, + "epoch": 0.1637757402675485, + "flos": 26685362954880.0, + "grad_norm": 2.0049787201865503, + "language_loss": 0.70883536, + "learning_rate": 3.815190310268058e-06, + "loss": 0.73067659, + "num_input_tokens_seen": 58944790, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 2724, + "time_per_iteration": 2.5094263553619385 + }, + { + "auxiliary_loss_clip": 0.01150034, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.02583206, + "balance_loss_mlp": 1.05113125, + "epoch": 0.16383586352021645, + "flos": 16106941918080.0, + "grad_norm": 2.04326868324577, + "language_loss": 0.70914948, + "learning_rate": 3.815026761751955e-06, + "loss": 0.73109186, + "num_input_tokens_seen": 58962500, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 2725, + "time_per_iteration": 2.495342254638672 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02437937, + "balance_loss_mlp": 1.05219352, + "epoch": 0.16389598677288442, + "flos": 19165991788800.0, + "grad_norm": 1.9381311422505, + "language_loss": 0.8873682, + "learning_rate": 3.814863144409855e-06, + "loss": 0.90929163, + "num_input_tokens_seen": 58980355, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2726, + "time_per_iteration": 3.983738660812378 + }, + { + "auxiliary_loss_clip": 0.01156798, + "auxiliary_loss_mlp": 0.01049309, + "balance_loss_clip": 1.02965117, + "balance_loss_mlp": 1.05406547, + "epoch": 0.16395611002555238, + "flos": 21507008785920.0, + "grad_norm": 1.8502717081228044, + "language_loss": 0.7439661, + "learning_rate": 3.814699458247963e-06, + "loss": 0.76602715, + "num_input_tokens_seen": 58999505, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2727, + "time_per_iteration": 5.52494215965271 + }, + { + "auxiliary_loss_clip": 0.01150784, + "auxiliary_loss_mlp": 0.0105177, + "balance_loss_clip": 1.03429413, + "balance_loss_mlp": 1.05145037, + "epoch": 0.16401623327822035, + "flos": 21470918595840.0, + "grad_norm": 1.6814144838265654, + "language_loss": 0.82321334, + "learning_rate": 3.8145357032724855e-06, + "loss": 0.84523886, + "num_input_tokens_seen": 59017930, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9921875, + "step": 2728, + "time_per_iteration": 2.4621498584747314 + }, + { + "auxiliary_loss_clip": 0.01156146, + "auxiliary_loss_mlp": 0.01050932, + "balance_loss_clip": 1.03131044, + "balance_loss_mlp": 1.05167341, + "epoch": 0.1640763565308883, + "flos": 13626232928640.0, + "grad_norm": 2.4458707176630425, + "language_loss": 0.84766865, + "learning_rate": 3.814371879489633e-06, + "loss": 0.86973941, + "num_input_tokens_seen": 59035130, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0390625, + "step": 2729, + "time_per_iteration": 2.459495782852173 + }, + { + "auxiliary_loss_clip": 0.01151277, + "auxiliary_loss_mlp": 0.01044659, + "balance_loss_clip": 1.02661061, + "balance_loss_mlp": 1.04923487, + "epoch": 0.16413647978355628, + "flos": 15451464579840.0, + "grad_norm": 1.9327126112676087, + "language_loss": 0.72569054, + "learning_rate": 3.814207986905616e-06, + "loss": 0.74764991, + "num_input_tokens_seen": 59053080, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0234375, + "step": 2730, + "time_per_iteration": 2.451016902923584 + }, + { + "auxiliary_loss_clip": 0.01153124, + "auxiliary_loss_mlp": 0.01053311, + "balance_loss_clip": 1.03243709, + "balance_loss_mlp": 1.04862678, + "epoch": 0.16419660303622427, + "flos": 45878682015360.0, + "grad_norm": 2.2141787283307854, + "language_loss": 0.74431163, + "learning_rate": 3.814044025526651e-06, + "loss": 0.76637596, + "num_input_tokens_seen": 59075610, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.046875, + "step": 2731, + "time_per_iteration": 2.6857874393463135 + }, + { + "auxiliary_loss_clip": 0.0115844, + "auxiliary_loss_mlp": 0.01048812, + "balance_loss_clip": 1.02818894, + "balance_loss_mlp": 1.05408466, + "epoch": 0.16425672628889224, + "flos": 18952826526720.0, + "grad_norm": 2.15833206643789, + "language_loss": 0.78783584, + "learning_rate": 3.8138799953589548e-06, + "loss": 0.80990839, + "num_input_tokens_seen": 59094555, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.046875, + "step": 2732, + "time_per_iteration": 2.44146728515625 + }, + { + "auxiliary_loss_clip": 0.01155842, + "auxiliary_loss_mlp": 0.01051717, + "balance_loss_clip": 1.03166568, + "balance_loss_mlp": 1.05211556, + "epoch": 0.1643168495415602, + "flos": 24312996362880.0, + "grad_norm": 1.9937390498547816, + "language_loss": 0.68943298, + "learning_rate": 3.8137158964087473e-06, + "loss": 0.71150857, + "num_input_tokens_seen": 59113515, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0390625, + "step": 2733, + "time_per_iteration": 2.4981601238250732 + }, + { + "auxiliary_loss_clip": 0.01151384, + "auxiliary_loss_mlp": 0.01048132, + "balance_loss_clip": 1.02792621, + "balance_loss_mlp": 1.05054927, + "epoch": 0.16437697279422817, + "flos": 26428421992320.0, + "grad_norm": 2.20018793155086, + "language_loss": 0.80626202, + "learning_rate": 3.8135517286822508e-06, + "loss": 0.8282572, + "num_input_tokens_seen": 59133275, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0078125, + "step": 2734, + "time_per_iteration": 2.495030641555786 + }, + { + "auxiliary_loss_clip": 0.01152713, + "auxiliary_loss_mlp": 0.0105599, + "balance_loss_clip": 1.03638041, + "balance_loss_mlp": 1.05143905, + "epoch": 0.16443709604689613, + "flos": 34532239351680.0, + "grad_norm": 4.0691467716051175, + "language_loss": 0.82265377, + "learning_rate": 3.8133874921856914e-06, + "loss": 0.84474081, + "num_input_tokens_seen": 59154095, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 2735, + "time_per_iteration": 2.5911896228790283 + }, + { + "auxiliary_loss_clip": 0.01150004, + "auxiliary_loss_mlp": 0.01044021, + "balance_loss_clip": 1.02556753, + "balance_loss_mlp": 1.05158913, + "epoch": 0.1644972192995641, + "flos": 23258048895360.0, + "grad_norm": 2.5735103485950077, + "language_loss": 0.78697491, + "learning_rate": 3.813223186925296e-06, + "loss": 0.80891526, + "num_input_tokens_seen": 59173795, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.984375, + "step": 2736, + "time_per_iteration": 2.4699559211730957 + }, + { + "auxiliary_loss_clip": 0.01155005, + "auxiliary_loss_mlp": 0.01052588, + "balance_loss_clip": 1.03438449, + "balance_loss_mlp": 1.05231023, + "epoch": 0.1645573425522321, + "flos": 26979543342720.0, + "grad_norm": 1.680513335410081, + "language_loss": 0.81409019, + "learning_rate": 3.8130588129072964e-06, + "loss": 0.83616614, + "num_input_tokens_seen": 59191610, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2737, + "time_per_iteration": 2.4892401695251465 + }, + { + "auxiliary_loss_clip": 0.0115392, + "auxiliary_loss_mlp": 0.01046744, + "balance_loss_clip": 1.02819467, + "balance_loss_mlp": 1.05107307, + "epoch": 0.16461746580490005, + "flos": 28731768600960.0, + "grad_norm": 1.8393773079816103, + "language_loss": 0.87291563, + "learning_rate": 3.8128943701379246e-06, + "loss": 0.89492232, + "num_input_tokens_seen": 59213000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.03125, + "step": 2738, + "time_per_iteration": 2.54569935798645 + }, + { + "auxiliary_loss_clip": 0.01154293, + "auxiliary_loss_mlp": 0.01055893, + "balance_loss_clip": 1.03653371, + "balance_loss_mlp": 1.05139303, + "epoch": 0.16467758905756802, + "flos": 24930156867840.0, + "grad_norm": 2.0122721864238438, + "language_loss": 0.72351867, + "learning_rate": 3.8127298586234167e-06, + "loss": 0.74562055, + "num_input_tokens_seen": 59232340, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2739, + "time_per_iteration": 2.5309460163116455 + }, + { + "auxiliary_loss_clip": 0.01148442, + "auxiliary_loss_mlp": 0.0104888, + "balance_loss_clip": 1.02991343, + "balance_loss_mlp": 1.04766631, + "epoch": 0.16473771231023598, + "flos": 24826519152000.0, + "grad_norm": 1.690107638621115, + "language_loss": 0.81735384, + "learning_rate": 3.8125652783700104e-06, + "loss": 0.8393271, + "num_input_tokens_seen": 59253950, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2740, + "time_per_iteration": 2.5005404949188232 + }, + { + "auxiliary_loss_clip": 0.01157284, + "auxiliary_loss_mlp": 0.01053239, + "balance_loss_clip": 1.03176928, + "balance_loss_mlp": 1.05347896, + "epoch": 0.16479783556290395, + "flos": 39896072375040.0, + "grad_norm": 2.8033984026588756, + "language_loss": 0.69098473, + "learning_rate": 3.8124006293839475e-06, + "loss": 0.71308994, + "num_input_tokens_seen": 59275545, + "router_z_loss_clip": 0.21484375, + "router_z_loss_mlp": 1.0390625, + "step": 2741, + "time_per_iteration": 2.6353659629821777 + }, + { + "auxiliary_loss_clip": 0.01151645, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.02588463, + "balance_loss_mlp": 1.04987025, + "epoch": 0.16485795881557191, + "flos": 19897061299200.0, + "grad_norm": 2.1078448839323167, + "language_loss": 0.79967189, + "learning_rate": 3.812235911671472e-06, + "loss": 0.82163477, + "num_input_tokens_seen": 59293480, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2742, + "time_per_iteration": 2.4471442699432373 + }, + { + "auxiliary_loss_clip": 0.01150824, + "auxiliary_loss_mlp": 0.01053847, + "balance_loss_clip": 1.03373659, + "balance_loss_mlp": 1.05117011, + "epoch": 0.16491808206823988, + "flos": 20556129997440.0, + "grad_norm": 2.1468697804747823, + "language_loss": 0.84769481, + "learning_rate": 3.8120711252388274e-06, + "loss": 0.86974156, + "num_input_tokens_seen": 59313435, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0, + "step": 2743, + "time_per_iteration": 2.459146022796631 + }, + { + "auxiliary_loss_clip": 0.01149398, + "auxiliary_loss_mlp": 0.01052609, + "balance_loss_clip": 1.03359556, + "balance_loss_mlp": 1.05074859, + "epoch": 0.16497820532090787, + "flos": 23800802376960.0, + "grad_norm": 1.5853616537097488, + "language_loss": 0.85723281, + "learning_rate": 3.811906270092265e-06, + "loss": 0.87925285, + "num_input_tokens_seen": 59331535, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 2744, + "time_per_iteration": 2.4920642375946045 + }, + { + "auxiliary_loss_clip": 0.01147114, + "auxiliary_loss_mlp": 0.01046312, + "balance_loss_clip": 1.0283947, + "balance_loss_mlp": 1.05124998, + "epoch": 0.16503832857357584, + "flos": 25482642935040.0, + "grad_norm": 1.7300129139105382, + "language_loss": 0.82973897, + "learning_rate": 3.811741346238036e-06, + "loss": 0.85167319, + "num_input_tokens_seen": 59350680, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 2745, + "time_per_iteration": 2.490399122238159 + }, + { + "auxiliary_loss_clip": 0.0115758, + "auxiliary_loss_mlp": 0.01054165, + "balance_loss_clip": 1.03548467, + "balance_loss_mlp": 1.05477679, + "epoch": 0.1650984518262438, + "flos": 17676058619520.0, + "grad_norm": 2.19754759855213, + "language_loss": 0.76411253, + "learning_rate": 3.8115763536823923e-06, + "loss": 0.78622997, + "num_input_tokens_seen": 59367020, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.03125, + "step": 2746, + "time_per_iteration": 2.46258282661438 + }, + { + "auxiliary_loss_clip": 0.01152266, + "auxiliary_loss_mlp": 0.01052583, + "balance_loss_clip": 1.03387904, + "balance_loss_mlp": 1.05164099, + "epoch": 0.16515857507891177, + "flos": 18698327688960.0, + "grad_norm": 1.5978428663850568, + "language_loss": 0.80686736, + "learning_rate": 3.811411292431592e-06, + "loss": 0.82891583, + "num_input_tokens_seen": 59386075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2747, + "time_per_iteration": 2.4612972736358643 + }, + { + "auxiliary_loss_clip": 0.01158238, + "auxiliary_loss_mlp": 0.01048108, + "balance_loss_clip": 1.02848577, + "balance_loss_mlp": 1.05559731, + "epoch": 0.16521869833157973, + "flos": 15010481306880.0, + "grad_norm": 1.853069559467639, + "language_loss": 0.69463658, + "learning_rate": 3.8112461624918945e-06, + "loss": 0.71670008, + "num_input_tokens_seen": 59402690, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0234375, + "step": 2748, + "time_per_iteration": 2.4235999584198 + }, + { + "auxiliary_loss_clip": 0.01155731, + "auxiliary_loss_mlp": 0.0105142, + "balance_loss_clip": 1.03314471, + "balance_loss_mlp": 1.05482006, + "epoch": 0.1652788215842477, + "flos": 22121152548480.0, + "grad_norm": 2.265414403061137, + "language_loss": 0.87653661, + "learning_rate": 3.811080963869561e-06, + "loss": 0.89860809, + "num_input_tokens_seen": 59421130, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 1.0078125, + "step": 2749, + "time_per_iteration": 2.4706709384918213 + }, + { + "auxiliary_loss_clip": 0.01153325, + "auxiliary_loss_mlp": 0.01048781, + "balance_loss_clip": 1.02905142, + "balance_loss_mlp": 1.0509429, + "epoch": 0.16533894483691566, + "flos": 18333080242560.0, + "grad_norm": 2.3451981357461444, + "language_loss": 0.79248077, + "learning_rate": 3.8109156965708557e-06, + "loss": 0.81450188, + "num_input_tokens_seen": 59438970, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0234375, + "step": 2750, + "time_per_iteration": 2.4588990211486816 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.01045956, + "balance_loss_clip": 1.02657294, + "balance_loss_mlp": 1.05188382, + "epoch": 0.16539906808958366, + "flos": 22382115834240.0, + "grad_norm": 1.7653411133265118, + "language_loss": 0.95010567, + "learning_rate": 3.8107503606020455e-06, + "loss": 0.9720822, + "num_input_tokens_seen": 59458510, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.99609375, + "step": 2751, + "time_per_iteration": 2.4776439666748047 + }, + { + "auxiliary_loss_clip": 0.01152135, + "auxiliary_loss_mlp": 0.01045797, + "balance_loss_clip": 1.02762985, + "balance_loss_mlp": 1.05480134, + "epoch": 0.16545919134225162, + "flos": 22711093522560.0, + "grad_norm": 1.9833662518999209, + "language_loss": 0.71080822, + "learning_rate": 3.8105849559693997e-06, + "loss": 0.73278749, + "num_input_tokens_seen": 59477110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 2752, + "time_per_iteration": 2.4609227180480957 + }, + { + "auxiliary_loss_clip": 0.01051961, + "auxiliary_loss_mlp": 0.01021231, + "balance_loss_clip": 1.01878762, + "balance_loss_mlp": 1.01785779, + "epoch": 0.1655193145949196, + "flos": 67802974076160.0, + "grad_norm": 0.7698122762266473, + "language_loss": 0.54079807, + "learning_rate": 3.810419482679192e-06, + "loss": 0.56152999, + "num_input_tokens_seen": 59541155, + "router_z_loss_clip": 0.02441406, + "router_z_loss_mlp": 0.33984375, + "step": 2753, + "time_per_iteration": 3.161339282989502 + }, + { + "auxiliary_loss_clip": 0.01152964, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.05254793, + "epoch": 0.16557943784758755, + "flos": 24280389792000.0, + "grad_norm": 1.9686645345026932, + "language_loss": 0.75467873, + "learning_rate": 3.8102539407376954e-06, + "loss": 0.77662838, + "num_input_tokens_seen": 59561155, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2754, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.01160718, + "auxiliary_loss_mlp": 0.01060834, + "balance_loss_clip": 1.03875661, + "balance_loss_mlp": 1.05358946, + "epoch": 0.16563956110025552, + "flos": 20083617561600.0, + "grad_norm": 3.81944507319113, + "language_loss": 0.87154973, + "learning_rate": 3.810088330151188e-06, + "loss": 0.89376527, + "num_input_tokens_seen": 59580460, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0703125, + "step": 2755, + "time_per_iteration": 2.482780933380127 + }, + { + "auxiliary_loss_clip": 0.01148695, + "auxiliary_loss_mlp": 0.01052509, + "balance_loss_clip": 1.03348362, + "balance_loss_mlp": 1.04862666, + "epoch": 0.16569968435292348, + "flos": 28034454896640.0, + "grad_norm": 1.859731734913831, + "language_loss": 0.73258269, + "learning_rate": 3.80992265092595e-06, + "loss": 0.7545948, + "num_input_tokens_seen": 59600025, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2756, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01149207, + "auxiliary_loss_mlp": 0.01049415, + "balance_loss_clip": 1.02999544, + "balance_loss_mlp": 1.05331099, + "epoch": 0.16575980760559147, + "flos": 26250233598720.0, + "grad_norm": 1.6628427585054586, + "language_loss": 0.74967468, + "learning_rate": 3.8097569030682636e-06, + "loss": 0.77166092, + "num_input_tokens_seen": 59620600, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9609375, + "step": 2757, + "time_per_iteration": 2.5122530460357666 + }, + { + "auxiliary_loss_clip": 0.01154145, + "auxiliary_loss_mlp": 0.01044644, + "balance_loss_clip": 1.02590466, + "balance_loss_mlp": 1.05359447, + "epoch": 0.16581993085825944, + "flos": 26943955943040.0, + "grad_norm": 2.101183789218018, + "language_loss": 0.84532511, + "learning_rate": 3.8095910865844137e-06, + "loss": 0.86731303, + "num_input_tokens_seen": 59641385, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2758, + "time_per_iteration": 2.5268592834472656 + }, + { + "auxiliary_loss_clip": 0.01153935, + "auxiliary_loss_mlp": 0.01051485, + "balance_loss_clip": 1.03382993, + "balance_loss_mlp": 1.05355358, + "epoch": 0.1658800541109274, + "flos": 21653632103040.0, + "grad_norm": 3.016772390052645, + "language_loss": 0.79003322, + "learning_rate": 3.809425201480689e-06, + "loss": 0.81208748, + "num_input_tokens_seen": 59659865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 1.0, + "step": 2759, + "time_per_iteration": 2.468798875808716 + }, + { + "auxiliary_loss_clip": 0.01151828, + "auxiliary_loss_mlp": 0.01048294, + "balance_loss_clip": 1.02953088, + "balance_loss_mlp": 1.05121255, + "epoch": 0.16594017736359537, + "flos": 16435488643200.0, + "grad_norm": 4.81235802271706, + "language_loss": 0.75059134, + "learning_rate": 3.8092592477633793e-06, + "loss": 0.77259254, + "num_input_tokens_seen": 59678780, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2760, + "time_per_iteration": 2.459453582763672 + }, + { + "auxiliary_loss_clip": 0.01158028, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02139056, + "balance_loss_mlp": 1.05363011, + "epoch": 0.16600030061626334, + "flos": 22637297030400.0, + "grad_norm": 1.843496656605, + "language_loss": 0.73409051, + "learning_rate": 3.8090932254387774e-06, + "loss": 0.75607204, + "num_input_tokens_seen": 59698795, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.046875, + "step": 2761, + "time_per_iteration": 2.473264455795288 + }, + { + "auxiliary_loss_clip": 0.01154594, + "auxiliary_loss_mlp": 0.01046157, + "balance_loss_clip": 1.02709532, + "balance_loss_mlp": 1.05460942, + "epoch": 0.1660604238689313, + "flos": 26396569607040.0, + "grad_norm": 2.076392836835936, + "language_loss": 0.89255953, + "learning_rate": 3.8089271345131788e-06, + "loss": 0.91456699, + "num_input_tokens_seen": 59718795, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2762, + "time_per_iteration": 2.4917852878570557 + }, + { + "auxiliary_loss_clip": 0.01153346, + "auxiliary_loss_mlp": 0.01052721, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.0517025, + "epoch": 0.16612054712159927, + "flos": 23039999383680.0, + "grad_norm": 1.6634533311047424, + "language_loss": 0.87782222, + "learning_rate": 3.8087609749928822e-06, + "loss": 0.89988291, + "num_input_tokens_seen": 59737555, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2763, + "time_per_iteration": 2.48002028465271 + }, + { + "auxiliary_loss_clip": 0.01051809, + "auxiliary_loss_mlp": 0.01013596, + "balance_loss_clip": 1.01105642, + "balance_loss_mlp": 1.01786494, + "epoch": 0.16618067037426726, + "flos": 59241225202560.0, + "grad_norm": 0.7771287992078079, + "language_loss": 0.59777391, + "learning_rate": 3.8085947468841885e-06, + "loss": 0.61842799, + "num_input_tokens_seen": 59800915, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2764, + "time_per_iteration": 3.0722031593322754 + }, + { + "auxiliary_loss_clip": 0.01154679, + "auxiliary_loss_mlp": 0.01052765, + "balance_loss_clip": 1.03183234, + "balance_loss_mlp": 1.05292118, + "epoch": 0.16624079362693522, + "flos": 27198813916800.0, + "grad_norm": 1.8564974944455146, + "language_loss": 0.82349414, + "learning_rate": 3.808428450193401e-06, + "loss": 0.8455686, + "num_input_tokens_seen": 59822910, + "router_z_loss_clip": 0.20898438, + "router_z_loss_mlp": 1.015625, + "step": 2765, + "time_per_iteration": 2.5071089267730713 + }, + { + "auxiliary_loss_clip": 0.01161301, + "auxiliary_loss_mlp": 0.01048817, + "balance_loss_clip": 1.02758563, + "balance_loss_mlp": 1.05308914, + "epoch": 0.1663009168796032, + "flos": 10925068216320.0, + "grad_norm": 2.1954568630881566, + "language_loss": 0.70029616, + "learning_rate": 3.8082620849268244e-06, + "loss": 0.72239733, + "num_input_tokens_seen": 59838805, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.078125, + "step": 2766, + "time_per_iteration": 2.417538642883301 + }, + { + "auxiliary_loss_clip": 0.01153227, + "auxiliary_loss_mlp": 0.01045171, + "balance_loss_clip": 1.02669311, + "balance_loss_mlp": 1.05449462, + "epoch": 0.16636104013227115, + "flos": 17894431353600.0, + "grad_norm": 2.3642497854018174, + "language_loss": 0.88693011, + "learning_rate": 3.808095651090769e-06, + "loss": 0.90891409, + "num_input_tokens_seen": 59855345, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 2767, + "time_per_iteration": 2.447087287902832 + }, + { + "auxiliary_loss_clip": 0.01048639, + "auxiliary_loss_mlp": 0.01007692, + "balance_loss_clip": 1.0051651, + "balance_loss_mlp": 1.01474071, + "epoch": 0.16642116338493912, + "flos": 66726050463360.0, + "grad_norm": 0.659533193053428, + "language_loss": 0.52894622, + "learning_rate": 3.8079291486915447e-06, + "loss": 0.54950953, + "num_input_tokens_seen": 59917710, + "router_z_loss_clip": 0.02526855, + "router_z_loss_mlp": 0.33984375, + "step": 2768, + "time_per_iteration": 4.540286064147949 + }, + { + "auxiliary_loss_clip": 0.01156575, + "auxiliary_loss_mlp": 0.01051889, + "balance_loss_clip": 1.03196931, + "balance_loss_mlp": 1.05233693, + "epoch": 0.16648128663760708, + "flos": 19026048401280.0, + "grad_norm": 2.4421243199538543, + "language_loss": 0.84964579, + "learning_rate": 3.8077625777354667e-06, + "loss": 0.87173045, + "num_input_tokens_seen": 59935105, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.046875, + "step": 2769, + "time_per_iteration": 3.9888546466827393 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01007405, + "balance_loss_clip": 1.00486565, + "balance_loss_mlp": 1.01284146, + "epoch": 0.16654140989027508, + "flos": 70134976759680.0, + "grad_norm": 0.809970645404753, + "language_loss": 0.57417655, + "learning_rate": 3.80759593822885e-06, + "loss": 0.59471762, + "num_input_tokens_seen": 59984085, + "router_z_loss_clip": 0.02539062, + "router_z_loss_mlp": 0.33984375, + "step": 2770, + "time_per_iteration": 2.909212350845337 + }, + { + "auxiliary_loss_clip": 0.01045765, + "auxiliary_loss_mlp": 0.01004174, + "balance_loss_clip": 1.00161099, + "balance_loss_mlp": 1.0120976, + "epoch": 0.16660153314294304, + "flos": 70272406195200.0, + "grad_norm": 0.8642108743281017, + "language_loss": 0.5621168, + "learning_rate": 3.807429230178015e-06, + "loss": 0.58261615, + "num_input_tokens_seen": 60043470, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.3359375, + "step": 2771, + "time_per_iteration": 2.9000375270843506 + }, + { + "auxiliary_loss_clip": 0.01152287, + "auxiliary_loss_mlp": 0.01058074, + "balance_loss_clip": 1.03741515, + "balance_loss_mlp": 1.05137527, + "epoch": 0.166661656395611, + "flos": 23075048079360.0, + "grad_norm": 2.4271023422086593, + "language_loss": 0.70461071, + "learning_rate": 3.8072624535892817e-06, + "loss": 0.72671425, + "num_input_tokens_seen": 60063045, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0078125, + "step": 2772, + "time_per_iteration": 2.45868182182312 + }, + { + "auxiliary_loss_clip": 0.01150213, + "auxiliary_loss_mlp": 0.01052488, + "balance_loss_clip": 1.03305721, + "balance_loss_mlp": 1.04914951, + "epoch": 0.16672177964827897, + "flos": 28366341586560.0, + "grad_norm": 1.8764675289735346, + "language_loss": 0.86201918, + "learning_rate": 3.807095608468975e-06, + "loss": 0.8840462, + "num_input_tokens_seen": 60081945, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0078125, + "step": 2773, + "time_per_iteration": 2.513784885406494 + }, + { + "auxiliary_loss_clip": 0.01152492, + "auxiliary_loss_mlp": 0.01046232, + "balance_loss_clip": 1.02808821, + "balance_loss_mlp": 1.05230188, + "epoch": 0.16678190290094694, + "flos": 19091010147840.0, + "grad_norm": 2.2216439453760595, + "language_loss": 0.81859678, + "learning_rate": 3.8069286948234224e-06, + "loss": 0.84058398, + "num_input_tokens_seen": 60096820, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2774, + "time_per_iteration": 2.4288830757141113 + }, + { + "auxiliary_loss_clip": 0.01155539, + "auxiliary_loss_mlp": 0.0104957, + "balance_loss_clip": 1.02955508, + "balance_loss_mlp": 1.05290627, + "epoch": 0.1668420261536149, + "flos": 21799106184960.0, + "grad_norm": 2.1125697386324576, + "language_loss": 0.83287829, + "learning_rate": 3.806761712658952e-06, + "loss": 0.85492939, + "num_input_tokens_seen": 60116140, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.0234375, + "step": 2775, + "time_per_iteration": 2.4773504734039307 + }, + { + "auxiliary_loss_clip": 0.01151, + "auxiliary_loss_mlp": 0.01053902, + "balance_loss_clip": 1.03599668, + "balance_loss_mlp": 1.0527029, + "epoch": 0.16690214940628287, + "flos": 19062533640960.0, + "grad_norm": 1.9011936520028738, + "language_loss": 0.80721045, + "learning_rate": 3.806594661981897e-06, + "loss": 0.82925946, + "num_input_tokens_seen": 60134235, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 2776, + "time_per_iteration": 2.4736995697021484 + }, + { + "auxiliary_loss_clip": 0.01147621, + "auxiliary_loss_mlp": 0.01053383, + "balance_loss_clip": 1.03457189, + "balance_loss_mlp": 1.05260348, + "epoch": 0.16696227265895086, + "flos": 18588548747520.0, + "grad_norm": 1.7922512358148395, + "language_loss": 0.798361, + "learning_rate": 3.8064275427985906e-06, + "loss": 0.82037103, + "num_input_tokens_seen": 60153275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.953125, + "step": 2777, + "time_per_iteration": 2.4625258445739746 + }, + { + "auxiliary_loss_clip": 0.01149386, + "auxiliary_loss_mlp": 0.0105028, + "balance_loss_clip": 1.0313735, + "balance_loss_mlp": 1.05002642, + "epoch": 0.16702239591161883, + "flos": 23294139085440.0, + "grad_norm": 1.8218923631286437, + "language_loss": 0.85132945, + "learning_rate": 3.806260355115371e-06, + "loss": 0.87332618, + "num_input_tokens_seen": 60173215, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 2778, + "time_per_iteration": 2.4819412231445312 + }, + { + "auxiliary_loss_clip": 0.01154381, + "auxiliary_loss_mlp": 0.0104532, + "balance_loss_clip": 1.02626991, + "balance_loss_mlp": 1.05222583, + "epoch": 0.1670825191642868, + "flos": 24425648392320.0, + "grad_norm": 2.6489491047564826, + "language_loss": 0.74133682, + "learning_rate": 3.8060930989385778e-06, + "loss": 0.76333386, + "num_input_tokens_seen": 60190515, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2779, + "time_per_iteration": 2.510207176208496 + }, + { + "auxiliary_loss_clip": 0.0115174, + "auxiliary_loss_mlp": 0.01045601, + "balance_loss_clip": 1.02625358, + "balance_loss_mlp": 1.05116367, + "epoch": 0.16714264241695476, + "flos": 26797512193920.0, + "grad_norm": 2.2761441742273663, + "language_loss": 0.65382051, + "learning_rate": 3.805925774274554e-06, + "loss": 0.67579395, + "num_input_tokens_seen": 60211655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0078125, + "step": 2780, + "time_per_iteration": 2.5250439643859863 + }, + { + "auxiliary_loss_clip": 0.01150325, + "auxiliary_loss_mlp": 0.01048314, + "balance_loss_clip": 1.02856088, + "balance_loss_mlp": 1.05120933, + "epoch": 0.16720276566962272, + "flos": 21835304115840.0, + "grad_norm": 2.0602280440022382, + "language_loss": 0.78563058, + "learning_rate": 3.805758381129643e-06, + "loss": 0.80761701, + "num_input_tokens_seen": 60230860, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9921875, + "step": 2781, + "time_per_iteration": 2.4921979904174805 + }, + { + "auxiliary_loss_clip": 0.01153739, + "auxiliary_loss_mlp": 0.01049181, + "balance_loss_clip": 1.03065586, + "balance_loss_mlp": 1.05227423, + "epoch": 0.1672628889222907, + "flos": 21470415805440.0, + "grad_norm": 1.480266857331911, + "language_loss": 0.75262564, + "learning_rate": 3.805590919510193e-06, + "loss": 0.77465487, + "num_input_tokens_seen": 60250535, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2782, + "time_per_iteration": 2.468590021133423 + }, + { + "auxiliary_loss_clip": 0.01159372, + "auxiliary_loss_mlp": 0.01052642, + "balance_loss_clip": 1.03141046, + "balance_loss_mlp": 1.05443954, + "epoch": 0.16732301217495865, + "flos": 30774008269440.0, + "grad_norm": 1.999958464394936, + "language_loss": 0.67841566, + "learning_rate": 3.8054233894225547e-06, + "loss": 0.70053571, + "num_input_tokens_seen": 60269530, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2783, + "time_per_iteration": 2.5312225818634033 + }, + { + "auxiliary_loss_clip": 0.01153889, + "auxiliary_loss_mlp": 0.01050749, + "balance_loss_clip": 1.03193808, + "balance_loss_mlp": 1.0538497, + "epoch": 0.16738313542762664, + "flos": 23474625949440.0, + "grad_norm": 2.209785525271013, + "language_loss": 0.70028126, + "learning_rate": 3.805255790873081e-06, + "loss": 0.72232759, + "num_input_tokens_seen": 60289900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2784, + "time_per_iteration": 2.4932820796966553 + }, + { + "auxiliary_loss_clip": 0.01154602, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03087902, + "balance_loss_mlp": 1.05120277, + "epoch": 0.1674432586802946, + "flos": 29789086366080.0, + "grad_norm": 1.9638597335511054, + "language_loss": 0.60441053, + "learning_rate": 3.805088123868126e-06, + "loss": 0.62647516, + "num_input_tokens_seen": 60310025, + "router_z_loss_clip": 0.20996094, + "router_z_loss_mlp": 1.03125, + "step": 2785, + "time_per_iteration": 2.527010440826416 + }, + { + "auxiliary_loss_clip": 0.0104901, + "auxiliary_loss_mlp": 0.01029558, + "balance_loss_clip": 1.02681625, + "balance_loss_mlp": 1.01595187, + "epoch": 0.16750338193296258, + "flos": 66136073575680.0, + "grad_norm": 0.8343482124814343, + "language_loss": 0.588, + "learning_rate": 3.8049203884140492e-06, + "loss": 0.60878569, + "num_input_tokens_seen": 60377800, + "router_z_loss_clip": 0.02746582, + "router_z_loss_mlp": 0.33007812, + "step": 2786, + "time_per_iteration": 3.1062281131744385 + }, + { + "auxiliary_loss_clip": 0.0115343, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.0253408, + "balance_loss_mlp": 1.05108333, + "epoch": 0.16756350518563054, + "flos": 25696777864320.0, + "grad_norm": 1.9494651562196093, + "language_loss": 0.75846571, + "learning_rate": 3.80475258451721e-06, + "loss": 0.78044844, + "num_input_tokens_seen": 60398215, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2787, + "time_per_iteration": 2.51383900642395 + }, + { + "auxiliary_loss_clip": 0.0115361, + "auxiliary_loss_mlp": 0.0104169, + "balance_loss_clip": 1.02287841, + "balance_loss_mlp": 1.05218899, + "epoch": 0.1676236284382985, + "flos": 23836102467840.0, + "grad_norm": 2.088538847955111, + "language_loss": 0.77615869, + "learning_rate": 3.804584712183972e-06, + "loss": 0.79811174, + "num_input_tokens_seen": 60416910, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2788, + "time_per_iteration": 2.4926373958587646 + }, + { + "auxiliary_loss_clip": 0.01048965, + "auxiliary_loss_mlp": 0.01004104, + "balance_loss_clip": 1.00154078, + "balance_loss_mlp": 1.01582766, + "epoch": 0.16768375169096647, + "flos": 59874902985600.0, + "grad_norm": 0.861309286667726, + "language_loss": 0.59360403, + "learning_rate": 3.8044167714207013e-06, + "loss": 0.61413473, + "num_input_tokens_seen": 60468660, + "router_z_loss_clip": 0.02563477, + "router_z_loss_mlp": 0.33203125, + "step": 2789, + "time_per_iteration": 2.9390883445739746 + }, + { + "auxiliary_loss_clip": 0.01153417, + "auxiliary_loss_mlp": 0.01052728, + "balance_loss_clip": 1.03262937, + "balance_loss_mlp": 1.05115533, + "epoch": 0.16774387494363446, + "flos": 38435657207040.0, + "grad_norm": 1.8582032581880512, + "language_loss": 0.70117038, + "learning_rate": 3.804248762233765e-06, + "loss": 0.72323185, + "num_input_tokens_seen": 60492370, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.015625, + "step": 2790, + "time_per_iteration": 2.6337287425994873 + }, + { + "auxiliary_loss_clip": 0.01154528, + "auxiliary_loss_mlp": 0.01057043, + "balance_loss_clip": 1.03852975, + "balance_loss_mlp": 1.05254579, + "epoch": 0.16780399819630243, + "flos": 22637620252800.0, + "grad_norm": 1.9267324208283758, + "language_loss": 0.7914235, + "learning_rate": 3.8040806846295356e-06, + "loss": 0.81353921, + "num_input_tokens_seen": 60512655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0234375, + "step": 2791, + "time_per_iteration": 2.4992258548736572 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.02807093, + "balance_loss_mlp": 1.05311096, + "epoch": 0.1678641214489704, + "flos": 32891516887680.0, + "grad_norm": 1.670563786806713, + "language_loss": 0.71465087, + "learning_rate": 3.8039125386143853e-06, + "loss": 0.73666936, + "num_input_tokens_seen": 60533090, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2792, + "time_per_iteration": 2.5886104106903076 + }, + { + "auxiliary_loss_clip": 0.01154826, + "auxiliary_loss_mlp": 0.01045884, + "balance_loss_clip": 1.02648878, + "balance_loss_mlp": 1.05179656, + "epoch": 0.16792424470163836, + "flos": 19974916028160.0, + "grad_norm": 2.423044729867527, + "language_loss": 0.72166264, + "learning_rate": 3.803744324194691e-06, + "loss": 0.74366981, + "num_input_tokens_seen": 60553190, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.03125, + "step": 2793, + "time_per_iteration": 2.5197043418884277 + }, + { + "auxiliary_loss_clip": 0.01153184, + "auxiliary_loss_mlp": 0.01054586, + "balance_loss_clip": 1.03502417, + "balance_loss_mlp": 1.05135465, + "epoch": 0.16798436795430632, + "flos": 19719878486400.0, + "grad_norm": 1.9474647186442988, + "language_loss": 0.77305138, + "learning_rate": 3.803576041376831e-06, + "loss": 0.79512912, + "num_input_tokens_seen": 60571995, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.015625, + "step": 2794, + "time_per_iteration": 2.467292547225952 + }, + { + "auxiliary_loss_clip": 0.01154384, + "auxiliary_loss_mlp": 0.01054467, + "balance_loss_clip": 1.03558397, + "balance_loss_mlp": 1.05253601, + "epoch": 0.1680444912069743, + "flos": 28104839596800.0, + "grad_norm": 2.2742759048834578, + "language_loss": 0.71613103, + "learning_rate": 3.803407690167187e-06, + "loss": 0.7382195, + "num_input_tokens_seen": 60591275, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2795, + "time_per_iteration": 2.5272278785705566 + }, + { + "auxiliary_loss_clip": 0.01149377, + "auxiliary_loss_mlp": 0.01044626, + "balance_loss_clip": 1.02592218, + "balance_loss_mlp": 1.04932868, + "epoch": 0.16810461445964225, + "flos": 18075205526400.0, + "grad_norm": 1.942494339721957, + "language_loss": 0.83784455, + "learning_rate": 3.803239270572142e-06, + "loss": 0.8597846, + "num_input_tokens_seen": 60609235, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2796, + "time_per_iteration": 2.448528289794922 + }, + { + "auxiliary_loss_clip": 0.01152862, + "auxiliary_loss_mlp": 0.01059215, + "balance_loss_clip": 1.03911614, + "balance_loss_mlp": 1.04904127, + "epoch": 0.16816473771231025, + "flos": 23878657105920.0, + "grad_norm": 1.6778887705488965, + "language_loss": 0.8109591, + "learning_rate": 3.8030707825980838e-06, + "loss": 0.83307993, + "num_input_tokens_seen": 60629880, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0390625, + "step": 2797, + "time_per_iteration": 2.5044567584991455 + }, + { + "auxiliary_loss_clip": 0.01147186, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02766752, + "balance_loss_mlp": 1.05142093, + "epoch": 0.1682248609649782, + "flos": 22783597125120.0, + "grad_norm": 1.4189820060365406, + "language_loss": 0.74740726, + "learning_rate": 3.802902226251401e-06, + "loss": 0.76932257, + "num_input_tokens_seen": 60651175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.95703125, + "step": 2798, + "time_per_iteration": 2.4913666248321533 + }, + { + "auxiliary_loss_clip": 0.01154688, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03250098, + "balance_loss_mlp": 1.05462337, + "epoch": 0.16828498421764618, + "flos": 20705123612160.0, + "grad_norm": 1.8962576537558784, + "language_loss": 0.79592311, + "learning_rate": 3.8027336015384845e-06, + "loss": 0.81796914, + "num_input_tokens_seen": 60670210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 1.0, + "step": 2799, + "time_per_iteration": 2.4844021797180176 + }, + { + "auxiliary_loss_clip": 0.01153892, + "auxiliary_loss_mlp": 0.01046392, + "balance_loss_clip": 1.02597189, + "balance_loss_mlp": 1.04983997, + "epoch": 0.16834510747031414, + "flos": 29420606695680.0, + "grad_norm": 2.7819182919151455, + "language_loss": 0.70778632, + "learning_rate": 3.8025649084657296e-06, + "loss": 0.72978926, + "num_input_tokens_seen": 60690895, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0390625, + "step": 2800, + "time_per_iteration": 2.548715829849243 + }, + { + "auxiliary_loss_clip": 0.01148463, + "auxiliary_loss_mlp": 0.01043839, + "balance_loss_clip": 1.02365637, + "balance_loss_mlp": 1.04882574, + "epoch": 0.1684052307229821, + "flos": 18145374744960.0, + "grad_norm": 1.9135359518782422, + "language_loss": 0.83549178, + "learning_rate": 3.8023961470395326e-06, + "loss": 0.85741478, + "num_input_tokens_seen": 60708280, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2801, + "time_per_iteration": 2.456601858139038 + }, + { + "auxiliary_loss_clip": 0.01149997, + "auxiliary_loss_mlp": 0.01052315, + "balance_loss_clip": 1.03355145, + "balance_loss_mlp": 1.04947591, + "epoch": 0.16846535397565007, + "flos": 16574929240320.0, + "grad_norm": 2.757874152621573, + "language_loss": 0.822721, + "learning_rate": 3.8022273172662933e-06, + "loss": 0.84474415, + "num_input_tokens_seen": 60724150, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0078125, + "step": 2802, + "time_per_iteration": 2.4426534175872803 + }, + { + "auxiliary_loss_clip": 0.01153107, + "auxiliary_loss_mlp": 0.01047694, + "balance_loss_clip": 1.02764344, + "balance_loss_mlp": 1.05123353, + "epoch": 0.16852547722831807, + "flos": 30408868563840.0, + "grad_norm": 1.4855905624355255, + "language_loss": 0.81064272, + "learning_rate": 3.802058419152413e-06, + "loss": 0.83265072, + "num_input_tokens_seen": 60746485, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2803, + "time_per_iteration": 2.5615930557250977 + }, + { + "auxiliary_loss_clip": 0.01150255, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02693641, + "balance_loss_mlp": 1.05246449, + "epoch": 0.16858560048098603, + "flos": 33507420416640.0, + "grad_norm": 2.2799183114600545, + "language_loss": 0.7645762, + "learning_rate": 3.801889452704297e-06, + "loss": 0.78653532, + "num_input_tokens_seen": 60762875, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 2804, + "time_per_iteration": 2.541059970855713 + }, + { + "auxiliary_loss_clip": 0.01045818, + "auxiliary_loss_mlp": 0.01026702, + "balance_loss_clip": 1.02452028, + "balance_loss_mlp": 1.01328063, + "epoch": 0.168645723733654, + "flos": 67370502326400.0, + "grad_norm": 0.8620881286764229, + "language_loss": 0.55414748, + "learning_rate": 3.8017204179283526e-06, + "loss": 0.57487267, + "num_input_tokens_seen": 60825510, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 2805, + "time_per_iteration": 3.033358573913574 + }, + { + "auxiliary_loss_clip": 0.01144187, + "auxiliary_loss_mlp": 0.01039064, + "balance_loss_clip": 1.02161169, + "balance_loss_mlp": 1.04741919, + "epoch": 0.16870584698632196, + "flos": 21324618501120.0, + "grad_norm": 1.9122963285347783, + "language_loss": 0.73038024, + "learning_rate": 3.8015513148309892e-06, + "loss": 0.75221276, + "num_input_tokens_seen": 60844440, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 2806, + "time_per_iteration": 2.4699463844299316 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01045307, + "balance_loss_clip": 1.02712786, + "balance_loss_mlp": 1.05072176, + "epoch": 0.16876597023898993, + "flos": 20740746925440.0, + "grad_norm": 1.9407491705316076, + "language_loss": 0.69966477, + "learning_rate": 3.80138214341862e-06, + "loss": 0.7216025, + "num_input_tokens_seen": 60863210, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2807, + "time_per_iteration": 2.4583139419555664 + }, + { + "auxiliary_loss_clip": 0.01149832, + "auxiliary_loss_mlp": 0.01051611, + "balance_loss_clip": 1.03196526, + "balance_loss_mlp": 1.05013919, + "epoch": 0.1688260934916579, + "flos": 20303498666880.0, + "grad_norm": 2.8028706291815912, + "language_loss": 0.70265883, + "learning_rate": 3.8012129036976587e-06, + "loss": 0.72467327, + "num_input_tokens_seen": 60882510, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9921875, + "step": 2808, + "time_per_iteration": 2.4724719524383545 + }, + { + "auxiliary_loss_clip": 0.01154296, + "auxiliary_loss_mlp": 0.0104775, + "balance_loss_clip": 1.02792549, + "balance_loss_mlp": 1.05130935, + "epoch": 0.16888621674432586, + "flos": 20340702178560.0, + "grad_norm": 2.1293629398657954, + "language_loss": 0.80103064, + "learning_rate": 3.8010435956745236e-06, + "loss": 0.8230511, + "num_input_tokens_seen": 60901105, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.03125, + "step": 2809, + "time_per_iteration": 3.844451427459717 + }, + { + "auxiliary_loss_clip": 0.01155336, + "auxiliary_loss_mlp": 0.01051942, + "balance_loss_clip": 1.03301144, + "balance_loss_mlp": 1.050385, + "epoch": 0.16894633999699385, + "flos": 16244802316800.0, + "grad_norm": 2.0909159229075245, + "language_loss": 0.88465077, + "learning_rate": 3.8008742193556358e-06, + "loss": 0.9067235, + "num_input_tokens_seen": 60915340, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.046875, + "step": 2810, + "time_per_iteration": 5.43256688117981 + }, + { + "auxiliary_loss_clip": 0.0115459, + "auxiliary_loss_mlp": 0.01052284, + "balance_loss_clip": 1.03238845, + "balance_loss_mlp": 1.05188894, + "epoch": 0.16900646324966181, + "flos": 19610171372160.0, + "grad_norm": 2.324870160833927, + "language_loss": 0.92483926, + "learning_rate": 3.800704774747416e-06, + "loss": 0.94690794, + "num_input_tokens_seen": 60933735, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.03125, + "step": 2811, + "time_per_iteration": 2.4633538722991943 + }, + { + "auxiliary_loss_clip": 0.01157458, + "auxiliary_loss_mlp": 0.01049775, + "balance_loss_clip": 1.03154814, + "balance_loss_mlp": 1.05537057, + "epoch": 0.16906658650232978, + "flos": 22018089450240.0, + "grad_norm": 20.150047321728213, + "language_loss": 0.78719699, + "learning_rate": 3.800535261856291e-06, + "loss": 0.80926931, + "num_input_tokens_seen": 60953105, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0234375, + "step": 2812, + "time_per_iteration": 2.475893974304199 + }, + { + "auxiliary_loss_clip": 0.01154531, + "auxiliary_loss_mlp": 0.01053249, + "balance_loss_clip": 1.0353322, + "balance_loss_mlp": 1.05427527, + "epoch": 0.16912670975499774, + "flos": 11763690024960.0, + "grad_norm": 2.3708558754635103, + "language_loss": 0.7492249, + "learning_rate": 3.8003656806886887e-06, + "loss": 0.7713027, + "num_input_tokens_seen": 60969150, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 1.0078125, + "step": 2813, + "time_per_iteration": 2.4622457027435303 + }, + { + "auxiliary_loss_clip": 0.01155154, + "auxiliary_loss_mlp": 0.01047749, + "balance_loss_clip": 1.02862835, + "balance_loss_mlp": 1.05231524, + "epoch": 0.1691868330076657, + "flos": 17161386595200.0, + "grad_norm": 2.6643465032783955, + "language_loss": 0.69000697, + "learning_rate": 3.8001960312510396e-06, + "loss": 0.71203601, + "num_input_tokens_seen": 60982825, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2814, + "time_per_iteration": 2.442352771759033 + }, + { + "auxiliary_loss_clip": 0.01152587, + "auxiliary_loss_mlp": 0.01048898, + "balance_loss_clip": 1.03032494, + "balance_loss_mlp": 1.05269694, + "epoch": 0.16924695626033368, + "flos": 22416553998720.0, + "grad_norm": 3.3683342322522543, + "language_loss": 0.61842358, + "learning_rate": 3.800026313549776e-06, + "loss": 0.64043844, + "num_input_tokens_seen": 61000875, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2815, + "time_per_iteration": 2.4859516620635986 + }, + { + "auxiliary_loss_clip": 0.01149812, + "auxiliary_loss_mlp": 0.01050269, + "balance_loss_clip": 1.03179121, + "balance_loss_mlp": 1.05104065, + "epoch": 0.16930707951300164, + "flos": 25739655724800.0, + "grad_norm": 1.9947957584318596, + "language_loss": 0.81983805, + "learning_rate": 3.7998565275913342e-06, + "loss": 0.84183884, + "num_input_tokens_seen": 61021940, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 2816, + "time_per_iteration": 2.5549440383911133 + }, + { + "auxiliary_loss_clip": 0.01156016, + "auxiliary_loss_mlp": 0.01049677, + "balance_loss_clip": 1.03072321, + "balance_loss_mlp": 1.05379295, + "epoch": 0.16936720276566963, + "flos": 22747040058240.0, + "grad_norm": 2.502019531770294, + "language_loss": 0.8722589, + "learning_rate": 3.799686673382153e-06, + "loss": 0.89431584, + "num_input_tokens_seen": 61040285, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2817, + "time_per_iteration": 2.4906835556030273 + }, + { + "auxiliary_loss_clip": 0.01152128, + "auxiliary_loss_mlp": 0.01051154, + "balance_loss_clip": 1.03200889, + "balance_loss_mlp": 1.05302715, + "epoch": 0.1694273260183376, + "flos": 19573973441280.0, + "grad_norm": 1.7787508021643152, + "language_loss": 0.81666476, + "learning_rate": 3.799516750928672e-06, + "loss": 0.83869755, + "num_input_tokens_seen": 61059020, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2818, + "time_per_iteration": 2.4673428535461426 + }, + { + "auxiliary_loss_clip": 0.01151603, + "auxiliary_loss_mlp": 0.01052661, + "balance_loss_clip": 1.03339636, + "balance_loss_mlp": 1.05154157, + "epoch": 0.16948744927100556, + "flos": 12457843332480.0, + "grad_norm": 5.791836374282792, + "language_loss": 0.80712807, + "learning_rate": 3.799346760237336e-06, + "loss": 0.8291707, + "num_input_tokens_seen": 61074245, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0, + "step": 2819, + "time_per_iteration": 2.43947434425354 + }, + { + "auxiliary_loss_clip": 0.01048844, + "auxiliary_loss_mlp": 0.01007246, + "balance_loss_clip": 1.00504076, + "balance_loss_mlp": 1.01552486, + "epoch": 0.16954757252367353, + "flos": 71291694435840.0, + "grad_norm": 0.9491282523447765, + "language_loss": 0.61080176, + "learning_rate": 3.7991767013145902e-06, + "loss": 0.63136268, + "num_input_tokens_seen": 61127080, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 2820, + "time_per_iteration": 3.008953809738159 + }, + { + "auxiliary_loss_clip": 0.01152835, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.031335, + "balance_loss_mlp": 1.05163527, + "epoch": 0.1696076957763415, + "flos": 29606516513280.0, + "grad_norm": 2.1013484538112097, + "language_loss": 0.78625357, + "learning_rate": 3.7990065741668844e-06, + "loss": 0.808281, + "num_input_tokens_seen": 61146955, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2821, + "time_per_iteration": 2.5363481044769287 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01056486, + "balance_loss_clip": 1.03667343, + "balance_loss_mlp": 1.05229986, + "epoch": 0.16966781902900946, + "flos": 24388588535040.0, + "grad_norm": 2.87583667245789, + "language_loss": 0.78450388, + "learning_rate": 3.7988363788006685e-06, + "loss": 0.80659759, + "num_input_tokens_seen": 61166605, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0078125, + "step": 2822, + "time_per_iteration": 2.4969065189361572 + }, + { + "auxiliary_loss_clip": 0.01147495, + "auxiliary_loss_mlp": 0.01050996, + "balance_loss_clip": 1.03299582, + "balance_loss_mlp": 1.04956698, + "epoch": 0.16972794228167745, + "flos": 23038814234880.0, + "grad_norm": 1.9220487825624015, + "language_loss": 0.75016022, + "learning_rate": 3.7986661152223967e-06, + "loss": 0.77214515, + "num_input_tokens_seen": 61186535, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2823, + "time_per_iteration": 2.491588830947876 + }, + { + "auxiliary_loss_clip": 0.01151822, + "auxiliary_loss_mlp": 0.0105186, + "balance_loss_clip": 1.03198779, + "balance_loss_mlp": 1.05209637, + "epoch": 0.16978806553434542, + "flos": 35228691129600.0, + "grad_norm": 1.9648811068121905, + "language_loss": 0.60514438, + "learning_rate": 3.7984957834385257e-06, + "loss": 0.62718117, + "num_input_tokens_seen": 61208965, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.99609375, + "step": 2824, + "time_per_iteration": 2.6178910732269287 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01040113, + "balance_loss_clip": 1.02030015, + "balance_loss_mlp": 1.05367076, + "epoch": 0.16984818878701338, + "flos": 32014290936960.0, + "grad_norm": 1.6856049786717988, + "language_loss": 0.73004806, + "learning_rate": 3.7983253834555144e-06, + "loss": 0.75196874, + "num_input_tokens_seen": 61230670, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98046875, + "step": 2825, + "time_per_iteration": 2.559774398803711 + }, + { + "auxiliary_loss_clip": 0.01155697, + "auxiliary_loss_mlp": 0.01054546, + "balance_loss_clip": 1.03321934, + "balance_loss_mlp": 1.0505774, + "epoch": 0.16990831203968135, + "flos": 22818609907200.0, + "grad_norm": 1.7849035157466668, + "language_loss": 0.85660541, + "learning_rate": 3.7981549152798245e-06, + "loss": 0.87870789, + "num_input_tokens_seen": 61249510, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.0546875, + "step": 2826, + "time_per_iteration": 2.4860360622406006 + }, + { + "auxiliary_loss_clip": 0.0115502, + "auxiliary_loss_mlp": 0.01050706, + "balance_loss_clip": 1.03164482, + "balance_loss_mlp": 1.0515151, + "epoch": 0.1699684352923493, + "flos": 23039604334080.0, + "grad_norm": 2.3205594057943175, + "language_loss": 0.8232255, + "learning_rate": 3.7979843789179196e-06, + "loss": 0.84528267, + "num_input_tokens_seen": 61269440, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.03125, + "step": 2827, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01153252, + "auxiliary_loss_mlp": 0.01049837, + "balance_loss_clip": 1.02965498, + "balance_loss_mlp": 1.05059743, + "epoch": 0.17002855854501728, + "flos": 21434110133760.0, + "grad_norm": 2.393760877815214, + "language_loss": 0.73652613, + "learning_rate": 3.797813774376267e-06, + "loss": 0.75855708, + "num_input_tokens_seen": 61288195, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.0234375, + "step": 2828, + "time_per_iteration": 2.5726237297058105 + }, + { + "auxiliary_loss_clip": 0.01046718, + "auxiliary_loss_mlp": 0.01008554, + "balance_loss_clip": 1.00625372, + "balance_loss_mlp": 1.01360035, + "epoch": 0.17008868179768524, + "flos": 71453509205760.0, + "grad_norm": 0.76062911359866, + "language_loss": 0.56446254, + "learning_rate": 3.797643101661336e-06, + "loss": 0.5850153, + "num_input_tokens_seen": 61350850, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.33203125, + "step": 2829, + "time_per_iteration": 3.1035284996032715 + }, + { + "auxiliary_loss_clip": 0.01148906, + "auxiliary_loss_mlp": 0.01048452, + "balance_loss_clip": 1.02912867, + "balance_loss_mlp": 1.04916263, + "epoch": 0.17014880505035324, + "flos": 24900315644160.0, + "grad_norm": 1.7229604876305038, + "language_loss": 0.83673382, + "learning_rate": 3.7974723607795983e-06, + "loss": 0.85870743, + "num_input_tokens_seen": 61370765, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.99609375, + "step": 2830, + "time_per_iteration": 2.5140810012817383 + }, + { + "auxiliary_loss_clip": 0.01150805, + "auxiliary_loss_mlp": 0.01048567, + "balance_loss_clip": 1.02792013, + "balance_loss_mlp": 1.04919207, + "epoch": 0.1702089283030212, + "flos": 29862415981440.0, + "grad_norm": 2.0065309441313337, + "language_loss": 0.77852297, + "learning_rate": 3.797301551737529e-06, + "loss": 0.80051666, + "num_input_tokens_seen": 61388935, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.015625, + "step": 2831, + "time_per_iteration": 2.524578094482422 + }, + { + "auxiliary_loss_clip": 0.01152075, + "auxiliary_loss_mlp": 0.01050912, + "balance_loss_clip": 1.03013349, + "balance_loss_mlp": 1.04948521, + "epoch": 0.17026905155568917, + "flos": 17744180762880.0, + "grad_norm": 2.1211873867699285, + "language_loss": 0.79345167, + "learning_rate": 3.7971306745416044e-06, + "loss": 0.81548154, + "num_input_tokens_seen": 61407350, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0234375, + "step": 2832, + "time_per_iteration": 2.459954261779785 + }, + { + "auxiliary_loss_clip": 0.01151972, + "auxiliary_loss_mlp": 0.0104718, + "balance_loss_clip": 1.02836847, + "balance_loss_mlp": 1.05050385, + "epoch": 0.17032917480835713, + "flos": 23148665003520.0, + "grad_norm": 1.9382017652854369, + "language_loss": 0.89026237, + "learning_rate": 3.7969597291983046e-06, + "loss": 0.91225392, + "num_input_tokens_seen": 61429010, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 2833, + "time_per_iteration": 2.4812114238739014 + }, + { + "auxiliary_loss_clip": 0.0115284, + "auxiliary_loss_mlp": 0.01048999, + "balance_loss_clip": 1.02963924, + "balance_loss_mlp": 1.05124569, + "epoch": 0.1703892980610251, + "flos": 39202565512320.0, + "grad_norm": 2.853060698790674, + "language_loss": 0.72425497, + "learning_rate": 3.7967887157141115e-06, + "loss": 0.74627328, + "num_input_tokens_seen": 61450040, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.015625, + "step": 2834, + "time_per_iteration": 2.6271297931671143 + }, + { + "auxiliary_loss_clip": 0.01156378, + "auxiliary_loss_mlp": 0.01058486, + "balance_loss_clip": 1.03894782, + "balance_loss_mlp": 1.05294132, + "epoch": 0.17044942131369306, + "flos": 23039101543680.0, + "grad_norm": 1.9954265429463485, + "language_loss": 0.86434042, + "learning_rate": 3.7966176340955106e-06, + "loss": 0.88648909, + "num_input_tokens_seen": 61468585, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.03125, + "step": 2835, + "time_per_iteration": 2.4804999828338623 + }, + { + "auxiliary_loss_clip": 0.01155592, + "auxiliary_loss_mlp": 0.01051963, + "balance_loss_clip": 1.03007674, + "balance_loss_mlp": 1.05081642, + "epoch": 0.17050954456636103, + "flos": 17054983532160.0, + "grad_norm": 1.9180646463430515, + "language_loss": 0.73242748, + "learning_rate": 3.796446484348989e-06, + "loss": 0.75450307, + "num_input_tokens_seen": 61486330, + "router_z_loss_clip": 0.21875, + "router_z_loss_mlp": 1.046875, + "step": 2836, + "time_per_iteration": 2.4694178104400635 + }, + { + "auxiliary_loss_clip": 0.01156947, + "auxiliary_loss_mlp": 0.01048414, + "balance_loss_clip": 1.02599072, + "balance_loss_mlp": 1.05033076, + "epoch": 0.17056966781902902, + "flos": 16836969934080.0, + "grad_norm": 2.1253309510576717, + "language_loss": 0.79653537, + "learning_rate": 3.796275266481036e-06, + "loss": 0.81858897, + "num_input_tokens_seen": 61503950, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.0703125, + "step": 2837, + "time_per_iteration": 2.452153444290161 + }, + { + "auxiliary_loss_clip": 0.01150588, + "auxiliary_loss_mlp": 0.01045279, + "balance_loss_clip": 1.02550185, + "balance_loss_mlp": 1.05232143, + "epoch": 0.17062979107169698, + "flos": 17712543859200.0, + "grad_norm": 2.19906443062711, + "language_loss": 0.83575213, + "learning_rate": 3.7961039804981456e-06, + "loss": 0.85771078, + "num_input_tokens_seen": 61523550, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 2838, + "time_per_iteration": 2.479573965072632 + }, + { + "auxiliary_loss_clip": 0.01148981, + "auxiliary_loss_mlp": 0.01045249, + "balance_loss_clip": 1.02660489, + "balance_loss_mlp": 1.05069315, + "epoch": 0.17068991432436495, + "flos": 22525040050560.0, + "grad_norm": 1.7423496230624245, + "language_loss": 0.93620354, + "learning_rate": 3.795932626406812e-06, + "loss": 0.95814586, + "num_input_tokens_seen": 61542720, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2839, + "time_per_iteration": 2.5399010181427 + }, + { + "auxiliary_loss_clip": 0.01154457, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.0277859, + "balance_loss_mlp": 1.05050242, + "epoch": 0.17075003757703291, + "flos": 25882939077120.0, + "grad_norm": 2.8052720148780894, + "language_loss": 0.83847374, + "learning_rate": 3.7957612042135336e-06, + "loss": 0.86050916, + "num_input_tokens_seen": 61563040, + "router_z_loss_clip": 0.21386719, + "router_z_loss_mlp": 1.0390625, + "step": 2840, + "time_per_iteration": 2.5449130535125732 + }, + { + "auxiliary_loss_clip": 0.01155521, + "auxiliary_loss_mlp": 0.01047778, + "balance_loss_clip": 1.02647519, + "balance_loss_mlp": 1.05213881, + "epoch": 0.17081016082970088, + "flos": 20120713332480.0, + "grad_norm": 2.014300966058614, + "language_loss": 0.76390004, + "learning_rate": 3.79558971392481e-06, + "loss": 0.78593302, + "num_input_tokens_seen": 61581890, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.03125, + "step": 2841, + "time_per_iteration": 2.4836723804473877 + }, + { + "auxiliary_loss_clip": 0.01152003, + "auxiliary_loss_mlp": 0.01052533, + "balance_loss_clip": 1.03243482, + "balance_loss_mlp": 1.04932261, + "epoch": 0.17087028408236885, + "flos": 24936477661440.0, + "grad_norm": 1.8874127741110907, + "language_loss": 0.77000463, + "learning_rate": 3.7954181555471443e-06, + "loss": 0.79205, + "num_input_tokens_seen": 61602095, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2842, + "time_per_iteration": 2.5051841735839844 + }, + { + "auxiliary_loss_clip": 0.01148386, + "auxiliary_loss_mlp": 0.01046299, + "balance_loss_clip": 1.02647448, + "balance_loss_mlp": 1.0497905, + "epoch": 0.17093040733503684, + "flos": 19057864872960.0, + "grad_norm": 2.05566421297988, + "language_loss": 0.86086738, + "learning_rate": 3.795246529087043e-06, + "loss": 0.88281423, + "num_input_tokens_seen": 61620400, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 0.98828125, + "step": 2843, + "time_per_iteration": 2.4487509727478027 + }, + { + "auxiliary_loss_clip": 0.01150009, + "auxiliary_loss_mlp": 0.01046155, + "balance_loss_clip": 1.02696228, + "balance_loss_mlp": 1.05090249, + "epoch": 0.1709905305877048, + "flos": 13078954333440.0, + "grad_norm": 1.8875494657309706, + "language_loss": 0.6826812, + "learning_rate": 3.7950748345510126e-06, + "loss": 0.70464289, + "num_input_tokens_seen": 61637680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 2844, + "time_per_iteration": 2.4429779052734375 + }, + { + "auxiliary_loss_clip": 0.01150851, + "auxiliary_loss_mlp": 0.01054229, + "balance_loss_clip": 1.03371274, + "balance_loss_mlp": 1.05040824, + "epoch": 0.17105065384037277, + "flos": 19209336526080.0, + "grad_norm": 1.8058232236820264, + "language_loss": 0.78258789, + "learning_rate": 3.7949030719455646e-06, + "loss": 0.80463862, + "num_input_tokens_seen": 61655630, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0, + "step": 2845, + "time_per_iteration": 2.4377951622009277 + }, + { + "auxiliary_loss_clip": 0.01151786, + "auxiliary_loss_mlp": 0.01045909, + "balance_loss_clip": 1.02687097, + "balance_loss_mlp": 1.05064154, + "epoch": 0.17111077709304073, + "flos": 18515183218560.0, + "grad_norm": 2.746386155528142, + "language_loss": 0.77959955, + "learning_rate": 3.7947312412772127e-06, + "loss": 0.8015765, + "num_input_tokens_seen": 61673475, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 2846, + "time_per_iteration": 2.4196622371673584 + }, + { + "auxiliary_loss_clip": 0.01152165, + "auxiliary_loss_mlp": 0.0104791, + "balance_loss_clip": 1.02895534, + "balance_loss_mlp": 1.05158973, + "epoch": 0.1711709003457087, + "flos": 25082670015360.0, + "grad_norm": 1.7441395807388675, + "language_loss": 0.7942031, + "learning_rate": 3.794559342552472e-06, + "loss": 0.81620383, + "num_input_tokens_seen": 61693370, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2847, + "time_per_iteration": 2.504087448120117 + }, + { + "auxiliary_loss_clip": 0.01148457, + "auxiliary_loss_mlp": 0.0104865, + "balance_loss_clip": 1.02913523, + "balance_loss_mlp": 1.04612017, + "epoch": 0.17123102359837666, + "flos": 17566387418880.0, + "grad_norm": 2.239997254259111, + "language_loss": 0.86818451, + "learning_rate": 3.7943873757778614e-06, + "loss": 0.89015555, + "num_input_tokens_seen": 61710820, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0234375, + "step": 2848, + "time_per_iteration": 2.438711643218994 + }, + { + "auxiliary_loss_clip": 0.0115323, + "auxiliary_loss_mlp": 0.01044307, + "balance_loss_clip": 1.02438748, + "balance_loss_mlp": 1.05133212, + "epoch": 0.17129114685104463, + "flos": 26173635845760.0, + "grad_norm": 1.715396677859901, + "language_loss": 0.75223613, + "learning_rate": 3.794215340959902e-06, + "loss": 0.77421153, + "num_input_tokens_seen": 61729855, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 2849, + "time_per_iteration": 2.4918415546417236 + }, + { + "auxiliary_loss_clip": 0.01047678, + "auxiliary_loss_mlp": 0.01003312, + "balance_loss_clip": 1.00107098, + "balance_loss_mlp": 1.01492834, + "epoch": 0.17135127010371262, + "flos": 69269710037760.0, + "grad_norm": 0.7949737728021388, + "language_loss": 0.57471085, + "learning_rate": 3.7940432381051163e-06, + "loss": 0.59522074, + "num_input_tokens_seen": 61790290, + "router_z_loss_clip": 0.02246094, + "router_z_loss_mlp": 0.328125, + "step": 2850, + "time_per_iteration": 3.057778835296631 + }, + { + "auxiliary_loss_clip": 0.01146039, + "auxiliary_loss_mlp": 0.0105304, + "balance_loss_clip": 1.03332317, + "balance_loss_mlp": 1.04852295, + "epoch": 0.1714113933563806, + "flos": 23550110380800.0, + "grad_norm": 2.4364727127987704, + "language_loss": 0.80988616, + "learning_rate": 3.793871067220031e-06, + "loss": 0.83187693, + "num_input_tokens_seen": 61809265, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 2851, + "time_per_iteration": 3.887600898742676 + }, + { + "auxiliary_loss_clip": 0.01146778, + "auxiliary_loss_mlp": 0.0104369, + "balance_loss_clip": 1.02549911, + "balance_loss_mlp": 1.04858351, + "epoch": 0.17147151660904855, + "flos": 21142443697920.0, + "grad_norm": 2.035620688428962, + "language_loss": 0.93063158, + "learning_rate": 3.7936988283111764e-06, + "loss": 0.95253623, + "num_input_tokens_seen": 61828980, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2852, + "time_per_iteration": 3.920153856277466 + }, + { + "auxiliary_loss_clip": 0.01149404, + "auxiliary_loss_mlp": 0.01053071, + "balance_loss_clip": 1.03374732, + "balance_loss_mlp": 1.04728949, + "epoch": 0.17153163986171652, + "flos": 18624890332800.0, + "grad_norm": 1.8406206656402175, + "language_loss": 0.69480836, + "learning_rate": 3.7935265213850817e-06, + "loss": 0.71683311, + "num_input_tokens_seen": 61847915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2853, + "time_per_iteration": 2.4457037448883057 + }, + { + "auxiliary_loss_clip": 0.0115316, + "auxiliary_loss_mlp": 0.01050964, + "balance_loss_clip": 1.03150904, + "balance_loss_mlp": 1.05059445, + "epoch": 0.17159176311438448, + "flos": 18223265387520.0, + "grad_norm": 2.187977199847503, + "language_loss": 0.66505128, + "learning_rate": 3.7933541464482815e-06, + "loss": 0.68709248, + "num_input_tokens_seen": 61865570, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0234375, + "step": 2854, + "time_per_iteration": 2.4421632289886475 + }, + { + "auxiliary_loss_clip": 0.01144359, + "auxiliary_loss_mlp": 0.01044047, + "balance_loss_clip": 1.02520037, + "balance_loss_mlp": 1.04574227, + "epoch": 0.17165188636705245, + "flos": 20738987159040.0, + "grad_norm": 1.8257227486643586, + "language_loss": 0.89394444, + "learning_rate": 3.7931817035073124e-06, + "loss": 0.91582847, + "num_input_tokens_seen": 61883340, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2855, + "time_per_iteration": 2.4601552486419678 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051381, + "balance_loss_clip": 1.03286791, + "balance_loss_mlp": 1.04792452, + "epoch": 0.17171200961972044, + "flos": 24899884680960.0, + "grad_norm": 2.515892939250119, + "language_loss": 0.83822739, + "learning_rate": 3.7930091925687134e-06, + "loss": 0.86022681, + "num_input_tokens_seen": 61900610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 2856, + "time_per_iteration": 2.4747347831726074 + }, + { + "auxiliary_loss_clip": 0.01151618, + "auxiliary_loss_mlp": 0.01048758, + "balance_loss_clip": 1.02906466, + "balance_loss_mlp": 1.05112195, + "epoch": 0.1717721328723884, + "flos": 20157234485760.0, + "grad_norm": 1.9053156238546485, + "language_loss": 0.8645792, + "learning_rate": 3.792836613639026e-06, + "loss": 0.88658297, + "num_input_tokens_seen": 61916795, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2857, + "time_per_iteration": 2.4460220336914062 + }, + { + "auxiliary_loss_clip": 0.01148045, + "auxiliary_loss_mlp": 0.0105234, + "balance_loss_clip": 1.03327847, + "balance_loss_mlp": 1.04805577, + "epoch": 0.17183225612505637, + "flos": 23361650697600.0, + "grad_norm": 2.139076633770832, + "language_loss": 0.77919662, + "learning_rate": 3.7926639667247947e-06, + "loss": 0.80120051, + "num_input_tokens_seen": 61936665, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0, + "step": 2858, + "time_per_iteration": 2.4459195137023926 + }, + { + "auxiliary_loss_clip": 0.01156262, + "auxiliary_loss_mlp": 0.01058687, + "balance_loss_clip": 1.03761101, + "balance_loss_mlp": 1.04760742, + "epoch": 0.17189237937772434, + "flos": 18114240631680.0, + "grad_norm": 2.423579883765011, + "language_loss": 0.77235049, + "learning_rate": 3.7924912518325663e-06, + "loss": 0.79449999, + "num_input_tokens_seen": 61954415, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0859375, + "step": 2859, + "time_per_iteration": 2.43471360206604 + }, + { + "auxiliary_loss_clip": 0.01148379, + "auxiliary_loss_mlp": 0.01050312, + "balance_loss_clip": 1.03069019, + "balance_loss_mlp": 1.04920983, + "epoch": 0.1719525026303923, + "flos": 23258408031360.0, + "grad_norm": 3.774880148287903, + "language_loss": 0.77179611, + "learning_rate": 3.7923184689688902e-06, + "loss": 0.79378301, + "num_input_tokens_seen": 61973940, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2860, + "time_per_iteration": 2.463344097137451 + }, + { + "auxiliary_loss_clip": 0.01149457, + "auxiliary_loss_mlp": 0.01051045, + "balance_loss_clip": 1.03217435, + "balance_loss_mlp": 1.04703689, + "epoch": 0.17201262588306027, + "flos": 20810413353600.0, + "grad_norm": 2.1505291491255463, + "language_loss": 0.81964719, + "learning_rate": 3.792145618140317e-06, + "loss": 0.84165227, + "num_input_tokens_seen": 61991845, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0234375, + "step": 2861, + "time_per_iteration": 2.4505395889282227 + }, + { + "auxiliary_loss_clip": 0.01149339, + "auxiliary_loss_mlp": 0.01050609, + "balance_loss_clip": 1.03163123, + "balance_loss_mlp": 1.04897118, + "epoch": 0.17207274913572823, + "flos": 20375858615040.0, + "grad_norm": 4.22955926449596, + "language_loss": 0.85649675, + "learning_rate": 3.7919726993534038e-06, + "loss": 0.87849623, + "num_input_tokens_seen": 62009395, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2862, + "time_per_iteration": 2.4392077922821045 + }, + { + "auxiliary_loss_clip": 0.01144423, + "auxiliary_loss_mlp": 0.01046105, + "balance_loss_clip": 1.02867651, + "balance_loss_mlp": 1.04785109, + "epoch": 0.17213287238839622, + "flos": 26797727675520.0, + "grad_norm": 2.3146804122881037, + "language_loss": 0.77874523, + "learning_rate": 3.7917997126147054e-06, + "loss": 0.80065054, + "num_input_tokens_seen": 62029005, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 2863, + "time_per_iteration": 2.4745166301727295 + }, + { + "auxiliary_loss_clip": 0.01147347, + "auxiliary_loss_mlp": 0.01048138, + "balance_loss_clip": 1.02935052, + "balance_loss_mlp": 1.04726493, + "epoch": 0.1721929956410642, + "flos": 26030819370240.0, + "grad_norm": 1.7012031973405044, + "language_loss": 0.72191179, + "learning_rate": 3.7916266579307823e-06, + "loss": 0.74386668, + "num_input_tokens_seen": 62048730, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2864, + "time_per_iteration": 2.496522903442383 + }, + { + "auxiliary_loss_clip": 0.01151447, + "auxiliary_loss_mlp": 0.01053526, + "balance_loss_clip": 1.03497648, + "balance_loss_mlp": 1.04935968, + "epoch": 0.17225311889373215, + "flos": 22273091078400.0, + "grad_norm": 1.6688219876641972, + "language_loss": 0.72896975, + "learning_rate": 3.7914535353081973e-06, + "loss": 0.75101948, + "num_input_tokens_seen": 62069000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.015625, + "step": 2865, + "time_per_iteration": 2.468726396560669 + }, + { + "auxiliary_loss_clip": 0.01151587, + "auxiliary_loss_mlp": 0.01044873, + "balance_loss_clip": 1.02608538, + "balance_loss_mlp": 1.05194211, + "epoch": 0.17231324214640012, + "flos": 21287774125440.0, + "grad_norm": 3.1747822479918764, + "language_loss": 0.79011786, + "learning_rate": 3.7912803447535145e-06, + "loss": 0.81208247, + "num_input_tokens_seen": 62086750, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.99609375, + "step": 2866, + "time_per_iteration": 2.445716381072998 + }, + { + "auxiliary_loss_clip": 0.01149563, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02651668, + "balance_loss_mlp": 1.04966402, + "epoch": 0.17237336539906808, + "flos": 19680735640320.0, + "grad_norm": 1.797659045411876, + "language_loss": 0.79865277, + "learning_rate": 3.7911070862733016e-06, + "loss": 0.82061744, + "num_input_tokens_seen": 62106240, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2867, + "time_per_iteration": 2.4745590686798096 + }, + { + "auxiliary_loss_clip": 0.0114836, + "auxiliary_loss_mlp": 0.01037447, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.04821014, + "epoch": 0.17243348865173605, + "flos": 17529650784000.0, + "grad_norm": 1.717941409951427, + "language_loss": 0.79707634, + "learning_rate": 3.7909337598741276e-06, + "loss": 0.81893444, + "num_input_tokens_seen": 62124895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2868, + "time_per_iteration": 2.4545693397521973 + }, + { + "auxiliary_loss_clip": 0.01157442, + "auxiliary_loss_mlp": 0.0104461, + "balance_loss_clip": 1.02645397, + "balance_loss_mlp": 1.0538218, + "epoch": 0.17249361190440402, + "flos": 18259858368000.0, + "grad_norm": 1.9332967921770021, + "language_loss": 0.84265673, + "learning_rate": 3.7907603655625674e-06, + "loss": 0.86467719, + "num_input_tokens_seen": 62143510, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.03125, + "step": 2869, + "time_per_iteration": 2.445429563522339 + }, + { + "auxiliary_loss_clip": 0.01151297, + "auxiliary_loss_mlp": 0.01052302, + "balance_loss_clip": 1.03226328, + "balance_loss_mlp": 1.04971075, + "epoch": 0.172553735157072, + "flos": 21174367910400.0, + "grad_norm": 2.3539211413688954, + "language_loss": 0.77522051, + "learning_rate": 3.7905869033451932e-06, + "loss": 0.79725653, + "num_input_tokens_seen": 62162285, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2870, + "time_per_iteration": 2.4975087642669678 + }, + { + "auxiliary_loss_clip": 0.01146931, + "auxiliary_loss_mlp": 0.01043287, + "balance_loss_clip": 1.02609706, + "balance_loss_mlp": 1.05132568, + "epoch": 0.17261385840973997, + "flos": 22273270646400.0, + "grad_norm": 1.897031493968697, + "language_loss": 0.7680704, + "learning_rate": 3.7904133732285857e-06, + "loss": 0.78997254, + "num_input_tokens_seen": 62180970, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.95703125, + "step": 2871, + "time_per_iteration": 2.4777348041534424 + }, + { + "auxiliary_loss_clip": 0.01150344, + "auxiliary_loss_mlp": 0.01043916, + "balance_loss_clip": 1.02442563, + "balance_loss_mlp": 1.05061746, + "epoch": 0.17267398166240794, + "flos": 27922233830400.0, + "grad_norm": 2.240934958328371, + "language_loss": 0.74448204, + "learning_rate": 3.7902397752193228e-06, + "loss": 0.76642466, + "num_input_tokens_seen": 62198965, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2872, + "time_per_iteration": 2.5021097660064697 + }, + { + "auxiliary_loss_clip": 0.01147343, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02117848, + "balance_loss_mlp": 1.05127549, + "epoch": 0.1727341049150759, + "flos": 21945118970880.0, + "grad_norm": 1.8155923086100165, + "language_loss": 0.82694656, + "learning_rate": 3.790066109323988e-06, + "loss": 0.84881938, + "num_input_tokens_seen": 62219890, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 2873, + "time_per_iteration": 2.4852540493011475 + }, + { + "auxiliary_loss_clip": 0.01147589, + "auxiliary_loss_mlp": 0.01043231, + "balance_loss_clip": 1.0229888, + "balance_loss_mlp": 1.049196, + "epoch": 0.17279422816774387, + "flos": 18107883924480.0, + "grad_norm": 2.0464410919173814, + "language_loss": 0.75083232, + "learning_rate": 3.7898923755491678e-06, + "loss": 0.77274048, + "num_input_tokens_seen": 62237140, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.984375, + "step": 2874, + "time_per_iteration": 2.440610885620117 + }, + { + "auxiliary_loss_clip": 0.01151305, + "auxiliary_loss_mlp": 0.01044062, + "balance_loss_clip": 1.0238322, + "balance_loss_mlp": 1.0515728, + "epoch": 0.17285435142041183, + "flos": 21835447770240.0, + "grad_norm": 1.9230852666364326, + "language_loss": 0.8067199, + "learning_rate": 3.7897185739014487e-06, + "loss": 0.8286736, + "num_input_tokens_seen": 62255405, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.99609375, + "step": 2875, + "time_per_iteration": 2.478473424911499 + }, + { + "auxiliary_loss_clip": 0.01153488, + "auxiliary_loss_mlp": 0.01049908, + "balance_loss_clip": 1.02984488, + "balance_loss_mlp": 1.05083489, + "epoch": 0.17291447467307983, + "flos": 18368452160640.0, + "grad_norm": 2.5699127680633542, + "language_loss": 0.87525117, + "learning_rate": 3.7895447043874217e-06, + "loss": 0.89728516, + "num_input_tokens_seen": 62271280, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2876, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01150473, + "auxiliary_loss_mlp": 0.01042457, + "balance_loss_clip": 1.02384901, + "balance_loss_mlp": 1.05273616, + "epoch": 0.1729745979257478, + "flos": 18624638937600.0, + "grad_norm": 1.9567138745888089, + "language_loss": 0.84561193, + "learning_rate": 3.789370767013681e-06, + "loss": 0.86754125, + "num_input_tokens_seen": 62289140, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 2877, + "time_per_iteration": 2.4696123600006104 + }, + { + "auxiliary_loss_clip": 0.01151589, + "auxiliary_loss_mlp": 0.01041028, + "balance_loss_clip": 1.02179909, + "balance_loss_mlp": 1.05281305, + "epoch": 0.17303472117841576, + "flos": 22998234844800.0, + "grad_norm": 3.0724129461132406, + "language_loss": 0.79527134, + "learning_rate": 3.7891967617868204e-06, + "loss": 0.81719756, + "num_input_tokens_seen": 62307490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.984375, + "step": 2878, + "time_per_iteration": 2.4739902019500732 + }, + { + "auxiliary_loss_clip": 0.01147004, + "auxiliary_loss_mlp": 0.01042956, + "balance_loss_clip": 1.02450228, + "balance_loss_mlp": 1.04968572, + "epoch": 0.17309484443108372, + "flos": 25664386775040.0, + "grad_norm": 1.9694378769308076, + "language_loss": 0.70306808, + "learning_rate": 3.78902268871344e-06, + "loss": 0.72496772, + "num_input_tokens_seen": 62328570, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.97265625, + "step": 2879, + "time_per_iteration": 2.5014665126800537 + }, + { + "auxiliary_loss_clip": 0.01151101, + "auxiliary_loss_mlp": 0.01050497, + "balance_loss_clip": 1.03156662, + "balance_loss_mlp": 1.05038834, + "epoch": 0.1731549676837517, + "flos": 13552903313280.0, + "grad_norm": 2.4431111997211734, + "language_loss": 0.83465785, + "learning_rate": 3.78884854780014e-06, + "loss": 0.85667384, + "num_input_tokens_seen": 62345735, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 2880, + "time_per_iteration": 2.433776378631592 + }, + { + "auxiliary_loss_clip": 0.01153087, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.0250026, + "balance_loss_mlp": 1.05171311, + "epoch": 0.17321509093641965, + "flos": 22857070394880.0, + "grad_norm": 2.135155165507549, + "language_loss": 0.80866969, + "learning_rate": 3.7886743390535236e-06, + "loss": 0.8306427, + "num_input_tokens_seen": 62365525, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0078125, + "step": 2881, + "time_per_iteration": 2.4944772720336914 + }, + { + "auxiliary_loss_clip": 0.01148623, + "auxiliary_loss_mlp": 0.01043966, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.05030859, + "epoch": 0.17327521418908762, + "flos": 24352785653760.0, + "grad_norm": 2.5502275528368066, + "language_loss": 0.77372867, + "learning_rate": 3.788500062480197e-06, + "loss": 0.79565454, + "num_input_tokens_seen": 62385160, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 2882, + "time_per_iteration": 2.5426836013793945 + }, + { + "auxiliary_loss_clip": 0.011482, + "auxiliary_loss_mlp": 0.01051627, + "balance_loss_clip": 1.03276825, + "balance_loss_mlp": 1.05005169, + "epoch": 0.1733353374417556, + "flos": 33105651816960.0, + "grad_norm": 1.8718611847068298, + "language_loss": 0.76652586, + "learning_rate": 3.788325718086769e-06, + "loss": 0.78852415, + "num_input_tokens_seen": 62405280, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.984375, + "step": 2883, + "time_per_iteration": 2.5733277797698975 + }, + { + "auxiliary_loss_clip": 0.01146516, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.0265696, + "balance_loss_mlp": 1.04944682, + "epoch": 0.17339546069442358, + "flos": 24388947671040.0, + "grad_norm": 1.945193845574475, + "language_loss": 0.85463524, + "learning_rate": 3.7881513058798503e-06, + "loss": 0.87654424, + "num_input_tokens_seen": 62423665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 2884, + "time_per_iteration": 2.4708735942840576 + }, + { + "auxiliary_loss_clip": 0.01149646, + "auxiliary_loss_mlp": 0.01039486, + "balance_loss_clip": 1.02122355, + "balance_loss_mlp": 1.05114794, + "epoch": 0.17345558394709154, + "flos": 27454174680960.0, + "grad_norm": 1.6148586475999513, + "language_loss": 0.73758793, + "learning_rate": 3.787976825866055e-06, + "loss": 0.75947917, + "num_input_tokens_seen": 62445170, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2885, + "time_per_iteration": 2.5266878604888916 + }, + { + "auxiliary_loss_clip": 0.01147273, + "auxiliary_loss_mlp": 0.01044711, + "balance_loss_clip": 1.02775908, + "balance_loss_mlp": 1.05269074, + "epoch": 0.1735157071997595, + "flos": 24682158391680.0, + "grad_norm": 1.9690054244815705, + "language_loss": 0.70377076, + "learning_rate": 3.7878022780519998e-06, + "loss": 0.72569054, + "num_input_tokens_seen": 62466135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 2886, + "time_per_iteration": 2.508695363998413 + }, + { + "auxiliary_loss_clip": 0.01146959, + "auxiliary_loss_mlp": 0.01040101, + "balance_loss_clip": 1.0212425, + "balance_loss_mlp": 1.04799545, + "epoch": 0.17357583045242747, + "flos": 21688932193920.0, + "grad_norm": 1.9665325510573808, + "language_loss": 0.69294798, + "learning_rate": 3.7876276624443024e-06, + "loss": 0.7148186, + "num_input_tokens_seen": 62483910, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98828125, + "step": 2887, + "time_per_iteration": 2.4787776470184326 + }, + { + "auxiliary_loss_clip": 0.01149915, + "auxiliary_loss_mlp": 0.01049822, + "balance_loss_clip": 1.03180945, + "balance_loss_mlp": 1.05075955, + "epoch": 0.17363595370509544, + "flos": 15375728753280.0, + "grad_norm": 1.791000255721863, + "language_loss": 0.85391176, + "learning_rate": 3.787452979049585e-06, + "loss": 0.87590909, + "num_input_tokens_seen": 62501530, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 2888, + "time_per_iteration": 2.4234085083007812 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01046928, + "balance_loss_clip": 1.02668667, + "balance_loss_mlp": 1.05046952, + "epoch": 0.1736960769577634, + "flos": 23440941970560.0, + "grad_norm": 3.660213605651755, + "language_loss": 0.78465497, + "learning_rate": 3.7872782278744718e-06, + "loss": 0.80662042, + "num_input_tokens_seen": 62521295, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 0.9921875, + "step": 2889, + "time_per_iteration": 2.5042123794555664 + }, + { + "auxiliary_loss_clip": 0.01146581, + "auxiliary_loss_mlp": 0.01047944, + "balance_loss_clip": 1.02913308, + "balance_loss_mlp": 1.05222893, + "epoch": 0.1737562002104314, + "flos": 18587830475520.0, + "grad_norm": 2.9081348702485723, + "language_loss": 0.83860242, + "learning_rate": 3.7871034089255883e-06, + "loss": 0.86054766, + "num_input_tokens_seen": 62539615, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9453125, + "step": 2890, + "time_per_iteration": 2.4698500633239746 + }, + { + "auxiliary_loss_clip": 0.01150813, + "auxiliary_loss_mlp": 0.01047378, + "balance_loss_clip": 1.02880502, + "balance_loss_mlp": 1.05083108, + "epoch": 0.17381632346309936, + "flos": 15998060816640.0, + "grad_norm": 1.9935479009749588, + "language_loss": 0.82253492, + "learning_rate": 3.7869285222095653e-06, + "loss": 0.84451687, + "num_input_tokens_seen": 62556820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2891, + "time_per_iteration": 2.4478886127471924 + }, + { + "auxiliary_loss_clip": 0.01150226, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04824781, + "epoch": 0.17387644671576732, + "flos": 13369830670080.0, + "grad_norm": 2.3073165362682873, + "language_loss": 0.81479478, + "learning_rate": 3.7867535677330334e-06, + "loss": 0.8367548, + "num_input_tokens_seen": 62572450, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2892, + "time_per_iteration": 2.4094645977020264 + }, + { + "auxiliary_loss_clip": 0.01154909, + "auxiliary_loss_mlp": 0.01055666, + "balance_loss_clip": 1.03519785, + "balance_loss_mlp": 1.05379355, + "epoch": 0.1739365699684353, + "flos": 26615516958720.0, + "grad_norm": 2.24459564009462, + "language_loss": 0.74480057, + "learning_rate": 3.786578545502627e-06, + "loss": 0.76690638, + "num_input_tokens_seen": 62592580, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2893, + "time_per_iteration": 3.8296191692352295 + }, + { + "auxiliary_loss_clip": 0.01152082, + "auxiliary_loss_mlp": 0.010434, + "balance_loss_clip": 1.02375412, + "balance_loss_mlp": 1.05193436, + "epoch": 0.17399669322110325, + "flos": 23367971491200.0, + "grad_norm": 2.117368029368179, + "language_loss": 0.83073241, + "learning_rate": 3.7864034555249828e-06, + "loss": 0.85268712, + "num_input_tokens_seen": 62611220, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2894, + "time_per_iteration": 3.9817075729370117 + }, + { + "auxiliary_loss_clip": 0.01150382, + "auxiliary_loss_mlp": 0.01047312, + "balance_loss_clip": 1.02523482, + "balance_loss_mlp": 1.05032384, + "epoch": 0.17405681647377122, + "flos": 22054107813120.0, + "grad_norm": 2.157907065313142, + "language_loss": 0.74051547, + "learning_rate": 3.786228297806741e-06, + "loss": 0.76249242, + "num_input_tokens_seen": 62629185, + "router_z_loss_clip": 0.22070312, + "router_z_loss_mlp": 1.0, + "step": 2895, + "time_per_iteration": 2.461857318878174 + }, + { + "auxiliary_loss_clip": 0.01048544, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00467134, + "balance_loss_mlp": 1.01600659, + "epoch": 0.1741169397264392, + "flos": 61457559114240.0, + "grad_norm": 0.8715266336267762, + "language_loss": 0.6273998, + "learning_rate": 3.7860530723545435e-06, + "loss": 0.64795506, + "num_input_tokens_seen": 62691895, + "router_z_loss_clip": 0.02307129, + "router_z_loss_mlp": 0.32421875, + "step": 2896, + "time_per_iteration": 3.1462173461914062 + }, + { + "auxiliary_loss_clip": 0.01148575, + "auxiliary_loss_mlp": 0.01041696, + "balance_loss_clip": 1.02160895, + "balance_loss_mlp": 1.04787612, + "epoch": 0.17417706297910718, + "flos": 27017680608000.0, + "grad_norm": 2.3238967096174923, + "language_loss": 0.75600475, + "learning_rate": 3.785877779175034e-06, + "loss": 0.77790749, + "num_input_tokens_seen": 62713790, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2897, + "time_per_iteration": 2.4974682331085205 + }, + { + "auxiliary_loss_clip": 0.01147676, + "auxiliary_loss_mlp": 0.01042715, + "balance_loss_clip": 1.02354646, + "balance_loss_mlp": 1.05000067, + "epoch": 0.17423718623177514, + "flos": 33508856960640.0, + "grad_norm": 1.9004029304223122, + "language_loss": 0.69384712, + "learning_rate": 3.7857024182748606e-06, + "loss": 0.71575105, + "num_input_tokens_seen": 62736285, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2898, + "time_per_iteration": 2.5650558471679688 + }, + { + "auxiliary_loss_clip": 0.0115334, + "auxiliary_loss_mlp": 0.01049615, + "balance_loss_clip": 1.03026772, + "balance_loss_mlp": 1.05215359, + "epoch": 0.1742973094844431, + "flos": 27198634348800.0, + "grad_norm": 2.315885710988465, + "language_loss": 0.76069367, + "learning_rate": 3.7855269896606717e-06, + "loss": 0.78272319, + "num_input_tokens_seen": 62756240, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.015625, + "step": 2899, + "time_per_iteration": 2.5006191730499268 + }, + { + "auxiliary_loss_clip": 0.01145178, + "auxiliary_loss_mlp": 0.01045245, + "balance_loss_clip": 1.02571905, + "balance_loss_mlp": 1.04929495, + "epoch": 0.17435743273711107, + "flos": 22710734386560.0, + "grad_norm": 1.9440585306650153, + "language_loss": 0.72821134, + "learning_rate": 3.785351493339121e-06, + "loss": 0.75011557, + "num_input_tokens_seen": 62775910, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9609375, + "step": 2900, + "time_per_iteration": 2.5199801921844482 + }, + { + "auxiliary_loss_clip": 0.01150074, + "auxiliary_loss_mlp": 0.01051215, + "balance_loss_clip": 1.03261876, + "balance_loss_mlp": 1.04989529, + "epoch": 0.17441755598977904, + "flos": 41646466039680.0, + "grad_norm": 1.6677330343015109, + "language_loss": 0.70085949, + "learning_rate": 3.785175929316863e-06, + "loss": 0.72287238, + "num_input_tokens_seen": 62799385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2901, + "time_per_iteration": 2.624864101409912 + }, + { + "auxiliary_loss_clip": 0.01152064, + "auxiliary_loss_mlp": 0.01048884, + "balance_loss_clip": 1.03022778, + "balance_loss_mlp": 1.05087507, + "epoch": 0.174477679242447, + "flos": 26287077974400.0, + "grad_norm": 1.7643324639769489, + "language_loss": 0.76549768, + "learning_rate": 3.7850002976005543e-06, + "loss": 0.78750718, + "num_input_tokens_seen": 62819380, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 2902, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01150394, + "auxiliary_loss_mlp": 0.01056591, + "balance_loss_clip": 1.03741002, + "balance_loss_mlp": 1.04885221, + "epoch": 0.174537802495115, + "flos": 17858412990720.0, + "grad_norm": 2.129298660499851, + "language_loss": 0.81787169, + "learning_rate": 3.7848245981968558e-06, + "loss": 0.8399415, + "num_input_tokens_seen": 62836205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2903, + "time_per_iteration": 2.436877727508545 + }, + { + "auxiliary_loss_clip": 0.01148467, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02255297, + "balance_loss_mlp": 1.04978609, + "epoch": 0.17459792574778296, + "flos": 16940715390720.0, + "grad_norm": 2.1703016783079327, + "language_loss": 0.73228866, + "learning_rate": 3.784648831112429e-06, + "loss": 0.75418955, + "num_input_tokens_seen": 62854045, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 2904, + "time_per_iteration": 2.462775707244873 + }, + { + "auxiliary_loss_clip": 0.0114756, + "auxiliary_loss_mlp": 0.01045319, + "balance_loss_clip": 1.02719879, + "balance_loss_mlp": 1.04777265, + "epoch": 0.17465804900045093, + "flos": 25520026014720.0, + "grad_norm": 1.9374721445221084, + "language_loss": 0.64526325, + "learning_rate": 3.7844729963539406e-06, + "loss": 0.6671921, + "num_input_tokens_seen": 62873075, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.0, + "step": 2905, + "time_per_iteration": 2.468395233154297 + }, + { + "auxiliary_loss_clip": 0.01157304, + "auxiliary_loss_mlp": 0.01050089, + "balance_loss_clip": 1.0292747, + "balance_loss_mlp": 1.05202341, + "epoch": 0.1747181722531189, + "flos": 24129708238080.0, + "grad_norm": 1.804147248272645, + "language_loss": 0.79236615, + "learning_rate": 3.7842970939280566e-06, + "loss": 0.81444013, + "num_input_tokens_seen": 62892675, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0546875, + "step": 2906, + "time_per_iteration": 2.4632725715637207 + }, + { + "auxiliary_loss_clip": 0.01150693, + "auxiliary_loss_mlp": 0.01055346, + "balance_loss_clip": 1.03577161, + "balance_loss_mlp": 1.05044913, + "epoch": 0.17477829550578686, + "flos": 17748813617280.0, + "grad_norm": 1.7929508882228948, + "language_loss": 0.81010377, + "learning_rate": 3.784121123841449e-06, + "loss": 0.83216417, + "num_input_tokens_seen": 62910675, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2907, + "time_per_iteration": 2.4214229583740234 + }, + { + "auxiliary_loss_clip": 0.01152007, + "auxiliary_loss_mlp": 0.01050463, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.05040026, + "epoch": 0.17483841875845482, + "flos": 15377344865280.0, + "grad_norm": 2.7402312811515515, + "language_loss": 0.81315112, + "learning_rate": 3.7839450861007886e-06, + "loss": 0.83517587, + "num_input_tokens_seen": 62928130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2908, + "time_per_iteration": 2.4340970516204834 + }, + { + "auxiliary_loss_clip": 0.01150458, + "auxiliary_loss_mlp": 0.01051278, + "balance_loss_clip": 1.03047633, + "balance_loss_mlp": 1.04978228, + "epoch": 0.17489854201112282, + "flos": 17163254102400.0, + "grad_norm": 2.419675279893618, + "language_loss": 0.80399191, + "learning_rate": 3.7837689807127518e-06, + "loss": 0.82600915, + "num_input_tokens_seen": 62944290, + "router_z_loss_clip": 0.20800781, + "router_z_loss_mlp": 1.0078125, + "step": 2909, + "time_per_iteration": 2.4170033931732178 + }, + { + "auxiliary_loss_clip": 0.0115308, + "auxiliary_loss_mlp": 0.01053412, + "balance_loss_clip": 1.03319383, + "balance_loss_mlp": 1.05133021, + "epoch": 0.17495866526379078, + "flos": 19755286318080.0, + "grad_norm": 1.6998329053727648, + "language_loss": 0.76530939, + "learning_rate": 3.783592807684017e-06, + "loss": 0.78737426, + "num_input_tokens_seen": 62963505, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.015625, + "step": 2910, + "time_per_iteration": 2.457628011703491 + }, + { + "auxiliary_loss_clip": 0.01151475, + "auxiliary_loss_mlp": 0.01049527, + "balance_loss_clip": 1.02901077, + "balance_loss_mlp": 1.05060935, + "epoch": 0.17501878851645875, + "flos": 28511133310080.0, + "grad_norm": 1.6502133484544155, + "language_loss": 0.87255991, + "learning_rate": 3.7834165670212645e-06, + "loss": 0.89456993, + "num_input_tokens_seen": 62985020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.0078125, + "step": 2911, + "time_per_iteration": 2.5302672386169434 + }, + { + "auxiliary_loss_clip": 0.01148398, + "auxiliary_loss_mlp": 0.0105451, + "balance_loss_clip": 1.03349352, + "balance_loss_mlp": 1.04746377, + "epoch": 0.1750789117691267, + "flos": 17931203902080.0, + "grad_norm": 2.260601647926804, + "language_loss": 0.89586449, + "learning_rate": 3.7832402587311764e-06, + "loss": 0.91789353, + "num_input_tokens_seen": 63001745, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0078125, + "step": 2912, + "time_per_iteration": 2.447650194168091 + }, + { + "auxiliary_loss_clip": 0.01150608, + "auxiliary_loss_mlp": 0.01050267, + "balance_loss_clip": 1.0302161, + "balance_loss_mlp": 1.04871392, + "epoch": 0.17513903502179468, + "flos": 18259427404800.0, + "grad_norm": 2.8836544870459813, + "language_loss": 0.7262938, + "learning_rate": 3.783063882820439e-06, + "loss": 0.74830252, + "num_input_tokens_seen": 63019750, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 1.015625, + "step": 2913, + "time_per_iteration": 2.423595666885376 + }, + { + "auxiliary_loss_clip": 0.01150722, + "auxiliary_loss_mlp": 0.01047113, + "balance_loss_clip": 1.02738369, + "balance_loss_mlp": 1.0522244, + "epoch": 0.17519915827446264, + "flos": 20704728562560.0, + "grad_norm": 2.243393227782369, + "language_loss": 0.68799925, + "learning_rate": 3.782887439295741e-06, + "loss": 0.70997757, + "num_input_tokens_seen": 63039500, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 2914, + "time_per_iteration": 2.46085262298584 + }, + { + "auxiliary_loss_clip": 0.01149727, + "auxiliary_loss_mlp": 0.01056566, + "balance_loss_clip": 1.03616977, + "balance_loss_mlp": 1.05143356, + "epoch": 0.1752592815271306, + "flos": 20523415685760.0, + "grad_norm": 1.8218690011087264, + "language_loss": 0.93755293, + "learning_rate": 3.782710928163772e-06, + "loss": 0.95961595, + "num_input_tokens_seen": 63059785, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 0.98046875, + "step": 2915, + "time_per_iteration": 2.457148551940918 + }, + { + "auxiliary_loss_clip": 0.01143068, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.02744889, + "balance_loss_mlp": 1.04722261, + "epoch": 0.1753194047797986, + "flos": 21799178012160.0, + "grad_norm": 1.8144768789670476, + "language_loss": 0.80869162, + "learning_rate": 3.782534349431226e-06, + "loss": 0.83059323, + "num_input_tokens_seen": 63079385, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.95703125, + "step": 2916, + "time_per_iteration": 2.4740476608276367 + }, + { + "auxiliary_loss_clip": 0.01150429, + "auxiliary_loss_mlp": 0.01056449, + "balance_loss_clip": 1.03663611, + "balance_loss_mlp": 1.04854608, + "epoch": 0.17537952803246656, + "flos": 20668351063680.0, + "grad_norm": 1.67512565222408, + "language_loss": 0.73645711, + "learning_rate": 3.782357703104799e-06, + "loss": 0.75852591, + "num_input_tokens_seen": 63098970, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2917, + "time_per_iteration": 2.4484915733337402 + }, + { + "auxiliary_loss_clip": 0.01144993, + "auxiliary_loss_mlp": 0.01055794, + "balance_loss_clip": 1.03517044, + "balance_loss_mlp": 1.04897738, + "epoch": 0.17543965128513453, + "flos": 23295072839040.0, + "grad_norm": 12.675743752905372, + "language_loss": 0.77019119, + "learning_rate": 3.7821809891911897e-06, + "loss": 0.79219908, + "num_input_tokens_seen": 63118750, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 0.9609375, + "step": 2918, + "time_per_iteration": 2.4723429679870605 + }, + { + "auxiliary_loss_clip": 0.01154194, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.0260129, + "balance_loss_mlp": 1.05131745, + "epoch": 0.1754997745378025, + "flos": 29095615416960.0, + "grad_norm": 3.415786226656528, + "language_loss": 0.74196291, + "learning_rate": 3.782004207697098e-06, + "loss": 0.76396644, + "num_input_tokens_seen": 63136865, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.03125, + "step": 2919, + "time_per_iteration": 2.5049829483032227 + }, + { + "auxiliary_loss_clip": 0.01154792, + "auxiliary_loss_mlp": 0.01049917, + "balance_loss_clip": 1.03080809, + "balance_loss_mlp": 1.05090559, + "epoch": 0.17555989779047046, + "flos": 30371844620160.0, + "grad_norm": 1.7754050788280298, + "language_loss": 0.74211872, + "learning_rate": 3.781827358629228e-06, + "loss": 0.76416576, + "num_input_tokens_seen": 63158325, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0390625, + "step": 2920, + "time_per_iteration": 2.565361738204956 + }, + { + "auxiliary_loss_clip": 0.01144387, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.0219686, + "balance_loss_mlp": 1.04717219, + "epoch": 0.17562002104313842, + "flos": 23287746464640.0, + "grad_norm": 2.3164139995284834, + "language_loss": 0.7949307, + "learning_rate": 3.7816504419942873e-06, + "loss": 0.81677347, + "num_input_tokens_seen": 63173115, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.97265625, + "step": 2921, + "time_per_iteration": 2.4471213817596436 + }, + { + "auxiliary_loss_clip": 0.01153986, + "auxiliary_loss_mlp": 0.0104563, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.05029321, + "epoch": 0.1756801442958064, + "flos": 24790500789120.0, + "grad_norm": 1.6170497741380607, + "language_loss": 0.87493849, + "learning_rate": 3.7814734577989823e-06, + "loss": 0.89693457, + "num_input_tokens_seen": 63192880, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.0390625, + "step": 2922, + "time_per_iteration": 2.5042173862457275 + }, + { + "auxiliary_loss_clip": 0.01149338, + "auxiliary_loss_mlp": 0.01050477, + "balance_loss_clip": 1.03074801, + "balance_loss_mlp": 1.04808784, + "epoch": 0.17574026754847438, + "flos": 25771651764480.0, + "grad_norm": 2.3811708545321735, + "language_loss": 0.62097687, + "learning_rate": 3.7812964060500253e-06, + "loss": 0.64297503, + "num_input_tokens_seen": 63214395, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.015625, + "step": 2923, + "time_per_iteration": 2.5067484378814697 + }, + { + "auxiliary_loss_clip": 0.01154551, + "auxiliary_loss_mlp": 0.01048297, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.05287814, + "epoch": 0.17580039080114235, + "flos": 17456608477440.0, + "grad_norm": 2.1344206016331797, + "language_loss": 0.80602306, + "learning_rate": 3.78111928675413e-06, + "loss": 0.82805157, + "num_input_tokens_seen": 63231020, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 1.015625, + "step": 2924, + "time_per_iteration": 2.453174114227295 + }, + { + "auxiliary_loss_clip": 0.0115147, + "auxiliary_loss_mlp": 0.01053673, + "balance_loss_clip": 1.03214407, + "balance_loss_mlp": 1.04809761, + "epoch": 0.1758605140538103, + "flos": 14864648088960.0, + "grad_norm": 3.672968077353321, + "language_loss": 0.70954067, + "learning_rate": 3.7809420999180126e-06, + "loss": 0.73159206, + "num_input_tokens_seen": 63246245, + "router_z_loss_clip": 0.21582031, + "router_z_loss_mlp": 1.03125, + "step": 2925, + "time_per_iteration": 2.4666385650634766 + }, + { + "auxiliary_loss_clip": 0.01148763, + "auxiliary_loss_mlp": 0.0104438, + "balance_loss_clip": 1.02538979, + "balance_loss_mlp": 1.05147243, + "epoch": 0.17592063730647828, + "flos": 23004268329600.0, + "grad_norm": 1.6622274839000213, + "language_loss": 0.71700275, + "learning_rate": 3.7807648455483934e-06, + "loss": 0.73893416, + "num_input_tokens_seen": 63267790, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.97265625, + "step": 2926, + "time_per_iteration": 2.50289249420166 + }, + { + "auxiliary_loss_clip": 0.01150931, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04857433, + "epoch": 0.17598076055914624, + "flos": 20741501111040.0, + "grad_norm": 1.8916391197618272, + "language_loss": 0.84433806, + "learning_rate": 3.7805875236519918e-06, + "loss": 0.86627805, + "num_input_tokens_seen": 63286830, + "router_z_loss_clip": 0.21191406, + "router_z_loss_mlp": 1.0234375, + "step": 2927, + "time_per_iteration": 2.447207450866699 + }, + { + "auxiliary_loss_clip": 0.01149947, + "auxiliary_loss_mlp": 0.01043802, + "balance_loss_clip": 1.02568233, + "balance_loss_mlp": 1.0506475, + "epoch": 0.1760408838118142, + "flos": 34092441227520.0, + "grad_norm": 1.8156588356210406, + "language_loss": 0.71879232, + "learning_rate": 3.7804101342355336e-06, + "loss": 0.74072987, + "num_input_tokens_seen": 63308870, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 2928, + "time_per_iteration": 2.585942029953003 + }, + { + "auxiliary_loss_clip": 0.01150116, + "auxiliary_loss_mlp": 0.01048544, + "balance_loss_clip": 1.028934, + "balance_loss_mlp": 1.05230594, + "epoch": 0.1761010070644822, + "flos": 24168384207360.0, + "grad_norm": 2.0402577824357886, + "language_loss": 0.83222824, + "learning_rate": 3.780232677305744e-06, + "loss": 0.85421479, + "num_input_tokens_seen": 63329005, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9765625, + "step": 2929, + "time_per_iteration": 2.461101770401001 + }, + { + "auxiliary_loss_clip": 0.01149627, + "auxiliary_loss_mlp": 0.0104173, + "balance_loss_clip": 1.02298999, + "balance_loss_mlp": 1.0493536, + "epoch": 0.17616113031715017, + "flos": 26576697335040.0, + "grad_norm": 1.817429721867852, + "language_loss": 0.7933988, + "learning_rate": 3.7800551528693535e-06, + "loss": 0.81531239, + "num_input_tokens_seen": 63349390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0, + "step": 2930, + "time_per_iteration": 2.491748571395874 + }, + { + "auxiliary_loss_clip": 0.01154203, + "auxiliary_loss_mlp": 0.0104708, + "balance_loss_clip": 1.02671921, + "balance_loss_mlp": 1.05319881, + "epoch": 0.17622125356981813, + "flos": 25666685245440.0, + "grad_norm": 2.194829469856105, + "language_loss": 0.76142448, + "learning_rate": 3.7798775609330927e-06, + "loss": 0.78343737, + "num_input_tokens_seen": 63368835, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0078125, + "step": 2931, + "time_per_iteration": 2.4907379150390625 + }, + { + "auxiliary_loss_clip": 0.01151699, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.05108666, + "epoch": 0.1762813768224861, + "flos": 16508530949760.0, + "grad_norm": 2.8261445455709153, + "language_loss": 0.74740392, + "learning_rate": 3.779699901503696e-06, + "loss": 0.7693212, + "num_input_tokens_seen": 63385220, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 2932, + "time_per_iteration": 2.4252588748931885 + }, + { + "auxiliary_loss_clip": 0.01157373, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.0221262, + "balance_loss_mlp": 1.05086923, + "epoch": 0.17634150007515406, + "flos": 11211850402560.0, + "grad_norm": 2.4930669650063355, + "language_loss": 0.8968839, + "learning_rate": 3.7795221745879016e-06, + "loss": 0.9188894, + "num_input_tokens_seen": 63400865, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 1.0625, + "step": 2933, + "time_per_iteration": 2.4334278106689453 + }, + { + "auxiliary_loss_clip": 0.01147962, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02980459, + "balance_loss_mlp": 1.05053639, + "epoch": 0.17640162332782203, + "flos": 23659925235840.0, + "grad_norm": 1.6616334836184845, + "language_loss": 0.88273364, + "learning_rate": 3.779344380192448e-06, + "loss": 0.90468836, + "num_input_tokens_seen": 63421390, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9765625, + "step": 2934, + "time_per_iteration": 3.891472578048706 + }, + { + "auxiliary_loss_clip": 0.01147552, + "auxiliary_loss_mlp": 0.0104641, + "balance_loss_clip": 1.02827823, + "balance_loss_mlp": 1.04972959, + "epoch": 0.17646174658049, + "flos": 53796984606720.0, + "grad_norm": 1.7575209177187046, + "language_loss": 0.70843625, + "learning_rate": 3.779166518324077e-06, + "loss": 0.73037589, + "num_input_tokens_seen": 63444715, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98046875, + "step": 2935, + "time_per_iteration": 5.650984287261963 + }, + { + "auxiliary_loss_clip": 0.01157572, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.02405488, + "balance_loss_mlp": 1.05251908, + "epoch": 0.17652186983315798, + "flos": 24243868638720.0, + "grad_norm": 2.2448658169111795, + "language_loss": 0.69255942, + "learning_rate": 3.7789885889895325e-06, + "loss": 0.71456659, + "num_input_tokens_seen": 63465525, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0546875, + "step": 2936, + "time_per_iteration": 2.4864091873168945 + }, + { + "auxiliary_loss_clip": 0.01154775, + "auxiliary_loss_mlp": 0.01045313, + "balance_loss_clip": 1.02758646, + "balance_loss_mlp": 1.05530488, + "epoch": 0.17658199308582595, + "flos": 27454282421760.0, + "grad_norm": 1.883537128373794, + "language_loss": 0.71391022, + "learning_rate": 3.7788105921955634e-06, + "loss": 0.73591107, + "num_input_tokens_seen": 63485815, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.99609375, + "step": 2937, + "time_per_iteration": 2.5096240043640137 + }, + { + "auxiliary_loss_clip": 0.01158892, + "auxiliary_loss_mlp": 0.010448, + "balance_loss_clip": 1.02461779, + "balance_loss_mlp": 1.05530524, + "epoch": 0.17664211633849392, + "flos": 22418672901120.0, + "grad_norm": 2.165923066719211, + "language_loss": 0.7584855, + "learning_rate": 3.7786325279489184e-06, + "loss": 0.78052241, + "num_input_tokens_seen": 63503905, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2938, + "time_per_iteration": 2.475069284439087 + }, + { + "auxiliary_loss_clip": 0.01153261, + "auxiliary_loss_mlp": 0.01043059, + "balance_loss_clip": 1.02466512, + "balance_loss_mlp": 1.05156195, + "epoch": 0.17670223959116188, + "flos": 24715124098560.0, + "grad_norm": 2.20477923303766, + "language_loss": 0.71130306, + "learning_rate": 3.7784543962563495e-06, + "loss": 0.73326623, + "num_input_tokens_seen": 63521985, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 1.015625, + "step": 2939, + "time_per_iteration": 2.4806766510009766 + }, + { + "auxiliary_loss_clip": 0.01153772, + "auxiliary_loss_mlp": 0.0104332, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.0538342, + "epoch": 0.17676236284382985, + "flos": 22527051212160.0, + "grad_norm": 3.125031265469358, + "language_loss": 0.73781312, + "learning_rate": 3.7782761971246115e-06, + "loss": 0.7597841, + "num_input_tokens_seen": 63539830, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 2940, + "time_per_iteration": 2.5438694953918457 + }, + { + "auxiliary_loss_clip": 0.01154904, + "auxiliary_loss_mlp": 0.01045748, + "balance_loss_clip": 1.02568471, + "balance_loss_mlp": 1.05372643, + "epoch": 0.1768224860964978, + "flos": 12385160161920.0, + "grad_norm": 2.4976558026918703, + "language_loss": 0.85003591, + "learning_rate": 3.7780979305604616e-06, + "loss": 0.87204242, + "num_input_tokens_seen": 63555495, + "router_z_loss_clip": 0.20117188, + "router_z_loss_mlp": 1.0078125, + "step": 2941, + "time_per_iteration": 2.4616622924804688 + }, + { + "auxiliary_loss_clip": 0.01154492, + "auxiliary_loss_mlp": 0.0104577, + "balance_loss_clip": 1.02687514, + "balance_loss_mlp": 1.05292201, + "epoch": 0.1768826093491658, + "flos": 24353360271360.0, + "grad_norm": 2.199835477442084, + "language_loss": 0.7711162, + "learning_rate": 3.7779195965706607e-06, + "loss": 0.79311877, + "num_input_tokens_seen": 63575290, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.015625, + "step": 2942, + "time_per_iteration": 2.512493848800659 + }, + { + "auxiliary_loss_clip": 0.01154308, + "auxiliary_loss_mlp": 0.01044542, + "balance_loss_clip": 1.02514625, + "balance_loss_mlp": 1.05181623, + "epoch": 0.17694273260183377, + "flos": 23587062497280.0, + "grad_norm": 1.9811917296629065, + "language_loss": 0.80591762, + "learning_rate": 3.77774119516197e-06, + "loss": 0.82790613, + "num_input_tokens_seen": 63594670, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 1.0234375, + "step": 2943, + "time_per_iteration": 2.4898416996002197 + }, + { + "auxiliary_loss_clip": 0.01154834, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.02953053, + "balance_loss_mlp": 1.05046725, + "epoch": 0.17700285585450173, + "flos": 26760991040640.0, + "grad_norm": 2.9958912509352866, + "language_loss": 0.80558729, + "learning_rate": 3.777562726341155e-06, + "loss": 0.82764459, + "num_input_tokens_seen": 63614780, + "router_z_loss_clip": 0.21289062, + "router_z_loss_mlp": 1.046875, + "step": 2944, + "time_per_iteration": 2.533968448638916 + }, + { + "auxiliary_loss_clip": 0.01154843, + "auxiliary_loss_mlp": 0.01062464, + "balance_loss_clip": 1.04353368, + "balance_loss_mlp": 1.05239737, + "epoch": 0.1770629791071697, + "flos": 42776323320960.0, + "grad_norm": 1.992535786356086, + "language_loss": 0.73450243, + "learning_rate": 3.7773841901149835e-06, + "loss": 0.75667548, + "num_input_tokens_seen": 63637190, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2945, + "time_per_iteration": 2.641890287399292 + }, + { + "auxiliary_loss_clip": 0.01152525, + "auxiliary_loss_mlp": 0.01050215, + "balance_loss_clip": 1.03179753, + "balance_loss_mlp": 1.05274916, + "epoch": 0.17712310235983766, + "flos": 17345572560000.0, + "grad_norm": 2.3259800829895028, + "language_loss": 0.7778489, + "learning_rate": 3.7772055864902256e-06, + "loss": 0.79987633, + "num_input_tokens_seen": 63652140, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.99609375, + "step": 2946, + "time_per_iteration": 2.420511484146118 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01051141, + "balance_loss_clip": 1.03190041, + "balance_loss_mlp": 1.05060697, + "epoch": 0.17718322561250563, + "flos": 23878477537920.0, + "grad_norm": 1.9846715459481197, + "language_loss": 0.76240218, + "learning_rate": 3.7770269154736535e-06, + "loss": 0.78441978, + "num_input_tokens_seen": 63671700, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.0, + "step": 2947, + "time_per_iteration": 2.485795259475708 + }, + { + "auxiliary_loss_clip": 0.01148639, + "auxiliary_loss_mlp": 0.01046512, + "balance_loss_clip": 1.02725959, + "balance_loss_mlp": 1.04881549, + "epoch": 0.1772433488651736, + "flos": 36466352104320.0, + "grad_norm": 2.7031010106606654, + "language_loss": 0.71890748, + "learning_rate": 3.7768481770720424e-06, + "loss": 0.74085903, + "num_input_tokens_seen": 63691685, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.99609375, + "step": 2948, + "time_per_iteration": 2.598586320877075 + }, + { + "auxiliary_loss_clip": 0.01151482, + "auxiliary_loss_mlp": 0.01051629, + "balance_loss_clip": 1.03313947, + "balance_loss_mlp": 1.05261326, + "epoch": 0.1773034721178416, + "flos": 26684716510080.0, + "grad_norm": 1.809900152556277, + "language_loss": 0.81843233, + "learning_rate": 3.776669371292171e-06, + "loss": 0.8404634, + "num_input_tokens_seen": 63711720, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.98828125, + "step": 2949, + "time_per_iteration": 2.496962547302246 + }, + { + "auxiliary_loss_clip": 0.01051917, + "auxiliary_loss_mlp": 0.01007586, + "balance_loss_clip": 1.00552368, + "balance_loss_mlp": 1.01889789, + "epoch": 0.17736359537050955, + "flos": 57117467617920.0, + "grad_norm": 0.7669309197050882, + "language_loss": 0.64973593, + "learning_rate": 3.7764904981408186e-06, + "loss": 0.670331, + "num_input_tokens_seen": 63776280, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.33007812, + "step": 2950, + "time_per_iteration": 3.1220879554748535 + }, + { + "auxiliary_loss_clip": 0.01145274, + "auxiliary_loss_mlp": 0.01049164, + "balance_loss_clip": 1.02992332, + "balance_loss_mlp": 1.04777181, + "epoch": 0.17742371862317752, + "flos": 27198203385600.0, + "grad_norm": 1.9502306021254343, + "language_loss": 0.83540517, + "learning_rate": 3.7763115576247686e-06, + "loss": 0.85734957, + "num_input_tokens_seen": 63797535, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 2951, + "time_per_iteration": 2.5360641479492188 + }, + { + "auxiliary_loss_clip": 0.01153398, + "auxiliary_loss_mlp": 0.01055919, + "balance_loss_clip": 1.03710794, + "balance_loss_mlp": 1.04963326, + "epoch": 0.17748384187584548, + "flos": 20959694277120.0, + "grad_norm": 3.175759961241781, + "language_loss": 0.80564123, + "learning_rate": 3.776132549750806e-06, + "loss": 0.82773435, + "num_input_tokens_seen": 63817045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.0390625, + "step": 2952, + "time_per_iteration": 2.478635787963867 + }, + { + "auxiliary_loss_clip": 0.01150606, + "auxiliary_loss_mlp": 0.01051207, + "balance_loss_clip": 1.03157318, + "balance_loss_mlp": 1.05045855, + "epoch": 0.17754396512851345, + "flos": 25009986844800.0, + "grad_norm": 2.157061982289712, + "language_loss": 0.79982865, + "learning_rate": 3.7759534745257194e-06, + "loss": 0.82184678, + "num_input_tokens_seen": 63837665, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0, + "step": 2953, + "time_per_iteration": 2.5143978595733643 + }, + { + "auxiliary_loss_clip": 0.01152559, + "auxiliary_loss_mlp": 0.01048489, + "balance_loss_clip": 1.03003526, + "balance_loss_mlp": 1.05173969, + "epoch": 0.1776040883811814, + "flos": 32051566275840.0, + "grad_norm": 1.8943960347088487, + "language_loss": 0.88006002, + "learning_rate": 3.7757743319562994e-06, + "loss": 0.90207046, + "num_input_tokens_seen": 63858455, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2954, + "time_per_iteration": 2.575603485107422 + }, + { + "auxiliary_loss_clip": 0.01150383, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.0327127, + "balance_loss_mlp": 1.05101538, + "epoch": 0.17766421163384938, + "flos": 21574125348480.0, + "grad_norm": 2.123866524492404, + "language_loss": 0.84441978, + "learning_rate": 3.7755951220493386e-06, + "loss": 0.86644602, + "num_input_tokens_seen": 63876935, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 2955, + "time_per_iteration": 2.476022958755493 + }, + { + "auxiliary_loss_clip": 0.01148363, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02591681, + "balance_loss_mlp": 1.04843807, + "epoch": 0.17772433488651737, + "flos": 22419319345920.0, + "grad_norm": 2.0229859139182382, + "language_loss": 0.71172267, + "learning_rate": 3.7754158448116327e-06, + "loss": 0.73364747, + "num_input_tokens_seen": 63896815, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 1.0, + "step": 2956, + "time_per_iteration": 2.4795608520507812 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01052163, + "balance_loss_clip": 1.03226662, + "balance_loss_mlp": 1.04974461, + "epoch": 0.17778445813918534, + "flos": 25629445820160.0, + "grad_norm": 1.891261769499534, + "language_loss": 0.82908547, + "learning_rate": 3.7752365002499795e-06, + "loss": 0.85109639, + "num_input_tokens_seen": 63916140, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9921875, + "step": 2957, + "time_per_iteration": 2.494279384613037 + }, + { + "auxiliary_loss_clip": 0.01146796, + "auxiliary_loss_mlp": 0.01046422, + "balance_loss_clip": 1.02819514, + "balance_loss_mlp": 1.04814482, + "epoch": 0.1778445813918533, + "flos": 25628871202560.0, + "grad_norm": 1.926043663168548, + "language_loss": 0.75286758, + "learning_rate": 3.7750570883711807e-06, + "loss": 0.7747997, + "num_input_tokens_seen": 63935220, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 2958, + "time_per_iteration": 2.532339572906494 + }, + { + "auxiliary_loss_clip": 0.01153516, + "auxiliary_loss_mlp": 0.01043348, + "balance_loss_clip": 1.02483475, + "balance_loss_mlp": 1.05278933, + "epoch": 0.17790470464452127, + "flos": 22345522853760.0, + "grad_norm": 2.0794730574663265, + "language_loss": 0.79558724, + "learning_rate": 3.7748776091820397e-06, + "loss": 0.8175559, + "num_input_tokens_seen": 63954550, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0078125, + "step": 2959, + "time_per_iteration": 2.45941424369812 + }, + { + "auxiliary_loss_clip": 0.01152953, + "auxiliary_loss_mlp": 0.01045372, + "balance_loss_clip": 1.02573824, + "balance_loss_mlp": 1.04968762, + "epoch": 0.17796482789718923, + "flos": 18765875214720.0, + "grad_norm": 2.284306220471852, + "language_loss": 0.52288693, + "learning_rate": 3.774698062689362e-06, + "loss": 0.5448702, + "num_input_tokens_seen": 63972425, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 2960, + "time_per_iteration": 2.4603421688079834 + }, + { + "auxiliary_loss_clip": 0.01154348, + "auxiliary_loss_mlp": 0.01055908, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.05185843, + "epoch": 0.1780249511498572, + "flos": 23440941970560.0, + "grad_norm": 1.9615261009939866, + "language_loss": 0.89047921, + "learning_rate": 3.7745184488999548e-06, + "loss": 0.9125818, + "num_input_tokens_seen": 63992165, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0234375, + "step": 2961, + "time_per_iteration": 2.475848913192749 + }, + { + "auxiliary_loss_clip": 0.01151915, + "auxiliary_loss_mlp": 0.01051556, + "balance_loss_clip": 1.0313381, + "balance_loss_mlp": 1.04849648, + "epoch": 0.1780850744025252, + "flos": 23367468700800.0, + "grad_norm": 2.2193748892921517, + "language_loss": 0.79186273, + "learning_rate": 3.774338767820631e-06, + "loss": 0.81389749, + "num_input_tokens_seen": 64013470, + "router_z_loss_clip": 0.20214844, + "router_z_loss_mlp": 1.03125, + "step": 2962, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.011535, + "auxiliary_loss_mlp": 0.01051549, + "balance_loss_clip": 1.03175986, + "balance_loss_mlp": 1.0524615, + "epoch": 0.17814519765519315, + "flos": 13771994319360.0, + "grad_norm": 1.9550413638631114, + "language_loss": 0.74514943, + "learning_rate": 3.774159019458203e-06, + "loss": 0.76719993, + "num_input_tokens_seen": 64030975, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.015625, + "step": 2963, + "time_per_iteration": 2.4414234161376953 + }, + { + "auxiliary_loss_clip": 0.01156042, + "auxiliary_loss_mlp": 0.01048013, + "balance_loss_clip": 1.02822399, + "balance_loss_mlp": 1.05221784, + "epoch": 0.17820532090786112, + "flos": 21976396738560.0, + "grad_norm": 1.541363360665875, + "language_loss": 0.78624183, + "learning_rate": 3.7739792038194877e-06, + "loss": 0.80828238, + "num_input_tokens_seen": 64050075, + "router_z_loss_clip": 0.19824219, + "router_z_loss_mlp": 1.0390625, + "step": 2964, + "time_per_iteration": 2.502497911453247 + }, + { + "auxiliary_loss_clip": 0.0115044, + "auxiliary_loss_mlp": 0.01056098, + "balance_loss_clip": 1.03661871, + "balance_loss_mlp": 1.05026746, + "epoch": 0.17826544416052909, + "flos": 24790752184320.0, + "grad_norm": 1.923237578914178, + "language_loss": 0.81686175, + "learning_rate": 3.7737993209113027e-06, + "loss": 0.83892715, + "num_input_tokens_seen": 64071920, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 1.0, + "step": 2965, + "time_per_iteration": 2.538076400756836 + }, + { + "auxiliary_loss_clip": 0.01147349, + "auxiliary_loss_mlp": 0.01049832, + "balance_loss_clip": 1.03273785, + "balance_loss_mlp": 1.04941893, + "epoch": 0.17832556741319705, + "flos": 13879582531200.0, + "grad_norm": 2.2408088539265183, + "language_loss": 0.94580686, + "learning_rate": 3.7736193707404698e-06, + "loss": 0.96777868, + "num_input_tokens_seen": 64086835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.98046875, + "step": 2966, + "time_per_iteration": 2.43082332611084 + }, + { + "auxiliary_loss_clip": 0.01149854, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.05002928, + "epoch": 0.17838569066586502, + "flos": 36641703323520.0, + "grad_norm": 2.145285080590972, + "language_loss": 0.72469354, + "learning_rate": 3.7734393533138127e-06, + "loss": 0.74664342, + "num_input_tokens_seen": 64107360, + "router_z_loss_clip": 0.20410156, + "router_z_loss_mlp": 1.0, + "step": 2967, + "time_per_iteration": 2.5735998153686523 + }, + { + "auxiliary_loss_clip": 0.01145139, + "auxiliary_loss_mlp": 0.01044527, + "balance_loss_clip": 1.02613282, + "balance_loss_mlp": 1.04889679, + "epoch": 0.17844581391853298, + "flos": 18727271072640.0, + "grad_norm": 2.088672387523525, + "language_loss": 0.76831949, + "learning_rate": 3.773259268638157e-06, + "loss": 0.79021615, + "num_input_tokens_seen": 64124690, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 2968, + "time_per_iteration": 2.437344789505005 + }, + { + "auxiliary_loss_clip": 0.01147824, + "auxiliary_loss_mlp": 0.01047277, + "balance_loss_clip": 1.0287044, + "balance_loss_mlp": 1.04982233, + "epoch": 0.17850593717120097, + "flos": 27378259286400.0, + "grad_norm": 3.3962137266502075, + "language_loss": 0.75934523, + "learning_rate": 3.7730791167203333e-06, + "loss": 0.78129619, + "num_input_tokens_seen": 64146315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 2969, + "time_per_iteration": 2.5003507137298584 + }, + { + "auxiliary_loss_clip": 0.01047445, + "auxiliary_loss_mlp": 0.01001591, + "balance_loss_clip": 0.99940914, + "balance_loss_mlp": 1.01426291, + "epoch": 0.17856606042386894, + "flos": 66996025084800.0, + "grad_norm": 0.8459028719848601, + "language_loss": 0.69080526, + "learning_rate": 3.772898897567171e-06, + "loss": 0.7112956, + "num_input_tokens_seen": 64210875, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.33203125, + "step": 2970, + "time_per_iteration": 3.1193249225616455 + }, + { + "auxiliary_loss_clip": 0.01153596, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.0229373, + "balance_loss_mlp": 1.0498271, + "epoch": 0.1786261836765369, + "flos": 36977001805440.0, + "grad_norm": 2.0858657386647614, + "language_loss": 0.67452097, + "learning_rate": 3.772718611185505e-06, + "loss": 0.69647527, + "num_input_tokens_seen": 64230740, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0390625, + "step": 2971, + "time_per_iteration": 2.580946683883667 + }, + { + "auxiliary_loss_clip": 0.01146095, + "auxiliary_loss_mlp": 0.0105018, + "balance_loss_clip": 1.03059363, + "balance_loss_mlp": 1.04643905, + "epoch": 0.17868630692920487, + "flos": 24825441744000.0, + "grad_norm": 1.713623966203784, + "language_loss": 0.89631712, + "learning_rate": 3.7725382575821717e-06, + "loss": 0.91827983, + "num_input_tokens_seen": 64252300, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.99609375, + "step": 2972, + "time_per_iteration": 2.491608142852783 + }, + { + "auxiliary_loss_clip": 0.01150348, + "auxiliary_loss_mlp": 0.01056161, + "balance_loss_clip": 1.03762364, + "balance_loss_mlp": 1.05058205, + "epoch": 0.17874643018187283, + "flos": 16981977139200.0, + "grad_norm": 2.067523530387673, + "language_loss": 0.88030291, + "learning_rate": 3.77235783676401e-06, + "loss": 0.90236795, + "num_input_tokens_seen": 64270105, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 2973, + "time_per_iteration": 2.4357106685638428 + }, + { + "auxiliary_loss_clip": 0.01148396, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03282917, + "balance_loss_mlp": 1.04979324, + "epoch": 0.1788065534345408, + "flos": 21032233793280.0, + "grad_norm": 2.1406659419236176, + "language_loss": 0.75648922, + "learning_rate": 3.7721773487378615e-06, + "loss": 0.77848881, + "num_input_tokens_seen": 64287250, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.984375, + "step": 2974, + "time_per_iteration": 2.484236478805542 + }, + { + "auxiliary_loss_clip": 0.01148515, + "auxiliary_loss_mlp": 0.01044686, + "balance_loss_clip": 1.02560067, + "balance_loss_mlp": 1.04925394, + "epoch": 0.17886667668720876, + "flos": 23987717775360.0, + "grad_norm": 2.8019304252630453, + "language_loss": 0.74556506, + "learning_rate": 3.7719967935105705e-06, + "loss": 0.76749712, + "num_input_tokens_seen": 64307140, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 2975, + "time_per_iteration": 2.4658849239349365 + }, + { + "auxiliary_loss_clip": 0.01145454, + "auxiliary_loss_mlp": 0.0104533, + "balance_loss_clip": 1.02692378, + "balance_loss_mlp": 1.04805982, + "epoch": 0.17892679993987676, + "flos": 25739476156800.0, + "grad_norm": 1.5963289978134585, + "language_loss": 0.73245859, + "learning_rate": 3.7718161710889833e-06, + "loss": 0.7543664, + "num_input_tokens_seen": 64328760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 2976, + "time_per_iteration": 3.921170949935913 + }, + { + "auxiliary_loss_clip": 0.01140857, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02455354, + "balance_loss_mlp": 1.04732931, + "epoch": 0.17898692319254472, + "flos": 25699686865920.0, + "grad_norm": 1.5556273460638488, + "language_loss": 0.77324069, + "learning_rate": 3.7716354814799495e-06, + "loss": 0.79505193, + "num_input_tokens_seen": 64348800, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9375, + "step": 2977, + "time_per_iteration": 5.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01150602, + "auxiliary_loss_mlp": 0.0105157, + "balance_loss_clip": 1.03352153, + "balance_loss_mlp": 1.05327988, + "epoch": 0.1790470464452127, + "flos": 19317786664320.0, + "grad_norm": 2.814268655584857, + "language_loss": 0.79470795, + "learning_rate": 3.7714547246903203e-06, + "loss": 0.81672966, + "num_input_tokens_seen": 64367955, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 2978, + "time_per_iteration": 2.4917376041412354 + }, + { + "auxiliary_loss_clip": 0.01152273, + "auxiliary_loss_mlp": 0.0104187, + "balance_loss_clip": 1.022892, + "balance_loss_mlp": 1.04982674, + "epoch": 0.17910716969788065, + "flos": 30044267562240.0, + "grad_norm": 1.6585859201367117, + "language_loss": 0.76166439, + "learning_rate": 3.7712739007269508e-06, + "loss": 0.78360581, + "num_input_tokens_seen": 64389805, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2979, + "time_per_iteration": 2.5283753871917725 + }, + { + "auxiliary_loss_clip": 0.01144584, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.0283196, + "balance_loss_mlp": 1.04760695, + "epoch": 0.17916729295054862, + "flos": 19427709260160.0, + "grad_norm": 2.3100878996861014, + "language_loss": 0.69246143, + "learning_rate": 3.7710930095966976e-06, + "loss": 0.7143684, + "num_input_tokens_seen": 64408220, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 2980, + "time_per_iteration": 2.452199935913086 + }, + { + "auxiliary_loss_clip": 0.01148553, + "auxiliary_loss_mlp": 0.01047507, + "balance_loss_clip": 1.02703881, + "balance_loss_mlp": 1.04957294, + "epoch": 0.17922741620321658, + "flos": 14611549881600.0, + "grad_norm": 1.6769030770257147, + "language_loss": 0.7077347, + "learning_rate": 3.7709120513064196e-06, + "loss": 0.72969532, + "num_input_tokens_seen": 64426380, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.98828125, + "step": 2981, + "time_per_iteration": 2.453328847885132 + }, + { + "auxiliary_loss_clip": 0.01151272, + "auxiliary_loss_mlp": 0.01057949, + "balance_loss_clip": 1.03929293, + "balance_loss_mlp": 1.05124855, + "epoch": 0.17928753945588458, + "flos": 17165301177600.0, + "grad_norm": 2.4096510966801916, + "language_loss": 0.82313269, + "learning_rate": 3.7707310258629796e-06, + "loss": 0.84522492, + "num_input_tokens_seen": 64444355, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0, + "step": 2982, + "time_per_iteration": 2.4727423191070557 + }, + { + "auxiliary_loss_clip": 0.01145202, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.0309453, + "balance_loss_mlp": 1.04754186, + "epoch": 0.17934766270855254, + "flos": 31395622060800.0, + "grad_norm": 2.0170018574221404, + "language_loss": 0.82899523, + "learning_rate": 3.7705499332732413e-06, + "loss": 0.85093689, + "num_input_tokens_seen": 64467800, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 2983, + "time_per_iteration": 2.5544486045837402 + }, + { + "auxiliary_loss_clip": 0.01148269, + "auxiliary_loss_mlp": 0.01051569, + "balance_loss_clip": 1.03234076, + "balance_loss_mlp": 1.04676509, + "epoch": 0.1794077859612205, + "flos": 20814184281600.0, + "grad_norm": 2.0025677466759175, + "language_loss": 0.84977567, + "learning_rate": 3.7703687735440718e-06, + "loss": 0.87177408, + "num_input_tokens_seen": 64487230, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.015625, + "step": 2984, + "time_per_iteration": 2.461451530456543 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01044432, + "balance_loss_clip": 1.02558494, + "balance_loss_mlp": 1.04734373, + "epoch": 0.17946790921388847, + "flos": 28986447006720.0, + "grad_norm": 2.5972673531528874, + "language_loss": 0.89526331, + "learning_rate": 3.7701875466823416e-06, + "loss": 0.91717398, + "num_input_tokens_seen": 64509165, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9921875, + "step": 2985, + "time_per_iteration": 2.5644643306732178 + }, + { + "auxiliary_loss_clip": 0.01142965, + "auxiliary_loss_mlp": 0.01045202, + "balance_loss_clip": 1.02879906, + "balance_loss_mlp": 1.0478375, + "epoch": 0.17952803246655644, + "flos": 20737406960640.0, + "grad_norm": 1.9029387971382474, + "language_loss": 0.69863129, + "learning_rate": 3.770006252694922e-06, + "loss": 0.72051299, + "num_input_tokens_seen": 64527940, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 2986, + "time_per_iteration": 2.4629499912261963 + }, + { + "auxiliary_loss_clip": 0.01144523, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02507591, + "balance_loss_mlp": 1.04828227, + "epoch": 0.1795881557192244, + "flos": 28255988027520.0, + "grad_norm": 2.203273814413497, + "language_loss": 0.77872753, + "learning_rate": 3.769824891588688e-06, + "loss": 0.80060714, + "num_input_tokens_seen": 64545230, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96484375, + "step": 2987, + "time_per_iteration": 2.524712562561035 + }, + { + "auxiliary_loss_clip": 0.01149287, + "auxiliary_loss_mlp": 0.01043881, + "balance_loss_clip": 1.02412844, + "balance_loss_mlp": 1.04834962, + "epoch": 0.17964827897189237, + "flos": 18552027594240.0, + "grad_norm": 2.225668764256514, + "language_loss": 0.78012109, + "learning_rate": 3.7696434633705164e-06, + "loss": 0.8020528, + "num_input_tokens_seen": 64563820, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0078125, + "step": 2988, + "time_per_iteration": 2.4608163833618164 + }, + { + "auxiliary_loss_clip": 0.01048374, + "auxiliary_loss_mlp": 0.01007691, + "balance_loss_clip": 1.00570035, + "balance_loss_mlp": 1.0154314, + "epoch": 0.17970840222456036, + "flos": 58165088711040.0, + "grad_norm": 0.7961406236538413, + "language_loss": 0.62767559, + "learning_rate": 3.7694619680472875e-06, + "loss": 0.64823627, + "num_input_tokens_seen": 64621315, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33007812, + "step": 2989, + "time_per_iteration": 2.9831957817077637 + }, + { + "auxiliary_loss_clip": 0.01146079, + "auxiliary_loss_mlp": 0.01041184, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04836369, + "epoch": 0.17976852547722832, + "flos": 20300805146880.0, + "grad_norm": 3.4434429944335525, + "language_loss": 0.70464563, + "learning_rate": 3.7692804056258837e-06, + "loss": 0.72651821, + "num_input_tokens_seen": 64639885, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.98046875, + "step": 2990, + "time_per_iteration": 2.556100606918335 + }, + { + "auxiliary_loss_clip": 0.01146243, + "auxiliary_loss_mlp": 0.01039011, + "balance_loss_clip": 1.0210464, + "balance_loss_mlp": 1.04735422, + "epoch": 0.1798286487298963, + "flos": 39669367685760.0, + "grad_norm": 1.7649502456354873, + "language_loss": 0.68110204, + "learning_rate": 3.7690987761131893e-06, + "loss": 0.70295459, + "num_input_tokens_seen": 64661220, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 2991, + "time_per_iteration": 2.6224544048309326 + }, + { + "auxiliary_loss_clip": 0.01145545, + "auxiliary_loss_mlp": 0.01040119, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.04794931, + "epoch": 0.17988877198256426, + "flos": 25520313323520.0, + "grad_norm": 1.5716432326573742, + "language_loss": 0.82754636, + "learning_rate": 3.7689170795160924e-06, + "loss": 0.84940296, + "num_input_tokens_seen": 64682530, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 2992, + "time_per_iteration": 2.51824951171875 + }, + { + "auxiliary_loss_clip": 0.01138637, + "auxiliary_loss_mlp": 0.01040458, + "balance_loss_clip": 1.02301776, + "balance_loss_mlp": 1.04464579, + "epoch": 0.17994889523523222, + "flos": 18807496099200.0, + "grad_norm": 2.1353598877924806, + "language_loss": 0.81958085, + "learning_rate": 3.7687353158414822e-06, + "loss": 0.84137177, + "num_input_tokens_seen": 64701025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 2993, + "time_per_iteration": 2.4349074363708496 + }, + { + "auxiliary_loss_clip": 0.01143824, + "auxiliary_loss_mlp": 0.01047314, + "balance_loss_clip": 1.02889621, + "balance_loss_mlp": 1.04586673, + "epoch": 0.18000901848790019, + "flos": 21104450087040.0, + "grad_norm": 1.7254805142405878, + "language_loss": 0.78390837, + "learning_rate": 3.7685534850962517e-06, + "loss": 0.80581975, + "num_input_tokens_seen": 64719570, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.98046875, + "step": 2994, + "time_per_iteration": 2.4898691177368164 + }, + { + "auxiliary_loss_clip": 0.01148185, + "auxiliary_loss_mlp": 0.01043708, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.04966068, + "epoch": 0.18006914174056818, + "flos": 19646441130240.0, + "grad_norm": 1.8689491925476576, + "language_loss": 0.80392146, + "learning_rate": 3.768371587287296e-06, + "loss": 0.82584035, + "num_input_tokens_seen": 64738110, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.984375, + "step": 2995, + "time_per_iteration": 2.4521572589874268 + }, + { + "auxiliary_loss_clip": 0.01144196, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.02939498, + "balance_loss_mlp": 1.04679298, + "epoch": 0.18012926499323614, + "flos": 19499889640320.0, + "grad_norm": 1.5635152056288029, + "language_loss": 0.84467834, + "learning_rate": 3.768189622421512e-06, + "loss": 0.86658335, + "num_input_tokens_seen": 64756345, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.97265625, + "step": 2996, + "time_per_iteration": 2.46993088722229 + }, + { + "auxiliary_loss_clip": 0.01139788, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02493799, + "balance_loss_mlp": 1.04656756, + "epoch": 0.1801893882459041, + "flos": 19464553635840.0, + "grad_norm": 2.9197857622903793, + "language_loss": 0.88254511, + "learning_rate": 3.7680075905058006e-06, + "loss": 0.90436304, + "num_input_tokens_seen": 64776375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 2997, + "time_per_iteration": 2.470113515853882 + }, + { + "auxiliary_loss_clip": 0.01148809, + "auxiliary_loss_mlp": 0.01043259, + "balance_loss_clip": 1.02435279, + "balance_loss_mlp": 1.04666877, + "epoch": 0.18024951149857207, + "flos": 26870590414080.0, + "grad_norm": 2.5635961030192935, + "language_loss": 0.8504566, + "learning_rate": 3.7678254915470643e-06, + "loss": 0.87237728, + "num_input_tokens_seen": 64796210, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 2998, + "time_per_iteration": 2.5252864360809326 + }, + { + "auxiliary_loss_clip": 0.0114547, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783537, + "balance_loss_mlp": 1.05022454, + "epoch": 0.18030963475124004, + "flos": 30226621933440.0, + "grad_norm": 1.8695557812200347, + "language_loss": 0.84270376, + "learning_rate": 3.7676433255522084e-06, + "loss": 0.86460871, + "num_input_tokens_seen": 64818590, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 2999, + "time_per_iteration": 2.5272696018218994 + }, + { + "auxiliary_loss_clip": 0.01143823, + "auxiliary_loss_mlp": 0.01044085, + "balance_loss_clip": 1.02577412, + "balance_loss_mlp": 1.04662383, + "epoch": 0.180369758003908, + "flos": 22307493329280.0, + "grad_norm": 1.7700032623605295, + "language_loss": 0.74753368, + "learning_rate": 3.76746109252814e-06, + "loss": 0.76941276, + "num_input_tokens_seen": 64838350, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.97265625, + "step": 3000, + "time_per_iteration": 2.4800922870635986 + }, + { + "auxiliary_loss_clip": 0.01143329, + "auxiliary_loss_mlp": 0.01060132, + "balance_loss_clip": 1.04111791, + "balance_loss_mlp": 1.04825568, + "epoch": 0.18042988125657597, + "flos": 23732033788800.0, + "grad_norm": 2.369063359757221, + "language_loss": 0.71625632, + "learning_rate": 3.76727879248177e-06, + "loss": 0.73829091, + "num_input_tokens_seen": 64858065, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3001, + "time_per_iteration": 2.484309434890747 + }, + { + "auxiliary_loss_clip": 0.01148499, + "auxiliary_loss_mlp": 0.01048814, + "balance_loss_clip": 1.03010964, + "balance_loss_mlp": 1.04815364, + "epoch": 0.18049000450924396, + "flos": 24093582134400.0, + "grad_norm": 2.7240097708601225, + "language_loss": 0.87795258, + "learning_rate": 3.767096425420011e-06, + "loss": 0.89992571, + "num_input_tokens_seen": 64877305, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 1.0078125, + "step": 3002, + "time_per_iteration": 2.4881784915924072 + }, + { + "auxiliary_loss_clip": 0.011444, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02689672, + "balance_loss_mlp": 1.04694915, + "epoch": 0.18055012776191193, + "flos": 22163168482560.0, + "grad_norm": 1.6880476069492312, + "language_loss": 0.80563951, + "learning_rate": 3.7669139913497788e-06, + "loss": 0.8275311, + "num_input_tokens_seen": 64896955, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9765625, + "step": 3003, + "time_per_iteration": 2.452103614807129 + }, + { + "auxiliary_loss_clip": 0.0114812, + "auxiliary_loss_mlp": 0.01044755, + "balance_loss_clip": 1.02673101, + "balance_loss_mlp": 1.04780829, + "epoch": 0.1806102510145799, + "flos": 28913512440960.0, + "grad_norm": 2.4630533980116804, + "language_loss": 0.66931474, + "learning_rate": 3.7667314902779907e-06, + "loss": 0.69124347, + "num_input_tokens_seen": 64917080, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3004, + "time_per_iteration": 2.5085701942443848 + }, + { + "auxiliary_loss_clip": 0.0114685, + "auxiliary_loss_mlp": 0.01050762, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.04860806, + "epoch": 0.18067037426724786, + "flos": 19025689265280.0, + "grad_norm": 1.8927608809249736, + "language_loss": 0.85172975, + "learning_rate": 3.7665489222115677e-06, + "loss": 0.87370586, + "num_input_tokens_seen": 64935215, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.984375, + "step": 3005, + "time_per_iteration": 2.44529128074646 + }, + { + "auxiliary_loss_clip": 0.01141782, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02611172, + "balance_loss_mlp": 1.04684031, + "epoch": 0.18073049751991582, + "flos": 27453635976960.0, + "grad_norm": 1.553419886600377, + "language_loss": 0.82951266, + "learning_rate": 3.766366287157432e-06, + "loss": 0.85135704, + "num_input_tokens_seen": 64956275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94921875, + "step": 3006, + "time_per_iteration": 2.532597780227661 + }, + { + "auxiliary_loss_clip": 0.01143778, + "auxiliary_loss_mlp": 0.01050753, + "balance_loss_clip": 1.0315007, + "balance_loss_mlp": 1.04581141, + "epoch": 0.1807906207725838, + "flos": 28729039167360.0, + "grad_norm": 1.6363768703600998, + "language_loss": 0.76883924, + "learning_rate": 3.7661835851225103e-06, + "loss": 0.79078454, + "num_input_tokens_seen": 64979390, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.98046875, + "step": 3007, + "time_per_iteration": 2.5265002250671387 + }, + { + "auxiliary_loss_clip": 0.01046842, + "auxiliary_loss_mlp": 0.01004593, + "balance_loss_clip": 1.00238752, + "balance_loss_mlp": 1.01358199, + "epoch": 0.18085074402525175, + "flos": 64466515468800.0, + "grad_norm": 0.8067080511403597, + "language_loss": 0.56949043, + "learning_rate": 3.7660008161137294e-06, + "loss": 0.59000474, + "num_input_tokens_seen": 65043135, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.33203125, + "step": 3008, + "time_per_iteration": 3.1923961639404297 + }, + { + "auxiliary_loss_clip": 0.01148419, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.02878737, + "balance_loss_mlp": 1.04951596, + "epoch": 0.18091086727791975, + "flos": 23476960333440.0, + "grad_norm": 1.8063105677439477, + "language_loss": 0.67226636, + "learning_rate": 3.765817980138021e-06, + "loss": 0.69423479, + "num_input_tokens_seen": 65062845, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3009, + "time_per_iteration": 2.467525005340576 + }, + { + "auxiliary_loss_clip": 0.01147918, + "auxiliary_loss_mlp": 0.01047401, + "balance_loss_clip": 1.02993655, + "balance_loss_mlp": 1.04874969, + "epoch": 0.1809709905305877, + "flos": 24170467196160.0, + "grad_norm": 1.842230928142314, + "language_loss": 0.75573891, + "learning_rate": 3.7656350772023177e-06, + "loss": 0.77769208, + "num_input_tokens_seen": 65082110, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.98828125, + "step": 3010, + "time_per_iteration": 2.486067533493042 + }, + { + "auxiliary_loss_clip": 0.01141012, + "auxiliary_loss_mlp": 0.0104251, + "balance_loss_clip": 1.02585649, + "balance_loss_mlp": 1.04816866, + "epoch": 0.18103111378325568, + "flos": 21650902669440.0, + "grad_norm": 1.6130539386655762, + "language_loss": 0.66672593, + "learning_rate": 3.7654521073135553e-06, + "loss": 0.6885612, + "num_input_tokens_seen": 65101985, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3011, + "time_per_iteration": 2.461749792098999 + }, + { + "auxiliary_loss_clip": 0.01142359, + "auxiliary_loss_mlp": 0.01048591, + "balance_loss_clip": 1.0309006, + "balance_loss_mlp": 1.04706419, + "epoch": 0.18109123703592364, + "flos": 53686918356480.0, + "grad_norm": 2.1517129990512927, + "language_loss": 0.71184897, + "learning_rate": 3.7652690704786723e-06, + "loss": 0.73375839, + "num_input_tokens_seen": 65129295, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3012, + "time_per_iteration": 2.7380943298339844 + }, + { + "auxiliary_loss_clip": 0.01145213, + "auxiliary_loss_mlp": 0.01048493, + "balance_loss_clip": 1.03045654, + "balance_loss_mlp": 1.05109787, + "epoch": 0.1811513602885916, + "flos": 35845564325760.0, + "grad_norm": 2.2489260815019447, + "language_loss": 0.62039113, + "learning_rate": 3.765085966704609e-06, + "loss": 0.64232826, + "num_input_tokens_seen": 65150625, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3013, + "time_per_iteration": 2.5800936222076416 + }, + { + "auxiliary_loss_clip": 0.01145888, + "auxiliary_loss_mlp": 0.01050021, + "balance_loss_clip": 1.03303385, + "balance_loss_mlp": 1.04870379, + "epoch": 0.18121148354125957, + "flos": 23732572492800.0, + "grad_norm": 1.5535403171237991, + "language_loss": 0.76026124, + "learning_rate": 3.764902795998309e-06, + "loss": 0.7822203, + "num_input_tokens_seen": 65170880, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3014, + "time_per_iteration": 2.5049405097961426 + }, + { + "auxiliary_loss_clip": 0.01151342, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.02697504, + "balance_loss_mlp": 1.05086446, + "epoch": 0.18127160679392756, + "flos": 28728320895360.0, + "grad_norm": 1.7733972454950666, + "language_loss": 0.65696967, + "learning_rate": 3.7647195583667184e-06, + "loss": 0.67894971, + "num_input_tokens_seen": 65192530, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3015, + "time_per_iteration": 2.52614426612854 + }, + { + "auxiliary_loss_clip": 0.01143858, + "auxiliary_loss_mlp": 0.01043977, + "balance_loss_clip": 1.0262742, + "balance_loss_mlp": 1.0490694, + "epoch": 0.18133173004659553, + "flos": 20485062938880.0, + "grad_norm": 1.7500400577379265, + "language_loss": 0.7809943, + "learning_rate": 3.764536253816785e-06, + "loss": 0.80287266, + "num_input_tokens_seen": 65211675, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3016, + "time_per_iteration": 2.4736039638519287 + }, + { + "auxiliary_loss_clip": 0.01152649, + "auxiliary_loss_mlp": 0.01050768, + "balance_loss_clip": 1.03214788, + "balance_loss_mlp": 1.05294776, + "epoch": 0.1813918532992635, + "flos": 22852078404480.0, + "grad_norm": 1.6390488083316745, + "language_loss": 0.83498454, + "learning_rate": 3.7643528823554602e-06, + "loss": 0.85701871, + "num_input_tokens_seen": 65231185, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0, + "step": 3017, + "time_per_iteration": 2.454888105392456 + }, + { + "auxiliary_loss_clip": 0.01142751, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02192545, + "balance_loss_mlp": 1.0486486, + "epoch": 0.18145197655193146, + "flos": 36065122208640.0, + "grad_norm": 2.2301629944757964, + "language_loss": 0.67067724, + "learning_rate": 3.764169443989697e-06, + "loss": 0.69249976, + "num_input_tokens_seen": 65251645, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3018, + "time_per_iteration": 3.950299024581909 + }, + { + "auxiliary_loss_clip": 0.01146405, + "auxiliary_loss_mlp": 0.0103774, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.04928112, + "epoch": 0.18151209980459942, + "flos": 24023951619840.0, + "grad_norm": 2.174717508383113, + "language_loss": 0.75745898, + "learning_rate": 3.7639859387264518e-06, + "loss": 0.77930045, + "num_input_tokens_seen": 65271125, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.97265625, + "step": 3019, + "time_per_iteration": 3.9721574783325195 + }, + { + "auxiliary_loss_clip": 0.01149794, + "auxiliary_loss_mlp": 0.01045611, + "balance_loss_clip": 1.02653718, + "balance_loss_mlp": 1.05230832, + "epoch": 0.1815722230572674, + "flos": 23951627585280.0, + "grad_norm": 2.1373464597463574, + "language_loss": 0.81687438, + "learning_rate": 3.7638023665726834e-06, + "loss": 0.83882844, + "num_input_tokens_seen": 65290600, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3020, + "time_per_iteration": 2.510564088821411 + }, + { + "auxiliary_loss_clip": 0.01148401, + "auxiliary_loss_mlp": 0.01042965, + "balance_loss_clip": 1.02373672, + "balance_loss_mlp": 1.05124021, + "epoch": 0.18163234630993536, + "flos": 24386469632640.0, + "grad_norm": 2.9178918869439654, + "language_loss": 0.77220714, + "learning_rate": 3.763618727535352e-06, + "loss": 0.79412079, + "num_input_tokens_seen": 65311040, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.96875, + "step": 3021, + "time_per_iteration": 2.4856297969818115 + }, + { + "auxiliary_loss_clip": 0.01141247, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02419829, + "balance_loss_mlp": 1.04617524, + "epoch": 0.18169246956260335, + "flos": 24681332378880.0, + "grad_norm": 1.7066661124221545, + "language_loss": 0.84841502, + "learning_rate": 3.763435021621422e-06, + "loss": 0.87025082, + "num_input_tokens_seen": 65332115, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3022, + "time_per_iteration": 2.4933700561523438 + }, + { + "auxiliary_loss_clip": 0.01148694, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02296424, + "balance_loss_mlp": 1.0491302, + "epoch": 0.1817525928152713, + "flos": 24243294021120.0, + "grad_norm": 1.9452352079001236, + "language_loss": 0.69178426, + "learning_rate": 3.763251248837859e-06, + "loss": 0.7136941, + "num_input_tokens_seen": 65352210, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3023, + "time_per_iteration": 2.495107412338257 + }, + { + "auxiliary_loss_clip": 0.01144443, + "auxiliary_loss_mlp": 0.01044488, + "balance_loss_clip": 1.0261296, + "balance_loss_mlp": 1.04748738, + "epoch": 0.18181271606793928, + "flos": 16472081623680.0, + "grad_norm": 1.9417078000950883, + "language_loss": 0.73956865, + "learning_rate": 3.7630674091916317e-06, + "loss": 0.76145792, + "num_input_tokens_seen": 65370600, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3024, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01043186, + "balance_loss_clip": 1.02549553, + "balance_loss_mlp": 1.0490942, + "epoch": 0.18187283932060724, + "flos": 18581042805120.0, + "grad_norm": 2.344564071286257, + "language_loss": 0.88167858, + "learning_rate": 3.7628835026897123e-06, + "loss": 0.90356255, + "num_input_tokens_seen": 65387270, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3025, + "time_per_iteration": 2.4708051681518555 + }, + { + "auxiliary_loss_clip": 0.01145802, + "auxiliary_loss_mlp": 0.01052568, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.05046904, + "epoch": 0.1819329625732752, + "flos": 20266833859200.0, + "grad_norm": 2.755473586939447, + "language_loss": 0.79284346, + "learning_rate": 3.7626995293390735e-06, + "loss": 0.8148272, + "num_input_tokens_seen": 65406550, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.953125, + "step": 3026, + "time_per_iteration": 2.482987403869629 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01053602, + "balance_loss_clip": 1.03424227, + "balance_loss_mlp": 1.0502665, + "epoch": 0.18199308582594317, + "flos": 25915186512000.0, + "grad_norm": 1.6571051349992714, + "language_loss": 0.76047945, + "learning_rate": 3.762515489146692e-06, + "loss": 0.78250599, + "num_input_tokens_seen": 65425955, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98828125, + "step": 3027, + "time_per_iteration": 2.4952149391174316 + }, + { + "auxiliary_loss_clip": 0.01151758, + "auxiliary_loss_mlp": 0.01049071, + "balance_loss_clip": 1.03055763, + "balance_loss_mlp": 1.05106115, + "epoch": 0.18205320907861114, + "flos": 15377524433280.0, + "grad_norm": 1.7989426432275553, + "language_loss": 0.85400331, + "learning_rate": 3.762331382119546e-06, + "loss": 0.87601155, + "num_input_tokens_seen": 65442820, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3028, + "time_per_iteration": 2.438113212585449 + }, + { + "auxiliary_loss_clip": 0.01144845, + "auxiliary_loss_mlp": 0.01043225, + "balance_loss_clip": 1.02543902, + "balance_loss_mlp": 1.04937243, + "epoch": 0.18211333233127913, + "flos": 25624310175360.0, + "grad_norm": 1.8205418995180693, + "language_loss": 0.82655656, + "learning_rate": 3.7621472082646183e-06, + "loss": 0.84843719, + "num_input_tokens_seen": 65461825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3029, + "time_per_iteration": 2.4866995811462402 + }, + { + "auxiliary_loss_clip": 0.01151958, + "auxiliary_loss_mlp": 0.01045395, + "balance_loss_clip": 1.02640462, + "balance_loss_mlp": 1.05306637, + "epoch": 0.1821734555839471, + "flos": 14976007228800.0, + "grad_norm": 2.0975281503542433, + "language_loss": 0.78150737, + "learning_rate": 3.761962967588891e-06, + "loss": 0.80348092, + "num_input_tokens_seen": 65479480, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3030, + "time_per_iteration": 2.458627700805664 + }, + { + "auxiliary_loss_clip": 0.01150751, + "auxiliary_loss_mlp": 0.01043659, + "balance_loss_clip": 1.02495515, + "balance_loss_mlp": 1.05141127, + "epoch": 0.18223357883661506, + "flos": 20194007034240.0, + "grad_norm": 1.955618442063123, + "language_loss": 0.85318518, + "learning_rate": 3.761778660099352e-06, + "loss": 0.87512928, + "num_input_tokens_seen": 65497775, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.99609375, + "step": 3031, + "time_per_iteration": 2.4492268562316895 + }, + { + "auxiliary_loss_clip": 0.01150203, + "auxiliary_loss_mlp": 0.01045881, + "balance_loss_clip": 1.02824974, + "balance_loss_mlp": 1.05232072, + "epoch": 0.18229370208928303, + "flos": 15231978524160.0, + "grad_norm": 1.8744751837074634, + "language_loss": 0.79713088, + "learning_rate": 3.76159428580299e-06, + "loss": 0.81909174, + "num_input_tokens_seen": 65516505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3032, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.0115633, + "auxiliary_loss_mlp": 0.01044461, + "balance_loss_clip": 1.0260191, + "balance_loss_mlp": 1.05395341, + "epoch": 0.182353825341951, + "flos": 23840483927040.0, + "grad_norm": 2.0774072235136964, + "language_loss": 0.81420642, + "learning_rate": 3.761409844706795e-06, + "loss": 0.8362143, + "num_input_tokens_seen": 65536160, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 1.0234375, + "step": 3033, + "time_per_iteration": 2.47562575340271 + }, + { + "auxiliary_loss_clip": 0.01052781, + "auxiliary_loss_mlp": 0.01006645, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01995599, + "epoch": 0.18241394859461896, + "flos": 61190957393280.0, + "grad_norm": 0.8883360043233282, + "language_loss": 0.63479006, + "learning_rate": 3.7612253368177625e-06, + "loss": 0.6553843, + "num_input_tokens_seen": 65589375, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.328125, + "step": 3034, + "time_per_iteration": 2.9712142944335938 + }, + { + "auxiliary_loss_clip": 0.01148548, + "auxiliary_loss_mlp": 0.01043903, + "balance_loss_clip": 1.0263083, + "balance_loss_mlp": 1.05033147, + "epoch": 0.18247407184728695, + "flos": 18471694826880.0, + "grad_norm": 2.0132790953316113, + "language_loss": 0.79684323, + "learning_rate": 3.7610407621428893e-06, + "loss": 0.81876773, + "num_input_tokens_seen": 65606720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3035, + "time_per_iteration": 2.4517030715942383 + }, + { + "auxiliary_loss_clip": 0.01147231, + "auxiliary_loss_mlp": 0.01044908, + "balance_loss_clip": 1.02792096, + "balance_loss_mlp": 1.05231702, + "epoch": 0.18253419509995492, + "flos": 21795191602560.0, + "grad_norm": 2.217606261766961, + "language_loss": 0.84895855, + "learning_rate": 3.7608561206891735e-06, + "loss": 0.87087989, + "num_input_tokens_seen": 65625495, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3036, + "time_per_iteration": 2.5017378330230713 + }, + { + "auxiliary_loss_clip": 0.01142577, + "auxiliary_loss_mlp": 0.01042615, + "balance_loss_clip": 1.02524662, + "balance_loss_mlp": 1.04940438, + "epoch": 0.18259431835262288, + "flos": 20149764456960.0, + "grad_norm": 2.216717642760365, + "language_loss": 0.79836094, + "learning_rate": 3.760671412463617e-06, + "loss": 0.82021284, + "num_input_tokens_seen": 65643515, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3037, + "time_per_iteration": 2.4591338634490967 + }, + { + "auxiliary_loss_clip": 0.01149617, + "auxiliary_loss_mlp": 0.01047298, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.05208671, + "epoch": 0.18265444160529085, + "flos": 16981653916800.0, + "grad_norm": 2.68131613553598, + "language_loss": 0.79450762, + "learning_rate": 3.7604866374732246e-06, + "loss": 0.81647676, + "num_input_tokens_seen": 65658155, + "router_z_loss_clip": 0.20507812, + "router_z_loss_mlp": 0.9765625, + "step": 3038, + "time_per_iteration": 2.440664768218994 + }, + { + "auxiliary_loss_clip": 0.0114731, + "auxiliary_loss_mlp": 0.01048244, + "balance_loss_clip": 1.03069699, + "balance_loss_mlp": 1.05140162, + "epoch": 0.1827145648579588, + "flos": 34423250509440.0, + "grad_norm": 2.3213350225315748, + "language_loss": 0.67311364, + "learning_rate": 3.7603017957250023e-06, + "loss": 0.69506919, + "num_input_tokens_seen": 65679310, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3039, + "time_per_iteration": 2.573272466659546 + }, + { + "auxiliary_loss_clip": 0.01148566, + "auxiliary_loss_mlp": 0.01051856, + "balance_loss_clip": 1.03323567, + "balance_loss_mlp": 1.05112875, + "epoch": 0.18277468811062678, + "flos": 53287017264000.0, + "grad_norm": 1.9125298187860031, + "language_loss": 0.73687911, + "learning_rate": 3.7601168872259593e-06, + "loss": 0.75888336, + "num_input_tokens_seen": 65705235, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3040, + "time_per_iteration": 2.771242618560791 + }, + { + "auxiliary_loss_clip": 0.0114474, + "auxiliary_loss_mlp": 0.01042775, + "balance_loss_clip": 1.02418995, + "balance_loss_mlp": 1.04849768, + "epoch": 0.18283481136329474, + "flos": 31650659602560.0, + "grad_norm": 1.8780343880464916, + "language_loss": 0.60176188, + "learning_rate": 3.7599319119831075e-06, + "loss": 0.62363702, + "num_input_tokens_seen": 65727575, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3041, + "time_per_iteration": 2.670691967010498 + }, + { + "auxiliary_loss_clip": 0.01146425, + "auxiliary_loss_mlp": 0.01055713, + "balance_loss_clip": 1.03756928, + "balance_loss_mlp": 1.05012786, + "epoch": 0.18289493461596273, + "flos": 53137664513280.0, + "grad_norm": 1.7488247873746179, + "language_loss": 0.60361505, + "learning_rate": 3.7597468700034616e-06, + "loss": 0.6256364, + "num_input_tokens_seen": 65751370, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3042, + "time_per_iteration": 2.7942960262298584 + }, + { + "auxiliary_loss_clip": 0.01144442, + "auxiliary_loss_mlp": 0.0104919, + "balance_loss_clip": 1.03143954, + "balance_loss_mlp": 1.04945385, + "epoch": 0.1829550578686307, + "flos": 25589369220480.0, + "grad_norm": 1.6831322617730042, + "language_loss": 0.8769263, + "learning_rate": 3.7595617612940374e-06, + "loss": 0.8988626, + "num_input_tokens_seen": 65771040, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94921875, + "step": 3043, + "time_per_iteration": 2.524871587753296 + }, + { + "auxiliary_loss_clip": 0.01149409, + "auxiliary_loss_mlp": 0.01049211, + "balance_loss_clip": 1.03005409, + "balance_loss_mlp": 1.05107832, + "epoch": 0.18301518112129866, + "flos": 22601422321920.0, + "grad_norm": 1.9464603469819268, + "language_loss": 0.707008, + "learning_rate": 3.7593765858618552e-06, + "loss": 0.72899425, + "num_input_tokens_seen": 65789345, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3044, + "time_per_iteration": 2.483335018157959 + }, + { + "auxiliary_loss_clip": 0.01150484, + "auxiliary_loss_mlp": 0.01055406, + "balance_loss_clip": 1.03552175, + "balance_loss_mlp": 1.04929996, + "epoch": 0.18307530437396663, + "flos": 34020799551360.0, + "grad_norm": 2.0901220952627497, + "language_loss": 0.64385587, + "learning_rate": 3.7591913437139365e-06, + "loss": 0.66591471, + "num_input_tokens_seen": 65810990, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 1.015625, + "step": 3045, + "time_per_iteration": 2.592855453491211 + }, + { + "auxiliary_loss_clip": 0.01145205, + "auxiliary_loss_mlp": 0.01054969, + "balance_loss_clip": 1.0368377, + "balance_loss_mlp": 1.04977548, + "epoch": 0.1831354276266346, + "flos": 21279765392640.0, + "grad_norm": 2.998731206361719, + "language_loss": 0.79165137, + "learning_rate": 3.7590060348573066e-06, + "loss": 0.81365317, + "num_input_tokens_seen": 65827230, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3046, + "time_per_iteration": 2.5034587383270264 + }, + { + "auxiliary_loss_clip": 0.01146985, + "auxiliary_loss_mlp": 0.01048107, + "balance_loss_clip": 1.02908087, + "balance_loss_mlp": 1.04764223, + "epoch": 0.18319555087930256, + "flos": 21032952065280.0, + "grad_norm": 3.3529268295267016, + "language_loss": 0.78991181, + "learning_rate": 3.7588206592989903e-06, + "loss": 0.81186271, + "num_input_tokens_seen": 65845900, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9921875, + "step": 3047, + "time_per_iteration": 2.5140535831451416 + }, + { + "auxiliary_loss_clip": 0.01145799, + "auxiliary_loss_mlp": 0.01046901, + "balance_loss_clip": 1.02923381, + "balance_loss_mlp": 1.05111742, + "epoch": 0.18325567413197055, + "flos": 34382958428160.0, + "grad_norm": 1.5613113238500957, + "language_loss": 0.80888635, + "learning_rate": 3.7586352170460194e-06, + "loss": 0.83081341, + "num_input_tokens_seen": 65868730, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3048, + "time_per_iteration": 2.5848963260650635 + }, + { + "auxiliary_loss_clip": 0.01148052, + "auxiliary_loss_mlp": 0.01042108, + "balance_loss_clip": 1.02283192, + "balance_loss_mlp": 1.0502528, + "epoch": 0.18331579738463852, + "flos": 20558464381440.0, + "grad_norm": 1.8161394933049422, + "language_loss": 0.86232805, + "learning_rate": 3.758449708105424e-06, + "loss": 0.88422966, + "num_input_tokens_seen": 65888420, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9765625, + "step": 3049, + "time_per_iteration": 2.4665114879608154 + }, + { + "auxiliary_loss_clip": 0.01154476, + "auxiliary_loss_mlp": 0.01043247, + "balance_loss_clip": 1.02364874, + "balance_loss_mlp": 1.05159521, + "epoch": 0.18337592063730648, + "flos": 19607872901760.0, + "grad_norm": 2.2703740748038066, + "language_loss": 0.77160966, + "learning_rate": 3.75826413248424e-06, + "loss": 0.79358685, + "num_input_tokens_seen": 65905840, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.03125, + "step": 3050, + "time_per_iteration": 2.4525256156921387 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01045385, + "balance_loss_clip": 1.02683592, + "balance_loss_mlp": 1.04867804, + "epoch": 0.18343604388997445, + "flos": 20850885002880.0, + "grad_norm": 2.010292972394078, + "language_loss": 0.99174476, + "learning_rate": 3.7580784901895035e-06, + "loss": 1.0136615, + "num_input_tokens_seen": 65922845, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3051, + "time_per_iteration": 2.4559926986694336 + }, + { + "auxiliary_loss_clip": 0.01145751, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02096963, + "balance_loss_mlp": 1.050529, + "epoch": 0.1834961671426424, + "flos": 24394370624640.0, + "grad_norm": 1.5992624239842805, + "language_loss": 0.86153144, + "learning_rate": 3.7578927812282542e-06, + "loss": 0.8833797, + "num_input_tokens_seen": 65945555, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3052, + "time_per_iteration": 2.559396505355835 + }, + { + "auxiliary_loss_clip": 0.01145626, + "auxiliary_loss_mlp": 0.0105059, + "balance_loss_clip": 1.03267264, + "balance_loss_mlp": 1.04985499, + "epoch": 0.18355629039531038, + "flos": 21251612108160.0, + "grad_norm": 1.8182752776897229, + "language_loss": 0.73004341, + "learning_rate": 3.7577070056075356e-06, + "loss": 0.75200558, + "num_input_tokens_seen": 65963965, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3053, + "time_per_iteration": 2.4481074810028076 + }, + { + "auxiliary_loss_clip": 0.01150169, + "auxiliary_loss_mlp": 0.01049972, + "balance_loss_clip": 1.03051662, + "balance_loss_mlp": 1.05208337, + "epoch": 0.18361641364797834, + "flos": 28656499651200.0, + "grad_norm": 1.6467304764216655, + "language_loss": 0.62212563, + "learning_rate": 3.7575211633343902e-06, + "loss": 0.64412701, + "num_input_tokens_seen": 65985965, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.98046875, + "step": 3054, + "time_per_iteration": 2.5701377391815186 + }, + { + "auxiliary_loss_clip": 0.01146023, + "auxiliary_loss_mlp": 0.01042771, + "balance_loss_clip": 1.02510393, + "balance_loss_mlp": 1.04962707, + "epoch": 0.18367653690064634, + "flos": 20918827578240.0, + "grad_norm": 2.2210920593094325, + "language_loss": 0.78501689, + "learning_rate": 3.7573352544158663e-06, + "loss": 0.80690485, + "num_input_tokens_seen": 66005645, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3055, + "time_per_iteration": 2.47776198387146 + }, + { + "auxiliary_loss_clip": 0.0114232, + "auxiliary_loss_mlp": 0.01053937, + "balance_loss_clip": 1.03622222, + "balance_loss_mlp": 1.04779387, + "epoch": 0.1837366601533143, + "flos": 28765596234240.0, + "grad_norm": 1.894881128028073, + "language_loss": 0.70218527, + "learning_rate": 3.757149278859014e-06, + "loss": 0.72414786, + "num_input_tokens_seen": 66025675, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3056, + "time_per_iteration": 2.541361093521118 + }, + { + "auxiliary_loss_clip": 0.0114918, + "auxiliary_loss_mlp": 0.01043721, + "balance_loss_clip": 1.02612543, + "balance_loss_mlp": 1.05066419, + "epoch": 0.18379678340598227, + "flos": 21251432540160.0, + "grad_norm": 1.4932354373853338, + "language_loss": 0.8028152, + "learning_rate": 3.7569632366708842e-06, + "loss": 0.82474422, + "num_input_tokens_seen": 66046125, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.984375, + "step": 3057, + "time_per_iteration": 2.4718995094299316 + }, + { + "auxiliary_loss_clip": 0.0115229, + "auxiliary_loss_mlp": 0.01049373, + "balance_loss_clip": 1.02864265, + "balance_loss_mlp": 1.04847729, + "epoch": 0.18385690665865023, + "flos": 20449619193600.0, + "grad_norm": 2.0112890674266914, + "language_loss": 0.82289785, + "learning_rate": 3.756777127858533e-06, + "loss": 0.84491444, + "num_input_tokens_seen": 66064375, + "router_z_loss_clip": 0.20703125, + "router_z_loss_mlp": 1.0390625, + "step": 3058, + "time_per_iteration": 2.4653379917144775 + }, + { + "auxiliary_loss_clip": 0.01148012, + "auxiliary_loss_mlp": 0.01046535, + "balance_loss_clip": 1.02818882, + "balance_loss_mlp": 1.04893029, + "epoch": 0.1839170299113182, + "flos": 26140562398080.0, + "grad_norm": 2.205773819593527, + "language_loss": 0.85894352, + "learning_rate": 3.756590952429017e-06, + "loss": 0.88088906, + "num_input_tokens_seen": 66084590, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9921875, + "step": 3059, + "time_per_iteration": 4.0151047706604 + }, + { + "auxiliary_loss_clip": 0.01145706, + "auxiliary_loss_mlp": 0.01045338, + "balance_loss_clip": 1.02724195, + "balance_loss_mlp": 1.04931092, + "epoch": 0.18397715316398616, + "flos": 31758032332800.0, + "grad_norm": 1.70952354928268, + "language_loss": 0.72799402, + "learning_rate": 3.756404710389396e-06, + "loss": 0.74990445, + "num_input_tokens_seen": 66107105, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3060, + "time_per_iteration": 5.466471195220947 + }, + { + "auxiliary_loss_clip": 0.01151276, + "auxiliary_loss_mlp": 0.01042783, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.05253565, + "epoch": 0.18403727641665413, + "flos": 24611989173120.0, + "grad_norm": 1.7373746338425942, + "language_loss": 0.72797298, + "learning_rate": 3.7562184017467323e-06, + "loss": 0.74991357, + "num_input_tokens_seen": 66129295, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.98828125, + "step": 3061, + "time_per_iteration": 2.5244035720825195 + }, + { + "auxiliary_loss_clip": 0.01146537, + "auxiliary_loss_mlp": 0.0104557, + "balance_loss_clip": 1.02697313, + "balance_loss_mlp": 1.05087519, + "epoch": 0.18409739966932212, + "flos": 23439900476160.0, + "grad_norm": 1.8714044833418495, + "language_loss": 0.81622046, + "learning_rate": 3.7560320265080906e-06, + "loss": 0.83814156, + "num_input_tokens_seen": 66146910, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3062, + "time_per_iteration": 2.4767649173736572 + }, + { + "auxiliary_loss_clip": 0.01154667, + "auxiliary_loss_mlp": 0.01045371, + "balance_loss_clip": 1.02681041, + "balance_loss_mlp": 1.05394542, + "epoch": 0.18415752292199009, + "flos": 21872112577920.0, + "grad_norm": 1.7582970194369052, + "language_loss": 0.72718614, + "learning_rate": 3.7558455846805383e-06, + "loss": 0.74918652, + "num_input_tokens_seen": 66165370, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 1.0078125, + "step": 3063, + "time_per_iteration": 2.5082144737243652 + }, + { + "auxiliary_loss_clip": 0.01146453, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02516627, + "balance_loss_mlp": 1.04935837, + "epoch": 0.18421764617465805, + "flos": 25410678036480.0, + "grad_norm": 2.1216519555610183, + "language_loss": 0.65496099, + "learning_rate": 3.7556590762711463e-06, + "loss": 0.6768434, + "num_input_tokens_seen": 66186210, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3064, + "time_per_iteration": 2.523141622543335 + }, + { + "auxiliary_loss_clip": 0.01149329, + "auxiliary_loss_mlp": 0.01048992, + "balance_loss_clip": 1.03081298, + "balance_loss_mlp": 1.05274165, + "epoch": 0.18427776942732602, + "flos": 27198131558400.0, + "grad_norm": 2.6163412642887947, + "language_loss": 0.68768656, + "learning_rate": 3.7554725012869853e-06, + "loss": 0.70966971, + "num_input_tokens_seen": 66204800, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3065, + "time_per_iteration": 2.5244293212890625 + }, + { + "auxiliary_loss_clip": 0.01151353, + "auxiliary_loss_mlp": 0.01047403, + "balance_loss_clip": 1.02819824, + "balance_loss_mlp": 1.05120087, + "epoch": 0.18433789267999398, + "flos": 27852351920640.0, + "grad_norm": 4.932084281869228, + "language_loss": 0.72561431, + "learning_rate": 3.7552858597351318e-06, + "loss": 0.74760187, + "num_input_tokens_seen": 66222195, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3066, + "time_per_iteration": 2.5428919792175293 + }, + { + "auxiliary_loss_clip": 0.01148706, + "auxiliary_loss_mlp": 0.01043732, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.05074954, + "epoch": 0.18439801593266195, + "flos": 17856940533120.0, + "grad_norm": 1.9825677919996112, + "language_loss": 0.82477474, + "learning_rate": 3.7550991516226622e-06, + "loss": 0.84669906, + "num_input_tokens_seen": 66239505, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.98046875, + "step": 3067, + "time_per_iteration": 2.4500880241394043 + }, + { + "auxiliary_loss_clip": 0.01048916, + "auxiliary_loss_mlp": 0.01007477, + "balance_loss_clip": 1.00535476, + "balance_loss_mlp": 1.01668859, + "epoch": 0.18445813918532994, + "flos": 56389522590720.0, + "grad_norm": 0.7924805733675573, + "language_loss": 0.59706604, + "learning_rate": 3.754912376956657e-06, + "loss": 0.61763, + "num_input_tokens_seen": 66295695, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32226562, + "step": 3068, + "time_per_iteration": 2.9375104904174805 + }, + { + "auxiliary_loss_clip": 0.01153283, + "auxiliary_loss_mlp": 0.01040124, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.05714762, + "epoch": 0.1845182624379979, + "flos": 20957180325120.0, + "grad_norm": 1.8708990955689164, + "language_loss": 0.76227212, + "learning_rate": 3.7547255357441987e-06, + "loss": 0.78420615, + "num_input_tokens_seen": 66315315, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3069, + "time_per_iteration": 2.462446451187134 + }, + { + "auxiliary_loss_clip": 0.01151625, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.02233863, + "balance_loss_mlp": 1.05299067, + "epoch": 0.18457838569066587, + "flos": 20485170679680.0, + "grad_norm": 1.7428293735192475, + "language_loss": 0.84803855, + "learning_rate": 3.7545386279923718e-06, + "loss": 0.86996043, + "num_input_tokens_seen": 66333675, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3070, + "time_per_iteration": 2.4887194633483887 + }, + { + "auxiliary_loss_clip": 0.01152145, + "auxiliary_loss_mlp": 0.01042624, + "balance_loss_clip": 1.02462363, + "balance_loss_mlp": 1.05298758, + "epoch": 0.18463850894333383, + "flos": 25010022758400.0, + "grad_norm": 1.9722863584187038, + "language_loss": 0.77370453, + "learning_rate": 3.754351653708265e-06, + "loss": 0.79565221, + "num_input_tokens_seen": 66354075, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.98828125, + "step": 3071, + "time_per_iteration": 2.482213258743286 + }, + { + "auxiliary_loss_clip": 0.01152228, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03042173, + "balance_loss_mlp": 1.05342758, + "epoch": 0.1846986321960018, + "flos": 16800628348800.0, + "grad_norm": 2.705053980849468, + "language_loss": 0.77691031, + "learning_rate": 3.7541646128989674e-06, + "loss": 0.79891801, + "num_input_tokens_seen": 66372520, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9921875, + "step": 3072, + "time_per_iteration": 2.466387987136841 + }, + { + "auxiliary_loss_clip": 0.01150023, + "auxiliary_loss_mlp": 0.01042519, + "balance_loss_clip": 1.02339804, + "balance_loss_mlp": 1.05013216, + "epoch": 0.18475875544866976, + "flos": 20814327936000.0, + "grad_norm": 1.8173375196390826, + "language_loss": 0.8607235, + "learning_rate": 3.7539775055715715e-06, + "loss": 0.88264889, + "num_input_tokens_seen": 66390745, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3073, + "time_per_iteration": 2.4510810375213623 + }, + { + "auxiliary_loss_clip": 0.01151045, + "auxiliary_loss_mlp": 0.01045152, + "balance_loss_clip": 1.02851045, + "balance_loss_mlp": 1.05339348, + "epoch": 0.18481887870133773, + "flos": 22601422321920.0, + "grad_norm": 2.2059027996031877, + "language_loss": 0.92005521, + "learning_rate": 3.7537903317331732e-06, + "loss": 0.9420172, + "num_input_tokens_seen": 66410525, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.97265625, + "step": 3074, + "time_per_iteration": 2.473710298538208 + }, + { + "auxiliary_loss_clip": 0.01146992, + "auxiliary_loss_mlp": 0.01044255, + "balance_loss_clip": 1.02490735, + "balance_loss_mlp": 1.05028176, + "epoch": 0.18487900195400572, + "flos": 29458815788160.0, + "grad_norm": 1.9913742546968862, + "language_loss": 0.65041798, + "learning_rate": 3.75360309139087e-06, + "loss": 0.67233044, + "num_input_tokens_seen": 66432535, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3075, + "time_per_iteration": 2.533724784851074 + }, + { + "auxiliary_loss_clip": 0.01149493, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02578402, + "balance_loss_mlp": 1.053177, + "epoch": 0.1849391252066737, + "flos": 20628777254400.0, + "grad_norm": 1.709240712607824, + "language_loss": 0.72323918, + "learning_rate": 3.753415784551761e-06, + "loss": 0.74516779, + "num_input_tokens_seen": 66450620, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3076, + "time_per_iteration": 2.4544899463653564 + }, + { + "auxiliary_loss_clip": 0.01153692, + "auxiliary_loss_mlp": 0.01046042, + "balance_loss_clip": 1.0280292, + "balance_loss_mlp": 1.05341136, + "epoch": 0.18499924845934165, + "flos": 14428549065600.0, + "grad_norm": 2.4900368363969854, + "language_loss": 0.80860448, + "learning_rate": 3.7532284112229507e-06, + "loss": 0.83060181, + "num_input_tokens_seen": 66467865, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 1.0, + "step": 3077, + "time_per_iteration": 2.45137882232666 + }, + { + "auxiliary_loss_clip": 0.01146798, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02816749, + "balance_loss_mlp": 1.05103469, + "epoch": 0.18505937171200962, + "flos": 23727652329600.0, + "grad_norm": 1.7908770900539794, + "language_loss": 0.78764129, + "learning_rate": 3.7530409714115424e-06, + "loss": 0.8095665, + "num_input_tokens_seen": 66486245, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.95703125, + "step": 3078, + "time_per_iteration": 2.477393865585327 + }, + { + "auxiliary_loss_clip": 0.01147874, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02714717, + "balance_loss_mlp": 1.05057585, + "epoch": 0.18511949496467758, + "flos": 25957489754880.0, + "grad_norm": 1.8549646444276375, + "language_loss": 0.7758081, + "learning_rate": 3.7528534651246453e-06, + "loss": 0.79773009, + "num_input_tokens_seen": 66506510, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9765625, + "step": 3079, + "time_per_iteration": 2.5069448947906494 + }, + { + "auxiliary_loss_clip": 0.01143899, + "auxiliary_loss_mlp": 0.0104323, + "balance_loss_clip": 1.02581406, + "balance_loss_mlp": 1.04723024, + "epoch": 0.18517961821734555, + "flos": 42413553912960.0, + "grad_norm": 2.3452692712375893, + "language_loss": 0.81668431, + "learning_rate": 3.752665892369369e-06, + "loss": 0.83855557, + "num_input_tokens_seen": 66530960, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3080, + "time_per_iteration": 2.688206911087036 + }, + { + "auxiliary_loss_clip": 0.01149652, + "auxiliary_loss_mlp": 0.01046927, + "balance_loss_clip": 1.02812803, + "balance_loss_mlp": 1.05079699, + "epoch": 0.18523974147001354, + "flos": 24097568544000.0, + "grad_norm": 2.0276132956863764, + "language_loss": 0.7435087, + "learning_rate": 3.7524782531528266e-06, + "loss": 0.7654745, + "num_input_tokens_seen": 66550275, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3081, + "time_per_iteration": 2.5003983974456787 + }, + { + "auxiliary_loss_clip": 0.01151656, + "auxiliary_loss_mlp": 0.01050271, + "balance_loss_clip": 1.03124547, + "balance_loss_mlp": 1.05527234, + "epoch": 0.1852998647226815, + "flos": 27375278457600.0, + "grad_norm": 2.070281784994394, + "language_loss": 0.71532816, + "learning_rate": 3.7522905474821334e-06, + "loss": 0.73734742, + "num_input_tokens_seen": 66569040, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9609375, + "step": 3082, + "time_per_iteration": 2.514004707336426 + }, + { + "auxiliary_loss_clip": 0.011545, + "auxiliary_loss_mlp": 0.01050471, + "balance_loss_clip": 1.03155267, + "balance_loss_mlp": 1.05488813, + "epoch": 0.18535998797534947, + "flos": 18332757020160.0, + "grad_norm": 1.869200996989063, + "language_loss": 0.69338834, + "learning_rate": 3.752102775364407e-06, + "loss": 0.71543807, + "num_input_tokens_seen": 66587775, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3083, + "time_per_iteration": 2.446418523788452 + }, + { + "auxiliary_loss_clip": 0.0114679, + "auxiliary_loss_mlp": 0.01049301, + "balance_loss_clip": 1.03187287, + "balance_loss_mlp": 1.05216169, + "epoch": 0.18542011122801744, + "flos": 37845859887360.0, + "grad_norm": 4.022344342016001, + "language_loss": 0.68854296, + "learning_rate": 3.751914936806767e-06, + "loss": 0.71050388, + "num_input_tokens_seen": 66610800, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3084, + "time_per_iteration": 2.5964090824127197 + }, + { + "auxiliary_loss_clip": 0.01145496, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02541232, + "balance_loss_mlp": 1.04961908, + "epoch": 0.1854802344806854, + "flos": 25186128163200.0, + "grad_norm": 1.5883609883793584, + "language_loss": 0.77831411, + "learning_rate": 3.7517270318163377e-06, + "loss": 0.80020249, + "num_input_tokens_seen": 66630960, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3085, + "time_per_iteration": 2.500401020050049 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01053452, + "balance_loss_clip": 1.03557014, + "balance_loss_mlp": 1.04887915, + "epoch": 0.18554035773335337, + "flos": 26684788337280.0, + "grad_norm": 1.8880953488015286, + "language_loss": 0.73488086, + "learning_rate": 3.751539060400244e-06, + "loss": 0.7568658, + "num_input_tokens_seen": 66650585, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3086, + "time_per_iteration": 2.5121798515319824 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01048198, + "balance_loss_clip": 1.02949429, + "balance_loss_mlp": 1.05223882, + "epoch": 0.18560048098602133, + "flos": 22346887570560.0, + "grad_norm": 4.074676999617497, + "language_loss": 0.70087367, + "learning_rate": 3.7513510225656132e-06, + "loss": 0.72282737, + "num_input_tokens_seen": 66670045, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.953125, + "step": 3087, + "time_per_iteration": 2.469980001449585 + }, + { + "auxiliary_loss_clip": 0.01149215, + "auxiliary_loss_mlp": 0.01048668, + "balance_loss_clip": 1.02928519, + "balance_loss_mlp": 1.05118215, + "epoch": 0.18566060423868933, + "flos": 17748526308480.0, + "grad_norm": 2.299065028063824, + "language_loss": 0.72731185, + "learning_rate": 3.7511629183195764e-06, + "loss": 0.74929065, + "num_input_tokens_seen": 66688790, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3088, + "time_per_iteration": 2.4569249153137207 + }, + { + "auxiliary_loss_clip": 0.01144422, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02733839, + "balance_loss_mlp": 1.05015588, + "epoch": 0.1857207274913573, + "flos": 24677274142080.0, + "grad_norm": 2.023411505730453, + "language_loss": 0.91849768, + "learning_rate": 3.7509747476692663e-06, + "loss": 0.94039273, + "num_input_tokens_seen": 66708090, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.94140625, + "step": 3089, + "time_per_iteration": 2.5086276531219482 + }, + { + "auxiliary_loss_clip": 0.01146464, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.02573323, + "balance_loss_mlp": 1.05124271, + "epoch": 0.18578085074402526, + "flos": 28147825198080.0, + "grad_norm": 2.7535733421879174, + "language_loss": 0.57406759, + "learning_rate": 3.7507865106218176e-06, + "loss": 0.59596992, + "num_input_tokens_seen": 66727320, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.953125, + "step": 3090, + "time_per_iteration": 2.544934034347534 + }, + { + "auxiliary_loss_clip": 0.011443, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03133333, + "balance_loss_mlp": 1.04945779, + "epoch": 0.18584097399669322, + "flos": 23951878980480.0, + "grad_norm": 1.9526543189913628, + "language_loss": 0.82229531, + "learning_rate": 3.7505982071843695e-06, + "loss": 0.84423304, + "num_input_tokens_seen": 66747505, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3091, + "time_per_iteration": 2.5339536666870117 + }, + { + "auxiliary_loss_clip": 0.01149127, + "auxiliary_loss_mlp": 0.01049478, + "balance_loss_clip": 1.03165662, + "balance_loss_mlp": 1.05212235, + "epoch": 0.18590109724936119, + "flos": 17201678676480.0, + "grad_norm": 2.0588011246991127, + "language_loss": 0.83561456, + "learning_rate": 3.7504098373640617e-06, + "loss": 0.85760063, + "num_input_tokens_seen": 66766425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3092, + "time_per_iteration": 2.5091474056243896 + }, + { + "auxiliary_loss_clip": 0.01151013, + "auxiliary_loss_mlp": 0.01044436, + "balance_loss_clip": 1.02562487, + "balance_loss_mlp": 1.05010569, + "epoch": 0.18596122050202915, + "flos": 17234644383360.0, + "grad_norm": 4.142827775979207, + "language_loss": 0.93487823, + "learning_rate": 3.750221401168038e-06, + "loss": 0.95683277, + "num_input_tokens_seen": 66781130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 1.0078125, + "step": 3093, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01146588, + "auxiliary_loss_mlp": 0.01038182, + "balance_loss_clip": 1.02115917, + "balance_loss_mlp": 1.05090082, + "epoch": 0.18602134375469712, + "flos": 19020733188480.0, + "grad_norm": 2.060946690404802, + "language_loss": 0.77380008, + "learning_rate": 3.750032898603443e-06, + "loss": 0.79564774, + "num_input_tokens_seen": 66797535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3094, + "time_per_iteration": 2.4520375728607178 + }, + { + "auxiliary_loss_clip": 0.01147212, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03098452, + "balance_loss_mlp": 1.05099964, + "epoch": 0.1860814670073651, + "flos": 50950094417280.0, + "grad_norm": 1.6535165555915046, + "language_loss": 0.69985378, + "learning_rate": 3.749844329677425e-06, + "loss": 0.72180283, + "num_input_tokens_seen": 66821720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3095, + "time_per_iteration": 2.7395834922790527 + }, + { + "auxiliary_loss_clip": 0.01149572, + "auxiliary_loss_mlp": 0.01045107, + "balance_loss_clip": 1.02614033, + "balance_loss_mlp": 1.05169249, + "epoch": 0.18614159026003307, + "flos": 19390972625280.0, + "grad_norm": 1.9053555001005595, + "language_loss": 0.8077082, + "learning_rate": 3.749655694397135e-06, + "loss": 0.82965505, + "num_input_tokens_seen": 66839060, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.98046875, + "step": 3096, + "time_per_iteration": 2.4506232738494873 + }, + { + "auxiliary_loss_clip": 0.01147695, + "auxiliary_loss_mlp": 0.01047208, + "balance_loss_clip": 1.02883816, + "balance_loss_mlp": 1.05086875, + "epoch": 0.18620171351270104, + "flos": 21798782962560.0, + "grad_norm": 2.061308652340225, + "language_loss": 0.75101036, + "learning_rate": 3.7494669927697255e-06, + "loss": 0.77295941, + "num_input_tokens_seen": 66857760, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3097, + "time_per_iteration": 2.46639347076416 + }, + { + "auxiliary_loss_clip": 0.01147181, + "auxiliary_loss_mlp": 0.01045993, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.05196047, + "epoch": 0.186261836765369, + "flos": 16362877299840.0, + "grad_norm": 2.5365100966912664, + "language_loss": 0.66038394, + "learning_rate": 3.749278224802352e-06, + "loss": 0.68231571, + "num_input_tokens_seen": 66876460, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3098, + "time_per_iteration": 2.46763014793396 + }, + { + "auxiliary_loss_clip": 0.01148744, + "auxiliary_loss_mlp": 0.01048857, + "balance_loss_clip": 1.02973545, + "balance_loss_mlp": 1.04978585, + "epoch": 0.18632196001803697, + "flos": 23370054480000.0, + "grad_norm": 1.6025275160282182, + "language_loss": 0.69907904, + "learning_rate": 3.7490893905021733e-06, + "loss": 0.72105503, + "num_input_tokens_seen": 66897960, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.98828125, + "step": 3099, + "time_per_iteration": 2.469336748123169 + }, + { + "auxiliary_loss_clip": 0.01147788, + "auxiliary_loss_mlp": 0.01052362, + "balance_loss_clip": 1.03290749, + "balance_loss_mlp": 1.04985309, + "epoch": 0.18638208327070493, + "flos": 22492002516480.0, + "grad_norm": 1.4888180158498334, + "language_loss": 0.71623552, + "learning_rate": 3.7489004898763494e-06, + "loss": 0.73823702, + "num_input_tokens_seen": 66917675, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9765625, + "step": 3100, + "time_per_iteration": 2.510803699493408 + }, + { + "auxiliary_loss_clip": 0.01150621, + "auxiliary_loss_mlp": 0.01050424, + "balance_loss_clip": 1.03104091, + "balance_loss_mlp": 1.05147338, + "epoch": 0.18644220652337293, + "flos": 29165245931520.0, + "grad_norm": 2.2181859131844757, + "language_loss": 0.80163074, + "learning_rate": 3.7487115229320444e-06, + "loss": 0.82364118, + "num_input_tokens_seen": 66936000, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9921875, + "step": 3101, + "time_per_iteration": 4.007607936859131 + }, + { + "auxiliary_loss_clip": 0.0114449, + "auxiliary_loss_mlp": 0.01043434, + "balance_loss_clip": 1.02606487, + "balance_loss_mlp": 1.05100489, + "epoch": 0.1865023297760409, + "flos": 24243796811520.0, + "grad_norm": 2.082156961368248, + "language_loss": 0.76803768, + "learning_rate": 3.7485224896764222e-06, + "loss": 0.78991693, + "num_input_tokens_seen": 66955700, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3102, + "time_per_iteration": 5.438685894012451 + }, + { + "auxiliary_loss_clip": 0.0114717, + "auxiliary_loss_mlp": 0.01041158, + "balance_loss_clip": 1.02322865, + "balance_loss_mlp": 1.04973269, + "epoch": 0.18656245302870886, + "flos": 19128716449920.0, + "grad_norm": 2.5595226686006565, + "language_loss": 0.76962835, + "learning_rate": 3.7483333901166525e-06, + "loss": 0.79151165, + "num_input_tokens_seen": 66972815, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3103, + "time_per_iteration": 2.4742202758789062 + }, + { + "auxiliary_loss_clip": 0.0114783, + "auxiliary_loss_mlp": 0.0104302, + "balance_loss_clip": 1.02540123, + "balance_loss_mlp": 1.05014729, + "epoch": 0.18662257628137682, + "flos": 17786088956160.0, + "grad_norm": 1.966347666558745, + "language_loss": 0.79074025, + "learning_rate": 3.7481442242599054e-06, + "loss": 0.81264877, + "num_input_tokens_seen": 66992280, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3104, + "time_per_iteration": 2.4873924255371094 + }, + { + "auxiliary_loss_clip": 0.01148715, + "auxiliary_loss_mlp": 0.01043939, + "balance_loss_clip": 1.02653468, + "balance_loss_mlp": 1.05237842, + "epoch": 0.1866826995340448, + "flos": 24024382583040.0, + "grad_norm": 1.943867006204371, + "language_loss": 0.8519029, + "learning_rate": 3.747954992113354e-06, + "loss": 0.87382948, + "num_input_tokens_seen": 67012220, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3105, + "time_per_iteration": 2.488638162612915 + }, + { + "auxiliary_loss_clip": 0.01152184, + "auxiliary_loss_mlp": 0.01047951, + "balance_loss_clip": 1.02872288, + "balance_loss_mlp": 1.0491997, + "epoch": 0.18674282278671275, + "flos": 26141244756480.0, + "grad_norm": 1.7838474228223986, + "language_loss": 0.86952424, + "learning_rate": 3.7477656936841742e-06, + "loss": 0.89152563, + "num_input_tokens_seen": 67032030, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 1.03125, + "step": 3106, + "time_per_iteration": 2.5103402137756348 + }, + { + "auxiliary_loss_clip": 0.0115436, + "auxiliary_loss_mlp": 0.01044282, + "balance_loss_clip": 1.02623367, + "balance_loss_mlp": 1.05296755, + "epoch": 0.18680294603938072, + "flos": 19201938324480.0, + "grad_norm": 1.9680738799082358, + "language_loss": 0.78253353, + "learning_rate": 3.7475763289795445e-06, + "loss": 0.80451989, + "num_input_tokens_seen": 67048920, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 1.015625, + "step": 3107, + "time_per_iteration": 2.44567608833313 + }, + { + "auxiliary_loss_clip": 0.01150298, + "auxiliary_loss_mlp": 0.01051545, + "balance_loss_clip": 1.03179181, + "balance_loss_mlp": 1.05040216, + "epoch": 0.1868630692920487, + "flos": 28544889116160.0, + "grad_norm": 1.9125203241398734, + "language_loss": 0.74114668, + "learning_rate": 3.7473868980066446e-06, + "loss": 0.76316506, + "num_input_tokens_seen": 67068645, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 1.0, + "step": 3108, + "time_per_iteration": 2.5254971981048584 + }, + { + "auxiliary_loss_clip": 0.0115125, + "auxiliary_loss_mlp": 0.01045574, + "balance_loss_clip": 1.02684629, + "balance_loss_mlp": 1.05332017, + "epoch": 0.18692319254471668, + "flos": 17238020261760.0, + "grad_norm": 1.6536820415924105, + "language_loss": 0.74707133, + "learning_rate": 3.747197400772658e-06, + "loss": 0.76903957, + "num_input_tokens_seen": 67087075, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98046875, + "step": 3109, + "time_per_iteration": 2.426945924758911 + }, + { + "auxiliary_loss_clip": 0.01147996, + "auxiliary_loss_mlp": 0.0104719, + "balance_loss_clip": 1.02845001, + "balance_loss_mlp": 1.05078959, + "epoch": 0.18698331579738464, + "flos": 23185186156800.0, + "grad_norm": 1.4293009008592994, + "language_loss": 0.84324062, + "learning_rate": 3.747007837284772e-06, + "loss": 0.86519247, + "num_input_tokens_seen": 67108040, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3110, + "time_per_iteration": 2.4744956493377686 + }, + { + "auxiliary_loss_clip": 0.01154611, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02472341, + "balance_loss_mlp": 1.05598927, + "epoch": 0.1870434390500526, + "flos": 25516721963520.0, + "grad_norm": 1.633662412254079, + "language_loss": 0.84753799, + "learning_rate": 3.7468182075501737e-06, + "loss": 0.86951482, + "num_input_tokens_seen": 67127605, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3111, + "time_per_iteration": 2.4757230281829834 + }, + { + "auxiliary_loss_clip": 0.01149935, + "auxiliary_loss_mlp": 0.01042098, + "balance_loss_clip": 1.02408528, + "balance_loss_mlp": 1.05231404, + "epoch": 0.18710356230272057, + "flos": 19500823393920.0, + "grad_norm": 1.8513735900463348, + "language_loss": 0.76565534, + "learning_rate": 3.7466285115760536e-06, + "loss": 0.78757566, + "num_input_tokens_seen": 67145785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9765625, + "step": 3112, + "time_per_iteration": 2.465552806854248 + }, + { + "auxiliary_loss_clip": 0.01150842, + "auxiliary_loss_mlp": 0.0104724, + "balance_loss_clip": 1.02907228, + "balance_loss_mlp": 1.0516355, + "epoch": 0.18716368555538854, + "flos": 26760847386240.0, + "grad_norm": 1.8580615351340177, + "language_loss": 0.64277315, + "learning_rate": 3.7464387493696046e-06, + "loss": 0.66475397, + "num_input_tokens_seen": 67165930, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3113, + "time_per_iteration": 2.491805076599121 + }, + { + "auxiliary_loss_clip": 0.01155946, + "auxiliary_loss_mlp": 0.01047625, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.0528996, + "epoch": 0.1872238088080565, + "flos": 25189827264000.0, + "grad_norm": 2.238258329288858, + "language_loss": 0.81043601, + "learning_rate": 3.746248920938024e-06, + "loss": 0.83247173, + "num_input_tokens_seen": 67185830, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.03125, + "step": 3114, + "time_per_iteration": 2.4947290420532227 + }, + { + "auxiliary_loss_clip": 0.01153492, + "auxiliary_loss_mlp": 0.01054246, + "balance_loss_clip": 1.03361082, + "balance_loss_mlp": 1.05319226, + "epoch": 0.1872839320607245, + "flos": 24134305178880.0, + "grad_norm": 2.2102322241331467, + "language_loss": 0.57819968, + "learning_rate": 3.74605902628851e-06, + "loss": 0.60027713, + "num_input_tokens_seen": 67206930, + "router_z_loss_clip": 0.20605469, + "router_z_loss_mlp": 1.0, + "step": 3115, + "time_per_iteration": 2.4892075061798096 + }, + { + "auxiliary_loss_clip": 0.01151062, + "auxiliary_loss_mlp": 0.01056747, + "balance_loss_clip": 1.03873444, + "balance_loss_mlp": 1.05434299, + "epoch": 0.18734405531339246, + "flos": 21173793292800.0, + "grad_norm": 1.8141768865365742, + "language_loss": 0.71160758, + "learning_rate": 3.745869065428261e-06, + "loss": 0.73368567, + "num_input_tokens_seen": 67226290, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96484375, + "step": 3116, + "time_per_iteration": 2.4705467224121094 + }, + { + "auxiliary_loss_clip": 0.01142667, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.01751065, + "balance_loss_mlp": 1.04771161, + "epoch": 0.18740417856606043, + "flos": 17237697039360.0, + "grad_norm": 1.8736078530078255, + "language_loss": 0.78733885, + "learning_rate": 3.7456790383644833e-06, + "loss": 0.80912256, + "num_input_tokens_seen": 67244410, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3117, + "time_per_iteration": 2.418527126312256 + }, + { + "auxiliary_loss_clip": 0.01151486, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.02898717, + "balance_loss_mlp": 1.05421317, + "epoch": 0.1874643018187284, + "flos": 32558049999360.0, + "grad_norm": 1.743274375857092, + "language_loss": 0.83945131, + "learning_rate": 3.745488945104381e-06, + "loss": 0.86145031, + "num_input_tokens_seen": 67264470, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.97265625, + "step": 3118, + "time_per_iteration": 2.5691416263580322 + }, + { + "auxiliary_loss_clip": 0.01151442, + "auxiliary_loss_mlp": 0.01049226, + "balance_loss_clip": 1.03109384, + "balance_loss_mlp": 1.0525409, + "epoch": 0.18752442507139636, + "flos": 23258156636160.0, + "grad_norm": 1.7594323212393352, + "language_loss": 0.76151264, + "learning_rate": 3.7452987856551636e-06, + "loss": 0.78351927, + "num_input_tokens_seen": 67284315, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3119, + "time_per_iteration": 2.459648847579956 + }, + { + "auxiliary_loss_clip": 0.01150997, + "auxiliary_loss_mlp": 0.01053702, + "balance_loss_clip": 1.03549838, + "balance_loss_mlp": 1.05181718, + "epoch": 0.18758454832406432, + "flos": 21760933006080.0, + "grad_norm": 1.593515591831454, + "language_loss": 0.81975627, + "learning_rate": 3.7451085600240406e-06, + "loss": 0.84180319, + "num_input_tokens_seen": 67302780, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9921875, + "step": 3120, + "time_per_iteration": 2.478870153427124 + }, + { + "auxiliary_loss_clip": 0.01148213, + "auxiliary_loss_mlp": 0.01043273, + "balance_loss_clip": 1.02526081, + "balance_loss_mlp": 1.05178094, + "epoch": 0.1876446715767323, + "flos": 29570210841600.0, + "grad_norm": 1.7598733043788508, + "language_loss": 0.8513701, + "learning_rate": 3.7449182682182263e-06, + "loss": 0.873285, + "num_input_tokens_seen": 67323405, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3121, + "time_per_iteration": 2.5178277492523193 + }, + { + "auxiliary_loss_clip": 0.0115058, + "auxiliary_loss_mlp": 0.0104859, + "balance_loss_clip": 1.02976704, + "balance_loss_mlp": 1.05281448, + "epoch": 0.18770479482940028, + "flos": 30339992234880.0, + "grad_norm": 2.163070382320244, + "language_loss": 0.70038795, + "learning_rate": 3.744727910244937e-06, + "loss": 0.72237968, + "num_input_tokens_seen": 67345800, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9765625, + "step": 3122, + "time_per_iteration": 2.5523242950439453 + }, + { + "auxiliary_loss_clip": 0.0114817, + "auxiliary_loss_mlp": 0.01045591, + "balance_loss_clip": 1.02524245, + "balance_loss_mlp": 1.05194402, + "epoch": 0.18776491808206824, + "flos": 14465357527680.0, + "grad_norm": 2.352571744641408, + "language_loss": 0.7034744, + "learning_rate": 3.7445374861113905e-06, + "loss": 0.72541201, + "num_input_tokens_seen": 67363575, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 0.9609375, + "step": 3123, + "time_per_iteration": 2.4145569801330566 + }, + { + "auxiliary_loss_clip": 0.01149047, + "auxiliary_loss_mlp": 0.01047451, + "balance_loss_clip": 1.02968884, + "balance_loss_mlp": 1.05238771, + "epoch": 0.1878250413347362, + "flos": 24498547044480.0, + "grad_norm": 2.0330816469172097, + "language_loss": 0.73851109, + "learning_rate": 3.7443469958248066e-06, + "loss": 0.76047611, + "num_input_tokens_seen": 67381765, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96875, + "step": 3124, + "time_per_iteration": 2.497352123260498 + }, + { + "auxiliary_loss_clip": 0.01153024, + "auxiliary_loss_mlp": 0.01050586, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.05275774, + "epoch": 0.18788516458740417, + "flos": 39786185692800.0, + "grad_norm": 1.9990758157966066, + "language_loss": 0.80601895, + "learning_rate": 3.7441564393924106e-06, + "loss": 0.82805508, + "num_input_tokens_seen": 67405000, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0, + "step": 3125, + "time_per_iteration": 2.605851411819458 + }, + { + "auxiliary_loss_clip": 0.01056257, + "auxiliary_loss_mlp": 0.01009024, + "balance_loss_clip": 1.00697315, + "balance_loss_mlp": 1.02352476, + "epoch": 0.18794528784007214, + "flos": 64699250664960.0, + "grad_norm": 0.9386177249275542, + "language_loss": 0.63591504, + "learning_rate": 3.7439658168214273e-06, + "loss": 0.65656781, + "num_input_tokens_seen": 67467140, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.328125, + "step": 3126, + "time_per_iteration": 3.0943961143493652 + }, + { + "auxiliary_loss_clip": 0.01150221, + "auxiliary_loss_mlp": 0.01042071, + "balance_loss_clip": 1.02366543, + "balance_loss_mlp": 1.05439222, + "epoch": 0.1880054110927401, + "flos": 28622061486720.0, + "grad_norm": 1.7984129752859428, + "language_loss": 0.81274688, + "learning_rate": 3.7437751281190857e-06, + "loss": 0.83466977, + "num_input_tokens_seen": 67487980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3127, + "time_per_iteration": 2.535048723220825 + }, + { + "auxiliary_loss_clip": 0.01054784, + "auxiliary_loss_mlp": 0.0100739, + "balance_loss_clip": 1.00543487, + "balance_loss_mlp": 1.02235639, + "epoch": 0.1880655343454081, + "flos": 64488958490880.0, + "grad_norm": 0.7620779230288282, + "language_loss": 0.6191628, + "learning_rate": 3.7435843732926164e-06, + "loss": 0.63978451, + "num_input_tokens_seen": 67552500, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.32421875, + "step": 3128, + "time_per_iteration": 3.1384503841400146 + }, + { + "auxiliary_loss_clip": 0.01153999, + "auxiliary_loss_mlp": 0.0104217, + "balance_loss_clip": 1.02329898, + "balance_loss_mlp": 1.05182266, + "epoch": 0.18812565759807606, + "flos": 32124464928000.0, + "grad_norm": 2.171302965646948, + "language_loss": 0.71237707, + "learning_rate": 3.7433935523492536e-06, + "loss": 0.73433876, + "num_input_tokens_seen": 67573295, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0234375, + "step": 3129, + "time_per_iteration": 2.560601234436035 + }, + { + "auxiliary_loss_clip": 0.01149923, + "auxiliary_loss_mlp": 0.01051091, + "balance_loss_clip": 1.03206491, + "balance_loss_mlp": 1.05224252, + "epoch": 0.18818578085074403, + "flos": 20624539449600.0, + "grad_norm": 2.040923932078449, + "language_loss": 0.85375232, + "learning_rate": 3.7432026652962314e-06, + "loss": 0.87576246, + "num_input_tokens_seen": 67590010, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.9765625, + "step": 3130, + "time_per_iteration": 2.4366040229797363 + }, + { + "auxiliary_loss_clip": 0.0114816, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.02507877, + "balance_loss_mlp": 1.04844868, + "epoch": 0.188245904103412, + "flos": 28840506048000.0, + "grad_norm": 1.9842347260172397, + "language_loss": 0.77227372, + "learning_rate": 3.7430117121407897e-06, + "loss": 0.7942003, + "num_input_tokens_seen": 67611110, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 1.0, + "step": 3131, + "time_per_iteration": 2.503112554550171 + }, + { + "auxiliary_loss_clip": 0.01151098, + "auxiliary_loss_mlp": 0.0104766, + "balance_loss_clip": 1.02800202, + "balance_loss_mlp": 1.05402517, + "epoch": 0.18830602735607996, + "flos": 29420319386880.0, + "grad_norm": 1.8095346888628816, + "language_loss": 0.81244844, + "learning_rate": 3.74282069289017e-06, + "loss": 0.834436, + "num_input_tokens_seen": 67631990, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.96875, + "step": 3132, + "time_per_iteration": 2.5265986919403076 + }, + { + "auxiliary_loss_clip": 0.01154443, + "auxiliary_loss_mlp": 0.01048532, + "balance_loss_clip": 1.02939904, + "balance_loss_mlp": 1.05395401, + "epoch": 0.18836615060874792, + "flos": 28872933050880.0, + "grad_norm": 2.3595669444771135, + "language_loss": 0.79035556, + "learning_rate": 3.742629607551614e-06, + "loss": 0.81238532, + "num_input_tokens_seen": 67650490, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 1.0, + "step": 3133, + "time_per_iteration": 2.500927209854126 + }, + { + "auxiliary_loss_clip": 0.01151251, + "auxiliary_loss_mlp": 0.0105121, + "balance_loss_clip": 1.03224421, + "balance_loss_mlp": 1.05204821, + "epoch": 0.18842627386141592, + "flos": 22601673717120.0, + "grad_norm": 4.024150314183157, + "language_loss": 0.82826144, + "learning_rate": 3.7424384561323698e-06, + "loss": 0.85028601, + "num_input_tokens_seen": 67668860, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9921875, + "step": 3134, + "time_per_iteration": 2.4773380756378174 + }, + { + "auxiliary_loss_clip": 0.01146819, + "auxiliary_loss_mlp": 0.01047119, + "balance_loss_clip": 1.02847505, + "balance_loss_mlp": 1.05027199, + "epoch": 0.18848639711408388, + "flos": 24573600512640.0, + "grad_norm": 1.4735244825899, + "language_loss": 0.82783771, + "learning_rate": 3.742247238639684e-06, + "loss": 0.8497771, + "num_input_tokens_seen": 67690220, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96484375, + "step": 3135, + "time_per_iteration": 2.4957115650177 + }, + { + "auxiliary_loss_clip": 0.01149872, + "auxiliary_loss_mlp": 0.01052674, + "balance_loss_clip": 1.03343356, + "balance_loss_mlp": 1.0503304, + "epoch": 0.18854652036675185, + "flos": 34166920078080.0, + "grad_norm": 1.8513380433423674, + "language_loss": 0.79031271, + "learning_rate": 3.7420559550808083e-06, + "loss": 0.81233823, + "num_input_tokens_seen": 67709820, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9921875, + "step": 3136, + "time_per_iteration": 2.556800127029419 + }, + { + "auxiliary_loss_clip": 0.01150763, + "auxiliary_loss_mlp": 0.01048681, + "balance_loss_clip": 1.02947617, + "balance_loss_mlp": 1.05327463, + "epoch": 0.1886066436194198, + "flos": 24200236592640.0, + "grad_norm": 1.9366242888645147, + "language_loss": 0.81049621, + "learning_rate": 3.741864605462996e-06, + "loss": 0.83249068, + "num_input_tokens_seen": 67729490, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9765625, + "step": 3137, + "time_per_iteration": 2.487513542175293 + }, + { + "auxiliary_loss_clip": 0.01151307, + "auxiliary_loss_mlp": 0.01057024, + "balance_loss_clip": 1.03913093, + "balance_loss_mlp": 1.05406666, + "epoch": 0.18866676687208778, + "flos": 21251109317760.0, + "grad_norm": 1.5870634004860276, + "language_loss": 0.8119483, + "learning_rate": 3.741673189793504e-06, + "loss": 0.83403158, + "num_input_tokens_seen": 67749665, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.97265625, + "step": 3138, + "time_per_iteration": 2.4554855823516846 + }, + { + "auxiliary_loss_clip": 0.01152889, + "auxiliary_loss_mlp": 0.01050697, + "balance_loss_clip": 1.03162408, + "balance_loss_mlp": 1.05190897, + "epoch": 0.18872689012475574, + "flos": 37308673013760.0, + "grad_norm": 1.760814692015778, + "language_loss": 0.636096, + "learning_rate": 3.7414817080795896e-06, + "loss": 0.6581319, + "num_input_tokens_seen": 67776230, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 1.0078125, + "step": 3139, + "time_per_iteration": 2.6103553771972656 + }, + { + "auxiliary_loss_clip": 0.01146092, + "auxiliary_loss_mlp": 0.0105006, + "balance_loss_clip": 1.03046215, + "balance_loss_mlp": 1.04812348, + "epoch": 0.1887870133774237, + "flos": 21652303299840.0, + "grad_norm": 2.433795452320061, + "language_loss": 0.71546841, + "learning_rate": 3.741290160328514e-06, + "loss": 0.73742986, + "num_input_tokens_seen": 67795080, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98046875, + "step": 3140, + "time_per_iteration": 2.4519457817077637 + }, + { + "auxiliary_loss_clip": 0.01147517, + "auxiliary_loss_mlp": 0.01047268, + "balance_loss_clip": 1.02764606, + "balance_loss_mlp": 1.04848385, + "epoch": 0.1888471366300917, + "flos": 15924659374080.0, + "grad_norm": 3.1391974719951574, + "language_loss": 0.87001872, + "learning_rate": 3.7410985465475412e-06, + "loss": 0.89196658, + "num_input_tokens_seen": 67813110, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98828125, + "step": 3141, + "time_per_iteration": 2.4811747074127197 + }, + { + "auxiliary_loss_clip": 0.01153623, + "auxiliary_loss_mlp": 0.0104492, + "balance_loss_clip": 1.02460694, + "balance_loss_mlp": 1.05144691, + "epoch": 0.18890725988275966, + "flos": 18551955767040.0, + "grad_norm": 2.021325930100965, + "language_loss": 0.77418405, + "learning_rate": 3.7409068667439378e-06, + "loss": 0.79616946, + "num_input_tokens_seen": 67831070, + "router_z_loss_clip": 0.203125, + "router_z_loss_mlp": 1.0234375, + "step": 3142, + "time_per_iteration": 2.5001184940338135 + }, + { + "auxiliary_loss_clip": 0.01148279, + "auxiliary_loss_mlp": 0.01042631, + "balance_loss_clip": 1.02542925, + "balance_loss_mlp": 1.05104184, + "epoch": 0.18896738313542763, + "flos": 28840865184000.0, + "grad_norm": 1.6841374820722228, + "language_loss": 0.78446913, + "learning_rate": 3.740715120924971e-06, + "loss": 0.80637825, + "num_input_tokens_seen": 67852170, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.97265625, + "step": 3143, + "time_per_iteration": 3.9074132442474365 + }, + { + "auxiliary_loss_clip": 0.01150084, + "auxiliary_loss_mlp": 0.01049426, + "balance_loss_clip": 1.03081727, + "balance_loss_mlp": 1.05069065, + "epoch": 0.1890275063880956, + "flos": 22412747157120.0, + "grad_norm": 4.1822349926512485, + "language_loss": 0.71507585, + "learning_rate": 3.740523309097912e-06, + "loss": 0.73707104, + "num_input_tokens_seen": 67869945, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9921875, + "step": 3144, + "time_per_iteration": 3.981715679168701 + }, + { + "auxiliary_loss_clip": 0.01152034, + "auxiliary_loss_mlp": 0.01045652, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.0513736, + "epoch": 0.18908762964076356, + "flos": 24243904552320.0, + "grad_norm": 2.6203593578621893, + "language_loss": 0.73683178, + "learning_rate": 3.7403314312700356e-06, + "loss": 0.75880861, + "num_input_tokens_seen": 67890240, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3145, + "time_per_iteration": 2.5101706981658936 + }, + { + "auxiliary_loss_clip": 0.01143872, + "auxiliary_loss_mlp": 0.01045631, + "balance_loss_clip": 1.02783298, + "balance_loss_mlp": 1.04759097, + "epoch": 0.18914775289343153, + "flos": 16982910892800.0, + "grad_norm": 2.6756165752276027, + "language_loss": 0.77081764, + "learning_rate": 3.740139487448616e-06, + "loss": 0.79271269, + "num_input_tokens_seen": 67907825, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9609375, + "step": 3146, + "time_per_iteration": 2.4278056621551514 + }, + { + "auxiliary_loss_clip": 0.01148489, + "auxiliary_loss_mlp": 0.01046339, + "balance_loss_clip": 1.02811205, + "balance_loss_mlp": 1.04947495, + "epoch": 0.1892078761460995, + "flos": 21543781334400.0, + "grad_norm": 1.794796296308648, + "language_loss": 0.78377169, + "learning_rate": 3.7399474776409326e-06, + "loss": 0.80571997, + "num_input_tokens_seen": 67926670, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3147, + "time_per_iteration": 2.467607259750366 + }, + { + "auxiliary_loss_clip": 0.0114757, + "auxiliary_loss_mlp": 0.01048988, + "balance_loss_clip": 1.0310235, + "balance_loss_mlp": 1.0499115, + "epoch": 0.18926799939876748, + "flos": 23001538896000.0, + "grad_norm": 3.2769360880247853, + "language_loss": 0.67016155, + "learning_rate": 3.739755401854267e-06, + "loss": 0.69212711, + "num_input_tokens_seen": 67943645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3148, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01145427, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02037382, + "balance_loss_mlp": 1.04898858, + "epoch": 0.18932812265143545, + "flos": 22273019251200.0, + "grad_norm": 4.644784357412393, + "language_loss": 0.75978655, + "learning_rate": 3.739563260095902e-06, + "loss": 0.78161824, + "num_input_tokens_seen": 67962345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3149, + "time_per_iteration": 2.4768459796905518 + }, + { + "auxiliary_loss_clip": 0.01143839, + "auxiliary_loss_mlp": 0.01047607, + "balance_loss_clip": 1.03028584, + "balance_loss_mlp": 1.05033517, + "epoch": 0.1893882459041034, + "flos": 18624423456000.0, + "grad_norm": 2.9181295874949735, + "language_loss": 0.81229341, + "learning_rate": 3.7393710523731245e-06, + "loss": 0.83420789, + "num_input_tokens_seen": 67979760, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3150, + "time_per_iteration": 2.42832088470459 + }, + { + "auxiliary_loss_clip": 0.01148187, + "auxiliary_loss_mlp": 0.01046446, + "balance_loss_clip": 1.02886271, + "balance_loss_mlp": 1.05068374, + "epoch": 0.18944836915677138, + "flos": 22892981016960.0, + "grad_norm": 2.066054594612055, + "language_loss": 0.84966886, + "learning_rate": 3.7391787786932215e-06, + "loss": 0.87161517, + "num_input_tokens_seen": 67996895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3151, + "time_per_iteration": 2.458054542541504 + }, + { + "auxiliary_loss_clip": 0.01148364, + "auxiliary_loss_mlp": 0.01052715, + "balance_loss_clip": 1.03441668, + "balance_loss_mlp": 1.04896331, + "epoch": 0.18950849240943934, + "flos": 26796542526720.0, + "grad_norm": 1.9128881662164896, + "language_loss": 0.7443462, + "learning_rate": 3.7389864390634857e-06, + "loss": 0.76635695, + "num_input_tokens_seen": 68018365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.99609375, + "step": 3152, + "time_per_iteration": 2.4904792308807373 + }, + { + "auxiliary_loss_clip": 0.01146776, + "auxiliary_loss_mlp": 0.01048229, + "balance_loss_clip": 1.02937067, + "balance_loss_mlp": 1.0502255, + "epoch": 0.1895686156621073, + "flos": 24971239048320.0, + "grad_norm": 1.8661622565083957, + "language_loss": 0.75719136, + "learning_rate": 3.738794033491209e-06, + "loss": 0.77914143, + "num_input_tokens_seen": 68037985, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3153, + "time_per_iteration": 2.5026283264160156 + }, + { + "auxiliary_loss_clip": 0.01148349, + "auxiliary_loss_mlp": 0.01048305, + "balance_loss_clip": 1.03007817, + "balance_loss_mlp": 1.04962945, + "epoch": 0.1896287389147753, + "flos": 21944544353280.0, + "grad_norm": 1.8393709351558127, + "language_loss": 0.79529279, + "learning_rate": 3.7386015619836887e-06, + "loss": 0.81725931, + "num_input_tokens_seen": 68057975, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.984375, + "step": 3154, + "time_per_iteration": 2.4544081687927246 + }, + { + "auxiliary_loss_clip": 0.01151316, + "auxiliary_loss_mlp": 0.01047877, + "balance_loss_clip": 1.02919698, + "balance_loss_mlp": 1.04986668, + "epoch": 0.18968886216744327, + "flos": 18179058723840.0, + "grad_norm": 2.673670363277482, + "language_loss": 0.72798991, + "learning_rate": 3.738409024548223e-06, + "loss": 0.74998182, + "num_input_tokens_seen": 68074175, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 1.015625, + "step": 3155, + "time_per_iteration": 2.425431728363037 + }, + { + "auxiliary_loss_clip": 0.01145009, + "auxiliary_loss_mlp": 0.01048344, + "balance_loss_clip": 1.03042662, + "balance_loss_mlp": 1.04930019, + "epoch": 0.18974898542011123, + "flos": 20412487509120.0, + "grad_norm": 1.676026678838244, + "language_loss": 0.73911691, + "learning_rate": 3.7382164211921136e-06, + "loss": 0.76105046, + "num_input_tokens_seen": 68095230, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.95703125, + "step": 3156, + "time_per_iteration": 2.4683640003204346 + }, + { + "auxiliary_loss_clip": 0.01149399, + "auxiliary_loss_mlp": 0.01050259, + "balance_loss_clip": 1.03281915, + "balance_loss_mlp": 1.05195308, + "epoch": 0.1898091086727792, + "flos": 23985024255360.0, + "grad_norm": 1.5984593201401434, + "language_loss": 0.68251741, + "learning_rate": 3.7380237519226623e-06, + "loss": 0.70451397, + "num_input_tokens_seen": 68113805, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9765625, + "step": 3157, + "time_per_iteration": 2.472182512283325 + }, + { + "auxiliary_loss_clip": 0.01146139, + "auxiliary_loss_mlp": 0.01043457, + "balance_loss_clip": 1.02539706, + "balance_loss_mlp": 1.04914486, + "epoch": 0.18986923192544716, + "flos": 27637067756160.0, + "grad_norm": 1.9937577865402571, + "language_loss": 0.80197155, + "learning_rate": 3.737831016747176e-06, + "loss": 0.82386756, + "num_input_tokens_seen": 68133190, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3158, + "time_per_iteration": 2.4978723526000977 + }, + { + "auxiliary_loss_clip": 0.01152812, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02624583, + "balance_loss_mlp": 1.05201745, + "epoch": 0.18992935517811513, + "flos": 25484151306240.0, + "grad_norm": 1.9065090881698699, + "language_loss": 0.71940476, + "learning_rate": 3.737638215672964e-06, + "loss": 0.74138498, + "num_input_tokens_seen": 68152330, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 1.0078125, + "step": 3159, + "time_per_iteration": 2.503129720687866 + }, + { + "auxiliary_loss_clip": 0.01150054, + "auxiliary_loss_mlp": 0.01049079, + "balance_loss_clip": 1.02987432, + "balance_loss_mlp": 1.05255282, + "epoch": 0.1899894784307831, + "flos": 17420805596160.0, + "grad_norm": 1.8597759984302606, + "language_loss": 0.85071993, + "learning_rate": 3.7374453487073366e-06, + "loss": 0.8727113, + "num_input_tokens_seen": 68170185, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3160, + "time_per_iteration": 2.45534348487854 + }, + { + "auxiliary_loss_clip": 0.01143204, + "auxiliary_loss_mlp": 0.01049047, + "balance_loss_clip": 1.03235734, + "balance_loss_mlp": 1.050807, + "epoch": 0.19004960168345109, + "flos": 27492240119040.0, + "grad_norm": 1.7120140162377986, + "language_loss": 0.73554128, + "learning_rate": 3.7372524158576074e-06, + "loss": 0.75746381, + "num_input_tokens_seen": 68191665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3161, + "time_per_iteration": 2.5551726818084717 + }, + { + "auxiliary_loss_clip": 0.01150414, + "auxiliary_loss_mlp": 0.01047878, + "balance_loss_clip": 1.02982974, + "balance_loss_mlp": 1.05420387, + "epoch": 0.19010972493611905, + "flos": 38654676385920.0, + "grad_norm": 1.554139282497156, + "language_loss": 0.80939364, + "learning_rate": 3.7370594171310926e-06, + "loss": 0.83137655, + "num_input_tokens_seen": 68214635, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3162, + "time_per_iteration": 2.609764337539673 + }, + { + "auxiliary_loss_clip": 0.01149524, + "auxiliary_loss_mlp": 0.01043018, + "balance_loss_clip": 1.02486265, + "balance_loss_mlp": 1.05257571, + "epoch": 0.19016984818878702, + "flos": 19244744357760.0, + "grad_norm": 1.8884975109329094, + "language_loss": 0.75600141, + "learning_rate": 3.73686635253511e-06, + "loss": 0.77792686, + "num_input_tokens_seen": 68232150, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3163, + "time_per_iteration": 2.4494824409484863 + }, + { + "auxiliary_loss_clip": 0.01149917, + "auxiliary_loss_mlp": 0.0103951, + "balance_loss_clip": 1.02161682, + "balance_loss_mlp": 1.05577397, + "epoch": 0.19022997144145498, + "flos": 37596891744000.0, + "grad_norm": 1.5980783305445414, + "language_loss": 0.74197054, + "learning_rate": 3.736673222076982e-06, + "loss": 0.76386476, + "num_input_tokens_seen": 68253370, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.94140625, + "step": 3164, + "time_per_iteration": 2.5901739597320557 + }, + { + "auxiliary_loss_clip": 0.01148415, + "auxiliary_loss_mlp": 0.01039529, + "balance_loss_clip": 1.02151656, + "balance_loss_mlp": 1.05402589, + "epoch": 0.19029009469412295, + "flos": 61530921665280.0, + "grad_norm": 1.5830796140792522, + "language_loss": 0.66913098, + "learning_rate": 3.7364800257640313e-06, + "loss": 0.69101042, + "num_input_tokens_seen": 68278895, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3165, + "time_per_iteration": 2.899500608444214 + }, + { + "auxiliary_loss_clip": 0.01148214, + "auxiliary_loss_mlp": 0.01045421, + "balance_loss_clip": 1.02624011, + "balance_loss_mlp": 1.05282831, + "epoch": 0.1903502179467909, + "flos": 13954851480960.0, + "grad_norm": 2.1716027754337257, + "language_loss": 0.7452209, + "learning_rate": 3.7362867636035835e-06, + "loss": 0.76715726, + "num_input_tokens_seen": 68294880, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3166, + "time_per_iteration": 2.4325685501098633 + }, + { + "auxiliary_loss_clip": 0.01062623, + "auxiliary_loss_mlp": 0.01017161, + "balance_loss_clip": 1.01490772, + "balance_loss_mlp": 1.02902174, + "epoch": 0.1904103411994589, + "flos": 66899641916160.0, + "grad_norm": 0.8067170187870535, + "language_loss": 0.50396568, + "learning_rate": 3.736093435602968e-06, + "loss": 0.52476352, + "num_input_tokens_seen": 68359665, + "router_z_loss_clip": 0.02258301, + "router_z_loss_mlp": 0.3359375, + "step": 3167, + "time_per_iteration": 3.1095221042633057 + }, + { + "auxiliary_loss_clip": 0.01146367, + "auxiliary_loss_mlp": 0.01049594, + "balance_loss_clip": 1.03184342, + "balance_loss_mlp": 1.05208659, + "epoch": 0.19047046445212687, + "flos": 21908741472000.0, + "grad_norm": 1.7496006549093657, + "language_loss": 0.74235475, + "learning_rate": 3.7359000417695156e-06, + "loss": 0.76431435, + "num_input_tokens_seen": 68378950, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9453125, + "step": 3168, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01059618, + "auxiliary_loss_mlp": 0.01004786, + "balance_loss_clip": 1.00246131, + "balance_loss_mlp": 1.02649927, + "epoch": 0.19053058770479483, + "flos": 59255156701440.0, + "grad_norm": 0.8615778549663292, + "language_loss": 0.60097563, + "learning_rate": 3.73570658211056e-06, + "loss": 0.6216197, + "num_input_tokens_seen": 68434235, + "router_z_loss_clip": 0.02319336, + "router_z_loss_mlp": 0.33203125, + "step": 3169, + "time_per_iteration": 2.958176851272583 + }, + { + "auxiliary_loss_clip": 0.01152665, + "auxiliary_loss_mlp": 0.01051299, + "balance_loss_clip": 1.03371537, + "balance_loss_mlp": 1.05302989, + "epoch": 0.1905907109574628, + "flos": 23951304362880.0, + "grad_norm": 1.550337238497042, + "language_loss": 0.77976263, + "learning_rate": 3.735513056633436e-06, + "loss": 0.80180222, + "num_input_tokens_seen": 68453830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.99609375, + "step": 3170, + "time_per_iteration": 2.5174756050109863 + }, + { + "auxiliary_loss_clip": 0.01145075, + "auxiliary_loss_mlp": 0.01046915, + "balance_loss_clip": 1.02960575, + "balance_loss_mlp": 1.05185819, + "epoch": 0.19065083421013077, + "flos": 20812316774400.0, + "grad_norm": 1.7193055204742105, + "language_loss": 0.78597021, + "learning_rate": 3.7353194653454834e-06, + "loss": 0.80789012, + "num_input_tokens_seen": 68473005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3171, + "time_per_iteration": 2.4895551204681396 + }, + { + "auxiliary_loss_clip": 0.01149187, + "auxiliary_loss_mlp": 0.0104474, + "balance_loss_clip": 1.02617931, + "balance_loss_mlp": 1.05111575, + "epoch": 0.19071095746279873, + "flos": 31284981192960.0, + "grad_norm": 3.5246110250440386, + "language_loss": 0.78578937, + "learning_rate": 3.7351258082540426e-06, + "loss": 0.80772865, + "num_input_tokens_seen": 68493470, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3172, + "time_per_iteration": 2.574558973312378 + }, + { + "auxiliary_loss_clip": 0.01148239, + "auxiliary_loss_mlp": 0.01054453, + "balance_loss_clip": 1.03711963, + "balance_loss_mlp": 1.05253482, + "epoch": 0.1907710807154667, + "flos": 14356117290240.0, + "grad_norm": 1.581476317811461, + "language_loss": 0.80126482, + "learning_rate": 3.7349320853664576e-06, + "loss": 0.82329178, + "num_input_tokens_seen": 68511290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3173, + "time_per_iteration": 2.464979410171509 + }, + { + "auxiliary_loss_clip": 0.01149716, + "auxiliary_loss_mlp": 0.01051904, + "balance_loss_clip": 1.03432083, + "balance_loss_mlp": 1.05250478, + "epoch": 0.1908312039681347, + "flos": 26907039740160.0, + "grad_norm": 1.9222394249434893, + "language_loss": 0.78740567, + "learning_rate": 3.7347382966900735e-06, + "loss": 0.8094219, + "num_input_tokens_seen": 68532575, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3174, + "time_per_iteration": 2.540959358215332 + }, + { + "auxiliary_loss_clip": 0.01149777, + "auxiliary_loss_mlp": 0.01047648, + "balance_loss_clip": 1.03043461, + "balance_loss_mlp": 1.05367374, + "epoch": 0.19089132722080265, + "flos": 14494695960960.0, + "grad_norm": 1.8458147293094664, + "language_loss": 0.80757344, + "learning_rate": 3.7345444422322395e-06, + "loss": 0.82954776, + "num_input_tokens_seen": 68548760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3175, + "time_per_iteration": 2.441190481185913 + }, + { + "auxiliary_loss_clip": 0.01148992, + "auxiliary_loss_mlp": 0.01056395, + "balance_loss_clip": 1.03821599, + "balance_loss_mlp": 1.0521791, + "epoch": 0.19095145047347062, + "flos": 13952876232960.0, + "grad_norm": 2.3562328324004445, + "language_loss": 0.85142022, + "learning_rate": 3.7343505220003067e-06, + "loss": 0.87347412, + "num_input_tokens_seen": 68563100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3176, + "time_per_iteration": 2.4397072792053223 + }, + { + "auxiliary_loss_clip": 0.01152727, + "auxiliary_loss_mlp": 0.01056149, + "balance_loss_clip": 1.036515, + "balance_loss_mlp": 1.05395234, + "epoch": 0.19101157372613858, + "flos": 25301832848640.0, + "grad_norm": 2.002060812172469, + "language_loss": 0.81206596, + "learning_rate": 3.7341565360016285e-06, + "loss": 0.83415473, + "num_input_tokens_seen": 68581650, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3177, + "time_per_iteration": 2.4980266094207764 + }, + { + "auxiliary_loss_clip": 0.01144454, + "auxiliary_loss_mlp": 0.01048966, + "balance_loss_clip": 1.03073931, + "balance_loss_mlp": 1.0503974, + "epoch": 0.19107169697880655, + "flos": 20558212986240.0, + "grad_norm": 1.9374450898751996, + "language_loss": 0.74628592, + "learning_rate": 3.73396248424356e-06, + "loss": 0.76822007, + "num_input_tokens_seen": 68600360, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3178, + "time_per_iteration": 2.477679967880249 + }, + { + "auxiliary_loss_clip": 0.01147132, + "auxiliary_loss_mlp": 0.01039746, + "balance_loss_clip": 1.02273464, + "balance_loss_mlp": 1.05001104, + "epoch": 0.19113182023147451, + "flos": 22163204396160.0, + "grad_norm": 1.8429055258583904, + "language_loss": 0.8167876, + "learning_rate": 3.7337683667334606e-06, + "loss": 0.83865643, + "num_input_tokens_seen": 68617885, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3179, + "time_per_iteration": 2.452310800552368 + }, + { + "auxiliary_loss_clip": 0.0114904, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02892482, + "balance_loss_mlp": 1.05279994, + "epoch": 0.19119194348414248, + "flos": 18581796990720.0, + "grad_norm": 2.1508657656276484, + "language_loss": 0.7946887, + "learning_rate": 3.733574183478691e-06, + "loss": 0.81664455, + "num_input_tokens_seen": 68634550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3180, + "time_per_iteration": 2.451066732406616 + }, + { + "auxiliary_loss_clip": 0.0114304, + "auxiliary_loss_mlp": 0.01045985, + "balance_loss_clip": 1.02770984, + "balance_loss_mlp": 1.04780042, + "epoch": 0.19125206673681047, + "flos": 19026623018880.0, + "grad_norm": 2.916741655382754, + "language_loss": 0.79891652, + "learning_rate": 3.733379934486615e-06, + "loss": 0.82080674, + "num_input_tokens_seen": 68651895, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3181, + "time_per_iteration": 2.4310615062713623 + }, + { + "auxiliary_loss_clip": 0.0114616, + "auxiliary_loss_mlp": 0.01053832, + "balance_loss_clip": 1.03623664, + "balance_loss_mlp": 1.04858851, + "epoch": 0.19131218998947844, + "flos": 21690153256320.0, + "grad_norm": 1.7607714952320546, + "language_loss": 0.73820639, + "learning_rate": 3.7331856197645973e-06, + "loss": 0.76020634, + "num_input_tokens_seen": 68671500, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9765625, + "step": 3182, + "time_per_iteration": 2.4712350368499756 + }, + { + "auxiliary_loss_clip": 0.01148345, + "auxiliary_loss_mlp": 0.01048421, + "balance_loss_clip": 1.03093314, + "balance_loss_mlp": 1.05187011, + "epoch": 0.1913723132421464, + "flos": 18442500048000.0, + "grad_norm": 1.8018319163421928, + "language_loss": 0.6486634, + "learning_rate": 3.7329912393200084e-06, + "loss": 0.67063105, + "num_input_tokens_seen": 68690570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96484375, + "step": 3183, + "time_per_iteration": 2.440232753753662 + }, + { + "auxiliary_loss_clip": 0.01145449, + "auxiliary_loss_mlp": 0.01047983, + "balance_loss_clip": 1.02920759, + "balance_loss_mlp": 1.04864669, + "epoch": 0.19143243649481437, + "flos": 27160102033920.0, + "grad_norm": 1.760716170695104, + "language_loss": 0.73234087, + "learning_rate": 3.7327967931602173e-06, + "loss": 0.7542752, + "num_input_tokens_seen": 68709735, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3184, + "time_per_iteration": 3.9211573600769043 + }, + { + "auxiliary_loss_clip": 0.01145752, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.0281471, + "balance_loss_mlp": 1.04738748, + "epoch": 0.19149255974748233, + "flos": 21718952985600.0, + "grad_norm": 2.1066155051108315, + "language_loss": 0.8784132, + "learning_rate": 3.732602281292598e-06, + "loss": 0.9003495, + "num_input_tokens_seen": 68727565, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.984375, + "step": 3185, + "time_per_iteration": 5.396124601364136 + }, + { + "auxiliary_loss_clip": 0.01143307, + "auxiliary_loss_mlp": 0.01046716, + "balance_loss_clip": 1.02803612, + "balance_loss_mlp": 1.04899192, + "epoch": 0.1915526830001503, + "flos": 22963293889920.0, + "grad_norm": 2.10102369978198, + "language_loss": 0.72667789, + "learning_rate": 3.7324077037245267e-06, + "loss": 0.74857807, + "num_input_tokens_seen": 68748110, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3186, + "time_per_iteration": 2.498241901397705 + }, + { + "auxiliary_loss_clip": 0.01153236, + "auxiliary_loss_mlp": 0.01042185, + "balance_loss_clip": 1.02244437, + "balance_loss_mlp": 1.054919, + "epoch": 0.1916128062528183, + "flos": 26140741966080.0, + "grad_norm": 2.264264166459479, + "language_loss": 0.83865881, + "learning_rate": 3.7322130604633825e-06, + "loss": 0.86061311, + "num_input_tokens_seen": 68769765, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.98046875, + "step": 3187, + "time_per_iteration": 2.527416467666626 + }, + { + "auxiliary_loss_clip": 0.01051867, + "auxiliary_loss_mlp": 0.01015636, + "balance_loss_clip": 1.01343083, + "balance_loss_mlp": 1.01988959, + "epoch": 0.19167292950548626, + "flos": 54925767457920.0, + "grad_norm": 0.8634842964488614, + "language_loss": 0.55803859, + "learning_rate": 3.732018351516544e-06, + "loss": 0.5787136, + "num_input_tokens_seen": 68826815, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3188, + "time_per_iteration": 3.0815136432647705 + }, + { + "auxiliary_loss_clip": 0.01145462, + "auxiliary_loss_mlp": 0.01055783, + "balance_loss_clip": 1.03709126, + "balance_loss_mlp": 1.04972625, + "epoch": 0.19173305275815422, + "flos": 29935601942400.0, + "grad_norm": 1.71302722892552, + "language_loss": 0.70180511, + "learning_rate": 3.731823576891397e-06, + "loss": 0.72381759, + "num_input_tokens_seen": 68847585, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.95703125, + "step": 3189, + "time_per_iteration": 2.5380465984344482 + }, + { + "auxiliary_loss_clip": 0.01140421, + "auxiliary_loss_mlp": 0.01034792, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.04853344, + "epoch": 0.1917931760108222, + "flos": 24752471264640.0, + "grad_norm": 2.222159201352765, + "language_loss": 0.74234986, + "learning_rate": 3.7316287365953266e-06, + "loss": 0.76410198, + "num_input_tokens_seen": 68866620, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3190, + "time_per_iteration": 2.5862700939178467 + }, + { + "auxiliary_loss_clip": 0.01143494, + "auxiliary_loss_mlp": 0.01056401, + "balance_loss_clip": 1.03818583, + "balance_loss_mlp": 1.04965627, + "epoch": 0.19185329926349015, + "flos": 18843550375680.0, + "grad_norm": 1.8818377537371913, + "language_loss": 0.8394708, + "learning_rate": 3.73143383063572e-06, + "loss": 0.86146975, + "num_input_tokens_seen": 68885515, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3191, + "time_per_iteration": 2.5077905654907227 + }, + { + "auxiliary_loss_clip": 0.01139779, + "auxiliary_loss_mlp": 0.01038816, + "balance_loss_clip": 1.02217412, + "balance_loss_mlp": 1.04766488, + "epoch": 0.19191342251615812, + "flos": 22086858038400.0, + "grad_norm": 1.7694679756443132, + "language_loss": 0.89325655, + "learning_rate": 3.73123885901997e-06, + "loss": 0.91504252, + "num_input_tokens_seen": 68903225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3192, + "time_per_iteration": 2.4738776683807373 + }, + { + "auxiliary_loss_clip": 0.01150885, + "auxiliary_loss_mlp": 0.0105345, + "balance_loss_clip": 1.03398299, + "balance_loss_mlp": 1.0531472, + "epoch": 0.19197354576882608, + "flos": 22199115018240.0, + "grad_norm": 2.352703418633998, + "language_loss": 0.74830496, + "learning_rate": 3.7310438217554687e-06, + "loss": 0.77034831, + "num_input_tokens_seen": 68922860, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9765625, + "step": 3193, + "time_per_iteration": 2.47143816947937 + }, + { + "auxiliary_loss_clip": 0.01146927, + "auxiliary_loss_mlp": 0.01045614, + "balance_loss_clip": 1.02717233, + "balance_loss_mlp": 1.04918766, + "epoch": 0.19203366902149407, + "flos": 24896185580160.0, + "grad_norm": 1.7283890992056894, + "language_loss": 0.74733245, + "learning_rate": 3.730848718849612e-06, + "loss": 0.7692579, + "num_input_tokens_seen": 68943000, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.9765625, + "step": 3194, + "time_per_iteration": 2.5001959800720215 + }, + { + "auxiliary_loss_clip": 0.0105047, + "auxiliary_loss_mlp": 0.010055, + "balance_loss_clip": 1.00319958, + "balance_loss_mlp": 1.01851392, + "epoch": 0.19209379227416204, + "flos": 68416722789120.0, + "grad_norm": 0.7975785668902318, + "language_loss": 0.68455988, + "learning_rate": 3.7306535503097985e-06, + "loss": 0.70511955, + "num_input_tokens_seen": 69000255, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3203125, + "step": 3195, + "time_per_iteration": 3.014677047729492 + }, + { + "auxiliary_loss_clip": 0.01146296, + "auxiliary_loss_mlp": 0.01043268, + "balance_loss_clip": 1.0254823, + "balance_loss_mlp": 1.05066323, + "epoch": 0.19215391552683, + "flos": 22055185221120.0, + "grad_norm": 1.9672517867074575, + "language_loss": 0.72712696, + "learning_rate": 3.730458316143429e-06, + "loss": 0.74902254, + "num_input_tokens_seen": 69019665, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.95703125, + "step": 3196, + "time_per_iteration": 2.4855856895446777 + }, + { + "auxiliary_loss_clip": 0.01151669, + "auxiliary_loss_mlp": 0.01046783, + "balance_loss_clip": 1.0284251, + "balance_loss_mlp": 1.05643284, + "epoch": 0.19221403877949797, + "flos": 20302959962880.0, + "grad_norm": 1.8158077484015336, + "language_loss": 0.83774233, + "learning_rate": 3.7302630163579068e-06, + "loss": 0.85972691, + "num_input_tokens_seen": 69039055, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.953125, + "step": 3197, + "time_per_iteration": 2.4530181884765625 + }, + { + "auxiliary_loss_clip": 0.01146905, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02565312, + "balance_loss_mlp": 1.05036283, + "epoch": 0.19227416203216594, + "flos": 23185329811200.0, + "grad_norm": 2.295881830513264, + "language_loss": 0.80459738, + "learning_rate": 3.7300676509606373e-06, + "loss": 0.82650983, + "num_input_tokens_seen": 69056370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3198, + "time_per_iteration": 2.4882590770721436 + }, + { + "auxiliary_loss_clip": 0.01148366, + "auxiliary_loss_mlp": 0.01050243, + "balance_loss_clip": 1.03090763, + "balance_loss_mlp": 1.04984999, + "epoch": 0.1923342852848339, + "flos": 25776607841280.0, + "grad_norm": 1.9800701307051174, + "language_loss": 0.7862891, + "learning_rate": 3.729872219959029e-06, + "loss": 0.80827522, + "num_input_tokens_seen": 69075915, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3199, + "time_per_iteration": 2.507227659225464 + }, + { + "auxiliary_loss_clip": 0.01146428, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.02567828, + "balance_loss_mlp": 1.05150342, + "epoch": 0.19239440853750187, + "flos": 17128349061120.0, + "grad_norm": 2.05190707233933, + "language_loss": 0.83391261, + "learning_rate": 3.7296767233604934e-06, + "loss": 0.85580671, + "num_input_tokens_seen": 69094145, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.94921875, + "step": 3200, + "time_per_iteration": 2.459218978881836 + }, + { + "auxiliary_loss_clip": 0.01148087, + "auxiliary_loss_mlp": 0.01051054, + "balance_loss_clip": 1.03286231, + "balance_loss_mlp": 1.0524931, + "epoch": 0.19245453179016986, + "flos": 16435093593600.0, + "grad_norm": 2.0233550639398428, + "language_loss": 0.78678542, + "learning_rate": 3.729481161172443e-06, + "loss": 0.80877686, + "num_input_tokens_seen": 69111110, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.95703125, + "step": 3201, + "time_per_iteration": 2.435478448867798 + }, + { + "auxiliary_loss_clip": 0.01148745, + "auxiliary_loss_mlp": 0.01046904, + "balance_loss_clip": 1.02874875, + "balance_loss_mlp": 1.05050445, + "epoch": 0.19251465504283782, + "flos": 20230276792320.0, + "grad_norm": 2.1716175760371814, + "language_loss": 0.69168961, + "learning_rate": 3.7292855334022927e-06, + "loss": 0.71364617, + "num_input_tokens_seen": 69130280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.984375, + "step": 3202, + "time_per_iteration": 2.4596354961395264 + }, + { + "auxiliary_loss_clip": 0.01145317, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.01790023, + "balance_loss_mlp": 1.05140352, + "epoch": 0.1925747782955058, + "flos": 19464374067840.0, + "grad_norm": 1.7015130302687178, + "language_loss": 0.91123176, + "learning_rate": 3.7290898400574627e-06, + "loss": 0.93303871, + "num_input_tokens_seen": 69149570, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3203, + "time_per_iteration": 2.4425902366638184 + }, + { + "auxiliary_loss_clip": 0.01147002, + "auxiliary_loss_mlp": 0.01050127, + "balance_loss_clip": 1.03127956, + "balance_loss_mlp": 1.05008471, + "epoch": 0.19263490154817375, + "flos": 17785586165760.0, + "grad_norm": 2.129263396651385, + "language_loss": 0.81766933, + "learning_rate": 3.7288940811453725e-06, + "loss": 0.83964062, + "num_input_tokens_seen": 69168190, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96875, + "step": 3204, + "time_per_iteration": 2.4466230869293213 + }, + { + "auxiliary_loss_clip": 0.01143673, + "auxiliary_loss_mlp": 0.01047511, + "balance_loss_clip": 1.03022599, + "balance_loss_mlp": 1.0497942, + "epoch": 0.19269502480084172, + "flos": 17457075354240.0, + "grad_norm": 2.065510679734303, + "language_loss": 0.75797462, + "learning_rate": 3.7286982566734454e-06, + "loss": 0.77988648, + "num_input_tokens_seen": 69186950, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3205, + "time_per_iteration": 2.439906358718872 + }, + { + "auxiliary_loss_clip": 0.01150471, + "auxiliary_loss_mlp": 0.01047316, + "balance_loss_clip": 1.02958953, + "balance_loss_mlp": 1.05312991, + "epoch": 0.19275514805350968, + "flos": 21506901045120.0, + "grad_norm": 2.4125731541540465, + "language_loss": 0.83020669, + "learning_rate": 3.728502366649107e-06, + "loss": 0.85218459, + "num_input_tokens_seen": 69204850, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.97265625, + "step": 3206, + "time_per_iteration": 2.463888168334961 + }, + { + "auxiliary_loss_clip": 0.0104957, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00139928, + "balance_loss_mlp": 1.01731467, + "epoch": 0.19281527130617768, + "flos": 47695979738880.0, + "grad_norm": 0.8499440783854421, + "language_loss": 0.60609913, + "learning_rate": 3.728306411079786e-06, + "loss": 0.62663066, + "num_input_tokens_seen": 69259200, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.32421875, + "step": 3207, + "time_per_iteration": 2.8865902423858643 + }, + { + "auxiliary_loss_clip": 0.01147085, + "auxiliary_loss_mlp": 0.01045801, + "balance_loss_clip": 1.02789569, + "balance_loss_mlp": 1.05069125, + "epoch": 0.19287539455884564, + "flos": 11801252672640.0, + "grad_norm": 2.4047527057594564, + "language_loss": 0.75119245, + "learning_rate": 3.7281103899729125e-06, + "loss": 0.77312136, + "num_input_tokens_seen": 69275835, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3208, + "time_per_iteration": 2.4727799892425537 + }, + { + "auxiliary_loss_clip": 0.01146825, + "auxiliary_loss_mlp": 0.01048755, + "balance_loss_clip": 1.02921605, + "balance_loss_mlp": 1.04890394, + "epoch": 0.1929355178115136, + "flos": 20631434860800.0, + "grad_norm": 2.3372356299161696, + "language_loss": 0.60567236, + "learning_rate": 3.7279143033359195e-06, + "loss": 0.62762815, + "num_input_tokens_seen": 69294810, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3209, + "time_per_iteration": 2.4695677757263184 + }, + { + "auxiliary_loss_clip": 0.0114885, + "auxiliary_loss_mlp": 0.01049539, + "balance_loss_clip": 1.03003573, + "balance_loss_mlp": 1.04981887, + "epoch": 0.19299564106418157, + "flos": 40807916058240.0, + "grad_norm": 1.9457412312791633, + "language_loss": 0.80153656, + "learning_rate": 3.727718151176243e-06, + "loss": 0.82352048, + "num_input_tokens_seen": 69316065, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.9921875, + "step": 3210, + "time_per_iteration": 2.6459405422210693 + }, + { + "auxiliary_loss_clip": 0.01138808, + "auxiliary_loss_mlp": 0.01041334, + "balance_loss_clip": 1.02437103, + "balance_loss_mlp": 1.04580569, + "epoch": 0.19305576431684954, + "flos": 11361418634880.0, + "grad_norm": 2.107646167575127, + "language_loss": 0.82575119, + "learning_rate": 3.7275219335013217e-06, + "loss": 0.84755266, + "num_input_tokens_seen": 69332900, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3211, + "time_per_iteration": 2.454702615737915 + }, + { + "auxiliary_loss_clip": 0.01046258, + "auxiliary_loss_mlp": 0.01012694, + "balance_loss_clip": 1.01057243, + "balance_loss_mlp": 1.01463401, + "epoch": 0.1931158875695175, + "flos": 54511895975040.0, + "grad_norm": 0.9758169311408023, + "language_loss": 0.63670558, + "learning_rate": 3.7273256503185953e-06, + "loss": 0.65729511, + "num_input_tokens_seen": 69382535, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.31640625, + "step": 3212, + "time_per_iteration": 2.914459705352783 + }, + { + "auxiliary_loss_clip": 0.01145937, + "auxiliary_loss_mlp": 0.01046347, + "balance_loss_clip": 1.02967, + "balance_loss_mlp": 1.05140018, + "epoch": 0.19317601082218547, + "flos": 19828436365440.0, + "grad_norm": 1.5978218597026725, + "language_loss": 0.76514798, + "learning_rate": 3.7271293016355074e-06, + "loss": 0.78707075, + "num_input_tokens_seen": 69400600, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3213, + "time_per_iteration": 2.47961163520813 + }, + { + "auxiliary_loss_clip": 0.01147383, + "auxiliary_loss_mlp": 0.01047068, + "balance_loss_clip": 1.02823281, + "balance_loss_mlp": 1.04934072, + "epoch": 0.19323613407485346, + "flos": 13152068467200.0, + "grad_norm": 4.5461953882780115, + "language_loss": 0.70799339, + "learning_rate": 3.726932887459503e-06, + "loss": 0.72993791, + "num_input_tokens_seen": 69417350, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.98046875, + "step": 3214, + "time_per_iteration": 2.4547488689422607 + }, + { + "auxiliary_loss_clip": 0.01142593, + "auxiliary_loss_mlp": 0.01046871, + "balance_loss_clip": 1.02808392, + "balance_loss_mlp": 1.0470041, + "epoch": 0.19329625732752143, + "flos": 14027247342720.0, + "grad_norm": 2.2459266127411848, + "language_loss": 0.75352395, + "learning_rate": 3.72673640779803e-06, + "loss": 0.77541864, + "num_input_tokens_seen": 69431845, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3215, + "time_per_iteration": 2.4477176666259766 + }, + { + "auxiliary_loss_clip": 0.01139586, + "auxiliary_loss_mlp": 0.01053833, + "balance_loss_clip": 1.03685808, + "balance_loss_mlp": 1.04626155, + "epoch": 0.1933563805801894, + "flos": 23441732069760.0, + "grad_norm": 2.304207478946857, + "language_loss": 0.88559556, + "learning_rate": 3.72653986265854e-06, + "loss": 0.90752971, + "num_input_tokens_seen": 69453275, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3216, + "time_per_iteration": 2.499464988708496 + }, + { + "auxiliary_loss_clip": 0.01140269, + "auxiliary_loss_mlp": 0.0104998, + "balance_loss_clip": 1.0330286, + "balance_loss_mlp": 1.0474, + "epoch": 0.19341650383285736, + "flos": 20485314334080.0, + "grad_norm": 1.5978066249985532, + "language_loss": 0.79762065, + "learning_rate": 3.726343252048485e-06, + "loss": 0.8195231, + "num_input_tokens_seen": 69471830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3217, + "time_per_iteration": 2.4428889751434326 + }, + { + "auxiliary_loss_clip": 0.0114893, + "auxiliary_loss_mlp": 0.01048467, + "balance_loss_clip": 1.0294652, + "balance_loss_mlp": 1.0504688, + "epoch": 0.19347662708552532, + "flos": 17858484817920.0, + "grad_norm": 2.6606972104147673, + "language_loss": 0.61408496, + "learning_rate": 3.7261465759753206e-06, + "loss": 0.63605893, + "num_input_tokens_seen": 69489320, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3218, + "time_per_iteration": 2.4313230514526367 + }, + { + "auxiliary_loss_clip": 0.0114381, + "auxiliary_loss_mlp": 0.01040596, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04883909, + "epoch": 0.1935367503381933, + "flos": 18187247024640.0, + "grad_norm": 1.6811153728366703, + "language_loss": 0.80158418, + "learning_rate": 3.7259498344465053e-06, + "loss": 0.82342821, + "num_input_tokens_seen": 69506665, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3219, + "time_per_iteration": 2.4347593784332275 + }, + { + "auxiliary_loss_clip": 0.01145851, + "auxiliary_loss_mlp": 0.01048329, + "balance_loss_clip": 1.03010237, + "balance_loss_mlp": 1.05070114, + "epoch": 0.19359687359086128, + "flos": 15957122290560.0, + "grad_norm": 2.032012314604138, + "language_loss": 0.85781908, + "learning_rate": 3.7257530274694993e-06, + "loss": 0.87976086, + "num_input_tokens_seen": 69523835, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94921875, + "step": 3220, + "time_per_iteration": 2.4572718143463135 + }, + { + "auxiliary_loss_clip": 0.01136805, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02356791, + "balance_loss_mlp": 1.0477736, + "epoch": 0.19365699684352924, + "flos": 21215198695680.0, + "grad_norm": 2.087292049011103, + "language_loss": 0.84617937, + "learning_rate": 3.725556155051766e-06, + "loss": 0.86794209, + "num_input_tokens_seen": 69542620, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3221, + "time_per_iteration": 2.4601354598999023 + }, + { + "auxiliary_loss_clip": 0.01142607, + "auxiliary_loss_mlp": 0.01049362, + "balance_loss_clip": 1.0331614, + "balance_loss_mlp": 1.05009556, + "epoch": 0.1937171200961972, + "flos": 17311098481920.0, + "grad_norm": 2.075109928662421, + "language_loss": 0.85929954, + "learning_rate": 3.7253592172007702e-06, + "loss": 0.88121927, + "num_input_tokens_seen": 69561130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3222, + "time_per_iteration": 2.433027505874634 + }, + { + "auxiliary_loss_clip": 0.0114145, + "auxiliary_loss_mlp": 0.01040151, + "balance_loss_clip": 1.02212656, + "balance_loss_mlp": 1.04663789, + "epoch": 0.19377724334886517, + "flos": 22635968227200.0, + "grad_norm": 3.9278404759018053, + "language_loss": 0.78207982, + "learning_rate": 3.72516221392398e-06, + "loss": 0.80389583, + "num_input_tokens_seen": 69580425, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3223, + "time_per_iteration": 2.4451496601104736 + }, + { + "auxiliary_loss_clip": 0.01139994, + "auxiliary_loss_mlp": 0.01047584, + "balance_loss_clip": 1.03013206, + "balance_loss_mlp": 1.04896808, + "epoch": 0.19383736660153314, + "flos": 15077813351040.0, + "grad_norm": 1.8200574771064912, + "language_loss": 0.75589085, + "learning_rate": 3.7249651452288653e-06, + "loss": 0.77776659, + "num_input_tokens_seen": 69597085, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3224, + "time_per_iteration": 2.4390981197357178 + }, + { + "auxiliary_loss_clip": 0.0114036, + "auxiliary_loss_mlp": 0.01039996, + "balance_loss_clip": 1.02274644, + "balance_loss_mlp": 1.04741263, + "epoch": 0.1938974898542011, + "flos": 47119934350080.0, + "grad_norm": 2.092202382915022, + "language_loss": 0.71141279, + "learning_rate": 3.7247680111229e-06, + "loss": 0.73321629, + "num_input_tokens_seen": 69618885, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3225, + "time_per_iteration": 2.6690707206726074 + }, + { + "auxiliary_loss_clip": 0.01142605, + "auxiliary_loss_mlp": 0.01044348, + "balance_loss_clip": 1.0279572, + "balance_loss_mlp": 1.04787326, + "epoch": 0.19395761310686907, + "flos": 25812554376960.0, + "grad_norm": 2.058354492672399, + "language_loss": 0.6915803, + "learning_rate": 3.7245708116135585e-06, + "loss": 0.71344984, + "num_input_tokens_seen": 69638200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9453125, + "step": 3226, + "time_per_iteration": 3.906217336654663 + }, + { + "auxiliary_loss_clip": 0.0114437, + "auxiliary_loss_mlp": 0.01044177, + "balance_loss_clip": 1.02562809, + "balance_loss_mlp": 1.05274427, + "epoch": 0.19401773635953706, + "flos": 23039604334080.0, + "grad_norm": 1.6131772564475266, + "language_loss": 0.76138854, + "learning_rate": 3.7243735467083193e-06, + "loss": 0.78327405, + "num_input_tokens_seen": 69657550, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 3227, + "time_per_iteration": 4.168737411499023 + }, + { + "auxiliary_loss_clip": 0.01140847, + "auxiliary_loss_mlp": 0.01042545, + "balance_loss_clip": 1.02547467, + "balance_loss_mlp": 1.04588878, + "epoch": 0.19407785961220503, + "flos": 15920780705280.0, + "grad_norm": 1.8539897665707572, + "language_loss": 0.69154215, + "learning_rate": 3.724176216414662e-06, + "loss": 0.7133761, + "num_input_tokens_seen": 69675005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94921875, + "step": 3228, + "time_per_iteration": 2.4857404232025146 + }, + { + "auxiliary_loss_clip": 0.01142054, + "auxiliary_loss_mlp": 0.01044016, + "balance_loss_clip": 1.02698135, + "balance_loss_mlp": 1.04929864, + "epoch": 0.194137982864873, + "flos": 25921722787200.0, + "grad_norm": 1.9069922854616745, + "language_loss": 0.7428174, + "learning_rate": 3.72397882074007e-06, + "loss": 0.76467812, + "num_input_tokens_seen": 69696455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3229, + "time_per_iteration": 2.5357918739318848 + }, + { + "auxiliary_loss_clip": 0.01141663, + "auxiliary_loss_mlp": 0.01041685, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04832351, + "epoch": 0.19419810611754096, + "flos": 13261344618240.0, + "grad_norm": 1.6963766145995596, + "language_loss": 0.65157712, + "learning_rate": 3.7237813596920285e-06, + "loss": 0.67341059, + "num_input_tokens_seen": 69714245, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3230, + "time_per_iteration": 2.4796855449676514 + }, + { + "auxiliary_loss_clip": 0.01137871, + "auxiliary_loss_mlp": 0.01044544, + "balance_loss_clip": 1.0268054, + "balance_loss_mlp": 1.04652202, + "epoch": 0.19425822937020892, + "flos": 15705568368000.0, + "grad_norm": 1.8877471342298004, + "language_loss": 0.8184334, + "learning_rate": 3.7235838332780254e-06, + "loss": 0.84025759, + "num_input_tokens_seen": 69731515, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3231, + "time_per_iteration": 2.5315961837768555 + }, + { + "auxiliary_loss_clip": 0.01145592, + "auxiliary_loss_mlp": 0.01039112, + "balance_loss_clip": 1.02045608, + "balance_loss_mlp": 1.05067456, + "epoch": 0.1943183526228769, + "flos": 23105392093440.0, + "grad_norm": 1.787689187471357, + "language_loss": 0.86743605, + "learning_rate": 3.72338624150555e-06, + "loss": 0.88928306, + "num_input_tokens_seen": 69748885, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94921875, + "step": 3232, + "time_per_iteration": 2.4916152954101562 + }, + { + "auxiliary_loss_clip": 0.01141636, + "auxiliary_loss_mlp": 0.01052447, + "balance_loss_clip": 1.03497076, + "balance_loss_mlp": 1.05008495, + "epoch": 0.19437847587554485, + "flos": 24712610146560.0, + "grad_norm": 1.5602267859616314, + "language_loss": 0.8513217, + "learning_rate": 3.723188584382096e-06, + "loss": 0.87326247, + "num_input_tokens_seen": 69767540, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3233, + "time_per_iteration": 2.526118040084839 + }, + { + "auxiliary_loss_clip": 0.01145232, + "auxiliary_loss_mlp": 0.01053705, + "balance_loss_clip": 1.03603804, + "balance_loss_mlp": 1.04827857, + "epoch": 0.19443859912821285, + "flos": 23116130259840.0, + "grad_norm": 1.6631942166294669, + "language_loss": 0.89191484, + "learning_rate": 3.722990861915158e-06, + "loss": 0.91390419, + "num_input_tokens_seen": 69789340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96875, + "step": 3234, + "time_per_iteration": 2.4783849716186523 + }, + { + "auxiliary_loss_clip": 0.01143869, + "auxiliary_loss_mlp": 0.01043333, + "balance_loss_clip": 1.02493858, + "balance_loss_mlp": 1.04675341, + "epoch": 0.1944987223808808, + "flos": 15084385539840.0, + "grad_norm": 2.1776085062187374, + "language_loss": 0.78503513, + "learning_rate": 3.722793074112234e-06, + "loss": 0.80690718, + "num_input_tokens_seen": 69806470, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3235, + "time_per_iteration": 2.4414284229278564 + }, + { + "auxiliary_loss_clip": 0.01146423, + "auxiliary_loss_mlp": 0.01041843, + "balance_loss_clip": 1.02545178, + "balance_loss_mlp": 1.05288744, + "epoch": 0.19455884563354878, + "flos": 17126876603520.0, + "grad_norm": 2.115791514531618, + "language_loss": 0.7937218, + "learning_rate": 3.7225952209808233e-06, + "loss": 0.81560451, + "num_input_tokens_seen": 69822655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.93359375, + "step": 3236, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01144391, + "auxiliary_loss_mlp": 0.01040175, + "balance_loss_clip": 1.02204323, + "balance_loss_mlp": 1.05156302, + "epoch": 0.19461896888621674, + "flos": 20193396503040.0, + "grad_norm": 2.445233321344346, + "language_loss": 0.75936478, + "learning_rate": 3.72239730252843e-06, + "loss": 0.78121042, + "num_input_tokens_seen": 69841895, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9296875, + "step": 3237, + "time_per_iteration": 2.544003486633301 + }, + { + "auxiliary_loss_clip": 0.01147227, + "auxiliary_loss_mlp": 0.01046687, + "balance_loss_clip": 1.03005719, + "balance_loss_mlp": 1.05079889, + "epoch": 0.1946790921388847, + "flos": 25301365971840.0, + "grad_norm": 2.0921387862929586, + "language_loss": 0.75056225, + "learning_rate": 3.7221993187625583e-06, + "loss": 0.77250135, + "num_input_tokens_seen": 69862220, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96484375, + "step": 3238, + "time_per_iteration": 2.4795806407928467 + }, + { + "auxiliary_loss_clip": 0.01147117, + "auxiliary_loss_mlp": 0.01044554, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.05317962, + "epoch": 0.19473921539155267, + "flos": 20193396503040.0, + "grad_norm": 1.8233855681516762, + "language_loss": 0.73016453, + "learning_rate": 3.7220012696907155e-06, + "loss": 0.75208122, + "num_input_tokens_seen": 69881830, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94140625, + "step": 3239, + "time_per_iteration": 2.4695816040039062 + }, + { + "auxiliary_loss_clip": 0.01144581, + "auxiliary_loss_mlp": 0.01048537, + "balance_loss_clip": 1.03026247, + "balance_loss_mlp": 1.0505631, + "epoch": 0.19479933864422067, + "flos": 20887549810560.0, + "grad_norm": 1.897973355517785, + "language_loss": 0.73792124, + "learning_rate": 3.721803155320412e-06, + "loss": 0.75985241, + "num_input_tokens_seen": 69900515, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3240, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.0114635, + "auxiliary_loss_mlp": 0.01041908, + "balance_loss_clip": 1.02477801, + "balance_loss_mlp": 1.05221701, + "epoch": 0.19485946189688863, + "flos": 23295072839040.0, + "grad_norm": 1.8797415358152445, + "language_loss": 0.66685343, + "learning_rate": 3.7216049756591606e-06, + "loss": 0.68873608, + "num_input_tokens_seen": 69920060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.94140625, + "step": 3241, + "time_per_iteration": 2.5644116401672363 + }, + { + "auxiliary_loss_clip": 0.01144249, + "auxiliary_loss_mlp": 0.01045443, + "balance_loss_clip": 1.0280863, + "balance_loss_mlp": 1.05193758, + "epoch": 0.1949195851495566, + "flos": 23295036925440.0, + "grad_norm": 1.4346271942222966, + "language_loss": 0.82889283, + "learning_rate": 3.7214067307144754e-06, + "loss": 0.85078967, + "num_input_tokens_seen": 69939820, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3242, + "time_per_iteration": 2.476043701171875 + }, + { + "auxiliary_loss_clip": 0.01054708, + "auxiliary_loss_mlp": 0.01010683, + "balance_loss_clip": 1.00856066, + "balance_loss_mlp": 1.02379096, + "epoch": 0.19497970840222456, + "flos": 64962871557120.0, + "grad_norm": 0.8482804620416572, + "language_loss": 0.57572454, + "learning_rate": 3.721208420493875e-06, + "loss": 0.59637845, + "num_input_tokens_seen": 70002145, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.30859375, + "step": 3243, + "time_per_iteration": 3.1217525005340576 + }, + { + "auxiliary_loss_clip": 0.01144732, + "auxiliary_loss_mlp": 0.01043073, + "balance_loss_clip": 1.02573967, + "balance_loss_mlp": 1.05099249, + "epoch": 0.19503983165489253, + "flos": 19644717277440.0, + "grad_norm": 2.02063631868758, + "language_loss": 0.83243412, + "learning_rate": 3.7210100450048784e-06, + "loss": 0.85431218, + "num_input_tokens_seen": 70020510, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3244, + "time_per_iteration": 2.4848830699920654 + }, + { + "auxiliary_loss_clip": 0.01147429, + "auxiliary_loss_mlp": 0.01048127, + "balance_loss_clip": 1.03144979, + "balance_loss_mlp": 1.05495024, + "epoch": 0.1950999549075605, + "flos": 21141976821120.0, + "grad_norm": 1.8275576625869878, + "language_loss": 0.77049786, + "learning_rate": 3.7208116042550088e-06, + "loss": 0.79245341, + "num_input_tokens_seen": 70040760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3245, + "time_per_iteration": 2.5539040565490723 + }, + { + "auxiliary_loss_clip": 0.01141945, + "auxiliary_loss_mlp": 0.01041151, + "balance_loss_clip": 1.0235796, + "balance_loss_mlp": 1.04852772, + "epoch": 0.19516007816022846, + "flos": 20884820376960.0, + "grad_norm": 2.8639596298576055, + "language_loss": 0.84020388, + "learning_rate": 3.7206130982517906e-06, + "loss": 0.86203486, + "num_input_tokens_seen": 70058720, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3246, + "time_per_iteration": 2.5018341541290283 + }, + { + "auxiliary_loss_clip": 0.0114444, + "auxiliary_loss_mlp": 0.01045285, + "balance_loss_clip": 1.02834511, + "balance_loss_mlp": 1.04978824, + "epoch": 0.19522020141289645, + "flos": 16910515031040.0, + "grad_norm": 2.1267063345385777, + "language_loss": 0.7636531, + "learning_rate": 3.7204145270027514e-06, + "loss": 0.78555036, + "num_input_tokens_seen": 70076470, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9453125, + "step": 3247, + "time_per_iteration": 2.4512898921966553 + }, + { + "auxiliary_loss_clip": 0.01144663, + "auxiliary_loss_mlp": 0.01038699, + "balance_loss_clip": 1.02228367, + "balance_loss_mlp": 1.05077446, + "epoch": 0.19528032466556441, + "flos": 26724829023360.0, + "grad_norm": 1.4744510548582124, + "language_loss": 0.75330198, + "learning_rate": 3.720215890515421e-06, + "loss": 0.77513552, + "num_input_tokens_seen": 70096220, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3248, + "time_per_iteration": 2.5222222805023193 + }, + { + "auxiliary_loss_clip": 0.01140079, + "auxiliary_loss_mlp": 0.0104275, + "balance_loss_clip": 1.02590537, + "balance_loss_mlp": 1.04661679, + "epoch": 0.19534044791823238, + "flos": 21032808410880.0, + "grad_norm": 1.9881324270373204, + "language_loss": 0.78316575, + "learning_rate": 3.7200171887973316e-06, + "loss": 0.80499399, + "num_input_tokens_seen": 70114800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3249, + "time_per_iteration": 2.475385904312134 + }, + { + "auxiliary_loss_clip": 0.01143906, + "auxiliary_loss_mlp": 0.01048238, + "balance_loss_clip": 1.0316205, + "balance_loss_mlp": 1.04948914, + "epoch": 0.19540057117090034, + "flos": 22344050396160.0, + "grad_norm": 1.839405294960197, + "language_loss": 0.73238158, + "learning_rate": 3.7198184218560176e-06, + "loss": 0.7543031, + "num_input_tokens_seen": 70134930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3250, + "time_per_iteration": 2.4548323154449463 + }, + { + "auxiliary_loss_clip": 0.01136082, + "auxiliary_loss_mlp": 0.01037994, + "balance_loss_clip": 1.02206779, + "balance_loss_mlp": 1.04583359, + "epoch": 0.1954606944235683, + "flos": 20301631159680.0, + "grad_norm": 1.9014920395959154, + "language_loss": 0.79582441, + "learning_rate": 3.719619589699017e-06, + "loss": 0.8175652, + "num_input_tokens_seen": 70152045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3251, + "time_per_iteration": 2.4597084522247314 + }, + { + "auxiliary_loss_clip": 0.01142571, + "auxiliary_loss_mlp": 0.01041367, + "balance_loss_clip": 1.02441597, + "balance_loss_mlp": 1.04888558, + "epoch": 0.19552081767623627, + "flos": 17346865449600.0, + "grad_norm": 3.2143497379473613, + "language_loss": 0.83534026, + "learning_rate": 3.7194206923338695e-06, + "loss": 0.85717964, + "num_input_tokens_seen": 70169240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3252, + "time_per_iteration": 2.4245967864990234 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01048315, + "balance_loss_clip": 1.03026652, + "balance_loss_mlp": 1.04651105, + "epoch": 0.19558094092890424, + "flos": 31977626129280.0, + "grad_norm": 1.7806404718622555, + "language_loss": 0.73870194, + "learning_rate": 3.719221729768117e-06, + "loss": 0.76062191, + "num_input_tokens_seen": 70192690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3253, + "time_per_iteration": 2.5752809047698975 + }, + { + "auxiliary_loss_clip": 0.01142809, + "auxiliary_loss_mlp": 0.01040218, + "balance_loss_clip": 1.02352846, + "balance_loss_mlp": 1.04619944, + "epoch": 0.19564106418157223, + "flos": 22268889187200.0, + "grad_norm": 1.833285648050628, + "language_loss": 0.76684111, + "learning_rate": 3.7190227020093037e-06, + "loss": 0.78867137, + "num_input_tokens_seen": 70209685, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.96484375, + "step": 3254, + "time_per_iteration": 2.533993721008301 + }, + { + "auxiliary_loss_clip": 0.01044914, + "auxiliary_loss_mlp": 0.01004749, + "balance_loss_clip": 1.00268674, + "balance_loss_mlp": 1.01349974, + "epoch": 0.1957011874342402, + "flos": 54364554385920.0, + "grad_norm": 0.7652407497357797, + "language_loss": 0.55344874, + "learning_rate": 3.7188236090649774e-06, + "loss": 0.5739454, + "num_input_tokens_seen": 70265050, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.3125, + "step": 3255, + "time_per_iteration": 3.164173126220703 + }, + { + "auxiliary_loss_clip": 0.01144973, + "auxiliary_loss_mlp": 0.01041369, + "balance_loss_clip": 1.02407217, + "balance_loss_mlp": 1.05057478, + "epoch": 0.19576131068690816, + "flos": 16506699356160.0, + "grad_norm": 2.650975615707017, + "language_loss": 0.7066443, + "learning_rate": 3.718624450942688e-06, + "loss": 0.7285077, + "num_input_tokens_seen": 70281830, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3256, + "time_per_iteration": 2.496424436569214 + }, + { + "auxiliary_loss_clip": 0.01139601, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02635908, + "balance_loss_mlp": 1.04647136, + "epoch": 0.19582143393957613, + "flos": 14719676797440.0, + "grad_norm": 2.256610935254856, + "language_loss": 0.80055118, + "learning_rate": 3.718425227649987e-06, + "loss": 0.82237899, + "num_input_tokens_seen": 70297420, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3257, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01143218, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02637219, + "balance_loss_mlp": 1.05034149, + "epoch": 0.1958815571922441, + "flos": 24425504737920.0, + "grad_norm": 1.9567741269254724, + "language_loss": 0.74843282, + "learning_rate": 3.7182259391944292e-06, + "loss": 0.77029151, + "num_input_tokens_seen": 70319210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3258, + "time_per_iteration": 2.6177120208740234 + }, + { + "auxiliary_loss_clip": 0.01142767, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.01932144, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19594168044491206, + "flos": 24900279730560.0, + "grad_norm": 1.7410781544458231, + "language_loss": 0.74462247, + "learning_rate": 3.7180265855835714e-06, + "loss": 0.7664147, + "num_input_tokens_seen": 70339045, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3259, + "time_per_iteration": 2.54068660736084 + }, + { + "auxiliary_loss_clip": 0.01145135, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.01923943, + "balance_loss_mlp": 1.04965675, + "epoch": 0.19600180369758005, + "flos": 12057008486400.0, + "grad_norm": 2.380592438675979, + "language_loss": 0.77040654, + "learning_rate": 3.7178271668249735e-06, + "loss": 0.7922256, + "num_input_tokens_seen": 70356505, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3260, + "time_per_iteration": 2.4983303546905518 + }, + { + "auxiliary_loss_clip": 0.01143361, + "auxiliary_loss_mlp": 0.01041828, + "balance_loss_clip": 1.02459061, + "balance_loss_mlp": 1.0486325, + "epoch": 0.19606192695024802, + "flos": 20850202644480.0, + "grad_norm": 2.011568492365706, + "language_loss": 0.82168972, + "learning_rate": 3.7176276829261975e-06, + "loss": 0.84354162, + "num_input_tokens_seen": 70375410, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3261, + "time_per_iteration": 2.52164626121521 + }, + { + "auxiliary_loss_clip": 0.01144228, + "auxiliary_loss_mlp": 0.01042699, + "balance_loss_clip": 1.02510428, + "balance_loss_mlp": 1.05130327, + "epoch": 0.19612205020291598, + "flos": 28475509996800.0, + "grad_norm": 2.1812525814986112, + "language_loss": 0.76691413, + "learning_rate": 3.717428133894807e-06, + "loss": 0.78878343, + "num_input_tokens_seen": 70396315, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 3262, + "time_per_iteration": 2.513619899749756 + }, + { + "auxiliary_loss_clip": 0.01145398, + "auxiliary_loss_mlp": 0.01044459, + "balance_loss_clip": 1.02775788, + "balance_loss_mlp": 1.05290008, + "epoch": 0.19618217345558395, + "flos": 25556618995200.0, + "grad_norm": 1.7175684177653927, + "language_loss": 0.8667773, + "learning_rate": 3.71722851973837e-06, + "loss": 0.88867593, + "num_input_tokens_seen": 70417945, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3263, + "time_per_iteration": 2.49373459815979 + }, + { + "auxiliary_loss_clip": 0.01140801, + "auxiliary_loss_mlp": 0.01041854, + "balance_loss_clip": 1.0251646, + "balance_loss_mlp": 1.04784787, + "epoch": 0.1962422967082519, + "flos": 25264413855360.0, + "grad_norm": 1.5660143494742738, + "language_loss": 0.74136549, + "learning_rate": 3.717028840464455e-06, + "loss": 0.76319206, + "num_input_tokens_seen": 70438690, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9296875, + "step": 3264, + "time_per_iteration": 2.4891843795776367 + }, + { + "auxiliary_loss_clip": 0.0114591, + "auxiliary_loss_mlp": 0.01049823, + "balance_loss_clip": 1.03340793, + "balance_loss_mlp": 1.05435038, + "epoch": 0.19630241996091988, + "flos": 18807352444800.0, + "grad_norm": 4.0742741532711975, + "language_loss": 0.78590196, + "learning_rate": 3.7168290960806344e-06, + "loss": 0.8078593, + "num_input_tokens_seen": 70455385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3265, + "time_per_iteration": 2.4226529598236084 + }, + { + "auxiliary_loss_clip": 0.01047401, + "auxiliary_loss_mlp": 0.01014864, + "balance_loss_clip": 1.01292133, + "balance_loss_mlp": 1.01652646, + "epoch": 0.19636254321358784, + "flos": 62321137896960.0, + "grad_norm": 0.7852387786228787, + "language_loss": 0.53459084, + "learning_rate": 3.716629286594483e-06, + "loss": 0.55521357, + "num_input_tokens_seen": 70514280, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.30859375, + "step": 3266, + "time_per_iteration": 3.0519652366638184 + }, + { + "auxiliary_loss_clip": 0.01145434, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02263319, + "balance_loss_mlp": 1.04800785, + "epoch": 0.19642266646625584, + "flos": 21069329564160.0, + "grad_norm": 1.9728388819613873, + "language_loss": 0.80503136, + "learning_rate": 3.7164294120135767e-06, + "loss": 0.82690066, + "num_input_tokens_seen": 70531800, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3267, + "time_per_iteration": 2.436455011367798 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.0104324, + "balance_loss_clip": 1.02726591, + "balance_loss_mlp": 1.04780269, + "epoch": 0.1964827897189238, + "flos": 14538651229440.0, + "grad_norm": 2.528633756775916, + "language_loss": 0.87031806, + "learning_rate": 3.7162294723454953e-06, + "loss": 0.89213896, + "num_input_tokens_seen": 70550615, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91015625, + "step": 3268, + "time_per_iteration": 5.348580360412598 + }, + { + "auxiliary_loss_clip": 0.01141651, + "auxiliary_loss_mlp": 0.01045776, + "balance_loss_clip": 1.02865744, + "balance_loss_mlp": 1.04996669, + "epoch": 0.19654291297159177, + "flos": 19244636616960.0, + "grad_norm": 2.7845337804652086, + "language_loss": 0.69331455, + "learning_rate": 3.7160294675978197e-06, + "loss": 0.71518886, + "num_input_tokens_seen": 70568690, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3269, + "time_per_iteration": 3.9386346340179443 + }, + { + "auxiliary_loss_clip": 0.01148343, + "auxiliary_loss_mlp": 0.01051701, + "balance_loss_clip": 1.03361702, + "balance_loss_mlp": 1.0530045, + "epoch": 0.19660303622425973, + "flos": 25775710001280.0, + "grad_norm": 2.4386480468071086, + "language_loss": 0.80760634, + "learning_rate": 3.715829397778135e-06, + "loss": 0.82960677, + "num_input_tokens_seen": 70588665, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.953125, + "step": 3270, + "time_per_iteration": 2.5130820274353027 + }, + { + "auxiliary_loss_clip": 0.01140062, + "auxiliary_loss_mlp": 0.01045089, + "balance_loss_clip": 1.02848363, + "balance_loss_mlp": 1.04726839, + "epoch": 0.1966631594769277, + "flos": 20595093275520.0, + "grad_norm": 1.857854204827715, + "language_loss": 0.83918732, + "learning_rate": 3.715629262894028e-06, + "loss": 0.86103886, + "num_input_tokens_seen": 70606900, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3271, + "time_per_iteration": 2.4522581100463867 + }, + { + "auxiliary_loss_clip": 0.01139583, + "auxiliary_loss_mlp": 0.01046491, + "balance_loss_clip": 1.0297302, + "balance_loss_mlp": 1.04943895, + "epoch": 0.19672328272959566, + "flos": 23623188600960.0, + "grad_norm": 2.1376155358713835, + "language_loss": 0.80162311, + "learning_rate": 3.715429062953087e-06, + "loss": 0.82348382, + "num_input_tokens_seen": 70625955, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 3272, + "time_per_iteration": 2.4968738555908203 + }, + { + "auxiliary_loss_clip": 0.01145188, + "auxiliary_loss_mlp": 0.01045772, + "balance_loss_clip": 1.02766371, + "balance_loss_mlp": 1.05075002, + "epoch": 0.19678340598226365, + "flos": 23110922787840.0, + "grad_norm": 1.7855512393811417, + "language_loss": 0.80728978, + "learning_rate": 3.7152287979629043e-06, + "loss": 0.82919937, + "num_input_tokens_seen": 70646090, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9453125, + "step": 3273, + "time_per_iteration": 2.525407552719116 + }, + { + "auxiliary_loss_clip": 0.01142802, + "auxiliary_loss_mlp": 0.01051833, + "balance_loss_clip": 1.03454804, + "balance_loss_mlp": 1.04807115, + "epoch": 0.19684352923493162, + "flos": 24534852716160.0, + "grad_norm": 5.081990879764466, + "language_loss": 0.7791425, + "learning_rate": 3.7150284679310735e-06, + "loss": 0.80108881, + "num_input_tokens_seen": 70666065, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3274, + "time_per_iteration": 2.527858018875122 + }, + { + "auxiliary_loss_clip": 0.01141542, + "auxiliary_loss_mlp": 0.01052239, + "balance_loss_clip": 1.03440571, + "balance_loss_mlp": 1.04765558, + "epoch": 0.19690365248759958, + "flos": 21796448578560.0, + "grad_norm": 2.1984029701042367, + "language_loss": 0.81144857, + "learning_rate": 3.7148280728651914e-06, + "loss": 0.83338642, + "num_input_tokens_seen": 70681580, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9375, + "step": 3275, + "time_per_iteration": 2.451392412185669 + }, + { + "auxiliary_loss_clip": 0.01143631, + "auxiliary_loss_mlp": 0.01047389, + "balance_loss_clip": 1.02934027, + "balance_loss_mlp": 1.04772139, + "epoch": 0.19696377574026755, + "flos": 19056643810560.0, + "grad_norm": 1.90284229785688, + "language_loss": 0.81104618, + "learning_rate": 3.7146276127728563e-06, + "loss": 0.83295637, + "num_input_tokens_seen": 70697745, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3276, + "time_per_iteration": 2.462033748626709 + }, + { + "auxiliary_loss_clip": 0.01142306, + "auxiliary_loss_mlp": 0.01038428, + "balance_loss_clip": 1.02132106, + "balance_loss_mlp": 1.04889154, + "epoch": 0.19702389899293551, + "flos": 22820656982400.0, + "grad_norm": 2.0909421048868126, + "language_loss": 0.89347923, + "learning_rate": 3.7144270876616713e-06, + "loss": 0.91528654, + "num_input_tokens_seen": 70715110, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3277, + "time_per_iteration": 2.4887003898620605 + }, + { + "auxiliary_loss_clip": 0.01146208, + "auxiliary_loss_mlp": 0.01047781, + "balance_loss_clip": 1.02804041, + "balance_loss_mlp": 1.04832077, + "epoch": 0.19708402224560348, + "flos": 22894237992960.0, + "grad_norm": 2.9974095646387573, + "language_loss": 0.62265754, + "learning_rate": 3.714226497539239e-06, + "loss": 0.64459741, + "num_input_tokens_seen": 70734715, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3278, + "time_per_iteration": 2.560401201248169 + }, + { + "auxiliary_loss_clip": 0.01144829, + "auxiliary_loss_mlp": 0.01054112, + "balance_loss_clip": 1.03562284, + "balance_loss_mlp": 1.04910243, + "epoch": 0.19714414549827144, + "flos": 25662519267840.0, + "grad_norm": 3.1131920881239936, + "language_loss": 0.73664343, + "learning_rate": 3.714025842413166e-06, + "loss": 0.75863284, + "num_input_tokens_seen": 70752650, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3279, + "time_per_iteration": 2.5036048889160156 + }, + { + "auxiliary_loss_clip": 0.01144667, + "auxiliary_loss_mlp": 0.01045176, + "balance_loss_clip": 1.02816486, + "balance_loss_mlp": 1.04906511, + "epoch": 0.19720426875093944, + "flos": 23915824704000.0, + "grad_norm": 1.6310774806952162, + "language_loss": 0.82451236, + "learning_rate": 3.713825122291061e-06, + "loss": 0.84641075, + "num_input_tokens_seen": 70772365, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.95703125, + "step": 3280, + "time_per_iteration": 2.499962091445923 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01043645, + "balance_loss_clip": 1.02744484, + "balance_loss_mlp": 1.05086279, + "epoch": 0.1972643920036074, + "flos": 13881952828800.0, + "grad_norm": 1.847926035637751, + "language_loss": 0.77581155, + "learning_rate": 3.713624337180536e-06, + "loss": 0.79770064, + "num_input_tokens_seen": 70790340, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3281, + "time_per_iteration": 2.4610888957977295 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01043048, + "balance_loss_clip": 1.02719295, + "balance_loss_mlp": 1.0507971, + "epoch": 0.19732451525627537, + "flos": 19863592801920.0, + "grad_norm": 1.593504057665797, + "language_loss": 0.79502213, + "learning_rate": 3.7134234870892045e-06, + "loss": 0.81686652, + "num_input_tokens_seen": 70809295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3282, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01149903, + "auxiliary_loss_mlp": 0.01044987, + "balance_loss_clip": 1.0279994, + "balance_loss_mlp": 1.05359089, + "epoch": 0.19738463850894333, + "flos": 24973429777920.0, + "grad_norm": 2.157912578421005, + "language_loss": 0.71937042, + "learning_rate": 3.7132225720246826e-06, + "loss": 0.7413193, + "num_input_tokens_seen": 70828765, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3283, + "time_per_iteration": 2.5070157051086426 + }, + { + "auxiliary_loss_clip": 0.0114321, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02462053, + "balance_loss_mlp": 1.04858577, + "epoch": 0.1974447617616113, + "flos": 18368883123840.0, + "grad_norm": 1.741034644212953, + "language_loss": 0.78832877, + "learning_rate": 3.7130215919945886e-06, + "loss": 0.81017548, + "num_input_tokens_seen": 70846805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3284, + "time_per_iteration": 2.436530113220215 + }, + { + "auxiliary_loss_clip": 0.01147439, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.02952087, + "balance_loss_mlp": 1.05069387, + "epoch": 0.19750488501427926, + "flos": 22892945103360.0, + "grad_norm": 2.0622477624774325, + "language_loss": 0.86366653, + "learning_rate": 3.7128205470065445e-06, + "loss": 0.88561547, + "num_input_tokens_seen": 70863805, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.96875, + "step": 3285, + "time_per_iteration": 2.4581058025360107 + }, + { + "auxiliary_loss_clip": 0.01143401, + "auxiliary_loss_mlp": 0.01042449, + "balance_loss_clip": 1.02571201, + "balance_loss_mlp": 1.0520879, + "epoch": 0.19756500826694723, + "flos": 21871502046720.0, + "grad_norm": 2.7361177014734372, + "language_loss": 0.88680863, + "learning_rate": 3.712619437068174e-06, + "loss": 0.90866709, + "num_input_tokens_seen": 70882660, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3286, + "time_per_iteration": 2.472475290298462 + }, + { + "auxiliary_loss_clip": 0.01148385, + "auxiliary_loss_mlp": 0.01049125, + "balance_loss_clip": 1.03036189, + "balance_loss_mlp": 1.05260301, + "epoch": 0.19762513151961522, + "flos": 15158972131200.0, + "grad_norm": 2.2372981039860833, + "language_loss": 0.78297567, + "learning_rate": 3.712418262187102e-06, + "loss": 0.80495083, + "num_input_tokens_seen": 70898765, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.95703125, + "step": 3287, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01146192, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02674246, + "balance_loss_mlp": 1.04974318, + "epoch": 0.1976852547722832, + "flos": 16979175878400.0, + "grad_norm": 2.197025185749627, + "language_loss": 0.81252837, + "learning_rate": 3.7122170223709584e-06, + "loss": 0.83444452, + "num_input_tokens_seen": 70916370, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96484375, + "step": 3288, + "time_per_iteration": 2.4107155799865723 + }, + { + "auxiliary_loss_clip": 0.01139417, + "auxiliary_loss_mlp": 0.01049687, + "balance_loss_clip": 1.03315234, + "balance_loss_mlp": 1.04890108, + "epoch": 0.19774537802495115, + "flos": 20302924049280.0, + "grad_norm": 1.7615970311636253, + "language_loss": 0.72502065, + "learning_rate": 3.712015717627374e-06, + "loss": 0.74691164, + "num_input_tokens_seen": 70934870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3289, + "time_per_iteration": 2.4479291439056396 + }, + { + "auxiliary_loss_clip": 0.01144115, + "auxiliary_loss_mlp": 0.01045349, + "balance_loss_clip": 1.02807593, + "balance_loss_mlp": 1.0500598, + "epoch": 0.19780550127761912, + "flos": 27235478724480.0, + "grad_norm": 2.0523474932115833, + "language_loss": 0.7944051, + "learning_rate": 3.7118143479639813e-06, + "loss": 0.81629974, + "num_input_tokens_seen": 70955140, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 3290, + "time_per_iteration": 2.499950408935547 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.01002976, + "balance_loss_clip": 1.00056827, + "balance_loss_mlp": 1.01336336, + "epoch": 0.19786562453028708, + "flos": 63550972684800.0, + "grad_norm": 0.9098407078047199, + "language_loss": 0.60440773, + "learning_rate": 3.711612913388418e-06, + "loss": 0.62489194, + "num_input_tokens_seen": 71012005, + "router_z_loss_clip": 0.02404785, + "router_z_loss_mlp": 0.3203125, + "step": 3291, + "time_per_iteration": 3.1538305282592773 + }, + { + "auxiliary_loss_clip": 0.01144863, + "auxiliary_loss_mlp": 0.01044766, + "balance_loss_clip": 1.02639592, + "balance_loss_mlp": 1.04670751, + "epoch": 0.19792574778295505, + "flos": 26286647011200.0, + "grad_norm": 2.151168561582294, + "language_loss": 0.81352198, + "learning_rate": 3.7114114139083204e-06, + "loss": 0.83541822, + "num_input_tokens_seen": 71031140, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.984375, + "step": 3292, + "time_per_iteration": 2.539417028427124 + }, + { + "auxiliary_loss_clip": 0.01137712, + "auxiliary_loss_mlp": 0.01047669, + "balance_loss_clip": 1.03051507, + "balance_loss_mlp": 1.04855824, + "epoch": 0.19798587103562304, + "flos": 19938107566080.0, + "grad_norm": 2.212806192124084, + "language_loss": 0.82146955, + "learning_rate": 3.7112098495313313e-06, + "loss": 0.84332335, + "num_input_tokens_seen": 71050250, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 3293, + "time_per_iteration": 2.438809394836426 + }, + { + "auxiliary_loss_clip": 0.01151271, + "auxiliary_loss_mlp": 0.01048402, + "balance_loss_clip": 1.02988923, + "balance_loss_mlp": 1.05333924, + "epoch": 0.198045994288291, + "flos": 20120282369280.0, + "grad_norm": 2.10438249616411, + "language_loss": 0.61268854, + "learning_rate": 3.711008220265093e-06, + "loss": 0.63468528, + "num_input_tokens_seen": 71068665, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3294, + "time_per_iteration": 2.451650381088257 + }, + { + "auxiliary_loss_clip": 0.01143209, + "auxiliary_loss_mlp": 0.01043395, + "balance_loss_clip": 1.02681279, + "balance_loss_mlp": 1.05004907, + "epoch": 0.19810611754095897, + "flos": 17967653228160.0, + "grad_norm": 2.028666267444235, + "language_loss": 0.86983609, + "learning_rate": 3.710806526117251e-06, + "loss": 0.89170212, + "num_input_tokens_seen": 71085320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3295, + "time_per_iteration": 2.416771411895752 + }, + { + "auxiliary_loss_clip": 0.01141633, + "auxiliary_loss_mlp": 0.01051654, + "balance_loss_clip": 1.03529871, + "balance_loss_mlp": 1.04786801, + "epoch": 0.19816624079362694, + "flos": 15084996071040.0, + "grad_norm": 13.771873008268457, + "language_loss": 0.80491048, + "learning_rate": 3.7106047670954544e-06, + "loss": 0.82684338, + "num_input_tokens_seen": 71102020, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9375, + "step": 3296, + "time_per_iteration": 2.450934648513794 + }, + { + "auxiliary_loss_clip": 0.01145402, + "auxiliary_loss_mlp": 0.01045523, + "balance_loss_clip": 1.02637851, + "balance_loss_mlp": 1.0482688, + "epoch": 0.1982263640462949, + "flos": 24900315644160.0, + "grad_norm": 2.0804115334054134, + "language_loss": 0.68406892, + "learning_rate": 3.710402943207354e-06, + "loss": 0.70597816, + "num_input_tokens_seen": 71123390, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.97265625, + "step": 3297, + "time_per_iteration": 2.5111610889434814 + }, + { + "auxiliary_loss_clip": 0.01138573, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.02440548, + "balance_loss_mlp": 1.04895413, + "epoch": 0.19828648729896287, + "flos": 20376181837440.0, + "grad_norm": 1.7575465421519259, + "language_loss": 0.81232154, + "learning_rate": 3.7102010544606016e-06, + "loss": 0.83411407, + "num_input_tokens_seen": 71141800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 3298, + "time_per_iteration": 2.472025156021118 + }, + { + "auxiliary_loss_clip": 0.01147375, + "auxiliary_loss_mlp": 0.01046386, + "balance_loss_clip": 1.02634668, + "balance_loss_mlp": 1.05001056, + "epoch": 0.19834661055163083, + "flos": 18880035615360.0, + "grad_norm": 2.343960149367745, + "language_loss": 0.85115641, + "learning_rate": 3.7099991008628544e-06, + "loss": 0.87309396, + "num_input_tokens_seen": 71159505, + "router_z_loss_clip": 0.20019531, + "router_z_loss_mlp": 0.9765625, + "step": 3299, + "time_per_iteration": 2.4725356101989746 + }, + { + "auxiliary_loss_clip": 0.01045198, + "auxiliary_loss_mlp": 0.01003179, + "balance_loss_clip": 1.00097358, + "balance_loss_mlp": 1.0131526, + "epoch": 0.19840673380429882, + "flos": 60259184640000.0, + "grad_norm": 0.7731212371218976, + "language_loss": 0.53215671, + "learning_rate": 3.7097970824217706e-06, + "loss": 0.55264044, + "num_input_tokens_seen": 71223265, + "router_z_loss_clip": 0.02209473, + "router_z_loss_mlp": 0.3203125, + "step": 3300, + "time_per_iteration": 3.004054069519043 + }, + { + "auxiliary_loss_clip": 0.01142157, + "auxiliary_loss_mlp": 0.01051571, + "balance_loss_clip": 1.03298628, + "balance_loss_mlp": 1.04772329, + "epoch": 0.1984668570569668, + "flos": 19902017376000.0, + "grad_norm": 1.6138936044346288, + "language_loss": 0.73150593, + "learning_rate": 3.7095949991450093e-06, + "loss": 0.75344324, + "num_input_tokens_seen": 71242385, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9453125, + "step": 3301, + "time_per_iteration": 2.4547884464263916 + }, + { + "auxiliary_loss_clip": 0.01140885, + "auxiliary_loss_mlp": 0.01038256, + "balance_loss_clip": 1.02191293, + "balance_loss_mlp": 1.04811358, + "epoch": 0.19852698030963475, + "flos": 15630766295040.0, + "grad_norm": 2.437382428027231, + "language_loss": 0.88445318, + "learning_rate": 3.709392851040235e-06, + "loss": 0.90624458, + "num_input_tokens_seen": 71258990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3302, + "time_per_iteration": 2.429579019546509 + }, + { + "auxiliary_loss_clip": 0.01142317, + "auxiliary_loss_mlp": 0.0104676, + "balance_loss_clip": 1.02940273, + "balance_loss_mlp": 1.04750872, + "epoch": 0.19858710356230272, + "flos": 43143007311360.0, + "grad_norm": 1.9503370408087137, + "language_loss": 0.73907369, + "learning_rate": 3.709190638115111e-06, + "loss": 0.76096445, + "num_input_tokens_seen": 71282770, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3303, + "time_per_iteration": 2.627835273742676 + }, + { + "auxiliary_loss_clip": 0.01141217, + "auxiliary_loss_mlp": 0.0104825, + "balance_loss_clip": 1.03117871, + "balance_loss_mlp": 1.04874539, + "epoch": 0.19864722681497068, + "flos": 35144084643840.0, + "grad_norm": 1.8172241344194675, + "language_loss": 0.74761099, + "learning_rate": 3.7089883603773084e-06, + "loss": 0.76950562, + "num_input_tokens_seen": 71301410, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3304, + "time_per_iteration": 2.551241397857666 + }, + { + "auxiliary_loss_clip": 0.01139854, + "auxiliary_loss_mlp": 0.01039407, + "balance_loss_clip": 1.02333784, + "balance_loss_mlp": 1.04763281, + "epoch": 0.19870735006763865, + "flos": 19426200888960.0, + "grad_norm": 2.605019982075021, + "language_loss": 0.85717452, + "learning_rate": 3.7087860178344955e-06, + "loss": 0.87896717, + "num_input_tokens_seen": 71319670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3305, + "time_per_iteration": 2.432363986968994 + }, + { + "auxiliary_loss_clip": 0.01141298, + "auxiliary_loss_mlp": 0.01040354, + "balance_loss_clip": 1.02408171, + "balance_loss_mlp": 1.04600525, + "epoch": 0.19876747332030664, + "flos": 23547380947200.0, + "grad_norm": 1.7555780714506408, + "language_loss": 0.68014234, + "learning_rate": 3.7085836104943445e-06, + "loss": 0.70195889, + "num_input_tokens_seen": 71339850, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.953125, + "step": 3306, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01137681, + "auxiliary_loss_mlp": 0.01036227, + "balance_loss_clip": 1.02098584, + "balance_loss_mlp": 1.0453912, + "epoch": 0.1988275965729746, + "flos": 19829406032640.0, + "grad_norm": 1.4744708200758283, + "language_loss": 0.76455241, + "learning_rate": 3.7083811383645332e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 71359795, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.921875, + "step": 3307, + "time_per_iteration": 2.459176778793335 + }, + { + "auxiliary_loss_clip": 0.01140736, + "auxiliary_loss_mlp": 0.0104117, + "balance_loss_clip": 1.02520776, + "balance_loss_mlp": 1.04866791, + "epoch": 0.19888771982564257, + "flos": 23513625141120.0, + "grad_norm": 1.8666050855147507, + "language_loss": 0.75933248, + "learning_rate": 3.708178601452737e-06, + "loss": 0.78115153, + "num_input_tokens_seen": 71378885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.921875, + "step": 3308, + "time_per_iteration": 2.483060121536255 + }, + { + "auxiliary_loss_clip": 0.01141228, + "auxiliary_loss_mlp": 0.01041046, + "balance_loss_clip": 1.02426159, + "balance_loss_mlp": 1.04736626, + "epoch": 0.19894784307831054, + "flos": 18150510389760.0, + "grad_norm": 1.6368693105847256, + "language_loss": 0.75640005, + "learning_rate": 3.7079759997666374e-06, + "loss": 0.7782228, + "num_input_tokens_seen": 71397285, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94140625, + "step": 3309, + "time_per_iteration": 3.8069632053375244 + }, + { + "auxiliary_loss_clip": 0.01138354, + "auxiliary_loss_mlp": 0.01046592, + "balance_loss_clip": 1.02869844, + "balance_loss_mlp": 1.04665506, + "epoch": 0.1990079663309785, + "flos": 24276044246400.0, + "grad_norm": 1.6858420956549012, + "language_loss": 0.87646699, + "learning_rate": 3.707773333313917e-06, + "loss": 0.8983165, + "num_input_tokens_seen": 71415775, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9140625, + "step": 3310, + "time_per_iteration": 3.9299721717834473 + }, + { + "auxiliary_loss_clip": 0.01138843, + "auxiliary_loss_mlp": 0.01041462, + "balance_loss_clip": 1.02431977, + "balance_loss_mlp": 1.04637599, + "epoch": 0.19906808958364647, + "flos": 34897666366080.0, + "grad_norm": 3.6845239503362412, + "language_loss": 0.64166129, + "learning_rate": 3.70757060210226e-06, + "loss": 0.66346431, + "num_input_tokens_seen": 71437315, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3311, + "time_per_iteration": 2.5747337341308594 + }, + { + "auxiliary_loss_clip": 0.01143032, + "auxiliary_loss_mlp": 0.01042216, + "balance_loss_clip": 1.02559805, + "balance_loss_mlp": 1.04768658, + "epoch": 0.19912821283631443, + "flos": 24024885373440.0, + "grad_norm": 2.462607887220823, + "language_loss": 0.74053729, + "learning_rate": 3.707367806139355e-06, + "loss": 0.76238978, + "num_input_tokens_seen": 71456320, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.953125, + "step": 3312, + "time_per_iteration": 2.471867799758911 + }, + { + "auxiliary_loss_clip": 0.01141113, + "auxiliary_loss_mlp": 0.01046905, + "balance_loss_clip": 1.03060961, + "balance_loss_mlp": 1.04843581, + "epoch": 0.19918833608898243, + "flos": 19859031774720.0, + "grad_norm": 2.2841450786746016, + "language_loss": 0.83511955, + "learning_rate": 3.7071649454328915e-06, + "loss": 0.8569997, + "num_input_tokens_seen": 71475360, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3313, + "time_per_iteration": 2.4846627712249756 + }, + { + "auxiliary_loss_clip": 0.01142431, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.04944849, + "epoch": 0.1992484593416504, + "flos": 29095794984960.0, + "grad_norm": 3.438256379955746, + "language_loss": 0.80930895, + "learning_rate": 3.7069620199905625e-06, + "loss": 0.83115256, + "num_input_tokens_seen": 71496155, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3314, + "time_per_iteration": 2.525754928588867 + }, + { + "auxiliary_loss_clip": 0.01137185, + "auxiliary_loss_mlp": 0.01043596, + "balance_loss_clip": 1.0280745, + "balance_loss_mlp": 1.04706359, + "epoch": 0.19930858259431836, + "flos": 23295001011840.0, + "grad_norm": 1.5137591341622172, + "language_loss": 0.87549174, + "learning_rate": 3.7067590298200627e-06, + "loss": 0.89729953, + "num_input_tokens_seen": 71517295, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3315, + "time_per_iteration": 2.5170931816101074 + }, + { + "auxiliary_loss_clip": 0.01141446, + "auxiliary_loss_mlp": 0.01046665, + "balance_loss_clip": 1.03032112, + "balance_loss_mlp": 1.04808092, + "epoch": 0.19936870584698632, + "flos": 25378825651200.0, + "grad_norm": 1.5984895942740787, + "language_loss": 0.71255141, + "learning_rate": 3.7065559749290892e-06, + "loss": 0.73443246, + "num_input_tokens_seen": 71540000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3316, + "time_per_iteration": 2.520071029663086 + }, + { + "auxiliary_loss_clip": 0.0105243, + "auxiliary_loss_mlp": 0.01028392, + "balance_loss_clip": 1.02646089, + "balance_loss_mlp": 1.01928639, + "epoch": 0.1994288290996543, + "flos": 62168053109760.0, + "grad_norm": 0.8439111854473917, + "language_loss": 0.66260874, + "learning_rate": 3.706352855325342e-06, + "loss": 0.68341696, + "num_input_tokens_seen": 71607880, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.33203125, + "step": 3317, + "time_per_iteration": 3.1460416316986084 + }, + { + "auxiliary_loss_clip": 0.01142295, + "auxiliary_loss_mlp": 0.01052969, + "balance_loss_clip": 1.03557682, + "balance_loss_mlp": 1.04575253, + "epoch": 0.19948895235232225, + "flos": 19025832919680.0, + "grad_norm": 2.672944172124665, + "language_loss": 0.74319738, + "learning_rate": 3.7061496710165233e-06, + "loss": 0.76515001, + "num_input_tokens_seen": 71625695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3318, + "time_per_iteration": 2.6139748096466064 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.0282445, + "balance_loss_mlp": 1.04536486, + "epoch": 0.19954907560499022, + "flos": 37815803182080.0, + "grad_norm": 1.900050251198073, + "language_loss": 0.78860074, + "learning_rate": 3.7059464220103385e-06, + "loss": 0.81038487, + "num_input_tokens_seen": 71648520, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.89453125, + "step": 3319, + "time_per_iteration": 2.6014342308044434 + }, + { + "auxiliary_loss_clip": 0.01141458, + "auxiliary_loss_mlp": 0.01042777, + "balance_loss_clip": 1.02439499, + "balance_loss_mlp": 1.04806578, + "epoch": 0.1996091988576582, + "flos": 49565199594240.0, + "grad_norm": 2.0962453666662073, + "language_loss": 0.75462162, + "learning_rate": 3.7057431083144945e-06, + "loss": 0.77646399, + "num_input_tokens_seen": 71672185, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.93359375, + "step": 3320, + "time_per_iteration": 2.739485263824463 + }, + { + "auxiliary_loss_clip": 0.01139438, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02613819, + "balance_loss_mlp": 1.04714417, + "epoch": 0.19966932211032618, + "flos": 22635788659200.0, + "grad_norm": 2.167317842134812, + "language_loss": 0.80547488, + "learning_rate": 3.705539729936701e-06, + "loss": 0.82729495, + "num_input_tokens_seen": 71692890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3321, + "time_per_iteration": 2.581353187561035 + }, + { + "auxiliary_loss_clip": 0.01049309, + "auxiliary_loss_mlp": 0.01003433, + "balance_loss_clip": 1.00151408, + "balance_loss_mlp": 1.01694489, + "epoch": 0.19972944536299414, + "flos": 54082117745280.0, + "grad_norm": 0.880630206553271, + "language_loss": 0.65178835, + "learning_rate": 3.7053362868846696e-06, + "loss": 0.67231572, + "num_input_tokens_seen": 71745815, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.32421875, + "step": 3322, + "time_per_iteration": 2.9042704105377197 + }, + { + "auxiliary_loss_clip": 0.01050141, + "auxiliary_loss_mlp": 0.01003283, + "balance_loss_clip": 1.00130391, + "balance_loss_mlp": 1.01724231, + "epoch": 0.1997895686156621, + "flos": 69355031817600.0, + "grad_norm": 0.7916622121471568, + "language_loss": 0.56975091, + "learning_rate": 3.7051327791661153e-06, + "loss": 0.59028506, + "num_input_tokens_seen": 71806915, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.328125, + "step": 3323, + "time_per_iteration": 3.2141411304473877 + }, + { + "auxiliary_loss_clip": 0.01139547, + "auxiliary_loss_mlp": 0.01035371, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.04839373, + "epoch": 0.19984969186833007, + "flos": 18552063507840.0, + "grad_norm": 1.9849201654975537, + "language_loss": 0.80526733, + "learning_rate": 3.7049292067887555e-06, + "loss": 0.82701647, + "num_input_tokens_seen": 71824645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3324, + "time_per_iteration": 2.5455262660980225 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02329218, + "balance_loss_mlp": 1.04540765, + "epoch": 0.19990981512099804, + "flos": 26429678968320.0, + "grad_norm": 1.8681208438308643, + "language_loss": 0.53681695, + "learning_rate": 3.7047255697603092e-06, + "loss": 0.55859387, + "num_input_tokens_seen": 71845125, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91796875, + "step": 3325, + "time_per_iteration": 2.581782102584839 + }, + { + "auxiliary_loss_clip": 0.01138508, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.02337289, + "balance_loss_mlp": 1.04565668, + "epoch": 0.19996993837366603, + "flos": 16325997010560.0, + "grad_norm": 2.0672953846254027, + "language_loss": 0.86169922, + "learning_rate": 3.7045218680884984e-06, + "loss": 0.88347936, + "num_input_tokens_seen": 71863500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3326, + "time_per_iteration": 2.494718551635742 + }, + { + "auxiliary_loss_clip": 0.01138244, + "auxiliary_loss_mlp": 0.01037965, + "balance_loss_clip": 1.02243209, + "balance_loss_mlp": 1.04851878, + "epoch": 0.200030061626334, + "flos": 20844169159680.0, + "grad_norm": 1.8653522915536895, + "language_loss": 0.71835959, + "learning_rate": 3.7043181017810476e-06, + "loss": 0.74012172, + "num_input_tokens_seen": 71881845, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3327, + "time_per_iteration": 2.536574602127075 + }, + { + "auxiliary_loss_clip": 0.0114197, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02368546, + "balance_loss_mlp": 1.04750776, + "epoch": 0.20009018487900196, + "flos": 23762629198080.0, + "grad_norm": 1.83111198959611, + "language_loss": 0.76588571, + "learning_rate": 3.7041142708456833e-06, + "loss": 0.78772372, + "num_input_tokens_seen": 71900940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3328, + "time_per_iteration": 2.5083916187286377 + }, + { + "auxiliary_loss_clip": 0.01698253, + "auxiliary_loss_mlp": 0.01552284, + "balance_loss_clip": 1.52980089, + "balance_loss_mlp": 1.56677365, + "epoch": 0.20015030813166992, + "flos": 28106162236800.0, + "grad_norm": 1.6482454448342019, + "language_loss": 1.03044438, + "learning_rate": 3.7039103752901353e-06, + "loss": 0.7143048, + "num_input_tokens_seen": 71921925, + "router_z_loss_clip": 0.22460938, + "router_z_loss_mlp": 1.3125, + "step": 3329, + "time_per_iteration": 15.37552785873413 + }, + { + "auxiliary_loss_clip": 0.01146286, + "auxiliary_loss_mlp": 0.01050404, + "balance_loss_clip": 1.03149772, + "balance_loss_mlp": 1.0504123, + "epoch": 0.2002104313843379, + "flos": 26067160955520.0, + "grad_norm": 1.5519947176183269, + "language_loss": 0.81297028, + "learning_rate": 3.7037064151221353e-06, + "loss": 0.8349371, + "num_input_tokens_seen": 71941855, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.9609375, + "step": 3330, + "time_per_iteration": 2.500103712081909 + }, + { + "auxiliary_loss_clip": 0.01140997, + "auxiliary_loss_mlp": 0.01037259, + "balance_loss_clip": 1.01994956, + "balance_loss_mlp": 1.04669356, + "epoch": 0.20027055463700585, + "flos": 22966633854720.0, + "grad_norm": 2.032272994312633, + "language_loss": 0.76649368, + "learning_rate": 3.703502390349417e-06, + "loss": 0.78827626, + "num_input_tokens_seen": 71960915, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3331, + "time_per_iteration": 2.4018712043762207 + }, + { + "auxiliary_loss_clip": 0.01141733, + "auxiliary_loss_mlp": 0.01045779, + "balance_loss_clip": 1.02819538, + "balance_loss_mlp": 1.04608667, + "epoch": 0.20033067788967382, + "flos": 17165660313600.0, + "grad_norm": 1.6582018653132529, + "language_loss": 0.79261309, + "learning_rate": 3.7032983009797176e-06, + "loss": 0.81448817, + "num_input_tokens_seen": 71979220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3332, + "time_per_iteration": 2.4550859928131104 + }, + { + "auxiliary_loss_clip": 0.01045684, + "auxiliary_loss_mlp": 0.01005368, + "balance_loss_clip": 1.0036391, + "balance_loss_mlp": 1.01433849, + "epoch": 0.2003908011423418, + "flos": 60825566292480.0, + "grad_norm": 0.9315137515082259, + "language_loss": 0.61990142, + "learning_rate": 3.703094147020776e-06, + "loss": 0.64041197, + "num_input_tokens_seen": 72033950, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.31445312, + "step": 3333, + "time_per_iteration": 2.9623756408691406 + }, + { + "auxiliary_loss_clip": 0.01139681, + "auxiliary_loss_mlp": 0.01044905, + "balance_loss_clip": 1.02819228, + "balance_loss_mlp": 1.04501462, + "epoch": 0.20045092439500978, + "flos": 24206234163840.0, + "grad_norm": 2.1372355522021893, + "language_loss": 0.81203878, + "learning_rate": 3.7028899284803334e-06, + "loss": 0.8338846, + "num_input_tokens_seen": 72051395, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9453125, + "step": 3334, + "time_per_iteration": 2.49924373626709 + }, + { + "auxiliary_loss_clip": 0.01146523, + "auxiliary_loss_mlp": 0.01047388, + "balance_loss_clip": 1.02938735, + "balance_loss_mlp": 1.04878521, + "epoch": 0.20051104764767774, + "flos": 29387605075200.0, + "grad_norm": 2.1564721635267516, + "language_loss": 0.74261904, + "learning_rate": 3.702685645366134e-06, + "loss": 0.76455814, + "num_input_tokens_seen": 72071305, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9765625, + "step": 3335, + "time_per_iteration": 2.634608745574951 + }, + { + "auxiliary_loss_clip": 0.01150022, + "auxiliary_loss_mlp": 0.01058924, + "balance_loss_clip": 1.04205632, + "balance_loss_mlp": 1.05375338, + "epoch": 0.2005711709003457, + "flos": 23513804709120.0, + "grad_norm": 1.6943946878944693, + "language_loss": 0.79839814, + "learning_rate": 3.7024812976859243e-06, + "loss": 0.82048762, + "num_input_tokens_seen": 72090165, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3336, + "time_per_iteration": 2.7025394439697266 + }, + { + "auxiliary_loss_clip": 0.01145798, + "auxiliary_loss_mlp": 0.01045992, + "balance_loss_clip": 1.02744317, + "balance_loss_mlp": 1.04703879, + "epoch": 0.20063129415301367, + "flos": 22523388024960.0, + "grad_norm": 1.9043375292422164, + "language_loss": 0.78031212, + "learning_rate": 3.7022768854474532e-06, + "loss": 0.80223, + "num_input_tokens_seen": 72107210, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98828125, + "step": 3337, + "time_per_iteration": 2.5718014240264893 + }, + { + "auxiliary_loss_clip": 0.01143827, + "auxiliary_loss_mlp": 0.01045572, + "balance_loss_clip": 1.02708244, + "balance_loss_mlp": 1.0486424, + "epoch": 0.20069141740568164, + "flos": 25958243940480.0, + "grad_norm": 1.9983960159800889, + "language_loss": 0.6873948, + "learning_rate": 3.7020724086584724e-06, + "loss": 0.70928884, + "num_input_tokens_seen": 72126315, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.94921875, + "step": 3338, + "time_per_iteration": 2.5848047733306885 + }, + { + "auxiliary_loss_clip": 0.01143098, + "auxiliary_loss_mlp": 0.01049172, + "balance_loss_clip": 1.03263819, + "balance_loss_mlp": 1.04853702, + "epoch": 0.2007515406583496, + "flos": 24790608529920.0, + "grad_norm": 2.1061075345379576, + "language_loss": 0.68823779, + "learning_rate": 3.701867867326735e-06, + "loss": 0.71016049, + "num_input_tokens_seen": 72146470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9453125, + "step": 3339, + "time_per_iteration": 2.523771047592163 + }, + { + "auxiliary_loss_clip": 0.01149874, + "auxiliary_loss_mlp": 0.01038245, + "balance_loss_clip": 1.02217603, + "balance_loss_mlp": 1.05197799, + "epoch": 0.2008116639110176, + "flos": 37925582123520.0, + "grad_norm": 2.3080693694415872, + "language_loss": 0.66263533, + "learning_rate": 3.7016632614599974e-06, + "loss": 0.68451655, + "num_input_tokens_seen": 72166600, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9765625, + "step": 3340, + "time_per_iteration": 2.647495985031128 + }, + { + "auxiliary_loss_clip": 0.01141947, + "auxiliary_loss_mlp": 0.0103392, + "balance_loss_clip": 1.01570475, + "balance_loss_mlp": 1.0457145, + "epoch": 0.20087178716368556, + "flos": 20740531443840.0, + "grad_norm": 2.8472305033219696, + "language_loss": 0.74124628, + "learning_rate": 3.701458591066019e-06, + "loss": 0.76300496, + "num_input_tokens_seen": 72185160, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9609375, + "step": 3341, + "time_per_iteration": 2.511585235595703 + }, + { + "auxiliary_loss_clip": 0.01140464, + "auxiliary_loss_mlp": 0.01043131, + "balance_loss_clip": 1.02689481, + "balance_loss_mlp": 1.04846787, + "epoch": 0.20093191041635353, + "flos": 23842279607040.0, + "grad_norm": 2.1698717951472326, + "language_loss": 0.71578503, + "learning_rate": 3.70125385615256e-06, + "loss": 0.73762101, + "num_input_tokens_seen": 72205160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3342, + "time_per_iteration": 2.561998128890991 + }, + { + "auxiliary_loss_clip": 0.01142187, + "auxiliary_loss_mlp": 0.01045325, + "balance_loss_clip": 1.02871895, + "balance_loss_mlp": 1.04746354, + "epoch": 0.2009920336690215, + "flos": 21792067119360.0, + "grad_norm": 1.9864957062525024, + "language_loss": 0.73130047, + "learning_rate": 3.701049056727384e-06, + "loss": 0.75317556, + "num_input_tokens_seen": 72223555, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3343, + "time_per_iteration": 4.046127557754517 + }, + { + "auxiliary_loss_clip": 0.01142173, + "auxiliary_loss_mlp": 0.01050047, + "balance_loss_clip": 1.03252363, + "balance_loss_mlp": 1.04738092, + "epoch": 0.20105215692168946, + "flos": 26359222440960.0, + "grad_norm": 1.9813453341923526, + "language_loss": 0.81026411, + "learning_rate": 3.7008441927982574e-06, + "loss": 0.83218634, + "num_input_tokens_seen": 72242465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94921875, + "step": 3344, + "time_per_iteration": 2.520765542984009 + }, + { + "auxiliary_loss_clip": 0.01141139, + "auxiliary_loss_mlp": 0.01050367, + "balance_loss_clip": 1.03412437, + "balance_loss_mlp": 1.04661858, + "epoch": 0.20111228017435742, + "flos": 18807280617600.0, + "grad_norm": 2.7491478080862684, + "language_loss": 0.83503234, + "learning_rate": 3.700639264372948e-06, + "loss": 0.85694736, + "num_input_tokens_seen": 72260655, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3345, + "time_per_iteration": 4.064355373382568 + }, + { + "auxiliary_loss_clip": 0.01135224, + "auxiliary_loss_mlp": 0.01041726, + "balance_loss_clip": 1.02689624, + "balance_loss_mlp": 1.0464828, + "epoch": 0.20117240342702541, + "flos": 19975059682560.0, + "grad_norm": 1.723487885242635, + "language_loss": 0.67909771, + "learning_rate": 3.7004342714592283e-06, + "loss": 0.70086718, + "num_input_tokens_seen": 72279055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.88671875, + "step": 3346, + "time_per_iteration": 2.521949291229248 + }, + { + "auxiliary_loss_clip": 0.01140579, + "auxiliary_loss_mlp": 0.01048866, + "balance_loss_clip": 1.03233206, + "balance_loss_mlp": 1.04726124, + "epoch": 0.20123252667969338, + "flos": 23142703345920.0, + "grad_norm": 2.272845003166824, + "language_loss": 0.73496711, + "learning_rate": 3.70022921406487e-06, + "loss": 0.75686157, + "num_input_tokens_seen": 72297895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3347, + "time_per_iteration": 2.5316877365112305 + }, + { + "auxiliary_loss_clip": 0.01140927, + "auxiliary_loss_mlp": 0.01047237, + "balance_loss_clip": 1.03179908, + "balance_loss_mlp": 1.04827023, + "epoch": 0.20129264993236134, + "flos": 23221671396480.0, + "grad_norm": 1.7467826588499227, + "language_loss": 0.86716485, + "learning_rate": 3.70002409219765e-06, + "loss": 0.88904649, + "num_input_tokens_seen": 72318385, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.92578125, + "step": 3348, + "time_per_iteration": 2.5123202800750732 + }, + { + "auxiliary_loss_clip": 0.01138726, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.02335036, + "balance_loss_mlp": 1.04729295, + "epoch": 0.2013527731850293, + "flos": 21871466133120.0, + "grad_norm": 1.5886148695932183, + "language_loss": 0.71200913, + "learning_rate": 3.699818905865346e-06, + "loss": 0.73381227, + "num_input_tokens_seen": 72338235, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 3349, + "time_per_iteration": 2.505594491958618 + }, + { + "auxiliary_loss_clip": 0.01144556, + "auxiliary_loss_mlp": 0.01048519, + "balance_loss_clip": 1.03016067, + "balance_loss_mlp": 1.04982185, + "epoch": 0.20141289643769728, + "flos": 18040803275520.0, + "grad_norm": 1.649154800785762, + "language_loss": 0.71079665, + "learning_rate": 3.6996136550757377e-06, + "loss": 0.73272741, + "num_input_tokens_seen": 72357825, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9453125, + "step": 3350, + "time_per_iteration": 2.4927315711975098 + }, + { + "auxiliary_loss_clip": 0.01145933, + "auxiliary_loss_mlp": 0.01044553, + "balance_loss_clip": 1.02612305, + "balance_loss_mlp": 1.05045485, + "epoch": 0.20147301969036524, + "flos": 23951412103680.0, + "grad_norm": 3.2873247390310554, + "language_loss": 0.76327842, + "learning_rate": 3.69940833983661e-06, + "loss": 0.78518331, + "num_input_tokens_seen": 72376335, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.953125, + "step": 3351, + "time_per_iteration": 2.5077342987060547 + }, + { + "auxiliary_loss_clip": 0.01146641, + "auxiliary_loss_mlp": 0.01043619, + "balance_loss_clip": 1.02555871, + "balance_loss_mlp": 1.05069637, + "epoch": 0.2015331429430332, + "flos": 25588471380480.0, + "grad_norm": 1.662758000066145, + "language_loss": 0.80545723, + "learning_rate": 3.699202960155748e-06, + "loss": 0.8273598, + "num_input_tokens_seen": 72395440, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3352, + "time_per_iteration": 2.5717766284942627 + }, + { + "auxiliary_loss_clip": 0.01146315, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02274823, + "balance_loss_mlp": 1.05210721, + "epoch": 0.2015932661957012, + "flos": 26724972677760.0, + "grad_norm": 1.7179856660366186, + "language_loss": 0.8027631, + "learning_rate": 3.6989975160409396e-06, + "loss": 0.82462192, + "num_input_tokens_seen": 72414670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3353, + "time_per_iteration": 2.6415467262268066 + }, + { + "auxiliary_loss_clip": 0.01140403, + "auxiliary_loss_mlp": 0.01041635, + "balance_loss_clip": 1.02512455, + "balance_loss_mlp": 1.04978478, + "epoch": 0.20165338944836916, + "flos": 15633136592640.0, + "grad_norm": 2.050762039112588, + "language_loss": 0.8946988, + "learning_rate": 3.6987920074999747e-06, + "loss": 0.91651917, + "num_input_tokens_seen": 72432210, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 3354, + "time_per_iteration": 2.4780237674713135 + }, + { + "auxiliary_loss_clip": 0.01052075, + "auxiliary_loss_mlp": 0.01011403, + "balance_loss_clip": 1.00948358, + "balance_loss_mlp": 1.0202148, + "epoch": 0.20171351270103713, + "flos": 57912529207680.0, + "grad_norm": 0.830112597874188, + "language_loss": 0.55839282, + "learning_rate": 3.6985864345406465e-06, + "loss": 0.57902759, + "num_input_tokens_seen": 72489225, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.31835938, + "step": 3355, + "time_per_iteration": 3.0224292278289795 + }, + { + "auxiliary_loss_clip": 0.01140957, + "auxiliary_loss_mlp": 0.01045212, + "balance_loss_clip": 1.02891648, + "balance_loss_mlp": 1.05068707, + "epoch": 0.2017736359537051, + "flos": 20814363849600.0, + "grad_norm": 1.5257876958196368, + "language_loss": 0.84076762, + "learning_rate": 3.698380797170751e-06, + "loss": 0.86262929, + "num_input_tokens_seen": 72508715, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3356, + "time_per_iteration": 2.510615348815918 + }, + { + "auxiliary_loss_clip": 0.01152963, + "auxiliary_loss_mlp": 0.01043363, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.05356848, + "epoch": 0.20183375920637306, + "flos": 17092043389440.0, + "grad_norm": 2.9361880537925584, + "language_loss": 0.688007, + "learning_rate": 3.698175095398085e-06, + "loss": 0.70997024, + "num_input_tokens_seen": 72525135, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9921875, + "step": 3357, + "time_per_iteration": 2.460022211074829 + }, + { + "auxiliary_loss_clip": 0.01144866, + "auxiliary_loss_mlp": 0.01040819, + "balance_loss_clip": 1.02280617, + "balance_loss_mlp": 1.0492487, + "epoch": 0.20189388245904102, + "flos": 18661339658880.0, + "grad_norm": 1.7490617907772006, + "language_loss": 0.71748042, + "learning_rate": 3.6979693292304493e-06, + "loss": 0.73933733, + "num_input_tokens_seen": 72543690, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.95703125, + "step": 3358, + "time_per_iteration": 2.563767194747925 + }, + { + "auxiliary_loss_clip": 0.01139733, + "auxiliary_loss_mlp": 0.01054955, + "balance_loss_clip": 1.03818202, + "balance_loss_mlp": 1.04849517, + "epoch": 0.20195400571170902, + "flos": 16797539779200.0, + "grad_norm": 2.042998238377631, + "language_loss": 0.83104217, + "learning_rate": 3.6977634986756463e-06, + "loss": 0.85298896, + "num_input_tokens_seen": 72560725, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3359, + "time_per_iteration": 2.531332015991211 + }, + { + "auxiliary_loss_clip": 0.01052883, + "auxiliary_loss_mlp": 0.01001012, + "balance_loss_clip": 0.99911654, + "balance_loss_mlp": 1.02214265, + "epoch": 0.20201412896437698, + "flos": 67174716268800.0, + "grad_norm": 12.853939959466139, + "language_loss": 0.5895561, + "learning_rate": 3.697557603741482e-06, + "loss": 0.61009508, + "num_input_tokens_seen": 72621940, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30859375, + "step": 3360, + "time_per_iteration": 3.0536341667175293 + }, + { + "auxiliary_loss_clip": 0.01147837, + "auxiliary_loss_mlp": 0.01049077, + "balance_loss_clip": 1.03117216, + "balance_loss_mlp": 1.05149043, + "epoch": 0.20207425221704495, + "flos": 21325013550720.0, + "grad_norm": 2.4416015649532286, + "language_loss": 0.62138069, + "learning_rate": 3.697351644435763e-06, + "loss": 0.64334983, + "num_input_tokens_seen": 72639135, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3361, + "time_per_iteration": 2.512657403945923 + }, + { + "auxiliary_loss_clip": 0.0114522, + "auxiliary_loss_mlp": 0.01055979, + "balance_loss_clip": 1.03900385, + "balance_loss_mlp": 1.05156183, + "epoch": 0.2021343754697129, + "flos": 22527158952960.0, + "grad_norm": 2.0025961231737526, + "language_loss": 0.75524926, + "learning_rate": 3.6971456207662993e-06, + "loss": 0.77726126, + "num_input_tokens_seen": 72658525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3362, + "time_per_iteration": 2.555492639541626 + }, + { + "auxiliary_loss_clip": 0.01145631, + "auxiliary_loss_mlp": 0.01046193, + "balance_loss_clip": 1.02926481, + "balance_loss_mlp": 1.05209327, + "epoch": 0.20219449872238088, + "flos": 19062785036160.0, + "grad_norm": 1.6135185744423872, + "language_loss": 0.76400363, + "learning_rate": 3.6969395327409035e-06, + "loss": 0.78592181, + "num_input_tokens_seen": 72678085, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9375, + "step": 3363, + "time_per_iteration": 2.486969470977783 + }, + { + "auxiliary_loss_clip": 0.01141408, + "auxiliary_loss_mlp": 0.01052858, + "balance_loss_clip": 1.03686023, + "balance_loss_mlp": 1.04736471, + "epoch": 0.20225462197504884, + "flos": 24717027519360.0, + "grad_norm": 2.0495916908721434, + "language_loss": 0.74606001, + "learning_rate": 3.696733380367391e-06, + "loss": 0.76800275, + "num_input_tokens_seen": 72698695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9375, + "step": 3364, + "time_per_iteration": 2.58673095703125 + }, + { + "auxiliary_loss_clip": 0.01144028, + "auxiliary_loss_mlp": 0.01052057, + "balance_loss_clip": 1.03390145, + "balance_loss_mlp": 1.04865253, + "epoch": 0.2023147452277168, + "flos": 22018304931840.0, + "grad_norm": 2.1992700083841084, + "language_loss": 0.71451771, + "learning_rate": 3.6965271636535783e-06, + "loss": 0.73647857, + "num_input_tokens_seen": 72717880, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3365, + "time_per_iteration": 2.522470712661743 + }, + { + "auxiliary_loss_clip": 0.01147339, + "auxiliary_loss_mlp": 0.01052179, + "balance_loss_clip": 1.03516757, + "balance_loss_mlp": 1.05331004, + "epoch": 0.2023748684803848, + "flos": 17745365911680.0, + "grad_norm": 1.9561618637344158, + "language_loss": 0.85770535, + "learning_rate": 3.696320882607286e-06, + "loss": 0.87970054, + "num_input_tokens_seen": 72736410, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94140625, + "step": 3366, + "time_per_iteration": 2.536529541015625 + }, + { + "auxiliary_loss_clip": 0.01143453, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.0499506, + "epoch": 0.20243499173305277, + "flos": 31138932493440.0, + "grad_norm": 1.628387041142295, + "language_loss": 0.69651556, + "learning_rate": 3.696114537236335e-06, + "loss": 0.7183941, + "num_input_tokens_seen": 72758295, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3367, + "time_per_iteration": 2.5608372688293457 + }, + { + "auxiliary_loss_clip": 0.01145892, + "auxiliary_loss_mlp": 0.01043195, + "balance_loss_clip": 1.0235498, + "balance_loss_mlp": 1.04696274, + "epoch": 0.20249511498572073, + "flos": 33839235279360.0, + "grad_norm": 2.963599898430263, + "language_loss": 0.68230569, + "learning_rate": 3.6959081275485512e-06, + "loss": 0.70419657, + "num_input_tokens_seen": 72782495, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.98828125, + "step": 3368, + "time_per_iteration": 2.66802978515625 + }, + { + "auxiliary_loss_clip": 0.01143607, + "auxiliary_loss_mlp": 0.01049214, + "balance_loss_clip": 1.03178596, + "balance_loss_mlp": 1.0505259, + "epoch": 0.2025552382383887, + "flos": 21215629658880.0, + "grad_norm": 7.849671101524798, + "language_loss": 0.77025628, + "learning_rate": 3.6957016535517615e-06, + "loss": 0.79218459, + "num_input_tokens_seen": 72801885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3369, + "time_per_iteration": 2.5143446922302246 + }, + { + "auxiliary_loss_clip": 0.01145287, + "auxiliary_loss_mlp": 0.01057318, + "balance_loss_clip": 1.04029489, + "balance_loss_mlp": 1.04800487, + "epoch": 0.20261536149105666, + "flos": 14647388676480.0, + "grad_norm": 4.298107611861754, + "language_loss": 0.65408337, + "learning_rate": 3.695495115253795e-06, + "loss": 0.67610943, + "num_input_tokens_seen": 72816990, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.97265625, + "step": 3370, + "time_per_iteration": 2.503589630126953 + }, + { + "auxiliary_loss_clip": 0.01048919, + "auxiliary_loss_mlp": 0.01024768, + "balance_loss_clip": 1.02313519, + "balance_loss_mlp": 1.01856685, + "epoch": 0.20267548474372463, + "flos": 66783649921920.0, + "grad_norm": 0.6799262329378595, + "language_loss": 0.58101869, + "learning_rate": 3.6952885126624834e-06, + "loss": 0.60175562, + "num_input_tokens_seen": 72879240, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.3046875, + "step": 3371, + "time_per_iteration": 3.1626369953155518 + }, + { + "auxiliary_loss_clip": 0.01143688, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.0254668, + "balance_loss_mlp": 1.04866266, + "epoch": 0.2027356079963926, + "flos": 24680793674880.0, + "grad_norm": 1.766606164011739, + "language_loss": 0.92068136, + "learning_rate": 3.6950818457856617e-06, + "loss": 0.94254309, + "num_input_tokens_seen": 72899030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.94921875, + "step": 3372, + "time_per_iteration": 2.578045129776001 + }, + { + "auxiliary_loss_clip": 0.0114549, + "auxiliary_loss_mlp": 0.01044018, + "balance_loss_clip": 1.02538514, + "balance_loss_mlp": 1.05037856, + "epoch": 0.20279573124906058, + "flos": 26392762765440.0, + "grad_norm": 1.6491924635250923, + "language_loss": 0.78632712, + "learning_rate": 3.694875114631167e-06, + "loss": 0.80822217, + "num_input_tokens_seen": 72919190, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 3373, + "time_per_iteration": 2.5762507915496826 + }, + { + "auxiliary_loss_clip": 0.01137806, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.04629672, + "epoch": 0.20285585450172855, + "flos": 33799984692480.0, + "grad_norm": 1.8751465027713456, + "language_loss": 0.71102971, + "learning_rate": 3.6946683192068377e-06, + "loss": 0.73280156, + "num_input_tokens_seen": 72939720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 3374, + "time_per_iteration": 2.6212260723114014 + }, + { + "auxiliary_loss_clip": 0.01048807, + "auxiliary_loss_mlp": 0.01001811, + "balance_loss_clip": 1.00001132, + "balance_loss_mlp": 1.01811993, + "epoch": 0.20291597775439651, + "flos": 71164823598720.0, + "grad_norm": 0.9912238676598704, + "language_loss": 0.62450445, + "learning_rate": 3.694461459520516e-06, + "loss": 0.64501071, + "num_input_tokens_seen": 73000015, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.30859375, + "step": 3375, + "time_per_iteration": 3.0768048763275146 + }, + { + "auxiliary_loss_clip": 0.01140549, + "auxiliary_loss_mlp": 0.01044631, + "balance_loss_clip": 1.02722621, + "balance_loss_mlp": 1.04769731, + "epoch": 0.20297610100706448, + "flos": 19494287118720.0, + "grad_norm": 1.6669967725054042, + "language_loss": 0.82450807, + "learning_rate": 3.6942545355800463e-06, + "loss": 0.84635985, + "num_input_tokens_seen": 73017675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3376, + "time_per_iteration": 2.5632758140563965 + }, + { + "auxiliary_loss_clip": 0.011433, + "auxiliary_loss_mlp": 0.01039932, + "balance_loss_clip": 1.02110839, + "balance_loss_mlp": 1.04692364, + "epoch": 0.20303622425973245, + "flos": 25044245441280.0, + "grad_norm": 2.2640770034372006, + "language_loss": 0.81587797, + "learning_rate": 3.6940475473932743e-06, + "loss": 0.83771032, + "num_input_tokens_seen": 73036135, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.96484375, + "step": 3377, + "time_per_iteration": 2.6376402378082275 + }, + { + "auxiliary_loss_clip": 0.01139097, + "auxiliary_loss_mlp": 0.01045773, + "balance_loss_clip": 1.02786779, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2030963475124004, + "flos": 21979988098560.0, + "grad_norm": 4.046949512949318, + "language_loss": 0.769104, + "learning_rate": 3.69384049496805e-06, + "loss": 0.79095268, + "num_input_tokens_seen": 73054075, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3378, + "time_per_iteration": 2.532942056655884 + }, + { + "auxiliary_loss_clip": 0.01143396, + "auxiliary_loss_mlp": 0.01043534, + "balance_loss_clip": 1.02493691, + "balance_loss_mlp": 1.04772687, + "epoch": 0.2031564707650684, + "flos": 19500392430720.0, + "grad_norm": 1.9870266088444717, + "language_loss": 0.79710048, + "learning_rate": 3.6936333783122242e-06, + "loss": 0.81896979, + "num_input_tokens_seen": 73073530, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.95703125, + "step": 3379, + "time_per_iteration": 2.5187509059906006 + }, + { + "auxiliary_loss_clip": 0.01137083, + "auxiliary_loss_mlp": 0.01038348, + "balance_loss_clip": 1.02162337, + "balance_loss_mlp": 1.04698288, + "epoch": 0.20321659401773637, + "flos": 22747075971840.0, + "grad_norm": 1.7003196517483214, + "language_loss": 0.86949915, + "learning_rate": 3.6934261974336505e-06, + "loss": 0.89125347, + "num_input_tokens_seen": 73092820, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3380, + "time_per_iteration": 2.5350420475006104 + }, + { + "auxiliary_loss_clip": 0.01143485, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.02905154, + "balance_loss_mlp": 1.05103135, + "epoch": 0.20327671727040433, + "flos": 22455840499200.0, + "grad_norm": 1.9133898096862498, + "language_loss": 0.74515057, + "learning_rate": 3.693218952340186e-06, + "loss": 0.76705158, + "num_input_tokens_seen": 73113385, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3381, + "time_per_iteration": 2.5428466796875 + }, + { + "auxiliary_loss_clip": 0.01143807, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.0258193, + "balance_loss_mlp": 1.04754519, + "epoch": 0.2033368405230723, + "flos": 19535010163200.0, + "grad_norm": 1.741042372938858, + "language_loss": 0.79304886, + "learning_rate": 3.6930116430396895e-06, + "loss": 0.81492472, + "num_input_tokens_seen": 73131195, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3382, + "time_per_iteration": 2.51084041595459 + }, + { + "auxiliary_loss_clip": 0.01146625, + "auxiliary_loss_mlp": 0.01040796, + "balance_loss_clip": 1.02123427, + "balance_loss_mlp": 1.04849267, + "epoch": 0.20339696377574026, + "flos": 13809233744640.0, + "grad_norm": 1.8514394244027284, + "language_loss": 0.80188596, + "learning_rate": 3.6928042695400214e-06, + "loss": 0.82376015, + "num_input_tokens_seen": 73148850, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.98046875, + "step": 3383, + "time_per_iteration": 2.5047500133514404 + }, + { + "auxiliary_loss_clip": 0.0113964, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02401257, + "balance_loss_mlp": 1.04616201, + "epoch": 0.20345708702840823, + "flos": 20339409288960.0, + "grad_norm": 6.482166974991387, + "language_loss": 0.74195492, + "learning_rate": 3.6925968318490464e-06, + "loss": 0.76377177, + "num_input_tokens_seen": 73166775, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3384, + "time_per_iteration": 2.4931931495666504 + }, + { + "auxiliary_loss_clip": 0.01147866, + "auxiliary_loss_mlp": 0.01043488, + "balance_loss_clip": 1.02442586, + "balance_loss_mlp": 1.04929996, + "epoch": 0.2035172102810762, + "flos": 20333950421760.0, + "grad_norm": 2.292912234818254, + "language_loss": 0.76429737, + "learning_rate": 3.6923893299746293e-06, + "loss": 0.78621089, + "num_input_tokens_seen": 73183215, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.984375, + "step": 3385, + "time_per_iteration": 3.9999845027923584 + }, + { + "auxiliary_loss_clip": 0.01139546, + "auxiliary_loss_mlp": 0.01058955, + "balance_loss_clip": 1.04031098, + "balance_loss_mlp": 1.04538202, + "epoch": 0.2035773335337442, + "flos": 23330983461120.0, + "grad_norm": 1.8347755395186154, + "language_loss": 0.68259251, + "learning_rate": 3.692181763924639e-06, + "loss": 0.70457751, + "num_input_tokens_seen": 73203290, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 3386, + "time_per_iteration": 2.525538682937622 + }, + { + "auxiliary_loss_clip": 0.01143921, + "auxiliary_loss_mlp": 0.01054172, + "balance_loss_clip": 1.0348835, + "balance_loss_mlp": 1.04785144, + "epoch": 0.20363745678641215, + "flos": 28330287310080.0, + "grad_norm": 1.949323793812955, + "language_loss": 0.81000078, + "learning_rate": 3.691974133706947e-06, + "loss": 0.83198166, + "num_input_tokens_seen": 73226185, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.9609375, + "step": 3387, + "time_per_iteration": 4.122355222702026 + }, + { + "auxiliary_loss_clip": 0.01137445, + "auxiliary_loss_mlp": 0.01040694, + "balance_loss_clip": 1.02331305, + "balance_loss_mlp": 1.04754424, + "epoch": 0.20369758003908012, + "flos": 18915658928640.0, + "grad_norm": 2.869822824167972, + "language_loss": 0.79960001, + "learning_rate": 3.6917664393294262e-06, + "loss": 0.82138139, + "num_input_tokens_seen": 73243300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 3388, + "time_per_iteration": 2.498455047607422 + }, + { + "auxiliary_loss_clip": 0.01142619, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02120411, + "balance_loss_mlp": 1.04757476, + "epoch": 0.20375770329174808, + "flos": 19206499351680.0, + "grad_norm": 1.6489636222716584, + "language_loss": 0.71810246, + "learning_rate": 3.6915586807999527e-06, + "loss": 0.73992884, + "num_input_tokens_seen": 73261490, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.94921875, + "step": 3389, + "time_per_iteration": 2.4751241207122803 + }, + { + "auxiliary_loss_clip": 0.01140457, + "auxiliary_loss_mlp": 0.01048463, + "balance_loss_clip": 1.03108239, + "balance_loss_mlp": 1.04812241, + "epoch": 0.20381782654441605, + "flos": 19391008538880.0, + "grad_norm": 1.7476252287205662, + "language_loss": 0.87431413, + "learning_rate": 3.691350858126404e-06, + "loss": 0.89620328, + "num_input_tokens_seen": 73280180, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3390, + "time_per_iteration": 2.5229172706604004 + }, + { + "auxiliary_loss_clip": 0.01138893, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02673888, + "balance_loss_mlp": 1.04638386, + "epoch": 0.203877949797084, + "flos": 24827704300800.0, + "grad_norm": 3.0399462437196743, + "language_loss": 0.71092427, + "learning_rate": 3.691142971316662e-06, + "loss": 0.73275584, + "num_input_tokens_seen": 73300680, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.92578125, + "step": 3391, + "time_per_iteration": 2.528003454208374 + }, + { + "auxiliary_loss_clip": 0.01137362, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02592552, + "balance_loss_mlp": 1.04483938, + "epoch": 0.20393807304975198, + "flos": 18003707504640.0, + "grad_norm": 2.517550673127581, + "language_loss": 0.85993969, + "learning_rate": 3.6909350203786086e-06, + "loss": 0.88174999, + "num_input_tokens_seen": 73316760, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3392, + "time_per_iteration": 2.5080008506774902 + }, + { + "auxiliary_loss_clip": 0.01143294, + "auxiliary_loss_mlp": 0.01049793, + "balance_loss_clip": 1.03231716, + "balance_loss_mlp": 1.04759896, + "epoch": 0.20399819630241997, + "flos": 24206988349440.0, + "grad_norm": 1.5067582134175779, + "language_loss": 0.80730146, + "learning_rate": 3.69072700532013e-06, + "loss": 0.82923234, + "num_input_tokens_seen": 73339385, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.95703125, + "step": 3393, + "time_per_iteration": 2.5464906692504883 + }, + { + "auxiliary_loss_clip": 0.01139211, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02236915, + "balance_loss_mlp": 1.0471251, + "epoch": 0.20405831955508794, + "flos": 20777124424320.0, + "grad_norm": 1.882536464234473, + "language_loss": 0.86276352, + "learning_rate": 3.6905189261491137e-06, + "loss": 0.88454658, + "num_input_tokens_seen": 73357235, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3394, + "time_per_iteration": 2.495544195175171 + }, + { + "auxiliary_loss_clip": 0.01139364, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02640033, + "balance_loss_mlp": 1.04756498, + "epoch": 0.2041184428077559, + "flos": 15486908325120.0, + "grad_norm": 2.9880936155816324, + "language_loss": 0.83455038, + "learning_rate": 3.69031078287345e-06, + "loss": 0.85637033, + "num_input_tokens_seen": 73374435, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3395, + "time_per_iteration": 2.4636099338531494 + }, + { + "auxiliary_loss_clip": 0.01144564, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.01753616, + "balance_loss_mlp": 1.04799199, + "epoch": 0.20417856606042387, + "flos": 15588463052160.0, + "grad_norm": 2.0105247570422877, + "language_loss": 0.83632553, + "learning_rate": 3.690102575501033e-06, + "loss": 0.85812247, + "num_input_tokens_seen": 73391025, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96875, + "step": 3396, + "time_per_iteration": 2.507140636444092 + }, + { + "auxiliary_loss_clip": 0.01139778, + "auxiliary_loss_mlp": 0.01042511, + "balance_loss_clip": 1.02470088, + "balance_loss_mlp": 1.04775488, + "epoch": 0.20423868931309183, + "flos": 24279348297600.0, + "grad_norm": 1.9261630392212734, + "language_loss": 0.77139032, + "learning_rate": 3.6898943040397556e-06, + "loss": 0.79321325, + "num_input_tokens_seen": 73409270, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91796875, + "step": 3397, + "time_per_iteration": 2.5000061988830566 + }, + { + "auxiliary_loss_clip": 0.01140053, + "auxiliary_loss_mlp": 0.01043864, + "balance_loss_clip": 1.027771, + "balance_loss_mlp": 1.0482713, + "epoch": 0.2042988125657598, + "flos": 18614870438400.0, + "grad_norm": 2.6022565941655285, + "language_loss": 0.87048233, + "learning_rate": 3.689685968497518e-06, + "loss": 0.89232147, + "num_input_tokens_seen": 73425225, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3398, + "time_per_iteration": 2.4879262447357178 + }, + { + "auxiliary_loss_clip": 0.01146457, + "auxiliary_loss_mlp": 0.01045529, + "balance_loss_clip": 1.02855396, + "balance_loss_mlp": 1.05200124, + "epoch": 0.2043589358184278, + "flos": 17851230270720.0, + "grad_norm": 2.0446998950436273, + "language_loss": 0.77973163, + "learning_rate": 3.6894775688822186e-06, + "loss": 0.8016516, + "num_input_tokens_seen": 73440940, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3399, + "time_per_iteration": 2.4417104721069336 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01039697, + "balance_loss_clip": 1.02180338, + "balance_loss_mlp": 1.0471437, + "epoch": 0.20441905907109575, + "flos": 21435223455360.0, + "grad_norm": 1.9372936252349278, + "language_loss": 0.76201475, + "learning_rate": 3.6892691052017603e-06, + "loss": 0.78383702, + "num_input_tokens_seen": 73458805, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.953125, + "step": 3400, + "time_per_iteration": 2.513378858566284 + }, + { + "auxiliary_loss_clip": 0.01140509, + "auxiliary_loss_mlp": 0.01043924, + "balance_loss_clip": 1.02709138, + "balance_loss_mlp": 1.04937315, + "epoch": 0.20447918232376372, + "flos": 27707703851520.0, + "grad_norm": 1.6590163779918286, + "language_loss": 0.79357922, + "learning_rate": 3.6890605774640487e-06, + "loss": 0.81542361, + "num_input_tokens_seen": 73479380, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 3401, + "time_per_iteration": 2.5628185272216797 + }, + { + "auxiliary_loss_clip": 0.01141107, + "auxiliary_loss_mlp": 0.01041447, + "balance_loss_clip": 1.02400649, + "balance_loss_mlp": 1.04659653, + "epoch": 0.20453930557643168, + "flos": 30524214113280.0, + "grad_norm": 1.682072453203677, + "language_loss": 0.69205511, + "learning_rate": 3.688851985676991e-06, + "loss": 0.71388066, + "num_input_tokens_seen": 73505105, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3402, + "time_per_iteration": 2.653932571411133 + }, + { + "auxiliary_loss_clip": 0.01144935, + "auxiliary_loss_mlp": 0.01043349, + "balance_loss_clip": 1.02538395, + "balance_loss_mlp": 1.05008948, + "epoch": 0.20459942882909965, + "flos": 18987767481600.0, + "grad_norm": 2.6906490082479086, + "language_loss": 0.81077826, + "learning_rate": 3.688643329848496e-06, + "loss": 0.83266115, + "num_input_tokens_seen": 73523700, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3403, + "time_per_iteration": 2.518402099609375 + }, + { + "auxiliary_loss_clip": 0.01145331, + "auxiliary_loss_mlp": 0.0104575, + "balance_loss_clip": 1.02873933, + "balance_loss_mlp": 1.05067933, + "epoch": 0.20465955208176762, + "flos": 20339050152960.0, + "grad_norm": 1.7308307985558895, + "language_loss": 0.83497006, + "learning_rate": 3.6884346099864772e-06, + "loss": 0.85688084, + "num_input_tokens_seen": 73542625, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3404, + "time_per_iteration": 2.5041427612304688 + }, + { + "auxiliary_loss_clip": 0.0114107, + "auxiliary_loss_mlp": 0.0104714, + "balance_loss_clip": 1.03018808, + "balance_loss_mlp": 1.04686713, + "epoch": 0.20471967533443558, + "flos": 21251288885760.0, + "grad_norm": 1.717424757849508, + "language_loss": 0.86319768, + "learning_rate": 3.6882258260988487e-06, + "loss": 0.88507974, + "num_input_tokens_seen": 73561450, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3405, + "time_per_iteration": 2.5019404888153076 + }, + { + "auxiliary_loss_clip": 0.01138198, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02558827, + "balance_loss_mlp": 1.04664326, + "epoch": 0.20477979858710357, + "flos": 14501555458560.0, + "grad_norm": 2.0734152439752327, + "language_loss": 0.84731919, + "learning_rate": 3.6880169781935276e-06, + "loss": 0.86912251, + "num_input_tokens_seen": 73577155, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3406, + "time_per_iteration": 2.508274793624878 + }, + { + "auxiliary_loss_clip": 0.0114, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02601814, + "balance_loss_mlp": 1.04885817, + "epoch": 0.20483992183977154, + "flos": 11400310085760.0, + "grad_norm": 2.0579137112366332, + "language_loss": 0.68086451, + "learning_rate": 3.6878080662784336e-06, + "loss": 0.70268458, + "num_input_tokens_seen": 73594900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3407, + "time_per_iteration": 2.4675915241241455 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.0104729, + "balance_loss_clip": 1.03039861, + "balance_loss_mlp": 1.0469842, + "epoch": 0.2049000450924395, + "flos": 19060271084160.0, + "grad_norm": 2.4520435823789857, + "language_loss": 0.84025276, + "learning_rate": 3.6875990903614886e-06, + "loss": 0.86210054, + "num_input_tokens_seen": 73613810, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3408, + "time_per_iteration": 2.4996185302734375 + }, + { + "auxiliary_loss_clip": 0.01144748, + "auxiliary_loss_mlp": 0.01045034, + "balance_loss_clip": 1.02851176, + "balance_loss_mlp": 1.05156052, + "epoch": 0.20496016834510747, + "flos": 14574561851520.0, + "grad_norm": 2.726731275915995, + "language_loss": 0.64288676, + "learning_rate": 3.6873900504506166e-06, + "loss": 0.66478455, + "num_input_tokens_seen": 73631495, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3409, + "time_per_iteration": 2.469758987426758 + }, + { + "auxiliary_loss_clip": 0.01139054, + "auxiliary_loss_mlp": 0.01046007, + "balance_loss_clip": 1.0295676, + "balance_loss_mlp": 1.04638147, + "epoch": 0.20502029159777543, + "flos": 22126647329280.0, + "grad_norm": 1.319045584705984, + "language_loss": 0.80357087, + "learning_rate": 3.687180946553745e-06, + "loss": 0.82542145, + "num_input_tokens_seen": 73652840, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3410, + "time_per_iteration": 2.5167293548583984 + }, + { + "auxiliary_loss_clip": 0.01140553, + "auxiliary_loss_mlp": 0.01046311, + "balance_loss_clip": 1.0303905, + "balance_loss_mlp": 1.05014896, + "epoch": 0.2050804148504434, + "flos": 25367907916800.0, + "grad_norm": 2.259997857874164, + "language_loss": 0.75796056, + "learning_rate": 3.686971778678803e-06, + "loss": 0.7798292, + "num_input_tokens_seen": 73672150, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 3411, + "time_per_iteration": 2.5411264896392822 + }, + { + "auxiliary_loss_clip": 0.01144909, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.02817273, + "balance_loss_mlp": 1.05220985, + "epoch": 0.2051405381031114, + "flos": 23620171858560.0, + "grad_norm": 2.0004173274373183, + "language_loss": 0.73696554, + "learning_rate": 3.686762546833722e-06, + "loss": 0.75885755, + "num_input_tokens_seen": 73691940, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3412, + "time_per_iteration": 2.5047144889831543 + }, + { + "auxiliary_loss_clip": 0.01143761, + "auxiliary_loss_mlp": 0.01047167, + "balance_loss_clip": 1.03015614, + "balance_loss_mlp": 1.04735541, + "epoch": 0.20520066135577936, + "flos": 19565533745280.0, + "grad_norm": 2.0925027501904228, + "language_loss": 0.77863461, + "learning_rate": 3.6865532510264362e-06, + "loss": 0.8005439, + "num_input_tokens_seen": 73709080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.96484375, + "step": 3413, + "time_per_iteration": 2.5472991466522217 + }, + { + "auxiliary_loss_clip": 0.01138869, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02534604, + "balance_loss_mlp": 1.04989886, + "epoch": 0.20526078460844732, + "flos": 17676345928320.0, + "grad_norm": 1.912987525537943, + "language_loss": 0.84719825, + "learning_rate": 3.6863438912648823e-06, + "loss": 0.86901337, + "num_input_tokens_seen": 73727670, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3414, + "time_per_iteration": 2.478729724884033 + }, + { + "auxiliary_loss_clip": 0.01138295, + "auxiliary_loss_mlp": 0.0104162, + "balance_loss_clip": 1.02496636, + "balance_loss_mlp": 1.04659235, + "epoch": 0.2053209078611153, + "flos": 21500328856320.0, + "grad_norm": 1.9076108002018353, + "language_loss": 0.80448711, + "learning_rate": 3.6861344675569986e-06, + "loss": 0.82628626, + "num_input_tokens_seen": 73747170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3415, + "time_per_iteration": 2.5366415977478027 + }, + { + "auxiliary_loss_clip": 0.01137909, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02154934, + "balance_loss_mlp": 1.04796863, + "epoch": 0.20538103111378325, + "flos": 25663524848640.0, + "grad_norm": 1.7629792917286327, + "language_loss": 0.72893143, + "learning_rate": 3.6859249799107275e-06, + "loss": 0.75068092, + "num_input_tokens_seen": 73767690, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3416, + "time_per_iteration": 2.5656492710113525 + }, + { + "auxiliary_loss_clip": 0.01140135, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02520072, + "balance_loss_mlp": 1.04695165, + "epoch": 0.20544115436645122, + "flos": 23148952312320.0, + "grad_norm": 2.5523210605949425, + "language_loss": 0.78623438, + "learning_rate": 3.6857154283340115e-06, + "loss": 0.80805844, + "num_input_tokens_seen": 73786900, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3417, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01140114, + "auxiliary_loss_mlp": 0.01046708, + "balance_loss_clip": 1.02948236, + "balance_loss_mlp": 1.04842472, + "epoch": 0.20550127761911918, + "flos": 19390433921280.0, + "grad_norm": 2.178207343470702, + "language_loss": 0.87390542, + "learning_rate": 3.685505812834798e-06, + "loss": 0.89577365, + "num_input_tokens_seen": 73804515, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.91796875, + "step": 3418, + "time_per_iteration": 2.4900615215301514 + }, + { + "auxiliary_loss_clip": 0.01139839, + "auxiliary_loss_mlp": 0.0104158, + "balance_loss_clip": 1.0251534, + "balance_loss_mlp": 1.04798996, + "epoch": 0.20556140087178718, + "flos": 22893124671360.0, + "grad_norm": 2.115759049165993, + "language_loss": 0.62156075, + "learning_rate": 3.685296133421035e-06, + "loss": 0.64337492, + "num_input_tokens_seen": 73822910, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3419, + "time_per_iteration": 2.527057647705078 + }, + { + "auxiliary_loss_clip": 0.01143982, + "auxiliary_loss_mlp": 0.0104893, + "balance_loss_clip": 1.02977359, + "balance_loss_mlp": 1.04905963, + "epoch": 0.20562152412445514, + "flos": 19789652655360.0, + "grad_norm": 2.2865688080492466, + "language_loss": 0.86502206, + "learning_rate": 3.685086390100674e-06, + "loss": 0.88695121, + "num_input_tokens_seen": 73841160, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.953125, + "step": 3420, + "time_per_iteration": 2.532512664794922 + }, + { + "auxiliary_loss_clip": 0.01138181, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02533531, + "balance_loss_mlp": 1.04659796, + "epoch": 0.2056816473771231, + "flos": 31501989210240.0, + "grad_norm": 2.535685660701584, + "language_loss": 0.70904821, + "learning_rate": 3.684876582881668e-06, + "loss": 0.73084807, + "num_input_tokens_seen": 73862795, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91796875, + "step": 3421, + "time_per_iteration": 2.5924150943756104 + }, + { + "auxiliary_loss_clip": 0.0113664, + "auxiliary_loss_mlp": 0.01038524, + "balance_loss_clip": 1.02099967, + "balance_loss_mlp": 1.04581738, + "epoch": 0.20574177062979107, + "flos": 23258372117760.0, + "grad_norm": 3.5707952740494235, + "language_loss": 0.70370102, + "learning_rate": 3.6846667117719732e-06, + "loss": 0.72545266, + "num_input_tokens_seen": 73881525, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3422, + "time_per_iteration": 2.499041795730591 + }, + { + "auxiliary_loss_clip": 0.01060302, + "auxiliary_loss_mlp": 0.01012319, + "balance_loss_clip": 1.01001859, + "balance_loss_mlp": 1.02983248, + "epoch": 0.20580189388245904, + "flos": 70312518708480.0, + "grad_norm": 0.7605512778953217, + "language_loss": 0.55499864, + "learning_rate": 3.684456776779548e-06, + "loss": 0.57572484, + "num_input_tokens_seen": 73937775, + "router_z_loss_clip": 0.02294922, + "router_z_loss_mlp": 0.3046875, + "step": 3423, + "time_per_iteration": 3.1569108963012695 + }, + { + "auxiliary_loss_clip": 0.0114215, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02494931, + "balance_loss_mlp": 1.04882169, + "epoch": 0.205862017135127, + "flos": 30737846252160.0, + "grad_norm": 1.7754304652232902, + "language_loss": 0.71701574, + "learning_rate": 3.684246777912353e-06, + "loss": 0.73886526, + "num_input_tokens_seen": 73958250, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9296875, + "step": 3424, + "time_per_iteration": 2.58278751373291 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01046159, + "balance_loss_clip": 1.02920699, + "balance_loss_mlp": 1.05022514, + "epoch": 0.20592214038779497, + "flos": 21324546673920.0, + "grad_norm": 1.563470220797352, + "language_loss": 0.75031066, + "learning_rate": 3.684036715178351e-06, + "loss": 0.77218151, + "num_input_tokens_seen": 73977775, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3425, + "time_per_iteration": 2.518050193786621 + }, + { + "auxiliary_loss_clip": 0.01145974, + "auxiliary_loss_mlp": 0.01057037, + "balance_loss_clip": 1.0404191, + "balance_loss_mlp": 1.0545603, + "epoch": 0.20598226364046296, + "flos": 22891652213760.0, + "grad_norm": 1.8081006382856646, + "language_loss": 0.88246548, + "learning_rate": 3.683826588585508e-06, + "loss": 0.90449566, + "num_input_tokens_seen": 73996590, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3426, + "time_per_iteration": 2.5141823291778564 + }, + { + "auxiliary_loss_clip": 0.01139115, + "auxiliary_loss_mlp": 0.01046156, + "balance_loss_clip": 1.02927566, + "balance_loss_mlp": 1.04961991, + "epoch": 0.20604238689313092, + "flos": 23878549365120.0, + "grad_norm": 1.8273097367093476, + "language_loss": 0.76748925, + "learning_rate": 3.6836163981417926e-06, + "loss": 0.78934193, + "num_input_tokens_seen": 74015935, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3427, + "time_per_iteration": 4.068110227584839 + }, + { + "auxiliary_loss_clip": 0.01143208, + "auxiliary_loss_mlp": 0.01048853, + "balance_loss_clip": 1.03143609, + "balance_loss_mlp": 1.04978716, + "epoch": 0.2061025101457989, + "flos": 22491535639680.0, + "grad_norm": 1.6956079848027177, + "language_loss": 0.73914266, + "learning_rate": 3.683406143855174e-06, + "loss": 0.76106334, + "num_input_tokens_seen": 74036575, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3428, + "time_per_iteration": 2.5296199321746826 + }, + { + "auxiliary_loss_clip": 0.0113987, + "auxiliary_loss_mlp": 0.01049805, + "balance_loss_clip": 1.03188777, + "balance_loss_mlp": 1.04691577, + "epoch": 0.20616263339846685, + "flos": 22778928357120.0, + "grad_norm": 3.779292361126499, + "language_loss": 0.73553443, + "learning_rate": 3.6831958257336256e-06, + "loss": 0.75743121, + "num_input_tokens_seen": 74055365, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3429, + "time_per_iteration": 3.979640483856201 + }, + { + "auxiliary_loss_clip": 0.01146724, + "auxiliary_loss_mlp": 0.01041423, + "balance_loss_clip": 1.0242331, + "balance_loss_mlp": 1.05180049, + "epoch": 0.20622275665113482, + "flos": 20882198684160.0, + "grad_norm": 1.8474903397728304, + "language_loss": 0.85301876, + "learning_rate": 3.6829854437851237e-06, + "loss": 0.87490022, + "num_input_tokens_seen": 74074875, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3430, + "time_per_iteration": 2.532275438308716 + }, + { + "auxiliary_loss_clip": 0.0114587, + "auxiliary_loss_mlp": 0.01052093, + "balance_loss_clip": 1.03411579, + "balance_loss_mlp": 1.05116892, + "epoch": 0.20628287990380278, + "flos": 19354415558400.0, + "grad_norm": 1.4715876867440674, + "language_loss": 0.69369543, + "learning_rate": 3.6827749980176444e-06, + "loss": 0.715675, + "num_input_tokens_seen": 74094505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3431, + "time_per_iteration": 2.4857282638549805 + }, + { + "auxiliary_loss_clip": 0.01051719, + "auxiliary_loss_mlp": 0.01015472, + "balance_loss_clip": 1.01329005, + "balance_loss_mlp": 1.02078724, + "epoch": 0.20634300315647078, + "flos": 71517932248320.0, + "grad_norm": 0.8322663536180677, + "language_loss": 0.60249984, + "learning_rate": 3.6825644884391693e-06, + "loss": 0.62317169, + "num_input_tokens_seen": 74158500, + "router_z_loss_clip": 0.02185059, + "router_z_loss_mlp": 0.30859375, + "step": 3432, + "time_per_iteration": 3.250966787338257 + }, + { + "auxiliary_loss_clip": 0.01143675, + "auxiliary_loss_mlp": 0.01047086, + "balance_loss_clip": 1.03021789, + "balance_loss_mlp": 1.05125713, + "epoch": 0.20640312640913874, + "flos": 21723944976000.0, + "grad_norm": 1.7869258470827205, + "language_loss": 0.72495091, + "learning_rate": 3.682353915057679e-06, + "loss": 0.74685854, + "num_input_tokens_seen": 74176685, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3433, + "time_per_iteration": 2.528576135635376 + }, + { + "auxiliary_loss_clip": 0.01143793, + "auxiliary_loss_mlp": 0.01050396, + "balance_loss_clip": 1.03295541, + "balance_loss_mlp": 1.04886997, + "epoch": 0.2064632496618067, + "flos": 20554621626240.0, + "grad_norm": 1.715054190412472, + "language_loss": 0.8721565, + "learning_rate": 3.6821432778811604e-06, + "loss": 0.8940984, + "num_input_tokens_seen": 74194935, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3434, + "time_per_iteration": 2.507589101791382 + }, + { + "auxiliary_loss_clip": 0.01144514, + "auxiliary_loss_mlp": 0.01043806, + "balance_loss_clip": 1.0269376, + "balance_loss_mlp": 1.04833162, + "epoch": 0.20652337291447467, + "flos": 29823273135360.0, + "grad_norm": 1.6274854163318595, + "language_loss": 0.69133317, + "learning_rate": 3.6819325769176004e-06, + "loss": 0.71321636, + "num_input_tokens_seen": 74215400, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9609375, + "step": 3435, + "time_per_iteration": 2.587930679321289 + }, + { + "auxiliary_loss_clip": 0.01140929, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.0241158, + "balance_loss_mlp": 1.04983366, + "epoch": 0.20658349616714264, + "flos": 26213640618240.0, + "grad_norm": 1.7028603597643168, + "language_loss": 0.8922776, + "learning_rate": 3.681721812174988e-06, + "loss": 0.91410363, + "num_input_tokens_seen": 74234090, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3436, + "time_per_iteration": 2.57295298576355 + }, + { + "auxiliary_loss_clip": 0.01144451, + "auxiliary_loss_mlp": 0.01035549, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.05126333, + "epoch": 0.2066436194198106, + "flos": 25994370044160.0, + "grad_norm": 1.8990861512322268, + "language_loss": 0.76659, + "learning_rate": 3.6815109836613163e-06, + "loss": 0.78839004, + "num_input_tokens_seen": 74253345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9296875, + "step": 3437, + "time_per_iteration": 2.5819849967956543 + }, + { + "auxiliary_loss_clip": 0.01142266, + "auxiliary_loss_mlp": 0.0104022, + "balance_loss_clip": 1.02397132, + "balance_loss_mlp": 1.04877901, + "epoch": 0.20670374267247857, + "flos": 21361067827200.0, + "grad_norm": 1.7925672188665596, + "language_loss": 0.77611911, + "learning_rate": 3.6813000913845795e-06, + "loss": 0.79794395, + "num_input_tokens_seen": 74271615, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3438, + "time_per_iteration": 2.5091731548309326 + }, + { + "auxiliary_loss_clip": 0.01047915, + "auxiliary_loss_mlp": 0.01005377, + "balance_loss_clip": 1.00348175, + "balance_loss_mlp": 1.01723933, + "epoch": 0.20676386592514656, + "flos": 66383281952640.0, + "grad_norm": 0.8367234589951487, + "language_loss": 0.67141807, + "learning_rate": 3.6810891353527747e-06, + "loss": 0.69195092, + "num_input_tokens_seen": 74331390, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.30664062, + "step": 3439, + "time_per_iteration": 3.0797181129455566 + }, + { + "auxiliary_loss_clip": 0.01142942, + "auxiliary_loss_mlp": 0.01037044, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.04791629, + "epoch": 0.20682398917781453, + "flos": 17274577328640.0, + "grad_norm": 2.0580501207842428, + "language_loss": 0.83931267, + "learning_rate": 3.6808781155739014e-06, + "loss": 0.86111259, + "num_input_tokens_seen": 74347335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.94921875, + "step": 3440, + "time_per_iteration": 2.5015172958374023 + }, + { + "auxiliary_loss_clip": 0.01143016, + "auxiliary_loss_mlp": 0.01041686, + "balance_loss_clip": 1.02584338, + "balance_loss_mlp": 1.05009377, + "epoch": 0.2068841124304825, + "flos": 18077288515200.0, + "grad_norm": 1.9416657792651912, + "language_loss": 0.84825736, + "learning_rate": 3.6806670320559614e-06, + "loss": 0.87010437, + "num_input_tokens_seen": 74366310, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3441, + "time_per_iteration": 2.4866137504577637 + }, + { + "auxiliary_loss_clip": 0.01140001, + "auxiliary_loss_mlp": 0.01044739, + "balance_loss_clip": 1.02778697, + "balance_loss_mlp": 1.0502038, + "epoch": 0.20694423568315046, + "flos": 27347017432320.0, + "grad_norm": 1.6577892844013908, + "language_loss": 0.85889506, + "learning_rate": 3.680455884806959e-06, + "loss": 0.88074249, + "num_input_tokens_seen": 74387100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 3442, + "time_per_iteration": 2.5914649963378906 + }, + { + "auxiliary_loss_clip": 0.01145487, + "auxiliary_loss_mlp": 0.01040291, + "balance_loss_clip": 1.02305317, + "balance_loss_mlp": 1.05208063, + "epoch": 0.20700435893581842, + "flos": 20229845829120.0, + "grad_norm": 1.9070439101703558, + "language_loss": 0.72829354, + "learning_rate": 3.6802446738349014e-06, + "loss": 0.75015128, + "num_input_tokens_seen": 74404460, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3443, + "time_per_iteration": 2.5210063457489014 + }, + { + "auxiliary_loss_clip": 0.01140016, + "auxiliary_loss_mlp": 0.01044032, + "balance_loss_clip": 1.02879703, + "balance_loss_mlp": 1.0496819, + "epoch": 0.2070644821884864, + "flos": 20631111638400.0, + "grad_norm": 2.5056876708900186, + "language_loss": 0.85428166, + "learning_rate": 3.680033399147797e-06, + "loss": 0.87612224, + "num_input_tokens_seen": 74423790, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.90625, + "step": 3444, + "time_per_iteration": 2.528493881225586 + }, + { + "auxiliary_loss_clip": 0.01047325, + "auxiliary_loss_mlp": 0.0100746, + "balance_loss_clip": 1.00537384, + "balance_loss_mlp": 1.01688242, + "epoch": 0.20712460544115438, + "flos": 65941077617280.0, + "grad_norm": 0.6978715278146553, + "language_loss": 0.57091653, + "learning_rate": 3.6798220607536585e-06, + "loss": 0.5914644, + "num_input_tokens_seen": 74488130, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.3046875, + "step": 3445, + "time_per_iteration": 3.086552619934082 + }, + { + "auxiliary_loss_clip": 0.01140085, + "auxiliary_loss_mlp": 0.01050946, + "balance_loss_clip": 1.03356516, + "balance_loss_mlp": 1.04968095, + "epoch": 0.20718472869382235, + "flos": 19425734012160.0, + "grad_norm": 1.5621496076246746, + "language_loss": 0.78459281, + "learning_rate": 3.6796106586604987e-06, + "loss": 0.80650306, + "num_input_tokens_seen": 74506720, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 3446, + "time_per_iteration": 2.4844422340393066 + }, + { + "auxiliary_loss_clip": 0.01148285, + "auxiliary_loss_mlp": 0.01048146, + "balance_loss_clip": 1.02846456, + "balance_loss_mlp": 1.05057228, + "epoch": 0.2072448519464903, + "flos": 24499049834880.0, + "grad_norm": 2.157476270385918, + "language_loss": 0.62436825, + "learning_rate": 3.679399192876334e-06, + "loss": 0.64633256, + "num_input_tokens_seen": 74525330, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9765625, + "step": 3447, + "time_per_iteration": 2.592799663543701 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01047763, + "balance_loss_clip": 1.03071666, + "balance_loss_mlp": 1.04810297, + "epoch": 0.20730497519915828, + "flos": 23075694524160.0, + "grad_norm": 1.740614876967074, + "language_loss": 0.86066437, + "learning_rate": 3.679187663409184e-06, + "loss": 0.88256097, + "num_input_tokens_seen": 74544535, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3448, + "time_per_iteration": 2.5054237842559814 + }, + { + "auxiliary_loss_clip": 0.01140662, + "auxiliary_loss_mlp": 0.01044351, + "balance_loss_clip": 1.02576649, + "balance_loss_mlp": 1.04814398, + "epoch": 0.20736509845182624, + "flos": 21069042255360.0, + "grad_norm": 3.1117492515519665, + "language_loss": 0.75452864, + "learning_rate": 3.6789760702670696e-06, + "loss": 0.77637869, + "num_input_tokens_seen": 74562300, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.92578125, + "step": 3449, + "time_per_iteration": 2.506657838821411 + }, + { + "auxiliary_loss_clip": 0.01145667, + "auxiliary_loss_mlp": 0.01050496, + "balance_loss_clip": 1.03194678, + "balance_loss_mlp": 1.04896426, + "epoch": 0.2074252217044942, + "flos": 17633288499840.0, + "grad_norm": 1.7877143934577313, + "language_loss": 0.76703656, + "learning_rate": 3.6787644134580134e-06, + "loss": 0.78899819, + "num_input_tokens_seen": 74580080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3450, + "time_per_iteration": 2.479090929031372 + }, + { + "auxiliary_loss_clip": 0.01143955, + "auxiliary_loss_mlp": 0.01047659, + "balance_loss_clip": 1.0302192, + "balance_loss_mlp": 1.04780531, + "epoch": 0.20748534495716217, + "flos": 23546985897600.0, + "grad_norm": 1.5227053471466307, + "language_loss": 0.822101, + "learning_rate": 3.6785526929900436e-06, + "loss": 0.84401715, + "num_input_tokens_seen": 74598980, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3451, + "time_per_iteration": 2.5465826988220215 + }, + { + "auxiliary_loss_clip": 0.01047156, + "auxiliary_loss_mlp": 0.01003865, + "balance_loss_clip": 1.00193334, + "balance_loss_mlp": 1.01645589, + "epoch": 0.20754546820983016, + "flos": 52252935598080.0, + "grad_norm": 0.7930757504147553, + "language_loss": 0.56569821, + "learning_rate": 3.6783409088711875e-06, + "loss": 0.58620846, + "num_input_tokens_seen": 74655275, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3452, + "time_per_iteration": 2.979168653488159 + }, + { + "auxiliary_loss_clip": 0.01144097, + "auxiliary_loss_mlp": 0.01045617, + "balance_loss_clip": 1.02765203, + "balance_loss_mlp": 1.0492605, + "epoch": 0.20760559146249813, + "flos": 20412379768320.0, + "grad_norm": 1.970927529953097, + "language_loss": 0.88332593, + "learning_rate": 3.6781290611094755e-06, + "loss": 0.90522313, + "num_input_tokens_seen": 74674560, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 3453, + "time_per_iteration": 2.5404746532440186 + }, + { + "auxiliary_loss_clip": 0.01145334, + "auxiliary_loss_mlp": 0.01043412, + "balance_loss_clip": 1.02518487, + "balance_loss_mlp": 1.05121803, + "epoch": 0.2076657147151661, + "flos": 23186012169600.0, + "grad_norm": 1.6193396769615114, + "language_loss": 0.80056196, + "learning_rate": 3.6779171497129407e-06, + "loss": 0.82244939, + "num_input_tokens_seen": 74694500, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94140625, + "step": 3454, + "time_per_iteration": 2.536154270172119 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.0104846, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04881716, + "epoch": 0.20772583796783406, + "flos": 18293219124480.0, + "grad_norm": 3.767477329453147, + "language_loss": 0.76424366, + "learning_rate": 3.6777051746896202e-06, + "loss": 0.78615135, + "num_input_tokens_seen": 74710485, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3455, + "time_per_iteration": 2.502450466156006 + }, + { + "auxiliary_loss_clip": 0.01141184, + "auxiliary_loss_mlp": 0.01049655, + "balance_loss_clip": 1.03247654, + "balance_loss_mlp": 1.04867601, + "epoch": 0.20778596122050202, + "flos": 17602800831360.0, + "grad_norm": 2.1876724852466163, + "language_loss": 0.80599815, + "learning_rate": 3.6774931360475516e-06, + "loss": 0.82790661, + "num_input_tokens_seen": 74727450, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 3456, + "time_per_iteration": 2.495405673980713 + }, + { + "auxiliary_loss_clip": 0.01147485, + "auxiliary_loss_mlp": 0.01042924, + "balance_loss_clip": 1.02447069, + "balance_loss_mlp": 1.05180097, + "epoch": 0.20784608447317, + "flos": 23805578885760.0, + "grad_norm": 1.5859267830694757, + "language_loss": 0.77988815, + "learning_rate": 3.6772810337947745e-06, + "loss": 0.80179226, + "num_input_tokens_seen": 74746725, + "router_z_loss_clip": 0.18457031, + "router_z_loss_mlp": 0.95703125, + "step": 3457, + "time_per_iteration": 2.5625829696655273 + }, + { + "auxiliary_loss_clip": 0.01149281, + "auxiliary_loss_mlp": 0.01054167, + "balance_loss_clip": 1.03461635, + "balance_loss_mlp": 1.05195451, + "epoch": 0.20790620772583795, + "flos": 17639286071040.0, + "grad_norm": 2.0073788397072136, + "language_loss": 0.83581042, + "learning_rate": 3.677068867939333e-06, + "loss": 0.85784483, + "num_input_tokens_seen": 74765255, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.96875, + "step": 3458, + "time_per_iteration": 2.470740556716919 + }, + { + "auxiliary_loss_clip": 0.01142717, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02443111, + "balance_loss_mlp": 1.05063045, + "epoch": 0.20796633097850595, + "flos": 27673481168640.0, + "grad_norm": 1.732611194718632, + "language_loss": 0.76041365, + "learning_rate": 3.676856638489272e-06, + "loss": 0.78225368, + "num_input_tokens_seen": 74785710, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3459, + "time_per_iteration": 2.5753207206726074 + }, + { + "auxiliary_loss_clip": 0.01138446, + "auxiliary_loss_mlp": 0.01041199, + "balance_loss_clip": 1.02451003, + "balance_loss_mlp": 1.04829502, + "epoch": 0.2080264542311739, + "flos": 19245606284160.0, + "grad_norm": 2.1264218253084386, + "language_loss": 0.77302521, + "learning_rate": 3.6766443454526382e-06, + "loss": 0.79482168, + "num_input_tokens_seen": 74804490, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8984375, + "step": 3460, + "time_per_iteration": 2.498760938644409 + }, + { + "auxiliary_loss_clip": 0.01143636, + "auxiliary_loss_mlp": 0.01050405, + "balance_loss_clip": 1.03284574, + "balance_loss_mlp": 1.04819179, + "epoch": 0.20808657748384188, + "flos": 27525924097920.0, + "grad_norm": 2.1644839576228296, + "language_loss": 0.75785947, + "learning_rate": 3.6764319888374836e-06, + "loss": 0.77979982, + "num_input_tokens_seen": 74826340, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3461, + "time_per_iteration": 2.5850372314453125 + }, + { + "auxiliary_loss_clip": 0.01145604, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02645624, + "balance_loss_mlp": 1.0469749, + "epoch": 0.20814670073650984, + "flos": 26906931999360.0, + "grad_norm": 1.8484421465162717, + "language_loss": 0.88227051, + "learning_rate": 3.6762195686518604e-06, + "loss": 0.90417254, + "num_input_tokens_seen": 74844960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.98828125, + "step": 3462, + "time_per_iteration": 2.558375358581543 + }, + { + "auxiliary_loss_clip": 0.01043601, + "auxiliary_loss_mlp": 0.0101247, + "balance_loss_clip": 1.01059818, + "balance_loss_mlp": 1.01278758, + "epoch": 0.2082068239891778, + "flos": 70175735717760.0, + "grad_norm": 0.7627714646141646, + "language_loss": 0.59057152, + "learning_rate": 3.6760070849038226e-06, + "loss": 0.6111322, + "num_input_tokens_seen": 74909075, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.30859375, + "step": 3463, + "time_per_iteration": 3.2280492782592773 + }, + { + "auxiliary_loss_clip": 0.01144566, + "auxiliary_loss_mlp": 0.01049331, + "balance_loss_clip": 1.03056765, + "balance_loss_mlp": 1.04713821, + "epoch": 0.20826694724184577, + "flos": 24608074590720.0, + "grad_norm": 2.542529703880477, + "language_loss": 0.65831709, + "learning_rate": 3.675794537601429e-06, + "loss": 0.68025607, + "num_input_tokens_seen": 74928125, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.97265625, + "step": 3464, + "time_per_iteration": 2.5706918239593506 + }, + { + "auxiliary_loss_clip": 0.01147872, + "auxiliary_loss_mlp": 0.01050812, + "balance_loss_clip": 1.03160763, + "balance_loss_mlp": 1.0492928, + "epoch": 0.20832707049451377, + "flos": 12892829034240.0, + "grad_norm": 2.848617339554035, + "language_loss": 0.83536243, + "learning_rate": 3.6755819267527373e-06, + "loss": 0.85734928, + "num_input_tokens_seen": 74945090, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.984375, + "step": 3465, + "time_per_iteration": 2.535473585128784 + }, + { + "auxiliary_loss_clip": 0.01143191, + "auxiliary_loss_mlp": 0.01044869, + "balance_loss_clip": 1.02767932, + "balance_loss_mlp": 1.04802513, + "epoch": 0.20838719374718173, + "flos": 22198827709440.0, + "grad_norm": 3.628659863163492, + "language_loss": 0.81463158, + "learning_rate": 3.6753692523658113e-06, + "loss": 0.83651215, + "num_input_tokens_seen": 74963630, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3466, + "time_per_iteration": 2.535311222076416 + }, + { + "auxiliary_loss_clip": 0.01146517, + "auxiliary_loss_mlp": 0.01044717, + "balance_loss_clip": 1.02863586, + "balance_loss_mlp": 1.05303347, + "epoch": 0.2084473169998497, + "flos": 15158648908800.0, + "grad_norm": 1.967186340276973, + "language_loss": 0.81678396, + "learning_rate": 3.675156514448716e-06, + "loss": 0.83869636, + "num_input_tokens_seen": 74981875, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9375, + "step": 3467, + "time_per_iteration": 2.4783830642700195 + }, + { + "auxiliary_loss_clip": 0.01142574, + "auxiliary_loss_mlp": 0.01041679, + "balance_loss_clip": 1.02469158, + "balance_loss_mlp": 1.05200005, + "epoch": 0.20850744025251766, + "flos": 17456788045440.0, + "grad_norm": 2.0682841758185235, + "language_loss": 0.8186093, + "learning_rate": 3.674943713009518e-06, + "loss": 0.84045184, + "num_input_tokens_seen": 74999155, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3468, + "time_per_iteration": 2.5275001525878906 + }, + { + "auxiliary_loss_clip": 0.0114752, + "auxiliary_loss_mlp": 0.01046643, + "balance_loss_clip": 1.02677095, + "balance_loss_mlp": 1.05024171, + "epoch": 0.20856756350518563, + "flos": 25698968593920.0, + "grad_norm": 1.9832892060266627, + "language_loss": 0.90227246, + "learning_rate": 3.6747308480562856e-06, + "loss": 0.92421412, + "num_input_tokens_seen": 75017850, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.9765625, + "step": 3469, + "time_per_iteration": 3.999607563018799 + }, + { + "auxiliary_loss_clip": 0.01147477, + "auxiliary_loss_mlp": 0.01051285, + "balance_loss_clip": 1.03329682, + "balance_loss_mlp": 1.0530771, + "epoch": 0.2086276867578536, + "flos": 37889060970240.0, + "grad_norm": 1.764094275638393, + "language_loss": 0.7643016, + "learning_rate": 3.674517919597092e-06, + "loss": 0.78628922, + "num_input_tokens_seen": 75039270, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3470, + "time_per_iteration": 4.186570405960083 + }, + { + "auxiliary_loss_clip": 0.0114555, + "auxiliary_loss_mlp": 0.01048445, + "balance_loss_clip": 1.03039646, + "balance_loss_mlp": 1.05154145, + "epoch": 0.20868781001052156, + "flos": 25557049958400.0, + "grad_norm": 1.7254586081909284, + "language_loss": 0.7592454, + "learning_rate": 3.674304927640011e-06, + "loss": 0.78118539, + "num_input_tokens_seen": 75059350, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3471, + "time_per_iteration": 2.5700020790100098 + }, + { + "auxiliary_loss_clip": 0.01148899, + "auxiliary_loss_mlp": 0.01054484, + "balance_loss_clip": 1.03488564, + "balance_loss_mlp": 1.04796982, + "epoch": 0.20874793326318955, + "flos": 27529192235520.0, + "grad_norm": 1.907022336492936, + "language_loss": 0.75515926, + "learning_rate": 3.67409187219312e-06, + "loss": 0.77719313, + "num_input_tokens_seen": 75080150, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 1.0078125, + "step": 3472, + "time_per_iteration": 2.555927038192749 + }, + { + "auxiliary_loss_clip": 0.01144631, + "auxiliary_loss_mlp": 0.01045397, + "balance_loss_clip": 1.02790928, + "balance_loss_mlp": 1.05051231, + "epoch": 0.20880805651585752, + "flos": 18548795370240.0, + "grad_norm": 1.9877478939715982, + "language_loss": 0.84168947, + "learning_rate": 3.6738787532644966e-06, + "loss": 0.86358976, + "num_input_tokens_seen": 75097920, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3473, + "time_per_iteration": 2.5261759757995605 + }, + { + "auxiliary_loss_clip": 0.01043725, + "auxiliary_loss_mlp": 0.0100228, + "balance_loss_clip": 1.00027776, + "balance_loss_mlp": 1.01290703, + "epoch": 0.20886817976852548, + "flos": 65946644225280.0, + "grad_norm": 0.8792852781400284, + "language_loss": 0.63631999, + "learning_rate": 3.6736655708622235e-06, + "loss": 0.65678006, + "num_input_tokens_seen": 75152410, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30859375, + "step": 3474, + "time_per_iteration": 3.025831460952759 + }, + { + "auxiliary_loss_clip": 0.01146356, + "auxiliary_loss_mlp": 0.01041236, + "balance_loss_clip": 1.02334285, + "balance_loss_mlp": 1.04993105, + "epoch": 0.20892830302119345, + "flos": 36539178929280.0, + "grad_norm": 2.882119897934913, + "language_loss": 0.69867098, + "learning_rate": 3.6734523249943844e-06, + "loss": 0.72054696, + "num_input_tokens_seen": 75173265, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.96484375, + "step": 3475, + "time_per_iteration": 2.751676559448242 + }, + { + "auxiliary_loss_clip": 0.01146508, + "auxiliary_loss_mlp": 0.01047852, + "balance_loss_clip": 1.02961278, + "balance_loss_mlp": 1.05162299, + "epoch": 0.2089884262738614, + "flos": 20956749361920.0, + "grad_norm": 1.4951270147360183, + "language_loss": 0.70032048, + "learning_rate": 3.673239015669065e-06, + "loss": 0.72226411, + "num_input_tokens_seen": 75193640, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3476, + "time_per_iteration": 2.5493083000183105 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01046029, + "balance_loss_clip": 1.02850533, + "balance_loss_mlp": 1.05099094, + "epoch": 0.20904854952652938, + "flos": 22784028088320.0, + "grad_norm": 2.0857679152031716, + "language_loss": 0.89590299, + "learning_rate": 3.6730256428943544e-06, + "loss": 0.91780925, + "num_input_tokens_seen": 75212545, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3477, + "time_per_iteration": 2.506962537765503 + }, + { + "auxiliary_loss_clip": 0.01142894, + "auxiliary_loss_mlp": 0.01047844, + "balance_loss_clip": 1.03005815, + "balance_loss_mlp": 1.04896593, + "epoch": 0.20910867277919734, + "flos": 27303277645440.0, + "grad_norm": 4.245750786990739, + "language_loss": 0.67988396, + "learning_rate": 3.672812206678344e-06, + "loss": 0.70179135, + "num_input_tokens_seen": 75230865, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9375, + "step": 3478, + "time_per_iteration": 2.57366681098938 + }, + { + "auxiliary_loss_clip": 0.01143008, + "auxiliary_loss_mlp": 0.0104171, + "balance_loss_clip": 1.02334023, + "balance_loss_mlp": 1.04826832, + "epoch": 0.20916879603186533, + "flos": 14319237000960.0, + "grad_norm": 2.137628491911851, + "language_loss": 0.85035646, + "learning_rate": 3.672598707029127e-06, + "loss": 0.87220371, + "num_input_tokens_seen": 75248285, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94921875, + "step": 3479, + "time_per_iteration": 2.4716267585754395 + }, + { + "auxiliary_loss_clip": 0.01146636, + "auxiliary_loss_mlp": 0.01049707, + "balance_loss_clip": 1.03156328, + "balance_loss_mlp": 1.04972577, + "epoch": 0.2092289192845333, + "flos": 22273019251200.0, + "grad_norm": 2.2225866030569175, + "language_loss": 0.73807257, + "learning_rate": 3.6723851439548003e-06, + "loss": 0.76003599, + "num_input_tokens_seen": 75266310, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3480, + "time_per_iteration": 2.4856386184692383 + }, + { + "auxiliary_loss_clip": 0.01141126, + "auxiliary_loss_mlp": 0.01047253, + "balance_loss_clip": 1.03113592, + "balance_loss_mlp": 1.04844785, + "epoch": 0.20928904253720126, + "flos": 14830712714880.0, + "grad_norm": 2.023418551380918, + "language_loss": 0.75601453, + "learning_rate": 3.67217151746346e-06, + "loss": 0.77789831, + "num_input_tokens_seen": 75284175, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.9296875, + "step": 3481, + "time_per_iteration": 2.4812443256378174 + }, + { + "auxiliary_loss_clip": 0.01145872, + "auxiliary_loss_mlp": 0.0104777, + "balance_loss_clip": 1.03051996, + "balance_loss_mlp": 1.05047393, + "epoch": 0.20934916578986923, + "flos": 23259162216960.0, + "grad_norm": 3.5251666716598273, + "language_loss": 0.85337639, + "learning_rate": 3.671957827563209e-06, + "loss": 0.87531281, + "num_input_tokens_seen": 75303465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.953125, + "step": 3482, + "time_per_iteration": 2.521284580230713 + }, + { + "auxiliary_loss_clip": 0.01145664, + "auxiliary_loss_mlp": 0.01048133, + "balance_loss_clip": 1.02940559, + "balance_loss_mlp": 1.05097377, + "epoch": 0.2094092890425372, + "flos": 32014398677760.0, + "grad_norm": 2.8936854891166743, + "language_loss": 0.70626152, + "learning_rate": 3.6717440742621494e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 75325290, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3483, + "time_per_iteration": 2.5876524448394775 + }, + { + "auxiliary_loss_clip": 0.01146142, + "auxiliary_loss_mlp": 0.01060474, + "balance_loss_clip": 1.04193723, + "balance_loss_mlp": 1.04891169, + "epoch": 0.20946941229520516, + "flos": 20010647082240.0, + "grad_norm": 1.8606830424584557, + "language_loss": 0.74988431, + "learning_rate": 3.6715302575683865e-06, + "loss": 0.77195048, + "num_input_tokens_seen": 75343895, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.96875, + "step": 3484, + "time_per_iteration": 2.49701189994812 + }, + { + "auxiliary_loss_clip": 0.01143763, + "auxiliary_loss_mlp": 0.01048057, + "balance_loss_clip": 1.02991378, + "balance_loss_mlp": 1.05028141, + "epoch": 0.20952953554787315, + "flos": 30740072895360.0, + "grad_norm": 1.8378150509428508, + "language_loss": 0.70690203, + "learning_rate": 3.6713163774900292e-06, + "loss": 0.7288202, + "num_input_tokens_seen": 75367100, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9375, + "step": 3485, + "time_per_iteration": 2.5692059993743896 + }, + { + "auxiliary_loss_clip": 0.01146857, + "auxiliary_loss_mlp": 0.01045553, + "balance_loss_clip": 1.02712297, + "balance_loss_mlp": 1.05028093, + "epoch": 0.20958965880054112, + "flos": 27049209770880.0, + "grad_norm": 1.9069158447471781, + "language_loss": 0.82965356, + "learning_rate": 3.6711024340351875e-06, + "loss": 0.85157764, + "num_input_tokens_seen": 75389925, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.96875, + "step": 3486, + "time_per_iteration": 2.569308042526245 + }, + { + "auxiliary_loss_clip": 0.0114472, + "auxiliary_loss_mlp": 0.01050567, + "balance_loss_clip": 1.03309095, + "balance_loss_mlp": 1.04790449, + "epoch": 0.20964978205320908, + "flos": 34204123589760.0, + "grad_norm": 3.843984040964354, + "language_loss": 0.8699702, + "learning_rate": 3.6708884272119737e-06, + "loss": 0.89192313, + "num_input_tokens_seen": 75408575, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3487, + "time_per_iteration": 2.608441114425659 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01047132, + "balance_loss_clip": 1.0287739, + "balance_loss_mlp": 1.04695904, + "epoch": 0.20970990530587705, + "flos": 23477391296640.0, + "grad_norm": 2.4377115915778713, + "language_loss": 0.72369969, + "learning_rate": 3.670674357028504e-06, + "loss": 0.74558127, + "num_input_tokens_seen": 75427155, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.94140625, + "step": 3488, + "time_per_iteration": 2.529233694076538 + }, + { + "auxiliary_loss_clip": 0.01144055, + "auxiliary_loss_mlp": 0.01045689, + "balance_loss_clip": 1.02812946, + "balance_loss_mlp": 1.04897618, + "epoch": 0.209770028558545, + "flos": 18551452976640.0, + "grad_norm": 2.6657941113460764, + "language_loss": 0.80726898, + "learning_rate": 3.6704602234928945e-06, + "loss": 0.82916641, + "num_input_tokens_seen": 75444450, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3489, + "time_per_iteration": 2.4847962856292725 + }, + { + "auxiliary_loss_clip": 0.01142088, + "auxiliary_loss_mlp": 0.01042551, + "balance_loss_clip": 1.0253495, + "balance_loss_mlp": 1.04718399, + "epoch": 0.20983015181121298, + "flos": 21617003208960.0, + "grad_norm": 1.7888402521564877, + "language_loss": 0.72827011, + "learning_rate": 3.670246026613266e-06, + "loss": 0.75011659, + "num_input_tokens_seen": 75462625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3490, + "time_per_iteration": 2.543064594268799 + }, + { + "auxiliary_loss_clip": 0.01140159, + "auxiliary_loss_mlp": 0.0105099, + "balance_loss_clip": 1.03437209, + "balance_loss_mlp": 1.04955435, + "epoch": 0.20989027506388094, + "flos": 16614718531200.0, + "grad_norm": 5.073894522138561, + "language_loss": 0.70159817, + "learning_rate": 3.6700317663977415e-06, + "loss": 0.72350967, + "num_input_tokens_seen": 75480640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3491, + "time_per_iteration": 2.4785172939300537 + }, + { + "auxiliary_loss_clip": 0.01142629, + "auxiliary_loss_mlp": 0.01045142, + "balance_loss_clip": 1.02633047, + "balance_loss_mlp": 1.04678369, + "epoch": 0.20995039831654894, + "flos": 23216823060480.0, + "grad_norm": 3.7459720995568557, + "language_loss": 0.7931999, + "learning_rate": 3.669817442854444e-06, + "loss": 0.8150776, + "num_input_tokens_seen": 75494900, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3492, + "time_per_iteration": 2.5213027000427246 + }, + { + "auxiliary_loss_clip": 0.01144565, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02341175, + "balance_loss_mlp": 1.04977345, + "epoch": 0.2100105215692169, + "flos": 18147493647360.0, + "grad_norm": 1.9629392465329358, + "language_loss": 0.86883962, + "learning_rate": 3.669603055991502e-06, + "loss": 0.89069605, + "num_input_tokens_seen": 75513370, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3493, + "time_per_iteration": 2.499797821044922 + }, + { + "auxiliary_loss_clip": 0.01139311, + "auxiliary_loss_mlp": 0.01040774, + "balance_loss_clip": 1.02408433, + "balance_loss_mlp": 1.04791212, + "epoch": 0.21007064482188487, + "flos": 15961611490560.0, + "grad_norm": 1.8525794886403055, + "language_loss": 0.68810928, + "learning_rate": 3.6693886058170455e-06, + "loss": 0.70991009, + "num_input_tokens_seen": 75532480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3494, + "time_per_iteration": 2.5374889373779297 + }, + { + "auxiliary_loss_clip": 0.01146689, + "auxiliary_loss_mlp": 0.01037903, + "balance_loss_clip": 1.02054656, + "balance_loss_mlp": 1.05010796, + "epoch": 0.21013076807455283, + "flos": 32234315696640.0, + "grad_norm": 1.7465496854212388, + "language_loss": 0.78900456, + "learning_rate": 3.6691740923392053e-06, + "loss": 0.81085044, + "num_input_tokens_seen": 75552745, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96484375, + "step": 3495, + "time_per_iteration": 2.6390578746795654 + }, + { + "auxiliary_loss_clip": 0.0114231, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02505755, + "balance_loss_mlp": 1.04696178, + "epoch": 0.2101908913272208, + "flos": 23696625957120.0, + "grad_norm": 1.7459726457298623, + "language_loss": 0.77192879, + "learning_rate": 3.668959515566116e-06, + "loss": 0.79377842, + "num_input_tokens_seen": 75574355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3496, + "time_per_iteration": 2.552386522293091 + }, + { + "auxiliary_loss_clip": 0.01145605, + "auxiliary_loss_mlp": 0.0105152, + "balance_loss_clip": 1.03297126, + "balance_loss_mlp": 1.04933989, + "epoch": 0.21025101457988876, + "flos": 20375786787840.0, + "grad_norm": 2.0396086665216777, + "language_loss": 0.82009852, + "learning_rate": 3.668744875505915e-06, + "loss": 0.84206975, + "num_input_tokens_seen": 75592215, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3497, + "time_per_iteration": 2.498359441757202 + }, + { + "auxiliary_loss_clip": 0.01146873, + "auxiliary_loss_mlp": 0.01048221, + "balance_loss_clip": 1.03091133, + "balance_loss_mlp": 1.04979134, + "epoch": 0.21031113783255675, + "flos": 25775638174080.0, + "grad_norm": 2.5223195218779577, + "language_loss": 0.67314029, + "learning_rate": 3.668530172166741e-06, + "loss": 0.69509119, + "num_input_tokens_seen": 75610740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.96875, + "step": 3498, + "time_per_iteration": 2.540766716003418 + }, + { + "auxiliary_loss_clip": 0.01145112, + "auxiliary_loss_mlp": 0.01045261, + "balance_loss_clip": 1.02679563, + "balance_loss_mlp": 1.04782224, + "epoch": 0.21037126108522472, + "flos": 22018197191040.0, + "grad_norm": 2.2477271783909414, + "language_loss": 0.80623376, + "learning_rate": 3.6683154055567352e-06, + "loss": 0.82813752, + "num_input_tokens_seen": 75631005, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.97265625, + "step": 3499, + "time_per_iteration": 2.5283098220825195 + }, + { + "auxiliary_loss_clip": 0.0114621, + "auxiliary_loss_mlp": 0.01043014, + "balance_loss_clip": 1.02612233, + "balance_loss_mlp": 1.05201602, + "epoch": 0.21043138433789269, + "flos": 25334403505920.0, + "grad_norm": 1.776862664007905, + "language_loss": 0.78366566, + "learning_rate": 3.668100575684043e-06, + "loss": 0.80555797, + "num_input_tokens_seen": 75650655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3500, + "time_per_iteration": 2.5419158935546875 + }, + { + "auxiliary_loss_clip": 0.01142389, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02524185, + "balance_loss_mlp": 1.0480907, + "epoch": 0.21049150759056065, + "flos": 25556654908800.0, + "grad_norm": 1.628727093990466, + "language_loss": 0.73989725, + "learning_rate": 3.6678856825568094e-06, + "loss": 0.76174867, + "num_input_tokens_seen": 75669895, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3501, + "time_per_iteration": 2.535419464111328 + }, + { + "auxiliary_loss_clip": 0.01140428, + "auxiliary_loss_mlp": 0.01041829, + "balance_loss_clip": 1.02429342, + "balance_loss_mlp": 1.04671168, + "epoch": 0.21055163084322862, + "flos": 24495602129280.0, + "grad_norm": 1.6206913905571714, + "language_loss": 0.75292969, + "learning_rate": 3.667670726183183e-06, + "loss": 0.77475226, + "num_input_tokens_seen": 75689535, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9375, + "step": 3502, + "time_per_iteration": 2.508277654647827 + }, + { + "auxiliary_loss_clip": 0.01141546, + "auxiliary_loss_mlp": 0.0104558, + "balance_loss_clip": 1.02796102, + "balance_loss_mlp": 1.0475595, + "epoch": 0.21061175409589658, + "flos": 25739045193600.0, + "grad_norm": 1.9145063235338367, + "language_loss": 0.77090263, + "learning_rate": 3.667455706571316e-06, + "loss": 0.7927739, + "num_input_tokens_seen": 75709265, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.94140625, + "step": 3503, + "time_per_iteration": 2.5607948303222656 + }, + { + "auxiliary_loss_clip": 0.01148374, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.02813029, + "balance_loss_mlp": 1.048738, + "epoch": 0.21067187734856455, + "flos": 18989168112000.0, + "grad_norm": 2.3817148130730144, + "language_loss": 0.77991742, + "learning_rate": 3.6672406237293617e-06, + "loss": 0.80189341, + "num_input_tokens_seen": 75727050, + "router_z_loss_clip": 0.2109375, + "router_z_loss_mlp": 0.9921875, + "step": 3504, + "time_per_iteration": 2.495028018951416 + }, + { + "auxiliary_loss_clip": 0.01145149, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.02952361, + "balance_loss_mlp": 1.0473187, + "epoch": 0.21073200060123254, + "flos": 24681368292480.0, + "grad_norm": 1.5529728217373517, + "language_loss": 0.77045631, + "learning_rate": 3.6670254776654754e-06, + "loss": 0.79238534, + "num_input_tokens_seen": 75747175, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9765625, + "step": 3505, + "time_per_iteration": 2.5408663749694824 + }, + { + "auxiliary_loss_clip": 0.01138823, + "auxiliary_loss_mlp": 0.01046578, + "balance_loss_clip": 1.02931666, + "balance_loss_mlp": 1.04786968, + "epoch": 0.2107921238539005, + "flos": 28549342402560.0, + "grad_norm": 1.9911708078552777, + "language_loss": 0.63704473, + "learning_rate": 3.6668102683878163e-06, + "loss": 0.65889871, + "num_input_tokens_seen": 75767690, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.91015625, + "step": 3506, + "time_per_iteration": 2.564246892929077 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01046628, + "balance_loss_clip": 1.02904439, + "balance_loss_mlp": 1.04773796, + "epoch": 0.21085224710656847, + "flos": 25885848078720.0, + "grad_norm": 1.8633964271687153, + "language_loss": 0.81863034, + "learning_rate": 3.6665949959045443e-06, + "loss": 0.84050006, + "num_input_tokens_seen": 75787255, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.92578125, + "step": 3507, + "time_per_iteration": 2.6049435138702393 + }, + { + "auxiliary_loss_clip": 0.011401, + "auxiliary_loss_mlp": 0.0104784, + "balance_loss_clip": 1.0299232, + "balance_loss_mlp": 1.04645514, + "epoch": 0.21091237035923643, + "flos": 14976294537600.0, + "grad_norm": 2.0263301336255135, + "language_loss": 0.75496012, + "learning_rate": 3.666379660223824e-06, + "loss": 0.77683949, + "num_input_tokens_seen": 75805890, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.93359375, + "step": 3508, + "time_per_iteration": 2.5366437435150146 + }, + { + "auxiliary_loss_clip": 0.01144539, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02395463, + "balance_loss_mlp": 1.04809749, + "epoch": 0.2109724936119044, + "flos": 16362518163840.0, + "grad_norm": 2.1922875924351115, + "language_loss": 0.85395098, + "learning_rate": 3.6661642613538192e-06, + "loss": 0.87581778, + "num_input_tokens_seen": 75821620, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3509, + "time_per_iteration": 2.4895167350769043 + }, + { + "auxiliary_loss_clip": 0.01146568, + "auxiliary_loss_mlp": 0.01043895, + "balance_loss_clip": 1.02503562, + "balance_loss_mlp": 1.04908204, + "epoch": 0.21103261686457236, + "flos": 31502492000640.0, + "grad_norm": 1.5522473876542349, + "language_loss": 0.67803288, + "learning_rate": 3.6659487993026987e-06, + "loss": 0.69993746, + "num_input_tokens_seen": 75842490, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9765625, + "step": 3510, + "time_per_iteration": 4.065294027328491 + }, + { + "auxiliary_loss_clip": 0.01143018, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02381003, + "balance_loss_mlp": 1.04653811, + "epoch": 0.21109274011724033, + "flos": 27344072517120.0, + "grad_norm": 1.9784941086490475, + "language_loss": 0.7240749, + "learning_rate": 3.6657332740786327e-06, + "loss": 0.74591982, + "num_input_tokens_seen": 75865985, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.96484375, + "step": 3511, + "time_per_iteration": 2.5701003074645996 + }, + { + "auxiliary_loss_clip": 0.01148402, + "auxiliary_loss_mlp": 0.01039531, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.05022192, + "epoch": 0.21115286336990832, + "flos": 17820383466240.0, + "grad_norm": 2.3544542512902322, + "language_loss": 0.69737375, + "learning_rate": 3.665517685689794e-06, + "loss": 0.71925306, + "num_input_tokens_seen": 75882745, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.98046875, + "step": 3512, + "time_per_iteration": 3.9019229412078857 + }, + { + "auxiliary_loss_clip": 0.01143526, + "auxiliary_loss_mlp": 0.01047621, + "balance_loss_clip": 1.02872658, + "balance_loss_mlp": 1.04680824, + "epoch": 0.2112129866225763, + "flos": 27197987904000.0, + "grad_norm": 1.6756724017558497, + "language_loss": 0.73159289, + "learning_rate": 3.6653020341443584e-06, + "loss": 0.7535044, + "num_input_tokens_seen": 75904305, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.96875, + "step": 3513, + "time_per_iteration": 2.5643980503082275 + }, + { + "auxiliary_loss_clip": 0.01140444, + "auxiliary_loss_mlp": 0.0103852, + "balance_loss_clip": 1.02212906, + "balance_loss_mlp": 1.04916954, + "epoch": 0.21127310987524425, + "flos": 23731279603200.0, + "grad_norm": 1.635076517146385, + "language_loss": 0.74235332, + "learning_rate": 3.665086319450502e-06, + "loss": 0.76414299, + "num_input_tokens_seen": 75923710, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3514, + "time_per_iteration": 2.5240070819854736 + }, + { + "auxiliary_loss_clip": 0.01144119, + "auxiliary_loss_mlp": 0.01040689, + "balance_loss_clip": 1.02347541, + "balance_loss_mlp": 1.0482856, + "epoch": 0.21133323312791222, + "flos": 18332505624960.0, + "grad_norm": 1.7928371848293583, + "language_loss": 0.76707381, + "learning_rate": 3.6648705416164062e-06, + "loss": 0.78892195, + "num_input_tokens_seen": 75942625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9609375, + "step": 3515, + "time_per_iteration": 2.526527166366577 + }, + { + "auxiliary_loss_clip": 0.0114362, + "auxiliary_loss_mlp": 0.01042748, + "balance_loss_clip": 1.02517664, + "balance_loss_mlp": 1.04956555, + "epoch": 0.21139335638058018, + "flos": 17931203902080.0, + "grad_norm": 1.8516547188762509, + "language_loss": 0.68242604, + "learning_rate": 3.6646547006502518e-06, + "loss": 0.70428967, + "num_input_tokens_seen": 75959930, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3516, + "time_per_iteration": 2.46085786819458 + }, + { + "auxiliary_loss_clip": 0.01145197, + "auxiliary_loss_mlp": 0.01047209, + "balance_loss_clip": 1.02883935, + "balance_loss_mlp": 1.04901481, + "epoch": 0.21145347963324815, + "flos": 24572092141440.0, + "grad_norm": 1.653683865815189, + "language_loss": 0.85012519, + "learning_rate": 3.664438796560225e-06, + "loss": 0.87204921, + "num_input_tokens_seen": 75980335, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3517, + "time_per_iteration": 2.5080301761627197 + }, + { + "auxiliary_loss_clip": 0.01141463, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.01965201, + "balance_loss_mlp": 1.04722667, + "epoch": 0.21151360288591614, + "flos": 35845959375360.0, + "grad_norm": 2.26725319642869, + "language_loss": 0.62925792, + "learning_rate": 3.664222829354512e-06, + "loss": 0.65104288, + "num_input_tokens_seen": 76002095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3518, + "time_per_iteration": 2.5949900150299072 + }, + { + "auxiliary_loss_clip": 0.01142565, + "auxiliary_loss_mlp": 0.01049413, + "balance_loss_clip": 1.03290248, + "balance_loss_mlp": 1.04891765, + "epoch": 0.2115737261385841, + "flos": 24641579001600.0, + "grad_norm": 1.8284325952385483, + "language_loss": 0.88772321, + "learning_rate": 3.664006799041303e-06, + "loss": 0.90964293, + "num_input_tokens_seen": 76020425, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3519, + "time_per_iteration": 2.5356082916259766 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.01049295, + "balance_loss_clip": 1.03184235, + "balance_loss_mlp": 1.04866135, + "epoch": 0.21163384939125207, + "flos": 25226887121280.0, + "grad_norm": 1.5988506078375424, + "language_loss": 0.81066215, + "learning_rate": 3.6637907056287886e-06, + "loss": 0.83259952, + "num_input_tokens_seen": 76041210, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3520, + "time_per_iteration": 2.5069239139556885 + }, + { + "auxiliary_loss_clip": 0.0113827, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02926779, + "balance_loss_mlp": 1.0469681, + "epoch": 0.21169397264392004, + "flos": 26067520091520.0, + "grad_norm": 1.592359744312873, + "language_loss": 0.76163614, + "learning_rate": 3.6635745491251642e-06, + "loss": 0.78347969, + "num_input_tokens_seen": 76062685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3521, + "time_per_iteration": 2.560037851333618 + }, + { + "auxiliary_loss_clip": 0.0113934, + "auxiliary_loss_mlp": 0.0104393, + "balance_loss_clip": 1.02842069, + "balance_loss_mlp": 1.04592443, + "epoch": 0.211754095896588, + "flos": 23108265181440.0, + "grad_norm": 2.0717596449561024, + "language_loss": 0.75950933, + "learning_rate": 3.663358329538626e-06, + "loss": 0.78134197, + "num_input_tokens_seen": 76082300, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.93359375, + "step": 3522, + "time_per_iteration": 2.4758715629577637 + }, + { + "auxiliary_loss_clip": 0.01141462, + "auxiliary_loss_mlp": 0.01049727, + "balance_loss_clip": 1.03176177, + "balance_loss_mlp": 1.04737353, + "epoch": 0.21181421914925597, + "flos": 27922341571200.0, + "grad_norm": 2.026497436525855, + "language_loss": 0.70436251, + "learning_rate": 3.663142046877374e-06, + "loss": 0.72627443, + "num_input_tokens_seen": 76101135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94140625, + "step": 3523, + "time_per_iteration": 2.5368640422821045 + }, + { + "auxiliary_loss_clip": 0.01140964, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.02786803, + "balance_loss_mlp": 1.04820895, + "epoch": 0.21187434240192393, + "flos": 17128636369920.0, + "grad_norm": 2.216886450348082, + "language_loss": 0.76683456, + "learning_rate": 3.6629257011496085e-06, + "loss": 0.7886939, + "num_input_tokens_seen": 76119320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3524, + "time_per_iteration": 2.5932695865631104 + }, + { + "auxiliary_loss_clip": 0.01139634, + "auxiliary_loss_mlp": 0.01042013, + "balance_loss_clip": 1.02533603, + "balance_loss_mlp": 1.04276347, + "epoch": 0.21193446565459192, + "flos": 22347318533760.0, + "grad_norm": 2.020092904399728, + "language_loss": 0.81433582, + "learning_rate": 3.6627092923635338e-06, + "loss": 0.83615232, + "num_input_tokens_seen": 76137445, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.96875, + "step": 3525, + "time_per_iteration": 2.5425641536712646 + }, + { + "auxiliary_loss_clip": 0.011388, + "auxiliary_loss_mlp": 0.01041718, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04668331, + "epoch": 0.2119945889072599, + "flos": 27199316707200.0, + "grad_norm": 2.1031950889850655, + "language_loss": 0.75104785, + "learning_rate": 3.662492820527356e-06, + "loss": 0.77285308, + "num_input_tokens_seen": 76159500, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3526, + "time_per_iteration": 2.533210515975952 + }, + { + "auxiliary_loss_clip": 0.01142205, + "auxiliary_loss_mlp": 0.01041732, + "balance_loss_clip": 1.02466083, + "balance_loss_mlp": 1.04663801, + "epoch": 0.21205471215992786, + "flos": 20991869884800.0, + "grad_norm": 1.9135764326712537, + "language_loss": 0.77385598, + "learning_rate": 3.662276285649284e-06, + "loss": 0.79569542, + "num_input_tokens_seen": 76177990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3527, + "time_per_iteration": 2.53898286819458 + }, + { + "auxiliary_loss_clip": 0.0113944, + "auxiliary_loss_mlp": 0.01045919, + "balance_loss_clip": 1.02797842, + "balance_loss_mlp": 1.0461328, + "epoch": 0.21211483541259582, + "flos": 20777663128320.0, + "grad_norm": 1.981008674330079, + "language_loss": 0.78037727, + "learning_rate": 3.662059687737528e-06, + "loss": 0.80223083, + "num_input_tokens_seen": 76197125, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3528, + "time_per_iteration": 2.5360231399536133 + }, + { + "auxiliary_loss_clip": 0.01138776, + "auxiliary_loss_mlp": 0.01047702, + "balance_loss_clip": 1.03096509, + "balance_loss_mlp": 1.04611731, + "epoch": 0.21217495866526379, + "flos": 18989994124800.0, + "grad_norm": 1.7275367809487383, + "language_loss": 0.8170321, + "learning_rate": 3.6618430268003024e-06, + "loss": 0.83889693, + "num_input_tokens_seen": 76216215, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.92578125, + "step": 3529, + "time_per_iteration": 2.531228542327881 + }, + { + "auxiliary_loss_clip": 0.01141251, + "auxiliary_loss_mlp": 0.01044804, + "balance_loss_clip": 1.028234, + "balance_loss_mlp": 1.04647708, + "epoch": 0.21223508191793175, + "flos": 20667309569280.0, + "grad_norm": 2.1603106904513547, + "language_loss": 0.76616383, + "learning_rate": 3.6616263028458235e-06, + "loss": 0.78802443, + "num_input_tokens_seen": 76237010, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9453125, + "step": 3530, + "time_per_iteration": 2.5361740589141846 + }, + { + "auxiliary_loss_clip": 0.01136983, + "auxiliary_loss_mlp": 0.01041907, + "balance_loss_clip": 1.02593338, + "balance_loss_mlp": 1.0451746, + "epoch": 0.21229520517059972, + "flos": 21616464504960.0, + "grad_norm": 2.3391242970409873, + "language_loss": 0.82978404, + "learning_rate": 3.661409515882308e-06, + "loss": 0.85157299, + "num_input_tokens_seen": 76255965, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3531, + "time_per_iteration": 2.571411609649658 + }, + { + "auxiliary_loss_clip": 0.01141528, + "auxiliary_loss_mlp": 0.01039512, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04744506, + "epoch": 0.2123553284232677, + "flos": 13991049411840.0, + "grad_norm": 2.416019676502894, + "language_loss": 0.73473567, + "learning_rate": 3.661192665917977e-06, + "loss": 0.75654608, + "num_input_tokens_seen": 76272150, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.94140625, + "step": 3532, + "time_per_iteration": 2.473006248474121 + }, + { + "auxiliary_loss_clip": 0.01138524, + "auxiliary_loss_mlp": 0.01042643, + "balance_loss_clip": 1.02485681, + "balance_loss_mlp": 1.04561734, + "epoch": 0.21241545167593567, + "flos": 18296774570880.0, + "grad_norm": 1.7353898898315339, + "language_loss": 0.73855233, + "learning_rate": 3.660975752961054e-06, + "loss": 0.76036394, + "num_input_tokens_seen": 76291425, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.92578125, + "step": 3533, + "time_per_iteration": 2.526780366897583 + }, + { + "auxiliary_loss_clip": 0.01140469, + "auxiliary_loss_mlp": 0.01045491, + "balance_loss_clip": 1.02833724, + "balance_loss_mlp": 1.04576015, + "epoch": 0.21247557492860364, + "flos": 34713121265280.0, + "grad_norm": 1.8944995629732337, + "language_loss": 0.7098999, + "learning_rate": 3.6607587770197634e-06, + "loss": 0.73175949, + "num_input_tokens_seen": 76313975, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3534, + "time_per_iteration": 2.6947309970855713 + }, + { + "auxiliary_loss_clip": 0.01141409, + "auxiliary_loss_mlp": 0.01038239, + "balance_loss_clip": 1.02032161, + "balance_loss_mlp": 1.04669714, + "epoch": 0.2125356981812716, + "flos": 22053820504320.0, + "grad_norm": 1.9387778569542722, + "language_loss": 0.71567297, + "learning_rate": 3.6605417381023346e-06, + "loss": 0.73746949, + "num_input_tokens_seen": 76330955, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3535, + "time_per_iteration": 2.6022329330444336 + }, + { + "auxiliary_loss_clip": 0.01136751, + "auxiliary_loss_mlp": 0.01046685, + "balance_loss_clip": 1.0299238, + "balance_loss_mlp": 1.04549336, + "epoch": 0.21259582143393957, + "flos": 28548336821760.0, + "grad_norm": 1.8756666540330442, + "language_loss": 0.7040931, + "learning_rate": 3.660324636216996e-06, + "loss": 0.72592747, + "num_input_tokens_seen": 76352680, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 3536, + "time_per_iteration": 2.6005256175994873 + }, + { + "auxiliary_loss_clip": 0.01140865, + "auxiliary_loss_mlp": 0.01044171, + "balance_loss_clip": 1.02706444, + "balance_loss_mlp": 1.04512393, + "epoch": 0.21265594468660753, + "flos": 20120892900480.0, + "grad_norm": 1.9573194210103453, + "language_loss": 0.88217437, + "learning_rate": 3.660107471371981e-06, + "loss": 0.90402472, + "num_input_tokens_seen": 76370750, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.95703125, + "step": 3537, + "time_per_iteration": 2.5565810203552246 + }, + { + "auxiliary_loss_clip": 0.01134343, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02425885, + "balance_loss_mlp": 1.0437026, + "epoch": 0.21271606793927553, + "flos": 23076161400960.0, + "grad_norm": 1.957058885696691, + "language_loss": 0.80129743, + "learning_rate": 3.659890243575524e-06, + "loss": 0.82304639, + "num_input_tokens_seen": 76390610, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3538, + "time_per_iteration": 2.5501785278320312 + }, + { + "auxiliary_loss_clip": 0.01135328, + "auxiliary_loss_mlp": 0.01041186, + "balance_loss_clip": 1.025653, + "balance_loss_mlp": 1.0446775, + "epoch": 0.2127761911919435, + "flos": 26388201738240.0, + "grad_norm": 1.587715235485788, + "language_loss": 0.87131894, + "learning_rate": 3.659672952835863e-06, + "loss": 0.89308405, + "num_input_tokens_seen": 76408860, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.90625, + "step": 3539, + "time_per_iteration": 2.5751259326934814 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02914476, + "balance_loss_mlp": 1.04718518, + "epoch": 0.21283631444461146, + "flos": 20228265630720.0, + "grad_norm": 3.3040839486156184, + "language_loss": 0.57464051, + "learning_rate": 3.659455599161237e-06, + "loss": 0.59648788, + "num_input_tokens_seen": 76424980, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3540, + "time_per_iteration": 2.4746458530426025 + }, + { + "auxiliary_loss_clip": 0.01140156, + "auxiliary_loss_mlp": 0.01040246, + "balance_loss_clip": 1.02330637, + "balance_loss_mlp": 1.04658604, + "epoch": 0.21289643769727942, + "flos": 13516992691200.0, + "grad_norm": 5.8376417218282874, + "language_loss": 0.76062799, + "learning_rate": 3.659238182559888e-06, + "loss": 0.78243208, + "num_input_tokens_seen": 76443135, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3541, + "time_per_iteration": 2.5111818313598633 + }, + { + "auxiliary_loss_clip": 0.0113571, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02517211, + "balance_loss_mlp": 1.04530454, + "epoch": 0.2129565609499474, + "flos": 24827021942400.0, + "grad_norm": 1.9190227230034667, + "language_loss": 0.69458514, + "learning_rate": 3.6590207030400615e-06, + "loss": 0.71635908, + "num_input_tokens_seen": 76462470, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3542, + "time_per_iteration": 2.556300401687622 + }, + { + "auxiliary_loss_clip": 0.01134092, + "auxiliary_loss_mlp": 0.01034857, + "balance_loss_clip": 1.01945567, + "balance_loss_mlp": 1.04443789, + "epoch": 0.21301668420261535, + "flos": 23659242877440.0, + "grad_norm": 1.8172219669397587, + "language_loss": 0.75591409, + "learning_rate": 3.658803160610004e-06, + "loss": 0.77760351, + "num_input_tokens_seen": 76481995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 3543, + "time_per_iteration": 2.54424786567688 + }, + { + "auxiliary_loss_clip": 0.01138428, + "auxiliary_loss_mlp": 0.01038735, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04843175, + "epoch": 0.21307680745528332, + "flos": 16362805472640.0, + "grad_norm": 2.1531603349332915, + "language_loss": 0.66787028, + "learning_rate": 3.6585855552779634e-06, + "loss": 0.68964195, + "num_input_tokens_seen": 76500245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3544, + "time_per_iteration": 2.516359329223633 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01040176, + "balance_loss_clip": 1.0245831, + "balance_loss_mlp": 1.04379654, + "epoch": 0.2131369307079513, + "flos": 19099054794240.0, + "grad_norm": 1.9827170900636153, + "language_loss": 0.71089172, + "learning_rate": 3.6583678870521934e-06, + "loss": 0.73265821, + "num_input_tokens_seen": 76519535, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.92578125, + "step": 3545, + "time_per_iteration": 2.5377357006073 + }, + { + "auxiliary_loss_clip": 0.01138848, + "auxiliary_loss_mlp": 0.01046644, + "balance_loss_clip": 1.03095567, + "balance_loss_mlp": 1.04571509, + "epoch": 0.21319705396061928, + "flos": 30372275583360.0, + "grad_norm": 1.730364240275379, + "language_loss": 0.72334421, + "learning_rate": 3.658150155940946e-06, + "loss": 0.74519908, + "num_input_tokens_seen": 76542065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9296875, + "step": 3546, + "time_per_iteration": 2.5640652179718018 + }, + { + "auxiliary_loss_clip": 0.0113929, + "auxiliary_loss_mlp": 0.0104318, + "balance_loss_clip": 1.02695596, + "balance_loss_mlp": 1.0467453, + "epoch": 0.21325717721328724, + "flos": 21756192410880.0, + "grad_norm": 1.889324350950523, + "language_loss": 0.80698627, + "learning_rate": 3.657932361952479e-06, + "loss": 0.82881093, + "num_input_tokens_seen": 76560540, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3547, + "time_per_iteration": 2.527398109436035 + }, + { + "auxiliary_loss_clip": 0.01140759, + "auxiliary_loss_mlp": 0.0104395, + "balance_loss_clip": 1.02702212, + "balance_loss_mlp": 1.04538703, + "epoch": 0.2133173004659552, + "flos": 28730870760960.0, + "grad_norm": 3.232228952830713, + "language_loss": 0.74496448, + "learning_rate": 3.6577145050950504e-06, + "loss": 0.76681155, + "num_input_tokens_seen": 76581760, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3548, + "time_per_iteration": 2.5493834018707275 + }, + { + "auxiliary_loss_clip": 0.01141872, + "auxiliary_loss_mlp": 0.01045412, + "balance_loss_clip": 1.02719641, + "balance_loss_mlp": 1.04663396, + "epoch": 0.21337742371862317, + "flos": 16837077674880.0, + "grad_norm": 2.0441969792992265, + "language_loss": 0.74135804, + "learning_rate": 3.657496585376922e-06, + "loss": 0.76323086, + "num_input_tokens_seen": 76599940, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3549, + "time_per_iteration": 2.514817476272583 + }, + { + "auxiliary_loss_clip": 0.01142468, + "auxiliary_loss_mlp": 0.01046789, + "balance_loss_clip": 1.03063631, + "balance_loss_mlp": 1.04963064, + "epoch": 0.21343754697129114, + "flos": 24424930120320.0, + "grad_norm": 1.6981522694050752, + "language_loss": 0.80653727, + "learning_rate": 3.657278602806357e-06, + "loss": 0.82842982, + "num_input_tokens_seen": 76619580, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3550, + "time_per_iteration": 2.541501045227051 + }, + { + "auxiliary_loss_clip": 0.01136887, + "auxiliary_loss_mlp": 0.01044073, + "balance_loss_clip": 1.02883255, + "balance_loss_mlp": 1.04706621, + "epoch": 0.21349767022395913, + "flos": 19277817805440.0, + "grad_norm": 1.615115943492657, + "language_loss": 0.88341218, + "learning_rate": 3.657060557391621e-06, + "loss": 0.90522182, + "num_input_tokens_seen": 76638195, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8984375, + "step": 3551, + "time_per_iteration": 2.5310463905334473 + }, + { + "auxiliary_loss_clip": 0.01136336, + "auxiliary_loss_mlp": 0.0104486, + "balance_loss_clip": 1.02887464, + "balance_loss_mlp": 1.04430258, + "epoch": 0.2135577934766271, + "flos": 17347547808000.0, + "grad_norm": 2.1215125327645152, + "language_loss": 0.83415043, + "learning_rate": 3.656842449140983e-06, + "loss": 0.8559624, + "num_input_tokens_seen": 76656695, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.91796875, + "step": 3552, + "time_per_iteration": 3.974120616912842 + }, + { + "auxiliary_loss_clip": 0.0113546, + "auxiliary_loss_mlp": 0.01048241, + "balance_loss_clip": 1.03164101, + "balance_loss_mlp": 1.04522753, + "epoch": 0.21361791672929506, + "flos": 24057204635520.0, + "grad_norm": 1.7556537525349103, + "language_loss": 0.76692683, + "learning_rate": 3.656624278062713e-06, + "loss": 0.78876388, + "num_input_tokens_seen": 76677430, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 3553, + "time_per_iteration": 3.964289903640747 + }, + { + "auxiliary_loss_clip": 0.0113607, + "auxiliary_loss_mlp": 0.01040019, + "balance_loss_clip": 1.02520156, + "balance_loss_mlp": 1.04556942, + "epoch": 0.21367803998196302, + "flos": 22162306556160.0, + "grad_norm": 1.6502841430946371, + "language_loss": 0.72946119, + "learning_rate": 3.6564060441650843e-06, + "loss": 0.75122207, + "num_input_tokens_seen": 76697615, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 3554, + "time_per_iteration": 2.5141818523406982 + }, + { + "auxiliary_loss_clip": 0.01137832, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02121508, + "balance_loss_mlp": 1.04672861, + "epoch": 0.213738163234631, + "flos": 20886867452160.0, + "grad_norm": 1.9371755733444218, + "language_loss": 0.6745261, + "learning_rate": 3.6561877474563724e-06, + "loss": 0.69627374, + "num_input_tokens_seen": 76715685, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.91015625, + "step": 3555, + "time_per_iteration": 2.6116089820861816 + }, + { + "auxiliary_loss_clip": 0.01138406, + "auxiliary_loss_mlp": 0.01039389, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04564714, + "epoch": 0.21379828648729896, + "flos": 28403114135040.0, + "grad_norm": 2.2550763051095752, + "language_loss": 0.64778429, + "learning_rate": 3.6559693879448553e-06, + "loss": 0.66956222, + "num_input_tokens_seen": 76735405, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3556, + "time_per_iteration": 2.553746223449707 + }, + { + "auxiliary_loss_clip": 0.01139299, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02893996, + "balance_loss_mlp": 1.04656768, + "epoch": 0.21385840973996692, + "flos": 25479662106240.0, + "grad_norm": 1.6295299556205536, + "language_loss": 0.72333252, + "learning_rate": 3.6557509656388125e-06, + "loss": 0.74518251, + "num_input_tokens_seen": 76754395, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3557, + "time_per_iteration": 2.6562533378601074 + }, + { + "auxiliary_loss_clip": 0.0114268, + "auxiliary_loss_mlp": 0.01039642, + "balance_loss_clip": 1.02189136, + "balance_loss_mlp": 1.04716706, + "epoch": 0.2139185329926349, + "flos": 28074280101120.0, + "grad_norm": 1.6722734443717013, + "language_loss": 0.67139357, + "learning_rate": 3.655532480546528e-06, + "loss": 0.6932168, + "num_input_tokens_seen": 76777210, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3558, + "time_per_iteration": 2.5435290336608887 + }, + { + "auxiliary_loss_clip": 0.01142773, + "auxiliary_loss_mlp": 0.01036302, + "balance_loss_clip": 1.01943386, + "balance_loss_mlp": 1.04542494, + "epoch": 0.21397865624530288, + "flos": 19608698914560.0, + "grad_norm": 1.8839208997443517, + "language_loss": 0.79702216, + "learning_rate": 3.655313932676286e-06, + "loss": 0.81881285, + "num_input_tokens_seen": 76795830, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3559, + "time_per_iteration": 2.5535330772399902 + }, + { + "auxiliary_loss_clip": 0.01137143, + "auxiliary_loss_mlp": 0.01044167, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04436731, + "epoch": 0.21403877949797084, + "flos": 24681476033280.0, + "grad_norm": 1.6653874224583467, + "language_loss": 0.67549068, + "learning_rate": 3.655095322036373e-06, + "loss": 0.69730377, + "num_input_tokens_seen": 76814700, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9296875, + "step": 3560, + "time_per_iteration": 2.5241451263427734 + }, + { + "auxiliary_loss_clip": 0.0114283, + "auxiliary_loss_mlp": 0.01041365, + "balance_loss_clip": 1.02514052, + "balance_loss_mlp": 1.04846883, + "epoch": 0.2140989027506388, + "flos": 19861150677120.0, + "grad_norm": 1.8721878156787213, + "language_loss": 0.72995424, + "learning_rate": 3.65487664863508e-06, + "loss": 0.75179613, + "num_input_tokens_seen": 76833400, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9453125, + "step": 3561, + "time_per_iteration": 2.5678720474243164 + }, + { + "auxiliary_loss_clip": 0.01142897, + "auxiliary_loss_mlp": 0.01044952, + "balance_loss_clip": 1.02817965, + "balance_loss_mlp": 1.04897678, + "epoch": 0.21415902600330677, + "flos": 19135324552320.0, + "grad_norm": 2.2783713689110243, + "language_loss": 0.77110738, + "learning_rate": 3.654657912480698e-06, + "loss": 0.79298586, + "num_input_tokens_seen": 76850645, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3562, + "time_per_iteration": 2.4598803520202637 + }, + { + "auxiliary_loss_clip": 0.01140561, + "auxiliary_loss_mlp": 0.01038127, + "balance_loss_clip": 1.02160454, + "balance_loss_mlp": 1.04795694, + "epoch": 0.21421914925597474, + "flos": 22272624201600.0, + "grad_norm": 1.5929440625910447, + "language_loss": 0.84534913, + "learning_rate": 3.6544391135815237e-06, + "loss": 0.867136, + "num_input_tokens_seen": 76870135, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.92578125, + "step": 3563, + "time_per_iteration": 2.5654757022857666 + }, + { + "auxiliary_loss_clip": 0.0114087, + "auxiliary_loss_mlp": 0.01038331, + "balance_loss_clip": 1.02227342, + "balance_loss_mlp": 1.04757166, + "epoch": 0.2142792725086427, + "flos": 33875109987840.0, + "grad_norm": 1.6134338415520206, + "language_loss": 0.76727796, + "learning_rate": 3.6542202519458507e-06, + "loss": 0.78907001, + "num_input_tokens_seen": 76893905, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.93359375, + "step": 3564, + "time_per_iteration": 2.591064214706421 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041793, + "balance_loss_clip": 1.02560401, + "balance_loss_mlp": 1.0467248, + "epoch": 0.2143393957613107, + "flos": 19860216923520.0, + "grad_norm": 1.880454163642384, + "language_loss": 0.88260084, + "learning_rate": 3.654001327581981e-06, + "loss": 0.90440416, + "num_input_tokens_seen": 76914205, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3565, + "time_per_iteration": 2.571242094039917 + }, + { + "auxiliary_loss_clip": 0.0105057, + "auxiliary_loss_mlp": 0.01009282, + "balance_loss_clip": 1.00739813, + "balance_loss_mlp": 1.0192101, + "epoch": 0.21439951901397866, + "flos": 68530093090560.0, + "grad_norm": 0.8403524328969202, + "language_loss": 0.52300179, + "learning_rate": 3.653782340498215e-06, + "loss": 0.54360026, + "num_input_tokens_seen": 76975650, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3125, + "step": 3566, + "time_per_iteration": 3.055588722229004 + }, + { + "auxiliary_loss_clip": 0.01136421, + "auxiliary_loss_mlp": 0.01036219, + "balance_loss_clip": 1.02093637, + "balance_loss_mlp": 1.04677701, + "epoch": 0.21445964226664663, + "flos": 19682998197120.0, + "grad_norm": 1.91490691342046, + "language_loss": 0.67412555, + "learning_rate": 3.6535632907028566e-06, + "loss": 0.69585192, + "num_input_tokens_seen": 76992615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3567, + "time_per_iteration": 2.5511529445648193 + }, + { + "auxiliary_loss_clip": 0.01135888, + "auxiliary_loss_mlp": 0.01041672, + "balance_loss_clip": 1.02630615, + "balance_loss_mlp": 1.04691041, + "epoch": 0.2145197655193146, + "flos": 31107259676160.0, + "grad_norm": 1.6974661731729381, + "language_loss": 0.74437779, + "learning_rate": 3.6533441782042126e-06, + "loss": 0.7661534, + "num_input_tokens_seen": 77017005, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 3568, + "time_per_iteration": 2.613090753555298 + }, + { + "auxiliary_loss_clip": 0.01137867, + "auxiliary_loss_mlp": 0.01043309, + "balance_loss_clip": 1.02710819, + "balance_loss_mlp": 1.04578757, + "epoch": 0.21457988877198256, + "flos": 20120785159680.0, + "grad_norm": 1.7479940521784256, + "language_loss": 0.77864397, + "learning_rate": 3.6531250030105917e-06, + "loss": 0.80045569, + "num_input_tokens_seen": 77034990, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3569, + "time_per_iteration": 2.567439317703247 + }, + { + "auxiliary_loss_clip": 0.01147794, + "auxiliary_loss_mlp": 0.01038363, + "balance_loss_clip": 1.01981413, + "balance_loss_mlp": 1.05039883, + "epoch": 0.21464001202465052, + "flos": 18588045957120.0, + "grad_norm": 2.3364918832975317, + "language_loss": 0.69533777, + "learning_rate": 3.6529057651303053e-06, + "loss": 0.71719933, + "num_input_tokens_seen": 77052610, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9765625, + "step": 3570, + "time_per_iteration": 2.489550828933716 + }, + { + "auxiliary_loss_clip": 0.01144243, + "auxiliary_loss_mlp": 0.01043749, + "balance_loss_clip": 1.02703631, + "balance_loss_mlp": 1.0480299, + "epoch": 0.21470013527731852, + "flos": 21835160461440.0, + "grad_norm": 2.465398793786977, + "language_loss": 0.78108835, + "learning_rate": 3.6526864645716666e-06, + "loss": 0.80296826, + "num_input_tokens_seen": 77072475, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9609375, + "step": 3571, + "time_per_iteration": 2.527509927749634 + }, + { + "auxiliary_loss_clip": 0.01143428, + "auxiliary_loss_mlp": 0.01043615, + "balance_loss_clip": 1.02556705, + "balance_loss_mlp": 1.0501976, + "epoch": 0.21476025852998648, + "flos": 17603195880960.0, + "grad_norm": 2.5347995603010767, + "language_loss": 0.82851684, + "learning_rate": 3.652467101342991e-06, + "loss": 0.85038722, + "num_input_tokens_seen": 77089930, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3572, + "time_per_iteration": 2.491955280303955 + }, + { + "auxiliary_loss_clip": 0.01144597, + "auxiliary_loss_mlp": 0.01039432, + "balance_loss_clip": 1.02248025, + "balance_loss_mlp": 1.04700291, + "epoch": 0.21482038178265445, + "flos": 24828135264000.0, + "grad_norm": 2.35018592277076, + "language_loss": 0.64916813, + "learning_rate": 3.652247675452598e-06, + "loss": 0.67100847, + "num_input_tokens_seen": 77108970, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9765625, + "step": 3573, + "time_per_iteration": 2.5238969326019287 + }, + { + "auxiliary_loss_clip": 0.01133482, + "auxiliary_loss_mlp": 0.01040895, + "balance_loss_clip": 1.02481413, + "balance_loss_mlp": 1.04417133, + "epoch": 0.2148805050353224, + "flos": 23258228463360.0, + "grad_norm": 2.2164535787006705, + "language_loss": 0.75577438, + "learning_rate": 3.652028186908807e-06, + "loss": 0.77751815, + "num_input_tokens_seen": 77126045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3574, + "time_per_iteration": 2.5497734546661377 + }, + { + "auxiliary_loss_clip": 0.01137499, + "auxiliary_loss_mlp": 0.01035076, + "balance_loss_clip": 1.01752853, + "balance_loss_mlp": 1.04568887, + "epoch": 0.21494062828799038, + "flos": 21321098968320.0, + "grad_norm": 1.959683075701339, + "language_loss": 0.72380054, + "learning_rate": 3.6518086357199416e-06, + "loss": 0.74552631, + "num_input_tokens_seen": 77144600, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91796875, + "step": 3575, + "time_per_iteration": 2.539255142211914 + }, + { + "auxiliary_loss_clip": 0.01141362, + "auxiliary_loss_mlp": 0.01036894, + "balance_loss_clip": 1.02097976, + "balance_loss_mlp": 1.04890776, + "epoch": 0.21500075154065834, + "flos": 18843334894080.0, + "grad_norm": 1.6473570004326006, + "language_loss": 0.68102455, + "learning_rate": 3.6515890218943277e-06, + "loss": 0.70280713, + "num_input_tokens_seen": 77162965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3576, + "time_per_iteration": 2.515245199203491 + }, + { + "auxiliary_loss_clip": 0.01144679, + "auxiliary_loss_mlp": 0.01041063, + "balance_loss_clip": 1.02347922, + "balance_loss_mlp": 1.04820943, + "epoch": 0.2150608747933263, + "flos": 18441997257600.0, + "grad_norm": 2.1450103743023936, + "language_loss": 0.88840854, + "learning_rate": 3.651369345440292e-06, + "loss": 0.91026592, + "num_input_tokens_seen": 77179960, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9609375, + "step": 3577, + "time_per_iteration": 2.4426753520965576 + }, + { + "auxiliary_loss_clip": 0.01054886, + "auxiliary_loss_mlp": 0.01006787, + "balance_loss_clip": 1.00466526, + "balance_loss_mlp": 1.02252448, + "epoch": 0.2151209980459943, + "flos": 66598242894720.0, + "grad_norm": 0.8177210285410575, + "language_loss": 0.56242883, + "learning_rate": 3.6511496063661654e-06, + "loss": 0.5830456, + "num_input_tokens_seen": 77239500, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.32421875, + "step": 3578, + "time_per_iteration": 3.0434820652008057 + }, + { + "auxiliary_loss_clip": 0.0114273, + "auxiliary_loss_mlp": 0.01039801, + "balance_loss_clip": 1.02345788, + "balance_loss_mlp": 1.04957211, + "epoch": 0.21518112129866226, + "flos": 21575885114880.0, + "grad_norm": 1.6812319537870581, + "language_loss": 0.88500881, + "learning_rate": 3.6509298046802807e-06, + "loss": 0.90683413, + "num_input_tokens_seen": 77254680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3579, + "time_per_iteration": 2.4646458625793457 + }, + { + "auxiliary_loss_clip": 0.01140846, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02551329, + "balance_loss_mlp": 1.04618824, + "epoch": 0.21524124455133023, + "flos": 20047635112320.0, + "grad_norm": 1.7668055337606152, + "language_loss": 0.78238297, + "learning_rate": 3.650709940390972e-06, + "loss": 0.80421615, + "num_input_tokens_seen": 77274060, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3580, + "time_per_iteration": 2.5029854774475098 + }, + { + "auxiliary_loss_clip": 0.01138764, + "auxiliary_loss_mlp": 0.01042384, + "balance_loss_clip": 1.02557576, + "balance_loss_mlp": 1.04757452, + "epoch": 0.2153013678039982, + "flos": 23951807153280.0, + "grad_norm": 1.7955176576656944, + "language_loss": 0.73129165, + "learning_rate": 3.6504900135065775e-06, + "loss": 0.75310302, + "num_input_tokens_seen": 77293255, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3581, + "time_per_iteration": 2.503103733062744 + }, + { + "auxiliary_loss_clip": 0.01137091, + "auxiliary_loss_mlp": 0.0104596, + "balance_loss_clip": 1.02723205, + "balance_loss_mlp": 1.04665411, + "epoch": 0.21536149105666616, + "flos": 20594841880320.0, + "grad_norm": 2.610409860459302, + "language_loss": 0.70739609, + "learning_rate": 3.6502700240354357e-06, + "loss": 0.72922659, + "num_input_tokens_seen": 77312390, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.90234375, + "step": 3582, + "time_per_iteration": 2.4840197563171387 + }, + { + "auxiliary_loss_clip": 0.01137402, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02401567, + "balance_loss_mlp": 1.04602027, + "epoch": 0.21542161430933413, + "flos": 12860042895360.0, + "grad_norm": 2.8570718584923633, + "language_loss": 0.84140432, + "learning_rate": 3.650049971985889e-06, + "loss": 0.86319172, + "num_input_tokens_seen": 77330985, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3583, + "time_per_iteration": 2.4435312747955322 + }, + { + "auxiliary_loss_clip": 0.01143933, + "auxiliary_loss_mlp": 0.01045352, + "balance_loss_clip": 1.02834046, + "balance_loss_mlp": 1.04859185, + "epoch": 0.21548173756200212, + "flos": 26103933504000.0, + "grad_norm": 3.180305067245919, + "language_loss": 0.83226246, + "learning_rate": 3.6498298573662824e-06, + "loss": 0.8541553, + "num_input_tokens_seen": 77350770, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.953125, + "step": 3584, + "time_per_iteration": 2.521476984024048 + }, + { + "auxiliary_loss_clip": 0.01136808, + "auxiliary_loss_mlp": 0.01046426, + "balance_loss_clip": 1.02816272, + "balance_loss_mlp": 1.04518461, + "epoch": 0.21554186081467008, + "flos": 22163779013760.0, + "grad_norm": 2.0358477693345667, + "language_loss": 0.90233314, + "learning_rate": 3.6496096801849625e-06, + "loss": 0.92416549, + "num_input_tokens_seen": 77370510, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.91796875, + "step": 3585, + "time_per_iteration": 2.464745283126831 + }, + { + "auxiliary_loss_clip": 0.01140925, + "auxiliary_loss_mlp": 0.01042254, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04832685, + "epoch": 0.21560198406733805, + "flos": 22966741595520.0, + "grad_norm": 2.8296186032289348, + "language_loss": 0.74414444, + "learning_rate": 3.649389440450277e-06, + "loss": 0.76597619, + "num_input_tokens_seen": 77390645, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3586, + "time_per_iteration": 2.5062146186828613 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01042527, + "balance_loss_clip": 1.02668393, + "balance_loss_mlp": 1.04796743, + "epoch": 0.215662107320006, + "flos": 22784064001920.0, + "grad_norm": 2.1680236591426416, + "language_loss": 0.83055526, + "learning_rate": 3.6491691381705804e-06, + "loss": 0.85239077, + "num_input_tokens_seen": 77409655, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3587, + "time_per_iteration": 2.4784295558929443 + }, + { + "auxiliary_loss_clip": 0.01138799, + "auxiliary_loss_mlp": 0.01041925, + "balance_loss_clip": 1.02438986, + "balance_loss_mlp": 1.04664946, + "epoch": 0.21572223057267398, + "flos": 30883859038080.0, + "grad_norm": 1.8176747371086701, + "language_loss": 0.75756669, + "learning_rate": 3.648948773354224e-06, + "loss": 0.77937388, + "num_input_tokens_seen": 77430560, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3588, + "time_per_iteration": 2.5896053314208984 + }, + { + "auxiliary_loss_clip": 0.01137468, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02294254, + "balance_loss_mlp": 1.04534698, + "epoch": 0.21578235382534194, + "flos": 26910487445760.0, + "grad_norm": 1.8272464683057401, + "language_loss": 0.81006658, + "learning_rate": 3.6487283460095643e-06, + "loss": 0.83183837, + "num_input_tokens_seen": 77455000, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3589, + "time_per_iteration": 2.540090799331665 + }, + { + "auxiliary_loss_clip": 0.01142059, + "auxiliary_loss_mlp": 0.01039493, + "balance_loss_clip": 1.02341199, + "balance_loss_mlp": 1.04792953, + "epoch": 0.2158424770780099, + "flos": 24425720219520.0, + "grad_norm": 2.6129530472479154, + "language_loss": 0.72591126, + "learning_rate": 3.648507856144961e-06, + "loss": 0.74772674, + "num_input_tokens_seen": 77475075, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.94140625, + "step": 3590, + "time_per_iteration": 2.5113861560821533 + }, + { + "auxiliary_loss_clip": 0.01145271, + "auxiliary_loss_mlp": 0.01046731, + "balance_loss_clip": 1.02769351, + "balance_loss_mlp": 1.04830956, + "epoch": 0.2159026003306779, + "flos": 23949975559680.0, + "grad_norm": 2.0133132975130477, + "language_loss": 0.83914638, + "learning_rate": 3.648287303768775e-06, + "loss": 0.86106646, + "num_input_tokens_seen": 77495945, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.96875, + "step": 3591, + "time_per_iteration": 2.488309621810913 + }, + { + "auxiliary_loss_clip": 0.01145642, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03167534, + "balance_loss_mlp": 1.04884136, + "epoch": 0.21596272358334587, + "flos": 30040963511040.0, + "grad_norm": 2.271326779903827, + "language_loss": 0.69294131, + "learning_rate": 3.6480666888893686e-06, + "loss": 0.71490723, + "num_input_tokens_seen": 77517140, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.96875, + "step": 3592, + "time_per_iteration": 2.571373462677002 + }, + { + "auxiliary_loss_clip": 0.01143219, + "auxiliary_loss_mlp": 0.01049672, + "balance_loss_clip": 1.03150403, + "balance_loss_mlp": 1.04881072, + "epoch": 0.21602284683601383, + "flos": 20376217751040.0, + "grad_norm": 2.3999192225546677, + "language_loss": 0.84150124, + "learning_rate": 3.647846011515108e-06, + "loss": 0.86343014, + "num_input_tokens_seen": 77536085, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9453125, + "step": 3593, + "time_per_iteration": 2.4590611457824707 + }, + { + "auxiliary_loss_clip": 0.01144804, + "auxiliary_loss_mlp": 0.01049477, + "balance_loss_clip": 1.03210783, + "balance_loss_mlp": 1.04839182, + "epoch": 0.2160829700886818, + "flos": 20777339905920.0, + "grad_norm": 2.850380650061706, + "language_loss": 0.75163305, + "learning_rate": 3.6476252716543625e-06, + "loss": 0.77357584, + "num_input_tokens_seen": 77553675, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.96484375, + "step": 3594, + "time_per_iteration": 3.9338901042938232 + }, + { + "auxiliary_loss_clip": 0.01139476, + "auxiliary_loss_mlp": 0.01043593, + "balance_loss_clip": 1.02666509, + "balance_loss_mlp": 1.04763508, + "epoch": 0.21614309334134976, + "flos": 22309755886080.0, + "grad_norm": 2.0680180645872057, + "language_loss": 0.80541027, + "learning_rate": 3.6474044693155007e-06, + "loss": 0.82724094, + "num_input_tokens_seen": 77573360, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3595, + "time_per_iteration": 3.9857921600341797 + }, + { + "auxiliary_loss_clip": 0.01146272, + "auxiliary_loss_mlp": 0.01043286, + "balance_loss_clip": 1.0259887, + "balance_loss_mlp": 1.04883027, + "epoch": 0.21620321659401773, + "flos": 19609524927360.0, + "grad_norm": 2.3330392864683347, + "language_loss": 0.78089929, + "learning_rate": 3.647183604506897e-06, + "loss": 0.80279487, + "num_input_tokens_seen": 77591865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.97265625, + "step": 3596, + "time_per_iteration": 2.4515480995178223 + }, + { + "auxiliary_loss_clip": 0.01138472, + "auxiliary_loss_mlp": 0.0104618, + "balance_loss_clip": 1.03006268, + "balance_loss_mlp": 1.04786897, + "epoch": 0.2162633398466857, + "flos": 18844555956480.0, + "grad_norm": 1.9545740457841054, + "language_loss": 0.83011472, + "learning_rate": 3.6469626772369253e-06, + "loss": 0.85196126, + "num_input_tokens_seen": 77611600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3597, + "time_per_iteration": 2.504703998565674 + }, + { + "auxiliary_loss_clip": 0.011446, + "auxiliary_loss_mlp": 0.01045187, + "balance_loss_clip": 1.02756798, + "balance_loss_mlp": 1.05029655, + "epoch": 0.21632346309935369, + "flos": 18768820129920.0, + "grad_norm": 1.5849845027976412, + "language_loss": 0.80171728, + "learning_rate": 3.6467416875139642e-06, + "loss": 0.82361513, + "num_input_tokens_seen": 77630665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3598, + "time_per_iteration": 2.487013101577759 + }, + { + "auxiliary_loss_clip": 0.0114385, + "auxiliary_loss_mlp": 0.01045551, + "balance_loss_clip": 1.02745485, + "balance_loss_mlp": 1.0476619, + "epoch": 0.21638358635202165, + "flos": 26324173745280.0, + "grad_norm": 1.8175927270691912, + "language_loss": 0.82054996, + "learning_rate": 3.6465206353463934e-06, + "loss": 0.842444, + "num_input_tokens_seen": 77650835, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9609375, + "step": 3599, + "time_per_iteration": 2.5515315532684326 + }, + { + "auxiliary_loss_clip": 0.0113915, + "auxiliary_loss_mlp": 0.0104149, + "balance_loss_clip": 1.0253613, + "balance_loss_mlp": 1.04831243, + "epoch": 0.21644370960468962, + "flos": 20740854666240.0, + "grad_norm": 3.186477441139726, + "language_loss": 0.7654863, + "learning_rate": 3.6462995207425947e-06, + "loss": 0.78729272, + "num_input_tokens_seen": 77669000, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3600, + "time_per_iteration": 2.5067033767700195 + }, + { + "auxiliary_loss_clip": 0.01139528, + "auxiliary_loss_mlp": 0.01043686, + "balance_loss_clip": 1.02842712, + "balance_loss_mlp": 1.04657555, + "epoch": 0.21650383285735758, + "flos": 23952238116480.0, + "grad_norm": 1.9514188507385115, + "language_loss": 0.80026001, + "learning_rate": 3.6460783437109533e-06, + "loss": 0.82209218, + "num_input_tokens_seen": 77688745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.9296875, + "step": 3601, + "time_per_iteration": 2.5383710861206055 + }, + { + "auxiliary_loss_clip": 0.01142747, + "auxiliary_loss_mlp": 0.01047381, + "balance_loss_clip": 1.0306437, + "balance_loss_mlp": 1.04938436, + "epoch": 0.21656395611002555, + "flos": 23696087253120.0, + "grad_norm": 1.8096424478422806, + "language_loss": 0.83358335, + "learning_rate": 3.6458571042598565e-06, + "loss": 0.85548466, + "num_input_tokens_seen": 77708445, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3602, + "time_per_iteration": 2.525151491165161 + }, + { + "auxiliary_loss_clip": 0.01140411, + "auxiliary_loss_mlp": 0.01048188, + "balance_loss_clip": 1.03065276, + "balance_loss_mlp": 1.04670155, + "epoch": 0.2166240793626935, + "flos": 20666052593280.0, + "grad_norm": 1.6489882186888527, + "language_loss": 0.74271673, + "learning_rate": 3.645635802397693e-06, + "loss": 0.76460266, + "num_input_tokens_seen": 77728465, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3603, + "time_per_iteration": 2.5083842277526855 + }, + { + "auxiliary_loss_clip": 0.01140372, + "auxiliary_loss_mlp": 0.01043135, + "balance_loss_clip": 1.02723289, + "balance_loss_mlp": 1.05022252, + "epoch": 0.2166842026153615, + "flos": 21580410228480.0, + "grad_norm": 1.5478742891076147, + "language_loss": 0.73956323, + "learning_rate": 3.645414438132855e-06, + "loss": 0.76139832, + "num_input_tokens_seen": 77746735, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3604, + "time_per_iteration": 2.5100204944610596 + }, + { + "auxiliary_loss_clip": 0.01137594, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.02598965, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21674432586802947, + "flos": 25629948610560.0, + "grad_norm": 2.2268823896980376, + "language_loss": 0.80375803, + "learning_rate": 3.6451930114737366e-06, + "loss": 0.82556069, + "num_input_tokens_seen": 77768105, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.890625, + "step": 3605, + "time_per_iteration": 2.5182228088378906 + }, + { + "auxiliary_loss_clip": 0.01064224, + "auxiliary_loss_mlp": 0.01010449, + "balance_loss_clip": 1.0086962, + "balance_loss_mlp": 1.02975249, + "epoch": 0.21680444912069743, + "flos": 56417783616000.0, + "grad_norm": 0.6948121220218867, + "language_loss": 0.58376318, + "learning_rate": 3.6449715224287347e-06, + "loss": 0.60450989, + "num_input_tokens_seen": 77833750, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.34375, + "step": 3606, + "time_per_iteration": 3.1655373573303223 + }, + { + "auxiliary_loss_clip": 0.01145196, + "auxiliary_loss_mlp": 0.01046918, + "balance_loss_clip": 1.02921534, + "balance_loss_mlp": 1.04939568, + "epoch": 0.2168645723733654, + "flos": 23878944414720.0, + "grad_norm": 2.6754398361548613, + "language_loss": 0.73210037, + "learning_rate": 3.644749971006248e-06, + "loss": 0.75402147, + "num_input_tokens_seen": 77853780, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9609375, + "step": 3607, + "time_per_iteration": 2.508920431137085 + }, + { + "auxiliary_loss_clip": 0.01146221, + "auxiliary_loss_mlp": 0.0104816, + "balance_loss_clip": 1.02995718, + "balance_loss_mlp": 1.04935443, + "epoch": 0.21692469562603336, + "flos": 16946174257920.0, + "grad_norm": 2.5718647894236053, + "language_loss": 0.76626337, + "learning_rate": 3.6445283572146765e-06, + "loss": 0.78820717, + "num_input_tokens_seen": 77872575, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3608, + "time_per_iteration": 2.440258502960205 + }, + { + "auxiliary_loss_clip": 0.01144868, + "auxiliary_loss_mlp": 0.01046046, + "balance_loss_clip": 1.02985787, + "balance_loss_mlp": 1.04866827, + "epoch": 0.21698481887870133, + "flos": 25119047514240.0, + "grad_norm": 1.796333172920123, + "language_loss": 0.74395084, + "learning_rate": 3.6443066810624255e-06, + "loss": 0.76586002, + "num_input_tokens_seen": 77892700, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3609, + "time_per_iteration": 2.5326688289642334 + }, + { + "auxiliary_loss_clip": 0.01143438, + "auxiliary_loss_mlp": 0.01048498, + "balance_loss_clip": 1.03137922, + "balance_loss_mlp": 1.04871368, + "epoch": 0.2170449421313693, + "flos": 17894682748800.0, + "grad_norm": 1.781486329059154, + "language_loss": 0.88848329, + "learning_rate": 3.6440849425579e-06, + "loss": 0.91040266, + "num_input_tokens_seen": 77911060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3610, + "time_per_iteration": 2.4611029624938965 + }, + { + "auxiliary_loss_clip": 0.01144855, + "auxiliary_loss_mlp": 0.01038228, + "balance_loss_clip": 1.02090693, + "balance_loss_mlp": 1.05045652, + "epoch": 0.2171050653840373, + "flos": 22638446265600.0, + "grad_norm": 2.036787917991119, + "language_loss": 0.77587712, + "learning_rate": 3.6438631417095095e-06, + "loss": 0.79770797, + "num_input_tokens_seen": 77929930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3611, + "time_per_iteration": 2.5187723636627197 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01044629, + "balance_loss_clip": 1.02829766, + "balance_loss_mlp": 1.04609489, + "epoch": 0.21716518863670525, + "flos": 19499997381120.0, + "grad_norm": 2.067133307741882, + "language_loss": 0.63197911, + "learning_rate": 3.6436412785256637e-06, + "loss": 0.65378946, + "num_input_tokens_seen": 77949060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 3612, + "time_per_iteration": 2.4585959911346436 + }, + { + "auxiliary_loss_clip": 0.0114176, + "auxiliary_loss_mlp": 0.01042113, + "balance_loss_clip": 1.02504194, + "balance_loss_mlp": 1.04799449, + "epoch": 0.21722531188937322, + "flos": 19792022952960.0, + "grad_norm": 1.9312736490377453, + "language_loss": 0.75120652, + "learning_rate": 3.643419353014776e-06, + "loss": 0.77304518, + "num_input_tokens_seen": 77967920, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9375, + "step": 3613, + "time_per_iteration": 2.4866983890533447 + }, + { + "auxiliary_loss_clip": 0.01136544, + "auxiliary_loss_mlp": 0.01046281, + "balance_loss_clip": 1.02900767, + "balance_loss_mlp": 1.04560208, + "epoch": 0.21728543514204118, + "flos": 13334386924800.0, + "grad_norm": 3.0184875495721, + "language_loss": 0.70767504, + "learning_rate": 3.643197365185261e-06, + "loss": 0.72950327, + "num_input_tokens_seen": 77985330, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 3614, + "time_per_iteration": 2.4454689025878906 + }, + { + "auxiliary_loss_clip": 0.01141605, + "auxiliary_loss_mlp": 0.01046562, + "balance_loss_clip": 1.0288837, + "balance_loss_mlp": 1.0491401, + "epoch": 0.21734555839470915, + "flos": 15231870783360.0, + "grad_norm": 1.8064523730299737, + "language_loss": 0.7314586, + "learning_rate": 3.6429753150455378e-06, + "loss": 0.75334036, + "num_input_tokens_seen": 78003105, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.92578125, + "step": 3615, + "time_per_iteration": 2.488711357116699 + }, + { + "auxiliary_loss_clip": 0.01145923, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02832997, + "balance_loss_mlp": 1.04751146, + "epoch": 0.2174056816473771, + "flos": 19973982274560.0, + "grad_norm": 2.7876016160510377, + "language_loss": 0.90045536, + "learning_rate": 3.6427532026040263e-06, + "loss": 0.92239082, + "num_input_tokens_seen": 78019655, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.984375, + "step": 3616, + "time_per_iteration": 2.4552054405212402 + }, + { + "auxiliary_loss_clip": 0.01143252, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02356279, + "balance_loss_mlp": 1.04853153, + "epoch": 0.21746580490004508, + "flos": 16687293960960.0, + "grad_norm": 2.4503731233397383, + "language_loss": 0.8111589, + "learning_rate": 3.642531027869148e-06, + "loss": 0.83300173, + "num_input_tokens_seen": 78036025, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3617, + "time_per_iteration": 2.465254068374634 + }, + { + "auxiliary_loss_clip": 0.01143954, + "auxiliary_loss_mlp": 0.01045828, + "balance_loss_clip": 1.02928162, + "balance_loss_mlp": 1.04851139, + "epoch": 0.21752592815271307, + "flos": 25772298209280.0, + "grad_norm": 1.7784831572545423, + "language_loss": 0.75509727, + "learning_rate": 3.642308790849329e-06, + "loss": 0.77699506, + "num_input_tokens_seen": 78055645, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3618, + "time_per_iteration": 2.5263705253601074 + }, + { + "auxiliary_loss_clip": 0.0114255, + "auxiliary_loss_mlp": 0.01049263, + "balance_loss_clip": 1.03103614, + "balance_loss_mlp": 1.04738426, + "epoch": 0.21758605140538104, + "flos": 11254692349440.0, + "grad_norm": 1.9247647214638754, + "language_loss": 0.69221723, + "learning_rate": 3.642086491552996e-06, + "loss": 0.71413535, + "num_input_tokens_seen": 78071660, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3619, + "time_per_iteration": 2.4615654945373535 + }, + { + "auxiliary_loss_clip": 0.01145954, + "auxiliary_loss_mlp": 0.01044723, + "balance_loss_clip": 1.02723491, + "balance_loss_mlp": 1.04906762, + "epoch": 0.217646174658049, + "flos": 19242625455360.0, + "grad_norm": 1.7662634429670958, + "language_loss": 0.78337491, + "learning_rate": 3.641864129988579e-06, + "loss": 0.80528164, + "num_input_tokens_seen": 78091265, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.96875, + "step": 3620, + "time_per_iteration": 2.4954700469970703 + }, + { + "auxiliary_loss_clip": 0.01133661, + "auxiliary_loss_mlp": 0.0103768, + "balance_loss_clip": 1.02116966, + "balance_loss_mlp": 1.04363799, + "epoch": 0.21770629791071697, + "flos": 21945083057280.0, + "grad_norm": 2.0129000326388695, + "language_loss": 0.79769373, + "learning_rate": 3.641641706164509e-06, + "loss": 0.81940717, + "num_input_tokens_seen": 78110095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90234375, + "step": 3621, + "time_per_iteration": 2.490427255630493 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01040724, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04595852, + "epoch": 0.21776642116338493, + "flos": 24936764970240.0, + "grad_norm": 1.7548460288059653, + "language_loss": 0.87967801, + "learning_rate": 3.641419220089221e-06, + "loss": 0.90146828, + "num_input_tokens_seen": 78129475, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.921875, + "step": 3622, + "time_per_iteration": 2.484462022781372 + }, + { + "auxiliary_loss_clip": 0.01142961, + "auxiliary_loss_mlp": 0.01040225, + "balance_loss_clip": 1.02067459, + "balance_loss_mlp": 1.04766297, + "epoch": 0.2178265444160529, + "flos": 17821317219840.0, + "grad_norm": 4.811459611972859, + "language_loss": 0.76945633, + "learning_rate": 3.641196671771152e-06, + "loss": 0.79128814, + "num_input_tokens_seen": 78146880, + "router_z_loss_clip": 0.1953125, + "router_z_loss_mlp": 0.94921875, + "step": 3623, + "time_per_iteration": 2.4476547241210938 + }, + { + "auxiliary_loss_clip": 0.0114403, + "auxiliary_loss_mlp": 0.01048104, + "balance_loss_clip": 1.02992439, + "balance_loss_mlp": 1.04891419, + "epoch": 0.2178866676687209, + "flos": 17712902995200.0, + "grad_norm": 2.1152987510548615, + "language_loss": 0.84886312, + "learning_rate": 3.640974061218741e-06, + "loss": 0.8707844, + "num_input_tokens_seen": 78165065, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3624, + "time_per_iteration": 2.444913387298584 + }, + { + "auxiliary_loss_clip": 0.0114445, + "auxiliary_loss_mlp": 0.010571, + "balance_loss_clip": 1.0397315, + "balance_loss_mlp": 1.0487287, + "epoch": 0.21794679092138886, + "flos": 16945851035520.0, + "grad_norm": 2.345969751242133, + "language_loss": 0.77035248, + "learning_rate": 3.640751388440429e-06, + "loss": 0.79236794, + "num_input_tokens_seen": 78180005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.95703125, + "step": 3625, + "time_per_iteration": 2.4511115550994873 + }, + { + "auxiliary_loss_clip": 0.01059313, + "auxiliary_loss_mlp": 0.01000008, + "balance_loss_clip": 0.99836272, + "balance_loss_mlp": 1.02361774, + "epoch": 0.21800691417405682, + "flos": 63718566566400.0, + "grad_norm": 0.8233389824181596, + "language_loss": 0.60720766, + "learning_rate": 3.64052865344466e-06, + "loss": 0.62780088, + "num_input_tokens_seen": 78245350, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.35546875, + "step": 3626, + "time_per_iteration": 3.21004319190979 + }, + { + "auxiliary_loss_clip": 0.0114194, + "auxiliary_loss_mlp": 0.01047127, + "balance_loss_clip": 1.02858984, + "balance_loss_mlp": 1.04572678, + "epoch": 0.21806703742672479, + "flos": 21616392677760.0, + "grad_norm": 1.8978511257882154, + "language_loss": 0.90608853, + "learning_rate": 3.6403058562398795e-06, + "loss": 0.92797917, + "num_input_tokens_seen": 78264165, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9609375, + "step": 3627, + "time_per_iteration": 2.4744250774383545 + }, + { + "auxiliary_loss_clip": 0.01138482, + "auxiliary_loss_mlp": 0.01041219, + "balance_loss_clip": 1.02346826, + "balance_loss_mlp": 1.04541492, + "epoch": 0.21812716067939275, + "flos": 19354882435200.0, + "grad_norm": 1.8495097769686537, + "language_loss": 0.73612916, + "learning_rate": 3.6400829968345365e-06, + "loss": 0.75792623, + "num_input_tokens_seen": 78283745, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3628, + "time_per_iteration": 2.4595446586608887 + }, + { + "auxiliary_loss_clip": 0.01137064, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02232444, + "balance_loss_mlp": 1.04432046, + "epoch": 0.21818728393206072, + "flos": 23548063305600.0, + "grad_norm": 1.99633175048199, + "language_loss": 0.76800162, + "learning_rate": 3.6398600752370826e-06, + "loss": 0.78976429, + "num_input_tokens_seen": 78302900, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3629, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01140004, + "auxiliary_loss_mlp": 0.01041342, + "balance_loss_clip": 1.02514172, + "balance_loss_mlp": 1.04701388, + "epoch": 0.21824740718472868, + "flos": 30225652266240.0, + "grad_norm": 1.5547294213075904, + "language_loss": 0.71320152, + "learning_rate": 3.63963709145597e-06, + "loss": 0.73501503, + "num_input_tokens_seen": 78326470, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9296875, + "step": 3630, + "time_per_iteration": 2.608846426010132 + }, + { + "auxiliary_loss_clip": 0.01134439, + "auxiliary_loss_mlp": 0.01042587, + "balance_loss_clip": 1.0277338, + "balance_loss_mlp": 1.04635286, + "epoch": 0.21830753043739667, + "flos": 26134672567680.0, + "grad_norm": 1.8110131954886999, + "language_loss": 0.76331747, + "learning_rate": 3.6394140454996544e-06, + "loss": 0.78508776, + "num_input_tokens_seen": 78345810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3631, + "time_per_iteration": 2.53765869140625 + }, + { + "auxiliary_loss_clip": 0.01138964, + "auxiliary_loss_mlp": 0.0103994, + "balance_loss_clip": 1.0237397, + "balance_loss_mlp": 1.0455693, + "epoch": 0.21836765369006464, + "flos": 21720712752000.0, + "grad_norm": 2.0710075205659906, + "language_loss": 0.74879777, + "learning_rate": 3.639190937376594e-06, + "loss": 0.77058685, + "num_input_tokens_seen": 78364085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.93359375, + "step": 3632, + "time_per_iteration": 2.484896421432495 + }, + { + "auxiliary_loss_clip": 0.01136054, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.04511309, + "epoch": 0.2184277769427326, + "flos": 19937604775680.0, + "grad_norm": 1.966664682342333, + "language_loss": 0.83337629, + "learning_rate": 3.638967767095249e-06, + "loss": 0.8550964, + "num_input_tokens_seen": 78381385, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.91015625, + "step": 3633, + "time_per_iteration": 2.4721779823303223 + }, + { + "auxiliary_loss_clip": 0.01136294, + "auxiliary_loss_mlp": 0.01048418, + "balance_loss_clip": 1.03228879, + "balance_loss_mlp": 1.04592657, + "epoch": 0.21848790019540057, + "flos": 20340235301760.0, + "grad_norm": 1.8655293845238095, + "language_loss": 0.81782126, + "learning_rate": 3.6387445346640823e-06, + "loss": 0.83966839, + "num_input_tokens_seen": 78400500, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3634, + "time_per_iteration": 2.5514795780181885 + }, + { + "auxiliary_loss_clip": 0.01144011, + "auxiliary_loss_mlp": 0.01041001, + "balance_loss_clip": 1.02468133, + "balance_loss_mlp": 1.04863131, + "epoch": 0.21854802344806853, + "flos": 15450818135040.0, + "grad_norm": 2.010090632845536, + "language_loss": 0.75077927, + "learning_rate": 3.638521240091558e-06, + "loss": 0.77262932, + "num_input_tokens_seen": 78418340, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.953125, + "step": 3635, + "time_per_iteration": 4.07889199256897 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01053987, + "balance_loss_clip": 1.03775024, + "balance_loss_mlp": 1.04744601, + "epoch": 0.2186081467007365, + "flos": 16320717711360.0, + "grad_norm": 2.2167396678675155, + "language_loss": 0.87881035, + "learning_rate": 3.6382978833861445e-06, + "loss": 0.90072685, + "num_input_tokens_seen": 78434375, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3636, + "time_per_iteration": 3.9134533405303955 + }, + { + "auxiliary_loss_clip": 0.01138959, + "auxiliary_loss_mlp": 0.01051316, + "balance_loss_clip": 1.03406608, + "balance_loss_mlp": 1.0456109, + "epoch": 0.2186682699534045, + "flos": 21689255416320.0, + "grad_norm": 1.9800006249435054, + "language_loss": 0.75948632, + "learning_rate": 3.638074464556311e-06, + "loss": 0.78138912, + "num_input_tokens_seen": 78451735, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 3637, + "time_per_iteration": 2.5531604290008545 + }, + { + "auxiliary_loss_clip": 0.01143812, + "auxiliary_loss_mlp": 0.01042573, + "balance_loss_clip": 1.02445328, + "balance_loss_mlp": 1.04728055, + "epoch": 0.21872839320607246, + "flos": 17739260599680.0, + "grad_norm": 4.376345077988984, + "language_loss": 0.89677018, + "learning_rate": 3.63785098361053e-06, + "loss": 0.91863406, + "num_input_tokens_seen": 78462730, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96875, + "step": 3638, + "time_per_iteration": 2.435544967651367 + }, + { + "auxiliary_loss_clip": 0.01140476, + "auxiliary_loss_mlp": 0.01050633, + "balance_loss_clip": 1.03377736, + "balance_loss_mlp": 1.04854274, + "epoch": 0.21878851645874042, + "flos": 18652289431680.0, + "grad_norm": 2.382131601644944, + "language_loss": 0.89958721, + "learning_rate": 3.637627440557275e-06, + "loss": 0.9214983, + "num_input_tokens_seen": 78476300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3639, + "time_per_iteration": 2.448150634765625 + }, + { + "auxiliary_loss_clip": 0.01138473, + "auxiliary_loss_mlp": 0.01045558, + "balance_loss_clip": 1.02972686, + "balance_loss_mlp": 1.04632282, + "epoch": 0.2188486397114084, + "flos": 25557301353600.0, + "grad_norm": 1.7796744672676124, + "language_loss": 0.79038727, + "learning_rate": 3.637403835405024e-06, + "loss": 0.81222755, + "num_input_tokens_seen": 78496135, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3640, + "time_per_iteration": 2.544577121734619 + }, + { + "auxiliary_loss_clip": 0.01142754, + "auxiliary_loss_mlp": 0.01051502, + "balance_loss_clip": 1.03291786, + "balance_loss_mlp": 1.05100346, + "epoch": 0.21890876296407635, + "flos": 17892061056000.0, + "grad_norm": 2.046383525913898, + "language_loss": 0.72049212, + "learning_rate": 3.637180168162255e-06, + "loss": 0.74243474, + "num_input_tokens_seen": 78513855, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.91796875, + "step": 3641, + "time_per_iteration": 2.465439558029175 + }, + { + "auxiliary_loss_clip": 0.01142611, + "auxiliary_loss_mlp": 0.01042223, + "balance_loss_clip": 1.02610588, + "balance_loss_mlp": 1.05203855, + "epoch": 0.21896888621674432, + "flos": 17749100926080.0, + "grad_norm": 2.4771917366671, + "language_loss": 0.80913448, + "learning_rate": 3.63695643883745e-06, + "loss": 0.8309828, + "num_input_tokens_seen": 78531740, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90625, + "step": 3642, + "time_per_iteration": 2.4598801136016846 + }, + { + "auxiliary_loss_clip": 0.01144439, + "auxiliary_loss_mlp": 0.01040377, + "balance_loss_clip": 1.02319944, + "balance_loss_mlp": 1.05089164, + "epoch": 0.21902900946941228, + "flos": 23076161400960.0, + "grad_norm": 2.0352379603627684, + "language_loss": 0.71573192, + "learning_rate": 3.6367326474390928e-06, + "loss": 0.73758006, + "num_input_tokens_seen": 78549600, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3643, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01144262, + "auxiliary_loss_mlp": 0.01048332, + "balance_loss_clip": 1.03115392, + "balance_loss_mlp": 1.05041492, + "epoch": 0.21908913272208028, + "flos": 48178545004800.0, + "grad_norm": 2.9224514767679763, + "language_loss": 0.68172711, + "learning_rate": 3.6365087939756696e-06, + "loss": 0.70365304, + "num_input_tokens_seen": 78573350, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3644, + "time_per_iteration": 2.721107244491577 + }, + { + "auxiliary_loss_clip": 0.01144867, + "auxiliary_loss_mlp": 0.01041697, + "balance_loss_clip": 1.0252583, + "balance_loss_mlp": 1.04905653, + "epoch": 0.21914925597474824, + "flos": 22236749493120.0, + "grad_norm": 2.1869112310362504, + "language_loss": 0.77744782, + "learning_rate": 3.636284878455669e-06, + "loss": 0.79931343, + "num_input_tokens_seen": 78591005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9609375, + "step": 3645, + "time_per_iteration": 2.4838709831237793 + }, + { + "auxiliary_loss_clip": 0.01140139, + "auxiliary_loss_mlp": 0.01048358, + "balance_loss_clip": 1.03275371, + "balance_loss_mlp": 1.04988873, + "epoch": 0.2192093792274162, + "flos": 22125605834880.0, + "grad_norm": 1.575077237748942, + "language_loss": 0.82405865, + "learning_rate": 3.636060900887582e-06, + "loss": 0.84594363, + "num_input_tokens_seen": 78610645, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90234375, + "step": 3646, + "time_per_iteration": 2.467958927154541 + }, + { + "auxiliary_loss_clip": 0.01137932, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02050591, + "balance_loss_mlp": 1.04901123, + "epoch": 0.21926950248008417, + "flos": 15669442264320.0, + "grad_norm": 1.7225223193128734, + "language_loss": 0.83016759, + "learning_rate": 3.635836861279901e-06, + "loss": 0.85191214, + "num_input_tokens_seen": 78628340, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 3647, + "time_per_iteration": 2.4670159816741943 + }, + { + "auxiliary_loss_clip": 0.01137396, + "auxiliary_loss_mlp": 0.01046032, + "balance_loss_clip": 1.02991438, + "balance_loss_mlp": 1.04734278, + "epoch": 0.21932962573275214, + "flos": 30262496641920.0, + "grad_norm": 1.5879018059409027, + "language_loss": 0.72555232, + "learning_rate": 3.635612759641123e-06, + "loss": 0.74738657, + "num_input_tokens_seen": 78649355, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3648, + "time_per_iteration": 2.5572352409362793 + }, + { + "auxiliary_loss_clip": 0.01140287, + "auxiliary_loss_mlp": 0.01045097, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04563618, + "epoch": 0.2193897489854201, + "flos": 10780132838400.0, + "grad_norm": 2.3666125536095612, + "language_loss": 0.74363017, + "learning_rate": 3.635388595979745e-06, + "loss": 0.76548404, + "num_input_tokens_seen": 78664915, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9453125, + "step": 3649, + "time_per_iteration": 2.4465692043304443 + }, + { + "auxiliary_loss_clip": 0.01133567, + "auxiliary_loss_mlp": 0.01043993, + "balance_loss_clip": 1.02869856, + "balance_loss_mlp": 1.04609215, + "epoch": 0.21944987223808807, + "flos": 19133313390720.0, + "grad_norm": 2.0558746559562953, + "language_loss": 0.86408567, + "learning_rate": 3.635164370304267e-06, + "loss": 0.88586134, + "num_input_tokens_seen": 78681475, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3650, + "time_per_iteration": 2.4408226013183594 + }, + { + "auxiliary_loss_clip": 0.01137285, + "auxiliary_loss_mlp": 0.0104387, + "balance_loss_clip": 1.02747929, + "balance_loss_mlp": 1.04549015, + "epoch": 0.21950999549075606, + "flos": 22711093522560.0, + "grad_norm": 2.0425834927064934, + "language_loss": 0.83693743, + "learning_rate": 3.6349400826231927e-06, + "loss": 0.85874897, + "num_input_tokens_seen": 78702300, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3651, + "time_per_iteration": 2.502694845199585 + }, + { + "auxiliary_loss_clip": 0.01137563, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02941298, + "balance_loss_mlp": 1.04595184, + "epoch": 0.21957011874342403, + "flos": 10561329141120.0, + "grad_norm": 1.8702009414404626, + "language_loss": 0.74629313, + "learning_rate": 3.634715732945027e-06, + "loss": 0.76812911, + "num_input_tokens_seen": 78720230, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3652, + "time_per_iteration": 2.4422640800476074 + }, + { + "auxiliary_loss_clip": 0.01052644, + "auxiliary_loss_mlp": 0.01011234, + "balance_loss_clip": 1.00946999, + "balance_loss_mlp": 1.0194056, + "epoch": 0.219630241996092, + "flos": 65747913252480.0, + "grad_norm": 0.7344385056765022, + "language_loss": 0.51548386, + "learning_rate": 3.6344913212782764e-06, + "loss": 0.53612262, + "num_input_tokens_seen": 78780200, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.33203125, + "step": 3653, + "time_per_iteration": 3.0743935108184814 + }, + { + "auxiliary_loss_clip": 0.01142335, + "auxiliary_loss_mlp": 0.01048616, + "balance_loss_clip": 1.03215361, + "balance_loss_mlp": 1.05115473, + "epoch": 0.21969036524875996, + "flos": 23696518216320.0, + "grad_norm": 1.781801507589209, + "language_loss": 0.75256276, + "learning_rate": 3.6342668476314514e-06, + "loss": 0.77447224, + "num_input_tokens_seen": 78800575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3654, + "time_per_iteration": 2.4826300144195557 + }, + { + "auxiliary_loss_clip": 0.01143131, + "auxiliary_loss_mlp": 0.01041429, + "balance_loss_clip": 1.02499056, + "balance_loss_mlp": 1.04988194, + "epoch": 0.21975048850142792, + "flos": 19640910435840.0, + "grad_norm": 1.9986760770887892, + "language_loss": 0.72757828, + "learning_rate": 3.634042312013064e-06, + "loss": 0.74942386, + "num_input_tokens_seen": 78819585, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3655, + "time_per_iteration": 2.494662284851074 + }, + { + "auxiliary_loss_clip": 0.01139919, + "auxiliary_loss_mlp": 0.01044993, + "balance_loss_clip": 1.02860177, + "balance_loss_mlp": 1.04802227, + "epoch": 0.21981061175409589, + "flos": 22448550038400.0, + "grad_norm": 1.6963533722566047, + "language_loss": 0.80971813, + "learning_rate": 3.6338177144316276e-06, + "loss": 0.83156729, + "num_input_tokens_seen": 78837330, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 3656, + "time_per_iteration": 2.465020179748535 + }, + { + "auxiliary_loss_clip": 0.01141894, + "auxiliary_loss_mlp": 0.01039083, + "balance_loss_clip": 1.02267933, + "balance_loss_mlp": 1.05085039, + "epoch": 0.21987073500676388, + "flos": 18151049093760.0, + "grad_norm": 2.205234752003223, + "language_loss": 0.84668207, + "learning_rate": 3.63359305489566e-06, + "loss": 0.86849183, + "num_input_tokens_seen": 78854955, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3657, + "time_per_iteration": 2.4626548290252686 + }, + { + "auxiliary_loss_clip": 0.01138622, + "auxiliary_loss_mlp": 0.0103825, + "balance_loss_clip": 1.02126312, + "balance_loss_mlp": 1.0460434, + "epoch": 0.21993085825943184, + "flos": 25626177682560.0, + "grad_norm": 1.714181577212399, + "language_loss": 0.80485702, + "learning_rate": 3.6333683334136803e-06, + "loss": 0.8266257, + "num_input_tokens_seen": 78874965, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3658, + "time_per_iteration": 2.492835521697998 + }, + { + "auxiliary_loss_clip": 0.01053481, + "auxiliary_loss_mlp": 0.01002458, + "balance_loss_clip": 1.00065756, + "balance_loss_mlp": 1.02029002, + "epoch": 0.2199909815120998, + "flos": 70923217743360.0, + "grad_norm": 0.8995084923077876, + "language_loss": 0.58224851, + "learning_rate": 3.6331435499942095e-06, + "loss": 0.60280788, + "num_input_tokens_seen": 78937740, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.33203125, + "step": 3659, + "time_per_iteration": 3.1709213256835938 + }, + { + "auxiliary_loss_clip": 0.01140235, + "auxiliary_loss_mlp": 0.0103939, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.04958415, + "epoch": 0.22005110476476777, + "flos": 21543529939200.0, + "grad_norm": 2.4575828715719177, + "language_loss": 0.74535513, + "learning_rate": 3.632918704645772e-06, + "loss": 0.76715136, + "num_input_tokens_seen": 78955055, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3660, + "time_per_iteration": 2.474397897720337 + }, + { + "auxiliary_loss_clip": 0.01139013, + "auxiliary_loss_mlp": 0.01040353, + "balance_loss_clip": 1.02336597, + "balance_loss_mlp": 1.04723859, + "epoch": 0.22011122801743574, + "flos": 22054502862720.0, + "grad_norm": 2.0332694306983723, + "language_loss": 0.81225419, + "learning_rate": 3.632693797376893e-06, + "loss": 0.83404779, + "num_input_tokens_seen": 78974895, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91796875, + "step": 3661, + "time_per_iteration": 2.4926669597625732 + }, + { + "auxiliary_loss_clip": 0.01138494, + "auxiliary_loss_mlp": 0.01042707, + "balance_loss_clip": 1.02639949, + "balance_loss_mlp": 1.04773009, + "epoch": 0.2201713512701037, + "flos": 26687589598080.0, + "grad_norm": 1.8682139743879211, + "language_loss": 0.73236209, + "learning_rate": 3.632468828196102e-06, + "loss": 0.75417411, + "num_input_tokens_seen": 78994990, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 3662, + "time_per_iteration": 2.5111234188079834 + }, + { + "auxiliary_loss_clip": 0.01140855, + "auxiliary_loss_mlp": 0.01048578, + "balance_loss_clip": 1.03333092, + "balance_loss_mlp": 1.05132473, + "epoch": 0.22023147452277167, + "flos": 22162198815360.0, + "grad_norm": 1.6440107639340105, + "language_loss": 0.77800119, + "learning_rate": 3.632243797111929e-06, + "loss": 0.79989552, + "num_input_tokens_seen": 79014405, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.89453125, + "step": 3663, + "time_per_iteration": 2.485520601272583 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.01043185, + "balance_loss_clip": 1.02581656, + "balance_loss_mlp": 1.05125535, + "epoch": 0.22029159777543966, + "flos": 22523280284160.0, + "grad_norm": 3.566897500342904, + "language_loss": 0.80484056, + "learning_rate": 3.632018704132908e-06, + "loss": 0.8267172, + "num_input_tokens_seen": 79032375, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3664, + "time_per_iteration": 2.4827098846435547 + }, + { + "auxiliary_loss_clip": 0.01146334, + "auxiliary_loss_mlp": 0.01042617, + "balance_loss_clip": 1.02354348, + "balance_loss_mlp": 1.04959095, + "epoch": 0.22035172102810763, + "flos": 13042469093760.0, + "grad_norm": 2.530665000734818, + "language_loss": 0.76296824, + "learning_rate": 3.6317935492675742e-06, + "loss": 0.78485775, + "num_input_tokens_seen": 79049635, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.96875, + "step": 3665, + "time_per_iteration": 2.5118229389190674 + }, + { + "auxiliary_loss_clip": 0.01139389, + "auxiliary_loss_mlp": 0.01044667, + "balance_loss_clip": 1.0282042, + "balance_loss_mlp": 1.04779172, + "epoch": 0.2204118442807756, + "flos": 12165817760640.0, + "grad_norm": 2.7337119989610468, + "language_loss": 0.97959125, + "learning_rate": 3.631568332524466e-06, + "loss": 1.00143182, + "num_input_tokens_seen": 79062890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 3666, + "time_per_iteration": 2.4461512565612793 + }, + { + "auxiliary_loss_clip": 0.01136729, + "auxiliary_loss_mlp": 0.01039342, + "balance_loss_clip": 1.02241421, + "balance_loss_mlp": 1.04582953, + "epoch": 0.22047196753344356, + "flos": 40108806673920.0, + "grad_norm": 2.115803047817727, + "language_loss": 0.80494016, + "learning_rate": 3.631343053912122e-06, + "loss": 0.82670087, + "num_input_tokens_seen": 79085495, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90625, + "step": 3667, + "time_per_iteration": 2.65198016166687 + }, + { + "auxiliary_loss_clip": 0.01144733, + "auxiliary_loss_mlp": 0.01046593, + "balance_loss_clip": 1.02776945, + "balance_loss_mlp": 1.04882097, + "epoch": 0.22053209078611152, + "flos": 20701137202560.0, + "grad_norm": 1.916720089378095, + "language_loss": 0.77463895, + "learning_rate": 3.631117713439087e-06, + "loss": 0.79655218, + "num_input_tokens_seen": 79101820, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9609375, + "step": 3668, + "time_per_iteration": 2.459141254425049 + }, + { + "auxiliary_loss_clip": 0.0114207, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02568614, + "balance_loss_mlp": 1.05058837, + "epoch": 0.2205922140387795, + "flos": 24716309247360.0, + "grad_norm": 1.730318389149699, + "language_loss": 0.71514869, + "learning_rate": 3.630892311113904e-06, + "loss": 0.73699689, + "num_input_tokens_seen": 79123320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3669, + "time_per_iteration": 2.550732135772705 + }, + { + "auxiliary_loss_clip": 0.01139227, + "auxiliary_loss_mlp": 0.01037839, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.04615474, + "epoch": 0.22065233729144745, + "flos": 23477247642240.0, + "grad_norm": 2.0994504177928826, + "language_loss": 0.85294032, + "learning_rate": 3.6306668469451215e-06, + "loss": 0.87471098, + "num_input_tokens_seen": 79141615, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3670, + "time_per_iteration": 2.4727606773376465 + }, + { + "auxiliary_loss_clip": 0.01147385, + "auxiliary_loss_mlp": 0.01040938, + "balance_loss_clip": 1.02360499, + "balance_loss_mlp": 1.05130565, + "epoch": 0.22071246054411545, + "flos": 35225566646400.0, + "grad_norm": 1.775856591734502, + "language_loss": 0.76796275, + "learning_rate": 3.6304413209412886e-06, + "loss": 0.789846, + "num_input_tokens_seen": 79164910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9609375, + "step": 3671, + "time_per_iteration": 2.613104820251465 + }, + { + "auxiliary_loss_clip": 0.01140966, + "auxiliary_loss_mlp": 0.01034463, + "balance_loss_clip": 1.01758265, + "balance_loss_mlp": 1.0487864, + "epoch": 0.2207725837967834, + "flos": 18150294908160.0, + "grad_norm": 2.8820912362302202, + "language_loss": 0.80472648, + "learning_rate": 3.6302157331109573e-06, + "loss": 0.82648075, + "num_input_tokens_seen": 79179685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3672, + "time_per_iteration": 2.4365992546081543 + }, + { + "auxiliary_loss_clip": 0.01144646, + "auxiliary_loss_mlp": 0.01047711, + "balance_loss_clip": 1.03129566, + "balance_loss_mlp": 1.05145025, + "epoch": 0.22083270704945138, + "flos": 20479675898880.0, + "grad_norm": 1.8912849075471436, + "language_loss": 0.736193, + "learning_rate": 3.629990083462682e-06, + "loss": 0.75811654, + "num_input_tokens_seen": 79196285, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3673, + "time_per_iteration": 2.4908931255340576 + }, + { + "auxiliary_loss_clip": 0.01145514, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02064395, + "balance_loss_mlp": 1.05221379, + "epoch": 0.22089283030211934, + "flos": 34125801984000.0, + "grad_norm": 1.9375944290288487, + "language_loss": 0.76505005, + "learning_rate": 3.6297643720050203e-06, + "loss": 0.78688282, + "num_input_tokens_seen": 79216060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3674, + "time_per_iteration": 2.569312572479248 + }, + { + "auxiliary_loss_clip": 0.01142786, + "auxiliary_loss_mlp": 0.01043506, + "balance_loss_clip": 1.02518344, + "balance_loss_mlp": 1.05025005, + "epoch": 0.2209529535547873, + "flos": 18077216688000.0, + "grad_norm": 2.0287396146216055, + "language_loss": 0.74786556, + "learning_rate": 3.6295385987465293e-06, + "loss": 0.76972854, + "num_input_tokens_seen": 79235145, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.92578125, + "step": 3675, + "time_per_iteration": 2.4762706756591797 + }, + { + "auxiliary_loss_clip": 0.01141021, + "auxiliary_loss_mlp": 0.01040878, + "balance_loss_clip": 1.02395034, + "balance_loss_mlp": 1.0473659, + "epoch": 0.22101307680745527, + "flos": 27235335070080.0, + "grad_norm": 1.7527405009289938, + "language_loss": 0.80050498, + "learning_rate": 3.629312763695772e-06, + "loss": 0.82232398, + "num_input_tokens_seen": 79256960, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3676, + "time_per_iteration": 2.5846786499023438 + }, + { + "auxiliary_loss_clip": 0.0114147, + "auxiliary_loss_mlp": 0.01047315, + "balance_loss_clip": 1.03106666, + "balance_loss_mlp": 1.0474596, + "epoch": 0.22107320006012326, + "flos": 16543256423040.0, + "grad_norm": 1.974355382670518, + "language_loss": 0.75501895, + "learning_rate": 3.6290868668613107e-06, + "loss": 0.77690685, + "num_input_tokens_seen": 79274860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.94140625, + "step": 3677, + "time_per_iteration": 4.02753758430481 + }, + { + "auxiliary_loss_clip": 0.01135837, + "auxiliary_loss_mlp": 0.01041033, + "balance_loss_clip": 1.02455878, + "balance_loss_mlp": 1.0449332, + "epoch": 0.22113332331279123, + "flos": 22054466949120.0, + "grad_norm": 2.0397766719275494, + "language_loss": 0.83412457, + "learning_rate": 3.628860908251712e-06, + "loss": 0.85589325, + "num_input_tokens_seen": 79294005, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3678, + "time_per_iteration": 3.9455032348632812 + }, + { + "auxiliary_loss_clip": 0.01140751, + "auxiliary_loss_mlp": 0.01046282, + "balance_loss_clip": 1.02903211, + "balance_loss_mlp": 1.04866314, + "epoch": 0.2211934465654592, + "flos": 26612787525120.0, + "grad_norm": 1.7724652071984504, + "language_loss": 0.89272189, + "learning_rate": 3.6286348878755452e-06, + "loss": 0.91459215, + "num_input_tokens_seen": 79314005, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3679, + "time_per_iteration": 2.548166036605835 + }, + { + "auxiliary_loss_clip": 0.01142658, + "auxiliary_loss_mlp": 0.01053161, + "balance_loss_clip": 1.03517246, + "balance_loss_mlp": 1.04887235, + "epoch": 0.22125356981812716, + "flos": 16360363347840.0, + "grad_norm": 2.4577897330130773, + "language_loss": 0.86718571, + "learning_rate": 3.6284088057413803e-06, + "loss": 0.88914388, + "num_input_tokens_seen": 79331030, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3680, + "time_per_iteration": 2.468712329864502 + }, + { + "auxiliary_loss_clip": 0.0114123, + "auxiliary_loss_mlp": 0.01044656, + "balance_loss_clip": 1.02809739, + "balance_loss_mlp": 1.05175805, + "epoch": 0.22131369307079513, + "flos": 21651118151040.0, + "grad_norm": 2.0752123015423556, + "language_loss": 0.81897914, + "learning_rate": 3.6281826618577894e-06, + "loss": 0.84083802, + "num_input_tokens_seen": 79348560, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3681, + "time_per_iteration": 2.532210350036621 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.0103672, + "balance_loss_clip": 1.02076972, + "balance_loss_mlp": 1.04784071, + "epoch": 0.2213738163234631, + "flos": 19609524927360.0, + "grad_norm": 2.44274183004677, + "language_loss": 0.79908317, + "learning_rate": 3.62795645623335e-06, + "loss": 0.82081306, + "num_input_tokens_seen": 79367175, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 3682, + "time_per_iteration": 2.491135358810425 + }, + { + "auxiliary_loss_clip": 0.01140313, + "auxiliary_loss_mlp": 0.01042047, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04739022, + "epoch": 0.22143393957613106, + "flos": 23623404082560.0, + "grad_norm": 2.2064811404605376, + "language_loss": 0.77283889, + "learning_rate": 3.627730188876638e-06, + "loss": 0.79466248, + "num_input_tokens_seen": 79388435, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 3683, + "time_per_iteration": 2.503041982650757 + }, + { + "auxiliary_loss_clip": 0.01141417, + "auxiliary_loss_mlp": 0.01045647, + "balance_loss_clip": 1.02824235, + "balance_loss_mlp": 1.04623342, + "epoch": 0.22149406282879905, + "flos": 26177801823360.0, + "grad_norm": 2.114071962716483, + "language_loss": 0.72779894, + "learning_rate": 3.627503859796234e-06, + "loss": 0.74966961, + "num_input_tokens_seen": 79407910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3684, + "time_per_iteration": 2.521495819091797 + }, + { + "auxiliary_loss_clip": 0.01142849, + "auxiliary_loss_mlp": 0.01043772, + "balance_loss_clip": 1.02598643, + "balance_loss_mlp": 1.05060613, + "epoch": 0.221554186081467, + "flos": 14538758970240.0, + "grad_norm": 1.9389187138945425, + "language_loss": 0.80108052, + "learning_rate": 3.6272774690007207e-06, + "loss": 0.82294679, + "num_input_tokens_seen": 79424020, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.921875, + "step": 3685, + "time_per_iteration": 2.436958074569702 + }, + { + "auxiliary_loss_clip": 0.01135153, + "auxiliary_loss_mlp": 0.01040139, + "balance_loss_clip": 1.02504683, + "balance_loss_mlp": 1.04634571, + "epoch": 0.22161430933413498, + "flos": 22238257864320.0, + "grad_norm": 1.5568750132404718, + "language_loss": 0.87128556, + "learning_rate": 3.6270510164986823e-06, + "loss": 0.89303845, + "num_input_tokens_seen": 79445605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 3686, + "time_per_iteration": 2.5519070625305176 + }, + { + "auxiliary_loss_clip": 0.01138026, + "auxiliary_loss_mlp": 0.01042632, + "balance_loss_clip": 1.02552581, + "balance_loss_mlp": 1.04762685, + "epoch": 0.22167443258680294, + "flos": 23476529370240.0, + "grad_norm": 1.942015126167962, + "language_loss": 0.77953136, + "learning_rate": 3.626824502298707e-06, + "loss": 0.8013379, + "num_input_tokens_seen": 79463850, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3687, + "time_per_iteration": 2.495084285736084 + }, + { + "auxiliary_loss_clip": 0.01146436, + "auxiliary_loss_mlp": 0.01048705, + "balance_loss_clip": 1.03085971, + "balance_loss_mlp": 1.05057812, + "epoch": 0.2217345558394709, + "flos": 23221132692480.0, + "grad_norm": 1.8313314390802422, + "language_loss": 0.84722549, + "learning_rate": 3.626597926409383e-06, + "loss": 0.86917698, + "num_input_tokens_seen": 79482845, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3688, + "time_per_iteration": 2.5029165744781494 + }, + { + "auxiliary_loss_clip": 0.01146721, + "auxiliary_loss_mlp": 0.0104649, + "balance_loss_clip": 1.02897787, + "balance_loss_mlp": 1.05005932, + "epoch": 0.22179467909213887, + "flos": 20011078045440.0, + "grad_norm": 2.7913489877281905, + "language_loss": 0.81395769, + "learning_rate": 3.6263712888393027e-06, + "loss": 0.83588976, + "num_input_tokens_seen": 79501550, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.96484375, + "step": 3689, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.0114216, + "auxiliary_loss_mlp": 0.01044991, + "balance_loss_clip": 1.02758622, + "balance_loss_mlp": 1.04985952, + "epoch": 0.22185480234480687, + "flos": 19683034110720.0, + "grad_norm": 2.5504206662352082, + "language_loss": 0.70040542, + "learning_rate": 3.626144589597061e-06, + "loss": 0.72227693, + "num_input_tokens_seen": 79519680, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3690, + "time_per_iteration": 2.5005807876586914 + }, + { + "auxiliary_loss_clip": 0.01145048, + "auxiliary_loss_mlp": 0.01038313, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.04890513, + "epoch": 0.22191492559747483, + "flos": 21981316901760.0, + "grad_norm": 1.7318147752747124, + "language_loss": 0.72394359, + "learning_rate": 3.6259178286912528e-06, + "loss": 0.74577713, + "num_input_tokens_seen": 79539000, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9609375, + "step": 3691, + "time_per_iteration": 2.4835989475250244 + }, + { + "auxiliary_loss_clip": 0.01145815, + "auxiliary_loss_mlp": 0.01049746, + "balance_loss_clip": 1.03169739, + "balance_loss_mlp": 1.05317688, + "epoch": 0.2219750488501428, + "flos": 23222066446080.0, + "grad_norm": 2.1843836481793057, + "language_loss": 0.71611524, + "learning_rate": 3.625691006130477e-06, + "loss": 0.73807085, + "num_input_tokens_seen": 79559695, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.92578125, + "step": 3692, + "time_per_iteration": 2.515230655670166 + }, + { + "auxiliary_loss_clip": 0.01146831, + "auxiliary_loss_mlp": 0.01044658, + "balance_loss_clip": 1.02750337, + "balance_loss_mlp": 1.05008483, + "epoch": 0.22203517210281076, + "flos": 22453685683200.0, + "grad_norm": 2.7650002202849113, + "language_loss": 0.87580657, + "learning_rate": 3.6254641219233362e-06, + "loss": 0.89772147, + "num_input_tokens_seen": 79579095, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.96875, + "step": 3693, + "time_per_iteration": 2.5013554096221924 + }, + { + "auxiliary_loss_clip": 0.01138596, + "auxiliary_loss_mlp": 0.010363, + "balance_loss_clip": 1.02086258, + "balance_loss_mlp": 1.04947054, + "epoch": 0.22209529535547873, + "flos": 17564555825280.0, + "grad_norm": 3.031177285152565, + "language_loss": 0.85307622, + "learning_rate": 3.6252371760784325e-06, + "loss": 0.87482512, + "num_input_tokens_seen": 79596430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.890625, + "step": 3694, + "time_per_iteration": 2.4828481674194336 + }, + { + "auxiliary_loss_clip": 0.01147368, + "auxiliary_loss_mlp": 0.01041631, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.0491637, + "epoch": 0.2221554186081467, + "flos": 21469015175040.0, + "grad_norm": 1.9517253418741858, + "language_loss": 0.69055748, + "learning_rate": 3.6250101686043725e-06, + "loss": 0.71244752, + "num_input_tokens_seen": 79615825, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.984375, + "step": 3695, + "time_per_iteration": 2.49957537651062 + }, + { + "auxiliary_loss_clip": 0.01141491, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02438951, + "balance_loss_mlp": 1.05095696, + "epoch": 0.22221554186081466, + "flos": 27673445255040.0, + "grad_norm": 1.4867456423055678, + "language_loss": 0.71710318, + "learning_rate": 3.6247830995097637e-06, + "loss": 0.73891842, + "num_input_tokens_seen": 79637875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 3696, + "time_per_iteration": 2.5991299152374268 + }, + { + "auxiliary_loss_clip": 0.01140811, + "auxiliary_loss_mlp": 0.01040962, + "balance_loss_clip": 1.02387977, + "balance_loss_mlp": 1.0483942, + "epoch": 0.22227566511348265, + "flos": 25958926298880.0, + "grad_norm": 1.901791440824732, + "language_loss": 0.87694812, + "learning_rate": 3.624555968803217e-06, + "loss": 0.8987658, + "num_input_tokens_seen": 79656970, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3697, + "time_per_iteration": 2.524841547012329 + }, + { + "auxiliary_loss_clip": 0.01134138, + "auxiliary_loss_mlp": 0.01045972, + "balance_loss_clip": 1.03020072, + "balance_loss_mlp": 1.04646909, + "epoch": 0.22233578836615062, + "flos": 39203678833920.0, + "grad_norm": 1.985465494359005, + "language_loss": 0.66109681, + "learning_rate": 3.624328776493346e-06, + "loss": 0.68289793, + "num_input_tokens_seen": 79680275, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3698, + "time_per_iteration": 2.6806552410125732 + }, + { + "auxiliary_loss_clip": 0.01143188, + "auxiliary_loss_mlp": 0.01038878, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.049245, + "epoch": 0.22239591161881858, + "flos": 36283782251520.0, + "grad_norm": 1.9701476357110561, + "language_loss": 0.82699466, + "learning_rate": 3.6241015225887637e-06, + "loss": 0.84881532, + "num_input_tokens_seen": 79701255, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9375, + "step": 3699, + "time_per_iteration": 2.620795965194702 + }, + { + "auxiliary_loss_clip": 0.01141189, + "auxiliary_loss_mlp": 0.01044961, + "balance_loss_clip": 1.02789021, + "balance_loss_mlp": 1.04960978, + "epoch": 0.22245603487148655, + "flos": 19719591177600.0, + "grad_norm": 1.6593732889446324, + "language_loss": 0.79488564, + "learning_rate": 3.62387420709809e-06, + "loss": 0.81674713, + "num_input_tokens_seen": 79721315, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3700, + "time_per_iteration": 2.4886739253997803 + }, + { + "auxiliary_loss_clip": 0.01148421, + "auxiliary_loss_mlp": 0.01045024, + "balance_loss_clip": 1.02639139, + "balance_loss_mlp": 1.05154204, + "epoch": 0.2225161581241545, + "flos": 46280450615040.0, + "grad_norm": 7.082418544009014, + "language_loss": 0.72063768, + "learning_rate": 3.623646830029943e-06, + "loss": 0.74257213, + "num_input_tokens_seen": 79742705, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.96875, + "step": 3701, + "time_per_iteration": 2.7293899059295654 + }, + { + "auxiliary_loss_clip": 0.01139069, + "auxiliary_loss_mlp": 0.0104219, + "balance_loss_clip": 1.02520323, + "balance_loss_mlp": 1.04706395, + "epoch": 0.22257628137682248, + "flos": 23696194993920.0, + "grad_norm": 1.9269634413479926, + "language_loss": 0.79704928, + "learning_rate": 3.6234193913929454e-06, + "loss": 0.81886196, + "num_input_tokens_seen": 79763000, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3702, + "time_per_iteration": 2.5527849197387695 + }, + { + "auxiliary_loss_clip": 0.01132932, + "auxiliary_loss_mlp": 0.0104181, + "balance_loss_clip": 1.02487028, + "balance_loss_mlp": 1.04518211, + "epoch": 0.22263640462949044, + "flos": 19353984595200.0, + "grad_norm": 2.7410709876553447, + "language_loss": 0.78632712, + "learning_rate": 3.623191891195723e-06, + "loss": 0.80807453, + "num_input_tokens_seen": 79781335, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 3703, + "time_per_iteration": 2.4955005645751953 + }, + { + "auxiliary_loss_clip": 0.01140692, + "auxiliary_loss_mlp": 0.01037524, + "balance_loss_clip": 1.01810527, + "balance_loss_mlp": 1.0468421, + "epoch": 0.22269652788215843, + "flos": 20776047016320.0, + "grad_norm": 1.8479834568020117, + "language_loss": 0.74212444, + "learning_rate": 3.6229643294469005e-06, + "loss": 0.7639066, + "num_input_tokens_seen": 79800150, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.9375, + "step": 3704, + "time_per_iteration": 2.5000903606414795 + }, + { + "auxiliary_loss_clip": 0.0113847, + "auxiliary_loss_mlp": 0.01042668, + "balance_loss_clip": 1.02618146, + "balance_loss_mlp": 1.05030012, + "epoch": 0.2227566511348264, + "flos": 47958843467520.0, + "grad_norm": 1.7361108874663713, + "language_loss": 0.64372134, + "learning_rate": 3.6227367061551074e-06, + "loss": 0.66553271, + "num_input_tokens_seen": 79822390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3705, + "time_per_iteration": 2.6993744373321533 + }, + { + "auxiliary_loss_clip": 0.01064369, + "auxiliary_loss_mlp": 0.01006302, + "balance_loss_clip": 1.00454926, + "balance_loss_mlp": 1.03098035, + "epoch": 0.22281677438749437, + "flos": 66218953230720.0, + "grad_norm": 1.353184132187748, + "language_loss": 0.65301311, + "learning_rate": 3.6225090213289766e-06, + "loss": 0.67371976, + "num_input_tokens_seen": 79873350, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.33398438, + "step": 3706, + "time_per_iteration": 2.9832844734191895 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01036629, + "balance_loss_clip": 1.02076256, + "balance_loss_mlp": 1.0461061, + "epoch": 0.22287689764016233, + "flos": 21871609787520.0, + "grad_norm": 3.09427451037038, + "language_loss": 0.80608439, + "learning_rate": 3.622281274977141e-06, + "loss": 0.82783049, + "num_input_tokens_seen": 79891715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91796875, + "step": 3707, + "time_per_iteration": 2.5236454010009766 + }, + { + "auxiliary_loss_clip": 0.01139003, + "auxiliary_loss_mlp": 0.01038491, + "balance_loss_clip": 1.02184916, + "balance_loss_mlp": 1.04706407, + "epoch": 0.2229370208928303, + "flos": 27672475587840.0, + "grad_norm": 2.0318896185848057, + "language_loss": 0.78124011, + "learning_rate": 3.6220534671082367e-06, + "loss": 0.80301505, + "num_input_tokens_seen": 79911175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3708, + "time_per_iteration": 2.5254104137420654 + }, + { + "auxiliary_loss_clip": 0.01142891, + "auxiliary_loss_mlp": 0.01039636, + "balance_loss_clip": 1.02291107, + "balance_loss_mlp": 1.04897153, + "epoch": 0.22299714414549826, + "flos": 30154657034880.0, + "grad_norm": 1.913582269302705, + "language_loss": 0.79989487, + "learning_rate": 3.6218255977309024e-06, + "loss": 0.82172012, + "num_input_tokens_seen": 79931875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3709, + "time_per_iteration": 2.5528371334075928 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.02913201, + "balance_loss_mlp": 1.04580092, + "epoch": 0.22305726739816625, + "flos": 23143134309120.0, + "grad_norm": 2.062693768306912, + "language_loss": 0.68752408, + "learning_rate": 3.6215976668537787e-06, + "loss": 0.70937693, + "num_input_tokens_seen": 79952445, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3710, + "time_per_iteration": 2.511275053024292 + }, + { + "auxiliary_loss_clip": 0.01144244, + "auxiliary_loss_mlp": 0.01039149, + "balance_loss_clip": 1.0221858, + "balance_loss_mlp": 1.04812646, + "epoch": 0.22311739065083422, + "flos": 19172061187200.0, + "grad_norm": 2.3083581079415216, + "language_loss": 0.90696692, + "learning_rate": 3.6213696744855096e-06, + "loss": 0.92880082, + "num_input_tokens_seen": 79971030, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9609375, + "step": 3711, + "time_per_iteration": 2.4757487773895264 + }, + { + "auxiliary_loss_clip": 0.01138091, + "auxiliary_loss_mlp": 0.01051989, + "balance_loss_clip": 1.03406, + "balance_loss_mlp": 1.04603434, + "epoch": 0.22317751390350218, + "flos": 13617757319040.0, + "grad_norm": 2.758927620438821, + "language_loss": 0.89628232, + "learning_rate": 3.6211416206347395e-06, + "loss": 0.91818309, + "num_input_tokens_seen": 79982085, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.921875, + "step": 3712, + "time_per_iteration": 2.3870105743408203 + }, + { + "auxiliary_loss_clip": 0.01139482, + "auxiliary_loss_mlp": 0.01051487, + "balance_loss_clip": 1.03356993, + "balance_loss_mlp": 1.04956841, + "epoch": 0.22323763715617015, + "flos": 11029065068160.0, + "grad_norm": 3.039950461935961, + "language_loss": 0.74859631, + "learning_rate": 3.620913505310117e-06, + "loss": 0.77050602, + "num_input_tokens_seen": 79997460, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.8984375, + "step": 3713, + "time_per_iteration": 2.4336304664611816 + }, + { + "auxiliary_loss_clip": 0.01138793, + "auxiliary_loss_mlp": 0.01041826, + "balance_loss_clip": 1.02543497, + "balance_loss_mlp": 1.048329, + "epoch": 0.22329776040883811, + "flos": 41351531466240.0, + "grad_norm": 1.8221921578975473, + "language_loss": 0.62592143, + "learning_rate": 3.6206853285202917e-06, + "loss": 0.64772761, + "num_input_tokens_seen": 80022450, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3714, + "time_per_iteration": 2.6230995655059814 + }, + { + "auxiliary_loss_clip": 0.01139199, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.04734552, + "epoch": 0.22335788366150608, + "flos": 25119478477440.0, + "grad_norm": 1.9329837891440178, + "language_loss": 0.79052407, + "learning_rate": 3.6204570902739164e-06, + "loss": 0.81228578, + "num_input_tokens_seen": 80042100, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 3715, + "time_per_iteration": 2.510436534881592 + }, + { + "auxiliary_loss_clip": 0.011421, + "auxiliary_loss_mlp": 0.01050674, + "balance_loss_clip": 1.03372216, + "balance_loss_mlp": 1.05021942, + "epoch": 0.22341800691417404, + "flos": 16983377769600.0, + "grad_norm": 1.6633570096565886, + "language_loss": 0.77182817, + "learning_rate": 3.620228790579645e-06, + "loss": 0.79375589, + "num_input_tokens_seen": 80059690, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.921875, + "step": 3716, + "time_per_iteration": 2.4398605823516846 + }, + { + "auxiliary_loss_clip": 0.01141179, + "auxiliary_loss_mlp": 0.01047022, + "balance_loss_clip": 1.03046429, + "balance_loss_mlp": 1.04845762, + "epoch": 0.22347813016684204, + "flos": 14136738975360.0, + "grad_norm": 2.028714583879474, + "language_loss": 0.79209757, + "learning_rate": 3.6200004294461367e-06, + "loss": 0.81397963, + "num_input_tokens_seen": 80076060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3717, + "time_per_iteration": 2.456042766571045 + }, + { + "auxiliary_loss_clip": 0.01143546, + "auxiliary_loss_mlp": 0.01041127, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.04934192, + "epoch": 0.22353825341951, + "flos": 23583147914880.0, + "grad_norm": 2.2103373086531115, + "language_loss": 0.68029571, + "learning_rate": 3.6197720068820497e-06, + "loss": 0.70214242, + "num_input_tokens_seen": 80094760, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3718, + "time_per_iteration": 2.4818973541259766 + }, + { + "auxiliary_loss_clip": 0.01142458, + "auxiliary_loss_mlp": 0.01038613, + "balance_loss_clip": 1.02067208, + "balance_loss_mlp": 1.04784536, + "epoch": 0.22359837667217797, + "flos": 29824206888960.0, + "grad_norm": 1.9912565029374794, + "language_loss": 0.80194163, + "learning_rate": 3.619543522896045e-06, + "loss": 0.8237524, + "num_input_tokens_seen": 80114475, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9453125, + "step": 3719, + "time_per_iteration": 3.985903263092041 + }, + { + "auxiliary_loss_clip": 0.01145808, + "auxiliary_loss_mlp": 0.01052597, + "balance_loss_clip": 1.03396416, + "balance_loss_mlp": 1.04785836, + "epoch": 0.22365849992484593, + "flos": 17603088140160.0, + "grad_norm": 2.0930960597239707, + "language_loss": 0.86421579, + "learning_rate": 3.6193149774967885e-06, + "loss": 0.88619983, + "num_input_tokens_seen": 80132920, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.98046875, + "step": 3720, + "time_per_iteration": 3.914626359939575 + }, + { + "auxiliary_loss_clip": 0.0114136, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.01682639, + "balance_loss_mlp": 1.05105066, + "epoch": 0.2237186231775139, + "flos": 22710949868160.0, + "grad_norm": 1.6398614781610892, + "language_loss": 0.74860299, + "learning_rate": 3.619086370692945e-06, + "loss": 0.77035284, + "num_input_tokens_seen": 80152845, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 3721, + "time_per_iteration": 2.485271453857422 + }, + { + "auxiliary_loss_clip": 0.011451, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.0256865, + "balance_loss_mlp": 1.0494988, + "epoch": 0.22377874643018186, + "flos": 13371518609280.0, + "grad_norm": 2.928465692067959, + "language_loss": 0.78943181, + "learning_rate": 3.6188577024931844e-06, + "loss": 0.81131673, + "num_input_tokens_seen": 80170680, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.953125, + "step": 3722, + "time_per_iteration": 2.471928834915161 + }, + { + "auxiliary_loss_clip": 0.01140042, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02551126, + "balance_loss_mlp": 1.05004597, + "epoch": 0.22383886968284986, + "flos": 17894970057600.0, + "grad_norm": 2.2482737248582247, + "language_loss": 0.82315016, + "learning_rate": 3.618628972906178e-06, + "loss": 0.84496701, + "num_input_tokens_seen": 80189030, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3723, + "time_per_iteration": 2.4540791511535645 + }, + { + "auxiliary_loss_clip": 0.01144828, + "auxiliary_loss_mlp": 0.01044673, + "balance_loss_clip": 1.02729177, + "balance_loss_mlp": 1.05062389, + "epoch": 0.22389899293551782, + "flos": 23879123982720.0, + "grad_norm": 2.154682666342997, + "language_loss": 0.84433442, + "learning_rate": 3.6184001819405984e-06, + "loss": 0.86622941, + "num_input_tokens_seen": 80208365, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94140625, + "step": 3724, + "time_per_iteration": 2.526204824447632 + }, + { + "auxiliary_loss_clip": 0.0114043, + "auxiliary_loss_mlp": 0.010395, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.04889762, + "epoch": 0.2239591161881858, + "flos": 27272430840960.0, + "grad_norm": 2.178002887638817, + "language_loss": 0.79036546, + "learning_rate": 3.618171329605121e-06, + "loss": 0.81216478, + "num_input_tokens_seen": 80228685, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9140625, + "step": 3725, + "time_per_iteration": 2.513136625289917 + }, + { + "auxiliary_loss_clip": 0.01139478, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02271581, + "balance_loss_mlp": 1.04898071, + "epoch": 0.22401923944085375, + "flos": 22236857233920.0, + "grad_norm": 1.6732241790302085, + "language_loss": 0.77158499, + "learning_rate": 3.6179424159084254e-06, + "loss": 0.79337394, + "num_input_tokens_seen": 80247635, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90625, + "step": 3726, + "time_per_iteration": 2.5645246505737305 + }, + { + "auxiliary_loss_clip": 0.01150164, + "auxiliary_loss_mlp": 0.01045662, + "balance_loss_clip": 1.02677917, + "balance_loss_mlp": 1.05054045, + "epoch": 0.22407936269352172, + "flos": 12053668521600.0, + "grad_norm": 2.7042555627132296, + "language_loss": 0.72376108, + "learning_rate": 3.6177134408591914e-06, + "loss": 0.74571931, + "num_input_tokens_seen": 80260045, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.99609375, + "step": 3727, + "time_per_iteration": 2.4437429904937744 + }, + { + "auxiliary_loss_clip": 0.0114439, + "auxiliary_loss_mlp": 0.01040468, + "balance_loss_clip": 1.02140689, + "balance_loss_mlp": 1.04682648, + "epoch": 0.22413948594618968, + "flos": 19353553632000.0, + "grad_norm": 2.2876633759350327, + "language_loss": 0.86584771, + "learning_rate": 3.6174844044661013e-06, + "loss": 0.88769633, + "num_input_tokens_seen": 80277680, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9765625, + "step": 3728, + "time_per_iteration": 2.496020793914795 + }, + { + "auxiliary_loss_clip": 0.01143576, + "auxiliary_loss_mlp": 0.0104763, + "balance_loss_clip": 1.02838981, + "balance_loss_mlp": 1.05045211, + "epoch": 0.22419960919885765, + "flos": 24170000319360.0, + "grad_norm": 2.0817566504616734, + "language_loss": 0.80479026, + "learning_rate": 3.6172553067378406e-06, + "loss": 0.82670236, + "num_input_tokens_seen": 80294795, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9296875, + "step": 3729, + "time_per_iteration": 2.4733448028564453 + }, + { + "auxiliary_loss_clip": 0.01136706, + "auxiliary_loss_mlp": 0.01046287, + "balance_loss_clip": 1.03019357, + "balance_loss_mlp": 1.04672551, + "epoch": 0.22425973245152564, + "flos": 27378977558400.0, + "grad_norm": 2.3054621640206205, + "language_loss": 0.86468041, + "learning_rate": 3.6170261476830964e-06, + "loss": 0.88651037, + "num_input_tokens_seen": 80315425, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3730, + "time_per_iteration": 2.5348362922668457 + }, + { + "auxiliary_loss_clip": 0.01136756, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.01917958, + "balance_loss_mlp": 1.04737782, + "epoch": 0.2243198557041936, + "flos": 13735652734080.0, + "grad_norm": 1.75673058423422, + "language_loss": 0.73293322, + "learning_rate": 3.616796927310559e-06, + "loss": 0.75465709, + "num_input_tokens_seen": 80333905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 3731, + "time_per_iteration": 2.4397478103637695 + }, + { + "auxiliary_loss_clip": 0.01141304, + "auxiliary_loss_mlp": 0.0104035, + "balance_loss_clip": 1.02370882, + "balance_loss_mlp": 1.04893279, + "epoch": 0.22437997895686157, + "flos": 19530700531200.0, + "grad_norm": 2.4044438539905575, + "language_loss": 0.75237334, + "learning_rate": 3.6165676456289195e-06, + "loss": 0.77418989, + "num_input_tokens_seen": 80352165, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3732, + "time_per_iteration": 2.476630926132202 + }, + { + "auxiliary_loss_clip": 0.01141784, + "auxiliary_loss_mlp": 0.01058138, + "balance_loss_clip": 1.04106712, + "balance_loss_mlp": 1.0494858, + "epoch": 0.22444010220952954, + "flos": 23696230907520.0, + "grad_norm": 1.8584104659795708, + "language_loss": 0.88037199, + "learning_rate": 3.616338302646873e-06, + "loss": 0.90237123, + "num_input_tokens_seen": 80371305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 3733, + "time_per_iteration": 2.4723222255706787 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01042602, + "balance_loss_clip": 1.02473271, + "balance_loss_mlp": 1.04564941, + "epoch": 0.2245002254621975, + "flos": 22382905933440.0, + "grad_norm": 1.6767676579772364, + "language_loss": 0.84200239, + "learning_rate": 3.6161088983731166e-06, + "loss": 0.86380494, + "num_input_tokens_seen": 80391020, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.921875, + "step": 3734, + "time_per_iteration": 2.5214619636535645 + }, + { + "auxiliary_loss_clip": 0.01143902, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03170574, + "balance_loss_mlp": 1.0513525, + "epoch": 0.22456034871486547, + "flos": 26942303917440.0, + "grad_norm": 1.6368426378189131, + "language_loss": 0.76838279, + "learning_rate": 3.6158794328163482e-06, + "loss": 0.79030693, + "num_input_tokens_seen": 80411365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3735, + "time_per_iteration": 2.5025858879089355 + }, + { + "auxiliary_loss_clip": 0.01134798, + "auxiliary_loss_mlp": 0.01047796, + "balance_loss_clip": 1.032215, + "balance_loss_mlp": 1.04791164, + "epoch": 0.22462047196753343, + "flos": 28983538005120.0, + "grad_norm": 3.6998773026048046, + "language_loss": 0.84505916, + "learning_rate": 3.6156499059852702e-06, + "loss": 0.86688507, + "num_input_tokens_seen": 80431075, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 3736, + "time_per_iteration": 2.581409454345703 + }, + { + "auxiliary_loss_clip": 0.0114079, + "auxiliary_loss_mlp": 0.01039504, + "balance_loss_clip": 1.02306545, + "balance_loss_mlp": 1.04848719, + "epoch": 0.22468059522020142, + "flos": 20011329440640.0, + "grad_norm": 2.2208030259376192, + "language_loss": 0.86398852, + "learning_rate": 3.615420317888586e-06, + "loss": 0.88579136, + "num_input_tokens_seen": 80449240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.92578125, + "step": 3737, + "time_per_iteration": 2.4498212337493896 + }, + { + "auxiliary_loss_clip": 0.01141365, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.02917397, + "balance_loss_mlp": 1.0476644, + "epoch": 0.2247407184728694, + "flos": 29314239546240.0, + "grad_norm": 2.434824168439142, + "language_loss": 0.79145718, + "learning_rate": 3.6151906685350006e-06, + "loss": 0.81334245, + "num_input_tokens_seen": 80467900, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3738, + "time_per_iteration": 2.5505504608154297 + }, + { + "auxiliary_loss_clip": 0.01140019, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02564526, + "balance_loss_mlp": 1.0471611, + "epoch": 0.22480084172553735, + "flos": 22310366417280.0, + "grad_norm": 2.2711438439691314, + "language_loss": 0.75895345, + "learning_rate": 3.614960957933224e-06, + "loss": 0.78076756, + "num_input_tokens_seen": 80487100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3739, + "time_per_iteration": 2.458307981491089 + }, + { + "auxiliary_loss_clip": 0.01137257, + "auxiliary_loss_mlp": 0.0104211, + "balance_loss_clip": 1.0255754, + "balance_loss_mlp": 1.04610491, + "epoch": 0.22486096497820532, + "flos": 25591272641280.0, + "grad_norm": 1.9782758832921432, + "language_loss": 0.74705702, + "learning_rate": 3.6147311860919655e-06, + "loss": 0.76885068, + "num_input_tokens_seen": 80508625, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3740, + "time_per_iteration": 2.5424981117248535 + }, + { + "auxiliary_loss_clip": 0.011377, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02234411, + "balance_loss_mlp": 1.04691672, + "epoch": 0.22492108823087328, + "flos": 17639824775040.0, + "grad_norm": 2.174963459036685, + "language_loss": 0.76083958, + "learning_rate": 3.614501353019939e-06, + "loss": 0.78261012, + "num_input_tokens_seen": 80527345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3741, + "time_per_iteration": 2.4539613723754883 + }, + { + "auxiliary_loss_clip": 0.01140029, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02263021, + "balance_loss_mlp": 1.05022252, + "epoch": 0.22498121148354125, + "flos": 16034653797120.0, + "grad_norm": 1.917686629559915, + "language_loss": 0.87458241, + "learning_rate": 3.6142714587258592e-06, + "loss": 0.89636862, + "num_input_tokens_seen": 80545545, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 3742, + "time_per_iteration": 2.483146905899048 + }, + { + "auxiliary_loss_clip": 0.01137495, + "auxiliary_loss_mlp": 0.01051324, + "balance_loss_clip": 1.03403831, + "balance_loss_mlp": 1.04824293, + "epoch": 0.22504133473620924, + "flos": 24023772051840.0, + "grad_norm": 2.0726823880461116, + "language_loss": 0.81939828, + "learning_rate": 3.614041503218444e-06, + "loss": 0.84128648, + "num_input_tokens_seen": 80565040, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.890625, + "step": 3743, + "time_per_iteration": 2.4786789417266846 + }, + { + "auxiliary_loss_clip": 0.01140562, + "auxiliary_loss_mlp": 0.01038532, + "balance_loss_clip": 1.02241504, + "balance_loss_mlp": 1.04843307, + "epoch": 0.2251014579888772, + "flos": 16763963541120.0, + "grad_norm": 3.9980575521347697, + "language_loss": 0.63616955, + "learning_rate": 3.6138114865064134e-06, + "loss": 0.65796053, + "num_input_tokens_seen": 80582815, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.921875, + "step": 3744, + "time_per_iteration": 2.4746344089508057 + }, + { + "auxiliary_loss_clip": 0.01137356, + "auxiliary_loss_mlp": 0.01042928, + "balance_loss_clip": 1.02634597, + "balance_loss_mlp": 1.04524422, + "epoch": 0.22516158124154517, + "flos": 13991013498240.0, + "grad_norm": 3.3106228370485806, + "language_loss": 0.75711048, + "learning_rate": 3.613581408598489e-06, + "loss": 0.77891332, + "num_input_tokens_seen": 80600865, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.921875, + "step": 3745, + "time_per_iteration": 2.4295878410339355 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02142549, + "balance_loss_mlp": 1.04637384, + "epoch": 0.22522170449421314, + "flos": 14390016750720.0, + "grad_norm": 1.8117958881819525, + "language_loss": 0.80839783, + "learning_rate": 3.6133512695033965e-06, + "loss": 0.83013999, + "num_input_tokens_seen": 80617455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3746, + "time_per_iteration": 2.4423928260803223 + }, + { + "auxiliary_loss_clip": 0.01138701, + "auxiliary_loss_mlp": 0.01045887, + "balance_loss_clip": 1.02903056, + "balance_loss_mlp": 1.04503584, + "epoch": 0.2252818277468811, + "flos": 23805542972160.0, + "grad_norm": 2.508960709641407, + "language_loss": 0.86067426, + "learning_rate": 3.613121069229862e-06, + "loss": 0.8825202, + "num_input_tokens_seen": 80635125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.93359375, + "step": 3747, + "time_per_iteration": 2.471223831176758 + }, + { + "auxiliary_loss_clip": 0.01136945, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.01789808, + "balance_loss_mlp": 1.04515314, + "epoch": 0.22534195099954907, + "flos": 24718033100160.0, + "grad_norm": 1.812236682782158, + "language_loss": 0.76358509, + "learning_rate": 3.6128908077866145e-06, + "loss": 0.78529495, + "num_input_tokens_seen": 80656370, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.91796875, + "step": 3748, + "time_per_iteration": 2.525108575820923 + }, + { + "auxiliary_loss_clip": 0.01142287, + "auxiliary_loss_mlp": 0.01044202, + "balance_loss_clip": 1.0274291, + "balance_loss_mlp": 1.04882264, + "epoch": 0.22540207425221703, + "flos": 21032341534080.0, + "grad_norm": 1.7339876982656162, + "language_loss": 0.79497123, + "learning_rate": 3.6126604851823864e-06, + "loss": 0.81683606, + "num_input_tokens_seen": 80676495, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3749, + "time_per_iteration": 2.4881162643432617 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01039256, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.04609084, + "epoch": 0.22546219750488503, + "flos": 19390362094080.0, + "grad_norm": 1.6101192523185979, + "language_loss": 0.8009423, + "learning_rate": 3.6124301014259108e-06, + "loss": 0.82267606, + "num_input_tokens_seen": 80694755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8828125, + "step": 3750, + "time_per_iteration": 2.4656643867492676 + }, + { + "auxiliary_loss_clip": 0.01140861, + "auxiliary_loss_mlp": 0.01044128, + "balance_loss_clip": 1.02733183, + "balance_loss_mlp": 1.04821157, + "epoch": 0.225522320757553, + "flos": 25192628524800.0, + "grad_norm": 2.418289881699729, + "language_loss": 0.81336129, + "learning_rate": 3.6121996565259244e-06, + "loss": 0.83521116, + "num_input_tokens_seen": 80713670, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3751, + "time_per_iteration": 2.4960029125213623 + }, + { + "auxiliary_loss_clip": 0.01141479, + "auxiliary_loss_mlp": 0.01038662, + "balance_loss_clip": 1.02242589, + "balance_loss_mlp": 1.04915667, + "epoch": 0.22558244401022096, + "flos": 17163110448000.0, + "grad_norm": 1.757449596716865, + "language_loss": 0.83989275, + "learning_rate": 3.611969150491165e-06, + "loss": 0.86169416, + "num_input_tokens_seen": 80731450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3752, + "time_per_iteration": 2.4668636322021484 + }, + { + "auxiliary_loss_clip": 0.01136965, + "auxiliary_loss_mlp": 0.01039126, + "balance_loss_clip": 1.02375996, + "balance_loss_mlp": 1.04671109, + "epoch": 0.22564256726288892, + "flos": 15231008856960.0, + "grad_norm": 1.7780915453784651, + "language_loss": 0.78616595, + "learning_rate": 3.611738583330375e-06, + "loss": 0.80792689, + "num_input_tokens_seen": 80748415, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.90234375, + "step": 3753, + "time_per_iteration": 2.4305062294006348 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.01038232, + "balance_loss_clip": 1.02113724, + "balance_loss_mlp": 1.04717183, + "epoch": 0.2257026905155569, + "flos": 34568652764160.0, + "grad_norm": 1.990408742554116, + "language_loss": 0.78284466, + "learning_rate": 3.611507955052295e-06, + "loss": 0.80460101, + "num_input_tokens_seen": 80770835, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 3754, + "time_per_iteration": 2.584170341491699 + }, + { + "auxiliary_loss_clip": 0.0113674, + "auxiliary_loss_mlp": 0.01040681, + "balance_loss_clip": 1.0243969, + "balance_loss_mlp": 1.04882884, + "epoch": 0.22576281376822485, + "flos": 19938430788480.0, + "grad_norm": 1.915767444367904, + "language_loss": 0.70267534, + "learning_rate": 3.6112772656656727e-06, + "loss": 0.72444952, + "num_input_tokens_seen": 80787840, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 3755, + "time_per_iteration": 2.458731174468994 + }, + { + "auxiliary_loss_clip": 0.01145193, + "auxiliary_loss_mlp": 0.0104804, + "balance_loss_clip": 1.031744, + "balance_loss_mlp": 1.0502069, + "epoch": 0.22582293702089282, + "flos": 24602005192320.0, + "grad_norm": 2.7446757969812783, + "language_loss": 0.77373838, + "learning_rate": 3.6110465151792547e-06, + "loss": 0.79567063, + "num_input_tokens_seen": 80806335, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3756, + "time_per_iteration": 2.5073161125183105 + }, + { + "auxiliary_loss_clip": 0.01145039, + "auxiliary_loss_mlp": 0.01042429, + "balance_loss_clip": 1.02498841, + "balance_loss_mlp": 1.05014277, + "epoch": 0.2258830602735608, + "flos": 23035438356480.0, + "grad_norm": 1.8909279955578986, + "language_loss": 0.82552433, + "learning_rate": 3.6108157036017916e-06, + "loss": 0.847399, + "num_input_tokens_seen": 80825355, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.94921875, + "step": 3757, + "time_per_iteration": 2.471353054046631 + }, + { + "auxiliary_loss_clip": 0.01140888, + "auxiliary_loss_mlp": 0.01039381, + "balance_loss_clip": 1.02258492, + "balance_loss_mlp": 1.04810619, + "epoch": 0.22594318352622877, + "flos": 22158427887360.0, + "grad_norm": 1.8410990661161322, + "language_loss": 0.73181808, + "learning_rate": 3.6105848309420358e-06, + "loss": 0.7536208, + "num_input_tokens_seen": 80842570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3758, + "time_per_iteration": 2.5376477241516113 + }, + { + "auxiliary_loss_clip": 0.01144551, + "auxiliary_loss_mlp": 0.0104662, + "balance_loss_clip": 1.02985883, + "balance_loss_mlp": 1.04991663, + "epoch": 0.22600330677889674, + "flos": 20594303176320.0, + "grad_norm": 2.0967514749881015, + "language_loss": 0.77208662, + "learning_rate": 3.6103538972087412e-06, + "loss": 0.79399836, + "num_input_tokens_seen": 80858745, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9453125, + "step": 3759, + "time_per_iteration": 2.447608709335327 + }, + { + "auxiliary_loss_clip": 0.01141959, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02643597, + "balance_loss_mlp": 1.04806697, + "epoch": 0.2260634300315647, + "flos": 35659798162560.0, + "grad_norm": 1.9036057015372598, + "language_loss": 0.78638428, + "learning_rate": 3.6101229024106655e-06, + "loss": 0.80824387, + "num_input_tokens_seen": 80880085, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.94140625, + "step": 3760, + "time_per_iteration": 4.231990098953247 + }, + { + "auxiliary_loss_clip": 0.01054354, + "auxiliary_loss_mlp": 0.01007925, + "balance_loss_clip": 1.00607765, + "balance_loss_mlp": 1.02028942, + "epoch": 0.22612355328423267, + "flos": 72090455126400.0, + "grad_norm": 0.9344871733021222, + "language_loss": 0.60090166, + "learning_rate": 3.609891846556569e-06, + "loss": 0.62152445, + "num_input_tokens_seen": 80937660, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.33984375, + "step": 3761, + "time_per_iteration": 4.482504367828369 + }, + { + "auxiliary_loss_clip": 0.0114253, + "auxiliary_loss_mlp": 0.01044191, + "balance_loss_clip": 1.02678633, + "balance_loss_mlp": 1.0478611, + "epoch": 0.22618367653690064, + "flos": 22783776693120.0, + "grad_norm": 2.386395888426225, + "language_loss": 0.77400732, + "learning_rate": 3.609660729655211e-06, + "loss": 0.79587454, + "num_input_tokens_seen": 80956265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3762, + "time_per_iteration": 2.5162198543548584 + }, + { + "auxiliary_loss_clip": 0.01143363, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02395821, + "balance_loss_mlp": 1.05073345, + "epoch": 0.22624379978956863, + "flos": 20448254476800.0, + "grad_norm": 2.10132066013886, + "language_loss": 0.78800118, + "learning_rate": 3.6094295517153573e-06, + "loss": 0.80984461, + "num_input_tokens_seen": 80975185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.92578125, + "step": 3763, + "time_per_iteration": 2.4578778743743896 + }, + { + "auxiliary_loss_clip": 0.01145794, + "auxiliary_loss_mlp": 0.0105418, + "balance_loss_clip": 1.03583384, + "balance_loss_mlp": 1.05000031, + "epoch": 0.2263039230422366, + "flos": 17494314779520.0, + "grad_norm": 1.8659674868358982, + "language_loss": 0.91363662, + "learning_rate": 3.6091983127457743e-06, + "loss": 0.93563628, + "num_input_tokens_seen": 80992830, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.95703125, + "step": 3764, + "time_per_iteration": 2.536231517791748 + }, + { + "auxiliary_loss_clip": 0.01138186, + "auxiliary_loss_mlp": 0.01054666, + "balance_loss_clip": 1.03740454, + "balance_loss_mlp": 1.04773271, + "epoch": 0.22636404629490456, + "flos": 28329748606080.0, + "grad_norm": 1.6188972360392109, + "language_loss": 0.75211406, + "learning_rate": 3.6089670127552293e-06, + "loss": 0.77404261, + "num_input_tokens_seen": 81013675, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 3765, + "time_per_iteration": 2.516646146774292 + }, + { + "auxiliary_loss_clip": 0.01139986, + "auxiliary_loss_mlp": 0.01045374, + "balance_loss_clip": 1.02868426, + "balance_loss_mlp": 1.04855943, + "epoch": 0.22642416954757252, + "flos": 17489143221120.0, + "grad_norm": 1.9315012383394614, + "language_loss": 0.89618981, + "learning_rate": 3.608735651752494e-06, + "loss": 0.91804343, + "num_input_tokens_seen": 81030345, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9140625, + "step": 3766, + "time_per_iteration": 2.4829306602478027 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01042769, + "balance_loss_clip": 1.02568591, + "balance_loss_mlp": 1.04891181, + "epoch": 0.2264842928002405, + "flos": 24384530298240.0, + "grad_norm": 1.6662033714223943, + "language_loss": 0.74710411, + "learning_rate": 3.6085042297463417e-06, + "loss": 0.76891464, + "num_input_tokens_seen": 81051000, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 3767, + "time_per_iteration": 2.4989218711853027 + }, + { + "auxiliary_loss_clip": 0.011397, + "auxiliary_loss_mlp": 0.01044149, + "balance_loss_clip": 1.02664912, + "balance_loss_mlp": 1.04619229, + "epoch": 0.22654441605290845, + "flos": 19830519354240.0, + "grad_norm": 1.4804117361030718, + "language_loss": 0.7156831, + "learning_rate": 3.6082727467455477e-06, + "loss": 0.73752159, + "num_input_tokens_seen": 81071205, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3768, + "time_per_iteration": 2.5078160762786865 + }, + { + "auxiliary_loss_clip": 0.01143764, + "auxiliary_loss_mlp": 0.01054415, + "balance_loss_clip": 1.03682017, + "balance_loss_mlp": 1.05247319, + "epoch": 0.22660453930557642, + "flos": 27454569730560.0, + "grad_norm": 1.80046116612075, + "language_loss": 0.78268003, + "learning_rate": 3.6080412027588905e-06, + "loss": 0.80466181, + "num_input_tokens_seen": 81091880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3769, + "time_per_iteration": 2.5122978687286377 + }, + { + "auxiliary_loss_clip": 0.01142038, + "auxiliary_loss_mlp": 0.01042012, + "balance_loss_clip": 1.02465522, + "balance_loss_mlp": 1.0467639, + "epoch": 0.2266646625582444, + "flos": 23988148738560.0, + "grad_norm": 1.7393050758681738, + "language_loss": 0.68427956, + "learning_rate": 3.6078095977951488e-06, + "loss": 0.70612001, + "num_input_tokens_seen": 81113290, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.953125, + "step": 3770, + "time_per_iteration": 2.557098150253296 + }, + { + "auxiliary_loss_clip": 0.01141766, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02537811, + "balance_loss_mlp": 1.04682195, + "epoch": 0.22672478581091238, + "flos": 26028054023040.0, + "grad_norm": 1.6251414008252867, + "language_loss": 0.80370939, + "learning_rate": 3.6075779318631067e-06, + "loss": 0.82554382, + "num_input_tokens_seen": 81133535, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.94921875, + "step": 3771, + "time_per_iteration": 2.5156240463256836 + }, + { + "auxiliary_loss_clip": 0.01135038, + "auxiliary_loss_mlp": 0.01045619, + "balance_loss_clip": 1.0290848, + "balance_loss_mlp": 1.04606724, + "epoch": 0.22678490906358034, + "flos": 23841812730240.0, + "grad_norm": 1.567346312954514, + "language_loss": 0.78844583, + "learning_rate": 3.6073462049715486e-06, + "loss": 0.81025243, + "num_input_tokens_seen": 81154650, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 3772, + "time_per_iteration": 2.539632558822632 + }, + { + "auxiliary_loss_clip": 0.0105304, + "auxiliary_loss_mlp": 0.01005348, + "balance_loss_clip": 1.00351191, + "balance_loss_mlp": 1.02012253, + "epoch": 0.2268450323162483, + "flos": 65048088574080.0, + "grad_norm": 0.6518085485856671, + "language_loss": 0.54334348, + "learning_rate": 3.607114417129261e-06, + "loss": 0.56392735, + "num_input_tokens_seen": 81221240, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.33007812, + "step": 3773, + "time_per_iteration": 3.1463003158569336 + }, + { + "auxiliary_loss_clip": 0.01136639, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02222633, + "balance_loss_mlp": 1.04712117, + "epoch": 0.22690515556891627, + "flos": 22526081544960.0, + "grad_norm": 1.9230264173849037, + "language_loss": 0.70101082, + "learning_rate": 3.6068825683450334e-06, + "loss": 0.72276813, + "num_input_tokens_seen": 81241520, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.89453125, + "step": 3774, + "time_per_iteration": 2.5099127292633057 + }, + { + "auxiliary_loss_clip": 0.01134613, + "auxiliary_loss_mlp": 0.01038845, + "balance_loss_clip": 1.02232277, + "balance_loss_mlp": 1.04480648, + "epoch": 0.22696527882158424, + "flos": 18223444955520.0, + "grad_norm": 2.4369678263863057, + "language_loss": 0.74585366, + "learning_rate": 3.606650658627658e-06, + "loss": 0.76758826, + "num_input_tokens_seen": 81256825, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 3775, + "time_per_iteration": 2.4441745281219482 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01038998, + "balance_loss_clip": 1.02311933, + "balance_loss_mlp": 1.04534245, + "epoch": 0.22702540207425223, + "flos": 17019252478080.0, + "grad_norm": 2.175545430509675, + "language_loss": 0.8256253, + "learning_rate": 3.606418687985928e-06, + "loss": 0.8473829, + "num_input_tokens_seen": 81275695, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3776, + "time_per_iteration": 2.4418301582336426 + }, + { + "auxiliary_loss_clip": 0.01139885, + "auxiliary_loss_mlp": 0.01037889, + "balance_loss_clip": 1.02125907, + "balance_loss_mlp": 1.04619908, + "epoch": 0.2270855253269202, + "flos": 21325731822720.0, + "grad_norm": 2.75835757539417, + "language_loss": 0.83031607, + "learning_rate": 3.606186656428641e-06, + "loss": 0.85209382, + "num_input_tokens_seen": 81294920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3777, + "time_per_iteration": 2.5585062503814697 + }, + { + "auxiliary_loss_clip": 0.01137385, + "auxiliary_loss_mlp": 0.01039138, + "balance_loss_clip": 1.02232909, + "balance_loss_mlp": 1.04596353, + "epoch": 0.22714564857958816, + "flos": 23550469516800.0, + "grad_norm": 2.6678368583827288, + "language_loss": 0.72658038, + "learning_rate": 3.6059545639645955e-06, + "loss": 0.74834561, + "num_input_tokens_seen": 81314275, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 3778, + "time_per_iteration": 2.5019333362579346 + }, + { + "auxiliary_loss_clip": 0.0113896, + "auxiliary_loss_mlp": 0.01040521, + "balance_loss_clip": 1.02386749, + "balance_loss_mlp": 1.04576886, + "epoch": 0.22720577183225613, + "flos": 25989880844160.0, + "grad_norm": 2.229609453971581, + "language_loss": 0.6414392, + "learning_rate": 3.605722410602591e-06, + "loss": 0.663234, + "num_input_tokens_seen": 81333890, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3779, + "time_per_iteration": 2.5082859992980957 + }, + { + "auxiliary_loss_clip": 0.01138378, + "auxiliary_loss_mlp": 0.01043458, + "balance_loss_clip": 1.02794909, + "balance_loss_mlp": 1.04837573, + "epoch": 0.2272658950849241, + "flos": 20814076540800.0, + "grad_norm": 1.9715072832436495, + "language_loss": 0.70546824, + "learning_rate": 3.6054901963514323e-06, + "loss": 0.72728658, + "num_input_tokens_seen": 81353640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3780, + "time_per_iteration": 2.4703643321990967 + }, + { + "auxiliary_loss_clip": 0.01140054, + "auxiliary_loss_mlp": 0.0104493, + "balance_loss_clip": 1.02689338, + "balance_loss_mlp": 1.0489254, + "epoch": 0.22732601833759206, + "flos": 23909324342400.0, + "grad_norm": 2.5454366084291133, + "language_loss": 0.89717996, + "learning_rate": 3.6052579212199246e-06, + "loss": 0.91902977, + "num_input_tokens_seen": 81371595, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 3781, + "time_per_iteration": 2.4812376499176025 + }, + { + "auxiliary_loss_clip": 0.0113992, + "auxiliary_loss_mlp": 0.01041852, + "balance_loss_clip": 1.02436364, + "balance_loss_mlp": 1.04648304, + "epoch": 0.22738614159026002, + "flos": 15924407978880.0, + "grad_norm": 2.4601522898780805, + "language_loss": 0.7434786, + "learning_rate": 3.6050255852168753e-06, + "loss": 0.76529634, + "num_input_tokens_seen": 81388435, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.93359375, + "step": 3782, + "time_per_iteration": 2.4665582180023193 + }, + { + "auxiliary_loss_clip": 0.01136804, + "auxiliary_loss_mlp": 0.01041674, + "balance_loss_clip": 1.02587914, + "balance_loss_mlp": 1.04467201, + "epoch": 0.22744626484292801, + "flos": 24205515891840.0, + "grad_norm": 1.6148985015615094, + "language_loss": 0.82393098, + "learning_rate": 3.604793188351095e-06, + "loss": 0.84571576, + "num_input_tokens_seen": 81410195, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.921875, + "step": 3783, + "time_per_iteration": 2.4820034503936768 + }, + { + "auxiliary_loss_clip": 0.01137013, + "auxiliary_loss_mlp": 0.01040248, + "balance_loss_clip": 1.02310586, + "balance_loss_mlp": 1.04418266, + "epoch": 0.22750638809559598, + "flos": 24791614110720.0, + "grad_norm": 2.4165791890347714, + "language_loss": 0.75874048, + "learning_rate": 3.6045607306313964e-06, + "loss": 0.78051311, + "num_input_tokens_seen": 81430060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 3784, + "time_per_iteration": 2.5087246894836426 + }, + { + "auxiliary_loss_clip": 0.01134704, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02303135, + "balance_loss_mlp": 1.04345798, + "epoch": 0.22756651134826394, + "flos": 22236498097920.0, + "grad_norm": 1.6490497895559066, + "language_loss": 0.70716858, + "learning_rate": 3.604328212066594e-06, + "loss": 0.72891551, + "num_input_tokens_seen": 81447375, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3785, + "time_per_iteration": 2.4733574390411377 + }, + { + "auxiliary_loss_clip": 0.01051525, + "auxiliary_loss_mlp": 0.01004421, + "balance_loss_clip": 1.00252521, + "balance_loss_mlp": 1.01740241, + "epoch": 0.2276266346009319, + "flos": 62707466626560.0, + "grad_norm": 0.8187947911361427, + "language_loss": 0.61915314, + "learning_rate": 3.6040956326655047e-06, + "loss": 0.63971269, + "num_input_tokens_seen": 81505235, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.34179688, + "step": 3786, + "time_per_iteration": 3.0474631786346436 + }, + { + "auxiliary_loss_clip": 0.01143523, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02488649, + "balance_loss_mlp": 1.04777002, + "epoch": 0.22768675785359987, + "flos": 18613936684800.0, + "grad_norm": 2.6740153696427247, + "language_loss": 0.86285794, + "learning_rate": 3.6038629924369486e-06, + "loss": 0.88471758, + "num_input_tokens_seen": 81518685, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3787, + "time_per_iteration": 2.4331281185150146 + }, + { + "auxiliary_loss_clip": 0.01137002, + "auxiliary_loss_mlp": 0.01040164, + "balance_loss_clip": 1.02433276, + "balance_loss_mlp": 1.04612255, + "epoch": 0.22774688110626784, + "flos": 26870195364480.0, + "grad_norm": 1.2844293081892826, + "language_loss": 0.72555876, + "learning_rate": 3.6036302913897474e-06, + "loss": 0.74733031, + "num_input_tokens_seen": 81538940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 3788, + "time_per_iteration": 2.5378167629241943 + }, + { + "auxiliary_loss_clip": 0.01136486, + "auxiliary_loss_mlp": 0.01036201, + "balance_loss_clip": 1.01929688, + "balance_loss_mlp": 1.04552293, + "epoch": 0.2278070043589358, + "flos": 15553593924480.0, + "grad_norm": 2.4737623033533587, + "language_loss": 0.67524469, + "learning_rate": 3.6033975295327243e-06, + "loss": 0.69697154, + "num_input_tokens_seen": 81555525, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3789, + "time_per_iteration": 2.412086248397827 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01041211, + "balance_loss_clip": 1.02416384, + "balance_loss_mlp": 1.04507327, + "epoch": 0.2278671276116038, + "flos": 22416805393920.0, + "grad_norm": 2.1501364843402335, + "language_loss": 0.76075745, + "learning_rate": 3.6031647068747065e-06, + "loss": 0.78253406, + "num_input_tokens_seen": 81576305, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 3790, + "time_per_iteration": 2.503600835800171 + }, + { + "auxiliary_loss_clip": 0.01134768, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02174211, + "balance_loss_mlp": 1.04253387, + "epoch": 0.22792725086427176, + "flos": 20631363033600.0, + "grad_norm": 2.0794940610838397, + "language_loss": 0.90613973, + "learning_rate": 3.602931823424522e-06, + "loss": 0.92787266, + "num_input_tokens_seen": 81594115, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 3791, + "time_per_iteration": 2.4503557682037354 + }, + { + "auxiliary_loss_clip": 0.01138123, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02000308, + "balance_loss_mlp": 1.04407096, + "epoch": 0.22798737411693973, + "flos": 31428946903680.0, + "grad_norm": 1.8390004860332834, + "language_loss": 0.82869208, + "learning_rate": 3.6026988791910026e-06, + "loss": 0.85044241, + "num_input_tokens_seen": 81615355, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3792, + "time_per_iteration": 2.5451550483703613 + }, + { + "auxiliary_loss_clip": 0.01045824, + "auxiliary_loss_mlp": 0.01012041, + "balance_loss_clip": 1.01015747, + "balance_loss_mlp": 1.01168287, + "epoch": 0.2280474973696077, + "flos": 52396685827200.0, + "grad_norm": 1.1436128607221614, + "language_loss": 0.65615487, + "learning_rate": 3.602465874182981e-06, + "loss": 0.67673355, + "num_input_tokens_seen": 81662075, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.34179688, + "step": 3793, + "time_per_iteration": 2.7929015159606934 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01050177, + "balance_loss_clip": 1.03241456, + "balance_loss_mlp": 1.04557967, + "epoch": 0.22810762062227566, + "flos": 26396066816640.0, + "grad_norm": 2.282271850248546, + "language_loss": 0.77100229, + "learning_rate": 3.602232808409293e-06, + "loss": 0.79292452, + "num_input_tokens_seen": 81681625, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 3794, + "time_per_iteration": 2.4882023334503174 + }, + { + "auxiliary_loss_clip": 0.01139112, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02146518, + "balance_loss_mlp": 1.04517698, + "epoch": 0.22816774387494362, + "flos": 25630271832960.0, + "grad_norm": 2.1931228295055716, + "language_loss": 0.80724937, + "learning_rate": 3.6019996818787755e-06, + "loss": 0.82902336, + "num_input_tokens_seen": 81701170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9375, + "step": 3795, + "time_per_iteration": 2.475311279296875 + }, + { + "auxiliary_loss_clip": 0.0113575, + "auxiliary_loss_mlp": 0.01044854, + "balance_loss_clip": 1.02747297, + "balance_loss_mlp": 1.04336488, + "epoch": 0.22822786712761162, + "flos": 22451602694400.0, + "grad_norm": 1.8416311408581074, + "language_loss": 0.77002209, + "learning_rate": 3.6017664946002704e-06, + "loss": 0.79182816, + "num_input_tokens_seen": 81721265, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3796, + "time_per_iteration": 2.4734761714935303 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01038978, + "balance_loss_clip": 1.02236056, + "balance_loss_mlp": 1.04312813, + "epoch": 0.22828799038027958, + "flos": 12202554395520.0, + "grad_norm": 2.506500245398156, + "language_loss": 0.9594354, + "learning_rate": 3.6015332465826188e-06, + "loss": 0.98118514, + "num_input_tokens_seen": 81736565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 3797, + "time_per_iteration": 2.4146203994750977 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02338922, + "balance_loss_mlp": 1.04537892, + "epoch": 0.22834811363294755, + "flos": 22085708803200.0, + "grad_norm": 1.6428427275001165, + "language_loss": 0.81446218, + "learning_rate": 3.601299937834666e-06, + "loss": 0.83624852, + "num_input_tokens_seen": 81756240, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3798, + "time_per_iteration": 2.490849733352661 + }, + { + "auxiliary_loss_clip": 0.01137089, + "auxiliary_loss_mlp": 0.01038732, + "balance_loss_clip": 1.02080309, + "balance_loss_mlp": 1.04262519, + "epoch": 0.2284082368856155, + "flos": 24860634094080.0, + "grad_norm": 2.3515161945239833, + "language_loss": 0.78744864, + "learning_rate": 3.6010665683652596e-06, + "loss": 0.80920684, + "num_input_tokens_seen": 81775720, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9453125, + "step": 3799, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01050228, + "balance_loss_clip": 1.0332408, + "balance_loss_mlp": 1.04381084, + "epoch": 0.22846836013828348, + "flos": 23292882109440.0, + "grad_norm": 1.655995083326211, + "language_loss": 0.75234401, + "learning_rate": 3.6008331381832484e-06, + "loss": 0.77421868, + "num_input_tokens_seen": 81795830, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.93359375, + "step": 3800, + "time_per_iteration": 2.510788917541504 + }, + { + "auxiliary_loss_clip": 0.01137174, + "auxiliary_loss_mlp": 0.01039053, + "balance_loss_clip": 1.02320981, + "balance_loss_mlp": 1.04583156, + "epoch": 0.22852848339095144, + "flos": 27416288810880.0, + "grad_norm": 1.661997570582357, + "language_loss": 0.63433349, + "learning_rate": 3.600599647297484e-06, + "loss": 0.6560958, + "num_input_tokens_seen": 81815745, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 3801, + "time_per_iteration": 2.503643035888672 + }, + { + "auxiliary_loss_clip": 0.01136229, + "auxiliary_loss_mlp": 0.0103618, + "balance_loss_clip": 1.02027762, + "balance_loss_mlp": 1.04721296, + "epoch": 0.2285886066436194, + "flos": 26321157002880.0, + "grad_norm": 1.7846583359688928, + "language_loss": 0.81602335, + "learning_rate": 3.60036609571682e-06, + "loss": 0.83774745, + "num_input_tokens_seen": 81835155, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.890625, + "step": 3802, + "time_per_iteration": 4.002788782119751 + }, + { + "auxiliary_loss_clip": 0.01138233, + "auxiliary_loss_mlp": 0.01047462, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04454207, + "epoch": 0.2286487298962874, + "flos": 29716475022720.0, + "grad_norm": 1.7683413549342115, + "language_loss": 0.78830242, + "learning_rate": 3.600132483450114e-06, + "loss": 0.81015933, + "num_input_tokens_seen": 81855655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 3803, + "time_per_iteration": 3.9494168758392334 + }, + { + "auxiliary_loss_clip": 0.01135958, + "auxiliary_loss_mlp": 0.01042656, + "balance_loss_clip": 1.02544212, + "balance_loss_mlp": 1.04115725, + "epoch": 0.22870885314895537, + "flos": 21287199507840.0, + "grad_norm": 1.6939241338011581, + "language_loss": 0.85561395, + "learning_rate": 3.5998988105062235e-06, + "loss": 0.87740004, + "num_input_tokens_seen": 81876385, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94921875, + "step": 3804, + "time_per_iteration": 2.4504544734954834 + }, + { + "auxiliary_loss_clip": 0.01139159, + "auxiliary_loss_mlp": 0.01043693, + "balance_loss_clip": 1.02744436, + "balance_loss_mlp": 1.04339862, + "epoch": 0.22876897640162333, + "flos": 14939450161920.0, + "grad_norm": 2.1651494765134736, + "language_loss": 0.76485813, + "learning_rate": 3.59966507689401e-06, + "loss": 0.78668666, + "num_input_tokens_seen": 81893225, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9609375, + "step": 3805, + "time_per_iteration": 2.4578893184661865 + }, + { + "auxiliary_loss_clip": 0.01139764, + "auxiliary_loss_mlp": 0.01043484, + "balance_loss_clip": 1.02560234, + "balance_loss_mlp": 1.04387915, + "epoch": 0.2288290996542913, + "flos": 18113917409280.0, + "grad_norm": 2.4014048134005628, + "language_loss": 0.79309744, + "learning_rate": 3.5994312826223363e-06, + "loss": 0.81492996, + "num_input_tokens_seen": 81911350, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.95703125, + "step": 3806, + "time_per_iteration": 2.415726900100708 + }, + { + "auxiliary_loss_clip": 0.01139425, + "auxiliary_loss_mlp": 0.01043738, + "balance_loss_clip": 1.02717948, + "balance_loss_mlp": 1.04547703, + "epoch": 0.22888922290695926, + "flos": 39855457071360.0, + "grad_norm": 2.230394288716221, + "language_loss": 0.69194484, + "learning_rate": 3.5991974277000684e-06, + "loss": 0.71377647, + "num_input_tokens_seen": 81935420, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3807, + "time_per_iteration": 2.6051764488220215 + }, + { + "auxiliary_loss_clip": 0.01144692, + "auxiliary_loss_mlp": 0.01053011, + "balance_loss_clip": 1.03484392, + "balance_loss_mlp": 1.04811931, + "epoch": 0.22894934615962723, + "flos": 23403774372480.0, + "grad_norm": 2.5207266425605668, + "language_loss": 0.65717816, + "learning_rate": 3.5989635121360733e-06, + "loss": 0.67915517, + "num_input_tokens_seen": 81953845, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.96484375, + "step": 3808, + "time_per_iteration": 2.463885545730591 + }, + { + "auxiliary_loss_clip": 0.01137242, + "auxiliary_loss_mlp": 0.01041588, + "balance_loss_clip": 1.02564931, + "balance_loss_mlp": 1.04470515, + "epoch": 0.22900946941229522, + "flos": 18843011671680.0, + "grad_norm": 1.8002654314964242, + "language_loss": 0.74498177, + "learning_rate": 3.598729535939222e-06, + "loss": 0.76677001, + "num_input_tokens_seen": 81972100, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.92578125, + "step": 3809, + "time_per_iteration": 2.4587652683258057 + }, + { + "auxiliary_loss_clip": 0.01138179, + "auxiliary_loss_mlp": 0.01042926, + "balance_loss_clip": 1.02695227, + "balance_loss_mlp": 1.04707646, + "epoch": 0.22906959266496318, + "flos": 22929394429440.0, + "grad_norm": 1.6413135962032894, + "language_loss": 0.81699908, + "learning_rate": 3.5984954991183862e-06, + "loss": 0.83881009, + "num_input_tokens_seen": 81992760, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3810, + "time_per_iteration": 2.454545736312866 + }, + { + "auxiliary_loss_clip": 0.01135521, + "auxiliary_loss_mlp": 0.01040213, + "balance_loss_clip": 1.02448893, + "balance_loss_mlp": 1.04428005, + "epoch": 0.22912971591763115, + "flos": 19354523299200.0, + "grad_norm": 2.1876822434942245, + "language_loss": 0.78671384, + "learning_rate": 3.598261401682441e-06, + "loss": 0.8084712, + "num_input_tokens_seen": 82009080, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.9140625, + "step": 3811, + "time_per_iteration": 2.4564197063446045 + }, + { + "auxiliary_loss_clip": 0.01135961, + "auxiliary_loss_mlp": 0.01046878, + "balance_loss_clip": 1.0296042, + "balance_loss_mlp": 1.04317403, + "epoch": 0.22918983917029911, + "flos": 19933546538880.0, + "grad_norm": 1.8120535445273127, + "language_loss": 0.82811391, + "learning_rate": 3.5980272436402632e-06, + "loss": 0.84994221, + "num_input_tokens_seen": 82026705, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 3812, + "time_per_iteration": 2.4357566833496094 + }, + { + "auxiliary_loss_clip": 0.01144518, + "auxiliary_loss_mlp": 0.01051465, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04750013, + "epoch": 0.22924996242296708, + "flos": 16690885320960.0, + "grad_norm": 3.041111828111396, + "language_loss": 0.82337058, + "learning_rate": 3.5977930250007324e-06, + "loss": 0.84533036, + "num_input_tokens_seen": 82043245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.96875, + "step": 3813, + "time_per_iteration": 2.4521987438201904 + }, + { + "auxiliary_loss_clip": 0.01139715, + "auxiliary_loss_mlp": 0.01046648, + "balance_loss_clip": 1.03009009, + "balance_loss_mlp": 1.04595184, + "epoch": 0.22931008567563504, + "flos": 33036164956800.0, + "grad_norm": 3.1740680187078896, + "language_loss": 0.69927102, + "learning_rate": 3.5975587457727298e-06, + "loss": 0.72113466, + "num_input_tokens_seen": 82066870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 3814, + "time_per_iteration": 2.5528602600097656 + }, + { + "auxiliary_loss_clip": 0.01134595, + "auxiliary_loss_mlp": 0.01044391, + "balance_loss_clip": 1.02773738, + "balance_loss_mlp": 1.04310775, + "epoch": 0.229370208928303, + "flos": 23330696152320.0, + "grad_norm": 2.479981906508555, + "language_loss": 0.67106915, + "learning_rate": 3.597324405965139e-06, + "loss": 0.69285899, + "num_input_tokens_seen": 82083180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 3815, + "time_per_iteration": 2.4768760204315186 + }, + { + "auxiliary_loss_clip": 0.01139552, + "auxiliary_loss_mlp": 0.01052238, + "balance_loss_clip": 1.03593004, + "balance_loss_mlp": 1.04644942, + "epoch": 0.229430332180971, + "flos": 28617213150720.0, + "grad_norm": 1.8467960453518941, + "language_loss": 0.83103681, + "learning_rate": 3.597090005586848e-06, + "loss": 0.85295475, + "num_input_tokens_seen": 82102950, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.9296875, + "step": 3816, + "time_per_iteration": 2.507967710494995 + }, + { + "auxiliary_loss_clip": 0.0113842, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.04643357, + "epoch": 0.22949045543363897, + "flos": 17238199829760.0, + "grad_norm": 2.1171855882825636, + "language_loss": 0.86756372, + "learning_rate": 3.596855544646742e-06, + "loss": 0.8893379, + "num_input_tokens_seen": 82119510, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91796875, + "step": 3817, + "time_per_iteration": 2.4445815086364746 + }, + { + "auxiliary_loss_clip": 0.01142243, + "auxiliary_loss_mlp": 0.01049311, + "balance_loss_clip": 1.03278852, + "balance_loss_mlp": 1.04829407, + "epoch": 0.22955057868630693, + "flos": 27489438858240.0, + "grad_norm": 2.403232678237585, + "language_loss": 0.75039381, + "learning_rate": 3.5966210231537154e-06, + "loss": 0.77230936, + "num_input_tokens_seen": 82140095, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.9375, + "step": 3818, + "time_per_iteration": 2.508527994155884 + }, + { + "auxiliary_loss_clip": 0.01141204, + "auxiliary_loss_mlp": 0.01041338, + "balance_loss_clip": 1.02426732, + "balance_loss_mlp": 1.04769611, + "epoch": 0.2296107019389749, + "flos": 23476421629440.0, + "grad_norm": 1.6537639427714739, + "language_loss": 0.74597251, + "learning_rate": 3.596386441116659e-06, + "loss": 0.76779795, + "num_input_tokens_seen": 82159510, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.93359375, + "step": 3819, + "time_per_iteration": 2.5009493827819824 + }, + { + "auxiliary_loss_clip": 0.01138376, + "auxiliary_loss_mlp": 0.01044786, + "balance_loss_clip": 1.02806103, + "balance_loss_mlp": 1.04632187, + "epoch": 0.22967082519164286, + "flos": 31285160760960.0, + "grad_norm": 1.815385500594849, + "language_loss": 0.80775046, + "learning_rate": 3.5961517985444684e-06, + "loss": 0.8295821, + "num_input_tokens_seen": 82179580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 3820, + "time_per_iteration": 2.5374531745910645 + }, + { + "auxiliary_loss_clip": 0.01142613, + "auxiliary_loss_mlp": 0.01041984, + "balance_loss_clip": 1.02384043, + "balance_loss_mlp": 1.04725921, + "epoch": 0.22973094844431083, + "flos": 14642935390080.0, + "grad_norm": 2.0886359367899763, + "language_loss": 0.69226766, + "learning_rate": 3.595917095446042e-06, + "loss": 0.71411359, + "num_input_tokens_seen": 82195585, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3821, + "time_per_iteration": 2.4539082050323486 + }, + { + "auxiliary_loss_clip": 0.0113954, + "auxiliary_loss_mlp": 0.01036487, + "balance_loss_clip": 1.01912975, + "balance_loss_mlp": 1.0466336, + "epoch": 0.2297910716969788, + "flos": 22823853292800.0, + "grad_norm": 1.623620301878745, + "language_loss": 0.82655883, + "learning_rate": 3.5956823318302796e-06, + "loss": 0.84831905, + "num_input_tokens_seen": 82217530, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3822, + "time_per_iteration": 2.5025360584259033 + }, + { + "auxiliary_loss_clip": 0.01137437, + "auxiliary_loss_mlp": 0.01040965, + "balance_loss_clip": 1.02264285, + "balance_loss_mlp": 1.04520607, + "epoch": 0.2298511949496468, + "flos": 23039029716480.0, + "grad_norm": 1.581563173789708, + "language_loss": 0.66093826, + "learning_rate": 3.5954475077060833e-06, + "loss": 0.68272227, + "num_input_tokens_seen": 82237980, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.921875, + "step": 3823, + "time_per_iteration": 2.500643253326416 + }, + { + "auxiliary_loss_clip": 0.0104753, + "auxiliary_loss_mlp": 0.01001124, + "balance_loss_clip": 0.99913329, + "balance_loss_mlp": 1.01448655, + "epoch": 0.22991131820231475, + "flos": 66890914911360.0, + "grad_norm": 0.8191682875264555, + "language_loss": 0.56770015, + "learning_rate": 3.595212623082357e-06, + "loss": 0.58818674, + "num_input_tokens_seen": 82301785, + "router_z_loss_clip": 0.01989746, + "router_z_loss_mlp": 0.33203125, + "step": 3824, + "time_per_iteration": 3.1365485191345215 + }, + { + "auxiliary_loss_clip": 0.01135805, + "auxiliary_loss_mlp": 0.01039563, + "balance_loss_clip": 1.02363658, + "balance_loss_mlp": 1.04575276, + "epoch": 0.22997144145498272, + "flos": 17887248633600.0, + "grad_norm": 2.487273324074565, + "language_loss": 0.72840559, + "learning_rate": 3.594977677968009e-06, + "loss": 0.75015926, + "num_input_tokens_seen": 82317355, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90234375, + "step": 3825, + "time_per_iteration": 2.444730758666992 + }, + { + "auxiliary_loss_clip": 0.01143286, + "auxiliary_loss_mlp": 0.01046033, + "balance_loss_clip": 1.02810407, + "balance_loss_mlp": 1.04978526, + "epoch": 0.23003156470765068, + "flos": 24676843178880.0, + "grad_norm": 1.8892090994393747, + "language_loss": 0.87760615, + "learning_rate": 3.5947426723719473e-06, + "loss": 0.89949936, + "num_input_tokens_seen": 82336645, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9375, + "step": 3826, + "time_per_iteration": 2.492682456970215 + }, + { + "auxiliary_loss_clip": 0.01142911, + "auxiliary_loss_mlp": 0.01043844, + "balance_loss_clip": 1.0258677, + "balance_loss_mlp": 1.04683542, + "epoch": 0.23009168796031865, + "flos": 15814126247040.0, + "grad_norm": 2.6663888482282623, + "language_loss": 0.81568289, + "learning_rate": 3.594507606303083e-06, + "loss": 0.8375504, + "num_input_tokens_seen": 82354225, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9609375, + "step": 3827, + "time_per_iteration": 2.488593578338623 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01043921, + "balance_loss_clip": 1.02750623, + "balance_loss_mlp": 1.04553437, + "epoch": 0.2301518112129866, + "flos": 16212842190720.0, + "grad_norm": 1.8456206141648608, + "language_loss": 0.86791205, + "learning_rate": 3.5942724797703314e-06, + "loss": 0.88970977, + "num_input_tokens_seen": 82370240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 3828, + "time_per_iteration": 2.4386606216430664 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.01049169, + "balance_loss_clip": 1.03147864, + "balance_loss_mlp": 1.04512644, + "epoch": 0.2302119344656546, + "flos": 20595452411520.0, + "grad_norm": 2.106420485404446, + "language_loss": 0.70638877, + "learning_rate": 3.594037292782607e-06, + "loss": 0.72826439, + "num_input_tokens_seen": 82389145, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.93359375, + "step": 3829, + "time_per_iteration": 2.475399971008301 + }, + { + "auxiliary_loss_clip": 0.01139852, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02241933, + "balance_loss_mlp": 1.05011487, + "epoch": 0.23027205771832257, + "flos": 26796901662720.0, + "grad_norm": 1.5719627508253273, + "language_loss": 0.84045994, + "learning_rate": 3.5938020453488293e-06, + "loss": 0.86223942, + "num_input_tokens_seen": 82409185, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3830, + "time_per_iteration": 2.4943718910217285 + }, + { + "auxiliary_loss_clip": 0.01139071, + "auxiliary_loss_mlp": 0.01049012, + "balance_loss_clip": 1.03172636, + "balance_loss_mlp": 1.04637957, + "epoch": 0.23033218097099054, + "flos": 43873143068160.0, + "grad_norm": 1.733206127117623, + "language_loss": 0.66863495, + "learning_rate": 3.5935667374779177e-06, + "loss": 0.69051576, + "num_input_tokens_seen": 82432070, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3831, + "time_per_iteration": 2.6513662338256836 + }, + { + "auxiliary_loss_clip": 0.01141151, + "auxiliary_loss_mlp": 0.01042727, + "balance_loss_clip": 1.02603793, + "balance_loss_mlp": 1.04735637, + "epoch": 0.2303923042236585, + "flos": 26067663745920.0, + "grad_norm": 2.238850649877041, + "language_loss": 0.75253022, + "learning_rate": 3.5933313691787957e-06, + "loss": 0.77436894, + "num_input_tokens_seen": 82450625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.9375, + "step": 3832, + "time_per_iteration": 2.4889180660247803 + }, + { + "auxiliary_loss_clip": 0.01139559, + "auxiliary_loss_mlp": 0.01043075, + "balance_loss_clip": 1.02515745, + "balance_loss_mlp": 1.04709673, + "epoch": 0.23045242747632647, + "flos": 18296379521280.0, + "grad_norm": 1.8583815246829203, + "language_loss": 0.87474239, + "learning_rate": 3.593095940460389e-06, + "loss": 0.89656878, + "num_input_tokens_seen": 82468575, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.92578125, + "step": 3833, + "time_per_iteration": 2.46744966506958 + }, + { + "auxiliary_loss_clip": 0.01140821, + "auxiliary_loss_mlp": 0.01047215, + "balance_loss_clip": 1.02950096, + "balance_loss_mlp": 1.0478369, + "epoch": 0.23051255072899443, + "flos": 25520528805120.0, + "grad_norm": 3.2120713643012206, + "language_loss": 0.74875945, + "learning_rate": 3.592860451331624e-06, + "loss": 0.77063978, + "num_input_tokens_seen": 82488655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3834, + "time_per_iteration": 2.485504627227783 + }, + { + "auxiliary_loss_clip": 0.0113943, + "auxiliary_loss_mlp": 0.01051682, + "balance_loss_clip": 1.03408706, + "balance_loss_mlp": 1.0484879, + "epoch": 0.2305726739816624, + "flos": 21215198695680.0, + "grad_norm": 1.820281268490984, + "language_loss": 0.85338157, + "learning_rate": 3.592624901801432e-06, + "loss": 0.87529278, + "num_input_tokens_seen": 82507220, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 3835, + "time_per_iteration": 2.4730474948883057 + }, + { + "auxiliary_loss_clip": 0.01146651, + "auxiliary_loss_mlp": 0.01049278, + "balance_loss_clip": 1.03142083, + "balance_loss_mlp": 1.04814029, + "epoch": 0.2306327972343304, + "flos": 23331127115520.0, + "grad_norm": 2.799799470431086, + "language_loss": 0.81974924, + "learning_rate": 3.5923892918787432e-06, + "loss": 0.84170854, + "num_input_tokens_seen": 82527920, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.984375, + "step": 3836, + "time_per_iteration": 2.464657783508301 + }, + { + "auxiliary_loss_clip": 0.0114557, + "auxiliary_loss_mlp": 0.01043707, + "balance_loss_clip": 1.02726793, + "balance_loss_mlp": 1.05202293, + "epoch": 0.23069292048699835, + "flos": 20666734951680.0, + "grad_norm": 1.7793450137018207, + "language_loss": 0.79603267, + "learning_rate": 3.5921536215724934e-06, + "loss": 0.81792545, + "num_input_tokens_seen": 82549040, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9375, + "step": 3837, + "time_per_iteration": 2.4715559482574463 + }, + { + "auxiliary_loss_clip": 0.01055276, + "auxiliary_loss_mlp": 0.01017826, + "balance_loss_clip": 1.01614499, + "balance_loss_mlp": 1.02046371, + "epoch": 0.23075304373966632, + "flos": 70454832393600.0, + "grad_norm": 0.9409846751082755, + "language_loss": 0.65487945, + "learning_rate": 3.5919178908916184e-06, + "loss": 0.67561042, + "num_input_tokens_seen": 82604070, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.34765625, + "step": 3838, + "time_per_iteration": 2.9852375984191895 + }, + { + "auxiliary_loss_clip": 0.01139351, + "auxiliary_loss_mlp": 0.01047713, + "balance_loss_clip": 1.03131008, + "balance_loss_mlp": 1.04721856, + "epoch": 0.23081316699233428, + "flos": 16617986668800.0, + "grad_norm": 2.6310373190732648, + "language_loss": 0.7527796, + "learning_rate": 3.591682099845058e-06, + "loss": 0.77465028, + "num_input_tokens_seen": 82619665, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3839, + "time_per_iteration": 2.4290778636932373 + }, + { + "auxiliary_loss_clip": 0.01145463, + "auxiliary_loss_mlp": 0.01042021, + "balance_loss_clip": 1.02486694, + "balance_loss_mlp": 1.0510757, + "epoch": 0.23087329024500225, + "flos": 13298081253120.0, + "grad_norm": 4.016837458595543, + "language_loss": 0.68691337, + "learning_rate": 3.591446248441752e-06, + "loss": 0.70878816, + "num_input_tokens_seen": 82637530, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 3840, + "time_per_iteration": 2.456422805786133 + }, + { + "auxiliary_loss_clip": 0.01143425, + "auxiliary_loss_mlp": 0.01039716, + "balance_loss_clip": 1.02084517, + "balance_loss_mlp": 1.04936612, + "epoch": 0.23093341349767021, + "flos": 17785729820160.0, + "grad_norm": 2.1574295618121426, + "language_loss": 0.79412574, + "learning_rate": 3.591210336690645e-06, + "loss": 0.81595719, + "num_input_tokens_seen": 82656130, + "router_z_loss_clip": 0.18847656, + "router_z_loss_mlp": 0.9375, + "step": 3841, + "time_per_iteration": 2.4762818813323975 + }, + { + "auxiliary_loss_clip": 0.01141641, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02557695, + "balance_loss_mlp": 1.04872346, + "epoch": 0.23099353675033818, + "flos": 23988076911360.0, + "grad_norm": 5.070488540070664, + "language_loss": 0.83171731, + "learning_rate": 3.590974364600683e-06, + "loss": 0.85354722, + "num_input_tokens_seen": 82675295, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9296875, + "step": 3842, + "time_per_iteration": 2.4908032417297363 + }, + { + "auxiliary_loss_clip": 0.01139394, + "auxiliary_loss_mlp": 0.0104314, + "balance_loss_clip": 1.0255568, + "balance_loss_mlp": 1.04567111, + "epoch": 0.23105366000300617, + "flos": 35995168471680.0, + "grad_norm": 1.6842769818445011, + "language_loss": 0.66523731, + "learning_rate": 3.5907383321808135e-06, + "loss": 0.68706262, + "num_input_tokens_seen": 82703260, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9375, + "step": 3843, + "time_per_iteration": 2.6503937244415283 + }, + { + "auxiliary_loss_clip": 0.01138914, + "auxiliary_loss_mlp": 0.01043512, + "balance_loss_clip": 1.02642977, + "balance_loss_mlp": 1.04793119, + "epoch": 0.23111378325567414, + "flos": 31245335556480.0, + "grad_norm": 1.8910129932977493, + "language_loss": 0.77445257, + "learning_rate": 3.590502239439987e-06, + "loss": 0.79627681, + "num_input_tokens_seen": 82725060, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 3844, + "time_per_iteration": 5.4645676612854 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01041287, + "balance_loss_clip": 1.02321458, + "balance_loss_mlp": 1.04744804, + "epoch": 0.2311739065083421, + "flos": 19208223204480.0, + "grad_norm": 1.6615026518232119, + "language_loss": 0.77974623, + "learning_rate": 3.590266086387156e-06, + "loss": 0.80158317, + "num_input_tokens_seen": 82742960, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.94921875, + "step": 3845, + "time_per_iteration": 2.467289686203003 + }, + { + "auxiliary_loss_clip": 0.01133475, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.01687717, + "balance_loss_mlp": 1.04577661, + "epoch": 0.23123402976101007, + "flos": 23360178240000.0, + "grad_norm": 2.1438137502119425, + "language_loss": 0.76064527, + "learning_rate": 3.590029873031276e-06, + "loss": 0.78230006, + "num_input_tokens_seen": 82760205, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 3846, + "time_per_iteration": 2.4985382556915283 + }, + { + "auxiliary_loss_clip": 0.01140881, + "auxiliary_loss_mlp": 0.0104335, + "balance_loss_clip": 1.02638626, + "balance_loss_mlp": 1.04725194, + "epoch": 0.23129415301367803, + "flos": 13735365425280.0, + "grad_norm": 2.4609763976845556, + "language_loss": 0.69493651, + "learning_rate": 3.589793599381304e-06, + "loss": 0.71677887, + "num_input_tokens_seen": 82778590, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3847, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.01048129, + "auxiliary_loss_mlp": 0.01002559, + "balance_loss_clip": 1.00074661, + "balance_loss_mlp": 1.01598144, + "epoch": 0.231354276266346, + "flos": 69737015001600.0, + "grad_norm": 0.7927409416341922, + "language_loss": 0.61051595, + "learning_rate": 3.589557265446198e-06, + "loss": 0.63102281, + "num_input_tokens_seen": 82833925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3203125, + "step": 3848, + "time_per_iteration": 2.981518030166626 + }, + { + "auxiliary_loss_clip": 0.011385, + "auxiliary_loss_mlp": 0.0104523, + "balance_loss_clip": 1.02789688, + "balance_loss_mlp": 1.04593349, + "epoch": 0.231414399519014, + "flos": 18835900778880.0, + "grad_norm": 2.568019101440284, + "language_loss": 0.7746805, + "learning_rate": 3.589320871234923e-06, + "loss": 0.79651785, + "num_input_tokens_seen": 82850625, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.92578125, + "step": 3849, + "time_per_iteration": 2.450693130493164 + }, + { + "auxiliary_loss_clip": 0.01139635, + "auxiliary_loss_mlp": 0.01042495, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04533124, + "epoch": 0.23147452277168196, + "flos": 36135470995200.0, + "grad_norm": 1.9223002445017061, + "language_loss": 0.71673942, + "learning_rate": 3.5890844167564405e-06, + "loss": 0.73856068, + "num_input_tokens_seen": 82872105, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3850, + "time_per_iteration": 2.589395761489868 + }, + { + "auxiliary_loss_clip": 0.01137166, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.01870215, + "balance_loss_mlp": 1.04362154, + "epoch": 0.23153464602434992, + "flos": 20812927305600.0, + "grad_norm": 3.8422038584857665, + "language_loss": 0.75846308, + "learning_rate": 3.588847902019718e-06, + "loss": 0.78018856, + "num_input_tokens_seen": 82890595, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.93359375, + "step": 3851, + "time_per_iteration": 2.495729446411133 + }, + { + "auxiliary_loss_clip": 0.01138492, + "auxiliary_loss_mlp": 0.01040821, + "balance_loss_clip": 1.0234046, + "balance_loss_mlp": 1.04747272, + "epoch": 0.2315947692770179, + "flos": 19939256801280.0, + "grad_norm": 1.914141324585442, + "language_loss": 0.69797802, + "learning_rate": 3.588611327033723e-06, + "loss": 0.71977121, + "num_input_tokens_seen": 82908910, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 3852, + "time_per_iteration": 2.478408098220825 + }, + { + "auxiliary_loss_clip": 0.01140513, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.0206399, + "balance_loss_mlp": 1.04643583, + "epoch": 0.23165489252968585, + "flos": 12855553695360.0, + "grad_norm": 2.1861380100726144, + "language_loss": 0.67030561, + "learning_rate": 3.588374691807428e-06, + "loss": 0.69208378, + "num_input_tokens_seen": 82925405, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.94140625, + "step": 3853, + "time_per_iteration": 2.4445838928222656 + }, + { + "auxiliary_loss_clip": 0.01141194, + "auxiliary_loss_mlp": 0.01035545, + "balance_loss_clip": 1.01815248, + "balance_loss_mlp": 1.04680121, + "epoch": 0.23171501578235382, + "flos": 30628282792320.0, + "grad_norm": 1.6671703506367506, + "language_loss": 0.79851103, + "learning_rate": 3.5881379963498053e-06, + "loss": 0.82027847, + "num_input_tokens_seen": 82945615, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9453125, + "step": 3854, + "time_per_iteration": 2.5455782413482666 + }, + { + "auxiliary_loss_clip": 0.01146661, + "auxiliary_loss_mlp": 0.01042654, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04726899, + "epoch": 0.23177513903502178, + "flos": 23842782397440.0, + "grad_norm": 3.8560715318244556, + "language_loss": 0.64987147, + "learning_rate": 3.587901240669831e-06, + "loss": 0.67176461, + "num_input_tokens_seen": 82967570, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9921875, + "step": 3855, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01140829, + "auxiliary_loss_mlp": 0.01044083, + "balance_loss_clip": 1.02753139, + "balance_loss_mlp": 1.04570055, + "epoch": 0.23183526228768978, + "flos": 29570282668800.0, + "grad_norm": 2.1096123404526623, + "language_loss": 0.70711654, + "learning_rate": 3.5876644247764815e-06, + "loss": 0.72896564, + "num_input_tokens_seen": 82987435, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.953125, + "step": 3856, + "time_per_iteration": 2.5024092197418213 + }, + { + "auxiliary_loss_clip": 0.01137323, + "auxiliary_loss_mlp": 0.01035633, + "balance_loss_clip": 1.02062488, + "balance_loss_mlp": 1.0464257, + "epoch": 0.23189538554035774, + "flos": 34458694254720.0, + "grad_norm": 6.089384897844753, + "language_loss": 0.76997125, + "learning_rate": 3.5874275486787387e-06, + "loss": 0.79170084, + "num_input_tokens_seen": 83010505, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.91015625, + "step": 3857, + "time_per_iteration": 2.5962576866149902 + }, + { + "auxiliary_loss_clip": 0.01143962, + "auxiliary_loss_mlp": 0.01048446, + "balance_loss_clip": 1.03018308, + "balance_loss_mlp": 1.0477798, + "epoch": 0.2319555087930257, + "flos": 18003815245440.0, + "grad_norm": 3.478057752262005, + "language_loss": 0.91006696, + "learning_rate": 3.587190612385584e-06, + "loss": 0.93199098, + "num_input_tokens_seen": 83026705, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.96484375, + "step": 3858, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01136894, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.04679012, + "epoch": 0.23201563204569367, + "flos": 23143852581120.0, + "grad_norm": 2.1437168922033747, + "language_loss": 0.75995493, + "learning_rate": 3.5869536159060026e-06, + "loss": 0.78175128, + "num_input_tokens_seen": 83046500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 3859, + "time_per_iteration": 2.485426187515259 + }, + { + "auxiliary_loss_clip": 0.01136619, + "auxiliary_loss_mlp": 0.01036655, + "balance_loss_clip": 1.01962614, + "balance_loss_mlp": 1.04423487, + "epoch": 0.23207575529836164, + "flos": 20667991927680.0, + "grad_norm": 1.9055462071213993, + "language_loss": 0.84061682, + "learning_rate": 3.58671655924898e-06, + "loss": 0.86234951, + "num_input_tokens_seen": 83065280, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3860, + "time_per_iteration": 2.4607324600219727 + }, + { + "auxiliary_loss_clip": 0.01137991, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.02317619, + "balance_loss_mlp": 1.04656291, + "epoch": 0.2321358785510296, + "flos": 16472189364480.0, + "grad_norm": 2.1337823805291047, + "language_loss": 0.82972974, + "learning_rate": 3.586479442423508e-06, + "loss": 0.85151279, + "num_input_tokens_seen": 83082310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 3861, + "time_per_iteration": 2.451805591583252 + }, + { + "auxiliary_loss_clip": 0.01142125, + "auxiliary_loss_mlp": 0.01043892, + "balance_loss_clip": 1.02702415, + "balance_loss_mlp": 1.04800034, + "epoch": 0.2321960018036976, + "flos": 21616320850560.0, + "grad_norm": 1.8456518711772996, + "language_loss": 0.85918242, + "learning_rate": 3.586242265438576e-06, + "loss": 0.8810426, + "num_input_tokens_seen": 83102065, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.94140625, + "step": 3862, + "time_per_iteration": 2.4582395553588867 + }, + { + "auxiliary_loss_clip": 0.01136451, + "auxiliary_loss_mlp": 0.01044214, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.0468179, + "epoch": 0.23225612505636556, + "flos": 22271474966400.0, + "grad_norm": 1.3833481647146872, + "language_loss": 0.7492758, + "learning_rate": 3.5860050283031773e-06, + "loss": 0.7710824, + "num_input_tokens_seen": 83121445, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8984375, + "step": 3863, + "time_per_iteration": 2.496985912322998 + }, + { + "auxiliary_loss_clip": 0.01139904, + "auxiliary_loss_mlp": 0.01042767, + "balance_loss_clip": 1.02723408, + "balance_loss_mlp": 1.05037498, + "epoch": 0.23231624830903352, + "flos": 17052325925760.0, + "grad_norm": 2.003739732436234, + "language_loss": 0.74640852, + "learning_rate": 3.58576773102631e-06, + "loss": 0.76823521, + "num_input_tokens_seen": 83138175, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 3864, + "time_per_iteration": 2.440204381942749 + }, + { + "auxiliary_loss_clip": 0.0113912, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.01952517, + "balance_loss_mlp": 1.0468204, + "epoch": 0.2323763715617015, + "flos": 34640043045120.0, + "grad_norm": 3.940820538439298, + "language_loss": 0.70690906, + "learning_rate": 3.5855303736169714e-06, + "loss": 0.72865754, + "num_input_tokens_seen": 83161975, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 3865, + "time_per_iteration": 2.598194122314453 + }, + { + "auxiliary_loss_clip": 0.01148702, + "auxiliary_loss_mlp": 0.01049623, + "balance_loss_clip": 1.03091884, + "balance_loss_mlp": 1.04987264, + "epoch": 0.23243649481436945, + "flos": 25551698832000.0, + "grad_norm": 1.9658537667403149, + "language_loss": 0.94853866, + "learning_rate": 3.5852929560841617e-06, + "loss": 0.97052193, + "num_input_tokens_seen": 83180905, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.98828125, + "step": 3866, + "time_per_iteration": 2.496276617050171 + }, + { + "auxiliary_loss_clip": 0.01138876, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02412629, + "balance_loss_mlp": 1.04817796, + "epoch": 0.23249661806703742, + "flos": 20483482740480.0, + "grad_norm": 2.6667540210019123, + "language_loss": 0.72528732, + "learning_rate": 3.5850554784368846e-06, + "loss": 0.74707949, + "num_input_tokens_seen": 83196390, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 3867, + "time_per_iteration": 2.4933414459228516 + }, + { + "auxiliary_loss_clip": 0.01140693, + "auxiliary_loss_mlp": 0.01043897, + "balance_loss_clip": 1.02625418, + "balance_loss_mlp": 1.04734945, + "epoch": 0.23255674131970538, + "flos": 20376612800640.0, + "grad_norm": 1.8421111702540602, + "language_loss": 0.82411921, + "learning_rate": 3.584817940684145e-06, + "loss": 0.84596509, + "num_input_tokens_seen": 83216165, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.93359375, + "step": 3868, + "time_per_iteration": 2.4994540214538574 + }, + { + "auxiliary_loss_clip": 0.01136829, + "auxiliary_loss_mlp": 0.01040452, + "balance_loss_clip": 1.02433491, + "balance_loss_mlp": 1.04700828, + "epoch": 0.23261686457237338, + "flos": 17056096853760.0, + "grad_norm": 1.815886356300666, + "language_loss": 0.73335075, + "learning_rate": 3.58458034283495e-06, + "loss": 0.75512362, + "num_input_tokens_seen": 83233845, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 3869, + "time_per_iteration": 2.4486095905303955 + }, + { + "auxiliary_loss_clip": 0.01139645, + "auxiliary_loss_mlp": 0.01047185, + "balance_loss_clip": 1.03108525, + "balance_loss_mlp": 1.04929376, + "epoch": 0.23267698782504134, + "flos": 29169878785920.0, + "grad_norm": 1.6948965109205438, + "language_loss": 0.79564929, + "learning_rate": 3.5843426848983097e-06, + "loss": 0.81751764, + "num_input_tokens_seen": 83254930, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3870, + "time_per_iteration": 2.506114959716797 + }, + { + "auxiliary_loss_clip": 0.01143066, + "auxiliary_loss_mlp": 0.01043212, + "balance_loss_clip": 1.02574801, + "balance_loss_mlp": 1.04845953, + "epoch": 0.2327371110777093, + "flos": 21174655219200.0, + "grad_norm": 3.2368167151878797, + "language_loss": 0.70599115, + "learning_rate": 3.5841049668832357e-06, + "loss": 0.72785389, + "num_input_tokens_seen": 83272095, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9453125, + "step": 3871, + "time_per_iteration": 2.455266237258911 + }, + { + "auxiliary_loss_clip": 0.01145685, + "auxiliary_loss_mlp": 0.01055983, + "balance_loss_clip": 1.03674293, + "balance_loss_mlp": 1.05011845, + "epoch": 0.23279723433037727, + "flos": 24863112132480.0, + "grad_norm": 2.2694181422477313, + "language_loss": 0.69087327, + "learning_rate": 3.5838671887987433e-06, + "loss": 0.71289003, + "num_input_tokens_seen": 83290980, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.95703125, + "step": 3872, + "time_per_iteration": 2.482089042663574 + }, + { + "auxiliary_loss_clip": 0.01147162, + "auxiliary_loss_mlp": 0.01045167, + "balance_loss_clip": 1.0271188, + "balance_loss_mlp": 1.04984593, + "epoch": 0.23285735758304524, + "flos": 38800617344640.0, + "grad_norm": 1.4965805681858408, + "language_loss": 0.78046703, + "learning_rate": 3.5836293506538474e-06, + "loss": 0.80239034, + "num_input_tokens_seen": 83315175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.97265625, + "step": 3873, + "time_per_iteration": 2.6179678440093994 + }, + { + "auxiliary_loss_clip": 0.01053819, + "auxiliary_loss_mlp": 0.01009657, + "balance_loss_clip": 1.00777328, + "balance_loss_mlp": 1.02347898, + "epoch": 0.2329174808357132, + "flos": 53944113692160.0, + "grad_norm": 0.841863213022928, + "language_loss": 0.60519493, + "learning_rate": 3.5833914524575687e-06, + "loss": 0.6258297, + "num_input_tokens_seen": 83372060, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.3046875, + "step": 3874, + "time_per_iteration": 2.955524444580078 + }, + { + "auxiliary_loss_clip": 0.01142096, + "auxiliary_loss_mlp": 0.01044266, + "balance_loss_clip": 1.02695727, + "balance_loss_mlp": 1.04998708, + "epoch": 0.23297760408838117, + "flos": 21216024708480.0, + "grad_norm": 2.0817330720741287, + "language_loss": 0.8082279, + "learning_rate": 3.583153494218927e-06, + "loss": 0.83009154, + "num_input_tokens_seen": 83389795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.921875, + "step": 3875, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01141065, + "auxiliary_loss_mlp": 0.01039949, + "balance_loss_clip": 1.02440381, + "balance_loss_mlp": 1.04931068, + "epoch": 0.23303772734104916, + "flos": 28403006394240.0, + "grad_norm": 1.6586054731564495, + "language_loss": 0.60997009, + "learning_rate": 3.5829154759469464e-06, + "loss": 0.63178027, + "num_input_tokens_seen": 83410005, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.91796875, + "step": 3876, + "time_per_iteration": 2.5234174728393555 + }, + { + "auxiliary_loss_clip": 0.01144475, + "auxiliary_loss_mlp": 0.0104992, + "balance_loss_clip": 1.0319072, + "balance_loss_mlp": 1.05151403, + "epoch": 0.23309785059371713, + "flos": 24314720215680.0, + "grad_norm": 1.9912662806979935, + "language_loss": 0.70357525, + "learning_rate": 3.5826773976506523e-06, + "loss": 0.72551912, + "num_input_tokens_seen": 83430250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9296875, + "step": 3877, + "time_per_iteration": 2.5117876529693604 + }, + { + "auxiliary_loss_clip": 0.01142635, + "auxiliary_loss_mlp": 0.0104808, + "balance_loss_clip": 1.02984059, + "balance_loss_mlp": 1.04846656, + "epoch": 0.2331579738463851, + "flos": 15992925171840.0, + "grad_norm": 2.20617127152986, + "language_loss": 0.81169856, + "learning_rate": 3.582439259339073e-06, + "loss": 0.83360565, + "num_input_tokens_seen": 83447950, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.94140625, + "step": 3878, + "time_per_iteration": 2.418745517730713 + }, + { + "auxiliary_loss_clip": 0.01145943, + "auxiliary_loss_mlp": 0.01047242, + "balance_loss_clip": 1.02914643, + "balance_loss_mlp": 1.04905999, + "epoch": 0.23321809709905306, + "flos": 36426957863040.0, + "grad_norm": 2.449565501872003, + "language_loss": 0.74765849, + "learning_rate": 3.5822010610212374e-06, + "loss": 0.76959032, + "num_input_tokens_seen": 83467785, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.96875, + "step": 3879, + "time_per_iteration": 2.627453088760376 + }, + { + "auxiliary_loss_clip": 0.0113984, + "auxiliary_loss_mlp": 0.01043428, + "balance_loss_clip": 1.02597582, + "balance_loss_mlp": 1.04611635, + "epoch": 0.23327822035172102, + "flos": 21324762155520.0, + "grad_norm": 2.3281305870509685, + "language_loss": 0.89896512, + "learning_rate": 3.5819628027061795e-06, + "loss": 0.92079782, + "num_input_tokens_seen": 83485390, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9375, + "step": 3880, + "time_per_iteration": 2.529181957244873 + }, + { + "auxiliary_loss_clip": 0.01144521, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.02926075, + "balance_loss_mlp": 1.05019975, + "epoch": 0.233338343604389, + "flos": 19171881619200.0, + "grad_norm": 1.7300006336865508, + "language_loss": 0.72026277, + "learning_rate": 3.5817244844029334e-06, + "loss": 0.74217331, + "num_input_tokens_seen": 83504890, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9453125, + "step": 3881, + "time_per_iteration": 2.5004756450653076 + }, + { + "auxiliary_loss_clip": 0.01138796, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02798867, + "balance_loss_mlp": 1.04610527, + "epoch": 0.23339846685705698, + "flos": 26908368543360.0, + "grad_norm": 1.5765664683306326, + "language_loss": 0.67988127, + "learning_rate": 3.581486106120537e-06, + "loss": 0.70171714, + "num_input_tokens_seen": 83526475, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 3882, + "time_per_iteration": 2.5134541988372803 + }, + { + "auxiliary_loss_clip": 0.01143679, + "auxiliary_loss_mlp": 0.01057975, + "balance_loss_clip": 1.04020119, + "balance_loss_mlp": 1.0481658, + "epoch": 0.23345859010972494, + "flos": 32343160884480.0, + "grad_norm": 3.2831975264627116, + "language_loss": 0.76596051, + "learning_rate": 3.5812476678680287e-06, + "loss": 0.78797704, + "num_input_tokens_seen": 83546620, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.953125, + "step": 3883, + "time_per_iteration": 2.5556836128234863 + }, + { + "auxiliary_loss_clip": 0.01046918, + "auxiliary_loss_mlp": 0.01002528, + "balance_loss_clip": 1.00059688, + "balance_loss_mlp": 1.01619315, + "epoch": 0.2335187133623929, + "flos": 58484229050880.0, + "grad_norm": 0.7953130928556094, + "language_loss": 0.59102494, + "learning_rate": 3.58100916965445e-06, + "loss": 0.6115194, + "num_input_tokens_seen": 83616160, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.30859375, + "step": 3884, + "time_per_iteration": 3.210090398788452 + }, + { + "auxiliary_loss_clip": 0.01139917, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.0196687, + "balance_loss_mlp": 1.04723644, + "epoch": 0.23357883661506088, + "flos": 24502317972480.0, + "grad_norm": 3.4795297654408617, + "language_loss": 0.80128157, + "learning_rate": 3.5807706114888455e-06, + "loss": 0.82303953, + "num_input_tokens_seen": 83636795, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.92578125, + "step": 3885, + "time_per_iteration": 4.129857301712036 + }, + { + "auxiliary_loss_clip": 0.01139579, + "auxiliary_loss_mlp": 0.01039954, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04763317, + "epoch": 0.23363895986772884, + "flos": 18948516894720.0, + "grad_norm": 2.392049069504846, + "language_loss": 0.88482237, + "learning_rate": 3.580531993380261e-06, + "loss": 0.9066177, + "num_input_tokens_seen": 83654050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 3886, + "time_per_iteration": 4.002579689025879 + }, + { + "auxiliary_loss_clip": 0.01143892, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02452922, + "balance_loss_mlp": 1.04953825, + "epoch": 0.2336990831203968, + "flos": 31686821619840.0, + "grad_norm": 2.2740188667520815, + "language_loss": 0.73199034, + "learning_rate": 3.5802933153377445e-06, + "loss": 0.75384426, + "num_input_tokens_seen": 83673720, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9453125, + "step": 3887, + "time_per_iteration": 2.5730721950531006 + }, + { + "auxiliary_loss_clip": 0.0114256, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02709508, + "balance_loss_mlp": 1.04827881, + "epoch": 0.23375920637306477, + "flos": 27709750926720.0, + "grad_norm": 1.8689872769958875, + "language_loss": 0.84098816, + "learning_rate": 3.5800545773703475e-06, + "loss": 0.86285174, + "num_input_tokens_seen": 83693470, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.94140625, + "step": 3888, + "time_per_iteration": 2.526090145111084 + }, + { + "auxiliary_loss_clip": 0.01140206, + "auxiliary_loss_mlp": 0.01051088, + "balance_loss_clip": 1.03400528, + "balance_loss_mlp": 1.04775357, + "epoch": 0.23381932962573276, + "flos": 17675627656320.0, + "grad_norm": 5.34722340994348, + "language_loss": 0.87174153, + "learning_rate": 3.5798157794871225e-06, + "loss": 0.89365447, + "num_input_tokens_seen": 83711620, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.921875, + "step": 3889, + "time_per_iteration": 2.465535879135132 + }, + { + "auxiliary_loss_clip": 0.01143335, + "auxiliary_loss_mlp": 0.0104294, + "balance_loss_clip": 1.02659607, + "balance_loss_mlp": 1.04914057, + "epoch": 0.23387945287840073, + "flos": 14390842763520.0, + "grad_norm": 4.26980733686294, + "language_loss": 0.7660414, + "learning_rate": 3.579576921697125e-06, + "loss": 0.78790414, + "num_input_tokens_seen": 83727890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.94140625, + "step": 3890, + "time_per_iteration": 2.4164645671844482 + }, + { + "auxiliary_loss_clip": 0.01144006, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.02940536, + "balance_loss_mlp": 1.05018783, + "epoch": 0.2339395761310687, + "flos": 46097988503040.0, + "grad_norm": 3.12388753004446, + "language_loss": 0.73396742, + "learning_rate": 3.579338004009412e-06, + "loss": 0.75587177, + "num_input_tokens_seen": 83749370, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9375, + "step": 3891, + "time_per_iteration": 2.692251443862915 + }, + { + "auxiliary_loss_clip": 0.01136776, + "auxiliary_loss_mlp": 0.01040069, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04672241, + "epoch": 0.23399969938373666, + "flos": 22382044007040.0, + "grad_norm": 1.6638493558493535, + "language_loss": 0.82791233, + "learning_rate": 3.5790990264330433e-06, + "loss": 0.84968084, + "num_input_tokens_seen": 83769560, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8984375, + "step": 3892, + "time_per_iteration": 2.4657654762268066 + }, + { + "auxiliary_loss_clip": 0.01143467, + "auxiliary_loss_mlp": 0.01042619, + "balance_loss_clip": 1.02550626, + "balance_loss_mlp": 1.04892194, + "epoch": 0.23405982263640462, + "flos": 43508542066560.0, + "grad_norm": 2.124834647136637, + "language_loss": 0.64928782, + "learning_rate": 3.578859988977082e-06, + "loss": 0.67114866, + "num_input_tokens_seen": 83795635, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9453125, + "step": 3893, + "time_per_iteration": 2.6640076637268066 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02056575, + "balance_loss_mlp": 1.04930127, + "epoch": 0.2341199458890726, + "flos": 22564685687040.0, + "grad_norm": 2.3013698222001753, + "language_loss": 0.79011095, + "learning_rate": 3.5786208916505916e-06, + "loss": 0.81188488, + "num_input_tokens_seen": 83814090, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 3894, + "time_per_iteration": 2.4596238136291504 + }, + { + "auxiliary_loss_clip": 0.01139997, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02772284, + "balance_loss_mlp": 1.0473485, + "epoch": 0.23418006914174055, + "flos": 25633970933760.0, + "grad_norm": 1.4729608662155413, + "language_loss": 0.81608742, + "learning_rate": 3.5783817344626383e-06, + "loss": 0.83793032, + "num_input_tokens_seen": 83836870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3895, + "time_per_iteration": 2.5229499340057373 + }, + { + "auxiliary_loss_clip": 0.01141397, + "auxiliary_loss_mlp": 0.01048743, + "balance_loss_clip": 1.03210139, + "balance_loss_mlp": 1.04895353, + "epoch": 0.23424019239440855, + "flos": 13545936074880.0, + "grad_norm": 2.370345363223057, + "language_loss": 0.79861861, + "learning_rate": 3.578142517422292e-06, + "loss": 0.82052004, + "num_input_tokens_seen": 83853275, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.92578125, + "step": 3896, + "time_per_iteration": 2.4219553470611572 + }, + { + "auxiliary_loss_clip": 0.01142956, + "auxiliary_loss_mlp": 0.01042754, + "balance_loss_clip": 1.02507555, + "balance_loss_mlp": 1.04863656, + "epoch": 0.2343003156470765, + "flos": 22419498913920.0, + "grad_norm": 1.6083647422684384, + "language_loss": 0.83279634, + "learning_rate": 3.577903240538623e-06, + "loss": 0.85465348, + "num_input_tokens_seen": 83872340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9453125, + "step": 3897, + "time_per_iteration": 2.497347593307495 + }, + { + "auxiliary_loss_clip": 0.01144102, + "auxiliary_loss_mlp": 0.01048556, + "balance_loss_clip": 1.03093636, + "balance_loss_mlp": 1.04880857, + "epoch": 0.23436043889974448, + "flos": 14790815683200.0, + "grad_norm": 2.0551194275294784, + "language_loss": 0.79281437, + "learning_rate": 3.577663903820705e-06, + "loss": 0.8147409, + "num_input_tokens_seen": 83888795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.953125, + "step": 3898, + "time_per_iteration": 2.4275295734405518 + }, + { + "auxiliary_loss_clip": 0.01139139, + "auxiliary_loss_mlp": 0.01047646, + "balance_loss_clip": 1.0316844, + "balance_loss_mlp": 1.05034626, + "epoch": 0.23442056215241244, + "flos": 22965700101120.0, + "grad_norm": 3.329769754331659, + "language_loss": 0.73955798, + "learning_rate": 3.577424507277614e-06, + "loss": 0.76142585, + "num_input_tokens_seen": 83906820, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 3899, + "time_per_iteration": 2.5017077922821045 + }, + { + "auxiliary_loss_clip": 0.01141437, + "auxiliary_loss_mlp": 0.01051006, + "balance_loss_clip": 1.03412604, + "balance_loss_mlp": 1.04896975, + "epoch": 0.2344806854050804, + "flos": 23071887682560.0, + "grad_norm": 1.8374782290855665, + "language_loss": 0.75695914, + "learning_rate": 3.5771850509184277e-06, + "loss": 0.77888358, + "num_input_tokens_seen": 83926370, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3900, + "time_per_iteration": 2.4796969890594482 + }, + { + "auxiliary_loss_clip": 0.01137483, + "auxiliary_loss_mlp": 0.01049218, + "balance_loss_clip": 1.03224266, + "balance_loss_mlp": 1.04685295, + "epoch": 0.23454080865774837, + "flos": 16327074418560.0, + "grad_norm": 1.9641187800197561, + "language_loss": 0.66949147, + "learning_rate": 3.5769455347522256e-06, + "loss": 0.69135845, + "num_input_tokens_seen": 83944600, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3901, + "time_per_iteration": 2.5052907466888428 + }, + { + "auxiliary_loss_clip": 0.01050259, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01646185, + "balance_loss_mlp": 1.01950026, + "epoch": 0.23460093191041637, + "flos": 67760958142080.0, + "grad_norm": 0.7670843237762338, + "language_loss": 0.58209252, + "learning_rate": 3.576705958788091e-06, + "loss": 0.6027782, + "num_input_tokens_seen": 84005100, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.30859375, + "step": 3902, + "time_per_iteration": 3.0522701740264893 + }, + { + "auxiliary_loss_clip": 0.01140756, + "auxiliary_loss_mlp": 0.01044175, + "balance_loss_clip": 1.02684176, + "balance_loss_mlp": 1.04932666, + "epoch": 0.23466105516308433, + "flos": 20077619990400.0, + "grad_norm": 1.9913375770157136, + "language_loss": 0.80411339, + "learning_rate": 3.576466323035108e-06, + "loss": 0.82596278, + "num_input_tokens_seen": 84023775, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 3903, + "time_per_iteration": 2.515796184539795 + }, + { + "auxiliary_loss_clip": 0.01139226, + "auxiliary_loss_mlp": 0.01039647, + "balance_loss_clip": 1.02274299, + "balance_loss_mlp": 1.04670942, + "epoch": 0.2347211784157523, + "flos": 24535714642560.0, + "grad_norm": 3.712536549247666, + "language_loss": 0.82183945, + "learning_rate": 3.5762266275023645e-06, + "loss": 0.84362817, + "num_input_tokens_seen": 84042605, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.92578125, + "step": 3904, + "time_per_iteration": 2.48119854927063 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.0104346, + "balance_loss_clip": 1.02642536, + "balance_loss_mlp": 1.05013537, + "epoch": 0.23478130166842026, + "flos": 23805040181760.0, + "grad_norm": 1.9990680719867946, + "language_loss": 0.7137326, + "learning_rate": 3.57598687219895e-06, + "loss": 0.7355758, + "num_input_tokens_seen": 84061520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3905, + "time_per_iteration": 2.494558811187744 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.04811251, + "epoch": 0.23484142492108823, + "flos": 24093618048000.0, + "grad_norm": 1.865256832649412, + "language_loss": 0.70834756, + "learning_rate": 3.5757470571339543e-06, + "loss": 0.73007655, + "num_input_tokens_seen": 84081800, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 3906, + "time_per_iteration": 2.5057764053344727 + }, + { + "auxiliary_loss_clip": 0.01144181, + "auxiliary_loss_mlp": 0.01037627, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.04728532, + "epoch": 0.2349015481737562, + "flos": 29095830898560.0, + "grad_norm": 2.129912307166789, + "language_loss": 0.73542202, + "learning_rate": 3.575507182316473e-06, + "loss": 0.75724012, + "num_input_tokens_seen": 84102340, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.96875, + "step": 3907, + "time_per_iteration": 2.5734074115753174 + }, + { + "auxiliary_loss_clip": 0.01141507, + "auxiliary_loss_mlp": 0.01047564, + "balance_loss_clip": 1.03004074, + "balance_loss_mlp": 1.04927719, + "epoch": 0.23496167142642416, + "flos": 18916305373440.0, + "grad_norm": 1.7646530569469054, + "language_loss": 0.72807813, + "learning_rate": 3.575267247755601e-06, + "loss": 0.74996883, + "num_input_tokens_seen": 84120370, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 3908, + "time_per_iteration": 2.438422441482544 + }, + { + "auxiliary_loss_clip": 0.01049243, + "auxiliary_loss_mlp": 0.01002133, + "balance_loss_clip": 1.00030959, + "balance_loss_mlp": 1.01835775, + "epoch": 0.23502179467909215, + "flos": 55868062896000.0, + "grad_norm": 1.0194055540826834, + "language_loss": 0.73271406, + "learning_rate": 3.5750272534604367e-06, + "loss": 0.75322783, + "num_input_tokens_seen": 84165515, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.30859375, + "step": 3909, + "time_per_iteration": 2.8451788425445557 + }, + { + "auxiliary_loss_clip": 0.01139398, + "auxiliary_loss_mlp": 0.01043023, + "balance_loss_clip": 1.02607155, + "balance_loss_mlp": 1.04842734, + "epoch": 0.23508191793176011, + "flos": 23401763210880.0, + "grad_norm": 1.5487453833335116, + "language_loss": 0.87906706, + "learning_rate": 3.5747871994400822e-06, + "loss": 0.9008913, + "num_input_tokens_seen": 84184540, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3910, + "time_per_iteration": 2.4648385047912598 + }, + { + "auxiliary_loss_clip": 0.01141916, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02370465, + "balance_loss_mlp": 1.04950166, + "epoch": 0.23514204118442808, + "flos": 20047671025920.0, + "grad_norm": 2.1910966534760297, + "language_loss": 0.75809109, + "learning_rate": 3.5745470857036386e-06, + "loss": 0.7799111, + "num_input_tokens_seen": 84202025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.921875, + "step": 3911, + "time_per_iteration": 2.4715898036956787 + }, + { + "auxiliary_loss_clip": 0.01136455, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02729297, + "balance_loss_mlp": 1.04807627, + "epoch": 0.23520216443709605, + "flos": 21580589796480.0, + "grad_norm": 1.9083148186883727, + "language_loss": 0.81775904, + "learning_rate": 3.5743069122602122e-06, + "loss": 0.83955097, + "num_input_tokens_seen": 84221895, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 3912, + "time_per_iteration": 2.4627628326416016 + }, + { + "auxiliary_loss_clip": 0.01139949, + "auxiliary_loss_mlp": 0.01050703, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04939759, + "epoch": 0.235262287689764, + "flos": 23185796688000.0, + "grad_norm": 1.7554989092460516, + "language_loss": 0.71664345, + "learning_rate": 3.574066679118909e-06, + "loss": 0.73854995, + "num_input_tokens_seen": 84240455, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 3913, + "time_per_iteration": 2.5080020427703857 + }, + { + "auxiliary_loss_clip": 0.01147528, + "auxiliary_loss_mlp": 0.01045028, + "balance_loss_clip": 1.02691996, + "balance_loss_mlp": 1.05220175, + "epoch": 0.23532241094243198, + "flos": 23185222070400.0, + "grad_norm": 1.7040704955860875, + "language_loss": 0.75903499, + "learning_rate": 3.57382638628884e-06, + "loss": 0.78096056, + "num_input_tokens_seen": 84261605, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.953125, + "step": 3914, + "time_per_iteration": 2.487429618835449 + }, + { + "auxiliary_loss_clip": 0.01141443, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02307451, + "balance_loss_mlp": 1.05093837, + "epoch": 0.23538253419509997, + "flos": 17019324305280.0, + "grad_norm": 2.554647654086476, + "language_loss": 0.89353001, + "learning_rate": 3.5735860337791174e-06, + "loss": 0.9153496, + "num_input_tokens_seen": 84278675, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 3915, + "time_per_iteration": 2.500753402709961 + }, + { + "auxiliary_loss_clip": 0.01044736, + "auxiliary_loss_mlp": 0.01003661, + "balance_loss_clip": 1.00158656, + "balance_loss_mlp": 1.0141747, + "epoch": 0.23544265744776793, + "flos": 63448588967040.0, + "grad_norm": 0.8049654288159457, + "language_loss": 0.5935356, + "learning_rate": 3.573345621598854e-06, + "loss": 0.61401957, + "num_input_tokens_seen": 84329765, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.3046875, + "step": 3916, + "time_per_iteration": 2.9926259517669678 + }, + { + "auxiliary_loss_clip": 0.01042644, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00075865, + "balance_loss_mlp": 1.01226258, + "epoch": 0.2355027807004359, + "flos": 70515343831680.0, + "grad_norm": 0.7742950949727582, + "language_loss": 0.49486533, + "learning_rate": 3.5731051497571675e-06, + "loss": 0.51532036, + "num_input_tokens_seen": 84393680, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.3046875, + "step": 3917, + "time_per_iteration": 3.085294723510742 + }, + { + "auxiliary_loss_clip": 0.01142529, + "auxiliary_loss_mlp": 0.01052441, + "balance_loss_clip": 1.03615093, + "balance_loss_mlp": 1.04923129, + "epoch": 0.23556290395310386, + "flos": 21434289701760.0, + "grad_norm": 2.000752484300541, + "language_loss": 0.76012552, + "learning_rate": 3.5728646182631756e-06, + "loss": 0.78207517, + "num_input_tokens_seen": 84412640, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 3918, + "time_per_iteration": 2.4883201122283936 + }, + { + "auxiliary_loss_clip": 0.01145359, + "auxiliary_loss_mlp": 0.0104448, + "balance_loss_clip": 1.02805305, + "balance_loss_mlp": 1.04997587, + "epoch": 0.23562302720577183, + "flos": 18186421011840.0, + "grad_norm": 2.209135495431813, + "language_loss": 0.68728662, + "learning_rate": 3.5726240271259995e-06, + "loss": 0.709185, + "num_input_tokens_seen": 84431605, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.953125, + "step": 3919, + "time_per_iteration": 2.4489476680755615 + }, + { + "auxiliary_loss_clip": 0.01137524, + "auxiliary_loss_mlp": 0.01038874, + "balance_loss_clip": 1.02216101, + "balance_loss_mlp": 1.04864836, + "epoch": 0.2356831504584398, + "flos": 33730497832320.0, + "grad_norm": 1.8210843900818243, + "language_loss": 0.70324695, + "learning_rate": 3.5723833763547634e-06, + "loss": 0.72501087, + "num_input_tokens_seen": 84454210, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 3920, + "time_per_iteration": 2.6011908054351807 + }, + { + "auxiliary_loss_clip": 0.01141332, + "auxiliary_loss_mlp": 0.01046958, + "balance_loss_clip": 1.03128195, + "balance_loss_mlp": 1.05122209, + "epoch": 0.23574327371110776, + "flos": 24932778560640.0, + "grad_norm": 1.6333300745229378, + "language_loss": 0.77596343, + "learning_rate": 3.5721426659585916e-06, + "loss": 0.79784632, + "num_input_tokens_seen": 84475540, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8984375, + "step": 3921, + "time_per_iteration": 2.498924732208252 + }, + { + "auxiliary_loss_clip": 0.01142015, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.02615058, + "balance_loss_mlp": 1.05108023, + "epoch": 0.23580339696377575, + "flos": 17822107319040.0, + "grad_norm": 2.5438781918161375, + "language_loss": 0.7561245, + "learning_rate": 3.571901895946612e-06, + "loss": 0.7779727, + "num_input_tokens_seen": 84494580, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3922, + "time_per_iteration": 2.467103958129883 + }, + { + "auxiliary_loss_clip": 0.01138646, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02583599, + "balance_loss_mlp": 1.0489881, + "epoch": 0.23586352021644372, + "flos": 26286611097600.0, + "grad_norm": 2.3317912313524625, + "language_loss": 0.80016744, + "learning_rate": 3.571661066327956e-06, + "loss": 0.82196772, + "num_input_tokens_seen": 84513850, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 3923, + "time_per_iteration": 2.5075273513793945 + }, + { + "auxiliary_loss_clip": 0.01138213, + "auxiliary_loss_mlp": 0.01046068, + "balance_loss_clip": 1.02985525, + "balance_loss_mlp": 1.04845715, + "epoch": 0.23592364346911168, + "flos": 14246697484800.0, + "grad_norm": 1.9692150152538963, + "language_loss": 0.74753797, + "learning_rate": 3.571420177111754e-06, + "loss": 0.76938081, + "num_input_tokens_seen": 84532315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3924, + "time_per_iteration": 2.442448377609253 + }, + { + "auxiliary_loss_clip": 0.01141205, + "auxiliary_loss_mlp": 0.01046148, + "balance_loss_clip": 1.03013766, + "balance_loss_mlp": 1.04995513, + "epoch": 0.23598376672177965, + "flos": 18587938216320.0, + "grad_norm": 2.1681544357284093, + "language_loss": 0.82770467, + "learning_rate": 3.5711792283071416e-06, + "loss": 0.84957814, + "num_input_tokens_seen": 84550970, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.9140625, + "step": 3925, + "time_per_iteration": 2.44718337059021 + }, + { + "auxiliary_loss_clip": 0.01138195, + "auxiliary_loss_mlp": 0.01047882, + "balance_loss_clip": 1.03100252, + "balance_loss_mlp": 1.04645014, + "epoch": 0.2360438899744476, + "flos": 22675542036480.0, + "grad_norm": 1.8844556004317345, + "language_loss": 0.59408414, + "learning_rate": 3.5709382199232564e-06, + "loss": 0.61594486, + "num_input_tokens_seen": 84571655, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91796875, + "step": 3926, + "time_per_iteration": 2.4840757846832275 + }, + { + "auxiliary_loss_clip": 0.01135063, + "auxiliary_loss_mlp": 0.01045392, + "balance_loss_clip": 1.02977526, + "balance_loss_mlp": 1.04721665, + "epoch": 0.23610401322711558, + "flos": 29570139014400.0, + "grad_norm": 1.967091588265342, + "language_loss": 0.71317631, + "learning_rate": 3.570697151969235e-06, + "loss": 0.73498082, + "num_input_tokens_seen": 84593130, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 3927, + "time_per_iteration": 4.117234945297241 + }, + { + "auxiliary_loss_clip": 0.01137568, + "auxiliary_loss_mlp": 0.01044401, + "balance_loss_clip": 1.0295651, + "balance_loss_mlp": 1.04787612, + "epoch": 0.23616413647978354, + "flos": 17858520731520.0, + "grad_norm": 1.8263460078369782, + "language_loss": 0.75102496, + "learning_rate": 3.570456024454221e-06, + "loss": 0.77284467, + "num_input_tokens_seen": 84612410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8984375, + "step": 3928, + "time_per_iteration": 3.9637200832366943 + }, + { + "auxiliary_loss_clip": 0.01137493, + "auxiliary_loss_mlp": 0.01048389, + "balance_loss_clip": 1.03086567, + "balance_loss_mlp": 1.04693556, + "epoch": 0.23622425973245154, + "flos": 11034847157760.0, + "grad_norm": 2.885999758146942, + "language_loss": 0.81520462, + "learning_rate": 3.5702148373873576e-06, + "loss": 0.83706343, + "num_input_tokens_seen": 84627610, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 3929, + "time_per_iteration": 2.499310255050659 + }, + { + "auxiliary_loss_clip": 0.01146116, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.02998328, + "balance_loss_mlp": 1.04974854, + "epoch": 0.2362843829851195, + "flos": 23404061681280.0, + "grad_norm": 4.669381706210694, + "language_loss": 0.7194528, + "learning_rate": 3.569973590777789e-06, + "loss": 0.74139249, + "num_input_tokens_seen": 84648415, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.9609375, + "step": 3930, + "time_per_iteration": 2.4964945316314697 + }, + { + "auxiliary_loss_clip": 0.01137432, + "auxiliary_loss_mlp": 0.0103904, + "balance_loss_clip": 1.02245224, + "balance_loss_mlp": 1.046561, + "epoch": 0.23634450623778747, + "flos": 39529855261440.0, + "grad_norm": 2.489267518834959, + "language_loss": 0.73764896, + "learning_rate": 3.569732284634665e-06, + "loss": 0.7594136, + "num_input_tokens_seen": 84670080, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3931, + "time_per_iteration": 2.6283528804779053 + }, + { + "auxiliary_loss_clip": 0.01140852, + "auxiliary_loss_mlp": 0.01039788, + "balance_loss_clip": 1.02245522, + "balance_loss_mlp": 1.04971111, + "epoch": 0.23640462949045543, + "flos": 24207167917440.0, + "grad_norm": 2.06419219579993, + "language_loss": 0.8026945, + "learning_rate": 3.569490918967136e-06, + "loss": 0.82450092, + "num_input_tokens_seen": 84686465, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 3932, + "time_per_iteration": 2.4901018142700195 + }, + { + "auxiliary_loss_clip": 0.01138855, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02483916, + "balance_loss_mlp": 1.05032694, + "epoch": 0.2364647527431234, + "flos": 26177622255360.0, + "grad_norm": 1.5491195596348342, + "language_loss": 0.85760093, + "learning_rate": 3.5692494937843537e-06, + "loss": 0.87938541, + "num_input_tokens_seen": 84708825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8828125, + "step": 3933, + "time_per_iteration": 2.5625483989715576 + }, + { + "auxiliary_loss_clip": 0.01146232, + "auxiliary_loss_mlp": 0.01037898, + "balance_loss_clip": 1.02008784, + "balance_loss_mlp": 1.0532943, + "epoch": 0.23652487599579136, + "flos": 22637009721600.0, + "grad_norm": 2.0322099534023685, + "language_loss": 0.8277775, + "learning_rate": 3.5690080090954727e-06, + "loss": 0.84961879, + "num_input_tokens_seen": 84726165, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9296875, + "step": 3934, + "time_per_iteration": 2.512068748474121 + }, + { + "auxiliary_loss_clip": 0.01141394, + "auxiliary_loss_mlp": 0.01037778, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.04977798, + "epoch": 0.23658499924845935, + "flos": 21762261809280.0, + "grad_norm": 1.774494675769988, + "language_loss": 0.7864846, + "learning_rate": 3.5687664649096515e-06, + "loss": 0.80827636, + "num_input_tokens_seen": 84745815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 3935, + "time_per_iteration": 2.4996352195739746 + }, + { + "auxiliary_loss_clip": 0.01138141, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.01913905, + "balance_loss_mlp": 1.04973102, + "epoch": 0.23664512250112732, + "flos": 21798998444160.0, + "grad_norm": 1.7164724890649055, + "language_loss": 0.79656923, + "learning_rate": 3.5685248612360487e-06, + "loss": 0.81830108, + "num_input_tokens_seen": 84765415, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3936, + "time_per_iteration": 2.4868710041046143 + }, + { + "auxiliary_loss_clip": 0.01138439, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.0192436, + "balance_loss_mlp": 1.04798818, + "epoch": 0.23670524575379528, + "flos": 22637871648000.0, + "grad_norm": 1.4334555797897097, + "language_loss": 0.78783411, + "learning_rate": 3.568283198083826e-06, + "loss": 0.80958092, + "num_input_tokens_seen": 84787080, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 3937, + "time_per_iteration": 2.499565362930298 + }, + { + "auxiliary_loss_clip": 0.01136409, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02244997, + "balance_loss_mlp": 1.04970455, + "epoch": 0.23676536900646325, + "flos": 16725000263040.0, + "grad_norm": 2.078138882715826, + "language_loss": 0.85105085, + "learning_rate": 3.568041475462147e-06, + "loss": 0.8727901, + "num_input_tokens_seen": 84805395, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 3938, + "time_per_iteration": 2.449214220046997 + }, + { + "auxiliary_loss_clip": 0.01135246, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.0285933, + "balance_loss_mlp": 1.04824734, + "epoch": 0.23682549225913122, + "flos": 11135611785600.0, + "grad_norm": 2.4851234695326423, + "language_loss": 0.93872499, + "learning_rate": 3.5677996933801785e-06, + "loss": 0.96052349, + "num_input_tokens_seen": 84818090, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3939, + "time_per_iteration": 2.415891647338867 + }, + { + "auxiliary_loss_clip": 0.01140924, + "auxiliary_loss_mlp": 0.01043341, + "balance_loss_clip": 1.02598429, + "balance_loss_mlp": 1.04769599, + "epoch": 0.23688561551179918, + "flos": 22559226819840.0, + "grad_norm": 1.6764835140151866, + "language_loss": 0.8238095, + "learning_rate": 3.567557851847088e-06, + "loss": 0.84565216, + "num_input_tokens_seen": 84837695, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9296875, + "step": 3940, + "time_per_iteration": 2.47468900680542 + }, + { + "auxiliary_loss_clip": 0.01145021, + "auxiliary_loss_mlp": 0.01042824, + "balance_loss_clip": 1.02592003, + "balance_loss_mlp": 1.04990602, + "epoch": 0.23694573876446715, + "flos": 18514895909760.0, + "grad_norm": 2.2107440191497054, + "language_loss": 0.88986713, + "learning_rate": 3.5673159508720464e-06, + "loss": 0.91174555, + "num_input_tokens_seen": 84854630, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.953125, + "step": 3941, + "time_per_iteration": 2.455631971359253 + }, + { + "auxiliary_loss_clip": 0.01136515, + "auxiliary_loss_mlp": 0.01043393, + "balance_loss_clip": 1.02580976, + "balance_loss_mlp": 1.04538155, + "epoch": 0.23700586201713514, + "flos": 15335723980800.0, + "grad_norm": 2.1526885300024072, + "language_loss": 0.84676927, + "learning_rate": 3.5670739904642274e-06, + "loss": 0.86856836, + "num_input_tokens_seen": 84871805, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 3942, + "time_per_iteration": 2.43743634223938 + }, + { + "auxiliary_loss_clip": 0.01140361, + "auxiliary_loss_mlp": 0.01045145, + "balance_loss_clip": 1.02769232, + "balance_loss_mlp": 1.04840159, + "epoch": 0.2370659852698031, + "flos": 23947605262080.0, + "grad_norm": 1.8547641010298248, + "language_loss": 0.80905575, + "learning_rate": 3.5668319706328065e-06, + "loss": 0.83091086, + "num_input_tokens_seen": 84889815, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.921875, + "step": 3943, + "time_per_iteration": 2.5058658123016357 + }, + { + "auxiliary_loss_clip": 0.01143585, + "auxiliary_loss_mlp": 0.01039213, + "balance_loss_clip": 1.02084267, + "balance_loss_mlp": 1.04731488, + "epoch": 0.23712610852247107, + "flos": 15332527670400.0, + "grad_norm": 2.308079684052438, + "language_loss": 0.67493033, + "learning_rate": 3.566589891386959e-06, + "loss": 0.69675827, + "num_input_tokens_seen": 84904380, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.9609375, + "step": 3944, + "time_per_iteration": 2.4276273250579834 + }, + { + "auxiliary_loss_clip": 0.01144217, + "auxiliary_loss_mlp": 0.01038427, + "balance_loss_clip": 1.02116549, + "balance_loss_mlp": 1.05084419, + "epoch": 0.23718623177513903, + "flos": 19682567233920.0, + "grad_norm": 2.061169456768298, + "language_loss": 0.75421506, + "learning_rate": 3.566347752735866e-06, + "loss": 0.77604151, + "num_input_tokens_seen": 84922935, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 3945, + "time_per_iteration": 2.474323272705078 + }, + { + "auxiliary_loss_clip": 0.01137318, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02304149, + "balance_loss_mlp": 1.0469377, + "epoch": 0.237246355027807, + "flos": 24973322037120.0, + "grad_norm": 1.6081639136691026, + "language_loss": 0.63469779, + "learning_rate": 3.5661055546887094e-06, + "loss": 0.65646303, + "num_input_tokens_seen": 84943685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3946, + "time_per_iteration": 2.5087931156158447 + }, + { + "auxiliary_loss_clip": 0.01137558, + "auxiliary_loss_mlp": 0.01038922, + "balance_loss_clip": 1.02186346, + "balance_loss_mlp": 1.04692435, + "epoch": 0.23730647828047496, + "flos": 15377416692480.0, + "grad_norm": 2.27613511663784, + "language_loss": 0.77508283, + "learning_rate": 3.5658632972546734e-06, + "loss": 0.79684764, + "num_input_tokens_seen": 84959505, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 3947, + "time_per_iteration": 2.4716949462890625 + }, + { + "auxiliary_loss_clip": 0.01141281, + "auxiliary_loss_mlp": 0.0104192, + "balance_loss_clip": 1.02496827, + "balance_loss_mlp": 1.05008841, + "epoch": 0.23736660153314296, + "flos": 28150662372480.0, + "grad_norm": 1.6255497375782806, + "language_loss": 0.80575311, + "learning_rate": 3.565620980442944e-06, + "loss": 0.8275851, + "num_input_tokens_seen": 84982130, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 3948, + "time_per_iteration": 2.5750784873962402 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01043821, + "balance_loss_clip": 1.02715611, + "balance_loss_mlp": 1.04736018, + "epoch": 0.23742672478581092, + "flos": 22086570729600.0, + "grad_norm": 2.0638215262656696, + "language_loss": 0.80578661, + "learning_rate": 3.5653786042627107e-06, + "loss": 0.82761467, + "num_input_tokens_seen": 85000640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91796875, + "step": 3949, + "time_per_iteration": 2.512665271759033 + }, + { + "auxiliary_loss_clip": 0.01138607, + "auxiliary_loss_mlp": 0.0104063, + "balance_loss_clip": 1.02382135, + "balance_loss_mlp": 1.04584646, + "epoch": 0.2374868480384789, + "flos": 19537093152000.0, + "grad_norm": 1.8976071400358168, + "language_loss": 0.73124689, + "learning_rate": 3.565136168723163e-06, + "loss": 0.75303924, + "num_input_tokens_seen": 85018970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.92578125, + "step": 3950, + "time_per_iteration": 2.4842302799224854 + }, + { + "auxiliary_loss_clip": 0.01135058, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.01944709, + "balance_loss_mlp": 1.04712903, + "epoch": 0.23754697129114685, + "flos": 19422501788160.0, + "grad_norm": 2.0688047231241247, + "language_loss": 0.73064256, + "learning_rate": 3.564893673833495e-06, + "loss": 0.75233537, + "num_input_tokens_seen": 85035905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8828125, + "step": 3951, + "time_per_iteration": 2.5215439796447754 + }, + { + "auxiliary_loss_clip": 0.01144126, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.0507673, + "epoch": 0.23760709454381482, + "flos": 19501002961920.0, + "grad_norm": 1.7591828710207016, + "language_loss": 0.73658371, + "learning_rate": 3.564651119602903e-06, + "loss": 0.75842535, + "num_input_tokens_seen": 85054560, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.93359375, + "step": 3952, + "time_per_iteration": 2.550182342529297 + }, + { + "auxiliary_loss_clip": 0.0113686, + "auxiliary_loss_mlp": 0.01037761, + "balance_loss_clip": 1.02213275, + "balance_loss_mlp": 1.04537988, + "epoch": 0.23766721779648278, + "flos": 27636600879360.0, + "grad_norm": 1.6791264380286672, + "language_loss": 0.71064484, + "learning_rate": 3.564408506040583e-06, + "loss": 0.73239112, + "num_input_tokens_seen": 85074425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.9140625, + "step": 3953, + "time_per_iteration": 2.530381202697754 + }, + { + "auxiliary_loss_clip": 0.01140701, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.02522552, + "balance_loss_mlp": 1.04806364, + "epoch": 0.23772734104915075, + "flos": 23404348990080.0, + "grad_norm": 1.9696108021357461, + "language_loss": 0.81686246, + "learning_rate": 3.5641658331557356e-06, + "loss": 0.83869451, + "num_input_tokens_seen": 85092865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.92578125, + "step": 3954, + "time_per_iteration": 2.491629123687744 + }, + { + "auxiliary_loss_clip": 0.01141999, + "auxiliary_loss_mlp": 0.01047189, + "balance_loss_clip": 1.02915251, + "balance_loss_mlp": 1.04870319, + "epoch": 0.23778746430181874, + "flos": 15705496540800.0, + "grad_norm": 2.155968963382196, + "language_loss": 0.65756261, + "learning_rate": 3.5639231009575634e-06, + "loss": 0.67945445, + "num_input_tokens_seen": 85110175, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.93359375, + "step": 3955, + "time_per_iteration": 2.4659719467163086 + }, + { + "auxiliary_loss_clip": 0.01138242, + "auxiliary_loss_mlp": 0.01053219, + "balance_loss_clip": 1.0362916, + "balance_loss_mlp": 1.04739583, + "epoch": 0.2378475875544867, + "flos": 19426452284160.0, + "grad_norm": 1.3846492045019327, + "language_loss": 0.83788121, + "learning_rate": 3.5636803094552704e-06, + "loss": 0.85979581, + "num_input_tokens_seen": 85129925, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.91015625, + "step": 3956, + "time_per_iteration": 2.48734712600708 + }, + { + "auxiliary_loss_clip": 0.011338, + "auxiliary_loss_mlp": 0.01040785, + "balance_loss_clip": 1.02471578, + "balance_loss_mlp": 1.04647636, + "epoch": 0.23790771080715467, + "flos": 22268565964800.0, + "grad_norm": 2.1805686912335656, + "language_loss": 0.85228634, + "learning_rate": 3.5634374586580635e-06, + "loss": 0.8740322, + "num_input_tokens_seen": 85147755, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 3957, + "time_per_iteration": 2.50199294090271 + }, + { + "auxiliary_loss_clip": 0.01139099, + "auxiliary_loss_mlp": 0.01041833, + "balance_loss_clip": 1.02686596, + "balance_loss_mlp": 1.04807806, + "epoch": 0.23796783405982264, + "flos": 20047311889920.0, + "grad_norm": 2.0218180107915757, + "language_loss": 0.70133704, + "learning_rate": 3.563194548575151e-06, + "loss": 0.72314632, + "num_input_tokens_seen": 85165270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.91015625, + "step": 3958, + "time_per_iteration": 2.4798173904418945 + }, + { + "auxiliary_loss_clip": 0.01136893, + "auxiliary_loss_mlp": 0.01043034, + "balance_loss_clip": 1.02530742, + "balance_loss_mlp": 1.04581285, + "epoch": 0.2380279573124906, + "flos": 14245943299200.0, + "grad_norm": 3.373562251556634, + "language_loss": 0.65834582, + "learning_rate": 3.562951579215745e-06, + "loss": 0.68014508, + "num_input_tokens_seen": 85181555, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 3959, + "time_per_iteration": 2.4558637142181396 + }, + { + "auxiliary_loss_clip": 0.01138452, + "auxiliary_loss_mlp": 0.01041764, + "balance_loss_clip": 1.02565885, + "balance_loss_mlp": 1.04832602, + "epoch": 0.23808808056515857, + "flos": 21179180332800.0, + "grad_norm": 1.7230243338870097, + "language_loss": 0.72128749, + "learning_rate": 3.5627085505890586e-06, + "loss": 0.74308968, + "num_input_tokens_seen": 85199455, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.90234375, + "step": 3960, + "time_per_iteration": 2.4831748008728027 + }, + { + "auxiliary_loss_clip": 0.01139565, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.0249052, + "balance_loss_mlp": 1.04867244, + "epoch": 0.23814820381782653, + "flos": 22528308188160.0, + "grad_norm": 1.8711627571775973, + "language_loss": 0.74181205, + "learning_rate": 3.562465462704307e-06, + "loss": 0.7636202, + "num_input_tokens_seen": 85219170, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.91015625, + "step": 3961, + "time_per_iteration": 2.5167927742004395 + }, + { + "auxiliary_loss_clip": 0.01138898, + "auxiliary_loss_mlp": 0.0105126, + "balance_loss_clip": 1.03318763, + "balance_loss_mlp": 1.04605162, + "epoch": 0.23820832707049452, + "flos": 22304332932480.0, + "grad_norm": 2.643011810367893, + "language_loss": 0.66067994, + "learning_rate": 3.5622223155707085e-06, + "loss": 0.68258154, + "num_input_tokens_seen": 85238480, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 3962, + "time_per_iteration": 2.4900338649749756 + }, + { + "auxiliary_loss_clip": 0.01138484, + "auxiliary_loss_mlp": 0.01050468, + "balance_loss_clip": 1.03387976, + "balance_loss_mlp": 1.04738379, + "epoch": 0.2382684503231625, + "flos": 24864225454080.0, + "grad_norm": 1.7740384877146562, + "language_loss": 0.74581182, + "learning_rate": 3.561979109197483e-06, + "loss": 0.76770139, + "num_input_tokens_seen": 85259180, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 3963, + "time_per_iteration": 2.5409018993377686 + }, + { + "auxiliary_loss_clip": 0.01142407, + "auxiliary_loss_mlp": 0.01046013, + "balance_loss_clip": 1.02899039, + "balance_loss_mlp": 1.0498383, + "epoch": 0.23832857357583045, + "flos": 21871609787520.0, + "grad_norm": 2.0190521185084753, + "language_loss": 0.76898873, + "learning_rate": 3.5617358435938538e-06, + "loss": 0.79087293, + "num_input_tokens_seen": 85278550, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.92578125, + "step": 3964, + "time_per_iteration": 2.492861270904541 + }, + { + "auxiliary_loss_clip": 0.01137102, + "auxiliary_loss_mlp": 0.01044921, + "balance_loss_clip": 1.02911341, + "balance_loss_mlp": 1.04792333, + "epoch": 0.23838869682849842, + "flos": 21288061434240.0, + "grad_norm": 2.0459212281672956, + "language_loss": 0.71593058, + "learning_rate": 3.561492518769045e-06, + "loss": 0.73775077, + "num_input_tokens_seen": 85297345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 3965, + "time_per_iteration": 2.5120911598205566 + }, + { + "auxiliary_loss_clip": 0.01134569, + "auxiliary_loss_mlp": 0.01047354, + "balance_loss_clip": 1.03158259, + "balance_loss_mlp": 1.04674065, + "epoch": 0.23844882008116638, + "flos": 16180594755840.0, + "grad_norm": 1.8902557347099018, + "language_loss": 0.78008091, + "learning_rate": 3.561249134732282e-06, + "loss": 0.80190015, + "num_input_tokens_seen": 85315105, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87890625, + "step": 3966, + "time_per_iteration": 2.4576594829559326 + }, + { + "auxiliary_loss_clip": 0.01135801, + "auxiliary_loss_mlp": 0.01042292, + "balance_loss_clip": 1.02656794, + "balance_loss_mlp": 1.04652119, + "epoch": 0.23850894333383435, + "flos": 21069724613760.0, + "grad_norm": 2.8460709531404, + "language_loss": 0.68860286, + "learning_rate": 3.561005691492797e-06, + "loss": 0.71038377, + "num_input_tokens_seen": 85334735, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.89453125, + "step": 3967, + "time_per_iteration": 2.484840154647827 + }, + { + "auxiliary_loss_clip": 0.01137019, + "auxiliary_loss_mlp": 0.01053581, + "balance_loss_clip": 1.03739274, + "balance_loss_mlp": 1.04645443, + "epoch": 0.23856906658650234, + "flos": 17201606849280.0, + "grad_norm": 2.11266161128335, + "language_loss": 0.67849773, + "learning_rate": 3.5607621890598185e-06, + "loss": 0.70040375, + "num_input_tokens_seen": 85352875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 3968, + "time_per_iteration": 2.441445827484131 + }, + { + "auxiliary_loss_clip": 0.01134651, + "auxiliary_loss_mlp": 0.01038945, + "balance_loss_clip": 1.02318573, + "balance_loss_mlp": 1.0451827, + "epoch": 0.2386291898391703, + "flos": 29494223619840.0, + "grad_norm": 1.8948052650888014, + "language_loss": 0.76742399, + "learning_rate": 3.5605186274425823e-06, + "loss": 0.78916001, + "num_input_tokens_seen": 85372205, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.89453125, + "step": 3969, + "time_per_iteration": 5.413191318511963 + }, + { + "auxiliary_loss_clip": 0.01134724, + "auxiliary_loss_mlp": 0.01040503, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.04734492, + "epoch": 0.23868931309183827, + "flos": 21142443697920.0, + "grad_norm": 2.7243772241637263, + "language_loss": 0.76300085, + "learning_rate": 3.5602750066503225e-06, + "loss": 0.78475308, + "num_input_tokens_seen": 85389705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.875, + "step": 3970, + "time_per_iteration": 2.4792258739471436 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01042951, + "balance_loss_clip": 1.02545094, + "balance_loss_mlp": 1.04645324, + "epoch": 0.23874943634450624, + "flos": 25659394784640.0, + "grad_norm": 3.3207921386663584, + "language_loss": 0.85399735, + "learning_rate": 3.5600313266922793e-06, + "loss": 0.87580258, + "num_input_tokens_seen": 85407855, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 3971, + "time_per_iteration": 2.500506639480591 + }, + { + "auxiliary_loss_clip": 0.01055799, + "auxiliary_loss_mlp": 0.01017218, + "balance_loss_clip": 1.01547742, + "balance_loss_mlp": 1.02590835, + "epoch": 0.2388095595971742, + "flos": 58986618624000.0, + "grad_norm": 0.7461637295582213, + "language_loss": 0.62814003, + "learning_rate": 3.5597875875776915e-06, + "loss": 0.64887029, + "num_input_tokens_seen": 85470885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.29882812, + "step": 3972, + "time_per_iteration": 3.173640012741089 + }, + { + "auxiliary_loss_clip": 0.0113938, + "auxiliary_loss_mlp": 0.01037064, + "balance_loss_clip": 1.02119696, + "balance_loss_mlp": 1.04922092, + "epoch": 0.23886968284984217, + "flos": 16800341040000.0, + "grad_norm": 1.9456864585596687, + "language_loss": 0.8170895, + "learning_rate": 3.5595437893158013e-06, + "loss": 0.8388539, + "num_input_tokens_seen": 85488460, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90234375, + "step": 3973, + "time_per_iteration": 2.4529452323913574 + }, + { + "auxiliary_loss_clip": 0.01137225, + "auxiliary_loss_mlp": 0.01044983, + "balance_loss_clip": 1.02849591, + "balance_loss_mlp": 1.04869485, + "epoch": 0.23892980610251013, + "flos": 22382654538240.0, + "grad_norm": 1.6994626560625323, + "language_loss": 0.79299271, + "learning_rate": 3.5592999319158546e-06, + "loss": 0.81481481, + "num_input_tokens_seen": 85508590, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 3974, + "time_per_iteration": 2.5395772457122803 + }, + { + "auxiliary_loss_clip": 0.01139215, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02155876, + "balance_loss_mlp": 1.04858148, + "epoch": 0.23898992935517813, + "flos": 12823198519680.0, + "grad_norm": 1.8925619228877844, + "language_loss": 0.84428573, + "learning_rate": 3.5590560153870984e-06, + "loss": 0.86606121, + "num_input_tokens_seen": 85525970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 3975, + "time_per_iteration": 2.430361032485962 + }, + { + "auxiliary_loss_clip": 0.01135199, + "auxiliary_loss_mlp": 0.01037689, + "balance_loss_clip": 1.02215612, + "balance_loss_mlp": 1.0471369, + "epoch": 0.2390500526078461, + "flos": 22345666508160.0, + "grad_norm": 2.06825719132721, + "language_loss": 0.8375293, + "learning_rate": 3.5588120397387816e-06, + "loss": 0.85925817, + "num_input_tokens_seen": 85543700, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87890625, + "step": 3976, + "time_per_iteration": 2.480534791946411 + }, + { + "auxiliary_loss_clip": 0.01132825, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01798213, + "balance_loss_mlp": 1.04606938, + "epoch": 0.23911017586051406, + "flos": 22635142214400.0, + "grad_norm": 1.747752931490835, + "language_loss": 0.74532628, + "learning_rate": 3.5585680049801566e-06, + "loss": 0.76697731, + "num_input_tokens_seen": 85562765, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8671875, + "step": 3977, + "time_per_iteration": 2.4641239643096924 + }, + { + "auxiliary_loss_clip": 0.01138905, + "auxiliary_loss_mlp": 0.01045175, + "balance_loss_clip": 1.02818775, + "balance_loss_mlp": 1.04930067, + "epoch": 0.23917029911318202, + "flos": 23653281219840.0, + "grad_norm": 1.6638092474338306, + "language_loss": 0.72395146, + "learning_rate": 3.5583239111204764e-06, + "loss": 0.74579227, + "num_input_tokens_seen": 85581755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 3978, + "time_per_iteration": 2.5007903575897217 + }, + { + "auxiliary_loss_clip": 0.01143288, + "auxiliary_loss_mlp": 0.01042156, + "balance_loss_clip": 1.0256691, + "balance_loss_mlp": 1.05204654, + "epoch": 0.23923042236585, + "flos": 22783597125120.0, + "grad_norm": 2.0169903221822683, + "language_loss": 0.78654587, + "learning_rate": 3.558079758168997e-06, + "loss": 0.80840027, + "num_input_tokens_seen": 85599455, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 3979, + "time_per_iteration": 2.5006349086761475 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01044452, + "balance_loss_clip": 1.02769148, + "balance_loss_mlp": 1.04762173, + "epoch": 0.23929054561851795, + "flos": 28147717457280.0, + "grad_norm": 1.6987462202935262, + "language_loss": 0.81945407, + "learning_rate": 3.557835546134977e-06, + "loss": 0.84125668, + "num_input_tokens_seen": 85619970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 3980, + "time_per_iteration": 2.5287020206451416 + }, + { + "auxiliary_loss_clip": 0.01136489, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.01974702, + "balance_loss_mlp": 1.04967999, + "epoch": 0.23935066887118592, + "flos": 21686525982720.0, + "grad_norm": 1.749461413213386, + "language_loss": 0.8401112, + "learning_rate": 3.5575912750276775e-06, + "loss": 0.86183953, + "num_input_tokens_seen": 85638850, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 3981, + "time_per_iteration": 2.466660261154175 + }, + { + "auxiliary_loss_clip": 0.01141626, + "auxiliary_loss_mlp": 0.01044752, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04951072, + "epoch": 0.2394107921238539, + "flos": 32122274198400.0, + "grad_norm": 3.6241006318049864, + "language_loss": 0.76872683, + "learning_rate": 3.5573469448563607e-06, + "loss": 0.79059052, + "num_input_tokens_seen": 85656285, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 3982, + "time_per_iteration": 2.558145046234131 + }, + { + "auxiliary_loss_clip": 0.01135351, + "auxiliary_loss_mlp": 0.01043953, + "balance_loss_clip": 1.02811027, + "balance_loss_mlp": 1.04844236, + "epoch": 0.23947091537652188, + "flos": 17019180650880.0, + "grad_norm": 6.059829142106342, + "language_loss": 0.77878481, + "learning_rate": 3.5571025556302915e-06, + "loss": 0.80057788, + "num_input_tokens_seen": 85673020, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 3983, + "time_per_iteration": 2.4443132877349854 + }, + { + "auxiliary_loss_clip": 0.01136897, + "auxiliary_loss_mlp": 0.0104106, + "balance_loss_clip": 1.02446592, + "balance_loss_mlp": 1.04759789, + "epoch": 0.23953103862918984, + "flos": 20593584904320.0, + "grad_norm": 1.9981470653963032, + "language_loss": 0.73163629, + "learning_rate": 3.556858107358737e-06, + "loss": 0.75341582, + "num_input_tokens_seen": 85692565, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 3984, + "time_per_iteration": 2.491344690322876 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01045273, + "balance_loss_clip": 1.02860713, + "balance_loss_mlp": 1.04674625, + "epoch": 0.2395911618818578, + "flos": 20704405340160.0, + "grad_norm": 2.064924146489818, + "language_loss": 0.79049474, + "learning_rate": 3.5566136000509674e-06, + "loss": 0.81232572, + "num_input_tokens_seen": 85709730, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3985, + "time_per_iteration": 2.4587738513946533 + }, + { + "auxiliary_loss_clip": 0.01139616, + "auxiliary_loss_mlp": 0.01041503, + "balance_loss_clip": 1.02484989, + "balance_loss_mlp": 1.04980683, + "epoch": 0.23965128513452577, + "flos": 27053519402880.0, + "grad_norm": 2.0182764415160563, + "language_loss": 0.73312742, + "learning_rate": 3.556369033716254e-06, + "loss": 0.7549386, + "num_input_tokens_seen": 85730045, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 3986, + "time_per_iteration": 2.5608811378479004 + }, + { + "auxiliary_loss_clip": 0.0114189, + "auxiliary_loss_mlp": 0.01051013, + "balance_loss_clip": 1.03495562, + "balance_loss_mlp": 1.04923773, + "epoch": 0.23971140838719374, + "flos": 23144319457920.0, + "grad_norm": 2.2624046500679333, + "language_loss": 0.87836051, + "learning_rate": 3.556124408363871e-06, + "loss": 0.90028954, + "num_input_tokens_seen": 85747590, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.92578125, + "step": 3987, + "time_per_iteration": 2.461778402328491 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01036037, + "balance_loss_clip": 1.02161288, + "balance_loss_mlp": 1.04831004, + "epoch": 0.23977153163986173, + "flos": 18034554309120.0, + "grad_norm": 2.3750633167266306, + "language_loss": 0.8308624, + "learning_rate": 3.5558797240030945e-06, + "loss": 0.85254467, + "num_input_tokens_seen": 85763460, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 3988, + "time_per_iteration": 2.4527788162231445 + }, + { + "auxiliary_loss_clip": 0.01134459, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.02336502, + "balance_loss_mlp": 1.04686844, + "epoch": 0.2398316548925297, + "flos": 18113378705280.0, + "grad_norm": 1.649806875732991, + "language_loss": 0.85145879, + "learning_rate": 3.5556349806432035e-06, + "loss": 0.87320346, + "num_input_tokens_seen": 85782050, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 3989, + "time_per_iteration": 2.43949031829834 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02249837, + "balance_loss_mlp": 1.04763699, + "epoch": 0.23989177814519766, + "flos": 12567730014720.0, + "grad_norm": 2.0784071273800944, + "language_loss": 0.84493041, + "learning_rate": 3.555390178293477e-06, + "loss": 0.86665809, + "num_input_tokens_seen": 85797400, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 3990, + "time_per_iteration": 2.4476051330566406 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01040211, + "balance_loss_clip": 1.02507186, + "balance_loss_mlp": 1.0463922, + "epoch": 0.23995190139786562, + "flos": 25264593423360.0, + "grad_norm": 3.585202907729512, + "language_loss": 0.75312221, + "learning_rate": 3.5551453169631994e-06, + "loss": 0.77485824, + "num_input_tokens_seen": 85818995, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 3991, + "time_per_iteration": 2.502324104309082 + }, + { + "auxiliary_loss_clip": 0.01050073, + "auxiliary_loss_mlp": 0.01009423, + "balance_loss_clip": 1.00774217, + "balance_loss_mlp": 1.02049088, + "epoch": 0.2400120246505336, + "flos": 61960379650560.0, + "grad_norm": 0.8894590829003932, + "language_loss": 0.63734841, + "learning_rate": 3.554900396661656e-06, + "loss": 0.65794337, + "num_input_tokens_seen": 85876695, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.296875, + "step": 3992, + "time_per_iteration": 3.0017786026000977 + }, + { + "auxiliary_loss_clip": 0.01050397, + "auxiliary_loss_mlp": 0.01010168, + "balance_loss_clip": 1.00857067, + "balance_loss_mlp": 1.02071452, + "epoch": 0.24007214790320155, + "flos": 66708560540160.0, + "grad_norm": 0.7530514643625366, + "language_loss": 0.62963343, + "learning_rate": 3.5546554173981334e-06, + "loss": 0.65023899, + "num_input_tokens_seen": 85940990, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.296875, + "step": 3993, + "time_per_iteration": 3.176184892654419 + }, + { + "auxiliary_loss_clip": 0.01140668, + "auxiliary_loss_mlp": 0.01047015, + "balance_loss_clip": 1.03085065, + "balance_loss_mlp": 1.05099177, + "epoch": 0.24013227115586952, + "flos": 25809070757760.0, + "grad_norm": 1.6383486345725178, + "language_loss": 0.76938868, + "learning_rate": 3.5544103791819218e-06, + "loss": 0.79126549, + "num_input_tokens_seen": 85961165, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 3994, + "time_per_iteration": 2.4940826892852783 + }, + { + "auxiliary_loss_clip": 0.01135853, + "auxiliary_loss_mlp": 0.01047966, + "balance_loss_clip": 1.0305258, + "balance_loss_mlp": 1.04680216, + "epoch": 0.2401923944085375, + "flos": 25557480921600.0, + "grad_norm": 1.7751147523393542, + "language_loss": 0.78457522, + "learning_rate": 3.5541652820223124e-06, + "loss": 0.80641341, + "num_input_tokens_seen": 85982710, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.890625, + "step": 3995, + "time_per_iteration": 2.5075032711029053 + }, + { + "auxiliary_loss_clip": 0.01047716, + "auxiliary_loss_mlp": 0.01003894, + "balance_loss_clip": 1.00232053, + "balance_loss_mlp": 1.01837659, + "epoch": 0.24025251766120548, + "flos": 54941138478720.0, + "grad_norm": 0.8913570860108078, + "language_loss": 0.63479292, + "learning_rate": 3.5539201259286006e-06, + "loss": 0.65530908, + "num_input_tokens_seen": 86046935, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.29296875, + "step": 3996, + "time_per_iteration": 3.1365764141082764 + }, + { + "auxiliary_loss_clip": 0.01137569, + "auxiliary_loss_mlp": 0.01045722, + "balance_loss_clip": 1.02916384, + "balance_loss_mlp": 1.04678392, + "epoch": 0.24031264091387344, + "flos": 20631075724800.0, + "grad_norm": 2.906997418482602, + "language_loss": 0.7009505, + "learning_rate": 3.5536749109100808e-06, + "loss": 0.72278345, + "num_input_tokens_seen": 86064355, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90625, + "step": 3997, + "time_per_iteration": 2.464714765548706 + }, + { + "auxiliary_loss_clip": 0.01134848, + "auxiliary_loss_mlp": 0.01042521, + "balance_loss_clip": 1.02654672, + "balance_loss_mlp": 1.04642928, + "epoch": 0.2403727641665414, + "flos": 20886256920960.0, + "grad_norm": 1.9831176119326495, + "language_loss": 0.87292743, + "learning_rate": 3.5534296369760535e-06, + "loss": 0.89470112, + "num_input_tokens_seen": 86081340, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 3998, + "time_per_iteration": 2.4639480113983154 + }, + { + "auxiliary_loss_clip": 0.01134933, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02306032, + "balance_loss_mlp": 1.04208946, + "epoch": 0.24043288741920937, + "flos": 22820046451200.0, + "grad_norm": 1.9745565965944727, + "language_loss": 0.75798607, + "learning_rate": 3.5531843041358183e-06, + "loss": 0.77972972, + "num_input_tokens_seen": 86102260, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9296875, + "step": 3999, + "time_per_iteration": 2.4753127098083496 + }, + { + "auxiliary_loss_clip": 0.01132817, + "auxiliary_loss_mlp": 0.01038028, + "balance_loss_clip": 1.02317488, + "balance_loss_mlp": 1.04545271, + "epoch": 0.24049301067187734, + "flos": 27959652823680.0, + "grad_norm": 1.9306579449884984, + "language_loss": 0.72642016, + "learning_rate": 3.552938912398679e-06, + "loss": 0.74812865, + "num_input_tokens_seen": 86123400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.875, + "step": 4000, + "time_per_iteration": 2.5172412395477295 + }, + { + "auxiliary_loss_clip": 0.01140243, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02025795, + "balance_loss_mlp": 1.04728866, + "epoch": 0.24055313392454533, + "flos": 27451409333760.0, + "grad_norm": 2.4587541869300824, + "language_loss": 0.65991902, + "learning_rate": 3.5526934617739397e-06, + "loss": 0.68169051, + "num_input_tokens_seen": 86144060, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4001, + "time_per_iteration": 2.511198043823242 + }, + { + "auxiliary_loss_clip": 0.01131233, + "auxiliary_loss_mlp": 0.01040424, + "balance_loss_clip": 1.02330589, + "balance_loss_mlp": 1.0427444, + "epoch": 0.2406132571772133, + "flos": 25556618995200.0, + "grad_norm": 2.6796652593661903, + "language_loss": 0.82567388, + "learning_rate": 3.5524479522709095e-06, + "loss": 0.84739041, + "num_input_tokens_seen": 86163005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4002, + "time_per_iteration": 2.5147531032562256 + }, + { + "auxiliary_loss_clip": 0.01136125, + "auxiliary_loss_mlp": 0.01038837, + "balance_loss_clip": 1.02382851, + "balance_loss_mlp": 1.04682446, + "epoch": 0.24067338042988126, + "flos": 24791398629120.0, + "grad_norm": 1.8902513751119636, + "language_loss": 0.82875729, + "learning_rate": 3.552202383898897e-06, + "loss": 0.8505069, + "num_input_tokens_seen": 86182580, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4003, + "time_per_iteration": 2.508004665374756 + }, + { + "auxiliary_loss_clip": 0.01134817, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.0214386, + "balance_loss_mlp": 1.04608846, + "epoch": 0.24073350368254923, + "flos": 21177923356800.0, + "grad_norm": 2.0497424292602835, + "language_loss": 0.87504768, + "learning_rate": 3.551956756667215e-06, + "loss": 0.89677334, + "num_input_tokens_seen": 86200665, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4004, + "time_per_iteration": 2.4581985473632812 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.0104917, + "balance_loss_clip": 1.03356552, + "balance_loss_mlp": 1.04228568, + "epoch": 0.2407936269352172, + "flos": 22494300986880.0, + "grad_norm": 1.9722136456468877, + "language_loss": 0.77630293, + "learning_rate": 3.551711070585177e-06, + "loss": 0.79812533, + "num_input_tokens_seen": 86221640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4005, + "time_per_iteration": 2.556365728378296 + }, + { + "auxiliary_loss_clip": 0.01130485, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02141535, + "balance_loss_mlp": 1.04398429, + "epoch": 0.24085375018788516, + "flos": 18551129754240.0, + "grad_norm": 1.7295620858093623, + "language_loss": 0.78973985, + "learning_rate": 3.5514653256620995e-06, + "loss": 0.81141412, + "num_input_tokens_seen": 86240795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4006, + "time_per_iteration": 2.460961103439331 + }, + { + "auxiliary_loss_clip": 0.0113781, + "auxiliary_loss_mlp": 0.01038185, + "balance_loss_clip": 1.02072108, + "balance_loss_mlp": 1.04375279, + "epoch": 0.24091387344055312, + "flos": 24170539023360.0, + "grad_norm": 2.2017624810959346, + "language_loss": 0.71201313, + "learning_rate": 3.551219521907302e-06, + "loss": 0.73377299, + "num_input_tokens_seen": 86262000, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.94140625, + "step": 4007, + "time_per_iteration": 2.5169517993927 + }, + { + "auxiliary_loss_clip": 0.01131131, + "auxiliary_loss_mlp": 0.01039619, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.04453456, + "epoch": 0.24097399669322112, + "flos": 11036319615360.0, + "grad_norm": 1.805972702734942, + "language_loss": 0.75857127, + "learning_rate": 3.5509736593301042e-06, + "loss": 0.7802788, + "num_input_tokens_seen": 86279680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 4008, + "time_per_iteration": 2.4489922523498535 + }, + { + "auxiliary_loss_clip": 0.01131483, + "auxiliary_loss_mlp": 0.01034828, + "balance_loss_clip": 1.01940203, + "balance_loss_mlp": 1.04296207, + "epoch": 0.24103411994588908, + "flos": 17165085696000.0, + "grad_norm": 2.356516377050019, + "language_loss": 0.73922294, + "learning_rate": 3.5507277379398295e-06, + "loss": 0.76088601, + "num_input_tokens_seen": 86297180, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4009, + "time_per_iteration": 2.4701087474823 + }, + { + "auxiliary_loss_clip": 0.01133056, + "auxiliary_loss_mlp": 0.0104248, + "balance_loss_clip": 1.02664948, + "balance_loss_mlp": 1.04632092, + "epoch": 0.24109424319855705, + "flos": 20667956014080.0, + "grad_norm": 1.636895821506206, + "language_loss": 0.79938453, + "learning_rate": 3.550481757745804e-06, + "loss": 0.82113993, + "num_input_tokens_seen": 86317660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4010, + "time_per_iteration": 3.9670608043670654 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01047202, + "balance_loss_clip": 1.02923679, + "balance_loss_mlp": 1.04108143, + "epoch": 0.241154366451225, + "flos": 28181796485760.0, + "grad_norm": 2.295886994366384, + "language_loss": 0.70799017, + "learning_rate": 3.5502357187573555e-06, + "loss": 0.72977829, + "num_input_tokens_seen": 86338325, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4011, + "time_per_iteration": 3.9544472694396973 + }, + { + "auxiliary_loss_clip": 0.01131445, + "auxiliary_loss_mlp": 0.01039733, + "balance_loss_clip": 1.02429593, + "balance_loss_mlp": 1.04258561, + "epoch": 0.24121448970389298, + "flos": 21689722293120.0, + "grad_norm": 1.6166610897431488, + "language_loss": 0.69062299, + "learning_rate": 3.5499896209838118e-06, + "loss": 0.71233475, + "num_input_tokens_seen": 86357615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4012, + "time_per_iteration": 2.501347303390503 + }, + { + "auxiliary_loss_clip": 0.01133874, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02145839, + "balance_loss_mlp": 1.04454589, + "epoch": 0.24127461295656094, + "flos": 39676191269760.0, + "grad_norm": 2.0861437601678303, + "language_loss": 0.73424822, + "learning_rate": 3.5497434644345073e-06, + "loss": 0.75598073, + "num_input_tokens_seen": 86380355, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4013, + "time_per_iteration": 2.6360883712768555 + }, + { + "auxiliary_loss_clip": 0.01133872, + "auxiliary_loss_mlp": 0.01036616, + "balance_loss_clip": 1.02110672, + "balance_loss_mlp": 1.04450822, + "epoch": 0.2413347362092289, + "flos": 19135863256320.0, + "grad_norm": 1.8416541794010313, + "language_loss": 0.88554955, + "learning_rate": 3.5494972491187753e-06, + "loss": 0.9072544, + "num_input_tokens_seen": 86399125, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4014, + "time_per_iteration": 2.4663264751434326 + }, + { + "auxiliary_loss_clip": 0.01137985, + "auxiliary_loss_mlp": 0.01043677, + "balance_loss_clip": 1.02643979, + "balance_loss_mlp": 1.04453659, + "epoch": 0.2413948594618969, + "flos": 26939430829440.0, + "grad_norm": 2.755357499792604, + "language_loss": 0.94270647, + "learning_rate": 3.549250975045952e-06, + "loss": 0.96452308, + "num_input_tokens_seen": 86418625, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.93359375, + "step": 4015, + "time_per_iteration": 2.470952033996582 + }, + { + "auxiliary_loss_clip": 0.01133849, + "auxiliary_loss_mlp": 0.01038159, + "balance_loss_clip": 1.02174377, + "balance_loss_mlp": 1.04334664, + "epoch": 0.24145498271456486, + "flos": 25228108183680.0, + "grad_norm": 1.8402084517778015, + "language_loss": 0.82513833, + "learning_rate": 3.5490046422253768e-06, + "loss": 0.84685838, + "num_input_tokens_seen": 86438375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4016, + "time_per_iteration": 2.4922966957092285 + }, + { + "auxiliary_loss_clip": 0.01127395, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02423143, + "balance_loss_mlp": 1.04197156, + "epoch": 0.24151510596723283, + "flos": 40661759617920.0, + "grad_norm": 3.4212830828584386, + "language_loss": 0.69553781, + "learning_rate": 3.54875825066639e-06, + "loss": 0.71721268, + "num_input_tokens_seen": 86463230, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4017, + "time_per_iteration": 2.596977710723877 + }, + { + "auxiliary_loss_clip": 0.01135423, + "auxiliary_loss_mlp": 0.01046549, + "balance_loss_clip": 1.02959788, + "balance_loss_mlp": 1.04421043, + "epoch": 0.2415752292199008, + "flos": 18146667634560.0, + "grad_norm": 2.0038503347112084, + "language_loss": 0.85114455, + "learning_rate": 3.5485118003783353e-06, + "loss": 0.87296432, + "num_input_tokens_seen": 86481230, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.91015625, + "step": 4018, + "time_per_iteration": 2.440749406814575 + }, + { + "auxiliary_loss_clip": 0.01046553, + "auxiliary_loss_mlp": 0.01012788, + "balance_loss_clip": 1.0109762, + "balance_loss_mlp": 1.01676679, + "epoch": 0.24163535247256876, + "flos": 67288409792640.0, + "grad_norm": 0.8182663934779763, + "language_loss": 0.60620981, + "learning_rate": 3.548265291370558e-06, + "loss": 0.62680322, + "num_input_tokens_seen": 86541260, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.296875, + "step": 4019, + "time_per_iteration": 3.112665891647339 + }, + { + "auxiliary_loss_clip": 0.01134098, + "auxiliary_loss_mlp": 0.01038378, + "balance_loss_clip": 1.02299976, + "balance_loss_mlp": 1.04433608, + "epoch": 0.24169547572523672, + "flos": 24929941386240.0, + "grad_norm": 1.880182475838635, + "language_loss": 0.73690915, + "learning_rate": 3.5480187236524055e-06, + "loss": 0.75863391, + "num_input_tokens_seen": 86559580, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4020, + "time_per_iteration": 2.5049281120300293 + }, + { + "auxiliary_loss_clip": 0.01134711, + "auxiliary_loss_mlp": 0.01037599, + "balance_loss_clip": 1.02199471, + "balance_loss_mlp": 1.04660118, + "epoch": 0.24175559897790472, + "flos": 18728312567040.0, + "grad_norm": 1.9671591580269927, + "language_loss": 0.82012737, + "learning_rate": 3.5477720972332285e-06, + "loss": 0.84185052, + "num_input_tokens_seen": 86577560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4021, + "time_per_iteration": 2.464092493057251 + }, + { + "auxiliary_loss_clip": 0.01137652, + "auxiliary_loss_mlp": 0.01048543, + "balance_loss_clip": 1.03036344, + "balance_loss_mlp": 1.04551053, + "epoch": 0.24181572223057268, + "flos": 23039281111680.0, + "grad_norm": 1.9434993168468309, + "language_loss": 0.76464498, + "learning_rate": 3.547525412122378e-06, + "loss": 0.78650689, + "num_input_tokens_seen": 86595350, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.921875, + "step": 4022, + "time_per_iteration": 2.4939990043640137 + }, + { + "auxiliary_loss_clip": 0.01140564, + "auxiliary_loss_mlp": 0.01042084, + "balance_loss_clip": 1.0248704, + "balance_loss_mlp": 1.04610109, + "epoch": 0.24187584548324065, + "flos": 20376145923840.0, + "grad_norm": 1.893594506248005, + "language_loss": 0.75172901, + "learning_rate": 3.5472786683292083e-06, + "loss": 0.77355558, + "num_input_tokens_seen": 86614805, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9453125, + "step": 4023, + "time_per_iteration": 2.442469358444214 + }, + { + "auxiliary_loss_clip": 0.0113425, + "auxiliary_loss_mlp": 0.010453, + "balance_loss_clip": 1.0288136, + "balance_loss_mlp": 1.04636168, + "epoch": 0.2419359687359086, + "flos": 21397517153280.0, + "grad_norm": 1.7406117596406352, + "language_loss": 0.81464303, + "learning_rate": 3.5470318658630766e-06, + "loss": 0.83643848, + "num_input_tokens_seen": 86633700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4024, + "time_per_iteration": 2.45035719871521 + }, + { + "auxiliary_loss_clip": 0.01134068, + "auxiliary_loss_mlp": 0.01045811, + "balance_loss_clip": 1.02951503, + "balance_loss_mlp": 1.0462923, + "epoch": 0.24199609198857658, + "flos": 18369385914240.0, + "grad_norm": 1.8550338864746303, + "language_loss": 0.85851878, + "learning_rate": 3.5467850047333424e-06, + "loss": 0.88031757, + "num_input_tokens_seen": 86650905, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4025, + "time_per_iteration": 2.4191699028015137 + }, + { + "auxiliary_loss_clip": 0.01136643, + "auxiliary_loss_mlp": 0.01048637, + "balance_loss_clip": 1.03154194, + "balance_loss_mlp": 1.04397535, + "epoch": 0.24205621524124454, + "flos": 19463871277440.0, + "grad_norm": 1.9498897834730646, + "language_loss": 0.71243072, + "learning_rate": 3.546538084949365e-06, + "loss": 0.73428357, + "num_input_tokens_seen": 86669185, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9296875, + "step": 4026, + "time_per_iteration": 2.476792812347412 + }, + { + "auxiliary_loss_clip": 0.01132732, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.0259757, + "balance_loss_mlp": 1.04589748, + "epoch": 0.2421163384939125, + "flos": 14976330451200.0, + "grad_norm": 1.8853181761927913, + "language_loss": 0.64215046, + "learning_rate": 3.546291106520509e-06, + "loss": 0.66389644, + "num_input_tokens_seen": 86686805, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4027, + "time_per_iteration": 2.443652868270874 + }, + { + "auxiliary_loss_clip": 0.01136833, + "auxiliary_loss_mlp": 0.01037586, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.04601741, + "epoch": 0.2421764617465805, + "flos": 18662057930880.0, + "grad_norm": 2.5479611354975007, + "language_loss": 0.70294374, + "learning_rate": 3.5460440694561388e-06, + "loss": 0.72468793, + "num_input_tokens_seen": 86705520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.91015625, + "step": 4028, + "time_per_iteration": 2.48252534866333 + }, + { + "auxiliary_loss_clip": 0.01044866, + "auxiliary_loss_mlp": 0.01007457, + "balance_loss_clip": 1.00585961, + "balance_loss_mlp": 1.01464319, + "epoch": 0.24223658499924847, + "flos": 64347327164160.0, + "grad_norm": 0.8570499142131055, + "language_loss": 0.55407649, + "learning_rate": 3.545796973765623e-06, + "loss": 0.57459968, + "num_input_tokens_seen": 86767320, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.30078125, + "step": 4029, + "time_per_iteration": 3.094402551651001 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01043386, + "balance_loss_clip": 1.02567101, + "balance_loss_mlp": 1.04526591, + "epoch": 0.24229670825191643, + "flos": 25775243124480.0, + "grad_norm": 2.019101437715354, + "language_loss": 0.73829788, + "learning_rate": 3.54554981945833e-06, + "loss": 0.76008832, + "num_input_tokens_seen": 86788110, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90234375, + "step": 4030, + "time_per_iteration": 2.5176522731781006 + }, + { + "auxiliary_loss_clip": 0.01135714, + "auxiliary_loss_mlp": 0.01053146, + "balance_loss_clip": 1.03655171, + "balance_loss_mlp": 1.04541922, + "epoch": 0.2423568315045844, + "flos": 20667094087680.0, + "grad_norm": 2.062987020241499, + "language_loss": 0.76440287, + "learning_rate": 3.5453026065436343e-06, + "loss": 0.78629148, + "num_input_tokens_seen": 86807640, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 4031, + "time_per_iteration": 2.4774179458618164 + }, + { + "auxiliary_loss_clip": 0.01140068, + "auxiliary_loss_mlp": 0.01046331, + "balance_loss_clip": 1.02974856, + "balance_loss_mlp": 1.0464952, + "epoch": 0.24241695475725236, + "flos": 22416805393920.0, + "grad_norm": 7.078640241023749, + "language_loss": 0.65947008, + "learning_rate": 3.5450553350309083e-06, + "loss": 0.68133402, + "num_input_tokens_seen": 86826795, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9375, + "step": 4032, + "time_per_iteration": 2.500488519668579 + }, + { + "auxiliary_loss_clip": 0.01130465, + "auxiliary_loss_mlp": 0.01046589, + "balance_loss_clip": 1.03026938, + "balance_loss_mlp": 1.04175007, + "epoch": 0.24247707800992033, + "flos": 17128995505920.0, + "grad_norm": 3.1167913511387995, + "language_loss": 0.81353086, + "learning_rate": 3.5448080049295286e-06, + "loss": 0.83530146, + "num_input_tokens_seen": 86843175, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4033, + "time_per_iteration": 2.434652805328369 + }, + { + "auxiliary_loss_clip": 0.0113019, + "auxiliary_loss_mlp": 0.01039201, + "balance_loss_clip": 1.02310205, + "balance_loss_mlp": 1.04302979, + "epoch": 0.2425372012625883, + "flos": 31613743399680.0, + "grad_norm": 2.0372289343003023, + "language_loss": 0.69200158, + "learning_rate": 3.5445606162488754e-06, + "loss": 0.71369547, + "num_input_tokens_seen": 86863185, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4034, + "time_per_iteration": 2.583693027496338 + }, + { + "auxiliary_loss_clip": 0.01132981, + "auxiliary_loss_mlp": 0.01036321, + "balance_loss_clip": 1.01868999, + "balance_loss_mlp": 1.04278564, + "epoch": 0.24259732451525629, + "flos": 16326032924160.0, + "grad_norm": 2.4913709616978554, + "language_loss": 0.95772272, + "learning_rate": 3.5443131689983283e-06, + "loss": 0.97941571, + "num_input_tokens_seen": 86880040, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4035, + "time_per_iteration": 2.4757437705993652 + }, + { + "auxiliary_loss_clip": 0.01126986, + "auxiliary_loss_mlp": 0.01047233, + "balance_loss_clip": 1.03220701, + "balance_loss_mlp": 1.04172754, + "epoch": 0.24265744776792425, + "flos": 22856639431680.0, + "grad_norm": 2.0212510419571794, + "language_loss": 0.77875686, + "learning_rate": 3.5440656631872715e-06, + "loss": 0.80049908, + "num_input_tokens_seen": 86900610, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4036, + "time_per_iteration": 2.5642547607421875 + }, + { + "auxiliary_loss_clip": 0.01134779, + "auxiliary_loss_mlp": 0.01043471, + "balance_loss_clip": 1.02642441, + "balance_loss_mlp": 1.04447269, + "epoch": 0.24271757102059222, + "flos": 21871573873920.0, + "grad_norm": 1.648393445666421, + "language_loss": 0.74427915, + "learning_rate": 3.5438180988250898e-06, + "loss": 0.76606166, + "num_input_tokens_seen": 86919385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4037, + "time_per_iteration": 2.4529507160186768 + }, + { + "auxiliary_loss_clip": 0.01133991, + "auxiliary_loss_mlp": 0.0104144, + "balance_loss_clip": 1.02497733, + "balance_loss_mlp": 1.04398596, + "epoch": 0.24277769427326018, + "flos": 19208582340480.0, + "grad_norm": 2.7681997598872656, + "language_loss": 0.76223898, + "learning_rate": 3.543570475921171e-06, + "loss": 0.78399336, + "num_input_tokens_seen": 86938885, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4038, + "time_per_iteration": 2.4618003368377686 + }, + { + "auxiliary_loss_clip": 0.01135029, + "auxiliary_loss_mlp": 0.01044933, + "balance_loss_clip": 1.02742147, + "balance_loss_mlp": 1.04415751, + "epoch": 0.24283781752592815, + "flos": 19499889640320.0, + "grad_norm": 2.0050890767905645, + "language_loss": 0.72632921, + "learning_rate": 3.543322794484905e-06, + "loss": 0.74812889, + "num_input_tokens_seen": 86957705, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.90625, + "step": 4039, + "time_per_iteration": 2.4261560440063477 + }, + { + "auxiliary_loss_clip": 0.01135297, + "auxiliary_loss_mlp": 0.01043184, + "balance_loss_clip": 1.02631593, + "balance_loss_mlp": 1.04608393, + "epoch": 0.2428979407785961, + "flos": 19902196944000.0, + "grad_norm": 1.6810247735848671, + "language_loss": 0.78330719, + "learning_rate": 3.5430750545256843e-06, + "loss": 0.80509198, + "num_input_tokens_seen": 86975845, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4040, + "time_per_iteration": 2.4808037281036377 + }, + { + "auxiliary_loss_clip": 0.01128006, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.01912999, + "balance_loss_mlp": 1.04237986, + "epoch": 0.2429580640312641, + "flos": 24715878284160.0, + "grad_norm": 1.8145876332629047, + "language_loss": 0.80390251, + "learning_rate": 3.5428272560529027e-06, + "loss": 0.82552278, + "num_input_tokens_seen": 86994800, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4041, + "time_per_iteration": 2.482576847076416 + }, + { + "auxiliary_loss_clip": 0.01134051, + "auxiliary_loss_mlp": 0.01043295, + "balance_loss_clip": 1.02769041, + "balance_loss_mlp": 1.04653025, + "epoch": 0.24301818728393207, + "flos": 25630343660160.0, + "grad_norm": 4.455498217071982, + "language_loss": 0.76670969, + "learning_rate": 3.542579399075957e-06, + "loss": 0.78848314, + "num_input_tokens_seen": 87016845, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4042, + "time_per_iteration": 2.4944398403167725 + }, + { + "auxiliary_loss_clip": 0.01130826, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.01815128, + "balance_loss_mlp": 1.04393744, + "epoch": 0.24307831053660003, + "flos": 26141388410880.0, + "grad_norm": 1.7591863299055037, + "language_loss": 0.8139993, + "learning_rate": 3.542331483604246e-06, + "loss": 0.83563864, + "num_input_tokens_seen": 87036270, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 4043, + "time_per_iteration": 2.4965035915374756 + }, + { + "auxiliary_loss_clip": 0.01135997, + "auxiliary_loss_mlp": 0.01037391, + "balance_loss_clip": 1.02053475, + "balance_loss_mlp": 1.04298007, + "epoch": 0.243138433789268, + "flos": 14972415868800.0, + "grad_norm": 2.448799092011911, + "language_loss": 0.73345625, + "learning_rate": 3.5420835096471706e-06, + "loss": 0.75519013, + "num_input_tokens_seen": 87049920, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9296875, + "step": 4044, + "time_per_iteration": 2.42809796333313 + }, + { + "auxiliary_loss_clip": 0.01136098, + "auxiliary_loss_mlp": 0.01042356, + "balance_loss_clip": 1.0252496, + "balance_loss_mlp": 1.04730773, + "epoch": 0.24319855704193596, + "flos": 25191694771200.0, + "grad_norm": 1.780616714891853, + "language_loss": 0.83562207, + "learning_rate": 3.5418354772141337e-06, + "loss": 0.85740674, + "num_input_tokens_seen": 87068230, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4045, + "time_per_iteration": 2.4965107440948486 + }, + { + "auxiliary_loss_clip": 0.01134201, + "auxiliary_loss_mlp": 0.01045916, + "balance_loss_clip": 1.02944136, + "balance_loss_mlp": 1.04542089, + "epoch": 0.24325868029460393, + "flos": 22127221946880.0, + "grad_norm": 2.1598753545738663, + "language_loss": 0.86787856, + "learning_rate": 3.541587386314541e-06, + "loss": 0.88967973, + "num_input_tokens_seen": 87086435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4046, + "time_per_iteration": 2.5126357078552246 + }, + { + "auxiliary_loss_clip": 0.01128157, + "auxiliary_loss_mlp": 0.01041362, + "balance_loss_clip": 1.02526259, + "balance_loss_mlp": 1.04252553, + "epoch": 0.2433188035472719, + "flos": 23582106420480.0, + "grad_norm": 1.9885516182116696, + "language_loss": 0.7281425, + "learning_rate": 3.5413392369578e-06, + "loss": 0.7498377, + "num_input_tokens_seen": 87105340, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4047, + "time_per_iteration": 2.4886271953582764 + }, + { + "auxiliary_loss_clip": 0.01133305, + "auxiliary_loss_mlp": 0.01039984, + "balance_loss_clip": 1.02243662, + "balance_loss_mlp": 1.0435816, + "epoch": 0.2433789267999399, + "flos": 24462815990400.0, + "grad_norm": 2.411807088840578, + "language_loss": 0.72845596, + "learning_rate": 3.5410910291533213e-06, + "loss": 0.75018883, + "num_input_tokens_seen": 87125780, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4048, + "time_per_iteration": 2.522012710571289 + }, + { + "auxiliary_loss_clip": 0.01132229, + "auxiliary_loss_mlp": 0.01042433, + "balance_loss_clip": 1.02720952, + "balance_loss_mlp": 1.04504991, + "epoch": 0.24343905005260785, + "flos": 16727909264640.0, + "grad_norm": 4.923738678144707, + "language_loss": 0.72984087, + "learning_rate": 3.5408427629105155e-06, + "loss": 0.75158751, + "num_input_tokens_seen": 87144470, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.875, + "step": 4049, + "time_per_iteration": 2.4399380683898926 + }, + { + "auxiliary_loss_clip": 0.0112907, + "auxiliary_loss_mlp": 0.01041944, + "balance_loss_clip": 1.02654243, + "balance_loss_mlp": 1.04297137, + "epoch": 0.24349917330527582, + "flos": 20043756443520.0, + "grad_norm": 6.058583880667159, + "language_loss": 0.7388249, + "learning_rate": 3.5405944382387985e-06, + "loss": 0.760535, + "num_input_tokens_seen": 87162830, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4050, + "time_per_iteration": 2.4589998722076416 + }, + { + "auxiliary_loss_clip": 0.01128476, + "auxiliary_loss_mlp": 0.01044223, + "balance_loss_clip": 1.02925062, + "balance_loss_mlp": 1.04373455, + "epoch": 0.24355929655794378, + "flos": 17420554200960.0, + "grad_norm": 3.083460080669968, + "language_loss": 0.74948591, + "learning_rate": 3.5403460551475854e-06, + "loss": 0.77121294, + "num_input_tokens_seen": 87180905, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4051, + "time_per_iteration": 2.4284183979034424 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02251768, + "balance_loss_mlp": 1.04273975, + "epoch": 0.24361941981061175, + "flos": 25410929431680.0, + "grad_norm": 2.420510968298769, + "language_loss": 0.70638204, + "learning_rate": 3.540097613646296e-06, + "loss": 0.72805327, + "num_input_tokens_seen": 87202290, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4052, + "time_per_iteration": 5.468756675720215 + }, + { + "auxiliary_loss_clip": 0.01131368, + "auxiliary_loss_mlp": 0.01048115, + "balance_loss_clip": 1.03215313, + "balance_loss_mlp": 1.04370522, + "epoch": 0.2436795430632797, + "flos": 22820800636800.0, + "grad_norm": 1.61331134721481, + "language_loss": 0.81265736, + "learning_rate": 3.539849113744351e-06, + "loss": 0.83445215, + "num_input_tokens_seen": 87221650, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.875, + "step": 4053, + "time_per_iteration": 2.5280394554138184 + }, + { + "auxiliary_loss_clip": 0.01135173, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.01895714, + "balance_loss_mlp": 1.04522192, + "epoch": 0.2437396663159477, + "flos": 15157786982400.0, + "grad_norm": 1.5461481286352234, + "language_loss": 0.77842951, + "learning_rate": 3.539600555451172e-06, + "loss": 0.80013186, + "num_input_tokens_seen": 87238515, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4054, + "time_per_iteration": 2.424604892730713 + }, + { + "auxiliary_loss_clip": 0.01128011, + "auxiliary_loss_mlp": 0.01044969, + "balance_loss_clip": 1.02990091, + "balance_loss_mlp": 1.04097724, + "epoch": 0.24379978956861567, + "flos": 22091131756800.0, + "grad_norm": 1.616998838355979, + "language_loss": 0.83784473, + "learning_rate": 3.5393519387761866e-06, + "loss": 0.85957456, + "num_input_tokens_seen": 87256290, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4055, + "time_per_iteration": 2.4814612865448 + }, + { + "auxiliary_loss_clip": 0.0113426, + "auxiliary_loss_mlp": 0.01038657, + "balance_loss_clip": 1.02194405, + "balance_loss_mlp": 1.04221749, + "epoch": 0.24385991282128364, + "flos": 31467766527360.0, + "grad_norm": 3.407480313131798, + "language_loss": 0.55291057, + "learning_rate": 3.5391032637288217e-06, + "loss": 0.57463974, + "num_input_tokens_seen": 87277085, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.921875, + "step": 4056, + "time_per_iteration": 2.5356216430664062 + }, + { + "auxiliary_loss_clip": 0.01133242, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02626896, + "balance_loss_mlp": 1.04361272, + "epoch": 0.2439200360739516, + "flos": 23838795987840.0, + "grad_norm": 2.24663888381965, + "language_loss": 0.79832959, + "learning_rate": 3.538854530318506e-06, + "loss": 0.82009363, + "num_input_tokens_seen": 87293020, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4057, + "time_per_iteration": 2.4915707111358643 + }, + { + "auxiliary_loss_clip": 0.01128391, + "auxiliary_loss_mlp": 0.01037777, + "balance_loss_clip": 1.02195764, + "balance_loss_mlp": 1.04218984, + "epoch": 0.24398015932661957, + "flos": 19169978198400.0, + "grad_norm": 1.7432058239394113, + "language_loss": 0.78817719, + "learning_rate": 3.538605738554673e-06, + "loss": 0.80983889, + "num_input_tokens_seen": 87311445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4058, + "time_per_iteration": 2.426687002182007 + }, + { + "auxiliary_loss_clip": 0.01133605, + "auxiliary_loss_mlp": 0.01038515, + "balance_loss_clip": 1.02366126, + "balance_loss_mlp": 1.04273307, + "epoch": 0.24404028257928753, + "flos": 25262474520960.0, + "grad_norm": 1.688831116872718, + "language_loss": 0.85133582, + "learning_rate": 3.538356888446756e-06, + "loss": 0.87305701, + "num_input_tokens_seen": 87332055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.90625, + "step": 4059, + "time_per_iteration": 2.499464511871338 + }, + { + "auxiliary_loss_clip": 0.01127196, + "auxiliary_loss_mlp": 0.01035816, + "balance_loss_clip": 1.02079606, + "balance_loss_mlp": 1.04288411, + "epoch": 0.2441004058319555, + "flos": 26467600752000.0, + "grad_norm": 1.6494662829711617, + "language_loss": 0.73770267, + "learning_rate": 3.5381079800041913e-06, + "loss": 0.75933278, + "num_input_tokens_seen": 87351295, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4060, + "time_per_iteration": 2.4955050945281982 + }, + { + "auxiliary_loss_clip": 0.01137637, + "auxiliary_loss_mlp": 0.01050854, + "balance_loss_clip": 1.03262711, + "balance_loss_mlp": 1.04506934, + "epoch": 0.2441605290846235, + "flos": 26760524163840.0, + "grad_norm": 1.8597953216817902, + "language_loss": 0.73587501, + "learning_rate": 3.5378590132364182e-06, + "loss": 0.75775993, + "num_input_tokens_seen": 87370650, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.92578125, + "step": 4061, + "time_per_iteration": 2.5002825260162354 + }, + { + "auxiliary_loss_clip": 0.01129662, + "auxiliary_loss_mlp": 0.01036553, + "balance_loss_clip": 1.02248669, + "balance_loss_mlp": 1.04437923, + "epoch": 0.24422065233729146, + "flos": 21105850717440.0, + "grad_norm": 1.6775055914479682, + "language_loss": 0.76006806, + "learning_rate": 3.5376099881528768e-06, + "loss": 0.78173012, + "num_input_tokens_seen": 87389020, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8515625, + "step": 4062, + "time_per_iteration": 2.478625535964966 + }, + { + "auxiliary_loss_clip": 0.01126984, + "auxiliary_loss_mlp": 0.01035611, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.04376316, + "epoch": 0.24428077558995942, + "flos": 25263156879360.0, + "grad_norm": 1.7282475931571, + "language_loss": 0.85710216, + "learning_rate": 3.537360904763011e-06, + "loss": 0.87872803, + "num_input_tokens_seen": 87409695, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4063, + "time_per_iteration": 2.5161943435668945 + }, + { + "auxiliary_loss_clip": 0.01135931, + "auxiliary_loss_mlp": 0.01042417, + "balance_loss_clip": 1.02603722, + "balance_loss_mlp": 1.04589176, + "epoch": 0.24434089884262739, + "flos": 20485278420480.0, + "grad_norm": 6.32752237165424, + "language_loss": 0.68127096, + "learning_rate": 3.5371117630762656e-06, + "loss": 0.70305437, + "num_input_tokens_seen": 87428250, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4064, + "time_per_iteration": 2.4434523582458496 + }, + { + "auxiliary_loss_clip": 0.01134926, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02083397, + "balance_loss_mlp": 1.04318714, + "epoch": 0.24440102209529535, + "flos": 23621895711360.0, + "grad_norm": 1.5178524812834733, + "language_loss": 0.7003206, + "learning_rate": 3.536862563102088e-06, + "loss": 0.72204536, + "num_input_tokens_seen": 87449380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4065, + "time_per_iteration": 2.513827085494995 + }, + { + "auxiliary_loss_clip": 0.01136726, + "auxiliary_loss_mlp": 0.01047876, + "balance_loss_clip": 1.02960134, + "balance_loss_mlp": 1.04461718, + "epoch": 0.24446114534796332, + "flos": 20554729367040.0, + "grad_norm": 2.0517728790430048, + "language_loss": 0.83912247, + "learning_rate": 3.5366133048499282e-06, + "loss": 0.86096847, + "num_input_tokens_seen": 87465365, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4066, + "time_per_iteration": 2.4601314067840576 + }, + { + "auxiliary_loss_clip": 0.01053849, + "auxiliary_loss_mlp": 0.01006665, + "balance_loss_clip": 1.00455475, + "balance_loss_mlp": 1.02389407, + "epoch": 0.24452126860063128, + "flos": 60389575009920.0, + "grad_norm": 0.7387464995159381, + "language_loss": 0.52291965, + "learning_rate": 3.5363639883292374e-06, + "loss": 0.54352474, + "num_input_tokens_seen": 87522525, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.29882812, + "step": 4067, + "time_per_iteration": 2.9973862171173096 + }, + { + "auxiliary_loss_clip": 0.01133754, + "auxiliary_loss_mlp": 0.01040771, + "balance_loss_clip": 1.0242008, + "balance_loss_mlp": 1.04483843, + "epoch": 0.24458139185329927, + "flos": 15121660878720.0, + "grad_norm": 3.022186633601072, + "language_loss": 0.71927387, + "learning_rate": 3.5361146135494706e-06, + "loss": 0.74101913, + "num_input_tokens_seen": 87539170, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4068, + "time_per_iteration": 2.4484708309173584 + }, + { + "auxiliary_loss_clip": 0.01132664, + "auxiliary_loss_mlp": 0.01040777, + "balance_loss_clip": 1.02457666, + "balance_loss_mlp": 1.04505873, + "epoch": 0.24464151510596724, + "flos": 27998723842560.0, + "grad_norm": 1.494083672668599, + "language_loss": 0.77513826, + "learning_rate": 3.5358651805200835e-06, + "loss": 0.79687262, + "num_input_tokens_seen": 87558875, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4069, + "time_per_iteration": 2.5724000930786133 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01047384, + "balance_loss_clip": 1.03101087, + "balance_loss_mlp": 1.04646873, + "epoch": 0.2447016383586352, + "flos": 19792884879360.0, + "grad_norm": 1.9755919994455295, + "language_loss": 0.80163878, + "learning_rate": 3.5356156892505347e-06, + "loss": 0.82344782, + "num_input_tokens_seen": 87576485, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4070, + "time_per_iteration": 2.4932186603546143 + }, + { + "auxiliary_loss_clip": 0.01130692, + "auxiliary_loss_mlp": 0.01045764, + "balance_loss_clip": 1.03018379, + "balance_loss_mlp": 1.04351497, + "epoch": 0.24476176161130317, + "flos": 26067340523520.0, + "grad_norm": 1.6271146290001441, + "language_loss": 0.8410303, + "learning_rate": 3.5353661397502854e-06, + "loss": 0.86279482, + "num_input_tokens_seen": 87598620, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.875, + "step": 4071, + "time_per_iteration": 2.5299296379089355 + }, + { + "auxiliary_loss_clip": 0.0113627, + "auxiliary_loss_mlp": 0.01045362, + "balance_loss_clip": 1.02795792, + "balance_loss_mlp": 1.04406631, + "epoch": 0.24482188486397113, + "flos": 18843550375680.0, + "grad_norm": 1.720640728536457, + "language_loss": 0.79751229, + "learning_rate": 3.535116532028798e-06, + "loss": 0.81932867, + "num_input_tokens_seen": 87616595, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4072, + "time_per_iteration": 2.470327854156494 + }, + { + "auxiliary_loss_clip": 0.01129102, + "auxiliary_loss_mlp": 0.01043575, + "balance_loss_clip": 1.02906084, + "balance_loss_mlp": 1.04437995, + "epoch": 0.2448820081166391, + "flos": 21251791676160.0, + "grad_norm": 1.615929332251483, + "language_loss": 0.70322561, + "learning_rate": 3.5348668660955382e-06, + "loss": 0.7249524, + "num_input_tokens_seen": 87635755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4073, + "time_per_iteration": 2.4951980113983154 + }, + { + "auxiliary_loss_clip": 0.01129351, + "auxiliary_loss_mlp": 0.01041111, + "balance_loss_clip": 1.02662683, + "balance_loss_mlp": 1.04456043, + "epoch": 0.2449421313693071, + "flos": 23950586090880.0, + "grad_norm": 2.5968867848691133, + "language_loss": 0.67692697, + "learning_rate": 3.5346171419599728e-06, + "loss": 0.69863164, + "num_input_tokens_seen": 87652885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 4074, + "time_per_iteration": 2.4697325229644775 + }, + { + "auxiliary_loss_clip": 0.01052266, + "auxiliary_loss_mlp": 0.01006876, + "balance_loss_clip": 1.00504053, + "balance_loss_mlp": 1.0222578, + "epoch": 0.24500225462197506, + "flos": 60687669980160.0, + "grad_norm": 0.896032421619399, + "language_loss": 0.68665123, + "learning_rate": 3.5343673596315718e-06, + "loss": 0.70724261, + "num_input_tokens_seen": 87713220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.30078125, + "step": 4075, + "time_per_iteration": 3.1993846893310547 + }, + { + "auxiliary_loss_clip": 0.01131428, + "auxiliary_loss_mlp": 0.01040459, + "balance_loss_clip": 1.02548659, + "balance_loss_mlp": 1.04603517, + "epoch": 0.24506237787464302, + "flos": 26284204886400.0, + "grad_norm": 2.243483207404797, + "language_loss": 0.79306483, + "learning_rate": 3.5341175191198063e-06, + "loss": 0.81478369, + "num_input_tokens_seen": 87732680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4076, + "time_per_iteration": 2.542245388031006 + }, + { + "auxiliary_loss_clip": 0.01134594, + "auxiliary_loss_mlp": 0.01045082, + "balance_loss_clip": 1.02749884, + "balance_loss_mlp": 1.04342794, + "epoch": 0.245122501127311, + "flos": 20552287242240.0, + "grad_norm": 2.0630196459837618, + "language_loss": 0.82211018, + "learning_rate": 3.533867620434151e-06, + "loss": 0.84390688, + "num_input_tokens_seen": 87751880, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.91015625, + "step": 4077, + "time_per_iteration": 2.5165140628814697 + }, + { + "auxiliary_loss_clip": 0.01132098, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02695799, + "balance_loss_mlp": 1.04380083, + "epoch": 0.24518262437997895, + "flos": 29132603447040.0, + "grad_norm": 12.782264679420269, + "language_loss": 0.61930454, + "learning_rate": 3.533617663584082e-06, + "loss": 0.64107114, + "num_input_tokens_seen": 87771795, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4078, + "time_per_iteration": 2.5202372074127197 + }, + { + "auxiliary_loss_clip": 0.01129452, + "auxiliary_loss_mlp": 0.0103596, + "balance_loss_clip": 1.02035594, + "balance_loss_mlp": 1.04474652, + "epoch": 0.24524274763264692, + "flos": 23476924419840.0, + "grad_norm": 1.7044874550491866, + "language_loss": 0.75514519, + "learning_rate": 3.5333676485790765e-06, + "loss": 0.77679932, + "num_input_tokens_seen": 87793640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4079, + "time_per_iteration": 2.483339309692383 + }, + { + "auxiliary_loss_clip": 0.01129188, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02686739, + "balance_loss_mlp": 1.04370368, + "epoch": 0.24530287088531488, + "flos": 17201175886080.0, + "grad_norm": 1.8257477744529516, + "language_loss": 0.74925131, + "learning_rate": 3.5331175754286173e-06, + "loss": 0.77097261, + "num_input_tokens_seen": 87812390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 4080, + "time_per_iteration": 2.4843389987945557 + }, + { + "auxiliary_loss_clip": 0.01125805, + "auxiliary_loss_mlp": 0.01039252, + "balance_loss_clip": 1.02375531, + "balance_loss_mlp": 1.04129529, + "epoch": 0.24536299413798288, + "flos": 14867449349760.0, + "grad_norm": 2.211780780293779, + "language_loss": 0.82807517, + "learning_rate": 3.532867444142186e-06, + "loss": 0.84972572, + "num_input_tokens_seen": 87830640, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4081, + "time_per_iteration": 2.4753835201263428 + }, + { + "auxiliary_loss_clip": 0.01128982, + "auxiliary_loss_mlp": 0.01039204, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.04313576, + "epoch": 0.24542311739065084, + "flos": 35262051886080.0, + "grad_norm": 4.1574914526272515, + "language_loss": 0.73153239, + "learning_rate": 3.532617254729267e-06, + "loss": 0.75321424, + "num_input_tokens_seen": 87850450, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4082, + "time_per_iteration": 2.5975396633148193 + }, + { + "auxiliary_loss_clip": 0.01127179, + "auxiliary_loss_mlp": 0.01042857, + "balance_loss_clip": 1.02837873, + "balance_loss_mlp": 1.04274178, + "epoch": 0.2454832406433188, + "flos": 21503130117120.0, + "grad_norm": 1.543838453785988, + "language_loss": 0.71628594, + "learning_rate": 3.5323670071993485e-06, + "loss": 0.73798621, + "num_input_tokens_seen": 87868810, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84375, + "step": 4083, + "time_per_iteration": 2.471519947052002 + }, + { + "auxiliary_loss_clip": 0.01131409, + "auxiliary_loss_mlp": 0.01040567, + "balance_loss_clip": 1.02285206, + "balance_loss_mlp": 1.04234004, + "epoch": 0.24554336389598677, + "flos": 14756664827520.0, + "grad_norm": 2.1941070650453094, + "language_loss": 0.74700832, + "learning_rate": 3.532116701561919e-06, + "loss": 0.76872808, + "num_input_tokens_seen": 87885685, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4084, + "time_per_iteration": 2.4286506175994873 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.01986289, + "balance_loss_mlp": 1.04189909, + "epoch": 0.24560348714865474, + "flos": 14976402278400.0, + "grad_norm": 2.042106499003273, + "language_loss": 0.85206825, + "learning_rate": 3.531866337826471e-06, + "loss": 0.8736847, + "num_input_tokens_seen": 87903715, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4085, + "time_per_iteration": 2.4283318519592285 + }, + { + "auxiliary_loss_clip": 0.01130256, + "auxiliary_loss_mlp": 0.01048422, + "balance_loss_clip": 1.03209007, + "balance_loss_mlp": 1.04266381, + "epoch": 0.2456636104013227, + "flos": 22675326554880.0, + "grad_norm": 1.8090063737063005, + "language_loss": 0.7876097, + "learning_rate": 3.5316159160024982e-06, + "loss": 0.80939639, + "num_input_tokens_seen": 87923375, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4086, + "time_per_iteration": 2.478954792022705 + }, + { + "auxiliary_loss_clip": 0.01126651, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02669752, + "balance_loss_mlp": 1.04330873, + "epoch": 0.2457237336539907, + "flos": 27417869009280.0, + "grad_norm": 1.6669278195562474, + "language_loss": 0.75269985, + "learning_rate": 3.531365436099496e-06, + "loss": 0.77438211, + "num_input_tokens_seen": 87943115, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4087, + "time_per_iteration": 2.4871292114257812 + }, + { + "auxiliary_loss_clip": 0.01132319, + "auxiliary_loss_mlp": 0.01041093, + "balance_loss_clip": 1.02364135, + "balance_loss_mlp": 1.04574418, + "epoch": 0.24578385690665866, + "flos": 20412379768320.0, + "grad_norm": 2.5789657141026, + "language_loss": 0.79284519, + "learning_rate": 3.5311148981269635e-06, + "loss": 0.81457937, + "num_input_tokens_seen": 87959505, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8671875, + "step": 4088, + "time_per_iteration": 2.479841709136963 + }, + { + "auxiliary_loss_clip": 0.01123487, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.0196631, + "balance_loss_mlp": 1.04091823, + "epoch": 0.24584398015932662, + "flos": 23915393740800.0, + "grad_norm": 1.6187757849670203, + "language_loss": 0.7736612, + "learning_rate": 3.5308643020944e-06, + "loss": 0.79523408, + "num_input_tokens_seen": 87979725, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.828125, + "step": 4089, + "time_per_iteration": 2.483436346054077 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02440071, + "balance_loss_mlp": 1.04232669, + "epoch": 0.2459041034119946, + "flos": 41496359103360.0, + "grad_norm": 3.8690522662716416, + "language_loss": 0.81463957, + "learning_rate": 3.530613648011309e-06, + "loss": 0.83634108, + "num_input_tokens_seen": 87998270, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4090, + "time_per_iteration": 2.657944917678833 + }, + { + "auxiliary_loss_clip": 0.01132703, + "auxiliary_loss_mlp": 0.01049826, + "balance_loss_clip": 1.03265369, + "balance_loss_mlp": 1.04411578, + "epoch": 0.24596422666466256, + "flos": 19936814676480.0, + "grad_norm": 1.9398667366019489, + "language_loss": 0.72874928, + "learning_rate": 3.5303629358871946e-06, + "loss": 0.75057453, + "num_input_tokens_seen": 88016760, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.88671875, + "step": 4091, + "time_per_iteration": 2.448307991027832 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01036521, + "balance_loss_clip": 1.02166772, + "balance_loss_mlp": 1.04811478, + "epoch": 0.24602434991733052, + "flos": 21544391865600.0, + "grad_norm": 1.9209724672120978, + "language_loss": 0.76486623, + "learning_rate": 3.5301121657315653e-06, + "loss": 0.78656, + "num_input_tokens_seen": 88036465, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4092, + "time_per_iteration": 2.510815143585205 + }, + { + "auxiliary_loss_clip": 0.01134482, + "auxiliary_loss_mlp": 0.01035407, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.04404068, + "epoch": 0.24608447316999849, + "flos": 23185078416000.0, + "grad_norm": 2.544549098738024, + "language_loss": 0.80905128, + "learning_rate": 3.5298613375539287e-06, + "loss": 0.83075017, + "num_input_tokens_seen": 88053270, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4093, + "time_per_iteration": 2.4658117294311523 + }, + { + "auxiliary_loss_clip": 0.01133522, + "auxiliary_loss_mlp": 0.01042815, + "balance_loss_clip": 1.02542281, + "balance_loss_mlp": 1.04285693, + "epoch": 0.24614459642266648, + "flos": 19641951930240.0, + "grad_norm": 1.9793331271335382, + "language_loss": 0.87355959, + "learning_rate": 3.529610451363797e-06, + "loss": 0.89532292, + "num_input_tokens_seen": 88072305, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4094, + "time_per_iteration": 5.436578035354614 + }, + { + "auxiliary_loss_clip": 0.01055645, + "auxiliary_loss_mlp": 0.01004731, + "balance_loss_clip": 1.00285995, + "balance_loss_mlp": 1.02449679, + "epoch": 0.24620471967533444, + "flos": 61739816186880.0, + "grad_norm": 0.7591937233735362, + "language_loss": 0.57501638, + "learning_rate": 3.5293595071706833e-06, + "loss": 0.59562016, + "num_input_tokens_seen": 88137995, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.3125, + "step": 4095, + "time_per_iteration": 3.1966967582702637 + }, + { + "auxiliary_loss_clip": 0.01055105, + "auxiliary_loss_mlp": 0.01001708, + "balance_loss_clip": 0.99987203, + "balance_loss_mlp": 1.02336812, + "epoch": 0.2462648429280024, + "flos": 69154436315520.0, + "grad_norm": 0.643968481445629, + "language_loss": 0.56195372, + "learning_rate": 3.5291085049841042e-06, + "loss": 0.58252186, + "num_input_tokens_seen": 88208490, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.31640625, + "step": 4096, + "time_per_iteration": 3.187084436416626 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01035351, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.04697204, + "epoch": 0.24632496618067037, + "flos": 29459605887360.0, + "grad_norm": 2.0390556104017907, + "language_loss": 0.77674699, + "learning_rate": 3.5288574448135773e-06, + "loss": 0.79844701, + "num_input_tokens_seen": 88228050, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4097, + "time_per_iteration": 2.5585436820983887 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01044655, + "balance_loss_clip": 1.02608228, + "balance_loss_mlp": 1.04491377, + "epoch": 0.24638508943333834, + "flos": 24316444068480.0, + "grad_norm": 2.135816170269485, + "language_loss": 0.76393569, + "learning_rate": 3.5286063266686235e-06, + "loss": 0.78572309, + "num_input_tokens_seen": 88248090, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.890625, + "step": 4098, + "time_per_iteration": 2.478665828704834 + }, + { + "auxiliary_loss_clip": 0.01133268, + "auxiliary_loss_mlp": 0.01040789, + "balance_loss_clip": 1.02568507, + "balance_loss_mlp": 1.04479909, + "epoch": 0.2464452126860063, + "flos": 26613254401920.0, + "grad_norm": 2.152719854213413, + "language_loss": 0.68733507, + "learning_rate": 3.528355150558764e-06, + "loss": 0.70907569, + "num_input_tokens_seen": 88267545, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 4099, + "time_per_iteration": 2.515821933746338 + }, + { + "auxiliary_loss_clip": 0.01124761, + "auxiliary_loss_mlp": 0.01041381, + "balance_loss_clip": 1.02621734, + "balance_loss_mlp": 1.04163074, + "epoch": 0.24650533593867427, + "flos": 31212405763200.0, + "grad_norm": 2.459538616056665, + "language_loss": 0.65975124, + "learning_rate": 3.5281039164935237e-06, + "loss": 0.68141258, + "num_input_tokens_seen": 88289785, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4100, + "time_per_iteration": 2.562962532043457 + }, + { + "auxiliary_loss_clip": 0.01051305, + "auxiliary_loss_mlp": 0.01002462, + "balance_loss_clip": 1.00055432, + "balance_loss_mlp": 1.02057505, + "epoch": 0.24656545919134226, + "flos": 68494002900480.0, + "grad_norm": 0.7078763540659354, + "language_loss": 0.61549371, + "learning_rate": 3.5278526244824304e-06, + "loss": 0.63603139, + "num_input_tokens_seen": 88357320, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.30859375, + "step": 4101, + "time_per_iteration": 3.1617352962493896 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01034815, + "balance_loss_clip": 1.01893687, + "balance_loss_mlp": 1.04385781, + "epoch": 0.24662558244401023, + "flos": 20084192179200.0, + "grad_norm": 1.7154022892986804, + "language_loss": 0.73020113, + "learning_rate": 3.527601274535012e-06, + "loss": 0.75183737, + "num_input_tokens_seen": 88377040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4102, + "time_per_iteration": 2.5522637367248535 + }, + { + "auxiliary_loss_clip": 0.01132375, + "auxiliary_loss_mlp": 0.0104013, + "balance_loss_clip": 1.02463281, + "balance_loss_mlp": 1.04294777, + "epoch": 0.2466857056966782, + "flos": 30701361012480.0, + "grad_norm": 2.2979425011191528, + "language_loss": 0.75574934, + "learning_rate": 3.5273498666608004e-06, + "loss": 0.7774744, + "num_input_tokens_seen": 88395085, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.89453125, + "step": 4103, + "time_per_iteration": 2.5117204189300537 + }, + { + "auxiliary_loss_clip": 0.01129454, + "auxiliary_loss_mlp": 0.01043396, + "balance_loss_clip": 1.02647424, + "balance_loss_mlp": 1.04096079, + "epoch": 0.24674582894934616, + "flos": 22528523669760.0, + "grad_norm": 2.002646106823912, + "language_loss": 0.78701174, + "learning_rate": 3.5270984008693288e-06, + "loss": 0.80874026, + "num_input_tokens_seen": 88413205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4104, + "time_per_iteration": 2.5791869163513184 + }, + { + "auxiliary_loss_clip": 0.011264, + "auxiliary_loss_mlp": 0.01041575, + "balance_loss_clip": 1.02333593, + "balance_loss_mlp": 1.0411272, + "epoch": 0.24680595220201412, + "flos": 20704297599360.0, + "grad_norm": 1.7283937272898544, + "language_loss": 0.83567655, + "learning_rate": 3.526846877170133e-06, + "loss": 0.85735631, + "num_input_tokens_seen": 88431525, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.8515625, + "step": 4105, + "time_per_iteration": 2.447399854660034 + }, + { + "auxiliary_loss_clip": 0.01134164, + "auxiliary_loss_mlp": 0.01043152, + "balance_loss_clip": 1.02768457, + "balance_loss_mlp": 1.04806173, + "epoch": 0.2468660754546821, + "flos": 21831174051840.0, + "grad_norm": 1.7373974977996043, + "language_loss": 0.7646578, + "learning_rate": 3.52659529557275e-06, + "loss": 0.78643101, + "num_input_tokens_seen": 88451210, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4106, + "time_per_iteration": 2.519059658050537 + }, + { + "auxiliary_loss_clip": 0.01127139, + "auxiliary_loss_mlp": 0.01042215, + "balance_loss_clip": 1.02539492, + "balance_loss_mlp": 1.04087114, + "epoch": 0.24692619870735008, + "flos": 15267709578240.0, + "grad_norm": 2.1665884513414513, + "language_loss": 0.72764528, + "learning_rate": 3.5263436560867205e-06, + "loss": 0.74933887, + "num_input_tokens_seen": 88467790, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4107, + "time_per_iteration": 2.4489266872406006 + }, + { + "auxiliary_loss_clip": 0.01131987, + "auxiliary_loss_mlp": 0.01048032, + "balance_loss_clip": 1.03173625, + "balance_loss_mlp": 1.0454886, + "epoch": 0.24698632196001805, + "flos": 29680097523840.0, + "grad_norm": 2.3712774609847274, + "language_loss": 0.65420353, + "learning_rate": 3.526091958721587e-06, + "loss": 0.67600369, + "num_input_tokens_seen": 88490330, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4108, + "time_per_iteration": 2.5401792526245117 + }, + { + "auxiliary_loss_clip": 0.01131766, + "auxiliary_loss_mlp": 0.01046054, + "balance_loss_clip": 1.02961504, + "balance_loss_mlp": 1.04324555, + "epoch": 0.247046445212686, + "flos": 39165469741440.0, + "grad_norm": 2.174268382145969, + "language_loss": 0.72611141, + "learning_rate": 3.5258402034868936e-06, + "loss": 0.74788952, + "num_input_tokens_seen": 88512435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4109, + "time_per_iteration": 2.593358278274536 + }, + { + "auxiliary_loss_clip": 0.01133432, + "auxiliary_loss_mlp": 0.010446, + "balance_loss_clip": 1.02788687, + "balance_loss_mlp": 1.04414606, + "epoch": 0.24710656846535398, + "flos": 22998845376000.0, + "grad_norm": 1.7026194733932167, + "language_loss": 0.79302657, + "learning_rate": 3.5255883903921866e-06, + "loss": 0.81480682, + "num_input_tokens_seen": 88529780, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4110, + "time_per_iteration": 2.4776864051818848 + }, + { + "auxiliary_loss_clip": 0.01133691, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01618171, + "balance_loss_mlp": 1.04541993, + "epoch": 0.24716669171802194, + "flos": 26432803451520.0, + "grad_norm": 2.5002063230568545, + "language_loss": 0.80653715, + "learning_rate": 3.5253365194470144e-06, + "loss": 0.82819968, + "num_input_tokens_seen": 88547200, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4111, + "time_per_iteration": 2.4957237243652344 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01039883, + "balance_loss_clip": 1.02517819, + "balance_loss_mlp": 1.04273677, + "epoch": 0.2472268149706899, + "flos": 23329870139520.0, + "grad_norm": 2.4547784256207663, + "language_loss": 0.75205207, + "learning_rate": 3.5250845906609294e-06, + "loss": 0.77375102, + "num_input_tokens_seen": 88566415, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4112, + "time_per_iteration": 2.481778860092163 + }, + { + "auxiliary_loss_clip": 0.01130648, + "auxiliary_loss_mlp": 0.01044261, + "balance_loss_clip": 1.02868617, + "balance_loss_mlp": 1.04366612, + "epoch": 0.24728693822335787, + "flos": 23768734510080.0, + "grad_norm": 1.9927491285660106, + "language_loss": 0.82454932, + "learning_rate": 3.5248326040434835e-06, + "loss": 0.8462984, + "num_input_tokens_seen": 88585225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4113, + "time_per_iteration": 2.4658617973327637 + }, + { + "auxiliary_loss_clip": 0.01129834, + "auxiliary_loss_mlp": 0.01036833, + "balance_loss_clip": 1.0205375, + "balance_loss_mlp": 1.0423646, + "epoch": 0.24734706147602586, + "flos": 19317499355520.0, + "grad_norm": 2.834925175676511, + "language_loss": 0.87073094, + "learning_rate": 3.5245805596042322e-06, + "loss": 0.89239764, + "num_input_tokens_seen": 88603280, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 4114, + "time_per_iteration": 2.4575555324554443 + }, + { + "auxiliary_loss_clip": 0.01130204, + "auxiliary_loss_mlp": 0.01038167, + "balance_loss_clip": 1.02274156, + "balance_loss_mlp": 1.04354906, + "epoch": 0.24740718472869383, + "flos": 28036932935040.0, + "grad_norm": 2.804779626044085, + "language_loss": 0.753479, + "learning_rate": 3.524328457352734e-06, + "loss": 0.7751627, + "num_input_tokens_seen": 88624925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4115, + "time_per_iteration": 2.5051238536834717 + }, + { + "auxiliary_loss_clip": 0.01052886, + "auxiliary_loss_mlp": 0.01002125, + "balance_loss_clip": 1.00016963, + "balance_loss_mlp": 1.02261877, + "epoch": 0.2474673079813618, + "flos": 68107569408000.0, + "grad_norm": 0.6664049604648837, + "language_loss": 0.58203655, + "learning_rate": 3.5240762972985475e-06, + "loss": 0.60258663, + "num_input_tokens_seen": 88691475, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30273438, + "step": 4116, + "time_per_iteration": 3.172032117843628 + }, + { + "auxiliary_loss_clip": 0.01130845, + "auxiliary_loss_mlp": 0.01035114, + "balance_loss_clip": 1.01992679, + "balance_loss_mlp": 1.04510772, + "epoch": 0.24752743123402976, + "flos": 29462119839360.0, + "grad_norm": 1.6806447251481575, + "language_loss": 0.83616889, + "learning_rate": 3.523824079451235e-06, + "loss": 0.8578285, + "num_input_tokens_seen": 88713425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.85546875, + "step": 4117, + "time_per_iteration": 2.5228748321533203 + }, + { + "auxiliary_loss_clip": 0.01053619, + "auxiliary_loss_mlp": 0.0100274, + "balance_loss_clip": 1.00073707, + "balance_loss_mlp": 1.02337885, + "epoch": 0.24758755448669773, + "flos": 58350459824640.0, + "grad_norm": 0.9069522642789956, + "language_loss": 0.63507527, + "learning_rate": 3.5235718038203602e-06, + "loss": 0.65563887, + "num_input_tokens_seen": 88769995, + "router_z_loss_clip": 0.02001953, + "router_z_loss_mlp": 0.30078125, + "step": 4118, + "time_per_iteration": 2.9459333419799805 + }, + { + "auxiliary_loss_clip": 0.0113153, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02470684, + "balance_loss_mlp": 1.04544902, + "epoch": 0.2476476777393657, + "flos": 20484416494080.0, + "grad_norm": 1.5050779056214143, + "language_loss": 0.79252797, + "learning_rate": 3.523319470415491e-06, + "loss": 0.8142485, + "num_input_tokens_seen": 88789970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4119, + "time_per_iteration": 2.438519239425659 + }, + { + "auxiliary_loss_clip": 0.01129874, + "auxiliary_loss_mlp": 0.01039225, + "balance_loss_clip": 1.02359676, + "balance_loss_mlp": 1.04430819, + "epoch": 0.24770780099203366, + "flos": 20485853038080.0, + "grad_norm": 1.9430586352888408, + "language_loss": 0.73955107, + "learning_rate": 3.5230670792461943e-06, + "loss": 0.76124215, + "num_input_tokens_seen": 88810000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4120, + "time_per_iteration": 2.4728164672851562 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01047049, + "balance_loss_clip": 1.03010893, + "balance_loss_mlp": 1.0446558, + "epoch": 0.24776792424470165, + "flos": 15153405523200.0, + "grad_norm": 3.4886461941998563, + "language_loss": 0.88028777, + "learning_rate": 3.522814630322041e-06, + "loss": 0.90208006, + "num_input_tokens_seen": 88827515, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4121, + "time_per_iteration": 2.4117653369903564 + }, + { + "auxiliary_loss_clip": 0.01134577, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.02102745, + "balance_loss_mlp": 1.04516518, + "epoch": 0.2478280474973696, + "flos": 21725453347200.0, + "grad_norm": 2.7360865086006285, + "language_loss": 0.69088298, + "learning_rate": 3.5225621236526045e-06, + "loss": 0.71260709, + "num_input_tokens_seen": 88845025, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4122, + "time_per_iteration": 2.484830617904663 + }, + { + "auxiliary_loss_clip": 0.0113273, + "auxiliary_loss_mlp": 0.01040601, + "balance_loss_clip": 1.02224231, + "balance_loss_mlp": 1.04380226, + "epoch": 0.24788817075003758, + "flos": 20412200200320.0, + "grad_norm": 2.016808492688271, + "language_loss": 0.80196065, + "learning_rate": 3.5223095592474596e-06, + "loss": 0.82369387, + "num_input_tokens_seen": 88861740, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.890625, + "step": 4123, + "time_per_iteration": 2.43839955329895 + }, + { + "auxiliary_loss_clip": 0.01130784, + "auxiliary_loss_mlp": 0.01041496, + "balance_loss_clip": 1.02620113, + "balance_loss_mlp": 1.04464054, + "epoch": 0.24794829400270554, + "flos": 22594455083520.0, + "grad_norm": 2.3250466211888745, + "language_loss": 0.74919629, + "learning_rate": 3.5220569371161846e-06, + "loss": 0.77091914, + "num_input_tokens_seen": 88879740, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 4124, + "time_per_iteration": 2.4909141063690186 + }, + { + "auxiliary_loss_clip": 0.01127616, + "auxiliary_loss_mlp": 0.01034142, + "balance_loss_clip": 1.01922846, + "balance_loss_mlp": 1.0432241, + "epoch": 0.2480084172553735, + "flos": 39676047615360.0, + "grad_norm": 1.6909299882519486, + "language_loss": 0.73759794, + "learning_rate": 3.521804257268357e-06, + "loss": 0.75921559, + "num_input_tokens_seen": 88904095, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4125, + "time_per_iteration": 2.6068458557128906 + }, + { + "auxiliary_loss_clip": 0.01135393, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.02914929, + "balance_loss_mlp": 1.04383993, + "epoch": 0.24806854050804147, + "flos": 22053712763520.0, + "grad_norm": 2.376019449241759, + "language_loss": 0.69416726, + "learning_rate": 3.5215515197135595e-06, + "loss": 0.71598125, + "num_input_tokens_seen": 88920740, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4126, + "time_per_iteration": 2.4516806602478027 + }, + { + "auxiliary_loss_clip": 0.01130323, + "auxiliary_loss_mlp": 0.01047803, + "balance_loss_clip": 1.03112614, + "balance_loss_mlp": 1.04299593, + "epoch": 0.24812866376070947, + "flos": 15486764670720.0, + "grad_norm": 2.081795572279456, + "language_loss": 0.81602275, + "learning_rate": 3.5212987244613764e-06, + "loss": 0.83780402, + "num_input_tokens_seen": 88938510, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4127, + "time_per_iteration": 2.482492446899414 + }, + { + "auxiliary_loss_clip": 0.01134053, + "auxiliary_loss_mlp": 0.01045575, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04527378, + "epoch": 0.24818878701337743, + "flos": 14757419013120.0, + "grad_norm": 5.2721581441441465, + "language_loss": 0.84604752, + "learning_rate": 3.5210458715213927e-06, + "loss": 0.86784381, + "num_input_tokens_seen": 88955235, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.88671875, + "step": 4128, + "time_per_iteration": 2.4577715396881104 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01043389, + "balance_loss_clip": 1.02779055, + "balance_loss_mlp": 1.04397762, + "epoch": 0.2482489102660454, + "flos": 27089501852160.0, + "grad_norm": 3.598051635390234, + "language_loss": 0.65576231, + "learning_rate": 3.5207929609031973e-06, + "loss": 0.67752188, + "num_input_tokens_seen": 88975210, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8828125, + "step": 4129, + "time_per_iteration": 2.498321294784546 + }, + { + "auxiliary_loss_clip": 0.01130769, + "auxiliary_loss_mlp": 0.01043091, + "balance_loss_clip": 1.02573466, + "balance_loss_mlp": 1.04308498, + "epoch": 0.24830903351871336, + "flos": 26467528924800.0, + "grad_norm": 2.23477186449736, + "language_loss": 0.75251818, + "learning_rate": 3.5205399926163806e-06, + "loss": 0.77425677, + "num_input_tokens_seen": 88996120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4130, + "time_per_iteration": 2.534014940261841 + }, + { + "auxiliary_loss_clip": 0.01132521, + "auxiliary_loss_mlp": 0.01048652, + "balance_loss_clip": 1.03198647, + "balance_loss_mlp": 1.04404271, + "epoch": 0.24836915677138133, + "flos": 10228436870400.0, + "grad_norm": 2.282827015603824, + "language_loss": 0.77323985, + "learning_rate": 3.520286966670535e-06, + "loss": 0.79505157, + "num_input_tokens_seen": 89008685, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4131, + "time_per_iteration": 2.3971383571624756 + }, + { + "auxiliary_loss_clip": 0.011274, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02241063, + "balance_loss_mlp": 1.0428257, + "epoch": 0.2484292800240493, + "flos": 30080429579520.0, + "grad_norm": 1.5452946340590639, + "language_loss": 0.83932686, + "learning_rate": 3.520033883075255e-06, + "loss": 0.86097032, + "num_input_tokens_seen": 89031160, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.84375, + "step": 4132, + "time_per_iteration": 2.552804470062256 + }, + { + "auxiliary_loss_clip": 0.01129759, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02601552, + "balance_loss_mlp": 1.04280567, + "epoch": 0.24848940327671726, + "flos": 13442944803840.0, + "grad_norm": 2.4707160060639857, + "language_loss": 0.71077073, + "learning_rate": 3.5197807418401386e-06, + "loss": 0.73249108, + "num_input_tokens_seen": 89047235, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4133, + "time_per_iteration": 2.40258526802063 + }, + { + "auxiliary_loss_clip": 0.01138495, + "auxiliary_loss_mlp": 0.01044523, + "balance_loss_clip": 1.02486503, + "balance_loss_mlp": 1.0454644, + "epoch": 0.24854952652938525, + "flos": 19970247260160.0, + "grad_norm": 2.206352055564895, + "language_loss": 0.61492884, + "learning_rate": 3.5195275429747834e-06, + "loss": 0.63675898, + "num_input_tokens_seen": 89064790, + "router_z_loss_clip": 0.19726562, + "router_z_loss_mlp": 0.9296875, + "step": 4134, + "time_per_iteration": 2.476027250289917 + }, + { + "auxiliary_loss_clip": 0.01133349, + "auxiliary_loss_mlp": 0.01037132, + "balance_loss_clip": 1.02063298, + "balance_loss_mlp": 1.04393268, + "epoch": 0.24860964978205322, + "flos": 18150187167360.0, + "grad_norm": 2.276340033899988, + "language_loss": 0.78899026, + "learning_rate": 3.5192742864887914e-06, + "loss": 0.81069505, + "num_input_tokens_seen": 89083250, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4135, + "time_per_iteration": 3.9668710231781006 + }, + { + "auxiliary_loss_clip": 0.01136879, + "auxiliary_loss_mlp": 0.01032054, + "balance_loss_clip": 1.01746297, + "balance_loss_mlp": 1.04908156, + "epoch": 0.24866977303472118, + "flos": 11728641329280.0, + "grad_norm": 2.12923907223803, + "language_loss": 0.82729924, + "learning_rate": 3.5190209723917662e-06, + "loss": 0.84898853, + "num_input_tokens_seen": 89100905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.87890625, + "step": 4136, + "time_per_iteration": 3.8651821613311768 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.01045715, + "balance_loss_clip": 1.02919221, + "balance_loss_mlp": 1.04593039, + "epoch": 0.24872989628738915, + "flos": 34823582565120.0, + "grad_norm": 1.7063584090687087, + "language_loss": 0.70454097, + "learning_rate": 3.518767600693314e-06, + "loss": 0.72635514, + "num_input_tokens_seen": 89122630, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4137, + "time_per_iteration": 2.581270456314087 + }, + { + "auxiliary_loss_clip": 0.01135031, + "auxiliary_loss_mlp": 0.01035007, + "balance_loss_clip": 1.0193553, + "balance_loss_mlp": 1.04428291, + "epoch": 0.2487900195400571, + "flos": 13699347062400.0, + "grad_norm": 2.0340803052703236, + "language_loss": 0.66840076, + "learning_rate": 3.518514171403042e-06, + "loss": 0.69010115, + "num_input_tokens_seen": 89141050, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.90625, + "step": 4138, + "time_per_iteration": 2.438858985900879 + }, + { + "auxiliary_loss_clip": 0.01130089, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.01977062, + "balance_loss_mlp": 1.0451256, + "epoch": 0.24885014279272508, + "flos": 25337815297920.0, + "grad_norm": 2.467393625239628, + "language_loss": 0.83937073, + "learning_rate": 3.51826068453056e-06, + "loss": 0.86102176, + "num_input_tokens_seen": 89160810, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4139, + "time_per_iteration": 2.4858012199401855 + }, + { + "auxiliary_loss_clip": 0.01134672, + "auxiliary_loss_mlp": 0.0104164, + "balance_loss_clip": 1.02424788, + "balance_loss_mlp": 1.04416132, + "epoch": 0.24891026604539307, + "flos": 20631434860800.0, + "grad_norm": 1.5320149755260415, + "language_loss": 0.7864905, + "learning_rate": 3.518007140085481e-06, + "loss": 0.80825365, + "num_input_tokens_seen": 89180610, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4140, + "time_per_iteration": 2.4608240127563477 + }, + { + "auxiliary_loss_clip": 0.01058216, + "auxiliary_loss_mlp": 0.01013447, + "balance_loss_clip": 1.01150382, + "balance_loss_mlp": 1.02780879, + "epoch": 0.24897038929806103, + "flos": 66960294030720.0, + "grad_norm": 0.8230161703115366, + "language_loss": 0.60980695, + "learning_rate": 3.51775353807742e-06, + "loss": 0.63052356, + "num_input_tokens_seen": 89241880, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.3046875, + "step": 4141, + "time_per_iteration": 3.1306700706481934 + }, + { + "auxiliary_loss_clip": 0.01136317, + "auxiliary_loss_mlp": 0.01042658, + "balance_loss_clip": 1.02537298, + "balance_loss_mlp": 1.04692519, + "epoch": 0.249030512550729, + "flos": 36392555612160.0, + "grad_norm": 2.804889663143828, + "language_loss": 0.72997624, + "learning_rate": 3.5174998785159913e-06, + "loss": 0.75176597, + "num_input_tokens_seen": 89263340, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 4142, + "time_per_iteration": 2.60341215133667 + }, + { + "auxiliary_loss_clip": 0.011336, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02335465, + "balance_loss_mlp": 1.04601634, + "epoch": 0.24909063580339696, + "flos": 20154576879360.0, + "grad_norm": 2.0852522280017873, + "language_loss": 0.80985868, + "learning_rate": 3.5172461614108157e-06, + "loss": 0.83158958, + "num_input_tokens_seen": 89282870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4143, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01127478, + "auxiliary_loss_mlp": 0.01036024, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.04291701, + "epoch": 0.24915075905606493, + "flos": 26396569607040.0, + "grad_norm": 1.8417531415701045, + "language_loss": 0.5884496, + "learning_rate": 3.5169923867715137e-06, + "loss": 0.61008459, + "num_input_tokens_seen": 89303830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4144, + "time_per_iteration": 2.5253236293792725 + }, + { + "auxiliary_loss_clip": 0.0113091, + "auxiliary_loss_mlp": 0.0103722, + "balance_loss_clip": 1.02135301, + "balance_loss_mlp": 1.04400194, + "epoch": 0.2492108823087329, + "flos": 27527216987520.0, + "grad_norm": 2.2350400575734146, + "language_loss": 0.78882402, + "learning_rate": 3.516738554607708e-06, + "loss": 0.81050527, + "num_input_tokens_seen": 89324350, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4145, + "time_per_iteration": 2.500868797302246 + }, + { + "auxiliary_loss_clip": 0.01141282, + "auxiliary_loss_mlp": 0.01049792, + "balance_loss_clip": 1.02981293, + "balance_loss_mlp": 1.04593182, + "epoch": 0.24927100556140086, + "flos": 16691388111360.0, + "grad_norm": 2.0986803435557415, + "language_loss": 0.65651333, + "learning_rate": 3.5164846649290253e-06, + "loss": 0.678424, + "num_input_tokens_seen": 89342875, + "router_z_loss_clip": 0.19921875, + "router_z_loss_mlp": 0.953125, + "step": 4146, + "time_per_iteration": 2.482405424118042 + }, + { + "auxiliary_loss_clip": 0.01048172, + "auxiliary_loss_mlp": 0.01006681, + "balance_loss_clip": 1.00482178, + "balance_loss_mlp": 1.01849687, + "epoch": 0.24933112881406885, + "flos": 62772464286720.0, + "grad_norm": 3.0854856510049458, + "language_loss": 0.67327654, + "learning_rate": 3.5162307177450915e-06, + "loss": 0.69382501, + "num_input_tokens_seen": 89404925, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.296875, + "step": 4147, + "time_per_iteration": 3.1769258975982666 + }, + { + "auxiliary_loss_clip": 0.01136528, + "auxiliary_loss_mlp": 0.01046222, + "balance_loss_clip": 1.02930617, + "balance_loss_mlp": 1.04857254, + "epoch": 0.24939125206673682, + "flos": 26651894457600.0, + "grad_norm": 2.0368820911017025, + "language_loss": 0.8893261, + "learning_rate": 3.5159767130655366e-06, + "loss": 0.91115361, + "num_input_tokens_seen": 89425090, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4148, + "time_per_iteration": 2.5202085971832275 + }, + { + "auxiliary_loss_clip": 0.0113885, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02649307, + "balance_loss_mlp": 1.04754162, + "epoch": 0.24945137531940478, + "flos": 20704333512960.0, + "grad_norm": 1.8605307211390085, + "language_loss": 0.68053228, + "learning_rate": 3.5157226508999935e-06, + "loss": 0.70237827, + "num_input_tokens_seen": 89442615, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.9140625, + "step": 4149, + "time_per_iteration": 2.455733060836792 + }, + { + "auxiliary_loss_clip": 0.01133288, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02291596, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24951149857207275, + "flos": 23768662682880.0, + "grad_norm": 2.99652773874907, + "language_loss": 0.71235985, + "learning_rate": 3.515468531258095e-06, + "loss": 0.73408163, + "num_input_tokens_seen": 89463025, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4150, + "time_per_iteration": 2.514190196990967 + }, + { + "auxiliary_loss_clip": 0.01134014, + "auxiliary_loss_mlp": 0.01049321, + "balance_loss_clip": 1.03256035, + "balance_loss_mlp": 1.04471052, + "epoch": 0.2495716218247407, + "flos": 15664881237120.0, + "grad_norm": 1.862035570914478, + "language_loss": 0.72954226, + "learning_rate": 3.515214354149478e-06, + "loss": 0.75137556, + "num_input_tokens_seen": 89480225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4151, + "time_per_iteration": 2.4198975563049316 + }, + { + "auxiliary_loss_clip": 0.01141172, + "auxiliary_loss_mlp": 0.01049288, + "balance_loss_clip": 1.03213382, + "balance_loss_mlp": 1.04694724, + "epoch": 0.24963174507740868, + "flos": 24052499953920.0, + "grad_norm": 4.099427504771762, + "language_loss": 0.62436807, + "learning_rate": 3.514960119583781e-06, + "loss": 0.64627266, + "num_input_tokens_seen": 89496985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.94140625, + "step": 4152, + "time_per_iteration": 2.563032865524292 + }, + { + "auxiliary_loss_clip": 0.01131413, + "auxiliary_loss_mlp": 0.01038045, + "balance_loss_clip": 1.02188039, + "balance_loss_mlp": 1.04631066, + "epoch": 0.24969186833007664, + "flos": 21799501234560.0, + "grad_norm": 2.3735561607913596, + "language_loss": 0.77219248, + "learning_rate": 3.514705827570645e-06, + "loss": 0.79388708, + "num_input_tokens_seen": 89514420, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4153, + "time_per_iteration": 2.5059967041015625 + }, + { + "auxiliary_loss_clip": 0.01132512, + "auxiliary_loss_mlp": 0.01040076, + "balance_loss_clip": 1.0242573, + "balance_loss_mlp": 1.04642224, + "epoch": 0.24975199158274464, + "flos": 19938143479680.0, + "grad_norm": 2.164577963489155, + "language_loss": 0.76443702, + "learning_rate": 3.514451478119711e-06, + "loss": 0.78616285, + "num_input_tokens_seen": 89532925, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4154, + "time_per_iteration": 2.48317551612854 + }, + { + "auxiliary_loss_clip": 0.01138697, + "auxiliary_loss_mlp": 0.0104451, + "balance_loss_clip": 1.02586532, + "balance_loss_mlp": 1.04451203, + "epoch": 0.2498121148354126, + "flos": 25338389915520.0, + "grad_norm": 2.2000943153895722, + "language_loss": 0.70740849, + "learning_rate": 3.5141970712406258e-06, + "loss": 0.72924054, + "num_input_tokens_seen": 89552855, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.94140625, + "step": 4155, + "time_per_iteration": 2.498227834701538 + }, + { + "auxiliary_loss_clip": 0.01137147, + "auxiliary_loss_mlp": 0.01050913, + "balance_loss_clip": 1.03379464, + "balance_loss_mlp": 1.04736114, + "epoch": 0.24987223808808057, + "flos": 20558787603840.0, + "grad_norm": 1.8252469259439843, + "language_loss": 0.7499637, + "learning_rate": 3.513942606943036e-06, + "loss": 0.77184427, + "num_input_tokens_seen": 89572830, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4156, + "time_per_iteration": 2.473536729812622 + }, + { + "auxiliary_loss_clip": 0.01132111, + "auxiliary_loss_mlp": 0.01040008, + "balance_loss_clip": 1.0244987, + "balance_loss_mlp": 1.04498601, + "epoch": 0.24993236134074853, + "flos": 19749037351680.0, + "grad_norm": 2.1247768054564333, + "language_loss": 0.76757634, + "learning_rate": 3.513688085236591e-06, + "loss": 0.78929752, + "num_input_tokens_seen": 89590345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4157, + "time_per_iteration": 2.476402759552002 + }, + { + "auxiliary_loss_clip": 0.01135567, + "auxiliary_loss_mlp": 0.01044785, + "balance_loss_clip": 1.02821517, + "balance_loss_mlp": 1.04551077, + "epoch": 0.2499924845934165, + "flos": 18770292587520.0, + "grad_norm": 1.6430173172536622, + "language_loss": 0.81497854, + "learning_rate": 3.513433506130942e-06, + "loss": 0.8367821, + "num_input_tokens_seen": 89610295, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4158, + "time_per_iteration": 2.4706146717071533 + }, + { + "auxiliary_loss_clip": 0.01134661, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01533163, + "balance_loss_mlp": 1.04511046, + "epoch": 0.25005260784608446, + "flos": 16872198197760.0, + "grad_norm": 2.425058111765743, + "language_loss": 0.75573325, + "learning_rate": 3.5131788696357427e-06, + "loss": 0.77739644, + "num_input_tokens_seen": 89627795, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.89453125, + "step": 4159, + "time_per_iteration": 2.447530746459961 + }, + { + "auxiliary_loss_clip": 0.01137664, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.01928759, + "balance_loss_mlp": 1.04643881, + "epoch": 0.2501127310987524, + "flos": 22124923476480.0, + "grad_norm": 2.3851333770237044, + "language_loss": 0.71434534, + "learning_rate": 3.512924175760649e-06, + "loss": 0.73608989, + "num_input_tokens_seen": 89648090, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9140625, + "step": 4160, + "time_per_iteration": 2.4909448623657227 + }, + { + "auxiliary_loss_clip": 0.01045571, + "auxiliary_loss_mlp": 0.01008394, + "balance_loss_clip": 1.0062604, + "balance_loss_mlp": 1.01615632, + "epoch": 0.2501728543514204, + "flos": 69458061980160.0, + "grad_norm": 0.7574731626167057, + "language_loss": 0.56755257, + "learning_rate": 3.5126694245153186e-06, + "loss": 0.58809221, + "num_input_tokens_seen": 89710345, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.29492188, + "step": 4161, + "time_per_iteration": 3.1169064044952393 + }, + { + "auxiliary_loss_clip": 0.01143652, + "auxiliary_loss_mlp": 0.01045602, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04854345, + "epoch": 0.25023297760408836, + "flos": 16289978647680.0, + "grad_norm": 1.822598728260487, + "language_loss": 0.8071059, + "learning_rate": 3.5124146159094125e-06, + "loss": 0.82899845, + "num_input_tokens_seen": 89729390, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.94921875, + "step": 4162, + "time_per_iteration": 2.4679477214813232 + }, + { + "auxiliary_loss_clip": 0.01136921, + "auxiliary_loss_mlp": 0.01039377, + "balance_loss_clip": 1.02212739, + "balance_loss_mlp": 1.04364812, + "epoch": 0.2502931008567563, + "flos": 12237998140800.0, + "grad_norm": 2.543272880301035, + "language_loss": 0.87439299, + "learning_rate": 3.5121597499525927e-06, + "loss": 0.89615595, + "num_input_tokens_seen": 89742805, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.93359375, + "step": 4163, + "time_per_iteration": 2.411324977874756 + }, + { + "auxiliary_loss_clip": 0.01135069, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.02013874, + "balance_loss_mlp": 1.04609334, + "epoch": 0.25035322410942434, + "flos": 23181882105600.0, + "grad_norm": 1.8835095650007205, + "language_loss": 0.83242726, + "learning_rate": 3.5119048266545232e-06, + "loss": 0.85414505, + "num_input_tokens_seen": 89761145, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4164, + "time_per_iteration": 2.4910058975219727 + }, + { + "auxiliary_loss_clip": 0.01130392, + "auxiliary_loss_mlp": 0.01047055, + "balance_loss_clip": 1.03235698, + "balance_loss_mlp": 1.04616356, + "epoch": 0.2504133473620923, + "flos": 20917534688640.0, + "grad_norm": 1.7333709529875627, + "language_loss": 0.74548686, + "learning_rate": 3.5116498460248716e-06, + "loss": 0.76726139, + "num_input_tokens_seen": 89780905, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 4165, + "time_per_iteration": 2.4566714763641357 + }, + { + "auxiliary_loss_clip": 0.01139627, + "auxiliary_loss_mlp": 0.01045895, + "balance_loss_clip": 1.02819216, + "balance_loss_mlp": 1.04689348, + "epoch": 0.2504734706147603, + "flos": 20776549806720.0, + "grad_norm": 5.301488379412456, + "language_loss": 0.74214685, + "learning_rate": 3.5113948080733062e-06, + "loss": 0.76400197, + "num_input_tokens_seen": 89799230, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4166, + "time_per_iteration": 2.462092161178589 + }, + { + "auxiliary_loss_clip": 0.01134276, + "auxiliary_loss_mlp": 0.01045442, + "balance_loss_clip": 1.02898526, + "balance_loss_mlp": 1.04551435, + "epoch": 0.25053359386742824, + "flos": 24349373861760.0, + "grad_norm": 1.9752225074857819, + "language_loss": 0.82011521, + "learning_rate": 3.5111397128094973e-06, + "loss": 0.84191239, + "num_input_tokens_seen": 89818240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4167, + "time_per_iteration": 2.482534885406494 + }, + { + "auxiliary_loss_clip": 0.01134736, + "auxiliary_loss_mlp": 0.01044102, + "balance_loss_clip": 1.0280689, + "balance_loss_mlp": 1.04616201, + "epoch": 0.2505937171200962, + "flos": 21214336769280.0, + "grad_norm": 2.42679689243218, + "language_loss": 0.79602242, + "learning_rate": 3.51088456024312e-06, + "loss": 0.81781083, + "num_input_tokens_seen": 89834485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4168, + "time_per_iteration": 2.463700532913208 + }, + { + "auxiliary_loss_clip": 0.01139283, + "auxiliary_loss_mlp": 0.01042051, + "balance_loss_clip": 1.02353752, + "balance_loss_mlp": 1.04523754, + "epoch": 0.25065384037276417, + "flos": 41427231379200.0, + "grad_norm": 2.966293758738445, + "language_loss": 0.70029891, + "learning_rate": 3.510629350383849e-06, + "loss": 0.72211224, + "num_input_tokens_seen": 89855645, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9375, + "step": 4169, + "time_per_iteration": 2.6148693561553955 + }, + { + "auxiliary_loss_clip": 0.01131562, + "auxiliary_loss_mlp": 0.0104538, + "balance_loss_clip": 1.02926338, + "balance_loss_mlp": 1.0446701, + "epoch": 0.25071396362543213, + "flos": 26102389219200.0, + "grad_norm": 1.8138505316100015, + "language_loss": 0.77564663, + "learning_rate": 3.510374083241361e-06, + "loss": 0.79741603, + "num_input_tokens_seen": 89874895, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4170, + "time_per_iteration": 2.522921562194824 + }, + { + "auxiliary_loss_clip": 0.01137572, + "auxiliary_loss_mlp": 0.01043275, + "balance_loss_clip": 1.02731323, + "balance_loss_mlp": 1.04796529, + "epoch": 0.2507740868781001, + "flos": 19098982967040.0, + "grad_norm": 2.4512078878938404, + "language_loss": 0.76246989, + "learning_rate": 3.5101187588253368e-06, + "loss": 0.78427839, + "num_input_tokens_seen": 89891700, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8984375, + "step": 4171, + "time_per_iteration": 2.4322195053100586 + }, + { + "auxiliary_loss_clip": 0.01046694, + "auxiliary_loss_mlp": 0.01021172, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.01739454, + "epoch": 0.25083421013076806, + "flos": 64341868296960.0, + "grad_norm": 0.8497756598481241, + "language_loss": 0.60047227, + "learning_rate": 3.509863377145458e-06, + "loss": 0.62115091, + "num_input_tokens_seen": 89955775, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29296875, + "step": 4172, + "time_per_iteration": 3.1110994815826416 + }, + { + "auxiliary_loss_clip": 0.01137052, + "auxiliary_loss_mlp": 0.01042686, + "balance_loss_clip": 1.02567458, + "balance_loss_mlp": 1.04652381, + "epoch": 0.25089433338343603, + "flos": 24279599692800.0, + "grad_norm": 1.4442293166181488, + "language_loss": 0.78647727, + "learning_rate": 3.509607938211409e-06, + "loss": 0.80827463, + "num_input_tokens_seen": 89977150, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90625, + "step": 4173, + "time_per_iteration": 2.481062889099121 + }, + { + "auxiliary_loss_clip": 0.01140203, + "auxiliary_loss_mlp": 0.01046542, + "balance_loss_clip": 1.0300796, + "balance_loss_mlp": 1.05017626, + "epoch": 0.250954456636104, + "flos": 14721472477440.0, + "grad_norm": 2.4202296115923883, + "language_loss": 0.83543748, + "learning_rate": 3.509352442032875e-06, + "loss": 0.85730493, + "num_input_tokens_seen": 89994925, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4174, + "time_per_iteration": 2.4566147327423096 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02299595, + "balance_loss_mlp": 1.04786515, + "epoch": 0.25101457988877196, + "flos": 22273593868800.0, + "grad_norm": 2.0903096624482624, + "language_loss": 0.71291864, + "learning_rate": 3.509096888619545e-06, + "loss": 0.73470795, + "num_input_tokens_seen": 90013235, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90234375, + "step": 4175, + "time_per_iteration": 2.4616360664367676 + }, + { + "auxiliary_loss_clip": 0.01138348, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.01866269, + "balance_loss_mlp": 1.0460453, + "epoch": 0.2510747031414399, + "flos": 25188929424000.0, + "grad_norm": 2.247188920587568, + "language_loss": 0.80564427, + "learning_rate": 3.50884127798111e-06, + "loss": 0.82739055, + "num_input_tokens_seen": 90032150, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4176, + "time_per_iteration": 2.525686740875244 + }, + { + "auxiliary_loss_clip": 0.01138723, + "auxiliary_loss_mlp": 0.01044107, + "balance_loss_clip": 1.02553427, + "balance_loss_mlp": 1.04782593, + "epoch": 0.25113482639410795, + "flos": 20704189858560.0, + "grad_norm": 2.362252442770041, + "language_loss": 0.83099151, + "learning_rate": 3.5085856101272623e-06, + "loss": 0.8528198, + "num_input_tokens_seen": 90049085, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.90625, + "step": 4177, + "time_per_iteration": 5.424759387969971 + }, + { + "auxiliary_loss_clip": 0.01135735, + "auxiliary_loss_mlp": 0.01043794, + "balance_loss_clip": 1.02675891, + "balance_loss_mlp": 1.04777622, + "epoch": 0.2511949496467759, + "flos": 21506936958720.0, + "grad_norm": 2.9753996759374846, + "language_loss": 0.8209883, + "learning_rate": 3.508329885067698e-06, + "loss": 0.84278357, + "num_input_tokens_seen": 90067695, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87890625, + "step": 4178, + "time_per_iteration": 2.451418161392212 + }, + { + "auxiliary_loss_clip": 0.01130203, + "auxiliary_loss_mlp": 0.01042984, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04445124, + "epoch": 0.2512550728994439, + "flos": 20701999128960.0, + "grad_norm": 2.6671564243834505, + "language_loss": 0.75406277, + "learning_rate": 3.508074102812112e-06, + "loss": 0.77579463, + "num_input_tokens_seen": 90083890, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4179, + "time_per_iteration": 2.4710347652435303 + }, + { + "auxiliary_loss_clip": 0.01135846, + "auxiliary_loss_mlp": 0.01048206, + "balance_loss_clip": 1.03050375, + "balance_loss_mlp": 1.04526711, + "epoch": 0.25131519615211184, + "flos": 18478626151680.0, + "grad_norm": 2.189208999533023, + "language_loss": 0.70452499, + "learning_rate": 3.507818263370206e-06, + "loss": 0.72636557, + "num_input_tokens_seen": 90100995, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.90625, + "step": 4180, + "time_per_iteration": 2.433922290802002 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02485168, + "balance_loss_mlp": 1.04449701, + "epoch": 0.2513753194047798, + "flos": 20484955198080.0, + "grad_norm": 2.0603947372587244, + "language_loss": 0.85379761, + "learning_rate": 3.5075623667516796e-06, + "loss": 0.875539, + "num_input_tokens_seen": 90120365, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4181, + "time_per_iteration": 2.4513771533966064 + }, + { + "auxiliary_loss_clip": 0.01135555, + "auxiliary_loss_mlp": 0.01042648, + "balance_loss_clip": 1.02608991, + "balance_loss_mlp": 1.0464716, + "epoch": 0.25143544265744777, + "flos": 37670077704960.0, + "grad_norm": 1.9568163341605829, + "language_loss": 0.67662674, + "learning_rate": 3.507306412966238e-06, + "loss": 0.69840884, + "num_input_tokens_seen": 90142610, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4182, + "time_per_iteration": 2.588513135910034 + }, + { + "auxiliary_loss_clip": 0.01047089, + "auxiliary_loss_mlp": 0.01008874, + "balance_loss_clip": 1.00675201, + "balance_loss_mlp": 1.01742792, + "epoch": 0.25149556591011574, + "flos": 69367457923200.0, + "grad_norm": 0.8484678873575391, + "language_loss": 0.70098495, + "learning_rate": 3.5070504020235853e-06, + "loss": 0.72154456, + "num_input_tokens_seen": 90200555, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.296875, + "step": 4183, + "time_per_iteration": 3.0990090370178223 + }, + { + "auxiliary_loss_clip": 0.01129729, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02088118, + "balance_loss_mlp": 1.04070854, + "epoch": 0.2515556891627837, + "flos": 13990402967040.0, + "grad_norm": 1.7162399200173233, + "language_loss": 0.7452544, + "learning_rate": 3.506794333933431e-06, + "loss": 0.76694012, + "num_input_tokens_seen": 90218120, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4184, + "time_per_iteration": 2.4367544651031494 + }, + { + "auxiliary_loss_clip": 0.01137253, + "auxiliary_loss_mlp": 0.01045885, + "balance_loss_clip": 1.02888608, + "balance_loss_mlp": 1.04825735, + "epoch": 0.25161581241545167, + "flos": 22163527618560.0, + "grad_norm": 1.9130230292696613, + "language_loss": 0.82872695, + "learning_rate": 3.506538208705484e-06, + "loss": 0.85055834, + "num_input_tokens_seen": 90236790, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4185, + "time_per_iteration": 2.4604692459106445 + }, + { + "auxiliary_loss_clip": 0.01047588, + "auxiliary_loss_mlp": 0.01003961, + "balance_loss_clip": 1.00194633, + "balance_loss_mlp": 1.01820421, + "epoch": 0.25167593566811963, + "flos": 69358407696000.0, + "grad_norm": 0.7885291752286397, + "language_loss": 0.61534387, + "learning_rate": 3.5062820263494574e-06, + "loss": 0.63585937, + "num_input_tokens_seen": 90297070, + "router_z_loss_clip": 0.0201416, + "router_z_loss_mlp": 0.29296875, + "step": 4186, + "time_per_iteration": 2.9629924297332764 + }, + { + "auxiliary_loss_clip": 0.01133243, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02320647, + "balance_loss_mlp": 1.04432559, + "epoch": 0.2517360589207876, + "flos": 13261452359040.0, + "grad_norm": 2.1070381215060308, + "language_loss": 0.79260957, + "learning_rate": 3.5060257868750656e-06, + "loss": 0.81435084, + "num_input_tokens_seen": 90315255, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4187, + "time_per_iteration": 2.454988479614258 + }, + { + "auxiliary_loss_clip": 0.01136483, + "auxiliary_loss_mlp": 0.01049456, + "balance_loss_clip": 1.03235006, + "balance_loss_mlp": 1.04733062, + "epoch": 0.25179618217345556, + "flos": 20376828282240.0, + "grad_norm": 1.5254881034867085, + "language_loss": 0.79854965, + "learning_rate": 3.5057694902920244e-06, + "loss": 0.82040906, + "num_input_tokens_seen": 90334990, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4188, + "time_per_iteration": 2.4807493686676025 + }, + { + "auxiliary_loss_clip": 0.01135001, + "auxiliary_loss_mlp": 0.01046554, + "balance_loss_clip": 1.03022218, + "balance_loss_mlp": 1.04635882, + "epoch": 0.25185630542612353, + "flos": 27664718250240.0, + "grad_norm": 1.727912733373243, + "language_loss": 0.74509478, + "learning_rate": 3.5055131366100534e-06, + "loss": 0.76691031, + "num_input_tokens_seen": 90351825, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.88671875, + "step": 4189, + "time_per_iteration": 2.4887545108795166 + }, + { + "auxiliary_loss_clip": 0.01131737, + "auxiliary_loss_mlp": 0.01044524, + "balance_loss_clip": 1.02914619, + "balance_loss_mlp": 1.04616165, + "epoch": 0.25191642867879155, + "flos": 20996430912000.0, + "grad_norm": 1.957544272457229, + "language_loss": 0.84454727, + "learning_rate": 3.5052567258388745e-06, + "loss": 0.86630988, + "num_input_tokens_seen": 90369860, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4190, + "time_per_iteration": 2.4629735946655273 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01044478, + "balance_loss_clip": 1.02633452, + "balance_loss_mlp": 1.04529381, + "epoch": 0.2519765519314595, + "flos": 21105671149440.0, + "grad_norm": 1.9468541382775664, + "language_loss": 0.75593925, + "learning_rate": 3.5050002579882082e-06, + "loss": 0.77772641, + "num_input_tokens_seen": 90389245, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.88671875, + "step": 4191, + "time_per_iteration": 2.451493263244629 + }, + { + "auxiliary_loss_clip": 0.01042669, + "auxiliary_loss_mlp": 0.0101771, + "balance_loss_clip": 1.01577878, + "balance_loss_mlp": 1.01320672, + "epoch": 0.2520366751841275, + "flos": 62744993360640.0, + "grad_norm": 0.7165761170014687, + "language_loss": 0.57155997, + "learning_rate": 3.5047437330677823e-06, + "loss": 0.59216374, + "num_input_tokens_seen": 90456735, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.29492188, + "step": 4192, + "time_per_iteration": 3.1455304622650146 + }, + { + "auxiliary_loss_clip": 0.01132992, + "auxiliary_loss_mlp": 0.01042104, + "balance_loss_clip": 1.02593958, + "balance_loss_mlp": 1.04640245, + "epoch": 0.25209679843679544, + "flos": 22230716008320.0, + "grad_norm": 2.0419031963399434, + "language_loss": 0.76306844, + "learning_rate": 3.504487151087323e-06, + "loss": 0.78481936, + "num_input_tokens_seen": 90474165, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4193, + "time_per_iteration": 2.46201491355896 + }, + { + "auxiliary_loss_clip": 0.01136471, + "auxiliary_loss_mlp": 0.01048175, + "balance_loss_clip": 1.03115189, + "balance_loss_mlp": 1.04506373, + "epoch": 0.2521569216894634, + "flos": 12166643773440.0, + "grad_norm": 2.1192679618590007, + "language_loss": 0.84261906, + "learning_rate": 3.5042305120565598e-06, + "loss": 0.86446548, + "num_input_tokens_seen": 90491660, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4194, + "time_per_iteration": 2.4525146484375 + }, + { + "auxiliary_loss_clip": 0.01138489, + "auxiliary_loss_mlp": 0.01049416, + "balance_loss_clip": 1.03404951, + "balance_loss_mlp": 1.04636192, + "epoch": 0.2522170449421314, + "flos": 23699786353920.0, + "grad_norm": 1.488794247862028, + "language_loss": 0.88176262, + "learning_rate": 3.5039738159852253e-06, + "loss": 0.90364158, + "num_input_tokens_seen": 90514025, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.921875, + "step": 4195, + "time_per_iteration": 2.507788896560669 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042605, + "balance_loss_clip": 1.02323329, + "balance_loss_mlp": 1.04540074, + "epoch": 0.25227716819479934, + "flos": 20955456472320.0, + "grad_norm": 2.8940350432545787, + "language_loss": 0.85288155, + "learning_rate": 3.503717062883053e-06, + "loss": 0.87466824, + "num_input_tokens_seen": 90533530, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.90625, + "step": 4196, + "time_per_iteration": 2.4843344688415527 + }, + { + "auxiliary_loss_clip": 0.01135455, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02644312, + "balance_loss_mlp": 1.0454607, + "epoch": 0.2523372914474673, + "flos": 23331342597120.0, + "grad_norm": 1.6596186150335415, + "language_loss": 0.83368516, + "learning_rate": 3.5034602527597786e-06, + "loss": 0.85546911, + "num_input_tokens_seen": 90554025, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4197, + "time_per_iteration": 2.480834484100342 + }, + { + "auxiliary_loss_clip": 0.01139173, + "auxiliary_loss_mlp": 0.01047624, + "balance_loss_clip": 1.02840698, + "balance_loss_mlp": 1.04775643, + "epoch": 0.25239741470013527, + "flos": 36970321875840.0, + "grad_norm": 2.7573342641631093, + "language_loss": 0.72406292, + "learning_rate": 3.5032033856251405e-06, + "loss": 0.74593097, + "num_input_tokens_seen": 90576930, + "router_z_loss_clip": 0.19140625, + "router_z_loss_mlp": 0.9140625, + "step": 4198, + "time_per_iteration": 2.6081368923187256 + }, + { + "auxiliary_loss_clip": 0.01139571, + "auxiliary_loss_mlp": 0.01052953, + "balance_loss_clip": 1.03469038, + "balance_loss_mlp": 1.0462662, + "epoch": 0.25245753795280323, + "flos": 18515757836160.0, + "grad_norm": 1.9511850390779815, + "language_loss": 0.76798427, + "learning_rate": 3.50294646148888e-06, + "loss": 0.7899096, + "num_input_tokens_seen": 90595710, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.93359375, + "step": 4199, + "time_per_iteration": 2.463322162628174 + }, + { + "auxiliary_loss_clip": 0.01137047, + "auxiliary_loss_mlp": 0.01039237, + "balance_loss_clip": 1.02334595, + "balance_loss_mlp": 1.04600453, + "epoch": 0.2525176612054712, + "flos": 32344884737280.0, + "grad_norm": 1.6881838085079777, + "language_loss": 0.727651, + "learning_rate": 3.502689480360739e-06, + "loss": 0.74941385, + "num_input_tokens_seen": 90617945, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.91015625, + "step": 4200, + "time_per_iteration": 2.586298942565918 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01047981, + "balance_loss_clip": 1.03206062, + "balance_loss_mlp": 1.04300654, + "epoch": 0.25257778445813917, + "flos": 45258217459200.0, + "grad_norm": 1.7166145531144803, + "language_loss": 0.82271791, + "learning_rate": 3.5024324422504616e-06, + "loss": 0.84454548, + "num_input_tokens_seen": 90640855, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.91796875, + "step": 4201, + "time_per_iteration": 2.6430721282958984 + }, + { + "auxiliary_loss_clip": 0.01138395, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.02960861, + "balance_loss_mlp": 1.04680324, + "epoch": 0.25263790771080713, + "flos": 23367791923200.0, + "grad_norm": 1.8945534984036327, + "language_loss": 0.74844849, + "learning_rate": 3.5021753471677965e-06, + "loss": 0.77029681, + "num_input_tokens_seen": 90661350, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4202, + "time_per_iteration": 2.477376699447632 + }, + { + "auxiliary_loss_clip": 0.01133899, + "auxiliary_loss_mlp": 0.01041807, + "balance_loss_clip": 1.02545786, + "balance_loss_mlp": 1.04550529, + "epoch": 0.25269803096347515, + "flos": 18515039564160.0, + "grad_norm": 1.8769942277842264, + "language_loss": 0.73058856, + "learning_rate": 3.501918195122491e-06, + "loss": 0.75234556, + "num_input_tokens_seen": 90680540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 4203, + "time_per_iteration": 2.4526968002319336 + }, + { + "auxiliary_loss_clip": 0.01134593, + "auxiliary_loss_mlp": 0.01040695, + "balance_loss_clip": 1.02403569, + "balance_loss_mlp": 1.04434335, + "epoch": 0.2527581542161431, + "flos": 24610552629120.0, + "grad_norm": 1.7217444479200419, + "language_loss": 0.77377844, + "learning_rate": 3.501660986124297e-06, + "loss": 0.79553127, + "num_input_tokens_seen": 90703460, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.90234375, + "step": 4204, + "time_per_iteration": 2.540573835372925 + }, + { + "auxiliary_loss_clip": 0.01135598, + "auxiliary_loss_mlp": 0.01051513, + "balance_loss_clip": 1.03463292, + "balance_loss_mlp": 1.04443574, + "epoch": 0.2528182774688111, + "flos": 12641275111680.0, + "grad_norm": 3.2226665017353655, + "language_loss": 0.72443974, + "learning_rate": 3.5014037201829684e-06, + "loss": 0.74631095, + "num_input_tokens_seen": 90718815, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.9140625, + "step": 4205, + "time_per_iteration": 2.405823230743408 + }, + { + "auxiliary_loss_clip": 0.01131667, + "auxiliary_loss_mlp": 0.01038371, + "balance_loss_clip": 1.02304697, + "balance_loss_mlp": 1.04673433, + "epoch": 0.25287840072147905, + "flos": 46936789879680.0, + "grad_norm": 1.4419344159614245, + "language_loss": 0.75674903, + "learning_rate": 3.50114639730826e-06, + "loss": 0.77844942, + "num_input_tokens_seen": 90742125, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4206, + "time_per_iteration": 2.7117254734039307 + }, + { + "auxiliary_loss_clip": 0.01134608, + "auxiliary_loss_mlp": 0.01041465, + "balance_loss_clip": 1.02502584, + "balance_loss_mlp": 1.04381466, + "epoch": 0.252938523974147, + "flos": 18879712392960.0, + "grad_norm": 1.8459801280493204, + "language_loss": 0.79013956, + "learning_rate": 3.5008890175099296e-06, + "loss": 0.81190026, + "num_input_tokens_seen": 90760785, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4207, + "time_per_iteration": 2.4338433742523193 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01042915, + "balance_loss_clip": 1.02688169, + "balance_loss_mlp": 1.04521704, + "epoch": 0.252998647226815, + "flos": 21434720664960.0, + "grad_norm": 1.5263501886522268, + "language_loss": 0.76010746, + "learning_rate": 3.5006315807977375e-06, + "loss": 0.78184819, + "num_input_tokens_seen": 90780045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4208, + "time_per_iteration": 2.4712774753570557 + }, + { + "auxiliary_loss_clip": 0.01131243, + "auxiliary_loss_mlp": 0.01042204, + "balance_loss_clip": 1.02559781, + "balance_loss_mlp": 1.04407811, + "epoch": 0.25305877047948294, + "flos": 25442171285760.0, + "grad_norm": 1.8494822470113228, + "language_loss": 0.6965062, + "learning_rate": 3.5003740871814456e-06, + "loss": 0.71824062, + "num_input_tokens_seen": 90797980, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87109375, + "step": 4209, + "time_per_iteration": 2.4723262786865234 + }, + { + "auxiliary_loss_clip": 0.01046036, + "auxiliary_loss_mlp": 0.00999993, + "balance_loss_clip": 0.99819291, + "balance_loss_mlp": 1.01643014, + "epoch": 0.2531188937321509, + "flos": 60185603629440.0, + "grad_norm": 0.7581785291884388, + "language_loss": 0.55080217, + "learning_rate": 3.5001165366708175e-06, + "loss": 0.57126248, + "num_input_tokens_seen": 90864865, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.296875, + "step": 4210, + "time_per_iteration": 3.141958236694336 + }, + { + "auxiliary_loss_clip": 0.0113523, + "auxiliary_loss_mlp": 0.01034659, + "balance_loss_clip": 1.01853585, + "balance_loss_mlp": 1.04541481, + "epoch": 0.25317901698481887, + "flos": 19682387665920.0, + "grad_norm": 2.0581011511690606, + "language_loss": 0.8021341, + "learning_rate": 3.4998589292756204e-06, + "loss": 0.82383299, + "num_input_tokens_seen": 90882885, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8984375, + "step": 4211, + "time_per_iteration": 2.4423909187316895 + }, + { + "auxiliary_loss_clip": 0.01128499, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02402079, + "balance_loss_mlp": 1.04284227, + "epoch": 0.25323914023748684, + "flos": 24424355502720.0, + "grad_norm": 1.6375033978461933, + "language_loss": 0.78310406, + "learning_rate": 3.499601265005622e-06, + "loss": 0.80478293, + "num_input_tokens_seen": 90902985, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4212, + "time_per_iteration": 2.535416841506958 + }, + { + "auxiliary_loss_clip": 0.01131331, + "auxiliary_loss_mlp": 0.01040125, + "balance_loss_clip": 1.02356696, + "balance_loss_mlp": 1.04314673, + "epoch": 0.2532992634901548, + "flos": 25447450584960.0, + "grad_norm": 2.0206536972721088, + "language_loss": 0.53393918, + "learning_rate": 3.4993435438705938e-06, + "loss": 0.55565375, + "num_input_tokens_seen": 90923550, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4213, + "time_per_iteration": 2.488844871520996 + }, + { + "auxiliary_loss_clip": 0.01132972, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02566385, + "balance_loss_mlp": 1.04508567, + "epoch": 0.25335938674282277, + "flos": 18880538405760.0, + "grad_norm": 2.6682600080383816, + "language_loss": 0.65329081, + "learning_rate": 3.499085765880308e-06, + "loss": 0.67504859, + "num_input_tokens_seen": 90943260, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4214, + "time_per_iteration": 2.478422164916992 + }, + { + "auxiliary_loss_clip": 0.01043385, + "auxiliary_loss_mlp": 0.0100812, + "balance_loss_clip": 1.00630808, + "balance_loss_mlp": 1.0142169, + "epoch": 0.25341950999549073, + "flos": 53062649936640.0, + "grad_norm": 0.8479929036578698, + "language_loss": 0.58049941, + "learning_rate": 3.4988279310445396e-06, + "loss": 0.60101438, + "num_input_tokens_seen": 90996295, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29296875, + "step": 4215, + "time_per_iteration": 2.824084997177124 + }, + { + "auxiliary_loss_clip": 0.01133433, + "auxiliary_loss_mlp": 0.01043479, + "balance_loss_clip": 1.02636075, + "balance_loss_mlp": 1.04583967, + "epoch": 0.2534796332481587, + "flos": 39020247054720.0, + "grad_norm": 1.7693463876532338, + "language_loss": 0.83949232, + "learning_rate": 3.498570039373066e-06, + "loss": 0.86126143, + "num_input_tokens_seen": 91017545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.875, + "step": 4216, + "time_per_iteration": 2.650329828262329 + }, + { + "auxiliary_loss_clip": 0.01134428, + "auxiliary_loss_mlp": 0.01041784, + "balance_loss_clip": 1.02504706, + "balance_loss_mlp": 1.04571652, + "epoch": 0.2535397565008267, + "flos": 23586990670080.0, + "grad_norm": 1.7652170119003572, + "language_loss": 0.80028123, + "learning_rate": 3.498312090875666e-06, + "loss": 0.82204342, + "num_input_tokens_seen": 91037715, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4217, + "time_per_iteration": 2.49381160736084 + }, + { + "auxiliary_loss_clip": 0.01129632, + "auxiliary_loss_mlp": 0.01039348, + "balance_loss_clip": 1.02422011, + "balance_loss_mlp": 1.04193234, + "epoch": 0.2535998797534947, + "flos": 19281373251840.0, + "grad_norm": 2.1701414828965464, + "language_loss": 0.75014293, + "learning_rate": 3.4980540855621218e-06, + "loss": 0.7718327, + "num_input_tokens_seen": 91055295, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87890625, + "step": 4218, + "time_per_iteration": 2.4794864654541016 + }, + { + "auxiliary_loss_clip": 0.01135591, + "auxiliary_loss_mlp": 0.01041436, + "balance_loss_clip": 1.02462721, + "balance_loss_mlp": 1.04470503, + "epoch": 0.25366000300616265, + "flos": 24024382583040.0, + "grad_norm": 1.8718582993796022, + "language_loss": 0.74483025, + "learning_rate": 3.4977960234422167e-06, + "loss": 0.76660055, + "num_input_tokens_seen": 91075485, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4219, + "time_per_iteration": 5.428370952606201 + }, + { + "auxiliary_loss_clip": 0.01137942, + "auxiliary_loss_mlp": 0.01052625, + "balance_loss_clip": 1.0351491, + "balance_loss_mlp": 1.04695058, + "epoch": 0.2537201262588306, + "flos": 16289368116480.0, + "grad_norm": 2.1507448030921057, + "language_loss": 0.81194967, + "learning_rate": 3.497537904525736e-06, + "loss": 0.83385527, + "num_input_tokens_seen": 91093620, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4220, + "time_per_iteration": 2.454045534133911 + }, + { + "auxiliary_loss_clip": 0.01134951, + "auxiliary_loss_mlp": 0.01047743, + "balance_loss_clip": 1.03007603, + "balance_loss_mlp": 1.04596126, + "epoch": 0.2537802495114986, + "flos": 23294677789440.0, + "grad_norm": 2.058400170489012, + "language_loss": 0.70873475, + "learning_rate": 3.497279728822468e-06, + "loss": 0.73056173, + "num_input_tokens_seen": 91114110, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.890625, + "step": 4221, + "time_per_iteration": 2.4728429317474365 + }, + { + "auxiliary_loss_clip": 0.01134228, + "auxiliary_loss_mlp": 0.01039832, + "balance_loss_clip": 1.02309537, + "balance_loss_mlp": 1.0444454, + "epoch": 0.25384037276416654, + "flos": 17639142416640.0, + "grad_norm": 2.3290205392002847, + "language_loss": 0.62039649, + "learning_rate": 3.497021496342202e-06, + "loss": 0.64213717, + "num_input_tokens_seen": 91133135, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4222, + "time_per_iteration": 2.4465436935424805 + }, + { + "auxiliary_loss_clip": 0.01137839, + "auxiliary_loss_mlp": 0.01052178, + "balance_loss_clip": 1.0352385, + "balance_loss_mlp": 1.04635429, + "epoch": 0.2539004960168345, + "flos": 21507044699520.0, + "grad_norm": 1.6514367228652884, + "language_loss": 0.74686599, + "learning_rate": 3.496763207094731e-06, + "loss": 0.76876616, + "num_input_tokens_seen": 91151805, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4223, + "time_per_iteration": 2.449887275695801 + }, + { + "auxiliary_loss_clip": 0.01134875, + "auxiliary_loss_mlp": 0.01035973, + "balance_loss_clip": 1.02001095, + "balance_loss_mlp": 1.04763556, + "epoch": 0.2539606192695025, + "flos": 23950909313280.0, + "grad_norm": 1.7274606282993847, + "language_loss": 0.79782087, + "learning_rate": 3.49650486108985e-06, + "loss": 0.81952935, + "num_input_tokens_seen": 91172270, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4224, + "time_per_iteration": 2.4809348583221436 + }, + { + "auxiliary_loss_clip": 0.01129812, + "auxiliary_loss_mlp": 0.01043453, + "balance_loss_clip": 1.02668035, + "balance_loss_mlp": 1.04306865, + "epoch": 0.25402074252217044, + "flos": 24169784837760.0, + "grad_norm": 1.7388314634599362, + "language_loss": 0.77813148, + "learning_rate": 3.496246458337354e-06, + "loss": 0.79986417, + "num_input_tokens_seen": 91192080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4225, + "time_per_iteration": 2.4813735485076904 + }, + { + "auxiliary_loss_clip": 0.01135622, + "auxiliary_loss_mlp": 0.01054065, + "balance_loss_clip": 1.03661263, + "balance_loss_mlp": 1.04603362, + "epoch": 0.2540808657748384, + "flos": 22303758314880.0, + "grad_norm": 1.6070040517314534, + "language_loss": 0.84763634, + "learning_rate": 3.4959879988470426e-06, + "loss": 0.86953318, + "num_input_tokens_seen": 91211450, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.89453125, + "step": 4226, + "time_per_iteration": 2.4583990573883057 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.0104498, + "balance_loss_clip": 1.0277667, + "balance_loss_mlp": 1.04317141, + "epoch": 0.25414098902750637, + "flos": 27599541022080.0, + "grad_norm": 2.4872704745527168, + "language_loss": 0.70759654, + "learning_rate": 3.4957294826287164e-06, + "loss": 0.72934765, + "num_input_tokens_seen": 91231835, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8671875, + "step": 4227, + "time_per_iteration": 2.532057762145996 + }, + { + "auxiliary_loss_clip": 0.01041509, + "auxiliary_loss_mlp": 0.01000975, + "balance_loss_clip": 0.9989962, + "balance_loss_mlp": 1.01186037, + "epoch": 0.25420111228017434, + "flos": 58170834887040.0, + "grad_norm": 0.9701035361715339, + "language_loss": 0.61865914, + "learning_rate": 3.4954709096921785e-06, + "loss": 0.63908398, + "num_input_tokens_seen": 91288755, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.296875, + "step": 4228, + "time_per_iteration": 2.9040682315826416 + }, + { + "auxiliary_loss_clip": 0.01136332, + "auxiliary_loss_mlp": 0.01037582, + "balance_loss_clip": 1.02026105, + "balance_loss_mlp": 1.04564357, + "epoch": 0.2542612355328423, + "flos": 11464409905920.0, + "grad_norm": 4.885618231754604, + "language_loss": 0.86024547, + "learning_rate": 3.4952122800472336e-06, + "loss": 0.88198459, + "num_input_tokens_seen": 91302485, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.90625, + "step": 4229, + "time_per_iteration": 2.404157876968384 + }, + { + "auxiliary_loss_clip": 0.0113505, + "auxiliary_loss_mlp": 0.01044312, + "balance_loss_clip": 1.02696753, + "balance_loss_mlp": 1.0466435, + "epoch": 0.2543213587855103, + "flos": 22965879669120.0, + "grad_norm": 1.8862111092995248, + "language_loss": 0.77280557, + "learning_rate": 3.4949535937036892e-06, + "loss": 0.79459918, + "num_input_tokens_seen": 91321120, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4230, + "time_per_iteration": 2.4956207275390625 + }, + { + "auxiliary_loss_clip": 0.01133757, + "auxiliary_loss_mlp": 0.01046935, + "balance_loss_clip": 1.02980483, + "balance_loss_mlp": 1.04598594, + "epoch": 0.2543814820381783, + "flos": 18253178438400.0, + "grad_norm": 1.9381647251913205, + "language_loss": 0.75116754, + "learning_rate": 3.4946948506713544e-06, + "loss": 0.77297449, + "num_input_tokens_seen": 91338575, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.87890625, + "step": 4231, + "time_per_iteration": 2.4570302963256836 + }, + { + "auxiliary_loss_clip": 0.0113225, + "auxiliary_loss_mlp": 0.01038768, + "balance_loss_clip": 1.02253127, + "balance_loss_mlp": 1.04484463, + "epoch": 0.25444160529084625, + "flos": 15632705629440.0, + "grad_norm": 2.3236339630790916, + "language_loss": 0.74055511, + "learning_rate": 3.4944360509600416e-06, + "loss": 0.76226532, + "num_input_tokens_seen": 91357355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4232, + "time_per_iteration": 2.4537932872772217 + }, + { + "auxiliary_loss_clip": 0.01134838, + "auxiliary_loss_mlp": 0.01041691, + "balance_loss_clip": 1.02412581, + "balance_loss_mlp": 1.04658151, + "epoch": 0.2545017285435142, + "flos": 24601610142720.0, + "grad_norm": 1.8521853851823955, + "language_loss": 0.86557174, + "learning_rate": 3.4941771945795637e-06, + "loss": 0.88733703, + "num_input_tokens_seen": 91376515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4233, + "time_per_iteration": 2.4943323135375977 + }, + { + "auxiliary_loss_clip": 0.01125532, + "auxiliary_loss_mlp": 0.01040876, + "balance_loss_clip": 1.02570057, + "balance_loss_mlp": 1.04215169, + "epoch": 0.2545618517961822, + "flos": 24679069822080.0, + "grad_norm": 1.5280608213400515, + "language_loss": 0.74841732, + "learning_rate": 3.493918281539737e-06, + "loss": 0.7700814, + "num_input_tokens_seen": 91397595, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 4234, + "time_per_iteration": 2.541349172592163 + }, + { + "auxiliary_loss_clip": 0.01133471, + "auxiliary_loss_mlp": 0.01042663, + "balance_loss_clip": 1.02661681, + "balance_loss_mlp": 1.04286838, + "epoch": 0.25462197504885015, + "flos": 23915106432000.0, + "grad_norm": 1.542232814469661, + "language_loss": 0.7489568, + "learning_rate": 3.493659311850379e-06, + "loss": 0.77071816, + "num_input_tokens_seen": 91417775, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.90625, + "step": 4235, + "time_per_iteration": 2.5059099197387695 + }, + { + "auxiliary_loss_clip": 0.01141785, + "auxiliary_loss_mlp": 0.01044345, + "balance_loss_clip": 1.02570069, + "balance_loss_mlp": 1.04655004, + "epoch": 0.2546820983015181, + "flos": 24789387467520.0, + "grad_norm": 2.0015253194085645, + "language_loss": 0.64487904, + "learning_rate": 3.4934002855213106e-06, + "loss": 0.6667403, + "num_input_tokens_seen": 91437665, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.953125, + "step": 4236, + "time_per_iteration": 2.512286424636841 + }, + { + "auxiliary_loss_clip": 0.01131709, + "auxiliary_loss_mlp": 0.01032901, + "balance_loss_clip": 1.01757693, + "balance_loss_mlp": 1.04509079, + "epoch": 0.2547422215541861, + "flos": 18734130570240.0, + "grad_norm": 1.5430935122242522, + "language_loss": 0.67046815, + "learning_rate": 3.493141202562354e-06, + "loss": 0.69211423, + "num_input_tokens_seen": 91456705, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 4237, + "time_per_iteration": 2.455911636352539 + }, + { + "auxiliary_loss_clip": 0.01134325, + "auxiliary_loss_mlp": 0.01045903, + "balance_loss_clip": 1.02916634, + "balance_loss_mlp": 1.04509199, + "epoch": 0.25480234480685404, + "flos": 21032449274880.0, + "grad_norm": 1.9754127990153556, + "language_loss": 0.74863333, + "learning_rate": 3.492882062983333e-06, + "loss": 0.77043563, + "num_input_tokens_seen": 91475535, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4238, + "time_per_iteration": 2.4770114421844482 + }, + { + "auxiliary_loss_clip": 0.01136693, + "auxiliary_loss_mlp": 0.0104647, + "balance_loss_clip": 1.02848125, + "balance_loss_mlp": 1.04734778, + "epoch": 0.254862468059522, + "flos": 25082167224960.0, + "grad_norm": 1.8397193389954023, + "language_loss": 0.8033936, + "learning_rate": 3.492622866794074e-06, + "loss": 0.82522523, + "num_input_tokens_seen": 91499140, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4239, + "time_per_iteration": 2.5087499618530273 + }, + { + "auxiliary_loss_clip": 0.01131893, + "auxiliary_loss_mlp": 0.01041684, + "balance_loss_clip": 1.02457762, + "balance_loss_mlp": 1.04512548, + "epoch": 0.25492259131219, + "flos": 20558392554240.0, + "grad_norm": 1.749971041952711, + "language_loss": 0.77208781, + "learning_rate": 3.492363614004407e-06, + "loss": 0.7938236, + "num_input_tokens_seen": 91518335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4240, + "time_per_iteration": 2.4757072925567627 + }, + { + "auxiliary_loss_clip": 0.01141112, + "auxiliary_loss_mlp": 0.01042401, + "balance_loss_clip": 1.02463925, + "balance_loss_mlp": 1.04773092, + "epoch": 0.25498271456485794, + "flos": 25042485674880.0, + "grad_norm": 2.0511352101670126, + "language_loss": 0.83254647, + "learning_rate": 3.492104304624162e-06, + "loss": 0.85438156, + "num_input_tokens_seen": 91537655, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.93359375, + "step": 4241, + "time_per_iteration": 2.5062708854675293 + }, + { + "auxiliary_loss_clip": 0.01136148, + "auxiliary_loss_mlp": 0.01044003, + "balance_loss_clip": 1.02761221, + "balance_loss_mlp": 1.0463624, + "epoch": 0.2550428378175259, + "flos": 26178412354560.0, + "grad_norm": 1.6663950411566644, + "language_loss": 0.73410285, + "learning_rate": 3.4918449386631725e-06, + "loss": 0.75590432, + "num_input_tokens_seen": 91557545, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4242, + "time_per_iteration": 2.5570173263549805 + }, + { + "auxiliary_loss_clip": 0.01136205, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02249646, + "balance_loss_mlp": 1.04695976, + "epoch": 0.2551029610701939, + "flos": 15267170874240.0, + "grad_norm": 2.4092613771466453, + "language_loss": 0.72371018, + "learning_rate": 3.491585516131273e-06, + "loss": 0.74545956, + "num_input_tokens_seen": 91574405, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4243, + "time_per_iteration": 2.440492868423462 + }, + { + "auxiliary_loss_clip": 0.01136318, + "auxiliary_loss_mlp": 0.01041492, + "balance_loss_clip": 1.02507675, + "balance_loss_mlp": 1.04668963, + "epoch": 0.2551630843228619, + "flos": 18112193556480.0, + "grad_norm": 2.3937572910440847, + "language_loss": 0.81865323, + "learning_rate": 3.491326037038301e-06, + "loss": 0.84043133, + "num_input_tokens_seen": 91593755, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8984375, + "step": 4244, + "time_per_iteration": 2.4728784561157227 + }, + { + "auxiliary_loss_clip": 0.01044231, + "auxiliary_loss_mlp": 0.01002536, + "balance_loss_clip": 1.00084293, + "balance_loss_mlp": 1.01474202, + "epoch": 0.25522320757552985, + "flos": 70520192167680.0, + "grad_norm": 0.7400094393930867, + "language_loss": 0.5777986, + "learning_rate": 3.4910665013940967e-06, + "loss": 0.5982663, + "num_input_tokens_seen": 91660335, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.29492188, + "step": 4245, + "time_per_iteration": 3.155487537384033 + }, + { + "auxiliary_loss_clip": 0.01135489, + "auxiliary_loss_mlp": 0.01049355, + "balance_loss_clip": 1.03248656, + "balance_loss_mlp": 1.04526567, + "epoch": 0.2552833308281978, + "flos": 22893088757760.0, + "grad_norm": 1.9776048921576397, + "language_loss": 0.65246034, + "learning_rate": 3.4908069092085015e-06, + "loss": 0.67430878, + "num_input_tokens_seen": 91678500, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.90234375, + "step": 4246, + "time_per_iteration": 2.4889461994171143 + }, + { + "auxiliary_loss_clip": 0.01127053, + "auxiliary_loss_mlp": 0.0104224, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04366493, + "epoch": 0.2553434540808658, + "flos": 22053605022720.0, + "grad_norm": 1.748925776992144, + "language_loss": 0.81467927, + "learning_rate": 3.4905472604913585e-06, + "loss": 0.83637214, + "num_input_tokens_seen": 91696430, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4247, + "time_per_iteration": 2.4680213928222656 + }, + { + "auxiliary_loss_clip": 0.0114026, + "auxiliary_loss_mlp": 0.01045845, + "balance_loss_clip": 1.02718902, + "balance_loss_mlp": 1.04570985, + "epoch": 0.25540357733353375, + "flos": 16544190176640.0, + "grad_norm": 2.9702547035135165, + "language_loss": 0.83062297, + "learning_rate": 3.490287555252514e-06, + "loss": 0.85248411, + "num_input_tokens_seen": 91713270, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9453125, + "step": 4248, + "time_per_iteration": 2.446810245513916 + }, + { + "auxiliary_loss_clip": 0.01136577, + "auxiliary_loss_mlp": 0.01045007, + "balance_loss_clip": 1.02793586, + "balance_loss_mlp": 1.04672599, + "epoch": 0.2554637005862017, + "flos": 17565022702080.0, + "grad_norm": 2.21885342952208, + "language_loss": 0.84529531, + "learning_rate": 3.4900277935018166e-06, + "loss": 0.86711109, + "num_input_tokens_seen": 91728865, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4249, + "time_per_iteration": 2.4372382164001465 + }, + { + "auxiliary_loss_clip": 0.01044447, + "auxiliary_loss_mlp": 0.01003984, + "balance_loss_clip": 1.00217199, + "balance_loss_mlp": 1.01503897, + "epoch": 0.2555238238388697, + "flos": 72244763953920.0, + "grad_norm": 0.7531523874953217, + "language_loss": 0.56312215, + "learning_rate": 3.489767975249115e-06, + "loss": 0.58360648, + "num_input_tokens_seen": 91787470, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.29492188, + "step": 4250, + "time_per_iteration": 3.047654628753662 + }, + { + "auxiliary_loss_clip": 0.01133573, + "auxiliary_loss_mlp": 0.01038351, + "balance_loss_clip": 1.02139914, + "balance_loss_mlp": 1.04434705, + "epoch": 0.25558394709153764, + "flos": 24389414547840.0, + "grad_norm": 2.1374171101673243, + "language_loss": 0.80306417, + "learning_rate": 3.4895081005042632e-06, + "loss": 0.82478344, + "num_input_tokens_seen": 91805640, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4251, + "time_per_iteration": 2.4866387844085693 + }, + { + "auxiliary_loss_clip": 0.01042955, + "auxiliary_loss_mlp": 0.01004928, + "balance_loss_clip": 1.00307989, + "balance_loss_mlp": 1.01383376, + "epoch": 0.2556440703442056, + "flos": 69231213636480.0, + "grad_norm": 0.7958061962206047, + "language_loss": 0.66077995, + "learning_rate": 3.4892481692771146e-06, + "loss": 0.6812588, + "num_input_tokens_seen": 91869695, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.29296875, + "step": 4252, + "time_per_iteration": 3.117496967315674 + }, + { + "auxiliary_loss_clip": 0.011309, + "auxiliary_loss_mlp": 0.01037068, + "balance_loss_clip": 1.02198839, + "balance_loss_mlp": 1.04373813, + "epoch": 0.2557041935968736, + "flos": 24863902231680.0, + "grad_norm": 2.169743717969613, + "language_loss": 0.73382849, + "learning_rate": 3.4889881815775267e-06, + "loss": 0.75550812, + "num_input_tokens_seen": 91889920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 4253, + "time_per_iteration": 2.5709948539733887 + }, + { + "auxiliary_loss_clip": 0.01134729, + "auxiliary_loss_mlp": 0.01044447, + "balance_loss_clip": 1.02873516, + "balance_loss_mlp": 1.04698956, + "epoch": 0.25576431684954154, + "flos": 22492110257280.0, + "grad_norm": 1.9741012093631007, + "language_loss": 0.72927308, + "learning_rate": 3.488728137415357e-06, + "loss": 0.75106484, + "num_input_tokens_seen": 91908665, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4254, + "time_per_iteration": 2.509932518005371 + }, + { + "auxiliary_loss_clip": 0.01133463, + "auxiliary_loss_mlp": 0.01043601, + "balance_loss_clip": 1.02636361, + "balance_loss_mlp": 1.04452896, + "epoch": 0.2558244401022095, + "flos": 19826748426240.0, + "grad_norm": 1.7290530974650873, + "language_loss": 0.80863065, + "learning_rate": 3.4884680368004675e-06, + "loss": 0.8304013, + "num_input_tokens_seen": 91927855, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4255, + "time_per_iteration": 2.4473092555999756 + }, + { + "auxiliary_loss_clip": 0.01133499, + "auxiliary_loss_mlp": 0.01043496, + "balance_loss_clip": 1.02681875, + "balance_loss_mlp": 1.04673088, + "epoch": 0.2558845633548775, + "flos": 23220486247680.0, + "grad_norm": 1.512169748685899, + "language_loss": 0.85572308, + "learning_rate": 3.488207879742721e-06, + "loss": 0.87749302, + "num_input_tokens_seen": 91948500, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4256, + "time_per_iteration": 2.500788927078247 + }, + { + "auxiliary_loss_clip": 0.01136428, + "auxiliary_loss_mlp": 0.01048361, + "balance_loss_clip": 1.03119493, + "balance_loss_mlp": 1.04482555, + "epoch": 0.2559446866075455, + "flos": 16837867774080.0, + "grad_norm": 4.026866255210063, + "language_loss": 0.74821836, + "learning_rate": 3.4879476662519826e-06, + "loss": 0.77006626, + "num_input_tokens_seen": 91968375, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4257, + "time_per_iteration": 2.4511358737945557 + }, + { + "auxiliary_loss_clip": 0.01040508, + "auxiliary_loss_mlp": 0.01009541, + "balance_loss_clip": 1.00763345, + "balance_loss_mlp": 1.01154876, + "epoch": 0.25600480986021346, + "flos": 57593786895360.0, + "grad_norm": 0.8061088541165783, + "language_loss": 0.65227318, + "learning_rate": 3.4876873963381196e-06, + "loss": 0.67277366, + "num_input_tokens_seen": 92028490, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.2890625, + "step": 4258, + "time_per_iteration": 2.9953789710998535 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.04548264, + "epoch": 0.2560649331128814, + "flos": 27819529868160.0, + "grad_norm": 1.622828615893818, + "language_loss": 0.7647177, + "learning_rate": 3.4874270700110013e-06, + "loss": 0.78641111, + "num_input_tokens_seen": 92048060, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.859375, + "step": 4259, + "time_per_iteration": 2.5079360008239746 + }, + { + "auxiliary_loss_clip": 0.01038142, + "auxiliary_loss_mlp": 0.01004188, + "balance_loss_clip": 1.00237584, + "balance_loss_mlp": 1.0093925, + "epoch": 0.2561250563655494, + "flos": 70950509101440.0, + "grad_norm": 0.7946947905759578, + "language_loss": 0.58501768, + "learning_rate": 3.4871666872804994e-06, + "loss": 0.60544097, + "num_input_tokens_seen": 92118180, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.28710938, + "step": 4260, + "time_per_iteration": 4.636982202529907 + }, + { + "auxiliary_loss_clip": 0.01131187, + "auxiliary_loss_mlp": 0.01044504, + "balance_loss_clip": 1.02759969, + "balance_loss_mlp": 1.04300261, + "epoch": 0.25618517961821735, + "flos": 27012329481600.0, + "grad_norm": 1.8728817118968701, + "language_loss": 0.76659095, + "learning_rate": 3.4869062481564875e-06, + "loss": 0.7883479, + "num_input_tokens_seen": 92137570, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4261, + "time_per_iteration": 3.974956750869751 + }, + { + "auxiliary_loss_clip": 0.01130829, + "auxiliary_loss_mlp": 0.01037912, + "balance_loss_clip": 1.02280843, + "balance_loss_mlp": 1.04460573, + "epoch": 0.2562453028708853, + "flos": 23068296322560.0, + "grad_norm": 1.6516780840688012, + "language_loss": 0.8323037, + "learning_rate": 3.486645752648842e-06, + "loss": 0.85399115, + "num_input_tokens_seen": 92157625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4262, + "time_per_iteration": 2.5251948833465576 + }, + { + "auxiliary_loss_clip": 0.01136997, + "auxiliary_loss_mlp": 0.01048847, + "balance_loss_clip": 1.03123951, + "balance_loss_mlp": 1.04404712, + "epoch": 0.2563054261235533, + "flos": 15120942606720.0, + "grad_norm": 2.7380780768968016, + "language_loss": 0.74153852, + "learning_rate": 3.4863852007674405e-06, + "loss": 0.76339698, + "num_input_tokens_seen": 92175350, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9296875, + "step": 4263, + "time_per_iteration": 2.42657208442688 + }, + { + "auxiliary_loss_clip": 0.01133473, + "auxiliary_loss_mlp": 0.01051758, + "balance_loss_clip": 1.03533101, + "balance_loss_mlp": 1.04720163, + "epoch": 0.25636554937622125, + "flos": 27854865872640.0, + "grad_norm": 1.7828084139599185, + "language_loss": 0.82793939, + "learning_rate": 3.486124592522163e-06, + "loss": 0.84979165, + "num_input_tokens_seen": 92196070, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4264, + "time_per_iteration": 2.534097194671631 + }, + { + "auxiliary_loss_clip": 0.01134463, + "auxiliary_loss_mlp": 0.01041936, + "balance_loss_clip": 1.02506804, + "balance_loss_mlp": 1.04660988, + "epoch": 0.2564256726288892, + "flos": 28906509288960.0, + "grad_norm": 1.7080317762970965, + "language_loss": 0.7443161, + "learning_rate": 3.4858639279228924e-06, + "loss": 0.76608008, + "num_input_tokens_seen": 92216310, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87890625, + "step": 4265, + "time_per_iteration": 2.51088809967041 + }, + { + "auxiliary_loss_clip": 0.01129849, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01679027, + "balance_loss_mlp": 1.0425024, + "epoch": 0.2564857958815572, + "flos": 18514931823360.0, + "grad_norm": 1.644190377842657, + "language_loss": 0.8153013, + "learning_rate": 3.485603206979513e-06, + "loss": 0.83692515, + "num_input_tokens_seen": 92234510, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4266, + "time_per_iteration": 2.4706335067749023 + }, + { + "auxiliary_loss_clip": 0.01128054, + "auxiliary_loss_mlp": 0.01035268, + "balance_loss_clip": 1.01909137, + "balance_loss_mlp": 1.04252076, + "epoch": 0.25654591913422514, + "flos": 25808280658560.0, + "grad_norm": 1.6333370834261398, + "language_loss": 0.79287028, + "learning_rate": 3.4853424297019103e-06, + "loss": 0.81450343, + "num_input_tokens_seen": 92254070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4267, + "time_per_iteration": 2.4819366931915283 + }, + { + "auxiliary_loss_clip": 0.01127366, + "auxiliary_loss_mlp": 0.01041101, + "balance_loss_clip": 1.02480555, + "balance_loss_mlp": 1.04406714, + "epoch": 0.2566060423868931, + "flos": 19099665325440.0, + "grad_norm": 1.7559000109968124, + "language_loss": 0.78708017, + "learning_rate": 3.4850815960999736e-06, + "loss": 0.80876482, + "num_input_tokens_seen": 92275060, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4268, + "time_per_iteration": 2.4778378009796143 + }, + { + "auxiliary_loss_clip": 0.0113239, + "auxiliary_loss_mlp": 0.01037875, + "balance_loss_clip": 1.02198434, + "balance_loss_mlp": 1.04507172, + "epoch": 0.25666616563956113, + "flos": 23842674656640.0, + "grad_norm": 2.2514359992660204, + "language_loss": 0.68120348, + "learning_rate": 3.484820706183595e-06, + "loss": 0.70290613, + "num_input_tokens_seen": 92293610, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4269, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01134604, + "auxiliary_loss_mlp": 0.01042059, + "balance_loss_clip": 1.0249877, + "balance_loss_mlp": 1.04593778, + "epoch": 0.2567262888922291, + "flos": 14604259420800.0, + "grad_norm": 4.018282830570473, + "language_loss": 0.78496158, + "learning_rate": 3.484559759962666e-06, + "loss": 0.80672824, + "num_input_tokens_seen": 92308305, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4270, + "time_per_iteration": 2.418912172317505 + }, + { + "auxiliary_loss_clip": 0.01139603, + "auxiliary_loss_mlp": 0.01037807, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.04711556, + "epoch": 0.25678641214489706, + "flos": 32923117877760.0, + "grad_norm": 2.0502449379686256, + "language_loss": 0.68136632, + "learning_rate": 3.4842987574470816e-06, + "loss": 0.70314038, + "num_input_tokens_seen": 92329875, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.921875, + "step": 4271, + "time_per_iteration": 2.5410749912261963 + }, + { + "auxiliary_loss_clip": 0.01137314, + "auxiliary_loss_mlp": 0.0104993, + "balance_loss_clip": 1.0325973, + "balance_loss_mlp": 1.04592848, + "epoch": 0.256846535397565, + "flos": 24098933260800.0, + "grad_norm": 4.518410893879739, + "language_loss": 0.8741951, + "learning_rate": 3.4840376986467403e-06, + "loss": 0.8960675, + "num_input_tokens_seen": 92348780, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4272, + "time_per_iteration": 2.5022568702697754 + }, + { + "auxiliary_loss_clip": 0.0113724, + "auxiliary_loss_mlp": 0.01044761, + "balance_loss_clip": 1.02734506, + "balance_loss_mlp": 1.04770613, + "epoch": 0.256906658650233, + "flos": 19718441942400.0, + "grad_norm": 1.953603621991432, + "language_loss": 0.81442308, + "learning_rate": 3.483776583571541e-06, + "loss": 0.83624303, + "num_input_tokens_seen": 92368175, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4273, + "time_per_iteration": 2.453834295272827 + }, + { + "auxiliary_loss_clip": 0.01131691, + "auxiliary_loss_mlp": 0.01041869, + "balance_loss_clip": 1.02492929, + "balance_loss_mlp": 1.04724693, + "epoch": 0.25696678190290095, + "flos": 22926018551040.0, + "grad_norm": 1.682161023261006, + "language_loss": 0.77215779, + "learning_rate": 3.4835154122313846e-06, + "loss": 0.79389334, + "num_input_tokens_seen": 92387755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4274, + "time_per_iteration": 2.486238956451416 + }, + { + "auxiliary_loss_clip": 0.01129914, + "auxiliary_loss_mlp": 0.01037046, + "balance_loss_clip": 1.02061856, + "balance_loss_mlp": 1.04450369, + "epoch": 0.2570269051555689, + "flos": 27307838672640.0, + "grad_norm": 1.8548211040661395, + "language_loss": 0.8401829, + "learning_rate": 3.4832541846361743e-06, + "loss": 0.86185247, + "num_input_tokens_seen": 92409850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4275, + "time_per_iteration": 2.5145719051361084 + }, + { + "auxiliary_loss_clip": 0.01133209, + "auxiliary_loss_mlp": 0.0103751, + "balance_loss_clip": 1.02078438, + "balance_loss_mlp": 1.04492021, + "epoch": 0.2570870284082369, + "flos": 27563414918400.0, + "grad_norm": 3.0116628321367678, + "language_loss": 0.78124094, + "learning_rate": 3.4829929007958175e-06, + "loss": 0.80294812, + "num_input_tokens_seen": 92431250, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4276, + "time_per_iteration": 2.533989906311035 + }, + { + "auxiliary_loss_clip": 0.01133318, + "auxiliary_loss_mlp": 0.01043592, + "balance_loss_clip": 1.02723646, + "balance_loss_mlp": 1.04575086, + "epoch": 0.25714715166090485, + "flos": 28730834847360.0, + "grad_norm": 1.750550841347414, + "language_loss": 0.79439288, + "learning_rate": 3.4827315607202214e-06, + "loss": 0.81616199, + "num_input_tokens_seen": 92452065, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4277, + "time_per_iteration": 2.5131442546844482 + }, + { + "auxiliary_loss_clip": 0.01134263, + "auxiliary_loss_mlp": 0.01036244, + "balance_loss_clip": 1.01981688, + "balance_loss_mlp": 1.04671657, + "epoch": 0.2572072749135728, + "flos": 20116152305280.0, + "grad_norm": 2.0431628844466543, + "language_loss": 0.78804862, + "learning_rate": 3.482470164419295e-06, + "loss": 0.80975372, + "num_input_tokens_seen": 92470025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4278, + "time_per_iteration": 2.4813432693481445 + }, + { + "auxiliary_loss_clip": 0.01137794, + "auxiliary_loss_mlp": 0.01039572, + "balance_loss_clip": 1.02299643, + "balance_loss_mlp": 1.04657972, + "epoch": 0.2572673981662408, + "flos": 26030855283840.0, + "grad_norm": 2.020871128069371, + "language_loss": 0.74624676, + "learning_rate": 3.482208711902952e-06, + "loss": 0.76802039, + "num_input_tokens_seen": 92489825, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4279, + "time_per_iteration": 2.4989213943481445 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01051836, + "balance_loss_clip": 1.03472984, + "balance_loss_mlp": 1.04528475, + "epoch": 0.25732752141890874, + "flos": 16106618695680.0, + "grad_norm": 2.295268067844067, + "language_loss": 0.85406947, + "learning_rate": 3.4819472031811065e-06, + "loss": 0.87595296, + "num_input_tokens_seen": 92507270, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4280, + "time_per_iteration": 2.479163408279419 + }, + { + "auxiliary_loss_clip": 0.0113599, + "auxiliary_loss_mlp": 0.01041209, + "balance_loss_clip": 1.02362585, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2573876446715767, + "flos": 22524429519360.0, + "grad_norm": 2.2211313624852447, + "language_loss": 0.78780186, + "learning_rate": 3.4816856382636744e-06, + "loss": 0.80957377, + "num_input_tokens_seen": 92526300, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4281, + "time_per_iteration": 2.463003158569336 + }, + { + "auxiliary_loss_clip": 0.01134819, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02277303, + "balance_loss_mlp": 1.0472312, + "epoch": 0.2574477679242447, + "flos": 23950837486080.0, + "grad_norm": 1.9444978312753, + "language_loss": 0.87356091, + "learning_rate": 3.4814240171605737e-06, + "loss": 0.89530122, + "num_input_tokens_seen": 92546465, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4282, + "time_per_iteration": 2.5049889087677 + }, + { + "auxiliary_loss_clip": 0.01137104, + "auxiliary_loss_mlp": 0.0104319, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04648709, + "epoch": 0.2575078911769127, + "flos": 21981711951360.0, + "grad_norm": 1.5754049466604292, + "language_loss": 0.70172656, + "learning_rate": 3.4811623398817267e-06, + "loss": 0.72352946, + "num_input_tokens_seen": 92567260, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.90625, + "step": 4283, + "time_per_iteration": 2.520315408706665 + }, + { + "auxiliary_loss_clip": 0.01132284, + "auxiliary_loss_mlp": 0.01042212, + "balance_loss_clip": 1.02698922, + "balance_loss_mlp": 1.04772711, + "epoch": 0.25756801442958066, + "flos": 21945406279680.0, + "grad_norm": 2.712350413324169, + "language_loss": 0.80323613, + "learning_rate": 3.4809006064370553e-06, + "loss": 0.82498109, + "num_input_tokens_seen": 92585425, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 4284, + "time_per_iteration": 2.483292579650879 + }, + { + "auxiliary_loss_clip": 0.01134487, + "auxiliary_loss_mlp": 0.01040012, + "balance_loss_clip": 1.02538466, + "balance_loss_mlp": 1.04674387, + "epoch": 0.2576281376822486, + "flos": 35261980058880.0, + "grad_norm": 2.1742402973432893, + "language_loss": 0.70485193, + "learning_rate": 3.4806388168364835e-06, + "loss": 0.72659695, + "num_input_tokens_seen": 92604770, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 4285, + "time_per_iteration": 2.564211130142212 + }, + { + "auxiliary_loss_clip": 0.01137353, + "auxiliary_loss_mlp": 0.01038151, + "balance_loss_clip": 1.02282071, + "balance_loss_mlp": 1.04953337, + "epoch": 0.2576882609349166, + "flos": 14132285688960.0, + "grad_norm": 2.328286971317511, + "language_loss": 0.58380014, + "learning_rate": 3.4803769710899402e-06, + "loss": 0.60555518, + "num_input_tokens_seen": 92622635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87890625, + "step": 4286, + "time_per_iteration": 2.4425430297851562 + }, + { + "auxiliary_loss_clip": 0.01139826, + "auxiliary_loss_mlp": 0.01043664, + "balance_loss_clip": 1.02702272, + "balance_loss_mlp": 1.04858327, + "epoch": 0.25774838418758456, + "flos": 23258336204160.0, + "grad_norm": 1.6452331987585218, + "language_loss": 0.64191288, + "learning_rate": 3.480115069207354e-06, + "loss": 0.66374773, + "num_input_tokens_seen": 92642960, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.91015625, + "step": 4287, + "time_per_iteration": 2.470015287399292 + }, + { + "auxiliary_loss_clip": 0.0113955, + "auxiliary_loss_mlp": 0.01040533, + "balance_loss_clip": 1.02293801, + "balance_loss_mlp": 1.04739881, + "epoch": 0.2578085074402525, + "flos": 22601745544320.0, + "grad_norm": 2.0830358142366148, + "language_loss": 0.72029591, + "learning_rate": 3.4798531111986557e-06, + "loss": 0.74209672, + "num_input_tokens_seen": 92662455, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4288, + "time_per_iteration": 2.4983417987823486 + }, + { + "auxiliary_loss_clip": 0.01135736, + "auxiliary_loss_mlp": 0.01038411, + "balance_loss_clip": 1.02263355, + "balance_loss_mlp": 1.04882312, + "epoch": 0.2578686306929205, + "flos": 24571840746240.0, + "grad_norm": 1.9870049696680936, + "language_loss": 0.76965904, + "learning_rate": 3.4795910970737786e-06, + "loss": 0.79140055, + "num_input_tokens_seen": 92683520, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4289, + "time_per_iteration": 2.4997475147247314 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02311635, + "balance_loss_mlp": 1.04562807, + "epoch": 0.25792875394558845, + "flos": 18113953322880.0, + "grad_norm": 1.946897603323323, + "language_loss": 0.85123539, + "learning_rate": 3.4793290268426592e-06, + "loss": 0.87298238, + "num_input_tokens_seen": 92701450, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4290, + "time_per_iteration": 2.454871416091919 + }, + { + "auxiliary_loss_clip": 0.01140117, + "auxiliary_loss_mlp": 0.0105053, + "balance_loss_clip": 1.03159952, + "balance_loss_mlp": 1.04959655, + "epoch": 0.2579888771982564, + "flos": 17712902995200.0, + "grad_norm": 2.195715426849753, + "language_loss": 0.72170424, + "learning_rate": 3.4790669005152354e-06, + "loss": 0.74361074, + "num_input_tokens_seen": 92720355, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4291, + "time_per_iteration": 2.4512693881988525 + }, + { + "auxiliary_loss_clip": 0.01142047, + "auxiliary_loss_mlp": 0.01041391, + "balance_loss_clip": 1.02436781, + "balance_loss_mlp": 1.05002344, + "epoch": 0.2580490004509244, + "flos": 16434878112000.0, + "grad_norm": 2.4805881311796423, + "language_loss": 0.80718195, + "learning_rate": 3.4788047181014458e-06, + "loss": 0.82901633, + "num_input_tokens_seen": 92736755, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4292, + "time_per_iteration": 2.469034433364868 + }, + { + "auxiliary_loss_clip": 0.01141659, + "auxiliary_loss_mlp": 0.01044805, + "balance_loss_clip": 1.02767503, + "balance_loss_mlp": 1.05171072, + "epoch": 0.25810912370359235, + "flos": 33835141128960.0, + "grad_norm": 7.501455001056755, + "language_loss": 0.67646754, + "learning_rate": 3.4785424796112337e-06, + "loss": 0.69833219, + "num_input_tokens_seen": 92757655, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4293, + "time_per_iteration": 2.5785787105560303 + }, + { + "auxiliary_loss_clip": 0.01130106, + "auxiliary_loss_mlp": 0.01042426, + "balance_loss_clip": 1.02660704, + "balance_loss_mlp": 1.04503, + "epoch": 0.2581692469562603, + "flos": 25192197561600.0, + "grad_norm": 1.9136357435420137, + "language_loss": 0.75409257, + "learning_rate": 3.478280185054542e-06, + "loss": 0.77581787, + "num_input_tokens_seen": 92776100, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4294, + "time_per_iteration": 2.5044636726379395 + }, + { + "auxiliary_loss_clip": 0.01136505, + "auxiliary_loss_mlp": 0.01047021, + "balance_loss_clip": 1.02974749, + "balance_loss_mlp": 1.04808116, + "epoch": 0.2582293702089283, + "flos": 34932212271360.0, + "grad_norm": 2.168244565891273, + "language_loss": 0.81049722, + "learning_rate": 3.478017834441318e-06, + "loss": 0.83233249, + "num_input_tokens_seen": 92798880, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4295, + "time_per_iteration": 2.5875558853149414 + }, + { + "auxiliary_loss_clip": 0.01140472, + "auxiliary_loss_mlp": 0.01046123, + "balance_loss_clip": 1.02797985, + "balance_loss_mlp": 1.04796624, + "epoch": 0.2582894934615963, + "flos": 26833746038400.0, + "grad_norm": 2.1973562505628026, + "language_loss": 0.72515166, + "learning_rate": 3.4777554277815096e-06, + "loss": 0.74701762, + "num_input_tokens_seen": 92817750, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.92578125, + "step": 4296, + "time_per_iteration": 2.535693407058716 + }, + { + "auxiliary_loss_clip": 0.01138613, + "auxiliary_loss_mlp": 0.01039903, + "balance_loss_clip": 1.02322531, + "balance_loss_mlp": 1.04918242, + "epoch": 0.25834961671426426, + "flos": 23515241253120.0, + "grad_norm": 1.8330269406357795, + "language_loss": 0.86766148, + "learning_rate": 3.477492965085067e-06, + "loss": 0.88944662, + "num_input_tokens_seen": 92837995, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4297, + "time_per_iteration": 2.5001306533813477 + }, + { + "auxiliary_loss_clip": 0.01137068, + "auxiliary_loss_mlp": 0.01048271, + "balance_loss_clip": 1.03208232, + "balance_loss_mlp": 1.04755223, + "epoch": 0.25840973996693223, + "flos": 22451028076800.0, + "grad_norm": 2.2622150737063955, + "language_loss": 0.84706259, + "learning_rate": 3.477230446361943e-06, + "loss": 0.86891592, + "num_input_tokens_seen": 92857245, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4298, + "time_per_iteration": 2.489917278289795 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02069676, + "balance_loss_mlp": 1.04739285, + "epoch": 0.2584698632196002, + "flos": 11290854366720.0, + "grad_norm": 2.0676974538336266, + "language_loss": 0.83596241, + "learning_rate": 3.4769678716220927e-06, + "loss": 0.85770899, + "num_input_tokens_seen": 92873265, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4299, + "time_per_iteration": 2.4274845123291016 + }, + { + "auxiliary_loss_clip": 0.0113508, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.01985788, + "balance_loss_mlp": 1.04795814, + "epoch": 0.25852998647226816, + "flos": 17929982839680.0, + "grad_norm": 2.477231855960524, + "language_loss": 0.82685435, + "learning_rate": 3.4767052408754726e-06, + "loss": 0.84856081, + "num_input_tokens_seen": 92890880, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4300, + "time_per_iteration": 2.4730846881866455 + }, + { + "auxiliary_loss_clip": 0.01137103, + "auxiliary_loss_mlp": 0.0104166, + "balance_loss_clip": 1.02492332, + "balance_loss_mlp": 1.04620934, + "epoch": 0.2585901097249361, + "flos": 33256117889280.0, + "grad_norm": 2.2046546957653077, + "language_loss": 0.67186987, + "learning_rate": 3.4764425541320417e-06, + "loss": 0.69365752, + "num_input_tokens_seen": 92910770, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.91015625, + "step": 4301, + "time_per_iteration": 2.5633106231689453 + }, + { + "auxiliary_loss_clip": 0.01141797, + "auxiliary_loss_mlp": 0.01039122, + "balance_loss_clip": 1.02191997, + "balance_loss_mlp": 1.04805672, + "epoch": 0.2586502329776041, + "flos": 18441278985600.0, + "grad_norm": 2.459016606739088, + "language_loss": 0.80929118, + "learning_rate": 3.4761798114017617e-06, + "loss": 0.83110034, + "num_input_tokens_seen": 92929520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9375, + "step": 4302, + "time_per_iteration": 5.438407897949219 + }, + { + "auxiliary_loss_clip": 0.01138396, + "auxiliary_loss_mlp": 0.0104179, + "balance_loss_clip": 1.02535129, + "balance_loss_mlp": 1.04789591, + "epoch": 0.25871035623027205, + "flos": 17968120104960.0, + "grad_norm": 2.9925401825996545, + "language_loss": 0.92246419, + "learning_rate": 3.475917012694595e-06, + "loss": 0.94426608, + "num_input_tokens_seen": 92947890, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4303, + "time_per_iteration": 2.514573574066162 + }, + { + "auxiliary_loss_clip": 0.01139372, + "auxiliary_loss_mlp": 0.01036604, + "balance_loss_clip": 1.020046, + "balance_loss_mlp": 1.04932761, + "epoch": 0.25877047948294, + "flos": 27777729415680.0, + "grad_norm": 1.8070234866344623, + "language_loss": 0.67034984, + "learning_rate": 3.475654158020507e-06, + "loss": 0.69210964, + "num_input_tokens_seen": 92967690, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4304, + "time_per_iteration": 2.540682315826416 + }, + { + "auxiliary_loss_clip": 0.01138164, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03355694, + "balance_loss_mlp": 1.04595923, + "epoch": 0.258830602735608, + "flos": 27125843437440.0, + "grad_norm": 2.73594521825367, + "language_loss": 0.72829735, + "learning_rate": 3.4753912473894657e-06, + "loss": 0.75018799, + "num_input_tokens_seen": 92986830, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.921875, + "step": 4305, + "time_per_iteration": 2.580801248550415 + }, + { + "auxiliary_loss_clip": 0.01138565, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02417874, + "balance_loss_mlp": 1.04731607, + "epoch": 0.25889072598827595, + "flos": 17891486438400.0, + "grad_norm": 2.196623082948333, + "language_loss": 0.75595653, + "learning_rate": 3.4751282808114403e-06, + "loss": 0.77775478, + "num_input_tokens_seen": 93002740, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4306, + "time_per_iteration": 2.44267201423645 + }, + { + "auxiliary_loss_clip": 0.01045399, + "auxiliary_loss_mlp": 0.01003539, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.01567113, + "epoch": 0.2589508492409439, + "flos": 53934955724160.0, + "grad_norm": 0.8506593293873899, + "language_loss": 0.5717386, + "learning_rate": 3.474865258296403e-06, + "loss": 0.59222794, + "num_input_tokens_seen": 93058645, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.296875, + "step": 4307, + "time_per_iteration": 3.0457189083099365 + }, + { + "auxiliary_loss_clip": 0.01135839, + "auxiliary_loss_mlp": 0.01039878, + "balance_loss_clip": 1.02389181, + "balance_loss_mlp": 1.04729199, + "epoch": 0.2590109724936119, + "flos": 22125785402880.0, + "grad_norm": 1.7695447826328226, + "language_loss": 0.71543598, + "learning_rate": 3.474602179854327e-06, + "loss": 0.73719311, + "num_input_tokens_seen": 93077140, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4308, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.0113812, + "auxiliary_loss_mlp": 0.01041087, + "balance_loss_clip": 1.02439809, + "balance_loss_mlp": 1.04625905, + "epoch": 0.2590710957462799, + "flos": 13474294398720.0, + "grad_norm": 2.097007373458932, + "language_loss": 0.84195936, + "learning_rate": 3.4743390454951886e-06, + "loss": 0.86375141, + "num_input_tokens_seen": 93093580, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4309, + "time_per_iteration": 2.458937883377075 + }, + { + "auxiliary_loss_clip": 0.01138522, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02609062, + "balance_loss_mlp": 1.04893243, + "epoch": 0.25913121899894787, + "flos": 22307098279680.0, + "grad_norm": 1.520786669442297, + "language_loss": 0.8451637, + "learning_rate": 3.474075855228966e-06, + "loss": 0.8669641, + "num_input_tokens_seen": 93112345, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8984375, + "step": 4310, + "time_per_iteration": 2.453946828842163 + }, + { + "auxiliary_loss_clip": 0.0113925, + "auxiliary_loss_mlp": 0.01043346, + "balance_loss_clip": 1.02706194, + "balance_loss_mlp": 1.04705715, + "epoch": 0.25919134225161583, + "flos": 25811728364160.0, + "grad_norm": 2.3904067628525305, + "language_loss": 0.77478111, + "learning_rate": 3.473812609065639e-06, + "loss": 0.79660702, + "num_input_tokens_seen": 93131545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.921875, + "step": 4311, + "time_per_iteration": 2.5142812728881836 + }, + { + "auxiliary_loss_clip": 0.01138542, + "auxiliary_loss_mlp": 0.01041115, + "balance_loss_clip": 1.0248189, + "balance_loss_mlp": 1.04691362, + "epoch": 0.2592514655042838, + "flos": 31212262108800.0, + "grad_norm": 3.1447136536803852, + "language_loss": 0.72220832, + "learning_rate": 3.4735493070151904e-06, + "loss": 0.74400491, + "num_input_tokens_seen": 93150730, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91796875, + "step": 4312, + "time_per_iteration": 2.5275332927703857 + }, + { + "auxiliary_loss_clip": 0.01134993, + "auxiliary_loss_mlp": 0.01040705, + "balance_loss_clip": 1.02434921, + "balance_loss_mlp": 1.04480851, + "epoch": 0.25931158875695176, + "flos": 18474998878080.0, + "grad_norm": 2.2264539824076683, + "language_loss": 0.69908661, + "learning_rate": 3.4732859490876044e-06, + "loss": 0.72084355, + "num_input_tokens_seen": 93167895, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90234375, + "step": 4313, + "time_per_iteration": 2.479011058807373 + }, + { + "auxiliary_loss_clip": 0.01133563, + "auxiliary_loss_mlp": 0.01043367, + "balance_loss_clip": 1.02800131, + "balance_loss_mlp": 1.04467726, + "epoch": 0.2593717120096197, + "flos": 19207935895680.0, + "grad_norm": 1.7186396349483555, + "language_loss": 0.80486274, + "learning_rate": 3.473022535292867e-06, + "loss": 0.82663202, + "num_input_tokens_seen": 93187650, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.88671875, + "step": 4314, + "time_per_iteration": 2.443934679031372 + }, + { + "auxiliary_loss_clip": 0.01138226, + "auxiliary_loss_mlp": 0.01047643, + "balance_loss_clip": 1.03030992, + "balance_loss_mlp": 1.04506671, + "epoch": 0.2594318352622877, + "flos": 31248100903680.0, + "grad_norm": 2.0498851814527863, + "language_loss": 0.6687156, + "learning_rate": 3.472759065640968e-06, + "loss": 0.69057429, + "num_input_tokens_seen": 93207370, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9296875, + "step": 4315, + "time_per_iteration": 2.5375983715057373 + }, + { + "auxiliary_loss_clip": 0.01132586, + "auxiliary_loss_mlp": 0.01040869, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.04426146, + "epoch": 0.25949195851495566, + "flos": 22237144542720.0, + "grad_norm": 1.5303062780919283, + "language_loss": 0.7911852, + "learning_rate": 3.4724955401418976e-06, + "loss": 0.81291974, + "num_input_tokens_seen": 93227925, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4316, + "time_per_iteration": 2.448997735977173 + }, + { + "auxiliary_loss_clip": 0.01136257, + "auxiliary_loss_mlp": 0.01039906, + "balance_loss_clip": 1.02333546, + "balance_loss_mlp": 1.0446136, + "epoch": 0.2595520817676236, + "flos": 28075716645120.0, + "grad_norm": 1.687308210321376, + "language_loss": 0.77601087, + "learning_rate": 3.4722319588056487e-06, + "loss": 0.79777247, + "num_input_tokens_seen": 93250020, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9140625, + "step": 4317, + "time_per_iteration": 2.5545339584350586 + }, + { + "auxiliary_loss_clip": 0.01136641, + "auxiliary_loss_mlp": 0.01048751, + "balance_loss_clip": 1.03160882, + "balance_loss_mlp": 1.04599953, + "epoch": 0.2596122050202916, + "flos": 20190954378240.0, + "grad_norm": 2.5535432929686883, + "language_loss": 0.77773315, + "learning_rate": 3.4719683216422163e-06, + "loss": 0.79958701, + "num_input_tokens_seen": 93269070, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4318, + "time_per_iteration": 2.450573682785034 + }, + { + "auxiliary_loss_clip": 0.01133741, + "auxiliary_loss_mlp": 0.01045128, + "balance_loss_clip": 1.02717471, + "balance_loss_mlp": 1.04450393, + "epoch": 0.25967232827295955, + "flos": 22527949052160.0, + "grad_norm": 1.801084946435003, + "language_loss": 0.76197278, + "learning_rate": 3.471704628661598e-06, + "loss": 0.78376144, + "num_input_tokens_seen": 93290250, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4319, + "time_per_iteration": 2.5243709087371826 + }, + { + "auxiliary_loss_clip": 0.01131874, + "auxiliary_loss_mlp": 0.01037382, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04500592, + "epoch": 0.2597324515256275, + "flos": 21068252156160.0, + "grad_norm": 1.8511829127720039, + "language_loss": 0.76338619, + "learning_rate": 3.4714408798737925e-06, + "loss": 0.78507876, + "num_input_tokens_seen": 93310090, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4320, + "time_per_iteration": 2.4792070388793945 + }, + { + "auxiliary_loss_clip": 0.01135729, + "auxiliary_loss_mlp": 0.01038323, + "balance_loss_clip": 1.02205038, + "balance_loss_mlp": 1.04641151, + "epoch": 0.2597925747782955, + "flos": 22050013662720.0, + "grad_norm": 1.7592602092397844, + "language_loss": 0.71143925, + "learning_rate": 3.471177075288801e-06, + "loss": 0.73317981, + "num_input_tokens_seen": 93329570, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4321, + "time_per_iteration": 2.5381112098693848 + }, + { + "auxiliary_loss_clip": 0.01137982, + "auxiliary_loss_mlp": 0.01044713, + "balance_loss_clip": 1.02813125, + "balance_loss_mlp": 1.04517424, + "epoch": 0.2598526980309635, + "flos": 19536949497600.0, + "grad_norm": 2.037757848326605, + "language_loss": 0.74483943, + "learning_rate": 3.4709132149166277e-06, + "loss": 0.76666641, + "num_input_tokens_seen": 93347920, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.9296875, + "step": 4322, + "time_per_iteration": 2.4379777908325195 + }, + { + "auxiliary_loss_clip": 0.01134471, + "auxiliary_loss_mlp": 0.01047461, + "balance_loss_clip": 1.03059244, + "balance_loss_mlp": 1.04368353, + "epoch": 0.25991282128363147, + "flos": 24495207079680.0, + "grad_norm": 1.9467125010752846, + "language_loss": 0.73674595, + "learning_rate": 3.470649298767278e-06, + "loss": 0.75856531, + "num_input_tokens_seen": 93367145, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4323, + "time_per_iteration": 2.517399549484253 + }, + { + "auxiliary_loss_clip": 0.01141538, + "auxiliary_loss_mlp": 0.01044133, + "balance_loss_clip": 1.0263952, + "balance_loss_mlp": 1.04524922, + "epoch": 0.25997294453629943, + "flos": 24201457655040.0, + "grad_norm": 2.197207179409235, + "language_loss": 0.6710211, + "learning_rate": 3.4703853268507597e-06, + "loss": 0.69287789, + "num_input_tokens_seen": 93386555, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.96484375, + "step": 4324, + "time_per_iteration": 2.478419303894043 + }, + { + "auxiliary_loss_clip": 0.01132905, + "auxiliary_loss_mlp": 0.01043334, + "balance_loss_clip": 1.02839708, + "balance_loss_mlp": 1.04456055, + "epoch": 0.2600330677889674, + "flos": 31431460855680.0, + "grad_norm": 2.3342631450552838, + "language_loss": 0.70809424, + "learning_rate": 3.470121299177082e-06, + "loss": 0.72985667, + "num_input_tokens_seen": 93405590, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8828125, + "step": 4325, + "time_per_iteration": 2.5444648265838623 + }, + { + "auxiliary_loss_clip": 0.01133012, + "auxiliary_loss_mlp": 0.01037608, + "balance_loss_clip": 1.02139568, + "balance_loss_mlp": 1.04295206, + "epoch": 0.26009319104163536, + "flos": 32266527217920.0, + "grad_norm": 2.476658211689484, + "language_loss": 0.73041123, + "learning_rate": 3.469857215756257e-06, + "loss": 0.7521174, + "num_input_tokens_seen": 93424750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4326, + "time_per_iteration": 2.5281147956848145 + }, + { + "auxiliary_loss_clip": 0.01127256, + "auxiliary_loss_mlp": 0.01038648, + "balance_loss_clip": 1.02424729, + "balance_loss_mlp": 1.04237306, + "epoch": 0.26015331429430333, + "flos": 26286754752000.0, + "grad_norm": 1.820673081097861, + "language_loss": 0.8661378, + "learning_rate": 3.4695930765982997e-06, + "loss": 0.88779688, + "num_input_tokens_seen": 93443465, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 4327, + "time_per_iteration": 2.4929087162017822 + }, + { + "auxiliary_loss_clip": 0.01138344, + "auxiliary_loss_mlp": 0.01049414, + "balance_loss_clip": 1.03121042, + "balance_loss_mlp": 1.04679346, + "epoch": 0.2602134375469713, + "flos": 21142335957120.0, + "grad_norm": 2.002075266566112, + "language_loss": 0.80111909, + "learning_rate": 3.4693288817132255e-06, + "loss": 0.82299662, + "num_input_tokens_seen": 93462580, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9140625, + "step": 4328, + "time_per_iteration": 2.451131582260132 + }, + { + "auxiliary_loss_clip": 0.0112995, + "auxiliary_loss_mlp": 0.01040006, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04219353, + "epoch": 0.26027356079963926, + "flos": 25921327737600.0, + "grad_norm": 1.514483384647774, + "language_loss": 0.87428784, + "learning_rate": 3.4690646311110525e-06, + "loss": 0.89598739, + "num_input_tokens_seen": 93482790, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4329, + "time_per_iteration": 2.522368907928467 + }, + { + "auxiliary_loss_clip": 0.01132983, + "auxiliary_loss_mlp": 0.010381, + "balance_loss_clip": 1.02261448, + "balance_loss_mlp": 1.04585731, + "epoch": 0.2603336840523072, + "flos": 26359222440960.0, + "grad_norm": 2.096665977126354, + "language_loss": 0.77746803, + "learning_rate": 3.468800324801802e-06, + "loss": 0.79917884, + "num_input_tokens_seen": 93498795, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4330, + "time_per_iteration": 2.4771482944488525 + }, + { + "auxiliary_loss_clip": 0.01134796, + "auxiliary_loss_mlp": 0.01047613, + "balance_loss_clip": 1.03136468, + "balance_loss_mlp": 1.04525268, + "epoch": 0.2603938073049752, + "flos": 23513661054720.0, + "grad_norm": 2.4595446714184654, + "language_loss": 0.75248575, + "learning_rate": 3.4685359627954958e-06, + "loss": 0.77430975, + "num_input_tokens_seen": 93518335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4331, + "time_per_iteration": 2.5284199714660645 + }, + { + "auxiliary_loss_clip": 0.01137533, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02158558, + "balance_loss_mlp": 1.05026567, + "epoch": 0.26045393055764315, + "flos": 25374300537600.0, + "grad_norm": 1.3491085383994963, + "language_loss": 0.69003588, + "learning_rate": 3.4682715451021584e-06, + "loss": 0.71178281, + "num_input_tokens_seen": 93539170, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4332, + "time_per_iteration": 2.476125478744507 + }, + { + "auxiliary_loss_clip": 0.0113624, + "auxiliary_loss_mlp": 0.01041054, + "balance_loss_clip": 1.02453184, + "balance_loss_mlp": 1.04542089, + "epoch": 0.2605140538103111, + "flos": 27635272076160.0, + "grad_norm": 2.3270567941112854, + "language_loss": 0.79674375, + "learning_rate": 3.4680070717318174e-06, + "loss": 0.81851673, + "num_input_tokens_seen": 93558480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.91015625, + "step": 4333, + "time_per_iteration": 2.5234756469726562 + }, + { + "auxiliary_loss_clip": 0.01129676, + "auxiliary_loss_mlp": 0.01043365, + "balance_loss_clip": 1.02791548, + "balance_loss_mlp": 1.04336357, + "epoch": 0.2605741770629791, + "flos": 13769839503360.0, + "grad_norm": 1.7608965931322442, + "language_loss": 0.80725265, + "learning_rate": 3.467742542694501e-06, + "loss": 0.82898307, + "num_input_tokens_seen": 93575220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4334, + "time_per_iteration": 2.4361026287078857 + }, + { + "auxiliary_loss_clip": 0.01132792, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02128482, + "balance_loss_mlp": 1.04452491, + "epoch": 0.26063430031564705, + "flos": 26031681296640.0, + "grad_norm": 1.8337144126432974, + "language_loss": 0.80039275, + "learning_rate": 3.46747795800024e-06, + "loss": 0.822101, + "num_input_tokens_seen": 93597015, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4335, + "time_per_iteration": 2.5246174335479736 + }, + { + "auxiliary_loss_clip": 0.01043695, + "auxiliary_loss_mlp": 0.0102207, + "balance_loss_clip": 1.02024579, + "balance_loss_mlp": 1.01431763, + "epoch": 0.26069442356831507, + "flos": 62443809820800.0, + "grad_norm": 0.849908687169067, + "language_loss": 0.60851145, + "learning_rate": 3.467213317659068e-06, + "loss": 0.62916911, + "num_input_tokens_seen": 93657775, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.29296875, + "step": 4336, + "time_per_iteration": 3.0349080562591553 + }, + { + "auxiliary_loss_clip": 0.01136323, + "auxiliary_loss_mlp": 0.01047902, + "balance_loss_clip": 1.03172541, + "balance_loss_mlp": 1.04599738, + "epoch": 0.26075454682098304, + "flos": 13626376583040.0, + "grad_norm": 6.860825703537795, + "language_loss": 0.77407634, + "learning_rate": 3.46694862168102e-06, + "loss": 0.79591858, + "num_input_tokens_seen": 93676145, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90625, + "step": 4337, + "time_per_iteration": 2.4549763202667236 + }, + { + "auxiliary_loss_clip": 0.01135126, + "auxiliary_loss_mlp": 0.01045126, + "balance_loss_clip": 1.02755404, + "balance_loss_mlp": 1.04531193, + "epoch": 0.260814670073651, + "flos": 12126531260160.0, + "grad_norm": 2.1553767319060646, + "language_loss": 0.74116468, + "learning_rate": 3.4666838700761334e-06, + "loss": 0.76296723, + "num_input_tokens_seen": 93692480, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4338, + "time_per_iteration": 2.4109654426574707 + }, + { + "auxiliary_loss_clip": 0.01137659, + "auxiliary_loss_mlp": 0.01042073, + "balance_loss_clip": 1.02495456, + "balance_loss_mlp": 1.0451895, + "epoch": 0.26087479332631897, + "flos": 15122522805120.0, + "grad_norm": 2.414973208379154, + "language_loss": 0.80645537, + "learning_rate": 3.466419062854447e-06, + "loss": 0.82825273, + "num_input_tokens_seen": 93710165, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.92578125, + "step": 4339, + "time_per_iteration": 2.4671595096588135 + }, + { + "auxiliary_loss_clip": 0.01133141, + "auxiliary_loss_mlp": 0.01038061, + "balance_loss_clip": 1.02287948, + "balance_loss_mlp": 1.04559159, + "epoch": 0.26093491657898693, + "flos": 24680937329280.0, + "grad_norm": 1.5844023841754464, + "language_loss": 0.76694596, + "learning_rate": 3.4661542000260033e-06, + "loss": 0.78865802, + "num_input_tokens_seen": 93730185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4340, + "time_per_iteration": 2.4803388118743896 + }, + { + "auxiliary_loss_clip": 0.01137352, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02185678, + "balance_loss_mlp": 1.04666209, + "epoch": 0.2609950398316549, + "flos": 25116138512640.0, + "grad_norm": 1.5290989424491332, + "language_loss": 0.82436979, + "learning_rate": 3.465889281600845e-06, + "loss": 0.84612167, + "num_input_tokens_seen": 93747690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.90625, + "step": 4341, + "time_per_iteration": 2.5263681411743164 + }, + { + "auxiliary_loss_clip": 0.01134552, + "auxiliary_loss_mlp": 0.01039374, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.04563117, + "epoch": 0.26105516308432286, + "flos": 28548588216960.0, + "grad_norm": 2.4125290221035773, + "language_loss": 0.76542389, + "learning_rate": 3.4656243075890183e-06, + "loss": 0.78716314, + "num_input_tokens_seen": 93767405, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4342, + "time_per_iteration": 2.5043585300445557 + }, + { + "auxiliary_loss_clip": 0.01132446, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.01570523, + "balance_loss_mlp": 1.04324019, + "epoch": 0.2611152863369908, + "flos": 39530609447040.0, + "grad_norm": 1.8018778201456855, + "language_loss": 0.66747689, + "learning_rate": 3.4653592780005707e-06, + "loss": 0.68912935, + "num_input_tokens_seen": 93789950, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.89453125, + "step": 4343, + "time_per_iteration": 2.6470234394073486 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01041522, + "balance_loss_clip": 1.02467799, + "balance_loss_mlp": 1.04494977, + "epoch": 0.2611754095896588, + "flos": 13735329511680.0, + "grad_norm": 2.0339901471708646, + "language_loss": 0.73817015, + "learning_rate": 3.465094192845553e-06, + "loss": 0.75994843, + "num_input_tokens_seen": 93807835, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.9140625, + "step": 4344, + "time_per_iteration": 5.431513071060181 + }, + { + "auxiliary_loss_clip": 0.0113578, + "auxiliary_loss_mlp": 0.01038278, + "balance_loss_clip": 1.02257776, + "balance_loss_mlp": 1.04692459, + "epoch": 0.26123553284232676, + "flos": 21506649649920.0, + "grad_norm": 3.7636245605224072, + "language_loss": 0.86394477, + "learning_rate": 3.4648290521340165e-06, + "loss": 0.88568532, + "num_input_tokens_seen": 93825670, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 4345, + "time_per_iteration": 2.4908552169799805 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01039942, + "balance_loss_clip": 1.02422452, + "balance_loss_mlp": 1.04427588, + "epoch": 0.2612956560949947, + "flos": 21139786091520.0, + "grad_norm": 1.88977116996907, + "language_loss": 0.7612443, + "learning_rate": 3.464563855876015e-06, + "loss": 0.78293997, + "num_input_tokens_seen": 93844045, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.85546875, + "step": 4346, + "time_per_iteration": 2.4966983795166016 + }, + { + "auxiliary_loss_clip": 0.01133219, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02547407, + "balance_loss_mlp": 1.04483962, + "epoch": 0.2613557793476627, + "flos": 25119011600640.0, + "grad_norm": 1.5621162347417301, + "language_loss": 0.75868237, + "learning_rate": 3.464298604081606e-06, + "loss": 0.78042835, + "num_input_tokens_seen": 93864380, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4347, + "time_per_iteration": 2.5392181873321533 + }, + { + "auxiliary_loss_clip": 0.01133725, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04549503, + "epoch": 0.26141590260033065, + "flos": 26067699659520.0, + "grad_norm": 1.4125954345922265, + "language_loss": 0.73354399, + "learning_rate": 3.4640332967608476e-06, + "loss": 0.75522006, + "num_input_tokens_seen": 93885475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8828125, + "step": 4348, + "time_per_iteration": 2.5206878185272217 + }, + { + "auxiliary_loss_clip": 0.01134547, + "auxiliary_loss_mlp": 0.01039621, + "balance_loss_clip": 1.02286005, + "balance_loss_mlp": 1.04503882, + "epoch": 0.2614760258529987, + "flos": 25701518459520.0, + "grad_norm": 1.8182616406273437, + "language_loss": 0.91063923, + "learning_rate": 3.463767933923799e-06, + "loss": 0.93238091, + "num_input_tokens_seen": 93905545, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4349, + "time_per_iteration": 2.526134967803955 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01042392, + "balance_loss_clip": 1.02663279, + "balance_loss_mlp": 1.0461632, + "epoch": 0.26153614910566664, + "flos": 17457147181440.0, + "grad_norm": 1.7312169360414529, + "language_loss": 0.79879099, + "learning_rate": 3.463502515580524e-06, + "loss": 0.82054067, + "num_input_tokens_seen": 93924185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4350, + "time_per_iteration": 2.4420506954193115 + }, + { + "auxiliary_loss_clip": 0.01129246, + "auxiliary_loss_mlp": 0.01039783, + "balance_loss_clip": 1.02388072, + "balance_loss_mlp": 1.04430401, + "epoch": 0.2615962723583346, + "flos": 17712831168000.0, + "grad_norm": 1.8647374515536046, + "language_loss": 0.62139511, + "learning_rate": 3.4632370417410866e-06, + "loss": 0.64308536, + "num_input_tokens_seen": 93942825, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4351, + "time_per_iteration": 2.4613640308380127 + }, + { + "auxiliary_loss_clip": 0.01134641, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02241123, + "balance_loss_mlp": 1.04469466, + "epoch": 0.26165639561100257, + "flos": 23257725672960.0, + "grad_norm": 2.09308554357217, + "language_loss": 0.83596927, + "learning_rate": 3.462971512415555e-06, + "loss": 0.85769767, + "num_input_tokens_seen": 93962045, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4352, + "time_per_iteration": 2.4712979793548584 + }, + { + "auxiliary_loss_clip": 0.01045226, + "auxiliary_loss_mlp": 0.01000353, + "balance_loss_clip": 0.9986006, + "balance_loss_mlp": 1.01526213, + "epoch": 0.26171651886367053, + "flos": 66737970800640.0, + "grad_norm": 0.8010954727993301, + "language_loss": 0.70645392, + "learning_rate": 3.462705927613996e-06, + "loss": 0.72690976, + "num_input_tokens_seen": 94021175, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.29882812, + "step": 4353, + "time_per_iteration": 3.026418447494507 + }, + { + "auxiliary_loss_clip": 0.01132608, + "auxiliary_loss_mlp": 0.01047561, + "balance_loss_clip": 1.03045464, + "balance_loss_mlp": 1.04494369, + "epoch": 0.2617766421163385, + "flos": 22349581090560.0, + "grad_norm": 1.7700850953213416, + "language_loss": 0.77393121, + "learning_rate": 3.4624402873464816e-06, + "loss": 0.79573292, + "num_input_tokens_seen": 94043370, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4354, + "time_per_iteration": 2.535482883453369 + }, + { + "auxiliary_loss_clip": 0.01138552, + "auxiliary_loss_mlp": 0.0104457, + "balance_loss_clip": 1.02826262, + "balance_loss_mlp": 1.04513574, + "epoch": 0.26183676536900646, + "flos": 26067125041920.0, + "grad_norm": 2.1625978203859826, + "language_loss": 0.68280292, + "learning_rate": 3.462174591623085e-06, + "loss": 0.70463413, + "num_input_tokens_seen": 94063510, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.93359375, + "step": 4355, + "time_per_iteration": 2.5276527404785156 + }, + { + "auxiliary_loss_clip": 0.01130838, + "auxiliary_loss_mlp": 0.01039393, + "balance_loss_clip": 1.02207148, + "balance_loss_mlp": 1.04375613, + "epoch": 0.26189688862167443, + "flos": 20996466825600.0, + "grad_norm": 1.9702640724114775, + "language_loss": 0.67509294, + "learning_rate": 3.4619088404538815e-06, + "loss": 0.69679523, + "num_input_tokens_seen": 94083865, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4356, + "time_per_iteration": 2.454436779022217 + }, + { + "auxiliary_loss_clip": 0.01043638, + "auxiliary_loss_mlp": 0.01003266, + "balance_loss_clip": 1.00139415, + "balance_loss_mlp": 1.01376009, + "epoch": 0.2619570118743424, + "flos": 65798261141760.0, + "grad_norm": 0.6781381277043278, + "language_loss": 0.53156137, + "learning_rate": 3.4616430338489487e-06, + "loss": 0.55203032, + "num_input_tokens_seen": 94144095, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.29882812, + "step": 4357, + "time_per_iteration": 2.99239444732666 + }, + { + "auxiliary_loss_clip": 0.01138081, + "auxiliary_loss_mlp": 0.01045526, + "balance_loss_clip": 1.02955151, + "balance_loss_mlp": 1.04608119, + "epoch": 0.26201713512701036, + "flos": 28766817296640.0, + "grad_norm": 1.843205511563007, + "language_loss": 0.84329486, + "learning_rate": 3.4613771718183654e-06, + "loss": 0.86513096, + "num_input_tokens_seen": 94163035, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.921875, + "step": 4358, + "time_per_iteration": 2.511441707611084 + }, + { + "auxiliary_loss_clip": 0.0113833, + "auxiliary_loss_mlp": 0.01042477, + "balance_loss_clip": 1.02476251, + "balance_loss_mlp": 1.0450834, + "epoch": 0.2620772583796783, + "flos": 26432516142720.0, + "grad_norm": 2.1805365254718367, + "language_loss": 0.67303276, + "learning_rate": 3.4611112543722127e-06, + "loss": 0.69484085, + "num_input_tokens_seen": 94182520, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9296875, + "step": 4359, + "time_per_iteration": 2.5318756103515625 + }, + { + "auxiliary_loss_clip": 0.0113089, + "auxiliary_loss_mlp": 0.01042276, + "balance_loss_clip": 1.02725601, + "balance_loss_mlp": 1.04242957, + "epoch": 0.2621373816323463, + "flos": 20156552127360.0, + "grad_norm": 1.947910834650985, + "language_loss": 0.78673261, + "learning_rate": 3.4608452815205757e-06, + "loss": 0.80846429, + "num_input_tokens_seen": 94201795, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4360, + "time_per_iteration": 2.4551331996917725 + }, + { + "auxiliary_loss_clip": 0.01129221, + "auxiliary_loss_mlp": 0.01040075, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04250073, + "epoch": 0.26219750488501425, + "flos": 28621235473920.0, + "grad_norm": 1.9921513845886445, + "language_loss": 0.68169516, + "learning_rate": 3.4605792532735387e-06, + "loss": 0.70338809, + "num_input_tokens_seen": 94222390, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4361, + "time_per_iteration": 2.57106351852417 + }, + { + "auxiliary_loss_clip": 0.01135372, + "auxiliary_loss_mlp": 0.01057475, + "balance_loss_clip": 1.04022598, + "balance_loss_mlp": 1.04400647, + "epoch": 0.2622576281376823, + "flos": 15042549173760.0, + "grad_norm": 1.9312179198305752, + "language_loss": 0.84310883, + "learning_rate": 3.46031316964119e-06, + "loss": 0.86503732, + "num_input_tokens_seen": 94239980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9140625, + "step": 4362, + "time_per_iteration": 2.430020570755005 + }, + { + "auxiliary_loss_clip": 0.01133753, + "auxiliary_loss_mlp": 0.01040156, + "balance_loss_clip": 1.02282345, + "balance_loss_mlp": 1.04637551, + "epoch": 0.26231775139035024, + "flos": 26396174557440.0, + "grad_norm": 1.792780117353334, + "language_loss": 0.65294504, + "learning_rate": 3.4600470306336197e-06, + "loss": 0.67468411, + "num_input_tokens_seen": 94260715, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4363, + "time_per_iteration": 2.546393632888794 + }, + { + "auxiliary_loss_clip": 0.01042076, + "auxiliary_loss_mlp": 0.01004318, + "balance_loss_clip": 1.00252998, + "balance_loss_mlp": 1.0123173, + "epoch": 0.2623778746430182, + "flos": 65408918647680.0, + "grad_norm": 0.8867533167936222, + "language_loss": 0.61098528, + "learning_rate": 3.4597808362609194e-06, + "loss": 0.63144922, + "num_input_tokens_seen": 94321285, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.296875, + "step": 4364, + "time_per_iteration": 3.150812864303589 + }, + { + "auxiliary_loss_clip": 0.01138346, + "auxiliary_loss_mlp": 0.01051385, + "balance_loss_clip": 1.03358722, + "balance_loss_mlp": 1.0468297, + "epoch": 0.26243799789568617, + "flos": 12604215254400.0, + "grad_norm": 2.424942653514092, + "language_loss": 0.71549827, + "learning_rate": 3.459514586533184e-06, + "loss": 0.73739558, + "num_input_tokens_seen": 94335420, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4365, + "time_per_iteration": 2.493540048599243 + }, + { + "auxiliary_loss_clip": 0.0113494, + "auxiliary_loss_mlp": 0.01045115, + "balance_loss_clip": 1.02917075, + "balance_loss_mlp": 1.04654169, + "epoch": 0.26249812114835414, + "flos": 28623821253120.0, + "grad_norm": 1.8316261966241354, + "language_loss": 0.76925993, + "learning_rate": 3.459248281460509e-06, + "loss": 0.79106045, + "num_input_tokens_seen": 94357440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4366, + "time_per_iteration": 2.536853313446045 + }, + { + "auxiliary_loss_clip": 0.01135829, + "auxiliary_loss_mlp": 0.01042418, + "balance_loss_clip": 1.02684951, + "balance_loss_mlp": 1.04666197, + "epoch": 0.2625582444010221, + "flos": 14465393441280.0, + "grad_norm": 2.2091260788228975, + "language_loss": 0.75838757, + "learning_rate": 3.4589819210529927e-06, + "loss": 0.78017008, + "num_input_tokens_seen": 94375690, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.890625, + "step": 4367, + "time_per_iteration": 2.4576163291931152 + }, + { + "auxiliary_loss_clip": 0.01131307, + "auxiliary_loss_mlp": 0.01040361, + "balance_loss_clip": 1.02454233, + "balance_loss_mlp": 1.04452682, + "epoch": 0.26261836765369007, + "flos": 16613174246400.0, + "grad_norm": 2.1913456464974392, + "language_loss": 0.69633925, + "learning_rate": 3.458715505320736e-06, + "loss": 0.71805596, + "num_input_tokens_seen": 94393190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4368, + "time_per_iteration": 2.4301586151123047 + }, + { + "auxiliary_loss_clip": 0.01130278, + "auxiliary_loss_mlp": 0.01046678, + "balance_loss_clip": 1.02970243, + "balance_loss_mlp": 1.04319167, + "epoch": 0.26267849090635803, + "flos": 20519932066560.0, + "grad_norm": 1.7035150195415922, + "language_loss": 0.78589904, + "learning_rate": 3.458449034273841e-06, + "loss": 0.80766863, + "num_input_tokens_seen": 94410975, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8671875, + "step": 4369, + "time_per_iteration": 2.489316701889038 + }, + { + "auxiliary_loss_clip": 0.01132105, + "auxiliary_loss_mlp": 0.01042711, + "balance_loss_clip": 1.02653408, + "balance_loss_mlp": 1.04431546, + "epoch": 0.262738614159026, + "flos": 21323936142720.0, + "grad_norm": 2.0413446884893047, + "language_loss": 0.83486217, + "learning_rate": 3.4581825079224133e-06, + "loss": 0.85661036, + "num_input_tokens_seen": 94429985, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4370, + "time_per_iteration": 2.4422430992126465 + }, + { + "auxiliary_loss_clip": 0.01136913, + "auxiliary_loss_mlp": 0.01050187, + "balance_loss_clip": 1.03060055, + "balance_loss_mlp": 1.04530215, + "epoch": 0.26279873741169396, + "flos": 17603590930560.0, + "grad_norm": 2.3340239620956287, + "language_loss": 0.70963454, + "learning_rate": 3.4579159262765575e-06, + "loss": 0.73150551, + "num_input_tokens_seen": 94448660, + "router_z_loss_clip": 0.19628906, + "router_z_loss_mlp": 0.9140625, + "step": 4371, + "time_per_iteration": 2.5099778175354004 + }, + { + "auxiliary_loss_clip": 0.01043374, + "auxiliary_loss_mlp": 0.00999769, + "balance_loss_clip": 0.99784929, + "balance_loss_mlp": 1.01338005, + "epoch": 0.2628588606643619, + "flos": 60949746587520.0, + "grad_norm": 0.7657034729714577, + "language_loss": 0.56477904, + "learning_rate": 3.457649289346384e-06, + "loss": 0.58521044, + "num_input_tokens_seen": 94515630, + "router_z_loss_clip": 0.01916504, + "router_z_loss_mlp": 0.30078125, + "step": 4372, + "time_per_iteration": 3.244558572769165 + }, + { + "auxiliary_loss_clip": 0.01129835, + "auxiliary_loss_mlp": 0.01038918, + "balance_loss_clip": 1.02283084, + "balance_loss_mlp": 1.04335582, + "epoch": 0.2629189839170299, + "flos": 27016315891200.0, + "grad_norm": 1.7597219251079876, + "language_loss": 0.77415234, + "learning_rate": 3.4573825971420042e-06, + "loss": 0.79583991, + "num_input_tokens_seen": 94535385, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4373, + "time_per_iteration": 2.517784833908081 + }, + { + "auxiliary_loss_clip": 0.01131814, + "auxiliary_loss_mlp": 0.01041505, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.04454422, + "epoch": 0.26297910716969786, + "flos": 17019863009280.0, + "grad_norm": 4.0873872332994905, + "language_loss": 0.71538949, + "learning_rate": 3.4571158496735294e-06, + "loss": 0.73712265, + "num_input_tokens_seen": 94552650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4374, + "time_per_iteration": 2.442124605178833 + }, + { + "auxiliary_loss_clip": 0.01133779, + "auxiliary_loss_mlp": 0.01042, + "balance_loss_clip": 1.02435732, + "balance_loss_mlp": 1.0458709, + "epoch": 0.2630392304223659, + "flos": 24897370728960.0, + "grad_norm": 2.271567992891854, + "language_loss": 0.80945283, + "learning_rate": 3.4568490469510756e-06, + "loss": 0.83121061, + "num_input_tokens_seen": 94574075, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4375, + "time_per_iteration": 2.4889678955078125 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.0265336, + "balance_loss_mlp": 1.04366982, + "epoch": 0.26309935367503384, + "flos": 32854026067200.0, + "grad_norm": 2.3689389683703, + "language_loss": 0.65721256, + "learning_rate": 3.4565821889847603e-06, + "loss": 0.67893362, + "num_input_tokens_seen": 94594255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4376, + "time_per_iteration": 2.563701629638672 + }, + { + "auxiliary_loss_clip": 0.01134504, + "auxiliary_loss_mlp": 0.0104592, + "balance_loss_clip": 1.02940989, + "balance_loss_mlp": 1.04445267, + "epoch": 0.2631594769277018, + "flos": 15887958652800.0, + "grad_norm": 1.8646607453842572, + "language_loss": 0.69517326, + "learning_rate": 3.4563152757847026e-06, + "loss": 0.71697748, + "num_input_tokens_seen": 94611410, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8984375, + "step": 4377, + "time_per_iteration": 2.486117124557495 + }, + { + "auxiliary_loss_clip": 0.01134243, + "auxiliary_loss_mlp": 0.01044305, + "balance_loss_clip": 1.02786613, + "balance_loss_mlp": 1.04500914, + "epoch": 0.2632196001803698, + "flos": 50804943557760.0, + "grad_norm": 1.711844873276418, + "language_loss": 0.7866202, + "learning_rate": 3.4560483073610233e-06, + "loss": 0.80840576, + "num_input_tokens_seen": 94636575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4378, + "time_per_iteration": 2.7608227729797363 + }, + { + "auxiliary_loss_clip": 0.01133245, + "auxiliary_loss_mlp": 0.01045029, + "balance_loss_clip": 1.03000844, + "balance_loss_mlp": 1.04554546, + "epoch": 0.26327972343303774, + "flos": 13733031041280.0, + "grad_norm": 2.6216377344963004, + "language_loss": 0.76320505, + "learning_rate": 3.455781283723846e-06, + "loss": 0.78498781, + "num_input_tokens_seen": 94654345, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.875, + "step": 4379, + "time_per_iteration": 2.4329168796539307 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01041523, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.04633284, + "epoch": 0.2633398466857057, + "flos": 23769057732480.0, + "grad_norm": 2.3003567904549156, + "language_loss": 0.78237861, + "learning_rate": 3.4555142048832975e-06, + "loss": 0.8041752, + "num_input_tokens_seen": 94673985, + "router_z_loss_clip": 0.19042969, + "router_z_loss_mlp": 0.91796875, + "step": 4380, + "time_per_iteration": 2.5423548221588135 + }, + { + "auxiliary_loss_clip": 0.01135772, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02419698, + "balance_loss_mlp": 1.0444113, + "epoch": 0.26339996993837367, + "flos": 27600223380480.0, + "grad_norm": 2.288842357619654, + "language_loss": 0.63811409, + "learning_rate": 3.4552470708495036e-06, + "loss": 0.65987766, + "num_input_tokens_seen": 94693145, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4381, + "time_per_iteration": 2.5096213817596436 + }, + { + "auxiliary_loss_clip": 0.01131521, + "auxiliary_loss_mlp": 0.01037713, + "balance_loss_clip": 1.02148831, + "balance_loss_mlp": 1.04359913, + "epoch": 0.26346009319104163, + "flos": 16946317912320.0, + "grad_norm": 1.8729093449566216, + "language_loss": 0.82822418, + "learning_rate": 3.454979881632595e-06, + "loss": 0.84991652, + "num_input_tokens_seen": 94710185, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4382, + "time_per_iteration": 2.4691555500030518 + }, + { + "auxiliary_loss_clip": 0.01138155, + "auxiliary_loss_mlp": 0.010471, + "balance_loss_clip": 1.02902842, + "balance_loss_mlp": 1.04550982, + "epoch": 0.2635202164437096, + "flos": 37232218915200.0, + "grad_norm": 2.126733729537993, + "language_loss": 0.69686437, + "learning_rate": 3.4547126372427035e-06, + "loss": 0.71871686, + "num_input_tokens_seen": 94730280, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9296875, + "step": 4383, + "time_per_iteration": 2.5923891067504883 + }, + { + "auxiliary_loss_clip": 0.01135659, + "auxiliary_loss_mlp": 0.01042881, + "balance_loss_clip": 1.02732468, + "balance_loss_mlp": 1.04591441, + "epoch": 0.26358033969637756, + "flos": 20996359084800.0, + "grad_norm": 1.929045699346076, + "language_loss": 0.69191134, + "learning_rate": 3.4544453376899638e-06, + "loss": 0.71369672, + "num_input_tokens_seen": 94748560, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8984375, + "step": 4384, + "time_per_iteration": 2.5067081451416016 + }, + { + "auxiliary_loss_clip": 0.01132133, + "auxiliary_loss_mlp": 0.01039726, + "balance_loss_clip": 1.02319217, + "balance_loss_mlp": 1.04400492, + "epoch": 0.26364046294904553, + "flos": 27746092512000.0, + "grad_norm": 2.1647401570854075, + "language_loss": 0.6994158, + "learning_rate": 3.45417798298451e-06, + "loss": 0.72113448, + "num_input_tokens_seen": 94767570, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4385, + "time_per_iteration": 4.062510251998901 + }, + { + "auxiliary_loss_clip": 0.01138578, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02551472, + "balance_loss_mlp": 1.04978371, + "epoch": 0.2637005862017135, + "flos": 22893088757760.0, + "grad_norm": 2.0926426044309543, + "language_loss": 0.85188037, + "learning_rate": 3.453910573136482e-06, + "loss": 0.87369245, + "num_input_tokens_seen": 94784985, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4386, + "time_per_iteration": 3.9604547023773193 + }, + { + "auxiliary_loss_clip": 0.0113699, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02487707, + "balance_loss_mlp": 1.04755282, + "epoch": 0.26376070945438146, + "flos": 15048834053760.0, + "grad_norm": 2.2248904155103637, + "language_loss": 0.77169371, + "learning_rate": 3.4536431081560196e-06, + "loss": 0.79347688, + "num_input_tokens_seen": 94802545, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.89453125, + "step": 4387, + "time_per_iteration": 2.472367286682129 + }, + { + "auxiliary_loss_clip": 0.01137279, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.0305903, + "balance_loss_mlp": 1.04989982, + "epoch": 0.2638208327070494, + "flos": 21141833166720.0, + "grad_norm": 3.996041212149396, + "language_loss": 0.76269597, + "learning_rate": 3.453375588053264e-06, + "loss": 0.78453362, + "num_input_tokens_seen": 94820730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4388, + "time_per_iteration": 2.4858386516571045 + }, + { + "auxiliary_loss_clip": 0.01132552, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.01924086, + "balance_loss_mlp": 1.04387724, + "epoch": 0.26388095595971744, + "flos": 21725597001600.0, + "grad_norm": 1.9510825560869567, + "language_loss": 0.86210662, + "learning_rate": 3.4531080128383617e-06, + "loss": 0.88379163, + "num_input_tokens_seen": 94839175, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 4389, + "time_per_iteration": 2.508162260055542 + }, + { + "auxiliary_loss_clip": 0.0104392, + "auxiliary_loss_mlp": 0.01009323, + "balance_loss_clip": 1.00736833, + "balance_loss_mlp": 1.01341343, + "epoch": 0.2639410792123854, + "flos": 65515537192320.0, + "grad_norm": 0.8096176904924934, + "language_loss": 0.60333931, + "learning_rate": 3.452840382521457e-06, + "loss": 0.6238718, + "num_input_tokens_seen": 94898865, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3046875, + "step": 4390, + "time_per_iteration": 3.0593924522399902 + }, + { + "auxiliary_loss_clip": 0.01135834, + "auxiliary_loss_mlp": 0.01038563, + "balance_loss_clip": 1.02213633, + "balance_loss_mlp": 1.04522729, + "epoch": 0.2640012024650534, + "flos": 23948574929280.0, + "grad_norm": 1.7836890720002585, + "language_loss": 0.77702433, + "learning_rate": 3.4525726971127e-06, + "loss": 0.79876828, + "num_input_tokens_seen": 94917490, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4391, + "time_per_iteration": 2.5331051349639893 + }, + { + "auxiliary_loss_clip": 0.0104332, + "auxiliary_loss_mlp": 0.01003932, + "balance_loss_clip": 1.00221586, + "balance_loss_mlp": 1.01322889, + "epoch": 0.26406132571772134, + "flos": 56441163369600.0, + "grad_norm": 0.9020745061185262, + "language_loss": 0.58752227, + "learning_rate": 3.45230495662224e-06, + "loss": 0.60799479, + "num_input_tokens_seen": 94969065, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.30078125, + "step": 4392, + "time_per_iteration": 3.047438144683838 + }, + { + "auxiliary_loss_clip": 0.01140884, + "auxiliary_loss_mlp": 0.0104677, + "balance_loss_clip": 1.03039694, + "balance_loss_mlp": 1.04925656, + "epoch": 0.2641214489703893, + "flos": 22090557139200.0, + "grad_norm": 2.5811541881681697, + "language_loss": 0.68459845, + "learning_rate": 3.4520371610602306e-06, + "loss": 0.70647496, + "num_input_tokens_seen": 94988540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91796875, + "step": 4393, + "time_per_iteration": 2.5537288188934326 + }, + { + "auxiliary_loss_clip": 0.01139955, + "auxiliary_loss_mlp": 0.01040744, + "balance_loss_clip": 1.02258813, + "balance_loss_mlp": 1.04662204, + "epoch": 0.26418157222305727, + "flos": 16544764794240.0, + "grad_norm": 1.8702197697463565, + "language_loss": 0.83116519, + "learning_rate": 3.4517693104368267e-06, + "loss": 0.85297221, + "num_input_tokens_seen": 95004810, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.93359375, + "step": 4394, + "time_per_iteration": 2.421211004257202 + }, + { + "auxiliary_loss_clip": 0.01143407, + "auxiliary_loss_mlp": 0.01042347, + "balance_loss_clip": 1.02357125, + "balance_loss_mlp": 1.04951847, + "epoch": 0.26424169547572524, + "flos": 18002486442240.0, + "grad_norm": 2.049654769643576, + "language_loss": 0.70211649, + "learning_rate": 3.4515014047621856e-06, + "loss": 0.72397399, + "num_input_tokens_seen": 95024085, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.9375, + "step": 4395, + "time_per_iteration": 2.522111654281616 + }, + { + "auxiliary_loss_clip": 0.01135756, + "auxiliary_loss_mlp": 0.01035967, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.04784906, + "epoch": 0.2643018187283932, + "flos": 16983162288000.0, + "grad_norm": 1.822626622734132, + "language_loss": 0.86866504, + "learning_rate": 3.4512334440464655e-06, + "loss": 0.89038229, + "num_input_tokens_seen": 95042515, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4396, + "time_per_iteration": 2.4450392723083496 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01024145, + "balance_loss_clip": 1.02226114, + "balance_loss_mlp": 1.01312816, + "epoch": 0.26436194198106117, + "flos": 59664359416320.0, + "grad_norm": 0.7917805441344085, + "language_loss": 0.54999918, + "learning_rate": 3.4509654282998277e-06, + "loss": 0.57066846, + "num_input_tokens_seen": 95094835, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4397, + "time_per_iteration": 2.8438708782196045 + }, + { + "auxiliary_loss_clip": 0.01134821, + "auxiliary_loss_mlp": 0.01052373, + "balance_loss_clip": 1.03567195, + "balance_loss_mlp": 1.04701614, + "epoch": 0.26442206523372913, + "flos": 32921322197760.0, + "grad_norm": 2.0493441687219724, + "language_loss": 0.77840483, + "learning_rate": 3.450697357532435e-06, + "loss": 0.80027676, + "num_input_tokens_seen": 95113480, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4398, + "time_per_iteration": 2.562499523162842 + }, + { + "auxiliary_loss_clip": 0.01141073, + "auxiliary_loss_mlp": 0.01040123, + "balance_loss_clip": 1.02262306, + "balance_loss_mlp": 1.05005002, + "epoch": 0.2644821884863971, + "flos": 21031300039680.0, + "grad_norm": 2.041566803030235, + "language_loss": 0.67037976, + "learning_rate": 3.4504292317544534e-06, + "loss": 0.69219166, + "num_input_tokens_seen": 95132580, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.91015625, + "step": 4399, + "time_per_iteration": 2.487778663635254 + }, + { + "auxiliary_loss_clip": 0.01128661, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02288818, + "balance_loss_mlp": 1.04565811, + "epoch": 0.26454231173906506, + "flos": 20776801201920.0, + "grad_norm": 2.1160884119586303, + "language_loss": 0.86152196, + "learning_rate": 3.4501610509760504e-06, + "loss": 0.88318777, + "num_input_tokens_seen": 95152375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4400, + "time_per_iteration": 2.4837841987609863 + }, + { + "auxiliary_loss_clip": 0.01138875, + "auxiliary_loss_mlp": 0.01039625, + "balance_loss_clip": 1.02188635, + "balance_loss_mlp": 1.04813862, + "epoch": 0.264602434991733, + "flos": 16618669027200.0, + "grad_norm": 2.751022626956878, + "language_loss": 0.75779396, + "learning_rate": 3.4498928152073944e-06, + "loss": 0.77957898, + "num_input_tokens_seen": 95170265, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4401, + "time_per_iteration": 2.548297166824341 + }, + { + "auxiliary_loss_clip": 0.01138206, + "auxiliary_loss_mlp": 0.01050893, + "balance_loss_clip": 1.03236771, + "balance_loss_mlp": 1.04606974, + "epoch": 0.26466255824440105, + "flos": 19062677295360.0, + "grad_norm": 1.9215434150559794, + "language_loss": 0.88267732, + "learning_rate": 3.4496245244586577e-06, + "loss": 0.90456831, + "num_input_tokens_seen": 95188655, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4402, + "time_per_iteration": 2.4422647953033447 + }, + { + "auxiliary_loss_clip": 0.01135603, + "auxiliary_loss_mlp": 0.01048039, + "balance_loss_clip": 1.03151679, + "balance_loss_mlp": 1.04594266, + "epoch": 0.264722681497069, + "flos": 22638554006400.0, + "grad_norm": 1.8196807161845878, + "language_loss": 0.78123331, + "learning_rate": 3.4493561787400137e-06, + "loss": 0.80306977, + "num_input_tokens_seen": 95209615, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 4403, + "time_per_iteration": 2.587623357772827 + }, + { + "auxiliary_loss_clip": 0.0113528, + "auxiliary_loss_mlp": 0.01040463, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.04440784, + "epoch": 0.264782804749737, + "flos": 22492253911680.0, + "grad_norm": 1.9946669841411302, + "language_loss": 0.87767446, + "learning_rate": 3.4490877780616387e-06, + "loss": 0.89943182, + "num_input_tokens_seen": 95227810, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.91015625, + "step": 4404, + "time_per_iteration": 2.492913246154785 + }, + { + "auxiliary_loss_clip": 0.01138307, + "auxiliary_loss_mlp": 0.01036911, + "balance_loss_clip": 1.02106786, + "balance_loss_mlp": 1.04683399, + "epoch": 0.26484292800240494, + "flos": 16800269212800.0, + "grad_norm": 1.7395093434050468, + "language_loss": 0.7593658, + "learning_rate": 3.448819322433709e-06, + "loss": 0.78111804, + "num_input_tokens_seen": 95245890, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.9140625, + "step": 4405, + "time_per_iteration": 2.508970260620117 + }, + { + "auxiliary_loss_clip": 0.01138042, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02166891, + "balance_loss_mlp": 1.04870844, + "epoch": 0.2649030512550729, + "flos": 20449583280000.0, + "grad_norm": 1.9681610481113616, + "language_loss": 0.69979274, + "learning_rate": 3.4485508118664066e-06, + "loss": 0.72156149, + "num_input_tokens_seen": 95264955, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4406, + "time_per_iteration": 2.4548041820526123 + }, + { + "auxiliary_loss_clip": 0.01134971, + "auxiliary_loss_mlp": 0.01047688, + "balance_loss_clip": 1.03255999, + "balance_loss_mlp": 1.04781294, + "epoch": 0.2649631745077409, + "flos": 22416123035520.0, + "grad_norm": 1.7455123192469384, + "language_loss": 0.83764267, + "learning_rate": 3.448282246369912e-06, + "loss": 0.85946929, + "num_input_tokens_seen": 95284245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.875, + "step": 4407, + "time_per_iteration": 2.5359292030334473 + }, + { + "auxiliary_loss_clip": 0.01134967, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01566172, + "balance_loss_mlp": 1.04678226, + "epoch": 0.26502329776040884, + "flos": 35116110927360.0, + "grad_norm": 1.7942044569518307, + "language_loss": 0.76068008, + "learning_rate": 3.4480136259544084e-06, + "loss": 0.78235412, + "num_input_tokens_seen": 95307125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4408, + "time_per_iteration": 2.6124041080474854 + }, + { + "auxiliary_loss_clip": 0.011362, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.01832306, + "balance_loss_mlp": 1.04918611, + "epoch": 0.2650834210130768, + "flos": 38687498438400.0, + "grad_norm": 1.8724720588087471, + "language_loss": 0.70920485, + "learning_rate": 3.447744950630084e-06, + "loss": 0.73091388, + "num_input_tokens_seen": 95329150, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4409, + "time_per_iteration": 2.6539366245269775 + }, + { + "auxiliary_loss_clip": 0.01136441, + "auxiliary_loss_mlp": 0.01037034, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.04666233, + "epoch": 0.26514354426574477, + "flos": 24716847951360.0, + "grad_norm": 1.7884535623295956, + "language_loss": 0.73085511, + "learning_rate": 3.4474762204071253e-06, + "loss": 0.75258988, + "num_input_tokens_seen": 95349880, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8984375, + "step": 4410, + "time_per_iteration": 2.545083999633789 + }, + { + "auxiliary_loss_clip": 0.01139704, + "auxiliary_loss_mlp": 0.01049137, + "balance_loss_clip": 1.03218508, + "balance_loss_mlp": 1.04741001, + "epoch": 0.26520366751841273, + "flos": 20340055733760.0, + "grad_norm": 1.9280641145018393, + "language_loss": 0.73272175, + "learning_rate": 3.4472074352957244e-06, + "loss": 0.75461018, + "num_input_tokens_seen": 95368570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4411, + "time_per_iteration": 2.4818248748779297 + }, + { + "auxiliary_loss_clip": 0.01137094, + "auxiliary_loss_mlp": 0.01042757, + "balance_loss_clip": 1.02593684, + "balance_loss_mlp": 1.04815316, + "epoch": 0.2652637907710807, + "flos": 22343870828160.0, + "grad_norm": 2.073752901007566, + "language_loss": 0.82294202, + "learning_rate": 3.446938595306071e-06, + "loss": 0.84474051, + "num_input_tokens_seen": 95387065, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.88671875, + "step": 4412, + "time_per_iteration": 2.56634521484375 + }, + { + "auxiliary_loss_clip": 0.01134293, + "auxiliary_loss_mlp": 0.01047936, + "balance_loss_clip": 1.03161621, + "balance_loss_mlp": 1.04541004, + "epoch": 0.26532391402374866, + "flos": 19354235990400.0, + "grad_norm": 1.721718037322793, + "language_loss": 0.74245501, + "learning_rate": 3.4466697004483622e-06, + "loss": 0.76427728, + "num_input_tokens_seen": 95406345, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.890625, + "step": 4413, + "time_per_iteration": 2.4994029998779297 + }, + { + "auxiliary_loss_clip": 0.01046706, + "auxiliary_loss_mlp": 0.01018291, + "balance_loss_clip": 1.01659799, + "balance_loss_mlp": 1.0160358, + "epoch": 0.26538403727641663, + "flos": 44787611422080.0, + "grad_norm": 0.8825812455559224, + "language_loss": 0.56986731, + "learning_rate": 3.446400750732793e-06, + "loss": 0.59051728, + "num_input_tokens_seen": 95463595, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.30664062, + "step": 4414, + "time_per_iteration": 2.9884986877441406 + }, + { + "auxiliary_loss_clip": 0.01128281, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02605712, + "balance_loss_mlp": 1.04307461, + "epoch": 0.26544416052908465, + "flos": 28182119708160.0, + "grad_norm": 1.8727128035200367, + "language_loss": 0.74535894, + "learning_rate": 3.4461317461695625e-06, + "loss": 0.76705366, + "num_input_tokens_seen": 95484115, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4415, + "time_per_iteration": 2.5531253814697266 + }, + { + "auxiliary_loss_clip": 0.01138825, + "auxiliary_loss_mlp": 0.01043694, + "balance_loss_clip": 1.02506185, + "balance_loss_mlp": 1.04656732, + "epoch": 0.2655042837817526, + "flos": 17565274097280.0, + "grad_norm": 2.3504707987247917, + "language_loss": 0.86662048, + "learning_rate": 3.4458626867688707e-06, + "loss": 0.88844568, + "num_input_tokens_seen": 95501435, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4416, + "time_per_iteration": 2.4751384258270264 + }, + { + "auxiliary_loss_clip": 0.0113975, + "auxiliary_loss_mlp": 0.01042134, + "balance_loss_clip": 1.02439594, + "balance_loss_mlp": 1.0492208, + "epoch": 0.2655644070344206, + "flos": 23404636298880.0, + "grad_norm": 1.6281293305848954, + "language_loss": 0.76152384, + "learning_rate": 3.4455935725409217e-06, + "loss": 0.78334266, + "num_input_tokens_seen": 95520135, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.90625, + "step": 4417, + "time_per_iteration": 2.5017013549804688 + }, + { + "auxiliary_loss_clip": 0.01135215, + "auxiliary_loss_mlp": 0.01039785, + "balance_loss_clip": 1.02167702, + "balance_loss_mlp": 1.04778051, + "epoch": 0.26562453028708854, + "flos": 26468462678400.0, + "grad_norm": 1.7397383944852411, + "language_loss": 0.79984045, + "learning_rate": 3.4453244034959196e-06, + "loss": 0.82159042, + "num_input_tokens_seen": 95541705, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4418, + "time_per_iteration": 2.539454460144043 + }, + { + "auxiliary_loss_clip": 0.01138688, + "auxiliary_loss_mlp": 0.01046556, + "balance_loss_clip": 1.02983057, + "balance_loss_mlp": 1.04861307, + "epoch": 0.2656846535397565, + "flos": 19207576759680.0, + "grad_norm": 2.7780034581995965, + "language_loss": 0.67397833, + "learning_rate": 3.445055179644071e-06, + "loss": 0.69583082, + "num_input_tokens_seen": 95560300, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90234375, + "step": 4419, + "time_per_iteration": 2.461444616317749 + }, + { + "auxiliary_loss_clip": 0.01139197, + "auxiliary_loss_mlp": 0.01045382, + "balance_loss_clip": 1.02739358, + "balance_loss_mlp": 1.04920876, + "epoch": 0.2657447767924245, + "flos": 30551325903360.0, + "grad_norm": 2.097903587873874, + "language_loss": 0.79365611, + "learning_rate": 3.444785900995585e-06, + "loss": 0.81550193, + "num_input_tokens_seen": 95580150, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8984375, + "step": 4420, + "time_per_iteration": 2.5908427238464355 + }, + { + "auxiliary_loss_clip": 0.01141654, + "auxiliary_loss_mlp": 0.01049212, + "balance_loss_clip": 1.02990031, + "balance_loss_mlp": 1.0493983, + "epoch": 0.26580490004509244, + "flos": 20922742160640.0, + "grad_norm": 2.1223383047232933, + "language_loss": 0.81612432, + "learning_rate": 3.444516567560673e-06, + "loss": 0.83803296, + "num_input_tokens_seen": 95597570, + "router_z_loss_clip": 0.19335938, + "router_z_loss_mlp": 0.921875, + "step": 4421, + "time_per_iteration": 2.4869320392608643 + }, + { + "auxiliary_loss_clip": 0.01134642, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02027202, + "balance_loss_mlp": 1.04734015, + "epoch": 0.2658650232977604, + "flos": 43945682584320.0, + "grad_norm": 1.5724937400793966, + "language_loss": 0.65278006, + "learning_rate": 3.444247179349548e-06, + "loss": 0.67449689, + "num_input_tokens_seen": 95619415, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4422, + "time_per_iteration": 2.7370638847351074 + }, + { + "auxiliary_loss_clip": 0.01138513, + "auxiliary_loss_mlp": 0.01046085, + "balance_loss_clip": 1.02965808, + "balance_loss_mlp": 1.04750621, + "epoch": 0.26592514655042837, + "flos": 29716439109120.0, + "grad_norm": 2.411979213410041, + "language_loss": 0.73841226, + "learning_rate": 3.4439777363724252e-06, + "loss": 0.76025832, + "num_input_tokens_seen": 95639155, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.91015625, + "step": 4423, + "time_per_iteration": 2.5510191917419434 + }, + { + "auxiliary_loss_clip": 0.01136367, + "auxiliary_loss_mlp": 0.01046611, + "balance_loss_clip": 1.03017163, + "balance_loss_mlp": 1.04504442, + "epoch": 0.26598526980309634, + "flos": 46677730014720.0, + "grad_norm": 1.6317340067044743, + "language_loss": 0.77703154, + "learning_rate": 3.443708238639522e-06, + "loss": 0.79886127, + "num_input_tokens_seen": 95663320, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.9140625, + "step": 4424, + "time_per_iteration": 2.809495449066162 + }, + { + "auxiliary_loss_clip": 0.01137168, + "auxiliary_loss_mlp": 0.01043079, + "balance_loss_clip": 1.02675951, + "balance_loss_mlp": 1.04695249, + "epoch": 0.2660453930557643, + "flos": 11509442582400.0, + "grad_norm": 2.064218808714238, + "language_loss": 0.79345673, + "learning_rate": 3.4434386861610573e-06, + "loss": 0.81525922, + "num_input_tokens_seen": 95680260, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90234375, + "step": 4425, + "time_per_iteration": 2.48149037361145 + }, + { + "auxiliary_loss_clip": 0.01133425, + "auxiliary_loss_mlp": 0.01046814, + "balance_loss_clip": 1.03138816, + "balance_loss_mlp": 1.04685736, + "epoch": 0.26610551630843227, + "flos": 24791578197120.0, + "grad_norm": 1.774406296589384, + "language_loss": 0.80463314, + "learning_rate": 3.4431690789472532e-06, + "loss": 0.82643557, + "num_input_tokens_seen": 95701140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4426, + "time_per_iteration": 2.5968613624572754 + }, + { + "auxiliary_loss_clip": 0.01138948, + "auxiliary_loss_mlp": 0.0104835, + "balance_loss_clip": 1.03180957, + "balance_loss_mlp": 1.04982209, + "epoch": 0.26616563956110023, + "flos": 27636385397760.0, + "grad_norm": 1.8207507571493768, + "language_loss": 0.77337295, + "learning_rate": 3.442899417008333e-06, + "loss": 0.79524601, + "num_input_tokens_seen": 95722060, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4427, + "time_per_iteration": 4.045380353927612 + }, + { + "auxiliary_loss_clip": 0.01133558, + "auxiliary_loss_mlp": 0.010331, + "balance_loss_clip": 1.01760316, + "balance_loss_mlp": 1.04737306, + "epoch": 0.26622576281376825, + "flos": 28362893880960.0, + "grad_norm": 1.8400253790543033, + "language_loss": 0.76800078, + "learning_rate": 3.4426297003545227e-06, + "loss": 0.78966737, + "num_input_tokens_seen": 95742495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4428, + "time_per_iteration": 4.018831491470337 + }, + { + "auxiliary_loss_clip": 0.01135115, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.01858354, + "balance_loss_mlp": 1.04529297, + "epoch": 0.2662858860664362, + "flos": 18041341979520.0, + "grad_norm": 1.9075878866801723, + "language_loss": 0.83010298, + "learning_rate": 3.4423599289960495e-06, + "loss": 0.8517977, + "num_input_tokens_seen": 95761510, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8984375, + "step": 4429, + "time_per_iteration": 2.576535940170288 + }, + { + "auxiliary_loss_clip": 0.01133677, + "auxiliary_loss_mlp": 0.01042932, + "balance_loss_clip": 1.02644563, + "balance_loss_mlp": 1.04664719, + "epoch": 0.2663460093191042, + "flos": 22745818995840.0, + "grad_norm": 3.2197583620662082, + "language_loss": 0.72143924, + "learning_rate": 3.442090102943143e-06, + "loss": 0.74320537, + "num_input_tokens_seen": 95782385, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87109375, + "step": 4430, + "time_per_iteration": 2.5262365341186523 + }, + { + "auxiliary_loss_clip": 0.01136153, + "auxiliary_loss_mlp": 0.01042808, + "balance_loss_clip": 1.02453375, + "balance_loss_mlp": 1.04667306, + "epoch": 0.26640613257177215, + "flos": 16508782344960.0, + "grad_norm": 2.382555523964676, + "language_loss": 0.81635833, + "learning_rate": 3.441820222206035e-06, + "loss": 0.83814788, + "num_input_tokens_seen": 95800595, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4431, + "time_per_iteration": 2.5135624408721924 + }, + { + "auxiliary_loss_clip": 0.01142285, + "auxiliary_loss_mlp": 0.01050952, + "balance_loss_clip": 1.03360736, + "balance_loss_mlp": 1.04865289, + "epoch": 0.2664662558244401, + "flos": 23075945919360.0, + "grad_norm": 2.34486467491615, + "language_loss": 0.76153386, + "learning_rate": 3.44155028679496e-06, + "loss": 0.78346616, + "num_input_tokens_seen": 95818480, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.9375, + "step": 4432, + "time_per_iteration": 2.469515562057495 + }, + { + "auxiliary_loss_clip": 0.01136779, + "auxiliary_loss_mlp": 0.01044676, + "balance_loss_clip": 1.02711606, + "balance_loss_mlp": 1.04703665, + "epoch": 0.2665263790771081, + "flos": 23769273214080.0, + "grad_norm": 2.148919041496035, + "language_loss": 0.82521772, + "learning_rate": 3.441280296720154e-06, + "loss": 0.84703225, + "num_input_tokens_seen": 95837205, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4433, + "time_per_iteration": 2.540174961090088 + }, + { + "auxiliary_loss_clip": 0.01138849, + "auxiliary_loss_mlp": 0.01048222, + "balance_loss_clip": 1.03065097, + "balance_loss_mlp": 1.04955435, + "epoch": 0.26658650232977604, + "flos": 28001273708160.0, + "grad_norm": 2.091984027516481, + "language_loss": 0.76638913, + "learning_rate": 3.441010251991854e-06, + "loss": 0.78825986, + "num_input_tokens_seen": 95858395, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4434, + "time_per_iteration": 2.549769878387451 + }, + { + "auxiliary_loss_clip": 0.01133542, + "auxiliary_loss_mlp": 0.01043118, + "balance_loss_clip": 1.02770376, + "balance_loss_mlp": 1.04645348, + "epoch": 0.266646625582444, + "flos": 22163635359360.0, + "grad_norm": 2.251252650424801, + "language_loss": 0.82632279, + "learning_rate": 3.440740152620301e-06, + "loss": 0.84808934, + "num_input_tokens_seen": 95877875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 4435, + "time_per_iteration": 2.5329744815826416 + }, + { + "auxiliary_loss_clip": 0.01140704, + "auxiliary_loss_mlp": 0.0105698, + "balance_loss_clip": 1.03870487, + "balance_loss_mlp": 1.04742312, + "epoch": 0.266706748835112, + "flos": 27853537069440.0, + "grad_norm": 2.2611652281579397, + "language_loss": 0.87278962, + "learning_rate": 3.4404699986157376e-06, + "loss": 0.89476645, + "num_input_tokens_seen": 95895820, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.9296875, + "step": 4436, + "time_per_iteration": 2.5375254154205322 + }, + { + "auxiliary_loss_clip": 0.01136328, + "auxiliary_loss_mlp": 0.0104305, + "balance_loss_clip": 1.02670658, + "balance_loss_mlp": 1.04566383, + "epoch": 0.26676687208777994, + "flos": 25812123413760.0, + "grad_norm": 1.4304916595737875, + "language_loss": 0.78941, + "learning_rate": 3.440199789988407e-06, + "loss": 0.81120378, + "num_input_tokens_seen": 95918025, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4437, + "time_per_iteration": 2.591017007827759 + }, + { + "auxiliary_loss_clip": 0.01134502, + "auxiliary_loss_mlp": 0.01041567, + "balance_loss_clip": 1.02533066, + "balance_loss_mlp": 1.04595256, + "epoch": 0.2668269953404479, + "flos": 36064583504640.0, + "grad_norm": 2.0731379310987412, + "language_loss": 0.63412011, + "learning_rate": 3.439929526748556e-06, + "loss": 0.65588087, + "num_input_tokens_seen": 95937725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4438, + "time_per_iteration": 2.6429452896118164 + }, + { + "auxiliary_loss_clip": 0.01137556, + "auxiliary_loss_mlp": 0.01036987, + "balance_loss_clip": 1.02125144, + "balance_loss_mlp": 1.04869223, + "epoch": 0.26688711859311587, + "flos": 26570987072640.0, + "grad_norm": 1.8133794638407341, + "language_loss": 0.75628942, + "learning_rate": 3.4396592089064334e-06, + "loss": 0.77803481, + "num_input_tokens_seen": 95956335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4439, + "time_per_iteration": 2.5296032428741455 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01038527, + "balance_loss_clip": 1.02052629, + "balance_loss_mlp": 1.04913759, + "epoch": 0.26694724184578383, + "flos": 26761565658240.0, + "grad_norm": 1.7792140134846064, + "language_loss": 0.71444011, + "learning_rate": 3.4393888364722897e-06, + "loss": 0.7362318, + "num_input_tokens_seen": 95977135, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.9140625, + "step": 4440, + "time_per_iteration": 2.5714335441589355 + }, + { + "auxiliary_loss_clip": 0.01139576, + "auxiliary_loss_mlp": 0.01045623, + "balance_loss_clip": 1.02757502, + "balance_loss_mlp": 1.04816949, + "epoch": 0.2670073650984518, + "flos": 20959586536320.0, + "grad_norm": 1.8363906583736056, + "language_loss": 0.66291904, + "learning_rate": 3.439118409456376e-06, + "loss": 0.68477106, + "num_input_tokens_seen": 95995435, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.9140625, + "step": 4441, + "time_per_iteration": 2.522589683532715 + }, + { + "auxiliary_loss_clip": 0.01137665, + "auxiliary_loss_mlp": 0.0104418, + "balance_loss_clip": 1.02654862, + "balance_loss_mlp": 1.04803538, + "epoch": 0.2670674883511198, + "flos": 28366054277760.0, + "grad_norm": 1.5597318548365904, + "language_loss": 0.76451373, + "learning_rate": 3.4388479278689486e-06, + "loss": 0.78633213, + "num_input_tokens_seen": 96016340, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.89453125, + "step": 4442, + "time_per_iteration": 2.5659492015838623 + }, + { + "auxiliary_loss_clip": 0.01060214, + "auxiliary_loss_mlp": 0.0100059, + "balance_loss_clip": 0.99855101, + "balance_loss_mlp": 1.02895594, + "epoch": 0.2671276116037878, + "flos": 58971319430400.0, + "grad_norm": 0.912864167592289, + "language_loss": 0.61270142, + "learning_rate": 3.4385773917202637e-06, + "loss": 0.63330936, + "num_input_tokens_seen": 96071205, + "router_z_loss_clip": 0.02038574, + "router_z_loss_mlp": 0.3125, + "step": 4443, + "time_per_iteration": 3.0256776809692383 + }, + { + "auxiliary_loss_clip": 0.01140806, + "auxiliary_loss_mlp": 0.01035952, + "balance_loss_clip": 1.01968026, + "balance_loss_mlp": 1.0495882, + "epoch": 0.26718773485645575, + "flos": 43945072053120.0, + "grad_norm": 1.5525166591100914, + "language_loss": 0.76200545, + "learning_rate": 3.4383068010205793e-06, + "loss": 0.78377306, + "num_input_tokens_seen": 96094240, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.91015625, + "step": 4444, + "time_per_iteration": 2.7414674758911133 + }, + { + "auxiliary_loss_clip": 0.0114013, + "auxiliary_loss_mlp": 0.01040836, + "balance_loss_clip": 1.02330077, + "balance_loss_mlp": 1.04932773, + "epoch": 0.2672478581091237, + "flos": 25228323665280.0, + "grad_norm": 3.16165776963455, + "language_loss": 0.80212528, + "learning_rate": 3.438036155780158e-06, + "loss": 0.82393491, + "num_input_tokens_seen": 96114105, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4445, + "time_per_iteration": 2.5349111557006836 + }, + { + "auxiliary_loss_clip": 0.01139166, + "auxiliary_loss_mlp": 0.0104, + "balance_loss_clip": 1.02232134, + "balance_loss_mlp": 1.04797101, + "epoch": 0.2673079813617917, + "flos": 15268176455040.0, + "grad_norm": 2.3952290716593825, + "language_loss": 0.89144397, + "learning_rate": 3.43776545600926e-06, + "loss": 0.91323566, + "num_input_tokens_seen": 96132140, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.91015625, + "step": 4446, + "time_per_iteration": 2.5512521266937256 + }, + { + "auxiliary_loss_clip": 0.01140462, + "auxiliary_loss_mlp": 0.01047593, + "balance_loss_clip": 1.0311892, + "balance_loss_mlp": 1.04977763, + "epoch": 0.26736810461445965, + "flos": 25812733944960.0, + "grad_norm": 1.831363923725005, + "language_loss": 0.68259656, + "learning_rate": 3.437494701718153e-06, + "loss": 0.70447719, + "num_input_tokens_seen": 96152090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.90625, + "step": 4447, + "time_per_iteration": 2.5752837657928467 + }, + { + "auxiliary_loss_clip": 0.01140858, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02261138, + "balance_loss_mlp": 1.04972827, + "epoch": 0.2674282278671276, + "flos": 24312709054080.0, + "grad_norm": 1.9862084341014827, + "language_loss": 0.82976532, + "learning_rate": 3.4372238929171026e-06, + "loss": 0.85157394, + "num_input_tokens_seen": 96170015, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4448, + "time_per_iteration": 2.6524059772491455 + }, + { + "auxiliary_loss_clip": 0.01137667, + "auxiliary_loss_mlp": 0.01048508, + "balance_loss_clip": 1.03110301, + "balance_loss_mlp": 1.04973495, + "epoch": 0.2674883511197956, + "flos": 22815521337600.0, + "grad_norm": 2.185461436072074, + "language_loss": 0.84288895, + "learning_rate": 3.436953029616378e-06, + "loss": 0.86475068, + "num_input_tokens_seen": 96188065, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87890625, + "step": 4449, + "time_per_iteration": 2.5167598724365234 + }, + { + "auxiliary_loss_clip": 0.01148403, + "auxiliary_loss_mlp": 0.01047832, + "balance_loss_clip": 1.02892506, + "balance_loss_mlp": 1.05114913, + "epoch": 0.26754847437246354, + "flos": 25370170473600.0, + "grad_norm": 1.9936425417360089, + "language_loss": 0.84260273, + "learning_rate": 3.4366821118262506e-06, + "loss": 0.86456501, + "num_input_tokens_seen": 96205780, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.97265625, + "step": 4450, + "time_per_iteration": 2.555941343307495 + }, + { + "auxiliary_loss_clip": 0.01133946, + "auxiliary_loss_mlp": 0.0104095, + "balance_loss_clip": 1.02560782, + "balance_loss_mlp": 1.04674196, + "epoch": 0.2676085976251315, + "flos": 20230420446720.0, + "grad_norm": 1.900524277018137, + "language_loss": 0.81065774, + "learning_rate": 3.4364111395569937e-06, + "loss": 0.83240664, + "num_input_tokens_seen": 96224990, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4451, + "time_per_iteration": 2.5289859771728516 + }, + { + "auxiliary_loss_clip": 0.01140947, + "auxiliary_loss_mlp": 0.01041834, + "balance_loss_clip": 1.02593148, + "balance_loss_mlp": 1.05186319, + "epoch": 0.26766872087779947, + "flos": 28038225824640.0, + "grad_norm": 1.8040621200757803, + "language_loss": 0.86401796, + "learning_rate": 3.436140112818882e-06, + "loss": 0.88584578, + "num_input_tokens_seen": 96245345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.890625, + "step": 4452, + "time_per_iteration": 2.617918014526367 + }, + { + "auxiliary_loss_clip": 0.01143372, + "auxiliary_loss_mlp": 0.01037743, + "balance_loss_clip": 1.02055311, + "balance_loss_mlp": 1.05132198, + "epoch": 0.26772884413046744, + "flos": 18325179250560.0, + "grad_norm": 1.9731948573099198, + "language_loss": 0.83129871, + "learning_rate": 3.435869031622194e-06, + "loss": 0.8531099, + "num_input_tokens_seen": 96259000, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4453, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.0113897, + "auxiliary_loss_mlp": 0.01046975, + "balance_loss_clip": 1.02936745, + "balance_loss_mlp": 1.04995108, + "epoch": 0.2677889673831354, + "flos": 22127509255680.0, + "grad_norm": 1.62656613015929, + "language_loss": 0.79744816, + "learning_rate": 3.435597895977208e-06, + "loss": 0.81930768, + "num_input_tokens_seen": 96277000, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.890625, + "step": 4454, + "time_per_iteration": 2.537853717803955 + }, + { + "auxiliary_loss_clip": 0.01141821, + "auxiliary_loss_mlp": 0.01042652, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04989707, + "epoch": 0.2678490906358034, + "flos": 23729699404800.0, + "grad_norm": 1.7640316216704761, + "language_loss": 0.7215519, + "learning_rate": 3.435326705894206e-06, + "loss": 0.74339664, + "num_input_tokens_seen": 96297010, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.91796875, + "step": 4455, + "time_per_iteration": 2.5023562908172607 + }, + { + "auxiliary_loss_clip": 0.01137457, + "auxiliary_loss_mlp": 0.01039805, + "balance_loss_clip": 1.02406991, + "balance_loss_mlp": 1.05066276, + "epoch": 0.2679092138884714, + "flos": 21762872340480.0, + "grad_norm": 1.5496021720121687, + "language_loss": 0.74044335, + "learning_rate": 3.435055461383471e-06, + "loss": 0.76221603, + "num_input_tokens_seen": 96315780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4456, + "time_per_iteration": 2.487581729888916 + }, + { + "auxiliary_loss_clip": 0.01141742, + "auxiliary_loss_mlp": 0.01038216, + "balance_loss_clip": 1.02121687, + "balance_loss_mlp": 1.04937947, + "epoch": 0.26796933714113935, + "flos": 19861186590720.0, + "grad_norm": 2.2089309948453697, + "language_loss": 0.70965469, + "learning_rate": 3.4347841624552896e-06, + "loss": 0.73145425, + "num_input_tokens_seen": 96333465, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.921875, + "step": 4457, + "time_per_iteration": 2.4584691524505615 + }, + { + "auxiliary_loss_clip": 0.01143072, + "auxiliary_loss_mlp": 0.010439, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.05237103, + "epoch": 0.2680294603938073, + "flos": 20047886507520.0, + "grad_norm": 2.29797460876898, + "language_loss": 0.79029202, + "learning_rate": 3.4345128091199493e-06, + "loss": 0.81216174, + "num_input_tokens_seen": 96352005, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90625, + "step": 4458, + "time_per_iteration": 2.6079578399658203 + }, + { + "auxiliary_loss_clip": 0.01052787, + "auxiliary_loss_mlp": 0.01006207, + "balance_loss_clip": 1.00439513, + "balance_loss_mlp": 1.02259135, + "epoch": 0.2680895836464753, + "flos": 72113763052800.0, + "grad_norm": 0.8640508796264214, + "language_loss": 0.58716619, + "learning_rate": 3.434241401387739e-06, + "loss": 0.60775614, + "num_input_tokens_seen": 96406265, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.30078125, + "step": 4459, + "time_per_iteration": 3.0725412368774414 + }, + { + "auxiliary_loss_clip": 0.0113409, + "auxiliary_loss_mlp": 0.01040081, + "balance_loss_clip": 1.02444053, + "balance_loss_mlp": 1.04671741, + "epoch": 0.26814970689914325, + "flos": 20449044576000.0, + "grad_norm": 2.0778557825519055, + "language_loss": 0.85224575, + "learning_rate": 3.4339699392689507e-06, + "loss": 0.87398744, + "num_input_tokens_seen": 96425225, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4460, + "time_per_iteration": 2.483299732208252 + }, + { + "auxiliary_loss_clip": 0.01136074, + "auxiliary_loss_mlp": 0.01043042, + "balance_loss_clip": 1.02653205, + "balance_loss_mlp": 1.04752469, + "epoch": 0.2682098301518112, + "flos": 17566674727680.0, + "grad_norm": 2.805871571962145, + "language_loss": 0.68256581, + "learning_rate": 3.4336984227738796e-06, + "loss": 0.70435691, + "num_input_tokens_seen": 96443780, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 4461, + "time_per_iteration": 2.439304828643799 + }, + { + "auxiliary_loss_clip": 0.01135713, + "auxiliary_loss_mlp": 0.01049055, + "balance_loss_clip": 1.03198409, + "balance_loss_mlp": 1.0470686, + "epoch": 0.2682699534044792, + "flos": 18333259810560.0, + "grad_norm": 1.5557483279788171, + "language_loss": 0.67342007, + "learning_rate": 3.43342685191282e-06, + "loss": 0.69526774, + "num_input_tokens_seen": 96464530, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4462, + "time_per_iteration": 2.5081140995025635 + }, + { + "auxiliary_loss_clip": 0.01136996, + "auxiliary_loss_mlp": 0.01041529, + "balance_loss_clip": 1.02413619, + "balance_loss_mlp": 1.04865909, + "epoch": 0.26833007665714714, + "flos": 25301294144640.0, + "grad_norm": 1.8707784514564991, + "language_loss": 0.6927141, + "learning_rate": 3.4331552266960705e-06, + "loss": 0.71449935, + "num_input_tokens_seen": 96483345, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4463, + "time_per_iteration": 2.5280556678771973 + }, + { + "auxiliary_loss_clip": 0.01140107, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02414584, + "balance_loss_mlp": 1.04812574, + "epoch": 0.2683901999098151, + "flos": 16099759198080.0, + "grad_norm": 2.4976114648735304, + "language_loss": 0.77389008, + "learning_rate": 3.432883547133931e-06, + "loss": 0.79570508, + "num_input_tokens_seen": 96498305, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 4464, + "time_per_iteration": 2.469650983810425 + }, + { + "auxiliary_loss_clip": 0.01134508, + "auxiliary_loss_mlp": 0.01039425, + "balance_loss_clip": 1.02215123, + "balance_loss_mlp": 1.0458076, + "epoch": 0.2684503231624831, + "flos": 27308054154240.0, + "grad_norm": 1.844577670487785, + "language_loss": 0.70796561, + "learning_rate": 3.432611813236704e-06, + "loss": 0.72970498, + "num_input_tokens_seen": 96519740, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 4465, + "time_per_iteration": 2.5685060024261475 + }, + { + "auxiliary_loss_clip": 0.01049569, + "auxiliary_loss_mlp": 0.0100238, + "balance_loss_clip": 1.00067484, + "balance_loss_mlp": 1.01956284, + "epoch": 0.26851044641515104, + "flos": 71858007239040.0, + "grad_norm": 0.6800540965400289, + "language_loss": 0.53096056, + "learning_rate": 3.4323400250146943e-06, + "loss": 0.55148005, + "num_input_tokens_seen": 96588870, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.30078125, + "step": 4466, + "time_per_iteration": 3.2327654361724854 + }, + { + "auxiliary_loss_clip": 0.01133624, + "auxiliary_loss_mlp": 0.01039482, + "balance_loss_clip": 1.02219653, + "balance_loss_mlp": 1.04600596, + "epoch": 0.268570569667819, + "flos": 18733771434240.0, + "grad_norm": 2.0764143418179213, + "language_loss": 0.7343837, + "learning_rate": 3.4320681824782057e-06, + "loss": 0.75611472, + "num_input_tokens_seen": 96605100, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4467, + "time_per_iteration": 2.5052013397216797 + }, + { + "auxiliary_loss_clip": 0.01138792, + "auxiliary_loss_mlp": 0.01045438, + "balance_loss_clip": 1.0278548, + "balance_loss_mlp": 1.04801464, + "epoch": 0.268630692920487, + "flos": 18178376365440.0, + "grad_norm": 2.5834152956256555, + "language_loss": 0.80703115, + "learning_rate": 3.4317962856375493e-06, + "loss": 0.82887346, + "num_input_tokens_seen": 96621410, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4468, + "time_per_iteration": 2.4547622203826904 + }, + { + "auxiliary_loss_clip": 0.01047735, + "auxiliary_loss_mlp": 0.01005617, + "balance_loss_clip": 1.00407946, + "balance_loss_mlp": 1.01768315, + "epoch": 0.268690816173155, + "flos": 68731768978560.0, + "grad_norm": 0.8449159500606429, + "language_loss": 0.59532088, + "learning_rate": 3.4315243345030334e-06, + "loss": 0.61585438, + "num_input_tokens_seen": 96684810, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.30078125, + "step": 4469, + "time_per_iteration": 4.6310715675354 + }, + { + "auxiliary_loss_clip": 0.01137988, + "auxiliary_loss_mlp": 0.01045172, + "balance_loss_clip": 1.02687383, + "balance_loss_mlp": 1.04844749, + "epoch": 0.26875093942582295, + "flos": 23293636295040.0, + "grad_norm": 2.3316897890333954, + "language_loss": 0.81785607, + "learning_rate": 3.431252329084972e-06, + "loss": 0.83968771, + "num_input_tokens_seen": 96701920, + "router_z_loss_clip": 0.18261719, + "router_z_loss_mlp": 0.89453125, + "step": 4470, + "time_per_iteration": 2.5501935482025146 + }, + { + "auxiliary_loss_clip": 0.01129268, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.02091098, + "balance_loss_mlp": 1.04484963, + "epoch": 0.2688110626784909, + "flos": 21543458112000.0, + "grad_norm": 1.6194658793917844, + "language_loss": 0.82648492, + "learning_rate": 3.4309802693936786e-06, + "loss": 0.84815365, + "num_input_tokens_seen": 96721260, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 4471, + "time_per_iteration": 2.559220552444458 + }, + { + "auxiliary_loss_clip": 0.0113472, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02042806, + "balance_loss_mlp": 1.04853129, + "epoch": 0.2688711859311589, + "flos": 28400600183040.0, + "grad_norm": 8.458966217412893, + "language_loss": 0.69382554, + "learning_rate": 3.43070815543947e-06, + "loss": 0.71553975, + "num_input_tokens_seen": 96740385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 4472, + "time_per_iteration": 2.561326742172241 + }, + { + "auxiliary_loss_clip": 0.01135298, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02045035, + "balance_loss_mlp": 1.04783702, + "epoch": 0.26893130918382685, + "flos": 25994944661760.0, + "grad_norm": 1.596928542569954, + "language_loss": 0.67870784, + "learning_rate": 3.4304359872326656e-06, + "loss": 0.70042771, + "num_input_tokens_seen": 96761860, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4473, + "time_per_iteration": 2.5437636375427246 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01044607, + "balance_loss_clip": 1.02844238, + "balance_loss_mlp": 1.04768729, + "epoch": 0.2689914324364948, + "flos": 20339624770560.0, + "grad_norm": 1.8504576821316179, + "language_loss": 0.82971931, + "learning_rate": 3.4301637647835843e-06, + "loss": 0.85149777, + "num_input_tokens_seen": 96781890, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4474, + "time_per_iteration": 2.474095582962036 + }, + { + "auxiliary_loss_clip": 0.01132567, + "auxiliary_loss_mlp": 0.01046818, + "balance_loss_clip": 1.03042698, + "balance_loss_mlp": 1.04697323, + "epoch": 0.2690515556891628, + "flos": 19464553635840.0, + "grad_norm": 2.0689967373005977, + "language_loss": 0.70303237, + "learning_rate": 3.4298914881025494e-06, + "loss": 0.72482622, + "num_input_tokens_seen": 96800390, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.85546875, + "step": 4475, + "time_per_iteration": 2.4865996837615967 + }, + { + "auxiliary_loss_clip": 0.01135068, + "auxiliary_loss_mlp": 0.01040112, + "balance_loss_clip": 1.02335167, + "balance_loss_mlp": 1.04614162, + "epoch": 0.26911167894183075, + "flos": 18146631720960.0, + "grad_norm": 1.7721029234489851, + "language_loss": 0.73711979, + "learning_rate": 3.4296191571998863e-06, + "loss": 0.75887156, + "num_input_tokens_seen": 96816685, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.890625, + "step": 4476, + "time_per_iteration": 2.477308988571167 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01040281, + "balance_loss_clip": 1.02456927, + "balance_loss_mlp": 1.04561102, + "epoch": 0.2691718021944987, + "flos": 19975131509760.0, + "grad_norm": 1.720914514753409, + "language_loss": 0.80110955, + "learning_rate": 3.429346772085922e-06, + "loss": 0.8228178, + "num_input_tokens_seen": 96836285, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4477, + "time_per_iteration": 2.497809648513794 + }, + { + "auxiliary_loss_clip": 0.01133072, + "auxiliary_loss_mlp": 0.01042879, + "balance_loss_clip": 1.02578449, + "balance_loss_mlp": 1.04442573, + "epoch": 0.2692319254471667, + "flos": 37447215770880.0, + "grad_norm": 1.9038830637231319, + "language_loss": 0.64580482, + "learning_rate": 3.429074332770984e-06, + "loss": 0.66756433, + "num_input_tokens_seen": 96857745, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4478, + "time_per_iteration": 2.6485564708709717 + }, + { + "auxiliary_loss_clip": 0.01130767, + "auxiliary_loss_mlp": 0.0104511, + "balance_loss_clip": 1.02876592, + "balance_loss_mlp": 1.04380882, + "epoch": 0.26929204869983464, + "flos": 22127796564480.0, + "grad_norm": 1.8571100614964546, + "language_loss": 0.80653036, + "learning_rate": 3.4288018392654047e-06, + "loss": 0.82828909, + "num_input_tokens_seen": 96877295, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4479, + "time_per_iteration": 2.4851014614105225 + }, + { + "auxiliary_loss_clip": 0.01135761, + "auxiliary_loss_mlp": 0.01043964, + "balance_loss_clip": 1.02725112, + "balance_loss_mlp": 1.04611528, + "epoch": 0.2693521719525026, + "flos": 19792813052160.0, + "grad_norm": 2.4630797167742458, + "language_loss": 0.80834484, + "learning_rate": 3.4285292915795166e-06, + "loss": 0.83014214, + "num_input_tokens_seen": 96896160, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4480, + "time_per_iteration": 2.490147590637207 + }, + { + "auxiliary_loss_clip": 0.01124775, + "auxiliary_loss_mlp": 0.01036236, + "balance_loss_clip": 1.02066684, + "balance_loss_mlp": 1.04153395, + "epoch": 0.2694122952051706, + "flos": 20994383836800.0, + "grad_norm": 1.7677898796301312, + "language_loss": 0.77612787, + "learning_rate": 3.4282566897236543e-06, + "loss": 0.79773796, + "num_input_tokens_seen": 96915410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 4481, + "time_per_iteration": 2.4699158668518066 + }, + { + "auxiliary_loss_clip": 0.01134279, + "auxiliary_loss_mlp": 0.01044694, + "balance_loss_clip": 1.02737296, + "balance_loss_mlp": 1.04591584, + "epoch": 0.2694724184578386, + "flos": 25849291011840.0, + "grad_norm": 2.5981026313468525, + "language_loss": 0.74701524, + "learning_rate": 3.4279840337081547e-06, + "loss": 0.76880491, + "num_input_tokens_seen": 96937865, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4482, + "time_per_iteration": 2.556087017059326 + }, + { + "auxiliary_loss_clip": 0.01135034, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02198792, + "balance_loss_mlp": 1.04693186, + "epoch": 0.26953254171050656, + "flos": 21726961718400.0, + "grad_norm": 1.852738059166697, + "language_loss": 0.72176206, + "learning_rate": 3.4277113235433584e-06, + "loss": 0.74350333, + "num_input_tokens_seen": 96957710, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4483, + "time_per_iteration": 2.4762344360351562 + }, + { + "auxiliary_loss_clip": 0.01133416, + "auxiliary_loss_mlp": 0.01043511, + "balance_loss_clip": 1.02635717, + "balance_loss_mlp": 1.04290676, + "epoch": 0.2695926649631745, + "flos": 19682926369920.0, + "grad_norm": 2.626283812761087, + "language_loss": 0.87107188, + "learning_rate": 3.427438559239605e-06, + "loss": 0.8928411, + "num_input_tokens_seen": 96975890, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4484, + "time_per_iteration": 2.486185073852539 + }, + { + "auxiliary_loss_clip": 0.01131969, + "auxiliary_loss_mlp": 0.01040339, + "balance_loss_clip": 1.02447212, + "balance_loss_mlp": 1.04373026, + "epoch": 0.2696527882158425, + "flos": 32886596724480.0, + "grad_norm": 1.901905407661022, + "language_loss": 0.66389644, + "learning_rate": 3.427165740807239e-06, + "loss": 0.68561947, + "num_input_tokens_seen": 96998595, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8828125, + "step": 4485, + "time_per_iteration": 2.5674586296081543 + }, + { + "auxiliary_loss_clip": 0.01133447, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02371132, + "balance_loss_mlp": 1.0445261, + "epoch": 0.26971291146851045, + "flos": 12124843320960.0, + "grad_norm": 2.8933932068842783, + "language_loss": 0.72378826, + "learning_rate": 3.426892868256604e-06, + "loss": 0.74552536, + "num_input_tokens_seen": 97013715, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4486, + "time_per_iteration": 2.471036434173584 + }, + { + "auxiliary_loss_clip": 0.01137696, + "auxiliary_loss_mlp": 0.01038547, + "balance_loss_clip": 1.02257311, + "balance_loss_mlp": 1.04809284, + "epoch": 0.2697730347211784, + "flos": 22634459856000.0, + "grad_norm": 1.8546648123058087, + "language_loss": 0.83810318, + "learning_rate": 3.4266199415980495e-06, + "loss": 0.85986561, + "num_input_tokens_seen": 97031570, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8984375, + "step": 4487, + "time_per_iteration": 2.4867916107177734 + }, + { + "auxiliary_loss_clip": 0.01137573, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02749646, + "balance_loss_mlp": 1.0477773, + "epoch": 0.2698331579738464, + "flos": 23513050523520.0, + "grad_norm": 2.2079504028023598, + "language_loss": 0.71220767, + "learning_rate": 3.4263469608419234e-06, + "loss": 0.73403245, + "num_input_tokens_seen": 97049815, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4488, + "time_per_iteration": 2.5174567699432373 + }, + { + "auxiliary_loss_clip": 0.01136886, + "auxiliary_loss_mlp": 0.01045434, + "balance_loss_clip": 1.02851868, + "balance_loss_mlp": 1.04792523, + "epoch": 0.26989328122651435, + "flos": 24641040297600.0, + "grad_norm": 1.6338784898376273, + "language_loss": 0.83736706, + "learning_rate": 3.426073925998578e-06, + "loss": 0.85919023, + "num_input_tokens_seen": 97067570, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4489, + "time_per_iteration": 2.5314295291900635 + }, + { + "auxiliary_loss_clip": 0.01136964, + "auxiliary_loss_mlp": 0.01054545, + "balance_loss_clip": 1.03696203, + "balance_loss_mlp": 1.04693484, + "epoch": 0.2699534044791823, + "flos": 10772555068800.0, + "grad_norm": 2.5551945574509176, + "language_loss": 0.89805245, + "learning_rate": 3.4258008370783656e-06, + "loss": 0.91996753, + "num_input_tokens_seen": 97082180, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4490, + "time_per_iteration": 2.4975826740264893 + }, + { + "auxiliary_loss_clip": 0.01128305, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02741122, + "balance_loss_mlp": 1.04349554, + "epoch": 0.2700135277318503, + "flos": 36171597098880.0, + "grad_norm": 1.8455290723250308, + "language_loss": 0.73354411, + "learning_rate": 3.4255276940916434e-06, + "loss": 0.75525427, + "num_input_tokens_seen": 97103470, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 4491, + "time_per_iteration": 2.6303470134735107 + }, + { + "auxiliary_loss_clip": 0.01138617, + "auxiliary_loss_mlp": 0.01043046, + "balance_loss_clip": 1.02613568, + "balance_loss_mlp": 1.04974079, + "epoch": 0.27007365098451824, + "flos": 17418614866560.0, + "grad_norm": 3.089516252272487, + "language_loss": 0.74379975, + "learning_rate": 3.4252544970487676e-06, + "loss": 0.7656163, + "num_input_tokens_seen": 97118100, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.890625, + "step": 4492, + "time_per_iteration": 2.5124619007110596 + }, + { + "auxiliary_loss_clip": 0.01133231, + "auxiliary_loss_mlp": 0.01040234, + "balance_loss_clip": 1.0241406, + "balance_loss_mlp": 1.04671812, + "epoch": 0.2701337742371862, + "flos": 23185688947200.0, + "grad_norm": 1.896651323252439, + "language_loss": 0.88740528, + "learning_rate": 3.4249812459600986e-06, + "loss": 0.90913987, + "num_input_tokens_seen": 97136765, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 4493, + "time_per_iteration": 2.480473756790161 + }, + { + "auxiliary_loss_clip": 0.01134006, + "auxiliary_loss_mlp": 0.01041715, + "balance_loss_clip": 1.02564538, + "balance_loss_mlp": 1.04676843, + "epoch": 0.2701938974898542, + "flos": 24389450461440.0, + "grad_norm": 1.468971775969503, + "language_loss": 0.70976114, + "learning_rate": 3.424707940835998e-06, + "loss": 0.73151839, + "num_input_tokens_seen": 97157470, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4494, + "time_per_iteration": 2.5703446865081787 + }, + { + "auxiliary_loss_clip": 0.0112953, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.01920152, + "balance_loss_mlp": 1.04545951, + "epoch": 0.2702540207425222, + "flos": 26214322976640.0, + "grad_norm": 2.0322990364449325, + "language_loss": 0.86294192, + "learning_rate": 3.42443458168683e-06, + "loss": 0.88457918, + "num_input_tokens_seen": 97176905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4495, + "time_per_iteration": 2.5428457260131836 + }, + { + "auxiliary_loss_clip": 0.01134, + "auxiliary_loss_mlp": 0.0104559, + "balance_loss_clip": 1.02968764, + "balance_loss_mlp": 1.04731214, + "epoch": 0.27031414399519016, + "flos": 22926377687040.0, + "grad_norm": 1.8698467905293557, + "language_loss": 0.76562083, + "learning_rate": 3.424161168522959e-06, + "loss": 0.7874167, + "num_input_tokens_seen": 97196380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4496, + "time_per_iteration": 2.5074446201324463 + }, + { + "auxiliary_loss_clip": 0.01048323, + "auxiliary_loss_mlp": 0.01012102, + "balance_loss_clip": 1.01042128, + "balance_loss_mlp": 1.01925802, + "epoch": 0.2703742672478581, + "flos": 63019780404480.0, + "grad_norm": 0.7221920911850954, + "language_loss": 0.50221699, + "learning_rate": 3.423887701354754e-06, + "loss": 0.52282125, + "num_input_tokens_seen": 97260100, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2890625, + "step": 4497, + "time_per_iteration": 3.110724687576294 + }, + { + "auxiliary_loss_clip": 0.01137008, + "auxiliary_loss_mlp": 0.01045622, + "balance_loss_clip": 1.03011322, + "balance_loss_mlp": 1.05020094, + "epoch": 0.2704343905005261, + "flos": 18840820942080.0, + "grad_norm": 1.6519561002314052, + "language_loss": 0.72420043, + "learning_rate": 3.4236141801925847e-06, + "loss": 0.74602675, + "num_input_tokens_seen": 97277935, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 4498, + "time_per_iteration": 2.522507429122925 + }, + { + "auxiliary_loss_clip": 0.01047265, + "auxiliary_loss_mlp": 0.0100549, + "balance_loss_clip": 1.0038569, + "balance_loss_mlp": 1.0182879, + "epoch": 0.27049451375319405, + "flos": 71233412618880.0, + "grad_norm": 0.7584910907853958, + "language_loss": 0.59222841, + "learning_rate": 3.4233406050468237e-06, + "loss": 0.61275595, + "num_input_tokens_seen": 97338845, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2890625, + "step": 4499, + "time_per_iteration": 3.1193060874938965 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02085209, + "balance_loss_mlp": 1.04637063, + "epoch": 0.270554637005862, + "flos": 24278594112000.0, + "grad_norm": 2.0468109740969576, + "language_loss": 0.7361812, + "learning_rate": 3.4230669759278438e-06, + "loss": 0.75787735, + "num_input_tokens_seen": 97356640, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.86328125, + "step": 4500, + "time_per_iteration": 2.5073533058166504 + }, + { + "auxiliary_loss_clip": 0.01130893, + "auxiliary_loss_mlp": 0.01044299, + "balance_loss_clip": 1.02739513, + "balance_loss_mlp": 1.04379177, + "epoch": 0.27061476025853, + "flos": 17632318832640.0, + "grad_norm": 3.2528800155878765, + "language_loss": 0.80392325, + "learning_rate": 3.4227932928460215e-06, + "loss": 0.82567519, + "num_input_tokens_seen": 97372585, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4501, + "time_per_iteration": 2.4665989875793457 + }, + { + "auxiliary_loss_clip": 0.01134872, + "auxiliary_loss_mlp": 0.01044198, + "balance_loss_clip": 1.0278666, + "balance_loss_mlp": 1.04683352, + "epoch": 0.27067488351119795, + "flos": 22710123855360.0, + "grad_norm": 1.9148884605164396, + "language_loss": 0.72832727, + "learning_rate": 3.422519555811735e-06, + "loss": 0.75011796, + "num_input_tokens_seen": 97393315, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4502, + "time_per_iteration": 2.511070489883423 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01038575, + "balance_loss_clip": 1.0209558, + "balance_loss_mlp": 1.04282784, + "epoch": 0.2707350067638659, + "flos": 41719616087040.0, + "grad_norm": 1.724044037192685, + "language_loss": 0.68474984, + "learning_rate": 3.4222457648353642e-06, + "loss": 0.70647895, + "num_input_tokens_seen": 97417860, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.9140625, + "step": 4503, + "time_per_iteration": 2.6554527282714844 + }, + { + "auxiliary_loss_clip": 0.01133759, + "auxiliary_loss_mlp": 0.01040282, + "balance_loss_clip": 1.02425468, + "balance_loss_mlp": 1.04659927, + "epoch": 0.2707951300165339, + "flos": 20193037367040.0, + "grad_norm": 2.0245220791315655, + "language_loss": 0.68488902, + "learning_rate": 3.4219719199272918e-06, + "loss": 0.7066294, + "num_input_tokens_seen": 97436780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4504, + "time_per_iteration": 2.4813036918640137 + }, + { + "auxiliary_loss_clip": 0.01135516, + "auxiliary_loss_mlp": 0.01043406, + "balance_loss_clip": 1.02811766, + "balance_loss_mlp": 1.05043292, + "epoch": 0.27085525326920185, + "flos": 21433966479360.0, + "grad_norm": 1.7616188880043606, + "language_loss": 0.75553012, + "learning_rate": 3.421698021097902e-06, + "loss": 0.77731931, + "num_input_tokens_seen": 97456190, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4505, + "time_per_iteration": 2.482228994369507 + }, + { + "auxiliary_loss_clip": 0.01138199, + "auxiliary_loss_mlp": 0.01049925, + "balance_loss_clip": 1.03271127, + "balance_loss_mlp": 1.047171, + "epoch": 0.2709153765218698, + "flos": 17675232606720.0, + "grad_norm": 1.8888030992954683, + "language_loss": 0.73508286, + "learning_rate": 3.42142406835758e-06, + "loss": 0.75696409, + "num_input_tokens_seen": 97474545, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.9140625, + "step": 4506, + "time_per_iteration": 2.493534803390503 + }, + { + "auxiliary_loss_clip": 0.01137077, + "auxiliary_loss_mlp": 0.01040919, + "balance_loss_clip": 1.02390218, + "balance_loss_mlp": 1.04818904, + "epoch": 0.2709754997745378, + "flos": 24456243801600.0, + "grad_norm": 2.012438120988393, + "language_loss": 0.80958861, + "learning_rate": 3.421150061716715e-06, + "loss": 0.83136857, + "num_input_tokens_seen": 97494520, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4507, + "time_per_iteration": 2.488477945327759 + }, + { + "auxiliary_loss_clip": 0.01046128, + "auxiliary_loss_mlp": 0.01011944, + "balance_loss_clip": 1.0102514, + "balance_loss_mlp": 1.01738429, + "epoch": 0.2710356230272058, + "flos": 65210798206080.0, + "grad_norm": 0.7384209784394716, + "language_loss": 0.50892401, + "learning_rate": 3.420876001185698e-06, + "loss": 0.52950472, + "num_input_tokens_seen": 97552455, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.28710938, + "step": 4508, + "time_per_iteration": 3.005894660949707 + }, + { + "auxiliary_loss_clip": 0.01129132, + "auxiliary_loss_mlp": 0.01039667, + "balance_loss_clip": 1.02413416, + "balance_loss_mlp": 1.04509401, + "epoch": 0.27109574627987376, + "flos": 25484438615040.0, + "grad_norm": 4.914093534195162, + "language_loss": 0.74373507, + "learning_rate": 3.4206018867749197e-06, + "loss": 0.76542306, + "num_input_tokens_seen": 97572650, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4509, + "time_per_iteration": 2.555645227432251 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01039629, + "balance_loss_clip": 1.02418542, + "balance_loss_mlp": 1.04368544, + "epoch": 0.2711558695325417, + "flos": 19682782715520.0, + "grad_norm": 1.7859895301291084, + "language_loss": 0.71706283, + "learning_rate": 3.4203277184947757e-06, + "loss": 0.73872381, + "num_input_tokens_seen": 97591150, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4510, + "time_per_iteration": 2.469756841659546 + }, + { + "auxiliary_loss_clip": 0.01133239, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.01921451, + "balance_loss_mlp": 1.04728365, + "epoch": 0.2712159927852097, + "flos": 18587758648320.0, + "grad_norm": 4.171230322312489, + "language_loss": 0.70698422, + "learning_rate": 3.4200534963556627e-06, + "loss": 0.72866517, + "num_input_tokens_seen": 97607410, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 4511, + "time_per_iteration": 3.9261832237243652 + }, + { + "auxiliary_loss_clip": 0.01133865, + "auxiliary_loss_mlp": 0.01043141, + "balance_loss_clip": 1.02660656, + "balance_loss_mlp": 1.04600286, + "epoch": 0.27127611603787766, + "flos": 25630235919360.0, + "grad_norm": 2.0859148079323564, + "language_loss": 0.80823237, + "learning_rate": 3.419779220367979e-06, + "loss": 0.83000243, + "num_input_tokens_seen": 97626870, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4512, + "time_per_iteration": 2.5112404823303223 + }, + { + "auxiliary_loss_clip": 0.01128916, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02108788, + "balance_loss_mlp": 1.04543233, + "epoch": 0.2713362392905456, + "flos": 23148952312320.0, + "grad_norm": 1.880665339674376, + "language_loss": 0.80508482, + "learning_rate": 3.419504890542124e-06, + "loss": 0.82672697, + "num_input_tokens_seen": 97646595, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8359375, + "step": 4513, + "time_per_iteration": 2.5550525188446045 + }, + { + "auxiliary_loss_clip": 0.01132709, + "auxiliary_loss_mlp": 0.01042049, + "balance_loss_clip": 1.02668297, + "balance_loss_mlp": 1.04505134, + "epoch": 0.2713963625432136, + "flos": 18366045949440.0, + "grad_norm": 1.8883190176483522, + "language_loss": 0.88062817, + "learning_rate": 3.4192305068885026e-06, + "loss": 0.90237576, + "num_input_tokens_seen": 97665485, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4514, + "time_per_iteration": 2.4411823749542236 + }, + { + "auxiliary_loss_clip": 0.0113378, + "auxiliary_loss_mlp": 0.01041006, + "balance_loss_clip": 1.02475166, + "balance_loss_mlp": 1.04799736, + "epoch": 0.27145648579588155, + "flos": 22491751121280.0, + "grad_norm": 2.468440108941068, + "language_loss": 0.92064375, + "learning_rate": 3.418956069417517e-06, + "loss": 0.94239157, + "num_input_tokens_seen": 97683800, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4515, + "time_per_iteration": 2.507073402404785 + }, + { + "auxiliary_loss_clip": 0.01140812, + "auxiliary_loss_mlp": 0.01050656, + "balance_loss_clip": 1.03202391, + "balance_loss_mlp": 1.04952395, + "epoch": 0.2715166090485495, + "flos": 19239177749760.0, + "grad_norm": 2.5869205534481017, + "language_loss": 0.73691195, + "learning_rate": 3.4186815781395756e-06, + "loss": 0.75882661, + "num_input_tokens_seen": 97700505, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.9140625, + "step": 4516, + "time_per_iteration": 2.4427852630615234 + }, + { + "auxiliary_loss_clip": 0.01134153, + "auxiliary_loss_mlp": 0.01040166, + "balance_loss_clip": 1.02352417, + "balance_loss_mlp": 1.0466857, + "epoch": 0.2715767323012175, + "flos": 17709598944000.0, + "grad_norm": 6.588152355110397, + "language_loss": 0.76239699, + "learning_rate": 3.4184070330650866e-06, + "loss": 0.78414017, + "num_input_tokens_seen": 97717410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4517, + "time_per_iteration": 2.4891836643218994 + }, + { + "auxiliary_loss_clip": 0.01133662, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02201402, + "balance_loss_mlp": 1.0473218, + "epoch": 0.27163685555388545, + "flos": 22382834106240.0, + "grad_norm": 2.2012309941627066, + "language_loss": 0.76785064, + "learning_rate": 3.4181324342044607e-06, + "loss": 0.78957808, + "num_input_tokens_seen": 97734545, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4518, + "time_per_iteration": 2.503117561340332 + }, + { + "auxiliary_loss_clip": 0.01133735, + "auxiliary_loss_mlp": 0.01039304, + "balance_loss_clip": 1.0241586, + "balance_loss_mlp": 1.04699707, + "epoch": 0.2716969788065534, + "flos": 22346708002560.0, + "grad_norm": 1.6415373198141725, + "language_loss": 0.68314338, + "learning_rate": 3.41785778156811e-06, + "loss": 0.7048738, + "num_input_tokens_seen": 97754000, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4519, + "time_per_iteration": 2.573230028152466 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01034398, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04631245, + "epoch": 0.2717571020592214, + "flos": 25228467319680.0, + "grad_norm": 2.6734918677628685, + "language_loss": 0.755759, + "learning_rate": 3.417583075166451e-06, + "loss": 0.7774297, + "num_input_tokens_seen": 97772080, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 4520, + "time_per_iteration": 2.535546064376831 + }, + { + "auxiliary_loss_clip": 0.01138716, + "auxiliary_loss_mlp": 0.01044302, + "balance_loss_clip": 1.02628946, + "balance_loss_mlp": 1.0501039, + "epoch": 0.2718172253118894, + "flos": 20189769229440.0, + "grad_norm": 2.5201661256644523, + "language_loss": 0.76219606, + "learning_rate": 3.4173083150099e-06, + "loss": 0.78402621, + "num_input_tokens_seen": 97789370, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4521, + "time_per_iteration": 2.491654396057129 + }, + { + "auxiliary_loss_clip": 0.01137284, + "auxiliary_loss_mlp": 0.01047464, + "balance_loss_clip": 1.03102481, + "balance_loss_mlp": 1.04803133, + "epoch": 0.27187734856455736, + "flos": 14319129260160.0, + "grad_norm": 2.3970894391693967, + "language_loss": 0.75911158, + "learning_rate": 3.417033501108875e-06, + "loss": 0.78095901, + "num_input_tokens_seen": 97807385, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 4522, + "time_per_iteration": 2.471673011779785 + }, + { + "auxiliary_loss_clip": 0.01137707, + "auxiliary_loss_mlp": 0.0103702, + "balance_loss_clip": 1.02042627, + "balance_loss_mlp": 1.04873872, + "epoch": 0.27193747181722533, + "flos": 21107682311040.0, + "grad_norm": 5.0666434109354075, + "language_loss": 0.72895801, + "learning_rate": 3.416758633473798e-06, + "loss": 0.75070536, + "num_input_tokens_seen": 97827930, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4523, + "time_per_iteration": 2.5152363777160645 + }, + { + "auxiliary_loss_clip": 0.01129262, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02208352, + "balance_loss_mlp": 1.04448104, + "epoch": 0.2719975950698933, + "flos": 19682782715520.0, + "grad_norm": 1.5338044020439772, + "language_loss": 0.74324989, + "learning_rate": 3.4164837121150915e-06, + "loss": 0.76492846, + "num_input_tokens_seen": 97847440, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 4524, + "time_per_iteration": 2.495253562927246 + }, + { + "auxiliary_loss_clip": 0.01135118, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02380621, + "balance_loss_mlp": 1.04772878, + "epoch": 0.27205771832256126, + "flos": 24754482426240.0, + "grad_norm": 2.881398237919427, + "language_loss": 0.76651889, + "learning_rate": 3.4162087370431803e-06, + "loss": 0.78826964, + "num_input_tokens_seen": 97867620, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4525, + "time_per_iteration": 2.511634111404419 + }, + { + "auxiliary_loss_clip": 0.01131035, + "auxiliary_loss_mlp": 0.01049235, + "balance_loss_clip": 1.0334518, + "balance_loss_mlp": 1.04626358, + "epoch": 0.2721178415752292, + "flos": 21755581879680.0, + "grad_norm": 1.8599028556429251, + "language_loss": 0.81914634, + "learning_rate": 3.4159337082684926e-06, + "loss": 0.84094906, + "num_input_tokens_seen": 97884345, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4526, + "time_per_iteration": 2.495011568069458 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01044775, + "balance_loss_clip": 1.02770483, + "balance_loss_mlp": 1.0466783, + "epoch": 0.2721779648278972, + "flos": 12676826597760.0, + "grad_norm": 3.313629745591453, + "language_loss": 0.77007318, + "learning_rate": 3.4156586258014566e-06, + "loss": 0.79190063, + "num_input_tokens_seen": 97901500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.9140625, + "step": 4527, + "time_per_iteration": 2.5181260108947754 + }, + { + "auxiliary_loss_clip": 0.0113407, + "auxiliary_loss_mlp": 0.01041797, + "balance_loss_clip": 1.02496481, + "balance_loss_mlp": 1.04637635, + "epoch": 0.27223808808056515, + "flos": 16253206099200.0, + "grad_norm": 2.1845797146290784, + "language_loss": 0.81825048, + "learning_rate": 3.415383489652503e-06, + "loss": 0.84000921, + "num_input_tokens_seen": 97917800, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4528, + "time_per_iteration": 2.469916582107544 + }, + { + "auxiliary_loss_clip": 0.01131576, + "auxiliary_loss_mlp": 0.01042667, + "balance_loss_clip": 1.0273608, + "balance_loss_mlp": 1.04669189, + "epoch": 0.2722982113332331, + "flos": 27745805203200.0, + "grad_norm": 1.6672454466706952, + "language_loss": 0.77123594, + "learning_rate": 3.4151082998320666e-06, + "loss": 0.79297841, + "num_input_tokens_seen": 97937225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4529, + "time_per_iteration": 2.5379140377044678 + }, + { + "auxiliary_loss_clip": 0.01133862, + "auxiliary_loss_mlp": 0.01044178, + "balance_loss_clip": 1.02900243, + "balance_loss_mlp": 1.04580855, + "epoch": 0.2723583345859011, + "flos": 21726243446400.0, + "grad_norm": 2.4153957329893228, + "language_loss": 0.8195889, + "learning_rate": 3.4148330563505805e-06, + "loss": 0.84136933, + "num_input_tokens_seen": 97956845, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8828125, + "step": 4530, + "time_per_iteration": 2.5363659858703613 + }, + { + "auxiliary_loss_clip": 0.01133042, + "auxiliary_loss_mlp": 0.01036315, + "balance_loss_clip": 1.02010226, + "balance_loss_mlp": 1.04630172, + "epoch": 0.27241845783856905, + "flos": 17347260499200.0, + "grad_norm": 2.1797176655983432, + "language_loss": 0.91650689, + "learning_rate": 3.4145577592184838e-06, + "loss": 0.93820047, + "num_input_tokens_seen": 97972465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4531, + "time_per_iteration": 2.508429765701294 + }, + { + "auxiliary_loss_clip": 0.01134833, + "auxiliary_loss_mlp": 0.01047772, + "balance_loss_clip": 1.03159511, + "balance_loss_mlp": 1.04611766, + "epoch": 0.272478581091237, + "flos": 24754302858240.0, + "grad_norm": 2.532443443519077, + "language_loss": 0.76107466, + "learning_rate": 3.4142824084462155e-06, + "loss": 0.78290069, + "num_input_tokens_seen": 97990770, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.88671875, + "step": 4532, + "time_per_iteration": 2.499457359313965 + }, + { + "auxiliary_loss_clip": 0.01130352, + "auxiliary_loss_mlp": 0.01034139, + "balance_loss_clip": 1.01861846, + "balance_loss_mlp": 1.04643464, + "epoch": 0.272538704343905, + "flos": 17890624512000.0, + "grad_norm": 3.1928401528407746, + "language_loss": 0.89197671, + "learning_rate": 3.4140070040442162e-06, + "loss": 0.91362166, + "num_input_tokens_seen": 98005775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4533, + "time_per_iteration": 2.508202075958252 + }, + { + "auxiliary_loss_clip": 0.0113001, + "auxiliary_loss_mlp": 0.01037014, + "balance_loss_clip": 1.02118278, + "balance_loss_mlp": 1.04587626, + "epoch": 0.272598827596573, + "flos": 22932016122240.0, + "grad_norm": 2.096334750916122, + "language_loss": 0.7125262, + "learning_rate": 3.413731546022929e-06, + "loss": 0.73419642, + "num_input_tokens_seen": 98025750, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4534, + "time_per_iteration": 2.5111024379730225 + }, + { + "auxiliary_loss_clip": 0.01134655, + "auxiliary_loss_mlp": 0.01040405, + "balance_loss_clip": 1.02315593, + "balance_loss_mlp": 1.04651427, + "epoch": 0.27265895084924097, + "flos": 24238409771520.0, + "grad_norm": 1.9613498766130548, + "language_loss": 0.91064882, + "learning_rate": 3.4134560343928005e-06, + "loss": 0.93239939, + "num_input_tokens_seen": 98044955, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 4535, + "time_per_iteration": 2.5509371757507324 + }, + { + "auxiliary_loss_clip": 0.01138846, + "auxiliary_loss_mlp": 0.01039245, + "balance_loss_clip": 1.02262712, + "balance_loss_mlp": 1.05108571, + "epoch": 0.27271907410190893, + "flos": 27013155494400.0, + "grad_norm": 1.5906078149456282, + "language_loss": 0.72618866, + "learning_rate": 3.4131804691642778e-06, + "loss": 0.74796963, + "num_input_tokens_seen": 98065860, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4536, + "time_per_iteration": 2.5106241703033447 + }, + { + "auxiliary_loss_clip": 0.01133436, + "auxiliary_loss_mlp": 0.0103976, + "balance_loss_clip": 1.02302337, + "balance_loss_mlp": 1.04617631, + "epoch": 0.2727791973545769, + "flos": 34452588942720.0, + "grad_norm": 1.839444357786457, + "language_loss": 0.7144469, + "learning_rate": 3.41290485034781e-06, + "loss": 0.73617887, + "num_input_tokens_seen": 98085450, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4537, + "time_per_iteration": 2.588439464569092 + }, + { + "auxiliary_loss_clip": 0.01132537, + "auxiliary_loss_mlp": 0.01040014, + "balance_loss_clip": 1.02363503, + "balance_loss_mlp": 1.04501796, + "epoch": 0.27283932060724486, + "flos": 15041723160960.0, + "grad_norm": 2.431092364938405, + "language_loss": 0.78177559, + "learning_rate": 3.4126291779538485e-06, + "loss": 0.80350113, + "num_input_tokens_seen": 98099115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 4538, + "time_per_iteration": 2.438603639602661 + }, + { + "auxiliary_loss_clip": 0.01134265, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02609527, + "balance_loss_mlp": 1.04698634, + "epoch": 0.2728994438599128, + "flos": 21652411040640.0, + "grad_norm": 1.4794812227008705, + "language_loss": 0.90038705, + "learning_rate": 3.412353451992847e-06, + "loss": 0.92214489, + "num_input_tokens_seen": 98118415, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4539, + "time_per_iteration": 2.5052709579467773 + }, + { + "auxiliary_loss_clip": 0.01132202, + "auxiliary_loss_mlp": 0.01041987, + "balance_loss_clip": 1.02414095, + "balance_loss_mlp": 1.04627967, + "epoch": 0.2729595671125808, + "flos": 17488424949120.0, + "grad_norm": 2.0712338481270884, + "language_loss": 0.88711655, + "learning_rate": 3.4120776724752607e-06, + "loss": 0.90885842, + "num_input_tokens_seen": 98136300, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.859375, + "step": 4540, + "time_per_iteration": 2.457939624786377 + }, + { + "auxiliary_loss_clip": 0.01133918, + "auxiliary_loss_mlp": 0.01033711, + "balance_loss_clip": 1.01771343, + "balance_loss_mlp": 1.04666936, + "epoch": 0.27301969036524876, + "flos": 19318145800320.0, + "grad_norm": 1.9363402300433894, + "language_loss": 0.81993663, + "learning_rate": 3.4118018394115476e-06, + "loss": 0.84161294, + "num_input_tokens_seen": 98154580, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4541, + "time_per_iteration": 2.461517333984375 + }, + { + "auxiliary_loss_clip": 0.01133224, + "auxiliary_loss_mlp": 0.01041774, + "balance_loss_clip": 1.02484596, + "balance_loss_mlp": 1.04623377, + "epoch": 0.2730798136179167, + "flos": 21065666376960.0, + "grad_norm": 1.8882731025231656, + "language_loss": 0.7925449, + "learning_rate": 3.4115259528121678e-06, + "loss": 0.81429487, + "num_input_tokens_seen": 98173115, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 4542, + "time_per_iteration": 2.487905979156494 + }, + { + "auxiliary_loss_clip": 0.01136186, + "auxiliary_loss_mlp": 0.01040722, + "balance_loss_clip": 1.02441418, + "balance_loss_mlp": 1.04965162, + "epoch": 0.2731399368705847, + "flos": 19171737964800.0, + "grad_norm": 2.197105758262293, + "language_loss": 0.89471424, + "learning_rate": 3.411250012687582e-06, + "loss": 0.91648328, + "num_input_tokens_seen": 98190260, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4543, + "time_per_iteration": 2.4903039932250977 + }, + { + "auxiliary_loss_clip": 0.01137887, + "auxiliary_loss_mlp": 0.01046974, + "balance_loss_clip": 1.02955735, + "balance_loss_mlp": 1.04841042, + "epoch": 0.27320006012325265, + "flos": 18290130554880.0, + "grad_norm": 2.084938235366164, + "language_loss": 0.63666493, + "learning_rate": 3.410974019048255e-06, + "loss": 0.65851355, + "num_input_tokens_seen": 98207115, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.89453125, + "step": 4544, + "time_per_iteration": 2.4529080390930176 + }, + { + "auxiliary_loss_clip": 0.01137894, + "auxiliary_loss_mlp": 0.01047722, + "balance_loss_clip": 1.03043687, + "balance_loss_mlp": 1.05032265, + "epoch": 0.2732601833759206, + "flos": 34860929731200.0, + "grad_norm": 1.5170655618085727, + "language_loss": 0.6996637, + "learning_rate": 3.410697971904651e-06, + "loss": 0.72151983, + "num_input_tokens_seen": 98230610, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.875, + "step": 4545, + "time_per_iteration": 2.6089117527008057 + }, + { + "auxiliary_loss_clip": 0.01048793, + "auxiliary_loss_mlp": 0.01019944, + "balance_loss_clip": 1.01828671, + "balance_loss_mlp": 1.01938868, + "epoch": 0.2733203066285886, + "flos": 53910824762880.0, + "grad_norm": 0.7273987605446792, + "language_loss": 0.61571473, + "learning_rate": 3.4104218712672383e-06, + "loss": 0.63640207, + "num_input_tokens_seen": 98293585, + "router_z_loss_clip": 0.01660156, + "router_z_loss_mlp": 0.29296875, + "step": 4546, + "time_per_iteration": 3.1125431060791016 + }, + { + "auxiliary_loss_clip": 0.01136724, + "auxiliary_loss_mlp": 0.0104828, + "balance_loss_clip": 1.03199649, + "balance_loss_mlp": 1.05012798, + "epoch": 0.2733804298812566, + "flos": 20660378244480.0, + "grad_norm": 1.9369682323358774, + "language_loss": 0.64982706, + "learning_rate": 3.410145717146488e-06, + "loss": 0.67167711, + "num_input_tokens_seen": 98311680, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 4547, + "time_per_iteration": 2.497563600540161 + }, + { + "auxiliary_loss_clip": 0.01132998, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.0262835, + "balance_loss_mlp": 1.04765081, + "epoch": 0.27344055313392457, + "flos": 25884339707520.0, + "grad_norm": 2.2377196076559183, + "language_loss": 0.77178854, + "learning_rate": 3.4098695095528694e-06, + "loss": 0.7935344, + "num_input_tokens_seen": 98330770, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 4548, + "time_per_iteration": 2.536813259124756 + }, + { + "auxiliary_loss_clip": 0.01133984, + "auxiliary_loss_mlp": 0.01043122, + "balance_loss_clip": 1.02854848, + "balance_loss_mlp": 1.04827595, + "epoch": 0.27350067638659253, + "flos": 22929753565440.0, + "grad_norm": 1.8894391736419274, + "language_loss": 0.82382214, + "learning_rate": 3.4095932484968585e-06, + "loss": 0.84559321, + "num_input_tokens_seen": 98349860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 4549, + "time_per_iteration": 2.5156633853912354 + }, + { + "auxiliary_loss_clip": 0.01132691, + "auxiliary_loss_mlp": 0.0104484, + "balance_loss_clip": 1.02744722, + "balance_loss_mlp": 1.04482448, + "epoch": 0.2735607996392605, + "flos": 16574821499520.0, + "grad_norm": 2.2209993145005793, + "language_loss": 0.70675868, + "learning_rate": 3.4093169339889305e-06, + "loss": 0.72853404, + "num_input_tokens_seen": 98367040, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.875, + "step": 4550, + "time_per_iteration": 2.4510462284088135 + }, + { + "auxiliary_loss_clip": 0.0113302, + "auxiliary_loss_mlp": 0.01046908, + "balance_loss_clip": 1.03272784, + "balance_loss_mlp": 1.04789186, + "epoch": 0.27362092289192846, + "flos": 19645291895040.0, + "grad_norm": 2.43111621366583, + "language_loss": 0.78738058, + "learning_rate": 3.409040566039563e-06, + "loss": 0.80917984, + "num_input_tokens_seen": 98384010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8515625, + "step": 4551, + "time_per_iteration": 2.470520496368408 + }, + { + "auxiliary_loss_clip": 0.01132621, + "auxiliary_loss_mlp": 0.01051474, + "balance_loss_clip": 1.03548765, + "balance_loss_mlp": 1.04601097, + "epoch": 0.27368104614459643, + "flos": 17639142416640.0, + "grad_norm": 2.681171335598487, + "language_loss": 0.70585275, + "learning_rate": 3.4087641446592362e-06, + "loss": 0.72769368, + "num_input_tokens_seen": 98399625, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4552, + "time_per_iteration": 3.9179859161376953 + }, + { + "auxiliary_loss_clip": 0.01135382, + "auxiliary_loss_mlp": 0.01046031, + "balance_loss_clip": 1.02936506, + "balance_loss_mlp": 1.04864776, + "epoch": 0.2737411693972644, + "flos": 21580015178880.0, + "grad_norm": 2.3865688662341005, + "language_loss": 0.71857619, + "learning_rate": 3.408487669858431e-06, + "loss": 0.7403903, + "num_input_tokens_seen": 98417310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 4553, + "time_per_iteration": 4.032766342163086 + }, + { + "auxiliary_loss_clip": 0.01131855, + "auxiliary_loss_mlp": 0.01044919, + "balance_loss_clip": 1.02853942, + "balance_loss_mlp": 1.04585433, + "epoch": 0.27380129264993236, + "flos": 25484043565440.0, + "grad_norm": 1.5870570208244068, + "language_loss": 0.59154749, + "learning_rate": 3.4082111416476337e-06, + "loss": 0.61331522, + "num_input_tokens_seen": 98438670, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4554, + "time_per_iteration": 2.549534320831299 + }, + { + "auxiliary_loss_clip": 0.01138763, + "auxiliary_loss_mlp": 0.01041638, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.04893517, + "epoch": 0.2738614159026003, + "flos": 18661196004480.0, + "grad_norm": 1.7727518382715788, + "language_loss": 0.73820007, + "learning_rate": 3.4079345600373275e-06, + "loss": 0.76000404, + "num_input_tokens_seen": 98456060, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8984375, + "step": 4555, + "time_per_iteration": 2.5162432193756104 + }, + { + "auxiliary_loss_clip": 0.01136837, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02348125, + "balance_loss_mlp": 1.04923606, + "epoch": 0.2739215391552683, + "flos": 23477139901440.0, + "grad_norm": 1.956724452661134, + "language_loss": 0.7785511, + "learning_rate": 3.407657925038002e-06, + "loss": 0.80031419, + "num_input_tokens_seen": 98473765, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4556, + "time_per_iteration": 2.5205135345458984 + }, + { + "auxiliary_loss_clip": 0.01145391, + "auxiliary_loss_mlp": 0.0105386, + "balance_loss_clip": 1.03640783, + "balance_loss_mlp": 1.04952264, + "epoch": 0.27398166240793626, + "flos": 17128636369920.0, + "grad_norm": 1.7956202604517526, + "language_loss": 0.82272434, + "learning_rate": 3.4073812366601473e-06, + "loss": 0.84471685, + "num_input_tokens_seen": 98490590, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.9609375, + "step": 4557, + "time_per_iteration": 2.486485719680786 + }, + { + "auxiliary_loss_clip": 0.01133092, + "auxiliary_loss_mlp": 0.01042572, + "balance_loss_clip": 1.02691972, + "balance_loss_mlp": 1.04657316, + "epoch": 0.2740417856606042, + "flos": 23404744039680.0, + "grad_norm": 1.7971714372597054, + "language_loss": 0.72697943, + "learning_rate": 3.4071044949142547e-06, + "loss": 0.74873614, + "num_input_tokens_seen": 98510590, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4558, + "time_per_iteration": 2.5272727012634277 + }, + { + "auxiliary_loss_clip": 0.01131967, + "auxiliary_loss_mlp": 0.01048867, + "balance_loss_clip": 1.03243995, + "balance_loss_mlp": 1.04504418, + "epoch": 0.2741019089132722, + "flos": 12780428400000.0, + "grad_norm": 2.1318143008079686, + "language_loss": 0.6804775, + "learning_rate": 3.406827699810819e-06, + "loss": 0.70228577, + "num_input_tokens_seen": 98527875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4559, + "time_per_iteration": 2.4787509441375732 + }, + { + "auxiliary_loss_clip": 0.01131026, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02750015, + "balance_loss_mlp": 1.04517901, + "epoch": 0.27416203216594015, + "flos": 20631542601600.0, + "grad_norm": 3.5500966853689673, + "language_loss": 0.71847737, + "learning_rate": 3.4065508513603353e-06, + "loss": 0.74022651, + "num_input_tokens_seen": 98547575, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4560, + "time_per_iteration": 2.490152359008789 + }, + { + "auxiliary_loss_clip": 0.0113572, + "auxiliary_loss_mlp": 0.01041958, + "balance_loss_clip": 1.02642488, + "balance_loss_mlp": 1.04779601, + "epoch": 0.27422215541860817, + "flos": 26541576812160.0, + "grad_norm": 1.7948619898284635, + "language_loss": 0.80998009, + "learning_rate": 3.406273949573303e-06, + "loss": 0.83175689, + "num_input_tokens_seen": 98566290, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87890625, + "step": 4561, + "time_per_iteration": 2.554872512817383 + }, + { + "auxiliary_loss_clip": 0.01136406, + "auxiliary_loss_mlp": 0.01041546, + "balance_loss_clip": 1.02600157, + "balance_loss_mlp": 1.04711854, + "epoch": 0.27428227867127614, + "flos": 23331163029120.0, + "grad_norm": 1.7370289005889625, + "language_loss": 0.7531321, + "learning_rate": 3.4059969944602214e-06, + "loss": 0.77491164, + "num_input_tokens_seen": 98586255, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.890625, + "step": 4562, + "time_per_iteration": 2.4925429821014404 + }, + { + "auxiliary_loss_clip": 0.01133486, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.02173424, + "balance_loss_mlp": 1.04701662, + "epoch": 0.2743424019239441, + "flos": 23035115134080.0, + "grad_norm": 1.598166418515773, + "language_loss": 0.74503827, + "learning_rate": 3.4057199860315928e-06, + "loss": 0.76674795, + "num_input_tokens_seen": 98606030, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4563, + "time_per_iteration": 2.5514259338378906 + }, + { + "auxiliary_loss_clip": 0.01138282, + "auxiliary_loss_mlp": 0.01045739, + "balance_loss_clip": 1.02798915, + "balance_loss_mlp": 1.04708612, + "epoch": 0.27440252517661207, + "flos": 21981101420160.0, + "grad_norm": 1.8271759108968861, + "language_loss": 0.62526429, + "learning_rate": 3.4054429242979213e-06, + "loss": 0.64710456, + "num_input_tokens_seen": 98625225, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.9140625, + "step": 4564, + "time_per_iteration": 2.479156494140625 + }, + { + "auxiliary_loss_clip": 0.01136574, + "auxiliary_loss_mlp": 0.01042695, + "balance_loss_clip": 1.02513587, + "balance_loss_mlp": 1.04808652, + "epoch": 0.27446264842928003, + "flos": 40187451502080.0, + "grad_norm": 1.9245884320117708, + "language_loss": 0.78135669, + "learning_rate": 3.4051658092697135e-06, + "loss": 0.80314934, + "num_input_tokens_seen": 98649470, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8828125, + "step": 4565, + "time_per_iteration": 2.714069366455078 + }, + { + "auxiliary_loss_clip": 0.01133378, + "auxiliary_loss_mlp": 0.01039885, + "balance_loss_clip": 1.02443504, + "balance_loss_mlp": 1.04669619, + "epoch": 0.274522771681948, + "flos": 13479681438720.0, + "grad_norm": 2.3377831889988547, + "language_loss": 0.68350124, + "learning_rate": 3.404888640957477e-06, + "loss": 0.70523381, + "num_input_tokens_seen": 98666915, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 4566, + "time_per_iteration": 2.469357967376709 + }, + { + "auxiliary_loss_clip": 0.01133208, + "auxiliary_loss_mlp": 0.01046416, + "balance_loss_clip": 1.03211665, + "balance_loss_mlp": 1.04901338, + "epoch": 0.27458289493461596, + "flos": 28622133313920.0, + "grad_norm": 1.7938914020631171, + "language_loss": 0.60886472, + "learning_rate": 3.404611419371723e-06, + "loss": 0.63066101, + "num_input_tokens_seen": 98688240, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.84375, + "step": 4567, + "time_per_iteration": 2.5856754779815674 + }, + { + "auxiliary_loss_clip": 0.01134122, + "auxiliary_loss_mlp": 0.01042972, + "balance_loss_clip": 1.02597237, + "balance_loss_mlp": 1.04754972, + "epoch": 0.2746430181872839, + "flos": 20119815492480.0, + "grad_norm": 1.7650663548751138, + "language_loss": 0.82787997, + "learning_rate": 3.4043341445229627e-06, + "loss": 0.84965092, + "num_input_tokens_seen": 98708245, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.86328125, + "step": 4568, + "time_per_iteration": 2.476353168487549 + }, + { + "auxiliary_loss_clip": 0.0113839, + "auxiliary_loss_mlp": 0.01034679, + "balance_loss_clip": 1.01868141, + "balance_loss_mlp": 1.05012584, + "epoch": 0.2747031414399519, + "flos": 20193468330240.0, + "grad_norm": 2.0155686346894415, + "language_loss": 0.68656778, + "learning_rate": 3.4040568164217117e-06, + "loss": 0.7082985, + "num_input_tokens_seen": 98724575, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 4569, + "time_per_iteration": 2.5027451515197754 + }, + { + "auxiliary_loss_clip": 0.01133852, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.01947594, + "balance_loss_mlp": 1.0464673, + "epoch": 0.27476326469261986, + "flos": 13516346246400.0, + "grad_norm": 2.247407128453888, + "language_loss": 0.71138883, + "learning_rate": 3.4037794350784848e-06, + "loss": 0.73308867, + "num_input_tokens_seen": 98740700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4570, + "time_per_iteration": 2.466845750808716 + }, + { + "auxiliary_loss_clip": 0.0104735, + "auxiliary_loss_mlp": 0.01010434, + "balance_loss_clip": 1.00881279, + "balance_loss_mlp": 1.01781416, + "epoch": 0.2748233879452878, + "flos": 65937127121280.0, + "grad_norm": 0.7344992896847644, + "language_loss": 0.55774754, + "learning_rate": 3.4035020005038014e-06, + "loss": 0.57832539, + "num_input_tokens_seen": 98803030, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.296875, + "step": 4571, + "time_per_iteration": 3.192523241043091 + }, + { + "auxiliary_loss_clip": 0.01140339, + "auxiliary_loss_mlp": 0.01044242, + "balance_loss_clip": 1.02805328, + "balance_loss_mlp": 1.05039406, + "epoch": 0.2748835111979558, + "flos": 17384212615680.0, + "grad_norm": 3.6883594473706482, + "language_loss": 0.77785081, + "learning_rate": 3.4032245127081812e-06, + "loss": 0.79969662, + "num_input_tokens_seen": 98820505, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8984375, + "step": 4572, + "time_per_iteration": 2.4755914211273193 + }, + { + "auxiliary_loss_clip": 0.01129408, + "auxiliary_loss_mlp": 0.01036409, + "balance_loss_clip": 1.02200866, + "balance_loss_mlp": 1.04679561, + "epoch": 0.27494363445062375, + "flos": 23587565287680.0, + "grad_norm": 1.7042315716847805, + "language_loss": 0.81357443, + "learning_rate": 3.402946971702147e-06, + "loss": 0.83523262, + "num_input_tokens_seen": 98842150, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4573, + "time_per_iteration": 2.540905237197876 + }, + { + "auxiliary_loss_clip": 0.01129787, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02038062, + "balance_loss_mlp": 1.04580402, + "epoch": 0.2750037577032918, + "flos": 17164582905600.0, + "grad_norm": 1.7927939239771835, + "language_loss": 0.79077196, + "learning_rate": 3.402669377496223e-06, + "loss": 0.81243324, + "num_input_tokens_seen": 98861050, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83984375, + "step": 4574, + "time_per_iteration": 2.451016664505005 + }, + { + "auxiliary_loss_clip": 0.01136155, + "auxiliary_loss_mlp": 0.01044603, + "balance_loss_clip": 1.02889121, + "balance_loss_mlp": 1.04886127, + "epoch": 0.27506388095595974, + "flos": 24491903028480.0, + "grad_norm": 2.232643844604772, + "language_loss": 0.74191976, + "learning_rate": 3.402391730100936e-06, + "loss": 0.76372731, + "num_input_tokens_seen": 98879695, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.875, + "step": 4575, + "time_per_iteration": 2.5744149684906006 + }, + { + "auxiliary_loss_clip": 0.01131901, + "auxiliary_loss_mlp": 0.01038458, + "balance_loss_clip": 1.02353263, + "balance_loss_mlp": 1.04711711, + "epoch": 0.2751240042086277, + "flos": 38764706722560.0, + "grad_norm": 1.8105072672356382, + "language_loss": 0.71877766, + "learning_rate": 3.402114029526814e-06, + "loss": 0.7404812, + "num_input_tokens_seen": 98902035, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4576, + "time_per_iteration": 2.634305715560913 + }, + { + "auxiliary_loss_clip": 0.01134924, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02495503, + "balance_loss_mlp": 1.04823232, + "epoch": 0.27518412746129567, + "flos": 26907039740160.0, + "grad_norm": 1.7690392048384511, + "language_loss": 0.73200434, + "learning_rate": 3.4018362757843866e-06, + "loss": 0.75377214, + "num_input_tokens_seen": 98921835, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4577, + "time_per_iteration": 2.5365946292877197 + }, + { + "auxiliary_loss_clip": 0.01137469, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02182376, + "balance_loss_mlp": 1.04931974, + "epoch": 0.27524425071396363, + "flos": 24900531125760.0, + "grad_norm": 5.099060573221768, + "language_loss": 0.75943893, + "learning_rate": 3.401558468884188e-06, + "loss": 0.78119946, + "num_input_tokens_seen": 98939610, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4578, + "time_per_iteration": 2.5121536254882812 + }, + { + "auxiliary_loss_clip": 0.01135832, + "auxiliary_loss_mlp": 0.01046459, + "balance_loss_clip": 1.02704024, + "balance_loss_mlp": 1.0475626, + "epoch": 0.2753043739666316, + "flos": 26288047641600.0, + "grad_norm": 2.3614458833507603, + "language_loss": 0.66299897, + "learning_rate": 3.4012806088367516e-06, + "loss": 0.68482184, + "num_input_tokens_seen": 98962250, + "router_z_loss_clip": 0.19433594, + "router_z_loss_mlp": 0.8828125, + "step": 4579, + "time_per_iteration": 2.5445947647094727 + }, + { + "auxiliary_loss_clip": 0.01137742, + "auxiliary_loss_mlp": 0.01056341, + "balance_loss_clip": 1.03841197, + "balance_loss_mlp": 1.04862928, + "epoch": 0.27536449721929956, + "flos": 24206772867840.0, + "grad_norm": 1.9384727438162337, + "language_loss": 0.8013078, + "learning_rate": 3.4010026956526137e-06, + "loss": 0.82324862, + "num_input_tokens_seen": 98981845, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.890625, + "step": 4580, + "time_per_iteration": 2.4895741939544678 + }, + { + "auxiliary_loss_clip": 0.01138586, + "auxiliary_loss_mlp": 0.01044508, + "balance_loss_clip": 1.02581632, + "balance_loss_mlp": 1.05140579, + "epoch": 0.27542462047196753, + "flos": 19537272720000.0, + "grad_norm": 1.4702192551629332, + "language_loss": 0.67702103, + "learning_rate": 3.4007247293423137e-06, + "loss": 0.698852, + "num_input_tokens_seen": 99001855, + "router_z_loss_clip": 0.1875, + "router_z_loss_mlp": 0.87109375, + "step": 4581, + "time_per_iteration": 2.5905539989471436 + }, + { + "auxiliary_loss_clip": 0.01137135, + "auxiliary_loss_mlp": 0.01046005, + "balance_loss_clip": 1.03024602, + "balance_loss_mlp": 1.04847145, + "epoch": 0.2754847437246355, + "flos": 14319165173760.0, + "grad_norm": 1.8568978026073784, + "language_loss": 0.78120708, + "learning_rate": 3.400446709916392e-06, + "loss": 0.80303848, + "num_input_tokens_seen": 99019880, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.88671875, + "step": 4582, + "time_per_iteration": 2.467210531234741 + }, + { + "auxiliary_loss_clip": 0.01133579, + "auxiliary_loss_mlp": 0.01040863, + "balance_loss_clip": 1.02537727, + "balance_loss_mlp": 1.04905152, + "epoch": 0.27554486697730346, + "flos": 18838773866880.0, + "grad_norm": 2.5358708072067406, + "language_loss": 0.84527528, + "learning_rate": 3.4001686373853895e-06, + "loss": 0.86701977, + "num_input_tokens_seen": 99037570, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4583, + "time_per_iteration": 2.511457920074463 + }, + { + "auxiliary_loss_clip": 0.01138165, + "auxiliary_loss_mlp": 0.01041348, + "balance_loss_clip": 1.02529025, + "balance_loss_mlp": 1.04905808, + "epoch": 0.2756049902299714, + "flos": 22382295402240.0, + "grad_norm": 2.037294788318467, + "language_loss": 0.67308438, + "learning_rate": 3.3998905117598528e-06, + "loss": 0.69487947, + "num_input_tokens_seen": 99056875, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 4584, + "time_per_iteration": 2.5193254947662354 + }, + { + "auxiliary_loss_clip": 0.01132805, + "auxiliary_loss_mlp": 0.01041645, + "balance_loss_clip": 1.02645802, + "balance_loss_mlp": 1.04761386, + "epoch": 0.2756651134826394, + "flos": 19573901614080.0, + "grad_norm": 1.737999785464117, + "language_loss": 0.77330101, + "learning_rate": 3.399612333050327e-06, + "loss": 0.7950455, + "num_input_tokens_seen": 99074685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4585, + "time_per_iteration": 2.5393707752227783 + }, + { + "auxiliary_loss_clip": 0.0114213, + "auxiliary_loss_mlp": 0.01039297, + "balance_loss_clip": 1.02227354, + "balance_loss_mlp": 1.0530591, + "epoch": 0.27572523673530736, + "flos": 23586559706880.0, + "grad_norm": 1.654604836009794, + "language_loss": 0.71854031, + "learning_rate": 3.399334101267362e-06, + "loss": 0.74035466, + "num_input_tokens_seen": 99095300, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4586, + "time_per_iteration": 2.534979820251465 + }, + { + "auxiliary_loss_clip": 0.01136306, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.01996541, + "balance_loss_mlp": 1.04988265, + "epoch": 0.2757853599879754, + "flos": 22820118278400.0, + "grad_norm": 1.5248017982775213, + "language_loss": 0.80546939, + "learning_rate": 3.3990558164215073e-06, + "loss": 0.82719147, + "num_input_tokens_seen": 99115965, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4587, + "time_per_iteration": 2.5424065589904785 + }, + { + "auxiliary_loss_clip": 0.01133784, + "auxiliary_loss_mlp": 0.0103925, + "balance_loss_clip": 1.02356219, + "balance_loss_mlp": 1.04939508, + "epoch": 0.27584548324064334, + "flos": 18551704371840.0, + "grad_norm": 2.136921841599078, + "language_loss": 0.82694119, + "learning_rate": 3.398777478523316e-06, + "loss": 0.8486715, + "num_input_tokens_seen": 99134265, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4588, + "time_per_iteration": 2.467923879623413 + }, + { + "auxiliary_loss_clip": 0.01132148, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.0228622, + "balance_loss_mlp": 1.04754925, + "epoch": 0.2759056064933113, + "flos": 23769883745280.0, + "grad_norm": 1.3980423175693042, + "language_loss": 0.75352502, + "learning_rate": 3.398499087583342e-06, + "loss": 0.775231, + "num_input_tokens_seen": 99156185, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4589, + "time_per_iteration": 2.535837173461914 + }, + { + "auxiliary_loss_clip": 0.01132096, + "auxiliary_loss_mlp": 0.01042232, + "balance_loss_clip": 1.02526879, + "balance_loss_mlp": 1.04686022, + "epoch": 0.27596572974597927, + "flos": 24281898163200.0, + "grad_norm": 1.7720046877472317, + "language_loss": 0.88438141, + "learning_rate": 3.398220643612143e-06, + "loss": 0.90612471, + "num_input_tokens_seen": 99176735, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8515625, + "step": 4590, + "time_per_iteration": 2.5248916149139404 + }, + { + "auxiliary_loss_clip": 0.01135164, + "auxiliary_loss_mlp": 0.01045359, + "balance_loss_clip": 1.02946877, + "balance_loss_mlp": 1.04789972, + "epoch": 0.27602585299864724, + "flos": 35040985632000.0, + "grad_norm": 1.6299691755620427, + "language_loss": 0.7129395, + "learning_rate": 3.397942146620277e-06, + "loss": 0.73474467, + "num_input_tokens_seen": 99199765, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4591, + "time_per_iteration": 2.6112425327301025 + }, + { + "auxiliary_loss_clip": 0.01135759, + "auxiliary_loss_mlp": 0.01049557, + "balance_loss_clip": 1.03268862, + "balance_loss_mlp": 1.04847574, + "epoch": 0.2760859762513152, + "flos": 24309405002880.0, + "grad_norm": 1.8477043284936983, + "language_loss": 0.80190659, + "learning_rate": 3.3976635966183046e-06, + "loss": 0.82375979, + "num_input_tokens_seen": 99218435, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4592, + "time_per_iteration": 2.483894109725952 + }, + { + "auxiliary_loss_clip": 0.01048363, + "auxiliary_loss_mlp": 0.01005872, + "balance_loss_clip": 1.00416684, + "balance_loss_mlp": 1.0189774, + "epoch": 0.27614609950398317, + "flos": 71260739890560.0, + "grad_norm": 0.7716758671018623, + "language_loss": 0.61627746, + "learning_rate": 3.3973849936167886e-06, + "loss": 0.63681984, + "num_input_tokens_seen": 99276200, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.29296875, + "step": 4593, + "time_per_iteration": 3.0616326332092285 + }, + { + "auxiliary_loss_clip": 0.01135076, + "auxiliary_loss_mlp": 0.01045597, + "balance_loss_clip": 1.02965856, + "balance_loss_mlp": 1.04938328, + "epoch": 0.27620622275665113, + "flos": 29674854138240.0, + "grad_norm": 1.8877557773606983, + "language_loss": 0.77589142, + "learning_rate": 3.3971063376262937e-06, + "loss": 0.79769808, + "num_input_tokens_seen": 99297625, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4594, + "time_per_iteration": 4.043708086013794 + }, + { + "auxiliary_loss_clip": 0.01134807, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.01769793, + "balance_loss_mlp": 1.04991734, + "epoch": 0.2762663460093191, + "flos": 15378063137280.0, + "grad_norm": 1.7681451067423914, + "language_loss": 0.91645586, + "learning_rate": 3.3968276286573866e-06, + "loss": 0.93813777, + "num_input_tokens_seen": 99315790, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84765625, + "step": 4595, + "time_per_iteration": 3.973101854324341 + }, + { + "auxiliary_loss_clip": 0.01138485, + "auxiliary_loss_mlp": 0.01047274, + "balance_loss_clip": 1.03034675, + "balance_loss_mlp": 1.05122674, + "epoch": 0.27632646926198706, + "flos": 20704082117760.0, + "grad_norm": 1.7288059110569738, + "language_loss": 0.69101036, + "learning_rate": 3.3965488667206353e-06, + "loss": 0.71286798, + "num_input_tokens_seen": 99334615, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4596, + "time_per_iteration": 2.509199380874634 + }, + { + "auxiliary_loss_clip": 0.0114029, + "auxiliary_loss_mlp": 0.01041278, + "balance_loss_clip": 1.0249939, + "balance_loss_mlp": 1.04883707, + "epoch": 0.276386592514655, + "flos": 32813374849920.0, + "grad_norm": 2.01522187594791, + "language_loss": 0.63536406, + "learning_rate": 3.3962700518266113e-06, + "loss": 0.65717971, + "num_input_tokens_seen": 99356685, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.9140625, + "step": 4597, + "time_per_iteration": 2.5944221019744873 + }, + { + "auxiliary_loss_clip": 0.01133967, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02629232, + "balance_loss_mlp": 1.05002272, + "epoch": 0.276446715767323, + "flos": 18551704371840.0, + "grad_norm": 2.1842552390134586, + "language_loss": 0.86612505, + "learning_rate": 3.395991183985887e-06, + "loss": 0.88788456, + "num_input_tokens_seen": 99374810, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 4598, + "time_per_iteration": 2.4870996475219727 + }, + { + "auxiliary_loss_clip": 0.01135257, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02586544, + "balance_loss_mlp": 1.04847229, + "epoch": 0.27650683901999096, + "flos": 22819615488000.0, + "grad_norm": 2.0694668215518996, + "language_loss": 0.79822165, + "learning_rate": 3.395712263209037e-06, + "loss": 0.82000202, + "num_input_tokens_seen": 99391290, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4599, + "time_per_iteration": 2.4923834800720215 + }, + { + "auxiliary_loss_clip": 0.01140028, + "auxiliary_loss_mlp": 0.0104597, + "balance_loss_clip": 1.02965581, + "balance_loss_mlp": 1.04958415, + "epoch": 0.276566962272659, + "flos": 21361534704000.0, + "grad_norm": 1.9049018096400723, + "language_loss": 0.78357869, + "learning_rate": 3.395433289506639e-06, + "loss": 0.80543864, + "num_input_tokens_seen": 99409120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.90625, + "step": 4600, + "time_per_iteration": 2.496173620223999 + }, + { + "auxiliary_loss_clip": 0.01139042, + "auxiliary_loss_mlp": 0.01046211, + "balance_loss_clip": 1.03007007, + "balance_loss_mlp": 1.04887986, + "epoch": 0.27662708552532694, + "flos": 17710604524800.0, + "grad_norm": 1.9474431855639402, + "language_loss": 0.73361742, + "learning_rate": 3.3951542628892694e-06, + "loss": 0.75546992, + "num_input_tokens_seen": 99426180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.90234375, + "step": 4601, + "time_per_iteration": 2.475919246673584 + }, + { + "auxiliary_loss_clip": 0.01135661, + "auxiliary_loss_mlp": 0.01045476, + "balance_loss_clip": 1.02883482, + "balance_loss_mlp": 1.04879355, + "epoch": 0.2766872087779949, + "flos": 21252725429760.0, + "grad_norm": 1.9134344988482315, + "language_loss": 0.79341739, + "learning_rate": 3.3948751833675113e-06, + "loss": 0.81522876, + "num_input_tokens_seen": 99447720, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 4602, + "time_per_iteration": 2.511716842651367 + }, + { + "auxiliary_loss_clip": 0.01140401, + "auxiliary_loss_mlp": 0.01051234, + "balance_loss_clip": 1.03349614, + "balance_loss_mlp": 1.04920423, + "epoch": 0.2767473320306629, + "flos": 12931900053120.0, + "grad_norm": 2.260382216699142, + "language_loss": 0.76887643, + "learning_rate": 3.3945960509519455e-06, + "loss": 0.79079276, + "num_input_tokens_seen": 99464720, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4603, + "time_per_iteration": 2.4667811393737793 + }, + { + "auxiliary_loss_clip": 0.0112975, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.0252831, + "balance_loss_mlp": 1.04736543, + "epoch": 0.27680745528333084, + "flos": 15012851604480.0, + "grad_norm": 1.7288101924316703, + "language_loss": 0.81411278, + "learning_rate": 3.3943168656531585e-06, + "loss": 0.83581114, + "num_input_tokens_seen": 99482310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 4604, + "time_per_iteration": 2.4586222171783447 + }, + { + "auxiliary_loss_clip": 0.01135813, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01516712, + "balance_loss_mlp": 1.04756212, + "epoch": 0.2768675785359988, + "flos": 22637835734400.0, + "grad_norm": 1.7513688477785454, + "language_loss": 0.69912565, + "learning_rate": 3.3940376274817363e-06, + "loss": 0.72079831, + "num_input_tokens_seen": 99501255, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4605, + "time_per_iteration": 2.5138533115386963 + }, + { + "auxiliary_loss_clip": 0.01045677, + "auxiliary_loss_mlp": 0.01001918, + "balance_loss_clip": 1.00033224, + "balance_loss_mlp": 1.01580858, + "epoch": 0.27692770178866677, + "flos": 66130542881280.0, + "grad_norm": 0.7252635192802935, + "language_loss": 0.57151282, + "learning_rate": 3.3937583364482673e-06, + "loss": 0.59198874, + "num_input_tokens_seen": 99568925, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.296875, + "step": 4606, + "time_per_iteration": 3.184955596923828 + }, + { + "auxiliary_loss_clip": 0.01136733, + "auxiliary_loss_mlp": 0.01049361, + "balance_loss_clip": 1.03234947, + "balance_loss_mlp": 1.0481658, + "epoch": 0.27698782504133473, + "flos": 26464979059200.0, + "grad_norm": 2.0717297663627825, + "language_loss": 0.69666946, + "learning_rate": 3.3934789925633424e-06, + "loss": 0.71853042, + "num_input_tokens_seen": 99588455, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4607, + "time_per_iteration": 2.5373001098632812 + }, + { + "auxiliary_loss_clip": 0.011299, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.04721832, + "epoch": 0.2770479482940027, + "flos": 25884806584320.0, + "grad_norm": 3.332085537790215, + "language_loss": 0.6982615, + "learning_rate": 3.393199595837555e-06, + "loss": 0.71991682, + "num_input_tokens_seen": 99609355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4608, + "time_per_iteration": 2.5396809577941895 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01037862, + "balance_loss_clip": 1.02185202, + "balance_loss_mlp": 1.04715931, + "epoch": 0.27710807154667066, + "flos": 22857249962880.0, + "grad_norm": 1.8242818121189563, + "language_loss": 0.72541273, + "learning_rate": 3.392920146281499e-06, + "loss": 0.74715054, + "num_input_tokens_seen": 99628780, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.88671875, + "step": 4609, + "time_per_iteration": 2.5383543968200684 + }, + { + "auxiliary_loss_clip": 0.01134274, + "auxiliary_loss_mlp": 0.01048832, + "balance_loss_clip": 1.03226149, + "balance_loss_mlp": 1.04623055, + "epoch": 0.27716819479933863, + "flos": 17711071401600.0, + "grad_norm": 2.2576811985082967, + "language_loss": 0.84010947, + "learning_rate": 3.3926406439057714e-06, + "loss": 0.86194062, + "num_input_tokens_seen": 99644545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4610, + "time_per_iteration": 2.4456827640533447 + }, + { + "auxiliary_loss_clip": 0.01141086, + "auxiliary_loss_mlp": 0.01051097, + "balance_loss_clip": 1.03344178, + "balance_loss_mlp": 1.04996872, + "epoch": 0.2772283180520066, + "flos": 19646046080640.0, + "grad_norm": 2.570198611472629, + "language_loss": 0.68948054, + "learning_rate": 3.3923610887209705e-06, + "loss": 0.71140236, + "num_input_tokens_seen": 99663125, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.9140625, + "step": 4611, + "time_per_iteration": 2.5342319011688232 + }, + { + "auxiliary_loss_clip": 0.01130823, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.0212357, + "balance_loss_mlp": 1.04892015, + "epoch": 0.27728844130467456, + "flos": 21032628842880.0, + "grad_norm": 2.354058548299899, + "language_loss": 0.73450744, + "learning_rate": 3.392081480737698e-06, + "loss": 0.75618565, + "num_input_tokens_seen": 99682645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 4612, + "time_per_iteration": 2.472200632095337 + }, + { + "auxiliary_loss_clip": 0.01137408, + "auxiliary_loss_mlp": 0.0105089, + "balance_loss_clip": 1.03378379, + "balance_loss_mlp": 1.04807258, + "epoch": 0.2773485645573425, + "flos": 18989204025600.0, + "grad_norm": 2.166254073057622, + "language_loss": 0.66736221, + "learning_rate": 3.3918018199665563e-06, + "loss": 0.68924516, + "num_input_tokens_seen": 99700520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4613, + "time_per_iteration": 2.5313632488250732 + }, + { + "auxiliary_loss_clip": 0.01131014, + "auxiliary_loss_mlp": 0.01043283, + "balance_loss_clip": 1.02721334, + "balance_loss_mlp": 1.04604864, + "epoch": 0.27740868781001055, + "flos": 21468440557440.0, + "grad_norm": 1.8826548789840187, + "language_loss": 0.79452634, + "learning_rate": 3.39152210641815e-06, + "loss": 0.81626928, + "num_input_tokens_seen": 99720355, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4614, + "time_per_iteration": 2.4869751930236816 + }, + { + "auxiliary_loss_clip": 0.01135901, + "auxiliary_loss_mlp": 0.01043201, + "balance_loss_clip": 1.02684534, + "balance_loss_mlp": 1.0477469, + "epoch": 0.2774688110626785, + "flos": 19827825834240.0, + "grad_norm": 2.573597172535304, + "language_loss": 0.80251336, + "learning_rate": 3.3912423401030865e-06, + "loss": 0.8243044, + "num_input_tokens_seen": 99736090, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4615, + "time_per_iteration": 2.521615505218506 + }, + { + "auxiliary_loss_clip": 0.01135416, + "auxiliary_loss_mlp": 0.01043384, + "balance_loss_clip": 1.02676582, + "balance_loss_mlp": 1.04627132, + "epoch": 0.2775289343153465, + "flos": 18216226321920.0, + "grad_norm": 2.403593727320557, + "language_loss": 0.63926548, + "learning_rate": 3.3909625210319735e-06, + "loss": 0.66105354, + "num_input_tokens_seen": 99751805, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4616, + "time_per_iteration": 2.439410448074341 + }, + { + "auxiliary_loss_clip": 0.01133721, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02593398, + "balance_loss_mlp": 1.04661143, + "epoch": 0.27758905756801444, + "flos": 16472476673280.0, + "grad_norm": 1.8467628074440183, + "language_loss": 0.82283223, + "learning_rate": 3.3906826492154226e-06, + "loss": 0.84458935, + "num_input_tokens_seen": 99770610, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4617, + "time_per_iteration": 2.49495792388916 + }, + { + "auxiliary_loss_clip": 0.01133289, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02587175, + "balance_loss_mlp": 1.04613662, + "epoch": 0.2776491808206824, + "flos": 18728240739840.0, + "grad_norm": 2.1015666973838942, + "language_loss": 0.76835418, + "learning_rate": 3.3904027246640458e-06, + "loss": 0.79010552, + "num_input_tokens_seen": 99787305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 4618, + "time_per_iteration": 2.4882123470306396 + }, + { + "auxiliary_loss_clip": 0.01136682, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02501273, + "balance_loss_mlp": 1.0495801, + "epoch": 0.27770930407335037, + "flos": 28038189911040.0, + "grad_norm": 1.6700061931983001, + "language_loss": 0.84698343, + "learning_rate": 3.390122747388459e-06, + "loss": 0.868756, + "num_input_tokens_seen": 99808940, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.87109375, + "step": 4619, + "time_per_iteration": 2.5996124744415283 + }, + { + "auxiliary_loss_clip": 0.01128767, + "auxiliary_loss_mlp": 0.0103795, + "balance_loss_clip": 1.02340662, + "balance_loss_mlp": 1.04523671, + "epoch": 0.27776942732601834, + "flos": 23549823072000.0, + "grad_norm": 1.4068177028172657, + "language_loss": 0.76720011, + "learning_rate": 3.3898427173992778e-06, + "loss": 0.78886724, + "num_input_tokens_seen": 99829575, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 4620, + "time_per_iteration": 2.4851698875427246 + }, + { + "auxiliary_loss_clip": 0.01130943, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02126586, + "balance_loss_mlp": 1.04728413, + "epoch": 0.2778295505786863, + "flos": 23908713811200.0, + "grad_norm": 2.4956264272783084, + "language_loss": 0.78746819, + "learning_rate": 3.389562634707122e-06, + "loss": 0.80914462, + "num_input_tokens_seen": 99847575, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4621, + "time_per_iteration": 2.543513774871826 + }, + { + "auxiliary_loss_clip": 0.01135835, + "auxiliary_loss_mlp": 0.01046354, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.04871762, + "epoch": 0.27788967383135427, + "flos": 25554571920000.0, + "grad_norm": 1.9988562622182164, + "language_loss": 0.87520665, + "learning_rate": 3.389282499322611e-06, + "loss": 0.89702857, + "num_input_tokens_seen": 99864995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 4622, + "time_per_iteration": 2.4818174839019775 + }, + { + "auxiliary_loss_clip": 0.01133366, + "auxiliary_loss_mlp": 0.01046006, + "balance_loss_clip": 1.02960837, + "balance_loss_mlp": 1.04635906, + "epoch": 0.27794979708402223, + "flos": 16252631481600.0, + "grad_norm": 1.9062066208333321, + "language_loss": 0.81094646, + "learning_rate": 3.389002311256369e-06, + "loss": 0.83274019, + "num_input_tokens_seen": 99881540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4623, + "time_per_iteration": 2.509218692779541 + }, + { + "auxiliary_loss_clip": 0.01136736, + "auxiliary_loss_mlp": 0.01039194, + "balance_loss_clip": 1.02357817, + "balance_loss_mlp": 1.04981863, + "epoch": 0.2780099203366902, + "flos": 20667632791680.0, + "grad_norm": 1.93503772017796, + "language_loss": 0.81099498, + "learning_rate": 3.3887220705190204e-06, + "loss": 0.83275431, + "num_input_tokens_seen": 99899595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8671875, + "step": 4624, + "time_per_iteration": 2.470041513442993 + }, + { + "auxiliary_loss_clip": 0.01135607, + "auxiliary_loss_mlp": 0.01041344, + "balance_loss_clip": 1.02563787, + "balance_loss_mlp": 1.05091214, + "epoch": 0.27807004358935816, + "flos": 17739583822080.0, + "grad_norm": 3.184384520938543, + "language_loss": 0.76514304, + "learning_rate": 3.388441777121191e-06, + "loss": 0.7869125, + "num_input_tokens_seen": 99913020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84765625, + "step": 4625, + "time_per_iteration": 2.4965567588806152 + }, + { + "auxiliary_loss_clip": 0.01133566, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02439606, + "balance_loss_mlp": 1.04835677, + "epoch": 0.2781301668420261, + "flos": 16727119165440.0, + "grad_norm": 2.5511238477154095, + "language_loss": 0.70091927, + "learning_rate": 3.388161431073511e-06, + "loss": 0.7226674, + "num_input_tokens_seen": 99931405, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 4626, + "time_per_iteration": 2.462007522583008 + }, + { + "auxiliary_loss_clip": 0.01142353, + "auxiliary_loss_mlp": 0.01036943, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.05177855, + "epoch": 0.27819029009469415, + "flos": 13844749317120.0, + "grad_norm": 2.1576082410571704, + "language_loss": 0.92738312, + "learning_rate": 3.38788103238661e-06, + "loss": 0.94917607, + "num_input_tokens_seen": 99948100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4627, + "time_per_iteration": 2.5731146335601807 + }, + { + "auxiliary_loss_clip": 0.01137702, + "auxiliary_loss_mlp": 0.01041394, + "balance_loss_clip": 1.02640903, + "balance_loss_mlp": 1.04856014, + "epoch": 0.2782504133473621, + "flos": 27089286370560.0, + "grad_norm": 4.44086075484182, + "language_loss": 0.85802954, + "learning_rate": 3.387600581071121e-06, + "loss": 0.87982047, + "num_input_tokens_seen": 99966470, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.890625, + "step": 4628, + "time_per_iteration": 2.502816915512085 + }, + { + "auxiliary_loss_clip": 0.01136721, + "auxiliary_loss_mlp": 0.01039197, + "balance_loss_clip": 1.02358079, + "balance_loss_mlp": 1.05035257, + "epoch": 0.2783105366000301, + "flos": 21068826773760.0, + "grad_norm": 1.4685731198996637, + "language_loss": 0.79003006, + "learning_rate": 3.387320077137679e-06, + "loss": 0.81178927, + "num_input_tokens_seen": 99985930, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4629, + "time_per_iteration": 2.544255256652832 + }, + { + "auxiliary_loss_clip": 0.01132865, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02419138, + "balance_loss_mlp": 1.05083036, + "epoch": 0.27837065985269804, + "flos": 26501823434880.0, + "grad_norm": 1.4531737557023054, + "language_loss": 0.84322643, + "learning_rate": 3.3870395205969208e-06, + "loss": 0.86494124, + "num_input_tokens_seen": 100006235, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4630, + "time_per_iteration": 2.514413833618164 + }, + { + "auxiliary_loss_clip": 0.01136217, + "auxiliary_loss_mlp": 0.01040004, + "balance_loss_clip": 1.02343392, + "balance_loss_mlp": 1.04834175, + "epoch": 0.278430783105366, + "flos": 20223201813120.0, + "grad_norm": 2.1800575167200997, + "language_loss": 0.80845618, + "learning_rate": 3.386758911459485e-06, + "loss": 0.83021843, + "num_input_tokens_seen": 100023655, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4631, + "time_per_iteration": 2.530393123626709 + }, + { + "auxiliary_loss_clip": 0.01141592, + "auxiliary_loss_mlp": 0.01050096, + "balance_loss_clip": 1.03403842, + "balance_loss_mlp": 1.05319762, + "epoch": 0.278490906358034, + "flos": 25592888753280.0, + "grad_norm": 2.154319840219951, + "language_loss": 0.71817827, + "learning_rate": 3.3864782497360126e-06, + "loss": 0.74009514, + "num_input_tokens_seen": 100043280, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4632, + "time_per_iteration": 2.504826307296753 + }, + { + "auxiliary_loss_clip": 0.01135617, + "auxiliary_loss_mlp": 0.01040662, + "balance_loss_clip": 1.02571952, + "balance_loss_mlp": 1.05240536, + "epoch": 0.27855102961070194, + "flos": 16171544528640.0, + "grad_norm": 1.8401586776799086, + "language_loss": 0.82518554, + "learning_rate": 3.386197535437145e-06, + "loss": 0.84694839, + "num_input_tokens_seen": 100057690, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4633, + "time_per_iteration": 2.484894037246704 + }, + { + "auxiliary_loss_clip": 0.0113869, + "auxiliary_loss_mlp": 0.01044294, + "balance_loss_clip": 1.02622163, + "balance_loss_mlp": 1.05006409, + "epoch": 0.2786111528633699, + "flos": 22927598749440.0, + "grad_norm": 1.740894494158558, + "language_loss": 0.87933433, + "learning_rate": 3.385916768573529e-06, + "loss": 0.90116417, + "num_input_tokens_seen": 100075875, + "router_z_loss_clip": 0.18066406, + "router_z_loss_mlp": 0.88671875, + "step": 4634, + "time_per_iteration": 2.465115785598755 + }, + { + "auxiliary_loss_clip": 0.01139508, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02182591, + "balance_loss_mlp": 1.05175185, + "epoch": 0.27867127611603787, + "flos": 23404205335680.0, + "grad_norm": 1.5848956099548452, + "language_loss": 0.77060932, + "learning_rate": 3.38563594915581e-06, + "loss": 0.79239166, + "num_input_tokens_seen": 100092930, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4635, + "time_per_iteration": 2.5032925605773926 + }, + { + "auxiliary_loss_clip": 0.01137724, + "auxiliary_loss_mlp": 0.01045364, + "balance_loss_clip": 1.02843595, + "balance_loss_mlp": 1.04919934, + "epoch": 0.27873139936870583, + "flos": 19829010983040.0, + "grad_norm": 1.7277393232375848, + "language_loss": 0.65047133, + "learning_rate": 3.385355077194637e-06, + "loss": 0.67230225, + "num_input_tokens_seen": 100110790, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 4636, + "time_per_iteration": 4.078390121459961 + }, + { + "auxiliary_loss_clip": 0.01137292, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02249098, + "balance_loss_mlp": 1.04898095, + "epoch": 0.2787915226213738, + "flos": 17707659609600.0, + "grad_norm": 2.3949865449269034, + "language_loss": 0.84131932, + "learning_rate": 3.3850741527006604e-06, + "loss": 0.86309206, + "num_input_tokens_seen": 100126970, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8828125, + "step": 4637, + "time_per_iteration": 3.9023706912994385 + }, + { + "auxiliary_loss_clip": 0.01131584, + "auxiliary_loss_mlp": 0.0104016, + "balance_loss_clip": 1.02468669, + "balance_loss_mlp": 1.04683113, + "epoch": 0.27885164587404176, + "flos": 22090557139200.0, + "grad_norm": 1.9572077756422592, + "language_loss": 0.75880706, + "learning_rate": 3.384793175684533e-06, + "loss": 0.78052455, + "num_input_tokens_seen": 100146720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4638, + "time_per_iteration": 2.5291664600372314 + }, + { + "auxiliary_loss_clip": 0.01137756, + "auxiliary_loss_mlp": 0.01044501, + "balance_loss_clip": 1.0281812, + "balance_loss_mlp": 1.04918075, + "epoch": 0.27891176912670973, + "flos": 19207684500480.0, + "grad_norm": 1.663593201704466, + "language_loss": 0.71469444, + "learning_rate": 3.38451214615691e-06, + "loss": 0.73651695, + "num_input_tokens_seen": 100165920, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8828125, + "step": 4639, + "time_per_iteration": 2.4396321773529053 + }, + { + "auxiliary_loss_clip": 0.01135046, + "auxiliary_loss_mlp": 0.01035153, + "balance_loss_clip": 1.01814222, + "balance_loss_mlp": 1.0477488, + "epoch": 0.27897189237937775, + "flos": 27600007898880.0, + "grad_norm": 2.020838508390905, + "language_loss": 0.65634811, + "learning_rate": 3.384231064128447e-06, + "loss": 0.67805016, + "num_input_tokens_seen": 100185525, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4640, + "time_per_iteration": 2.524146556854248 + }, + { + "auxiliary_loss_clip": 0.01135389, + "auxiliary_loss_mlp": 0.01038864, + "balance_loss_clip": 1.02278829, + "balance_loss_mlp": 1.04838169, + "epoch": 0.2790320156320457, + "flos": 21178210665600.0, + "grad_norm": 1.8663182251903623, + "language_loss": 0.71682954, + "learning_rate": 3.383949929609804e-06, + "loss": 0.738572, + "num_input_tokens_seen": 100204850, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4641, + "time_per_iteration": 2.45416522026062 + }, + { + "auxiliary_loss_clip": 0.01137426, + "auxiliary_loss_mlp": 0.01043433, + "balance_loss_clip": 1.02620697, + "balance_loss_mlp": 1.04805887, + "epoch": 0.2790921388847137, + "flos": 22783920347520.0, + "grad_norm": 1.721157258136314, + "language_loss": 0.74843872, + "learning_rate": 3.383668742611641e-06, + "loss": 0.77024734, + "num_input_tokens_seen": 100224520, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4642, + "time_per_iteration": 2.498901128768921 + }, + { + "auxiliary_loss_clip": 0.01136083, + "auxiliary_loss_mlp": 0.01041154, + "balance_loss_clip": 1.0241071, + "balance_loss_mlp": 1.04755557, + "epoch": 0.27915226213738165, + "flos": 23400649889280.0, + "grad_norm": 1.7771181879405247, + "language_loss": 0.85500491, + "learning_rate": 3.3833875031446205e-06, + "loss": 0.87677723, + "num_input_tokens_seen": 100243935, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4643, + "time_per_iteration": 2.4678151607513428 + }, + { + "auxiliary_loss_clip": 0.01135774, + "auxiliary_loss_mlp": 0.01044591, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04914284, + "epoch": 0.2792123853900496, + "flos": 22747794243840.0, + "grad_norm": 1.8372365182177028, + "language_loss": 0.8320173, + "learning_rate": 3.383106211219407e-06, + "loss": 0.85382092, + "num_input_tokens_seen": 100262290, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4644, + "time_per_iteration": 2.4989511966705322 + }, + { + "auxiliary_loss_clip": 0.01137034, + "auxiliary_loss_mlp": 0.01039698, + "balance_loss_clip": 1.02340162, + "balance_loss_mlp": 1.04927874, + "epoch": 0.2792725086427176, + "flos": 15049372757760.0, + "grad_norm": 2.1578284197730246, + "language_loss": 0.7905547, + "learning_rate": 3.3828248668466673e-06, + "loss": 0.81232202, + "num_input_tokens_seen": 100280015, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4645, + "time_per_iteration": 2.444539785385132 + }, + { + "auxiliary_loss_clip": 0.01045698, + "auxiliary_loss_mlp": 0.01013694, + "balance_loss_clip": 1.01202476, + "balance_loss_mlp": 1.01603949, + "epoch": 0.27933263189538554, + "flos": 62544861757440.0, + "grad_norm": 0.7789852310638867, + "language_loss": 0.62276232, + "learning_rate": 3.3825434700370705e-06, + "loss": 0.64335632, + "num_input_tokens_seen": 100338935, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4646, + "time_per_iteration": 3.0487425327301025 + }, + { + "auxiliary_loss_clip": 0.01130687, + "auxiliary_loss_mlp": 0.01035262, + "balance_loss_clip": 1.02039671, + "balance_loss_mlp": 1.04760003, + "epoch": 0.2793927551480535, + "flos": 25118365155840.0, + "grad_norm": 1.6043045349905556, + "language_loss": 0.89379698, + "learning_rate": 3.3822620208012865e-06, + "loss": 0.91545647, + "num_input_tokens_seen": 100359905, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83203125, + "step": 4647, + "time_per_iteration": 2.537818193435669 + }, + { + "auxiliary_loss_clip": 0.01137315, + "auxiliary_loss_mlp": 0.0104209, + "balance_loss_clip": 1.02559125, + "balance_loss_mlp": 1.04848313, + "epoch": 0.27945287840072147, + "flos": 21324582587520.0, + "grad_norm": 1.6404696751402497, + "language_loss": 0.87119055, + "learning_rate": 3.381980519149988e-06, + "loss": 0.89298457, + "num_input_tokens_seen": 100376955, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.890625, + "step": 4648, + "time_per_iteration": 2.490755081176758 + }, + { + "auxiliary_loss_clip": 0.01138515, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.01993406, + "balance_loss_mlp": 1.04894495, + "epoch": 0.27951300165338944, + "flos": 27450547407360.0, + "grad_norm": 4.859667262510518, + "language_loss": 0.72424746, + "learning_rate": 3.38169896509385e-06, + "loss": 0.74599725, + "num_input_tokens_seen": 100397545, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4649, + "time_per_iteration": 2.551149368286133 + }, + { + "auxiliary_loss_clip": 0.01134145, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02275741, + "balance_loss_mlp": 1.04667568, + "epoch": 0.2795731249060574, + "flos": 15159008044800.0, + "grad_norm": 2.198213539311656, + "language_loss": 0.80241156, + "learning_rate": 3.381417358643549e-06, + "loss": 0.8241663, + "num_input_tokens_seen": 100415080, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 4650, + "time_per_iteration": 2.495481252670288 + }, + { + "auxiliary_loss_clip": 0.01043234, + "auxiliary_loss_mlp": 0.01001825, + "balance_loss_clip": 1.00015628, + "balance_loss_mlp": 1.01336908, + "epoch": 0.27963324815872537, + "flos": 60120103178880.0, + "grad_norm": 1.2001935939690993, + "language_loss": 0.58821332, + "learning_rate": 3.3811356998097624e-06, + "loss": 0.60866392, + "num_input_tokens_seen": 100471105, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.296875, + "step": 4651, + "time_per_iteration": 3.089278221130371 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01041939, + "balance_loss_clip": 1.0239383, + "balance_loss_mlp": 1.04576242, + "epoch": 0.27969337141139333, + "flos": 21765960910080.0, + "grad_norm": 1.6305345142383205, + "language_loss": 0.74335963, + "learning_rate": 3.3808539886031726e-06, + "loss": 0.76514173, + "num_input_tokens_seen": 100492520, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4652, + "time_per_iteration": 2.5034215450286865 + }, + { + "auxiliary_loss_clip": 0.01140774, + "auxiliary_loss_mlp": 0.01044834, + "balance_loss_clip": 1.02826357, + "balance_loss_mlp": 1.05137777, + "epoch": 0.27975349466406135, + "flos": 39851398834560.0, + "grad_norm": 2.1744902530470527, + "language_loss": 0.79703641, + "learning_rate": 3.380572225034461e-06, + "loss": 0.81889254, + "num_input_tokens_seen": 100512870, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.89453125, + "step": 4653, + "time_per_iteration": 2.654989242553711 + }, + { + "auxiliary_loss_clip": 0.0113484, + "auxiliary_loss_mlp": 0.01045549, + "balance_loss_clip": 1.02851391, + "balance_loss_mlp": 1.04782343, + "epoch": 0.2798136179167293, + "flos": 21579799697280.0, + "grad_norm": 2.2131663157599597, + "language_loss": 0.79123974, + "learning_rate": 3.380290409114312e-06, + "loss": 0.81304365, + "num_input_tokens_seen": 100531655, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4654, + "time_per_iteration": 2.4707679748535156 + }, + { + "auxiliary_loss_clip": 0.01139148, + "auxiliary_loss_mlp": 0.01041113, + "balance_loss_clip": 1.02370811, + "balance_loss_mlp": 1.04861951, + "epoch": 0.2798737411693973, + "flos": 21537676022400.0, + "grad_norm": 2.2002818233708497, + "language_loss": 0.80829996, + "learning_rate": 3.3800085408534127e-06, + "loss": 0.83010256, + "num_input_tokens_seen": 100548005, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4655, + "time_per_iteration": 2.513359546661377 + }, + { + "auxiliary_loss_clip": 0.01135255, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0232811, + "balance_loss_mlp": 1.04709148, + "epoch": 0.27993386442206525, + "flos": 26981051713920.0, + "grad_norm": 1.5763016498426998, + "language_loss": 0.8125751, + "learning_rate": 3.3797266202624506e-06, + "loss": 0.8343333, + "num_input_tokens_seen": 100567980, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4656, + "time_per_iteration": 2.519552707672119 + }, + { + "auxiliary_loss_clip": 0.01135028, + "auxiliary_loss_mlp": 0.01039911, + "balance_loss_clip": 1.02292323, + "balance_loss_mlp": 1.04802632, + "epoch": 0.2799939876747332, + "flos": 24349876652160.0, + "grad_norm": 1.6475258015019663, + "language_loss": 0.83235347, + "learning_rate": 3.3794446473521176e-06, + "loss": 0.85410285, + "num_input_tokens_seen": 100588630, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87109375, + "step": 4657, + "time_per_iteration": 2.533052444458008 + }, + { + "auxiliary_loss_clip": 0.01136508, + "auxiliary_loss_mlp": 0.01042865, + "balance_loss_clip": 1.0267477, + "balance_loss_mlp": 1.04885554, + "epoch": 0.2800541109274012, + "flos": 33656988648960.0, + "grad_norm": 1.9420207304275756, + "language_loss": 0.63918132, + "learning_rate": 3.379162622133105e-06, + "loss": 0.66097504, + "num_input_tokens_seen": 100608775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 4658, + "time_per_iteration": 2.577223777770996 + }, + { + "auxiliary_loss_clip": 0.01137419, + "auxiliary_loss_mlp": 0.01048486, + "balance_loss_clip": 1.03177238, + "balance_loss_mlp": 1.04906631, + "epoch": 0.28011423418006914, + "flos": 21614417429760.0, + "grad_norm": 1.71469006603513, + "language_loss": 0.78447223, + "learning_rate": 3.3788805446161073e-06, + "loss": 0.80633128, + "num_input_tokens_seen": 100627975, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4659, + "time_per_iteration": 2.5102882385253906 + }, + { + "auxiliary_loss_clip": 0.01141159, + "auxiliary_loss_mlp": 0.01052526, + "balance_loss_clip": 1.03565836, + "balance_loss_mlp": 1.05118299, + "epoch": 0.2801743574327371, + "flos": 23112431159040.0, + "grad_norm": 1.8275002529569282, + "language_loss": 0.79481149, + "learning_rate": 3.3785984148118215e-06, + "loss": 0.81674838, + "num_input_tokens_seen": 100645430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4660, + "time_per_iteration": 2.478348731994629 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.0103899, + "balance_loss_clip": 1.02289653, + "balance_loss_mlp": 1.04855609, + "epoch": 0.2802344806854051, + "flos": 12641418766080.0, + "grad_norm": 1.7763153734220711, + "language_loss": 0.80286032, + "learning_rate": 3.3783162327309453e-06, + "loss": 0.82459545, + "num_input_tokens_seen": 100663775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 4661, + "time_per_iteration": 2.514369249343872 + }, + { + "auxiliary_loss_clip": 0.01140753, + "auxiliary_loss_mlp": 0.01055451, + "balance_loss_clip": 1.03888094, + "balance_loss_mlp": 1.05259752, + "epoch": 0.28029460393807304, + "flos": 37267878142080.0, + "grad_norm": 1.5344085017366311, + "language_loss": 0.78856266, + "learning_rate": 3.3780339983841794e-06, + "loss": 0.8105247, + "num_input_tokens_seen": 100686085, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4662, + "time_per_iteration": 2.6068239212036133 + }, + { + "auxiliary_loss_clip": 0.01142079, + "auxiliary_loss_mlp": 0.01052002, + "balance_loss_clip": 1.03345299, + "balance_loss_mlp": 1.04998207, + "epoch": 0.280354727190741, + "flos": 20741106061440.0, + "grad_norm": 2.3559784459233923, + "language_loss": 0.70354843, + "learning_rate": 3.377751711782227e-06, + "loss": 0.72548926, + "num_input_tokens_seen": 100705135, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.921875, + "step": 4663, + "time_per_iteration": 2.530852794647217 + }, + { + "auxiliary_loss_clip": 0.01139833, + "auxiliary_loss_mlp": 0.01053723, + "balance_loss_clip": 1.03522193, + "balance_loss_mlp": 1.05016875, + "epoch": 0.28041485044340897, + "flos": 21471026336640.0, + "grad_norm": 1.7070620658846938, + "language_loss": 0.77552772, + "learning_rate": 3.377469372935791e-06, + "loss": 0.7974633, + "num_input_tokens_seen": 100724960, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.8984375, + "step": 4664, + "time_per_iteration": 2.5026586055755615 + }, + { + "auxiliary_loss_clip": 0.01132144, + "auxiliary_loss_mlp": 0.01043613, + "balance_loss_clip": 1.02688766, + "balance_loss_mlp": 1.04697514, + "epoch": 0.28047497369607693, + "flos": 14794263388800.0, + "grad_norm": 1.9676420802042491, + "language_loss": 0.79575229, + "learning_rate": 3.377186981855578e-06, + "loss": 0.81750983, + "num_input_tokens_seen": 100741995, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8515625, + "step": 4665, + "time_per_iteration": 2.496948003768921 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01042713, + "balance_loss_clip": 1.02628565, + "balance_loss_mlp": 1.04934978, + "epoch": 0.2805350969487449, + "flos": 23070738447360.0, + "grad_norm": 8.778135585709748, + "language_loss": 0.80523062, + "learning_rate": 3.3769045385522968e-06, + "loss": 0.82701844, + "num_input_tokens_seen": 100758985, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4666, + "time_per_iteration": 2.4551992416381836 + }, + { + "auxiliary_loss_clip": 0.0113922, + "auxiliary_loss_mlp": 0.01054727, + "balance_loss_clip": 1.03710806, + "balance_loss_mlp": 1.05058241, + "epoch": 0.2805952202014129, + "flos": 20479855466880.0, + "grad_norm": 2.0519370530418493, + "language_loss": 0.84514672, + "learning_rate": 3.376622043036658e-06, + "loss": 0.86708617, + "num_input_tokens_seen": 100777820, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4667, + "time_per_iteration": 2.503024101257324 + }, + { + "auxiliary_loss_clip": 0.01141868, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.05165899, + "epoch": 0.2806553434540809, + "flos": 27417330305280.0, + "grad_norm": 1.59556786146991, + "language_loss": 0.79110259, + "learning_rate": 3.376339495319373e-06, + "loss": 0.81296772, + "num_input_tokens_seen": 100798205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.90234375, + "step": 4668, + "time_per_iteration": 2.5109217166900635 + }, + { + "auxiliary_loss_clip": 0.01137821, + "auxiliary_loss_mlp": 0.01045025, + "balance_loss_clip": 1.02783513, + "balance_loss_mlp": 1.0472095, + "epoch": 0.28071546670674885, + "flos": 26505019745280.0, + "grad_norm": 5.202292388628492, + "language_loss": 0.7594949, + "learning_rate": 3.3760568954111563e-06, + "loss": 0.78132337, + "num_input_tokens_seen": 100819800, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.90625, + "step": 4669, + "time_per_iteration": 2.5443029403686523 + }, + { + "auxiliary_loss_clip": 0.01139015, + "auxiliary_loss_mlp": 0.01050472, + "balance_loss_clip": 1.03276944, + "balance_loss_mlp": 1.05060363, + "epoch": 0.2807755899594168, + "flos": 20558679863040.0, + "grad_norm": 2.249572842905479, + "language_loss": 0.78818107, + "learning_rate": 3.375774243322725e-06, + "loss": 0.81007588, + "num_input_tokens_seen": 100837880, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8828125, + "step": 4670, + "time_per_iteration": 2.4583303928375244 + }, + { + "auxiliary_loss_clip": 0.01142576, + "auxiliary_loss_mlp": 0.010505, + "balance_loss_clip": 1.03272545, + "balance_loss_mlp": 1.05169237, + "epoch": 0.2808357132120848, + "flos": 24313319585280.0, + "grad_norm": 2.1344815005037323, + "language_loss": 0.78915119, + "learning_rate": 3.3754915390647955e-06, + "loss": 0.81108201, + "num_input_tokens_seen": 100856350, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.91015625, + "step": 4671, + "time_per_iteration": 2.576904296875 + }, + { + "auxiliary_loss_clip": 0.01136631, + "auxiliary_loss_mlp": 0.01039569, + "balance_loss_clip": 1.02419102, + "balance_loss_mlp": 1.05212355, + "epoch": 0.28089583646475275, + "flos": 26432408401920.0, + "grad_norm": 1.655300005604084, + "language_loss": 0.74891758, + "learning_rate": 3.37520878264809e-06, + "loss": 0.77067947, + "num_input_tokens_seen": 100876135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 4672, + "time_per_iteration": 2.5101001262664795 + }, + { + "auxiliary_loss_clip": 0.01139664, + "auxiliary_loss_mlp": 0.01048727, + "balance_loss_clip": 1.0297612, + "balance_loss_mlp": 1.05017138, + "epoch": 0.2809559597174207, + "flos": 23111820627840.0, + "grad_norm": 2.377632390973165, + "language_loss": 0.7485683, + "learning_rate": 3.3749259740833286e-06, + "loss": 0.77045226, + "num_input_tokens_seen": 100894790, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.89453125, + "step": 4673, + "time_per_iteration": 2.5559215545654297 + }, + { + "auxiliary_loss_clip": 0.0113758, + "auxiliary_loss_mlp": 0.01040684, + "balance_loss_clip": 1.02367294, + "balance_loss_mlp": 1.04911065, + "epoch": 0.2810160829700887, + "flos": 20923496346240.0, + "grad_norm": 2.162495737742732, + "language_loss": 0.72274792, + "learning_rate": 3.374643113381237e-06, + "loss": 0.74453062, + "num_input_tokens_seen": 100915100, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4674, + "time_per_iteration": 2.4755725860595703 + }, + { + "auxiliary_loss_clip": 0.01142202, + "auxiliary_loss_mlp": 0.01043016, + "balance_loss_clip": 1.02487254, + "balance_loss_mlp": 1.05152214, + "epoch": 0.28107620622275664, + "flos": 14355901808640.0, + "grad_norm": 1.8501022214838438, + "language_loss": 0.77636325, + "learning_rate": 3.374360200552541e-06, + "loss": 0.79821539, + "num_input_tokens_seen": 100932795, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.90625, + "step": 4675, + "time_per_iteration": 2.5076191425323486 + }, + { + "auxiliary_loss_clip": 0.011417, + "auxiliary_loss_mlp": 0.01048524, + "balance_loss_clip": 1.03059506, + "balance_loss_mlp": 1.05080581, + "epoch": 0.2811363294754246, + "flos": 20919078973440.0, + "grad_norm": 4.743769816525981, + "language_loss": 0.7033428, + "learning_rate": 3.374077235607968e-06, + "loss": 0.72524506, + "num_input_tokens_seen": 100950505, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4676, + "time_per_iteration": 2.4664652347564697 + }, + { + "auxiliary_loss_clip": 0.01136213, + "auxiliary_loss_mlp": 0.01042174, + "balance_loss_clip": 1.02637279, + "balance_loss_mlp": 1.05219054, + "epoch": 0.28119645272809257, + "flos": 20594841880320.0, + "grad_norm": 1.6504598517134752, + "language_loss": 0.70294476, + "learning_rate": 3.3737942185582487e-06, + "loss": 0.7247287, + "num_input_tokens_seen": 100968790, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 4677, + "time_per_iteration": 3.9926962852478027 + }, + { + "auxiliary_loss_clip": 0.01140831, + "auxiliary_loss_mlp": 0.01046995, + "balance_loss_clip": 1.02779067, + "balance_loss_mlp": 1.05172849, + "epoch": 0.28125657598076054, + "flos": 25337420248320.0, + "grad_norm": 1.7155329144241396, + "language_loss": 0.63506716, + "learning_rate": 3.3735111494141153e-06, + "loss": 0.65694547, + "num_input_tokens_seen": 100990205, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.890625, + "step": 4678, + "time_per_iteration": 5.452545642852783 + }, + { + "auxiliary_loss_clip": 0.01140503, + "auxiliary_loss_mlp": 0.01048157, + "balance_loss_clip": 1.031039, + "balance_loss_mlp": 1.05193949, + "epoch": 0.2813166992334285, + "flos": 24827093769600.0, + "grad_norm": 1.4644682748892532, + "language_loss": 0.70249045, + "learning_rate": 3.3732280281863013e-06, + "loss": 0.7243771, + "num_input_tokens_seen": 101009815, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.88671875, + "step": 4679, + "time_per_iteration": 2.557156801223755 + }, + { + "auxiliary_loss_clip": 0.01138678, + "auxiliary_loss_mlp": 0.01040208, + "balance_loss_clip": 1.02276742, + "balance_loss_mlp": 1.05024076, + "epoch": 0.2813768224860965, + "flos": 21760753438080.0, + "grad_norm": 1.8307759218313573, + "language_loss": 0.74600148, + "learning_rate": 3.3729448548855422e-06, + "loss": 0.76779038, + "num_input_tokens_seen": 101026780, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4680, + "time_per_iteration": 2.478760004043579 + }, + { + "auxiliary_loss_clip": 0.01140599, + "auxiliary_loss_mlp": 0.01043469, + "balance_loss_clip": 1.0268507, + "balance_loss_mlp": 1.0514679, + "epoch": 0.2814369457387645, + "flos": 24316803204480.0, + "grad_norm": 1.8069902018568411, + "language_loss": 0.77090317, + "learning_rate": 3.3726616295225774e-06, + "loss": 0.79274386, + "num_input_tokens_seen": 101046215, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.890625, + "step": 4681, + "time_per_iteration": 2.5532946586608887 + }, + { + "auxiliary_loss_clip": 0.01142988, + "auxiliary_loss_mlp": 0.01041039, + "balance_loss_clip": 1.02353942, + "balance_loss_mlp": 1.05301392, + "epoch": 0.28149706899143245, + "flos": 18515326872960.0, + "grad_norm": 4.33574203258507, + "language_loss": 0.74047244, + "learning_rate": 3.372378352108146e-06, + "loss": 0.76231277, + "num_input_tokens_seen": 101063365, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.8984375, + "step": 4682, + "time_per_iteration": 2.450707197189331 + }, + { + "auxiliary_loss_clip": 0.0113683, + "auxiliary_loss_mlp": 0.01043566, + "balance_loss_clip": 1.02712727, + "balance_loss_mlp": 1.04989302, + "epoch": 0.2815571922441004, + "flos": 24863255786880.0, + "grad_norm": 1.4103030378304897, + "language_loss": 0.80830532, + "learning_rate": 3.3720950226529894e-06, + "loss": 0.8301093, + "num_input_tokens_seen": 101083835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4683, + "time_per_iteration": 2.5405828952789307 + }, + { + "auxiliary_loss_clip": 0.01142223, + "auxiliary_loss_mlp": 0.01047785, + "balance_loss_clip": 1.02984428, + "balance_loss_mlp": 1.05146146, + "epoch": 0.2816173154967684, + "flos": 19901622326400.0, + "grad_norm": 1.6936052100643573, + "language_loss": 0.76107442, + "learning_rate": 3.371811641167852e-06, + "loss": 0.78297454, + "num_input_tokens_seen": 101101740, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.90625, + "step": 4684, + "time_per_iteration": 2.4734883308410645 + }, + { + "auxiliary_loss_clip": 0.01135013, + "auxiliary_loss_mlp": 0.01038474, + "balance_loss_clip": 1.02196348, + "balance_loss_mlp": 1.04849601, + "epoch": 0.28167743874943635, + "flos": 17491333950720.0, + "grad_norm": 1.9675146174992446, + "language_loss": 0.7601878, + "learning_rate": 3.3715282076634807e-06, + "loss": 0.7819227, + "num_input_tokens_seen": 101120480, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4685, + "time_per_iteration": 2.521883010864258 + }, + { + "auxiliary_loss_clip": 0.01136456, + "auxiliary_loss_mlp": 0.01043262, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.05083728, + "epoch": 0.2817375620021043, + "flos": 25302120157440.0, + "grad_norm": 2.003036282603561, + "language_loss": 0.7616905, + "learning_rate": 3.3712447221506218e-06, + "loss": 0.78348768, + "num_input_tokens_seen": 101142910, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4686, + "time_per_iteration": 2.5261688232421875 + }, + { + "auxiliary_loss_clip": 0.01140126, + "auxiliary_loss_mlp": 0.01051506, + "balance_loss_clip": 1.03319538, + "balance_loss_mlp": 1.04916072, + "epoch": 0.2817976852547723, + "flos": 18693227957760.0, + "grad_norm": 2.230965321609006, + "language_loss": 0.63345516, + "learning_rate": 3.370961184640025e-06, + "loss": 0.65537149, + "num_input_tokens_seen": 101160030, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.90625, + "step": 4687, + "time_per_iteration": 2.473508834838867 + }, + { + "auxiliary_loss_clip": 0.0114172, + "auxiliary_loss_mlp": 0.01052796, + "balance_loss_clip": 1.03577256, + "balance_loss_mlp": 1.05180609, + "epoch": 0.28185780850744024, + "flos": 22742263549440.0, + "grad_norm": 1.9761865692880811, + "language_loss": 0.76504958, + "learning_rate": 3.3706775951424433e-06, + "loss": 0.7869947, + "num_input_tokens_seen": 101177675, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4688, + "time_per_iteration": 2.4815330505371094 + }, + { + "auxiliary_loss_clip": 0.01135292, + "auxiliary_loss_mlp": 0.01040309, + "balance_loss_clip": 1.02364409, + "balance_loss_mlp": 1.04902148, + "epoch": 0.2819179317601082, + "flos": 14933919467520.0, + "grad_norm": 2.291650314126009, + "language_loss": 0.78333032, + "learning_rate": 3.37039395366863e-06, + "loss": 0.80508631, + "num_input_tokens_seen": 101192225, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4689, + "time_per_iteration": 2.464221239089966 + }, + { + "auxiliary_loss_clip": 0.01136671, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02566934, + "balance_loss_mlp": 1.04886627, + "epoch": 0.2819780550127762, + "flos": 23145325038720.0, + "grad_norm": 2.2251394110426896, + "language_loss": 0.77819848, + "learning_rate": 3.37011026022934e-06, + "loss": 0.79999155, + "num_input_tokens_seen": 101210870, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.87890625, + "step": 4690, + "time_per_iteration": 2.4802086353302 + }, + { + "auxiliary_loss_clip": 0.01138887, + "auxiliary_loss_mlp": 0.01044233, + "balance_loss_clip": 1.02809191, + "balance_loss_mlp": 1.04984617, + "epoch": 0.28203817826544414, + "flos": 21616356764160.0, + "grad_norm": 1.762007121853784, + "language_loss": 0.8775022, + "learning_rate": 3.369826514835332e-06, + "loss": 0.89933336, + "num_input_tokens_seen": 101229965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.890625, + "step": 4691, + "time_per_iteration": 2.5098307132720947 + }, + { + "auxiliary_loss_clip": 0.01144357, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02714467, + "balance_loss_mlp": 1.0519383, + "epoch": 0.2820983015181121, + "flos": 24026788794240.0, + "grad_norm": 2.144178457094415, + "language_loss": 0.81952238, + "learning_rate": 3.3695427174973654e-06, + "loss": 0.84140503, + "num_input_tokens_seen": 101250980, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.921875, + "step": 4692, + "time_per_iteration": 2.501150131225586 + }, + { + "auxiliary_loss_clip": 0.01137061, + "auxiliary_loss_mlp": 0.01039873, + "balance_loss_clip": 1.02284956, + "balance_loss_mlp": 1.04852128, + "epoch": 0.2821584247707801, + "flos": 30007925976960.0, + "grad_norm": 1.7100054669520195, + "language_loss": 0.74535745, + "learning_rate": 3.3692588682262022e-06, + "loss": 0.7671268, + "num_input_tokens_seen": 101273335, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4693, + "time_per_iteration": 2.581108808517456 + }, + { + "auxiliary_loss_clip": 0.01139239, + "auxiliary_loss_mlp": 0.01036265, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04924035, + "epoch": 0.2822185480234481, + "flos": 21396762967680.0, + "grad_norm": 1.6174705324311944, + "language_loss": 0.7761777, + "learning_rate": 3.3689749670326046e-06, + "loss": 0.79793274, + "num_input_tokens_seen": 101292110, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4694, + "time_per_iteration": 2.479616403579712 + }, + { + "auxiliary_loss_clip": 0.01136707, + "auxiliary_loss_mlp": 0.01038934, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.05057073, + "epoch": 0.28227867127611606, + "flos": 27452809964160.0, + "grad_norm": 2.0658621313481604, + "language_loss": 0.66812259, + "learning_rate": 3.3686910139273392e-06, + "loss": 0.68987906, + "num_input_tokens_seen": 101312815, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.86328125, + "step": 4695, + "time_per_iteration": 2.560234546661377 + }, + { + "auxiliary_loss_clip": 0.0114143, + "auxiliary_loss_mlp": 0.01047559, + "balance_loss_clip": 1.02859259, + "balance_loss_mlp": 1.05084562, + "epoch": 0.282338794528784, + "flos": 22593736811520.0, + "grad_norm": 2.206840044366299, + "language_loss": 0.75868189, + "learning_rate": 3.3684070089211736e-06, + "loss": 0.78057176, + "num_input_tokens_seen": 101329045, + "router_z_loss_clip": 0.18945312, + "router_z_loss_mlp": 0.90625, + "step": 4696, + "time_per_iteration": 2.484731674194336 + }, + { + "auxiliary_loss_clip": 0.01142111, + "auxiliary_loss_mlp": 0.01049599, + "balance_loss_clip": 1.03283811, + "balance_loss_mlp": 1.05234432, + "epoch": 0.282398917781452, + "flos": 42010923386880.0, + "grad_norm": 4.801168729119655, + "language_loss": 0.62373543, + "learning_rate": 3.368122952024877e-06, + "loss": 0.64565253, + "num_input_tokens_seen": 101352715, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8984375, + "step": 4697, + "time_per_iteration": 2.6771903038024902 + }, + { + "auxiliary_loss_clip": 0.01131406, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02003598, + "balance_loss_mlp": 1.0468322, + "epoch": 0.28245904103411995, + "flos": 23224724052480.0, + "grad_norm": 1.6839402690923742, + "language_loss": 0.73317522, + "learning_rate": 3.3678388432492214e-06, + "loss": 0.75484592, + "num_input_tokens_seen": 101374640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4698, + "time_per_iteration": 2.5262162685394287 + }, + { + "auxiliary_loss_clip": 0.01130801, + "auxiliary_loss_mlp": 0.01044648, + "balance_loss_clip": 1.029091, + "balance_loss_mlp": 1.0463903, + "epoch": 0.2825191642867879, + "flos": 25374623760000.0, + "grad_norm": 2.1160143892835275, + "language_loss": 0.74896884, + "learning_rate": 3.3675546826049788e-06, + "loss": 0.77072334, + "num_input_tokens_seen": 101393595, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 4699, + "time_per_iteration": 2.5613014698028564 + }, + { + "auxiliary_loss_clip": 0.01139697, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.02369165, + "balance_loss_mlp": 1.05032122, + "epoch": 0.2825792875394559, + "flos": 17236799199360.0, + "grad_norm": 3.187545417707515, + "language_loss": 0.80256712, + "learning_rate": 3.3672704701029265e-06, + "loss": 0.8243804, + "num_input_tokens_seen": 101409265, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.89453125, + "step": 4700, + "time_per_iteration": 2.4355719089508057 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01049134, + "balance_loss_clip": 1.03461456, + "balance_loss_mlp": 1.05022645, + "epoch": 0.28263941079212385, + "flos": 26723967096960.0, + "grad_norm": 1.7483881606912919, + "language_loss": 0.81309319, + "learning_rate": 3.3669862057538402e-06, + "loss": 0.8349061, + "num_input_tokens_seen": 101428365, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 4701, + "time_per_iteration": 2.590824842453003 + }, + { + "auxiliary_loss_clip": 0.0113653, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02301347, + "balance_loss_mlp": 1.05007911, + "epoch": 0.2826995340447918, + "flos": 25921327737600.0, + "grad_norm": 2.214271940066586, + "language_loss": 0.73758674, + "learning_rate": 3.3667018895685004e-06, + "loss": 0.75934035, + "num_input_tokens_seen": 101447280, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4702, + "time_per_iteration": 2.496689796447754 + }, + { + "auxiliary_loss_clip": 0.01136189, + "auxiliary_loss_mlp": 0.01038892, + "balance_loss_clip": 1.02251232, + "balance_loss_mlp": 1.05127287, + "epoch": 0.2827596572974598, + "flos": 22379709623040.0, + "grad_norm": 1.7981890053968508, + "language_loss": 0.78189409, + "learning_rate": 3.3664175215576886e-06, + "loss": 0.8036449, + "num_input_tokens_seen": 101465435, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 4703, + "time_per_iteration": 2.5225300788879395 + }, + { + "auxiliary_loss_clip": 0.011353, + "auxiliary_loss_mlp": 0.01046746, + "balance_loss_clip": 1.02923465, + "balance_loss_mlp": 1.0484302, + "epoch": 0.28281978055012774, + "flos": 33547137880320.0, + "grad_norm": 1.6026897384097336, + "language_loss": 0.6944623, + "learning_rate": 3.3661331017321867e-06, + "loss": 0.71628278, + "num_input_tokens_seen": 101486355, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 4704, + "time_per_iteration": 2.5721168518066406 + }, + { + "auxiliary_loss_clip": 0.0113917, + "auxiliary_loss_mlp": 0.01043731, + "balance_loss_clip": 1.02685118, + "balance_loss_mlp": 1.05374229, + "epoch": 0.2828799038027957, + "flos": 23440870143360.0, + "grad_norm": 1.9868129767490792, + "language_loss": 0.69884789, + "learning_rate": 3.3658486301027807e-06, + "loss": 0.7206769, + "num_input_tokens_seen": 101505875, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.85546875, + "step": 4705, + "time_per_iteration": 2.532034397125244 + }, + { + "auxiliary_loss_clip": 0.01057982, + "auxiliary_loss_mlp": 0.01003525, + "balance_loss_clip": 1.0018208, + "balance_loss_mlp": 1.02761459, + "epoch": 0.2829400270554637, + "flos": 69873690251520.0, + "grad_norm": 0.7396595768854823, + "language_loss": 0.59243953, + "learning_rate": 3.3655641066802577e-06, + "loss": 0.61305463, + "num_input_tokens_seen": 101565045, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.3046875, + "step": 4706, + "time_per_iteration": 3.1149942874908447 + }, + { + "auxiliary_loss_clip": 0.01135425, + "auxiliary_loss_mlp": 0.01040531, + "balance_loss_clip": 1.02586842, + "balance_loss_mlp": 1.05135274, + "epoch": 0.2830001503081317, + "flos": 24789028331520.0, + "grad_norm": 1.3972451569930537, + "language_loss": 0.82227451, + "learning_rate": 3.365279531475407e-06, + "loss": 0.84403402, + "num_input_tokens_seen": 101585825, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4707, + "time_per_iteration": 2.5387215614318848 + }, + { + "auxiliary_loss_clip": 0.01137999, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.02199709, + "balance_loss_mlp": 1.04914331, + "epoch": 0.28306027356079966, + "flos": 27669387018240.0, + "grad_norm": 1.4509576382878049, + "language_loss": 0.80561262, + "learning_rate": 3.36499490449902e-06, + "loss": 0.82739007, + "num_input_tokens_seen": 101606105, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.890625, + "step": 4708, + "time_per_iteration": 2.5140204429626465 + }, + { + "auxiliary_loss_clip": 0.0105521, + "auxiliary_loss_mlp": 0.01000508, + "balance_loss_clip": 0.99875605, + "balance_loss_mlp": 1.02517498, + "epoch": 0.2831203968134676, + "flos": 60527938199040.0, + "grad_norm": 0.9117312370003612, + "language_loss": 0.62801576, + "learning_rate": 3.3647102257618895e-06, + "loss": 0.64857292, + "num_input_tokens_seen": 101656875, + "router_z_loss_clip": 0.01757812, + "router_z_loss_mlp": 0.30078125, + "step": 4709, + "time_per_iteration": 2.936171054840088 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02320743, + "balance_loss_mlp": 1.04888415, + "epoch": 0.2831805200661356, + "flos": 22054790171520.0, + "grad_norm": 1.3738384560226649, + "language_loss": 0.73850632, + "learning_rate": 3.3644254952748103e-06, + "loss": 0.76022816, + "num_input_tokens_seen": 101676225, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 4710, + "time_per_iteration": 2.4954519271850586 + }, + { + "auxiliary_loss_clip": 0.01137863, + "auxiliary_loss_mlp": 0.01049743, + "balance_loss_clip": 1.03191566, + "balance_loss_mlp": 1.04925823, + "epoch": 0.28324064331880355, + "flos": 22600668136320.0, + "grad_norm": 1.9168276099157815, + "language_loss": 0.79272872, + "learning_rate": 3.364140713048579e-06, + "loss": 0.81460476, + "num_input_tokens_seen": 101693710, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.88671875, + "step": 4711, + "time_per_iteration": 2.4867448806762695 + }, + { + "auxiliary_loss_clip": 0.01138366, + "auxiliary_loss_mlp": 0.01043891, + "balance_loss_clip": 1.02646244, + "balance_loss_mlp": 1.04965401, + "epoch": 0.2833007665714715, + "flos": 30404127968640.0, + "grad_norm": 2.0504814559042064, + "language_loss": 0.71246219, + "learning_rate": 3.363855879093996e-06, + "loss": 0.73428476, + "num_input_tokens_seen": 101714010, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.88671875, + "step": 4712, + "time_per_iteration": 2.575636863708496 + }, + { + "auxiliary_loss_clip": 0.01138441, + "auxiliary_loss_mlp": 0.01049763, + "balance_loss_clip": 1.03291881, + "balance_loss_mlp": 1.05000687, + "epoch": 0.2833608898241395, + "flos": 23549499849600.0, + "grad_norm": 1.8055678270358249, + "language_loss": 0.82008445, + "learning_rate": 3.3635709934218605e-06, + "loss": 0.84196651, + "num_input_tokens_seen": 101732995, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4713, + "time_per_iteration": 2.493767499923706 + }, + { + "auxiliary_loss_clip": 0.01136571, + "auxiliary_loss_mlp": 0.01041134, + "balance_loss_clip": 1.02401519, + "balance_loss_mlp": 1.05028057, + "epoch": 0.28342101307680745, + "flos": 20266726118400.0, + "grad_norm": 1.7485744544400377, + "language_loss": 0.75356781, + "learning_rate": 3.3632860560429766e-06, + "loss": 0.77534491, + "num_input_tokens_seen": 101751385, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.86328125, + "step": 4714, + "time_per_iteration": 2.505153179168701 + }, + { + "auxiliary_loss_clip": 0.01136297, + "auxiliary_loss_mlp": 0.01045701, + "balance_loss_clip": 1.02967894, + "balance_loss_mlp": 1.04942465, + "epoch": 0.2834811363294754, + "flos": 30847050576000.0, + "grad_norm": 1.4087892826571713, + "language_loss": 0.78411347, + "learning_rate": 3.3630010669681494e-06, + "loss": 0.80593348, + "num_input_tokens_seen": 101773825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4715, + "time_per_iteration": 2.554814100265503 + }, + { + "auxiliary_loss_clip": 0.01135347, + "auxiliary_loss_mlp": 0.01036876, + "balance_loss_clip": 1.02042472, + "balance_loss_mlp": 1.04960322, + "epoch": 0.2835412595821434, + "flos": 22711021695360.0, + "grad_norm": 1.6801208741854476, + "language_loss": 0.73694074, + "learning_rate": 3.3627160262081845e-06, + "loss": 0.758663, + "num_input_tokens_seen": 101791920, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4716, + "time_per_iteration": 2.5286571979522705 + }, + { + "auxiliary_loss_clip": 0.01139786, + "auxiliary_loss_mlp": 0.01041969, + "balance_loss_clip": 1.02437401, + "balance_loss_mlp": 1.04774714, + "epoch": 0.28360138283481134, + "flos": 18077719478400.0, + "grad_norm": 2.328876822443367, + "language_loss": 0.74648547, + "learning_rate": 3.3624309337738917e-06, + "loss": 0.76830298, + "num_input_tokens_seen": 101809515, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.921875, + "step": 4717, + "time_per_iteration": 2.46952223777771 + }, + { + "auxiliary_loss_clip": 0.01139264, + "auxiliary_loss_mlp": 0.01044702, + "balance_loss_clip": 1.02846563, + "balance_loss_mlp": 1.04963374, + "epoch": 0.2836615060874793, + "flos": 17854785717120.0, + "grad_norm": 1.4913957575980352, + "language_loss": 0.669999, + "learning_rate": 3.3621457896760813e-06, + "loss": 0.69183862, + "num_input_tokens_seen": 101827735, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.89453125, + "step": 4718, + "time_per_iteration": 2.4831228256225586 + }, + { + "auxiliary_loss_clip": 0.01137489, + "auxiliary_loss_mlp": 0.01046854, + "balance_loss_clip": 1.03000975, + "balance_loss_mlp": 1.04782009, + "epoch": 0.2837216293401473, + "flos": 25740302169600.0, + "grad_norm": 1.8756812569885382, + "language_loss": 0.72633672, + "learning_rate": 3.361860593925566e-06, + "loss": 0.74818015, + "num_input_tokens_seen": 101845970, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.89453125, + "step": 4719, + "time_per_iteration": 4.022828102111816 + }, + { + "auxiliary_loss_clip": 0.01135497, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02554655, + "balance_loss_mlp": 1.04928601, + "epoch": 0.2837817525928153, + "flos": 20923532259840.0, + "grad_norm": 1.5135010931827333, + "language_loss": 0.80621493, + "learning_rate": 3.3615753465331605e-06, + "loss": 0.82798427, + "num_input_tokens_seen": 101865040, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 4720, + "time_per_iteration": 5.367753505706787 + }, + { + "auxiliary_loss_clip": 0.0113932, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.05115819, + "epoch": 0.28384187584548326, + "flos": 18916700423040.0, + "grad_norm": 1.7029911565101727, + "language_loss": 0.79467577, + "learning_rate": 3.3612900475096817e-06, + "loss": 0.81651098, + "num_input_tokens_seen": 101883735, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4721, + "time_per_iteration": 2.50327730178833 + }, + { + "auxiliary_loss_clip": 0.01133729, + "auxiliary_loss_mlp": 0.01035212, + "balance_loss_clip": 1.01929736, + "balance_loss_mlp": 1.04810679, + "epoch": 0.2839019990981512, + "flos": 27343964776320.0, + "grad_norm": 2.0644081658079343, + "language_loss": 0.82823032, + "learning_rate": 3.3610046968659474e-06, + "loss": 0.84991974, + "num_input_tokens_seen": 101903025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4722, + "time_per_iteration": 2.4968478679656982 + }, + { + "auxiliary_loss_clip": 0.01137825, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02364612, + "balance_loss_mlp": 1.05073261, + "epoch": 0.2839621223508192, + "flos": 18114312458880.0, + "grad_norm": 1.6187910677092856, + "language_loss": 0.70086461, + "learning_rate": 3.3607192946127785e-06, + "loss": 0.72264171, + "num_input_tokens_seen": 101922255, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 4723, + "time_per_iteration": 2.4899258613586426 + }, + { + "auxiliary_loss_clip": 0.01135132, + "auxiliary_loss_mlp": 0.01044726, + "balance_loss_clip": 1.02747679, + "balance_loss_mlp": 1.04938078, + "epoch": 0.28402224560348716, + "flos": 26358360514560.0, + "grad_norm": 1.736224288784384, + "language_loss": 0.78556609, + "learning_rate": 3.360433840760998e-06, + "loss": 0.8073647, + "num_input_tokens_seen": 101943100, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.859375, + "step": 4724, + "time_per_iteration": 2.496594190597534 + }, + { + "auxiliary_loss_clip": 0.01139767, + "auxiliary_loss_mlp": 0.01043591, + "balance_loss_clip": 1.02660346, + "balance_loss_mlp": 1.05093193, + "epoch": 0.2840823688561551, + "flos": 24060795995520.0, + "grad_norm": 1.6232572980988387, + "language_loss": 0.92404163, + "learning_rate": 3.36014833532143e-06, + "loss": 0.94587529, + "num_input_tokens_seen": 101963160, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.88671875, + "step": 4725, + "time_per_iteration": 2.511526584625244 + }, + { + "auxiliary_loss_clip": 0.01140103, + "auxiliary_loss_mlp": 0.01043108, + "balance_loss_clip": 1.0257988, + "balance_loss_mlp": 1.05020452, + "epoch": 0.2841424921088231, + "flos": 29459821368960.0, + "grad_norm": 2.0539060112221645, + "language_loss": 0.88626051, + "learning_rate": 3.3598627783049e-06, + "loss": 0.90809256, + "num_input_tokens_seen": 101984300, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8984375, + "step": 4726, + "time_per_iteration": 2.5431292057037354 + }, + { + "auxiliary_loss_clip": 0.01139706, + "auxiliary_loss_mlp": 0.01048538, + "balance_loss_clip": 1.03090727, + "balance_loss_mlp": 1.05034256, + "epoch": 0.28420261536149105, + "flos": 48100367053440.0, + "grad_norm": 2.15176079657567, + "language_loss": 0.78793001, + "learning_rate": 3.359577169722238e-06, + "loss": 0.80981243, + "num_input_tokens_seen": 102005765, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.89453125, + "step": 4727, + "time_per_iteration": 2.7037220001220703 + }, + { + "auxiliary_loss_clip": 0.01133758, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02229071, + "balance_loss_mlp": 1.04985464, + "epoch": 0.284262738614159, + "flos": 25666146541440.0, + "grad_norm": 2.258515630996078, + "language_loss": 0.66358554, + "learning_rate": 3.3592915095842733e-06, + "loss": 0.68529654, + "num_input_tokens_seen": 102022755, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83984375, + "step": 4728, + "time_per_iteration": 2.5066046714782715 + }, + { + "auxiliary_loss_clip": 0.01134281, + "auxiliary_loss_mlp": 0.01045863, + "balance_loss_clip": 1.02941179, + "balance_loss_mlp": 1.04727221, + "epoch": 0.284322861866827, + "flos": 19718980646400.0, + "grad_norm": 1.756924339447767, + "language_loss": 0.75958216, + "learning_rate": 3.3590057979018386e-06, + "loss": 0.78138363, + "num_input_tokens_seen": 102041850, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87109375, + "step": 4729, + "time_per_iteration": 2.4989402294158936 + }, + { + "auxiliary_loss_clip": 0.01140784, + "auxiliary_loss_mlp": 0.01050786, + "balance_loss_clip": 1.03383398, + "balance_loss_mlp": 1.05095756, + "epoch": 0.28438298511949495, + "flos": 23915250086400.0, + "grad_norm": 1.9682162336594704, + "language_loss": 0.66691023, + "learning_rate": 3.3587200346857674e-06, + "loss": 0.68882596, + "num_input_tokens_seen": 102059500, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8984375, + "step": 4730, + "time_per_iteration": 2.509514570236206 + }, + { + "auxiliary_loss_clip": 0.01138579, + "auxiliary_loss_mlp": 0.01039094, + "balance_loss_clip": 1.02232122, + "balance_loss_mlp": 1.05049443, + "epoch": 0.2844431083721629, + "flos": 26067340523520.0, + "grad_norm": 1.7814838549320247, + "language_loss": 0.74382442, + "learning_rate": 3.3584342199468965e-06, + "loss": 0.76560116, + "num_input_tokens_seen": 102080460, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4731, + "time_per_iteration": 2.547813653945923 + }, + { + "auxiliary_loss_clip": 0.01136629, + "auxiliary_loss_mlp": 0.01033401, + "balance_loss_clip": 1.01700974, + "balance_loss_mlp": 1.04890573, + "epoch": 0.2845032316248309, + "flos": 25810435474560.0, + "grad_norm": 1.530013147894791, + "language_loss": 0.83553517, + "learning_rate": 3.3581483536960638e-06, + "loss": 0.85723549, + "num_input_tokens_seen": 102100950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 4732, + "time_per_iteration": 2.5120863914489746 + }, + { + "auxiliary_loss_clip": 0.01136161, + "auxiliary_loss_mlp": 0.0105072, + "balance_loss_clip": 1.03301716, + "balance_loss_mlp": 1.04855001, + "epoch": 0.2845633548774989, + "flos": 19823192979840.0, + "grad_norm": 1.9723104549008028, + "language_loss": 0.79331958, + "learning_rate": 3.357862435944109e-06, + "loss": 0.81518835, + "num_input_tokens_seen": 102119345, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.875, + "step": 4733, + "time_per_iteration": 2.5007243156433105 + }, + { + "auxiliary_loss_clip": 0.01142281, + "auxiliary_loss_mlp": 0.01047444, + "balance_loss_clip": 1.02999151, + "balance_loss_mlp": 1.05076027, + "epoch": 0.28462347813016686, + "flos": 23182815859200.0, + "grad_norm": 2.3591023601535834, + "language_loss": 0.71619761, + "learning_rate": 3.357576466701875e-06, + "loss": 0.73809481, + "num_input_tokens_seen": 102139050, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.9140625, + "step": 4734, + "time_per_iteration": 2.482696771621704 + }, + { + "auxiliary_loss_clip": 0.01131669, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02036047, + "balance_loss_mlp": 1.04631829, + "epoch": 0.2846836013828348, + "flos": 18660477732480.0, + "grad_norm": 1.8927344989841068, + "language_loss": 0.73762977, + "learning_rate": 3.3572904459802056e-06, + "loss": 0.75930858, + "num_input_tokens_seen": 102157935, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 4735, + "time_per_iteration": 2.4837005138397217 + }, + { + "auxiliary_loss_clip": 0.011344, + "auxiliary_loss_mlp": 0.01045777, + "balance_loss_clip": 1.03006482, + "balance_loss_mlp": 1.04755783, + "epoch": 0.2847437246355028, + "flos": 14173511523840.0, + "grad_norm": 1.630230460143418, + "language_loss": 0.79573876, + "learning_rate": 3.357004373789946e-06, + "loss": 0.81754053, + "num_input_tokens_seen": 102175325, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4736, + "time_per_iteration": 2.4434666633605957 + }, + { + "auxiliary_loss_clip": 0.01139538, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.02740479, + "balance_loss_mlp": 1.05133057, + "epoch": 0.28480384788817076, + "flos": 29278364837760.0, + "grad_norm": 2.7860738328288637, + "language_loss": 0.59551513, + "learning_rate": 3.3567182501419453e-06, + "loss": 0.61735177, + "num_input_tokens_seen": 102196625, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8828125, + "step": 4737, + "time_per_iteration": 2.580573558807373 + }, + { + "auxiliary_loss_clip": 0.01132871, + "auxiliary_loss_mlp": 0.01039338, + "balance_loss_clip": 1.02334046, + "balance_loss_mlp": 1.04766428, + "epoch": 0.2848639711408387, + "flos": 22601314581120.0, + "grad_norm": 1.7923236486738074, + "language_loss": 0.86353856, + "learning_rate": 3.356432075047052e-06, + "loss": 0.8852607, + "num_input_tokens_seen": 102214975, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 4738, + "time_per_iteration": 2.483482837677002 + }, + { + "auxiliary_loss_clip": 0.0113957, + "auxiliary_loss_mlp": 0.01045248, + "balance_loss_clip": 1.02778435, + "balance_loss_mlp": 1.04864287, + "epoch": 0.2849240943935067, + "flos": 17599460866560.0, + "grad_norm": 2.438418234236932, + "language_loss": 0.89730442, + "learning_rate": 3.356145848516118e-06, + "loss": 0.91915256, + "num_input_tokens_seen": 102231885, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.91015625, + "step": 4739, + "time_per_iteration": 2.4746406078338623 + }, + { + "auxiliary_loss_clip": 0.01137971, + "auxiliary_loss_mlp": 0.01041062, + "balance_loss_clip": 1.02450418, + "balance_loss_mlp": 1.05253863, + "epoch": 0.28498421764617465, + "flos": 24862573428480.0, + "grad_norm": 1.3849266219761887, + "language_loss": 0.7207197, + "learning_rate": 3.355859570559998e-06, + "loss": 0.74250996, + "num_input_tokens_seen": 102252725, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4740, + "time_per_iteration": 2.49682879447937 + }, + { + "auxiliary_loss_clip": 0.01135048, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0209707, + "balance_loss_mlp": 1.04970956, + "epoch": 0.2850443408988426, + "flos": 22782555630720.0, + "grad_norm": 1.6055473402712246, + "language_loss": 0.77937335, + "learning_rate": 3.3555732411895477e-06, + "loss": 0.80109143, + "num_input_tokens_seen": 102271730, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4741, + "time_per_iteration": 2.51096248626709 + }, + { + "auxiliary_loss_clip": 0.01136227, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.02828324, + "balance_loss_mlp": 1.04566443, + "epoch": 0.2851044641515106, + "flos": 18844053166080.0, + "grad_norm": 1.6279093143019605, + "language_loss": 0.76295173, + "learning_rate": 3.3552868604156235e-06, + "loss": 0.78477085, + "num_input_tokens_seen": 102291325, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.90625, + "step": 4742, + "time_per_iteration": 2.462972402572632 + }, + { + "auxiliary_loss_clip": 0.01139125, + "auxiliary_loss_mlp": 0.01048964, + "balance_loss_clip": 1.03039074, + "balance_loss_mlp": 1.04792476, + "epoch": 0.28516458740417855, + "flos": 18880502492160.0, + "grad_norm": 1.8587468959738758, + "language_loss": 0.5772593, + "learning_rate": 3.355000428249086e-06, + "loss": 0.59914023, + "num_input_tokens_seen": 102309000, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.9140625, + "step": 4743, + "time_per_iteration": 2.511903762817383 + }, + { + "auxiliary_loss_clip": 0.01141389, + "auxiliary_loss_mlp": 0.01054233, + "balance_loss_clip": 1.03724515, + "balance_loss_mlp": 1.05195451, + "epoch": 0.2852247106568465, + "flos": 25299821687040.0, + "grad_norm": 2.12515026406258, + "language_loss": 0.74454999, + "learning_rate": 3.354713944700797e-06, + "loss": 0.7665062, + "num_input_tokens_seen": 102329240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.89453125, + "step": 4744, + "time_per_iteration": 2.48883318901062 + }, + { + "auxiliary_loss_clip": 0.01135189, + "auxiliary_loss_mlp": 0.01043767, + "balance_loss_clip": 1.02801967, + "balance_loss_mlp": 1.04948175, + "epoch": 0.2852848339095145, + "flos": 11655383541120.0, + "grad_norm": 2.362002737479584, + "language_loss": 0.77483714, + "learning_rate": 3.3544274097816185e-06, + "loss": 0.79662669, + "num_input_tokens_seen": 102344440, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 4745, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01130558, + "auxiliary_loss_mlp": 0.01039375, + "balance_loss_clip": 1.02363896, + "balance_loss_mlp": 1.04884791, + "epoch": 0.2853449571621825, + "flos": 12933228856320.0, + "grad_norm": 1.753549870597739, + "language_loss": 0.83101368, + "learning_rate": 3.3541408235024173e-06, + "loss": 0.85271305, + "num_input_tokens_seen": 102360985, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 4746, + "time_per_iteration": 2.4236245155334473 + }, + { + "auxiliary_loss_clip": 0.01138419, + "auxiliary_loss_mlp": 0.01039496, + "balance_loss_clip": 1.02243769, + "balance_loss_mlp": 1.04718721, + "epoch": 0.28540508041485046, + "flos": 20010575255040.0, + "grad_norm": 1.6977094615171933, + "language_loss": 0.79818654, + "learning_rate": 3.3538541858740604e-06, + "loss": 0.81996572, + "num_input_tokens_seen": 102380320, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.9140625, + "step": 4747, + "time_per_iteration": 2.47261118888855 + }, + { + "auxiliary_loss_clip": 0.01044617, + "auxiliary_loss_mlp": 0.01004042, + "balance_loss_clip": 1.00208712, + "balance_loss_mlp": 1.01364255, + "epoch": 0.28546520366751843, + "flos": 68139349966080.0, + "grad_norm": 0.7754058718106229, + "language_loss": 0.60505557, + "learning_rate": 3.3535674969074173e-06, + "loss": 0.62554216, + "num_input_tokens_seen": 102439140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.30859375, + "step": 4748, + "time_per_iteration": 3.087096691131592 + }, + { + "auxiliary_loss_clip": 0.0113463, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02596188, + "balance_loss_mlp": 1.04764485, + "epoch": 0.2855253269201864, + "flos": 13251540205440.0, + "grad_norm": 2.177788697298361, + "language_loss": 0.80300528, + "learning_rate": 3.3532807566133592e-06, + "loss": 0.82477033, + "num_input_tokens_seen": 102450990, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4749, + "time_per_iteration": 2.4132721424102783 + }, + { + "auxiliary_loss_clip": 0.01134988, + "auxiliary_loss_mlp": 0.01038736, + "balance_loss_clip": 1.022488, + "balance_loss_mlp": 1.04882109, + "epoch": 0.28558545017285436, + "flos": 28620876337920.0, + "grad_norm": 1.910787577049047, + "language_loss": 0.7067076, + "learning_rate": 3.3529939650027587e-06, + "loss": 0.72844481, + "num_input_tokens_seen": 102471820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.86328125, + "step": 4750, + "time_per_iteration": 2.5576114654541016 + }, + { + "auxiliary_loss_clip": 0.01132669, + "auxiliary_loss_mlp": 0.01037305, + "balance_loss_clip": 1.02121782, + "balance_loss_mlp": 1.04961181, + "epoch": 0.2856455734255223, + "flos": 34130470752000.0, + "grad_norm": 1.569446011166348, + "language_loss": 0.81798106, + "learning_rate": 3.3527071220864917e-06, + "loss": 0.83968079, + "num_input_tokens_seen": 102492625, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.83203125, + "step": 4751, + "time_per_iteration": 2.5805511474609375 + }, + { + "auxiliary_loss_clip": 0.01134337, + "auxiliary_loss_mlp": 0.01044364, + "balance_loss_clip": 1.02847314, + "balance_loss_mlp": 1.04876757, + "epoch": 0.2857056966781903, + "flos": 39786149779200.0, + "grad_norm": 1.8824724995030706, + "language_loss": 0.80753136, + "learning_rate": 3.3524202278754353e-06, + "loss": 0.82931828, + "num_input_tokens_seen": 102514145, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 4752, + "time_per_iteration": 2.6541080474853516 + }, + { + "auxiliary_loss_clip": 0.01134255, + "auxiliary_loss_mlp": 0.01039105, + "balance_loss_clip": 1.02258289, + "balance_loss_mlp": 1.04778147, + "epoch": 0.28576581993085826, + "flos": 21872292145920.0, + "grad_norm": 1.8943096426553439, + "language_loss": 0.78827929, + "learning_rate": 3.3521332823804676e-06, + "loss": 0.81001288, + "num_input_tokens_seen": 102532365, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4753, + "time_per_iteration": 2.4775567054748535 + }, + { + "auxiliary_loss_clip": 0.01140638, + "auxiliary_loss_mlp": 0.01043914, + "balance_loss_clip": 1.02559114, + "balance_loss_mlp": 1.05078959, + "epoch": 0.2858259431835262, + "flos": 19091656592640.0, + "grad_norm": 2.205371578508451, + "language_loss": 0.89809895, + "learning_rate": 3.3518462856124704e-06, + "loss": 0.91994447, + "num_input_tokens_seen": 102548425, + "router_z_loss_clip": 0.18359375, + "router_z_loss_mlp": 0.8984375, + "step": 4754, + "time_per_iteration": 2.486128091812134 + }, + { + "auxiliary_loss_clip": 0.01134093, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02616322, + "balance_loss_mlp": 1.04897058, + "epoch": 0.2858860664361942, + "flos": 20334309557760.0, + "grad_norm": 1.932227485650823, + "language_loss": 0.8234359, + "learning_rate": 3.3515592375823267e-06, + "loss": 0.84519303, + "num_input_tokens_seen": 102566370, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4755, + "time_per_iteration": 2.491184711456299 + }, + { + "auxiliary_loss_clip": 0.01133783, + "auxiliary_loss_mlp": 0.010447, + "balance_loss_clip": 1.02915466, + "balance_loss_mlp": 1.04667544, + "epoch": 0.28594618968886215, + "flos": 24461738582400.0, + "grad_norm": 1.4908389000148254, + "language_loss": 0.83846784, + "learning_rate": 3.351272138300922e-06, + "loss": 0.86025268, + "num_input_tokens_seen": 102588715, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 4756, + "time_per_iteration": 2.5934014320373535 + }, + { + "auxiliary_loss_clip": 0.01048134, + "auxiliary_loss_mlp": 0.01008558, + "balance_loss_clip": 1.0067457, + "balance_loss_mlp": 1.01677859, + "epoch": 0.2860063129415301, + "flos": 71652850709760.0, + "grad_norm": 0.8659269702666513, + "language_loss": 0.61012161, + "learning_rate": 3.350984987779142e-06, + "loss": 0.63068855, + "num_input_tokens_seen": 102656715, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.3125, + "step": 4757, + "time_per_iteration": 3.2122225761413574 + }, + { + "auxiliary_loss_clip": 0.01137202, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.0173862, + "balance_loss_mlp": 1.05204773, + "epoch": 0.2860664361941981, + "flos": 20558679863040.0, + "grad_norm": 1.9457322051707677, + "language_loss": 0.65794766, + "learning_rate": 3.3506977860278756e-06, + "loss": 0.67965055, + "num_input_tokens_seen": 102676545, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4758, + "time_per_iteration": 2.60023832321167 + }, + { + "auxiliary_loss_clip": 0.01134399, + "auxiliary_loss_mlp": 0.01036695, + "balance_loss_clip": 1.02027392, + "balance_loss_mlp": 1.04756904, + "epoch": 0.2861265594468661, + "flos": 35996389534080.0, + "grad_norm": 1.560843999265526, + "language_loss": 0.62950313, + "learning_rate": 3.3504105330580143e-06, + "loss": 0.65121412, + "num_input_tokens_seen": 102702875, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8671875, + "step": 4759, + "time_per_iteration": 2.6352102756500244 + }, + { + "auxiliary_loss_clip": 0.0113658, + "auxiliary_loss_mlp": 0.01042718, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.05098844, + "epoch": 0.28618668269953407, + "flos": 20047419630720.0, + "grad_norm": 1.76909488275169, + "language_loss": 0.7385608, + "learning_rate": 3.3501232288804496e-06, + "loss": 0.76035368, + "num_input_tokens_seen": 102723160, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4760, + "time_per_iteration": 2.5397889614105225 + }, + { + "auxiliary_loss_clip": 0.01132288, + "auxiliary_loss_mlp": 0.01038545, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.04949427, + "epoch": 0.28624680595220203, + "flos": 24971849579520.0, + "grad_norm": 1.9401243114633073, + "language_loss": 0.72422945, + "learning_rate": 3.3498358735060773e-06, + "loss": 0.74593776, + "num_input_tokens_seen": 102743855, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4761, + "time_per_iteration": 4.029369592666626 + }, + { + "auxiliary_loss_clip": 0.01135721, + "auxiliary_loss_mlp": 0.0104628, + "balance_loss_clip": 1.0303421, + "balance_loss_mlp": 1.04875946, + "epoch": 0.28630692920487, + "flos": 22492253911680.0, + "grad_norm": 2.026540334724573, + "language_loss": 0.74605787, + "learning_rate": 3.349548466945793e-06, + "loss": 0.76787788, + "num_input_tokens_seen": 102761370, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.87109375, + "step": 4762, + "time_per_iteration": 3.9056994915008545 + }, + { + "auxiliary_loss_clip": 0.01134836, + "auxiliary_loss_mlp": 0.01045458, + "balance_loss_clip": 1.02963901, + "balance_loss_mlp": 1.05027771, + "epoch": 0.28636705245753796, + "flos": 21249888255360.0, + "grad_norm": 1.79451974437327, + "language_loss": 0.76088154, + "learning_rate": 3.349261009210496e-06, + "loss": 0.78268445, + "num_input_tokens_seen": 102780885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 4763, + "time_per_iteration": 2.521223545074463 + }, + { + "auxiliary_loss_clip": 0.01133105, + "auxiliary_loss_mlp": 0.01035478, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.04712808, + "epoch": 0.28642717571020593, + "flos": 24095772864000.0, + "grad_norm": 1.9430054907967222, + "language_loss": 0.76937616, + "learning_rate": 3.348973500311086e-06, + "loss": 0.79106188, + "num_input_tokens_seen": 102801000, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4764, + "time_per_iteration": 2.4924814701080322 + }, + { + "auxiliary_loss_clip": 0.01137128, + "auxiliary_loss_mlp": 0.01041403, + "balance_loss_clip": 1.02354538, + "balance_loss_mlp": 1.04996395, + "epoch": 0.2864872989628739, + "flos": 22601386408320.0, + "grad_norm": 1.8973954036904035, + "language_loss": 0.71061826, + "learning_rate": 3.348685940258466e-06, + "loss": 0.73240352, + "num_input_tokens_seen": 102820230, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.87109375, + "step": 4765, + "time_per_iteration": 2.509204387664795 + }, + { + "auxiliary_loss_clip": 0.01131492, + "auxiliary_loss_mlp": 0.01037402, + "balance_loss_clip": 1.02149963, + "balance_loss_mlp": 1.04705501, + "epoch": 0.28654742221554186, + "flos": 32745073138560.0, + "grad_norm": 1.5129940587619137, + "language_loss": 0.75756145, + "learning_rate": 3.3483983290635395e-06, + "loss": 0.77925038, + "num_input_tokens_seen": 102842670, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4766, + "time_per_iteration": 2.562422513961792 + }, + { + "auxiliary_loss_clip": 0.01135318, + "auxiliary_loss_mlp": 0.0103558, + "balance_loss_clip": 1.01960635, + "balance_loss_mlp": 1.05073392, + "epoch": 0.2866075454682098, + "flos": 26981626331520.0, + "grad_norm": 1.5780141248071407, + "language_loss": 0.77556801, + "learning_rate": 3.348110666737214e-06, + "loss": 0.79727697, + "num_input_tokens_seen": 102864480, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 4767, + "time_per_iteration": 2.5476057529449463 + }, + { + "auxiliary_loss_clip": 0.01133832, + "auxiliary_loss_mlp": 0.01041655, + "balance_loss_clip": 1.02591908, + "balance_loss_mlp": 1.04878676, + "epoch": 0.2866676687208778, + "flos": 23253847004160.0, + "grad_norm": 2.169490874338027, + "language_loss": 0.6494413, + "learning_rate": 3.3478229532903956e-06, + "loss": 0.67119616, + "num_input_tokens_seen": 102883740, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4768, + "time_per_iteration": 2.4961044788360596 + }, + { + "auxiliary_loss_clip": 0.01137611, + "auxiliary_loss_mlp": 0.01044314, + "balance_loss_clip": 1.02807736, + "balance_loss_mlp": 1.04944301, + "epoch": 0.28672779197354575, + "flos": 21579727870080.0, + "grad_norm": 1.5253191671074575, + "language_loss": 0.70345664, + "learning_rate": 3.3475351887339967e-06, + "loss": 0.72527587, + "num_input_tokens_seen": 102902945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8828125, + "step": 4769, + "time_per_iteration": 2.5243568420410156 + }, + { + "auxiliary_loss_clip": 0.01136117, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.01992261, + "balance_loss_mlp": 1.04866219, + "epoch": 0.2867879152262137, + "flos": 19865568049920.0, + "grad_norm": 1.7483868508562144, + "language_loss": 0.75552189, + "learning_rate": 3.3472473730789288e-06, + "loss": 0.77723145, + "num_input_tokens_seen": 102922405, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.875, + "step": 4770, + "time_per_iteration": 2.468655586242676 + }, + { + "auxiliary_loss_clip": 0.01137893, + "auxiliary_loss_mlp": 0.01043906, + "balance_loss_clip": 1.02745509, + "balance_loss_mlp": 1.0500282, + "epoch": 0.2868480384788817, + "flos": 28213325648640.0, + "grad_norm": 3.1666126901900107, + "language_loss": 0.6730839, + "learning_rate": 3.3469595063361045e-06, + "loss": 0.69490194, + "num_input_tokens_seen": 102938980, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4771, + "time_per_iteration": 2.5334818363189697 + }, + { + "auxiliary_loss_clip": 0.01046415, + "auxiliary_loss_mlp": 0.01005401, + "balance_loss_clip": 1.00367248, + "balance_loss_mlp": 1.01655006, + "epoch": 0.2869081617315497, + "flos": 65424286690560.0, + "grad_norm": 0.7694277286160668, + "language_loss": 0.56883639, + "learning_rate": 3.3466715885164414e-06, + "loss": 0.58935452, + "num_input_tokens_seen": 103000405, + "router_z_loss_clip": 0.01733398, + "router_z_loss_mlp": 0.29882812, + "step": 4772, + "time_per_iteration": 3.0373501777648926 + }, + { + "auxiliary_loss_clip": 0.01136901, + "auxiliary_loss_mlp": 0.01041473, + "balance_loss_clip": 1.02567768, + "balance_loss_mlp": 1.05014777, + "epoch": 0.28696828498421767, + "flos": 18660729127680.0, + "grad_norm": 2.6517872983988844, + "language_loss": 0.83356023, + "learning_rate": 3.346383619630856e-06, + "loss": 0.85534406, + "num_input_tokens_seen": 103017970, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4773, + "time_per_iteration": 2.477537155151367 + }, + { + "auxiliary_loss_clip": 0.0113402, + "auxiliary_loss_mlp": 0.0103818, + "balance_loss_clip": 1.02159762, + "balance_loss_mlp": 1.04630029, + "epoch": 0.28702840823688563, + "flos": 23659745667840.0, + "grad_norm": 2.6367186533355356, + "language_loss": 0.77910906, + "learning_rate": 3.34609559969027e-06, + "loss": 0.80083102, + "num_input_tokens_seen": 103036385, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.875, + "step": 4774, + "time_per_iteration": 2.514545440673828 + }, + { + "auxiliary_loss_clip": 0.01136368, + "auxiliary_loss_mlp": 0.01037759, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.05010271, + "epoch": 0.2870885314895536, + "flos": 13804744544640.0, + "grad_norm": 1.7122435327393783, + "language_loss": 0.73488462, + "learning_rate": 3.3458075287056034e-06, + "loss": 0.75662589, + "num_input_tokens_seen": 103052170, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4775, + "time_per_iteration": 2.4526851177215576 + }, + { + "auxiliary_loss_clip": 0.0113744, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02267885, + "balance_loss_mlp": 1.05033445, + "epoch": 0.28714865474222157, + "flos": 17786771314560.0, + "grad_norm": 1.655187901014976, + "language_loss": 0.88345891, + "learning_rate": 3.34551940668778e-06, + "loss": 0.905213, + "num_input_tokens_seen": 103070510, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4776, + "time_per_iteration": 2.5487112998962402 + }, + { + "auxiliary_loss_clip": 0.01135791, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02170587, + "balance_loss_mlp": 1.05060029, + "epoch": 0.28720877799488953, + "flos": 15997486199040.0, + "grad_norm": 1.7920640817181568, + "language_loss": 0.74046421, + "learning_rate": 3.345231233647726e-06, + "loss": 0.76219237, + "num_input_tokens_seen": 103089590, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8515625, + "step": 4777, + "time_per_iteration": 2.4858744144439697 + }, + { + "auxiliary_loss_clip": 0.01143681, + "auxiliary_loss_mlp": 0.01044417, + "balance_loss_clip": 1.02763224, + "balance_loss_mlp": 1.05306673, + "epoch": 0.2872689012475575, + "flos": 20923137210240.0, + "grad_norm": 1.9679293284940167, + "language_loss": 0.80052459, + "learning_rate": 3.3449430095963696e-06, + "loss": 0.82240558, + "num_input_tokens_seen": 103109080, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.90625, + "step": 4778, + "time_per_iteration": 2.536553382873535 + }, + { + "auxiliary_loss_clip": 0.01135156, + "auxiliary_loss_mlp": 0.01046142, + "balance_loss_clip": 1.03032279, + "balance_loss_mlp": 1.05058503, + "epoch": 0.28732902450022546, + "flos": 21325121291520.0, + "grad_norm": 1.6265242751714746, + "language_loss": 0.73940611, + "learning_rate": 3.3446547345446386e-06, + "loss": 0.76121908, + "num_input_tokens_seen": 103127755, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4779, + "time_per_iteration": 2.5068604946136475 + }, + { + "auxiliary_loss_clip": 0.01139025, + "auxiliary_loss_mlp": 0.01044309, + "balance_loss_clip": 1.02791739, + "balance_loss_mlp": 1.05089593, + "epoch": 0.2873891477528934, + "flos": 20850382212480.0, + "grad_norm": 1.5791887497798731, + "language_loss": 0.76378506, + "learning_rate": 3.3443664085034656e-06, + "loss": 0.78561842, + "num_input_tokens_seen": 103147035, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8828125, + "step": 4780, + "time_per_iteration": 2.6357336044311523 + }, + { + "auxiliary_loss_clip": 0.01132548, + "auxiliary_loss_mlp": 0.01042513, + "balance_loss_clip": 1.02789187, + "balance_loss_mlp": 1.04874134, + "epoch": 0.2874492710055614, + "flos": 17420051410560.0, + "grad_norm": 1.8554557560955622, + "language_loss": 0.81367111, + "learning_rate": 3.344078031483784e-06, + "loss": 0.83542168, + "num_input_tokens_seen": 103165410, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 4781, + "time_per_iteration": 2.484217405319214 + }, + { + "auxiliary_loss_clip": 0.01138983, + "auxiliary_loss_mlp": 0.01044127, + "balance_loss_clip": 1.02688909, + "balance_loss_mlp": 1.0511862, + "epoch": 0.28750939425822936, + "flos": 13406818700160.0, + "grad_norm": 1.9124031057386872, + "language_loss": 0.86249948, + "learning_rate": 3.3437896034965283e-06, + "loss": 0.88433063, + "num_input_tokens_seen": 103183710, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.87890625, + "step": 4782, + "time_per_iteration": 2.4822945594787598 + }, + { + "auxiliary_loss_clip": 0.0113749, + "auxiliary_loss_mlp": 0.0104499, + "balance_loss_clip": 1.02842641, + "balance_loss_mlp": 1.05222881, + "epoch": 0.2875695175108973, + "flos": 21870029589120.0, + "grad_norm": 1.5584901619772236, + "language_loss": 0.71195668, + "learning_rate": 3.3435011245526357e-06, + "loss": 0.73378146, + "num_input_tokens_seen": 103203790, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 4783, + "time_per_iteration": 2.4959099292755127 + }, + { + "auxiliary_loss_clip": 0.01136896, + "auxiliary_loss_mlp": 0.0104062, + "balance_loss_clip": 1.02443171, + "balance_loss_mlp": 1.05179179, + "epoch": 0.2876296407635653, + "flos": 26245457089920.0, + "grad_norm": 3.6731562407195932, + "language_loss": 0.77011871, + "learning_rate": 3.343212594663047e-06, + "loss": 0.79189384, + "num_input_tokens_seen": 103223925, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4784, + "time_per_iteration": 2.55037784576416 + }, + { + "auxiliary_loss_clip": 0.0113214, + "auxiliary_loss_mlp": 0.01041887, + "balance_loss_clip": 1.02603197, + "balance_loss_mlp": 1.04896331, + "epoch": 0.28768976401623325, + "flos": 25373654092800.0, + "grad_norm": 1.5223386635016902, + "language_loss": 0.75859249, + "learning_rate": 3.3429240138387015e-06, + "loss": 0.7803328, + "num_input_tokens_seen": 103244760, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4785, + "time_per_iteration": 2.526587724685669 + }, + { + "auxiliary_loss_clip": 0.01135192, + "auxiliary_loss_mlp": 0.01042659, + "balance_loss_clip": 1.02724528, + "balance_loss_mlp": 1.04946601, + "epoch": 0.28774988726890127, + "flos": 30664372982400.0, + "grad_norm": 1.9982438427344784, + "language_loss": 0.83033895, + "learning_rate": 3.3426353820905425e-06, + "loss": 0.85211748, + "num_input_tokens_seen": 103261995, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 4786, + "time_per_iteration": 2.5786821842193604 + }, + { + "auxiliary_loss_clip": 0.01134245, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.01899481, + "balance_loss_mlp": 1.04868317, + "epoch": 0.28781001052156924, + "flos": 20595452411520.0, + "grad_norm": 1.95457297040312, + "language_loss": 0.80007184, + "learning_rate": 3.342346699429516e-06, + "loss": 0.82174993, + "num_input_tokens_seen": 103279780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 4787, + "time_per_iteration": 2.4734396934509277 + }, + { + "auxiliary_loss_clip": 0.01136278, + "auxiliary_loss_mlp": 0.01039735, + "balance_loss_clip": 1.02397585, + "balance_loss_mlp": 1.04906642, + "epoch": 0.2878701337742372, + "flos": 26542330997760.0, + "grad_norm": 2.6671828195015044, + "language_loss": 0.83666658, + "learning_rate": 3.3420579658665677e-06, + "loss": 0.85842675, + "num_input_tokens_seen": 103300580, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 4788, + "time_per_iteration": 2.5388548374176025 + }, + { + "auxiliary_loss_clip": 0.01137234, + "auxiliary_loss_mlp": 0.01046523, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.05051816, + "epoch": 0.28793025702690517, + "flos": 28146855530880.0, + "grad_norm": 1.8168797658695668, + "language_loss": 0.73769903, + "learning_rate": 3.3417691814126468e-06, + "loss": 0.75953662, + "num_input_tokens_seen": 103320430, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8671875, + "step": 4789, + "time_per_iteration": 2.5259692668914795 + }, + { + "auxiliary_loss_clip": 0.01129641, + "auxiliary_loss_mlp": 0.01043253, + "balance_loss_clip": 1.02819657, + "balance_loss_mlp": 1.0466274, + "epoch": 0.28799038027957313, + "flos": 23805471144960.0, + "grad_norm": 1.7572733449240283, + "language_loss": 0.83982229, + "learning_rate": 3.341480346078704e-06, + "loss": 0.86155128, + "num_input_tokens_seen": 103337695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4790, + "time_per_iteration": 2.5347094535827637 + }, + { + "auxiliary_loss_clip": 0.01136016, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02267063, + "balance_loss_mlp": 1.05011547, + "epoch": 0.2880505035322411, + "flos": 22344122223360.0, + "grad_norm": 1.8137236403798864, + "language_loss": 0.77924603, + "learning_rate": 3.3411914598756922e-06, + "loss": 0.80099815, + "num_input_tokens_seen": 103357010, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4791, + "time_per_iteration": 2.475328207015991 + }, + { + "auxiliary_loss_clip": 0.0113676, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01854002, + "balance_loss_mlp": 1.04824567, + "epoch": 0.28811062678490906, + "flos": 18004246208640.0, + "grad_norm": 1.933659829708973, + "language_loss": 0.70760292, + "learning_rate": 3.3409025228145654e-06, + "loss": 0.72931719, + "num_input_tokens_seen": 103375600, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.88671875, + "step": 4792, + "time_per_iteration": 2.4705538749694824 + }, + { + "auxiliary_loss_clip": 0.01135222, + "auxiliary_loss_mlp": 0.01035984, + "balance_loss_clip": 1.02065361, + "balance_loss_mlp": 1.04968917, + "epoch": 0.28817075003757703, + "flos": 22090880361600.0, + "grad_norm": 2.08648870526395, + "language_loss": 0.79392564, + "learning_rate": 3.3406135349062812e-06, + "loss": 0.81563771, + "num_input_tokens_seen": 103395225, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 4793, + "time_per_iteration": 2.509697914123535 + }, + { + "auxiliary_loss_clip": 0.01131221, + "auxiliary_loss_mlp": 0.01040002, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04920101, + "epoch": 0.288230873290245, + "flos": 41683130847360.0, + "grad_norm": 1.6269924793239006, + "language_loss": 0.77731872, + "learning_rate": 3.340324496161797e-06, + "loss": 0.7990309, + "num_input_tokens_seen": 103417245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 4794, + "time_per_iteration": 2.6943047046661377 + }, + { + "auxiliary_loss_clip": 0.01134923, + "auxiliary_loss_mlp": 0.0104449, + "balance_loss_clip": 1.02819395, + "balance_loss_mlp": 1.04913807, + "epoch": 0.28829099654291296, + "flos": 18624423456000.0, + "grad_norm": 2.663854929830155, + "language_loss": 0.8254813, + "learning_rate": 3.340035406592074e-06, + "loss": 0.84727538, + "num_input_tokens_seen": 103435500, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 4795, + "time_per_iteration": 2.4633255004882812 + }, + { + "auxiliary_loss_clip": 0.01129713, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02387166, + "balance_loss_mlp": 1.04899204, + "epoch": 0.2883511197955809, + "flos": 24674832017280.0, + "grad_norm": 2.661730786650402, + "language_loss": 0.74650323, + "learning_rate": 3.339746266208074e-06, + "loss": 0.76819038, + "num_input_tokens_seen": 103451040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80859375, + "step": 4796, + "time_per_iteration": 2.5179266929626465 + }, + { + "auxiliary_loss_clip": 0.01138692, + "auxiliary_loss_mlp": 0.01040905, + "balance_loss_clip": 1.02334583, + "balance_loss_mlp": 1.04789257, + "epoch": 0.2884112430482489, + "flos": 23112143850240.0, + "grad_norm": 1.8865626242662115, + "language_loss": 0.72797763, + "learning_rate": 3.3394570750207614e-06, + "loss": 0.74977362, + "num_input_tokens_seen": 103471330, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.90625, + "step": 4797, + "time_per_iteration": 2.4910430908203125 + }, + { + "auxiliary_loss_clip": 0.01135339, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02475667, + "balance_loss_mlp": 1.04989898, + "epoch": 0.28847136630091685, + "flos": 16873347432960.0, + "grad_norm": 2.109884297899412, + "language_loss": 0.74219149, + "learning_rate": 3.3391678330411017e-06, + "loss": 0.76395118, + "num_input_tokens_seen": 103488060, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4798, + "time_per_iteration": 2.472590923309326 + }, + { + "auxiliary_loss_clip": 0.01134882, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.02631509, + "balance_loss_mlp": 1.04689598, + "epoch": 0.2885314895535849, + "flos": 25657527277440.0, + "grad_norm": 2.7660889265500996, + "language_loss": 0.64920753, + "learning_rate": 3.3388785402800642e-06, + "loss": 0.67099464, + "num_input_tokens_seen": 103503600, + "router_z_loss_clip": 0.17480469, + "router_z_loss_mlp": 0.87890625, + "step": 4799, + "time_per_iteration": 2.4816339015960693 + }, + { + "auxiliary_loss_clip": 0.01136164, + "auxiliary_loss_mlp": 0.01043963, + "balance_loss_clip": 1.02784538, + "balance_loss_mlp": 1.04912758, + "epoch": 0.28859161280625284, + "flos": 21107251347840.0, + "grad_norm": 2.0794132014970272, + "language_loss": 0.82202137, + "learning_rate": 3.3385891967486178e-06, + "loss": 0.84382272, + "num_input_tokens_seen": 103524195, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.87109375, + "step": 4800, + "time_per_iteration": 2.5249674320220947 + }, + { + "auxiliary_loss_clip": 0.01128617, + "auxiliary_loss_mlp": 0.01038626, + "balance_loss_clip": 1.02312899, + "balance_loss_mlp": 1.04702258, + "epoch": 0.2886517360589208, + "flos": 26469540086400.0, + "grad_norm": 1.639042715490093, + "language_loss": 0.90946537, + "learning_rate": 3.3382998024577347e-06, + "loss": 0.93113768, + "num_input_tokens_seen": 103545235, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4801, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01133327, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.0221796, + "balance_loss_mlp": 1.04792547, + "epoch": 0.28871185931158877, + "flos": 25265275781760.0, + "grad_norm": 2.176318344562637, + "language_loss": 0.73644328, + "learning_rate": 3.33801035741839e-06, + "loss": 0.75816047, + "num_input_tokens_seen": 103563305, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4802, + "time_per_iteration": 4.080524444580078 + }, + { + "auxiliary_loss_clip": 0.01040178, + "auxiliary_loss_mlp": 0.01006047, + "balance_loss_clip": 1.00423479, + "balance_loss_mlp": 1.01114249, + "epoch": 0.28877198256425674, + "flos": 66665431284480.0, + "grad_norm": 0.7820100192493779, + "language_loss": 0.63009298, + "learning_rate": 3.337720861641558e-06, + "loss": 0.65055525, + "num_input_tokens_seen": 103625025, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.2890625, + "step": 4803, + "time_per_iteration": 4.464243412017822 + }, + { + "auxiliary_loss_clip": 0.0112919, + "auxiliary_loss_mlp": 0.01046023, + "balance_loss_clip": 1.03008461, + "balance_loss_mlp": 1.04523563, + "epoch": 0.2888321058169247, + "flos": 20303031790080.0, + "grad_norm": 1.7581002683255658, + "language_loss": 0.70800668, + "learning_rate": 3.3374313151382165e-06, + "loss": 0.72975886, + "num_input_tokens_seen": 103644235, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4804, + "time_per_iteration": 2.4655730724334717 + }, + { + "auxiliary_loss_clip": 0.01134858, + "auxiliary_loss_mlp": 0.0104232, + "balance_loss_clip": 1.02464128, + "balance_loss_mlp": 1.04650438, + "epoch": 0.28889222906959267, + "flos": 25516721963520.0, + "grad_norm": 1.8916446417141755, + "language_loss": 0.68253011, + "learning_rate": 3.337141717919346e-06, + "loss": 0.70430195, + "num_input_tokens_seen": 103664700, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 4805, + "time_per_iteration": 2.53932523727417 + }, + { + "auxiliary_loss_clip": 0.01133301, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.0262022, + "balance_loss_mlp": 1.04706144, + "epoch": 0.28895235232226063, + "flos": 32671312560000.0, + "grad_norm": 1.968490446816616, + "language_loss": 0.69469118, + "learning_rate": 3.3368520699959272e-06, + "loss": 0.71644211, + "num_input_tokens_seen": 103686595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 4806, + "time_per_iteration": 2.558811902999878 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01046922, + "balance_loss_clip": 1.031461, + "balance_loss_mlp": 1.04788303, + "epoch": 0.2890124755749286, + "flos": 29714679342720.0, + "grad_norm": 1.428284074184194, + "language_loss": 0.71372461, + "learning_rate": 3.3365623713789443e-06, + "loss": 0.73549926, + "num_input_tokens_seen": 103707525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4807, + "time_per_iteration": 2.5614373683929443 + }, + { + "auxiliary_loss_clip": 0.01132479, + "auxiliary_loss_mlp": 0.01043529, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04677331, + "epoch": 0.28907259882759656, + "flos": 22674464628480.0, + "grad_norm": 1.7487230864068215, + "language_loss": 0.81519878, + "learning_rate": 3.336272622079382e-06, + "loss": 0.83695877, + "num_input_tokens_seen": 103727905, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 4808, + "time_per_iteration": 2.4744319915771484 + }, + { + "auxiliary_loss_clip": 0.01128992, + "auxiliary_loss_mlp": 0.0105043, + "balance_loss_clip": 1.03418779, + "balance_loss_mlp": 1.04669142, + "epoch": 0.2891327220802645, + "flos": 22566050403840.0, + "grad_norm": 1.636259514454852, + "language_loss": 0.78387201, + "learning_rate": 3.3359828221082276e-06, + "loss": 0.80566621, + "num_input_tokens_seen": 103748335, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 4809, + "time_per_iteration": 2.4998364448547363 + }, + { + "auxiliary_loss_clip": 0.01134273, + "auxiliary_loss_mlp": 0.0104619, + "balance_loss_clip": 1.02908349, + "balance_loss_mlp": 1.04490733, + "epoch": 0.2891928453329325, + "flos": 21652806090240.0, + "grad_norm": 1.6563631129995537, + "language_loss": 0.78611737, + "learning_rate": 3.3356929714764714e-06, + "loss": 0.80792195, + "num_input_tokens_seen": 103767020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.890625, + "step": 4810, + "time_per_iteration": 2.4702351093292236 + }, + { + "auxiliary_loss_clip": 0.01129985, + "auxiliary_loss_mlp": 0.01045099, + "balance_loss_clip": 1.02966762, + "balance_loss_mlp": 1.04653728, + "epoch": 0.28925296858560046, + "flos": 23222102359680.0, + "grad_norm": 2.008599276638055, + "language_loss": 0.77134252, + "learning_rate": 3.3354030701951032e-06, + "loss": 0.79309338, + "num_input_tokens_seen": 103786355, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 4811, + "time_per_iteration": 2.502671718597412 + }, + { + "auxiliary_loss_clip": 0.01130702, + "auxiliary_loss_mlp": 0.01050867, + "balance_loss_clip": 1.03385544, + "balance_loss_mlp": 1.0460732, + "epoch": 0.2893130918382685, + "flos": 28621666437120.0, + "grad_norm": 1.3273574459957262, + "language_loss": 0.76748705, + "learning_rate": 3.335113118275117e-06, + "loss": 0.78930271, + "num_input_tokens_seen": 103809345, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84375, + "step": 4812, + "time_per_iteration": 2.5386435985565186 + }, + { + "auxiliary_loss_clip": 0.01038211, + "auxiliary_loss_mlp": 0.01023073, + "balance_loss_clip": 1.02121317, + "balance_loss_mlp": 1.00933552, + "epoch": 0.28937321509093644, + "flos": 72301288982400.0, + "grad_norm": 0.8452992206378583, + "language_loss": 0.60239071, + "learning_rate": 3.3348231157275085e-06, + "loss": 0.62300354, + "num_input_tokens_seen": 103871180, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2890625, + "step": 4813, + "time_per_iteration": 3.227616548538208 + }, + { + "auxiliary_loss_clip": 0.01130233, + "auxiliary_loss_mlp": 0.0104328, + "balance_loss_clip": 1.02727079, + "balance_loss_mlp": 1.04549837, + "epoch": 0.2894333383436044, + "flos": 16216397637120.0, + "grad_norm": 1.8826759768804342, + "language_loss": 0.81616402, + "learning_rate": 3.3345330625632725e-06, + "loss": 0.83789915, + "num_input_tokens_seen": 103889040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4814, + "time_per_iteration": 2.5389657020568848 + }, + { + "auxiliary_loss_clip": 0.01132807, + "auxiliary_loss_mlp": 0.01045738, + "balance_loss_clip": 1.0297873, + "balance_loss_mlp": 1.04464495, + "epoch": 0.2894934615962724, + "flos": 24828278918400.0, + "grad_norm": 1.6532361717230013, + "language_loss": 0.72615647, + "learning_rate": 3.3342429587934094e-06, + "loss": 0.74794197, + "num_input_tokens_seen": 103910380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8828125, + "step": 4815, + "time_per_iteration": 2.517458438873291 + }, + { + "auxiliary_loss_clip": 0.01129383, + "auxiliary_loss_mlp": 0.0104797, + "balance_loss_clip": 1.03274667, + "balance_loss_mlp": 1.04815507, + "epoch": 0.28955358484894034, + "flos": 20449978329600.0, + "grad_norm": 1.520143184033477, + "language_loss": 0.70801306, + "learning_rate": 3.3339528044289198e-06, + "loss": 0.72978652, + "num_input_tokens_seen": 103929955, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4816, + "time_per_iteration": 2.5287740230560303 + }, + { + "auxiliary_loss_clip": 0.01135738, + "auxiliary_loss_mlp": 0.0104281, + "balance_loss_clip": 1.02590585, + "balance_loss_mlp": 1.04615664, + "epoch": 0.2896137081016083, + "flos": 22565188477440.0, + "grad_norm": 3.3715101323822174, + "language_loss": 0.74736607, + "learning_rate": 3.3336625994808055e-06, + "loss": 0.76915157, + "num_input_tokens_seen": 103948020, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8984375, + "step": 4817, + "time_per_iteration": 2.4828009605407715 + }, + { + "auxiliary_loss_clip": 0.01134031, + "auxiliary_loss_mlp": 0.0105341, + "balance_loss_clip": 1.03637469, + "balance_loss_mlp": 1.0465169, + "epoch": 0.28967383135427627, + "flos": 26687948734080.0, + "grad_norm": 1.754631597755812, + "language_loss": 0.76169789, + "learning_rate": 3.3333723439600723e-06, + "loss": 0.78357232, + "num_input_tokens_seen": 103968740, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.875, + "step": 4818, + "time_per_iteration": 2.5453133583068848 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01035125, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.04606366, + "epoch": 0.28973395460694423, + "flos": 15558262692480.0, + "grad_norm": 1.8604375380991018, + "language_loss": 0.79827082, + "learning_rate": 3.3330820378777263e-06, + "loss": 0.81994408, + "num_input_tokens_seen": 103986005, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4819, + "time_per_iteration": 2.4516472816467285 + }, + { + "auxiliary_loss_clip": 0.01135104, + "auxiliary_loss_mlp": 0.01043377, + "balance_loss_clip": 1.02553141, + "balance_loss_mlp": 1.04452121, + "epoch": 0.2897940778596122, + "flos": 18697465762560.0, + "grad_norm": 1.6026789889191464, + "language_loss": 0.78726941, + "learning_rate": 3.332791681244776e-06, + "loss": 0.80905426, + "num_input_tokens_seen": 104005070, + "router_z_loss_clip": 0.17871094, + "router_z_loss_mlp": 0.90625, + "step": 4820, + "time_per_iteration": 2.512927770614624 + }, + { + "auxiliary_loss_clip": 0.0113352, + "auxiliary_loss_mlp": 0.01036477, + "balance_loss_clip": 1.0202527, + "balance_loss_mlp": 1.04560018, + "epoch": 0.28985420111228016, + "flos": 18770292587520.0, + "grad_norm": 2.352701358428358, + "language_loss": 0.73083222, + "learning_rate": 3.332501274072231e-06, + "loss": 0.75253224, + "num_input_tokens_seen": 104022945, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87890625, + "step": 4821, + "time_per_iteration": 2.4575939178466797 + }, + { + "auxiliary_loss_clip": 0.01130585, + "auxiliary_loss_mlp": 0.01036495, + "balance_loss_clip": 1.01979387, + "balance_loss_mlp": 1.04503322, + "epoch": 0.28991432436494813, + "flos": 23069840607360.0, + "grad_norm": 1.843174914976853, + "language_loss": 0.72629523, + "learning_rate": 3.332210816371104e-06, + "loss": 0.74796605, + "num_input_tokens_seen": 104042080, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.85546875, + "step": 4822, + "time_per_iteration": 2.4981486797332764 + }, + { + "auxiliary_loss_clip": 0.01133236, + "auxiliary_loss_mlp": 0.01047323, + "balance_loss_clip": 1.03044343, + "balance_loss_mlp": 1.04679179, + "epoch": 0.2899744476176161, + "flos": 17603195880960.0, + "grad_norm": 1.7581642571514904, + "language_loss": 0.66571164, + "learning_rate": 3.3319203081524102e-06, + "loss": 0.68751729, + "num_input_tokens_seen": 104060975, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.86328125, + "step": 4823, + "time_per_iteration": 2.4363584518432617 + }, + { + "auxiliary_loss_clip": 0.01128693, + "auxiliary_loss_mlp": 0.01036254, + "balance_loss_clip": 1.02018452, + "balance_loss_mlp": 1.04382014, + "epoch": 0.29003457087028406, + "flos": 22309360836480.0, + "grad_norm": 3.6840420234688684, + "language_loss": 0.80786806, + "learning_rate": 3.331629749427164e-06, + "loss": 0.82951754, + "num_input_tokens_seen": 104081395, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 4824, + "time_per_iteration": 2.4978654384613037 + }, + { + "auxiliary_loss_clip": 0.01132559, + "auxiliary_loss_mlp": 0.01043716, + "balance_loss_clip": 1.02547669, + "balance_loss_mlp": 1.04512334, + "epoch": 0.2900946941229521, + "flos": 21944975316480.0, + "grad_norm": 1.8817460080316075, + "language_loss": 0.72507697, + "learning_rate": 3.331339140206385e-06, + "loss": 0.74683976, + "num_input_tokens_seen": 104099995, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.875, + "step": 4825, + "time_per_iteration": 2.4740118980407715 + }, + { + "auxiliary_loss_clip": 0.01136872, + "auxiliary_loss_mlp": 0.01035046, + "balance_loss_clip": 1.01760566, + "balance_loss_mlp": 1.04886889, + "epoch": 0.29015481737562004, + "flos": 17932173569280.0, + "grad_norm": 2.3450778905142813, + "language_loss": 0.73504382, + "learning_rate": 3.331048480501092e-06, + "loss": 0.75676298, + "num_input_tokens_seen": 104118930, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.8828125, + "step": 4826, + "time_per_iteration": 2.4689221382141113 + }, + { + "auxiliary_loss_clip": 0.01131943, + "auxiliary_loss_mlp": 0.01036484, + "balance_loss_clip": 1.02041411, + "balance_loss_mlp": 1.04524112, + "epoch": 0.290214940628288, + "flos": 22783525297920.0, + "grad_norm": 3.139827505949132, + "language_loss": 0.68472409, + "learning_rate": 3.3307577703223073e-06, + "loss": 0.70640838, + "num_input_tokens_seen": 104136940, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 4827, + "time_per_iteration": 2.5236809253692627 + }, + { + "auxiliary_loss_clip": 0.01136266, + "auxiliary_loss_mlp": 0.01036355, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.04921937, + "epoch": 0.290275063880956, + "flos": 20006481104640.0, + "grad_norm": 1.8651963869616242, + "language_loss": 0.80072737, + "learning_rate": 3.3304670096810545e-06, + "loss": 0.82245356, + "num_input_tokens_seen": 104154280, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.87109375, + "step": 4828, + "time_per_iteration": 2.491584300994873 + }, + { + "auxiliary_loss_clip": 0.01133081, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.026335, + "balance_loss_mlp": 1.0482254, + "epoch": 0.29033518713362394, + "flos": 22053605022720.0, + "grad_norm": 2.2252387209358666, + "language_loss": 0.80475402, + "learning_rate": 3.33017619858836e-06, + "loss": 0.82650864, + "num_input_tokens_seen": 104172605, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84765625, + "step": 4829, + "time_per_iteration": 2.473210334777832 + }, + { + "auxiliary_loss_clip": 0.01131629, + "auxiliary_loss_mlp": 0.01041142, + "balance_loss_clip": 1.02482176, + "balance_loss_mlp": 1.04794419, + "epoch": 0.2903953103862919, + "flos": 25630056351360.0, + "grad_norm": 1.544892870636461, + "language_loss": 0.82288766, + "learning_rate": 3.329885337055249e-06, + "loss": 0.84461534, + "num_input_tokens_seen": 104194120, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 4830, + "time_per_iteration": 2.52874755859375 + }, + { + "auxiliary_loss_clip": 0.01136051, + "auxiliary_loss_mlp": 0.01046661, + "balance_loss_clip": 1.02992344, + "balance_loss_mlp": 1.04847991, + "epoch": 0.29045543363895987, + "flos": 16945851035520.0, + "grad_norm": 2.366175746199002, + "language_loss": 0.78858435, + "learning_rate": 3.3295944250927546e-06, + "loss": 0.81041145, + "num_input_tokens_seen": 104210875, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 4831, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.0112817, + "auxiliary_loss_mlp": 0.010386, + "balance_loss_clip": 1.02356744, + "balance_loss_mlp": 1.045138, + "epoch": 0.29051555689162784, + "flos": 26395492199040.0, + "grad_norm": 1.8105888440812088, + "language_loss": 0.74415791, + "learning_rate": 3.3293034627119055e-06, + "loss": 0.76582563, + "num_input_tokens_seen": 104229875, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4832, + "time_per_iteration": 2.6398987770080566 + }, + { + "auxiliary_loss_clip": 0.0113002, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01806784, + "balance_loss_mlp": 1.04516697, + "epoch": 0.2905756801442958, + "flos": 21103875469440.0, + "grad_norm": 1.6051950803449415, + "language_loss": 0.75986588, + "learning_rate": 3.329012449923736e-06, + "loss": 0.78149348, + "num_input_tokens_seen": 104250405, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 4833, + "time_per_iteration": 2.4772675037384033 + }, + { + "auxiliary_loss_clip": 0.01129626, + "auxiliary_loss_mlp": 0.01037033, + "balance_loss_clip": 1.02108264, + "balance_loss_mlp": 1.04542434, + "epoch": 0.29063580339696377, + "flos": 15706071158400.0, + "grad_norm": 1.807689816327527, + "language_loss": 0.64523911, + "learning_rate": 3.3287213867392813e-06, + "loss": 0.6669057, + "num_input_tokens_seen": 104269185, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 4834, + "time_per_iteration": 2.4944729804992676 + }, + { + "auxiliary_loss_clip": 0.0112967, + "auxiliary_loss_mlp": 0.01031422, + "balance_loss_clip": 1.01674771, + "balance_loss_mlp": 1.04650283, + "epoch": 0.29069592664963173, + "flos": 24644990793600.0, + "grad_norm": 1.5516449013863105, + "language_loss": 0.71436119, + "learning_rate": 3.3284302731695783e-06, + "loss": 0.73597211, + "num_input_tokens_seen": 104289400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4835, + "time_per_iteration": 2.5122785568237305 + }, + { + "auxiliary_loss_clip": 0.01129192, + "auxiliary_loss_mlp": 0.01038882, + "balance_loss_clip": 1.02430248, + "balance_loss_mlp": 1.04510283, + "epoch": 0.2907560499022997, + "flos": 24973753000320.0, + "grad_norm": 2.123413568873549, + "language_loss": 0.79669547, + "learning_rate": 3.3281391092256668e-06, + "loss": 0.81837618, + "num_input_tokens_seen": 104310485, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4836, + "time_per_iteration": 2.533221483230591 + }, + { + "auxiliary_loss_clip": 0.01129403, + "auxiliary_loss_mlp": 0.01039274, + "balance_loss_clip": 1.02338338, + "balance_loss_mlp": 1.04589558, + "epoch": 0.29081617315496766, + "flos": 18657496903680.0, + "grad_norm": 1.6671781935549963, + "language_loss": 0.80777872, + "learning_rate": 3.3278478949185865e-06, + "loss": 0.82946539, + "num_input_tokens_seen": 104327330, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4837, + "time_per_iteration": 2.4579083919525146 + }, + { + "auxiliary_loss_clip": 0.01131777, + "auxiliary_loss_mlp": 0.01037569, + "balance_loss_clip": 1.02170265, + "balance_loss_mlp": 1.04491532, + "epoch": 0.2908762964076356, + "flos": 35331035955840.0, + "grad_norm": 1.8624538054458508, + "language_loss": 0.67733121, + "learning_rate": 3.327556630259381e-06, + "loss": 0.69902468, + "num_input_tokens_seen": 104350350, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 4838, + "time_per_iteration": 2.613682270050049 + }, + { + "auxiliary_loss_clip": 0.01137044, + "auxiliary_loss_mlp": 0.010412, + "balance_loss_clip": 1.02485621, + "balance_loss_mlp": 1.04893696, + "epoch": 0.29093641966030365, + "flos": 23076305055360.0, + "grad_norm": 1.6135989987029238, + "language_loss": 0.71288264, + "learning_rate": 3.327265315259095e-06, + "loss": 0.73466504, + "num_input_tokens_seen": 104369995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.87890625, + "step": 4839, + "time_per_iteration": 2.506908416748047 + }, + { + "auxiliary_loss_clip": 0.0112979, + "auxiliary_loss_mlp": 0.01038337, + "balance_loss_clip": 1.02341795, + "balance_loss_mlp": 1.04433274, + "epoch": 0.2909965429129716, + "flos": 35955415094400.0, + "grad_norm": 1.876317037835641, + "language_loss": 0.75619674, + "learning_rate": 3.326973949928776e-06, + "loss": 0.77787805, + "num_input_tokens_seen": 104392285, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.85546875, + "step": 4840, + "time_per_iteration": 2.6259472370147705 + }, + { + "auxiliary_loss_clip": 0.011316, + "auxiliary_loss_mlp": 0.01041868, + "balance_loss_clip": 1.02688372, + "balance_loss_mlp": 1.0469749, + "epoch": 0.2910566661656396, + "flos": 30880231764480.0, + "grad_norm": 1.9955793585576265, + "language_loss": 0.60459495, + "learning_rate": 3.326682534279471e-06, + "loss": 0.62632966, + "num_input_tokens_seen": 104412640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 4841, + "time_per_iteration": 2.5497686862945557 + }, + { + "auxiliary_loss_clip": 0.01133928, + "auxiliary_loss_mlp": 0.01038335, + "balance_loss_clip": 1.0215385, + "balance_loss_mlp": 1.0483892, + "epoch": 0.29111678941830754, + "flos": 30010188533760.0, + "grad_norm": 1.7266193979009703, + "language_loss": 0.71366, + "learning_rate": 3.326391068322232e-06, + "loss": 0.73538262, + "num_input_tokens_seen": 104435245, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 4842, + "time_per_iteration": 2.5817017555236816 + }, + { + "auxiliary_loss_clip": 0.01131749, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02188897, + "balance_loss_mlp": 1.04632473, + "epoch": 0.2911769126709755, + "flos": 22857393617280.0, + "grad_norm": 1.5806493177236067, + "language_loss": 0.72846174, + "learning_rate": 3.3260995520681098e-06, + "loss": 0.7501446, + "num_input_tokens_seen": 104455395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.85546875, + "step": 4843, + "time_per_iteration": 2.4728853702545166 + }, + { + "auxiliary_loss_clip": 0.01132332, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.0223223, + "balance_loss_mlp": 1.04598284, + "epoch": 0.2912370359236435, + "flos": 21650507619840.0, + "grad_norm": 2.0237546438656393, + "language_loss": 0.5840022, + "learning_rate": 3.3258079855281602e-06, + "loss": 0.60570586, + "num_input_tokens_seen": 104473350, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4844, + "time_per_iteration": 3.9377825260162354 + }, + { + "auxiliary_loss_clip": 0.01136792, + "auxiliary_loss_mlp": 0.01042261, + "balance_loss_clip": 1.02518439, + "balance_loss_mlp": 1.04942751, + "epoch": 0.29129715917631144, + "flos": 22893340152960.0, + "grad_norm": 2.1502970284536493, + "language_loss": 0.86360186, + "learning_rate": 3.3255163687134396e-06, + "loss": 0.88539243, + "num_input_tokens_seen": 104492265, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.875, + "step": 4845, + "time_per_iteration": 5.415091276168823 + }, + { + "auxiliary_loss_clip": 0.01134782, + "auxiliary_loss_mlp": 0.01051996, + "balance_loss_clip": 1.03494883, + "balance_loss_mlp": 1.04779911, + "epoch": 0.2913572824289794, + "flos": 22674464628480.0, + "grad_norm": 1.7275133095664568, + "language_loss": 0.66684157, + "learning_rate": 3.3252247016350046e-06, + "loss": 0.68870938, + "num_input_tokens_seen": 104510755, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8671875, + "step": 4846, + "time_per_iteration": 2.495901584625244 + }, + { + "auxiliary_loss_clip": 0.01131044, + "auxiliary_loss_mlp": 0.01042533, + "balance_loss_clip": 1.02700055, + "balance_loss_mlp": 1.04691291, + "epoch": 0.29141740568164737, + "flos": 23107403255040.0, + "grad_norm": 1.7117272730106567, + "language_loss": 0.70501876, + "learning_rate": 3.3249329843039166e-06, + "loss": 0.72675455, + "num_input_tokens_seen": 104530830, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 4847, + "time_per_iteration": 2.50537109375 + }, + { + "auxiliary_loss_clip": 0.01131589, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.01918232, + "balance_loss_mlp": 1.04682243, + "epoch": 0.29147752893431533, + "flos": 23587026583680.0, + "grad_norm": 2.14972579950547, + "language_loss": 0.73494464, + "learning_rate": 3.324641216731237e-06, + "loss": 0.75661629, + "num_input_tokens_seen": 104550115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 4848, + "time_per_iteration": 2.506683111190796 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01042107, + "balance_loss_clip": 1.02569222, + "balance_loss_mlp": 1.04670119, + "epoch": 0.2915376521869833, + "flos": 20591968792320.0, + "grad_norm": 2.106691725132959, + "language_loss": 0.76689458, + "learning_rate": 3.3243493989280295e-06, + "loss": 0.78864431, + "num_input_tokens_seen": 104566255, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.86328125, + "step": 4849, + "time_per_iteration": 2.475512742996216 + }, + { + "auxiliary_loss_clip": 0.01134647, + "auxiliary_loss_mlp": 0.01043325, + "balance_loss_clip": 1.02732718, + "balance_loss_mlp": 1.04683709, + "epoch": 0.29159777543965126, + "flos": 20811490761600.0, + "grad_norm": 1.7698868684834754, + "language_loss": 0.78437513, + "learning_rate": 3.3240575309053596e-06, + "loss": 0.80615485, + "num_input_tokens_seen": 104585235, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4850, + "time_per_iteration": 2.4774062633514404 + }, + { + "auxiliary_loss_clip": 0.01130071, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02231026, + "balance_loss_mlp": 1.04620552, + "epoch": 0.29165789869231923, + "flos": 24244155947520.0, + "grad_norm": 1.7416717517415665, + "language_loss": 0.75775445, + "learning_rate": 3.323765612674296e-06, + "loss": 0.77944064, + "num_input_tokens_seen": 104605315, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4851, + "time_per_iteration": 2.4973719120025635 + }, + { + "auxiliary_loss_clip": 0.01130818, + "auxiliary_loss_mlp": 0.01045515, + "balance_loss_clip": 1.03071558, + "balance_loss_mlp": 1.04819655, + "epoch": 0.29171802194498725, + "flos": 28949925853440.0, + "grad_norm": 1.378687766604426, + "language_loss": 0.77111661, + "learning_rate": 3.3234736442459078e-06, + "loss": 0.79287988, + "num_input_tokens_seen": 104626055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 4852, + "time_per_iteration": 2.5339767932891846 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045328, + "balance_loss_clip": 1.0296402, + "balance_loss_mlp": 1.04735672, + "epoch": 0.2917781451976552, + "flos": 22598226011520.0, + "grad_norm": 1.5345579183576068, + "language_loss": 0.78385615, + "learning_rate": 3.3231816256312665e-06, + "loss": 0.80563664, + "num_input_tokens_seen": 104646005, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4853, + "time_per_iteration": 2.511125087738037 + }, + { + "auxiliary_loss_clip": 0.0113401, + "auxiliary_loss_mlp": 0.01038526, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.04668474, + "epoch": 0.2918382684503232, + "flos": 21574448570880.0, + "grad_norm": 2.984154109703724, + "language_loss": 0.87946999, + "learning_rate": 3.322889556841445e-06, + "loss": 0.90119541, + "num_input_tokens_seen": 104661620, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 4854, + "time_per_iteration": 2.4654700756073 + }, + { + "auxiliary_loss_clip": 0.0113237, + "auxiliary_loss_mlp": 0.01052716, + "balance_loss_clip": 1.03352284, + "balance_loss_mlp": 1.04678071, + "epoch": 0.29189839170299114, + "flos": 24353503925760.0, + "grad_norm": 1.8357290509449282, + "language_loss": 0.86585724, + "learning_rate": 3.322597437887519e-06, + "loss": 0.88770819, + "num_input_tokens_seen": 104681445, + "router_z_loss_clip": 0.19238281, + "router_z_loss_mlp": 0.85546875, + "step": 4855, + "time_per_iteration": 2.519432783126831 + }, + { + "auxiliary_loss_clip": 0.01043355, + "auxiliary_loss_mlp": 0.01004722, + "balance_loss_clip": 1.00283837, + "balance_loss_mlp": 1.01374364, + "epoch": 0.2919585149556591, + "flos": 71316726215040.0, + "grad_norm": 0.8090362112321295, + "language_loss": 0.60199535, + "learning_rate": 3.322305268780566e-06, + "loss": 0.6224761, + "num_input_tokens_seen": 104747945, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.296875, + "step": 4856, + "time_per_iteration": 3.164905309677124 + }, + { + "auxiliary_loss_clip": 0.01130578, + "auxiliary_loss_mlp": 0.01039982, + "balance_loss_clip": 1.02499735, + "balance_loss_mlp": 1.04626632, + "epoch": 0.2920186382083271, + "flos": 15633208419840.0, + "grad_norm": 2.394144218040463, + "language_loss": 0.67995465, + "learning_rate": 3.322013049531664e-06, + "loss": 0.70166028, + "num_input_tokens_seen": 104766225, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 4857, + "time_per_iteration": 2.4615678787231445 + }, + { + "auxiliary_loss_clip": 0.01129998, + "auxiliary_loss_mlp": 0.01035751, + "balance_loss_clip": 1.0210768, + "balance_loss_mlp": 1.04613733, + "epoch": 0.29207876146099504, + "flos": 28366018364160.0, + "grad_norm": 2.1807634638236566, + "language_loss": 0.83958411, + "learning_rate": 3.321720780151895e-06, + "loss": 0.86124158, + "num_input_tokens_seen": 104785345, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 4858, + "time_per_iteration": 2.561347723007202 + }, + { + "auxiliary_loss_clip": 0.01131346, + "auxiliary_loss_mlp": 0.0103964, + "balance_loss_clip": 1.02478647, + "balance_loss_mlp": 1.04746854, + "epoch": 0.292138884713663, + "flos": 21870963342720.0, + "grad_norm": 2.0714117361066298, + "language_loss": 0.77547097, + "learning_rate": 3.321428460652342e-06, + "loss": 0.79718083, + "num_input_tokens_seen": 104804560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4859, + "time_per_iteration": 2.4801361560821533 + }, + { + "auxiliary_loss_clip": 0.01132855, + "auxiliary_loss_mlp": 0.01043796, + "balance_loss_clip": 1.02764332, + "balance_loss_mlp": 1.04424477, + "epoch": 0.29219900796633097, + "flos": 20992552243200.0, + "grad_norm": 2.0548529873010564, + "language_loss": 0.68948561, + "learning_rate": 3.3211360910440885e-06, + "loss": 0.71125209, + "num_input_tokens_seen": 104821105, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 4860, + "time_per_iteration": 2.531022071838379 + }, + { + "auxiliary_loss_clip": 0.01129954, + "auxiliary_loss_mlp": 0.01040561, + "balance_loss_clip": 1.0267868, + "balance_loss_mlp": 1.04821134, + "epoch": 0.29225913121899894, + "flos": 35004608133120.0, + "grad_norm": 2.771004145303475, + "language_loss": 0.75952631, + "learning_rate": 3.320843671338222e-06, + "loss": 0.78123146, + "num_input_tokens_seen": 104841440, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.81640625, + "step": 4861, + "time_per_iteration": 2.619257926940918 + }, + { + "auxiliary_loss_clip": 0.01129568, + "auxiliary_loss_mlp": 0.01048123, + "balance_loss_clip": 1.03350759, + "balance_loss_mlp": 1.04631817, + "epoch": 0.2923192544716669, + "flos": 13515663888000.0, + "grad_norm": 1.7230129115334698, + "language_loss": 0.91648388, + "learning_rate": 3.320551201545832e-06, + "loss": 0.93826073, + "num_input_tokens_seen": 104858210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4862, + "time_per_iteration": 2.4596564769744873 + }, + { + "auxiliary_loss_clip": 0.01129785, + "auxiliary_loss_mlp": 0.01038215, + "balance_loss_clip": 1.02336144, + "balance_loss_mlp": 1.04544663, + "epoch": 0.29237937772433487, + "flos": 19463512141440.0, + "grad_norm": 2.061794510539927, + "language_loss": 0.73736131, + "learning_rate": 3.320258681678008e-06, + "loss": 0.75904131, + "num_input_tokens_seen": 104875620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 4863, + "time_per_iteration": 2.4478728771209717 + }, + { + "auxiliary_loss_clip": 0.01125934, + "auxiliary_loss_mlp": 0.01038806, + "balance_loss_clip": 1.02474487, + "balance_loss_mlp": 1.04584527, + "epoch": 0.29243950097700283, + "flos": 20850597694080.0, + "grad_norm": 1.6779515608592832, + "language_loss": 0.78057373, + "learning_rate": 3.319966111745842e-06, + "loss": 0.80222106, + "num_input_tokens_seen": 104894600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 4864, + "time_per_iteration": 2.487544059753418 + }, + { + "auxiliary_loss_clip": 0.0113348, + "auxiliary_loss_mlp": 0.01045654, + "balance_loss_clip": 1.02927482, + "balance_loss_mlp": 1.04763806, + "epoch": 0.29249962422967085, + "flos": 23584225322880.0, + "grad_norm": 2.699456605470703, + "language_loss": 0.81919956, + "learning_rate": 3.319673491760429e-06, + "loss": 0.8409909, + "num_input_tokens_seen": 104914530, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4865, + "time_per_iteration": 2.486553192138672 + }, + { + "auxiliary_loss_clip": 0.01130825, + "auxiliary_loss_mlp": 0.01040981, + "balance_loss_clip": 1.02523327, + "balance_loss_mlp": 1.04592669, + "epoch": 0.2925597474823388, + "flos": 22273342473600.0, + "grad_norm": 1.8393536761495908, + "language_loss": 0.85281575, + "learning_rate": 3.3193808217328645e-06, + "loss": 0.87453377, + "num_input_tokens_seen": 104933460, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4866, + "time_per_iteration": 2.4981276988983154 + }, + { + "auxiliary_loss_clip": 0.01124877, + "auxiliary_loss_mlp": 0.01037248, + "balance_loss_clip": 1.02263868, + "balance_loss_mlp": 1.04323506, + "epoch": 0.2926198707350068, + "flos": 34456108475520.0, + "grad_norm": 1.627734535935432, + "language_loss": 0.755858, + "learning_rate": 3.3190881016742476e-06, + "loss": 0.77747923, + "num_input_tokens_seen": 104954495, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 4867, + "time_per_iteration": 2.5813703536987305 + }, + { + "auxiliary_loss_clip": 0.01129928, + "auxiliary_loss_mlp": 0.01049325, + "balance_loss_clip": 1.03337526, + "balance_loss_mlp": 1.04375887, + "epoch": 0.29267999398767475, + "flos": 20704153944960.0, + "grad_norm": 4.179606236398783, + "language_loss": 0.73403615, + "learning_rate": 3.3187953315956776e-06, + "loss": 0.75582874, + "num_input_tokens_seen": 104971915, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.859375, + "step": 4868, + "time_per_iteration": 2.48374342918396 + }, + { + "auxiliary_loss_clip": 0.01128319, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.01857829, + "balance_loss_mlp": 1.04520726, + "epoch": 0.2927401172403427, + "flos": 18368667642240.0, + "grad_norm": 1.3015957921166281, + "language_loss": 0.74555755, + "learning_rate": 3.3185025115082566e-06, + "loss": 0.76717293, + "num_input_tokens_seen": 104991335, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 4869, + "time_per_iteration": 2.458434820175171 + }, + { + "auxiliary_loss_clip": 0.01131121, + "auxiliary_loss_mlp": 0.01038828, + "balance_loss_clip": 1.02390289, + "balance_loss_mlp": 1.04639244, + "epoch": 0.2928002404930107, + "flos": 26104041244800.0, + "grad_norm": 1.465584897312906, + "language_loss": 0.76539874, + "learning_rate": 3.318209641423088e-06, + "loss": 0.78709823, + "num_input_tokens_seen": 105012015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 4870, + "time_per_iteration": 2.5194873809814453 + }, + { + "auxiliary_loss_clip": 0.01133405, + "auxiliary_loss_mlp": 0.01046415, + "balance_loss_clip": 1.03040564, + "balance_loss_mlp": 1.04584765, + "epoch": 0.29286036374567864, + "flos": 21324726241920.0, + "grad_norm": 2.259080578005736, + "language_loss": 0.67315602, + "learning_rate": 3.3179167213512777e-06, + "loss": 0.69495422, + "num_input_tokens_seen": 105031460, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.875, + "step": 4871, + "time_per_iteration": 2.4556169509887695 + }, + { + "auxiliary_loss_clip": 0.01125512, + "auxiliary_loss_mlp": 0.01039548, + "balance_loss_clip": 1.02509975, + "balance_loss_mlp": 1.04283524, + "epoch": 0.2929204869983466, + "flos": 29569492569600.0, + "grad_norm": 1.8081222369362746, + "language_loss": 0.76924586, + "learning_rate": 3.317623751303933e-06, + "loss": 0.79089642, + "num_input_tokens_seen": 105052965, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 4872, + "time_per_iteration": 2.575421094894409 + }, + { + "auxiliary_loss_clip": 0.01131072, + "auxiliary_loss_mlp": 0.01043663, + "balance_loss_clip": 1.0271883, + "balance_loss_mlp": 1.04527128, + "epoch": 0.2929806102510146, + "flos": 19058259922560.0, + "grad_norm": 2.2968152323379347, + "language_loss": 0.72835052, + "learning_rate": 3.317330731292164e-06, + "loss": 0.75009787, + "num_input_tokens_seen": 105071840, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.859375, + "step": 4873, + "time_per_iteration": 2.4370815753936768 + }, + { + "auxiliary_loss_clip": 0.01133087, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02518392, + "balance_loss_mlp": 1.04519463, + "epoch": 0.29304073350368254, + "flos": 21944221130880.0, + "grad_norm": 1.8384173868300016, + "language_loss": 0.77871835, + "learning_rate": 3.3170376613270812e-06, + "loss": 0.80046785, + "num_input_tokens_seen": 105089445, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87890625, + "step": 4874, + "time_per_iteration": 2.512613534927368 + }, + { + "auxiliary_loss_clip": 0.01135856, + "auxiliary_loss_mlp": 0.01045857, + "balance_loss_clip": 1.02962041, + "balance_loss_mlp": 1.04670048, + "epoch": 0.2931008567563505, + "flos": 15450818135040.0, + "grad_norm": 2.084283832751276, + "language_loss": 0.77047002, + "learning_rate": 3.3167445414197985e-06, + "loss": 0.79228717, + "num_input_tokens_seen": 105106210, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.890625, + "step": 4875, + "time_per_iteration": 2.487417221069336 + }, + { + "auxiliary_loss_clip": 0.01136141, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.04909277, + "epoch": 0.29316098000901847, + "flos": 16983162288000.0, + "grad_norm": 1.6806867883636405, + "language_loss": 0.69183826, + "learning_rate": 3.316451371581431e-06, + "loss": 0.71354383, + "num_input_tokens_seen": 105124200, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.87109375, + "step": 4876, + "time_per_iteration": 2.4764888286590576 + }, + { + "auxiliary_loss_clip": 0.01128897, + "auxiliary_loss_mlp": 0.01045114, + "balance_loss_clip": 1.03027201, + "balance_loss_mlp": 1.04482532, + "epoch": 0.29322110326168643, + "flos": 16357705741440.0, + "grad_norm": 2.3621737524413913, + "language_loss": 0.8195532, + "learning_rate": 3.316158151823096e-06, + "loss": 0.84129333, + "num_input_tokens_seen": 105140400, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4877, + "time_per_iteration": 2.4738340377807617 + }, + { + "auxiliary_loss_clip": 0.01134087, + "auxiliary_loss_mlp": 0.01042806, + "balance_loss_clip": 1.02765405, + "balance_loss_mlp": 1.04704273, + "epoch": 0.29328122651435445, + "flos": 13990869843840.0, + "grad_norm": 1.8654341954981455, + "language_loss": 0.67843962, + "learning_rate": 3.315864882155911e-06, + "loss": 0.70020854, + "num_input_tokens_seen": 105157535, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 4878, + "time_per_iteration": 2.4606332778930664 + }, + { + "auxiliary_loss_clip": 0.01130502, + "auxiliary_loss_mlp": 0.01041377, + "balance_loss_clip": 1.02624929, + "balance_loss_mlp": 1.04562759, + "epoch": 0.2933413497670224, + "flos": 25264593423360.0, + "grad_norm": 1.8286598598322423, + "language_loss": 0.7351383, + "learning_rate": 3.3155715625909982e-06, + "loss": 0.7568571, + "num_input_tokens_seen": 105175185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 4879, + "time_per_iteration": 2.502901315689087 + }, + { + "auxiliary_loss_clip": 0.01137009, + "auxiliary_loss_mlp": 0.01046436, + "balance_loss_clip": 1.02881706, + "balance_loss_mlp": 1.0484302, + "epoch": 0.2934014730196904, + "flos": 32123746656000.0, + "grad_norm": 2.0641755158914634, + "language_loss": 0.65864384, + "learning_rate": 3.3152781931394803e-06, + "loss": 0.68047822, + "num_input_tokens_seen": 105194540, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.88671875, + "step": 4880, + "time_per_iteration": 2.5785939693450928 + }, + { + "auxiliary_loss_clip": 0.01130839, + "auxiliary_loss_mlp": 0.01045571, + "balance_loss_clip": 1.02962136, + "balance_loss_mlp": 1.04453218, + "epoch": 0.29346159627235835, + "flos": 24352498344960.0, + "grad_norm": 2.157512175932489, + "language_loss": 0.70518327, + "learning_rate": 3.314984773812481e-06, + "loss": 0.72694737, + "num_input_tokens_seen": 105213215, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 4881, + "time_per_iteration": 2.4913742542266846 + }, + { + "auxiliary_loss_clip": 0.01133082, + "auxiliary_loss_mlp": 0.01039157, + "balance_loss_clip": 1.02336192, + "balance_loss_mlp": 1.0471015, + "epoch": 0.2935217195250263, + "flos": 22746752749440.0, + "grad_norm": 2.112776228996839, + "language_loss": 0.83907056, + "learning_rate": 3.314691304621127e-06, + "loss": 0.86079299, + "num_input_tokens_seen": 105231585, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 4882, + "time_per_iteration": 2.4955010414123535 + }, + { + "auxiliary_loss_clip": 0.01135526, + "auxiliary_loss_mlp": 0.01041581, + "balance_loss_clip": 1.02495086, + "balance_loss_mlp": 1.0470233, + "epoch": 0.2935818427776943, + "flos": 21725561088000.0, + "grad_norm": 2.198383771985309, + "language_loss": 0.71811014, + "learning_rate": 3.314397785576548e-06, + "loss": 0.73988116, + "num_input_tokens_seen": 105250120, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4883, + "time_per_iteration": 2.474574089050293 + }, + { + "auxiliary_loss_clip": 0.01132572, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02225327, + "balance_loss_mlp": 1.04580843, + "epoch": 0.29364196603036224, + "flos": 23804968354560.0, + "grad_norm": 3.497082861184858, + "language_loss": 0.92629534, + "learning_rate": 3.3141042166898726e-06, + "loss": 0.94800568, + "num_input_tokens_seen": 105266065, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4884, + "time_per_iteration": 2.4947426319122314 + }, + { + "auxiliary_loss_clip": 0.01138135, + "auxiliary_loss_mlp": 0.01045606, + "balance_loss_clip": 1.03032374, + "balance_loss_mlp": 1.05094171, + "epoch": 0.2937020892830302, + "flos": 23470064922240.0, + "grad_norm": 2.2315982417854876, + "language_loss": 0.73729408, + "learning_rate": 3.313810597972234e-06, + "loss": 0.75913155, + "num_input_tokens_seen": 105282155, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4885, + "time_per_iteration": 2.5076494216918945 + }, + { + "auxiliary_loss_clip": 0.01132864, + "auxiliary_loss_mlp": 0.0104824, + "balance_loss_clip": 1.03185511, + "balance_loss_mlp": 1.0468272, + "epoch": 0.2937622125356982, + "flos": 24272740195200.0, + "grad_norm": 2.1964333946604135, + "language_loss": 0.85011208, + "learning_rate": 3.3135169294347655e-06, + "loss": 0.87192315, + "num_input_tokens_seen": 105299225, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 4886, + "time_per_iteration": 3.911407232284546 + }, + { + "auxiliary_loss_clip": 0.01135202, + "auxiliary_loss_mlp": 0.01041375, + "balance_loss_clip": 1.02624702, + "balance_loss_mlp": 1.04678059, + "epoch": 0.29382233578836614, + "flos": 20662461233280.0, + "grad_norm": 2.1393217933297657, + "language_loss": 0.77027792, + "learning_rate": 3.313223211088603e-06, + "loss": 0.79204369, + "num_input_tokens_seen": 105315710, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.88671875, + "step": 4887, + "time_per_iteration": 3.906132936477661 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01046614, + "balance_loss_clip": 1.03127122, + "balance_loss_mlp": 1.04697633, + "epoch": 0.2938824590410341, + "flos": 16545052103040.0, + "grad_norm": 2.1952396364021536, + "language_loss": 0.79558414, + "learning_rate": 3.3129294429448855e-06, + "loss": 0.8174094, + "num_input_tokens_seen": 105333505, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.890625, + "step": 4888, + "time_per_iteration": 2.4338221549987793 + }, + { + "auxiliary_loss_clip": 0.01130748, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.0173831, + "balance_loss_mlp": 1.04529762, + "epoch": 0.29394258229370207, + "flos": 37925474382720.0, + "grad_norm": 1.4299668586503376, + "language_loss": 0.55301261, + "learning_rate": 3.3126356250147517e-06, + "loss": 0.57464457, + "num_input_tokens_seen": 105355605, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 4889, + "time_per_iteration": 2.637645959854126 + }, + { + "auxiliary_loss_clip": 0.01134449, + "auxiliary_loss_mlp": 0.01039798, + "balance_loss_clip": 1.02314413, + "balance_loss_mlp": 1.0465076, + "epoch": 0.29400270554637004, + "flos": 20044690197120.0, + "grad_norm": 1.9477461279926194, + "language_loss": 0.84309214, + "learning_rate": 3.3123417573093434e-06, + "loss": 0.86483455, + "num_input_tokens_seen": 105374225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 4890, + "time_per_iteration": 2.445218801498413 + }, + { + "auxiliary_loss_clip": 0.01135501, + "auxiliary_loss_mlp": 0.01039459, + "balance_loss_clip": 1.02402174, + "balance_loss_mlp": 1.04780436, + "epoch": 0.294062828799038, + "flos": 15266380775040.0, + "grad_norm": 1.9951401673219091, + "language_loss": 0.72357798, + "learning_rate": 3.3120478398398046e-06, + "loss": 0.74532759, + "num_input_tokens_seen": 105391565, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.875, + "step": 4891, + "time_per_iteration": 2.434298515319824 + }, + { + "auxiliary_loss_clip": 0.01134115, + "auxiliary_loss_mlp": 0.01045308, + "balance_loss_clip": 1.02910721, + "balance_loss_mlp": 1.04683042, + "epoch": 0.294122952051706, + "flos": 22747147799040.0, + "grad_norm": 1.9834299238301316, + "language_loss": 0.77230573, + "learning_rate": 3.3117538726172797e-06, + "loss": 0.79410005, + "num_input_tokens_seen": 105409840, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4892, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01130172, + "auxiliary_loss_mlp": 0.01035757, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.04514182, + "epoch": 0.294183075304374, + "flos": 24972891073920.0, + "grad_norm": 1.7053650125053033, + "language_loss": 0.7846024, + "learning_rate": 3.3114598556529164e-06, + "loss": 0.80626166, + "num_input_tokens_seen": 105428645, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 4893, + "time_per_iteration": 2.505946159362793 + }, + { + "auxiliary_loss_clip": 0.01132333, + "auxiliary_loss_mlp": 0.01048117, + "balance_loss_clip": 1.03252435, + "balance_loss_mlp": 1.04651928, + "epoch": 0.29424319855704195, + "flos": 30952986762240.0, + "grad_norm": 1.8389301673785101, + "language_loss": 0.85052156, + "learning_rate": 3.311165788957864e-06, + "loss": 0.87232608, + "num_input_tokens_seen": 105447480, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.85546875, + "step": 4894, + "time_per_iteration": 2.5221872329711914 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01036474, + "balance_loss_clip": 1.02120304, + "balance_loss_mlp": 1.04568195, + "epoch": 0.2943033218097099, + "flos": 15231583474560.0, + "grad_norm": 2.595597690193387, + "language_loss": 0.9027828, + "learning_rate": 3.310871672543274e-06, + "loss": 0.92447418, + "num_input_tokens_seen": 105464600, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.87109375, + "step": 4895, + "time_per_iteration": 2.4466798305511475 + }, + { + "auxiliary_loss_clip": 0.01135692, + "auxiliary_loss_mlp": 0.01040955, + "balance_loss_clip": 1.02434874, + "balance_loss_mlp": 1.04720199, + "epoch": 0.2943634450623779, + "flos": 21725884310400.0, + "grad_norm": 3.001231056574592, + "language_loss": 0.86597103, + "learning_rate": 3.3105775064202982e-06, + "loss": 0.88773751, + "num_input_tokens_seen": 105481510, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8828125, + "step": 4896, + "time_per_iteration": 2.459611654281616 + }, + { + "auxiliary_loss_clip": 0.01134294, + "auxiliary_loss_mlp": 0.01050105, + "balance_loss_clip": 1.03402412, + "balance_loss_mlp": 1.04802299, + "epoch": 0.29442356831504585, + "flos": 22602104680320.0, + "grad_norm": 2.652800133974417, + "language_loss": 0.73196733, + "learning_rate": 3.3102832906000924e-06, + "loss": 0.75381136, + "num_input_tokens_seen": 105501390, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.86328125, + "step": 4897, + "time_per_iteration": 2.4981348514556885 + }, + { + "auxiliary_loss_clip": 0.01136241, + "auxiliary_loss_mlp": 0.01042547, + "balance_loss_clip": 1.02546394, + "balance_loss_mlp": 1.0458895, + "epoch": 0.2944836915677138, + "flos": 20011401267840.0, + "grad_norm": 1.867954953207583, + "language_loss": 0.73798919, + "learning_rate": 3.309989025093813e-06, + "loss": 0.75977707, + "num_input_tokens_seen": 105519600, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.90234375, + "step": 4898, + "time_per_iteration": 2.439952850341797 + }, + { + "auxiliary_loss_clip": 0.01142949, + "auxiliary_loss_mlp": 0.01042887, + "balance_loss_clip": 1.02471972, + "balance_loss_mlp": 1.05136585, + "epoch": 0.2945438148203818, + "flos": 20045875345920.0, + "grad_norm": 2.6754375338801477, + "language_loss": 0.70309317, + "learning_rate": 3.309694709912618e-06, + "loss": 0.72495157, + "num_input_tokens_seen": 105535970, + "router_z_loss_clip": 0.18164062, + "router_z_loss_mlp": 0.9140625, + "step": 4899, + "time_per_iteration": 2.4757347106933594 + }, + { + "auxiliary_loss_clip": 0.01135914, + "auxiliary_loss_mlp": 0.01041458, + "balance_loss_clip": 1.02458405, + "balance_loss_mlp": 1.0484879, + "epoch": 0.29460393807304974, + "flos": 23733542160000.0, + "grad_norm": 1.9063479453414416, + "language_loss": 0.79007781, + "learning_rate": 3.3094003450676685e-06, + "loss": 0.8118515, + "num_input_tokens_seen": 105556735, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 4900, + "time_per_iteration": 2.50555419921875 + }, + { + "auxiliary_loss_clip": 0.01131673, + "auxiliary_loss_mlp": 0.01042831, + "balance_loss_clip": 1.02720261, + "balance_loss_mlp": 1.04425764, + "epoch": 0.2946640613257177, + "flos": 14976079056000.0, + "grad_norm": 1.709443882500664, + "language_loss": 0.80718857, + "learning_rate": 3.3091059305701268e-06, + "loss": 0.8289336, + "num_input_tokens_seen": 105574875, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 4901, + "time_per_iteration": 2.481768846511841 + }, + { + "auxiliary_loss_clip": 0.01127885, + "auxiliary_loss_mlp": 0.01035027, + "balance_loss_clip": 1.02062666, + "balance_loss_mlp": 1.04583955, + "epoch": 0.2947241845783857, + "flos": 24243904552320.0, + "grad_norm": 1.9567596526300628, + "language_loss": 0.57923675, + "learning_rate": 3.308811466431157e-06, + "loss": 0.60086584, + "num_input_tokens_seen": 105594225, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 4902, + "time_per_iteration": 2.491337299346924 + }, + { + "auxiliary_loss_clip": 0.01131951, + "auxiliary_loss_mlp": 0.01038919, + "balance_loss_clip": 1.02416682, + "balance_loss_mlp": 1.045946, + "epoch": 0.29478430783105364, + "flos": 19938394874880.0, + "grad_norm": 1.6713771638909152, + "language_loss": 0.75298065, + "learning_rate": 3.308516952661925e-06, + "loss": 0.77468932, + "num_input_tokens_seen": 105614000, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.859375, + "step": 4903, + "time_per_iteration": 2.4884400367736816 + }, + { + "auxiliary_loss_clip": 0.01132991, + "auxiliary_loss_mlp": 0.01042452, + "balance_loss_clip": 1.02560806, + "balance_loss_mlp": 1.04630995, + "epoch": 0.2948444310837216, + "flos": 27381347856000.0, + "grad_norm": 1.8012466742437707, + "language_loss": 0.6254617, + "learning_rate": 3.3082223892736e-06, + "loss": 0.64721614, + "num_input_tokens_seen": 105634575, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4904, + "time_per_iteration": 2.5288941860198975 + }, + { + "auxiliary_loss_clip": 0.01134735, + "auxiliary_loss_mlp": 0.01038462, + "balance_loss_clip": 1.02252424, + "balance_loss_mlp": 1.04603219, + "epoch": 0.2949045543363896, + "flos": 23405462311680.0, + "grad_norm": 1.5173763027357385, + "language_loss": 0.7301079, + "learning_rate": 3.3079277762773496e-06, + "loss": 0.75183994, + "num_input_tokens_seen": 105654385, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.88671875, + "step": 4905, + "time_per_iteration": 2.5069708824157715 + }, + { + "auxiliary_loss_clip": 0.01131529, + "auxiliary_loss_mlp": 0.01041997, + "balance_loss_clip": 1.02577305, + "balance_loss_mlp": 1.0456897, + "epoch": 0.2949646775890576, + "flos": 23951483930880.0, + "grad_norm": 1.6701950888056076, + "language_loss": 0.81584871, + "learning_rate": 3.3076331136843476e-06, + "loss": 0.8375839, + "num_input_tokens_seen": 105673570, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4906, + "time_per_iteration": 2.473604202270508 + }, + { + "auxiliary_loss_clip": 0.01128251, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.04443395, + "epoch": 0.29502480084172555, + "flos": 22784315397120.0, + "grad_norm": 1.9494272179492087, + "language_loss": 0.87158448, + "learning_rate": 3.3073384015057667e-06, + "loss": 0.89320892, + "num_input_tokens_seen": 105691940, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4907, + "time_per_iteration": 2.490842819213867 + }, + { + "auxiliary_loss_clip": 0.01135464, + "auxiliary_loss_mlp": 0.01042187, + "balance_loss_clip": 1.02623653, + "balance_loss_mlp": 1.04758191, + "epoch": 0.2950849240943935, + "flos": 19646656611840.0, + "grad_norm": 2.3387997458884833, + "language_loss": 0.81563503, + "learning_rate": 3.307043639752782e-06, + "loss": 0.83741152, + "num_input_tokens_seen": 105709825, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87890625, + "step": 4908, + "time_per_iteration": 2.4586410522460938 + }, + { + "auxiliary_loss_clip": 0.01054339, + "auxiliary_loss_mlp": 0.01042068, + "balance_loss_clip": 1.03970814, + "balance_loss_mlp": 1.0157342, + "epoch": 0.2951450473470615, + "flos": 71002829260800.0, + "grad_norm": 0.7811313355607663, + "language_loss": 0.57214808, + "learning_rate": 3.3067488284365728e-06, + "loss": 0.59311211, + "num_input_tokens_seen": 105766880, + "router_z_loss_clip": 0.02355957, + "router_z_loss_mlp": 0.38671875, + "step": 4909, + "time_per_iteration": 2.9739394187927246 + }, + { + "auxiliary_loss_clip": 0.01136234, + "auxiliary_loss_mlp": 0.01038405, + "balance_loss_clip": 1.02340245, + "balance_loss_mlp": 1.05156505, + "epoch": 0.29520517059972945, + "flos": 22966310632320.0, + "grad_norm": 1.44395719574742, + "language_loss": 0.86585498, + "learning_rate": 3.3064539675683163e-06, + "loss": 0.88760138, + "num_input_tokens_seen": 105786875, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84765625, + "step": 4910, + "time_per_iteration": 2.4779117107391357 + }, + { + "auxiliary_loss_clip": 0.01126914, + "auxiliary_loss_mlp": 0.01040378, + "balance_loss_clip": 1.02551222, + "balance_loss_mlp": 1.04549575, + "epoch": 0.2952652938523974, + "flos": 20485673470080.0, + "grad_norm": 1.8630755123750513, + "language_loss": 0.72632295, + "learning_rate": 3.3061590571591946e-06, + "loss": 0.74799585, + "num_input_tokens_seen": 105805315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 4911, + "time_per_iteration": 2.4959700107574463 + }, + { + "auxiliary_loss_clip": 0.01131053, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02239108, + "balance_loss_mlp": 1.04823601, + "epoch": 0.2953254171050654, + "flos": 19646584784640.0, + "grad_norm": 1.774615067737937, + "language_loss": 0.8988539, + "learning_rate": 3.3058640972203904e-06, + "loss": 0.92053854, + "num_input_tokens_seen": 105825125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 4912, + "time_per_iteration": 2.4532997608184814 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01046481, + "balance_loss_clip": 1.03022075, + "balance_loss_mlp": 1.04712319, + "epoch": 0.29538554035773334, + "flos": 22747973811840.0, + "grad_norm": 1.458226475428025, + "language_loss": 0.83448595, + "learning_rate": 3.3055690877630894e-06, + "loss": 0.85626793, + "num_input_tokens_seen": 105846085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 4913, + "time_per_iteration": 2.515580654144287 + }, + { + "auxiliary_loss_clip": 0.01129704, + "auxiliary_loss_mlp": 0.01039162, + "balance_loss_clip": 1.02385521, + "balance_loss_mlp": 1.0438993, + "epoch": 0.2954456636104013, + "flos": 21871861182720.0, + "grad_norm": 1.6602062940724112, + "language_loss": 0.77029538, + "learning_rate": 3.3052740287984765e-06, + "loss": 0.79198408, + "num_input_tokens_seen": 105865400, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4914, + "time_per_iteration": 2.457158088684082 + }, + { + "auxiliary_loss_clip": 0.01128554, + "auxiliary_loss_mlp": 0.01039033, + "balance_loss_clip": 1.02302349, + "balance_loss_mlp": 1.04553497, + "epoch": 0.2955057868630693, + "flos": 40442560871040.0, + "grad_norm": 1.9027466376674422, + "language_loss": 0.81550008, + "learning_rate": 3.3049789203377424e-06, + "loss": 0.83717597, + "num_input_tokens_seen": 105887920, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 4915, + "time_per_iteration": 2.6669511795043945 + }, + { + "auxiliary_loss_clip": 0.01134085, + "auxiliary_loss_mlp": 0.01037926, + "balance_loss_clip": 1.02215445, + "balance_loss_mlp": 1.0477066, + "epoch": 0.29556591011573724, + "flos": 22564506119040.0, + "grad_norm": 1.9544787473030132, + "language_loss": 0.84415555, + "learning_rate": 3.3046837623920772e-06, + "loss": 0.8658756, + "num_input_tokens_seen": 105904035, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.86328125, + "step": 4916, + "time_per_iteration": 2.473867416381836 + }, + { + "auxiliary_loss_clip": 0.01127987, + "auxiliary_loss_mlp": 0.01033684, + "balance_loss_clip": 1.01874673, + "balance_loss_mlp": 1.04477537, + "epoch": 0.2956260333684052, + "flos": 22089300163200.0, + "grad_norm": 3.5737730841451225, + "language_loss": 0.69611692, + "learning_rate": 3.3043885549726723e-06, + "loss": 0.71773368, + "num_input_tokens_seen": 105922685, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 4917, + "time_per_iteration": 2.5078670978546143 + }, + { + "auxiliary_loss_clip": 0.01134116, + "auxiliary_loss_mlp": 0.01041431, + "balance_loss_clip": 1.02550471, + "balance_loss_mlp": 1.04932523, + "epoch": 0.2956861566210732, + "flos": 16435488643200.0, + "grad_norm": 2.1750223310256507, + "language_loss": 0.90840054, + "learning_rate": 3.3040932980907226e-06, + "loss": 0.93015605, + "num_input_tokens_seen": 105940425, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 4918, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01134586, + "auxiliary_loss_mlp": 0.01040808, + "balance_loss_clip": 1.02504885, + "balance_loss_mlp": 1.04929781, + "epoch": 0.2957462798737412, + "flos": 25812087500160.0, + "grad_norm": 1.9164121886210477, + "language_loss": 0.72399461, + "learning_rate": 3.303797991757425e-06, + "loss": 0.74574864, + "num_input_tokens_seen": 105960550, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 4919, + "time_per_iteration": 2.5533134937286377 + }, + { + "auxiliary_loss_clip": 0.01130751, + "auxiliary_loss_mlp": 0.01042531, + "balance_loss_clip": 1.02661633, + "balance_loss_mlp": 1.04704165, + "epoch": 0.29580640312640916, + "flos": 16690849407360.0, + "grad_norm": 1.7148380002351797, + "language_loss": 0.75758076, + "learning_rate": 3.3035026359839763e-06, + "loss": 0.77931356, + "num_input_tokens_seen": 105978820, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 4920, + "time_per_iteration": 2.4288933277130127 + }, + { + "auxiliary_loss_clip": 0.01138995, + "auxiliary_loss_mlp": 0.01045405, + "balance_loss_clip": 1.02953875, + "balance_loss_mlp": 1.05214858, + "epoch": 0.2958665263790771, + "flos": 23945594100480.0, + "grad_norm": 2.2591712667141075, + "language_loss": 0.68327153, + "learning_rate": 3.3032072307815774e-06, + "loss": 0.7051155, + "num_input_tokens_seen": 105997545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4921, + "time_per_iteration": 2.5978074073791504 + }, + { + "auxiliary_loss_clip": 0.01136262, + "auxiliary_loss_mlp": 0.01042633, + "balance_loss_clip": 1.02580023, + "balance_loss_mlp": 1.04953861, + "epoch": 0.2959266496317451, + "flos": 18478410670080.0, + "grad_norm": 1.8781945072150448, + "language_loss": 0.74265885, + "learning_rate": 3.3029117761614298e-06, + "loss": 0.76444781, + "num_input_tokens_seen": 106015320, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8671875, + "step": 4922, + "time_per_iteration": 2.4518954753875732 + }, + { + "auxiliary_loss_clip": 0.0113841, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.01932716, + "balance_loss_mlp": 1.04900336, + "epoch": 0.29598677288441305, + "flos": 25957489754880.0, + "grad_norm": 2.178664992776949, + "language_loss": 0.76679426, + "learning_rate": 3.302616272134737e-06, + "loss": 0.78853875, + "num_input_tokens_seen": 106034555, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.89453125, + "step": 4923, + "time_per_iteration": 2.5565848350524902 + }, + { + "auxiliary_loss_clip": 0.0113218, + "auxiliary_loss_mlp": 0.01039495, + "balance_loss_clip": 1.02359807, + "balance_loss_mlp": 1.04730439, + "epoch": 0.296046896137081, + "flos": 25155999630720.0, + "grad_norm": 1.616043641477794, + "language_loss": 0.86307567, + "learning_rate": 3.3023207187127042e-06, + "loss": 0.88479245, + "num_input_tokens_seen": 106054200, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4924, + "time_per_iteration": 2.5081374645233154 + }, + { + "auxiliary_loss_clip": 0.01132422, + "auxiliary_loss_mlp": 0.01034495, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.04767513, + "epoch": 0.296107019389749, + "flos": 21761148487680.0, + "grad_norm": 1.3983202546472309, + "language_loss": 0.8180936, + "learning_rate": 3.3020251159065396e-06, + "loss": 0.83976275, + "num_input_tokens_seen": 106074700, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 4925, + "time_per_iteration": 2.5473146438598633 + }, + { + "auxiliary_loss_clip": 0.01132696, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02175128, + "balance_loss_mlp": 1.04893184, + "epoch": 0.29616714264241695, + "flos": 17960039544960.0, + "grad_norm": 2.5479827750219735, + "language_loss": 0.85168374, + "learning_rate": 3.301729463727452e-06, + "loss": 0.87337816, + "num_input_tokens_seen": 106091415, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 4926, + "time_per_iteration": 2.4603803157806396 + }, + { + "auxiliary_loss_clip": 0.0113285, + "auxiliary_loss_mlp": 0.01039481, + "balance_loss_clip": 1.02391791, + "balance_loss_mlp": 1.04658842, + "epoch": 0.2962272658950849, + "flos": 15012779777280.0, + "grad_norm": 2.1014080951069913, + "language_loss": 0.85908806, + "learning_rate": 3.3014337621866527e-06, + "loss": 0.88081133, + "num_input_tokens_seen": 106109135, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4927, + "time_per_iteration": 2.4724504947662354 + }, + { + "auxiliary_loss_clip": 0.01129564, + "auxiliary_loss_mlp": 0.01039461, + "balance_loss_clip": 1.02434492, + "balance_loss_mlp": 1.04636681, + "epoch": 0.2962873891477529, + "flos": 14720861946240.0, + "grad_norm": 1.8730507383843338, + "language_loss": 0.80967462, + "learning_rate": 3.3011380112953553e-06, + "loss": 0.83136487, + "num_input_tokens_seen": 106125750, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83203125, + "step": 4928, + "time_per_iteration": 5.46146297454834 + }, + { + "auxiliary_loss_clip": 0.01138553, + "auxiliary_loss_mlp": 0.01041915, + "balance_loss_clip": 1.023211, + "balance_loss_mlp": 1.04749835, + "epoch": 0.29634751240042084, + "flos": 26723787528960.0, + "grad_norm": 3.002605920988437, + "language_loss": 0.72472513, + "learning_rate": 3.300842211064773e-06, + "loss": 0.7465297, + "num_input_tokens_seen": 106142835, + "router_z_loss_clip": 0.18652344, + "router_z_loss_mlp": 0.91015625, + "step": 4929, + "time_per_iteration": 2.4938502311706543 + }, + { + "auxiliary_loss_clip": 0.01136289, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02631676, + "balance_loss_mlp": 1.04823208, + "epoch": 0.2964076356530888, + "flos": 14571293713920.0, + "grad_norm": 2.429634231323073, + "language_loss": 0.72424346, + "learning_rate": 3.3005463615061246e-06, + "loss": 0.74603939, + "num_input_tokens_seen": 106160680, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.8828125, + "step": 4930, + "time_per_iteration": 2.486492156982422 + }, + { + "auxiliary_loss_clip": 0.01059615, + "auxiliary_loss_mlp": 0.01002568, + "balance_loss_clip": 1.00047004, + "balance_loss_mlp": 1.0186131, + "epoch": 0.29646775890575683, + "flos": 63104315063040.0, + "grad_norm": 0.8134562784526058, + "language_loss": 0.60710716, + "learning_rate": 3.3002504626306275e-06, + "loss": 0.627729, + "num_input_tokens_seen": 106224415, + "router_z_loss_clip": 0.02099609, + "router_z_loss_mlp": 0.41015625, + "step": 4931, + "time_per_iteration": 3.002444267272949 + }, + { + "auxiliary_loss_clip": 0.01058931, + "auxiliary_loss_mlp": 0.01001224, + "balance_loss_clip": 0.99926931, + "balance_loss_mlp": 1.01823413, + "epoch": 0.2965278821584248, + "flos": 63067686168960.0, + "grad_norm": 0.7413672345708404, + "language_loss": 0.52383232, + "learning_rate": 3.2999545144495023e-06, + "loss": 0.54443383, + "num_input_tokens_seen": 106279140, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.40625, + "step": 4932, + "time_per_iteration": 2.974777936935425 + }, + { + "auxiliary_loss_clip": 0.01127694, + "auxiliary_loss_mlp": 0.01039106, + "balance_loss_clip": 1.02322757, + "balance_loss_mlp": 1.04449248, + "epoch": 0.29658800541109276, + "flos": 23768734510080.0, + "grad_norm": 3.155895790893495, + "language_loss": 0.81622797, + "learning_rate": 3.299658516973972e-06, + "loss": 0.83789599, + "num_input_tokens_seen": 106298190, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83203125, + "step": 4933, + "time_per_iteration": 2.518906593322754 + }, + { + "auxiliary_loss_clip": 0.0112788, + "auxiliary_loss_mlp": 0.0103376, + "balance_loss_clip": 1.01854897, + "balance_loss_mlp": 1.04651821, + "epoch": 0.2966481286637607, + "flos": 23988543788160.0, + "grad_norm": 1.671865304120784, + "language_loss": 0.75257647, + "learning_rate": 3.299362470215261e-06, + "loss": 0.77419287, + "num_input_tokens_seen": 106319065, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 4934, + "time_per_iteration": 2.5015640258789062 + }, + { + "auxiliary_loss_clip": 0.01134944, + "auxiliary_loss_mlp": 0.01045163, + "balance_loss_clip": 1.02837849, + "balance_loss_mlp": 1.04699588, + "epoch": 0.2967082519164287, + "flos": 17165157523200.0, + "grad_norm": 1.752558919138232, + "language_loss": 0.62510157, + "learning_rate": 3.299066374184594e-06, + "loss": 0.64690268, + "num_input_tokens_seen": 106338040, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87890625, + "step": 4935, + "time_per_iteration": 2.462982654571533 + }, + { + "auxiliary_loss_clip": 0.01129673, + "auxiliary_loss_mlp": 0.01041832, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04613912, + "epoch": 0.29676837516909665, + "flos": 29387712816000.0, + "grad_norm": 1.4993711353436514, + "language_loss": 0.79789758, + "learning_rate": 3.2987702288932e-06, + "loss": 0.81961262, + "num_input_tokens_seen": 106358900, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 4936, + "time_per_iteration": 2.5267326831817627 + }, + { + "auxiliary_loss_clip": 0.01132719, + "auxiliary_loss_mlp": 0.01045272, + "balance_loss_clip": 1.02854681, + "balance_loss_mlp": 1.04649782, + "epoch": 0.2968284984217646, + "flos": 34751222616960.0, + "grad_norm": 1.8807271027259396, + "language_loss": 0.74074632, + "learning_rate": 3.298474034352309e-06, + "loss": 0.76252627, + "num_input_tokens_seen": 106381805, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.86328125, + "step": 4937, + "time_per_iteration": 2.607790946960449 + }, + { + "auxiliary_loss_clip": 0.01132772, + "auxiliary_loss_mlp": 0.01038823, + "balance_loss_clip": 1.0224793, + "balance_loss_mlp": 1.04839468, + "epoch": 0.2968886216744326, + "flos": 21544104556800.0, + "grad_norm": 1.629632810423829, + "language_loss": 0.7804476, + "learning_rate": 3.2981777905731526e-06, + "loss": 0.80216354, + "num_input_tokens_seen": 106402365, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 4938, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01134705, + "auxiliary_loss_mlp": 0.01041256, + "balance_loss_clip": 1.02543736, + "balance_loss_mlp": 1.04814208, + "epoch": 0.29694874492710055, + "flos": 12787323811200.0, + "grad_norm": 2.041677851061636, + "language_loss": 0.77017808, + "learning_rate": 3.297881497566964e-06, + "loss": 0.79193771, + "num_input_tokens_seen": 106419800, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8671875, + "step": 4939, + "time_per_iteration": 2.453615427017212 + }, + { + "auxiliary_loss_clip": 0.01136816, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.04958081, + "epoch": 0.2970088681797685, + "flos": 24569973239040.0, + "grad_norm": 1.5588161926919628, + "language_loss": 0.78206903, + "learning_rate": 3.297585155344979e-06, + "loss": 0.80380619, + "num_input_tokens_seen": 106440300, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 4940, + "time_per_iteration": 2.5125393867492676 + }, + { + "auxiliary_loss_clip": 0.01133351, + "auxiliary_loss_mlp": 0.01040737, + "balance_loss_clip": 1.0233798, + "balance_loss_mlp": 1.04633832, + "epoch": 0.2970689914324365, + "flos": 23659171050240.0, + "grad_norm": 3.9307439231373884, + "language_loss": 0.75487554, + "learning_rate": 3.297288763918435e-06, + "loss": 0.77661633, + "num_input_tokens_seen": 106460035, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.87109375, + "step": 4941, + "time_per_iteration": 2.5308516025543213 + }, + { + "auxiliary_loss_clip": 0.0113684, + "auxiliary_loss_mlp": 0.01050296, + "balance_loss_clip": 1.03295147, + "balance_loss_mlp": 1.04803753, + "epoch": 0.29712911468510445, + "flos": 39670301439360.0, + "grad_norm": 2.557458362521145, + "language_loss": 0.73998737, + "learning_rate": 3.2969923232985712e-06, + "loss": 0.7618587, + "num_input_tokens_seen": 106481095, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.890625, + "step": 4942, + "time_per_iteration": 2.6214303970336914 + }, + { + "auxiliary_loss_clip": 0.0113696, + "auxiliary_loss_mlp": 0.01047243, + "balance_loss_clip": 1.03017855, + "balance_loss_mlp": 1.04778039, + "epoch": 0.2971892379377724, + "flos": 26395312631040.0, + "grad_norm": 1.997792424787015, + "language_loss": 0.70484138, + "learning_rate": 3.2966958334966287e-06, + "loss": 0.72668344, + "num_input_tokens_seen": 106501590, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.890625, + "step": 4943, + "time_per_iteration": 2.533313751220703 + }, + { + "auxiliary_loss_clip": 0.01137748, + "auxiliary_loss_mlp": 0.01043005, + "balance_loss_clip": 1.02657795, + "balance_loss_mlp": 1.04838014, + "epoch": 0.2972493611904404, + "flos": 17603195880960.0, + "grad_norm": 1.9523342898428475, + "language_loss": 0.80111414, + "learning_rate": 3.2963992945238497e-06, + "loss": 0.82292169, + "num_input_tokens_seen": 106519430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.89453125, + "step": 4944, + "time_per_iteration": 2.464364528656006 + }, + { + "auxiliary_loss_clip": 0.01129992, + "auxiliary_loss_mlp": 0.01044699, + "balance_loss_clip": 1.02979231, + "balance_loss_mlp": 1.04640603, + "epoch": 0.2973094844431084, + "flos": 20412774817920.0, + "grad_norm": 2.1633352367153105, + "language_loss": 0.83451837, + "learning_rate": 3.2961027063914795e-06, + "loss": 0.85626531, + "num_input_tokens_seen": 106535870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 4945, + "time_per_iteration": 2.4981510639190674 + }, + { + "auxiliary_loss_clip": 0.011318, + "auxiliary_loss_mlp": 0.01039077, + "balance_loss_clip": 1.02353168, + "balance_loss_mlp": 1.04738569, + "epoch": 0.29736960769577636, + "flos": 17493488766720.0, + "grad_norm": 2.2158088930062747, + "language_loss": 0.66624904, + "learning_rate": 3.2958060691107654e-06, + "loss": 0.68795776, + "num_input_tokens_seen": 106553560, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 4946, + "time_per_iteration": 2.526228666305542 + }, + { + "auxiliary_loss_clip": 0.0113802, + "auxiliary_loss_mlp": 0.01034492, + "balance_loss_clip": 1.01880383, + "balance_loss_mlp": 1.0509392, + "epoch": 0.2974297309484443, + "flos": 26103969417600.0, + "grad_norm": 1.7941079108563611, + "language_loss": 0.73766255, + "learning_rate": 3.2955093826929547e-06, + "loss": 0.75938767, + "num_input_tokens_seen": 106574115, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.87109375, + "step": 4947, + "time_per_iteration": 2.5380265712738037 + }, + { + "auxiliary_loss_clip": 0.01135403, + "auxiliary_loss_mlp": 0.01044741, + "balance_loss_clip": 1.02774215, + "balance_loss_mlp": 1.04653597, + "epoch": 0.2974898542011123, + "flos": 25666433850240.0, + "grad_norm": 2.40735653244717, + "language_loss": 0.7330308, + "learning_rate": 3.2952126471492985e-06, + "loss": 0.75483221, + "num_input_tokens_seen": 106593070, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.890625, + "step": 4948, + "time_per_iteration": 2.5096492767333984 + }, + { + "auxiliary_loss_clip": 0.01129361, + "auxiliary_loss_mlp": 0.01033618, + "balance_loss_clip": 1.01824629, + "balance_loss_mlp": 1.04442465, + "epoch": 0.29754997745378026, + "flos": 18661339658880.0, + "grad_norm": 2.0973131899278825, + "language_loss": 0.84031421, + "learning_rate": 3.2949158624910497e-06, + "loss": 0.86194396, + "num_input_tokens_seen": 106610695, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 4949, + "time_per_iteration": 2.4650402069091797 + }, + { + "auxiliary_loss_clip": 0.01129505, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02019429, + "balance_loss_mlp": 1.04509461, + "epoch": 0.2976101007064482, + "flos": 22274599449600.0, + "grad_norm": 1.77267818675948, + "language_loss": 0.71322602, + "learning_rate": 3.2946190287294603e-06, + "loss": 0.73488206, + "num_input_tokens_seen": 106631300, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4950, + "time_per_iteration": 2.491163969039917 + }, + { + "auxiliary_loss_clip": 0.01127031, + "auxiliary_loss_mlp": 0.01043355, + "balance_loss_clip": 1.02792883, + "balance_loss_mlp": 1.04543924, + "epoch": 0.2976702239591162, + "flos": 21945657674880.0, + "grad_norm": 1.7996518465212372, + "language_loss": 0.82192945, + "learning_rate": 3.294322145875789e-06, + "loss": 0.84363329, + "num_input_tokens_seen": 106650065, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 4951, + "time_per_iteration": 2.5001299381256104 + }, + { + "auxiliary_loss_clip": 0.01127377, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.01936841, + "balance_loss_mlp": 1.04211378, + "epoch": 0.29773034721178415, + "flos": 24637197542400.0, + "grad_norm": 2.6816702718299763, + "language_loss": 0.73421168, + "learning_rate": 3.2940252139412912e-06, + "loss": 0.75584191, + "num_input_tokens_seen": 106668230, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 4952, + "time_per_iteration": 2.4888715744018555 + }, + { + "auxiliary_loss_clip": 0.01132645, + "auxiliary_loss_mlp": 0.01041244, + "balance_loss_clip": 1.0246501, + "balance_loss_mlp": 1.04677546, + "epoch": 0.2977904704644521, + "flos": 20557566541440.0, + "grad_norm": 1.7548041314188605, + "language_loss": 0.83702904, + "learning_rate": 3.293728232937228e-06, + "loss": 0.85876799, + "num_input_tokens_seen": 106687785, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 4953, + "time_per_iteration": 2.486267566680908 + }, + { + "auxiliary_loss_clip": 0.01131661, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02330005, + "balance_loss_mlp": 1.04566419, + "epoch": 0.2978505937171201, + "flos": 18916449027840.0, + "grad_norm": 2.078619348093555, + "language_loss": 0.74560732, + "learning_rate": 3.2934312028748597e-06, + "loss": 0.7673102, + "num_input_tokens_seen": 106706875, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 4954, + "time_per_iteration": 2.454066276550293 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036885, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.0450201, + "epoch": 0.29791071696978805, + "flos": 19317750750720.0, + "grad_norm": 1.9786208165821892, + "language_loss": 0.75643009, + "learning_rate": 3.293134123765452e-06, + "loss": 0.77808911, + "num_input_tokens_seen": 106725105, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 4955, + "time_per_iteration": 2.487297773361206 + }, + { + "auxiliary_loss_clip": 0.01132846, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.01980329, + "balance_loss_mlp": 1.04604173, + "epoch": 0.297970840222456, + "flos": 18806813740800.0, + "grad_norm": 3.347495877937089, + "language_loss": 0.72235912, + "learning_rate": 3.2928369956202684e-06, + "loss": 0.74404275, + "num_input_tokens_seen": 106744780, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 4956, + "time_per_iteration": 2.453639507293701 + }, + { + "auxiliary_loss_clip": 0.01134178, + "auxiliary_loss_mlp": 0.01044496, + "balance_loss_clip": 1.02737164, + "balance_loss_mlp": 1.04482651, + "epoch": 0.298030963475124, + "flos": 22852760762880.0, + "grad_norm": 1.6786835957024704, + "language_loss": 0.79504669, + "learning_rate": 3.2925398184505754e-06, + "loss": 0.81683344, + "num_input_tokens_seen": 106764670, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4957, + "time_per_iteration": 2.4680192470550537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.0103974, + "balance_loss_clip": 1.02283621, + "balance_loss_mlp": 1.04692602, + "epoch": 0.298091086727792, + "flos": 21868485304320.0, + "grad_norm": 1.5505958112028584, + "language_loss": 0.70515305, + "learning_rate": 3.2922425922676437e-06, + "loss": 0.7268889, + "num_input_tokens_seen": 106783695, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 4958, + "time_per_iteration": 2.463550090789795 + }, + { + "auxiliary_loss_clip": 0.01130665, + "auxiliary_loss_mlp": 0.0104304, + "balance_loss_clip": 1.02685153, + "balance_loss_mlp": 1.04660892, + "epoch": 0.29815120998045996, + "flos": 21175014355200.0, + "grad_norm": 1.6483091075690746, + "language_loss": 0.78709656, + "learning_rate": 3.291945317082743e-06, + "loss": 0.8088336, + "num_input_tokens_seen": 106803150, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 4959, + "time_per_iteration": 2.4896273612976074 + }, + { + "auxiliary_loss_clip": 0.0112987, + "auxiliary_loss_mlp": 0.01045688, + "balance_loss_clip": 1.03010738, + "balance_loss_mlp": 1.04477429, + "epoch": 0.29821133323312793, + "flos": 19896271200000.0, + "grad_norm": 1.8058675414038505, + "language_loss": 0.79814601, + "learning_rate": 3.291647992907147e-06, + "loss": 0.81990159, + "num_input_tokens_seen": 106820705, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4960, + "time_per_iteration": 2.4524307250976562 + }, + { + "auxiliary_loss_clip": 0.01133353, + "auxiliary_loss_mlp": 0.01047089, + "balance_loss_clip": 1.02998269, + "balance_loss_mlp": 1.04504156, + "epoch": 0.2982714564857959, + "flos": 12750766744320.0, + "grad_norm": 2.8105894923901418, + "language_loss": 0.73709917, + "learning_rate": 3.291350619752129e-06, + "loss": 0.75890362, + "num_input_tokens_seen": 106837335, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8828125, + "step": 4961, + "time_per_iteration": 2.463160991668701 + }, + { + "auxiliary_loss_clip": 0.01132538, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.0466218, + "epoch": 0.29833157973846386, + "flos": 22271905929600.0, + "grad_norm": 1.946317435202559, + "language_loss": 0.62041843, + "learning_rate": 3.291053197628967e-06, + "loss": 0.64212298, + "num_input_tokens_seen": 106856250, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 4962, + "time_per_iteration": 2.4734280109405518 + }, + { + "auxiliary_loss_clip": 0.0113099, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02143037, + "balance_loss_mlp": 1.04580986, + "epoch": 0.2983917029911318, + "flos": 15372999319680.0, + "grad_norm": 1.708438122809617, + "language_loss": 0.83075964, + "learning_rate": 3.2907557265489375e-06, + "loss": 0.85244966, + "num_input_tokens_seen": 106873370, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 4963, + "time_per_iteration": 2.4676647186279297 + }, + { + "auxiliary_loss_clip": 0.01132139, + "auxiliary_loss_mlp": 0.01037543, + "balance_loss_clip": 1.02108073, + "balance_loss_mlp": 1.04811728, + "epoch": 0.2984518262437998, + "flos": 15377632174080.0, + "grad_norm": 2.8539744131594924, + "language_loss": 0.66537225, + "learning_rate": 3.290458206523322e-06, + "loss": 0.68706906, + "num_input_tokens_seen": 106890330, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 4964, + "time_per_iteration": 2.425261974334717 + }, + { + "auxiliary_loss_clip": 0.01128116, + "auxiliary_loss_mlp": 0.01034534, + "balance_loss_clip": 1.01994288, + "balance_loss_mlp": 1.04498291, + "epoch": 0.29851194949646775, + "flos": 18108458542080.0, + "grad_norm": 1.6142193033036512, + "language_loss": 0.70836121, + "learning_rate": 3.2901606375634015e-06, + "loss": 0.72998774, + "num_input_tokens_seen": 106909190, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83203125, + "step": 4965, + "time_per_iteration": 2.468221664428711 + }, + { + "auxiliary_loss_clip": 0.01137695, + "auxiliary_loss_mlp": 0.01047125, + "balance_loss_clip": 1.0309124, + "balance_loss_mlp": 1.05098724, + "epoch": 0.2985720727491357, + "flos": 22018233104640.0, + "grad_norm": 2.501073720290292, + "language_loss": 0.66185117, + "learning_rate": 3.289863019680461e-06, + "loss": 0.68369937, + "num_input_tokens_seen": 106927825, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 4966, + "time_per_iteration": 2.479327440261841 + }, + { + "auxiliary_loss_clip": 0.01134994, + "auxiliary_loss_mlp": 0.01040953, + "balance_loss_clip": 1.02595615, + "balance_loss_mlp": 1.04869342, + "epoch": 0.2986321960018037, + "flos": 13041355772160.0, + "grad_norm": 2.7651343279829215, + "language_loss": 0.74186444, + "learning_rate": 3.289565352885785e-06, + "loss": 0.76362395, + "num_input_tokens_seen": 106943155, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.86328125, + "step": 4967, + "time_per_iteration": 2.4752163887023926 + }, + { + "auxiliary_loss_clip": 0.01129475, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02035177, + "balance_loss_mlp": 1.04422212, + "epoch": 0.29869231925447165, + "flos": 14465034305280.0, + "grad_norm": 1.9700123684688966, + "language_loss": 0.71222222, + "learning_rate": 3.2892676371906614e-06, + "loss": 0.73386747, + "num_input_tokens_seen": 106960295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8515625, + "step": 4968, + "time_per_iteration": 2.448028564453125 + }, + { + "auxiliary_loss_clip": 0.01131577, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.01884651, + "balance_loss_mlp": 1.04596853, + "epoch": 0.2987524425071396, + "flos": 31650228639360.0, + "grad_norm": 2.0898000655075752, + "language_loss": 0.77127141, + "learning_rate": 3.2889698726063805e-06, + "loss": 0.79292667, + "num_input_tokens_seen": 106982870, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 4969, + "time_per_iteration": 2.5737853050231934 + }, + { + "auxiliary_loss_clip": 0.01131698, + "auxiliary_loss_mlp": 0.01037718, + "balance_loss_clip": 1.022578, + "balance_loss_mlp": 1.04641569, + "epoch": 0.2988125657598076, + "flos": 21433427775360.0, + "grad_norm": 1.5683816051841135, + "language_loss": 0.69798505, + "learning_rate": 3.2886720591442327e-06, + "loss": 0.71967924, + "num_input_tokens_seen": 107002405, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 4970, + "time_per_iteration": 5.428143501281738 + }, + { + "auxiliary_loss_clip": 0.01135849, + "auxiliary_loss_mlp": 0.01045402, + "balance_loss_clip": 1.02831888, + "balance_loss_mlp": 1.04582572, + "epoch": 0.2988726890124756, + "flos": 18076965292800.0, + "grad_norm": 2.0403310419369314, + "language_loss": 0.85269564, + "learning_rate": 3.2883741968155103e-06, + "loss": 0.8745082, + "num_input_tokens_seen": 107017310, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.8984375, + "step": 4971, + "time_per_iteration": 2.4557158946990967 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044418, + "balance_loss_clip": 1.02905178, + "balance_loss_mlp": 1.0487361, + "epoch": 0.29893281226514357, + "flos": 21755653706880.0, + "grad_norm": 1.8300460221108372, + "language_loss": 0.79116535, + "learning_rate": 3.2880762856315107e-06, + "loss": 0.81292605, + "num_input_tokens_seen": 107034645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 4972, + "time_per_iteration": 2.492119550704956 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01040036, + "balance_loss_clip": 1.02457476, + "balance_loss_mlp": 1.0491786, + "epoch": 0.29899293551781153, + "flos": 16836718538880.0, + "grad_norm": 1.9080397703774756, + "language_loss": 0.85019803, + "learning_rate": 3.2877783256035285e-06, + "loss": 0.87194014, + "num_input_tokens_seen": 107051125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 4973, + "time_per_iteration": 2.4409923553466797 + }, + { + "auxiliary_loss_clip": 0.01128243, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.04866779, + "epoch": 0.2990530587704795, + "flos": 11729215946880.0, + "grad_norm": 1.5302170897903997, + "language_loss": 0.77397263, + "learning_rate": 3.287480316742863e-06, + "loss": 0.79562438, + "num_input_tokens_seen": 107068815, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 4974, + "time_per_iteration": 2.4786176681518555 + }, + { + "auxiliary_loss_clip": 0.01135129, + "auxiliary_loss_mlp": 0.01042004, + "balance_loss_clip": 1.02723432, + "balance_loss_mlp": 1.04905188, + "epoch": 0.29911318202314746, + "flos": 28039877850240.0, + "grad_norm": 2.0911748108299015, + "language_loss": 0.72264957, + "learning_rate": 3.287182259060815e-06, + "loss": 0.74442089, + "num_input_tokens_seen": 107090420, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 4975, + "time_per_iteration": 2.5267655849456787 + }, + { + "auxiliary_loss_clip": 0.01133427, + "auxiliary_loss_mlp": 0.01038056, + "balance_loss_clip": 1.02204621, + "balance_loss_mlp": 1.0501368, + "epoch": 0.2991733052758154, + "flos": 18733555952640.0, + "grad_norm": 4.957635138610608, + "language_loss": 0.76028466, + "learning_rate": 3.286884152568687e-06, + "loss": 0.78199953, + "num_input_tokens_seen": 107107255, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 4976, + "time_per_iteration": 2.46476149559021 + }, + { + "auxiliary_loss_clip": 0.01131159, + "auxiliary_loss_mlp": 0.01039669, + "balance_loss_clip": 1.02464914, + "balance_loss_mlp": 1.04786563, + "epoch": 0.2992334285284834, + "flos": 15559160532480.0, + "grad_norm": 2.141179611311424, + "language_loss": 0.86060619, + "learning_rate": 3.2865859972777827e-06, + "loss": 0.88231456, + "num_input_tokens_seen": 107123840, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 4977, + "time_per_iteration": 2.4342682361602783 + }, + { + "auxiliary_loss_clip": 0.01135764, + "auxiliary_loss_mlp": 0.01041989, + "balance_loss_clip": 1.02605033, + "balance_loss_mlp": 1.0510987, + "epoch": 0.29929355178115136, + "flos": 21797561900160.0, + "grad_norm": 1.6147948075287948, + "language_loss": 0.68286109, + "learning_rate": 3.2862877931994088e-06, + "loss": 0.7046386, + "num_input_tokens_seen": 107143475, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 4978, + "time_per_iteration": 2.539616823196411 + }, + { + "auxiliary_loss_clip": 0.01138133, + "auxiliary_loss_mlp": 0.01036989, + "balance_loss_clip": 1.02078843, + "balance_loss_mlp": 1.053123, + "epoch": 0.2993536750338193, + "flos": 21178533888000.0, + "grad_norm": 1.9781984123500023, + "language_loss": 0.7654568, + "learning_rate": 3.2859895403448726e-06, + "loss": 0.78720796, + "num_input_tokens_seen": 107161725, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 4979, + "time_per_iteration": 2.4865188598632812 + }, + { + "auxiliary_loss_clip": 0.01130641, + "auxiliary_loss_mlp": 0.01038072, + "balance_loss_clip": 1.02265859, + "balance_loss_mlp": 1.04520524, + "epoch": 0.2994137982864873, + "flos": 32122130544000.0, + "grad_norm": 1.7578947600277828, + "language_loss": 0.68300819, + "learning_rate": 3.285691238725484e-06, + "loss": 0.70469534, + "num_input_tokens_seen": 107183935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 4980, + "time_per_iteration": 2.6137757301330566 + }, + { + "auxiliary_loss_clip": 0.01132193, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02396405, + "balance_loss_mlp": 1.05068171, + "epoch": 0.29947392153915525, + "flos": 21105419754240.0, + "grad_norm": 1.9242198828448243, + "language_loss": 0.73239923, + "learning_rate": 3.285392888352555e-06, + "loss": 0.75411171, + "num_input_tokens_seen": 107204285, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 4981, + "time_per_iteration": 2.5342931747436523 + }, + { + "auxiliary_loss_clip": 0.01135451, + "auxiliary_loss_mlp": 0.01037274, + "balance_loss_clip": 1.02227712, + "balance_loss_mlp": 1.04691803, + "epoch": 0.2995340447918232, + "flos": 21542632099200.0, + "grad_norm": 1.470312251429405, + "language_loss": 0.86429024, + "learning_rate": 3.2850944892373987e-06, + "loss": 0.8860175, + "num_input_tokens_seen": 107225265, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.88671875, + "step": 4982, + "time_per_iteration": 2.5238943099975586 + }, + { + "auxiliary_loss_clip": 0.01138194, + "auxiliary_loss_mlp": 0.01041283, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04975057, + "epoch": 0.2995941680444912, + "flos": 16725143917440.0, + "grad_norm": 2.2481661066872904, + "language_loss": 0.86378068, + "learning_rate": 3.2847960413913307e-06, + "loss": 0.88557541, + "num_input_tokens_seen": 107241335, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4983, + "time_per_iteration": 2.4477322101593018 + }, + { + "auxiliary_loss_clip": 0.01133456, + "auxiliary_loss_mlp": 0.01040756, + "balance_loss_clip": 1.02577138, + "balance_loss_mlp": 1.0483377, + "epoch": 0.2996542912971592, + "flos": 20923496346240.0, + "grad_norm": 1.8474343514891325, + "language_loss": 0.78286207, + "learning_rate": 3.284497544825668e-06, + "loss": 0.80460417, + "num_input_tokens_seen": 107259375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 4984, + "time_per_iteration": 2.490079402923584 + }, + { + "auxiliary_loss_clip": 0.01136807, + "auxiliary_loss_mlp": 0.01046143, + "balance_loss_clip": 1.02960873, + "balance_loss_mlp": 1.05052662, + "epoch": 0.29971441454982717, + "flos": 25079868754560.0, + "grad_norm": 1.555514289558953, + "language_loss": 0.78418988, + "learning_rate": 3.2841989995517303e-06, + "loss": 0.80601943, + "num_input_tokens_seen": 107279890, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.86328125, + "step": 4985, + "time_per_iteration": 2.5188379287719727 + }, + { + "auxiliary_loss_clip": 0.01138287, + "auxiliary_loss_mlp": 0.01037976, + "balance_loss_clip": 1.02115583, + "balance_loss_mlp": 1.05010915, + "epoch": 0.29977453780249513, + "flos": 52555911840000.0, + "grad_norm": 3.8074401298215905, + "language_loss": 0.72157449, + "learning_rate": 3.283900405580837e-06, + "loss": 0.74333715, + "num_input_tokens_seen": 107303430, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8828125, + "step": 4986, + "time_per_iteration": 2.7730660438537598 + }, + { + "auxiliary_loss_clip": 0.01136069, + "auxiliary_loss_mlp": 0.01041734, + "balance_loss_clip": 1.02523577, + "balance_loss_mlp": 1.04813981, + "epoch": 0.2998346610551631, + "flos": 22237144542720.0, + "grad_norm": 1.7357810931981628, + "language_loss": 0.73332191, + "learning_rate": 3.283601762924312e-06, + "loss": 0.75509989, + "num_input_tokens_seen": 107323700, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4987, + "time_per_iteration": 2.4857406616210938 + }, + { + "auxiliary_loss_clip": 0.01131799, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02162147, + "balance_loss_mlp": 1.04787469, + "epoch": 0.29989478430783106, + "flos": 16873203778560.0, + "grad_norm": 2.6184059112472817, + "language_loss": 0.80173379, + "learning_rate": 3.2833030715934793e-06, + "loss": 0.82341629, + "num_input_tokens_seen": 107341965, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 4988, + "time_per_iteration": 2.477614641189575 + }, + { + "auxiliary_loss_clip": 0.01133993, + "auxiliary_loss_mlp": 0.01044495, + "balance_loss_clip": 1.02874756, + "balance_loss_mlp": 1.04897678, + "epoch": 0.29995490756049903, + "flos": 23768878164480.0, + "grad_norm": 1.615528223125509, + "language_loss": 0.70302641, + "learning_rate": 3.2830043315996658e-06, + "loss": 0.72481132, + "num_input_tokens_seen": 107362615, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 4989, + "time_per_iteration": 2.4942874908447266 + }, + { + "auxiliary_loss_clip": 0.01137636, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02382946, + "balance_loss_mlp": 1.05045295, + "epoch": 0.300015030813167, + "flos": 14465321614080.0, + "grad_norm": 2.0547136882256654, + "language_loss": 0.85636222, + "learning_rate": 3.282705542954199e-06, + "loss": 0.87814367, + "num_input_tokens_seen": 107378980, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.87109375, + "step": 4990, + "time_per_iteration": 2.455134391784668 + }, + { + "auxiliary_loss_clip": 0.01135916, + "auxiliary_loss_mlp": 0.01035323, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.04822564, + "epoch": 0.30007515406583496, + "flos": 25191982080000.0, + "grad_norm": 1.6641511475566748, + "language_loss": 0.67125142, + "learning_rate": 3.28240670566841e-06, + "loss": 0.69296378, + "num_input_tokens_seen": 107397640, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.87890625, + "step": 4991, + "time_per_iteration": 2.4928019046783447 + }, + { + "auxiliary_loss_clip": 0.01137405, + "auxiliary_loss_mlp": 0.0103888, + "balance_loss_clip": 1.02165437, + "balance_loss_mlp": 1.0479908, + "epoch": 0.3001352773185029, + "flos": 19391188106880.0, + "grad_norm": 1.5868946812173, + "language_loss": 0.78707612, + "learning_rate": 3.28210781975363e-06, + "loss": 0.80883896, + "num_input_tokens_seen": 107416020, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.89453125, + "step": 4992, + "time_per_iteration": 2.5030534267425537 + }, + { + "auxiliary_loss_clip": 0.01135049, + "auxiliary_loss_mlp": 0.01036168, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04976213, + "epoch": 0.3001954005711709, + "flos": 21543853161600.0, + "grad_norm": 1.8035914694742925, + "language_loss": 0.824085, + "learning_rate": 3.281808885221193e-06, + "loss": 0.84579718, + "num_input_tokens_seen": 107436340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 4993, + "time_per_iteration": 2.475588083267212 + }, + { + "auxiliary_loss_clip": 0.01138101, + "auxiliary_loss_mlp": 0.01042764, + "balance_loss_clip": 1.02522802, + "balance_loss_mlp": 1.04808736, + "epoch": 0.30025552382383885, + "flos": 17384320356480.0, + "grad_norm": 2.0505124462232898, + "language_loss": 0.85850489, + "learning_rate": 3.2815099020824345e-06, + "loss": 0.88031358, + "num_input_tokens_seen": 107454585, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8984375, + "step": 4994, + "time_per_iteration": 2.47881817817688 + }, + { + "auxiliary_loss_clip": 0.0113641, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.05017769, + "epoch": 0.3003156470765068, + "flos": 29533330552320.0, + "grad_norm": 1.5183999234373478, + "language_loss": 0.8111707, + "learning_rate": 3.2812108703486924e-06, + "loss": 0.83289921, + "num_input_tokens_seen": 107477180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.86328125, + "step": 4995, + "time_per_iteration": 2.5481183528900146 + }, + { + "auxiliary_loss_clip": 0.0113418, + "auxiliary_loss_mlp": 0.01041404, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.05089867, + "epoch": 0.3003757703291748, + "flos": 43646402465280.0, + "grad_norm": 1.7074459415862762, + "language_loss": 0.67098773, + "learning_rate": 3.2809117900313055e-06, + "loss": 0.69274354, + "num_input_tokens_seen": 107500250, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 4996, + "time_per_iteration": 2.6810193061828613 + }, + { + "auxiliary_loss_clip": 0.01134671, + "auxiliary_loss_mlp": 0.01040082, + "balance_loss_clip": 1.02392912, + "balance_loss_mlp": 1.04883564, + "epoch": 0.30043589358184275, + "flos": 22528380015360.0, + "grad_norm": 1.7509046873587113, + "language_loss": 0.75304276, + "learning_rate": 3.280612661141615e-06, + "loss": 0.77479029, + "num_input_tokens_seen": 107520070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 4997, + "time_per_iteration": 2.472226858139038 + }, + { + "auxiliary_loss_clip": 0.01132042, + "auxiliary_loss_mlp": 0.01038973, + "balance_loss_clip": 1.02372646, + "balance_loss_mlp": 1.04816282, + "epoch": 0.30049601683451077, + "flos": 20995892208000.0, + "grad_norm": 1.9401125864941864, + "language_loss": 0.77664721, + "learning_rate": 3.2803134836909646e-06, + "loss": 0.79835731, + "num_input_tokens_seen": 107539285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83984375, + "step": 4998, + "time_per_iteration": 2.495087146759033 + }, + { + "auxiliary_loss_clip": 0.01129805, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02469468, + "balance_loss_mlp": 1.04812598, + "epoch": 0.30055614008717874, + "flos": 23916004272000.0, + "grad_norm": 1.5996751316274151, + "language_loss": 0.73429006, + "learning_rate": 3.2800142576906985e-06, + "loss": 0.75598228, + "num_input_tokens_seen": 107560260, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 4999, + "time_per_iteration": 2.491774082183838 + }, + { + "auxiliary_loss_clip": 0.01134839, + "auxiliary_loss_mlp": 0.01037955, + "balance_loss_clip": 1.02250576, + "balance_loss_mlp": 1.0498935, + "epoch": 0.3006162633398467, + "flos": 19169798630400.0, + "grad_norm": 1.6017930279588588, + "language_loss": 0.756015, + "learning_rate": 3.2797149831521626e-06, + "loss": 0.77774298, + "num_input_tokens_seen": 107579260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5000, + "time_per_iteration": 2.572003126144409 + }, + { + "auxiliary_loss_clip": 0.01131295, + "auxiliary_loss_mlp": 0.01036738, + "balance_loss_clip": 1.02329731, + "balance_loss_mlp": 1.04886353, + "epoch": 0.30067638659251467, + "flos": 14679241061760.0, + "grad_norm": 1.977226227337592, + "language_loss": 0.81681275, + "learning_rate": 3.2794156600867073e-06, + "loss": 0.83849311, + "num_input_tokens_seen": 107595245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.82421875, + "step": 5001, + "time_per_iteration": 2.4240355491638184 + }, + { + "auxiliary_loss_clip": 0.01136183, + "auxiliary_loss_mlp": 0.01041393, + "balance_loss_clip": 1.02538288, + "balance_loss_mlp": 1.05103087, + "epoch": 0.30073650984518263, + "flos": 23368007404800.0, + "grad_norm": 1.5846802536013025, + "language_loss": 0.8056432, + "learning_rate": 3.2791162885056815e-06, + "loss": 0.82741892, + "num_input_tokens_seen": 107613985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8515625, + "step": 5002, + "time_per_iteration": 2.5848264694213867 + }, + { + "auxiliary_loss_clip": 0.01137551, + "auxiliary_loss_mlp": 0.01037496, + "balance_loss_clip": 1.02240372, + "balance_loss_mlp": 1.04907179, + "epoch": 0.3007966330978506, + "flos": 22966633854720.0, + "grad_norm": 1.6918091030667293, + "language_loss": 0.71209854, + "learning_rate": 3.2788168684204376e-06, + "loss": 0.73384899, + "num_input_tokens_seen": 107631435, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8828125, + "step": 5003, + "time_per_iteration": 2.4672186374664307 + }, + { + "auxiliary_loss_clip": 0.01136595, + "auxiliary_loss_mlp": 0.01038624, + "balance_loss_clip": 1.02377009, + "balance_loss_mlp": 1.05050564, + "epoch": 0.30085675635051856, + "flos": 27818452460160.0, + "grad_norm": 1.8725932973877313, + "language_loss": 0.70613277, + "learning_rate": 3.27851739984233e-06, + "loss": 0.72788501, + "num_input_tokens_seen": 107650530, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5004, + "time_per_iteration": 2.579941511154175 + }, + { + "auxiliary_loss_clip": 0.01135872, + "auxiliary_loss_mlp": 0.01044061, + "balance_loss_clip": 1.02817035, + "balance_loss_mlp": 1.04977477, + "epoch": 0.3009168796031865, + "flos": 10882729059840.0, + "grad_norm": 2.8634075898885767, + "language_loss": 0.81359464, + "learning_rate": 3.278217882782715e-06, + "loss": 0.83539397, + "num_input_tokens_seen": 107662240, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5005, + "time_per_iteration": 2.4043233394622803 + }, + { + "auxiliary_loss_clip": 0.01132041, + "auxiliary_loss_mlp": 0.01041947, + "balance_loss_clip": 1.02702177, + "balance_loss_mlp": 1.04792035, + "epoch": 0.3009770028558545, + "flos": 23805399317760.0, + "grad_norm": 2.9232502202927266, + "language_loss": 0.74906754, + "learning_rate": 3.2779183172529497e-06, + "loss": 0.77080745, + "num_input_tokens_seen": 107680330, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5006, + "time_per_iteration": 2.5169718265533447 + }, + { + "auxiliary_loss_clip": 0.0113004, + "auxiliary_loss_mlp": 0.01041924, + "balance_loss_clip": 1.02712977, + "balance_loss_mlp": 1.04745531, + "epoch": 0.30103712610852246, + "flos": 26468211283200.0, + "grad_norm": 2.157802275476472, + "language_loss": 0.70810544, + "learning_rate": 3.2776187032643932e-06, + "loss": 0.72982514, + "num_input_tokens_seen": 107700020, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5007, + "time_per_iteration": 2.500135898590088 + }, + { + "auxiliary_loss_clip": 0.01133792, + "auxiliary_loss_mlp": 0.01040278, + "balance_loss_clip": 1.02453065, + "balance_loss_mlp": 1.04947257, + "epoch": 0.3010972493611904, + "flos": 22856459863680.0, + "grad_norm": 2.301214894203853, + "language_loss": 0.76435697, + "learning_rate": 3.2773190408284075e-06, + "loss": 0.78609765, + "num_input_tokens_seen": 107718575, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5008, + "time_per_iteration": 2.5071120262145996 + }, + { + "auxiliary_loss_clip": 0.0113182, + "auxiliary_loss_mlp": 0.01039886, + "balance_loss_clip": 1.02464485, + "balance_loss_mlp": 1.04823518, + "epoch": 0.3011573726138584, + "flos": 24053685102720.0, + "grad_norm": 1.7973688674758703, + "language_loss": 0.84830707, + "learning_rate": 3.2770193299563564e-06, + "loss": 0.87002409, + "num_input_tokens_seen": 107738635, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5009, + "time_per_iteration": 2.531024694442749 + }, + { + "auxiliary_loss_clip": 0.01135897, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.0211432, + "balance_loss_mlp": 1.04830122, + "epoch": 0.30121749586652635, + "flos": 20259687052800.0, + "grad_norm": 1.9976209282841157, + "language_loss": 0.83813334, + "learning_rate": 3.276719570659604e-06, + "loss": 0.85986781, + "num_input_tokens_seen": 107753415, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.875, + "step": 5010, + "time_per_iteration": 2.4690375328063965 + }, + { + "auxiliary_loss_clip": 0.01130658, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02003646, + "balance_loss_mlp": 1.04724431, + "epoch": 0.3012776191191944, + "flos": 26943058103040.0, + "grad_norm": 1.9597018241269177, + "language_loss": 0.85013181, + "learning_rate": 3.2764197629495176e-06, + "loss": 0.87178147, + "num_input_tokens_seen": 107773840, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5011, + "time_per_iteration": 2.501708745956421 + }, + { + "auxiliary_loss_clip": 0.01134213, + "auxiliary_loss_mlp": 0.01039104, + "balance_loss_clip": 1.02335644, + "balance_loss_mlp": 1.04754543, + "epoch": 0.30133774237186234, + "flos": 20412307941120.0, + "grad_norm": 2.0524404295798013, + "language_loss": 0.71966654, + "learning_rate": 3.2761199068374656e-06, + "loss": 0.74139971, + "num_input_tokens_seen": 107792020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5012, + "time_per_iteration": 3.979128360748291 + }, + { + "auxiliary_loss_clip": 0.01131878, + "auxiliary_loss_mlp": 0.01037572, + "balance_loss_clip": 1.0229032, + "balance_loss_mlp": 1.04721081, + "epoch": 0.3013978656245303, + "flos": 19792453916160.0, + "grad_norm": 1.9997819947408795, + "language_loss": 0.87396109, + "learning_rate": 3.275820002334819e-06, + "loss": 0.89565563, + "num_input_tokens_seen": 107809595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84765625, + "step": 5013, + "time_per_iteration": 2.467177629470825 + }, + { + "auxiliary_loss_clip": 0.01136565, + "auxiliary_loss_mlp": 0.01036881, + "balance_loss_clip": 1.0200367, + "balance_loss_mlp": 1.04842985, + "epoch": 0.30145798887719827, + "flos": 16249650652800.0, + "grad_norm": 3.4702040063697313, + "language_loss": 0.83367115, + "learning_rate": 3.2755200494529496e-06, + "loss": 0.85540557, + "num_input_tokens_seen": 107827230, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8828125, + "step": 5014, + "time_per_iteration": 2.4654901027679443 + }, + { + "auxiliary_loss_clip": 0.01128425, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.01896727, + "balance_loss_mlp": 1.0471499, + "epoch": 0.30151811212986623, + "flos": 24571733005440.0, + "grad_norm": 1.6346146355602116, + "language_loss": 0.68218327, + "learning_rate": 3.2752200482032323e-06, + "loss": 0.70380276, + "num_input_tokens_seen": 107847195, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5015, + "time_per_iteration": 2.4994328022003174 + }, + { + "auxiliary_loss_clip": 0.01132371, + "auxiliary_loss_mlp": 0.01038543, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.04864407, + "epoch": 0.3015782353825342, + "flos": 21872076664320.0, + "grad_norm": 2.7110353723362635, + "language_loss": 0.74712509, + "learning_rate": 3.2749199985970436e-06, + "loss": 0.76883423, + "num_input_tokens_seen": 107866420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5016, + "time_per_iteration": 2.5168755054473877 + }, + { + "auxiliary_loss_clip": 0.01135364, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.0210197, + "balance_loss_mlp": 1.0498333, + "epoch": 0.30163835863520216, + "flos": 28769331248640.0, + "grad_norm": 1.6963436015958502, + "language_loss": 0.65179884, + "learning_rate": 3.2746199006457603e-06, + "loss": 0.67351693, + "num_input_tokens_seen": 107889090, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5017, + "time_per_iteration": 2.543577194213867 + }, + { + "auxiliary_loss_clip": 0.01134511, + "auxiliary_loss_mlp": 0.01043761, + "balance_loss_clip": 1.02860379, + "balance_loss_mlp": 1.05030179, + "epoch": 0.30169848188787013, + "flos": 22966202891520.0, + "grad_norm": 2.078433105892768, + "language_loss": 0.69045079, + "learning_rate": 3.2743197543607628e-06, + "loss": 0.71223348, + "num_input_tokens_seen": 107907520, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5018, + "time_per_iteration": 2.498060464859009 + }, + { + "auxiliary_loss_clip": 0.01129538, + "auxiliary_loss_mlp": 0.01041131, + "balance_loss_clip": 1.02772546, + "balance_loss_mlp": 1.04842138, + "epoch": 0.3017586051405381, + "flos": 21835268202240.0, + "grad_norm": 1.9198297669603306, + "language_loss": 0.78841144, + "learning_rate": 3.2740195597534327e-06, + "loss": 0.81011814, + "num_input_tokens_seen": 107925650, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5019, + "time_per_iteration": 2.4873573780059814 + }, + { + "auxiliary_loss_clip": 0.01134625, + "auxiliary_loss_mlp": 0.01041878, + "balance_loss_clip": 1.02695298, + "balance_loss_mlp": 1.05073094, + "epoch": 0.30181872839320606, + "flos": 22160403135360.0, + "grad_norm": 2.24109756344656, + "language_loss": 0.69867152, + "learning_rate": 3.2737193168351527e-06, + "loss": 0.72043651, + "num_input_tokens_seen": 107943975, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5020, + "time_per_iteration": 2.493370532989502 + }, + { + "auxiliary_loss_clip": 0.01136052, + "auxiliary_loss_mlp": 0.01040456, + "balance_loss_clip": 1.0256741, + "balance_loss_mlp": 1.04941368, + "epoch": 0.301878851645874, + "flos": 18114168804480.0, + "grad_norm": 1.9013759847828555, + "language_loss": 0.78134364, + "learning_rate": 3.2734190256173085e-06, + "loss": 0.80310869, + "num_input_tokens_seen": 107962950, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8671875, + "step": 5021, + "time_per_iteration": 2.4670474529266357 + }, + { + "auxiliary_loss_clip": 0.01133279, + "auxiliary_loss_mlp": 0.01029745, + "balance_loss_clip": 1.01527357, + "balance_loss_mlp": 1.04964936, + "epoch": 0.301938974898542, + "flos": 17602226213760.0, + "grad_norm": 2.3821225807179696, + "language_loss": 0.76075405, + "learning_rate": 3.2731186861112877e-06, + "loss": 0.78238434, + "num_input_tokens_seen": 107979700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5022, + "time_per_iteration": 2.4737884998321533 + }, + { + "auxiliary_loss_clip": 0.01133657, + "auxiliary_loss_mlp": 0.01042925, + "balance_loss_clip": 1.02791631, + "balance_loss_mlp": 1.04880631, + "epoch": 0.30199909815120995, + "flos": 11181219079680.0, + "grad_norm": 1.7684005868111572, + "language_loss": 0.69896525, + "learning_rate": 3.2728182983284793e-06, + "loss": 0.72073108, + "num_input_tokens_seen": 107996645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5023, + "time_per_iteration": 2.4453155994415283 + }, + { + "auxiliary_loss_clip": 0.01136173, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02673686, + "balance_loss_mlp": 1.04927671, + "epoch": 0.302059221403878, + "flos": 21907843632000.0, + "grad_norm": 2.0912728997662127, + "language_loss": 0.71588898, + "learning_rate": 3.2725178622802724e-06, + "loss": 0.73766768, + "num_input_tokens_seen": 108015020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5024, + "time_per_iteration": 2.4998810291290283 + }, + { + "auxiliary_loss_clip": 0.0113052, + "auxiliary_loss_mlp": 0.01047301, + "balance_loss_clip": 1.0314939, + "balance_loss_mlp": 1.04858792, + "epoch": 0.30211934465654594, + "flos": 26396390039040.0, + "grad_norm": 1.6483742353836974, + "language_loss": 0.73955721, + "learning_rate": 3.272217377978061e-06, + "loss": 0.76133543, + "num_input_tokens_seen": 108036430, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5025, + "time_per_iteration": 2.5167019367218018 + }, + { + "auxiliary_loss_clip": 0.0113244, + "auxiliary_loss_mlp": 0.01042201, + "balance_loss_clip": 1.02800322, + "balance_loss_mlp": 1.0518502, + "epoch": 0.3021794679092139, + "flos": 23400470321280.0, + "grad_norm": 1.4799709397217862, + "language_loss": 0.67022824, + "learning_rate": 3.2719168454332387e-06, + "loss": 0.6919747, + "num_input_tokens_seen": 108054250, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5026, + "time_per_iteration": 2.5326507091522217 + }, + { + "auxiliary_loss_clip": 0.01134018, + "auxiliary_loss_mlp": 0.01043238, + "balance_loss_clip": 1.02799106, + "balance_loss_mlp": 1.05083036, + "epoch": 0.30223959116188187, + "flos": 20260979942400.0, + "grad_norm": 1.6876842646939136, + "language_loss": 0.85252607, + "learning_rate": 3.2716162646572034e-06, + "loss": 0.87429863, + "num_input_tokens_seen": 108071495, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5027, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01129327, + "auxiliary_loss_mlp": 0.01045705, + "balance_loss_clip": 1.03187656, + "balance_loss_mlp": 1.04739702, + "epoch": 0.30229971441454984, + "flos": 26687840993280.0, + "grad_norm": 1.665552114762065, + "language_loss": 0.78757018, + "learning_rate": 3.271315635661351e-06, + "loss": 0.80932051, + "num_input_tokens_seen": 108092135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5028, + "time_per_iteration": 2.5677576065063477 + }, + { + "auxiliary_loss_clip": 0.01132481, + "auxiliary_loss_mlp": 0.01044847, + "balance_loss_clip": 1.0295043, + "balance_loss_mlp": 1.04922223, + "epoch": 0.3023598376672178, + "flos": 34345323953280.0, + "grad_norm": 2.0260385179345346, + "language_loss": 0.76721144, + "learning_rate": 3.2710149584570826e-06, + "loss": 0.78898472, + "num_input_tokens_seen": 108112945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.83203125, + "step": 5029, + "time_per_iteration": 2.611917734146118 + }, + { + "auxiliary_loss_clip": 0.01133028, + "auxiliary_loss_mlp": 0.01043332, + "balance_loss_clip": 1.02642775, + "balance_loss_mlp": 1.04855132, + "epoch": 0.30241996091988577, + "flos": 23112143850240.0, + "grad_norm": 1.944959289407135, + "language_loss": 0.81868339, + "learning_rate": 3.2707142330557993e-06, + "loss": 0.84044701, + "num_input_tokens_seen": 108130325, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.84375, + "step": 5030, + "time_per_iteration": 2.605531930923462 + }, + { + "auxiliary_loss_clip": 0.01132926, + "auxiliary_loss_mlp": 0.01045193, + "balance_loss_clip": 1.02982664, + "balance_loss_mlp": 1.04754734, + "epoch": 0.30248008417255373, + "flos": 19390002958080.0, + "grad_norm": 1.748277903644489, + "language_loss": 0.69869608, + "learning_rate": 3.270413459468905e-06, + "loss": 0.72047728, + "num_input_tokens_seen": 108150300, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.85546875, + "step": 5031, + "time_per_iteration": 2.496833086013794 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01036128, + "balance_loss_clip": 1.02103615, + "balance_loss_mlp": 1.04892659, + "epoch": 0.3025402074252217, + "flos": 23769704177280.0, + "grad_norm": 1.8467264077922103, + "language_loss": 0.82302773, + "learning_rate": 3.2701126377078047e-06, + "loss": 0.84471118, + "num_input_tokens_seen": 108170330, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5032, + "time_per_iteration": 2.5062966346740723 + }, + { + "auxiliary_loss_clip": 0.01140181, + "auxiliary_loss_mlp": 0.01046543, + "balance_loss_clip": 1.02991903, + "balance_loss_mlp": 1.05332685, + "epoch": 0.30260033067788966, + "flos": 25994118648960.0, + "grad_norm": 2.10117653020426, + "language_loss": 0.73383862, + "learning_rate": 3.269811767783906e-06, + "loss": 0.75570583, + "num_input_tokens_seen": 108191265, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5033, + "time_per_iteration": 2.561467170715332 + }, + { + "auxiliary_loss_clip": 0.01130223, + "auxiliary_loss_mlp": 0.01045217, + "balance_loss_clip": 1.03000593, + "balance_loss_mlp": 1.04782772, + "epoch": 0.3026604539305576, + "flos": 25374551932800.0, + "grad_norm": 1.437497934350084, + "language_loss": 0.74057245, + "learning_rate": 3.2695108497086185e-06, + "loss": 0.76232684, + "num_input_tokens_seen": 108211615, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5034, + "time_per_iteration": 2.511861801147461 + }, + { + "auxiliary_loss_clip": 0.01131916, + "auxiliary_loss_mlp": 0.01033507, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.04825819, + "epoch": 0.3027205771832256, + "flos": 25812733944960.0, + "grad_norm": 1.9672144407329994, + "language_loss": 0.71617639, + "learning_rate": 3.269209883493352e-06, + "loss": 0.73783064, + "num_input_tokens_seen": 108231080, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5035, + "time_per_iteration": 2.545917272567749 + }, + { + "auxiliary_loss_clip": 0.0113067, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01835537, + "balance_loss_mlp": 1.04876685, + "epoch": 0.30278070043589356, + "flos": 27344539393920.0, + "grad_norm": 1.774174351542542, + "language_loss": 0.87232339, + "learning_rate": 3.2689088691495196e-06, + "loss": 0.89395267, + "num_input_tokens_seen": 108251125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5036, + "time_per_iteration": 2.5197184085845947 + }, + { + "auxiliary_loss_clip": 0.01131426, + "auxiliary_loss_mlp": 0.01043041, + "balance_loss_clip": 1.02679288, + "balance_loss_mlp": 1.04866219, + "epoch": 0.3028408236885616, + "flos": 24786227070720.0, + "grad_norm": 2.2121077897300134, + "language_loss": 0.77760899, + "learning_rate": 3.268607806688536e-06, + "loss": 0.7993536, + "num_input_tokens_seen": 108272545, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5037, + "time_per_iteration": 2.5372917652130127 + }, + { + "auxiliary_loss_clip": 0.01133533, + "auxiliary_loss_mlp": 0.01041477, + "balance_loss_clip": 1.02603984, + "balance_loss_mlp": 1.04973745, + "epoch": 0.30290094694122954, + "flos": 12932474670720.0, + "grad_norm": 2.4260021818478634, + "language_loss": 0.77920854, + "learning_rate": 3.268306696121816e-06, + "loss": 0.80095863, + "num_input_tokens_seen": 108289725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5038, + "time_per_iteration": 2.4360761642456055 + }, + { + "auxiliary_loss_clip": 0.01128654, + "auxiliary_loss_mlp": 0.01034869, + "balance_loss_clip": 1.02073669, + "balance_loss_mlp": 1.04859674, + "epoch": 0.3029610701938975, + "flos": 25916443488000.0, + "grad_norm": 1.8428508909689656, + "language_loss": 0.74134624, + "learning_rate": 3.2680055374607804e-06, + "loss": 0.76298141, + "num_input_tokens_seen": 108310690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5039, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01129815, + "auxiliary_loss_mlp": 0.01037874, + "balance_loss_clip": 1.02426052, + "balance_loss_mlp": 1.05003977, + "epoch": 0.3030211934465655, + "flos": 21980993679360.0, + "grad_norm": 1.8268154911840482, + "language_loss": 0.80263746, + "learning_rate": 3.267704330716847e-06, + "loss": 0.82431436, + "num_input_tokens_seen": 108328905, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5040, + "time_per_iteration": 2.469822406768799 + }, + { + "auxiliary_loss_clip": 0.01131744, + "auxiliary_loss_mlp": 0.01036261, + "balance_loss_clip": 1.02227795, + "balance_loss_mlp": 1.05101466, + "epoch": 0.30308131669923344, + "flos": 20991977625600.0, + "grad_norm": 1.5747579863116856, + "language_loss": 0.81914759, + "learning_rate": 3.267403075901438e-06, + "loss": 0.8408277, + "num_input_tokens_seen": 108346680, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5041, + "time_per_iteration": 2.5240108966827393 + }, + { + "auxiliary_loss_clip": 0.01062494, + "auxiliary_loss_mlp": 0.01003022, + "balance_loss_clip": 1.00106716, + "balance_loss_mlp": 1.02890241, + "epoch": 0.3031414399519014, + "flos": 60548875827840.0, + "grad_norm": 0.7678965945904674, + "language_loss": 0.59521127, + "learning_rate": 3.267101773025978e-06, + "loss": 0.61586642, + "num_input_tokens_seen": 108413885, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.3359375, + "step": 5042, + "time_per_iteration": 3.169004440307617 + }, + { + "auxiliary_loss_clip": 0.01134227, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02271986, + "balance_loss_mlp": 1.05006266, + "epoch": 0.30320156320456937, + "flos": 21907664064000.0, + "grad_norm": 1.6113397759888244, + "language_loss": 0.71136838, + "learning_rate": 3.266800422101892e-06, + "loss": 0.73308468, + "num_input_tokens_seen": 108433640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5043, + "time_per_iteration": 2.5217440128326416 + }, + { + "auxiliary_loss_clip": 0.01132657, + "auxiliary_loss_mlp": 0.01037151, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.04824769, + "epoch": 0.30326168645723733, + "flos": 21652770176640.0, + "grad_norm": 2.6644669890018773, + "language_loss": 0.69351244, + "learning_rate": 3.266499023140606e-06, + "loss": 0.71521056, + "num_input_tokens_seen": 108452640, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5044, + "time_per_iteration": 2.4741897583007812 + }, + { + "auxiliary_loss_clip": 0.01129908, + "auxiliary_loss_mlp": 0.01037342, + "balance_loss_clip": 1.02252388, + "balance_loss_mlp": 1.04823565, + "epoch": 0.3033218097099053, + "flos": 21871286565120.0, + "grad_norm": 1.3748845619029404, + "language_loss": 0.77210236, + "learning_rate": 3.2661975761535513e-06, + "loss": 0.79377484, + "num_input_tokens_seen": 108472470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5045, + "time_per_iteration": 2.5023043155670166 + }, + { + "auxiliary_loss_clip": 0.01132495, + "auxiliary_loss_mlp": 0.0103816, + "balance_loss_clip": 1.02240646, + "balance_loss_mlp": 1.04892182, + "epoch": 0.30338193296257326, + "flos": 27089717333760.0, + "grad_norm": 1.538768377317596, + "language_loss": 0.72444695, + "learning_rate": 3.2658960811521564e-06, + "loss": 0.74615347, + "num_input_tokens_seen": 108493025, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5046, + "time_per_iteration": 2.5163753032684326 + }, + { + "auxiliary_loss_clip": 0.01134062, + "auxiliary_loss_mlp": 0.01042653, + "balance_loss_clip": 1.02611256, + "balance_loss_mlp": 1.04859519, + "epoch": 0.30344205621524123, + "flos": 19534363718400.0, + "grad_norm": 3.2419373644374176, + "language_loss": 0.80737638, + "learning_rate": 3.2655945381478564e-06, + "loss": 0.82914352, + "num_input_tokens_seen": 108513480, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5047, + "time_per_iteration": 2.547245979309082 + }, + { + "auxiliary_loss_clip": 0.01131045, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02569556, + "balance_loss_mlp": 1.04871237, + "epoch": 0.3035021794679092, + "flos": 23910976368000.0, + "grad_norm": 1.9357354539113198, + "language_loss": 0.72334075, + "learning_rate": 3.265292947152084e-06, + "loss": 0.74505508, + "num_input_tokens_seen": 108533155, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5048, + "time_per_iteration": 2.494016170501709 + }, + { + "auxiliary_loss_clip": 0.01129755, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.02093613, + "balance_loss_mlp": 1.04574537, + "epoch": 0.30356230272057716, + "flos": 16143606725760.0, + "grad_norm": 1.7731178616486785, + "language_loss": 0.75098324, + "learning_rate": 3.2649913081762763e-06, + "loss": 0.7726388, + "num_input_tokens_seen": 108551900, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5049, + "time_per_iteration": 2.502979040145874 + }, + { + "auxiliary_loss_clip": 0.01133123, + "auxiliary_loss_mlp": 0.01037727, + "balance_loss_clip": 1.0226109, + "balance_loss_mlp": 1.04864645, + "epoch": 0.3036224259732452, + "flos": 28914697589760.0, + "grad_norm": 1.6762363098185904, + "language_loss": 0.8194561, + "learning_rate": 3.2646896212318717e-06, + "loss": 0.84116459, + "num_input_tokens_seen": 108574005, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5050, + "time_per_iteration": 2.5254666805267334 + }, + { + "auxiliary_loss_clip": 0.01132852, + "auxiliary_loss_mlp": 0.01038752, + "balance_loss_clip": 1.02299261, + "balance_loss_mlp": 1.04868484, + "epoch": 0.30368254922591315, + "flos": 21105599322240.0, + "grad_norm": 2.8996577335854625, + "language_loss": 0.73712784, + "learning_rate": 3.2643878863303106e-06, + "loss": 0.7588439, + "num_input_tokens_seen": 108592715, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.83984375, + "step": 5051, + "time_per_iteration": 2.511455774307251 + }, + { + "auxiliary_loss_clip": 0.01130282, + "auxiliary_loss_mlp": 0.01035032, + "balance_loss_clip": 1.01967764, + "balance_loss_mlp": 1.04650712, + "epoch": 0.3037426724785811, + "flos": 23002293081600.0, + "grad_norm": 1.5939626777548828, + "language_loss": 0.76463652, + "learning_rate": 3.264086103483033e-06, + "loss": 0.78628969, + "num_input_tokens_seen": 108611770, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5052, + "time_per_iteration": 2.478046417236328 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01039995, + "balance_loss_clip": 1.02484894, + "balance_loss_mlp": 1.04609728, + "epoch": 0.3038027957312491, + "flos": 15632705629440.0, + "grad_norm": 1.8043694132732864, + "language_loss": 0.82780337, + "learning_rate": 3.2637842727014836e-06, + "loss": 0.84952009, + "num_input_tokens_seen": 108629070, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5053, + "time_per_iteration": 3.983353614807129 + }, + { + "auxiliary_loss_clip": 0.01130411, + "auxiliary_loss_mlp": 0.01042277, + "balance_loss_clip": 1.02661896, + "balance_loss_mlp": 1.04685903, + "epoch": 0.30386291898391704, + "flos": 12713994195840.0, + "grad_norm": 1.5364375285570075, + "language_loss": 0.70702368, + "learning_rate": 3.2634823939971083e-06, + "loss": 0.72875059, + "num_input_tokens_seen": 108646315, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5054, + "time_per_iteration": 2.4379446506500244 + }, + { + "auxiliary_loss_clip": 0.01132155, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01768088, + "balance_loss_mlp": 1.04817367, + "epoch": 0.303923042236585, + "flos": 26359437922560.0, + "grad_norm": 1.8280069054430388, + "language_loss": 0.69543922, + "learning_rate": 3.2631804673813545e-06, + "loss": 0.71709108, + "num_input_tokens_seen": 108665920, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5055, + "time_per_iteration": 2.5247206687927246 + }, + { + "auxiliary_loss_clip": 0.01131491, + "auxiliary_loss_mlp": 0.01036764, + "balance_loss_clip": 1.02056348, + "balance_loss_mlp": 1.04682207, + "epoch": 0.30398316548925297, + "flos": 19719232041600.0, + "grad_norm": 2.038005952710024, + "language_loss": 0.67502165, + "learning_rate": 3.2628784928656707e-06, + "loss": 0.69670427, + "num_input_tokens_seen": 108683485, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5056, + "time_per_iteration": 2.4767425060272217 + }, + { + "auxiliary_loss_clip": 0.01130078, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02434373, + "balance_loss_mlp": 1.04886115, + "epoch": 0.30404328874192094, + "flos": 24239846315520.0, + "grad_norm": 1.5579435169669187, + "language_loss": 0.82500231, + "learning_rate": 3.262576470461507e-06, + "loss": 0.84669387, + "num_input_tokens_seen": 108702700, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5057, + "time_per_iteration": 2.499105453491211 + }, + { + "auxiliary_loss_clip": 0.01129487, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02171588, + "balance_loss_mlp": 1.04686213, + "epoch": 0.3041034119945889, + "flos": 24498942094080.0, + "grad_norm": 3.274565054245196, + "language_loss": 0.89040101, + "learning_rate": 3.2622744001803176e-06, + "loss": 0.91205966, + "num_input_tokens_seen": 108721860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5058, + "time_per_iteration": 2.4966368675231934 + }, + { + "auxiliary_loss_clip": 0.01131903, + "auxiliary_loss_mlp": 0.01042482, + "balance_loss_clip": 1.02681756, + "balance_loss_mlp": 1.04829955, + "epoch": 0.30416353524725687, + "flos": 28288881907200.0, + "grad_norm": 2.2189779437975274, + "language_loss": 0.71709251, + "learning_rate": 3.2619722820335564e-06, + "loss": 0.73883629, + "num_input_tokens_seen": 108743215, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5059, + "time_per_iteration": 2.5429141521453857 + }, + { + "auxiliary_loss_clip": 0.01130965, + "auxiliary_loss_mlp": 0.01037733, + "balance_loss_clip": 1.0233928, + "balance_loss_mlp": 1.04720807, + "epoch": 0.30422365849992483, + "flos": 23660392112640.0, + "grad_norm": 10.158939103063299, + "language_loss": 0.73069966, + "learning_rate": 3.26167011603268e-06, + "loss": 0.75238669, + "num_input_tokens_seen": 108765505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5060, + "time_per_iteration": 2.529862403869629 + }, + { + "auxiliary_loss_clip": 0.01132671, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.01979291, + "balance_loss_mlp": 1.04885316, + "epoch": 0.3042837817525928, + "flos": 22998773548800.0, + "grad_norm": 1.8510962431794071, + "language_loss": 0.76926744, + "learning_rate": 3.2613679021891463e-06, + "loss": 0.79093957, + "num_input_tokens_seen": 108783370, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5061, + "time_per_iteration": 2.496739149093628 + }, + { + "auxiliary_loss_clip": 0.01138048, + "auxiliary_loss_mlp": 0.01039513, + "balance_loss_clip": 1.02312136, + "balance_loss_mlp": 1.0527482, + "epoch": 0.30434390500526076, + "flos": 22082332924800.0, + "grad_norm": 2.264413063412747, + "language_loss": 0.82064837, + "learning_rate": 3.261065640514415e-06, + "loss": 0.84242392, + "num_input_tokens_seen": 108797430, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5062, + "time_per_iteration": 2.476290702819824 + }, + { + "auxiliary_loss_clip": 0.01128914, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.04721808, + "epoch": 0.3044040282579287, + "flos": 25483504861440.0, + "grad_norm": 1.7072945635391377, + "language_loss": 0.74737656, + "learning_rate": 3.2607633310199483e-06, + "loss": 0.76899219, + "num_input_tokens_seen": 108816945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5063, + "time_per_iteration": 2.5384082794189453 + }, + { + "auxiliary_loss_clip": 0.01132221, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.0214901, + "balance_loss_mlp": 1.04908288, + "epoch": 0.30446415151059675, + "flos": 21945478106880.0, + "grad_norm": 1.8176932093217915, + "language_loss": 0.84120226, + "learning_rate": 3.26046097371721e-06, + "loss": 0.86290407, + "num_input_tokens_seen": 108836615, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83203125, + "step": 5064, + "time_per_iteration": 2.5108115673065186 + }, + { + "auxiliary_loss_clip": 0.01131651, + "auxiliary_loss_mlp": 0.01034827, + "balance_loss_clip": 1.01888871, + "balance_loss_mlp": 1.04751444, + "epoch": 0.3045242747632647, + "flos": 16435416816000.0, + "grad_norm": 1.7759562417820063, + "language_loss": 0.75990027, + "learning_rate": 3.2601585686176655e-06, + "loss": 0.78156507, + "num_input_tokens_seen": 108855165, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84375, + "step": 5065, + "time_per_iteration": 2.5061376094818115 + }, + { + "auxiliary_loss_clip": 0.01133071, + "auxiliary_loss_mlp": 0.01040555, + "balance_loss_clip": 1.02470005, + "balance_loss_mlp": 1.04716659, + "epoch": 0.3045843980159327, + "flos": 31540341957120.0, + "grad_norm": 2.0133457948817406, + "language_loss": 0.62271762, + "learning_rate": 3.2598561157327814e-06, + "loss": 0.64445394, + "num_input_tokens_seen": 108874690, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.859375, + "step": 5066, + "time_per_iteration": 2.6000661849975586 + }, + { + "auxiliary_loss_clip": 0.01140413, + "auxiliary_loss_mlp": 0.01049285, + "balance_loss_clip": 1.03385913, + "balance_loss_mlp": 1.05344141, + "epoch": 0.30464452126860064, + "flos": 17853636481920.0, + "grad_norm": 1.7828452375691122, + "language_loss": 0.82887459, + "learning_rate": 3.2595536150740265e-06, + "loss": 0.85077155, + "num_input_tokens_seen": 108893140, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5067, + "time_per_iteration": 2.4754927158355713 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01043304, + "balance_loss_clip": 1.02829516, + "balance_loss_mlp": 1.04839194, + "epoch": 0.3047046445212686, + "flos": 20631398947200.0, + "grad_norm": 2.0779895110277535, + "language_loss": 0.62978256, + "learning_rate": 3.259251066652873e-06, + "loss": 0.65152222, + "num_input_tokens_seen": 108911880, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5068, + "time_per_iteration": 2.4957847595214844 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01633286, + "balance_loss_mlp": 1.04544926, + "epoch": 0.3047647677739366, + "flos": 21287594557440.0, + "grad_norm": 1.6700683770947133, + "language_loss": 0.75058538, + "learning_rate": 3.258948470480793e-06, + "loss": 0.77217996, + "num_input_tokens_seen": 108930440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.82421875, + "step": 5069, + "time_per_iteration": 2.487473964691162 + }, + { + "auxiliary_loss_clip": 0.0112831, + "auxiliary_loss_mlp": 0.01043362, + "balance_loss_clip": 1.02798414, + "balance_loss_mlp": 1.04746199, + "epoch": 0.30482489102660454, + "flos": 20995928121600.0, + "grad_norm": 1.839652658151057, + "language_loss": 0.75732648, + "learning_rate": 3.258645826569261e-06, + "loss": 0.7790432, + "num_input_tokens_seen": 108949125, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5070, + "time_per_iteration": 2.500335216522217 + }, + { + "auxiliary_loss_clip": 0.0113368, + "auxiliary_loss_mlp": 0.0103861, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.04640067, + "epoch": 0.3048850142792725, + "flos": 26290812988800.0, + "grad_norm": 1.7318177446844936, + "language_loss": 0.81738281, + "learning_rate": 3.2583431349297527e-06, + "loss": 0.83910567, + "num_input_tokens_seen": 108972190, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.875, + "step": 5071, + "time_per_iteration": 2.5726318359375 + }, + { + "auxiliary_loss_clip": 0.01134597, + "auxiliary_loss_mlp": 0.01041754, + "balance_loss_clip": 1.02507651, + "balance_loss_mlp": 1.04737437, + "epoch": 0.30494513753194047, + "flos": 22346241125760.0, + "grad_norm": 1.5942809817556516, + "language_loss": 0.76252651, + "learning_rate": 3.2580403955737467e-06, + "loss": 0.78428996, + "num_input_tokens_seen": 108990325, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5072, + "time_per_iteration": 2.5147287845611572 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01046119, + "balance_loss_clip": 1.03059769, + "balance_loss_mlp": 1.04904687, + "epoch": 0.30500526078460843, + "flos": 19537667769600.0, + "grad_norm": 2.176920469303851, + "language_loss": 0.71318722, + "learning_rate": 3.257737608512723e-06, + "loss": 0.73496878, + "num_input_tokens_seen": 109009505, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83203125, + "step": 5073, + "time_per_iteration": 2.4736156463623047 + }, + { + "auxiliary_loss_clip": 0.01135708, + "auxiliary_loss_mlp": 0.0104584, + "balance_loss_clip": 1.02974713, + "balance_loss_mlp": 1.04842663, + "epoch": 0.3050653840372764, + "flos": 14465321614080.0, + "grad_norm": 2.146618897096623, + "language_loss": 0.7663309, + "learning_rate": 3.257434773758163e-06, + "loss": 0.78814638, + "num_input_tokens_seen": 109026350, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5074, + "time_per_iteration": 2.4547433853149414 + }, + { + "auxiliary_loss_clip": 0.01131716, + "auxiliary_loss_mlp": 0.01035183, + "balance_loss_clip": 1.02015638, + "balance_loss_mlp": 1.04879379, + "epoch": 0.30512550728994436, + "flos": 24243796811520.0, + "grad_norm": 1.8636036931869358, + "language_loss": 0.73939347, + "learning_rate": 3.25713189132155e-06, + "loss": 0.76106244, + "num_input_tokens_seen": 109044165, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5075, + "time_per_iteration": 2.4922661781311035 + }, + { + "auxiliary_loss_clip": 0.01135073, + "auxiliary_loss_mlp": 0.01042646, + "balance_loss_clip": 1.02508652, + "balance_loss_mlp": 1.04769778, + "epoch": 0.30518563054261233, + "flos": 16360542915840.0, + "grad_norm": 2.14961805392919, + "language_loss": 0.75488788, + "learning_rate": 3.2568289612143703e-06, + "loss": 0.77666509, + "num_input_tokens_seen": 109060665, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5076, + "time_per_iteration": 2.471381187438965 + }, + { + "auxiliary_loss_clip": 0.0113449, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02407038, + "balance_loss_mlp": 1.05137944, + "epoch": 0.30524575379528035, + "flos": 21579584215680.0, + "grad_norm": 1.505999917432091, + "language_loss": 0.79183954, + "learning_rate": 3.25652598344811e-06, + "loss": 0.81357688, + "num_input_tokens_seen": 109080035, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5077, + "time_per_iteration": 2.5000534057617188 + }, + { + "auxiliary_loss_clip": 0.01127394, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01739252, + "balance_loss_mlp": 1.0478642, + "epoch": 0.3053058770479483, + "flos": 16545231671040.0, + "grad_norm": 1.9961733055656423, + "language_loss": 0.74662113, + "learning_rate": 3.256222958034259e-06, + "loss": 0.76820433, + "num_input_tokens_seen": 109097385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.796875, + "step": 5078, + "time_per_iteration": 2.4746944904327393 + }, + { + "auxiliary_loss_clip": 0.01130678, + "auxiliary_loss_mlp": 0.01047379, + "balance_loss_clip": 1.03203678, + "balance_loss_mlp": 1.04787958, + "epoch": 0.3053660003006163, + "flos": 12312907954560.0, + "grad_norm": 2.113994612729099, + "language_loss": 0.67216343, + "learning_rate": 3.255919884984307e-06, + "loss": 0.69394398, + "num_input_tokens_seen": 109115495, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5079, + "time_per_iteration": 2.4575493335723877 + }, + { + "auxiliary_loss_clip": 0.01130366, + "auxiliary_loss_mlp": 0.01034996, + "balance_loss_clip": 1.02034521, + "balance_loss_mlp": 1.04758203, + "epoch": 0.30542612355328425, + "flos": 23112287504640.0, + "grad_norm": 1.7438542216491464, + "language_loss": 0.80291754, + "learning_rate": 3.2556167643097477e-06, + "loss": 0.82457113, + "num_input_tokens_seen": 109134235, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5080, + "time_per_iteration": 2.490842342376709 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.01039719, + "balance_loss_clip": 1.02475858, + "balance_loss_mlp": 1.04612935, + "epoch": 0.3054862468059522, + "flos": 24389450461440.0, + "grad_norm": 2.2926909410882903, + "language_loss": 0.80971938, + "learning_rate": 3.255313596022074e-06, + "loss": 0.83141345, + "num_input_tokens_seen": 109152760, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5081, + "time_per_iteration": 2.5298712253570557 + }, + { + "auxiliary_loss_clip": 0.01129539, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.01952672, + "balance_loss_mlp": 1.04690182, + "epoch": 0.3055463700586202, + "flos": 29386096704000.0, + "grad_norm": 1.691443128795128, + "language_loss": 0.71810889, + "learning_rate": 3.255010380132783e-06, + "loss": 0.73975313, + "num_input_tokens_seen": 109173925, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5082, + "time_per_iteration": 2.5567750930786133 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.0468955, + "epoch": 0.30560649331128814, + "flos": 25591775431680.0, + "grad_norm": 1.9955003311475592, + "language_loss": 0.73615241, + "learning_rate": 3.2547071166533736e-06, + "loss": 0.75787055, + "num_input_tokens_seen": 109192510, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8671875, + "step": 5083, + "time_per_iteration": 2.5083980560302734 + }, + { + "auxiliary_loss_clip": 0.01129694, + "auxiliary_loss_mlp": 0.0103765, + "balance_loss_clip": 1.02184248, + "balance_loss_mlp": 1.04441404, + "epoch": 0.3056666165639561, + "flos": 19128321400320.0, + "grad_norm": 3.7957379738132517, + "language_loss": 0.70895267, + "learning_rate": 3.254403805595344e-06, + "loss": 0.73062611, + "num_input_tokens_seen": 109210885, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8515625, + "step": 5084, + "time_per_iteration": 2.477665424346924 + }, + { + "auxiliary_loss_clip": 0.01134775, + "auxiliary_loss_mlp": 0.01032514, + "balance_loss_clip": 1.01631355, + "balance_loss_mlp": 1.04818797, + "epoch": 0.30572673981662407, + "flos": 15523860441600.0, + "grad_norm": 2.0055460894973933, + "language_loss": 0.78791595, + "learning_rate": 3.2541004469701962e-06, + "loss": 0.80958885, + "num_input_tokens_seen": 109229180, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5085, + "time_per_iteration": 2.475783586502075 + }, + { + "auxiliary_loss_clip": 0.01127203, + "auxiliary_loss_mlp": 0.01037047, + "balance_loss_clip": 1.02187788, + "balance_loss_mlp": 1.04529142, + "epoch": 0.30578686306929204, + "flos": 21506541909120.0, + "grad_norm": 1.5510153728860234, + "language_loss": 0.77846372, + "learning_rate": 3.2537970407894342e-06, + "loss": 0.80010617, + "num_input_tokens_seen": 109249510, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5086, + "time_per_iteration": 2.514472007751465 + }, + { + "auxiliary_loss_clip": 0.01132639, + "auxiliary_loss_mlp": 0.01041993, + "balance_loss_clip": 1.02592945, + "balance_loss_mlp": 1.04930758, + "epoch": 0.30584698632196, + "flos": 20954271323520.0, + "grad_norm": 1.7256556540888637, + "language_loss": 0.77121228, + "learning_rate": 3.253493587064563e-06, + "loss": 0.79295856, + "num_input_tokens_seen": 109268200, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8359375, + "step": 5087, + "time_per_iteration": 2.4817616939544678 + }, + { + "auxiliary_loss_clip": 0.01132683, + "auxiliary_loss_mlp": 0.01040214, + "balance_loss_clip": 1.02346563, + "balance_loss_mlp": 1.04716742, + "epoch": 0.30590710957462797, + "flos": 24681116897280.0, + "grad_norm": 2.0600622883478517, + "language_loss": 0.72582048, + "learning_rate": 3.2531900858070885e-06, + "loss": 0.74754953, + "num_input_tokens_seen": 109288370, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.85546875, + "step": 5088, + "time_per_iteration": 2.538318395614624 + }, + { + "auxiliary_loss_clip": 0.01135035, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02300477, + "balance_loss_mlp": 1.04673004, + "epoch": 0.30596723282729593, + "flos": 17086907744640.0, + "grad_norm": 2.417480227404851, + "language_loss": 0.7889666, + "learning_rate": 3.252886537028521e-06, + "loss": 0.81070858, + "num_input_tokens_seen": 109306730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8828125, + "step": 5089, + "time_per_iteration": 2.4561989307403564 + }, + { + "auxiliary_loss_clip": 0.0113113, + "auxiliary_loss_mlp": 0.0103884, + "balance_loss_clip": 1.02328289, + "balance_loss_mlp": 1.04813027, + "epoch": 0.30602735607996395, + "flos": 22857106308480.0, + "grad_norm": 2.044405318996134, + "language_loss": 0.77061844, + "learning_rate": 3.2525829407403703e-06, + "loss": 0.79231811, + "num_input_tokens_seen": 109327360, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5090, + "time_per_iteration": 2.5215258598327637 + }, + { + "auxiliary_loss_clip": 0.01134524, + "auxiliary_loss_mlp": 0.01046182, + "balance_loss_clip": 1.02999353, + "balance_loss_mlp": 1.04693675, + "epoch": 0.3060874793326319, + "flos": 29861482227840.0, + "grad_norm": 1.7474050348479595, + "language_loss": 0.76481628, + "learning_rate": 3.2522792969541488e-06, + "loss": 0.78662336, + "num_input_tokens_seen": 109348135, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5091, + "time_per_iteration": 2.535468578338623 + }, + { + "auxiliary_loss_clip": 0.01133443, + "auxiliary_loss_mlp": 0.0103559, + "balance_loss_clip": 1.01955616, + "balance_loss_mlp": 1.04671383, + "epoch": 0.3061476025852999, + "flos": 20448577699200.0, + "grad_norm": 1.638842582319787, + "language_loss": 0.71933579, + "learning_rate": 3.2519756056813705e-06, + "loss": 0.7410261, + "num_input_tokens_seen": 109366220, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8671875, + "step": 5092, + "time_per_iteration": 2.512096405029297 + }, + { + "auxiliary_loss_clip": 0.01131454, + "auxiliary_loss_mlp": 0.01035497, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.04765177, + "epoch": 0.30620772583796785, + "flos": 19391475415680.0, + "grad_norm": 1.9362192703697652, + "language_loss": 0.8216877, + "learning_rate": 3.2516718669335522e-06, + "loss": 0.84335721, + "num_input_tokens_seen": 109385260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5093, + "time_per_iteration": 2.464477300643921 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02239954, + "balance_loss_mlp": 1.04639721, + "epoch": 0.3062678490906358, + "flos": 24024562151040.0, + "grad_norm": 1.6957020618246583, + "language_loss": 0.75365555, + "learning_rate": 3.2513680807222114e-06, + "loss": 0.77531368, + "num_input_tokens_seen": 109405025, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5094, + "time_per_iteration": 2.5149855613708496 + }, + { + "auxiliary_loss_clip": 0.01128293, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02464378, + "balance_loss_mlp": 1.04530072, + "epoch": 0.3063279723433038, + "flos": 19754639873280.0, + "grad_norm": 1.922814039194465, + "language_loss": 0.76033115, + "learning_rate": 3.251064247058868e-06, + "loss": 0.78201067, + "num_input_tokens_seen": 109422465, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5095, + "time_per_iteration": 5.438723802566528 + }, + { + "auxiliary_loss_clip": 0.01127363, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02325845, + "balance_loss_mlp": 1.04581833, + "epoch": 0.30638809559597174, + "flos": 22450022496000.0, + "grad_norm": 1.7577098515851188, + "language_loss": 0.8050971, + "learning_rate": 3.250760365955042e-06, + "loss": 0.82675582, + "num_input_tokens_seen": 109440575, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.81640625, + "step": 5096, + "time_per_iteration": 2.4706614017486572 + }, + { + "auxiliary_loss_clip": 0.01130131, + "auxiliary_loss_mlp": 0.01035159, + "balance_loss_clip": 1.02052069, + "balance_loss_mlp": 1.04556763, + "epoch": 0.3064482188486397, + "flos": 17165157523200.0, + "grad_norm": 2.0672553061960586, + "language_loss": 0.8209089, + "learning_rate": 3.250456437422258e-06, + "loss": 0.84256178, + "num_input_tokens_seen": 109459050, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5097, + "time_per_iteration": 2.457242250442505 + }, + { + "auxiliary_loss_clip": 0.0112984, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02227616, + "balance_loss_mlp": 1.04537082, + "epoch": 0.3065083421013077, + "flos": 23768483114880.0, + "grad_norm": 1.9081721986815667, + "language_loss": 0.77858478, + "learning_rate": 3.250152461472041e-06, + "loss": 0.80027401, + "num_input_tokens_seen": 109475860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5098, + "time_per_iteration": 2.4709839820861816 + }, + { + "auxiliary_loss_clip": 0.01128893, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02057385, + "balance_loss_mlp": 1.0466584, + "epoch": 0.30656846535397564, + "flos": 26431833784320.0, + "grad_norm": 1.9501450681008343, + "language_loss": 0.83948421, + "learning_rate": 3.249848438115917e-06, + "loss": 0.86113107, + "num_input_tokens_seen": 109494760, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5099, + "time_per_iteration": 2.537771224975586 + }, + { + "auxiliary_loss_clip": 0.01130145, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02653074, + "balance_loss_mlp": 1.04364753, + "epoch": 0.3066285886066436, + "flos": 26651786716800.0, + "grad_norm": 2.2273819247618376, + "language_loss": 0.85744429, + "learning_rate": 3.2495443673654148e-06, + "loss": 0.87916839, + "num_input_tokens_seen": 109516480, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5100, + "time_per_iteration": 2.5103259086608887 + }, + { + "auxiliary_loss_clip": 0.01129277, + "auxiliary_loss_mlp": 0.01038498, + "balance_loss_clip": 1.02259541, + "balance_loss_mlp": 1.04542243, + "epoch": 0.30668871185931157, + "flos": 15049947375360.0, + "grad_norm": 1.8863659276771934, + "language_loss": 0.79225194, + "learning_rate": 3.249240249232065e-06, + "loss": 0.81392968, + "num_input_tokens_seen": 109534615, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5101, + "time_per_iteration": 2.4733920097351074 + }, + { + "auxiliary_loss_clip": 0.01131914, + "auxiliary_loss_mlp": 0.01045873, + "balance_loss_clip": 1.02869534, + "balance_loss_mlp": 1.04708326, + "epoch": 0.30674883511197953, + "flos": 20082109190400.0, + "grad_norm": 1.7393564952665503, + "language_loss": 0.79405224, + "learning_rate": 3.2489360837273998e-06, + "loss": 0.81583011, + "num_input_tokens_seen": 109554040, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5102, + "time_per_iteration": 2.4608778953552246 + }, + { + "auxiliary_loss_clip": 0.01134414, + "auxiliary_loss_mlp": 0.01038608, + "balance_loss_clip": 1.02135825, + "balance_loss_mlp": 1.04940438, + "epoch": 0.30680895836464755, + "flos": 22893807029760.0, + "grad_norm": 1.7201607461659805, + "language_loss": 0.88999605, + "learning_rate": 3.2486318708629532e-06, + "loss": 0.9117263, + "num_input_tokens_seen": 109574345, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.84765625, + "step": 5103, + "time_per_iteration": 2.5295228958129883 + }, + { + "auxiliary_loss_clip": 0.01131581, + "auxiliary_loss_mlp": 0.01041048, + "balance_loss_clip": 1.02549076, + "balance_loss_mlp": 1.04700959, + "epoch": 0.3068690816173155, + "flos": 23696159080320.0, + "grad_norm": 1.6453097169103326, + "language_loss": 0.74079049, + "learning_rate": 3.2483276106502607e-06, + "loss": 0.76251674, + "num_input_tokens_seen": 109593670, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5104, + "time_per_iteration": 2.4923107624053955 + }, + { + "auxiliary_loss_clip": 0.01132054, + "auxiliary_loss_mlp": 0.01042794, + "balance_loss_clip": 1.02690291, + "balance_loss_mlp": 1.04555643, + "epoch": 0.3069292048699835, + "flos": 23551044134400.0, + "grad_norm": 1.8308515164246026, + "language_loss": 0.73333633, + "learning_rate": 3.2480233031008605e-06, + "loss": 0.75508481, + "num_input_tokens_seen": 109613385, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.86328125, + "step": 5105, + "time_per_iteration": 2.542391777038574 + }, + { + "auxiliary_loss_clip": 0.01131684, + "auxiliary_loss_mlp": 0.01047183, + "balance_loss_clip": 1.03058875, + "balance_loss_mlp": 1.04582942, + "epoch": 0.30698932812265145, + "flos": 24531656405760.0, + "grad_norm": 5.5167708582846515, + "language_loss": 0.8714695, + "learning_rate": 3.2477189482262916e-06, + "loss": 0.89325809, + "num_input_tokens_seen": 109632395, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.859375, + "step": 5106, + "time_per_iteration": 2.5054032802581787 + }, + { + "auxiliary_loss_clip": 0.01136079, + "auxiliary_loss_mlp": 0.01048019, + "balance_loss_clip": 1.03128242, + "balance_loss_mlp": 1.04750919, + "epoch": 0.3070494513753194, + "flos": 20996430912000.0, + "grad_norm": 2.142568748510771, + "language_loss": 0.71183497, + "learning_rate": 3.2474145460380945e-06, + "loss": 0.73367596, + "num_input_tokens_seen": 109651380, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.88671875, + "step": 5107, + "time_per_iteration": 2.4980053901672363 + }, + { + "auxiliary_loss_clip": 0.01125715, + "auxiliary_loss_mlp": 0.01050168, + "balance_loss_clip": 1.03372955, + "balance_loss_mlp": 1.04304433, + "epoch": 0.3071095746279874, + "flos": 19025940660480.0, + "grad_norm": 1.7923615416213727, + "language_loss": 0.72302651, + "learning_rate": 3.247110096547814e-06, + "loss": 0.74478543, + "num_input_tokens_seen": 109670240, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 5108, + "time_per_iteration": 2.4588091373443604 + }, + { + "auxiliary_loss_clip": 0.01129796, + "auxiliary_loss_mlp": 0.01039934, + "balance_loss_clip": 1.02435362, + "balance_loss_mlp": 1.04538584, + "epoch": 0.30716969788065535, + "flos": 21215521918080.0, + "grad_norm": 1.5361542639570684, + "language_loss": 0.85768104, + "learning_rate": 3.2468055997669926e-06, + "loss": 0.87937832, + "num_input_tokens_seen": 109690810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5109, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01129418, + "auxiliary_loss_mlp": 0.01036702, + "balance_loss_clip": 1.02176476, + "balance_loss_mlp": 1.04534364, + "epoch": 0.3072298211333233, + "flos": 25772765086080.0, + "grad_norm": 1.6710196569280569, + "language_loss": 0.67220587, + "learning_rate": 3.2465010557071788e-06, + "loss": 0.69386709, + "num_input_tokens_seen": 109711145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5110, + "time_per_iteration": 2.5019631385803223 + }, + { + "auxiliary_loss_clip": 0.01126741, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.0220511, + "balance_loss_mlp": 1.04472136, + "epoch": 0.3072899443859913, + "flos": 25848931875840.0, + "grad_norm": 1.5071731281437177, + "language_loss": 0.76981276, + "learning_rate": 3.246196464379919e-06, + "loss": 0.79144323, + "num_input_tokens_seen": 109731425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5111, + "time_per_iteration": 2.544111490249634 + }, + { + "auxiliary_loss_clip": 0.01130011, + "auxiliary_loss_mlp": 0.01040184, + "balance_loss_clip": 1.02486551, + "balance_loss_mlp": 1.04580235, + "epoch": 0.30735006763865924, + "flos": 25922800195200.0, + "grad_norm": 1.9077726149637915, + "language_loss": 0.67174292, + "learning_rate": 3.245891825796765e-06, + "loss": 0.69344485, + "num_input_tokens_seen": 109752720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84375, + "step": 5112, + "time_per_iteration": 2.5171637535095215 + }, + { + "auxiliary_loss_clip": 0.01136791, + "auxiliary_loss_mlp": 0.01041143, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.04846382, + "epoch": 0.3074101908913272, + "flos": 30917004312960.0, + "grad_norm": 1.8925702151041777, + "language_loss": 0.798181, + "learning_rate": 3.2455871399692678e-06, + "loss": 0.81996036, + "num_input_tokens_seen": 109772840, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.8828125, + "step": 5113, + "time_per_iteration": 2.55889892578125 + }, + { + "auxiliary_loss_clip": 0.01130603, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.04549623, + "epoch": 0.30747031414399517, + "flos": 18401058731520.0, + "grad_norm": 1.951625458848465, + "language_loss": 0.77243912, + "learning_rate": 3.2452824069089815e-06, + "loss": 0.79416221, + "num_input_tokens_seen": 109790150, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5114, + "time_per_iteration": 2.4328107833862305 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01037897, + "balance_loss_clip": 1.02079093, + "balance_loss_mlp": 1.04755759, + "epoch": 0.30753043739666314, + "flos": 22633166966400.0, + "grad_norm": 1.8985095809631356, + "language_loss": 0.62356925, + "learning_rate": 3.2449776266274623e-06, + "loss": 0.64527011, + "num_input_tokens_seen": 109807985, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.84765625, + "step": 5115, + "time_per_iteration": 2.480536699295044 + }, + { + "auxiliary_loss_clip": 0.01132859, + "auxiliary_loss_mlp": 0.01036205, + "balance_loss_clip": 1.02033865, + "balance_loss_mlp": 1.04663444, + "epoch": 0.3075905606493311, + "flos": 27344072517120.0, + "grad_norm": 3.0190652682973176, + "language_loss": 0.82743216, + "learning_rate": 3.2446727991362657e-06, + "loss": 0.84912288, + "num_input_tokens_seen": 109825920, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5116, + "time_per_iteration": 2.5121662616729736 + }, + { + "auxiliary_loss_clip": 0.01131907, + "auxiliary_loss_mlp": 0.0103869, + "balance_loss_clip": 1.02322841, + "balance_loss_mlp": 1.04825926, + "epoch": 0.3076506839019991, + "flos": 22090808534400.0, + "grad_norm": 1.8681947014951163, + "language_loss": 0.75772393, + "learning_rate": 3.244367924446952e-06, + "loss": 0.77942991, + "num_input_tokens_seen": 109846220, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5117, + "time_per_iteration": 2.48750376701355 + }, + { + "auxiliary_loss_clip": 0.0113581, + "auxiliary_loss_mlp": 0.01035582, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.05018401, + "epoch": 0.3077108071546671, + "flos": 21289533891840.0, + "grad_norm": 2.225887232792708, + "language_loss": 0.71873093, + "learning_rate": 3.2440630025710826e-06, + "loss": 0.74044484, + "num_input_tokens_seen": 109863870, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5118, + "time_per_iteration": 2.4745492935180664 + }, + { + "auxiliary_loss_clip": 0.01130971, + "auxiliary_loss_mlp": 0.01039981, + "balance_loss_clip": 1.02442479, + "balance_loss_mlp": 1.04630661, + "epoch": 0.30777093040733505, + "flos": 21430985650560.0, + "grad_norm": 1.5789952404099556, + "language_loss": 0.74312431, + "learning_rate": 3.243758033520219e-06, + "loss": 0.76483381, + "num_input_tokens_seen": 109883500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5119, + "time_per_iteration": 2.5185489654541016 + }, + { + "auxiliary_loss_clip": 0.01136122, + "auxiliary_loss_mlp": 0.01051479, + "balance_loss_clip": 1.03291845, + "balance_loss_mlp": 1.04891181, + "epoch": 0.307831053660003, + "flos": 23149275534720.0, + "grad_norm": 1.733023320063412, + "language_loss": 0.80267692, + "learning_rate": 3.243453017305926e-06, + "loss": 0.82455289, + "num_input_tokens_seen": 109904620, + "router_z_loss_clip": 0.18554688, + "router_z_loss_mlp": 0.875, + "step": 5120, + "time_per_iteration": 2.5592849254608154 + }, + { + "auxiliary_loss_clip": 0.01127219, + "auxiliary_loss_mlp": 0.01048202, + "balance_loss_clip": 1.03299093, + "balance_loss_mlp": 1.04384947, + "epoch": 0.307891176912671, + "flos": 17019755268480.0, + "grad_norm": 1.564134517039273, + "language_loss": 0.80110037, + "learning_rate": 3.24314795393977e-06, + "loss": 0.82285464, + "num_input_tokens_seen": 109922275, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5121, + "time_per_iteration": 2.440516948699951 + }, + { + "auxiliary_loss_clip": 0.0113076, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01981497, + "balance_loss_mlp": 1.0480212, + "epoch": 0.30795130016533895, + "flos": 27705046245120.0, + "grad_norm": 1.5001896125792977, + "language_loss": 0.82594395, + "learning_rate": 3.242842843433319e-06, + "loss": 0.84760171, + "num_input_tokens_seen": 109944265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5122, + "time_per_iteration": 2.510576009750366 + }, + { + "auxiliary_loss_clip": 0.01050329, + "auxiliary_loss_mlp": 0.01017411, + "balance_loss_clip": 1.01562333, + "balance_loss_mlp": 1.01982307, + "epoch": 0.3080114234180069, + "flos": 69058699591680.0, + "grad_norm": 0.7473381596642288, + "language_loss": 0.58639288, + "learning_rate": 3.242537685798143e-06, + "loss": 0.60707027, + "num_input_tokens_seen": 110014160, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.3046875, + "step": 5123, + "time_per_iteration": 3.2167654037475586 + }, + { + "auxiliary_loss_clip": 0.01134332, + "auxiliary_loss_mlp": 0.01036733, + "balance_loss_clip": 1.01917315, + "balance_loss_mlp": 1.04640436, + "epoch": 0.3080715466706749, + "flos": 24060221377920.0, + "grad_norm": 1.5767520801619384, + "language_loss": 0.83622873, + "learning_rate": 3.242232481045813e-06, + "loss": 0.85793942, + "num_input_tokens_seen": 110034865, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.87890625, + "step": 5124, + "time_per_iteration": 2.474625587463379 + }, + { + "auxiliary_loss_clip": 0.01135515, + "auxiliary_loss_mlp": 0.01039715, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.04945302, + "epoch": 0.30813166992334284, + "flos": 25848680480640.0, + "grad_norm": 1.8429802725909379, + "language_loss": 0.78703862, + "learning_rate": 3.2419272291879035e-06, + "loss": 0.80879092, + "num_input_tokens_seen": 110052930, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.859375, + "step": 5125, + "time_per_iteration": 2.5806493759155273 + }, + { + "auxiliary_loss_clip": 0.01134207, + "auxiliary_loss_mlp": 0.01037354, + "balance_loss_clip": 1.02050948, + "balance_loss_mlp": 1.04717779, + "epoch": 0.3081917931760108, + "flos": 20449619193600.0, + "grad_norm": 1.8928574451074776, + "language_loss": 0.6450479, + "learning_rate": 3.241621930235989e-06, + "loss": 0.66676342, + "num_input_tokens_seen": 110071765, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5126, + "time_per_iteration": 2.467099666595459 + }, + { + "auxiliary_loss_clip": 0.01129876, + "auxiliary_loss_mlp": 0.01039444, + "balance_loss_clip": 1.02367234, + "balance_loss_mlp": 1.04831636, + "epoch": 0.3082519164286788, + "flos": 22166257052160.0, + "grad_norm": 1.5538294270453243, + "language_loss": 0.86619091, + "learning_rate": 3.241316584201646e-06, + "loss": 0.88788408, + "num_input_tokens_seen": 110092660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.81640625, + "step": 5127, + "time_per_iteration": 2.543095111846924 + }, + { + "auxiliary_loss_clip": 0.01129649, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02439952, + "balance_loss_mlp": 1.04648781, + "epoch": 0.30831203968134674, + "flos": 28913404700160.0, + "grad_norm": 2.186420023793508, + "language_loss": 0.68816996, + "learning_rate": 3.2410111910964538e-06, + "loss": 0.70987189, + "num_input_tokens_seen": 110114960, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83203125, + "step": 5128, + "time_per_iteration": 2.525390863418579 + }, + { + "auxiliary_loss_clip": 0.01134323, + "auxiliary_loss_mlp": 0.01041963, + "balance_loss_clip": 1.02571476, + "balance_loss_mlp": 1.04763198, + "epoch": 0.3083721629340147, + "flos": 25667726739840.0, + "grad_norm": 1.801256837086347, + "language_loss": 0.71226776, + "learning_rate": 3.240705750931993e-06, + "loss": 0.7340306, + "num_input_tokens_seen": 110135750, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5129, + "time_per_iteration": 2.5417068004608154 + }, + { + "auxiliary_loss_clip": 0.01045915, + "auxiliary_loss_mlp": 0.01008464, + "balance_loss_clip": 1.00633001, + "balance_loss_mlp": 1.01580441, + "epoch": 0.3084322861866827, + "flos": 68212679581440.0, + "grad_norm": 0.9000157132793972, + "language_loss": 0.59171313, + "learning_rate": 3.240400263719846e-06, + "loss": 0.61225688, + "num_input_tokens_seen": 110189480, + "router_z_loss_clip": 0.0213623, + "router_z_loss_mlp": 0.30078125, + "step": 5130, + "time_per_iteration": 3.024799108505249 + }, + { + "auxiliary_loss_clip": 0.01135089, + "auxiliary_loss_mlp": 0.01038466, + "balance_loss_clip": 1.02233696, + "balance_loss_mlp": 1.0485276, + "epoch": 0.3084924094393507, + "flos": 20296495514880.0, + "grad_norm": 2.1422150520884773, + "language_loss": 0.72951442, + "learning_rate": 3.2400947294715957e-06, + "loss": 0.75124997, + "num_input_tokens_seen": 110206445, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8671875, + "step": 5131, + "time_per_iteration": 2.5145480632781982 + }, + { + "auxiliary_loss_clip": 0.01130631, + "auxiliary_loss_mlp": 0.01036573, + "balance_loss_clip": 1.02222049, + "balance_loss_mlp": 1.04737425, + "epoch": 0.30855253269201866, + "flos": 23949831905280.0, + "grad_norm": 1.759562546324366, + "language_loss": 0.71208251, + "learning_rate": 3.2397891481988303e-06, + "loss": 0.73375452, + "num_input_tokens_seen": 110226845, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5132, + "time_per_iteration": 2.4997506141662598 + }, + { + "auxiliary_loss_clip": 0.01128489, + "auxiliary_loss_mlp": 0.01040365, + "balance_loss_clip": 1.02580929, + "balance_loss_mlp": 1.04823279, + "epoch": 0.3086126559446866, + "flos": 19281876042240.0, + "grad_norm": 1.7072095629792627, + "language_loss": 0.8999784, + "learning_rate": 3.239483519913136e-06, + "loss": 0.92166698, + "num_input_tokens_seen": 110244095, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5133, + "time_per_iteration": 2.4972143173217773 + }, + { + "auxiliary_loss_clip": 0.01136466, + "auxiliary_loss_mlp": 0.01047935, + "balance_loss_clip": 1.03186607, + "balance_loss_mlp": 1.04911399, + "epoch": 0.3086727791973546, + "flos": 33760770019200.0, + "grad_norm": 1.8506383958840185, + "language_loss": 0.67226613, + "learning_rate": 3.239177844626102e-06, + "loss": 0.6941101, + "num_input_tokens_seen": 110264240, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.875, + "step": 5134, + "time_per_iteration": 2.5700669288635254 + }, + { + "auxiliary_loss_clip": 0.0113384, + "auxiliary_loss_mlp": 0.01047239, + "balance_loss_clip": 1.0317775, + "balance_loss_mlp": 1.04718161, + "epoch": 0.30873290245002255, + "flos": 16034151006720.0, + "grad_norm": 2.423009332179396, + "language_loss": 0.82865155, + "learning_rate": 3.2388721223493197e-06, + "loss": 0.85046244, + "num_input_tokens_seen": 110282450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5135, + "time_per_iteration": 2.4712367057800293 + }, + { + "auxiliary_loss_clip": 0.0104583, + "auxiliary_loss_mlp": 0.01004049, + "balance_loss_clip": 1.00197494, + "balance_loss_mlp": 1.015975, + "epoch": 0.3087930257026905, + "flos": 65048304055680.0, + "grad_norm": 0.7120747448350507, + "language_loss": 0.55243868, + "learning_rate": 3.2385663530943824e-06, + "loss": 0.57293749, + "num_input_tokens_seen": 110343715, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.29882812, + "step": 5136, + "time_per_iteration": 3.1432137489318848 + }, + { + "auxiliary_loss_clip": 0.01132561, + "auxiliary_loss_mlp": 0.01040613, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04724097, + "epoch": 0.3088531489553585, + "flos": 74738829824640.0, + "grad_norm": 1.9824711220984585, + "language_loss": 0.76057774, + "learning_rate": 3.2382605368728852e-06, + "loss": 0.78230941, + "num_input_tokens_seen": 110368430, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5137, + "time_per_iteration": 5.764686822891235 + }, + { + "auxiliary_loss_clip": 0.0113183, + "auxiliary_loss_mlp": 0.01037097, + "balance_loss_clip": 1.02310133, + "balance_loss_mlp": 1.04696631, + "epoch": 0.30891327220802645, + "flos": 21142300043520.0, + "grad_norm": 2.0179579208290264, + "language_loss": 0.79909992, + "learning_rate": 3.237954673696424e-06, + "loss": 0.8207891, + "num_input_tokens_seen": 110386735, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.84765625, + "step": 5138, + "time_per_iteration": 2.45621657371521 + }, + { + "auxiliary_loss_clip": 0.01132491, + "auxiliary_loss_mlp": 0.01042876, + "balance_loss_clip": 1.02666378, + "balance_loss_mlp": 1.04560494, + "epoch": 0.3089733954606944, + "flos": 25664494515840.0, + "grad_norm": 1.4272945699581137, + "language_loss": 0.81220984, + "learning_rate": 3.2376487635765983e-06, + "loss": 0.83396351, + "num_input_tokens_seen": 110406820, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.87109375, + "step": 5139, + "time_per_iteration": 2.5283203125 + }, + { + "auxiliary_loss_clip": 0.01137198, + "auxiliary_loss_mlp": 0.01042206, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3090335187133624, + "flos": 19427350124160.0, + "grad_norm": 2.1565991279061736, + "language_loss": 0.77528149, + "learning_rate": 3.2373428065250067e-06, + "loss": 0.79707557, + "num_input_tokens_seen": 110424225, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.89453125, + "step": 5140, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01129104, + "auxiliary_loss_mlp": 0.01044008, + "balance_loss_clip": 1.02920234, + "balance_loss_mlp": 1.04757929, + "epoch": 0.30909364196603034, + "flos": 20011329440640.0, + "grad_norm": 2.2023621297160156, + "language_loss": 0.78595555, + "learning_rate": 3.237036802553252e-06, + "loss": 0.80768663, + "num_input_tokens_seen": 110443310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5141, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01134378, + "auxiliary_loss_mlp": 0.01047349, + "balance_loss_clip": 1.03046894, + "balance_loss_mlp": 1.04716825, + "epoch": 0.3091537652186983, + "flos": 19677575243520.0, + "grad_norm": 2.127714885761315, + "language_loss": 0.87142885, + "learning_rate": 3.2367307516729377e-06, + "loss": 0.89324611, + "num_input_tokens_seen": 110460215, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.87109375, + "step": 5142, + "time_per_iteration": 2.4362974166870117 + }, + { + "auxiliary_loss_clip": 0.01131531, + "auxiliary_loss_mlp": 0.0104755, + "balance_loss_clip": 1.03220749, + "balance_loss_mlp": 1.04556274, + "epoch": 0.3092138884713663, + "flos": 17020042577280.0, + "grad_norm": 1.7972015737501748, + "language_loss": 0.7877624, + "learning_rate": 3.23642465389567e-06, + "loss": 0.80955315, + "num_input_tokens_seen": 110479385, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.859375, + "step": 5143, + "time_per_iteration": 2.459317445755005 + }, + { + "auxiliary_loss_clip": 0.01130331, + "auxiliary_loss_mlp": 0.01043432, + "balance_loss_clip": 1.02742219, + "balance_loss_mlp": 1.04593444, + "epoch": 0.3092740117240343, + "flos": 25009986844800.0, + "grad_norm": 1.9461458902951219, + "language_loss": 0.72098875, + "learning_rate": 3.236118509233055e-06, + "loss": 0.74272639, + "num_input_tokens_seen": 110499885, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5144, + "time_per_iteration": 2.4872243404388428 + }, + { + "auxiliary_loss_clip": 0.01132569, + "auxiliary_loss_mlp": 0.01040748, + "balance_loss_clip": 1.02418947, + "balance_loss_mlp": 1.04587483, + "epoch": 0.30933413497670226, + "flos": 25590410714880.0, + "grad_norm": 1.7305751805857612, + "language_loss": 0.74054307, + "learning_rate": 3.235812317696702e-06, + "loss": 0.76227629, + "num_input_tokens_seen": 110519690, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5145, + "time_per_iteration": 2.524683952331543 + }, + { + "auxiliary_loss_clip": 0.01132717, + "auxiliary_loss_mlp": 0.01045609, + "balance_loss_clip": 1.02951622, + "balance_loss_mlp": 1.04737079, + "epoch": 0.3093942582293702, + "flos": 24389665943040.0, + "grad_norm": 1.6607552662218326, + "language_loss": 0.76461762, + "learning_rate": 3.2355060792982224e-06, + "loss": 0.78640091, + "num_input_tokens_seen": 110540520, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8515625, + "step": 5146, + "time_per_iteration": 2.4848198890686035 + }, + { + "auxiliary_loss_clip": 0.01130265, + "auxiliary_loss_mlp": 0.01037136, + "balance_loss_clip": 1.02213407, + "balance_loss_mlp": 1.04672074, + "epoch": 0.3094543814820382, + "flos": 19646441130240.0, + "grad_norm": 2.385312171088194, + "language_loss": 0.66755533, + "learning_rate": 3.2351997940492286e-06, + "loss": 0.68922937, + "num_input_tokens_seen": 110557950, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8359375, + "step": 5147, + "time_per_iteration": 2.4861929416656494 + }, + { + "auxiliary_loss_clip": 0.01135751, + "auxiliary_loss_mlp": 0.01040015, + "balance_loss_clip": 1.02517319, + "balance_loss_mlp": 1.04931486, + "epoch": 0.30951450473470615, + "flos": 25663812157440.0, + "grad_norm": 2.0402709532397205, + "language_loss": 0.75148058, + "learning_rate": 3.2348934619613346e-06, + "loss": 0.77323824, + "num_input_tokens_seen": 110578215, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5148, + "time_per_iteration": 2.505180597305298 + }, + { + "auxiliary_loss_clip": 0.01139245, + "auxiliary_loss_mlp": 0.0104464, + "balance_loss_clip": 1.02815318, + "balance_loss_mlp": 1.04876494, + "epoch": 0.3095746279873741, + "flos": 12020415505920.0, + "grad_norm": 2.1288750992632677, + "language_loss": 0.72576058, + "learning_rate": 3.2345870830461567e-06, + "loss": 0.74759942, + "num_input_tokens_seen": 110592990, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.90625, + "step": 5149, + "time_per_iteration": 2.4605252742767334 + }, + { + "auxiliary_loss_clip": 0.01133233, + "auxiliary_loss_mlp": 0.01041255, + "balance_loss_clip": 1.02442312, + "balance_loss_mlp": 1.0457058, + "epoch": 0.3096347512400421, + "flos": 23623044946560.0, + "grad_norm": 2.112154456836484, + "language_loss": 0.84981489, + "learning_rate": 3.2342806573153132e-06, + "loss": 0.87155974, + "num_input_tokens_seen": 110612130, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.875, + "step": 5150, + "time_per_iteration": 2.4866578578948975 + }, + { + "auxiliary_loss_clip": 0.01131574, + "auxiliary_loss_mlp": 0.01041618, + "balance_loss_clip": 1.02515531, + "balance_loss_mlp": 1.04593086, + "epoch": 0.30969487449271005, + "flos": 22529313768960.0, + "grad_norm": 1.9529089609254688, + "language_loss": 0.79053164, + "learning_rate": 3.233974184780424e-06, + "loss": 0.81226349, + "num_input_tokens_seen": 110632045, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5151, + "time_per_iteration": 2.4936540126800537 + }, + { + "auxiliary_loss_clip": 0.01133842, + "auxiliary_loss_mlp": 0.01042555, + "balance_loss_clip": 1.02580595, + "balance_loss_mlp": 1.0471015, + "epoch": 0.309754997745378, + "flos": 15267925059840.0, + "grad_norm": 3.1311630498810774, + "language_loss": 0.67020154, + "learning_rate": 3.2336676654531084e-06, + "loss": 0.69196552, + "num_input_tokens_seen": 110649340, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5152, + "time_per_iteration": 2.429640054702759 + }, + { + "auxiliary_loss_clip": 0.01132623, + "auxiliary_loss_mlp": 0.01043705, + "balance_loss_clip": 1.0275166, + "balance_loss_mlp": 1.04688787, + "epoch": 0.309815120998046, + "flos": 26979291947520.0, + "grad_norm": 12.57465651148819, + "language_loss": 0.82058132, + "learning_rate": 3.2333610993449926e-06, + "loss": 0.84234464, + "num_input_tokens_seen": 110668450, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5153, + "time_per_iteration": 2.578856945037842 + }, + { + "auxiliary_loss_clip": 0.01133847, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02788973, + "balance_loss_mlp": 1.04822588, + "epoch": 0.30987524425071394, + "flos": 21143161969920.0, + "grad_norm": 1.7956706783057126, + "language_loss": 0.73902357, + "learning_rate": 3.2330544864676997e-06, + "loss": 0.76079118, + "num_input_tokens_seen": 110689410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5154, + "time_per_iteration": 2.5063655376434326 + }, + { + "auxiliary_loss_clip": 0.01133271, + "auxiliary_loss_mlp": 0.01039056, + "balance_loss_clip": 1.02287924, + "balance_loss_mlp": 1.04747653, + "epoch": 0.3099353675033819, + "flos": 15268284195840.0, + "grad_norm": 2.516871287947693, + "language_loss": 0.76051688, + "learning_rate": 3.232747826832858e-06, + "loss": 0.78224009, + "num_input_tokens_seen": 110707350, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5155, + "time_per_iteration": 2.4838123321533203 + }, + { + "auxiliary_loss_clip": 0.01135928, + "auxiliary_loss_mlp": 0.01042973, + "balance_loss_clip": 1.02701044, + "balance_loss_mlp": 1.04871869, + "epoch": 0.30999549075604993, + "flos": 15413794191360.0, + "grad_norm": 1.7492301646526522, + "language_loss": 0.7883296, + "learning_rate": 3.232441120452094e-06, + "loss": 0.81011862, + "num_input_tokens_seen": 110724910, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.87109375, + "step": 5156, + "time_per_iteration": 2.4420597553253174 + }, + { + "auxiliary_loss_clip": 0.01134302, + "auxiliary_loss_mlp": 0.01046544, + "balance_loss_clip": 1.02894902, + "balance_loss_mlp": 1.04688191, + "epoch": 0.3100556140087179, + "flos": 23184539712000.0, + "grad_norm": 3.007667649484548, + "language_loss": 0.75094402, + "learning_rate": 3.23213436733704e-06, + "loss": 0.77275252, + "num_input_tokens_seen": 110744010, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.875, + "step": 5157, + "time_per_iteration": 2.4922094345092773 + }, + { + "auxiliary_loss_clip": 0.01131262, + "auxiliary_loss_mlp": 0.01037688, + "balance_loss_clip": 1.02282262, + "balance_loss_mlp": 1.04701662, + "epoch": 0.31011573726138586, + "flos": 25742169676800.0, + "grad_norm": 1.583276716554569, + "language_loss": 0.69391131, + "learning_rate": 3.231827567499327e-06, + "loss": 0.71560085, + "num_input_tokens_seen": 110765835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5158, + "time_per_iteration": 2.5119874477386475 + }, + { + "auxiliary_loss_clip": 0.0113222, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.0260725, + "balance_loss_mlp": 1.04802489, + "epoch": 0.3101758605140538, + "flos": 20011329440640.0, + "grad_norm": 1.8674515495135584, + "language_loss": 0.84731698, + "learning_rate": 3.2315207209505896e-06, + "loss": 0.86904848, + "num_input_tokens_seen": 110784655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5159, + "time_per_iteration": 2.5553479194641113 + }, + { + "auxiliary_loss_clip": 0.01130577, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.0215224, + "balance_loss_mlp": 1.04617286, + "epoch": 0.3102359837667218, + "flos": 19135683688320.0, + "grad_norm": 1.6286624468626467, + "language_loss": 0.85222661, + "learning_rate": 3.231213827702462e-06, + "loss": 0.87390554, + "num_input_tokens_seen": 110802545, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5160, + "time_per_iteration": 2.4521608352661133 + }, + { + "auxiliary_loss_clip": 0.01131067, + "auxiliary_loss_mlp": 0.01039214, + "balance_loss_clip": 1.02385354, + "balance_loss_mlp": 1.04720986, + "epoch": 0.31029610701938976, + "flos": 22265405568000.0, + "grad_norm": 2.1323719792042404, + "language_loss": 0.76438844, + "learning_rate": 3.230906887766584e-06, + "loss": 0.78609127, + "num_input_tokens_seen": 110820265, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5161, + "time_per_iteration": 2.4705073833465576 + }, + { + "auxiliary_loss_clip": 0.01133183, + "auxiliary_loss_mlp": 0.01040129, + "balance_loss_clip": 1.02420259, + "balance_loss_mlp": 1.04661226, + "epoch": 0.3103562302720577, + "flos": 20805349536000.0, + "grad_norm": 1.9681741891595628, + "language_loss": 0.81644946, + "learning_rate": 3.2305999011545924e-06, + "loss": 0.83818257, + "num_input_tokens_seen": 110836195, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5162, + "time_per_iteration": 2.4359090328216553 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01037231, + "balance_loss_clip": 1.0231998, + "balance_loss_mlp": 1.04580498, + "epoch": 0.3104163535247257, + "flos": 22344158136960.0, + "grad_norm": 1.6668116654420786, + "language_loss": 0.82879269, + "learning_rate": 3.2302928678781295e-06, + "loss": 0.85046029, + "num_input_tokens_seen": 110856420, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8359375, + "step": 5163, + "time_per_iteration": 2.536198854446411 + }, + { + "auxiliary_loss_clip": 0.01134589, + "auxiliary_loss_mlp": 0.01042558, + "balance_loss_clip": 1.02670264, + "balance_loss_mlp": 1.04848182, + "epoch": 0.31047647677739365, + "flos": 21689363157120.0, + "grad_norm": 1.61479678935284, + "language_loss": 0.76103258, + "learning_rate": 3.2299857879488376e-06, + "loss": 0.78280413, + "num_input_tokens_seen": 110876650, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5164, + "time_per_iteration": 2.4736320972442627 + }, + { + "auxiliary_loss_clip": 0.01134485, + "auxiliary_loss_mlp": 0.01041252, + "balance_loss_clip": 1.02492666, + "balance_loss_mlp": 1.04932189, + "epoch": 0.3105366000300616, + "flos": 18917275040640.0, + "grad_norm": 1.73414256762253, + "language_loss": 0.74515426, + "learning_rate": 3.2296786613783626e-06, + "loss": 0.76691169, + "num_input_tokens_seen": 110894445, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8515625, + "step": 5165, + "time_per_iteration": 2.4788122177124023 + }, + { + "auxiliary_loss_clip": 0.01132367, + "auxiliary_loss_mlp": 0.01042006, + "balance_loss_clip": 1.02627063, + "balance_loss_mlp": 1.0472759, + "epoch": 0.3105967232827296, + "flos": 18260397072000.0, + "grad_norm": 2.461614607097325, + "language_loss": 0.75987816, + "learning_rate": 3.229371488178348e-06, + "loss": 0.78162187, + "num_input_tokens_seen": 110912855, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5166, + "time_per_iteration": 2.4461371898651123 + }, + { + "auxiliary_loss_clip": 0.01133631, + "auxiliary_loss_mlp": 0.01045635, + "balance_loss_clip": 1.02939892, + "balance_loss_mlp": 1.04844868, + "epoch": 0.31065684653539755, + "flos": 17672144037120.0, + "grad_norm": 2.4324780660218557, + "language_loss": 0.73424876, + "learning_rate": 3.229064268360444e-06, + "loss": 0.75604147, + "num_input_tokens_seen": 110928025, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8515625, + "step": 5167, + "time_per_iteration": 2.4301631450653076 + }, + { + "auxiliary_loss_clip": 0.01047334, + "auxiliary_loss_mlp": 0.01006703, + "balance_loss_clip": 1.00467682, + "balance_loss_mlp": 1.01844001, + "epoch": 0.3107169697880655, + "flos": 68531996511360.0, + "grad_norm": 0.725291341239906, + "language_loss": 0.53031516, + "learning_rate": 3.2287570019362997e-06, + "loss": 0.55085552, + "num_input_tokens_seen": 110992215, + "router_z_loss_clip": 0.02026367, + "router_z_loss_mlp": 0.2890625, + "step": 5168, + "time_per_iteration": 3.1146020889282227 + }, + { + "auxiliary_loss_clip": 0.01133751, + "auxiliary_loss_mlp": 0.01043639, + "balance_loss_clip": 1.0269258, + "balance_loss_mlp": 1.0465318, + "epoch": 0.3107770930407335, + "flos": 13188733274880.0, + "grad_norm": 1.782356602828545, + "language_loss": 0.78745592, + "learning_rate": 3.2284496889175668e-06, + "loss": 0.80922985, + "num_input_tokens_seen": 111010400, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.87109375, + "step": 5169, + "time_per_iteration": 2.4755852222442627 + }, + { + "auxiliary_loss_clip": 0.01132974, + "auxiliary_loss_mlp": 0.01038846, + "balance_loss_clip": 1.02337217, + "balance_loss_mlp": 1.04640126, + "epoch": 0.3108372162934015, + "flos": 31580849520000.0, + "grad_norm": 1.536235209485244, + "language_loss": 0.6414057, + "learning_rate": 3.2281423293158986e-06, + "loss": 0.66312397, + "num_input_tokens_seen": 111033960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5170, + "time_per_iteration": 2.5690839290618896 + }, + { + "auxiliary_loss_clip": 0.01133399, + "auxiliary_loss_mlp": 0.01042243, + "balance_loss_clip": 1.02635252, + "balance_loss_mlp": 1.04721069, + "epoch": 0.31089733954606946, + "flos": 28729829266560.0, + "grad_norm": 2.41080559035864, + "language_loss": 0.77698815, + "learning_rate": 3.22783492314295e-06, + "loss": 0.79874456, + "num_input_tokens_seen": 111053265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.86328125, + "step": 5171, + "time_per_iteration": 2.558258295059204 + }, + { + "auxiliary_loss_clip": 0.01132946, + "auxiliary_loss_mlp": 0.01053954, + "balance_loss_clip": 1.03769374, + "balance_loss_mlp": 1.04645526, + "epoch": 0.3109574627987374, + "flos": 19683249592320.0, + "grad_norm": 1.9319520361735263, + "language_loss": 0.83802366, + "learning_rate": 3.2275274704103785e-06, + "loss": 0.85989261, + "num_input_tokens_seen": 111071130, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5172, + "time_per_iteration": 2.4601597785949707 + }, + { + "auxiliary_loss_clip": 0.01133186, + "auxiliary_loss_mlp": 0.01045771, + "balance_loss_clip": 1.02948654, + "balance_loss_mlp": 1.0467186, + "epoch": 0.3110175860514054, + "flos": 14683981656960.0, + "grad_norm": 1.9586589765002733, + "language_loss": 0.84225619, + "learning_rate": 3.227219971129842e-06, + "loss": 0.86404574, + "num_input_tokens_seen": 111089560, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8671875, + "step": 5173, + "time_per_iteration": 2.501840591430664 + }, + { + "auxiliary_loss_clip": 0.01128358, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02038455, + "balance_loss_mlp": 1.04595959, + "epoch": 0.31107770930407336, + "flos": 25739655724800.0, + "grad_norm": 1.622637298809784, + "language_loss": 0.83323705, + "learning_rate": 3.226912425313001e-06, + "loss": 0.85486829, + "num_input_tokens_seen": 111109960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5174, + "time_per_iteration": 2.507127285003662 + }, + { + "auxiliary_loss_clip": 0.01131648, + "auxiliary_loss_mlp": 0.01047063, + "balance_loss_clip": 1.03155434, + "balance_loss_mlp": 1.04670012, + "epoch": 0.3111378325567413, + "flos": 19208259118080.0, + "grad_norm": 2.3340025504670003, + "language_loss": 0.84681082, + "learning_rate": 3.2266048329715183e-06, + "loss": 0.86859798, + "num_input_tokens_seen": 111127960, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5175, + "time_per_iteration": 2.4853246212005615 + }, + { + "auxiliary_loss_clip": 0.01133689, + "auxiliary_loss_mlp": 0.01047203, + "balance_loss_clip": 1.03029919, + "balance_loss_mlp": 1.04996502, + "epoch": 0.3111979558094093, + "flos": 23696374561920.0, + "grad_norm": 1.6466695594130172, + "language_loss": 0.83448446, + "learning_rate": 3.2262971941170575e-06, + "loss": 0.85629338, + "num_input_tokens_seen": 111146730, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8359375, + "step": 5176, + "time_per_iteration": 2.4759509563446045 + }, + { + "auxiliary_loss_clip": 0.01128858, + "auxiliary_loss_mlp": 0.01044446, + "balance_loss_clip": 1.02836514, + "balance_loss_mlp": 1.04442942, + "epoch": 0.31125807906207725, + "flos": 21033023892480.0, + "grad_norm": 1.7899579393784935, + "language_loss": 0.80820966, + "learning_rate": 3.2259895087612837e-06, + "loss": 0.8299427, + "num_input_tokens_seen": 111166295, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5177, + "time_per_iteration": 2.5106611251831055 + }, + { + "auxiliary_loss_clip": 0.0113295, + "auxiliary_loss_mlp": 0.01041813, + "balance_loss_clip": 1.02629185, + "balance_loss_mlp": 1.048877, + "epoch": 0.3113182023147452, + "flos": 23076628277760.0, + "grad_norm": 1.9871899212943351, + "language_loss": 0.80703342, + "learning_rate": 3.2256817769158657e-06, + "loss": 0.82878101, + "num_input_tokens_seen": 111185665, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5178, + "time_per_iteration": 4.0482330322265625 + }, + { + "auxiliary_loss_clip": 0.01131397, + "auxiliary_loss_mlp": 0.01048541, + "balance_loss_clip": 1.03310347, + "balance_loss_mlp": 1.04518402, + "epoch": 0.3113783255674132, + "flos": 11838994888320.0, + "grad_norm": 1.8347450184704097, + "language_loss": 0.81340981, + "learning_rate": 3.225373998592471e-06, + "loss": 0.83520925, + "num_input_tokens_seen": 111201615, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5179, + "time_per_iteration": 3.82991886138916 + }, + { + "auxiliary_loss_clip": 0.01132507, + "auxiliary_loss_mlp": 0.01049787, + "balance_loss_clip": 1.0338006, + "balance_loss_mlp": 1.04824936, + "epoch": 0.31143844882008115, + "flos": 16289547684480.0, + "grad_norm": 1.599561013411363, + "language_loss": 0.78199375, + "learning_rate": 3.2250661738027715e-06, + "loss": 0.8038168, + "num_input_tokens_seen": 111220515, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5180, + "time_per_iteration": 2.4656291007995605 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01035311, + "balance_loss_clip": 1.01915836, + "balance_loss_mlp": 1.04672408, + "epoch": 0.3114985720727491, + "flos": 23217792727680.0, + "grad_norm": 1.6380256774064115, + "language_loss": 0.83046079, + "learning_rate": 3.22475830255844e-06, + "loss": 0.85212088, + "num_input_tokens_seen": 111240395, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5181, + "time_per_iteration": 2.5661914348602295 + }, + { + "auxiliary_loss_clip": 0.01128181, + "auxiliary_loss_mlp": 0.01043679, + "balance_loss_clip": 1.02903986, + "balance_loss_mlp": 1.0464232, + "epoch": 0.3115586953254171, + "flos": 30044626698240.0, + "grad_norm": 1.700886032828765, + "language_loss": 0.74084079, + "learning_rate": 3.2244503848711516e-06, + "loss": 0.76255929, + "num_input_tokens_seen": 111261100, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5182, + "time_per_iteration": 2.5913209915161133 + }, + { + "auxiliary_loss_clip": 0.01136348, + "auxiliary_loss_mlp": 0.01050649, + "balance_loss_clip": 1.03479409, + "balance_loss_mlp": 1.04858768, + "epoch": 0.3116188185780851, + "flos": 25666326109440.0, + "grad_norm": 1.8010906920491343, + "language_loss": 0.70658493, + "learning_rate": 3.2241424207525815e-06, + "loss": 0.72845489, + "num_input_tokens_seen": 111281320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.875, + "step": 5183, + "time_per_iteration": 2.4991438388824463 + }, + { + "auxiliary_loss_clip": 0.01045533, + "auxiliary_loss_mlp": 0.01014757, + "balance_loss_clip": 1.01301634, + "balance_loss_mlp": 1.01690507, + "epoch": 0.31167894183075306, + "flos": 69510058917120.0, + "grad_norm": 0.9414003998762589, + "language_loss": 0.59602594, + "learning_rate": 3.223834410214408e-06, + "loss": 0.61662877, + "num_input_tokens_seen": 111341405, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.28515625, + "step": 5184, + "time_per_iteration": 3.0754520893096924 + }, + { + "auxiliary_loss_clip": 0.01130364, + "auxiliary_loss_mlp": 0.01047375, + "balance_loss_clip": 1.03264058, + "balance_loss_mlp": 1.04596519, + "epoch": 0.31173906508342103, + "flos": 14939845211520.0, + "grad_norm": 2.811836993883612, + "language_loss": 0.69750082, + "learning_rate": 3.223526353268311e-06, + "loss": 0.71927822, + "num_input_tokens_seen": 111358975, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5185, + "time_per_iteration": 2.435033082962036 + }, + { + "auxiliary_loss_clip": 0.01136749, + "auxiliary_loss_mlp": 0.01048147, + "balance_loss_clip": 1.0323875, + "balance_loss_mlp": 1.05073345, + "epoch": 0.311799188336089, + "flos": 16176033728640.0, + "grad_norm": 2.346024133586612, + "language_loss": 0.63920057, + "learning_rate": 3.2232182499259725e-06, + "loss": 0.66104954, + "num_input_tokens_seen": 111375845, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5186, + "time_per_iteration": 2.463900327682495 + }, + { + "auxiliary_loss_clip": 0.01137188, + "auxiliary_loss_mlp": 0.01049347, + "balance_loss_clip": 1.03219295, + "balance_loss_mlp": 1.04886758, + "epoch": 0.31185931158875696, + "flos": 25009627708800.0, + "grad_norm": 2.108066194391345, + "language_loss": 0.86249322, + "learning_rate": 3.2229101001990747e-06, + "loss": 0.88435853, + "num_input_tokens_seen": 111394150, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5187, + "time_per_iteration": 2.4854979515075684 + }, + { + "auxiliary_loss_clip": 0.01129847, + "auxiliary_loss_mlp": 0.0104672, + "balance_loss_clip": 1.03048384, + "balance_loss_mlp": 1.0451926, + "epoch": 0.3119194348414249, + "flos": 37232901273600.0, + "grad_norm": 1.7445298378798078, + "language_loss": 0.62983185, + "learning_rate": 3.2226019040993036e-06, + "loss": 0.6515975, + "num_input_tokens_seen": 111418355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5188, + "time_per_iteration": 2.6161019802093506 + }, + { + "auxiliary_loss_clip": 0.01135744, + "auxiliary_loss_mlp": 0.01045566, + "balance_loss_clip": 1.02961564, + "balance_loss_mlp": 1.05116081, + "epoch": 0.3119795580940929, + "flos": 15012779777280.0, + "grad_norm": 2.1633857437120256, + "language_loss": 0.8347863, + "learning_rate": 3.222293661638346e-06, + "loss": 0.85659939, + "num_input_tokens_seen": 111435445, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5189, + "time_per_iteration": 2.4360432624816895 + }, + { + "auxiliary_loss_clip": 0.01129905, + "auxiliary_loss_mlp": 0.01036753, + "balance_loss_clip": 1.0213753, + "balance_loss_mlp": 1.04657507, + "epoch": 0.31203968134676086, + "flos": 15998168557440.0, + "grad_norm": 1.6712014044776404, + "language_loss": 0.7916308, + "learning_rate": 3.22198537282789e-06, + "loss": 0.81329739, + "num_input_tokens_seen": 111453430, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83203125, + "step": 5190, + "time_per_iteration": 2.472668170928955 + }, + { + "auxiliary_loss_clip": 0.01133914, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.01986194, + "balance_loss_mlp": 1.04946673, + "epoch": 0.3120998045994288, + "flos": 23837359443840.0, + "grad_norm": 1.4545499288259176, + "language_loss": 0.75318813, + "learning_rate": 3.2216770376796262e-06, + "loss": 0.77487987, + "num_input_tokens_seen": 111475325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5191, + "time_per_iteration": 2.486673355102539 + }, + { + "auxiliary_loss_clip": 0.01049091, + "auxiliary_loss_mlp": 0.01002214, + "balance_loss_clip": 1.00025892, + "balance_loss_mlp": 1.02067924, + "epoch": 0.3121599278520968, + "flos": 69184205712000.0, + "grad_norm": 0.8451593954944295, + "language_loss": 0.63957787, + "learning_rate": 3.221368656205247e-06, + "loss": 0.66009092, + "num_input_tokens_seen": 111533960, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.28515625, + "step": 5192, + "time_per_iteration": 3.1464638710021973 + }, + { + "auxiliary_loss_clip": 0.01134311, + "auxiliary_loss_mlp": 0.01041004, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04795599, + "epoch": 0.31222005110476475, + "flos": 23806368984960.0, + "grad_norm": 1.6164756923867671, + "language_loss": 0.80154347, + "learning_rate": 3.221060228416446e-06, + "loss": 0.82329667, + "num_input_tokens_seen": 111554055, + "router_z_loss_clip": 0.17382812, + "router_z_loss_mlp": 0.86328125, + "step": 5193, + "time_per_iteration": 2.5156989097595215 + }, + { + "auxiliary_loss_clip": 0.01131615, + "auxiliary_loss_mlp": 0.01042627, + "balance_loss_clip": 1.02610445, + "balance_loss_mlp": 1.045856, + "epoch": 0.3122801743574327, + "flos": 25226132935680.0, + "grad_norm": 1.8140889441731107, + "language_loss": 0.72050476, + "learning_rate": 3.2207517543249183e-06, + "loss": 0.74224722, + "num_input_tokens_seen": 111574305, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.85546875, + "step": 5194, + "time_per_iteration": 2.519972801208496 + }, + { + "auxiliary_loss_clip": 0.01133223, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02471924, + "balance_loss_mlp": 1.04870749, + "epoch": 0.3123402976101007, + "flos": 22966490200320.0, + "grad_norm": 1.3544515008303952, + "language_loss": 0.76475823, + "learning_rate": 3.2204432339423616e-06, + "loss": 0.78648859, + "num_input_tokens_seen": 111595680, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5195, + "time_per_iteration": 2.512247323989868 + }, + { + "auxiliary_loss_clip": 0.01131656, + "auxiliary_loss_mlp": 0.01042642, + "balance_loss_clip": 1.02718091, + "balance_loss_mlp": 1.0449183, + "epoch": 0.3124004208627687, + "flos": 25192089820800.0, + "grad_norm": 1.3526234536893298, + "language_loss": 0.7817502, + "learning_rate": 3.220134667280476e-06, + "loss": 0.80349314, + "num_input_tokens_seen": 111618135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5196, + "time_per_iteration": 2.528002977371216 + }, + { + "auxiliary_loss_clip": 0.01044386, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99860841, + "balance_loss_mlp": 1.01643729, + "epoch": 0.31246054411543667, + "flos": 67485165517440.0, + "grad_norm": 0.7752479618797538, + "language_loss": 0.54834789, + "learning_rate": 3.2198260543509613e-06, + "loss": 0.56879622, + "num_input_tokens_seen": 111682220, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.27929688, + "step": 5197, + "time_per_iteration": 3.0728254318237305 + }, + { + "auxiliary_loss_clip": 0.01130689, + "auxiliary_loss_mlp": 0.01038137, + "balance_loss_clip": 1.02328372, + "balance_loss_mlp": 1.0477525, + "epoch": 0.31252066736810463, + "flos": 17858520731520.0, + "grad_norm": 1.6543672060788046, + "language_loss": 0.66300559, + "learning_rate": 3.21951739516552e-06, + "loss": 0.68469381, + "num_input_tokens_seen": 111700815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.828125, + "step": 5198, + "time_per_iteration": 2.4312028884887695 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037705, + "balance_loss_clip": 1.02156413, + "balance_loss_mlp": 1.0472604, + "epoch": 0.3125807906207726, + "flos": 18475034791680.0, + "grad_norm": 2.083859755504136, + "language_loss": 0.69763082, + "learning_rate": 3.219208689735857e-06, + "loss": 0.71935886, + "num_input_tokens_seen": 111718195, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.875, + "step": 5199, + "time_per_iteration": 2.454464912414551 + }, + { + "auxiliary_loss_clip": 0.01131797, + "auxiliary_loss_mlp": 0.01049575, + "balance_loss_clip": 1.0336132, + "balance_loss_mlp": 1.04692471, + "epoch": 0.31264091387344056, + "flos": 18946541646720.0, + "grad_norm": 1.8982997112015956, + "language_loss": 0.79004937, + "learning_rate": 3.2188999380736785e-06, + "loss": 0.81186306, + "num_input_tokens_seen": 111734440, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.84765625, + "step": 5200, + "time_per_iteration": 2.4382827281951904 + }, + { + "auxiliary_loss_clip": 0.01127793, + "auxiliary_loss_mlp": 0.01036846, + "balance_loss_clip": 1.02187347, + "balance_loss_mlp": 1.04621911, + "epoch": 0.3127010371261085, + "flos": 21468512384640.0, + "grad_norm": 2.042457973745699, + "language_loss": 0.83946276, + "learning_rate": 3.2185911401906917e-06, + "loss": 0.86110914, + "num_input_tokens_seen": 111751960, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5201, + "time_per_iteration": 2.475511074066162 + }, + { + "auxiliary_loss_clip": 0.01134303, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.02990484, + "balance_loss_mlp": 1.04985881, + "epoch": 0.3127611603787765, + "flos": 15336047203200.0, + "grad_norm": 2.37604325800411, + "language_loss": 0.69560832, + "learning_rate": 3.2182822960986072e-06, + "loss": 0.71741533, + "num_input_tokens_seen": 111769585, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84375, + "step": 5202, + "time_per_iteration": 2.4265501499176025 + }, + { + "auxiliary_loss_clip": 0.01133329, + "auxiliary_loss_mlp": 0.01041339, + "balance_loss_clip": 1.02737963, + "balance_loss_mlp": 1.04759419, + "epoch": 0.31282128363144446, + "flos": 17602980399360.0, + "grad_norm": 1.800546738819683, + "language_loss": 0.84001613, + "learning_rate": 3.2179734058091358e-06, + "loss": 0.86176282, + "num_input_tokens_seen": 111787880, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.85546875, + "step": 5203, + "time_per_iteration": 2.480233907699585 + }, + { + "auxiliary_loss_clip": 0.01131997, + "auxiliary_loss_mlp": 0.01047163, + "balance_loss_clip": 1.03176749, + "balance_loss_mlp": 1.04697657, + "epoch": 0.3128814068841124, + "flos": 26756753235840.0, + "grad_norm": 2.9129021624211417, + "language_loss": 0.60623944, + "learning_rate": 3.2176644693339913e-06, + "loss": 0.62803102, + "num_input_tokens_seen": 111805950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8515625, + "step": 5204, + "time_per_iteration": 2.50688099861145 + }, + { + "auxiliary_loss_clip": 0.01129885, + "auxiliary_loss_mlp": 0.01041088, + "balance_loss_clip": 1.02672338, + "balance_loss_mlp": 1.04707503, + "epoch": 0.3129415301367804, + "flos": 22272372806400.0, + "grad_norm": 1.6006708998064776, + "language_loss": 0.65964866, + "learning_rate": 3.217355486684887e-06, + "loss": 0.68135834, + "num_input_tokens_seen": 111826135, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5205, + "time_per_iteration": 2.4824163913726807 + }, + { + "auxiliary_loss_clip": 0.01132532, + "auxiliary_loss_mlp": 0.01043219, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.0476222, + "epoch": 0.31300165338944835, + "flos": 26464907232000.0, + "grad_norm": 1.9498647702732133, + "language_loss": 0.76618874, + "learning_rate": 3.2170464578735414e-06, + "loss": 0.78794622, + "num_input_tokens_seen": 111844700, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84765625, + "step": 5206, + "time_per_iteration": 2.4947307109832764 + }, + { + "auxiliary_loss_clip": 0.0112786, + "auxiliary_loss_mlp": 0.01039372, + "balance_loss_clip": 1.02416039, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3130617766421163, + "flos": 21944652094080.0, + "grad_norm": 3.088705810465425, + "language_loss": 0.83287984, + "learning_rate": 3.216737382911672e-06, + "loss": 0.85455215, + "num_input_tokens_seen": 111861585, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5207, + "time_per_iteration": 2.4767825603485107 + }, + { + "auxiliary_loss_clip": 0.01128039, + "auxiliary_loss_mlp": 0.01041894, + "balance_loss_clip": 1.02784562, + "balance_loss_mlp": 1.04694057, + "epoch": 0.3131218998947843, + "flos": 23292774368640.0, + "grad_norm": 1.5219202808663073, + "language_loss": 0.71293664, + "learning_rate": 3.216428261810999e-06, + "loss": 0.73463601, + "num_input_tokens_seen": 111882950, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5208, + "time_per_iteration": 2.4853296279907227 + }, + { + "auxiliary_loss_clip": 0.01133198, + "auxiliary_loss_mlp": 0.01041056, + "balance_loss_clip": 1.02534437, + "balance_loss_mlp": 1.04957032, + "epoch": 0.3131820231474523, + "flos": 21139642437120.0, + "grad_norm": 1.8332946649412374, + "language_loss": 0.74547577, + "learning_rate": 3.2161190945832445e-06, + "loss": 0.76721835, + "num_input_tokens_seen": 111901640, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5209, + "time_per_iteration": 2.5162742137908936 + }, + { + "auxiliary_loss_clip": 0.0113008, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02695489, + "balance_loss_mlp": 1.04557538, + "epoch": 0.31324214640012027, + "flos": 23909863046400.0, + "grad_norm": 1.818845882779476, + "language_loss": 0.77656835, + "learning_rate": 3.2158098812401325e-06, + "loss": 0.79827774, + "num_input_tokens_seen": 111919615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84375, + "step": 5210, + "time_per_iteration": 2.4701180458068848 + }, + { + "auxiliary_loss_clip": 0.01125909, + "auxiliary_loss_mlp": 0.0103947, + "balance_loss_clip": 1.02443743, + "balance_loss_mlp": 1.04593706, + "epoch": 0.31330226965278823, + "flos": 22236929061120.0, + "grad_norm": 1.8627745841798442, + "language_loss": 0.79177994, + "learning_rate": 3.2155006217933874e-06, + "loss": 0.81343371, + "num_input_tokens_seen": 111938485, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 5211, + "time_per_iteration": 2.482102870941162 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02448201, + "balance_loss_mlp": 1.04849112, + "epoch": 0.3133623929054562, + "flos": 19753993428480.0, + "grad_norm": 1.64859412039223, + "language_loss": 0.79837513, + "learning_rate": 3.2151913162547367e-06, + "loss": 0.82005984, + "num_input_tokens_seen": 111956425, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5212, + "time_per_iteration": 2.460986852645874 + }, + { + "auxiliary_loss_clip": 0.01133278, + "auxiliary_loss_mlp": 0.01049778, + "balance_loss_clip": 1.03395939, + "balance_loss_mlp": 1.04740417, + "epoch": 0.31342251615812416, + "flos": 27162256849920.0, + "grad_norm": 2.096287390218497, + "language_loss": 0.71467483, + "learning_rate": 3.2148819646359097e-06, + "loss": 0.73650539, + "num_input_tokens_seen": 111975915, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5213, + "time_per_iteration": 2.5129754543304443 + }, + { + "auxiliary_loss_clip": 0.01135204, + "auxiliary_loss_mlp": 0.01041521, + "balance_loss_clip": 1.02660799, + "balance_loss_mlp": 1.05014026, + "epoch": 0.31348263941079213, + "flos": 20229809915520.0, + "grad_norm": 5.183832853627301, + "language_loss": 0.77595121, + "learning_rate": 3.2145725669486374e-06, + "loss": 0.79771841, + "num_input_tokens_seen": 111995055, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5214, + "time_per_iteration": 2.453228712081909 + }, + { + "auxiliary_loss_clip": 0.01126524, + "auxiliary_loss_mlp": 0.01034893, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.04599309, + "epoch": 0.3135427626634601, + "flos": 24607643627520.0, + "grad_norm": 1.6576138068605464, + "language_loss": 0.82562625, + "learning_rate": 3.2142631232046517e-06, + "loss": 0.84724051, + "num_input_tokens_seen": 112015830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5215, + "time_per_iteration": 2.544684886932373 + }, + { + "auxiliary_loss_clip": 0.01131802, + "auxiliary_loss_mlp": 0.01037959, + "balance_loss_clip": 1.02242613, + "balance_loss_mlp": 1.04732776, + "epoch": 0.31360288591612806, + "flos": 20959873845120.0, + "grad_norm": 2.510877303679677, + "language_loss": 0.79557931, + "learning_rate": 3.213953633415686e-06, + "loss": 0.81727695, + "num_input_tokens_seen": 112035065, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5216, + "time_per_iteration": 2.4559943675994873 + }, + { + "auxiliary_loss_clip": 0.0113211, + "auxiliary_loss_mlp": 0.01047322, + "balance_loss_clip": 1.03042984, + "balance_loss_mlp": 1.04632115, + "epoch": 0.313663009168796, + "flos": 26980513009920.0, + "grad_norm": 2.0079960226100293, + "language_loss": 0.68489361, + "learning_rate": 3.213644097593477e-06, + "loss": 0.70668793, + "num_input_tokens_seen": 112058405, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.859375, + "step": 5217, + "time_per_iteration": 2.524624824523926 + }, + { + "auxiliary_loss_clip": 0.01134019, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02095652, + "balance_loss_mlp": 1.04952598, + "epoch": 0.313723132421464, + "flos": 18040911016320.0, + "grad_norm": 1.8597778329644077, + "language_loss": 0.80357039, + "learning_rate": 3.2133345157497624e-06, + "loss": 0.82527065, + "num_input_tokens_seen": 112076420, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5218, + "time_per_iteration": 2.437819480895996 + }, + { + "auxiliary_loss_clip": 0.01130766, + "auxiliary_loss_mlp": 0.01041589, + "balance_loss_clip": 1.025931, + "balance_loss_mlp": 1.04692423, + "epoch": 0.31378325567413196, + "flos": 22488913946880.0, + "grad_norm": 2.311414379590861, + "language_loss": 0.68608415, + "learning_rate": 3.2130248878962813e-06, + "loss": 0.70780772, + "num_input_tokens_seen": 112090775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5219, + "time_per_iteration": 2.4811697006225586 + }, + { + "auxiliary_loss_clip": 0.01132783, + "auxiliary_loss_mlp": 0.01040103, + "balance_loss_clip": 1.02585125, + "balance_loss_mlp": 1.05002093, + "epoch": 0.3138433789267999, + "flos": 22419247518720.0, + "grad_norm": 1.886141735907444, + "language_loss": 0.7973401, + "learning_rate": 3.2127152140447747e-06, + "loss": 0.81906897, + "num_input_tokens_seen": 112110980, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.828125, + "step": 5220, + "time_per_iteration": 5.5014426708221436 + }, + { + "auxiliary_loss_clip": 0.01129795, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02036917, + "balance_loss_mlp": 1.0470016, + "epoch": 0.3139035021794679, + "flos": 13005912026880.0, + "grad_norm": 1.696615671785811, + "language_loss": 0.72865409, + "learning_rate": 3.212405494206986e-06, + "loss": 0.75029969, + "num_input_tokens_seen": 112129020, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5221, + "time_per_iteration": 2.4286248683929443 + }, + { + "auxiliary_loss_clip": 0.01129062, + "auxiliary_loss_mlp": 0.01037622, + "balance_loss_clip": 1.02370405, + "balance_loss_mlp": 1.0478735, + "epoch": 0.31396362543213585, + "flos": 16945994689920.0, + "grad_norm": 1.5798649053475948, + "language_loss": 0.8195132, + "learning_rate": 3.2120957283946588e-06, + "loss": 0.84118003, + "num_input_tokens_seen": 112147865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8125, + "step": 5222, + "time_per_iteration": 2.453622817993164 + }, + { + "auxiliary_loss_clip": 0.01133873, + "auxiliary_loss_mlp": 0.01044471, + "balance_loss_clip": 1.02744806, + "balance_loss_mlp": 1.04833627, + "epoch": 0.31402374868480387, + "flos": 20156731695360.0, + "grad_norm": 1.948806511089887, + "language_loss": 0.70150459, + "learning_rate": 3.2117859166195407e-06, + "loss": 0.723288, + "num_input_tokens_seen": 112166745, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5223, + "time_per_iteration": 2.442513942718506 + }, + { + "auxiliary_loss_clip": 0.01130042, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.01980042, + "balance_loss_mlp": 1.04643512, + "epoch": 0.31408387193747184, + "flos": 21251073404160.0, + "grad_norm": 1.6111281957709347, + "language_loss": 0.80361176, + "learning_rate": 3.211476058893379e-06, + "loss": 0.82525527, + "num_input_tokens_seen": 112185895, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5224, + "time_per_iteration": 2.5533599853515625 + }, + { + "auxiliary_loss_clip": 0.01139668, + "auxiliary_loss_mlp": 0.01042146, + "balance_loss_clip": 1.02615976, + "balance_loss_mlp": 1.05134106, + "epoch": 0.3141439951901398, + "flos": 27484267299840.0, + "grad_norm": 1.9819108050216143, + "language_loss": 0.58416283, + "learning_rate": 3.2111661552279243e-06, + "loss": 0.60598099, + "num_input_tokens_seen": 112204465, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8828125, + "step": 5225, + "time_per_iteration": 2.493633508682251 + }, + { + "auxiliary_loss_clip": 0.01125852, + "auxiliary_loss_mlp": 0.01031717, + "balance_loss_clip": 1.01826406, + "balance_loss_mlp": 1.04575014, + "epoch": 0.31420411844280777, + "flos": 17852235851520.0, + "grad_norm": 1.9016989590060558, + "language_loss": 0.81870753, + "learning_rate": 3.2108562056349273e-06, + "loss": 0.84028322, + "num_input_tokens_seen": 112221635, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5226, + "time_per_iteration": 2.455474376678467 + }, + { + "auxiliary_loss_clip": 0.01132046, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.0245285, + "balance_loss_mlp": 1.04804921, + "epoch": 0.31426424169547573, + "flos": 21616967295360.0, + "grad_norm": 3.2929472014065864, + "language_loss": 0.73947561, + "learning_rate": 3.210546210126141e-06, + "loss": 0.7611953, + "num_input_tokens_seen": 112241240, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5227, + "time_per_iteration": 2.4582889080047607 + }, + { + "auxiliary_loss_clip": 0.01132913, + "auxiliary_loss_mlp": 0.01042937, + "balance_loss_clip": 1.02783334, + "balance_loss_mlp": 1.04827404, + "epoch": 0.3143243649481437, + "flos": 30920631586560.0, + "grad_norm": 1.9061545786481, + "language_loss": 0.67636049, + "learning_rate": 3.2102361687133213e-06, + "loss": 0.69811898, + "num_input_tokens_seen": 112262350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5228, + "time_per_iteration": 2.572122573852539 + }, + { + "auxiliary_loss_clip": 0.01130676, + "auxiliary_loss_mlp": 0.01040068, + "balance_loss_clip": 1.02567399, + "balance_loss_mlp": 1.04645872, + "epoch": 0.31438448820081166, + "flos": 22821411168000.0, + "grad_norm": 1.857425256773369, + "language_loss": 0.79938543, + "learning_rate": 3.2099260814082254e-06, + "loss": 0.82109284, + "num_input_tokens_seen": 112283710, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5229, + "time_per_iteration": 2.4785192012786865 + }, + { + "auxiliary_loss_clip": 0.01129346, + "auxiliary_loss_mlp": 0.01039876, + "balance_loss_clip": 1.02474797, + "balance_loss_mlp": 1.04716849, + "epoch": 0.3144446114534796, + "flos": 23292127923840.0, + "grad_norm": 1.8246409730399047, + "language_loss": 0.70264775, + "learning_rate": 3.209615948222611e-06, + "loss": 0.72434002, + "num_input_tokens_seen": 112304285, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5230, + "time_per_iteration": 2.504387140274048 + }, + { + "auxiliary_loss_clip": 0.01129413, + "auxiliary_loss_mlp": 0.01043609, + "balance_loss_clip": 1.02805161, + "balance_loss_mlp": 1.04486191, + "epoch": 0.3145047347061476, + "flos": 31355976424320.0, + "grad_norm": 1.680902640440715, + "language_loss": 0.79707456, + "learning_rate": 3.209305769168239e-06, + "loss": 0.81880474, + "num_input_tokens_seen": 112325110, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5231, + "time_per_iteration": 2.535352945327759 + }, + { + "auxiliary_loss_clip": 0.01129002, + "auxiliary_loss_mlp": 0.01042731, + "balance_loss_clip": 1.02675736, + "balance_loss_mlp": 1.04756021, + "epoch": 0.31456485795881556, + "flos": 10889552643840.0, + "grad_norm": 2.0146998384070254, + "language_loss": 0.8507638, + "learning_rate": 3.2089955442568704e-06, + "loss": 0.87248111, + "num_input_tokens_seen": 112339855, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5232, + "time_per_iteration": 2.5626280307769775 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01049783, + "balance_loss_clip": 1.03439283, + "balance_loss_mlp": 1.0461762, + "epoch": 0.3146249812114835, + "flos": 17092438439040.0, + "grad_norm": 1.5681064196444345, + "language_loss": 0.7984041, + "learning_rate": 3.2086852735002692e-06, + "loss": 0.82017469, + "num_input_tokens_seen": 112358480, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5233, + "time_per_iteration": 2.4478254318237305 + }, + { + "auxiliary_loss_clip": 0.01132884, + "auxiliary_loss_mlp": 0.01038194, + "balance_loss_clip": 1.0233047, + "balance_loss_mlp": 1.04861724, + "epoch": 0.3146851044641515, + "flos": 55291442889600.0, + "grad_norm": 1.628646597563271, + "language_loss": 0.70788991, + "learning_rate": 3.2083749569102024e-06, + "loss": 0.72960073, + "num_input_tokens_seen": 112382350, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84375, + "step": 5234, + "time_per_iteration": 2.775871992111206 + }, + { + "auxiliary_loss_clip": 0.01131513, + "auxiliary_loss_mlp": 0.01035924, + "balance_loss_clip": 1.0205102, + "balance_loss_mlp": 1.04739237, + "epoch": 0.31474522771681945, + "flos": 27015884928000.0, + "grad_norm": 1.8519873535555593, + "language_loss": 0.72068667, + "learning_rate": 3.2080645944984356e-06, + "loss": 0.74236101, + "num_input_tokens_seen": 112400260, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5235, + "time_per_iteration": 2.515869617462158 + }, + { + "auxiliary_loss_clip": 0.01126993, + "auxiliary_loss_mlp": 0.01036359, + "balance_loss_clip": 1.02204823, + "balance_loss_mlp": 1.04428434, + "epoch": 0.3148053509694875, + "flos": 21251935330560.0, + "grad_norm": 2.06424580772138, + "language_loss": 0.7832365, + "learning_rate": 3.2077541862767384e-06, + "loss": 0.80487001, + "num_input_tokens_seen": 112419400, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.828125, + "step": 5236, + "time_per_iteration": 2.5591800212860107 + }, + { + "auxiliary_loss_clip": 0.01134794, + "auxiliary_loss_mlp": 0.01041698, + "balance_loss_clip": 1.02609372, + "balance_loss_mlp": 1.04730821, + "epoch": 0.31486547422215544, + "flos": 31248675521280.0, + "grad_norm": 1.44778330648976, + "language_loss": 0.75856584, + "learning_rate": 3.207443732256881e-06, + "loss": 0.78033078, + "num_input_tokens_seen": 112440825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.875, + "step": 5237, + "time_per_iteration": 2.5414791107177734 + }, + { + "auxiliary_loss_clip": 0.01125329, + "auxiliary_loss_mlp": 0.01037879, + "balance_loss_clip": 1.02424169, + "balance_loss_mlp": 1.04500508, + "epoch": 0.3149255974748234, + "flos": 19828615933440.0, + "grad_norm": 2.1889759499940813, + "language_loss": 0.79916662, + "learning_rate": 3.2071332324506372e-06, + "loss": 0.82079864, + "num_input_tokens_seen": 112459180, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8046875, + "step": 5238, + "time_per_iteration": 2.484102725982666 + }, + { + "auxiliary_loss_clip": 0.01045144, + "auxiliary_loss_mlp": 0.0100711, + "balance_loss_clip": 1.0053103, + "balance_loss_mlp": 1.01739836, + "epoch": 0.31498572072749137, + "flos": 67683965339520.0, + "grad_norm": 0.8333107882681854, + "language_loss": 0.67920464, + "learning_rate": 3.2068226868697795e-06, + "loss": 0.69972724, + "num_input_tokens_seen": 112516680, + "router_z_loss_clip": 0.01794434, + "router_z_loss_mlp": 0.27734375, + "step": 5239, + "time_per_iteration": 3.0362496376037598 + }, + { + "auxiliary_loss_clip": 0.01130796, + "auxiliary_loss_mlp": 0.01038602, + "balance_loss_clip": 1.02197254, + "balance_loss_mlp": 1.04535258, + "epoch": 0.31504584398015933, + "flos": 19793136274560.0, + "grad_norm": 2.0536997136778847, + "language_loss": 0.82329869, + "learning_rate": 3.2065120955260846e-06, + "loss": 0.84499264, + "num_input_tokens_seen": 112535895, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.85546875, + "step": 5240, + "time_per_iteration": 2.5182995796203613 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01039014, + "balance_loss_clip": 1.02451253, + "balance_loss_mlp": 1.04874361, + "epoch": 0.3151059672328273, + "flos": 26615409217920.0, + "grad_norm": 2.2630790499207962, + "language_loss": 0.80981195, + "learning_rate": 3.2062014584313302e-06, + "loss": 0.83150375, + "num_input_tokens_seen": 112557490, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5241, + "time_per_iteration": 2.5001909732818604 + }, + { + "auxiliary_loss_clip": 0.01129016, + "auxiliary_loss_mlp": 0.01036038, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04834199, + "epoch": 0.31516609048549526, + "flos": 24204438483840.0, + "grad_norm": 1.5804052674973608, + "language_loss": 0.74575627, + "learning_rate": 3.2058907755972956e-06, + "loss": 0.76740676, + "num_input_tokens_seen": 112577075, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5242, + "time_per_iteration": 2.530768871307373 + }, + { + "auxiliary_loss_clip": 0.01129278, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0189085, + "balance_loss_mlp": 1.04601228, + "epoch": 0.31522621373816323, + "flos": 25958710817280.0, + "grad_norm": 1.9335835713568477, + "language_loss": 0.74171245, + "learning_rate": 3.2055800470357626e-06, + "loss": 0.7633546, + "num_input_tokens_seen": 112597620, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.83203125, + "step": 5243, + "time_per_iteration": 2.495138168334961 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01036952, + "balance_loss_clip": 1.02221215, + "balance_loss_mlp": 1.04677868, + "epoch": 0.3152863369908312, + "flos": 21908813299200.0, + "grad_norm": 3.400707627247709, + "language_loss": 0.64608908, + "learning_rate": 3.205269272758513e-06, + "loss": 0.66775823, + "num_input_tokens_seen": 112617150, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83203125, + "step": 5244, + "time_per_iteration": 2.4930343627929688 + }, + { + "auxiliary_loss_clip": 0.01132393, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.02088022, + "balance_loss_mlp": 1.04716229, + "epoch": 0.31534646024349916, + "flos": 16281072074880.0, + "grad_norm": 2.1590647535644965, + "language_loss": 0.91464043, + "learning_rate": 3.2049584527773313e-06, + "loss": 0.93632007, + "num_input_tokens_seen": 112631090, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5245, + "time_per_iteration": 2.4007837772369385 + }, + { + "auxiliary_loss_clip": 0.0113079, + "auxiliary_loss_mlp": 0.0104148, + "balance_loss_clip": 1.02636433, + "balance_loss_mlp": 1.04643655, + "epoch": 0.3154065834961671, + "flos": 24717243000960.0, + "grad_norm": 9.888646015204756, + "language_loss": 0.75272042, + "learning_rate": 3.2046475871040048e-06, + "loss": 0.77444315, + "num_input_tokens_seen": 112651220, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84375, + "step": 5246, + "time_per_iteration": 2.4886202812194824 + }, + { + "auxiliary_loss_clip": 0.01131208, + "auxiliary_loss_mlp": 0.01040085, + "balance_loss_clip": 1.02524352, + "balance_loss_mlp": 1.04602718, + "epoch": 0.3154667067488351, + "flos": 35371148469120.0, + "grad_norm": 1.4670109155165818, + "language_loss": 0.6160199, + "learning_rate": 3.204336675750321e-06, + "loss": 0.63773286, + "num_input_tokens_seen": 112671560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8515625, + "step": 5247, + "time_per_iteration": 2.567185640335083 + }, + { + "auxiliary_loss_clip": 0.01132287, + "auxiliary_loss_mlp": 0.01038251, + "balance_loss_clip": 1.02283072, + "balance_loss_mlp": 1.04756081, + "epoch": 0.31552683000150306, + "flos": 17456464823040.0, + "grad_norm": 2.2084660310503526, + "language_loss": 0.82410538, + "learning_rate": 3.2040257187280693e-06, + "loss": 0.84581077, + "num_input_tokens_seen": 112689790, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5248, + "time_per_iteration": 2.52426815032959 + }, + { + "auxiliary_loss_clip": 0.01129578, + "auxiliary_loss_mlp": 0.01050015, + "balance_loss_clip": 1.03413653, + "balance_loss_mlp": 1.04662156, + "epoch": 0.3155869532541711, + "flos": 18405763413120.0, + "grad_norm": 1.8083364563285407, + "language_loss": 0.85017586, + "learning_rate": 3.2037147160490423e-06, + "loss": 0.87197179, + "num_input_tokens_seen": 112708265, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5249, + "time_per_iteration": 2.4549005031585693 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038074, + "balance_loss_clip": 1.02245772, + "balance_loss_mlp": 1.04802227, + "epoch": 0.31564707650683904, + "flos": 21579763783680.0, + "grad_norm": 1.8090626711780673, + "language_loss": 0.85569501, + "learning_rate": 3.2034036677250322e-06, + "loss": 0.87739837, + "num_input_tokens_seen": 112727820, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5250, + "time_per_iteration": 2.502629041671753 + }, + { + "auxiliary_loss_clip": 0.01128678, + "auxiliary_loss_mlp": 0.01042591, + "balance_loss_clip": 1.02766562, + "balance_loss_mlp": 1.04532385, + "epoch": 0.315707199759507, + "flos": 21030976817280.0, + "grad_norm": 4.215523946509053, + "language_loss": 0.68559456, + "learning_rate": 3.203092573767835e-06, + "loss": 0.70730722, + "num_input_tokens_seen": 112743140, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5251, + "time_per_iteration": 2.4467368125915527 + }, + { + "auxiliary_loss_clip": 0.01130513, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02487266, + "balance_loss_mlp": 1.04848695, + "epoch": 0.31576732301217497, + "flos": 26828861788800.0, + "grad_norm": 1.7890606859490685, + "language_loss": 0.78783, + "learning_rate": 3.202781434189246e-06, + "loss": 0.80953479, + "num_input_tokens_seen": 112764705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8203125, + "step": 5252, + "time_per_iteration": 2.5056369304656982 + }, + { + "auxiliary_loss_clip": 0.01129131, + "auxiliary_loss_mlp": 0.01040491, + "balance_loss_clip": 1.02635264, + "balance_loss_mlp": 1.04820085, + "epoch": 0.31582744626484294, + "flos": 22711165349760.0, + "grad_norm": 1.7467438086499925, + "language_loss": 0.74374568, + "learning_rate": 3.202470249001066e-06, + "loss": 0.76544189, + "num_input_tokens_seen": 112785310, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5253, + "time_per_iteration": 2.485865592956543 + }, + { + "auxiliary_loss_clip": 0.01129339, + "auxiliary_loss_mlp": 0.01038803, + "balance_loss_clip": 1.02308559, + "balance_loss_mlp": 1.04530692, + "epoch": 0.3158875695175109, + "flos": 23951914894080.0, + "grad_norm": 1.6622002067810395, + "language_loss": 0.73305148, + "learning_rate": 3.2021590182150924e-06, + "loss": 0.75473285, + "num_input_tokens_seen": 112802905, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5254, + "time_per_iteration": 2.5044641494750977 + }, + { + "auxiliary_loss_clip": 0.01131731, + "auxiliary_loss_mlp": 0.01038119, + "balance_loss_clip": 1.02293146, + "balance_loss_mlp": 1.04714012, + "epoch": 0.31594769277017887, + "flos": 13261883322240.0, + "grad_norm": 1.9319514966089122, + "language_loss": 0.78156364, + "learning_rate": 3.201847741843128e-06, + "loss": 0.80326211, + "num_input_tokens_seen": 112820305, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5255, + "time_per_iteration": 2.4380881786346436 + }, + { + "auxiliary_loss_clip": 0.01130732, + "auxiliary_loss_mlp": 0.0104233, + "balance_loss_clip": 1.02565229, + "balance_loss_mlp": 1.04770398, + "epoch": 0.31600781602284683, + "flos": 23368258800000.0, + "grad_norm": 2.551434599641695, + "language_loss": 0.78019011, + "learning_rate": 3.2015364198969772e-06, + "loss": 0.80192077, + "num_input_tokens_seen": 112841185, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.828125, + "step": 5256, + "time_per_iteration": 2.517211437225342 + }, + { + "auxiliary_loss_clip": 0.01125561, + "auxiliary_loss_mlp": 0.01035033, + "balance_loss_clip": 1.02159786, + "balance_loss_mlp": 1.04710865, + "epoch": 0.3160679392755148, + "flos": 19828580019840.0, + "grad_norm": 1.6136648036258991, + "language_loss": 0.71117795, + "learning_rate": 3.2012250523884453e-06, + "loss": 0.73278391, + "num_input_tokens_seen": 112860570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 5257, + "time_per_iteration": 2.4690449237823486 + }, + { + "auxiliary_loss_clip": 0.01129863, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02207994, + "balance_loss_mlp": 1.04662931, + "epoch": 0.31612806252818276, + "flos": 20193216935040.0, + "grad_norm": 1.9672329013590102, + "language_loss": 0.77098101, + "learning_rate": 3.2009136393293393e-06, + "loss": 0.79265225, + "num_input_tokens_seen": 112877975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 5258, + "time_per_iteration": 2.4586384296417236 + }, + { + "auxiliary_loss_clip": 0.01130533, + "auxiliary_loss_mlp": 0.01037626, + "balance_loss_clip": 1.02291536, + "balance_loss_mlp": 1.04706669, + "epoch": 0.31618818578085073, + "flos": 24235967646720.0, + "grad_norm": 4.102208009404704, + "language_loss": 0.72829109, + "learning_rate": 3.200602180731467e-06, + "loss": 0.7499727, + "num_input_tokens_seen": 112896170, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5259, + "time_per_iteration": 2.463867425918579 + }, + { + "auxiliary_loss_clip": 0.011339, + "auxiliary_loss_mlp": 0.01048149, + "balance_loss_clip": 1.03382003, + "balance_loss_mlp": 1.04840684, + "epoch": 0.3162483090335187, + "flos": 25081844002560.0, + "grad_norm": 1.940451679167918, + "language_loss": 0.66212165, + "learning_rate": 3.20029067660664e-06, + "loss": 0.68394214, + "num_input_tokens_seen": 112916180, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.85546875, + "step": 5260, + "time_per_iteration": 2.498173475265503 + }, + { + "auxiliary_loss_clip": 0.01125905, + "auxiliary_loss_mlp": 0.0103285, + "balance_loss_clip": 1.01806808, + "balance_loss_mlp": 1.04255199, + "epoch": 0.31630843228618666, + "flos": 26323383646080.0, + "grad_norm": 1.9564366458132632, + "language_loss": 0.72557104, + "learning_rate": 3.1999791269666706e-06, + "loss": 0.74715853, + "num_input_tokens_seen": 112936745, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5261, + "time_per_iteration": 4.0577170848846436 + }, + { + "auxiliary_loss_clip": 0.01040968, + "auxiliary_loss_mlp": 0.01005761, + "balance_loss_clip": 1.00365114, + "balance_loss_mlp": 1.01333809, + "epoch": 0.3163685555388547, + "flos": 66758441552640.0, + "grad_norm": 0.7495327099187281, + "language_loss": 0.50639355, + "learning_rate": 3.1996675318233716e-06, + "loss": 0.52686083, + "num_input_tokens_seen": 112994845, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5262, + "time_per_iteration": 5.9139063358306885 + }, + { + "auxiliary_loss_clip": 0.01133191, + "auxiliary_loss_mlp": 0.01038454, + "balance_loss_clip": 1.02408338, + "balance_loss_mlp": 1.04845881, + "epoch": 0.31642867879152264, + "flos": 25995662933760.0, + "grad_norm": 1.4936033884005069, + "language_loss": 0.85241222, + "learning_rate": 3.19935589118856e-06, + "loss": 0.87412858, + "num_input_tokens_seen": 113015125, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.84765625, + "step": 5263, + "time_per_iteration": 2.4966084957122803 + }, + { + "auxiliary_loss_clip": 0.01127359, + "auxiliary_loss_mlp": 0.01045858, + "balance_loss_clip": 1.03201818, + "balance_loss_mlp": 1.04657304, + "epoch": 0.3164888020441906, + "flos": 25774955815680.0, + "grad_norm": 1.4671140059184749, + "language_loss": 0.81675243, + "learning_rate": 3.1990442050740535e-06, + "loss": 0.83848464, + "num_input_tokens_seen": 113035535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5264, + "time_per_iteration": 2.5126495361328125 + }, + { + "auxiliary_loss_clip": 0.01133844, + "auxiliary_loss_mlp": 0.0103675, + "balance_loss_clip": 1.02107441, + "balance_loss_mlp": 1.0484283, + "epoch": 0.3165489252968586, + "flos": 19756220071680.0, + "grad_norm": 1.6829803459821215, + "language_loss": 0.79974926, + "learning_rate": 3.19873247349167e-06, + "loss": 0.82145512, + "num_input_tokens_seen": 113052720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5265, + "time_per_iteration": 2.444263219833374 + }, + { + "auxiliary_loss_clip": 0.0113354, + "auxiliary_loss_mlp": 0.01039316, + "balance_loss_clip": 1.02361572, + "balance_loss_mlp": 1.04815876, + "epoch": 0.31660904854952654, + "flos": 23183929180800.0, + "grad_norm": 1.5672890574859826, + "language_loss": 0.74875605, + "learning_rate": 3.1984206964532307e-06, + "loss": 0.77048463, + "num_input_tokens_seen": 113071435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5266, + "time_per_iteration": 2.5323407649993896 + }, + { + "auxiliary_loss_clip": 0.01131974, + "auxiliary_loss_mlp": 0.01043072, + "balance_loss_clip": 1.02851653, + "balance_loss_mlp": 1.04640543, + "epoch": 0.3166691718021945, + "flos": 20408501099520.0, + "grad_norm": 2.021043754719528, + "language_loss": 0.78872609, + "learning_rate": 3.1981088739705585e-06, + "loss": 0.81047654, + "num_input_tokens_seen": 113088645, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.85546875, + "step": 5267, + "time_per_iteration": 2.4591164588928223 + }, + { + "auxiliary_loss_clip": 0.01042632, + "auxiliary_loss_mlp": 0.01004279, + "balance_loss_clip": 1.00216913, + "balance_loss_mlp": 1.01493907, + "epoch": 0.31672929505486247, + "flos": 70144781172480.0, + "grad_norm": 0.7322532755123746, + "language_loss": 0.57800645, + "learning_rate": 3.197797006055478e-06, + "loss": 0.59847558, + "num_input_tokens_seen": 113152775, + "router_z_loss_clip": 0.02111816, + "router_z_loss_mlp": 0.27734375, + "step": 5268, + "time_per_iteration": 3.061121702194214 + }, + { + "auxiliary_loss_clip": 0.01132182, + "auxiliary_loss_mlp": 0.01037998, + "balance_loss_clip": 1.02291262, + "balance_loss_mlp": 1.04683709, + "epoch": 0.31678941830753043, + "flos": 14355758154240.0, + "grad_norm": 1.8728828385616285, + "language_loss": 0.72881675, + "learning_rate": 3.197485092719815e-06, + "loss": 0.75051844, + "num_input_tokens_seen": 113171410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.85546875, + "step": 5269, + "time_per_iteration": 2.4871747493743896 + }, + { + "auxiliary_loss_clip": 0.0113037, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04689598, + "epoch": 0.3168495415601984, + "flos": 22747722416640.0, + "grad_norm": 2.0592855460289394, + "language_loss": 0.79914796, + "learning_rate": 3.1971731339753973e-06, + "loss": 0.82084477, + "num_input_tokens_seen": 113189965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5270, + "time_per_iteration": 2.502607822418213 + }, + { + "auxiliary_loss_clip": 0.01134389, + "auxiliary_loss_mlp": 0.01041999, + "balance_loss_clip": 1.02582264, + "balance_loss_mlp": 1.04792333, + "epoch": 0.31690966481286637, + "flos": 20115254465280.0, + "grad_norm": 1.9728362515560998, + "language_loss": 0.79207718, + "learning_rate": 3.1968611298340545e-06, + "loss": 0.8138411, + "num_input_tokens_seen": 113206355, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8671875, + "step": 5271, + "time_per_iteration": 2.4412505626678467 + }, + { + "auxiliary_loss_clip": 0.0113132, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02440262, + "balance_loss_mlp": 1.04685235, + "epoch": 0.31696978806553433, + "flos": 21178928937600.0, + "grad_norm": 1.769221166791082, + "language_loss": 0.73264146, + "learning_rate": 3.1965490803076173e-06, + "loss": 0.75436121, + "num_input_tokens_seen": 113225440, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84375, + "step": 5272, + "time_per_iteration": 2.4992945194244385 + }, + { + "auxiliary_loss_clip": 0.0113408, + "auxiliary_loss_mlp": 0.01039209, + "balance_loss_clip": 1.02262676, + "balance_loss_mlp": 1.04613161, + "epoch": 0.3170299113182023, + "flos": 42997030439040.0, + "grad_norm": 1.9537759660060814, + "language_loss": 0.69159341, + "learning_rate": 3.1962369854079194e-06, + "loss": 0.71332633, + "num_input_tokens_seen": 113248840, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.87890625, + "step": 5273, + "time_per_iteration": 2.6510114669799805 + }, + { + "auxiliary_loss_clip": 0.01128979, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02110016, + "balance_loss_mlp": 1.04609132, + "epoch": 0.31709003457087026, + "flos": 24460158384000.0, + "grad_norm": 1.4826309074588198, + "language_loss": 0.67691469, + "learning_rate": 3.195924845146795e-06, + "loss": 0.69856858, + "num_input_tokens_seen": 113269630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5274, + "time_per_iteration": 2.5467329025268555 + }, + { + "auxiliary_loss_clip": 0.01124583, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02092862, + "balance_loss_mlp": 1.04432762, + "epoch": 0.3171501578235382, + "flos": 24135310759680.0, + "grad_norm": 1.5251182195487059, + "language_loss": 0.80846918, + "learning_rate": 3.195612659536081e-06, + "loss": 0.83006656, + "num_input_tokens_seen": 113291200, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5275, + "time_per_iteration": 2.511544704437256 + }, + { + "auxiliary_loss_clip": 0.0113165, + "auxiliary_loss_mlp": 0.01044428, + "balance_loss_clip": 1.0286448, + "balance_loss_mlp": 1.04539275, + "epoch": 0.31721028107620625, + "flos": 18879712392960.0, + "grad_norm": 1.952892513614063, + "language_loss": 0.72608984, + "learning_rate": 3.1953004285876147e-06, + "loss": 0.7478506, + "num_input_tokens_seen": 113310170, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.859375, + "step": 5276, + "time_per_iteration": 2.5273983478546143 + }, + { + "auxiliary_loss_clip": 0.01124489, + "auxiliary_loss_mlp": 0.0103537, + "balance_loss_clip": 1.02098107, + "balance_loss_mlp": 1.04455817, + "epoch": 0.3172704043288742, + "flos": 23147874904320.0, + "grad_norm": 1.3590988237701342, + "language_loss": 0.77843654, + "learning_rate": 3.194988152313236e-06, + "loss": 0.80003512, + "num_input_tokens_seen": 113331140, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5277, + "time_per_iteration": 2.51247501373291 + }, + { + "auxiliary_loss_clip": 0.0112964, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.04444003, + "epoch": 0.3173305275815422, + "flos": 17858520731520.0, + "grad_norm": 1.8256288285105424, + "language_loss": 0.78756094, + "learning_rate": 3.1946758307247878e-06, + "loss": 0.80919981, + "num_input_tokens_seen": 113350030, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5278, + "time_per_iteration": 2.5376405715942383 + }, + { + "auxiliary_loss_clip": 0.01037546, + "auxiliary_loss_mlp": 0.01002993, + "balance_loss_clip": 1.0011332, + "balance_loss_mlp": 1.00972891, + "epoch": 0.31739065083421014, + "flos": 59973476883840.0, + "grad_norm": 0.8755672893463982, + "language_loss": 0.62821174, + "learning_rate": 3.1943634638341114e-06, + "loss": 0.64861709, + "num_input_tokens_seen": 113395820, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.27734375, + "step": 5279, + "time_per_iteration": 2.823489189147949 + }, + { + "auxiliary_loss_clip": 0.01133426, + "auxiliary_loss_mlp": 0.01040678, + "balance_loss_clip": 1.0242753, + "balance_loss_mlp": 1.04568505, + "epoch": 0.3174507740868781, + "flos": 23800981944960.0, + "grad_norm": 1.6672726712999033, + "language_loss": 0.8099947, + "learning_rate": 3.194051051653053e-06, + "loss": 0.83173573, + "num_input_tokens_seen": 113416835, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.87890625, + "step": 5280, + "time_per_iteration": 2.490154981613159 + }, + { + "auxiliary_loss_clip": 0.01130309, + "auxiliary_loss_mlp": 0.01044119, + "balance_loss_clip": 1.02963543, + "balance_loss_mlp": 1.04713202, + "epoch": 0.31751089733954607, + "flos": 27638899349760.0, + "grad_norm": 1.444928497123541, + "language_loss": 0.77968711, + "learning_rate": 3.19373859419346e-06, + "loss": 0.80143142, + "num_input_tokens_seen": 113440850, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5281, + "time_per_iteration": 2.590106248855591 + }, + { + "auxiliary_loss_clip": 0.01129621, + "auxiliary_loss_mlp": 0.01035628, + "balance_loss_clip": 1.02001119, + "balance_loss_mlp": 1.0464325, + "epoch": 0.31757102059221404, + "flos": 23769273214080.0, + "grad_norm": 1.6441690082428626, + "language_loss": 0.78319824, + "learning_rate": 3.193426091467179e-06, + "loss": 0.8048507, + "num_input_tokens_seen": 113461000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83203125, + "step": 5282, + "time_per_iteration": 2.4879021644592285 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.0103931, + "balance_loss_clip": 1.02429008, + "balance_loss_mlp": 1.04685783, + "epoch": 0.317631143844882, + "flos": 25264521596160.0, + "grad_norm": 2.066002014025373, + "language_loss": 0.66989815, + "learning_rate": 3.193113543486061e-06, + "loss": 0.69162953, + "num_input_tokens_seen": 113480820, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8671875, + "step": 5283, + "time_per_iteration": 2.4914467334747314 + }, + { + "auxiliary_loss_clip": 0.01037416, + "auxiliary_loss_mlp": 0.01002537, + "balance_loss_clip": 1.00047421, + "balance_loss_mlp": 1.00956297, + "epoch": 0.31769126709754997, + "flos": 55825939221120.0, + "grad_norm": 0.7287723120729913, + "language_loss": 0.52796859, + "learning_rate": 3.192800950261958e-06, + "loss": 0.5483681, + "num_input_tokens_seen": 113536910, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.27734375, + "step": 5284, + "time_per_iteration": 3.0077779293060303 + }, + { + "auxiliary_loss_clip": 0.01137201, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02314341, + "balance_loss_mlp": 1.04976773, + "epoch": 0.31775139035021793, + "flos": 16690562098560.0, + "grad_norm": 1.732541053937659, + "language_loss": 0.7061168, + "learning_rate": 3.1924883118067235e-06, + "loss": 0.72786701, + "num_input_tokens_seen": 113555480, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.875, + "step": 5285, + "time_per_iteration": 2.4796152114868164 + }, + { + "auxiliary_loss_clip": 0.0103775, + "auxiliary_loss_mlp": 0.01003604, + "balance_loss_clip": 1.00170827, + "balance_loss_mlp": 1.00987303, + "epoch": 0.3178115136028859, + "flos": 64227241019520.0, + "grad_norm": 0.8184329386673247, + "language_loss": 0.60497808, + "learning_rate": 3.1921756281322123e-06, + "loss": 0.6253916, + "num_input_tokens_seen": 113616790, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27929688, + "step": 5286, + "time_per_iteration": 3.060959815979004 + }, + { + "auxiliary_loss_clip": 0.01131379, + "auxiliary_loss_mlp": 0.01042363, + "balance_loss_clip": 1.02701449, + "balance_loss_mlp": 1.04520202, + "epoch": 0.31787163685555386, + "flos": 18697465762560.0, + "grad_norm": 1.8142745455991967, + "language_loss": 0.72112805, + "learning_rate": 3.1918628992502826e-06, + "loss": 0.74286544, + "num_input_tokens_seen": 113635320, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.86328125, + "step": 5287, + "time_per_iteration": 2.480926752090454 + }, + { + "auxiliary_loss_clip": 0.01131312, + "auxiliary_loss_mlp": 0.01047698, + "balance_loss_clip": 1.03083003, + "balance_loss_mlp": 1.04454064, + "epoch": 0.31793176010822183, + "flos": 21324762155520.0, + "grad_norm": 1.8467549942081902, + "language_loss": 0.75335222, + "learning_rate": 3.191550125172792e-06, + "loss": 0.77514231, + "num_input_tokens_seen": 113654000, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.8671875, + "step": 5288, + "time_per_iteration": 2.4506337642669678 + }, + { + "auxiliary_loss_clip": 0.01123463, + "auxiliary_loss_mlp": 0.01036721, + "balance_loss_clip": 1.02344155, + "balance_loss_mlp": 1.04175711, + "epoch": 0.31799188336088985, + "flos": 20958688696320.0, + "grad_norm": 2.214262263159222, + "language_loss": 0.87642509, + "learning_rate": 3.1912373059116007e-06, + "loss": 0.89802694, + "num_input_tokens_seen": 113672375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8203125, + "step": 5289, + "time_per_iteration": 2.4887404441833496 + }, + { + "auxiliary_loss_clip": 0.01127988, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02569127, + "balance_loss_mlp": 1.04635859, + "epoch": 0.3180520066135578, + "flos": 22491930689280.0, + "grad_norm": 1.8563377401537928, + "language_loss": 0.67677546, + "learning_rate": 3.190924441478572e-06, + "loss": 0.69844842, + "num_input_tokens_seen": 113692385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5290, + "time_per_iteration": 2.4699981212615967 + }, + { + "auxiliary_loss_clip": 0.01130209, + "auxiliary_loss_mlp": 0.01045373, + "balance_loss_clip": 1.02983999, + "balance_loss_mlp": 1.04348135, + "epoch": 0.3181121298662258, + "flos": 27235335070080.0, + "grad_norm": 1.9889060202243536, + "language_loss": 0.79926544, + "learning_rate": 3.1906115318855687e-06, + "loss": 0.82102132, + "num_input_tokens_seen": 113712145, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8671875, + "step": 5291, + "time_per_iteration": 2.5350663661956787 + }, + { + "auxiliary_loss_clip": 0.011332, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.02160883, + "balance_loss_mlp": 1.04684091, + "epoch": 0.31817225311889374, + "flos": 23180158252800.0, + "grad_norm": 2.2851564798864694, + "language_loss": 0.79887748, + "learning_rate": 3.1902985771444577e-06, + "loss": 0.82058293, + "num_input_tokens_seen": 113731435, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8671875, + "step": 5292, + "time_per_iteration": 2.4561853408813477 + }, + { + "auxiliary_loss_clip": 0.01124086, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.02173245, + "balance_loss_mlp": 1.04506028, + "epoch": 0.3182323763715617, + "flos": 23258803080960.0, + "grad_norm": 1.6321803022225574, + "language_loss": 0.74406421, + "learning_rate": 3.1899855772671043e-06, + "loss": 0.76565492, + "num_input_tokens_seen": 113750825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5293, + "time_per_iteration": 2.562264919281006 + }, + { + "auxiliary_loss_clip": 0.01127349, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02864981, + "balance_loss_mlp": 1.04655647, + "epoch": 0.3182924996242297, + "flos": 29016683280000.0, + "grad_norm": 1.669926034583184, + "language_loss": 0.74003655, + "learning_rate": 3.189672532265379e-06, + "loss": 0.7617321, + "num_input_tokens_seen": 113770010, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.80859375, + "step": 5294, + "time_per_iteration": 2.511491537094116 + }, + { + "auxiliary_loss_clip": 0.01131359, + "auxiliary_loss_mlp": 0.01034334, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04616928, + "epoch": 0.31835262287689764, + "flos": 20449188230400.0, + "grad_norm": 1.856323864882145, + "language_loss": 0.76211727, + "learning_rate": 3.189359442151152e-06, + "loss": 0.78377414, + "num_input_tokens_seen": 113788640, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5295, + "time_per_iteration": 2.482302665710449 + }, + { + "auxiliary_loss_clip": 0.01134404, + "auxiliary_loss_mlp": 0.01042471, + "balance_loss_clip": 1.02765322, + "balance_loss_mlp": 1.04831004, + "epoch": 0.3184127461295656, + "flos": 25119478477440.0, + "grad_norm": 1.6316405915506296, + "language_loss": 0.69476807, + "learning_rate": 3.189046306936296e-06, + "loss": 0.71653676, + "num_input_tokens_seen": 113809515, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.859375, + "step": 5296, + "time_per_iteration": 2.4972259998321533 + }, + { + "auxiliary_loss_clip": 0.01129364, + "auxiliary_loss_mlp": 0.0103894, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04513788, + "epoch": 0.31847286938223357, + "flos": 25551231955200.0, + "grad_norm": 2.3772504575271367, + "language_loss": 0.77559733, + "learning_rate": 3.1887331266326846e-06, + "loss": 0.79728031, + "num_input_tokens_seen": 113829770, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5297, + "time_per_iteration": 2.5681862831115723 + }, + { + "auxiliary_loss_clip": 0.01126969, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01533866, + "balance_loss_mlp": 1.04480934, + "epoch": 0.31853299263490154, + "flos": 27782470010880.0, + "grad_norm": 1.9869765921291695, + "language_loss": 0.79451257, + "learning_rate": 3.1884199012521942e-06, + "loss": 0.81608367, + "num_input_tokens_seen": 113849320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5298, + "time_per_iteration": 2.4990038871765137 + }, + { + "auxiliary_loss_clip": 0.01132136, + "auxiliary_loss_mlp": 0.01039187, + "balance_loss_clip": 1.0245657, + "balance_loss_mlp": 1.04609096, + "epoch": 0.3185931158875695, + "flos": 22706747976960.0, + "grad_norm": 2.132815699592654, + "language_loss": 0.7431671, + "learning_rate": 3.1881066308067016e-06, + "loss": 0.7648803, + "num_input_tokens_seen": 113867860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.859375, + "step": 5299, + "time_per_iteration": 2.4902234077453613 + }, + { + "auxiliary_loss_clip": 0.01130922, + "auxiliary_loss_mlp": 0.01042737, + "balance_loss_clip": 1.02775824, + "balance_loss_mlp": 1.04395795, + "epoch": 0.31865323914023747, + "flos": 24571517523840.0, + "grad_norm": 5.1444082132017925, + "language_loss": 0.7834971, + "learning_rate": 3.1877933153080873e-06, + "loss": 0.80523366, + "num_input_tokens_seen": 113886375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5300, + "time_per_iteration": 2.476113796234131 + }, + { + "auxiliary_loss_clip": 0.01127423, + "auxiliary_loss_mlp": 0.01038051, + "balance_loss_clip": 1.02245879, + "balance_loss_mlp": 1.04332328, + "epoch": 0.31871336239290543, + "flos": 18186564666240.0, + "grad_norm": 4.220537638442504, + "language_loss": 0.8416568, + "learning_rate": 3.1874799547682304e-06, + "loss": 0.86331153, + "num_input_tokens_seen": 113904065, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5301, + "time_per_iteration": 2.4672341346740723 + }, + { + "auxiliary_loss_clip": 0.01132761, + "auxiliary_loss_mlp": 0.01045513, + "balance_loss_clip": 1.0299325, + "balance_loss_mlp": 1.05064154, + "epoch": 0.31877348564557345, + "flos": 21826756679040.0, + "grad_norm": 2.4555807672502277, + "language_loss": 0.77689236, + "learning_rate": 3.187166549199015e-06, + "loss": 0.79867512, + "num_input_tokens_seen": 113918415, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5302, + "time_per_iteration": 2.4480254650115967 + }, + { + "auxiliary_loss_clip": 0.011261, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02197289, + "balance_loss_mlp": 1.0458461, + "epoch": 0.3188336088982414, + "flos": 22015252275840.0, + "grad_norm": 1.6601771821563076, + "language_loss": 0.79729378, + "learning_rate": 3.1868530986123255e-06, + "loss": 0.81892729, + "num_input_tokens_seen": 113938135, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8046875, + "step": 5303, + "time_per_iteration": 5.451193809509277 + }, + { + "auxiliary_loss_clip": 0.01137183, + "auxiliary_loss_mlp": 0.0104561, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.04810047, + "epoch": 0.3188937321509094, + "flos": 20047886507520.0, + "grad_norm": 2.065727829234295, + "language_loss": 0.72734123, + "learning_rate": 3.186539603020047e-06, + "loss": 0.74916923, + "num_input_tokens_seen": 113957125, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.890625, + "step": 5304, + "time_per_iteration": 3.835230588912964 + }, + { + "auxiliary_loss_clip": 0.01126733, + "auxiliary_loss_mlp": 0.01039176, + "balance_loss_clip": 1.02546668, + "balance_loss_mlp": 1.04595399, + "epoch": 0.31895385540357735, + "flos": 25848105863040.0, + "grad_norm": 1.8866410100018438, + "language_loss": 0.71773344, + "learning_rate": 3.186226062434068e-06, + "loss": 0.73939252, + "num_input_tokens_seen": 113974875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80859375, + "step": 5305, + "time_per_iteration": 2.5330212116241455 + }, + { + "auxiliary_loss_clip": 0.01129402, + "auxiliary_loss_mlp": 0.0103604, + "balance_loss_clip": 1.02209806, + "balance_loss_mlp": 1.0472002, + "epoch": 0.3190139786562453, + "flos": 23477714519040.0, + "grad_norm": 1.6861128411196662, + "language_loss": 0.64708328, + "learning_rate": 3.1859124768662778e-06, + "loss": 0.66873765, + "num_input_tokens_seen": 113994450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5306, + "time_per_iteration": 2.4788570404052734 + }, + { + "auxiliary_loss_clip": 0.01135221, + "auxiliary_loss_mlp": 0.01042556, + "balance_loss_clip": 1.02714205, + "balance_loss_mlp": 1.05026746, + "epoch": 0.3190741019089133, + "flos": 29095543589760.0, + "grad_norm": 2.161280639112344, + "language_loss": 0.79625881, + "learning_rate": 3.1855988463285678e-06, + "loss": 0.81803662, + "num_input_tokens_seen": 114013945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5307, + "time_per_iteration": 2.5614371299743652 + }, + { + "auxiliary_loss_clip": 0.0112354, + "auxiliary_loss_mlp": 0.01039882, + "balance_loss_clip": 1.02412832, + "balance_loss_mlp": 1.04311657, + "epoch": 0.31913422516158124, + "flos": 17129534209920.0, + "grad_norm": 1.727529620646192, + "language_loss": 0.77898794, + "learning_rate": 3.1852851708328308e-06, + "loss": 0.80062222, + "num_input_tokens_seen": 114031375, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 5308, + "time_per_iteration": 2.4443254470825195 + }, + { + "auxiliary_loss_clip": 0.01142678, + "auxiliary_loss_mlp": 0.01048979, + "balance_loss_clip": 1.03182518, + "balance_loss_mlp": 1.05046844, + "epoch": 0.3191943484142492, + "flos": 16069846147200.0, + "grad_norm": 5.1649453810283426, + "language_loss": 0.74302876, + "learning_rate": 3.184971450390961e-06, + "loss": 0.76494527, + "num_input_tokens_seen": 114048465, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.921875, + "step": 5309, + "time_per_iteration": 2.494800090789795 + }, + { + "auxiliary_loss_clip": 0.0112957, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.01998436, + "balance_loss_mlp": 1.04589248, + "epoch": 0.3192544716669172, + "flos": 22966166977920.0, + "grad_norm": 1.754429841361115, + "language_loss": 0.82606339, + "learning_rate": 3.184657685014856e-06, + "loss": 0.84770352, + "num_input_tokens_seen": 114068415, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5310, + "time_per_iteration": 2.4630603790283203 + }, + { + "auxiliary_loss_clip": 0.01129012, + "auxiliary_loss_mlp": 0.01041266, + "balance_loss_clip": 1.02762246, + "balance_loss_mlp": 1.04536486, + "epoch": 0.31931459491958514, + "flos": 26870339018880.0, + "grad_norm": 1.4405475768569584, + "language_loss": 0.78319013, + "learning_rate": 3.184343874716412e-06, + "loss": 0.8048929, + "num_input_tokens_seen": 114088565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8359375, + "step": 5311, + "time_per_iteration": 2.5892724990844727 + }, + { + "auxiliary_loss_clip": 0.01130953, + "auxiliary_loss_mlp": 0.01040389, + "balance_loss_clip": 1.02419996, + "balance_loss_mlp": 1.04695129, + "epoch": 0.3193747181722531, + "flos": 21836525178240.0, + "grad_norm": 2.475613964939968, + "language_loss": 0.84316272, + "learning_rate": 3.1840300195075295e-06, + "loss": 0.86487615, + "num_input_tokens_seen": 114107160, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.83984375, + "step": 5312, + "time_per_iteration": 2.4625802040100098 + }, + { + "auxiliary_loss_clip": 0.01137215, + "auxiliary_loss_mlp": 0.01044515, + "balance_loss_clip": 1.02808809, + "balance_loss_mlp": 1.0480628, + "epoch": 0.31943484142492107, + "flos": 18324999682560.0, + "grad_norm": 2.3910939905221302, + "language_loss": 0.78584075, + "learning_rate": 3.1837161194001102e-06, + "loss": 0.80765808, + "num_input_tokens_seen": 114123420, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.890625, + "step": 5313, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01132383, + "auxiliary_loss_mlp": 0.01036276, + "balance_loss_clip": 1.02133918, + "balance_loss_mlp": 1.04814112, + "epoch": 0.31949496467758903, + "flos": 21615818060160.0, + "grad_norm": 2.1643333364087582, + "language_loss": 0.85868084, + "learning_rate": 3.183402174406057e-06, + "loss": 0.88036746, + "num_input_tokens_seen": 114139230, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5314, + "time_per_iteration": 2.4721946716308594 + }, + { + "auxiliary_loss_clip": 0.01131852, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02502346, + "balance_loss_mlp": 1.04725409, + "epoch": 0.31955508793025705, + "flos": 21760214734080.0, + "grad_norm": 1.7188296838329389, + "language_loss": 0.79836512, + "learning_rate": 3.1830881845372747e-06, + "loss": 0.82008839, + "num_input_tokens_seen": 114159290, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5315, + "time_per_iteration": 2.512554407119751 + }, + { + "auxiliary_loss_clip": 0.01135172, + "auxiliary_loss_mlp": 0.01049715, + "balance_loss_clip": 1.03331804, + "balance_loss_mlp": 1.0493269, + "epoch": 0.319615211182925, + "flos": 17164331510400.0, + "grad_norm": 6.566744634036759, + "language_loss": 0.67652613, + "learning_rate": 3.18277414980567e-06, + "loss": 0.69837505, + "num_input_tokens_seen": 114177655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.859375, + "step": 5316, + "time_per_iteration": 2.4364819526672363 + }, + { + "auxiliary_loss_clip": 0.01133826, + "auxiliary_loss_mlp": 0.01034907, + "balance_loss_clip": 1.02105474, + "balance_loss_mlp": 1.04888916, + "epoch": 0.319675334435593, + "flos": 28112812416000.0, + "grad_norm": 1.4751284993654519, + "language_loss": 0.69336772, + "learning_rate": 3.1824600702231515e-06, + "loss": 0.71505511, + "num_input_tokens_seen": 114200880, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.84765625, + "step": 5317, + "time_per_iteration": 2.6055562496185303 + }, + { + "auxiliary_loss_clip": 0.01043016, + "auxiliary_loss_mlp": 0.0100349, + "balance_loss_clip": 1.00143993, + "balance_loss_mlp": 1.01474404, + "epoch": 0.31973545768826095, + "flos": 69501119408640.0, + "grad_norm": 0.7259742625655435, + "language_loss": 0.53048342, + "learning_rate": 3.182145945801628e-06, + "loss": 0.5509485, + "num_input_tokens_seen": 114267145, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.28320312, + "step": 5318, + "time_per_iteration": 3.200087308883667 + }, + { + "auxiliary_loss_clip": 0.01132062, + "auxiliary_loss_mlp": 0.01037492, + "balance_loss_clip": 1.02311563, + "balance_loss_mlp": 1.04900801, + "epoch": 0.3197955809409289, + "flos": 13699203408000.0, + "grad_norm": 1.839211184718713, + "language_loss": 0.83865941, + "learning_rate": 3.181831776553012e-06, + "loss": 0.8603549, + "num_input_tokens_seen": 114284630, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5319, + "time_per_iteration": 2.471498966217041 + }, + { + "auxiliary_loss_clip": 0.01131434, + "auxiliary_loss_mlp": 0.01042883, + "balance_loss_clip": 1.0279578, + "balance_loss_mlp": 1.04728413, + "epoch": 0.3198557041935969, + "flos": 33218124278400.0, + "grad_norm": 1.3959306603032393, + "language_loss": 0.63542199, + "learning_rate": 3.1815175624892165e-06, + "loss": 0.65716517, + "num_input_tokens_seen": 114305830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.83984375, + "step": 5320, + "time_per_iteration": 2.5526087284088135 + }, + { + "auxiliary_loss_clip": 0.01136898, + "auxiliary_loss_mlp": 0.01040372, + "balance_loss_clip": 1.02528036, + "balance_loss_mlp": 1.04970324, + "epoch": 0.31991582744626484, + "flos": 23732033788800.0, + "grad_norm": 1.9943779690432752, + "language_loss": 0.70519614, + "learning_rate": 3.1812033036221567e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 114325165, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.87109375, + "step": 5321, + "time_per_iteration": 2.5262763500213623 + }, + { + "auxiliary_loss_clip": 0.01141108, + "auxiliary_loss_mlp": 0.01056872, + "balance_loss_clip": 1.04030156, + "balance_loss_mlp": 1.05110431, + "epoch": 0.3199759506989328, + "flos": 18550842445440.0, + "grad_norm": 3.2234904552907238, + "language_loss": 0.86543447, + "learning_rate": 3.180888999963749e-06, + "loss": 0.88741434, + "num_input_tokens_seen": 114341310, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8984375, + "step": 5322, + "time_per_iteration": 2.4432008266448975 + }, + { + "auxiliary_loss_clip": 0.01132235, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02119207, + "balance_loss_mlp": 1.04827893, + "epoch": 0.3200360739516008, + "flos": 22418888382720.0, + "grad_norm": 1.7854648356549414, + "language_loss": 0.82820231, + "learning_rate": 3.1805746515259123e-06, + "loss": 0.84988427, + "num_input_tokens_seen": 114360355, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5323, + "time_per_iteration": 2.554539680480957 + }, + { + "auxiliary_loss_clip": 0.01130058, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02157664, + "balance_loss_mlp": 1.04700553, + "epoch": 0.32009619720426874, + "flos": 20595236929920.0, + "grad_norm": 1.8735349940723531, + "language_loss": 0.77858555, + "learning_rate": 3.1802602583205663e-06, + "loss": 0.8002646, + "num_input_tokens_seen": 114379220, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5324, + "time_per_iteration": 2.452894687652588 + }, + { + "auxiliary_loss_clip": 0.0113163, + "auxiliary_loss_mlp": 0.01034726, + "balance_loss_clip": 1.01981282, + "balance_loss_mlp": 1.04770339, + "epoch": 0.3201563204569367, + "flos": 18147637301760.0, + "grad_norm": 1.8150910160625646, + "language_loss": 0.80162597, + "learning_rate": 3.1799458203596333e-06, + "loss": 0.82328951, + "num_input_tokens_seen": 114396365, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5325, + "time_per_iteration": 2.5261802673339844 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02690446, + "balance_loss_mlp": 1.04872847, + "epoch": 0.32021644370960467, + "flos": 31684235840640.0, + "grad_norm": 1.8959189814779316, + "language_loss": 0.75171864, + "learning_rate": 3.179631337655037e-06, + "loss": 0.77346826, + "num_input_tokens_seen": 114416780, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5326, + "time_per_iteration": 2.5300135612487793 + }, + { + "auxiliary_loss_clip": 0.01129958, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02285552, + "balance_loss_mlp": 1.04836321, + "epoch": 0.32027656696227264, + "flos": 26865921646080.0, + "grad_norm": 1.4421847054475023, + "language_loss": 0.80826092, + "learning_rate": 3.179316810218701e-06, + "loss": 0.82993662, + "num_input_tokens_seen": 114437405, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5327, + "time_per_iteration": 2.5393614768981934 + }, + { + "auxiliary_loss_clip": 0.01135097, + "auxiliary_loss_mlp": 0.01037836, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04888535, + "epoch": 0.32033669021494066, + "flos": 24169928492160.0, + "grad_norm": 1.5386676468863185, + "language_loss": 0.77926928, + "learning_rate": 3.179002238062554e-06, + "loss": 0.80099857, + "num_input_tokens_seen": 114458505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5328, + "time_per_iteration": 2.471806287765503 + }, + { + "auxiliary_loss_clip": 0.011322, + "auxiliary_loss_mlp": 0.01041791, + "balance_loss_clip": 1.02550721, + "balance_loss_mlp": 1.04632294, + "epoch": 0.3203968134676086, + "flos": 24460768915200.0, + "grad_norm": 2.9951100938200765, + "language_loss": 0.73971635, + "learning_rate": 3.178687621198524e-06, + "loss": 0.76145625, + "num_input_tokens_seen": 114479050, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.859375, + "step": 5329, + "time_per_iteration": 2.52327561378479 + }, + { + "auxiliary_loss_clip": 0.01127399, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02012336, + "balance_loss_mlp": 1.04675198, + "epoch": 0.3204569367202766, + "flos": 18004713085440.0, + "grad_norm": 2.060461898980319, + "language_loss": 0.71036464, + "learning_rate": 3.1783729596385415e-06, + "loss": 0.73197591, + "num_input_tokens_seen": 114497415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8046875, + "step": 5330, + "time_per_iteration": 2.4405477046966553 + }, + { + "auxiliary_loss_clip": 0.01136038, + "auxiliary_loss_mlp": 0.01049965, + "balance_loss_clip": 1.03343058, + "balance_loss_mlp": 1.0474323, + "epoch": 0.32051705997294455, + "flos": 30589678650240.0, + "grad_norm": 1.7909305839918348, + "language_loss": 0.80022657, + "learning_rate": 3.1780582533945376e-06, + "loss": 0.82208663, + "num_input_tokens_seen": 114518785, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8828125, + "step": 5331, + "time_per_iteration": 2.5934245586395264 + }, + { + "auxiliary_loss_clip": 0.01037799, + "auxiliary_loss_mlp": 0.01004509, + "balance_loss_clip": 1.00256538, + "balance_loss_mlp": 1.01001608, + "epoch": 0.3205771832256125, + "flos": 68417979765120.0, + "grad_norm": 0.8366333048595008, + "language_loss": 0.57806182, + "learning_rate": 3.177743502478447e-06, + "loss": 0.59848487, + "num_input_tokens_seen": 114577710, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.27734375, + "step": 5332, + "time_per_iteration": 2.9984278678894043 + }, + { + "auxiliary_loss_clip": 0.01134361, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02450585, + "balance_loss_mlp": 1.04747975, + "epoch": 0.3206373064782805, + "flos": 30443953173120.0, + "grad_norm": 1.7943987990453594, + "language_loss": 0.73309821, + "learning_rate": 3.177428706902205e-06, + "loss": 0.75483477, + "num_input_tokens_seen": 114598640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.87109375, + "step": 5333, + "time_per_iteration": 2.554401159286499 + }, + { + "auxiliary_loss_clip": 0.01133668, + "auxiliary_loss_mlp": 0.01042462, + "balance_loss_clip": 1.02686942, + "balance_loss_mlp": 1.04836345, + "epoch": 0.32069742973094845, + "flos": 22054502862720.0, + "grad_norm": 1.5896288664703238, + "language_loss": 0.71050882, + "learning_rate": 3.1771138666777485e-06, + "loss": 0.73227012, + "num_input_tokens_seen": 114618780, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5334, + "time_per_iteration": 2.468472957611084 + }, + { + "auxiliary_loss_clip": 0.01132404, + "auxiliary_loss_mlp": 0.01041967, + "balance_loss_clip": 1.02658951, + "balance_loss_mlp": 1.04644001, + "epoch": 0.3207575529836164, + "flos": 22054000072320.0, + "grad_norm": 1.9528247502362917, + "language_loss": 0.77601135, + "learning_rate": 3.1767989818170156e-06, + "loss": 0.797755, + "num_input_tokens_seen": 114637525, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.859375, + "step": 5335, + "time_per_iteration": 2.524211883544922 + }, + { + "auxiliary_loss_clip": 0.01131695, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02519548, + "balance_loss_mlp": 1.04687452, + "epoch": 0.3208176762362844, + "flos": 34057536186240.0, + "grad_norm": 1.5197552931214375, + "language_loss": 0.68353152, + "learning_rate": 3.1764840523319477e-06, + "loss": 0.70525241, + "num_input_tokens_seen": 114659705, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84765625, + "step": 5336, + "time_per_iteration": 2.5674326419830322 + }, + { + "auxiliary_loss_clip": 0.01131682, + "auxiliary_loss_mlp": 0.01045646, + "balance_loss_clip": 1.03027439, + "balance_loss_mlp": 1.04688144, + "epoch": 0.32087779948895234, + "flos": 21798711135360.0, + "grad_norm": 1.7063748564330914, + "language_loss": 0.7895453, + "learning_rate": 3.176169078234487e-06, + "loss": 0.81131858, + "num_input_tokens_seen": 114678340, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5337, + "time_per_iteration": 2.5010595321655273 + }, + { + "auxiliary_loss_clip": 0.01124535, + "auxiliary_loss_mlp": 0.01035607, + "balance_loss_clip": 1.02194548, + "balance_loss_mlp": 1.04505002, + "epoch": 0.3209379227416203, + "flos": 21434110133760.0, + "grad_norm": 1.7193225847880926, + "language_loss": 0.73997593, + "learning_rate": 3.1758540595365766e-06, + "loss": 0.76157737, + "num_input_tokens_seen": 114696980, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5338, + "time_per_iteration": 2.4961647987365723 + }, + { + "auxiliary_loss_clip": 0.01132045, + "auxiliary_loss_mlp": 0.01041805, + "balance_loss_clip": 1.02633142, + "balance_loss_mlp": 1.04477298, + "epoch": 0.3209980459942883, + "flos": 25849075530240.0, + "grad_norm": 1.8336519924948942, + "language_loss": 0.63149244, + "learning_rate": 3.1755389962501626e-06, + "loss": 0.65323097, + "num_input_tokens_seen": 114717330, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.87109375, + "step": 5339, + "time_per_iteration": 2.5218987464904785 + }, + { + "auxiliary_loss_clip": 0.01130495, + "auxiliary_loss_mlp": 0.01039604, + "balance_loss_clip": 1.02409506, + "balance_loss_mlp": 1.04546928, + "epoch": 0.32105816924695624, + "flos": 19099162535040.0, + "grad_norm": 1.814332726776551, + "language_loss": 0.81917858, + "learning_rate": 3.175223888387192e-06, + "loss": 0.84087962, + "num_input_tokens_seen": 114736320, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5340, + "time_per_iteration": 2.427483558654785 + }, + { + "auxiliary_loss_clip": 0.0113181, + "auxiliary_loss_mlp": 0.01043319, + "balance_loss_clip": 1.02847123, + "balance_loss_mlp": 1.04696941, + "epoch": 0.3211182924996242, + "flos": 16581860565120.0, + "grad_norm": 1.7172536004624983, + "language_loss": 0.7620244, + "learning_rate": 3.1749087359596137e-06, + "loss": 0.78377569, + "num_input_tokens_seen": 114754575, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.84765625, + "step": 5341, + "time_per_iteration": 2.4785468578338623 + }, + { + "auxiliary_loss_clip": 0.01130847, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02154231, + "balance_loss_mlp": 1.04897809, + "epoch": 0.3211784157522922, + "flos": 22672202071680.0, + "grad_norm": 1.9213308470980235, + "language_loss": 0.78627086, + "learning_rate": 3.1745935389793786e-06, + "loss": 0.80794168, + "num_input_tokens_seen": 114773590, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 5342, + "time_per_iteration": 2.4524106979370117 + }, + { + "auxiliary_loss_clip": 0.01133398, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02290499, + "balance_loss_mlp": 1.04772902, + "epoch": 0.3212385390049602, + "flos": 20558787603840.0, + "grad_norm": 3.762302479650767, + "language_loss": 0.74934483, + "learning_rate": 3.174278297458438e-06, + "loss": 0.77106899, + "num_input_tokens_seen": 114790775, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5343, + "time_per_iteration": 2.4744415283203125 + }, + { + "auxiliary_loss_clip": 0.01130661, + "auxiliary_loss_mlp": 0.01035912, + "balance_loss_clip": 1.02040279, + "balance_loss_mlp": 1.04623377, + "epoch": 0.32129866225762815, + "flos": 24791147233920.0, + "grad_norm": 1.6135516142824962, + "language_loss": 0.82859504, + "learning_rate": 3.173963011408748e-06, + "loss": 0.85026079, + "num_input_tokens_seen": 114809835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84375, + "step": 5344, + "time_per_iteration": 2.47578763961792 + }, + { + "auxiliary_loss_clip": 0.01130938, + "auxiliary_loss_mlp": 0.01039787, + "balance_loss_clip": 1.02407503, + "balance_loss_mlp": 1.04474425, + "epoch": 0.3213587855102961, + "flos": 18366871962240.0, + "grad_norm": 2.07297685310976, + "language_loss": 0.79812628, + "learning_rate": 3.173647680842262e-06, + "loss": 0.81983352, + "num_input_tokens_seen": 114826505, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.859375, + "step": 5345, + "time_per_iteration": 5.33889365196228 + }, + { + "auxiliary_loss_clip": 0.01130545, + "auxiliary_loss_mlp": 0.01036519, + "balance_loss_clip": 1.02149296, + "balance_loss_mlp": 1.04473424, + "epoch": 0.3214189087629641, + "flos": 27015992668800.0, + "grad_norm": 1.8810220564208493, + "language_loss": 0.83404821, + "learning_rate": 3.1733323057709384e-06, + "loss": 0.85571885, + "num_input_tokens_seen": 114846140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.859375, + "step": 5346, + "time_per_iteration": 2.500577688217163 + }, + { + "auxiliary_loss_clip": 0.01131977, + "auxiliary_loss_mlp": 0.0103944, + "balance_loss_clip": 1.02362108, + "balance_loss_mlp": 1.04492784, + "epoch": 0.32147903201563205, + "flos": 23148269953920.0, + "grad_norm": 1.4095386913443633, + "language_loss": 0.81571388, + "learning_rate": 3.1730168862067366e-06, + "loss": 0.83742809, + "num_input_tokens_seen": 114866660, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.87109375, + "step": 5347, + "time_per_iteration": 2.4491653442382812 + }, + { + "auxiliary_loss_clip": 0.01130206, + "auxiliary_loss_mlp": 0.01039058, + "balance_loss_clip": 1.02332854, + "balance_loss_mlp": 1.04715562, + "epoch": 0.3215391552683, + "flos": 16580747243520.0, + "grad_norm": 1.9965712334987884, + "language_loss": 0.79898697, + "learning_rate": 3.1727014221616164e-06, + "loss": 0.82067955, + "num_input_tokens_seen": 114882820, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5348, + "time_per_iteration": 2.471261501312256 + }, + { + "auxiliary_loss_clip": 0.01132661, + "auxiliary_loss_mlp": 0.01047853, + "balance_loss_clip": 1.03262997, + "balance_loss_mlp": 1.04691792, + "epoch": 0.321599278520968, + "flos": 17821820010240.0, + "grad_norm": 1.9690807455187813, + "language_loss": 0.8506968, + "learning_rate": 3.172385913647542e-06, + "loss": 0.87250197, + "num_input_tokens_seen": 114900745, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5349, + "time_per_iteration": 2.4376416206359863 + }, + { + "auxiliary_loss_clip": 0.01130553, + "auxiliary_loss_mlp": 0.01037186, + "balance_loss_clip": 1.02215409, + "balance_loss_mlp": 1.04589188, + "epoch": 0.32165940177363594, + "flos": 16251769555200.0, + "grad_norm": 1.7092259574450879, + "language_loss": 0.80862331, + "learning_rate": 3.172070360676475e-06, + "loss": 0.83030069, + "num_input_tokens_seen": 114917940, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84765625, + "step": 5350, + "time_per_iteration": 2.463998794555664 + }, + { + "auxiliary_loss_clip": 0.01129559, + "auxiliary_loss_mlp": 0.01040074, + "balance_loss_clip": 1.02545869, + "balance_loss_mlp": 1.04548049, + "epoch": 0.3217195250263039, + "flos": 27599900158080.0, + "grad_norm": 1.7709203173786705, + "language_loss": 0.79856229, + "learning_rate": 3.1717547632603828e-06, + "loss": 0.82025862, + "num_input_tokens_seen": 114937735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83984375, + "step": 5351, + "time_per_iteration": 2.5017340183258057 + }, + { + "auxiliary_loss_clip": 0.01129171, + "auxiliary_loss_mlp": 0.01040328, + "balance_loss_clip": 1.02396047, + "balance_loss_mlp": 1.04505897, + "epoch": 0.3217796482789719, + "flos": 21470595373440.0, + "grad_norm": 1.701097630272038, + "language_loss": 0.75491166, + "learning_rate": 3.1714391214112326e-06, + "loss": 0.77660662, + "num_input_tokens_seen": 114956630, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5352, + "time_per_iteration": 2.4916653633117676 + }, + { + "auxiliary_loss_clip": 0.01132153, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02179837, + "balance_loss_mlp": 1.0472436, + "epoch": 0.32183977153163984, + "flos": 21215593745280.0, + "grad_norm": 1.8428416092094815, + "language_loss": 0.8174473, + "learning_rate": 3.1711234351409933e-06, + "loss": 0.83915108, + "num_input_tokens_seen": 114976470, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8515625, + "step": 5353, + "time_per_iteration": 2.4554946422576904 + }, + { + "auxiliary_loss_clip": 0.01127699, + "auxiliary_loss_mlp": 0.01037405, + "balance_loss_clip": 1.02147865, + "balance_loss_mlp": 1.04577875, + "epoch": 0.3218998947843078, + "flos": 24608182331520.0, + "grad_norm": 1.533417142425662, + "language_loss": 0.73054826, + "learning_rate": 3.1708077044616365e-06, + "loss": 0.75219929, + "num_input_tokens_seen": 114996710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5354, + "time_per_iteration": 2.521679639816284 + }, + { + "auxiliary_loss_clip": 0.01129194, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.01830053, + "balance_loss_mlp": 1.04482782, + "epoch": 0.3219600180369758, + "flos": 22270577126400.0, + "grad_norm": 1.5056594732405602, + "language_loss": 0.8349731, + "learning_rate": 3.1704919293851334e-06, + "loss": 0.8565954, + "num_input_tokens_seen": 115015775, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84375, + "step": 5355, + "time_per_iteration": 2.4590871334075928 + }, + { + "auxiliary_loss_clip": 0.0113427, + "auxiliary_loss_mlp": 0.01045552, + "balance_loss_clip": 1.0299834, + "balance_loss_mlp": 1.04840243, + "epoch": 0.3220201412896438, + "flos": 14939126939520.0, + "grad_norm": 2.2450583198173737, + "language_loss": 0.71577442, + "learning_rate": 3.1701761099234597e-06, + "loss": 0.73757267, + "num_input_tokens_seen": 115034265, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.859375, + "step": 5356, + "time_per_iteration": 2.4499382972717285 + }, + { + "auxiliary_loss_clip": 0.01137452, + "auxiliary_loss_mlp": 0.0103626, + "balance_loss_clip": 1.0196538, + "balance_loss_mlp": 1.04720378, + "epoch": 0.32208026454231176, + "flos": 22667389649280.0, + "grad_norm": 2.5072162620412968, + "language_loss": 0.68480343, + "learning_rate": 3.1698602460885903e-06, + "loss": 0.70654052, + "num_input_tokens_seen": 115051945, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.90234375, + "step": 5357, + "time_per_iteration": 2.449125289916992 + }, + { + "auxiliary_loss_clip": 0.01042111, + "auxiliary_loss_mlp": 0.01002103, + "balance_loss_clip": 1.00029111, + "balance_loss_mlp": 1.01435876, + "epoch": 0.3221403877949797, + "flos": 64605130053120.0, + "grad_norm": 0.7023861387911429, + "language_loss": 0.58256829, + "learning_rate": 3.1695443378925035e-06, + "loss": 0.60301042, + "num_input_tokens_seen": 115119090, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.27734375, + "step": 5358, + "time_per_iteration": 3.1561930179595947 + }, + { + "auxiliary_loss_clip": 0.01130123, + "auxiliary_loss_mlp": 0.01041349, + "balance_loss_clip": 1.02506542, + "balance_loss_mlp": 1.04423356, + "epoch": 0.3222005110476477, + "flos": 20157019004160.0, + "grad_norm": 5.918956850418863, + "language_loss": 0.83524048, + "learning_rate": 3.1692283853471777e-06, + "loss": 0.85695517, + "num_input_tokens_seen": 115137755, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.859375, + "step": 5359, + "time_per_iteration": 2.4850337505340576 + }, + { + "auxiliary_loss_clip": 0.01132117, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.019122, + "balance_loss_mlp": 1.04514802, + "epoch": 0.32226063430031565, + "flos": 22674177319680.0, + "grad_norm": 1.5557598040672038, + "language_loss": 0.79817981, + "learning_rate": 3.168912388464595e-06, + "loss": 0.81984174, + "num_input_tokens_seen": 115158150, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5360, + "time_per_iteration": 2.476698637008667 + }, + { + "auxiliary_loss_clip": 0.01040711, + "auxiliary_loss_mlp": 0.00999439, + "balance_loss_clip": 0.99754351, + "balance_loss_mlp": 1.01316833, + "epoch": 0.3223207575529836, + "flos": 63828525075840.0, + "grad_norm": 0.750004294413456, + "language_loss": 0.5697335, + "learning_rate": 3.168596347256737e-06, + "loss": 0.59013498, + "num_input_tokens_seen": 115212755, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.27539062, + "step": 5361, + "time_per_iteration": 2.933368444442749 + }, + { + "auxiliary_loss_clip": 0.01129938, + "auxiliary_loss_mlp": 0.01039744, + "balance_loss_clip": 1.02452111, + "balance_loss_mlp": 1.04625082, + "epoch": 0.3223808808056516, + "flos": 26870123537280.0, + "grad_norm": 1.730134050345621, + "language_loss": 0.71349204, + "learning_rate": 3.168280261735588e-06, + "loss": 0.73518884, + "num_input_tokens_seen": 115233090, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5362, + "time_per_iteration": 2.508444309234619 + }, + { + "auxiliary_loss_clip": 0.0113054, + "auxiliary_loss_mlp": 0.01040003, + "balance_loss_clip": 1.02606201, + "balance_loss_mlp": 1.04685211, + "epoch": 0.32244100405831955, + "flos": 26761350176640.0, + "grad_norm": 1.6566995758494631, + "language_loss": 0.74008292, + "learning_rate": 3.167964131913135e-06, + "loss": 0.76178837, + "num_input_tokens_seen": 115252645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8359375, + "step": 5363, + "time_per_iteration": 2.530428409576416 + }, + { + "auxiliary_loss_clip": 0.01134592, + "auxiliary_loss_mlp": 0.01040493, + "balance_loss_clip": 1.02481735, + "balance_loss_mlp": 1.04535139, + "epoch": 0.3225011273109875, + "flos": 23803029020160.0, + "grad_norm": 2.5112112412179624, + "language_loss": 0.77012563, + "learning_rate": 3.167647957801365e-06, + "loss": 0.79187649, + "num_input_tokens_seen": 115269085, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.890625, + "step": 5364, + "time_per_iteration": 2.475532054901123 + }, + { + "auxiliary_loss_clip": 0.01128803, + "auxiliary_loss_mlp": 0.01043179, + "balance_loss_clip": 1.02747917, + "balance_loss_mlp": 1.04455853, + "epoch": 0.3225612505636555, + "flos": 17274505501440.0, + "grad_norm": 2.1198351151285992, + "language_loss": 0.77043676, + "learning_rate": 3.1673317394122672e-06, + "loss": 0.79215652, + "num_input_tokens_seen": 115286470, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5365, + "time_per_iteration": 2.4466004371643066 + }, + { + "auxiliary_loss_clip": 0.01133051, + "auxiliary_loss_mlp": 0.01049625, + "balance_loss_clip": 1.03444982, + "balance_loss_mlp": 1.04861832, + "epoch": 0.32262137381632344, + "flos": 23366247638400.0, + "grad_norm": 1.5183743876703555, + "language_loss": 0.76853883, + "learning_rate": 3.1670154767578333e-06, + "loss": 0.79036558, + "num_input_tokens_seen": 115307000, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5366, + "time_per_iteration": 2.4716286659240723 + }, + { + "auxiliary_loss_clip": 0.01129975, + "auxiliary_loss_mlp": 0.01042819, + "balance_loss_clip": 1.02767324, + "balance_loss_mlp": 1.04463363, + "epoch": 0.3226814970689914, + "flos": 23258803080960.0, + "grad_norm": 1.6325357922005805, + "language_loss": 0.7200039, + "learning_rate": 3.166699169850055e-06, + "loss": 0.74173188, + "num_input_tokens_seen": 115325925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5367, + "time_per_iteration": 2.4936037063598633 + }, + { + "auxiliary_loss_clip": 0.01125689, + "auxiliary_loss_mlp": 0.01042014, + "balance_loss_clip": 1.02759588, + "balance_loss_mlp": 1.04335558, + "epoch": 0.32274162032165943, + "flos": 16395196561920.0, + "grad_norm": 1.8801069032327764, + "language_loss": 0.7456941, + "learning_rate": 3.1663828187009274e-06, + "loss": 0.76737112, + "num_input_tokens_seen": 115343705, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5368, + "time_per_iteration": 2.436897039413452 + }, + { + "auxiliary_loss_clip": 0.01125271, + "auxiliary_loss_mlp": 0.0104042, + "balance_loss_clip": 1.02592432, + "balance_loss_mlp": 1.04390144, + "epoch": 0.3228017435743274, + "flos": 27855081354240.0, + "grad_norm": 1.5502047591083525, + "language_loss": 0.79212499, + "learning_rate": 3.1660664233224467e-06, + "loss": 0.81378186, + "num_input_tokens_seen": 115364170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5369, + "time_per_iteration": 2.516191244125366 + }, + { + "auxiliary_loss_clip": 0.01124886, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02042747, + "balance_loss_mlp": 1.04432988, + "epoch": 0.32286186682699536, + "flos": 19608770741760.0, + "grad_norm": 1.8370527927944635, + "language_loss": 0.83173579, + "learning_rate": 3.16574998372661e-06, + "loss": 0.85333049, + "num_input_tokens_seen": 115382495, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 5370, + "time_per_iteration": 2.423494338989258 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01038158, + "balance_loss_clip": 1.02367377, + "balance_loss_mlp": 1.04524064, + "epoch": 0.3229219900796633, + "flos": 24134017870080.0, + "grad_norm": 1.743608915284185, + "language_loss": 0.83372939, + "learning_rate": 3.1654334999254177e-06, + "loss": 0.85539752, + "num_input_tokens_seen": 115399450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5371, + "time_per_iteration": 2.481677532196045 + }, + { + "auxiliary_loss_clip": 0.01131779, + "auxiliary_loss_mlp": 0.01048903, + "balance_loss_clip": 1.0323211, + "balance_loss_mlp": 1.04514813, + "epoch": 0.3229821133323313, + "flos": 17748705876480.0, + "grad_norm": 2.043238736788368, + "language_loss": 0.88539696, + "learning_rate": 3.1651169719308695e-06, + "loss": 0.90720367, + "num_input_tokens_seen": 115417700, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8671875, + "step": 5372, + "time_per_iteration": 2.434785842895508 + }, + { + "auxiliary_loss_clip": 0.01128015, + "auxiliary_loss_mlp": 0.01045541, + "balance_loss_clip": 1.03011537, + "balance_loss_mlp": 1.04532862, + "epoch": 0.32304223658499925, + "flos": 22346025644160.0, + "grad_norm": 1.9701661898720624, + "language_loss": 0.73064935, + "learning_rate": 3.1648003997549694e-06, + "loss": 0.75238496, + "num_input_tokens_seen": 115435840, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5373, + "time_per_iteration": 2.509288787841797 + }, + { + "auxiliary_loss_clip": 0.01126431, + "auxiliary_loss_mlp": 0.01036263, + "balance_loss_clip": 1.0217371, + "balance_loss_mlp": 1.04496944, + "epoch": 0.3231023598376672, + "flos": 18478302929280.0, + "grad_norm": 2.118108535598075, + "language_loss": 0.81306481, + "learning_rate": 3.1644837834097214e-06, + "loss": 0.83469176, + "num_input_tokens_seen": 115454210, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5374, + "time_per_iteration": 2.43719744682312 + }, + { + "auxiliary_loss_clip": 0.01122361, + "auxiliary_loss_mlp": 0.01036065, + "balance_loss_clip": 1.02135515, + "balance_loss_mlp": 1.04158425, + "epoch": 0.3231624830903352, + "flos": 27636313570560.0, + "grad_norm": 2.0253542373007223, + "language_loss": 0.87507123, + "learning_rate": 3.1641671229071317e-06, + "loss": 0.89665556, + "num_input_tokens_seen": 115471785, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80859375, + "step": 5375, + "time_per_iteration": 2.5192272663116455 + }, + { + "auxiliary_loss_clip": 0.0112955, + "auxiliary_loss_mlp": 0.01037551, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04312396, + "epoch": 0.32322260634300315, + "flos": 21726423014400.0, + "grad_norm": 1.8491566525281582, + "language_loss": 0.75873786, + "learning_rate": 3.1638504182592076e-06, + "loss": 0.78040886, + "num_input_tokens_seen": 115491405, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8671875, + "step": 5376, + "time_per_iteration": 2.463103771209717 + }, + { + "auxiliary_loss_clip": 0.01123814, + "auxiliary_loss_mlp": 0.0103315, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.04269242, + "epoch": 0.3232827295956711, + "flos": 22637656166400.0, + "grad_norm": 1.5890241026671568, + "language_loss": 0.67173672, + "learning_rate": 3.1635336694779594e-06, + "loss": 0.69330645, + "num_input_tokens_seen": 115511555, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5377, + "time_per_iteration": 2.5341343879699707 + }, + { + "auxiliary_loss_clip": 0.01127281, + "auxiliary_loss_mlp": 0.01046076, + "balance_loss_clip": 1.02922571, + "balance_loss_mlp": 1.04433763, + "epoch": 0.3233428528483391, + "flos": 26322593546880.0, + "grad_norm": 1.5071806558198568, + "language_loss": 0.7231617, + "learning_rate": 3.1632168765753982e-06, + "loss": 0.74489522, + "num_input_tokens_seen": 115532860, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5378, + "time_per_iteration": 2.4838621616363525 + }, + { + "auxiliary_loss_clip": 0.01123972, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.0174818, + "balance_loss_mlp": 1.04056036, + "epoch": 0.32340297610100704, + "flos": 28585217111040.0, + "grad_norm": 1.9527598104570445, + "language_loss": 0.82083338, + "learning_rate": 3.1629000395635357e-06, + "loss": 0.84239388, + "num_input_tokens_seen": 115553850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5379, + "time_per_iteration": 2.5433154106140137 + }, + { + "auxiliary_loss_clip": 0.01127314, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.04230165, + "epoch": 0.323463099353675, + "flos": 30773792787840.0, + "grad_norm": 1.9705325619840932, + "language_loss": 0.78379917, + "learning_rate": 3.162583158454388e-06, + "loss": 0.80539739, + "num_input_tokens_seen": 115575530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8515625, + "step": 5380, + "time_per_iteration": 2.5306878089904785 + }, + { + "auxiliary_loss_clip": 0.0112988, + "auxiliary_loss_mlp": 0.01036402, + "balance_loss_clip": 1.02207887, + "balance_loss_mlp": 1.04637241, + "epoch": 0.32352322260634303, + "flos": 25228610974080.0, + "grad_norm": 1.5992937517204726, + "language_loss": 0.76871669, + "learning_rate": 3.1622662332599697e-06, + "loss": 0.79037952, + "num_input_tokens_seen": 115594885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8359375, + "step": 5381, + "time_per_iteration": 2.545740842819214 + }, + { + "auxiliary_loss_clip": 0.0112089, + "auxiliary_loss_mlp": 0.01035663, + "balance_loss_clip": 1.02228761, + "balance_loss_mlp": 1.04212475, + "epoch": 0.323583345859011, + "flos": 23330480670720.0, + "grad_norm": 1.912812068704809, + "language_loss": 0.71864545, + "learning_rate": 3.1619492639922998e-06, + "loss": 0.74021101, + "num_input_tokens_seen": 115614080, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5382, + "time_per_iteration": 2.488344430923462 + }, + { + "auxiliary_loss_clip": 0.01127382, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.0424943, + "epoch": 0.32364346911167896, + "flos": 26207499392640.0, + "grad_norm": 2.8562908675977754, + "language_loss": 0.70752692, + "learning_rate": 3.1616322506633964e-06, + "loss": 0.72914088, + "num_input_tokens_seen": 115632820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8515625, + "step": 5383, + "time_per_iteration": 2.5236711502075195 + }, + { + "auxiliary_loss_clip": 0.01123876, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.0259378, + "balance_loss_mlp": 1.0442363, + "epoch": 0.3237035923643469, + "flos": 23695764030720.0, + "grad_norm": 2.094388352971362, + "language_loss": 0.78742963, + "learning_rate": 3.161315193285283e-06, + "loss": 0.80905938, + "num_input_tokens_seen": 115652860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 5384, + "time_per_iteration": 2.4685723781585693 + }, + { + "auxiliary_loss_clip": 0.0112912, + "auxiliary_loss_mlp": 0.01038116, + "balance_loss_clip": 1.0222249, + "balance_loss_mlp": 1.04443073, + "epoch": 0.3237637156170149, + "flos": 14428728633600.0, + "grad_norm": 2.069351852322995, + "language_loss": 0.74553645, + "learning_rate": 3.16099809186998e-06, + "loss": 0.76720881, + "num_input_tokens_seen": 115670940, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84765625, + "step": 5385, + "time_per_iteration": 2.46968936920166 + }, + { + "auxiliary_loss_clip": 0.01127931, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02101183, + "balance_loss_mlp": 1.04604125, + "epoch": 0.32382383886968286, + "flos": 31062981185280.0, + "grad_norm": 1.8196037573439483, + "language_loss": 0.72068852, + "learning_rate": 3.1606809464295145e-06, + "loss": 0.74232352, + "num_input_tokens_seen": 115691155, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5386, + "time_per_iteration": 2.559480667114258 + }, + { + "auxiliary_loss_clip": 0.01128094, + "auxiliary_loss_mlp": 0.01036855, + "balance_loss_clip": 1.02119136, + "balance_loss_mlp": 1.04176617, + "epoch": 0.3238839621223508, + "flos": 23256935573760.0, + "grad_norm": 1.8525904099951498, + "language_loss": 0.94343817, + "learning_rate": 3.1603637569759095e-06, + "loss": 0.96508765, + "num_input_tokens_seen": 115710340, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.86328125, + "step": 5387, + "time_per_iteration": 5.378048896789551 + }, + { + "auxiliary_loss_clip": 0.0112709, + "auxiliary_loss_mlp": 0.01037979, + "balance_loss_clip": 1.02227962, + "balance_loss_mlp": 1.04373097, + "epoch": 0.3239440853750188, + "flos": 22964658606720.0, + "grad_norm": 2.7647642243142747, + "language_loss": 0.77544433, + "learning_rate": 3.1600465235211956e-06, + "loss": 0.79709506, + "num_input_tokens_seen": 115726745, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 5388, + "time_per_iteration": 2.4804563522338867 + }, + { + "auxiliary_loss_clip": 0.0112736, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01554048, + "balance_loss_mlp": 1.04277194, + "epoch": 0.32400420862768675, + "flos": 36246614653440.0, + "grad_norm": 2.092216766577811, + "language_loss": 0.71867704, + "learning_rate": 3.1597292460774006e-06, + "loss": 0.74025786, + "num_input_tokens_seen": 115749385, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.84375, + "step": 5389, + "time_per_iteration": 2.5753331184387207 + }, + { + "auxiliary_loss_clip": 0.01128194, + "auxiliary_loss_mlp": 0.0103865, + "balance_loss_clip": 1.0233078, + "balance_loss_mlp": 1.04672205, + "epoch": 0.3240643318803547, + "flos": 21616500418560.0, + "grad_norm": 2.0374979548818497, + "language_loss": 0.80883735, + "learning_rate": 3.159411924656557e-06, + "loss": 0.83050573, + "num_input_tokens_seen": 115768105, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 5390, + "time_per_iteration": 2.479557991027832 + }, + { + "auxiliary_loss_clip": 0.01130573, + "auxiliary_loss_mlp": 0.01044181, + "balance_loss_clip": 1.02911294, + "balance_loss_mlp": 1.04798484, + "epoch": 0.3241244551330227, + "flos": 23295611543040.0, + "grad_norm": 2.0682587448682384, + "language_loss": 0.72983515, + "learning_rate": 3.1590945592706967e-06, + "loss": 0.75158268, + "num_input_tokens_seen": 115787340, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5391, + "time_per_iteration": 2.4689247608184814 + }, + { + "auxiliary_loss_clip": 0.01125432, + "auxiliary_loss_mlp": 0.01041396, + "balance_loss_clip": 1.02728176, + "balance_loss_mlp": 1.04465139, + "epoch": 0.32418457838569065, + "flos": 14097236993280.0, + "grad_norm": 1.6356435132494873, + "language_loss": 0.77357036, + "learning_rate": 3.158777149931855e-06, + "loss": 0.79523861, + "num_input_tokens_seen": 115805565, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.80859375, + "step": 5392, + "time_per_iteration": 2.4942643642425537 + }, + { + "auxiliary_loss_clip": 0.01129141, + "auxiliary_loss_mlp": 0.01040265, + "balance_loss_clip": 1.02454162, + "balance_loss_mlp": 1.04454243, + "epoch": 0.3242447016383586, + "flos": 29752672953600.0, + "grad_norm": 2.035025217222515, + "language_loss": 0.62445068, + "learning_rate": 3.158459696652067e-06, + "loss": 0.64614469, + "num_input_tokens_seen": 115826725, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.84375, + "step": 5393, + "time_per_iteration": 2.5294058322906494 + }, + { + "auxiliary_loss_clip": 0.01127178, + "auxiliary_loss_mlp": 0.01037592, + "balance_loss_clip": 1.02292883, + "balance_loss_mlp": 1.0455395, + "epoch": 0.3243048248910266, + "flos": 24351205455360.0, + "grad_norm": 1.541011228274946, + "language_loss": 0.8250984, + "learning_rate": 3.158142199443371e-06, + "loss": 0.84674609, + "num_input_tokens_seen": 115846955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5394, + "time_per_iteration": 2.5204803943634033 + }, + { + "auxiliary_loss_clip": 0.01125244, + "auxiliary_loss_mlp": 0.0104429, + "balance_loss_clip": 1.03089094, + "balance_loss_mlp": 1.04596353, + "epoch": 0.3243649481436946, + "flos": 24353037048960.0, + "grad_norm": 1.8431569167236632, + "language_loss": 0.81585443, + "learning_rate": 3.1578246583178076e-06, + "loss": 0.83754981, + "num_input_tokens_seen": 115865975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.79296875, + "step": 5395, + "time_per_iteration": 2.481722116470337 + }, + { + "auxiliary_loss_clip": 0.01126361, + "auxiliary_loss_mlp": 0.01042015, + "balance_loss_clip": 1.02844906, + "balance_loss_mlp": 1.04834461, + "epoch": 0.32442507139636256, + "flos": 22925228451840.0, + "grad_norm": 3.644291671680186, + "language_loss": 0.83163011, + "learning_rate": 3.157507073287417e-06, + "loss": 0.8533138, + "num_input_tokens_seen": 115884950, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5396, + "time_per_iteration": 2.5014734268188477 + }, + { + "auxiliary_loss_clip": 0.01133358, + "auxiliary_loss_mlp": 0.01039347, + "balance_loss_clip": 1.02392137, + "balance_loss_mlp": 1.04687238, + "epoch": 0.32448519464903053, + "flos": 22200192426240.0, + "grad_norm": 1.8637158339296453, + "language_loss": 0.75718713, + "learning_rate": 3.1571894443642414e-06, + "loss": 0.77891421, + "num_input_tokens_seen": 115904170, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8671875, + "step": 5397, + "time_per_iteration": 2.475958824157715 + }, + { + "auxiliary_loss_clip": 0.01125578, + "auxiliary_loss_mlp": 0.0103396, + "balance_loss_clip": 1.01953566, + "balance_loss_mlp": 1.04540443, + "epoch": 0.3245453179016985, + "flos": 18838450644480.0, + "grad_norm": 2.571224523552484, + "language_loss": 0.66835862, + "learning_rate": 3.1568717715603263e-06, + "loss": 0.68995398, + "num_input_tokens_seen": 115919255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5398, + "time_per_iteration": 2.447065830230713 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.0183022, + "balance_loss_mlp": 1.04326463, + "epoch": 0.32460544115436646, + "flos": 21178390233600.0, + "grad_norm": 1.4279244162742584, + "language_loss": 0.73232102, + "learning_rate": 3.156554054887718e-06, + "loss": 0.75389397, + "num_input_tokens_seen": 115938535, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8203125, + "step": 5399, + "time_per_iteration": 2.466137409210205 + }, + { + "auxiliary_loss_clip": 0.01129831, + "auxiliary_loss_mlp": 0.01035026, + "balance_loss_clip": 1.02016079, + "balance_loss_mlp": 1.04749155, + "epoch": 0.3246655644070344, + "flos": 21981137333760.0, + "grad_norm": 2.110147681467196, + "language_loss": 0.71391356, + "learning_rate": 3.1562362943584645e-06, + "loss": 0.73556215, + "num_input_tokens_seen": 115955005, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5400, + "time_per_iteration": 2.484243631362915 + }, + { + "auxiliary_loss_clip": 0.01128373, + "auxiliary_loss_mlp": 0.01035494, + "balance_loss_clip": 1.02108145, + "balance_loss_mlp": 1.04439175, + "epoch": 0.3247256876597024, + "flos": 32159729105280.0, + "grad_norm": 3.048924003265154, + "language_loss": 0.79583031, + "learning_rate": 3.155918489984614e-06, + "loss": 0.81746894, + "num_input_tokens_seen": 115975305, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5401, + "time_per_iteration": 2.5695505142211914 + }, + { + "auxiliary_loss_clip": 0.01130508, + "auxiliary_loss_mlp": 0.01042722, + "balance_loss_clip": 1.02642608, + "balance_loss_mlp": 1.04700303, + "epoch": 0.32478581091237035, + "flos": 20997544233600.0, + "grad_norm": 1.4209306386542333, + "language_loss": 0.87675726, + "learning_rate": 3.1556006417782196e-06, + "loss": 0.89848959, + "num_input_tokens_seen": 115994810, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8359375, + "step": 5402, + "time_per_iteration": 2.4811201095581055 + }, + { + "auxiliary_loss_clip": 0.01122645, + "auxiliary_loss_mlp": 0.01036689, + "balance_loss_clip": 1.02249742, + "balance_loss_mlp": 1.04369164, + "epoch": 0.3248459341650383, + "flos": 17924990849280.0, + "grad_norm": 1.934597728175988, + "language_loss": 0.84513289, + "learning_rate": 3.155282749751332e-06, + "loss": 0.86672628, + "num_input_tokens_seen": 116011095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5403, + "time_per_iteration": 2.418501377105713 + }, + { + "auxiliary_loss_clip": 0.01129275, + "auxiliary_loss_mlp": 0.01041866, + "balance_loss_clip": 1.02852631, + "balance_loss_mlp": 1.05024314, + "epoch": 0.3249060574177063, + "flos": 24535606901760.0, + "grad_norm": 2.0001546098828955, + "language_loss": 0.87642342, + "learning_rate": 3.154964813916007e-06, + "loss": 0.89813483, + "num_input_tokens_seen": 116028805, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7890625, + "step": 5404, + "time_per_iteration": 2.5094971656799316 + }, + { + "auxiliary_loss_clip": 0.01125879, + "auxiliary_loss_mlp": 0.01038939, + "balance_loss_clip": 1.02413273, + "balance_loss_mlp": 1.04579973, + "epoch": 0.32496618067037425, + "flos": 25994765093760.0, + "grad_norm": 1.6336968005079966, + "language_loss": 0.72491479, + "learning_rate": 3.1546468342843008e-06, + "loss": 0.74656296, + "num_input_tokens_seen": 116047765, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5405, + "time_per_iteration": 2.4927978515625 + }, + { + "auxiliary_loss_clip": 0.01125757, + "auxiliary_loss_mlp": 0.01035734, + "balance_loss_clip": 1.02147698, + "balance_loss_mlp": 1.04514825, + "epoch": 0.3250263039230422, + "flos": 19573757959680.0, + "grad_norm": 1.8637721662214948, + "language_loss": 0.83356953, + "learning_rate": 3.1543288108682707e-06, + "loss": 0.85518444, + "num_input_tokens_seen": 116068385, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80859375, + "step": 5406, + "time_per_iteration": 2.534508228302002 + }, + { + "auxiliary_loss_clip": 0.01127659, + "auxiliary_loss_mlp": 0.01036296, + "balance_loss_clip": 1.02241969, + "balance_loss_mlp": 1.0469048, + "epoch": 0.3250864271757102, + "flos": 16763640318720.0, + "grad_norm": 1.836635199790601, + "language_loss": 0.8826412, + "learning_rate": 3.1540107436799764e-06, + "loss": 0.90428072, + "num_input_tokens_seen": 116085350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5407, + "time_per_iteration": 2.4199326038360596 + }, + { + "auxiliary_loss_clip": 0.01127405, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.04602861, + "epoch": 0.3251465504283782, + "flos": 27819458040960.0, + "grad_norm": 1.5140887230520799, + "language_loss": 0.69643426, + "learning_rate": 3.153692632731479e-06, + "loss": 0.71806979, + "num_input_tokens_seen": 116107560, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8125, + "step": 5408, + "time_per_iteration": 2.5646731853485107 + }, + { + "auxiliary_loss_clip": 0.01131319, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.04438102, + "epoch": 0.32520667368104617, + "flos": 19063144172160.0, + "grad_norm": 1.6429750268405912, + "language_loss": 0.77442145, + "learning_rate": 3.153374478034841e-06, + "loss": 0.79608637, + "num_input_tokens_seen": 116125980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.87109375, + "step": 5409, + "time_per_iteration": 2.450200080871582 + }, + { + "auxiliary_loss_clip": 0.01129924, + "auxiliary_loss_mlp": 0.01046371, + "balance_loss_clip": 1.03142262, + "balance_loss_mlp": 1.04331136, + "epoch": 0.32526679693371413, + "flos": 29382146208000.0, + "grad_norm": 2.3862040562488716, + "language_loss": 0.83582234, + "learning_rate": 3.1530562796021285e-06, + "loss": 0.85758531, + "num_input_tokens_seen": 116146530, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8671875, + "step": 5410, + "time_per_iteration": 2.5161662101745605 + }, + { + "auxiliary_loss_clip": 0.01121858, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02089429, + "balance_loss_mlp": 1.04224813, + "epoch": 0.3253269201863821, + "flos": 20704513080960.0, + "grad_norm": 1.5577179591930796, + "language_loss": 0.71270931, + "learning_rate": 3.152738037445405e-06, + "loss": 0.73427641, + "num_input_tokens_seen": 116165695, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5411, + "time_per_iteration": 2.4465057849884033 + }, + { + "auxiliary_loss_clip": 0.01125475, + "auxiliary_loss_mlp": 0.01039417, + "balance_loss_clip": 1.02544606, + "balance_loss_mlp": 1.04381669, + "epoch": 0.32538704343905006, + "flos": 29094142959360.0, + "grad_norm": 1.6024997274503978, + "language_loss": 0.83103073, + "learning_rate": 3.1524197515767403e-06, + "loss": 0.85267961, + "num_input_tokens_seen": 116185375, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.81640625, + "step": 5412, + "time_per_iteration": 2.512471914291382 + }, + { + "auxiliary_loss_clip": 0.01129762, + "auxiliary_loss_mlp": 0.01035505, + "balance_loss_clip": 1.01963782, + "balance_loss_mlp": 1.04417348, + "epoch": 0.325447166691718, + "flos": 24676124906880.0, + "grad_norm": 2.3149031646834577, + "language_loss": 0.80794364, + "learning_rate": 3.152101422008203e-06, + "loss": 0.82959628, + "num_input_tokens_seen": 116204335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5413, + "time_per_iteration": 2.483309030532837 + }, + { + "auxiliary_loss_clip": 0.01128818, + "auxiliary_loss_mlp": 0.01042957, + "balance_loss_clip": 1.02723312, + "balance_loss_mlp": 1.04606462, + "epoch": 0.325507289944386, + "flos": 21543134889600.0, + "grad_norm": 1.5892127721025033, + "language_loss": 0.76887989, + "learning_rate": 3.151783048751864e-06, + "loss": 0.79059768, + "num_input_tokens_seen": 116222840, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5414, + "time_per_iteration": 2.4696640968322754 + }, + { + "auxiliary_loss_clip": 0.01039619, + "auxiliary_loss_mlp": 0.01008091, + "balance_loss_clip": 1.00601661, + "balance_loss_mlp": 1.01271892, + "epoch": 0.32556741319705396, + "flos": 71518722347520.0, + "grad_norm": 0.9084647328862615, + "language_loss": 0.64009887, + "learning_rate": 3.1514646318197965e-06, + "loss": 0.66057593, + "num_input_tokens_seen": 116274940, + "router_z_loss_clip": 0.02075195, + "router_z_loss_mlp": 0.26953125, + "step": 5415, + "time_per_iteration": 2.982389450073242 + }, + { + "auxiliary_loss_clip": 0.01124624, + "auxiliary_loss_mlp": 0.01036817, + "balance_loss_clip": 1.02214265, + "balance_loss_mlp": 1.04286838, + "epoch": 0.3256275364497219, + "flos": 23732428838400.0, + "grad_norm": 2.942597496869342, + "language_loss": 0.74265057, + "learning_rate": 3.151146171224075e-06, + "loss": 0.764265, + "num_input_tokens_seen": 116297300, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.81640625, + "step": 5416, + "time_per_iteration": 2.526956558227539 + }, + { + "auxiliary_loss_clip": 0.01039656, + "auxiliary_loss_mlp": 0.01005548, + "balance_loss_clip": 1.00335431, + "balance_loss_mlp": 1.01254702, + "epoch": 0.3256876597023899, + "flos": 67289199891840.0, + "grad_norm": 0.7736939008633222, + "language_loss": 0.57947183, + "learning_rate": 3.1508276669767757e-06, + "loss": 0.59992385, + "num_input_tokens_seen": 116362370, + "router_z_loss_clip": 0.02197266, + "router_z_loss_mlp": 0.26953125, + "step": 5417, + "time_per_iteration": 3.1500296592712402 + }, + { + "auxiliary_loss_clip": 0.01038219, + "auxiliary_loss_mlp": 0.01002181, + "balance_loss_clip": 1.0002141, + "balance_loss_mlp": 1.01140058, + "epoch": 0.32574778295505785, + "flos": 71282323964160.0, + "grad_norm": 0.9133944403169288, + "language_loss": 0.63476181, + "learning_rate": 3.150509119089975e-06, + "loss": 0.65516579, + "num_input_tokens_seen": 116430365, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.26953125, + "step": 5418, + "time_per_iteration": 3.1724026203155518 + }, + { + "auxiliary_loss_clip": 0.01125951, + "auxiliary_loss_mlp": 0.01041707, + "balance_loss_clip": 1.02739, + "balance_loss_mlp": 1.0441196, + "epoch": 0.3258079062077258, + "flos": 20776370238720.0, + "grad_norm": 3.240595355482155, + "language_loss": 0.69061959, + "learning_rate": 3.1501905275757537e-06, + "loss": 0.71229619, + "num_input_tokens_seen": 116447525, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5419, + "time_per_iteration": 2.4643847942352295 + }, + { + "auxiliary_loss_clip": 0.01125895, + "auxiliary_loss_mlp": 0.01035053, + "balance_loss_clip": 1.01951957, + "balance_loss_mlp": 1.04326844, + "epoch": 0.3258680294603938, + "flos": 22235456603520.0, + "grad_norm": 2.1209544014848443, + "language_loss": 0.77064359, + "learning_rate": 3.1498718924461926e-06, + "loss": 0.79225302, + "num_input_tokens_seen": 116466310, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5420, + "time_per_iteration": 2.5241270065307617 + }, + { + "auxiliary_loss_clip": 0.01128645, + "auxiliary_loss_mlp": 0.01035079, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.04400003, + "epoch": 0.3259281527130618, + "flos": 26979974305920.0, + "grad_norm": 1.4823274263144444, + "language_loss": 0.80134791, + "learning_rate": 3.1495532137133736e-06, + "loss": 0.82298517, + "num_input_tokens_seen": 116487825, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.84375, + "step": 5421, + "time_per_iteration": 2.5376439094543457 + }, + { + "auxiliary_loss_clip": 0.01122338, + "auxiliary_loss_mlp": 0.01037347, + "balance_loss_clip": 1.02359045, + "balance_loss_mlp": 1.04254711, + "epoch": 0.32598827596572977, + "flos": 26214251149440.0, + "grad_norm": 1.5045024534641303, + "language_loss": 0.75446749, + "learning_rate": 3.149234491389381e-06, + "loss": 0.77606434, + "num_input_tokens_seen": 116509950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5422, + "time_per_iteration": 2.5713820457458496 + }, + { + "auxiliary_loss_clip": 0.01128336, + "auxiliary_loss_mlp": 0.01039164, + "balance_loss_clip": 1.02324986, + "balance_loss_mlp": 1.04553628, + "epoch": 0.32604839921839773, + "flos": 17639752947840.0, + "grad_norm": 2.780294141224906, + "language_loss": 0.62795889, + "learning_rate": 3.1489157254863026e-06, + "loss": 0.64963388, + "num_input_tokens_seen": 116527695, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.828125, + "step": 5423, + "time_per_iteration": 2.4667959213256836 + }, + { + "auxiliary_loss_clip": 0.01118583, + "auxiliary_loss_mlp": 0.01031258, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.04085255, + "epoch": 0.3261085224710657, + "flos": 23622721724160.0, + "grad_norm": 4.488088575635961, + "language_loss": 0.74664211, + "learning_rate": 3.148596916016224e-06, + "loss": 0.76814055, + "num_input_tokens_seen": 116547800, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 5424, + "time_per_iteration": 2.488187313079834 + }, + { + "auxiliary_loss_clip": 0.01122401, + "auxiliary_loss_mlp": 0.01035954, + "balance_loss_clip": 1.02231038, + "balance_loss_mlp": 1.04298568, + "epoch": 0.32616864572373366, + "flos": 23260455106560.0, + "grad_norm": 1.6359586167011877, + "language_loss": 0.76958472, + "learning_rate": 3.1482780629912355e-06, + "loss": 0.79116821, + "num_input_tokens_seen": 116568460, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5425, + "time_per_iteration": 2.5025157928466797 + }, + { + "auxiliary_loss_clip": 0.01127865, + "auxiliary_loss_mlp": 0.01047272, + "balance_loss_clip": 1.03051138, + "balance_loss_mlp": 1.04193544, + "epoch": 0.32622876897640163, + "flos": 25593427457280.0, + "grad_norm": 4.663874352034687, + "language_loss": 0.78857136, + "learning_rate": 3.147959166423428e-06, + "loss": 0.8103227, + "num_input_tokens_seen": 116588705, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5426, + "time_per_iteration": 2.484064817428589 + }, + { + "auxiliary_loss_clip": 0.01124966, + "auxiliary_loss_mlp": 0.01037083, + "balance_loss_clip": 1.02116871, + "balance_loss_mlp": 1.04324198, + "epoch": 0.3262888922290696, + "flos": 22418996123520.0, + "grad_norm": 1.7688447582142532, + "language_loss": 0.74363142, + "learning_rate": 3.147640226324893e-06, + "loss": 0.76525187, + "num_input_tokens_seen": 116608845, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.81640625, + "step": 5427, + "time_per_iteration": 2.4785962104797363 + }, + { + "auxiliary_loss_clip": 0.0112706, + "auxiliary_loss_mlp": 0.01043058, + "balance_loss_clip": 1.02742934, + "balance_loss_mlp": 1.04290414, + "epoch": 0.32634901548173756, + "flos": 19718908819200.0, + "grad_norm": 1.911492416062928, + "language_loss": 0.79305124, + "learning_rate": 3.1473212427077266e-06, + "loss": 0.8147524, + "num_input_tokens_seen": 116628145, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.83984375, + "step": 5428, + "time_per_iteration": 3.9864413738250732 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01041045, + "balance_loss_clip": 1.02597678, + "balance_loss_mlp": 1.04084587, + "epoch": 0.3264091387344055, + "flos": 16142924367360.0, + "grad_norm": 1.7222830625250152, + "language_loss": 0.71369523, + "learning_rate": 3.147002215584023e-06, + "loss": 0.73534036, + "num_input_tokens_seen": 116646920, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5429, + "time_per_iteration": 3.8856096267700195 + }, + { + "auxiliary_loss_clip": 0.01124973, + "auxiliary_loss_mlp": 0.01038401, + "balance_loss_clip": 1.02448976, + "balance_loss_mlp": 1.04308093, + "epoch": 0.3264692619870735, + "flos": 16399075230720.0, + "grad_norm": 1.889570703315701, + "language_loss": 0.78612322, + "learning_rate": 3.146683144965881e-06, + "loss": 0.80775696, + "num_input_tokens_seen": 116665100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8203125, + "step": 5430, + "time_per_iteration": 2.4374818801879883 + }, + { + "auxiliary_loss_clip": 0.01128219, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02077675, + "balance_loss_mlp": 1.04359281, + "epoch": 0.32652938523974145, + "flos": 22382331315840.0, + "grad_norm": 1.8594684871120744, + "language_loss": 0.83897448, + "learning_rate": 3.146364030865399e-06, + "loss": 0.86063492, + "num_input_tokens_seen": 116682205, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.84765625, + "step": 5431, + "time_per_iteration": 2.4513139724731445 + }, + { + "auxiliary_loss_clip": 0.01122027, + "auxiliary_loss_mlp": 0.01038117, + "balance_loss_clip": 1.02431297, + "balance_loss_mlp": 1.04116321, + "epoch": 0.3265895084924094, + "flos": 21908059113600.0, + "grad_norm": 1.7565110160676718, + "language_loss": 0.70459324, + "learning_rate": 3.146044873294678e-06, + "loss": 0.72619462, + "num_input_tokens_seen": 116702575, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5432, + "time_per_iteration": 2.529365301132202 + }, + { + "auxiliary_loss_clip": 0.01123519, + "auxiliary_loss_mlp": 0.01035948, + "balance_loss_clip": 1.02182746, + "balance_loss_mlp": 1.04076195, + "epoch": 0.3266496317450774, + "flos": 16067152627200.0, + "grad_norm": 1.4205622330102, + "language_loss": 0.84161848, + "learning_rate": 3.1457256722658203e-06, + "loss": 0.86321318, + "num_input_tokens_seen": 116720885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5433, + "time_per_iteration": 2.4302597045898438 + }, + { + "auxiliary_loss_clip": 0.01123612, + "auxiliary_loss_mlp": 0.01035544, + "balance_loss_clip": 1.02132881, + "balance_loss_mlp": 1.0439055, + "epoch": 0.3267097549977454, + "flos": 22528236360960.0, + "grad_norm": 1.4699213962063424, + "language_loss": 0.85906386, + "learning_rate": 3.145406427790931e-06, + "loss": 0.88065541, + "num_input_tokens_seen": 116740395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 5434, + "time_per_iteration": 2.496676445007324 + }, + { + "auxiliary_loss_clip": 0.01129105, + "auxiliary_loss_mlp": 0.01035794, + "balance_loss_clip": 1.02083361, + "balance_loss_mlp": 1.04468119, + "epoch": 0.32676987825041337, + "flos": 27270419679360.0, + "grad_norm": 1.8331918492971015, + "language_loss": 0.87817061, + "learning_rate": 3.1450871398821147e-06, + "loss": 0.89981961, + "num_input_tokens_seen": 116758870, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.84375, + "step": 5435, + "time_per_iteration": 2.51159405708313 + }, + { + "auxiliary_loss_clip": 0.0112533, + "auxiliary_loss_mlp": 0.01035758, + "balance_loss_clip": 1.02140474, + "balance_loss_mlp": 1.04326773, + "epoch": 0.32683000150308134, + "flos": 11508257433600.0, + "grad_norm": 2.5496215899058443, + "language_loss": 0.76460963, + "learning_rate": 3.144767808551479e-06, + "loss": 0.78622043, + "num_input_tokens_seen": 116773440, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5436, + "time_per_iteration": 2.43637752532959 + }, + { + "auxiliary_loss_clip": 0.01125315, + "auxiliary_loss_mlp": 0.01034854, + "balance_loss_clip": 1.02040625, + "balance_loss_mlp": 1.04435849, + "epoch": 0.3268901247557493, + "flos": 25630200005760.0, + "grad_norm": 1.5905557916714361, + "language_loss": 0.72127515, + "learning_rate": 3.144448433811134e-06, + "loss": 0.74287689, + "num_input_tokens_seen": 116794375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5437, + "time_per_iteration": 2.493673086166382 + }, + { + "auxiliary_loss_clip": 0.01126466, + "auxiliary_loss_mlp": 0.01039117, + "balance_loss_clip": 1.02236819, + "balance_loss_mlp": 1.04143524, + "epoch": 0.32695024800841727, + "flos": 24860849575680.0, + "grad_norm": 1.6336098458574233, + "language_loss": 0.64049256, + "learning_rate": 3.144129015673189e-06, + "loss": 0.66214842, + "num_input_tokens_seen": 116815095, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.8515625, + "step": 5438, + "time_per_iteration": 2.5062596797943115 + }, + { + "auxiliary_loss_clip": 0.01126505, + "auxiliary_loss_mlp": 0.01034195, + "balance_loss_clip": 1.01943088, + "balance_loss_mlp": 1.04510128, + "epoch": 0.32701037126108523, + "flos": 28839249072000.0, + "grad_norm": 1.5452802319075516, + "language_loss": 0.74544024, + "learning_rate": 3.1438095541497576e-06, + "loss": 0.76704717, + "num_input_tokens_seen": 116836630, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8125, + "step": 5439, + "time_per_iteration": 2.501279830932617 + }, + { + "auxiliary_loss_clip": 0.01126727, + "auxiliary_loss_mlp": 0.01045237, + "balance_loss_clip": 1.02985907, + "balance_loss_mlp": 1.04374349, + "epoch": 0.3270704945137532, + "flos": 27965075777280.0, + "grad_norm": 2.6196339079167323, + "language_loss": 0.75183308, + "learning_rate": 3.1434900492529527e-06, + "loss": 0.77355272, + "num_input_tokens_seen": 116856880, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5440, + "time_per_iteration": 2.507341146469116 + }, + { + "auxiliary_loss_clip": 0.01124779, + "auxiliary_loss_mlp": 0.01047409, + "balance_loss_clip": 1.03317571, + "balance_loss_mlp": 1.04308057, + "epoch": 0.32713061776642116, + "flos": 23690700213120.0, + "grad_norm": 1.9066250681455874, + "language_loss": 0.84613734, + "learning_rate": 3.1431705009948914e-06, + "loss": 0.86785924, + "num_input_tokens_seen": 116873770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5441, + "time_per_iteration": 2.4737346172332764 + }, + { + "auxiliary_loss_clip": 0.01126255, + "auxiliary_loss_mlp": 0.01042859, + "balance_loss_clip": 1.02743292, + "balance_loss_mlp": 1.04209113, + "epoch": 0.3271907410190891, + "flos": 22455625017600.0, + "grad_norm": 1.9602585650153952, + "language_loss": 0.8673979, + "learning_rate": 3.1428509093876897e-06, + "loss": 0.88908899, + "num_input_tokens_seen": 116891225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84375, + "step": 5442, + "time_per_iteration": 2.4779980182647705 + }, + { + "auxiliary_loss_clip": 0.0113032, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02193677, + "balance_loss_mlp": 1.04526424, + "epoch": 0.3272508642717571, + "flos": 22820118278400.0, + "grad_norm": 1.8849886885636646, + "language_loss": 0.77500421, + "learning_rate": 3.1425312744434668e-06, + "loss": 0.79669178, + "num_input_tokens_seen": 116912300, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8515625, + "step": 5443, + "time_per_iteration": 2.5263850688934326 + }, + { + "auxiliary_loss_clip": 0.01126577, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02428412, + "balance_loss_mlp": 1.04207098, + "epoch": 0.32731098752442506, + "flos": 11801360413440.0, + "grad_norm": 2.0180593262473487, + "language_loss": 0.81630802, + "learning_rate": 3.142211596174343e-06, + "loss": 0.83796823, + "num_input_tokens_seen": 116929425, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5444, + "time_per_iteration": 2.447061061859131 + }, + { + "auxiliary_loss_clip": 0.0112612, + "auxiliary_loss_mlp": 0.01038044, + "balance_loss_clip": 1.02335095, + "balance_loss_mlp": 1.04356718, + "epoch": 0.327371110777093, + "flos": 21027780506880.0, + "grad_norm": 2.9587875585664523, + "language_loss": 0.59421074, + "learning_rate": 3.1418918745924423e-06, + "loss": 0.61585242, + "num_input_tokens_seen": 116948255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5445, + "time_per_iteration": 2.4542667865753174 + }, + { + "auxiliary_loss_clip": 0.01128674, + "auxiliary_loss_mlp": 0.01039464, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.04482532, + "epoch": 0.327431234029761, + "flos": 19062102677760.0, + "grad_norm": 2.043321690225375, + "language_loss": 0.88286638, + "learning_rate": 3.1415721097098865e-06, + "loss": 0.90454781, + "num_input_tokens_seen": 116964905, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8359375, + "step": 5446, + "time_per_iteration": 2.4518625736236572 + }, + { + "auxiliary_loss_clip": 0.01133247, + "auxiliary_loss_mlp": 0.01042878, + "balance_loss_clip": 1.02577102, + "balance_loss_mlp": 1.04609275, + "epoch": 0.32749135728242895, + "flos": 25849219184640.0, + "grad_norm": 1.9059445881205361, + "language_loss": 0.78455317, + "learning_rate": 3.141252301538802e-06, + "loss": 0.80631441, + "num_input_tokens_seen": 116983650, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.87109375, + "step": 5447, + "time_per_iteration": 2.488555669784546 + }, + { + "auxiliary_loss_clip": 0.01125433, + "auxiliary_loss_mlp": 0.01039956, + "balance_loss_clip": 1.02621138, + "balance_loss_mlp": 1.04297531, + "epoch": 0.327551480535097, + "flos": 20120533764480.0, + "grad_norm": 1.7948266966340543, + "language_loss": 0.73349774, + "learning_rate": 3.1409324500913157e-06, + "loss": 0.75515163, + "num_input_tokens_seen": 117003265, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.82421875, + "step": 5448, + "time_per_iteration": 2.460759162902832 + }, + { + "auxiliary_loss_clip": 0.01125074, + "auxiliary_loss_mlp": 0.0104344, + "balance_loss_clip": 1.02788281, + "balance_loss_mlp": 1.04221821, + "epoch": 0.32761160378776494, + "flos": 28803553931520.0, + "grad_norm": 1.3797343272994427, + "language_loss": 0.66896623, + "learning_rate": 3.1406125553795567e-06, + "loss": 0.69065142, + "num_input_tokens_seen": 117025370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5449, + "time_per_iteration": 2.5101547241210938 + }, + { + "auxiliary_loss_clip": 0.01125182, + "auxiliary_loss_mlp": 0.01035774, + "balance_loss_clip": 1.02111173, + "balance_loss_mlp": 1.04373384, + "epoch": 0.3276717270404329, + "flos": 26937778803840.0, + "grad_norm": 1.3889431777217922, + "language_loss": 0.65617704, + "learning_rate": 3.1402926174156556e-06, + "loss": 0.67778659, + "num_input_tokens_seen": 117044350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5450, + "time_per_iteration": 2.4815587997436523 + }, + { + "auxiliary_loss_clip": 0.01126325, + "auxiliary_loss_mlp": 0.01041593, + "balance_loss_clip": 1.02644145, + "balance_loss_mlp": 1.04330397, + "epoch": 0.32773185029310087, + "flos": 25338425829120.0, + "grad_norm": 1.5376267502191867, + "language_loss": 0.77276003, + "learning_rate": 3.1399726362117437e-06, + "loss": 0.7944392, + "num_input_tokens_seen": 117064450, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.828125, + "step": 5451, + "time_per_iteration": 2.496264696121216 + }, + { + "auxiliary_loss_clip": 0.0112906, + "auxiliary_loss_mlp": 0.01039497, + "balance_loss_clip": 1.02348745, + "balance_loss_mlp": 1.04470944, + "epoch": 0.32779197354576883, + "flos": 26391721271040.0, + "grad_norm": 2.4373215337565015, + "language_loss": 0.7011131, + "learning_rate": 3.1396526117799555e-06, + "loss": 0.72279859, + "num_input_tokens_seen": 117083060, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.84375, + "step": 5452, + "time_per_iteration": 2.504953384399414 + }, + { + "auxiliary_loss_clip": 0.01121729, + "auxiliary_loss_mlp": 0.0103441, + "balance_loss_clip": 1.01944947, + "balance_loss_mlp": 1.04188132, + "epoch": 0.3278520967984368, + "flos": 24899381890560.0, + "grad_norm": 1.7019757848824575, + "language_loss": 0.78734571, + "learning_rate": 3.1393325441324256e-06, + "loss": 0.80890715, + "num_input_tokens_seen": 117101860, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5453, + "time_per_iteration": 2.493701219558716 + }, + { + "auxiliary_loss_clip": 0.01126073, + "auxiliary_loss_mlp": 0.01030616, + "balance_loss_clip": 1.01610184, + "balance_loss_mlp": 1.04306984, + "epoch": 0.32791222005110476, + "flos": 29752996176000.0, + "grad_norm": 2.2894918901687333, + "language_loss": 0.75428879, + "learning_rate": 3.1390124332812916e-06, + "loss": 0.77585566, + "num_input_tokens_seen": 117123100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5454, + "time_per_iteration": 2.5295286178588867 + }, + { + "auxiliary_loss_clip": 0.01121153, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02382326, + "balance_loss_mlp": 1.04198301, + "epoch": 0.32797234330377273, + "flos": 16508064072960.0, + "grad_norm": 2.0725507665811826, + "language_loss": 0.77059573, + "learning_rate": 3.1386922792386924e-06, + "loss": 0.79217887, + "num_input_tokens_seen": 117140515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5455, + "time_per_iteration": 2.426988124847412 + }, + { + "auxiliary_loss_clip": 0.0112837, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02304173, + "balance_loss_mlp": 1.04281068, + "epoch": 0.3280324665564407, + "flos": 26577918397440.0, + "grad_norm": 1.669914346129418, + "language_loss": 0.74029738, + "learning_rate": 3.138372082016768e-06, + "loss": 0.76197511, + "num_input_tokens_seen": 117161485, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.85546875, + "step": 5456, + "time_per_iteration": 2.512131929397583 + }, + { + "auxiliary_loss_clip": 0.01126084, + "auxiliary_loss_mlp": 0.01049831, + "balance_loss_clip": 1.03444123, + "balance_loss_mlp": 1.04250574, + "epoch": 0.32809258980910866, + "flos": 22929969047040.0, + "grad_norm": 1.518027485126158, + "language_loss": 0.78283882, + "learning_rate": 3.1380518416276596e-06, + "loss": 0.80459797, + "num_input_tokens_seen": 117181870, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5457, + "time_per_iteration": 2.4819135665893555 + }, + { + "auxiliary_loss_clip": 0.0112739, + "auxiliary_loss_mlp": 0.01038783, + "balance_loss_clip": 1.02432334, + "balance_loss_mlp": 1.04155684, + "epoch": 0.3281527130617766, + "flos": 22783848520320.0, + "grad_norm": 2.199350012619834, + "language_loss": 0.79332864, + "learning_rate": 3.1377315580835115e-06, + "loss": 0.81499034, + "num_input_tokens_seen": 117201380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5458, + "time_per_iteration": 2.4749457836151123 + }, + { + "auxiliary_loss_clip": 0.01123398, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.01988721, + "balance_loss_mlp": 1.04204702, + "epoch": 0.3282128363144446, + "flos": 21250678354560.0, + "grad_norm": 4.694290331797846, + "language_loss": 0.72896576, + "learning_rate": 3.1374112313964686e-06, + "loss": 0.75055289, + "num_input_tokens_seen": 117221040, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5459, + "time_per_iteration": 2.4506032466888428 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01037647, + "balance_loss_clip": 1.02303815, + "balance_loss_mlp": 1.04444695, + "epoch": 0.32827295956711255, + "flos": 30843064166400.0, + "grad_norm": 1.8402325574836436, + "language_loss": 0.84511495, + "learning_rate": 3.1370908615786783e-06, + "loss": 0.86677814, + "num_input_tokens_seen": 117241395, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.84375, + "step": 5460, + "time_per_iteration": 2.521491527557373 + }, + { + "auxiliary_loss_clip": 0.01125172, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02176023, + "balance_loss_mlp": 1.0420599, + "epoch": 0.3283330828197806, + "flos": 25915006944000.0, + "grad_norm": 1.7736363390075318, + "language_loss": 0.76822042, + "learning_rate": 3.136770448642288e-06, + "loss": 0.78982782, + "num_input_tokens_seen": 117259340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.83203125, + "step": 5461, + "time_per_iteration": 2.4919962882995605 + }, + { + "auxiliary_loss_clip": 0.01129873, + "auxiliary_loss_mlp": 0.010367, + "balance_loss_clip": 1.02015376, + "balance_loss_mlp": 1.04589903, + "epoch": 0.32839320607244854, + "flos": 38582065042560.0, + "grad_norm": 1.6989905310418616, + "language_loss": 0.62835252, + "learning_rate": 3.1364499925994484e-06, + "loss": 0.65001822, + "num_input_tokens_seen": 117282375, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.83984375, + "step": 5462, + "time_per_iteration": 2.6128923892974854 + }, + { + "auxiliary_loss_clip": 0.0112585, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.02048922, + "balance_loss_mlp": 1.04426169, + "epoch": 0.3284533293251165, + "flos": 26650888876800.0, + "grad_norm": 1.8014296603715538, + "language_loss": 0.78155506, + "learning_rate": 3.1361294934623115e-06, + "loss": 0.80315304, + "num_input_tokens_seen": 117303830, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8125, + "step": 5463, + "time_per_iteration": 2.5255165100097656 + }, + { + "auxiliary_loss_clip": 0.0112647, + "auxiliary_loss_mlp": 0.01034449, + "balance_loss_clip": 1.02001238, + "balance_loss_mlp": 1.04409099, + "epoch": 0.32851345257778447, + "flos": 15304158904320.0, + "grad_norm": 2.049558292675733, + "language_loss": 0.7029627, + "learning_rate": 3.1358089512430303e-06, + "loss": 0.72457188, + "num_input_tokens_seen": 117320665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5464, + "time_per_iteration": 2.460951089859009 + }, + { + "auxiliary_loss_clip": 0.01127719, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02505457, + "balance_loss_mlp": 1.04683673, + "epoch": 0.32857357583045244, + "flos": 23513732881920.0, + "grad_norm": 1.6142145677103121, + "language_loss": 0.72746348, + "learning_rate": 3.1354883659537594e-06, + "loss": 0.74913716, + "num_input_tokens_seen": 117339795, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5465, + "time_per_iteration": 2.4767887592315674 + }, + { + "auxiliary_loss_clip": 0.01128882, + "auxiliary_loss_mlp": 0.01036634, + "balance_loss_clip": 1.02208447, + "balance_loss_mlp": 1.04690027, + "epoch": 0.3286336990831204, + "flos": 20995209849600.0, + "grad_norm": 1.6282981827525145, + "language_loss": 0.82756901, + "learning_rate": 3.1351677376066567e-06, + "loss": 0.84922415, + "num_input_tokens_seen": 117359525, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5466, + "time_per_iteration": 2.463127613067627 + }, + { + "auxiliary_loss_clip": 0.01127231, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.02343404, + "balance_loss_mlp": 1.04421949, + "epoch": 0.32869382233578837, + "flos": 23658811914240.0, + "grad_norm": 1.6977355395672606, + "language_loss": 0.79485095, + "learning_rate": 3.134847066213879e-06, + "loss": 0.81649983, + "num_input_tokens_seen": 117380320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5467, + "time_per_iteration": 2.482245683670044 + }, + { + "auxiliary_loss_clip": 0.01128659, + "auxiliary_loss_mlp": 0.01034682, + "balance_loss_clip": 1.02011502, + "balance_loss_mlp": 1.0452255, + "epoch": 0.32875394558845633, + "flos": 25336522408320.0, + "grad_norm": 1.5356074654715184, + "language_loss": 0.74795353, + "learning_rate": 3.134526351787587e-06, + "loss": 0.76958692, + "num_input_tokens_seen": 117400695, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5468, + "time_per_iteration": 2.4828743934631348 + }, + { + "auxiliary_loss_clip": 0.01136832, + "auxiliary_loss_mlp": 0.01041148, + "balance_loss_clip": 1.02467322, + "balance_loss_mlp": 1.04996455, + "epoch": 0.3288140688411243, + "flos": 14903108576640.0, + "grad_norm": 1.8525214053644714, + "language_loss": 0.78469932, + "learning_rate": 3.134205594339942e-06, + "loss": 0.8064791, + "num_input_tokens_seen": 117418800, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.8671875, + "step": 5469, + "time_per_iteration": 2.455672264099121 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034665, + "balance_loss_clip": 1.02008545, + "balance_loss_mlp": 1.04602098, + "epoch": 0.32887419209379226, + "flos": 18551345235840.0, + "grad_norm": 1.646072726718358, + "language_loss": 0.82014406, + "learning_rate": 3.133884793883107e-06, + "loss": 0.84178579, + "num_input_tokens_seen": 117438220, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5470, + "time_per_iteration": 5.531651020050049 + }, + { + "auxiliary_loss_clip": 0.0112936, + "auxiliary_loss_mlp": 0.01038355, + "balance_loss_clip": 1.02315605, + "balance_loss_mlp": 1.04359245, + "epoch": 0.3289343153464602, + "flos": 48105610439040.0, + "grad_norm": 1.806312825179731, + "language_loss": 0.67675972, + "learning_rate": 3.1335639504292478e-06, + "loss": 0.69843686, + "num_input_tokens_seen": 117462560, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.859375, + "step": 5471, + "time_per_iteration": 2.7400858402252197 + }, + { + "auxiliary_loss_clip": 0.01135248, + "auxiliary_loss_mlp": 0.01042507, + "balance_loss_clip": 1.02578163, + "balance_loss_mlp": 1.04856122, + "epoch": 0.3289944385991282, + "flos": 27600295207680.0, + "grad_norm": 1.6357076803377442, + "language_loss": 0.65059721, + "learning_rate": 3.1332430639905288e-06, + "loss": 0.67237478, + "num_input_tokens_seen": 117483665, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8671875, + "step": 5472, + "time_per_iteration": 2.530604124069214 + }, + { + "auxiliary_loss_clip": 0.01132922, + "auxiliary_loss_mlp": 0.01043552, + "balance_loss_clip": 1.0271014, + "balance_loss_mlp": 1.04821706, + "epoch": 0.32905456185179616, + "flos": 20120318282880.0, + "grad_norm": 1.6631612231063349, + "language_loss": 0.88497955, + "learning_rate": 3.13292213457912e-06, + "loss": 0.9067443, + "num_input_tokens_seen": 117503565, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84765625, + "step": 5473, + "time_per_iteration": 2.521026611328125 + }, + { + "auxiliary_loss_clip": 0.01133162, + "auxiliary_loss_mlp": 0.01043181, + "balance_loss_clip": 1.02669442, + "balance_loss_mlp": 1.0483191, + "epoch": 0.3291146851044642, + "flos": 23180230080000.0, + "grad_norm": 2.3087074790673423, + "language_loss": 0.78349268, + "learning_rate": 3.1326011622071903e-06, + "loss": 0.80525613, + "num_input_tokens_seen": 117521460, + "router_z_loss_clip": 0.16503906, + "router_z_loss_mlp": 0.84765625, + "step": 5474, + "time_per_iteration": 2.4769628047943115 + }, + { + "auxiliary_loss_clip": 0.01047146, + "auxiliary_loss_mlp": 0.00999487, + "balance_loss_clip": 0.99740046, + "balance_loss_mlp": 1.02056372, + "epoch": 0.32917480835713214, + "flos": 67621912594560.0, + "grad_norm": 0.888273800575083, + "language_loss": 0.60237771, + "learning_rate": 3.132280146886911e-06, + "loss": 0.62284404, + "num_input_tokens_seen": 117580550, + "router_z_loss_clip": 0.02087402, + "router_z_loss_mlp": 0.265625, + "step": 5475, + "time_per_iteration": 3.039971351623535 + }, + { + "auxiliary_loss_clip": 0.01133227, + "auxiliary_loss_mlp": 0.01051514, + "balance_loss_clip": 1.03437138, + "balance_loss_mlp": 1.04512429, + "epoch": 0.3292349316098001, + "flos": 27964537073280.0, + "grad_norm": 2.5350164106808766, + "language_loss": 0.76634103, + "learning_rate": 3.131959088630455e-06, + "loss": 0.78818846, + "num_input_tokens_seen": 117600645, + "router_z_loss_clip": 0.171875, + "router_z_loss_mlp": 0.8828125, + "step": 5476, + "time_per_iteration": 2.488698959350586 + }, + { + "auxiliary_loss_clip": 0.01131587, + "auxiliary_loss_mlp": 0.01040976, + "balance_loss_clip": 1.02640307, + "balance_loss_mlp": 1.04819024, + "epoch": 0.3292950548624681, + "flos": 20263673462400.0, + "grad_norm": 1.8435246505513339, + "language_loss": 0.74520677, + "learning_rate": 3.131637987449997e-06, + "loss": 0.76693243, + "num_input_tokens_seen": 117618880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8359375, + "step": 5477, + "time_per_iteration": 2.533641815185547 + }, + { + "auxiliary_loss_clip": 0.01124642, + "auxiliary_loss_mlp": 0.01034244, + "balance_loss_clip": 1.02036786, + "balance_loss_mlp": 1.04507232, + "epoch": 0.32935517811513604, + "flos": 20812999132800.0, + "grad_norm": 1.9138938380730264, + "language_loss": 0.75581098, + "learning_rate": 3.131316843357713e-06, + "loss": 0.7773999, + "num_input_tokens_seen": 117636445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5478, + "time_per_iteration": 2.4541866779327393 + }, + { + "auxiliary_loss_clip": 0.01129718, + "auxiliary_loss_mlp": 0.01036647, + "balance_loss_clip": 1.02218664, + "balance_loss_mlp": 1.04736805, + "epoch": 0.329415301367804, + "flos": 18441853603200.0, + "grad_norm": 1.6780134795902322, + "language_loss": 0.80241555, + "learning_rate": 3.1309956563657807e-06, + "loss": 0.82407916, + "num_input_tokens_seen": 117653105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5479, + "time_per_iteration": 2.5348050594329834 + }, + { + "auxiliary_loss_clip": 0.01046129, + "auxiliary_loss_mlp": 0.01003977, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.01921439, + "epoch": 0.32947542462047197, + "flos": 66323024887680.0, + "grad_norm": 0.7411588561506779, + "language_loss": 0.56543052, + "learning_rate": 3.1306744264863804e-06, + "loss": 0.58593154, + "num_input_tokens_seen": 117719225, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.26953125, + "step": 5480, + "time_per_iteration": 3.121812343597412 + }, + { + "auxiliary_loss_clip": 0.01128951, + "auxiliary_loss_mlp": 0.01044263, + "balance_loss_clip": 1.02871847, + "balance_loss_mlp": 1.04606879, + "epoch": 0.32953554787313993, + "flos": 23221599569280.0, + "grad_norm": 1.656023636160042, + "language_loss": 0.77029848, + "learning_rate": 3.1303531537316915e-06, + "loss": 0.79203057, + "num_input_tokens_seen": 117738725, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.828125, + "step": 5481, + "time_per_iteration": 2.4819936752319336 + }, + { + "auxiliary_loss_clip": 0.01129556, + "auxiliary_loss_mlp": 0.01034734, + "balance_loss_clip": 1.02028024, + "balance_loss_mlp": 1.04622722, + "epoch": 0.3295956711258079, + "flos": 27009492307200.0, + "grad_norm": 1.8057287203311059, + "language_loss": 0.78732938, + "learning_rate": 3.130031838113899e-06, + "loss": 0.80897224, + "num_input_tokens_seen": 117757765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5482, + "time_per_iteration": 2.501615285873413 + }, + { + "auxiliary_loss_clip": 0.01129525, + "auxiliary_loss_mlp": 0.01041437, + "balance_loss_clip": 1.02601135, + "balance_loss_mlp": 1.04573894, + "epoch": 0.32965579437847586, + "flos": 19171702051200.0, + "grad_norm": 1.6414395423474737, + "language_loss": 0.74055123, + "learning_rate": 3.129710479645185e-06, + "loss": 0.76226085, + "num_input_tokens_seen": 117776810, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 5483, + "time_per_iteration": 2.5213518142700195 + }, + { + "auxiliary_loss_clip": 0.01128456, + "auxiliary_loss_mlp": 0.01032447, + "balance_loss_clip": 1.0187676, + "balance_loss_mlp": 1.04614615, + "epoch": 0.32971591763114383, + "flos": 30482521401600.0, + "grad_norm": 1.8373674608308554, + "language_loss": 0.75627816, + "learning_rate": 3.1293890783377366e-06, + "loss": 0.77788723, + "num_input_tokens_seen": 117797730, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.8203125, + "step": 5484, + "time_per_iteration": 2.543795108795166 + }, + { + "auxiliary_loss_clip": 0.01129378, + "auxiliary_loss_mlp": 0.01038991, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04699099, + "epoch": 0.3297760408838118, + "flos": 16289583598080.0, + "grad_norm": 2.1329507570753243, + "language_loss": 0.7209897, + "learning_rate": 3.129067634203742e-06, + "loss": 0.74267334, + "num_input_tokens_seen": 117815365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 5485, + "time_per_iteration": 2.4598846435546875 + }, + { + "auxiliary_loss_clip": 0.01124565, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02626562, + "balance_loss_mlp": 1.04448354, + "epoch": 0.32983616413647976, + "flos": 29530924341120.0, + "grad_norm": 1.7963509228415293, + "language_loss": 0.80416954, + "learning_rate": 3.128746147255388e-06, + "loss": 0.8258158, + "num_input_tokens_seen": 117836095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5486, + "time_per_iteration": 2.5368754863739014 + }, + { + "auxiliary_loss_clip": 0.011236, + "auxiliary_loss_mlp": 0.01037413, + "balance_loss_clip": 1.02264309, + "balance_loss_mlp": 1.04300976, + "epoch": 0.3298962873891478, + "flos": 20631398947200.0, + "grad_norm": 2.3473245188806056, + "language_loss": 0.84351611, + "learning_rate": 3.1284246175048683e-06, + "loss": 0.86512625, + "num_input_tokens_seen": 117854655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5487, + "time_per_iteration": 2.5140841007232666 + }, + { + "auxiliary_loss_clip": 0.01131842, + "auxiliary_loss_mlp": 0.01040276, + "balance_loss_clip": 1.02440929, + "balance_loss_mlp": 1.04636502, + "epoch": 0.32995641064181574, + "flos": 14976007228800.0, + "grad_norm": 2.289610395509379, + "language_loss": 0.74163198, + "learning_rate": 3.1281030449643735e-06, + "loss": 0.76335323, + "num_input_tokens_seen": 117873300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.85546875, + "step": 5488, + "time_per_iteration": 2.4159257411956787 + }, + { + "auxiliary_loss_clip": 0.01127802, + "auxiliary_loss_mlp": 0.01040136, + "balance_loss_clip": 1.02519917, + "balance_loss_mlp": 1.04548192, + "epoch": 0.3300165338944837, + "flos": 18661447399680.0, + "grad_norm": 2.3379517114480004, + "language_loss": 0.72564352, + "learning_rate": 3.127781429646098e-06, + "loss": 0.74732298, + "num_input_tokens_seen": 117891540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5489, + "time_per_iteration": 2.4810056686401367 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01033113, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.04076719, + "epoch": 0.3300766571471517, + "flos": 25583730785280.0, + "grad_norm": 2.5348585918072235, + "language_loss": 0.88752508, + "learning_rate": 3.127459771562238e-06, + "loss": 0.90908241, + "num_input_tokens_seen": 117907690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5490, + "time_per_iteration": 2.448437452316284 + }, + { + "auxiliary_loss_clip": 0.01121475, + "auxiliary_loss_mlp": 0.01034389, + "balance_loss_clip": 1.02022719, + "balance_loss_mlp": 1.0403626, + "epoch": 0.33013678039981964, + "flos": 11363501623680.0, + "grad_norm": 1.9493471797358817, + "language_loss": 0.83395195, + "learning_rate": 3.1271380707249907e-06, + "loss": 0.85551059, + "num_input_tokens_seen": 117925640, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5491, + "time_per_iteration": 2.44634747505188 + }, + { + "auxiliary_loss_clip": 0.01126063, + "auxiliary_loss_mlp": 0.01039892, + "balance_loss_clip": 1.02492499, + "balance_loss_mlp": 1.04421842, + "epoch": 0.3301969036524876, + "flos": 24821203939200.0, + "grad_norm": 2.715750342336911, + "language_loss": 0.77514994, + "learning_rate": 3.126816327146554e-06, + "loss": 0.79680943, + "num_input_tokens_seen": 117944525, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5492, + "time_per_iteration": 2.4870479106903076 + }, + { + "auxiliary_loss_clip": 0.01131001, + "auxiliary_loss_mlp": 0.0104338, + "balance_loss_clip": 1.0269649, + "balance_loss_mlp": 1.04629827, + "epoch": 0.33025702690515557, + "flos": 15961144613760.0, + "grad_norm": 2.2776411561569265, + "language_loss": 0.7450884, + "learning_rate": 3.12649454083913e-06, + "loss": 0.76683223, + "num_input_tokens_seen": 117962515, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5493, + "time_per_iteration": 2.4506607055664062 + }, + { + "auxiliary_loss_clip": 0.01045286, + "auxiliary_loss_mlp": 0.01012729, + "balance_loss_clip": 1.01074982, + "balance_loss_mlp": 1.01881337, + "epoch": 0.33031715015782354, + "flos": 59416755989760.0, + "grad_norm": 0.7955029917088393, + "language_loss": 0.53910893, + "learning_rate": 3.12617271181492e-06, + "loss": 0.55968904, + "num_input_tokens_seen": 118018780, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.265625, + "step": 5494, + "time_per_iteration": 3.0042550563812256 + }, + { + "auxiliary_loss_clip": 0.01124159, + "auxiliary_loss_mlp": 0.01037133, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.04378355, + "epoch": 0.3303772734104915, + "flos": 23184360144000.0, + "grad_norm": 1.6073630563578136, + "language_loss": 0.87087989, + "learning_rate": 3.1258508400861276e-06, + "loss": 0.89249277, + "num_input_tokens_seen": 118038610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8046875, + "step": 5495, + "time_per_iteration": 2.4716837406158447 + }, + { + "auxiliary_loss_clip": 0.01128875, + "auxiliary_loss_mlp": 0.01047751, + "balance_loss_clip": 1.03133559, + "balance_loss_mlp": 1.04508138, + "epoch": 0.33043739666315947, + "flos": 33071896010880.0, + "grad_norm": 3.5655917637781784, + "language_loss": 0.73526418, + "learning_rate": 3.1255289256649587e-06, + "loss": 0.75703049, + "num_input_tokens_seen": 118055905, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8359375, + "step": 5496, + "time_per_iteration": 2.531670570373535 + }, + { + "auxiliary_loss_clip": 0.01124295, + "auxiliary_loss_mlp": 0.01028859, + "balance_loss_clip": 1.01509058, + "balance_loss_mlp": 1.04384971, + "epoch": 0.33049751991582743, + "flos": 24895431394560.0, + "grad_norm": 2.1703031984353514, + "language_loss": 0.72764325, + "learning_rate": 3.1252069685636196e-06, + "loss": 0.74917477, + "num_input_tokens_seen": 118073695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5497, + "time_per_iteration": 2.5148839950561523 + }, + { + "auxiliary_loss_clip": 0.01123603, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.04340625, + "epoch": 0.3305576431684954, + "flos": 29460575554560.0, + "grad_norm": 2.5654673530164307, + "language_loss": 0.80193126, + "learning_rate": 3.124884968794321e-06, + "loss": 0.82350206, + "num_input_tokens_seen": 118094030, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5498, + "time_per_iteration": 2.517765522003174 + }, + { + "auxiliary_loss_clip": 0.01123393, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02397776, + "balance_loss_mlp": 1.03977811, + "epoch": 0.33061776642116336, + "flos": 22632305040000.0, + "grad_norm": 2.1435474357237405, + "language_loss": 0.76491725, + "learning_rate": 3.12456292636927e-06, + "loss": 0.78653955, + "num_input_tokens_seen": 118111665, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8359375, + "step": 5499, + "time_per_iteration": 2.5006067752838135 + }, + { + "auxiliary_loss_clip": 0.01122541, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0175705, + "balance_loss_mlp": 1.04131985, + "epoch": 0.3306778896738313, + "flos": 25776320532480.0, + "grad_norm": 1.506886865759599, + "language_loss": 0.79332948, + "learning_rate": 3.124240841300681e-06, + "loss": 0.81487471, + "num_input_tokens_seen": 118132435, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5500, + "time_per_iteration": 2.4859495162963867 + }, + { + "auxiliary_loss_clip": 0.01129022, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01607347, + "balance_loss_mlp": 1.04564214, + "epoch": 0.33073801292649935, + "flos": 36940552479360.0, + "grad_norm": 2.164639953437845, + "language_loss": 0.66065335, + "learning_rate": 3.1239187136007665e-06, + "loss": 0.68225485, + "num_input_tokens_seen": 118155255, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.83203125, + "step": 5501, + "time_per_iteration": 2.6189892292022705 + }, + { + "auxiliary_loss_clip": 0.01126823, + "auxiliary_loss_mlp": 0.01041122, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.04285216, + "epoch": 0.3307981361791673, + "flos": 12967738848000.0, + "grad_norm": 2.260615362067107, + "language_loss": 0.77580702, + "learning_rate": 3.1235965432817417e-06, + "loss": 0.79748642, + "num_input_tokens_seen": 118169865, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.83984375, + "step": 5502, + "time_per_iteration": 2.4086782932281494 + }, + { + "auxiliary_loss_clip": 0.01130061, + "auxiliary_loss_mlp": 0.01039, + "balance_loss_clip": 1.02389622, + "balance_loss_mlp": 1.04632545, + "epoch": 0.3308582594318353, + "flos": 25374372364800.0, + "grad_norm": 2.045089737815956, + "language_loss": 0.72346115, + "learning_rate": 3.123274330355824e-06, + "loss": 0.74515176, + "num_input_tokens_seen": 118190760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8359375, + "step": 5503, + "time_per_iteration": 2.5176749229431152 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01865053, + "balance_loss_mlp": 1.04248357, + "epoch": 0.33091838268450324, + "flos": 26468570419200.0, + "grad_norm": 1.5402224202893484, + "language_loss": 0.75216055, + "learning_rate": 3.12295207483523e-06, + "loss": 0.77374506, + "num_input_tokens_seen": 118213620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 5504, + "time_per_iteration": 2.530212879180908 + }, + { + "auxiliary_loss_clip": 0.01127019, + "auxiliary_loss_mlp": 0.01038843, + "balance_loss_clip": 1.02438283, + "balance_loss_mlp": 1.04382253, + "epoch": 0.3309785059371712, + "flos": 24971167221120.0, + "grad_norm": 1.6148817370045387, + "language_loss": 0.70049053, + "learning_rate": 3.1226297767321816e-06, + "loss": 0.72214913, + "num_input_tokens_seen": 118235010, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83203125, + "step": 5505, + "time_per_iteration": 2.5212292671203613 + }, + { + "auxiliary_loss_clip": 0.01126444, + "auxiliary_loss_mlp": 0.01041221, + "balance_loss_clip": 1.02720845, + "balance_loss_mlp": 1.04601455, + "epoch": 0.3310386291898392, + "flos": 20446710192000.0, + "grad_norm": 1.586520967819923, + "language_loss": 0.81541443, + "learning_rate": 3.122307436058899e-06, + "loss": 0.83709103, + "num_input_tokens_seen": 118255820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5506, + "time_per_iteration": 2.5494561195373535 + }, + { + "auxiliary_loss_clip": 0.01128621, + "auxiliary_loss_mlp": 0.01037729, + "balance_loss_clip": 1.02277398, + "balance_loss_mlp": 1.04704857, + "epoch": 0.33109875244250714, + "flos": 23182672204800.0, + "grad_norm": 1.929478423939084, + "language_loss": 0.79097712, + "learning_rate": 3.121985052827606e-06, + "loss": 0.81264055, + "num_input_tokens_seen": 118274160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5507, + "time_per_iteration": 2.498659610748291 + }, + { + "auxiliary_loss_clip": 0.01123401, + "auxiliary_loss_mlp": 0.01040623, + "balance_loss_clip": 1.02594829, + "balance_loss_mlp": 1.04136062, + "epoch": 0.3311588756951751, + "flos": 24168384207360.0, + "grad_norm": 1.6667627205960738, + "language_loss": 0.71733725, + "learning_rate": 3.1216626270505274e-06, + "loss": 0.73897743, + "num_input_tokens_seen": 118294385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5508, + "time_per_iteration": 2.478593111038208 + }, + { + "auxiliary_loss_clip": 0.01124563, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01566064, + "balance_loss_mlp": 1.04539418, + "epoch": 0.33121899894784307, + "flos": 28145742209280.0, + "grad_norm": 2.030813517097255, + "language_loss": 0.72023594, + "learning_rate": 3.12134015873989e-06, + "loss": 0.74177837, + "num_input_tokens_seen": 118313105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5509, + "time_per_iteration": 2.539806842803955 + }, + { + "auxiliary_loss_clip": 0.01126062, + "auxiliary_loss_mlp": 0.01034216, + "balance_loss_clip": 1.01975, + "balance_loss_mlp": 1.04503942, + "epoch": 0.33127912220051103, + "flos": 29567660976000.0, + "grad_norm": 1.5191607241878, + "language_loss": 0.73049426, + "learning_rate": 3.121017647907921e-06, + "loss": 0.75209701, + "num_input_tokens_seen": 118335250, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5510, + "time_per_iteration": 2.536083698272705 + }, + { + "auxiliary_loss_clip": 0.01123553, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.0429213, + "epoch": 0.331339245453179, + "flos": 14428836374400.0, + "grad_norm": 2.1286159820346984, + "language_loss": 0.87371129, + "learning_rate": 3.1206950945668508e-06, + "loss": 0.89530391, + "num_input_tokens_seen": 118351470, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 5511, + "time_per_iteration": 2.4380695819854736 + }, + { + "auxiliary_loss_clip": 0.01119745, + "auxiliary_loss_mlp": 0.01032859, + "balance_loss_clip": 1.01986468, + "balance_loss_mlp": 1.04396749, + "epoch": 0.33139936870584696, + "flos": 20887118847360.0, + "grad_norm": 1.6025966363766477, + "language_loss": 0.72926772, + "learning_rate": 3.12037249872891e-06, + "loss": 0.7507937, + "num_input_tokens_seen": 118370970, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5512, + "time_per_iteration": 5.464786767959595 + }, + { + "auxiliary_loss_clip": 0.01124343, + "auxiliary_loss_mlp": 0.01041694, + "balance_loss_clip": 1.02759719, + "balance_loss_mlp": 1.04466701, + "epoch": 0.33145949195851493, + "flos": 36284356869120.0, + "grad_norm": 1.8365879467062751, + "language_loss": 0.72230887, + "learning_rate": 3.1200498604063317e-06, + "loss": 0.7439692, + "num_input_tokens_seen": 118393125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5513, + "time_per_iteration": 2.6175873279571533 + }, + { + "auxiliary_loss_clip": 0.01128264, + "auxiliary_loss_mlp": 0.01034719, + "balance_loss_clip": 1.01972222, + "balance_loss_mlp": 1.04398656, + "epoch": 0.33151961521118295, + "flos": 14279735018880.0, + "grad_norm": 1.8557947519919487, + "language_loss": 0.68629253, + "learning_rate": 3.1197271796113507e-06, + "loss": 0.70792234, + "num_input_tokens_seen": 118410860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5514, + "time_per_iteration": 2.4340810775756836 + }, + { + "auxiliary_loss_clip": 0.01127749, + "auxiliary_loss_mlp": 0.01041934, + "balance_loss_clip": 1.0251019, + "balance_loss_mlp": 1.04505849, + "epoch": 0.3315797384638509, + "flos": 20774323163520.0, + "grad_norm": 2.411486097564539, + "language_loss": 0.66439879, + "learning_rate": 3.1194044563562026e-06, + "loss": 0.6860956, + "num_input_tokens_seen": 118429570, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.828125, + "step": 5515, + "time_per_iteration": 2.4983339309692383 + }, + { + "auxiliary_loss_clip": 0.01124572, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01960468, + "balance_loss_mlp": 1.04258537, + "epoch": 0.3316398617165189, + "flos": 24679464871680.0, + "grad_norm": 1.4970111675637168, + "language_loss": 0.69111156, + "learning_rate": 3.1190816906531257e-06, + "loss": 0.71270084, + "num_input_tokens_seen": 118450285, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5516, + "time_per_iteration": 2.515367031097412 + }, + { + "auxiliary_loss_clip": 0.0112502, + "auxiliary_loss_mlp": 0.01036046, + "balance_loss_clip": 1.02154398, + "balance_loss_mlp": 1.04021645, + "epoch": 0.33169998496918685, + "flos": 18587974129920.0, + "grad_norm": 2.365933570102145, + "language_loss": 0.80287617, + "learning_rate": 3.118758882514359e-06, + "loss": 0.82448685, + "num_input_tokens_seen": 118468270, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.84765625, + "step": 5517, + "time_per_iteration": 2.5149497985839844 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01036775, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.04258931, + "epoch": 0.3317601082218548, + "flos": 20193647898240.0, + "grad_norm": 2.188422581245926, + "language_loss": 0.74551105, + "learning_rate": 3.118436031952143e-06, + "loss": 0.76709294, + "num_input_tokens_seen": 118486615, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5518, + "time_per_iteration": 2.450188159942627 + }, + { + "auxiliary_loss_clip": 0.01048984, + "auxiliary_loss_mlp": 0.01008888, + "balance_loss_clip": 1.00682592, + "balance_loss_mlp": 1.02244139, + "epoch": 0.3318202314745228, + "flos": 68974703637120.0, + "grad_norm": 0.6172932492598038, + "language_loss": 0.54346693, + "learning_rate": 3.1181131389787206e-06, + "loss": 0.56404567, + "num_input_tokens_seen": 118553580, + "router_z_loss_clip": 0.02062988, + "router_z_loss_mlp": 0.265625, + "step": 5519, + "time_per_iteration": 3.167750358581543 + }, + { + "auxiliary_loss_clip": 0.0112453, + "auxiliary_loss_mlp": 0.01039354, + "balance_loss_clip": 1.0239042, + "balance_loss_mlp": 1.0434345, + "epoch": 0.33188035472719074, + "flos": 21500113374720.0, + "grad_norm": 3.8105825888408855, + "language_loss": 0.78854358, + "learning_rate": 3.117790203606336e-06, + "loss": 0.81018245, + "num_input_tokens_seen": 118570280, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5520, + "time_per_iteration": 2.451781988143921 + }, + { + "auxiliary_loss_clip": 0.01121269, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.01835227, + "balance_loss_mlp": 1.04244733, + "epoch": 0.3319404779798587, + "flos": 28870490926080.0, + "grad_norm": 2.656623957411012, + "language_loss": 0.76576293, + "learning_rate": 3.1174672258472344e-06, + "loss": 0.78729689, + "num_input_tokens_seen": 118590455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7890625, + "step": 5521, + "time_per_iteration": 2.525865077972412 + }, + { + "auxiliary_loss_clip": 0.01126792, + "auxiliary_loss_mlp": 0.01044731, + "balance_loss_clip": 1.02932894, + "balance_loss_mlp": 1.04259682, + "epoch": 0.33200060123252667, + "flos": 23076915586560.0, + "grad_norm": 3.3004720611075964, + "language_loss": 0.70353854, + "learning_rate": 3.117144205713664e-06, + "loss": 0.72525376, + "num_input_tokens_seen": 118609495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5522, + "time_per_iteration": 2.472001791000366 + }, + { + "auxiliary_loss_clip": 0.01123475, + "auxiliary_loss_mlp": 0.01030854, + "balance_loss_clip": 1.01739514, + "balance_loss_mlp": 1.04362595, + "epoch": 0.33206072448519464, + "flos": 21142479611520.0, + "grad_norm": 1.7154852702320889, + "language_loss": 0.74052203, + "learning_rate": 3.1168211432178735e-06, + "loss": 0.76206541, + "num_input_tokens_seen": 118628720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80078125, + "step": 5523, + "time_per_iteration": 2.4924776554107666 + }, + { + "auxiliary_loss_clip": 0.01122263, + "auxiliary_loss_mlp": 0.0103648, + "balance_loss_clip": 1.0211792, + "balance_loss_mlp": 1.04308188, + "epoch": 0.3321208477378626, + "flos": 13079097987840.0, + "grad_norm": 1.6905303226226114, + "language_loss": 0.82272083, + "learning_rate": 3.116498038372114e-06, + "loss": 0.84430826, + "num_input_tokens_seen": 118645955, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 5524, + "time_per_iteration": 2.439711332321167 + }, + { + "auxiliary_loss_clip": 0.01123508, + "auxiliary_loss_mlp": 0.01038508, + "balance_loss_clip": 1.0251627, + "balance_loss_mlp": 1.04402184, + "epoch": 0.33218097099053057, + "flos": 21215414177280.0, + "grad_norm": 1.6540586406432352, + "language_loss": 0.8307848, + "learning_rate": 3.116174891188636e-06, + "loss": 0.85240501, + "num_input_tokens_seen": 118665605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.79296875, + "step": 5525, + "time_per_iteration": 2.4927310943603516 + }, + { + "auxiliary_loss_clip": 0.01044531, + "auxiliary_loss_mlp": 0.01006175, + "balance_loss_clip": 1.00405347, + "balance_loss_mlp": 1.01804781, + "epoch": 0.33224109424319853, + "flos": 64348979189760.0, + "grad_norm": 0.7716933739699889, + "language_loss": 0.5260945, + "learning_rate": 3.1158517016796945e-06, + "loss": 0.54660153, + "num_input_tokens_seen": 118728155, + "router_z_loss_clip": 0.02124023, + "router_z_loss_mlp": 0.265625, + "step": 5526, + "time_per_iteration": 3.0598835945129395 + }, + { + "auxiliary_loss_clip": 0.01126467, + "auxiliary_loss_mlp": 0.01042827, + "balance_loss_clip": 1.02724671, + "balance_loss_mlp": 1.04371929, + "epoch": 0.33230121749586655, + "flos": 17346003523200.0, + "grad_norm": 2.1037159361855737, + "language_loss": 0.77490491, + "learning_rate": 3.1155284698575445e-06, + "loss": 0.79659784, + "num_input_tokens_seen": 118743955, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 5527, + "time_per_iteration": 2.4878480434417725 + }, + { + "auxiliary_loss_clip": 0.01126946, + "auxiliary_loss_mlp": 0.01044009, + "balance_loss_clip": 1.03025246, + "balance_loss_mlp": 1.04651201, + "epoch": 0.3323613407485345, + "flos": 20997041443200.0, + "grad_norm": 2.9813221594214494, + "language_loss": 0.72143763, + "learning_rate": 3.1152051957344434e-06, + "loss": 0.74314719, + "num_input_tokens_seen": 118763275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5528, + "time_per_iteration": 2.4562795162200928 + }, + { + "auxiliary_loss_clip": 0.0112635, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02256346, + "balance_loss_mlp": 1.04463542, + "epoch": 0.3324214640012025, + "flos": 13152535344000.0, + "grad_norm": 1.7054310511699202, + "language_loss": 0.82638806, + "learning_rate": 3.1148818793226497e-06, + "loss": 0.84801543, + "num_input_tokens_seen": 118781110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5529, + "time_per_iteration": 2.474243640899658 + }, + { + "auxiliary_loss_clip": 0.01129499, + "auxiliary_loss_mlp": 0.01036464, + "balance_loss_clip": 1.02223659, + "balance_loss_mlp": 1.04554248, + "epoch": 0.33248158725387045, + "flos": 22273522041600.0, + "grad_norm": 1.9738718949190572, + "language_loss": 0.69718957, + "learning_rate": 3.114558520634423e-06, + "loss": 0.71884924, + "num_input_tokens_seen": 118800620, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83984375, + "step": 5530, + "time_per_iteration": 2.471686840057373 + }, + { + "auxiliary_loss_clip": 0.01127236, + "auxiliary_loss_mlp": 0.01045423, + "balance_loss_clip": 1.02996182, + "balance_loss_mlp": 1.04500127, + "epoch": 0.3325417105065384, + "flos": 20740998320640.0, + "grad_norm": 2.4616968900166643, + "language_loss": 0.7616601, + "learning_rate": 3.1142351196820256e-06, + "loss": 0.78338665, + "num_input_tokens_seen": 118818725, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5531, + "time_per_iteration": 2.473328113555908 + }, + { + "auxiliary_loss_clip": 0.01128043, + "auxiliary_loss_mlp": 0.01037476, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.04481292, + "epoch": 0.3326018337592064, + "flos": 24790536702720.0, + "grad_norm": 1.7553607817915955, + "language_loss": 0.73413068, + "learning_rate": 3.1139116764777206e-06, + "loss": 0.75578588, + "num_input_tokens_seen": 118839390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83203125, + "step": 5532, + "time_per_iteration": 2.4864931106567383 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01026777, + "balance_loss_clip": 1.01321709, + "balance_loss_mlp": 1.04721618, + "epoch": 0.33266195701187434, + "flos": 14501699112960.0, + "grad_norm": 2.2280638741168057, + "language_loss": 0.65813714, + "learning_rate": 3.1135881910337735e-06, + "loss": 0.67969465, + "num_input_tokens_seen": 118856275, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8203125, + "step": 5533, + "time_per_iteration": 2.5232229232788086 + }, + { + "auxiliary_loss_clip": 0.01126882, + "auxiliary_loss_mlp": 0.01040338, + "balance_loss_clip": 1.02541876, + "balance_loss_mlp": 1.04451632, + "epoch": 0.3327220802645423, + "flos": 15304410299520.0, + "grad_norm": 1.9248590192503388, + "language_loss": 0.70790148, + "learning_rate": 3.113264663362451e-06, + "loss": 0.72957367, + "num_input_tokens_seen": 118873830, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5534, + "time_per_iteration": 2.418875217437744 + }, + { + "auxiliary_loss_clip": 0.01125629, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.01890588, + "balance_loss_mlp": 1.04565191, + "epoch": 0.3327822035172103, + "flos": 23477534951040.0, + "grad_norm": 1.8142926842561948, + "language_loss": 0.6684956, + "learning_rate": 3.1129410934760204e-06, + "loss": 0.69008601, + "num_input_tokens_seen": 118891560, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5535, + "time_per_iteration": 2.5031726360321045 + }, + { + "auxiliary_loss_clip": 0.01126804, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02450383, + "balance_loss_mlp": 1.04416704, + "epoch": 0.33284232676987824, + "flos": 25374516019200.0, + "grad_norm": 2.1308907042960525, + "language_loss": 0.72915065, + "learning_rate": 3.1126174813867517e-06, + "loss": 0.75080466, + "num_input_tokens_seen": 118910260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.828125, + "step": 5536, + "time_per_iteration": 2.494007110595703 + }, + { + "auxiliary_loss_clip": 0.01126771, + "auxiliary_loss_mlp": 0.01038616, + "balance_loss_clip": 1.02474046, + "balance_loss_mlp": 1.0450089, + "epoch": 0.3329024500225462, + "flos": 23694363400320.0, + "grad_norm": 1.6653416647198893, + "language_loss": 0.81801486, + "learning_rate": 3.112293827106917e-06, + "loss": 0.83966869, + "num_input_tokens_seen": 118929985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5537, + "time_per_iteration": 2.611788272857666 + }, + { + "auxiliary_loss_clip": 0.01131655, + "auxiliary_loss_mlp": 0.01042421, + "balance_loss_clip": 1.02805638, + "balance_loss_mlp": 1.04771638, + "epoch": 0.33296257327521417, + "flos": 31723163205120.0, + "grad_norm": 1.938500745409862, + "language_loss": 0.71606827, + "learning_rate": 3.111970130648789e-06, + "loss": 0.73780894, + "num_input_tokens_seen": 118951355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.83984375, + "step": 5538, + "time_per_iteration": 2.538574695587158 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01030337, + "balance_loss_clip": 1.01642489, + "balance_loss_mlp": 1.04461074, + "epoch": 0.33302269652788213, + "flos": 22744705674240.0, + "grad_norm": 2.0173985756025417, + "language_loss": 0.7442342, + "learning_rate": 3.1116463920246424e-06, + "loss": 0.76578778, + "num_input_tokens_seen": 118970910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.8046875, + "step": 5539, + "time_per_iteration": 2.539393424987793 + }, + { + "auxiliary_loss_clip": 0.01132315, + "auxiliary_loss_mlp": 0.01045465, + "balance_loss_clip": 1.03062367, + "balance_loss_mlp": 1.04543138, + "epoch": 0.33308281978055015, + "flos": 11473747441920.0, + "grad_norm": 1.8798801752229715, + "language_loss": 0.70726681, + "learning_rate": 3.1113226112467527e-06, + "loss": 0.72904468, + "num_input_tokens_seen": 118989200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8671875, + "step": 5540, + "time_per_iteration": 2.460745096206665 + }, + { + "auxiliary_loss_clip": 0.01123042, + "auxiliary_loss_mlp": 0.01035986, + "balance_loss_clip": 1.02156138, + "balance_loss_mlp": 1.04151917, + "epoch": 0.3331429430332181, + "flos": 38213693112960.0, + "grad_norm": 2.212860979219503, + "language_loss": 0.60678709, + "learning_rate": 3.1109987883273983e-06, + "loss": 0.62837738, + "num_input_tokens_seen": 119011030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5541, + "time_per_iteration": 2.643308162689209 + }, + { + "auxiliary_loss_clip": 0.01129096, + "auxiliary_loss_mlp": 0.01040856, + "balance_loss_clip": 1.0256207, + "balance_loss_mlp": 1.04428339, + "epoch": 0.3332030662858861, + "flos": 22528667324160.0, + "grad_norm": 1.7250198470895146, + "language_loss": 0.68636936, + "learning_rate": 3.1106749232788584e-06, + "loss": 0.70806885, + "num_input_tokens_seen": 119030620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8515625, + "step": 5542, + "time_per_iteration": 2.472029209136963 + }, + { + "auxiliary_loss_clip": 0.0112742, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02362895, + "balance_loss_mlp": 1.04488277, + "epoch": 0.33326318953855405, + "flos": 15997773507840.0, + "grad_norm": 1.6472310915335262, + "language_loss": 0.75526464, + "learning_rate": 3.110351016113414e-06, + "loss": 0.77691472, + "num_input_tokens_seen": 119048015, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.82421875, + "step": 5543, + "time_per_iteration": 2.453550100326538 + }, + { + "auxiliary_loss_clip": 0.01132047, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02342415, + "balance_loss_mlp": 1.04834402, + "epoch": 0.333323312791222, + "flos": 25593535198080.0, + "grad_norm": 1.6694578175563026, + "language_loss": 0.75282717, + "learning_rate": 3.110027066843348e-06, + "loss": 0.77452493, + "num_input_tokens_seen": 119066280, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8359375, + "step": 5544, + "time_per_iteration": 2.486992835998535 + }, + { + "auxiliary_loss_clip": 0.01124934, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01910329, + "balance_loss_mlp": 1.04350412, + "epoch": 0.33338343604389, + "flos": 25119550304640.0, + "grad_norm": 1.4864809930890506, + "language_loss": 0.70886022, + "learning_rate": 3.1097030754809456e-06, + "loss": 0.73044181, + "num_input_tokens_seen": 119087680, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5545, + "time_per_iteration": 2.5813279151916504 + }, + { + "auxiliary_loss_clip": 0.01125022, + "auxiliary_loss_mlp": 0.01037243, + "balance_loss_clip": 1.02333164, + "balance_loss_mlp": 1.04530168, + "epoch": 0.33344355929655795, + "flos": 16947287579520.0, + "grad_norm": 1.7150542013191912, + "language_loss": 0.69300294, + "learning_rate": 3.1093790420384894e-06, + "loss": 0.7146256, + "num_input_tokens_seen": 119105820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5546, + "time_per_iteration": 2.4564788341522217 + }, + { + "auxiliary_loss_clip": 0.01129119, + "auxiliary_loss_mlp": 0.01037833, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04343665, + "epoch": 0.3335036825492259, + "flos": 27889591345920.0, + "grad_norm": 1.6632006519185205, + "language_loss": 0.64804697, + "learning_rate": 3.1090549665282702e-06, + "loss": 0.66971648, + "num_input_tokens_seen": 119126630, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.859375, + "step": 5547, + "time_per_iteration": 2.554959774017334 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01030841, + "balance_loss_clip": 1.01782918, + "balance_loss_mlp": 1.0467664, + "epoch": 0.3335638058018939, + "flos": 16179553261440.0, + "grad_norm": 2.454082693277369, + "language_loss": 0.856148, + "learning_rate": 3.1087308489625742e-06, + "loss": 0.87773478, + "num_input_tokens_seen": 119143375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.8125, + "step": 5548, + "time_per_iteration": 2.451032876968384 + }, + { + "auxiliary_loss_clip": 0.01129139, + "auxiliary_loss_mlp": 0.01036581, + "balance_loss_clip": 1.02100003, + "balance_loss_mlp": 1.04508662, + "epoch": 0.33362392905456184, + "flos": 39896108288640.0, + "grad_norm": 2.024965729715467, + "language_loss": 0.74754196, + "learning_rate": 3.1084066893536945e-06, + "loss": 0.76919919, + "num_input_tokens_seen": 119166450, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.83984375, + "step": 5549, + "time_per_iteration": 2.6875991821289062 + }, + { + "auxiliary_loss_clip": 0.01128755, + "auxiliary_loss_mlp": 0.01038769, + "balance_loss_clip": 1.02362955, + "balance_loss_mlp": 1.04486775, + "epoch": 0.3336840523072298, + "flos": 44271212567040.0, + "grad_norm": 1.8150391856089545, + "language_loss": 0.68361247, + "learning_rate": 3.108082487713921e-06, + "loss": 0.70528769, + "num_input_tokens_seen": 119189645, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.83984375, + "step": 5550, + "time_per_iteration": 2.640758752822876 + }, + { + "auxiliary_loss_clip": 0.0112866, + "auxiliary_loss_mlp": 0.01039899, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.04545677, + "epoch": 0.33374417555989777, + "flos": 15085678429440.0, + "grad_norm": 1.742869766825136, + "language_loss": 0.60666394, + "learning_rate": 3.1077582440555495e-06, + "loss": 0.62834954, + "num_input_tokens_seen": 119208045, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.83203125, + "step": 5551, + "time_per_iteration": 2.454871654510498 + }, + { + "auxiliary_loss_clip": 0.01127389, + "auxiliary_loss_mlp": 0.01040643, + "balance_loss_clip": 1.02569366, + "balance_loss_mlp": 1.0459497, + "epoch": 0.33380429881256574, + "flos": 15849174942720.0, + "grad_norm": 1.6119589143573256, + "language_loss": 0.70450759, + "learning_rate": 3.1074339583908746e-06, + "loss": 0.72618788, + "num_input_tokens_seen": 119224910, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5552, + "time_per_iteration": 2.4226949214935303 + }, + { + "auxiliary_loss_clip": 0.01127587, + "auxiliary_loss_mlp": 0.01036933, + "balance_loss_clip": 1.02297902, + "balance_loss_mlp": 1.04462051, + "epoch": 0.33386442206523376, + "flos": 13480327883520.0, + "grad_norm": 2.0022942324560145, + "language_loss": 0.8289907, + "learning_rate": 3.107109630732192e-06, + "loss": 0.85063589, + "num_input_tokens_seen": 119243290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.828125, + "step": 5553, + "time_per_iteration": 3.8951358795166016 + }, + { + "auxiliary_loss_clip": 0.01128647, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.04528964, + "epoch": 0.3339245453179017, + "flos": 16690669839360.0, + "grad_norm": 2.095475541363027, + "language_loss": 0.81220448, + "learning_rate": 3.1067852610918017e-06, + "loss": 0.83385921, + "num_input_tokens_seen": 119261195, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83203125, + "step": 5554, + "time_per_iteration": 3.8097896575927734 + }, + { + "auxiliary_loss_clip": 0.01128551, + "auxiliary_loss_mlp": 0.0104249, + "balance_loss_clip": 1.02811968, + "balance_loss_mlp": 1.0457983, + "epoch": 0.3339846685705697, + "flos": 24610624456320.0, + "grad_norm": 1.4459560856203526, + "language_loss": 0.81277251, + "learning_rate": 3.1064608494820032e-06, + "loss": 0.83448291, + "num_input_tokens_seen": 119282845, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5555, + "time_per_iteration": 2.51686954498291 + }, + { + "auxiliary_loss_clip": 0.01126865, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02469552, + "balance_loss_mlp": 1.04441357, + "epoch": 0.33404479182323765, + "flos": 30953812775040.0, + "grad_norm": 1.713035899616047, + "language_loss": 0.74563497, + "learning_rate": 3.106136395915099e-06, + "loss": 0.76728898, + "num_input_tokens_seen": 119304430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.82421875, + "step": 5556, + "time_per_iteration": 2.550630807876587 + }, + { + "auxiliary_loss_clip": 0.0112773, + "auxiliary_loss_mlp": 0.01038673, + "balance_loss_clip": 1.02459431, + "balance_loss_mlp": 1.04586554, + "epoch": 0.3341049150759056, + "flos": 23513301918720.0, + "grad_norm": 1.4096864083862861, + "language_loss": 0.82588691, + "learning_rate": 3.105811900403391e-06, + "loss": 0.84755093, + "num_input_tokens_seen": 119323830, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5557, + "time_per_iteration": 2.498108148574829 + }, + { + "auxiliary_loss_clip": 0.01129625, + "auxiliary_loss_mlp": 0.01045289, + "balance_loss_clip": 1.03055513, + "balance_loss_mlp": 1.04486346, + "epoch": 0.3341650383285736, + "flos": 24026824707840.0, + "grad_norm": 1.7414701325609587, + "language_loss": 0.80056083, + "learning_rate": 3.1054873629591855e-06, + "loss": 0.82230997, + "num_input_tokens_seen": 119346340, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.84765625, + "step": 5558, + "time_per_iteration": 2.5519607067108154 + }, + { + "auxiliary_loss_clip": 0.01129512, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02159929, + "balance_loss_mlp": 1.04537535, + "epoch": 0.33422516158124155, + "flos": 24901967669760.0, + "grad_norm": 1.595273660638049, + "language_loss": 0.81953323, + "learning_rate": 3.105162783594788e-06, + "loss": 0.84117764, + "num_input_tokens_seen": 119367285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.84375, + "step": 5559, + "time_per_iteration": 2.5202248096466064 + }, + { + "auxiliary_loss_clip": 0.01126195, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02384293, + "balance_loss_mlp": 1.04450536, + "epoch": 0.3342852848339095, + "flos": 18333403464960.0, + "grad_norm": 2.784570608011319, + "language_loss": 0.72027284, + "learning_rate": 3.1048381623225074e-06, + "loss": 0.74191785, + "num_input_tokens_seen": 119385370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.81640625, + "step": 5560, + "time_per_iteration": 2.453016757965088 + }, + { + "auxiliary_loss_clip": 0.01133571, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03118193, + "balance_loss_mlp": 1.04679513, + "epoch": 0.3343454080865775, + "flos": 30046530119040.0, + "grad_norm": 2.584817000325422, + "language_loss": 0.74888778, + "learning_rate": 3.1045134991546526e-06, + "loss": 0.77068788, + "num_input_tokens_seen": 119409150, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5561, + "time_per_iteration": 2.526980400085449 + }, + { + "auxiliary_loss_clip": 0.01128977, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02410603, + "balance_loss_mlp": 1.04610825, + "epoch": 0.33440553133924544, + "flos": 16398823835520.0, + "grad_norm": 2.2689753945529176, + "language_loss": 0.69638503, + "learning_rate": 3.1041887941035355e-06, + "loss": 0.71806127, + "num_input_tokens_seen": 119426475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.828125, + "step": 5562, + "time_per_iteration": 2.483530282974243 + }, + { + "auxiliary_loss_clip": 0.01127212, + "auxiliary_loss_mlp": 0.01041398, + "balance_loss_clip": 1.02821374, + "balance_loss_mlp": 1.04549575, + "epoch": 0.3344656545919134, + "flos": 24242072958720.0, + "grad_norm": 1.5595683236821118, + "language_loss": 0.65407914, + "learning_rate": 3.1038640471814685e-06, + "loss": 0.67576528, + "num_input_tokens_seen": 119446900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8203125, + "step": 5563, + "time_per_iteration": 2.489734649658203 + }, + { + "auxiliary_loss_clip": 0.01131891, + "auxiliary_loss_mlp": 0.0104331, + "balance_loss_clip": 1.027843, + "balance_loss_mlp": 1.0464654, + "epoch": 0.3345257778445814, + "flos": 52118843149440.0, + "grad_norm": 3.650208894964183, + "language_loss": 0.74457055, + "learning_rate": 3.103539258400766e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 119470945, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.85546875, + "step": 5564, + "time_per_iteration": 2.7312686443328857 + }, + { + "auxiliary_loss_clip": 0.01049511, + "auxiliary_loss_mlp": 0.00999253, + "balance_loss_clip": 0.99735802, + "balance_loss_mlp": 1.02280784, + "epoch": 0.33458590109724934, + "flos": 68048602254720.0, + "grad_norm": 0.7800603717209338, + "language_loss": 0.55489159, + "learning_rate": 3.103214427773745e-06, + "loss": 0.57537925, + "num_input_tokens_seen": 119529925, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.265625, + "step": 5565, + "time_per_iteration": 3.0266246795654297 + }, + { + "auxiliary_loss_clip": 0.01126829, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02271366, + "balance_loss_mlp": 1.04589689, + "epoch": 0.3346460243499173, + "flos": 37414788768000.0, + "grad_norm": 1.7346222757402157, + "language_loss": 0.64754677, + "learning_rate": 3.102889555312721e-06, + "loss": 0.66918564, + "num_input_tokens_seen": 119550700, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80859375, + "step": 5566, + "time_per_iteration": 2.5819363594055176 + }, + { + "auxiliary_loss_clip": 0.01128946, + "auxiliary_loss_mlp": 0.01040566, + "balance_loss_clip": 1.0259037, + "balance_loss_mlp": 1.04706717, + "epoch": 0.3347061476025853, + "flos": 18697358021760.0, + "grad_norm": 1.73011072762743, + "language_loss": 0.77735972, + "learning_rate": 3.102564641030016e-06, + "loss": 0.7990548, + "num_input_tokens_seen": 119569295, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8203125, + "step": 5567, + "time_per_iteration": 2.508108377456665 + }, + { + "auxiliary_loss_clip": 0.01130701, + "auxiliary_loss_mlp": 0.01040305, + "balance_loss_clip": 1.02480745, + "balance_loss_mlp": 1.04583585, + "epoch": 0.3347662708552533, + "flos": 13917827537280.0, + "grad_norm": 1.719738804733239, + "language_loss": 0.76512182, + "learning_rate": 3.102239684937949e-06, + "loss": 0.78683186, + "num_input_tokens_seen": 119587375, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5568, + "time_per_iteration": 2.4344217777252197 + }, + { + "auxiliary_loss_clip": 0.01130164, + "auxiliary_loss_mlp": 0.01044901, + "balance_loss_clip": 1.02973104, + "balance_loss_mlp": 1.04528308, + "epoch": 0.33482639410792125, + "flos": 19750402068480.0, + "grad_norm": 2.265483767853782, + "language_loss": 0.71277773, + "learning_rate": 3.101914687048842e-06, + "loss": 0.73452842, + "num_input_tokens_seen": 119604530, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5569, + "time_per_iteration": 2.462592840194702 + }, + { + "auxiliary_loss_clip": 0.0112772, + "auxiliary_loss_mlp": 0.0103514, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.04275155, + "epoch": 0.3348865173605892, + "flos": 16102991422080.0, + "grad_norm": 1.859999754882374, + "language_loss": 0.90291858, + "learning_rate": 3.10158964737502e-06, + "loss": 0.9245472, + "num_input_tokens_seen": 119621025, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8515625, + "step": 5570, + "time_per_iteration": 2.432124614715576 + }, + { + "auxiliary_loss_clip": 0.0112712, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01634383, + "balance_loss_mlp": 1.04461455, + "epoch": 0.3349466406132572, + "flos": 25008945350400.0, + "grad_norm": 1.7333982724081918, + "language_loss": 0.80038494, + "learning_rate": 3.101264565928808e-06, + "loss": 0.82196403, + "num_input_tokens_seen": 119641725, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5571, + "time_per_iteration": 2.52752947807312 + }, + { + "auxiliary_loss_clip": 0.0104544, + "auxiliary_loss_mlp": 0.00998336, + "balance_loss_clip": 0.99651235, + "balance_loss_mlp": 1.01880455, + "epoch": 0.33500676386592515, + "flos": 54319991564160.0, + "grad_norm": 0.9063074837999179, + "language_loss": 0.55948162, + "learning_rate": 3.1009394427225335e-06, + "loss": 0.5799194, + "num_input_tokens_seen": 119693560, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5572, + "time_per_iteration": 3.0247979164123535 + }, + { + "auxiliary_loss_clip": 0.01131084, + "auxiliary_loss_mlp": 0.01046374, + "balance_loss_clip": 1.03212237, + "balance_loss_mlp": 1.04797339, + "epoch": 0.3350668871185931, + "flos": 26797332625920.0, + "grad_norm": 2.028320341949736, + "language_loss": 0.78112698, + "learning_rate": 3.1006142777685257e-06, + "loss": 0.80290151, + "num_input_tokens_seen": 119712935, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.83203125, + "step": 5573, + "time_per_iteration": 2.5152878761291504 + }, + { + "auxiliary_loss_clip": 0.01130248, + "auxiliary_loss_mlp": 0.01046989, + "balance_loss_clip": 1.03143215, + "balance_loss_mlp": 1.04525197, + "epoch": 0.3351270103712611, + "flos": 33510508986240.0, + "grad_norm": 2.1279768530108503, + "language_loss": 0.72473001, + "learning_rate": 3.1002890710791133e-06, + "loss": 0.7465024, + "num_input_tokens_seen": 119731680, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5574, + "time_per_iteration": 2.543531656265259 + }, + { + "auxiliary_loss_clip": 0.01125319, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.017308, + "balance_loss_mlp": 1.04292774, + "epoch": 0.33518713362392905, + "flos": 26506240807680.0, + "grad_norm": 2.78085640379241, + "language_loss": 0.87911499, + "learning_rate": 3.0999638226666287e-06, + "loss": 0.90068293, + "num_input_tokens_seen": 119752155, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.82421875, + "step": 5575, + "time_per_iteration": 2.546952724456787 + }, + { + "auxiliary_loss_clip": 0.01132707, + "auxiliary_loss_mlp": 0.01045745, + "balance_loss_clip": 1.02899647, + "balance_loss_mlp": 1.04479516, + "epoch": 0.335247256876597, + "flos": 17232345912960.0, + "grad_norm": 2.569353520757799, + "language_loss": 0.82441479, + "learning_rate": 3.0996385325434063e-06, + "loss": 0.84619927, + "num_input_tokens_seen": 119769195, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.875, + "step": 5576, + "time_per_iteration": 2.414294958114624 + }, + { + "auxiliary_loss_clip": 0.01129312, + "auxiliary_loss_mlp": 0.01044917, + "balance_loss_clip": 1.0286808, + "balance_loss_mlp": 1.043697, + "epoch": 0.335307380129265, + "flos": 25629373992960.0, + "grad_norm": 3.008815557703919, + "language_loss": 0.73384887, + "learning_rate": 3.0993132007217806e-06, + "loss": 0.75559115, + "num_input_tokens_seen": 119786810, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.85546875, + "step": 5577, + "time_per_iteration": 2.50136399269104 + }, + { + "auxiliary_loss_clip": 0.01131921, + "auxiliary_loss_mlp": 0.0104202, + "balance_loss_clip": 1.02667177, + "balance_loss_mlp": 1.04811549, + "epoch": 0.33536750338193294, + "flos": 19680089195520.0, + "grad_norm": 1.7225109171896533, + "language_loss": 0.81555498, + "learning_rate": 3.0989878272140883e-06, + "loss": 0.8372944, + "num_input_tokens_seen": 119805395, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8359375, + "step": 5578, + "time_per_iteration": 2.431365728378296 + }, + { + "auxiliary_loss_clip": 0.01125183, + "auxiliary_loss_mlp": 0.0103725, + "balance_loss_clip": 1.02277184, + "balance_loss_mlp": 1.04578936, + "epoch": 0.3354276266346009, + "flos": 18332613365760.0, + "grad_norm": 1.8947087551065327, + "language_loss": 0.71785814, + "learning_rate": 3.0986624120326676e-06, + "loss": 0.73948246, + "num_input_tokens_seen": 119823135, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 5579, + "time_per_iteration": 2.4519495964050293 + }, + { + "auxiliary_loss_clip": 0.01130811, + "auxiliary_loss_mlp": 0.01037429, + "balance_loss_clip": 1.02191353, + "balance_loss_mlp": 1.0456152, + "epoch": 0.3354877498872689, + "flos": 17858556645120.0, + "grad_norm": 2.0306401350469225, + "language_loss": 0.81084043, + "learning_rate": 3.0983369551898573e-06, + "loss": 0.83252287, + "num_input_tokens_seen": 119842265, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8515625, + "step": 5580, + "time_per_iteration": 2.427481174468994 + }, + { + "auxiliary_loss_clip": 0.01130056, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.04496789, + "epoch": 0.3355478731399369, + "flos": 24717745791360.0, + "grad_norm": 1.8687829543354073, + "language_loss": 0.77912092, + "learning_rate": 3.0980114566980003e-06, + "loss": 0.80078757, + "num_input_tokens_seen": 119862500, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8515625, + "step": 5581, + "time_per_iteration": 2.5320229530334473 + }, + { + "auxiliary_loss_clip": 0.01132086, + "auxiliary_loss_mlp": 0.01045037, + "balance_loss_clip": 1.02735782, + "balance_loss_mlp": 1.04367673, + "epoch": 0.33560799639260486, + "flos": 16873886136960.0, + "grad_norm": 5.02896087449, + "language_loss": 0.74623251, + "learning_rate": 3.0976859165694384e-06, + "loss": 0.76800376, + "num_input_tokens_seen": 119880160, + "router_z_loss_clip": 0.17675781, + "router_z_loss_mlp": 0.8828125, + "step": 5582, + "time_per_iteration": 2.421482801437378 + }, + { + "auxiliary_loss_clip": 0.0113015, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.02528524, + "balance_loss_mlp": 1.04456937, + "epoch": 0.3356681196452728, + "flos": 18333511205760.0, + "grad_norm": 1.790512330860928, + "language_loss": 0.82143587, + "learning_rate": 3.0973603348165166e-06, + "loss": 0.84315073, + "num_input_tokens_seen": 119899040, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.85546875, + "step": 5583, + "time_per_iteration": 2.4543566703796387 + }, + { + "auxiliary_loss_clip": 0.01127596, + "auxiliary_loss_mlp": 0.01044573, + "balance_loss_clip": 1.02991009, + "balance_loss_mlp": 1.04491317, + "epoch": 0.3357282428979408, + "flos": 34750612085760.0, + "grad_norm": 1.9267692381394996, + "language_loss": 0.7779209, + "learning_rate": 3.097034711451581e-06, + "loss": 0.79964256, + "num_input_tokens_seen": 119921120, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5584, + "time_per_iteration": 2.6100947856903076 + }, + { + "auxiliary_loss_clip": 0.01129164, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02343249, + "balance_loss_mlp": 1.04359186, + "epoch": 0.33578836615060875, + "flos": 21580087006080.0, + "grad_norm": 1.4758908421399493, + "language_loss": 0.75978506, + "learning_rate": 3.0967090464869795e-06, + "loss": 0.78145868, + "num_input_tokens_seen": 119940165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.85546875, + "step": 5585, + "time_per_iteration": 2.4898715019226074 + }, + { + "auxiliary_loss_clip": 0.01121936, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02170694, + "balance_loss_mlp": 1.04066801, + "epoch": 0.3358484894032767, + "flos": 24530291688960.0, + "grad_norm": 1.4987207146888684, + "language_loss": 0.77731383, + "learning_rate": 3.0963833399350608e-06, + "loss": 0.79890364, + "num_input_tokens_seen": 119959730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5586, + "time_per_iteration": 2.4825005531311035 + }, + { + "auxiliary_loss_clip": 0.01136236, + "auxiliary_loss_mlp": 0.01048607, + "balance_loss_clip": 1.03070199, + "balance_loss_mlp": 1.04787183, + "epoch": 0.3359086126559447, + "flos": 22455589104000.0, + "grad_norm": 1.6235624689574053, + "language_loss": 0.81027555, + "learning_rate": 3.0960575918081756e-06, + "loss": 0.83212399, + "num_input_tokens_seen": 119979315, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.8828125, + "step": 5587, + "time_per_iteration": 2.486459493637085 + }, + { + "auxiliary_loss_clip": 0.01125436, + "auxiliary_loss_mlp": 0.01040884, + "balance_loss_clip": 1.0270915, + "balance_loss_mlp": 1.04548144, + "epoch": 0.33596873590861265, + "flos": 16543687386240.0, + "grad_norm": 1.7952449023594161, + "language_loss": 0.67014575, + "learning_rate": 3.095731802118677e-06, + "loss": 0.69180894, + "num_input_tokens_seen": 119996140, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 5588, + "time_per_iteration": 2.435070753097534 + }, + { + "auxiliary_loss_clip": 0.01130516, + "auxiliary_loss_mlp": 0.0104412, + "balance_loss_clip": 1.02784824, + "balance_loss_mlp": 1.04568088, + "epoch": 0.3360288591612806, + "flos": 31175812782720.0, + "grad_norm": 1.6839710852868943, + "language_loss": 0.69882601, + "learning_rate": 3.095405970878919e-06, + "loss": 0.72057241, + "num_input_tokens_seen": 120017720, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 5589, + "time_per_iteration": 2.548051118850708 + }, + { + "auxiliary_loss_clip": 0.01129859, + "auxiliary_loss_mlp": 0.01043753, + "balance_loss_clip": 1.02709961, + "balance_loss_mlp": 1.04461861, + "epoch": 0.3360889824139486, + "flos": 23696913265920.0, + "grad_norm": 2.1328325025080987, + "language_loss": 0.66886735, + "learning_rate": 3.0950800981012567e-06, + "loss": 0.69060349, + "num_input_tokens_seen": 120036335, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8515625, + "step": 5590, + "time_per_iteration": 2.4735047817230225 + }, + { + "auxiliary_loss_clip": 0.01126204, + "auxiliary_loss_mlp": 0.01045607, + "balance_loss_clip": 1.02993059, + "balance_loss_mlp": 1.04570127, + "epoch": 0.33614910566661654, + "flos": 19318109886720.0, + "grad_norm": 1.8322479695472769, + "language_loss": 0.73409903, + "learning_rate": 3.094754183798047e-06, + "loss": 0.75581712, + "num_input_tokens_seen": 120056120, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 5591, + "time_per_iteration": 2.4736244678497314 + }, + { + "auxiliary_loss_clip": 0.01127166, + "auxiliary_loss_mlp": 0.01042172, + "balance_loss_clip": 1.02686584, + "balance_loss_mlp": 1.04408562, + "epoch": 0.3362092289192845, + "flos": 16472261191680.0, + "grad_norm": 1.9183925576882788, + "language_loss": 0.69446647, + "learning_rate": 3.0944282279816493e-06, + "loss": 0.71615982, + "num_input_tokens_seen": 120073650, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5592, + "time_per_iteration": 2.4232676029205322 + }, + { + "auxiliary_loss_clip": 0.0112535, + "auxiliary_loss_mlp": 0.01037895, + "balance_loss_clip": 1.02366149, + "balance_loss_mlp": 1.0442183, + "epoch": 0.33626935217195253, + "flos": 24243581329920.0, + "grad_norm": 2.4700576130478367, + "language_loss": 0.76281321, + "learning_rate": 3.094102230664423e-06, + "loss": 0.78444564, + "num_input_tokens_seen": 120093260, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5593, + "time_per_iteration": 2.4856812953948975 + }, + { + "auxiliary_loss_clip": 0.01128845, + "auxiliary_loss_mlp": 0.01044628, + "balance_loss_clip": 1.02703261, + "balance_loss_mlp": 1.04333365, + "epoch": 0.3363294754246205, + "flos": 19718765164800.0, + "grad_norm": 2.2267028217655516, + "language_loss": 0.71435678, + "learning_rate": 3.093776191858731e-06, + "loss": 0.73609149, + "num_input_tokens_seen": 120111830, + "router_z_loss_clip": 0.17578125, + "router_z_loss_mlp": 0.8515625, + "step": 5594, + "time_per_iteration": 2.437554359436035 + }, + { + "auxiliary_loss_clip": 0.0113233, + "auxiliary_loss_mlp": 0.01045948, + "balance_loss_clip": 1.02985501, + "balance_loss_mlp": 1.04690135, + "epoch": 0.33638959867728846, + "flos": 22596286677120.0, + "grad_norm": 1.637052204404589, + "language_loss": 0.80350173, + "learning_rate": 3.0934501115769363e-06, + "loss": 0.82528448, + "num_input_tokens_seen": 120130470, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.85546875, + "step": 5595, + "time_per_iteration": 5.51651668548584 + }, + { + "auxiliary_loss_clip": 0.0112868, + "auxiliary_loss_mlp": 0.01033292, + "balance_loss_clip": 1.01964831, + "balance_loss_mlp": 1.04542542, + "epoch": 0.3364497219299564, + "flos": 20994742972800.0, + "grad_norm": 1.8244163047079407, + "language_loss": 0.81611145, + "learning_rate": 3.0931239898314037e-06, + "loss": 0.83773112, + "num_input_tokens_seen": 120150735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5596, + "time_per_iteration": 2.4959781169891357 + }, + { + "auxiliary_loss_clip": 0.01128091, + "auxiliary_loss_mlp": 0.01039521, + "balance_loss_clip": 1.02508509, + "balance_loss_mlp": 1.04461718, + "epoch": 0.3365098451826244, + "flos": 25228610974080.0, + "grad_norm": 1.7014468319312177, + "language_loss": 0.76001227, + "learning_rate": 3.0927978266344995e-06, + "loss": 0.78168839, + "num_input_tokens_seen": 120173230, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5597, + "time_per_iteration": 2.4965333938598633 + }, + { + "auxiliary_loss_clip": 0.01126223, + "auxiliary_loss_mlp": 0.01037273, + "balance_loss_clip": 1.0233258, + "balance_loss_mlp": 1.04597533, + "epoch": 0.33656996843529235, + "flos": 24571697091840.0, + "grad_norm": 1.8007239192940239, + "language_loss": 0.78937811, + "learning_rate": 3.0924716219985916e-06, + "loss": 0.81101304, + "num_input_tokens_seen": 120191860, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 5598, + "time_per_iteration": 2.587813377380371 + }, + { + "auxiliary_loss_clip": 0.01133011, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02036011, + "balance_loss_mlp": 1.04606342, + "epoch": 0.3366300916879603, + "flos": 44091120752640.0, + "grad_norm": 1.4664560154247552, + "language_loss": 0.64197004, + "learning_rate": 3.0921453759360514e-06, + "loss": 0.66366023, + "num_input_tokens_seen": 120219195, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.87109375, + "step": 5599, + "time_per_iteration": 2.647618293762207 + }, + { + "auxiliary_loss_clip": 0.0113527, + "auxiliary_loss_mlp": 0.01043272, + "balance_loss_clip": 1.02685726, + "balance_loss_mlp": 1.0468514, + "epoch": 0.3366902149406283, + "flos": 13879869840000.0, + "grad_norm": 2.652004853610392, + "language_loss": 0.8172245, + "learning_rate": 3.091819088459249e-06, + "loss": 0.83900994, + "num_input_tokens_seen": 120232950, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.88671875, + "step": 5600, + "time_per_iteration": 2.441237211227417 + }, + { + "auxiliary_loss_clip": 0.01130498, + "auxiliary_loss_mlp": 0.01050016, + "balance_loss_clip": 1.03369582, + "balance_loss_mlp": 1.04399288, + "epoch": 0.33675033819329625, + "flos": 16253098358400.0, + "grad_norm": 3.359102963412802, + "language_loss": 0.82717538, + "learning_rate": 3.0914927595805573e-06, + "loss": 0.84898043, + "num_input_tokens_seen": 120248865, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.86328125, + "step": 5601, + "time_per_iteration": 2.4369428157806396 + }, + { + "auxiliary_loss_clip": 0.01127768, + "auxiliary_loss_mlp": 0.01032812, + "balance_loss_clip": 1.01911497, + "balance_loss_mlp": 1.04890418, + "epoch": 0.3368104614459642, + "flos": 17055809544960.0, + "grad_norm": 1.6511579237160083, + "language_loss": 0.82726496, + "learning_rate": 3.0911663893123507e-06, + "loss": 0.84887075, + "num_input_tokens_seen": 120267820, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5602, + "time_per_iteration": 2.463291645050049 + }, + { + "auxiliary_loss_clip": 0.01130933, + "auxiliary_loss_mlp": 0.01055384, + "balance_loss_clip": 1.04039955, + "balance_loss_mlp": 1.04712546, + "epoch": 0.3368705846986322, + "flos": 17858628472320.0, + "grad_norm": 1.700541242008466, + "language_loss": 0.70208776, + "learning_rate": 3.0908399776670048e-06, + "loss": 0.72395098, + "num_input_tokens_seen": 120286540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8359375, + "step": 5603, + "time_per_iteration": 2.4309756755828857 + }, + { + "auxiliary_loss_clip": 0.01133654, + "auxiliary_loss_mlp": 0.0103849, + "balance_loss_clip": 1.02392292, + "balance_loss_mlp": 1.04724145, + "epoch": 0.33693070795130015, + "flos": 22929502170240.0, + "grad_norm": 1.625433979180813, + "language_loss": 0.82925308, + "learning_rate": 3.090513524656898e-06, + "loss": 0.8509745, + "num_input_tokens_seen": 120307305, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.86328125, + "step": 5604, + "time_per_iteration": 2.4980010986328125 + }, + { + "auxiliary_loss_clip": 0.01129789, + "auxiliary_loss_mlp": 0.01042861, + "balance_loss_clip": 1.02782226, + "balance_loss_mlp": 1.0447166, + "epoch": 0.3369908312039681, + "flos": 22017443005440.0, + "grad_norm": 3.2518642032613654, + "language_loss": 0.73756403, + "learning_rate": 3.090187030294409e-06, + "loss": 0.75929046, + "num_input_tokens_seen": 120327845, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8515625, + "step": 5605, + "time_per_iteration": 2.4563212394714355 + }, + { + "auxiliary_loss_clip": 0.01132634, + "auxiliary_loss_mlp": 0.01040526, + "balance_loss_clip": 1.02520752, + "balance_loss_mlp": 1.04604197, + "epoch": 0.33705095445663613, + "flos": 11801970944640.0, + "grad_norm": 2.772980532366942, + "language_loss": 0.83487791, + "learning_rate": 3.089860494591919e-06, + "loss": 0.85660958, + "num_input_tokens_seen": 120343255, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8671875, + "step": 5606, + "time_per_iteration": 2.456441640853882 + }, + { + "auxiliary_loss_clip": 0.0112361, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02549469, + "balance_loss_mlp": 1.0414753, + "epoch": 0.3371110777093041, + "flos": 25046400257280.0, + "grad_norm": 1.7790448991820722, + "language_loss": 0.67335433, + "learning_rate": 3.089533917561809e-06, + "loss": 0.69499022, + "num_input_tokens_seen": 120361745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5607, + "time_per_iteration": 2.4964821338653564 + }, + { + "auxiliary_loss_clip": 0.01130916, + "auxiliary_loss_mlp": 0.01041895, + "balance_loss_clip": 1.02694631, + "balance_loss_mlp": 1.04507923, + "epoch": 0.33717120096197206, + "flos": 26579031719040.0, + "grad_norm": 2.032375572186737, + "language_loss": 0.71093041, + "learning_rate": 3.089207299216464e-06, + "loss": 0.73265851, + "num_input_tokens_seen": 120380565, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.859375, + "step": 5608, + "time_per_iteration": 2.5247933864593506 + }, + { + "auxiliary_loss_clip": 0.01128549, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0236311, + "balance_loss_mlp": 1.0446682, + "epoch": 0.33723132421464, + "flos": 15158541168000.0, + "grad_norm": 1.8968208773724307, + "language_loss": 0.79062563, + "learning_rate": 3.088880639568269e-06, + "loss": 0.81229293, + "num_input_tokens_seen": 120399235, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.83984375, + "step": 5609, + "time_per_iteration": 2.439502477645874 + }, + { + "auxiliary_loss_clip": 0.01129667, + "auxiliary_loss_mlp": 0.01042877, + "balance_loss_clip": 1.02706969, + "balance_loss_mlp": 1.04544735, + "epoch": 0.337291447467308, + "flos": 23436093634560.0, + "grad_norm": 2.0456898754189354, + "language_loss": 0.82218611, + "learning_rate": 3.0885539386296114e-06, + "loss": 0.84391159, + "num_input_tokens_seen": 120420095, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.84375, + "step": 5610, + "time_per_iteration": 2.502028226852417 + }, + { + "auxiliary_loss_clip": 0.01123686, + "auxiliary_loss_mlp": 0.01040586, + "balance_loss_clip": 1.02520823, + "balance_loss_mlp": 1.04264688, + "epoch": 0.33735157071997596, + "flos": 17238163916160.0, + "grad_norm": 1.8264685829582996, + "language_loss": 0.81998217, + "learning_rate": 3.088227196412879e-06, + "loss": 0.84162486, + "num_input_tokens_seen": 120437690, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 5611, + "time_per_iteration": 2.4255177974700928 + }, + { + "auxiliary_loss_clip": 0.01130986, + "auxiliary_loss_mlp": 0.01044325, + "balance_loss_clip": 1.02728975, + "balance_loss_mlp": 1.04550552, + "epoch": 0.3374116939726439, + "flos": 28257388657920.0, + "grad_norm": 1.5753494383615703, + "language_loss": 0.79407716, + "learning_rate": 3.0879004129304626e-06, + "loss": 0.81583023, + "num_input_tokens_seen": 120459240, + "router_z_loss_clip": 0.16992188, + "router_z_loss_mlp": 0.85546875, + "step": 5612, + "time_per_iteration": 2.537048578262329 + }, + { + "auxiliary_loss_clip": 0.01124133, + "auxiliary_loss_mlp": 0.01037075, + "balance_loss_clip": 1.02212596, + "balance_loss_mlp": 1.04021907, + "epoch": 0.3374718172253119, + "flos": 35919396731520.0, + "grad_norm": 2.519050824799004, + "language_loss": 0.70024467, + "learning_rate": 3.087573588194753e-06, + "loss": 0.72185683, + "num_input_tokens_seen": 120481090, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83984375, + "step": 5613, + "time_per_iteration": 2.570373773574829 + }, + { + "auxiliary_loss_clip": 0.01129945, + "auxiliary_loss_mlp": 0.01037211, + "balance_loss_clip": 1.02203548, + "balance_loss_mlp": 1.04490113, + "epoch": 0.33753194047797985, + "flos": 18186672407040.0, + "grad_norm": 1.6646408753448763, + "language_loss": 0.79615057, + "learning_rate": 3.087246722218144e-06, + "loss": 0.81782216, + "num_input_tokens_seen": 120500045, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8515625, + "step": 5614, + "time_per_iteration": 2.4379053115844727 + }, + { + "auxiliary_loss_clip": 0.01126744, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02331161, + "balance_loss_mlp": 1.04260945, + "epoch": 0.3375920637306478, + "flos": 23148916398720.0, + "grad_norm": 1.8534958586083128, + "language_loss": 0.90879035, + "learning_rate": 3.086919815013031e-06, + "loss": 0.93045861, + "num_input_tokens_seen": 120521125, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.84375, + "step": 5615, + "time_per_iteration": 2.4876632690429688 + }, + { + "auxiliary_loss_clip": 0.0112252, + "auxiliary_loss_mlp": 0.01040203, + "balance_loss_clip": 1.02596951, + "balance_loss_mlp": 1.04105914, + "epoch": 0.3376521869833158, + "flos": 23112215677440.0, + "grad_norm": 1.6970154369052728, + "language_loss": 0.80636102, + "learning_rate": 3.086592866591809e-06, + "loss": 0.82798827, + "num_input_tokens_seen": 120539180, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5616, + "time_per_iteration": 2.476569175720215 + }, + { + "auxiliary_loss_clip": 0.01131427, + "auxiliary_loss_mlp": 0.01044004, + "balance_loss_clip": 1.02715993, + "balance_loss_mlp": 1.04379678, + "epoch": 0.33771231023598375, + "flos": 19274585581440.0, + "grad_norm": 2.5053489219363754, + "language_loss": 0.84079826, + "learning_rate": 3.0862658769668774e-06, + "loss": 0.86255258, + "num_input_tokens_seen": 120556280, + "router_z_loss_clip": 0.16894531, + "router_z_loss_mlp": 0.875, + "step": 5617, + "time_per_iteration": 2.4204065799713135 + }, + { + "auxiliary_loss_clip": 0.01125211, + "auxiliary_loss_mlp": 0.01036412, + "balance_loss_clip": 1.02190411, + "balance_loss_mlp": 1.04171932, + "epoch": 0.3377724334886517, + "flos": 18150187167360.0, + "grad_norm": 1.648273719366553, + "language_loss": 0.80173457, + "learning_rate": 3.0859388461506343e-06, + "loss": 0.82335079, + "num_input_tokens_seen": 120575395, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8359375, + "step": 5618, + "time_per_iteration": 2.4789302349090576 + }, + { + "auxiliary_loss_clip": 0.01128326, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.01895535, + "balance_loss_mlp": 1.04367077, + "epoch": 0.3378325567413197, + "flos": 25775997310080.0, + "grad_norm": 1.9548255306646998, + "language_loss": 0.70458674, + "learning_rate": 3.085611774155481e-06, + "loss": 0.72621119, + "num_input_tokens_seen": 120596075, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.84765625, + "step": 5619, + "time_per_iteration": 2.4674489498138428 + }, + { + "auxiliary_loss_clip": 0.01127452, + "auxiliary_loss_mlp": 0.01047075, + "balance_loss_clip": 1.0322814, + "balance_loss_mlp": 1.04403424, + "epoch": 0.3378926799939877, + "flos": 21317112558720.0, + "grad_norm": 5.009208052913787, + "language_loss": 0.69223797, + "learning_rate": 3.085284660993821e-06, + "loss": 0.7139833, + "num_input_tokens_seen": 120614195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8359375, + "step": 5620, + "time_per_iteration": 2.475889205932617 + }, + { + "auxiliary_loss_clip": 0.01127115, + "auxiliary_loss_mlp": 0.01046185, + "balance_loss_clip": 1.03159392, + "balance_loss_mlp": 1.04497766, + "epoch": 0.33795280324665566, + "flos": 24900028335360.0, + "grad_norm": 2.0914960236262075, + "language_loss": 0.67498147, + "learning_rate": 3.084957506678058e-06, + "loss": 0.69671446, + "num_input_tokens_seen": 120634475, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8203125, + "step": 5621, + "time_per_iteration": 2.4732306003570557 + }, + { + "auxiliary_loss_clip": 0.01124388, + "auxiliary_loss_mlp": 0.01036572, + "balance_loss_clip": 1.02258897, + "balance_loss_mlp": 1.04336381, + "epoch": 0.33801292649932363, + "flos": 24753943722240.0, + "grad_norm": 1.811430245584347, + "language_loss": 0.82714671, + "learning_rate": 3.0846303112205975e-06, + "loss": 0.84875631, + "num_input_tokens_seen": 120654980, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 5622, + "time_per_iteration": 2.5028531551361084 + }, + { + "auxiliary_loss_clip": 0.01122679, + "auxiliary_loss_mlp": 0.01041633, + "balance_loss_clip": 1.0279355, + "balance_loss_mlp": 1.04111528, + "epoch": 0.3380730497519916, + "flos": 26723967096960.0, + "grad_norm": 1.4271980952069887, + "language_loss": 0.73785996, + "learning_rate": 3.0843030746338464e-06, + "loss": 0.75950313, + "num_input_tokens_seen": 120676245, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.81640625, + "step": 5623, + "time_per_iteration": 2.483354091644287 + }, + { + "auxiliary_loss_clip": 0.01044412, + "auxiliary_loss_mlp": 0.01001556, + "balance_loss_clip": 0.99976796, + "balance_loss_mlp": 1.01787817, + "epoch": 0.33813317300465956, + "flos": 70035756416640.0, + "grad_norm": 0.7308868621653948, + "language_loss": 0.54898107, + "learning_rate": 3.083975796930215e-06, + "loss": 0.56944072, + "num_input_tokens_seen": 120741965, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.265625, + "step": 5624, + "time_per_iteration": 3.2154293060302734 + }, + { + "auxiliary_loss_clip": 0.01128701, + "auxiliary_loss_mlp": 0.01040775, + "balance_loss_clip": 1.02536166, + "balance_loss_mlp": 1.04464245, + "epoch": 0.3381932962573275, + "flos": 24097317148800.0, + "grad_norm": 3.114382300094, + "language_loss": 0.73013008, + "learning_rate": 3.083648478122111e-06, + "loss": 0.75182486, + "num_input_tokens_seen": 120760410, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.83984375, + "step": 5625, + "time_per_iteration": 2.4632089138031006 + }, + { + "auxiliary_loss_clip": 0.01129587, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02315879, + "balance_loss_mlp": 1.04408085, + "epoch": 0.3382534195099955, + "flos": 19278248768640.0, + "grad_norm": 1.7442247016960708, + "language_loss": 0.70501375, + "learning_rate": 3.0833211182219497e-06, + "loss": 0.72669238, + "num_input_tokens_seen": 120777705, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.85546875, + "step": 5626, + "time_per_iteration": 2.4782652854919434 + }, + { + "auxiliary_loss_clip": 0.01123049, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.04265583, + "epoch": 0.33831354276266346, + "flos": 25226240676480.0, + "grad_norm": 1.496721640957227, + "language_loss": 0.81184483, + "learning_rate": 3.0829937172421425e-06, + "loss": 0.83341312, + "num_input_tokens_seen": 120798660, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5627, + "time_per_iteration": 2.48683762550354 + }, + { + "auxiliary_loss_clip": 0.01133025, + "auxiliary_loss_mlp": 0.01038727, + "balance_loss_clip": 1.02332532, + "balance_loss_mlp": 1.04643917, + "epoch": 0.3383736660153314, + "flos": 23112000195840.0, + "grad_norm": 2.112092075284961, + "language_loss": 0.80725849, + "learning_rate": 3.0826662751951055e-06, + "loss": 0.82897604, + "num_input_tokens_seen": 120816705, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.86328125, + "step": 5628, + "time_per_iteration": 2.485978841781616 + }, + { + "auxiliary_loss_clip": 0.01125942, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.01716328, + "balance_loss_mlp": 1.04272234, + "epoch": 0.3384337892679994, + "flos": 23477139901440.0, + "grad_norm": 1.9378827683544937, + "language_loss": 0.77360773, + "learning_rate": 3.082338792093254e-06, + "loss": 0.79518872, + "num_input_tokens_seen": 120835375, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.83203125, + "step": 5629, + "time_per_iteration": 2.459749937057495 + }, + { + "auxiliary_loss_clip": 0.0112767, + "auxiliary_loss_mlp": 0.01042637, + "balance_loss_clip": 1.02604353, + "balance_loss_mlp": 1.0426172, + "epoch": 0.33849391252066735, + "flos": 19425805839360.0, + "grad_norm": 1.750727836719773, + "language_loss": 0.84873146, + "learning_rate": 3.0820112679490074e-06, + "loss": 0.87043452, + "num_input_tokens_seen": 120854260, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.84765625, + "step": 5630, + "time_per_iteration": 2.502168655395508 + }, + { + "auxiliary_loss_clip": 0.01128287, + "auxiliary_loss_mlp": 0.01039609, + "balance_loss_clip": 1.02593017, + "balance_loss_mlp": 1.04496086, + "epoch": 0.3385540357733353, + "flos": 21064840364160.0, + "grad_norm": 2.44277401951878, + "language_loss": 0.71778762, + "learning_rate": 3.0816837027747857e-06, + "loss": 0.73946661, + "num_input_tokens_seen": 120871590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.83203125, + "step": 5631, + "time_per_iteration": 2.4541988372802734 + }, + { + "auxiliary_loss_clip": 0.01044995, + "auxiliary_loss_mlp": 0.01006836, + "balance_loss_clip": 1.0050118, + "balance_loss_mlp": 1.01844144, + "epoch": 0.3386141590260033, + "flos": 69208013450880.0, + "grad_norm": 0.84858361279948, + "language_loss": 0.56171906, + "learning_rate": 3.0813560965830084e-06, + "loss": 0.58223736, + "num_input_tokens_seen": 120925550, + "router_z_loss_clip": 0.01818848, + "router_z_loss_mlp": 0.265625, + "step": 5632, + "time_per_iteration": 3.130112409591675 + }, + { + "auxiliary_loss_clip": 0.01126092, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01925933, + "balance_loss_mlp": 1.04301071, + "epoch": 0.3386742822786713, + "flos": 25519487310720.0, + "grad_norm": 1.4746675536042473, + "language_loss": 0.80288029, + "learning_rate": 3.0810284493861005e-06, + "loss": 0.82448882, + "num_input_tokens_seen": 120947620, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5633, + "time_per_iteration": 2.4772210121154785 + }, + { + "auxiliary_loss_clip": 0.01126262, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01671278, + "balance_loss_mlp": 1.04355168, + "epoch": 0.33873440553133927, + "flos": 23623116773760.0, + "grad_norm": 2.3860801146544692, + "language_loss": 0.59222949, + "learning_rate": 3.0807007611964855e-06, + "loss": 0.61380345, + "num_input_tokens_seen": 120965205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5634, + "time_per_iteration": 2.490783214569092 + }, + { + "auxiliary_loss_clip": 0.01124946, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.01930678, + "balance_loss_mlp": 1.04328096, + "epoch": 0.33879452878400723, + "flos": 17088882992640.0, + "grad_norm": 1.758176339753219, + "language_loss": 0.92591304, + "learning_rate": 3.080373032026589e-06, + "loss": 0.94749641, + "num_input_tokens_seen": 120983560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5635, + "time_per_iteration": 2.4895272254943848 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01030062, + "balance_loss_clip": 1.01594758, + "balance_loss_mlp": 1.04428411, + "epoch": 0.3388546520366752, + "flos": 15742053607680.0, + "grad_norm": 1.7397877385381144, + "language_loss": 0.74791968, + "learning_rate": 3.0800452618888386e-06, + "loss": 0.76945299, + "num_input_tokens_seen": 121001400, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5636, + "time_per_iteration": 2.4868686199188232 + }, + { + "auxiliary_loss_clip": 0.01123089, + "auxiliary_loss_mlp": 0.01037449, + "balance_loss_clip": 1.02264357, + "balance_loss_mlp": 1.04291928, + "epoch": 0.33891477528934316, + "flos": 22418744728320.0, + "grad_norm": 1.533650755617547, + "language_loss": 0.83216572, + "learning_rate": 3.0797174507956637e-06, + "loss": 0.85377115, + "num_input_tokens_seen": 121021760, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5637, + "time_per_iteration": 5.43249249458313 + }, + { + "auxiliary_loss_clip": 0.0112926, + "auxiliary_loss_mlp": 0.01044612, + "balance_loss_clip": 1.02837586, + "balance_loss_mlp": 1.04624391, + "epoch": 0.3389748985420111, + "flos": 17274828723840.0, + "grad_norm": 1.6200031021198193, + "language_loss": 0.70037901, + "learning_rate": 3.079389598759495e-06, + "loss": 0.72211778, + "num_input_tokens_seen": 121041070, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.828125, + "step": 5638, + "time_per_iteration": 2.430814504623413 + }, + { + "auxiliary_loss_clip": 0.01128885, + "auxiliary_loss_mlp": 0.0104494, + "balance_loss_clip": 1.02993131, + "balance_loss_mlp": 1.0461942, + "epoch": 0.3390350217946791, + "flos": 27744979190400.0, + "grad_norm": 1.644027939558444, + "language_loss": 0.80699074, + "learning_rate": 3.079061705792765e-06, + "loss": 0.82872897, + "num_input_tokens_seen": 121060890, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5639, + "time_per_iteration": 2.5219810009002686 + }, + { + "auxiliary_loss_clip": 0.01129363, + "auxiliary_loss_mlp": 0.01042542, + "balance_loss_clip": 1.02714002, + "balance_loss_mlp": 1.044734, + "epoch": 0.33909514504734706, + "flos": 20339804338560.0, + "grad_norm": 2.006873412015597, + "language_loss": 0.67907631, + "learning_rate": 3.078733771907907e-06, + "loss": 0.70079535, + "num_input_tokens_seen": 121079135, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5640, + "time_per_iteration": 2.4252562522888184 + }, + { + "auxiliary_loss_clip": 0.01123424, + "auxiliary_loss_mlp": 0.01037389, + "balance_loss_clip": 1.02229738, + "balance_loss_mlp": 1.0432744, + "epoch": 0.339155268300015, + "flos": 14830030356480.0, + "grad_norm": 1.561334672972187, + "language_loss": 0.70158339, + "learning_rate": 3.0784057971173554e-06, + "loss": 0.72319156, + "num_input_tokens_seen": 121097685, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5641, + "time_per_iteration": 2.4703073501586914 + }, + { + "auxiliary_loss_clip": 0.01129782, + "auxiliary_loss_mlp": 0.01043462, + "balance_loss_clip": 1.02881122, + "balance_loss_mlp": 1.04692698, + "epoch": 0.339215391552683, + "flos": 26067951054720.0, + "grad_norm": 1.7323035027878293, + "language_loss": 0.87336594, + "learning_rate": 3.0780777814335483e-06, + "loss": 0.89509839, + "num_input_tokens_seen": 121115640, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5642, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01119376, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.04361117, + "epoch": 0.33927551480535095, + "flos": 14574705505920.0, + "grad_norm": 1.899951429632433, + "language_loss": 0.83783317, + "learning_rate": 3.077749724868924e-06, + "loss": 0.85933256, + "num_input_tokens_seen": 121132485, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 5643, + "time_per_iteration": 2.454176902770996 + }, + { + "auxiliary_loss_clip": 0.01122874, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02779329, + "balance_loss_mlp": 1.04303253, + "epoch": 0.3393356380580189, + "flos": 23805578885760.0, + "grad_norm": 1.6286036888414737, + "language_loss": 0.76940101, + "learning_rate": 3.077421627435922e-06, + "loss": 0.79104799, + "num_input_tokens_seen": 121152935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5644, + "time_per_iteration": 2.46893048286438 + }, + { + "auxiliary_loss_clip": 0.01124612, + "auxiliary_loss_mlp": 0.01043858, + "balance_loss_clip": 1.02898121, + "balance_loss_mlp": 1.04242706, + "epoch": 0.3393957613106869, + "flos": 17347871030400.0, + "grad_norm": 4.638882451456986, + "language_loss": 0.62893367, + "learning_rate": 3.0770934891469832e-06, + "loss": 0.65061837, + "num_input_tokens_seen": 121169835, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5645, + "time_per_iteration": 2.4539859294891357 + }, + { + "auxiliary_loss_clip": 0.01120022, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02033067, + "balance_loss_mlp": 1.04122853, + "epoch": 0.3394558845633549, + "flos": 28433960939520.0, + "grad_norm": 2.1237754414429637, + "language_loss": 0.76276195, + "learning_rate": 3.076765310014552e-06, + "loss": 0.78429914, + "num_input_tokens_seen": 121190290, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5646, + "time_per_iteration": 2.4913554191589355 + }, + { + "auxiliary_loss_clip": 0.01128945, + "auxiliary_loss_mlp": 0.01043856, + "balance_loss_clip": 1.02835846, + "balance_loss_mlp": 1.04360342, + "epoch": 0.33951600781602287, + "flos": 22086929865600.0, + "grad_norm": 1.9547585113359744, + "language_loss": 0.79175937, + "learning_rate": 3.0764370900510727e-06, + "loss": 0.81348741, + "num_input_tokens_seen": 121209060, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.85546875, + "step": 5647, + "time_per_iteration": 2.521603584289551 + }, + { + "auxiliary_loss_clip": 0.01128449, + "auxiliary_loss_mlp": 0.01040236, + "balance_loss_clip": 1.02541864, + "balance_loss_mlp": 1.04706085, + "epoch": 0.33957613106869083, + "flos": 23878262056320.0, + "grad_norm": 1.87789373580567, + "language_loss": 0.77358377, + "learning_rate": 3.0761088292689904e-06, + "loss": 0.79527068, + "num_input_tokens_seen": 121227480, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8125, + "step": 5648, + "time_per_iteration": 2.4812231063842773 + }, + { + "auxiliary_loss_clip": 0.0104448, + "auxiliary_loss_mlp": 0.01001624, + "balance_loss_clip": 0.99964541, + "balance_loss_mlp": 1.01817107, + "epoch": 0.3396362543213588, + "flos": 71242642414080.0, + "grad_norm": 0.7825270224300925, + "language_loss": 0.56261832, + "learning_rate": 3.075780527680754e-06, + "loss": 0.5830794, + "num_input_tokens_seen": 121291305, + "router_z_loss_clip": 0.01977539, + "router_z_loss_mlp": 0.26171875, + "step": 5649, + "time_per_iteration": 3.1050350666046143 + }, + { + "auxiliary_loss_clip": 0.01123703, + "auxiliary_loss_mlp": 0.01042955, + "balance_loss_clip": 1.02804756, + "balance_loss_mlp": 1.0422622, + "epoch": 0.33969637757402676, + "flos": 25921615046400.0, + "grad_norm": 1.5021179324123226, + "language_loss": 0.85269898, + "learning_rate": 3.0754521852988117e-06, + "loss": 0.87436557, + "num_input_tokens_seen": 121312740, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8125, + "step": 5650, + "time_per_iteration": 2.5013816356658936 + }, + { + "auxiliary_loss_clip": 0.01123225, + "auxiliary_loss_mlp": 0.01029214, + "balance_loss_clip": 1.01540327, + "balance_loss_mlp": 1.04317355, + "epoch": 0.33975650082669473, + "flos": 35261728663680.0, + "grad_norm": 1.6954461839420942, + "language_loss": 0.70868433, + "learning_rate": 3.0751238021356152e-06, + "loss": 0.73020875, + "num_input_tokens_seen": 121334220, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80078125, + "step": 5651, + "time_per_iteration": 2.579455852508545 + }, + { + "auxiliary_loss_clip": 0.01123721, + "auxiliary_loss_mlp": 0.01037329, + "balance_loss_clip": 1.02354813, + "balance_loss_mlp": 1.04347372, + "epoch": 0.3398166240793627, + "flos": 16647001879680.0, + "grad_norm": 1.7042541017727943, + "language_loss": 0.81267643, + "learning_rate": 3.074795378203616e-06, + "loss": 0.83428693, + "num_input_tokens_seen": 121351870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.8046875, + "step": 5652, + "time_per_iteration": 2.4690871238708496 + }, + { + "auxiliary_loss_clip": 0.01128696, + "auxiliary_loss_mlp": 0.01041185, + "balance_loss_clip": 1.02670693, + "balance_loss_mlp": 1.04464078, + "epoch": 0.33987674733203066, + "flos": 24062196625920.0, + "grad_norm": 1.8642865553854127, + "language_loss": 0.77315342, + "learning_rate": 3.0744669135152685e-06, + "loss": 0.79485226, + "num_input_tokens_seen": 121373400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5653, + "time_per_iteration": 2.4836156368255615 + }, + { + "auxiliary_loss_clip": 0.01123907, + "auxiliary_loss_mlp": 0.01036159, + "balance_loss_clip": 1.02225959, + "balance_loss_mlp": 1.04310441, + "epoch": 0.3399368705846986, + "flos": 13250678279040.0, + "grad_norm": 4.3033812467068895, + "language_loss": 0.85072839, + "learning_rate": 3.0741384080830278e-06, + "loss": 0.87232912, + "num_input_tokens_seen": 121385225, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.80859375, + "step": 5654, + "time_per_iteration": 2.4139702320098877 + }, + { + "auxiliary_loss_clip": 0.01122836, + "auxiliary_loss_mlp": 0.01042828, + "balance_loss_clip": 1.02853489, + "balance_loss_mlp": 1.04074049, + "epoch": 0.3399969938373666, + "flos": 27012832272000.0, + "grad_norm": 5.132089356193866, + "language_loss": 0.65128249, + "learning_rate": 3.073809861919351e-06, + "loss": 0.67293918, + "num_input_tokens_seen": 121404735, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5655, + "time_per_iteration": 2.475292444229126 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01041891, + "balance_loss_clip": 1.02781832, + "balance_loss_mlp": 1.04365194, + "epoch": 0.34005711709003456, + "flos": 28550096588160.0, + "grad_norm": 1.4436453355930483, + "language_loss": 0.76766688, + "learning_rate": 3.073481275036697e-06, + "loss": 0.78933358, + "num_input_tokens_seen": 121426780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5656, + "time_per_iteration": 2.550999879837036 + }, + { + "auxiliary_loss_clip": 0.01130894, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02413023, + "balance_loss_mlp": 1.04413342, + "epoch": 0.3401172403427025, + "flos": 21617003208960.0, + "grad_norm": 1.5863892165941962, + "language_loss": 0.82438695, + "learning_rate": 3.073152647447525e-06, + "loss": 0.84608912, + "num_input_tokens_seen": 121447245, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8671875, + "step": 5657, + "time_per_iteration": 2.4573473930358887 + }, + { + "auxiliary_loss_clip": 0.01122831, + "auxiliary_loss_mlp": 0.01040787, + "balance_loss_clip": 1.02675629, + "balance_loss_mlp": 1.04342616, + "epoch": 0.3401773635953705, + "flos": 25885776251520.0, + "grad_norm": 1.6511746791476316, + "language_loss": 0.85153604, + "learning_rate": 3.0728239791642976e-06, + "loss": 0.87317222, + "num_input_tokens_seen": 121468165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 5658, + "time_per_iteration": 2.505319833755493 + }, + { + "auxiliary_loss_clip": 0.01042351, + "auxiliary_loss_mlp": 0.01002353, + "balance_loss_clip": 1.0001955, + "balance_loss_mlp": 1.01611352, + "epoch": 0.3402374868480385, + "flos": 65507995336320.0, + "grad_norm": 0.8147477326465351, + "language_loss": 0.60012162, + "learning_rate": 3.072495270199477e-06, + "loss": 0.62056863, + "num_input_tokens_seen": 121523795, + "router_z_loss_clip": 0.02160645, + "router_z_loss_mlp": 0.26171875, + "step": 5659, + "time_per_iteration": 3.024125814437866 + }, + { + "auxiliary_loss_clip": 0.01122626, + "auxiliary_loss_mlp": 0.01035679, + "balance_loss_clip": 1.02190423, + "balance_loss_mlp": 1.04398155, + "epoch": 0.34029761010070647, + "flos": 24060580513920.0, + "grad_norm": 1.936270792227836, + "language_loss": 0.67855251, + "learning_rate": 3.0721665205655284e-06, + "loss": 0.70013559, + "num_input_tokens_seen": 121542950, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 5660, + "time_per_iteration": 2.5009706020355225 + }, + { + "auxiliary_loss_clip": 0.01125634, + "auxiliary_loss_mlp": 0.01045639, + "balance_loss_clip": 1.0307138, + "balance_loss_mlp": 1.04558277, + "epoch": 0.34035773335337444, + "flos": 27599720590080.0, + "grad_norm": 1.6106101267942714, + "language_loss": 0.67213613, + "learning_rate": 3.071837730274918e-06, + "loss": 0.69384885, + "num_input_tokens_seen": 121562765, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80078125, + "step": 5661, + "time_per_iteration": 2.501034736633301 + }, + { + "auxiliary_loss_clip": 0.01123137, + "auxiliary_loss_mlp": 0.01037886, + "balance_loss_clip": 1.0241766, + "balance_loss_mlp": 1.04442382, + "epoch": 0.3404178566060424, + "flos": 20812783651200.0, + "grad_norm": 1.9145784194305409, + "language_loss": 0.78845918, + "learning_rate": 3.071508899340113e-06, + "loss": 0.81006938, + "num_input_tokens_seen": 121581610, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 5662, + "time_per_iteration": 2.4689018726348877 + }, + { + "auxiliary_loss_clip": 0.01123734, + "auxiliary_loss_mlp": 0.01039121, + "balance_loss_clip": 1.02395773, + "balance_loss_mlp": 1.04277706, + "epoch": 0.34047797985871037, + "flos": 26833566470400.0, + "grad_norm": 1.9415115692891318, + "language_loss": 0.73675144, + "learning_rate": 3.0711800277735833e-06, + "loss": 0.75838, + "num_input_tokens_seen": 121601885, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5663, + "time_per_iteration": 2.4802587032318115 + }, + { + "auxiliary_loss_clip": 0.01121343, + "auxiliary_loss_mlp": 0.01034164, + "balance_loss_clip": 1.02101541, + "balance_loss_mlp": 1.04342198, + "epoch": 0.34053810311137833, + "flos": 19682639061120.0, + "grad_norm": 2.0753473798431608, + "language_loss": 0.85900557, + "learning_rate": 3.0708511155877997e-06, + "loss": 0.88056058, + "num_input_tokens_seen": 121621335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.77734375, + "step": 5664, + "time_per_iteration": 2.46343731880188 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01033802, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.0459125, + "epoch": 0.3405982263640463, + "flos": 21725740656000.0, + "grad_norm": 1.782528704092853, + "language_loss": 0.69047546, + "learning_rate": 3.070522162795235e-06, + "loss": 0.71208799, + "num_input_tokens_seen": 121641310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.81640625, + "step": 5665, + "time_per_iteration": 2.4448721408843994 + }, + { + "auxiliary_loss_clip": 0.01123992, + "auxiliary_loss_mlp": 0.01035732, + "balance_loss_clip": 1.02006817, + "balance_loss_mlp": 1.04218054, + "epoch": 0.34065834961671426, + "flos": 18041629288320.0, + "grad_norm": 2.296518315240935, + "language_loss": 0.72806692, + "learning_rate": 3.0701931694083626e-06, + "loss": 0.74966413, + "num_input_tokens_seen": 121659625, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8203125, + "step": 5666, + "time_per_iteration": 2.4749717712402344 + }, + { + "auxiliary_loss_clip": 0.01126484, + "auxiliary_loss_mlp": 0.0103642, + "balance_loss_clip": 1.02236485, + "balance_loss_mlp": 1.04428983, + "epoch": 0.3407184728693822, + "flos": 21397337585280.0, + "grad_norm": 1.5083890198292058, + "language_loss": 0.73306108, + "learning_rate": 3.0698641354396576e-06, + "loss": 0.75469005, + "num_input_tokens_seen": 121679205, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8203125, + "step": 5667, + "time_per_iteration": 2.467684030532837 + }, + { + "auxiliary_loss_clip": 0.0104148, + "auxiliary_loss_mlp": 0.01001962, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01518095, + "epoch": 0.3407785961220502, + "flos": 68688101018880.0, + "grad_norm": 0.8424548288565059, + "language_loss": 0.6331358, + "learning_rate": 3.069535060901597e-06, + "loss": 0.65357018, + "num_input_tokens_seen": 121751085, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.26367188, + "step": 5668, + "time_per_iteration": 3.233991861343384 + }, + { + "auxiliary_loss_clip": 0.01124776, + "auxiliary_loss_mlp": 0.01039147, + "balance_loss_clip": 1.02460372, + "balance_loss_mlp": 1.04407477, + "epoch": 0.34083871937471816, + "flos": 14064379027200.0, + "grad_norm": 2.1457172939364892, + "language_loss": 0.72030753, + "learning_rate": 3.0692059458066596e-06, + "loss": 0.74194676, + "num_input_tokens_seen": 121768565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 5669, + "time_per_iteration": 2.4226186275482178 + }, + { + "auxiliary_loss_clip": 0.01127607, + "auxiliary_loss_mlp": 0.01035265, + "balance_loss_clip": 1.02078128, + "balance_loss_mlp": 1.04468203, + "epoch": 0.3408988426273861, + "flos": 17085435287040.0, + "grad_norm": 1.9050671295461388, + "language_loss": 0.80285168, + "learning_rate": 3.0688767901673265e-06, + "loss": 0.82448041, + "num_input_tokens_seen": 121784925, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.828125, + "step": 5670, + "time_per_iteration": 2.4354984760284424 + }, + { + "auxiliary_loss_clip": 0.01127772, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02122176, + "balance_loss_mlp": 1.04374027, + "epoch": 0.3409589658800541, + "flos": 24024562151040.0, + "grad_norm": 1.5994061750955757, + "language_loss": 0.76886785, + "learning_rate": 3.068547593996078e-06, + "loss": 0.79050225, + "num_input_tokens_seen": 121804425, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.83984375, + "step": 5671, + "time_per_iteration": 2.4775397777557373 + }, + { + "auxiliary_loss_clip": 0.01125342, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.02513266, + "balance_loss_mlp": 1.04437792, + "epoch": 0.34101908913272205, + "flos": 21142012734720.0, + "grad_norm": 1.9602332848552635, + "language_loss": 0.74416959, + "learning_rate": 3.0682183573053974e-06, + "loss": 0.7658239, + "num_input_tokens_seen": 121825145, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5672, + "time_per_iteration": 2.5027272701263428 + }, + { + "auxiliary_loss_clip": 0.01127201, + "auxiliary_loss_mlp": 0.01032286, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.04523087, + "epoch": 0.3410792123853901, + "flos": 15702012921600.0, + "grad_norm": 1.991076139860355, + "language_loss": 0.73781157, + "learning_rate": 3.06788908010777e-06, + "loss": 0.75940639, + "num_input_tokens_seen": 121842185, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.8203125, + "step": 5673, + "time_per_iteration": 2.424955368041992 + }, + { + "auxiliary_loss_clip": 0.01123926, + "auxiliary_loss_mlp": 0.01036446, + "balance_loss_clip": 1.02243853, + "balance_loss_mlp": 1.04432535, + "epoch": 0.34113933563805804, + "flos": 23036012974080.0, + "grad_norm": 1.774655206888726, + "language_loss": 0.79900169, + "learning_rate": 3.067559762415682e-06, + "loss": 0.8206054, + "num_input_tokens_seen": 121862260, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5674, + "time_per_iteration": 2.490407705307007 + }, + { + "auxiliary_loss_clip": 0.01041345, + "auxiliary_loss_mlp": 0.01001058, + "balance_loss_clip": 0.99942493, + "balance_loss_mlp": 1.01517344, + "epoch": 0.341199458890726, + "flos": 69614235336960.0, + "grad_norm": 0.7963469989165133, + "language_loss": 0.56096685, + "learning_rate": 3.0672304042416198e-06, + "loss": 0.58139086, + "num_input_tokens_seen": 121923560, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 5675, + "time_per_iteration": 3.223119020462036 + }, + { + "auxiliary_loss_clip": 0.01123194, + "auxiliary_loss_mlp": 0.01041089, + "balance_loss_clip": 1.0270282, + "balance_loss_mlp": 1.04428756, + "epoch": 0.34125958214339397, + "flos": 22346348866560.0, + "grad_norm": 1.6179892480447855, + "language_loss": 0.79029286, + "learning_rate": 3.0669010055980734e-06, + "loss": 0.81193566, + "num_input_tokens_seen": 121943515, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5676, + "time_per_iteration": 2.4798848628997803 + }, + { + "auxiliary_loss_clip": 0.01123343, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.01836538, + "balance_loss_mlp": 1.0424788, + "epoch": 0.34131970539606193, + "flos": 21871933009920.0, + "grad_norm": 1.8072554320592242, + "language_loss": 0.85598934, + "learning_rate": 3.0665715664975357e-06, + "loss": 0.87755597, + "num_input_tokens_seen": 121962540, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.80859375, + "step": 5677, + "time_per_iteration": 2.4501733779907227 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01041495, + "balance_loss_clip": 1.0266118, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3413798286487299, + "flos": 24935723475840.0, + "grad_norm": 2.009404852791833, + "language_loss": 0.79283166, + "learning_rate": 3.0662420869524966e-06, + "loss": 0.81447315, + "num_input_tokens_seen": 121979830, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5678, + "time_per_iteration": 4.054651260375977 + }, + { + "auxiliary_loss_clip": 0.01123013, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.01983547, + "balance_loss_mlp": 1.04135132, + "epoch": 0.34143995190139786, + "flos": 25374372364800.0, + "grad_norm": 1.8818653655236122, + "language_loss": 0.74546856, + "learning_rate": 3.0659125669754506e-06, + "loss": 0.76703185, + "num_input_tokens_seen": 121999055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.81640625, + "step": 5679, + "time_per_iteration": 3.9024462699890137 + }, + { + "auxiliary_loss_clip": 0.01042201, + "auxiliary_loss_mlp": 0.01001255, + "balance_loss_clip": 0.99970549, + "balance_loss_mlp": 1.01624846, + "epoch": 0.34150007515406583, + "flos": 67782578129280.0, + "grad_norm": 0.7519133883291979, + "language_loss": 0.59481025, + "learning_rate": 3.0655830065788923e-06, + "loss": 0.61524487, + "num_input_tokens_seen": 122067015, + "router_z_loss_clip": 0.01544189, + "router_z_loss_mlp": 0.25976562, + "step": 5680, + "time_per_iteration": 3.152480125427246 + }, + { + "auxiliary_loss_clip": 0.01121207, + "auxiliary_loss_mlp": 0.01033444, + "balance_loss_clip": 1.01953864, + "balance_loss_mlp": 1.04320455, + "epoch": 0.3415601984067338, + "flos": 20302421258880.0, + "grad_norm": 2.208026502208574, + "language_loss": 0.7233687, + "learning_rate": 3.0652534057753206e-06, + "loss": 0.74491525, + "num_input_tokens_seen": 122085295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5681, + "time_per_iteration": 2.4450337886810303 + }, + { + "auxiliary_loss_clip": 0.01118824, + "auxiliary_loss_mlp": 0.01041972, + "balance_loss_clip": 1.02798879, + "balance_loss_mlp": 1.04110432, + "epoch": 0.34162032165940176, + "flos": 26031178506240.0, + "grad_norm": 2.0075854608407058, + "language_loss": 0.7144351, + "learning_rate": 3.064923764577233e-06, + "loss": 0.7360431, + "num_input_tokens_seen": 122104020, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5682, + "time_per_iteration": 2.53000807762146 + }, + { + "auxiliary_loss_clip": 0.01120348, + "auxiliary_loss_mlp": 0.01039153, + "balance_loss_clip": 1.02446055, + "balance_loss_mlp": 1.04079127, + "epoch": 0.3416804449120697, + "flos": 28803338449920.0, + "grad_norm": 1.4570201559150766, + "language_loss": 0.8396616, + "learning_rate": 3.0645940829971295e-06, + "loss": 0.86125666, + "num_input_tokens_seen": 122125080, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5683, + "time_per_iteration": 2.511646270751953 + }, + { + "auxiliary_loss_clip": 0.01126192, + "auxiliary_loss_mlp": 0.01047188, + "balance_loss_clip": 1.03189898, + "balance_loss_mlp": 1.04384482, + "epoch": 0.3417405681647377, + "flos": 22601601889920.0, + "grad_norm": 2.5567263249521965, + "language_loss": 0.70622635, + "learning_rate": 3.0642643610475116e-06, + "loss": 0.72796011, + "num_input_tokens_seen": 122146350, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.82421875, + "step": 5684, + "time_per_iteration": 2.58811616897583 + }, + { + "auxiliary_loss_clip": 0.01120756, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02119195, + "balance_loss_mlp": 1.0428822, + "epoch": 0.34180069141740566, + "flos": 24716237420160.0, + "grad_norm": 1.480860615854928, + "language_loss": 0.75386423, + "learning_rate": 3.0639345987408823e-06, + "loss": 0.77541268, + "num_input_tokens_seen": 122168085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.78125, + "step": 5685, + "time_per_iteration": 2.485405445098877 + }, + { + "auxiliary_loss_clip": 0.01120925, + "auxiliary_loss_mlp": 0.0103682, + "balance_loss_clip": 1.02399325, + "balance_loss_mlp": 1.04268134, + "epoch": 0.3418608146700737, + "flos": 30518755246080.0, + "grad_norm": 1.6707381387615057, + "language_loss": 0.70186603, + "learning_rate": 3.0636047960897468e-06, + "loss": 0.72344351, + "num_input_tokens_seen": 122191040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.78125, + "step": 5686, + "time_per_iteration": 2.5536224842071533 + }, + { + "auxiliary_loss_clip": 0.01121848, + "auxiliary_loss_mlp": 0.01042102, + "balance_loss_clip": 1.02681327, + "balance_loss_mlp": 1.04087019, + "epoch": 0.34192093792274164, + "flos": 15122343237120.0, + "grad_norm": 2.6880234800017844, + "language_loss": 0.77629769, + "learning_rate": 3.06327495310661e-06, + "loss": 0.79793721, + "num_input_tokens_seen": 122209225, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5687, + "time_per_iteration": 2.4526383876800537 + }, + { + "auxiliary_loss_clip": 0.01122013, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.01947296, + "balance_loss_mlp": 1.04425466, + "epoch": 0.3419810611754096, + "flos": 13187799521280.0, + "grad_norm": 1.7522626505921908, + "language_loss": 0.86505169, + "learning_rate": 3.062945069803981e-06, + "loss": 0.88661563, + "num_input_tokens_seen": 122226160, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 5688, + "time_per_iteration": 2.457873821258545 + }, + { + "auxiliary_loss_clip": 0.01129554, + "auxiliary_loss_mlp": 0.01038372, + "balance_loss_clip": 1.02274323, + "balance_loss_mlp": 1.04438853, + "epoch": 0.34204118442807757, + "flos": 19536267139200.0, + "grad_norm": 1.6277101200549902, + "language_loss": 0.79875666, + "learning_rate": 3.0626151461943684e-06, + "loss": 0.82043588, + "num_input_tokens_seen": 122243115, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5689, + "time_per_iteration": 2.4494895935058594 + }, + { + "auxiliary_loss_clip": 0.01124588, + "auxiliary_loss_mlp": 0.01038419, + "balance_loss_clip": 1.02351832, + "balance_loss_mlp": 1.04300821, + "epoch": 0.34210130768074554, + "flos": 15194846839680.0, + "grad_norm": 2.0745412821804057, + "language_loss": 0.7351048, + "learning_rate": 3.0622851822902834e-06, + "loss": 0.75673485, + "num_input_tokens_seen": 122261105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.81640625, + "step": 5690, + "time_per_iteration": 2.448133945465088 + }, + { + "auxiliary_loss_clip": 0.01120421, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02270865, + "balance_loss_mlp": 1.03998768, + "epoch": 0.3421614309334135, + "flos": 24936226266240.0, + "grad_norm": 2.433761635396741, + "language_loss": 0.7631194, + "learning_rate": 3.061955178104237e-06, + "loss": 0.78468573, + "num_input_tokens_seen": 122279995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.8046875, + "step": 5691, + "time_per_iteration": 2.479569435119629 + }, + { + "auxiliary_loss_clip": 0.01120907, + "auxiliary_loss_mlp": 0.01041441, + "balance_loss_clip": 1.02782106, + "balance_loss_mlp": 1.0415988, + "epoch": 0.34222155418608147, + "flos": 21908633731200.0, + "grad_norm": 1.5387604656502187, + "language_loss": 0.68159282, + "learning_rate": 3.0616251336487447e-06, + "loss": 0.70321631, + "num_input_tokens_seen": 122299070, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 5692, + "time_per_iteration": 2.490466356277466 + }, + { + "auxiliary_loss_clip": 0.01124667, + "auxiliary_loss_mlp": 0.01042741, + "balance_loss_clip": 1.02682638, + "balance_loss_mlp": 1.04275179, + "epoch": 0.34228167743874943, + "flos": 18114061063680.0, + "grad_norm": 2.6924087388900606, + "language_loss": 0.72292894, + "learning_rate": 3.06129504893632e-06, + "loss": 0.74460298, + "num_input_tokens_seen": 122316800, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8203125, + "step": 5693, + "time_per_iteration": 2.451026439666748 + }, + { + "auxiliary_loss_clip": 0.01122133, + "auxiliary_loss_mlp": 0.01037278, + "balance_loss_clip": 1.02408743, + "balance_loss_mlp": 1.0417974, + "epoch": 0.3423418006914174, + "flos": 21288600138240.0, + "grad_norm": 1.7157866574439644, + "language_loss": 0.75877678, + "learning_rate": 3.0609649239794813e-06, + "loss": 0.78037089, + "num_input_tokens_seen": 122335275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.8046875, + "step": 5694, + "time_per_iteration": 2.499997615814209 + }, + { + "auxiliary_loss_clip": 0.01119791, + "auxiliary_loss_mlp": 0.01036927, + "balance_loss_clip": 1.02320051, + "balance_loss_mlp": 1.04253125, + "epoch": 0.34240192394408536, + "flos": 19823480288640.0, + "grad_norm": 1.9697512050835562, + "language_loss": 0.79815507, + "learning_rate": 3.060634758790747e-06, + "loss": 0.81972229, + "num_input_tokens_seen": 122353215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 5695, + "time_per_iteration": 2.4279983043670654 + }, + { + "auxiliary_loss_clip": 0.01122261, + "auxiliary_loss_mlp": 0.01039624, + "balance_loss_clip": 1.0248661, + "balance_loss_mlp": 1.04168487, + "epoch": 0.3424620471967533, + "flos": 24535535074560.0, + "grad_norm": 1.7314755849975545, + "language_loss": 0.73487073, + "learning_rate": 3.060304553382635e-06, + "loss": 0.75648957, + "num_input_tokens_seen": 122372495, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5696, + "time_per_iteration": 2.507782459259033 + }, + { + "auxiliary_loss_clip": 0.01122963, + "auxiliary_loss_mlp": 0.01047657, + "balance_loss_clip": 1.03301835, + "balance_loss_mlp": 1.0419805, + "epoch": 0.3425221704494213, + "flos": 25848895962240.0, + "grad_norm": 1.6676891559017708, + "language_loss": 0.70874155, + "learning_rate": 3.0599743077676685e-06, + "loss": 0.73044771, + "num_input_tokens_seen": 122394600, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5697, + "time_per_iteration": 2.4868175983428955 + }, + { + "auxiliary_loss_clip": 0.01122392, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01949954, + "balance_loss_mlp": 1.04456246, + "epoch": 0.34258229370208926, + "flos": 21540513196800.0, + "grad_norm": 1.6712097888676536, + "language_loss": 0.81875223, + "learning_rate": 3.05964402195837e-06, + "loss": 0.84031999, + "num_input_tokens_seen": 122414700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 5698, + "time_per_iteration": 2.500499725341797 + }, + { + "auxiliary_loss_clip": 0.01121288, + "auxiliary_loss_mlp": 0.01043706, + "balance_loss_clip": 1.02712393, + "balance_loss_mlp": 1.03982306, + "epoch": 0.3426424169547573, + "flos": 23652778429440.0, + "grad_norm": 1.9988541020523172, + "language_loss": 0.69163442, + "learning_rate": 3.0593136959672645e-06, + "loss": 0.71328437, + "num_input_tokens_seen": 122432760, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8125, + "step": 5699, + "time_per_iteration": 2.4522063732147217 + }, + { + "auxiliary_loss_clip": 0.01123011, + "auxiliary_loss_mlp": 0.01035122, + "balance_loss_clip": 1.0211153, + "balance_loss_mlp": 1.0424068, + "epoch": 0.34270254020742524, + "flos": 24644883052800.0, + "grad_norm": 2.0139701241951196, + "language_loss": 0.72246462, + "learning_rate": 3.058983329806877e-06, + "loss": 0.74404591, + "num_input_tokens_seen": 122449105, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5700, + "time_per_iteration": 2.4942879676818848 + }, + { + "auxiliary_loss_clip": 0.01123902, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02018046, + "balance_loss_mlp": 1.04403377, + "epoch": 0.3427626634600932, + "flos": 20996754134400.0, + "grad_norm": 2.026861038115517, + "language_loss": 0.81818259, + "learning_rate": 3.0586529234897354e-06, + "loss": 0.83976114, + "num_input_tokens_seen": 122468700, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.796875, + "step": 5701, + "time_per_iteration": 2.4650135040283203 + }, + { + "auxiliary_loss_clip": 0.01124816, + "auxiliary_loss_mlp": 0.01032731, + "balance_loss_clip": 1.01886129, + "balance_loss_mlp": 1.04328442, + "epoch": 0.3428227867127612, + "flos": 21433786911360.0, + "grad_norm": 1.616013756330385, + "language_loss": 0.71818215, + "learning_rate": 3.0583224770283694e-06, + "loss": 0.73975766, + "num_input_tokens_seen": 122488160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5702, + "time_per_iteration": 2.446018695831299 + }, + { + "auxiliary_loss_clip": 0.01038258, + "auxiliary_loss_mlp": 0.01007974, + "balance_loss_clip": 1.00623345, + "balance_loss_mlp": 1.01261425, + "epoch": 0.34288290996542914, + "flos": 55731782695680.0, + "grad_norm": 0.78067456401119, + "language_loss": 0.57387871, + "learning_rate": 3.057991990435309e-06, + "loss": 0.5943411, + "num_input_tokens_seen": 122542890, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.2578125, + "step": 5703, + "time_per_iteration": 2.9596943855285645 + }, + { + "auxiliary_loss_clip": 0.01125647, + "auxiliary_loss_mlp": 0.01041995, + "balance_loss_clip": 1.02599692, + "balance_loss_mlp": 1.04436553, + "epoch": 0.3429430332180971, + "flos": 20156803522560.0, + "grad_norm": 1.8868866692845514, + "language_loss": 0.74849427, + "learning_rate": 3.057661463723086e-06, + "loss": 0.77017069, + "num_input_tokens_seen": 122561770, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5704, + "time_per_iteration": 2.475206136703491 + }, + { + "auxiliary_loss_clip": 0.01122188, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.0218513, + "balance_loss_mlp": 1.0432725, + "epoch": 0.34300315647076507, + "flos": 17965857548160.0, + "grad_norm": 2.4058395538044572, + "language_loss": 0.73303944, + "learning_rate": 3.0573308969042346e-06, + "loss": 0.75461364, + "num_input_tokens_seen": 122580580, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5705, + "time_per_iteration": 2.435140609741211 + }, + { + "auxiliary_loss_clip": 0.0112299, + "auxiliary_loss_mlp": 0.01035313, + "balance_loss_clip": 1.0204711, + "balance_loss_mlp": 1.04320812, + "epoch": 0.34306327972343303, + "flos": 22086822124800.0, + "grad_norm": 3.54760070735666, + "language_loss": 0.79599071, + "learning_rate": 3.057000289991289e-06, + "loss": 0.81757367, + "num_input_tokens_seen": 122599810, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5706, + "time_per_iteration": 2.4922068119049072 + }, + { + "auxiliary_loss_clip": 0.01127669, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.04497337, + "epoch": 0.343123402976101, + "flos": 18442679616000.0, + "grad_norm": 1.9921713202453553, + "language_loss": 0.83170593, + "learning_rate": 3.056669642996787e-06, + "loss": 0.85330999, + "num_input_tokens_seen": 122616035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.828125, + "step": 5707, + "time_per_iteration": 2.441812753677368 + }, + { + "auxiliary_loss_clip": 0.01126551, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.01919019, + "balance_loss_mlp": 1.04623604, + "epoch": 0.34318352622876896, + "flos": 17163685065600.0, + "grad_norm": 1.5424527465289883, + "language_loss": 0.75429368, + "learning_rate": 3.056338955933266e-06, + "loss": 0.77589571, + "num_input_tokens_seen": 122633785, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5708, + "time_per_iteration": 2.448415756225586 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02046943, + "balance_loss_mlp": 1.04284358, + "epoch": 0.34324364948143693, + "flos": 26688164215680.0, + "grad_norm": 1.6552343197625845, + "language_loss": 0.81159383, + "learning_rate": 3.0560082288132662e-06, + "loss": 0.83314145, + "num_input_tokens_seen": 122652100, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 5709, + "time_per_iteration": 2.488879919052124 + }, + { + "auxiliary_loss_clip": 0.01125291, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.0235213, + "balance_loss_mlp": 1.04413152, + "epoch": 0.3433037727341049, + "flos": 21251576194560.0, + "grad_norm": 2.1306910299424677, + "language_loss": 0.79152101, + "learning_rate": 3.055677461649329e-06, + "loss": 0.81316978, + "num_input_tokens_seen": 122669720, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8125, + "step": 5710, + "time_per_iteration": 2.487224817276001 + }, + { + "auxiliary_loss_clip": 0.01124884, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.0209142, + "balance_loss_mlp": 1.04181814, + "epoch": 0.34336389598677286, + "flos": 20629423699200.0, + "grad_norm": 1.821164645381994, + "language_loss": 0.69994622, + "learning_rate": 3.055346654453996e-06, + "loss": 0.72155762, + "num_input_tokens_seen": 122688715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.828125, + "step": 5711, + "time_per_iteration": 2.471989631652832 + }, + { + "auxiliary_loss_clip": 0.01123068, + "auxiliary_loss_mlp": 0.01039448, + "balance_loss_clip": 1.02455926, + "balance_loss_mlp": 1.04235482, + "epoch": 0.3434240192394409, + "flos": 14538579402240.0, + "grad_norm": 1.7360043656013842, + "language_loss": 0.68002397, + "learning_rate": 3.055015807239812e-06, + "loss": 0.70164913, + "num_input_tokens_seen": 122706970, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 5712, + "time_per_iteration": 2.440960168838501 + }, + { + "auxiliary_loss_clip": 0.01036814, + "auxiliary_loss_mlp": 0.01007067, + "balance_loss_clip": 1.00550556, + "balance_loss_mlp": 1.011006, + "epoch": 0.34348414249210885, + "flos": 58051538841600.0, + "grad_norm": 0.8415582534154722, + "language_loss": 0.58101094, + "learning_rate": 3.0546849200193226e-06, + "loss": 0.60144973, + "num_input_tokens_seen": 122758095, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.2578125, + "step": 5713, + "time_per_iteration": 3.018573045730591 + }, + { + "auxiliary_loss_clip": 0.01122962, + "auxiliary_loss_mlp": 0.01042443, + "balance_loss_clip": 1.02773833, + "balance_loss_mlp": 1.04283524, + "epoch": 0.3435442657447768, + "flos": 20704441253760.0, + "grad_norm": 1.6636797952259372, + "language_loss": 0.80745685, + "learning_rate": 3.054353992805076e-06, + "loss": 0.82911092, + "num_input_tokens_seen": 122777815, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5714, + "time_per_iteration": 2.4916322231292725 + }, + { + "auxiliary_loss_clip": 0.01126185, + "auxiliary_loss_mlp": 0.01040552, + "balance_loss_clip": 1.02519822, + "balance_loss_mlp": 1.04508591, + "epoch": 0.3436043889974448, + "flos": 22930256355840.0, + "grad_norm": 1.759201097406795, + "language_loss": 0.71844554, + "learning_rate": 3.05402302560962e-06, + "loss": 0.7401129, + "num_input_tokens_seen": 122797555, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5715, + "time_per_iteration": 2.468292474746704 + }, + { + "auxiliary_loss_clip": 0.01036063, + "auxiliary_loss_mlp": 0.01006756, + "balance_loss_clip": 1.00499201, + "balance_loss_mlp": 1.01020741, + "epoch": 0.34366451225011274, + "flos": 58403285752320.0, + "grad_norm": 0.8941035310387452, + "language_loss": 0.65942305, + "learning_rate": 3.053692018445505e-06, + "loss": 0.67985129, + "num_input_tokens_seen": 122863955, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 5716, + "time_per_iteration": 3.101933717727661 + }, + { + "auxiliary_loss_clip": 0.0112152, + "auxiliary_loss_mlp": 0.0104123, + "balance_loss_clip": 1.02705014, + "balance_loss_mlp": 1.04254961, + "epoch": 0.3437246355027807, + "flos": 15596292216960.0, + "grad_norm": 2.0405702698755657, + "language_loss": 0.74612904, + "learning_rate": 3.0533609713252838e-06, + "loss": 0.76775646, + "num_input_tokens_seen": 122883000, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5717, + "time_per_iteration": 2.426793098449707 + }, + { + "auxiliary_loss_clip": 0.01123044, + "auxiliary_loss_mlp": 0.01042851, + "balance_loss_clip": 1.02894473, + "balance_loss_mlp": 1.0413748, + "epoch": 0.34378475875544867, + "flos": 27672260106240.0, + "grad_norm": 1.6999619338826393, + "language_loss": 0.7507081, + "learning_rate": 3.0530298842615077e-06, + "loss": 0.77236706, + "num_input_tokens_seen": 122903265, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.81640625, + "step": 5718, + "time_per_iteration": 2.534557342529297 + }, + { + "auxiliary_loss_clip": 0.01125265, + "auxiliary_loss_mlp": 0.01040651, + "balance_loss_clip": 1.02563679, + "balance_loss_mlp": 1.04245746, + "epoch": 0.34384488200811664, + "flos": 31431496769280.0, + "grad_norm": 1.9991347741656986, + "language_loss": 0.63971305, + "learning_rate": 3.052698757266734e-06, + "loss": 0.66137218, + "num_input_tokens_seen": 122923860, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 5719, + "time_per_iteration": 2.5236892700195312 + }, + { + "auxiliary_loss_clip": 0.01124826, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.0418756, + "epoch": 0.3439050052607846, + "flos": 24899920594560.0, + "grad_norm": 2.111950804429908, + "language_loss": 0.73612356, + "learning_rate": 3.0523675903535183e-06, + "loss": 0.75775748, + "num_input_tokens_seen": 122945305, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.828125, + "step": 5720, + "time_per_iteration": 5.3536376953125 + }, + { + "auxiliary_loss_clip": 0.0112352, + "auxiliary_loss_mlp": 0.01040582, + "balance_loss_clip": 1.02520978, + "balance_loss_mlp": 1.04300022, + "epoch": 0.34396512851345257, + "flos": 18150079426560.0, + "grad_norm": 1.805745396214866, + "language_loss": 0.74198145, + "learning_rate": 3.0520363835344173e-06, + "loss": 0.76362252, + "num_input_tokens_seen": 122962535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5721, + "time_per_iteration": 2.4301607608795166 + }, + { + "auxiliary_loss_clip": 0.01126876, + "auxiliary_loss_mlp": 0.0104413, + "balance_loss_clip": 1.0286088, + "balance_loss_mlp": 1.04481733, + "epoch": 0.34402525176612053, + "flos": 16034438315520.0, + "grad_norm": 3.5063882769532313, + "language_loss": 0.80132651, + "learning_rate": 3.051705136821992e-06, + "loss": 0.82303661, + "num_input_tokens_seen": 122979750, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5722, + "time_per_iteration": 2.411731243133545 + }, + { + "auxiliary_loss_clip": 0.01122709, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01809728, + "balance_loss_mlp": 1.04312289, + "epoch": 0.3440853750187885, + "flos": 21178641628800.0, + "grad_norm": 1.5863267197766868, + "language_loss": 0.8194539, + "learning_rate": 3.051373850228801e-06, + "loss": 0.84100199, + "num_input_tokens_seen": 122998955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5723, + "time_per_iteration": 2.476672410964966 + }, + { + "auxiliary_loss_clip": 0.01124156, + "auxiliary_loss_mlp": 0.01039991, + "balance_loss_clip": 1.02559686, + "balance_loss_mlp": 1.0428493, + "epoch": 0.34414549827145646, + "flos": 12677868092160.0, + "grad_norm": 1.852885568649272, + "language_loss": 0.8147676, + "learning_rate": 3.0510425237674096e-06, + "loss": 0.83640903, + "num_input_tokens_seen": 123016165, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5724, + "time_per_iteration": 2.4115889072418213 + }, + { + "auxiliary_loss_clip": 0.01125316, + "auxiliary_loss_mlp": 0.01036091, + "balance_loss_clip": 1.0210526, + "balance_loss_mlp": 1.04397368, + "epoch": 0.3442056215241244, + "flos": 31284514316160.0, + "grad_norm": 1.759268883551978, + "language_loss": 0.6919744, + "learning_rate": 3.05071115745038e-06, + "loss": 0.71358848, + "num_input_tokens_seen": 123036900, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5725, + "time_per_iteration": 2.589571714401245 + }, + { + "auxiliary_loss_clip": 0.01130624, + "auxiliary_loss_mlp": 0.01042614, + "balance_loss_clip": 1.02578139, + "balance_loss_mlp": 1.04464412, + "epoch": 0.34426574477679245, + "flos": 23367289132800.0, + "grad_norm": 1.4578739764018875, + "language_loss": 0.69519544, + "learning_rate": 3.0503797512902773e-06, + "loss": 0.71692783, + "num_input_tokens_seen": 123057480, + "router_z_loss_clip": 0.16796875, + "router_z_loss_mlp": 0.859375, + "step": 5726, + "time_per_iteration": 2.4600956439971924 + }, + { + "auxiliary_loss_clip": 0.01123936, + "auxiliary_loss_mlp": 0.0103637, + "balance_loss_clip": 1.02222002, + "balance_loss_mlp": 1.0427928, + "epoch": 0.3443258680294604, + "flos": 24535427333760.0, + "grad_norm": 1.656148044371735, + "language_loss": 0.73426235, + "learning_rate": 3.0500483052996703e-06, + "loss": 0.7558654, + "num_input_tokens_seen": 123076890, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8125, + "step": 5727, + "time_per_iteration": 2.5102531909942627 + }, + { + "auxiliary_loss_clip": 0.01125161, + "auxiliary_loss_mlp": 0.01041626, + "balance_loss_clip": 1.02636731, + "balance_loss_mlp": 1.04398954, + "epoch": 0.3443859912821284, + "flos": 20230133137920.0, + "grad_norm": 1.8280399137078096, + "language_loss": 0.87897557, + "learning_rate": 3.0497168194911257e-06, + "loss": 0.90064341, + "num_input_tokens_seen": 123092530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5728, + "time_per_iteration": 2.4304542541503906 + }, + { + "auxiliary_loss_clip": 0.01122947, + "auxiliary_loss_mlp": 0.01045129, + "balance_loss_clip": 1.03106284, + "balance_loss_mlp": 1.04264569, + "epoch": 0.34444611453479634, + "flos": 24316515895680.0, + "grad_norm": 2.0505664478102426, + "language_loss": 0.70451075, + "learning_rate": 3.0493852938772143e-06, + "loss": 0.72619152, + "num_input_tokens_seen": 123110560, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8046875, + "step": 5729, + "time_per_iteration": 2.4979374408721924 + }, + { + "auxiliary_loss_clip": 0.01122265, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01952362, + "balance_loss_mlp": 1.0427525, + "epoch": 0.3445062377874643, + "flos": 16983413683200.0, + "grad_norm": 1.7284434335955414, + "language_loss": 0.73995942, + "learning_rate": 3.0490537284705078e-06, + "loss": 0.7615242, + "num_input_tokens_seen": 123128655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 5730, + "time_per_iteration": 2.4471776485443115 + }, + { + "auxiliary_loss_clip": 0.0112363, + "auxiliary_loss_mlp": 0.01041517, + "balance_loss_clip": 1.02693152, + "balance_loss_mlp": 1.04263377, + "epoch": 0.3445663610401323, + "flos": 20302708567680.0, + "grad_norm": 2.104777326243209, + "language_loss": 0.80005515, + "learning_rate": 3.048722123283578e-06, + "loss": 0.82170659, + "num_input_tokens_seen": 123145130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 5731, + "time_per_iteration": 2.454735279083252 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0104428, + "balance_loss_clip": 1.02953923, + "balance_loss_mlp": 1.04394484, + "epoch": 0.34462648429280024, + "flos": 15888102307200.0, + "grad_norm": 2.039149215632527, + "language_loss": 0.78837991, + "learning_rate": 3.0483904783290006e-06, + "loss": 0.81006193, + "num_input_tokens_seen": 123162265, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 5732, + "time_per_iteration": 2.4177064895629883 + }, + { + "auxiliary_loss_clip": 0.01043649, + "auxiliary_loss_mlp": 0.01003776, + "balance_loss_clip": 1.00184536, + "balance_loss_mlp": 1.01788378, + "epoch": 0.3446866075454682, + "flos": 59311035285120.0, + "grad_norm": 0.7440231134556253, + "language_loss": 0.53498071, + "learning_rate": 3.0480587936193505e-06, + "loss": 0.55545497, + "num_input_tokens_seen": 123218620, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.2578125, + "step": 5733, + "time_per_iteration": 3.0976667404174805 + }, + { + "auxiliary_loss_clip": 0.0112691, + "auxiliary_loss_mlp": 0.01042834, + "balance_loss_clip": 1.02806389, + "balance_loss_mlp": 1.04630947, + "epoch": 0.34474673079813617, + "flos": 22343799000960.0, + "grad_norm": 1.6025085195413686, + "language_loss": 0.83345532, + "learning_rate": 3.047727069167207e-06, + "loss": 0.85515279, + "num_input_tokens_seen": 123237325, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 5734, + "time_per_iteration": 2.462327718734741 + }, + { + "auxiliary_loss_clip": 0.01125766, + "auxiliary_loss_mlp": 0.0103401, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.04382658, + "epoch": 0.34480685405080413, + "flos": 27670141203840.0, + "grad_norm": 2.7233898634254525, + "language_loss": 0.9245038, + "learning_rate": 3.0473953049851478e-06, + "loss": 0.94610149, + "num_input_tokens_seen": 123258650, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5735, + "time_per_iteration": 2.600933790206909 + }, + { + "auxiliary_loss_clip": 0.01129266, + "auxiliary_loss_mlp": 0.01041814, + "balance_loss_clip": 1.02607846, + "balance_loss_mlp": 1.04662871, + "epoch": 0.3448669773034721, + "flos": 22456020067200.0, + "grad_norm": 1.628548106881684, + "language_loss": 0.76666284, + "learning_rate": 3.0470635010857533e-06, + "loss": 0.78837371, + "num_input_tokens_seen": 123277155, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5736, + "time_per_iteration": 2.4607973098754883 + }, + { + "auxiliary_loss_clip": 0.0113014, + "auxiliary_loss_mlp": 0.01043797, + "balance_loss_clip": 1.02948046, + "balance_loss_mlp": 1.04773998, + "epoch": 0.34492710055614006, + "flos": 24936190352640.0, + "grad_norm": 1.59823002014571, + "language_loss": 0.78745639, + "learning_rate": 3.0467316574816064e-06, + "loss": 0.80919576, + "num_input_tokens_seen": 123297640, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5737, + "time_per_iteration": 2.5059142112731934 + }, + { + "auxiliary_loss_clip": 0.0112976, + "auxiliary_loss_mlp": 0.01040269, + "balance_loss_clip": 1.02459311, + "balance_loss_mlp": 1.04445243, + "epoch": 0.34498722380880803, + "flos": 20120821073280.0, + "grad_norm": 2.0456946138928767, + "language_loss": 0.71714234, + "learning_rate": 3.0463997741852893e-06, + "loss": 0.73884267, + "num_input_tokens_seen": 123314370, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8515625, + "step": 5738, + "time_per_iteration": 2.4374310970306396 + }, + { + "auxiliary_loss_clip": 0.01129235, + "auxiliary_loss_mlp": 0.01042362, + "balance_loss_clip": 1.02727044, + "balance_loss_mlp": 1.04496205, + "epoch": 0.34504734706147605, + "flos": 28438126917120.0, + "grad_norm": 1.8999072115309161, + "language_loss": 0.81518626, + "learning_rate": 3.046067851209389e-06, + "loss": 0.83690214, + "num_input_tokens_seen": 123336085, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.84375, + "step": 5739, + "time_per_iteration": 2.559990406036377 + }, + { + "auxiliary_loss_clip": 0.0112747, + "auxiliary_loss_mlp": 0.01045734, + "balance_loss_clip": 1.03067827, + "balance_loss_mlp": 1.04620492, + "epoch": 0.345107470314144, + "flos": 22674464628480.0, + "grad_norm": 2.6856273454827275, + "language_loss": 0.8322401, + "learning_rate": 3.0457358885664898e-06, + "loss": 0.85397214, + "num_input_tokens_seen": 123354460, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5740, + "time_per_iteration": 2.4684722423553467 + }, + { + "auxiliary_loss_clip": 0.01127563, + "auxiliary_loss_mlp": 0.01038564, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04611385, + "epoch": 0.345167593566812, + "flos": 20630716588800.0, + "grad_norm": 2.03424253553345, + "language_loss": 0.77135098, + "learning_rate": 3.045403886269181e-06, + "loss": 0.7930122, + "num_input_tokens_seen": 123373420, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8125, + "step": 5741, + "time_per_iteration": 2.48624587059021 + }, + { + "auxiliary_loss_clip": 0.01125981, + "auxiliary_loss_mlp": 0.01036869, + "balance_loss_clip": 1.02226019, + "balance_loss_mlp": 1.04276562, + "epoch": 0.34522771681947995, + "flos": 26214358890240.0, + "grad_norm": 1.4993687582247586, + "language_loss": 0.77224493, + "learning_rate": 3.045071844330053e-06, + "loss": 0.79387349, + "num_input_tokens_seen": 123394730, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.83203125, + "step": 5742, + "time_per_iteration": 2.5046300888061523 + }, + { + "auxiliary_loss_clip": 0.01123657, + "auxiliary_loss_mlp": 0.01039728, + "balance_loss_clip": 1.02479076, + "balance_loss_mlp": 1.04310095, + "epoch": 0.3452878400721479, + "flos": 19062354072960.0, + "grad_norm": 1.823337430242114, + "language_loss": 0.76346177, + "learning_rate": 3.0447397627616955e-06, + "loss": 0.78509557, + "num_input_tokens_seen": 123412895, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5743, + "time_per_iteration": 2.4554226398468018 + }, + { + "auxiliary_loss_clip": 0.01124183, + "auxiliary_loss_mlp": 0.01036757, + "balance_loss_clip": 1.02278566, + "balance_loss_mlp": 1.04435802, + "epoch": 0.3453479633248159, + "flos": 27929739772800.0, + "grad_norm": 1.5691807126711539, + "language_loss": 0.70255435, + "learning_rate": 3.0444076415767016e-06, + "loss": 0.72416371, + "num_input_tokens_seen": 123432320, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 5744, + "time_per_iteration": 2.497314929962158 + }, + { + "auxiliary_loss_clip": 0.01121947, + "auxiliary_loss_mlp": 0.01036476, + "balance_loss_clip": 1.02205133, + "balance_loss_mlp": 1.04318309, + "epoch": 0.34540808657748384, + "flos": 19606113135360.0, + "grad_norm": 1.629619176768893, + "language_loss": 0.79692256, + "learning_rate": 3.044075480787665e-06, + "loss": 0.81850678, + "num_input_tokens_seen": 123450980, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 5745, + "time_per_iteration": 2.5154099464416504 + }, + { + "auxiliary_loss_clip": 0.01129348, + "auxiliary_loss_mlp": 0.01040489, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.04556072, + "epoch": 0.3454682098301518, + "flos": 20411661496320.0, + "grad_norm": 1.7858540966841563, + "language_loss": 0.88775939, + "learning_rate": 3.043743280407182e-06, + "loss": 0.9094578, + "num_input_tokens_seen": 123469365, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5746, + "time_per_iteration": 2.436028003692627 + }, + { + "auxiliary_loss_clip": 0.01129654, + "auxiliary_loss_mlp": 0.01039755, + "balance_loss_clip": 1.02438855, + "balance_loss_mlp": 1.04509354, + "epoch": 0.34552833308281977, + "flos": 21325121291520.0, + "grad_norm": 1.8755596522528313, + "language_loss": 0.64010286, + "learning_rate": 3.043411040447849e-06, + "loss": 0.66179693, + "num_input_tokens_seen": 123489425, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.84765625, + "step": 5747, + "time_per_iteration": 2.465817451477051 + }, + { + "auxiliary_loss_clip": 0.0112633, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02193761, + "balance_loss_mlp": 1.04486203, + "epoch": 0.34558845633548774, + "flos": 36243633824640.0, + "grad_norm": 1.5413680181151455, + "language_loss": 0.72813559, + "learning_rate": 3.043078760922264e-06, + "loss": 0.74975884, + "num_input_tokens_seen": 123509970, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.8125, + "step": 5748, + "time_per_iteration": 2.566849946975708 + }, + { + "auxiliary_loss_clip": 0.01123147, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.020715, + "balance_loss_mlp": 1.04517043, + "epoch": 0.3456485795881557, + "flos": 22450561200000.0, + "grad_norm": 1.6451707518978071, + "language_loss": 0.75697249, + "learning_rate": 3.042746441843029e-06, + "loss": 0.77854693, + "num_input_tokens_seen": 123531055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.77734375, + "step": 5749, + "time_per_iteration": 2.5068271160125732 + }, + { + "auxiliary_loss_clip": 0.01036655, + "auxiliary_loss_mlp": 0.01004838, + "balance_loss_clip": 1.00293088, + "balance_loss_mlp": 1.01066136, + "epoch": 0.34570870284082367, + "flos": 62004299005440.0, + "grad_norm": 0.8931526891439046, + "language_loss": 0.62754983, + "learning_rate": 3.0424140832227437e-06, + "loss": 0.64796478, + "num_input_tokens_seen": 123584720, + "router_z_loss_clip": 0.01904297, + "router_z_loss_mlp": 0.25976562, + "step": 5750, + "time_per_iteration": 2.930236577987671 + }, + { + "auxiliary_loss_clip": 0.01119501, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.01933062, + "balance_loss_mlp": 1.04268134, + "epoch": 0.34576882609349163, + "flos": 22782196494720.0, + "grad_norm": 2.1199041216122314, + "language_loss": 0.80762947, + "learning_rate": 3.042081685074012e-06, + "loss": 0.82915652, + "num_input_tokens_seen": 123604465, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5751, + "time_per_iteration": 2.4710936546325684 + }, + { + "auxiliary_loss_clip": 0.01121328, + "auxiliary_loss_mlp": 0.01046759, + "balance_loss_clip": 1.03268027, + "balance_loss_mlp": 1.04408574, + "epoch": 0.34582894934615965, + "flos": 12348818576640.0, + "grad_norm": 3.882107217624466, + "language_loss": 0.83630323, + "learning_rate": 3.041749247409439e-06, + "loss": 0.85798407, + "num_input_tokens_seen": 123622320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 5752, + "time_per_iteration": 2.421095132827759 + }, + { + "auxiliary_loss_clip": 0.01036836, + "auxiliary_loss_mlp": 0.01002038, + "balance_loss_clip": 1.00014234, + "balance_loss_mlp": 1.01131189, + "epoch": 0.3458890725988276, + "flos": 70167691071360.0, + "grad_norm": 0.7425573992046552, + "language_loss": 0.63106978, + "learning_rate": 3.0414167702416296e-06, + "loss": 0.6514585, + "num_input_tokens_seen": 123678010, + "router_z_loss_clip": 0.0189209, + "router_z_loss_mlp": 0.25585938, + "step": 5753, + "time_per_iteration": 2.960430383682251 + }, + { + "auxiliary_loss_clip": 0.01122475, + "auxiliary_loss_mlp": 0.01040243, + "balance_loss_clip": 1.0252701, + "balance_loss_mlp": 1.0433172, + "epoch": 0.3459491958514956, + "flos": 17092582093440.0, + "grad_norm": 1.7337780765213762, + "language_loss": 0.70964289, + "learning_rate": 3.0410842535831914e-06, + "loss": 0.73127007, + "num_input_tokens_seen": 123696830, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5754, + "time_per_iteration": 2.473090171813965 + }, + { + "auxiliary_loss_clip": 0.01126645, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.01889825, + "balance_loss_mlp": 1.04436386, + "epoch": 0.34600931910416355, + "flos": 16650952375680.0, + "grad_norm": 3.1958037374869357, + "language_loss": 0.72880316, + "learning_rate": 3.0407516974467343e-06, + "loss": 0.75040269, + "num_input_tokens_seen": 123714360, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 5755, + "time_per_iteration": 2.486187219619751 + }, + { + "auxiliary_loss_clip": 0.01122516, + "auxiliary_loss_mlp": 0.01034129, + "balance_loss_clip": 1.01985335, + "balance_loss_mlp": 1.04448533, + "epoch": 0.3460694423568315, + "flos": 38546190334080.0, + "grad_norm": 1.6620890991055186, + "language_loss": 0.72366977, + "learning_rate": 3.040419101844869e-06, + "loss": 0.74523616, + "num_input_tokens_seen": 123739250, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5756, + "time_per_iteration": 2.6883044242858887 + }, + { + "auxiliary_loss_clip": 0.01036738, + "auxiliary_loss_mlp": 0.01004698, + "balance_loss_clip": 1.00295758, + "balance_loss_mlp": 1.01152658, + "epoch": 0.3461295656094995, + "flos": 72081479704320.0, + "grad_norm": 0.7127234008063932, + "language_loss": 0.62522227, + "learning_rate": 3.040086466790207e-06, + "loss": 0.64563662, + "num_input_tokens_seen": 123802845, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25195312, + "step": 5757, + "time_per_iteration": 3.0644619464874268 + }, + { + "auxiliary_loss_clip": 0.01036676, + "auxiliary_loss_mlp": 0.01006374, + "balance_loss_clip": 1.00465703, + "balance_loss_mlp": 1.01123941, + "epoch": 0.34618968886216744, + "flos": 65460089571840.0, + "grad_norm": 0.8513650993905141, + "language_loss": 0.59153563, + "learning_rate": 3.039753792295362e-06, + "loss": 0.61196613, + "num_input_tokens_seen": 123861805, + "router_z_loss_clip": 0.01721191, + "router_z_loss_mlp": 0.25390625, + "step": 5758, + "time_per_iteration": 3.0601916313171387 + }, + { + "auxiliary_loss_clip": 0.01126165, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02576697, + "balance_loss_mlp": 1.04562724, + "epoch": 0.3462498121148354, + "flos": 23472542960640.0, + "grad_norm": 1.8469236817688628, + "language_loss": 0.71498728, + "learning_rate": 3.0394210783729487e-06, + "loss": 0.73664641, + "num_input_tokens_seen": 123881820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5759, + "time_per_iteration": 2.4722588062286377 + }, + { + "auxiliary_loss_clip": 0.0112123, + "auxiliary_loss_mlp": 0.01046171, + "balance_loss_clip": 1.03079295, + "balance_loss_mlp": 1.04248834, + "epoch": 0.3463099353675034, + "flos": 24170790418560.0, + "grad_norm": 1.8727439754442439, + "language_loss": 0.83008277, + "learning_rate": 3.0390883250355836e-06, + "loss": 0.85175675, + "num_input_tokens_seen": 123903700, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 5760, + "time_per_iteration": 2.5002012252807617 + }, + { + "auxiliary_loss_clip": 0.01035648, + "auxiliary_loss_mlp": 0.01005512, + "balance_loss_clip": 1.00358045, + "balance_loss_mlp": 1.01033783, + "epoch": 0.34637005862017134, + "flos": 63700609766400.0, + "grad_norm": 0.8745886359800412, + "language_loss": 0.5653646, + "learning_rate": 3.0387555322958865e-06, + "loss": 0.58577621, + "num_input_tokens_seen": 123960075, + "router_z_loss_clip": 0.01928711, + "router_z_loss_mlp": 0.25390625, + "step": 5761, + "time_per_iteration": 3.0950896739959717 + }, + { + "auxiliary_loss_clip": 0.01120096, + "auxiliary_loss_mlp": 0.01038691, + "balance_loss_clip": 1.02470756, + "balance_loss_mlp": 1.04127657, + "epoch": 0.3464301818728393, + "flos": 13145532192000.0, + "grad_norm": 2.0018538772922883, + "language_loss": 0.95053494, + "learning_rate": 3.038422700166474e-06, + "loss": 0.97212291, + "num_input_tokens_seen": 123975805, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 5762, + "time_per_iteration": 5.290884256362915 + }, + { + "auxiliary_loss_clip": 0.01123668, + "auxiliary_loss_mlp": 0.01034396, + "balance_loss_clip": 1.01935804, + "balance_loss_mlp": 1.0417943, + "epoch": 0.34649030512550727, + "flos": 29315173299840.0, + "grad_norm": 2.194288284173203, + "language_loss": 0.69335818, + "learning_rate": 3.0380898286599692e-06, + "loss": 0.71493888, + "num_input_tokens_seen": 123997530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8203125, + "step": 5763, + "time_per_iteration": 2.5411787033081055 + }, + { + "auxiliary_loss_clip": 0.01130216, + "auxiliary_loss_mlp": 0.01045092, + "balance_loss_clip": 1.02862906, + "balance_loss_mlp": 1.0458554, + "epoch": 0.34655042837817523, + "flos": 23730884553600.0, + "grad_norm": 2.0099592928074497, + "language_loss": 0.83589876, + "learning_rate": 3.0377569177889945e-06, + "loss": 0.85765183, + "num_input_tokens_seen": 124016375, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.84375, + "step": 5764, + "time_per_iteration": 2.48040771484375 + }, + { + "auxiliary_loss_clip": 0.01123556, + "auxiliary_loss_mlp": 0.01033291, + "balance_loss_clip": 1.0186758, + "balance_loss_mlp": 1.04343057, + "epoch": 0.34661055163084326, + "flos": 22054215553920.0, + "grad_norm": 2.159805793212971, + "language_loss": 0.67403859, + "learning_rate": 3.0374239675661722e-06, + "loss": 0.69560707, + "num_input_tokens_seen": 124033975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5765, + "time_per_iteration": 2.502297878265381 + }, + { + "auxiliary_loss_clip": 0.01130095, + "auxiliary_loss_mlp": 0.01037318, + "balance_loss_clip": 1.02291703, + "balance_loss_mlp": 1.04937232, + "epoch": 0.3466706748835112, + "flos": 21799213925760.0, + "grad_norm": 2.083918060213648, + "language_loss": 0.77861524, + "learning_rate": 3.03709097800413e-06, + "loss": 0.80028939, + "num_input_tokens_seen": 124051930, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5766, + "time_per_iteration": 2.465325355529785 + }, + { + "auxiliary_loss_clip": 0.01122551, + "auxiliary_loss_mlp": 0.01035113, + "balance_loss_clip": 1.0215292, + "balance_loss_mlp": 1.04335451, + "epoch": 0.3467307981361792, + "flos": 19461680547840.0, + "grad_norm": 1.5377908130541305, + "language_loss": 0.73529994, + "learning_rate": 3.0367579491154943e-06, + "loss": 0.75687665, + "num_input_tokens_seen": 124071220, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5767, + "time_per_iteration": 2.4656143188476562 + }, + { + "auxiliary_loss_clip": 0.01127128, + "auxiliary_loss_mlp": 0.01040956, + "balance_loss_clip": 1.02538764, + "balance_loss_mlp": 1.04720497, + "epoch": 0.34679092138884715, + "flos": 24827452905600.0, + "grad_norm": 2.233359981487989, + "language_loss": 0.77795279, + "learning_rate": 3.036424880912893e-06, + "loss": 0.79963356, + "num_input_tokens_seen": 124090140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.796875, + "step": 5768, + "time_per_iteration": 2.4951131343841553 + }, + { + "auxiliary_loss_clip": 0.0103542, + "auxiliary_loss_mlp": 0.01008769, + "balance_loss_clip": 1.00693345, + "balance_loss_mlp": 1.01015306, + "epoch": 0.3468510446415151, + "flos": 63236070149760.0, + "grad_norm": 0.7739728920865777, + "language_loss": 0.57404095, + "learning_rate": 3.036091773408956e-06, + "loss": 0.59448284, + "num_input_tokens_seen": 124152025, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.25195312, + "step": 5769, + "time_per_iteration": 3.0867085456848145 + }, + { + "auxiliary_loss_clip": 0.01135857, + "auxiliary_loss_mlp": 0.01043057, + "balance_loss_clip": 1.02577174, + "balance_loss_mlp": 1.04723847, + "epoch": 0.3469111678941831, + "flos": 12120713256960.0, + "grad_norm": 2.3808887206764244, + "language_loss": 0.85625517, + "learning_rate": 3.0357586266163154e-06, + "loss": 0.87804437, + "num_input_tokens_seen": 124165795, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.88671875, + "step": 5770, + "time_per_iteration": 2.4296391010284424 + }, + { + "auxiliary_loss_clip": 0.0103532, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00152194, + "balance_loss_mlp": 1.01001954, + "epoch": 0.34697129114685105, + "flos": 65934110378880.0, + "grad_norm": 0.7779481231658855, + "language_loss": 0.59827816, + "learning_rate": 3.0354254405476036e-06, + "loss": 0.61866474, + "num_input_tokens_seen": 124222925, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 5771, + "time_per_iteration": 2.858952522277832 + }, + { + "auxiliary_loss_clip": 0.0112466, + "auxiliary_loss_mlp": 0.01046684, + "balance_loss_clip": 1.03183091, + "balance_loss_mlp": 1.04478061, + "epoch": 0.347031414399519, + "flos": 34454205054720.0, + "grad_norm": 2.6949016474557475, + "language_loss": 0.71790159, + "learning_rate": 3.0350922152154557e-06, + "loss": 0.73961502, + "num_input_tokens_seen": 124240915, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5772, + "time_per_iteration": 2.629441976547241 + }, + { + "auxiliary_loss_clip": 0.01125724, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02608275, + "balance_loss_mlp": 1.04398608, + "epoch": 0.347091537652187, + "flos": 26944135511040.0, + "grad_norm": 1.4939658014033708, + "language_loss": 0.76165307, + "learning_rate": 3.034758950632507e-06, + "loss": 0.78332114, + "num_input_tokens_seen": 124262770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.81640625, + "step": 5773, + "time_per_iteration": 2.5281848907470703 + }, + { + "auxiliary_loss_clip": 0.01127127, + "auxiliary_loss_mlp": 0.01043606, + "balance_loss_clip": 1.02811444, + "balance_loss_mlp": 1.04447389, + "epoch": 0.34715166090485494, + "flos": 21142228216320.0, + "grad_norm": 2.0748415381607717, + "language_loss": 0.70428938, + "learning_rate": 3.034425646811396e-06, + "loss": 0.72599673, + "num_input_tokens_seen": 124280950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.828125, + "step": 5774, + "time_per_iteration": 2.4930198192596436 + }, + { + "auxiliary_loss_clip": 0.01125136, + "auxiliary_loss_mlp": 0.01040671, + "balance_loss_clip": 1.02630043, + "balance_loss_mlp": 1.04615033, + "epoch": 0.3472117841575229, + "flos": 23478001827840.0, + "grad_norm": 1.6801460468757594, + "language_loss": 0.76410925, + "learning_rate": 3.0340923037647602e-06, + "loss": 0.78576738, + "num_input_tokens_seen": 124299540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5775, + "time_per_iteration": 2.501793622970581 + }, + { + "auxiliary_loss_clip": 0.01129926, + "auxiliary_loss_mlp": 0.0104389, + "balance_loss_clip": 1.02778447, + "balance_loss_mlp": 1.04408336, + "epoch": 0.34727190741019087, + "flos": 17492806408320.0, + "grad_norm": 2.2786937073337956, + "language_loss": 0.78098702, + "learning_rate": 3.0337589215052404e-06, + "loss": 0.8027252, + "num_input_tokens_seen": 124316285, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.859375, + "step": 5776, + "time_per_iteration": 2.547508716583252 + }, + { + "auxiliary_loss_clip": 0.01034004, + "auxiliary_loss_mlp": 0.01012403, + "balance_loss_clip": 1.01073408, + "balance_loss_mlp": 1.00864577, + "epoch": 0.34733203066285884, + "flos": 65265491640960.0, + "grad_norm": 0.8366551978688649, + "language_loss": 0.63353252, + "learning_rate": 3.033425500045478e-06, + "loss": 0.65399659, + "num_input_tokens_seen": 124376650, + "router_z_loss_clip": 0.01672363, + "router_z_loss_mlp": 0.25390625, + "step": 5777, + "time_per_iteration": 3.118314743041992 + }, + { + "auxiliary_loss_clip": 0.01124542, + "auxiliary_loss_mlp": 0.01047894, + "balance_loss_clip": 1.03253984, + "balance_loss_mlp": 1.04198289, + "epoch": 0.3473921539155268, + "flos": 28658726294400.0, + "grad_norm": 2.1982821508403956, + "language_loss": 0.64399695, + "learning_rate": 3.033092039398119e-06, + "loss": 0.66572136, + "num_input_tokens_seen": 124396475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5778, + "time_per_iteration": 2.5438621044158936 + }, + { + "auxiliary_loss_clip": 0.01128237, + "auxiliary_loss_mlp": 0.01054212, + "balance_loss_clip": 1.03947175, + "balance_loss_mlp": 1.04425573, + "epoch": 0.3474522771681948, + "flos": 40836895355520.0, + "grad_norm": 1.7264375706792277, + "language_loss": 0.71190178, + "learning_rate": 3.0327585395758046e-06, + "loss": 0.73372632, + "num_input_tokens_seen": 124416480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.83984375, + "step": 5779, + "time_per_iteration": 2.6116013526916504 + }, + { + "auxiliary_loss_clip": 0.01128331, + "auxiliary_loss_mlp": 0.01048092, + "balance_loss_clip": 1.03270197, + "balance_loss_mlp": 1.04354596, + "epoch": 0.3475124004208628, + "flos": 24608577381120.0, + "grad_norm": 1.874853063849031, + "language_loss": 0.62552947, + "learning_rate": 3.0324250005911837e-06, + "loss": 0.64729369, + "num_input_tokens_seen": 124435950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 5780, + "time_per_iteration": 2.5024712085723877 + }, + { + "auxiliary_loss_clip": 0.01124027, + "auxiliary_loss_mlp": 0.01041985, + "balance_loss_clip": 1.0278883, + "balance_loss_mlp": 1.04260445, + "epoch": 0.34757252367353075, + "flos": 22711309004160.0, + "grad_norm": 1.604616792806945, + "language_loss": 0.72373253, + "learning_rate": 3.0320914224569033e-06, + "loss": 0.74539268, + "num_input_tokens_seen": 124455410, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.81640625, + "step": 5781, + "time_per_iteration": 2.471235513687134 + }, + { + "auxiliary_loss_clip": 0.01125801, + "auxiliary_loss_mlp": 0.01050458, + "balance_loss_clip": 1.03416181, + "balance_loss_mlp": 1.04316914, + "epoch": 0.3476326469261987, + "flos": 19828184970240.0, + "grad_norm": 2.0942988164582266, + "language_loss": 0.76741016, + "learning_rate": 3.031757805185612e-06, + "loss": 0.78917271, + "num_input_tokens_seen": 124474870, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.828125, + "step": 5782, + "time_per_iteration": 2.4831414222717285 + }, + { + "auxiliary_loss_clip": 0.01123989, + "auxiliary_loss_mlp": 0.01036279, + "balance_loss_clip": 1.02140737, + "balance_loss_mlp": 1.04221606, + "epoch": 0.3476927701788667, + "flos": 19938107566080.0, + "grad_norm": 1.9917493867858045, + "language_loss": 0.62131268, + "learning_rate": 3.0314241487899622e-06, + "loss": 0.64291537, + "num_input_tokens_seen": 124494105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8203125, + "step": 5783, + "time_per_iteration": 2.4483954906463623 + }, + { + "auxiliary_loss_clip": 0.01119293, + "auxiliary_loss_mlp": 0.01031994, + "balance_loss_clip": 1.01833832, + "balance_loss_mlp": 1.0410347, + "epoch": 0.34775289343153465, + "flos": 20735108490240.0, + "grad_norm": 1.6546414102961637, + "language_loss": 0.88575971, + "learning_rate": 3.031090453282605e-06, + "loss": 0.90727258, + "num_input_tokens_seen": 124512030, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 5784, + "time_per_iteration": 2.5281262397766113 + }, + { + "auxiliary_loss_clip": 0.01121731, + "auxiliary_loss_mlp": 0.01036934, + "balance_loss_clip": 1.02219379, + "balance_loss_mlp": 1.04283547, + "epoch": 0.3478130166842026, + "flos": 19354846521600.0, + "grad_norm": 1.7834042756277195, + "language_loss": 0.81664282, + "learning_rate": 3.0307567186761946e-06, + "loss": 0.83822948, + "num_input_tokens_seen": 124530980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 5785, + "time_per_iteration": 2.444279432296753 + }, + { + "auxiliary_loss_clip": 0.01126224, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02301908, + "balance_loss_mlp": 1.04558039, + "epoch": 0.3478731399368706, + "flos": 22051198811520.0, + "grad_norm": 1.6236713309130966, + "language_loss": 0.80679643, + "learning_rate": 3.0304229449833862e-06, + "loss": 0.82843316, + "num_input_tokens_seen": 124549330, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5786, + "time_per_iteration": 2.506639242172241 + }, + { + "auxiliary_loss_clip": 0.01123366, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.01860058, + "balance_loss_mlp": 1.0443275, + "epoch": 0.34793326318953854, + "flos": 18041449720320.0, + "grad_norm": 1.5789553434659291, + "language_loss": 0.74868137, + "learning_rate": 3.030089132216836e-06, + "loss": 0.77025199, + "num_input_tokens_seen": 124567200, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5787, + "time_per_iteration": 2.4305543899536133 + }, + { + "auxiliary_loss_clip": 0.01122978, + "auxiliary_loss_mlp": 0.01037288, + "balance_loss_clip": 1.02276862, + "balance_loss_mlp": 1.04133916, + "epoch": 0.3479933864422065, + "flos": 29314670509440.0, + "grad_norm": 1.685205733624188, + "language_loss": 0.81207466, + "learning_rate": 3.029755280389203e-06, + "loss": 0.83367729, + "num_input_tokens_seen": 124587025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.81640625, + "step": 5788, + "time_per_iteration": 2.58461332321167 + }, + { + "auxiliary_loss_clip": 0.01130932, + "auxiliary_loss_mlp": 0.01038586, + "balance_loss_clip": 1.02333927, + "balance_loss_mlp": 1.04716599, + "epoch": 0.3480535096948745, + "flos": 20120713332480.0, + "grad_norm": 1.7599288417752579, + "language_loss": 0.85399663, + "learning_rate": 3.029421389513147e-06, + "loss": 0.87569183, + "num_input_tokens_seen": 124605860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5789, + "time_per_iteration": 2.4460527896881104 + }, + { + "auxiliary_loss_clip": 0.01127788, + "auxiliary_loss_mlp": 0.01050411, + "balance_loss_clip": 1.03517616, + "balance_loss_mlp": 1.04420161, + "epoch": 0.34811363294754244, + "flos": 18548974938240.0, + "grad_norm": 1.9217222904205502, + "language_loss": 0.84973574, + "learning_rate": 3.029087459601328e-06, + "loss": 0.87151778, + "num_input_tokens_seen": 124624270, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5790, + "time_per_iteration": 2.4690423011779785 + }, + { + "auxiliary_loss_clip": 0.0112493, + "auxiliary_loss_mlp": 0.01044909, + "balance_loss_clip": 1.0295074, + "balance_loss_mlp": 1.04403305, + "epoch": 0.3481737562002104, + "flos": 26870303105280.0, + "grad_norm": 2.0218239222922785, + "language_loss": 0.82098949, + "learning_rate": 3.0287534906664097e-06, + "loss": 0.8426879, + "num_input_tokens_seen": 124644005, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5791, + "time_per_iteration": 2.4949092864990234 + }, + { + "auxiliary_loss_clip": 0.01124824, + "auxiliary_loss_mlp": 0.01038811, + "balance_loss_clip": 1.02386248, + "balance_loss_mlp": 1.04235744, + "epoch": 0.3482338794528784, + "flos": 28908664104960.0, + "grad_norm": 1.7691925727921667, + "language_loss": 0.77531552, + "learning_rate": 3.028419482721056e-06, + "loss": 0.79695195, + "num_input_tokens_seen": 124663020, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.82421875, + "step": 5792, + "time_per_iteration": 2.5464468002319336 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.04100966, + "epoch": 0.3482940027055464, + "flos": 22200767043840.0, + "grad_norm": 1.5041206153246893, + "language_loss": 0.81592953, + "learning_rate": 3.0280854357779325e-06, + "loss": 0.83745086, + "num_input_tokens_seen": 124682975, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 5793, + "time_per_iteration": 2.454220771789551 + }, + { + "auxiliary_loss_clip": 0.01126572, + "auxiliary_loss_mlp": 0.01046613, + "balance_loss_clip": 1.03078222, + "balance_loss_mlp": 1.04426205, + "epoch": 0.34835412595821436, + "flos": 20302708567680.0, + "grad_norm": 1.7524057524538565, + "language_loss": 0.76222527, + "learning_rate": 3.027751349849706e-06, + "loss": 0.78395712, + "num_input_tokens_seen": 124701340, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8203125, + "step": 5794, + "time_per_iteration": 2.485077142715454 + }, + { + "auxiliary_loss_clip": 0.01121136, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02165866, + "balance_loss_mlp": 1.04168189, + "epoch": 0.3484142492108823, + "flos": 20449691020800.0, + "grad_norm": 2.2347385462744165, + "language_loss": 0.56926, + "learning_rate": 3.0274172249490456e-06, + "loss": 0.59083712, + "num_input_tokens_seen": 124719165, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5795, + "time_per_iteration": 2.4378490447998047 + }, + { + "auxiliary_loss_clip": 0.01121205, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02250659, + "balance_loss_mlp": 1.04285967, + "epoch": 0.3484743724635503, + "flos": 24352929308160.0, + "grad_norm": 2.137832792929428, + "language_loss": 0.82437253, + "learning_rate": 3.0270830610886213e-06, + "loss": 0.84595084, + "num_input_tokens_seen": 124738670, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 5796, + "time_per_iteration": 2.5187671184539795 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01029245, + "balance_loss_clip": 1.0153811, + "balance_loss_mlp": 1.043782, + "epoch": 0.34853449571621825, + "flos": 24353001135360.0, + "grad_norm": 1.7817355656860259, + "language_loss": 0.83580989, + "learning_rate": 3.0267488582811033e-06, + "loss": 0.85730731, + "num_input_tokens_seen": 124758760, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 5797, + "time_per_iteration": 2.518832206726074 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02017224, + "balance_loss_mlp": 1.04206371, + "epoch": 0.3485946189688862, + "flos": 27267690245760.0, + "grad_norm": 1.7199370679887815, + "language_loss": 0.73215538, + "learning_rate": 3.026414616539167e-06, + "loss": 0.7537021, + "num_input_tokens_seen": 124777765, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5798, + "time_per_iteration": 2.499967575073242 + }, + { + "auxiliary_loss_clip": 0.01123251, + "auxiliary_loss_mlp": 0.01041885, + "balance_loss_clip": 1.02660251, + "balance_loss_mlp": 1.04203498, + "epoch": 0.3486547422215542, + "flos": 20156695781760.0, + "grad_norm": 2.0872044860332597, + "language_loss": 0.75936413, + "learning_rate": 3.026080335875485e-06, + "loss": 0.78101552, + "num_input_tokens_seen": 124796775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5799, + "time_per_iteration": 2.4452474117279053 + }, + { + "auxiliary_loss_clip": 0.01121272, + "auxiliary_loss_mlp": 0.01038695, + "balance_loss_clip": 1.0248909, + "balance_loss_mlp": 1.04197407, + "epoch": 0.34871486547422215, + "flos": 20230348619520.0, + "grad_norm": 1.7461935027983841, + "language_loss": 0.75557071, + "learning_rate": 3.025746016302734e-06, + "loss": 0.7771703, + "num_input_tokens_seen": 124815825, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.79296875, + "step": 5800, + "time_per_iteration": 2.4526796340942383 + }, + { + "auxiliary_loss_clip": 0.01129939, + "auxiliary_loss_mlp": 0.0104466, + "balance_loss_clip": 1.02854276, + "balance_loss_mlp": 1.04578733, + "epoch": 0.3487749887268901, + "flos": 44053234882560.0, + "grad_norm": 2.3150001070935127, + "language_loss": 0.67645729, + "learning_rate": 3.025411657833591e-06, + "loss": 0.69820327, + "num_input_tokens_seen": 124838420, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.84375, + "step": 5801, + "time_per_iteration": 2.644601821899414 + }, + { + "auxiliary_loss_clip": 0.01122812, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04446411, + "epoch": 0.3488351119795581, + "flos": 23295144666240.0, + "grad_norm": 1.9000140831486088, + "language_loss": 0.76785576, + "learning_rate": 3.025077260480735e-06, + "loss": 0.78948951, + "num_input_tokens_seen": 124857320, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78515625, + "step": 5802, + "time_per_iteration": 2.46921968460083 + }, + { + "auxiliary_loss_clip": 0.01118956, + "auxiliary_loss_mlp": 0.01033761, + "balance_loss_clip": 1.01905692, + "balance_loss_mlp": 1.04294538, + "epoch": 0.34889523523222604, + "flos": 19934839428480.0, + "grad_norm": 1.750768588632487, + "language_loss": 0.78868455, + "learning_rate": 3.0247428242568474e-06, + "loss": 0.81021172, + "num_input_tokens_seen": 124875685, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 5803, + "time_per_iteration": 3.979863405227661 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02266085, + "balance_loss_mlp": 1.0410372, + "epoch": 0.348955358484894, + "flos": 30446179816320.0, + "grad_norm": 1.9657380954946277, + "language_loss": 0.67745399, + "learning_rate": 3.0244083491746085e-06, + "loss": 0.69905275, + "num_input_tokens_seen": 124895960, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8203125, + "step": 5804, + "time_per_iteration": 3.8562989234924316 + }, + { + "auxiliary_loss_clip": 0.01121349, + "auxiliary_loss_mlp": 0.01044714, + "balance_loss_clip": 1.03001559, + "balance_loss_mlp": 1.0454638, + "epoch": 0.349015481737562, + "flos": 17999972490240.0, + "grad_norm": 2.669385195944029, + "language_loss": 0.76021814, + "learning_rate": 3.024073835246702e-06, + "loss": 0.78187871, + "num_input_tokens_seen": 124914140, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 5805, + "time_per_iteration": 2.458235263824463 + }, + { + "auxiliary_loss_clip": 0.01124464, + "auxiliary_loss_mlp": 0.01036858, + "balance_loss_clip": 1.02199244, + "balance_loss_mlp": 1.0451802, + "epoch": 0.34907560499023, + "flos": 27198490694400.0, + "grad_norm": 3.0752866237359884, + "language_loss": 0.67804134, + "learning_rate": 3.023739282485814e-06, + "loss": 0.69965458, + "num_input_tokens_seen": 124934180, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5806, + "time_per_iteration": 2.4840877056121826 + }, + { + "auxiliary_loss_clip": 0.01126527, + "auxiliary_loss_mlp": 0.01040199, + "balance_loss_clip": 1.02523851, + "balance_loss_mlp": 1.04571056, + "epoch": 0.34913572824289796, + "flos": 30226873328640.0, + "grad_norm": 1.4876164360326454, + "language_loss": 0.71957624, + "learning_rate": 3.023404690904629e-06, + "loss": 0.74124348, + "num_input_tokens_seen": 124956060, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5807, + "time_per_iteration": 2.542815685272217 + }, + { + "auxiliary_loss_clip": 0.01123687, + "auxiliary_loss_mlp": 0.01038505, + "balance_loss_clip": 1.02295971, + "balance_loss_mlp": 1.04158592, + "epoch": 0.3491958514955659, + "flos": 29971907614080.0, + "grad_norm": 1.7054576034597768, + "language_loss": 0.74218416, + "learning_rate": 3.0230700605158364e-06, + "loss": 0.7638061, + "num_input_tokens_seen": 124976070, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5808, + "time_per_iteration": 2.503438949584961 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01048536, + "balance_loss_clip": 1.03412986, + "balance_loss_mlp": 1.04479396, + "epoch": 0.3492559747482339, + "flos": 22783273902720.0, + "grad_norm": 1.5095416937429198, + "language_loss": 0.84245461, + "learning_rate": 3.0227353913321238e-06, + "loss": 0.86416149, + "num_input_tokens_seen": 124996995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5809, + "time_per_iteration": 2.4860358238220215 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01036784, + "balance_loss_clip": 1.02354026, + "balance_loss_mlp": 1.04322374, + "epoch": 0.34931609800090185, + "flos": 26068022881920.0, + "grad_norm": 1.8434153763939258, + "language_loss": 0.80251479, + "learning_rate": 3.0224006833661835e-06, + "loss": 0.82407832, + "num_input_tokens_seen": 125015600, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 5810, + "time_per_iteration": 2.481653928756714 + }, + { + "auxiliary_loss_clip": 0.01124044, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02613211, + "balance_loss_mlp": 1.04406404, + "epoch": 0.3493762212535698, + "flos": 29242023252480.0, + "grad_norm": 1.967526444092296, + "language_loss": 0.75335366, + "learning_rate": 3.0220659366307057e-06, + "loss": 0.77499199, + "num_input_tokens_seen": 125035290, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5811, + "time_per_iteration": 2.534524440765381 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02543986, + "balance_loss_mlp": 1.04616523, + "epoch": 0.3494363445062378, + "flos": 27126058919040.0, + "grad_norm": 1.4977831051483896, + "language_loss": 0.80070162, + "learning_rate": 3.021731151138386e-06, + "loss": 0.82238293, + "num_input_tokens_seen": 125057130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5812, + "time_per_iteration": 2.503074884414673 + }, + { + "auxiliary_loss_clip": 0.01123214, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02746272, + "balance_loss_mlp": 1.04195547, + "epoch": 0.34949646775890575, + "flos": 12276207233280.0, + "grad_norm": 1.9471141693502576, + "language_loss": 0.6923517, + "learning_rate": 3.021396326901918e-06, + "loss": 0.71401167, + "num_input_tokens_seen": 125073720, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 5813, + "time_per_iteration": 2.4503591060638428 + }, + { + "auxiliary_loss_clip": 0.01122458, + "auxiliary_loss_mlp": 0.01039452, + "balance_loss_clip": 1.02492023, + "balance_loss_mlp": 1.04438448, + "epoch": 0.3495565910115737, + "flos": 17165516659200.0, + "grad_norm": 2.4036318537481334, + "language_loss": 0.77007949, + "learning_rate": 3.0210614639339998e-06, + "loss": 0.79169858, + "num_input_tokens_seen": 125090635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 5814, + "time_per_iteration": 2.4173405170440674 + }, + { + "auxiliary_loss_clip": 0.01126142, + "auxiliary_loss_mlp": 0.01042541, + "balance_loss_clip": 1.02692485, + "balance_loss_mlp": 1.04406822, + "epoch": 0.3496167142642417, + "flos": 26465661417600.0, + "grad_norm": 1.5090517849605465, + "language_loss": 0.84283173, + "learning_rate": 3.020726562247328e-06, + "loss": 0.86451852, + "num_input_tokens_seen": 125110070, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5815, + "time_per_iteration": 2.5173141956329346 + }, + { + "auxiliary_loss_clip": 0.01124466, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.01981044, + "balance_loss_mlp": 1.04368711, + "epoch": 0.34967683751690964, + "flos": 17414843938560.0, + "grad_norm": 2.123091285603595, + "language_loss": 0.77423191, + "learning_rate": 3.0203916218546024e-06, + "loss": 0.79580915, + "num_input_tokens_seen": 125125730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.80859375, + "step": 5816, + "time_per_iteration": 2.413438558578491 + }, + { + "auxiliary_loss_clip": 0.01128865, + "auxiliary_loss_mlp": 0.01042596, + "balance_loss_clip": 1.02761126, + "balance_loss_mlp": 1.0468061, + "epoch": 0.3497369607695776, + "flos": 22600021691520.0, + "grad_norm": 2.144763996717865, + "language_loss": 0.58441401, + "learning_rate": 3.0200566427685246e-06, + "loss": 0.60612863, + "num_input_tokens_seen": 125146195, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8203125, + "step": 5817, + "time_per_iteration": 2.5161447525024414 + }, + { + "auxiliary_loss_clip": 0.01042618, + "auxiliary_loss_mlp": 0.01011257, + "balance_loss_clip": 1.00957632, + "balance_loss_mlp": 1.01738954, + "epoch": 0.34979708402224563, + "flos": 68529374818560.0, + "grad_norm": 0.8658844915790124, + "language_loss": 0.59855008, + "learning_rate": 3.0197216250017975e-06, + "loss": 0.61908889, + "num_input_tokens_seen": 125207790, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25195312, + "step": 5818, + "time_per_iteration": 3.105595111846924 + }, + { + "auxiliary_loss_clip": 0.01123632, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02226782, + "balance_loss_mlp": 1.04561055, + "epoch": 0.3498572072749136, + "flos": 18989634988800.0, + "grad_norm": 3.0068929936640103, + "language_loss": 0.83458424, + "learning_rate": 3.019386568567123e-06, + "loss": 0.85618806, + "num_input_tokens_seen": 125226220, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5819, + "time_per_iteration": 2.47537899017334 + }, + { + "auxiliary_loss_clip": 0.01123279, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.01655149, + "balance_loss_mlp": 1.04359841, + "epoch": 0.34991733052758156, + "flos": 27818883423360.0, + "grad_norm": 3.6330435008795483, + "language_loss": 0.70765841, + "learning_rate": 3.0190514734772083e-06, + "loss": 0.7291975, + "num_input_tokens_seen": 125247485, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 5820, + "time_per_iteration": 2.4817428588867188 + }, + { + "auxiliary_loss_clip": 0.01125706, + "auxiliary_loss_mlp": 0.01035768, + "balance_loss_clip": 1.022434, + "balance_loss_mlp": 1.04544306, + "epoch": 0.3499774537802495, + "flos": 33584197737600.0, + "grad_norm": 2.1579309336976547, + "language_loss": 0.70112801, + "learning_rate": 3.018716339744759e-06, + "loss": 0.7227428, + "num_input_tokens_seen": 125268625, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.80078125, + "step": 5821, + "time_per_iteration": 2.578753709793091 + }, + { + "auxiliary_loss_clip": 0.01131817, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.02328706, + "balance_loss_mlp": 1.04798198, + "epoch": 0.3500375770329175, + "flos": 23476744851840.0, + "grad_norm": 2.9634934958204076, + "language_loss": 0.73591399, + "learning_rate": 3.0183811673824842e-06, + "loss": 0.75762403, + "num_input_tokens_seen": 125287530, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.83984375, + "step": 5822, + "time_per_iteration": 2.469041109085083 + }, + { + "auxiliary_loss_clip": 0.01127055, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.02150989, + "balance_loss_mlp": 1.0447278, + "epoch": 0.35009770028558546, + "flos": 19026048401280.0, + "grad_norm": 1.5203539526389718, + "language_loss": 0.78104019, + "learning_rate": 3.018045956403094e-06, + "loss": 0.80268037, + "num_input_tokens_seen": 125307020, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 5823, + "time_per_iteration": 2.4932196140289307 + }, + { + "auxiliary_loss_clip": 0.01038228, + "auxiliary_loss_mlp": 0.01001335, + "balance_loss_clip": 0.99964237, + "balance_loss_mlp": 1.01332808, + "epoch": 0.3501578235382534, + "flos": 68351868783360.0, + "grad_norm": 1.4438996436497689, + "language_loss": 0.59237444, + "learning_rate": 3.017710706819298e-06, + "loss": 0.61277008, + "num_input_tokens_seen": 125370445, + "router_z_loss_clip": 0.01696777, + "router_z_loss_mlp": 0.24902344, + "step": 5824, + "time_per_iteration": 3.109966278076172 + }, + { + "auxiliary_loss_clip": 0.01125511, + "auxiliary_loss_mlp": 0.01036598, + "balance_loss_clip": 1.0213685, + "balance_loss_mlp": 1.04462993, + "epoch": 0.3502179467909214, + "flos": 21250893836160.0, + "grad_norm": 1.8425293735622459, + "language_loss": 0.84740114, + "learning_rate": 3.017375418643811e-06, + "loss": 0.86902225, + "num_input_tokens_seen": 125388900, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 5825, + "time_per_iteration": 2.4780030250549316 + }, + { + "auxiliary_loss_clip": 0.01125254, + "auxiliary_loss_mlp": 0.01038054, + "balance_loss_clip": 1.02292657, + "balance_loss_mlp": 1.04522121, + "epoch": 0.35027807004358935, + "flos": 11942955826560.0, + "grad_norm": 2.24584207136959, + "language_loss": 0.82778502, + "learning_rate": 3.0170400918893464e-06, + "loss": 0.84941804, + "num_input_tokens_seen": 125402675, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 5826, + "time_per_iteration": 2.4147045612335205 + }, + { + "auxiliary_loss_clip": 0.01126938, + "auxiliary_loss_mlp": 0.01041103, + "balance_loss_clip": 1.02587962, + "balance_loss_mlp": 1.04480314, + "epoch": 0.3503381932962573, + "flos": 21470918595840.0, + "grad_norm": 1.5075773428374344, + "language_loss": 0.80714649, + "learning_rate": 3.0167047265686186e-06, + "loss": 0.8288269, + "num_input_tokens_seen": 125421360, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 5827, + "time_per_iteration": 2.4650330543518066 + }, + { + "auxiliary_loss_clip": 0.01123347, + "auxiliary_loss_mlp": 0.01035841, + "balance_loss_clip": 1.0220902, + "balance_loss_mlp": 1.04475152, + "epoch": 0.3503983165489253, + "flos": 21251109317760.0, + "grad_norm": 2.7582821019631836, + "language_loss": 0.70936024, + "learning_rate": 3.0163693226943467e-06, + "loss": 0.73095214, + "num_input_tokens_seen": 125440000, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 5828, + "time_per_iteration": 2.4710564613342285 + }, + { + "auxiliary_loss_clip": 0.01130881, + "auxiliary_loss_mlp": 0.01043725, + "balance_loss_clip": 1.02666616, + "balance_loss_mlp": 1.04788435, + "epoch": 0.35045843980159325, + "flos": 27815723026560.0, + "grad_norm": 1.628373483521701, + "language_loss": 0.79397106, + "learning_rate": 3.016033880279248e-06, + "loss": 0.81571716, + "num_input_tokens_seen": 125460390, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.828125, + "step": 5829, + "time_per_iteration": 2.5081264972686768 + }, + { + "auxiliary_loss_clip": 0.01129997, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.02900994, + "balance_loss_mlp": 1.04607642, + "epoch": 0.3505185630542612, + "flos": 25921148169600.0, + "grad_norm": 1.7135270810407168, + "language_loss": 0.72111332, + "learning_rate": 3.0156983993360417e-06, + "loss": 0.74286962, + "num_input_tokens_seen": 125478410, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 5830, + "time_per_iteration": 2.507263422012329 + }, + { + "auxiliary_loss_clip": 0.01122818, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.04352021, + "epoch": 0.35057868630692923, + "flos": 20521763660160.0, + "grad_norm": 2.0188022258715996, + "language_loss": 0.88740343, + "learning_rate": 3.0153628798774513e-06, + "loss": 0.90896189, + "num_input_tokens_seen": 125495975, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5831, + "time_per_iteration": 2.4769816398620605 + }, + { + "auxiliary_loss_clip": 0.01122435, + "auxiliary_loss_mlp": 0.01040768, + "balance_loss_clip": 1.02560508, + "balance_loss_mlp": 1.04128802, + "epoch": 0.3506388095595972, + "flos": 20448649526400.0, + "grad_norm": 1.9377344606434141, + "language_loss": 0.78478962, + "learning_rate": 3.0150273219161985e-06, + "loss": 0.80642164, + "num_input_tokens_seen": 125515035, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.8125, + "step": 5832, + "time_per_iteration": 2.458019971847534 + }, + { + "auxiliary_loss_clip": 0.01125835, + "auxiliary_loss_mlp": 0.01043672, + "balance_loss_clip": 1.02744734, + "balance_loss_mlp": 1.04360127, + "epoch": 0.35069893281226516, + "flos": 23109665811840.0, + "grad_norm": 1.8976688118149017, + "language_loss": 0.70859557, + "learning_rate": 3.014691725465008e-06, + "loss": 0.73029065, + "num_input_tokens_seen": 125535555, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 5833, + "time_per_iteration": 2.494739055633545 + }, + { + "auxiliary_loss_clip": 0.01121087, + "auxiliary_loss_mlp": 0.01030807, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.04384482, + "epoch": 0.35075905606493313, + "flos": 27271999877760.0, + "grad_norm": 1.3472514068868482, + "language_loss": 0.80878949, + "learning_rate": 3.014356090536606e-06, + "loss": 0.83030844, + "num_input_tokens_seen": 125558195, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 5834, + "time_per_iteration": 2.521343231201172 + }, + { + "auxiliary_loss_clip": 0.01124914, + "auxiliary_loss_mlp": 0.01043402, + "balance_loss_clip": 1.02823853, + "balance_loss_mlp": 1.04525888, + "epoch": 0.3508191793176011, + "flos": 19128608709120.0, + "grad_norm": 2.219662071096021, + "language_loss": 0.83629, + "learning_rate": 3.0140204171437183e-06, + "loss": 0.8579731, + "num_input_tokens_seen": 125575375, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 5835, + "time_per_iteration": 2.53587007522583 + }, + { + "auxiliary_loss_clip": 0.01123177, + "auxiliary_loss_mlp": 0.01043674, + "balance_loss_clip": 1.02932119, + "balance_loss_mlp": 1.04351568, + "epoch": 0.35087930257026906, + "flos": 25557588662400.0, + "grad_norm": 2.120648036265282, + "language_loss": 0.76607329, + "learning_rate": 3.0136847052990754e-06, + "loss": 0.78774178, + "num_input_tokens_seen": 125596745, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 5836, + "time_per_iteration": 2.54390549659729 + }, + { + "auxiliary_loss_clip": 0.01128097, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02382731, + "balance_loss_mlp": 1.04872775, + "epoch": 0.350939425822937, + "flos": 18004246208640.0, + "grad_norm": 2.2292749531356986, + "language_loss": 0.77354801, + "learning_rate": 3.0133489550154074e-06, + "loss": 0.79521459, + "num_input_tokens_seen": 125613980, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5837, + "time_per_iteration": 2.4478273391723633 + }, + { + "auxiliary_loss_clip": 0.01123898, + "auxiliary_loss_mlp": 0.01044754, + "balance_loss_clip": 1.02998376, + "balance_loss_mlp": 1.04441822, + "epoch": 0.350999549075605, + "flos": 22273198819200.0, + "grad_norm": 1.6098451794116821, + "language_loss": 0.68129408, + "learning_rate": 3.0130131663054442e-06, + "loss": 0.70298064, + "num_input_tokens_seen": 125632100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 5838, + "time_per_iteration": 2.505833864212036 + }, + { + "auxiliary_loss_clip": 0.01122037, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.01945019, + "balance_loss_mlp": 1.04240978, + "epoch": 0.35105967232827295, + "flos": 14392279307520.0, + "grad_norm": 2.0937603738721173, + "language_loss": 0.83561182, + "learning_rate": 3.0126773391819215e-06, + "loss": 0.85717571, + "num_input_tokens_seen": 125649190, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5839, + "time_per_iteration": 2.4378576278686523 + }, + { + "auxiliary_loss_clip": 0.01126069, + "auxiliary_loss_mlp": 0.01042905, + "balance_loss_clip": 1.02775335, + "balance_loss_mlp": 1.04351032, + "epoch": 0.3511197955809409, + "flos": 25082346792960.0, + "grad_norm": 1.6277808139419232, + "language_loss": 0.58590645, + "learning_rate": 3.012341473657572e-06, + "loss": 0.60759622, + "num_input_tokens_seen": 125668680, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.828125, + "step": 5840, + "time_per_iteration": 2.4883387088775635 + }, + { + "auxiliary_loss_clip": 0.01125241, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02015984, + "balance_loss_mlp": 1.04445219, + "epoch": 0.3511799188336089, + "flos": 25884160139520.0, + "grad_norm": 2.7790843018814058, + "language_loss": 0.87061596, + "learning_rate": 3.0120055697451322e-06, + "loss": 0.89222413, + "num_input_tokens_seen": 125686935, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 5841, + "time_per_iteration": 2.5035836696624756 + }, + { + "auxiliary_loss_clip": 0.01128185, + "auxiliary_loss_mlp": 0.01041931, + "balance_loss_clip": 1.02551615, + "balance_loss_mlp": 1.0455035, + "epoch": 0.35124004208627685, + "flos": 20083725302400.0, + "grad_norm": 1.6842451001577108, + "language_loss": 0.74924648, + "learning_rate": 3.0116696274573406e-06, + "loss": 0.77094764, + "num_input_tokens_seen": 125707180, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.828125, + "step": 5842, + "time_per_iteration": 2.4677891731262207 + }, + { + "auxiliary_loss_clip": 0.01125535, + "auxiliary_loss_mlp": 0.01040751, + "balance_loss_clip": 1.02552199, + "balance_loss_mlp": 1.04403496, + "epoch": 0.3513001653389448, + "flos": 17783431349760.0, + "grad_norm": 3.45436030057014, + "language_loss": 0.68184745, + "learning_rate": 3.0113336468069346e-06, + "loss": 0.70351034, + "num_input_tokens_seen": 125722780, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 5843, + "time_per_iteration": 2.4356935024261475 + }, + { + "auxiliary_loss_clip": 0.01123467, + "auxiliary_loss_mlp": 0.01042343, + "balance_loss_clip": 1.02734041, + "balance_loss_mlp": 1.04418659, + "epoch": 0.3513602885916128, + "flos": 29387138198400.0, + "grad_norm": 3.71115813366519, + "language_loss": 0.65957326, + "learning_rate": 3.010997627806655e-06, + "loss": 0.68123138, + "num_input_tokens_seen": 125742110, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 5844, + "time_per_iteration": 2.4961743354797363 + }, + { + "auxiliary_loss_clip": 0.01124887, + "auxiliary_loss_mlp": 0.01040447, + "balance_loss_clip": 1.02446079, + "balance_loss_mlp": 1.04466677, + "epoch": 0.3514204118442808, + "flos": 16179876483840.0, + "grad_norm": 2.036064641334285, + "language_loss": 0.75629944, + "learning_rate": 3.010661570469245e-06, + "loss": 0.77795279, + "num_input_tokens_seen": 125759980, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5845, + "time_per_iteration": 5.325402498245239 + }, + { + "auxiliary_loss_clip": 0.01123091, + "auxiliary_loss_mlp": 0.01039418, + "balance_loss_clip": 1.02483845, + "balance_loss_mlp": 1.04537153, + "epoch": 0.35148053509694877, + "flos": 23834665923840.0, + "grad_norm": 2.494167784966283, + "language_loss": 0.73075795, + "learning_rate": 3.0103254748074465e-06, + "loss": 0.75238299, + "num_input_tokens_seen": 125772660, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 5846, + "time_per_iteration": 2.4515323638916016 + }, + { + "auxiliary_loss_clip": 0.01127959, + "auxiliary_loss_mlp": 0.01040823, + "balance_loss_clip": 1.02587426, + "balance_loss_mlp": 1.04755926, + "epoch": 0.35154065834961673, + "flos": 20991295267200.0, + "grad_norm": 1.6229430725765215, + "language_loss": 0.75876832, + "learning_rate": 3.0099893408340046e-06, + "loss": 0.78045619, + "num_input_tokens_seen": 125791935, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.8046875, + "step": 5847, + "time_per_iteration": 2.4869656562805176 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02067161, + "balance_loss_mlp": 1.04212832, + "epoch": 0.3516007816022847, + "flos": 33255471444480.0, + "grad_norm": 2.14189752244475, + "language_loss": 0.72070903, + "learning_rate": 3.009653168561666e-06, + "loss": 0.74227905, + "num_input_tokens_seen": 125813455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 5848, + "time_per_iteration": 2.5580503940582275 + }, + { + "auxiliary_loss_clip": 0.01127957, + "auxiliary_loss_mlp": 0.01044586, + "balance_loss_clip": 1.02953017, + "balance_loss_mlp": 1.04648554, + "epoch": 0.35166090485495266, + "flos": 11726953390080.0, + "grad_norm": 2.252970750126207, + "language_loss": 0.89321303, + "learning_rate": 3.009316958003178e-06, + "loss": 0.91493851, + "num_input_tokens_seen": 125827660, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5849, + "time_per_iteration": 2.4167070388793945 + }, + { + "auxiliary_loss_clip": 0.01123705, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.04373825, + "epoch": 0.3517210281076206, + "flos": 22638446265600.0, + "grad_norm": 2.8040734708025026, + "language_loss": 0.74810916, + "learning_rate": 3.0089807091712897e-06, + "loss": 0.76967371, + "num_input_tokens_seen": 125846655, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5850, + "time_per_iteration": 2.457970142364502 + }, + { + "auxiliary_loss_clip": 0.0112382, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01809859, + "balance_loss_mlp": 1.04618788, + "epoch": 0.3517811513602886, + "flos": 21322750993920.0, + "grad_norm": 1.5003899492593988, + "language_loss": 0.7563765, + "learning_rate": 3.0086444220787515e-06, + "loss": 0.77794087, + "num_input_tokens_seen": 125866290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 5851, + "time_per_iteration": 2.48270845413208 + }, + { + "auxiliary_loss_clip": 0.01126446, + "auxiliary_loss_mlp": 0.01036787, + "balance_loss_clip": 1.0219928, + "balance_loss_mlp": 1.04683256, + "epoch": 0.35184127461295656, + "flos": 21032880238080.0, + "grad_norm": 2.074837490144385, + "language_loss": 0.87552518, + "learning_rate": 3.0083080967383165e-06, + "loss": 0.89715755, + "num_input_tokens_seen": 125884620, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5852, + "time_per_iteration": 2.4690029621124268 + }, + { + "auxiliary_loss_clip": 0.01122074, + "auxiliary_loss_mlp": 0.01035979, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04361391, + "epoch": 0.3519013978656245, + "flos": 22455265881600.0, + "grad_norm": 2.0973347969099048, + "language_loss": 0.67880064, + "learning_rate": 3.007971733162737e-06, + "loss": 0.70038116, + "num_input_tokens_seen": 125902430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5853, + "time_per_iteration": 2.4953458309173584 + }, + { + "auxiliary_loss_clip": 0.01125495, + "auxiliary_loss_mlp": 0.01034243, + "balance_loss_clip": 1.0195092, + "balance_loss_mlp": 1.04545975, + "epoch": 0.3519615211182925, + "flos": 13115295918720.0, + "grad_norm": 1.6680659623481517, + "language_loss": 0.8122859, + "learning_rate": 3.0076353313647686e-06, + "loss": 0.83388329, + "num_input_tokens_seen": 125920570, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5854, + "time_per_iteration": 2.4702916145324707 + }, + { + "auxiliary_loss_clip": 0.01122667, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.04566765, + "epoch": 0.35202164437096045, + "flos": 19135144984320.0, + "grad_norm": 1.6003148952985655, + "language_loss": 0.73131359, + "learning_rate": 3.0072988913571666e-06, + "loss": 0.75284624, + "num_input_tokens_seen": 125939800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 5855, + "time_per_iteration": 2.4895823001861572 + }, + { + "auxiliary_loss_clip": 0.01120527, + "auxiliary_loss_mlp": 0.01039285, + "balance_loss_clip": 1.02549887, + "balance_loss_mlp": 1.04334307, + "epoch": 0.3520817676236284, + "flos": 26542187343360.0, + "grad_norm": 3.701560840262617, + "language_loss": 0.70894778, + "learning_rate": 3.006962413152691e-06, + "loss": 0.73054588, + "num_input_tokens_seen": 125958720, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5856, + "time_per_iteration": 2.5133585929870605 + }, + { + "auxiliary_loss_clip": 0.01126422, + "auxiliary_loss_mlp": 0.01044153, + "balance_loss_clip": 1.02881038, + "balance_loss_mlp": 1.0456897, + "epoch": 0.3521418908762964, + "flos": 44893472803200.0, + "grad_norm": 1.8086114170356375, + "language_loss": 0.60915685, + "learning_rate": 3.0066258967640987e-06, + "loss": 0.63086259, + "num_input_tokens_seen": 125984310, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80859375, + "step": 5857, + "time_per_iteration": 2.723238468170166 + }, + { + "auxiliary_loss_clip": 0.01123346, + "auxiliary_loss_mlp": 0.01039329, + "balance_loss_clip": 1.02434421, + "balance_loss_mlp": 1.04425693, + "epoch": 0.3522020141289644, + "flos": 20187398931840.0, + "grad_norm": 1.754440516271971, + "language_loss": 0.73341751, + "learning_rate": 3.006289342204152e-06, + "loss": 0.75504428, + "num_input_tokens_seen": 126002410, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5858, + "time_per_iteration": 2.509556293487549 + }, + { + "auxiliary_loss_clip": 0.01125415, + "auxiliary_loss_mlp": 0.01041448, + "balance_loss_clip": 1.02720821, + "balance_loss_mlp": 1.04428148, + "epoch": 0.35226213738163237, + "flos": 27563917708800.0, + "grad_norm": 1.4710047028379252, + "language_loss": 0.76090813, + "learning_rate": 3.0059527494856126e-06, + "loss": 0.7825768, + "num_input_tokens_seen": 126022490, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8125, + "step": 5859, + "time_per_iteration": 2.584312677383423 + }, + { + "auxiliary_loss_clip": 0.0113226, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.0230875, + "balance_loss_mlp": 1.04828274, + "epoch": 0.35232226063430033, + "flos": 22966310632320.0, + "grad_norm": 1.6944630123418771, + "language_loss": 0.71475387, + "learning_rate": 3.0056161186212435e-06, + "loss": 0.73646474, + "num_input_tokens_seen": 126042895, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.83984375, + "step": 5860, + "time_per_iteration": 2.5120623111724854 + }, + { + "auxiliary_loss_clip": 0.01125655, + "auxiliary_loss_mlp": 0.0104098, + "balance_loss_clip": 1.02506578, + "balance_loss_mlp": 1.04208136, + "epoch": 0.3523823838869683, + "flos": 19168290259200.0, + "grad_norm": 2.10777684168558, + "language_loss": 0.6624974, + "learning_rate": 3.005279449623811e-06, + "loss": 0.68416381, + "num_input_tokens_seen": 126060130, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 5861, + "time_per_iteration": 2.4927096366882324 + }, + { + "auxiliary_loss_clip": 0.01123555, + "auxiliary_loss_mlp": 0.0103431, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.04497313, + "epoch": 0.35244250713963626, + "flos": 17930988420480.0, + "grad_norm": 2.1064993181157843, + "language_loss": 0.66780227, + "learning_rate": 3.0049427425060815e-06, + "loss": 0.68938088, + "num_input_tokens_seen": 126077850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5862, + "time_per_iteration": 2.4275379180908203 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01037294, + "balance_loss_clip": 1.02132034, + "balance_loss_mlp": 1.04420304, + "epoch": 0.35250263039230423, + "flos": 21432529935360.0, + "grad_norm": 2.0193315360348842, + "language_loss": 0.77049166, + "learning_rate": 3.0046059972808215e-06, + "loss": 0.79211187, + "num_input_tokens_seen": 126095985, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 5863, + "time_per_iteration": 2.504391670227051 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01034667, + "balance_loss_clip": 1.02027822, + "balance_loss_mlp": 1.04449666, + "epoch": 0.3525627536449722, + "flos": 27416863428480.0, + "grad_norm": 2.7341123556359297, + "language_loss": 0.75018549, + "learning_rate": 3.0042692139608024e-06, + "loss": 0.77178371, + "num_input_tokens_seen": 126116070, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 5864, + "time_per_iteration": 2.4962751865386963 + }, + { + "auxiliary_loss_clip": 0.01123376, + "auxiliary_loss_mlp": 0.0104564, + "balance_loss_clip": 1.03110838, + "balance_loss_mlp": 1.04376507, + "epoch": 0.35262287689764016, + "flos": 24789818430720.0, + "grad_norm": 1.9972182581193567, + "language_loss": 0.79051632, + "learning_rate": 3.003932392558793e-06, + "loss": 0.81220651, + "num_input_tokens_seen": 126135205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5865, + "time_per_iteration": 2.5369789600372314 + }, + { + "auxiliary_loss_clip": 0.01130515, + "auxiliary_loss_mlp": 0.01045214, + "balance_loss_clip": 1.02901387, + "balance_loss_mlp": 1.04835618, + "epoch": 0.3526830001503081, + "flos": 17821604528640.0, + "grad_norm": 1.8375125007543296, + "language_loss": 0.81622374, + "learning_rate": 3.0035955330875677e-06, + "loss": 0.8379811, + "num_input_tokens_seen": 126151895, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8203125, + "step": 5866, + "time_per_iteration": 2.497587203979492 + }, + { + "auxiliary_loss_clip": 0.01131205, + "auxiliary_loss_mlp": 0.01037377, + "balance_loss_clip": 1.02081871, + "balance_loss_mlp": 1.04493296, + "epoch": 0.3527431234029761, + "flos": 18078114528000.0, + "grad_norm": 2.1796505180833696, + "language_loss": 0.84552217, + "learning_rate": 3.0032586355598986e-06, + "loss": 0.867208, + "num_input_tokens_seen": 126168515, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.86328125, + "step": 5867, + "time_per_iteration": 2.5673649311065674 + }, + { + "auxiliary_loss_clip": 0.01126594, + "auxiliary_loss_mlp": 0.01043148, + "balance_loss_clip": 1.02764452, + "balance_loss_mlp": 1.04441357, + "epoch": 0.35280324665564405, + "flos": 19427350124160.0, + "grad_norm": 2.2018810166756873, + "language_loss": 0.74618357, + "learning_rate": 3.0029216999885613e-06, + "loss": 0.76788092, + "num_input_tokens_seen": 126186460, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 5868, + "time_per_iteration": 2.4571762084960938 + }, + { + "auxiliary_loss_clip": 0.01127392, + "auxiliary_loss_mlp": 0.01039387, + "balance_loss_clip": 1.02433038, + "balance_loss_mlp": 1.04489541, + "epoch": 0.352863369908312, + "flos": 21504027957120.0, + "grad_norm": 2.0366485396940615, + "language_loss": 0.61648643, + "learning_rate": 3.0025847263863327e-06, + "loss": 0.63815421, + "num_input_tokens_seen": 126206170, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 5869, + "time_per_iteration": 2.5125019550323486 + }, + { + "auxiliary_loss_clip": 0.01124688, + "auxiliary_loss_mlp": 0.01042493, + "balance_loss_clip": 1.02690625, + "balance_loss_mlp": 1.04286385, + "epoch": 0.35292349316098, + "flos": 22309504490880.0, + "grad_norm": 2.290977208251557, + "language_loss": 0.74328029, + "learning_rate": 3.0022477147659917e-06, + "loss": 0.76495212, + "num_input_tokens_seen": 126225605, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8203125, + "step": 5870, + "time_per_iteration": 2.4636306762695312 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04412317, + "epoch": 0.352983616413648, + "flos": 33109745967360.0, + "grad_norm": 1.44010977521146, + "language_loss": 0.71498513, + "learning_rate": 3.001910665140316e-06, + "loss": 0.73659372, + "num_input_tokens_seen": 126250230, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8046875, + "step": 5871, + "time_per_iteration": 2.629002094268799 + }, + { + "auxiliary_loss_clip": 0.01120822, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.01999545, + "balance_loss_mlp": 1.04340768, + "epoch": 0.35304373966631597, + "flos": 18696603836160.0, + "grad_norm": 2.215441176085892, + "language_loss": 0.74219513, + "learning_rate": 3.0015735775220873e-06, + "loss": 0.76374042, + "num_input_tokens_seen": 126268315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5872, + "time_per_iteration": 2.4672691822052 + }, + { + "auxiliary_loss_clip": 0.01121667, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02291727, + "balance_loss_mlp": 1.04295182, + "epoch": 0.35310386291898394, + "flos": 23364954748800.0, + "grad_norm": 1.6120105579455812, + "language_loss": 0.82492435, + "learning_rate": 3.001236451924089e-06, + "loss": 0.84651101, + "num_input_tokens_seen": 126288390, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5873, + "time_per_iteration": 2.549706220626831 + }, + { + "auxiliary_loss_clip": 0.01128213, + "auxiliary_loss_mlp": 0.01044661, + "balance_loss_clip": 1.02800715, + "balance_loss_mlp": 1.04399252, + "epoch": 0.3531639861716519, + "flos": 24461954064000.0, + "grad_norm": 1.8495868157058504, + "language_loss": 0.6583339, + "learning_rate": 3.000899288359104e-06, + "loss": 0.68006265, + "num_input_tokens_seen": 126305750, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.84375, + "step": 5874, + "time_per_iteration": 2.4949634075164795 + }, + { + "auxiliary_loss_clip": 0.01044147, + "auxiliary_loss_mlp": 0.01006984, + "balance_loss_clip": 1.00510025, + "balance_loss_mlp": 1.01915693, + "epoch": 0.35322410942431987, + "flos": 70312446881280.0, + "grad_norm": 0.771003921858337, + "language_loss": 0.61583531, + "learning_rate": 3.000562086839917e-06, + "loss": 0.63634658, + "num_input_tokens_seen": 126362495, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.25, + "step": 5875, + "time_per_iteration": 2.9931485652923584 + }, + { + "auxiliary_loss_clip": 0.01124819, + "auxiliary_loss_mlp": 0.01043824, + "balance_loss_clip": 1.02995443, + "balance_loss_mlp": 1.04544568, + "epoch": 0.35328423267698783, + "flos": 19820894509440.0, + "grad_norm": 1.6836782364007539, + "language_loss": 0.800933, + "learning_rate": 3.0002248473793163e-06, + "loss": 0.82261944, + "num_input_tokens_seen": 126378320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 5876, + "time_per_iteration": 2.443178415298462 + }, + { + "auxiliary_loss_clip": 0.01041911, + "auxiliary_loss_mlp": 0.01006634, + "balance_loss_clip": 1.00477409, + "balance_loss_mlp": 1.01663578, + "epoch": 0.3533443559296558, + "flos": 60826356391680.0, + "grad_norm": 1.6287450036197537, + "language_loss": 0.5674026, + "learning_rate": 2.999887569990088e-06, + "loss": 0.587888, + "num_input_tokens_seen": 126442735, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.25195312, + "step": 5877, + "time_per_iteration": 3.1782116889953613 + }, + { + "auxiliary_loss_clip": 0.01124291, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.04401922, + "epoch": 0.35340447918232376, + "flos": 24755775315840.0, + "grad_norm": 1.5579095187110108, + "language_loss": 0.71649593, + "learning_rate": 2.999550254685024e-06, + "loss": 0.73804337, + "num_input_tokens_seen": 126463090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.80078125, + "step": 5878, + "time_per_iteration": 2.4984474182128906 + }, + { + "auxiliary_loss_clip": 0.01123007, + "auxiliary_loss_mlp": 0.0103937, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04198527, + "epoch": 0.3534646024349917, + "flos": 21796304924160.0, + "grad_norm": 1.9384917614544617, + "language_loss": 0.78492844, + "learning_rate": 2.9992129014769136e-06, + "loss": 0.80655217, + "num_input_tokens_seen": 126482105, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8125, + "step": 5879, + "time_per_iteration": 2.5369913578033447 + }, + { + "auxiliary_loss_clip": 0.01126898, + "auxiliary_loss_mlp": 0.01045347, + "balance_loss_clip": 1.02870536, + "balance_loss_mlp": 1.04373121, + "epoch": 0.3535247256876597, + "flos": 20012119539840.0, + "grad_norm": 2.0656781659104917, + "language_loss": 0.63695049, + "learning_rate": 2.9988755103785493e-06, + "loss": 0.65867293, + "num_input_tokens_seen": 126502125, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83203125, + "step": 5880, + "time_per_iteration": 2.457787036895752 + }, + { + "auxiliary_loss_clip": 0.01125585, + "auxiliary_loss_mlp": 0.01036241, + "balance_loss_clip": 1.02078009, + "balance_loss_mlp": 1.04375386, + "epoch": 0.35358484894032766, + "flos": 18187929383040.0, + "grad_norm": 3.125568384757795, + "language_loss": 0.65818816, + "learning_rate": 2.998538081402727e-06, + "loss": 0.67980647, + "num_input_tokens_seen": 126521950, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 5881, + "time_per_iteration": 2.5198867321014404 + }, + { + "auxiliary_loss_clip": 0.01119138, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04197288, + "epoch": 0.3536449721929956, + "flos": 22820369673600.0, + "grad_norm": 1.3882047203281038, + "language_loss": 0.75280428, + "learning_rate": 2.998200614562239e-06, + "loss": 0.77431458, + "num_input_tokens_seen": 126542445, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5882, + "time_per_iteration": 2.4526872634887695 + }, + { + "auxiliary_loss_clip": 0.01126623, + "auxiliary_loss_mlp": 0.01037746, + "balance_loss_clip": 1.02266037, + "balance_loss_mlp": 1.04543018, + "epoch": 0.3537050954456636, + "flos": 26432336574720.0, + "grad_norm": 2.123888211837838, + "language_loss": 0.70349854, + "learning_rate": 2.9978631098698847e-06, + "loss": 0.72514224, + "num_input_tokens_seen": 126560690, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 5883, + "time_per_iteration": 2.538865566253662 + }, + { + "auxiliary_loss_clip": 0.01129519, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.0228982, + "balance_loss_mlp": 1.04584253, + "epoch": 0.3537652186983316, + "flos": 17197153562880.0, + "grad_norm": 2.009195754637657, + "language_loss": 0.78500903, + "learning_rate": 2.9975255673384614e-06, + "loss": 0.80668598, + "num_input_tokens_seen": 126577620, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 5884, + "time_per_iteration": 2.4410510063171387 + }, + { + "auxiliary_loss_clip": 0.0112138, + "auxiliary_loss_mlp": 0.01032576, + "balance_loss_clip": 1.01901007, + "balance_loss_mlp": 1.04336667, + "epoch": 0.3538253419509996, + "flos": 19536769929600.0, + "grad_norm": 1.8922441591552446, + "language_loss": 0.75478536, + "learning_rate": 2.9971879869807673e-06, + "loss": 0.77632499, + "num_input_tokens_seen": 126596235, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.78125, + "step": 5885, + "time_per_iteration": 2.555816650390625 + }, + { + "auxiliary_loss_clip": 0.01127447, + "auxiliary_loss_mlp": 0.01042213, + "balance_loss_clip": 1.02666783, + "balance_loss_mlp": 1.04478371, + "epoch": 0.35388546520366754, + "flos": 12128578335360.0, + "grad_norm": 2.2081606315958635, + "language_loss": 0.82679224, + "learning_rate": 2.996850368809606e-06, + "loss": 0.84848893, + "num_input_tokens_seen": 126612830, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.82421875, + "step": 5886, + "time_per_iteration": 2.482151985168457 + }, + { + "auxiliary_loss_clip": 0.01124743, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.01717782, + "balance_loss_mlp": 1.04533887, + "epoch": 0.3539455884563355, + "flos": 19678149861120.0, + "grad_norm": 2.4580910750403775, + "language_loss": 0.78723359, + "learning_rate": 2.9965127128377787e-06, + "loss": 0.80880398, + "num_input_tokens_seen": 126630910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 5887, + "time_per_iteration": 5.388309001922607 + }, + { + "auxiliary_loss_clip": 0.01122251, + "auxiliary_loss_mlp": 0.0104141, + "balance_loss_clip": 1.0269978, + "balance_loss_mlp": 1.04226518, + "epoch": 0.35400571170900347, + "flos": 18072045129600.0, + "grad_norm": 3.1093010737907867, + "language_loss": 0.65404654, + "learning_rate": 2.996175019078089e-06, + "loss": 0.67568314, + "num_input_tokens_seen": 126648365, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 5888, + "time_per_iteration": 2.4438626766204834 + }, + { + "auxiliary_loss_clip": 0.01123271, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02248812, + "balance_loss_mlp": 1.04373193, + "epoch": 0.35406583496167143, + "flos": 26068058795520.0, + "grad_norm": 1.6702882106954304, + "language_loss": 0.76662588, + "learning_rate": 2.9958372875433437e-06, + "loss": 0.78821993, + "num_input_tokens_seen": 126667500, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5889, + "time_per_iteration": 2.503023624420166 + }, + { + "auxiliary_loss_clip": 0.01125083, + "auxiliary_loss_mlp": 0.01037766, + "balance_loss_clip": 1.02329397, + "balance_loss_mlp": 1.0469135, + "epoch": 0.3541259582143394, + "flos": 19792453916160.0, + "grad_norm": 1.7418080185903937, + "language_loss": 0.80142188, + "learning_rate": 2.9954995182463478e-06, + "loss": 0.82305038, + "num_input_tokens_seen": 126686820, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 5890, + "time_per_iteration": 2.4669902324676514 + }, + { + "auxiliary_loss_clip": 0.01118725, + "auxiliary_loss_mlp": 0.01034555, + "balance_loss_clip": 1.02204418, + "balance_loss_mlp": 1.04123974, + "epoch": 0.35418608146700736, + "flos": 24022084112640.0, + "grad_norm": 1.4765808553545194, + "language_loss": 0.79590207, + "learning_rate": 2.99516171119991e-06, + "loss": 0.81743479, + "num_input_tokens_seen": 126706965, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7734375, + "step": 5891, + "time_per_iteration": 2.491048812866211 + }, + { + "auxiliary_loss_clip": 0.01123501, + "auxiliary_loss_mlp": 0.01037192, + "balance_loss_clip": 1.02260685, + "balance_loss_mlp": 1.04425383, + "epoch": 0.35424620471967533, + "flos": 12385770693120.0, + "grad_norm": 2.0747162768055616, + "language_loss": 0.73339593, + "learning_rate": 2.9948238664168415e-06, + "loss": 0.7550028, + "num_input_tokens_seen": 126724015, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5892, + "time_per_iteration": 2.497422695159912 + }, + { + "auxiliary_loss_clip": 0.01124613, + "auxiliary_loss_mlp": 0.01038788, + "balance_loss_clip": 1.02425075, + "balance_loss_mlp": 1.04473233, + "epoch": 0.3543063279723433, + "flos": 19673624747520.0, + "grad_norm": 1.9338165898472526, + "language_loss": 0.66916019, + "learning_rate": 2.9944859839099518e-06, + "loss": 0.69079423, + "num_input_tokens_seen": 126737565, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 5893, + "time_per_iteration": 2.4516420364379883 + }, + { + "auxiliary_loss_clip": 0.01123079, + "auxiliary_loss_mlp": 0.01037638, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.04405212, + "epoch": 0.35436645122501126, + "flos": 21909208348800.0, + "grad_norm": 1.878049090913109, + "language_loss": 0.69472313, + "learning_rate": 2.9941480636920533e-06, + "loss": 0.71633029, + "num_input_tokens_seen": 126756095, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5894, + "time_per_iteration": 2.479174852371216 + }, + { + "auxiliary_loss_clip": 0.01123499, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.04524636, + "epoch": 0.3544265744776792, + "flos": 21719527603200.0, + "grad_norm": 1.6954645527360779, + "language_loss": 0.74891931, + "learning_rate": 2.9938101057759615e-06, + "loss": 0.77048504, + "num_input_tokens_seen": 126775455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 5895, + "time_per_iteration": 2.4786908626556396 + }, + { + "auxiliary_loss_clip": 0.01122907, + "auxiliary_loss_mlp": 0.01037799, + "balance_loss_clip": 1.02366102, + "balance_loss_mlp": 1.04388869, + "epoch": 0.3544866977303472, + "flos": 21213223447680.0, + "grad_norm": 2.0548310630504854, + "language_loss": 0.83688253, + "learning_rate": 2.993472110174491e-06, + "loss": 0.85848963, + "num_input_tokens_seen": 126792320, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5896, + "time_per_iteration": 2.4765214920043945 + }, + { + "auxiliary_loss_clip": 0.01122608, + "auxiliary_loss_mlp": 0.01048408, + "balance_loss_clip": 1.03348279, + "balance_loss_mlp": 1.0444181, + "epoch": 0.35454682098301515, + "flos": 29311402371840.0, + "grad_norm": 1.6634726813042469, + "language_loss": 0.70031154, + "learning_rate": 2.9931340769004576e-06, + "loss": 0.7220217, + "num_input_tokens_seen": 126813680, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 5897, + "time_per_iteration": 2.5142548084259033 + }, + { + "auxiliary_loss_clip": 0.01121754, + "auxiliary_loss_mlp": 0.01038358, + "balance_loss_clip": 1.02430916, + "balance_loss_mlp": 1.04337025, + "epoch": 0.3546069442356832, + "flos": 24316587722880.0, + "grad_norm": 1.7331024671064506, + "language_loss": 0.82091749, + "learning_rate": 2.9927960059666816e-06, + "loss": 0.84251857, + "num_input_tokens_seen": 126834395, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 5898, + "time_per_iteration": 2.4900712966918945 + }, + { + "auxiliary_loss_clip": 0.0112068, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0234853, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35466706748835114, + "flos": 22857285876480.0, + "grad_norm": 1.4876974136883365, + "language_loss": 0.73901182, + "learning_rate": 2.9924578973859804e-06, + "loss": 0.76058269, + "num_input_tokens_seen": 126855145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 5899, + "time_per_iteration": 2.498659133911133 + }, + { + "auxiliary_loss_clip": 0.01121982, + "auxiliary_loss_mlp": 0.01042838, + "balance_loss_clip": 1.02825308, + "balance_loss_mlp": 1.04316258, + "epoch": 0.3547271907410191, + "flos": 28330107742080.0, + "grad_norm": 1.69682390123668, + "language_loss": 0.79345262, + "learning_rate": 2.9921197511711763e-06, + "loss": 0.81510079, + "num_input_tokens_seen": 126873790, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 5900, + "time_per_iteration": 2.548612594604492 + }, + { + "auxiliary_loss_clip": 0.01123598, + "auxiliary_loss_mlp": 0.01040285, + "balance_loss_clip": 1.02556252, + "balance_loss_mlp": 1.04530048, + "epoch": 0.35478731399368707, + "flos": 23514092017920.0, + "grad_norm": 1.7758743329418227, + "language_loss": 0.81637204, + "learning_rate": 2.991781567335093e-06, + "loss": 0.83801091, + "num_input_tokens_seen": 126892865, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 5901, + "time_per_iteration": 2.6031999588012695 + }, + { + "auxiliary_loss_clip": 0.01127681, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02063251, + "balance_loss_mlp": 1.04535294, + "epoch": 0.35484743724635504, + "flos": 18624315715200.0, + "grad_norm": 1.92677562296577, + "language_loss": 0.75667071, + "learning_rate": 2.9914433458905525e-06, + "loss": 0.77829683, + "num_input_tokens_seen": 126911935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.82421875, + "step": 5902, + "time_per_iteration": 2.528026819229126 + }, + { + "auxiliary_loss_clip": 0.0112195, + "auxiliary_loss_mlp": 0.01036748, + "balance_loss_clip": 1.02359962, + "balance_loss_mlp": 1.04320014, + "epoch": 0.354907560499023, + "flos": 17384499924480.0, + "grad_norm": 1.7304108811682997, + "language_loss": 0.70582771, + "learning_rate": 2.991105086850381e-06, + "loss": 0.72741467, + "num_input_tokens_seen": 126930040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7890625, + "step": 5903, + "time_per_iteration": 2.423454999923706 + }, + { + "auxiliary_loss_clip": 0.01124223, + "auxiliary_loss_mlp": 0.01034819, + "balance_loss_clip": 1.0205555, + "balance_loss_mlp": 1.04234982, + "epoch": 0.35496768375169097, + "flos": 19208546426880.0, + "grad_norm": 2.52210089781831, + "language_loss": 0.74574983, + "learning_rate": 2.9907667902274053e-06, + "loss": 0.76734024, + "num_input_tokens_seen": 126948390, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8203125, + "step": 5904, + "time_per_iteration": 2.462024688720703 + }, + { + "auxiliary_loss_clip": 0.0112423, + "auxiliary_loss_mlp": 0.01040901, + "balance_loss_clip": 1.02649426, + "balance_loss_mlp": 1.04362941, + "epoch": 0.35502780700435893, + "flos": 18332792933760.0, + "grad_norm": 2.0389703534000443, + "language_loss": 0.78855121, + "learning_rate": 2.9904284560344536e-06, + "loss": 0.81020248, + "num_input_tokens_seen": 126964905, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8046875, + "step": 5905, + "time_per_iteration": 2.418665885925293 + }, + { + "auxiliary_loss_clip": 0.0111773, + "auxiliary_loss_mlp": 0.01031377, + "balance_loss_clip": 1.0190388, + "balance_loss_mlp": 1.04383469, + "epoch": 0.3550879302570269, + "flos": 15448555578240.0, + "grad_norm": 2.1398902938273547, + "language_loss": 0.72515827, + "learning_rate": 2.990090084284356e-06, + "loss": 0.74664938, + "num_input_tokens_seen": 126982000, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 5906, + "time_per_iteration": 2.441795825958252 + }, + { + "auxiliary_loss_clip": 0.01128267, + "auxiliary_loss_mlp": 0.01037607, + "balance_loss_clip": 1.02187109, + "balance_loss_mlp": 1.04545534, + "epoch": 0.35514805350969486, + "flos": 21979197999360.0, + "grad_norm": 2.0230910533888107, + "language_loss": 0.74762344, + "learning_rate": 2.9897516749899426e-06, + "loss": 0.7692821, + "num_input_tokens_seen": 126998390, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.828125, + "step": 5907, + "time_per_iteration": 2.4404122829437256 + }, + { + "auxiliary_loss_clip": 0.01123497, + "auxiliary_loss_mlp": 0.01033795, + "balance_loss_clip": 1.01939988, + "balance_loss_mlp": 1.04492426, + "epoch": 0.3552081767623628, + "flos": 29861949104640.0, + "grad_norm": 1.7742327577799557, + "language_loss": 0.75751841, + "learning_rate": 2.989413228164047e-06, + "loss": 0.77909136, + "num_input_tokens_seen": 127020220, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5908, + "time_per_iteration": 2.5631895065307617 + }, + { + "auxiliary_loss_clip": 0.01126393, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01961696, + "balance_loss_mlp": 1.04734707, + "epoch": 0.3552683000150308, + "flos": 26432264747520.0, + "grad_norm": 1.7057235578436956, + "language_loss": 0.68026733, + "learning_rate": 2.989074743819502e-06, + "loss": 0.70187092, + "num_input_tokens_seen": 127038585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 5909, + "time_per_iteration": 2.480511426925659 + }, + { + "auxiliary_loss_clip": 0.0112022, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.01937413, + "balance_loss_mlp": 1.04523396, + "epoch": 0.35532842326769876, + "flos": 19785989468160.0, + "grad_norm": 3.5777269988287297, + "language_loss": 0.78628188, + "learning_rate": 2.988736221969144e-06, + "loss": 0.8078106, + "num_input_tokens_seen": 127056215, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 5910, + "time_per_iteration": 2.4763131141662598 + }, + { + "auxiliary_loss_clip": 0.01128543, + "auxiliary_loss_mlp": 0.01040834, + "balance_loss_clip": 1.02545595, + "balance_loss_mlp": 1.04625309, + "epoch": 0.3553885465203668, + "flos": 17239277237760.0, + "grad_norm": 1.525011794663279, + "language_loss": 0.70639479, + "learning_rate": 2.98839766262581e-06, + "loss": 0.72808856, + "num_input_tokens_seen": 127075825, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5911, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01119575, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.04294884, + "epoch": 0.35544866977303474, + "flos": 14934350430720.0, + "grad_norm": 1.9668748220600272, + "language_loss": 0.87014282, + "learning_rate": 2.9880590658023366e-06, + "loss": 0.89169508, + "num_input_tokens_seen": 127091205, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 5912, + "time_per_iteration": 2.461251735687256 + }, + { + "auxiliary_loss_clip": 0.01123002, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.018556, + "balance_loss_mlp": 1.04507196, + "epoch": 0.3555087930257027, + "flos": 19756040503680.0, + "grad_norm": 1.7619620740638822, + "language_loss": 0.7701745, + "learning_rate": 2.9877204315115646e-06, + "loss": 0.79172838, + "num_input_tokens_seen": 127109210, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 5913, + "time_per_iteration": 2.4517738819122314 + }, + { + "auxiliary_loss_clip": 0.01124528, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02001143, + "balance_loss_mlp": 1.04793298, + "epoch": 0.3555689162783707, + "flos": 21068252156160.0, + "grad_norm": 1.3300117090522248, + "language_loss": 0.82507938, + "learning_rate": 2.9873817597663353e-06, + "loss": 0.84666395, + "num_input_tokens_seen": 127128400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 5914, + "time_per_iteration": 2.4964141845703125 + }, + { + "auxiliary_loss_clip": 0.01124534, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.01771307, + "balance_loss_mlp": 1.04573739, + "epoch": 0.35562903953103864, + "flos": 33069633454080.0, + "grad_norm": 2.1657623831524604, + "language_loss": 0.70703268, + "learning_rate": 2.98704305057949e-06, + "loss": 0.72859794, + "num_input_tokens_seen": 127149965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 5915, + "time_per_iteration": 2.5425658226013184 + }, + { + "auxiliary_loss_clip": 0.01120767, + "auxiliary_loss_mlp": 0.01038436, + "balance_loss_clip": 1.0249182, + "balance_loss_mlp": 1.04248476, + "epoch": 0.3556891627837066, + "flos": 20557853850240.0, + "grad_norm": 1.7489130528457595, + "language_loss": 0.76365829, + "learning_rate": 2.9867043039638737e-06, + "loss": 0.78525031, + "num_input_tokens_seen": 127169865, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 5916, + "time_per_iteration": 2.49629545211792 + }, + { + "auxiliary_loss_clip": 0.01128234, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02360404, + "balance_loss_mlp": 1.04853928, + "epoch": 0.35574928603637457, + "flos": 20703327932160.0, + "grad_norm": 1.96232440030472, + "language_loss": 0.88380635, + "learning_rate": 2.986365519932332e-06, + "loss": 0.90545923, + "num_input_tokens_seen": 127188075, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.796875, + "step": 5917, + "time_per_iteration": 2.4549498558044434 + }, + { + "auxiliary_loss_clip": 0.01123557, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01144493, + "balance_loss_mlp": 1.04562521, + "epoch": 0.35580940928904253, + "flos": 15194595444480.0, + "grad_norm": 2.0473051476373048, + "language_loss": 0.74389327, + "learning_rate": 2.98602669849771e-06, + "loss": 0.76538098, + "num_input_tokens_seen": 127206065, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5918, + "time_per_iteration": 2.448164701461792 + }, + { + "auxiliary_loss_clip": 0.01039303, + "auxiliary_loss_mlp": 0.01015071, + "balance_loss_clip": 1.01344931, + "balance_loss_mlp": 1.01430607, + "epoch": 0.3558695325417105, + "flos": 58639145431680.0, + "grad_norm": 1.0267040132589962, + "language_loss": 0.63732457, + "learning_rate": 2.985687839672857e-06, + "loss": 0.65786839, + "num_input_tokens_seen": 127257885, + "router_z_loss_clip": 0.01623535, + "router_z_loss_mlp": 0.25, + "step": 5919, + "time_per_iteration": 2.837815999984741 + }, + { + "auxiliary_loss_clip": 0.01124878, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01805615, + "balance_loss_mlp": 1.04376245, + "epoch": 0.35592965579437846, + "flos": 22018233104640.0, + "grad_norm": 2.8747663216478503, + "language_loss": 0.73868048, + "learning_rate": 2.9853489434706223e-06, + "loss": 0.76025695, + "num_input_tokens_seen": 127275550, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80859375, + "step": 5920, + "time_per_iteration": 2.4837357997894287 + }, + { + "auxiliary_loss_clip": 0.0112079, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02015972, + "balance_loss_mlp": 1.04353166, + "epoch": 0.35598977904704643, + "flos": 23367684182400.0, + "grad_norm": 1.659561193633535, + "language_loss": 0.77124226, + "learning_rate": 2.985010009903857e-06, + "loss": 0.79279101, + "num_input_tokens_seen": 127295110, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5921, + "time_per_iteration": 2.461014986038208 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.01968277, + "balance_loss_mlp": 1.04409981, + "epoch": 0.3560499022997144, + "flos": 17785334770560.0, + "grad_norm": 3.1644779785561563, + "language_loss": 0.67710596, + "learning_rate": 2.9846710389854133e-06, + "loss": 0.69866371, + "num_input_tokens_seen": 127312865, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7890625, + "step": 5922, + "time_per_iteration": 2.495504140853882 + }, + { + "auxiliary_loss_clip": 0.01122686, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01567268, + "balance_loss_mlp": 1.04373431, + "epoch": 0.35611002555238236, + "flos": 20740459616640.0, + "grad_norm": 1.9745978513449503, + "language_loss": 0.79269004, + "learning_rate": 2.9843320307281454e-06, + "loss": 0.81421471, + "num_input_tokens_seen": 127331710, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 5923, + "time_per_iteration": 2.4515416622161865 + }, + { + "auxiliary_loss_clip": 0.01124058, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02631271, + "balance_loss_mlp": 1.04502511, + "epoch": 0.3561701488050504, + "flos": 19462219251840.0, + "grad_norm": 1.7698063934253627, + "language_loss": 0.85475516, + "learning_rate": 2.983992985144908e-06, + "loss": 0.87638795, + "num_input_tokens_seen": 127350950, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7890625, + "step": 5924, + "time_per_iteration": 2.4790685176849365 + }, + { + "auxiliary_loss_clip": 0.01121235, + "auxiliary_loss_mlp": 0.01037826, + "balance_loss_clip": 1.02344394, + "balance_loss_mlp": 1.04368067, + "epoch": 0.35623027205771834, + "flos": 30774942023040.0, + "grad_norm": 1.844353158814239, + "language_loss": 0.77513188, + "learning_rate": 2.9836539022485578e-06, + "loss": 0.79672253, + "num_input_tokens_seen": 127369385, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 5925, + "time_per_iteration": 2.5064613819122314 + }, + { + "auxiliary_loss_clip": 0.01119102, + "auxiliary_loss_mlp": 0.01043971, + "balance_loss_clip": 1.0301789, + "balance_loss_mlp": 1.04067063, + "epoch": 0.3562903953103863, + "flos": 16981079299200.0, + "grad_norm": 1.7016119178915972, + "language_loss": 0.75874609, + "learning_rate": 2.9833147820519535e-06, + "loss": 0.78037679, + "num_input_tokens_seen": 127386965, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5926, + "time_per_iteration": 2.451852798461914 + }, + { + "auxiliary_loss_clip": 0.01125239, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02194762, + "balance_loss_mlp": 1.04408717, + "epoch": 0.3563505185630543, + "flos": 23839837482240.0, + "grad_norm": 2.0486133546267737, + "language_loss": 0.69321811, + "learning_rate": 2.9829756245679544e-06, + "loss": 0.71483439, + "num_input_tokens_seen": 127406075, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 5927, + "time_per_iteration": 2.4770915508270264 + }, + { + "auxiliary_loss_clip": 0.01119921, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.0237366, + "balance_loss_mlp": 1.0428226, + "epoch": 0.35641064181572224, + "flos": 22273450214400.0, + "grad_norm": 1.8762651107969224, + "language_loss": 0.79633021, + "learning_rate": 2.9826364298094212e-06, + "loss": 0.81789798, + "num_input_tokens_seen": 127425350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 5928, + "time_per_iteration": 4.019433259963989 + }, + { + "auxiliary_loss_clip": 0.01120965, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.02581263, + "balance_loss_mlp": 1.04338682, + "epoch": 0.3564707650683902, + "flos": 23001251587200.0, + "grad_norm": 1.4128421638180557, + "language_loss": 0.81568098, + "learning_rate": 2.982297197789215e-06, + "loss": 0.83728826, + "num_input_tokens_seen": 127446335, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5929, + "time_per_iteration": 3.869184970855713 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02172661, + "balance_loss_mlp": 1.0402571, + "epoch": 0.35653088832105817, + "flos": 14684268965760.0, + "grad_norm": 1.7650523310611956, + "language_loss": 0.69981778, + "learning_rate": 2.981957928520201e-06, + "loss": 0.7213279, + "num_input_tokens_seen": 127462795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 5930, + "time_per_iteration": 2.418992519378662 + }, + { + "auxiliary_loss_clip": 0.01123929, + "auxiliary_loss_mlp": 0.01043162, + "balance_loss_clip": 1.02858853, + "balance_loss_mlp": 1.04340863, + "epoch": 0.35659101157372614, + "flos": 23477068074240.0, + "grad_norm": 1.9164187115059894, + "language_loss": 0.67766178, + "learning_rate": 2.981618622015244e-06, + "loss": 0.69933271, + "num_input_tokens_seen": 127482675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 5931, + "time_per_iteration": 2.4688074588775635 + }, + { + "auxiliary_loss_clip": 0.01121557, + "auxiliary_loss_mlp": 0.01033991, + "balance_loss_clip": 1.0203712, + "balance_loss_mlp": 1.04403675, + "epoch": 0.3566511348263941, + "flos": 26578672583040.0, + "grad_norm": 1.736290109138699, + "language_loss": 0.67451715, + "learning_rate": 2.981279278287211e-06, + "loss": 0.69607264, + "num_input_tokens_seen": 127502275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5932, + "time_per_iteration": 2.4908299446105957 + }, + { + "auxiliary_loss_clip": 0.01118994, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.04304647, + "epoch": 0.35671125807906207, + "flos": 13115008609920.0, + "grad_norm": 2.602576254435761, + "language_loss": 0.7878592, + "learning_rate": 2.980939897348969e-06, + "loss": 0.8093667, + "num_input_tokens_seen": 127520195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 5933, + "time_per_iteration": 2.442464590072632 + }, + { + "auxiliary_loss_clip": 0.01122141, + "auxiliary_loss_mlp": 0.01042886, + "balance_loss_clip": 1.02893806, + "balance_loss_mlp": 1.04176354, + "epoch": 0.35677138133173003, + "flos": 33000577557120.0, + "grad_norm": 1.4946029259135472, + "language_loss": 0.69271672, + "learning_rate": 2.980600479213388e-06, + "loss": 0.71436697, + "num_input_tokens_seen": 127544495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 5934, + "time_per_iteration": 2.5435454845428467 + }, + { + "auxiliary_loss_clip": 0.01131019, + "auxiliary_loss_mlp": 0.0104198, + "balance_loss_clip": 1.02636409, + "balance_loss_mlp": 1.04726946, + "epoch": 0.356831504584398, + "flos": 20777842696320.0, + "grad_norm": 1.881720756405168, + "language_loss": 0.71268845, + "learning_rate": 2.9802610238933384e-06, + "loss": 0.73441839, + "num_input_tokens_seen": 127563810, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8359375, + "step": 5935, + "time_per_iteration": 2.460548162460327 + }, + { + "auxiliary_loss_clip": 0.01124043, + "auxiliary_loss_mlp": 0.01039228, + "balance_loss_clip": 1.02476776, + "balance_loss_mlp": 1.04411018, + "epoch": 0.35689162783706596, + "flos": 12165566365440.0, + "grad_norm": 2.474293421119334, + "language_loss": 0.78293073, + "learning_rate": 2.979921531401692e-06, + "loss": 0.8045634, + "num_input_tokens_seen": 127579065, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 5936, + "time_per_iteration": 2.4517645835876465 + }, + { + "auxiliary_loss_clip": 0.01121611, + "auxiliary_loss_mlp": 0.01039592, + "balance_loss_clip": 1.02472031, + "balance_loss_mlp": 1.04367638, + "epoch": 0.356951751089734, + "flos": 23841489507840.0, + "grad_norm": 1.4518862241402966, + "language_loss": 0.64218014, + "learning_rate": 2.9795820017513242e-06, + "loss": 0.66379213, + "num_input_tokens_seen": 127599105, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 5937, + "time_per_iteration": 2.5837321281433105 + }, + { + "auxiliary_loss_clip": 0.01124449, + "auxiliary_loss_mlp": 0.01038603, + "balance_loss_clip": 1.02395844, + "balance_loss_mlp": 1.04442978, + "epoch": 0.35701187434240195, + "flos": 11722176881280.0, + "grad_norm": 2.5143509931773553, + "language_loss": 0.77877963, + "learning_rate": 2.9792424349551073e-06, + "loss": 0.80041015, + "num_input_tokens_seen": 127614940, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 5938, + "time_per_iteration": 2.4190945625305176 + }, + { + "auxiliary_loss_clip": 0.0112532, + "auxiliary_loss_mlp": 0.01042565, + "balance_loss_clip": 1.02890944, + "balance_loss_mlp": 1.04582071, + "epoch": 0.3570719975950699, + "flos": 24898879100160.0, + "grad_norm": 1.8770011073758637, + "language_loss": 0.80256367, + "learning_rate": 2.9789028310259202e-06, + "loss": 0.82424247, + "num_input_tokens_seen": 127634960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.796875, + "step": 5939, + "time_per_iteration": 2.5029094219207764 + }, + { + "auxiliary_loss_clip": 0.01126611, + "auxiliary_loss_mlp": 0.01035861, + "balance_loss_clip": 1.0213412, + "balance_loss_mlp": 1.04299128, + "epoch": 0.3571321208477379, + "flos": 25994836920960.0, + "grad_norm": 1.6875415435298406, + "language_loss": 0.79203522, + "learning_rate": 2.9785631899766395e-06, + "loss": 0.81365997, + "num_input_tokens_seen": 127654545, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8359375, + "step": 5940, + "time_per_iteration": 2.526545524597168 + }, + { + "auxiliary_loss_clip": 0.01124522, + "auxiliary_loss_mlp": 0.01031852, + "balance_loss_clip": 1.01704049, + "balance_loss_mlp": 1.0441246, + "epoch": 0.35719224410040584, + "flos": 14501663199360.0, + "grad_norm": 2.480743427796476, + "language_loss": 0.72739166, + "learning_rate": 2.9782235118201443e-06, + "loss": 0.74895537, + "num_input_tokens_seen": 127672320, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 5941, + "time_per_iteration": 2.4599413871765137 + }, + { + "auxiliary_loss_clip": 0.01123947, + "auxiliary_loss_mlp": 0.01040367, + "balance_loss_clip": 1.02546012, + "balance_loss_mlp": 1.04480743, + "epoch": 0.3572523673530738, + "flos": 31175453646720.0, + "grad_norm": 1.979069530543237, + "language_loss": 0.64202702, + "learning_rate": 2.9778837965693154e-06, + "loss": 0.66367018, + "num_input_tokens_seen": 127693315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 5942, + "time_per_iteration": 2.5174636840820312 + }, + { + "auxiliary_loss_clip": 0.01123104, + "auxiliary_loss_mlp": 0.01036752, + "balance_loss_clip": 1.02194643, + "balance_loss_mlp": 1.04385567, + "epoch": 0.3573124906057418, + "flos": 15851976203520.0, + "grad_norm": 2.2469009256176053, + "language_loss": 0.74055374, + "learning_rate": 2.9775440442370354e-06, + "loss": 0.76215225, + "num_input_tokens_seen": 127711570, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 5943, + "time_per_iteration": 2.5392913818359375 + }, + { + "auxiliary_loss_clip": 0.01039679, + "auxiliary_loss_mlp": 0.01008275, + "balance_loss_clip": 1.00640345, + "balance_loss_mlp": 1.01455188, + "epoch": 0.35737261385840974, + "flos": 60822729118080.0, + "grad_norm": 0.7872915284740177, + "language_loss": 0.60689372, + "learning_rate": 2.9772042548361867e-06, + "loss": 0.62737316, + "num_input_tokens_seen": 127772475, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25, + "step": 5944, + "time_per_iteration": 3.17051100730896 + }, + { + "auxiliary_loss_clip": 0.01121351, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02003157, + "balance_loss_mlp": 1.04313469, + "epoch": 0.3574327371110777, + "flos": 18843765857280.0, + "grad_norm": 2.033108996495456, + "language_loss": 0.72646821, + "learning_rate": 2.976864428379655e-06, + "loss": 0.7480244, + "num_input_tokens_seen": 127790940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 5945, + "time_per_iteration": 2.444373846054077 + }, + { + "auxiliary_loss_clip": 0.01121962, + "auxiliary_loss_mlp": 0.01039125, + "balance_loss_clip": 1.02446246, + "balance_loss_mlp": 1.04313612, + "epoch": 0.35749286036374567, + "flos": 23549679417600.0, + "grad_norm": 1.7423109631574678, + "language_loss": 0.81255424, + "learning_rate": 2.976524564880326e-06, + "loss": 0.8341651, + "num_input_tokens_seen": 127808275, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 5946, + "time_per_iteration": 2.470513343811035 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.01042743, + "balance_loss_clip": 1.02808666, + "balance_loss_mlp": 1.04524601, + "epoch": 0.35755298361641363, + "flos": 21105491581440.0, + "grad_norm": 1.9099881709146462, + "language_loss": 0.68893784, + "learning_rate": 2.9761846643510882e-06, + "loss": 0.71061212, + "num_input_tokens_seen": 127828840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5947, + "time_per_iteration": 2.4653477668762207 + }, + { + "auxiliary_loss_clip": 0.01120435, + "auxiliary_loss_mlp": 0.01039661, + "balance_loss_clip": 1.02568388, + "balance_loss_mlp": 1.04441905, + "epoch": 0.3576131068690816, + "flos": 19245031666560.0, + "grad_norm": 1.655085874443405, + "language_loss": 0.75428057, + "learning_rate": 2.9758447268048297e-06, + "loss": 0.77588153, + "num_input_tokens_seen": 127846240, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 5948, + "time_per_iteration": 2.4385483264923096 + }, + { + "auxiliary_loss_clip": 0.01119692, + "auxiliary_loss_mlp": 0.0104111, + "balance_loss_clip": 1.02650094, + "balance_loss_mlp": 1.04049134, + "epoch": 0.35767323012174956, + "flos": 28654703971200.0, + "grad_norm": 2.354345427402619, + "language_loss": 0.70556438, + "learning_rate": 2.9755047522544415e-06, + "loss": 0.72717237, + "num_input_tokens_seen": 127866880, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5949, + "time_per_iteration": 2.4992663860321045 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02567744, + "balance_loss_mlp": 1.04348552, + "epoch": 0.35773335337441753, + "flos": 17085363459840.0, + "grad_norm": 1.8941983472442732, + "language_loss": 0.77248389, + "learning_rate": 2.9751647407128154e-06, + "loss": 0.79408723, + "num_input_tokens_seen": 127883560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 5950, + "time_per_iteration": 2.4295101165771484 + }, + { + "auxiliary_loss_clip": 0.0112255, + "auxiliary_loss_mlp": 0.01038813, + "balance_loss_clip": 1.02394795, + "balance_loss_mlp": 1.04274225, + "epoch": 0.35779347662708555, + "flos": 15888605097600.0, + "grad_norm": 1.5707876816938207, + "language_loss": 0.72766685, + "learning_rate": 2.9748246921928445e-06, + "loss": 0.74928057, + "num_input_tokens_seen": 127902330, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 5951, + "time_per_iteration": 2.444349765777588 + }, + { + "auxiliary_loss_clip": 0.0112562, + "auxiliary_loss_mlp": 0.01039318, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.04390478, + "epoch": 0.3578535998797535, + "flos": 28658834035200.0, + "grad_norm": 1.9955959935597258, + "language_loss": 0.69730532, + "learning_rate": 2.9744846067074236e-06, + "loss": 0.71895468, + "num_input_tokens_seen": 127922325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.81640625, + "step": 5952, + "time_per_iteration": 2.49656081199646 + }, + { + "auxiliary_loss_clip": 0.01120518, + "auxiliary_loss_mlp": 0.010387, + "balance_loss_clip": 1.02497923, + "balance_loss_mlp": 1.04271066, + "epoch": 0.3579137231324215, + "flos": 37852432076160.0, + "grad_norm": 2.0583657570083416, + "language_loss": 0.69432503, + "learning_rate": 2.974144484269449e-06, + "loss": 0.71591723, + "num_input_tokens_seen": 127942635, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 5953, + "time_per_iteration": 2.6221721172332764 + }, + { + "auxiliary_loss_clip": 0.0112099, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01641417, + "balance_loss_mlp": 1.04322994, + "epoch": 0.35797384638508944, + "flos": 22346851656960.0, + "grad_norm": 1.5429391611916807, + "language_loss": 0.66673422, + "learning_rate": 2.9738043248918175e-06, + "loss": 0.68824828, + "num_input_tokens_seen": 127962520, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 5954, + "time_per_iteration": 2.465116262435913 + }, + { + "auxiliary_loss_clip": 0.01123263, + "auxiliary_loss_mlp": 0.01037735, + "balance_loss_clip": 1.02383566, + "balance_loss_mlp": 1.04475307, + "epoch": 0.3580339696377574, + "flos": 13589711775360.0, + "grad_norm": 1.7040470297828096, + "language_loss": 0.74838006, + "learning_rate": 2.9734641285874282e-06, + "loss": 0.76998997, + "num_input_tokens_seen": 127981180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 5955, + "time_per_iteration": 2.4968783855438232 + }, + { + "auxiliary_loss_clip": 0.01117597, + "auxiliary_loss_mlp": 0.01035772, + "balance_loss_clip": 1.0219382, + "balance_loss_mlp": 1.04289603, + "epoch": 0.3580940928904254, + "flos": 23768231719680.0, + "grad_norm": 1.6820855707774873, + "language_loss": 0.76043999, + "learning_rate": 2.973123895369182e-06, + "loss": 0.78197372, + "num_input_tokens_seen": 127999725, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 5956, + "time_per_iteration": 2.498699903488159 + }, + { + "auxiliary_loss_clip": 0.01117465, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.0214982, + "balance_loss_mlp": 1.04263568, + "epoch": 0.35815421614309334, + "flos": 19463871277440.0, + "grad_norm": 1.7390523407913014, + "language_loss": 0.73059452, + "learning_rate": 2.9727836252499805e-06, + "loss": 0.75211895, + "num_input_tokens_seen": 128018885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 5957, + "time_per_iteration": 2.4503817558288574 + }, + { + "auxiliary_loss_clip": 0.0112235, + "auxiliary_loss_mlp": 0.01035332, + "balance_loss_clip": 1.02197433, + "balance_loss_mlp": 1.04503369, + "epoch": 0.3582143393957613, + "flos": 23368186972800.0, + "grad_norm": 2.990259024529503, + "language_loss": 0.70640051, + "learning_rate": 2.972443318242726e-06, + "loss": 0.7279774, + "num_input_tokens_seen": 128037875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 5958, + "time_per_iteration": 2.4611945152282715 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01029572, + "balance_loss_clip": 1.0165484, + "balance_loss_mlp": 1.0413444, + "epoch": 0.35827446264842927, + "flos": 26323275905280.0, + "grad_norm": 1.7206269565580243, + "language_loss": 0.88610697, + "learning_rate": 2.972102974360324e-06, + "loss": 0.90757084, + "num_input_tokens_seen": 128056045, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 5959, + "time_per_iteration": 2.5129401683807373 + }, + { + "auxiliary_loss_clip": 0.01121057, + "auxiliary_loss_mlp": 0.01036795, + "balance_loss_clip": 1.02281785, + "balance_loss_mlp": 1.04400599, + "epoch": 0.35833458590109724, + "flos": 30446610779520.0, + "grad_norm": 1.483187088646708, + "language_loss": 0.58103061, + "learning_rate": 2.971762593615679e-06, + "loss": 0.6026091, + "num_input_tokens_seen": 128077815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 5960, + "time_per_iteration": 2.5110409259796143 + }, + { + "auxiliary_loss_clip": 0.01120594, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02201176, + "balance_loss_mlp": 1.04267251, + "epoch": 0.3583947091537652, + "flos": 14829886702080.0, + "grad_norm": 1.9323395592862886, + "language_loss": 0.76102602, + "learning_rate": 2.9714221760216993e-06, + "loss": 0.78260595, + "num_input_tokens_seen": 128095460, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 5961, + "time_per_iteration": 2.46943736076355 + }, + { + "auxiliary_loss_clip": 0.01121367, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.01862621, + "balance_loss_mlp": 1.04458857, + "epoch": 0.35845483240643317, + "flos": 34240644743040.0, + "grad_norm": 1.8327349140058107, + "language_loss": 0.69974017, + "learning_rate": 2.971081721591294e-06, + "loss": 0.72128505, + "num_input_tokens_seen": 128118605, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 5962, + "time_per_iteration": 2.5654361248016357 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0210433, + "balance_loss_mlp": 1.04321802, + "epoch": 0.35851495565910113, + "flos": 20960089326720.0, + "grad_norm": 1.5613001239774846, + "language_loss": 0.74749398, + "learning_rate": 2.9707412303373716e-06, + "loss": 0.76901346, + "num_input_tokens_seen": 128139205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.75390625, + "step": 5963, + "time_per_iteration": 2.5135319232940674 + }, + { + "auxiliary_loss_clip": 0.01122172, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02149796, + "balance_loss_mlp": 1.04597044, + "epoch": 0.35857507891176915, + "flos": 22309863626880.0, + "grad_norm": 1.5825069258384938, + "language_loss": 0.78811383, + "learning_rate": 2.9704007022728447e-06, + "loss": 0.80968547, + "num_input_tokens_seen": 128158765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 5964, + "time_per_iteration": 2.493169069290161 + }, + { + "auxiliary_loss_clip": 0.01124119, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.01870322, + "balance_loss_mlp": 1.04482806, + "epoch": 0.3586352021644371, + "flos": 23367863750400.0, + "grad_norm": 1.8296471859577264, + "language_loss": 0.66694742, + "learning_rate": 2.970060137410626e-06, + "loss": 0.6885215, + "num_input_tokens_seen": 128177850, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 5965, + "time_per_iteration": 2.4995884895324707 + }, + { + "auxiliary_loss_clip": 0.01120182, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.04270399, + "epoch": 0.3586953254171051, + "flos": 27849227437440.0, + "grad_norm": 4.210402322068537, + "language_loss": 0.79008359, + "learning_rate": 2.9697195357636294e-06, + "loss": 0.81161171, + "num_input_tokens_seen": 128196925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 5966, + "time_per_iteration": 2.485438346862793 + }, + { + "auxiliary_loss_clip": 0.01121545, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02238536, + "balance_loss_mlp": 1.04341781, + "epoch": 0.35875544866977305, + "flos": 19500500171520.0, + "grad_norm": 5.107721360348662, + "language_loss": 0.90911728, + "learning_rate": 2.9693788973447715e-06, + "loss": 0.93070352, + "num_input_tokens_seen": 128213955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 5967, + "time_per_iteration": 2.547287702560425 + }, + { + "auxiliary_loss_clip": 0.01125829, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02648592, + "balance_loss_mlp": 1.04528475, + "epoch": 0.358815571922441, + "flos": 21471134077440.0, + "grad_norm": 1.7620117516801617, + "language_loss": 0.79739827, + "learning_rate": 2.9690382221669682e-06, + "loss": 0.81907177, + "num_input_tokens_seen": 128232980, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 5968, + "time_per_iteration": 2.4543471336364746 + }, + { + "auxiliary_loss_clip": 0.01127455, + "auxiliary_loss_mlp": 0.01052904, + "balance_loss_clip": 1.0384376, + "balance_loss_mlp": 1.04604244, + "epoch": 0.358875695175109, + "flos": 21835411856640.0, + "grad_norm": 2.0044885906540424, + "language_loss": 0.83642054, + "learning_rate": 2.9686975102431384e-06, + "loss": 0.85822409, + "num_input_tokens_seen": 128252795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8125, + "step": 5969, + "time_per_iteration": 2.502815008163452 + }, + { + "auxiliary_loss_clip": 0.0111906, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01893663, + "balance_loss_mlp": 1.04245603, + "epoch": 0.35893581842777694, + "flos": 32011633330560.0, + "grad_norm": 1.876228198696561, + "language_loss": 0.72377515, + "learning_rate": 2.968356761586202e-06, + "loss": 0.74528718, + "num_input_tokens_seen": 128273115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5970, + "time_per_iteration": 4.051819086074829 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01035066, + "balance_loss_clip": 1.02178049, + "balance_loss_mlp": 1.0424037, + "epoch": 0.3589959416804449, + "flos": 20485817124480.0, + "grad_norm": 1.6844020581036279, + "language_loss": 0.79522693, + "learning_rate": 2.9680159762090805e-06, + "loss": 0.81676805, + "num_input_tokens_seen": 128292220, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 5971, + "time_per_iteration": 3.8910434246063232 + }, + { + "auxiliary_loss_clip": 0.01120261, + "auxiliary_loss_mlp": 0.01039002, + "balance_loss_clip": 1.02427924, + "balance_loss_mlp": 1.0402174, + "epoch": 0.3590560649331129, + "flos": 16180666583040.0, + "grad_norm": 1.924864359347905, + "language_loss": 0.78594625, + "learning_rate": 2.967675154124696e-06, + "loss": 0.80753887, + "num_input_tokens_seen": 128310305, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 5972, + "time_per_iteration": 2.4272611141204834 + }, + { + "auxiliary_loss_clip": 0.01120688, + "auxiliary_loss_mlp": 0.01037349, + "balance_loss_clip": 1.02378309, + "balance_loss_mlp": 1.04185021, + "epoch": 0.35911618818578084, + "flos": 20375391738240.0, + "grad_norm": 3.2741380987368327, + "language_loss": 0.81252539, + "learning_rate": 2.9673342953459722e-06, + "loss": 0.83410573, + "num_input_tokens_seen": 128328305, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7890625, + "step": 5973, + "time_per_iteration": 2.469438314437866 + }, + { + "auxiliary_loss_clip": 0.0103695, + "auxiliary_loss_mlp": 0.01001955, + "balance_loss_clip": 0.9999882, + "balance_loss_mlp": 1.01160312, + "epoch": 0.3591763114384488, + "flos": 41236691685120.0, + "grad_norm": 0.9181567019376142, + "language_loss": 0.56828684, + "learning_rate": 2.9669933998858355e-06, + "loss": 0.58867586, + "num_input_tokens_seen": 128378380, + "router_z_loss_clip": 0.01965332, + "router_z_loss_mlp": 0.25390625, + "step": 5974, + "time_per_iteration": 2.918166399002075 + }, + { + "auxiliary_loss_clip": 0.01122634, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02548242, + "balance_loss_mlp": 1.04407859, + "epoch": 0.35923643469111677, + "flos": 18695454600960.0, + "grad_norm": 1.6252506462115286, + "language_loss": 0.68750453, + "learning_rate": 2.9666524677572114e-06, + "loss": 0.7091189, + "num_input_tokens_seen": 128394315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78515625, + "step": 5975, + "time_per_iteration": 2.4578702449798584 + }, + { + "auxiliary_loss_clip": 0.01119888, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.02132642, + "balance_loss_mlp": 1.04269934, + "epoch": 0.35929655794378473, + "flos": 25009950931200.0, + "grad_norm": 1.7542310571392548, + "language_loss": 0.79961413, + "learning_rate": 2.96631149897303e-06, + "loss": 0.82115752, + "num_input_tokens_seen": 128414515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 5976, + "time_per_iteration": 2.494723081588745 + }, + { + "auxiliary_loss_clip": 0.01119534, + "auxiliary_loss_mlp": 0.0104186, + "balance_loss_clip": 1.02761412, + "balance_loss_mlp": 1.04172039, + "epoch": 0.35935668119645275, + "flos": 14975576265600.0, + "grad_norm": 1.7409485188517788, + "language_loss": 0.79081398, + "learning_rate": 2.9659704935462194e-06, + "loss": 0.81242788, + "num_input_tokens_seen": 128430615, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 5977, + "time_per_iteration": 2.4949100017547607 + }, + { + "auxiliary_loss_clip": 0.01116029, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02151847, + "balance_loss_mlp": 1.04029524, + "epoch": 0.3594168044491207, + "flos": 21178138838400.0, + "grad_norm": 1.7920092294573908, + "language_loss": 0.80654621, + "learning_rate": 2.9656294514897102e-06, + "loss": 0.82805401, + "num_input_tokens_seen": 128449480, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 5978, + "time_per_iteration": 2.445866584777832 + }, + { + "auxiliary_loss_clip": 0.01122409, + "auxiliary_loss_mlp": 0.01034873, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.04394007, + "epoch": 0.3594769277017887, + "flos": 27672152365440.0, + "grad_norm": 1.5382295990908517, + "language_loss": 0.67741489, + "learning_rate": 2.965288372816436e-06, + "loss": 0.69898772, + "num_input_tokens_seen": 128471465, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 5979, + "time_per_iteration": 2.538585662841797 + }, + { + "auxiliary_loss_clip": 0.01119324, + "auxiliary_loss_mlp": 0.01038492, + "balance_loss_clip": 1.02478838, + "balance_loss_mlp": 1.04136634, + "epoch": 0.35953705095445665, + "flos": 23002328995200.0, + "grad_norm": 2.3207911240165697, + "language_loss": 0.67176729, + "learning_rate": 2.9649472575393296e-06, + "loss": 0.69334549, + "num_input_tokens_seen": 128490645, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 5980, + "time_per_iteration": 2.4896938800811768 + }, + { + "auxiliary_loss_clip": 0.01123377, + "auxiliary_loss_mlp": 0.01039239, + "balance_loss_clip": 1.02377748, + "balance_loss_mlp": 1.0416832, + "epoch": 0.3595971742071246, + "flos": 25513992529920.0, + "grad_norm": 1.8107777091561479, + "language_loss": 0.71148199, + "learning_rate": 2.964606105671327e-06, + "loss": 0.73310816, + "num_input_tokens_seen": 128510225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.81640625, + "step": 5981, + "time_per_iteration": 2.49064302444458 + }, + { + "auxiliary_loss_clip": 0.01121979, + "auxiliary_loss_mlp": 0.01038955, + "balance_loss_clip": 1.02387476, + "balance_loss_mlp": 1.0432086, + "epoch": 0.3596572974597926, + "flos": 29862559635840.0, + "grad_norm": 1.7933500913622242, + "language_loss": 0.71331298, + "learning_rate": 2.9642649172253635e-06, + "loss": 0.73492229, + "num_input_tokens_seen": 128530195, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 5982, + "time_per_iteration": 2.5167934894561768 + }, + { + "auxiliary_loss_clip": 0.01117371, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02361536, + "balance_loss_mlp": 1.0427959, + "epoch": 0.35971742071246054, + "flos": 23112538899840.0, + "grad_norm": 1.6761533335073455, + "language_loss": 0.75808942, + "learning_rate": 2.9639236922143786e-06, + "loss": 0.77962971, + "num_input_tokens_seen": 128549990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 5983, + "time_per_iteration": 2.4915101528167725 + }, + { + "auxiliary_loss_clip": 0.01126703, + "auxiliary_loss_mlp": 0.01043227, + "balance_loss_clip": 1.02771819, + "balance_loss_mlp": 1.04474413, + "epoch": 0.3597775439651285, + "flos": 16725359399040.0, + "grad_norm": 2.1804669018597043, + "language_loss": 0.76302433, + "learning_rate": 2.96358243065131e-06, + "loss": 0.78472364, + "num_input_tokens_seen": 128567925, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8203125, + "step": 5984, + "time_per_iteration": 2.436640501022339 + }, + { + "auxiliary_loss_clip": 0.01118377, + "auxiliary_loss_mlp": 0.01037581, + "balance_loss_clip": 1.02356207, + "balance_loss_mlp": 1.0420785, + "epoch": 0.3598376672177965, + "flos": 19719483436800.0, + "grad_norm": 1.837904559260202, + "language_loss": 0.86617446, + "learning_rate": 2.9632411325490993e-06, + "loss": 0.88773406, + "num_input_tokens_seen": 128585655, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 5985, + "time_per_iteration": 2.476853609085083 + }, + { + "auxiliary_loss_clip": 0.0111809, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02130079, + "balance_loss_mlp": 1.04078126, + "epoch": 0.35989779047046444, + "flos": 17311529445120.0, + "grad_norm": 1.416236209566339, + "language_loss": 0.72801065, + "learning_rate": 2.9628997979206884e-06, + "loss": 0.74955392, + "num_input_tokens_seen": 128604820, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 5986, + "time_per_iteration": 2.443871021270752 + }, + { + "auxiliary_loss_clip": 0.01124328, + "auxiliary_loss_mlp": 0.0103792, + "balance_loss_clip": 1.02354908, + "balance_loss_mlp": 1.04230642, + "epoch": 0.3599579137231324, + "flos": 22711237176960.0, + "grad_norm": 1.880079313238184, + "language_loss": 0.73711401, + "learning_rate": 2.9625584267790204e-06, + "loss": 0.75873649, + "num_input_tokens_seen": 128623070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 5987, + "time_per_iteration": 2.517045736312866 + }, + { + "auxiliary_loss_clip": 0.01121357, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.04161966, + "epoch": 0.36001803697580037, + "flos": 20959873845120.0, + "grad_norm": 1.8583263097896845, + "language_loss": 0.69824201, + "learning_rate": 2.9622170191370404e-06, + "loss": 0.71982217, + "num_input_tokens_seen": 128642430, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 5988, + "time_per_iteration": 2.484654426574707 + }, + { + "auxiliary_loss_clip": 0.01125207, + "auxiliary_loss_mlp": 0.01041231, + "balance_loss_clip": 1.02675915, + "balance_loss_mlp": 1.04297233, + "epoch": 0.36007816022846834, + "flos": 20485565729280.0, + "grad_norm": 1.851186734533378, + "language_loss": 0.72918314, + "learning_rate": 2.9618755750076953e-06, + "loss": 0.75084746, + "num_input_tokens_seen": 128661285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8203125, + "step": 5989, + "time_per_iteration": 2.464378833770752 + }, + { + "auxiliary_loss_clip": 0.01120868, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.0194943, + "balance_loss_mlp": 1.04283333, + "epoch": 0.36013828348113636, + "flos": 28001237794560.0, + "grad_norm": 1.8425061302669492, + "language_loss": 0.79664916, + "learning_rate": 2.961534094403931e-06, + "loss": 0.81818593, + "num_input_tokens_seen": 128682210, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.78125, + "step": 5990, + "time_per_iteration": 2.4947755336761475 + }, + { + "auxiliary_loss_clip": 0.01121243, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01472998, + "balance_loss_mlp": 1.04281235, + "epoch": 0.3601984067338043, + "flos": 20082181017600.0, + "grad_norm": 1.9352260247419832, + "language_loss": 0.84225297, + "learning_rate": 2.961192577338698e-06, + "loss": 0.86375415, + "num_input_tokens_seen": 128700445, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 5991, + "time_per_iteration": 2.4728991985321045 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01039266, + "balance_loss_clip": 1.02490079, + "balance_loss_mlp": 1.04197788, + "epoch": 0.3602585299864723, + "flos": 18617599872000.0, + "grad_norm": 1.9640325518662143, + "language_loss": 0.75616056, + "learning_rate": 2.9608510238249463e-06, + "loss": 0.77778924, + "num_input_tokens_seen": 128716855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.81640625, + "step": 5992, + "time_per_iteration": 2.4422738552093506 + }, + { + "auxiliary_loss_clip": 0.01119253, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02022302, + "balance_loss_mlp": 1.04177451, + "epoch": 0.36031865323914025, + "flos": 19573003774080.0, + "grad_norm": 6.32582004359923, + "language_loss": 0.77500135, + "learning_rate": 2.960509433875627e-06, + "loss": 0.79654288, + "num_input_tokens_seen": 128735835, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 5993, + "time_per_iteration": 2.4513776302337646 + }, + { + "auxiliary_loss_clip": 0.01124951, + "auxiliary_loss_mlp": 0.01036544, + "balance_loss_clip": 1.02281737, + "balance_loss_mlp": 1.04405534, + "epoch": 0.3603787764918082, + "flos": 17490615678720.0, + "grad_norm": 1.9096274983436938, + "language_loss": 0.74686468, + "learning_rate": 2.9601678075036943e-06, + "loss": 0.7684797, + "num_input_tokens_seen": 128752465, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80859375, + "step": 5994, + "time_per_iteration": 2.4278860092163086 + }, + { + "auxiliary_loss_clip": 0.0112434, + "auxiliary_loss_mlp": 0.01038632, + "balance_loss_clip": 1.02506554, + "balance_loss_mlp": 1.04320991, + "epoch": 0.3604388997444762, + "flos": 15523393564800.0, + "grad_norm": 1.8397117218597796, + "language_loss": 0.68890274, + "learning_rate": 2.9598261447221024e-06, + "loss": 0.71053243, + "num_input_tokens_seen": 128770865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.8125, + "step": 5995, + "time_per_iteration": 2.462557554244995 + }, + { + "auxiliary_loss_clip": 0.01124519, + "auxiliary_loss_mlp": 0.01040187, + "balance_loss_clip": 1.02548289, + "balance_loss_mlp": 1.04238582, + "epoch": 0.36049902299714415, + "flos": 17310883000320.0, + "grad_norm": 1.7352965040741237, + "language_loss": 0.82057822, + "learning_rate": 2.9594844455438057e-06, + "loss": 0.84222531, + "num_input_tokens_seen": 128789730, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8203125, + "step": 5996, + "time_per_iteration": 2.4284703731536865 + }, + { + "auxiliary_loss_clip": 0.01119849, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.04242694, + "epoch": 0.3605591462498121, + "flos": 17056025026560.0, + "grad_norm": 1.56212250683249, + "language_loss": 0.73570979, + "learning_rate": 2.959142709981763e-06, + "loss": 0.75725353, + "num_input_tokens_seen": 128806610, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 5997, + "time_per_iteration": 2.4418485164642334 + }, + { + "auxiliary_loss_clip": 0.01120213, + "auxiliary_loss_mlp": 0.01036336, + "balance_loss_clip": 1.02272177, + "balance_loss_mlp": 1.04307055, + "epoch": 0.3606192695024801, + "flos": 16836862193280.0, + "grad_norm": 2.1655767572067637, + "language_loss": 0.68651283, + "learning_rate": 2.9588009380489337e-06, + "loss": 0.70807832, + "num_input_tokens_seen": 128824830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 5998, + "time_per_iteration": 2.435884475708008 + }, + { + "auxiliary_loss_clip": 0.01124048, + "auxiliary_loss_mlp": 0.01034007, + "balance_loss_clip": 1.01983321, + "balance_loss_mlp": 1.04494119, + "epoch": 0.36067939275514804, + "flos": 12129655743360.0, + "grad_norm": 2.6750874406601914, + "language_loss": 0.77190387, + "learning_rate": 2.9584591297582758e-06, + "loss": 0.79348445, + "num_input_tokens_seen": 128838170, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 5999, + "time_per_iteration": 2.415649175643921 + }, + { + "auxiliary_loss_clip": 0.01123679, + "auxiliary_loss_mlp": 0.01037913, + "balance_loss_clip": 1.02381015, + "balance_loss_mlp": 1.04481769, + "epoch": 0.360739516007816, + "flos": 18041449720320.0, + "grad_norm": 2.719833162653021, + "language_loss": 0.78307509, + "learning_rate": 2.9581172851227516e-06, + "loss": 0.80469108, + "num_input_tokens_seen": 128855625, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 6000, + "time_per_iteration": 2.450085401535034 + }, + { + "auxiliary_loss_clip": 0.01122067, + "auxiliary_loss_mlp": 0.01034294, + "balance_loss_clip": 1.02061474, + "balance_loss_mlp": 1.04283905, + "epoch": 0.360799639260484, + "flos": 18549800951040.0, + "grad_norm": 1.6917067376727954, + "language_loss": 0.78621352, + "learning_rate": 2.9577754041553243e-06, + "loss": 0.80777717, + "num_input_tokens_seen": 128873540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6001, + "time_per_iteration": 2.4247405529022217 + }, + { + "auxiliary_loss_clip": 0.01119251, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.01761651, + "balance_loss_mlp": 1.04341698, + "epoch": 0.36085976251315194, + "flos": 19682028529920.0, + "grad_norm": 1.9017223481518102, + "language_loss": 0.83743405, + "learning_rate": 2.9574334868689575e-06, + "loss": 0.85893983, + "num_input_tokens_seen": 128889925, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6002, + "time_per_iteration": 2.4587790966033936 + }, + { + "auxiliary_loss_clip": 0.01117677, + "auxiliary_loss_mlp": 0.01030372, + "balance_loss_clip": 1.01753855, + "balance_loss_mlp": 1.04298413, + "epoch": 0.3609198857658199, + "flos": 24198943703040.0, + "grad_norm": 2.101850625944426, + "language_loss": 0.90627617, + "learning_rate": 2.9570915332766165e-06, + "loss": 0.92775667, + "num_input_tokens_seen": 128906890, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6003, + "time_per_iteration": 2.450408697128296 + }, + { + "auxiliary_loss_clip": 0.01040628, + "auxiliary_loss_mlp": 0.01013073, + "balance_loss_clip": 1.01102221, + "balance_loss_mlp": 1.01496768, + "epoch": 0.3609800090184879, + "flos": 57115995160320.0, + "grad_norm": 0.8843653445723816, + "language_loss": 0.53374904, + "learning_rate": 2.9567495433912693e-06, + "loss": 0.55428606, + "num_input_tokens_seen": 128965940, + "router_z_loss_clip": 0.02050781, + "router_z_loss_mlp": 0.25585938, + "step": 6004, + "time_per_iteration": 3.005659341812134 + }, + { + "auxiliary_loss_clip": 0.01121195, + "auxiliary_loss_mlp": 0.01037049, + "balance_loss_clip": 1.02152824, + "balance_loss_mlp": 1.04164577, + "epoch": 0.3610401322711559, + "flos": 20811239366400.0, + "grad_norm": 1.7248099575523852, + "language_loss": 0.77609527, + "learning_rate": 2.956407517225883e-06, + "loss": 0.7976777, + "num_input_tokens_seen": 128985835, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.796875, + "step": 6005, + "time_per_iteration": 2.4916067123413086 + }, + { + "auxiliary_loss_clip": 0.01124405, + "auxiliary_loss_mlp": 0.0103607, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.04700613, + "epoch": 0.36110025552382385, + "flos": 13699167494400.0, + "grad_norm": 2.24467290311728, + "language_loss": 0.79267776, + "learning_rate": 2.956065454793429e-06, + "loss": 0.81428248, + "num_input_tokens_seen": 129003120, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 6006, + "time_per_iteration": 2.4366166591644287 + }, + { + "auxiliary_loss_clip": 0.01124848, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02309775, + "balance_loss_mlp": 1.04587984, + "epoch": 0.3611603787764918, + "flos": 22455014486400.0, + "grad_norm": 1.7888636143213261, + "language_loss": 0.84360719, + "learning_rate": 2.955723356106876e-06, + "loss": 0.86524487, + "num_input_tokens_seen": 129021645, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.7890625, + "step": 6007, + "time_per_iteration": 2.51680850982666 + }, + { + "auxiliary_loss_clip": 0.01129854, + "auxiliary_loss_mlp": 0.01037601, + "balance_loss_clip": 1.02166319, + "balance_loss_mlp": 1.04622328, + "epoch": 0.3612205020291598, + "flos": 20886651970560.0, + "grad_norm": 2.0771979180574425, + "language_loss": 0.72564125, + "learning_rate": 2.955381221179198e-06, + "loss": 0.74731576, + "num_input_tokens_seen": 129038375, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 6008, + "time_per_iteration": 2.4473018646240234 + }, + { + "auxiliary_loss_clip": 0.01120564, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02066362, + "balance_loss_mlp": 1.04255283, + "epoch": 0.36128062528182775, + "flos": 15741981780480.0, + "grad_norm": 1.9836274680059969, + "language_loss": 0.8284781, + "learning_rate": 2.955039050023368e-06, + "loss": 0.85002339, + "num_input_tokens_seen": 129056235, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 6009, + "time_per_iteration": 2.470031261444092 + }, + { + "auxiliary_loss_clip": 0.01125455, + "auxiliary_loss_mlp": 0.01043722, + "balance_loss_clip": 1.02945232, + "balance_loss_mlp": 1.04598057, + "epoch": 0.3613407485344957, + "flos": 16764502245120.0, + "grad_norm": 1.714442270200285, + "language_loss": 0.76139152, + "learning_rate": 2.954696842652362e-06, + "loss": 0.78308332, + "num_input_tokens_seen": 129072405, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6010, + "time_per_iteration": 2.446833848953247 + }, + { + "auxiliary_loss_clip": 0.01123758, + "auxiliary_loss_mlp": 0.01037408, + "balance_loss_clip": 1.0236752, + "balance_loss_mlp": 1.04619896, + "epoch": 0.3614008717871637, + "flos": 20371189847040.0, + "grad_norm": 1.905716478313633, + "language_loss": 0.82946253, + "learning_rate": 2.9543545990791554e-06, + "loss": 0.85107422, + "num_input_tokens_seen": 129090225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6011, + "time_per_iteration": 2.508147716522217 + }, + { + "auxiliary_loss_clip": 0.01132257, + "auxiliary_loss_mlp": 0.01041461, + "balance_loss_clip": 1.0264287, + "balance_loss_mlp": 1.0491302, + "epoch": 0.36146099503983165, + "flos": 22776665800320.0, + "grad_norm": 1.8484903271380355, + "language_loss": 0.62762833, + "learning_rate": 2.954012319316727e-06, + "loss": 0.64936543, + "num_input_tokens_seen": 129107685, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.828125, + "step": 6012, + "time_per_iteration": 5.36588454246521 + }, + { + "auxiliary_loss_clip": 0.01118968, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02112007, + "balance_loss_mlp": 1.04337454, + "epoch": 0.3615211182924996, + "flos": 22996654646400.0, + "grad_norm": 1.8689670235824563, + "language_loss": 0.84111822, + "learning_rate": 2.9536700033780565e-06, + "loss": 0.86265635, + "num_input_tokens_seen": 129125315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6013, + "time_per_iteration": 2.494051933288574 + }, + { + "auxiliary_loss_clip": 0.01124804, + "auxiliary_loss_mlp": 0.0104133, + "balance_loss_clip": 1.02690601, + "balance_loss_mlp": 1.04570448, + "epoch": 0.3615812415451676, + "flos": 16648079287680.0, + "grad_norm": 1.7351999387675028, + "language_loss": 0.91496456, + "learning_rate": 2.9533276512761228e-06, + "loss": 0.93662584, + "num_input_tokens_seen": 129141600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6014, + "time_per_iteration": 2.4356749057769775 + }, + { + "auxiliary_loss_clip": 0.01123597, + "auxiliary_loss_mlp": 0.01044534, + "balance_loss_clip": 1.03078914, + "balance_loss_mlp": 1.04549718, + "epoch": 0.36164136479783554, + "flos": 21320093387520.0, + "grad_norm": 1.727703603585928, + "language_loss": 0.73830914, + "learning_rate": 2.95298526302391e-06, + "loss": 0.75999045, + "num_input_tokens_seen": 129160665, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6015, + "time_per_iteration": 2.4990644454956055 + }, + { + "auxiliary_loss_clip": 0.01125644, + "auxiliary_loss_mlp": 0.01038195, + "balance_loss_clip": 1.02394915, + "balance_loss_mlp": 1.04633307, + "epoch": 0.3617014880505035, + "flos": 24169569356160.0, + "grad_norm": 1.7277224025907603, + "language_loss": 0.65316677, + "learning_rate": 2.9526428386344e-06, + "loss": 0.67480516, + "num_input_tokens_seen": 129179220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6016, + "time_per_iteration": 2.5260934829711914 + }, + { + "auxiliary_loss_clip": 0.01126131, + "auxiliary_loss_mlp": 0.01041518, + "balance_loss_clip": 1.02522171, + "balance_loss_mlp": 1.04727304, + "epoch": 0.3617616113031715, + "flos": 39014824101120.0, + "grad_norm": 1.744160138264151, + "language_loss": 0.72101283, + "learning_rate": 2.9523003781205785e-06, + "loss": 0.74268931, + "num_input_tokens_seen": 129200385, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6017, + "time_per_iteration": 2.638683795928955 + }, + { + "auxiliary_loss_clip": 0.01126121, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.02413559, + "balance_loss_mlp": 1.04454577, + "epoch": 0.3618217345558395, + "flos": 12130840892160.0, + "grad_norm": 1.9120538903838002, + "language_loss": 0.73590356, + "learning_rate": 2.9519578814954307e-06, + "loss": 0.75755334, + "num_input_tokens_seen": 129217395, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.81640625, + "step": 6018, + "time_per_iteration": 2.4477858543395996 + }, + { + "auxiliary_loss_clip": 0.01119909, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02013361, + "balance_loss_mlp": 1.04458487, + "epoch": 0.36188185780850746, + "flos": 24935005203840.0, + "grad_norm": 1.754547200149591, + "language_loss": 0.69080901, + "learning_rate": 2.9516153487719448e-06, + "loss": 0.71234632, + "num_input_tokens_seen": 129238940, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6019, + "time_per_iteration": 2.519831657409668 + }, + { + "auxiliary_loss_clip": 0.01124958, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.01980555, + "balance_loss_mlp": 1.0443728, + "epoch": 0.3619419810611754, + "flos": 20958832350720.0, + "grad_norm": 1.5467952079219929, + "language_loss": 0.76299942, + "learning_rate": 2.95127277996311e-06, + "loss": 0.78459549, + "num_input_tokens_seen": 129258240, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6020, + "time_per_iteration": 2.4692177772521973 + }, + { + "auxiliary_loss_clip": 0.01125932, + "auxiliary_loss_mlp": 0.01043324, + "balance_loss_clip": 1.02814841, + "balance_loss_mlp": 1.04721653, + "epoch": 0.3620021043138434, + "flos": 22528882805760.0, + "grad_norm": 1.938447153390643, + "language_loss": 0.73921824, + "learning_rate": 2.9509301750819156e-06, + "loss": 0.76091087, + "num_input_tokens_seen": 129279040, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6021, + "time_per_iteration": 2.5069808959960938 + }, + { + "auxiliary_loss_clip": 0.01123146, + "auxiliary_loss_mlp": 0.01034023, + "balance_loss_clip": 1.02059376, + "balance_loss_mlp": 1.04596186, + "epoch": 0.36206222756651135, + "flos": 15596687266560.0, + "grad_norm": 1.8648032073369731, + "language_loss": 0.80978441, + "learning_rate": 2.9505875341413533e-06, + "loss": 0.83135605, + "num_input_tokens_seen": 129295415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 6022, + "time_per_iteration": 2.4620115756988525 + }, + { + "auxiliary_loss_clip": 0.01122576, + "auxiliary_loss_mlp": 0.0103516, + "balance_loss_clip": 1.02212477, + "balance_loss_mlp": 1.04778302, + "epoch": 0.3621223508191793, + "flos": 23587170238080.0, + "grad_norm": 1.6799220656127192, + "language_loss": 0.81351119, + "learning_rate": 2.950244857154417e-06, + "loss": 0.83508855, + "num_input_tokens_seen": 129312620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6023, + "time_per_iteration": 2.4969308376312256 + }, + { + "auxiliary_loss_clip": 0.01124727, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.01975274, + "balance_loss_mlp": 1.04494548, + "epoch": 0.3621824740718473, + "flos": 22309899540480.0, + "grad_norm": 1.8793265875700644, + "language_loss": 0.79767907, + "learning_rate": 2.9499021441341e-06, + "loss": 0.81927156, + "num_input_tokens_seen": 129331825, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6024, + "time_per_iteration": 2.468369245529175 + }, + { + "auxiliary_loss_clip": 0.01119855, + "auxiliary_loss_mlp": 0.01029797, + "balance_loss_clip": 1.01629043, + "balance_loss_mlp": 1.04456711, + "epoch": 0.36224259732451525, + "flos": 16763640318720.0, + "grad_norm": 1.7897574616215441, + "language_loss": 0.74720407, + "learning_rate": 2.9495593950933997e-06, + "loss": 0.7687006, + "num_input_tokens_seen": 129350400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6025, + "time_per_iteration": 2.4410412311553955 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01849484, + "balance_loss_mlp": 1.04340899, + "epoch": 0.3623027205771832, + "flos": 23149742411520.0, + "grad_norm": 1.5522426900619628, + "language_loss": 0.72055018, + "learning_rate": 2.9492166100453107e-06, + "loss": 0.74207234, + "num_input_tokens_seen": 129371155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6026, + "time_per_iteration": 2.4997596740722656 + }, + { + "auxiliary_loss_clip": 0.01128673, + "auxiliary_loss_mlp": 0.01041263, + "balance_loss_clip": 1.02645707, + "balance_loss_mlp": 1.04604256, + "epoch": 0.3623628438298512, + "flos": 28549162834560.0, + "grad_norm": 2.401846993246305, + "language_loss": 0.79332775, + "learning_rate": 2.948873789002833e-06, + "loss": 0.81502712, + "num_input_tokens_seen": 129391230, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.82421875, + "step": 6027, + "time_per_iteration": 2.5326383113861084 + }, + { + "auxiliary_loss_clip": 0.0112338, + "auxiliary_loss_mlp": 0.01040685, + "balance_loss_clip": 1.02576041, + "balance_loss_mlp": 1.04399586, + "epoch": 0.36242296708251914, + "flos": 25484941405440.0, + "grad_norm": 1.7548337209278033, + "language_loss": 0.67809385, + "learning_rate": 2.9485309319789667e-06, + "loss": 0.69973445, + "num_input_tokens_seen": 129410065, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6028, + "time_per_iteration": 2.548088788986206 + }, + { + "auxiliary_loss_clip": 0.0112104, + "auxiliary_loss_mlp": 0.01032512, + "balance_loss_clip": 1.01922584, + "balance_loss_mlp": 1.04415894, + "epoch": 0.3624830903351871, + "flos": 16290373697280.0, + "grad_norm": 1.63067637662311, + "language_loss": 0.85700679, + "learning_rate": 2.9481880389867117e-06, + "loss": 0.8785423, + "num_input_tokens_seen": 129428655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76953125, + "step": 6029, + "time_per_iteration": 2.429720878601074 + }, + { + "auxiliary_loss_clip": 0.01120137, + "auxiliary_loss_mlp": 0.01038059, + "balance_loss_clip": 1.02412939, + "balance_loss_mlp": 1.04442835, + "epoch": 0.36254321358785513, + "flos": 18296307694080.0, + "grad_norm": 1.6511023563359555, + "language_loss": 0.72693753, + "learning_rate": 2.9478451100390714e-06, + "loss": 0.74851942, + "num_input_tokens_seen": 129447845, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6030, + "time_per_iteration": 2.4299302101135254 + }, + { + "auxiliary_loss_clip": 0.01123199, + "auxiliary_loss_mlp": 0.01041671, + "balance_loss_clip": 1.02529144, + "balance_loss_mlp": 1.04264557, + "epoch": 0.3626033368405231, + "flos": 14865294533760.0, + "grad_norm": 2.02536170930057, + "language_loss": 0.73986644, + "learning_rate": 2.94750214514905e-06, + "loss": 0.76151514, + "num_input_tokens_seen": 129463275, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8046875, + "step": 6031, + "time_per_iteration": 2.4376232624053955 + }, + { + "auxiliary_loss_clip": 0.01120355, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.0177424, + "balance_loss_mlp": 1.04309845, + "epoch": 0.36266346009319106, + "flos": 22306595489280.0, + "grad_norm": 1.8475328889194098, + "language_loss": 0.73286617, + "learning_rate": 2.9471591443296516e-06, + "loss": 0.75438625, + "num_input_tokens_seen": 129483205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6032, + "time_per_iteration": 2.4811155796051025 + }, + { + "auxiliary_loss_clip": 0.01121253, + "auxiliary_loss_mlp": 0.01038206, + "balance_loss_clip": 1.02412748, + "balance_loss_mlp": 1.0427382, + "epoch": 0.362723583345859, + "flos": 18222331633920.0, + "grad_norm": 1.684246043345259, + "language_loss": 0.77953577, + "learning_rate": 2.946816107593884e-06, + "loss": 0.80113035, + "num_input_tokens_seen": 129499885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 6033, + "time_per_iteration": 2.4283456802368164 + }, + { + "auxiliary_loss_clip": 0.01040416, + "auxiliary_loss_mlp": 0.01019079, + "balance_loss_clip": 1.01733828, + "balance_loss_mlp": 1.01487339, + "epoch": 0.362783706598527, + "flos": 68499174458880.0, + "grad_norm": 0.786107382559835, + "language_loss": 0.64822888, + "learning_rate": 2.9464730349547547e-06, + "loss": 0.66882384, + "num_input_tokens_seen": 129561885, + "router_z_loss_clip": 0.01745605, + "router_z_loss_mlp": 0.25585938, + "step": 6034, + "time_per_iteration": 3.1253511905670166 + }, + { + "auxiliary_loss_clip": 0.01118206, + "auxiliary_loss_mlp": 0.01035423, + "balance_loss_clip": 1.02139246, + "balance_loss_mlp": 1.04131126, + "epoch": 0.36284382985119495, + "flos": 26576589594240.0, + "grad_norm": 1.4985312456135769, + "language_loss": 0.90059769, + "learning_rate": 2.946129926425273e-06, + "loss": 0.92213392, + "num_input_tokens_seen": 129582325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6035, + "time_per_iteration": 2.4888923168182373 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01034945, + "balance_loss_clip": 1.02030611, + "balance_loss_mlp": 1.04239392, + "epoch": 0.3629039531038629, + "flos": 20156767608960.0, + "grad_norm": 1.7493433732375512, + "language_loss": 0.73526931, + "learning_rate": 2.9457867820184496e-06, + "loss": 0.7568388, + "num_input_tokens_seen": 129600350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6036, + "time_per_iteration": 2.445058822631836 + }, + { + "auxiliary_loss_clip": 0.01124436, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01500189, + "balance_loss_mlp": 1.04274487, + "epoch": 0.3629640763565309, + "flos": 18625716345600.0, + "grad_norm": 1.901551926176817, + "language_loss": 0.75938255, + "learning_rate": 2.945443601747297e-06, + "loss": 0.78091925, + "num_input_tokens_seen": 129618425, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.81640625, + "step": 6037, + "time_per_iteration": 2.422229766845703 + }, + { + "auxiliary_loss_clip": 0.0111661, + "auxiliary_loss_mlp": 0.0103799, + "balance_loss_clip": 1.0238812, + "balance_loss_mlp": 1.04227912, + "epoch": 0.36302419960919885, + "flos": 19571459489280.0, + "grad_norm": 1.6899683541385933, + "language_loss": 0.78120697, + "learning_rate": 2.945100385624828e-06, + "loss": 0.80275297, + "num_input_tokens_seen": 129636750, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6038, + "time_per_iteration": 2.4582855701446533 + }, + { + "auxiliary_loss_clip": 0.0103994, + "auxiliary_loss_mlp": 0.01006466, + "balance_loss_clip": 1.00467765, + "balance_loss_mlp": 1.01452303, + "epoch": 0.3630843228618668, + "flos": 63797606444160.0, + "grad_norm": 0.8286249809211084, + "language_loss": 0.63413143, + "learning_rate": 2.9447571336640573e-06, + "loss": 0.65459549, + "num_input_tokens_seen": 129699030, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.25390625, + "step": 6039, + "time_per_iteration": 3.1417860984802246 + }, + { + "auxiliary_loss_clip": 0.01120334, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04391789, + "epoch": 0.3631444461145348, + "flos": 21835160461440.0, + "grad_norm": 1.9215128015710738, + "language_loss": 0.70857447, + "learning_rate": 2.944413845878002e-06, + "loss": 0.73013067, + "num_input_tokens_seen": 129717135, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6040, + "time_per_iteration": 2.505627155303955 + }, + { + "auxiliary_loss_clip": 0.0112497, + "auxiliary_loss_mlp": 0.01032549, + "balance_loss_clip": 1.01827383, + "balance_loss_mlp": 1.04445744, + "epoch": 0.36320456936720275, + "flos": 21722041555200.0, + "grad_norm": 2.327350689124367, + "language_loss": 0.81322253, + "learning_rate": 2.9440705222796783e-06, + "loss": 0.83479762, + "num_input_tokens_seen": 129735940, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6041, + "time_per_iteration": 2.4475231170654297 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01030159, + "balance_loss_clip": 1.01526928, + "balance_loss_mlp": 1.04150891, + "epoch": 0.3632646926198707, + "flos": 17019072910080.0, + "grad_norm": 2.252727008735842, + "language_loss": 0.83721769, + "learning_rate": 2.943727162882107e-06, + "loss": 0.85872102, + "num_input_tokens_seen": 129752790, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6042, + "time_per_iteration": 2.461111545562744 + }, + { + "auxiliary_loss_clip": 0.01120803, + "auxiliary_loss_mlp": 0.01039778, + "balance_loss_clip": 1.02583623, + "balance_loss_mlp": 1.04390788, + "epoch": 0.36332481587253873, + "flos": 23331163029120.0, + "grad_norm": 1.6644116234057968, + "language_loss": 0.78122932, + "learning_rate": 2.9433837676983064e-06, + "loss": 0.80283511, + "num_input_tokens_seen": 129773655, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6043, + "time_per_iteration": 2.477030038833618 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01035057, + "balance_loss_clip": 1.02017403, + "balance_loss_mlp": 1.04266226, + "epoch": 0.3633849391252067, + "flos": 10743539857920.0, + "grad_norm": 3.8032713581650515, + "language_loss": 0.65792918, + "learning_rate": 2.943040336741298e-06, + "loss": 0.67945337, + "num_input_tokens_seen": 129791605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.74609375, + "step": 6044, + "time_per_iteration": 2.471221446990967 + }, + { + "auxiliary_loss_clip": 0.01118191, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01706135, + "balance_loss_mlp": 1.04186332, + "epoch": 0.36344506237787466, + "flos": 25849147357440.0, + "grad_norm": 1.74112377533005, + "language_loss": 0.80978471, + "learning_rate": 2.9426968700241066e-06, + "loss": 0.83127558, + "num_input_tokens_seen": 129811075, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6045, + "time_per_iteration": 2.482147693634033 + }, + { + "auxiliary_loss_clip": 0.01122131, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02388096, + "balance_loss_mlp": 1.04342091, + "epoch": 0.3635051856305426, + "flos": 30154046503680.0, + "grad_norm": 1.7414472049280392, + "language_loss": 0.64214617, + "learning_rate": 2.942353367559755e-06, + "loss": 0.66375309, + "num_input_tokens_seen": 129833755, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6046, + "time_per_iteration": 2.593209743499756 + }, + { + "auxiliary_loss_clip": 0.01119542, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02142787, + "balance_loss_mlp": 1.04214859, + "epoch": 0.3635653088832106, + "flos": 22198396746240.0, + "grad_norm": 1.623453692259123, + "language_loss": 0.77366132, + "learning_rate": 2.9420098293612692e-06, + "loss": 0.7952106, + "num_input_tokens_seen": 129854475, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6047, + "time_per_iteration": 2.4650797843933105 + }, + { + "auxiliary_loss_clip": 0.01125471, + "auxiliary_loss_mlp": 0.01041953, + "balance_loss_clip": 1.02609777, + "balance_loss_mlp": 1.04148006, + "epoch": 0.36362543213587856, + "flos": 24787053083520.0, + "grad_norm": 1.508802610673932, + "language_loss": 0.79679012, + "learning_rate": 2.9416662554416767e-06, + "loss": 0.81846434, + "num_input_tokens_seen": 129873530, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8359375, + "step": 6048, + "time_per_iteration": 2.5329999923706055 + }, + { + "auxiliary_loss_clip": 0.01037747, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99839348, + "balance_loss_mlp": 1.0124383, + "epoch": 0.3636855553885465, + "flos": 62526369231360.0, + "grad_norm": 0.7564639677567045, + "language_loss": 0.52584642, + "learning_rate": 2.9413226458140054e-06, + "loss": 0.54622656, + "num_input_tokens_seen": 129940400, + "router_z_loss_clip": 0.01867676, + "router_z_loss_mlp": 0.25390625, + "step": 6049, + "time_per_iteration": 3.1051762104034424 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.02172136, + "balance_loss_mlp": 1.04254675, + "epoch": 0.3637456786412145, + "flos": 24060652341120.0, + "grad_norm": 2.0453292842004833, + "language_loss": 0.86365628, + "learning_rate": 2.9409790004912845e-06, + "loss": 0.88522977, + "num_input_tokens_seen": 129958635, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6050, + "time_per_iteration": 2.469092845916748 + }, + { + "auxiliary_loss_clip": 0.01119484, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.0227803, + "balance_loss_mlp": 1.04309154, + "epoch": 0.36380580189388245, + "flos": 16691495852160.0, + "grad_norm": 1.7649295268136813, + "language_loss": 0.7855531, + "learning_rate": 2.940635319486546e-06, + "loss": 0.80711287, + "num_input_tokens_seen": 129977685, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6051, + "time_per_iteration": 2.425166368484497 + }, + { + "auxiliary_loss_clip": 0.0111821, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02044129, + "balance_loss_mlp": 1.04047346, + "epoch": 0.3638659251465504, + "flos": 25114091437440.0, + "grad_norm": 2.0280679706971423, + "language_loss": 0.83024764, + "learning_rate": 2.940291602812822e-06, + "loss": 0.8517735, + "num_input_tokens_seen": 129997530, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6052, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01114918, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02146947, + "balance_loss_mlp": 1.03992438, + "epoch": 0.3639260483992184, + "flos": 23003011353600.0, + "grad_norm": 3.055248278017369, + "language_loss": 0.72156489, + "learning_rate": 2.939947850483145e-06, + "loss": 0.74305683, + "num_input_tokens_seen": 130017955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6053, + "time_per_iteration": 4.030078887939453 + }, + { + "auxiliary_loss_clip": 0.01038499, + "auxiliary_loss_mlp": 0.01000787, + "balance_loss_clip": 0.99893934, + "balance_loss_mlp": 1.01315093, + "epoch": 0.36398617165188635, + "flos": 70716011160960.0, + "grad_norm": 0.7695228081579073, + "language_loss": 0.61234874, + "learning_rate": 2.9396040625105532e-06, + "loss": 0.63274157, + "num_input_tokens_seen": 130074275, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.25390625, + "step": 6054, + "time_per_iteration": 4.498634576797485 + }, + { + "auxiliary_loss_clip": 0.01121607, + "auxiliary_loss_mlp": 0.01038553, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.0425837, + "epoch": 0.3640462949045543, + "flos": 22235456603520.0, + "grad_norm": 1.9647165397438333, + "language_loss": 0.75846946, + "learning_rate": 2.9392602389080802e-06, + "loss": 0.78007108, + "num_input_tokens_seen": 130091375, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6055, + "time_per_iteration": 2.46478271484375 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.010384, + "balance_loss_clip": 1.0240891, + "balance_loss_mlp": 1.04369521, + "epoch": 0.3641064181572223, + "flos": 21543529939200.0, + "grad_norm": 1.6567803669377452, + "language_loss": 0.75263339, + "learning_rate": 2.938916379688765e-06, + "loss": 0.7742365, + "num_input_tokens_seen": 130111595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6056, + "time_per_iteration": 2.4739041328430176 + }, + { + "auxiliary_loss_clip": 0.01120031, + "auxiliary_loss_mlp": 0.01039041, + "balance_loss_clip": 1.02447379, + "balance_loss_mlp": 1.04331231, + "epoch": 0.3641665414098903, + "flos": 22273306560000.0, + "grad_norm": 2.0844054878938607, + "language_loss": 0.80676425, + "learning_rate": 2.9385724848656468e-06, + "loss": 0.82835501, + "num_input_tokens_seen": 130131440, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6057, + "time_per_iteration": 2.4778594970703125 + }, + { + "auxiliary_loss_clip": 0.01119344, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02457452, + "balance_loss_mlp": 1.04333091, + "epoch": 0.36422666466255826, + "flos": 28329676778880.0, + "grad_norm": 1.8744131952209395, + "language_loss": 0.79986346, + "learning_rate": 2.9382285544517647e-06, + "loss": 0.82144856, + "num_input_tokens_seen": 130151375, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6058, + "time_per_iteration": 2.5267081260681152 + }, + { + "auxiliary_loss_clip": 0.01119278, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02142191, + "balance_loss_mlp": 1.04207647, + "epoch": 0.36428678791522623, + "flos": 24170503109760.0, + "grad_norm": 1.8448855765347556, + "language_loss": 0.8485254, + "learning_rate": 2.9378845884601636e-06, + "loss": 0.87007678, + "num_input_tokens_seen": 130169960, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6059, + "time_per_iteration": 2.4876210689544678 + }, + { + "auxiliary_loss_clip": 0.01123355, + "auxiliary_loss_mlp": 0.01040047, + "balance_loss_clip": 1.02527666, + "balance_loss_mlp": 1.04397857, + "epoch": 0.3643469111678942, + "flos": 22528451842560.0, + "grad_norm": 1.4958849024653313, + "language_loss": 0.8783946, + "learning_rate": 2.937540586903884e-06, + "loss": 0.90002865, + "num_input_tokens_seen": 130189800, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6060, + "time_per_iteration": 2.516439199447632 + }, + { + "auxiliary_loss_clip": 0.01124396, + "auxiliary_loss_mlp": 0.0104086, + "balance_loss_clip": 1.02583957, + "balance_loss_mlp": 1.04366183, + "epoch": 0.36440703442056216, + "flos": 19426595938560.0, + "grad_norm": 2.6600271028380824, + "language_loss": 0.67965293, + "learning_rate": 2.937196549795971e-06, + "loss": 0.70130551, + "num_input_tokens_seen": 130206370, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6061, + "time_per_iteration": 2.4436440467834473 + }, + { + "auxiliary_loss_clip": 0.01127668, + "auxiliary_loss_mlp": 0.01039684, + "balance_loss_clip": 1.02444267, + "balance_loss_mlp": 1.04622734, + "epoch": 0.3644671576732301, + "flos": 18040515966720.0, + "grad_norm": 2.142951671935031, + "language_loss": 0.75072217, + "learning_rate": 2.9368524771494718e-06, + "loss": 0.77239573, + "num_input_tokens_seen": 130224445, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8125, + "step": 6062, + "time_per_iteration": 2.4325368404388428 + }, + { + "auxiliary_loss_clip": 0.011222, + "auxiliary_loss_mlp": 0.01035014, + "balance_loss_clip": 1.01910567, + "balance_loss_mlp": 1.04460645, + "epoch": 0.3645272809258981, + "flos": 21542811667200.0, + "grad_norm": 1.6782897381106048, + "language_loss": 0.72632384, + "learning_rate": 2.936508368977432e-06, + "loss": 0.74789596, + "num_input_tokens_seen": 130245380, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6063, + "time_per_iteration": 2.498168468475342 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01039322, + "balance_loss_clip": 1.0249579, + "balance_loss_mlp": 1.04365671, + "epoch": 0.36458740417856605, + "flos": 22746860490240.0, + "grad_norm": 1.8702732296649918, + "language_loss": 0.68128121, + "learning_rate": 2.936164225292901e-06, + "loss": 0.70288265, + "num_input_tokens_seen": 130265575, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6064, + "time_per_iteration": 2.4951584339141846 + }, + { + "auxiliary_loss_clip": 0.01125722, + "auxiliary_loss_mlp": 0.01046801, + "balance_loss_clip": 1.03205502, + "balance_loss_mlp": 1.04549003, + "epoch": 0.364647527431234, + "flos": 26140670138880.0, + "grad_norm": 1.679838788119498, + "language_loss": 0.74604851, + "learning_rate": 2.9358200461089297e-06, + "loss": 0.76777375, + "num_input_tokens_seen": 130286195, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6065, + "time_per_iteration": 2.4980344772338867 + }, + { + "auxiliary_loss_clip": 0.01125488, + "auxiliary_loss_mlp": 0.01041621, + "balance_loss_clip": 1.02544403, + "balance_loss_mlp": 1.04464209, + "epoch": 0.364707650683902, + "flos": 31029907737600.0, + "grad_norm": 1.8520658730284223, + "language_loss": 0.75248677, + "learning_rate": 2.9354758314385676e-06, + "loss": 0.77415788, + "num_input_tokens_seen": 130306095, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6066, + "time_per_iteration": 2.5525264739990234 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02101445, + "balance_loss_mlp": 1.04115653, + "epoch": 0.36476777393656995, + "flos": 19572896033280.0, + "grad_norm": 2.55479391525507, + "language_loss": 0.76988614, + "learning_rate": 2.9351315812948684e-06, + "loss": 0.79139876, + "num_input_tokens_seen": 130324685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6067, + "time_per_iteration": 2.440595865249634 + }, + { + "auxiliary_loss_clip": 0.01120327, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02422583, + "balance_loss_mlp": 1.04442596, + "epoch": 0.3648278971892379, + "flos": 17748849530880.0, + "grad_norm": 2.1532465459722574, + "language_loss": 0.70826519, + "learning_rate": 2.934787295690886e-06, + "loss": 0.72984099, + "num_input_tokens_seen": 130343855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6068, + "time_per_iteration": 2.4555468559265137 + }, + { + "auxiliary_loss_clip": 0.01123082, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.04301953, + "epoch": 0.3648880204419059, + "flos": 17931167988480.0, + "grad_norm": 1.8428063971352102, + "language_loss": 0.73987395, + "learning_rate": 2.9344429746396755e-06, + "loss": 0.76148373, + "num_input_tokens_seen": 130362320, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 6069, + "time_per_iteration": 2.4380593299865723 + }, + { + "auxiliary_loss_clip": 0.01124432, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.0203104, + "balance_loss_mlp": 1.04434299, + "epoch": 0.3649481436945739, + "flos": 22638266697600.0, + "grad_norm": 1.740540431199334, + "language_loss": 0.66149801, + "learning_rate": 2.9340986181542945e-06, + "loss": 0.68309319, + "num_input_tokens_seen": 130383165, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.80078125, + "step": 6070, + "time_per_iteration": 2.4852278232574463 + }, + { + "auxiliary_loss_clip": 0.01120342, + "auxiliary_loss_mlp": 0.01036109, + "balance_loss_clip": 1.02225685, + "balance_loss_mlp": 1.04412127, + "epoch": 0.36500826694724187, + "flos": 21579656042880.0, + "grad_norm": 1.5531027619052142, + "language_loss": 0.74474913, + "learning_rate": 2.9337542262477994e-06, + "loss": 0.76631367, + "num_input_tokens_seen": 130402425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6071, + "time_per_iteration": 2.483961820602417 + }, + { + "auxiliary_loss_clip": 0.01119978, + "auxiliary_loss_mlp": 0.01034213, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.04232538, + "epoch": 0.36506839019990983, + "flos": 13772533023360.0, + "grad_norm": 2.0347636440980277, + "language_loss": 0.88132894, + "learning_rate": 2.9334097989332506e-06, + "loss": 0.90287089, + "num_input_tokens_seen": 130419440, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6072, + "time_per_iteration": 2.4083876609802246 + }, + { + "auxiliary_loss_clip": 0.01121735, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.02184379, + "balance_loss_mlp": 1.04389739, + "epoch": 0.3651285134525778, + "flos": 17274972378240.0, + "grad_norm": 2.230203116909298, + "language_loss": 0.72432441, + "learning_rate": 2.9330653362237094e-06, + "loss": 0.74589849, + "num_input_tokens_seen": 130438495, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6073, + "time_per_iteration": 2.4769015312194824 + }, + { + "auxiliary_loss_clip": 0.01123465, + "auxiliary_loss_mlp": 0.01039544, + "balance_loss_clip": 1.02520275, + "balance_loss_mlp": 1.04425395, + "epoch": 0.36518863670524576, + "flos": 21907987286400.0, + "grad_norm": 1.8811318432297164, + "language_loss": 0.66584921, + "learning_rate": 2.932720838132236e-06, + "loss": 0.68747932, + "num_input_tokens_seen": 130455575, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6074, + "time_per_iteration": 2.4474194049835205 + }, + { + "auxiliary_loss_clip": 0.01117894, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01891208, + "balance_loss_mlp": 1.04079318, + "epoch": 0.3652487599579137, + "flos": 27122180250240.0, + "grad_norm": 1.5068114870819531, + "language_loss": 0.72946787, + "learning_rate": 2.9323763046718954e-06, + "loss": 0.75097322, + "num_input_tokens_seen": 130476385, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6075, + "time_per_iteration": 2.5063765048980713 + }, + { + "auxiliary_loss_clip": 0.01126029, + "auxiliary_loss_mlp": 0.01044219, + "balance_loss_clip": 1.02888894, + "balance_loss_mlp": 1.04484594, + "epoch": 0.3653088832105817, + "flos": 19755573626880.0, + "grad_norm": 2.7314154698808113, + "language_loss": 0.8938573, + "learning_rate": 2.9320317358557524e-06, + "loss": 0.91555977, + "num_input_tokens_seen": 130493630, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8125, + "step": 6076, + "time_per_iteration": 2.4518303871154785 + }, + { + "auxiliary_loss_clip": 0.01121617, + "auxiliary_loss_mlp": 0.01038999, + "balance_loss_clip": 1.02438378, + "balance_loss_mlp": 1.04457617, + "epoch": 0.36536900646324966, + "flos": 13115008609920.0, + "grad_norm": 2.2164690925931976, + "language_loss": 0.69506466, + "learning_rate": 2.931687131696872e-06, + "loss": 0.71667087, + "num_input_tokens_seen": 130510735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6077, + "time_per_iteration": 2.447659730911255 + }, + { + "auxiliary_loss_clip": 0.01043202, + "auxiliary_loss_mlp": 0.01009421, + "balance_loss_clip": 1.00758541, + "balance_loss_mlp": 1.01693892, + "epoch": 0.3654291297159176, + "flos": 71100472383360.0, + "grad_norm": 0.7520139059893192, + "language_loss": 0.61798048, + "learning_rate": 2.9313424922083224e-06, + "loss": 0.63850671, + "num_input_tokens_seen": 130577050, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.26171875, + "step": 6078, + "time_per_iteration": 3.1669509410858154 + }, + { + "auxiliary_loss_clip": 0.01119836, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02238369, + "balance_loss_mlp": 1.04217839, + "epoch": 0.3654892529685856, + "flos": 23617478338560.0, + "grad_norm": 1.8851740765331422, + "language_loss": 0.78088033, + "learning_rate": 2.930997817403173e-06, + "loss": 0.80244297, + "num_input_tokens_seen": 130593780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6079, + "time_per_iteration": 2.4570510387420654 + }, + { + "auxiliary_loss_clip": 0.01124854, + "auxiliary_loss_mlp": 0.01040383, + "balance_loss_clip": 1.02517176, + "balance_loss_mlp": 1.04497504, + "epoch": 0.36554937622125355, + "flos": 43470799850880.0, + "grad_norm": 2.129422570654268, + "language_loss": 0.62885886, + "learning_rate": 2.9306531072944913e-06, + "loss": 0.65051121, + "num_input_tokens_seen": 130615510, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6080, + "time_per_iteration": 2.65580415725708 + }, + { + "auxiliary_loss_clip": 0.01122781, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.04280567, + "epoch": 0.3656094994739215, + "flos": 23294641875840.0, + "grad_norm": 2.4061972925673385, + "language_loss": 0.67665905, + "learning_rate": 2.930308361895352e-06, + "loss": 0.69823289, + "num_input_tokens_seen": 130635410, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6081, + "time_per_iteration": 2.4747202396392822 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01038138, + "balance_loss_clip": 1.02287912, + "balance_loss_mlp": 1.04305673, + "epoch": 0.3656696227265895, + "flos": 24571984400640.0, + "grad_norm": 1.9082106177767983, + "language_loss": 0.74747473, + "learning_rate": 2.9299635812188257e-06, + "loss": 0.76910245, + "num_input_tokens_seen": 130657725, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.81640625, + "step": 6082, + "time_per_iteration": 2.5238633155822754 + }, + { + "auxiliary_loss_clip": 0.01126171, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01576877, + "balance_loss_mlp": 1.04598689, + "epoch": 0.3657297459792575, + "flos": 27928375056000.0, + "grad_norm": 1.8091692998669453, + "language_loss": 0.82823056, + "learning_rate": 2.929618765277987e-06, + "loss": 0.84978318, + "num_input_tokens_seen": 130678360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.8046875, + "step": 6083, + "time_per_iteration": 2.517704963684082 + }, + { + "auxiliary_loss_clip": 0.01041849, + "auxiliary_loss_mlp": 0.01002206, + "balance_loss_clip": 1.00026309, + "balance_loss_mlp": 1.01621974, + "epoch": 0.36578986923192547, + "flos": 67392622126080.0, + "grad_norm": 0.8152809684063654, + "language_loss": 0.59372437, + "learning_rate": 2.9292739140859125e-06, + "loss": 0.61416495, + "num_input_tokens_seen": 130742110, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25585938, + "step": 6084, + "time_per_iteration": 3.126275062561035 + }, + { + "auxiliary_loss_clip": 0.01121734, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.02273536, + "balance_loss_mlp": 1.04410744, + "epoch": 0.36584999248459343, + "flos": 20227511445120.0, + "grad_norm": 2.719357970509058, + "language_loss": 0.73096633, + "learning_rate": 2.9289290276556767e-06, + "loss": 0.75255334, + "num_input_tokens_seen": 130759870, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6085, + "time_per_iteration": 2.436722755432129 + }, + { + "auxiliary_loss_clip": 0.01122986, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01485801, + "balance_loss_mlp": 1.0447793, + "epoch": 0.3659101157372614, + "flos": 19062461813760.0, + "grad_norm": 4.360512376704014, + "language_loss": 0.7831111, + "learning_rate": 2.9285841060003604e-06, + "loss": 0.80462652, + "num_input_tokens_seen": 130778510, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 6086, + "time_per_iteration": 2.557521104812622 + }, + { + "auxiliary_loss_clip": 0.0111444, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.0150919, + "balance_loss_mlp": 1.0403074, + "epoch": 0.36597023898992936, + "flos": 30810708990720.0, + "grad_norm": 1.7974113126538098, + "language_loss": 0.77105325, + "learning_rate": 2.9282391491330416e-06, + "loss": 0.79248881, + "num_input_tokens_seen": 130798535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6087, + "time_per_iteration": 2.544868230819702 + }, + { + "auxiliary_loss_clip": 0.01121777, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01587856, + "balance_loss_mlp": 1.04190612, + "epoch": 0.36603036224259733, + "flos": 20521799573760.0, + "grad_norm": 5.741725291334025, + "language_loss": 0.70710862, + "learning_rate": 2.9278941570668002e-06, + "loss": 0.72863311, + "num_input_tokens_seen": 130816655, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6088, + "time_per_iteration": 2.491933822631836 + }, + { + "auxiliary_loss_clip": 0.01130389, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.01897383, + "balance_loss_mlp": 1.04569137, + "epoch": 0.3660904854952653, + "flos": 38329397798400.0, + "grad_norm": 1.6695945607154594, + "language_loss": 0.79878473, + "learning_rate": 2.92754912981472e-06, + "loss": 0.82043338, + "num_input_tokens_seen": 130841225, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.84765625, + "step": 6089, + "time_per_iteration": 2.666814088821411 + }, + { + "auxiliary_loss_clip": 0.01119748, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01816463, + "balance_loss_mlp": 1.04267049, + "epoch": 0.36615060874793326, + "flos": 21835555511040.0, + "grad_norm": 1.7190941707632215, + "language_loss": 0.71335226, + "learning_rate": 2.927204067389884e-06, + "loss": 0.73486418, + "num_input_tokens_seen": 130861050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 6090, + "time_per_iteration": 2.5138063430786133 + }, + { + "auxiliary_loss_clip": 0.01118993, + "auxiliary_loss_mlp": 0.01041328, + "balance_loss_clip": 1.02757084, + "balance_loss_mlp": 1.04391527, + "epoch": 0.3662107320006012, + "flos": 16581537342720.0, + "grad_norm": 1.9784029627642763, + "language_loss": 0.74276829, + "learning_rate": 2.9268589698053763e-06, + "loss": 0.76437145, + "num_input_tokens_seen": 130879775, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6091, + "time_per_iteration": 2.437126636505127 + }, + { + "auxiliary_loss_clip": 0.01120866, + "auxiliary_loss_mlp": 0.01039193, + "balance_loss_clip": 1.02506638, + "balance_loss_mlp": 1.04396391, + "epoch": 0.3662708552532692, + "flos": 20958365473920.0, + "grad_norm": 1.8707748404117035, + "language_loss": 0.72492194, + "learning_rate": 2.926513837074284e-06, + "loss": 0.74652249, + "num_input_tokens_seen": 130898070, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6092, + "time_per_iteration": 2.5038540363311768 + }, + { + "auxiliary_loss_clip": 0.01122728, + "auxiliary_loss_mlp": 0.01045071, + "balance_loss_clip": 1.03072441, + "balance_loss_mlp": 1.04359424, + "epoch": 0.36633097850593715, + "flos": 21902707987200.0, + "grad_norm": 1.9548617375197639, + "language_loss": 0.78251863, + "learning_rate": 2.9261686692096942e-06, + "loss": 0.8041966, + "num_input_tokens_seen": 130915250, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6093, + "time_per_iteration": 2.453854560852051 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.02226686, + "balance_loss_mlp": 1.04095936, + "epoch": 0.3663911017586051, + "flos": 32854133808000.0, + "grad_norm": 1.7535936892187265, + "language_loss": 0.74123377, + "learning_rate": 2.925823466224696e-06, + "loss": 0.76279384, + "num_input_tokens_seen": 130936995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7890625, + "step": 6094, + "time_per_iteration": 2.5953075885772705 + }, + { + "auxiliary_loss_clip": 0.01125058, + "auxiliary_loss_mlp": 0.01052761, + "balance_loss_clip": 1.0381875, + "balance_loss_mlp": 1.04492939, + "epoch": 0.3664512250112731, + "flos": 27271748482560.0, + "grad_norm": 1.5564182913572622, + "language_loss": 0.79226458, + "learning_rate": 2.9254782281323785e-06, + "loss": 0.81404281, + "num_input_tokens_seen": 130957970, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80078125, + "step": 6095, + "time_per_iteration": 5.4338037967681885 + }, + { + "auxiliary_loss_clip": 0.01125087, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.02055264, + "balance_loss_mlp": 1.04422212, + "epoch": 0.3665113482639411, + "flos": 17784436930560.0, + "grad_norm": 2.287741364035224, + "language_loss": 0.73586392, + "learning_rate": 2.925132954945834e-06, + "loss": 0.75747252, + "num_input_tokens_seen": 130974915, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80859375, + "step": 6096, + "time_per_iteration": 3.923590660095215 + }, + { + "auxiliary_loss_clip": 0.0112257, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.0195781, + "balance_loss_mlp": 1.04206252, + "epoch": 0.36657147151660907, + "flos": 27854614477440.0, + "grad_norm": 2.2038030169597875, + "language_loss": 0.67285162, + "learning_rate": 2.924787646678155e-06, + "loss": 0.69441259, + "num_input_tokens_seen": 130995745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6097, + "time_per_iteration": 2.4843504428863525 + }, + { + "auxiliary_loss_clip": 0.01123525, + "auxiliary_loss_mlp": 0.01038839, + "balance_loss_clip": 1.0249629, + "balance_loss_mlp": 1.04401898, + "epoch": 0.36663159476927704, + "flos": 25374013228800.0, + "grad_norm": 1.6404590263223953, + "language_loss": 0.77676886, + "learning_rate": 2.9244423033424365e-06, + "loss": 0.79839253, + "num_input_tokens_seen": 131015545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.796875, + "step": 6098, + "time_per_iteration": 2.5663979053497314 + }, + { + "auxiliary_loss_clip": 0.0111895, + "auxiliary_loss_mlp": 0.01038815, + "balance_loss_clip": 1.02467644, + "balance_loss_mlp": 1.04334557, + "epoch": 0.366691718021945, + "flos": 21357225072000.0, + "grad_norm": 1.7512654587161538, + "language_loss": 0.73807114, + "learning_rate": 2.9240969249517723e-06, + "loss": 0.7596488, + "num_input_tokens_seen": 131033990, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6099, + "time_per_iteration": 2.442549705505371 + }, + { + "auxiliary_loss_clip": 0.01116483, + "auxiliary_loss_mlp": 0.01047226, + "balance_loss_clip": 1.03380322, + "balance_loss_mlp": 1.04073739, + "epoch": 0.36675184127461297, + "flos": 16800376953600.0, + "grad_norm": 1.739052204204903, + "language_loss": 0.84383607, + "learning_rate": 2.9237515115192602e-06, + "loss": 0.86547315, + "num_input_tokens_seen": 131050710, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6100, + "time_per_iteration": 2.4783878326416016 + }, + { + "auxiliary_loss_clip": 0.01124265, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02046633, + "balance_loss_mlp": 1.04215789, + "epoch": 0.36681196452728093, + "flos": 21906514828800.0, + "grad_norm": 2.450199870045222, + "language_loss": 0.70504647, + "learning_rate": 2.9234060630579992e-06, + "loss": 0.72663701, + "num_input_tokens_seen": 131071435, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.8203125, + "step": 6101, + "time_per_iteration": 2.4591257572174072 + }, + { + "auxiliary_loss_clip": 0.01121661, + "auxiliary_loss_mlp": 0.01041857, + "balance_loss_clip": 1.02629983, + "balance_loss_mlp": 1.04228854, + "epoch": 0.3668720877799489, + "flos": 17712436118400.0, + "grad_norm": 2.0513606804107543, + "language_loss": 0.76049435, + "learning_rate": 2.9230605795810865e-06, + "loss": 0.78212953, + "num_input_tokens_seen": 131088775, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.79296875, + "step": 6102, + "time_per_iteration": 2.491046190261841 + }, + { + "auxiliary_loss_clip": 0.01126584, + "auxiliary_loss_mlp": 0.01036727, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04445052, + "epoch": 0.36693221103261686, + "flos": 47045455499520.0, + "grad_norm": 1.6383228145690705, + "language_loss": 0.69930172, + "learning_rate": 2.922715061101625e-06, + "loss": 0.72093487, + "num_input_tokens_seen": 131112800, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8203125, + "step": 6103, + "time_per_iteration": 2.676790952682495 + }, + { + "auxiliary_loss_clip": 0.01121704, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02213061, + "balance_loss_mlp": 1.0423454, + "epoch": 0.3669923342852848, + "flos": 15960929132160.0, + "grad_norm": 1.8701272650505458, + "language_loss": 0.71414149, + "learning_rate": 2.922369507632716e-06, + "loss": 0.73572791, + "num_input_tokens_seen": 131131150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6104, + "time_per_iteration": 2.438197374343872 + }, + { + "auxiliary_loss_clip": 0.01121263, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.04288161, + "epoch": 0.3670524575379528, + "flos": 19974485064960.0, + "grad_norm": 2.0275913231037923, + "language_loss": 0.81653488, + "learning_rate": 2.9220239191874617e-06, + "loss": 0.83807302, + "num_input_tokens_seen": 131150365, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6105, + "time_per_iteration": 2.437201976776123 + }, + { + "auxiliary_loss_clip": 0.0112675, + "auxiliary_loss_mlp": 0.010372, + "balance_loss_clip": 1.02255476, + "balance_loss_mlp": 1.0441767, + "epoch": 0.36711258079062076, + "flos": 25702955003520.0, + "grad_norm": 1.7477833912391936, + "language_loss": 0.81079835, + "learning_rate": 2.9216782957789692e-06, + "loss": 0.83243787, + "num_input_tokens_seen": 131169310, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.82421875, + "step": 6106, + "time_per_iteration": 2.5447771549224854 + }, + { + "auxiliary_loss_clip": 0.01041229, + "auxiliary_loss_mlp": 0.0100622, + "balance_loss_clip": 1.00440836, + "balance_loss_mlp": 1.01511836, + "epoch": 0.3671727040432887, + "flos": 60772743342720.0, + "grad_norm": 0.6829750500510474, + "language_loss": 0.59212124, + "learning_rate": 2.9213326374203426e-06, + "loss": 0.6125958, + "num_input_tokens_seen": 131232900, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.26171875, + "step": 6107, + "time_per_iteration": 3.0983083248138428 + }, + { + "auxiliary_loss_clip": 0.01119584, + "auxiliary_loss_mlp": 0.01031391, + "balance_loss_clip": 1.01756859, + "balance_loss_mlp": 1.04195333, + "epoch": 0.3672328272959567, + "flos": 18661303745280.0, + "grad_norm": 1.5524752326282045, + "language_loss": 0.74417794, + "learning_rate": 2.92098694412469e-06, + "loss": 0.7656877, + "num_input_tokens_seen": 131250920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6108, + "time_per_iteration": 2.5146114826202393 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01036173, + "balance_loss_clip": 1.02218354, + "balance_loss_mlp": 1.04104972, + "epoch": 0.3672929505486247, + "flos": 15049049535360.0, + "grad_norm": 2.0732100862766294, + "language_loss": 0.73141801, + "learning_rate": 2.9206412159051213e-06, + "loss": 0.7529856, + "num_input_tokens_seen": 131267910, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.796875, + "step": 6109, + "time_per_iteration": 2.4597368240356445 + }, + { + "auxiliary_loss_clip": 0.01118669, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02015734, + "balance_loss_mlp": 1.0407654, + "epoch": 0.3673530738012927, + "flos": 20589347099520.0, + "grad_norm": 1.8280489650426288, + "language_loss": 0.53282952, + "learning_rate": 2.920295452774744e-06, + "loss": 0.55435723, + "num_input_tokens_seen": 131287150, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6110, + "time_per_iteration": 2.5454814434051514 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.01949728, + "balance_loss_mlp": 1.04360104, + "epoch": 0.36741319705396064, + "flos": 21689830033920.0, + "grad_norm": 1.4515242715586747, + "language_loss": 0.8026799, + "learning_rate": 2.919949654746672e-06, + "loss": 0.82422882, + "num_input_tokens_seen": 131308225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76953125, + "step": 6111, + "time_per_iteration": 2.4838016033172607 + }, + { + "auxiliary_loss_clip": 0.01119124, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02637434, + "balance_loss_mlp": 1.04195952, + "epoch": 0.3674733203066286, + "flos": 29862200499840.0, + "grad_norm": 1.7574831080907656, + "language_loss": 0.72220403, + "learning_rate": 2.9196038218340163e-06, + "loss": 0.74380273, + "num_input_tokens_seen": 131332115, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6112, + "time_per_iteration": 2.590109348297119 + }, + { + "auxiliary_loss_clip": 0.01120572, + "auxiliary_loss_mlp": 0.01039294, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.04220295, + "epoch": 0.36753344355929657, + "flos": 18257021193600.0, + "grad_norm": 1.6166739673118746, + "language_loss": 0.85398543, + "learning_rate": 2.919257954049892e-06, + "loss": 0.87558413, + "num_input_tokens_seen": 131351885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6113, + "time_per_iteration": 2.4480674266815186 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036848, + "balance_loss_clip": 1.02228022, + "balance_loss_mlp": 1.04214144, + "epoch": 0.36759356681196453, + "flos": 25301150490240.0, + "grad_norm": 1.8814317352542869, + "language_loss": 0.78741604, + "learning_rate": 2.918912051407413e-06, + "loss": 0.80901164, + "num_input_tokens_seen": 131370245, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.8046875, + "step": 6114, + "time_per_iteration": 2.4870779514312744 + }, + { + "auxiliary_loss_clip": 0.01125295, + "auxiliary_loss_mlp": 0.01044195, + "balance_loss_clip": 1.0278033, + "balance_loss_mlp": 1.04344988, + "epoch": 0.3676536900646325, + "flos": 21032952065280.0, + "grad_norm": 1.5830307408310422, + "language_loss": 0.66854429, + "learning_rate": 2.918566113919698e-06, + "loss": 0.69023919, + "num_input_tokens_seen": 131388115, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 6115, + "time_per_iteration": 2.4361841678619385 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.01953745, + "balance_loss_mlp": 1.03984118, + "epoch": 0.36771381331730046, + "flos": 16288506190080.0, + "grad_norm": 2.406761648754093, + "language_loss": 0.76663208, + "learning_rate": 2.9182201415998636e-06, + "loss": 0.78811574, + "num_input_tokens_seen": 131404595, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6116, + "time_per_iteration": 2.417569875717163 + }, + { + "auxiliary_loss_clip": 0.01119646, + "auxiliary_loss_mlp": 0.01040473, + "balance_loss_clip": 1.02685893, + "balance_loss_mlp": 1.04111099, + "epoch": 0.36777393656996843, + "flos": 22309971367680.0, + "grad_norm": 1.9705222106020779, + "language_loss": 0.62811542, + "learning_rate": 2.9178741344610286e-06, + "loss": 0.64971662, + "num_input_tokens_seen": 131423760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 6117, + "time_per_iteration": 2.443798065185547 + }, + { + "auxiliary_loss_clip": 0.01118324, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.019261, + "balance_loss_mlp": 1.04137671, + "epoch": 0.3678340598226364, + "flos": 26834069260800.0, + "grad_norm": 1.9131647495504847, + "language_loss": 0.72974634, + "learning_rate": 2.9175280925163156e-06, + "loss": 0.75126612, + "num_input_tokens_seen": 131444955, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6118, + "time_per_iteration": 2.531804084777832 + }, + { + "auxiliary_loss_clip": 0.01123956, + "auxiliary_loss_mlp": 0.01042313, + "balance_loss_clip": 1.02694678, + "balance_loss_mlp": 1.04156733, + "epoch": 0.36789418307530436, + "flos": 21761723105280.0, + "grad_norm": 2.002097677722335, + "language_loss": 0.72413695, + "learning_rate": 2.9171820157788445e-06, + "loss": 0.7457996, + "num_input_tokens_seen": 131465720, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.82421875, + "step": 6119, + "time_per_iteration": 2.4641144275665283 + }, + { + "auxiliary_loss_clip": 0.01121284, + "auxiliary_loss_mlp": 0.01032475, + "balance_loss_clip": 1.0179317, + "balance_loss_mlp": 1.04397964, + "epoch": 0.3679543063279723, + "flos": 15924192497280.0, + "grad_norm": 1.84976209385018, + "language_loss": 0.79848421, + "learning_rate": 2.9168359042617404e-06, + "loss": 0.82002181, + "num_input_tokens_seen": 131483080, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6120, + "time_per_iteration": 2.487030029296875 + }, + { + "auxiliary_loss_clip": 0.01117761, + "auxiliary_loss_mlp": 0.01040133, + "balance_loss_clip": 1.02612031, + "balance_loss_mlp": 1.04084468, + "epoch": 0.3680144295806403, + "flos": 24275541456000.0, + "grad_norm": 1.8961465807450149, + "language_loss": 0.63855267, + "learning_rate": 2.916489757978126e-06, + "loss": 0.66013169, + "num_input_tokens_seen": 131502545, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6121, + "time_per_iteration": 2.4573564529418945 + }, + { + "auxiliary_loss_clip": 0.01122895, + "auxiliary_loss_mlp": 0.01042434, + "balance_loss_clip": 1.02755642, + "balance_loss_mlp": 1.0431416, + "epoch": 0.36807455283330826, + "flos": 26104148985600.0, + "grad_norm": 1.8845840511442051, + "language_loss": 0.71209222, + "learning_rate": 2.9161435769411286e-06, + "loss": 0.73374552, + "num_input_tokens_seen": 131522155, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6122, + "time_per_iteration": 2.5197854042053223 + }, + { + "auxiliary_loss_clip": 0.01116909, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02091694, + "balance_loss_mlp": 1.04319501, + "epoch": 0.3681346760859763, + "flos": 24644990793600.0, + "grad_norm": 1.8566190114316727, + "language_loss": 0.69493115, + "learning_rate": 2.915797361163875e-06, + "loss": 0.71644878, + "num_input_tokens_seen": 131543865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6123, + "time_per_iteration": 2.5585381984710693 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01039826, + "balance_loss_clip": 1.02426958, + "balance_loss_mlp": 1.04312396, + "epoch": 0.36819479933864424, + "flos": 23878369797120.0, + "grad_norm": 1.995367064863914, + "language_loss": 0.73392212, + "learning_rate": 2.9154511106594933e-06, + "loss": 0.7555719, + "num_input_tokens_seen": 131562155, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6124, + "time_per_iteration": 2.56925368309021 + }, + { + "auxiliary_loss_clip": 0.01121929, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02465916, + "balance_loss_mlp": 1.04337013, + "epoch": 0.3682549225913122, + "flos": 25553997302400.0, + "grad_norm": 1.997016319446362, + "language_loss": 0.74426562, + "learning_rate": 2.915104825441114e-06, + "loss": 0.76589334, + "num_input_tokens_seen": 131581695, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.7890625, + "step": 6125, + "time_per_iteration": 2.493232488632202 + }, + { + "auxiliary_loss_clip": 0.01124729, + "auxiliary_loss_mlp": 0.01046169, + "balance_loss_clip": 1.03009367, + "balance_loss_mlp": 1.04400194, + "epoch": 0.36831504584398017, + "flos": 16946605221120.0, + "grad_norm": 1.8135805598812564, + "language_loss": 0.78254056, + "learning_rate": 2.9147585055218686e-06, + "loss": 0.80424947, + "num_input_tokens_seen": 131599465, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6126, + "time_per_iteration": 2.4767327308654785 + }, + { + "auxiliary_loss_clip": 0.01123227, + "auxiliary_loss_mlp": 0.0103777, + "balance_loss_clip": 1.02125943, + "balance_loss_mlp": 1.04164457, + "epoch": 0.36837516909664814, + "flos": 19865065259520.0, + "grad_norm": 2.275366104968191, + "language_loss": 0.66100526, + "learning_rate": 2.914412150914888e-06, + "loss": 0.68261528, + "num_input_tokens_seen": 131618330, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.81640625, + "step": 6127, + "time_per_iteration": 2.4442801475524902 + }, + { + "auxiliary_loss_clip": 0.01126304, + "auxiliary_loss_mlp": 0.01042916, + "balance_loss_clip": 1.02783585, + "balance_loss_mlp": 1.04527378, + "epoch": 0.3684352923493161, + "flos": 37626984362880.0, + "grad_norm": 1.809419798014635, + "language_loss": 0.70553637, + "learning_rate": 2.9140657616333074e-06, + "loss": 0.72722864, + "num_input_tokens_seen": 131638960, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80859375, + "step": 6128, + "time_per_iteration": 2.6163570880889893 + }, + { + "auxiliary_loss_clip": 0.01121361, + "auxiliary_loss_mlp": 0.01041508, + "balance_loss_clip": 1.0266788, + "balance_loss_mlp": 1.04374862, + "epoch": 0.36849541560198407, + "flos": 14465501182080.0, + "grad_norm": 2.366686546837111, + "language_loss": 0.75425905, + "learning_rate": 2.9137193376902614e-06, + "loss": 0.77588773, + "num_input_tokens_seen": 131657440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6129, + "time_per_iteration": 2.418318510055542 + }, + { + "auxiliary_loss_clip": 0.01119858, + "auxiliary_loss_mlp": 0.01041313, + "balance_loss_clip": 1.02652466, + "balance_loss_mlp": 1.0419023, + "epoch": 0.36855553885465203, + "flos": 25770753924480.0, + "grad_norm": 1.583632674026135, + "language_loss": 0.84801334, + "learning_rate": 2.9133728790988868e-06, + "loss": 0.86962497, + "num_input_tokens_seen": 131678035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6130, + "time_per_iteration": 2.4933249950408936 + }, + { + "auxiliary_loss_clip": 0.01041681, + "auxiliary_loss_mlp": 0.010081, + "balance_loss_clip": 1.00623989, + "balance_loss_mlp": 1.01602125, + "epoch": 0.36861566210732, + "flos": 65049417377280.0, + "grad_norm": 0.8093683158704721, + "language_loss": 0.60352623, + "learning_rate": 2.913026385872321e-06, + "loss": 0.62402403, + "num_input_tokens_seen": 131742470, + "router_z_loss_clip": 0.01855469, + "router_z_loss_mlp": 0.2578125, + "step": 6131, + "time_per_iteration": 3.1686718463897705 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01657534, + "balance_loss_mlp": 1.04083943, + "epoch": 0.36867578535998796, + "flos": 30954495133440.0, + "grad_norm": 1.5510352980860918, + "language_loss": 0.72903317, + "learning_rate": 2.9126798580237034e-06, + "loss": 0.75052321, + "num_input_tokens_seen": 131764570, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6132, + "time_per_iteration": 2.54154109954834 + }, + { + "auxiliary_loss_clip": 0.01124361, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02221942, + "balance_loss_mlp": 1.04263651, + "epoch": 0.3687359086126559, + "flos": 28837956182400.0, + "grad_norm": 1.665822939326855, + "language_loss": 0.74255228, + "learning_rate": 2.9123332955661736e-06, + "loss": 0.76417446, + "num_input_tokens_seen": 131785720, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.81640625, + "step": 6133, + "time_per_iteration": 2.501119375228882 + }, + { + "auxiliary_loss_clip": 0.01118037, + "auxiliary_loss_mlp": 0.01038324, + "balance_loss_clip": 1.02420318, + "balance_loss_mlp": 1.04308438, + "epoch": 0.3687960318653239, + "flos": 21396798881280.0, + "grad_norm": 1.60564703390979, + "language_loss": 0.71415824, + "learning_rate": 2.911986698512874e-06, + "loss": 0.73572183, + "num_input_tokens_seen": 131804430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6134, + "time_per_iteration": 2.472978353500366 + }, + { + "auxiliary_loss_clip": 0.01121139, + "auxiliary_loss_mlp": 0.01035306, + "balance_loss_clip": 1.0202322, + "balance_loss_mlp": 1.04333591, + "epoch": 0.36885615511799186, + "flos": 20266043760000.0, + "grad_norm": 1.501197032587339, + "language_loss": 0.74985242, + "learning_rate": 2.9116400668769477e-06, + "loss": 0.77141684, + "num_input_tokens_seen": 131822060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.77734375, + "step": 6135, + "time_per_iteration": 2.458523750305176 + }, + { + "auxiliary_loss_clip": 0.01043215, + "auxiliary_loss_mlp": 0.01004045, + "balance_loss_clip": 1.00209045, + "balance_loss_mlp": 1.01762199, + "epoch": 0.3689162783706599, + "flos": 63088836301440.0, + "grad_norm": 0.8063752733434837, + "language_loss": 0.5878793, + "learning_rate": 2.9112934006715376e-06, + "loss": 0.60835183, + "num_input_tokens_seen": 131880715, + "router_z_loss_clip": 0.01953125, + "router_z_loss_mlp": 0.25585938, + "step": 6136, + "time_per_iteration": 2.9917385578155518 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01035902, + "balance_loss_clip": 1.02095878, + "balance_loss_mlp": 1.04477668, + "epoch": 0.36897640162332784, + "flos": 10961984419200.0, + "grad_norm": 1.8816926848284692, + "language_loss": 0.78812146, + "learning_rate": 2.9109466999097918e-06, + "loss": 0.80970407, + "num_input_tokens_seen": 131895850, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6137, + "time_per_iteration": 6.900243520736694 + }, + { + "auxiliary_loss_clip": 0.01122666, + "auxiliary_loss_mlp": 0.01040761, + "balance_loss_clip": 1.02594304, + "balance_loss_mlp": 1.04392326, + "epoch": 0.3690365248759958, + "flos": 20704297599360.0, + "grad_norm": 2.0278297083458345, + "language_loss": 0.74142605, + "learning_rate": 2.9105999646048552e-06, + "loss": 0.76306027, + "num_input_tokens_seen": 131915775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6138, + "time_per_iteration": 2.5056889057159424 + }, + { + "auxiliary_loss_clip": 0.01127012, + "auxiliary_loss_mlp": 0.01038954, + "balance_loss_clip": 1.02365959, + "balance_loss_mlp": 1.04482222, + "epoch": 0.3690966481286638, + "flos": 31826369957760.0, + "grad_norm": 1.957735157830462, + "language_loss": 0.64818108, + "learning_rate": 2.9102531947698764e-06, + "loss": 0.66984075, + "num_input_tokens_seen": 131935715, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6139, + "time_per_iteration": 2.5345380306243896 + }, + { + "auxiliary_loss_clip": 0.01119273, + "auxiliary_loss_mlp": 0.01040439, + "balance_loss_clip": 1.02563334, + "balance_loss_mlp": 1.04279661, + "epoch": 0.36915677138133174, + "flos": 13114936782720.0, + "grad_norm": 2.0918485574433734, + "language_loss": 0.71384197, + "learning_rate": 2.909906390418006e-06, + "loss": 0.73543906, + "num_input_tokens_seen": 131954120, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6140, + "time_per_iteration": 2.4318323135375977 + }, + { + "auxiliary_loss_clip": 0.01042951, + "auxiliary_loss_mlp": 0.00999596, + "balance_loss_clip": 0.99771231, + "balance_loss_mlp": 1.01712704, + "epoch": 0.3692168946339997, + "flos": 68686879956480.0, + "grad_norm": 0.7479140823872853, + "language_loss": 0.59281325, + "learning_rate": 2.9095595515623934e-06, + "loss": 0.61323869, + "num_input_tokens_seen": 132017485, + "router_z_loss_clip": 0.01879883, + "router_z_loss_mlp": 0.2578125, + "step": 6141, + "time_per_iteration": 3.1505937576293945 + }, + { + "auxiliary_loss_clip": 0.01122987, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02499199, + "balance_loss_mlp": 1.04369187, + "epoch": 0.36927701788666767, + "flos": 22017873968640.0, + "grad_norm": 1.768624510630746, + "language_loss": 0.7473368, + "learning_rate": 2.909212678216192e-06, + "loss": 0.76896417, + "num_input_tokens_seen": 132036760, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6142, + "time_per_iteration": 2.4768457412719727 + }, + { + "auxiliary_loss_clip": 0.01119694, + "auxiliary_loss_mlp": 0.01036766, + "balance_loss_clip": 1.02291358, + "balance_loss_mlp": 1.04270506, + "epoch": 0.36933714113933563, + "flos": 21835591424640.0, + "grad_norm": 2.5385068391341603, + "language_loss": 0.76985848, + "learning_rate": 2.908865770392555e-06, + "loss": 0.79142308, + "num_input_tokens_seen": 132056935, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6143, + "time_per_iteration": 2.4604313373565674 + }, + { + "auxiliary_loss_clip": 0.01118955, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02289248, + "balance_loss_mlp": 1.04277074, + "epoch": 0.3693972643920036, + "flos": 23691705793920.0, + "grad_norm": 1.4994482416842545, + "language_loss": 0.81616801, + "learning_rate": 2.9085188281046364e-06, + "loss": 0.83771598, + "num_input_tokens_seen": 132077285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6144, + "time_per_iteration": 2.529298782348633 + }, + { + "auxiliary_loss_clip": 0.0112261, + "auxiliary_loss_mlp": 0.01038443, + "balance_loss_clip": 1.02425694, + "balance_loss_mlp": 1.04323006, + "epoch": 0.36945738764467156, + "flos": 22856747172480.0, + "grad_norm": 1.9122738225408384, + "language_loss": 0.77019674, + "learning_rate": 2.908171851365593e-06, + "loss": 0.79180729, + "num_input_tokens_seen": 132095520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.79296875, + "step": 6145, + "time_per_iteration": 2.4642515182495117 + }, + { + "auxiliary_loss_clip": 0.01123051, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.01760387, + "balance_loss_mlp": 1.04384804, + "epoch": 0.36951751089733953, + "flos": 16615939593600.0, + "grad_norm": 1.7518336089815172, + "language_loss": 0.76903462, + "learning_rate": 2.9078248401885815e-06, + "loss": 0.79058653, + "num_input_tokens_seen": 132112810, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.79296875, + "step": 6146, + "time_per_iteration": 2.49208927154541 + }, + { + "auxiliary_loss_clip": 0.01125412, + "auxiliary_loss_mlp": 0.01042993, + "balance_loss_clip": 1.02746034, + "balance_loss_mlp": 1.04481673, + "epoch": 0.3695776341500075, + "flos": 18914545607040.0, + "grad_norm": 1.7861503855196468, + "language_loss": 0.80794239, + "learning_rate": 2.907477794586761e-06, + "loss": 0.82962638, + "num_input_tokens_seen": 132131615, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6147, + "time_per_iteration": 2.417968988418579 + }, + { + "auxiliary_loss_clip": 0.01120028, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.0238626, + "balance_loss_mlp": 1.04083371, + "epoch": 0.36963775740267546, + "flos": 20808474019200.0, + "grad_norm": 1.7356953572419536, + "language_loss": 0.83196342, + "learning_rate": 2.9071307145732926e-06, + "loss": 0.85353833, + "num_input_tokens_seen": 132149585, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.79296875, + "step": 6148, + "time_per_iteration": 2.4493086338043213 + }, + { + "auxiliary_loss_clip": 0.01118838, + "auxiliary_loss_mlp": 0.01038426, + "balance_loss_clip": 1.02424645, + "balance_loss_mlp": 1.04304922, + "epoch": 0.3696978806553435, + "flos": 26061881656320.0, + "grad_norm": 2.337121678381176, + "language_loss": 0.74373478, + "learning_rate": 2.9067836001613357e-06, + "loss": 0.76530743, + "num_input_tokens_seen": 132165555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6149, + "time_per_iteration": 2.4594686031341553 + }, + { + "auxiliary_loss_clip": 0.01124701, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02210915, + "balance_loss_mlp": 1.04449439, + "epoch": 0.36975800390801145, + "flos": 26833925606400.0, + "grad_norm": 1.7562888589836316, + "language_loss": 0.70538592, + "learning_rate": 2.906436451364054e-06, + "loss": 0.72701365, + "num_input_tokens_seen": 132185100, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6150, + "time_per_iteration": 2.5232975482940674 + }, + { + "auxiliary_loss_clip": 0.01121201, + "auxiliary_loss_mlp": 0.01039019, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04390609, + "epoch": 0.3698181271606794, + "flos": 21142623265920.0, + "grad_norm": 1.6469943204532072, + "language_loss": 0.82023048, + "learning_rate": 2.906089268194611e-06, + "loss": 0.84183264, + "num_input_tokens_seen": 132203930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6151, + "time_per_iteration": 2.448066473007202 + }, + { + "auxiliary_loss_clip": 0.01036606, + "auxiliary_loss_mlp": 0.01001329, + "balance_loss_clip": 0.99951726, + "balance_loss_mlp": 1.01119328, + "epoch": 0.3698782504133474, + "flos": 66742639568640.0, + "grad_norm": 0.838014312453704, + "language_loss": 0.63083476, + "learning_rate": 2.9057420506661726e-06, + "loss": 0.65121406, + "num_input_tokens_seen": 132263845, + "router_z_loss_clip": 0.01806641, + "router_z_loss_mlp": 0.25390625, + "step": 6152, + "time_per_iteration": 3.170707941055298 + }, + { + "auxiliary_loss_clip": 0.01117624, + "auxiliary_loss_mlp": 0.01037374, + "balance_loss_clip": 1.02347398, + "balance_loss_mlp": 1.0429337, + "epoch": 0.36993837366601534, + "flos": 24311523905280.0, + "grad_norm": 1.8166659348284784, + "language_loss": 0.70360208, + "learning_rate": 2.9053947987919044e-06, + "loss": 0.72515202, + "num_input_tokens_seen": 132282350, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6153, + "time_per_iteration": 2.480318546295166 + }, + { + "auxiliary_loss_clip": 0.01123537, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02420688, + "balance_loss_mlp": 1.04319179, + "epoch": 0.3699984969186833, + "flos": 24349194293760.0, + "grad_norm": 2.0600031325492107, + "language_loss": 0.72201782, + "learning_rate": 2.9050475125849755e-06, + "loss": 0.74364597, + "num_input_tokens_seen": 132301930, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6154, + "time_per_iteration": 2.48018479347229 + }, + { + "auxiliary_loss_clip": 0.0111958, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01624274, + "balance_loss_mlp": 1.04201758, + "epoch": 0.37005862017135127, + "flos": 19829154637440.0, + "grad_norm": 1.8383479148193087, + "language_loss": 0.67877179, + "learning_rate": 2.9047001920585534e-06, + "loss": 0.70026708, + "num_input_tokens_seen": 132320915, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6155, + "time_per_iteration": 2.454582929611206 + }, + { + "auxiliary_loss_clip": 0.01119091, + "auxiliary_loss_mlp": 0.01028886, + "balance_loss_clip": 1.01518905, + "balance_loss_mlp": 1.0420723, + "epoch": 0.37011874342401924, + "flos": 19573793873280.0, + "grad_norm": 1.7213710867444976, + "language_loss": 0.67835188, + "learning_rate": 2.9043528372258097e-06, + "loss": 0.6998316, + "num_input_tokens_seen": 132340415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6156, + "time_per_iteration": 2.456244707107544 + }, + { + "auxiliary_loss_clip": 0.01117904, + "auxiliary_loss_mlp": 0.01038018, + "balance_loss_clip": 1.02461255, + "balance_loss_mlp": 1.04180884, + "epoch": 0.3701788666766872, + "flos": 20374350243840.0, + "grad_norm": 1.7871024658649661, + "language_loss": 0.82324016, + "learning_rate": 2.904005448099916e-06, + "loss": 0.8447994, + "num_input_tokens_seen": 132358600, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6157, + "time_per_iteration": 2.467258930206299 + }, + { + "auxiliary_loss_clip": 0.0112233, + "auxiliary_loss_mlp": 0.0103747, + "balance_loss_clip": 1.02214015, + "balance_loss_mlp": 1.04224074, + "epoch": 0.37023898992935517, + "flos": 15340931452800.0, + "grad_norm": 2.319348977212497, + "language_loss": 0.76519799, + "learning_rate": 2.9036580246940444e-06, + "loss": 0.78679597, + "num_input_tokens_seen": 132373160, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6158, + "time_per_iteration": 2.4462850093841553 + }, + { + "auxiliary_loss_clip": 0.01121021, + "auxiliary_loss_mlp": 0.01037892, + "balance_loss_clip": 1.02276468, + "balance_loss_mlp": 1.04128695, + "epoch": 0.37029911318202313, + "flos": 19573937527680.0, + "grad_norm": 2.3237426114128903, + "language_loss": 0.6888833, + "learning_rate": 2.9033105670213708e-06, + "loss": 0.71047246, + "num_input_tokens_seen": 132392345, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 6159, + "time_per_iteration": 2.444615364074707 + }, + { + "auxiliary_loss_clip": 0.0111775, + "auxiliary_loss_mlp": 0.0103647, + "balance_loss_clip": 1.02298164, + "balance_loss_mlp": 1.04054952, + "epoch": 0.3703592364346911, + "flos": 26213353309440.0, + "grad_norm": 1.7829911261722147, + "language_loss": 0.7101602, + "learning_rate": 2.9029630750950697e-06, + "loss": 0.73170245, + "num_input_tokens_seen": 132412620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 6160, + "time_per_iteration": 2.4807472229003906 + }, + { + "auxiliary_loss_clip": 0.01114504, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01808465, + "balance_loss_mlp": 1.04033566, + "epoch": 0.37041935968735906, + "flos": 20048317470720.0, + "grad_norm": 1.5671410195286926, + "language_loss": 0.79049259, + "learning_rate": 2.9026155489283176e-06, + "loss": 0.81194532, + "num_input_tokens_seen": 132431570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6161, + "time_per_iteration": 2.445615768432617 + }, + { + "auxiliary_loss_clip": 0.01119907, + "auxiliary_loss_mlp": 0.01037146, + "balance_loss_clip": 1.02266204, + "balance_loss_mlp": 1.04217172, + "epoch": 0.3704794829400271, + "flos": 24133802388480.0, + "grad_norm": 1.6578530571842398, + "language_loss": 0.7961942, + "learning_rate": 2.902267988534295e-06, + "loss": 0.81776464, + "num_input_tokens_seen": 132451525, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6162, + "time_per_iteration": 2.474179267883301 + }, + { + "auxiliary_loss_clip": 0.01118518, + "auxiliary_loss_mlp": 0.01035343, + "balance_loss_clip": 1.02122831, + "balance_loss_mlp": 1.04136944, + "epoch": 0.37053960619269505, + "flos": 14866874732160.0, + "grad_norm": 1.751569507310971, + "language_loss": 0.79592955, + "learning_rate": 2.9019203939261783e-06, + "loss": 0.81746811, + "num_input_tokens_seen": 132469875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6163, + "time_per_iteration": 2.429410696029663 + }, + { + "auxiliary_loss_clip": 0.01121642, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.01815772, + "balance_loss_mlp": 1.04239571, + "epoch": 0.370599729445363, + "flos": 21361498790400.0, + "grad_norm": 1.6995697719291154, + "language_loss": 0.68002689, + "learning_rate": 2.9015727651171507e-06, + "loss": 0.70157188, + "num_input_tokens_seen": 132488360, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.79296875, + "step": 6164, + "time_per_iteration": 2.4500439167022705 + }, + { + "auxiliary_loss_clip": 0.01125233, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.0206207, + "balance_loss_mlp": 1.04507017, + "epoch": 0.370659852698031, + "flos": 26829041356800.0, + "grad_norm": 2.4697759057606197, + "language_loss": 0.82807398, + "learning_rate": 2.9012251021203935e-06, + "loss": 0.84968388, + "num_input_tokens_seen": 132508630, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6165, + "time_per_iteration": 2.4863715171813965 + }, + { + "auxiliary_loss_clip": 0.01125688, + "auxiliary_loss_mlp": 0.01036899, + "balance_loss_clip": 1.02060854, + "balance_loss_mlp": 1.04388845, + "epoch": 0.37071997595069894, + "flos": 19099018880640.0, + "grad_norm": 1.8224972170046692, + "language_loss": 0.69500774, + "learning_rate": 2.9008774049490896e-06, + "loss": 0.71663356, + "num_input_tokens_seen": 132527465, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.81640625, + "step": 6166, + "time_per_iteration": 2.560605049133301 + }, + { + "auxiliary_loss_clip": 0.01038031, + "auxiliary_loss_mlp": 0.01006399, + "balance_loss_clip": 1.00471771, + "balance_loss_mlp": 1.01302195, + "epoch": 0.3707800992033669, + "flos": 52178384920320.0, + "grad_norm": 0.8093247029889314, + "language_loss": 0.56892115, + "learning_rate": 2.9005296736164244e-06, + "loss": 0.58936548, + "num_input_tokens_seen": 132579940, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6167, + "time_per_iteration": 2.922917127609253 + }, + { + "auxiliary_loss_clip": 0.01118731, + "auxiliary_loss_mlp": 0.01033249, + "balance_loss_clip": 1.01992154, + "balance_loss_mlp": 1.04288507, + "epoch": 0.3708402224560349, + "flos": 19901837808000.0, + "grad_norm": 1.945139483069219, + "language_loss": 0.75539452, + "learning_rate": 2.900181908135584e-06, + "loss": 0.77691436, + "num_input_tokens_seen": 132598390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6168, + "time_per_iteration": 2.4489872455596924 + }, + { + "auxiliary_loss_clip": 0.01120115, + "auxiliary_loss_mlp": 0.01035803, + "balance_loss_clip": 1.02202857, + "balance_loss_mlp": 1.04180634, + "epoch": 0.37090034570870284, + "flos": 20007630339840.0, + "grad_norm": 2.5586684776543853, + "language_loss": 0.7432459, + "learning_rate": 2.899834108519755e-06, + "loss": 0.76480508, + "num_input_tokens_seen": 132616920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 6169, + "time_per_iteration": 2.4537463188171387 + }, + { + "auxiliary_loss_clip": 0.01120897, + "auxiliary_loss_mlp": 0.01032585, + "balance_loss_clip": 1.01891184, + "balance_loss_mlp": 1.04480267, + "epoch": 0.3709604689613708, + "flos": 24134700228480.0, + "grad_norm": 1.3706540261028175, + "language_loss": 0.79311681, + "learning_rate": 2.899486274782127e-06, + "loss": 0.81465161, + "num_input_tokens_seen": 132637660, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6170, + "time_per_iteration": 2.4723992347717285 + }, + { + "auxiliary_loss_clip": 0.01122845, + "auxiliary_loss_mlp": 0.01038875, + "balance_loss_clip": 1.02390242, + "balance_loss_mlp": 1.04451621, + "epoch": 0.37102059221403877, + "flos": 23876071326720.0, + "grad_norm": 1.6235616399590074, + "language_loss": 0.76385272, + "learning_rate": 2.8991384069358885e-06, + "loss": 0.78546989, + "num_input_tokens_seen": 132657635, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6171, + "time_per_iteration": 2.5364768505096436 + }, + { + "auxiliary_loss_clip": 0.01123724, + "auxiliary_loss_mlp": 0.01031905, + "balance_loss_clip": 1.01663446, + "balance_loss_mlp": 1.04594254, + "epoch": 0.37108071546670673, + "flos": 14501268149760.0, + "grad_norm": 1.9768297571305458, + "language_loss": 0.80696416, + "learning_rate": 2.898790504994232e-06, + "loss": 0.82852054, + "num_input_tokens_seen": 132674455, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6172, + "time_per_iteration": 2.451099395751953 + }, + { + "auxiliary_loss_clip": 0.01124197, + "auxiliary_loss_mlp": 0.01037606, + "balance_loss_clip": 1.0219543, + "balance_loss_mlp": 1.04385138, + "epoch": 0.3711408387193747, + "flos": 34562619279360.0, + "grad_norm": 2.2157067962534875, + "language_loss": 0.59447742, + "learning_rate": 2.89844256897035e-06, + "loss": 0.61609542, + "num_input_tokens_seen": 132695140, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 6173, + "time_per_iteration": 2.5750677585601807 + }, + { + "auxiliary_loss_clip": 0.01121876, + "auxiliary_loss_mlp": 0.01036073, + "balance_loss_clip": 1.02122533, + "balance_loss_mlp": 1.04391754, + "epoch": 0.37120096197204266, + "flos": 17310703432320.0, + "grad_norm": 1.9248503394254857, + "language_loss": 0.81157243, + "learning_rate": 2.898094598877435e-06, + "loss": 0.83315188, + "num_input_tokens_seen": 132712470, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6174, + "time_per_iteration": 2.421182155609131 + }, + { + "auxiliary_loss_clip": 0.01117919, + "auxiliary_loss_mlp": 0.01033906, + "balance_loss_clip": 1.02035165, + "balance_loss_mlp": 1.04281855, + "epoch": 0.37126108522471063, + "flos": 30664049760000.0, + "grad_norm": 1.8542839121663495, + "language_loss": 0.79834068, + "learning_rate": 2.8977465947286826e-06, + "loss": 0.81985891, + "num_input_tokens_seen": 132732945, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6175, + "time_per_iteration": 2.533447027206421 + }, + { + "auxiliary_loss_clip": 0.01124428, + "auxiliary_loss_mlp": 0.01046668, + "balance_loss_clip": 1.03194535, + "balance_loss_mlp": 1.04644537, + "epoch": 0.37132120847737865, + "flos": 25155640494720.0, + "grad_norm": 1.6734071315129293, + "language_loss": 0.88764346, + "learning_rate": 2.89739855653729e-06, + "loss": 0.90935433, + "num_input_tokens_seen": 132752470, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6176, + "time_per_iteration": 2.486224412918091 + }, + { + "auxiliary_loss_clip": 0.01122363, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02174938, + "balance_loss_mlp": 1.04402244, + "epoch": 0.3713813317300466, + "flos": 21213474842880.0, + "grad_norm": 1.5809846817738957, + "language_loss": 0.73293233, + "learning_rate": 2.8970504843164546e-06, + "loss": 0.75451624, + "num_input_tokens_seen": 132771485, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6177, + "time_per_iteration": 2.492033004760742 + }, + { + "auxiliary_loss_clip": 0.01119881, + "auxiliary_loss_mlp": 0.01039443, + "balance_loss_clip": 1.02551913, + "balance_loss_mlp": 1.04359818, + "epoch": 0.3714414549827146, + "flos": 21616644072960.0, + "grad_norm": 1.8832415058442271, + "language_loss": 0.75425023, + "learning_rate": 2.896702378079374e-06, + "loss": 0.77584344, + "num_input_tokens_seen": 132791465, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6178, + "time_per_iteration": 4.005537748336792 + }, + { + "auxiliary_loss_clip": 0.01123036, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.01896191, + "balance_loss_mlp": 1.04618645, + "epoch": 0.37150157823538255, + "flos": 19972294335360.0, + "grad_norm": 1.761738877644596, + "language_loss": 0.7228415, + "learning_rate": 2.8963542378392502e-06, + "loss": 0.74440265, + "num_input_tokens_seen": 132810160, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6179, + "time_per_iteration": 5.333393812179565 + }, + { + "auxiliary_loss_clip": 0.01122372, + "auxiliary_loss_mlp": 0.01035165, + "balance_loss_clip": 1.01987052, + "balance_loss_mlp": 1.04356897, + "epoch": 0.3715617014880505, + "flos": 24860562266880.0, + "grad_norm": 2.1666258639633518, + "language_loss": 0.69705212, + "learning_rate": 2.896006063609283e-06, + "loss": 0.71862751, + "num_input_tokens_seen": 132831265, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6180, + "time_per_iteration": 2.4896974563598633 + }, + { + "auxiliary_loss_clip": 0.01117383, + "auxiliary_loss_mlp": 0.01030855, + "balance_loss_clip": 1.01695561, + "balance_loss_mlp": 1.04157031, + "epoch": 0.3716218247407185, + "flos": 20449080489600.0, + "grad_norm": 1.7756296340851163, + "language_loss": 0.77702844, + "learning_rate": 2.8956578554026767e-06, + "loss": 0.79851079, + "num_input_tokens_seen": 132850005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6181, + "time_per_iteration": 2.4324231147766113 + }, + { + "auxiliary_loss_clip": 0.01118444, + "auxiliary_loss_mlp": 0.0103491, + "balance_loss_clip": 1.0202775, + "balance_loss_mlp": 1.04225945, + "epoch": 0.37168194799338644, + "flos": 24133479166080.0, + "grad_norm": 1.8526172549307973, + "language_loss": 0.78767365, + "learning_rate": 2.8953096132326343e-06, + "loss": 0.80920726, + "num_input_tokens_seen": 132865790, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6182, + "time_per_iteration": 2.47566819190979 + }, + { + "auxiliary_loss_clip": 0.01036072, + "auxiliary_loss_mlp": 0.01008449, + "balance_loss_clip": 1.00650644, + "balance_loss_mlp": 1.01082778, + "epoch": 0.3717420712460544, + "flos": 67408926900480.0, + "grad_norm": 0.7841437663574693, + "language_loss": 0.5748502, + "learning_rate": 2.894961337112362e-06, + "loss": 0.59529543, + "num_input_tokens_seen": 132921775, + "router_z_loss_clip": 0.01940918, + "router_z_loss_mlp": 0.25195312, + "step": 6183, + "time_per_iteration": 3.0538721084594727 + }, + { + "auxiliary_loss_clip": 0.01124733, + "auxiliary_loss_mlp": 0.010435, + "balance_loss_clip": 1.02772832, + "balance_loss_mlp": 1.04238844, + "epoch": 0.37180219449872237, + "flos": 22376908362240.0, + "grad_norm": 2.1996761862640715, + "language_loss": 0.76940209, + "learning_rate": 2.894613027055066e-06, + "loss": 0.79108441, + "num_input_tokens_seen": 132941060, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.82421875, + "step": 6184, + "time_per_iteration": 2.4653987884521484 + }, + { + "auxiliary_loss_clip": 0.0111964, + "auxiliary_loss_mlp": 0.01036854, + "balance_loss_clip": 1.02268612, + "balance_loss_mlp": 1.04353404, + "epoch": 0.37186231775139034, + "flos": 21869885934720.0, + "grad_norm": 13.965274526936179, + "language_loss": 0.72047049, + "learning_rate": 2.894264683073954e-06, + "loss": 0.74203539, + "num_input_tokens_seen": 132961850, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6185, + "time_per_iteration": 2.458340644836426 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.01282895, + "balance_loss_mlp": 1.04169369, + "epoch": 0.3719224410040583, + "flos": 22415225195520.0, + "grad_norm": 1.55661462109525, + "language_loss": 0.7702297, + "learning_rate": 2.8939163051822363e-06, + "loss": 0.79167652, + "num_input_tokens_seen": 132981625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6186, + "time_per_iteration": 2.4665393829345703 + }, + { + "auxiliary_loss_clip": 0.01125099, + "auxiliary_loss_mlp": 0.01041622, + "balance_loss_clip": 1.02608871, + "balance_loss_mlp": 1.0436089, + "epoch": 0.37198256425672627, + "flos": 25151223121920.0, + "grad_norm": 1.8483894715485976, + "language_loss": 0.83475709, + "learning_rate": 2.8935678933931224e-06, + "loss": 0.85642433, + "num_input_tokens_seen": 133001225, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8125, + "step": 6187, + "time_per_iteration": 2.520294427871704 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02228546, + "balance_loss_mlp": 1.0421021, + "epoch": 0.37204268750939423, + "flos": 21138313633920.0, + "grad_norm": 2.555128723697134, + "language_loss": 0.84544367, + "learning_rate": 2.893219447719824e-06, + "loss": 0.86700106, + "num_input_tokens_seen": 133018820, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6188, + "time_per_iteration": 2.4926793575286865 + }, + { + "auxiliary_loss_clip": 0.01121509, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.01966548, + "balance_loss_mlp": 1.04392672, + "epoch": 0.37210281076206225, + "flos": 21506829217920.0, + "grad_norm": 1.6829112555225307, + "language_loss": 0.65646267, + "learning_rate": 2.8928709681755548e-06, + "loss": 0.67802715, + "num_input_tokens_seen": 133040205, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7734375, + "step": 6189, + "time_per_iteration": 2.447175979614258 + }, + { + "auxiliary_loss_clip": 0.01122656, + "auxiliary_loss_mlp": 0.01039465, + "balance_loss_clip": 1.02514815, + "balance_loss_mlp": 1.04456878, + "epoch": 0.3721629340147302, + "flos": 17347835116800.0, + "grad_norm": 2.6073714147883162, + "language_loss": 0.83948457, + "learning_rate": 2.8925224547735293e-06, + "loss": 0.8611058, + "num_input_tokens_seen": 133058095, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6190, + "time_per_iteration": 2.4410126209259033 + }, + { + "auxiliary_loss_clip": 0.01125721, + "auxiliary_loss_mlp": 0.01033909, + "balance_loss_clip": 1.01949084, + "balance_loss_mlp": 1.04337156, + "epoch": 0.3722230572673982, + "flos": 16432400073600.0, + "grad_norm": 2.3404623023220643, + "language_loss": 0.88506198, + "learning_rate": 2.8921739075269633e-06, + "loss": 0.90665835, + "num_input_tokens_seen": 133071530, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.82421875, + "step": 6191, + "time_per_iteration": 2.452972650527954 + }, + { + "auxiliary_loss_clip": 0.01123549, + "auxiliary_loss_mlp": 0.01033146, + "balance_loss_clip": 1.01648057, + "balance_loss_mlp": 1.04218102, + "epoch": 0.37228318052006615, + "flos": 22674716023680.0, + "grad_norm": 1.570395080331924, + "language_loss": 0.74228191, + "learning_rate": 2.891825326449073e-06, + "loss": 0.76384884, + "num_input_tokens_seen": 133091410, + "router_z_loss_clip": 0.16699219, + "router_z_loss_mlp": 0.8125, + "step": 6192, + "time_per_iteration": 2.6486353874206543 + }, + { + "auxiliary_loss_clip": 0.01119485, + "auxiliary_loss_mlp": 0.01036496, + "balance_loss_clip": 1.02246475, + "balance_loss_mlp": 1.0427109, + "epoch": 0.3723433037727341, + "flos": 25265491263360.0, + "grad_norm": 2.4820365699908944, + "language_loss": 0.79760754, + "learning_rate": 2.8914767115530766e-06, + "loss": 0.81916732, + "num_input_tokens_seen": 133110365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6193, + "time_per_iteration": 2.525973081588745 + }, + { + "auxiliary_loss_clip": 0.01123101, + "auxiliary_loss_mlp": 0.01039009, + "balance_loss_clip": 1.02436423, + "balance_loss_mlp": 1.043504, + "epoch": 0.3724034270254021, + "flos": 10524664333440.0, + "grad_norm": 1.7895472081978328, + "language_loss": 0.84495157, + "learning_rate": 2.891128062852194e-06, + "loss": 0.86657262, + "num_input_tokens_seen": 133128255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6194, + "time_per_iteration": 2.419099807739258 + }, + { + "auxiliary_loss_clip": 0.01118251, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02080166, + "balance_loss_mlp": 1.04037666, + "epoch": 0.37246355027807004, + "flos": 20266223328000.0, + "grad_norm": 2.9207659578016463, + "language_loss": 0.77555239, + "learning_rate": 2.890779380359646e-06, + "loss": 0.79708451, + "num_input_tokens_seen": 133143975, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 6195, + "time_per_iteration": 2.3995044231414795 + }, + { + "auxiliary_loss_clip": 0.01119279, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01814234, + "balance_loss_mlp": 1.0428412, + "epoch": 0.372523673530738, + "flos": 19500571998720.0, + "grad_norm": 1.677102671463593, + "language_loss": 0.79111922, + "learning_rate": 2.890430664088655e-06, + "loss": 0.81263697, + "num_input_tokens_seen": 133162935, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 6196, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01036406, + "balance_loss_clip": 1.02235723, + "balance_loss_mlp": 1.04315817, + "epoch": 0.372583796783406, + "flos": 16764250849920.0, + "grad_norm": 1.8393036550873767, + "language_loss": 0.8332746, + "learning_rate": 2.890081914052443e-06, + "loss": 0.85483867, + "num_input_tokens_seen": 133181180, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6197, + "time_per_iteration": 2.392005443572998 + }, + { + "auxiliary_loss_clip": 0.01115911, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0202899, + "balance_loss_mlp": 1.04070568, + "epoch": 0.37264392003607394, + "flos": 22637979388800.0, + "grad_norm": 2.267147370646453, + "language_loss": 0.64613056, + "learning_rate": 2.889733130264237e-06, + "loss": 0.66764355, + "num_input_tokens_seen": 133199615, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 6198, + "time_per_iteration": 2.4624876976013184 + }, + { + "auxiliary_loss_clip": 0.0111678, + "auxiliary_loss_mlp": 0.01043759, + "balance_loss_clip": 1.02989507, + "balance_loss_mlp": 1.04129016, + "epoch": 0.3727040432887419, + "flos": 19973120348160.0, + "grad_norm": 2.4815957641530084, + "language_loss": 0.7439245, + "learning_rate": 2.889384312737261e-06, + "loss": 0.76552987, + "num_input_tokens_seen": 133219650, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6199, + "time_per_iteration": 2.454932689666748 + }, + { + "auxiliary_loss_clip": 0.01117342, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.04112601, + "epoch": 0.37276416654140987, + "flos": 63899122279680.0, + "grad_norm": 1.569210214205425, + "language_loss": 0.80711329, + "learning_rate": 2.889035461484742e-06, + "loss": 0.82861221, + "num_input_tokens_seen": 133245675, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 6200, + "time_per_iteration": 2.853854179382324 + }, + { + "auxiliary_loss_clip": 0.01118801, + "auxiliary_loss_mlp": 0.01040438, + "balance_loss_clip": 1.02588272, + "balance_loss_mlp": 1.04248428, + "epoch": 0.37282428979407783, + "flos": 39785970211200.0, + "grad_norm": 2.046105641958108, + "language_loss": 0.60723466, + "learning_rate": 2.88868657651991e-06, + "loss": 0.6288271, + "num_input_tokens_seen": 133266905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6201, + "time_per_iteration": 2.58642315864563 + }, + { + "auxiliary_loss_clip": 0.01122167, + "auxiliary_loss_mlp": 0.01032753, + "balance_loss_clip": 1.01813745, + "balance_loss_mlp": 1.04334736, + "epoch": 0.37288441304674586, + "flos": 22709046447360.0, + "grad_norm": 1.5967185311646992, + "language_loss": 0.72980845, + "learning_rate": 2.8883376578559934e-06, + "loss": 0.75135767, + "num_input_tokens_seen": 133286865, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6202, + "time_per_iteration": 2.461116075515747 + }, + { + "auxiliary_loss_clip": 0.01120095, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02064919, + "balance_loss_mlp": 1.04372942, + "epoch": 0.3729445362994138, + "flos": 18770292587520.0, + "grad_norm": 2.8761852736669793, + "language_loss": 0.739654, + "learning_rate": 2.8879887055062243e-06, + "loss": 0.76120287, + "num_input_tokens_seen": 133305295, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6203, + "time_per_iteration": 2.4199976921081543 + }, + { + "auxiliary_loss_clip": 0.01113815, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.03933048, + "epoch": 0.3730046595520818, + "flos": 22456199635200.0, + "grad_norm": 1.6894031212763305, + "language_loss": 0.81359541, + "learning_rate": 2.8876397194838353e-06, + "loss": 0.83506644, + "num_input_tokens_seen": 133324625, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 6204, + "time_per_iteration": 2.527442693710327 + }, + { + "auxiliary_loss_clip": 0.01122288, + "auxiliary_loss_mlp": 0.01040396, + "balance_loss_clip": 1.02538753, + "balance_loss_mlp": 1.04287875, + "epoch": 0.37306478280474975, + "flos": 24316372241280.0, + "grad_norm": 1.5818895271767701, + "language_loss": 0.75028086, + "learning_rate": 2.8872906998020577e-06, + "loss": 0.77190769, + "num_input_tokens_seen": 133344625, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6205, + "time_per_iteration": 2.515028953552246 + }, + { + "auxiliary_loss_clip": 0.01118084, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02002704, + "balance_loss_mlp": 1.04183412, + "epoch": 0.3731249060574177, + "flos": 15815167741440.0, + "grad_norm": 1.8699710225203796, + "language_loss": 0.78044879, + "learning_rate": 2.886941646474128e-06, + "loss": 0.80197906, + "num_input_tokens_seen": 133363605, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.76171875, + "step": 6206, + "time_per_iteration": 2.433136224746704 + }, + { + "auxiliary_loss_clip": 0.01119546, + "auxiliary_loss_mlp": 0.01032561, + "balance_loss_clip": 1.01752925, + "balance_loss_mlp": 1.04182768, + "epoch": 0.3731850293100857, + "flos": 19828077229440.0, + "grad_norm": 2.1358392378140487, + "language_loss": 0.93595111, + "learning_rate": 2.886592559513283e-06, + "loss": 0.95747221, + "num_input_tokens_seen": 133379405, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6207, + "time_per_iteration": 2.422592878341675 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01774943, + "balance_loss_mlp": 1.04154027, + "epoch": 0.37324515256275365, + "flos": 19062354072960.0, + "grad_norm": 2.238385364236049, + "language_loss": 0.82666922, + "learning_rate": 2.886243438932759e-06, + "loss": 0.84819084, + "num_input_tokens_seen": 133397585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.79296875, + "step": 6208, + "time_per_iteration": 2.5171287059783936 + }, + { + "auxiliary_loss_clip": 0.01122491, + "auxiliary_loss_mlp": 0.0103487, + "balance_loss_clip": 1.01911068, + "balance_loss_mlp": 1.04320371, + "epoch": 0.3733052758154216, + "flos": 20704333512960.0, + "grad_norm": 1.7601988102738153, + "language_loss": 0.73197794, + "learning_rate": 2.8858942847457953e-06, + "loss": 0.75355148, + "num_input_tokens_seen": 133415365, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6209, + "time_per_iteration": 2.480943202972412 + }, + { + "auxiliary_loss_clip": 0.01120081, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02178252, + "balance_loss_mlp": 1.0430553, + "epoch": 0.3733653990680896, + "flos": 20193504243840.0, + "grad_norm": 1.4781766070975684, + "language_loss": 0.69951272, + "learning_rate": 2.8855450969656305e-06, + "loss": 0.72108591, + "num_input_tokens_seen": 133435700, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6210, + "time_per_iteration": 2.5063016414642334 + }, + { + "auxiliary_loss_clip": 0.01121548, + "auxiliary_loss_mlp": 0.01030573, + "balance_loss_clip": 1.01533842, + "balance_loss_mlp": 1.04171228, + "epoch": 0.37342552232075754, + "flos": 20339660684160.0, + "grad_norm": 1.960293983782413, + "language_loss": 0.77729124, + "learning_rate": 2.8851958756055073e-06, + "loss": 0.79881245, + "num_input_tokens_seen": 133455180, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6211, + "time_per_iteration": 2.4845266342163086 + }, + { + "auxiliary_loss_clip": 0.01121905, + "auxiliary_loss_mlp": 0.01038644, + "balance_loss_clip": 1.0240593, + "balance_loss_mlp": 1.04219186, + "epoch": 0.3734856455734255, + "flos": 35517879527040.0, + "grad_norm": 1.9911666037414828, + "language_loss": 0.73026669, + "learning_rate": 2.884846620678668e-06, + "loss": 0.75187218, + "num_input_tokens_seen": 133476715, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.796875, + "step": 6212, + "time_per_iteration": 2.615323066711426 + }, + { + "auxiliary_loss_clip": 0.01130473, + "auxiliary_loss_mlp": 0.01047817, + "balance_loss_clip": 1.03231955, + "balance_loss_mlp": 1.04560018, + "epoch": 0.37354576882609347, + "flos": 21142300043520.0, + "grad_norm": 4.00760557025762, + "language_loss": 0.81895888, + "learning_rate": 2.884497332198356e-06, + "loss": 0.84074175, + "num_input_tokens_seen": 133494550, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.84765625, + "step": 6213, + "time_per_iteration": 2.4621500968933105 + }, + { + "auxiliary_loss_clip": 0.01119566, + "auxiliary_loss_mlp": 0.01039017, + "balance_loss_clip": 1.02433026, + "balance_loss_mlp": 1.04143643, + "epoch": 0.37360589207876144, + "flos": 21506793304320.0, + "grad_norm": 2.2631910468903014, + "language_loss": 0.7890203, + "learning_rate": 2.8841480101778167e-06, + "loss": 0.81060612, + "num_input_tokens_seen": 133512640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6214, + "time_per_iteration": 2.5582997798919678 + }, + { + "auxiliary_loss_clip": 0.01117625, + "auxiliary_loss_mlp": 0.01043035, + "balance_loss_clip": 1.02859902, + "balance_loss_mlp": 1.04069364, + "epoch": 0.37366601533142946, + "flos": 38435800861440.0, + "grad_norm": 1.7789401165216012, + "language_loss": 0.84881294, + "learning_rate": 2.883798654630296e-06, + "loss": 0.87041962, + "num_input_tokens_seen": 133535540, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6215, + "time_per_iteration": 2.6216535568237305 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.01041572, + "balance_loss_clip": 1.02595592, + "balance_loss_mlp": 1.04088581, + "epoch": 0.3737261385840974, + "flos": 18441171244800.0, + "grad_norm": 5.614431195109344, + "language_loss": 0.67669535, + "learning_rate": 2.8834492655690423e-06, + "loss": 0.69832802, + "num_input_tokens_seen": 133555795, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80859375, + "step": 6216, + "time_per_iteration": 2.4592814445495605 + }, + { + "auxiliary_loss_clip": 0.01121492, + "auxiliary_loss_mlp": 0.01040499, + "balance_loss_clip": 1.02500176, + "balance_loss_mlp": 1.04252148, + "epoch": 0.3737862618367654, + "flos": 22929861306240.0, + "grad_norm": 2.041107256757408, + "language_loss": 0.65695626, + "learning_rate": 2.883099843007303e-06, + "loss": 0.67857617, + "num_input_tokens_seen": 133575905, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6217, + "time_per_iteration": 2.50801420211792 + }, + { + "auxiliary_loss_clip": 0.01122909, + "auxiliary_loss_mlp": 0.01039721, + "balance_loss_clip": 1.02378845, + "balance_loss_mlp": 1.04290843, + "epoch": 0.37384638508943335, + "flos": 15409664127360.0, + "grad_norm": 3.2488334570714725, + "language_loss": 0.80776107, + "learning_rate": 2.88275038695833e-06, + "loss": 0.82938731, + "num_input_tokens_seen": 133592585, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80078125, + "step": 6218, + "time_per_iteration": 2.469524383544922 + }, + { + "auxiliary_loss_clip": 0.01117083, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.01851249, + "balance_loss_mlp": 1.04241216, + "epoch": 0.3739065083421013, + "flos": 24280820755200.0, + "grad_norm": 1.3682227753048604, + "language_loss": 0.78710622, + "learning_rate": 2.8824008974353736e-06, + "loss": 0.80860579, + "num_input_tokens_seen": 133615070, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.74609375, + "step": 6219, + "time_per_iteration": 2.595862627029419 + }, + { + "auxiliary_loss_clip": 0.01119648, + "auxiliary_loss_mlp": 0.0104261, + "balance_loss_clip": 1.02776265, + "balance_loss_mlp": 1.0430454, + "epoch": 0.3739666315947693, + "flos": 23002831785600.0, + "grad_norm": 2.1916352692915217, + "language_loss": 0.76985866, + "learning_rate": 2.8820513744516866e-06, + "loss": 0.79148126, + "num_input_tokens_seen": 133633490, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6220, + "time_per_iteration": 6.68864631652832 + }, + { + "auxiliary_loss_clip": 0.01120187, + "auxiliary_loss_mlp": 0.01041991, + "balance_loss_clip": 1.02635062, + "balance_loss_mlp": 1.04149485, + "epoch": 0.37402675484743725, + "flos": 19391116279680.0, + "grad_norm": 1.921342744454882, + "language_loss": 0.82958305, + "learning_rate": 2.8817018180205235e-06, + "loss": 0.85120487, + "num_input_tokens_seen": 133653425, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6221, + "time_per_iteration": 3.9474618434906006 + }, + { + "auxiliary_loss_clip": 0.0111979, + "auxiliary_loss_mlp": 0.01042782, + "balance_loss_clip": 1.02852452, + "balance_loss_mlp": 1.04195023, + "epoch": 0.3740868781001052, + "flos": 17126158331520.0, + "grad_norm": 1.6461952088047174, + "language_loss": 0.75817096, + "learning_rate": 2.8813522281551387e-06, + "loss": 0.7797966, + "num_input_tokens_seen": 133670220, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6222, + "time_per_iteration": 2.43192720413208 + }, + { + "auxiliary_loss_clip": 0.01121141, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.0191592, + "balance_loss_mlp": 1.04333961, + "epoch": 0.3741470013527732, + "flos": 20043505048320.0, + "grad_norm": 1.6728060456550218, + "language_loss": 0.70215583, + "learning_rate": 2.881002604868789e-06, + "loss": 0.72370636, + "num_input_tokens_seen": 133688910, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.77734375, + "step": 6223, + "time_per_iteration": 2.4719529151916504 + }, + { + "auxiliary_loss_clip": 0.01123096, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.04556298, + "epoch": 0.37420712460544114, + "flos": 36897279569280.0, + "grad_norm": 2.209456781749309, + "language_loss": 0.69100869, + "learning_rate": 2.8806529481747325e-06, + "loss": 0.71258163, + "num_input_tokens_seen": 133708690, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.77734375, + "step": 6224, + "time_per_iteration": 2.6382336616516113 + }, + { + "auxiliary_loss_clip": 0.01120784, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.01942348, + "balance_loss_mlp": 1.04488885, + "epoch": 0.3742672478581091, + "flos": 22201198007040.0, + "grad_norm": 1.8205395187863704, + "language_loss": 0.69828689, + "learning_rate": 2.880303258086228e-06, + "loss": 0.71983123, + "num_input_tokens_seen": 133728095, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6225, + "time_per_iteration": 2.501041889190674 + }, + { + "auxiliary_loss_clip": 0.01118888, + "auxiliary_loss_mlp": 0.01038877, + "balance_loss_clip": 1.02376127, + "balance_loss_mlp": 1.04357982, + "epoch": 0.3743273711107771, + "flos": 24681547860480.0, + "grad_norm": 2.305559014636685, + "language_loss": 0.79056358, + "learning_rate": 2.879953534616536e-06, + "loss": 0.81214118, + "num_input_tokens_seen": 133745590, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 6226, + "time_per_iteration": 2.485196113586426 + }, + { + "auxiliary_loss_clip": 0.01121484, + "auxiliary_loss_mlp": 0.01040329, + "balance_loss_clip": 1.02517128, + "balance_loss_mlp": 1.04342556, + "epoch": 0.37438749436344504, + "flos": 24459619680000.0, + "grad_norm": 2.1155280603994546, + "language_loss": 0.68059194, + "learning_rate": 2.879603777778917e-06, + "loss": 0.70221007, + "num_input_tokens_seen": 133766155, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6227, + "time_per_iteration": 2.553396463394165 + }, + { + "auxiliary_loss_clip": 0.01119717, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02044404, + "balance_loss_mlp": 1.04391932, + "epoch": 0.374447617616113, + "flos": 21798747048960.0, + "grad_norm": 1.719573737271176, + "language_loss": 0.82955533, + "learning_rate": 2.879253987586635e-06, + "loss": 0.85109973, + "num_input_tokens_seen": 133783185, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6228, + "time_per_iteration": 2.449979305267334 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01038988, + "balance_loss_clip": 1.0244565, + "balance_loss_mlp": 1.0452075, + "epoch": 0.374507740868781, + "flos": 17968191932160.0, + "grad_norm": 1.610770216359874, + "language_loss": 0.74802738, + "learning_rate": 2.8789041640529535e-06, + "loss": 0.76962447, + "num_input_tokens_seen": 133800975, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6229, + "time_per_iteration": 2.4768621921539307 + }, + { + "auxiliary_loss_clip": 0.01121137, + "auxiliary_loss_mlp": 0.01039119, + "balance_loss_clip": 1.02384853, + "balance_loss_mlp": 1.04209936, + "epoch": 0.374567864121449, + "flos": 16105828596480.0, + "grad_norm": 1.8233250091751425, + "language_loss": 0.83350682, + "learning_rate": 2.8785543071911383e-06, + "loss": 0.85510933, + "num_input_tokens_seen": 133818020, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6230, + "time_per_iteration": 2.4503889083862305 + }, + { + "auxiliary_loss_clip": 0.01125186, + "auxiliary_loss_mlp": 0.01039118, + "balance_loss_clip": 1.02383518, + "balance_loss_mlp": 1.04665947, + "epoch": 0.37462798737411696, + "flos": 25773160135680.0, + "grad_norm": 1.8327028169227884, + "language_loss": 0.73589134, + "learning_rate": 2.878204417014456e-06, + "loss": 0.75753438, + "num_input_tokens_seen": 133840690, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6231, + "time_per_iteration": 2.5793888568878174 + }, + { + "auxiliary_loss_clip": 0.01126351, + "auxiliary_loss_mlp": 0.01042616, + "balance_loss_clip": 1.02754807, + "balance_loss_mlp": 1.04669595, + "epoch": 0.3746881106267849, + "flos": 16654507822080.0, + "grad_norm": 2.0748427868287536, + "language_loss": 0.72982037, + "learning_rate": 2.8778544935361735e-06, + "loss": 0.75151008, + "num_input_tokens_seen": 133858350, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6232, + "time_per_iteration": 2.5400028228759766 + }, + { + "auxiliary_loss_clip": 0.01120736, + "auxiliary_loss_mlp": 0.01034639, + "balance_loss_clip": 1.01927304, + "balance_loss_mlp": 1.04244757, + "epoch": 0.3747482338794529, + "flos": 26177981391360.0, + "grad_norm": 1.7557793199484253, + "language_loss": 0.77042818, + "learning_rate": 2.877504536769561e-06, + "loss": 0.791982, + "num_input_tokens_seen": 133879775, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6233, + "time_per_iteration": 2.6110641956329346 + }, + { + "auxiliary_loss_clip": 0.01124346, + "auxiliary_loss_mlp": 0.01039458, + "balance_loss_clip": 1.02521205, + "balance_loss_mlp": 1.04520559, + "epoch": 0.37480835713212085, + "flos": 12021061950720.0, + "grad_norm": 1.733253645903673, + "language_loss": 0.68936831, + "learning_rate": 2.8771545467278883e-06, + "loss": 0.71100628, + "num_input_tokens_seen": 133898295, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6234, + "time_per_iteration": 2.4476797580718994 + }, + { + "auxiliary_loss_clip": 0.01121608, + "auxiliary_loss_mlp": 0.01040174, + "balance_loss_clip": 1.02685833, + "balance_loss_mlp": 1.04514599, + "epoch": 0.3748684803847888, + "flos": 19679263182720.0, + "grad_norm": 1.8436539021155727, + "language_loss": 0.82329285, + "learning_rate": 2.8768045234244276e-06, + "loss": 0.84491062, + "num_input_tokens_seen": 133915230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 6235, + "time_per_iteration": 2.4766016006469727 + }, + { + "auxiliary_loss_clip": 0.01127866, + "auxiliary_loss_mlp": 0.01032441, + "balance_loss_clip": 1.01823175, + "balance_loss_mlp": 1.04744995, + "epoch": 0.3749286036374568, + "flos": 20521189042560.0, + "grad_norm": 1.8082481713782126, + "language_loss": 0.77776909, + "learning_rate": 2.8764544668724517e-06, + "loss": 0.79937214, + "num_input_tokens_seen": 133934110, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.8046875, + "step": 6236, + "time_per_iteration": 2.440678596496582 + }, + { + "auxiliary_loss_clip": 0.01124108, + "auxiliary_loss_mlp": 0.0104869, + "balance_loss_clip": 1.03139293, + "balance_loss_mlp": 1.04308259, + "epoch": 0.37498872689012475, + "flos": 20704620821760.0, + "grad_norm": 2.0063576687211704, + "language_loss": 0.73203218, + "learning_rate": 2.876104377085234e-06, + "loss": 0.7537601, + "num_input_tokens_seen": 133952395, + "router_z_loss_clip": 0.17285156, + "router_z_loss_mlp": 0.80859375, + "step": 6237, + "time_per_iteration": 2.5782086849212646 + }, + { + "auxiliary_loss_clip": 0.01120953, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02257562, + "balance_loss_mlp": 1.04084682, + "epoch": 0.3750488501427927, + "flos": 21574843620480.0, + "grad_norm": 2.2861902523152935, + "language_loss": 0.93017888, + "learning_rate": 2.8757542540760508e-06, + "loss": 0.9517675, + "num_input_tokens_seen": 133969635, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6238, + "time_per_iteration": 2.514997720718384 + }, + { + "auxiliary_loss_clip": 0.01121834, + "auxiliary_loss_mlp": 0.01033577, + "balance_loss_clip": 1.01821709, + "balance_loss_mlp": 1.04316592, + "epoch": 0.3751089733954607, + "flos": 15923869274880.0, + "grad_norm": 1.9811721217026943, + "language_loss": 0.71066076, + "learning_rate": 2.8754040978581777e-06, + "loss": 0.73221493, + "num_input_tokens_seen": 133987215, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6239, + "time_per_iteration": 2.5054962635040283 + }, + { + "auxiliary_loss_clip": 0.01127026, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01659262, + "balance_loss_mlp": 1.04635918, + "epoch": 0.37516909664812864, + "flos": 36284644177920.0, + "grad_norm": 1.6550300124553972, + "language_loss": 0.6566934, + "learning_rate": 2.875053908444895e-06, + "loss": 0.67827761, + "num_input_tokens_seen": 134009250, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6240, + "time_per_iteration": 2.5776519775390625 + }, + { + "auxiliary_loss_clip": 0.01124905, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01703799, + "balance_loss_mlp": 1.04560649, + "epoch": 0.3752292199007966, + "flos": 13515915283200.0, + "grad_norm": 2.0148493018475877, + "language_loss": 0.75634778, + "learning_rate": 2.8747036858494795e-06, + "loss": 0.77791047, + "num_input_tokens_seen": 134026875, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 6241, + "time_per_iteration": 2.503861904144287 + }, + { + "auxiliary_loss_clip": 0.01123464, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02436805, + "balance_loss_mlp": 1.04321361, + "epoch": 0.3752893431534646, + "flos": 27198095644800.0, + "grad_norm": 2.5579725641576876, + "language_loss": 0.83610159, + "learning_rate": 2.874353430085213e-06, + "loss": 0.85773861, + "num_input_tokens_seen": 134047185, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.80078125, + "step": 6242, + "time_per_iteration": 2.4933042526245117 + }, + { + "auxiliary_loss_clip": 0.01122935, + "auxiliary_loss_mlp": 0.01038353, + "balance_loss_clip": 1.02435803, + "balance_loss_mlp": 1.04265308, + "epoch": 0.3753494664061326, + "flos": 30007674581760.0, + "grad_norm": 2.190530656574709, + "language_loss": 0.67888391, + "learning_rate": 2.8740031411653766e-06, + "loss": 0.70049673, + "num_input_tokens_seen": 134067330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6243, + "time_per_iteration": 2.543820381164551 + }, + { + "auxiliary_loss_clip": 0.01121963, + "auxiliary_loss_mlp": 0.01038078, + "balance_loss_clip": 1.02241397, + "balance_loss_mlp": 1.04404676, + "epoch": 0.37540958965880056, + "flos": 24461954064000.0, + "grad_norm": 1.7974063962239055, + "language_loss": 0.84275806, + "learning_rate": 2.8736528191032535e-06, + "loss": 0.86435848, + "num_input_tokens_seen": 134085525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6244, + "time_per_iteration": 2.4710450172424316 + }, + { + "auxiliary_loss_clip": 0.01119065, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02229667, + "balance_loss_mlp": 1.0436101, + "epoch": 0.3754697129114685, + "flos": 16508387295360.0, + "grad_norm": 2.387588700969948, + "language_loss": 0.83019805, + "learning_rate": 2.8733024639121277e-06, + "loss": 0.85175467, + "num_input_tokens_seen": 134101855, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6245, + "time_per_iteration": 2.4594197273254395 + }, + { + "auxiliary_loss_clip": 0.01122149, + "auxiliary_loss_mlp": 0.01037692, + "balance_loss_clip": 1.02207565, + "balance_loss_mlp": 1.04337263, + "epoch": 0.3755298361641365, + "flos": 19390900798080.0, + "grad_norm": 1.94802763897559, + "language_loss": 0.64043313, + "learning_rate": 2.8729520756052853e-06, + "loss": 0.66203153, + "num_input_tokens_seen": 134119360, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7890625, + "step": 6246, + "time_per_iteration": 2.4522809982299805 + }, + { + "auxiliary_loss_clip": 0.01125162, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0231837, + "balance_loss_mlp": 1.04382014, + "epoch": 0.37558995941680445, + "flos": 14720395069440.0, + "grad_norm": 1.7195896287931138, + "language_loss": 0.75146973, + "learning_rate": 2.8726016541960124e-06, + "loss": 0.77310807, + "num_input_tokens_seen": 134137475, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6247, + "time_per_iteration": 2.4527103900909424 + }, + { + "auxiliary_loss_clip": 0.01122539, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02281308, + "balance_loss_mlp": 1.04276609, + "epoch": 0.3756500826694724, + "flos": 21689901861120.0, + "grad_norm": 3.472354315090956, + "language_loss": 0.55157161, + "learning_rate": 2.872251199697598e-06, + "loss": 0.5731746, + "num_input_tokens_seen": 134154580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.796875, + "step": 6248, + "time_per_iteration": 2.4399521350860596 + }, + { + "auxiliary_loss_clip": 0.01119734, + "auxiliary_loss_mlp": 0.0103806, + "balance_loss_clip": 1.02334976, + "balance_loss_mlp": 1.04241502, + "epoch": 0.3757102059221404, + "flos": 26505666190080.0, + "grad_norm": 2.875026035710993, + "language_loss": 0.84247208, + "learning_rate": 2.8719007121233297e-06, + "loss": 0.86404997, + "num_input_tokens_seen": 134174285, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6249, + "time_per_iteration": 2.529763698577881 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01033042, + "balance_loss_clip": 1.018713, + "balance_loss_mlp": 1.0427655, + "epoch": 0.37577032917480835, + "flos": 37338083274240.0, + "grad_norm": 1.7253468577749267, + "language_loss": 0.68124413, + "learning_rate": 2.8715501914864993e-06, + "loss": 0.70278323, + "num_input_tokens_seen": 134195940, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6250, + "time_per_iteration": 2.572439193725586 + }, + { + "auxiliary_loss_clip": 0.01124257, + "auxiliary_loss_mlp": 0.01042227, + "balance_loss_clip": 1.02791047, + "balance_loss_mlp": 1.04538727, + "epoch": 0.3758304524274763, + "flos": 21908597817600.0, + "grad_norm": 2.0419035804756716, + "language_loss": 0.77633286, + "learning_rate": 2.8711996378003987e-06, + "loss": 0.79799771, + "num_input_tokens_seen": 134212235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6251, + "time_per_iteration": 2.58437442779541 + }, + { + "auxiliary_loss_clip": 0.01120391, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.01910138, + "balance_loss_mlp": 1.04232824, + "epoch": 0.3758905756801443, + "flos": 36569343375360.0, + "grad_norm": 2.137051103462404, + "language_loss": 0.58463252, + "learning_rate": 2.8708490510783203e-06, + "loss": 0.60616934, + "num_input_tokens_seen": 134233810, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6252, + "time_per_iteration": 2.6117262840270996 + }, + { + "auxiliary_loss_clip": 0.01124494, + "auxiliary_loss_mlp": 0.01043009, + "balance_loss_clip": 1.02730918, + "balance_loss_mlp": 1.04393482, + "epoch": 0.37595069893281224, + "flos": 24528783317760.0, + "grad_norm": 2.9959533965383836, + "language_loss": 0.89689183, + "learning_rate": 2.8704984313335584e-06, + "loss": 0.91856694, + "num_input_tokens_seen": 134252020, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8046875, + "step": 6253, + "time_per_iteration": 2.5241925716400146 + }, + { + "auxiliary_loss_clip": 0.01123311, + "auxiliary_loss_mlp": 0.01036763, + "balance_loss_clip": 1.0227623, + "balance_loss_mlp": 1.04618073, + "epoch": 0.3760108221854802, + "flos": 16435021766400.0, + "grad_norm": 1.9568868773694639, + "language_loss": 0.76368916, + "learning_rate": 2.8701477785794097e-06, + "loss": 0.78528988, + "num_input_tokens_seen": 134269495, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6254, + "time_per_iteration": 2.44631028175354 + }, + { + "auxiliary_loss_clip": 0.011269, + "auxiliary_loss_mlp": 0.01044619, + "balance_loss_clip": 1.02906847, + "balance_loss_mlp": 1.04640615, + "epoch": 0.37607094543814823, + "flos": 13771742924160.0, + "grad_norm": 2.019237604940679, + "language_loss": 0.61830014, + "learning_rate": 2.869797092829169e-06, + "loss": 0.6400153, + "num_input_tokens_seen": 134287035, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8046875, + "step": 6255, + "time_per_iteration": 2.474303960800171 + }, + { + "auxiliary_loss_clip": 0.01125813, + "auxiliary_loss_mlp": 0.01037924, + "balance_loss_clip": 1.02204537, + "balance_loss_mlp": 1.0434109, + "epoch": 0.3761310686908162, + "flos": 19857918453120.0, + "grad_norm": 2.4357923747979675, + "language_loss": 0.74234015, + "learning_rate": 2.869446374096135e-06, + "loss": 0.76397753, + "num_input_tokens_seen": 134304840, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.82421875, + "step": 6256, + "time_per_iteration": 2.4332830905914307 + }, + { + "auxiliary_loss_clip": 0.01129168, + "auxiliary_loss_mlp": 0.01045861, + "balance_loss_clip": 1.03029239, + "balance_loss_mlp": 1.04842019, + "epoch": 0.37619119194348416, + "flos": 12750802657920.0, + "grad_norm": 1.807318668329893, + "language_loss": 0.70297635, + "learning_rate": 2.8690956223936088e-06, + "loss": 0.72472662, + "num_input_tokens_seen": 134323180, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80859375, + "step": 6257, + "time_per_iteration": 2.600249767303467 + }, + { + "auxiliary_loss_clip": 0.01123849, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.01998889, + "balance_loss_mlp": 1.04582894, + "epoch": 0.3762513151961521, + "flos": 17530548624000.0, + "grad_norm": 1.8628634379537026, + "language_loss": 0.84647095, + "learning_rate": 2.868744837734889e-06, + "loss": 0.86805254, + "num_input_tokens_seen": 134341390, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6258, + "time_per_iteration": 2.443833351135254 + }, + { + "auxiliary_loss_clip": 0.01122949, + "auxiliary_loss_mlp": 0.01043602, + "balance_loss_clip": 1.02936888, + "balance_loss_mlp": 1.04430962, + "epoch": 0.3763114384488201, + "flos": 23617406511360.0, + "grad_norm": 1.514941849696829, + "language_loss": 0.81009686, + "learning_rate": 2.868394020133277e-06, + "loss": 0.83176237, + "num_input_tokens_seen": 134360425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6259, + "time_per_iteration": 2.5727832317352295 + }, + { + "auxiliary_loss_clip": 0.01130377, + "auxiliary_loss_mlp": 0.01042246, + "balance_loss_clip": 1.02660608, + "balance_loss_mlp": 1.04775453, + "epoch": 0.37637156170148806, + "flos": 25406978935680.0, + "grad_norm": 1.8915772167347047, + "language_loss": 0.71919596, + "learning_rate": 2.8680431696020783e-06, + "loss": 0.74092221, + "num_input_tokens_seen": 134379775, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.828125, + "step": 6260, + "time_per_iteration": 2.5225539207458496 + }, + { + "auxiliary_loss_clip": 0.0112693, + "auxiliary_loss_mlp": 0.01036069, + "balance_loss_clip": 1.02061951, + "balance_loss_mlp": 1.04538989, + "epoch": 0.376431684954156, + "flos": 23440906056960.0, + "grad_norm": 1.725193491542272, + "language_loss": 0.78423822, + "learning_rate": 2.867692286154594e-06, + "loss": 0.80586827, + "num_input_tokens_seen": 134400315, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.81640625, + "step": 6261, + "time_per_iteration": 2.4926671981811523 + }, + { + "auxiliary_loss_clip": 0.01132188, + "auxiliary_loss_mlp": 0.01043226, + "balance_loss_clip": 1.02784848, + "balance_loss_mlp": 1.04861188, + "epoch": 0.376491808206824, + "flos": 34204482725760.0, + "grad_norm": 1.7544905551461754, + "language_loss": 0.80327791, + "learning_rate": 2.867341369804132e-06, + "loss": 0.82503211, + "num_input_tokens_seen": 134422875, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8359375, + "step": 6262, + "time_per_iteration": 6.861605167388916 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01032438, + "balance_loss_clip": 1.01796031, + "balance_loss_mlp": 1.04471791, + "epoch": 0.37655193145949195, + "flos": 35185669614720.0, + "grad_norm": 1.7128267856657793, + "language_loss": 0.80543715, + "learning_rate": 2.866990420563998e-06, + "loss": 0.82698023, + "num_input_tokens_seen": 134443025, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6263, + "time_per_iteration": 2.6574654579162598 + }, + { + "auxiliary_loss_clip": 0.01128017, + "auxiliary_loss_mlp": 0.01041966, + "balance_loss_clip": 1.02705324, + "balance_loss_mlp": 1.04757583, + "epoch": 0.3766120547121599, + "flos": 16761844638720.0, + "grad_norm": 2.7435231382382033, + "language_loss": 0.80158919, + "learning_rate": 2.866639438447501e-06, + "loss": 0.82328904, + "num_input_tokens_seen": 134460945, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.8046875, + "step": 6264, + "time_per_iteration": 2.4326720237731934 + }, + { + "auxiliary_loss_clip": 0.01122852, + "auxiliary_loss_mlp": 0.01046441, + "balance_loss_clip": 1.03120613, + "balance_loss_mlp": 1.04323912, + "epoch": 0.3766721779648279, + "flos": 23550361776000.0, + "grad_norm": 2.2579254623504585, + "language_loss": 0.73604524, + "learning_rate": 2.8662884234679497e-06, + "loss": 0.75773823, + "num_input_tokens_seen": 134480440, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6265, + "time_per_iteration": 2.481248617172241 + }, + { + "auxiliary_loss_clip": 0.01126145, + "auxiliary_loss_mlp": 0.01038865, + "balance_loss_clip": 1.02525079, + "balance_loss_mlp": 1.04878664, + "epoch": 0.37673230121749585, + "flos": 29129191655040.0, + "grad_norm": 1.6798839148056366, + "language_loss": 0.68685853, + "learning_rate": 2.865937375638654e-06, + "loss": 0.70850861, + "num_input_tokens_seen": 134501110, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6266, + "time_per_iteration": 2.517972946166992 + }, + { + "auxiliary_loss_clip": 0.01129377, + "auxiliary_loss_mlp": 0.0104268, + "balance_loss_clip": 1.02746832, + "balance_loss_mlp": 1.04570127, + "epoch": 0.3767924244701638, + "flos": 28146783703680.0, + "grad_norm": 21.71943634627446, + "language_loss": 0.6330213, + "learning_rate": 2.8655862949729264e-06, + "loss": 0.65474188, + "num_input_tokens_seen": 134522460, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.8359375, + "step": 6267, + "time_per_iteration": 2.534775733947754 + }, + { + "auxiliary_loss_clip": 0.01049589, + "auxiliary_loss_mlp": 0.01002617, + "balance_loss_clip": 1.00076914, + "balance_loss_mlp": 1.02342653, + "epoch": 0.37685254772283183, + "flos": 60797197526400.0, + "grad_norm": 0.7181832227527338, + "language_loss": 0.58946306, + "learning_rate": 2.8652351814840795e-06, + "loss": 0.60998511, + "num_input_tokens_seen": 134589545, + "router_z_loss_clip": 0.01843262, + "router_z_loss_mlp": 0.26171875, + "step": 6268, + "time_per_iteration": 3.168419361114502 + }, + { + "auxiliary_loss_clip": 0.011283, + "auxiliary_loss_mlp": 0.01038795, + "balance_loss_clip": 1.02268982, + "balance_loss_mlp": 1.04734302, + "epoch": 0.3769126709754998, + "flos": 26032543223040.0, + "grad_norm": 1.4797604992869704, + "language_loss": 0.65026355, + "learning_rate": 2.8648840351854283e-06, + "loss": 0.67193449, + "num_input_tokens_seen": 134610550, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8125, + "step": 6269, + "time_per_iteration": 2.5472333431243896 + }, + { + "auxiliary_loss_clip": 0.01127949, + "auxiliary_loss_mlp": 0.01038619, + "balance_loss_clip": 1.02263296, + "balance_loss_mlp": 1.05022144, + "epoch": 0.37697279422816776, + "flos": 23579879777280.0, + "grad_norm": 1.46875421159053, + "language_loss": 0.70592397, + "learning_rate": 2.8645328560902874e-06, + "loss": 0.72758961, + "num_input_tokens_seen": 134630485, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6270, + "time_per_iteration": 2.4763948917388916 + }, + { + "auxiliary_loss_clip": 0.01045864, + "auxiliary_loss_mlp": 0.0100198, + "balance_loss_clip": 1.00021577, + "balance_loss_mlp": 1.02014744, + "epoch": 0.3770329174808357, + "flos": 64745935367040.0, + "grad_norm": 0.7024360778923162, + "language_loss": 0.56136239, + "learning_rate": 2.8641816442119746e-06, + "loss": 0.58184087, + "num_input_tokens_seen": 134693510, + "router_z_loss_clip": 0.0177002, + "router_z_loss_mlp": 0.2578125, + "step": 6271, + "time_per_iteration": 3.0738816261291504 + }, + { + "auxiliary_loss_clip": 0.01124439, + "auxiliary_loss_mlp": 0.01039364, + "balance_loss_clip": 1.02326441, + "balance_loss_mlp": 1.04638743, + "epoch": 0.3770930407335037, + "flos": 21835304115840.0, + "grad_norm": 2.066611127756055, + "language_loss": 0.79340166, + "learning_rate": 2.8638303995638066e-06, + "loss": 0.81503969, + "num_input_tokens_seen": 134713115, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.78125, + "step": 6272, + "time_per_iteration": 2.4686055183410645 + }, + { + "auxiliary_loss_clip": 0.01122198, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.01802933, + "balance_loss_mlp": 1.04578209, + "epoch": 0.37715316398617166, + "flos": 22747901984640.0, + "grad_norm": 1.4641670728096365, + "language_loss": 0.74172843, + "learning_rate": 2.863479122159103e-06, + "loss": 0.76326972, + "num_input_tokens_seen": 134732635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6273, + "time_per_iteration": 2.5079009532928467 + }, + { + "auxiliary_loss_clip": 0.01124789, + "auxiliary_loss_mlp": 0.01045969, + "balance_loss_clip": 1.03112721, + "balance_loss_mlp": 1.04621577, + "epoch": 0.3772132872388396, + "flos": 18914581520640.0, + "grad_norm": 1.4163029825487425, + "language_loss": 0.71801323, + "learning_rate": 2.8631278120111858e-06, + "loss": 0.73972082, + "num_input_tokens_seen": 134750695, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6274, + "time_per_iteration": 2.460338592529297 + }, + { + "auxiliary_loss_clip": 0.01128245, + "auxiliary_loss_mlp": 0.01036844, + "balance_loss_clip": 1.02277732, + "balance_loss_mlp": 1.04794264, + "epoch": 0.3772734104915076, + "flos": 17346219004800.0, + "grad_norm": 1.663376044288712, + "language_loss": 0.83692443, + "learning_rate": 2.8627764691333742e-06, + "loss": 0.85857534, + "num_input_tokens_seen": 134768935, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6275, + "time_per_iteration": 2.48319149017334 + }, + { + "auxiliary_loss_clip": 0.01121629, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.01949656, + "balance_loss_mlp": 1.04532933, + "epoch": 0.37733353374417555, + "flos": 32342370785280.0, + "grad_norm": 1.4340123311349162, + "language_loss": 0.75342453, + "learning_rate": 2.8624250935389935e-06, + "loss": 0.77496612, + "num_input_tokens_seen": 134791260, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6276, + "time_per_iteration": 2.5773236751556396 + }, + { + "auxiliary_loss_clip": 0.01127758, + "auxiliary_loss_mlp": 0.01042729, + "balance_loss_clip": 1.02724338, + "balance_loss_mlp": 1.04667568, + "epoch": 0.3773936569968435, + "flos": 23360681030400.0, + "grad_norm": 1.858122502551201, + "language_loss": 0.85519129, + "learning_rate": 2.862073685241366e-06, + "loss": 0.87689614, + "num_input_tokens_seen": 134808350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 6277, + "time_per_iteration": 2.5827369689941406 + }, + { + "auxiliary_loss_clip": 0.01123645, + "auxiliary_loss_mlp": 0.01032265, + "balance_loss_clip": 1.01833546, + "balance_loss_mlp": 1.04713118, + "epoch": 0.3774537802495115, + "flos": 21466788531840.0, + "grad_norm": 2.807350675061797, + "language_loss": 0.78055024, + "learning_rate": 2.861722244253818e-06, + "loss": 0.80210936, + "num_input_tokens_seen": 134826005, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6278, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01128448, + "auxiliary_loss_mlp": 0.01044224, + "balance_loss_clip": 1.02795196, + "balance_loss_mlp": 1.04698181, + "epoch": 0.37751390350217945, + "flos": 24973717086720.0, + "grad_norm": 1.933979010172509, + "language_loss": 0.82702643, + "learning_rate": 2.8613707705896767e-06, + "loss": 0.84875309, + "num_input_tokens_seen": 134844995, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6279, + "time_per_iteration": 2.538426160812378 + }, + { + "auxiliary_loss_clip": 0.01125885, + "auxiliary_loss_mlp": 0.01037058, + "balance_loss_clip": 1.02310467, + "balance_loss_mlp": 1.04578614, + "epoch": 0.3775740267548474, + "flos": 27819098904960.0, + "grad_norm": 2.0225623598483358, + "language_loss": 0.74985826, + "learning_rate": 2.861019264262269e-06, + "loss": 0.77148765, + "num_input_tokens_seen": 134865285, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80078125, + "step": 6280, + "time_per_iteration": 2.5161032676696777 + }, + { + "auxiliary_loss_clip": 0.01123339, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02283478, + "balance_loss_mlp": 1.04662085, + "epoch": 0.3776341500075154, + "flos": 22565224391040.0, + "grad_norm": 1.4438938373085308, + "language_loss": 0.76017272, + "learning_rate": 2.8606677252849242e-06, + "loss": 0.78177071, + "num_input_tokens_seen": 134886535, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6281, + "time_per_iteration": 2.504711151123047 + }, + { + "auxiliary_loss_clip": 0.01122332, + "auxiliary_loss_mlp": 0.01035077, + "balance_loss_clip": 1.02049732, + "balance_loss_mlp": 1.04368496, + "epoch": 0.3776942732601834, + "flos": 23077238808960.0, + "grad_norm": 1.7476205657776698, + "language_loss": 0.8391279, + "learning_rate": 2.860316153670974e-06, + "loss": 0.86070192, + "num_input_tokens_seen": 134907435, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78515625, + "step": 6282, + "time_per_iteration": 2.4668593406677246 + }, + { + "auxiliary_loss_clip": 0.01120742, + "auxiliary_loss_mlp": 0.01037931, + "balance_loss_clip": 1.02337587, + "balance_loss_mlp": 1.04434681, + "epoch": 0.37775439651285136, + "flos": 21724411852800.0, + "grad_norm": 1.8037618077250128, + "language_loss": 0.70150751, + "learning_rate": 2.8599645494337484e-06, + "loss": 0.72309422, + "num_input_tokens_seen": 134925360, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6283, + "time_per_iteration": 2.481948137283325 + }, + { + "auxiliary_loss_clip": 0.0112321, + "auxiliary_loss_mlp": 0.01045267, + "balance_loss_clip": 1.02967477, + "balance_loss_mlp": 1.04516089, + "epoch": 0.37781451976551933, + "flos": 23987753688960.0, + "grad_norm": 1.804590454145544, + "language_loss": 0.76529062, + "learning_rate": 2.859612912586581e-06, + "loss": 0.78697532, + "num_input_tokens_seen": 134944205, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.78125, + "step": 6284, + "time_per_iteration": 2.462968349456787 + }, + { + "auxiliary_loss_clip": 0.01130082, + "auxiliary_loss_mlp": 0.01034565, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.0466392, + "epoch": 0.3778746430181873, + "flos": 13727967223680.0, + "grad_norm": 2.0529722445272167, + "language_loss": 0.85851312, + "learning_rate": 2.8592612431428055e-06, + "loss": 0.88015962, + "num_input_tokens_seen": 134960255, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.8359375, + "step": 6285, + "time_per_iteration": 2.4435150623321533 + }, + { + "auxiliary_loss_clip": 0.01125611, + "auxiliary_loss_mlp": 0.0104034, + "balance_loss_clip": 1.0240438, + "balance_loss_mlp": 1.04457164, + "epoch": 0.37793476627085526, + "flos": 19460495399040.0, + "grad_norm": 1.9682053367320125, + "language_loss": 0.83967972, + "learning_rate": 2.858909541115758e-06, + "loss": 0.86133921, + "num_input_tokens_seen": 134978605, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6286, + "time_per_iteration": 2.4270951747894287 + }, + { + "auxiliary_loss_clip": 0.01123272, + "auxiliary_loss_mlp": 0.01041948, + "balance_loss_clip": 1.0268203, + "balance_loss_mlp": 1.04474115, + "epoch": 0.3779948895235232, + "flos": 10707018704640.0, + "grad_norm": 2.20319687907872, + "language_loss": 0.81550682, + "learning_rate": 2.858557806518775e-06, + "loss": 0.83715904, + "num_input_tokens_seen": 134995020, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78515625, + "step": 6287, + "time_per_iteration": 2.4504740238189697 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01040758, + "balance_loss_clip": 1.02559495, + "balance_loss_mlp": 1.04340911, + "epoch": 0.3780550127761912, + "flos": 22310007281280.0, + "grad_norm": 2.428511311582982, + "language_loss": 0.73038173, + "learning_rate": 2.8582060393651927e-06, + "loss": 0.75200516, + "num_input_tokens_seen": 135012620, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 6288, + "time_per_iteration": 2.4988601207733154 + }, + { + "auxiliary_loss_clip": 0.01126071, + "auxiliary_loss_mlp": 0.01036255, + "balance_loss_clip": 1.02103162, + "balance_loss_mlp": 1.04705048, + "epoch": 0.37811513602885916, + "flos": 28950644125440.0, + "grad_norm": 1.726028925404572, + "language_loss": 0.75453335, + "learning_rate": 2.857854239668352e-06, + "loss": 0.7761566, + "num_input_tokens_seen": 135033365, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6289, + "time_per_iteration": 2.5323870182037354 + }, + { + "auxiliary_loss_clip": 0.01121413, + "auxiliary_loss_mlp": 0.01038832, + "balance_loss_clip": 1.02428889, + "balance_loss_mlp": 1.04395676, + "epoch": 0.3781752592815271, + "flos": 23112933949440.0, + "grad_norm": 1.9121243331279245, + "language_loss": 0.7341041, + "learning_rate": 2.857502407441593e-06, + "loss": 0.75570655, + "num_input_tokens_seen": 135052185, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6290, + "time_per_iteration": 2.4703667163848877 + }, + { + "auxiliary_loss_clip": 0.01126076, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02388752, + "balance_loss_mlp": 1.0441103, + "epoch": 0.3782353825341951, + "flos": 19755932762880.0, + "grad_norm": 2.4130424762969502, + "language_loss": 0.79729307, + "learning_rate": 2.8571505426982566e-06, + "loss": 0.81895649, + "num_input_tokens_seen": 135070425, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.8203125, + "step": 6291, + "time_per_iteration": 2.590517520904541 + }, + { + "auxiliary_loss_clip": 0.01124797, + "auxiliary_loss_mlp": 0.0103595, + "balance_loss_clip": 1.02038157, + "balance_loss_mlp": 1.04347014, + "epoch": 0.37829550578686305, + "flos": 22050839675520.0, + "grad_norm": 1.7851511943573266, + "language_loss": 0.76090503, + "learning_rate": 2.8567986454516854e-06, + "loss": 0.78251249, + "num_input_tokens_seen": 135090525, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8125, + "step": 6292, + "time_per_iteration": 2.486375570297241 + }, + { + "auxiliary_loss_clip": 0.0112214, + "auxiliary_loss_mlp": 0.01042986, + "balance_loss_clip": 1.02708387, + "balance_loss_mlp": 1.04380596, + "epoch": 0.378355629039531, + "flos": 16470357770880.0, + "grad_norm": 1.8744506208430416, + "language_loss": 0.69510674, + "learning_rate": 2.856446715715224e-06, + "loss": 0.71675801, + "num_input_tokens_seen": 135109575, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6293, + "time_per_iteration": 2.477025032043457 + }, + { + "auxiliary_loss_clip": 0.01120173, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.02140629, + "balance_loss_mlp": 1.04180205, + "epoch": 0.378415752292199, + "flos": 19974844200960.0, + "grad_norm": 1.812028848861632, + "language_loss": 0.71631789, + "learning_rate": 2.8560947535022173e-06, + "loss": 0.73788714, + "num_input_tokens_seen": 135127000, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6294, + "time_per_iteration": 2.446382522583008 + }, + { + "auxiliary_loss_clip": 0.01128463, + "auxiliary_loss_mlp": 0.01035795, + "balance_loss_clip": 1.02050054, + "balance_loss_mlp": 1.04522586, + "epoch": 0.378475875544867, + "flos": 14647388676480.0, + "grad_norm": 2.0852903309957815, + "language_loss": 0.8254326, + "learning_rate": 2.855742758826011e-06, + "loss": 0.84707516, + "num_input_tokens_seen": 135145285, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.83203125, + "step": 6295, + "time_per_iteration": 2.4684417247772217 + }, + { + "auxiliary_loss_clip": 0.01123253, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.01870751, + "balance_loss_mlp": 1.04352689, + "epoch": 0.37853599879753497, + "flos": 26650996617600.0, + "grad_norm": 1.687128097470698, + "language_loss": 0.71806532, + "learning_rate": 2.8553907316999547e-06, + "loss": 0.73963046, + "num_input_tokens_seen": 135165240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6296, + "time_per_iteration": 2.515676975250244 + }, + { + "auxiliary_loss_clip": 0.01119269, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02523708, + "balance_loss_mlp": 1.04370534, + "epoch": 0.37859612205020293, + "flos": 17311960408320.0, + "grad_norm": 1.741193546240543, + "language_loss": 0.77094543, + "learning_rate": 2.855038672137396e-06, + "loss": 0.79253769, + "num_input_tokens_seen": 135184045, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6297, + "time_per_iteration": 2.4617502689361572 + }, + { + "auxiliary_loss_clip": 0.01123428, + "auxiliary_loss_mlp": 0.01035161, + "balance_loss_clip": 1.02042699, + "balance_loss_mlp": 1.04360187, + "epoch": 0.3786562453028709, + "flos": 18220392299520.0, + "grad_norm": 2.034703790395703, + "language_loss": 0.79179847, + "learning_rate": 2.854686580151684e-06, + "loss": 0.81338429, + "num_input_tokens_seen": 135202365, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 6298, + "time_per_iteration": 2.4516994953155518 + }, + { + "auxiliary_loss_clip": 0.01121762, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02001977, + "balance_loss_mlp": 1.04453242, + "epoch": 0.37871636855553886, + "flos": 21214875473280.0, + "grad_norm": 2.0947541210526466, + "language_loss": 0.84758198, + "learning_rate": 2.8543344557561722e-06, + "loss": 0.86914611, + "num_input_tokens_seen": 135220955, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6299, + "time_per_iteration": 2.4814558029174805 + }, + { + "auxiliary_loss_clip": 0.01123706, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02153504, + "balance_loss_mlp": 1.04462421, + "epoch": 0.3787764918082068, + "flos": 20952727038720.0, + "grad_norm": 2.218392777517032, + "language_loss": 0.7657811, + "learning_rate": 2.8539822989642116e-06, + "loss": 0.78737932, + "num_input_tokens_seen": 135239715, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7890625, + "step": 6300, + "time_per_iteration": 2.4615044593811035 + }, + { + "auxiliary_loss_clip": 0.01127842, + "auxiliary_loss_mlp": 0.01039306, + "balance_loss_clip": 1.02135265, + "balance_loss_mlp": 1.04486537, + "epoch": 0.3788366150608748, + "flos": 17308009912320.0, + "grad_norm": 2.28104869272164, + "language_loss": 0.82490808, + "learning_rate": 2.8536301097891577e-06, + "loss": 0.84657955, + "num_input_tokens_seen": 135257035, + "router_z_loss_clip": 0.1796875, + "router_z_loss_mlp": 0.828125, + "step": 6301, + "time_per_iteration": 2.4864752292633057 + }, + { + "auxiliary_loss_clip": 0.01119304, + "auxiliary_loss_mlp": 0.0104447, + "balance_loss_clip": 1.02967012, + "balance_loss_mlp": 1.04097867, + "epoch": 0.37889673831354276, + "flos": 24311092942080.0, + "grad_norm": 1.8461206090891127, + "language_loss": 0.67669666, + "learning_rate": 2.8532778882443636e-06, + "loss": 0.69833434, + "num_input_tokens_seen": 135275720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6302, + "time_per_iteration": 2.501873016357422 + }, + { + "auxiliary_loss_clip": 0.01122155, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02617788, + "balance_loss_mlp": 1.04561174, + "epoch": 0.3789568615662107, + "flos": 26683603188480.0, + "grad_norm": 1.9271400579859064, + "language_loss": 0.68487787, + "learning_rate": 2.8529256343431867e-06, + "loss": 0.7064997, + "num_input_tokens_seen": 135294140, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6303, + "time_per_iteration": 4.003960371017456 + }, + { + "auxiliary_loss_clip": 0.01119108, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02055335, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3790169848188787, + "flos": 23585194990080.0, + "grad_norm": 1.8915662489351535, + "language_loss": 0.77611423, + "learning_rate": 2.8525733480989846e-06, + "loss": 0.79765135, + "num_input_tokens_seen": 135314845, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.77734375, + "step": 6304, + "time_per_iteration": 5.393261432647705 + }, + { + "auxiliary_loss_clip": 0.01127431, + "auxiliary_loss_mlp": 0.01037711, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.04611588, + "epoch": 0.37907710807154665, + "flos": 18437436230400.0, + "grad_norm": 2.1278904960845724, + "language_loss": 0.80447114, + "learning_rate": 2.8522210295251146e-06, + "loss": 0.82612252, + "num_input_tokens_seen": 135333055, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6305, + "time_per_iteration": 2.471761703491211 + }, + { + "auxiliary_loss_clip": 0.01041012, + "auxiliary_loss_mlp": 0.0101182, + "balance_loss_clip": 1.01011562, + "balance_loss_mlp": 1.01491702, + "epoch": 0.3791372313242146, + "flos": 50107165954560.0, + "grad_norm": 0.9794242329238577, + "language_loss": 0.64524716, + "learning_rate": 2.8518686786349387e-06, + "loss": 0.66577548, + "num_input_tokens_seen": 135387865, + "router_z_loss_clip": 0.01708984, + "router_z_loss_mlp": 0.26171875, + "step": 6306, + "time_per_iteration": 2.9702882766723633 + }, + { + "auxiliary_loss_clip": 0.01126961, + "auxiliary_loss_mlp": 0.01048893, + "balance_loss_clip": 1.03371215, + "balance_loss_mlp": 1.04693508, + "epoch": 0.3791973545768826, + "flos": 24316551809280.0, + "grad_norm": 1.6253037153644523, + "language_loss": 0.73722827, + "learning_rate": 2.851516295441817e-06, + "loss": 0.75898677, + "num_input_tokens_seen": 135409095, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6307, + "time_per_iteration": 2.508127450942993 + }, + { + "auxiliary_loss_clip": 0.01124488, + "auxiliary_loss_mlp": 0.01040535, + "balance_loss_clip": 1.02550268, + "balance_loss_mlp": 1.04390907, + "epoch": 0.3792574778295506, + "flos": 21579907438080.0, + "grad_norm": 1.494726737463818, + "language_loss": 0.78469551, + "learning_rate": 2.851163879959112e-06, + "loss": 0.80634576, + "num_input_tokens_seen": 135429585, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 6308, + "time_per_iteration": 2.453012466430664 + }, + { + "auxiliary_loss_clip": 0.01120475, + "auxiliary_loss_mlp": 0.01040507, + "balance_loss_clip": 1.02552223, + "balance_loss_mlp": 1.04146767, + "epoch": 0.37931760108221857, + "flos": 22272731942400.0, + "grad_norm": 2.8302348181917263, + "language_loss": 0.73083341, + "learning_rate": 2.8508114322001876e-06, + "loss": 0.75244319, + "num_input_tokens_seen": 135446320, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6309, + "time_per_iteration": 2.495020866394043 + }, + { + "auxiliary_loss_clip": 0.01122333, + "auxiliary_loss_mlp": 0.01039635, + "balance_loss_clip": 1.02509165, + "balance_loss_mlp": 1.04503894, + "epoch": 0.37937772433488653, + "flos": 19682998197120.0, + "grad_norm": 1.4661467923449947, + "language_loss": 0.78449893, + "learning_rate": 2.8504589521784083e-06, + "loss": 0.80611867, + "num_input_tokens_seen": 135465720, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6310, + "time_per_iteration": 2.466533899307251 + }, + { + "auxiliary_loss_clip": 0.01121702, + "auxiliary_loss_mlp": 0.0103985, + "balance_loss_clip": 1.02562881, + "balance_loss_mlp": 1.04319441, + "epoch": 0.3794378475875545, + "flos": 19099378016640.0, + "grad_norm": 1.894743489836823, + "language_loss": 0.76103079, + "learning_rate": 2.8501064399071403e-06, + "loss": 0.7826463, + "num_input_tokens_seen": 135485155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6311, + "time_per_iteration": 2.4859142303466797 + }, + { + "auxiliary_loss_clip": 0.0112103, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.01906657, + "balance_loss_mlp": 1.04379332, + "epoch": 0.37949797084022246, + "flos": 20339660684160.0, + "grad_norm": 1.4829862533126659, + "language_loss": 0.71025705, + "learning_rate": 2.8497538953997504e-06, + "loss": 0.73180288, + "num_input_tokens_seen": 135502675, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7734375, + "step": 6312, + "time_per_iteration": 2.4632480144500732 + }, + { + "auxiliary_loss_clip": 0.01041554, + "auxiliary_loss_mlp": 0.01005886, + "balance_loss_clip": 1.00425243, + "balance_loss_mlp": 1.01538157, + "epoch": 0.37955809409289043, + "flos": 63972203477760.0, + "grad_norm": 0.7762054489660294, + "language_loss": 0.56084001, + "learning_rate": 2.849401318669608e-06, + "loss": 0.58131444, + "num_input_tokens_seen": 135562005, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.26171875, + "step": 6313, + "time_per_iteration": 3.0646302700042725 + }, + { + "auxiliary_loss_clip": 0.0112246, + "auxiliary_loss_mlp": 0.01043557, + "balance_loss_clip": 1.02876949, + "balance_loss_mlp": 1.04362202, + "epoch": 0.3796182173455584, + "flos": 31540665179520.0, + "grad_norm": 4.480184070608776, + "language_loss": 0.7158128, + "learning_rate": 2.849048709730083e-06, + "loss": 0.73747301, + "num_input_tokens_seen": 135582600, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 6314, + "time_per_iteration": 2.5263309478759766 + }, + { + "auxiliary_loss_clip": 0.01126357, + "auxiliary_loss_mlp": 0.01038649, + "balance_loss_clip": 1.02331841, + "balance_loss_mlp": 1.04427075, + "epoch": 0.37967834059822636, + "flos": 12130804978560.0, + "grad_norm": 1.7655759267809688, + "language_loss": 0.73132306, + "learning_rate": 2.848696068594545e-06, + "loss": 0.75297308, + "num_input_tokens_seen": 135600280, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8203125, + "step": 6315, + "time_per_iteration": 2.4753336906433105 + }, + { + "auxiliary_loss_clip": 0.0111862, + "auxiliary_loss_mlp": 0.01038847, + "balance_loss_clip": 1.02454782, + "balance_loss_mlp": 1.04206967, + "epoch": 0.3797384638508943, + "flos": 39348578298240.0, + "grad_norm": 2.0286726324195477, + "language_loss": 0.71049547, + "learning_rate": 2.8483433952763677e-06, + "loss": 0.73207021, + "num_input_tokens_seen": 135621560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6316, + "time_per_iteration": 2.636176824569702 + }, + { + "auxiliary_loss_clip": 0.01122, + "auxiliary_loss_mlp": 0.01038731, + "balance_loss_clip": 1.02524233, + "balance_loss_mlp": 1.04524136, + "epoch": 0.3797985871035623, + "flos": 34054016653440.0, + "grad_norm": 1.8086467732489355, + "language_loss": 0.65270519, + "learning_rate": 2.847990689788923e-06, + "loss": 0.67431247, + "num_input_tokens_seen": 135641745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6317, + "time_per_iteration": 2.595952033996582 + }, + { + "auxiliary_loss_clip": 0.01118597, + "auxiliary_loss_mlp": 0.0103544, + "balance_loss_clip": 1.02174878, + "balance_loss_mlp": 1.04161143, + "epoch": 0.37985871035623026, + "flos": 23222174186880.0, + "grad_norm": 2.0501625369641867, + "language_loss": 0.85361171, + "learning_rate": 2.8476379521455877e-06, + "loss": 0.87515211, + "num_input_tokens_seen": 135660650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6318, + "time_per_iteration": 2.4805264472961426 + }, + { + "auxiliary_loss_clip": 0.01124758, + "auxiliary_loss_mlp": 0.01040235, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04483223, + "epoch": 0.3799188336088982, + "flos": 18114958903680.0, + "grad_norm": 2.489676718863087, + "language_loss": 0.76274204, + "learning_rate": 2.8472851823597354e-06, + "loss": 0.784392, + "num_input_tokens_seen": 135679980, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6319, + "time_per_iteration": 2.4780025482177734 + }, + { + "auxiliary_loss_clip": 0.01123743, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02961218, + "balance_loss_mlp": 1.04587555, + "epoch": 0.3799789568615662, + "flos": 21871897096320.0, + "grad_norm": 1.6998661229427972, + "language_loss": 0.63923568, + "learning_rate": 2.846932380444744e-06, + "loss": 0.66091597, + "num_input_tokens_seen": 135699400, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6320, + "time_per_iteration": 2.4700872898101807 + }, + { + "auxiliary_loss_clip": 0.01121041, + "auxiliary_loss_mlp": 0.01038091, + "balance_loss_clip": 1.02375042, + "balance_loss_mlp": 1.04365289, + "epoch": 0.3800390801142342, + "flos": 32962943082240.0, + "grad_norm": 1.883216130529445, + "language_loss": 0.7112022, + "learning_rate": 2.846579546413992e-06, + "loss": 0.73279351, + "num_input_tokens_seen": 135723455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6321, + "time_per_iteration": 2.5686967372894287 + }, + { + "auxiliary_loss_clip": 0.01123308, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02372098, + "balance_loss_mlp": 1.04298186, + "epoch": 0.38009920336690217, + "flos": 26907075653760.0, + "grad_norm": 1.720302384597662, + "language_loss": 0.74730933, + "learning_rate": 2.846226680280859e-06, + "loss": 0.76892447, + "num_input_tokens_seen": 135744335, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6322, + "time_per_iteration": 2.5368685722351074 + }, + { + "auxiliary_loss_clip": 0.01121658, + "auxiliary_loss_mlp": 0.01036997, + "balance_loss_clip": 1.02155948, + "balance_loss_mlp": 1.04405749, + "epoch": 0.38015932661957014, + "flos": 22488913946880.0, + "grad_norm": 2.6715016816856787, + "language_loss": 0.84910119, + "learning_rate": 2.845873782058725e-06, + "loss": 0.87068772, + "num_input_tokens_seen": 135761440, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.77734375, + "step": 6323, + "time_per_iteration": 2.483771562576294 + }, + { + "auxiliary_loss_clip": 0.01123254, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.01983762, + "balance_loss_mlp": 1.04395103, + "epoch": 0.3802194498722381, + "flos": 21980993679360.0, + "grad_norm": 2.3955157937634586, + "language_loss": 0.73466647, + "learning_rate": 2.845520851760973e-06, + "loss": 0.75625694, + "num_input_tokens_seen": 135779955, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.79296875, + "step": 6324, + "time_per_iteration": 2.4709885120391846 + }, + { + "auxiliary_loss_clip": 0.0112564, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02020979, + "balance_loss_mlp": 1.045573, + "epoch": 0.38027957312490607, + "flos": 21324869896320.0, + "grad_norm": 1.6580896914625747, + "language_loss": 0.84147018, + "learning_rate": 2.8451678894009847e-06, + "loss": 0.86308414, + "num_input_tokens_seen": 135799840, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6325, + "time_per_iteration": 2.4846572875976562 + }, + { + "auxiliary_loss_clip": 0.01122273, + "auxiliary_loss_mlp": 0.01032055, + "balance_loss_clip": 1.01833439, + "balance_loss_mlp": 1.04476464, + "epoch": 0.38033969637757403, + "flos": 16691244456960.0, + "grad_norm": 1.7291759572194114, + "language_loss": 0.79642469, + "learning_rate": 2.8448148949921465e-06, + "loss": 0.81796801, + "num_input_tokens_seen": 135817880, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 6326, + "time_per_iteration": 2.4206631183624268 + }, + { + "auxiliary_loss_clip": 0.0111945, + "auxiliary_loss_mlp": 0.01038943, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.04261708, + "epoch": 0.380399819630242, + "flos": 36210847685760.0, + "grad_norm": 1.8040593924859922, + "language_loss": 0.72696453, + "learning_rate": 2.844461868547842e-06, + "loss": 0.74854851, + "num_input_tokens_seen": 135838940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6327, + "time_per_iteration": 2.5964794158935547 + }, + { + "auxiliary_loss_clip": 0.01122464, + "auxiliary_loss_mlp": 0.01037019, + "balance_loss_clip": 1.02165246, + "balance_loss_mlp": 1.04614949, + "epoch": 0.38045994288290996, + "flos": 21288851533440.0, + "grad_norm": 1.6287717027141382, + "language_loss": 0.83090091, + "learning_rate": 2.844108810081459e-06, + "loss": 0.85249579, + "num_input_tokens_seen": 135858325, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6328, + "time_per_iteration": 2.4602181911468506 + }, + { + "auxiliary_loss_clip": 0.01120102, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.01746464, + "balance_loss_mlp": 1.04347932, + "epoch": 0.38052006613557793, + "flos": 20922885815040.0, + "grad_norm": 1.31755328246291, + "language_loss": 0.61384171, + "learning_rate": 2.843755719606385e-06, + "loss": 0.63536435, + "num_input_tokens_seen": 135878430, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6329, + "time_per_iteration": 2.5268959999084473 + }, + { + "auxiliary_loss_clip": 0.01124125, + "auxiliary_loss_mlp": 0.01041725, + "balance_loss_clip": 1.02731824, + "balance_loss_mlp": 1.04603863, + "epoch": 0.3805801893882459, + "flos": 20990720649600.0, + "grad_norm": 1.7232754549878644, + "language_loss": 0.5586049, + "learning_rate": 2.8434025971360104e-06, + "loss": 0.58026338, + "num_input_tokens_seen": 135894755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6330, + "time_per_iteration": 2.450221061706543 + }, + { + "auxiliary_loss_clip": 0.01119473, + "auxiliary_loss_mlp": 0.01035672, + "balance_loss_clip": 1.02255917, + "balance_loss_mlp": 1.04540074, + "epoch": 0.38064031264091386, + "flos": 25558594243200.0, + "grad_norm": 1.7778053530951745, + "language_loss": 0.65694439, + "learning_rate": 2.8430494426837243e-06, + "loss": 0.67849582, + "num_input_tokens_seen": 135918275, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6331, + "time_per_iteration": 2.544187545776367 + }, + { + "auxiliary_loss_clip": 0.01126283, + "auxiliary_loss_mlp": 0.01041557, + "balance_loss_clip": 1.02635133, + "balance_loss_mlp": 1.04744291, + "epoch": 0.3807004358935818, + "flos": 15085857997440.0, + "grad_norm": 1.725296368277029, + "language_loss": 0.75737906, + "learning_rate": 2.842696256262919e-06, + "loss": 0.77905744, + "num_input_tokens_seen": 135937430, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6332, + "time_per_iteration": 2.443654775619507 + }, + { + "auxiliary_loss_clip": 0.01123212, + "auxiliary_loss_mlp": 0.0104071, + "balance_loss_clip": 1.02546334, + "balance_loss_mlp": 1.04323936, + "epoch": 0.3807605591462498, + "flos": 16399398453120.0, + "grad_norm": 2.2212054448627425, + "language_loss": 0.81889552, + "learning_rate": 2.842343037886987e-06, + "loss": 0.84053469, + "num_input_tokens_seen": 135954210, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6333, + "time_per_iteration": 2.467007637023926 + }, + { + "auxiliary_loss_clip": 0.01121534, + "auxiliary_loss_mlp": 0.01033012, + "balance_loss_clip": 1.0190227, + "balance_loss_mlp": 1.04437923, + "epoch": 0.3808206823989178, + "flos": 29057083102080.0, + "grad_norm": 1.583221243495577, + "language_loss": 0.86192155, + "learning_rate": 2.8419897875693226e-06, + "loss": 0.88346696, + "num_input_tokens_seen": 135974425, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6334, + "time_per_iteration": 2.521341323852539 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01035753, + "balance_loss_clip": 1.02130485, + "balance_loss_mlp": 1.04498506, + "epoch": 0.3808808056515858, + "flos": 15705855676800.0, + "grad_norm": 2.2115670432842847, + "language_loss": 0.79179001, + "learning_rate": 2.841636505323321e-06, + "loss": 0.8133806, + "num_input_tokens_seen": 135991985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6335, + "time_per_iteration": 2.4648449420928955 + }, + { + "auxiliary_loss_clip": 0.01124606, + "auxiliary_loss_mlp": 0.01035281, + "balance_loss_clip": 1.02027273, + "balance_loss_mlp": 1.04485524, + "epoch": 0.38094092890425374, + "flos": 20704584908160.0, + "grad_norm": 1.872233235491229, + "language_loss": 0.72775364, + "learning_rate": 2.8412831911623795e-06, + "loss": 0.74935251, + "num_input_tokens_seen": 136010015, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6336, + "time_per_iteration": 2.443255662918091 + }, + { + "auxiliary_loss_clip": 0.01119223, + "auxiliary_loss_mlp": 0.01031785, + "balance_loss_clip": 1.0180763, + "balance_loss_mlp": 1.0430727, + "epoch": 0.3810010521569217, + "flos": 20667956014080.0, + "grad_norm": 1.9910419737037044, + "language_loss": 0.69146657, + "learning_rate": 2.840929845099894e-06, + "loss": 0.71297657, + "num_input_tokens_seen": 136028440, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6337, + "time_per_iteration": 2.4838876724243164 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02016187, + "balance_loss_mlp": 1.04606009, + "epoch": 0.38106117540958967, + "flos": 31827626933760.0, + "grad_norm": 1.9033617326941272, + "language_loss": 0.63247615, + "learning_rate": 2.8405764671492652e-06, + "loss": 0.65407151, + "num_input_tokens_seen": 136048360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6338, + "time_per_iteration": 2.5538294315338135 + }, + { + "auxiliary_loss_clip": 0.01123732, + "auxiliary_loss_mlp": 0.01040443, + "balance_loss_clip": 1.02492189, + "balance_loss_mlp": 1.04498446, + "epoch": 0.38112129866225763, + "flos": 16902757693440.0, + "grad_norm": 1.8718033662194862, + "language_loss": 0.69288802, + "learning_rate": 2.8402230573238923e-06, + "loss": 0.71452975, + "num_input_tokens_seen": 136065500, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7890625, + "step": 6339, + "time_per_iteration": 2.490813970565796 + }, + { + "auxiliary_loss_clip": 0.0112515, + "auxiliary_loss_mlp": 0.01040007, + "balance_loss_clip": 1.0256902, + "balance_loss_mlp": 1.0461787, + "epoch": 0.3811814219149256, + "flos": 20887226588160.0, + "grad_norm": 2.5980221539464914, + "language_loss": 0.68312418, + "learning_rate": 2.839869615637177e-06, + "loss": 0.70477575, + "num_input_tokens_seen": 136084060, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6340, + "time_per_iteration": 2.4576282501220703 + }, + { + "auxiliary_loss_clip": 0.01124677, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02026618, + "balance_loss_mlp": 1.04393721, + "epoch": 0.38124154516759357, + "flos": 16690813493760.0, + "grad_norm": 2.141170258916756, + "language_loss": 0.89404309, + "learning_rate": 2.839516142102522e-06, + "loss": 0.91565144, + "num_input_tokens_seen": 136102310, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.80859375, + "step": 6341, + "time_per_iteration": 2.4688920974731445 + }, + { + "auxiliary_loss_clip": 0.01126312, + "auxiliary_loss_mlp": 0.01040778, + "balance_loss_clip": 1.02477455, + "balance_loss_mlp": 1.04559851, + "epoch": 0.38130166842026153, + "flos": 19681956702720.0, + "grad_norm": 1.5516456894508346, + "language_loss": 0.74665564, + "learning_rate": 2.83916263673333e-06, + "loss": 0.76832652, + "num_input_tokens_seen": 136120725, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6342, + "time_per_iteration": 2.4610931873321533 + }, + { + "auxiliary_loss_clip": 0.0112203, + "auxiliary_loss_mlp": 0.01034151, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.04325199, + "epoch": 0.3813617916729295, + "flos": 22198432659840.0, + "grad_norm": 1.6121504127073445, + "language_loss": 0.83334327, + "learning_rate": 2.838809099543007e-06, + "loss": 0.85490513, + "num_input_tokens_seen": 136139105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6343, + "time_per_iteration": 2.490952730178833 + }, + { + "auxiliary_loss_clip": 0.0112236, + "auxiliary_loss_mlp": 0.01037814, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.04305577, + "epoch": 0.38142191492559746, + "flos": 19096899978240.0, + "grad_norm": 1.5912858717665679, + "language_loss": 0.76965082, + "learning_rate": 2.838455530544959e-06, + "loss": 0.79125255, + "num_input_tokens_seen": 136158265, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6344, + "time_per_iteration": 2.458669424057007 + }, + { + "auxiliary_loss_clip": 0.01126022, + "auxiliary_loss_mlp": 0.01039382, + "balance_loss_clip": 1.02413464, + "balance_loss_mlp": 1.04601693, + "epoch": 0.3814820381782654, + "flos": 24097748112000.0, + "grad_norm": 2.369132092535199, + "language_loss": 0.72790027, + "learning_rate": 2.838101929752593e-06, + "loss": 0.7495544, + "num_input_tokens_seen": 136176100, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6345, + "time_per_iteration": 5.361874341964722 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01035577, + "balance_loss_clip": 1.02172494, + "balance_loss_mlp": 1.04348969, + "epoch": 0.3815421614309334, + "flos": 15778502933760.0, + "grad_norm": 1.723509048793367, + "language_loss": 0.69687438, + "learning_rate": 2.8377482971793187e-06, + "loss": 0.71844268, + "num_input_tokens_seen": 136195125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6346, + "time_per_iteration": 3.8780832290649414 + }, + { + "auxiliary_loss_clip": 0.0112555, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02351856, + "balance_loss_mlp": 1.04639161, + "epoch": 0.38160228468360136, + "flos": 19899754819200.0, + "grad_norm": 1.8691929226070287, + "language_loss": 0.75860906, + "learning_rate": 2.8373946328385437e-06, + "loss": 0.78024441, + "num_input_tokens_seen": 136213885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6347, + "time_per_iteration": 2.4724838733673096 + }, + { + "auxiliary_loss_clip": 0.01121549, + "auxiliary_loss_mlp": 0.01036633, + "balance_loss_clip": 1.02258432, + "balance_loss_mlp": 1.04272556, + "epoch": 0.3816624079362694, + "flos": 19281050029440.0, + "grad_norm": 1.5494744961647557, + "language_loss": 0.74775678, + "learning_rate": 2.8370409367436813e-06, + "loss": 0.76933861, + "num_input_tokens_seen": 136232700, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6348, + "time_per_iteration": 2.4360201358795166 + }, + { + "auxiliary_loss_clip": 0.01121636, + "auxiliary_loss_mlp": 0.01034098, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.04346061, + "epoch": 0.38172253118893734, + "flos": 21177564220800.0, + "grad_norm": 2.012782025185047, + "language_loss": 0.86987114, + "learning_rate": 2.836687208908142e-06, + "loss": 0.89142847, + "num_input_tokens_seen": 136248975, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6349, + "time_per_iteration": 2.4653983116149902 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01040466, + "balance_loss_clip": 1.02576792, + "balance_loss_mlp": 1.04300261, + "epoch": 0.3817826544416053, + "flos": 17529219820800.0, + "grad_norm": 3.1419886249283624, + "language_loss": 0.76335979, + "learning_rate": 2.836333449345341e-06, + "loss": 0.78497744, + "num_input_tokens_seen": 136266710, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6350, + "time_per_iteration": 2.4111151695251465 + }, + { + "auxiliary_loss_clip": 0.01122319, + "auxiliary_loss_mlp": 0.01032772, + "balance_loss_clip": 1.01693547, + "balance_loss_mlp": 1.04389453, + "epoch": 0.38184277769427327, + "flos": 16326535714560.0, + "grad_norm": 2.0441694615934325, + "language_loss": 0.76182568, + "learning_rate": 2.8359796580686907e-06, + "loss": 0.78337657, + "num_input_tokens_seen": 136284445, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.78125, + "step": 6351, + "time_per_iteration": 2.449831485748291 + }, + { + "auxiliary_loss_clip": 0.0112512, + "auxiliary_loss_mlp": 0.01039723, + "balance_loss_clip": 1.0235939, + "balance_loss_mlp": 1.04464602, + "epoch": 0.38190290094694124, + "flos": 30443450382720.0, + "grad_norm": 1.6974231581634962, + "language_loss": 0.74360836, + "learning_rate": 2.8356258350916085e-06, + "loss": 0.76525676, + "num_input_tokens_seen": 136305730, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.8046875, + "step": 6352, + "time_per_iteration": 2.5342295169830322 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02103508, + "balance_loss_mlp": 1.04153097, + "epoch": 0.3819630241996092, + "flos": 14209924936320.0, + "grad_norm": 1.834359776939538, + "language_loss": 0.64362574, + "learning_rate": 2.8352719804275104e-06, + "loss": 0.66514015, + "num_input_tokens_seen": 136323850, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6353, + "time_per_iteration": 2.434100866317749 + }, + { + "auxiliary_loss_clip": 0.01120333, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02020061, + "balance_loss_mlp": 1.04363215, + "epoch": 0.38202314745227717, + "flos": 25009699536000.0, + "grad_norm": 1.6268216674771125, + "language_loss": 0.83035302, + "learning_rate": 2.834918094089816e-06, + "loss": 0.85189331, + "num_input_tokens_seen": 136344880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6354, + "time_per_iteration": 2.4903476238250732 + }, + { + "auxiliary_loss_clip": 0.0112166, + "auxiliary_loss_mlp": 0.01035018, + "balance_loss_clip": 1.02154744, + "balance_loss_mlp": 1.04571426, + "epoch": 0.38208327070494513, + "flos": 20814507504000.0, + "grad_norm": 1.7360324347242302, + "language_loss": 0.8071996, + "learning_rate": 2.834564176091943e-06, + "loss": 0.82876635, + "num_input_tokens_seen": 136366060, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6355, + "time_per_iteration": 2.5086817741394043 + }, + { + "auxiliary_loss_clip": 0.0112186, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.04464841, + "epoch": 0.3821433939576131, + "flos": 22637727993600.0, + "grad_norm": 1.7080815693685156, + "language_loss": 0.75032043, + "learning_rate": 2.8342102264473125e-06, + "loss": 0.77187097, + "num_input_tokens_seen": 136385625, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6356, + "time_per_iteration": 2.471919298171997 + }, + { + "auxiliary_loss_clip": 0.01121242, + "auxiliary_loss_mlp": 0.0103649, + "balance_loss_clip": 1.02251887, + "balance_loss_mlp": 1.04420352, + "epoch": 0.38220351721028106, + "flos": 26869872142080.0, + "grad_norm": 1.8091380313160346, + "language_loss": 0.81251574, + "learning_rate": 2.833856245169348e-06, + "loss": 0.83409309, + "num_input_tokens_seen": 136405750, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6357, + "time_per_iteration": 2.5302257537841797 + }, + { + "auxiliary_loss_clip": 0.01127375, + "auxiliary_loss_mlp": 0.01040855, + "balance_loss_clip": 1.02465415, + "balance_loss_mlp": 1.04773057, + "epoch": 0.38226364046294903, + "flos": 23367468700800.0, + "grad_norm": 3.08273691075534, + "language_loss": 0.77903318, + "learning_rate": 2.8335022322714695e-06, + "loss": 0.80071545, + "num_input_tokens_seen": 136426085, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.796875, + "step": 6358, + "time_per_iteration": 2.4700090885162354 + }, + { + "auxiliary_loss_clip": 0.01122323, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02576303, + "balance_loss_mlp": 1.0432725, + "epoch": 0.382323763715617, + "flos": 19646225648640.0, + "grad_norm": 2.070211767582473, + "language_loss": 0.78700459, + "learning_rate": 2.8331481877671036e-06, + "loss": 0.80863374, + "num_input_tokens_seen": 136442670, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6359, + "time_per_iteration": 2.4555094242095947 + }, + { + "auxiliary_loss_clip": 0.01118608, + "auxiliary_loss_mlp": 0.01041395, + "balance_loss_clip": 1.02698255, + "balance_loss_mlp": 1.04290545, + "epoch": 0.38238388696828496, + "flos": 54124741232640.0, + "grad_norm": 2.6399902686671113, + "language_loss": 0.69392359, + "learning_rate": 2.8327941116696754e-06, + "loss": 0.7155236, + "num_input_tokens_seen": 136465730, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6360, + "time_per_iteration": 2.736069440841675 + }, + { + "auxiliary_loss_clip": 0.01118797, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.01632452, + "balance_loss_mlp": 1.04197633, + "epoch": 0.382444010220953, + "flos": 24936190352640.0, + "grad_norm": 1.9168722583294633, + "language_loss": 0.78836095, + "learning_rate": 2.83244000399261e-06, + "loss": 0.80986238, + "num_input_tokens_seen": 136487215, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6361, + "time_per_iteration": 2.511254072189331 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.02274048, + "balance_loss_mlp": 1.04114652, + "epoch": 0.38250413347362094, + "flos": 42337351209600.0, + "grad_norm": 1.4566170801765106, + "language_loss": 0.65315771, + "learning_rate": 2.832085864749337e-06, + "loss": 0.67468172, + "num_input_tokens_seen": 136510365, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6362, + "time_per_iteration": 2.632784128189087 + }, + { + "auxiliary_loss_clip": 0.01118848, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.04175615, + "epoch": 0.3825642567262889, + "flos": 16289224462080.0, + "grad_norm": 1.8527291741217293, + "language_loss": 0.82063204, + "learning_rate": 2.8317316939532848e-06, + "loss": 0.84214544, + "num_input_tokens_seen": 136527100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 6363, + "time_per_iteration": 2.4478373527526855 + }, + { + "auxiliary_loss_clip": 0.01119064, + "auxiliary_loss_mlp": 0.01042512, + "balance_loss_clip": 1.02837944, + "balance_loss_mlp": 1.0446111, + "epoch": 0.3826243799789569, + "flos": 45654778586880.0, + "grad_norm": 1.811422380776527, + "language_loss": 0.58428323, + "learning_rate": 2.8313774916178825e-06, + "loss": 0.60589898, + "num_input_tokens_seen": 136550870, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6364, + "time_per_iteration": 2.655128002166748 + }, + { + "auxiliary_loss_clip": 0.01123151, + "auxiliary_loss_mlp": 0.01039269, + "balance_loss_clip": 1.02496374, + "balance_loss_mlp": 1.04423463, + "epoch": 0.38268450323162484, + "flos": 25301581453440.0, + "grad_norm": 2.1451175401130893, + "language_loss": 0.68881112, + "learning_rate": 2.8310232577565635e-06, + "loss": 0.71043533, + "num_input_tokens_seen": 136569895, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6365, + "time_per_iteration": 2.51526141166687 + }, + { + "auxiliary_loss_clip": 0.01121408, + "auxiliary_loss_mlp": 0.01036007, + "balance_loss_clip": 1.02065301, + "balance_loss_mlp": 1.04057527, + "epoch": 0.3827446264842928, + "flos": 21836022387840.0, + "grad_norm": 4.555943608034253, + "language_loss": 0.73442698, + "learning_rate": 2.830668992382758e-06, + "loss": 0.75600111, + "num_input_tokens_seen": 136588585, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.8046875, + "step": 6366, + "time_per_iteration": 2.448585033416748 + }, + { + "auxiliary_loss_clip": 0.01120534, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02026677, + "balance_loss_mlp": 1.04226327, + "epoch": 0.38280474973696077, + "flos": 25734591907200.0, + "grad_norm": 2.0234001922769327, + "language_loss": 0.68829554, + "learning_rate": 2.830314695509902e-06, + "loss": 0.70985115, + "num_input_tokens_seen": 136606640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.78125, + "step": 6367, + "time_per_iteration": 2.569301128387451 + }, + { + "auxiliary_loss_clip": 0.0111708, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.04202485, + "epoch": 0.38286487298962874, + "flos": 24895934184960.0, + "grad_norm": 4.344593393004367, + "language_loss": 0.6481666, + "learning_rate": 2.82996036715143e-06, + "loss": 0.66967463, + "num_input_tokens_seen": 136624940, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 6368, + "time_per_iteration": 2.4531960487365723 + }, + { + "auxiliary_loss_clip": 0.01120319, + "auxiliary_loss_mlp": 0.01034921, + "balance_loss_clip": 1.02053833, + "balance_loss_mlp": 1.04277039, + "epoch": 0.3829249962422967, + "flos": 28543703967360.0, + "grad_norm": 1.315785818077373, + "language_loss": 0.68389189, + "learning_rate": 2.8296060073207763e-06, + "loss": 0.70544434, + "num_input_tokens_seen": 136645540, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7734375, + "step": 6369, + "time_per_iteration": 2.5403318405151367 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01042768, + "balance_loss_clip": 1.02774167, + "balance_loss_mlp": 1.04172897, + "epoch": 0.38298511949496467, + "flos": 21471205904640.0, + "grad_norm": 1.7184057003296296, + "language_loss": 0.78214431, + "learning_rate": 2.8292516160313804e-06, + "loss": 0.80374157, + "num_input_tokens_seen": 136664530, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 6370, + "time_per_iteration": 2.4397096633911133 + }, + { + "auxiliary_loss_clip": 0.01119925, + "auxiliary_loss_mlp": 0.01039652, + "balance_loss_clip": 1.02569818, + "balance_loss_mlp": 1.04368424, + "epoch": 0.38304524274763263, + "flos": 31679998035840.0, + "grad_norm": 2.8055794910549525, + "language_loss": 0.64556968, + "learning_rate": 2.8288971932966805e-06, + "loss": 0.66716546, + "num_input_tokens_seen": 136682315, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6371, + "time_per_iteration": 2.5470147132873535 + }, + { + "auxiliary_loss_clip": 0.01124784, + "auxiliary_loss_mlp": 0.01037674, + "balance_loss_clip": 1.0221653, + "balance_loss_mlp": 1.04452634, + "epoch": 0.3831053660003006, + "flos": 25076816098560.0, + "grad_norm": 1.8238449128176952, + "language_loss": 0.72682339, + "learning_rate": 2.8285427391301155e-06, + "loss": 0.7484479, + "num_input_tokens_seen": 136701185, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.80078125, + "step": 6372, + "time_per_iteration": 2.47695255279541 + }, + { + "auxiliary_loss_clip": 0.01121696, + "auxiliary_loss_mlp": 0.01038223, + "balance_loss_clip": 1.02325058, + "balance_loss_mlp": 1.04308939, + "epoch": 0.38316548925296856, + "flos": 23259018562560.0, + "grad_norm": 1.5970403518130607, + "language_loss": 0.84758627, + "learning_rate": 2.8281882535451266e-06, + "loss": 0.86918551, + "num_input_tokens_seen": 136721265, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6373, + "time_per_iteration": 2.514571189880371 + }, + { + "auxiliary_loss_clip": 0.01124014, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02784181, + "balance_loss_mlp": 1.04392529, + "epoch": 0.3832256125056366, + "flos": 34423465991040.0, + "grad_norm": 4.718004058381721, + "language_loss": 0.74721354, + "learning_rate": 2.8278337365551567e-06, + "loss": 0.76888537, + "num_input_tokens_seen": 136741885, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.80078125, + "step": 6374, + "time_per_iteration": 2.5505032539367676 + }, + { + "auxiliary_loss_clip": 0.01124139, + "auxiliary_loss_mlp": 0.01041765, + "balance_loss_clip": 1.02675653, + "balance_loss_mlp": 1.04414058, + "epoch": 0.38328573575830455, + "flos": 21762764599680.0, + "grad_norm": 2.8586580554057472, + "language_loss": 0.75701195, + "learning_rate": 2.8274791881736485e-06, + "loss": 0.77867097, + "num_input_tokens_seen": 136760905, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.80078125, + "step": 6375, + "time_per_iteration": 2.467555522918701 + }, + { + "auxiliary_loss_clip": 0.01122331, + "auxiliary_loss_mlp": 0.01037666, + "balance_loss_clip": 1.02300918, + "balance_loss_mlp": 1.04375613, + "epoch": 0.3833458590109725, + "flos": 17380010724480.0, + "grad_norm": 2.257221103761015, + "language_loss": 0.72827101, + "learning_rate": 2.8271246084140457e-06, + "loss": 0.7498709, + "num_input_tokens_seen": 136777240, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6376, + "time_per_iteration": 2.4082555770874023 + }, + { + "auxiliary_loss_clip": 0.01118574, + "auxiliary_loss_mlp": 0.01039859, + "balance_loss_clip": 1.02455282, + "balance_loss_mlp": 1.04245007, + "epoch": 0.3834059822636405, + "flos": 29424557191680.0, + "grad_norm": 1.5879949283042905, + "language_loss": 0.67586625, + "learning_rate": 2.826769997289796e-06, + "loss": 0.69745058, + "num_input_tokens_seen": 136801040, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.76171875, + "step": 6377, + "time_per_iteration": 2.54896879196167 + }, + { + "auxiliary_loss_clip": 0.01124961, + "auxiliary_loss_mlp": 0.01039863, + "balance_loss_clip": 1.02448511, + "balance_loss_mlp": 1.04608607, + "epoch": 0.38346610551630844, + "flos": 21470739027840.0, + "grad_norm": 2.1973025079181117, + "language_loss": 0.72991705, + "learning_rate": 2.826415354814344e-06, + "loss": 0.75156534, + "num_input_tokens_seen": 136819495, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6378, + "time_per_iteration": 2.4442975521087646 + }, + { + "auxiliary_loss_clip": 0.01121801, + "auxiliary_loss_mlp": 0.0104221, + "balance_loss_clip": 1.02755964, + "balance_loss_mlp": 1.04327178, + "epoch": 0.3835262287689764, + "flos": 27561224188800.0, + "grad_norm": 1.6808845830991803, + "language_loss": 0.69162869, + "learning_rate": 2.8260606810011396e-06, + "loss": 0.71326876, + "num_input_tokens_seen": 136838840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 6379, + "time_per_iteration": 2.529088258743286 + }, + { + "auxiliary_loss_clip": 0.01121458, + "auxiliary_loss_mlp": 0.01038205, + "balance_loss_clip": 1.02344704, + "balance_loss_mlp": 1.04552865, + "epoch": 0.3835863520216444, + "flos": 15523716787200.0, + "grad_norm": 1.6321901167852362, + "language_loss": 0.82979369, + "learning_rate": 2.8257059758636315e-06, + "loss": 0.85139024, + "num_input_tokens_seen": 136854425, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6380, + "time_per_iteration": 2.4336190223693848 + }, + { + "auxiliary_loss_clip": 0.01120843, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02090406, + "balance_loss_mlp": 1.04595208, + "epoch": 0.38364647527431234, + "flos": 21904934630400.0, + "grad_norm": 1.4297951270127425, + "language_loss": 0.81347466, + "learning_rate": 2.8253512394152697e-06, + "loss": 0.83503115, + "num_input_tokens_seen": 136874355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6381, + "time_per_iteration": 2.5029306411743164 + }, + { + "auxiliary_loss_clip": 0.0104681, + "auxiliary_loss_mlp": 0.01005882, + "balance_loss_clip": 1.00420141, + "balance_loss_mlp": 1.02098966, + "epoch": 0.3837065985269803, + "flos": 65534927558400.0, + "grad_norm": 0.796129115027233, + "language_loss": 0.60459685, + "learning_rate": 2.8249964716695068e-06, + "loss": 0.6251238, + "num_input_tokens_seen": 136937475, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.2578125, + "step": 6382, + "time_per_iteration": 3.0525829792022705 + }, + { + "auxiliary_loss_clip": 0.01123582, + "auxiliary_loss_mlp": 0.01036921, + "balance_loss_clip": 1.02186477, + "balance_loss_mlp": 1.04358447, + "epoch": 0.38376672177964827, + "flos": 28256598558720.0, + "grad_norm": 2.302869327575685, + "language_loss": 0.66052485, + "learning_rate": 2.824641672639794e-06, + "loss": 0.68212986, + "num_input_tokens_seen": 136955805, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6383, + "time_per_iteration": 2.5166289806365967 + }, + { + "auxiliary_loss_clip": 0.01124634, + "auxiliary_loss_mlp": 0.01033937, + "balance_loss_clip": 1.01944149, + "balance_loss_mlp": 1.04657924, + "epoch": 0.38382684503231623, + "flos": 20631363033600.0, + "grad_norm": 2.2385812040155932, + "language_loss": 0.74811673, + "learning_rate": 2.824286842339587e-06, + "loss": 0.76970243, + "num_input_tokens_seen": 136975240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6384, + "time_per_iteration": 2.4451465606689453 + }, + { + "auxiliary_loss_clip": 0.01120418, + "auxiliary_loss_mlp": 0.01036466, + "balance_loss_clip": 1.02219081, + "balance_loss_mlp": 1.04429483, + "epoch": 0.3838869682849842, + "flos": 19605825826560.0, + "grad_norm": 1.4336247312181014, + "language_loss": 0.75883526, + "learning_rate": 2.823931980782341e-06, + "loss": 0.78040409, + "num_input_tokens_seen": 136994985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6385, + "time_per_iteration": 2.4994513988494873 + }, + { + "auxiliary_loss_clip": 0.01046845, + "auxiliary_loss_mlp": 0.0100207, + "balance_loss_clip": 1.0002346, + "balance_loss_mlp": 1.02044809, + "epoch": 0.38394709153765216, + "flos": 56556110891520.0, + "grad_norm": 0.9433326566144719, + "language_loss": 0.67094183, + "learning_rate": 2.82357708798151e-06, + "loss": 0.69143105, + "num_input_tokens_seen": 137046290, + "router_z_loss_clip": 0.01831055, + "router_z_loss_mlp": 0.265625, + "step": 6386, + "time_per_iteration": 2.938122272491455 + }, + { + "auxiliary_loss_clip": 0.0112227, + "auxiliary_loss_mlp": 0.01032989, + "balance_loss_clip": 1.01933384, + "balance_loss_mlp": 1.0465281, + "epoch": 0.3840072147903202, + "flos": 15888748752000.0, + "grad_norm": 1.7796918810721745, + "language_loss": 0.72464442, + "learning_rate": 2.8232221639505547e-06, + "loss": 0.74619704, + "num_input_tokens_seen": 137064725, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6387, + "time_per_iteration": 5.465053081512451 + }, + { + "auxiliary_loss_clip": 0.01120429, + "auxiliary_loss_mlp": 0.01038992, + "balance_loss_clip": 1.02478194, + "balance_loss_mlp": 1.0451014, + "epoch": 0.38406733804298815, + "flos": 28218030330240.0, + "grad_norm": 1.6321565887315352, + "language_loss": 0.81181073, + "learning_rate": 2.822867208702932e-06, + "loss": 0.8334049, + "num_input_tokens_seen": 137086030, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6388, + "time_per_iteration": 3.940337657928467 + }, + { + "auxiliary_loss_clip": 0.01117866, + "auxiliary_loss_mlp": 0.01035288, + "balance_loss_clip": 1.02183485, + "balance_loss_mlp": 1.04249692, + "epoch": 0.3841274612956561, + "flos": 18223588609920.0, + "grad_norm": 1.6383752800672902, + "language_loss": 0.76158738, + "learning_rate": 2.8225122222521026e-06, + "loss": 0.78311884, + "num_input_tokens_seen": 137105400, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6389, + "time_per_iteration": 2.4720914363861084 + }, + { + "auxiliary_loss_clip": 0.01125023, + "auxiliary_loss_mlp": 0.01044293, + "balance_loss_clip": 1.02846217, + "balance_loss_mlp": 1.04541564, + "epoch": 0.3841875845483241, + "flos": 19792884879360.0, + "grad_norm": 1.5616719605863645, + "language_loss": 0.76284117, + "learning_rate": 2.8221572046115273e-06, + "loss": 0.78453434, + "num_input_tokens_seen": 137124985, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.796875, + "step": 6390, + "time_per_iteration": 2.4576520919799805 + }, + { + "auxiliary_loss_clip": 0.01124413, + "auxiliary_loss_mlp": 0.01048913, + "balance_loss_clip": 1.03295112, + "balance_loss_mlp": 1.04433882, + "epoch": 0.38424770780099204, + "flos": 29898829393920.0, + "grad_norm": 1.6285452565530243, + "language_loss": 0.70119178, + "learning_rate": 2.821802155794668e-06, + "loss": 0.72292501, + "num_input_tokens_seen": 137146745, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6391, + "time_per_iteration": 2.5657877922058105 + }, + { + "auxiliary_loss_clip": 0.01121063, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.01978421, + "balance_loss_mlp": 1.04267848, + "epoch": 0.38430783105366, + "flos": 20813717404800.0, + "grad_norm": 1.938766253942268, + "language_loss": 0.84100312, + "learning_rate": 2.8214470758149884e-06, + "loss": 0.86256641, + "num_input_tokens_seen": 137163195, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6392, + "time_per_iteration": 2.4366884231567383 + }, + { + "auxiliary_loss_clip": 0.01120524, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.0215621, + "balance_loss_mlp": 1.04348612, + "epoch": 0.384367954306328, + "flos": 10998577399680.0, + "grad_norm": 2.11211623143903, + "language_loss": 0.61170864, + "learning_rate": 2.8210919646859536e-06, + "loss": 0.63326931, + "num_input_tokens_seen": 137179330, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 6393, + "time_per_iteration": 2.428238868713379 + }, + { + "auxiliary_loss_clip": 0.01128297, + "auxiliary_loss_mlp": 0.01035177, + "balance_loss_clip": 1.01886964, + "balance_loss_mlp": 1.04589796, + "epoch": 0.38442807755899594, + "flos": 25338030779520.0, + "grad_norm": 2.3555579295861775, + "language_loss": 0.71295553, + "learning_rate": 2.820736822421029e-06, + "loss": 0.73459029, + "num_input_tokens_seen": 137198655, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.82421875, + "step": 6394, + "time_per_iteration": 2.483506679534912 + }, + { + "auxiliary_loss_clip": 0.01129724, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.01760483, + "balance_loss_mlp": 1.04732203, + "epoch": 0.3844882008116639, + "flos": 21069760527360.0, + "grad_norm": 2.3366242235467047, + "language_loss": 0.81172824, + "learning_rate": 2.8203816490336822e-06, + "loss": 0.83336329, + "num_input_tokens_seen": 137217120, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.82421875, + "step": 6395, + "time_per_iteration": 2.471301317214966 + }, + { + "auxiliary_loss_clip": 0.01126851, + "auxiliary_loss_mlp": 0.01043233, + "balance_loss_clip": 1.02880275, + "balance_loss_mlp": 1.04770553, + "epoch": 0.38454832406433187, + "flos": 17963235855360.0, + "grad_norm": 3.9526859148826707, + "language_loss": 0.70642132, + "learning_rate": 2.8200264445373813e-06, + "loss": 0.72812212, + "num_input_tokens_seen": 137234410, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6396, + "time_per_iteration": 2.4121108055114746 + }, + { + "auxiliary_loss_clip": 0.01046507, + "auxiliary_loss_mlp": 0.00999241, + "balance_loss_clip": 0.99745274, + "balance_loss_mlp": 1.01972008, + "epoch": 0.38460844731699984, + "flos": 67924999555200.0, + "grad_norm": 0.8889613923167966, + "language_loss": 0.59708536, + "learning_rate": 2.8196712089455954e-06, + "loss": 0.61754286, + "num_input_tokens_seen": 137294940, + "router_z_loss_clip": 0.01782227, + "router_z_loss_mlp": 0.26757812, + "step": 6397, + "time_per_iteration": 3.1453351974487305 + }, + { + "auxiliary_loss_clip": 0.01123309, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.0459342, + "epoch": 0.3846685705696678, + "flos": 25849075530240.0, + "grad_norm": 1.8498202803423767, + "language_loss": 0.84868926, + "learning_rate": 2.819315942271794e-06, + "loss": 0.87023783, + "num_input_tokens_seen": 137315035, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 6398, + "time_per_iteration": 2.488083839416504 + }, + { + "auxiliary_loss_clip": 0.01121502, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.01826787, + "balance_loss_mlp": 1.0444839, + "epoch": 0.38472869382233577, + "flos": 16290194129280.0, + "grad_norm": 1.942979036208199, + "language_loss": 0.79634017, + "learning_rate": 2.8189606445294515e-06, + "loss": 0.81787992, + "num_input_tokens_seen": 137333155, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6399, + "time_per_iteration": 2.4537224769592285 + }, + { + "auxiliary_loss_clip": 0.01124087, + "auxiliary_loss_mlp": 0.01036794, + "balance_loss_clip": 1.02149892, + "balance_loss_mlp": 1.04439902, + "epoch": 0.38478881707500373, + "flos": 19353122668800.0, + "grad_norm": 1.8928366067789952, + "language_loss": 0.67337728, + "learning_rate": 2.818605315732038e-06, + "loss": 0.69498605, + "num_input_tokens_seen": 137351515, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.796875, + "step": 6400, + "time_per_iteration": 2.434598207473755 + }, + { + "auxiliary_loss_clip": 0.0112665, + "auxiliary_loss_mlp": 0.0104604, + "balance_loss_clip": 1.030936, + "balance_loss_mlp": 1.04645705, + "epoch": 0.38484894032767175, + "flos": 24860849575680.0, + "grad_norm": 1.6542190438860391, + "language_loss": 0.73004973, + "learning_rate": 2.81824995589303e-06, + "loss": 0.7517767, + "num_input_tokens_seen": 137371255, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6401, + "time_per_iteration": 2.4963061809539795 + }, + { + "auxiliary_loss_clip": 0.01123878, + "auxiliary_loss_mlp": 0.01038712, + "balance_loss_clip": 1.02329874, + "balance_loss_mlp": 1.045017, + "epoch": 0.3849090635803397, + "flos": 14501806853760.0, + "grad_norm": 1.9430058457885813, + "language_loss": 0.71920168, + "learning_rate": 2.8178945650259012e-06, + "loss": 0.74082762, + "num_input_tokens_seen": 137388980, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6402, + "time_per_iteration": 2.426349639892578 + }, + { + "auxiliary_loss_clip": 0.01118079, + "auxiliary_loss_mlp": 0.01034485, + "balance_loss_clip": 1.02007246, + "balance_loss_mlp": 1.04232907, + "epoch": 0.3849691868330077, + "flos": 18515865576960.0, + "grad_norm": 1.7846208976590752, + "language_loss": 0.82449806, + "learning_rate": 2.817539143144128e-06, + "loss": 0.84602368, + "num_input_tokens_seen": 137406885, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6403, + "time_per_iteration": 2.4700570106506348 + }, + { + "auxiliary_loss_clip": 0.0112163, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.02340865, + "balance_loss_mlp": 1.04500651, + "epoch": 0.38502931008567565, + "flos": 21616392677760.0, + "grad_norm": 1.8891944292176732, + "language_loss": 0.82468271, + "learning_rate": 2.817183690261189e-06, + "loss": 0.84628773, + "num_input_tokens_seen": 137425535, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.765625, + "step": 6404, + "time_per_iteration": 2.481968402862549 + }, + { + "auxiliary_loss_clip": 0.01122268, + "auxiliary_loss_mlp": 0.01035856, + "balance_loss_clip": 1.02136576, + "balance_loss_mlp": 1.04299283, + "epoch": 0.3850894333383436, + "flos": 25415346804480.0, + "grad_norm": 1.6334992055527433, + "language_loss": 0.69588619, + "learning_rate": 2.816828206390563e-06, + "loss": 0.71746749, + "num_input_tokens_seen": 137447700, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 6405, + "time_per_iteration": 2.5947635173797607 + }, + { + "auxiliary_loss_clip": 0.01119754, + "auxiliary_loss_mlp": 0.01038243, + "balance_loss_clip": 1.02475476, + "balance_loss_mlp": 1.04411674, + "epoch": 0.3851495565910116, + "flos": 20227870581120.0, + "grad_norm": 1.9268009005119906, + "language_loss": 0.79068285, + "learning_rate": 2.816472691545729e-06, + "loss": 0.81226277, + "num_input_tokens_seen": 137462245, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 6406, + "time_per_iteration": 2.4195396900177 + }, + { + "auxiliary_loss_clip": 0.01125718, + "auxiliary_loss_mlp": 0.01037827, + "balance_loss_clip": 1.02247298, + "balance_loss_mlp": 1.04682863, + "epoch": 0.38520967984367954, + "flos": 16508459122560.0, + "grad_norm": 2.277779532957622, + "language_loss": 0.8438794, + "learning_rate": 2.8161171457401694e-06, + "loss": 0.86551487, + "num_input_tokens_seen": 137476455, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7890625, + "step": 6407, + "time_per_iteration": 2.4518916606903076 + }, + { + "auxiliary_loss_clip": 0.01043854, + "auxiliary_loss_mlp": 0.01007721, + "balance_loss_clip": 1.00623727, + "balance_loss_mlp": 1.01778841, + "epoch": 0.3852698030963475, + "flos": 61313772971520.0, + "grad_norm": 0.8214817017046727, + "language_loss": 0.64868087, + "learning_rate": 2.815761568987365e-06, + "loss": 0.66919661, + "num_input_tokens_seen": 137539845, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.25976562, + "step": 6408, + "time_per_iteration": 3.090940475463867 + }, + { + "auxiliary_loss_clip": 0.01123062, + "auxiliary_loss_mlp": 0.01043064, + "balance_loss_clip": 1.02676785, + "balance_loss_mlp": 1.04405272, + "epoch": 0.3853299263490155, + "flos": 22893016930560.0, + "grad_norm": 1.5501960898767924, + "language_loss": 0.73628408, + "learning_rate": 2.8154059613008e-06, + "loss": 0.7579453, + "num_input_tokens_seen": 137559880, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.7890625, + "step": 6409, + "time_per_iteration": 2.4831972122192383 + }, + { + "auxiliary_loss_clip": 0.01129844, + "auxiliary_loss_mlp": 0.01049195, + "balance_loss_clip": 1.03255367, + "balance_loss_mlp": 1.04574656, + "epoch": 0.38539004960168344, + "flos": 20047491457920.0, + "grad_norm": 2.0394333066705874, + "language_loss": 0.70208335, + "learning_rate": 2.81505032269396e-06, + "loss": 0.72387373, + "num_input_tokens_seen": 137578225, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.83984375, + "step": 6410, + "time_per_iteration": 2.430617332458496 + }, + { + "auxiliary_loss_clip": 0.01043682, + "auxiliary_loss_mlp": 0.01003736, + "balance_loss_clip": 1.0021385, + "balance_loss_mlp": 1.01802111, + "epoch": 0.3854501728543514, + "flos": 68730691570560.0, + "grad_norm": 0.6794214350275563, + "language_loss": 0.60311568, + "learning_rate": 2.81469465318033e-06, + "loss": 0.62358987, + "num_input_tokens_seen": 137645770, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.2578125, + "step": 6411, + "time_per_iteration": 3.1681244373321533 + }, + { + "auxiliary_loss_clip": 0.01118542, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01543355, + "balance_loss_mlp": 1.04146707, + "epoch": 0.38551029610701937, + "flos": 20485027025280.0, + "grad_norm": 1.9543275921913768, + "language_loss": 0.7770192, + "learning_rate": 2.814338952773397e-06, + "loss": 0.79849613, + "num_input_tokens_seen": 137664090, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6412, + "time_per_iteration": 2.4670822620391846 + }, + { + "auxiliary_loss_clip": 0.01124348, + "auxiliary_loss_mlp": 0.01037148, + "balance_loss_clip": 1.02093506, + "balance_loss_mlp": 1.0437274, + "epoch": 0.38557041935968733, + "flos": 23471788775040.0, + "grad_norm": 1.7609162802618283, + "language_loss": 0.78148544, + "learning_rate": 2.8139832214866493e-06, + "loss": 0.80310041, + "num_input_tokens_seen": 137683190, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.8046875, + "step": 6413, + "time_per_iteration": 2.4506192207336426 + }, + { + "auxiliary_loss_clip": 0.01040458, + "auxiliary_loss_mlp": 0.01006495, + "balance_loss_clip": 1.00485027, + "balance_loss_mlp": 1.01477003, + "epoch": 0.38563054261235535, + "flos": 63966636869760.0, + "grad_norm": 0.8068957555662655, + "language_loss": 0.61344963, + "learning_rate": 2.813627459333576e-06, + "loss": 0.63391918, + "num_input_tokens_seen": 137737315, + "router_z_loss_clip": 0.01647949, + "router_z_loss_mlp": 0.2578125, + "step": 6414, + "time_per_iteration": 2.897420883178711 + }, + { + "auxiliary_loss_clip": 0.01124739, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.02712834, + "balance_loss_mlp": 1.04452538, + "epoch": 0.3856906658650233, + "flos": 23987789602560.0, + "grad_norm": 2.3808373048749543, + "language_loss": 0.77121973, + "learning_rate": 2.8132716663276685e-06, + "loss": 0.79288626, + "num_input_tokens_seen": 137753535, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80078125, + "step": 6415, + "time_per_iteration": 2.455246686935425 + }, + { + "auxiliary_loss_clip": 0.01115597, + "auxiliary_loss_mlp": 0.01032062, + "balance_loss_clip": 1.01916933, + "balance_loss_mlp": 1.04303658, + "epoch": 0.3857507891176913, + "flos": 25007436979200.0, + "grad_norm": 1.6468091717833364, + "language_loss": 0.79597795, + "learning_rate": 2.8129158424824173e-06, + "loss": 0.81745458, + "num_input_tokens_seen": 137773405, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6416, + "time_per_iteration": 2.5162863731384277 + }, + { + "auxiliary_loss_clip": 0.0111887, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02100587, + "balance_loss_mlp": 1.04190922, + "epoch": 0.38581091237035925, + "flos": 21536778182400.0, + "grad_norm": 1.6816352340920986, + "language_loss": 0.7957328, + "learning_rate": 2.8125599878113155e-06, + "loss": 0.81726366, + "num_input_tokens_seen": 137790810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76953125, + "step": 6417, + "time_per_iteration": 2.462679862976074 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01037406, + "balance_loss_clip": 1.02369118, + "balance_loss_mlp": 1.03945839, + "epoch": 0.3858710356230272, + "flos": 17383889393280.0, + "grad_norm": 9.924006648688666, + "language_loss": 0.80246758, + "learning_rate": 2.8122041023278583e-06, + "loss": 0.82400978, + "num_input_tokens_seen": 137810265, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6418, + "time_per_iteration": 2.4485208988189697 + }, + { + "auxiliary_loss_clip": 0.01114184, + "auxiliary_loss_mlp": 0.01033119, + "balance_loss_clip": 1.01992905, + "balance_loss_mlp": 1.03939319, + "epoch": 0.3859311588756952, + "flos": 20339588856960.0, + "grad_norm": 1.9958339666442106, + "language_loss": 0.79694712, + "learning_rate": 2.8118481860455407e-06, + "loss": 0.81842011, + "num_input_tokens_seen": 137828580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6419, + "time_per_iteration": 2.4360008239746094 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.01972449, + "balance_loss_mlp": 1.04120576, + "epoch": 0.38599128212836314, + "flos": 26321157002880.0, + "grad_norm": 2.0553625572614678, + "language_loss": 0.67804086, + "learning_rate": 2.8114922389778573e-06, + "loss": 0.69954103, + "num_input_tokens_seen": 137846145, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.73828125, + "step": 6420, + "time_per_iteration": 2.489661931991577 + }, + { + "auxiliary_loss_clip": 0.01116038, + "auxiliary_loss_mlp": 0.01036432, + "balance_loss_clip": 1.02286029, + "balance_loss_mlp": 1.04163957, + "epoch": 0.3860514053810311, + "flos": 13553837066880.0, + "grad_norm": 2.4512212791744576, + "language_loss": 0.81831443, + "learning_rate": 2.8111362611383076e-06, + "loss": 0.83983916, + "num_input_tokens_seen": 137863705, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6421, + "time_per_iteration": 2.4278934001922607 + }, + { + "auxiliary_loss_clip": 0.01118285, + "auxiliary_loss_mlp": 0.01033321, + "balance_loss_clip": 1.01888454, + "balance_loss_mlp": 1.04031229, + "epoch": 0.3861115286336991, + "flos": 20954271323520.0, + "grad_norm": 2.2431145476637266, + "language_loss": 0.72079587, + "learning_rate": 2.8107802525403886e-06, + "loss": 0.74231195, + "num_input_tokens_seen": 137880285, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6422, + "time_per_iteration": 2.441708564758301 + }, + { + "auxiliary_loss_clip": 0.01116019, + "auxiliary_loss_mlp": 0.01038012, + "balance_loss_clip": 1.02482104, + "balance_loss_mlp": 1.0425638, + "epoch": 0.38617165188636704, + "flos": 16362697731840.0, + "grad_norm": 1.6611822537555545, + "language_loss": 0.65814191, + "learning_rate": 2.8104242131976025e-06, + "loss": 0.6796822, + "num_input_tokens_seen": 137898335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6423, + "time_per_iteration": 2.4211878776550293 + }, + { + "auxiliary_loss_clip": 0.01121429, + "auxiliary_loss_mlp": 0.01039251, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.0439117, + "epoch": 0.386231775139035, + "flos": 34787276893440.0, + "grad_norm": 1.965242475874499, + "language_loss": 0.68746173, + "learning_rate": 2.810068143123449e-06, + "loss": 0.70906854, + "num_input_tokens_seen": 137918605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6424, + "time_per_iteration": 2.5804436206817627 + }, + { + "auxiliary_loss_clip": 0.01117257, + "auxiliary_loss_mlp": 0.0103853, + "balance_loss_clip": 1.0243144, + "balance_loss_mlp": 1.04261661, + "epoch": 0.38629189839170297, + "flos": 21726171619200.0, + "grad_norm": 1.3808875353222407, + "language_loss": 0.72237349, + "learning_rate": 2.809712042331429e-06, + "loss": 0.74393135, + "num_input_tokens_seen": 137938245, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 6425, + "time_per_iteration": 2.4568634033203125 + }, + { + "auxiliary_loss_clip": 0.01121874, + "auxiliary_loss_mlp": 0.01038367, + "balance_loss_clip": 1.02413344, + "balance_loss_mlp": 1.0424571, + "epoch": 0.38635202164437094, + "flos": 27923634460800.0, + "grad_norm": 2.566599175889616, + "language_loss": 0.80062914, + "learning_rate": 2.8093559108350484e-06, + "loss": 0.82223159, + "num_input_tokens_seen": 137956770, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.79296875, + "step": 6426, + "time_per_iteration": 2.5236575603485107 + }, + { + "auxiliary_loss_clip": 0.01123371, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.0222559, + "balance_loss_mlp": 1.04582727, + "epoch": 0.38641214489703896, + "flos": 23586631534080.0, + "grad_norm": 2.32293087490025, + "language_loss": 0.74624443, + "learning_rate": 2.80899974864781e-06, + "loss": 0.7678405, + "num_input_tokens_seen": 137977040, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 6427, + "time_per_iteration": 2.467555046081543 + }, + { + "auxiliary_loss_clip": 0.01118544, + "auxiliary_loss_mlp": 0.01039206, + "balance_loss_clip": 1.02530599, + "balance_loss_mlp": 1.04256904, + "epoch": 0.3864722681497069, + "flos": 12641239198080.0, + "grad_norm": 1.6951631816528543, + "language_loss": 0.69630527, + "learning_rate": 2.8086435557832203e-06, + "loss": 0.71788281, + "num_input_tokens_seen": 137993545, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6428, + "time_per_iteration": 2.4336817264556885 + }, + { + "auxiliary_loss_clip": 0.01120968, + "auxiliary_loss_mlp": 0.01042024, + "balance_loss_clip": 1.02787971, + "balance_loss_mlp": 1.0427897, + "epoch": 0.3865323914023749, + "flos": 17598922162560.0, + "grad_norm": 2.175868568260599, + "language_loss": 0.84272587, + "learning_rate": 2.8082873322547863e-06, + "loss": 0.86435586, + "num_input_tokens_seen": 138010140, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6429, + "time_per_iteration": 5.324048757553101 + }, + { + "auxiliary_loss_clip": 0.01121297, + "auxiliary_loss_mlp": 0.01037029, + "balance_loss_clip": 1.02358222, + "balance_loss_mlp": 1.04458523, + "epoch": 0.38659251465504285, + "flos": 18478949374080.0, + "grad_norm": 2.0434704200334726, + "language_loss": 0.808312, + "learning_rate": 2.807931078076015e-06, + "loss": 0.82989526, + "num_input_tokens_seen": 138028880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6430, + "time_per_iteration": 3.8362674713134766 + }, + { + "auxiliary_loss_clip": 0.01037896, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00019002, + "balance_loss_mlp": 1.01247668, + "epoch": 0.3866526379077108, + "flos": 64165726978560.0, + "grad_norm": 0.7147232834997996, + "language_loss": 0.58793551, + "learning_rate": 2.807574793260416e-06, + "loss": 0.60833132, + "num_input_tokens_seen": 138098090, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.25390625, + "step": 6431, + "time_per_iteration": 3.1054275035858154 + }, + { + "auxiliary_loss_clip": 0.01123522, + "auxiliary_loss_mlp": 0.01036133, + "balance_loss_clip": 1.0213275, + "balance_loss_mlp": 1.04425848, + "epoch": 0.3867127611603788, + "flos": 14388292897920.0, + "grad_norm": 1.8418420222570902, + "language_loss": 0.78914982, + "learning_rate": 2.8072184778215004e-06, + "loss": 0.81074637, + "num_input_tokens_seen": 138114735, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6432, + "time_per_iteration": 2.441103458404541 + }, + { + "auxiliary_loss_clip": 0.01120861, + "auxiliary_loss_mlp": 0.01042942, + "balance_loss_clip": 1.02820802, + "balance_loss_mlp": 1.04033065, + "epoch": 0.38677288441304675, + "flos": 20010754823040.0, + "grad_norm": 3.1335187433073006, + "language_loss": 0.80734611, + "learning_rate": 2.806862131772779e-06, + "loss": 0.82898408, + "num_input_tokens_seen": 138130480, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.8046875, + "step": 6433, + "time_per_iteration": 2.4334840774536133 + }, + { + "auxiliary_loss_clip": 0.01122101, + "auxiliary_loss_mlp": 0.01036931, + "balance_loss_clip": 1.02167201, + "balance_loss_mlp": 1.04427695, + "epoch": 0.3868330076657147, + "flos": 22236893147520.0, + "grad_norm": 1.9920607209076013, + "language_loss": 0.70712543, + "learning_rate": 2.806505755127765e-06, + "loss": 0.72871572, + "num_input_tokens_seen": 138150640, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6434, + "time_per_iteration": 2.4485912322998047 + }, + { + "auxiliary_loss_clip": 0.01122001, + "auxiliary_loss_mlp": 0.01037218, + "balance_loss_clip": 1.02259684, + "balance_loss_mlp": 1.04096544, + "epoch": 0.3868931309183827, + "flos": 16727442387840.0, + "grad_norm": 3.1146547904297615, + "language_loss": 0.77674437, + "learning_rate": 2.806149347899972e-06, + "loss": 0.79833651, + "num_input_tokens_seen": 138169700, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.8125, + "step": 6435, + "time_per_iteration": 2.4734902381896973 + }, + { + "auxiliary_loss_clip": 0.01117121, + "auxiliary_loss_mlp": 0.01032568, + "balance_loss_clip": 1.01877558, + "balance_loss_mlp": 1.04157901, + "epoch": 0.38695325417105064, + "flos": 22674716023680.0, + "grad_norm": 1.6626735995393465, + "language_loss": 0.79557228, + "learning_rate": 2.805792910102915e-06, + "loss": 0.81706917, + "num_input_tokens_seen": 138185835, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 6436, + "time_per_iteration": 2.461880922317505 + }, + { + "auxiliary_loss_clip": 0.01115966, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01937521, + "balance_loss_mlp": 1.04099202, + "epoch": 0.3870133774237186, + "flos": 23112036109440.0, + "grad_norm": 1.7213495950653388, + "language_loss": 0.77057981, + "learning_rate": 2.8054364417501093e-06, + "loss": 0.79206884, + "num_input_tokens_seen": 138204080, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6437, + "time_per_iteration": 2.506342649459839 + }, + { + "auxiliary_loss_clip": 0.01118581, + "auxiliary_loss_mlp": 0.0104126, + "balance_loss_clip": 1.02759838, + "balance_loss_mlp": 1.0425818, + "epoch": 0.3870735006763866, + "flos": 17675699483520.0, + "grad_norm": 2.0991099349261013, + "language_loss": 0.8199805, + "learning_rate": 2.805079942855074e-06, + "loss": 0.84157896, + "num_input_tokens_seen": 138220710, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 6438, + "time_per_iteration": 2.4236960411071777 + }, + { + "auxiliary_loss_clip": 0.01119447, + "auxiliary_loss_mlp": 0.01039004, + "balance_loss_clip": 1.02413225, + "balance_loss_mlp": 1.04198575, + "epoch": 0.38713362392905454, + "flos": 23295791111040.0, + "grad_norm": 1.4416179830694351, + "language_loss": 0.75274503, + "learning_rate": 2.804723413431326e-06, + "loss": 0.77432954, + "num_input_tokens_seen": 138241720, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6439, + "time_per_iteration": 2.4746499061584473 + }, + { + "auxiliary_loss_clip": 0.01115954, + "auxiliary_loss_mlp": 0.01030927, + "balance_loss_clip": 1.01804042, + "balance_loss_mlp": 1.04231787, + "epoch": 0.38719374718172256, + "flos": 21031192298880.0, + "grad_norm": 1.4591961315755648, + "language_loss": 0.74029297, + "learning_rate": 2.8043668534923855e-06, + "loss": 0.76176178, + "num_input_tokens_seen": 138261885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6440, + "time_per_iteration": 2.470442056655884 + }, + { + "auxiliary_loss_clip": 0.01120633, + "auxiliary_loss_mlp": 0.01041682, + "balance_loss_clip": 1.02755535, + "balance_loss_mlp": 1.04172719, + "epoch": 0.3872538704343905, + "flos": 19609776322560.0, + "grad_norm": 1.882594032026591, + "language_loss": 0.82420492, + "learning_rate": 2.804010263051774e-06, + "loss": 0.84582806, + "num_input_tokens_seen": 138280255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6441, + "time_per_iteration": 2.4857184886932373 + }, + { + "auxiliary_loss_clip": 0.01119023, + "auxiliary_loss_mlp": 0.01044385, + "balance_loss_clip": 1.03132594, + "balance_loss_mlp": 1.04210794, + "epoch": 0.3873139936870585, + "flos": 17530045833600.0, + "grad_norm": 2.099147848905264, + "language_loss": 0.81835496, + "learning_rate": 2.8036536421230118e-06, + "loss": 0.83998901, + "num_input_tokens_seen": 138296675, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6442, + "time_per_iteration": 2.4149296283721924 + }, + { + "auxiliary_loss_clip": 0.01116335, + "auxiliary_loss_mlp": 0.01035738, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.04025602, + "epoch": 0.38737411693972645, + "flos": 17786555832960.0, + "grad_norm": 1.5694674536603201, + "language_loss": 0.83847654, + "learning_rate": 2.803296990719624e-06, + "loss": 0.85999727, + "num_input_tokens_seen": 138314985, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6443, + "time_per_iteration": 2.4515957832336426 + }, + { + "auxiliary_loss_clip": 0.01039021, + "auxiliary_loss_mlp": 0.01007024, + "balance_loss_clip": 1.00551593, + "balance_loss_mlp": 1.0140909, + "epoch": 0.3874342401923944, + "flos": 58304637048960.0, + "grad_norm": 0.7719544775144753, + "language_loss": 0.50268674, + "learning_rate": 2.8029403088551327e-06, + "loss": 0.52314723, + "num_input_tokens_seen": 138373275, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24902344, + "step": 6444, + "time_per_iteration": 3.092834711074829 + }, + { + "auxiliary_loss_clip": 0.01115245, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02502251, + "balance_loss_mlp": 1.04225266, + "epoch": 0.3874943634450624, + "flos": 17711933328000.0, + "grad_norm": 1.537835026490341, + "language_loss": 0.78736365, + "learning_rate": 2.802583596543065e-06, + "loss": 0.80889541, + "num_input_tokens_seen": 138391145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 6445, + "time_per_iteration": 2.435347557067871 + }, + { + "auxiliary_loss_clip": 0.01115913, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02055407, + "balance_loss_mlp": 1.04211605, + "epoch": 0.38755448669773035, + "flos": 19244852098560.0, + "grad_norm": 1.672895701432963, + "language_loss": 0.81121695, + "learning_rate": 2.8022268537969474e-06, + "loss": 0.83271456, + "num_input_tokens_seen": 138409875, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6446, + "time_per_iteration": 2.469536781311035 + }, + { + "auxiliary_loss_clip": 0.01114406, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02434087, + "balance_loss_mlp": 1.03933239, + "epoch": 0.3876146099503983, + "flos": 20594267262720.0, + "grad_norm": 1.877585125713849, + "language_loss": 0.77093089, + "learning_rate": 2.801870080630306e-06, + "loss": 0.79244608, + "num_input_tokens_seen": 138428965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 6447, + "time_per_iteration": 2.428525447845459 + }, + { + "auxiliary_loss_clip": 0.01116221, + "auxiliary_loss_mlp": 0.01032372, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.04256356, + "epoch": 0.3876747332030663, + "flos": 19281121856640.0, + "grad_norm": 1.5240627220637166, + "language_loss": 0.75767821, + "learning_rate": 2.801513277056671e-06, + "loss": 0.7791642, + "num_input_tokens_seen": 138448090, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6448, + "time_per_iteration": 2.4325876235961914 + }, + { + "auxiliary_loss_clip": 0.01115196, + "auxiliary_loss_mlp": 0.01033743, + "balance_loss_clip": 1.02023029, + "balance_loss_mlp": 1.04179466, + "epoch": 0.38773485645573424, + "flos": 18945895201920.0, + "grad_norm": 1.6442003276819328, + "language_loss": 0.75754648, + "learning_rate": 2.8011564430895725e-06, + "loss": 0.77903593, + "num_input_tokens_seen": 138466105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6449, + "time_per_iteration": 2.435208320617676 + }, + { + "auxiliary_loss_clip": 0.0111808, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.0194999, + "balance_loss_mlp": 1.03956699, + "epoch": 0.3877949797084022, + "flos": 23071348978560.0, + "grad_norm": 1.5394171504545016, + "language_loss": 0.78183508, + "learning_rate": 2.800799578742542e-06, + "loss": 0.80335045, + "num_input_tokens_seen": 138485160, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6450, + "time_per_iteration": 2.467933177947998 + }, + { + "auxiliary_loss_clip": 0.0112145, + "auxiliary_loss_mlp": 0.01036071, + "balance_loss_clip": 1.02190948, + "balance_loss_mlp": 1.04104686, + "epoch": 0.3878551029610702, + "flos": 29095543589760.0, + "grad_norm": 2.1284571270947263, + "language_loss": 0.77706474, + "learning_rate": 2.8004426840291106e-06, + "loss": 0.79863995, + "num_input_tokens_seen": 138504135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6451, + "time_per_iteration": 2.513192892074585 + }, + { + "auxiliary_loss_clip": 0.01112409, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01337111, + "balance_loss_mlp": 1.03988457, + "epoch": 0.38791522621373814, + "flos": 20996394998400.0, + "grad_norm": 1.5965207120841256, + "language_loss": 0.7642619, + "learning_rate": 2.800085758962812e-06, + "loss": 0.7856546, + "num_input_tokens_seen": 138523955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6452, + "time_per_iteration": 2.453756809234619 + }, + { + "auxiliary_loss_clip": 0.01118677, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02721739, + "balance_loss_mlp": 1.04313231, + "epoch": 0.3879753494664061, + "flos": 15486836497920.0, + "grad_norm": 1.5417712426283914, + "language_loss": 0.79843581, + "learning_rate": 2.799728803557182e-06, + "loss": 0.82002515, + "num_input_tokens_seen": 138541655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6453, + "time_per_iteration": 2.434788465499878 + }, + { + "auxiliary_loss_clip": 0.01126032, + "auxiliary_loss_mlp": 0.01037051, + "balance_loss_clip": 1.02257931, + "balance_loss_mlp": 1.0456028, + "epoch": 0.3880354727190741, + "flos": 22053964158720.0, + "grad_norm": 1.779502658436086, + "language_loss": 0.71759796, + "learning_rate": 2.7993718178257555e-06, + "loss": 0.73922884, + "num_input_tokens_seen": 138560860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6454, + "time_per_iteration": 2.456637382507324 + }, + { + "auxiliary_loss_clip": 0.01122488, + "auxiliary_loss_mlp": 0.01039713, + "balance_loss_clip": 1.02489531, + "balance_loss_mlp": 1.04253364, + "epoch": 0.3880955959717421, + "flos": 20340307128960.0, + "grad_norm": 2.1246626443539216, + "language_loss": 0.77918947, + "learning_rate": 2.7990148017820694e-06, + "loss": 0.80081153, + "num_input_tokens_seen": 138580200, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6455, + "time_per_iteration": 2.4589757919311523 + }, + { + "auxiliary_loss_clip": 0.01118002, + "auxiliary_loss_mlp": 0.01034735, + "balance_loss_clip": 1.02040577, + "balance_loss_mlp": 1.04232621, + "epoch": 0.38815571922441006, + "flos": 23075407215360.0, + "grad_norm": 1.6339807395025958, + "language_loss": 0.75865024, + "learning_rate": 2.798657755439662e-06, + "loss": 0.78017759, + "num_input_tokens_seen": 138598315, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6456, + "time_per_iteration": 2.4390318393707275 + }, + { + "auxiliary_loss_clip": 0.01121145, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.01944995, + "balance_loss_mlp": 1.04276633, + "epoch": 0.388215842477078, + "flos": 20776944856320.0, + "grad_norm": 2.085241252102015, + "language_loss": 0.60518527, + "learning_rate": 2.7983006788120726e-06, + "loss": 0.62672919, + "num_input_tokens_seen": 138615695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78515625, + "step": 6457, + "time_per_iteration": 2.459535837173462 + }, + { + "auxiliary_loss_clip": 0.01121291, + "auxiliary_loss_mlp": 0.01037459, + "balance_loss_clip": 1.02167547, + "balance_loss_mlp": 1.04195237, + "epoch": 0.388275965729746, + "flos": 20448182649600.0, + "grad_norm": 2.1234505206368475, + "language_loss": 0.80247247, + "learning_rate": 2.797943571912841e-06, + "loss": 0.82405996, + "num_input_tokens_seen": 138633180, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.79296875, + "step": 6458, + "time_per_iteration": 2.425049066543579 + }, + { + "auxiliary_loss_clip": 0.01120771, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.02072167, + "balance_loss_mlp": 1.04291797, + "epoch": 0.38833608898241395, + "flos": 27892392606720.0, + "grad_norm": 1.8371533851039183, + "language_loss": 0.81683058, + "learning_rate": 2.797586434755509e-06, + "loss": 0.83838403, + "num_input_tokens_seen": 138654785, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6459, + "time_per_iteration": 2.5234129428863525 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.04261899, + "epoch": 0.3883962122350819, + "flos": 18076390675200.0, + "grad_norm": 3.3845315312390643, + "language_loss": 0.61609662, + "learning_rate": 2.7972292673536202e-06, + "loss": 0.63761353, + "num_input_tokens_seen": 138673330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6460, + "time_per_iteration": 2.4271440505981445 + }, + { + "auxiliary_loss_clip": 0.01121121, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.0216701, + "balance_loss_mlp": 1.04498553, + "epoch": 0.3884563354877499, + "flos": 23622254847360.0, + "grad_norm": 1.999840896697599, + "language_loss": 0.85928953, + "learning_rate": 2.796872069720717e-06, + "loss": 0.88084352, + "num_input_tokens_seen": 138694185, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.76171875, + "step": 6461, + "time_per_iteration": 2.4874932765960693 + }, + { + "auxiliary_loss_clip": 0.01121067, + "auxiliary_loss_mlp": 0.01041247, + "balance_loss_clip": 1.02712059, + "balance_loss_mlp": 1.04198229, + "epoch": 0.38851645874041785, + "flos": 27453528236160.0, + "grad_norm": 5.6194775515218085, + "language_loss": 0.71397054, + "learning_rate": 2.7965148418703456e-06, + "loss": 0.73559368, + "num_input_tokens_seen": 138714625, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6462, + "time_per_iteration": 2.4839894771575928 + }, + { + "auxiliary_loss_clip": 0.01120048, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02274752, + "balance_loss_mlp": 1.04190457, + "epoch": 0.3885765819930858, + "flos": 25228072270080.0, + "grad_norm": 2.13487298932128, + "language_loss": 0.7582581, + "learning_rate": 2.796157583816052e-06, + "loss": 0.77982807, + "num_input_tokens_seen": 138733585, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6463, + "time_per_iteration": 2.4897215366363525 + }, + { + "auxiliary_loss_clip": 0.0112511, + "auxiliary_loss_mlp": 0.01046321, + "balance_loss_clip": 1.0305022, + "balance_loss_mlp": 1.04482341, + "epoch": 0.3886367052457538, + "flos": 16946605221120.0, + "grad_norm": 1.9442764767857983, + "language_loss": 0.70078236, + "learning_rate": 2.795800295571382e-06, + "loss": 0.72249663, + "num_input_tokens_seen": 138752335, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6464, + "time_per_iteration": 2.4519219398498535 + }, + { + "auxiliary_loss_clip": 0.01118845, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02258134, + "balance_loss_mlp": 1.04280329, + "epoch": 0.38869682849842174, + "flos": 27154140376320.0, + "grad_norm": 1.8350923871455525, + "language_loss": 0.69608724, + "learning_rate": 2.7954429771498858e-06, + "loss": 0.717641, + "num_input_tokens_seen": 138768450, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6465, + "time_per_iteration": 2.524698495864868 + }, + { + "auxiliary_loss_clip": 0.01120474, + "auxiliary_loss_mlp": 0.01043161, + "balance_loss_clip": 1.02772307, + "balance_loss_mlp": 1.04204226, + "epoch": 0.3887569517510897, + "flos": 21063619301760.0, + "grad_norm": 2.02186972310505, + "language_loss": 0.77957165, + "learning_rate": 2.7950856285651117e-06, + "loss": 0.80120802, + "num_input_tokens_seen": 138786775, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6466, + "time_per_iteration": 2.4420318603515625 + }, + { + "auxiliary_loss_clip": 0.0112437, + "auxiliary_loss_mlp": 0.01039754, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.04476476, + "epoch": 0.38881707500375773, + "flos": 29497384016640.0, + "grad_norm": 1.578436157089315, + "language_loss": 0.69438803, + "learning_rate": 2.794728249830611e-06, + "loss": 0.71602929, + "num_input_tokens_seen": 138810100, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.796875, + "step": 6467, + "time_per_iteration": 2.526315212249756 + }, + { + "auxiliary_loss_clip": 0.01122941, + "auxiliary_loss_mlp": 0.01048409, + "balance_loss_clip": 1.03337657, + "balance_loss_mlp": 1.04374123, + "epoch": 0.3888771982564257, + "flos": 17488281294720.0, + "grad_norm": 2.7189933074164316, + "language_loss": 0.83444071, + "learning_rate": 2.794370840959936e-06, + "loss": 0.85615414, + "num_input_tokens_seen": 138825140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.79296875, + "step": 6468, + "time_per_iteration": 2.433612108230591 + }, + { + "auxiliary_loss_clip": 0.01119544, + "auxiliary_loss_mlp": 0.01040242, + "balance_loss_clip": 1.02720666, + "balance_loss_mlp": 1.04250181, + "epoch": 0.38893732150909366, + "flos": 21942425450880.0, + "grad_norm": 5.890128393718138, + "language_loss": 0.84300733, + "learning_rate": 2.7940134019666383e-06, + "loss": 0.86460519, + "num_input_tokens_seen": 138844115, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.76953125, + "step": 6469, + "time_per_iteration": 2.501368284225464 + }, + { + "auxiliary_loss_clip": 0.011205, + "auxiliary_loss_mlp": 0.0104307, + "balance_loss_clip": 1.02871704, + "balance_loss_mlp": 1.0433706, + "epoch": 0.3889974447617616, + "flos": 24276367468800.0, + "grad_norm": 1.6566744770772097, + "language_loss": 0.74790764, + "learning_rate": 2.793655932864273e-06, + "loss": 0.76954335, + "num_input_tokens_seen": 138860860, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 6470, + "time_per_iteration": 5.350924015045166 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01041359, + "balance_loss_clip": 1.02632678, + "balance_loss_mlp": 1.04234362, + "epoch": 0.3890575680144296, + "flos": 25667116208640.0, + "grad_norm": 1.5254918915202156, + "language_loss": 0.74916464, + "learning_rate": 2.7932984336663953e-06, + "loss": 0.77078122, + "num_input_tokens_seen": 138881910, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6471, + "time_per_iteration": 5.323298215866089 + }, + { + "auxiliary_loss_clip": 0.01121653, + "auxiliary_loss_mlp": 0.01045365, + "balance_loss_clip": 1.0310601, + "balance_loss_mlp": 1.04548645, + "epoch": 0.38911769126709755, + "flos": 22855274714880.0, + "grad_norm": 1.9258613787227117, + "language_loss": 0.68053186, + "learning_rate": 2.792940904386562e-06, + "loss": 0.70220202, + "num_input_tokens_seen": 138900975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6472, + "time_per_iteration": 2.453610420227051 + }, + { + "auxiliary_loss_clip": 0.01120597, + "auxiliary_loss_mlp": 0.01046672, + "balance_loss_clip": 1.0330584, + "balance_loss_mlp": 1.04305148, + "epoch": 0.3891778145197655, + "flos": 25447522412160.0, + "grad_norm": 1.6233097762345425, + "language_loss": 0.76542008, + "learning_rate": 2.7925833450383293e-06, + "loss": 0.7870928, + "num_input_tokens_seen": 138920795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6473, + "time_per_iteration": 2.487966775894165 + }, + { + "auxiliary_loss_clip": 0.01123459, + "auxiliary_loss_mlp": 0.01046447, + "balance_loss_clip": 1.03157008, + "balance_loss_mlp": 1.04532015, + "epoch": 0.3892379377724335, + "flos": 14027965614720.0, + "grad_norm": 1.8986671727726652, + "language_loss": 0.70897496, + "learning_rate": 2.792225755635257e-06, + "loss": 0.73067403, + "num_input_tokens_seen": 138938770, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6474, + "time_per_iteration": 2.4192309379577637 + }, + { + "auxiliary_loss_clip": 0.01121654, + "auxiliary_loss_mlp": 0.01039414, + "balance_loss_clip": 1.02607441, + "balance_loss_mlp": 1.04441047, + "epoch": 0.38929806102510145, + "flos": 20157449967360.0, + "grad_norm": 1.400231739949646, + "language_loss": 0.68822956, + "learning_rate": 2.7918681361909046e-06, + "loss": 0.70984024, + "num_input_tokens_seen": 138958880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 6475, + "time_per_iteration": 2.508747100830078 + }, + { + "auxiliary_loss_clip": 0.01129756, + "auxiliary_loss_mlp": 0.0104873, + "balance_loss_clip": 1.03369188, + "balance_loss_mlp": 1.04747105, + "epoch": 0.3893581842777694, + "flos": 22163958581760.0, + "grad_norm": 2.0025883037810055, + "language_loss": 0.76052523, + "learning_rate": 2.7915104867188332e-06, + "loss": 0.78231013, + "num_input_tokens_seen": 138977240, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.82421875, + "step": 6476, + "time_per_iteration": 2.4432644844055176 + }, + { + "auxiliary_loss_clip": 0.01040957, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00199068, + "balance_loss_mlp": 1.01581097, + "epoch": 0.3894183075304374, + "flos": 67301877392640.0, + "grad_norm": 0.7803986728659921, + "language_loss": 0.58254546, + "learning_rate": 2.7911528072326055e-06, + "loss": 0.60299176, + "num_input_tokens_seen": 139039035, + "router_z_loss_clip": 0.0168457, + "router_z_loss_mlp": 0.25, + "step": 6477, + "time_per_iteration": 3.0704691410064697 + }, + { + "auxiliary_loss_clip": 0.01123971, + "auxiliary_loss_mlp": 0.01038208, + "balance_loss_clip": 1.02279997, + "balance_loss_mlp": 1.04507279, + "epoch": 0.38947843078310534, + "flos": 18547502480640.0, + "grad_norm": 1.75333723767605, + "language_loss": 0.77916539, + "learning_rate": 2.7907950977457832e-06, + "loss": 0.80078721, + "num_input_tokens_seen": 139055560, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6478, + "time_per_iteration": 2.488922357559204 + }, + { + "auxiliary_loss_clip": 0.01118156, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.0212301, + "balance_loss_mlp": 1.04128957, + "epoch": 0.3895385540357733, + "flos": 14605875532800.0, + "grad_norm": 1.928920480761015, + "language_loss": 0.82250136, + "learning_rate": 2.7904373582719317e-06, + "loss": 0.8440311, + "num_input_tokens_seen": 139071865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 6479, + "time_per_iteration": 2.4171228408813477 + }, + { + "auxiliary_loss_clip": 0.01118219, + "auxiliary_loss_mlp": 0.01036864, + "balance_loss_clip": 1.02262461, + "balance_loss_mlp": 1.04175949, + "epoch": 0.38959867728844133, + "flos": 19975203336960.0, + "grad_norm": 1.7024032073041733, + "language_loss": 0.80111545, + "learning_rate": 2.790079588824617e-06, + "loss": 0.82266629, + "num_input_tokens_seen": 139089640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6480, + "time_per_iteration": 2.4750797748565674 + }, + { + "auxiliary_loss_clip": 0.01117569, + "auxiliary_loss_mlp": 0.01027596, + "balance_loss_clip": 1.01428056, + "balance_loss_mlp": 1.04215932, + "epoch": 0.3896588005411093, + "flos": 22672130244480.0, + "grad_norm": 1.550121095479633, + "language_loss": 0.83083898, + "learning_rate": 2.7897217894174038e-06, + "loss": 0.85229063, + "num_input_tokens_seen": 139109365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6481, + "time_per_iteration": 2.4715166091918945 + }, + { + "auxiliary_loss_clip": 0.01117656, + "auxiliary_loss_mlp": 0.01037477, + "balance_loss_clip": 1.02437592, + "balance_loss_mlp": 1.04459131, + "epoch": 0.38971892379377726, + "flos": 20996035862400.0, + "grad_norm": 1.557560720892756, + "language_loss": 0.75559932, + "learning_rate": 2.789363960063863e-06, + "loss": 0.77715063, + "num_input_tokens_seen": 139128260, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6482, + "time_per_iteration": 2.4623568058013916 + }, + { + "auxiliary_loss_clip": 0.01119557, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01972985, + "balance_loss_mlp": 1.04252028, + "epoch": 0.3897790470464452, + "flos": 22528487756160.0, + "grad_norm": 3.29893715214875, + "language_loss": 0.79150903, + "learning_rate": 2.78900610077756e-06, + "loss": 0.81303906, + "num_input_tokens_seen": 139147315, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6483, + "time_per_iteration": 2.4530816078186035 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.0135119, + "balance_loss_mlp": 1.04091668, + "epoch": 0.3898391702991132, + "flos": 26209905603840.0, + "grad_norm": 1.4423872752445677, + "language_loss": 0.79842782, + "learning_rate": 2.788648211572067e-06, + "loss": 0.81989002, + "num_input_tokens_seen": 139167270, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6484, + "time_per_iteration": 2.511016845703125 + }, + { + "auxiliary_loss_clip": 0.01121595, + "auxiliary_loss_mlp": 0.01044531, + "balance_loss_clip": 1.02905726, + "balance_loss_mlp": 1.04556251, + "epoch": 0.38989929355178116, + "flos": 21065558636160.0, + "grad_norm": 1.7756536915325172, + "language_loss": 0.78321344, + "learning_rate": 2.7882902924609557e-06, + "loss": 0.80487472, + "num_input_tokens_seen": 139185970, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76171875, + "step": 6485, + "time_per_iteration": 2.443439245223999 + }, + { + "auxiliary_loss_clip": 0.01121432, + "auxiliary_loss_mlp": 0.0103836, + "balance_loss_clip": 1.02298832, + "balance_loss_mlp": 1.0427072, + "epoch": 0.3899594168044491, + "flos": 25484115392640.0, + "grad_norm": 2.7221954850945425, + "language_loss": 0.85305119, + "learning_rate": 2.7879323434577965e-06, + "loss": 0.87464917, + "num_input_tokens_seen": 139203730, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78515625, + "step": 6486, + "time_per_iteration": 2.5056657791137695 + }, + { + "auxiliary_loss_clip": 0.01120884, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.01942706, + "balance_loss_mlp": 1.04115701, + "epoch": 0.3900195400571171, + "flos": 31139363456640.0, + "grad_norm": 1.7551040773297495, + "language_loss": 0.85345674, + "learning_rate": 2.7875743645761645e-06, + "loss": 0.87499964, + "num_input_tokens_seen": 139222560, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.796875, + "step": 6487, + "time_per_iteration": 2.577178478240967 + }, + { + "auxiliary_loss_clip": 0.01117565, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01737833, + "balance_loss_mlp": 1.04198551, + "epoch": 0.39007966330978505, + "flos": 20229917656320.0, + "grad_norm": 1.5246902220393208, + "language_loss": 0.73225224, + "learning_rate": 2.787216355829633e-06, + "loss": 0.75375092, + "num_input_tokens_seen": 139242165, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.75390625, + "step": 6488, + "time_per_iteration": 2.523616075515747 + }, + { + "auxiliary_loss_clip": 0.01123928, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.04519773, + "epoch": 0.390139786562453, + "flos": 22528739151360.0, + "grad_norm": 2.5708303691917815, + "language_loss": 0.68585873, + "learning_rate": 2.786858317231779e-06, + "loss": 0.7074241, + "num_input_tokens_seen": 139262525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 6489, + "time_per_iteration": 2.478531837463379 + }, + { + "auxiliary_loss_clip": 0.01115096, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02680993, + "balance_loss_mlp": 1.04124475, + "epoch": 0.390199909815121, + "flos": 26432911192320.0, + "grad_norm": 1.801271673710844, + "language_loss": 0.81112868, + "learning_rate": 2.7865002487961788e-06, + "loss": 0.83269042, + "num_input_tokens_seen": 139282835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 6490, + "time_per_iteration": 2.511854887008667 + }, + { + "auxiliary_loss_clip": 0.01121469, + "auxiliary_loss_mlp": 0.01033838, + "balance_loss_clip": 1.0193367, + "balance_loss_mlp": 1.04286718, + "epoch": 0.39026003306778895, + "flos": 17274577328640.0, + "grad_norm": 1.9146492238240407, + "language_loss": 0.89305747, + "learning_rate": 2.7861421505364104e-06, + "loss": 0.91461056, + "num_input_tokens_seen": 139299490, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6491, + "time_per_iteration": 2.460026264190674 + }, + { + "auxiliary_loss_clip": 0.01121295, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02187026, + "balance_loss_mlp": 1.04215312, + "epoch": 0.3903201563204569, + "flos": 24532841554560.0, + "grad_norm": 1.8200320241713732, + "language_loss": 0.78811067, + "learning_rate": 2.7857840224660523e-06, + "loss": 0.80968064, + "num_input_tokens_seen": 139317865, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 6492, + "time_per_iteration": 2.529750108718872 + }, + { + "auxiliary_loss_clip": 0.01122151, + "auxiliary_loss_mlp": 0.01037174, + "balance_loss_clip": 1.02316093, + "balance_loss_mlp": 1.04309416, + "epoch": 0.39038027957312493, + "flos": 23767944410880.0, + "grad_norm": 1.613220074099035, + "language_loss": 0.74635601, + "learning_rate": 2.7854258645986857e-06, + "loss": 0.76794928, + "num_input_tokens_seen": 139339840, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6493, + "time_per_iteration": 2.506000280380249 + }, + { + "auxiliary_loss_clip": 0.01123496, + "auxiliary_loss_mlp": 0.0103661, + "balance_loss_clip": 1.02160168, + "balance_loss_mlp": 1.04215276, + "epoch": 0.3904404028257929, + "flos": 14100612871680.0, + "grad_norm": 1.9992899078543964, + "language_loss": 0.76100057, + "learning_rate": 2.7850676769478916e-06, + "loss": 0.78260159, + "num_input_tokens_seen": 139357555, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8125, + "step": 6494, + "time_per_iteration": 2.4696662425994873 + }, + { + "auxiliary_loss_clip": 0.01128232, + "auxiliary_loss_mlp": 0.01048514, + "balance_loss_clip": 1.03233767, + "balance_loss_mlp": 1.04337156, + "epoch": 0.39050052607846086, + "flos": 16910048154240.0, + "grad_norm": 2.027559897328472, + "language_loss": 0.74284697, + "learning_rate": 2.7847094595272525e-06, + "loss": 0.76461446, + "num_input_tokens_seen": 139374455, + "router_z_loss_clip": 0.16210938, + "router_z_loss_mlp": 0.84765625, + "step": 6495, + "time_per_iteration": 2.4156551361083984 + }, + { + "auxiliary_loss_clip": 0.01121782, + "auxiliary_loss_mlp": 0.01041912, + "balance_loss_clip": 1.02683187, + "balance_loss_mlp": 1.04346669, + "epoch": 0.39056064933112883, + "flos": 25915761129600.0, + "grad_norm": 1.725682312794404, + "language_loss": 0.67885542, + "learning_rate": 2.784351212350352e-06, + "loss": 0.70049238, + "num_input_tokens_seen": 139394770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6496, + "time_per_iteration": 2.530111789703369 + }, + { + "auxiliary_loss_clip": 0.01038749, + "auxiliary_loss_mlp": 0.01000219, + "balance_loss_clip": 0.99871743, + "balance_loss_mlp": 1.01313972, + "epoch": 0.3906207725837968, + "flos": 60028421713920.0, + "grad_norm": 0.6624336186281815, + "language_loss": 0.53998011, + "learning_rate": 2.783992935430775e-06, + "loss": 0.56036979, + "num_input_tokens_seen": 139454760, + "router_z_loss_clip": 0.01501465, + "router_z_loss_mlp": 0.25585938, + "step": 6497, + "time_per_iteration": 3.140427589416504 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01038317, + "balance_loss_clip": 1.02404737, + "balance_loss_mlp": 1.04236674, + "epoch": 0.39068089583646476, + "flos": 21068683119360.0, + "grad_norm": 2.818865741362812, + "language_loss": 0.68966502, + "learning_rate": 2.7836346287821068e-06, + "loss": 0.71124697, + "num_input_tokens_seen": 139472645, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6498, + "time_per_iteration": 2.4631001949310303 + }, + { + "auxiliary_loss_clip": 0.01037794, + "auxiliary_loss_mlp": 0.01003613, + "balance_loss_clip": 1.00210512, + "balance_loss_mlp": 1.0124712, + "epoch": 0.3907410190891327, + "flos": 70445677403520.0, + "grad_norm": 1.032001330091421, + "language_loss": 0.51830518, + "learning_rate": 2.783276292417936e-06, + "loss": 0.5387193, + "num_input_tokens_seen": 139536730, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.25390625, + "step": 6499, + "time_per_iteration": 3.1206116676330566 + }, + { + "auxiliary_loss_clip": 0.01122549, + "auxiliary_loss_mlp": 0.01043094, + "balance_loss_clip": 1.0266552, + "balance_loss_mlp": 1.04158521, + "epoch": 0.3908011423418007, + "flos": 27962454084480.0, + "grad_norm": 1.8695650437594764, + "language_loss": 0.73693466, + "learning_rate": 2.7829179263518487e-06, + "loss": 0.75859112, + "num_input_tokens_seen": 139557540, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.80859375, + "step": 6500, + "time_per_iteration": 2.5413036346435547 + }, + { + "auxiliary_loss_clip": 0.01125544, + "auxiliary_loss_mlp": 0.01041341, + "balance_loss_clip": 1.02720869, + "balance_loss_mlp": 1.04501247, + "epoch": 0.39086126559446865, + "flos": 24462097718400.0, + "grad_norm": 2.5451317073491353, + "language_loss": 0.68355215, + "learning_rate": 2.7825595305974354e-06, + "loss": 0.70522094, + "num_input_tokens_seen": 139576875, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6501, + "time_per_iteration": 2.4725823402404785 + }, + { + "auxiliary_loss_clip": 0.01118681, + "auxiliary_loss_mlp": 0.01039085, + "balance_loss_clip": 1.02550125, + "balance_loss_mlp": 1.04143763, + "epoch": 0.3909213888471366, + "flos": 16941541403520.0, + "grad_norm": 1.6766627212042646, + "language_loss": 0.79162323, + "learning_rate": 2.782201105168287e-06, + "loss": 0.81320089, + "num_input_tokens_seen": 139594295, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7734375, + "step": 6502, + "time_per_iteration": 2.4758012294769287 + }, + { + "auxiliary_loss_clip": 0.01118294, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02378237, + "balance_loss_mlp": 1.0435648, + "epoch": 0.3909815120998046, + "flos": 29278400751360.0, + "grad_norm": 2.24722484247342, + "language_loss": 0.79379106, + "learning_rate": 2.7818426500779932e-06, + "loss": 0.81534874, + "num_input_tokens_seen": 139614080, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6503, + "time_per_iteration": 2.510356903076172 + }, + { + "auxiliary_loss_clip": 0.0111339, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02076924, + "balance_loss_mlp": 1.03882694, + "epoch": 0.39104163535247255, + "flos": 18951246328320.0, + "grad_norm": 1.8991979162106922, + "language_loss": 0.71695077, + "learning_rate": 2.7814841653401485e-06, + "loss": 0.73842514, + "num_input_tokens_seen": 139632755, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6504, + "time_per_iteration": 2.474257230758667 + }, + { + "auxiliary_loss_clip": 0.01116218, + "auxiliary_loss_mlp": 0.01038584, + "balance_loss_clip": 1.02404082, + "balance_loss_mlp": 1.03938556, + "epoch": 0.3911017586051405, + "flos": 26323347732480.0, + "grad_norm": 1.4403698273396093, + "language_loss": 0.83054864, + "learning_rate": 2.7811256509683454e-06, + "loss": 0.85209668, + "num_input_tokens_seen": 139654205, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.765625, + "step": 6505, + "time_per_iteration": 2.4917776584625244 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01039423, + "balance_loss_clip": 1.02379465, + "balance_loss_mlp": 1.04268944, + "epoch": 0.3911618818578085, + "flos": 21835770992640.0, + "grad_norm": 1.9728617659661118, + "language_loss": 0.71202552, + "learning_rate": 2.7807671069761797e-06, + "loss": 0.73360288, + "num_input_tokens_seen": 139673595, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7578125, + "step": 6506, + "time_per_iteration": 2.4846489429473877 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038977, + "balance_loss_clip": 1.02529216, + "balance_loss_mlp": 1.04129732, + "epoch": 0.3912220051104765, + "flos": 16359680989440.0, + "grad_norm": 2.0442674369719547, + "language_loss": 0.74914789, + "learning_rate": 2.7804085333772477e-06, + "loss": 0.77068931, + "num_input_tokens_seen": 139690565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6507, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01036092, + "auxiliary_loss_mlp": 0.01010532, + "balance_loss_clip": 1.00900638, + "balance_loss_mlp": 1.01097417, + "epoch": 0.39128212836314447, + "flos": 71050986420480.0, + "grad_norm": 0.7697412763639314, + "language_loss": 0.56554615, + "learning_rate": 2.7800499301851446e-06, + "loss": 0.58601236, + "num_input_tokens_seen": 139749420, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.25195312, + "step": 6508, + "time_per_iteration": 3.222599744796753 + }, + { + "auxiliary_loss_clip": 0.01118923, + "auxiliary_loss_mlp": 0.0103919, + "balance_loss_clip": 1.0256958, + "balance_loss_mlp": 1.04224479, + "epoch": 0.39134225161581243, + "flos": 20331975173760.0, + "grad_norm": 1.8903485988869968, + "language_loss": 0.7639432, + "learning_rate": 2.779691297413471e-06, + "loss": 0.78552431, + "num_input_tokens_seen": 139766265, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6509, + "time_per_iteration": 2.4504122734069824 + }, + { + "auxiliary_loss_clip": 0.01119308, + "auxiliary_loss_mlp": 0.01046298, + "balance_loss_clip": 1.02919126, + "balance_loss_mlp": 1.04120517, + "epoch": 0.3914023748684804, + "flos": 17018390551680.0, + "grad_norm": 2.5320410479027284, + "language_loss": 0.82538676, + "learning_rate": 2.779332635075825e-06, + "loss": 0.84704286, + "num_input_tokens_seen": 139782400, + "router_z_loss_clip": 0.17089844, + "router_z_loss_mlp": 0.78125, + "step": 6510, + "time_per_iteration": 2.4280829429626465 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01036031, + "balance_loss_clip": 1.02202439, + "balance_loss_mlp": 1.04137504, + "epoch": 0.39146249812114836, + "flos": 18405224709120.0, + "grad_norm": 1.9726874536239134, + "language_loss": 0.76478642, + "learning_rate": 2.7789739431858073e-06, + "loss": 0.78633761, + "num_input_tokens_seen": 139801435, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 6511, + "time_per_iteration": 2.438093662261963 + }, + { + "auxiliary_loss_clip": 0.01035954, + "auxiliary_loss_mlp": 0.01004811, + "balance_loss_clip": 1.0033921, + "balance_loss_mlp": 1.01070499, + "epoch": 0.3915226213738163, + "flos": 67637355442560.0, + "grad_norm": 0.7278620231464888, + "language_loss": 0.57780313, + "learning_rate": 2.7786152217570196e-06, + "loss": 0.59821081, + "num_input_tokens_seen": 139869700, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.25390625, + "step": 6512, + "time_per_iteration": 6.094903230667114 + }, + { + "auxiliary_loss_clip": 0.01120485, + "auxiliary_loss_mlp": 0.01036295, + "balance_loss_clip": 1.02039289, + "balance_loss_mlp": 1.04215658, + "epoch": 0.3915827446264843, + "flos": 26359330181760.0, + "grad_norm": 1.6857291908308145, + "language_loss": 0.69891763, + "learning_rate": 2.7782564708030647e-06, + "loss": 0.72048545, + "num_input_tokens_seen": 139890140, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.78125, + "step": 6513, + "time_per_iteration": 3.8939309120178223 + }, + { + "auxiliary_loss_clip": 0.01122702, + "auxiliary_loss_mlp": 0.01039276, + "balance_loss_clip": 1.02474439, + "balance_loss_mlp": 1.04184556, + "epoch": 0.39164286787915226, + "flos": 21943897908480.0, + "grad_norm": 2.2930968868818606, + "language_loss": 0.76267236, + "learning_rate": 2.7778976903375464e-06, + "loss": 0.7842921, + "num_input_tokens_seen": 139908020, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.80859375, + "step": 6514, + "time_per_iteration": 2.4622693061828613 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02168727, + "balance_loss_mlp": 1.04042864, + "epoch": 0.3917029911318202, + "flos": 16399829416320.0, + "grad_norm": 1.7838082674219136, + "language_loss": 0.77452338, + "learning_rate": 2.7775388803740693e-06, + "loss": 0.79606491, + "num_input_tokens_seen": 139926180, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6515, + "time_per_iteration": 2.4336462020874023 + }, + { + "auxiliary_loss_clip": 0.01114007, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02564025, + "balance_loss_mlp": 1.03940558, + "epoch": 0.3917631143844882, + "flos": 26211701283840.0, + "grad_norm": 1.4542421972503212, + "language_loss": 0.79846406, + "learning_rate": 2.7771800409262406e-06, + "loss": 0.81998634, + "num_input_tokens_seen": 139947420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 6516, + "time_per_iteration": 2.500826597213745 + }, + { + "auxiliary_loss_clip": 0.01118601, + "auxiliary_loss_mlp": 0.01033224, + "balance_loss_clip": 1.01891923, + "balance_loss_mlp": 1.04082477, + "epoch": 0.39182323763715615, + "flos": 18548364407040.0, + "grad_norm": 2.228742695866407, + "language_loss": 0.70205939, + "learning_rate": 2.7768211720076665e-06, + "loss": 0.72357762, + "num_input_tokens_seen": 139965800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 6517, + "time_per_iteration": 2.425739288330078 + }, + { + "auxiliary_loss_clip": 0.01117481, + "auxiliary_loss_mlp": 0.01036962, + "balance_loss_clip": 1.0218817, + "balance_loss_mlp": 1.03986263, + "epoch": 0.3918833608898241, + "flos": 34313543395200.0, + "grad_norm": 1.595983335780194, + "language_loss": 0.72092575, + "learning_rate": 2.776462273631956e-06, + "loss": 0.74247015, + "num_input_tokens_seen": 139988140, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6518, + "time_per_iteration": 2.559140205383301 + }, + { + "auxiliary_loss_clip": 0.01118745, + "auxiliary_loss_mlp": 0.01032386, + "balance_loss_clip": 1.0179677, + "balance_loss_mlp": 1.04041731, + "epoch": 0.3919434841424921, + "flos": 36939582812160.0, + "grad_norm": 1.563160017416143, + "language_loss": 0.61668754, + "learning_rate": 2.7761033458127177e-06, + "loss": 0.63819885, + "num_input_tokens_seen": 140010060, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6519, + "time_per_iteration": 2.5673322677612305 + }, + { + "auxiliary_loss_clip": 0.01124684, + "auxiliary_loss_mlp": 0.0104391, + "balance_loss_clip": 1.02800775, + "balance_loss_mlp": 1.04341698, + "epoch": 0.3920036073951601, + "flos": 23508956373120.0, + "grad_norm": 2.4564373100444232, + "language_loss": 0.6693083, + "learning_rate": 2.775744388563563e-06, + "loss": 0.6909942, + "num_input_tokens_seen": 140029400, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8125, + "step": 6520, + "time_per_iteration": 2.487650156021118 + }, + { + "auxiliary_loss_clip": 0.0111526, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.01958799, + "balance_loss_mlp": 1.03966665, + "epoch": 0.39206373064782807, + "flos": 18406086635520.0, + "grad_norm": 1.7599889377917473, + "language_loss": 0.78522319, + "learning_rate": 2.775385401898104e-06, + "loss": 0.80671263, + "num_input_tokens_seen": 140048940, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6521, + "time_per_iteration": 2.418458938598633 + }, + { + "auxiliary_loss_clip": 0.0112345, + "auxiliary_loss_mlp": 0.01036245, + "balance_loss_clip": 1.01853049, + "balance_loss_mlp": 1.04218912, + "epoch": 0.39212385390049603, + "flos": 12313051608960.0, + "grad_norm": 2.4256865138527353, + "language_loss": 0.70340407, + "learning_rate": 2.775026385829952e-06, + "loss": 0.7250011, + "num_input_tokens_seen": 140066380, + "router_z_loss_clip": 0.17773438, + "router_z_loss_mlp": 0.8125, + "step": 6522, + "time_per_iteration": 2.435802936553955 + }, + { + "auxiliary_loss_clip": 0.01120666, + "auxiliary_loss_mlp": 0.01034593, + "balance_loss_clip": 1.02013338, + "balance_loss_mlp": 1.04137838, + "epoch": 0.392183977153164, + "flos": 19719160214400.0, + "grad_norm": 1.8374103087918643, + "language_loss": 0.76740485, + "learning_rate": 2.774667340372722e-06, + "loss": 0.78895748, + "num_input_tokens_seen": 140085275, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6523, + "time_per_iteration": 2.4279329776763916 + }, + { + "auxiliary_loss_clip": 0.01120195, + "auxiliary_loss_mlp": 0.0103949, + "balance_loss_clip": 1.02522683, + "balance_loss_mlp": 1.04124415, + "epoch": 0.39224410040583196, + "flos": 33144902403840.0, + "grad_norm": 2.339335808739943, + "language_loss": 0.61661494, + "learning_rate": 2.7743082655400293e-06, + "loss": 0.63821173, + "num_input_tokens_seen": 140105105, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6524, + "time_per_iteration": 2.5591442584991455 + }, + { + "auxiliary_loss_clip": 0.01117506, + "auxiliary_loss_mlp": 0.0103718, + "balance_loss_clip": 1.02181363, + "balance_loss_mlp": 1.03898454, + "epoch": 0.39230422365849993, + "flos": 27782434097280.0, + "grad_norm": 1.6728206813409823, + "language_loss": 0.73940414, + "learning_rate": 2.773949161345489e-06, + "loss": 0.76095104, + "num_input_tokens_seen": 140125645, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78515625, + "step": 6525, + "time_per_iteration": 2.4897830486297607 + }, + { + "auxiliary_loss_clip": 0.01117533, + "auxiliary_loss_mlp": 0.01036063, + "balance_loss_clip": 1.02224112, + "balance_loss_mlp": 1.03882146, + "epoch": 0.3923643469111679, + "flos": 17931634865280.0, + "grad_norm": 2.0942212479104363, + "language_loss": 0.81385779, + "learning_rate": 2.773590027802719e-06, + "loss": 0.83539373, + "num_input_tokens_seen": 140141925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78515625, + "step": 6526, + "time_per_iteration": 2.442091226577759 + }, + { + "auxiliary_loss_clip": 0.01115953, + "auxiliary_loss_mlp": 0.01036718, + "balance_loss_clip": 1.02265131, + "balance_loss_mlp": 1.03931344, + "epoch": 0.39242447016383586, + "flos": 24059539019520.0, + "grad_norm": 1.56527231709598, + "language_loss": 0.69802964, + "learning_rate": 2.7732308649253383e-06, + "loss": 0.71955633, + "num_input_tokens_seen": 140160965, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6527, + "time_per_iteration": 2.465498924255371 + }, + { + "auxiliary_loss_clip": 0.01116064, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.0154264, + "balance_loss_mlp": 1.04067612, + "epoch": 0.3924845934165038, + "flos": 10664069016960.0, + "grad_norm": 2.4439619967755983, + "language_loss": 0.82215756, + "learning_rate": 2.772871672726965e-06, + "loss": 0.84361446, + "num_input_tokens_seen": 140177780, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6528, + "time_per_iteration": 2.488581418991089 + }, + { + "auxiliary_loss_clip": 0.01114295, + "auxiliary_loss_mlp": 0.0103716, + "balance_loss_clip": 1.02282465, + "balance_loss_mlp": 1.04024255, + "epoch": 0.3925447166691718, + "flos": 31245910174080.0, + "grad_norm": 1.4897772961790412, + "language_loss": 0.68726033, + "learning_rate": 2.7725124512212205e-06, + "loss": 0.70877492, + "num_input_tokens_seen": 140201660, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 6529, + "time_per_iteration": 2.5409562587738037 + }, + { + "auxiliary_loss_clip": 0.0111896, + "auxiliary_loss_mlp": 0.01039977, + "balance_loss_clip": 1.02561271, + "balance_loss_mlp": 1.04070282, + "epoch": 0.39260483992183975, + "flos": 29415040087680.0, + "grad_norm": 2.9003920421281926, + "language_loss": 0.79728955, + "learning_rate": 2.7721532004217267e-06, + "loss": 0.81887889, + "num_input_tokens_seen": 140218585, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 6530, + "time_per_iteration": 2.514547109603882 + }, + { + "auxiliary_loss_clip": 0.01112608, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02267241, + "balance_loss_mlp": 1.03750181, + "epoch": 0.3926649631745077, + "flos": 22857788666880.0, + "grad_norm": 1.6221630004730245, + "language_loss": 0.75564003, + "learning_rate": 2.7717939203421063e-06, + "loss": 0.77713549, + "num_input_tokens_seen": 140239905, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6531, + "time_per_iteration": 2.4572982788085938 + }, + { + "auxiliary_loss_clip": 0.01038893, + "auxiliary_loss_mlp": 0.0100286, + "balance_loss_clip": 1.00127435, + "balance_loss_mlp": 1.01370025, + "epoch": 0.3927250864271757, + "flos": 63893881872000.0, + "grad_norm": 0.8170127744653651, + "language_loss": 0.60378772, + "learning_rate": 2.7714346109959822e-06, + "loss": 0.62420523, + "num_input_tokens_seen": 140293820, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.25195312, + "step": 6532, + "time_per_iteration": 2.929732084274292 + }, + { + "auxiliary_loss_clip": 0.01036987, + "auxiliary_loss_mlp": 0.01003862, + "balance_loss_clip": 1.00225282, + "balance_loss_mlp": 1.0117799, + "epoch": 0.3927852096798437, + "flos": 68909741890560.0, + "grad_norm": 0.7837299971611431, + "language_loss": 0.55545104, + "learning_rate": 2.771075272396981e-06, + "loss": 0.57585955, + "num_input_tokens_seen": 140360420, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.25195312, + "step": 6533, + "time_per_iteration": 3.1820483207702637 + }, + { + "auxiliary_loss_clip": 0.01120735, + "auxiliary_loss_mlp": 0.01037419, + "balance_loss_clip": 1.02316761, + "balance_loss_mlp": 1.04170942, + "epoch": 0.39284533293251167, + "flos": 29715972232320.0, + "grad_norm": 1.9313522305780093, + "language_loss": 0.75972468, + "learning_rate": 2.7707159045587284e-06, + "loss": 0.78130615, + "num_input_tokens_seen": 140381950, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6534, + "time_per_iteration": 2.5650813579559326 + }, + { + "auxiliary_loss_clip": 0.01122617, + "auxiliary_loss_mlp": 0.01038287, + "balance_loss_clip": 1.02376163, + "balance_loss_mlp": 1.04177046, + "epoch": 0.39290545618517964, + "flos": 18552027594240.0, + "grad_norm": 2.213634574223379, + "language_loss": 0.78067005, + "learning_rate": 2.770356507494851e-06, + "loss": 0.802279, + "num_input_tokens_seen": 140399410, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 6535, + "time_per_iteration": 2.447950839996338 + }, + { + "auxiliary_loss_clip": 0.01113628, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.01950026, + "balance_loss_mlp": 1.03985262, + "epoch": 0.3929655794378476, + "flos": 26249479413120.0, + "grad_norm": 2.091132286884177, + "language_loss": 0.68613565, + "learning_rate": 2.769997081218978e-06, + "loss": 0.70759845, + "num_input_tokens_seen": 140419055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 6536, + "time_per_iteration": 2.4873242378234863 + }, + { + "auxiliary_loss_clip": 0.01112038, + "auxiliary_loss_mlp": 0.01035235, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.03908086, + "epoch": 0.39302570269051557, + "flos": 29277933874560.0, + "grad_norm": 1.7105256577096235, + "language_loss": 0.69052541, + "learning_rate": 2.769637625744738e-06, + "loss": 0.71199811, + "num_input_tokens_seen": 140438800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 6537, + "time_per_iteration": 2.5867457389831543 + }, + { + "auxiliary_loss_clip": 0.01117392, + "auxiliary_loss_mlp": 0.01038473, + "balance_loss_clip": 1.02420986, + "balance_loss_mlp": 1.04011965, + "epoch": 0.39308582594318353, + "flos": 17347440067200.0, + "grad_norm": 1.6628056753547982, + "language_loss": 0.79044384, + "learning_rate": 2.769278141085763e-06, + "loss": 0.81200254, + "num_input_tokens_seen": 140456880, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6538, + "time_per_iteration": 2.437757968902588 + }, + { + "auxiliary_loss_clip": 0.01034351, + "auxiliary_loss_mlp": 0.01009828, + "balance_loss_clip": 1.0084635, + "balance_loss_mlp": 1.00972295, + "epoch": 0.3931459491958515, + "flos": 61007094650880.0, + "grad_norm": 0.8042725449961473, + "language_loss": 0.61871827, + "learning_rate": 2.768918627255683e-06, + "loss": 0.63916004, + "num_input_tokens_seen": 140507510, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24609375, + "step": 6539, + "time_per_iteration": 2.9012601375579834 + }, + { + "auxiliary_loss_clip": 0.01115165, + "auxiliary_loss_mlp": 0.01038758, + "balance_loss_clip": 1.02417326, + "balance_loss_mlp": 1.03897023, + "epoch": 0.39320607244851946, + "flos": 39016009249920.0, + "grad_norm": 2.1025744829352306, + "language_loss": 0.68334043, + "learning_rate": 2.7685590842681315e-06, + "loss": 0.70487964, + "num_input_tokens_seen": 140528740, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6540, + "time_per_iteration": 2.617544412612915 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035483, + "balance_loss_clip": 1.02167249, + "balance_loss_mlp": 1.0387044, + "epoch": 0.3932661957011874, + "flos": 24679752180480.0, + "grad_norm": 1.7155589252050778, + "language_loss": 0.72714561, + "learning_rate": 2.7681995121367433e-06, + "loss": 0.74864328, + "num_input_tokens_seen": 140547560, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6541, + "time_per_iteration": 2.5576202869415283 + }, + { + "auxiliary_loss_clip": 0.01034882, + "auxiliary_loss_mlp": 0.01010056, + "balance_loss_clip": 1.00863171, + "balance_loss_mlp": 1.0103662, + "epoch": 0.3933263189538554, + "flos": 70096552185600.0, + "grad_norm": 0.8254504926360222, + "language_loss": 0.60302341, + "learning_rate": 2.7678399108751516e-06, + "loss": 0.62347269, + "num_input_tokens_seen": 140601175, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24511719, + "step": 6542, + "time_per_iteration": 2.921311378479004 + }, + { + "auxiliary_loss_clip": 0.01115263, + "auxiliary_loss_mlp": 0.01035713, + "balance_loss_clip": 1.02204013, + "balance_loss_mlp": 1.03968477, + "epoch": 0.39338644220652336, + "flos": 22929071207040.0, + "grad_norm": 1.9294145782355336, + "language_loss": 0.82255107, + "learning_rate": 2.7674802804969947e-06, + "loss": 0.84406084, + "num_input_tokens_seen": 140622200, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6543, + "time_per_iteration": 2.5267767906188965 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01035514, + "balance_loss_clip": 1.02153063, + "balance_loss_mlp": 1.03692436, + "epoch": 0.3934465654591913, + "flos": 30848163897600.0, + "grad_norm": 1.6066266241550669, + "language_loss": 0.69336796, + "learning_rate": 2.767120621015908e-06, + "loss": 0.7148419, + "num_input_tokens_seen": 140643125, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6544, + "time_per_iteration": 2.5192980766296387 + }, + { + "auxiliary_loss_clip": 0.01118616, + "auxiliary_loss_mlp": 0.01042644, + "balance_loss_clip": 1.02729011, + "balance_loss_mlp": 1.03997457, + "epoch": 0.3935066887118593, + "flos": 29236528471680.0, + "grad_norm": 1.880723151689185, + "language_loss": 0.75104976, + "learning_rate": 2.76676093244553e-06, + "loss": 0.77266246, + "num_input_tokens_seen": 140662500, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6545, + "time_per_iteration": 2.5483953952789307 + }, + { + "auxiliary_loss_clip": 0.01112383, + "auxiliary_loss_mlp": 0.01035537, + "balance_loss_clip": 1.02350879, + "balance_loss_mlp": 1.04072022, + "epoch": 0.3935668119645273, + "flos": 19135288638720.0, + "grad_norm": 1.4191511939867936, + "language_loss": 0.74600172, + "learning_rate": 2.7664012147995015e-06, + "loss": 0.76748097, + "num_input_tokens_seen": 140681960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 6546, + "time_per_iteration": 2.435189962387085 + }, + { + "auxiliary_loss_clip": 0.01120275, + "auxiliary_loss_mlp": 0.01037024, + "balance_loss_clip": 1.02256405, + "balance_loss_mlp": 1.03998446, + "epoch": 0.3936269352171953, + "flos": 18516116972160.0, + "grad_norm": 2.8050093889996326, + "language_loss": 0.81520575, + "learning_rate": 2.7660414680914617e-06, + "loss": 0.83677876, + "num_input_tokens_seen": 140699170, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 6547, + "time_per_iteration": 2.5359435081481934 + }, + { + "auxiliary_loss_clip": 0.0111424, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01444387, + "balance_loss_mlp": 1.03795588, + "epoch": 0.39368705846986324, + "flos": 15632813370240.0, + "grad_norm": 2.282095961224954, + "language_loss": 0.84300089, + "learning_rate": 2.7656816923350525e-06, + "loss": 0.86442673, + "num_input_tokens_seen": 140714920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6548, + "time_per_iteration": 2.430497407913208 + }, + { + "auxiliary_loss_clip": 0.01110548, + "auxiliary_loss_mlp": 0.01030679, + "balance_loss_clip": 1.01784039, + "balance_loss_mlp": 1.0382576, + "epoch": 0.3937471817225312, + "flos": 21325839563520.0, + "grad_norm": 1.5261467823901598, + "language_loss": 0.72481942, + "learning_rate": 2.7653218875439174e-06, + "loss": 0.74623168, + "num_input_tokens_seen": 140734595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6549, + "time_per_iteration": 2.484938383102417 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02025914, + "balance_loss_mlp": 1.04114747, + "epoch": 0.39380730497519917, + "flos": 20776693461120.0, + "grad_norm": 1.525417369659451, + "language_loss": 0.77678335, + "learning_rate": 2.764962053731699e-06, + "loss": 0.79829538, + "num_input_tokens_seen": 140754050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6550, + "time_per_iteration": 2.4533822536468506 + }, + { + "auxiliary_loss_clip": 0.01112095, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01695979, + "balance_loss_mlp": 1.03770638, + "epoch": 0.39386742822786713, + "flos": 21609784575360.0, + "grad_norm": 1.6825180459961226, + "language_loss": 0.81065381, + "learning_rate": 2.7646021909120434e-06, + "loss": 0.83207965, + "num_input_tokens_seen": 140771440, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6551, + "time_per_iteration": 2.4740419387817383 + }, + { + "auxiliary_loss_clip": 0.01115626, + "auxiliary_loss_mlp": 0.01037041, + "balance_loss_clip": 1.02310574, + "balance_loss_mlp": 1.03833413, + "epoch": 0.3939275514805351, + "flos": 12414642249600.0, + "grad_norm": 2.2350138021364003, + "language_loss": 0.80241704, + "learning_rate": 2.764242299098596e-06, + "loss": 0.82394373, + "num_input_tokens_seen": 140786715, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6552, + "time_per_iteration": 2.4066245555877686 + }, + { + "auxiliary_loss_clip": 0.01118032, + "auxiliary_loss_mlp": 0.01038605, + "balance_loss_clip": 1.02449059, + "balance_loss_mlp": 1.04108357, + "epoch": 0.39398767473320306, + "flos": 18552027594240.0, + "grad_norm": 2.2028177738118884, + "language_loss": 0.71154666, + "learning_rate": 2.763882378305003e-06, + "loss": 0.73311305, + "num_input_tokens_seen": 140804950, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 6553, + "time_per_iteration": 2.454035997390747 + }, + { + "auxiliary_loss_clip": 0.01115775, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.0409205, + "epoch": 0.39404779798587103, + "flos": 29308888419840.0, + "grad_norm": 1.9276274050376605, + "language_loss": 0.63445336, + "learning_rate": 2.7635224285449144e-06, + "loss": 0.65595293, + "num_input_tokens_seen": 140822800, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6554, + "time_per_iteration": 5.467530250549316 + }, + { + "auxiliary_loss_clip": 0.01116231, + "auxiliary_loss_mlp": 0.0103909, + "balance_loss_clip": 1.02620983, + "balance_loss_mlp": 1.041237, + "epoch": 0.394107921238539, + "flos": 34897055834880.0, + "grad_norm": 2.7325305725381703, + "language_loss": 0.79567587, + "learning_rate": 2.7631624498319796e-06, + "loss": 0.81722915, + "num_input_tokens_seen": 140842940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 6555, + "time_per_iteration": 3.9707608222961426 + }, + { + "auxiliary_loss_clip": 0.01119332, + "auxiliary_loss_mlp": 0.01036045, + "balance_loss_clip": 1.0209887, + "balance_loss_mlp": 1.04194546, + "epoch": 0.39416804449120696, + "flos": 25081413039360.0, + "grad_norm": 1.8303237809157376, + "language_loss": 0.71571302, + "learning_rate": 2.7628024421798473e-06, + "loss": 0.73726678, + "num_input_tokens_seen": 140863060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6556, + "time_per_iteration": 2.5013363361358643 + }, + { + "auxiliary_loss_clip": 0.01115996, + "auxiliary_loss_mlp": 0.0103255, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03954887, + "epoch": 0.3942281677438749, + "flos": 32306639731200.0, + "grad_norm": 2.056709462434603, + "language_loss": 0.83915412, + "learning_rate": 2.7624424056021705e-06, + "loss": 0.86063957, + "num_input_tokens_seen": 140883795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6557, + "time_per_iteration": 2.7162060737609863 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02195859, + "balance_loss_mlp": 1.04014397, + "epoch": 0.3942882909965429, + "flos": 24936621315840.0, + "grad_norm": 3.2694171829217953, + "language_loss": 0.80285048, + "learning_rate": 2.7620823401126004e-06, + "loss": 0.8243624, + "num_input_tokens_seen": 140903055, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6558, + "time_per_iteration": 2.466904401779175 + }, + { + "auxiliary_loss_clip": 0.01115408, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02037418, + "balance_loss_mlp": 1.04165912, + "epoch": 0.39434841424921085, + "flos": 11874797769600.0, + "grad_norm": 1.7254990423790144, + "language_loss": 0.71022832, + "learning_rate": 2.761722245724792e-06, + "loss": 0.73171461, + "num_input_tokens_seen": 140920685, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6559, + "time_per_iteration": 2.474142551422119 + }, + { + "auxiliary_loss_clip": 0.01120627, + "auxiliary_loss_mlp": 0.0103693, + "balance_loss_clip": 1.02111125, + "balance_loss_mlp": 1.04030299, + "epoch": 0.3944085375018789, + "flos": 16361620323840.0, + "grad_norm": 1.8853849407225942, + "language_loss": 0.80391413, + "learning_rate": 2.7613621224524003e-06, + "loss": 0.82548964, + "num_input_tokens_seen": 140937320, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6560, + "time_per_iteration": 2.4220218658447266 + }, + { + "auxiliary_loss_clip": 0.01121865, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.022223, + "balance_loss_mlp": 1.04395843, + "epoch": 0.39446866075454684, + "flos": 10633365866880.0, + "grad_norm": 3.2514761912447283, + "language_loss": 0.83440554, + "learning_rate": 2.7610019703090803e-06, + "loss": 0.85599601, + "num_input_tokens_seen": 140954855, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.78125, + "step": 6561, + "time_per_iteration": 2.458305835723877 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01038407, + "balance_loss_clip": 1.02458477, + "balance_loss_mlp": 1.04098439, + "epoch": 0.3945287840072148, + "flos": 18187498419840.0, + "grad_norm": 2.862241713271481, + "language_loss": 0.79548055, + "learning_rate": 2.7606417893084887e-06, + "loss": 0.81703943, + "num_input_tokens_seen": 140973250, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6562, + "time_per_iteration": 2.4390974044799805 + }, + { + "auxiliary_loss_clip": 0.01113935, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.02301359, + "balance_loss_mlp": 1.04043949, + "epoch": 0.39458890725988277, + "flos": 23039891642880.0, + "grad_norm": 1.512260767998718, + "language_loss": 0.81355608, + "learning_rate": 2.7602815794642853e-06, + "loss": 0.83506453, + "num_input_tokens_seen": 140993050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 6563, + "time_per_iteration": 2.518843650817871 + }, + { + "auxiliary_loss_clip": 0.0111742, + "auxiliary_loss_mlp": 0.01040253, + "balance_loss_clip": 1.02541161, + "balance_loss_mlp": 1.041682, + "epoch": 0.39464903051255074, + "flos": 17159052211200.0, + "grad_norm": 1.9438463538262531, + "language_loss": 0.69416577, + "learning_rate": 2.759921340790127e-06, + "loss": 0.71574247, + "num_input_tokens_seen": 141010815, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7578125, + "step": 6564, + "time_per_iteration": 2.446140766143799 + }, + { + "auxiliary_loss_clip": 0.01118477, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02079892, + "balance_loss_mlp": 1.04157352, + "epoch": 0.3947091537652187, + "flos": 15889000147200.0, + "grad_norm": 3.234298893133154, + "language_loss": 0.83141822, + "learning_rate": 2.759561073299676e-06, + "loss": 0.8529489, + "num_input_tokens_seen": 141028720, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6565, + "time_per_iteration": 2.474611520767212 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.02002859, + "balance_loss_mlp": 1.04039359, + "epoch": 0.39476927701788667, + "flos": 18545491319040.0, + "grad_norm": 1.7678460287206497, + "language_loss": 0.82917452, + "learning_rate": 2.7592007770065937e-06, + "loss": 0.85065943, + "num_input_tokens_seen": 141046025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 6566, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01122918, + "auxiliary_loss_mlp": 0.01038867, + "balance_loss_clip": 1.02493143, + "balance_loss_mlp": 1.04225016, + "epoch": 0.39482940027055463, + "flos": 22275712771200.0, + "grad_norm": 2.357536272997057, + "language_loss": 0.7778033, + "learning_rate": 2.7588404519245403e-06, + "loss": 0.79942119, + "num_input_tokens_seen": 141066865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.8046875, + "step": 6567, + "time_per_iteration": 2.5020110607147217 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01040399, + "balance_loss_clip": 1.02689242, + "balance_loss_mlp": 1.04026425, + "epoch": 0.3948895235232226, + "flos": 14757634494720.0, + "grad_norm": 2.0625384967809546, + "language_loss": 0.80381507, + "learning_rate": 2.758480098067182e-06, + "loss": 0.8253268, + "num_input_tokens_seen": 141084210, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 6568, + "time_per_iteration": 2.464186429977417 + }, + { + "auxiliary_loss_clip": 0.01116352, + "auxiliary_loss_mlp": 0.01036196, + "balance_loss_clip": 1.02282655, + "balance_loss_mlp": 1.04130197, + "epoch": 0.39494964677589056, + "flos": 22565763095040.0, + "grad_norm": 1.6625556258765348, + "language_loss": 0.84206939, + "learning_rate": 2.7581197154481816e-06, + "loss": 0.86359489, + "num_input_tokens_seen": 141103895, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 6569, + "time_per_iteration": 2.4947829246520996 + }, + { + "auxiliary_loss_clip": 0.01118805, + "auxiliary_loss_mlp": 0.01037428, + "balance_loss_clip": 1.02418959, + "balance_loss_mlp": 1.04450357, + "epoch": 0.3950097700285585, + "flos": 22963186149120.0, + "grad_norm": 1.920459843417803, + "language_loss": 0.74973899, + "learning_rate": 2.7577593040812066e-06, + "loss": 0.77130127, + "num_input_tokens_seen": 141124000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6570, + "time_per_iteration": 2.50211763381958 + }, + { + "auxiliary_loss_clip": 0.01117011, + "auxiliary_loss_mlp": 0.01037708, + "balance_loss_clip": 1.02365923, + "balance_loss_mlp": 1.04104555, + "epoch": 0.3950698932812265, + "flos": 20595236929920.0, + "grad_norm": 1.649568183340291, + "language_loss": 0.79813123, + "learning_rate": 2.757398863979922e-06, + "loss": 0.81967843, + "num_input_tokens_seen": 141142535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6571, + "time_per_iteration": 2.477740526199341 + }, + { + "auxiliary_loss_clip": 0.01116017, + "auxiliary_loss_mlp": 0.01041795, + "balance_loss_clip": 1.02846146, + "balance_loss_mlp": 1.04203689, + "epoch": 0.39513001653389446, + "flos": 20375786787840.0, + "grad_norm": 1.628324795196944, + "language_loss": 0.77873337, + "learning_rate": 2.757038395157997e-06, + "loss": 0.80031145, + "num_input_tokens_seen": 141161575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6572, + "time_per_iteration": 2.4463839530944824 + }, + { + "auxiliary_loss_clip": 0.01118116, + "auxiliary_loss_mlp": 0.01040167, + "balance_loss_clip": 1.02636874, + "balance_loss_mlp": 1.0404911, + "epoch": 0.3951901397865625, + "flos": 26463650256000.0, + "grad_norm": 1.6456702645470058, + "language_loss": 0.7506038, + "learning_rate": 2.7566778976291002e-06, + "loss": 0.77218664, + "num_input_tokens_seen": 141181150, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6573, + "time_per_iteration": 2.501692295074463 + }, + { + "auxiliary_loss_clip": 0.01114036, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02165031, + "balance_loss_mlp": 1.04046559, + "epoch": 0.39525026303923044, + "flos": 43838345767680.0, + "grad_norm": 1.4003162240803297, + "language_loss": 0.67956495, + "learning_rate": 2.7563173714069017e-06, + "loss": 0.70104533, + "num_input_tokens_seen": 141206310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 6574, + "time_per_iteration": 2.6566920280456543 + }, + { + "auxiliary_loss_clip": 0.01119799, + "auxiliary_loss_mlp": 0.01034669, + "balance_loss_clip": 1.01978612, + "balance_loss_mlp": 1.04216623, + "epoch": 0.3953103862918984, + "flos": 18040803275520.0, + "grad_norm": 2.170019312223073, + "language_loss": 0.71719187, + "learning_rate": 2.755956816505072e-06, + "loss": 0.73873657, + "num_input_tokens_seen": 141223925, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6575, + "time_per_iteration": 2.463792085647583 + }, + { + "auxiliary_loss_clip": 0.01119276, + "auxiliary_loss_mlp": 0.01042807, + "balance_loss_clip": 1.02859664, + "balance_loss_mlp": 1.04105997, + "epoch": 0.3953705095445664, + "flos": 16976015481600.0, + "grad_norm": 2.0080051897694324, + "language_loss": 0.73535955, + "learning_rate": 2.7555962329372845e-06, + "loss": 0.75698036, + "num_input_tokens_seen": 141239010, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6576, + "time_per_iteration": 2.409817934036255 + }, + { + "auxiliary_loss_clip": 0.01115385, + "auxiliary_loss_mlp": 0.01036906, + "balance_loss_clip": 1.0243237, + "balance_loss_mlp": 1.03979337, + "epoch": 0.39543063279723434, + "flos": 17411144837760.0, + "grad_norm": 2.36733568983198, + "language_loss": 0.83294857, + "learning_rate": 2.7552356207172124e-06, + "loss": 0.8544715, + "num_input_tokens_seen": 141252255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7578125, + "step": 6577, + "time_per_iteration": 2.4421181678771973 + }, + { + "auxiliary_loss_clip": 0.01115466, + "auxiliary_loss_mlp": 0.01031962, + "balance_loss_clip": 1.01860428, + "balance_loss_mlp": 1.04138541, + "epoch": 0.3954907560499023, + "flos": 22784207656320.0, + "grad_norm": 3.8530294325048984, + "language_loss": 0.89916354, + "learning_rate": 2.75487497985853e-06, + "loss": 0.92063785, + "num_input_tokens_seen": 141269325, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6578, + "time_per_iteration": 2.470369577407837 + }, + { + "auxiliary_loss_clip": 0.01118987, + "auxiliary_loss_mlp": 0.01037124, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04030561, + "epoch": 0.39555087930257027, + "flos": 21944400698880.0, + "grad_norm": 1.7408596896151103, + "language_loss": 0.77871025, + "learning_rate": 2.7545143103749117e-06, + "loss": 0.80027139, + "num_input_tokens_seen": 141288505, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6579, + "time_per_iteration": 2.4619040489196777 + }, + { + "auxiliary_loss_clip": 0.01119633, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01760054, + "balance_loss_mlp": 1.0407021, + "epoch": 0.39561100255523823, + "flos": 20404622430720.0, + "grad_norm": 2.037188254408411, + "language_loss": 0.68324131, + "learning_rate": 2.754153612280037e-06, + "loss": 0.70475388, + "num_input_tokens_seen": 141303680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6580, + "time_per_iteration": 2.4363577365875244 + }, + { + "auxiliary_loss_clip": 0.01115068, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01499939, + "balance_loss_mlp": 1.04099488, + "epoch": 0.3956711258079062, + "flos": 27964572986880.0, + "grad_norm": 1.613777567548473, + "language_loss": 0.58620721, + "learning_rate": 2.7537928855875797e-06, + "loss": 0.60764229, + "num_input_tokens_seen": 141324090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 6581, + "time_per_iteration": 2.5704734325408936 + }, + { + "auxiliary_loss_clip": 0.01118807, + "auxiliary_loss_mlp": 0.01038995, + "balance_loss_clip": 1.02479148, + "balance_loss_mlp": 1.04165769, + "epoch": 0.39573124906057416, + "flos": 14428297670400.0, + "grad_norm": 2.015576445189345, + "language_loss": 0.698632, + "learning_rate": 2.7534321303112224e-06, + "loss": 0.72021002, + "num_input_tokens_seen": 141342235, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6582, + "time_per_iteration": 2.4640939235687256 + }, + { + "auxiliary_loss_clip": 0.01118406, + "auxiliary_loss_mlp": 0.01035395, + "balance_loss_clip": 1.02167404, + "balance_loss_mlp": 1.0415566, + "epoch": 0.39579137231324213, + "flos": 18733699607040.0, + "grad_norm": 2.285451965985758, + "language_loss": 0.76454568, + "learning_rate": 2.753071346464642e-06, + "loss": 0.78608364, + "num_input_tokens_seen": 141361195, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6583, + "time_per_iteration": 2.437396287918091 + }, + { + "auxiliary_loss_clip": 0.01118401, + "auxiliary_loss_mlp": 0.01030837, + "balance_loss_clip": 1.01708043, + "balance_loss_mlp": 1.04192805, + "epoch": 0.3958514955659101, + "flos": 17676417755520.0, + "grad_norm": 1.5685917359515968, + "language_loss": 0.65989023, + "learning_rate": 2.7527105340615207e-06, + "loss": 0.68138266, + "num_input_tokens_seen": 141378275, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6584, + "time_per_iteration": 2.4562485218048096 + }, + { + "auxiliary_loss_clip": 0.01120331, + "auxiliary_loss_mlp": 0.01037784, + "balance_loss_clip": 1.02262115, + "balance_loss_mlp": 1.04122627, + "epoch": 0.39591161881857806, + "flos": 29309103901440.0, + "grad_norm": 2.6735523944320136, + "language_loss": 0.72423065, + "learning_rate": 2.7523496931155413e-06, + "loss": 0.74581182, + "num_input_tokens_seen": 141396960, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 6585, + "time_per_iteration": 2.517333984375 + }, + { + "auxiliary_loss_clip": 0.0111653, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01811159, + "balance_loss_mlp": 1.04010367, + "epoch": 0.3959717420712461, + "flos": 25771831332480.0, + "grad_norm": 1.986310622320223, + "language_loss": 0.73430967, + "learning_rate": 2.7519888236403856e-06, + "loss": 0.75579244, + "num_input_tokens_seen": 141417320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6586, + "time_per_iteration": 2.513847827911377 + }, + { + "auxiliary_loss_clip": 0.01117404, + "auxiliary_loss_mlp": 0.01031194, + "balance_loss_clip": 1.01738322, + "balance_loss_mlp": 1.04139459, + "epoch": 0.39603186532391405, + "flos": 20923783655040.0, + "grad_norm": 2.2420315368265915, + "language_loss": 0.71627617, + "learning_rate": 2.7516279256497382e-06, + "loss": 0.73776209, + "num_input_tokens_seen": 141435985, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6587, + "time_per_iteration": 2.498534917831421 + }, + { + "auxiliary_loss_clip": 0.01038457, + "auxiliary_loss_mlp": 0.01003592, + "balance_loss_clip": 1.0020541, + "balance_loss_mlp": 1.01416993, + "epoch": 0.396091988576582, + "flos": 54880986176640.0, + "grad_norm": 0.9067384171744824, + "language_loss": 0.61162889, + "learning_rate": 2.751266999157285e-06, + "loss": 0.63204944, + "num_input_tokens_seen": 141486075, + "router_z_loss_clip": 0.01531982, + "router_z_loss_mlp": 0.2421875, + "step": 6588, + "time_per_iteration": 2.9129557609558105 + }, + { + "auxiliary_loss_clip": 0.01117429, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.0215075, + "balance_loss_mlp": 1.04087436, + "epoch": 0.39615211182925, + "flos": 20702896968960.0, + "grad_norm": 1.9745840784771536, + "language_loss": 0.81579673, + "learning_rate": 2.7509060441767115e-06, + "loss": 0.83732545, + "num_input_tokens_seen": 141505280, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6589, + "time_per_iteration": 2.487581253051758 + }, + { + "auxiliary_loss_clip": 0.01118186, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01858449, + "balance_loss_mlp": 1.04102254, + "epoch": 0.39621223508191794, + "flos": 20994312009600.0, + "grad_norm": 2.0157149751951606, + "language_loss": 0.70171028, + "learning_rate": 2.7505450607217057e-06, + "loss": 0.72322464, + "num_input_tokens_seen": 141523930, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6590, + "time_per_iteration": 2.4837629795074463 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01039001, + "balance_loss_clip": 1.02517259, + "balance_loss_mlp": 1.04276454, + "epoch": 0.3962723583345859, + "flos": 23368833417600.0, + "grad_norm": 1.6568331410473631, + "language_loss": 0.76061213, + "learning_rate": 2.750184048805956e-06, + "loss": 0.7821902, + "num_input_tokens_seen": 141541320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6591, + "time_per_iteration": 2.574401617050171 + }, + { + "auxiliary_loss_clip": 0.01119076, + "auxiliary_loss_mlp": 0.0104207, + "balance_loss_clip": 1.02803326, + "balance_loss_mlp": 1.04253912, + "epoch": 0.39633248158725387, + "flos": 25115599808640.0, + "grad_norm": 1.7800794685008139, + "language_loss": 0.79121935, + "learning_rate": 2.749823008443152e-06, + "loss": 0.81283081, + "num_input_tokens_seen": 141561880, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6592, + "time_per_iteration": 2.5065057277679443 + }, + { + "auxiliary_loss_clip": 0.01112832, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.01945305, + "balance_loss_mlp": 1.04020298, + "epoch": 0.39639260483992184, + "flos": 39787622236800.0, + "grad_norm": 1.6584377020479992, + "language_loss": 0.69372392, + "learning_rate": 2.7494619396469843e-06, + "loss": 0.71518123, + "num_input_tokens_seen": 141586460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6593, + "time_per_iteration": 2.691351890563965 + }, + { + "auxiliary_loss_clip": 0.01119923, + "auxiliary_loss_mlp": 0.01038681, + "balance_loss_clip": 1.02389932, + "balance_loss_mlp": 1.04100418, + "epoch": 0.3964527280925898, + "flos": 17347045017600.0, + "grad_norm": 1.6545825162449217, + "language_loss": 0.77913815, + "learning_rate": 2.7491008424311452e-06, + "loss": 0.80072421, + "num_input_tokens_seen": 141605955, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 6594, + "time_per_iteration": 2.452536106109619 + }, + { + "auxiliary_loss_clip": 0.01038921, + "auxiliary_loss_mlp": 0.01002091, + "balance_loss_clip": 1.0005945, + "balance_loss_mlp": 1.0146898, + "epoch": 0.39651285134525777, + "flos": 71717848369920.0, + "grad_norm": 0.9454940833877284, + "language_loss": 0.63038307, + "learning_rate": 2.7487397168093265e-06, + "loss": 0.65079319, + "num_input_tokens_seen": 141673140, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.2421875, + "step": 6595, + "time_per_iteration": 6.018520355224609 + }, + { + "auxiliary_loss_clip": 0.01121925, + "auxiliary_loss_mlp": 0.01044146, + "balance_loss_clip": 1.02908421, + "balance_loss_mlp": 1.04294038, + "epoch": 0.39657297459792573, + "flos": 25775710001280.0, + "grad_norm": 2.072222886004575, + "language_loss": 0.6329869, + "learning_rate": 2.748378562795223e-06, + "loss": 0.65464759, + "num_input_tokens_seen": 141692955, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 6596, + "time_per_iteration": 5.302752494812012 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01035004, + "balance_loss_clip": 1.02110457, + "balance_loss_mlp": 1.04157937, + "epoch": 0.3966330978505937, + "flos": 20266115587200.0, + "grad_norm": 2.0492451282774273, + "language_loss": 0.78553772, + "learning_rate": 2.7480173804025293e-06, + "loss": 0.80704355, + "num_input_tokens_seen": 141710680, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6597, + "time_per_iteration": 2.457028388977051 + }, + { + "auxiliary_loss_clip": 0.01121814, + "auxiliary_loss_mlp": 0.01040279, + "balance_loss_clip": 1.02558672, + "balance_loss_mlp": 1.04262114, + "epoch": 0.39669322110326166, + "flos": 20631183465600.0, + "grad_norm": 1.95592503590265, + "language_loss": 0.67559552, + "learning_rate": 2.747656169644941e-06, + "loss": 0.69721651, + "num_input_tokens_seen": 141729860, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6598, + "time_per_iteration": 2.4448981285095215 + }, + { + "auxiliary_loss_clip": 0.01117545, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02153933, + "balance_loss_mlp": 1.0411458, + "epoch": 0.3967533443559297, + "flos": 21726063878400.0, + "grad_norm": 2.3323846151329235, + "language_loss": 0.78922117, + "learning_rate": 2.747294930536157e-06, + "loss": 0.81074429, + "num_input_tokens_seen": 141749060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6599, + "time_per_iteration": 2.4799394607543945 + }, + { + "auxiliary_loss_clip": 0.01117884, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.04196167, + "epoch": 0.39681346760859765, + "flos": 25484151306240.0, + "grad_norm": 1.67964508136209, + "language_loss": 0.72716624, + "learning_rate": 2.7469336630898737e-06, + "loss": 0.74866593, + "num_input_tokens_seen": 141769860, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7578125, + "step": 6600, + "time_per_iteration": 2.4940543174743652 + }, + { + "auxiliary_loss_clip": 0.01115602, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01864827, + "balance_loss_mlp": 1.03997052, + "epoch": 0.3968735908612656, + "flos": 20959586536320.0, + "grad_norm": 1.9442093512958227, + "language_loss": 0.85773253, + "learning_rate": 2.746572367319791e-06, + "loss": 0.87920988, + "num_input_tokens_seen": 141788465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6601, + "time_per_iteration": 2.4826369285583496 + }, + { + "auxiliary_loss_clip": 0.01124374, + "auxiliary_loss_mlp": 0.01038225, + "balance_loss_clip": 1.02191091, + "balance_loss_mlp": 1.04298782, + "epoch": 0.3969337141139336, + "flos": 10707090531840.0, + "grad_norm": 2.3202277168625054, + "language_loss": 0.70015699, + "learning_rate": 2.7462110432396095e-06, + "loss": 0.72178292, + "num_input_tokens_seen": 141804955, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.8125, + "step": 6602, + "time_per_iteration": 2.4452199935913086 + }, + { + "auxiliary_loss_clip": 0.01119686, + "auxiliary_loss_mlp": 0.01038286, + "balance_loss_clip": 1.02458847, + "balance_loss_mlp": 1.04225206, + "epoch": 0.39699383736660154, + "flos": 17593714690560.0, + "grad_norm": 2.564497124514123, + "language_loss": 0.83408487, + "learning_rate": 2.7458496908630305e-06, + "loss": 0.85566461, + "num_input_tokens_seen": 141820025, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6603, + "time_per_iteration": 2.50046968460083 + }, + { + "auxiliary_loss_clip": 0.01115539, + "auxiliary_loss_mlp": 0.01032527, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.04076076, + "epoch": 0.3970539606192695, + "flos": 17785945301760.0, + "grad_norm": 1.4733286794124776, + "language_loss": 0.72804213, + "learning_rate": 2.7454883102037563e-06, + "loss": 0.74952281, + "num_input_tokens_seen": 141838735, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6604, + "time_per_iteration": 2.435645580291748 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01037208, + "balance_loss_clip": 1.02366602, + "balance_loss_mlp": 1.0427258, + "epoch": 0.3971140838719375, + "flos": 24789495208320.0, + "grad_norm": 1.694386771997249, + "language_loss": 0.82919562, + "learning_rate": 2.745126901275491e-06, + "loss": 0.85070789, + "num_input_tokens_seen": 141858090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 6605, + "time_per_iteration": 2.538792371749878 + }, + { + "auxiliary_loss_clip": 0.01113567, + "auxiliary_loss_mlp": 0.0103244, + "balance_loss_clip": 1.02053654, + "balance_loss_mlp": 1.04017544, + "epoch": 0.39717420712460544, + "flos": 24243581329920.0, + "grad_norm": 1.515379376113219, + "language_loss": 0.73755872, + "learning_rate": 2.7447654640919383e-06, + "loss": 0.75901884, + "num_input_tokens_seen": 141877540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 6606, + "time_per_iteration": 2.4766290187835693 + }, + { + "auxiliary_loss_clip": 0.0111968, + "auxiliary_loss_mlp": 0.01034451, + "balance_loss_clip": 1.0207423, + "balance_loss_mlp": 1.04279184, + "epoch": 0.3972343303772734, + "flos": 25884698843520.0, + "grad_norm": 1.9669838489657716, + "language_loss": 0.73925817, + "learning_rate": 2.744403998666805e-06, + "loss": 0.76079941, + "num_input_tokens_seen": 141897315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 6607, + "time_per_iteration": 2.550140380859375 + }, + { + "auxiliary_loss_clip": 0.01121372, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02045417, + "balance_loss_mlp": 1.04417753, + "epoch": 0.39729445362994137, + "flos": 45623716300800.0, + "grad_norm": 1.5241940789626238, + "language_loss": 0.67978024, + "learning_rate": 2.744042505013797e-06, + "loss": 0.70133507, + "num_input_tokens_seen": 141919580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 6608, + "time_per_iteration": 2.70333194732666 + }, + { + "auxiliary_loss_clip": 0.01120221, + "auxiliary_loss_mlp": 0.01042402, + "balance_loss_clip": 1.0263803, + "balance_loss_mlp": 1.04247403, + "epoch": 0.39735457688260933, + "flos": 20193971120640.0, + "grad_norm": 2.3779993769587486, + "language_loss": 0.74649572, + "learning_rate": 2.7436809831466233e-06, + "loss": 0.76812196, + "num_input_tokens_seen": 141937045, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.77734375, + "step": 6609, + "time_per_iteration": 2.4810678958892822 + }, + { + "auxiliary_loss_clip": 0.01119236, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.01909387, + "balance_loss_mlp": 1.04284418, + "epoch": 0.3974147001352773, + "flos": 23331163029120.0, + "grad_norm": 4.182923272039756, + "language_loss": 0.71530509, + "learning_rate": 2.7433194330789927e-06, + "loss": 0.73682511, + "num_input_tokens_seen": 141956695, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6610, + "time_per_iteration": 2.483358860015869 + }, + { + "auxiliary_loss_clip": 0.01110348, + "auxiliary_loss_mlp": 0.0103254, + "balance_loss_clip": 1.01881909, + "balance_loss_mlp": 1.03868747, + "epoch": 0.39747482338794526, + "flos": 21688644885120.0, + "grad_norm": 1.6591621928280806, + "language_loss": 0.7848928, + "learning_rate": 2.7429578548246133e-06, + "loss": 0.80632162, + "num_input_tokens_seen": 141975935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 6611, + "time_per_iteration": 2.4707412719726562 + }, + { + "auxiliary_loss_clip": 0.01120047, + "auxiliary_loss_mlp": 0.01036907, + "balance_loss_clip": 1.0234127, + "balance_loss_mlp": 1.04496026, + "epoch": 0.3975349466406133, + "flos": 30988717816320.0, + "grad_norm": 1.7910222988347433, + "language_loss": 0.78681552, + "learning_rate": 2.7425962483971985e-06, + "loss": 0.80838501, + "num_input_tokens_seen": 141995750, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6612, + "time_per_iteration": 2.552384614944458 + }, + { + "auxiliary_loss_clip": 0.01042423, + "auxiliary_loss_mlp": 0.01023175, + "balance_loss_clip": 1.02180374, + "balance_loss_mlp": 1.01794136, + "epoch": 0.39759506989328125, + "flos": 63683948833920.0, + "grad_norm": 0.8703127674216669, + "language_loss": 0.64956641, + "learning_rate": 2.742234613810459e-06, + "loss": 0.6702224, + "num_input_tokens_seen": 142057655, + "router_z_loss_clip": 0.01373291, + "router_z_loss_mlp": 0.24414062, + "step": 6613, + "time_per_iteration": 2.978494882583618 + }, + { + "auxiliary_loss_clip": 0.01116625, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01683927, + "balance_loss_mlp": 1.04148316, + "epoch": 0.3976551931459492, + "flos": 23695835857920.0, + "grad_norm": 2.0550022834902797, + "language_loss": 0.71538055, + "learning_rate": 2.741872951078109e-06, + "loss": 0.73685759, + "num_input_tokens_seen": 142076020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75390625, + "step": 6614, + "time_per_iteration": 2.4898061752319336 + }, + { + "auxiliary_loss_clip": 0.01116246, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01644266, + "balance_loss_mlp": 1.04124689, + "epoch": 0.3977153163986172, + "flos": 15669657745920.0, + "grad_norm": 1.8540793086422767, + "language_loss": 0.81317735, + "learning_rate": 2.741511260213862e-06, + "loss": 0.83464336, + "num_input_tokens_seen": 142093790, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6615, + "time_per_iteration": 2.4708592891693115 + }, + { + "auxiliary_loss_clip": 0.01116463, + "auxiliary_loss_mlp": 0.01033552, + "balance_loss_clip": 1.02074313, + "balance_loss_mlp": 1.04221725, + "epoch": 0.39777543965128515, + "flos": 14064702249600.0, + "grad_norm": 2.466828000769562, + "language_loss": 0.67015827, + "learning_rate": 2.741149541231434e-06, + "loss": 0.69165838, + "num_input_tokens_seen": 142110545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 6616, + "time_per_iteration": 2.4453790187835693 + }, + { + "auxiliary_loss_clip": 0.01120268, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02032995, + "balance_loss_mlp": 1.04185963, + "epoch": 0.3978355629039531, + "flos": 23367468700800.0, + "grad_norm": 2.097035382924748, + "language_loss": 0.83857769, + "learning_rate": 2.740787794144541e-06, + "loss": 0.86012185, + "num_input_tokens_seen": 142128695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6617, + "time_per_iteration": 2.4740309715270996 + }, + { + "auxiliary_loss_clip": 0.01113934, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02256477, + "balance_loss_mlp": 1.04305041, + "epoch": 0.3978956861566211, + "flos": 19062785036160.0, + "grad_norm": 1.6139116519566428, + "language_loss": 0.72253633, + "learning_rate": 2.7404260189669e-06, + "loss": 0.74403095, + "num_input_tokens_seen": 142148375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 6618, + "time_per_iteration": 2.451362371444702 + }, + { + "auxiliary_loss_clip": 0.01117142, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.04263783, + "epoch": 0.39795580940928904, + "flos": 30227699341440.0, + "grad_norm": 1.9091502235972209, + "language_loss": 0.65847683, + "learning_rate": 2.740064215712231e-06, + "loss": 0.6800065, + "num_input_tokens_seen": 142169735, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.74609375, + "step": 6619, + "time_per_iteration": 2.5479021072387695 + }, + { + "auxiliary_loss_clip": 0.01041684, + "auxiliary_loss_mlp": 0.00999907, + "balance_loss_clip": 0.99843466, + "balance_loss_mlp": 1.0170114, + "epoch": 0.398015932661957, + "flos": 69847224906240.0, + "grad_norm": 0.7720250582246381, + "language_loss": 0.58222711, + "learning_rate": 2.7397023843942527e-06, + "loss": 0.60264301, + "num_input_tokens_seen": 142229520, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.24609375, + "step": 6620, + "time_per_iteration": 3.0502688884735107 + }, + { + "auxiliary_loss_clip": 0.01116159, + "auxiliary_loss_mlp": 0.01036059, + "balance_loss_clip": 1.02383971, + "balance_loss_mlp": 1.04254556, + "epoch": 0.39807605591462497, + "flos": 20157773189760.0, + "grad_norm": 1.5861085047038441, + "language_loss": 0.79551339, + "learning_rate": 2.739340525026686e-06, + "loss": 0.81703556, + "num_input_tokens_seen": 142247660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 6621, + "time_per_iteration": 2.4595162868499756 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01709294, + "balance_loss_mlp": 1.04198873, + "epoch": 0.39813617916729294, + "flos": 21141761339520.0, + "grad_norm": 1.9955210259775171, + "language_loss": 0.78070045, + "learning_rate": 2.738978637623252e-06, + "loss": 0.80215347, + "num_input_tokens_seen": 142266990, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 6622, + "time_per_iteration": 2.487805128097534 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01030398, + "balance_loss_clip": 1.01685607, + "balance_loss_mlp": 1.04132223, + "epoch": 0.3981963024199609, + "flos": 18988485753600.0, + "grad_norm": 1.5290489885204759, + "language_loss": 0.75010175, + "learning_rate": 2.738616722197674e-06, + "loss": 0.77156758, + "num_input_tokens_seen": 142287170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6623, + "time_per_iteration": 2.464571714401245 + }, + { + "auxiliary_loss_clip": 0.01116211, + "auxiliary_loss_mlp": 0.01036455, + "balance_loss_clip": 1.02278805, + "balance_loss_mlp": 1.04220378, + "epoch": 0.39825642567262887, + "flos": 16575108808320.0, + "grad_norm": 1.7278538768787957, + "language_loss": 0.79535556, + "learning_rate": 2.7382547787636766e-06, + "loss": 0.81688213, + "num_input_tokens_seen": 142305405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 6624, + "time_per_iteration": 2.4550037384033203 + }, + { + "auxiliary_loss_clip": 0.01120431, + "auxiliary_loss_mlp": 0.01041321, + "balance_loss_clip": 1.02627707, + "balance_loss_mlp": 1.04234707, + "epoch": 0.39831654892529683, + "flos": 22199833290240.0, + "grad_norm": 2.035642441182755, + "language_loss": 0.83558613, + "learning_rate": 2.7378928073349832e-06, + "loss": 0.85720372, + "num_input_tokens_seen": 142322710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6625, + "time_per_iteration": 2.456171989440918 + }, + { + "auxiliary_loss_clip": 0.01114643, + "auxiliary_loss_mlp": 0.01042231, + "balance_loss_clip": 1.02839124, + "balance_loss_mlp": 1.04085207, + "epoch": 0.39837667217796485, + "flos": 10487963612160.0, + "grad_norm": 2.051687002705142, + "language_loss": 0.86593187, + "learning_rate": 2.737530807925321e-06, + "loss": 0.88750064, + "num_input_tokens_seen": 142338535, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6626, + "time_per_iteration": 2.4335460662841797 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01036656, + "balance_loss_clip": 1.02238643, + "balance_loss_mlp": 1.04094946, + "epoch": 0.3984367954306328, + "flos": 17965282930560.0, + "grad_norm": 2.3900066005878386, + "language_loss": 0.83897698, + "learning_rate": 2.737168780548417e-06, + "loss": 0.86049473, + "num_input_tokens_seen": 142354570, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7421875, + "step": 6627, + "time_per_iteration": 2.4269766807556152 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01038178, + "balance_loss_clip": 1.02514243, + "balance_loss_mlp": 1.03955984, + "epoch": 0.3984969186833008, + "flos": 22711057608960.0, + "grad_norm": 1.4398151096773946, + "language_loss": 0.82760668, + "learning_rate": 2.736806725217998e-06, + "loss": 0.8491019, + "num_input_tokens_seen": 142374395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6628, + "time_per_iteration": 2.529315948486328 + }, + { + "auxiliary_loss_clip": 0.01115476, + "auxiliary_loss_mlp": 0.01040725, + "balance_loss_clip": 1.027421, + "balance_loss_mlp": 1.04130399, + "epoch": 0.39855704193596875, + "flos": 23405785534080.0, + "grad_norm": 1.8256672588255014, + "language_loss": 0.70683473, + "learning_rate": 2.7364446419477945e-06, + "loss": 0.72839677, + "num_input_tokens_seen": 142396040, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6629, + "time_per_iteration": 2.5025413036346436 + }, + { + "auxiliary_loss_clip": 0.01114279, + "auxiliary_loss_mlp": 0.01035106, + "balance_loss_clip": 1.02155161, + "balance_loss_mlp": 1.04309297, + "epoch": 0.3986171651886367, + "flos": 21251935330560.0, + "grad_norm": 4.278612279497538, + "language_loss": 0.80683714, + "learning_rate": 2.7360825307515366e-06, + "loss": 0.82833099, + "num_input_tokens_seen": 142415495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 6630, + "time_per_iteration": 2.4792280197143555 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01485634, + "balance_loss_mlp": 1.04143131, + "epoch": 0.3986772884413047, + "flos": 12458705258880.0, + "grad_norm": 1.8749880656247468, + "language_loss": 0.75354141, + "learning_rate": 2.7357203916429555e-06, + "loss": 0.7749849, + "num_input_tokens_seen": 142431865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6631, + "time_per_iteration": 2.417546272277832 + }, + { + "auxiliary_loss_clip": 0.0111705, + "auxiliary_loss_mlp": 0.01035263, + "balance_loss_clip": 1.0218699, + "balance_loss_mlp": 1.04246461, + "epoch": 0.39873741169397264, + "flos": 19646117907840.0, + "grad_norm": 2.3246230169523194, + "language_loss": 0.7156167, + "learning_rate": 2.735358224635783e-06, + "loss": 0.73713982, + "num_input_tokens_seen": 142450595, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 6632, + "time_per_iteration": 2.446089744567871 + }, + { + "auxiliary_loss_clip": 0.01111142, + "auxiliary_loss_mlp": 0.01037094, + "balance_loss_clip": 1.02449358, + "balance_loss_mlp": 1.03939462, + "epoch": 0.3987975349466406, + "flos": 21684766216320.0, + "grad_norm": 1.8450465759001686, + "language_loss": 0.74742806, + "learning_rate": 2.7349960297437533e-06, + "loss": 0.76891041, + "num_input_tokens_seen": 142466650, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6633, + "time_per_iteration": 2.431104898452759 + }, + { + "auxiliary_loss_clip": 0.011138, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01455402, + "balance_loss_mlp": 1.03961205, + "epoch": 0.3988576581993086, + "flos": 23914064937600.0, + "grad_norm": 1.781985159362602, + "language_loss": 0.808864, + "learning_rate": 2.7346338069806e-06, + "loss": 0.83027852, + "num_input_tokens_seen": 142486165, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 6634, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.01856947, + "balance_loss_mlp": 1.04252565, + "epoch": 0.39891778145197654, + "flos": 18149899858560.0, + "grad_norm": 1.7295196741572958, + "language_loss": 0.74605262, + "learning_rate": 2.7342715563600597e-06, + "loss": 0.7675429, + "num_input_tokens_seen": 142505035, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6635, + "time_per_iteration": 2.4630682468414307 + }, + { + "auxiliary_loss_clip": 0.01120499, + "auxiliary_loss_mlp": 0.01044274, + "balance_loss_clip": 1.02930093, + "balance_loss_mlp": 1.04096711, + "epoch": 0.3989779047046445, + "flos": 22595281096320.0, + "grad_norm": 1.9670463450002986, + "language_loss": 0.66429746, + "learning_rate": 2.733909277895868e-06, + "loss": 0.68594521, + "num_input_tokens_seen": 142521870, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6636, + "time_per_iteration": 2.454789876937866 + }, + { + "auxiliary_loss_clip": 0.0111332, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02131867, + "balance_loss_mlp": 1.0403626, + "epoch": 0.39903802795731247, + "flos": 18077216688000.0, + "grad_norm": 1.695302941119513, + "language_loss": 0.81410646, + "learning_rate": 2.733546971601763e-06, + "loss": 0.83558261, + "num_input_tokens_seen": 142540455, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6637, + "time_per_iteration": 5.387745380401611 + }, + { + "auxiliary_loss_clip": 0.01040567, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99893045, + "balance_loss_mlp": 1.0159328, + "epoch": 0.39909815120998043, + "flos": 70441367771520.0, + "grad_norm": 0.7139106827959352, + "language_loss": 0.53211641, + "learning_rate": 2.733184637491484e-06, + "loss": 0.55252659, + "num_input_tokens_seen": 142599665, + "router_z_loss_clip": 0.01513672, + "router_z_loss_mlp": 0.24609375, + "step": 6638, + "time_per_iteration": 4.465191125869751 + }, + { + "auxiliary_loss_clip": 0.01114413, + "auxiliary_loss_mlp": 0.01035276, + "balance_loss_clip": 1.02260959, + "balance_loss_mlp": 1.04064405, + "epoch": 0.39915827446264845, + "flos": 18549262247040.0, + "grad_norm": 1.9403504228046689, + "language_loss": 0.75377512, + "learning_rate": 2.732822275578769e-06, + "loss": 0.77527201, + "num_input_tokens_seen": 142618845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6639, + "time_per_iteration": 2.4947104454040527 + }, + { + "auxiliary_loss_clip": 0.01112086, + "auxiliary_loss_mlp": 0.01030627, + "balance_loss_clip": 1.01788926, + "balance_loss_mlp": 1.04078937, + "epoch": 0.3992183977153164, + "flos": 29897249195520.0, + "grad_norm": 1.632879790681491, + "language_loss": 0.76217377, + "learning_rate": 2.7324598858773603e-06, + "loss": 0.78360093, + "num_input_tokens_seen": 142640885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 6640, + "time_per_iteration": 2.524815320968628 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02448368, + "balance_loss_mlp": 1.03855717, + "epoch": 0.3992785209679844, + "flos": 22565080736640.0, + "grad_norm": 2.5962495804033794, + "language_loss": 0.82264209, + "learning_rate": 2.7320974684009996e-06, + "loss": 0.84414506, + "num_input_tokens_seen": 142659340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6641, + "time_per_iteration": 2.4753921031951904 + }, + { + "auxiliary_loss_clip": 0.01116915, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.01891971, + "balance_loss_mlp": 1.04188418, + "epoch": 0.39933864422065235, + "flos": 19682674974720.0, + "grad_norm": 2.015070946619467, + "language_loss": 0.7685014, + "learning_rate": 2.7317350231634288e-06, + "loss": 0.78999245, + "num_input_tokens_seen": 142677085, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6642, + "time_per_iteration": 2.431239604949951 + }, + { + "auxiliary_loss_clip": 0.01114257, + "auxiliary_loss_mlp": 0.0103328, + "balance_loss_clip": 1.019642, + "balance_loss_mlp": 1.03963089, + "epoch": 0.3993987674733203, + "flos": 23038491012480.0, + "grad_norm": 2.2960488262105145, + "language_loss": 0.7247656, + "learning_rate": 2.731372550178393e-06, + "loss": 0.74624097, + "num_input_tokens_seen": 142694595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6643, + "time_per_iteration": 2.4759740829467773 + }, + { + "auxiliary_loss_clip": 0.01115242, + "auxiliary_loss_mlp": 0.01035377, + "balance_loss_clip": 1.0214113, + "balance_loss_mlp": 1.04014993, + "epoch": 0.3994588907259883, + "flos": 19390828970880.0, + "grad_norm": 1.5171926718970592, + "language_loss": 0.65988386, + "learning_rate": 2.7310100494596375e-06, + "loss": 0.68139005, + "num_input_tokens_seen": 142714175, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6644, + "time_per_iteration": 2.437404155731201 + }, + { + "auxiliary_loss_clip": 0.01113182, + "auxiliary_loss_mlp": 0.01037023, + "balance_loss_clip": 1.0235281, + "balance_loss_mlp": 1.0386616, + "epoch": 0.39951901397865625, + "flos": 13734395758080.0, + "grad_norm": 1.956427678643188, + "language_loss": 0.78470129, + "learning_rate": 2.730647521020907e-06, + "loss": 0.80620331, + "num_input_tokens_seen": 142730955, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6645, + "time_per_iteration": 2.44826078414917 + }, + { + "auxiliary_loss_clip": 0.01115381, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02034187, + "balance_loss_mlp": 1.04042077, + "epoch": 0.3995791372313242, + "flos": 23586451966080.0, + "grad_norm": 1.409098570486763, + "language_loss": 0.69889182, + "learning_rate": 2.73028496487595e-06, + "loss": 0.72038329, + "num_input_tokens_seen": 142751200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6646, + "time_per_iteration": 2.4746181964874268 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01035502, + "balance_loss_clip": 1.0222578, + "balance_loss_mlp": 1.03869605, + "epoch": 0.3996392604839922, + "flos": 21355896268800.0, + "grad_norm": 1.7478077072518943, + "language_loss": 0.72165501, + "learning_rate": 2.729922381038513e-06, + "loss": 0.74314553, + "num_input_tokens_seen": 142770170, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6647, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.01108545, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02576208, + "balance_loss_mlp": 1.03874063, + "epoch": 0.39969938373666014, + "flos": 26032255914240.0, + "grad_norm": 1.4937426139380796, + "language_loss": 0.74371958, + "learning_rate": 2.7295597695223463e-06, + "loss": 0.76518434, + "num_input_tokens_seen": 142792680, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 6648, + "time_per_iteration": 2.4970345497131348 + }, + { + "auxiliary_loss_clip": 0.01115329, + "auxiliary_loss_mlp": 0.01036503, + "balance_loss_clip": 1.02300286, + "balance_loss_mlp": 1.04061389, + "epoch": 0.3997595069893281, + "flos": 20116367786880.0, + "grad_norm": 2.209642859907432, + "language_loss": 0.66124469, + "learning_rate": 2.7291971303412006e-06, + "loss": 0.68276298, + "num_input_tokens_seen": 142810510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6649, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01116294, + "auxiliary_loss_mlp": 0.01036155, + "balance_loss_clip": 1.02280378, + "balance_loss_mlp": 1.0420115, + "epoch": 0.39981963024199607, + "flos": 27783403764480.0, + "grad_norm": 1.57860522688022, + "language_loss": 0.75273359, + "learning_rate": 2.728834463508826e-06, + "loss": 0.77425814, + "num_input_tokens_seen": 142832455, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6650, + "time_per_iteration": 2.5091254711151123 + }, + { + "auxiliary_loss_clip": 0.01112744, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02397823, + "balance_loss_mlp": 1.03905869, + "epoch": 0.39987975349466404, + "flos": 21944436612480.0, + "grad_norm": 1.4583647344722164, + "language_loss": 0.71954048, + "learning_rate": 2.728471769038975e-06, + "loss": 0.74104279, + "num_input_tokens_seen": 142852590, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 6651, + "time_per_iteration": 2.4820897579193115 + }, + { + "auxiliary_loss_clip": 0.01113579, + "auxiliary_loss_mlp": 0.01035523, + "balance_loss_clip": 1.02220726, + "balance_loss_mlp": 1.03815126, + "epoch": 0.39993987674733206, + "flos": 20704405340160.0, + "grad_norm": 1.787132664616244, + "language_loss": 0.72906494, + "learning_rate": 2.728109046945403e-06, + "loss": 0.75055599, + "num_input_tokens_seen": 142870595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6652, + "time_per_iteration": 2.4568119049072266 + }, + { + "auxiliary_loss_clip": 0.01039541, + "auxiliary_loss_mlp": 0.00999581, + "balance_loss_clip": 0.99819815, + "balance_loss_mlp": 1.01483345, + "epoch": 0.4, + "flos": 61525429862400.0, + "grad_norm": 0.8299860195083637, + "language_loss": 0.61066198, + "learning_rate": 2.727746297241862e-06, + "loss": 0.63105321, + "num_input_tokens_seen": 142925805, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24707031, + "step": 6653, + "time_per_iteration": 3.0071723461151123 + }, + { + "auxiliary_loss_clip": 0.01113323, + "auxiliary_loss_mlp": 0.01038964, + "balance_loss_clip": 1.02607179, + "balance_loss_mlp": 1.04303741, + "epoch": 0.400060123252668, + "flos": 14502309644160.0, + "grad_norm": 2.127427836980077, + "language_loss": 0.67038172, + "learning_rate": 2.7273835199421085e-06, + "loss": 0.6919046, + "num_input_tokens_seen": 142943145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 6654, + "time_per_iteration": 2.442049026489258 + }, + { + "auxiliary_loss_clip": 0.01113347, + "auxiliary_loss_mlp": 0.01039111, + "balance_loss_clip": 1.02741051, + "balance_loss_mlp": 1.03887355, + "epoch": 0.40012024650533595, + "flos": 19093308618240.0, + "grad_norm": 2.299433298478917, + "language_loss": 0.89737195, + "learning_rate": 2.7270207150599e-06, + "loss": 0.91889656, + "num_input_tokens_seen": 142956925, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.74609375, + "step": 6655, + "time_per_iteration": 2.4836323261260986 + }, + { + "auxiliary_loss_clip": 0.01110377, + "auxiliary_loss_mlp": 0.0102991, + "balance_loss_clip": 1.01865685, + "balance_loss_mlp": 1.04077053, + "epoch": 0.4001803697580039, + "flos": 29351012094720.0, + "grad_norm": 1.5855954082229138, + "language_loss": 0.73497427, + "learning_rate": 2.7266578826089917e-06, + "loss": 0.75637716, + "num_input_tokens_seen": 142978040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6953125, + "step": 6656, + "time_per_iteration": 2.5071847438812256 + }, + { + "auxiliary_loss_clip": 0.01116544, + "auxiliary_loss_mlp": 0.01046403, + "balance_loss_clip": 1.03248513, + "balance_loss_mlp": 1.04179835, + "epoch": 0.4002404930106719, + "flos": 20920048640640.0, + "grad_norm": 1.4675228136273628, + "language_loss": 0.7344414, + "learning_rate": 2.726295022603144e-06, + "loss": 0.75607085, + "num_input_tokens_seen": 142998390, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 6657, + "time_per_iteration": 2.575587034225464 + }, + { + "auxiliary_loss_clip": 0.01116565, + "auxiliary_loss_mlp": 0.0103855, + "balance_loss_clip": 1.02432823, + "balance_loss_mlp": 1.04162562, + "epoch": 0.40030061626333985, + "flos": 28405735827840.0, + "grad_norm": 1.4527474123065993, + "language_loss": 0.79588759, + "learning_rate": 2.725932135056117e-06, + "loss": 0.81743878, + "num_input_tokens_seen": 143021505, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 6658, + "time_per_iteration": 2.7093567848205566 + }, + { + "auxiliary_loss_clip": 0.01115311, + "auxiliary_loss_mlp": 0.01041911, + "balance_loss_clip": 1.02917993, + "balance_loss_mlp": 1.0406971, + "epoch": 0.4003607395160078, + "flos": 25921615046400.0, + "grad_norm": 1.8904694620172307, + "language_loss": 0.77345288, + "learning_rate": 2.72556921998167e-06, + "loss": 0.79502499, + "num_input_tokens_seen": 143041375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 6659, + "time_per_iteration": 2.5323445796966553 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028537, + "balance_loss_clip": 1.01713443, + "balance_loss_mlp": 1.03853416, + "epoch": 0.4004208627686758, + "flos": 20768648814720.0, + "grad_norm": 1.7715585064718242, + "language_loss": 0.72642064, + "learning_rate": 2.7252062773935662e-06, + "loss": 0.7477653, + "num_input_tokens_seen": 143058725, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 6660, + "time_per_iteration": 2.4459004402160645 + }, + { + "auxiliary_loss_clip": 0.01113964, + "auxiliary_loss_mlp": 0.01039676, + "balance_loss_clip": 1.02753496, + "balance_loss_mlp": 1.04069686, + "epoch": 0.40048098602134374, + "flos": 24681224638080.0, + "grad_norm": 1.7053131194953803, + "language_loss": 0.70897067, + "learning_rate": 2.7248433073055674e-06, + "loss": 0.73050702, + "num_input_tokens_seen": 143076995, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.73046875, + "step": 6661, + "time_per_iteration": 2.5339720249176025 + }, + { + "auxiliary_loss_clip": 0.011183, + "auxiliary_loss_mlp": 0.01041337, + "balance_loss_clip": 1.02808094, + "balance_loss_mlp": 1.04304504, + "epoch": 0.4005411092740117, + "flos": 23185688947200.0, + "grad_norm": 1.7756888608898216, + "language_loss": 0.75688839, + "learning_rate": 2.724480309731437e-06, + "loss": 0.77848476, + "num_input_tokens_seen": 143096780, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6662, + "time_per_iteration": 2.4546353816986084 + }, + { + "auxiliary_loss_clip": 0.01115994, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.01979184, + "balance_loss_mlp": 1.03956914, + "epoch": 0.4006012325266797, + "flos": 17522324409600.0, + "grad_norm": 2.0032115325237076, + "language_loss": 0.66019243, + "learning_rate": 2.7241172846849417e-06, + "loss": 0.68168688, + "num_input_tokens_seen": 143112590, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6663, + "time_per_iteration": 2.4437708854675293 + }, + { + "auxiliary_loss_clip": 0.01115313, + "auxiliary_loss_mlp": 0.01036965, + "balance_loss_clip": 1.02409601, + "balance_loss_mlp": 1.0406127, + "epoch": 0.40066135577934764, + "flos": 19857200181120.0, + "grad_norm": 2.5671112933527542, + "language_loss": 0.85808247, + "learning_rate": 2.7237542321798455e-06, + "loss": 0.87960517, + "num_input_tokens_seen": 143130220, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6664, + "time_per_iteration": 2.423644781112671 + }, + { + "auxiliary_loss_clip": 0.01116399, + "auxiliary_loss_mlp": 0.01033701, + "balance_loss_clip": 1.02062321, + "balance_loss_mlp": 1.04155052, + "epoch": 0.40072147903201566, + "flos": 18150007599360.0, + "grad_norm": 1.9940684324093096, + "language_loss": 0.84890211, + "learning_rate": 2.723391152229917e-06, + "loss": 0.87040305, + "num_input_tokens_seen": 143147160, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6665, + "time_per_iteration": 2.4386377334594727 + }, + { + "auxiliary_loss_clip": 0.01119995, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.04381645, + "epoch": 0.4007816022846836, + "flos": 18661267831680.0, + "grad_norm": 1.7199178144884215, + "language_loss": 0.78264785, + "learning_rate": 2.7230280448489236e-06, + "loss": 0.8041926, + "num_input_tokens_seen": 143164605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6666, + "time_per_iteration": 2.434093952178955 + }, + { + "auxiliary_loss_clip": 0.01118396, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02121019, + "balance_loss_mlp": 1.04240537, + "epoch": 0.4008417255373516, + "flos": 25703170485120.0, + "grad_norm": 1.6354204552723763, + "language_loss": 0.73558462, + "learning_rate": 2.7226649100506333e-06, + "loss": 0.75712276, + "num_input_tokens_seen": 143183965, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6667, + "time_per_iteration": 2.520869255065918 + }, + { + "auxiliary_loss_clip": 0.01117838, + "auxiliary_loss_mlp": 0.0104414, + "balance_loss_clip": 1.02944148, + "balance_loss_mlp": 1.04147649, + "epoch": 0.40090184879001955, + "flos": 22858614679680.0, + "grad_norm": 1.370510933760038, + "language_loss": 0.75832677, + "learning_rate": 2.7223017478488183e-06, + "loss": 0.77994657, + "num_input_tokens_seen": 143204965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6668, + "time_per_iteration": 2.475261688232422 + }, + { + "auxiliary_loss_clip": 0.0111899, + "auxiliary_loss_mlp": 0.01038268, + "balance_loss_clip": 1.02470183, + "balance_loss_mlp": 1.04511833, + "epoch": 0.4009619720426875, + "flos": 29059848449280.0, + "grad_norm": 1.7348003262037657, + "language_loss": 0.82309943, + "learning_rate": 2.721938558257248e-06, + "loss": 0.84467208, + "num_input_tokens_seen": 143225015, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73828125, + "step": 6669, + "time_per_iteration": 2.530458927154541 + }, + { + "auxiliary_loss_clip": 0.0103961, + "auxiliary_loss_mlp": 0.01001267, + "balance_loss_clip": 0.99993151, + "balance_loss_mlp": 1.01565075, + "epoch": 0.4010220952953555, + "flos": 66059763131520.0, + "grad_norm": 0.698912500879513, + "language_loss": 0.53386176, + "learning_rate": 2.721575341289695e-06, + "loss": 0.55427051, + "num_input_tokens_seen": 143294925, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 6670, + "time_per_iteration": 3.247837781906128 + }, + { + "auxiliary_loss_clip": 0.01115169, + "auxiliary_loss_mlp": 0.01037927, + "balance_loss_clip": 1.02476037, + "balance_loss_mlp": 1.0415678, + "epoch": 0.40108221854802345, + "flos": 29642822184960.0, + "grad_norm": 1.8543411810419943, + "language_loss": 0.88405877, + "learning_rate": 2.7212120969599333e-06, + "loss": 0.9055897, + "num_input_tokens_seen": 143314170, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6671, + "time_per_iteration": 2.5657830238342285 + }, + { + "auxiliary_loss_clip": 0.01115344, + "auxiliary_loss_mlp": 0.01034806, + "balance_loss_clip": 1.02088797, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4011423418006914, + "flos": 19929560129280.0, + "grad_norm": 1.813982967664466, + "language_loss": 0.78926146, + "learning_rate": 2.720848825281736e-06, + "loss": 0.81076294, + "num_input_tokens_seen": 143330050, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 6672, + "time_per_iteration": 2.444209337234497 + }, + { + "auxiliary_loss_clip": 0.01110996, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01829374, + "balance_loss_mlp": 1.03889108, + "epoch": 0.4012024650533594, + "flos": 20084299920000.0, + "grad_norm": 1.9086088279717175, + "language_loss": 0.63218224, + "learning_rate": 2.72048552626888e-06, + "loss": 0.65360266, + "num_input_tokens_seen": 143348650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 6673, + "time_per_iteration": 2.577171564102173 + }, + { + "auxiliary_loss_clip": 0.01114754, + "auxiliary_loss_mlp": 0.01033396, + "balance_loss_clip": 1.02027059, + "balance_loss_mlp": 1.0399313, + "epoch": 0.40126258830602735, + "flos": 21695719864320.0, + "grad_norm": 1.4529148407259798, + "language_loss": 0.80390126, + "learning_rate": 2.7201221999351402e-06, + "loss": 0.82538271, + "num_input_tokens_seen": 143370275, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6674, + "time_per_iteration": 2.5402464866638184 + }, + { + "auxiliary_loss_clip": 0.01119667, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04199886, + "epoch": 0.4013227115586953, + "flos": 12020379592320.0, + "grad_norm": 2.6082453610380574, + "language_loss": 0.82641548, + "learning_rate": 2.719758846294294e-06, + "loss": 0.84791422, + "num_input_tokens_seen": 143385390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6675, + "time_per_iteration": 2.4605085849761963 + }, + { + "auxiliary_loss_clip": 0.0111374, + "auxiliary_loss_mlp": 0.01032911, + "balance_loss_clip": 1.0189873, + "balance_loss_mlp": 1.04002738, + "epoch": 0.4013828348113633, + "flos": 25447522412160.0, + "grad_norm": 1.7135878896985557, + "language_loss": 0.93308246, + "learning_rate": 2.71939546536012e-06, + "loss": 0.95454895, + "num_input_tokens_seen": 143404215, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6676, + "time_per_iteration": 2.496168851852417 + }, + { + "auxiliary_loss_clip": 0.01121217, + "auxiliary_loss_mlp": 0.01039781, + "balance_loss_clip": 1.02516031, + "balance_loss_mlp": 1.04100275, + "epoch": 0.40144295806403124, + "flos": 18582946225920.0, + "grad_norm": 4.942241320167032, + "language_loss": 0.79622304, + "learning_rate": 2.719032057146399e-06, + "loss": 0.81783295, + "num_input_tokens_seen": 143422245, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.80078125, + "step": 6677, + "time_per_iteration": 2.4565844535827637 + }, + { + "auxiliary_loss_clip": 0.01116689, + "auxiliary_loss_mlp": 0.01032915, + "balance_loss_clip": 1.01977801, + "balance_loss_mlp": 1.0429368, + "epoch": 0.4015030813166992, + "flos": 22930220442240.0, + "grad_norm": 3.7422980142657374, + "language_loss": 0.83766311, + "learning_rate": 2.71866862166691e-06, + "loss": 0.85915917, + "num_input_tokens_seen": 143443130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 6678, + "time_per_iteration": 4.12173318862915 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01037817, + "balance_loss_clip": 1.02480578, + "balance_loss_mlp": 1.04150224, + "epoch": 0.4015632045693672, + "flos": 20595057361920.0, + "grad_norm": 2.988298740497095, + "language_loss": 0.63948399, + "learning_rate": 2.718305158935434e-06, + "loss": 0.66100478, + "num_input_tokens_seen": 143461385, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6679, + "time_per_iteration": 5.297976016998291 + }, + { + "auxiliary_loss_clip": 0.01112719, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01653934, + "balance_loss_mlp": 1.04000115, + "epoch": 0.4016233278220352, + "flos": 23438930808960.0, + "grad_norm": 1.456514191681199, + "language_loss": 0.78654617, + "learning_rate": 2.7179416689657554e-06, + "loss": 0.80796885, + "num_input_tokens_seen": 143481750, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 6680, + "time_per_iteration": 2.467042922973633 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.0104553, + "balance_loss_clip": 1.03065872, + "balance_loss_mlp": 1.04160023, + "epoch": 0.40168345107470316, + "flos": 21431057477760.0, + "grad_norm": 1.6886011670643926, + "language_loss": 0.75628668, + "learning_rate": 2.7175781517716556e-06, + "loss": 0.77793747, + "num_input_tokens_seen": 143501540, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78125, + "step": 6681, + "time_per_iteration": 2.579265594482422 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01727676, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4017435743273711, + "flos": 22857214049280.0, + "grad_norm": 2.058228157074571, + "language_loss": 0.64001781, + "learning_rate": 2.7172146073669213e-06, + "loss": 0.66150093, + "num_input_tokens_seen": 143520530, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 6682, + "time_per_iteration": 2.4423694610595703 + }, + { + "auxiliary_loss_clip": 0.01115099, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.01953304, + "balance_loss_mlp": 1.03868985, + "epoch": 0.4018036975800391, + "flos": 28622312881920.0, + "grad_norm": 1.6867457181896433, + "language_loss": 0.73334014, + "learning_rate": 2.716851035765337e-06, + "loss": 0.75481766, + "num_input_tokens_seen": 143540210, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6683, + "time_per_iteration": 2.5543196201324463 + }, + { + "auxiliary_loss_clip": 0.01113172, + "auxiliary_loss_mlp": 0.01043204, + "balance_loss_clip": 1.02971554, + "balance_loss_mlp": 1.03814459, + "epoch": 0.40186382083270705, + "flos": 26651212099200.0, + "grad_norm": 1.6157462356379846, + "language_loss": 0.73054385, + "learning_rate": 2.7164874369806896e-06, + "loss": 0.75210762, + "num_input_tokens_seen": 143560940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6684, + "time_per_iteration": 2.584984302520752 + }, + { + "auxiliary_loss_clip": 0.01036703, + "auxiliary_loss_mlp": 0.01002873, + "balance_loss_clip": 1.0016098, + "balance_loss_mlp": 1.01262808, + "epoch": 0.401923944085375, + "flos": 59259969123840.0, + "grad_norm": 0.8051502477983452, + "language_loss": 0.60442972, + "learning_rate": 2.716123811026767e-06, + "loss": 0.62482548, + "num_input_tokens_seen": 143624015, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.24023438, + "step": 6685, + "time_per_iteration": 3.2001583576202393 + }, + { + "auxiliary_loss_clip": 0.01118672, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.0184474, + "balance_loss_mlp": 1.0410161, + "epoch": 0.401984067338043, + "flos": 16982803152000.0, + "grad_norm": 2.1343445795660956, + "language_loss": 0.69979215, + "learning_rate": 2.715760157917357e-06, + "loss": 0.72130144, + "num_input_tokens_seen": 143642750, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6686, + "time_per_iteration": 2.486487627029419 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01784527, + "balance_loss_mlp": 1.03917289, + "epoch": 0.40204419059071095, + "flos": 24972496024320.0, + "grad_norm": 1.4076322562781298, + "language_loss": 0.74622524, + "learning_rate": 2.7153964776662504e-06, + "loss": 0.76766562, + "num_input_tokens_seen": 143664515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6687, + "time_per_iteration": 2.4854915142059326 + }, + { + "auxiliary_loss_clip": 0.01117283, + "auxiliary_loss_mlp": 0.01035998, + "balance_loss_clip": 1.02219915, + "balance_loss_mlp": 1.04146934, + "epoch": 0.4021043138433789, + "flos": 23477463123840.0, + "grad_norm": 1.852699339351418, + "language_loss": 0.70648831, + "learning_rate": 2.7150327702872385e-06, + "loss": 0.72802114, + "num_input_tokens_seen": 143683135, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 6688, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01117224, + "auxiliary_loss_mlp": 0.0103962, + "balance_loss_clip": 1.02558923, + "balance_loss_mlp": 1.0390867, + "epoch": 0.4021644370960469, + "flos": 25995806588160.0, + "grad_norm": 1.7360862235805987, + "language_loss": 0.64509618, + "learning_rate": 2.7146690357941112e-06, + "loss": 0.6666646, + "num_input_tokens_seen": 143703985, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6689, + "time_per_iteration": 2.5217337608337402 + }, + { + "auxiliary_loss_clip": 0.01117214, + "auxiliary_loss_mlp": 0.01033972, + "balance_loss_clip": 1.02059698, + "balance_loss_mlp": 1.03956485, + "epoch": 0.40222456034871484, + "flos": 13587987922560.0, + "grad_norm": 2.322807889185569, + "language_loss": 0.7306338, + "learning_rate": 2.7143052742006632e-06, + "loss": 0.75214565, + "num_input_tokens_seen": 143719245, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.77734375, + "step": 6690, + "time_per_iteration": 2.421478509902954 + }, + { + "auxiliary_loss_clip": 0.01114039, + "auxiliary_loss_mlp": 0.01036823, + "balance_loss_clip": 1.02357256, + "balance_loss_mlp": 1.03967643, + "epoch": 0.4022846836013828, + "flos": 24278019494400.0, + "grad_norm": 1.4867559931284213, + "language_loss": 0.74789405, + "learning_rate": 2.7139414855206872e-06, + "loss": 0.76940262, + "num_input_tokens_seen": 143739575, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6691, + "time_per_iteration": 2.5322606563568115 + }, + { + "auxiliary_loss_clip": 0.01119421, + "auxiliary_loss_mlp": 0.01038807, + "balance_loss_clip": 1.02530634, + "balance_loss_mlp": 1.04281604, + "epoch": 0.40234480685405083, + "flos": 20151596050560.0, + "grad_norm": 1.5836527032457117, + "language_loss": 0.72676492, + "learning_rate": 2.7135776697679785e-06, + "loss": 0.74834728, + "num_input_tokens_seen": 143758515, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6692, + "time_per_iteration": 2.486466407775879 + }, + { + "auxiliary_loss_clip": 0.01115579, + "auxiliary_loss_mlp": 0.01037632, + "balance_loss_clip": 1.02444792, + "balance_loss_mlp": 1.039814, + "epoch": 0.4024049301067188, + "flos": 22930220442240.0, + "grad_norm": 1.7516389520719526, + "language_loss": 0.83851349, + "learning_rate": 2.7132138269563333e-06, + "loss": 0.86004555, + "num_input_tokens_seen": 143776770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6693, + "time_per_iteration": 2.5068037509918213 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.02325296, + "balance_loss_mlp": 1.04313457, + "epoch": 0.40246505335938676, + "flos": 36028421487360.0, + "grad_norm": 1.699829604816944, + "language_loss": 0.71295136, + "learning_rate": 2.7128499570995483e-06, + "loss": 0.73450321, + "num_input_tokens_seen": 143798450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6694, + "time_per_iteration": 2.5704145431518555 + }, + { + "auxiliary_loss_clip": 0.01114045, + "auxiliary_loss_mlp": 0.01038433, + "balance_loss_clip": 1.02460432, + "balance_loss_mlp": 1.03981924, + "epoch": 0.4025251766120547, + "flos": 20594303176320.0, + "grad_norm": 2.0155422945498223, + "language_loss": 0.67754763, + "learning_rate": 2.7124860602114212e-06, + "loss": 0.69907242, + "num_input_tokens_seen": 143816995, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 6695, + "time_per_iteration": 2.4664762020111084 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01736653, + "balance_loss_mlp": 1.03826809, + "epoch": 0.4025852998647227, + "flos": 64523932381440.0, + "grad_norm": 2.459399840574827, + "language_loss": 0.79355788, + "learning_rate": 2.7121221363057515e-06, + "loss": 0.81499356, + "num_input_tokens_seen": 143842090, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6696, + "time_per_iteration": 2.883577346801758 + }, + { + "auxiliary_loss_clip": 0.01118448, + "auxiliary_loss_mlp": 0.0103721, + "balance_loss_clip": 1.02291059, + "balance_loss_mlp": 1.04224885, + "epoch": 0.40264542311739066, + "flos": 20886292834560.0, + "grad_norm": 1.6846278858215487, + "language_loss": 0.70899725, + "learning_rate": 2.7117581853963393e-06, + "loss": 0.73055387, + "num_input_tokens_seen": 143860800, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6697, + "time_per_iteration": 2.4922237396240234 + }, + { + "auxiliary_loss_clip": 0.01113557, + "auxiliary_loss_mlp": 0.01038169, + "balance_loss_clip": 1.02555108, + "balance_loss_mlp": 1.04018331, + "epoch": 0.4027055463700586, + "flos": 26250197685120.0, + "grad_norm": 2.4926240162149162, + "language_loss": 0.61456931, + "learning_rate": 2.711394207496984e-06, + "loss": 0.63608658, + "num_input_tokens_seen": 143878950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 6698, + "time_per_iteration": 2.4892961978912354 + }, + { + "auxiliary_loss_clip": 0.01115982, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.01840675, + "balance_loss_mlp": 1.03997493, + "epoch": 0.4027656696227266, + "flos": 20631398947200.0, + "grad_norm": 1.8414423865451628, + "language_loss": 0.76245844, + "learning_rate": 2.711030202621491e-06, + "loss": 0.78393662, + "num_input_tokens_seen": 143898385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 6699, + "time_per_iteration": 2.4576990604400635 + }, + { + "auxiliary_loss_clip": 0.01110513, + "auxiliary_loss_mlp": 0.01030361, + "balance_loss_clip": 1.0171113, + "balance_loss_mlp": 1.03855538, + "epoch": 0.40282579287539455, + "flos": 22346277039360.0, + "grad_norm": 1.5844300780087603, + "language_loss": 0.80345184, + "learning_rate": 2.7106661707836605e-06, + "loss": 0.82486057, + "num_input_tokens_seen": 143918795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 6700, + "time_per_iteration": 2.4449126720428467 + }, + { + "auxiliary_loss_clip": 0.01117537, + "auxiliary_loss_mlp": 0.01043995, + "balance_loss_clip": 1.02886689, + "balance_loss_mlp": 1.03814912, + "epoch": 0.4028859161280625, + "flos": 29274988959360.0, + "grad_norm": 2.2662820598104227, + "language_loss": 0.74967611, + "learning_rate": 2.7103021119972977e-06, + "loss": 0.77129138, + "num_input_tokens_seen": 143938245, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6701, + "time_per_iteration": 2.5474703311920166 + }, + { + "auxiliary_loss_clip": 0.01112492, + "auxiliary_loss_mlp": 0.01038921, + "balance_loss_clip": 1.02598631, + "balance_loss_mlp": 1.03800225, + "epoch": 0.4029460393807305, + "flos": 28622312881920.0, + "grad_norm": 1.5176135502188826, + "language_loss": 0.65989178, + "learning_rate": 2.709938026276208e-06, + "loss": 0.6814059, + "num_input_tokens_seen": 143960995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6702, + "time_per_iteration": 2.5158073902130127 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039218, + "balance_loss_clip": 1.02409053, + "balance_loss_mlp": 1.03949153, + "epoch": 0.40300616263339845, + "flos": 22601925112320.0, + "grad_norm": 1.577366316976287, + "language_loss": 0.66134161, + "learning_rate": 2.7095739136341964e-06, + "loss": 0.68289495, + "num_input_tokens_seen": 143979910, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.765625, + "step": 6703, + "time_per_iteration": 2.4974560737609863 + }, + { + "auxiliary_loss_clip": 0.01119665, + "auxiliary_loss_mlp": 0.01035087, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04285431, + "epoch": 0.4030662858860664, + "flos": 25520313323520.0, + "grad_norm": 2.6870156282512245, + "language_loss": 0.82005399, + "learning_rate": 2.709209774085071e-06, + "loss": 0.84160155, + "num_input_tokens_seen": 144000095, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6704, + "time_per_iteration": 2.5040299892425537 + }, + { + "auxiliary_loss_clip": 0.01117271, + "auxiliary_loss_mlp": 0.01034919, + "balance_loss_clip": 1.02110291, + "balance_loss_mlp": 1.03974569, + "epoch": 0.40312640913873443, + "flos": 23586703361280.0, + "grad_norm": 2.5805971030690578, + "language_loss": 0.73468685, + "learning_rate": 2.7088456076426407e-06, + "loss": 0.75620878, + "num_input_tokens_seen": 144019695, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 6705, + "time_per_iteration": 2.520252227783203 + }, + { + "auxiliary_loss_clip": 0.01113466, + "auxiliary_loss_mlp": 0.01035202, + "balance_loss_clip": 1.02208292, + "balance_loss_mlp": 1.03979278, + "epoch": 0.4031865323914024, + "flos": 20011042131840.0, + "grad_norm": 1.712587367637223, + "language_loss": 0.66288096, + "learning_rate": 2.708481414320713e-06, + "loss": 0.68436766, + "num_input_tokens_seen": 144038525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 6706, + "time_per_iteration": 2.4254331588745117 + }, + { + "auxiliary_loss_clip": 0.01114724, + "auxiliary_loss_mlp": 0.0103993, + "balance_loss_clip": 1.02619088, + "balance_loss_mlp": 1.03957605, + "epoch": 0.40324665564407036, + "flos": 21871430219520.0, + "grad_norm": 1.3675174561755612, + "language_loss": 0.71328777, + "learning_rate": 2.7081171941330992e-06, + "loss": 0.73483431, + "num_input_tokens_seen": 144059485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6707, + "time_per_iteration": 2.5285422801971436 + }, + { + "auxiliary_loss_clip": 0.01109979, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02169156, + "balance_loss_mlp": 1.03867698, + "epoch": 0.4033067788967383, + "flos": 23878728933120.0, + "grad_norm": 1.4937460074112463, + "language_loss": 0.80080485, + "learning_rate": 2.707752947093611e-06, + "loss": 0.82226288, + "num_input_tokens_seen": 144080265, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 6708, + "time_per_iteration": 2.4664134979248047 + }, + { + "auxiliary_loss_clip": 0.01117266, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.02170968, + "balance_loss_mlp": 1.03778601, + "epoch": 0.4033669021494063, + "flos": 17419907756160.0, + "grad_norm": 2.013607365016592, + "language_loss": 0.82944471, + "learning_rate": 2.70738867321606e-06, + "loss": 0.8509779, + "num_input_tokens_seen": 144098040, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.796875, + "step": 6709, + "time_per_iteration": 2.461277723312378 + }, + { + "auxiliary_loss_clip": 0.01119346, + "auxiliary_loss_mlp": 0.01038536, + "balance_loss_clip": 1.02454066, + "balance_loss_mlp": 1.04260051, + "epoch": 0.40342702540207426, + "flos": 29600554855680.0, + "grad_norm": 1.4165591336273893, + "language_loss": 0.71036613, + "learning_rate": 2.70702437251426e-06, + "loss": 0.73194492, + "num_input_tokens_seen": 144118265, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6710, + "time_per_iteration": 2.5579922199249268 + }, + { + "auxiliary_loss_clip": 0.01116194, + "auxiliary_loss_mlp": 0.01038571, + "balance_loss_clip": 1.02461195, + "balance_loss_mlp": 1.04049003, + "epoch": 0.4034871486547422, + "flos": 11284605400320.0, + "grad_norm": 1.9864485278108117, + "language_loss": 0.85366702, + "learning_rate": 2.7066600450020236e-06, + "loss": 0.87521464, + "num_input_tokens_seen": 144133865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 6711, + "time_per_iteration": 2.511082410812378 + }, + { + "auxiliary_loss_clip": 0.01116602, + "auxiliary_loss_mlp": 0.01034461, + "balance_loss_clip": 1.02038825, + "balance_loss_mlp": 1.04072142, + "epoch": 0.4035472719074102, + "flos": 15552839738880.0, + "grad_norm": 1.9069456024701996, + "language_loss": 0.76074743, + "learning_rate": 2.706295690693168e-06, + "loss": 0.78225803, + "num_input_tokens_seen": 144150125, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6712, + "time_per_iteration": 2.419672727584839 + }, + { + "auxiliary_loss_clip": 0.0111779, + "auxiliary_loss_mlp": 0.01037728, + "balance_loss_clip": 1.02364349, + "balance_loss_mlp": 1.04200089, + "epoch": 0.40360739516007815, + "flos": 24674365140480.0, + "grad_norm": 2.1216019240756765, + "language_loss": 0.78926992, + "learning_rate": 2.7059313096015096e-06, + "loss": 0.81082511, + "num_input_tokens_seen": 144169295, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7578125, + "step": 6713, + "time_per_iteration": 2.520109176635742 + }, + { + "auxiliary_loss_clip": 0.01113814, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01912916, + "balance_loss_mlp": 1.03721881, + "epoch": 0.4036675184127461, + "flos": 17304095329920.0, + "grad_norm": 1.8945946455640421, + "language_loss": 0.88507473, + "learning_rate": 2.705566901740865e-06, + "loss": 0.90654337, + "num_input_tokens_seen": 144185790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6714, + "time_per_iteration": 2.4076859951019287 + }, + { + "auxiliary_loss_clip": 0.01115997, + "auxiliary_loss_mlp": 0.01040851, + "balance_loss_clip": 1.02688611, + "balance_loss_mlp": 1.04049468, + "epoch": 0.4037276416654141, + "flos": 19864023765120.0, + "grad_norm": 2.116493132238348, + "language_loss": 0.69099832, + "learning_rate": 2.7052024671250527e-06, + "loss": 0.71256685, + "num_input_tokens_seen": 144205190, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6715, + "time_per_iteration": 2.4805076122283936 + }, + { + "auxiliary_loss_clip": 0.01117346, + "auxiliary_loss_mlp": 0.01031825, + "balance_loss_clip": 1.01785374, + "balance_loss_mlp": 1.03944981, + "epoch": 0.40378776491808205, + "flos": 18296271780480.0, + "grad_norm": 7.495764991407429, + "language_loss": 0.76919901, + "learning_rate": 2.704838005767892e-06, + "loss": 0.79069078, + "num_input_tokens_seen": 144222705, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6716, + "time_per_iteration": 2.4244720935821533 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01037832, + "balance_loss_clip": 1.02485037, + "balance_loss_mlp": 1.03992844, + "epoch": 0.40384788817075, + "flos": 15049372757760.0, + "grad_norm": 1.8407988101654404, + "language_loss": 0.76272923, + "learning_rate": 2.7044735176832037e-06, + "loss": 0.78423738, + "num_input_tokens_seen": 144239545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 6717, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01036903, + "auxiliary_loss_mlp": 0.01007011, + "balance_loss_clip": 1.00571179, + "balance_loss_mlp": 1.01217222, + "epoch": 0.40390801142341803, + "flos": 61929927895680.0, + "grad_norm": 0.940083561343906, + "language_loss": 0.60735488, + "learning_rate": 2.7041090028848084e-06, + "loss": 0.62779397, + "num_input_tokens_seen": 144288145, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24707031, + "step": 6718, + "time_per_iteration": 2.9391937255859375 + }, + { + "auxiliary_loss_clip": 0.01120577, + "auxiliary_loss_mlp": 0.01036292, + "balance_loss_clip": 1.02140856, + "balance_loss_mlp": 1.04066229, + "epoch": 0.403968134676086, + "flos": 22738779930240.0, + "grad_norm": 2.1744660134680776, + "language_loss": 0.74794078, + "learning_rate": 2.7037444613865306e-06, + "loss": 0.76950943, + "num_input_tokens_seen": 144302315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6719, + "time_per_iteration": 2.4630534648895264 + }, + { + "auxiliary_loss_clip": 0.01117045, + "auxiliary_loss_mlp": 0.01043036, + "balance_loss_clip": 1.02762175, + "balance_loss_mlp": 1.0402683, + "epoch": 0.40402825792875396, + "flos": 19784409269760.0, + "grad_norm": 2.5217598497166422, + "language_loss": 0.81235194, + "learning_rate": 2.7033798932021906e-06, + "loss": 0.83395278, + "num_input_tokens_seen": 144318990, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.76953125, + "step": 6720, + "time_per_iteration": 6.786137104034424 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01030391, + "balance_loss_clip": 1.01644325, + "balance_loss_mlp": 1.0376296, + "epoch": 0.40408838118142193, + "flos": 19609273532160.0, + "grad_norm": 1.933287838521713, + "language_loss": 0.7720241, + "learning_rate": 2.7030152983456153e-06, + "loss": 0.79346573, + "num_input_tokens_seen": 144335765, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 6721, + "time_per_iteration": 3.9910030364990234 + }, + { + "auxiliary_loss_clip": 0.01112718, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01460266, + "balance_loss_mlp": 1.04090941, + "epoch": 0.4041485044340899, + "flos": 24426043441920.0, + "grad_norm": 2.3110658804222566, + "language_loss": 0.7264756, + "learning_rate": 2.7026506768306304e-06, + "loss": 0.74787009, + "num_input_tokens_seen": 144355825, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6722, + "time_per_iteration": 2.5377390384674072 + }, + { + "auxiliary_loss_clip": 0.01113608, + "auxiliary_loss_mlp": 0.01036417, + "balance_loss_clip": 1.02270842, + "balance_loss_mlp": 1.03896952, + "epoch": 0.40420862768675786, + "flos": 16760192613120.0, + "grad_norm": 1.7096890061042316, + "language_loss": 0.65681767, + "learning_rate": 2.7022860286710602e-06, + "loss": 0.67831796, + "num_input_tokens_seen": 144374320, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6723, + "time_per_iteration": 2.429657220840454 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01043022, + "balance_loss_clip": 1.02834117, + "balance_loss_mlp": 1.04056454, + "epoch": 0.4042687509394258, + "flos": 22491571553280.0, + "grad_norm": 1.4515559648574707, + "language_loss": 0.74074364, + "learning_rate": 2.701921353880734e-06, + "loss": 0.76235622, + "num_input_tokens_seen": 144394325, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 6724, + "time_per_iteration": 2.485166072845459 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03799534, + "epoch": 0.4043288741920938, + "flos": 30336149479680.0, + "grad_norm": 1.783988932028688, + "language_loss": 0.74764013, + "learning_rate": 2.7015566524734787e-06, + "loss": 0.76908118, + "num_input_tokens_seen": 144412765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 6725, + "time_per_iteration": 2.5141966342926025 + }, + { + "auxiliary_loss_clip": 0.01112534, + "auxiliary_loss_mlp": 0.01034717, + "balance_loss_clip": 1.02024531, + "balance_loss_mlp": 1.03874183, + "epoch": 0.40438899744476176, + "flos": 46348321363200.0, + "grad_norm": 1.8781247850607437, + "language_loss": 0.76928914, + "learning_rate": 2.701191924463126e-06, + "loss": 0.79076171, + "num_input_tokens_seen": 144435400, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6726, + "time_per_iteration": 2.685609817504883 + }, + { + "auxiliary_loss_clip": 0.01115432, + "auxiliary_loss_mlp": 0.01034649, + "balance_loss_clip": 1.02004611, + "balance_loss_mlp": 1.03858769, + "epoch": 0.4044491206974297, + "flos": 13333524998400.0, + "grad_norm": 2.1780936913008646, + "language_loss": 0.81682861, + "learning_rate": 2.7008271698635054e-06, + "loss": 0.83832943, + "num_input_tokens_seen": 144452925, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 6727, + "time_per_iteration": 2.4221317768096924 + }, + { + "auxiliary_loss_clip": 0.0111635, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02088916, + "balance_loss_mlp": 1.0411514, + "epoch": 0.4045092439500977, + "flos": 12093745121280.0, + "grad_norm": 2.0089286405461246, + "language_loss": 0.85300338, + "learning_rate": 2.700462388688447e-06, + "loss": 0.87451458, + "num_input_tokens_seen": 144470195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6728, + "time_per_iteration": 2.4719340801239014 + }, + { + "auxiliary_loss_clip": 0.01117368, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02059257, + "balance_loss_mlp": 1.04241705, + "epoch": 0.40456936720276565, + "flos": 21179683123200.0, + "grad_norm": 1.6690883830899332, + "language_loss": 0.81804991, + "learning_rate": 2.700097580951786e-06, + "loss": 0.8395654, + "num_input_tokens_seen": 144490320, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 6729, + "time_per_iteration": 2.4482905864715576 + }, + { + "auxiliary_loss_clip": 0.01114628, + "auxiliary_loss_mlp": 0.01034739, + "balance_loss_clip": 1.02092838, + "balance_loss_mlp": 1.04034996, + "epoch": 0.4046294904554336, + "flos": 23915286000000.0, + "grad_norm": 1.841339511320202, + "language_loss": 0.72582501, + "learning_rate": 2.6997327466673533e-06, + "loss": 0.74731869, + "num_input_tokens_seen": 144508990, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6730, + "time_per_iteration": 2.537121295928955 + }, + { + "auxiliary_loss_clip": 0.01114402, + "auxiliary_loss_mlp": 0.01035728, + "balance_loss_clip": 1.0216732, + "balance_loss_mlp": 1.04037821, + "epoch": 0.4046896137081016, + "flos": 38071235773440.0, + "grad_norm": 1.6090983176176454, + "language_loss": 0.67394918, + "learning_rate": 2.699367885848985e-06, + "loss": 0.69545048, + "num_input_tokens_seen": 144529550, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 6731, + "time_per_iteration": 2.645958423614502 + }, + { + "auxiliary_loss_clip": 0.01114135, + "auxiliary_loss_mlp": 0.01034866, + "balance_loss_clip": 1.02196193, + "balance_loss_mlp": 1.03986645, + "epoch": 0.4047497369607696, + "flos": 23617262856960.0, + "grad_norm": 1.6078062973222544, + "language_loss": 0.74067897, + "learning_rate": 2.699002998510517e-06, + "loss": 0.76216894, + "num_input_tokens_seen": 144549310, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6732, + "time_per_iteration": 2.5182886123657227 + }, + { + "auxiliary_loss_clip": 0.01114756, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01650739, + "balance_loss_mlp": 1.04178488, + "epoch": 0.40480986021343757, + "flos": 12823593569280.0, + "grad_norm": 1.830865433765548, + "language_loss": 0.7690779, + "learning_rate": 2.6986380846657852e-06, + "loss": 0.79051435, + "num_input_tokens_seen": 144567430, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 6733, + "time_per_iteration": 2.430748701095581 + }, + { + "auxiliary_loss_clip": 0.01120623, + "auxiliary_loss_mlp": 0.01038866, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.04164028, + "epoch": 0.40486998346610553, + "flos": 23768770423680.0, + "grad_norm": 1.8916182343646197, + "language_loss": 0.7649287, + "learning_rate": 2.698273144328627e-06, + "loss": 0.78652358, + "num_input_tokens_seen": 144585975, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7890625, + "step": 6734, + "time_per_iteration": 2.507070541381836 + }, + { + "auxiliary_loss_clip": 0.01121282, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.01729572, + "balance_loss_mlp": 1.04258728, + "epoch": 0.4049301067187735, + "flos": 22856818999680.0, + "grad_norm": 2.227264135735927, + "language_loss": 0.65026176, + "learning_rate": 2.6979081775128805e-06, + "loss": 0.67178231, + "num_input_tokens_seen": 144605225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7890625, + "step": 6735, + "time_per_iteration": 2.4677040576934814 + }, + { + "auxiliary_loss_clip": 0.01113204, + "auxiliary_loss_mlp": 0.01034185, + "balance_loss_clip": 1.02154267, + "balance_loss_mlp": 1.04025424, + "epoch": 0.40499022997144146, + "flos": 22783992174720.0, + "grad_norm": 1.9551652085107198, + "language_loss": 0.83177966, + "learning_rate": 2.697543184232387e-06, + "loss": 0.85325354, + "num_input_tokens_seen": 144624145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 6736, + "time_per_iteration": 2.5244226455688477 + }, + { + "auxiliary_loss_clip": 0.01121161, + "auxiliary_loss_mlp": 0.01037611, + "balance_loss_clip": 1.02344942, + "balance_loss_mlp": 1.04291666, + "epoch": 0.4050503532241094, + "flos": 23039352938880.0, + "grad_norm": 1.699075737504615, + "language_loss": 0.7520684, + "learning_rate": 2.6971781645009863e-06, + "loss": 0.77365613, + "num_input_tokens_seen": 144644470, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 6737, + "time_per_iteration": 2.510906457901001 + }, + { + "auxiliary_loss_clip": 0.01117535, + "auxiliary_loss_mlp": 0.0103775, + "balance_loss_clip": 1.02408242, + "balance_loss_mlp": 1.04335642, + "epoch": 0.4051104764767774, + "flos": 16647756065280.0, + "grad_norm": 2.288492776548484, + "language_loss": 0.71790028, + "learning_rate": 2.696813118332519e-06, + "loss": 0.73945308, + "num_input_tokens_seen": 144661055, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6738, + "time_per_iteration": 2.514575481414795 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.01845288, + "balance_loss_mlp": 1.04022241, + "epoch": 0.40517059972944536, + "flos": 16358962717440.0, + "grad_norm": 2.003378473366394, + "language_loss": 0.75169361, + "learning_rate": 2.696448045740828e-06, + "loss": 0.77315164, + "num_input_tokens_seen": 144677935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 6739, + "time_per_iteration": 2.4737000465393066 + }, + { + "auxiliary_loss_clip": 0.01119431, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02107763, + "balance_loss_mlp": 1.04296541, + "epoch": 0.4052307229821133, + "flos": 28803374363520.0, + "grad_norm": 1.7865413260400147, + "language_loss": 0.73943472, + "learning_rate": 2.6960829467397576e-06, + "loss": 0.76097751, + "num_input_tokens_seen": 144697725, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6740, + "time_per_iteration": 2.5434296131134033 + }, + { + "auxiliary_loss_clip": 0.0111643, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.0190562, + "balance_loss_mlp": 1.04310441, + "epoch": 0.4052908462347813, + "flos": 21397876289280.0, + "grad_norm": 1.5350516452213203, + "language_loss": 0.77179801, + "learning_rate": 2.695717821343153e-06, + "loss": 0.79328907, + "num_input_tokens_seen": 144718805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6741, + "time_per_iteration": 2.473451852798462 + }, + { + "auxiliary_loss_clip": 0.01120883, + "auxiliary_loss_mlp": 0.01035782, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.04359269, + "epoch": 0.40535096948744925, + "flos": 22419067950720.0, + "grad_norm": 1.8990417013226273, + "language_loss": 0.70827335, + "learning_rate": 2.6953526695648577e-06, + "loss": 0.72983992, + "num_input_tokens_seen": 144737105, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7734375, + "step": 6742, + "time_per_iteration": 2.4797537326812744 + }, + { + "auxiliary_loss_clip": 0.01121445, + "auxiliary_loss_mlp": 0.01029673, + "balance_loss_clip": 1.01517677, + "balance_loss_mlp": 1.04446578, + "epoch": 0.4054110927401172, + "flos": 17010776868480.0, + "grad_norm": 2.180199258846301, + "language_loss": 0.72242743, + "learning_rate": 2.6949874914187202e-06, + "loss": 0.74393857, + "num_input_tokens_seen": 144751350, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 6743, + "time_per_iteration": 2.409444808959961 + }, + { + "auxiliary_loss_clip": 0.0112179, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02128196, + "balance_loss_mlp": 1.04374886, + "epoch": 0.4054712159927852, + "flos": 21614848392960.0, + "grad_norm": 3.287949139408167, + "language_loss": 0.70554733, + "learning_rate": 2.694622286918588e-06, + "loss": 0.72712195, + "num_input_tokens_seen": 144770030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6744, + "time_per_iteration": 2.475775957107544 + }, + { + "auxiliary_loss_clip": 0.01116341, + "auxiliary_loss_mlp": 0.01034834, + "balance_loss_clip": 1.02154207, + "balance_loss_mlp": 1.04163671, + "epoch": 0.4055313392454532, + "flos": 25812554376960.0, + "grad_norm": 1.534678646828984, + "language_loss": 0.79982138, + "learning_rate": 2.6942570560783076e-06, + "loss": 0.82133317, + "num_input_tokens_seen": 144790965, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 6745, + "time_per_iteration": 2.492379903793335 + }, + { + "auxiliary_loss_clip": 0.01120523, + "auxiliary_loss_mlp": 0.01033491, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.0463028, + "epoch": 0.40559146249812117, + "flos": 14137098111360.0, + "grad_norm": 1.8557240822638386, + "language_loss": 0.66450787, + "learning_rate": 2.693891798911731e-06, + "loss": 0.68604791, + "num_input_tokens_seen": 144807755, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6746, + "time_per_iteration": 2.4547531604766846 + }, + { + "auxiliary_loss_clip": 0.01118105, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01573384, + "balance_loss_mlp": 1.04319298, + "epoch": 0.40565158575078913, + "flos": 41355481962240.0, + "grad_norm": 1.5006534813974708, + "language_loss": 0.5713616, + "learning_rate": 2.6935265154327075e-06, + "loss": 0.59283465, + "num_input_tokens_seen": 144832405, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6747, + "time_per_iteration": 2.627912998199463 + }, + { + "auxiliary_loss_clip": 0.01119274, + "auxiliary_loss_mlp": 0.01041713, + "balance_loss_clip": 1.02859426, + "balance_loss_mlp": 1.04399908, + "epoch": 0.4057117090034571, + "flos": 28544529980160.0, + "grad_norm": 1.605109327396707, + "language_loss": 0.8454957, + "learning_rate": 2.693161205655089e-06, + "loss": 0.8671056, + "num_input_tokens_seen": 144853890, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75390625, + "step": 6748, + "time_per_iteration": 2.5783345699310303 + }, + { + "auxiliary_loss_clip": 0.01120452, + "auxiliary_loss_mlp": 0.01035858, + "balance_loss_clip": 1.02210689, + "balance_loss_mlp": 1.04356313, + "epoch": 0.40577183225612506, + "flos": 18004066640640.0, + "grad_norm": 2.1468645636667705, + "language_loss": 0.81288636, + "learning_rate": 2.6927958695927287e-06, + "loss": 0.83444953, + "num_input_tokens_seen": 144871395, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 6749, + "time_per_iteration": 2.433042049407959 + }, + { + "auxiliary_loss_clip": 0.01120578, + "auxiliary_loss_mlp": 0.01037463, + "balance_loss_clip": 1.02395105, + "balance_loss_mlp": 1.04512405, + "epoch": 0.40583195550879303, + "flos": 19536734016000.0, + "grad_norm": 1.6093122324869749, + "language_loss": 0.75051296, + "learning_rate": 2.6924305072594784e-06, + "loss": 0.77209336, + "num_input_tokens_seen": 144890975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 6750, + "time_per_iteration": 2.500444173812866 + }, + { + "auxiliary_loss_clip": 0.01120094, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.01919341, + "balance_loss_mlp": 1.04114318, + "epoch": 0.405892078761461, + "flos": 22309468577280.0, + "grad_norm": 2.1309201825140662, + "language_loss": 0.73826647, + "learning_rate": 2.692065118669195e-06, + "loss": 0.75980842, + "num_input_tokens_seen": 144908170, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 6751, + "time_per_iteration": 2.4808826446533203 + }, + { + "auxiliary_loss_clip": 0.01120759, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.04471755, + "epoch": 0.40595220201412896, + "flos": 25484402701440.0, + "grad_norm": 5.559089751596236, + "language_loss": 0.6666553, + "learning_rate": 2.6916997038357326e-06, + "loss": 0.68818188, + "num_input_tokens_seen": 144928020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6752, + "time_per_iteration": 2.568223714828491 + }, + { + "auxiliary_loss_clip": 0.0112446, + "auxiliary_loss_mlp": 0.0103634, + "balance_loss_clip": 1.02189183, + "balance_loss_mlp": 1.04458666, + "epoch": 0.4060123252667969, + "flos": 49856004103680.0, + "grad_norm": 1.70284971706228, + "language_loss": 0.70600617, + "learning_rate": 2.691334262772948e-06, + "loss": 0.72761416, + "num_input_tokens_seen": 144951240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.796875, + "step": 6753, + "time_per_iteration": 2.696746587753296 + }, + { + "auxiliary_loss_clip": 0.01119466, + "auxiliary_loss_mlp": 0.01035823, + "balance_loss_clip": 1.02145791, + "balance_loss_mlp": 1.04105067, + "epoch": 0.4060724485194649, + "flos": 21135476459520.0, + "grad_norm": 2.1929566205477804, + "language_loss": 0.71584499, + "learning_rate": 2.690968795494699e-06, + "loss": 0.73739791, + "num_input_tokens_seen": 144969100, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 6754, + "time_per_iteration": 2.49405837059021 + }, + { + "auxiliary_loss_clip": 0.01120059, + "auxiliary_loss_mlp": 0.0103975, + "balance_loss_clip": 1.02568889, + "balance_loss_mlp": 1.04273617, + "epoch": 0.40613257177213286, + "flos": 21758059918080.0, + "grad_norm": 1.7112877357577985, + "language_loss": 0.82864529, + "learning_rate": 2.690603302014844e-06, + "loss": 0.85024333, + "num_input_tokens_seen": 144987065, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6755, + "time_per_iteration": 2.4666147232055664 + }, + { + "auxiliary_loss_clip": 0.01122705, + "auxiliary_loss_mlp": 0.01040879, + "balance_loss_clip": 1.02599001, + "balance_loss_mlp": 1.04292035, + "epoch": 0.4061926950248008, + "flos": 25555074710400.0, + "grad_norm": 1.484337354822898, + "language_loss": 0.70812732, + "learning_rate": 2.6902377823472426e-06, + "loss": 0.72976315, + "num_input_tokens_seen": 145007310, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 6756, + "time_per_iteration": 2.539236307144165 + }, + { + "auxiliary_loss_clip": 0.01120038, + "auxiliary_loss_mlp": 0.01041859, + "balance_loss_clip": 1.02724361, + "balance_loss_mlp": 1.04106975, + "epoch": 0.4062528182774688, + "flos": 23695799944320.0, + "grad_norm": 1.6617053894159006, + "language_loss": 0.79047221, + "learning_rate": 2.689872236505755e-06, + "loss": 0.81209117, + "num_input_tokens_seen": 145026210, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.79296875, + "step": 6757, + "time_per_iteration": 2.4614784717559814 + }, + { + "auxiliary_loss_clip": 0.01121935, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.01865852, + "balance_loss_mlp": 1.04454553, + "epoch": 0.4063129415301368, + "flos": 21726027964800.0, + "grad_norm": 1.5700268222495364, + "language_loss": 0.7851724, + "learning_rate": 2.6895066645042437e-06, + "loss": 0.806723, + "num_input_tokens_seen": 145045475, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6758, + "time_per_iteration": 2.495060920715332 + }, + { + "auxiliary_loss_clip": 0.01116636, + "auxiliary_loss_mlp": 0.01031594, + "balance_loss_clip": 1.01692557, + "balance_loss_mlp": 1.04113591, + "epoch": 0.40637306478280477, + "flos": 12787575206400.0, + "grad_norm": 2.1344538838988454, + "language_loss": 0.88668954, + "learning_rate": 2.6891410663565703e-06, + "loss": 0.90817189, + "num_input_tokens_seen": 145062260, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 6759, + "time_per_iteration": 2.410628318786621 + }, + { + "auxiliary_loss_clip": 0.01120377, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.01986527, + "balance_loss_mlp": 1.04366982, + "epoch": 0.40643318803547274, + "flos": 24024490323840.0, + "grad_norm": 2.0728742760332546, + "language_loss": 0.63888443, + "learning_rate": 2.688775442076598e-06, + "loss": 0.66042268, + "num_input_tokens_seen": 145082470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6760, + "time_per_iteration": 2.553819417953491 + }, + { + "auxiliary_loss_clip": 0.01119037, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.01796103, + "balance_loss_mlp": 1.0422858, + "epoch": 0.4064933112881407, + "flos": 25592421876480.0, + "grad_norm": 1.4242582463540345, + "language_loss": 0.75060493, + "learning_rate": 2.688409791678193e-06, + "loss": 0.77212334, + "num_input_tokens_seen": 145105685, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6761, + "time_per_iteration": 2.520904302597046 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02029395, + "balance_loss_mlp": 1.04054725, + "epoch": 0.40655343454080867, + "flos": 22054323294720.0, + "grad_norm": 1.4265975037167853, + "language_loss": 0.70109248, + "learning_rate": 2.6880441151752185e-06, + "loss": 0.72254199, + "num_input_tokens_seen": 145125590, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6762, + "time_per_iteration": 6.884980916976929 + }, + { + "auxiliary_loss_clip": 0.01117935, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.01893568, + "balance_loss_mlp": 1.04316521, + "epoch": 0.40661355779347663, + "flos": 26468893641600.0, + "grad_norm": 2.223786523351799, + "language_loss": 0.73175049, + "learning_rate": 2.6876784125815433e-06, + "loss": 0.75325227, + "num_input_tokens_seen": 145146810, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6763, + "time_per_iteration": 3.8783130645751953 + }, + { + "auxiliary_loss_clip": 0.01119915, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02200174, + "balance_loss_mlp": 1.04246914, + "epoch": 0.4066736810461446, + "flos": 13261129136640.0, + "grad_norm": 1.725584811158307, + "language_loss": 0.6908524, + "learning_rate": 2.687312683911033e-06, + "loss": 0.71241343, + "num_input_tokens_seen": 145163130, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 6764, + "time_per_iteration": 2.4408676624298096 + }, + { + "auxiliary_loss_clip": 0.01123793, + "auxiliary_loss_mlp": 0.01040267, + "balance_loss_clip": 1.02481747, + "balance_loss_mlp": 1.04485261, + "epoch": 0.40673380429881256, + "flos": 28803625758720.0, + "grad_norm": 2.20566464671706, + "language_loss": 0.91570717, + "learning_rate": 2.686946929177557e-06, + "loss": 0.93734777, + "num_input_tokens_seen": 145181420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 6765, + "time_per_iteration": 2.4904191493988037 + }, + { + "auxiliary_loss_clip": 0.01122971, + "auxiliary_loss_mlp": 0.01041584, + "balance_loss_clip": 1.02672434, + "balance_loss_mlp": 1.04374599, + "epoch": 0.4067939275514805, + "flos": 12495334152960.0, + "grad_norm": 2.279622168201086, + "language_loss": 0.78459442, + "learning_rate": 2.6865811483949855e-06, + "loss": 0.80623996, + "num_input_tokens_seen": 145198545, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 6766, + "time_per_iteration": 2.4594480991363525 + }, + { + "auxiliary_loss_clip": 0.01119893, + "auxiliary_loss_mlp": 0.01038568, + "balance_loss_clip": 1.02457929, + "balance_loss_mlp": 1.04144108, + "epoch": 0.4068540508041485, + "flos": 18770508069120.0, + "grad_norm": 1.9487336600068845, + "language_loss": 0.76438922, + "learning_rate": 2.6862153415771867e-06, + "loss": 0.78597391, + "num_input_tokens_seen": 145215835, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6767, + "time_per_iteration": 2.4127700328826904 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01036408, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.0442543, + "epoch": 0.40691417405681646, + "flos": 28512821249280.0, + "grad_norm": 1.7431301492707811, + "language_loss": 0.77572781, + "learning_rate": 2.685849508738034e-06, + "loss": 0.79728222, + "num_input_tokens_seen": 145236555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6768, + "time_per_iteration": 2.5312347412109375 + }, + { + "auxiliary_loss_clip": 0.01118014, + "auxiliary_loss_mlp": 0.01031889, + "balance_loss_clip": 1.01861525, + "balance_loss_mlp": 1.04248428, + "epoch": 0.4069742973094844, + "flos": 20814040627200.0, + "grad_norm": 2.7094466648077935, + "language_loss": 0.87585759, + "learning_rate": 2.6854836498913995e-06, + "loss": 0.89735663, + "num_input_tokens_seen": 145254595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6769, + "time_per_iteration": 2.434276580810547 + }, + { + "auxiliary_loss_clip": 0.01120302, + "auxiliary_loss_mlp": 0.01032733, + "balance_loss_clip": 1.02028155, + "balance_loss_mlp": 1.04659963, + "epoch": 0.4070344205621524, + "flos": 21470272151040.0, + "grad_norm": 1.8989360481904207, + "language_loss": 0.80883789, + "learning_rate": 2.685117765051156e-06, + "loss": 0.83036822, + "num_input_tokens_seen": 145274005, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 6770, + "time_per_iteration": 2.4768316745758057 + }, + { + "auxiliary_loss_clip": 0.01121746, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01699948, + "balance_loss_mlp": 1.04308331, + "epoch": 0.4070945438148204, + "flos": 26830046937600.0, + "grad_norm": 1.6240016049823844, + "language_loss": 0.80161405, + "learning_rate": 2.6847518542311783e-06, + "loss": 0.82315195, + "num_input_tokens_seen": 145294850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78515625, + "step": 6771, + "time_per_iteration": 2.4864251613616943 + }, + { + "auxiliary_loss_clip": 0.01116481, + "auxiliary_loss_mlp": 0.01038824, + "balance_loss_clip": 1.02476382, + "balance_loss_mlp": 1.04181063, + "epoch": 0.4071546670674884, + "flos": 26354158623360.0, + "grad_norm": 1.5515756087522081, + "language_loss": 0.76267636, + "learning_rate": 2.6843859174453417e-06, + "loss": 0.7842294, + "num_input_tokens_seen": 145317050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 6772, + "time_per_iteration": 2.5570874214172363 + }, + { + "auxiliary_loss_clip": 0.01116059, + "auxiliary_loss_mlp": 0.01040475, + "balance_loss_clip": 1.0259316, + "balance_loss_mlp": 1.04014397, + "epoch": 0.40721479032015634, + "flos": 17895401020800.0, + "grad_norm": 1.6577007729475706, + "language_loss": 0.81418705, + "learning_rate": 2.6840199547075218e-06, + "loss": 0.83575237, + "num_input_tokens_seen": 145334480, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 6773, + "time_per_iteration": 2.4311835765838623 + }, + { + "auxiliary_loss_clip": 0.01040526, + "auxiliary_loss_mlp": 0.01005684, + "balance_loss_clip": 1.00416398, + "balance_loss_mlp": 1.01639521, + "epoch": 0.4072749135728243, + "flos": 49854570537600.0, + "grad_norm": 0.8363890316728796, + "language_loss": 0.6434871, + "learning_rate": 2.683653966031597e-06, + "loss": 0.66394925, + "num_input_tokens_seen": 145388695, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.24121094, + "step": 6774, + "time_per_iteration": 2.987610340118408 + }, + { + "auxiliary_loss_clip": 0.01119504, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02136981, + "balance_loss_mlp": 1.04115796, + "epoch": 0.40733503682549227, + "flos": 27563630400000.0, + "grad_norm": 13.875946104557459, + "language_loss": 0.72097111, + "learning_rate": 2.683287951431446e-06, + "loss": 0.74252421, + "num_input_tokens_seen": 145408240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 6775, + "time_per_iteration": 2.5014185905456543 + }, + { + "auxiliary_loss_clip": 0.01118561, + "auxiliary_loss_mlp": 0.01041249, + "balance_loss_clip": 1.02736115, + "balance_loss_mlp": 1.04123604, + "epoch": 0.40739516007816023, + "flos": 22126970551680.0, + "grad_norm": 1.3741783359801052, + "language_loss": 0.77956975, + "learning_rate": 2.6829219109209474e-06, + "loss": 0.80116785, + "num_input_tokens_seen": 145428395, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6776, + "time_per_iteration": 2.484910488128662 + }, + { + "auxiliary_loss_clip": 0.0112306, + "auxiliary_loss_mlp": 0.01038548, + "balance_loss_clip": 1.0240761, + "balance_loss_mlp": 1.04408884, + "epoch": 0.4074552833308282, + "flos": 23842243693440.0, + "grad_norm": 2.6337418369090404, + "language_loss": 0.79015827, + "learning_rate": 2.682555844513981e-06, + "loss": 0.81177437, + "num_input_tokens_seen": 145448290, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6777, + "time_per_iteration": 2.4701852798461914 + }, + { + "auxiliary_loss_clip": 0.01039569, + "auxiliary_loss_mlp": 0.01005822, + "balance_loss_clip": 1.00424814, + "balance_loss_mlp": 1.01542926, + "epoch": 0.40751540658349616, + "flos": 58000008781440.0, + "grad_norm": 0.6828077953919364, + "language_loss": 0.5320037, + "learning_rate": 2.6821897522244286e-06, + "loss": 0.55245763, + "num_input_tokens_seen": 145509785, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.2421875, + "step": 6778, + "time_per_iteration": 3.117647647857666 + }, + { + "auxiliary_loss_clip": 0.01119188, + "auxiliary_loss_mlp": 0.01041042, + "balance_loss_clip": 1.02658224, + "balance_loss_mlp": 1.04310179, + "epoch": 0.40757552983616413, + "flos": 21214659991680.0, + "grad_norm": 2.2984205071258272, + "language_loss": 0.82367444, + "learning_rate": 2.6818236340661718e-06, + "loss": 0.84527671, + "num_input_tokens_seen": 145528620, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 6779, + "time_per_iteration": 2.4653449058532715 + }, + { + "auxiliary_loss_clip": 0.0111837, + "auxiliary_loss_mlp": 0.01037706, + "balance_loss_clip": 1.02289438, + "balance_loss_mlp": 1.0422008, + "epoch": 0.4076356530888321, + "flos": 26833530556800.0, + "grad_norm": 1.7439910283418456, + "language_loss": 0.7628178, + "learning_rate": 2.6814574900530957e-06, + "loss": 0.78437853, + "num_input_tokens_seen": 145547775, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 6780, + "time_per_iteration": 2.5031514167785645 + }, + { + "auxiliary_loss_clip": 0.01114202, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.04146945, + "epoch": 0.40769577634150006, + "flos": 12203021272320.0, + "grad_norm": 2.107375049179959, + "language_loss": 0.65990937, + "learning_rate": 2.6810913201990827e-06, + "loss": 0.68135262, + "num_input_tokens_seen": 145564465, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 6781, + "time_per_iteration": 2.431759834289551 + }, + { + "auxiliary_loss_clip": 0.01117153, + "auxiliary_loss_mlp": 0.01036981, + "balance_loss_clip": 1.02233076, + "balance_loss_mlp": 1.04050446, + "epoch": 0.407755899594168, + "flos": 33655264796160.0, + "grad_norm": 2.315782733130647, + "language_loss": 0.71046883, + "learning_rate": 2.6807251245180183e-06, + "loss": 0.73201013, + "num_input_tokens_seen": 145585965, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6782, + "time_per_iteration": 2.567138433456421 + }, + { + "auxiliary_loss_clip": 0.01117461, + "auxiliary_loss_mlp": 0.01031478, + "balance_loss_clip": 1.01789367, + "balance_loss_mlp": 1.04120076, + "epoch": 0.407816022846836, + "flos": 20157342226560.0, + "grad_norm": 1.7193598407967954, + "language_loss": 0.82066965, + "learning_rate": 2.6803589030237897e-06, + "loss": 0.84215903, + "num_input_tokens_seen": 145605000, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 6783, + "time_per_iteration": 2.46891188621521 + }, + { + "auxiliary_loss_clip": 0.01116877, + "auxiliary_loss_mlp": 0.01034742, + "balance_loss_clip": 1.02065194, + "balance_loss_mlp": 1.04063141, + "epoch": 0.40787614609950396, + "flos": 21178821196800.0, + "grad_norm": 1.6682285001774693, + "language_loss": 0.80728561, + "learning_rate": 2.679992655730283e-06, + "loss": 0.82880187, + "num_input_tokens_seen": 145623740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 6784, + "time_per_iteration": 2.456216812133789 + }, + { + "auxiliary_loss_clip": 0.01122913, + "auxiliary_loss_mlp": 0.01041361, + "balance_loss_clip": 1.0258038, + "balance_loss_mlp": 1.04271793, + "epoch": 0.407936269352172, + "flos": 20520650338560.0, + "grad_norm": 1.7628578717327703, + "language_loss": 0.65640736, + "learning_rate": 2.679626382651386e-06, + "loss": 0.67805004, + "num_input_tokens_seen": 145643515, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 6785, + "time_per_iteration": 2.46173357963562 + }, + { + "auxiliary_loss_clip": 0.01115104, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01650357, + "balance_loss_mlp": 1.0397855, + "epoch": 0.40799639260483994, + "flos": 20118809911680.0, + "grad_norm": 1.9756209352263352, + "language_loss": 0.79518569, + "learning_rate": 2.679260083800989e-06, + "loss": 0.81664044, + "num_input_tokens_seen": 145660890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6786, + "time_per_iteration": 2.430769205093384 + }, + { + "auxiliary_loss_clip": 0.01115587, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.02349889, + "balance_loss_mlp": 1.04094195, + "epoch": 0.4080565158575079, + "flos": 20997328752000.0, + "grad_norm": 1.5131366331092475, + "language_loss": 0.81249726, + "learning_rate": 2.678893759192982e-06, + "loss": 0.8340168, + "num_input_tokens_seen": 145680070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 6787, + "time_per_iteration": 2.4589040279388428 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.01907516, + "balance_loss_mlp": 1.04059005, + "epoch": 0.40811663911017587, + "flos": 19317714837120.0, + "grad_norm": 1.9559544882723985, + "language_loss": 0.67917293, + "learning_rate": 2.678527408841255e-06, + "loss": 0.70066231, + "num_input_tokens_seen": 145698010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75390625, + "step": 6788, + "time_per_iteration": 2.4450576305389404 + }, + { + "auxiliary_loss_clip": 0.01116018, + "auxiliary_loss_mlp": 0.01041079, + "balance_loss_clip": 1.02644002, + "balance_loss_mlp": 1.03975677, + "epoch": 0.40817676236284384, + "flos": 40625382119040.0, + "grad_norm": 2.2689407766698584, + "language_loss": 0.6605472, + "learning_rate": 2.678161032759701e-06, + "loss": 0.68211812, + "num_input_tokens_seen": 145722215, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6789, + "time_per_iteration": 2.6358134746551514 + }, + { + "auxiliary_loss_clip": 0.01116808, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02133691, + "balance_loss_mlp": 1.0408318, + "epoch": 0.4082368856155118, + "flos": 20522086882560.0, + "grad_norm": 1.683929923970831, + "language_loss": 0.60006517, + "learning_rate": 2.6777946309622123e-06, + "loss": 0.62159079, + "num_input_tokens_seen": 145741090, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 6790, + "time_per_iteration": 2.4339373111724854 + }, + { + "auxiliary_loss_clip": 0.01117331, + "auxiliary_loss_mlp": 0.01041648, + "balance_loss_clip": 1.02752209, + "balance_loss_mlp": 1.04277873, + "epoch": 0.40829700886817977, + "flos": 11427745098240.0, + "grad_norm": 3.0836688581186538, + "language_loss": 0.69763649, + "learning_rate": 2.677428203462683e-06, + "loss": 0.71922624, + "num_input_tokens_seen": 145754985, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 6791, + "time_per_iteration": 2.3970839977264404 + }, + { + "auxiliary_loss_clip": 0.01036371, + "auxiliary_loss_mlp": 0.01001155, + "balance_loss_clip": 0.99973643, + "balance_loss_mlp": 1.01245427, + "epoch": 0.40835713212084773, + "flos": 67330677121920.0, + "grad_norm": 0.7479961411193888, + "language_loss": 0.59600538, + "learning_rate": 2.6770617502750093e-06, + "loss": 0.61638063, + "num_input_tokens_seen": 145815260, + "router_z_loss_clip": 0.01416016, + "router_z_loss_mlp": 0.23828125, + "step": 6792, + "time_per_iteration": 3.0660579204559326 + }, + { + "auxiliary_loss_clip": 0.01122654, + "auxiliary_loss_mlp": 0.01046383, + "balance_loss_clip": 1.03205419, + "balance_loss_mlp": 1.04478419, + "epoch": 0.4084172553735157, + "flos": 21762010414080.0, + "grad_norm": 2.1865523890186975, + "language_loss": 0.8017205, + "learning_rate": 2.6766952714130857e-06, + "loss": 0.82341087, + "num_input_tokens_seen": 145832665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 6793, + "time_per_iteration": 2.4930570125579834 + }, + { + "auxiliary_loss_clip": 0.01117695, + "auxiliary_loss_mlp": 0.01034941, + "balance_loss_clip": 1.0203917, + "balance_loss_mlp": 1.04145718, + "epoch": 0.40847737862618366, + "flos": 27417258478080.0, + "grad_norm": 1.7948567342085118, + "language_loss": 0.85040581, + "learning_rate": 2.6763287668908094e-06, + "loss": 0.87193215, + "num_input_tokens_seen": 145850240, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6794, + "time_per_iteration": 2.500248670578003 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01036853, + "balance_loss_clip": 1.02316737, + "balance_loss_mlp": 1.04290628, + "epoch": 0.4085375018788516, + "flos": 18587255857920.0, + "grad_norm": 1.6403079662436217, + "language_loss": 0.79991007, + "learning_rate": 2.6759622367220788e-06, + "loss": 0.82147229, + "num_input_tokens_seen": 145869545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 6795, + "time_per_iteration": 2.4969587326049805 + }, + { + "auxiliary_loss_clip": 0.01121457, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.01903319, + "balance_loss_mlp": 1.0415107, + "epoch": 0.4085976251315196, + "flos": 15411783029760.0, + "grad_norm": 3.0496031094407767, + "language_loss": 0.69604456, + "learning_rate": 2.675595680920792e-06, + "loss": 0.7176007, + "num_input_tokens_seen": 145884025, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.80078125, + "step": 6796, + "time_per_iteration": 2.415790319442749 + }, + { + "auxiliary_loss_clip": 0.01115637, + "auxiliary_loss_mlp": 0.01037628, + "balance_loss_clip": 1.02436018, + "balance_loss_mlp": 1.04028058, + "epoch": 0.40865774838418756, + "flos": 21252222639360.0, + "grad_norm": 1.6154855191434097, + "language_loss": 0.77814329, + "learning_rate": 2.6752290995008498e-06, + "loss": 0.799676, + "num_input_tokens_seen": 145903210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 6797, + "time_per_iteration": 2.4960498809814453 + }, + { + "auxiliary_loss_clip": 0.01114842, + "auxiliary_loss_mlp": 0.01043476, + "balance_loss_clip": 1.03020835, + "balance_loss_mlp": 1.03869152, + "epoch": 0.4087178716368556, + "flos": 13772245714560.0, + "grad_norm": 2.268592052790042, + "language_loss": 0.85668063, + "learning_rate": 2.6748624924761523e-06, + "loss": 0.87826383, + "num_input_tokens_seen": 145920985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 6798, + "time_per_iteration": 2.4271299839019775 + }, + { + "auxiliary_loss_clip": 0.01115644, + "auxiliary_loss_mlp": 0.01035575, + "balance_loss_clip": 1.02341557, + "balance_loss_mlp": 1.04205322, + "epoch": 0.40877799488952354, + "flos": 23621752056960.0, + "grad_norm": 1.4625848333242037, + "language_loss": 0.8396889, + "learning_rate": 2.674495859860601e-06, + "loss": 0.86120105, + "num_input_tokens_seen": 145940350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 6799, + "time_per_iteration": 2.5059525966644287 + }, + { + "auxiliary_loss_clip": 0.01118535, + "auxiliary_loss_mlp": 0.01043278, + "balance_loss_clip": 1.02861547, + "balance_loss_mlp": 1.04282522, + "epoch": 0.4088381181421915, + "flos": 20918791664640.0, + "grad_norm": 2.2336787226224453, + "language_loss": 0.83352369, + "learning_rate": 2.6741292016681e-06, + "loss": 0.85514188, + "num_input_tokens_seen": 145957460, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 6800, + "time_per_iteration": 2.441771984100342 + }, + { + "auxiliary_loss_clip": 0.01118367, + "auxiliary_loss_mlp": 0.01041201, + "balance_loss_clip": 1.02665734, + "balance_loss_mlp": 1.04080248, + "epoch": 0.4088982413948595, + "flos": 13297578462720.0, + "grad_norm": 1.815509221734431, + "language_loss": 0.74838769, + "learning_rate": 2.6737625179125514e-06, + "loss": 0.76998335, + "num_input_tokens_seen": 145975285, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 6801, + "time_per_iteration": 2.4573957920074463 + }, + { + "auxiliary_loss_clip": 0.01118841, + "auxiliary_loss_mlp": 0.01039037, + "balance_loss_clip": 1.02418303, + "balance_loss_mlp": 1.04115379, + "epoch": 0.40895836464752744, + "flos": 15267673664640.0, + "grad_norm": 3.5876275394170682, + "language_loss": 0.79983771, + "learning_rate": 2.673395808607861e-06, + "loss": 0.8214165, + "num_input_tokens_seen": 145989150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 6802, + "time_per_iteration": 2.4583706855773926 + }, + { + "auxiliary_loss_clip": 0.01121333, + "auxiliary_loss_mlp": 0.01040482, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04269981, + "epoch": 0.4090184879001954, + "flos": 14501411804160.0, + "grad_norm": 1.9920926766799116, + "language_loss": 0.75564265, + "learning_rate": 2.673029073767934e-06, + "loss": 0.77726078, + "num_input_tokens_seen": 146006980, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7890625, + "step": 6803, + "time_per_iteration": 3.8293817043304443 + }, + { + "auxiliary_loss_clip": 0.01117955, + "auxiliary_loss_mlp": 0.01037436, + "balance_loss_clip": 1.02296996, + "balance_loss_mlp": 1.04163659, + "epoch": 0.40907861115286337, + "flos": 13881593692800.0, + "grad_norm": 1.8273723177462575, + "language_loss": 0.78676009, + "learning_rate": 2.6726623134066764e-06, + "loss": 0.80831397, + "num_input_tokens_seen": 146025125, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6804, + "time_per_iteration": 5.276589393615723 + }, + { + "auxiliary_loss_clip": 0.01121753, + "auxiliary_loss_mlp": 0.01038873, + "balance_loss_clip": 1.02486575, + "balance_loss_mlp": 1.04170704, + "epoch": 0.40913873440553133, + "flos": 28037615293440.0, + "grad_norm": 1.824409853433396, + "language_loss": 0.74958569, + "learning_rate": 2.672295527537998e-06, + "loss": 0.77119195, + "num_input_tokens_seen": 146044990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.80078125, + "step": 6805, + "time_per_iteration": 2.4856061935424805 + }, + { + "auxiliary_loss_clip": 0.01121334, + "auxiliary_loss_mlp": 0.01040926, + "balance_loss_clip": 1.02701998, + "balance_loss_mlp": 1.04323924, + "epoch": 0.4091988576581993, + "flos": 21618188357760.0, + "grad_norm": 1.6270528279533119, + "language_loss": 0.79471934, + "learning_rate": 2.671928716175804e-06, + "loss": 0.816342, + "num_input_tokens_seen": 146066045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 6806, + "time_per_iteration": 2.4999823570251465 + }, + { + "auxiliary_loss_clip": 0.01120343, + "auxiliary_loss_mlp": 0.01034262, + "balance_loss_clip": 1.02002871, + "balance_loss_mlp": 1.04182625, + "epoch": 0.40925898091086726, + "flos": 25224085860480.0, + "grad_norm": 1.8904572172377134, + "language_loss": 0.72131455, + "learning_rate": 2.671561879334007e-06, + "loss": 0.74286067, + "num_input_tokens_seen": 146086280, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78515625, + "step": 6807, + "time_per_iteration": 2.4900894165039062 + }, + { + "auxiliary_loss_clip": 0.01035827, + "auxiliary_loss_mlp": 0.01000695, + "balance_loss_clip": 0.99931204, + "balance_loss_mlp": 1.01169431, + "epoch": 0.40931910416353523, + "flos": 68930568800640.0, + "grad_norm": 0.8333385820049739, + "language_loss": 0.58798856, + "learning_rate": 2.6711950170265155e-06, + "loss": 0.60835379, + "num_input_tokens_seen": 146148840, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24121094, + "step": 6808, + "time_per_iteration": 3.1670446395874023 + }, + { + "auxiliary_loss_clip": 0.0111783, + "auxiliary_loss_mlp": 0.01047199, + "balance_loss_clip": 1.03397894, + "balance_loss_mlp": 1.04200959, + "epoch": 0.4093792274162032, + "flos": 20189553747840.0, + "grad_norm": 1.6310291749342813, + "language_loss": 0.54454345, + "learning_rate": 2.670828129267242e-06, + "loss": 0.56619376, + "num_input_tokens_seen": 146166195, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 6809, + "time_per_iteration": 2.445084571838379 + }, + { + "auxiliary_loss_clip": 0.01117961, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0176785, + "balance_loss_mlp": 1.0413785, + "epoch": 0.40943935066887116, + "flos": 25228754628480.0, + "grad_norm": 1.8964783600080724, + "language_loss": 0.83296275, + "learning_rate": 2.6704612160700983e-06, + "loss": 0.85445428, + "num_input_tokens_seen": 146185045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6810, + "time_per_iteration": 2.507234573364258 + }, + { + "auxiliary_loss_clip": 0.01121577, + "auxiliary_loss_mlp": 0.01042346, + "balance_loss_clip": 1.02736187, + "balance_loss_mlp": 1.04350328, + "epoch": 0.4094994739215392, + "flos": 23255319461760.0, + "grad_norm": 2.219108175656967, + "language_loss": 0.77739668, + "learning_rate": 2.670094277448999e-06, + "loss": 0.79903591, + "num_input_tokens_seen": 146204655, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.78125, + "step": 6811, + "time_per_iteration": 2.4652421474456787 + }, + { + "auxiliary_loss_clip": 0.01118877, + "auxiliary_loss_mlp": 0.01033588, + "balance_loss_clip": 1.01804352, + "balance_loss_mlp": 1.04151464, + "epoch": 0.40955959717420715, + "flos": 17382165540480.0, + "grad_norm": 1.8555113442690365, + "language_loss": 0.69810557, + "learning_rate": 2.669727313417857e-06, + "loss": 0.7196303, + "num_input_tokens_seen": 146222000, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.7734375, + "step": 6812, + "time_per_iteration": 2.4447555541992188 + }, + { + "auxiliary_loss_clip": 0.0111498, + "auxiliary_loss_mlp": 0.01040667, + "balance_loss_clip": 1.02644539, + "balance_loss_mlp": 1.03930998, + "epoch": 0.4096197204268751, + "flos": 25082418620160.0, + "grad_norm": 1.4849650877087106, + "language_loss": 0.66131341, + "learning_rate": 2.6693603239905872e-06, + "loss": 0.68286985, + "num_input_tokens_seen": 146242630, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 6813, + "time_per_iteration": 2.461461067199707 + }, + { + "auxiliary_loss_clip": 0.01115791, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.02209592, + "balance_loss_mlp": 1.04076779, + "epoch": 0.4096798436795431, + "flos": 30586769648640.0, + "grad_norm": 1.8347983960230858, + "language_loss": 0.73899138, + "learning_rate": 2.6689933091811087e-06, + "loss": 0.76051652, + "num_input_tokens_seen": 146263070, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6814, + "time_per_iteration": 2.5444507598876953 + }, + { + "auxiliary_loss_clip": 0.01120309, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02035785, + "balance_loss_mlp": 1.04147315, + "epoch": 0.40973996693221104, + "flos": 24133622820480.0, + "grad_norm": 2.162963447393967, + "language_loss": 0.65966797, + "learning_rate": 2.6686262690033357e-06, + "loss": 0.68121737, + "num_input_tokens_seen": 146282890, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6815, + "time_per_iteration": 2.4877898693084717 + }, + { + "auxiliary_loss_clip": 0.01116543, + "auxiliary_loss_mlp": 0.01037411, + "balance_loss_clip": 1.02371955, + "balance_loss_mlp": 1.04337275, + "epoch": 0.409800090184879, + "flos": 23988974751360.0, + "grad_norm": 1.6370882031659308, + "language_loss": 0.76553667, + "learning_rate": 2.668259203471188e-06, + "loss": 0.78707623, + "num_input_tokens_seen": 146301755, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 6816, + "time_per_iteration": 2.5013954639434814 + }, + { + "auxiliary_loss_clip": 0.01119372, + "auxiliary_loss_mlp": 0.01038562, + "balance_loss_clip": 1.02404261, + "balance_loss_mlp": 1.04302227, + "epoch": 0.40986021343754697, + "flos": 16143678552960.0, + "grad_norm": 2.8457932880819463, + "language_loss": 0.81718624, + "learning_rate": 2.6678921125985843e-06, + "loss": 0.8387655, + "num_input_tokens_seen": 146316835, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6817, + "time_per_iteration": 2.407566785812378 + }, + { + "auxiliary_loss_clip": 0.01121536, + "auxiliary_loss_mlp": 0.01037881, + "balance_loss_clip": 1.02179992, + "balance_loss_mlp": 1.04166436, + "epoch": 0.40992033669021494, + "flos": 24790824011520.0, + "grad_norm": 1.7366839484469832, + "language_loss": 0.79938078, + "learning_rate": 2.667524996399444e-06, + "loss": 0.82097495, + "num_input_tokens_seen": 146336650, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.80078125, + "step": 6818, + "time_per_iteration": 2.49364972114563 + }, + { + "auxiliary_loss_clip": 0.01114596, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02288651, + "balance_loss_mlp": 1.03982878, + "epoch": 0.4099804599428829, + "flos": 29641888431360.0, + "grad_norm": 1.4683684500872527, + "language_loss": 0.65939564, + "learning_rate": 2.66715785488769e-06, + "loss": 0.68090701, + "num_input_tokens_seen": 146357640, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6819, + "time_per_iteration": 2.5122451782226562 + }, + { + "auxiliary_loss_clip": 0.01123256, + "auxiliary_loss_mlp": 0.0103677, + "balance_loss_clip": 1.02191615, + "balance_loss_mlp": 1.04243147, + "epoch": 0.41004058319555087, + "flos": 24826590979200.0, + "grad_norm": 1.4566856211473176, + "language_loss": 0.85411352, + "learning_rate": 2.6667906880772428e-06, + "loss": 0.87571383, + "num_input_tokens_seen": 146379325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.80859375, + "step": 6820, + "time_per_iteration": 2.4924051761627197 + }, + { + "auxiliary_loss_clip": 0.01116594, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.0189811, + "balance_loss_mlp": 1.04211807, + "epoch": 0.41010070644821883, + "flos": 25737464995200.0, + "grad_norm": 1.9363068637508836, + "language_loss": 0.71033639, + "learning_rate": 2.6664234959820256e-06, + "loss": 0.73183382, + "num_input_tokens_seen": 146398635, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 6821, + "time_per_iteration": 2.5236756801605225 + }, + { + "auxiliary_loss_clip": 0.01115707, + "auxiliary_loss_mlp": 0.01032479, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.03997672, + "epoch": 0.4101608297008868, + "flos": 22346061557760.0, + "grad_norm": 2.2789873913326404, + "language_loss": 0.74732232, + "learning_rate": 2.6660562786159634e-06, + "loss": 0.76880419, + "num_input_tokens_seen": 146417585, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6822, + "time_per_iteration": 2.485173225402832 + }, + { + "auxiliary_loss_clip": 0.01117367, + "auxiliary_loss_mlp": 0.01036407, + "balance_loss_clip": 1.02226305, + "balance_loss_mlp": 1.04145467, + "epoch": 0.41022095295355476, + "flos": 21945083057280.0, + "grad_norm": 1.8990120981529888, + "language_loss": 0.7503438, + "learning_rate": 2.6656890359929796e-06, + "loss": 0.77188146, + "num_input_tokens_seen": 146437035, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 6823, + "time_per_iteration": 2.46115779876709 + }, + { + "auxiliary_loss_clip": 0.01124707, + "auxiliary_loss_mlp": 0.01039141, + "balance_loss_clip": 1.02359605, + "balance_loss_mlp": 1.04229724, + "epoch": 0.4102810762062228, + "flos": 27450511493760.0, + "grad_norm": 2.6227876605231986, + "language_loss": 0.73347652, + "learning_rate": 2.665321768127001e-06, + "loss": 0.75511503, + "num_input_tokens_seen": 146457370, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.8203125, + "step": 6824, + "time_per_iteration": 2.504561185836792 + }, + { + "auxiliary_loss_clip": 0.01120752, + "auxiliary_loss_mlp": 0.01035065, + "balance_loss_clip": 1.01985359, + "balance_loss_mlp": 1.04105759, + "epoch": 0.41034119945889075, + "flos": 24499265316480.0, + "grad_norm": 2.228764168551681, + "language_loss": 0.71601099, + "learning_rate": 2.6649544750319548e-06, + "loss": 0.73756915, + "num_input_tokens_seen": 146478105, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 6825, + "time_per_iteration": 2.476551055908203 + }, + { + "auxiliary_loss_clip": 0.01117579, + "auxiliary_loss_mlp": 0.0103678, + "balance_loss_clip": 1.02359533, + "balance_loss_mlp": 1.04292464, + "epoch": 0.4104013227115587, + "flos": 24352641999360.0, + "grad_norm": 1.9864880407367733, + "language_loss": 0.84743512, + "learning_rate": 2.664587156721768e-06, + "loss": 0.86897874, + "num_input_tokens_seen": 146497835, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 6826, + "time_per_iteration": 2.492030382156372 + }, + { + "auxiliary_loss_clip": 0.01117058, + "auxiliary_loss_mlp": 0.01035255, + "balance_loss_clip": 1.02066422, + "balance_loss_mlp": 1.0431006, + "epoch": 0.4104614459642267, + "flos": 23729340268800.0, + "grad_norm": 1.962634793360081, + "language_loss": 0.66582263, + "learning_rate": 2.6642198132103696e-06, + "loss": 0.68734574, + "num_input_tokens_seen": 146517735, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 6827, + "time_per_iteration": 2.4629759788513184 + }, + { + "auxiliary_loss_clip": 0.01113749, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.01799607, + "balance_loss_mlp": 1.03989482, + "epoch": 0.41052156921689464, + "flos": 22127976132480.0, + "grad_norm": 1.3616881749334155, + "language_loss": 0.72346127, + "learning_rate": 2.663852444511689e-06, + "loss": 0.74491906, + "num_input_tokens_seen": 146537640, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6828, + "time_per_iteration": 2.4807186126708984 + }, + { + "auxiliary_loss_clip": 0.01120586, + "auxiliary_loss_mlp": 0.01042781, + "balance_loss_clip": 1.02777803, + "balance_loss_mlp": 1.0410856, + "epoch": 0.4105816924695626, + "flos": 20084371747200.0, + "grad_norm": 1.900432401993592, + "language_loss": 0.83422399, + "learning_rate": 2.6634850506396574e-06, + "loss": 0.85585773, + "num_input_tokens_seen": 146554695, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6829, + "time_per_iteration": 2.4298055171966553 + }, + { + "auxiliary_loss_clip": 0.01114334, + "auxiliary_loss_mlp": 0.0103303, + "balance_loss_clip": 1.01940441, + "balance_loss_mlp": 1.03960419, + "epoch": 0.4106418157222306, + "flos": 18076785724800.0, + "grad_norm": 1.5044787550344432, + "language_loss": 0.9002744, + "learning_rate": 2.663117631608206e-06, + "loss": 0.92174798, + "num_input_tokens_seen": 146573740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6830, + "time_per_iteration": 2.4607503414154053 + }, + { + "auxiliary_loss_clip": 0.01115903, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01268673, + "balance_loss_mlp": 1.04088628, + "epoch": 0.41070193897489854, + "flos": 21647850013440.0, + "grad_norm": 2.455330668305064, + "language_loss": 0.65950698, + "learning_rate": 2.662750187431268e-06, + "loss": 0.68092537, + "num_input_tokens_seen": 146592885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 6831, + "time_per_iteration": 2.4402008056640625 + }, + { + "auxiliary_loss_clip": 0.01114416, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02233577, + "balance_loss_mlp": 1.04019713, + "epoch": 0.4107620622275665, + "flos": 26648195356800.0, + "grad_norm": 1.7503077174044546, + "language_loss": 0.69414657, + "learning_rate": 2.662382718122776e-06, + "loss": 0.71564817, + "num_input_tokens_seen": 146611995, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 6832, + "time_per_iteration": 2.4985976219177246 + }, + { + "auxiliary_loss_clip": 0.0111274, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02265322, + "balance_loss_mlp": 1.03861785, + "epoch": 0.41082218548023447, + "flos": 18734310138240.0, + "grad_norm": 2.137055635154832, + "language_loss": 0.73675501, + "learning_rate": 2.662015223696666e-06, + "loss": 0.75824058, + "num_input_tokens_seen": 146628045, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6833, + "time_per_iteration": 2.423802375793457 + }, + { + "auxiliary_loss_clip": 0.01120262, + "auxiliary_loss_mlp": 0.01041415, + "balance_loss_clip": 1.02648401, + "balance_loss_mlp": 1.04171228, + "epoch": 0.41088230873290243, + "flos": 22893771116160.0, + "grad_norm": 1.6404428787043481, + "language_loss": 0.72538, + "learning_rate": 2.6616477041668713e-06, + "loss": 0.74699682, + "num_input_tokens_seen": 146648355, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 6834, + "time_per_iteration": 2.5415680408477783 + }, + { + "auxiliary_loss_clip": 0.01119029, + "auxiliary_loss_mlp": 0.01044226, + "balance_loss_clip": 1.03027868, + "balance_loss_mlp": 1.04038835, + "epoch": 0.4109424319855704, + "flos": 24276978000000.0, + "grad_norm": 2.0754355899076717, + "language_loss": 0.71026015, + "learning_rate": 2.661280159547329e-06, + "loss": 0.7318927, + "num_input_tokens_seen": 146668370, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 6835, + "time_per_iteration": 2.4709722995758057 + }, + { + "auxiliary_loss_clip": 0.01118649, + "auxiliary_loss_mlp": 0.01040202, + "balance_loss_clip": 1.02521181, + "balance_loss_mlp": 1.04203069, + "epoch": 0.41100255523823837, + "flos": 12969139478400.0, + "grad_norm": 1.9290870315127813, + "language_loss": 0.86998641, + "learning_rate": 2.660912589851978e-06, + "loss": 0.89157486, + "num_input_tokens_seen": 146686665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 6836, + "time_per_iteration": 2.4478323459625244 + }, + { + "auxiliary_loss_clip": 0.01114601, + "auxiliary_loss_mlp": 0.01038609, + "balance_loss_clip": 1.02464342, + "balance_loss_mlp": 1.040609, + "epoch": 0.4110626784909064, + "flos": 23145648261120.0, + "grad_norm": 1.7219230799083993, + "language_loss": 0.69017011, + "learning_rate": 2.6605449950947547e-06, + "loss": 0.71170223, + "num_input_tokens_seen": 146706570, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 6837, + "time_per_iteration": 2.4600830078125 + }, + { + "auxiliary_loss_clip": 0.01116898, + "auxiliary_loss_mlp": 0.01038198, + "balance_loss_clip": 1.02394605, + "balance_loss_mlp": 1.04047167, + "epoch": 0.41112280174357435, + "flos": 22747399194240.0, + "grad_norm": 1.7295939332860302, + "language_loss": 0.75087547, + "learning_rate": 2.660177375289599e-06, + "loss": 0.77242649, + "num_input_tokens_seen": 146723425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6838, + "time_per_iteration": 2.460449695587158 + }, + { + "auxiliary_loss_clip": 0.01115474, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02075219, + "balance_loss_mlp": 1.04058707, + "epoch": 0.4111829249962423, + "flos": 21102403011840.0, + "grad_norm": 1.8679563507274572, + "language_loss": 0.82247162, + "learning_rate": 2.659809730450451e-06, + "loss": 0.84398103, + "num_input_tokens_seen": 146741640, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 6839, + "time_per_iteration": 2.4339215755462646 + }, + { + "auxiliary_loss_clip": 0.01112221, + "auxiliary_loss_mlp": 0.01032315, + "balance_loss_clip": 1.01875496, + "balance_loss_mlp": 1.03766727, + "epoch": 0.4112430482489103, + "flos": 21505787723520.0, + "grad_norm": 1.9294791670505813, + "language_loss": 0.80338049, + "learning_rate": 2.6594420605912523e-06, + "loss": 0.82482588, + "num_input_tokens_seen": 146759195, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 6840, + "time_per_iteration": 2.464096784591675 + }, + { + "auxiliary_loss_clip": 0.01111724, + "auxiliary_loss_mlp": 0.01034503, + "balance_loss_clip": 1.02119339, + "balance_loss_mlp": 1.03856099, + "epoch": 0.41130317150157825, + "flos": 19570022945280.0, + "grad_norm": 1.7525143939260106, + "language_loss": 0.67515284, + "learning_rate": 2.6590743657259442e-06, + "loss": 0.6966151, + "num_input_tokens_seen": 146774990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 6841, + "time_per_iteration": 2.412872314453125 + }, + { + "auxiliary_loss_clip": 0.01035921, + "auxiliary_loss_mlp": 0.01010132, + "balance_loss_clip": 1.00880933, + "balance_loss_mlp": 1.01203704, + "epoch": 0.4113632947542462, + "flos": 62383157706240.0, + "grad_norm": 0.7700890610990695, + "language_loss": 0.5963515, + "learning_rate": 2.65870664586847e-06, + "loss": 0.61681211, + "num_input_tokens_seen": 146839610, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23828125, + "step": 6842, + "time_per_iteration": 3.167282819747925 + }, + { + "auxiliary_loss_clip": 0.01111896, + "auxiliary_loss_mlp": 0.01033298, + "balance_loss_clip": 1.02044773, + "balance_loss_mlp": 1.04057288, + "epoch": 0.4114234180069142, + "flos": 13918617636480.0, + "grad_norm": 2.121884132790859, + "language_loss": 0.69212461, + "learning_rate": 2.6583389010327742e-06, + "loss": 0.71357656, + "num_input_tokens_seen": 146857360, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 6843, + "time_per_iteration": 2.4664626121520996 + }, + { + "auxiliary_loss_clip": 0.01035393, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00222576, + "balance_loss_mlp": 1.01154804, + "epoch": 0.41148354125958214, + "flos": 64928505219840.0, + "grad_norm": 0.7178401469554447, + "language_loss": 0.53669417, + "learning_rate": 2.6579711312328013e-06, + "loss": 0.55708587, + "num_input_tokens_seen": 146917055, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23828125, + "step": 6844, + "time_per_iteration": 3.0998694896698 + }, + { + "auxiliary_loss_clip": 0.0111189, + "auxiliary_loss_mlp": 0.0103482, + "balance_loss_clip": 1.02213013, + "balance_loss_mlp": 1.03937054, + "epoch": 0.4115436645122501, + "flos": 18728779443840.0, + "grad_norm": 1.6545259135728443, + "language_loss": 0.66114587, + "learning_rate": 2.6576033364824967e-06, + "loss": 0.68261302, + "num_input_tokens_seen": 146935215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 6845, + "time_per_iteration": 6.8190038204193115 + }, + { + "auxiliary_loss_clip": 0.01113046, + "auxiliary_loss_mlp": 0.01034986, + "balance_loss_clip": 1.0221113, + "balance_loss_mlp": 1.04133987, + "epoch": 0.41160378776491807, + "flos": 16252918790400.0, + "grad_norm": 1.8380761864561301, + "language_loss": 0.70359266, + "learning_rate": 2.657235516795808e-06, + "loss": 0.72507298, + "num_input_tokens_seen": 146951970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 6846, + "time_per_iteration": 3.941171646118164 + }, + { + "auxiliary_loss_clip": 0.01112317, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02163363, + "balance_loss_mlp": 1.03892803, + "epoch": 0.41166391101758604, + "flos": 27970031854080.0, + "grad_norm": 1.507800360258476, + "language_loss": 0.64964008, + "learning_rate": 2.6568676721866826e-06, + "loss": 0.67111951, + "num_input_tokens_seen": 146975615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 6847, + "time_per_iteration": 2.5782458782196045 + }, + { + "auxiliary_loss_clip": 0.01112352, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02921724, + "balance_loss_mlp": 1.03790998, + "epoch": 0.411724034270254, + "flos": 34131296764800.0, + "grad_norm": 1.3239337291849294, + "language_loss": 0.70368952, + "learning_rate": 2.656499802669069e-06, + "loss": 0.72524321, + "num_input_tokens_seen": 146998855, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 6848, + "time_per_iteration": 2.552729606628418 + }, + { + "auxiliary_loss_clip": 0.01035603, + "auxiliary_loss_mlp": 0.00998835, + "balance_loss_clip": 0.99738103, + "balance_loss_mlp": 1.01178169, + "epoch": 0.41178415752292197, + "flos": 67923670752000.0, + "grad_norm": 0.8862972606407307, + "language_loss": 0.56235039, + "learning_rate": 2.6561319082569174e-06, + "loss": 0.58269477, + "num_input_tokens_seen": 147062710, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.23828125, + "step": 6849, + "time_per_iteration": 3.144639730453491 + }, + { + "auxiliary_loss_clip": 0.01112679, + "auxiliary_loss_mlp": 0.01036148, + "balance_loss_clip": 1.02255821, + "balance_loss_mlp": 1.04060721, + "epoch": 0.41184428077558993, + "flos": 34313938444800.0, + "grad_norm": 1.58670522574793, + "language_loss": 0.76169646, + "learning_rate": 2.6557639889641783e-06, + "loss": 0.78318465, + "num_input_tokens_seen": 147086075, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 6850, + "time_per_iteration": 2.5668234825134277 + }, + { + "auxiliary_loss_clip": 0.01111269, + "auxiliary_loss_mlp": 0.01033693, + "balance_loss_clip": 1.02075291, + "balance_loss_mlp": 1.03937149, + "epoch": 0.41190440402825795, + "flos": 35444118948480.0, + "grad_norm": 1.4904377439692653, + "language_loss": 0.67717403, + "learning_rate": 2.6553960448048025e-06, + "loss": 0.69862366, + "num_input_tokens_seen": 147107590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 6851, + "time_per_iteration": 2.588646650314331 + }, + { + "auxiliary_loss_clip": 0.01116771, + "auxiliary_loss_mlp": 0.01043217, + "balance_loss_clip": 1.02792835, + "balance_loss_mlp": 1.03957748, + "epoch": 0.4119645272809259, + "flos": 20849879422080.0, + "grad_norm": 2.5339755397297776, + "language_loss": 0.79547226, + "learning_rate": 2.655028075792743e-06, + "loss": 0.81707215, + "num_input_tokens_seen": 147123715, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76953125, + "step": 6852, + "time_per_iteration": 2.4342472553253174 + }, + { + "auxiliary_loss_clip": 0.01120035, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.01818419, + "balance_loss_mlp": 1.04227197, + "epoch": 0.4120246505335939, + "flos": 27562050201600.0, + "grad_norm": 3.302073757908878, + "language_loss": 0.78002989, + "learning_rate": 2.6546600819419537e-06, + "loss": 0.80156463, + "num_input_tokens_seen": 147144290, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.77734375, + "step": 6853, + "time_per_iteration": 2.536959409713745 + }, + { + "auxiliary_loss_clip": 0.01118617, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.022156, + "balance_loss_mlp": 1.04021645, + "epoch": 0.41208477378626185, + "flos": 37815444046080.0, + "grad_norm": 1.636675456410819, + "language_loss": 0.65871978, + "learning_rate": 2.6542920632663883e-06, + "loss": 0.68027961, + "num_input_tokens_seen": 147166340, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 6854, + "time_per_iteration": 2.587641477584839 + }, + { + "auxiliary_loss_clip": 0.01113423, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.01973081, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4121448970389298, + "flos": 23440762402560.0, + "grad_norm": 1.819965675297277, + "language_loss": 0.83530807, + "learning_rate": 2.6539240197800023e-06, + "loss": 0.85676759, + "num_input_tokens_seen": 147184025, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 6855, + "time_per_iteration": 2.5173020362854004 + }, + { + "auxiliary_loss_clip": 0.01112, + "auxiliary_loss_mlp": 0.01036416, + "balance_loss_clip": 1.02333903, + "balance_loss_mlp": 1.03945315, + "epoch": 0.4122050202915978, + "flos": 21325300859520.0, + "grad_norm": 1.701531451547931, + "language_loss": 0.7926302, + "learning_rate": 2.6535559514967517e-06, + "loss": 0.81411433, + "num_input_tokens_seen": 147202730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 6856, + "time_per_iteration": 2.4496660232543945 + }, + { + "auxiliary_loss_clip": 0.01115557, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.021119, + "balance_loss_mlp": 1.04115629, + "epoch": 0.41226514354426574, + "flos": 17306286059520.0, + "grad_norm": 6.346447490864035, + "language_loss": 0.79253089, + "learning_rate": 2.6531878584305935e-06, + "loss": 0.81403255, + "num_input_tokens_seen": 147215315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6857, + "time_per_iteration": 2.454458236694336 + }, + { + "auxiliary_loss_clip": 0.01114343, + "auxiliary_loss_mlp": 0.01036012, + "balance_loss_clip": 1.02169538, + "balance_loss_mlp": 1.03821683, + "epoch": 0.4123252667969337, + "flos": 17638855107840.0, + "grad_norm": 1.6045712878894351, + "language_loss": 0.70696247, + "learning_rate": 2.6528197405954873e-06, + "loss": 0.72846603, + "num_input_tokens_seen": 147233330, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 6858, + "time_per_iteration": 2.453808069229126 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01035636, + "balance_loss_clip": 1.02162266, + "balance_loss_mlp": 1.04016411, + "epoch": 0.4123853900496017, + "flos": 46424811375360.0, + "grad_norm": 1.4836752505963042, + "language_loss": 0.59489501, + "learning_rate": 2.652451598005391e-06, + "loss": 0.61638969, + "num_input_tokens_seen": 147257780, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73828125, + "step": 6859, + "time_per_iteration": 2.6645431518554688 + }, + { + "auxiliary_loss_clip": 0.01112236, + "auxiliary_loss_mlp": 0.01036677, + "balance_loss_clip": 1.02283669, + "balance_loss_mlp": 1.03694463, + "epoch": 0.41244551330226964, + "flos": 17675160779520.0, + "grad_norm": 2.017738864380765, + "language_loss": 0.73062313, + "learning_rate": 2.652083430674264e-06, + "loss": 0.75211227, + "num_input_tokens_seen": 147276055, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 6860, + "time_per_iteration": 2.4230310916900635 + }, + { + "auxiliary_loss_clip": 0.01111098, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02037311, + "balance_loss_mlp": 1.03779876, + "epoch": 0.4125056365549376, + "flos": 18693730748160.0, + "grad_norm": 1.603033952512427, + "language_loss": 0.74057221, + "learning_rate": 2.651715238616068e-06, + "loss": 0.76201528, + "num_input_tokens_seen": 147293200, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 6861, + "time_per_iteration": 2.466261863708496 + }, + { + "auxiliary_loss_clip": 0.01111959, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.0190326, + "balance_loss_mlp": 1.04026282, + "epoch": 0.41256575980760557, + "flos": 17895293280000.0, + "grad_norm": 2.017273954904035, + "language_loss": 0.79431915, + "learning_rate": 2.651347021844765e-06, + "loss": 0.81575066, + "num_input_tokens_seen": 147310640, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6862, + "time_per_iteration": 2.4272851943969727 + }, + { + "auxiliary_loss_clip": 0.01115421, + "auxiliary_loss_mlp": 0.01032509, + "balance_loss_clip": 1.01946771, + "balance_loss_mlp": 1.04104841, + "epoch": 0.41262588306027354, + "flos": 21981316901760.0, + "grad_norm": 1.7023318630513873, + "language_loss": 0.76025152, + "learning_rate": 2.650978780374318e-06, + "loss": 0.78173077, + "num_input_tokens_seen": 147329435, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 6863, + "time_per_iteration": 2.491703987121582 + }, + { + "auxiliary_loss_clip": 0.01034073, + "auxiliary_loss_mlp": 0.01002883, + "balance_loss_clip": 1.00128579, + "balance_loss_mlp": 1.01038253, + "epoch": 0.41268600631294156, + "flos": 53350006740480.0, + "grad_norm": 0.6998724627349664, + "language_loss": 0.52726007, + "learning_rate": 2.650610514218691e-06, + "loss": 0.54762965, + "num_input_tokens_seen": 147385805, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.23632812, + "step": 6864, + "time_per_iteration": 3.05096173286438 + }, + { + "auxiliary_loss_clip": 0.01117449, + "auxiliary_loss_mlp": 0.01034441, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04010963, + "epoch": 0.4127461295656095, + "flos": 24385356311040.0, + "grad_norm": 1.8277977271365335, + "language_loss": 0.72328234, + "learning_rate": 2.6502422233918468e-06, + "loss": 0.74480128, + "num_input_tokens_seen": 147405160, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6865, + "time_per_iteration": 2.5138418674468994 + }, + { + "auxiliary_loss_clip": 0.0103371, + "auxiliary_loss_mlp": 0.01003681, + "balance_loss_clip": 1.00216091, + "balance_loss_mlp": 1.00997901, + "epoch": 0.4128062528182775, + "flos": 71705242696320.0, + "grad_norm": 0.9175964026476935, + "language_loss": 0.66545808, + "learning_rate": 2.649873907907753e-06, + "loss": 0.68583202, + "num_input_tokens_seen": 147460245, + "router_z_loss_clip": 0.01519775, + "router_z_loss_mlp": 0.23730469, + "step": 6866, + "time_per_iteration": 2.965301513671875 + }, + { + "auxiliary_loss_clip": 0.01111664, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02442443, + "balance_loss_mlp": 1.03779757, + "epoch": 0.41286637607094545, + "flos": 17849111368320.0, + "grad_norm": 1.9494269702964535, + "language_loss": 0.80854523, + "learning_rate": 2.649505567780375e-06, + "loss": 0.8300401, + "num_input_tokens_seen": 147476200, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6867, + "time_per_iteration": 2.4153382778167725 + }, + { + "auxiliary_loss_clip": 0.01118424, + "auxiliary_loss_mlp": 0.01037514, + "balance_loss_clip": 1.02335191, + "balance_loss_mlp": 1.04141474, + "epoch": 0.4129264993236134, + "flos": 25549544016000.0, + "grad_norm": 2.031901046820099, + "language_loss": 0.77580094, + "learning_rate": 2.6491372030236815e-06, + "loss": 0.7973603, + "num_input_tokens_seen": 147494315, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 6868, + "time_per_iteration": 2.535595178604126 + }, + { + "auxiliary_loss_clip": 0.01033303, + "auxiliary_loss_mlp": 0.00999485, + "balance_loss_clip": 0.99789923, + "balance_loss_mlp": 1.0095768, + "epoch": 0.4129866225762814, + "flos": 65414446364160.0, + "grad_norm": 0.8413704541135547, + "language_loss": 0.5779494, + "learning_rate": 2.64876881365164e-06, + "loss": 0.59827721, + "num_input_tokens_seen": 147543665, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23730469, + "step": 6869, + "time_per_iteration": 2.8164174556732178 + }, + { + "auxiliary_loss_clip": 0.01112645, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02101028, + "balance_loss_mlp": 1.03904057, + "epoch": 0.41304674582894935, + "flos": 28876991287680.0, + "grad_norm": 1.6360017889096097, + "language_loss": 0.74995548, + "learning_rate": 2.64840039967822e-06, + "loss": 0.77143168, + "num_input_tokens_seen": 147564870, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 6870, + "time_per_iteration": 2.5370054244995117 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02757072, + "balance_loss_mlp": 1.03925085, + "epoch": 0.4131068690816173, + "flos": 22891975436160.0, + "grad_norm": 1.504144022647526, + "language_loss": 0.83272427, + "learning_rate": 2.6480319611173912e-06, + "loss": 0.85428846, + "num_input_tokens_seen": 147584840, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75, + "step": 6871, + "time_per_iteration": 2.596686601638794 + }, + { + "auxiliary_loss_clip": 0.01117357, + "auxiliary_loss_mlp": 0.01041675, + "balance_loss_clip": 1.02738237, + "balance_loss_mlp": 1.04108167, + "epoch": 0.4131669923342853, + "flos": 26065185707520.0, + "grad_norm": 5.838045745285431, + "language_loss": 0.68951505, + "learning_rate": 2.6476634979831263e-06, + "loss": 0.71110535, + "num_input_tokens_seen": 147604635, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 6872, + "time_per_iteration": 2.6045477390289307 + }, + { + "auxiliary_loss_clip": 0.01115693, + "auxiliary_loss_mlp": 0.01035465, + "balance_loss_clip": 1.02197695, + "balance_loss_mlp": 1.04050374, + "epoch": 0.41322711558695324, + "flos": 19244564789760.0, + "grad_norm": 1.864312912622832, + "language_loss": 0.75716275, + "learning_rate": 2.6472950102893964e-06, + "loss": 0.7786743, + "num_input_tokens_seen": 147620700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 6873, + "time_per_iteration": 2.4200570583343506 + }, + { + "auxiliary_loss_clip": 0.01117091, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.01943827, + "balance_loss_mlp": 1.04055679, + "epoch": 0.4132872388396212, + "flos": 22674464628480.0, + "grad_norm": 1.671510122752512, + "language_loss": 0.82721817, + "learning_rate": 2.6469264980501746e-06, + "loss": 0.84872413, + "num_input_tokens_seen": 147639490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 6874, + "time_per_iteration": 2.4689133167266846 + }, + { + "auxiliary_loss_clip": 0.01116401, + "auxiliary_loss_mlp": 0.01034964, + "balance_loss_clip": 1.02054608, + "balance_loss_mlp": 1.0397824, + "epoch": 0.4133473620922892, + "flos": 20150195420160.0, + "grad_norm": 2.003609916019722, + "language_loss": 0.71075761, + "learning_rate": 2.646557961279436e-06, + "loss": 0.73227131, + "num_input_tokens_seen": 147657205, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 6875, + "time_per_iteration": 2.4145123958587646 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01040098, + "balance_loss_clip": 1.02686, + "balance_loss_mlp": 1.04001451, + "epoch": 0.41340748534495714, + "flos": 24242755317120.0, + "grad_norm": 1.617534223510663, + "language_loss": 0.82538921, + "learning_rate": 2.646189399991154e-06, + "loss": 0.84689927, + "num_input_tokens_seen": 147677005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 6876, + "time_per_iteration": 2.49533748626709 + }, + { + "auxiliary_loss_clip": 0.01118483, + "auxiliary_loss_mlp": 0.010388, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.03916812, + "epoch": 0.41346760859762516, + "flos": 14392171566720.0, + "grad_norm": 2.858959916779265, + "language_loss": 0.65397477, + "learning_rate": 2.6458208141993048e-06, + "loss": 0.6755476, + "num_input_tokens_seen": 147693435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.79296875, + "step": 6877, + "time_per_iteration": 2.4231626987457275 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.01795483, + "balance_loss_mlp": 1.04000914, + "epoch": 0.4135277318502931, + "flos": 22492002516480.0, + "grad_norm": 2.013643508242888, + "language_loss": 0.76686853, + "learning_rate": 2.6454522039178668e-06, + "loss": 0.78831995, + "num_input_tokens_seen": 147714000, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6878, + "time_per_iteration": 2.492220640182495 + }, + { + "auxiliary_loss_clip": 0.01114835, + "auxiliary_loss_mlp": 0.01039959, + "balance_loss_clip": 1.02589822, + "balance_loss_mlp": 1.040084, + "epoch": 0.4135878551029611, + "flos": 22418744728320.0, + "grad_norm": 1.8674435899066546, + "language_loss": 0.80248523, + "learning_rate": 2.6450835691608154e-06, + "loss": 0.82403314, + "num_input_tokens_seen": 147731010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 6879, + "time_per_iteration": 2.458623170852661 + }, + { + "auxiliary_loss_clip": 0.01114903, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02160931, + "balance_loss_mlp": 1.03960526, + "epoch": 0.41364797835562905, + "flos": 27053232094080.0, + "grad_norm": 1.9200458523415633, + "language_loss": 0.84693611, + "learning_rate": 2.6447149099421315e-06, + "loss": 0.86844546, + "num_input_tokens_seen": 147750880, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75390625, + "step": 6880, + "time_per_iteration": 2.605189323425293 + }, + { + "auxiliary_loss_clip": 0.01116516, + "auxiliary_loss_mlp": 0.01028456, + "balance_loss_clip": 1.01478863, + "balance_loss_mlp": 1.04023683, + "epoch": 0.413708101608297, + "flos": 22967603521920.0, + "grad_norm": 1.672120688006926, + "language_loss": 0.70195448, + "learning_rate": 2.6443462262757927e-06, + "loss": 0.72340417, + "num_input_tokens_seen": 147771360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 6881, + "time_per_iteration": 2.4585211277008057 + }, + { + "auxiliary_loss_clip": 0.01113486, + "auxiliary_loss_mlp": 0.01037109, + "balance_loss_clip": 1.02450848, + "balance_loss_mlp": 1.04145753, + "epoch": 0.413768224860965, + "flos": 13333991875200.0, + "grad_norm": 1.702675342664879, + "language_loss": 0.81404376, + "learning_rate": 2.6439775181757805e-06, + "loss": 0.83554971, + "num_input_tokens_seen": 147787440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 6882, + "time_per_iteration": 2.451544761657715 + }, + { + "auxiliary_loss_clip": 0.01121461, + "auxiliary_loss_mlp": 0.01047549, + "balance_loss_clip": 1.0311873, + "balance_loss_mlp": 1.04304028, + "epoch": 0.41382834811363295, + "flos": 20813968800000.0, + "grad_norm": 1.9410860498070561, + "language_loss": 0.69296026, + "learning_rate": 2.643608785656077e-06, + "loss": 0.71465033, + "num_input_tokens_seen": 147805720, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.78515625, + "step": 6883, + "time_per_iteration": 2.4320569038391113 + }, + { + "auxiliary_loss_clip": 0.01115479, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02175713, + "balance_loss_mlp": 1.04087615, + "epoch": 0.4138884713663009, + "flos": 20667130001280.0, + "grad_norm": 1.7677749997866015, + "language_loss": 0.75449616, + "learning_rate": 2.643240028730663e-06, + "loss": 0.77600354, + "num_input_tokens_seen": 147824605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6884, + "time_per_iteration": 2.4846954345703125 + }, + { + "auxiliary_loss_clip": 0.01116957, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02394891, + "balance_loss_mlp": 1.04013455, + "epoch": 0.4139485946189689, + "flos": 29056616225280.0, + "grad_norm": 1.3782226444678463, + "language_loss": 0.75763476, + "learning_rate": 2.642871247413523e-06, + "loss": 0.7791822, + "num_input_tokens_seen": 147845445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6885, + "time_per_iteration": 2.513087511062622 + }, + { + "auxiliary_loss_clip": 0.01117144, + "auxiliary_loss_mlp": 0.01038796, + "balance_loss_clip": 1.0245266, + "balance_loss_mlp": 1.0402348, + "epoch": 0.41400871787163684, + "flos": 24425720219520.0, + "grad_norm": 1.8637223642679819, + "language_loss": 0.69820571, + "learning_rate": 2.6425024417186414e-06, + "loss": 0.71976513, + "num_input_tokens_seen": 147865580, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 6886, + "time_per_iteration": 2.49245285987854 + }, + { + "auxiliary_loss_clip": 0.01118338, + "auxiliary_loss_mlp": 0.0103733, + "balance_loss_clip": 1.02326965, + "balance_loss_mlp": 1.04143095, + "epoch": 0.4140688411243048, + "flos": 19464050845440.0, + "grad_norm": 1.5567308495418615, + "language_loss": 0.7542249, + "learning_rate": 2.642133611660002e-06, + "loss": 0.77578151, + "num_input_tokens_seen": 147885230, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 6887, + "time_per_iteration": 6.723928451538086 + }, + { + "auxiliary_loss_clip": 0.01114585, + "auxiliary_loss_mlp": 0.01031306, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.03900433, + "epoch": 0.4141289643769728, + "flos": 19313656600320.0, + "grad_norm": 1.8847126889252832, + "language_loss": 0.69881892, + "learning_rate": 2.641764757251592e-06, + "loss": 0.72027779, + "num_input_tokens_seen": 147903035, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 6888, + "time_per_iteration": 3.9012765884399414 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02070749, + "balance_loss_mlp": 1.03661156, + "epoch": 0.41418908762964074, + "flos": 16726903683840.0, + "grad_norm": 1.8064637161795956, + "language_loss": 0.75730169, + "learning_rate": 2.6413958785073976e-06, + "loss": 0.7787562, + "num_input_tokens_seen": 147918745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 6889, + "time_per_iteration": 2.4043526649475098 + }, + { + "auxiliary_loss_clip": 0.01115863, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.020468, + "balance_loss_mlp": 1.04220176, + "epoch": 0.41424921088230876, + "flos": 25296840858240.0, + "grad_norm": 1.5362774650785178, + "language_loss": 0.80159467, + "learning_rate": 2.6410269754414074e-06, + "loss": 0.82309097, + "num_input_tokens_seen": 147938265, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 6890, + "time_per_iteration": 2.515199661254883 + }, + { + "auxiliary_loss_clip": 0.01113118, + "auxiliary_loss_mlp": 0.01042194, + "balance_loss_clip": 1.02752495, + "balance_loss_mlp": 1.04047, + "epoch": 0.4143093341349767, + "flos": 20960520289920.0, + "grad_norm": 1.56935265602887, + "language_loss": 0.74256909, + "learning_rate": 2.6406580480676113e-06, + "loss": 0.76412225, + "num_input_tokens_seen": 147957320, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6891, + "time_per_iteration": 2.4265213012695312 + }, + { + "auxiliary_loss_clip": 0.01120303, + "auxiliary_loss_mlp": 0.01037383, + "balance_loss_clip": 1.02144444, + "balance_loss_mlp": 1.04260397, + "epoch": 0.4143694573876447, + "flos": 22017694400640.0, + "grad_norm": 1.5959140747346865, + "language_loss": 0.84173661, + "learning_rate": 2.6402890963999963e-06, + "loss": 0.86331344, + "num_input_tokens_seen": 147977045, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.77734375, + "step": 6892, + "time_per_iteration": 2.4921038150787354 + }, + { + "auxiliary_loss_clip": 0.01116229, + "auxiliary_loss_mlp": 0.01035744, + "balance_loss_clip": 1.02204704, + "balance_loss_mlp": 1.04263163, + "epoch": 0.41442958064031266, + "flos": 35697396723840.0, + "grad_norm": 1.6122583846612435, + "language_loss": 0.70197237, + "learning_rate": 2.6399201204525554e-06, + "loss": 0.72349209, + "num_input_tokens_seen": 147996905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 6893, + "time_per_iteration": 2.548509359359741 + }, + { + "auxiliary_loss_clip": 0.01115822, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01573586, + "balance_loss_mlp": 1.04117119, + "epoch": 0.4144897038929806, + "flos": 28293766156800.0, + "grad_norm": 1.3754181360448814, + "language_loss": 0.72850323, + "learning_rate": 2.639551120239279e-06, + "loss": 0.74995577, + "num_input_tokens_seen": 148017875, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6894, + "time_per_iteration": 2.521559715270996 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.01859689, + "balance_loss_mlp": 1.04199624, + "epoch": 0.4145498271456486, + "flos": 11648093080320.0, + "grad_norm": 2.672622146105704, + "language_loss": 0.6200121, + "learning_rate": 2.63918209577416e-06, + "loss": 0.64152598, + "num_input_tokens_seen": 148032300, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6895, + "time_per_iteration": 2.3899357318878174 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01034538, + "balance_loss_clip": 1.02091241, + "balance_loss_mlp": 1.03973091, + "epoch": 0.41460995039831655, + "flos": 27235622378880.0, + "grad_norm": 1.6922649240649819, + "language_loss": 0.70685059, + "learning_rate": 2.638813047071192e-06, + "loss": 0.72832596, + "num_input_tokens_seen": 148053260, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 6896, + "time_per_iteration": 2.5296781063079834 + }, + { + "auxiliary_loss_clip": 0.01115349, + "auxiliary_loss_mlp": 0.01040374, + "balance_loss_clip": 1.02541351, + "balance_loss_mlp": 1.03898549, + "epoch": 0.4146700736509845, + "flos": 25922369232000.0, + "grad_norm": 1.6224007586570597, + "language_loss": 0.72848749, + "learning_rate": 2.6384439741443696e-06, + "loss": 0.7500447, + "num_input_tokens_seen": 148072965, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.765625, + "step": 6897, + "time_per_iteration": 2.481219530105591 + }, + { + "auxiliary_loss_clip": 0.01115287, + "auxiliary_loss_mlp": 0.01043208, + "balance_loss_clip": 1.02870619, + "balance_loss_mlp": 1.04093742, + "epoch": 0.4147301969036525, + "flos": 26833243248000.0, + "grad_norm": 4.403783878749548, + "language_loss": 0.84646589, + "learning_rate": 2.6380748770076873e-06, + "loss": 0.86805081, + "num_input_tokens_seen": 148093240, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 6898, + "time_per_iteration": 2.5150201320648193 + }, + { + "auxiliary_loss_clip": 0.01112871, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.01719725, + "balance_loss_mlp": 1.03681874, + "epoch": 0.41479032015632045, + "flos": 20298291194880.0, + "grad_norm": 1.644475487803214, + "language_loss": 0.74555075, + "learning_rate": 2.6377057556751416e-06, + "loss": 0.76699305, + "num_input_tokens_seen": 148110925, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 6899, + "time_per_iteration": 2.4348104000091553 + }, + { + "auxiliary_loss_clip": 0.0112093, + "auxiliary_loss_mlp": 0.01037394, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4148504434089884, + "flos": 25264988472960.0, + "grad_norm": 1.717830619902866, + "language_loss": 0.75609112, + "learning_rate": 2.6373366101607306e-06, + "loss": 0.77767438, + "num_input_tokens_seen": 148130670, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.8046875, + "step": 6900, + "time_per_iteration": 2.5260136127471924 + }, + { + "auxiliary_loss_clip": 0.01116235, + "auxiliary_loss_mlp": 0.01040453, + "balance_loss_clip": 1.02496767, + "balance_loss_mlp": 1.04113388, + "epoch": 0.4149105666616564, + "flos": 12822300679680.0, + "grad_norm": 2.5866137476185087, + "language_loss": 0.80409849, + "learning_rate": 2.6369674404784503e-06, + "loss": 0.82566535, + "num_input_tokens_seen": 148148350, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.75, + "step": 6901, + "time_per_iteration": 2.4218883514404297 + }, + { + "auxiliary_loss_clip": 0.01114876, + "auxiliary_loss_mlp": 0.01035504, + "balance_loss_clip": 1.02178299, + "balance_loss_mlp": 1.03989518, + "epoch": 0.41497068991432434, + "flos": 16763891713920.0, + "grad_norm": 1.8085429941764752, + "language_loss": 0.69120753, + "learning_rate": 2.6365982466423014e-06, + "loss": 0.71271133, + "num_input_tokens_seen": 148167550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 6902, + "time_per_iteration": 2.525836944580078 + }, + { + "auxiliary_loss_clip": 0.0111323, + "auxiliary_loss_mlp": 0.01037724, + "balance_loss_clip": 1.02421129, + "balance_loss_mlp": 1.04042315, + "epoch": 0.4150308131669923, + "flos": 18000906243840.0, + "grad_norm": 2.1056004636318817, + "language_loss": 0.83287692, + "learning_rate": 2.6362290286662834e-06, + "loss": 0.85438645, + "num_input_tokens_seen": 148184740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 6903, + "time_per_iteration": 2.402722120285034 + }, + { + "auxiliary_loss_clip": 0.01120503, + "auxiliary_loss_mlp": 0.01038275, + "balance_loss_clip": 1.02232492, + "balance_loss_mlp": 1.0413456, + "epoch": 0.41509093641966033, + "flos": 30044770352640.0, + "grad_norm": 1.8768082111891207, + "language_loss": 0.67704409, + "learning_rate": 2.6358597865643968e-06, + "loss": 0.69863188, + "num_input_tokens_seen": 148204605, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.79296875, + "step": 6904, + "time_per_iteration": 2.5442733764648438 + }, + { + "auxiliary_loss_clip": 0.01119512, + "auxiliary_loss_mlp": 0.01035175, + "balance_loss_clip": 1.02082872, + "balance_loss_mlp": 1.04166162, + "epoch": 0.4151510596723283, + "flos": 24279994742400.0, + "grad_norm": 1.5140892492412166, + "language_loss": 0.77502626, + "learning_rate": 2.635490520350643e-06, + "loss": 0.79657316, + "num_input_tokens_seen": 148224675, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 6905, + "time_per_iteration": 2.471850633621216 + }, + { + "auxiliary_loss_clip": 0.01119242, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.0168426, + "balance_loss_mlp": 1.04261923, + "epoch": 0.41521118292499626, + "flos": 23476206147840.0, + "grad_norm": 2.8616602480779427, + "language_loss": 0.68461335, + "learning_rate": 2.635121230039025e-06, + "loss": 0.70611238, + "num_input_tokens_seen": 148243375, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6906, + "time_per_iteration": 2.501025676727295 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01034311, + "balance_loss_clip": 1.02097726, + "balance_loss_mlp": 1.041152, + "epoch": 0.4152713061776642, + "flos": 22125498094080.0, + "grad_norm": 3.9013632738704347, + "language_loss": 0.67466414, + "learning_rate": 2.6347519156435467e-06, + "loss": 0.69615829, + "num_input_tokens_seen": 148261140, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 6907, + "time_per_iteration": 2.467179298400879 + }, + { + "auxiliary_loss_clip": 0.01118262, + "auxiliary_loss_mlp": 0.01034081, + "balance_loss_clip": 1.02107513, + "balance_loss_mlp": 1.04266894, + "epoch": 0.4153314294303322, + "flos": 21251396626560.0, + "grad_norm": 1.8641722195673653, + "language_loss": 0.77219629, + "learning_rate": 2.6343825771782123e-06, + "loss": 0.79371971, + "num_input_tokens_seen": 148279655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 6908, + "time_per_iteration": 2.5124471187591553 + }, + { + "auxiliary_loss_clip": 0.01043525, + "auxiliary_loss_mlp": 0.01011962, + "balance_loss_clip": 1.01046562, + "balance_loss_mlp": 1.01946032, + "epoch": 0.41539155268300015, + "flos": 57920681594880.0, + "grad_norm": 0.7844742119516283, + "language_loss": 0.64862758, + "learning_rate": 2.634013214657026e-06, + "loss": 0.66918248, + "num_input_tokens_seen": 148339005, + "router_z_loss_clip": 0.01495361, + "router_z_loss_mlp": 0.24023438, + "step": 6909, + "time_per_iteration": 3.0118794441223145 + }, + { + "auxiliary_loss_clip": 0.01116053, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02441156, + "balance_loss_mlp": 1.04182351, + "epoch": 0.4154516759356681, + "flos": 21903677654400.0, + "grad_norm": 1.432390678759805, + "language_loss": 0.87292743, + "learning_rate": 2.633643828093996e-06, + "loss": 0.8944639, + "num_input_tokens_seen": 148358715, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 6910, + "time_per_iteration": 2.4972214698791504 + }, + { + "auxiliary_loss_clip": 0.01041579, + "auxiliary_loss_mlp": 0.01001773, + "balance_loss_clip": 1.00033653, + "balance_loss_mlp": 1.01748466, + "epoch": 0.4155117991883361, + "flos": 67833677226240.0, + "grad_norm": 0.808989444092677, + "language_loss": 0.6214478, + "learning_rate": 2.633274417503128e-06, + "loss": 0.64188129, + "num_input_tokens_seen": 148417280, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24023438, + "step": 6911, + "time_per_iteration": 3.040469169616699 + }, + { + "auxiliary_loss_clip": 0.01126363, + "auxiliary_loss_mlp": 0.01038312, + "balance_loss_clip": 1.02386987, + "balance_loss_mlp": 1.04570675, + "epoch": 0.41557192244100405, + "flos": 14282679934080.0, + "grad_norm": 2.7143139070983313, + "language_loss": 0.87920213, + "learning_rate": 2.6329049828984312e-06, + "loss": 0.90084887, + "num_input_tokens_seen": 148432610, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.8046875, + "step": 6912, + "time_per_iteration": 2.449566602706909 + }, + { + "auxiliary_loss_clip": 0.01119018, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01842034, + "balance_loss_mlp": 1.04461241, + "epoch": 0.415632045693672, + "flos": 24461954064000.0, + "grad_norm": 3.208266477782979, + "language_loss": 0.62984204, + "learning_rate": 2.632535524293914e-06, + "loss": 0.65134311, + "num_input_tokens_seen": 148451510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 6913, + "time_per_iteration": 2.4690184593200684 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.01793909, + "balance_loss_mlp": 1.04389513, + "epoch": 0.41569216894634, + "flos": 20115290378880.0, + "grad_norm": 1.933222600231973, + "language_loss": 0.75131822, + "learning_rate": 2.632166041703586e-06, + "loss": 0.77279633, + "num_input_tokens_seen": 148469945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 6914, + "time_per_iteration": 2.483322858810425 + }, + { + "auxiliary_loss_clip": 0.01118579, + "auxiliary_loss_mlp": 0.01035638, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04198337, + "epoch": 0.41575229219900794, + "flos": 23798827128960.0, + "grad_norm": 1.8027192281548683, + "language_loss": 0.87621439, + "learning_rate": 2.631796535141458e-06, + "loss": 0.89775658, + "num_input_tokens_seen": 148486655, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6915, + "time_per_iteration": 2.448347806930542 + }, + { + "auxiliary_loss_clip": 0.01120782, + "auxiliary_loss_mlp": 0.01038445, + "balance_loss_clip": 1.02461123, + "balance_loss_mlp": 1.0447371, + "epoch": 0.4158124154516759, + "flos": 23108229267840.0, + "grad_norm": 2.7843871284315007, + "language_loss": 0.71427178, + "learning_rate": 2.6314270046215426e-06, + "loss": 0.7358641, + "num_input_tokens_seen": 148505035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6916, + "time_per_iteration": 2.490709066390991 + }, + { + "auxiliary_loss_clip": 0.01124406, + "auxiliary_loss_mlp": 0.01032755, + "balance_loss_clip": 1.018736, + "balance_loss_mlp": 1.04548466, + "epoch": 0.41587253870434393, + "flos": 24242970798720.0, + "grad_norm": 1.511699121237688, + "language_loss": 0.71604288, + "learning_rate": 2.631057450157852e-06, + "loss": 0.73761451, + "num_input_tokens_seen": 148525575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 6917, + "time_per_iteration": 2.471165895462036 + }, + { + "auxiliary_loss_clip": 0.01118269, + "auxiliary_loss_mlp": 0.01033966, + "balance_loss_clip": 1.0205791, + "balance_loss_mlp": 1.04267478, + "epoch": 0.4159326619570119, + "flos": 23881602021120.0, + "grad_norm": 1.6845020116344738, + "language_loss": 0.80811357, + "learning_rate": 2.6306878717643988e-06, + "loss": 0.82963598, + "num_input_tokens_seen": 148547270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 6918, + "time_per_iteration": 2.526092767715454 + }, + { + "auxiliary_loss_clip": 0.01123257, + "auxiliary_loss_mlp": 0.01037246, + "balance_loss_clip": 1.02276754, + "balance_loss_mlp": 1.04565763, + "epoch": 0.41599278520967986, + "flos": 40626531354240.0, + "grad_norm": 1.4136427424617275, + "language_loss": 0.70455492, + "learning_rate": 2.6303182694551995e-06, + "loss": 0.72615993, + "num_input_tokens_seen": 148572100, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 6919, + "time_per_iteration": 2.6142234802246094 + }, + { + "auxiliary_loss_clip": 0.01122602, + "auxiliary_loss_mlp": 0.0103457, + "balance_loss_clip": 1.02063489, + "balance_loss_mlp": 1.04595828, + "epoch": 0.4160529084623478, + "flos": 18222942165120.0, + "grad_norm": 3.306135174045704, + "language_loss": 0.80995989, + "learning_rate": 2.6299486432442677e-06, + "loss": 0.83153164, + "num_input_tokens_seen": 148591245, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 6920, + "time_per_iteration": 2.4816763401031494 + }, + { + "auxiliary_loss_clip": 0.01123811, + "auxiliary_loss_mlp": 0.01037947, + "balance_loss_clip": 1.02265263, + "balance_loss_mlp": 1.04559636, + "epoch": 0.4161130317150158, + "flos": 13661963982720.0, + "grad_norm": 1.8850349699187139, + "language_loss": 0.66103178, + "learning_rate": 2.6295789931456195e-06, + "loss": 0.68264937, + "num_input_tokens_seen": 148607980, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 6921, + "time_per_iteration": 2.4444103240966797 + }, + { + "auxiliary_loss_clip": 0.01122422, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02613473, + "balance_loss_mlp": 1.04591656, + "epoch": 0.41617315496768376, + "flos": 16178511767040.0, + "grad_norm": 2.004797667242706, + "language_loss": 0.80354667, + "learning_rate": 2.629209319173274e-06, + "loss": 0.82517087, + "num_input_tokens_seen": 148624490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 6922, + "time_per_iteration": 2.4668424129486084 + }, + { + "auxiliary_loss_clip": 0.01124248, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.01878977, + "balance_loss_mlp": 1.04562068, + "epoch": 0.4162332782203517, + "flos": 26213317395840.0, + "grad_norm": 1.7750243686484017, + "language_loss": 0.67461836, + "learning_rate": 2.628839621341247e-06, + "loss": 0.69618553, + "num_input_tokens_seen": 148646490, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7890625, + "step": 6923, + "time_per_iteration": 2.500643014907837 + }, + { + "auxiliary_loss_clip": 0.0112335, + "auxiliary_loss_mlp": 0.01043172, + "balance_loss_clip": 1.02822304, + "balance_loss_mlp": 1.04540539, + "epoch": 0.4162934014730197, + "flos": 28183987215360.0, + "grad_norm": 1.7543246434734396, + "language_loss": 0.75878662, + "learning_rate": 2.6284698996635593e-06, + "loss": 0.78045189, + "num_input_tokens_seen": 148668580, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.77734375, + "step": 6924, + "time_per_iteration": 2.5196292400360107 + }, + { + "auxiliary_loss_clip": 0.01120451, + "auxiliary_loss_mlp": 0.01037748, + "balance_loss_clip": 1.02382445, + "balance_loss_mlp": 1.04238617, + "epoch": 0.41635352472568765, + "flos": 19865316654720.0, + "grad_norm": 1.7266126934206025, + "language_loss": 0.72481495, + "learning_rate": 2.62810015415423e-06, + "loss": 0.74639702, + "num_input_tokens_seen": 148688410, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78125, + "step": 6925, + "time_per_iteration": 2.4335598945617676 + }, + { + "auxiliary_loss_clip": 0.01118059, + "auxiliary_loss_mlp": 0.01034152, + "balance_loss_clip": 1.02069342, + "balance_loss_mlp": 1.0413928, + "epoch": 0.4164136479783556, + "flos": 14935356011520.0, + "grad_norm": 1.8465053152696829, + "language_loss": 0.83475816, + "learning_rate": 2.6277303848272792e-06, + "loss": 0.85628033, + "num_input_tokens_seen": 148704855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 6926, + "time_per_iteration": 2.5088613033294678 + }, + { + "auxiliary_loss_clip": 0.01115859, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02305889, + "balance_loss_mlp": 1.04325294, + "epoch": 0.4164737712310236, + "flos": 21757593041280.0, + "grad_norm": 1.6423809052501923, + "language_loss": 0.86620545, + "learning_rate": 2.6273605916967302e-06, + "loss": 0.88771755, + "num_input_tokens_seen": 148723065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 6927, + "time_per_iteration": 2.534503936767578 + }, + { + "auxiliary_loss_clip": 0.01118504, + "auxiliary_loss_mlp": 0.0104184, + "balance_loss_clip": 1.0272553, + "balance_loss_mlp": 1.04246414, + "epoch": 0.41653389448369155, + "flos": 20740136394240.0, + "grad_norm": 1.9802013979545179, + "language_loss": 0.72300684, + "learning_rate": 2.626990774776604e-06, + "loss": 0.74461025, + "num_input_tokens_seen": 148741780, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 6928, + "time_per_iteration": 3.88004732131958 + }, + { + "auxiliary_loss_clip": 0.01116658, + "auxiliary_loss_mlp": 0.01039078, + "balance_loss_clip": 1.02459407, + "balance_loss_mlp": 1.04092073, + "epoch": 0.4165940177363595, + "flos": 24972891073920.0, + "grad_norm": 1.862862690513255, + "language_loss": 0.78142846, + "learning_rate": 2.6266209340809254e-06, + "loss": 0.80298579, + "num_input_tokens_seen": 148759795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 6929, + "time_per_iteration": 5.323524713516235 + }, + { + "auxiliary_loss_clip": 0.01119115, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.02201128, + "balance_loss_mlp": 1.0432961, + "epoch": 0.41665414098902753, + "flos": 20521727746560.0, + "grad_norm": 1.7470362991732848, + "language_loss": 0.71024638, + "learning_rate": 2.6262510696237182e-06, + "loss": 0.73178667, + "num_input_tokens_seen": 148778680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 6930, + "time_per_iteration": 2.4636495113372803 + }, + { + "auxiliary_loss_clip": 0.01116513, + "auxiliary_loss_mlp": 0.01035142, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.04026747, + "epoch": 0.4167142642416955, + "flos": 19682926369920.0, + "grad_norm": 1.7271533589437842, + "language_loss": 0.80665648, + "learning_rate": 2.625881181419007e-06, + "loss": 0.82817304, + "num_input_tokens_seen": 148796470, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 6931, + "time_per_iteration": 2.4350993633270264 + }, + { + "auxiliary_loss_clip": 0.01115154, + "auxiliary_loss_mlp": 0.01038095, + "balance_loss_clip": 1.02392721, + "balance_loss_mlp": 1.04003608, + "epoch": 0.41677438749436346, + "flos": 23763742519680.0, + "grad_norm": 1.8450466812598405, + "language_loss": 0.79109526, + "learning_rate": 2.6255112694808193e-06, + "loss": 0.81262779, + "num_input_tokens_seen": 148815300, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 6932, + "time_per_iteration": 2.499152660369873 + }, + { + "auxiliary_loss_clip": 0.01117704, + "auxiliary_loss_mlp": 0.01039084, + "balance_loss_clip": 1.02421236, + "balance_loss_mlp": 1.04105997, + "epoch": 0.41683451074703143, + "flos": 30410053712640.0, + "grad_norm": 2.265953381144445, + "language_loss": 0.81735384, + "learning_rate": 2.6251413338231813e-06, + "loss": 0.83892173, + "num_input_tokens_seen": 148834315, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.765625, + "step": 6933, + "time_per_iteration": 2.5096874237060547 + }, + { + "auxiliary_loss_clip": 0.01119747, + "auxiliary_loss_mlp": 0.01037299, + "balance_loss_clip": 1.02184963, + "balance_loss_mlp": 1.04056907, + "epoch": 0.4168946339996994, + "flos": 21506757390720.0, + "grad_norm": 2.1923639109766144, + "language_loss": 0.76769817, + "learning_rate": 2.624771374460121e-06, + "loss": 0.78926861, + "num_input_tokens_seen": 148852420, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 6934, + "time_per_iteration": 2.4590814113616943 + }, + { + "auxiliary_loss_clip": 0.01120428, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02003551, + "balance_loss_mlp": 1.04396558, + "epoch": 0.41695475725236736, + "flos": 17638675539840.0, + "grad_norm": 1.774753965654226, + "language_loss": 0.67036676, + "learning_rate": 2.624401391405668e-06, + "loss": 0.69191271, + "num_input_tokens_seen": 148869305, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6935, + "time_per_iteration": 2.4111990928649902 + }, + { + "auxiliary_loss_clip": 0.01120243, + "auxiliary_loss_mlp": 0.01040903, + "balance_loss_clip": 1.0266757, + "balance_loss_mlp": 1.04329324, + "epoch": 0.4170148805050353, + "flos": 15668903560320.0, + "grad_norm": 2.7357101171275504, + "language_loss": 0.73245633, + "learning_rate": 2.6240313846738513e-06, + "loss": 0.75406778, + "num_input_tokens_seen": 148886395, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 6936, + "time_per_iteration": 2.452911376953125 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01034796, + "balance_loss_clip": 1.02102733, + "balance_loss_mlp": 1.0418582, + "epoch": 0.4170750037577033, + "flos": 15159151699200.0, + "grad_norm": 1.8471548990860345, + "language_loss": 0.73746514, + "learning_rate": 2.6236613542787024e-06, + "loss": 0.75898361, + "num_input_tokens_seen": 148905235, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 6937, + "time_per_iteration": 2.426177978515625 + }, + { + "auxiliary_loss_clip": 0.01116111, + "auxiliary_loss_mlp": 0.01035449, + "balance_loss_clip": 1.02194881, + "balance_loss_mlp": 1.04150152, + "epoch": 0.41713512701037125, + "flos": 28768289754240.0, + "grad_norm": 1.512143650526939, + "language_loss": 0.8406328, + "learning_rate": 2.6232913002342518e-06, + "loss": 0.8621484, + "num_input_tokens_seen": 148928130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 6938, + "time_per_iteration": 2.543088436126709 + }, + { + "auxiliary_loss_clip": 0.0112279, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.01959753, + "balance_loss_mlp": 1.04346168, + "epoch": 0.4171952502630392, + "flos": 28256993608320.0, + "grad_norm": 2.0225615339435183, + "language_loss": 0.74319148, + "learning_rate": 2.6229212225545334e-06, + "loss": 0.76476645, + "num_input_tokens_seen": 148948790, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.79296875, + "step": 6939, + "time_per_iteration": 2.5119175910949707 + }, + { + "auxiliary_loss_clip": 0.01120397, + "auxiliary_loss_mlp": 0.01033285, + "balance_loss_clip": 1.01864552, + "balance_loss_mlp": 1.04396725, + "epoch": 0.4172553735157072, + "flos": 24571697091840.0, + "grad_norm": 1.7048101001333908, + "language_loss": 0.7502594, + "learning_rate": 2.622551121253579e-06, + "loss": 0.77179623, + "num_input_tokens_seen": 148967690, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6940, + "time_per_iteration": 2.505476474761963 + }, + { + "auxiliary_loss_clip": 0.01118418, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02621651, + "balance_loss_mlp": 1.04277742, + "epoch": 0.41731549676837515, + "flos": 27045797978880.0, + "grad_norm": 1.6601557953990327, + "language_loss": 0.71575844, + "learning_rate": 2.622180996345424e-06, + "loss": 0.73733509, + "num_input_tokens_seen": 148987150, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 6941, + "time_per_iteration": 2.4826831817626953 + }, + { + "auxiliary_loss_clip": 0.01120873, + "auxiliary_loss_mlp": 0.0103738, + "balance_loss_clip": 1.02307487, + "balance_loss_mlp": 1.04215777, + "epoch": 0.4173756200210431, + "flos": 28394063907840.0, + "grad_norm": 1.8824806717934597, + "language_loss": 0.73884863, + "learning_rate": 2.621810847844104e-06, + "loss": 0.76043111, + "num_input_tokens_seen": 149004895, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 6942, + "time_per_iteration": 2.510179281234741 + }, + { + "auxiliary_loss_clip": 0.01124355, + "auxiliary_loss_mlp": 0.01037141, + "balance_loss_clip": 1.02190626, + "balance_loss_mlp": 1.04450595, + "epoch": 0.41743574327371114, + "flos": 22521556431360.0, + "grad_norm": 2.1000096782313644, + "language_loss": 0.72619486, + "learning_rate": 2.6214406757636534e-06, + "loss": 0.74780977, + "num_input_tokens_seen": 149020970, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 6943, + "time_per_iteration": 2.437713861465454 + }, + { + "auxiliary_loss_clip": 0.01121913, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.01844811, + "balance_loss_mlp": 1.04391849, + "epoch": 0.4174958665263791, + "flos": 30113431200000.0, + "grad_norm": 1.5914405962225948, + "language_loss": 0.63451827, + "learning_rate": 2.621070480118111e-06, + "loss": 0.6560756, + "num_input_tokens_seen": 149041795, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.78125, + "step": 6944, + "time_per_iteration": 2.5866405963897705 + }, + { + "auxiliary_loss_clip": 0.01118766, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01747799, + "balance_loss_mlp": 1.04272938, + "epoch": 0.41755598977904707, + "flos": 25263444188160.0, + "grad_norm": 1.6963739292171327, + "language_loss": 0.7014094, + "learning_rate": 2.620700260921513e-06, + "loss": 0.72291017, + "num_input_tokens_seen": 149063700, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6945, + "time_per_iteration": 2.4984183311462402 + }, + { + "auxiliary_loss_clip": 0.01116559, + "auxiliary_loss_mlp": 0.01041419, + "balance_loss_clip": 1.02556372, + "balance_loss_mlp": 1.04024088, + "epoch": 0.41761611303171503, + "flos": 19828580019840.0, + "grad_norm": 1.623733928455925, + "language_loss": 0.80850792, + "learning_rate": 2.620330018187899e-06, + "loss": 0.83008766, + "num_input_tokens_seen": 149082410, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.76171875, + "step": 6946, + "time_per_iteration": 2.5301356315612793 + }, + { + "auxiliary_loss_clip": 0.01118432, + "auxiliary_loss_mlp": 0.01036458, + "balance_loss_clip": 1.02281451, + "balance_loss_mlp": 1.04321134, + "epoch": 0.417676236284383, + "flos": 15523249910400.0, + "grad_norm": 2.2176705837507784, + "language_loss": 0.77525783, + "learning_rate": 2.6199597519313086e-06, + "loss": 0.79680669, + "num_input_tokens_seen": 149098745, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 6947, + "time_per_iteration": 2.432767391204834 + }, + { + "auxiliary_loss_clip": 0.01119017, + "auxiliary_loss_mlp": 0.01035109, + "balance_loss_clip": 1.0204227, + "balance_loss_mlp": 1.04268038, + "epoch": 0.41773635953705096, + "flos": 32524473761280.0, + "grad_norm": 2.207686964264854, + "language_loss": 0.71242738, + "learning_rate": 2.6195894621657825e-06, + "loss": 0.73396862, + "num_input_tokens_seen": 149122255, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76171875, + "step": 6948, + "time_per_iteration": 2.565560817718506 + }, + { + "auxiliary_loss_clip": 0.01112399, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.03894424, + "epoch": 0.4177964827897189, + "flos": 23440941970560.0, + "grad_norm": 1.5189916920378803, + "language_loss": 0.77142775, + "learning_rate": 2.619219148905362e-06, + "loss": 0.7928437, + "num_input_tokens_seen": 149142845, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 6949, + "time_per_iteration": 2.459484338760376 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036015, + "balance_loss_clip": 1.02156091, + "balance_loss_mlp": 1.04367769, + "epoch": 0.4178566060423869, + "flos": 22748907565440.0, + "grad_norm": 1.5094834159772865, + "language_loss": 0.81985492, + "learning_rate": 2.6188488121640888e-06, + "loss": 0.84143925, + "num_input_tokens_seen": 149163375, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7890625, + "step": 6950, + "time_per_iteration": 2.5348877906799316 + }, + { + "auxiliary_loss_clip": 0.01113505, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01804328, + "balance_loss_mlp": 1.04157758, + "epoch": 0.41791672929505486, + "flos": 26032794618240.0, + "grad_norm": 1.3221945547908684, + "language_loss": 0.76189649, + "learning_rate": 2.618478451956007e-06, + "loss": 0.78333664, + "num_input_tokens_seen": 149185610, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 6951, + "time_per_iteration": 2.5055410861968994 + }, + { + "auxiliary_loss_clip": 0.01121988, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.01894784, + "balance_loss_mlp": 1.04247046, + "epoch": 0.4179768525477228, + "flos": 19568694142080.0, + "grad_norm": 1.7645474682355455, + "language_loss": 0.72922826, + "learning_rate": 2.61810806829516e-06, + "loss": 0.75078857, + "num_input_tokens_seen": 149203990, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.796875, + "step": 6952, + "time_per_iteration": 2.499979019165039 + }, + { + "auxiliary_loss_clip": 0.01117763, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.01826596, + "balance_loss_mlp": 1.04266691, + "epoch": 0.4180369758003908, + "flos": 17783826399360.0, + "grad_norm": 3.0061867681934795, + "language_loss": 0.7182008, + "learning_rate": 2.617737661195593e-06, + "loss": 0.73970026, + "num_input_tokens_seen": 149221385, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 6953, + "time_per_iteration": 2.4045305252075195 + }, + { + "auxiliary_loss_clip": 0.01116286, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02106667, + "balance_loss_mlp": 1.04293513, + "epoch": 0.41809709905305875, + "flos": 20960663944320.0, + "grad_norm": 1.696123367289706, + "language_loss": 0.76163101, + "learning_rate": 2.617367230671353e-06, + "loss": 0.78315222, + "num_input_tokens_seen": 149241175, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.734375, + "step": 6954, + "time_per_iteration": 2.5208778381347656 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01037952, + "balance_loss_clip": 1.02243114, + "balance_loss_mlp": 1.0407306, + "epoch": 0.4181572223057267, + "flos": 22017622573440.0, + "grad_norm": 2.123626835554744, + "language_loss": 0.84569108, + "learning_rate": 2.616996776736485e-06, + "loss": 0.86724097, + "num_input_tokens_seen": 149259115, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.765625, + "step": 6955, + "time_per_iteration": 2.4470770359039307 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01035899, + "balance_loss_clip": 1.02206469, + "balance_loss_mlp": 1.04131222, + "epoch": 0.4182173455583947, + "flos": 26245528917120.0, + "grad_norm": 1.7424753883235222, + "language_loss": 0.83219767, + "learning_rate": 2.616626299405037e-06, + "loss": 0.85370708, + "num_input_tokens_seen": 149278705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 6956, + "time_per_iteration": 2.53238582611084 + }, + { + "auxiliary_loss_clip": 0.01120034, + "auxiliary_loss_mlp": 0.01041481, + "balance_loss_clip": 1.02661586, + "balance_loss_mlp": 1.04286742, + "epoch": 0.4182774688110627, + "flos": 14791605782400.0, + "grad_norm": 2.117667338273699, + "language_loss": 0.71621263, + "learning_rate": 2.616255798691059e-06, + "loss": 0.73782784, + "num_input_tokens_seen": 149294040, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 6957, + "time_per_iteration": 2.4127233028411865 + }, + { + "auxiliary_loss_clip": 0.01116705, + "auxiliary_loss_mlp": 0.01037238, + "balance_loss_clip": 1.02450657, + "balance_loss_mlp": 1.0416609, + "epoch": 0.41833759206373067, + "flos": 20412020632320.0, + "grad_norm": 2.020066118448717, + "language_loss": 0.75841641, + "learning_rate": 2.6158852746085982e-06, + "loss": 0.77995586, + "num_input_tokens_seen": 149310385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 6958, + "time_per_iteration": 2.621243476867676 + }, + { + "auxiliary_loss_clip": 0.01116903, + "auxiliary_loss_mlp": 0.01031018, + "balance_loss_clip": 1.01718402, + "balance_loss_mlp": 1.04121447, + "epoch": 0.41839771531639863, + "flos": 23656333875840.0, + "grad_norm": 1.5992923753241641, + "language_loss": 0.76712382, + "learning_rate": 2.6155147271717066e-06, + "loss": 0.78860307, + "num_input_tokens_seen": 149328235, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 6959, + "time_per_iteration": 2.4936535358428955 + }, + { + "auxiliary_loss_clip": 0.01117896, + "auxiliary_loss_mlp": 0.01036611, + "balance_loss_clip": 1.02191257, + "balance_loss_mlp": 1.04106176, + "epoch": 0.4184578385690666, + "flos": 19754137082880.0, + "grad_norm": 1.629552094504433, + "language_loss": 0.76652783, + "learning_rate": 2.6151441563944347e-06, + "loss": 0.78807288, + "num_input_tokens_seen": 149347465, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.765625, + "step": 6960, + "time_per_iteration": 2.513699769973755 + }, + { + "auxiliary_loss_clip": 0.01111464, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01822484, + "balance_loss_mlp": 1.04088879, + "epoch": 0.41851796182173456, + "flos": 20193396503040.0, + "grad_norm": 1.8359587043053753, + "language_loss": 0.75856298, + "learning_rate": 2.614773562290835e-06, + "loss": 0.7799859, + "num_input_tokens_seen": 149366685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 6961, + "time_per_iteration": 2.4798686504364014 + }, + { + "auxiliary_loss_clip": 0.01040549, + "auxiliary_loss_mlp": 0.010007, + "balance_loss_clip": 0.99909067, + "balance_loss_mlp": 1.01660466, + "epoch": 0.41857808507440253, + "flos": 59018794231680.0, + "grad_norm": 0.7788999280449799, + "language_loss": 0.5466665, + "learning_rate": 2.61440294487496e-06, + "loss": 0.56707895, + "num_input_tokens_seen": 149422925, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23925781, + "step": 6962, + "time_per_iteration": 3.0343000888824463 + }, + { + "auxiliary_loss_clip": 0.01119412, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02266252, + "balance_loss_mlp": 1.04263735, + "epoch": 0.4186382083270705, + "flos": 18478805719680.0, + "grad_norm": 1.8026406871934313, + "language_loss": 0.85487044, + "learning_rate": 2.614032304160864e-06, + "loss": 0.87642694, + "num_input_tokens_seen": 149440820, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 6963, + "time_per_iteration": 2.4352054595947266 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02210093, + "balance_loss_mlp": 1.04331315, + "epoch": 0.41869833157973846, + "flos": 21578758202880.0, + "grad_norm": 1.6053381131745172, + "language_loss": 0.70357138, + "learning_rate": 2.6136616401626014e-06, + "loss": 0.72510606, + "num_input_tokens_seen": 149461060, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 6964, + "time_per_iteration": 2.50482439994812 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01035614, + "balance_loss_clip": 1.02268004, + "balance_loss_mlp": 1.04087543, + "epoch": 0.4187584548324064, + "flos": 35517412650240.0, + "grad_norm": 1.8351593031507138, + "language_loss": 0.70862091, + "learning_rate": 2.6132909528942273e-06, + "loss": 0.73011076, + "num_input_tokens_seen": 149483115, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 6965, + "time_per_iteration": 2.6057491302490234 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01032504, + "balance_loss_clip": 1.02033257, + "balance_loss_mlp": 1.0413456, + "epoch": 0.4188185780850744, + "flos": 18655880791680.0, + "grad_norm": 1.4950689447506187, + "language_loss": 0.7175675, + "learning_rate": 2.6129202423697997e-06, + "loss": 0.73902673, + "num_input_tokens_seen": 149501495, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 6966, + "time_per_iteration": 2.4892048835754395 + }, + { + "auxiliary_loss_clip": 0.01120204, + "auxiliary_loss_mlp": 0.0103471, + "balance_loss_clip": 1.02016091, + "balance_loss_mlp": 1.0421617, + "epoch": 0.41887870133774235, + "flos": 40333428374400.0, + "grad_norm": 2.333720493500319, + "language_loss": 0.71266413, + "learning_rate": 2.612549508603375e-06, + "loss": 0.73421323, + "num_input_tokens_seen": 149523170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 6967, + "time_per_iteration": 2.604076862335205 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01005246, + "balance_loss_clip": 1.00366104, + "balance_loss_mlp": 1.01515508, + "epoch": 0.4189388245904103, + "flos": 61371336516480.0, + "grad_norm": 0.6722087248044618, + "language_loss": 0.46224236, + "learning_rate": 2.612178751609011e-06, + "loss": 0.48268497, + "num_input_tokens_seen": 149583955, + "router_z_loss_clip": 0.01586914, + "router_z_loss_mlp": 0.23828125, + "step": 6968, + "time_per_iteration": 3.0401268005371094 + }, + { + "auxiliary_loss_clip": 0.01117965, + "auxiliary_loss_mlp": 0.01038122, + "balance_loss_clip": 1.02345359, + "balance_loss_mlp": 1.03981948, + "epoch": 0.4189989478430783, + "flos": 28215624119040.0, + "grad_norm": 1.6180807795397785, + "language_loss": 0.74930859, + "learning_rate": 2.6118079714007685e-06, + "loss": 0.77086943, + "num_input_tokens_seen": 149604440, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 6969, + "time_per_iteration": 2.5126969814300537 + }, + { + "auxiliary_loss_clip": 0.01112428, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.0382787, + "epoch": 0.4190590710957463, + "flos": 24565879088640.0, + "grad_norm": 2.2016737043444903, + "language_loss": 0.80248457, + "learning_rate": 2.611437167992705e-06, + "loss": 0.8239547, + "num_input_tokens_seen": 149623745, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6970, + "time_per_iteration": 5.640556573867798 + }, + { + "auxiliary_loss_clip": 0.01114015, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01774538, + "balance_loss_mlp": 1.04030848, + "epoch": 0.41911919434841427, + "flos": 21726027964800.0, + "grad_norm": 1.9623449568843938, + "language_loss": 0.82789886, + "learning_rate": 2.6110663413988835e-06, + "loss": 0.84934866, + "num_input_tokens_seen": 149643025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 6971, + "time_per_iteration": 3.8554296493530273 + }, + { + "auxiliary_loss_clip": 0.01113275, + "auxiliary_loss_mlp": 0.01035215, + "balance_loss_clip": 1.02057588, + "balance_loss_mlp": 1.04049933, + "epoch": 0.41917931760108224, + "flos": 17601543855360.0, + "grad_norm": 1.6158786040890867, + "language_loss": 0.7468822, + "learning_rate": 2.6106954916333648e-06, + "loss": 0.76836711, + "num_input_tokens_seen": 149660695, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7265625, + "step": 6972, + "time_per_iteration": 2.474414587020874 + }, + { + "auxiliary_loss_clip": 0.01113414, + "auxiliary_loss_mlp": 0.01033398, + "balance_loss_clip": 1.02039838, + "balance_loss_mlp": 1.0393647, + "epoch": 0.4192394408537502, + "flos": 37816701022080.0, + "grad_norm": 1.4614195470734719, + "language_loss": 0.72808421, + "learning_rate": 2.610324618710212e-06, + "loss": 0.74955231, + "num_input_tokens_seen": 149682040, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 6973, + "time_per_iteration": 2.5945606231689453 + }, + { + "auxiliary_loss_clip": 0.0112256, + "auxiliary_loss_mlp": 0.01041918, + "balance_loss_clip": 1.02769673, + "balance_loss_mlp": 1.04242992, + "epoch": 0.41929956410641817, + "flos": 23107726477440.0, + "grad_norm": 2.1718837857164464, + "language_loss": 0.74863386, + "learning_rate": 2.609953722643489e-06, + "loss": 0.77027869, + "num_input_tokens_seen": 149700855, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.8046875, + "step": 6974, + "time_per_iteration": 2.4790663719177246 + }, + { + "auxiliary_loss_clip": 0.01112575, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01669776, + "balance_loss_mlp": 1.03879452, + "epoch": 0.41935968735908613, + "flos": 22524537260160.0, + "grad_norm": 2.8466202693933265, + "language_loss": 0.72836936, + "learning_rate": 2.609582803447259e-06, + "loss": 0.74979532, + "num_input_tokens_seen": 149717360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 6975, + "time_per_iteration": 2.4560608863830566 + }, + { + "auxiliary_loss_clip": 0.01114785, + "auxiliary_loss_mlp": 0.01033126, + "balance_loss_clip": 1.01961374, + "balance_loss_mlp": 1.04139054, + "epoch": 0.4194198106117541, + "flos": 26870446759680.0, + "grad_norm": 1.6070899494887878, + "language_loss": 0.80725533, + "learning_rate": 2.6092118611355885e-06, + "loss": 0.82873446, + "num_input_tokens_seen": 149738975, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 6976, + "time_per_iteration": 2.5148777961730957 + }, + { + "auxiliary_loss_clip": 0.01112592, + "auxiliary_loss_mlp": 0.01025549, + "balance_loss_clip": 1.0124954, + "balance_loss_mlp": 1.03755522, + "epoch": 0.41947993386442206, + "flos": 19902412425600.0, + "grad_norm": 2.297468657248195, + "language_loss": 0.67767072, + "learning_rate": 2.6088408957225425e-06, + "loss": 0.6990521, + "num_input_tokens_seen": 149757055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 6977, + "time_per_iteration": 2.4294896125793457 + }, + { + "auxiliary_loss_clip": 0.01116519, + "auxiliary_loss_mlp": 0.01034878, + "balance_loss_clip": 1.02193213, + "balance_loss_mlp": 1.04046345, + "epoch": 0.41954005711709, + "flos": 17383889393280.0, + "grad_norm": 2.6461140984259304, + "language_loss": 0.80869353, + "learning_rate": 2.6084699072221898e-06, + "loss": 0.83020747, + "num_input_tokens_seen": 149772885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 6978, + "time_per_iteration": 2.4688472747802734 + }, + { + "auxiliary_loss_clip": 0.01114359, + "auxiliary_loss_mlp": 0.01036249, + "balance_loss_clip": 1.02207506, + "balance_loss_mlp": 1.0377202, + "epoch": 0.419600180369758, + "flos": 25003306915200.0, + "grad_norm": 1.725404980402679, + "language_loss": 0.82583737, + "learning_rate": 2.6080988956485964e-06, + "loss": 0.84734344, + "num_input_tokens_seen": 149791515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 6979, + "time_per_iteration": 2.4702186584472656 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.0194428, + "balance_loss_mlp": 1.0388211, + "epoch": 0.41966030362242596, + "flos": 17383781652480.0, + "grad_norm": 1.8637978278873943, + "language_loss": 0.83381826, + "learning_rate": 2.6077278610158325e-06, + "loss": 0.85528231, + "num_input_tokens_seen": 149807250, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 6980, + "time_per_iteration": 2.5195069313049316 + }, + { + "auxiliary_loss_clip": 0.01116413, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.01975989, + "balance_loss_mlp": 1.03946161, + "epoch": 0.4197204268750939, + "flos": 22156165330560.0, + "grad_norm": 2.9241676519266004, + "language_loss": 0.79068786, + "learning_rate": 2.6073568033379665e-06, + "loss": 0.81218135, + "num_input_tokens_seen": 149821640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 6981, + "time_per_iteration": 2.4457991123199463 + }, + { + "auxiliary_loss_clip": 0.01110331, + "auxiliary_loss_mlp": 0.01034012, + "balance_loss_clip": 1.02078593, + "balance_loss_mlp": 1.03806782, + "epoch": 0.4197805501277619, + "flos": 22084128604800.0, + "grad_norm": 1.6203222993930824, + "language_loss": 0.84426481, + "learning_rate": 2.6069857226290696e-06, + "loss": 0.86570823, + "num_input_tokens_seen": 149840545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 6982, + "time_per_iteration": 2.483635425567627 + }, + { + "auxiliary_loss_clip": 0.01116431, + "auxiliary_loss_mlp": 0.01036883, + "balance_loss_clip": 1.02191043, + "balance_loss_mlp": 1.03910255, + "epoch": 0.4198406733804299, + "flos": 26432192920320.0, + "grad_norm": 1.9325593989695682, + "language_loss": 0.56615967, + "learning_rate": 2.606614618903214e-06, + "loss": 0.58769286, + "num_input_tokens_seen": 149860375, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 6983, + "time_per_iteration": 2.4729864597320557 + }, + { + "auxiliary_loss_clip": 0.01114232, + "auxiliary_loss_mlp": 0.01035133, + "balance_loss_clip": 1.02243733, + "balance_loss_mlp": 1.0403446, + "epoch": 0.4199007966330979, + "flos": 12531029293440.0, + "grad_norm": 2.639890794043824, + "language_loss": 0.82404107, + "learning_rate": 2.606243492174471e-06, + "loss": 0.84553468, + "num_input_tokens_seen": 149877850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 6984, + "time_per_iteration": 2.4610702991485596 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01515102, + "balance_loss_mlp": 1.03938794, + "epoch": 0.41996091988576584, + "flos": 21762944167680.0, + "grad_norm": 1.6654879970317658, + "language_loss": 0.78883481, + "learning_rate": 2.605872342456914e-06, + "loss": 0.81025428, + "num_input_tokens_seen": 149896110, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 6985, + "time_per_iteration": 2.4739370346069336 + }, + { + "auxiliary_loss_clip": 0.01118591, + "auxiliary_loss_mlp": 0.01034658, + "balance_loss_clip": 1.02042401, + "balance_loss_mlp": 1.03950381, + "epoch": 0.4200210431384338, + "flos": 26541935948160.0, + "grad_norm": 3.375844113891133, + "language_loss": 0.77833611, + "learning_rate": 2.6055011697646173e-06, + "loss": 0.79986858, + "num_input_tokens_seen": 149916495, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 6986, + "time_per_iteration": 2.5488531589508057 + }, + { + "auxiliary_loss_clip": 0.01111943, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01886213, + "balance_loss_mlp": 1.03984082, + "epoch": 0.42008116639110177, + "flos": 26795824254720.0, + "grad_norm": 1.5789932508621725, + "language_loss": 0.72640669, + "learning_rate": 2.605129974111655e-06, + "loss": 0.74783587, + "num_input_tokens_seen": 149936445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 6987, + "time_per_iteration": 2.522143840789795 + }, + { + "auxiliary_loss_clip": 0.01117787, + "auxiliary_loss_mlp": 0.01042745, + "balance_loss_clip": 1.02886939, + "balance_loss_mlp": 1.04176915, + "epoch": 0.42014128964376973, + "flos": 32087333243520.0, + "grad_norm": 1.4538200585449164, + "language_loss": 0.75399673, + "learning_rate": 2.604758755512104e-06, + "loss": 0.77560198, + "num_input_tokens_seen": 149959430, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 6988, + "time_per_iteration": 2.57265305519104 + }, + { + "auxiliary_loss_clip": 0.01118811, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02287364, + "balance_loss_mlp": 1.04034519, + "epoch": 0.4202014128964377, + "flos": 26467133875200.0, + "grad_norm": 1.6383736622893421, + "language_loss": 0.74155712, + "learning_rate": 2.60438751398004e-06, + "loss": 0.76311487, + "num_input_tokens_seen": 149980365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 6989, + "time_per_iteration": 2.4846689701080322 + }, + { + "auxiliary_loss_clip": 0.01118468, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02213192, + "balance_loss_mlp": 1.041116, + "epoch": 0.42026153614910566, + "flos": 13401216178560.0, + "grad_norm": 2.649933968591077, + "language_loss": 0.70989478, + "learning_rate": 2.6040162495295404e-06, + "loss": 0.73144102, + "num_input_tokens_seen": 149997375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 6990, + "time_per_iteration": 2.5092554092407227 + }, + { + "auxiliary_loss_clip": 0.01038945, + "auxiliary_loss_mlp": 0.01004482, + "balance_loss_clip": 1.00287271, + "balance_loss_mlp": 1.01510215, + "epoch": 0.42032165940177363, + "flos": 60250457635200.0, + "grad_norm": 0.8281033043630844, + "language_loss": 0.60529578, + "learning_rate": 2.603644962174685e-06, + "loss": 0.62573004, + "num_input_tokens_seen": 150051230, + "router_z_loss_clip": 0.01611328, + "router_z_loss_mlp": 0.23828125, + "step": 6991, + "time_per_iteration": 2.921936511993408 + }, + { + "auxiliary_loss_clip": 0.01120177, + "auxiliary_loss_mlp": 0.01037059, + "balance_loss_clip": 1.02322519, + "balance_loss_mlp": 1.04332614, + "epoch": 0.4203817826544416, + "flos": 24535211852160.0, + "grad_norm": 1.5069916983433078, + "language_loss": 0.83222365, + "learning_rate": 2.6032736519295517e-06, + "loss": 0.85379601, + "num_input_tokens_seen": 150071135, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 6992, + "time_per_iteration": 2.495664358139038 + }, + { + "auxiliary_loss_clip": 0.01039195, + "auxiliary_loss_mlp": 0.01003357, + "balance_loss_clip": 1.00179517, + "balance_loss_mlp": 1.01546574, + "epoch": 0.42044190590710956, + "flos": 58820781530880.0, + "grad_norm": 0.8165124973650228, + "language_loss": 0.65523541, + "learning_rate": 2.6029023188082217e-06, + "loss": 0.67566097, + "num_input_tokens_seen": 150125220, + "router_z_loss_clip": 0.01556396, + "router_z_loss_mlp": 0.23730469, + "step": 6993, + "time_per_iteration": 3.078948736190796 + }, + { + "auxiliary_loss_clip": 0.01122889, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.01845777, + "balance_loss_mlp": 1.04213274, + "epoch": 0.4205020291597775, + "flos": 16436063260800.0, + "grad_norm": 2.0847143106579806, + "language_loss": 0.83213866, + "learning_rate": 2.6025309628247746e-06, + "loss": 0.85370958, + "num_input_tokens_seen": 150142300, + "router_z_loss_clip": 0.15820312, + "router_z_loss_mlp": 0.8046875, + "step": 6994, + "time_per_iteration": 2.42958402633667 + }, + { + "auxiliary_loss_clip": 0.01115372, + "auxiliary_loss_mlp": 0.01034094, + "balance_loss_clip": 1.02112424, + "balance_loss_mlp": 1.04195786, + "epoch": 0.4205621524124455, + "flos": 18405655672320.0, + "grad_norm": 1.6590785995391892, + "language_loss": 0.78497195, + "learning_rate": 2.6021595839932934e-06, + "loss": 0.8064667, + "num_input_tokens_seen": 150161345, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 6995, + "time_per_iteration": 2.4311602115631104 + }, + { + "auxiliary_loss_clip": 0.01113356, + "auxiliary_loss_mlp": 0.01031571, + "balance_loss_clip": 1.01849341, + "balance_loss_mlp": 1.04043221, + "epoch": 0.4206222756651135, + "flos": 25520097841920.0, + "grad_norm": 1.5317093362831764, + "language_loss": 0.79829741, + "learning_rate": 2.60178818232786e-06, + "loss": 0.81974673, + "num_input_tokens_seen": 150182420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 6996, + "time_per_iteration": 2.5032711029052734 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01837945, + "balance_loss_mlp": 1.04208779, + "epoch": 0.4206823989177815, + "flos": 15304338472320.0, + "grad_norm": 2.3208366966184837, + "language_loss": 0.7522642, + "learning_rate": 2.601416757842559e-06, + "loss": 0.77376509, + "num_input_tokens_seen": 150200175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.765625, + "step": 6997, + "time_per_iteration": 2.4281609058380127 + }, + { + "auxiliary_loss_clip": 0.01117176, + "auxiliary_loss_mlp": 0.01038831, + "balance_loss_clip": 1.02492523, + "balance_loss_mlp": 1.03965962, + "epoch": 0.42074252217044944, + "flos": 15554096714880.0, + "grad_norm": 1.9779533128263025, + "language_loss": 0.76193553, + "learning_rate": 2.6010453105514743e-06, + "loss": 0.78349566, + "num_input_tokens_seen": 150217100, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 6998, + "time_per_iteration": 2.4484825134277344 + }, + { + "auxiliary_loss_clip": 0.01121567, + "auxiliary_loss_mlp": 0.01043992, + "balance_loss_clip": 1.02950823, + "balance_loss_mlp": 1.04302716, + "epoch": 0.4208026454231174, + "flos": 26145877610880.0, + "grad_norm": 1.545568275541188, + "language_loss": 0.76295245, + "learning_rate": 2.60067384046869e-06, + "loss": 0.78460807, + "num_input_tokens_seen": 150239830, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 6999, + "time_per_iteration": 2.5371389389038086 + }, + { + "auxiliary_loss_clip": 0.01116809, + "auxiliary_loss_mlp": 0.01039134, + "balance_loss_clip": 1.02512717, + "balance_loss_mlp": 1.04221511, + "epoch": 0.42086276867578537, + "flos": 23550110380800.0, + "grad_norm": 1.7925226690493865, + "language_loss": 0.64549243, + "learning_rate": 2.600302347608295e-06, + "loss": 0.66705179, + "num_input_tokens_seen": 150260690, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7000, + "time_per_iteration": 2.492664337158203 + }, + { + "auxiliary_loss_clip": 0.01117436, + "auxiliary_loss_mlp": 0.01037742, + "balance_loss_clip": 1.02347827, + "balance_loss_mlp": 1.04157186, + "epoch": 0.42092289192845334, + "flos": 18113414618880.0, + "grad_norm": 1.6489015448559594, + "language_loss": 0.76201057, + "learning_rate": 2.5999308319843743e-06, + "loss": 0.7835623, + "num_input_tokens_seen": 150279885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7001, + "time_per_iteration": 2.4374375343322754 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103509, + "balance_loss_clip": 1.02163076, + "balance_loss_mlp": 1.04236293, + "epoch": 0.4209830151811213, + "flos": 20006588845440.0, + "grad_norm": 1.558613926183474, + "language_loss": 0.86427414, + "learning_rate": 2.5995592936110154e-06, + "loss": 0.88578713, + "num_input_tokens_seen": 150297390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7002, + "time_per_iteration": 2.4840235710144043 + }, + { + "auxiliary_loss_clip": 0.01116213, + "auxiliary_loss_mlp": 0.01035955, + "balance_loss_clip": 1.02331328, + "balance_loss_mlp": 1.04153061, + "epoch": 0.42104313843378927, + "flos": 21978946604160.0, + "grad_norm": 2.8393435321353713, + "language_loss": 0.67447579, + "learning_rate": 2.5991877325023096e-06, + "loss": 0.69599748, + "num_input_tokens_seen": 150317390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7003, + "time_per_iteration": 2.452779531478882 + }, + { + "auxiliary_loss_clip": 0.01120595, + "auxiliary_loss_mlp": 0.01042271, + "balance_loss_clip": 1.02727461, + "balance_loss_mlp": 1.04151964, + "epoch": 0.42110326168645723, + "flos": 25443966965760.0, + "grad_norm": 2.097012731379119, + "language_loss": 0.76887131, + "learning_rate": 2.598816148672344e-06, + "loss": 0.79049993, + "num_input_tokens_seen": 150337455, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7890625, + "step": 7004, + "time_per_iteration": 2.4988765716552734 + }, + { + "auxiliary_loss_clip": 0.0111532, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02649117, + "balance_loss_mlp": 1.04101729, + "epoch": 0.4211633849391252, + "flos": 17822574195840.0, + "grad_norm": 1.5948979245136696, + "language_loss": 0.68152726, + "learning_rate": 2.59844454213521e-06, + "loss": 0.70309174, + "num_input_tokens_seen": 150355385, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7421875, + "step": 7005, + "time_per_iteration": 2.4434568881988525 + }, + { + "auxiliary_loss_clip": 0.01118015, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.02340817, + "balance_loss_mlp": 1.04088581, + "epoch": 0.42122350819179316, + "flos": 16282436791680.0, + "grad_norm": 1.9728430752981747, + "language_loss": 0.72047079, + "learning_rate": 2.5980729129049994e-06, + "loss": 0.74202257, + "num_input_tokens_seen": 150371750, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7006, + "time_per_iteration": 2.4487879276275635 + }, + { + "auxiliary_loss_clip": 0.01117712, + "auxiliary_loss_mlp": 0.01033901, + "balance_loss_clip": 1.01978087, + "balance_loss_mlp": 1.04068065, + "epoch": 0.4212836314444611, + "flos": 19645866512640.0, + "grad_norm": 1.688876483049264, + "language_loss": 0.70708871, + "learning_rate": 2.5977012609958033e-06, + "loss": 0.72860485, + "num_input_tokens_seen": 150389955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7007, + "time_per_iteration": 2.437270164489746 + }, + { + "auxiliary_loss_clip": 0.01116417, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.02416158, + "balance_loss_mlp": 1.04059708, + "epoch": 0.4213437546971291, + "flos": 18369026778240.0, + "grad_norm": 1.7353334268618703, + "language_loss": 0.82159567, + "learning_rate": 2.5973295864217166e-06, + "loss": 0.84313881, + "num_input_tokens_seen": 150405780, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7008, + "time_per_iteration": 2.460923194885254 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01040233, + "balance_loss_clip": 1.02642226, + "balance_loss_mlp": 1.03877473, + "epoch": 0.42140387794979706, + "flos": 27704507541120.0, + "grad_norm": 2.1040552452231505, + "language_loss": 0.71574211, + "learning_rate": 2.596957889196831e-06, + "loss": 0.7372905, + "num_input_tokens_seen": 150425615, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7009, + "time_per_iteration": 2.501915693283081 + }, + { + "auxiliary_loss_clip": 0.01116238, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01875222, + "balance_loss_mlp": 1.03954792, + "epoch": 0.4214640012024651, + "flos": 28147071012480.0, + "grad_norm": 2.7512785082136952, + "language_loss": 0.66407478, + "learning_rate": 2.596586169335243e-06, + "loss": 0.68556547, + "num_input_tokens_seen": 150445765, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7010, + "time_per_iteration": 2.5036494731903076 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.01037239, + "balance_loss_clip": 1.02353597, + "balance_loss_mlp": 1.03993797, + "epoch": 0.42152412445513304, + "flos": 22997265177600.0, + "grad_norm": 1.553770179625671, + "language_loss": 0.7243132, + "learning_rate": 2.5962144268510477e-06, + "loss": 0.74583495, + "num_input_tokens_seen": 150464405, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7011, + "time_per_iteration": 2.471482276916504 + }, + { + "auxiliary_loss_clip": 0.01036961, + "auxiliary_loss_mlp": 0.01009192, + "balance_loss_clip": 1.00765407, + "balance_loss_mlp": 1.01291788, + "epoch": 0.421584247707801, + "flos": 63749592938880.0, + "grad_norm": 0.789677431109339, + "language_loss": 0.54321265, + "learning_rate": 2.5958426617583417e-06, + "loss": 0.56367421, + "num_input_tokens_seen": 150520430, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.24023438, + "step": 7012, + "time_per_iteration": 7.156486511230469 + }, + { + "auxiliary_loss_clip": 0.01118573, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.01982975, + "balance_loss_mlp": 1.04137254, + "epoch": 0.421644370960469, + "flos": 24314612474880.0, + "grad_norm": 1.3072085820070551, + "language_loss": 0.78510618, + "learning_rate": 2.5954708740712215e-06, + "loss": 0.80663049, + "num_input_tokens_seen": 150542610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7013, + "time_per_iteration": 2.4873650074005127 + }, + { + "auxiliary_loss_clip": 0.0111676, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.01945186, + "balance_loss_mlp": 1.0393039, + "epoch": 0.42170449421313694, + "flos": 23440690575360.0, + "grad_norm": 1.8972197450653994, + "language_loss": 0.8102268, + "learning_rate": 2.595099063803787e-06, + "loss": 0.83173645, + "num_input_tokens_seen": 150560970, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7734375, + "step": 7014, + "time_per_iteration": 2.4698970317840576 + }, + { + "auxiliary_loss_clip": 0.01116577, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.01885617, + "balance_loss_mlp": 1.039801, + "epoch": 0.4217646174658049, + "flos": 23695476721920.0, + "grad_norm": 1.584816158328088, + "language_loss": 0.7775718, + "learning_rate": 2.5947272309701354e-06, + "loss": 0.79906625, + "num_input_tokens_seen": 150582615, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7015, + "time_per_iteration": 2.48061203956604 + }, + { + "auxiliary_loss_clip": 0.01119879, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02382123, + "balance_loss_mlp": 1.04211378, + "epoch": 0.42182474071847287, + "flos": 24971562270720.0, + "grad_norm": 1.4014002437510662, + "language_loss": 0.82126868, + "learning_rate": 2.594355375584368e-06, + "loss": 0.84285378, + "num_input_tokens_seen": 150603640, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7016, + "time_per_iteration": 2.4971818923950195 + }, + { + "auxiliary_loss_clip": 0.01119768, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.01964498, + "balance_loss_mlp": 1.04142356, + "epoch": 0.42188486397114083, + "flos": 22856639431680.0, + "grad_norm": 2.18227993050423, + "language_loss": 0.68093193, + "learning_rate": 2.593983497660586e-06, + "loss": 0.70246613, + "num_input_tokens_seen": 150622490, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7017, + "time_per_iteration": 2.4511165618896484 + }, + { + "auxiliary_loss_clip": 0.01038936, + "auxiliary_loss_mlp": 0.00999099, + "balance_loss_clip": 0.9975912, + "balance_loss_mlp": 1.01494193, + "epoch": 0.4219449872238088, + "flos": 66975700965120.0, + "grad_norm": 0.6893654540123721, + "language_loss": 0.59420347, + "learning_rate": 2.5936115972128895e-06, + "loss": 0.61458385, + "num_input_tokens_seen": 150689545, + "router_z_loss_clip": 0.01507568, + "router_z_loss_mlp": 0.24023438, + "step": 7018, + "time_per_iteration": 3.1184492111206055 + }, + { + "auxiliary_loss_clip": 0.01118505, + "auxiliary_loss_mlp": 0.01034307, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03985381, + "epoch": 0.42200511047647676, + "flos": 13115367745920.0, + "grad_norm": 1.7697613946295114, + "language_loss": 0.75391936, + "learning_rate": 2.593239674255382e-06, + "loss": 0.77544749, + "num_input_tokens_seen": 150707610, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7019, + "time_per_iteration": 2.415177822113037 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01034757, + "balance_loss_clip": 1.01955771, + "balance_loss_mlp": 1.04044795, + "epoch": 0.42206523372914473, + "flos": 13991193066240.0, + "grad_norm": 2.151945399878188, + "language_loss": 0.69014722, + "learning_rate": 2.592867728802166e-06, + "loss": 0.71166205, + "num_input_tokens_seen": 150724530, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 7020, + "time_per_iteration": 2.502906560897827 + }, + { + "auxiliary_loss_clip": 0.01115881, + "auxiliary_loss_mlp": 0.01032881, + "balance_loss_clip": 1.01976776, + "balance_loss_mlp": 1.04312158, + "epoch": 0.4221253569818127, + "flos": 21942317710080.0, + "grad_norm": 1.807686142219978, + "language_loss": 0.80839896, + "learning_rate": 2.592495760867347e-06, + "loss": 0.82988656, + "num_input_tokens_seen": 150742870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7021, + "time_per_iteration": 2.4480793476104736 + }, + { + "auxiliary_loss_clip": 0.01117987, + "auxiliary_loss_mlp": 0.01030288, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.04118109, + "epoch": 0.42218548023448066, + "flos": 32192587071360.0, + "grad_norm": 1.7624230978889854, + "language_loss": 0.70018518, + "learning_rate": 2.5921237704650293e-06, + "loss": 0.721668, + "num_input_tokens_seen": 150765500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7022, + "time_per_iteration": 2.5637993812561035 + }, + { + "auxiliary_loss_clip": 0.01110409, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01816332, + "balance_loss_mlp": 1.03993058, + "epoch": 0.4222456034871487, + "flos": 30118961894400.0, + "grad_norm": 1.4995673529455043, + "language_loss": 0.66985959, + "learning_rate": 2.5917517576093188e-06, + "loss": 0.69126534, + "num_input_tokens_seen": 150784945, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7023, + "time_per_iteration": 2.518887996673584 + }, + { + "auxiliary_loss_clip": 0.01113824, + "auxiliary_loss_mlp": 0.01032224, + "balance_loss_clip": 1.01872325, + "balance_loss_mlp": 1.04102015, + "epoch": 0.42230572673981664, + "flos": 22127904305280.0, + "grad_norm": 1.5242794814383198, + "language_loss": 0.69374228, + "learning_rate": 2.591379722314322e-06, + "loss": 0.71520281, + "num_input_tokens_seen": 150803120, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7024, + "time_per_iteration": 2.47479510307312 + }, + { + "auxiliary_loss_clip": 0.01115853, + "auxiliary_loss_mlp": 0.01036383, + "balance_loss_clip": 1.02272165, + "balance_loss_mlp": 1.0406878, + "epoch": 0.4223658499924846, + "flos": 22055077480320.0, + "grad_norm": 1.4987089123245305, + "language_loss": 0.76659822, + "learning_rate": 2.591007664594147e-06, + "loss": 0.78812057, + "num_input_tokens_seen": 150823135, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7025, + "time_per_iteration": 2.459552526473999 + }, + { + "auxiliary_loss_clip": 0.01111611, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.01950371, + "balance_loss_mlp": 1.03944087, + "epoch": 0.4224259732451526, + "flos": 20410727742720.0, + "grad_norm": 1.7650754883430373, + "language_loss": 0.79574716, + "learning_rate": 2.5906355844629024e-06, + "loss": 0.81719071, + "num_input_tokens_seen": 150842070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7026, + "time_per_iteration": 2.4876604080200195 + }, + { + "auxiliary_loss_clip": 0.01039298, + "auxiliary_loss_mlp": 0.00998847, + "balance_loss_clip": 0.99741668, + "balance_loss_mlp": 1.01518142, + "epoch": 0.42248609649782054, + "flos": 62846655828480.0, + "grad_norm": 0.7186593098349721, + "language_loss": 0.6191169, + "learning_rate": 2.5902634819346966e-06, + "loss": 0.63949835, + "num_input_tokens_seen": 150907450, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.24121094, + "step": 7027, + "time_per_iteration": 3.1553335189819336 + }, + { + "auxiliary_loss_clip": 0.01115441, + "auxiliary_loss_mlp": 0.01038835, + "balance_loss_clip": 1.02524519, + "balance_loss_mlp": 1.04096365, + "epoch": 0.4225462197504885, + "flos": 26249946289920.0, + "grad_norm": 4.428318649676281, + "language_loss": 0.70515895, + "learning_rate": 2.5898913570236414e-06, + "loss": 0.72670174, + "num_input_tokens_seen": 150928040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7028, + "time_per_iteration": 2.5373435020446777 + }, + { + "auxiliary_loss_clip": 0.01117282, + "auxiliary_loss_mlp": 0.01038664, + "balance_loss_clip": 1.02488303, + "balance_loss_mlp": 1.04104543, + "epoch": 0.42260634300315647, + "flos": 20521943228160.0, + "grad_norm": 1.8463743475085548, + "language_loss": 0.82555425, + "learning_rate": 2.589519209743846e-06, + "loss": 0.84711367, + "num_input_tokens_seen": 150945760, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7029, + "time_per_iteration": 2.5120980739593506 + }, + { + "auxiliary_loss_clip": 0.0112087, + "auxiliary_loss_mlp": 0.010423, + "balance_loss_clip": 1.02790523, + "balance_loss_mlp": 1.04274035, + "epoch": 0.42266646625582444, + "flos": 24316731377280.0, + "grad_norm": 2.3903311172404, + "language_loss": 0.75230241, + "learning_rate": 2.589147040109424e-06, + "loss": 0.77393407, + "num_input_tokens_seen": 150965665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7030, + "time_per_iteration": 2.5118141174316406 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01038089, + "balance_loss_clip": 1.02359271, + "balance_loss_mlp": 1.03835046, + "epoch": 0.4227265895084924, + "flos": 24204151175040.0, + "grad_norm": 1.9474535697331137, + "language_loss": 0.86421049, + "learning_rate": 2.588774848134486e-06, + "loss": 0.88573444, + "num_input_tokens_seen": 150982260, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7031, + "time_per_iteration": 2.500140905380249 + }, + { + "auxiliary_loss_clip": 0.01115501, + "auxiliary_loss_mlp": 0.0103786, + "balance_loss_clip": 1.02328062, + "balance_loss_mlp": 1.04060841, + "epoch": 0.42278671276116037, + "flos": 16909760845440.0, + "grad_norm": 2.1339679402128717, + "language_loss": 0.72855937, + "learning_rate": 2.5884026338331473e-06, + "loss": 0.75009298, + "num_input_tokens_seen": 150999990, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.75, + "step": 7032, + "time_per_iteration": 2.477363109588623 + }, + { + "auxiliary_loss_clip": 0.0111615, + "auxiliary_loss_mlp": 0.01040791, + "balance_loss_clip": 1.02711725, + "balance_loss_mlp": 1.0390861, + "epoch": 0.42284683601382833, + "flos": 25411073086080.0, + "grad_norm": 1.7148750065903648, + "language_loss": 0.699175, + "learning_rate": 2.5880303972195222e-06, + "loss": 0.72074443, + "num_input_tokens_seen": 151021105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7033, + "time_per_iteration": 2.5661494731903076 + }, + { + "auxiliary_loss_clip": 0.01115751, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.01895976, + "balance_loss_mlp": 1.03992891, + "epoch": 0.4229069592664963, + "flos": 23040322606080.0, + "grad_norm": 1.8649473631938416, + "language_loss": 0.90448046, + "learning_rate": 2.5876581383077256e-06, + "loss": 0.92596424, + "num_input_tokens_seen": 151040665, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7034, + "time_per_iteration": 2.4802892208099365 + }, + { + "auxiliary_loss_clip": 0.01112625, + "auxiliary_loss_mlp": 0.01036891, + "balance_loss_clip": 1.02369416, + "balance_loss_mlp": 1.03800857, + "epoch": 0.42296708251916426, + "flos": 26067448264320.0, + "grad_norm": 1.6052176008605175, + "language_loss": 0.77130729, + "learning_rate": 2.5872858571118723e-06, + "loss": 0.79280239, + "num_input_tokens_seen": 151061240, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7035, + "time_per_iteration": 2.5044498443603516 + }, + { + "auxiliary_loss_clip": 0.01118013, + "auxiliary_loss_mlp": 0.01040699, + "balance_loss_clip": 1.02682912, + "balance_loss_mlp": 1.0414331, + "epoch": 0.4230272057718323, + "flos": 19458376496640.0, + "grad_norm": 1.9123378440021823, + "language_loss": 0.82216996, + "learning_rate": 2.5869135536460817e-06, + "loss": 0.84375703, + "num_input_tokens_seen": 151076870, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7036, + "time_per_iteration": 2.4178695678710938 + }, + { + "auxiliary_loss_clip": 0.01112842, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02382207, + "balance_loss_mlp": 1.0403924, + "epoch": 0.42308732902450025, + "flos": 22383300983040.0, + "grad_norm": 1.6417488866700152, + "language_loss": 0.70871484, + "learning_rate": 2.58654122792447e-06, + "loss": 0.73021322, + "num_input_tokens_seen": 151095110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7037, + "time_per_iteration": 2.485499858856201 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01037386, + "balance_loss_clip": 1.02303314, + "balance_loss_mlp": 1.03976059, + "epoch": 0.4231474522771682, + "flos": 20995425331200.0, + "grad_norm": 1.5138937767155718, + "language_loss": 0.77942061, + "learning_rate": 2.586168879961155e-06, + "loss": 0.80094922, + "num_input_tokens_seen": 151114355, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7038, + "time_per_iteration": 2.4569690227508545 + }, + { + "auxiliary_loss_clip": 0.01120787, + "auxiliary_loss_mlp": 0.01044399, + "balance_loss_clip": 1.02919412, + "balance_loss_mlp": 1.04072356, + "epoch": 0.4232075755298362, + "flos": 14975863574400.0, + "grad_norm": 2.366884859254005, + "language_loss": 0.66797423, + "learning_rate": 2.585796509770259e-06, + "loss": 0.6896261, + "num_input_tokens_seen": 151131505, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.80078125, + "step": 7039, + "time_per_iteration": 2.441373825073242 + }, + { + "auxiliary_loss_clip": 0.01119114, + "auxiliary_loss_mlp": 0.01037872, + "balance_loss_clip": 1.02372193, + "balance_loss_mlp": 1.04042578, + "epoch": 0.42326769878250414, + "flos": 24532661986560.0, + "grad_norm": 1.6082175120791662, + "language_loss": 0.75897467, + "learning_rate": 2.5854241173658996e-06, + "loss": 0.78054452, + "num_input_tokens_seen": 151151555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78515625, + "step": 7040, + "time_per_iteration": 2.471653938293457 + }, + { + "auxiliary_loss_clip": 0.01117046, + "auxiliary_loss_mlp": 0.01035054, + "balance_loss_clip": 1.02067101, + "balance_loss_mlp": 1.03962982, + "epoch": 0.4233278220351721, + "flos": 26870303105280.0, + "grad_norm": 1.477939672492119, + "language_loss": 0.65098798, + "learning_rate": 2.5850517027621996e-06, + "loss": 0.67250896, + "num_input_tokens_seen": 151172385, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7041, + "time_per_iteration": 2.502443313598633 + }, + { + "auxiliary_loss_clip": 0.01118281, + "auxiliary_loss_mlp": 0.01036522, + "balance_loss_clip": 1.02233624, + "balance_loss_mlp": 1.04045236, + "epoch": 0.4233879452878401, + "flos": 42814927463040.0, + "grad_norm": 1.7627160436135367, + "language_loss": 0.73621082, + "learning_rate": 2.5846792659732803e-06, + "loss": 0.75775892, + "num_input_tokens_seen": 151194930, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7042, + "time_per_iteration": 2.6498820781707764 + }, + { + "auxiliary_loss_clip": 0.01112749, + "auxiliary_loss_mlp": 0.01033182, + "balance_loss_clip": 1.02020609, + "balance_loss_mlp": 1.03977966, + "epoch": 0.42344806854050804, + "flos": 25229006023680.0, + "grad_norm": 1.3177903064215164, + "language_loss": 0.82185107, + "learning_rate": 2.5843068070132643e-06, + "loss": 0.84331036, + "num_input_tokens_seen": 151217905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7043, + "time_per_iteration": 2.528604745864868 + }, + { + "auxiliary_loss_clip": 0.0111836, + "auxiliary_loss_mlp": 0.01041223, + "balance_loss_clip": 1.02608395, + "balance_loss_mlp": 1.04329216, + "epoch": 0.423508191793176, + "flos": 22778820616320.0, + "grad_norm": 2.3747778329738742, + "language_loss": 0.65231359, + "learning_rate": 2.5839343258962763e-06, + "loss": 0.67390943, + "num_input_tokens_seen": 151234580, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75, + "step": 7044, + "time_per_iteration": 2.4399802684783936 + }, + { + "auxiliary_loss_clip": 0.01121384, + "auxiliary_loss_mlp": 0.01046534, + "balance_loss_clip": 1.03126323, + "balance_loss_mlp": 1.04322433, + "epoch": 0.42356831504584397, + "flos": 34637493179520.0, + "grad_norm": 1.7497316034691441, + "language_loss": 0.7502315, + "learning_rate": 2.5835618226364393e-06, + "loss": 0.77191073, + "num_input_tokens_seen": 151254765, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 7045, + "time_per_iteration": 2.612898588180542 + }, + { + "auxiliary_loss_clip": 0.01116302, + "auxiliary_loss_mlp": 0.01035318, + "balance_loss_clip": 1.02141845, + "balance_loss_mlp": 1.04219389, + "epoch": 0.42362843829851193, + "flos": 17596767346560.0, + "grad_norm": 2.1011396794876385, + "language_loss": 0.80564952, + "learning_rate": 2.5831892972478797e-06, + "loss": 0.82716572, + "num_input_tokens_seen": 151269045, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7046, + "time_per_iteration": 2.4105727672576904 + }, + { + "auxiliary_loss_clip": 0.01119082, + "auxiliary_loss_mlp": 0.0103605, + "balance_loss_clip": 1.021685, + "balance_loss_mlp": 1.04078197, + "epoch": 0.4236885615511799, + "flos": 22565691267840.0, + "grad_norm": 1.59844067944401, + "language_loss": 0.76846749, + "learning_rate": 2.5828167497447242e-06, + "loss": 0.7900188, + "num_input_tokens_seen": 151287530, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7047, + "time_per_iteration": 2.486297130584717 + }, + { + "auxiliary_loss_clip": 0.01116569, + "auxiliary_loss_mlp": 0.01034292, + "balance_loss_clip": 1.02102375, + "balance_loss_mlp": 1.04264975, + "epoch": 0.42374868480384786, + "flos": 26469216864000.0, + "grad_norm": 1.8697996227798281, + "language_loss": 0.67980373, + "learning_rate": 2.582444180141098e-06, + "loss": 0.70131224, + "num_input_tokens_seen": 151308905, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7048, + "time_per_iteration": 2.5031991004943848 + }, + { + "auxiliary_loss_clip": 0.01119136, + "auxiliary_loss_mlp": 0.01038379, + "balance_loss_clip": 1.02371609, + "balance_loss_mlp": 1.04227185, + "epoch": 0.4238088080565159, + "flos": 20370220179840.0, + "grad_norm": 1.7311423758965327, + "language_loss": 0.7829181, + "learning_rate": 2.5820715884511307e-06, + "loss": 0.80449331, + "num_input_tokens_seen": 151326525, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 7049, + "time_per_iteration": 2.549767255783081 + }, + { + "auxiliary_loss_clip": 0.01121261, + "auxiliary_loss_mlp": 0.01039585, + "balance_loss_clip": 1.02570868, + "balance_loss_mlp": 1.0433383, + "epoch": 0.42386893130918385, + "flos": 21172105353600.0, + "grad_norm": 1.7774881318176563, + "language_loss": 0.82656097, + "learning_rate": 2.5816989746889504e-06, + "loss": 0.84816945, + "num_input_tokens_seen": 151344675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.78125, + "step": 7050, + "time_per_iteration": 2.498494863510132 + }, + { + "auxiliary_loss_clip": 0.01115122, + "auxiliary_loss_mlp": 0.01035845, + "balance_loss_clip": 1.02233815, + "balance_loss_mlp": 1.0382762, + "epoch": 0.4239290545618518, + "flos": 17675627656320.0, + "grad_norm": 2.0169322630318844, + "language_loss": 0.73429018, + "learning_rate": 2.581326338868687e-06, + "loss": 0.75579983, + "num_input_tokens_seen": 151360730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7051, + "time_per_iteration": 2.441920042037964 + }, + { + "auxiliary_loss_clip": 0.01118227, + "auxiliary_loss_mlp": 0.01033059, + "balance_loss_clip": 1.01983249, + "balance_loss_mlp": 1.04219055, + "epoch": 0.4239891778145198, + "flos": 24314504734080.0, + "grad_norm": 1.4713561275118965, + "language_loss": 0.86205333, + "learning_rate": 2.5809536810044706e-06, + "loss": 0.8835662, + "num_input_tokens_seen": 151380445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7052, + "time_per_iteration": 2.511756658554077 + }, + { + "auxiliary_loss_clip": 0.01116616, + "auxiliary_loss_mlp": 0.01041035, + "balance_loss_clip": 1.02657533, + "balance_loss_mlp": 1.03951788, + "epoch": 0.42404930106718774, + "flos": 20558428467840.0, + "grad_norm": 1.4100722391624452, + "language_loss": 0.7240659, + "learning_rate": 2.5805810011104323e-06, + "loss": 0.74564236, + "num_input_tokens_seen": 151399325, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7053, + "time_per_iteration": 3.9099857807159424 + }, + { + "auxiliary_loss_clip": 0.01116742, + "auxiliary_loss_mlp": 0.01033237, + "balance_loss_clip": 1.0190872, + "balance_loss_mlp": 1.04233611, + "epoch": 0.4241094243198557, + "flos": 22308067946880.0, + "grad_norm": 1.5741365926511655, + "language_loss": 0.82153803, + "learning_rate": 2.580208299200704e-06, + "loss": 0.84303784, + "num_input_tokens_seen": 151417240, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 7054, + "time_per_iteration": 5.327679634094238 + }, + { + "auxiliary_loss_clip": 0.01040448, + "auxiliary_loss_mlp": 0.0101831, + "balance_loss_clip": 1.01700425, + "balance_loss_mlp": 1.01674867, + "epoch": 0.4241695475725237, + "flos": 70612445272320.0, + "grad_norm": 0.7840713570529064, + "language_loss": 0.60388172, + "learning_rate": 2.5798355752894183e-06, + "loss": 0.62446928, + "num_input_tokens_seen": 151476015, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23632812, + "step": 7055, + "time_per_iteration": 3.0450727939605713 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01041139, + "balance_loss_clip": 1.02651238, + "balance_loss_mlp": 1.04204714, + "epoch": 0.42422967082519164, + "flos": 14027462824320.0, + "grad_norm": 2.951771931203088, + "language_loss": 0.76762712, + "learning_rate": 2.5794628293907107e-06, + "loss": 0.78923267, + "num_input_tokens_seen": 151492035, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7056, + "time_per_iteration": 2.442148447036743 + }, + { + "auxiliary_loss_clip": 0.01121258, + "auxiliary_loss_mlp": 0.01039415, + "balance_loss_clip": 1.02375674, + "balance_loss_mlp": 1.04127979, + "epoch": 0.4242897940778596, + "flos": 22345522853760.0, + "grad_norm": 2.7846662247260388, + "language_loss": 0.84346795, + "learning_rate": 2.579090061518714e-06, + "loss": 0.86507463, + "num_input_tokens_seen": 151508970, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.80078125, + "step": 7057, + "time_per_iteration": 2.474519968032837 + }, + { + "auxiliary_loss_clip": 0.01119549, + "auxiliary_loss_mlp": 0.010377, + "balance_loss_clip": 1.02272737, + "balance_loss_mlp": 1.04053187, + "epoch": 0.42434991733052757, + "flos": 22595855713920.0, + "grad_norm": 3.1820547358610605, + "language_loss": 0.82999814, + "learning_rate": 2.5787172716875642e-06, + "loss": 0.85157061, + "num_input_tokens_seen": 151525295, + "router_z_loss_clip": 0.14941406, + "router_z_loss_mlp": 0.7890625, + "step": 7058, + "time_per_iteration": 2.473520517349243 + }, + { + "auxiliary_loss_clip": 0.01118587, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01533902, + "balance_loss_mlp": 1.04417813, + "epoch": 0.42441004058319554, + "flos": 20011437181440.0, + "grad_norm": 1.7435131696457398, + "language_loss": 0.80453449, + "learning_rate": 2.5783444599113973e-06, + "loss": 0.82600558, + "num_input_tokens_seen": 151544435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7059, + "time_per_iteration": 2.4719533920288086 + }, + { + "auxiliary_loss_clip": 0.01120005, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.01860404, + "balance_loss_mlp": 1.041839, + "epoch": 0.4244701638358635, + "flos": 11144985235200.0, + "grad_norm": 1.9429107045123646, + "language_loss": 0.70341688, + "learning_rate": 2.57797162620435e-06, + "loss": 0.72495657, + "num_input_tokens_seen": 151559520, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.78125, + "step": 7060, + "time_per_iteration": 2.4377660751342773 + }, + { + "auxiliary_loss_clip": 0.0112048, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.01914454, + "balance_loss_mlp": 1.04378521, + "epoch": 0.42453028708853147, + "flos": 23987753688960.0, + "grad_norm": 1.5364996273974925, + "language_loss": 0.76182258, + "learning_rate": 2.577598770580562e-06, + "loss": 0.78335667, + "num_input_tokens_seen": 151579790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7061, + "time_per_iteration": 2.486786365509033 + }, + { + "auxiliary_loss_clip": 0.01122599, + "auxiliary_loss_mlp": 0.01038557, + "balance_loss_clip": 1.02319098, + "balance_loss_mlp": 1.04407752, + "epoch": 0.42459041034119943, + "flos": 18406338030720.0, + "grad_norm": 3.328289037638814, + "language_loss": 0.729635, + "learning_rate": 2.5772258930541693e-06, + "loss": 0.75124645, + "num_input_tokens_seen": 151598285, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.7890625, + "step": 7062, + "time_per_iteration": 2.474193572998047 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.0104003, + "balance_loss_clip": 1.02600455, + "balance_loss_mlp": 1.03964305, + "epoch": 0.42465053359386745, + "flos": 20958006337920.0, + "grad_norm": 1.701854582957673, + "language_loss": 0.66343361, + "learning_rate": 2.5768529936393137e-06, + "loss": 0.68500221, + "num_input_tokens_seen": 151615430, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7734375, + "step": 7063, + "time_per_iteration": 2.458003520965576 + }, + { + "auxiliary_loss_clip": 0.01115284, + "auxiliary_loss_mlp": 0.01031318, + "balance_loss_clip": 1.0181458, + "balance_loss_mlp": 1.04179168, + "epoch": 0.4247106568465354, + "flos": 33106190520960.0, + "grad_norm": 1.4878317325171677, + "language_loss": 0.78371775, + "learning_rate": 2.5764800723501354e-06, + "loss": 0.80518377, + "num_input_tokens_seen": 151637030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7064, + "time_per_iteration": 2.5735623836517334 + }, + { + "auxiliary_loss_clip": 0.01118889, + "auxiliary_loss_mlp": 0.01040848, + "balance_loss_clip": 1.02636456, + "balance_loss_mlp": 1.04172683, + "epoch": 0.4247707800992034, + "flos": 20046916840320.0, + "grad_norm": 1.8409826195637737, + "language_loss": 0.74893892, + "learning_rate": 2.5761071292007736e-06, + "loss": 0.7705363, + "num_input_tokens_seen": 151655745, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76953125, + "step": 7065, + "time_per_iteration": 2.4962844848632812 + }, + { + "auxiliary_loss_clip": 0.01119456, + "auxiliary_loss_mlp": 0.01035708, + "balance_loss_clip": 1.0206933, + "balance_loss_mlp": 1.04322076, + "epoch": 0.42483090335187135, + "flos": 22385132576640.0, + "grad_norm": 1.415711347923808, + "language_loss": 0.72713453, + "learning_rate": 2.5757341642053725e-06, + "loss": 0.74868619, + "num_input_tokens_seen": 151678040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7066, + "time_per_iteration": 2.551297426223755 + }, + { + "auxiliary_loss_clip": 0.01119285, + "auxiliary_loss_mlp": 0.01038218, + "balance_loss_clip": 1.02307224, + "balance_loss_mlp": 1.04031396, + "epoch": 0.4248910266045393, + "flos": 21356830022400.0, + "grad_norm": 1.9392042625935109, + "language_loss": 0.79517603, + "learning_rate": 2.5753611773780745e-06, + "loss": 0.81675112, + "num_input_tokens_seen": 151696410, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.7890625, + "step": 7067, + "time_per_iteration": 2.4871444702148438 + }, + { + "auxiliary_loss_clip": 0.010394, + "auxiliary_loss_mlp": 0.01005215, + "balance_loss_clip": 1.00373113, + "balance_loss_mlp": 1.01538539, + "epoch": 0.4249511498572073, + "flos": 64008114099840.0, + "grad_norm": 0.919528911316311, + "language_loss": 0.63477993, + "learning_rate": 2.574988168733022e-06, + "loss": 0.65522605, + "num_input_tokens_seen": 151756365, + "router_z_loss_clip": 0.01483154, + "router_z_loss_mlp": 0.24023438, + "step": 7068, + "time_per_iteration": 3.0116004943847656 + }, + { + "auxiliary_loss_clip": 0.01119716, + "auxiliary_loss_mlp": 0.01036194, + "balance_loss_clip": 1.02073288, + "balance_loss_mlp": 1.04235375, + "epoch": 0.42501127310987524, + "flos": 19607046888960.0, + "grad_norm": 1.681037886347605, + "language_loss": 0.72381866, + "learning_rate": 2.574615138284361e-06, + "loss": 0.74537772, + "num_input_tokens_seen": 151775165, + "router_z_loss_clip": 0.15527344, + "router_z_loss_mlp": 0.7734375, + "step": 7069, + "time_per_iteration": 2.5046679973602295 + }, + { + "auxiliary_loss_clip": 0.01122307, + "auxiliary_loss_mlp": 0.01034999, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.04424644, + "epoch": 0.4250713963625432, + "flos": 19462326992640.0, + "grad_norm": 3.2712432047864852, + "language_loss": 0.79297352, + "learning_rate": 2.5742420860462364e-06, + "loss": 0.81454653, + "num_input_tokens_seen": 151792620, + "router_z_loss_clip": 0.16308594, + "router_z_loss_mlp": 0.78125, + "step": 7070, + "time_per_iteration": 2.43115496635437 + }, + { + "auxiliary_loss_clip": 0.01118123, + "auxiliary_loss_mlp": 0.01033635, + "balance_loss_clip": 1.01863861, + "balance_loss_mlp": 1.04104066, + "epoch": 0.4251315196152112, + "flos": 25337707557120.0, + "grad_norm": 1.8101520547589562, + "language_loss": 0.70179212, + "learning_rate": 2.573869012032795e-06, + "loss": 0.7233097, + "num_input_tokens_seen": 151812850, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7071, + "time_per_iteration": 2.5141680240631104 + }, + { + "auxiliary_loss_clip": 0.01118096, + "auxiliary_loss_mlp": 0.01033548, + "balance_loss_clip": 1.01942205, + "balance_loss_mlp": 1.04123151, + "epoch": 0.42519164286787914, + "flos": 26359186527360.0, + "grad_norm": 2.3450864635540825, + "language_loss": 0.71075511, + "learning_rate": 2.5734959162581824e-06, + "loss": 0.73227149, + "num_input_tokens_seen": 151831785, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7072, + "time_per_iteration": 2.489187002182007 + }, + { + "auxiliary_loss_clip": 0.01122118, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.04270983, + "epoch": 0.4252517661205471, + "flos": 26031070765440.0, + "grad_norm": 1.5399076436438217, + "language_loss": 0.81655496, + "learning_rate": 2.5731227987365475e-06, + "loss": 0.83810043, + "num_input_tokens_seen": 151853885, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.79296875, + "step": 7073, + "time_per_iteration": 2.5192041397094727 + }, + { + "auxiliary_loss_clip": 0.0111768, + "auxiliary_loss_mlp": 0.01034416, + "balance_loss_clip": 1.02097535, + "balance_loss_mlp": 1.04180706, + "epoch": 0.42531188937321507, + "flos": 12713635059840.0, + "grad_norm": 2.1264240253054227, + "language_loss": 0.90777069, + "learning_rate": 2.5727496594820386e-06, + "loss": 0.92929167, + "num_input_tokens_seen": 151871780, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7074, + "time_per_iteration": 2.418611526489258 + }, + { + "auxiliary_loss_clip": 0.01122809, + "auxiliary_loss_mlp": 0.01039179, + "balance_loss_clip": 1.0234437, + "balance_loss_mlp": 1.04282892, + "epoch": 0.42537201262588303, + "flos": 22091670460800.0, + "grad_norm": 1.5751331844442036, + "language_loss": 0.63971686, + "learning_rate": 2.572376498508805e-06, + "loss": 0.66133678, + "num_input_tokens_seen": 151891600, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.80078125, + "step": 7075, + "time_per_iteration": 2.5064475536346436 + }, + { + "auxiliary_loss_clip": 0.01114521, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01708984, + "balance_loss_mlp": 1.04121399, + "epoch": 0.42543213587855105, + "flos": 23003119094400.0, + "grad_norm": 1.5599863464934922, + "language_loss": 0.73547149, + "learning_rate": 2.5720033158309973e-06, + "loss": 0.75691831, + "num_input_tokens_seen": 151911330, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7076, + "time_per_iteration": 2.487424850463867 + }, + { + "auxiliary_loss_clip": 0.01122674, + "auxiliary_loss_mlp": 0.01040326, + "balance_loss_clip": 1.02565181, + "balance_loss_mlp": 1.04370356, + "epoch": 0.425492259131219, + "flos": 25082454533760.0, + "grad_norm": 1.8221025125090708, + "language_loss": 0.78215933, + "learning_rate": 2.571630111462766e-06, + "loss": 0.80378938, + "num_input_tokens_seen": 151930355, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7077, + "time_per_iteration": 2.4964394569396973 + }, + { + "auxiliary_loss_clip": 0.01114549, + "auxiliary_loss_mlp": 0.01034843, + "balance_loss_clip": 1.0221417, + "balance_loss_mlp": 1.04220366, + "epoch": 0.425552382383887, + "flos": 22816850140800.0, + "grad_norm": 1.6016827264272244, + "language_loss": 0.73013902, + "learning_rate": 2.571256885418265e-06, + "loss": 0.75163293, + "num_input_tokens_seen": 151949695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7078, + "time_per_iteration": 2.47660756111145 + }, + { + "auxiliary_loss_clip": 0.01120871, + "auxiliary_loss_mlp": 0.01042162, + "balance_loss_clip": 1.02880406, + "balance_loss_mlp": 1.0461756, + "epoch": 0.42561250563655495, + "flos": 13553585671680.0, + "grad_norm": 1.731645410920913, + "language_loss": 0.79469633, + "learning_rate": 2.5708836377116445e-06, + "loss": 0.81632668, + "num_input_tokens_seen": 151967640, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7079, + "time_per_iteration": 2.499232769012451 + }, + { + "auxiliary_loss_clip": 0.0112172, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.0181613, + "balance_loss_mlp": 1.04761243, + "epoch": 0.4256726288892229, + "flos": 46978303023360.0, + "grad_norm": 1.4705007316204746, + "language_loss": 0.72263241, + "learning_rate": 2.5705103683570592e-06, + "loss": 0.74416137, + "num_input_tokens_seen": 151994020, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7080, + "time_per_iteration": 2.732074499130249 + }, + { + "auxiliary_loss_clip": 0.0111869, + "auxiliary_loss_mlp": 0.01035587, + "balance_loss_clip": 1.02206242, + "balance_loss_mlp": 1.04246545, + "epoch": 0.4257327521418909, + "flos": 23586451966080.0, + "grad_norm": 2.328741773172896, + "language_loss": 0.80405676, + "learning_rate": 2.5701370773686646e-06, + "loss": 0.82559955, + "num_input_tokens_seen": 152013415, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7081, + "time_per_iteration": 2.6035380363464355 + }, + { + "auxiliary_loss_clip": 0.01115229, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753235, + "balance_loss_mlp": 1.04303384, + "epoch": 0.42579287539455885, + "flos": 18989994124800.0, + "grad_norm": 1.7894721227922463, + "language_loss": 0.81618208, + "learning_rate": 2.5697637647606138e-06, + "loss": 0.8376382, + "num_input_tokens_seen": 152030860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 7082, + "time_per_iteration": 2.444728374481201 + }, + { + "auxiliary_loss_clip": 0.01119852, + "auxiliary_loss_mlp": 0.0103706, + "balance_loss_clip": 1.02286816, + "balance_loss_mlp": 1.04368842, + "epoch": 0.4258529986472268, + "flos": 25191910252800.0, + "grad_norm": 2.6988843094625508, + "language_loss": 0.69388473, + "learning_rate": 2.569390430547065e-06, + "loss": 0.71545386, + "num_input_tokens_seen": 152050395, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7083, + "time_per_iteration": 2.5369133949279785 + }, + { + "auxiliary_loss_clip": 0.01040302, + "auxiliary_loss_mlp": 0.00999977, + "balance_loss_clip": 0.99864787, + "balance_loss_mlp": 1.01655924, + "epoch": 0.4259131218998948, + "flos": 69968280718080.0, + "grad_norm": 0.8706759407802692, + "language_loss": 0.67112887, + "learning_rate": 2.569017074742173e-06, + "loss": 0.69153166, + "num_input_tokens_seen": 152113555, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.23828125, + "step": 7084, + "time_per_iteration": 3.1631839275360107 + }, + { + "auxiliary_loss_clip": 0.01118847, + "auxiliary_loss_mlp": 0.01044199, + "balance_loss_clip": 1.02887428, + "balance_loss_mlp": 1.04295874, + "epoch": 0.42597324515256274, + "flos": 18004964480640.0, + "grad_norm": 2.6244995349856595, + "language_loss": 0.78095287, + "learning_rate": 2.5686436973600964e-06, + "loss": 0.80258334, + "num_input_tokens_seen": 152131575, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.7578125, + "step": 7085, + "time_per_iteration": 2.493157148361206 + }, + { + "auxiliary_loss_clip": 0.01129017, + "auxiliary_loss_mlp": 0.01046431, + "balance_loss_clip": 1.03102934, + "balance_loss_mlp": 1.04819477, + "epoch": 0.4260333684052307, + "flos": 15158792563200.0, + "grad_norm": 2.071277468695464, + "language_loss": 0.75757217, + "learning_rate": 2.568270298414995e-06, + "loss": 0.77932662, + "num_input_tokens_seen": 152149435, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.80859375, + "step": 7086, + "time_per_iteration": 2.426295280456543 + }, + { + "auxiliary_loss_clip": 0.01119794, + "auxiliary_loss_mlp": 0.01037065, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.0433557, + "epoch": 0.42609349165789867, + "flos": 14939342421120.0, + "grad_norm": 2.1734108107028147, + "language_loss": 0.8001647, + "learning_rate": 2.5678968779210255e-06, + "loss": 0.82173336, + "num_input_tokens_seen": 152166860, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7087, + "time_per_iteration": 2.46087384223938 + }, + { + "auxiliary_loss_clip": 0.01123365, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.04632342, + "epoch": 0.42615361491056664, + "flos": 23731961961600.0, + "grad_norm": 1.8444426655441133, + "language_loss": 0.6603114, + "learning_rate": 2.5675234358923505e-06, + "loss": 0.68187302, + "num_input_tokens_seen": 152187475, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7088, + "time_per_iteration": 2.481919527053833 + }, + { + "auxiliary_loss_clip": 0.01123249, + "auxiliary_loss_mlp": 0.01039067, + "balance_loss_clip": 1.02472591, + "balance_loss_mlp": 1.0449152, + "epoch": 0.42621373816323466, + "flos": 24936441747840.0, + "grad_norm": 1.8812259313043718, + "language_loss": 0.68482029, + "learning_rate": 2.56714997234313e-06, + "loss": 0.70644343, + "num_input_tokens_seen": 152207235, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78125, + "step": 7089, + "time_per_iteration": 2.523918628692627 + }, + { + "auxiliary_loss_clip": 0.01121302, + "auxiliary_loss_mlp": 0.01038776, + "balance_loss_clip": 1.02473295, + "balance_loss_mlp": 1.0418849, + "epoch": 0.4262738614159026, + "flos": 13552975140480.0, + "grad_norm": 2.8669230196035027, + "language_loss": 0.72897398, + "learning_rate": 2.566776487287525e-06, + "loss": 0.75057483, + "num_input_tokens_seen": 152224240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7090, + "time_per_iteration": 2.4340648651123047 + }, + { + "auxiliary_loss_clip": 0.01124016, + "auxiliary_loss_mlp": 0.01046428, + "balance_loss_clip": 1.03208125, + "balance_loss_mlp": 1.04372311, + "epoch": 0.4263339846685706, + "flos": 29748794284800.0, + "grad_norm": 1.7953532910276222, + "language_loss": 0.75347531, + "learning_rate": 2.5664029807396994e-06, + "loss": 0.77517974, + "num_input_tokens_seen": 152242595, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7091, + "time_per_iteration": 2.5973541736602783 + }, + { + "auxiliary_loss_clip": 0.0111574, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.02188134, + "balance_loss_mlp": 1.04312468, + "epoch": 0.42639410792123855, + "flos": 16834204586880.0, + "grad_norm": 1.6821401092021848, + "language_loss": 0.82308388, + "learning_rate": 2.5660294527138156e-06, + "loss": 0.84458697, + "num_input_tokens_seen": 152260840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7092, + "time_per_iteration": 2.453181266784668 + }, + { + "auxiliary_loss_clip": 0.0112628, + "auxiliary_loss_mlp": 0.01045355, + "balance_loss_clip": 1.03138983, + "balance_loss_mlp": 1.0454514, + "epoch": 0.4264542311739065, + "flos": 28763118195840.0, + "grad_norm": 1.6505279256890275, + "language_loss": 0.73916072, + "learning_rate": 2.565655903224038e-06, + "loss": 0.76087701, + "num_input_tokens_seen": 152280580, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.80859375, + "step": 7093, + "time_per_iteration": 2.5176479816436768 + }, + { + "auxiliary_loss_clip": 0.01120741, + "auxiliary_loss_mlp": 0.01039533, + "balance_loss_clip": 1.02482259, + "balance_loss_mlp": 1.04376769, + "epoch": 0.4265143544265745, + "flos": 24713615727360.0, + "grad_norm": 2.5315083588078555, + "language_loss": 0.69390249, + "learning_rate": 2.565282332284532e-06, + "loss": 0.71550524, + "num_input_tokens_seen": 152298455, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.76953125, + "step": 7094, + "time_per_iteration": 2.489561080932617 + }, + { + "auxiliary_loss_clip": 0.01122789, + "auxiliary_loss_mlp": 0.01038187, + "balance_loss_clip": 1.02379799, + "balance_loss_mlp": 1.04475617, + "epoch": 0.42657447767924245, + "flos": 21865971352320.0, + "grad_norm": 1.6055215896501054, + "language_loss": 0.81466055, + "learning_rate": 2.564908739909464e-06, + "loss": 0.83627033, + "num_input_tokens_seen": 152316995, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7095, + "time_per_iteration": 6.829655647277832 + }, + { + "auxiliary_loss_clip": 0.01122192, + "auxiliary_loss_mlp": 0.01044565, + "balance_loss_clip": 1.02972341, + "balance_loss_mlp": 1.04453826, + "epoch": 0.4266346009319104, + "flos": 21470236237440.0, + "grad_norm": 1.7098780852895776, + "language_loss": 0.80283463, + "learning_rate": 2.5645351261129996e-06, + "loss": 0.82450223, + "num_input_tokens_seen": 152334800, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7096, + "time_per_iteration": 3.894577980041504 + }, + { + "auxiliary_loss_clip": 0.01125109, + "auxiliary_loss_mlp": 0.01041794, + "balance_loss_clip": 1.02754259, + "balance_loss_mlp": 1.04520798, + "epoch": 0.4266947241845784, + "flos": 25519379569920.0, + "grad_norm": 1.947200367016257, + "language_loss": 0.65628326, + "learning_rate": 2.5641614909093066e-06, + "loss": 0.67795235, + "num_input_tokens_seen": 152355175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7097, + "time_per_iteration": 2.5192034244537354 + }, + { + "auxiliary_loss_clip": 0.01117089, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.04297018, + "epoch": 0.42675484743724634, + "flos": 26541217676160.0, + "grad_norm": 1.8194330831870058, + "language_loss": 0.74512994, + "learning_rate": 2.5637878343125535e-06, + "loss": 0.76660931, + "num_input_tokens_seen": 152377245, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7098, + "time_per_iteration": 2.498380661010742 + }, + { + "auxiliary_loss_clip": 0.01118318, + "auxiliary_loss_mlp": 0.01032967, + "balance_loss_clip": 1.01969302, + "balance_loss_mlp": 1.04259086, + "epoch": 0.4268149706899143, + "flos": 23112718467840.0, + "grad_norm": 1.7218259388529535, + "language_loss": 0.75169343, + "learning_rate": 2.5634141563369086e-06, + "loss": 0.77320623, + "num_input_tokens_seen": 152396985, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7099, + "time_per_iteration": 2.4900684356689453 + }, + { + "auxiliary_loss_clip": 0.01122249, + "auxiliary_loss_mlp": 0.01039401, + "balance_loss_clip": 1.02458942, + "balance_loss_mlp": 1.0437479, + "epoch": 0.4268750939425823, + "flos": 22706532495360.0, + "grad_norm": 1.9952935228943551, + "language_loss": 0.83543229, + "learning_rate": 2.5630404569965432e-06, + "loss": 0.85704881, + "num_input_tokens_seen": 152415590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7100, + "time_per_iteration": 2.467902183532715 + }, + { + "auxiliary_loss_clip": 0.01121229, + "auxiliary_loss_mlp": 0.01034586, + "balance_loss_clip": 1.0208652, + "balance_loss_mlp": 1.04333866, + "epoch": 0.42693521719525024, + "flos": 25374875155200.0, + "grad_norm": 1.3501788659102136, + "language_loss": 0.82243335, + "learning_rate": 2.562666736305627e-06, + "loss": 0.84399146, + "num_input_tokens_seen": 152436735, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7101, + "time_per_iteration": 2.5363035202026367 + }, + { + "auxiliary_loss_clip": 0.01124462, + "auxiliary_loss_mlp": 0.01034069, + "balance_loss_clip": 1.01972795, + "balance_loss_mlp": 1.04426765, + "epoch": 0.42699534044791826, + "flos": 18150689957760.0, + "grad_norm": 1.8760573998828747, + "language_loss": 0.7243284, + "learning_rate": 2.5622929942783314e-06, + "loss": 0.74591374, + "num_input_tokens_seen": 152455685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7102, + "time_per_iteration": 2.443894624710083 + }, + { + "auxiliary_loss_clip": 0.01117542, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02012062, + "balance_loss_mlp": 1.04262853, + "epoch": 0.4270554637005862, + "flos": 13698413308800.0, + "grad_norm": 1.799822548331586, + "language_loss": 0.82910782, + "learning_rate": 2.5619192309288297e-06, + "loss": 0.85061657, + "num_input_tokens_seen": 152473500, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7103, + "time_per_iteration": 2.4751625061035156 + }, + { + "auxiliary_loss_clip": 0.01122919, + "auxiliary_loss_mlp": 0.01036815, + "balance_loss_clip": 1.02205122, + "balance_loss_mlp": 1.04319, + "epoch": 0.4271155869532542, + "flos": 17493596507520.0, + "grad_norm": 2.0452515416159227, + "language_loss": 0.73823762, + "learning_rate": 2.561545446271294e-06, + "loss": 0.759835, + "num_input_tokens_seen": 152491320, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.796875, + "step": 7104, + "time_per_iteration": 2.433727264404297 + }, + { + "auxiliary_loss_clip": 0.01120598, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01842821, + "balance_loss_mlp": 1.04307532, + "epoch": 0.42717571020592215, + "flos": 32452293381120.0, + "grad_norm": 2.0713947006575713, + "language_loss": 0.75097072, + "learning_rate": 2.5611716403198987e-06, + "loss": 0.77249593, + "num_input_tokens_seen": 152511970, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.77734375, + "step": 7105, + "time_per_iteration": 2.553220748901367 + }, + { + "auxiliary_loss_clip": 0.01123627, + "auxiliary_loss_mlp": 0.01038594, + "balance_loss_clip": 1.02499223, + "balance_loss_mlp": 1.04497468, + "epoch": 0.4272358334585901, + "flos": 16253062444800.0, + "grad_norm": 1.944135826622959, + "language_loss": 0.7652669, + "learning_rate": 2.560797813088819e-06, + "loss": 0.78688908, + "num_input_tokens_seen": 152530515, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7106, + "time_per_iteration": 2.4320499897003174 + }, + { + "auxiliary_loss_clip": 0.01116905, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02062345, + "balance_loss_mlp": 1.04073668, + "epoch": 0.4272959567112581, + "flos": 24200092938240.0, + "grad_norm": 1.7002032775641, + "language_loss": 0.79748225, + "learning_rate": 2.560423964592229e-06, + "loss": 0.81899506, + "num_input_tokens_seen": 152549295, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7107, + "time_per_iteration": 2.5138087272644043 + }, + { + "auxiliary_loss_clip": 0.01118344, + "auxiliary_loss_mlp": 0.01032973, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.04365969, + "epoch": 0.42735607996392605, + "flos": 27963495578880.0, + "grad_norm": 1.5777370161888564, + "language_loss": 0.67986816, + "learning_rate": 2.5600500948443075e-06, + "loss": 0.70138133, + "num_input_tokens_seen": 152570725, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7108, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01118179, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02417231, + "balance_loss_mlp": 1.04141963, + "epoch": 0.427416203216594, + "flos": 20295597674880.0, + "grad_norm": 1.697941372596268, + "language_loss": 0.71379381, + "learning_rate": 2.5596762038592294e-06, + "loss": 0.73535079, + "num_input_tokens_seen": 152588950, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7109, + "time_per_iteration": 2.514293909072876 + }, + { + "auxiliary_loss_clip": 0.01119837, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.01668775, + "balance_loss_mlp": 1.04248762, + "epoch": 0.427476326469262, + "flos": 26943955943040.0, + "grad_norm": 1.808555345827523, + "language_loss": 0.64390564, + "learning_rate": 2.559302291651174e-06, + "loss": 0.66543221, + "num_input_tokens_seen": 152608965, + "router_z_loss_clip": 0.16113281, + "router_z_loss_mlp": 0.7734375, + "step": 7110, + "time_per_iteration": 2.507896661758423 + }, + { + "auxiliary_loss_clip": 0.01121216, + "auxiliary_loss_mlp": 0.01033451, + "balance_loss_clip": 1.01876426, + "balance_loss_mlp": 1.04310989, + "epoch": 0.42753644972192995, + "flos": 25702847262720.0, + "grad_norm": 1.6911252843933642, + "language_loss": 0.76596475, + "learning_rate": 2.5589283582343197e-06, + "loss": 0.78751141, + "num_input_tokens_seen": 152630220, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7111, + "time_per_iteration": 2.5065102577209473 + }, + { + "auxiliary_loss_clip": 0.01122655, + "auxiliary_loss_mlp": 0.01034743, + "balance_loss_clip": 1.02051497, + "balance_loss_mlp": 1.04446638, + "epoch": 0.4275965729745979, + "flos": 18767419499520.0, + "grad_norm": 1.6101339491766522, + "language_loss": 0.73021042, + "learning_rate": 2.558554403622845e-06, + "loss": 0.75178432, + "num_input_tokens_seen": 152648835, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7112, + "time_per_iteration": 2.462275266647339 + }, + { + "auxiliary_loss_clip": 0.0111568, + "auxiliary_loss_mlp": 0.01038733, + "balance_loss_clip": 1.02527392, + "balance_loss_mlp": 1.04112434, + "epoch": 0.4276566962272659, + "flos": 23764424878080.0, + "grad_norm": 1.5100904202471843, + "language_loss": 0.71723974, + "learning_rate": 2.5581804278309323e-06, + "loss": 0.7387839, + "num_input_tokens_seen": 152668375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7113, + "time_per_iteration": 2.517184019088745 + }, + { + "auxiliary_loss_clip": 0.01122905, + "auxiliary_loss_mlp": 0.01044494, + "balance_loss_clip": 1.03027248, + "balance_loss_mlp": 1.04463625, + "epoch": 0.42771681947993384, + "flos": 22492505306880.0, + "grad_norm": 4.019227207544938, + "language_loss": 0.62055492, + "learning_rate": 2.5578064308727617e-06, + "loss": 0.64222896, + "num_input_tokens_seen": 152689725, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7114, + "time_per_iteration": 2.4808969497680664 + }, + { + "auxiliary_loss_clip": 0.01127351, + "auxiliary_loss_mlp": 0.01044357, + "balance_loss_clip": 1.02779305, + "balance_loss_mlp": 1.045439, + "epoch": 0.42777694273260186, + "flos": 25044712318080.0, + "grad_norm": 1.7285817614937915, + "language_loss": 0.64558339, + "learning_rate": 2.5574324127625153e-06, + "loss": 0.66730046, + "num_input_tokens_seen": 152709375, + "router_z_loss_clip": 0.16601562, + "router_z_loss_mlp": 0.8203125, + "step": 7115, + "time_per_iteration": 2.4979755878448486 + }, + { + "auxiliary_loss_clip": 0.01118312, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02321672, + "balance_loss_mlp": 1.04225278, + "epoch": 0.4278370659852698, + "flos": 18661519226880.0, + "grad_norm": 1.5459011503250888, + "language_loss": 0.7331425, + "learning_rate": 2.5570583735143753e-06, + "loss": 0.75469118, + "num_input_tokens_seen": 152727510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7116, + "time_per_iteration": 2.4514083862304688 + }, + { + "auxiliary_loss_clip": 0.01115475, + "auxiliary_loss_mlp": 0.01042446, + "balance_loss_clip": 1.02976263, + "balance_loss_mlp": 1.04102111, + "epoch": 0.4278971892379378, + "flos": 27308269635840.0, + "grad_norm": 1.5398002166428786, + "language_loss": 0.69214165, + "learning_rate": 2.5566843131425275e-06, + "loss": 0.7137208, + "num_input_tokens_seen": 152746670, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7117, + "time_per_iteration": 2.522881269454956 + }, + { + "auxiliary_loss_clip": 0.01122059, + "auxiliary_loss_mlp": 0.01040733, + "balance_loss_clip": 1.02657676, + "balance_loss_mlp": 1.04530859, + "epoch": 0.42795731249060576, + "flos": 12888698970240.0, + "grad_norm": 2.268053258549222, + "language_loss": 0.69909632, + "learning_rate": 2.5563102316611536e-06, + "loss": 0.72072423, + "num_input_tokens_seen": 152760545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7118, + "time_per_iteration": 2.3870341777801514 + }, + { + "auxiliary_loss_clip": 0.01119033, + "auxiliary_loss_mlp": 0.01043307, + "balance_loss_clip": 1.02948511, + "balance_loss_mlp": 1.04353809, + "epoch": 0.4280174357432737, + "flos": 33401448316800.0, + "grad_norm": 2.1225989928468803, + "language_loss": 0.74740356, + "learning_rate": 2.55593612908444e-06, + "loss": 0.76902699, + "num_input_tokens_seen": 152780970, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7119, + "time_per_iteration": 2.5487277507781982 + }, + { + "auxiliary_loss_clip": 0.01118083, + "auxiliary_loss_mlp": 0.01033891, + "balance_loss_clip": 1.02040291, + "balance_loss_mlp": 1.04196107, + "epoch": 0.4280775589959417, + "flos": 18259104182400.0, + "grad_norm": 1.8104905013477006, + "language_loss": 0.74987411, + "learning_rate": 2.555562005426573e-06, + "loss": 0.7713939, + "num_input_tokens_seen": 152798475, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7120, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01120406, + "auxiliary_loss_mlp": 0.01036574, + "balance_loss_clip": 1.02321029, + "balance_loss_mlp": 1.04422045, + "epoch": 0.42813768224860965, + "flos": 21471277731840.0, + "grad_norm": 1.6187265972443616, + "language_loss": 0.77002251, + "learning_rate": 2.5551878607017385e-06, + "loss": 0.7915923, + "num_input_tokens_seen": 152817555, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.76171875, + "step": 7121, + "time_per_iteration": 2.4686522483825684 + }, + { + "auxiliary_loss_clip": 0.01117478, + "auxiliary_loss_mlp": 0.01035893, + "balance_loss_clip": 1.02299464, + "balance_loss_mlp": 1.04225755, + "epoch": 0.4281978055012776, + "flos": 15669262696320.0, + "grad_norm": 1.8413618192799084, + "language_loss": 0.85525274, + "learning_rate": 2.554813694924126e-06, + "loss": 0.87678635, + "num_input_tokens_seen": 152836295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7122, + "time_per_iteration": 2.4149863719940186 + }, + { + "auxiliary_loss_clip": 0.01114983, + "auxiliary_loss_mlp": 0.01034591, + "balance_loss_clip": 1.02088189, + "balance_loss_mlp": 1.04111362, + "epoch": 0.4282579287539456, + "flos": 17712005155200.0, + "grad_norm": 1.6495062264118223, + "language_loss": 0.81354666, + "learning_rate": 2.554439508107921e-06, + "loss": 0.83504236, + "num_input_tokens_seen": 152854950, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7123, + "time_per_iteration": 2.4846510887145996 + }, + { + "auxiliary_loss_clip": 0.01116497, + "auxiliary_loss_mlp": 0.01035689, + "balance_loss_clip": 1.02171159, + "balance_loss_mlp": 1.04286349, + "epoch": 0.42831805200661355, + "flos": 19281157770240.0, + "grad_norm": 1.6842679543274752, + "language_loss": 0.81069416, + "learning_rate": 2.5540653002673153e-06, + "loss": 0.83221602, + "num_input_tokens_seen": 152873995, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.734375, + "step": 7124, + "time_per_iteration": 2.477781057357788 + }, + { + "auxiliary_loss_clip": 0.01116184, + "auxiliary_loss_mlp": 0.01039979, + "balance_loss_clip": 1.02485132, + "balance_loss_mlp": 1.04072952, + "epoch": 0.4283781752592815, + "flos": 19792633484160.0, + "grad_norm": 7.024350858631177, + "language_loss": 0.80178392, + "learning_rate": 2.553691071416498e-06, + "loss": 0.82334554, + "num_input_tokens_seen": 152892925, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.75390625, + "step": 7125, + "time_per_iteration": 2.466099262237549 + }, + { + "auxiliary_loss_clip": 0.01117521, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.0230993, + "balance_loss_mlp": 1.04386544, + "epoch": 0.4284382985119495, + "flos": 16508064072960.0, + "grad_norm": 1.7536027507395449, + "language_loss": 0.74772543, + "learning_rate": 2.553316821569659e-06, + "loss": 0.76925719, + "num_input_tokens_seen": 152910935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7126, + "time_per_iteration": 2.4476282596588135 + }, + { + "auxiliary_loss_clip": 0.01118141, + "auxiliary_loss_mlp": 0.01037481, + "balance_loss_clip": 1.02313387, + "balance_loss_mlp": 1.04261374, + "epoch": 0.42849842176461744, + "flos": 23330767979520.0, + "grad_norm": 2.2527301233175496, + "language_loss": 0.81376731, + "learning_rate": 2.5529425507409913e-06, + "loss": 0.83532357, + "num_input_tokens_seen": 152931030, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 7127, + "time_per_iteration": 2.50627064704895 + }, + { + "auxiliary_loss_clip": 0.01117482, + "auxiliary_loss_mlp": 0.01039553, + "balance_loss_clip": 1.02554011, + "balance_loss_mlp": 1.04140556, + "epoch": 0.4285585450172854, + "flos": 17274433674240.0, + "grad_norm": 1.7148593982179101, + "language_loss": 0.76451397, + "learning_rate": 2.5525682589446867e-06, + "loss": 0.78608435, + "num_input_tokens_seen": 152948085, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7128, + "time_per_iteration": 2.4261910915374756 + }, + { + "auxiliary_loss_clip": 0.01119221, + "auxiliary_loss_mlp": 0.01034781, + "balance_loss_clip": 1.02018988, + "balance_loss_mlp": 1.04154372, + "epoch": 0.42861866826995343, + "flos": 24279599692800.0, + "grad_norm": 1.979642374109765, + "language_loss": 0.74111116, + "learning_rate": 2.552193946194937e-06, + "loss": 0.76265121, + "num_input_tokens_seen": 152966265, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7129, + "time_per_iteration": 2.4977691173553467 + }, + { + "auxiliary_loss_clip": 0.01119175, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02102661, + "balance_loss_mlp": 1.04335773, + "epoch": 0.4286787915226214, + "flos": 24353108876160.0, + "grad_norm": 1.7995906720856931, + "language_loss": 0.77753568, + "learning_rate": 2.5518196125059394e-06, + "loss": 0.79907238, + "num_input_tokens_seen": 152986775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7130, + "time_per_iteration": 2.4983179569244385 + }, + { + "auxiliary_loss_clip": 0.01123055, + "auxiliary_loss_mlp": 0.01039363, + "balance_loss_clip": 1.02476025, + "balance_loss_mlp": 1.04523921, + "epoch": 0.42873891477528936, + "flos": 15449992122240.0, + "grad_norm": 1.8571755273934152, + "language_loss": 0.7349695, + "learning_rate": 2.551445257891886e-06, + "loss": 0.75659359, + "num_input_tokens_seen": 153003595, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.77734375, + "step": 7131, + "time_per_iteration": 2.5469563007354736 + }, + { + "auxiliary_loss_clip": 0.01120536, + "auxiliary_loss_mlp": 0.01036711, + "balance_loss_clip": 1.02257299, + "balance_loss_mlp": 1.04343748, + "epoch": 0.4287990380279573, + "flos": 17639573379840.0, + "grad_norm": 2.0596069487020268, + "language_loss": 0.76299751, + "learning_rate": 2.551070882366973e-06, + "loss": 0.78456992, + "num_input_tokens_seen": 153021960, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7132, + "time_per_iteration": 2.432889223098755 + }, + { + "auxiliary_loss_clip": 0.01119567, + "auxiliary_loss_mlp": 0.01042884, + "balance_loss_clip": 1.02821565, + "balance_loss_mlp": 1.04352558, + "epoch": 0.4288591612806253, + "flos": 27162328677120.0, + "grad_norm": 1.5221162096651724, + "language_loss": 0.78525162, + "learning_rate": 2.550696485945397e-06, + "loss": 0.80687612, + "num_input_tokens_seen": 153042110, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 7133, + "time_per_iteration": 2.544379472732544 + }, + { + "auxiliary_loss_clip": 0.01120837, + "auxiliary_loss_mlp": 0.0103873, + "balance_loss_clip": 1.02484238, + "balance_loss_mlp": 1.04305482, + "epoch": 0.42891928453329325, + "flos": 17163182275200.0, + "grad_norm": 1.8479371259746051, + "language_loss": 0.75017452, + "learning_rate": 2.550322068641355e-06, + "loss": 0.77177012, + "num_input_tokens_seen": 153058925, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7134, + "time_per_iteration": 2.416792154312134 + }, + { + "auxiliary_loss_clip": 0.01114501, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01937902, + "balance_loss_mlp": 1.04046178, + "epoch": 0.4289794077859612, + "flos": 18187031543040.0, + "grad_norm": 2.2902258120670975, + "language_loss": 0.84066433, + "learning_rate": 2.5499476304690455e-06, + "loss": 0.86212909, + "num_input_tokens_seen": 153078070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7135, + "time_per_iteration": 2.4513847827911377 + }, + { + "auxiliary_loss_clip": 0.01114319, + "auxiliary_loss_mlp": 0.01036122, + "balance_loss_clip": 1.02250218, + "balance_loss_mlp": 1.04050052, + "epoch": 0.4290395310386292, + "flos": 28256885867520.0, + "grad_norm": 1.9123929145525593, + "language_loss": 0.74716437, + "learning_rate": 2.549573171442666e-06, + "loss": 0.76866877, + "num_input_tokens_seen": 153096680, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7136, + "time_per_iteration": 2.5260956287384033 + }, + { + "auxiliary_loss_clip": 0.01117454, + "auxiliary_loss_mlp": 0.01038013, + "balance_loss_clip": 1.0243752, + "balance_loss_mlp": 1.04027987, + "epoch": 0.42909965429129715, + "flos": 16216074414720.0, + "grad_norm": 1.9374198184766858, + "language_loss": 0.78982937, + "learning_rate": 2.5491986915764175e-06, + "loss": 0.81138408, + "num_input_tokens_seen": 153113305, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7137, + "time_per_iteration": 6.664285898208618 + }, + { + "auxiliary_loss_clip": 0.01123569, + "auxiliary_loss_mlp": 0.01034938, + "balance_loss_clip": 1.02053773, + "balance_loss_mlp": 1.04498768, + "epoch": 0.4291597775439651, + "flos": 23112862122240.0, + "grad_norm": 1.8145904182691066, + "language_loss": 0.76599205, + "learning_rate": 2.548824190884499e-06, + "loss": 0.78757715, + "num_input_tokens_seen": 153132735, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7138, + "time_per_iteration": 2.4640390872955322 + }, + { + "auxiliary_loss_clip": 0.01043511, + "auxiliary_loss_mlp": 0.01001663, + "balance_loss_clip": 1.00025678, + "balance_loss_mlp": 1.02006102, + "epoch": 0.4292199007966331, + "flos": 67546212681600.0, + "grad_norm": 0.7743592729173089, + "language_loss": 0.56193811, + "learning_rate": 2.548449669381113e-06, + "loss": 0.58238983, + "num_input_tokens_seen": 153187925, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.234375, + "step": 7139, + "time_per_iteration": 2.938645362854004 + }, + { + "auxiliary_loss_clip": 0.01114131, + "auxiliary_loss_mlp": 0.01041532, + "balance_loss_clip": 1.02957499, + "balance_loss_mlp": 1.04185057, + "epoch": 0.42928002404930105, + "flos": 22999850956800.0, + "grad_norm": 1.6343660010586272, + "language_loss": 0.81107223, + "learning_rate": 2.5480751270804595e-06, + "loss": 0.83262885, + "num_input_tokens_seen": 153206990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7140, + "time_per_iteration": 2.4621551036834717 + }, + { + "auxiliary_loss_clip": 0.01117324, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01819944, + "balance_loss_mlp": 1.04155135, + "epoch": 0.429340147301969, + "flos": 11544922241280.0, + "grad_norm": 1.7453668118354997, + "language_loss": 0.81973499, + "learning_rate": 2.5477005639967424e-06, + "loss": 0.84123254, + "num_input_tokens_seen": 153222345, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7141, + "time_per_iteration": 2.4552011489868164 + }, + { + "auxiliary_loss_clip": 0.011238, + "auxiliary_loss_mlp": 0.01040057, + "balance_loss_clip": 1.02569795, + "balance_loss_mlp": 1.04469872, + "epoch": 0.42940027055463703, + "flos": 25264988472960.0, + "grad_norm": 1.6365702711839187, + "language_loss": 0.86302745, + "learning_rate": 2.547325980144166e-06, + "loss": 0.88466609, + "num_input_tokens_seen": 153240570, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7890625, + "step": 7142, + "time_per_iteration": 2.466599464416504 + }, + { + "auxiliary_loss_clip": 0.01119038, + "auxiliary_loss_mlp": 0.01033549, + "balance_loss_clip": 1.0205493, + "balance_loss_mlp": 1.04692888, + "epoch": 0.429460393807305, + "flos": 23805004268160.0, + "grad_norm": 1.8779834210446977, + "language_loss": 0.78367496, + "learning_rate": 2.5469513755369323e-06, + "loss": 0.80520082, + "num_input_tokens_seen": 153259575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7143, + "time_per_iteration": 2.528383493423462 + }, + { + "auxiliary_loss_clip": 0.01120121, + "auxiliary_loss_mlp": 0.01040708, + "balance_loss_clip": 1.02731538, + "balance_loss_mlp": 1.04566526, + "epoch": 0.42952051705997296, + "flos": 13918294414080.0, + "grad_norm": 2.185103050312315, + "language_loss": 0.76671416, + "learning_rate": 2.5465767501892484e-06, + "loss": 0.78832245, + "num_input_tokens_seen": 153276650, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7144, + "time_per_iteration": 2.4433047771453857 + }, + { + "auxiliary_loss_clip": 0.01119183, + "auxiliary_loss_mlp": 0.01031791, + "balance_loss_clip": 1.01801622, + "balance_loss_mlp": 1.043118, + "epoch": 0.4295806403126409, + "flos": 26760380509440.0, + "grad_norm": 2.969999234773645, + "language_loss": 0.73481476, + "learning_rate": 2.54620210411532e-06, + "loss": 0.75632453, + "num_input_tokens_seen": 153298025, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7145, + "time_per_iteration": 2.5330073833465576 + }, + { + "auxiliary_loss_clip": 0.01120569, + "auxiliary_loss_mlp": 0.01038539, + "balance_loss_clip": 1.02447844, + "balance_loss_mlp": 1.04405165, + "epoch": 0.4296407635653089, + "flos": 20952619297920.0, + "grad_norm": 1.854643653820381, + "language_loss": 0.78928959, + "learning_rate": 2.545827437329352e-06, + "loss": 0.81088066, + "num_input_tokens_seen": 153315775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7146, + "time_per_iteration": 2.4481821060180664 + }, + { + "auxiliary_loss_clip": 0.01116396, + "auxiliary_loss_mlp": 0.0102847, + "balance_loss_clip": 1.01590514, + "balance_loss_mlp": 1.04295409, + "epoch": 0.42970088681797686, + "flos": 15852335339520.0, + "grad_norm": 1.9767254736067894, + "language_loss": 0.83134973, + "learning_rate": 2.5454527498455532e-06, + "loss": 0.85279846, + "num_input_tokens_seen": 153332765, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7147, + "time_per_iteration": 2.500633478164673 + }, + { + "auxiliary_loss_clip": 0.01124897, + "auxiliary_loss_mlp": 0.01039853, + "balance_loss_clip": 1.02473724, + "balance_loss_mlp": 1.04802537, + "epoch": 0.4297610100706448, + "flos": 22382618624640.0, + "grad_norm": 1.8398177405042841, + "language_loss": 0.86894512, + "learning_rate": 2.545078041678131e-06, + "loss": 0.89059258, + "num_input_tokens_seen": 153350760, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 7148, + "time_per_iteration": 2.481743097305298 + }, + { + "auxiliary_loss_clip": 0.01120854, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02405918, + "balance_loss_mlp": 1.04469061, + "epoch": 0.4298211333233128, + "flos": 27925681536000.0, + "grad_norm": 1.5258683369520107, + "language_loss": 0.77855921, + "learning_rate": 2.5447033128412957e-06, + "loss": 0.80013508, + "num_input_tokens_seen": 153370765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7149, + "time_per_iteration": 2.6060431003570557 + }, + { + "auxiliary_loss_clip": 0.01118454, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02247977, + "balance_loss_mlp": 1.04456902, + "epoch": 0.42988125657598075, + "flos": 24425612478720.0, + "grad_norm": 1.7047076849986806, + "language_loss": 0.79828095, + "learning_rate": 2.544328563349256e-06, + "loss": 0.81983018, + "num_input_tokens_seen": 153390725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7150, + "time_per_iteration": 2.4652955532073975 + }, + { + "auxiliary_loss_clip": 0.01125949, + "auxiliary_loss_mlp": 0.0104301, + "balance_loss_clip": 1.02763176, + "balance_loss_mlp": 1.0467031, + "epoch": 0.4299413798286487, + "flos": 15850180523520.0, + "grad_norm": 1.7972230644563891, + "language_loss": 0.74738395, + "learning_rate": 2.5439537932162222e-06, + "loss": 0.76907349, + "num_input_tokens_seen": 153408010, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.79296875, + "step": 7151, + "time_per_iteration": 2.5019421577453613 + }, + { + "auxiliary_loss_clip": 0.0112419, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02284098, + "balance_loss_mlp": 1.0458225, + "epoch": 0.4300015030813167, + "flos": 22309504490880.0, + "grad_norm": 1.924911798883302, + "language_loss": 0.70084447, + "learning_rate": 2.543579002456406e-06, + "loss": 0.72245419, + "num_input_tokens_seen": 153426865, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.78515625, + "step": 7152, + "time_per_iteration": 2.456465482711792 + }, + { + "auxiliary_loss_clip": 0.01117938, + "auxiliary_loss_mlp": 0.01035992, + "balance_loss_clip": 1.02268243, + "balance_loss_mlp": 1.04186821, + "epoch": 0.43006162633398465, + "flos": 34897666366080.0, + "grad_norm": 1.5367633238023177, + "language_loss": 0.71064591, + "learning_rate": 2.54320419108402e-06, + "loss": 0.73218524, + "num_input_tokens_seen": 153449410, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7153, + "time_per_iteration": 2.6120920181274414 + }, + { + "auxiliary_loss_clip": 0.01120146, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01941729, + "balance_loss_mlp": 1.04342091, + "epoch": 0.4301217495866526, + "flos": 15961575576960.0, + "grad_norm": 1.8794751780958798, + "language_loss": 0.79155993, + "learning_rate": 2.542829359113276e-06, + "loss": 0.81309307, + "num_input_tokens_seen": 153467910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7154, + "time_per_iteration": 2.4222962856292725 + }, + { + "auxiliary_loss_clip": 0.0111738, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01818347, + "balance_loss_mlp": 1.04361236, + "epoch": 0.43018187283932063, + "flos": 18770364414720.0, + "grad_norm": 1.4801057977091479, + "language_loss": 0.78793395, + "learning_rate": 2.542454506558389e-06, + "loss": 0.80941343, + "num_input_tokens_seen": 153487100, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73828125, + "step": 7155, + "time_per_iteration": 2.4554193019866943 + }, + { + "auxiliary_loss_clip": 0.01117238, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01582694, + "balance_loss_mlp": 1.04335082, + "epoch": 0.4302419960919886, + "flos": 20151703791360.0, + "grad_norm": 1.7176839192841982, + "language_loss": 0.88779187, + "learning_rate": 2.5420796334335723e-06, + "loss": 0.90925157, + "num_input_tokens_seen": 153505565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7156, + "time_per_iteration": 2.446831464767456 + }, + { + "auxiliary_loss_clip": 0.01120931, + "auxiliary_loss_mlp": 0.01033948, + "balance_loss_clip": 1.01953602, + "balance_loss_mlp": 1.04361558, + "epoch": 0.43030211934465656, + "flos": 26432731624320.0, + "grad_norm": 1.9517774058288286, + "language_loss": 0.82738447, + "learning_rate": 2.541704739753042e-06, + "loss": 0.84893334, + "num_input_tokens_seen": 153526130, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7734375, + "step": 7157, + "time_per_iteration": 2.5298144817352295 + }, + { + "auxiliary_loss_clip": 0.01124397, + "auxiliary_loss_mlp": 0.01035742, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.04532623, + "epoch": 0.43036224259732453, + "flos": 24389234979840.0, + "grad_norm": 1.8458285726729726, + "language_loss": 0.72177351, + "learning_rate": 2.5413298255310132e-06, + "loss": 0.74337494, + "num_input_tokens_seen": 153546370, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7890625, + "step": 7158, + "time_per_iteration": 2.4691712856292725 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01796317, + "balance_loss_mlp": 1.04215837, + "epoch": 0.4304223658499925, + "flos": 17201714590080.0, + "grad_norm": 2.077812294320108, + "language_loss": 0.82865965, + "learning_rate": 2.5409548907817034e-06, + "loss": 0.85015261, + "num_input_tokens_seen": 153562800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7159, + "time_per_iteration": 2.4462857246398926 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.01887655, + "balance_loss_mlp": 1.04236865, + "epoch": 0.43048248910266046, + "flos": 14903000835840.0, + "grad_norm": 2.094804075931644, + "language_loss": 0.83043528, + "learning_rate": 2.54057993551933e-06, + "loss": 0.85193908, + "num_input_tokens_seen": 153578395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7160, + "time_per_iteration": 2.587928533554077 + }, + { + "auxiliary_loss_clip": 0.01123066, + "auxiliary_loss_mlp": 0.01038163, + "balance_loss_clip": 1.02249885, + "balance_loss_mlp": 1.04402685, + "epoch": 0.4305426123553284, + "flos": 21579835610880.0, + "grad_norm": 3.027641474238522, + "language_loss": 0.77379316, + "learning_rate": 2.5402049597581116e-06, + "loss": 0.79540545, + "num_input_tokens_seen": 153596880, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.79296875, + "step": 7161, + "time_per_iteration": 2.502628803253174 + }, + { + "auxiliary_loss_clip": 0.01119327, + "auxiliary_loss_mlp": 0.01034462, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.04304039, + "epoch": 0.4306027356079964, + "flos": 22601278667520.0, + "grad_norm": 2.05136398687674, + "language_loss": 0.73137891, + "learning_rate": 2.5398299635122662e-06, + "loss": 0.75291681, + "num_input_tokens_seen": 153616570, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7162, + "time_per_iteration": 2.439053773880005 + }, + { + "auxiliary_loss_clip": 0.01042786, + "auxiliary_loss_mlp": 0.01005692, + "balance_loss_clip": 1.00411832, + "balance_loss_mlp": 1.01966858, + "epoch": 0.43066285886066435, + "flos": 70672091806080.0, + "grad_norm": 0.7926335078551056, + "language_loss": 0.59016478, + "learning_rate": 2.5394549467960147e-06, + "loss": 0.61064959, + "num_input_tokens_seen": 153671450, + "router_z_loss_clip": 0.01574707, + "router_z_loss_mlp": 0.23046875, + "step": 7163, + "time_per_iteration": 2.9588072299957275 + }, + { + "auxiliary_loss_clip": 0.01115064, + "auxiliary_loss_mlp": 0.01035604, + "balance_loss_clip": 1.02299142, + "balance_loss_mlp": 1.04035139, + "epoch": 0.4307229821133323, + "flos": 26720591218560.0, + "grad_norm": 1.6277980092745115, + "language_loss": 0.79140532, + "learning_rate": 2.5390799096235783e-06, + "loss": 0.81291205, + "num_input_tokens_seen": 153691405, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7164, + "time_per_iteration": 2.484001398086548 + }, + { + "auxiliary_loss_clip": 0.01119155, + "auxiliary_loss_mlp": 0.01041343, + "balance_loss_clip": 1.0275383, + "balance_loss_mlp": 1.04078794, + "epoch": 0.4307831053660003, + "flos": 26177119464960.0, + "grad_norm": 1.8180486110770353, + "language_loss": 0.67282438, + "learning_rate": 2.538704852009177e-06, + "loss": 0.69442934, + "num_input_tokens_seen": 153711555, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.78125, + "step": 7165, + "time_per_iteration": 2.533599376678467 + }, + { + "auxiliary_loss_clip": 0.01119036, + "auxiliary_loss_mlp": 0.01043422, + "balance_loss_clip": 1.03069651, + "balance_loss_mlp": 1.04327762, + "epoch": 0.43084322861866825, + "flos": 18910343715840.0, + "grad_norm": 1.850302447549428, + "language_loss": 0.75248688, + "learning_rate": 2.538329773967034e-06, + "loss": 0.77411151, + "num_input_tokens_seen": 153730095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7578125, + "step": 7166, + "time_per_iteration": 2.439861536026001 + }, + { + "auxiliary_loss_clip": 0.01117069, + "auxiliary_loss_mlp": 0.01036345, + "balance_loss_clip": 1.0239172, + "balance_loss_mlp": 1.04362941, + "epoch": 0.4309033518713362, + "flos": 26432911192320.0, + "grad_norm": 1.612504951400803, + "language_loss": 0.71537554, + "learning_rate": 2.537954675511372e-06, + "loss": 0.73690969, + "num_input_tokens_seen": 153749320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7167, + "time_per_iteration": 2.499190092086792 + }, + { + "auxiliary_loss_clip": 0.01111616, + "auxiliary_loss_mlp": 0.01034998, + "balance_loss_clip": 1.02232647, + "balance_loss_mlp": 1.03984129, + "epoch": 0.43096347512400424, + "flos": 21213295274880.0, + "grad_norm": 1.6022700342177734, + "language_loss": 0.78459173, + "learning_rate": 2.537579556656414e-06, + "loss": 0.80605787, + "num_input_tokens_seen": 153767825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7168, + "time_per_iteration": 2.4372310638427734 + }, + { + "auxiliary_loss_clip": 0.01118326, + "auxiliary_loss_mlp": 0.01041364, + "balance_loss_clip": 1.02733326, + "balance_loss_mlp": 1.04224193, + "epoch": 0.4310235983766722, + "flos": 16540131939840.0, + "grad_norm": 2.3121674941994383, + "language_loss": 0.82260263, + "learning_rate": 2.537204417416387e-06, + "loss": 0.8441996, + "num_input_tokens_seen": 153785350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7169, + "time_per_iteration": 2.4545183181762695 + }, + { + "auxiliary_loss_clip": 0.01038578, + "auxiliary_loss_mlp": 0.01010207, + "balance_loss_clip": 1.00865698, + "balance_loss_mlp": 1.0153358, + "epoch": 0.43108372162934017, + "flos": 64775704763520.0, + "grad_norm": 0.6800543146405372, + "language_loss": 0.60812157, + "learning_rate": 2.5368292578055132e-06, + "loss": 0.62860942, + "num_input_tokens_seen": 153856400, + "router_z_loss_clip": 0.01550293, + "router_z_loss_mlp": 0.23242188, + "step": 7170, + "time_per_iteration": 3.2204582691192627 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02039874, + "balance_loss_mlp": 1.04148889, + "epoch": 0.43114384488200813, + "flos": 13444094039040.0, + "grad_norm": 2.0659828341911615, + "language_loss": 0.76225841, + "learning_rate": 2.536454077838021e-06, + "loss": 0.78375715, + "num_input_tokens_seen": 153875230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7171, + "time_per_iteration": 2.465665817260742 + }, + { + "auxiliary_loss_clip": 0.01117327, + "auxiliary_loss_mlp": 0.0103468, + "balance_loss_clip": 1.02172232, + "balance_loss_mlp": 1.04197574, + "epoch": 0.4312039681346761, + "flos": 26286682924800.0, + "grad_norm": 1.6834410044967325, + "language_loss": 0.77283418, + "learning_rate": 2.5360788775281357e-06, + "loss": 0.7943542, + "num_input_tokens_seen": 153894740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7172, + "time_per_iteration": 2.4916739463806152 + }, + { + "auxiliary_loss_clip": 0.0111787, + "auxiliary_loss_mlp": 0.01039799, + "balance_loss_clip": 1.02544653, + "balance_loss_mlp": 1.04015696, + "epoch": 0.43126409138734406, + "flos": 20376684627840.0, + "grad_norm": 1.7953579135961333, + "language_loss": 0.76852405, + "learning_rate": 2.535703656890086e-06, + "loss": 0.79010069, + "num_input_tokens_seen": 153913230, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7173, + "time_per_iteration": 2.4764745235443115 + }, + { + "auxiliary_loss_clip": 0.01115542, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01571512, + "balance_loss_mlp": 1.04070854, + "epoch": 0.431324214640012, + "flos": 22123091882880.0, + "grad_norm": 1.4568106417702447, + "language_loss": 0.77103329, + "learning_rate": 2.5353284159381e-06, + "loss": 0.79247946, + "num_input_tokens_seen": 153933250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7174, + "time_per_iteration": 2.4860222339630127 + }, + { + "auxiliary_loss_clip": 0.01119703, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.01815498, + "balance_loss_mlp": 1.04199743, + "epoch": 0.43138433789268, + "flos": 15231008856960.0, + "grad_norm": 1.4198827217143106, + "language_loss": 0.82505399, + "learning_rate": 2.534953154686407e-06, + "loss": 0.84658062, + "num_input_tokens_seen": 153951325, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7175, + "time_per_iteration": 2.462977647781372 + }, + { + "auxiliary_loss_clip": 0.01121086, + "auxiliary_loss_mlp": 0.01036761, + "balance_loss_clip": 1.0223192, + "balance_loss_mlp": 1.04153752, + "epoch": 0.43144446114534796, + "flos": 18150294908160.0, + "grad_norm": 2.338333143716513, + "language_loss": 0.74985862, + "learning_rate": 2.5345778731492366e-06, + "loss": 0.77143705, + "num_input_tokens_seen": 153966975, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.79296875, + "step": 7176, + "time_per_iteration": 2.4185218811035156 + }, + { + "auxiliary_loss_clip": 0.01117308, + "auxiliary_loss_mlp": 0.01034436, + "balance_loss_clip": 1.020643, + "balance_loss_mlp": 1.03969014, + "epoch": 0.4315045843980159, + "flos": 22929861306240.0, + "grad_norm": 1.6024853029290826, + "language_loss": 0.73364419, + "learning_rate": 2.534202571340819e-06, + "loss": 0.75516164, + "num_input_tokens_seen": 153986695, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7177, + "time_per_iteration": 2.487114667892456 + }, + { + "auxiliary_loss_clip": 0.01124437, + "auxiliary_loss_mlp": 0.01042376, + "balance_loss_clip": 1.0264492, + "balance_loss_mlp": 1.04060507, + "epoch": 0.4315647076506839, + "flos": 22126862810880.0, + "grad_norm": 1.878519248272382, + "language_loss": 0.81681836, + "learning_rate": 2.533827249275387e-06, + "loss": 0.83848649, + "num_input_tokens_seen": 154004710, + "router_z_loss_clip": 0.15917969, + "router_z_loss_mlp": 0.8359375, + "step": 7178, + "time_per_iteration": 2.443887948989868 + }, + { + "auxiliary_loss_clip": 0.01113093, + "auxiliary_loss_mlp": 0.01033286, + "balance_loss_clip": 1.01988733, + "balance_loss_mlp": 1.04052329, + "epoch": 0.43162483090335185, + "flos": 26871129118080.0, + "grad_norm": 1.4541906286028654, + "language_loss": 0.83824348, + "learning_rate": 2.5334519069671725e-06, + "loss": 0.8597073, + "num_input_tokens_seen": 154024320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7179, + "time_per_iteration": 5.329441547393799 + }, + { + "auxiliary_loss_clip": 0.01114931, + "auxiliary_loss_mlp": 0.01033766, + "balance_loss_clip": 1.02040303, + "balance_loss_mlp": 1.03945267, + "epoch": 0.4316849541560198, + "flos": 13913122855680.0, + "grad_norm": 2.045990303945265, + "language_loss": 0.75710779, + "learning_rate": 2.5330765444304075e-06, + "loss": 0.77859473, + "num_input_tokens_seen": 154041755, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7180, + "time_per_iteration": 2.5520315170288086 + }, + { + "auxiliary_loss_clip": 0.01116541, + "auxiliary_loss_mlp": 0.0103858, + "balance_loss_clip": 1.0240128, + "balance_loss_mlp": 1.03862667, + "epoch": 0.4317450774086878, + "flos": 16435165420800.0, + "grad_norm": 1.7639080321754919, + "language_loss": 0.81907403, + "learning_rate": 2.5327011616793274e-06, + "loss": 0.84062529, + "num_input_tokens_seen": 154056775, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7181, + "time_per_iteration": 2.4059271812438965 + }, + { + "auxiliary_loss_clip": 0.01118777, + "auxiliary_loss_mlp": 0.0103845, + "balance_loss_clip": 1.02357888, + "balance_loss_mlp": 1.04020417, + "epoch": 0.4318052006613558, + "flos": 20554980762240.0, + "grad_norm": 1.5777864051255721, + "language_loss": 0.88434547, + "learning_rate": 2.532325758728165e-06, + "loss": 0.90591776, + "num_input_tokens_seen": 154075015, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.78515625, + "step": 7182, + "time_per_iteration": 2.463463306427002 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.0103166, + "balance_loss_clip": 1.01873803, + "balance_loss_mlp": 1.03918862, + "epoch": 0.43186532391402377, + "flos": 22820046451200.0, + "grad_norm": 1.70694658333996, + "language_loss": 0.75826657, + "learning_rate": 2.5319503355911566e-06, + "loss": 0.77972138, + "num_input_tokens_seen": 154095170, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7183, + "time_per_iteration": 2.4562740325927734 + }, + { + "auxiliary_loss_clip": 0.01116225, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.01819921, + "balance_loss_mlp": 1.03917336, + "epoch": 0.43192544716669173, + "flos": 25556583081600.0, + "grad_norm": 2.311500131527462, + "language_loss": 0.77666485, + "learning_rate": 2.5315748922825393e-06, + "loss": 0.79814982, + "num_input_tokens_seen": 154116895, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76953125, + "step": 7184, + "time_per_iteration": 2.5283145904541016 + }, + { + "auxiliary_loss_clip": 0.01110208, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02065074, + "balance_loss_mlp": 1.03938413, + "epoch": 0.4319855704193597, + "flos": 30954674701440.0, + "grad_norm": 1.5490664406704935, + "language_loss": 0.73325193, + "learning_rate": 2.5311994288165474e-06, + "loss": 0.75469285, + "num_input_tokens_seen": 154138395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 7185, + "time_per_iteration": 2.520885467529297 + }, + { + "auxiliary_loss_clip": 0.01118704, + "auxiliary_loss_mlp": 0.01037072, + "balance_loss_clip": 1.02283251, + "balance_loss_mlp": 1.03961062, + "epoch": 0.43204569367202766, + "flos": 24238732993920.0, + "grad_norm": 2.5540588454326, + "language_loss": 0.75974178, + "learning_rate": 2.530823945207421e-06, + "loss": 0.78129953, + "num_input_tokens_seen": 154156775, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7890625, + "step": 7186, + "time_per_iteration": 2.5005605220794678 + }, + { + "auxiliary_loss_clip": 0.01116031, + "auxiliary_loss_mlp": 0.01035244, + "balance_loss_clip": 1.02164185, + "balance_loss_mlp": 1.03987479, + "epoch": 0.43210581692469563, + "flos": 18406948561920.0, + "grad_norm": 5.067701176656461, + "language_loss": 0.76043296, + "learning_rate": 2.5304484414693962e-06, + "loss": 0.78194571, + "num_input_tokens_seen": 154177500, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7187, + "time_per_iteration": 2.4769227504730225 + }, + { + "auxiliary_loss_clip": 0.0103801, + "auxiliary_loss_mlp": 0.00999247, + "balance_loss_clip": 0.99792367, + "balance_loss_mlp": 1.0145607, + "epoch": 0.4321659401773636, + "flos": 49832378910720.0, + "grad_norm": 0.8526585096921939, + "language_loss": 0.68180382, + "learning_rate": 2.530072917616714e-06, + "loss": 0.70217645, + "num_input_tokens_seen": 154237110, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.234375, + "step": 7188, + "time_per_iteration": 3.095301389694214 + }, + { + "auxiliary_loss_clip": 0.01112959, + "auxiliary_loss_mlp": 0.01035631, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03992498, + "epoch": 0.43222606343003156, + "flos": 17128564542720.0, + "grad_norm": 1.742468102969242, + "language_loss": 0.7809816, + "learning_rate": 2.529697373663614e-06, + "loss": 0.80246753, + "num_input_tokens_seen": 154253910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7189, + "time_per_iteration": 2.4332470893859863 + }, + { + "auxiliary_loss_clip": 0.01118752, + "auxiliary_loss_mlp": 0.01041299, + "balance_loss_clip": 1.0263027, + "balance_loss_mlp": 1.03817415, + "epoch": 0.4322861866826995, + "flos": 22749949059840.0, + "grad_norm": 1.8713383629003246, + "language_loss": 0.7119785, + "learning_rate": 2.5293218096243364e-06, + "loss": 0.73357898, + "num_input_tokens_seen": 154274770, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.8046875, + "step": 7190, + "time_per_iteration": 2.494537115097046 + }, + { + "auxiliary_loss_clip": 0.01113042, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02275729, + "balance_loss_mlp": 1.0380528, + "epoch": 0.4323463099353675, + "flos": 27891925729920.0, + "grad_norm": 1.5245278530879214, + "language_loss": 0.79833174, + "learning_rate": 2.5289462255131223e-06, + "loss": 0.81982064, + "num_input_tokens_seen": 154295035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7191, + "time_per_iteration": 2.478376865386963 + }, + { + "auxiliary_loss_clip": 0.01113503, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.020944, + "balance_loss_mlp": 1.03872573, + "epoch": 0.43240643318803546, + "flos": 21614740652160.0, + "grad_norm": 1.7647822638177795, + "language_loss": 0.74647141, + "learning_rate": 2.5285706213442146e-06, + "loss": 0.76794595, + "num_input_tokens_seen": 154314905, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7192, + "time_per_iteration": 2.4613609313964844 + }, + { + "auxiliary_loss_clip": 0.011176, + "auxiliary_loss_mlp": 0.01041388, + "balance_loss_clip": 1.02696347, + "balance_loss_mlp": 1.04183233, + "epoch": 0.4324665564407034, + "flos": 17558378686080.0, + "grad_norm": 2.014554632256561, + "language_loss": 0.78898597, + "learning_rate": 2.5281949971318557e-06, + "loss": 0.81057584, + "num_input_tokens_seen": 154331740, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7193, + "time_per_iteration": 2.4220309257507324 + }, + { + "auxiliary_loss_clip": 0.011155, + "auxiliary_loss_mlp": 0.01040121, + "balance_loss_clip": 1.02609015, + "balance_loss_mlp": 1.0394038, + "epoch": 0.4325266796933714, + "flos": 18402423448320.0, + "grad_norm": 1.7200377707292065, + "language_loss": 0.75406849, + "learning_rate": 2.5278193528902897e-06, + "loss": 0.77562475, + "num_input_tokens_seen": 154348740, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7194, + "time_per_iteration": 2.466512441635132 + }, + { + "auxiliary_loss_clip": 0.01117198, + "auxiliary_loss_mlp": 0.01037831, + "balance_loss_clip": 1.02435398, + "balance_loss_mlp": 1.04108119, + "epoch": 0.4325868029460394, + "flos": 22564793427840.0, + "grad_norm": 5.005212308773382, + "language_loss": 0.60044503, + "learning_rate": 2.5274436886337613e-06, + "loss": 0.62199533, + "num_input_tokens_seen": 154368835, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76171875, + "step": 7195, + "time_per_iteration": 2.4522454738616943 + }, + { + "auxiliary_loss_clip": 0.0111962, + "auxiliary_loss_mlp": 0.01041876, + "balance_loss_clip": 1.02713561, + "balance_loss_mlp": 1.04041934, + "epoch": 0.43264692619870737, + "flos": 14605516396800.0, + "grad_norm": 2.2806268233026628, + "language_loss": 0.64930809, + "learning_rate": 2.527068004376515e-06, + "loss": 0.67092311, + "num_input_tokens_seen": 154384620, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7890625, + "step": 7196, + "time_per_iteration": 2.4453718662261963 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01034681, + "balance_loss_clip": 1.02024436, + "balance_loss_mlp": 1.04024911, + "epoch": 0.43270704945137534, + "flos": 21501657659520.0, + "grad_norm": 4.696072713783665, + "language_loss": 0.72759318, + "learning_rate": 2.526692300132797e-06, + "loss": 0.74914396, + "num_input_tokens_seen": 154402865, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80078125, + "step": 7197, + "time_per_iteration": 2.500256061553955 + }, + { + "auxiliary_loss_clip": 0.01116404, + "auxiliary_loss_mlp": 0.01045003, + "balance_loss_clip": 1.03106129, + "balance_loss_mlp": 1.04246271, + "epoch": 0.4327671727040433, + "flos": 25155891889920.0, + "grad_norm": 1.598666024351184, + "language_loss": 0.72644413, + "learning_rate": 2.5263165759168547e-06, + "loss": 0.7480582, + "num_input_tokens_seen": 154423625, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 7198, + "time_per_iteration": 2.567762613296509 + }, + { + "auxiliary_loss_clip": 0.01115203, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03913903, + "epoch": 0.43282729595671127, + "flos": 25447163276160.0, + "grad_norm": 1.3766106050597056, + "language_loss": 0.81292808, + "learning_rate": 2.525940831742934e-06, + "loss": 0.83442813, + "num_input_tokens_seen": 154444775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7199, + "time_per_iteration": 2.4782636165618896 + }, + { + "auxiliary_loss_clip": 0.01118715, + "auxiliary_loss_mlp": 0.01041613, + "balance_loss_clip": 1.02829099, + "balance_loss_mlp": 1.04219055, + "epoch": 0.43288741920937923, + "flos": 24126116878080.0, + "grad_norm": 2.2182298419994346, + "language_loss": 0.68883061, + "learning_rate": 2.525565067625286e-06, + "loss": 0.71043384, + "num_input_tokens_seen": 154460815, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 7200, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01117767, + "auxiliary_loss_mlp": 0.01043187, + "balance_loss_clip": 1.02809453, + "balance_loss_mlp": 1.04055738, + "epoch": 0.4329475424620472, + "flos": 19204955066880.0, + "grad_norm": 2.134839210265846, + "language_loss": 0.87135142, + "learning_rate": 2.525189283578157e-06, + "loss": 0.89296097, + "num_input_tokens_seen": 154479145, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7734375, + "step": 7201, + "time_per_iteration": 2.47463321685791 + }, + { + "auxiliary_loss_clip": 0.01125345, + "auxiliary_loss_mlp": 0.01042574, + "balance_loss_clip": 1.02696979, + "balance_loss_mlp": 1.04488945, + "epoch": 0.43300766571471516, + "flos": 22638374438400.0, + "grad_norm": 2.16649852661544, + "language_loss": 0.64551014, + "learning_rate": 2.5248134796157974e-06, + "loss": 0.66718936, + "num_input_tokens_seen": 154498905, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.8046875, + "step": 7202, + "time_per_iteration": 2.520963668823242 + }, + { + "auxiliary_loss_clip": 0.0111734, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01931047, + "balance_loss_mlp": 1.04092193, + "epoch": 0.4330677889673831, + "flos": 22121080721280.0, + "grad_norm": 1.7838197935762699, + "language_loss": 0.81707418, + "learning_rate": 2.5244376557524586e-06, + "loss": 0.83856463, + "num_input_tokens_seen": 154517270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.765625, + "step": 7203, + "time_per_iteration": 2.474724531173706 + }, + { + "auxiliary_loss_clip": 0.01121178, + "auxiliary_loss_mlp": 0.01047095, + "balance_loss_clip": 1.03284955, + "balance_loss_mlp": 1.04118741, + "epoch": 0.4331279122200511, + "flos": 23221527742080.0, + "grad_norm": 1.864866510083204, + "language_loss": 0.81476939, + "learning_rate": 2.5240618120023912e-06, + "loss": 0.83645213, + "num_input_tokens_seen": 154535945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.80078125, + "step": 7204, + "time_per_iteration": 2.527064323425293 + }, + { + "auxiliary_loss_clip": 0.01117221, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02226007, + "balance_loss_mlp": 1.04050207, + "epoch": 0.43318803547271906, + "flos": 18259750627200.0, + "grad_norm": 1.78968083236078, + "language_loss": 0.73432428, + "learning_rate": 2.5236859483798468e-06, + "loss": 0.75585294, + "num_input_tokens_seen": 154554935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7205, + "time_per_iteration": 2.406350612640381 + }, + { + "auxiliary_loss_clip": 0.01116769, + "auxiliary_loss_mlp": 0.01037164, + "balance_loss_clip": 1.02414668, + "balance_loss_mlp": 1.04308569, + "epoch": 0.433248158725387, + "flos": 27418407713280.0, + "grad_norm": 1.6284714357196102, + "language_loss": 0.75110108, + "learning_rate": 2.5233100648990803e-06, + "loss": 0.77264041, + "num_input_tokens_seen": 154576065, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7206, + "time_per_iteration": 2.527343511581421 + }, + { + "auxiliary_loss_clip": 0.01115193, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02247548, + "balance_loss_mlp": 1.03899562, + "epoch": 0.433308281978055, + "flos": 23218008209280.0, + "grad_norm": 2.1762520186821854, + "language_loss": 0.78700626, + "learning_rate": 2.522934161574342e-06, + "loss": 0.80852419, + "num_input_tokens_seen": 154595110, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7207, + "time_per_iteration": 2.4470536708831787 + }, + { + "auxiliary_loss_clip": 0.01121794, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02026772, + "balance_loss_mlp": 1.04215813, + "epoch": 0.433368405230723, + "flos": 15852407166720.0, + "grad_norm": 1.6893238531796995, + "language_loss": 0.81100202, + "learning_rate": 2.5225582384198888e-06, + "loss": 0.83257544, + "num_input_tokens_seen": 154612255, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.796875, + "step": 7208, + "time_per_iteration": 2.4634876251220703 + }, + { + "auxiliary_loss_clip": 0.0111942, + "auxiliary_loss_mlp": 0.01034218, + "balance_loss_clip": 1.02083671, + "balance_loss_mlp": 1.04337454, + "epoch": 0.433428528483391, + "flos": 19026084314880.0, + "grad_norm": 2.072374936090108, + "language_loss": 0.70074689, + "learning_rate": 2.5221822954499744e-06, + "loss": 0.72228324, + "num_input_tokens_seen": 154630440, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7209, + "time_per_iteration": 2.4699575901031494 + }, + { + "auxiliary_loss_clip": 0.01113916, + "auxiliary_loss_mlp": 0.01034855, + "balance_loss_clip": 1.02102125, + "balance_loss_mlp": 1.0392952, + "epoch": 0.43348865173605894, + "flos": 24718248581760.0, + "grad_norm": 1.533200118487429, + "language_loss": 0.81202382, + "learning_rate": 2.5218063326788557e-06, + "loss": 0.83351159, + "num_input_tokens_seen": 154652515, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7210, + "time_per_iteration": 2.5462334156036377 + }, + { + "auxiliary_loss_clip": 0.01114494, + "auxiliary_loss_mlp": 0.01036333, + "balance_loss_clip": 1.02280319, + "balance_loss_mlp": 1.03895545, + "epoch": 0.4335487749887269, + "flos": 22090664880000.0, + "grad_norm": 1.7483210767520514, + "language_loss": 0.81570554, + "learning_rate": 2.5214303501207885e-06, + "loss": 0.83721387, + "num_input_tokens_seen": 154670965, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7211, + "time_per_iteration": 2.4835634231567383 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033957, + "balance_loss_clip": 1.02150583, + "balance_loss_mlp": 1.03778863, + "epoch": 0.43360889824139487, + "flos": 22382941847040.0, + "grad_norm": 2.083548110229539, + "language_loss": 0.74785221, + "learning_rate": 2.521054347790029e-06, + "loss": 0.76932836, + "num_input_tokens_seen": 154689980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7212, + "time_per_iteration": 2.492600917816162 + }, + { + "auxiliary_loss_clip": 0.01117192, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.01990747, + "balance_loss_mlp": 1.04162407, + "epoch": 0.43366902149406283, + "flos": 17528286067200.0, + "grad_norm": 1.6640529640233686, + "language_loss": 0.76755834, + "learning_rate": 2.5206783257008375e-06, + "loss": 0.78905809, + "num_input_tokens_seen": 154706570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7213, + "time_per_iteration": 2.4060752391815186 + }, + { + "auxiliary_loss_clip": 0.01114624, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02070832, + "balance_loss_mlp": 1.03933454, + "epoch": 0.4337291447467308, + "flos": 19022672522880.0, + "grad_norm": 1.5718517519296942, + "language_loss": 0.64949977, + "learning_rate": 2.520302283867471e-06, + "loss": 0.67098659, + "num_input_tokens_seen": 154725210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7214, + "time_per_iteration": 2.512662410736084 + }, + { + "auxiliary_loss_clip": 0.01110495, + "auxiliary_loss_mlp": 0.01034308, + "balance_loss_clip": 1.02173781, + "balance_loss_mlp": 1.03869057, + "epoch": 0.43378926799939876, + "flos": 27234042180480.0, + "grad_norm": 1.5916808794412316, + "language_loss": 0.71483207, + "learning_rate": 2.519926222304191e-06, + "loss": 0.73628008, + "num_input_tokens_seen": 154745945, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7215, + "time_per_iteration": 2.5099971294403076 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.0103555, + "balance_loss_clip": 1.02224684, + "balance_loss_mlp": 1.04080701, + "epoch": 0.43384939125206673, + "flos": 15961108700160.0, + "grad_norm": 2.1029551712935692, + "language_loss": 0.7531544, + "learning_rate": 2.519550141025255e-06, + "loss": 0.77464819, + "num_input_tokens_seen": 154763580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7216, + "time_per_iteration": 2.496631383895874 + }, + { + "auxiliary_loss_clip": 0.01124083, + "auxiliary_loss_mlp": 0.01044464, + "balance_loss_clip": 1.02873421, + "balance_loss_mlp": 1.04232287, + "epoch": 0.4339095145047347, + "flos": 21793216354560.0, + "grad_norm": 2.4885665438006086, + "language_loss": 0.75943911, + "learning_rate": 2.519174040044927e-06, + "loss": 0.78112465, + "num_input_tokens_seen": 154776825, + "router_z_loss_clip": 0.15722656, + "router_z_loss_mlp": 0.81640625, + "step": 7217, + "time_per_iteration": 2.4563424587249756 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.01034071, + "balance_loss_clip": 1.02048075, + "balance_loss_mlp": 1.04149795, + "epoch": 0.43396963775740266, + "flos": 14209853109120.0, + "grad_norm": 2.0012841708103677, + "language_loss": 0.73723286, + "learning_rate": 2.5187979193774664e-06, + "loss": 0.7587418, + "num_input_tokens_seen": 154794025, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7218, + "time_per_iteration": 2.5055034160614014 + }, + { + "auxiliary_loss_clip": 0.01119586, + "auxiliary_loss_mlp": 0.01030517, + "balance_loss_clip": 1.01706386, + "balance_loss_mlp": 1.0420804, + "epoch": 0.4340297610100706, + "flos": 19719052473600.0, + "grad_norm": 1.7121326309499156, + "language_loss": 0.68759704, + "learning_rate": 2.5184217790371367e-06, + "loss": 0.7090981, + "num_input_tokens_seen": 154813105, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7219, + "time_per_iteration": 2.4480419158935547 + }, + { + "auxiliary_loss_clip": 0.01117032, + "auxiliary_loss_mlp": 0.01034297, + "balance_loss_clip": 1.02088046, + "balance_loss_mlp": 1.0424881, + "epoch": 0.4340898842627386, + "flos": 18953508885120.0, + "grad_norm": 1.5876624694807844, + "language_loss": 0.77227521, + "learning_rate": 2.518045619038202e-06, + "loss": 0.79378843, + "num_input_tokens_seen": 154833525, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7220, + "time_per_iteration": 6.918288230895996 + }, + { + "auxiliary_loss_clip": 0.01116062, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02162933, + "balance_loss_mlp": 1.04022503, + "epoch": 0.4341500075154066, + "flos": 22018304931840.0, + "grad_norm": 1.9118836764348202, + "language_loss": 0.69684327, + "learning_rate": 2.5176694393949243e-06, + "loss": 0.71835566, + "num_input_tokens_seen": 154853090, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7221, + "time_per_iteration": 2.470270872116089 + }, + { + "auxiliary_loss_clip": 0.0111827, + "auxiliary_loss_mlp": 0.01037458, + "balance_loss_clip": 1.02436888, + "balance_loss_mlp": 1.04102325, + "epoch": 0.4342101307680746, + "flos": 23582465556480.0, + "grad_norm": 2.3043912227088206, + "language_loss": 0.64915985, + "learning_rate": 2.51729324012157e-06, + "loss": 0.67071712, + "num_input_tokens_seen": 154872055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7222, + "time_per_iteration": 2.553450584411621 + }, + { + "auxiliary_loss_clip": 0.01115314, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.01851892, + "balance_loss_mlp": 1.0400629, + "epoch": 0.43427025402074254, + "flos": 17967976450560.0, + "grad_norm": 1.98015103861908, + "language_loss": 0.73039752, + "learning_rate": 2.5169170212324053e-06, + "loss": 0.75188196, + "num_input_tokens_seen": 154886645, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 7223, + "time_per_iteration": 2.4311954975128174 + }, + { + "auxiliary_loss_clip": 0.01117336, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.01639247, + "balance_loss_mlp": 1.03914881, + "epoch": 0.4343303772734105, + "flos": 26286395616000.0, + "grad_norm": 1.7516175042559776, + "language_loss": 0.93677819, + "learning_rate": 2.516540782741694e-06, + "loss": 0.95825702, + "num_input_tokens_seen": 154906775, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.78125, + "step": 7224, + "time_per_iteration": 2.5507140159606934 + }, + { + "auxiliary_loss_clip": 0.0111604, + "auxiliary_loss_mlp": 0.01035929, + "balance_loss_clip": 1.02230883, + "balance_loss_mlp": 1.04143298, + "epoch": 0.43439050052607847, + "flos": 26833961520000.0, + "grad_norm": 1.4456333860398556, + "language_loss": 0.61234355, + "learning_rate": 2.5161645246637056e-06, + "loss": 0.63386333, + "num_input_tokens_seen": 154926990, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7225, + "time_per_iteration": 2.4982893466949463 + }, + { + "auxiliary_loss_clip": 0.01118584, + "auxiliary_loss_mlp": 0.01040064, + "balance_loss_clip": 1.02594388, + "balance_loss_mlp": 1.04326594, + "epoch": 0.43445062377874644, + "flos": 21397660807680.0, + "grad_norm": 1.8262630970377216, + "language_loss": 0.77771807, + "learning_rate": 2.5157882470127054e-06, + "loss": 0.79930449, + "num_input_tokens_seen": 154946210, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75390625, + "step": 7226, + "time_per_iteration": 2.5427355766296387 + }, + { + "auxiliary_loss_clip": 0.0111488, + "auxiliary_loss_mlp": 0.01032848, + "balance_loss_clip": 1.01968753, + "balance_loss_mlp": 1.04169869, + "epoch": 0.4345107470314144, + "flos": 19901945548800.0, + "grad_norm": 1.6421213218207402, + "language_loss": 0.84485722, + "learning_rate": 2.515411949802964e-06, + "loss": 0.8663345, + "num_input_tokens_seen": 154964995, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7227, + "time_per_iteration": 2.450390577316284 + }, + { + "auxiliary_loss_clip": 0.01115781, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02163696, + "balance_loss_mlp": 1.04135513, + "epoch": 0.43457087028408237, + "flos": 26432623883520.0, + "grad_norm": 2.0443971193166735, + "language_loss": 0.76866895, + "learning_rate": 2.5150356330487498e-06, + "loss": 0.79018396, + "num_input_tokens_seen": 154984775, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7228, + "time_per_iteration": 2.5690906047821045 + }, + { + "auxiliary_loss_clip": 0.01118098, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02229989, + "balance_loss_mlp": 1.04278994, + "epoch": 0.43463099353675033, + "flos": 31868816855040.0, + "grad_norm": 1.4832672479414948, + "language_loss": 0.80732882, + "learning_rate": 2.5146592967643324e-06, + "loss": 0.82887214, + "num_input_tokens_seen": 155008125, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75390625, + "step": 7229, + "time_per_iteration": 2.552069902420044 + }, + { + "auxiliary_loss_clip": 0.01118257, + "auxiliary_loss_mlp": 0.01040852, + "balance_loss_clip": 1.02682161, + "balance_loss_mlp": 1.04213512, + "epoch": 0.4346911167894183, + "flos": 24571266128640.0, + "grad_norm": 2.091517296377785, + "language_loss": 0.81964421, + "learning_rate": 2.5142829409639834e-06, + "loss": 0.84123534, + "num_input_tokens_seen": 155027885, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7230, + "time_per_iteration": 2.5944671630859375 + }, + { + "auxiliary_loss_clip": 0.01123399, + "auxiliary_loss_mlp": 0.01044498, + "balance_loss_clip": 1.03034186, + "balance_loss_mlp": 1.0445168, + "epoch": 0.43475124004208626, + "flos": 17090678672640.0, + "grad_norm": 2.146338977702966, + "language_loss": 0.77091062, + "learning_rate": 2.513906565661973e-06, + "loss": 0.79258955, + "num_input_tokens_seen": 155043375, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7890625, + "step": 7231, + "time_per_iteration": 2.460886001586914 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01034718, + "balance_loss_clip": 1.02217722, + "balance_loss_mlp": 1.0421958, + "epoch": 0.4348113632947542, + "flos": 26104615862400.0, + "grad_norm": 1.391615561962781, + "language_loss": 0.6858201, + "learning_rate": 2.513530170872575e-06, + "loss": 0.70733297, + "num_input_tokens_seen": 155062930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7232, + "time_per_iteration": 2.614415407180786 + }, + { + "auxiliary_loss_clip": 0.01119763, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02036333, + "balance_loss_mlp": 1.04160166, + "epoch": 0.4348714865474222, + "flos": 34200496316160.0, + "grad_norm": 1.6911603415584286, + "language_loss": 0.7200706, + "learning_rate": 2.5131537566100605e-06, + "loss": 0.74161285, + "num_input_tokens_seen": 155084980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78125, + "step": 7233, + "time_per_iteration": 2.5665411949157715 + }, + { + "auxiliary_loss_clip": 0.01120637, + "auxiliary_loss_mlp": 0.01040107, + "balance_loss_clip": 1.02490747, + "balance_loss_mlp": 1.04198027, + "epoch": 0.43493160980009016, + "flos": 31537468869120.0, + "grad_norm": 1.536262058034198, + "language_loss": 0.746382, + "learning_rate": 2.5127773228887053e-06, + "loss": 0.7679894, + "num_input_tokens_seen": 155107260, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78515625, + "step": 7234, + "time_per_iteration": 2.577014207839966 + }, + { + "auxiliary_loss_clip": 0.01123093, + "auxiliary_loss_mlp": 0.01039703, + "balance_loss_clip": 1.02523136, + "balance_loss_mlp": 1.04223037, + "epoch": 0.4349917330527582, + "flos": 24061334699520.0, + "grad_norm": 1.829117772001415, + "language_loss": 0.58860987, + "learning_rate": 2.512400869722782e-06, + "loss": 0.61023784, + "num_input_tokens_seen": 155126720, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.80859375, + "step": 7235, + "time_per_iteration": 2.4759557247161865 + }, + { + "auxiliary_loss_clip": 0.01116416, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01931453, + "balance_loss_mlp": 1.04053211, + "epoch": 0.43505185630542614, + "flos": 30519329863680.0, + "grad_norm": 1.4942606531447196, + "language_loss": 0.7751596, + "learning_rate": 2.512024397126566e-06, + "loss": 0.79664838, + "num_input_tokens_seen": 155148640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7236, + "time_per_iteration": 2.6113193035125732 + }, + { + "auxiliary_loss_clip": 0.01113405, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.01958489, + "balance_loss_mlp": 1.04001045, + "epoch": 0.4351119795580941, + "flos": 15735158196480.0, + "grad_norm": 1.713978383195529, + "language_loss": 0.8155449, + "learning_rate": 2.5116479051143345e-06, + "loss": 0.83701491, + "num_input_tokens_seen": 155165870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7237, + "time_per_iteration": 2.4341909885406494 + }, + { + "auxiliary_loss_clip": 0.01116801, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02109957, + "balance_loss_mlp": 1.04103971, + "epoch": 0.4351721028107621, + "flos": 18731760272640.0, + "grad_norm": 3.0219595130639156, + "language_loss": 0.62897265, + "learning_rate": 2.5112713937003623e-06, + "loss": 0.65049648, + "num_input_tokens_seen": 155185315, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 7238, + "time_per_iteration": 2.5014469623565674 + }, + { + "auxiliary_loss_clip": 0.01111642, + "auxiliary_loss_mlp": 0.01041748, + "balance_loss_clip": 1.02848005, + "balance_loss_mlp": 1.03874493, + "epoch": 0.43523222606343004, + "flos": 25226887121280.0, + "grad_norm": 1.5839613956475427, + "language_loss": 0.85889554, + "learning_rate": 2.510894862898928e-06, + "loss": 0.88042951, + "num_input_tokens_seen": 155205790, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7239, + "time_per_iteration": 2.4976143836975098 + }, + { + "auxiliary_loss_clip": 0.01118679, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.01896167, + "balance_loss_mlp": 1.0434041, + "epoch": 0.435292349316098, + "flos": 22709190101760.0, + "grad_norm": 1.4715329043565741, + "language_loss": 0.7269268, + "learning_rate": 2.510518312724309e-06, + "loss": 0.74843925, + "num_input_tokens_seen": 155226475, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7240, + "time_per_iteration": 2.5350124835968018 + }, + { + "auxiliary_loss_clip": 0.01119982, + "auxiliary_loss_mlp": 0.01033555, + "balance_loss_clip": 1.01897597, + "balance_loss_mlp": 1.04185855, + "epoch": 0.43535247256876597, + "flos": 25775889569280.0, + "grad_norm": 1.6878068305061695, + "language_loss": 0.81562793, + "learning_rate": 2.5101417431907842e-06, + "loss": 0.83716333, + "num_input_tokens_seen": 155247110, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7241, + "time_per_iteration": 2.4924368858337402 + }, + { + "auxiliary_loss_clip": 0.01125084, + "auxiliary_loss_mlp": 0.01041861, + "balance_loss_clip": 1.02636945, + "balance_loss_mlp": 1.04387474, + "epoch": 0.43541259582143393, + "flos": 17528142412800.0, + "grad_norm": 3.067853888150903, + "language_loss": 0.79639387, + "learning_rate": 2.5097651543126345e-06, + "loss": 0.81806338, + "num_input_tokens_seen": 155261335, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.8125, + "step": 7242, + "time_per_iteration": 2.4884228706359863 + }, + { + "auxiliary_loss_clip": 0.01118288, + "auxiliary_loss_mlp": 0.01035892, + "balance_loss_clip": 1.02146733, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4354727190741019, + "flos": 15195205975680.0, + "grad_norm": 2.2924190339180135, + "language_loss": 0.6872946, + "learning_rate": 2.509388546104138e-06, + "loss": 0.70883644, + "num_input_tokens_seen": 155278510, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78515625, + "step": 7243, + "time_per_iteration": 2.428065538406372 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103125, + "balance_loss_clip": 1.01814318, + "balance_loss_mlp": 1.04141152, + "epoch": 0.43553284232676986, + "flos": 16649264436480.0, + "grad_norm": 1.6975937608840317, + "language_loss": 0.8125546, + "learning_rate": 2.5090119185795766e-06, + "loss": 0.83400726, + "num_input_tokens_seen": 155296450, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7244, + "time_per_iteration": 2.4931905269622803 + }, + { + "auxiliary_loss_clip": 0.01118248, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01785159, + "balance_loss_mlp": 1.0428431, + "epoch": 0.43559296557943783, + "flos": 23400865370880.0, + "grad_norm": 1.7229772693729426, + "language_loss": 0.74017537, + "learning_rate": 2.508635271753234e-06, + "loss": 0.7616663, + "num_input_tokens_seen": 155316080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7245, + "time_per_iteration": 2.4678800106048584 + }, + { + "auxiliary_loss_clip": 0.01116663, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.0248003, + "balance_loss_mlp": 1.041008, + "epoch": 0.4356530888321058, + "flos": 22419067950720.0, + "grad_norm": 1.577710817669204, + "language_loss": 0.7671771, + "learning_rate": 2.508258605639389e-06, + "loss": 0.78872424, + "num_input_tokens_seen": 155336765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75390625, + "step": 7246, + "time_per_iteration": 2.5109541416168213 + }, + { + "auxiliary_loss_clip": 0.01118541, + "auxiliary_loss_mlp": 0.01037789, + "balance_loss_clip": 1.02348995, + "balance_loss_mlp": 1.04209638, + "epoch": 0.43571321208477376, + "flos": 21616141282560.0, + "grad_norm": 1.7904357433283469, + "language_loss": 0.85364228, + "learning_rate": 2.5078819202523275e-06, + "loss": 0.87520564, + "num_input_tokens_seen": 155356440, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7247, + "time_per_iteration": 2.4546074867248535 + }, + { + "auxiliary_loss_clip": 0.01117365, + "auxiliary_loss_mlp": 0.01039048, + "balance_loss_clip": 1.02600694, + "balance_loss_mlp": 1.0420599, + "epoch": 0.4357733353374418, + "flos": 23987358639360.0, + "grad_norm": 1.5214849587217785, + "language_loss": 0.72576565, + "learning_rate": 2.507505215606333e-06, + "loss": 0.74732977, + "num_input_tokens_seen": 155377070, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7248, + "time_per_iteration": 2.5288567543029785 + }, + { + "auxiliary_loss_clip": 0.01117005, + "auxiliary_loss_mlp": 0.01036462, + "balance_loss_clip": 1.02280688, + "balance_loss_mlp": 1.04225719, + "epoch": 0.43583345859010975, + "flos": 25264737077760.0, + "grad_norm": 1.6049303411594007, + "language_loss": 0.87276042, + "learning_rate": 2.5071284917156893e-06, + "loss": 0.8942951, + "num_input_tokens_seen": 155398415, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7249, + "time_per_iteration": 2.497281312942505 + }, + { + "auxiliary_loss_clip": 0.0111866, + "auxiliary_loss_mlp": 0.01043972, + "balance_loss_clip": 1.03053117, + "balance_loss_mlp": 1.04112244, + "epoch": 0.4358935818427777, + "flos": 23696302734720.0, + "grad_norm": 1.835450546624213, + "language_loss": 0.81989753, + "learning_rate": 2.506751748594683e-06, + "loss": 0.84152383, + "num_input_tokens_seen": 155415625, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7250, + "time_per_iteration": 2.5563321113586426 + }, + { + "auxiliary_loss_clip": 0.01124846, + "auxiliary_loss_mlp": 0.01038743, + "balance_loss_clip": 1.02484369, + "balance_loss_mlp": 1.04729581, + "epoch": 0.4359537050954457, + "flos": 29532827761920.0, + "grad_norm": 1.737362510880261, + "language_loss": 0.84760177, + "learning_rate": 2.5063749862575988e-06, + "loss": 0.86923766, + "num_input_tokens_seen": 155435505, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.77734375, + "step": 7251, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01113729, + "auxiliary_loss_mlp": 0.01038592, + "balance_loss_clip": 1.02469254, + "balance_loss_mlp": 1.03979266, + "epoch": 0.43601382834811364, + "flos": 22711273090560.0, + "grad_norm": 1.5112002334274994, + "language_loss": 0.69018251, + "learning_rate": 2.5059982047187245e-06, + "loss": 0.71170568, + "num_input_tokens_seen": 155455425, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 7252, + "time_per_iteration": 2.5041210651397705 + }, + { + "auxiliary_loss_clip": 0.01115762, + "auxiliary_loss_mlp": 0.01036375, + "balance_loss_clip": 1.02233779, + "balance_loss_mlp": 1.04257536, + "epoch": 0.4360739516007816, + "flos": 19098731571840.0, + "grad_norm": 1.7846888638519947, + "language_loss": 0.83733922, + "learning_rate": 2.505621403992348e-06, + "loss": 0.85886061, + "num_input_tokens_seen": 155474250, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 7253, + "time_per_iteration": 2.434375047683716 + }, + { + "auxiliary_loss_clip": 0.01116361, + "auxiliary_loss_mlp": 0.01038184, + "balance_loss_clip": 1.02386165, + "balance_loss_mlp": 1.04254532, + "epoch": 0.43613407485344957, + "flos": 23404420817280.0, + "grad_norm": 1.4489781171091827, + "language_loss": 0.70361209, + "learning_rate": 2.505244584092757e-06, + "loss": 0.72515762, + "num_input_tokens_seen": 155494685, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.73828125, + "step": 7254, + "time_per_iteration": 2.5304319858551025 + }, + { + "auxiliary_loss_clip": 0.01116723, + "auxiliary_loss_mlp": 0.01038221, + "balance_loss_clip": 1.02503693, + "balance_loss_mlp": 1.04295266, + "epoch": 0.43619419810611754, + "flos": 22637799820800.0, + "grad_norm": 2.261189856456705, + "language_loss": 0.80833256, + "learning_rate": 2.5048677450342406e-06, + "loss": 0.82988203, + "num_input_tokens_seen": 155513040, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7255, + "time_per_iteration": 2.4619336128234863 + }, + { + "auxiliary_loss_clip": 0.01115842, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.02375722, + "balance_loss_mlp": 1.0402987, + "epoch": 0.4362543213587855, + "flos": 20047958334720.0, + "grad_norm": 1.6623402785544918, + "language_loss": 0.77301329, + "learning_rate": 2.504490886831089e-06, + "loss": 0.79454327, + "num_input_tokens_seen": 155530100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7256, + "time_per_iteration": 2.502201557159424 + }, + { + "auxiliary_loss_clip": 0.01117553, + "auxiliary_loss_mlp": 0.01039022, + "balance_loss_clip": 1.02568853, + "balance_loss_mlp": 1.04400241, + "epoch": 0.43631444461145347, + "flos": 21361319222400.0, + "grad_norm": 1.8521029690454978, + "language_loss": 0.76273203, + "learning_rate": 2.5041140094975922e-06, + "loss": 0.78429782, + "num_input_tokens_seen": 155549375, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7257, + "time_per_iteration": 2.4721548557281494 + }, + { + "auxiliary_loss_clip": 0.01115455, + "auxiliary_loss_mlp": 0.01039484, + "balance_loss_clip": 1.02553642, + "balance_loss_mlp": 1.04027009, + "epoch": 0.43637456786412143, + "flos": 22418529246720.0, + "grad_norm": 1.675034420512285, + "language_loss": 0.73065001, + "learning_rate": 2.5037371130480417e-06, + "loss": 0.75219941, + "num_input_tokens_seen": 155569395, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7258, + "time_per_iteration": 2.5251166820526123 + }, + { + "auxiliary_loss_clip": 0.0111727, + "auxiliary_loss_mlp": 0.01034289, + "balance_loss_clip": 1.02083004, + "balance_loss_mlp": 1.04163384, + "epoch": 0.4364346911167894, + "flos": 28548839612160.0, + "grad_norm": 2.491243867162561, + "language_loss": 0.76496607, + "learning_rate": 2.5033601974967297e-06, + "loss": 0.78648162, + "num_input_tokens_seen": 155589090, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7578125, + "step": 7259, + "time_per_iteration": 2.4948387145996094 + }, + { + "auxiliary_loss_clip": 0.01038123, + "auxiliary_loss_mlp": 0.01006149, + "balance_loss_clip": 1.00483215, + "balance_loss_mlp": 1.01505399, + "epoch": 0.43649481436945736, + "flos": 62659345380480.0, + "grad_norm": 0.7446610885032177, + "language_loss": 0.570382, + "learning_rate": 2.5029832628579483e-06, + "loss": 0.59082472, + "num_input_tokens_seen": 155648660, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.23144531, + "step": 7260, + "time_per_iteration": 3.023712396621704 + }, + { + "auxiliary_loss_clip": 0.01119405, + "auxiliary_loss_mlp": 0.01044844, + "balance_loss_clip": 1.03061068, + "balance_loss_mlp": 1.0423255, + "epoch": 0.4365549376221254, + "flos": 30592120775040.0, + "grad_norm": 2.013500079504657, + "language_loss": 0.71356845, + "learning_rate": 2.5026063091459907e-06, + "loss": 0.7352109, + "num_input_tokens_seen": 155669945, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 7261, + "time_per_iteration": 2.559830665588379 + }, + { + "auxiliary_loss_clip": 0.01117377, + "auxiliary_loss_mlp": 0.01045312, + "balance_loss_clip": 1.0311265, + "balance_loss_mlp": 1.04076374, + "epoch": 0.43661506087479335, + "flos": 17165875795200.0, + "grad_norm": 1.767533570577482, + "language_loss": 0.69423878, + "learning_rate": 2.5022293363751522e-06, + "loss": 0.71586561, + "num_input_tokens_seen": 155688555, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7262, + "time_per_iteration": 5.4921791553497314 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01699996, + "balance_loss_mlp": 1.04062569, + "epoch": 0.4366751841274613, + "flos": 22047499710720.0, + "grad_norm": 1.7128833789230435, + "language_loss": 0.80033064, + "learning_rate": 2.501852344559726e-06, + "loss": 0.82173395, + "num_input_tokens_seen": 155705370, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7263, + "time_per_iteration": 2.5026779174804688 + }, + { + "auxiliary_loss_clip": 0.0111778, + "auxiliary_loss_mlp": 0.01046117, + "balance_loss_clip": 1.03210425, + "balance_loss_mlp": 1.043383, + "epoch": 0.4367353073801293, + "flos": 15997306631040.0, + "grad_norm": 1.8087965620474522, + "language_loss": 0.75092399, + "learning_rate": 2.50147533371401e-06, + "loss": 0.77256304, + "num_input_tokens_seen": 155721890, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7264, + "time_per_iteration": 2.487065553665161 + }, + { + "auxiliary_loss_clip": 0.01114844, + "auxiliary_loss_mlp": 0.01035156, + "balance_loss_clip": 1.02143478, + "balance_loss_mlp": 1.04089546, + "epoch": 0.43679543063279724, + "flos": 38217535868160.0, + "grad_norm": 1.8571442110240568, + "language_loss": 0.61855227, + "learning_rate": 2.501098303852298e-06, + "loss": 0.6400522, + "num_input_tokens_seen": 155743970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 7265, + "time_per_iteration": 2.5982677936553955 + }, + { + "auxiliary_loss_clip": 0.01112809, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01859391, + "balance_loss_mlp": 1.04026711, + "epoch": 0.4368555538854652, + "flos": 15193230727680.0, + "grad_norm": 2.1628188735926845, + "language_loss": 0.72982574, + "learning_rate": 2.5007212549888884e-06, + "loss": 0.75126845, + "num_input_tokens_seen": 155761830, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7266, + "time_per_iteration": 2.4690847396850586 + }, + { + "auxiliary_loss_clip": 0.0111929, + "auxiliary_loss_mlp": 0.01037863, + "balance_loss_clip": 1.0240345, + "balance_loss_mlp": 1.04332638, + "epoch": 0.4369156771381332, + "flos": 23069086421760.0, + "grad_norm": 2.2896909207829954, + "language_loss": 0.81570059, + "learning_rate": 2.5003441871380794e-06, + "loss": 0.83727205, + "num_input_tokens_seen": 155779610, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76171875, + "step": 7267, + "time_per_iteration": 2.463283061981201 + }, + { + "auxiliary_loss_clip": 0.01113248, + "auxiliary_loss_mlp": 0.01030451, + "balance_loss_clip": 1.01803577, + "balance_loss_mlp": 1.04085267, + "epoch": 0.43697580039080114, + "flos": 23441085624960.0, + "grad_norm": 1.9116109849221483, + "language_loss": 0.74723095, + "learning_rate": 2.4999671003141674e-06, + "loss": 0.76866794, + "num_input_tokens_seen": 155798765, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 7268, + "time_per_iteration": 2.516263723373413 + }, + { + "auxiliary_loss_clip": 0.01121105, + "auxiliary_loss_mlp": 0.01042039, + "balance_loss_clip": 1.02736425, + "balance_loss_mlp": 1.04315591, + "epoch": 0.4370359236434691, + "flos": 18514680428160.0, + "grad_norm": 1.9119374296408282, + "language_loss": 0.7954827, + "learning_rate": 2.499589994531454e-06, + "loss": 0.81711417, + "num_input_tokens_seen": 155817750, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78125, + "step": 7269, + "time_per_iteration": 2.4647111892700195 + }, + { + "auxiliary_loss_clip": 0.01117424, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.02404499, + "balance_loss_mlp": 1.04315174, + "epoch": 0.43709604689613707, + "flos": 23222497409280.0, + "grad_norm": 2.072373926876921, + "language_loss": 0.75031221, + "learning_rate": 2.499212869804237e-06, + "loss": 0.77185863, + "num_input_tokens_seen": 155836490, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7270, + "time_per_iteration": 2.4963974952697754 + }, + { + "auxiliary_loss_clip": 0.01116927, + "auxiliary_loss_mlp": 0.01029854, + "balance_loss_clip": 1.01639485, + "balance_loss_mlp": 1.04269087, + "epoch": 0.43715617014880503, + "flos": 23803711378560.0, + "grad_norm": 1.906091328168401, + "language_loss": 0.79437554, + "learning_rate": 2.4988357261468182e-06, + "loss": 0.81584334, + "num_input_tokens_seen": 155856225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 7271, + "time_per_iteration": 2.487238645553589 + }, + { + "auxiliary_loss_clip": 0.01039832, + "auxiliary_loss_mlp": 0.01001038, + "balance_loss_clip": 0.99965489, + "balance_loss_mlp": 1.01678514, + "epoch": 0.437216293401473, + "flos": 61941204766080.0, + "grad_norm": 0.6948313241096988, + "language_loss": 0.54902828, + "learning_rate": 2.4984585635734993e-06, + "loss": 0.56943697, + "num_input_tokens_seen": 155916770, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.23046875, + "step": 7272, + "time_per_iteration": 3.1392502784729004 + }, + { + "auxiliary_loss_clip": 0.011197, + "auxiliary_loss_mlp": 0.01042804, + "balance_loss_clip": 1.0286535, + "balance_loss_mlp": 1.04332781, + "epoch": 0.43727641665414096, + "flos": 21982250655360.0, + "grad_norm": 2.967819772960297, + "language_loss": 0.70136559, + "learning_rate": 2.498081382098581e-06, + "loss": 0.72299063, + "num_input_tokens_seen": 155936490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7273, + "time_per_iteration": 2.468592643737793 + }, + { + "auxiliary_loss_clip": 0.01119234, + "auxiliary_loss_mlp": 0.01039173, + "balance_loss_clip": 1.02515411, + "balance_loss_mlp": 1.04280722, + "epoch": 0.437336539906809, + "flos": 39530860842240.0, + "grad_norm": 1.832145479464728, + "language_loss": 0.75091398, + "learning_rate": 2.497704181736367e-06, + "loss": 0.77249801, + "num_input_tokens_seen": 155957595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7274, + "time_per_iteration": 2.669516086578369 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01741123, + "balance_loss_mlp": 1.04002881, + "epoch": 0.43739666315947695, + "flos": 17457147181440.0, + "grad_norm": 1.8126381729021082, + "language_loss": 0.80507416, + "learning_rate": 2.49732696250116e-06, + "loss": 0.82649636, + "num_input_tokens_seen": 155975710, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7265625, + "step": 7275, + "time_per_iteration": 2.455235481262207 + }, + { + "auxiliary_loss_clip": 0.01118348, + "auxiliary_loss_mlp": 0.01036773, + "balance_loss_clip": 1.02357626, + "balance_loss_mlp": 1.04496706, + "epoch": 0.4374567864121449, + "flos": 16358747235840.0, + "grad_norm": 2.065941875742038, + "language_loss": 0.80955482, + "learning_rate": 2.496949724407266e-06, + "loss": 0.83110607, + "num_input_tokens_seen": 155993090, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 7276, + "time_per_iteration": 2.543306827545166 + }, + { + "auxiliary_loss_clip": 0.01122471, + "auxiliary_loss_mlp": 0.01034927, + "balance_loss_clip": 1.02145052, + "balance_loss_mlp": 1.04409111, + "epoch": 0.4375169096648129, + "flos": 30587523834240.0, + "grad_norm": 1.794283698167311, + "language_loss": 0.73373604, + "learning_rate": 2.496572467468988e-06, + "loss": 0.75530994, + "num_input_tokens_seen": 156013685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7277, + "time_per_iteration": 2.5931403636932373 + }, + { + "auxiliary_loss_clip": 0.01117806, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.0222764, + "balance_loss_mlp": 1.04351854, + "epoch": 0.43757703291748085, + "flos": 30555599621760.0, + "grad_norm": 1.8969119275678887, + "language_loss": 0.72953606, + "learning_rate": 2.4961951917006317e-06, + "loss": 0.75107086, + "num_input_tokens_seen": 156034300, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7278, + "time_per_iteration": 2.576266288757324 + }, + { + "auxiliary_loss_clip": 0.0111536, + "auxiliary_loss_mlp": 0.01033831, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.04212785, + "epoch": 0.4376371561701488, + "flos": 21397373498880.0, + "grad_norm": 1.6273415021791042, + "language_loss": 0.65815622, + "learning_rate": 2.4958178971165046e-06, + "loss": 0.6796481, + "num_input_tokens_seen": 156053805, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7279, + "time_per_iteration": 2.4717864990234375 + }, + { + "auxiliary_loss_clip": 0.01122391, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02098393, + "balance_loss_mlp": 1.04393768, + "epoch": 0.4376972794228168, + "flos": 23404384903680.0, + "grad_norm": 1.838486718423984, + "language_loss": 0.82088757, + "learning_rate": 2.4954405837309126e-06, + "loss": 0.84245551, + "num_input_tokens_seen": 156073295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78515625, + "step": 7280, + "time_per_iteration": 2.5370771884918213 + }, + { + "auxiliary_loss_clip": 0.01114089, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.0209589, + "balance_loss_mlp": 1.04176164, + "epoch": 0.43775740267548474, + "flos": 22892945103360.0, + "grad_norm": 1.430381072646336, + "language_loss": 0.76786566, + "learning_rate": 2.4950632515581653e-06, + "loss": 0.78934562, + "num_input_tokens_seen": 156094540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 7281, + "time_per_iteration": 2.5260467529296875 + }, + { + "auxiliary_loss_clip": 0.01116043, + "auxiliary_loss_mlp": 0.01038639, + "balance_loss_clip": 1.02582431, + "balance_loss_mlp": 1.04211211, + "epoch": 0.4378175259281527, + "flos": 23294390480640.0, + "grad_norm": 1.8435972134321474, + "language_loss": 0.7572853, + "learning_rate": 2.494685900612569e-06, + "loss": 0.77883214, + "num_input_tokens_seen": 156114070, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 7282, + "time_per_iteration": 2.5332953929901123 + }, + { + "auxiliary_loss_clip": 0.01119087, + "auxiliary_loss_mlp": 0.01039188, + "balance_loss_clip": 1.02581239, + "balance_loss_mlp": 1.04421043, + "epoch": 0.43787764918082067, + "flos": 23876897339520.0, + "grad_norm": 1.8874106414487752, + "language_loss": 0.8494271, + "learning_rate": 2.4943085309084333e-06, + "loss": 0.87100983, + "num_input_tokens_seen": 156132130, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7283, + "time_per_iteration": 2.458500623703003 + }, + { + "auxiliary_loss_clip": 0.01119709, + "auxiliary_loss_mlp": 0.01034549, + "balance_loss_clip": 1.02060771, + "balance_loss_mlp": 1.04216719, + "epoch": 0.43793777243348864, + "flos": 23988148738560.0, + "grad_norm": 1.9095323636494845, + "language_loss": 0.8005324, + "learning_rate": 2.49393114246007e-06, + "loss": 0.82207501, + "num_input_tokens_seen": 156150820, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7284, + "time_per_iteration": 2.5258796215057373 + }, + { + "auxiliary_loss_clip": 0.01115467, + "auxiliary_loss_mlp": 0.01040827, + "balance_loss_clip": 1.02851903, + "balance_loss_mlp": 1.04236269, + "epoch": 0.4379978956861566, + "flos": 18624064320000.0, + "grad_norm": 1.535068058496724, + "language_loss": 0.8028115, + "learning_rate": 2.493553735281787e-06, + "loss": 0.82437444, + "num_input_tokens_seen": 156170125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.734375, + "step": 7285, + "time_per_iteration": 2.4441394805908203 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01028929, + "balance_loss_clip": 1.01576853, + "balance_loss_mlp": 1.04086363, + "epoch": 0.43805801893882457, + "flos": 21981388728960.0, + "grad_norm": 1.9937836479025883, + "language_loss": 0.75031531, + "learning_rate": 2.493176309387897e-06, + "loss": 0.77174133, + "num_input_tokens_seen": 156187320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7286, + "time_per_iteration": 2.539954423904419 + }, + { + "auxiliary_loss_clip": 0.01118753, + "auxiliary_loss_mlp": 0.01030019, + "balance_loss_clip": 1.01642346, + "balance_loss_mlp": 1.04179096, + "epoch": 0.43811814219149253, + "flos": 26393337383040.0, + "grad_norm": 1.7090844157721894, + "language_loss": 0.73834682, + "learning_rate": 2.492798864792712e-06, + "loss": 0.75983447, + "num_input_tokens_seen": 156207455, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7287, + "time_per_iteration": 2.5056257247924805 + }, + { + "auxiliary_loss_clip": 0.01117808, + "auxiliary_loss_mlp": 0.010426, + "balance_loss_clip": 1.02887869, + "balance_loss_mlp": 1.04187727, + "epoch": 0.43817826544416055, + "flos": 17493309198720.0, + "grad_norm": 1.8325493621162303, + "language_loss": 0.82288051, + "learning_rate": 2.492421401510545e-06, + "loss": 0.84448457, + "num_input_tokens_seen": 156226560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7288, + "time_per_iteration": 2.4812850952148438 + }, + { + "auxiliary_loss_clip": 0.01117047, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02008474, + "balance_loss_mlp": 1.03895211, + "epoch": 0.4382383886968285, + "flos": 21581020759680.0, + "grad_norm": 1.476666560822241, + "language_loss": 0.84346598, + "learning_rate": 2.4920439195557093e-06, + "loss": 0.86497366, + "num_input_tokens_seen": 156246740, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7289, + "time_per_iteration": 2.482379674911499 + }, + { + "auxiliary_loss_clip": 0.01119976, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.0235244, + "balance_loss_mlp": 1.04139173, + "epoch": 0.4382985119494965, + "flos": 27923742201600.0, + "grad_norm": 1.4352131560569001, + "language_loss": 0.78107727, + "learning_rate": 2.4916664189425183e-06, + "loss": 0.80264366, + "num_input_tokens_seen": 156266440, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.78515625, + "step": 7290, + "time_per_iteration": 2.5521459579467773 + }, + { + "auxiliary_loss_clip": 0.01115969, + "auxiliary_loss_mlp": 0.0104054, + "balance_loss_clip": 1.02761197, + "balance_loss_mlp": 1.04235792, + "epoch": 0.43835863520216445, + "flos": 24936836797440.0, + "grad_norm": 3.384239132873348, + "language_loss": 0.77987993, + "learning_rate": 2.491288899685288e-06, + "loss": 0.80144495, + "num_input_tokens_seen": 156286900, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7291, + "time_per_iteration": 2.512519121170044 + }, + { + "auxiliary_loss_clip": 0.01117762, + "auxiliary_loss_mlp": 0.01031364, + "balance_loss_clip": 1.01792359, + "balance_loss_mlp": 1.04297888, + "epoch": 0.4384187584548324, + "flos": 33510293504640.0, + "grad_norm": 1.5428221976657872, + "language_loss": 0.65224636, + "learning_rate": 2.4909113617983325e-06, + "loss": 0.67373765, + "num_input_tokens_seen": 156307690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7292, + "time_per_iteration": 2.597714424133301 + }, + { + "auxiliary_loss_clip": 0.0111598, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.01864016, + "balance_loss_mlp": 1.03967905, + "epoch": 0.4384788817075004, + "flos": 23951052967680.0, + "grad_norm": 1.884679810356821, + "language_loss": 0.74216962, + "learning_rate": 2.49053380529597e-06, + "loss": 0.76363981, + "num_input_tokens_seen": 156326620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.76171875, + "step": 7293, + "time_per_iteration": 2.4943923950195312 + }, + { + "auxiliary_loss_clip": 0.01119197, + "auxiliary_loss_mlp": 0.01040872, + "balance_loss_clip": 1.02732337, + "balance_loss_mlp": 1.04433274, + "epoch": 0.43853900496016834, + "flos": 19098516090240.0, + "grad_norm": 2.4110491255972684, + "language_loss": 0.78757977, + "learning_rate": 2.490156230192516e-06, + "loss": 0.8091805, + "num_input_tokens_seen": 156345495, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7294, + "time_per_iteration": 2.495358467102051 + }, + { + "auxiliary_loss_clip": 0.0111963, + "auxiliary_loss_mlp": 0.01041568, + "balance_loss_clip": 1.02864015, + "balance_loss_mlp": 1.04313052, + "epoch": 0.4385991282128363, + "flos": 13225362168960.0, + "grad_norm": 1.7229696907351246, + "language_loss": 0.73184276, + "learning_rate": 2.4897786365022883e-06, + "loss": 0.7534548, + "num_input_tokens_seen": 156363155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7295, + "time_per_iteration": 2.4645302295684814 + }, + { + "auxiliary_loss_clip": 0.01119056, + "auxiliary_loss_mlp": 0.01039679, + "balance_loss_clip": 1.02573109, + "balance_loss_mlp": 1.042575, + "epoch": 0.4386592514655043, + "flos": 14319883445760.0, + "grad_norm": 2.059865438640582, + "language_loss": 0.75337231, + "learning_rate": 2.4894010242396063e-06, + "loss": 0.77495956, + "num_input_tokens_seen": 156380940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.765625, + "step": 7296, + "time_per_iteration": 2.46444034576416 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.01976418, + "balance_loss_mlp": 1.04255402, + "epoch": 0.43871937471817224, + "flos": 22784423137920.0, + "grad_norm": 1.6034841999072227, + "language_loss": 0.69515687, + "learning_rate": 2.4890233934187873e-06, + "loss": 0.71666169, + "num_input_tokens_seen": 156400415, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7297, + "time_per_iteration": 2.4995949268341064 + }, + { + "auxiliary_loss_clip": 0.01115206, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.01913857, + "balance_loss_mlp": 1.04173827, + "epoch": 0.4387794979708402, + "flos": 28072304853120.0, + "grad_norm": 1.494373898338378, + "language_loss": 0.70457232, + "learning_rate": 2.4886457440541535e-06, + "loss": 0.72604382, + "num_input_tokens_seen": 156421120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7298, + "time_per_iteration": 2.574982166290283 + }, + { + "auxiliary_loss_clip": 0.01117164, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.01672888, + "balance_loss_mlp": 1.04384279, + "epoch": 0.43883962122350817, + "flos": 26249551240320.0, + "grad_norm": 1.5912334767066174, + "language_loss": 0.7241621, + "learning_rate": 2.4882680761600238e-06, + "loss": 0.74562919, + "num_input_tokens_seen": 156441535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7299, + "time_per_iteration": 2.539013385772705 + }, + { + "auxiliary_loss_clip": 0.01120808, + "auxiliary_loss_mlp": 0.01047348, + "balance_loss_clip": 1.03278041, + "balance_loss_mlp": 1.043944, + "epoch": 0.43889974447617613, + "flos": 25883765089920.0, + "grad_norm": 1.8082969607549542, + "language_loss": 0.77112591, + "learning_rate": 2.487890389750719e-06, + "loss": 0.79280752, + "num_input_tokens_seen": 156462015, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76953125, + "step": 7300, + "time_per_iteration": 2.567291259765625 + }, + { + "auxiliary_loss_clip": 0.0111673, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02442431, + "balance_loss_mlp": 1.04064155, + "epoch": 0.43895986772884416, + "flos": 25046615738880.0, + "grad_norm": 1.6241879676388415, + "language_loss": 0.70685148, + "learning_rate": 2.4875126848405626e-06, + "loss": 0.72839439, + "num_input_tokens_seen": 156482165, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7301, + "time_per_iteration": 2.497025489807129 + }, + { + "auxiliary_loss_clip": 0.01122863, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04512143, + "epoch": 0.4390199909815121, + "flos": 25994585525760.0, + "grad_norm": 1.911748384222125, + "language_loss": 0.70491576, + "learning_rate": 2.4871349614438757e-06, + "loss": 0.72647995, + "num_input_tokens_seen": 156503170, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.77734375, + "step": 7302, + "time_per_iteration": 2.5212793350219727 + }, + { + "auxiliary_loss_clip": 0.011184, + "auxiliary_loss_mlp": 0.01039693, + "balance_loss_clip": 1.02676439, + "balance_loss_mlp": 1.04383337, + "epoch": 0.4390801142341801, + "flos": 29022249888000.0, + "grad_norm": 1.741042450815644, + "language_loss": 0.82304549, + "learning_rate": 2.486757219574983e-06, + "loss": 0.84462643, + "num_input_tokens_seen": 156523005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7303, + "time_per_iteration": 2.5407814979553223 + }, + { + "auxiliary_loss_clip": 0.01123737, + "auxiliary_loss_mlp": 0.01042372, + "balance_loss_clip": 1.02753651, + "balance_loss_mlp": 1.04429436, + "epoch": 0.43914023748684805, + "flos": 33438544087680.0, + "grad_norm": 2.4492152950747412, + "language_loss": 0.68408841, + "learning_rate": 2.4863794592482067e-06, + "loss": 0.70574951, + "num_input_tokens_seen": 156544440, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.796875, + "step": 7304, + "time_per_iteration": 4.099287509918213 + }, + { + "auxiliary_loss_clip": 0.01116014, + "auxiliary_loss_mlp": 0.01039361, + "balance_loss_clip": 1.02631354, + "balance_loss_mlp": 1.04335666, + "epoch": 0.439200360739516, + "flos": 34531844302080.0, + "grad_norm": 1.4059546174528585, + "language_loss": 0.78115439, + "learning_rate": 2.486001680477873e-06, + "loss": 0.80270815, + "num_input_tokens_seen": 156565410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7305, + "time_per_iteration": 2.6079509258270264 + }, + { + "auxiliary_loss_clip": 0.01116718, + "auxiliary_loss_mlp": 0.01037045, + "balance_loss_clip": 1.02376556, + "balance_loss_mlp": 1.04186165, + "epoch": 0.439260483992184, + "flos": 21907843632000.0, + "grad_norm": 1.688110038500655, + "language_loss": 0.68754542, + "learning_rate": 2.485623883278308e-06, + "loss": 0.70908302, + "num_input_tokens_seen": 156584210, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7306, + "time_per_iteration": 2.4539954662323 + }, + { + "auxiliary_loss_clip": 0.01119821, + "auxiliary_loss_mlp": 0.0103523, + "balance_loss_clip": 1.0214076, + "balance_loss_mlp": 1.04369712, + "epoch": 0.43932060724485195, + "flos": 20996430912000.0, + "grad_norm": 1.4603628541776523, + "language_loss": 0.6270709, + "learning_rate": 2.4852460676638344e-06, + "loss": 0.64862138, + "num_input_tokens_seen": 156602730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76171875, + "step": 7307, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01121175, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02001536, + "balance_loss_mlp": 1.04338455, + "epoch": 0.4393807304975199, + "flos": 17747053850880.0, + "grad_norm": 1.9032558944481925, + "language_loss": 0.72409779, + "learning_rate": 2.4848682336487828e-06, + "loss": 0.74563944, + "num_input_tokens_seen": 156619405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.77734375, + "step": 7308, + "time_per_iteration": 2.4319982528686523 + }, + { + "auxiliary_loss_clip": 0.01120176, + "auxiliary_loss_mlp": 0.01037474, + "balance_loss_clip": 1.02347863, + "balance_loss_mlp": 1.04077995, + "epoch": 0.4394408537501879, + "flos": 22528523669760.0, + "grad_norm": 1.6404677903158766, + "language_loss": 0.76631165, + "learning_rate": 2.4844903812474787e-06, + "loss": 0.78788805, + "num_input_tokens_seen": 156638165, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.79296875, + "step": 7309, + "time_per_iteration": 2.5045857429504395 + }, + { + "auxiliary_loss_clip": 0.01115088, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.01943445, + "balance_loss_mlp": 1.04314303, + "epoch": 0.43950097700285584, + "flos": 23440654661760.0, + "grad_norm": 1.788496009330223, + "language_loss": 0.70666951, + "learning_rate": 2.484112510474251e-06, + "loss": 0.72814304, + "num_input_tokens_seen": 156658845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7310, + "time_per_iteration": 2.4732789993286133 + }, + { + "auxiliary_loss_clip": 0.01120896, + "auxiliary_loss_mlp": 0.01036599, + "balance_loss_clip": 1.02293789, + "balance_loss_mlp": 1.04397106, + "epoch": 0.4395611002555238, + "flos": 23180696956800.0, + "grad_norm": 2.1134854859852505, + "language_loss": 0.75800377, + "learning_rate": 2.483734621343429e-06, + "loss": 0.77957869, + "num_input_tokens_seen": 156677275, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7311, + "time_per_iteration": 2.5372462272644043 + }, + { + "auxiliary_loss_clip": 0.01119727, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02171779, + "balance_loss_mlp": 1.04376173, + "epoch": 0.43962122350819177, + "flos": 22127365601280.0, + "grad_norm": 1.9313159099964634, + "language_loss": 0.8127231, + "learning_rate": 2.483356713869341e-06, + "loss": 0.83426595, + "num_input_tokens_seen": 156695815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7312, + "time_per_iteration": 2.4858858585357666 + }, + { + "auxiliary_loss_clip": 0.01115776, + "auxiliary_loss_mlp": 0.01037621, + "balance_loss_clip": 1.02404332, + "balance_loss_mlp": 1.04030704, + "epoch": 0.43968134676085974, + "flos": 17420554200960.0, + "grad_norm": 2.2005104401689177, + "language_loss": 0.85444236, + "learning_rate": 2.482978788066318e-06, + "loss": 0.87597632, + "num_input_tokens_seen": 156714385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75390625, + "step": 7313, + "time_per_iteration": 2.493032932281494 + }, + { + "auxiliary_loss_clip": 0.01119815, + "auxiliary_loss_mlp": 0.01035042, + "balance_loss_clip": 1.02176809, + "balance_loss_mlp": 1.04182911, + "epoch": 0.43974147001352776, + "flos": 18952646958720.0, + "grad_norm": 3.8587100296686145, + "language_loss": 0.67464912, + "learning_rate": 2.4826008439486904e-06, + "loss": 0.69619775, + "num_input_tokens_seen": 156732615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7314, + "time_per_iteration": 2.4542195796966553 + }, + { + "auxiliary_loss_clip": 0.01121265, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02063417, + "balance_loss_mlp": 1.04389846, + "epoch": 0.4398015932661957, + "flos": 18953508885120.0, + "grad_norm": 1.8025616803524547, + "language_loss": 0.76954508, + "learning_rate": 2.4822228815307915e-06, + "loss": 0.79110146, + "num_input_tokens_seen": 156750920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7315, + "time_per_iteration": 2.4988253116607666 + }, + { + "auxiliary_loss_clip": 0.01117641, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.01938725, + "balance_loss_mlp": 1.04280567, + "epoch": 0.4398617165188637, + "flos": 24199913370240.0, + "grad_norm": 2.4575060004131895, + "language_loss": 0.74807358, + "learning_rate": 2.4818449008269523e-06, + "loss": 0.76957744, + "num_input_tokens_seen": 156768520, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7316, + "time_per_iteration": 2.530104398727417 + }, + { + "auxiliary_loss_clip": 0.0112083, + "auxiliary_loss_mlp": 0.01041846, + "balance_loss_clip": 1.02928746, + "balance_loss_mlp": 1.04640257, + "epoch": 0.43992183977153165, + "flos": 22236677665920.0, + "grad_norm": 2.8405076524150568, + "language_loss": 0.65180635, + "learning_rate": 2.481466901851506e-06, + "loss": 0.67343318, + "num_input_tokens_seen": 156788700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7317, + "time_per_iteration": 2.5233771800994873 + }, + { + "auxiliary_loss_clip": 0.01121891, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02082634, + "balance_loss_mlp": 1.04455566, + "epoch": 0.4399819630241996, + "flos": 18697465762560.0, + "grad_norm": 1.7710834755986071, + "language_loss": 0.7968365, + "learning_rate": 2.4810888846187865e-06, + "loss": 0.8183977, + "num_input_tokens_seen": 156806470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7734375, + "step": 7318, + "time_per_iteration": 2.4618961811065674 + }, + { + "auxiliary_loss_clip": 0.01122714, + "auxiliary_loss_mlp": 0.01036897, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.04423118, + "epoch": 0.4400420862768676, + "flos": 23879375377920.0, + "grad_norm": 1.4932738321413537, + "language_loss": 0.79472506, + "learning_rate": 2.4807108491431283e-06, + "loss": 0.81632113, + "num_input_tokens_seen": 156825895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78515625, + "step": 7319, + "time_per_iteration": 2.5342819690704346 + }, + { + "auxiliary_loss_clip": 0.01117114, + "auxiliary_loss_mlp": 0.01040515, + "balance_loss_clip": 1.02637124, + "balance_loss_mlp": 1.04102063, + "epoch": 0.44010220952953555, + "flos": 28037615293440.0, + "grad_norm": 1.641668171652613, + "language_loss": 0.80221331, + "learning_rate": 2.4803327954388667e-06, + "loss": 0.82378966, + "num_input_tokens_seen": 156845990, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 7320, + "time_per_iteration": 2.520888566970825 + }, + { + "auxiliary_loss_clip": 0.01116164, + "auxiliary_loss_mlp": 0.01036235, + "balance_loss_clip": 1.02323556, + "balance_loss_mlp": 1.04136741, + "epoch": 0.4401623327822035, + "flos": 23768985905280.0, + "grad_norm": 1.6986497736973376, + "language_loss": 0.69795078, + "learning_rate": 2.4799547235203376e-06, + "loss": 0.71947479, + "num_input_tokens_seen": 156866685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 7321, + "time_per_iteration": 2.5457892417907715 + }, + { + "auxiliary_loss_clip": 0.01039878, + "auxiliary_loss_mlp": 0.01008287, + "balance_loss_clip": 1.00702953, + "balance_loss_mlp": 1.01681685, + "epoch": 0.4402224560348715, + "flos": 70774583264640.0, + "grad_norm": 0.8741267032944617, + "language_loss": 0.56908953, + "learning_rate": 2.4795766334018763e-06, + "loss": 0.58957124, + "num_input_tokens_seen": 156923450, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.23046875, + "step": 7322, + "time_per_iteration": 3.164207935333252 + }, + { + "auxiliary_loss_clip": 0.01117179, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01813388, + "balance_loss_mlp": 1.04277694, + "epoch": 0.44028257928753944, + "flos": 22891795868160.0, + "grad_norm": 1.4567737767029483, + "language_loss": 0.76075542, + "learning_rate": 2.479198525097822e-06, + "loss": 0.78222406, + "num_input_tokens_seen": 156944795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.74609375, + "step": 7323, + "time_per_iteration": 2.5279085636138916 + }, + { + "auxiliary_loss_clip": 0.01117385, + "auxiliary_loss_mlp": 0.01037268, + "balance_loss_clip": 1.02369034, + "balance_loss_mlp": 1.0409224, + "epoch": 0.4403427025402074, + "flos": 17895760156800.0, + "grad_norm": 1.5548582319563429, + "language_loss": 0.8034448, + "learning_rate": 2.478820398622511e-06, + "loss": 0.82499135, + "num_input_tokens_seen": 156962755, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7324, + "time_per_iteration": 2.4854304790496826 + }, + { + "auxiliary_loss_clip": 0.01039688, + "auxiliary_loss_mlp": 0.01006776, + "balance_loss_clip": 1.00549471, + "balance_loss_mlp": 1.01659369, + "epoch": 0.4404028257928754, + "flos": 69562525708800.0, + "grad_norm": 0.66599266679982, + "language_loss": 0.54557002, + "learning_rate": 2.478442253990283e-06, + "loss": 0.56603467, + "num_input_tokens_seen": 157028095, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.23144531, + "step": 7325, + "time_per_iteration": 3.081268787384033 + }, + { + "auxiliary_loss_clip": 0.01116252, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01792467, + "balance_loss_mlp": 1.04348588, + "epoch": 0.44046294904554334, + "flos": 20923675914240.0, + "grad_norm": 1.5427042359768692, + "language_loss": 0.69823551, + "learning_rate": 2.4780640912154766e-06, + "loss": 0.71969926, + "num_input_tokens_seen": 157048365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7326, + "time_per_iteration": 2.489088535308838 + }, + { + "auxiliary_loss_clip": 0.01112531, + "auxiliary_loss_mlp": 0.01029175, + "balance_loss_clip": 1.01634765, + "balance_loss_mlp": 1.03926969, + "epoch": 0.44052307229821136, + "flos": 23623475909760.0, + "grad_norm": 1.4106900729498488, + "language_loss": 0.76410896, + "learning_rate": 2.477685910312432e-06, + "loss": 0.78552604, + "num_input_tokens_seen": 157069130, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7327, + "time_per_iteration": 2.5099427700042725 + }, + { + "auxiliary_loss_clip": 0.01112963, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.04029953, + "epoch": 0.4405831955508793, + "flos": 17597665186560.0, + "grad_norm": 1.92290278058118, + "language_loss": 0.83856362, + "learning_rate": 2.4773077112954897e-06, + "loss": 0.86001813, + "num_input_tokens_seen": 157084940, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7328, + "time_per_iteration": 2.453078269958496 + }, + { + "auxiliary_loss_clip": 0.01114955, + "auxiliary_loss_mlp": 0.01028238, + "balance_loss_clip": 1.01505983, + "balance_loss_mlp": 1.04100752, + "epoch": 0.4406433188035473, + "flos": 21463376739840.0, + "grad_norm": 2.489103584507488, + "language_loss": 0.77842677, + "learning_rate": 2.4769294941789908e-06, + "loss": 0.79985875, + "num_input_tokens_seen": 157102770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7329, + "time_per_iteration": 2.4908933639526367 + }, + { + "auxiliary_loss_clip": 0.01118689, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02069247, + "balance_loss_mlp": 1.04125428, + "epoch": 0.44070344205621526, + "flos": 22673566788480.0, + "grad_norm": 1.7085588184823939, + "language_loss": 0.73343551, + "learning_rate": 2.476551258977278e-06, + "loss": 0.75496078, + "num_input_tokens_seen": 157122035, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7734375, + "step": 7330, + "time_per_iteration": 2.463330030441284 + }, + { + "auxiliary_loss_clip": 0.01116337, + "auxiliary_loss_mlp": 0.01032094, + "balance_loss_clip": 1.01974368, + "balance_loss_mlp": 1.04176283, + "epoch": 0.4407635653088832, + "flos": 23441193365760.0, + "grad_norm": 1.7732063146110093, + "language_loss": 0.74867487, + "learning_rate": 2.4761730057046936e-06, + "loss": 0.77015924, + "num_input_tokens_seen": 157142800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7331, + "time_per_iteration": 2.5421340465545654 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797271, + "balance_loss_mlp": 1.03957462, + "epoch": 0.4408236885615512, + "flos": 24021294013440.0, + "grad_norm": 1.4577784912363292, + "language_loss": 0.76381409, + "learning_rate": 2.475794734375581e-06, + "loss": 0.78523266, + "num_input_tokens_seen": 157163295, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.72265625, + "step": 7332, + "time_per_iteration": 2.5218567848205566 + }, + { + "auxiliary_loss_clip": 0.01114527, + "auxiliary_loss_mlp": 0.0103924, + "balance_loss_clip": 1.02724767, + "balance_loss_mlp": 1.03985786, + "epoch": 0.44088381181421915, + "flos": 12676826597760.0, + "grad_norm": 1.6787739774558346, + "language_loss": 0.7317301, + "learning_rate": 2.475416445004285e-06, + "loss": 0.75326777, + "num_input_tokens_seen": 157180890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.74609375, + "step": 7333, + "time_per_iteration": 2.4611384868621826 + }, + { + "auxiliary_loss_clip": 0.01113948, + "auxiliary_loss_mlp": 0.01034468, + "balance_loss_clip": 1.0218792, + "balance_loss_mlp": 1.04222834, + "epoch": 0.4409439350668871, + "flos": 24569865498240.0, + "grad_norm": 1.7946296457229314, + "language_loss": 0.79795265, + "learning_rate": 2.4750381376051493e-06, + "loss": 0.81943679, + "num_input_tokens_seen": 157200580, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7334, + "time_per_iteration": 2.4846577644348145 + }, + { + "auxiliary_loss_clip": 0.01123312, + "auxiliary_loss_mlp": 0.01041094, + "balance_loss_clip": 1.02473879, + "balance_loss_mlp": 1.04168534, + "epoch": 0.4410040583195551, + "flos": 22668574798080.0, + "grad_norm": 2.170087212124324, + "language_loss": 0.7549156, + "learning_rate": 2.47465981219252e-06, + "loss": 0.77655965, + "num_input_tokens_seen": 157218345, + "router_z_loss_clip": 0.1640625, + "router_z_loss_mlp": 0.81640625, + "step": 7335, + "time_per_iteration": 2.5086324214935303 + }, + { + "auxiliary_loss_clip": 0.01118233, + "auxiliary_loss_mlp": 0.01039933, + "balance_loss_clip": 1.02661777, + "balance_loss_mlp": 1.04259086, + "epoch": 0.44106418157222305, + "flos": 10852528700160.0, + "grad_norm": 1.91450979477167, + "language_loss": 0.72583538, + "learning_rate": 2.4742814687807423e-06, + "loss": 0.74741697, + "num_input_tokens_seen": 157234395, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7336, + "time_per_iteration": 2.436680555343628 + }, + { + "auxiliary_loss_clip": 0.01118765, + "auxiliary_loss_mlp": 0.01039128, + "balance_loss_clip": 1.0251267, + "balance_loss_mlp": 1.04040349, + "epoch": 0.441124304824891, + "flos": 21726710323200.0, + "grad_norm": 9.267090991138677, + "language_loss": 0.62665188, + "learning_rate": 2.473903107384165e-06, + "loss": 0.64823085, + "num_input_tokens_seen": 157254805, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7337, + "time_per_iteration": 2.484269618988037 + }, + { + "auxiliary_loss_clip": 0.01039049, + "auxiliary_loss_mlp": 0.0100578, + "balance_loss_clip": 1.00452268, + "balance_loss_mlp": 1.01618195, + "epoch": 0.441184428077559, + "flos": 63220486625280.0, + "grad_norm": 0.7410103266773326, + "language_loss": 0.52670205, + "learning_rate": 2.473524728017134e-06, + "loss": 0.54715037, + "num_input_tokens_seen": 157317870, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7338, + "time_per_iteration": 3.104921340942383 + }, + { + "auxiliary_loss_clip": 0.01120745, + "auxiliary_loss_mlp": 0.01047395, + "balance_loss_clip": 1.03303015, + "balance_loss_mlp": 1.04076958, + "epoch": 0.44124455133022694, + "flos": 21177959270400.0, + "grad_norm": 1.7777015345810536, + "language_loss": 0.70687723, + "learning_rate": 2.473146330693997e-06, + "loss": 0.7285586, + "num_input_tokens_seen": 157336505, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.80078125, + "step": 7339, + "time_per_iteration": 2.5172934532165527 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01038279, + "balance_loss_clip": 1.02603626, + "balance_loss_mlp": 1.04237795, + "epoch": 0.4413046745828949, + "flos": 17457865453440.0, + "grad_norm": 1.6032661325040427, + "language_loss": 0.69992614, + "learning_rate": 2.472767915429105e-06, + "loss": 0.7214449, + "num_input_tokens_seen": 157354995, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 7340, + "time_per_iteration": 2.4677066802978516 + }, + { + "auxiliary_loss_clip": 0.0103753, + "auxiliary_loss_mlp": 0.01002043, + "balance_loss_clip": 1.00078511, + "balance_loss_mlp": 1.01463652, + "epoch": 0.4413647978355629, + "flos": 61586153804160.0, + "grad_norm": 0.8913600985584349, + "language_loss": 0.64017105, + "learning_rate": 2.4723894822368054e-06, + "loss": 0.66056681, + "num_input_tokens_seen": 157404260, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22851562, + "step": 7341, + "time_per_iteration": 2.87821888923645 + }, + { + "auxiliary_loss_clip": 0.01113838, + "auxiliary_loss_mlp": 0.01038155, + "balance_loss_clip": 1.02473783, + "balance_loss_mlp": 1.04029536, + "epoch": 0.4414249210882309, + "flos": 27527001505920.0, + "grad_norm": 2.415120536593597, + "language_loss": 0.73162079, + "learning_rate": 2.47201103113145e-06, + "loss": 0.75314075, + "num_input_tokens_seen": 157423045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7342, + "time_per_iteration": 2.6009373664855957 + }, + { + "auxiliary_loss_clip": 0.01114735, + "auxiliary_loss_mlp": 0.01037861, + "balance_loss_clip": 1.02390742, + "balance_loss_mlp": 1.03866804, + "epoch": 0.44148504434089886, + "flos": 23513984277120.0, + "grad_norm": 1.834134484008718, + "language_loss": 0.7961756, + "learning_rate": 2.4716325621273886e-06, + "loss": 0.81770158, + "num_input_tokens_seen": 157441815, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7343, + "time_per_iteration": 2.5102362632751465 + }, + { + "auxiliary_loss_clip": 0.01113089, + "auxiliary_loss_mlp": 0.01030659, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.03901291, + "epoch": 0.4415451675935668, + "flos": 21580589796480.0, + "grad_norm": 1.5507634652992637, + "language_loss": 0.76845753, + "learning_rate": 2.4712540752389725e-06, + "loss": 0.789895, + "num_input_tokens_seen": 157460470, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7344, + "time_per_iteration": 2.517014741897583 + }, + { + "auxiliary_loss_clip": 0.01036094, + "auxiliary_loss_mlp": 0.01000265, + "balance_loss_clip": 0.99887604, + "balance_loss_mlp": 1.01319945, + "epoch": 0.4416052908462348, + "flos": 59006368126080.0, + "grad_norm": 0.7920555871551813, + "language_loss": 0.63752162, + "learning_rate": 2.470875570480556e-06, + "loss": 0.65788519, + "num_input_tokens_seen": 157512655, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.22949219, + "step": 7345, + "time_per_iteration": 7.267446517944336 + }, + { + "auxiliary_loss_clip": 0.01121083, + "auxiliary_loss_mlp": 0.01039556, + "balance_loss_clip": 1.02610314, + "balance_loss_mlp": 1.04385495, + "epoch": 0.44166541409890275, + "flos": 26357642242560.0, + "grad_norm": 2.1109182100548596, + "language_loss": 0.86316586, + "learning_rate": 2.470497047866489e-06, + "loss": 0.88477224, + "num_input_tokens_seen": 157533700, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7734375, + "step": 7346, + "time_per_iteration": 2.5508806705474854 + }, + { + "auxiliary_loss_clip": 0.01118131, + "auxiliary_loss_mlp": 0.01040679, + "balance_loss_clip": 1.02691066, + "balance_loss_mlp": 1.04238844, + "epoch": 0.4417255373515707, + "flos": 20192678231040.0, + "grad_norm": 1.947149735733886, + "language_loss": 0.8050105, + "learning_rate": 2.470118507411128e-06, + "loss": 0.82659858, + "num_input_tokens_seen": 157551105, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 7347, + "time_per_iteration": 2.474933624267578 + }, + { + "auxiliary_loss_clip": 0.01117003, + "auxiliary_loss_mlp": 0.01037561, + "balance_loss_clip": 1.02367926, + "balance_loss_mlp": 1.04158723, + "epoch": 0.4417856606042387, + "flos": 17887895078400.0, + "grad_norm": 1.6941368254206504, + "language_loss": 0.82639945, + "learning_rate": 2.4697399491288263e-06, + "loss": 0.84794509, + "num_input_tokens_seen": 157568285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7348, + "time_per_iteration": 2.4525363445281982 + }, + { + "auxiliary_loss_clip": 0.01119343, + "auxiliary_loss_mlp": 0.01037184, + "balance_loss_clip": 1.02335548, + "balance_loss_mlp": 1.04179621, + "epoch": 0.44184578385690665, + "flos": 27964034282880.0, + "grad_norm": 1.5736626646923677, + "language_loss": 0.7025882, + "learning_rate": 2.469361373033938e-06, + "loss": 0.72415352, + "num_input_tokens_seen": 157590405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7349, + "time_per_iteration": 2.511890172958374 + }, + { + "auxiliary_loss_clip": 0.01117351, + "auxiliary_loss_mlp": 0.01038625, + "balance_loss_clip": 1.02426577, + "balance_loss_mlp": 1.03973794, + "epoch": 0.4419059071095746, + "flos": 23367899664000.0, + "grad_norm": 1.6465526230005572, + "language_loss": 0.74427998, + "learning_rate": 2.468982779140819e-06, + "loss": 0.76583976, + "num_input_tokens_seen": 157607420, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.77734375, + "step": 7350, + "time_per_iteration": 2.496570110321045 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02167273, + "balance_loss_mlp": 1.0410589, + "epoch": 0.4419660303622426, + "flos": 15012169246080.0, + "grad_norm": 1.9521663807923895, + "language_loss": 0.80709779, + "learning_rate": 2.468604167463827e-06, + "loss": 0.8286112, + "num_input_tokens_seen": 157624990, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7578125, + "step": 7351, + "time_per_iteration": 2.432551860809326 + }, + { + "auxiliary_loss_clip": 0.01111348, + "auxiliary_loss_mlp": 0.01035932, + "balance_loss_clip": 1.02401161, + "balance_loss_mlp": 1.03947091, + "epoch": 0.44202615361491054, + "flos": 25371750672000.0, + "grad_norm": 1.5082806208548023, + "language_loss": 0.73055673, + "learning_rate": 2.4682255380173176e-06, + "loss": 0.75202954, + "num_input_tokens_seen": 157645300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 7352, + "time_per_iteration": 2.515235424041748 + }, + { + "auxiliary_loss_clip": 0.01116736, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.0184238, + "balance_loss_mlp": 1.04159904, + "epoch": 0.4420862768675785, + "flos": 24681116897280.0, + "grad_norm": 1.8470037483547026, + "language_loss": 0.87457407, + "learning_rate": 2.467846890815649e-06, + "loss": 0.89606094, + "num_input_tokens_seen": 157664060, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.75, + "step": 7353, + "time_per_iteration": 2.4880294799804688 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01035238, + "balance_loss_clip": 1.02288198, + "balance_loss_mlp": 1.04091954, + "epoch": 0.44214640012024653, + "flos": 19528437974400.0, + "grad_norm": 2.0344010928875567, + "language_loss": 0.75522006, + "learning_rate": 2.4674682258731795e-06, + "loss": 0.77674222, + "num_input_tokens_seen": 157680905, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7578125, + "step": 7354, + "time_per_iteration": 2.454554319381714 + }, + { + "auxiliary_loss_clip": 0.01112104, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.0235672, + "balance_loss_mlp": 1.03940272, + "epoch": 0.4422065233729145, + "flos": 47557434003840.0, + "grad_norm": 1.7346650465528282, + "language_loss": 0.64754039, + "learning_rate": 2.467089543204268e-06, + "loss": 0.66901928, + "num_input_tokens_seen": 157701980, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7355, + "time_per_iteration": 2.711973190307617 + }, + { + "auxiliary_loss_clip": 0.0112036, + "auxiliary_loss_mlp": 0.01036556, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.04187799, + "epoch": 0.44226664662558246, + "flos": 19281050029440.0, + "grad_norm": 1.914030541413853, + "language_loss": 0.78126168, + "learning_rate": 2.466710842823274e-06, + "loss": 0.80283082, + "num_input_tokens_seen": 157720555, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.78515625, + "step": 7356, + "time_per_iteration": 2.470214366912842 + }, + { + "auxiliary_loss_clip": 0.01118926, + "auxiliary_loss_mlp": 0.01036798, + "balance_loss_clip": 1.02317214, + "balance_loss_mlp": 1.0414896, + "epoch": 0.4423267698782504, + "flos": 17821820010240.0, + "grad_norm": 1.5192892311950144, + "language_loss": 0.7712661, + "learning_rate": 2.4663321247445577e-06, + "loss": 0.79282331, + "num_input_tokens_seen": 157739160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7734375, + "step": 7357, + "time_per_iteration": 2.461174249649048 + }, + { + "auxiliary_loss_clip": 0.01117699, + "auxiliary_loss_mlp": 0.01038392, + "balance_loss_clip": 1.02454567, + "balance_loss_mlp": 1.0424664, + "epoch": 0.4423868931309184, + "flos": 29204424691200.0, + "grad_norm": 1.4937655647898813, + "language_loss": 0.73591524, + "learning_rate": 2.465953388982481e-06, + "loss": 0.75747615, + "num_input_tokens_seen": 157760020, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7358, + "time_per_iteration": 2.556330919265747 + }, + { + "auxiliary_loss_clip": 0.01117067, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01871514, + "balance_loss_mlp": 1.0415349, + "epoch": 0.44244701638358636, + "flos": 29713135057920.0, + "grad_norm": 1.6567493539100802, + "language_loss": 0.75616974, + "learning_rate": 2.465574635551405e-06, + "loss": 0.77764809, + "num_input_tokens_seen": 157780435, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.75390625, + "step": 7359, + "time_per_iteration": 2.50827693939209 + }, + { + "auxiliary_loss_clip": 0.01116785, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.01920068, + "balance_loss_mlp": 1.04107249, + "epoch": 0.4425071396362543, + "flos": 22930040874240.0, + "grad_norm": 1.743382279224751, + "language_loss": 0.7001307, + "learning_rate": 2.4651958644656923e-06, + "loss": 0.72162896, + "num_input_tokens_seen": 157799420, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7360, + "time_per_iteration": 2.4941389560699463 + }, + { + "auxiliary_loss_clip": 0.01117522, + "auxiliary_loss_mlp": 0.01033558, + "balance_loss_clip": 1.0205518, + "balance_loss_mlp": 1.04113221, + "epoch": 0.4425672628889223, + "flos": 19792346175360.0, + "grad_norm": 2.0593935576965996, + "language_loss": 0.69252694, + "learning_rate": 2.4648170757397053e-06, + "loss": 0.71403772, + "num_input_tokens_seen": 157817025, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7361, + "time_per_iteration": 2.4985222816467285 + }, + { + "auxiliary_loss_clip": 0.01116054, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.01840568, + "balance_loss_mlp": 1.04025078, + "epoch": 0.44262738614159025, + "flos": 13662215377920.0, + "grad_norm": 3.464971296188532, + "language_loss": 0.82380062, + "learning_rate": 2.464438269387809e-06, + "loss": 0.84528339, + "num_input_tokens_seen": 157834345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7362, + "time_per_iteration": 2.5396664142608643 + }, + { + "auxiliary_loss_clip": 0.01120785, + "auxiliary_loss_mlp": 0.0103602, + "balance_loss_clip": 1.0216198, + "balance_loss_mlp": 1.0414443, + "epoch": 0.4426875093942582, + "flos": 14210212245120.0, + "grad_norm": 1.6248096382426125, + "language_loss": 0.74421227, + "learning_rate": 2.464059445424366e-06, + "loss": 0.76578033, + "num_input_tokens_seen": 157852290, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.79296875, + "step": 7363, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01036428, + "auxiliary_loss_mlp": 0.01011165, + "balance_loss_clip": 1.00969243, + "balance_loss_mlp": 1.0129478, + "epoch": 0.4427476326469262, + "flos": 70117525728000.0, + "grad_norm": 0.6750552451063064, + "language_loss": 0.55668789, + "learning_rate": 2.463680603863743e-06, + "loss": 0.57716382, + "num_input_tokens_seen": 157923060, + "router_z_loss_clip": 0.01470947, + "router_z_loss_mlp": 0.234375, + "step": 7364, + "time_per_iteration": 3.1631510257720947 + }, + { + "auxiliary_loss_clip": 0.0111342, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01869917, + "balance_loss_mlp": 1.0388242, + "epoch": 0.44280775589959415, + "flos": 25445080287360.0, + "grad_norm": 1.5647849634077904, + "language_loss": 0.74008644, + "learning_rate": 2.463301744720305e-06, + "loss": 0.76153356, + "num_input_tokens_seen": 157944110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.74609375, + "step": 7365, + "time_per_iteration": 2.5025317668914795 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01038694, + "balance_loss_clip": 1.02544355, + "balance_loss_mlp": 1.0385282, + "epoch": 0.4428678791522621, + "flos": 22857214049280.0, + "grad_norm": 1.5168930353966135, + "language_loss": 0.74242592, + "learning_rate": 2.4629228680084184e-06, + "loss": 0.76394439, + "num_input_tokens_seen": 157964295, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7366, + "time_per_iteration": 2.4882071018218994 + }, + { + "auxiliary_loss_clip": 0.01117127, + "auxiliary_loss_mlp": 0.01032858, + "balance_loss_clip": 1.01911306, + "balance_loss_mlp": 1.04244351, + "epoch": 0.44292800240493013, + "flos": 25812446636160.0, + "grad_norm": 1.7268166919008578, + "language_loss": 0.73934573, + "learning_rate": 2.46254397374245e-06, + "loss": 0.7608456, + "num_input_tokens_seen": 157983970, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 7367, + "time_per_iteration": 2.494215250015259 + }, + { + "auxiliary_loss_clip": 0.01115817, + "auxiliary_loss_mlp": 0.01038126, + "balance_loss_clip": 1.02484, + "balance_loss_mlp": 1.04093957, + "epoch": 0.4429881256575981, + "flos": 32416885549440.0, + "grad_norm": 1.708386000191459, + "language_loss": 0.7409333, + "learning_rate": 2.4621650619367677e-06, + "loss": 0.76247275, + "num_input_tokens_seen": 158006515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7368, + "time_per_iteration": 2.5647008419036865 + }, + { + "auxiliary_loss_clip": 0.01114523, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.01905274, + "balance_loss_mlp": 1.04091215, + "epoch": 0.44304824891026606, + "flos": 22163707186560.0, + "grad_norm": 1.8689444780395545, + "language_loss": 0.79986328, + "learning_rate": 2.4617861326057403e-06, + "loss": 0.82132554, + "num_input_tokens_seen": 158025565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7369, + "time_per_iteration": 2.4666872024536133 + }, + { + "auxiliary_loss_clip": 0.01112296, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01627517, + "balance_loss_mlp": 1.04060125, + "epoch": 0.443108372162934, + "flos": 25338569483520.0, + "grad_norm": 1.7167890006148945, + "language_loss": 0.72231519, + "learning_rate": 2.461407185763737e-06, + "loss": 0.74372596, + "num_input_tokens_seen": 158045620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7370, + "time_per_iteration": 2.5508570671081543 + }, + { + "auxiliary_loss_clip": 0.01113663, + "auxiliary_loss_mlp": 0.01033079, + "balance_loss_clip": 1.02021682, + "balance_loss_mlp": 1.03883541, + "epoch": 0.443168495415602, + "flos": 23330947547520.0, + "grad_norm": 1.7515847136682843, + "language_loss": 0.70318949, + "learning_rate": 2.461028221425126e-06, + "loss": 0.72465694, + "num_input_tokens_seen": 158063505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7371, + "time_per_iteration": 2.4617960453033447 + }, + { + "auxiliary_loss_clip": 0.01110948, + "auxiliary_loss_mlp": 0.01030075, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03891456, + "epoch": 0.44322861866826996, + "flos": 21871502046720.0, + "grad_norm": 2.199744355071377, + "language_loss": 0.68163198, + "learning_rate": 2.4606492396042786e-06, + "loss": 0.70304221, + "num_input_tokens_seen": 158080335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.72265625, + "step": 7372, + "time_per_iteration": 2.4743239879608154 + }, + { + "auxiliary_loss_clip": 0.0111515, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01702499, + "balance_loss_mlp": 1.03971767, + "epoch": 0.4432887419209379, + "flos": 20084407660800.0, + "grad_norm": 1.696523180994532, + "language_loss": 0.83959508, + "learning_rate": 2.4602702403155664e-06, + "loss": 0.86105639, + "num_input_tokens_seen": 158098955, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7373, + "time_per_iteration": 2.44077467918396 + }, + { + "auxiliary_loss_clip": 0.01038641, + "auxiliary_loss_mlp": 0.01004854, + "balance_loss_clip": 1.00331616, + "balance_loss_mlp": 1.01527071, + "epoch": 0.4433488651736059, + "flos": 70035540935040.0, + "grad_norm": 0.8140024563186875, + "language_loss": 0.55299437, + "learning_rate": 2.4598912235733604e-06, + "loss": 0.57342935, + "num_input_tokens_seen": 158164110, + "router_z_loss_clip": 0.01538086, + "router_z_loss_mlp": 0.234375, + "step": 7374, + "time_per_iteration": 3.1360692977905273 + }, + { + "auxiliary_loss_clip": 0.01113767, + "auxiliary_loss_mlp": 0.01042375, + "balance_loss_clip": 1.02858198, + "balance_loss_mlp": 1.04092741, + "epoch": 0.44340898842627385, + "flos": 16282472705280.0, + "grad_norm": 2.2551701608050636, + "language_loss": 0.82651508, + "learning_rate": 2.4595121893920327e-06, + "loss": 0.84807646, + "num_input_tokens_seen": 158179850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 7375, + "time_per_iteration": 2.4277329444885254 + }, + { + "auxiliary_loss_clip": 0.01116501, + "auxiliary_loss_mlp": 0.01029081, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.04118764, + "epoch": 0.4434691116789418, + "flos": 16611989097600.0, + "grad_norm": 1.7856786314152562, + "language_loss": 0.83470213, + "learning_rate": 2.4591331377859578e-06, + "loss": 0.85615796, + "num_input_tokens_seen": 158196590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75390625, + "step": 7376, + "time_per_iteration": 2.481781482696533 + }, + { + "auxiliary_loss_clip": 0.01114604, + "auxiliary_loss_mlp": 0.01032944, + "balance_loss_clip": 1.02043331, + "balance_loss_mlp": 1.04121447, + "epoch": 0.4435292349316098, + "flos": 19063251912960.0, + "grad_norm": 1.7657537697851593, + "language_loss": 0.77321744, + "learning_rate": 2.4587540687695077e-06, + "loss": 0.79469293, + "num_input_tokens_seen": 158216355, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7377, + "time_per_iteration": 2.4599812030792236 + }, + { + "auxiliary_loss_clip": 0.01112621, + "auxiliary_loss_mlp": 0.0102944, + "balance_loss_clip": 1.01692927, + "balance_loss_mlp": 1.04132032, + "epoch": 0.44358935818427775, + "flos": 21251324799360.0, + "grad_norm": 1.8620341755948002, + "language_loss": 0.75641978, + "learning_rate": 2.458374982357057e-06, + "loss": 0.77784032, + "num_input_tokens_seen": 158235825, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7378, + "time_per_iteration": 2.5178849697113037 + }, + { + "auxiliary_loss_clip": 0.01114317, + "auxiliary_loss_mlp": 0.01035639, + "balance_loss_clip": 1.02302647, + "balance_loss_mlp": 1.04010391, + "epoch": 0.4436494814369457, + "flos": 12495298239360.0, + "grad_norm": 2.670150777415059, + "language_loss": 0.69005907, + "learning_rate": 2.457995878562982e-06, + "loss": 0.71155864, + "num_input_tokens_seen": 158254230, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7379, + "time_per_iteration": 2.460470199584961 + }, + { + "auxiliary_loss_clip": 0.01116042, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.04134107, + "epoch": 0.44370960468961373, + "flos": 23659853408640.0, + "grad_norm": 1.5614200394729, + "language_loss": 0.73110741, + "learning_rate": 2.457616757401656e-06, + "loss": 0.75256622, + "num_input_tokens_seen": 158273400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7380, + "time_per_iteration": 2.5134148597717285 + }, + { + "auxiliary_loss_clip": 0.0111454, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.0408597, + "epoch": 0.4437697279422817, + "flos": 32416849635840.0, + "grad_norm": 1.5217984285789272, + "language_loss": 0.6470772, + "learning_rate": 2.457237618887458e-06, + "loss": 0.66853309, + "num_input_tokens_seen": 158296840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7381, + "time_per_iteration": 2.5547850131988525 + }, + { + "auxiliary_loss_clip": 0.01116209, + "auxiliary_loss_mlp": 0.0103301, + "balance_loss_clip": 1.02020693, + "balance_loss_mlp": 1.04110599, + "epoch": 0.44382985119494966, + "flos": 18112875914880.0, + "grad_norm": 2.3862697145357394, + "language_loss": 0.8018291, + "learning_rate": 2.456858463034763e-06, + "loss": 0.82332134, + "num_input_tokens_seen": 158314935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75, + "step": 7382, + "time_per_iteration": 2.575241804122925 + }, + { + "auxiliary_loss_clip": 0.01118453, + "auxiliary_loss_mlp": 0.01039182, + "balance_loss_clip": 1.02631903, + "balance_loss_mlp": 1.04359293, + "epoch": 0.44388997444761763, + "flos": 30774151923840.0, + "grad_norm": 1.657830016653087, + "language_loss": 0.65369737, + "learning_rate": 2.456479289857949e-06, + "loss": 0.67527372, + "num_input_tokens_seen": 158334620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7383, + "time_per_iteration": 2.530205726623535 + }, + { + "auxiliary_loss_clip": 0.01118822, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.01928902, + "balance_loss_mlp": 1.04226518, + "epoch": 0.4439500977002856, + "flos": 20339157893760.0, + "grad_norm": 3.0329093562680023, + "language_loss": 0.75660288, + "learning_rate": 2.4561000993713953e-06, + "loss": 0.77811974, + "num_input_tokens_seen": 158350550, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.765625, + "step": 7384, + "time_per_iteration": 2.5266385078430176 + }, + { + "auxiliary_loss_clip": 0.01118828, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02092242, + "balance_loss_mlp": 1.04284334, + "epoch": 0.44401022095295356, + "flos": 20371225760640.0, + "grad_norm": 1.5666997146068944, + "language_loss": 0.81029254, + "learning_rate": 2.4557208915894796e-06, + "loss": 0.83182013, + "num_input_tokens_seen": 158369555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.76171875, + "step": 7385, + "time_per_iteration": 2.4479992389678955 + }, + { + "auxiliary_loss_clip": 0.01116566, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02111292, + "balance_loss_mlp": 1.04122996, + "epoch": 0.4440703442056215, + "flos": 20230635928320.0, + "grad_norm": 1.6468061831775258, + "language_loss": 0.82127023, + "learning_rate": 2.455341666526582e-06, + "loss": 0.84278667, + "num_input_tokens_seen": 158388045, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7386, + "time_per_iteration": 2.48417067527771 + }, + { + "auxiliary_loss_clip": 0.01120079, + "auxiliary_loss_mlp": 0.01037797, + "balance_loss_clip": 1.02320611, + "balance_loss_mlp": 1.04189587, + "epoch": 0.4441304674582895, + "flos": 39494698824960.0, + "grad_norm": 1.953099317045194, + "language_loss": 0.69732893, + "learning_rate": 2.4549624241970832e-06, + "loss": 0.71890771, + "num_input_tokens_seen": 158410115, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7387, + "time_per_iteration": 5.494876146316528 + }, + { + "auxiliary_loss_clip": 0.01114673, + "auxiliary_loss_mlp": 0.01038672, + "balance_loss_clip": 1.02546382, + "balance_loss_mlp": 1.03957582, + "epoch": 0.44419059071095746, + "flos": 14829671220480.0, + "grad_norm": 2.035383956259629, + "language_loss": 0.7170803, + "learning_rate": 2.4545831646153628e-06, + "loss": 0.73861378, + "num_input_tokens_seen": 158427765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7388, + "time_per_iteration": 2.4271323680877686 + }, + { + "auxiliary_loss_clip": 0.011178, + "auxiliary_loss_mlp": 0.01031227, + "balance_loss_clip": 1.01776195, + "balance_loss_mlp": 1.04137266, + "epoch": 0.4442507139636254, + "flos": 22637835734400.0, + "grad_norm": 1.4848855642281624, + "language_loss": 0.6881609, + "learning_rate": 2.4542038877958044e-06, + "loss": 0.70965117, + "num_input_tokens_seen": 158446375, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7389, + "time_per_iteration": 2.4847142696380615 + }, + { + "auxiliary_loss_clip": 0.01115516, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.04167664, + "epoch": 0.4443108372162934, + "flos": 38290721829120.0, + "grad_norm": 2.0051609497188587, + "language_loss": 0.74621141, + "learning_rate": 2.453824593752788e-06, + "loss": 0.76768672, + "num_input_tokens_seen": 158467260, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7390, + "time_per_iteration": 2.594834804534912 + }, + { + "auxiliary_loss_clip": 0.01116041, + "auxiliary_loss_mlp": 0.01033099, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.04296565, + "epoch": 0.44437096046896135, + "flos": 17748993185280.0, + "grad_norm": 2.702415761973985, + "language_loss": 0.811364, + "learning_rate": 2.4534452825006988e-06, + "loss": 0.83285546, + "num_input_tokens_seen": 158486720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7391, + "time_per_iteration": 2.4757862091064453 + }, + { + "auxiliary_loss_clip": 0.01116609, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02070808, + "balance_loss_mlp": 1.04341137, + "epoch": 0.4444310837216293, + "flos": 13732348682880.0, + "grad_norm": 1.6224407429556025, + "language_loss": 0.73400211, + "learning_rate": 2.4530659540539185e-06, + "loss": 0.75551033, + "num_input_tokens_seen": 158502530, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7392, + "time_per_iteration": 2.423929214477539 + }, + { + "auxiliary_loss_clip": 0.01113533, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01705766, + "balance_loss_mlp": 1.03988051, + "epoch": 0.44449120697429734, + "flos": 25010238240000.0, + "grad_norm": 1.5529830220947678, + "language_loss": 0.79523122, + "learning_rate": 2.4526866084268313e-06, + "loss": 0.81666124, + "num_input_tokens_seen": 158522715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7393, + "time_per_iteration": 2.5162272453308105 + }, + { + "auxiliary_loss_clip": 0.01119885, + "auxiliary_loss_mlp": 0.01034531, + "balance_loss_clip": 1.02125716, + "balance_loss_mlp": 1.04248941, + "epoch": 0.4445513302269653, + "flos": 32671707609600.0, + "grad_norm": 1.9165659224437794, + "language_loss": 0.8090415, + "learning_rate": 2.4523072456338226e-06, + "loss": 0.83058566, + "num_input_tokens_seen": 158543615, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7394, + "time_per_iteration": 2.5386714935302734 + }, + { + "auxiliary_loss_clip": 0.01114869, + "auxiliary_loss_mlp": 0.01039877, + "balance_loss_clip": 1.02772927, + "balance_loss_mlp": 1.04228508, + "epoch": 0.44461145347963327, + "flos": 11655814504320.0, + "grad_norm": 3.6807348725160502, + "language_loss": 0.79471326, + "learning_rate": 2.4519278656892785e-06, + "loss": 0.81626076, + "num_input_tokens_seen": 158560330, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7265625, + "step": 7395, + "time_per_iteration": 2.4668092727661133 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02162027, + "balance_loss_mlp": 1.04132056, + "epoch": 0.44467157673230123, + "flos": 20886759711360.0, + "grad_norm": 1.800276006342892, + "language_loss": 0.68493867, + "learning_rate": 2.451548468607584e-06, + "loss": 0.70642376, + "num_input_tokens_seen": 158579735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7396, + "time_per_iteration": 2.463660717010498 + }, + { + "auxiliary_loss_clip": 0.01117407, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.01831245, + "balance_loss_mlp": 1.0412426, + "epoch": 0.4447316999849692, + "flos": 18546137763840.0, + "grad_norm": 1.8246827609425533, + "language_loss": 0.81007254, + "learning_rate": 2.451169054403126e-06, + "loss": 0.83155811, + "num_input_tokens_seen": 158597075, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.76171875, + "step": 7397, + "time_per_iteration": 2.4812188148498535 + }, + { + "auxiliary_loss_clip": 0.01116158, + "auxiliary_loss_mlp": 0.01033503, + "balance_loss_clip": 1.02078366, + "balance_loss_mlp": 1.04323518, + "epoch": 0.44479182323763716, + "flos": 23769057732480.0, + "grad_norm": 1.7006854584246183, + "language_loss": 0.67145807, + "learning_rate": 2.450789623090293e-06, + "loss": 0.69295466, + "num_input_tokens_seen": 158616650, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7398, + "time_per_iteration": 2.5075526237487793 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036485, + "balance_loss_clip": 1.02443874, + "balance_loss_mlp": 1.04204428, + "epoch": 0.44485194649030513, + "flos": 16543831040640.0, + "grad_norm": 1.9000444103330927, + "language_loss": 0.69551516, + "learning_rate": 2.450410174683472e-06, + "loss": 0.71702719, + "num_input_tokens_seen": 158634515, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 7399, + "time_per_iteration": 2.522737741470337 + }, + { + "auxiliary_loss_clip": 0.01113023, + "auxiliary_loss_mlp": 0.01037037, + "balance_loss_clip": 1.02465105, + "balance_loss_mlp": 1.0408442, + "epoch": 0.4449120697429731, + "flos": 22600955445120.0, + "grad_norm": 1.713461165054691, + "language_loss": 0.7287724, + "learning_rate": 2.4500307091970514e-06, + "loss": 0.75027299, + "num_input_tokens_seen": 158653760, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7400, + "time_per_iteration": 2.4633662700653076 + }, + { + "auxiliary_loss_clip": 0.0111367, + "auxiliary_loss_mlp": 0.01030255, + "balance_loss_clip": 1.01755965, + "balance_loss_mlp": 1.04038024, + "epoch": 0.44497219299564106, + "flos": 20004864992640.0, + "grad_norm": 1.5216060200654076, + "language_loss": 0.85054708, + "learning_rate": 2.449651226645422e-06, + "loss": 0.87198627, + "num_input_tokens_seen": 158672190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7401, + "time_per_iteration": 2.5034339427948 + }, + { + "auxiliary_loss_clip": 0.01111761, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02213049, + "balance_loss_mlp": 1.04065824, + "epoch": 0.445032316248309, + "flos": 25594253470080.0, + "grad_norm": 1.696028331559664, + "language_loss": 0.83296156, + "learning_rate": 2.449271727042973e-06, + "loss": 0.85441685, + "num_input_tokens_seen": 158694115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7109375, + "step": 7402, + "time_per_iteration": 2.501981258392334 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01032303, + "balance_loss_clip": 1.01979768, + "balance_loss_mlp": 1.0420711, + "epoch": 0.445092439500977, + "flos": 21250426959360.0, + "grad_norm": 1.736524647333069, + "language_loss": 0.76953578, + "learning_rate": 2.4488922104040947e-06, + "loss": 0.7910167, + "num_input_tokens_seen": 158711000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7403, + "time_per_iteration": 2.4778058528900146 + }, + { + "auxiliary_loss_clip": 0.01038113, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00046086, + "balance_loss_mlp": 1.014925, + "epoch": 0.44515256275364495, + "flos": 57764900309760.0, + "grad_norm": 0.7475420058163609, + "language_loss": 0.60081208, + "learning_rate": 2.4485126767431793e-06, + "loss": 0.62121159, + "num_input_tokens_seen": 158769675, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.23242188, + "step": 7404, + "time_per_iteration": 3.0548532009124756 + }, + { + "auxiliary_loss_clip": 0.01118666, + "auxiliary_loss_mlp": 0.01035192, + "balance_loss_clip": 1.02225208, + "balance_loss_mlp": 1.04285121, + "epoch": 0.4452126860063129, + "flos": 15596004908160.0, + "grad_norm": 1.6312624429793499, + "language_loss": 0.81696916, + "learning_rate": 2.4481331260746177e-06, + "loss": 0.83850771, + "num_input_tokens_seen": 158788215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7405, + "time_per_iteration": 2.474632978439331 + }, + { + "auxiliary_loss_clip": 0.0111153, + "auxiliary_loss_mlp": 0.01031071, + "balance_loss_clip": 1.01864958, + "balance_loss_mlp": 1.03843176, + "epoch": 0.4452728092589809, + "flos": 21617398258560.0, + "grad_norm": 1.4258557139975254, + "language_loss": 0.74869186, + "learning_rate": 2.4477535584128036e-06, + "loss": 0.77011788, + "num_input_tokens_seen": 158809090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.73046875, + "step": 7406, + "time_per_iteration": 2.4767563343048096 + }, + { + "auxiliary_loss_clip": 0.01108887, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01837921, + "balance_loss_mlp": 1.03819203, + "epoch": 0.4453329325116489, + "flos": 29497491757440.0, + "grad_norm": 1.5627122296340765, + "language_loss": 0.65510803, + "learning_rate": 2.447373973772129e-06, + "loss": 0.67650282, + "num_input_tokens_seen": 158828320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 7407, + "time_per_iteration": 2.5395827293395996 + }, + { + "auxiliary_loss_clip": 0.01118546, + "auxiliary_loss_mlp": 0.01029122, + "balance_loss_clip": 1.01691461, + "balance_loss_mlp": 1.04306138, + "epoch": 0.44539305576431687, + "flos": 21361139654400.0, + "grad_norm": 1.5061477696527659, + "language_loss": 0.67724633, + "learning_rate": 2.4469943721669887e-06, + "loss": 0.69872296, + "num_input_tokens_seen": 158847040, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75390625, + "step": 7408, + "time_per_iteration": 2.462306261062622 + }, + { + "auxiliary_loss_clip": 0.0111265, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01891828, + "balance_loss_mlp": 1.0386107, + "epoch": 0.44545317901698483, + "flos": 41427626428800.0, + "grad_norm": 1.4978343447976226, + "language_loss": 0.71923941, + "learning_rate": 2.4466147536117776e-06, + "loss": 0.74068785, + "num_input_tokens_seen": 158870490, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7409, + "time_per_iteration": 2.674224615097046 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01034826, + "balance_loss_clip": 1.02100968, + "balance_loss_mlp": 1.03980279, + "epoch": 0.4455133022696528, + "flos": 22055005653120.0, + "grad_norm": 2.031581575195052, + "language_loss": 0.64823419, + "learning_rate": 2.4462351181208895e-06, + "loss": 0.66972494, + "num_input_tokens_seen": 158889920, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 7410, + "time_per_iteration": 2.524874687194824 + }, + { + "auxiliary_loss_clip": 0.01120724, + "auxiliary_loss_mlp": 0.01033754, + "balance_loss_clip": 1.0200448, + "balance_loss_mlp": 1.04309118, + "epoch": 0.44557342552232077, + "flos": 23476960333440.0, + "grad_norm": 2.015615502497161, + "language_loss": 0.74042189, + "learning_rate": 2.4458554657087217e-06, + "loss": 0.76196671, + "num_input_tokens_seen": 158909580, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.77734375, + "step": 7411, + "time_per_iteration": 2.512510061264038 + }, + { + "auxiliary_loss_clip": 0.01112773, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01900446, + "balance_loss_mlp": 1.04189968, + "epoch": 0.44563354877498873, + "flos": 19134678107520.0, + "grad_norm": 1.869475782048451, + "language_loss": 0.79242551, + "learning_rate": 2.4454757963896695e-06, + "loss": 0.81386662, + "num_input_tokens_seen": 158924600, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7412, + "time_per_iteration": 2.472858190536499 + }, + { + "auxiliary_loss_clip": 0.01114909, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02299762, + "balance_loss_mlp": 1.03920937, + "epoch": 0.4456936720276567, + "flos": 13621420506240.0, + "grad_norm": 3.400478569187806, + "language_loss": 0.798675, + "learning_rate": 2.4450961101781304e-06, + "loss": 0.82017869, + "num_input_tokens_seen": 158939345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7578125, + "step": 7413, + "time_per_iteration": 2.4117238521575928 + }, + { + "auxiliary_loss_clip": 0.01112114, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.01601171, + "balance_loss_mlp": 1.04039168, + "epoch": 0.44575379528032466, + "flos": 14713715139840.0, + "grad_norm": 1.7210919700182319, + "language_loss": 0.76510686, + "learning_rate": 2.4447164070885026e-06, + "loss": 0.7865088, + "num_input_tokens_seen": 158955855, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 7414, + "time_per_iteration": 2.460224151611328 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02064216, + "balance_loss_mlp": 1.04047227, + "epoch": 0.4458139185329926, + "flos": 24170682677760.0, + "grad_norm": 1.4395051245379855, + "language_loss": 0.83344847, + "learning_rate": 2.4443366871351837e-06, + "loss": 0.85491699, + "num_input_tokens_seen": 158976315, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7415, + "time_per_iteration": 2.487433910369873 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01039317, + "balance_loss_clip": 1.02675223, + "balance_loss_mlp": 1.03786182, + "epoch": 0.4458740417856606, + "flos": 21762225895680.0, + "grad_norm": 1.5295363489819147, + "language_loss": 0.84025514, + "learning_rate": 2.4439569503325732e-06, + "loss": 0.86175931, + "num_input_tokens_seen": 158996725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 7416, + "time_per_iteration": 2.4827380180358887 + }, + { + "auxiliary_loss_clip": 0.0111513, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01872349, + "balance_loss_mlp": 1.03937066, + "epoch": 0.44593416503832856, + "flos": 21068790860160.0, + "grad_norm": 1.5840815969934987, + "language_loss": 0.8099134, + "learning_rate": 2.4435771966950706e-06, + "loss": 0.83138216, + "num_input_tokens_seen": 159017255, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7417, + "time_per_iteration": 2.48150897026062 + }, + { + "auxiliary_loss_clip": 0.01115498, + "auxiliary_loss_mlp": 0.01039656, + "balance_loss_clip": 1.02679276, + "balance_loss_mlp": 1.04055572, + "epoch": 0.4459942882909965, + "flos": 22600488568320.0, + "grad_norm": 1.9543176040955477, + "language_loss": 0.81078619, + "learning_rate": 2.443197426237077e-06, + "loss": 0.83233768, + "num_input_tokens_seen": 159035010, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7418, + "time_per_iteration": 2.489847421646118 + }, + { + "auxiliary_loss_clip": 0.0111452, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01647544, + "balance_loss_mlp": 1.04015303, + "epoch": 0.4460544115436645, + "flos": 26505486622080.0, + "grad_norm": 1.586204851514133, + "language_loss": 0.77404898, + "learning_rate": 2.442817638972991e-06, + "loss": 0.79548573, + "num_input_tokens_seen": 159055345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7419, + "time_per_iteration": 2.497434377670288 + }, + { + "auxiliary_loss_clip": 0.01112333, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.02190208, + "balance_loss_mlp": 1.03983605, + "epoch": 0.4461145347963325, + "flos": 17604021893760.0, + "grad_norm": 1.7862585645473121, + "language_loss": 0.72408056, + "learning_rate": 2.4424378349172176e-06, + "loss": 0.74554545, + "num_input_tokens_seen": 159074225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 7420, + "time_per_iteration": 2.459458351135254 + }, + { + "auxiliary_loss_clip": 0.01113499, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.0166688, + "balance_loss_mlp": 1.0416131, + "epoch": 0.44617465804900047, + "flos": 27268193036160.0, + "grad_norm": 1.6779849239209732, + "language_loss": 0.75009704, + "learning_rate": 2.442058014084156e-06, + "loss": 0.77153254, + "num_input_tokens_seen": 159095415, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 7421, + "time_per_iteration": 2.51987624168396 + }, + { + "auxiliary_loss_clip": 0.01110345, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02002299, + "balance_loss_mlp": 1.04095602, + "epoch": 0.44623478130166844, + "flos": 17786412178560.0, + "grad_norm": 1.9054244397804427, + "language_loss": 0.76410532, + "learning_rate": 2.44167817648821e-06, + "loss": 0.78553158, + "num_input_tokens_seen": 159114615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 7422, + "time_per_iteration": 2.4755024909973145 + }, + { + "auxiliary_loss_clip": 0.0111206, + "auxiliary_loss_mlp": 0.01031918, + "balance_loss_clip": 1.01975894, + "balance_loss_mlp": 1.03931499, + "epoch": 0.4462949045543364, + "flos": 23003011353600.0, + "grad_norm": 1.4448000656244153, + "language_loss": 0.65126681, + "learning_rate": 2.441298322143784e-06, + "loss": 0.6727066, + "num_input_tokens_seen": 159134370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7265625, + "step": 7423, + "time_per_iteration": 2.4828243255615234 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.01028687, + "balance_loss_clip": 1.01719534, + "balance_loss_mlp": 1.04027271, + "epoch": 0.44635502780700437, + "flos": 17820096157440.0, + "grad_norm": 1.510185037273786, + "language_loss": 0.78842837, + "learning_rate": 2.4409184510652807e-06, + "loss": 0.80981761, + "num_input_tokens_seen": 159152540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 7424, + "time_per_iteration": 2.4399938583374023 + }, + { + "auxiliary_loss_clip": 0.01111318, + "auxiliary_loss_mlp": 0.01032011, + "balance_loss_clip": 1.02010214, + "balance_loss_mlp": 1.04070699, + "epoch": 0.44641515105967233, + "flos": 26688020561280.0, + "grad_norm": 1.3563203456934205, + "language_loss": 0.80225039, + "learning_rate": 2.4405385632671063e-06, + "loss": 0.82368374, + "num_input_tokens_seen": 159173425, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 7425, + "time_per_iteration": 2.5406088829040527 + }, + { + "auxiliary_loss_clip": 0.01111697, + "auxiliary_loss_mlp": 0.01031502, + "balance_loss_clip": 1.0190568, + "balance_loss_mlp": 1.04027843, + "epoch": 0.4464752743123403, + "flos": 18913324544640.0, + "grad_norm": 2.6114514678489895, + "language_loss": 0.77294517, + "learning_rate": 2.4401586587636655e-06, + "loss": 0.79437709, + "num_input_tokens_seen": 159191210, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7426, + "time_per_iteration": 2.4845876693725586 + }, + { + "auxiliary_loss_clip": 0.01112123, + "auxiliary_loss_mlp": 0.01028013, + "balance_loss_clip": 1.01636636, + "balance_loss_mlp": 1.03881311, + "epoch": 0.44653539756500826, + "flos": 29570318582400.0, + "grad_norm": 1.552934875151276, + "language_loss": 0.64668226, + "learning_rate": 2.4397787375693634e-06, + "loss": 0.66808361, + "num_input_tokens_seen": 159211755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.734375, + "step": 7427, + "time_per_iteration": 2.540630340576172 + }, + { + "auxiliary_loss_clip": 0.01116984, + "auxiliary_loss_mlp": 0.01032083, + "balance_loss_clip": 1.02009046, + "balance_loss_mlp": 1.04497719, + "epoch": 0.44659552081767623, + "flos": 21468979261440.0, + "grad_norm": 1.583763048167789, + "language_loss": 0.75103819, + "learning_rate": 2.439398799698608e-06, + "loss": 0.77252889, + "num_input_tokens_seen": 159230315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7428, + "time_per_iteration": 3.8718421459198 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01032965, + "balance_loss_clip": 1.0205195, + "balance_loss_mlp": 1.03955674, + "epoch": 0.4466556440703442, + "flos": 17931886260480.0, + "grad_norm": 1.8476152433667956, + "language_loss": 0.77595931, + "learning_rate": 2.439018845165806e-06, + "loss": 0.79740107, + "num_input_tokens_seen": 159249810, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7429, + "time_per_iteration": 5.381062984466553 + }, + { + "auxiliary_loss_clip": 0.01114674, + "auxiliary_loss_mlp": 0.01032028, + "balance_loss_clip": 1.01935029, + "balance_loss_mlp": 1.04038692, + "epoch": 0.44671576732301216, + "flos": 21107430915840.0, + "grad_norm": 1.5332211966047418, + "language_loss": 0.91229695, + "learning_rate": 2.438638873985366e-06, + "loss": 0.93376398, + "num_input_tokens_seen": 159271715, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7430, + "time_per_iteration": 2.4677700996398926 + }, + { + "auxiliary_loss_clip": 0.0111698, + "auxiliary_loss_mlp": 0.01038071, + "balance_loss_clip": 1.02439737, + "balance_loss_mlp": 1.04052413, + "epoch": 0.4467758905756801, + "flos": 23508920459520.0, + "grad_norm": 1.5443417480404311, + "language_loss": 0.79630744, + "learning_rate": 2.4382588861716954e-06, + "loss": 0.81785798, + "num_input_tokens_seen": 159290690, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.765625, + "step": 7431, + "time_per_iteration": 2.567082405090332 + }, + { + "auxiliary_loss_clip": 0.0111745, + "auxiliary_loss_mlp": 0.01037244, + "balance_loss_clip": 1.02438116, + "balance_loss_mlp": 1.04187393, + "epoch": 0.4468360138283481, + "flos": 18734022829440.0, + "grad_norm": 2.0676923701008807, + "language_loss": 0.80376756, + "learning_rate": 2.437878881739204e-06, + "loss": 0.82531446, + "num_input_tokens_seen": 159309400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75390625, + "step": 7432, + "time_per_iteration": 2.4359145164489746 + }, + { + "auxiliary_loss_clip": 0.01115042, + "auxiliary_loss_mlp": 0.01036362, + "balance_loss_clip": 1.02394009, + "balance_loss_mlp": 1.03957176, + "epoch": 0.4468961370810161, + "flos": 23477139901440.0, + "grad_norm": 2.022128912320156, + "language_loss": 0.76601076, + "learning_rate": 2.437498860702301e-06, + "loss": 0.78752482, + "num_input_tokens_seen": 159327425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.75390625, + "step": 7433, + "time_per_iteration": 2.48732852935791 + }, + { + "auxiliary_loss_clip": 0.0110862, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.0233326, + "balance_loss_mlp": 1.03873658, + "epoch": 0.4469562603336841, + "flos": 30075042539520.0, + "grad_norm": 1.6660023236153727, + "language_loss": 0.7773807, + "learning_rate": 2.437118823075398e-06, + "loss": 0.79880381, + "num_input_tokens_seen": 159345805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.69921875, + "step": 7434, + "time_per_iteration": 2.501410961151123 + }, + { + "auxiliary_loss_clip": 0.01117105, + "auxiliary_loss_mlp": 0.01034097, + "balance_loss_clip": 1.02160966, + "balance_loss_mlp": 1.04261708, + "epoch": 0.44701638358635204, + "flos": 22456415116800.0, + "grad_norm": 1.6324454169441744, + "language_loss": 0.64255738, + "learning_rate": 2.436738768872905e-06, + "loss": 0.66406941, + "num_input_tokens_seen": 159364595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7435, + "time_per_iteration": 2.506918430328369 + }, + { + "auxiliary_loss_clip": 0.01116438, + "auxiliary_loss_mlp": 0.01029509, + "balance_loss_clip": 1.01706398, + "balance_loss_mlp": 1.04181314, + "epoch": 0.44707650683902, + "flos": 24057851080320.0, + "grad_norm": 1.4705490989927619, + "language_loss": 0.83558768, + "learning_rate": 2.4363586981092346e-06, + "loss": 0.8570472, + "num_input_tokens_seen": 159385265, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.74609375, + "step": 7436, + "time_per_iteration": 2.482273817062378 + }, + { + "auxiliary_loss_clip": 0.01114793, + "auxiliary_loss_mlp": 0.01033883, + "balance_loss_clip": 1.02067423, + "balance_loss_mlp": 1.0400939, + "epoch": 0.44713663009168797, + "flos": 23766938830080.0, + "grad_norm": 1.6782401052542175, + "language_loss": 0.79564971, + "learning_rate": 2.435978610798798e-06, + "loss": 0.81713653, + "num_input_tokens_seen": 159405080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7437, + "time_per_iteration": 2.519118309020996 + }, + { + "auxiliary_loss_clip": 0.01114275, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.03965664, + "epoch": 0.44719675334435594, + "flos": 24499265316480.0, + "grad_norm": 1.5877629147247494, + "language_loss": 0.71921134, + "learning_rate": 2.435598506956009e-06, + "loss": 0.74067998, + "num_input_tokens_seen": 159424595, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.74609375, + "step": 7438, + "time_per_iteration": 2.4918689727783203 + }, + { + "auxiliary_loss_clip": 0.01114196, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02114832, + "balance_loss_mlp": 1.03908634, + "epoch": 0.4472568765970239, + "flos": 29781759991680.0, + "grad_norm": 1.558408845854645, + "language_loss": 0.67469549, + "learning_rate": 2.4352183865952808e-06, + "loss": 0.6961813, + "num_input_tokens_seen": 159443865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7439, + "time_per_iteration": 2.549445390701294 + }, + { + "auxiliary_loss_clip": 0.01116567, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.0218277, + "balance_loss_mlp": 1.04164815, + "epoch": 0.44731699984969187, + "flos": 24643123286400.0, + "grad_norm": 1.6525243551580215, + "language_loss": 0.73600596, + "learning_rate": 2.4348382497310285e-06, + "loss": 0.7575227, + "num_input_tokens_seen": 159464525, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.75, + "step": 7440, + "time_per_iteration": 2.487545967102051 + }, + { + "auxiliary_loss_clip": 0.01112285, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02215195, + "balance_loss_mlp": 1.03937638, + "epoch": 0.44737712310235983, + "flos": 29455691304960.0, + "grad_norm": 1.5916362290459067, + "language_loss": 0.74376386, + "learning_rate": 2.4344580963776655e-06, + "loss": 0.76522732, + "num_input_tokens_seen": 159486385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73046875, + "step": 7441, + "time_per_iteration": 2.537848472595215 + }, + { + "auxiliary_loss_clip": 0.01116121, + "auxiliary_loss_mlp": 0.01039305, + "balance_loss_clip": 1.02585804, + "balance_loss_mlp": 1.04112506, + "epoch": 0.4474372463550278, + "flos": 24896832024960.0, + "grad_norm": 2.062950208020596, + "language_loss": 0.74780977, + "learning_rate": 2.4340779265496082e-06, + "loss": 0.769364, + "num_input_tokens_seen": 159503880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7442, + "time_per_iteration": 2.45829701423645 + }, + { + "auxiliary_loss_clip": 0.0111647, + "auxiliary_loss_mlp": 0.01034752, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.03977489, + "epoch": 0.44749736960769576, + "flos": 33181603125120.0, + "grad_norm": 1.7358505546612006, + "language_loss": 0.7456758, + "learning_rate": 2.433697740261273e-06, + "loss": 0.76718801, + "num_input_tokens_seen": 159522980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7443, + "time_per_iteration": 2.604759931564331 + }, + { + "auxiliary_loss_clip": 0.01111225, + "auxiliary_loss_mlp": 0.01028498, + "balance_loss_clip": 1.01500916, + "balance_loss_mlp": 1.0379262, + "epoch": 0.4475574928603637, + "flos": 21071807602560.0, + "grad_norm": 1.8898561004653542, + "language_loss": 0.77591091, + "learning_rate": 2.4333175375270748e-06, + "loss": 0.79730821, + "num_input_tokens_seen": 159543340, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7444, + "time_per_iteration": 2.5373945236206055 + }, + { + "auxiliary_loss_clip": 0.01110179, + "auxiliary_loss_mlp": 0.01030626, + "balance_loss_clip": 1.01813924, + "balance_loss_mlp": 1.03841698, + "epoch": 0.4476176161130317, + "flos": 21862523646720.0, + "grad_norm": 2.3020631966175893, + "language_loss": 0.85495317, + "learning_rate": 2.4329373183614333e-06, + "loss": 0.87636125, + "num_input_tokens_seen": 159558210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7445, + "time_per_iteration": 2.4707260131835938 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01741982, + "balance_loss_mlp": 1.04191256, + "epoch": 0.4476777393656997, + "flos": 22528667324160.0, + "grad_norm": 3.672789877680737, + "language_loss": 0.64349431, + "learning_rate": 2.432557082778765e-06, + "loss": 0.66496813, + "num_input_tokens_seen": 159577920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.74609375, + "step": 7446, + "time_per_iteration": 2.4802255630493164 + }, + { + "auxiliary_loss_clip": 0.0103814, + "auxiliary_loss_mlp": 0.0100263, + "balance_loss_clip": 1.00128329, + "balance_loss_mlp": 1.01421368, + "epoch": 0.4477378626183677, + "flos": 49017133877760.0, + "grad_norm": 0.7477421339074387, + "language_loss": 0.50242257, + "learning_rate": 2.4321768307934884e-06, + "loss": 0.52283025, + "num_input_tokens_seen": 159632295, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.24023438, + "step": 7447, + "time_per_iteration": 2.9262073040008545 + }, + { + "auxiliary_loss_clip": 0.01037975, + "auxiliary_loss_mlp": 0.01002161, + "balance_loss_clip": 1.00088537, + "balance_loss_mlp": 1.01407075, + "epoch": 0.44779798587103564, + "flos": 56542179392640.0, + "grad_norm": 0.7583700928831021, + "language_loss": 0.59290731, + "learning_rate": 2.4317965624200235e-06, + "loss": 0.61330867, + "num_input_tokens_seen": 159698435, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.23925781, + "step": 7448, + "time_per_iteration": 3.2298059463500977 + }, + { + "auxiliary_loss_clip": 0.01112419, + "auxiliary_loss_mlp": 0.01032837, + "balance_loss_clip": 1.02082074, + "balance_loss_mlp": 1.03913987, + "epoch": 0.4478581091237036, + "flos": 46498536040320.0, + "grad_norm": 1.4697324100578784, + "language_loss": 0.59226847, + "learning_rate": 2.431416277672789e-06, + "loss": 0.61372101, + "num_input_tokens_seen": 159722150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7449, + "time_per_iteration": 2.667651891708374 + }, + { + "auxiliary_loss_clip": 0.01114654, + "auxiliary_loss_mlp": 0.01028568, + "balance_loss_clip": 1.01638436, + "balance_loss_mlp": 1.04082561, + "epoch": 0.4479182323763716, + "flos": 20814363849600.0, + "grad_norm": 1.6912833904949394, + "language_loss": 0.79799938, + "learning_rate": 2.4310359765662065e-06, + "loss": 0.8194316, + "num_input_tokens_seen": 159740550, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73828125, + "step": 7450, + "time_per_iteration": 2.488041400909424 + }, + { + "auxiliary_loss_clip": 0.01112446, + "auxiliary_loss_mlp": 0.01031639, + "balance_loss_clip": 1.01900911, + "balance_loss_mlp": 1.03948057, + "epoch": 0.44797835562903954, + "flos": 14245979212800.0, + "grad_norm": 2.443005371711525, + "language_loss": 0.79474008, + "learning_rate": 2.430655659114697e-06, + "loss": 0.81618094, + "num_input_tokens_seen": 159758245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 7451, + "time_per_iteration": 2.4184346199035645 + }, + { + "auxiliary_loss_clip": 0.01037194, + "auxiliary_loss_mlp": 0.01000693, + "balance_loss_clip": 0.99944174, + "balance_loss_mlp": 1.01323009, + "epoch": 0.4480384788817075, + "flos": 63534560169600.0, + "grad_norm": 2.1611139577707608, + "language_loss": 0.62848771, + "learning_rate": 2.430275325332681e-06, + "loss": 0.64886659, + "num_input_tokens_seen": 159826790, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24023438, + "step": 7452, + "time_per_iteration": 3.1637966632843018 + }, + { + "auxiliary_loss_clip": 0.01115495, + "auxiliary_loss_mlp": 0.01036415, + "balance_loss_clip": 1.0227952, + "balance_loss_mlp": 1.04087877, + "epoch": 0.44809860213437547, + "flos": 21652626522240.0, + "grad_norm": 1.7752989444397396, + "language_loss": 0.62657529, + "learning_rate": 2.429894975234582e-06, + "loss": 0.64809442, + "num_input_tokens_seen": 159845805, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 7453, + "time_per_iteration": 2.4473493099212646 + }, + { + "auxiliary_loss_clip": 0.01036714, + "auxiliary_loss_mlp": 0.01000711, + "balance_loss_clip": 0.99935836, + "balance_loss_mlp": 1.01265335, + "epoch": 0.44815872538704343, + "flos": 69190634246400.0, + "grad_norm": 0.7532005340797263, + "language_loss": 0.57028639, + "learning_rate": 2.4295146088348224e-06, + "loss": 0.59066069, + "num_input_tokens_seen": 159898860, + "router_z_loss_clip": 0.0135498, + "router_z_loss_mlp": 0.24023438, + "step": 7454, + "time_per_iteration": 2.9524526596069336 + }, + { + "auxiliary_loss_clip": 0.01111502, + "auxiliary_loss_mlp": 0.01027601, + "balance_loss_clip": 1.01563811, + "balance_loss_mlp": 1.03850055, + "epoch": 0.4482188486397114, + "flos": 12598289510400.0, + "grad_norm": 2.2509965352428334, + "language_loss": 0.75078607, + "learning_rate": 2.4291342261478255e-06, + "loss": 0.7721771, + "num_input_tokens_seen": 159911555, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.73046875, + "step": 7455, + "time_per_iteration": 2.4103891849517822 + }, + { + "auxiliary_loss_clip": 0.01112978, + "auxiliary_loss_mlp": 0.01029679, + "balance_loss_clip": 1.01761508, + "balance_loss_mlp": 1.03976846, + "epoch": 0.44827897189237936, + "flos": 34058182631040.0, + "grad_norm": 1.6579032105665654, + "language_loss": 0.76428723, + "learning_rate": 2.428753827188016e-06, + "loss": 0.78571379, + "num_input_tokens_seen": 159931470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.734375, + "step": 7456, + "time_per_iteration": 2.5631935596466064 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01035659, + "balance_loss_clip": 1.02398849, + "balance_loss_mlp": 1.04312015, + "epoch": 0.44833909514504733, + "flos": 25147416280320.0, + "grad_norm": 1.9831255862845865, + "language_loss": 0.76475745, + "learning_rate": 2.428373411969818e-06, + "loss": 0.78626615, + "num_input_tokens_seen": 159946115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.71875, + "step": 7457, + "time_per_iteration": 2.464808702468872 + }, + { + "auxiliary_loss_clip": 0.01113345, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01611805, + "balance_loss_mlp": 1.03910387, + "epoch": 0.4483992183977153, + "flos": 16179984224640.0, + "grad_norm": 1.9767465188311044, + "language_loss": 0.67705971, + "learning_rate": 2.4279929805076576e-06, + "loss": 0.69848609, + "num_input_tokens_seen": 159963915, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7458, + "time_per_iteration": 2.4457101821899414 + }, + { + "auxiliary_loss_clip": 0.01116638, + "auxiliary_loss_mlp": 0.01031464, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.04051626, + "epoch": 0.44845934165038326, + "flos": 17746048270080.0, + "grad_norm": 1.5619796593676711, + "language_loss": 0.72202468, + "learning_rate": 2.427612532815961e-06, + "loss": 0.74350572, + "num_input_tokens_seen": 159982140, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7459, + "time_per_iteration": 2.433029890060425 + }, + { + "auxiliary_loss_clip": 0.0110945, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01676071, + "balance_loss_mlp": 1.03716815, + "epoch": 0.4485194649030513, + "flos": 21835914647040.0, + "grad_norm": 1.8000530949283695, + "language_loss": 0.69520539, + "learning_rate": 2.427232068909154e-06, + "loss": 0.71659082, + "num_input_tokens_seen": 160002280, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7460, + "time_per_iteration": 2.4872210025787354 + }, + { + "auxiliary_loss_clip": 0.01111602, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02144051, + "balance_loss_mlp": 1.03848231, + "epoch": 0.44857958815571924, + "flos": 20084515401600.0, + "grad_norm": 1.9864484577730697, + "language_loss": 0.77204525, + "learning_rate": 2.4268515888016635e-06, + "loss": 0.79350454, + "num_input_tokens_seen": 160020260, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7461, + "time_per_iteration": 2.455543279647827 + }, + { + "auxiliary_loss_clip": 0.01111999, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.0180943, + "balance_loss_mlp": 1.03780031, + "epoch": 0.4486397114083872, + "flos": 27053519402880.0, + "grad_norm": 1.7106561387980361, + "language_loss": 0.67983574, + "learning_rate": 2.4264710925079184e-06, + "loss": 0.70125341, + "num_input_tokens_seen": 160040240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.7421875, + "step": 7462, + "time_per_iteration": 2.5366299152374268 + }, + { + "auxiliary_loss_clip": 0.01034999, + "auxiliary_loss_mlp": 0.01002003, + "balance_loss_clip": 1.00071561, + "balance_loss_mlp": 1.01134682, + "epoch": 0.4486998346610552, + "flos": 67321195931520.0, + "grad_norm": 0.7463947253576576, + "language_loss": 0.54503644, + "learning_rate": 2.4260905800423462e-06, + "loss": 0.56540644, + "num_input_tokens_seen": 160093865, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.23632812, + "step": 7463, + "time_per_iteration": 3.0639255046844482 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01029704, + "balance_loss_clip": 1.01699638, + "balance_loss_mlp": 1.03847826, + "epoch": 0.44875995791372314, + "flos": 27636816360960.0, + "grad_norm": 1.9527582175804243, + "language_loss": 0.75866246, + "learning_rate": 2.4257100514193775e-06, + "loss": 0.78006899, + "num_input_tokens_seen": 160113590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7464, + "time_per_iteration": 2.5135347843170166 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02225554, + "balance_loss_mlp": 1.03903246, + "epoch": 0.4488200811663911, + "flos": 13005947940480.0, + "grad_norm": 1.8117694427226085, + "language_loss": 0.73671377, + "learning_rate": 2.425329506653441e-06, + "loss": 0.75814927, + "num_input_tokens_seen": 160131795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 7465, + "time_per_iteration": 2.433394432067871 + }, + { + "auxiliary_loss_clip": 0.01118642, + "auxiliary_loss_mlp": 0.01035747, + "balance_loss_clip": 1.02193666, + "balance_loss_mlp": 1.04127038, + "epoch": 0.44888020441905907, + "flos": 27489977562240.0, + "grad_norm": 1.824586312100338, + "language_loss": 0.7996276, + "learning_rate": 2.424948945758966e-06, + "loss": 0.82117152, + "num_input_tokens_seen": 160150635, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7734375, + "step": 7466, + "time_per_iteration": 2.5013458728790283 + }, + { + "auxiliary_loss_clip": 0.01114545, + "auxiliary_loss_mlp": 0.01031895, + "balance_loss_clip": 1.01967633, + "balance_loss_mlp": 1.04118383, + "epoch": 0.44894032767172704, + "flos": 18259678800000.0, + "grad_norm": 2.612382799524426, + "language_loss": 0.80522013, + "learning_rate": 2.4245683687503844e-06, + "loss": 0.82668447, + "num_input_tokens_seen": 160168615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7467, + "time_per_iteration": 2.4517929553985596 + }, + { + "auxiliary_loss_clip": 0.01109457, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03988719, + "epoch": 0.449000450924395, + "flos": 21579835610880.0, + "grad_norm": 1.7208509955346651, + "language_loss": 0.75153285, + "learning_rate": 2.424187775642129e-06, + "loss": 0.7729429, + "num_input_tokens_seen": 160187295, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7468, + "time_per_iteration": 2.4585771560668945 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01025298, + "balance_loss_clip": 1.01422918, + "balance_loss_mlp": 1.04034877, + "epoch": 0.44906057417706297, + "flos": 17967904623360.0, + "grad_norm": 1.8721286685005696, + "language_loss": 0.7099303, + "learning_rate": 2.4238071664486297e-06, + "loss": 0.73130596, + "num_input_tokens_seen": 160205115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.71875, + "step": 7469, + "time_per_iteration": 2.420208692550659 + }, + { + "auxiliary_loss_clip": 0.01114048, + "auxiliary_loss_mlp": 0.01035745, + "balance_loss_clip": 1.02298415, + "balance_loss_mlp": 1.04046845, + "epoch": 0.44912069742973093, + "flos": 20047347803520.0, + "grad_norm": 1.7828692415308351, + "language_loss": 0.71891844, + "learning_rate": 2.4234265411843203e-06, + "loss": 0.74041635, + "num_input_tokens_seen": 160222580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7470, + "time_per_iteration": 5.381145477294922 + }, + { + "auxiliary_loss_clip": 0.01112344, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.01940536, + "balance_loss_mlp": 1.03871441, + "epoch": 0.4491808206823989, + "flos": 21033526682880.0, + "grad_norm": 2.1026178485463274, + "language_loss": 0.76912111, + "learning_rate": 2.423045899863634e-06, + "loss": 0.79056853, + "num_input_tokens_seen": 160241520, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7471, + "time_per_iteration": 3.925541400909424 + }, + { + "auxiliary_loss_clip": 0.01113353, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.02128363, + "balance_loss_mlp": 1.04100883, + "epoch": 0.44924094393506686, + "flos": 22967136645120.0, + "grad_norm": 1.8719894830330126, + "language_loss": 0.70339048, + "learning_rate": 2.4226652425010048e-06, + "loss": 0.72485489, + "num_input_tokens_seen": 160261815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7472, + "time_per_iteration": 2.5138602256774902 + }, + { + "auxiliary_loss_clip": 0.01038244, + "auxiliary_loss_mlp": 0.01015151, + "balance_loss_clip": 1.01388156, + "balance_loss_mlp": 1.01404762, + "epoch": 0.4493010671877349, + "flos": 59233467864960.0, + "grad_norm": 0.7429949026472541, + "language_loss": 0.61734539, + "learning_rate": 2.4222845691108676e-06, + "loss": 0.63787931, + "num_input_tokens_seen": 160317070, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2421875, + "step": 7473, + "time_per_iteration": 3.0049262046813965 + }, + { + "auxiliary_loss_clip": 0.01114767, + "auxiliary_loss_mlp": 0.01037887, + "balance_loss_clip": 1.02495253, + "balance_loss_mlp": 1.04087818, + "epoch": 0.44936119044040285, + "flos": 18004892653440.0, + "grad_norm": 2.4001000632965828, + "language_loss": 0.78185022, + "learning_rate": 2.421903879707657e-06, + "loss": 0.80337679, + "num_input_tokens_seen": 160334980, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7474, + "time_per_iteration": 2.4396324157714844 + }, + { + "auxiliary_loss_clip": 0.01110455, + "auxiliary_loss_mlp": 0.01034912, + "balance_loss_clip": 1.0225265, + "balance_loss_mlp": 1.04009926, + "epoch": 0.4494213136930708, + "flos": 21251827589760.0, + "grad_norm": 1.704620828516005, + "language_loss": 0.72103465, + "learning_rate": 2.4215231743058086e-06, + "loss": 0.74248827, + "num_input_tokens_seen": 160354500, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7475, + "time_per_iteration": 2.464167356491089 + }, + { + "auxiliary_loss_clip": 0.01112269, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01847768, + "balance_loss_mlp": 1.03917694, + "epoch": 0.4494814369457388, + "flos": 27418695022080.0, + "grad_norm": 1.7869016250475191, + "language_loss": 0.76343799, + "learning_rate": 2.4211424529197594e-06, + "loss": 0.7848621, + "num_input_tokens_seen": 160373650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.73046875, + "step": 7476, + "time_per_iteration": 2.529374837875366 + }, + { + "auxiliary_loss_clip": 0.01116691, + "auxiliary_loss_mlp": 0.010361, + "balance_loss_clip": 1.02194357, + "balance_loss_mlp": 1.04036331, + "epoch": 0.44954156019840674, + "flos": 22854053652480.0, + "grad_norm": 2.3312494175836034, + "language_loss": 0.71774453, + "learning_rate": 2.4207617155639464e-06, + "loss": 0.73927242, + "num_input_tokens_seen": 160393430, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.765625, + "step": 7477, + "time_per_iteration": 2.4914534091949463 + }, + { + "auxiliary_loss_clip": 0.01116651, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01757061, + "balance_loss_mlp": 1.04089749, + "epoch": 0.4496016834510747, + "flos": 17201570935680.0, + "grad_norm": 2.2338487326584073, + "language_loss": 0.68136394, + "learning_rate": 2.4203809622528062e-06, + "loss": 0.70283794, + "num_input_tokens_seen": 160410545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7478, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.01112091, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02097332, + "balance_loss_mlp": 1.04130244, + "epoch": 0.4496618067037427, + "flos": 18916628595840.0, + "grad_norm": 1.8288012816153718, + "language_loss": 0.89528286, + "learning_rate": 2.420000193000779e-06, + "loss": 0.91673213, + "num_input_tokens_seen": 160428105, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 7479, + "time_per_iteration": 2.4738242626190186 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01970804, + "balance_loss_mlp": 1.0423162, + "epoch": 0.44972192995641064, + "flos": 21031659175680.0, + "grad_norm": 2.1133613410879155, + "language_loss": 0.75824946, + "learning_rate": 2.419619407822302e-06, + "loss": 0.77972436, + "num_input_tokens_seen": 160448815, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 7480, + "time_per_iteration": 2.536190986633301 + }, + { + "auxiliary_loss_clip": 0.01116796, + "auxiliary_loss_mlp": 0.01031783, + "balance_loss_clip": 1.01906347, + "balance_loss_mlp": 1.04211199, + "epoch": 0.4497820532090786, + "flos": 20777088510720.0, + "grad_norm": 2.1813635775429794, + "language_loss": 0.80066407, + "learning_rate": 2.419238606731815e-06, + "loss": 0.82214987, + "num_input_tokens_seen": 160465940, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7481, + "time_per_iteration": 2.4618031978607178 + }, + { + "auxiliary_loss_clip": 0.01110042, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01809597, + "balance_loss_mlp": 1.04028749, + "epoch": 0.44984217646174657, + "flos": 33802606385280.0, + "grad_norm": 1.5995355023246276, + "language_loss": 0.68636084, + "learning_rate": 2.418857789743758e-06, + "loss": 0.70776993, + "num_input_tokens_seen": 160486710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 7482, + "time_per_iteration": 2.5711851119995117 + }, + { + "auxiliary_loss_clip": 0.0111451, + "auxiliary_loss_mlp": 0.01035168, + "balance_loss_clip": 1.02260911, + "balance_loss_mlp": 1.04059076, + "epoch": 0.44990229971441453, + "flos": 15518365660800.0, + "grad_norm": 2.0339843826279504, + "language_loss": 0.84802616, + "learning_rate": 2.418476956872571e-06, + "loss": 0.86952293, + "num_input_tokens_seen": 160503405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7483, + "time_per_iteration": 2.4510746002197266 + }, + { + "auxiliary_loss_clip": 0.01120092, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02177286, + "balance_loss_mlp": 1.04386485, + "epoch": 0.4499624229670825, + "flos": 29861913191040.0, + "grad_norm": 1.8187080510096723, + "language_loss": 0.80409968, + "learning_rate": 2.4180961081326967e-06, + "loss": 0.82564819, + "num_input_tokens_seen": 160525080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.765625, + "step": 7484, + "time_per_iteration": 2.539834976196289 + }, + { + "auxiliary_loss_clip": 0.01118118, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01529098, + "balance_loss_mlp": 1.03992271, + "epoch": 0.45002254621975046, + "flos": 18513674847360.0, + "grad_norm": 2.310143901315373, + "language_loss": 0.75594473, + "learning_rate": 2.4177152435385754e-06, + "loss": 0.77741385, + "num_input_tokens_seen": 160540895, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.78125, + "step": 7485, + "time_per_iteration": 2.408979892730713 + }, + { + "auxiliary_loss_clip": 0.01041505, + "auxiliary_loss_mlp": 0.01002218, + "balance_loss_clip": 1.00065601, + "balance_loss_mlp": 1.0170331, + "epoch": 0.4500826694724185, + "flos": 70420394229120.0, + "grad_norm": 0.7895891566174408, + "language_loss": 0.5867179, + "learning_rate": 2.4173343631046504e-06, + "loss": 0.60715508, + "num_input_tokens_seen": 160598270, + "router_z_loss_clip": 0.015625, + "router_z_loss_mlp": 0.24511719, + "step": 7486, + "time_per_iteration": 3.09049654006958 + }, + { + "auxiliary_loss_clip": 0.01113596, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.0184797, + "balance_loss_mlp": 1.04104531, + "epoch": 0.45014279272508645, + "flos": 15778897983360.0, + "grad_norm": 2.266854053846726, + "language_loss": 0.83153397, + "learning_rate": 2.4169534668453654e-06, + "loss": 0.85298264, + "num_input_tokens_seen": 160614720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7487, + "time_per_iteration": 2.431209087371826 + }, + { + "auxiliary_loss_clip": 0.01113173, + "auxiliary_loss_mlp": 0.01028971, + "balance_loss_clip": 1.01626313, + "balance_loss_mlp": 1.04103804, + "epoch": 0.4502029159777544, + "flos": 21799573061760.0, + "grad_norm": 1.5035728003068896, + "language_loss": 0.77055335, + "learning_rate": 2.4165725547751622e-06, + "loss": 0.79197478, + "num_input_tokens_seen": 160635170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7488, + "time_per_iteration": 2.5085837841033936 + }, + { + "auxiliary_loss_clip": 0.01121581, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02446008, + "balance_loss_mlp": 1.04378915, + "epoch": 0.4502630392304224, + "flos": 28767966531840.0, + "grad_norm": 2.6401168824150574, + "language_loss": 0.71564645, + "learning_rate": 2.4161916269084858e-06, + "loss": 0.73724437, + "num_input_tokens_seen": 160654490, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.77734375, + "step": 7489, + "time_per_iteration": 2.5106120109558105 + }, + { + "auxiliary_loss_clip": 0.01119744, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.01856422, + "balance_loss_mlp": 1.04424906, + "epoch": 0.45032316248309034, + "flos": 15844182952320.0, + "grad_norm": 2.1685657644370853, + "language_loss": 0.6962117, + "learning_rate": 2.4158106832597817e-06, + "loss": 0.71773469, + "num_input_tokens_seen": 160669400, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7578125, + "step": 7490, + "time_per_iteration": 2.4383597373962402 + }, + { + "auxiliary_loss_clip": 0.01038961, + "auxiliary_loss_mlp": 0.01000463, + "balance_loss_clip": 0.99907476, + "balance_loss_mlp": 1.01472032, + "epoch": 0.4503832857357583, + "flos": 57853600945920.0, + "grad_norm": 1.805652104877531, + "language_loss": 0.56691748, + "learning_rate": 2.415429723843495e-06, + "loss": 0.5873118, + "num_input_tokens_seen": 160733820, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.2421875, + "step": 7491, + "time_per_iteration": 3.0662994384765625 + }, + { + "auxiliary_loss_clip": 0.01111025, + "auxiliary_loss_mlp": 0.01029852, + "balance_loss_clip": 1.01719177, + "balance_loss_mlp": 1.03987265, + "epoch": 0.4504434089884263, + "flos": 23878082488320.0, + "grad_norm": 1.5869212574214921, + "language_loss": 0.79462028, + "learning_rate": 2.4150487486740713e-06, + "loss": 0.81602901, + "num_input_tokens_seen": 160753175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7492, + "time_per_iteration": 2.497849464416504 + }, + { + "auxiliary_loss_clip": 0.01119638, + "auxiliary_loss_mlp": 0.01038139, + "balance_loss_clip": 1.02474022, + "balance_loss_mlp": 1.04271042, + "epoch": 0.45050353224109424, + "flos": 17785083375360.0, + "grad_norm": 2.074371460837293, + "language_loss": 0.92560953, + "learning_rate": 2.4146677577659573e-06, + "loss": 0.9471873, + "num_input_tokens_seen": 160768310, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7493, + "time_per_iteration": 2.4717981815338135 + }, + { + "auxiliary_loss_clip": 0.01039013, + "auxiliary_loss_mlp": 0.01000993, + "balance_loss_clip": 0.99946707, + "balance_loss_mlp": 1.01443267, + "epoch": 0.4505636554937622, + "flos": 65063420703360.0, + "grad_norm": 0.8118074327791402, + "language_loss": 0.62908041, + "learning_rate": 2.4142867511336e-06, + "loss": 0.64948046, + "num_input_tokens_seen": 160827370, + "router_z_loss_clip": 0.01525879, + "router_z_loss_mlp": 0.24609375, + "step": 7494, + "time_per_iteration": 3.1021509170532227 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02063334, + "balance_loss_mlp": 1.04122376, + "epoch": 0.45062377874643017, + "flos": 22200084685440.0, + "grad_norm": 1.4599772474200656, + "language_loss": 0.81980979, + "learning_rate": 2.4139057287914484e-06, + "loss": 0.8412739, + "num_input_tokens_seen": 160849140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 7495, + "time_per_iteration": 2.528707981109619 + }, + { + "auxiliary_loss_clip": 0.01114077, + "auxiliary_loss_mlp": 0.010313, + "balance_loss_clip": 1.01793659, + "balance_loss_mlp": 1.04069221, + "epoch": 0.45068390199909814, + "flos": 37670293186560.0, + "grad_norm": 1.6718702145442927, + "language_loss": 0.85639864, + "learning_rate": 2.41352469075395e-06, + "loss": 0.87785244, + "num_input_tokens_seen": 160871280, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7496, + "time_per_iteration": 2.5862984657287598 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.0103187, + "balance_loss_clip": 1.01913798, + "balance_loss_mlp": 1.04234052, + "epoch": 0.4507440252517661, + "flos": 22302501338880.0, + "grad_norm": 2.117680053603533, + "language_loss": 0.76342994, + "learning_rate": 2.4131436370355534e-06, + "loss": 0.78490651, + "num_input_tokens_seen": 160888625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7497, + "time_per_iteration": 2.4831669330596924 + }, + { + "auxiliary_loss_clip": 0.01114815, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01798773, + "balance_loss_mlp": 1.03939152, + "epoch": 0.45080414850443407, + "flos": 13188374138880.0, + "grad_norm": 2.971687057549937, + "language_loss": 0.75124824, + "learning_rate": 2.4127625676507088e-06, + "loss": 0.77270365, + "num_input_tokens_seen": 160907040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75390625, + "step": 7498, + "time_per_iteration": 2.4243438243865967 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041824, + "balance_loss_clip": 1.02853799, + "balance_loss_mlp": 1.04190993, + "epoch": 0.4508642717571021, + "flos": 21944939402880.0, + "grad_norm": 1.8265166276024245, + "language_loss": 0.70487583, + "learning_rate": 2.4123814826138663e-06, + "loss": 0.72645926, + "num_input_tokens_seen": 160927115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7499, + "time_per_iteration": 2.496595859527588 + }, + { + "auxiliary_loss_clip": 0.01118241, + "auxiliary_loss_mlp": 0.0103412, + "balance_loss_clip": 1.02090549, + "balance_loss_mlp": 1.04258835, + "epoch": 0.45092439500977005, + "flos": 23367468700800.0, + "grad_norm": 1.819855114084185, + "language_loss": 0.76870257, + "learning_rate": 2.412000381939477e-06, + "loss": 0.79022616, + "num_input_tokens_seen": 160944405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7500, + "time_per_iteration": 2.4659407138824463 + }, + { + "auxiliary_loss_clip": 0.01114886, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.01943755, + "balance_loss_mlp": 1.04146719, + "epoch": 0.450984518262438, + "flos": 20772958446720.0, + "grad_norm": 1.7705256698152247, + "language_loss": 0.62966442, + "learning_rate": 2.411619265641992e-06, + "loss": 0.6511355, + "num_input_tokens_seen": 160961345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7501, + "time_per_iteration": 2.474149703979492 + }, + { + "auxiliary_loss_clip": 0.01117269, + "auxiliary_loss_mlp": 0.0103454, + "balance_loss_clip": 1.02093208, + "balance_loss_mlp": 1.04161, + "epoch": 0.451044641515106, + "flos": 17707372300800.0, + "grad_norm": 1.9049764473951474, + "language_loss": 0.84758866, + "learning_rate": 2.411238133735863e-06, + "loss": 0.86910677, + "num_input_tokens_seen": 160977330, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7578125, + "step": 7502, + "time_per_iteration": 2.419093370437622 + }, + { + "auxiliary_loss_clip": 0.01111337, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.02135682, + "balance_loss_mlp": 1.04026246, + "epoch": 0.45110476476777395, + "flos": 20594698225920.0, + "grad_norm": 1.4187712379612754, + "language_loss": 0.79906255, + "learning_rate": 2.4108569862355418e-06, + "loss": 0.8205111, + "num_input_tokens_seen": 160997280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 7503, + "time_per_iteration": 2.536954164505005 + }, + { + "auxiliary_loss_clip": 0.01112743, + "auxiliary_loss_mlp": 0.01036948, + "balance_loss_clip": 1.02458, + "balance_loss_mlp": 1.04287815, + "epoch": 0.4511648880204419, + "flos": 16034043265920.0, + "grad_norm": 3.706114905397956, + "language_loss": 0.80931562, + "learning_rate": 2.410475823155484e-06, + "loss": 0.83081251, + "num_input_tokens_seen": 161014235, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 7504, + "time_per_iteration": 2.4356000423431396 + }, + { + "auxiliary_loss_clip": 0.01112245, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.02284479, + "balance_loss_mlp": 1.04033744, + "epoch": 0.4512250112731099, + "flos": 23978811202560.0, + "grad_norm": 5.269565558405545, + "language_loss": 0.63377774, + "learning_rate": 2.4100946445101405e-06, + "loss": 0.6552459, + "num_input_tokens_seen": 161032360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71875, + "step": 7505, + "time_per_iteration": 2.4934160709381104 + }, + { + "auxiliary_loss_clip": 0.01036723, + "auxiliary_loss_mlp": 0.0101133, + "balance_loss_clip": 1.00969648, + "balance_loss_mlp": 1.01246166, + "epoch": 0.45128513452577784, + "flos": 71462308037760.0, + "grad_norm": 0.8504866778221882, + "language_loss": 0.5887711, + "learning_rate": 2.409713450313968e-06, + "loss": 0.60925162, + "num_input_tokens_seen": 161091360, + "router_z_loss_clip": 0.01635742, + "router_z_loss_mlp": 0.2421875, + "step": 7506, + "time_per_iteration": 3.1150898933410645 + }, + { + "auxiliary_loss_clip": 0.01112738, + "auxiliary_loss_mlp": 0.0103395, + "balance_loss_clip": 1.02087879, + "balance_loss_mlp": 1.04194486, + "epoch": 0.4513452577784458, + "flos": 22090844448000.0, + "grad_norm": 1.6347442617822043, + "language_loss": 0.79238498, + "learning_rate": 2.40933224058142e-06, + "loss": 0.81385183, + "num_input_tokens_seen": 161110825, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7507, + "time_per_iteration": 2.484036684036255 + }, + { + "auxiliary_loss_clip": 0.0111511, + "auxiliary_loss_mlp": 0.01033622, + "balance_loss_clip": 1.019871, + "balance_loss_mlp": 1.04084098, + "epoch": 0.4514053810311138, + "flos": 24276403382400.0, + "grad_norm": 1.5108356171854629, + "language_loss": 0.7397756, + "learning_rate": 2.4089510153269526e-06, + "loss": 0.76126289, + "num_input_tokens_seen": 161130685, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7508, + "time_per_iteration": 2.4958505630493164 + }, + { + "auxiliary_loss_clip": 0.01112961, + "auxiliary_loss_mlp": 0.01036508, + "balance_loss_clip": 1.02423549, + "balance_loss_mlp": 1.04263186, + "epoch": 0.45146550428378174, + "flos": 17886781756800.0, + "grad_norm": 1.9053667394121476, + "language_loss": 0.78955048, + "learning_rate": 2.4085697745650217e-06, + "loss": 0.81104517, + "num_input_tokens_seen": 161147555, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 7509, + "time_per_iteration": 2.4640209674835205 + }, + { + "auxiliary_loss_clip": 0.01114289, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01759398, + "balance_loss_mlp": 1.0420239, + "epoch": 0.4515256275364497, + "flos": 24243437675520.0, + "grad_norm": 1.8944319049742213, + "language_loss": 0.73495883, + "learning_rate": 2.4081885183100837e-06, + "loss": 0.75640076, + "num_input_tokens_seen": 161166255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7510, + "time_per_iteration": 2.462289810180664 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01856017, + "balance_loss_mlp": 1.04091644, + "epoch": 0.45158575078911767, + "flos": 20631039811200.0, + "grad_norm": 1.9974195471898801, + "language_loss": 0.77053016, + "learning_rate": 2.4078072465765964e-06, + "loss": 0.79200888, + "num_input_tokens_seen": 161184720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7511, + "time_per_iteration": 2.5831305980682373 + }, + { + "auxiliary_loss_clip": 0.01114808, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.01937711, + "balance_loss_mlp": 1.04086745, + "epoch": 0.45164587404178563, + "flos": 23327751237120.0, + "grad_norm": 1.734048899080759, + "language_loss": 0.79124206, + "learning_rate": 2.4074259593790174e-06, + "loss": 0.81271791, + "num_input_tokens_seen": 161204360, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7512, + "time_per_iteration": 6.862476587295532 + }, + { + "auxiliary_loss_clip": 0.01118735, + "auxiliary_loss_mlp": 0.01037966, + "balance_loss_clip": 1.02435863, + "balance_loss_mlp": 1.04064548, + "epoch": 0.45170599729445365, + "flos": 23805973935360.0, + "grad_norm": 1.9681233127218394, + "language_loss": 0.87461096, + "learning_rate": 2.4070446567318053e-06, + "loss": 0.89617801, + "num_input_tokens_seen": 161223575, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.78125, + "step": 7513, + "time_per_iteration": 2.5551092624664307 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01893246, + "balance_loss_mlp": 1.0379355, + "epoch": 0.4517661205471216, + "flos": 23512942782720.0, + "grad_norm": 1.6638824980939535, + "language_loss": 0.67135286, + "learning_rate": 2.406663338649419e-06, + "loss": 0.69271272, + "num_input_tokens_seen": 161243805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 7514, + "time_per_iteration": 2.4804775714874268 + }, + { + "auxiliary_loss_clip": 0.01115847, + "auxiliary_loss_mlp": 0.01029857, + "balance_loss_clip": 1.01448536, + "balance_loss_mlp": 1.04221404, + "epoch": 0.4518262437997896, + "flos": 23513948363520.0, + "grad_norm": 2.644844833078513, + "language_loss": 0.69455916, + "learning_rate": 2.406282005146318e-06, + "loss": 0.71601617, + "num_input_tokens_seen": 161261450, + "router_z_loss_clip": 0.15429688, + "router_z_loss_mlp": 0.734375, + "step": 7515, + "time_per_iteration": 2.530089855194092 + }, + { + "auxiliary_loss_clip": 0.01117096, + "auxiliary_loss_mlp": 0.01034746, + "balance_loss_clip": 1.02060795, + "balance_loss_mlp": 1.04084945, + "epoch": 0.45188636705245755, + "flos": 14568061489920.0, + "grad_norm": 2.154684023631233, + "language_loss": 0.81658673, + "learning_rate": 2.405900656236963e-06, + "loss": 0.83810514, + "num_input_tokens_seen": 161276965, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76171875, + "step": 7516, + "time_per_iteration": 2.405810832977295 + }, + { + "auxiliary_loss_clip": 0.01111826, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.01940227, + "balance_loss_mlp": 1.04099917, + "epoch": 0.4519464903051255, + "flos": 19901550499200.0, + "grad_norm": 1.5513632113186169, + "language_loss": 0.65810448, + "learning_rate": 2.4055192919358137e-06, + "loss": 0.6795482, + "num_input_tokens_seen": 161295375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 7517, + "time_per_iteration": 2.487539768218994 + }, + { + "auxiliary_loss_clip": 0.0111082, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.04066491, + "epoch": 0.4520066135577935, + "flos": 18844376388480.0, + "grad_norm": 1.7604175245242084, + "language_loss": 0.63401121, + "learning_rate": 2.405137912257333e-06, + "loss": 0.65539253, + "num_input_tokens_seen": 161313010, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 7518, + "time_per_iteration": 2.4280178546905518 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.010337, + "balance_loss_clip": 1.02124858, + "balance_loss_mlp": 1.04022479, + "epoch": 0.45206673681046144, + "flos": 48214419713280.0, + "grad_norm": 1.4125127095428567, + "language_loss": 0.59552354, + "learning_rate": 2.404756517215982e-06, + "loss": 0.61698353, + "num_input_tokens_seen": 161336690, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7519, + "time_per_iteration": 2.706774950027466 + }, + { + "auxiliary_loss_clip": 0.01114162, + "auxiliary_loss_mlp": 0.01036796, + "balance_loss_clip": 1.02404702, + "balance_loss_mlp": 1.04053855, + "epoch": 0.4521268600631294, + "flos": 23842171866240.0, + "grad_norm": 1.3128892020538214, + "language_loss": 0.72288704, + "learning_rate": 2.404375106826223e-06, + "loss": 0.74439663, + "num_input_tokens_seen": 161357845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7520, + "time_per_iteration": 2.4802541732788086 + }, + { + "auxiliary_loss_clip": 0.01113212, + "auxiliary_loss_mlp": 0.01037416, + "balance_loss_clip": 1.0250659, + "balance_loss_mlp": 1.04033482, + "epoch": 0.4521869833157974, + "flos": 18843622202880.0, + "grad_norm": 1.8726393810843218, + "language_loss": 0.75520414, + "learning_rate": 2.4039936811025194e-06, + "loss": 0.77671039, + "num_input_tokens_seen": 161375160, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7521, + "time_per_iteration": 2.4384777545928955 + }, + { + "auxiliary_loss_clip": 0.0111833, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.022416, + "balance_loss_mlp": 1.04222465, + "epoch": 0.45224710656846534, + "flos": 19788072456960.0, + "grad_norm": 1.6736116772601735, + "language_loss": 0.67521721, + "learning_rate": 2.4036122400593343e-06, + "loss": 0.69675779, + "num_input_tokens_seen": 161393690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7522, + "time_per_iteration": 2.4317188262939453 + }, + { + "auxiliary_loss_clip": 0.01109922, + "auxiliary_loss_mlp": 0.01033885, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.03857231, + "epoch": 0.4523072298211333, + "flos": 28256131681920.0, + "grad_norm": 1.5002177443666298, + "language_loss": 0.60627949, + "learning_rate": 2.403230783711134e-06, + "loss": 0.62771761, + "num_input_tokens_seen": 161415015, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 7523, + "time_per_iteration": 2.5312907695770264 + }, + { + "auxiliary_loss_clip": 0.01116524, + "auxiliary_loss_mlp": 0.01039355, + "balance_loss_clip": 1.02556825, + "balance_loss_mlp": 1.0399549, + "epoch": 0.45236735307380127, + "flos": 11181039511680.0, + "grad_norm": 2.0404967948828796, + "language_loss": 0.78325248, + "learning_rate": 2.4028493120723813e-06, + "loss": 0.80481124, + "num_input_tokens_seen": 161432940, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7524, + "time_per_iteration": 2.4078996181488037 + }, + { + "auxiliary_loss_clip": 0.01111336, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02216387, + "balance_loss_mlp": 1.03912878, + "epoch": 0.45242747632646924, + "flos": 22601386408320.0, + "grad_norm": 1.9789251534337415, + "language_loss": 0.63518596, + "learning_rate": 2.4024678251575417e-06, + "loss": 0.65664744, + "num_input_tokens_seen": 161452215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 7525, + "time_per_iteration": 2.503176212310791 + }, + { + "auxiliary_loss_clip": 0.01112174, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02262783, + "balance_loss_mlp": 1.04040241, + "epoch": 0.45248759957913726, + "flos": 18256267008000.0, + "grad_norm": 1.5288172547930599, + "language_loss": 0.79163349, + "learning_rate": 2.402086322981083e-06, + "loss": 0.8131057, + "num_input_tokens_seen": 161469520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7526, + "time_per_iteration": 2.4558780193328857 + }, + { + "auxiliary_loss_clip": 0.01111805, + "auxiliary_loss_mlp": 0.01030821, + "balance_loss_clip": 1.01851869, + "balance_loss_mlp": 1.04029512, + "epoch": 0.4525477228318052, + "flos": 22450094323200.0, + "grad_norm": 1.6413449131819307, + "language_loss": 0.80729342, + "learning_rate": 2.40170480555747e-06, + "loss": 0.82871962, + "num_input_tokens_seen": 161487335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 7527, + "time_per_iteration": 2.470186948776245 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.01566291, + "balance_loss_mlp": 1.039428, + "epoch": 0.4526078460844732, + "flos": 29644869260160.0, + "grad_norm": 1.450835161887395, + "language_loss": 0.65505683, + "learning_rate": 2.4013232729011706e-06, + "loss": 0.67645425, + "num_input_tokens_seen": 161510095, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 7528, + "time_per_iteration": 2.541700601577759 + }, + { + "auxiliary_loss_clip": 0.01110752, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02031136, + "balance_loss_mlp": 1.03976476, + "epoch": 0.45266796933714115, + "flos": 23039747988480.0, + "grad_norm": 1.6649436204324595, + "language_loss": 0.7542727, + "learning_rate": 2.4009417250266525e-06, + "loss": 0.7757026, + "num_input_tokens_seen": 161528725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7529, + "time_per_iteration": 2.5726876258850098 + }, + { + "auxiliary_loss_clip": 0.01112607, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.01853299, + "balance_loss_mlp": 1.03971684, + "epoch": 0.4527280925898091, + "flos": 14428405411200.0, + "grad_norm": 1.7825780716691442, + "language_loss": 0.73193467, + "learning_rate": 2.400560161948384e-06, + "loss": 0.75336862, + "num_input_tokens_seen": 161547195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.73046875, + "step": 7530, + "time_per_iteration": 2.4584052562713623 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02193975, + "balance_loss_mlp": 1.04003453, + "epoch": 0.4527882158424771, + "flos": 22925515760640.0, + "grad_norm": 1.6012488985464985, + "language_loss": 0.75947326, + "learning_rate": 2.400178583680834e-06, + "loss": 0.78094089, + "num_input_tokens_seen": 161565565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.734375, + "step": 7531, + "time_per_iteration": 2.484959363937378 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01034859, + "balance_loss_clip": 1.02182305, + "balance_loss_mlp": 1.0382148, + "epoch": 0.45284833909514505, + "flos": 25555326105600.0, + "grad_norm": 1.4359815558452909, + "language_loss": 0.66874713, + "learning_rate": 2.3997969902384717e-06, + "loss": 0.69017947, + "num_input_tokens_seen": 161586630, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7532, + "time_per_iteration": 2.486598253250122 + }, + { + "auxiliary_loss_clip": 0.01112272, + "auxiliary_loss_mlp": 0.01035317, + "balance_loss_clip": 1.02322936, + "balance_loss_mlp": 1.04091084, + "epoch": 0.452908462347813, + "flos": 18150007599360.0, + "grad_norm": 2.0450394734969874, + "language_loss": 0.78902352, + "learning_rate": 2.399415381635768e-06, + "loss": 0.81049943, + "num_input_tokens_seen": 161603815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 7533, + "time_per_iteration": 2.4407958984375 + }, + { + "auxiliary_loss_clip": 0.01115719, + "auxiliary_loss_mlp": 0.01034272, + "balance_loss_clip": 1.02032459, + "balance_loss_mlp": 1.03807485, + "epoch": 0.452968585600481, + "flos": 19062749122560.0, + "grad_norm": 1.646532255034537, + "language_loss": 0.83279264, + "learning_rate": 2.3990337578871927e-06, + "loss": 0.85429263, + "num_input_tokens_seen": 161622900, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7734375, + "step": 7534, + "time_per_iteration": 2.430670976638794 + }, + { + "auxiliary_loss_clip": 0.01113826, + "auxiliary_loss_mlp": 0.01034802, + "balance_loss_clip": 1.02148068, + "balance_loss_mlp": 1.03927064, + "epoch": 0.45302870885314894, + "flos": 22051737515520.0, + "grad_norm": 1.4654832124358697, + "language_loss": 0.76578003, + "learning_rate": 2.3986521190072176e-06, + "loss": 0.78726631, + "num_input_tokens_seen": 161641700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 7535, + "time_per_iteration": 2.4744579792022705 + }, + { + "auxiliary_loss_clip": 0.0110944, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01957679, + "balance_loss_mlp": 1.03883696, + "epoch": 0.4530888321058169, + "flos": 20376217751040.0, + "grad_norm": 1.5977579258117844, + "language_loss": 0.80234635, + "learning_rate": 2.3982704650103138e-06, + "loss": 0.82375443, + "num_input_tokens_seen": 161661955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 7536, + "time_per_iteration": 2.4481444358825684 + }, + { + "auxiliary_loss_clip": 0.01111518, + "auxiliary_loss_mlp": 0.0102989, + "balance_loss_clip": 1.0173198, + "balance_loss_mlp": 1.03711987, + "epoch": 0.4531489553584849, + "flos": 14830425406080.0, + "grad_norm": 2.0610118763249536, + "language_loss": 0.75895774, + "learning_rate": 2.3978887959109544e-06, + "loss": 0.78037184, + "num_input_tokens_seen": 161679245, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7537, + "time_per_iteration": 2.430119276046753 + }, + { + "auxiliary_loss_clip": 0.01115071, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.02058339, + "balance_loss_mlp": 1.04172075, + "epoch": 0.45320907861115284, + "flos": 21944975316480.0, + "grad_norm": 2.095176663386117, + "language_loss": 0.76420474, + "learning_rate": 2.3975071117236118e-06, + "loss": 0.78567952, + "num_input_tokens_seen": 161698795, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.734375, + "step": 7538, + "time_per_iteration": 2.4675159454345703 + }, + { + "auxiliary_loss_clip": 0.01041439, + "auxiliary_loss_mlp": 0.0100041, + "balance_loss_clip": 0.99908096, + "balance_loss_mlp": 1.01700771, + "epoch": 0.45326920186382086, + "flos": 66251455038720.0, + "grad_norm": 0.7965700347609973, + "language_loss": 0.62345123, + "learning_rate": 2.3971254124627593e-06, + "loss": 0.64386964, + "num_input_tokens_seen": 161761980, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24414062, + "step": 7539, + "time_per_iteration": 3.0961101055145264 + }, + { + "auxiliary_loss_clip": 0.01112571, + "auxiliary_loss_mlp": 0.01036685, + "balance_loss_clip": 1.02466285, + "balance_loss_mlp": 1.04064226, + "epoch": 0.4533293251164888, + "flos": 14684233052160.0, + "grad_norm": 1.8102149318529874, + "language_loss": 0.65997463, + "learning_rate": 2.396743698142872e-06, + "loss": 0.68146718, + "num_input_tokens_seen": 161779455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71875, + "step": 7540, + "time_per_iteration": 2.418170928955078 + }, + { + "auxiliary_loss_clip": 0.01118532, + "auxiliary_loss_mlp": 0.01040664, + "balance_loss_clip": 1.02721667, + "balance_loss_mlp": 1.04177594, + "epoch": 0.4533894483691568, + "flos": 22601206840320.0, + "grad_norm": 1.6922846601909878, + "language_loss": 0.84666622, + "learning_rate": 2.396361968778424e-06, + "loss": 0.86825818, + "num_input_tokens_seen": 161798980, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7541, + "time_per_iteration": 2.4960954189300537 + }, + { + "auxiliary_loss_clip": 0.01113117, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01888943, + "balance_loss_mlp": 1.03968024, + "epoch": 0.45344957162182475, + "flos": 34751617666560.0, + "grad_norm": 1.7180151747286094, + "language_loss": 0.76435781, + "learning_rate": 2.395980224383889e-06, + "loss": 0.78580016, + "num_input_tokens_seen": 161819745, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 7542, + "time_per_iteration": 2.574286937713623 + }, + { + "auxiliary_loss_clip": 0.011146, + "auxiliary_loss_mlp": 0.01029849, + "balance_loss_clip": 1.01687872, + "balance_loss_mlp": 1.04101157, + "epoch": 0.4535096948744927, + "flos": 23550218121600.0, + "grad_norm": 1.4680148354813627, + "language_loss": 0.80267954, + "learning_rate": 2.395598464973746e-06, + "loss": 0.82412398, + "num_input_tokens_seen": 161838575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7543, + "time_per_iteration": 2.5228359699249268 + }, + { + "auxiliary_loss_clip": 0.01115681, + "auxiliary_loss_mlp": 0.01037869, + "balance_loss_clip": 1.02517343, + "balance_loss_mlp": 1.04107285, + "epoch": 0.4535698181271607, + "flos": 25557552748800.0, + "grad_norm": 1.6471991367559184, + "language_loss": 0.75933033, + "learning_rate": 2.395216690562469e-06, + "loss": 0.78086591, + "num_input_tokens_seen": 161858590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7544, + "time_per_iteration": 2.4976110458374023 + }, + { + "auxiliary_loss_clip": 0.01117877, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02154779, + "balance_loss_mlp": 1.04304671, + "epoch": 0.45362994137982865, + "flos": 24864117713280.0, + "grad_norm": 1.8438932042246456, + "language_loss": 0.75447458, + "learning_rate": 2.3948349011645355e-06, + "loss": 0.77599108, + "num_input_tokens_seen": 161878390, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.75, + "step": 7545, + "time_per_iteration": 2.5022737979888916 + }, + { + "auxiliary_loss_clip": 0.01114305, + "auxiliary_loss_mlp": 0.0102975, + "balance_loss_clip": 1.01697659, + "balance_loss_mlp": 1.04100811, + "epoch": 0.4536900646324966, + "flos": 30806794408320.0, + "grad_norm": 1.5497429650402368, + "language_loss": 0.7210325, + "learning_rate": 2.394453096794423e-06, + "loss": 0.74247307, + "num_input_tokens_seen": 161898610, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7546, + "time_per_iteration": 2.5246150493621826 + }, + { + "auxiliary_loss_clip": 0.01118375, + "auxiliary_loss_mlp": 0.01032123, + "balance_loss_clip": 1.01857507, + "balance_loss_mlp": 1.04212511, + "epoch": 0.4537501878851646, + "flos": 23404313076480.0, + "grad_norm": 1.558937793954525, + "language_loss": 0.7557559, + "learning_rate": 2.394071277466609e-06, + "loss": 0.77726084, + "num_input_tokens_seen": 161918210, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76171875, + "step": 7547, + "time_per_iteration": 2.4949920177459717 + }, + { + "auxiliary_loss_clip": 0.01116665, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02041912, + "balance_loss_mlp": 1.04200041, + "epoch": 0.45381031113783254, + "flos": 18149289327360.0, + "grad_norm": 2.0285954992459865, + "language_loss": 0.69878972, + "learning_rate": 2.393689443195573e-06, + "loss": 0.72029251, + "num_input_tokens_seen": 161936950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 7548, + "time_per_iteration": 2.4486818313598633 + }, + { + "auxiliary_loss_clip": 0.01114191, + "auxiliary_loss_mlp": 0.01040331, + "balance_loss_clip": 1.02771258, + "balance_loss_mlp": 1.04018688, + "epoch": 0.4538704343905005, + "flos": 25336666062720.0, + "grad_norm": 2.0627316040888117, + "language_loss": 0.72691673, + "learning_rate": 2.393307593995794e-06, + "loss": 0.74846196, + "num_input_tokens_seen": 161955550, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7549, + "time_per_iteration": 2.509470224380493 + }, + { + "auxiliary_loss_clip": 0.01112378, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01698172, + "balance_loss_mlp": 1.04035378, + "epoch": 0.4539305576431685, + "flos": 28731445378560.0, + "grad_norm": 1.7136809619022837, + "language_loss": 0.65253317, + "learning_rate": 2.392925729881751e-06, + "loss": 0.67394793, + "num_input_tokens_seen": 161976760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7550, + "time_per_iteration": 2.5133440494537354 + }, + { + "auxiliary_loss_clip": 0.01113494, + "auxiliary_loss_mlp": 0.01037344, + "balance_loss_clip": 1.0250591, + "balance_loss_mlp": 1.04179323, + "epoch": 0.45399068089583644, + "flos": 22492397566080.0, + "grad_norm": 1.6025854653449239, + "language_loss": 0.68823695, + "learning_rate": 2.3925438508679263e-06, + "loss": 0.70974535, + "num_input_tokens_seen": 161996120, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 7551, + "time_per_iteration": 2.5188024044036865 + }, + { + "auxiliary_loss_clip": 0.01113711, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.02022541, + "balance_loss_mlp": 1.03923821, + "epoch": 0.45405080414850446, + "flos": 12893403651840.0, + "grad_norm": 1.6542843637965088, + "language_loss": 0.79214859, + "learning_rate": 2.392161956968798e-06, + "loss": 0.81361675, + "num_input_tokens_seen": 162011125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.74609375, + "step": 7552, + "time_per_iteration": 2.4087183475494385 + }, + { + "auxiliary_loss_clip": 0.01039804, + "auxiliary_loss_mlp": 0.01010172, + "balance_loss_clip": 1.00893259, + "balance_loss_mlp": 1.01586497, + "epoch": 0.4541109274011724, + "flos": 59766919724160.0, + "grad_norm": 0.8232859688183145, + "language_loss": 0.57765305, + "learning_rate": 2.39178004819885e-06, + "loss": 0.59815282, + "num_input_tokens_seen": 162068705, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.24023438, + "step": 7553, + "time_per_iteration": 4.437517881393433 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02388608, + "balance_loss_mlp": 1.03907371, + "epoch": 0.4541710506538404, + "flos": 28511743841280.0, + "grad_norm": 1.3573100009257986, + "language_loss": 0.76541936, + "learning_rate": 2.3913981245725626e-06, + "loss": 0.78688413, + "num_input_tokens_seen": 162089655, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71875, + "step": 7554, + "time_per_iteration": 5.404860258102417 + }, + { + "auxiliary_loss_clip": 0.01116899, + "auxiliary_loss_mlp": 0.01032245, + "balance_loss_clip": 1.01859498, + "balance_loss_mlp": 1.04073453, + "epoch": 0.45423117390650836, + "flos": 17675591742720.0, + "grad_norm": 2.6663912268828156, + "language_loss": 0.77148789, + "learning_rate": 2.3910161861044194e-06, + "loss": 0.79297936, + "num_input_tokens_seen": 162108465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76171875, + "step": 7555, + "time_per_iteration": 2.5254242420196533 + }, + { + "auxiliary_loss_clip": 0.01111282, + "auxiliary_loss_mlp": 0.01033205, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03910041, + "epoch": 0.4542912971591763, + "flos": 28072556248320.0, + "grad_norm": 1.268885764239303, + "language_loss": 0.72658741, + "learning_rate": 2.390634232808903e-06, + "loss": 0.74803221, + "num_input_tokens_seen": 162129910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7556, + "time_per_iteration": 2.5096001625061035 + }, + { + "auxiliary_loss_clip": 0.01117527, + "auxiliary_loss_mlp": 0.01033023, + "balance_loss_clip": 1.01987422, + "balance_loss_mlp": 1.0412432, + "epoch": 0.4543514204118443, + "flos": 22671771108480.0, + "grad_norm": 1.9256457801142723, + "language_loss": 0.63244998, + "learning_rate": 2.3902522647004982e-06, + "loss": 0.65395546, + "num_input_tokens_seen": 162148840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 7557, + "time_per_iteration": 2.489269495010376 + }, + { + "auxiliary_loss_clip": 0.010384, + "auxiliary_loss_mlp": 0.01000398, + "balance_loss_clip": 0.99909872, + "balance_loss_mlp": 1.01432419, + "epoch": 0.45441154366451225, + "flos": 58216549921920.0, + "grad_norm": 0.6891763329400619, + "language_loss": 0.57655525, + "learning_rate": 2.3898702817936875e-06, + "loss": 0.59694326, + "num_input_tokens_seen": 162208500, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.24023438, + "step": 7558, + "time_per_iteration": 2.9631850719451904 + }, + { + "auxiliary_loss_clip": 0.01117663, + "auxiliary_loss_mlp": 0.01034797, + "balance_loss_clip": 1.02106977, + "balance_loss_mlp": 1.04180217, + "epoch": 0.4544716669171802, + "flos": 16764286763520.0, + "grad_norm": 2.9054431891281847, + "language_loss": 0.56152129, + "learning_rate": 2.3894882841029573e-06, + "loss": 0.58304584, + "num_input_tokens_seen": 162224650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7559, + "time_per_iteration": 2.4718172550201416 + }, + { + "auxiliary_loss_clip": 0.01116333, + "auxiliary_loss_mlp": 0.01036141, + "balance_loss_clip": 1.02320707, + "balance_loss_mlp": 1.04311991, + "epoch": 0.4545317901698482, + "flos": 15925233991680.0, + "grad_norm": 2.1225715432080863, + "language_loss": 0.72038132, + "learning_rate": 2.389106271642792e-06, + "loss": 0.74190605, + "num_input_tokens_seen": 162242930, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7560, + "time_per_iteration": 2.4289052486419678 + }, + { + "auxiliary_loss_clip": 0.01118313, + "auxiliary_loss_mlp": 0.01032424, + "balance_loss_clip": 1.01870942, + "balance_loss_mlp": 1.04184937, + "epoch": 0.45459191342251615, + "flos": 17639752947840.0, + "grad_norm": 1.8567895139214563, + "language_loss": 0.68786752, + "learning_rate": 2.3887242444276775e-06, + "loss": 0.70937485, + "num_input_tokens_seen": 162261455, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7561, + "time_per_iteration": 2.483013153076172 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01031067, + "balance_loss_clip": 1.01933646, + "balance_loss_mlp": 1.04098606, + "epoch": 0.4546520366751841, + "flos": 16176608346240.0, + "grad_norm": 1.6472040447099916, + "language_loss": 0.84813452, + "learning_rate": 2.3883422024721015e-06, + "loss": 0.86956006, + "num_input_tokens_seen": 162279725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7562, + "time_per_iteration": 2.435842752456665 + }, + { + "auxiliary_loss_clip": 0.0111239, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02132261, + "balance_loss_mlp": 1.0416292, + "epoch": 0.4547121599278521, + "flos": 19751443562880.0, + "grad_norm": 1.8588056575997567, + "language_loss": 0.89808047, + "learning_rate": 2.38796014579055e-06, + "loss": 0.91954148, + "num_input_tokens_seen": 162297865, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 7563, + "time_per_iteration": 2.4962618350982666 + }, + { + "auxiliary_loss_clip": 0.0111349, + "auxiliary_loss_mlp": 0.01037794, + "balance_loss_clip": 1.02425742, + "balance_loss_mlp": 1.03999305, + "epoch": 0.45477228318052004, + "flos": 19937461121280.0, + "grad_norm": 1.9222778596605532, + "language_loss": 0.71644425, + "learning_rate": 2.3875780743975097e-06, + "loss": 0.73795712, + "num_input_tokens_seen": 162316010, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7564, + "time_per_iteration": 2.4343371391296387 + }, + { + "auxiliary_loss_clip": 0.01115348, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.01898563, + "balance_loss_mlp": 1.04060352, + "epoch": 0.454832406433188, + "flos": 21288312829440.0, + "grad_norm": 2.0985180699884496, + "language_loss": 0.67973971, + "learning_rate": 2.3871959883074713e-06, + "loss": 0.70120943, + "num_input_tokens_seen": 162336115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7565, + "time_per_iteration": 2.5114333629608154 + }, + { + "auxiliary_loss_clip": 0.0111081, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01651037, + "balance_loss_mlp": 1.03948641, + "epoch": 0.45489252968585603, + "flos": 24498726612480.0, + "grad_norm": 1.555148092913002, + "language_loss": 0.80112624, + "learning_rate": 2.386813887534922e-06, + "loss": 0.8225264, + "num_input_tokens_seen": 162355705, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7566, + "time_per_iteration": 2.4678473472595215 + }, + { + "auxiliary_loss_clip": 0.01114664, + "auxiliary_loss_mlp": 0.01028518, + "balance_loss_clip": 1.01451695, + "balance_loss_mlp": 1.04058981, + "epoch": 0.454952652938524, + "flos": 17092474352640.0, + "grad_norm": 1.5438575571986708, + "language_loss": 0.73526263, + "learning_rate": 2.3864317720943508e-06, + "loss": 0.75669444, + "num_input_tokens_seen": 162374055, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7567, + "time_per_iteration": 2.4749765396118164 + }, + { + "auxiliary_loss_clip": 0.01117694, + "auxiliary_loss_mlp": 0.01031931, + "balance_loss_clip": 1.01924706, + "balance_loss_mlp": 1.04315984, + "epoch": 0.45501277619119196, + "flos": 27630387826560.0, + "grad_norm": 1.4420173241258303, + "language_loss": 0.80870211, + "learning_rate": 2.386049642000249e-06, + "loss": 0.83019841, + "num_input_tokens_seen": 162393560, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.74609375, + "step": 7568, + "time_per_iteration": 2.5098068714141846 + }, + { + "auxiliary_loss_clip": 0.01119299, + "auxiliary_loss_mlp": 0.01043854, + "balance_loss_clip": 1.02927494, + "balance_loss_mlp": 1.04110444, + "epoch": 0.4550728994438599, + "flos": 19974664632960.0, + "grad_norm": 1.9046518074434846, + "language_loss": 0.79472029, + "learning_rate": 2.3856674972671055e-06, + "loss": 0.81635177, + "num_input_tokens_seen": 162413170, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7569, + "time_per_iteration": 2.5105931758880615 + }, + { + "auxiliary_loss_clip": 0.0111814, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01811135, + "balance_loss_mlp": 1.04233003, + "epoch": 0.4551330226965279, + "flos": 26066873646720.0, + "grad_norm": 1.3375300297611126, + "language_loss": 0.74826288, + "learning_rate": 2.385285337909412e-06, + "loss": 0.76976812, + "num_input_tokens_seen": 162434080, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 7570, + "time_per_iteration": 2.5360968112945557 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01037907, + "balance_loss_clip": 1.02502048, + "balance_loss_mlp": 1.04281187, + "epoch": 0.45519314594919585, + "flos": 32781091501440.0, + "grad_norm": 1.5540611030471656, + "language_loss": 0.74696088, + "learning_rate": 2.3849031639416596e-06, + "loss": 0.76847816, + "num_input_tokens_seen": 162455445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7571, + "time_per_iteration": 2.5796499252319336 + }, + { + "auxiliary_loss_clip": 0.01110782, + "auxiliary_loss_mlp": 0.01029523, + "balance_loss_clip": 1.01708317, + "balance_loss_mlp": 1.04096079, + "epoch": 0.4552532692018638, + "flos": 19172671718400.0, + "grad_norm": 1.522963408290285, + "language_loss": 0.81392241, + "learning_rate": 2.3845209753783414e-06, + "loss": 0.83532542, + "num_input_tokens_seen": 162474940, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 7572, + "time_per_iteration": 2.452230215072632 + }, + { + "auxiliary_loss_clip": 0.01119128, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02052081, + "balance_loss_mlp": 1.04266822, + "epoch": 0.4553133924545318, + "flos": 26027156183040.0, + "grad_norm": 2.158291075293226, + "language_loss": 0.72932756, + "learning_rate": 2.3841387722339486e-06, + "loss": 0.75086331, + "num_input_tokens_seen": 162493340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7573, + "time_per_iteration": 2.547351598739624 + }, + { + "auxiliary_loss_clip": 0.01119875, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02106202, + "balance_loss_mlp": 1.04362583, + "epoch": 0.45537351570719975, + "flos": 30661535808000.0, + "grad_norm": 1.8799787689923733, + "language_loss": 0.74544156, + "learning_rate": 2.3837565545229748e-06, + "loss": 0.76700127, + "num_input_tokens_seen": 162514360, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.76171875, + "step": 7574, + "time_per_iteration": 2.512343406677246 + }, + { + "auxiliary_loss_clip": 0.01116382, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.01870358, + "balance_loss_mlp": 1.0413028, + "epoch": 0.4554336389598677, + "flos": 24353396184960.0, + "grad_norm": 1.8832109226527793, + "language_loss": 0.7161721, + "learning_rate": 2.383374322259915e-06, + "loss": 0.73765397, + "num_input_tokens_seen": 162535240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7575, + "time_per_iteration": 2.516036033630371 + }, + { + "auxiliary_loss_clip": 0.01114571, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0174526, + "balance_loss_mlp": 1.04138458, + "epoch": 0.4554937622125357, + "flos": 20557925677440.0, + "grad_norm": 1.7001526143902996, + "language_loss": 0.73163939, + "learning_rate": 2.3829920754592617e-06, + "loss": 0.75308996, + "num_input_tokens_seen": 162553880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 7576, + "time_per_iteration": 2.446596145629883 + }, + { + "auxiliary_loss_clip": 0.01114194, + "auxiliary_loss_mlp": 0.01035671, + "balance_loss_clip": 1.02232563, + "balance_loss_mlp": 1.04252386, + "epoch": 0.45555388546520365, + "flos": 22820764723200.0, + "grad_norm": 1.8829162969496007, + "language_loss": 0.66556787, + "learning_rate": 2.382609814135511e-06, + "loss": 0.68706656, + "num_input_tokens_seen": 162574485, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 7577, + "time_per_iteration": 2.496425151824951 + }, + { + "auxiliary_loss_clip": 0.01119433, + "auxiliary_loss_mlp": 0.01041229, + "balance_loss_clip": 1.02655983, + "balance_loss_mlp": 1.04481244, + "epoch": 0.4556140087178716, + "flos": 21725992051200.0, + "grad_norm": 1.905892479596231, + "language_loss": 0.74408162, + "learning_rate": 2.382227538303157e-06, + "loss": 0.76568818, + "num_input_tokens_seen": 162595130, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.74609375, + "step": 7578, + "time_per_iteration": 2.4517569541931152 + }, + { + "auxiliary_loss_clip": 0.01117156, + "auxiliary_loss_mlp": 0.01031849, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.0432775, + "epoch": 0.45567413197053963, + "flos": 25994513698560.0, + "grad_norm": 1.9332037742405612, + "language_loss": 0.70189863, + "learning_rate": 2.381845247976697e-06, + "loss": 0.72338867, + "num_input_tokens_seen": 162615720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7579, + "time_per_iteration": 2.5487825870513916 + }, + { + "auxiliary_loss_clip": 0.0111145, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02031469, + "balance_loss_mlp": 1.03969145, + "epoch": 0.4557342552232076, + "flos": 21537604195200.0, + "grad_norm": 1.6152122780510265, + "language_loss": 0.78727221, + "learning_rate": 2.381462943170627e-06, + "loss": 0.8087157, + "num_input_tokens_seen": 162635825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7580, + "time_per_iteration": 2.465355157852173 + }, + { + "auxiliary_loss_clip": 0.01115593, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.01463163, + "balance_loss_mlp": 1.04341292, + "epoch": 0.45579437847587556, + "flos": 40001972647680.0, + "grad_norm": 1.4438503581091628, + "language_loss": 0.68864352, + "learning_rate": 2.381080623899444e-06, + "loss": 0.71007979, + "num_input_tokens_seen": 162659130, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 7581, + "time_per_iteration": 2.6738851070404053 + }, + { + "auxiliary_loss_clip": 0.01111798, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01742125, + "balance_loss_mlp": 1.03975797, + "epoch": 0.4558545017285435, + "flos": 31138501530240.0, + "grad_norm": 1.5604567804249607, + "language_loss": 0.73416924, + "learning_rate": 2.3806982901776455e-06, + "loss": 0.75558978, + "num_input_tokens_seen": 162681665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7582, + "time_per_iteration": 2.5402657985687256 + }, + { + "auxiliary_loss_clip": 0.01118117, + "auxiliary_loss_mlp": 0.01045735, + "balance_loss_clip": 1.03065467, + "balance_loss_mlp": 1.04215884, + "epoch": 0.4559146249812115, + "flos": 21725776569600.0, + "grad_norm": 1.7600515256353326, + "language_loss": 0.72337949, + "learning_rate": 2.380315942019729e-06, + "loss": 0.74501801, + "num_input_tokens_seen": 162702040, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7578125, + "step": 7583, + "time_per_iteration": 2.51399564743042 + }, + { + "auxiliary_loss_clip": 0.01119408, + "auxiliary_loss_mlp": 0.01036162, + "balance_loss_clip": 1.02300692, + "balance_loss_mlp": 1.04282498, + "epoch": 0.45597474823387946, + "flos": 23805973935360.0, + "grad_norm": 1.711799016610791, + "language_loss": 0.72402817, + "learning_rate": 2.379933579440195e-06, + "loss": 0.74558389, + "num_input_tokens_seen": 162722375, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.765625, + "step": 7584, + "time_per_iteration": 2.4907238483428955 + }, + { + "auxiliary_loss_clip": 0.01116974, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01922798, + "balance_loss_mlp": 1.04356861, + "epoch": 0.4560348714865474, + "flos": 31905661230720.0, + "grad_norm": 1.4921764730017937, + "language_loss": 0.68272889, + "learning_rate": 2.379551202453541e-06, + "loss": 0.70422149, + "num_input_tokens_seen": 162746095, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7585, + "time_per_iteration": 2.5741868019104004 + }, + { + "auxiliary_loss_clip": 0.01114249, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.01928306, + "balance_loss_mlp": 1.04099321, + "epoch": 0.4560949947392154, + "flos": 22048828513920.0, + "grad_norm": 1.3206982799231843, + "language_loss": 0.76102924, + "learning_rate": 2.379168811074267e-06, + "loss": 0.78248823, + "num_input_tokens_seen": 162766330, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 7586, + "time_per_iteration": 2.466991662979126 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01651812, + "balance_loss_mlp": 1.0406158, + "epoch": 0.45615511799188335, + "flos": 24571804832640.0, + "grad_norm": 1.9114474136682882, + "language_loss": 0.77912259, + "learning_rate": 2.3787864053168747e-06, + "loss": 0.80052596, + "num_input_tokens_seen": 162784755, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.71484375, + "step": 7587, + "time_per_iteration": 2.534231185913086 + }, + { + "auxiliary_loss_clip": 0.01118125, + "auxiliary_loss_mlp": 0.01039323, + "balance_loss_clip": 1.02616787, + "balance_loss_mlp": 1.03976679, + "epoch": 0.4562152412445513, + "flos": 18330709944960.0, + "grad_norm": 2.2451216970422068, + "language_loss": 0.69211191, + "learning_rate": 2.378403985195863e-06, + "loss": 0.71368635, + "num_input_tokens_seen": 162803850, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.78125, + "step": 7588, + "time_per_iteration": 2.4104104042053223 + }, + { + "auxiliary_loss_clip": 0.011124, + "auxiliary_loss_mlp": 0.01034229, + "balance_loss_clip": 1.02178395, + "balance_loss_mlp": 1.0401839, + "epoch": 0.4562753644972193, + "flos": 13516525814400.0, + "grad_norm": 1.610626761932897, + "language_loss": 0.79335272, + "learning_rate": 2.378021550725735e-06, + "loss": 0.81481898, + "num_input_tokens_seen": 162820775, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 7589, + "time_per_iteration": 2.465728998184204 + }, + { + "auxiliary_loss_clip": 0.01113978, + "auxiliary_loss_mlp": 0.01032914, + "balance_loss_clip": 1.01955092, + "balance_loss_mlp": 1.04108429, + "epoch": 0.45633548774988725, + "flos": 29639697701760.0, + "grad_norm": 2.193606067712595, + "language_loss": 0.6227479, + "learning_rate": 2.377639101920992e-06, + "loss": 0.64421678, + "num_input_tokens_seen": 162839695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 7590, + "time_per_iteration": 2.509962558746338 + }, + { + "auxiliary_loss_clip": 0.0111218, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02830625, + "balance_loss_mlp": 1.03874183, + "epoch": 0.4563956110025552, + "flos": 22233409528320.0, + "grad_norm": 5.263909382371274, + "language_loss": 0.72727275, + "learning_rate": 2.377256638796135e-06, + "loss": 0.74880284, + "num_input_tokens_seen": 162856095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.734375, + "step": 7591, + "time_per_iteration": 2.529491424560547 + }, + { + "auxiliary_loss_clip": 0.01117071, + "auxiliary_loss_mlp": 0.01037677, + "balance_loss_clip": 1.02413523, + "balance_loss_mlp": 1.04252648, + "epoch": 0.45645573425522323, + "flos": 17092043389440.0, + "grad_norm": 2.0725698163141058, + "language_loss": 0.76985544, + "learning_rate": 2.3768741613656695e-06, + "loss": 0.79140294, + "num_input_tokens_seen": 162874070, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7592, + "time_per_iteration": 2.4446723461151123 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01604521, + "balance_loss_mlp": 1.04070461, + "epoch": 0.4565158575078912, + "flos": 20332334309760.0, + "grad_norm": 1.9266503814961675, + "language_loss": 0.69611561, + "learning_rate": 2.376491669644098e-06, + "loss": 0.71753979, + "num_input_tokens_seen": 162891000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7593, + "time_per_iteration": 2.4879302978515625 + }, + { + "auxiliary_loss_clip": 0.01107529, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03803527, + "epoch": 0.45657598076055916, + "flos": 23983013093760.0, + "grad_norm": 2.17790627040614, + "language_loss": 0.84199911, + "learning_rate": 2.3761091636459248e-06, + "loss": 0.86338425, + "num_input_tokens_seen": 162910120, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 7594, + "time_per_iteration": 2.464733362197876 + }, + { + "auxiliary_loss_clip": 0.01035796, + "auxiliary_loss_mlp": 0.00998737, + "balance_loss_clip": 0.99745506, + "balance_loss_mlp": 1.01167154, + "epoch": 0.45663610401322713, + "flos": 69364297526400.0, + "grad_norm": 0.7964417819777524, + "language_loss": 0.52721512, + "learning_rate": 2.375726643385654e-06, + "loss": 0.54756045, + "num_input_tokens_seen": 162963720, + "router_z_loss_clip": 0.01281738, + "router_z_loss_mlp": 0.2421875, + "step": 7595, + "time_per_iteration": 6.0974061489105225 + }, + { + "auxiliary_loss_clip": 0.01117501, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01843739, + "balance_loss_mlp": 1.04165292, + "epoch": 0.4566962272658951, + "flos": 15149095891200.0, + "grad_norm": 2.1595430840247714, + "language_loss": 0.87448329, + "learning_rate": 2.3753441088777915e-06, + "loss": 0.89597577, + "num_input_tokens_seen": 162975760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7596, + "time_per_iteration": 3.862628936767578 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01039379, + "balance_loss_clip": 1.02698088, + "balance_loss_mlp": 1.03993344, + "epoch": 0.45675635051856306, + "flos": 18697465762560.0, + "grad_norm": 2.2425847761174196, + "language_loss": 0.77131474, + "learning_rate": 2.374961560136843e-06, + "loss": 0.79284477, + "num_input_tokens_seen": 162994865, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.734375, + "step": 7597, + "time_per_iteration": 2.4821672439575195 + }, + { + "auxiliary_loss_clip": 0.01113745, + "auxiliary_loss_mlp": 0.01034483, + "balance_loss_clip": 1.02122104, + "balance_loss_mlp": 1.04004443, + "epoch": 0.456816473771231, + "flos": 19098300608640.0, + "grad_norm": 1.7340388440754042, + "language_loss": 0.78560513, + "learning_rate": 2.374578997177314e-06, + "loss": 0.80708742, + "num_input_tokens_seen": 163014730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7598, + "time_per_iteration": 2.4350392818450928 + }, + { + "auxiliary_loss_clip": 0.01113148, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01735115, + "balance_loss_mlp": 1.04057133, + "epoch": 0.456876597023899, + "flos": 28950069507840.0, + "grad_norm": 2.435026889485133, + "language_loss": 0.71715307, + "learning_rate": 2.374196420013712e-06, + "loss": 0.73857641, + "num_input_tokens_seen": 163033405, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7265625, + "step": 7599, + "time_per_iteration": 2.5838844776153564 + }, + { + "auxiliary_loss_clip": 0.01108114, + "auxiliary_loss_mlp": 0.01035222, + "balance_loss_clip": 1.02238345, + "balance_loss_mlp": 1.03702497, + "epoch": 0.45693672027656695, + "flos": 23289470317440.0, + "grad_norm": 1.734840239500452, + "language_loss": 0.69377261, + "learning_rate": 2.373813828660544e-06, + "loss": 0.71520597, + "num_input_tokens_seen": 163051400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7600, + "time_per_iteration": 2.4499921798706055 + }, + { + "auxiliary_loss_clip": 0.01112216, + "auxiliary_loss_mlp": 0.01038134, + "balance_loss_clip": 1.02584386, + "balance_loss_mlp": 1.03979039, + "epoch": 0.4569968435292349, + "flos": 20558212986240.0, + "grad_norm": 1.9688741418230387, + "language_loss": 0.78654951, + "learning_rate": 2.373431223132319e-06, + "loss": 0.80805302, + "num_input_tokens_seen": 163069250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 7601, + "time_per_iteration": 2.555522918701172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02500272, + "balance_loss_mlp": 1.04013097, + "epoch": 0.4570569667819029, + "flos": 41282619223680.0, + "grad_norm": 1.706657696767707, + "language_loss": 0.71609282, + "learning_rate": 2.3730486034435448e-06, + "loss": 0.73760259, + "num_input_tokens_seen": 163091755, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.73828125, + "step": 7602, + "time_per_iteration": 2.6383092403411865 + }, + { + "auxiliary_loss_clip": 0.01112609, + "auxiliary_loss_mlp": 0.01031807, + "balance_loss_clip": 1.01735842, + "balance_loss_mlp": 1.03901231, + "epoch": 0.45711709003457085, + "flos": 26031573555840.0, + "grad_norm": 1.778856324344474, + "language_loss": 0.72776276, + "learning_rate": 2.372665969608729e-06, + "loss": 0.7492069, + "num_input_tokens_seen": 163111600, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7603, + "time_per_iteration": 2.566542387008667 + }, + { + "auxiliary_loss_clip": 0.01113258, + "auxiliary_loss_mlp": 0.0103744, + "balance_loss_clip": 1.02284837, + "balance_loss_mlp": 1.03945732, + "epoch": 0.4571772132872388, + "flos": 22158068751360.0, + "grad_norm": 1.783042546573846, + "language_loss": 0.83495164, + "learning_rate": 2.372283321642383e-06, + "loss": 0.8564586, + "num_input_tokens_seen": 163127350, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 7604, + "time_per_iteration": 2.4322941303253174 + }, + { + "auxiliary_loss_clip": 0.0112315, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04472041, + "epoch": 0.45723733653990684, + "flos": 23878872587520.0, + "grad_norm": 1.742561007105776, + "language_loss": 0.85827744, + "learning_rate": 2.371900659559016e-06, + "loss": 0.87986767, + "num_input_tokens_seen": 163145855, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.78515625, + "step": 7605, + "time_per_iteration": 2.495654582977295 + }, + { + "auxiliary_loss_clip": 0.01116353, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.04045463, + "epoch": 0.4572974597925748, + "flos": 16871803148160.0, + "grad_norm": 1.9150435252301277, + "language_loss": 0.73814523, + "learning_rate": 2.371517983373138e-06, + "loss": 0.75966263, + "num_input_tokens_seen": 163163830, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7578125, + "step": 7606, + "time_per_iteration": 2.472698926925659 + }, + { + "auxiliary_loss_clip": 0.01115234, + "auxiliary_loss_mlp": 0.0103916, + "balance_loss_clip": 1.02525389, + "balance_loss_mlp": 1.03985333, + "epoch": 0.45735758304524277, + "flos": 13771491528960.0, + "grad_norm": 4.395321075422478, + "language_loss": 0.7975688, + "learning_rate": 2.371135293099262e-06, + "loss": 0.81911278, + "num_input_tokens_seen": 163180700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7607, + "time_per_iteration": 2.500666618347168 + }, + { + "auxiliary_loss_clip": 0.01117549, + "auxiliary_loss_mlp": 0.01042987, + "balance_loss_clip": 1.0295403, + "balance_loss_mlp": 1.0436604, + "epoch": 0.45741770629791073, + "flos": 21100750986240.0, + "grad_norm": 2.5876510188713437, + "language_loss": 0.80827034, + "learning_rate": 2.3707525887518982e-06, + "loss": 0.82987565, + "num_input_tokens_seen": 163199450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7608, + "time_per_iteration": 2.454738140106201 + }, + { + "auxiliary_loss_clip": 0.0111299, + "auxiliary_loss_mlp": 0.01040349, + "balance_loss_clip": 1.02624631, + "balance_loss_mlp": 1.03830588, + "epoch": 0.4574778295505787, + "flos": 23112898035840.0, + "grad_norm": 1.6879461416077837, + "language_loss": 0.68500757, + "learning_rate": 2.370369870345559e-06, + "loss": 0.70654094, + "num_input_tokens_seen": 163217875, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7609, + "time_per_iteration": 2.567387580871582 + }, + { + "auxiliary_loss_clip": 0.01113281, + "auxiliary_loss_mlp": 0.01039479, + "balance_loss_clip": 1.02609158, + "balance_loss_mlp": 1.03981042, + "epoch": 0.45753795280324666, + "flos": 24352929308160.0, + "grad_norm": 1.861126687806453, + "language_loss": 0.80749559, + "learning_rate": 2.369987137894757e-06, + "loss": 0.82902324, + "num_input_tokens_seen": 163237430, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7610, + "time_per_iteration": 2.5181450843811035 + }, + { + "auxiliary_loss_clip": 0.01115569, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.02122259, + "balance_loss_mlp": 1.04017019, + "epoch": 0.4575980760559146, + "flos": 16653789550080.0, + "grad_norm": 1.991436967054915, + "language_loss": 0.82063943, + "learning_rate": 2.3696043914140057e-06, + "loss": 0.84214383, + "num_input_tokens_seen": 163253905, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7611, + "time_per_iteration": 2.5181667804718018 + }, + { + "auxiliary_loss_clip": 0.01117824, + "auxiliary_loss_mlp": 0.01030256, + "balance_loss_clip": 1.01684475, + "balance_loss_mlp": 1.04256463, + "epoch": 0.4576581993085826, + "flos": 35911423912320.0, + "grad_norm": 1.7999257820591783, + "language_loss": 0.74032104, + "learning_rate": 2.369221630917819e-06, + "loss": 0.76180184, + "num_input_tokens_seen": 163274285, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 7612, + "time_per_iteration": 2.573192596435547 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01031239, + "balance_loss_clip": 1.01775634, + "balance_loss_mlp": 1.03739977, + "epoch": 0.45771832256125056, + "flos": 20080421251200.0, + "grad_norm": 1.4998899682115554, + "language_loss": 0.84958243, + "learning_rate": 2.368838856420711e-06, + "loss": 0.87100732, + "num_input_tokens_seen": 163293150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7613, + "time_per_iteration": 2.519374132156372 + }, + { + "auxiliary_loss_clip": 0.01113962, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01548696, + "balance_loss_mlp": 1.04007339, + "epoch": 0.4577784458139185, + "flos": 10744329957120.0, + "grad_norm": 2.119092433129462, + "language_loss": 0.75686407, + "learning_rate": 2.3684560679371965e-06, + "loss": 0.77829111, + "num_input_tokens_seen": 163310065, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7614, + "time_per_iteration": 2.435258388519287 + }, + { + "auxiliary_loss_clip": 0.01111665, + "auxiliary_loss_mlp": 0.01031457, + "balance_loss_clip": 1.01870763, + "balance_loss_mlp": 1.03973377, + "epoch": 0.4578385690665865, + "flos": 21907269014400.0, + "grad_norm": 1.4729553038511707, + "language_loss": 0.74797261, + "learning_rate": 2.368073265481791e-06, + "loss": 0.76940382, + "num_input_tokens_seen": 163329415, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7615, + "time_per_iteration": 2.4776275157928467 + }, + { + "auxiliary_loss_clip": 0.01037994, + "auxiliary_loss_mlp": 0.00999141, + "balance_loss_clip": 0.99766314, + "balance_loss_mlp": 1.01355577, + "epoch": 0.45789869231925445, + "flos": 64758286667520.0, + "grad_norm": 0.7822572530544061, + "language_loss": 0.57660586, + "learning_rate": 2.3676904490690105e-06, + "loss": 0.59697717, + "num_input_tokens_seen": 163385875, + "router_z_loss_clip": 0.01477051, + "router_z_loss_mlp": 0.24414062, + "step": 7616, + "time_per_iteration": 2.9986298084259033 + }, + { + "auxiliary_loss_clip": 0.01111756, + "auxiliary_loss_mlp": 0.01038669, + "balance_loss_clip": 1.0251503, + "balance_loss_mlp": 1.03939307, + "epoch": 0.4579588155719224, + "flos": 16144001775360.0, + "grad_norm": 1.5412759634284317, + "language_loss": 0.70953274, + "learning_rate": 2.3673076187133704e-06, + "loss": 0.73103696, + "num_input_tokens_seen": 163405170, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 7617, + "time_per_iteration": 2.514575958251953 + }, + { + "auxiliary_loss_clip": 0.01116383, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.04211044, + "epoch": 0.45801893882459044, + "flos": 21395541905280.0, + "grad_norm": 2.1003257335678245, + "language_loss": 0.76458549, + "learning_rate": 2.36692477442939e-06, + "loss": 0.78607446, + "num_input_tokens_seen": 163423155, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7618, + "time_per_iteration": 2.431196689605713 + }, + { + "auxiliary_loss_clip": 0.01118549, + "auxiliary_loss_mlp": 0.01044975, + "balance_loss_clip": 1.0323689, + "balance_loss_mlp": 1.0429455, + "epoch": 0.4580790620772584, + "flos": 19536554448000.0, + "grad_norm": 1.7069120237831286, + "language_loss": 0.76705682, + "learning_rate": 2.366541916231585e-06, + "loss": 0.788692, + "num_input_tokens_seen": 163442450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7619, + "time_per_iteration": 2.491133213043213 + }, + { + "auxiliary_loss_clip": 0.01112973, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02378964, + "balance_loss_mlp": 1.04174709, + "epoch": 0.45813918532992637, + "flos": 16581070465920.0, + "grad_norm": 1.9887034550999254, + "language_loss": 0.7175532, + "learning_rate": 2.366159044134473e-06, + "loss": 0.73904121, + "num_input_tokens_seen": 163459810, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7620, + "time_per_iteration": 2.429659366607666 + }, + { + "auxiliary_loss_clip": 0.0111009, + "auxiliary_loss_mlp": 0.01028718, + "balance_loss_clip": 1.01643384, + "balance_loss_mlp": 1.03828478, + "epoch": 0.45819930858259433, + "flos": 42230301701760.0, + "grad_norm": 2.3637648648526035, + "language_loss": 0.78374821, + "learning_rate": 2.3657761581525748e-06, + "loss": 0.80513632, + "num_input_tokens_seen": 163482970, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 7621, + "time_per_iteration": 2.69990611076355 + }, + { + "auxiliary_loss_clip": 0.01037733, + "auxiliary_loss_mlp": 0.01001998, + "balance_loss_clip": 1.00071096, + "balance_loss_mlp": 1.01315987, + "epoch": 0.4582594318352623, + "flos": 63714795638400.0, + "grad_norm": 0.7958411378428579, + "language_loss": 0.6499809, + "learning_rate": 2.3653932583004063e-06, + "loss": 0.67037821, + "num_input_tokens_seen": 163545330, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.24609375, + "step": 7622, + "time_per_iteration": 3.0476205348968506 + }, + { + "auxiliary_loss_clip": 0.01114449, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01667762, + "balance_loss_mlp": 1.04142582, + "epoch": 0.45831955508793026, + "flos": 26869979882880.0, + "grad_norm": 1.9256202714320767, + "language_loss": 0.79611146, + "learning_rate": 2.3650103445924903e-06, + "loss": 0.81755722, + "num_input_tokens_seen": 163564620, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7623, + "time_per_iteration": 2.547234535217285 + }, + { + "auxiliary_loss_clip": 0.01115872, + "auxiliary_loss_mlp": 0.01036985, + "balance_loss_clip": 1.02382421, + "balance_loss_mlp": 1.04050457, + "epoch": 0.45837967834059823, + "flos": 18733951002240.0, + "grad_norm": 1.996922752989922, + "language_loss": 0.70809233, + "learning_rate": 2.3646274170433452e-06, + "loss": 0.72962081, + "num_input_tokens_seen": 163581010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75390625, + "step": 7624, + "time_per_iteration": 2.442575693130493 + }, + { + "auxiliary_loss_clip": 0.01113872, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.01944637, + "balance_loss_mlp": 1.0383656, + "epoch": 0.4584398015932662, + "flos": 21178102924800.0, + "grad_norm": 2.876738245253823, + "language_loss": 0.7299192, + "learning_rate": 2.364244475667491e-06, + "loss": 0.75138104, + "num_input_tokens_seen": 163599955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7625, + "time_per_iteration": 2.53002667427063 + }, + { + "auxiliary_loss_clip": 0.01116016, + "auxiliary_loss_mlp": 0.01033086, + "balance_loss_clip": 1.02058113, + "balance_loss_mlp": 1.04226136, + "epoch": 0.45849992484593416, + "flos": 19790047704960.0, + "grad_norm": 3.1470354950748716, + "language_loss": 0.78132713, + "learning_rate": 2.363861520479451e-06, + "loss": 0.80281818, + "num_input_tokens_seen": 163618545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73828125, + "step": 7626, + "time_per_iteration": 2.4544708728790283 + }, + { + "auxiliary_loss_clip": 0.01117004, + "auxiliary_loss_mlp": 0.0103582, + "balance_loss_clip": 1.02270126, + "balance_loss_mlp": 1.04142714, + "epoch": 0.4585600480986021, + "flos": 18223265387520.0, + "grad_norm": 1.604401840334718, + "language_loss": 0.85191864, + "learning_rate": 2.3634785514937445e-06, + "loss": 0.87344688, + "num_input_tokens_seen": 163636055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7578125, + "step": 7627, + "time_per_iteration": 2.478867769241333 + }, + { + "auxiliary_loss_clip": 0.01117716, + "auxiliary_loss_mlp": 0.01036428, + "balance_loss_clip": 1.02293992, + "balance_loss_mlp": 1.04074025, + "epoch": 0.4586201713512701, + "flos": 29022213974400.0, + "grad_norm": 1.506714204397822, + "language_loss": 0.69413865, + "learning_rate": 2.3630955687248953e-06, + "loss": 0.71568, + "num_input_tokens_seen": 163657485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.76953125, + "step": 7628, + "time_per_iteration": 2.5127782821655273 + }, + { + "auxiliary_loss_clip": 0.01113376, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01654255, + "balance_loss_mlp": 1.04060626, + "epoch": 0.45868029460393805, + "flos": 23404600385280.0, + "grad_norm": 1.5379008002675938, + "language_loss": 0.78294545, + "learning_rate": 2.3627125721874265e-06, + "loss": 0.8043794, + "num_input_tokens_seen": 163676030, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7629, + "time_per_iteration": 2.4944000244140625 + }, + { + "auxiliary_loss_clip": 0.0111907, + "auxiliary_loss_mlp": 0.01039687, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.04031289, + "epoch": 0.458740417856606, + "flos": 18221972497920.0, + "grad_norm": 2.0009780664883223, + "language_loss": 0.79405141, + "learning_rate": 2.3623295618958595e-06, + "loss": 0.81563896, + "num_input_tokens_seen": 163694490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7890625, + "step": 7630, + "time_per_iteration": 2.443598747253418 + }, + { + "auxiliary_loss_clip": 0.0111732, + "auxiliary_loss_mlp": 0.01034928, + "balance_loss_clip": 1.02108812, + "balance_loss_mlp": 1.03952336, + "epoch": 0.458800541109274, + "flos": 34568760504960.0, + "grad_norm": 1.67887072973593, + "language_loss": 0.71819407, + "learning_rate": 2.3619465378647198e-06, + "loss": 0.73971653, + "num_input_tokens_seen": 163717035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7631, + "time_per_iteration": 2.613935708999634 + }, + { + "auxiliary_loss_clip": 0.01118321, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02248299, + "balance_loss_mlp": 1.04306722, + "epoch": 0.458860664361942, + "flos": 17712112896000.0, + "grad_norm": 2.655938907200588, + "language_loss": 0.71337265, + "learning_rate": 2.361563500108531e-06, + "loss": 0.7349205, + "num_input_tokens_seen": 163734525, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75390625, + "step": 7632, + "time_per_iteration": 2.4854414463043213 + }, + { + "auxiliary_loss_clip": 0.01118604, + "auxiliary_loss_mlp": 0.01033523, + "balance_loss_clip": 1.0190748, + "balance_loss_mlp": 1.04055059, + "epoch": 0.45892078761460997, + "flos": 18441889516800.0, + "grad_norm": 15.51679170955813, + "language_loss": 0.69212449, + "learning_rate": 2.3611804486418178e-06, + "loss": 0.71364582, + "num_input_tokens_seen": 163752860, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.78125, + "step": 7633, + "time_per_iteration": 2.488741874694824 + }, + { + "auxiliary_loss_clip": 0.01115341, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02366996, + "balance_loss_mlp": 1.04068875, + "epoch": 0.45898091086727794, + "flos": 22672956257280.0, + "grad_norm": 1.4724338826500494, + "language_loss": 0.80777454, + "learning_rate": 2.3607973834791062e-06, + "loss": 0.82929468, + "num_input_tokens_seen": 163772495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.74609375, + "step": 7634, + "time_per_iteration": 2.4676551818847656 + }, + { + "auxiliary_loss_clip": 0.01118954, + "auxiliary_loss_mlp": 0.0103355, + "balance_loss_clip": 1.0188632, + "balance_loss_mlp": 1.04032791, + "epoch": 0.4590410341199459, + "flos": 21652949744640.0, + "grad_norm": 1.9575518559569576, + "language_loss": 0.81853092, + "learning_rate": 2.3604143046349216e-06, + "loss": 0.84005594, + "num_input_tokens_seen": 163791475, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.78515625, + "step": 7635, + "time_per_iteration": 2.513383150100708 + }, + { + "auxiliary_loss_clip": 0.01112964, + "auxiliary_loss_mlp": 0.01040022, + "balance_loss_clip": 1.02696204, + "balance_loss_mlp": 1.04045606, + "epoch": 0.45910115737261387, + "flos": 36535372087680.0, + "grad_norm": 1.4265799385965707, + "language_loss": 0.64948833, + "learning_rate": 2.3600312121237905e-06, + "loss": 0.67101824, + "num_input_tokens_seen": 163812995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7636, + "time_per_iteration": 4.062237501144409 + }, + { + "auxiliary_loss_clip": 0.01114223, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01690328, + "balance_loss_mlp": 1.04186797, + "epoch": 0.45916128062528183, + "flos": 24419866302720.0, + "grad_norm": 1.4568741521374282, + "language_loss": 0.80726147, + "learning_rate": 2.3596481059602395e-06, + "loss": 0.82869971, + "num_input_tokens_seen": 163833945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7637, + "time_per_iteration": 4.017204999923706 + }, + { + "auxiliary_loss_clip": 0.011204, + "auxiliary_loss_mlp": 0.01037267, + "balance_loss_clip": 1.02297974, + "balance_loss_mlp": 1.0438447, + "epoch": 0.4592214038779498, + "flos": 23221958705280.0, + "grad_norm": 1.56098785708404, + "language_loss": 0.75311542, + "learning_rate": 2.3592649861587965e-06, + "loss": 0.77469212, + "num_input_tokens_seen": 163853885, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 7638, + "time_per_iteration": 2.4801623821258545 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01033656, + "balance_loss_clip": 1.02054262, + "balance_loss_mlp": 1.04093051, + "epoch": 0.45928152713061776, + "flos": 19172133014400.0, + "grad_norm": 1.6757486640396035, + "language_loss": 0.74225289, + "learning_rate": 2.358881852733989e-06, + "loss": 0.76372278, + "num_input_tokens_seen": 163871855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7639, + "time_per_iteration": 2.457977294921875 + }, + { + "auxiliary_loss_clip": 0.0111659, + "auxiliary_loss_mlp": 0.01034149, + "balance_loss_clip": 1.02073193, + "balance_loss_mlp": 1.0410862, + "epoch": 0.4593416503832857, + "flos": 22414686491520.0, + "grad_norm": 2.7996676169839856, + "language_loss": 0.68441081, + "learning_rate": 2.358498705700346e-06, + "loss": 0.70591819, + "num_input_tokens_seen": 163891450, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 7640, + "time_per_iteration": 2.4815306663513184 + }, + { + "auxiliary_loss_clip": 0.01116242, + "auxiliary_loss_mlp": 0.01039241, + "balance_loss_clip": 1.02532363, + "balance_loss_mlp": 1.03950286, + "epoch": 0.4594017736359537, + "flos": 18880215183360.0, + "grad_norm": 4.694339799219563, + "language_loss": 0.75290608, + "learning_rate": 2.3581155450723958e-06, + "loss": 0.77446091, + "num_input_tokens_seen": 163909345, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7641, + "time_per_iteration": 2.4738545417785645 + }, + { + "auxiliary_loss_clip": 0.0111703, + "auxiliary_loss_mlp": 0.01031975, + "balance_loss_clip": 1.0180217, + "balance_loss_mlp": 1.041008, + "epoch": 0.45946189688862166, + "flos": 20518567349760.0, + "grad_norm": 1.7266679695779108, + "language_loss": 0.74649787, + "learning_rate": 2.357732370864668e-06, + "loss": 0.76798791, + "num_input_tokens_seen": 163926940, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76171875, + "step": 7642, + "time_per_iteration": 2.474160671234131 + }, + { + "auxiliary_loss_clip": 0.01036998, + "auxiliary_loss_mlp": 0.00999788, + "balance_loss_clip": 0.99855977, + "balance_loss_mlp": 1.01273584, + "epoch": 0.4595220201412896, + "flos": 61405990162560.0, + "grad_norm": 0.8383581259748949, + "language_loss": 0.58191991, + "learning_rate": 2.357349183091694e-06, + "loss": 0.60228777, + "num_input_tokens_seen": 163977785, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.2421875, + "step": 7643, + "time_per_iteration": 2.810622453689575 + }, + { + "auxiliary_loss_clip": 0.01118319, + "auxiliary_loss_mlp": 0.01036506, + "balance_loss_clip": 1.02267814, + "balance_loss_mlp": 1.03810704, + "epoch": 0.4595821433939576, + "flos": 23330947547520.0, + "grad_norm": 1.5583198955297553, + "language_loss": 0.92945647, + "learning_rate": 2.3569659817680016e-06, + "loss": 0.95100462, + "num_input_tokens_seen": 163996630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.80078125, + "step": 7644, + "time_per_iteration": 2.4740004539489746 + }, + { + "auxiliary_loss_clip": 0.01116764, + "auxiliary_loss_mlp": 0.01038017, + "balance_loss_clip": 1.02458835, + "balance_loss_mlp": 1.04016256, + "epoch": 0.4596422666466256, + "flos": 14282356711680.0, + "grad_norm": 1.923875093759249, + "language_loss": 0.8283661, + "learning_rate": 2.3565827669081243e-06, + "loss": 0.8499139, + "num_input_tokens_seen": 164013190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.765625, + "step": 7645, + "time_per_iteration": 2.459575891494751 + }, + { + "auxiliary_loss_clip": 0.01035246, + "auxiliary_loss_mlp": 0.00999372, + "balance_loss_clip": 0.99805516, + "balance_loss_mlp": 1.0108279, + "epoch": 0.4597023898992936, + "flos": 65727337737600.0, + "grad_norm": 0.7553504929083139, + "language_loss": 0.59931064, + "learning_rate": 2.356199538526593e-06, + "loss": 0.6196568, + "num_input_tokens_seen": 164074030, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.24414062, + "step": 7646, + "time_per_iteration": 3.0040318965911865 + }, + { + "auxiliary_loss_clip": 0.01116678, + "auxiliary_loss_mlp": 0.01032804, + "balance_loss_clip": 1.01953018, + "balance_loss_mlp": 1.04043436, + "epoch": 0.45976251315196154, + "flos": 26907075653760.0, + "grad_norm": 1.6094604606837348, + "language_loss": 0.72804034, + "learning_rate": 2.355816296637939e-06, + "loss": 0.74953508, + "num_input_tokens_seen": 164095515, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.76171875, + "step": 7647, + "time_per_iteration": 2.539550304412842 + }, + { + "auxiliary_loss_clip": 0.01114997, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.02135134, + "balance_loss_mlp": 1.03845108, + "epoch": 0.4598226364046295, + "flos": 26618066824320.0, + "grad_norm": 1.5906503149252664, + "language_loss": 0.66864169, + "learning_rate": 2.3554330412566957e-06, + "loss": 0.69013917, + "num_input_tokens_seen": 164117270, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7648, + "time_per_iteration": 2.538694143295288 + }, + { + "auxiliary_loss_clip": 0.01112764, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.01969171, + "balance_loss_mlp": 1.03751159, + "epoch": 0.45988275965729747, + "flos": 24387762522240.0, + "grad_norm": 1.4797855079557312, + "language_loss": 0.78785735, + "learning_rate": 2.3550497723973953e-06, + "loss": 0.80931914, + "num_input_tokens_seen": 164137850, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 7649, + "time_per_iteration": 2.5164248943328857 + }, + { + "auxiliary_loss_clip": 0.01113076, + "auxiliary_loss_mlp": 0.01037179, + "balance_loss_clip": 1.02412558, + "balance_loss_mlp": 1.03840113, + "epoch": 0.45994288290996543, + "flos": 24535822383360.0, + "grad_norm": 3.1550947466117303, + "language_loss": 0.69324255, + "learning_rate": 2.3546664900745726e-06, + "loss": 0.7147451, + "num_input_tokens_seen": 164157960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7650, + "time_per_iteration": 2.5182442665100098 + }, + { + "auxiliary_loss_clip": 0.01118739, + "auxiliary_loss_mlp": 0.01039502, + "balance_loss_clip": 1.0245893, + "balance_loss_mlp": 1.03925538, + "epoch": 0.4600030061626334, + "flos": 14830245838080.0, + "grad_norm": 1.968615763904363, + "language_loss": 0.83896518, + "learning_rate": 2.354283194302761e-06, + "loss": 0.86054754, + "num_input_tokens_seen": 164174590, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.79296875, + "step": 7651, + "time_per_iteration": 2.4545249938964844 + }, + { + "auxiliary_loss_clip": 0.01114537, + "auxiliary_loss_mlp": 0.01030219, + "balance_loss_clip": 1.01685548, + "balance_loss_mlp": 1.04122114, + "epoch": 0.46006312941530136, + "flos": 18113845582080.0, + "grad_norm": 2.1703456469435944, + "language_loss": 0.75375223, + "learning_rate": 2.3538998850964948e-06, + "loss": 0.77519977, + "num_input_tokens_seen": 164192935, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7652, + "time_per_iteration": 2.4435648918151855 + }, + { + "auxiliary_loss_clip": 0.01113746, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01611495, + "balance_loss_mlp": 1.03735042, + "epoch": 0.46012325266796933, + "flos": 21976468565760.0, + "grad_norm": 1.8091521205399639, + "language_loss": 0.75805604, + "learning_rate": 2.3535165624703097e-06, + "loss": 0.77949333, + "num_input_tokens_seen": 164213160, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7653, + "time_per_iteration": 2.530977487564087 + }, + { + "auxiliary_loss_clip": 0.01121671, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.02338028, + "balance_loss_mlp": 1.04202819, + "epoch": 0.4601833759206373, + "flos": 15268068714240.0, + "grad_norm": 2.3598469293633584, + "language_loss": 0.6584686, + "learning_rate": 2.353133226438741e-06, + "loss": 0.68007028, + "num_input_tokens_seen": 164229330, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.796875, + "step": 7654, + "time_per_iteration": 2.3942883014678955 + }, + { + "auxiliary_loss_clip": 0.01112793, + "auxiliary_loss_mlp": 0.0103367, + "balance_loss_clip": 1.02026534, + "balance_loss_mlp": 1.0375098, + "epoch": 0.46024349917330526, + "flos": 27088999061760.0, + "grad_norm": 1.647085409720671, + "language_loss": 0.79088843, + "learning_rate": 2.3527498770163248e-06, + "loss": 0.81235307, + "num_input_tokens_seen": 164248240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75390625, + "step": 7655, + "time_per_iteration": 2.5213396549224854 + }, + { + "auxiliary_loss_clip": 0.01110004, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01755643, + "balance_loss_mlp": 1.03802609, + "epoch": 0.4603036224259732, + "flos": 24462923731200.0, + "grad_norm": 2.0582079675710134, + "language_loss": 0.67502171, + "learning_rate": 2.3523665142175985e-06, + "loss": 0.69642866, + "num_input_tokens_seen": 164268020, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7656, + "time_per_iteration": 2.4714531898498535 + }, + { + "auxiliary_loss_clip": 0.01112759, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01965153, + "balance_loss_mlp": 1.03784871, + "epoch": 0.4603637456786412, + "flos": 28109292883200.0, + "grad_norm": 1.7896797448491664, + "language_loss": 0.81050038, + "learning_rate": 2.351983138057098e-06, + "loss": 0.83195299, + "num_input_tokens_seen": 164287305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.75, + "step": 7657, + "time_per_iteration": 2.549114227294922 + }, + { + "auxiliary_loss_clip": 0.01113625, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.01767325, + "balance_loss_mlp": 1.03843951, + "epoch": 0.4604238689313092, + "flos": 24348942898560.0, + "grad_norm": 2.212167065380131, + "language_loss": 0.70071685, + "learning_rate": 2.3515997485493623e-06, + "loss": 0.72216856, + "num_input_tokens_seen": 164306835, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7658, + "time_per_iteration": 2.4548964500427246 + }, + { + "auxiliary_loss_clip": 0.0103337, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00179565, + "balance_loss_mlp": 1.00924027, + "epoch": 0.4604839921839772, + "flos": 53606229431040.0, + "grad_norm": 0.9542906494873047, + "language_loss": 0.62159562, + "learning_rate": 2.351216345708928e-06, + "loss": 0.64195925, + "num_input_tokens_seen": 164367095, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.2421875, + "step": 7659, + "time_per_iteration": 3.194460153579712 + }, + { + "auxiliary_loss_clip": 0.01114248, + "auxiliary_loss_mlp": 0.0103108, + "balance_loss_clip": 1.01774633, + "balance_loss_mlp": 1.04089022, + "epoch": 0.46054411543664514, + "flos": 31248424126080.0, + "grad_norm": 2.0710979138047123, + "language_loss": 0.68395913, + "learning_rate": 2.350832929550336e-06, + "loss": 0.70541239, + "num_input_tokens_seen": 164388895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 7660, + "time_per_iteration": 2.5212934017181396 + }, + { + "auxiliary_loss_clip": 0.01112449, + "auxiliary_loss_mlp": 0.01041428, + "balance_loss_clip": 1.02767086, + "balance_loss_mlp": 1.03826356, + "epoch": 0.4606042386893131, + "flos": 24092863862400.0, + "grad_norm": 1.7599753910943126, + "language_loss": 0.76785183, + "learning_rate": 2.3504495000881227e-06, + "loss": 0.78939056, + "num_input_tokens_seen": 164409080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7661, + "time_per_iteration": 2.504199981689453 + }, + { + "auxiliary_loss_clip": 0.01111854, + "auxiliary_loss_mlp": 0.0103438, + "balance_loss_clip": 1.02109385, + "balance_loss_mlp": 1.03997183, + "epoch": 0.46066436194198107, + "flos": 26578457101440.0, + "grad_norm": 1.743819837097498, + "language_loss": 0.74565995, + "learning_rate": 2.3500660573368305e-06, + "loss": 0.76712227, + "num_input_tokens_seen": 164427585, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7662, + "time_per_iteration": 2.479710817337036 + }, + { + "auxiliary_loss_clip": 0.01118488, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.01835489, + "balance_loss_mlp": 1.03899062, + "epoch": 0.46072448519464904, + "flos": 17775602184960.0, + "grad_norm": 2.744789888238294, + "language_loss": 0.78880358, + "learning_rate": 2.349682601310998e-06, + "loss": 0.81031454, + "num_input_tokens_seen": 164438455, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.796875, + "step": 7663, + "time_per_iteration": 2.433105230331421 + }, + { + "auxiliary_loss_clip": 0.01110139, + "auxiliary_loss_mlp": 0.01035881, + "balance_loss_clip": 1.02286935, + "balance_loss_mlp": 1.03860092, + "epoch": 0.460784608447317, + "flos": 15086109392640.0, + "grad_norm": 1.8568277173945746, + "language_loss": 0.73164225, + "learning_rate": 2.3492991320251653e-06, + "loss": 0.75310248, + "num_input_tokens_seen": 164456830, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 7664, + "time_per_iteration": 2.4182069301605225 + }, + { + "auxiliary_loss_clip": 0.01114696, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02064347, + "balance_loss_mlp": 1.040645, + "epoch": 0.46084473169998497, + "flos": 18588261438720.0, + "grad_norm": 1.6231584574242337, + "language_loss": 0.72039741, + "learning_rate": 2.3489156494938753e-06, + "loss": 0.74187809, + "num_input_tokens_seen": 164475375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7665, + "time_per_iteration": 2.4458460807800293 + }, + { + "auxiliary_loss_clip": 0.01115054, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01965141, + "balance_loss_mlp": 1.03982568, + "epoch": 0.46090485495265293, + "flos": 19494789909120.0, + "grad_norm": 1.8683756247621939, + "language_loss": 0.78134775, + "learning_rate": 2.348532153731669e-06, + "loss": 0.80282086, + "num_input_tokens_seen": 164492040, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.75390625, + "step": 7666, + "time_per_iteration": 2.4217963218688965 + }, + { + "auxiliary_loss_clip": 0.01112281, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02005553, + "balance_loss_mlp": 1.03926802, + "epoch": 0.4609649782053209, + "flos": 33364927163520.0, + "grad_norm": 1.2927592404362929, + "language_loss": 0.73972279, + "learning_rate": 2.348148644753088e-06, + "loss": 0.76118922, + "num_input_tokens_seen": 164513665, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73046875, + "step": 7667, + "time_per_iteration": 2.586657762527466 + }, + { + "auxiliary_loss_clip": 0.0111122, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01803541, + "balance_loss_mlp": 1.03743756, + "epoch": 0.46102510145798886, + "flos": 23769165473280.0, + "grad_norm": 1.3923437909363505, + "language_loss": 0.75857067, + "learning_rate": 2.347765122572676e-06, + "loss": 0.77998888, + "num_input_tokens_seen": 164533890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 7668, + "time_per_iteration": 2.456688642501831 + }, + { + "auxiliary_loss_clip": 0.01112338, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01699305, + "balance_loss_mlp": 1.04143405, + "epoch": 0.4610852247106568, + "flos": 23294821443840.0, + "grad_norm": 2.015120719246451, + "language_loss": 0.77794099, + "learning_rate": 2.347381587204975e-06, + "loss": 0.79935884, + "num_input_tokens_seen": 164553815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 7669, + "time_per_iteration": 2.503912925720215 + }, + { + "auxiliary_loss_clip": 0.01112792, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01688588, + "balance_loss_mlp": 1.03798747, + "epoch": 0.4611453479633248, + "flos": 25447450584960.0, + "grad_norm": 1.8162494299938103, + "language_loss": 0.82330608, + "learning_rate": 2.34699803866453e-06, + "loss": 0.84473014, + "num_input_tokens_seen": 164573125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.75, + "step": 7670, + "time_per_iteration": 2.481456995010376 + }, + { + "auxiliary_loss_clip": 0.01111476, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01781416, + "balance_loss_mlp": 1.03845906, + "epoch": 0.4612054712159928, + "flos": 21139606523520.0, + "grad_norm": 1.6076372414606255, + "language_loss": 0.63204038, + "learning_rate": 2.3466144769658845e-06, + "loss": 0.6534636, + "num_input_tokens_seen": 164592575, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7671, + "time_per_iteration": 2.4743082523345947 + }, + { + "auxiliary_loss_clip": 0.01034608, + "auxiliary_loss_mlp": 0.01007042, + "balance_loss_clip": 1.00571287, + "balance_loss_mlp": 1.01008546, + "epoch": 0.4612655944686608, + "flos": 69959266404480.0, + "grad_norm": 0.6877278401983052, + "language_loss": 0.55879581, + "learning_rate": 2.346230902123583e-06, + "loss": 0.57921231, + "num_input_tokens_seen": 164659795, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.24609375, + "step": 7672, + "time_per_iteration": 3.15800142288208 + }, + { + "auxiliary_loss_clip": 0.0111558, + "auxiliary_loss_mlp": 0.01035904, + "balance_loss_clip": 1.02255249, + "balance_loss_mlp": 1.04003441, + "epoch": 0.46132571772132874, + "flos": 16837149502080.0, + "grad_norm": 1.8329231831015789, + "language_loss": 0.70920408, + "learning_rate": 2.3458473141521715e-06, + "loss": 0.73071891, + "num_input_tokens_seen": 164678735, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7673, + "time_per_iteration": 2.4639430046081543 + }, + { + "auxiliary_loss_clip": 0.01112366, + "auxiliary_loss_mlp": 0.01034204, + "balance_loss_clip": 1.02145457, + "balance_loss_mlp": 1.04083312, + "epoch": 0.4613858409739967, + "flos": 35808935431680.0, + "grad_norm": 1.6780898708072003, + "language_loss": 0.70402145, + "learning_rate": 2.345463713066195e-06, + "loss": 0.72548711, + "num_input_tokens_seen": 164700885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 7674, + "time_per_iteration": 2.5660369396209717 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01037491, + "balance_loss_clip": 1.02384138, + "balance_loss_mlp": 1.03684926, + "epoch": 0.4614459642266647, + "flos": 35266756567680.0, + "grad_norm": 1.5790047103218752, + "language_loss": 0.65408182, + "learning_rate": 2.3450800988801996e-06, + "loss": 0.67557311, + "num_input_tokens_seen": 164726960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75, + "step": 7675, + "time_per_iteration": 2.616771697998047 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.010075, + "balance_loss_clip": 1.00611675, + "balance_loss_mlp": 1.01053035, + "epoch": 0.46150608747933264, + "flos": 66704610044160.0, + "grad_norm": 0.7425701763607123, + "language_loss": 0.58600932, + "learning_rate": 2.3446964716087327e-06, + "loss": 0.60643393, + "num_input_tokens_seen": 164788525, + "router_z_loss_clip": 0.01385498, + "router_z_loss_mlp": 0.24511719, + "step": 7676, + "time_per_iteration": 3.09281325340271 + }, + { + "auxiliary_loss_clip": 0.01034023, + "auxiliary_loss_mlp": 0.01002968, + "balance_loss_clip": 1.00172222, + "balance_loss_mlp": 1.00993788, + "epoch": 0.4615662107320006, + "flos": 55830177025920.0, + "grad_norm": 0.7891273111868267, + "language_loss": 0.62684548, + "learning_rate": 2.344312831266341e-06, + "loss": 0.64721537, + "num_input_tokens_seen": 164843525, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.24121094, + "step": 7677, + "time_per_iteration": 2.9087297916412354 + }, + { + "auxiliary_loss_clip": 0.01112185, + "auxiliary_loss_mlp": 0.01031192, + "balance_loss_clip": 1.018502, + "balance_loss_mlp": 1.03929043, + "epoch": 0.46162633398466857, + "flos": 15483245137920.0, + "grad_norm": 2.8566258545012464, + "language_loss": 0.76442772, + "learning_rate": 2.3439291778675718e-06, + "loss": 0.78586149, + "num_input_tokens_seen": 164859895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7678, + "time_per_iteration": 3.80979061126709 + }, + { + "auxiliary_loss_clip": 0.01115647, + "auxiliary_loss_mlp": 0.01035074, + "balance_loss_clip": 1.02148438, + "balance_loss_mlp": 1.04122365, + "epoch": 0.46168645723733653, + "flos": 20011437181440.0, + "grad_norm": 1.9875640695173902, + "language_loss": 0.66738796, + "learning_rate": 2.343545511426974e-06, + "loss": 0.68889523, + "num_input_tokens_seen": 164878030, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 7679, + "time_per_iteration": 5.473088502883911 + }, + { + "auxiliary_loss_clip": 0.01112323, + "auxiliary_loss_mlp": 0.01038079, + "balance_loss_clip": 1.02563381, + "balance_loss_mlp": 1.03913581, + "epoch": 0.4617465804900045, + "flos": 20298542590080.0, + "grad_norm": 1.9247599304086902, + "language_loss": 0.69658661, + "learning_rate": 2.3431618319590963e-06, + "loss": 0.71809065, + "num_input_tokens_seen": 164895710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 7680, + "time_per_iteration": 2.435971736907959 + }, + { + "auxiliary_loss_clip": 0.01121586, + "auxiliary_loss_mlp": 0.01041647, + "balance_loss_clip": 1.02805138, + "balance_loss_mlp": 1.04467559, + "epoch": 0.46180670374267246, + "flos": 22346312952960.0, + "grad_norm": 3.979685754880411, + "language_loss": 0.63813865, + "learning_rate": 2.342778139478487e-06, + "loss": 0.65977097, + "num_input_tokens_seen": 164913365, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.76953125, + "step": 7681, + "time_per_iteration": 2.486614942550659 + }, + { + "auxiliary_loss_clip": 0.01111536, + "auxiliary_loss_mlp": 0.01029983, + "balance_loss_clip": 1.01790738, + "balance_loss_mlp": 1.03925776, + "epoch": 0.46186682699534043, + "flos": 19895696582400.0, + "grad_norm": 1.518283771877835, + "language_loss": 0.66871607, + "learning_rate": 2.342394433999697e-06, + "loss": 0.69013125, + "num_input_tokens_seen": 164931620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7682, + "time_per_iteration": 2.434720516204834 + }, + { + "auxiliary_loss_clip": 0.01114783, + "auxiliary_loss_mlp": 0.01036687, + "balance_loss_clip": 1.02353811, + "balance_loss_mlp": 1.03967464, + "epoch": 0.4619269502480084, + "flos": 31503569408640.0, + "grad_norm": 2.2113144827233397, + "language_loss": 0.74337292, + "learning_rate": 2.342010715537275e-06, + "loss": 0.76488769, + "num_input_tokens_seen": 164950905, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7683, + "time_per_iteration": 2.532867908477783 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02046251, + "balance_loss_mlp": 1.04082799, + "epoch": 0.46198707350067636, + "flos": 25009484054400.0, + "grad_norm": 1.7237723920320163, + "language_loss": 0.76637614, + "learning_rate": 2.3416269841057726e-06, + "loss": 0.78784502, + "num_input_tokens_seen": 164970950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 7684, + "time_per_iteration": 2.4763615131378174 + }, + { + "auxiliary_loss_clip": 0.01121747, + "auxiliary_loss_mlp": 0.01039637, + "balance_loss_clip": 1.02557588, + "balance_loss_mlp": 1.04270399, + "epoch": 0.4620471967533444, + "flos": 18292357198080.0, + "grad_norm": 2.012138726469413, + "language_loss": 0.80012244, + "learning_rate": 2.3412432397197412e-06, + "loss": 0.82173628, + "num_input_tokens_seen": 164989855, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7890625, + "step": 7685, + "time_per_iteration": 2.467780113220215 + }, + { + "auxiliary_loss_clip": 0.01113653, + "auxiliary_loss_mlp": 0.01038402, + "balance_loss_clip": 1.02434742, + "balance_loss_mlp": 1.04206526, + "epoch": 0.46210732000601235, + "flos": 33985104410880.0, + "grad_norm": 2.0493507584177424, + "language_loss": 0.66546774, + "learning_rate": 2.340859482393731e-06, + "loss": 0.68698829, + "num_input_tokens_seen": 165012290, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 7686, + "time_per_iteration": 2.5675110816955566 + }, + { + "auxiliary_loss_clip": 0.01115805, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.01730859, + "balance_loss_mlp": 1.03924084, + "epoch": 0.4621674432586803, + "flos": 25009412227200.0, + "grad_norm": 2.0396518023333243, + "language_loss": 0.73831183, + "learning_rate": 2.340475712142296e-06, + "loss": 0.75978148, + "num_input_tokens_seen": 165030810, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.765625, + "step": 7687, + "time_per_iteration": 2.5077569484710693 + }, + { + "auxiliary_loss_clip": 0.01113947, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01686668, + "balance_loss_mlp": 1.04119587, + "epoch": 0.4622275665113483, + "flos": 22014031213440.0, + "grad_norm": 2.1950912061668784, + "language_loss": 0.74758142, + "learning_rate": 2.3400919289799873e-06, + "loss": 0.76902628, + "num_input_tokens_seen": 165050205, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 7688, + "time_per_iteration": 2.4487764835357666 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.01035944, + "balance_loss_clip": 1.0214963, + "balance_loss_mlp": 1.03912246, + "epoch": 0.46228768976401624, + "flos": 24058820747520.0, + "grad_norm": 1.6667608580722473, + "language_loss": 0.78718561, + "learning_rate": 2.3397081329213585e-06, + "loss": 0.80867392, + "num_input_tokens_seen": 165069370, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 7689, + "time_per_iteration": 2.504210948944092 + }, + { + "auxiliary_loss_clip": 0.01118414, + "auxiliary_loss_mlp": 0.01040294, + "balance_loss_clip": 1.02561891, + "balance_loss_mlp": 1.04086494, + "epoch": 0.4623478130166842, + "flos": 26651391667200.0, + "grad_norm": 3.5840156670541448, + "language_loss": 0.56649667, + "learning_rate": 2.339324323980964e-06, + "loss": 0.58808374, + "num_input_tokens_seen": 165089610, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 7690, + "time_per_iteration": 2.4970550537109375 + }, + { + "auxiliary_loss_clip": 0.01113577, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02076888, + "balance_loss_mlp": 1.03844917, + "epoch": 0.46240793626935217, + "flos": 20558428467840.0, + "grad_norm": 2.2671044925643202, + "language_loss": 0.82513797, + "learning_rate": 2.3389405021733562e-06, + "loss": 0.84662223, + "num_input_tokens_seen": 165109050, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 7691, + "time_per_iteration": 2.4712584018707275 + }, + { + "auxiliary_loss_clip": 0.01115177, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01502824, + "balance_loss_mlp": 1.04124403, + "epoch": 0.46246805952202014, + "flos": 22456055980800.0, + "grad_norm": 1.513473472081282, + "language_loss": 0.75326777, + "learning_rate": 2.338556667513091e-06, + "loss": 0.77470076, + "num_input_tokens_seen": 165130130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7692, + "time_per_iteration": 2.462574005126953 + }, + { + "auxiliary_loss_clip": 0.01117023, + "auxiliary_loss_mlp": 0.01036723, + "balance_loss_clip": 1.0225668, + "balance_loss_mlp": 1.04110909, + "epoch": 0.4625281827746881, + "flos": 35041308854400.0, + "grad_norm": 4.10345040195295, + "language_loss": 0.74055338, + "learning_rate": 2.338172820014723e-06, + "loss": 0.76209086, + "num_input_tokens_seen": 165152685, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7693, + "time_per_iteration": 2.578394889831543 + }, + { + "auxiliary_loss_clip": 0.01114967, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02170396, + "balance_loss_mlp": 1.04132485, + "epoch": 0.46258830602735607, + "flos": 21068647205760.0, + "grad_norm": 1.5049695528407014, + "language_loss": 0.85576218, + "learning_rate": 2.337788959692808e-06, + "loss": 0.87726343, + "num_input_tokens_seen": 165173315, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7694, + "time_per_iteration": 2.447938919067383 + }, + { + "auxiliary_loss_clip": 0.01116538, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02379656, + "balance_loss_mlp": 1.04131126, + "epoch": 0.46264842928002403, + "flos": 26177227205760.0, + "grad_norm": 2.103971064334481, + "language_loss": 0.78631961, + "learning_rate": 2.337405086561902e-06, + "loss": 0.80785489, + "num_input_tokens_seen": 165192395, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 7695, + "time_per_iteration": 2.510712146759033 + }, + { + "auxiliary_loss_clip": 0.01110008, + "auxiliary_loss_mlp": 0.01034157, + "balance_loss_clip": 1.021294, + "balance_loss_mlp": 1.0382899, + "epoch": 0.462708552532692, + "flos": 16764214936320.0, + "grad_norm": 1.7164209999926379, + "language_loss": 0.72215033, + "learning_rate": 2.3370212006365606e-06, + "loss": 0.74359202, + "num_input_tokens_seen": 165211355, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7696, + "time_per_iteration": 2.427879571914673 + }, + { + "auxiliary_loss_clip": 0.01116967, + "auxiliary_loss_mlp": 0.01044874, + "balance_loss_clip": 1.03040195, + "balance_loss_mlp": 1.04200339, + "epoch": 0.46276867578535996, + "flos": 15560453422080.0, + "grad_norm": 1.7618442658513396, + "language_loss": 0.69068033, + "learning_rate": 2.3366373019313423e-06, + "loss": 0.71229875, + "num_input_tokens_seen": 165229380, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.75, + "step": 7697, + "time_per_iteration": 2.4759252071380615 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.0421176, + "epoch": 0.462828799038028, + "flos": 22415404763520.0, + "grad_norm": 1.7059169761391482, + "language_loss": 0.84603721, + "learning_rate": 2.3362533904608025e-06, + "loss": 0.8674916, + "num_input_tokens_seen": 165247200, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7698, + "time_per_iteration": 2.4416439533233643 + }, + { + "auxiliary_loss_clip": 0.01114129, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02094316, + "balance_loss_mlp": 1.04008198, + "epoch": 0.46288892229069595, + "flos": 21069580959360.0, + "grad_norm": 2.2131790671554894, + "language_loss": 0.71495068, + "learning_rate": 2.335869466239502e-06, + "loss": 0.73643124, + "num_input_tokens_seen": 165265825, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7699, + "time_per_iteration": 2.477674722671509 + }, + { + "auxiliary_loss_clip": 0.01115631, + "auxiliary_loss_mlp": 0.01036078, + "balance_loss_clip": 1.02183843, + "balance_loss_mlp": 1.03854418, + "epoch": 0.4629490455433639, + "flos": 23185688947200.0, + "grad_norm": 1.667240614809052, + "language_loss": 0.7189334, + "learning_rate": 2.335485529281996e-06, + "loss": 0.7404505, + "num_input_tokens_seen": 165284380, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7700, + "time_per_iteration": 2.4664909839630127 + }, + { + "auxiliary_loss_clip": 0.01113806, + "auxiliary_loss_mlp": 0.01036044, + "balance_loss_clip": 1.0229491, + "balance_loss_mlp": 1.04012191, + "epoch": 0.4630091687960319, + "flos": 18835541642880.0, + "grad_norm": 1.9820544405348388, + "language_loss": 0.7245025, + "learning_rate": 2.3351015796028467e-06, + "loss": 0.74600095, + "num_input_tokens_seen": 165300320, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 7701, + "time_per_iteration": 2.4769680500030518 + }, + { + "auxiliary_loss_clip": 0.01117689, + "auxiliary_loss_mlp": 0.01035148, + "balance_loss_clip": 1.02129054, + "balance_loss_mlp": 1.04037929, + "epoch": 0.46306929204869984, + "flos": 38907020407680.0, + "grad_norm": 1.837243395087381, + "language_loss": 0.64583158, + "learning_rate": 2.3347176172166114e-06, + "loss": 0.66735995, + "num_input_tokens_seen": 165318130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7702, + "time_per_iteration": 2.5806338787078857 + }, + { + "auxiliary_loss_clip": 0.0111042, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01753259, + "balance_loss_mlp": 1.03832746, + "epoch": 0.4631294153013678, + "flos": 19644178573440.0, + "grad_norm": 1.912512853345874, + "language_loss": 0.73265111, + "learning_rate": 2.33433364213785e-06, + "loss": 0.7540592, + "num_input_tokens_seen": 165336225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7703, + "time_per_iteration": 2.482374429702759 + }, + { + "auxiliary_loss_clip": 0.01119217, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01882708, + "balance_loss_mlp": 1.04163849, + "epoch": 0.4631895385540358, + "flos": 24608254158720.0, + "grad_norm": 1.555397834218836, + "language_loss": 0.68780202, + "learning_rate": 2.3339496543811243e-06, + "loss": 0.70932484, + "num_input_tokens_seen": 165355005, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 7704, + "time_per_iteration": 2.4661428928375244 + }, + { + "auxiliary_loss_clip": 0.01118717, + "auxiliary_loss_mlp": 0.01027068, + "balance_loss_clip": 1.01313281, + "balance_loss_mlp": 1.04138649, + "epoch": 0.46324966180670374, + "flos": 26320115508480.0, + "grad_norm": 4.360671756910266, + "language_loss": 0.80963224, + "learning_rate": 2.3335656539609934e-06, + "loss": 0.83109009, + "num_input_tokens_seen": 165374910, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7705, + "time_per_iteration": 2.5129587650299072 + }, + { + "auxiliary_loss_clip": 0.01116357, + "auxiliary_loss_mlp": 0.01032014, + "balance_loss_clip": 1.01863885, + "balance_loss_mlp": 1.03983259, + "epoch": 0.4633097850593717, + "flos": 19240506552960.0, + "grad_norm": 1.6860050062378817, + "language_loss": 0.77783883, + "learning_rate": 2.3331816408920196e-06, + "loss": 0.79932249, + "num_input_tokens_seen": 165392590, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.765625, + "step": 7706, + "time_per_iteration": 2.4212512969970703 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01676846, + "balance_loss_mlp": 1.03858304, + "epoch": 0.46336990831203967, + "flos": 22783166161920.0, + "grad_norm": 1.9896841653009631, + "language_loss": 0.69805431, + "learning_rate": 2.3327976151887654e-06, + "loss": 0.71944684, + "num_input_tokens_seen": 165411195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 7707, + "time_per_iteration": 2.452716112136841 + }, + { + "auxiliary_loss_clip": 0.0111828, + "auxiliary_loss_mlp": 0.01037502, + "balance_loss_clip": 1.02268386, + "balance_loss_mlp": 1.03958869, + "epoch": 0.46343003156470763, + "flos": 38210604543360.0, + "grad_norm": 1.9384057680294333, + "language_loss": 0.61103344, + "learning_rate": 2.332413576865791e-06, + "loss": 0.63259125, + "num_input_tokens_seen": 165430150, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7890625, + "step": 7708, + "time_per_iteration": 2.567363739013672 + }, + { + "auxiliary_loss_clip": 0.01115409, + "auxiliary_loss_mlp": 0.01033039, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 1.0407182, + "epoch": 0.4634901548173756, + "flos": 31938555110400.0, + "grad_norm": 1.9580912850569934, + "language_loss": 0.77165091, + "learning_rate": 2.3320295259376614e-06, + "loss": 0.7931354, + "num_input_tokens_seen": 165450595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 7709, + "time_per_iteration": 2.532893657684326 + }, + { + "auxiliary_loss_clip": 0.01120131, + "auxiliary_loss_mlp": 0.0103614, + "balance_loss_clip": 1.02199614, + "balance_loss_mlp": 1.04260027, + "epoch": 0.46355027807004356, + "flos": 20082540153600.0, + "grad_norm": 1.8889269845152723, + "language_loss": 0.76972783, + "learning_rate": 2.3316454624189385e-06, + "loss": 0.79129058, + "num_input_tokens_seen": 165469515, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7734375, + "step": 7710, + "time_per_iteration": 2.4608266353607178 + }, + { + "auxiliary_loss_clip": 0.01119418, + "auxiliary_loss_mlp": 0.01032956, + "balance_loss_clip": 1.01812005, + "balance_loss_mlp": 1.04201198, + "epoch": 0.4636104013227116, + "flos": 24061370613120.0, + "grad_norm": 8.865430766980356, + "language_loss": 0.73548961, + "learning_rate": 2.3312613863241865e-06, + "loss": 0.75701332, + "num_input_tokens_seen": 165488125, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 7711, + "time_per_iteration": 2.4964261054992676 + }, + { + "auxiliary_loss_clip": 0.01114775, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02818859, + "balance_loss_mlp": 1.04039836, + "epoch": 0.46367052457537955, + "flos": 23914639555200.0, + "grad_norm": 1.6554647385393604, + "language_loss": 0.71667624, + "learning_rate": 2.33087729766797e-06, + "loss": 0.73825449, + "num_input_tokens_seen": 165509225, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.74609375, + "step": 7712, + "time_per_iteration": 2.46760630607605 + }, + { + "auxiliary_loss_clip": 0.01121722, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02325535, + "balance_loss_mlp": 1.04231286, + "epoch": 0.4637306478280475, + "flos": 26396533693440.0, + "grad_norm": 3.3767356374822053, + "language_loss": 0.72924775, + "learning_rate": 2.3304931964648524e-06, + "loss": 0.7508505, + "num_input_tokens_seen": 165529945, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.79296875, + "step": 7713, + "time_per_iteration": 2.501405954360962 + }, + { + "auxiliary_loss_clip": 0.01117475, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.0192256, + "balance_loss_mlp": 1.0397234, + "epoch": 0.4637907710807155, + "flos": 21980706370560.0, + "grad_norm": 1.980318346106041, + "language_loss": 0.58787149, + "learning_rate": 2.3301090827294e-06, + "loss": 0.60938716, + "num_input_tokens_seen": 165550690, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.77734375, + "step": 7714, + "time_per_iteration": 2.495403528213501 + }, + { + "auxiliary_loss_clip": 0.01113059, + "auxiliary_loss_mlp": 0.01032058, + "balance_loss_clip": 1.01873016, + "balance_loss_mlp": 1.03932118, + "epoch": 0.46385089433338345, + "flos": 12422291846400.0, + "grad_norm": 2.071541116221401, + "language_loss": 0.70241058, + "learning_rate": 2.3297249564761784e-06, + "loss": 0.72386181, + "num_input_tokens_seen": 165567775, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7715, + "time_per_iteration": 2.4438905715942383 + }, + { + "auxiliary_loss_clip": 0.01120226, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.0211767, + "balance_loss_mlp": 1.04094183, + "epoch": 0.4639110175860514, + "flos": 23915752876800.0, + "grad_norm": 2.6792778299233775, + "language_loss": 0.67974752, + "learning_rate": 2.3293408177197527e-06, + "loss": 0.70129347, + "num_input_tokens_seen": 165587010, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.796875, + "step": 7716, + "time_per_iteration": 2.4544179439544678 + }, + { + "auxiliary_loss_clip": 0.01119502, + "auxiliary_loss_mlp": 0.010334, + "balance_loss_clip": 1.01913667, + "balance_loss_mlp": 1.04161263, + "epoch": 0.4639711408387194, + "flos": 25300396304640.0, + "grad_norm": 1.7705358267642153, + "language_loss": 0.81100738, + "learning_rate": 2.328956666474691e-06, + "loss": 0.8325364, + "num_input_tokens_seen": 165607850, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.78125, + "step": 7717, + "time_per_iteration": 2.491530179977417 + }, + { + "auxiliary_loss_clip": 0.0111535, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.01868117, + "balance_loss_mlp": 1.04001844, + "epoch": 0.46403126409138734, + "flos": 21211822817280.0, + "grad_norm": 1.8289041555667496, + "language_loss": 0.73165905, + "learning_rate": 2.3285725027555593e-06, + "loss": 0.75313652, + "num_input_tokens_seen": 165627175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7718, + "time_per_iteration": 2.4480137825012207 + }, + { + "auxiliary_loss_clip": 0.01114178, + "auxiliary_loss_mlp": 0.01037785, + "balance_loss_clip": 1.02355695, + "balance_loss_mlp": 1.03966463, + "epoch": 0.4640913873440553, + "flos": 35845564325760.0, + "grad_norm": 1.5484606356008148, + "language_loss": 0.70390046, + "learning_rate": 2.3281883265769254e-06, + "loss": 0.72542012, + "num_input_tokens_seen": 165648340, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7719, + "time_per_iteration": 2.565831422805786 + }, + { + "auxiliary_loss_clip": 0.0112125, + "auxiliary_loss_mlp": 0.01039419, + "balance_loss_clip": 1.02523875, + "balance_loss_mlp": 1.0433172, + "epoch": 0.46415151059672327, + "flos": 19166207270400.0, + "grad_norm": 1.6620583446293502, + "language_loss": 0.86685133, + "learning_rate": 2.327804137953357e-06, + "loss": 0.88845801, + "num_input_tokens_seen": 165667195, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.77734375, + "step": 7720, + "time_per_iteration": 5.243311166763306 + }, + { + "auxiliary_loss_clip": 0.01036993, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 0.99992698, + "balance_loss_mlp": 1.01241243, + "epoch": 0.46421163384939124, + "flos": 58912750304640.0, + "grad_norm": 0.7219170830729655, + "language_loss": 0.55086505, + "learning_rate": 2.3274199368994226e-06, + "loss": 0.57124853, + "num_input_tokens_seen": 165726760, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.24609375, + "step": 7721, + "time_per_iteration": 4.553914785385132 + }, + { + "auxiliary_loss_clip": 0.01116315, + "auxiliary_loss_mlp": 0.01037313, + "balance_loss_clip": 1.02322233, + "balance_loss_mlp": 1.041767, + "epoch": 0.4642717571020592, + "flos": 20157342226560.0, + "grad_norm": 2.566766868002949, + "language_loss": 0.79665279, + "learning_rate": 2.3270357234296918e-06, + "loss": 0.81818902, + "num_input_tokens_seen": 165745005, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.74609375, + "step": 7722, + "time_per_iteration": 2.445401430130005 + }, + { + "auxiliary_loss_clip": 0.01118781, + "auxiliary_loss_mlp": 0.01032902, + "balance_loss_clip": 1.01957417, + "balance_loss_mlp": 1.04163325, + "epoch": 0.46433188035472717, + "flos": 25046184775680.0, + "grad_norm": 1.5891837623192666, + "language_loss": 0.77772748, + "learning_rate": 2.3266514975587332e-06, + "loss": 0.79924428, + "num_input_tokens_seen": 165765750, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7734375, + "step": 7723, + "time_per_iteration": 2.4992403984069824 + }, + { + "auxiliary_loss_clip": 0.01112839, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01748788, + "balance_loss_mlp": 1.03973961, + "epoch": 0.4643920036073952, + "flos": 28075644817920.0, + "grad_norm": 1.5026814907271808, + "language_loss": 0.68433344, + "learning_rate": 2.326267259301118e-06, + "loss": 0.70576787, + "num_input_tokens_seen": 165787515, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 7724, + "time_per_iteration": 2.496286630630493 + }, + { + "auxiliary_loss_clip": 0.01112054, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.02032912, + "balance_loss_mlp": 1.03761983, + "epoch": 0.46445212686006315, + "flos": 18369350000640.0, + "grad_norm": 2.246547977212262, + "language_loss": 0.67335129, + "learning_rate": 2.325883008671415e-06, + "loss": 0.6948117, + "num_input_tokens_seen": 165806675, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7725, + "time_per_iteration": 2.471104621887207 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01037158, + "balance_loss_clip": 1.02523649, + "balance_loss_mlp": 1.03763461, + "epoch": 0.4645122501127311, + "flos": 31721618920320.0, + "grad_norm": 1.6153664866621378, + "language_loss": 0.64700842, + "learning_rate": 2.3254987456841955e-06, + "loss": 0.66846681, + "num_input_tokens_seen": 165829835, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 7726, + "time_per_iteration": 2.5408668518066406 + }, + { + "auxiliary_loss_clip": 0.01117387, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.01916456, + "balance_loss_mlp": 1.04313767, + "epoch": 0.4645723733653991, + "flos": 23768806337280.0, + "grad_norm": 1.8244750339479887, + "language_loss": 0.74908936, + "learning_rate": 2.3251144703540307e-06, + "loss": 0.77058876, + "num_input_tokens_seen": 165849380, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7727, + "time_per_iteration": 2.4853005409240723 + }, + { + "auxiliary_loss_clip": 0.01114218, + "auxiliary_loss_mlp": 0.01036565, + "balance_loss_clip": 1.02248645, + "balance_loss_mlp": 1.03968906, + "epoch": 0.46463249661806705, + "flos": 33145512935040.0, + "grad_norm": 2.0019169498028657, + "language_loss": 0.78683269, + "learning_rate": 2.3247301826954936e-06, + "loss": 0.80834055, + "num_input_tokens_seen": 165868620, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7728, + "time_per_iteration": 2.5397188663482666 + }, + { + "auxiliary_loss_clip": 0.0111559, + "auxiliary_loss_mlp": 0.01036908, + "balance_loss_clip": 1.02303171, + "balance_loss_mlp": 1.0405283, + "epoch": 0.464692619870735, + "flos": 18296020385280.0, + "grad_norm": 2.3286376832796343, + "language_loss": 0.76053888, + "learning_rate": 2.324345882723155e-06, + "loss": 0.78206384, + "num_input_tokens_seen": 165885915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7729, + "time_per_iteration": 2.4818129539489746 + }, + { + "auxiliary_loss_clip": 0.011162, + "auxiliary_loss_mlp": 0.0103847, + "balance_loss_clip": 1.02543473, + "balance_loss_mlp": 1.04205704, + "epoch": 0.464752743123403, + "flos": 22638051216000.0, + "grad_norm": 1.578112141950269, + "language_loss": 0.79568058, + "learning_rate": 2.323961570451588e-06, + "loss": 0.81722724, + "num_input_tokens_seen": 165905465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 7730, + "time_per_iteration": 2.5124597549438477 + }, + { + "auxiliary_loss_clip": 0.01113512, + "auxiliary_loss_mlp": 0.01037643, + "balance_loss_clip": 1.0245595, + "balance_loss_mlp": 1.03948402, + "epoch": 0.46481286637607094, + "flos": 20412128373120.0, + "grad_norm": 1.5075999703309564, + "language_loss": 0.76621842, + "learning_rate": 2.3235772458953655e-06, + "loss": 0.78772998, + "num_input_tokens_seen": 165924640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7731, + "time_per_iteration": 2.4976460933685303 + }, + { + "auxiliary_loss_clip": 0.01112526, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.01798737, + "balance_loss_mlp": 1.0393635, + "epoch": 0.4648729896287389, + "flos": 34275406129920.0, + "grad_norm": 1.7163179847514425, + "language_loss": 0.65824252, + "learning_rate": 2.323192909069061e-06, + "loss": 0.67968166, + "num_input_tokens_seen": 165945765, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 7732, + "time_per_iteration": 2.5720393657684326 + }, + { + "auxiliary_loss_clip": 0.01116963, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02186668, + "balance_loss_mlp": 1.03906608, + "epoch": 0.4649331128814069, + "flos": 21321781326720.0, + "grad_norm": 2.6101927282287454, + "language_loss": 0.72711408, + "learning_rate": 2.32280855998725e-06, + "loss": 0.74864757, + "num_input_tokens_seen": 165964025, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.78125, + "step": 7733, + "time_per_iteration": 2.4926271438598633 + }, + { + "auxiliary_loss_clip": 0.01036248, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.00131154, + "balance_loss_mlp": 1.01211238, + "epoch": 0.46499323613407484, + "flos": 58308515717760.0, + "grad_norm": 1.2459739814545432, + "language_loss": 0.51962316, + "learning_rate": 2.3224241986645057e-06, + "loss": 0.54001307, + "num_input_tokens_seen": 166021950, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2421875, + "step": 7734, + "time_per_iteration": 3.0107176303863525 + }, + { + "auxiliary_loss_clip": 0.01113986, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.02194381, + "balance_loss_mlp": 1.04043412, + "epoch": 0.4650533593867428, + "flos": 10889660384640.0, + "grad_norm": 2.036607770310226, + "language_loss": 0.75633866, + "learning_rate": 2.3220398251154035e-06, + "loss": 0.77783275, + "num_input_tokens_seen": 166039675, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7735, + "time_per_iteration": 2.487781286239624 + }, + { + "auxiliary_loss_clip": 0.01111506, + "auxiliary_loss_mlp": 0.01040266, + "balance_loss_clip": 1.02682567, + "balance_loss_mlp": 1.03985715, + "epoch": 0.46511348263941077, + "flos": 19974592805760.0, + "grad_norm": 2.402877095125316, + "language_loss": 0.70207214, + "learning_rate": 2.321655439354519e-06, + "loss": 0.7235899, + "num_input_tokens_seen": 166057745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7736, + "time_per_iteration": 2.4449374675750732 + }, + { + "auxiliary_loss_clip": 0.0111302, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.0199604, + "balance_loss_mlp": 1.04052627, + "epoch": 0.46517360589207873, + "flos": 19678401256320.0, + "grad_norm": 1.6375102922586726, + "language_loss": 0.72185129, + "learning_rate": 2.321271041396427e-06, + "loss": 0.74330497, + "num_input_tokens_seen": 166076440, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7737, + "time_per_iteration": 2.494582176208496 + }, + { + "auxiliary_loss_clip": 0.01118991, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02450085, + "balance_loss_mlp": 1.04341006, + "epoch": 0.46523372914474675, + "flos": 16872665074560.0, + "grad_norm": 2.6166748549663605, + "language_loss": 0.83362406, + "learning_rate": 2.3208866312557065e-06, + "loss": 0.85520089, + "num_input_tokens_seen": 166092520, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7578125, + "step": 7738, + "time_per_iteration": 2.427828550338745 + }, + { + "auxiliary_loss_clip": 0.01037214, + "auxiliary_loss_mlp": 0.01002748, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.0132978, + "epoch": 0.4652938523974147, + "flos": 53439138339840.0, + "grad_norm": 0.7680630195464891, + "language_loss": 0.57788324, + "learning_rate": 2.320502208946932e-06, + "loss": 0.59828281, + "num_input_tokens_seen": 166156285, + "router_z_loss_clip": 0.01403809, + "router_z_loss_mlp": 0.24023438, + "step": 7739, + "time_per_iteration": 3.133042335510254 + }, + { + "auxiliary_loss_clip": 0.01113786, + "auxiliary_loss_mlp": 0.01038133, + "balance_loss_clip": 1.02543104, + "balance_loss_mlp": 1.03974605, + "epoch": 0.4653539756500827, + "flos": 15231296165760.0, + "grad_norm": 1.823827375035505, + "language_loss": 0.8481009, + "learning_rate": 2.3201177744846815e-06, + "loss": 0.86962008, + "num_input_tokens_seen": 166173455, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7421875, + "step": 7740, + "time_per_iteration": 2.4921228885650635 + }, + { + "auxiliary_loss_clip": 0.0111501, + "auxiliary_loss_mlp": 0.01037681, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.04139423, + "epoch": 0.46541409890275065, + "flos": 23732249270400.0, + "grad_norm": 1.5033977780241194, + "language_loss": 0.76110768, + "learning_rate": 2.3197333278835327e-06, + "loss": 0.7826345, + "num_input_tokens_seen": 166194370, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 7741, + "time_per_iteration": 2.4922451972961426 + }, + { + "auxiliary_loss_clip": 0.01117905, + "auxiliary_loss_mlp": 0.01032255, + "balance_loss_clip": 1.01915359, + "balance_loss_mlp": 1.0404247, + "epoch": 0.4654742221554186, + "flos": 20847329556480.0, + "grad_norm": 1.7276921705055903, + "language_loss": 0.80555934, + "learning_rate": 2.319348869158064e-06, + "loss": 0.82706094, + "num_input_tokens_seen": 166213195, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7734375, + "step": 7742, + "time_per_iteration": 2.4906904697418213 + }, + { + "auxiliary_loss_clip": 0.01116814, + "auxiliary_loss_mlp": 0.01039288, + "balance_loss_clip": 1.02518523, + "balance_loss_mlp": 1.04049921, + "epoch": 0.4655343454080866, + "flos": 20704836303360.0, + "grad_norm": 1.9912151117228205, + "language_loss": 0.72541988, + "learning_rate": 2.3189643983228555e-06, + "loss": 0.74698091, + "num_input_tokens_seen": 166231350, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.765625, + "step": 7743, + "time_per_iteration": 2.4746901988983154 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01745892, + "balance_loss_mlp": 1.0409807, + "epoch": 0.46559446866075455, + "flos": 18989850470400.0, + "grad_norm": 2.076205829431248, + "language_loss": 0.71137214, + "learning_rate": 2.318579915392483e-06, + "loss": 0.73282433, + "num_input_tokens_seen": 166250530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7744, + "time_per_iteration": 2.4928057193756104 + }, + { + "auxiliary_loss_clip": 0.01112536, + "auxiliary_loss_mlp": 0.01033232, + "balance_loss_clip": 1.02108455, + "balance_loss_mlp": 1.04053736, + "epoch": 0.4656545919134225, + "flos": 34496364643200.0, + "grad_norm": 1.5849641227794893, + "language_loss": 0.85084593, + "learning_rate": 2.31819542038153e-06, + "loss": 0.87230361, + "num_input_tokens_seen": 166272545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7745, + "time_per_iteration": 2.574612617492676 + }, + { + "auxiliary_loss_clip": 0.01112672, + "auxiliary_loss_mlp": 0.01039212, + "balance_loss_clip": 1.02561605, + "balance_loss_mlp": 1.04127502, + "epoch": 0.4657147151660905, + "flos": 24310554238080.0, + "grad_norm": 1.35434162506916, + "language_loss": 0.73171556, + "learning_rate": 2.317810913304574e-06, + "loss": 0.75323439, + "num_input_tokens_seen": 166292135, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71484375, + "step": 7746, + "time_per_iteration": 2.5375149250030518 + }, + { + "auxiliary_loss_clip": 0.01112894, + "auxiliary_loss_mlp": 0.01035164, + "balance_loss_clip": 1.02271867, + "balance_loss_mlp": 1.04081106, + "epoch": 0.46577483841875844, + "flos": 58795139220480.0, + "grad_norm": 1.5285629366651527, + "language_loss": 0.6993416, + "learning_rate": 2.3174263941761963e-06, + "loss": 0.72082222, + "num_input_tokens_seen": 166316710, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 7747, + "time_per_iteration": 2.792043685913086 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01031484, + "balance_loss_clip": 1.01872873, + "balance_loss_mlp": 1.03958046, + "epoch": 0.4658349616714264, + "flos": 31321969223040.0, + "grad_norm": 1.4175797777041124, + "language_loss": 0.67509431, + "learning_rate": 2.317041863010978e-06, + "loss": 0.69653738, + "num_input_tokens_seen": 166338535, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 7748, + "time_per_iteration": 2.625060796737671 + }, + { + "auxiliary_loss_clip": 0.01117966, + "auxiliary_loss_mlp": 0.01037997, + "balance_loss_clip": 1.02341771, + "balance_loss_mlp": 1.04018533, + "epoch": 0.46589508492409437, + "flos": 14860338456960.0, + "grad_norm": 2.247229042591788, + "language_loss": 0.63667625, + "learning_rate": 2.3166573198235007e-06, + "loss": 0.65823585, + "num_input_tokens_seen": 166355540, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.77734375, + "step": 7749, + "time_per_iteration": 2.4132370948791504 + }, + { + "auxiliary_loss_clip": 0.01117494, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01702619, + "balance_loss_mlp": 1.04231274, + "epoch": 0.46595520817676234, + "flos": 12895989431040.0, + "grad_norm": 2.928439488128299, + "language_loss": 0.74594498, + "learning_rate": 2.3162727646283456e-06, + "loss": 0.76742983, + "num_input_tokens_seen": 166372635, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7750, + "time_per_iteration": 2.494771718978882 + }, + { + "auxiliary_loss_clip": 0.01115846, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01701522, + "balance_loss_mlp": 1.0404911, + "epoch": 0.46601533142943036, + "flos": 32854169721600.0, + "grad_norm": 2.044073047720548, + "language_loss": 0.7496438, + "learning_rate": 2.3158881974400963e-06, + "loss": 0.77110994, + "num_input_tokens_seen": 166393175, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7751, + "time_per_iteration": 2.5510993003845215 + }, + { + "auxiliary_loss_clip": 0.01118875, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.01816297, + "balance_loss_mlp": 1.04188776, + "epoch": 0.4660754546820983, + "flos": 19967517826560.0, + "grad_norm": 1.8775850665267624, + "language_loss": 0.73678327, + "learning_rate": 2.3155036182733345e-06, + "loss": 0.7582916, + "num_input_tokens_seen": 166408630, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.76953125, + "step": 7752, + "time_per_iteration": 2.5834901332855225 + }, + { + "auxiliary_loss_clip": 0.01118438, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02401483, + "balance_loss_mlp": 1.041453, + "epoch": 0.4661355779347663, + "flos": 26688164215680.0, + "grad_norm": 2.485236836866318, + "language_loss": 0.69320381, + "learning_rate": 2.315119027142644e-06, + "loss": 0.71476793, + "num_input_tokens_seen": 166428170, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.76953125, + "step": 7753, + "time_per_iteration": 2.522881507873535 + }, + { + "auxiliary_loss_clip": 0.01111836, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.01862359, + "balance_loss_mlp": 1.04056942, + "epoch": 0.46619570118743425, + "flos": 20959442881920.0, + "grad_norm": 1.8174540980864333, + "language_loss": 0.72607052, + "learning_rate": 2.3147344240626076e-06, + "loss": 0.74750698, + "num_input_tokens_seen": 166446705, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7754, + "time_per_iteration": 2.5403332710266113 + }, + { + "auxiliary_loss_clip": 0.01115444, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01410365, + "balance_loss_mlp": 1.04032147, + "epoch": 0.4662558244401022, + "flos": 24426079355520.0, + "grad_norm": 1.501284890447191, + "language_loss": 0.78961611, + "learning_rate": 2.3143498090478114e-06, + "loss": 0.81104231, + "num_input_tokens_seen": 166466750, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.75, + "step": 7755, + "time_per_iteration": 2.4917664527893066 + }, + { + "auxiliary_loss_clip": 0.01110479, + "auxiliary_loss_mlp": 0.01029481, + "balance_loss_clip": 1.01675534, + "balance_loss_mlp": 1.03968203, + "epoch": 0.4663159476927702, + "flos": 20595452411520.0, + "grad_norm": 1.6390600579035761, + "language_loss": 0.72281897, + "learning_rate": 2.3139651821128382e-06, + "loss": 0.74421859, + "num_input_tokens_seen": 166485400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7756, + "time_per_iteration": 2.549678325653076 + }, + { + "auxiliary_loss_clip": 0.01111703, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01770794, + "balance_loss_mlp": 1.03845477, + "epoch": 0.46637607094543815, + "flos": 25661872823040.0, + "grad_norm": 1.8004000990726714, + "language_loss": 0.78193069, + "learning_rate": 2.313580543272274e-06, + "loss": 0.80335552, + "num_input_tokens_seen": 166505730, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7757, + "time_per_iteration": 2.483161687850952 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01855707, + "balance_loss_mlp": 1.04131472, + "epoch": 0.4664361941981061, + "flos": 24273853516800.0, + "grad_norm": 2.024129481036371, + "language_loss": 0.66473371, + "learning_rate": 2.313195892540705e-06, + "loss": 0.68618673, + "num_input_tokens_seen": 166523770, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.734375, + "step": 7758, + "time_per_iteration": 2.5083394050598145 + }, + { + "auxiliary_loss_clip": 0.01113834, + "auxiliary_loss_mlp": 0.01037253, + "balance_loss_clip": 1.0243423, + "balance_loss_mlp": 1.04062152, + "epoch": 0.4664963174507741, + "flos": 18405871153920.0, + "grad_norm": 1.603488256474455, + "language_loss": 0.74207008, + "learning_rate": 2.3128112299327147e-06, + "loss": 0.76358092, + "num_input_tokens_seen": 166542935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7759, + "time_per_iteration": 2.424461841583252 + }, + { + "auxiliary_loss_clip": 0.01113311, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.04054224, + "epoch": 0.46655644070344204, + "flos": 22455122227200.0, + "grad_norm": 1.4805046968385447, + "language_loss": 0.77701056, + "learning_rate": 2.312426555462893e-06, + "loss": 0.79848123, + "num_input_tokens_seen": 166563935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7760, + "time_per_iteration": 2.5147666931152344 + }, + { + "auxiliary_loss_clip": 0.01109461, + "auxiliary_loss_mlp": 0.01028634, + "balance_loss_clip": 1.01549125, + "balance_loss_mlp": 1.03895068, + "epoch": 0.46661656395611, + "flos": 13808407731840.0, + "grad_norm": 1.6623756387577715, + "language_loss": 0.74081796, + "learning_rate": 2.3120418691458237e-06, + "loss": 0.76219893, + "num_input_tokens_seen": 166582175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7761, + "time_per_iteration": 3.816096305847168 + }, + { + "auxiliary_loss_clip": 0.01117051, + "auxiliary_loss_mlp": 0.01031991, + "balance_loss_clip": 1.01743007, + "balance_loss_mlp": 1.040905, + "epoch": 0.466676687208778, + "flos": 21652159645440.0, + "grad_norm": 1.9521312394592187, + "language_loss": 0.78150368, + "learning_rate": 2.3116571709960956e-06, + "loss": 0.80299413, + "num_input_tokens_seen": 166601870, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.76171875, + "step": 7762, + "time_per_iteration": 5.593664169311523 + }, + { + "auxiliary_loss_clip": 0.01036542, + "auxiliary_loss_mlp": 0.01002344, + "balance_loss_clip": 1.00103235, + "balance_loss_mlp": 1.0128268, + "epoch": 0.46673681046144594, + "flos": 68534259068160.0, + "grad_norm": 0.7996147947039336, + "language_loss": 0.59759605, + "learning_rate": 2.311272461028297e-06, + "loss": 0.61798495, + "num_input_tokens_seen": 166668960, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.23828125, + "step": 7763, + "time_per_iteration": 4.692638874053955 + }, + { + "auxiliary_loss_clip": 0.01115827, + "auxiliary_loss_mlp": 0.01035827, + "balance_loss_clip": 1.02139115, + "balance_loss_mlp": 1.03950739, + "epoch": 0.46679693371411396, + "flos": 15814449469440.0, + "grad_norm": 2.0939196550691075, + "language_loss": 0.78502893, + "learning_rate": 2.3108877392570146e-06, + "loss": 0.80654544, + "num_input_tokens_seen": 166686110, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.765625, + "step": 7764, + "time_per_iteration": 2.437487840652466 + }, + { + "auxiliary_loss_clip": 0.01113145, + "auxiliary_loss_mlp": 0.01035809, + "balance_loss_clip": 1.02385855, + "balance_loss_mlp": 1.04100394, + "epoch": 0.4668570569667819, + "flos": 18514572687360.0, + "grad_norm": 1.8134732296760265, + "language_loss": 0.72272134, + "learning_rate": 2.310503005696839e-06, + "loss": 0.74421084, + "num_input_tokens_seen": 166703930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.72265625, + "step": 7765, + "time_per_iteration": 2.4413938522338867 + }, + { + "auxiliary_loss_clip": 0.01114151, + "auxiliary_loss_mlp": 0.01034738, + "balance_loss_clip": 1.02123809, + "balance_loss_mlp": 1.03898025, + "epoch": 0.4669171802194499, + "flos": 19206643006080.0, + "grad_norm": 2.045608669049209, + "language_loss": 0.77604026, + "learning_rate": 2.3101182603623576e-06, + "loss": 0.79752916, + "num_input_tokens_seen": 166719940, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7766, + "time_per_iteration": 2.4388277530670166 + }, + { + "auxiliary_loss_clip": 0.01112932, + "auxiliary_loss_mlp": 0.01033882, + "balance_loss_clip": 1.02094162, + "balance_loss_mlp": 1.03921056, + "epoch": 0.46697730347211786, + "flos": 12276135406080.0, + "grad_norm": 1.9270773145684021, + "language_loss": 0.65106744, + "learning_rate": 2.3097335032681607e-06, + "loss": 0.67253554, + "num_input_tokens_seen": 166738285, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73828125, + "step": 7767, + "time_per_iteration": 2.4259531497955322 + }, + { + "auxiliary_loss_clip": 0.01115563, + "auxiliary_loss_mlp": 0.01036202, + "balance_loss_clip": 1.02361989, + "balance_loss_mlp": 1.04137385, + "epoch": 0.4670374267247858, + "flos": 23586739274880.0, + "grad_norm": 1.832674622819915, + "language_loss": 0.74584204, + "learning_rate": 2.3093487344288393e-06, + "loss": 0.76735973, + "num_input_tokens_seen": 166758170, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 7768, + "time_per_iteration": 2.5001304149627686 + }, + { + "auxiliary_loss_clip": 0.01114611, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01907098, + "balance_loss_mlp": 1.04069757, + "epoch": 0.4670975499774538, + "flos": 15991093578240.0, + "grad_norm": 1.7275432453698176, + "language_loss": 0.70713127, + "learning_rate": 2.308963953858982e-06, + "loss": 0.72859579, + "num_input_tokens_seen": 166775750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 7769, + "time_per_iteration": 2.466909408569336 + }, + { + "auxiliary_loss_clip": 0.01113851, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02159858, + "balance_loss_mlp": 1.03928077, + "epoch": 0.46715767323012175, + "flos": 15377596260480.0, + "grad_norm": 1.9729575937492385, + "language_loss": 0.8121224, + "learning_rate": 2.3085791615731803e-06, + "loss": 0.83360064, + "num_input_tokens_seen": 166791720, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.74609375, + "step": 7770, + "time_per_iteration": 2.458648204803467 + }, + { + "auxiliary_loss_clip": 0.01036054, + "auxiliary_loss_mlp": 0.01001838, + "balance_loss_clip": 1.00070572, + "balance_loss_mlp": 1.01253605, + "epoch": 0.4672177964827897, + "flos": 60252217401600.0, + "grad_norm": 0.7993613034211892, + "language_loss": 0.5567323, + "learning_rate": 2.3081943575860265e-06, + "loss": 0.57711124, + "num_input_tokens_seen": 166856360, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7771, + "time_per_iteration": 3.0888803005218506 + }, + { + "auxiliary_loss_clip": 0.01111082, + "auxiliary_loss_mlp": 0.01036097, + "balance_loss_clip": 1.02332425, + "balance_loss_mlp": 1.03920853, + "epoch": 0.4672779197354577, + "flos": 27636134002560.0, + "grad_norm": 2.068311261086289, + "language_loss": 0.65702665, + "learning_rate": 2.3078095419121117e-06, + "loss": 0.67849845, + "num_input_tokens_seen": 166875925, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7772, + "time_per_iteration": 2.5242044925689697 + }, + { + "auxiliary_loss_clip": 0.01112309, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.0201087, + "balance_loss_mlp": 1.04012156, + "epoch": 0.46733804298812565, + "flos": 31394257344000.0, + "grad_norm": 1.8148576314480773, + "language_loss": 0.63699466, + "learning_rate": 2.3074247145660283e-06, + "loss": 0.65844226, + "num_input_tokens_seen": 166896520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 7773, + "time_per_iteration": 2.5828921794891357 + }, + { + "auxiliary_loss_clip": 0.01114763, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02112508, + "balance_loss_mlp": 1.04050922, + "epoch": 0.4673981662407936, + "flos": 19500607912320.0, + "grad_norm": 1.942265734861076, + "language_loss": 0.79793948, + "learning_rate": 2.3070398755623685e-06, + "loss": 0.81943017, + "num_input_tokens_seen": 166915370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7774, + "time_per_iteration": 2.448124647140503 + }, + { + "auxiliary_loss_clip": 0.01116733, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.01732183, + "balance_loss_mlp": 1.04113531, + "epoch": 0.4674582894934616, + "flos": 20521835487360.0, + "grad_norm": 1.627446474145158, + "language_loss": 0.77884328, + "learning_rate": 2.306655024915726e-06, + "loss": 0.80031127, + "num_input_tokens_seen": 166934875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7578125, + "step": 7775, + "time_per_iteration": 2.527324676513672 + }, + { + "auxiliary_loss_clip": 0.01111153, + "auxiliary_loss_mlp": 0.01029234, + "balance_loss_clip": 1.01650286, + "balance_loss_mlp": 1.03931999, + "epoch": 0.46751841274612954, + "flos": 22090952188800.0, + "grad_norm": 1.8679682194131426, + "language_loss": 0.69634461, + "learning_rate": 2.306270162640694e-06, + "loss": 0.71774852, + "num_input_tokens_seen": 166954285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7776, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01113537, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02123928, + "balance_loss_mlp": 1.04122162, + "epoch": 0.46757853599879756, + "flos": 26980082046720.0, + "grad_norm": 1.3721760360464321, + "language_loss": 0.73558104, + "learning_rate": 2.3058852887518678e-06, + "loss": 0.75704277, + "num_input_tokens_seen": 166975975, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.72265625, + "step": 7777, + "time_per_iteration": 2.520732879638672 + }, + { + "auxiliary_loss_clip": 0.01113463, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.0170207, + "balance_loss_mlp": 1.04067683, + "epoch": 0.4676386592514655, + "flos": 24134053783680.0, + "grad_norm": 2.1302386072463717, + "language_loss": 0.69626892, + "learning_rate": 2.3055004032638394e-06, + "loss": 0.71770251, + "num_input_tokens_seen": 166996140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 7778, + "time_per_iteration": 2.514420509338379 + }, + { + "auxiliary_loss_clip": 0.01114478, + "auxiliary_loss_mlp": 0.01039246, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.04059839, + "epoch": 0.4676987825041335, + "flos": 25483720343040.0, + "grad_norm": 1.560538067350171, + "language_loss": 0.73252767, + "learning_rate": 2.305115506191206e-06, + "loss": 0.75406492, + "num_input_tokens_seen": 167016105, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 7779, + "time_per_iteration": 2.5243053436279297 + }, + { + "auxiliary_loss_clip": 0.01112098, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02767682, + "balance_loss_mlp": 1.04009414, + "epoch": 0.46775890575680146, + "flos": 21945298538880.0, + "grad_norm": 1.5361358548392845, + "language_loss": 0.72206026, + "learning_rate": 2.304730597548562e-06, + "loss": 0.74357915, + "num_input_tokens_seen": 167036185, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7780, + "time_per_iteration": 2.462562322616577 + }, + { + "auxiliary_loss_clip": 0.01116485, + "auxiliary_loss_mlp": 0.01036348, + "balance_loss_clip": 1.02259159, + "balance_loss_mlp": 1.03972697, + "epoch": 0.4678190290094694, + "flos": 25228395492480.0, + "grad_norm": 2.377229275085917, + "language_loss": 0.73864317, + "learning_rate": 2.3043456773505023e-06, + "loss": 0.76017153, + "num_input_tokens_seen": 167054515, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.765625, + "step": 7781, + "time_per_iteration": 2.502406358718872 + }, + { + "auxiliary_loss_clip": 0.01117462, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.04165602, + "epoch": 0.4678791522621374, + "flos": 32268358811520.0, + "grad_norm": 1.718665338253189, + "language_loss": 0.62727809, + "learning_rate": 2.3039607456116252e-06, + "loss": 0.64880699, + "num_input_tokens_seen": 167077245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 7782, + "time_per_iteration": 2.5425686836242676 + }, + { + "auxiliary_loss_clip": 0.01117055, + "auxiliary_loss_mlp": 0.01039299, + "balance_loss_clip": 1.02660906, + "balance_loss_mlp": 1.0408988, + "epoch": 0.46793927551480535, + "flos": 27046480337280.0, + "grad_norm": 1.7203724678454408, + "language_loss": 0.62933487, + "learning_rate": 2.3035758023465254e-06, + "loss": 0.65089834, + "num_input_tokens_seen": 167097235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 7783, + "time_per_iteration": 2.5380141735076904 + }, + { + "auxiliary_loss_clip": 0.01122421, + "auxiliary_loss_mlp": 0.01036672, + "balance_loss_clip": 1.02271223, + "balance_loss_mlp": 1.04462993, + "epoch": 0.4679993987674733, + "flos": 17457398576640.0, + "grad_norm": 2.164400906730855, + "language_loss": 0.67745304, + "learning_rate": 2.303190847569801e-06, + "loss": 0.69904399, + "num_input_tokens_seen": 167113155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.77734375, + "step": 7784, + "time_per_iteration": 2.4520463943481445 + }, + { + "auxiliary_loss_clip": 0.01110794, + "auxiliary_loss_mlp": 0.01031461, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03855705, + "epoch": 0.4680595220201413, + "flos": 17165121609600.0, + "grad_norm": 1.8603472350259396, + "language_loss": 0.84720063, + "learning_rate": 2.3028058812960497e-06, + "loss": 0.8686232, + "num_input_tokens_seen": 167131765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.72265625, + "step": 7785, + "time_per_iteration": 2.459446907043457 + }, + { + "auxiliary_loss_clip": 0.01115116, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01639259, + "balance_loss_mlp": 1.04066038, + "epoch": 0.46811964527280925, + "flos": 11327591001600.0, + "grad_norm": 2.0359259581468154, + "language_loss": 0.77018952, + "learning_rate": 2.3024209035398678e-06, + "loss": 0.79163527, + "num_input_tokens_seen": 167149030, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 7786, + "time_per_iteration": 2.415062427520752 + }, + { + "auxiliary_loss_clip": 0.01110671, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01558685, + "balance_loss_mlp": 1.0400672, + "epoch": 0.4681797685254772, + "flos": 24278809593600.0, + "grad_norm": 2.023612965965443, + "language_loss": 0.73795342, + "learning_rate": 2.302035914315856e-06, + "loss": 0.75933665, + "num_input_tokens_seen": 167167375, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7787, + "time_per_iteration": 2.5224268436431885 + }, + { + "auxiliary_loss_clip": 0.01113364, + "auxiliary_loss_mlp": 0.01039171, + "balance_loss_clip": 1.02599859, + "balance_loss_mlp": 1.04109263, + "epoch": 0.4682398917781452, + "flos": 31650372293760.0, + "grad_norm": 1.7002718084162438, + "language_loss": 0.65639925, + "learning_rate": 2.3016509136386116e-06, + "loss": 0.67792457, + "num_input_tokens_seen": 167188065, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7788, + "time_per_iteration": 2.534850835800171 + }, + { + "auxiliary_loss_clip": 0.01110419, + "auxiliary_loss_mlp": 0.01030066, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.03911507, + "epoch": 0.46830001503081314, + "flos": 28110765340800.0, + "grad_norm": 1.9511727744147118, + "language_loss": 0.63813901, + "learning_rate": 2.3012659015227343e-06, + "loss": 0.65954381, + "num_input_tokens_seen": 167209675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.71484375, + "step": 7789, + "time_per_iteration": 2.5479812622070312 + }, + { + "auxiliary_loss_clip": 0.01036451, + "auxiliary_loss_mlp": 0.01005013, + "balance_loss_clip": 1.00388098, + "balance_loss_mlp": 1.01292431, + "epoch": 0.4683601382834811, + "flos": 57881718316800.0, + "grad_norm": 0.7071467356489777, + "language_loss": 0.61922455, + "learning_rate": 2.300880877982825e-06, + "loss": 0.6396392, + "num_input_tokens_seen": 167273940, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23632812, + "step": 7790, + "time_per_iteration": 3.1510462760925293 + }, + { + "auxiliary_loss_clip": 0.01112801, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01836109, + "balance_loss_mlp": 1.04223442, + "epoch": 0.46842026153614913, + "flos": 21871933009920.0, + "grad_norm": 1.5995715197713376, + "language_loss": 0.79338831, + "learning_rate": 2.3004958430334808e-06, + "loss": 0.81482148, + "num_input_tokens_seen": 167292730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7791, + "time_per_iteration": 2.5008740425109863 + }, + { + "auxiliary_loss_clip": 0.01114115, + "auxiliary_loss_mlp": 0.01035459, + "balance_loss_clip": 1.02297759, + "balance_loss_mlp": 1.04113936, + "epoch": 0.4684803847888171, + "flos": 24900818434560.0, + "grad_norm": 1.651557239680421, + "language_loss": 0.7484895, + "learning_rate": 2.3001107966893052e-06, + "loss": 0.76998532, + "num_input_tokens_seen": 167313460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.73046875, + "step": 7792, + "time_per_iteration": 2.4964823722839355 + }, + { + "auxiliary_loss_clip": 0.01108357, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01953983, + "balance_loss_mlp": 1.03747678, + "epoch": 0.46854050804148506, + "flos": 26251670142720.0, + "grad_norm": 1.7412725365893262, + "language_loss": 0.6822598, + "learning_rate": 2.299725738964898e-06, + "loss": 0.70365626, + "num_input_tokens_seen": 167335385, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 7793, + "time_per_iteration": 2.5480096340179443 + }, + { + "auxiliary_loss_clip": 0.01112468, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01638055, + "balance_loss_mlp": 1.04102671, + "epoch": 0.468600631294153, + "flos": 21579799697280.0, + "grad_norm": 1.577590367357015, + "language_loss": 0.73983628, + "learning_rate": 2.2993406698748607e-06, + "loss": 0.76124084, + "num_input_tokens_seen": 167353625, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.71484375, + "step": 7794, + "time_per_iteration": 2.453190803527832 + }, + { + "auxiliary_loss_clip": 0.01114261, + "auxiliary_loss_mlp": 0.01035788, + "balance_loss_clip": 1.0227052, + "balance_loss_mlp": 1.04182243, + "epoch": 0.468660754546821, + "flos": 25885632597120.0, + "grad_norm": 1.5518603627769951, + "language_loss": 0.63617218, + "learning_rate": 2.2989555894337953e-06, + "loss": 0.65767258, + "num_input_tokens_seen": 167374565, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7795, + "time_per_iteration": 2.5087008476257324 + }, + { + "auxiliary_loss_clip": 0.01108593, + "auxiliary_loss_mlp": 0.01023894, + "balance_loss_clip": 1.01140058, + "balance_loss_mlp": 1.03883195, + "epoch": 0.46872087779948896, + "flos": 35475001666560.0, + "grad_norm": 1.6379638897021238, + "language_loss": 0.68002474, + "learning_rate": 2.298570497656304e-06, + "loss": 0.70134962, + "num_input_tokens_seen": 167395010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 7796, + "time_per_iteration": 2.6073970794677734 + }, + { + "auxiliary_loss_clip": 0.01110063, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01876903, + "balance_loss_mlp": 1.03811777, + "epoch": 0.4687810010521569, + "flos": 26396425952640.0, + "grad_norm": 1.6469110962479863, + "language_loss": 0.70039898, + "learning_rate": 2.2981853945569894e-06, + "loss": 0.72181356, + "num_input_tokens_seen": 167415285, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 7797, + "time_per_iteration": 2.5202813148498535 + }, + { + "auxiliary_loss_clip": 0.01114247, + "auxiliary_loss_mlp": 0.01030143, + "balance_loss_clip": 1.01626134, + "balance_loss_mlp": 1.04066193, + "epoch": 0.4688411243048249, + "flos": 19972761212160.0, + "grad_norm": 5.424608495577661, + "language_loss": 0.67517138, + "learning_rate": 2.297800280150454e-06, + "loss": 0.69661522, + "num_input_tokens_seen": 167432405, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7798, + "time_per_iteration": 2.425443649291992 + }, + { + "auxiliary_loss_clip": 0.01033599, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99840373, + "balance_loss_mlp": 1.00991392, + "epoch": 0.46890124755749285, + "flos": 63977015900160.0, + "grad_norm": 0.9386412030406017, + "language_loss": 0.64531696, + "learning_rate": 2.2974151544513033e-06, + "loss": 0.66565025, + "num_input_tokens_seen": 167499365, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23730469, + "step": 7799, + "time_per_iteration": 3.2528939247131348 + }, + { + "auxiliary_loss_clip": 0.01108747, + "auxiliary_loss_mlp": 0.01025125, + "balance_loss_clip": 1.01308465, + "balance_loss_mlp": 1.03731787, + "epoch": 0.4689613708101608, + "flos": 23768985905280.0, + "grad_norm": 1.4163336480228355, + "language_loss": 0.72242683, + "learning_rate": 2.2970300174741395e-06, + "loss": 0.74376553, + "num_input_tokens_seen": 167520390, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71484375, + "step": 7800, + "time_per_iteration": 2.481309175491333 + }, + { + "auxiliary_loss_clip": 0.01109702, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.01937377, + "balance_loss_mlp": 1.0401566, + "epoch": 0.4690214940628288, + "flos": 24788705109120.0, + "grad_norm": 2.26920520557406, + "language_loss": 0.72428536, + "learning_rate": 2.296644869233568e-06, + "loss": 0.74568903, + "num_input_tokens_seen": 167539865, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6953125, + "step": 7801, + "time_per_iteration": 2.491105079650879 + }, + { + "auxiliary_loss_clip": 0.01116821, + "auxiliary_loss_mlp": 0.0103741, + "balance_loss_clip": 1.02352786, + "balance_loss_mlp": 1.04097068, + "epoch": 0.46908161731549675, + "flos": 18077324428800.0, + "grad_norm": 2.06336431229611, + "language_loss": 0.62303418, + "learning_rate": 2.2962597097441936e-06, + "loss": 0.64457649, + "num_input_tokens_seen": 167558190, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 7802, + "time_per_iteration": 2.419229030609131 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01033494, + "balance_loss_clip": 1.02101874, + "balance_loss_mlp": 1.03946614, + "epoch": 0.4691417405681647, + "flos": 25703350053120.0, + "grad_norm": 1.7578029510137774, + "language_loss": 0.73409998, + "learning_rate": 2.2958745390206206e-06, + "loss": 0.75556695, + "num_input_tokens_seen": 167577685, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73828125, + "step": 7803, + "time_per_iteration": 3.984971523284912 + }, + { + "auxiliary_loss_clip": 0.01107549, + "auxiliary_loss_mlp": 0.01035068, + "balance_loss_clip": 1.02289057, + "balance_loss_mlp": 1.0363642, + "epoch": 0.46920186382083273, + "flos": 17457039440640.0, + "grad_norm": 2.1225810300999384, + "language_loss": 0.77638352, + "learning_rate": 2.2954893570774558e-06, + "loss": 0.79780972, + "num_input_tokens_seen": 167596390, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 7804, + "time_per_iteration": 5.432345390319824 + }, + { + "auxiliary_loss_clip": 0.01108405, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01417041, + "balance_loss_mlp": 1.03702545, + "epoch": 0.4692619870735007, + "flos": 20339445202560.0, + "grad_norm": 1.8629622532391696, + "language_loss": 0.77384996, + "learning_rate": 2.295104163929305e-06, + "loss": 0.79520065, + "num_input_tokens_seen": 167614980, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 7805, + "time_per_iteration": 3.873565196990967 + }, + { + "auxiliary_loss_clip": 0.01119773, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02423811, + "balance_loss_mlp": 1.04193878, + "epoch": 0.46932211032616866, + "flos": 29496558003840.0, + "grad_norm": 1.5711850680288217, + "language_loss": 0.82902926, + "learning_rate": 2.2947189595907742e-06, + "loss": 0.85060221, + "num_input_tokens_seen": 167635895, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.78125, + "step": 7806, + "time_per_iteration": 2.554081439971924 + }, + { + "auxiliary_loss_clip": 0.0111231, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02150404, + "balance_loss_mlp": 1.03812897, + "epoch": 0.4693822335788366, + "flos": 36211242735360.0, + "grad_norm": 1.7011762555096541, + "language_loss": 0.77454185, + "learning_rate": 2.294333744076472e-06, + "loss": 0.79601264, + "num_input_tokens_seen": 167657440, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7807, + "time_per_iteration": 2.5786170959472656 + }, + { + "auxiliary_loss_clip": 0.01112504, + "auxiliary_loss_mlp": 0.01033071, + "balance_loss_clip": 1.01985693, + "balance_loss_mlp": 1.03987944, + "epoch": 0.4694423568315046, + "flos": 20338978325760.0, + "grad_norm": 1.9089254292763438, + "language_loss": 0.51788038, + "learning_rate": 2.2939485174010035e-06, + "loss": 0.53933609, + "num_input_tokens_seen": 167675025, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7808, + "time_per_iteration": 2.4730944633483887 + }, + { + "auxiliary_loss_clip": 0.01034297, + "auxiliary_loss_mlp": 0.01010423, + "balance_loss_clip": 1.00899839, + "balance_loss_mlp": 1.01039815, + "epoch": 0.46950248008417256, + "flos": 64326353621760.0, + "grad_norm": 0.782722095319277, + "language_loss": 0.57725239, + "learning_rate": 2.293563279578978e-06, + "loss": 0.59769958, + "num_input_tokens_seen": 167729635, + "router_z_loss_clip": 0.01422119, + "router_z_loss_mlp": 0.23925781, + "step": 7809, + "time_per_iteration": 2.9356954097747803 + }, + { + "auxiliary_loss_clip": 0.01116298, + "auxiliary_loss_mlp": 0.01036482, + "balance_loss_clip": 1.0237031, + "balance_loss_mlp": 1.04176784, + "epoch": 0.4695626033368405, + "flos": 19200106730880.0, + "grad_norm": 2.074581573353579, + "language_loss": 0.72116458, + "learning_rate": 2.2931780306250045e-06, + "loss": 0.74269235, + "num_input_tokens_seen": 167745135, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.74609375, + "step": 7810, + "time_per_iteration": 2.493408679962158 + }, + { + "auxiliary_loss_clip": 0.01114023, + "auxiliary_loss_mlp": 0.0103688, + "balance_loss_clip": 1.02402329, + "balance_loss_mlp": 1.040115, + "epoch": 0.4696227265895085, + "flos": 23002436736000.0, + "grad_norm": 2.1541938985336992, + "language_loss": 0.8075912, + "learning_rate": 2.29279277055369e-06, + "loss": 0.82910025, + "num_input_tokens_seen": 167763875, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7811, + "time_per_iteration": 2.4555575847625732 + }, + { + "auxiliary_loss_clip": 0.01114703, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02146435, + "balance_loss_mlp": 1.04074228, + "epoch": 0.46968284984217645, + "flos": 21870855601920.0, + "grad_norm": 1.576643907851126, + "language_loss": 0.8039701, + "learning_rate": 2.292407499379644e-06, + "loss": 0.82546234, + "num_input_tokens_seen": 167784895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 7812, + "time_per_iteration": 2.4640350341796875 + }, + { + "auxiliary_loss_clip": 0.01109494, + "auxiliary_loss_mlp": 0.01029039, + "balance_loss_clip": 1.0166117, + "balance_loss_mlp": 1.03902435, + "epoch": 0.4697429730948444, + "flos": 19974987855360.0, + "grad_norm": 1.5853543039664872, + "language_loss": 0.73764664, + "learning_rate": 2.292022217117477e-06, + "loss": 0.75903195, + "num_input_tokens_seen": 167803185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7813, + "time_per_iteration": 2.4320507049560547 + }, + { + "auxiliary_loss_clip": 0.01110282, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01483905, + "balance_loss_mlp": 1.03869295, + "epoch": 0.4698030963475124, + "flos": 15156206784000.0, + "grad_norm": 2.2861298905980756, + "language_loss": 0.84540617, + "learning_rate": 2.291636923781798e-06, + "loss": 0.86679196, + "num_input_tokens_seen": 167816550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 7814, + "time_per_iteration": 2.4274749755859375 + }, + { + "auxiliary_loss_clip": 0.01107762, + "auxiliary_loss_mlp": 0.01036717, + "balance_loss_clip": 1.02381229, + "balance_loss_mlp": 1.03796697, + "epoch": 0.46986321960018035, + "flos": 15151178880000.0, + "grad_norm": 1.8672463737050276, + "language_loss": 0.81747186, + "learning_rate": 2.291251619387217e-06, + "loss": 0.83891666, + "num_input_tokens_seen": 167831845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 7815, + "time_per_iteration": 2.4163284301757812 + }, + { + "auxiliary_loss_clip": 0.01113111, + "auxiliary_loss_mlp": 0.01033733, + "balance_loss_clip": 1.02026868, + "balance_loss_mlp": 1.03994465, + "epoch": 0.4699233428528483, + "flos": 23108911626240.0, + "grad_norm": 2.4869249923010917, + "language_loss": 0.77289331, + "learning_rate": 2.2908663039483468e-06, + "loss": 0.79436171, + "num_input_tokens_seen": 167850360, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 7816, + "time_per_iteration": 2.4678542613983154 + }, + { + "auxiliary_loss_clip": 0.01033373, + "auxiliary_loss_mlp": 0.01001411, + "balance_loss_clip": 0.9998135, + "balance_loss_mlp": 1.00933015, + "epoch": 0.46998346610551633, + "flos": 68105558246400.0, + "grad_norm": 0.8340649958424211, + "language_loss": 0.5901494, + "learning_rate": 2.290480977479796e-06, + "loss": 0.61049724, + "num_input_tokens_seen": 167908660, + "router_z_loss_clip": 0.01599121, + "router_z_loss_mlp": 0.24023438, + "step": 7817, + "time_per_iteration": 3.0594780445098877 + }, + { + "auxiliary_loss_clip": 0.01108016, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01726496, + "balance_loss_mlp": 1.03904927, + "epoch": 0.4700435893581843, + "flos": 24129456842880.0, + "grad_norm": 1.7036287613919965, + "language_loss": 0.79255462, + "learning_rate": 2.2900956399961775e-06, + "loss": 0.81393164, + "num_input_tokens_seen": 167927905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7818, + "time_per_iteration": 2.5072269439697266 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01032563, + "balance_loss_clip": 1.02011776, + "balance_loss_mlp": 1.03705192, + "epoch": 0.47010371261085226, + "flos": 20150518642560.0, + "grad_norm": 1.8212678437549825, + "language_loss": 0.83521211, + "learning_rate": 2.289710291512104e-06, + "loss": 0.85663581, + "num_input_tokens_seen": 167945995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 7819, + "time_per_iteration": 2.4294557571411133 + }, + { + "auxiliary_loss_clip": 0.01114675, + "auxiliary_loss_mlp": 0.01035104, + "balance_loss_clip": 1.02144313, + "balance_loss_mlp": 1.0395112, + "epoch": 0.47016383586352023, + "flos": 15122199582720.0, + "grad_norm": 2.0332467146742457, + "language_loss": 0.75860727, + "learning_rate": 2.289324932042186e-06, + "loss": 0.78010511, + "num_input_tokens_seen": 167963380, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.75390625, + "step": 7820, + "time_per_iteration": 2.446664333343506 + }, + { + "auxiliary_loss_clip": 0.0111083, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02034664, + "balance_loss_mlp": 1.04058981, + "epoch": 0.4702239591161882, + "flos": 13552975140480.0, + "grad_norm": 1.889014789758207, + "language_loss": 0.73767376, + "learning_rate": 2.288939561601039e-06, + "loss": 0.75911528, + "num_input_tokens_seen": 167981740, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 7821, + "time_per_iteration": 2.4138526916503906 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01040092, + "balance_loss_clip": 1.02792668, + "balance_loss_mlp": 1.04042852, + "epoch": 0.47028408236885616, + "flos": 24276511123200.0, + "grad_norm": 1.6752111617055698, + "language_loss": 0.88782346, + "learning_rate": 2.2885541802032746e-06, + "loss": 0.9093343, + "num_input_tokens_seen": 167999380, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 7822, + "time_per_iteration": 2.5215280055999756 + }, + { + "auxiliary_loss_clip": 0.01110261, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01693165, + "balance_loss_mlp": 1.03927922, + "epoch": 0.4703442056215241, + "flos": 22856926740480.0, + "grad_norm": 1.5082152139738452, + "language_loss": 0.79467583, + "learning_rate": 2.2881687878635055e-06, + "loss": 0.8160727, + "num_input_tokens_seen": 168018395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7823, + "time_per_iteration": 2.4513280391693115 + }, + { + "auxiliary_loss_clip": 0.01034267, + "auxiliary_loss_mlp": 0.01003747, + "balance_loss_clip": 1.00228715, + "balance_loss_mlp": 1.01028728, + "epoch": 0.4704043288741921, + "flos": 69240227950080.0, + "grad_norm": 0.6886986665104876, + "language_loss": 0.56664526, + "learning_rate": 2.2877833845963487e-06, + "loss": 0.5870254, + "num_input_tokens_seen": 168084080, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.24023438, + "step": 7824, + "time_per_iteration": 3.1640188694000244 + }, + { + "auxiliary_loss_clip": 0.01113151, + "auxiliary_loss_mlp": 0.01035787, + "balance_loss_clip": 1.02209568, + "balance_loss_mlp": 1.03935504, + "epoch": 0.47046445212686006, + "flos": 18041090584320.0, + "grad_norm": 1.7687808389256934, + "language_loss": 0.81284839, + "learning_rate": 2.2873979704164157e-06, + "loss": 0.83433783, + "num_input_tokens_seen": 168101555, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73828125, + "step": 7825, + "time_per_iteration": 2.4225590229034424 + }, + { + "auxiliary_loss_clip": 0.01114172, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01788807, + "balance_loss_mlp": 1.04160166, + "epoch": 0.470524575379528, + "flos": 23951448017280.0, + "grad_norm": 1.5897626143629002, + "language_loss": 0.66397595, + "learning_rate": 2.287012545338324e-06, + "loss": 0.68542683, + "num_input_tokens_seen": 168121530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7826, + "time_per_iteration": 2.512421131134033 + }, + { + "auxiliary_loss_clip": 0.0111203, + "auxiliary_loss_mlp": 0.01036996, + "balance_loss_clip": 1.02366889, + "balance_loss_mlp": 1.03788161, + "epoch": 0.470584698632196, + "flos": 18113558273280.0, + "grad_norm": 2.2414984964582354, + "language_loss": 0.83768737, + "learning_rate": 2.2866271093766877e-06, + "loss": 0.85917771, + "num_input_tokens_seen": 168140335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 7827, + "time_per_iteration": 2.449002504348755 + }, + { + "auxiliary_loss_clip": 0.01034449, + "auxiliary_loss_mlp": 0.01000576, + "balance_loss_clip": 0.99914598, + "balance_loss_mlp": 1.01066613, + "epoch": 0.47064482188486395, + "flos": 57251916224640.0, + "grad_norm": 0.821565097847141, + "language_loss": 0.55694902, + "learning_rate": 2.286241662546122e-06, + "loss": 0.57729936, + "num_input_tokens_seen": 168200535, + "router_z_loss_clip": 0.01428223, + "router_z_loss_mlp": 0.23828125, + "step": 7828, + "time_per_iteration": 3.0819802284240723 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01605594, + "balance_loss_mlp": 1.03884375, + "epoch": 0.4707049451375319, + "flos": 17895077798400.0, + "grad_norm": 1.9071991460911069, + "language_loss": 0.81054831, + "learning_rate": 2.285856204861245e-06, + "loss": 0.8319242, + "num_input_tokens_seen": 168219610, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 7829, + "time_per_iteration": 2.415055513381958 + }, + { + "auxiliary_loss_clip": 0.01110764, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02058768, + "balance_loss_mlp": 1.04020715, + "epoch": 0.47076506839019994, + "flos": 25232669210880.0, + "grad_norm": 1.3327561380149306, + "language_loss": 0.7576915, + "learning_rate": 2.2854707363366703e-06, + "loss": 0.77912241, + "num_input_tokens_seen": 168242505, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7830, + "time_per_iteration": 2.5643560886383057 + }, + { + "auxiliary_loss_clip": 0.0111195, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.01860535, + "balance_loss_mlp": 1.04144919, + "epoch": 0.4708251916428679, + "flos": 13479681438720.0, + "grad_norm": 1.972485160119179, + "language_loss": 0.78818381, + "learning_rate": 2.2850852569870177e-06, + "loss": 0.80962437, + "num_input_tokens_seen": 168260220, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 7831, + "time_per_iteration": 2.4193694591522217 + }, + { + "auxiliary_loss_clip": 0.01115316, + "auxiliary_loss_mlp": 0.01035851, + "balance_loss_clip": 1.02204037, + "balance_loss_mlp": 1.03843021, + "epoch": 0.47088531489553587, + "flos": 30147833450880.0, + "grad_norm": 1.7552368254682797, + "language_loss": 0.76044565, + "learning_rate": 2.2846997668269033e-06, + "loss": 0.78195733, + "num_input_tokens_seen": 168277360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.76953125, + "step": 7832, + "time_per_iteration": 2.5059313774108887 + }, + { + "auxiliary_loss_clip": 0.01110226, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.0164752, + "balance_loss_mlp": 1.03971505, + "epoch": 0.47094543814820383, + "flos": 21798280172160.0, + "grad_norm": 1.221217846393107, + "language_loss": 0.74499595, + "learning_rate": 2.2843142658709454e-06, + "loss": 0.76638055, + "num_input_tokens_seen": 168296605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 7833, + "time_per_iteration": 2.473198652267456 + }, + { + "auxiliary_loss_clip": 0.01111984, + "auxiliary_loss_mlp": 0.01035389, + "balance_loss_clip": 1.0222286, + "balance_loss_mlp": 1.04079628, + "epoch": 0.4710055614008718, + "flos": 23003011353600.0, + "grad_norm": 1.540147977988576, + "language_loss": 0.7563647, + "learning_rate": 2.283928754133762e-06, + "loss": 0.77783847, + "num_input_tokens_seen": 168316205, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 7834, + "time_per_iteration": 2.4742865562438965 + }, + { + "auxiliary_loss_clip": 0.01110721, + "auxiliary_loss_mlp": 0.01038538, + "balance_loss_clip": 1.02601528, + "balance_loss_mlp": 1.04030991, + "epoch": 0.47106568465353976, + "flos": 42741346452480.0, + "grad_norm": 1.3686611384111311, + "language_loss": 0.66174978, + "learning_rate": 2.283543231629972e-06, + "loss": 0.68324244, + "num_input_tokens_seen": 168338935, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7835, + "time_per_iteration": 2.631727933883667 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01005422, + "balance_loss_clip": 1.00418234, + "balance_loss_mlp": 1.01069164, + "epoch": 0.4711258079062077, + "flos": 68554008570240.0, + "grad_norm": 0.8728088219103824, + "language_loss": 0.62162638, + "learning_rate": 2.283157698374194e-06, + "loss": 0.64202893, + "num_input_tokens_seen": 168392800, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.2421875, + "step": 7836, + "time_per_iteration": 3.0448570251464844 + }, + { + "auxiliary_loss_clip": 0.01113991, + "auxiliary_loss_mlp": 0.01035938, + "balance_loss_clip": 1.02254474, + "balance_loss_mlp": 1.03829992, + "epoch": 0.4711859311588757, + "flos": 25446588658560.0, + "grad_norm": 1.5467691894783375, + "language_loss": 0.69550622, + "learning_rate": 2.2827721543810475e-06, + "loss": 0.71700549, + "num_input_tokens_seen": 168412940, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7578125, + "step": 7837, + "time_per_iteration": 2.480307102203369 + }, + { + "auxiliary_loss_clip": 0.01113119, + "auxiliary_loss_mlp": 0.01041426, + "balance_loss_clip": 1.02703786, + "balance_loss_mlp": 1.03986847, + "epoch": 0.47124605441154366, + "flos": 21981891519360.0, + "grad_norm": 1.8364060529940534, + "language_loss": 0.66015977, + "learning_rate": 2.282386599665153e-06, + "loss": 0.68170524, + "num_input_tokens_seen": 168431995, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 7838, + "time_per_iteration": 2.461975336074829 + }, + { + "auxiliary_loss_clip": 0.01112229, + "auxiliary_loss_mlp": 0.01030934, + "balance_loss_clip": 1.01755917, + "balance_loss_mlp": 1.03790629, + "epoch": 0.4713061776642116, + "flos": 25412689198080.0, + "grad_norm": 1.9120341376079564, + "language_loss": 0.77139461, + "learning_rate": 2.2820010342411304e-06, + "loss": 0.79282629, + "num_input_tokens_seen": 168454585, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 7839, + "time_per_iteration": 2.4788944721221924 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.0168395, + "balance_loss_mlp": 1.03794789, + "epoch": 0.4713663009168796, + "flos": 26542259170560.0, + "grad_norm": 1.9130481219619113, + "language_loss": 0.72918046, + "learning_rate": 2.2816154581235993e-06, + "loss": 0.75054491, + "num_input_tokens_seen": 168471265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 7840, + "time_per_iteration": 2.495239019393921 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01623714, + "balance_loss_mlp": 1.03712356, + "epoch": 0.47142642416954755, + "flos": 23623583650560.0, + "grad_norm": 1.5808172060169028, + "language_loss": 0.74886942, + "learning_rate": 2.2812298713271833e-06, + "loss": 0.77024251, + "num_input_tokens_seen": 168491360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7841, + "time_per_iteration": 2.454484224319458 + }, + { + "auxiliary_loss_clip": 0.01109803, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02002275, + "balance_loss_mlp": 1.03838921, + "epoch": 0.4714865474222155, + "flos": 22310150935680.0, + "grad_norm": 1.602853925212418, + "language_loss": 0.70333457, + "learning_rate": 2.280844273866501e-06, + "loss": 0.72475922, + "num_input_tokens_seen": 168511335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 7842, + "time_per_iteration": 2.4781782627105713 + }, + { + "auxiliary_loss_clip": 0.01111668, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01659727, + "balance_loss_mlp": 1.04060411, + "epoch": 0.4715466706748835, + "flos": 17822430541440.0, + "grad_norm": 2.29732654226483, + "language_loss": 0.78893888, + "learning_rate": 2.280458665756177e-06, + "loss": 0.81034797, + "num_input_tokens_seen": 168529920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 7843, + "time_per_iteration": 2.4125685691833496 + }, + { + "auxiliary_loss_clip": 0.01110204, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01920795, + "balance_loss_mlp": 1.03860044, + "epoch": 0.4716067939275515, + "flos": 23659530186240.0, + "grad_norm": 1.6968163407172614, + "language_loss": 0.74375969, + "learning_rate": 2.280073047010832e-06, + "loss": 0.76517189, + "num_input_tokens_seen": 168550595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 7844, + "time_per_iteration": 3.915900230407715 + }, + { + "auxiliary_loss_clip": 0.01110838, + "auxiliary_loss_mlp": 0.0104121, + "balance_loss_clip": 1.0281688, + "balance_loss_mlp": 1.03888059, + "epoch": 0.47166691718021947, + "flos": 17930162407680.0, + "grad_norm": 1.5835392600478553, + "language_loss": 0.78286111, + "learning_rate": 2.279687417645088e-06, + "loss": 0.80438167, + "num_input_tokens_seen": 168569765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 7845, + "time_per_iteration": 3.8502118587493896 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01033342, + "balance_loss_clip": 1.02098632, + "balance_loss_mlp": 1.03725934, + "epoch": 0.47172704043288743, + "flos": 26614583205120.0, + "grad_norm": 1.4155938367608039, + "language_loss": 0.7311433, + "learning_rate": 2.2793017776735703e-06, + "loss": 0.75253546, + "num_input_tokens_seen": 168591525, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 7846, + "time_per_iteration": 5.374008655548096 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.03715074, + "epoch": 0.4717871636855554, + "flos": 27922700707200.0, + "grad_norm": 1.2885600176299252, + "language_loss": 0.74075842, + "learning_rate": 2.2789161271109e-06, + "loss": 0.76212096, + "num_input_tokens_seen": 168611235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 7847, + "time_per_iteration": 2.5333058834075928 + }, + { + "auxiliary_loss_clip": 0.01110234, + "auxiliary_loss_mlp": 0.01034306, + "balance_loss_clip": 1.02229548, + "balance_loss_mlp": 1.03908157, + "epoch": 0.47184728693822336, + "flos": 14502237816960.0, + "grad_norm": 1.6263943719256755, + "language_loss": 0.80717957, + "learning_rate": 2.278530465971703e-06, + "loss": 0.82862496, + "num_input_tokens_seen": 168628710, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 7848, + "time_per_iteration": 2.408688545227051 + }, + { + "auxiliary_loss_clip": 0.01115584, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01844501, + "balance_loss_mlp": 1.04345632, + "epoch": 0.47190741019089133, + "flos": 17856545483520.0, + "grad_norm": 1.7499376956487047, + "language_loss": 0.70086265, + "learning_rate": 2.2781447942706032e-06, + "loss": 0.72232985, + "num_input_tokens_seen": 168645645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 7849, + "time_per_iteration": 2.453542709350586 + }, + { + "auxiliary_loss_clip": 0.01114658, + "auxiliary_loss_mlp": 0.01035623, + "balance_loss_clip": 1.02144289, + "balance_loss_mlp": 1.03961349, + "epoch": 0.4719675334435593, + "flos": 17895472848000.0, + "grad_norm": 2.1591296324254095, + "language_loss": 0.69831544, + "learning_rate": 2.277759112022224e-06, + "loss": 0.71981823, + "num_input_tokens_seen": 168664165, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 7850, + "time_per_iteration": 2.421095371246338 + }, + { + "auxiliary_loss_clip": 0.01115823, + "auxiliary_loss_mlp": 0.01030409, + "balance_loss_clip": 1.0175221, + "balance_loss_mlp": 1.04188704, + "epoch": 0.47202765669622726, + "flos": 20704369426560.0, + "grad_norm": 1.815710496912415, + "language_loss": 0.75220203, + "learning_rate": 2.2773734192411916e-06, + "loss": 0.7736643, + "num_input_tokens_seen": 168681940, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 7851, + "time_per_iteration": 2.4666483402252197 + }, + { + "auxiliary_loss_clip": 0.01112485, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02262962, + "balance_loss_mlp": 1.03831601, + "epoch": 0.4720877799488952, + "flos": 16360255607040.0, + "grad_norm": 1.7847776856215107, + "language_loss": 0.76165771, + "learning_rate": 2.276987715942132e-06, + "loss": 0.78314561, + "num_input_tokens_seen": 168698830, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7852, + "time_per_iteration": 2.415109395980835 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.01553345, + "balance_loss_mlp": 1.04077876, + "epoch": 0.4721479032015632, + "flos": 20668171495680.0, + "grad_norm": 1.4478461916623044, + "language_loss": 0.68933171, + "learning_rate": 2.2766020021396696e-06, + "loss": 0.71073586, + "num_input_tokens_seen": 168718305, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 7853, + "time_per_iteration": 2.4654150009155273 + }, + { + "auxiliary_loss_clip": 0.01033922, + "auxiliary_loss_mlp": 0.00998653, + "balance_loss_clip": 0.99743122, + "balance_loss_mlp": 1.01008511, + "epoch": 0.47220802645423116, + "flos": 67750438435200.0, + "grad_norm": 0.6983660788322832, + "language_loss": 0.50161922, + "learning_rate": 2.276216277848432e-06, + "loss": 0.52194494, + "num_input_tokens_seen": 168782365, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 7854, + "time_per_iteration": 3.190991163253784 + }, + { + "auxiliary_loss_clip": 0.0111395, + "auxiliary_loss_mlp": 0.01032681, + "balance_loss_clip": 1.0189656, + "balance_loss_mlp": 1.04039025, + "epoch": 0.4722681497068991, + "flos": 20921449271040.0, + "grad_norm": 1.7794050652620443, + "language_loss": 0.63844812, + "learning_rate": 2.2758305430830455e-06, + "loss": 0.65991443, + "num_input_tokens_seen": 168800485, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 7855, + "time_per_iteration": 2.503614664077759 + }, + { + "auxiliary_loss_clip": 0.01111503, + "auxiliary_loss_mlp": 0.01035422, + "balance_loss_clip": 1.02274394, + "balance_loss_mlp": 1.0393486, + "epoch": 0.4723282729595671, + "flos": 28293083798400.0, + "grad_norm": 1.8062233622492851, + "language_loss": 0.75802517, + "learning_rate": 2.2754447978581376e-06, + "loss": 0.7794944, + "num_input_tokens_seen": 168818965, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7856, + "time_per_iteration": 2.499197244644165 + }, + { + "auxiliary_loss_clip": 0.01108332, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02270377, + "balance_loss_mlp": 1.03774405, + "epoch": 0.4723883962122351, + "flos": 27125053338240.0, + "grad_norm": 1.914023874649731, + "language_loss": 0.7484442, + "learning_rate": 2.2750590421883347e-06, + "loss": 0.76987731, + "num_input_tokens_seen": 168840355, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 7857, + "time_per_iteration": 2.5192370414733887 + }, + { + "auxiliary_loss_clip": 0.01109783, + "auxiliary_loss_mlp": 0.01043222, + "balance_loss_clip": 1.03118157, + "balance_loss_mlp": 1.03967714, + "epoch": 0.47244851946490307, + "flos": 31537253387520.0, + "grad_norm": 1.4716352183066603, + "language_loss": 0.6482265, + "learning_rate": 2.2746732760882655e-06, + "loss": 0.66975653, + "num_input_tokens_seen": 168861765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 7858, + "time_per_iteration": 2.5169341564178467 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02124774, + "balance_loss_mlp": 1.03680444, + "epoch": 0.47250864271757104, + "flos": 20886544229760.0, + "grad_norm": 1.569061056560701, + "language_loss": 0.70402861, + "learning_rate": 2.2742874995725575e-06, + "loss": 0.72544539, + "num_input_tokens_seen": 168881310, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 7859, + "time_per_iteration": 2.4850962162017822 + }, + { + "auxiliary_loss_clip": 0.01115812, + "auxiliary_loss_mlp": 0.01037422, + "balance_loss_clip": 1.0245533, + "balance_loss_mlp": 1.03993118, + "epoch": 0.472568765970239, + "flos": 20522086882560.0, + "grad_norm": 1.957216681544069, + "language_loss": 0.62261212, + "learning_rate": 2.2739017126558413e-06, + "loss": 0.64414442, + "num_input_tokens_seen": 168899470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7578125, + "step": 7860, + "time_per_iteration": 2.435559034347534 + }, + { + "auxiliary_loss_clip": 0.01114067, + "auxiliary_loss_mlp": 0.01039582, + "balance_loss_clip": 1.02632594, + "balance_loss_mlp": 1.03998029, + "epoch": 0.47262888922290697, + "flos": 35805200417280.0, + "grad_norm": 2.1159962326169097, + "language_loss": 0.71988773, + "learning_rate": 2.2735159153527445e-06, + "loss": 0.7414242, + "num_input_tokens_seen": 168921495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 7861, + "time_per_iteration": 2.5884346961975098 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.01037156, + "balance_loss_clip": 1.02440643, + "balance_loss_mlp": 1.03970647, + "epoch": 0.47268901247557493, + "flos": 20667740532480.0, + "grad_norm": 1.8695032169355525, + "language_loss": 0.85058391, + "learning_rate": 2.273130107677896e-06, + "loss": 0.87208509, + "num_input_tokens_seen": 168940515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7862, + "time_per_iteration": 2.439347505569458 + }, + { + "auxiliary_loss_clip": 0.01111085, + "auxiliary_loss_mlp": 0.01030878, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.03786755, + "epoch": 0.4727491357282429, + "flos": 19573291082880.0, + "grad_norm": 1.736958967740828, + "language_loss": 0.8456251, + "learning_rate": 2.272744289645927e-06, + "loss": 0.86704469, + "num_input_tokens_seen": 168958340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 7863, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01112215, + "auxiliary_loss_mlp": 0.0103699, + "balance_loss_clip": 1.02422917, + "balance_loss_mlp": 1.04029155, + "epoch": 0.47280925898091086, + "flos": 18217231902720.0, + "grad_norm": 1.8450896018132297, + "language_loss": 0.65939879, + "learning_rate": 2.272358461271467e-06, + "loss": 0.68089092, + "num_input_tokens_seen": 168974850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 7864, + "time_per_iteration": 2.430302381515503 + }, + { + "auxiliary_loss_clip": 0.01111041, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03911948, + "epoch": 0.4728693822335788, + "flos": 17821820010240.0, + "grad_norm": 1.898956112201793, + "language_loss": 0.65435767, + "learning_rate": 2.271972622569147e-06, + "loss": 0.67577726, + "num_input_tokens_seen": 168992860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 7865, + "time_per_iteration": 2.4585866928100586 + }, + { + "auxiliary_loss_clip": 0.01107492, + "auxiliary_loss_mlp": 0.01033897, + "balance_loss_clip": 1.02195215, + "balance_loss_mlp": 1.0378449, + "epoch": 0.4729295054862468, + "flos": 20595057361920.0, + "grad_norm": 2.8918998215840244, + "language_loss": 0.74357843, + "learning_rate": 2.2715867735535976e-06, + "loss": 0.76499236, + "num_input_tokens_seen": 169010325, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 7866, + "time_per_iteration": 2.4264490604400635 + }, + { + "auxiliary_loss_clip": 0.01111501, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01718307, + "balance_loss_mlp": 1.03777552, + "epoch": 0.47298962873891476, + "flos": 23368079232000.0, + "grad_norm": 3.2754467592530476, + "language_loss": 0.8285951, + "learning_rate": 2.271200914239451e-06, + "loss": 0.85000992, + "num_input_tokens_seen": 169029840, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 7867, + "time_per_iteration": 2.4925811290740967 + }, + { + "auxiliary_loss_clip": 0.011073, + "auxiliary_loss_mlp": 0.01029056, + "balance_loss_clip": 1.01655674, + "balance_loss_mlp": 1.03702307, + "epoch": 0.4730497519915827, + "flos": 22052240305920.0, + "grad_norm": 1.5927913973026295, + "language_loss": 0.79137915, + "learning_rate": 2.2708150446413385e-06, + "loss": 0.81274265, + "num_input_tokens_seen": 169049975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 7868, + "time_per_iteration": 2.454094171524048 + }, + { + "auxiliary_loss_clip": 0.01114352, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01608682, + "balance_loss_mlp": 1.03858244, + "epoch": 0.4731098752442507, + "flos": 21069724613760.0, + "grad_norm": 2.558281214251347, + "language_loss": 0.74588537, + "learning_rate": 2.2704291647738915e-06, + "loss": 0.76732659, + "num_input_tokens_seen": 169069540, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 7869, + "time_per_iteration": 2.4809184074401855 + }, + { + "auxiliary_loss_clip": 0.01114593, + "auxiliary_loss_mlp": 0.01042175, + "balance_loss_clip": 1.02767277, + "balance_loss_mlp": 1.04122782, + "epoch": 0.4731699984969187, + "flos": 22528775064960.0, + "grad_norm": 1.571794234452096, + "language_loss": 0.73950672, + "learning_rate": 2.2700432746517443e-06, + "loss": 0.76107442, + "num_input_tokens_seen": 169089940, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 7870, + "time_per_iteration": 2.4553706645965576 + }, + { + "auxiliary_loss_clip": 0.01117025, + "auxiliary_loss_mlp": 0.01038302, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04082036, + "epoch": 0.4732301217495867, + "flos": 24898124914560.0, + "grad_norm": 1.9039581815830153, + "language_loss": 0.81513011, + "learning_rate": 2.2696573742895292e-06, + "loss": 0.83668333, + "num_input_tokens_seen": 169109650, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7871, + "time_per_iteration": 2.5156424045562744 + }, + { + "auxiliary_loss_clip": 0.0111227, + "auxiliary_loss_mlp": 0.01033696, + "balance_loss_clip": 1.02067888, + "balance_loss_mlp": 1.03990555, + "epoch": 0.47329024500225464, + "flos": 22784423137920.0, + "grad_norm": 1.6438263319482285, + "language_loss": 0.75679815, + "learning_rate": 2.269271463701879e-06, + "loss": 0.77825779, + "num_input_tokens_seen": 169128990, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 7872, + "time_per_iteration": 2.453831672668457 + }, + { + "auxiliary_loss_clip": 0.01110565, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02088451, + "balance_loss_mlp": 1.03784847, + "epoch": 0.4733503682549226, + "flos": 38695902220800.0, + "grad_norm": 1.7923349992019921, + "language_loss": 0.67857021, + "learning_rate": 2.268885542903428e-06, + "loss": 0.700019, + "num_input_tokens_seen": 169154645, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7873, + "time_per_iteration": 2.6532957553863525 + }, + { + "auxiliary_loss_clip": 0.01113022, + "auxiliary_loss_mlp": 0.01031944, + "balance_loss_clip": 1.01881886, + "balance_loss_mlp": 1.04162037, + "epoch": 0.47341049150759057, + "flos": 22966849336320.0, + "grad_norm": 1.6289748569468698, + "language_loss": 0.72085869, + "learning_rate": 2.26849961190881e-06, + "loss": 0.74230838, + "num_input_tokens_seen": 169174995, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 7874, + "time_per_iteration": 2.474073648452759 + }, + { + "auxiliary_loss_clip": 0.01113429, + "auxiliary_loss_mlp": 0.01035269, + "balance_loss_clip": 1.02190506, + "balance_loss_mlp": 1.03987253, + "epoch": 0.47347061476025853, + "flos": 14538471661440.0, + "grad_norm": 2.446593699000123, + "language_loss": 0.65108937, + "learning_rate": 2.26811367073266e-06, + "loss": 0.67257631, + "num_input_tokens_seen": 169191815, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 7875, + "time_per_iteration": 2.4433648586273193 + }, + { + "auxiliary_loss_clip": 0.01115895, + "auxiliary_loss_mlp": 0.0103072, + "balance_loss_clip": 1.01718342, + "balance_loss_mlp": 1.04219341, + "epoch": 0.4735307380129265, + "flos": 30263250827520.0, + "grad_norm": 2.56524610984038, + "language_loss": 0.81091076, + "learning_rate": 2.2677277193896125e-06, + "loss": 0.83237696, + "num_input_tokens_seen": 169210430, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 7876, + "time_per_iteration": 2.540485143661499 + }, + { + "auxiliary_loss_clip": 0.01108757, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02232385, + "balance_loss_mlp": 1.0358628, + "epoch": 0.47359086126559446, + "flos": 19391044452480.0, + "grad_norm": 1.7859307736041579, + "language_loss": 0.7925123, + "learning_rate": 2.267341757894304e-06, + "loss": 0.81395495, + "num_input_tokens_seen": 169229295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 7877, + "time_per_iteration": 2.627589225769043 + }, + { + "auxiliary_loss_clip": 0.01110689, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01751554, + "balance_loss_mlp": 1.03852785, + "epoch": 0.47365098451826243, + "flos": 21939408708480.0, + "grad_norm": 1.8692095295200843, + "language_loss": 0.70723194, + "learning_rate": 2.2669557862613685e-06, + "loss": 0.72864318, + "num_input_tokens_seen": 169247855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 7878, + "time_per_iteration": 2.535684108734131 + }, + { + "auxiliary_loss_clip": 0.01108668, + "auxiliary_loss_mlp": 0.01030671, + "balance_loss_clip": 1.01792121, + "balance_loss_mlp": 1.03918552, + "epoch": 0.4737111077709304, + "flos": 25845053207040.0, + "grad_norm": 1.811278524460759, + "language_loss": 0.75030494, + "learning_rate": 2.2665698045054425e-06, + "loss": 0.77169836, + "num_input_tokens_seen": 169268860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7879, + "time_per_iteration": 2.518188953399658 + }, + { + "auxiliary_loss_clip": 0.01034961, + "auxiliary_loss_mlp": 0.01000904, + "balance_loss_clip": 0.99943775, + "balance_loss_mlp": 1.01098931, + "epoch": 0.47377123102359836, + "flos": 67760886314880.0, + "grad_norm": 0.7286317750961989, + "language_loss": 0.6135056, + "learning_rate": 2.266183812641164e-06, + "loss": 0.63386428, + "num_input_tokens_seen": 169331855, + "router_z_loss_clip": 0.01464844, + "router_z_loss_mlp": 0.24023438, + "step": 7880, + "time_per_iteration": 3.0518951416015625 + }, + { + "auxiliary_loss_clip": 0.01110775, + "auxiliary_loss_mlp": 0.01033424, + "balance_loss_clip": 1.01922059, + "balance_loss_mlp": 1.03901792, + "epoch": 0.4738313542762663, + "flos": 24315977191680.0, + "grad_norm": 1.5146846775966347, + "language_loss": 0.6795128, + "learning_rate": 2.2657978106831675e-06, + "loss": 0.70095479, + "num_input_tokens_seen": 169352175, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 7881, + "time_per_iteration": 2.5058367252349854 + }, + { + "auxiliary_loss_clip": 0.01111211, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.01614857, + "balance_loss_mlp": 1.03997886, + "epoch": 0.4738914775289343, + "flos": 20705339093760.0, + "grad_norm": 1.916106799054198, + "language_loss": 0.77455914, + "learning_rate": 2.265411798646092e-06, + "loss": 0.79595923, + "num_input_tokens_seen": 169371215, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7882, + "time_per_iteration": 2.475503921508789 + }, + { + "auxiliary_loss_clip": 0.01113056, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01675582, + "balance_loss_mlp": 1.03993428, + "epoch": 0.4739516007816023, + "flos": 25446337263360.0, + "grad_norm": 1.505527482540033, + "language_loss": 0.7617712, + "learning_rate": 2.2650257765445747e-06, + "loss": 0.78320408, + "num_input_tokens_seen": 169391745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7883, + "time_per_iteration": 2.5051398277282715 + }, + { + "auxiliary_loss_clip": 0.01111273, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01724708, + "balance_loss_mlp": 1.03893495, + "epoch": 0.4740117240342703, + "flos": 19974341410560.0, + "grad_norm": 1.7576670192685107, + "language_loss": 0.71994746, + "learning_rate": 2.2646397443932525e-06, + "loss": 0.74135715, + "num_input_tokens_seen": 169409845, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 7884, + "time_per_iteration": 2.4406635761260986 + }, + { + "auxiliary_loss_clip": 0.01117273, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02024651, + "balance_loss_mlp": 1.04002821, + "epoch": 0.47407184728693824, + "flos": 15661146222720.0, + "grad_norm": 2.026641651540024, + "language_loss": 0.82025737, + "learning_rate": 2.2642537022067655e-06, + "loss": 0.84177154, + "num_input_tokens_seen": 169426085, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7734375, + "step": 7885, + "time_per_iteration": 2.463895797729492 + }, + { + "auxiliary_loss_clip": 0.01115601, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02152371, + "balance_loss_mlp": 1.04353762, + "epoch": 0.4741319705396062, + "flos": 18588800142720.0, + "grad_norm": 1.728500395905687, + "language_loss": 0.73431885, + "learning_rate": 2.263867649999751e-06, + "loss": 0.75582302, + "num_input_tokens_seen": 169444705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 7886, + "time_per_iteration": 3.8351001739501953 + }, + { + "auxiliary_loss_clip": 0.01116571, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03938007, + "epoch": 0.47419209379227417, + "flos": 13261093223040.0, + "grad_norm": 2.1265145819393667, + "language_loss": 0.73465097, + "learning_rate": 2.263481587786849e-06, + "loss": 0.75616348, + "num_input_tokens_seen": 169460850, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76953125, + "step": 7887, + "time_per_iteration": 5.437266111373901 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01562774, + "balance_loss_mlp": 1.03885245, + "epoch": 0.47425221704494214, + "flos": 20044043752320.0, + "grad_norm": 1.895223723891788, + "language_loss": 0.77138984, + "learning_rate": 2.2630955155826993e-06, + "loss": 0.79275852, + "num_input_tokens_seen": 169478890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7888, + "time_per_iteration": 3.8908259868621826 + }, + { + "auxiliary_loss_clip": 0.01113126, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02004313, + "balance_loss_mlp": 1.04045427, + "epoch": 0.4743123402976101, + "flos": 27271892136960.0, + "grad_norm": 1.663584432705133, + "language_loss": 0.72822642, + "learning_rate": 2.2627094334019406e-06, + "loss": 0.74968517, + "num_input_tokens_seen": 169499690, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 7889, + "time_per_iteration": 2.5004560947418213 + }, + { + "auxiliary_loss_clip": 0.01036118, + "auxiliary_loss_mlp": 0.01004378, + "balance_loss_clip": 1.00301266, + "balance_loss_mlp": 1.0120219, + "epoch": 0.47437246355027807, + "flos": 55393970261760.0, + "grad_norm": 1.138520548555467, + "language_loss": 0.5608511, + "learning_rate": 2.262323341259214e-06, + "loss": 0.58125609, + "num_input_tokens_seen": 169560475, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.24121094, + "step": 7890, + "time_per_iteration": 3.116922378540039 + }, + { + "auxiliary_loss_clip": 0.01114938, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02009606, + "balance_loss_mlp": 1.04115105, + "epoch": 0.47443258680294603, + "flos": 23878477537920.0, + "grad_norm": 2.185015538438359, + "language_loss": 0.6552254, + "learning_rate": 2.2619372391691605e-06, + "loss": 0.67672396, + "num_input_tokens_seen": 169580110, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.734375, + "step": 7891, + "time_per_iteration": 2.475003242492676 + }, + { + "auxiliary_loss_clip": 0.011182, + "auxiliary_loss_mlp": 0.01035949, + "balance_loss_clip": 1.02170992, + "balance_loss_mlp": 1.04182184, + "epoch": 0.474492710055614, + "flos": 21977761455360.0, + "grad_norm": 2.136023484028619, + "language_loss": 0.70221758, + "learning_rate": 2.26155112714642e-06, + "loss": 0.72375906, + "num_input_tokens_seen": 169597510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76171875, + "step": 7892, + "time_per_iteration": 2.45662260055542 + }, + { + "auxiliary_loss_clip": 0.01036198, + "auxiliary_loss_mlp": 0.01003564, + "balance_loss_clip": 1.00212717, + "balance_loss_mlp": 1.01211762, + "epoch": 0.47455283330828196, + "flos": 62557180122240.0, + "grad_norm": 0.8097608885887184, + "language_loss": 0.5861572, + "learning_rate": 2.2611650052056355e-06, + "loss": 0.60655481, + "num_input_tokens_seen": 169660010, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.24121094, + "step": 7893, + "time_per_iteration": 3.1652448177337646 + }, + { + "auxiliary_loss_clip": 0.01114001, + "auxiliary_loss_mlp": 0.01033874, + "balance_loss_clip": 1.02131534, + "balance_loss_mlp": 1.04149461, + "epoch": 0.47461295656094993, + "flos": 12093637380480.0, + "grad_norm": 1.8991850536849317, + "language_loss": 0.77645361, + "learning_rate": 2.2607788733614463e-06, + "loss": 0.79793239, + "num_input_tokens_seen": 169678485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7894, + "time_per_iteration": 2.4849085807800293 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.01912403, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4746730798136179, + "flos": 20884568981760.0, + "grad_norm": 1.6188047164673534, + "language_loss": 0.74456996, + "learning_rate": 2.260392731628497e-06, + "loss": 0.76601076, + "num_input_tokens_seen": 169697335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7895, + "time_per_iteration": 2.456735372543335 + }, + { + "auxiliary_loss_clip": 0.01110765, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01553416, + "balance_loss_mlp": 1.03990245, + "epoch": 0.4747332030662859, + "flos": 19974808287360.0, + "grad_norm": 1.9073077974003343, + "language_loss": 0.82539713, + "learning_rate": 2.260006580021429e-06, + "loss": 0.84679627, + "num_input_tokens_seen": 169715395, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 7896, + "time_per_iteration": 2.5201456546783447 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01029897, + "balance_loss_clip": 1.0161047, + "balance_loss_mlp": 1.03953171, + "epoch": 0.4747933263189539, + "flos": 16034186920320.0, + "grad_norm": 1.922550471395919, + "language_loss": 0.75487721, + "learning_rate": 2.259620418554886e-06, + "loss": 0.77628207, + "num_input_tokens_seen": 169733755, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 7897, + "time_per_iteration": 2.42526912689209 + }, + { + "auxiliary_loss_clip": 0.01116598, + "auxiliary_loss_mlp": 0.01036051, + "balance_loss_clip": 1.02316415, + "balance_loss_mlp": 1.04003334, + "epoch": 0.47485344957162184, + "flos": 13955102876160.0, + "grad_norm": 2.1696415620255145, + "language_loss": 0.63682836, + "learning_rate": 2.25923424724351e-06, + "loss": 0.65835488, + "num_input_tokens_seen": 169751390, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.765625, + "step": 7898, + "time_per_iteration": 2.443390369415283 + }, + { + "auxiliary_loss_clip": 0.01111767, + "auxiliary_loss_mlp": 0.01036151, + "balance_loss_clip": 1.02263284, + "balance_loss_mlp": 1.03901982, + "epoch": 0.4749135728242898, + "flos": 20449080489600.0, + "grad_norm": 2.0733269605967997, + "language_loss": 0.6999402, + "learning_rate": 2.258848066101946e-06, + "loss": 0.72141939, + "num_input_tokens_seen": 169769500, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 7899, + "time_per_iteration": 2.5906245708465576 + }, + { + "auxiliary_loss_clip": 0.01114723, + "auxiliary_loss_mlp": 0.01036945, + "balance_loss_clip": 1.02314603, + "balance_loss_mlp": 1.04054523, + "epoch": 0.4749736960769578, + "flos": 28949961767040.0, + "grad_norm": 1.8534573860401393, + "language_loss": 0.68523431, + "learning_rate": 2.258461875144837e-06, + "loss": 0.70675093, + "num_input_tokens_seen": 169789215, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 7900, + "time_per_iteration": 2.5417144298553467 + }, + { + "auxiliary_loss_clip": 0.01112761, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02096641, + "balance_loss_mlp": 1.03979492, + "epoch": 0.47503381932962574, + "flos": 31938770592000.0, + "grad_norm": 1.9751823447072345, + "language_loss": 0.70783907, + "learning_rate": 2.2580756743868273e-06, + "loss": 0.72930533, + "num_input_tokens_seen": 169808825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 7901, + "time_per_iteration": 2.5215682983398438 + }, + { + "auxiliary_loss_clip": 0.01115181, + "auxiliary_loss_mlp": 0.01041261, + "balance_loss_clip": 1.02833235, + "balance_loss_mlp": 1.0420568, + "epoch": 0.4750939425822937, + "flos": 22127257860480.0, + "grad_norm": 1.7245601487210742, + "language_loss": 0.73674953, + "learning_rate": 2.2576894638425636e-06, + "loss": 0.75831395, + "num_input_tokens_seen": 169827590, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.734375, + "step": 7902, + "time_per_iteration": 2.4854226112365723 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01033639, + "balance_loss_clip": 1.02169394, + "balance_loss_mlp": 1.03990698, + "epoch": 0.47515406583496167, + "flos": 20850094903680.0, + "grad_norm": 1.6802974507725348, + "language_loss": 0.68601072, + "learning_rate": 2.257303243526688e-06, + "loss": 0.70743585, + "num_input_tokens_seen": 169844925, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 7903, + "time_per_iteration": 2.44101619720459 + }, + { + "auxiliary_loss_clip": 0.01108361, + "auxiliary_loss_mlp": 0.0103213, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03901863, + "epoch": 0.47521418908762963, + "flos": 17524802448000.0, + "grad_norm": 1.4630263980427167, + "language_loss": 0.7225582, + "learning_rate": 2.256917013453848e-06, + "loss": 0.74396306, + "num_input_tokens_seen": 169862705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7904, + "time_per_iteration": 2.469230890274048 + }, + { + "auxiliary_loss_clip": 0.01108968, + "auxiliary_loss_mlp": 0.01030528, + "balance_loss_clip": 1.01894665, + "balance_loss_mlp": 1.03912354, + "epoch": 0.4752743123402976, + "flos": 20559434048640.0, + "grad_norm": 1.669936371268517, + "language_loss": 0.86257637, + "learning_rate": 2.25653077363869e-06, + "loss": 0.88397133, + "num_input_tokens_seen": 169880155, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 7905, + "time_per_iteration": 2.442215919494629 + }, + { + "auxiliary_loss_clip": 0.0110692, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01750422, + "balance_loss_mlp": 1.03796053, + "epoch": 0.47533443559296557, + "flos": 26360623071360.0, + "grad_norm": 1.6116801799731275, + "language_loss": 0.82223809, + "learning_rate": 2.2561445240958583e-06, + "loss": 0.84360093, + "num_input_tokens_seen": 169901525, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 7906, + "time_per_iteration": 2.503708600997925 + }, + { + "auxiliary_loss_clip": 0.01033043, + "auxiliary_loss_mlp": 0.01004824, + "balance_loss_clip": 1.00345886, + "balance_loss_mlp": 1.00910616, + "epoch": 0.47539455884563353, + "flos": 65949660967680.0, + "grad_norm": 0.6702574149317626, + "language_loss": 0.59028685, + "learning_rate": 2.255758264840002e-06, + "loss": 0.61066544, + "num_input_tokens_seen": 169970345, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.23925781, + "step": 7907, + "time_per_iteration": 3.156270980834961 + }, + { + "auxiliary_loss_clip": 0.01112242, + "auxiliary_loss_mlp": 0.01036172, + "balance_loss_clip": 1.02349377, + "balance_loss_mlp": 1.04145598, + "epoch": 0.4754546820983015, + "flos": 17238128002560.0, + "grad_norm": 1.9115330257313565, + "language_loss": 0.81044137, + "learning_rate": 2.255371995885765e-06, + "loss": 0.83192551, + "num_input_tokens_seen": 169986440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 7908, + "time_per_iteration": 2.4719884395599365 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.01944923, + "balance_loss_mlp": 1.04349983, + "epoch": 0.47551480535096946, + "flos": 19825886499840.0, + "grad_norm": 1.7275790068018955, + "language_loss": 0.73515987, + "learning_rate": 2.254985717247797e-06, + "loss": 0.75665224, + "num_input_tokens_seen": 170005705, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 7909, + "time_per_iteration": 2.4672436714172363 + }, + { + "auxiliary_loss_clip": 0.01110088, + "auxiliary_loss_mlp": 0.01031421, + "balance_loss_clip": 1.01887441, + "balance_loss_mlp": 1.03941047, + "epoch": 0.4755749286036375, + "flos": 22163958581760.0, + "grad_norm": 1.618978075546398, + "language_loss": 0.75284743, + "learning_rate": 2.2545994289407457e-06, + "loss": 0.77426249, + "num_input_tokens_seen": 170023415, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 7910, + "time_per_iteration": 2.498745918273926 + }, + { + "auxiliary_loss_clip": 0.0110873, + "auxiliary_loss_mlp": 0.01026701, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03872323, + "epoch": 0.47563505185630545, + "flos": 21648280976640.0, + "grad_norm": 1.8146975429148502, + "language_loss": 0.78950047, + "learning_rate": 2.2542131309792577e-06, + "loss": 0.81085479, + "num_input_tokens_seen": 170042395, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 7911, + "time_per_iteration": 2.4530739784240723 + }, + { + "auxiliary_loss_clip": 0.01112727, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01709199, + "balance_loss_mlp": 1.03904319, + "epoch": 0.4756951751089734, + "flos": 20628777254400.0, + "grad_norm": 1.5788116451196046, + "language_loss": 0.75611186, + "learning_rate": 2.253826823377983e-06, + "loss": 0.77754539, + "num_input_tokens_seen": 170061610, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 7912, + "time_per_iteration": 2.468348741531372 + }, + { + "auxiliary_loss_clip": 0.01107815, + "auxiliary_loss_mlp": 0.01033048, + "balance_loss_clip": 1.02094245, + "balance_loss_mlp": 1.03746927, + "epoch": 0.4757552983616414, + "flos": 25848788221440.0, + "grad_norm": 1.4305595105203048, + "language_loss": 0.74305665, + "learning_rate": 2.253440506151569e-06, + "loss": 0.76446521, + "num_input_tokens_seen": 170083505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 7913, + "time_per_iteration": 2.4857094287872314 + }, + { + "auxiliary_loss_clip": 0.01111637, + "auxiliary_loss_mlp": 0.01026142, + "balance_loss_clip": 1.01336265, + "balance_loss_mlp": 1.04057527, + "epoch": 0.47581542161430934, + "flos": 18223013992320.0, + "grad_norm": 1.9652679728787295, + "language_loss": 0.72320372, + "learning_rate": 2.253054179314666e-06, + "loss": 0.74458152, + "num_input_tokens_seen": 170100690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 7914, + "time_per_iteration": 2.4559848308563232 + }, + { + "auxiliary_loss_clip": 0.01114052, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.0191946, + "balance_loss_mlp": 1.04203475, + "epoch": 0.4758755448669773, + "flos": 21579763783680.0, + "grad_norm": 1.960460869956429, + "language_loss": 0.64513958, + "learning_rate": 2.2526678428819227e-06, + "loss": 0.66659272, + "num_input_tokens_seen": 170119240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 7915, + "time_per_iteration": 2.4528729915618896 + }, + { + "auxiliary_loss_clip": 0.01106319, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.020257, + "balance_loss_mlp": 1.03847694, + "epoch": 0.47593566811964527, + "flos": 15231152511360.0, + "grad_norm": 1.6765568872542898, + "language_loss": 0.76760435, + "learning_rate": 2.2522814968679896e-06, + "loss": 0.7889936, + "num_input_tokens_seen": 170136450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 7916, + "time_per_iteration": 2.4544637203216553 + }, + { + "auxiliary_loss_clip": 0.01109831, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01720083, + "balance_loss_mlp": 1.038872, + "epoch": 0.47599579137231324, + "flos": 21543242630400.0, + "grad_norm": 1.7964770898598468, + "language_loss": 0.64513361, + "learning_rate": 2.2518951412875173e-06, + "loss": 0.66652668, + "num_input_tokens_seen": 170155295, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 7917, + "time_per_iteration": 2.4966535568237305 + }, + { + "auxiliary_loss_clip": 0.01033431, + "auxiliary_loss_mlp": 0.01003778, + "balance_loss_clip": 1.00258541, + "balance_loss_mlp": 1.00975943, + "epoch": 0.4760559146249812, + "flos": 64554602595840.0, + "grad_norm": 0.8336021747517385, + "language_loss": 0.6568867, + "learning_rate": 2.2515087761551557e-06, + "loss": 0.67725885, + "num_input_tokens_seen": 170222325, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.23632812, + "step": 7918, + "time_per_iteration": 3.0902352333068848 + }, + { + "auxiliary_loss_clip": 0.01111138, + "auxiliary_loss_mlp": 0.01031837, + "balance_loss_clip": 1.01937342, + "balance_loss_mlp": 1.03909731, + "epoch": 0.47611603787764917, + "flos": 22233876405120.0, + "grad_norm": 1.7210259476746916, + "language_loss": 0.6884234, + "learning_rate": 2.2511224014855563e-06, + "loss": 0.70985305, + "num_input_tokens_seen": 170241625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7919, + "time_per_iteration": 2.451730728149414 + }, + { + "auxiliary_loss_clip": 0.01111075, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02188087, + "balance_loss_mlp": 1.03897047, + "epoch": 0.47617616113031713, + "flos": 22780005765120.0, + "grad_norm": 1.5380536315740185, + "language_loss": 0.74750632, + "learning_rate": 2.2507360172933694e-06, + "loss": 0.7689606, + "num_input_tokens_seen": 170262470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 7920, + "time_per_iteration": 2.5365359783172607 + }, + { + "auxiliary_loss_clip": 0.0111556, + "auxiliary_loss_mlp": 0.01032511, + "balance_loss_clip": 1.01854539, + "balance_loss_mlp": 1.04174948, + "epoch": 0.4762362843829851, + "flos": 24133802388480.0, + "grad_norm": 1.4190261222987137, + "language_loss": 0.77478063, + "learning_rate": 2.2503496235932487e-06, + "loss": 0.79626137, + "num_input_tokens_seen": 170283460, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 7921, + "time_per_iteration": 2.4841856956481934 + }, + { + "auxiliary_loss_clip": 0.01112061, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02113843, + "balance_loss_mlp": 1.03917885, + "epoch": 0.47629640763565306, + "flos": 22452069571200.0, + "grad_norm": 1.531083685843196, + "language_loss": 0.78213, + "learning_rate": 2.249963220399845e-06, + "loss": 0.80360126, + "num_input_tokens_seen": 170304225, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7922, + "time_per_iteration": 2.537930965423584 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02360809, + "balance_loss_mlp": 1.04113102, + "epoch": 0.4763565308883211, + "flos": 11181398647680.0, + "grad_norm": 1.7101716924021442, + "language_loss": 0.72932559, + "learning_rate": 2.2495768077278104e-06, + "loss": 0.75085688, + "num_input_tokens_seen": 170322110, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.74609375, + "step": 7923, + "time_per_iteration": 2.4527640342712402 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01032066, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03808331, + "epoch": 0.47641665414098905, + "flos": 22382151747840.0, + "grad_norm": 2.125534979901623, + "language_loss": 0.81915551, + "learning_rate": 2.2491903855917992e-06, + "loss": 0.84058034, + "num_input_tokens_seen": 170340700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7265625, + "step": 7924, + "time_per_iteration": 2.480109930038452 + }, + { + "auxiliary_loss_clip": 0.01120558, + "auxiliary_loss_mlp": 0.01038344, + "balance_loss_clip": 1.0246644, + "balance_loss_mlp": 1.04359889, + "epoch": 0.476476777393657, + "flos": 25046148862080.0, + "grad_norm": 1.7710398873833821, + "language_loss": 0.80079067, + "learning_rate": 2.2488039540064626e-06, + "loss": 0.82237971, + "num_input_tokens_seen": 170359780, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.76953125, + "step": 7925, + "time_per_iteration": 2.4877142906188965 + }, + { + "auxiliary_loss_clip": 0.01110581, + "auxiliary_loss_mlp": 0.01035936, + "balance_loss_clip": 1.02343702, + "balance_loss_mlp": 1.03800642, + "epoch": 0.476536900646325, + "flos": 27269916888960.0, + "grad_norm": 2.066985409764694, + "language_loss": 0.72263825, + "learning_rate": 2.2484175129864558e-06, + "loss": 0.74410343, + "num_input_tokens_seen": 170381260, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 7926, + "time_per_iteration": 2.561140298843384 + }, + { + "auxiliary_loss_clip": 0.01116818, + "auxiliary_loss_mlp": 0.01029726, + "balance_loss_clip": 1.01623797, + "balance_loss_mlp": 1.04205072, + "epoch": 0.47659702389899294, + "flos": 25301401885440.0, + "grad_norm": 8.404578303652414, + "language_loss": 0.68589562, + "learning_rate": 2.248031062546432e-06, + "loss": 0.7073611, + "num_input_tokens_seen": 170400595, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 7927, + "time_per_iteration": 2.4860117435455322 + }, + { + "auxiliary_loss_clip": 0.01111384, + "auxiliary_loss_mlp": 0.01025704, + "balance_loss_clip": 1.0138253, + "balance_loss_mlp": 1.04121518, + "epoch": 0.4766571471516609, + "flos": 25992861672960.0, + "grad_norm": 1.5906069345122125, + "language_loss": 0.68003678, + "learning_rate": 2.247644602701045e-06, + "loss": 0.70140767, + "num_input_tokens_seen": 170421110, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 7928, + "time_per_iteration": 3.917212724685669 + }, + { + "auxiliary_loss_clip": 0.0111287, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.0160315, + "balance_loss_mlp": 1.04099739, + "epoch": 0.4767172704043289, + "flos": 16032211672320.0, + "grad_norm": 2.0359036820122762, + "language_loss": 0.79055941, + "learning_rate": 2.2472581334649496e-06, + "loss": 0.81197274, + "num_input_tokens_seen": 170436700, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 7929, + "time_per_iteration": 5.38159441947937 + }, + { + "auxiliary_loss_clip": 0.01109888, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02098787, + "balance_loss_mlp": 1.04033756, + "epoch": 0.47677739365699684, + "flos": 39235351651200.0, + "grad_norm": 1.8427147864954625, + "language_loss": 0.6634798, + "learning_rate": 2.2468716548528016e-06, + "loss": 0.68490613, + "num_input_tokens_seen": 170459555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 7930, + "time_per_iteration": 4.1562559604644775 + }, + { + "auxiliary_loss_clip": 0.01110022, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03929853, + "epoch": 0.4768375169096648, + "flos": 24717781704960.0, + "grad_norm": 1.7695493738399266, + "language_loss": 0.80279613, + "learning_rate": 2.2464851668792555e-06, + "loss": 0.82420039, + "num_input_tokens_seen": 170479175, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 7931, + "time_per_iteration": 2.483144760131836 + }, + { + "auxiliary_loss_clip": 0.01112785, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01667237, + "balance_loss_mlp": 1.04009867, + "epoch": 0.47689764016233277, + "flos": 22528667324160.0, + "grad_norm": 1.714860616709588, + "language_loss": 0.75956833, + "learning_rate": 2.2460986695589678e-06, + "loss": 0.78099489, + "num_input_tokens_seen": 170498450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 7932, + "time_per_iteration": 2.4789490699768066 + }, + { + "auxiliary_loss_clip": 0.0111028, + "auxiliary_loss_mlp": 0.01033967, + "balance_loss_clip": 1.02110386, + "balance_loss_mlp": 1.04108882, + "epoch": 0.47695776341500074, + "flos": 15120619384320.0, + "grad_norm": 2.3368480026304748, + "language_loss": 0.79639196, + "learning_rate": 2.245712162906593e-06, + "loss": 0.81783438, + "num_input_tokens_seen": 170516255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7933, + "time_per_iteration": 2.4574432373046875 + }, + { + "auxiliary_loss_clip": 0.01116858, + "auxiliary_loss_mlp": 0.0103583, + "balance_loss_clip": 1.02131057, + "balance_loss_mlp": 1.04114437, + "epoch": 0.4770178866676687, + "flos": 14678917839360.0, + "grad_norm": 1.7879612820388389, + "language_loss": 0.73776019, + "learning_rate": 2.2453256469367888e-06, + "loss": 0.759287, + "num_input_tokens_seen": 170532705, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 7934, + "time_per_iteration": 2.4703593254089355 + }, + { + "auxiliary_loss_clip": 0.0111259, + "auxiliary_loss_mlp": 0.01028961, + "balance_loss_clip": 1.01611567, + "balance_loss_mlp": 1.03858674, + "epoch": 0.47707800992033667, + "flos": 22565583527040.0, + "grad_norm": 1.719427707895152, + "language_loss": 0.7973842, + "learning_rate": 2.244939121664211e-06, + "loss": 0.81879967, + "num_input_tokens_seen": 170551925, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7421875, + "step": 7935, + "time_per_iteration": 2.459326982498169 + }, + { + "auxiliary_loss_clip": 0.01119081, + "auxiliary_loss_mlp": 0.01039794, + "balance_loss_clip": 1.02566767, + "balance_loss_mlp": 1.04244995, + "epoch": 0.4771381331730047, + "flos": 30918225375360.0, + "grad_norm": 1.7712234775739364, + "language_loss": 0.71105671, + "learning_rate": 2.2445525871035177e-06, + "loss": 0.73264545, + "num_input_tokens_seen": 170572320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.76953125, + "step": 7936, + "time_per_iteration": 2.599914312362671 + }, + { + "auxiliary_loss_clip": 0.01112402, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.01529551, + "balance_loss_mlp": 1.03864932, + "epoch": 0.47719825642567265, + "flos": 25738901539200.0, + "grad_norm": 2.8731818732430927, + "language_loss": 0.68026948, + "learning_rate": 2.2441660432693656e-06, + "loss": 0.7016744, + "num_input_tokens_seen": 170589470, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 7937, + "time_per_iteration": 2.4884297847747803 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.00999711, + "balance_loss_clip": 0.99838793, + "balance_loss_mlp": 1.01120472, + "epoch": 0.4772583796783406, + "flos": 66355128668160.0, + "grad_norm": 0.7133873095384958, + "language_loss": 0.56401992, + "learning_rate": 2.2437794901764128e-06, + "loss": 0.58437109, + "num_input_tokens_seen": 170662265, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2421875, + "step": 7938, + "time_per_iteration": 3.27707576751709 + }, + { + "auxiliary_loss_clip": 0.01113753, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.01889467, + "balance_loss_mlp": 1.04162848, + "epoch": 0.4773185029310086, + "flos": 22051091070720.0, + "grad_norm": 1.6305385471502185, + "language_loss": 0.88721037, + "learning_rate": 2.243392927839317e-06, + "loss": 0.9086687, + "num_input_tokens_seen": 170679680, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.72265625, + "step": 7939, + "time_per_iteration": 2.503838300704956 + }, + { + "auxiliary_loss_clip": 0.01110311, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02037096, + "balance_loss_mlp": 1.03832293, + "epoch": 0.47737862618367655, + "flos": 16727801523840.0, + "grad_norm": 2.146362570276984, + "language_loss": 0.76661658, + "learning_rate": 2.2430063562727367e-06, + "loss": 0.78804338, + "num_input_tokens_seen": 170697340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7940, + "time_per_iteration": 2.4230127334594727 + }, + { + "auxiliary_loss_clip": 0.01109098, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.0194304, + "balance_loss_mlp": 1.03975916, + "epoch": 0.4774387494363445, + "flos": 19609453100160.0, + "grad_norm": 1.568994035010224, + "language_loss": 0.84892023, + "learning_rate": 2.2426197754913322e-06, + "loss": 0.87032247, + "num_input_tokens_seen": 170714905, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 7941, + "time_per_iteration": 2.4640510082244873 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01035857, + "balance_loss_clip": 1.02263689, + "balance_loss_mlp": 1.04307771, + "epoch": 0.4774988726890125, + "flos": 16653969118080.0, + "grad_norm": 2.0154740266117104, + "language_loss": 0.75996536, + "learning_rate": 2.24223318550976e-06, + "loss": 0.78149283, + "num_input_tokens_seen": 170731810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 7942, + "time_per_iteration": 2.4304351806640625 + }, + { + "auxiliary_loss_clip": 0.01113984, + "auxiliary_loss_mlp": 0.01038477, + "balance_loss_clip": 1.02646661, + "balance_loss_mlp": 1.0415473, + "epoch": 0.47755899594168044, + "flos": 20485565729280.0, + "grad_norm": 1.8198127192389717, + "language_loss": 0.64578187, + "learning_rate": 2.241846586342682e-06, + "loss": 0.66730648, + "num_input_tokens_seen": 170750270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.72265625, + "step": 7943, + "time_per_iteration": 2.469884157180786 + }, + { + "auxiliary_loss_clip": 0.01114805, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.02318239, + "balance_loss_mlp": 1.04029822, + "epoch": 0.4776191191943484, + "flos": 21652806090240.0, + "grad_norm": 1.6437441778624493, + "language_loss": 0.73638076, + "learning_rate": 2.2414599780047577e-06, + "loss": 0.75789517, + "num_input_tokens_seen": 170769015, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 7944, + "time_per_iteration": 2.462620258331299 + }, + { + "auxiliary_loss_clip": 0.0111381, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02092481, + "balance_loss_mlp": 1.04105759, + "epoch": 0.4776792424470164, + "flos": 18770220760320.0, + "grad_norm": 2.2015870606275785, + "language_loss": 0.67936689, + "learning_rate": 2.2410733605106456e-06, + "loss": 0.70085418, + "num_input_tokens_seen": 170785725, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7265625, + "step": 7945, + "time_per_iteration": 2.498506784439087 + }, + { + "auxiliary_loss_clip": 0.01110287, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.02483487, + "balance_loss_mlp": 1.03805077, + "epoch": 0.47773936569968434, + "flos": 29715828577920.0, + "grad_norm": 1.8282867356700874, + "language_loss": 0.75330615, + "learning_rate": 2.240686733875009e-06, + "loss": 0.77477872, + "num_input_tokens_seen": 170804600, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.72265625, + "step": 7946, + "time_per_iteration": 2.5168514251708984 + }, + { + "auxiliary_loss_clip": 0.01116531, + "auxiliary_loss_mlp": 0.01041116, + "balance_loss_clip": 1.02759135, + "balance_loss_mlp": 1.04283607, + "epoch": 0.4777994889523523, + "flos": 24791542283520.0, + "grad_norm": 1.7491504350819331, + "language_loss": 0.79312646, + "learning_rate": 2.240300098112506e-06, + "loss": 0.81470287, + "num_input_tokens_seen": 170824230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 7947, + "time_per_iteration": 2.5980498790740967 + }, + { + "auxiliary_loss_clip": 0.01107555, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02433419, + "balance_loss_mlp": 1.0381552, + "epoch": 0.47785961220502027, + "flos": 17858161595520.0, + "grad_norm": 1.7633094448758173, + "language_loss": 0.73717982, + "learning_rate": 2.2399134532377998e-06, + "loss": 0.75862265, + "num_input_tokens_seen": 170843365, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 7948, + "time_per_iteration": 2.446190357208252 + }, + { + "auxiliary_loss_clip": 0.01115036, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02050555, + "balance_loss_mlp": 1.04240656, + "epoch": 0.4779197354576883, + "flos": 20266546550400.0, + "grad_norm": 1.5048270934573464, + "language_loss": 0.77945703, + "learning_rate": 2.2395267992655514e-06, + "loss": 0.80094588, + "num_input_tokens_seen": 170863515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 7949, + "time_per_iteration": 2.4999916553497314 + }, + { + "auxiliary_loss_clip": 0.01107805, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.02077556, + "balance_loss_mlp": 1.0387454, + "epoch": 0.47797985871035625, + "flos": 17056599644160.0, + "grad_norm": 2.112378262987889, + "language_loss": 0.74019569, + "learning_rate": 2.2391401362104227e-06, + "loss": 0.7616021, + "num_input_tokens_seen": 170881245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 7950, + "time_per_iteration": 2.4387645721435547 + }, + { + "auxiliary_loss_clip": 0.01110159, + "auxiliary_loss_mlp": 0.01039658, + "balance_loss_clip": 1.02609253, + "balance_loss_mlp": 1.03978574, + "epoch": 0.4780399819630242, + "flos": 31358418549120.0, + "grad_norm": 1.7104198942075015, + "language_loss": 0.74135828, + "learning_rate": 2.2387534640870756e-06, + "loss": 0.76285648, + "num_input_tokens_seen": 170901285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.703125, + "step": 7951, + "time_per_iteration": 2.579258680343628 + }, + { + "auxiliary_loss_clip": 0.01112662, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03915167, + "epoch": 0.4781001052156922, + "flos": 24899597372160.0, + "grad_norm": 1.8112920130665326, + "language_loss": 0.79960251, + "learning_rate": 2.238366782910174e-06, + "loss": 0.82105488, + "num_input_tokens_seen": 170919740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 7952, + "time_per_iteration": 2.5007214546203613 + }, + { + "auxiliary_loss_clip": 0.01114258, + "auxiliary_loss_mlp": 0.01040283, + "balance_loss_clip": 1.02687836, + "balance_loss_mlp": 1.04040217, + "epoch": 0.47816022846836015, + "flos": 18697717157760.0, + "grad_norm": 1.7026148138194093, + "language_loss": 0.78196061, + "learning_rate": 2.23798009269438e-06, + "loss": 0.80350602, + "num_input_tokens_seen": 170938510, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73828125, + "step": 7953, + "time_per_iteration": 2.4699995517730713 + }, + { + "auxiliary_loss_clip": 0.01114922, + "auxiliary_loss_mlp": 0.01036085, + "balance_loss_clip": 1.02362204, + "balance_loss_mlp": 1.0405128, + "epoch": 0.4782203517210281, + "flos": 11977573559040.0, + "grad_norm": 2.2363441879819224, + "language_loss": 0.84142399, + "learning_rate": 2.2375933934543566e-06, + "loss": 0.86293399, + "num_input_tokens_seen": 170951170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 7954, + "time_per_iteration": 2.41294527053833 + }, + { + "auxiliary_loss_clip": 0.01109876, + "auxiliary_loss_mlp": 0.01035461, + "balance_loss_clip": 1.02254462, + "balance_loss_mlp": 1.03839588, + "epoch": 0.4782804749736961, + "flos": 20813501923200.0, + "grad_norm": 1.442835840236476, + "language_loss": 0.70588672, + "learning_rate": 2.237206685204768e-06, + "loss": 0.72734004, + "num_input_tokens_seen": 170970990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 7955, + "time_per_iteration": 2.4867892265319824 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01037464, + "balance_loss_clip": 1.02507281, + "balance_loss_mlp": 1.03925073, + "epoch": 0.47834059822636404, + "flos": 23840304359040.0, + "grad_norm": 1.5835230785797205, + "language_loss": 0.817267, + "learning_rate": 2.2368199679602787e-06, + "loss": 0.83875084, + "num_input_tokens_seen": 170991215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 7956, + "time_per_iteration": 2.4756619930267334 + }, + { + "auxiliary_loss_clip": 0.0111219, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.04097366, + "epoch": 0.478400721479032, + "flos": 22633777497600.0, + "grad_norm": 1.8961411498697718, + "language_loss": 0.84901869, + "learning_rate": 2.2364332417355516e-06, + "loss": 0.87047327, + "num_input_tokens_seen": 171007325, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 7957, + "time_per_iteration": 2.4848859310150146 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.02065289, + "balance_loss_mlp": 1.0396328, + "epoch": 0.4784608447317, + "flos": 19354954262400.0, + "grad_norm": 1.5799276625975138, + "language_loss": 0.79682672, + "learning_rate": 2.2360465065452527e-06, + "loss": 0.81826073, + "num_input_tokens_seen": 171025650, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 7958, + "time_per_iteration": 2.439040422439575 + }, + { + "auxiliary_loss_clip": 0.01109825, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02074742, + "balance_loss_mlp": 1.03806448, + "epoch": 0.47852096798436794, + "flos": 24021114445440.0, + "grad_norm": 2.0401185124291406, + "language_loss": 0.82728368, + "learning_rate": 2.235659762404047e-06, + "loss": 0.8487246, + "num_input_tokens_seen": 171045045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 7959, + "time_per_iteration": 2.500182867050171 + }, + { + "auxiliary_loss_clip": 0.01108176, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.0219152, + "balance_loss_mlp": 1.04054058, + "epoch": 0.4785810912370359, + "flos": 25666433850240.0, + "grad_norm": 2.3853858164000292, + "language_loss": 0.7333414, + "learning_rate": 2.235273009326599e-06, + "loss": 0.75475383, + "num_input_tokens_seen": 171062910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 7960, + "time_per_iteration": 2.4852850437164307 + }, + { + "auxiliary_loss_clip": 0.01108515, + "auxiliary_loss_mlp": 0.01036385, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03937268, + "epoch": 0.47864121448970387, + "flos": 21432134885760.0, + "grad_norm": 1.8739024393884087, + "language_loss": 0.77067018, + "learning_rate": 2.2348862473275745e-06, + "loss": 0.79211915, + "num_input_tokens_seen": 171080875, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 7961, + "time_per_iteration": 2.482361316680908 + }, + { + "auxiliary_loss_clip": 0.01108097, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01817513, + "balance_loss_mlp": 1.03838158, + "epoch": 0.47870133774237184, + "flos": 16143894034560.0, + "grad_norm": 1.629700477315198, + "language_loss": 0.77528512, + "learning_rate": 2.2344994764216405e-06, + "loss": 0.7966699, + "num_input_tokens_seen": 171099190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 7962, + "time_per_iteration": 2.427537679672241 + }, + { + "auxiliary_loss_clip": 0.01112825, + "auxiliary_loss_mlp": 0.01034413, + "balance_loss_clip": 1.02196801, + "balance_loss_mlp": 1.04174328, + "epoch": 0.47876146099503986, + "flos": 26906788344960.0, + "grad_norm": 1.5913499246445781, + "language_loss": 0.64895082, + "learning_rate": 2.2341126966234635e-06, + "loss": 0.67042321, + "num_input_tokens_seen": 171119060, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 7963, + "time_per_iteration": 2.51082181930542 + }, + { + "auxiliary_loss_clip": 0.01110812, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.01748848, + "balance_loss_mlp": 1.03972077, + "epoch": 0.4788215842477078, + "flos": 45332085778560.0, + "grad_norm": 1.658229101322456, + "language_loss": 0.77974397, + "learning_rate": 2.2337259079477083e-06, + "loss": 0.80115253, + "num_input_tokens_seen": 171141900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 7964, + "time_per_iteration": 2.6512372493743896 + }, + { + "auxiliary_loss_clip": 0.01113836, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01617479, + "balance_loss_mlp": 1.03944111, + "epoch": 0.4788817075003758, + "flos": 22237180456320.0, + "grad_norm": 1.7558149312417117, + "language_loss": 0.76227248, + "learning_rate": 2.233339110409044e-06, + "loss": 0.78371561, + "num_input_tokens_seen": 171161045, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 7965, + "time_per_iteration": 2.4919536113739014 + }, + { + "auxiliary_loss_clip": 0.01108501, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.01957512, + "balance_loss_mlp": 1.0382036, + "epoch": 0.47894183075304375, + "flos": 16471183783680.0, + "grad_norm": 2.251400870531799, + "language_loss": 0.74590349, + "learning_rate": 2.232952304022137e-06, + "loss": 0.76730978, + "num_input_tokens_seen": 171179675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 7966, + "time_per_iteration": 2.4254770278930664 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.0169003, + "balance_loss_mlp": 1.03785586, + "epoch": 0.4790019540057117, + "flos": 24282688262400.0, + "grad_norm": 1.521959054408531, + "language_loss": 0.72728515, + "learning_rate": 2.232565488801655e-06, + "loss": 0.74866927, + "num_input_tokens_seen": 171201175, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7967, + "time_per_iteration": 2.522883892059326 + }, + { + "auxiliary_loss_clip": 0.01103831, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01433849, + "balance_loss_mlp": 1.0371958, + "epoch": 0.4790620772583797, + "flos": 25666469763840.0, + "grad_norm": 2.344774601020355, + "language_loss": 0.79174602, + "learning_rate": 2.232178664762267e-06, + "loss": 0.81304824, + "num_input_tokens_seen": 171221750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 7968, + "time_per_iteration": 2.4777579307556152 + }, + { + "auxiliary_loss_clip": 0.01035385, + "auxiliary_loss_mlp": 0.01007575, + "balance_loss_clip": 1.00622833, + "balance_loss_mlp": 1.0118711, + "epoch": 0.47912220051104765, + "flos": 69428077102080.0, + "grad_norm": 0.7636022901302345, + "language_loss": 0.62258303, + "learning_rate": 2.2317918319186408e-06, + "loss": 0.64301264, + "num_input_tokens_seen": 171292235, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.23535156, + "step": 7969, + "time_per_iteration": 4.618057012557983 + }, + { + "auxiliary_loss_clip": 0.01107101, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01555026, + "balance_loss_mlp": 1.04000521, + "epoch": 0.4791823237637156, + "flos": 24168922911360.0, + "grad_norm": 1.5307915717866403, + "language_loss": 0.77086926, + "learning_rate": 2.2314049902854446e-06, + "loss": 0.79221207, + "num_input_tokens_seen": 171312215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 7970, + "time_per_iteration": 2.469363212585449 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.01962733, + "balance_loss_mlp": 1.03676999, + "epoch": 0.4792424470163836, + "flos": 24751465683840.0, + "grad_norm": 1.595425961628827, + "language_loss": 0.70320344, + "learning_rate": 2.231018139877349e-06, + "loss": 0.72459716, + "num_input_tokens_seen": 171332975, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 7971, + "time_per_iteration": 5.436426401138306 + }, + { + "auxiliary_loss_clip": 0.01107204, + "auxiliary_loss_mlp": 0.01025624, + "balance_loss_clip": 1.01228452, + "balance_loss_mlp": 1.03725302, + "epoch": 0.47930257026905154, + "flos": 23257905240960.0, + "grad_norm": 1.2757793979028687, + "language_loss": 0.79909688, + "learning_rate": 2.230631280709021e-06, + "loss": 0.82042515, + "num_input_tokens_seen": 171353880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 7972, + "time_per_iteration": 2.4788928031921387 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01220274, + "balance_loss_mlp": 1.03801394, + "epoch": 0.4793626935217195, + "flos": 14064091718400.0, + "grad_norm": 2.154896563362021, + "language_loss": 0.69762838, + "learning_rate": 2.2302444127951327e-06, + "loss": 0.71897495, + "num_input_tokens_seen": 171370930, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 7973, + "time_per_iteration": 2.462674140930176 + }, + { + "auxiliary_loss_clip": 0.01108438, + "auxiliary_loss_mlp": 0.01031526, + "balance_loss_clip": 1.01943266, + "balance_loss_mlp": 1.0401777, + "epoch": 0.4794228167743875, + "flos": 21798854789760.0, + "grad_norm": 1.7300676969557445, + "language_loss": 0.78652924, + "learning_rate": 2.2298575361503523e-06, + "loss": 0.80792892, + "num_input_tokens_seen": 171387575, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 7974, + "time_per_iteration": 2.523935079574585 + }, + { + "auxiliary_loss_clip": 0.01035404, + "auxiliary_loss_mlp": 0.01004075, + "balance_loss_clip": 1.00275135, + "balance_loss_mlp": 1.01174331, + "epoch": 0.47948294002705544, + "flos": 66968805553920.0, + "grad_norm": 0.7575595850509929, + "language_loss": 0.54076326, + "learning_rate": 2.2294706507893517e-06, + "loss": 0.56115806, + "num_input_tokens_seen": 171449980, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.23632812, + "step": 7975, + "time_per_iteration": 3.120290756225586 + }, + { + "auxiliary_loss_clip": 0.01113379, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.01946688, + "balance_loss_mlp": 1.03872228, + "epoch": 0.47954306327972346, + "flos": 12422471414400.0, + "grad_norm": 2.0952625936259226, + "language_loss": 0.90246761, + "learning_rate": 2.2290837567268008e-06, + "loss": 0.92393565, + "num_input_tokens_seen": 171465290, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.75, + "step": 7976, + "time_per_iteration": 2.4177215099334717 + }, + { + "auxiliary_loss_clip": 0.01113502, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.02070153, + "balance_loss_mlp": 1.03989267, + "epoch": 0.4796031865323914, + "flos": 18361951799040.0, + "grad_norm": 2.1692733838107148, + "language_loss": 0.73631197, + "learning_rate": 2.2286968539773713e-06, + "loss": 0.75779295, + "num_input_tokens_seen": 171481130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 7977, + "time_per_iteration": 2.478994846343994 + }, + { + "auxiliary_loss_clip": 0.01105095, + "auxiliary_loss_mlp": 0.01033101, + "balance_loss_clip": 1.02095962, + "balance_loss_mlp": 1.03737617, + "epoch": 0.4796633097850594, + "flos": 21835088634240.0, + "grad_norm": 1.5189317692466735, + "language_loss": 0.78386033, + "learning_rate": 2.228309942555734e-06, + "loss": 0.80524224, + "num_input_tokens_seen": 171501140, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 7978, + "time_per_iteration": 2.441770315170288 + }, + { + "auxiliary_loss_clip": 0.01110092, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02036691, + "balance_loss_mlp": 1.03895688, + "epoch": 0.47972343303772735, + "flos": 23437350610560.0, + "grad_norm": 1.9080949377976553, + "language_loss": 0.89561266, + "learning_rate": 2.22792302247656e-06, + "loss": 0.91704339, + "num_input_tokens_seen": 171519835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7979, + "time_per_iteration": 2.5005874633789062 + }, + { + "auxiliary_loss_clip": 0.01111373, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03977728, + "epoch": 0.4797835562903953, + "flos": 24899776940160.0, + "grad_norm": 1.512941625260848, + "language_loss": 0.77104276, + "learning_rate": 2.227536093754523e-06, + "loss": 0.79248011, + "num_input_tokens_seen": 171540980, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 7980, + "time_per_iteration": 2.514702320098877 + }, + { + "auxiliary_loss_clip": 0.01112304, + "auxiliary_loss_mlp": 0.01031739, + "balance_loss_clip": 1.0177083, + "balance_loss_mlp": 1.03812611, + "epoch": 0.4798436795430633, + "flos": 35042996793600.0, + "grad_norm": 1.6709892763913308, + "language_loss": 0.71718562, + "learning_rate": 2.227149156404295e-06, + "loss": 0.738626, + "num_input_tokens_seen": 171563600, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 7981, + "time_per_iteration": 2.606919050216675 + }, + { + "auxiliary_loss_clip": 0.01107255, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01743317, + "balance_loss_mlp": 1.03878653, + "epoch": 0.47990380279573125, + "flos": 20590209025920.0, + "grad_norm": 1.7550369517172573, + "language_loss": 0.70141387, + "learning_rate": 2.2267622104405473e-06, + "loss": 0.72278404, + "num_input_tokens_seen": 171580700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 7982, + "time_per_iteration": 2.4303736686706543 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.0102651, + "balance_loss_clip": 1.01558483, + "balance_loss_mlp": 1.03694749, + "epoch": 0.4799639260483992, + "flos": 26359402008960.0, + "grad_norm": 2.256566494766253, + "language_loss": 0.70977259, + "learning_rate": 2.2263752558779544e-06, + "loss": 0.73106241, + "num_input_tokens_seen": 171602035, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 7983, + "time_per_iteration": 2.520749092102051 + }, + { + "auxiliary_loss_clip": 0.01032541, + "auxiliary_loss_mlp": 0.01011047, + "balance_loss_clip": 1.00992036, + "balance_loss_mlp": 1.00916195, + "epoch": 0.4800240493010672, + "flos": 70979021521920.0, + "grad_norm": 0.8049867321392653, + "language_loss": 0.59458363, + "learning_rate": 2.2259882927311883e-06, + "loss": 0.6150195, + "num_input_tokens_seen": 171659215, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.234375, + "step": 7984, + "time_per_iteration": 3.0019614696502686 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01031912, + "balance_loss_clip": 1.01912713, + "balance_loss_mlp": 1.0376364, + "epoch": 0.48008417255373514, + "flos": 17086656349440.0, + "grad_norm": 1.5803111762139084, + "language_loss": 0.66603255, + "learning_rate": 2.2256013210149247e-06, + "loss": 0.68742514, + "num_input_tokens_seen": 171675710, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 7985, + "time_per_iteration": 2.459381341934204 + }, + { + "auxiliary_loss_clip": 0.01108889, + "auxiliary_loss_mlp": 0.01036103, + "balance_loss_clip": 1.02279973, + "balance_loss_mlp": 1.03655791, + "epoch": 0.4801442958064031, + "flos": 15413435055360.0, + "grad_norm": 1.8105960725352928, + "language_loss": 0.70750952, + "learning_rate": 2.225214340743835e-06, + "loss": 0.72895944, + "num_input_tokens_seen": 171692510, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 7986, + "time_per_iteration": 2.412890911102295 + }, + { + "auxiliary_loss_clip": 0.01113566, + "auxiliary_loss_mlp": 0.01038838, + "balance_loss_clip": 1.02515244, + "balance_loss_mlp": 1.03964305, + "epoch": 0.4802044190590711, + "flos": 11473747441920.0, + "grad_norm": 2.571002176109277, + "language_loss": 0.78704774, + "learning_rate": 2.2248273519325956e-06, + "loss": 0.80857182, + "num_input_tokens_seen": 171710235, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 7987, + "time_per_iteration": 2.464531898498535 + }, + { + "auxiliary_loss_clip": 0.01107017, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02410507, + "balance_loss_mlp": 1.03615475, + "epoch": 0.48026454231173904, + "flos": 20951003185920.0, + "grad_norm": 1.8312114483143844, + "language_loss": 0.75309592, + "learning_rate": 2.2244403545958812e-06, + "loss": 0.77453303, + "num_input_tokens_seen": 171726715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 7988, + "time_per_iteration": 2.4185469150543213 + }, + { + "auxiliary_loss_clip": 0.01113071, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01667249, + "balance_loss_mlp": 1.04115009, + "epoch": 0.48032466556440706, + "flos": 20448110822400.0, + "grad_norm": 1.9770525324174564, + "language_loss": 0.78992975, + "learning_rate": 2.224053348748365e-06, + "loss": 0.81135416, + "num_input_tokens_seen": 171743605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 7989, + "time_per_iteration": 2.4614450931549072 + }, + { + "auxiliary_loss_clip": 0.01113161, + "auxiliary_loss_mlp": 0.01036646, + "balance_loss_clip": 1.02273488, + "balance_loss_mlp": 1.03810394, + "epoch": 0.480384788817075, + "flos": 37120823861760.0, + "grad_norm": 1.6525338075260034, + "language_loss": 0.73414218, + "learning_rate": 2.223666334404724e-06, + "loss": 0.75564027, + "num_input_tokens_seen": 171765445, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 7990, + "time_per_iteration": 2.562366008758545 + }, + { + "auxiliary_loss_clip": 0.01032695, + "auxiliary_loss_mlp": 0.0100018, + "balance_loss_clip": 0.99901813, + "balance_loss_mlp": 1.00915992, + "epoch": 0.480444912069743, + "flos": 69552577641600.0, + "grad_norm": 1.0595345338831614, + "language_loss": 0.59085703, + "learning_rate": 2.223279311579633e-06, + "loss": 0.61118573, + "num_input_tokens_seen": 171830115, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 7991, + "time_per_iteration": 3.1877033710479736 + }, + { + "auxiliary_loss_clip": 0.01107688, + "auxiliary_loss_mlp": 0.01029352, + "balance_loss_clip": 1.01626837, + "balance_loss_mlp": 1.03751063, + "epoch": 0.48050503532241096, + "flos": 29822231640960.0, + "grad_norm": 1.8662124275999659, + "language_loss": 0.67495418, + "learning_rate": 2.222892280287768e-06, + "loss": 0.69632453, + "num_input_tokens_seen": 171849135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 7992, + "time_per_iteration": 2.5135016441345215 + }, + { + "auxiliary_loss_clip": 0.01109706, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01969361, + "balance_loss_mlp": 1.03664112, + "epoch": 0.4805651585750789, + "flos": 23948539015680.0, + "grad_norm": 1.6211148746347477, + "language_loss": 0.76493919, + "learning_rate": 2.2225052405438056e-06, + "loss": 0.78636301, + "num_input_tokens_seen": 171868880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 7993, + "time_per_iteration": 2.5075619220733643 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01035536, + "balance_loss_clip": 1.02267301, + "balance_loss_mlp": 1.03899574, + "epoch": 0.4806252818277469, + "flos": 25665428269440.0, + "grad_norm": 1.5028541481112037, + "language_loss": 0.78277898, + "learning_rate": 2.222118192362422e-06, + "loss": 0.80421537, + "num_input_tokens_seen": 171889455, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 7994, + "time_per_iteration": 2.4792723655700684 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01032981, + "balance_loss_clip": 1.02010691, + "balance_loss_mlp": 1.03752637, + "epoch": 0.48068540508041485, + "flos": 13151996640000.0, + "grad_norm": 1.8792905950371066, + "language_loss": 0.79627287, + "learning_rate": 2.2217311357582946e-06, + "loss": 0.81768769, + "num_input_tokens_seen": 171906070, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 7995, + "time_per_iteration": 2.4605226516723633 + }, + { + "auxiliary_loss_clip": 0.0110729, + "auxiliary_loss_mlp": 0.01029971, + "balance_loss_clip": 1.01676297, + "balance_loss_mlp": 1.03693795, + "epoch": 0.4807455283330828, + "flos": 21176738208000.0, + "grad_norm": 1.8681673839648991, + "language_loss": 0.8255161, + "learning_rate": 2.2213440707461e-06, + "loss": 0.84688872, + "num_input_tokens_seen": 171926515, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 7996, + "time_per_iteration": 2.4627599716186523 + }, + { + "auxiliary_loss_clip": 0.01108595, + "auxiliary_loss_mlp": 0.01028236, + "balance_loss_clip": 1.01562989, + "balance_loss_mlp": 1.03879523, + "epoch": 0.4808056515857508, + "flos": 12275991751680.0, + "grad_norm": 1.619215200240117, + "language_loss": 0.80642337, + "learning_rate": 2.220956997340516e-06, + "loss": 0.82779169, + "num_input_tokens_seen": 171943845, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 7997, + "time_per_iteration": 2.450486660003662 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.02174699, + "balance_loss_mlp": 1.03695917, + "epoch": 0.48086577483841875, + "flos": 24826052275200.0, + "grad_norm": 1.8605056175819474, + "language_loss": 0.72481054, + "learning_rate": 2.220569915556221e-06, + "loss": 0.74624306, + "num_input_tokens_seen": 171964970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 7998, + "time_per_iteration": 2.484501361846924 + }, + { + "auxiliary_loss_clip": 0.0111064, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.01893795, + "balance_loss_mlp": 1.03890526, + "epoch": 0.4809258980910867, + "flos": 24465365856000.0, + "grad_norm": 1.7021894106986095, + "language_loss": 0.71182632, + "learning_rate": 2.220182825407892e-06, + "loss": 0.73325378, + "num_input_tokens_seen": 171986340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 7999, + "time_per_iteration": 2.5011837482452393 + }, + { + "auxiliary_loss_clip": 0.01112886, + "auxiliary_loss_mlp": 0.0104057, + "balance_loss_clip": 1.02758801, + "balance_loss_mlp": 1.03862715, + "epoch": 0.4809860213437547, + "flos": 21215952881280.0, + "grad_norm": 2.087936802810397, + "language_loss": 0.71136171, + "learning_rate": 2.2197957269102083e-06, + "loss": 0.73289621, + "num_input_tokens_seen": 172007300, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 8000, + "time_per_iteration": 2.473083019256592 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01036663, + "balance_loss_clip": 1.02291203, + "balance_loss_mlp": 1.03987443, + "epoch": 0.48104614459642264, + "flos": 37632084094080.0, + "grad_norm": 1.2945806687832948, + "language_loss": 0.75104553, + "learning_rate": 2.2194086200778485e-06, + "loss": 0.77252746, + "num_input_tokens_seen": 172029585, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8001, + "time_per_iteration": 2.6078953742980957 + }, + { + "auxiliary_loss_clip": 0.0111278, + "auxiliary_loss_mlp": 0.01040194, + "balance_loss_clip": 1.02701581, + "balance_loss_mlp": 1.03889596, + "epoch": 0.48110626784909066, + "flos": 18406122549120.0, + "grad_norm": 1.8640621993165467, + "language_loss": 0.81407833, + "learning_rate": 2.219021504925493e-06, + "loss": 0.83560812, + "num_input_tokens_seen": 172047495, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8002, + "time_per_iteration": 2.4381091594696045 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01038529, + "balance_loss_clip": 1.02415216, + "balance_loss_mlp": 1.04037309, + "epoch": 0.48116639110175863, + "flos": 28439814856320.0, + "grad_norm": 1.7407260367663493, + "language_loss": 0.71673185, + "learning_rate": 2.218634381467819e-06, + "loss": 0.7382561, + "num_input_tokens_seen": 172067625, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8003, + "time_per_iteration": 2.5028979778289795 + }, + { + "auxiliary_loss_clip": 0.01110475, + "auxiliary_loss_mlp": 0.01038852, + "balance_loss_clip": 1.0263828, + "balance_loss_mlp": 1.04041362, + "epoch": 0.4812265143544266, + "flos": 21725237865600.0, + "grad_norm": 1.9713418243952783, + "language_loss": 0.82751715, + "learning_rate": 2.218247249719507e-06, + "loss": 0.84901035, + "num_input_tokens_seen": 172087885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8004, + "time_per_iteration": 2.4438235759735107 + }, + { + "auxiliary_loss_clip": 0.0112055, + "auxiliary_loss_mlp": 0.01044746, + "balance_loss_clip": 1.02951062, + "balance_loss_mlp": 1.04235947, + "epoch": 0.48128663760709456, + "flos": 13224679810560.0, + "grad_norm": 2.0081127141146964, + "language_loss": 0.77780354, + "learning_rate": 2.217860109695239e-06, + "loss": 0.7994566, + "num_input_tokens_seen": 172105815, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.78125, + "step": 8005, + "time_per_iteration": 2.4440789222717285 + }, + { + "auxiliary_loss_clip": 0.01109918, + "auxiliary_loss_mlp": 0.01035382, + "balance_loss_clip": 1.0218395, + "balance_loss_mlp": 1.03705537, + "epoch": 0.4813467608597625, + "flos": 24243437675520.0, + "grad_norm": 3.988142696329101, + "language_loss": 0.70656502, + "learning_rate": 2.217472961409692e-06, + "loss": 0.72801799, + "num_input_tokens_seen": 172126125, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 8006, + "time_per_iteration": 2.4627490043640137 + }, + { + "auxiliary_loss_clip": 0.0111164, + "auxiliary_loss_mlp": 0.01036734, + "balance_loss_clip": 1.02357328, + "balance_loss_mlp": 1.03939271, + "epoch": 0.4814068841124305, + "flos": 27480424544640.0, + "grad_norm": 1.9148811651735764, + "language_loss": 0.70463514, + "learning_rate": 2.2170858048775495e-06, + "loss": 0.72611892, + "num_input_tokens_seen": 172141945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8007, + "time_per_iteration": 2.4923551082611084 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02225244, + "balance_loss_mlp": 1.03924334, + "epoch": 0.48146700736509845, + "flos": 19572896033280.0, + "grad_norm": 2.0099977087556202, + "language_loss": 0.71720552, + "learning_rate": 2.2166986401134914e-06, + "loss": 0.7386902, + "num_input_tokens_seen": 172161095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8008, + "time_per_iteration": 2.443068742752075 + }, + { + "auxiliary_loss_clip": 0.01114704, + "auxiliary_loss_mlp": 0.01046807, + "balance_loss_clip": 1.0317508, + "balance_loss_mlp": 1.03984571, + "epoch": 0.4815271306177664, + "flos": 20627771673600.0, + "grad_norm": 1.7155117192574523, + "language_loss": 0.60448718, + "learning_rate": 2.216311467132199e-06, + "loss": 0.62610233, + "num_input_tokens_seen": 172178750, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.74609375, + "step": 8009, + "time_per_iteration": 2.4860730171203613 + }, + { + "auxiliary_loss_clip": 0.01041953, + "auxiliary_loss_mlp": 0.01003034, + "balance_loss_clip": 1.00188339, + "balance_loss_mlp": 1.01788867, + "epoch": 0.4815872538704344, + "flos": 67691076232320.0, + "grad_norm": 0.861211973736155, + "language_loss": 0.61329502, + "learning_rate": 2.2159242859483547e-06, + "loss": 0.6337449, + "num_input_tokens_seen": 172240235, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.24121094, + "step": 8010, + "time_per_iteration": 3.073617935180664 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01045892, + "balance_loss_clip": 1.03135991, + "balance_loss_mlp": 1.04191947, + "epoch": 0.48164737712310235, + "flos": 22820764723200.0, + "grad_norm": 2.200850795507016, + "language_loss": 0.73003197, + "learning_rate": 2.215537096576639e-06, + "loss": 0.75164282, + "num_input_tokens_seen": 172259875, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.734375, + "step": 8011, + "time_per_iteration": 3.875464677810669 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01036612, + "balance_loss_clip": 1.02398205, + "balance_loss_mlp": 1.03922546, + "epoch": 0.4817075003757703, + "flos": 23733865382400.0, + "grad_norm": 1.7669872730797296, + "language_loss": 0.79906964, + "learning_rate": 2.2151498990317354e-06, + "loss": 0.82052571, + "num_input_tokens_seen": 172280150, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8012, + "time_per_iteration": 5.410374164581299 + }, + { + "auxiliary_loss_clip": 0.01114267, + "auxiliary_loss_mlp": 0.01047469, + "balance_loss_clip": 1.03336632, + "balance_loss_mlp": 1.04086518, + "epoch": 0.4817676236284383, + "flos": 28182909807360.0, + "grad_norm": 1.5982967759080098, + "language_loss": 0.73816693, + "learning_rate": 2.214762693328326e-06, + "loss": 0.75978434, + "num_input_tokens_seen": 172300810, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 8013, + "time_per_iteration": 4.00807785987854 + }, + { + "auxiliary_loss_clip": 0.01112131, + "auxiliary_loss_mlp": 0.01033013, + "balance_loss_clip": 1.02043676, + "balance_loss_mlp": 1.04102039, + "epoch": 0.48182774688110624, + "flos": 17091756080640.0, + "grad_norm": 4.768803838152643, + "language_loss": 0.90554619, + "learning_rate": 2.214375479481094e-06, + "loss": 0.92699754, + "num_input_tokens_seen": 172317930, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8014, + "time_per_iteration": 2.4615042209625244 + }, + { + "auxiliary_loss_clip": 0.01116604, + "auxiliary_loss_mlp": 0.01038374, + "balance_loss_clip": 1.02456379, + "balance_loss_mlp": 1.04058647, + "epoch": 0.4818878701337742, + "flos": 12567873669120.0, + "grad_norm": 3.0531094865391073, + "language_loss": 0.74407947, + "learning_rate": 2.213988257504722e-06, + "loss": 0.76562929, + "num_input_tokens_seen": 172336340, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 8015, + "time_per_iteration": 2.434838056564331 + }, + { + "auxiliary_loss_clip": 0.01117647, + "auxiliary_loss_mlp": 0.01040504, + "balance_loss_clip": 1.02588332, + "balance_loss_mlp": 1.04072225, + "epoch": 0.48194799338644223, + "flos": 24608505553920.0, + "grad_norm": 2.017951331310383, + "language_loss": 0.8059243, + "learning_rate": 2.213601027413894e-06, + "loss": 0.82750583, + "num_input_tokens_seen": 172354315, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.76953125, + "step": 8016, + "time_per_iteration": 2.513319492340088 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01035038, + "balance_loss_clip": 1.02206254, + "balance_loss_mlp": 1.04101717, + "epoch": 0.4820081166391102, + "flos": 21105204272640.0, + "grad_norm": 2.4127244097624847, + "language_loss": 0.76781118, + "learning_rate": 2.2132137892232933e-06, + "loss": 0.78925556, + "num_input_tokens_seen": 172372695, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 8017, + "time_per_iteration": 2.4602606296539307 + }, + { + "auxiliary_loss_clip": 0.011107, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.04151559, + "epoch": 0.48206823989177816, + "flos": 25264593423360.0, + "grad_norm": 1.9887798442379552, + "language_loss": 0.80156118, + "learning_rate": 2.2128265429476043e-06, + "loss": 0.82299387, + "num_input_tokens_seen": 172390905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 8018, + "time_per_iteration": 2.5529282093048096 + }, + { + "auxiliary_loss_clip": 0.01113443, + "auxiliary_loss_mlp": 0.01029419, + "balance_loss_clip": 1.01667559, + "balance_loss_mlp": 1.04109669, + "epoch": 0.4821283631444461, + "flos": 24645062620800.0, + "grad_norm": 1.7653706812529009, + "language_loss": 0.75843483, + "learning_rate": 2.2124392886015124e-06, + "loss": 0.77986348, + "num_input_tokens_seen": 172412295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8019, + "time_per_iteration": 2.4978489875793457 + }, + { + "auxiliary_loss_clip": 0.01112605, + "auxiliary_loss_mlp": 0.01036673, + "balance_loss_clip": 1.02286255, + "balance_loss_mlp": 1.03955722, + "epoch": 0.4821884863971141, + "flos": 23952094462080.0, + "grad_norm": 1.7828460534537498, + "language_loss": 0.78554976, + "learning_rate": 2.212052026199701e-06, + "loss": 0.80704254, + "num_input_tokens_seen": 172432625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 8020, + "time_per_iteration": 2.503870725631714 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01034544, + "balance_loss_clip": 1.02043533, + "balance_loss_mlp": 1.04134321, + "epoch": 0.48224860964978206, + "flos": 17160668323200.0, + "grad_norm": 2.4275685595470207, + "language_loss": 0.69718045, + "learning_rate": 2.211664755756855e-06, + "loss": 0.71865243, + "num_input_tokens_seen": 172450010, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8021, + "time_per_iteration": 2.4298038482666016 + }, + { + "auxiliary_loss_clip": 0.011165, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.01797438, + "balance_loss_mlp": 1.0407902, + "epoch": 0.48230873290245, + "flos": 23075838178560.0, + "grad_norm": 1.6547112313669838, + "language_loss": 0.62773043, + "learning_rate": 2.2112774772876603e-06, + "loss": 0.64921963, + "num_input_tokens_seen": 172469080, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8022, + "time_per_iteration": 2.4862682819366455 + }, + { + "auxiliary_loss_clip": 0.01109497, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01683092, + "balance_loss_mlp": 1.03976464, + "epoch": 0.482368856155118, + "flos": 19353517718400.0, + "grad_norm": 2.257171661165274, + "language_loss": 0.66345549, + "learning_rate": 2.2108901908068028e-06, + "loss": 0.68484527, + "num_input_tokens_seen": 172484850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8023, + "time_per_iteration": 2.4498074054718018 + }, + { + "auxiliary_loss_clip": 0.01109691, + "auxiliary_loss_mlp": 0.01035383, + "balance_loss_clip": 1.02181077, + "balance_loss_mlp": 1.0379076, + "epoch": 0.48242897940778595, + "flos": 20078984707200.0, + "grad_norm": 2.6609441563285485, + "language_loss": 0.76680458, + "learning_rate": 2.2105028963289683e-06, + "loss": 0.78825533, + "num_input_tokens_seen": 172503525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8024, + "time_per_iteration": 2.5641326904296875 + }, + { + "auxiliary_loss_clip": 0.01111982, + "auxiliary_loss_mlp": 0.01033968, + "balance_loss_clip": 1.01926339, + "balance_loss_mlp": 1.03856826, + "epoch": 0.4824891026604539, + "flos": 23403989854080.0, + "grad_norm": 1.4456982310337658, + "language_loss": 0.75299227, + "learning_rate": 2.2101155938688423e-06, + "loss": 0.77445179, + "num_input_tokens_seen": 172524360, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8025, + "time_per_iteration": 2.4700748920440674 + }, + { + "auxiliary_loss_clip": 0.0111201, + "auxiliary_loss_mlp": 0.01034955, + "balance_loss_clip": 1.02159774, + "balance_loss_mlp": 1.04015994, + "epoch": 0.4825492259131219, + "flos": 20368675895040.0, + "grad_norm": 1.85740453148256, + "language_loss": 0.71010149, + "learning_rate": 2.209728283441112e-06, + "loss": 0.7315712, + "num_input_tokens_seen": 172541480, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8026, + "time_per_iteration": 2.451942205429077 + }, + { + "auxiliary_loss_clip": 0.01115796, + "auxiliary_loss_mlp": 0.01043054, + "balance_loss_clip": 1.02739012, + "balance_loss_mlp": 1.04088664, + "epoch": 0.48260934916578985, + "flos": 14319021519360.0, + "grad_norm": 2.002376238963681, + "language_loss": 0.74738306, + "learning_rate": 2.209340965060465e-06, + "loss": 0.76897156, + "num_input_tokens_seen": 172559005, + "router_z_loss_clip": 0.15625, + "router_z_loss_mlp": 0.75, + "step": 8027, + "time_per_iteration": 2.511625051498413 + }, + { + "auxiliary_loss_clip": 0.01116324, + "auxiliary_loss_mlp": 0.01036177, + "balance_loss_clip": 1.02260458, + "balance_loss_mlp": 1.0418303, + "epoch": 0.4826694724184578, + "flos": 22121152548480.0, + "grad_norm": 1.8015680699639052, + "language_loss": 0.6744982, + "learning_rate": 2.2089536387415868e-06, + "loss": 0.69602323, + "num_input_tokens_seen": 172578435, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 8028, + "time_per_iteration": 2.487610101699829 + }, + { + "auxiliary_loss_clip": 0.01114464, + "auxiliary_loss_mlp": 0.01039272, + "balance_loss_clip": 1.02490783, + "balance_loss_mlp": 1.04192257, + "epoch": 0.48272959567112583, + "flos": 16181169373440.0, + "grad_norm": 1.8869203156454395, + "language_loss": 0.73063505, + "learning_rate": 2.2085663044991655e-06, + "loss": 0.75217235, + "num_input_tokens_seen": 172596095, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7265625, + "step": 8029, + "time_per_iteration": 2.4256598949432373 + }, + { + "auxiliary_loss_clip": 0.01114009, + "auxiliary_loss_mlp": 0.01031401, + "balance_loss_clip": 1.01691651, + "balance_loss_mlp": 1.03949094, + "epoch": 0.4827897189237938, + "flos": 23180445561600.0, + "grad_norm": 1.9568889088417416, + "language_loss": 0.85374999, + "learning_rate": 2.2081789623478896e-06, + "loss": 0.87520409, + "num_input_tokens_seen": 172615255, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8030, + "time_per_iteration": 2.4838480949401855 + }, + { + "auxiliary_loss_clip": 0.01111314, + "auxiliary_loss_mlp": 0.01032477, + "balance_loss_clip": 1.01917291, + "balance_loss_mlp": 1.03858352, + "epoch": 0.48284984217646176, + "flos": 21652626522240.0, + "grad_norm": 1.946134860300181, + "language_loss": 0.74173188, + "learning_rate": 2.2077916123024466e-06, + "loss": 0.76316977, + "num_input_tokens_seen": 172633185, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8031, + "time_per_iteration": 2.475564956665039 + }, + { + "auxiliary_loss_clip": 0.01118074, + "auxiliary_loss_mlp": 0.01045075, + "balance_loss_clip": 1.03023958, + "balance_loss_mlp": 1.04181576, + "epoch": 0.48290996542912973, + "flos": 31467443304960.0, + "grad_norm": 1.8194651882134072, + "language_loss": 0.71833324, + "learning_rate": 2.2074042543775245e-06, + "loss": 0.73996472, + "num_input_tokens_seen": 172654280, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.76171875, + "step": 8032, + "time_per_iteration": 2.5389230251312256 + }, + { + "auxiliary_loss_clip": 0.01111799, + "auxiliary_loss_mlp": 0.01036984, + "balance_loss_clip": 1.02326274, + "balance_loss_mlp": 1.03896618, + "epoch": 0.4829700886817977, + "flos": 24461954064000.0, + "grad_norm": 1.5190699612157064, + "language_loss": 0.74008, + "learning_rate": 2.2070168885878126e-06, + "loss": 0.76156777, + "num_input_tokens_seen": 172675545, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8033, + "time_per_iteration": 2.497344493865967 + }, + { + "auxiliary_loss_clip": 0.01118056, + "auxiliary_loss_mlp": 0.01037099, + "balance_loss_clip": 1.02273428, + "balance_loss_mlp": 1.04200494, + "epoch": 0.48303021193446566, + "flos": 25702164904320.0, + "grad_norm": 1.7070178882470917, + "language_loss": 0.82929307, + "learning_rate": 2.2066295149479996e-06, + "loss": 0.85084462, + "num_input_tokens_seen": 172696455, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.76171875, + "step": 8034, + "time_per_iteration": 2.504986524581909 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.01032285, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.04048431, + "epoch": 0.4830903351871336, + "flos": 20085233673600.0, + "grad_norm": 2.2841237596844493, + "language_loss": 0.79519325, + "learning_rate": 2.2062421334727744e-06, + "loss": 0.81662393, + "num_input_tokens_seen": 172716720, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 8035, + "time_per_iteration": 2.497851610183716 + }, + { + "auxiliary_loss_clip": 0.01115806, + "auxiliary_loss_mlp": 0.01041785, + "balance_loss_clip": 1.02656746, + "balance_loss_mlp": 1.04139149, + "epoch": 0.4831504584398016, + "flos": 39452216014080.0, + "grad_norm": 1.7925521800027493, + "language_loss": 0.69359076, + "learning_rate": 2.2058547441768267e-06, + "loss": 0.71516669, + "num_input_tokens_seen": 172737435, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.7421875, + "step": 8036, + "time_per_iteration": 2.6260759830474854 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01034751, + "balance_loss_clip": 1.0211308, + "balance_loss_mlp": 1.03983057, + "epoch": 0.48321058169246955, + "flos": 20006588845440.0, + "grad_norm": 2.034912964838748, + "language_loss": 0.72518653, + "learning_rate": 2.205467347074847e-06, + "loss": 0.74665534, + "num_input_tokens_seen": 172755700, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8037, + "time_per_iteration": 2.4452965259552 + }, + { + "auxiliary_loss_clip": 0.01120439, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.02053404, + "balance_loss_mlp": 1.04226792, + "epoch": 0.4832707049451375, + "flos": 20741465197440.0, + "grad_norm": 2.369475157435804, + "language_loss": 0.69122416, + "learning_rate": 2.205079942181525e-06, + "loss": 0.71278501, + "num_input_tokens_seen": 172775185, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.78125, + "step": 8038, + "time_per_iteration": 2.4694747924804688 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01036639, + "balance_loss_clip": 1.02201188, + "balance_loss_mlp": 1.04133189, + "epoch": 0.4833308281978055, + "flos": 33145584762240.0, + "grad_norm": 1.4952565926757524, + "language_loss": 0.78972542, + "learning_rate": 2.20469252951155e-06, + "loss": 0.8112368, + "num_input_tokens_seen": 172796990, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 8039, + "time_per_iteration": 2.5778839588165283 + }, + { + "auxiliary_loss_clip": 0.01116832, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.01874638, + "balance_loss_mlp": 1.04335415, + "epoch": 0.48339095145047345, + "flos": 19099234362240.0, + "grad_norm": 1.6799663014860025, + "language_loss": 0.76981616, + "learning_rate": 2.2043051090796143e-06, + "loss": 0.79131073, + "num_input_tokens_seen": 172814915, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8040, + "time_per_iteration": 2.4846322536468506 + }, + { + "auxiliary_loss_clip": 0.01116146, + "auxiliary_loss_mlp": 0.01037901, + "balance_loss_clip": 1.02283335, + "balance_loss_mlp": 1.04120946, + "epoch": 0.4834510747031414, + "flos": 34459448440320.0, + "grad_norm": 1.5584368035119462, + "language_loss": 0.75443131, + "learning_rate": 2.203917680900409e-06, + "loss": 0.77597177, + "num_input_tokens_seen": 172837060, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 8041, + "time_per_iteration": 2.5853140354156494 + }, + { + "auxiliary_loss_clip": 0.01115991, + "auxiliary_loss_mlp": 0.0103594, + "balance_loss_clip": 1.02178383, + "balance_loss_mlp": 1.04486728, + "epoch": 0.48351119795580944, + "flos": 27380845065600.0, + "grad_norm": 1.8135207231669344, + "language_loss": 0.66745925, + "learning_rate": 2.203530244988624e-06, + "loss": 0.68897855, + "num_input_tokens_seen": 172856545, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 8042, + "time_per_iteration": 2.5322182178497314 + }, + { + "auxiliary_loss_clip": 0.01040325, + "auxiliary_loss_mlp": 0.00998367, + "balance_loss_clip": 0.99714488, + "balance_loss_mlp": 1.0165081, + "epoch": 0.4835713212084774, + "flos": 67143941291520.0, + "grad_norm": 0.687656922942032, + "language_loss": 0.58557642, + "learning_rate": 2.2031428013589517e-06, + "loss": 0.60596335, + "num_input_tokens_seen": 172923055, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.23828125, + "step": 8043, + "time_per_iteration": 3.1435444355010986 + }, + { + "auxiliary_loss_clip": 0.01115264, + "auxiliary_loss_mlp": 0.01034627, + "balance_loss_clip": 1.01982713, + "balance_loss_mlp": 1.04060805, + "epoch": 0.48363144446114537, + "flos": 17967473660160.0, + "grad_norm": 1.8614249809437893, + "language_loss": 0.71973354, + "learning_rate": 2.2027553500260847e-06, + "loss": 0.7412324, + "num_input_tokens_seen": 172940700, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8044, + "time_per_iteration": 2.4688329696655273 + }, + { + "auxiliary_loss_clip": 0.01113296, + "auxiliary_loss_mlp": 0.01032041, + "balance_loss_clip": 1.01702118, + "balance_loss_mlp": 1.04181921, + "epoch": 0.48369156771381333, + "flos": 20593513077120.0, + "grad_norm": 1.358705165779184, + "language_loss": 0.75938857, + "learning_rate": 2.202367891004714e-06, + "loss": 0.78084195, + "num_input_tokens_seen": 172961125, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.71484375, + "step": 8045, + "time_per_iteration": 2.455991506576538 + }, + { + "auxiliary_loss_clip": 0.01117445, + "auxiliary_loss_mlp": 0.0104056, + "balance_loss_clip": 1.02640939, + "balance_loss_mlp": 1.04251719, + "epoch": 0.4837516909664813, + "flos": 22675075159680.0, + "grad_norm": 1.8505124624812508, + "language_loss": 0.69661564, + "learning_rate": 2.201980424309533e-06, + "loss": 0.71819568, + "num_input_tokens_seen": 172980405, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.75, + "step": 8046, + "time_per_iteration": 2.480437994003296 + }, + { + "auxiliary_loss_clip": 0.01113741, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.04073739, + "epoch": 0.48381181421914926, + "flos": 25518625384320.0, + "grad_norm": 3.209923694390607, + "language_loss": 0.819103, + "learning_rate": 2.2015929499552337e-06, + "loss": 0.84060085, + "num_input_tokens_seen": 172999105, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8047, + "time_per_iteration": 2.4875996112823486 + }, + { + "auxiliary_loss_clip": 0.01111465, + "auxiliary_loss_mlp": 0.01031694, + "balance_loss_clip": 1.01802719, + "balance_loss_mlp": 1.04047942, + "epoch": 0.4838719374718172, + "flos": 24207491139840.0, + "grad_norm": 1.602624612336977, + "language_loss": 0.80215144, + "learning_rate": 2.2012054679565092e-06, + "loss": 0.82358307, + "num_input_tokens_seen": 173019935, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7109375, + "step": 8048, + "time_per_iteration": 2.5097532272338867 + }, + { + "auxiliary_loss_clip": 0.0111735, + "auxiliary_loss_mlp": 0.01036589, + "balance_loss_clip": 1.02204585, + "balance_loss_mlp": 1.0415504, + "epoch": 0.4839320607244852, + "flos": 26724577628160.0, + "grad_norm": 1.5504815305200743, + "language_loss": 0.81360143, + "learning_rate": 2.200817978328054e-06, + "loss": 0.83514082, + "num_input_tokens_seen": 173039700, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8049, + "time_per_iteration": 2.5025296211242676 + }, + { + "auxiliary_loss_clip": 0.01111119, + "auxiliary_loss_mlp": 0.01034433, + "balance_loss_clip": 1.02170801, + "balance_loss_mlp": 1.04200411, + "epoch": 0.48399218397715316, + "flos": 20448900921600.0, + "grad_norm": 1.7765572151997517, + "language_loss": 0.72636938, + "learning_rate": 2.2004304810845602e-06, + "loss": 0.74782485, + "num_input_tokens_seen": 173059170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8050, + "time_per_iteration": 2.4983279705047607 + }, + { + "auxiliary_loss_clip": 0.01039152, + "auxiliary_loss_mlp": 0.01005399, + "balance_loss_clip": 1.00414741, + "balance_loss_mlp": 1.01505625, + "epoch": 0.4840523072298211, + "flos": 67180570185600.0, + "grad_norm": 0.7015070380534334, + "language_loss": 0.56459856, + "learning_rate": 2.200042976240723e-06, + "loss": 0.58504415, + "num_input_tokens_seen": 173119000, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.24121094, + "step": 8051, + "time_per_iteration": 3.1124837398529053 + }, + { + "auxiliary_loss_clip": 0.01116144, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.0198456, + "balance_loss_mlp": 1.04258502, + "epoch": 0.4841124304824891, + "flos": 22411490181120.0, + "grad_norm": 2.416646260203107, + "language_loss": 0.7510823, + "learning_rate": 2.199655463811236e-06, + "loss": 0.77258313, + "num_input_tokens_seen": 173137570, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8052, + "time_per_iteration": 3.970653772354126 + }, + { + "auxiliary_loss_clip": 0.01114314, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.01953709, + "balance_loss_mlp": 1.04124272, + "epoch": 0.48417255373515705, + "flos": 13843959217920.0, + "grad_norm": 3.0848333967382855, + "language_loss": 0.65859687, + "learning_rate": 2.1992679438107936e-06, + "loss": 0.68007052, + "num_input_tokens_seen": 173154355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8053, + "time_per_iteration": 2.489314079284668 + }, + { + "auxiliary_loss_clip": 0.01108306, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.01981306, + "balance_loss_mlp": 1.03776336, + "epoch": 0.484232676987825, + "flos": 31649689935360.0, + "grad_norm": 1.8753990029707186, + "language_loss": 0.6933912, + "learning_rate": 2.198880416254091e-06, + "loss": 0.71480489, + "num_input_tokens_seen": 173174845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8054, + "time_per_iteration": 4.118170976638794 + }, + { + "auxiliary_loss_clip": 0.01110556, + "auxiliary_loss_mlp": 0.01035408, + "balance_loss_clip": 1.02187181, + "balance_loss_mlp": 1.03860784, + "epoch": 0.48429280024049304, + "flos": 24095377814400.0, + "grad_norm": 1.7081803235265158, + "language_loss": 0.69577026, + "learning_rate": 2.1984928811558233e-06, + "loss": 0.7172299, + "num_input_tokens_seen": 173195025, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8055, + "time_per_iteration": 3.932403326034546 + }, + { + "auxiliary_loss_clip": 0.01115214, + "auxiliary_loss_mlp": 0.01036587, + "balance_loss_clip": 1.0229013, + "balance_loss_mlp": 1.04260492, + "epoch": 0.484352923493161, + "flos": 17530081747200.0, + "grad_norm": 2.9345474086324397, + "language_loss": 0.631603, + "learning_rate": 2.198105338530685e-06, + "loss": 0.65312105, + "num_input_tokens_seen": 173213065, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8056, + "time_per_iteration": 2.4628608226776123 + }, + { + "auxiliary_loss_clip": 0.01110953, + "auxiliary_loss_mlp": 0.0103397, + "balance_loss_clip": 1.01945043, + "balance_loss_mlp": 1.03856075, + "epoch": 0.48441304674582897, + "flos": 29166862043520.0, + "grad_norm": 1.6727278675155979, + "language_loss": 0.67380416, + "learning_rate": 2.1977177883933726e-06, + "loss": 0.69525343, + "num_input_tokens_seen": 173234545, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 8057, + "time_per_iteration": 2.5488758087158203 + }, + { + "auxiliary_loss_clip": 0.0111047, + "auxiliary_loss_mlp": 0.01036278, + "balance_loss_clip": 1.02286661, + "balance_loss_mlp": 1.03944063, + "epoch": 0.48447316999849693, + "flos": 15886701676800.0, + "grad_norm": 1.62294394814829, + "language_loss": 0.81633735, + "learning_rate": 2.1973302307585827e-06, + "loss": 0.83780485, + "num_input_tokens_seen": 173252175, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8058, + "time_per_iteration": 2.4864389896392822 + }, + { + "auxiliary_loss_clip": 0.01116596, + "auxiliary_loss_mlp": 0.01038397, + "balance_loss_clip": 1.02458692, + "balance_loss_mlp": 1.04142284, + "epoch": 0.4845332932511649, + "flos": 24381405815040.0, + "grad_norm": 1.5675258134335472, + "language_loss": 0.79917222, + "learning_rate": 2.1969426656410097e-06, + "loss": 0.82072222, + "num_input_tokens_seen": 173268790, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8059, + "time_per_iteration": 2.4964730739593506 + }, + { + "auxiliary_loss_clip": 0.01117834, + "auxiliary_loss_mlp": 0.0104156, + "balance_loss_clip": 1.02709424, + "balance_loss_mlp": 1.04217446, + "epoch": 0.48459341650383286, + "flos": 37116478316160.0, + "grad_norm": 2.4233986338774347, + "language_loss": 0.66882968, + "learning_rate": 2.196555093055352e-06, + "loss": 0.69042355, + "num_input_tokens_seen": 173288030, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7578125, + "step": 8060, + "time_per_iteration": 2.6209259033203125 + }, + { + "auxiliary_loss_clip": 0.01116591, + "auxiliary_loss_mlp": 0.01040178, + "balance_loss_clip": 1.02654088, + "balance_loss_mlp": 1.04357326, + "epoch": 0.48465353975650083, + "flos": 22966777509120.0, + "grad_norm": 1.8494683744964096, + "language_loss": 0.67328548, + "learning_rate": 2.1961675130163046e-06, + "loss": 0.69485319, + "num_input_tokens_seen": 173305965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8061, + "time_per_iteration": 2.460986614227295 + }, + { + "auxiliary_loss_clip": 0.01116735, + "auxiliary_loss_mlp": 0.0104191, + "balance_loss_clip": 1.0274322, + "balance_loss_mlp": 1.04356933, + "epoch": 0.4847136630091688, + "flos": 17707695523200.0, + "grad_norm": 2.133282380017761, + "language_loss": 0.82559311, + "learning_rate": 2.1957799255385653e-06, + "loss": 0.84717953, + "num_input_tokens_seen": 173321985, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73046875, + "step": 8062, + "time_per_iteration": 2.453993320465088 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035491, + "balance_loss_clip": 1.022277, + "balance_loss_mlp": 1.04087675, + "epoch": 0.48477378626183676, + "flos": 22018269018240.0, + "grad_norm": 1.7643008090816974, + "language_loss": 0.7443378, + "learning_rate": 2.1953923306368325e-06, + "loss": 0.76581317, + "num_input_tokens_seen": 173341315, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8063, + "time_per_iteration": 2.4603588581085205 + }, + { + "auxiliary_loss_clip": 0.01113086, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.01978183, + "balance_loss_mlp": 1.04069591, + "epoch": 0.4848339095145047, + "flos": 27962956874880.0, + "grad_norm": 1.6491790763512546, + "language_loss": 0.78826106, + "learning_rate": 2.1950047283258023e-06, + "loss": 0.80972517, + "num_input_tokens_seen": 173361055, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8064, + "time_per_iteration": 2.5214664936065674 + }, + { + "auxiliary_loss_clip": 0.01110182, + "auxiliary_loss_mlp": 0.01036452, + "balance_loss_clip": 1.02426863, + "balance_loss_mlp": 1.04178667, + "epoch": 0.4848940327671727, + "flos": 21688752625920.0, + "grad_norm": 1.866783501124255, + "language_loss": 0.79383814, + "learning_rate": 2.194617118620173e-06, + "loss": 0.81530446, + "num_input_tokens_seen": 173379255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8065, + "time_per_iteration": 2.445235013961792 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02112269, + "balance_loss_mlp": 1.03714252, + "epoch": 0.48495415601984065, + "flos": 20631578515200.0, + "grad_norm": 2.505071872189949, + "language_loss": 0.76120496, + "learning_rate": 2.194229501534644e-06, + "loss": 0.78258789, + "num_input_tokens_seen": 173398370, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 8066, + "time_per_iteration": 2.484790325164795 + }, + { + "auxiliary_loss_clip": 0.01111648, + "auxiliary_loss_mlp": 0.01033332, + "balance_loss_clip": 1.02022457, + "balance_loss_mlp": 1.04121971, + "epoch": 0.4850142792725086, + "flos": 25628152930560.0, + "grad_norm": 1.8377201756800503, + "language_loss": 0.7205655, + "learning_rate": 2.193841877083912e-06, + "loss": 0.74201524, + "num_input_tokens_seen": 173419595, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8067, + "time_per_iteration": 2.4876203536987305 + }, + { + "auxiliary_loss_clip": 0.01112073, + "auxiliary_loss_mlp": 0.0103587, + "balance_loss_clip": 1.02231634, + "balance_loss_mlp": 1.04024172, + "epoch": 0.4850744025251766, + "flos": 13771958405760.0, + "grad_norm": 2.0010459311949393, + "language_loss": 0.79434109, + "learning_rate": 2.1934542452826767e-06, + "loss": 0.81582052, + "num_input_tokens_seen": 173435390, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8068, + "time_per_iteration": 2.4537808895111084 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02171147, + "balance_loss_mlp": 1.0385673, + "epoch": 0.4851345257778446, + "flos": 20261339078400.0, + "grad_norm": 1.4177927500996443, + "language_loss": 0.8413924, + "learning_rate": 2.193066606145638e-06, + "loss": 0.86282146, + "num_input_tokens_seen": 173454095, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8069, + "time_per_iteration": 2.4553275108337402 + }, + { + "auxiliary_loss_clip": 0.0110935, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02042496, + "balance_loss_mlp": 1.03913558, + "epoch": 0.48519464903051257, + "flos": 27089681420160.0, + "grad_norm": 1.6522403411207847, + "language_loss": 0.77863526, + "learning_rate": 2.192678959687493e-06, + "loss": 0.8000586, + "num_input_tokens_seen": 173475300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8070, + "time_per_iteration": 2.5032036304473877 + }, + { + "auxiliary_loss_clip": 0.01110754, + "auxiliary_loss_mlp": 0.01033168, + "balance_loss_clip": 1.01985812, + "balance_loss_mlp": 1.0400399, + "epoch": 0.48525477228318054, + "flos": 17127235739520.0, + "grad_norm": 2.1929202067055993, + "language_loss": 0.78031409, + "learning_rate": 2.192291305922943e-06, + "loss": 0.80175334, + "num_input_tokens_seen": 173492005, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8071, + "time_per_iteration": 2.4315407276153564 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.01822925, + "balance_loss_mlp": 1.03733289, + "epoch": 0.4853148955358485, + "flos": 28180324028160.0, + "grad_norm": 1.7778798626181176, + "language_loss": 0.72204757, + "learning_rate": 2.1919036448666873e-06, + "loss": 0.74345779, + "num_input_tokens_seen": 173511995, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71875, + "step": 8072, + "time_per_iteration": 2.510474920272827 + }, + { + "auxiliary_loss_clip": 0.01116993, + "auxiliary_loss_mlp": 0.01039835, + "balance_loss_clip": 1.02580357, + "balance_loss_mlp": 1.04254019, + "epoch": 0.48537501878851647, + "flos": 17493309198720.0, + "grad_norm": 2.999761551965867, + "language_loss": 0.8779549, + "learning_rate": 2.1915159765334262e-06, + "loss": 0.89952314, + "num_input_tokens_seen": 173530215, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8073, + "time_per_iteration": 2.4295654296875 + }, + { + "auxiliary_loss_clip": 0.01106811, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.01805508, + "balance_loss_mlp": 1.03857493, + "epoch": 0.48543514204118443, + "flos": 28584857975040.0, + "grad_norm": 1.702758380167849, + "language_loss": 0.60793108, + "learning_rate": 2.19112830093786e-06, + "loss": 0.62931222, + "num_input_tokens_seen": 173550920, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8074, + "time_per_iteration": 2.641831636428833 + }, + { + "auxiliary_loss_clip": 0.01112393, + "auxiliary_loss_mlp": 0.01039275, + "balance_loss_clip": 1.02540481, + "balance_loss_mlp": 1.03871894, + "epoch": 0.4854952652938524, + "flos": 20959981585920.0, + "grad_norm": 1.6649133015556126, + "language_loss": 0.73151296, + "learning_rate": 2.19074061809469e-06, + "loss": 0.75302958, + "num_input_tokens_seen": 173569065, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8075, + "time_per_iteration": 2.4624290466308594 + }, + { + "auxiliary_loss_clip": 0.01108632, + "auxiliary_loss_mlp": 0.01035606, + "balance_loss_clip": 1.02328563, + "balance_loss_mlp": 1.04028702, + "epoch": 0.48555538854652036, + "flos": 66529543155840.0, + "grad_norm": 1.6285965401893183, + "language_loss": 0.82012558, + "learning_rate": 2.1903529280186163e-06, + "loss": 0.84156799, + "num_input_tokens_seen": 173596085, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 8076, + "time_per_iteration": 2.902468681335449 + }, + { + "auxiliary_loss_clip": 0.01112144, + "auxiliary_loss_mlp": 0.01033517, + "balance_loss_clip": 1.01899099, + "balance_loss_mlp": 1.0407958, + "epoch": 0.4856155117991883, + "flos": 15924982596480.0, + "grad_norm": 1.793912725367087, + "language_loss": 0.86204815, + "learning_rate": 2.1899652307243407e-06, + "loss": 0.88350475, + "num_input_tokens_seen": 173613900, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7109375, + "step": 8077, + "time_per_iteration": 2.4470572471618652 + }, + { + "auxiliary_loss_clip": 0.01035955, + "auxiliary_loss_mlp": 0.01003512, + "balance_loss_clip": 1.00206935, + "balance_loss_mlp": 1.01168394, + "epoch": 0.4856756350518563, + "flos": 71047395060480.0, + "grad_norm": 0.9017192941717106, + "language_loss": 0.58489066, + "learning_rate": 2.189577526226564e-06, + "loss": 0.60528529, + "num_input_tokens_seen": 173671305, + "router_z_loss_clip": 0.0144043, + "router_z_loss_mlp": 0.24316406, + "step": 8078, + "time_per_iteration": 3.061302661895752 + }, + { + "auxiliary_loss_clip": 0.01115187, + "auxiliary_loss_mlp": 0.01030587, + "balance_loss_clip": 1.01750946, + "balance_loss_mlp": 1.04146993, + "epoch": 0.48573575830452426, + "flos": 29825679346560.0, + "grad_norm": 1.8290534457206422, + "language_loss": 0.72197151, + "learning_rate": 2.1891898145399884e-06, + "loss": 0.7434293, + "num_input_tokens_seen": 173692070, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8079, + "time_per_iteration": 2.545018434524536 + }, + { + "auxiliary_loss_clip": 0.0111477, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01643038, + "balance_loss_mlp": 1.04235518, + "epoch": 0.4857958815571922, + "flos": 17639501552640.0, + "grad_norm": 2.180592453343409, + "language_loss": 0.79515052, + "learning_rate": 2.1888020956793172e-06, + "loss": 0.81659681, + "num_input_tokens_seen": 173709785, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8080, + "time_per_iteration": 2.4793026447296143 + }, + { + "auxiliary_loss_clip": 0.01111199, + "auxiliary_loss_mlp": 0.01030093, + "balance_loss_clip": 1.01659858, + "balance_loss_mlp": 1.03938115, + "epoch": 0.4858560048098602, + "flos": 21105491581440.0, + "grad_norm": 2.102088815710231, + "language_loss": 0.83866465, + "learning_rate": 2.188414369659251e-06, + "loss": 0.86007756, + "num_input_tokens_seen": 173728770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8081, + "time_per_iteration": 2.4615542888641357 + }, + { + "auxiliary_loss_clip": 0.0110941, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01766098, + "balance_loss_mlp": 1.03858256, + "epoch": 0.4859161280625282, + "flos": 22090844448000.0, + "grad_norm": 1.4514708090647532, + "language_loss": 0.83281112, + "learning_rate": 2.1880266364944924e-06, + "loss": 0.85422719, + "num_input_tokens_seen": 173747355, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.70703125, + "step": 8082, + "time_per_iteration": 2.506359100341797 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01930749, + "balance_loss_mlp": 1.04239488, + "epoch": 0.4859762513151962, + "flos": 17493452853120.0, + "grad_norm": 2.0513098734750153, + "language_loss": 0.87210095, + "learning_rate": 2.187638896199746e-06, + "loss": 0.89353603, + "num_input_tokens_seen": 173764825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8083, + "time_per_iteration": 2.4269142150878906 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01038876, + "balance_loss_clip": 1.0264957, + "balance_loss_mlp": 1.03958535, + "epoch": 0.48603637456786414, + "flos": 18004246208640.0, + "grad_norm": 1.6599209376706838, + "language_loss": 0.8107174, + "learning_rate": 2.1872511487897126e-06, + "loss": 0.83220273, + "num_input_tokens_seen": 173783215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 8084, + "time_per_iteration": 2.451949119567871 + }, + { + "auxiliary_loss_clip": 0.01112614, + "auxiliary_loss_mlp": 0.01035292, + "balance_loss_clip": 1.02148795, + "balance_loss_mlp": 1.04034543, + "epoch": 0.4860964978205321, + "flos": 22492038430080.0, + "grad_norm": 2.346430029405153, + "language_loss": 0.68347323, + "learning_rate": 2.186863394279098e-06, + "loss": 0.70495236, + "num_input_tokens_seen": 173801905, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8085, + "time_per_iteration": 2.499215841293335 + }, + { + "auxiliary_loss_clip": 0.0111142, + "auxiliary_loss_mlp": 0.01040793, + "balance_loss_clip": 1.0276444, + "balance_loss_mlp": 1.04064536, + "epoch": 0.48615662107320007, + "flos": 23372532518400.0, + "grad_norm": 1.46412171762657, + "language_loss": 0.77375883, + "learning_rate": 2.1864756326826046e-06, + "loss": 0.79528093, + "num_input_tokens_seen": 173824690, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8086, + "time_per_iteration": 2.541616678237915 + }, + { + "auxiliary_loss_clip": 0.01111956, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.01655173, + "balance_loss_mlp": 1.04059958, + "epoch": 0.48621674432586803, + "flos": 34418833136640.0, + "grad_norm": 1.9494281519542558, + "language_loss": 0.69733107, + "learning_rate": 2.1860878640149355e-06, + "loss": 0.71874988, + "num_input_tokens_seen": 173844450, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8087, + "time_per_iteration": 2.5694613456726074 + }, + { + "auxiliary_loss_clip": 0.01115057, + "auxiliary_loss_mlp": 0.01037115, + "balance_loss_clip": 1.02278614, + "balance_loss_mlp": 1.03913963, + "epoch": 0.486276867578536, + "flos": 33107555237760.0, + "grad_norm": 1.610275852133116, + "language_loss": 0.72411895, + "learning_rate": 2.1857000882907974e-06, + "loss": 0.7456407, + "num_input_tokens_seen": 173864975, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8088, + "time_per_iteration": 2.5770511627197266 + }, + { + "auxiliary_loss_clip": 0.01111259, + "auxiliary_loss_mlp": 0.01037547, + "balance_loss_clip": 1.02443993, + "balance_loss_mlp": 1.04033983, + "epoch": 0.48633699083120396, + "flos": 21470703114240.0, + "grad_norm": 1.6468852838011347, + "language_loss": 0.7557345, + "learning_rate": 2.185312305524892e-06, + "loss": 0.77722251, + "num_input_tokens_seen": 173883805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8089, + "time_per_iteration": 2.4625489711761475 + }, + { + "auxiliary_loss_clip": 0.01114004, + "auxiliary_loss_mlp": 0.01030212, + "balance_loss_clip": 1.0165205, + "balance_loss_mlp": 1.04078937, + "epoch": 0.48639711408387193, + "flos": 20084335833600.0, + "grad_norm": 1.5811587339913937, + "language_loss": 0.83939755, + "learning_rate": 2.184924515731926e-06, + "loss": 0.86083972, + "num_input_tokens_seen": 173903520, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8090, + "time_per_iteration": 2.500293731689453 + }, + { + "auxiliary_loss_clip": 0.01107626, + "auxiliary_loss_mlp": 0.010336, + "balance_loss_clip": 1.02016521, + "balance_loss_mlp": 1.03945088, + "epoch": 0.4864572373365399, + "flos": 20778884190720.0, + "grad_norm": 1.6075799019512609, + "language_loss": 0.76256877, + "learning_rate": 2.1845367189266045e-06, + "loss": 0.78398097, + "num_input_tokens_seen": 173924255, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 8091, + "time_per_iteration": 2.465998411178589 + }, + { + "auxiliary_loss_clip": 0.01110716, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.01714182, + "balance_loss_mlp": 1.03904068, + "epoch": 0.48651736058920786, + "flos": 26025360503040.0, + "grad_norm": 1.4690121920213544, + "language_loss": 0.80391169, + "learning_rate": 2.184148915123631e-06, + "loss": 0.82532316, + "num_input_tokens_seen": 173943285, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8092, + "time_per_iteration": 2.509016513824463 + }, + { + "auxiliary_loss_clip": 0.01113066, + "auxiliary_loss_mlp": 0.01030079, + "balance_loss_clip": 1.01679361, + "balance_loss_mlp": 1.040061, + "epoch": 0.4865774838418758, + "flos": 20485601642880.0, + "grad_norm": 1.4222056252501818, + "language_loss": 0.71696734, + "learning_rate": 2.1837611043377126e-06, + "loss": 0.73839879, + "num_input_tokens_seen": 173962205, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8093, + "time_per_iteration": 2.47951078414917 + }, + { + "auxiliary_loss_clip": 0.01109125, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03917289, + "epoch": 0.4866376070945438, + "flos": 23547704169600.0, + "grad_norm": 1.5524869827771763, + "language_loss": 0.67529863, + "learning_rate": 2.1833732865835545e-06, + "loss": 0.69671166, + "num_input_tokens_seen": 173980945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8094, + "time_per_iteration": 3.9874253273010254 + }, + { + "auxiliary_loss_clip": 0.01116894, + "auxiliary_loss_mlp": 0.01033116, + "balance_loss_clip": 1.01933527, + "balance_loss_mlp": 1.04218793, + "epoch": 0.4866977303472118, + "flos": 16690598012160.0, + "grad_norm": 1.8480915023468016, + "language_loss": 0.66936231, + "learning_rate": 2.1829854618758636e-06, + "loss": 0.69086242, + "num_input_tokens_seen": 173998860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8095, + "time_per_iteration": 2.477593183517456 + }, + { + "auxiliary_loss_clip": 0.01112855, + "auxiliary_loss_mlp": 0.01032969, + "balance_loss_clip": 1.01847899, + "balance_loss_mlp": 1.04048705, + "epoch": 0.4867578535998798, + "flos": 17896011552000.0, + "grad_norm": 2.265808316415622, + "language_loss": 0.78996563, + "learning_rate": 2.182597630229345e-06, + "loss": 0.8114239, + "num_input_tokens_seen": 174016665, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.72265625, + "step": 8096, + "time_per_iteration": 5.404834985733032 + }, + { + "auxiliary_loss_clip": 0.01107949, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01872253, + "balance_loss_mlp": 1.03737998, + "epoch": 0.48681797685254774, + "flos": 22637799820800.0, + "grad_norm": 1.7396987354687747, + "language_loss": 0.67313123, + "learning_rate": 2.1822097916587067e-06, + "loss": 0.69453126, + "num_input_tokens_seen": 174034800, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8097, + "time_per_iteration": 2.450967788696289 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033813, + "balance_loss_clip": 1.02071154, + "balance_loss_mlp": 1.03922939, + "epoch": 0.4868781001052157, + "flos": 20886077352960.0, + "grad_norm": 1.4534902730904964, + "language_loss": 0.71347374, + "learning_rate": 2.1818219461786543e-06, + "loss": 0.73490155, + "num_input_tokens_seen": 174054445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8098, + "time_per_iteration": 2.4994144439697266 + }, + { + "auxiliary_loss_clip": 0.01116904, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02274871, + "balance_loss_mlp": 1.04109979, + "epoch": 0.48693822335788367, + "flos": 41974940937600.0, + "grad_norm": 1.7962943745015671, + "language_loss": 0.66037756, + "learning_rate": 2.1814340938038956e-06, + "loss": 0.68191803, + "num_input_tokens_seen": 174077890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7578125, + "step": 8099, + "time_per_iteration": 2.624321222305298 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03698707, + "epoch": 0.48699834661055164, + "flos": 24243294021120.0, + "grad_norm": 1.6079322443898665, + "language_loss": 0.66464651, + "learning_rate": 2.181046234549138e-06, + "loss": 0.68605012, + "num_input_tokens_seen": 174097460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8100, + "time_per_iteration": 2.52364182472229 + }, + { + "auxiliary_loss_clip": 0.01108299, + "auxiliary_loss_mlp": 0.01030722, + "balance_loss_clip": 1.01802635, + "balance_loss_mlp": 1.03990841, + "epoch": 0.4870584698632196, + "flos": 25923877603200.0, + "grad_norm": 1.3375285332360751, + "language_loss": 0.76606798, + "learning_rate": 2.180658368429088e-06, + "loss": 0.78745818, + "num_input_tokens_seen": 174120775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 8101, + "time_per_iteration": 2.5515174865722656 + }, + { + "auxiliary_loss_clip": 0.01037344, + "auxiliary_loss_mlp": 0.01004126, + "balance_loss_clip": 1.00279069, + "balance_loss_mlp": 1.01343942, + "epoch": 0.48711859311588757, + "flos": 70211933648640.0, + "grad_norm": 0.6857117323737989, + "language_loss": 0.52317238, + "learning_rate": 2.1802704954584565e-06, + "loss": 0.54358709, + "num_input_tokens_seen": 174189135, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.23925781, + "step": 8102, + "time_per_iteration": 3.2370035648345947 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02098215, + "balance_loss_mlp": 1.03864419, + "epoch": 0.48717871636855553, + "flos": 12342964659840.0, + "grad_norm": 2.066543814817077, + "language_loss": 0.73703957, + "learning_rate": 2.1798826156519484e-06, + "loss": 0.75847828, + "num_input_tokens_seen": 174203250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8103, + "time_per_iteration": 2.401146650314331 + }, + { + "auxiliary_loss_clip": 0.01113681, + "auxiliary_loss_mlp": 0.01042266, + "balance_loss_clip": 1.02845609, + "balance_loss_mlp": 1.04083562, + "epoch": 0.4872388396212235, + "flos": 23477139901440.0, + "grad_norm": 2.0729106414348686, + "language_loss": 0.62816393, + "learning_rate": 2.1794947290242737e-06, + "loss": 0.64972341, + "num_input_tokens_seen": 174224145, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8104, + "time_per_iteration": 2.489887237548828 + }, + { + "auxiliary_loss_clip": 0.01111014, + "auxiliary_loss_mlp": 0.01029613, + "balance_loss_clip": 1.01661348, + "balance_loss_mlp": 1.04093325, + "epoch": 0.48729896287389146, + "flos": 31427582186880.0, + "grad_norm": 2.098514623938467, + "language_loss": 0.68962336, + "learning_rate": 2.1791068355901413e-06, + "loss": 0.71102965, + "num_input_tokens_seen": 174244435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8105, + "time_per_iteration": 2.521994113922119 + }, + { + "auxiliary_loss_clip": 0.01106075, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01682925, + "balance_loss_mlp": 1.0371716, + "epoch": 0.4873590861265594, + "flos": 19057936700160.0, + "grad_norm": 1.8440715600711883, + "language_loss": 0.73333305, + "learning_rate": 2.178718935364259e-06, + "loss": 0.75468934, + "num_input_tokens_seen": 174262710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8106, + "time_per_iteration": 2.471409797668457 + }, + { + "auxiliary_loss_clip": 0.01116936, + "auxiliary_loss_mlp": 0.01033734, + "balance_loss_clip": 1.01994157, + "balance_loss_mlp": 1.04300117, + "epoch": 0.4874192093792274, + "flos": 24348296453760.0, + "grad_norm": 1.861183691551934, + "language_loss": 0.77122629, + "learning_rate": 2.1783310283613373e-06, + "loss": 0.79273301, + "num_input_tokens_seen": 174281545, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73828125, + "step": 8107, + "time_per_iteration": 2.4802913665771484 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01027736, + "balance_loss_clip": 1.01563621, + "balance_loss_mlp": 1.04061639, + "epoch": 0.4874793326318954, + "flos": 23112610727040.0, + "grad_norm": 1.543990493512169, + "language_loss": 0.75148052, + "learning_rate": 2.1779431145960853e-06, + "loss": 0.77284884, + "num_input_tokens_seen": 174300290, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8108, + "time_per_iteration": 2.4680538177490234 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01803327, + "balance_loss_mlp": 1.04023099, + "epoch": 0.4875394558845634, + "flos": 19026156142080.0, + "grad_norm": 1.75674444511609, + "language_loss": 0.73340857, + "learning_rate": 2.177555194083212e-06, + "loss": 0.75479364, + "num_input_tokens_seen": 174318490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8109, + "time_per_iteration": 2.4528889656066895 + }, + { + "auxiliary_loss_clip": 0.01108152, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0175966, + "balance_loss_mlp": 1.0391928, + "epoch": 0.48759957913723134, + "flos": 21433607343360.0, + "grad_norm": 1.7970671112238439, + "language_loss": 0.78590822, + "learning_rate": 2.177167266837428e-06, + "loss": 0.80729276, + "num_input_tokens_seen": 174335505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 8110, + "time_per_iteration": 2.4653971195220947 + }, + { + "auxiliary_loss_clip": 0.0111191, + "auxiliary_loss_mlp": 0.01040228, + "balance_loss_clip": 1.02730024, + "balance_loss_mlp": 1.04083896, + "epoch": 0.4876597023898993, + "flos": 17748669962880.0, + "grad_norm": 1.8027530171186463, + "language_loss": 0.72216076, + "learning_rate": 2.176779332873444e-06, + "loss": 0.74368215, + "num_input_tokens_seen": 174353990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8111, + "time_per_iteration": 2.4242806434631348 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01034699, + "balance_loss_clip": 1.02137125, + "balance_loss_mlp": 1.04143023, + "epoch": 0.4877198256425673, + "flos": 17019647527680.0, + "grad_norm": 1.5451794032223725, + "language_loss": 0.75719351, + "learning_rate": 2.17639139220597e-06, + "loss": 0.77864289, + "num_input_tokens_seen": 174373425, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 8112, + "time_per_iteration": 2.4681711196899414 + }, + { + "auxiliary_loss_clip": 0.0111572, + "auxiliary_loss_mlp": 0.01036111, + "balance_loss_clip": 1.0223484, + "balance_loss_mlp": 1.04125154, + "epoch": 0.48777994889523524, + "flos": 22384091082240.0, + "grad_norm": 1.5422638957013077, + "language_loss": 0.75012642, + "learning_rate": 2.1760034448497166e-06, + "loss": 0.77164471, + "num_input_tokens_seen": 174393070, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.74609375, + "step": 8113, + "time_per_iteration": 2.458070993423462 + }, + { + "auxiliary_loss_clip": 0.0103493, + "auxiliary_loss_mlp": 0.00999333, + "balance_loss_clip": 0.99799174, + "balance_loss_mlp": 1.01145339, + "epoch": 0.4878400721479032, + "flos": 61241772159360.0, + "grad_norm": 0.779968435998717, + "language_loss": 0.48876739, + "learning_rate": 2.1756154908193943e-06, + "loss": 0.50911003, + "num_input_tokens_seen": 174446880, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.23535156, + "step": 8114, + "time_per_iteration": 2.964735507965088 + }, + { + "auxiliary_loss_clip": 0.01112827, + "auxiliary_loss_mlp": 0.01041502, + "balance_loss_clip": 1.02769804, + "balance_loss_mlp": 1.04015875, + "epoch": 0.48790019540057117, + "flos": 24536612482560.0, + "grad_norm": 1.346675786458265, + "language_loss": 0.76713175, + "learning_rate": 2.1752275301297155e-06, + "loss": 0.78867507, + "num_input_tokens_seen": 174468485, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8115, + "time_per_iteration": 2.5008208751678467 + }, + { + "auxiliary_loss_clip": 0.01116462, + "auxiliary_loss_mlp": 0.01036306, + "balance_loss_clip": 1.02220368, + "balance_loss_mlp": 1.0430454, + "epoch": 0.48796031865323913, + "flos": 21833939399040.0, + "grad_norm": 2.9741706409780697, + "language_loss": 0.72150338, + "learning_rate": 2.1748395627953915e-06, + "loss": 0.74303102, + "num_input_tokens_seen": 174486360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.734375, + "step": 8116, + "time_per_iteration": 2.471170425415039 + }, + { + "auxiliary_loss_clip": 0.01108955, + "auxiliary_loss_mlp": 0.01038046, + "balance_loss_clip": 1.02506459, + "balance_loss_mlp": 1.03951752, + "epoch": 0.4880204419059071, + "flos": 18588907883520.0, + "grad_norm": 1.626628974836948, + "language_loss": 0.63457322, + "learning_rate": 2.1744515888311335e-06, + "loss": 0.65604323, + "num_input_tokens_seen": 174505075, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 8117, + "time_per_iteration": 2.4408295154571533 + }, + { + "auxiliary_loss_clip": 0.01106242, + "auxiliary_loss_mlp": 0.01033541, + "balance_loss_clip": 1.02082098, + "balance_loss_mlp": 1.03648984, + "epoch": 0.48808056515857506, + "flos": 19172168928000.0, + "grad_norm": 1.7937040821955612, + "language_loss": 0.79223609, + "learning_rate": 2.1740636082516533e-06, + "loss": 0.81363392, + "num_input_tokens_seen": 174523385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8118, + "time_per_iteration": 2.4724843502044678 + }, + { + "auxiliary_loss_clip": 0.01111434, + "auxiliary_loss_mlp": 0.01037003, + "balance_loss_clip": 1.02359247, + "balance_loss_mlp": 1.03926289, + "epoch": 0.48814068841124303, + "flos": 20120497850880.0, + "grad_norm": 2.8027989615224427, + "language_loss": 0.63472134, + "learning_rate": 2.1736756210716645e-06, + "loss": 0.65620571, + "num_input_tokens_seen": 174542200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8119, + "time_per_iteration": 2.478968381881714 + }, + { + "auxiliary_loss_clip": 0.01111182, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.02006578, + "balance_loss_mlp": 1.04054463, + "epoch": 0.488200811663911, + "flos": 22965592360320.0, + "grad_norm": 1.9034604660173908, + "language_loss": 0.72397757, + "learning_rate": 2.173287627305878e-06, + "loss": 0.74541688, + "num_input_tokens_seen": 174563620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8120, + "time_per_iteration": 2.5204596519470215 + }, + { + "auxiliary_loss_clip": 0.01109957, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02122211, + "balance_loss_mlp": 1.03855026, + "epoch": 0.48826093491657896, + "flos": 33910697387520.0, + "grad_norm": 1.5930525886491658, + "language_loss": 0.63636339, + "learning_rate": 2.1728996269690075e-06, + "loss": 0.65780938, + "num_input_tokens_seen": 174586465, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 8121, + "time_per_iteration": 2.5647690296173096 + }, + { + "auxiliary_loss_clip": 0.01113983, + "auxiliary_loss_mlp": 0.01038884, + "balance_loss_clip": 1.02521062, + "balance_loss_mlp": 1.04131413, + "epoch": 0.488321058169247, + "flos": 23070307484160.0, + "grad_norm": 1.870740841609923, + "language_loss": 0.82433021, + "learning_rate": 2.1725116200757664e-06, + "loss": 0.84585893, + "num_input_tokens_seen": 174604035, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8122, + "time_per_iteration": 2.4753966331481934 + }, + { + "auxiliary_loss_clip": 0.01113704, + "auxiliary_loss_mlp": 0.01034348, + "balance_loss_clip": 1.02019167, + "balance_loss_mlp": 1.04063094, + "epoch": 0.48838118142191494, + "flos": 19317714837120.0, + "grad_norm": 2.206764356510625, + "language_loss": 0.85308874, + "learning_rate": 2.172123606640866e-06, + "loss": 0.8745693, + "num_input_tokens_seen": 174621715, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 8123, + "time_per_iteration": 2.5124545097351074 + }, + { + "auxiliary_loss_clip": 0.01111875, + "auxiliary_loss_mlp": 0.01033202, + "balance_loss_clip": 1.02075016, + "balance_loss_mlp": 1.03892267, + "epoch": 0.4884413046745829, + "flos": 25410678036480.0, + "grad_norm": 2.940858316224804, + "language_loss": 0.85766631, + "learning_rate": 2.1717355866790227e-06, + "loss": 0.87911713, + "num_input_tokens_seen": 174643835, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.73046875, + "step": 8124, + "time_per_iteration": 2.5632708072662354 + }, + { + "auxiliary_loss_clip": 0.01112362, + "auxiliary_loss_mlp": 0.01034904, + "balance_loss_clip": 1.02157593, + "balance_loss_mlp": 1.04022837, + "epoch": 0.4885014279272509, + "flos": 20991546662400.0, + "grad_norm": 2.663608167377633, + "language_loss": 0.79223049, + "learning_rate": 2.171347560204948e-06, + "loss": 0.81370318, + "num_input_tokens_seen": 174660955, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8125, + "time_per_iteration": 2.4487855434417725 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.0211587, + "balance_loss_mlp": 1.03887916, + "epoch": 0.48856155117991884, + "flos": 13771599269760.0, + "grad_norm": 2.7973571608225063, + "language_loss": 0.72273839, + "learning_rate": 2.170959527233356e-06, + "loss": 0.74416542, + "num_input_tokens_seen": 174678270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8126, + "time_per_iteration": 2.437833309173584 + }, + { + "auxiliary_loss_clip": 0.01111271, + "auxiliary_loss_mlp": 0.01033174, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.0383321, + "epoch": 0.4886216744325868, + "flos": 32087764206720.0, + "grad_norm": 1.6636646152839605, + "language_loss": 0.68598747, + "learning_rate": 2.1705714877789633e-06, + "loss": 0.70743197, + "num_input_tokens_seen": 174698360, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8127, + "time_per_iteration": 2.593252420425415 + }, + { + "auxiliary_loss_clip": 0.01111716, + "auxiliary_loss_mlp": 0.01036256, + "balance_loss_clip": 1.02271378, + "balance_loss_mlp": 1.03772545, + "epoch": 0.48868179768525477, + "flos": 19610063631360.0, + "grad_norm": 2.237259843406747, + "language_loss": 0.76160932, + "learning_rate": 2.170183441856481e-06, + "loss": 0.78308904, + "num_input_tokens_seen": 174716755, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 8128, + "time_per_iteration": 2.4540648460388184 + }, + { + "auxiliary_loss_clip": 0.01111402, + "auxiliary_loss_mlp": 0.01033977, + "balance_loss_clip": 1.02170467, + "balance_loss_mlp": 1.03979826, + "epoch": 0.48874192093792274, + "flos": 21286912199040.0, + "grad_norm": 1.8007841393953645, + "language_loss": 0.75974828, + "learning_rate": 2.1697953894806265e-06, + "loss": 0.78120208, + "num_input_tokens_seen": 174735560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 8129, + "time_per_iteration": 2.4460771083831787 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031838, + "balance_loss_clip": 1.01829541, + "balance_loss_mlp": 1.03739452, + "epoch": 0.4888020441905907, + "flos": 14173439696640.0, + "grad_norm": 2.2474332482435684, + "language_loss": 0.64869368, + "learning_rate": 2.169407330666114e-06, + "loss": 0.67009449, + "num_input_tokens_seen": 174752730, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8130, + "time_per_iteration": 2.4403305053710938 + }, + { + "auxiliary_loss_clip": 0.01104742, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.03528643, + "epoch": 0.48886216744325867, + "flos": 24097891766400.0, + "grad_norm": 2.48357292354413, + "language_loss": 0.71885133, + "learning_rate": 2.169019265427658e-06, + "loss": 0.74023575, + "num_input_tokens_seen": 174772520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8131, + "time_per_iteration": 2.4774324893951416 + }, + { + "auxiliary_loss_clip": 0.01113099, + "auxiliary_loss_mlp": 0.01038002, + "balance_loss_clip": 1.02447748, + "balance_loss_mlp": 1.04011512, + "epoch": 0.48892229069592663, + "flos": 38431419402240.0, + "grad_norm": 1.6326145167913504, + "language_loss": 0.69524658, + "learning_rate": 2.1686311937799745e-06, + "loss": 0.7167576, + "num_input_tokens_seen": 174796540, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8132, + "time_per_iteration": 2.5888383388519287 + }, + { + "auxiliary_loss_clip": 0.011075, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.03793633, + "epoch": 0.4889824139485946, + "flos": 23843321101440.0, + "grad_norm": 1.374551885233197, + "language_loss": 0.70177239, + "learning_rate": 2.1682431157377797e-06, + "loss": 0.72313869, + "num_input_tokens_seen": 174817840, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8133, + "time_per_iteration": 2.5105628967285156 + }, + { + "auxiliary_loss_clip": 0.01108745, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086735, + "balance_loss_mlp": 1.03843439, + "epoch": 0.48904253720126256, + "flos": 24425827960320.0, + "grad_norm": 1.701581568458854, + "language_loss": 0.70707083, + "learning_rate": 2.1678550313157883e-06, + "loss": 0.72849363, + "num_input_tokens_seen": 174837885, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8134, + "time_per_iteration": 2.4894602298736572 + }, + { + "auxiliary_loss_clip": 0.01114154, + "auxiliary_loss_mlp": 0.01035613, + "balance_loss_clip": 1.02214789, + "balance_loss_mlp": 1.04088461, + "epoch": 0.4891026604539306, + "flos": 24170682677760.0, + "grad_norm": 2.0967568848691105, + "language_loss": 0.80384946, + "learning_rate": 2.167466940528718e-06, + "loss": 0.82534719, + "num_input_tokens_seen": 174855240, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8135, + "time_per_iteration": 2.453099489212036 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.01895332, + "balance_loss_mlp": 1.03636014, + "epoch": 0.48916278370659855, + "flos": 21470954509440.0, + "grad_norm": 1.7196560423786724, + "language_loss": 0.74302435, + "learning_rate": 2.1670788433912843e-06, + "loss": 0.7643888, + "num_input_tokens_seen": 174875145, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8136, + "time_per_iteration": 3.877336025238037 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01817274, + "balance_loss_mlp": 1.03903699, + "epoch": 0.4892229069592665, + "flos": 22309755886080.0, + "grad_norm": 2.212302237726986, + "language_loss": 0.73165262, + "learning_rate": 2.166690739918204e-06, + "loss": 0.75303876, + "num_input_tokens_seen": 174894770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8137, + "time_per_iteration": 5.387110471725464 + }, + { + "auxiliary_loss_clip": 0.01109302, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01846206, + "balance_loss_mlp": 1.03721762, + "epoch": 0.4892830302119345, + "flos": 12786856934400.0, + "grad_norm": 1.8416541749331667, + "language_loss": 0.74448442, + "learning_rate": 2.1663026301241944e-06, + "loss": 0.76589316, + "num_input_tokens_seen": 174912780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8138, + "time_per_iteration": 3.9045798778533936 + }, + { + "auxiliary_loss_clip": 0.01108399, + "auxiliary_loss_mlp": 0.01033464, + "balance_loss_clip": 1.02114367, + "balance_loss_mlp": 1.039101, + "epoch": 0.48934315346460244, + "flos": 20813896972800.0, + "grad_norm": 1.5284975125240874, + "language_loss": 0.74403191, + "learning_rate": 2.165914514023972e-06, + "loss": 0.76545048, + "num_input_tokens_seen": 174931250, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8139, + "time_per_iteration": 2.4808132648468018 + }, + { + "auxiliary_loss_clip": 0.01109253, + "auxiliary_loss_mlp": 0.01034607, + "balance_loss_clip": 1.02220941, + "balance_loss_mlp": 1.03792441, + "epoch": 0.4894032767172704, + "flos": 19755537713280.0, + "grad_norm": 1.7092479760411836, + "language_loss": 0.61867124, + "learning_rate": 2.165526391632255e-06, + "loss": 0.64010978, + "num_input_tokens_seen": 174951105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8140, + "time_per_iteration": 2.4676973819732666 + }, + { + "auxiliary_loss_clip": 0.01110437, + "auxiliary_loss_mlp": 0.01040632, + "balance_loss_clip": 1.02696478, + "balance_loss_mlp": 1.03864169, + "epoch": 0.4894633999699384, + "flos": 17818982835840.0, + "grad_norm": 11.553990271771063, + "language_loss": 0.82090259, + "learning_rate": 2.1651382629637608e-06, + "loss": 0.84241331, + "num_input_tokens_seen": 174969120, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8141, + "time_per_iteration": 2.4469456672668457 + }, + { + "auxiliary_loss_clip": 0.01112856, + "auxiliary_loss_mlp": 0.01033863, + "balance_loss_clip": 1.02006459, + "balance_loss_mlp": 1.04014516, + "epoch": 0.48952352322260634, + "flos": 25523222325120.0, + "grad_norm": 1.575169950356119, + "language_loss": 0.72470534, + "learning_rate": 2.1647501280332066e-06, + "loss": 0.74617255, + "num_input_tokens_seen": 174991295, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8142, + "time_per_iteration": 2.5793039798736572 + }, + { + "auxiliary_loss_clip": 0.01105636, + "auxiliary_loss_mlp": 0.01032347, + "balance_loss_clip": 1.02019358, + "balance_loss_mlp": 1.03645492, + "epoch": 0.4895836464752743, + "flos": 29055502903680.0, + "grad_norm": 1.7422772510583273, + "language_loss": 0.66720849, + "learning_rate": 2.1643619868553105e-06, + "loss": 0.68858832, + "num_input_tokens_seen": 175012830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 8143, + "time_per_iteration": 2.529869556427002 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01746714, + "balance_loss_mlp": 1.03620982, + "epoch": 0.48964376972794227, + "flos": 33546958312320.0, + "grad_norm": 1.6744857165672533, + "language_loss": 0.75076014, + "learning_rate": 2.163973839444793e-06, + "loss": 0.77209973, + "num_input_tokens_seen": 175035695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 8144, + "time_per_iteration": 2.5917482376098633 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.0158155, + "balance_loss_mlp": 1.0373745, + "epoch": 0.48970389298061023, + "flos": 22054035985920.0, + "grad_norm": 1.7401505251342857, + "language_loss": 0.75606745, + "learning_rate": 2.1635856858163695e-06, + "loss": 0.77742517, + "num_input_tokens_seen": 175056425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 8145, + "time_per_iteration": 2.4766342639923096 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01035586, + "balance_loss_clip": 1.0224849, + "balance_loss_mlp": 1.03849018, + "epoch": 0.4897640162332782, + "flos": 20084299920000.0, + "grad_norm": 1.7624340526507305, + "language_loss": 0.79901314, + "learning_rate": 2.163197525984761e-06, + "loss": 0.820476, + "num_input_tokens_seen": 175074800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8146, + "time_per_iteration": 2.461480140686035 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01031626, + "balance_loss_clip": 1.01866233, + "balance_loss_mlp": 1.03510666, + "epoch": 0.48982413948594616, + "flos": 23806225330560.0, + "grad_norm": 1.6218674355963285, + "language_loss": 0.74327677, + "learning_rate": 2.162809359964687e-06, + "loss": 0.76462203, + "num_input_tokens_seen": 175094500, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8147, + "time_per_iteration": 2.4981865882873535 + }, + { + "auxiliary_loss_clip": 0.01109193, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01614654, + "balance_loss_mlp": 1.0397613, + "epoch": 0.4898842627386142, + "flos": 17639645207040.0, + "grad_norm": 2.4473724892456126, + "language_loss": 0.83147472, + "learning_rate": 2.162421187770864e-06, + "loss": 0.8528533, + "num_input_tokens_seen": 175112920, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8148, + "time_per_iteration": 2.4251036643981934 + }, + { + "auxiliary_loss_clip": 0.01104505, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.01701021, + "balance_loss_mlp": 1.03808641, + "epoch": 0.48994438599128215, + "flos": 16617914841600.0, + "grad_norm": 1.6244569398372493, + "language_loss": 0.73749536, + "learning_rate": 2.162033009418015e-06, + "loss": 0.75881934, + "num_input_tokens_seen": 175129910, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 8149, + "time_per_iteration": 2.4356369972229004 + }, + { + "auxiliary_loss_clip": 0.01112401, + "auxiliary_loss_mlp": 0.01030368, + "balance_loss_clip": 1.01667118, + "balance_loss_mlp": 1.03944612, + "epoch": 0.4900045092439501, + "flos": 26614834600320.0, + "grad_norm": 2.7362049095417516, + "language_loss": 0.75515091, + "learning_rate": 2.1616448249208567e-06, + "loss": 0.77657855, + "num_input_tokens_seen": 175148705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8150, + "time_per_iteration": 2.4834423065185547 + }, + { + "auxiliary_loss_clip": 0.01111432, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.0169735, + "balance_loss_mlp": 1.04018414, + "epoch": 0.4900646324966181, + "flos": 19902125116800.0, + "grad_norm": 2.027803048960678, + "language_loss": 0.72891176, + "learning_rate": 2.1612566342941106e-06, + "loss": 0.75032675, + "num_input_tokens_seen": 175167425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8151, + "time_per_iteration": 2.448648691177368 + }, + { + "auxiliary_loss_clip": 0.01033992, + "auxiliary_loss_mlp": 0.01002772, + "balance_loss_clip": 1.0015738, + "balance_loss_mlp": 1.01003349, + "epoch": 0.49012475574928605, + "flos": 59189620337280.0, + "grad_norm": 0.8338756787223442, + "language_loss": 0.54366148, + "learning_rate": 2.1608684375524977e-06, + "loss": 0.5640291, + "num_input_tokens_seen": 175227985, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.24023438, + "step": 8152, + "time_per_iteration": 3.0414862632751465 + }, + { + "auxiliary_loss_clip": 0.01109949, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.03726649, + "epoch": 0.490184879001954, + "flos": 45259797657600.0, + "grad_norm": 1.8071588573161568, + "language_loss": 0.61403525, + "learning_rate": 2.1604802347107364e-06, + "loss": 0.6354419, + "num_input_tokens_seen": 175251895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8153, + "time_per_iteration": 2.6923155784606934 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02371526, + "balance_loss_mlp": 1.03589535, + "epoch": 0.490245002254622, + "flos": 28002135634560.0, + "grad_norm": 1.4691031789751592, + "language_loss": 0.76673591, + "learning_rate": 2.160092025783549e-06, + "loss": 0.78815919, + "num_input_tokens_seen": 175272770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 8154, + "time_per_iteration": 2.490353584289551 + }, + { + "auxiliary_loss_clip": 0.01034079, + "auxiliary_loss_mlp": 0.01008709, + "balance_loss_clip": 1.00767767, + "balance_loss_mlp": 1.01043367, + "epoch": 0.49030512550728994, + "flos": 58951318533120.0, + "grad_norm": 0.9669855284605297, + "language_loss": 0.67019808, + "learning_rate": 2.1597038107856564e-06, + "loss": 0.69062597, + "num_input_tokens_seen": 175336320, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.23632812, + "step": 8155, + "time_per_iteration": 3.1443841457366943 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01594031, + "balance_loss_mlp": 1.03842843, + "epoch": 0.4903652487599579, + "flos": 19791843384960.0, + "grad_norm": 2.3165784732113965, + "language_loss": 0.76883155, + "learning_rate": 2.1593155897317784e-06, + "loss": 0.79019058, + "num_input_tokens_seen": 175353540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 8156, + "time_per_iteration": 2.4431064128875732 + }, + { + "auxiliary_loss_clip": 0.01107345, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01737309, + "balance_loss_mlp": 1.03692055, + "epoch": 0.49042537201262587, + "flos": 21762082241280.0, + "grad_norm": 2.1340841853754084, + "language_loss": 0.83395588, + "learning_rate": 2.1589273626366377e-06, + "loss": 0.85532445, + "num_input_tokens_seen": 175370445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 8157, + "time_per_iteration": 2.478027582168579 + }, + { + "auxiliary_loss_clip": 0.01108499, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.01971316, + "balance_loss_mlp": 1.03797531, + "epoch": 0.49048549526529384, + "flos": 18953042008320.0, + "grad_norm": 1.799550006100146, + "language_loss": 0.79893947, + "learning_rate": 2.158539129514956e-06, + "loss": 0.8203451, + "num_input_tokens_seen": 175389020, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8158, + "time_per_iteration": 2.453590154647827 + }, + { + "auxiliary_loss_clip": 0.0111001, + "auxiliary_loss_mlp": 0.01030333, + "balance_loss_clip": 1.01731563, + "balance_loss_mlp": 1.03768444, + "epoch": 0.4905456185179618, + "flos": 26906393295360.0, + "grad_norm": 2.6065217447562015, + "language_loss": 0.69529265, + "learning_rate": 2.158150890381454e-06, + "loss": 0.71669614, + "num_input_tokens_seen": 175409545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 8159, + "time_per_iteration": 2.531371593475342 + }, + { + "auxiliary_loss_clip": 0.01106025, + "auxiliary_loss_mlp": 0.01032529, + "balance_loss_clip": 1.01975548, + "balance_loss_mlp": 1.03706563, + "epoch": 0.49060574177062977, + "flos": 20412343854720.0, + "grad_norm": 1.8340548446534848, + "language_loss": 0.73084885, + "learning_rate": 2.157762645250854e-06, + "loss": 0.7522344, + "num_input_tokens_seen": 175429335, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8160, + "time_per_iteration": 2.4504506587982178 + }, + { + "auxiliary_loss_clip": 0.01109213, + "auxiliary_loss_mlp": 0.0103886, + "balance_loss_clip": 1.02510881, + "balance_loss_mlp": 1.03650105, + "epoch": 0.4906658650232978, + "flos": 17493704248320.0, + "grad_norm": 1.9580885379656197, + "language_loss": 0.71372044, + "learning_rate": 2.1573743941378796e-06, + "loss": 0.73520112, + "num_input_tokens_seen": 175446955, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8161, + "time_per_iteration": 2.4428305625915527 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03813958, + "epoch": 0.49072598827596575, + "flos": 26614439550720.0, + "grad_norm": 1.8633116916333885, + "language_loss": 0.67950338, + "learning_rate": 2.1569861370572517e-06, + "loss": 0.70090652, + "num_input_tokens_seen": 175468195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8162, + "time_per_iteration": 2.478804349899292 + }, + { + "auxiliary_loss_clip": 0.01110496, + "auxiliary_loss_mlp": 0.01033543, + "balance_loss_clip": 1.01964319, + "balance_loss_mlp": 1.03701675, + "epoch": 0.4907861115286337, + "flos": 20412595249920.0, + "grad_norm": 1.7117590070355053, + "language_loss": 0.63264233, + "learning_rate": 2.1565978740236944e-06, + "loss": 0.65408272, + "num_input_tokens_seen": 175487455, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8163, + "time_per_iteration": 2.474439859390259 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.03680897, + "epoch": 0.4908462347813017, + "flos": 14064271286400.0, + "grad_norm": 5.481003364843308, + "language_loss": 0.76853907, + "learning_rate": 2.1562096050519293e-06, + "loss": 0.78988826, + "num_input_tokens_seen": 175504450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 8164, + "time_per_iteration": 2.4202303886413574 + }, + { + "auxiliary_loss_clip": 0.01106417, + "auxiliary_loss_mlp": 0.01028083, + "balance_loss_clip": 1.01487494, + "balance_loss_mlp": 1.03511751, + "epoch": 0.49090635803396965, + "flos": 18735100237440.0, + "grad_norm": 1.943812351193686, + "language_loss": 0.76509839, + "learning_rate": 2.1558213301566806e-06, + "loss": 0.78644335, + "num_input_tokens_seen": 175523600, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8165, + "time_per_iteration": 2.4495608806610107 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.0103224, + "balance_loss_clip": 1.01949036, + "balance_loss_mlp": 1.03724587, + "epoch": 0.4909664812866376, + "flos": 20558500295040.0, + "grad_norm": 1.5511500992998777, + "language_loss": 0.77538848, + "learning_rate": 2.1554330493526716e-06, + "loss": 0.79677534, + "num_input_tokens_seen": 175542720, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8166, + "time_per_iteration": 2.431838274002075 + }, + { + "auxiliary_loss_clip": 0.01035489, + "auxiliary_loss_mlp": 0.00999269, + "balance_loss_clip": 0.99796408, + "balance_loss_mlp": 1.01166928, + "epoch": 0.4910266045393056, + "flos": 54684017948160.0, + "grad_norm": 0.7997768420675069, + "language_loss": 0.54261303, + "learning_rate": 2.1550447626546253e-06, + "loss": 0.56296062, + "num_input_tokens_seen": 175598640, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.23828125, + "step": 8167, + "time_per_iteration": 3.1150460243225098 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.0184176, + "balance_loss_mlp": 1.03619838, + "epoch": 0.49108672779197354, + "flos": 16246454342400.0, + "grad_norm": 2.5337625100343173, + "language_loss": 0.85566431, + "learning_rate": 2.1546564700772665e-06, + "loss": 0.8770228, + "num_input_tokens_seen": 175615675, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8168, + "time_per_iteration": 2.4139063358306885 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01706409, + "balance_loss_mlp": 1.03805184, + "epoch": 0.4911468510446415, + "flos": 19825419623040.0, + "grad_norm": 1.6015963996367162, + "language_loss": 0.73052484, + "learning_rate": 2.1542681716353193e-06, + "loss": 0.75186759, + "num_input_tokens_seen": 175632255, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8169, + "time_per_iteration": 2.45638370513916 + }, + { + "auxiliary_loss_clip": 0.01104357, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01673138, + "balance_loss_mlp": 1.03472865, + "epoch": 0.4912069742973095, + "flos": 21212684743680.0, + "grad_norm": 1.6971136818289634, + "language_loss": 0.78070778, + "learning_rate": 2.1538798673435068e-06, + "loss": 0.80203593, + "num_input_tokens_seen": 175651625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8170, + "time_per_iteration": 2.4314279556274414 + }, + { + "auxiliary_loss_clip": 0.01108102, + "auxiliary_loss_mlp": 0.01033192, + "balance_loss_clip": 1.02121162, + "balance_loss_mlp": 1.03809822, + "epoch": 0.49126709754997744, + "flos": 19537129065600.0, + "grad_norm": 3.6606474387116363, + "language_loss": 0.75769788, + "learning_rate": 2.1534915572165545e-06, + "loss": 0.77911079, + "num_input_tokens_seen": 175669265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8171, + "time_per_iteration": 2.4608027935028076 + }, + { + "auxiliary_loss_clip": 0.01109941, + "auxiliary_loss_mlp": 0.0103435, + "balance_loss_clip": 1.02194011, + "balance_loss_mlp": 1.03800821, + "epoch": 0.4913272208026454, + "flos": 12239686080000.0, + "grad_norm": 2.121204048765929, + "language_loss": 0.81676465, + "learning_rate": 2.1531032412691875e-06, + "loss": 0.83820748, + "num_input_tokens_seen": 175686065, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 8172, + "time_per_iteration": 2.44052791595459 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.00996712, + "balance_loss_clip": 0.99551356, + "balance_loss_mlp": 1.0111028, + "epoch": 0.49138734405531337, + "flos": 65465871661440.0, + "grad_norm": 0.6914312886696967, + "language_loss": 0.53323382, + "learning_rate": 2.1527149195161295e-06, + "loss": 0.55354571, + "num_input_tokens_seen": 175748595, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.234375, + "step": 8173, + "time_per_iteration": 3.0708565711975098 + }, + { + "auxiliary_loss_clip": 0.01108663, + "auxiliary_loss_mlp": 0.01032993, + "balance_loss_clip": 1.01985621, + "balance_loss_mlp": 1.0374558, + "epoch": 0.4914474673079814, + "flos": 18439052342400.0, + "grad_norm": 1.811286975884668, + "language_loss": 0.62879664, + "learning_rate": 2.152326591972107e-06, + "loss": 0.65021324, + "num_input_tokens_seen": 175766770, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8174, + "time_per_iteration": 2.4336249828338623 + }, + { + "auxiliary_loss_clip": 0.01106845, + "auxiliary_loss_mlp": 0.01034775, + "balance_loss_clip": 1.02208483, + "balance_loss_mlp": 1.03750002, + "epoch": 0.49150759056064935, + "flos": 21685053525120.0, + "grad_norm": 1.779537870111139, + "language_loss": 0.69111979, + "learning_rate": 2.1519382586518445e-06, + "loss": 0.71253598, + "num_input_tokens_seen": 175783605, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8175, + "time_per_iteration": 2.4554460048675537 + }, + { + "auxiliary_loss_clip": 0.01106829, + "auxiliary_loss_mlp": 0.01032059, + "balance_loss_clip": 1.01980472, + "balance_loss_mlp": 1.03808653, + "epoch": 0.4915677138133173, + "flos": 22382439056640.0, + "grad_norm": 1.5246237839161791, + "language_loss": 0.74398279, + "learning_rate": 2.151549919570068e-06, + "loss": 0.76537168, + "num_input_tokens_seen": 175801390, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8176, + "time_per_iteration": 2.4888904094696045 + }, + { + "auxiliary_loss_clip": 0.01107276, + "auxiliary_loss_mlp": 0.01042259, + "balance_loss_clip": 1.0297358, + "balance_loss_mlp": 1.03694725, + "epoch": 0.4916278370659853, + "flos": 18402890325120.0, + "grad_norm": 1.7568126082203932, + "language_loss": 0.69846892, + "learning_rate": 2.1511615747415036e-06, + "loss": 0.71996421, + "num_input_tokens_seen": 175819830, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8177, + "time_per_iteration": 3.8634564876556396 + }, + { + "auxiliary_loss_clip": 0.01035127, + "auxiliary_loss_mlp": 0.00999453, + "balance_loss_clip": 0.99834442, + "balance_loss_mlp": 1.01137829, + "epoch": 0.49168796031865325, + "flos": 66609124715520.0, + "grad_norm": 0.6749706589091774, + "language_loss": 0.46188164, + "learning_rate": 2.150773224180877e-06, + "loss": 0.48222741, + "num_input_tokens_seen": 175881765, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23828125, + "step": 8178, + "time_per_iteration": 3.0891001224517822 + }, + { + "auxiliary_loss_clip": 0.01110485, + "auxiliary_loss_mlp": 0.01036426, + "balance_loss_clip": 1.02311015, + "balance_loss_mlp": 1.03835034, + "epoch": 0.4917480835713212, + "flos": 20959335141120.0, + "grad_norm": 1.813634772504209, + "language_loss": 0.66008747, + "learning_rate": 2.1503848679029147e-06, + "loss": 0.68155658, + "num_input_tokens_seen": 175901795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8179, + "time_per_iteration": 5.296982049942017 + }, + { + "auxiliary_loss_clip": 0.01111217, + "auxiliary_loss_mlp": 0.01035796, + "balance_loss_clip": 1.021873, + "balance_loss_mlp": 1.03712761, + "epoch": 0.4918082068239892, + "flos": 15772900412160.0, + "grad_norm": 1.8426949121819989, + "language_loss": 0.70288503, + "learning_rate": 2.149996505922343e-06, + "loss": 0.72435522, + "num_input_tokens_seen": 175917770, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 8180, + "time_per_iteration": 3.9257376194000244 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01037933, + "balance_loss_clip": 1.02467656, + "balance_loss_mlp": 1.03577447, + "epoch": 0.49186833007665715, + "flos": 24604806453120.0, + "grad_norm": 1.68068912028803, + "language_loss": 0.83982801, + "learning_rate": 2.1496081382538895e-06, + "loss": 0.86125004, + "num_input_tokens_seen": 175937000, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.68359375, + "step": 8181, + "time_per_iteration": 2.464665174484253 + }, + { + "auxiliary_loss_clip": 0.01104535, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01947999, + "balance_loss_mlp": 1.03746653, + "epoch": 0.4919284533293251, + "flos": 22090557139200.0, + "grad_norm": 2.0240623883749724, + "language_loss": 0.72286201, + "learning_rate": 2.1492197649122793e-06, + "loss": 0.74421656, + "num_input_tokens_seen": 175955170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8182, + "time_per_iteration": 2.5358242988586426 + }, + { + "auxiliary_loss_clip": 0.01108049, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.01904118, + "balance_loss_mlp": 1.03814411, + "epoch": 0.4919885765819931, + "flos": 23368043318400.0, + "grad_norm": 2.2040850478726357, + "language_loss": 0.72828728, + "learning_rate": 2.1488313859122412e-06, + "loss": 0.74968582, + "num_input_tokens_seen": 175973725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8183, + "time_per_iteration": 2.484051465988159 + }, + { + "auxiliary_loss_clip": 0.01110545, + "auxiliary_loss_mlp": 0.01031345, + "balance_loss_clip": 1.0178628, + "balance_loss_mlp": 1.03733599, + "epoch": 0.49204869983466104, + "flos": 21360493209600.0, + "grad_norm": 1.6157316160481727, + "language_loss": 0.77338606, + "learning_rate": 2.1484430012685015e-06, + "loss": 0.79480493, + "num_input_tokens_seen": 175993885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8184, + "time_per_iteration": 2.4630794525146484 + }, + { + "auxiliary_loss_clip": 0.01107787, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02359986, + "balance_loss_mlp": 1.03868532, + "epoch": 0.492108823087329, + "flos": 21142695093120.0, + "grad_norm": 1.7266312313882144, + "language_loss": 0.71020061, + "learning_rate": 2.148054610995789e-06, + "loss": 0.73163593, + "num_input_tokens_seen": 176014210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8185, + "time_per_iteration": 2.4472904205322266 + }, + { + "auxiliary_loss_clip": 0.01109756, + "auxiliary_loss_mlp": 0.01037838, + "balance_loss_clip": 1.02348495, + "balance_loss_mlp": 1.03818357, + "epoch": 0.49216894633999697, + "flos": 25116605389440.0, + "grad_norm": 2.357724154899576, + "language_loss": 0.75007719, + "learning_rate": 2.147666215108831e-06, + "loss": 0.7715531, + "num_input_tokens_seen": 176033890, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 8186, + "time_per_iteration": 2.497887372970581 + }, + { + "auxiliary_loss_clip": 0.01108113, + "auxiliary_loss_mlp": 0.01036969, + "balance_loss_clip": 1.0240649, + "balance_loss_mlp": 1.03769946, + "epoch": 0.49222906959266494, + "flos": 22637943475200.0, + "grad_norm": 2.2731376810200947, + "language_loss": 0.67426246, + "learning_rate": 2.1472778136223545e-06, + "loss": 0.69571328, + "num_input_tokens_seen": 176052720, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8187, + "time_per_iteration": 2.4402377605438232 + }, + { + "auxiliary_loss_clip": 0.01105993, + "auxiliary_loss_mlp": 0.01034798, + "balance_loss_clip": 1.02205503, + "balance_loss_mlp": 1.03659558, + "epoch": 0.49228919284533296, + "flos": 20410548174720.0, + "grad_norm": 1.3838016666023416, + "language_loss": 0.66984355, + "learning_rate": 2.1468894065510894e-06, + "loss": 0.69125152, + "num_input_tokens_seen": 176072545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8188, + "time_per_iteration": 2.4889986515045166 + }, + { + "auxiliary_loss_clip": 0.01108628, + "auxiliary_loss_mlp": 0.01027775, + "balance_loss_clip": 1.01627779, + "balance_loss_mlp": 1.03854966, + "epoch": 0.4923493160980009, + "flos": 27122359818240.0, + "grad_norm": 1.5428848144341532, + "language_loss": 0.7457763, + "learning_rate": 2.1465009939097623e-06, + "loss": 0.76714027, + "num_input_tokens_seen": 176091490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8189, + "time_per_iteration": 2.4837827682495117 + }, + { + "auxiliary_loss_clip": 0.011062, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.01975584, + "balance_loss_mlp": 1.03744173, + "epoch": 0.4924094393506689, + "flos": 35736683224320.0, + "grad_norm": 1.5888967888129601, + "language_loss": 0.64360684, + "learning_rate": 2.146112575713104e-06, + "loss": 0.66499019, + "num_input_tokens_seen": 176113200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8190, + "time_per_iteration": 2.606388807296753 + }, + { + "auxiliary_loss_clip": 0.01107034, + "auxiliary_loss_mlp": 0.01027622, + "balance_loss_clip": 1.01528418, + "balance_loss_mlp": 1.0383538, + "epoch": 0.49246956260333685, + "flos": 20412487509120.0, + "grad_norm": 1.9368790872615624, + "language_loss": 0.71231604, + "learning_rate": 2.1457241519758413e-06, + "loss": 0.73366261, + "num_input_tokens_seen": 176132485, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8191, + "time_per_iteration": 2.4383578300476074 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01033855, + "balance_loss_clip": 1.02162957, + "balance_loss_mlp": 1.03718042, + "epoch": 0.4925296858560048, + "flos": 38976938231040.0, + "grad_norm": 1.5667911589112589, + "language_loss": 0.71698356, + "learning_rate": 2.1453357227127043e-06, + "loss": 0.7383846, + "num_input_tokens_seen": 176155755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8192, + "time_per_iteration": 2.6127231121063232 + }, + { + "auxiliary_loss_clip": 0.01033253, + "auxiliary_loss_mlp": 0.01011533, + "balance_loss_clip": 1.01047826, + "balance_loss_mlp": 1.00980878, + "epoch": 0.4925898091086728, + "flos": 64278917712000.0, + "grad_norm": 0.7610920789142134, + "language_loss": 0.52138889, + "learning_rate": 2.1449472879384224e-06, + "loss": 0.54183674, + "num_input_tokens_seen": 176216295, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.234375, + "step": 8193, + "time_per_iteration": 3.1151235103607178 + }, + { + "auxiliary_loss_clip": 0.01106303, + "auxiliary_loss_mlp": 0.01037325, + "balance_loss_clip": 1.02470672, + "balance_loss_mlp": 1.03862, + "epoch": 0.49264993236134075, + "flos": 23036372110080.0, + "grad_norm": 1.5012892842908303, + "language_loss": 0.77071059, + "learning_rate": 2.1445588476677246e-06, + "loss": 0.79214686, + "num_input_tokens_seen": 176235925, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 8194, + "time_per_iteration": 2.4766407012939453 + }, + { + "auxiliary_loss_clip": 0.01104661, + "auxiliary_loss_mlp": 0.01029401, + "balance_loss_clip": 1.01783228, + "balance_loss_mlp": 1.03554666, + "epoch": 0.4927100556140087, + "flos": 24718212668160.0, + "grad_norm": 1.9786600447906189, + "language_loss": 0.70556259, + "learning_rate": 2.144170401915341e-06, + "loss": 0.7269032, + "num_input_tokens_seen": 176253865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 8195, + "time_per_iteration": 2.489412784576416 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01537156, + "balance_loss_mlp": 1.0380609, + "epoch": 0.4927701788666767, + "flos": 23505544581120.0, + "grad_norm": 1.8494849345903903, + "language_loss": 0.81095743, + "learning_rate": 2.143781950696001e-06, + "loss": 0.83231419, + "num_input_tokens_seen": 176271525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8196, + "time_per_iteration": 2.5489988327026367 + }, + { + "auxiliary_loss_clip": 0.01108856, + "auxiliary_loss_mlp": 0.01033443, + "balance_loss_clip": 1.02033019, + "balance_loss_mlp": 1.03709757, + "epoch": 0.49283030211934464, + "flos": 22928891639040.0, + "grad_norm": 1.848981865854384, + "language_loss": 0.7100687, + "learning_rate": 2.1433934940244356e-06, + "loss": 0.73149174, + "num_input_tokens_seen": 176290810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8197, + "time_per_iteration": 2.4621787071228027 + }, + { + "auxiliary_loss_clip": 0.01105723, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.01988339, + "balance_loss_mlp": 1.03815627, + "epoch": 0.4928904253720126, + "flos": 16873024210560.0, + "grad_norm": 1.7362069513061655, + "language_loss": 0.84122622, + "learning_rate": 2.143005031915374e-06, + "loss": 0.86259645, + "num_input_tokens_seen": 176309165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8198, + "time_per_iteration": 2.4596786499023438 + }, + { + "auxiliary_loss_clip": 0.01110423, + "auxiliary_loss_mlp": 0.01034702, + "balance_loss_clip": 1.02139831, + "balance_loss_mlp": 1.03913713, + "epoch": 0.4929505486246806, + "flos": 14866551509760.0, + "grad_norm": 1.767623263247313, + "language_loss": 0.76214266, + "learning_rate": 2.1426165643835467e-06, + "loss": 0.78359395, + "num_input_tokens_seen": 176324960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8199, + "time_per_iteration": 2.413482189178467 + }, + { + "auxiliary_loss_clip": 0.01109624, + "auxiliary_loss_mlp": 0.01035996, + "balance_loss_clip": 1.02215028, + "balance_loss_mlp": 1.03712904, + "epoch": 0.49301067187734854, + "flos": 23842351434240.0, + "grad_norm": 1.555242231339172, + "language_loss": 0.59918249, + "learning_rate": 2.1422280914436864e-06, + "loss": 0.62063873, + "num_input_tokens_seen": 176346195, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8200, + "time_per_iteration": 2.515371561050415 + }, + { + "auxiliary_loss_clip": 0.01101467, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02128601, + "balance_loss_mlp": 1.03560054, + "epoch": 0.49307079513001656, + "flos": 22491284244480.0, + "grad_norm": 1.4972351372180894, + "language_loss": 0.78781515, + "learning_rate": 2.1418396131105213e-06, + "loss": 0.80916464, + "num_input_tokens_seen": 176366735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 8201, + "time_per_iteration": 2.4688665866851807 + }, + { + "auxiliary_loss_clip": 0.01112129, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.01858091, + "balance_loss_mlp": 1.03761029, + "epoch": 0.4931309183826845, + "flos": 15924587546880.0, + "grad_norm": 2.1515546014570766, + "language_loss": 0.67352241, + "learning_rate": 2.141451129398785e-06, + "loss": 0.69496673, + "num_input_tokens_seen": 176384475, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8202, + "time_per_iteration": 2.6021947860717773 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01781058, + "balance_loss_mlp": 1.03682148, + "epoch": 0.4931910416353525, + "flos": 27309059735040.0, + "grad_norm": 3.4273755266911845, + "language_loss": 0.75192142, + "learning_rate": 2.1410626403232076e-06, + "loss": 0.77328843, + "num_input_tokens_seen": 176402645, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 8203, + "time_per_iteration": 2.501173496246338 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01034465, + "balance_loss_clip": 1.0214237, + "balance_loss_mlp": 1.03780818, + "epoch": 0.49325116488802045, + "flos": 20806139635200.0, + "grad_norm": 2.0656815740777152, + "language_loss": 0.80908394, + "learning_rate": 2.1406741458985197e-06, + "loss": 0.83049649, + "num_input_tokens_seen": 176416715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8204, + "time_per_iteration": 2.481666088104248 + }, + { + "auxiliary_loss_clip": 0.01105243, + "auxiliary_loss_mlp": 0.01033404, + "balance_loss_clip": 1.02180493, + "balance_loss_mlp": 1.03788805, + "epoch": 0.4933112881406884, + "flos": 19865963099520.0, + "grad_norm": 2.2280647806743183, + "language_loss": 0.65550953, + "learning_rate": 2.140285646139455e-06, + "loss": 0.67689598, + "num_input_tokens_seen": 176435755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 8205, + "time_per_iteration": 2.439408302307129 + }, + { + "auxiliary_loss_clip": 0.01112282, + "auxiliary_loss_mlp": 0.01035253, + "balance_loss_clip": 1.02083468, + "balance_loss_mlp": 1.03837705, + "epoch": 0.4933714113933564, + "flos": 21827977741440.0, + "grad_norm": 1.7727903919462147, + "language_loss": 0.67009246, + "learning_rate": 2.139897141060744e-06, + "loss": 0.69156778, + "num_input_tokens_seen": 176453915, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.73828125, + "step": 8206, + "time_per_iteration": 2.4607954025268555 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01026736, + "balance_loss_clip": 1.01473176, + "balance_loss_mlp": 1.03630567, + "epoch": 0.49343153464602435, + "flos": 27890130049920.0, + "grad_norm": 1.822649710507408, + "language_loss": 0.76363301, + "learning_rate": 2.1395086306771196e-06, + "loss": 0.78496289, + "num_input_tokens_seen": 176475175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 8207, + "time_per_iteration": 2.508553981781006 + }, + { + "auxiliary_loss_clip": 0.01109244, + "auxiliary_loss_mlp": 0.01032575, + "balance_loss_clip": 1.01912785, + "balance_loss_mlp": 1.03869963, + "epoch": 0.4934916578986923, + "flos": 24681080983680.0, + "grad_norm": 2.308112072386131, + "language_loss": 0.59984541, + "learning_rate": 2.1391201150033147e-06, + "loss": 0.62126362, + "num_input_tokens_seen": 176494250, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8208, + "time_per_iteration": 2.505990982055664 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.01622033, + "balance_loss_mlp": 1.03816974, + "epoch": 0.4935517811513603, + "flos": 23405139089280.0, + "grad_norm": 2.3772506823576407, + "language_loss": 0.7851491, + "learning_rate": 2.1387315940540598e-06, + "loss": 0.80653256, + "num_input_tokens_seen": 176513325, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8209, + "time_per_iteration": 2.4622652530670166 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01030424, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.03630066, + "epoch": 0.49361190440402825, + "flos": 21944508439680.0, + "grad_norm": 1.7984719462813816, + "language_loss": 0.78806269, + "learning_rate": 2.138343067844089e-06, + "loss": 0.80942488, + "num_input_tokens_seen": 176532915, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8210, + "time_per_iteration": 2.4884698390960693 + }, + { + "auxiliary_loss_clip": 0.01111365, + "auxiliary_loss_mlp": 0.01032283, + "balance_loss_clip": 1.01888382, + "balance_loss_mlp": 1.0381912, + "epoch": 0.4936720276566962, + "flos": 25115671635840.0, + "grad_norm": 2.2650712316686903, + "language_loss": 0.81229484, + "learning_rate": 2.1379545363881363e-06, + "loss": 0.83373135, + "num_input_tokens_seen": 176552775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8211, + "time_per_iteration": 2.4839043617248535 + }, + { + "auxiliary_loss_clip": 0.01109974, + "auxiliary_loss_mlp": 0.01036004, + "balance_loss_clip": 1.02280188, + "balance_loss_mlp": 1.03911519, + "epoch": 0.4937321509093642, + "flos": 26358935132160.0, + "grad_norm": 2.6136684102444665, + "language_loss": 0.91496241, + "learning_rate": 2.137565999700933e-06, + "loss": 0.93642217, + "num_input_tokens_seen": 176572185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8212, + "time_per_iteration": 2.5103862285614014 + }, + { + "auxiliary_loss_clip": 0.01106972, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.01925647, + "balance_loss_mlp": 1.036484, + "epoch": 0.49379227416203214, + "flos": 22961390469120.0, + "grad_norm": 1.7787072133843917, + "language_loss": 0.64901662, + "learning_rate": 2.1371774577972138e-06, + "loss": 0.670403, + "num_input_tokens_seen": 176591490, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8213, + "time_per_iteration": 2.460123300552368 + }, + { + "auxiliary_loss_clip": 0.01106125, + "auxiliary_loss_mlp": 0.01027241, + "balance_loss_clip": 1.01356125, + "balance_loss_mlp": 1.03668904, + "epoch": 0.49385239741470016, + "flos": 32489101843200.0, + "grad_norm": 1.9389339120527038, + "language_loss": 0.75199962, + "learning_rate": 2.136788910691711e-06, + "loss": 0.77333331, + "num_input_tokens_seen": 176612715, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69140625, + "step": 8214, + "time_per_iteration": 2.5719900131225586 + }, + { + "auxiliary_loss_clip": 0.01109359, + "auxiliary_loss_mlp": 0.01035274, + "balance_loss_clip": 1.02212512, + "balance_loss_mlp": 1.03959298, + "epoch": 0.4939125206673681, + "flos": 22492864442880.0, + "grad_norm": 1.828808325177945, + "language_loss": 0.84395385, + "learning_rate": 2.1364003583991594e-06, + "loss": 0.86540014, + "num_input_tokens_seen": 176631950, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8215, + "time_per_iteration": 2.468804121017456 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102806, + "balance_loss_clip": 1.01656199, + "balance_loss_mlp": 1.03478694, + "epoch": 0.4939726439200361, + "flos": 31176351486720.0, + "grad_norm": 1.6051587100805058, + "language_loss": 0.82859147, + "learning_rate": 2.136011800934292e-06, + "loss": 0.84988439, + "num_input_tokens_seen": 176653060, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 8216, + "time_per_iteration": 2.5819287300109863 + }, + { + "auxiliary_loss_clip": 0.01107134, + "auxiliary_loss_mlp": 0.01031262, + "balance_loss_clip": 1.01918006, + "balance_loss_mlp": 1.03821325, + "epoch": 0.49403276717270406, + "flos": 22674213233280.0, + "grad_norm": 1.4383830441547378, + "language_loss": 0.74774921, + "learning_rate": 2.1356232383118442e-06, + "loss": 0.76913321, + "num_input_tokens_seen": 176673895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8217, + "time_per_iteration": 2.4628379344940186 + }, + { + "auxiliary_loss_clip": 0.01104285, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.01928544, + "balance_loss_mlp": 1.03777707, + "epoch": 0.494092890425372, + "flos": 20741070147840.0, + "grad_norm": 1.733886360732455, + "language_loss": 0.78829861, + "learning_rate": 2.1352346705465494e-06, + "loss": 0.80966723, + "num_input_tokens_seen": 176692550, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 8218, + "time_per_iteration": 2.4809412956237793 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01035085, + "balance_loss_clip": 1.02269292, + "balance_loss_mlp": 1.03510332, + "epoch": 0.49415301367804, + "flos": 18369026778240.0, + "grad_norm": 2.0240627965271187, + "language_loss": 0.76301086, + "learning_rate": 2.134846097653142e-06, + "loss": 0.78438151, + "num_input_tokens_seen": 176709335, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 8219, + "time_per_iteration": 3.8202009201049805 + }, + { + "auxiliary_loss_clip": 0.01108105, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02106571, + "balance_loss_mlp": 1.03764367, + "epoch": 0.49421313693070795, + "flos": 17530620451200.0, + "grad_norm": 1.6690505128843895, + "language_loss": 0.6190055, + "learning_rate": 2.134457519646357e-06, + "loss": 0.64042592, + "num_input_tokens_seen": 176727715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8220, + "time_per_iteration": 2.453230857849121 + }, + { + "auxiliary_loss_clip": 0.01106287, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.01656425, + "balance_loss_mlp": 1.03672814, + "epoch": 0.4942732601833759, + "flos": 20812173120000.0, + "grad_norm": 1.7319378421104112, + "language_loss": 0.72381485, + "learning_rate": 2.1340689365409296e-06, + "loss": 0.74517179, + "num_input_tokens_seen": 176747530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8221, + "time_per_iteration": 5.506774187088013 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01037169, + "balance_loss_clip": 1.02521193, + "balance_loss_mlp": 1.04006767, + "epoch": 0.4943333834360439, + "flos": 15048941794560.0, + "grad_norm": 1.681203667545881, + "language_loss": 0.79131603, + "learning_rate": 2.133680348351595e-06, + "loss": 0.81275266, + "num_input_tokens_seen": 176765260, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 8222, + "time_per_iteration": 2.491175889968872 + }, + { + "auxiliary_loss_clip": 0.01108448, + "auxiliary_loss_mlp": 0.01034922, + "balance_loss_clip": 1.02147555, + "balance_loss_mlp": 1.03941715, + "epoch": 0.49439350668871185, + "flos": 16070420764800.0, + "grad_norm": 2.3506903054927015, + "language_loss": 0.73205507, + "learning_rate": 2.133291755093088e-06, + "loss": 0.75348878, + "num_input_tokens_seen": 176781770, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69140625, + "step": 8223, + "time_per_iteration": 2.4359662532806396 + }, + { + "auxiliary_loss_clip": 0.01109917, + "auxiliary_loss_mlp": 0.01035963, + "balance_loss_clip": 1.02264762, + "balance_loss_mlp": 1.03850269, + "epoch": 0.4944536299413798, + "flos": 20880079781760.0, + "grad_norm": 1.7533498543998463, + "language_loss": 0.75144434, + "learning_rate": 2.132903156780144e-06, + "loss": 0.7729032, + "num_input_tokens_seen": 176800655, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 8224, + "time_per_iteration": 2.5716288089752197 + }, + { + "auxiliary_loss_clip": 0.01111376, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01807868, + "balance_loss_mlp": 1.04080439, + "epoch": 0.4945137531940478, + "flos": 26608908856320.0, + "grad_norm": 2.086998261136206, + "language_loss": 0.63982892, + "learning_rate": 2.1325145534274997e-06, + "loss": 0.66124696, + "num_input_tokens_seen": 176820610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8225, + "time_per_iteration": 2.524048089981079 + }, + { + "auxiliary_loss_clip": 0.01107484, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.0206579, + "balance_loss_mlp": 1.03766608, + "epoch": 0.49457387644671574, + "flos": 23988148738560.0, + "grad_norm": 1.839126557537864, + "language_loss": 0.76359057, + "learning_rate": 2.1321259450498893e-06, + "loss": 0.78499651, + "num_input_tokens_seen": 176840520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8226, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01109174, + "auxiliary_loss_mlp": 0.01039999, + "balance_loss_clip": 1.02578914, + "balance_loss_mlp": 1.03735518, + "epoch": 0.49463399969938376, + "flos": 26976598427520.0, + "grad_norm": 1.6377261486682646, + "language_loss": 0.71156305, + "learning_rate": 2.131737331662051e-06, + "loss": 0.73305476, + "num_input_tokens_seen": 176860265, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8227, + "time_per_iteration": 2.4763920307159424 + }, + { + "auxiliary_loss_clip": 0.01112289, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02160668, + "balance_loss_mlp": 1.03914213, + "epoch": 0.49469412295205173, + "flos": 29681534067840.0, + "grad_norm": 1.614424212368193, + "language_loss": 0.71484196, + "learning_rate": 2.131348713278718e-06, + "loss": 0.73631173, + "num_input_tokens_seen": 176882910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8228, + "time_per_iteration": 2.550083637237549 + }, + { + "auxiliary_loss_clip": 0.01105792, + "auxiliary_loss_mlp": 0.0103118, + "balance_loss_clip": 1.01829386, + "balance_loss_mlp": 1.03837276, + "epoch": 0.4947542462047197, + "flos": 24131791226880.0, + "grad_norm": 1.6200219454444607, + "language_loss": 0.83788311, + "learning_rate": 2.1309600899146304e-06, + "loss": 0.85925281, + "num_input_tokens_seen": 176903030, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8229, + "time_per_iteration": 2.474684238433838 + }, + { + "auxiliary_loss_clip": 0.01108289, + "auxiliary_loss_mlp": 0.01035108, + "balance_loss_clip": 1.02103567, + "balance_loss_mlp": 1.03685689, + "epoch": 0.49481436945738766, + "flos": 20045049333120.0, + "grad_norm": 2.055489394198818, + "language_loss": 0.75105131, + "learning_rate": 2.1305714615845227e-06, + "loss": 0.77248526, + "num_input_tokens_seen": 176919025, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8230, + "time_per_iteration": 2.506950616836548 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01027613, + "balance_loss_clip": 1.01497638, + "balance_loss_mlp": 1.03868175, + "epoch": 0.4948744927100556, + "flos": 15669550005120.0, + "grad_norm": 2.703005059233118, + "language_loss": 0.79713035, + "learning_rate": 2.1301828283031314e-06, + "loss": 0.8184967, + "num_input_tokens_seen": 176937945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8231, + "time_per_iteration": 2.4176137447357178 + }, + { + "auxiliary_loss_clip": 0.01035427, + "auxiliary_loss_mlp": 0.01002857, + "balance_loss_clip": 1.00169492, + "balance_loss_mlp": 1.01191425, + "epoch": 0.4949346159627236, + "flos": 68872071502080.0, + "grad_norm": 0.7419788553124401, + "language_loss": 0.60237485, + "learning_rate": 2.1297941900851944e-06, + "loss": 0.62275773, + "num_input_tokens_seen": 177004575, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.23535156, + "step": 8232, + "time_per_iteration": 3.183783531188965 + }, + { + "auxiliary_loss_clip": 0.0111307, + "auxiliary_loss_mlp": 0.01035045, + "balance_loss_clip": 1.02119923, + "balance_loss_mlp": 1.03889871, + "epoch": 0.49499473921539155, + "flos": 24790285307520.0, + "grad_norm": 1.7147216218758814, + "language_loss": 0.69257128, + "learning_rate": 2.1294055469454496e-06, + "loss": 0.71405244, + "num_input_tokens_seen": 177024155, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7421875, + "step": 8233, + "time_per_iteration": 2.477755546569824 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01035041, + "balance_loss_clip": 1.02111769, + "balance_loss_mlp": 1.03714275, + "epoch": 0.4950548624680595, + "flos": 32707905540480.0, + "grad_norm": 3.246275947254348, + "language_loss": 0.6678468, + "learning_rate": 2.129016898898633e-06, + "loss": 0.68926585, + "num_input_tokens_seen": 177046185, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.6953125, + "step": 8234, + "time_per_iteration": 2.5594117641448975 + }, + { + "auxiliary_loss_clip": 0.0103478, + "auxiliary_loss_mlp": 0.01003988, + "balance_loss_clip": 1.00288522, + "balance_loss_mlp": 1.01140106, + "epoch": 0.4951149857207275, + "flos": 50082173066880.0, + "grad_norm": 0.8288840425421409, + "language_loss": 0.57987183, + "learning_rate": 2.128628245959482e-06, + "loss": 0.60025948, + "num_input_tokens_seen": 177099025, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.234375, + "step": 8235, + "time_per_iteration": 3.0041370391845703 + }, + { + "auxiliary_loss_clip": 0.01108932, + "auxiliary_loss_mlp": 0.01037443, + "balance_loss_clip": 1.02345991, + "balance_loss_mlp": 1.03770208, + "epoch": 0.49517510897339545, + "flos": 22236785406720.0, + "grad_norm": 1.4917768542550827, + "language_loss": 0.76824737, + "learning_rate": 2.1282395881427355e-06, + "loss": 0.78971112, + "num_input_tokens_seen": 177118365, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8236, + "time_per_iteration": 2.498105525970459 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01032214, + "balance_loss_clip": 1.01948881, + "balance_loss_mlp": 1.03860247, + "epoch": 0.4952352322260634, + "flos": 25374120969600.0, + "grad_norm": 1.8006519774313887, + "language_loss": 0.72554326, + "learning_rate": 2.1278509254631315e-06, + "loss": 0.74694312, + "num_input_tokens_seen": 177136415, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8237, + "time_per_iteration": 2.487849473953247 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032739, + "balance_loss_clip": 1.02024627, + "balance_loss_mlp": 1.03722131, + "epoch": 0.4952953554787314, + "flos": 24608721035520.0, + "grad_norm": 1.8061825502363815, + "language_loss": 0.75687563, + "learning_rate": 2.127462257935406e-06, + "loss": 0.77825987, + "num_input_tokens_seen": 177155690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8238, + "time_per_iteration": 2.4926116466522217 + }, + { + "auxiliary_loss_clip": 0.01110283, + "auxiliary_loss_mlp": 0.01034271, + "balance_loss_clip": 1.02057362, + "balance_loss_mlp": 1.03765702, + "epoch": 0.49535547873139935, + "flos": 17311278049920.0, + "grad_norm": 2.197202607879525, + "language_loss": 0.73434591, + "learning_rate": 2.1270735855743008e-06, + "loss": 0.75579149, + "num_input_tokens_seen": 177173350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8239, + "time_per_iteration": 2.4181203842163086 + }, + { + "auxiliary_loss_clip": 0.01109997, + "auxiliary_loss_mlp": 0.01037672, + "balance_loss_clip": 1.02266932, + "balance_loss_mlp": 1.03704619, + "epoch": 0.4954156019840673, + "flos": 20740315962240.0, + "grad_norm": 2.4131176994917936, + "language_loss": 0.78344893, + "learning_rate": 2.126684908394552e-06, + "loss": 0.80492562, + "num_input_tokens_seen": 177191115, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.73046875, + "step": 8240, + "time_per_iteration": 2.479642391204834 + }, + { + "auxiliary_loss_clip": 0.01104608, + "auxiliary_loss_mlp": 0.01040833, + "balance_loss_clip": 1.02865601, + "balance_loss_mlp": 1.03746533, + "epoch": 0.49547572523673533, + "flos": 12820684567680.0, + "grad_norm": 2.0234307188816993, + "language_loss": 0.85579056, + "learning_rate": 2.126296226410898e-06, + "loss": 0.87724495, + "num_input_tokens_seen": 177206155, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8241, + "time_per_iteration": 2.4081263542175293 + }, + { + "auxiliary_loss_clip": 0.01106442, + "auxiliary_loss_mlp": 0.01035574, + "balance_loss_clip": 1.02337933, + "balance_loss_mlp": 1.03813624, + "epoch": 0.4955358484894033, + "flos": 15597046402560.0, + "grad_norm": 1.761079127200854, + "language_loss": 0.77041149, + "learning_rate": 2.1259075396380794e-06, + "loss": 0.79183173, + "num_input_tokens_seen": 177224815, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8242, + "time_per_iteration": 2.4439215660095215 + }, + { + "auxiliary_loss_clip": 0.01106589, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821673, + "balance_loss_mlp": 1.03676701, + "epoch": 0.49559597174207126, + "flos": 26464368528000.0, + "grad_norm": 1.7216813067847012, + "language_loss": 0.67493725, + "learning_rate": 2.125518848090833e-06, + "loss": 0.6963132, + "num_input_tokens_seen": 177244490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 8243, + "time_per_iteration": 2.4888081550598145 + }, + { + "auxiliary_loss_clip": 0.01107757, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01805878, + "balance_loss_mlp": 1.03910422, + "epoch": 0.4956560949947392, + "flos": 23148234040320.0, + "grad_norm": 1.8355775234908949, + "language_loss": 0.68218768, + "learning_rate": 2.125130151783901e-06, + "loss": 0.70357001, + "num_input_tokens_seen": 177264340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8244, + "time_per_iteration": 2.481220245361328 + }, + { + "auxiliary_loss_clip": 0.01109231, + "auxiliary_loss_mlp": 0.01035961, + "balance_loss_clip": 1.02201915, + "balance_loss_mlp": 1.03828287, + "epoch": 0.4957162182474072, + "flos": 20773461237120.0, + "grad_norm": 1.8414695050792438, + "language_loss": 0.74998277, + "learning_rate": 2.12474145073202e-06, + "loss": 0.77143466, + "num_input_tokens_seen": 177283055, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8245, + "time_per_iteration": 2.459244728088379 + }, + { + "auxiliary_loss_clip": 0.01105994, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01628923, + "balance_loss_mlp": 1.03797877, + "epoch": 0.49577634150007516, + "flos": 18734202397440.0, + "grad_norm": 3.047248940663427, + "language_loss": 0.81496358, + "learning_rate": 2.1243527449499306e-06, + "loss": 0.83631527, + "num_input_tokens_seen": 177301140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 8246, + "time_per_iteration": 2.54664945602417 + }, + { + "auxiliary_loss_clip": 0.01110421, + "auxiliary_loss_mlp": 0.01039175, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03858495, + "epoch": 0.4958364647527431, + "flos": 25554176870400.0, + "grad_norm": 1.7095262667552558, + "language_loss": 0.83750397, + "learning_rate": 2.1239640344523733e-06, + "loss": 0.85899985, + "num_input_tokens_seen": 177323095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8247, + "time_per_iteration": 2.478410482406616 + }, + { + "auxiliary_loss_clip": 0.01111487, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.01897812, + "balance_loss_mlp": 1.04011726, + "epoch": 0.4958965880054111, + "flos": 24425325169920.0, + "grad_norm": 2.0177325188605018, + "language_loss": 0.83758432, + "learning_rate": 2.123575319254087e-06, + "loss": 0.85900903, + "num_input_tokens_seen": 177339845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 8248, + "time_per_iteration": 2.490619659423828 + }, + { + "auxiliary_loss_clip": 0.01109734, + "auxiliary_loss_mlp": 0.01028278, + "balance_loss_clip": 1.01518941, + "balance_loss_mlp": 1.03800774, + "epoch": 0.49595671125807905, + "flos": 25083460114560.0, + "grad_norm": 2.055191909263014, + "language_loss": 0.73715985, + "learning_rate": 2.123186599369812e-06, + "loss": 0.75853992, + "num_input_tokens_seen": 177359980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8249, + "time_per_iteration": 2.5232534408569336 + }, + { + "auxiliary_loss_clip": 0.0111234, + "auxiliary_loss_mlp": 0.01038359, + "balance_loss_clip": 1.02504992, + "balance_loss_mlp": 1.04018188, + "epoch": 0.496016834510747, + "flos": 16435883692800.0, + "grad_norm": 1.9063816639589337, + "language_loss": 0.76176995, + "learning_rate": 2.122797874814289e-06, + "loss": 0.78327698, + "num_input_tokens_seen": 177378580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8250, + "time_per_iteration": 2.5368192195892334 + }, + { + "auxiliary_loss_clip": 0.01108406, + "auxiliary_loss_mlp": 0.0103451, + "balance_loss_clip": 1.02170718, + "balance_loss_mlp": 1.03792036, + "epoch": 0.496076957763415, + "flos": 23437925228160.0, + "grad_norm": 1.615677709430237, + "language_loss": 0.69986647, + "learning_rate": 2.1224091456022585e-06, + "loss": 0.72129565, + "num_input_tokens_seen": 177398790, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8251, + "time_per_iteration": 2.4543070793151855 + }, + { + "auxiliary_loss_clip": 0.01108023, + "auxiliary_loss_mlp": 0.01027913, + "balance_loss_clip": 1.01586699, + "balance_loss_mlp": 1.03890181, + "epoch": 0.49613708101608295, + "flos": 16909509450240.0, + "grad_norm": 1.8749041446582064, + "language_loss": 0.79864365, + "learning_rate": 2.122020411748461e-06, + "loss": 0.82000297, + "num_input_tokens_seen": 177416515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8252, + "time_per_iteration": 2.4386792182922363 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01384854, + "balance_loss_mlp": 1.03821409, + "epoch": 0.4961972042687509, + "flos": 16618094409600.0, + "grad_norm": 1.7863838823967775, + "language_loss": 0.80688357, + "learning_rate": 2.1216316732676363e-06, + "loss": 0.82825357, + "num_input_tokens_seen": 177434425, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.70703125, + "step": 8253, + "time_per_iteration": 2.440727710723877 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01863384, + "balance_loss_mlp": 1.03654194, + "epoch": 0.49625732752141893, + "flos": 28956749437440.0, + "grad_norm": 1.548882190492268, + "language_loss": 0.67088544, + "learning_rate": 2.1212429301745275e-06, + "loss": 0.69224173, + "num_input_tokens_seen": 177459675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 8254, + "time_per_iteration": 2.575490951538086 + }, + { + "auxiliary_loss_clip": 0.011067, + "auxiliary_loss_mlp": 0.01035621, + "balance_loss_clip": 1.02257323, + "balance_loss_mlp": 1.03522658, + "epoch": 0.4963174507740869, + "flos": 23112359331840.0, + "grad_norm": 1.5646536445016186, + "language_loss": 0.73859739, + "learning_rate": 2.1208541824838743e-06, + "loss": 0.76002055, + "num_input_tokens_seen": 177478895, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8255, + "time_per_iteration": 2.478703498840332 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01034939, + "balance_loss_clip": 1.02208281, + "balance_loss_mlp": 1.0362165, + "epoch": 0.49637757402675486, + "flos": 13917863450880.0, + "grad_norm": 1.8563521426834817, + "language_loss": 0.81378329, + "learning_rate": 2.1204654302104183e-06, + "loss": 0.8351903, + "num_input_tokens_seen": 177494920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8256, + "time_per_iteration": 2.4312291145324707 + }, + { + "auxiliary_loss_clip": 0.01105024, + "auxiliary_loss_mlp": 0.01024955, + "balance_loss_clip": 1.01246178, + "balance_loss_mlp": 1.03679466, + "epoch": 0.49643769727942283, + "flos": 22309001700480.0, + "grad_norm": 1.8572652078491616, + "language_loss": 0.80710369, + "learning_rate": 2.120076673368901e-06, + "loss": 0.82840347, + "num_input_tokens_seen": 177515455, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 8257, + "time_per_iteration": 2.4589884281158447 + }, + { + "auxiliary_loss_clip": 0.01111951, + "auxiliary_loss_mlp": 0.0103531, + "balance_loss_clip": 1.02173841, + "balance_loss_mlp": 1.03759003, + "epoch": 0.4964978205320908, + "flos": 19500248776320.0, + "grad_norm": 2.788575980623821, + "language_loss": 0.66533971, + "learning_rate": 2.1196879119740647e-06, + "loss": 0.68681228, + "num_input_tokens_seen": 177534040, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 8258, + "time_per_iteration": 2.477653741836548 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01674283, + "balance_loss_mlp": 1.03566313, + "epoch": 0.49655794378475876, + "flos": 23436524597760.0, + "grad_norm": 2.207120440649978, + "language_loss": 0.77672231, + "learning_rate": 2.1192991460406502e-06, + "loss": 0.79804647, + "num_input_tokens_seen": 177554510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8259, + "time_per_iteration": 2.482516050338745 + }, + { + "auxiliary_loss_clip": 0.01107983, + "auxiliary_loss_mlp": 0.01030822, + "balance_loss_clip": 1.01802468, + "balance_loss_mlp": 1.03903294, + "epoch": 0.4966180670374267, + "flos": 26831124345600.0, + "grad_norm": 1.536511866358609, + "language_loss": 0.78612608, + "learning_rate": 2.1189103755834e-06, + "loss": 0.80751413, + "num_input_tokens_seen": 177575780, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8260, + "time_per_iteration": 4.0255560874938965 + }, + { + "auxiliary_loss_clip": 0.0110786, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01785684, + "balance_loss_mlp": 1.03662324, + "epoch": 0.4966781902900947, + "flos": 22009326531840.0, + "grad_norm": 4.674193904345997, + "language_loss": 0.76227403, + "learning_rate": 2.1185216006170573e-06, + "loss": 0.78365964, + "num_input_tokens_seen": 177588965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8261, + "time_per_iteration": 2.537996530532837 + }, + { + "auxiliary_loss_clip": 0.01104467, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01622844, + "balance_loss_mlp": 1.03667367, + "epoch": 0.49673831354276266, + "flos": 26213353309440.0, + "grad_norm": 1.9998040798137362, + "language_loss": 0.89328134, + "learning_rate": 2.1181328211563627e-06, + "loss": 0.91460943, + "num_input_tokens_seen": 177608425, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8262, + "time_per_iteration": 5.405071020126343 + }, + { + "auxiliary_loss_clip": 0.01104636, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.0172143, + "balance_loss_mlp": 1.03765512, + "epoch": 0.4967984367954306, + "flos": 23182277155200.0, + "grad_norm": 1.4087924984120455, + "language_loss": 0.73918653, + "learning_rate": 2.11774403721606e-06, + "loss": 0.76052761, + "num_input_tokens_seen": 177628240, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8263, + "time_per_iteration": 3.9610228538513184 + }, + { + "auxiliary_loss_clip": 0.01112691, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.0196991, + "balance_loss_mlp": 1.04077482, + "epoch": 0.4968585600480986, + "flos": 19281445079040.0, + "grad_norm": 2.641620630884259, + "language_loss": 0.69445115, + "learning_rate": 2.1173552488108923e-06, + "loss": 0.71591461, + "num_input_tokens_seen": 177645920, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71875, + "step": 8264, + "time_per_iteration": 2.4799907207489014 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01585722, + "balance_loss_mlp": 1.03470981, + "epoch": 0.49691868330076655, + "flos": 22528703237760.0, + "grad_norm": 1.3808235907294704, + "language_loss": 0.64915001, + "learning_rate": 2.1169664559556007e-06, + "loss": 0.67049909, + "num_input_tokens_seen": 177667185, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8265, + "time_per_iteration": 2.491708517074585 + }, + { + "auxiliary_loss_clip": 0.01034788, + "auxiliary_loss_mlp": 0.01001781, + "balance_loss_clip": 1.00064886, + "balance_loss_mlp": 1.01169205, + "epoch": 0.4969788065534345, + "flos": 66577128675840.0, + "grad_norm": 0.8684712318419048, + "language_loss": 0.53446817, + "learning_rate": 2.1165776586649304e-06, + "loss": 0.55483389, + "num_input_tokens_seen": 177733020, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.23144531, + "step": 8266, + "time_per_iteration": 3.1343002319335938 + }, + { + "auxiliary_loss_clip": 0.01104137, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01567531, + "balance_loss_mlp": 1.03706813, + "epoch": 0.49703892980610254, + "flos": 24059503105920.0, + "grad_norm": 3.469499482915289, + "language_loss": 0.79616332, + "learning_rate": 2.1161888569536223e-06, + "loss": 0.81748462, + "num_input_tokens_seen": 177753370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8267, + "time_per_iteration": 2.5316126346588135 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01032461, + "balance_loss_clip": 1.01856148, + "balance_loss_mlp": 1.03869104, + "epoch": 0.4970990530587705, + "flos": 29126174912640.0, + "grad_norm": 2.5132671844419434, + "language_loss": 0.74805677, + "learning_rate": 2.1158000508364223e-06, + "loss": 0.76947474, + "num_input_tokens_seen": 177771530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8268, + "time_per_iteration": 2.5102896690368652 + }, + { + "auxiliary_loss_clip": 0.0110689, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.01998329, + "balance_loss_mlp": 1.0366255, + "epoch": 0.49715917631143847, + "flos": 46026167258880.0, + "grad_norm": 1.9572065929893177, + "language_loss": 0.67818397, + "learning_rate": 2.115411240328073e-06, + "loss": 0.6995914, + "num_input_tokens_seen": 177796355, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8269, + "time_per_iteration": 2.7194817066192627 + }, + { + "auxiliary_loss_clip": 0.0110431, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.01968217, + "balance_loss_mlp": 1.03744197, + "epoch": 0.49721929956410643, + "flos": 20191277600640.0, + "grad_norm": 1.6139896668987463, + "language_loss": 0.85450721, + "learning_rate": 2.1150224254433167e-06, + "loss": 0.87587237, + "num_input_tokens_seen": 177814300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 8270, + "time_per_iteration": 2.4423561096191406 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01834702, + "balance_loss_mlp": 1.03857064, + "epoch": 0.4972794228167744, + "flos": 21653560275840.0, + "grad_norm": 1.6811398863814482, + "language_loss": 0.71087623, + "learning_rate": 2.114633606196899e-06, + "loss": 0.73225504, + "num_input_tokens_seen": 177833615, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.69921875, + "step": 8271, + "time_per_iteration": 2.54892635345459 + }, + { + "auxiliary_loss_clip": 0.01109407, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.03880143, + "epoch": 0.49733954606944236, + "flos": 24279743347200.0, + "grad_norm": 1.4557340389451365, + "language_loss": 0.7848624, + "learning_rate": 2.1142447826035635e-06, + "loss": 0.80625331, + "num_input_tokens_seen": 177855315, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8272, + "time_per_iteration": 2.462470054626465 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01035441, + "balance_loss_clip": 1.02275074, + "balance_loss_mlp": 1.03950167, + "epoch": 0.4973996693221103, + "flos": 37852575730560.0, + "grad_norm": 2.5057831430835686, + "language_loss": 0.66278791, + "learning_rate": 2.1138559546780544e-06, + "loss": 0.68423879, + "num_input_tokens_seen": 177875590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8273, + "time_per_iteration": 2.6735026836395264 + }, + { + "auxiliary_loss_clip": 0.01109746, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.01891851, + "balance_loss_mlp": 1.03968048, + "epoch": 0.4974597925747783, + "flos": 21361426963200.0, + "grad_norm": 1.871691944459235, + "language_loss": 0.77977264, + "learning_rate": 2.1134671224351163e-06, + "loss": 0.80118477, + "num_input_tokens_seen": 177894175, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8274, + "time_per_iteration": 2.462465763092041 + }, + { + "auxiliary_loss_clip": 0.01110748, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.01864374, + "balance_loss_mlp": 1.03865933, + "epoch": 0.49751991582744626, + "flos": 30738133560960.0, + "grad_norm": 2.0388244744713724, + "language_loss": 0.75829184, + "learning_rate": 2.113078285889493e-06, + "loss": 0.77971983, + "num_input_tokens_seen": 177913920, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 8275, + "time_per_iteration": 2.6034398078918457 + }, + { + "auxiliary_loss_clip": 0.01110746, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.01974416, + "balance_loss_mlp": 1.03761268, + "epoch": 0.4975800390801142, + "flos": 14100541044480.0, + "grad_norm": 1.9341151140441402, + "language_loss": 0.8392635, + "learning_rate": 2.1126894450559303e-06, + "loss": 0.86071479, + "num_input_tokens_seen": 177930425, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.734375, + "step": 8276, + "time_per_iteration": 2.435999870300293 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01664937, + "balance_loss_mlp": 1.03633988, + "epoch": 0.4976401623327822, + "flos": 24207275658240.0, + "grad_norm": 1.3535075156355831, + "language_loss": 0.70188868, + "learning_rate": 2.112300599949172e-06, + "loss": 0.72319949, + "num_input_tokens_seen": 177949885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 8277, + "time_per_iteration": 2.5726187229156494 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.01032526, + "balance_loss_clip": 1.01952052, + "balance_loss_mlp": 1.03669858, + "epoch": 0.49770028558545015, + "flos": 21136769349120.0, + "grad_norm": 1.773647946812319, + "language_loss": 0.82609779, + "learning_rate": 2.111911750583964e-06, + "loss": 0.84747648, + "num_input_tokens_seen": 177965720, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8278, + "time_per_iteration": 2.4459898471832275 + }, + { + "auxiliary_loss_clip": 0.01108155, + "auxiliary_loss_mlp": 0.01032407, + "balance_loss_clip": 1.01992559, + "balance_loss_mlp": 1.03671384, + "epoch": 0.4977604088381181, + "flos": 16763927627520.0, + "grad_norm": 1.8017237706358624, + "language_loss": 0.6784246, + "learning_rate": 2.111522896975052e-06, + "loss": 0.69983023, + "num_input_tokens_seen": 177983190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8279, + "time_per_iteration": 2.4793283939361572 + }, + { + "auxiliary_loss_clip": 0.0110668, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.0204277, + "balance_loss_mlp": 1.03561902, + "epoch": 0.49782053209078614, + "flos": 15703521292800.0, + "grad_norm": 1.9740212049853438, + "language_loss": 0.70469928, + "learning_rate": 2.1111340391371794e-06, + "loss": 0.72610998, + "num_input_tokens_seen": 178000155, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8280, + "time_per_iteration": 2.427778482437134 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.02028, + "balance_loss_mlp": 1.03475237, + "epoch": 0.4978806553434541, + "flos": 24753692327040.0, + "grad_norm": 1.6232736941666084, + "language_loss": 0.64461923, + "learning_rate": 2.1107451770850936e-06, + "loss": 0.66599762, + "num_input_tokens_seen": 178021060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8281, + "time_per_iteration": 2.511054515838623 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01035185, + "balance_loss_clip": 1.02175605, + "balance_loss_mlp": 1.03830338, + "epoch": 0.49794077859612207, + "flos": 13115726881920.0, + "grad_norm": 1.82873470978674, + "language_loss": 0.72714734, + "learning_rate": 2.1103563108335387e-06, + "loss": 0.74859279, + "num_input_tokens_seen": 178038180, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8282, + "time_per_iteration": 2.417059898376465 + }, + { + "auxiliary_loss_clip": 0.01103243, + "auxiliary_loss_mlp": 0.01029512, + "balance_loss_clip": 1.01804423, + "balance_loss_mlp": 1.03591275, + "epoch": 0.49800090184879003, + "flos": 27525133998720.0, + "grad_norm": 1.6753255120783885, + "language_loss": 0.73373008, + "learning_rate": 2.109967440397263e-06, + "loss": 0.75505757, + "num_input_tokens_seen": 178057565, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 8283, + "time_per_iteration": 2.531747341156006 + }, + { + "auxiliary_loss_clip": 0.01106082, + "auxiliary_loss_mlp": 0.01037237, + "balance_loss_clip": 1.02446926, + "balance_loss_mlp": 1.03696167, + "epoch": 0.498061025101458, + "flos": 19792489829760.0, + "grad_norm": 1.6101503544989328, + "language_loss": 0.78866243, + "learning_rate": 2.1095785657910095e-06, + "loss": 0.81009555, + "num_input_tokens_seen": 178076965, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8284, + "time_per_iteration": 2.4609432220458984 + }, + { + "auxiliary_loss_clip": 0.01113439, + "auxiliary_loss_mlp": 0.01038109, + "balance_loss_clip": 1.02398884, + "balance_loss_mlp": 1.0390476, + "epoch": 0.49812114835412596, + "flos": 29893909230720.0, + "grad_norm": 1.8191212695174297, + "language_loss": 0.73705399, + "learning_rate": 2.109189687029526e-06, + "loss": 0.75856948, + "num_input_tokens_seen": 178095105, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.74609375, + "step": 8285, + "time_per_iteration": 2.5364696979522705 + }, + { + "auxiliary_loss_clip": 0.01112037, + "auxiliary_loss_mlp": 0.01032281, + "balance_loss_clip": 1.01872683, + "balance_loss_mlp": 1.0420599, + "epoch": 0.49818127160679393, + "flos": 23147048891520.0, + "grad_norm": 1.6445235471758528, + "language_loss": 0.74477649, + "learning_rate": 2.1088008041275598e-06, + "loss": 0.76621962, + "num_input_tokens_seen": 178114505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 8286, + "time_per_iteration": 2.4888620376586914 + }, + { + "auxiliary_loss_clip": 0.01112849, + "auxiliary_loss_mlp": 0.0104004, + "balance_loss_clip": 1.02713549, + "balance_loss_mlp": 1.04156506, + "epoch": 0.4982413948594619, + "flos": 21652806090240.0, + "grad_norm": 1.7365216069979077, + "language_loss": 0.85467643, + "learning_rate": 2.1084119170998545e-06, + "loss": 0.87620533, + "num_input_tokens_seen": 178131595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8287, + "time_per_iteration": 2.5058188438415527 + }, + { + "auxiliary_loss_clip": 0.01107755, + "auxiliary_loss_mlp": 0.01025542, + "balance_loss_clip": 1.01267338, + "balance_loss_mlp": 1.03729916, + "epoch": 0.49830151811212986, + "flos": 32486982940800.0, + "grad_norm": 1.6348463305948138, + "language_loss": 0.72363204, + "learning_rate": 2.108023025961159e-06, + "loss": 0.74496502, + "num_input_tokens_seen": 178152055, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8288, + "time_per_iteration": 2.528475046157837 + }, + { + "auxiliary_loss_clip": 0.0111456, + "auxiliary_loss_mlp": 0.01038212, + "balance_loss_clip": 1.02319193, + "balance_loss_mlp": 1.04041409, + "epoch": 0.4983616413647978, + "flos": 18142358002560.0, + "grad_norm": 2.900373689725773, + "language_loss": 0.80002087, + "learning_rate": 2.10763413072622e-06, + "loss": 0.82154852, + "num_input_tokens_seen": 178168150, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7421875, + "step": 8289, + "time_per_iteration": 2.4667603969573975 + }, + { + "auxiliary_loss_clip": 0.01106957, + "auxiliary_loss_mlp": 0.01033142, + "balance_loss_clip": 1.0199995, + "balance_loss_mlp": 1.03680038, + "epoch": 0.4984217646174658, + "flos": 19718836992000.0, + "grad_norm": 2.15669041751919, + "language_loss": 0.73524791, + "learning_rate": 2.107245231409784e-06, + "loss": 0.7566489, + "num_input_tokens_seen": 178186150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8290, + "time_per_iteration": 2.4318900108337402 + }, + { + "auxiliary_loss_clip": 0.01112096, + "auxiliary_loss_mlp": 0.01037317, + "balance_loss_clip": 1.02232039, + "balance_loss_mlp": 1.04070783, + "epoch": 0.49848188787013376, + "flos": 24936549488640.0, + "grad_norm": 1.4681011524205945, + "language_loss": 0.84016359, + "learning_rate": 2.106856328026598e-06, + "loss": 0.86165774, + "num_input_tokens_seen": 178207665, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.7109375, + "step": 8291, + "time_per_iteration": 2.502545118331909 + }, + { + "auxiliary_loss_clip": 0.01117247, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02307272, + "balance_loss_mlp": 1.04216146, + "epoch": 0.4985420111228017, + "flos": 22382439056640.0, + "grad_norm": 1.910804847598398, + "language_loss": 0.67084122, + "learning_rate": 2.106467420591409e-06, + "loss": 0.69238442, + "num_input_tokens_seen": 178226325, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 8292, + "time_per_iteration": 2.4527781009674072 + }, + { + "auxiliary_loss_clip": 0.01108825, + "auxiliary_loss_mlp": 0.01031205, + "balance_loss_clip": 1.01933742, + "balance_loss_mlp": 1.03864646, + "epoch": 0.4986021343754697, + "flos": 16216469464320.0, + "grad_norm": 1.7642237687107358, + "language_loss": 0.67300534, + "learning_rate": 2.106078509118965e-06, + "loss": 0.69440567, + "num_input_tokens_seen": 178244960, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8293, + "time_per_iteration": 2.4598476886749268 + }, + { + "auxiliary_loss_clip": 0.01110354, + "auxiliary_loss_mlp": 0.01028466, + "balance_loss_clip": 1.01615214, + "balance_loss_mlp": 1.03958893, + "epoch": 0.4986622576281377, + "flos": 23403594804480.0, + "grad_norm": 1.987515516196069, + "language_loss": 0.8202461, + "learning_rate": 2.1056895936240133e-06, + "loss": 0.84163427, + "num_input_tokens_seen": 178265400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 8294, + "time_per_iteration": 2.4827442169189453 + }, + { + "auxiliary_loss_clip": 0.01110277, + "auxiliary_loss_mlp": 0.01034377, + "balance_loss_clip": 1.02032816, + "balance_loss_mlp": 1.03937042, + "epoch": 0.49872238088080567, + "flos": 19974556892160.0, + "grad_norm": 1.7471179574646651, + "language_loss": 0.73073918, + "learning_rate": 2.1053006741213016e-06, + "loss": 0.7521857, + "num_input_tokens_seen": 178284535, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8295, + "time_per_iteration": 2.4712820053100586 + }, + { + "auxiliary_loss_clip": 0.01108254, + "auxiliary_loss_mlp": 0.01036676, + "balance_loss_clip": 1.02435029, + "balance_loss_mlp": 1.03895998, + "epoch": 0.49878250413347364, + "flos": 22893016930560.0, + "grad_norm": 1.9200384732673381, + "language_loss": 0.673262, + "learning_rate": 2.1049117506255775e-06, + "loss": 0.69471127, + "num_input_tokens_seen": 178302425, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 8296, + "time_per_iteration": 2.45139479637146 + }, + { + "auxiliary_loss_clip": 0.01111689, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.03996015, + "epoch": 0.4988426273861416, + "flos": 32598449821440.0, + "grad_norm": 1.713618634115876, + "language_loss": 0.64634776, + "learning_rate": 2.1045228231515895e-06, + "loss": 0.66780269, + "num_input_tokens_seen": 178323065, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8297, + "time_per_iteration": 2.5514614582061768 + }, + { + "auxiliary_loss_clip": 0.0110753, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.02121472, + "balance_loss_mlp": 1.03931689, + "epoch": 0.49890275063880957, + "flos": 20923604087040.0, + "grad_norm": 1.9440676372274848, + "language_loss": 0.69621831, + "learning_rate": 2.1041338917140857e-06, + "loss": 0.71762383, + "num_input_tokens_seen": 178343985, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 8298, + "time_per_iteration": 2.4699370861053467 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01036879, + "balance_loss_clip": 1.02421331, + "balance_loss_mlp": 1.03804398, + "epoch": 0.49896287389147753, + "flos": 18624459369600.0, + "grad_norm": 2.087380746796303, + "language_loss": 0.84278095, + "learning_rate": 2.103744956327814e-06, + "loss": 0.86422026, + "num_input_tokens_seen": 178362345, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8299, + "time_per_iteration": 2.4820563793182373 + }, + { + "auxiliary_loss_clip": 0.01113334, + "auxiliary_loss_mlp": 0.01037602, + "balance_loss_clip": 1.02327859, + "balance_loss_mlp": 1.03978848, + "epoch": 0.4990229971441455, + "flos": 24826555065600.0, + "grad_norm": 5.591354549929027, + "language_loss": 0.69272447, + "learning_rate": 2.1033560170075234e-06, + "loss": 0.71423382, + "num_input_tokens_seen": 178383190, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.734375, + "step": 8300, + "time_per_iteration": 2.473634719848633 + }, + { + "auxiliary_loss_clip": 0.01037164, + "auxiliary_loss_mlp": 0.01003582, + "balance_loss_clip": 1.00239551, + "balance_loss_mlp": 1.01397431, + "epoch": 0.49908312039681346, + "flos": 71384525136000.0, + "grad_norm": 0.7592353305728455, + "language_loss": 0.51136976, + "learning_rate": 2.1029670737679623e-06, + "loss": 0.5317772, + "num_input_tokens_seen": 178444250, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23242188, + "step": 8301, + "time_per_iteration": 3.1719589233398438 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01040035, + "balance_loss_clip": 1.02670741, + "balance_loss_mlp": 1.03841138, + "epoch": 0.4991432436494814, + "flos": 19828651847040.0, + "grad_norm": 1.9297901828770159, + "language_loss": 0.84423494, + "learning_rate": 2.102578126623879e-06, + "loss": 0.86569905, + "num_input_tokens_seen": 178463250, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6796875, + "step": 8302, + "time_per_iteration": 3.8624472618103027 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.0157299, + "balance_loss_mlp": 1.03963566, + "epoch": 0.4992033669021494, + "flos": 15121912273920.0, + "grad_norm": 1.7245012471823244, + "language_loss": 0.68831706, + "learning_rate": 2.102189175590024e-06, + "loss": 0.70967424, + "num_input_tokens_seen": 178481340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8303, + "time_per_iteration": 2.4496121406555176 + }, + { + "auxiliary_loss_clip": 0.01111721, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01871395, + "balance_loss_mlp": 1.0395093, + "epoch": 0.49926349015481736, + "flos": 31207952476800.0, + "grad_norm": 1.8500063703376581, + "language_loss": 0.72523201, + "learning_rate": 2.101800220681144e-06, + "loss": 0.7466675, + "num_input_tokens_seen": 178501545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8304, + "time_per_iteration": 5.351519346237183 + }, + { + "auxiliary_loss_clip": 0.01109868, + "auxiliary_loss_mlp": 0.01038641, + "balance_loss_clip": 1.02633858, + "balance_loss_mlp": 1.03971672, + "epoch": 0.4993236134074853, + "flos": 24900207903360.0, + "grad_norm": 2.113610055263332, + "language_loss": 0.81011766, + "learning_rate": 2.10141126191199e-06, + "loss": 0.83160275, + "num_input_tokens_seen": 178519700, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8305, + "time_per_iteration": 3.9764394760131836 + }, + { + "auxiliary_loss_clip": 0.0103618, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.00061762, + "balance_loss_mlp": 1.01301277, + "epoch": 0.4993837366601533, + "flos": 70420573797120.0, + "grad_norm": 0.7225706425993785, + "language_loss": 0.56916559, + "learning_rate": 2.1010222992973107e-06, + "loss": 0.58954537, + "num_input_tokens_seen": 178576740, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.23144531, + "step": 8306, + "time_per_iteration": 3.1952388286590576 + }, + { + "auxiliary_loss_clip": 0.01114208, + "auxiliary_loss_mlp": 0.01037149, + "balance_loss_clip": 1.02323711, + "balance_loss_mlp": 1.04268515, + "epoch": 0.4994438599128213, + "flos": 15961216440960.0, + "grad_norm": 1.791967653711514, + "language_loss": 0.82407033, + "learning_rate": 2.1006333328518556e-06, + "loss": 0.84558392, + "num_input_tokens_seen": 178594745, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8307, + "time_per_iteration": 2.4501423835754395 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.01987445, + "balance_loss_mlp": 1.03845966, + "epoch": 0.4995039831654893, + "flos": 27928303228800.0, + "grad_norm": 2.0869484891217973, + "language_loss": 0.60544026, + "learning_rate": 2.1002443625903748e-06, + "loss": 0.62686026, + "num_input_tokens_seen": 178614110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8308, + "time_per_iteration": 2.5023903846740723 + }, + { + "auxiliary_loss_clip": 0.01106463, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0179193, + "balance_loss_mlp": 1.03760242, + "epoch": 0.49956410641815724, + "flos": 24204797619840.0, + "grad_norm": 1.5917355796130328, + "language_loss": 0.74632615, + "learning_rate": 2.0998553885276168e-06, + "loss": 0.76769423, + "num_input_tokens_seen": 178634170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8309, + "time_per_iteration": 2.473018169403076 + }, + { + "auxiliary_loss_clip": 0.01109782, + "auxiliary_loss_mlp": 0.01034327, + "balance_loss_clip": 1.02136922, + "balance_loss_mlp": 1.03926158, + "epoch": 0.4996242296708252, + "flos": 16180127879040.0, + "grad_norm": 2.147167346860859, + "language_loss": 0.80117911, + "learning_rate": 2.0994664106783335e-06, + "loss": 0.82262021, + "num_input_tokens_seen": 178651775, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8310, + "time_per_iteration": 2.4172844886779785 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01035729, + "balance_loss_clip": 1.02339089, + "balance_loss_mlp": 1.04019213, + "epoch": 0.49968435292349317, + "flos": 16873527000960.0, + "grad_norm": 1.6036366291386785, + "language_loss": 0.70938641, + "learning_rate": 2.0990774290572735e-06, + "loss": 0.73086882, + "num_input_tokens_seen": 178669720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 8311, + "time_per_iteration": 2.4804234504699707 + }, + { + "auxiliary_loss_clip": 0.01111462, + "auxiliary_loss_mlp": 0.01034214, + "balance_loss_clip": 1.02229297, + "balance_loss_mlp": 1.04154408, + "epoch": 0.49974447617616113, + "flos": 14939521989120.0, + "grad_norm": 1.923283457940722, + "language_loss": 0.77138013, + "learning_rate": 2.098688443679187e-06, + "loss": 0.79283684, + "num_input_tokens_seen": 178686765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8312, + "time_per_iteration": 2.4233593940734863 + }, + { + "auxiliary_loss_clip": 0.01111451, + "auxiliary_loss_mlp": 0.01032282, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.04093099, + "epoch": 0.4998045994288291, + "flos": 26651535321600.0, + "grad_norm": 1.7466795572602452, + "language_loss": 0.84205925, + "learning_rate": 2.0982994545588256e-06, + "loss": 0.86349666, + "num_input_tokens_seen": 178705845, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8313, + "time_per_iteration": 2.509953260421753 + }, + { + "auxiliary_loss_clip": 0.01111508, + "auxiliary_loss_mlp": 0.01029516, + "balance_loss_clip": 1.01633728, + "balance_loss_mlp": 1.03987491, + "epoch": 0.49986472268149706, + "flos": 20953768533120.0, + "grad_norm": 2.119225345296983, + "language_loss": 0.80887723, + "learning_rate": 2.097910461710939e-06, + "loss": 0.83028746, + "num_input_tokens_seen": 178723410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8314, + "time_per_iteration": 2.452765703201294 + }, + { + "auxiliary_loss_clip": 0.01113768, + "auxiliary_loss_mlp": 0.01041835, + "balance_loss_clip": 1.02763736, + "balance_loss_mlp": 1.0418222, + "epoch": 0.49992484593416503, + "flos": 22783884433920.0, + "grad_norm": 2.4967995028767778, + "language_loss": 0.79017889, + "learning_rate": 2.0975214651502773e-06, + "loss": 0.81173497, + "num_input_tokens_seen": 178743560, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8315, + "time_per_iteration": 2.4926230907440186 + }, + { + "auxiliary_loss_clip": 0.01110205, + "auxiliary_loss_mlp": 0.01034063, + "balance_loss_clip": 1.02123618, + "balance_loss_mlp": 1.04051793, + "epoch": 0.499984969186833, + "flos": 46786970252160.0, + "grad_norm": 2.5792388666411274, + "language_loss": 0.73983908, + "learning_rate": 2.0971324648915926e-06, + "loss": 0.76128173, + "num_input_tokens_seen": 178767225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8316, + "time_per_iteration": 2.692228317260742 + }, + { + "auxiliary_loss_clip": 0.01109445, + "auxiliary_loss_mlp": 0.01032836, + "balance_loss_clip": 1.02058125, + "balance_loss_mlp": 1.04118443, + "epoch": 0.500045092439501, + "flos": 25556978131200.0, + "grad_norm": 1.4190232020266644, + "language_loss": 0.81204319, + "learning_rate": 2.0967434609496343e-06, + "loss": 0.83346593, + "num_input_tokens_seen": 178786810, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 8317, + "time_per_iteration": 2.4997825622558594 + }, + { + "auxiliary_loss_clip": 0.01111618, + "auxiliary_loss_mlp": 0.01038384, + "balance_loss_clip": 1.02436495, + "balance_loss_mlp": 1.04001343, + "epoch": 0.5001052156921689, + "flos": 20704764476160.0, + "grad_norm": 1.649167878849496, + "language_loss": 0.83189869, + "learning_rate": 2.0963544533391548e-06, + "loss": 0.85339868, + "num_input_tokens_seen": 178805660, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8318, + "time_per_iteration": 2.516118049621582 + }, + { + "auxiliary_loss_clip": 0.01111509, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.04068375, + "epoch": 0.500165338944837, + "flos": 21251109317760.0, + "grad_norm": 1.8062739344487506, + "language_loss": 0.81684446, + "learning_rate": 2.0959654420749045e-06, + "loss": 0.83826375, + "num_input_tokens_seen": 178824780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 8319, + "time_per_iteration": 2.4977705478668213 + }, + { + "auxiliary_loss_clip": 0.01112348, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.01469707, + "balance_loss_mlp": 1.04046464, + "epoch": 0.5002254621975049, + "flos": 27854398995840.0, + "grad_norm": 1.7611824883833367, + "language_loss": 0.71951354, + "learning_rate": 2.095576427171635e-06, + "loss": 0.74090493, + "num_input_tokens_seen": 178845640, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8320, + "time_per_iteration": 2.5664663314819336 + }, + { + "auxiliary_loss_clip": 0.01116964, + "auxiliary_loss_mlp": 0.01043637, + "balance_loss_clip": 1.02903366, + "balance_loss_mlp": 1.03925049, + "epoch": 0.5002855854501729, + "flos": 15551941898880.0, + "grad_norm": 3.538267489088781, + "language_loss": 0.76840645, + "learning_rate": 2.0951874086440978e-06, + "loss": 0.79001242, + "num_input_tokens_seen": 178862290, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7734375, + "step": 8321, + "time_per_iteration": 2.5154004096984863 + }, + { + "auxiliary_loss_clip": 0.01113289, + "auxiliary_loss_mlp": 0.01038756, + "balance_loss_clip": 1.0255599, + "balance_loss_mlp": 1.04125774, + "epoch": 0.5003457087028408, + "flos": 16107408794880.0, + "grad_norm": 1.9154758393965534, + "language_loss": 0.82959068, + "learning_rate": 2.0947983865070455e-06, + "loss": 0.85111117, + "num_input_tokens_seen": 178879805, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8322, + "time_per_iteration": 2.4235384464263916 + }, + { + "auxiliary_loss_clip": 0.01114951, + "auxiliary_loss_mlp": 0.01035, + "balance_loss_clip": 1.02180934, + "balance_loss_mlp": 1.04190695, + "epoch": 0.5004058319555088, + "flos": 22710518904960.0, + "grad_norm": 2.1453827228353166, + "language_loss": 0.73670769, + "learning_rate": 2.094409360775228e-06, + "loss": 0.7582072, + "num_input_tokens_seen": 178896985, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8323, + "time_per_iteration": 2.495490312576294 + }, + { + "auxiliary_loss_clip": 0.01111344, + "auxiliary_loss_mlp": 0.01035029, + "balance_loss_clip": 1.02152205, + "balance_loss_mlp": 1.04043198, + "epoch": 0.5004659552081767, + "flos": 30117956313600.0, + "grad_norm": 1.517177144462768, + "language_loss": 0.69255745, + "learning_rate": 2.0940203314633977e-06, + "loss": 0.71402115, + "num_input_tokens_seen": 178920605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8324, + "time_per_iteration": 2.534043550491333 + }, + { + "auxiliary_loss_clip": 0.01110991, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03958941, + "epoch": 0.5005260784608447, + "flos": 18624710764800.0, + "grad_norm": 1.9198571129878061, + "language_loss": 0.72153628, + "learning_rate": 2.0936312985863077e-06, + "loss": 0.7429831, + "num_input_tokens_seen": 178937760, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8325, + "time_per_iteration": 2.4783544540405273 + }, + { + "auxiliary_loss_clip": 0.01114311, + "auxiliary_loss_mlp": 0.01038383, + "balance_loss_clip": 1.0237087, + "balance_loss_mlp": 1.04212904, + "epoch": 0.5005862017135126, + "flos": 24859987649280.0, + "grad_norm": 1.5620326365302057, + "language_loss": 0.73494631, + "learning_rate": 2.093242262158709e-06, + "loss": 0.7564733, + "num_input_tokens_seen": 178957985, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.72265625, + "step": 8326, + "time_per_iteration": 2.4836461544036865 + }, + { + "auxiliary_loss_clip": 0.01110122, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.0189389, + "balance_loss_mlp": 1.03965449, + "epoch": 0.5006463249661807, + "flos": 18734381965440.0, + "grad_norm": 1.5385455876451686, + "language_loss": 0.78168696, + "learning_rate": 2.0928532221953544e-06, + "loss": 0.80310273, + "num_input_tokens_seen": 178977070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8327, + "time_per_iteration": 2.477095127105713 + }, + { + "auxiliary_loss_clip": 0.01117029, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02261126, + "balance_loss_mlp": 1.04402947, + "epoch": 0.5007064482188487, + "flos": 13042145871360.0, + "grad_norm": 2.31963767631444, + "language_loss": 0.88008773, + "learning_rate": 2.092464178710997e-06, + "loss": 0.90161747, + "num_input_tokens_seen": 178994175, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 8328, + "time_per_iteration": 2.479931116104126 + }, + { + "auxiliary_loss_clip": 0.01116123, + "auxiliary_loss_mlp": 0.01035928, + "balance_loss_clip": 1.02290463, + "balance_loss_mlp": 1.0408715, + "epoch": 0.5007665714715166, + "flos": 21288671965440.0, + "grad_norm": 2.0106246059801482, + "language_loss": 0.74407351, + "learning_rate": 2.092075131720388e-06, + "loss": 0.76559395, + "num_input_tokens_seen": 179013710, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.75, + "step": 8329, + "time_per_iteration": 2.480037212371826 + }, + { + "auxiliary_loss_clip": 0.01112626, + "auxiliary_loss_mlp": 0.01033135, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.04276633, + "epoch": 0.5008266947241846, + "flos": 29754576374400.0, + "grad_norm": 2.2897047741072063, + "language_loss": 0.79602063, + "learning_rate": 2.091686081238281e-06, + "loss": 0.81747818, + "num_input_tokens_seen": 179035255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 8330, + "time_per_iteration": 2.529446601867676 + }, + { + "auxiliary_loss_clip": 0.0103803, + "auxiliary_loss_mlp": 0.01000333, + "balance_loss_clip": 0.99922389, + "balance_loss_mlp": 1.01505685, + "epoch": 0.5008868179768525, + "flos": 63557829204480.0, + "grad_norm": 0.7317803530986337, + "language_loss": 0.56073356, + "learning_rate": 2.0912970272794282e-06, + "loss": 0.58111727, + "num_input_tokens_seen": 179090915, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.23046875, + "step": 8331, + "time_per_iteration": 2.89511775970459 + }, + { + "auxiliary_loss_clip": 0.01110931, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01504326, + "balance_loss_mlp": 1.041206, + "epoch": 0.5009469412295205, + "flos": 27375637593600.0, + "grad_norm": 2.865515028785386, + "language_loss": 0.65518546, + "learning_rate": 2.0909079698585833e-06, + "loss": 0.67656446, + "num_input_tokens_seen": 179109160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8332, + "time_per_iteration": 2.497129201889038 + }, + { + "auxiliary_loss_clip": 0.01109356, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.02261496, + "balance_loss_mlp": 1.0400846, + "epoch": 0.5010070644821885, + "flos": 27378833904000.0, + "grad_norm": 1.477043934406584, + "language_loss": 0.74687374, + "learning_rate": 2.0905189089904993e-06, + "loss": 0.76831466, + "num_input_tokens_seen": 179130610, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8333, + "time_per_iteration": 2.506769895553589 + }, + { + "auxiliary_loss_clip": 0.01114084, + "auxiliary_loss_mlp": 0.01034969, + "balance_loss_clip": 1.02242804, + "balance_loss_mlp": 1.04128885, + "epoch": 0.5010671877348565, + "flos": 20662748542080.0, + "grad_norm": 3.419508092200526, + "language_loss": 0.80619013, + "learning_rate": 2.090129844689929e-06, + "loss": 0.82768065, + "num_input_tokens_seen": 179147860, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7265625, + "step": 8334, + "time_per_iteration": 2.4492759704589844 + }, + { + "auxiliary_loss_clip": 0.01038411, + "auxiliary_loss_mlp": 0.00996695, + "balance_loss_clip": 0.99557459, + "balance_loss_mlp": 1.01541471, + "epoch": 0.5011273109875244, + "flos": 59128645000320.0, + "grad_norm": 0.8938151962133672, + "language_loss": 0.62658346, + "learning_rate": 2.089740776971626e-06, + "loss": 0.64693451, + "num_input_tokens_seen": 179210490, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.23046875, + "step": 8335, + "time_per_iteration": 3.044527530670166 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.01027703, + "balance_loss_clip": 1.01548398, + "balance_loss_mlp": 1.03883338, + "epoch": 0.5011874342401924, + "flos": 25336342840320.0, + "grad_norm": 1.39366543335018, + "language_loss": 0.79443586, + "learning_rate": 2.0893517058503435e-06, + "loss": 0.81579578, + "num_input_tokens_seen": 179231360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8336, + "time_per_iteration": 2.5133562088012695 + }, + { + "auxiliary_loss_clip": 0.01111717, + "auxiliary_loss_mlp": 0.01031101, + "balance_loss_clip": 1.01791, + "balance_loss_mlp": 1.0402261, + "epoch": 0.5012475574928603, + "flos": 20229953569920.0, + "grad_norm": 1.7464580749308463, + "language_loss": 0.80139911, + "learning_rate": 2.088962631340836e-06, + "loss": 0.82282722, + "num_input_tokens_seen": 179250625, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8337, + "time_per_iteration": 2.4671413898468018 + }, + { + "auxiliary_loss_clip": 0.01114807, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.0201329, + "balance_loss_mlp": 1.03992128, + "epoch": 0.5013076807455283, + "flos": 22710123855360.0, + "grad_norm": 1.859552309481282, + "language_loss": 0.79314995, + "learning_rate": 2.0885735534578555e-06, + "loss": 0.8146314, + "num_input_tokens_seen": 179267360, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8338, + "time_per_iteration": 2.4763965606689453 + }, + { + "auxiliary_loss_clip": 0.01112164, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.0178982, + "balance_loss_mlp": 1.0390203, + "epoch": 0.5013678039981962, + "flos": 24245161528320.0, + "grad_norm": 1.6104717001039177, + "language_loss": 0.85006964, + "learning_rate": 2.0881844722161583e-06, + "loss": 0.87150526, + "num_input_tokens_seen": 179289810, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8339, + "time_per_iteration": 2.507951259613037 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01038014, + "balance_loss_clip": 1.02476954, + "balance_loss_mlp": 1.03943646, + "epoch": 0.5014279272508643, + "flos": 26176688501760.0, + "grad_norm": 1.484784321746097, + "language_loss": 0.70492387, + "learning_rate": 2.0877953876304962e-06, + "loss": 0.72641325, + "num_input_tokens_seen": 179310620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 8340, + "time_per_iteration": 2.5271620750427246 + }, + { + "auxiliary_loss_clip": 0.01116646, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02178025, + "balance_loss_mlp": 1.04153883, + "epoch": 0.5014880505035323, + "flos": 21430446946560.0, + "grad_norm": 1.9114275861555547, + "language_loss": 0.77793235, + "learning_rate": 2.0874062997156245e-06, + "loss": 0.79945439, + "num_input_tokens_seen": 179329005, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75, + "step": 8341, + "time_per_iteration": 2.467557430267334 + }, + { + "auxiliary_loss_clip": 0.01116354, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.02543771, + "balance_loss_mlp": 1.04048502, + "epoch": 0.5015481737562002, + "flos": 15770745596160.0, + "grad_norm": 2.478803711535475, + "language_loss": 0.8961392, + "learning_rate": 2.0870172084862975e-06, + "loss": 0.91769934, + "num_input_tokens_seen": 179343785, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7578125, + "step": 8342, + "time_per_iteration": 2.454822063446045 + }, + { + "auxiliary_loss_clip": 0.01110124, + "auxiliary_loss_mlp": 0.01036028, + "balance_loss_clip": 1.02272439, + "balance_loss_mlp": 1.03894877, + "epoch": 0.5016082970088682, + "flos": 26830801123200.0, + "grad_norm": 3.1772216639919906, + "language_loss": 0.76625615, + "learning_rate": 2.0866281139572682e-06, + "loss": 0.7877177, + "num_input_tokens_seen": 179364070, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8343, + "time_per_iteration": 2.485499143600464 + }, + { + "auxiliary_loss_clip": 0.0110844, + "auxiliary_loss_mlp": 0.01027749, + "balance_loss_clip": 1.01584053, + "balance_loss_mlp": 1.03967083, + "epoch": 0.5016684202615361, + "flos": 21470595373440.0, + "grad_norm": 2.1220779506727574, + "language_loss": 0.67086864, + "learning_rate": 2.086239016143293e-06, + "loss": 0.69223046, + "num_input_tokens_seen": 179384225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8344, + "time_per_iteration": 3.88729190826416 + }, + { + "auxiliary_loss_clip": 0.01111927, + "auxiliary_loss_mlp": 0.01034179, + "balance_loss_clip": 1.02143502, + "balance_loss_mlp": 1.03998613, + "epoch": 0.5017285435142042, + "flos": 26246821806720.0, + "grad_norm": 1.9395231632627998, + "language_loss": 0.75212955, + "learning_rate": 2.0858499150591258e-06, + "loss": 0.77359062, + "num_input_tokens_seen": 179402595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8345, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01112737, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.01769578, + "balance_loss_mlp": 1.04121828, + "epoch": 0.5017886667668721, + "flos": 20777555387520.0, + "grad_norm": 1.95370753247372, + "language_loss": 0.78477418, + "learning_rate": 2.0854608107195203e-06, + "loss": 0.80621803, + "num_input_tokens_seen": 179419635, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71484375, + "step": 8346, + "time_per_iteration": 5.420297861099243 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01036529, + "balance_loss_clip": 1.02408957, + "balance_loss_mlp": 1.03860831, + "epoch": 0.5018487900195401, + "flos": 20156408472960.0, + "grad_norm": 1.6533044146295508, + "language_loss": 0.69167304, + "learning_rate": 2.0850717031392333e-06, + "loss": 0.71313995, + "num_input_tokens_seen": 179438770, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8347, + "time_per_iteration": 2.5022430419921875 + }, + { + "auxiliary_loss_clip": 0.01112834, + "auxiliary_loss_mlp": 0.01034397, + "balance_loss_clip": 1.02136123, + "balance_loss_mlp": 1.03990984, + "epoch": 0.501908913272208, + "flos": 18150689957760.0, + "grad_norm": 1.8545802319259819, + "language_loss": 0.71527761, + "learning_rate": 2.0846825923330174e-06, + "loss": 0.73674989, + "num_input_tokens_seen": 179457475, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8348, + "time_per_iteration": 2.491255760192871 + }, + { + "auxiliary_loss_clip": 0.01108777, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02089548, + "balance_loss_mlp": 1.04003596, + "epoch": 0.501969036524876, + "flos": 23112287504640.0, + "grad_norm": 1.6664488621380107, + "language_loss": 0.73957872, + "learning_rate": 2.0842934783156303e-06, + "loss": 0.76099503, + "num_input_tokens_seen": 179478140, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8349, + "time_per_iteration": 2.478173017501831 + }, + { + "auxiliary_loss_clip": 0.01111134, + "auxiliary_loss_mlp": 0.01030526, + "balance_loss_clip": 1.01726353, + "balance_loss_mlp": 1.03897953, + "epoch": 0.5020291597775439, + "flos": 11363214314880.0, + "grad_norm": 2.0979883436616915, + "language_loss": 0.63680947, + "learning_rate": 2.0839043611018266e-06, + "loss": 0.65822613, + "num_input_tokens_seen": 179494325, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8350, + "time_per_iteration": 2.407949686050415 + }, + { + "auxiliary_loss_clip": 0.01035777, + "auxiliary_loss_mlp": 0.01011664, + "balance_loss_clip": 1.01064515, + "balance_loss_mlp": 1.01269341, + "epoch": 0.5020892830302119, + "flos": 64011094928640.0, + "grad_norm": 1.0786206787107346, + "language_loss": 0.59814817, + "learning_rate": 2.0835152407063597e-06, + "loss": 0.6186226, + "num_input_tokens_seen": 179553545, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.23144531, + "step": 8351, + "time_per_iteration": 3.199061393737793 + }, + { + "auxiliary_loss_clip": 0.01111613, + "auxiliary_loss_mlp": 0.01034378, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.0395788, + "epoch": 0.5021494062828799, + "flos": 23732859801600.0, + "grad_norm": 2.3062568387149365, + "language_loss": 0.75367033, + "learning_rate": 2.0831261171439873e-06, + "loss": 0.77513033, + "num_input_tokens_seen": 179573645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8352, + "time_per_iteration": 2.506408214569092 + }, + { + "auxiliary_loss_clip": 0.01113074, + "auxiliary_loss_mlp": 0.01032571, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.04205072, + "epoch": 0.5022095295355479, + "flos": 21576747041280.0, + "grad_norm": 1.6126052392954302, + "language_loss": 0.71743786, + "learning_rate": 2.082736990429464e-06, + "loss": 0.73889434, + "num_input_tokens_seen": 179591435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8353, + "time_per_iteration": 2.469383478164673 + }, + { + "auxiliary_loss_clip": 0.01115894, + "auxiliary_loss_mlp": 0.0103681, + "balance_loss_clip": 1.02279735, + "balance_loss_mlp": 1.04492378, + "epoch": 0.5022696527882159, + "flos": 21397229844480.0, + "grad_norm": 3.986170886248432, + "language_loss": 0.73818904, + "learning_rate": 2.0823478605775455e-06, + "loss": 0.75971609, + "num_input_tokens_seen": 179609955, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 8354, + "time_per_iteration": 2.510967254638672 + }, + { + "auxiliary_loss_clip": 0.01111051, + "auxiliary_loss_mlp": 0.01036606, + "balance_loss_clip": 1.02324271, + "balance_loss_mlp": 1.04122615, + "epoch": 0.5023297760408838, + "flos": 27160712565120.0, + "grad_norm": 1.6375075569861386, + "language_loss": 0.72198367, + "learning_rate": 2.0819587276029884e-06, + "loss": 0.74346024, + "num_input_tokens_seen": 179630875, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 8355, + "time_per_iteration": 2.5355918407440186 + }, + { + "auxiliary_loss_clip": 0.01113009, + "auxiliary_loss_mlp": 0.01036959, + "balance_loss_clip": 1.0234164, + "balance_loss_mlp": 1.04037476, + "epoch": 0.5023898992935518, + "flos": 26213820186240.0, + "grad_norm": 1.5634548911110102, + "language_loss": 0.81171584, + "learning_rate": 2.081569591520548e-06, + "loss": 0.83321553, + "num_input_tokens_seen": 179649835, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8356, + "time_per_iteration": 2.5366694927215576 + }, + { + "auxiliary_loss_clip": 0.01115056, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0234828, + "balance_loss_mlp": 1.03943825, + "epoch": 0.5024500225462197, + "flos": 13440323111040.0, + "grad_norm": 2.216032444638608, + "language_loss": 0.76043326, + "learning_rate": 2.0811804523449803e-06, + "loss": 0.78196621, + "num_input_tokens_seen": 179667605, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.7578125, + "step": 8357, + "time_per_iteration": 2.4454803466796875 + }, + { + "auxiliary_loss_clip": 0.01112875, + "auxiliary_loss_mlp": 0.01033956, + "balance_loss_clip": 1.0196929, + "balance_loss_mlp": 1.04054666, + "epoch": 0.5025101457988878, + "flos": 21579584215680.0, + "grad_norm": 1.6874014883711121, + "language_loss": 0.75969183, + "learning_rate": 2.0807913100910417e-06, + "loss": 0.78116012, + "num_input_tokens_seen": 179686910, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 8358, + "time_per_iteration": 2.4932358264923096 + }, + { + "auxiliary_loss_clip": 0.01111732, + "auxiliary_loss_mlp": 0.01034386, + "balance_loss_clip": 1.02163708, + "balance_loss_mlp": 1.04097748, + "epoch": 0.5025702690515557, + "flos": 24645134448000.0, + "grad_norm": 2.322067399050787, + "language_loss": 0.72372258, + "learning_rate": 2.0804021647734887e-06, + "loss": 0.74518377, + "num_input_tokens_seen": 179706395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8359, + "time_per_iteration": 2.500152826309204 + }, + { + "auxiliary_loss_clip": 0.01111655, + "auxiliary_loss_mlp": 0.01036283, + "balance_loss_clip": 1.02361679, + "balance_loss_mlp": 1.04144287, + "epoch": 0.5026303923042237, + "flos": 22090162089600.0, + "grad_norm": 1.6242275025336705, + "language_loss": 0.77095789, + "learning_rate": 2.080013016407077e-06, + "loss": 0.79243731, + "num_input_tokens_seen": 179725735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8360, + "time_per_iteration": 2.5194928646087646 + }, + { + "auxiliary_loss_clip": 0.01111322, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.04179871, + "epoch": 0.5026905155568916, + "flos": 23697200574720.0, + "grad_norm": 1.6325944972725464, + "language_loss": 0.76545495, + "learning_rate": 2.0796238650065645e-06, + "loss": 0.78689528, + "num_input_tokens_seen": 179746150, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8361, + "time_per_iteration": 2.4667415618896484 + }, + { + "auxiliary_loss_clip": 0.0111058, + "auxiliary_loss_mlp": 0.01033627, + "balance_loss_clip": 1.01973319, + "balance_loss_mlp": 1.03841019, + "epoch": 0.5027506388095596, + "flos": 25812410722560.0, + "grad_norm": 1.6123805658340187, + "language_loss": 0.84681976, + "learning_rate": 2.0792347105867065e-06, + "loss": 0.86826181, + "num_input_tokens_seen": 179767550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8362, + "time_per_iteration": 2.5463051795959473 + }, + { + "auxiliary_loss_clip": 0.01109115, + "auxiliary_loss_mlp": 0.01034772, + "balance_loss_clip": 1.02232695, + "balance_loss_mlp": 1.03756952, + "epoch": 0.5028107620622275, + "flos": 27526606456320.0, + "grad_norm": 1.4590070504225026, + "language_loss": 0.78211838, + "learning_rate": 2.0788455531622605e-06, + "loss": 0.80355728, + "num_input_tokens_seen": 179790075, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8363, + "time_per_iteration": 2.5163207054138184 + }, + { + "auxiliary_loss_clip": 0.0110737, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.04016399, + "epoch": 0.5028708853148955, + "flos": 24534278098560.0, + "grad_norm": 3.0044110074814627, + "language_loss": 0.75747573, + "learning_rate": 2.0784563927479838e-06, + "loss": 0.77885795, + "num_input_tokens_seen": 179806515, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 8364, + "time_per_iteration": 2.490145444869995 + }, + { + "auxiliary_loss_clip": 0.01106443, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01749849, + "balance_loss_mlp": 1.03816295, + "epoch": 0.5029310085675635, + "flos": 20813609664000.0, + "grad_norm": 1.5639014752994398, + "language_loss": 0.69354087, + "learning_rate": 2.0780672293586317e-06, + "loss": 0.7149018, + "num_input_tokens_seen": 179826450, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 8365, + "time_per_iteration": 2.473787307739258 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035128, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.03982782, + "epoch": 0.5029911318202315, + "flos": 22342470197760.0, + "grad_norm": 1.442330503817835, + "language_loss": 0.73213601, + "learning_rate": 2.0776780630089635e-06, + "loss": 0.75362265, + "num_input_tokens_seen": 179846770, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 8366, + "time_per_iteration": 2.549877405166626 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01032343, + "balance_loss_clip": 1.02064812, + "balance_loss_mlp": 1.04103982, + "epoch": 0.5030512550728995, + "flos": 24352713826560.0, + "grad_norm": 1.4509464249778803, + "language_loss": 0.78301162, + "learning_rate": 2.077288893713735e-06, + "loss": 0.80443466, + "num_input_tokens_seen": 179866585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 8367, + "time_per_iteration": 2.495147705078125 + }, + { + "auxiliary_loss_clip": 0.01108781, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.0180459, + "balance_loss_mlp": 1.03853226, + "epoch": 0.5031113783255674, + "flos": 18259930195200.0, + "grad_norm": 1.842981496070619, + "language_loss": 0.69923592, + "learning_rate": 2.0768997214877035e-06, + "loss": 0.72062624, + "num_input_tokens_seen": 179885575, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8368, + "time_per_iteration": 2.4830057621002197 + }, + { + "auxiliary_loss_clip": 0.01035945, + "auxiliary_loss_mlp": 0.01007176, + "balance_loss_clip": 1.00621665, + "balance_loss_mlp": 1.01321661, + "epoch": 0.5031715015782354, + "flos": 57253173200640.0, + "grad_norm": 0.8570502115037558, + "language_loss": 0.63344997, + "learning_rate": 2.0765105463456274e-06, + "loss": 0.65388119, + "num_input_tokens_seen": 179939650, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.22851562, + "step": 8369, + "time_per_iteration": 3.0224173069000244 + }, + { + "auxiliary_loss_clip": 0.0110829, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.01973677, + "balance_loss_mlp": 1.03877878, + "epoch": 0.5032316248309033, + "flos": 27527360641920.0, + "grad_norm": 2.153532760870157, + "language_loss": 0.60134995, + "learning_rate": 2.076121368302263e-06, + "loss": 0.62274879, + "num_input_tokens_seen": 179961765, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8370, + "time_per_iteration": 2.570244073867798 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01034812, + "balance_loss_clip": 1.02094817, + "balance_loss_mlp": 1.03846478, + "epoch": 0.5032917480835714, + "flos": 34495825939200.0, + "grad_norm": 1.5686803599666441, + "language_loss": 0.68485558, + "learning_rate": 2.0757321873723695e-06, + "loss": 0.7063123, + "num_input_tokens_seen": 179983015, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8371, + "time_per_iteration": 2.5606741905212402 + }, + { + "auxiliary_loss_clip": 0.01110798, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.01710284, + "balance_loss_mlp": 1.04021561, + "epoch": 0.5033518713362393, + "flos": 33656773167360.0, + "grad_norm": 2.6972353884187776, + "language_loss": 0.67238319, + "learning_rate": 2.0753430035707042e-06, + "loss": 0.6937995, + "num_input_tokens_seen": 180003210, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8372, + "time_per_iteration": 2.5703678131103516 + }, + { + "auxiliary_loss_clip": 0.0110914, + "auxiliary_loss_mlp": 0.01035865, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5034119945889073, + "flos": 28185495586560.0, + "grad_norm": 2.7198935997293683, + "language_loss": 0.66590893, + "learning_rate": 2.0749538169120235e-06, + "loss": 0.68735898, + "num_input_tokens_seen": 180025530, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 8373, + "time_per_iteration": 2.526221513748169 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01558208, + "balance_loss_mlp": 1.03755879, + "epoch": 0.5034721178415752, + "flos": 21358697529600.0, + "grad_norm": 1.6286907446961802, + "language_loss": 0.74674404, + "learning_rate": 2.0745646274110872e-06, + "loss": 0.76809293, + "num_input_tokens_seen": 180043180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8374, + "time_per_iteration": 2.488349199295044 + }, + { + "auxiliary_loss_clip": 0.01111709, + "auxiliary_loss_mlp": 0.01037533, + "balance_loss_clip": 1.02400887, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5035322410942432, + "flos": 22674823764480.0, + "grad_norm": 1.5485355079726564, + "language_loss": 0.67947745, + "learning_rate": 2.0741754350826525e-06, + "loss": 0.70096987, + "num_input_tokens_seen": 180062905, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8375, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01034329, + "balance_loss_clip": 1.02008343, + "balance_loss_mlp": 1.04047072, + "epoch": 0.5035923643469111, + "flos": 19828723674240.0, + "grad_norm": 1.8481066708574578, + "language_loss": 0.78526819, + "learning_rate": 2.0737862399414777e-06, + "loss": 0.8067522, + "num_input_tokens_seen": 180082000, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 8376, + "time_per_iteration": 2.468104124069214 + }, + { + "auxiliary_loss_clip": 0.0111296, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01704502, + "balance_loss_mlp": 1.03864694, + "epoch": 0.5036524875995791, + "flos": 30514625182080.0, + "grad_norm": 2.8611372201727234, + "language_loss": 0.59723544, + "learning_rate": 2.0733970420023213e-06, + "loss": 0.61867571, + "num_input_tokens_seen": 180101340, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8377, + "time_per_iteration": 2.5277962684631348 + }, + { + "auxiliary_loss_clip": 0.01108859, + "auxiliary_loss_mlp": 0.01034548, + "balance_loss_clip": 1.02114892, + "balance_loss_mlp": 1.03836918, + "epoch": 0.5037126108522471, + "flos": 14720574637440.0, + "grad_norm": 1.9462161897860946, + "language_loss": 0.76360452, + "learning_rate": 2.0730078412799425e-06, + "loss": 0.78503865, + "num_input_tokens_seen": 180119160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8378, + "time_per_iteration": 2.448323965072632 + }, + { + "auxiliary_loss_clip": 0.01109358, + "auxiliary_loss_mlp": 0.01034908, + "balance_loss_clip": 1.02211046, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5037727341049151, + "flos": 25297702784640.0, + "grad_norm": 1.6531450393233522, + "language_loss": 0.74565625, + "learning_rate": 2.0726186377890985e-06, + "loss": 0.7670989, + "num_input_tokens_seen": 180138730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8379, + "time_per_iteration": 2.5036356449127197 + }, + { + "auxiliary_loss_clip": 0.01109557, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.01952767, + "balance_loss_mlp": 1.04144955, + "epoch": 0.5038328573575831, + "flos": 28541764632960.0, + "grad_norm": 5.059413081923233, + "language_loss": 0.6692574, + "learning_rate": 2.072229431544548e-06, + "loss": 0.6906693, + "num_input_tokens_seen": 180158810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8380, + "time_per_iteration": 2.524144411087036 + }, + { + "auxiliary_loss_clip": 0.01109006, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01879573, + "balance_loss_mlp": 1.03999329, + "epoch": 0.503892980610251, + "flos": 31649869503360.0, + "grad_norm": 1.7991215942112995, + "language_loss": 0.63869506, + "learning_rate": 2.071840222561051e-06, + "loss": 0.66009307, + "num_input_tokens_seen": 180179700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8381, + "time_per_iteration": 2.5605592727661133 + }, + { + "auxiliary_loss_clip": 0.01108854, + "auxiliary_loss_mlp": 0.01035256, + "balance_loss_clip": 1.02296555, + "balance_loss_mlp": 1.04009557, + "epoch": 0.503953103862919, + "flos": 27089358197760.0, + "grad_norm": 1.6170974847944384, + "language_loss": 0.67252153, + "learning_rate": 2.071451010853365e-06, + "loss": 0.69396263, + "num_input_tokens_seen": 180199890, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8382, + "time_per_iteration": 2.5227982997894287 + }, + { + "auxiliary_loss_clip": 0.01116241, + "auxiliary_loss_mlp": 0.01039894, + "balance_loss_clip": 1.02614903, + "balance_loss_mlp": 1.04075313, + "epoch": 0.5040132271155869, + "flos": 15632957024640.0, + "grad_norm": 2.0398701191748, + "language_loss": 0.62190729, + "learning_rate": 2.0710617964362506e-06, + "loss": 0.64346862, + "num_input_tokens_seen": 180217840, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.75390625, + "step": 8383, + "time_per_iteration": 2.43418288230896 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02198625, + "balance_loss_mlp": 1.03885436, + "epoch": 0.504073350368255, + "flos": 13590106824960.0, + "grad_norm": 3.355380782185913, + "language_loss": 0.67041314, + "learning_rate": 2.070672579324465e-06, + "loss": 0.69182235, + "num_input_tokens_seen": 180236465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 8384, + "time_per_iteration": 2.450605630874634 + }, + { + "auxiliary_loss_clip": 0.01112035, + "auxiliary_loss_mlp": 0.01036023, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.0412066, + "epoch": 0.5041334736209229, + "flos": 29058160510080.0, + "grad_norm": 1.6534299501213623, + "language_loss": 0.70829523, + "learning_rate": 2.0702833595327674e-06, + "loss": 0.72977579, + "num_input_tokens_seen": 180258025, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 8385, + "time_per_iteration": 3.9600095748901367 + }, + { + "auxiliary_loss_clip": 0.01106768, + "auxiliary_loss_mlp": 0.01027134, + "balance_loss_clip": 1.0147717, + "balance_loss_mlp": 1.03961098, + "epoch": 0.5041935968735909, + "flos": 24608361899520.0, + "grad_norm": 2.2280411323646687, + "language_loss": 0.83021009, + "learning_rate": 2.069894137075919e-06, + "loss": 0.85154909, + "num_input_tokens_seen": 180277825, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8386, + "time_per_iteration": 2.5137035846710205 + }, + { + "auxiliary_loss_clip": 0.01109584, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03921139, + "epoch": 0.5042537201262588, + "flos": 26286934320000.0, + "grad_norm": 1.4630184477724049, + "language_loss": 0.66776884, + "learning_rate": 2.0695049119686766e-06, + "loss": 0.6892125, + "num_input_tokens_seen": 180300465, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8387, + "time_per_iteration": 5.38523268699646 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01029754, + "balance_loss_clip": 1.01780963, + "balance_loss_mlp": 1.04077113, + "epoch": 0.5043138433789268, + "flos": 22017371178240.0, + "grad_norm": 1.3874005116173278, + "language_loss": 0.80059648, + "learning_rate": 2.0691156842258016e-06, + "loss": 0.82199681, + "num_input_tokens_seen": 180321050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8388, + "time_per_iteration": 3.938295364379883 + }, + { + "auxiliary_loss_clip": 0.01109371, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.0181793, + "balance_loss_mlp": 1.03903794, + "epoch": 0.5043739666315947, + "flos": 28767104605440.0, + "grad_norm": 2.6549702991910453, + "language_loss": 0.69832838, + "learning_rate": 2.0687264538620537e-06, + "loss": 0.71972561, + "num_input_tokens_seen": 180338870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8389, + "time_per_iteration": 2.514204978942871 + }, + { + "auxiliary_loss_clip": 0.01110176, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.02127957, + "balance_loss_mlp": 1.03844476, + "epoch": 0.5044340898842627, + "flos": 27599253713280.0, + "grad_norm": 1.5923484046165255, + "language_loss": 0.69297862, + "learning_rate": 2.068337220892191e-06, + "loss": 0.71441251, + "num_input_tokens_seen": 180361285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8390, + "time_per_iteration": 2.517423152923584 + }, + { + "auxiliary_loss_clip": 0.01034589, + "auxiliary_loss_mlp": 0.01005008, + "balance_loss_clip": 1.00389957, + "balance_loss_mlp": 1.0117954, + "epoch": 0.5044942131369307, + "flos": 67458050749440.0, + "grad_norm": 0.8182221752596884, + "language_loss": 0.52977288, + "learning_rate": 2.067947985330974e-06, + "loss": 0.55016881, + "num_input_tokens_seen": 180415170, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22851562, + "step": 8391, + "time_per_iteration": 2.8990061283111572 + }, + { + "auxiliary_loss_clip": 0.01034773, + "auxiliary_loss_mlp": 0.01000958, + "balance_loss_clip": 0.99989092, + "balance_loss_mlp": 1.01217151, + "epoch": 0.5045543363895987, + "flos": 58630849390080.0, + "grad_norm": 0.8813101083301623, + "language_loss": 0.60678625, + "learning_rate": 2.0675587471931628e-06, + "loss": 0.62714356, + "num_input_tokens_seen": 180468060, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.2265625, + "step": 8392, + "time_per_iteration": 2.91495680809021 + }, + { + "auxiliary_loss_clip": 0.01106534, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.03893185, + "epoch": 0.5046144596422667, + "flos": 22526620248960.0, + "grad_norm": 1.5806327501196855, + "language_loss": 0.84691715, + "learning_rate": 2.067169506493517e-06, + "loss": 0.86831182, + "num_input_tokens_seen": 180486610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8393, + "time_per_iteration": 2.5033814907073975 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01028823, + "balance_loss_clip": 1.01680708, + "balance_loss_mlp": 1.04046786, + "epoch": 0.5046745828949346, + "flos": 27454246508160.0, + "grad_norm": 2.96195836984414, + "language_loss": 0.50628948, + "learning_rate": 2.0667802632467974e-06, + "loss": 0.52768016, + "num_input_tokens_seen": 180508135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 8394, + "time_per_iteration": 2.492766857147217 + }, + { + "auxiliary_loss_clip": 0.01108439, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01906633, + "balance_loss_mlp": 1.03773594, + "epoch": 0.5047347061476026, + "flos": 17274541415040.0, + "grad_norm": 1.6061893361767445, + "language_loss": 0.75181741, + "learning_rate": 2.0663910174677627e-06, + "loss": 0.7732237, + "num_input_tokens_seen": 180527000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8395, + "time_per_iteration": 2.4661927223205566 + }, + { + "auxiliary_loss_clip": 0.01107947, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.01859236, + "balance_loss_mlp": 1.03834832, + "epoch": 0.5047948294002705, + "flos": 16649515831680.0, + "grad_norm": 2.243385214175979, + "language_loss": 0.67677552, + "learning_rate": 2.0660017691711737e-06, + "loss": 0.69816345, + "num_input_tokens_seen": 180544715, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8396, + "time_per_iteration": 2.416499376296997 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01027964, + "balance_loss_clip": 1.01623356, + "balance_loss_mlp": 1.0404129, + "epoch": 0.5048549526529386, + "flos": 26865706164480.0, + "grad_norm": 1.7915756184866887, + "language_loss": 0.79064161, + "learning_rate": 2.065612518371792e-06, + "loss": 0.81201625, + "num_input_tokens_seen": 180565365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 8397, + "time_per_iteration": 2.5530309677124023 + }, + { + "auxiliary_loss_clip": 0.01107401, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.03848135, + "epoch": 0.5049150759056065, + "flos": 21833939399040.0, + "grad_norm": 1.652903699623706, + "language_loss": 0.66017222, + "learning_rate": 2.065223265084376e-06, + "loss": 0.68154037, + "num_input_tokens_seen": 180586670, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8398, + "time_per_iteration": 2.4544124603271484 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.017313, + "balance_loss_mlp": 1.0395267, + "epoch": 0.5049751991582745, + "flos": 21685807710720.0, + "grad_norm": 1.639047703672107, + "language_loss": 0.71633506, + "learning_rate": 2.064834009323688e-06, + "loss": 0.73772013, + "num_input_tokens_seen": 180605085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 8399, + "time_per_iteration": 2.5301358699798584 + }, + { + "auxiliary_loss_clip": 0.01111081, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02533388, + "balance_loss_mlp": 1.03947675, + "epoch": 0.5050353224109424, + "flos": 21359379888000.0, + "grad_norm": 1.6970917460172408, + "language_loss": 0.81506133, + "learning_rate": 2.0644447511044878e-06, + "loss": 0.83655393, + "num_input_tokens_seen": 180624370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8400, + "time_per_iteration": 2.4705498218536377 + }, + { + "auxiliary_loss_clip": 0.01109099, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01716256, + "balance_loss_mlp": 1.03942847, + "epoch": 0.5050954456636104, + "flos": 22820082364800.0, + "grad_norm": 1.8569234799708698, + "language_loss": 0.79040837, + "learning_rate": 2.0640554904415362e-06, + "loss": 0.81179667, + "num_input_tokens_seen": 180642450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8401, + "time_per_iteration": 2.4791224002838135 + }, + { + "auxiliary_loss_clip": 0.01109433, + "auxiliary_loss_mlp": 0.01030061, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03751659, + "epoch": 0.5051555689162783, + "flos": 30448226891520.0, + "grad_norm": 1.5775455049866824, + "language_loss": 0.69999743, + "learning_rate": 2.063666227349593e-06, + "loss": 0.72139227, + "num_input_tokens_seen": 180665250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8402, + "time_per_iteration": 2.5591325759887695 + }, + { + "auxiliary_loss_clip": 0.01105942, + "auxiliary_loss_mlp": 0.0102692, + "balance_loss_clip": 1.01515996, + "balance_loss_mlp": 1.03572834, + "epoch": 0.5052156921689464, + "flos": 21287953693440.0, + "grad_norm": 1.822367858534602, + "language_loss": 0.68917859, + "learning_rate": 2.063276961843422e-06, + "loss": 0.71050715, + "num_input_tokens_seen": 180687425, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 8403, + "time_per_iteration": 2.5292510986328125 + }, + { + "auxiliary_loss_clip": 0.01106316, + "auxiliary_loss_mlp": 0.01034839, + "balance_loss_clip": 1.02275133, + "balance_loss_mlp": 1.03929162, + "epoch": 0.5052758154216143, + "flos": 25081305298560.0, + "grad_norm": 1.4593040849849852, + "language_loss": 0.85396838, + "learning_rate": 2.062887693937781e-06, + "loss": 0.87537992, + "num_input_tokens_seen": 180708725, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8404, + "time_per_iteration": 2.4852187633514404 + }, + { + "auxiliary_loss_clip": 0.01107561, + "auxiliary_loss_mlp": 0.01027359, + "balance_loss_clip": 1.01565218, + "balance_loss_mlp": 1.03806567, + "epoch": 0.5053359386742823, + "flos": 20885502735360.0, + "grad_norm": 1.5717367434630007, + "language_loss": 0.75364089, + "learning_rate": 2.0624984236474322e-06, + "loss": 0.77499014, + "num_input_tokens_seen": 180727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 8405, + "time_per_iteration": 2.4850387573242188 + }, + { + "auxiliary_loss_clip": 0.01109835, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.01514542, + "balance_loss_mlp": 1.0388459, + "epoch": 0.5053960619269503, + "flos": 37743335493120.0, + "grad_norm": 1.5541955318463554, + "language_loss": 0.72983336, + "learning_rate": 2.0621091509871378e-06, + "loss": 0.75121522, + "num_input_tokens_seen": 180749765, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8406, + "time_per_iteration": 2.59979510307312 + }, + { + "auxiliary_loss_clip": 0.01102813, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03577971, + "epoch": 0.5054561851796182, + "flos": 23513840622720.0, + "grad_norm": 1.7094740961502104, + "language_loss": 0.76863986, + "learning_rate": 2.0617198759716568e-06, + "loss": 0.7899577, + "num_input_tokens_seen": 180769580, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 8407, + "time_per_iteration": 2.527543067932129 + }, + { + "auxiliary_loss_clip": 0.01108813, + "auxiliary_loss_mlp": 0.01027236, + "balance_loss_clip": 1.01535106, + "balance_loss_mlp": 1.03706717, + "epoch": 0.5055163084322862, + "flos": 30410233280640.0, + "grad_norm": 1.6525886874932982, + "language_loss": 0.63115776, + "learning_rate": 2.0613305986157535e-06, + "loss": 0.65251827, + "num_input_tokens_seen": 180790295, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.71875, + "step": 8408, + "time_per_iteration": 2.53218150138855 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01871967, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5055764316849541, + "flos": 20259651139200.0, + "grad_norm": 1.695436010833495, + "language_loss": 0.63705122, + "learning_rate": 2.0609413189341865e-06, + "loss": 0.65843707, + "num_input_tokens_seen": 180807875, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8409, + "time_per_iteration": 2.4916255474090576 + }, + { + "auxiliary_loss_clip": 0.01105638, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01410186, + "balance_loss_mlp": 1.03845859, + "epoch": 0.5056365549376222, + "flos": 26070895969920.0, + "grad_norm": 1.3247049855298083, + "language_loss": 0.70876539, + "learning_rate": 2.0605520369417193e-06, + "loss": 0.73007584, + "num_input_tokens_seen": 180831300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 8410, + "time_per_iteration": 2.527935266494751 + }, + { + "auxiliary_loss_clip": 0.01107655, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02100372, + "balance_loss_mlp": 1.03812361, + "epoch": 0.5056966781902901, + "flos": 19279074781440.0, + "grad_norm": 1.5323244298402565, + "language_loss": 0.79243749, + "learning_rate": 2.060162752653113e-06, + "loss": 0.81385016, + "num_input_tokens_seen": 180849055, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8411, + "time_per_iteration": 2.4926035404205322 + }, + { + "auxiliary_loss_clip": 0.01107995, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02357578, + "balance_loss_mlp": 1.03764153, + "epoch": 0.5057568014429581, + "flos": 21323325611520.0, + "grad_norm": 1.7118743762511017, + "language_loss": 0.81584603, + "learning_rate": 2.0597734660831285e-06, + "loss": 0.83729643, + "num_input_tokens_seen": 180867395, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 8412, + "time_per_iteration": 2.4696593284606934 + }, + { + "auxiliary_loss_clip": 0.0110966, + "auxiliary_loss_mlp": 0.0103257, + "balance_loss_clip": 1.02057767, + "balance_loss_mlp": 1.04071307, + "epoch": 0.505816924695626, + "flos": 17493596507520.0, + "grad_norm": 2.1036912411500555, + "language_loss": 0.80586725, + "learning_rate": 2.0593841772465283e-06, + "loss": 0.82728952, + "num_input_tokens_seen": 180886670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8413, + "time_per_iteration": 2.4840738773345947 + }, + { + "auxiliary_loss_clip": 0.01111974, + "auxiliary_loss_mlp": 0.01032705, + "balance_loss_clip": 1.01959252, + "balance_loss_mlp": 1.04003644, + "epoch": 0.505877047948294, + "flos": 21142084561920.0, + "grad_norm": 1.7598991939758672, + "language_loss": 0.80167186, + "learning_rate": 2.0589948861580737e-06, + "loss": 0.82311857, + "num_input_tokens_seen": 180904645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8414, + "time_per_iteration": 2.4437410831451416 + }, + { + "auxiliary_loss_clip": 0.01106268, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.0174123, + "balance_loss_mlp": 1.03536403, + "epoch": 0.5059371712009619, + "flos": 36350036887680.0, + "grad_norm": 2.1880801569958486, + "language_loss": 0.62188816, + "learning_rate": 2.058605592832528e-06, + "loss": 0.64324927, + "num_input_tokens_seen": 180922340, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8415, + "time_per_iteration": 2.617699384689331 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01029289, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03840709, + "epoch": 0.50599729445363, + "flos": 22673387220480.0, + "grad_norm": 1.5996951654726725, + "language_loss": 0.81836188, + "learning_rate": 2.0582162972846515e-06, + "loss": 0.8397311, + "num_input_tokens_seen": 180941350, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8416, + "time_per_iteration": 2.484717607498169 + }, + { + "auxiliary_loss_clip": 0.0110817, + "auxiliary_loss_mlp": 0.01033768, + "balance_loss_clip": 1.02253819, + "balance_loss_mlp": 1.04098511, + "epoch": 0.5060574177062979, + "flos": 22747866071040.0, + "grad_norm": 1.7782267995500585, + "language_loss": 0.79110944, + "learning_rate": 2.0578269995292078e-06, + "loss": 0.81252885, + "num_input_tokens_seen": 180960720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 8417, + "time_per_iteration": 2.544739246368408 + }, + { + "auxiliary_loss_clip": 0.01103419, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01789641, + "balance_loss_mlp": 1.03713858, + "epoch": 0.5061175409589659, + "flos": 21653201139840.0, + "grad_norm": 1.8205649281423022, + "language_loss": 0.62930262, + "learning_rate": 2.0574376995809588e-06, + "loss": 0.65063727, + "num_input_tokens_seen": 180979725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 8418, + "time_per_iteration": 2.4795963764190674 + }, + { + "auxiliary_loss_clip": 0.01109111, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02232397, + "balance_loss_mlp": 1.03859878, + "epoch": 0.5061776642116339, + "flos": 21616249023360.0, + "grad_norm": 2.1933090002480182, + "language_loss": 0.77840686, + "learning_rate": 2.0570483974546653e-06, + "loss": 0.79984379, + "num_input_tokens_seen": 180998980, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8419, + "time_per_iteration": 2.491931915283203 + }, + { + "auxiliary_loss_clip": 0.0110836, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.01950645, + "balance_loss_mlp": 1.0373354, + "epoch": 0.5062377874643018, + "flos": 24426294837120.0, + "grad_norm": 1.7154546366730201, + "language_loss": 0.77258635, + "learning_rate": 2.0566590931650917e-06, + "loss": 0.79399723, + "num_input_tokens_seen": 181019165, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8420, + "time_per_iteration": 2.5963363647460938 + }, + { + "auxiliary_loss_clip": 0.01109095, + "auxiliary_loss_mlp": 0.01037587, + "balance_loss_clip": 1.02459884, + "balance_loss_mlp": 1.03782094, + "epoch": 0.5062979107169698, + "flos": 22524429519360.0, + "grad_norm": 1.679092087125118, + "language_loss": 0.77511621, + "learning_rate": 2.056269786726999e-06, + "loss": 0.79658306, + "num_input_tokens_seen": 181037110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8421, + "time_per_iteration": 2.4954135417938232 + }, + { + "auxiliary_loss_clip": 0.01105449, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03668654, + "epoch": 0.5063580339696377, + "flos": 24571984400640.0, + "grad_norm": 1.4641430762434493, + "language_loss": 0.66987717, + "learning_rate": 2.0558804781551512e-06, + "loss": 0.69122434, + "num_input_tokens_seen": 181057775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 8422, + "time_per_iteration": 2.4802937507629395 + }, + { + "auxiliary_loss_clip": 0.01109498, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.01998544, + "balance_loss_mlp": 1.04081178, + "epoch": 0.5064181572223058, + "flos": 22596143022720.0, + "grad_norm": 1.8050040320885787, + "language_loss": 0.81599188, + "learning_rate": 2.05549116746431e-06, + "loss": 0.83741009, + "num_input_tokens_seen": 181078260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8423, + "time_per_iteration": 2.591792345046997 + }, + { + "auxiliary_loss_clip": 0.01109343, + "auxiliary_loss_mlp": 0.01031623, + "balance_loss_clip": 1.01859319, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5064782804749737, + "flos": 25994944661760.0, + "grad_norm": 1.8632464802837558, + "language_loss": 0.74227667, + "learning_rate": 2.055101854669237e-06, + "loss": 0.76368636, + "num_input_tokens_seen": 181098755, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8424, + "time_per_iteration": 2.5076076984405518 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01033895, + "balance_loss_clip": 1.02120495, + "balance_loss_mlp": 1.03742146, + "epoch": 0.5065384037276417, + "flos": 28553041503360.0, + "grad_norm": 1.6339612294396895, + "language_loss": 0.71546394, + "learning_rate": 2.0547125397846975e-06, + "loss": 0.73685586, + "num_input_tokens_seen": 181121570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8425, + "time_per_iteration": 2.570103406906128 + }, + { + "auxiliary_loss_clip": 0.01108568, + "auxiliary_loss_mlp": 0.01035107, + "balance_loss_clip": 1.02325118, + "balance_loss_mlp": 1.0379858, + "epoch": 0.5065985269803096, + "flos": 22966023323520.0, + "grad_norm": 1.6987499343502257, + "language_loss": 0.78614688, + "learning_rate": 2.0543232228254524e-06, + "loss": 0.80758357, + "num_input_tokens_seen": 181140240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8426, + "time_per_iteration": 2.4616403579711914 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01035589, + "balance_loss_clip": 1.02312577, + "balance_loss_mlp": 1.03994358, + "epoch": 0.5066586502329776, + "flos": 21608563512960.0, + "grad_norm": 2.818748758654822, + "language_loss": 0.77855921, + "learning_rate": 2.053933903806265e-06, + "loss": 0.80002636, + "num_input_tokens_seen": 181158630, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8427, + "time_per_iteration": 3.908625364303589 + }, + { + "auxiliary_loss_clip": 0.0110433, + "auxiliary_loss_mlp": 0.01026092, + "balance_loss_clip": 1.01382565, + "balance_loss_mlp": 1.03709817, + "epoch": 0.5067187734856455, + "flos": 20339912079360.0, + "grad_norm": 1.8142719003609429, + "language_loss": 0.71444368, + "learning_rate": 2.0535445827418997e-06, + "loss": 0.73574793, + "num_input_tokens_seen": 181176405, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 8428, + "time_per_iteration": 2.4540021419525146 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01031645, + "balance_loss_clip": 1.01983786, + "balance_loss_mlp": 1.03622389, + "epoch": 0.5067788967383136, + "flos": 28841080665600.0, + "grad_norm": 1.6344761677930288, + "language_loss": 0.82693905, + "learning_rate": 2.0531552596471168e-06, + "loss": 0.84830469, + "num_input_tokens_seen": 181197595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 8429, + "time_per_iteration": 3.977104902267456 + }, + { + "auxiliary_loss_clip": 0.01113682, + "auxiliary_loss_mlp": 0.01036238, + "balance_loss_clip": 1.02267253, + "balance_loss_mlp": 1.04074979, + "epoch": 0.5068390199909815, + "flos": 32450174478720.0, + "grad_norm": 2.1730745276419485, + "language_loss": 0.73167485, + "learning_rate": 2.052765934536682e-06, + "loss": 0.75317407, + "num_input_tokens_seen": 181218560, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8430, + "time_per_iteration": 4.066487073898315 + }, + { + "auxiliary_loss_clip": 0.01109473, + "auxiliary_loss_mlp": 0.01031357, + "balance_loss_clip": 1.01953173, + "balance_loss_mlp": 1.03904748, + "epoch": 0.5068991432436495, + "flos": 23146582014720.0, + "grad_norm": 1.7614160050819483, + "language_loss": 0.76304209, + "learning_rate": 2.0523766074253575e-06, + "loss": 0.78445041, + "num_input_tokens_seen": 181237095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8431, + "time_per_iteration": 2.459061861038208 + }, + { + "auxiliary_loss_clip": 0.01107362, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01899588, + "balance_loss_mlp": 1.0388869, + "epoch": 0.5069592664963174, + "flos": 19936096404480.0, + "grad_norm": 1.4179396940955034, + "language_loss": 0.72168291, + "learning_rate": 2.0519872783279074e-06, + "loss": 0.74307233, + "num_input_tokens_seen": 181255940, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8432, + "time_per_iteration": 2.4937191009521484 + }, + { + "auxiliary_loss_clip": 0.01040308, + "auxiliary_loss_mlp": 0.0100546, + "balance_loss_clip": 1.00428617, + "balance_loss_mlp": 1.01756871, + "epoch": 0.5070193897489854, + "flos": 65793771941760.0, + "grad_norm": 0.7612043046384747, + "language_loss": 0.63704848, + "learning_rate": 2.0515979472590945e-06, + "loss": 0.65750623, + "num_input_tokens_seen": 181316945, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22753906, + "step": 8433, + "time_per_iteration": 3.10312819480896 + }, + { + "auxiliary_loss_clip": 0.01109071, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.02276051, + "balance_loss_mlp": 1.0391171, + "epoch": 0.5070795130016534, + "flos": 17275331514240.0, + "grad_norm": 1.7667352609332163, + "language_loss": 0.77104461, + "learning_rate": 2.051208614233681e-06, + "loss": 0.79249096, + "num_input_tokens_seen": 181335555, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8434, + "time_per_iteration": 2.4761765003204346 + }, + { + "auxiliary_loss_clip": 0.01110101, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01997447, + "balance_loss_mlp": 1.03937244, + "epoch": 0.5071396362543213, + "flos": 21069940095360.0, + "grad_norm": 1.7167508969307774, + "language_loss": 0.71062863, + "learning_rate": 2.0508192792664326e-06, + "loss": 0.73205119, + "num_input_tokens_seen": 181354580, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8435, + "time_per_iteration": 2.476259231567383 + }, + { + "auxiliary_loss_clip": 0.01111512, + "auxiliary_loss_mlp": 0.01034776, + "balance_loss_clip": 1.02086425, + "balance_loss_mlp": 1.04086459, + "epoch": 0.5071997595069894, + "flos": 23144822248320.0, + "grad_norm": 2.1519666669040407, + "language_loss": 0.71635526, + "learning_rate": 2.050429942372112e-06, + "loss": 0.73781812, + "num_input_tokens_seen": 181374320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.70703125, + "step": 8436, + "time_per_iteration": 2.4717278480529785 + }, + { + "auxiliary_loss_clip": 0.0111073, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01621652, + "balance_loss_mlp": 1.04132712, + "epoch": 0.5072598827596573, + "flos": 22747183712640.0, + "grad_norm": 1.5051036444651287, + "language_loss": 0.8370682, + "learning_rate": 2.050040603565483e-06, + "loss": 0.85846984, + "num_input_tokens_seen": 181392190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 8437, + "time_per_iteration": 2.51187801361084 + }, + { + "auxiliary_loss_clip": 0.01107572, + "auxiliary_loss_mlp": 0.01025487, + "balance_loss_clip": 1.01340485, + "balance_loss_mlp": 1.03941774, + "epoch": 0.5073200060123253, + "flos": 22566301799040.0, + "grad_norm": 1.8339895444539178, + "language_loss": 0.80925703, + "learning_rate": 2.049651262861309e-06, + "loss": 0.83058763, + "num_input_tokens_seen": 181413890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8438, + "time_per_iteration": 2.5101053714752197 + }, + { + "auxiliary_loss_clip": 0.01112175, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.023947, + "balance_loss_mlp": 1.04053128, + "epoch": 0.5073801292649932, + "flos": 25806341324160.0, + "grad_norm": 1.458277190934999, + "language_loss": 0.79797888, + "learning_rate": 2.0492619202743543e-06, + "loss": 0.81948024, + "num_input_tokens_seen": 181433240, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 8439, + "time_per_iteration": 2.5196681022644043 + }, + { + "auxiliary_loss_clip": 0.01107511, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02176344, + "balance_loss_mlp": 1.03948164, + "epoch": 0.5074402525176612, + "flos": 25373941401600.0, + "grad_norm": 1.5054968059802218, + "language_loss": 0.7129699, + "learning_rate": 2.048872575819383e-06, + "loss": 0.73437822, + "num_input_tokens_seen": 181453535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8440, + "time_per_iteration": 2.482475757598877 + }, + { + "auxiliary_loss_clip": 0.01110635, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.01877761, + "balance_loss_mlp": 1.03933895, + "epoch": 0.5075003757703291, + "flos": 26064431521920.0, + "grad_norm": 1.6937518353915977, + "language_loss": 0.70555139, + "learning_rate": 2.048483229511158e-06, + "loss": 0.72696882, + "num_input_tokens_seen": 181474195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8441, + "time_per_iteration": 2.5299065113067627 + }, + { + "auxiliary_loss_clip": 0.01113885, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.0219456, + "balance_loss_mlp": 1.04142308, + "epoch": 0.5075604990229972, + "flos": 21835447770240.0, + "grad_norm": 1.8980066327338418, + "language_loss": 0.63670987, + "learning_rate": 2.0480938813644445e-06, + "loss": 0.65819889, + "num_input_tokens_seen": 181494000, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8442, + "time_per_iteration": 2.4623775482177734 + }, + { + "auxiliary_loss_clip": 0.01108296, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.016011, + "balance_loss_mlp": 1.04047632, + "epoch": 0.5076206222756651, + "flos": 31978703537280.0, + "grad_norm": 1.5153774279484464, + "language_loss": 0.7150898, + "learning_rate": 2.047704531394006e-06, + "loss": 0.73644972, + "num_input_tokens_seen": 181515955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 8443, + "time_per_iteration": 2.586273670196533 + }, + { + "auxiliary_loss_clip": 0.01110925, + "auxiliary_loss_mlp": 0.01033689, + "balance_loss_clip": 1.02046299, + "balance_loss_mlp": 1.03887248, + "epoch": 0.5076807455283331, + "flos": 36904031326080.0, + "grad_norm": 1.223488951652841, + "language_loss": 0.61766541, + "learning_rate": 2.047315179614607e-06, + "loss": 0.63911152, + "num_input_tokens_seen": 181540225, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8444, + "time_per_iteration": 2.5941321849823 + }, + { + "auxiliary_loss_clip": 0.01107921, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02056909, + "balance_loss_mlp": 1.0380075, + "epoch": 0.507740868781001, + "flos": 29862415981440.0, + "grad_norm": 1.7476957798256931, + "language_loss": 0.6370405, + "learning_rate": 2.046925826041012e-06, + "loss": 0.65844774, + "num_input_tokens_seen": 181560125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 8445, + "time_per_iteration": 2.622295379638672 + }, + { + "auxiliary_loss_clip": 0.01042597, + "auxiliary_loss_mlp": 0.01005213, + "balance_loss_clip": 1.00411069, + "balance_loss_mlp": 1.019732, + "epoch": 0.507800992033669, + "flos": 61918974247680.0, + "grad_norm": 0.8272934825203048, + "language_loss": 0.61873507, + "learning_rate": 2.0465364706879845e-06, + "loss": 0.6392132, + "num_input_tokens_seen": 181618830, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.22851562, + "step": 8446, + "time_per_iteration": 3.106067180633545 + }, + { + "auxiliary_loss_clip": 0.01107421, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01656127, + "balance_loss_mlp": 1.03849411, + "epoch": 0.507861115286337, + "flos": 20700490757760.0, + "grad_norm": 1.6783761303243148, + "language_loss": 0.80458808, + "learning_rate": 2.04614711357029e-06, + "loss": 0.82595056, + "num_input_tokens_seen": 181637120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8447, + "time_per_iteration": 2.483449935913086 + }, + { + "auxiliary_loss_clip": 0.01109683, + "auxiliary_loss_mlp": 0.01031748, + "balance_loss_clip": 1.01955903, + "balance_loss_mlp": 1.04166472, + "epoch": 0.507921238539005, + "flos": 30847050576000.0, + "grad_norm": 1.6097524760484219, + "language_loss": 0.70526159, + "learning_rate": 2.0457577547026916e-06, + "loss": 0.72667593, + "num_input_tokens_seen": 181659965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 8448, + "time_per_iteration": 2.5377211570739746 + }, + { + "auxiliary_loss_clip": 0.01108561, + "auxiliary_loss_mlp": 0.01030678, + "balance_loss_clip": 1.01906157, + "balance_loss_mlp": 1.04054332, + "epoch": 0.507981361791673, + "flos": 35700197984640.0, + "grad_norm": 1.775058362169557, + "language_loss": 0.72186208, + "learning_rate": 2.045368394099955e-06, + "loss": 0.74325454, + "num_input_tokens_seen": 181685290, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8449, + "time_per_iteration": 2.6247637271881104 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01862371, + "balance_loss_mlp": 1.0373019, + "epoch": 0.5080414850443409, + "flos": 27161466750720.0, + "grad_norm": 1.4717194557779922, + "language_loss": 0.72751403, + "learning_rate": 2.044979031776844e-06, + "loss": 0.74887294, + "num_input_tokens_seen": 181706080, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8450, + "time_per_iteration": 2.5097148418426514 + }, + { + "auxiliary_loss_clip": 0.01112404, + "auxiliary_loss_mlp": 0.01033223, + "balance_loss_clip": 1.02104533, + "balance_loss_mlp": 1.04217696, + "epoch": 0.5081016082970089, + "flos": 27085192220160.0, + "grad_norm": 1.631370100986613, + "language_loss": 0.7704621, + "learning_rate": 2.0445896677481234e-06, + "loss": 0.7919184, + "num_input_tokens_seen": 181724805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8451, + "time_per_iteration": 2.5109496116638184 + }, + { + "auxiliary_loss_clip": 0.01109885, + "auxiliary_loss_mlp": 0.01036862, + "balance_loss_clip": 1.02502477, + "balance_loss_mlp": 1.03928411, + "epoch": 0.5081617315496768, + "flos": 22856531690880.0, + "grad_norm": 1.7784899256909827, + "language_loss": 0.8518312, + "learning_rate": 2.044200302028559e-06, + "loss": 0.8732987, + "num_input_tokens_seen": 181743725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 8452, + "time_per_iteration": 2.4603476524353027 + }, + { + "auxiliary_loss_clip": 0.01115612, + "auxiliary_loss_mlp": 0.01036365, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.04209125, + "epoch": 0.5082218548023448, + "flos": 16281898087680.0, + "grad_norm": 2.2856093940760274, + "language_loss": 0.78046912, + "learning_rate": 2.0438109346329143e-06, + "loss": 0.80198884, + "num_input_tokens_seen": 181757720, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 8453, + "time_per_iteration": 2.450873613357544 + }, + { + "auxiliary_loss_clip": 0.01106928, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02200532, + "balance_loss_mlp": 1.03973246, + "epoch": 0.5082819780550127, + "flos": 24460768915200.0, + "grad_norm": 1.6556718901191125, + "language_loss": 0.7626555, + "learning_rate": 2.0434215655759544e-06, + "loss": 0.78406799, + "num_input_tokens_seen": 181778545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 8454, + "time_per_iteration": 2.4831783771514893 + }, + { + "auxiliary_loss_clip": 0.01110162, + "auxiliary_loss_mlp": 0.01032609, + "balance_loss_clip": 1.01998448, + "balance_loss_mlp": 1.03985167, + "epoch": 0.5083421013076808, + "flos": 23403271582080.0, + "grad_norm": 1.7440679508015728, + "language_loss": 0.89345592, + "learning_rate": 2.0430321948724446e-06, + "loss": 0.91488367, + "num_input_tokens_seen": 181799495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8455, + "time_per_iteration": 2.48486590385437 + }, + { + "auxiliary_loss_clip": 0.01116133, + "auxiliary_loss_mlp": 0.01035999, + "balance_loss_clip": 1.02230144, + "balance_loss_mlp": 1.04198599, + "epoch": 0.5084022245603487, + "flos": 23872695448320.0, + "grad_norm": 2.029385394187206, + "language_loss": 0.62613618, + "learning_rate": 2.042642822537149e-06, + "loss": 0.64765751, + "num_input_tokens_seen": 181818400, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8456, + "time_per_iteration": 2.476060390472412 + }, + { + "auxiliary_loss_clip": 0.01038842, + "auxiliary_loss_mlp": 0.00998694, + "balance_loss_clip": 0.99766272, + "balance_loss_mlp": 1.01592362, + "epoch": 0.5084623478130167, + "flos": 62873336655360.0, + "grad_norm": 0.816065361839575, + "language_loss": 0.62538505, + "learning_rate": 2.0422534485848343e-06, + "loss": 0.64576042, + "num_input_tokens_seen": 181875975, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.22949219, + "step": 8457, + "time_per_iteration": 2.9627416133880615 + }, + { + "auxiliary_loss_clip": 0.01110833, + "auxiliary_loss_mlp": 0.01033568, + "balance_loss_clip": 1.02069306, + "balance_loss_mlp": 1.04062462, + "epoch": 0.5085224710656846, + "flos": 22346133384960.0, + "grad_norm": 1.5574868486202833, + "language_loss": 0.67412502, + "learning_rate": 2.0418640730302644e-06, + "loss": 0.69556904, + "num_input_tokens_seen": 181896450, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8458, + "time_per_iteration": 2.4851465225219727 + }, + { + "auxiliary_loss_clip": 0.01109854, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01840341, + "balance_loss_mlp": 1.03811622, + "epoch": 0.5085825943183526, + "flos": 26066263115520.0, + "grad_norm": 1.6253676139168076, + "language_loss": 0.77861875, + "learning_rate": 2.0414746958882043e-06, + "loss": 0.80003208, + "num_input_tokens_seen": 181916770, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8459, + "time_per_iteration": 2.5043020248413086 + }, + { + "auxiliary_loss_clip": 0.01117652, + "auxiliary_loss_mlp": 0.01035652, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.04386926, + "epoch": 0.5086427175710206, + "flos": 17420733768960.0, + "grad_norm": 2.213093169353168, + "language_loss": 0.81109118, + "learning_rate": 2.0410853171734196e-06, + "loss": 0.83262426, + "num_input_tokens_seen": 181932710, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8460, + "time_per_iteration": 2.4239838123321533 + }, + { + "auxiliary_loss_clip": 0.01111375, + "auxiliary_loss_mlp": 0.010378, + "balance_loss_clip": 1.02565289, + "balance_loss_mlp": 1.03999329, + "epoch": 0.5087028408236886, + "flos": 20631758083200.0, + "grad_norm": 1.5640945155523684, + "language_loss": 0.6866132, + "learning_rate": 2.0406959369006754e-06, + "loss": 0.70810497, + "num_input_tokens_seen": 181950665, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8461, + "time_per_iteration": 2.469954490661621 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01032567, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03908265, + "epoch": 0.5087629640763566, + "flos": 25593822506880.0, + "grad_norm": 1.5611830538381608, + "language_loss": 0.76059598, + "learning_rate": 2.0403065550847375e-06, + "loss": 0.7819975, + "num_input_tokens_seen": 181971270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8462, + "time_per_iteration": 2.4907591342926025 + }, + { + "auxiliary_loss_clip": 0.01111001, + "auxiliary_loss_mlp": 0.01036225, + "balance_loss_clip": 1.02376187, + "balance_loss_mlp": 1.04031515, + "epoch": 0.5088230873290245, + "flos": 13261631927040.0, + "grad_norm": 1.977849325123916, + "language_loss": 0.8121528, + "learning_rate": 2.0399171717403706e-06, + "loss": 0.83362508, + "num_input_tokens_seen": 181988410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 8463, + "time_per_iteration": 2.460604190826416 + }, + { + "auxiliary_loss_clip": 0.01109081, + "auxiliary_loss_mlp": 0.01037256, + "balance_loss_clip": 1.02527571, + "balance_loss_mlp": 1.03999758, + "epoch": 0.5088832105816925, + "flos": 20043469134720.0, + "grad_norm": 1.7045720874408852, + "language_loss": 0.7630803, + "learning_rate": 2.039527786882341e-06, + "loss": 0.78454363, + "num_input_tokens_seen": 182006530, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8464, + "time_per_iteration": 2.4307475090026855 + }, + { + "auxiliary_loss_clip": 0.01037487, + "auxiliary_loss_mlp": 0.01005228, + "balance_loss_clip": 1.00426793, + "balance_loss_mlp": 1.01476121, + "epoch": 0.5089433338343604, + "flos": 67422179018880.0, + "grad_norm": 0.687733273493157, + "language_loss": 0.59352195, + "learning_rate": 2.0391384005254133e-06, + "loss": 0.61394918, + "num_input_tokens_seen": 182074240, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.2265625, + "step": 8465, + "time_per_iteration": 3.1989307403564453 + }, + { + "auxiliary_loss_clip": 0.01106873, + "auxiliary_loss_mlp": 0.01035022, + "balance_loss_clip": 1.02263045, + "balance_loss_mlp": 1.03822207, + "epoch": 0.5090034570870284, + "flos": 22710339336960.0, + "grad_norm": 1.7579634525926484, + "language_loss": 0.79857922, + "learning_rate": 2.038749012684354e-06, + "loss": 0.81999815, + "num_input_tokens_seen": 182093360, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8466, + "time_per_iteration": 2.472186326980591 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01032192, + "balance_loss_clip": 1.01950181, + "balance_loss_mlp": 1.03679371, + "epoch": 0.5090635803396963, + "flos": 20445812352000.0, + "grad_norm": 1.5999387152583837, + "language_loss": 0.78222281, + "learning_rate": 2.0383596233739286e-06, + "loss": 0.80359334, + "num_input_tokens_seen": 182110170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8467, + "time_per_iteration": 2.4692180156707764 + }, + { + "auxiliary_loss_clip": 0.01107209, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.02041364, + "balance_loss_mlp": 1.03994191, + "epoch": 0.5091237035923644, + "flos": 23768878164480.0, + "grad_norm": 1.7540939283261232, + "language_loss": 0.7467652, + "learning_rate": 2.0379702326089013e-06, + "loss": 0.76815927, + "num_input_tokens_seen": 182129570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8468, + "time_per_iteration": 3.8722333908081055 + }, + { + "auxiliary_loss_clip": 0.01107691, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01877546, + "balance_loss_mlp": 1.03856027, + "epoch": 0.5091838268450323, + "flos": 18327908684160.0, + "grad_norm": 1.7320149470681812, + "language_loss": 0.77835757, + "learning_rate": 2.03758084040404e-06, + "loss": 0.79974556, + "num_input_tokens_seen": 182147565, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8469, + "time_per_iteration": 2.4514496326446533 + }, + { + "auxiliary_loss_clip": 0.01112445, + "auxiliary_loss_mlp": 0.01035475, + "balance_loss_clip": 1.0221895, + "balance_loss_mlp": 1.04265046, + "epoch": 0.5092439500977003, + "flos": 29057621806080.0, + "grad_norm": 1.5013208791161945, + "language_loss": 0.69422746, + "learning_rate": 2.037191446774109e-06, + "loss": 0.71570665, + "num_input_tokens_seen": 182169695, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8470, + "time_per_iteration": 2.5658817291259766 + }, + { + "auxiliary_loss_clip": 0.01112957, + "auxiliary_loss_mlp": 0.01033067, + "balance_loss_clip": 1.01997817, + "balance_loss_mlp": 1.04058552, + "epoch": 0.5093040733503682, + "flos": 13553908894080.0, + "grad_norm": 2.018231732442679, + "language_loss": 0.73409355, + "learning_rate": 2.0368020517338745e-06, + "loss": 0.75555384, + "num_input_tokens_seen": 182186385, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8471, + "time_per_iteration": 5.355906009674072 + }, + { + "auxiliary_loss_clip": 0.01036047, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00242805, + "balance_loss_mlp": 1.01322865, + "epoch": 0.5093641966030362, + "flos": 68906617407360.0, + "grad_norm": 0.7572542385247485, + "language_loss": 0.58153868, + "learning_rate": 2.036412655298103e-06, + "loss": 0.60193354, + "num_input_tokens_seen": 182247095, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22851562, + "step": 8472, + "time_per_iteration": 3.0752861499786377 + }, + { + "auxiliary_loss_clip": 0.01111139, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.02100456, + "balance_loss_mlp": 1.04138827, + "epoch": 0.5094243198557042, + "flos": 21580948932480.0, + "grad_norm": 1.783541878810952, + "language_loss": 0.69200397, + "learning_rate": 2.03602325748156e-06, + "loss": 0.71344012, + "num_input_tokens_seen": 182266380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 8473, + "time_per_iteration": 2.4832053184509277 + }, + { + "auxiliary_loss_clip": 0.01109225, + "auxiliary_loss_mlp": 0.01033767, + "balance_loss_clip": 1.02144074, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5094844431083722, + "flos": 28840721529600.0, + "grad_norm": 2.2073606957030143, + "language_loss": 0.85564739, + "learning_rate": 2.0356338582990105e-06, + "loss": 0.87707734, + "num_input_tokens_seen": 182284685, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8474, + "time_per_iteration": 2.5068845748901367 + }, + { + "auxiliary_loss_clip": 0.01110669, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.01944494, + "balance_loss_mlp": 1.03983307, + "epoch": 0.5095445663610402, + "flos": 14976114969600.0, + "grad_norm": 2.014074019348489, + "language_loss": 0.64659619, + "learning_rate": 2.035244457765222e-06, + "loss": 0.66802263, + "num_input_tokens_seen": 182301810, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8475, + "time_per_iteration": 2.4363739490509033 + }, + { + "auxiliary_loss_clip": 0.01115225, + "auxiliary_loss_mlp": 0.01038799, + "balance_loss_clip": 1.02557325, + "balance_loss_mlp": 1.04094887, + "epoch": 0.5096046896137081, + "flos": 20777088510720.0, + "grad_norm": 4.024838672705198, + "language_loss": 0.81962836, + "learning_rate": 2.0348550558949605e-06, + "loss": 0.84116852, + "num_input_tokens_seen": 182320285, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 8476, + "time_per_iteration": 2.448249578475952 + }, + { + "auxiliary_loss_clip": 0.01111186, + "auxiliary_loss_mlp": 0.01035572, + "balance_loss_clip": 1.02019382, + "balance_loss_mlp": 1.03794646, + "epoch": 0.5096648128663761, + "flos": 23185078416000.0, + "grad_norm": 1.9611523426566915, + "language_loss": 0.81148994, + "learning_rate": 2.0344656527029917e-06, + "loss": 0.83295757, + "num_input_tokens_seen": 182339465, + "router_z_loss_clip": 0.15332031, + "router_z_loss_mlp": 0.734375, + "step": 8477, + "time_per_iteration": 2.470248222351074 + }, + { + "auxiliary_loss_clip": 0.01111185, + "auxiliary_loss_mlp": 0.01029928, + "balance_loss_clip": 1.01584899, + "balance_loss_mlp": 1.03962493, + "epoch": 0.509724936119044, + "flos": 22309432663680.0, + "grad_norm": 1.8342280591951767, + "language_loss": 0.61682522, + "learning_rate": 2.034076248204082e-06, + "loss": 0.6382364, + "num_input_tokens_seen": 182358375, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71484375, + "step": 8478, + "time_per_iteration": 2.4439172744750977 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01037275, + "balance_loss_clip": 1.02540779, + "balance_loss_mlp": 1.03930426, + "epoch": 0.509785059371712, + "flos": 26287077974400.0, + "grad_norm": 1.4883331760724325, + "language_loss": 0.65860271, + "learning_rate": 2.0336868424129968e-06, + "loss": 0.6800639, + "num_input_tokens_seen": 182377935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 8479, + "time_per_iteration": 2.4965710639953613 + }, + { + "auxiliary_loss_clip": 0.01107177, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01974487, + "balance_loss_mlp": 1.0389936, + "epoch": 0.50984518262438, + "flos": 22964586779520.0, + "grad_norm": 1.620468938265791, + "language_loss": 0.69455707, + "learning_rate": 2.0332974353445037e-06, + "loss": 0.71594626, + "num_input_tokens_seen": 182396440, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 8480, + "time_per_iteration": 2.4500057697296143 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031568, + "balance_loss_clip": 1.01871157, + "balance_loss_mlp": 1.03733814, + "epoch": 0.509905305877048, + "flos": 26213389223040.0, + "grad_norm": 1.6808533459383284, + "language_loss": 0.79027826, + "learning_rate": 2.0329080270133688e-06, + "loss": 0.81168693, + "num_input_tokens_seen": 182415890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8481, + "time_per_iteration": 2.507157564163208 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01034339, + "balance_loss_clip": 1.02170324, + "balance_loss_mlp": 1.03702283, + "epoch": 0.5099654291297159, + "flos": 20340055733760.0, + "grad_norm": 1.5080021873745288, + "language_loss": 0.83429766, + "learning_rate": 2.0325186174343578e-06, + "loss": 0.85568231, + "num_input_tokens_seen": 182434235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 8482, + "time_per_iteration": 2.4544076919555664 + }, + { + "auxiliary_loss_clip": 0.0111291, + "auxiliary_loss_mlp": 0.01032891, + "balance_loss_clip": 1.01925349, + "balance_loss_mlp": 1.03990221, + "epoch": 0.5100255523823839, + "flos": 29054820545280.0, + "grad_norm": 1.7853243252822575, + "language_loss": 0.85625446, + "learning_rate": 2.032129206622238e-06, + "loss": 0.87771249, + "num_input_tokens_seen": 182454360, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 8483, + "time_per_iteration": 2.519747734069824 + }, + { + "auxiliary_loss_clip": 0.01107969, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.01965284, + "balance_loss_mlp": 1.03712344, + "epoch": 0.5100856756350518, + "flos": 22455912326400.0, + "grad_norm": 1.7164607290812173, + "language_loss": 0.83208412, + "learning_rate": 2.031739794591775e-06, + "loss": 0.85348231, + "num_input_tokens_seen": 182471940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8484, + "time_per_iteration": 2.4549949169158936 + }, + { + "auxiliary_loss_clip": 0.01109177, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0154798, + "balance_loss_mlp": 1.03849459, + "epoch": 0.5101457988877198, + "flos": 19171055606400.0, + "grad_norm": 2.0216137506651983, + "language_loss": 0.81388122, + "learning_rate": 2.031350381357736e-06, + "loss": 0.83525884, + "num_input_tokens_seen": 182490685, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8485, + "time_per_iteration": 2.4612390995025635 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01032642, + "balance_loss_clip": 1.02036929, + "balance_loss_mlp": 1.03675199, + "epoch": 0.5102059221403878, + "flos": 14866371941760.0, + "grad_norm": 2.1191716083834025, + "language_loss": 0.73653662, + "learning_rate": 2.0309609669348874e-06, + "loss": 0.7578969, + "num_input_tokens_seen": 182508325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 8486, + "time_per_iteration": 2.426042318344116 + }, + { + "auxiliary_loss_clip": 0.01112031, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.01824152, + "balance_loss_mlp": 1.03990436, + "epoch": 0.5102660453930558, + "flos": 22961103160320.0, + "grad_norm": 1.4808929350883289, + "language_loss": 0.69956315, + "learning_rate": 2.0305715513379953e-06, + "loss": 0.72099566, + "num_input_tokens_seen": 182527020, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 8487, + "time_per_iteration": 2.5032570362091064 + }, + { + "auxiliary_loss_clip": 0.01108669, + "auxiliary_loss_mlp": 0.01033355, + "balance_loss_clip": 1.01987231, + "balance_loss_mlp": 1.04012084, + "epoch": 0.5103261686457238, + "flos": 23149311448320.0, + "grad_norm": 1.9552461936614123, + "language_loss": 0.72984374, + "learning_rate": 2.030182134581827e-06, + "loss": 0.75126404, + "num_input_tokens_seen": 182543505, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 8488, + "time_per_iteration": 2.454589605331421 + }, + { + "auxiliary_loss_clip": 0.011087, + "auxiliary_loss_mlp": 0.01032468, + "balance_loss_clip": 1.02002835, + "balance_loss_mlp": 1.03795087, + "epoch": 0.5103862918983917, + "flos": 14319237000960.0, + "grad_norm": 1.814097723080907, + "language_loss": 0.69584548, + "learning_rate": 2.0297927166811503e-06, + "loss": 0.71725714, + "num_input_tokens_seen": 182562250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8489, + "time_per_iteration": 2.4295358657836914 + }, + { + "auxiliary_loss_clip": 0.01108544, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01800227, + "balance_loss_mlp": 1.03788161, + "epoch": 0.5104464151510597, + "flos": 25848536826240.0, + "grad_norm": 1.8877500438207433, + "language_loss": 0.72447532, + "learning_rate": 2.0294032976507297e-06, + "loss": 0.7458632, + "num_input_tokens_seen": 182581910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8490, + "time_per_iteration": 2.484398603439331 + }, + { + "auxiliary_loss_clip": 0.01105533, + "auxiliary_loss_mlp": 0.01028247, + "balance_loss_clip": 1.01649261, + "balance_loss_mlp": 1.03803921, + "epoch": 0.5105065384037276, + "flos": 21652913831040.0, + "grad_norm": 1.594832362291185, + "language_loss": 0.80287743, + "learning_rate": 2.0290138775053337e-06, + "loss": 0.82421523, + "num_input_tokens_seen": 182601350, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 8491, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01028042, + "balance_loss_clip": 1.0155549, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5105666616563956, + "flos": 22491571553280.0, + "grad_norm": 2.311833139697555, + "language_loss": 0.79033649, + "learning_rate": 2.028624456259728e-06, + "loss": 0.81164801, + "num_input_tokens_seen": 182619660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 8492, + "time_per_iteration": 2.4697651863098145 + }, + { + "auxiliary_loss_clip": 0.01114847, + "auxiliary_loss_mlp": 0.01038448, + "balance_loss_clip": 1.02560329, + "balance_loss_mlp": 1.04234147, + "epoch": 0.5106267849090635, + "flos": 22455768672000.0, + "grad_norm": 2.1680982451379607, + "language_loss": 0.77821648, + "learning_rate": 2.0282350339286804e-06, + "loss": 0.79974937, + "num_input_tokens_seen": 182639815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 8493, + "time_per_iteration": 2.490349054336548 + }, + { + "auxiliary_loss_clip": 0.01109447, + "auxiliary_loss_mlp": 0.01029414, + "balance_loss_clip": 1.01608634, + "balance_loss_mlp": 1.03989387, + "epoch": 0.5106869081617316, + "flos": 23547093638400.0, + "grad_norm": 2.213061013784994, + "language_loss": 0.83690828, + "learning_rate": 2.0278456105269574e-06, + "loss": 0.85829687, + "num_input_tokens_seen": 182659655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 8494, + "time_per_iteration": 2.4604976177215576 + }, + { + "auxiliary_loss_clip": 0.01112511, + "auxiliary_loss_mlp": 0.01033364, + "balance_loss_clip": 1.02189648, + "balance_loss_mlp": 1.04180336, + "epoch": 0.5107470314143995, + "flos": 26792987080320.0, + "grad_norm": 1.8678450133518327, + "language_loss": 0.79117751, + "learning_rate": 2.027456186069326e-06, + "loss": 0.81263626, + "num_input_tokens_seen": 182677075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.70703125, + "step": 8495, + "time_per_iteration": 2.5202648639678955 + }, + { + "auxiliary_loss_clip": 0.01109453, + "auxiliary_loss_mlp": 0.01034821, + "balance_loss_clip": 1.02276945, + "balance_loss_mlp": 1.04033172, + "epoch": 0.5108071546670675, + "flos": 25739691638400.0, + "grad_norm": 1.5685043948688704, + "language_loss": 0.78221929, + "learning_rate": 2.0270667605705535e-06, + "loss": 0.80366194, + "num_input_tokens_seen": 182699625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 8496, + "time_per_iteration": 2.499793767929077 + }, + { + "auxiliary_loss_clip": 0.01105005, + "auxiliary_loss_mlp": 0.01026512, + "balance_loss_clip": 1.01508582, + "balance_loss_mlp": 1.03803635, + "epoch": 0.5108672779197354, + "flos": 18697537589760.0, + "grad_norm": 1.9336450862291243, + "language_loss": 0.7876817, + "learning_rate": 2.0266773340454066e-06, + "loss": 0.8089968, + "num_input_tokens_seen": 182717020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 8497, + "time_per_iteration": 2.450246572494507 + }, + { + "auxiliary_loss_clip": 0.01106851, + "auxiliary_loss_mlp": 0.01032259, + "balance_loss_clip": 1.0203619, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5109274011724034, + "flos": 26688164215680.0, + "grad_norm": 1.6296784083005205, + "language_loss": 0.8186121, + "learning_rate": 2.0262879065086525e-06, + "loss": 0.84000313, + "num_input_tokens_seen": 182736955, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8498, + "time_per_iteration": 2.4860284328460693 + }, + { + "auxiliary_loss_clip": 0.0110713, + "auxiliary_loss_mlp": 0.01028216, + "balance_loss_clip": 1.01559711, + "balance_loss_mlp": 1.03989053, + "epoch": 0.5109875244250714, + "flos": 22784028088320.0, + "grad_norm": 1.9511970266493632, + "language_loss": 0.71084464, + "learning_rate": 2.0258984779750584e-06, + "loss": 0.73219806, + "num_input_tokens_seen": 182757620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 8499, + "time_per_iteration": 2.488870859146118 + }, + { + "auxiliary_loss_clip": 0.01108699, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01899862, + "balance_loss_mlp": 1.03962827, + "epoch": 0.5110476476777394, + "flos": 35588515622400.0, + "grad_norm": 1.470448999091522, + "language_loss": 0.72600758, + "learning_rate": 2.0255090484593914e-06, + "loss": 0.74740595, + "num_input_tokens_seen": 182780195, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 8500, + "time_per_iteration": 2.554612874984741 + }, + { + "auxiliary_loss_clip": 0.01113166, + "auxiliary_loss_mlp": 0.01032162, + "balance_loss_clip": 1.01870334, + "balance_loss_mlp": 1.03988254, + "epoch": 0.5111077709304074, + "flos": 19280798634240.0, + "grad_norm": 2.631045408977224, + "language_loss": 0.63011086, + "learning_rate": 2.0251196179764183e-06, + "loss": 0.65156412, + "num_input_tokens_seen": 182795765, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8501, + "time_per_iteration": 2.4470977783203125 + }, + { + "auxiliary_loss_clip": 0.01109012, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.03708565, + "epoch": 0.5111678941830753, + "flos": 20668207409280.0, + "grad_norm": 1.7479031643347964, + "language_loss": 0.8759163, + "learning_rate": 2.024730186540907e-06, + "loss": 0.89734155, + "num_input_tokens_seen": 182813120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 8502, + "time_per_iteration": 2.4252443313598633 + }, + { + "auxiliary_loss_clip": 0.01103318, + "auxiliary_loss_mlp": 0.01033556, + "balance_loss_clip": 1.02192163, + "balance_loss_mlp": 1.0349071, + "epoch": 0.5112280174357433, + "flos": 26287903987200.0, + "grad_norm": 1.3950925269756227, + "language_loss": 0.82526219, + "learning_rate": 2.0243407541676253e-06, + "loss": 0.84663093, + "num_input_tokens_seen": 182835745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8503, + "time_per_iteration": 2.5170319080352783 + }, + { + "auxiliary_loss_clip": 0.01038121, + "auxiliary_loss_mlp": 0.01001996, + "balance_loss_clip": 1.00103021, + "balance_loss_mlp": 1.01512361, + "epoch": 0.5112881406884112, + "flos": 59474247707520.0, + "grad_norm": 0.8658208518316733, + "language_loss": 0.63857049, + "learning_rate": 2.023951320871339e-06, + "loss": 0.65897167, + "num_input_tokens_seen": 182892540, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.23046875, + "step": 8504, + "time_per_iteration": 3.098529577255249 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01030933, + "balance_loss_clip": 1.01815391, + "balance_loss_mlp": 1.03960776, + "epoch": 0.5113482639410792, + "flos": 26468857728000.0, + "grad_norm": 3.195489539056655, + "language_loss": 0.84326482, + "learning_rate": 2.023561886666816e-06, + "loss": 0.86465514, + "num_input_tokens_seen": 182911515, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 8505, + "time_per_iteration": 2.5145134925842285 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0399797, + "epoch": 0.5114083871937471, + "flos": 29895848565120.0, + "grad_norm": 1.9725783043316722, + "language_loss": 0.75117159, + "learning_rate": 2.0231724515688246e-06, + "loss": 0.77251446, + "num_input_tokens_seen": 182930860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 8506, + "time_per_iteration": 2.529463052749634 + }, + { + "auxiliary_loss_clip": 0.01107977, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.01951551, + "balance_loss_mlp": 1.03808045, + "epoch": 0.5114685104464152, + "flos": 24314576561280.0, + "grad_norm": 1.6477689192158658, + "language_loss": 0.58288801, + "learning_rate": 2.022783015592131e-06, + "loss": 0.60429621, + "num_input_tokens_seen": 182949960, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8507, + "time_per_iteration": 2.515449047088623 + }, + { + "auxiliary_loss_clip": 0.01111035, + "auxiliary_loss_mlp": 0.01039811, + "balance_loss_clip": 1.02690697, + "balance_loss_mlp": 1.04132211, + "epoch": 0.5115286336990831, + "flos": 17019288391680.0, + "grad_norm": 1.6046089096743523, + "language_loss": 0.85276306, + "learning_rate": 2.022393578751503e-06, + "loss": 0.87427151, + "num_input_tokens_seen": 182968085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8508, + "time_per_iteration": 2.4760663509368896 + }, + { + "auxiliary_loss_clip": 0.01110329, + "auxiliary_loss_mlp": 0.01033776, + "balance_loss_clip": 1.02051985, + "balance_loss_mlp": 1.03969765, + "epoch": 0.5115887569517511, + "flos": 23659386531840.0, + "grad_norm": 1.6014168180464263, + "language_loss": 0.72123772, + "learning_rate": 2.022004141061709e-06, + "loss": 0.74267876, + "num_input_tokens_seen": 182987275, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8509, + "time_per_iteration": 2.5354809761047363 + }, + { + "auxiliary_loss_clip": 0.01107381, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.02060962, + "balance_loss_mlp": 1.03980041, + "epoch": 0.511648880204419, + "flos": 16107193313280.0, + "grad_norm": 1.6675565589278303, + "language_loss": 0.75862014, + "learning_rate": 2.0216147025375153e-06, + "loss": 0.78001392, + "num_input_tokens_seen": 183004700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 8510, + "time_per_iteration": 3.945136785507202 + }, + { + "auxiliary_loss_clip": 0.01108162, + "auxiliary_loss_mlp": 0.01033651, + "balance_loss_clip": 1.02163482, + "balance_loss_mlp": 1.04065561, + "epoch": 0.511709003457087, + "flos": 32634970974720.0, + "grad_norm": 1.6646040073598372, + "language_loss": 0.71192694, + "learning_rate": 2.0212252631936907e-06, + "loss": 0.73334503, + "num_input_tokens_seen": 183025830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 8511, + "time_per_iteration": 2.541703701019287 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01763797, + "balance_loss_mlp": 1.03958058, + "epoch": 0.511769126709755, + "flos": 21762082241280.0, + "grad_norm": 2.060947746528677, + "language_loss": 0.66430634, + "learning_rate": 2.020835823045001e-06, + "loss": 0.68565977, + "num_input_tokens_seen": 183045140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 8512, + "time_per_iteration": 5.427145481109619 + }, + { + "auxiliary_loss_clip": 0.01109669, + "auxiliary_loss_mlp": 0.01036594, + "balance_loss_clip": 1.02326632, + "balance_loss_mlp": 1.03883505, + "epoch": 0.511829249962423, + "flos": 23915357827200.0, + "grad_norm": 2.433145093070313, + "language_loss": 0.66578728, + "learning_rate": 2.0204463821062146e-06, + "loss": 0.6872499, + "num_input_tokens_seen": 183063935, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 8513, + "time_per_iteration": 3.935227870941162 + }, + { + "auxiliary_loss_clip": 0.01106032, + "auxiliary_loss_mlp": 0.01033163, + "balance_loss_clip": 1.02099788, + "balance_loss_mlp": 1.03927946, + "epoch": 0.511889373215091, + "flos": 23727005884800.0, + "grad_norm": 2.0509279474405115, + "language_loss": 0.69136906, + "learning_rate": 2.0200569403921e-06, + "loss": 0.71276104, + "num_input_tokens_seen": 183084135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 8514, + "time_per_iteration": 2.5390119552612305 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01026976, + "balance_loss_clip": 1.01599109, + "balance_loss_mlp": 1.03685427, + "epoch": 0.5119494964677589, + "flos": 28111519526400.0, + "grad_norm": 1.6362442678403473, + "language_loss": 0.66014814, + "learning_rate": 2.019667497917424e-06, + "loss": 0.68144739, + "num_input_tokens_seen": 183104570, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 8515, + "time_per_iteration": 2.492664098739624 + }, + { + "auxiliary_loss_clip": 0.01103893, + "auxiliary_loss_mlp": 0.01031754, + "balance_loss_clip": 1.02031612, + "balance_loss_mlp": 1.03691602, + "epoch": 0.5120096197204269, + "flos": 24973214296320.0, + "grad_norm": 2.89314496105325, + "language_loss": 0.74966168, + "learning_rate": 2.019278054696955e-06, + "loss": 0.77101815, + "num_input_tokens_seen": 183123850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 8516, + "time_per_iteration": 2.5428519248962402 + }, + { + "auxiliary_loss_clip": 0.01111253, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02181602, + "balance_loss_mlp": 1.04198885, + "epoch": 0.5120697429730948, + "flos": 17968012364160.0, + "grad_norm": 1.7790403014833382, + "language_loss": 0.77862155, + "learning_rate": 2.0188886107454595e-06, + "loss": 0.80007005, + "num_input_tokens_seen": 183141725, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8517, + "time_per_iteration": 2.4259724617004395 + }, + { + "auxiliary_loss_clip": 0.01110887, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01897407, + "balance_loss_mlp": 1.03983212, + "epoch": 0.5121298662257628, + "flos": 23292343405440.0, + "grad_norm": 1.7905284866787141, + "language_loss": 0.73672384, + "learning_rate": 2.0184991660777063e-06, + "loss": 0.75814688, + "num_input_tokens_seen": 183161300, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8518, + "time_per_iteration": 2.5707037448883057 + }, + { + "auxiliary_loss_clip": 0.01107458, + "auxiliary_loss_mlp": 0.01037485, + "balance_loss_clip": 1.02557039, + "balance_loss_mlp": 1.03892565, + "epoch": 0.5121899894784308, + "flos": 17311062568320.0, + "grad_norm": 1.6752140453085944, + "language_loss": 0.78055197, + "learning_rate": 2.0181097207084625e-06, + "loss": 0.80200136, + "num_input_tokens_seen": 183180495, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8519, + "time_per_iteration": 2.417372226715088 + }, + { + "auxiliary_loss_clip": 0.01109296, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02049518, + "balance_loss_mlp": 1.04082775, + "epoch": 0.5122501127310988, + "flos": 24930085040640.0, + "grad_norm": 1.573776111474748, + "language_loss": 0.79204106, + "learning_rate": 2.017720274652497e-06, + "loss": 0.8134582, + "num_input_tokens_seen": 183200330, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 8520, + "time_per_iteration": 2.63323712348938 + }, + { + "auxiliary_loss_clip": 0.01112541, + "auxiliary_loss_mlp": 0.01039702, + "balance_loss_clip": 1.02623105, + "balance_loss_mlp": 1.03924751, + "epoch": 0.5123102359837667, + "flos": 18442859184000.0, + "grad_norm": 1.6319482307550086, + "language_loss": 0.81403995, + "learning_rate": 2.0173308279245765e-06, + "loss": 0.83556241, + "num_input_tokens_seen": 183218230, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8521, + "time_per_iteration": 2.4723713397979736 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.01029547, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03599286, + "epoch": 0.5123703592364347, + "flos": 26684860164480.0, + "grad_norm": 1.90297827684807, + "language_loss": 0.68368387, + "learning_rate": 2.0169413805394692e-06, + "loss": 0.70504206, + "num_input_tokens_seen": 183236735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8522, + "time_per_iteration": 2.516411066055298 + }, + { + "auxiliary_loss_clip": 0.01115928, + "auxiliary_loss_mlp": 0.01041085, + "balance_loss_clip": 1.02506292, + "balance_loss_mlp": 1.04201221, + "epoch": 0.5124304824891026, + "flos": 28803948981120.0, + "grad_norm": 2.718510344621862, + "language_loss": 0.6155864, + "learning_rate": 2.0165519325119433e-06, + "loss": 0.63715655, + "num_input_tokens_seen": 183257550, + "router_z_loss_clip": 0.16015625, + "router_z_loss_mlp": 0.73828125, + "step": 8523, + "time_per_iteration": 2.524775266647339 + }, + { + "auxiliary_loss_clip": 0.01110788, + "auxiliary_loss_mlp": 0.010355, + "balance_loss_clip": 1.0238173, + "balance_loss_mlp": 1.04113579, + "epoch": 0.5124906057417706, + "flos": 21761830846080.0, + "grad_norm": 2.0609816781673884, + "language_loss": 0.78066456, + "learning_rate": 2.0161624838567656e-06, + "loss": 0.80212736, + "num_input_tokens_seen": 183275515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 8524, + "time_per_iteration": 2.526226043701172 + }, + { + "auxiliary_loss_clip": 0.01109029, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02350545, + "balance_loss_mlp": 1.0413003, + "epoch": 0.5125507289944387, + "flos": 18880538405760.0, + "grad_norm": 1.8496964430325211, + "language_loss": 0.75055063, + "learning_rate": 2.015773034588706e-06, + "loss": 0.77199042, + "num_input_tokens_seen": 183293880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 8525, + "time_per_iteration": 2.432555913925171 + }, + { + "auxiliary_loss_clip": 0.01112941, + "auxiliary_loss_mlp": 0.01036722, + "balance_loss_clip": 1.02385902, + "balance_loss_mlp": 1.04111516, + "epoch": 0.5126108522471066, + "flos": 35627838036480.0, + "grad_norm": 1.559913373859493, + "language_loss": 0.74452645, + "learning_rate": 2.015383584722531e-06, + "loss": 0.76602304, + "num_input_tokens_seen": 183315860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 8526, + "time_per_iteration": 2.6282670497894287 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01040593, + "balance_loss_clip": 1.02799845, + "balance_loss_mlp": 1.04028583, + "epoch": 0.5126709754997746, + "flos": 20190918464640.0, + "grad_norm": 1.490779495017149, + "language_loss": 0.65322489, + "learning_rate": 2.0149941342730088e-06, + "loss": 0.67473614, + "num_input_tokens_seen": 183335480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8527, + "time_per_iteration": 2.467350482940674 + }, + { + "auxiliary_loss_clip": 0.01108518, + "auxiliary_loss_mlp": 0.01039646, + "balance_loss_clip": 1.02852428, + "balance_loss_mlp": 1.04277444, + "epoch": 0.5127310987524425, + "flos": 18588548747520.0, + "grad_norm": 1.5603597457219889, + "language_loss": 0.74514449, + "learning_rate": 2.014604683254908e-06, + "loss": 0.76662612, + "num_input_tokens_seen": 183354395, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 8528, + "time_per_iteration": 2.513795852661133 + }, + { + "auxiliary_loss_clip": 0.01105318, + "auxiliary_loss_mlp": 0.01034313, + "balance_loss_clip": 1.02236843, + "balance_loss_mlp": 1.03608227, + "epoch": 0.5127912220051105, + "flos": 22454691264000.0, + "grad_norm": 1.756255656529514, + "language_loss": 0.83061087, + "learning_rate": 2.014215231682995e-06, + "loss": 0.85200721, + "num_input_tokens_seen": 183372980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 8529, + "time_per_iteration": 2.4574379920959473 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01032856, + "balance_loss_clip": 1.02045822, + "balance_loss_mlp": 1.03895748, + "epoch": 0.5128513452577784, + "flos": 19093703667840.0, + "grad_norm": 1.6787234743344808, + "language_loss": 0.73559862, + "learning_rate": 2.01382577957204e-06, + "loss": 0.75699604, + "num_input_tokens_seen": 183390160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8530, + "time_per_iteration": 2.4669532775878906 + }, + { + "auxiliary_loss_clip": 0.01039899, + "auxiliary_loss_mlp": 0.0100398, + "balance_loss_clip": 1.00278807, + "balance_loss_mlp": 1.01703906, + "epoch": 0.5129114685104464, + "flos": 67892285243520.0, + "grad_norm": 0.7465649329198393, + "language_loss": 0.60806251, + "learning_rate": 2.0134363269368095e-06, + "loss": 0.6285013, + "num_input_tokens_seen": 183455280, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.22851562, + "step": 8531, + "time_per_iteration": 3.1615967750549316 + }, + { + "auxiliary_loss_clip": 0.01110166, + "auxiliary_loss_mlp": 0.01029475, + "balance_loss_clip": 1.01732779, + "balance_loss_mlp": 1.04014051, + "epoch": 0.5129715917631144, + "flos": 20449152316800.0, + "grad_norm": 1.6561974446519532, + "language_loss": 0.76540768, + "learning_rate": 2.0130468737920725e-06, + "loss": 0.78680408, + "num_input_tokens_seen": 183473955, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 8532, + "time_per_iteration": 2.4836883544921875 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01031606, + "balance_loss_clip": 1.01894033, + "balance_loss_mlp": 1.03866601, + "epoch": 0.5130317150157824, + "flos": 35116146840960.0, + "grad_norm": 2.847315245703251, + "language_loss": 0.67183244, + "learning_rate": 2.012657420152597e-06, + "loss": 0.6932264, + "num_input_tokens_seen": 183497195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8533, + "time_per_iteration": 2.6025052070617676 + }, + { + "auxiliary_loss_clip": 0.01112515, + "auxiliary_loss_mlp": 0.01036644, + "balance_loss_clip": 1.02333999, + "balance_loss_mlp": 1.04080868, + "epoch": 0.5130918382684503, + "flos": 19791627903360.0, + "grad_norm": 1.8363553974693196, + "language_loss": 0.81724054, + "learning_rate": 2.01226796603315e-06, + "loss": 0.83873212, + "num_input_tokens_seen": 183513675, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8534, + "time_per_iteration": 2.465374231338501 + }, + { + "auxiliary_loss_clip": 0.01111356, + "auxiliary_loss_mlp": 0.01035387, + "balance_loss_clip": 1.02167177, + "balance_loss_mlp": 1.0399549, + "epoch": 0.5131519615211183, + "flos": 26323096337280.0, + "grad_norm": 1.5787063577136407, + "language_loss": 0.63588178, + "learning_rate": 2.0118785114485017e-06, + "loss": 0.65734923, + "num_input_tokens_seen": 183535165, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 8535, + "time_per_iteration": 2.50287127494812 + }, + { + "auxiliary_loss_clip": 0.01111823, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01434922, + "balance_loss_mlp": 1.04166365, + "epoch": 0.5132120847737862, + "flos": 19171917532800.0, + "grad_norm": 1.5428442042942097, + "language_loss": 0.69746888, + "learning_rate": 2.011489056413418e-06, + "loss": 0.71885574, + "num_input_tokens_seen": 183553780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8536, + "time_per_iteration": 2.459897041320801 + }, + { + "auxiliary_loss_clip": 0.01113816, + "auxiliary_loss_mlp": 0.01033547, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.04082823, + "epoch": 0.5132722080264542, + "flos": 20230420446720.0, + "grad_norm": 2.3299626101952784, + "language_loss": 0.71215963, + "learning_rate": 2.011099600942669e-06, + "loss": 0.73363328, + "num_input_tokens_seen": 183572285, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8537, + "time_per_iteration": 2.4840991497039795 + }, + { + "auxiliary_loss_clip": 0.01111456, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.02013016, + "balance_loss_mlp": 1.03927016, + "epoch": 0.5133323312791223, + "flos": 16469459930880.0, + "grad_norm": 6.302946358508802, + "language_loss": 0.80441952, + "learning_rate": 2.0107101450510214e-06, + "loss": 0.82586539, + "num_input_tokens_seen": 183589330, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8538, + "time_per_iteration": 2.4378812313079834 + }, + { + "auxiliary_loss_clip": 0.01107763, + "auxiliary_loss_mlp": 0.01031809, + "balance_loss_clip": 1.01880276, + "balance_loss_mlp": 1.03764546, + "epoch": 0.5133924545317902, + "flos": 26068094709120.0, + "grad_norm": 1.8808034234185624, + "language_loss": 0.78517324, + "learning_rate": 2.0103206887532437e-06, + "loss": 0.80656898, + "num_input_tokens_seen": 183609205, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8539, + "time_per_iteration": 2.5144600868225098 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02025044, + "balance_loss_mlp": 1.04009342, + "epoch": 0.5134525777844582, + "flos": 29131023248640.0, + "grad_norm": 1.5130664168284647, + "language_loss": 0.75880563, + "learning_rate": 2.009931232064105e-06, + "loss": 0.78025699, + "num_input_tokens_seen": 183629985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8540, + "time_per_iteration": 2.55734920501709 + }, + { + "auxiliary_loss_clip": 0.0111462, + "auxiliary_loss_mlp": 0.01033022, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.04176068, + "epoch": 0.5135127010371261, + "flos": 17454776883840.0, + "grad_norm": 2.8219986700547555, + "language_loss": 0.74552548, + "learning_rate": 2.0095417749983724e-06, + "loss": 0.76700193, + "num_input_tokens_seen": 183648220, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73046875, + "step": 8541, + "time_per_iteration": 2.432055711746216 + }, + { + "auxiliary_loss_clip": 0.01110326, + "auxiliary_loss_mlp": 0.01033335, + "balance_loss_clip": 1.02005482, + "balance_loss_mlp": 1.03941679, + "epoch": 0.5135728242897941, + "flos": 21944975316480.0, + "grad_norm": 1.945278300015613, + "language_loss": 0.70215029, + "learning_rate": 2.0091523175708162e-06, + "loss": 0.72358692, + "num_input_tokens_seen": 183668230, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8542, + "time_per_iteration": 2.5227723121643066 + }, + { + "auxiliary_loss_clip": 0.01112639, + "auxiliary_loss_mlp": 0.01026897, + "balance_loss_clip": 1.01403403, + "balance_loss_mlp": 1.04146171, + "epoch": 0.513632947542462, + "flos": 22674859678080.0, + "grad_norm": 1.83289507202946, + "language_loss": 0.78898811, + "learning_rate": 2.0087628597962023e-06, + "loss": 0.8103835, + "num_input_tokens_seen": 183687800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8543, + "time_per_iteration": 2.4559075832366943 + }, + { + "auxiliary_loss_clip": 0.0111214, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02426672, + "balance_loss_mlp": 1.04161441, + "epoch": 0.51369307079513, + "flos": 29457163762560.0, + "grad_norm": 1.9171309591761885, + "language_loss": 0.68051696, + "learning_rate": 2.008373401689299e-06, + "loss": 0.70201409, + "num_input_tokens_seen": 183709025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8544, + "time_per_iteration": 2.5344274044036865 + }, + { + "auxiliary_loss_clip": 0.01113551, + "auxiliary_loss_mlp": 0.01039409, + "balance_loss_clip": 1.02671301, + "balance_loss_mlp": 1.04096842, + "epoch": 0.513753194047798, + "flos": 18989347680000.0, + "grad_norm": 2.2205990317105395, + "language_loss": 0.7225253, + "learning_rate": 2.0079839432648765e-06, + "loss": 0.74405491, + "num_input_tokens_seen": 183725740, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8545, + "time_per_iteration": 2.4303176403045654 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01038034, + "balance_loss_clip": 1.02431881, + "balance_loss_mlp": 1.03957486, + "epoch": 0.513813317300466, + "flos": 17821855923840.0, + "grad_norm": 1.967971348268394, + "language_loss": 0.81898367, + "learning_rate": 2.0075944845377016e-06, + "loss": 0.84048629, + "num_input_tokens_seen": 183743995, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8546, + "time_per_iteration": 2.4504597187042236 + }, + { + "auxiliary_loss_clip": 0.01111418, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02099776, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5138734405531339, + "flos": 24061191045120.0, + "grad_norm": 1.6545588723955058, + "language_loss": 0.73301136, + "learning_rate": 2.007205025522544e-06, + "loss": 0.75446492, + "num_input_tokens_seen": 183764150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8547, + "time_per_iteration": 2.4682819843292236 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01043301, + "balance_loss_clip": 1.03010488, + "balance_loss_mlp": 1.03783822, + "epoch": 0.5139335638058019, + "flos": 26097253574400.0, + "grad_norm": 1.620202866362127, + "language_loss": 0.73577881, + "learning_rate": 2.0068155662341702e-06, + "loss": 0.75729811, + "num_input_tokens_seen": 183783280, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 8548, + "time_per_iteration": 2.511922597885132 + }, + { + "auxiliary_loss_clip": 0.01110019, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02117133, + "balance_loss_mlp": 1.03852081, + "epoch": 0.5139936870584698, + "flos": 18917095472640.0, + "grad_norm": 1.506476906057379, + "language_loss": 0.82239324, + "learning_rate": 2.0064261066873495e-06, + "loss": 0.84383494, + "num_input_tokens_seen": 183800725, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8549, + "time_per_iteration": 2.433605194091797 + }, + { + "auxiliary_loss_clip": 0.01110043, + "auxiliary_loss_mlp": 0.01035127, + "balance_loss_clip": 1.02292621, + "balance_loss_mlp": 1.04096317, + "epoch": 0.5140538103111378, + "flos": 16144001775360.0, + "grad_norm": 1.8131541317091766, + "language_loss": 0.72331119, + "learning_rate": 2.0060366468968504e-06, + "loss": 0.7447629, + "num_input_tokens_seen": 183818735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 8550, + "time_per_iteration": 2.4659972190856934 + }, + { + "auxiliary_loss_clip": 0.01114255, + "auxiliary_loss_mlp": 0.01034858, + "balance_loss_clip": 1.02173352, + "balance_loss_mlp": 1.0404501, + "epoch": 0.5141139335638057, + "flos": 22420145358720.0, + "grad_norm": 1.6035097357113468, + "language_loss": 0.75497758, + "learning_rate": 2.0056471868774408e-06, + "loss": 0.77646863, + "num_input_tokens_seen": 183840015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 8551, + "time_per_iteration": 2.453734874725342 + }, + { + "auxiliary_loss_clip": 0.01108366, + "auxiliary_loss_mlp": 0.01030625, + "balance_loss_clip": 1.01805425, + "balance_loss_mlp": 1.04017091, + "epoch": 0.5141740568164738, + "flos": 27089645506560.0, + "grad_norm": 1.6015349884444547, + "language_loss": 0.69001007, + "learning_rate": 2.0052577266438897e-06, + "loss": 0.71140003, + "num_input_tokens_seen": 183860145, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8552, + "time_per_iteration": 3.9047505855560303 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01927972, + "balance_loss_mlp": 1.03868091, + "epoch": 0.5142341800691418, + "flos": 24973250209920.0, + "grad_norm": 1.7916575293353634, + "language_loss": 0.74736363, + "learning_rate": 2.004868266210965e-06, + "loss": 0.76878798, + "num_input_tokens_seen": 183880540, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 8553, + "time_per_iteration": 2.5039455890655518 + }, + { + "auxiliary_loss_clip": 0.01109768, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.0397613, + "epoch": 0.5142943033218097, + "flos": 20704513080960.0, + "grad_norm": 1.707634664835445, + "language_loss": 0.68126231, + "learning_rate": 2.004478805593435e-06, + "loss": 0.70271206, + "num_input_tokens_seen": 183900895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8554, + "time_per_iteration": 5.488779544830322 + }, + { + "auxiliary_loss_clip": 0.01112685, + "auxiliary_loss_mlp": 0.01036304, + "balance_loss_clip": 1.02173042, + "balance_loss_mlp": 1.03879559, + "epoch": 0.5143544265744777, + "flos": 22925479847040.0, + "grad_norm": 2.3217393931515846, + "language_loss": 0.73303884, + "learning_rate": 2.004089344806068e-06, + "loss": 0.75452876, + "num_input_tokens_seen": 183920335, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.73828125, + "step": 8555, + "time_per_iteration": 3.866107940673828 + }, + { + "auxiliary_loss_clip": 0.01111396, + "auxiliary_loss_mlp": 0.01035591, + "balance_loss_clip": 1.02278817, + "balance_loss_mlp": 1.04023397, + "epoch": 0.5144145498271456, + "flos": 15921391236480.0, + "grad_norm": 2.3509367679077124, + "language_loss": 0.74724478, + "learning_rate": 2.003699883863633e-06, + "loss": 0.76871467, + "num_input_tokens_seen": 183936220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 8556, + "time_per_iteration": 2.423941135406494 + }, + { + "auxiliary_loss_clip": 0.01105419, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02135563, + "balance_loss_mlp": 1.03695798, + "epoch": 0.5144746730798136, + "flos": 19681238430720.0, + "grad_norm": 1.7510489074761373, + "language_loss": 0.86147487, + "learning_rate": 2.003310422780898e-06, + "loss": 0.88286483, + "num_input_tokens_seen": 183953250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8557, + "time_per_iteration": 2.4232289791107178 + }, + { + "auxiliary_loss_clip": 0.01105513, + "auxiliary_loss_mlp": 0.01033194, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03741109, + "epoch": 0.5145347963324816, + "flos": 23914711382400.0, + "grad_norm": 1.4648111070630687, + "language_loss": 0.89026904, + "learning_rate": 2.0029209615726307e-06, + "loss": 0.91165608, + "num_input_tokens_seen": 183973865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 8558, + "time_per_iteration": 2.4937002658843994 + }, + { + "auxiliary_loss_clip": 0.01106843, + "auxiliary_loss_mlp": 0.01032131, + "balance_loss_clip": 1.01952434, + "balance_loss_mlp": 1.03844643, + "epoch": 0.5145949195851496, + "flos": 18260002022400.0, + "grad_norm": 1.959206520418211, + "language_loss": 0.65027267, + "learning_rate": 2.002531500253602e-06, + "loss": 0.67166239, + "num_input_tokens_seen": 183992555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8559, + "time_per_iteration": 2.4625425338745117 + }, + { + "auxiliary_loss_clip": 0.01109462, + "auxiliary_loss_mlp": 0.01035258, + "balance_loss_clip": 1.02255082, + "balance_loss_mlp": 1.04041696, + "epoch": 0.5146550428378175, + "flos": 26213425136640.0, + "grad_norm": 1.5416961138531182, + "language_loss": 0.62973124, + "learning_rate": 2.002142038838577e-06, + "loss": 0.65117842, + "num_input_tokens_seen": 184010825, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8560, + "time_per_iteration": 2.509413719177246 + }, + { + "auxiliary_loss_clip": 0.01107571, + "auxiliary_loss_mlp": 0.01030305, + "balance_loss_clip": 1.01798463, + "balance_loss_mlp": 1.03850913, + "epoch": 0.5147151660904855, + "flos": 22674177319680.0, + "grad_norm": 1.5387222778191898, + "language_loss": 0.69879884, + "learning_rate": 2.0017525773423265e-06, + "loss": 0.72017759, + "num_input_tokens_seen": 184030155, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 8561, + "time_per_iteration": 2.4802825450897217 + }, + { + "auxiliary_loss_clip": 0.01108154, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01894569, + "balance_loss_mlp": 1.03752971, + "epoch": 0.5147752893431534, + "flos": 24972388283520.0, + "grad_norm": 1.5731273846161422, + "language_loss": 0.66646934, + "learning_rate": 2.0013631157796177e-06, + "loss": 0.68785918, + "num_input_tokens_seen": 184051440, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.70703125, + "step": 8562, + "time_per_iteration": 2.505180835723877 + }, + { + "auxiliary_loss_clip": 0.01110444, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.01824713, + "balance_loss_mlp": 1.03924227, + "epoch": 0.5148354125958214, + "flos": 22744669760640.0, + "grad_norm": 1.6680045222139546, + "language_loss": 0.77707577, + "learning_rate": 2.0009736541652188e-06, + "loss": 0.79848886, + "num_input_tokens_seen": 184070205, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8563, + "time_per_iteration": 2.4935452938079834 + }, + { + "auxiliary_loss_clip": 0.01112567, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.01932585, + "balance_loss_mlp": 1.03827047, + "epoch": 0.5148955358484893, + "flos": 23068763199360.0, + "grad_norm": 2.1629374301288284, + "language_loss": 0.82324845, + "learning_rate": 2.0005841925139e-06, + "loss": 0.84471083, + "num_input_tokens_seen": 184087345, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.7421875, + "step": 8564, + "time_per_iteration": 2.4590399265289307 + }, + { + "auxiliary_loss_clip": 0.01112048, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.0223794, + "balance_loss_mlp": 1.03859615, + "epoch": 0.5149556591011574, + "flos": 20340127560960.0, + "grad_norm": 1.7207643570499924, + "language_loss": 0.73255235, + "learning_rate": 2.0001947308404283e-06, + "loss": 0.75402838, + "num_input_tokens_seen": 184107110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8565, + "time_per_iteration": 2.471970558166504 + }, + { + "auxiliary_loss_clip": 0.01113674, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.01884174, + "balance_loss_mlp": 1.03977931, + "epoch": 0.5150157823538254, + "flos": 22638230784000.0, + "grad_norm": 1.8782058792026062, + "language_loss": 0.683079, + "learning_rate": 1.9998052691595715e-06, + "loss": 0.70455092, + "num_input_tokens_seen": 184127105, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73828125, + "step": 8566, + "time_per_iteration": 2.4981720447540283 + }, + { + "auxiliary_loss_clip": 0.01109217, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03583431, + "epoch": 0.5150759056064933, + "flos": 26067627832320.0, + "grad_norm": 2.0482874573832177, + "language_loss": 0.78111541, + "learning_rate": 1.9994158074861005e-06, + "loss": 0.80249971, + "num_input_tokens_seen": 184148060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 8567, + "time_per_iteration": 2.490272045135498 + }, + { + "auxiliary_loss_clip": 0.01113521, + "auxiliary_loss_mlp": 0.01033959, + "balance_loss_clip": 1.02054214, + "balance_loss_mlp": 1.04046249, + "epoch": 0.5151360288591613, + "flos": 25952641418880.0, + "grad_norm": 2.0737995601061274, + "language_loss": 0.790721, + "learning_rate": 1.9990263458347806e-06, + "loss": 0.81219578, + "num_input_tokens_seen": 184166175, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 8568, + "time_per_iteration": 2.602315902709961 + }, + { + "auxiliary_loss_clip": 0.01106374, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01885664, + "balance_loss_mlp": 1.03637588, + "epoch": 0.5151961521118292, + "flos": 18507246312960.0, + "grad_norm": 2.0499636702484945, + "language_loss": 0.90935498, + "learning_rate": 1.9986368842203825e-06, + "loss": 0.93073106, + "num_input_tokens_seen": 184182600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 8569, + "time_per_iteration": 2.430600643157959 + }, + { + "auxiliary_loss_clip": 0.01110259, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01677024, + "balance_loss_mlp": 1.03865302, + "epoch": 0.5152562753644973, + "flos": 22233696837120.0, + "grad_norm": 1.6639049645433037, + "language_loss": 0.76229095, + "learning_rate": 1.998247422657674e-06, + "loss": 0.78369409, + "num_input_tokens_seen": 184202020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8570, + "time_per_iteration": 2.48988676071167 + }, + { + "auxiliary_loss_clip": 0.01108277, + "auxiliary_loss_mlp": 0.01037495, + "balance_loss_clip": 1.02357769, + "balance_loss_mlp": 1.03741157, + "epoch": 0.5153163986171652, + "flos": 38436555047040.0, + "grad_norm": 1.5896565556148876, + "language_loss": 0.7375021, + "learning_rate": 1.9978579611614227e-06, + "loss": 0.75895989, + "num_input_tokens_seen": 184224850, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7109375, + "step": 8571, + "time_per_iteration": 2.618849754333496 + }, + { + "auxiliary_loss_clip": 0.01035305, + "auxiliary_loss_mlp": 0.00998776, + "balance_loss_clip": 0.99780464, + "balance_loss_mlp": 1.0127461, + "epoch": 0.5153765218698332, + "flos": 66384503015040.0, + "grad_norm": 0.7780004501915253, + "language_loss": 0.52940249, + "learning_rate": 1.9974684997463984e-06, + "loss": 0.54974329, + "num_input_tokens_seen": 184288520, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.22558594, + "step": 8572, + "time_per_iteration": 3.1418654918670654 + }, + { + "auxiliary_loss_clip": 0.01108043, + "auxiliary_loss_mlp": 0.01032788, + "balance_loss_clip": 1.02087331, + "balance_loss_mlp": 1.04004169, + "epoch": 0.5154366451225011, + "flos": 24024669891840.0, + "grad_norm": 1.7275406058075027, + "language_loss": 0.76217729, + "learning_rate": 1.9970790384273687e-06, + "loss": 0.78358561, + "num_input_tokens_seen": 184308565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8573, + "time_per_iteration": 2.4757239818573 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01627111, + "balance_loss_mlp": 1.03679562, + "epoch": 0.5154967683751691, + "flos": 23468843859840.0, + "grad_norm": 1.9279490614808483, + "language_loss": 0.77039665, + "learning_rate": 1.996689577219102e-06, + "loss": 0.79174697, + "num_input_tokens_seen": 184326795, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8574, + "time_per_iteration": 2.478935718536377 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01029105, + "balance_loss_clip": 1.01714277, + "balance_loss_mlp": 1.03757906, + "epoch": 0.515556891627837, + "flos": 23805650712960.0, + "grad_norm": 1.6824577114627284, + "language_loss": 0.85421538, + "learning_rate": 1.996300116136367e-06, + "loss": 0.87558043, + "num_input_tokens_seen": 184345990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 8575, + "time_per_iteration": 2.4811151027679443 + }, + { + "auxiliary_loss_clip": 0.01108061, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.01971185, + "balance_loss_mlp": 1.03703451, + "epoch": 0.515617014880505, + "flos": 19828544106240.0, + "grad_norm": 1.6692718685381052, + "language_loss": 0.76704675, + "learning_rate": 1.995910655193932e-06, + "loss": 0.78844833, + "num_input_tokens_seen": 184366300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8576, + "time_per_iteration": 2.490389108657837 + }, + { + "auxiliary_loss_clip": 0.011134, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01836872, + "balance_loss_mlp": 1.03960061, + "epoch": 0.515677138133173, + "flos": 14245907385600.0, + "grad_norm": 3.052053268886893, + "language_loss": 0.75463682, + "learning_rate": 1.9955211944065654e-06, + "loss": 0.77608645, + "num_input_tokens_seen": 184383030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 8577, + "time_per_iteration": 2.416757583618164 + }, + { + "auxiliary_loss_clip": 0.0111005, + "auxiliary_loss_mlp": 0.01037518, + "balance_loss_clip": 1.02441728, + "balance_loss_mlp": 1.0376997, + "epoch": 0.515737261385841, + "flos": 28289707920000.0, + "grad_norm": 1.834882992604573, + "language_loss": 0.80803275, + "learning_rate": 1.9951317337890353e-06, + "loss": 0.82950842, + "num_input_tokens_seen": 184403410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8578, + "time_per_iteration": 2.517292022705078 + }, + { + "auxiliary_loss_clip": 0.01104508, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02046442, + "balance_loss_mlp": 1.0357188, + "epoch": 0.515797384638509, + "flos": 27891925729920.0, + "grad_norm": 1.7011032882300805, + "language_loss": 0.76299787, + "learning_rate": 1.9947422733561105e-06, + "loss": 0.78436846, + "num_input_tokens_seen": 184423830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8579, + "time_per_iteration": 2.4907805919647217 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01031503, + "balance_loss_clip": 1.01890254, + "balance_loss_mlp": 1.03864014, + "epoch": 0.5158575078911769, + "flos": 23040071210880.0, + "grad_norm": 1.5884760036798964, + "language_loss": 0.79018867, + "learning_rate": 1.994352813122559e-06, + "loss": 0.81159854, + "num_input_tokens_seen": 184445050, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8580, + "time_per_iteration": 2.490298271179199 + }, + { + "auxiliary_loss_clip": 0.01111804, + "auxiliary_loss_mlp": 0.01036819, + "balance_loss_clip": 1.0237354, + "balance_loss_mlp": 1.03874159, + "epoch": 0.5159176311438449, + "flos": 12641346938880.0, + "grad_norm": 2.2420547036898277, + "language_loss": 0.72657341, + "learning_rate": 1.99396335310315e-06, + "loss": 0.74805963, + "num_input_tokens_seen": 184460775, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 8581, + "time_per_iteration": 2.419196367263794 + }, + { + "auxiliary_loss_clip": 0.01107618, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01844954, + "balance_loss_mlp": 1.03848028, + "epoch": 0.5159777543965128, + "flos": 15558154951680.0, + "grad_norm": 2.260602789840083, + "language_loss": 0.74468267, + "learning_rate": 1.9935738933126508e-06, + "loss": 0.76606196, + "num_input_tokens_seen": 184477365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8582, + "time_per_iteration": 2.4235429763793945 + }, + { + "auxiliary_loss_clip": 0.01107491, + "auxiliary_loss_mlp": 0.01033441, + "balance_loss_clip": 1.02201486, + "balance_loss_mlp": 1.03820109, + "epoch": 0.5160378776491809, + "flos": 23221671396480.0, + "grad_norm": 3.661326019284234, + "language_loss": 0.66308093, + "learning_rate": 1.99318443376583e-06, + "loss": 0.68449032, + "num_input_tokens_seen": 184497045, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 8583, + "time_per_iteration": 2.483489990234375 + }, + { + "auxiliary_loss_clip": 0.0111088, + "auxiliary_loss_mlp": 0.01036135, + "balance_loss_clip": 1.02315259, + "balance_loss_mlp": 1.04015112, + "epoch": 0.5160980009018488, + "flos": 21944616180480.0, + "grad_norm": 1.4772972874821377, + "language_loss": 0.75878769, + "learning_rate": 1.9927949744774568e-06, + "loss": 0.78025782, + "num_input_tokens_seen": 184517675, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8584, + "time_per_iteration": 2.469770908355713 + }, + { + "auxiliary_loss_clip": 0.01109729, + "auxiliary_loss_mlp": 0.01038496, + "balance_loss_clip": 1.026057, + "balance_loss_mlp": 1.03763115, + "epoch": 0.5161581241545168, + "flos": 22784064001920.0, + "grad_norm": 1.908038470800245, + "language_loss": 0.78773153, + "learning_rate": 1.9924055154622983e-06, + "loss": 0.80921382, + "num_input_tokens_seen": 184537745, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 8585, + "time_per_iteration": 2.4765405654907227 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01976502, + "balance_loss_mlp": 1.03624129, + "epoch": 0.5162182474071847, + "flos": 19675384513920.0, + "grad_norm": 2.394419079152278, + "language_loss": 0.81022364, + "learning_rate": 1.9920160567351238e-06, + "loss": 0.83157325, + "num_input_tokens_seen": 184553630, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 8586, + "time_per_iteration": 2.45131254196167 + }, + { + "auxiliary_loss_clip": 0.01107797, + "auxiliary_loss_mlp": 0.01033426, + "balance_loss_clip": 1.02106369, + "balance_loss_mlp": 1.03754663, + "epoch": 0.5162783706598527, + "flos": 20046198568320.0, + "grad_norm": 2.0375667228771572, + "language_loss": 0.71716821, + "learning_rate": 1.991626598310701e-06, + "loss": 0.73858047, + "num_input_tokens_seen": 184573530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.703125, + "step": 8587, + "time_per_iteration": 2.464603900909424 + }, + { + "auxiliary_loss_clip": 0.0103385, + "auxiliary_loss_mlp": 0.01011507, + "balance_loss_clip": 1.01052976, + "balance_loss_mlp": 1.01128352, + "epoch": 0.5163384939125206, + "flos": 69959553713280.0, + "grad_norm": 0.7317367951541988, + "language_loss": 0.57798368, + "learning_rate": 1.9912371402037984e-06, + "loss": 0.59843719, + "num_input_tokens_seen": 184637875, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.22558594, + "step": 8588, + "time_per_iteration": 3.0708353519439697 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01038792, + "balance_loss_clip": 1.02560759, + "balance_loss_mlp": 1.03631115, + "epoch": 0.5163986171651886, + "flos": 17417034668160.0, + "grad_norm": 1.9433685436573729, + "language_loss": 0.7553345, + "learning_rate": 1.990847682429185e-06, + "loss": 0.77678907, + "num_input_tokens_seen": 184656125, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8589, + "time_per_iteration": 2.4392945766448975 + }, + { + "auxiliary_loss_clip": 0.0110855, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02110088, + "balance_loss_mlp": 1.03822279, + "epoch": 0.5164587404178566, + "flos": 21322679166720.0, + "grad_norm": 2.018268520776434, + "language_loss": 0.67597556, + "learning_rate": 1.990458225001627e-06, + "loss": 0.69738752, + "num_input_tokens_seen": 184675920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.703125, + "step": 8590, + "time_per_iteration": 2.480978012084961 + }, + { + "auxiliary_loss_clip": 0.01034536, + "auxiliary_loss_mlp": 0.01003309, + "balance_loss_clip": 1.00217628, + "balance_loss_mlp": 1.01181984, + "epoch": 0.5165188636705246, + "flos": 68057149691520.0, + "grad_norm": 0.7844517010344912, + "language_loss": 0.5593977, + "learning_rate": 1.990068767935895e-06, + "loss": 0.57977605, + "num_input_tokens_seen": 184730520, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.2265625, + "step": 8591, + "time_per_iteration": 3.0380799770355225 + }, + { + "auxiliary_loss_clip": 0.01101472, + "auxiliary_loss_mlp": 0.01023222, + "balance_loss_clip": 1.01192665, + "balance_loss_mlp": 1.03659964, + "epoch": 0.5165789869231926, + "flos": 19385657412480.0, + "grad_norm": 1.5513724058155185, + "language_loss": 0.81425416, + "learning_rate": 1.9896793112467566e-06, + "loss": 0.83550113, + "num_input_tokens_seen": 184748340, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 8592, + "time_per_iteration": 2.4280107021331787 + }, + { + "auxiliary_loss_clip": 0.0110705, + "auxiliary_loss_mlp": 0.01023209, + "balance_loss_clip": 1.01141334, + "balance_loss_mlp": 1.04046106, + "epoch": 0.5166391101758605, + "flos": 20960197067520.0, + "grad_norm": 1.8100942034895195, + "language_loss": 0.83394146, + "learning_rate": 1.989289854948979e-06, + "loss": 0.85524404, + "num_input_tokens_seen": 184766615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 8593, + "time_per_iteration": 3.9351704120635986 + }, + { + "auxiliary_loss_clip": 0.01109969, + "auxiliary_loss_mlp": 0.01031901, + "balance_loss_clip": 1.02004552, + "balance_loss_mlp": 1.04028952, + "epoch": 0.5166992334285285, + "flos": 29462407148160.0, + "grad_norm": 1.576203753972958, + "language_loss": 0.68724298, + "learning_rate": 1.9889003990573314e-06, + "loss": 0.70866162, + "num_input_tokens_seen": 184788075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 8594, + "time_per_iteration": 2.547206163406372 + }, + { + "auxiliary_loss_clip": 0.01105211, + "auxiliary_loss_mlp": 0.0102705, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.03660214, + "epoch": 0.5167593566811964, + "flos": 20304360593280.0, + "grad_norm": 1.9981153431236998, + "language_loss": 0.77706152, + "learning_rate": 1.988510943586582e-06, + "loss": 0.79838419, + "num_input_tokens_seen": 184808710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 8595, + "time_per_iteration": 2.5214362144470215 + }, + { + "auxiliary_loss_clip": 0.01107198, + "auxiliary_loss_mlp": 0.0103521, + "balance_loss_clip": 1.02278233, + "balance_loss_mlp": 1.03896379, + "epoch": 0.5168194799338645, + "flos": 14611370313600.0, + "grad_norm": 1.5236872991766963, + "language_loss": 0.64860648, + "learning_rate": 1.9881214885514986e-06, + "loss": 0.67003053, + "num_input_tokens_seen": 184826475, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8596, + "time_per_iteration": 5.460975885391235 + }, + { + "auxiliary_loss_clip": 0.01109553, + "auxiliary_loss_mlp": 0.01029844, + "balance_loss_clip": 1.01603329, + "balance_loss_mlp": 1.04030609, + "epoch": 0.5168796031865324, + "flos": 25007257411200.0, + "grad_norm": 1.6129264208414336, + "language_loss": 0.75417203, + "learning_rate": 1.9877320339668492e-06, + "loss": 0.77556598, + "num_input_tokens_seen": 184845245, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.6953125, + "step": 8597, + "time_per_iteration": 2.477386236190796 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01025716, + "balance_loss_clip": 1.01356828, + "balance_loss_mlp": 1.03728151, + "epoch": 0.5169397264392004, + "flos": 26939969533440.0, + "grad_norm": 1.684107970499364, + "language_loss": 0.80853873, + "learning_rate": 1.987342579847403e-06, + "loss": 0.82987666, + "num_input_tokens_seen": 184866605, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8598, + "time_per_iteration": 2.5056118965148926 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01038534, + "balance_loss_clip": 1.02550411, + "balance_loss_mlp": 1.03853858, + "epoch": 0.5169998496918683, + "flos": 25407804948480.0, + "grad_norm": 1.5161151475530301, + "language_loss": 0.75315893, + "learning_rate": 1.9869531262079273e-06, + "loss": 0.77462423, + "num_input_tokens_seen": 184886945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 8599, + "time_per_iteration": 2.4907233715057373 + }, + { + "auxiliary_loss_clip": 0.01106465, + "auxiliary_loss_mlp": 0.01033371, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03874612, + "epoch": 0.5170599729445363, + "flos": 24680793674880.0, + "grad_norm": 5.031269669902368, + "language_loss": 0.72193408, + "learning_rate": 1.9865636730631904e-06, + "loss": 0.74333239, + "num_input_tokens_seen": 184905590, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8600, + "time_per_iteration": 2.4958672523498535 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01924503, + "balance_loss_mlp": 1.03902841, + "epoch": 0.5171200961972042, + "flos": 20994455664000.0, + "grad_norm": 1.5543027238719596, + "language_loss": 0.74527812, + "learning_rate": 1.9861742204279602e-06, + "loss": 0.76667523, + "num_input_tokens_seen": 184925555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8601, + "time_per_iteration": 2.4545562267303467 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01038986, + "balance_loss_clip": 1.02540207, + "balance_loss_mlp": 1.03855383, + "epoch": 0.5171802194498722, + "flos": 22745639427840.0, + "grad_norm": 1.930843678841908, + "language_loss": 0.83770829, + "learning_rate": 1.9857847683170045e-06, + "loss": 0.85918051, + "num_input_tokens_seen": 184944490, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 8602, + "time_per_iteration": 2.478315591812134 + }, + { + "auxiliary_loss_clip": 0.01109334, + "auxiliary_loss_mlp": 0.01030291, + "balance_loss_clip": 1.01727891, + "balance_loss_mlp": 1.03919971, + "epoch": 0.5172403427025402, + "flos": 28176732668160.0, + "grad_norm": 1.739467426965746, + "language_loss": 0.74487793, + "learning_rate": 1.9853953167450926e-06, + "loss": 0.76627421, + "num_input_tokens_seen": 184963190, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8603, + "time_per_iteration": 2.541987180709839 + }, + { + "auxiliary_loss_clip": 0.01110457, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02172458, + "balance_loss_mlp": 1.04043818, + "epoch": 0.5173004659552082, + "flos": 20337829090560.0, + "grad_norm": 2.0493295845447435, + "language_loss": 0.72732627, + "learning_rate": 1.9850058657269915e-06, + "loss": 0.74876976, + "num_input_tokens_seen": 184981220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 8604, + "time_per_iteration": 2.464036226272583 + }, + { + "auxiliary_loss_clip": 0.01113997, + "auxiliary_loss_mlp": 0.01032385, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03878832, + "epoch": 0.5173605892078762, + "flos": 19063323740160.0, + "grad_norm": 1.890584135418456, + "language_loss": 0.85098851, + "learning_rate": 1.984616415277469e-06, + "loss": 0.87245226, + "num_input_tokens_seen": 184998810, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.75, + "step": 8605, + "time_per_iteration": 2.469414472579956 + }, + { + "auxiliary_loss_clip": 0.01107307, + "auxiliary_loss_mlp": 0.01024655, + "balance_loss_clip": 1.01271009, + "balance_loss_mlp": 1.03827572, + "epoch": 0.5174207124605441, + "flos": 27995168396160.0, + "grad_norm": 1.4962077074735805, + "language_loss": 0.64887142, + "learning_rate": 1.984226965411294e-06, + "loss": 0.67019105, + "num_input_tokens_seen": 185021185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 8606, + "time_per_iteration": 2.5391039848327637 + }, + { + "auxiliary_loss_clip": 0.01108829, + "auxiliary_loss_mlp": 0.01027754, + "balance_loss_clip": 1.0153147, + "balance_loss_mlp": 1.04041243, + "epoch": 0.5174808357132121, + "flos": 19496657416320.0, + "grad_norm": 1.6359731326945595, + "language_loss": 0.77811146, + "learning_rate": 1.983837516143234e-06, + "loss": 0.79947728, + "num_input_tokens_seen": 185038465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8607, + "time_per_iteration": 2.4382975101470947 + }, + { + "auxiliary_loss_clip": 0.01111137, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02053022, + "balance_loss_mlp": 1.0399344, + "epoch": 0.51754095896588, + "flos": 22784171742720.0, + "grad_norm": 3.5447610791610638, + "language_loss": 0.72232366, + "learning_rate": 1.983448067488057e-06, + "loss": 0.74377209, + "num_input_tokens_seen": 185057340, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8608, + "time_per_iteration": 2.511740207672119 + }, + { + "auxiliary_loss_clip": 0.01115322, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.01927149, + "balance_loss_mlp": 1.04073501, + "epoch": 0.5176010822185481, + "flos": 22669257156480.0, + "grad_norm": 1.8799970026389359, + "language_loss": 0.86513162, + "learning_rate": 1.983058619460531e-06, + "loss": 0.88661158, + "num_input_tokens_seen": 185074935, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 8609, + "time_per_iteration": 2.453684091567993 + }, + { + "auxiliary_loss_clip": 0.01108892, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888371, + "balance_loss_mlp": 1.03858495, + "epoch": 0.517661205471216, + "flos": 23951196622080.0, + "grad_norm": 1.565375500859336, + "language_loss": 0.73396695, + "learning_rate": 1.9826691720754237e-06, + "loss": 0.75536072, + "num_input_tokens_seen": 185095050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 8610, + "time_per_iteration": 2.5529308319091797 + }, + { + "auxiliary_loss_clip": 0.01115772, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01813269, + "balance_loss_mlp": 1.04202247, + "epoch": 0.517721328723884, + "flos": 15596076735360.0, + "grad_norm": 1.8297114771569651, + "language_loss": 0.67358816, + "learning_rate": 1.9822797253475034e-06, + "loss": 0.69506592, + "num_input_tokens_seen": 185112275, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73828125, + "step": 8611, + "time_per_iteration": 2.4198501110076904 + }, + { + "auxiliary_loss_clip": 0.01108783, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.01808488, + "balance_loss_mlp": 1.0382731, + "epoch": 0.5177814519765519, + "flos": 20960197067520.0, + "grad_norm": 2.316941620789411, + "language_loss": 0.77502143, + "learning_rate": 1.9818902792915373e-06, + "loss": 0.79641283, + "num_input_tokens_seen": 185132165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 8612, + "time_per_iteration": 2.4943206310272217 + }, + { + "auxiliary_loss_clip": 0.01110636, + "auxiliary_loss_mlp": 0.01034055, + "balance_loss_clip": 1.02186632, + "balance_loss_mlp": 1.03938198, + "epoch": 0.5178415752292199, + "flos": 17967832796160.0, + "grad_norm": 1.9039649692993772, + "language_loss": 0.8192755, + "learning_rate": 1.981500833922294e-06, + "loss": 0.84072244, + "num_input_tokens_seen": 185151025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 8613, + "time_per_iteration": 2.434479236602783 + }, + { + "auxiliary_loss_clip": 0.01114755, + "auxiliary_loss_mlp": 0.01034084, + "balance_loss_clip": 1.02059531, + "balance_loss_mlp": 1.04346251, + "epoch": 0.5179016984818878, + "flos": 17821496787840.0, + "grad_norm": 2.1674567731422987, + "language_loss": 0.66747862, + "learning_rate": 1.981111389254541e-06, + "loss": 0.68896699, + "num_input_tokens_seen": 185168455, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8614, + "time_per_iteration": 2.4598941802978516 + }, + { + "auxiliary_loss_clip": 0.01112182, + "auxiliary_loss_mlp": 0.0103035, + "balance_loss_clip": 1.01736188, + "balance_loss_mlp": 1.04048586, + "epoch": 0.5179618217345558, + "flos": 17820455293440.0, + "grad_norm": 1.9388641649707037, + "language_loss": 0.86660814, + "learning_rate": 1.9807219453030453e-06, + "loss": 0.88803345, + "num_input_tokens_seen": 185184415, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 8615, + "time_per_iteration": 2.434614419937134 + }, + { + "auxiliary_loss_clip": 0.01110692, + "auxiliary_loss_mlp": 0.01040873, + "balance_loss_clip": 1.02877903, + "balance_loss_mlp": 1.04087663, + "epoch": 0.5180219449872238, + "flos": 22522131048960.0, + "grad_norm": 1.572223272426788, + "language_loss": 0.80601507, + "learning_rate": 1.9803325020825763e-06, + "loss": 0.82753074, + "num_input_tokens_seen": 185202910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8616, + "time_per_iteration": 2.489898920059204 + }, + { + "auxiliary_loss_clip": 0.01119523, + "auxiliary_loss_mlp": 0.01042995, + "balance_loss_clip": 1.02928019, + "balance_loss_mlp": 1.04558134, + "epoch": 0.5180820682398918, + "flos": 23915465568000.0, + "grad_norm": 1.6322050900799092, + "language_loss": 0.7524333, + "learning_rate": 1.9799430596079e-06, + "loss": 0.77405852, + "num_input_tokens_seen": 185223085, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 8617, + "time_per_iteration": 2.4741597175598145 + }, + { + "auxiliary_loss_clip": 0.0111036, + "auxiliary_loss_mlp": 0.01032777, + "balance_loss_clip": 1.01977718, + "balance_loss_mlp": 1.03946304, + "epoch": 0.5181421914925598, + "flos": 16979930064000.0, + "grad_norm": 1.8314484463575909, + "language_loss": 0.70137858, + "learning_rate": 1.979553617893785e-06, + "loss": 0.72280991, + "num_input_tokens_seen": 185241295, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8618, + "time_per_iteration": 2.4596426486968994 + }, + { + "auxiliary_loss_clip": 0.01036764, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00007808, + "balance_loss_mlp": 1.01408625, + "epoch": 0.5182023147452277, + "flos": 66059870872320.0, + "grad_norm": 0.9556911586994957, + "language_loss": 0.67222798, + "learning_rate": 1.979164176954999e-06, + "loss": 0.69260818, + "num_input_tokens_seen": 185298295, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8619, + "time_per_iteration": 3.0123016834259033 + }, + { + "auxiliary_loss_clip": 0.01107081, + "auxiliary_loss_mlp": 0.01032829, + "balance_loss_clip": 1.02055597, + "balance_loss_mlp": 1.03924203, + "epoch": 0.5182624379978957, + "flos": 18187749815040.0, + "grad_norm": 2.197431442121674, + "language_loss": 0.79314506, + "learning_rate": 1.97877473680631e-06, + "loss": 0.81454414, + "num_input_tokens_seen": 185317000, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 8620, + "time_per_iteration": 2.445173740386963 + }, + { + "auxiliary_loss_clip": 0.01108259, + "auxiliary_loss_mlp": 0.01038483, + "balance_loss_clip": 1.02625203, + "balance_loss_mlp": 1.03989077, + "epoch": 0.5183225612505636, + "flos": 14026708638720.0, + "grad_norm": 2.0514402600561765, + "language_loss": 0.81893396, + "learning_rate": 1.9783852974624846e-06, + "loss": 0.84040135, + "num_input_tokens_seen": 185331185, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 8621, + "time_per_iteration": 2.4382779598236084 + }, + { + "auxiliary_loss_clip": 0.01109273, + "auxiliary_loss_mlp": 0.01032378, + "balance_loss_clip": 1.02073121, + "balance_loss_mlp": 1.0391438, + "epoch": 0.5183826845032317, + "flos": 23659781581440.0, + "grad_norm": 1.9740999547408657, + "language_loss": 0.65540636, + "learning_rate": 1.9779958589382905e-06, + "loss": 0.67682284, + "num_input_tokens_seen": 185348955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 8622, + "time_per_iteration": 2.494173288345337 + }, + { + "auxiliary_loss_clip": 0.01114132, + "auxiliary_loss_mlp": 0.01038751, + "balance_loss_clip": 1.02528644, + "balance_loss_mlp": 1.04077148, + "epoch": 0.5184428077558996, + "flos": 15888605097600.0, + "grad_norm": 1.975231537474399, + "language_loss": 0.60350323, + "learning_rate": 1.977606421248497e-06, + "loss": 0.62503201, + "num_input_tokens_seen": 185367330, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8623, + "time_per_iteration": 2.427819013595581 + }, + { + "auxiliary_loss_clip": 0.0110912, + "auxiliary_loss_mlp": 0.01032142, + "balance_loss_clip": 1.01995301, + "balance_loss_mlp": 1.03832614, + "epoch": 0.5185029310085676, + "flos": 21030833162880.0, + "grad_norm": 1.7021073046505133, + "language_loss": 0.76074666, + "learning_rate": 1.9772169844078685e-06, + "loss": 0.78215921, + "num_input_tokens_seen": 185385060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 8624, + "time_per_iteration": 2.4636356830596924 + }, + { + "auxiliary_loss_clip": 0.01109665, + "auxiliary_loss_mlp": 0.01036507, + "balance_loss_clip": 1.02441311, + "balance_loss_mlp": 1.03890038, + "epoch": 0.5185630542612355, + "flos": 26542690133760.0, + "grad_norm": 2.7326139645058456, + "language_loss": 0.71175325, + "learning_rate": 1.9768275484311756e-06, + "loss": 0.73321491, + "num_input_tokens_seen": 185403745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 8625, + "time_per_iteration": 2.4977569580078125 + }, + { + "auxiliary_loss_clip": 0.01110816, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.02223408, + "balance_loss_mlp": 1.03980732, + "epoch": 0.5186231775139035, + "flos": 20668422890880.0, + "grad_norm": 1.8950159086376122, + "language_loss": 0.67929721, + "learning_rate": 1.976438113333184e-06, + "loss": 0.70074677, + "num_input_tokens_seen": 185422620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.7109375, + "step": 8626, + "time_per_iteration": 2.4934957027435303 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01889873, + "balance_loss_mlp": 1.03984976, + "epoch": 0.5186833007665714, + "flos": 20885502735360.0, + "grad_norm": 2.322377605069906, + "language_loss": 0.70487207, + "learning_rate": 1.9760486791286612e-06, + "loss": 0.72627008, + "num_input_tokens_seen": 185439380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8627, + "time_per_iteration": 2.445827007293701 + }, + { + "auxiliary_loss_clip": 0.01114683, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02539158, + "balance_loss_mlp": 1.04147446, + "epoch": 0.5187434240192395, + "flos": 20886903365760.0, + "grad_norm": 1.9255563847501656, + "language_loss": 0.73209083, + "learning_rate": 1.9756592458323753e-06, + "loss": 0.75361323, + "num_input_tokens_seen": 185458830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.734375, + "step": 8628, + "time_per_iteration": 2.500955581665039 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01032241, + "balance_loss_clip": 1.02039731, + "balance_loss_mlp": 1.04147768, + "epoch": 0.5188035472719074, + "flos": 19859929614720.0, + "grad_norm": 3.3927220028721994, + "language_loss": 0.77245331, + "learning_rate": 1.9752698134590927e-06, + "loss": 0.79388249, + "num_input_tokens_seen": 185477270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8629, + "time_per_iteration": 2.4560301303863525 + }, + { + "auxiliary_loss_clip": 0.01113327, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.0179081, + "balance_loss_mlp": 1.04206562, + "epoch": 0.5188636705245754, + "flos": 21138313633920.0, + "grad_norm": 2.1928775386787187, + "language_loss": 0.74820137, + "learning_rate": 1.9748803820235815e-06, + "loss": 0.76964092, + "num_input_tokens_seen": 185495795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8630, + "time_per_iteration": 2.496370792388916 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01035209, + "balance_loss_clip": 1.02210796, + "balance_loss_mlp": 1.03882229, + "epoch": 0.5189237937772434, + "flos": 22419786222720.0, + "grad_norm": 1.6137116253106134, + "language_loss": 0.80663669, + "learning_rate": 1.9744909515406093e-06, + "loss": 0.82809031, + "num_input_tokens_seen": 185514885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8631, + "time_per_iteration": 2.4534530639648438 + }, + { + "auxiliary_loss_clip": 0.01112884, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.01893413, + "balance_loss_mlp": 1.04085588, + "epoch": 0.5189839170299113, + "flos": 25446696399360.0, + "grad_norm": 1.5022963557810187, + "language_loss": 0.74575752, + "learning_rate": 1.974101522024942e-06, + "loss": 0.76720965, + "num_input_tokens_seen": 185537155, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8632, + "time_per_iteration": 2.5295352935791016 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01810372, + "balance_loss_mlp": 1.03738809, + "epoch": 0.5190440402825793, + "flos": 18587722734720.0, + "grad_norm": 1.784064079335437, + "language_loss": 0.78812337, + "learning_rate": 1.9737120934913477e-06, + "loss": 0.80948019, + "num_input_tokens_seen": 185555520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8633, + "time_per_iteration": 2.4241905212402344 + }, + { + "auxiliary_loss_clip": 0.01109914, + "auxiliary_loss_mlp": 0.01031089, + "balance_loss_clip": 1.01873302, + "balance_loss_mlp": 1.03893745, + "epoch": 0.5191041635352472, + "flos": 21908633731200.0, + "grad_norm": 1.7026702061892323, + "language_loss": 0.80149853, + "learning_rate": 1.9733226659545936e-06, + "loss": 0.82290852, + "num_input_tokens_seen": 185573855, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8634, + "time_per_iteration": 2.4851884841918945 + }, + { + "auxiliary_loss_clip": 0.01108415, + "auxiliary_loss_mlp": 0.01035144, + "balance_loss_clip": 1.02305627, + "balance_loss_mlp": 1.04024315, + "epoch": 0.5191642867879153, + "flos": 27527971173120.0, + "grad_norm": 1.4600796720036056, + "language_loss": 0.68628252, + "learning_rate": 1.9729332394294467e-06, + "loss": 0.70771807, + "num_input_tokens_seen": 185595145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 8635, + "time_per_iteration": 3.921346426010132 + }, + { + "auxiliary_loss_clip": 0.01113121, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.02083683, + "balance_loss_mlp": 1.04083443, + "epoch": 0.5192244100405832, + "flos": 15705999331200.0, + "grad_norm": 1.6781612563386181, + "language_loss": 0.7704699, + "learning_rate": 1.9725438139306742e-06, + "loss": 0.79193652, + "num_input_tokens_seen": 185613320, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8636, + "time_per_iteration": 2.45908260345459 + }, + { + "auxiliary_loss_clip": 0.01112314, + "auxiliary_loss_mlp": 0.01031183, + "balance_loss_clip": 1.01861811, + "balance_loss_mlp": 1.04090476, + "epoch": 0.5192845332932512, + "flos": 12057080313600.0, + "grad_norm": 1.9891179602637588, + "language_loss": 0.71459377, + "learning_rate": 1.9721543894730425e-06, + "loss": 0.73602873, + "num_input_tokens_seen": 185630730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8637, + "time_per_iteration": 5.353722810745239 + }, + { + "auxiliary_loss_clip": 0.01108688, + "auxiliary_loss_mlp": 0.01032602, + "balance_loss_clip": 1.01964426, + "balance_loss_mlp": 1.0394423, + "epoch": 0.5193446565459191, + "flos": 18953185662720.0, + "grad_norm": 3.7284266214304576, + "language_loss": 0.75943041, + "learning_rate": 1.9717649660713194e-06, + "loss": 0.78084332, + "num_input_tokens_seen": 185648515, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8638, + "time_per_iteration": 3.902477741241455 + }, + { + "auxiliary_loss_clip": 0.0110838, + "auxiliary_loss_mlp": 0.01030358, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.03863966, + "epoch": 0.5194047797985871, + "flos": 20374960775040.0, + "grad_norm": 2.006346025426826, + "language_loss": 0.74846971, + "learning_rate": 1.971375543740272e-06, + "loss": 0.76985711, + "num_input_tokens_seen": 185665220, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 8639, + "time_per_iteration": 2.453634738922119 + }, + { + "auxiliary_loss_clip": 0.01109964, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01604497, + "balance_loss_mlp": 1.04051375, + "epoch": 0.519464903051255, + "flos": 24353001135360.0, + "grad_norm": 1.6163455561126134, + "language_loss": 0.77538067, + "learning_rate": 1.9709861224946665e-06, + "loss": 0.79676771, + "num_input_tokens_seen": 185683750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8640, + "time_per_iteration": 2.482334613800049 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01883161, + "balance_loss_mlp": 1.04175985, + "epoch": 0.519525026303923, + "flos": 14061829161600.0, + "grad_norm": 1.623082815057782, + "language_loss": 0.65734208, + "learning_rate": 1.97059670234927e-06, + "loss": 0.67874962, + "num_input_tokens_seen": 185700625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 8641, + "time_per_iteration": 2.4567995071411133 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01033178, + "balance_loss_clip": 1.02142978, + "balance_loss_mlp": 1.04105425, + "epoch": 0.519585149556591, + "flos": 28835873193600.0, + "grad_norm": 1.8491224599980307, + "language_loss": 0.76197445, + "learning_rate": 1.97020728331885e-06, + "loss": 0.78340614, + "num_input_tokens_seen": 185721155, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8642, + "time_per_iteration": 2.5128276348114014 + }, + { + "auxiliary_loss_clip": 0.01109094, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02001774, + "balance_loss_mlp": 1.04037452, + "epoch": 0.519645272809259, + "flos": 25373007648000.0, + "grad_norm": 1.4733024685255247, + "language_loss": 0.83179498, + "learning_rate": 1.9698178654181726e-06, + "loss": 0.85320538, + "num_input_tokens_seen": 185740990, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 8643, + "time_per_iteration": 2.5094587802886963 + }, + { + "auxiliary_loss_clip": 0.0111188, + "auxiliary_loss_mlp": 0.01042001, + "balance_loss_clip": 1.02856052, + "balance_loss_mlp": 1.03983521, + "epoch": 0.519705396061927, + "flos": 25372863993600.0, + "grad_norm": 1.5341454697133152, + "language_loss": 0.70307451, + "learning_rate": 1.969428448662004e-06, + "loss": 0.72461337, + "num_input_tokens_seen": 185762235, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8644, + "time_per_iteration": 2.5111963748931885 + }, + { + "auxiliary_loss_clip": 0.01110422, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.01967788, + "balance_loss_mlp": 1.03966331, + "epoch": 0.5197655193145949, + "flos": 28476228268800.0, + "grad_norm": 1.8635414079348847, + "language_loss": 0.80144334, + "learning_rate": 1.9690390330651133e-06, + "loss": 0.82286364, + "num_input_tokens_seen": 185783415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 8645, + "time_per_iteration": 2.529616117477417 + }, + { + "auxiliary_loss_clip": 0.01109035, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01647151, + "balance_loss_mlp": 1.03836131, + "epoch": 0.5198256425672629, + "flos": 20009138711040.0, + "grad_norm": 1.899493861617854, + "language_loss": 0.78147799, + "learning_rate": 1.968649618642264e-06, + "loss": 0.80286086, + "num_input_tokens_seen": 185801345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8646, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01112803, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02101934, + "balance_loss_mlp": 1.04184628, + "epoch": 0.5198857658199308, + "flos": 19828867328640.0, + "grad_norm": 1.8109153766187511, + "language_loss": 0.66239858, + "learning_rate": 1.9682602054082252e-06, + "loss": 0.68386012, + "num_input_tokens_seen": 185820815, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8647, + "time_per_iteration": 2.4503657817840576 + }, + { + "auxiliary_loss_clip": 0.01113411, + "auxiliary_loss_mlp": 0.01032738, + "balance_loss_clip": 1.01834917, + "balance_loss_mlp": 1.04010677, + "epoch": 0.5199458890725989, + "flos": 24461918150400.0, + "grad_norm": 4.112424605735972, + "language_loss": 0.71817285, + "learning_rate": 1.967870793377763e-06, + "loss": 0.73963439, + "num_input_tokens_seen": 185841450, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.734375, + "step": 8648, + "time_per_iteration": 2.49595308303833 + }, + { + "auxiliary_loss_clip": 0.01112873, + "auxiliary_loss_mlp": 0.0103029, + "balance_loss_clip": 1.01714706, + "balance_loss_mlp": 1.0411458, + "epoch": 0.5200060123252668, + "flos": 23404779953280.0, + "grad_norm": 1.6438613988660609, + "language_loss": 0.64412069, + "learning_rate": 1.967481382565642e-06, + "loss": 0.66555232, + "num_input_tokens_seen": 185859935, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8649, + "time_per_iteration": 2.4781436920166016 + }, + { + "auxiliary_loss_clip": 0.01116558, + "auxiliary_loss_mlp": 0.01035075, + "balance_loss_clip": 1.02025771, + "balance_loss_mlp": 1.04224229, + "epoch": 0.5200661355779348, + "flos": 17201355454080.0, + "grad_norm": 1.8268985026448872, + "language_loss": 0.70691884, + "learning_rate": 1.9670919729866315e-06, + "loss": 0.72843516, + "num_input_tokens_seen": 185876795, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7421875, + "step": 8650, + "time_per_iteration": 2.4350762367248535 + }, + { + "auxiliary_loss_clip": 0.01108729, + "auxiliary_loss_mlp": 0.01028355, + "balance_loss_clip": 1.01559973, + "balance_loss_mlp": 1.03854239, + "epoch": 0.5201262588306027, + "flos": 18515075477760.0, + "grad_norm": 1.6557672224542628, + "language_loss": 0.7709741, + "learning_rate": 1.966702564655496e-06, + "loss": 0.79234493, + "num_input_tokens_seen": 185895570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8651, + "time_per_iteration": 2.4439852237701416 + }, + { + "auxiliary_loss_clip": 0.01115555, + "auxiliary_loss_mlp": 0.01035706, + "balance_loss_clip": 1.02171111, + "balance_loss_mlp": 1.04384518, + "epoch": 0.5201863820832707, + "flos": 18619395552000.0, + "grad_norm": 1.7772284952150523, + "language_loss": 0.78304142, + "learning_rate": 1.966313157587003e-06, + "loss": 0.80455399, + "num_input_tokens_seen": 185913700, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.71484375, + "step": 8652, + "time_per_iteration": 2.4581267833709717 + }, + { + "auxiliary_loss_clip": 0.01114617, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01683807, + "balance_loss_mlp": 1.04281044, + "epoch": 0.5202465053359386, + "flos": 22857142222080.0, + "grad_norm": 2.0186078989624017, + "language_loss": 0.7027083, + "learning_rate": 1.9659237517959187e-06, + "loss": 0.72416592, + "num_input_tokens_seen": 185932460, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.71875, + "step": 8653, + "time_per_iteration": 2.4945242404937744 + }, + { + "auxiliary_loss_clip": 0.01114383, + "auxiliary_loss_mlp": 0.01040745, + "balance_loss_clip": 1.02703571, + "balance_loss_mlp": 1.04092932, + "epoch": 0.5203066285886067, + "flos": 21981532383360.0, + "grad_norm": 1.6276924489714153, + "language_loss": 0.78420818, + "learning_rate": 1.965534347297008e-06, + "loss": 0.80575949, + "num_input_tokens_seen": 185952030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8654, + "time_per_iteration": 2.4857122898101807 + }, + { + "auxiliary_loss_clip": 0.01117815, + "auxiliary_loss_mlp": 0.01038442, + "balance_loss_clip": 1.02450645, + "balance_loss_mlp": 1.04275405, + "epoch": 0.5203667518412746, + "flos": 20233329448320.0, + "grad_norm": 2.316843494652732, + "language_loss": 0.8424964, + "learning_rate": 1.9651449441050393e-06, + "loss": 0.86405897, + "num_input_tokens_seen": 185973130, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 8655, + "time_per_iteration": 2.48307728767395 + }, + { + "auxiliary_loss_clip": 0.0111092, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01860702, + "balance_loss_mlp": 1.04225183, + "epoch": 0.5204268750939426, + "flos": 15705460627200.0, + "grad_norm": 3.712191764961765, + "language_loss": 0.65503991, + "learning_rate": 1.9647555422347777e-06, + "loss": 0.67645752, + "num_input_tokens_seen": 185990200, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 8656, + "time_per_iteration": 2.442760705947876 + }, + { + "auxiliary_loss_clip": 0.01114044, + "auxiliary_loss_mlp": 0.01029702, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.04263127, + "epoch": 0.5204869983466105, + "flos": 27449469999360.0, + "grad_norm": 2.4919467158509385, + "language_loss": 0.73240453, + "learning_rate": 1.9643661417009893e-06, + "loss": 0.753842, + "num_input_tokens_seen": 186009880, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 8657, + "time_per_iteration": 2.5198535919189453 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033998, + "balance_loss_clip": 1.02064037, + "balance_loss_mlp": 1.042382, + "epoch": 0.5205471215992785, + "flos": 20595452411520.0, + "grad_norm": 1.757060291742625, + "language_loss": 0.71675289, + "learning_rate": 1.9639767425184408e-06, + "loss": 0.73821175, + "num_input_tokens_seen": 186026680, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6953125, + "step": 8658, + "time_per_iteration": 2.4651598930358887 + }, + { + "auxiliary_loss_clip": 0.0111093, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.01868176, + "balance_loss_mlp": 1.0400281, + "epoch": 0.5206072448519465, + "flos": 22127904305280.0, + "grad_norm": 1.6795003925123537, + "language_loss": 0.83473611, + "learning_rate": 1.963587344701897e-06, + "loss": 0.85616386, + "num_input_tokens_seen": 186046920, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8659, + "time_per_iteration": 2.462956428527832 + }, + { + "auxiliary_loss_clip": 0.01119845, + "auxiliary_loss_mlp": 0.01039223, + "balance_loss_clip": 1.02470934, + "balance_loss_mlp": 1.04351366, + "epoch": 0.5206673681046144, + "flos": 18330422636160.0, + "grad_norm": 1.9135176980647008, + "language_loss": 0.75763941, + "learning_rate": 1.9631979482661253e-06, + "loss": 0.77923, + "num_input_tokens_seen": 186062090, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.76171875, + "step": 8660, + "time_per_iteration": 2.4544646739959717 + }, + { + "auxiliary_loss_clip": 0.01111893, + "auxiliary_loss_mlp": 0.0103427, + "balance_loss_clip": 1.02199721, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5207274913572825, + "flos": 20230240878720.0, + "grad_norm": 1.7715737398241405, + "language_loss": 0.78001404, + "learning_rate": 1.9628085532258906e-06, + "loss": 0.80147564, + "num_input_tokens_seen": 186081135, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8661, + "time_per_iteration": 2.4456324577331543 + }, + { + "auxiliary_loss_clip": 0.01113873, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.01818848, + "balance_loss_mlp": 1.0404228, + "epoch": 0.5207876146099504, + "flos": 22127042378880.0, + "grad_norm": 1.805356331270093, + "language_loss": 0.70643514, + "learning_rate": 1.9624191595959603e-06, + "loss": 0.72788274, + "num_input_tokens_seen": 186099700, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8662, + "time_per_iteration": 2.5272181034088135 + }, + { + "auxiliary_loss_clip": 0.01110335, + "auxiliary_loss_mlp": 0.01032574, + "balance_loss_clip": 1.01835203, + "balance_loss_mlp": 1.04033709, + "epoch": 0.5208477378626184, + "flos": 23878908501120.0, + "grad_norm": 1.669754729528693, + "language_loss": 0.6935755, + "learning_rate": 1.962029767391098e-06, + "loss": 0.71500456, + "num_input_tokens_seen": 186119740, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.69921875, + "step": 8663, + "time_per_iteration": 2.468287706375122 + }, + { + "auxiliary_loss_clip": 0.01113011, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.01822364, + "balance_loss_mlp": 1.04173064, + "epoch": 0.5209078611152863, + "flos": 20961525870720.0, + "grad_norm": 2.618720199838109, + "language_loss": 0.76771712, + "learning_rate": 1.961640376626072e-06, + "loss": 0.7891587, + "num_input_tokens_seen": 186140645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8664, + "time_per_iteration": 2.519645929336548 + }, + { + "auxiliary_loss_clip": 0.01111987, + "auxiliary_loss_mlp": 0.01036724, + "balance_loss_clip": 1.02387905, + "balance_loss_mlp": 1.04057467, + "epoch": 0.5209679843679543, + "flos": 20667740532480.0, + "grad_norm": 1.987870026093088, + "language_loss": 0.76193488, + "learning_rate": 1.961250987315646e-06, + "loss": 0.78342199, + "num_input_tokens_seen": 186160130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8665, + "time_per_iteration": 2.4501259326934814 + }, + { + "auxiliary_loss_clip": 0.01111359, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02113414, + "balance_loss_mlp": 1.04135728, + "epoch": 0.5210281076206222, + "flos": 20227295963520.0, + "grad_norm": 1.609030555811117, + "language_loss": 0.71689177, + "learning_rate": 1.960861599474586e-06, + "loss": 0.73833793, + "num_input_tokens_seen": 186179485, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8666, + "time_per_iteration": 2.4961183071136475 + }, + { + "auxiliary_loss_clip": 0.01119663, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.04257357, + "epoch": 0.5210882308732903, + "flos": 16069989801600.0, + "grad_norm": 2.081998488723945, + "language_loss": 0.68599117, + "learning_rate": 1.9604722131176592e-06, + "loss": 0.7075423, + "num_input_tokens_seen": 186197140, + "router_z_loss_clip": 0.15136719, + "router_z_loss_mlp": 0.76953125, + "step": 8667, + "time_per_iteration": 2.4216842651367188 + }, + { + "auxiliary_loss_clip": 0.01107841, + "auxiliary_loss_mlp": 0.01034805, + "balance_loss_clip": 1.02247858, + "balance_loss_mlp": 1.03913903, + "epoch": 0.5211483541259582, + "flos": 24825298089600.0, + "grad_norm": 1.3811752682570164, + "language_loss": 0.81006289, + "learning_rate": 1.960082828259629e-06, + "loss": 0.83148932, + "num_input_tokens_seen": 186216800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8668, + "time_per_iteration": 2.5712640285491943 + }, + { + "auxiliary_loss_clip": 0.01112305, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.0184648, + "balance_loss_mlp": 1.0413909, + "epoch": 0.5212084773786262, + "flos": 20370651143040.0, + "grad_norm": 2.7130530435254507, + "language_loss": 0.63821161, + "learning_rate": 1.9596934449152623e-06, + "loss": 0.65964901, + "num_input_tokens_seen": 186235320, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8669, + "time_per_iteration": 2.485560894012451 + }, + { + "auxiliary_loss_clip": 0.01114118, + "auxiliary_loss_mlp": 0.01040749, + "balance_loss_clip": 1.02779722, + "balance_loss_mlp": 1.0434041, + "epoch": 0.5212686006312941, + "flos": 23145468693120.0, + "grad_norm": 1.5472632399176471, + "language_loss": 0.66420943, + "learning_rate": 1.959304063099325e-06, + "loss": 0.68575811, + "num_input_tokens_seen": 186254460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 8670, + "time_per_iteration": 2.5161590576171875 + }, + { + "auxiliary_loss_clip": 0.01107902, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02204931, + "balance_loss_mlp": 1.04005504, + "epoch": 0.5213287238839621, + "flos": 27774030314880.0, + "grad_norm": 2.0274420083477436, + "language_loss": 0.7666502, + "learning_rate": 1.9589146828265806e-06, + "loss": 0.78807229, + "num_input_tokens_seen": 186269465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 8671, + "time_per_iteration": 2.4505884647369385 + }, + { + "auxiliary_loss_clip": 0.01117202, + "auxiliary_loss_mlp": 0.01035681, + "balance_loss_clip": 1.022246, + "balance_loss_mlp": 1.0442729, + "epoch": 0.5213888471366301, + "flos": 19937676602880.0, + "grad_norm": 6.168212064153821, + "language_loss": 0.78184325, + "learning_rate": 1.958525304111796e-06, + "loss": 0.80337209, + "num_input_tokens_seen": 186288660, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8672, + "time_per_iteration": 2.501171350479126 + }, + { + "auxiliary_loss_clip": 0.0110814, + "auxiliary_loss_mlp": 0.0103169, + "balance_loss_clip": 1.01958418, + "balance_loss_mlp": 1.03945541, + "epoch": 0.521448970389298, + "flos": 16982731324800.0, + "grad_norm": 1.8428028532242804, + "language_loss": 0.72013724, + "learning_rate": 1.958135926969736e-06, + "loss": 0.74153554, + "num_input_tokens_seen": 186305760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 8673, + "time_per_iteration": 2.4188430309295654 + }, + { + "auxiliary_loss_clip": 0.01110821, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01467764, + "balance_loss_mlp": 1.04007983, + "epoch": 0.5215090936419661, + "flos": 18989706816000.0, + "grad_norm": 1.5425888836045836, + "language_loss": 0.75258517, + "learning_rate": 1.957746551415166e-06, + "loss": 0.77397001, + "num_input_tokens_seen": 186324135, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8674, + "time_per_iteration": 2.4615721702575684 + }, + { + "auxiliary_loss_clip": 0.01112251, + "auxiliary_loss_mlp": 0.01034993, + "balance_loss_clip": 1.02111149, + "balance_loss_mlp": 1.03926849, + "epoch": 0.521569216894634, + "flos": 16143427157760.0, + "grad_norm": 2.4005630002003198, + "language_loss": 0.86177206, + "learning_rate": 1.9573571774628506e-06, + "loss": 0.88324457, + "num_input_tokens_seen": 186340205, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8675, + "time_per_iteration": 2.4192757606506348 + }, + { + "auxiliary_loss_clip": 0.01036097, + "auxiliary_loss_mlp": 0.00999914, + "balance_loss_clip": 0.99874002, + "balance_loss_mlp": 1.01361609, + "epoch": 0.521629340147302, + "flos": 57579493282560.0, + "grad_norm": 0.8810836824461878, + "language_loss": 0.6315189, + "learning_rate": 1.9569678051275556e-06, + "loss": 0.65187901, + "num_input_tokens_seen": 186396940, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.22460938, + "step": 8676, + "time_per_iteration": 4.428101062774658 + }, + { + "auxiliary_loss_clip": 0.01110201, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.0180341, + "balance_loss_mlp": 1.04064405, + "epoch": 0.5216894633999699, + "flos": 26796901662720.0, + "grad_norm": 1.671918865817182, + "language_loss": 0.68830431, + "learning_rate": 1.956578434424046e-06, + "loss": 0.70970994, + "num_input_tokens_seen": 186418680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 8677, + "time_per_iteration": 2.54658579826355 + }, + { + "auxiliary_loss_clip": 0.01110241, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.01739907, + "balance_loss_mlp": 1.03994, + "epoch": 0.5217495866526379, + "flos": 26358719650560.0, + "grad_norm": 1.5408434392952677, + "language_loss": 0.65516353, + "learning_rate": 1.956189065367086e-06, + "loss": 0.6765672, + "num_input_tokens_seen": 186438265, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8678, + "time_per_iteration": 2.4848899841308594 + }, + { + "auxiliary_loss_clip": 0.01115921, + "auxiliary_loss_mlp": 0.01039888, + "balance_loss_clip": 1.02607715, + "balance_loss_mlp": 1.04188991, + "epoch": 0.5218097099053058, + "flos": 23584009841280.0, + "grad_norm": 2.860112109233836, + "language_loss": 0.69020754, + "learning_rate": 1.9557996979714414e-06, + "loss": 0.71176565, + "num_input_tokens_seen": 186456870, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7421875, + "step": 8679, + "time_per_iteration": 5.267160654067993 + }, + { + "auxiliary_loss_clip": 0.01114296, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02467108, + "balance_loss_mlp": 1.04272938, + "epoch": 0.5218698331579739, + "flos": 18077396256000.0, + "grad_norm": 1.7057222009225053, + "language_loss": 0.66956079, + "learning_rate": 1.9554103322518764e-06, + "loss": 0.69107741, + "num_input_tokens_seen": 186476425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8680, + "time_per_iteration": 3.938239574432373 + }, + { + "auxiliary_loss_clip": 0.01112432, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02248955, + "balance_loss_mlp": 1.04123902, + "epoch": 0.5219299564106418, + "flos": 19281121856640.0, + "grad_norm": 1.8837479968625288, + "language_loss": 0.83069575, + "learning_rate": 1.955020968223156e-06, + "loss": 0.85217923, + "num_input_tokens_seen": 186492555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8681, + "time_per_iteration": 2.475834369659424 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.03964293, + "epoch": 0.5219900796633098, + "flos": 26651355753600.0, + "grad_norm": 1.7236617199536146, + "language_loss": 0.77448237, + "learning_rate": 1.9546316059000454e-06, + "loss": 0.79592931, + "num_input_tokens_seen": 186513190, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 8682, + "time_per_iteration": 2.484111785888672 + }, + { + "auxiliary_loss_clip": 0.01112356, + "auxiliary_loss_mlp": 0.01043116, + "balance_loss_clip": 1.03124917, + "balance_loss_mlp": 1.041852, + "epoch": 0.5220502029159777, + "flos": 34312717382400.0, + "grad_norm": 1.4820765209382558, + "language_loss": 0.68982363, + "learning_rate": 1.9542422452973082e-06, + "loss": 0.71137834, + "num_input_tokens_seen": 186534830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 8683, + "time_per_iteration": 2.579467535018921 + }, + { + "auxiliary_loss_clip": 0.01112188, + "auxiliary_loss_mlp": 0.01040104, + "balance_loss_clip": 1.02706265, + "balance_loss_mlp": 1.04016137, + "epoch": 0.5221103261686457, + "flos": 22156488552960.0, + "grad_norm": 1.598693343235541, + "language_loss": 0.7622329, + "learning_rate": 1.9538528864297104e-06, + "loss": 0.78375584, + "num_input_tokens_seen": 186554390, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8684, + "time_per_iteration": 2.4642298221588135 + }, + { + "auxiliary_loss_clip": 0.01107617, + "auxiliary_loss_mlp": 0.01031395, + "balance_loss_clip": 1.01886606, + "balance_loss_mlp": 1.03845632, + "epoch": 0.5221704494213137, + "flos": 19208402772480.0, + "grad_norm": 1.6077803987399797, + "language_loss": 0.75887376, + "learning_rate": 1.9534635293120153e-06, + "loss": 0.7802639, + "num_input_tokens_seen": 186572360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8685, + "time_per_iteration": 2.4533908367156982 + }, + { + "auxiliary_loss_clip": 0.01113803, + "auxiliary_loss_mlp": 0.01038269, + "balance_loss_clip": 1.02562094, + "balance_loss_mlp": 1.0427258, + "epoch": 0.5222305726739817, + "flos": 19354056422400.0, + "grad_norm": 1.88354393014551, + "language_loss": 0.80851054, + "learning_rate": 1.9530741739589876e-06, + "loss": 0.83003128, + "num_input_tokens_seen": 186590655, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 8686, + "time_per_iteration": 2.430154323577881 + }, + { + "auxiliary_loss_clip": 0.01106791, + "auxiliary_loss_mlp": 0.01036694, + "balance_loss_clip": 1.02474344, + "balance_loss_mlp": 1.03876567, + "epoch": 0.5222906959266497, + "flos": 27814789272960.0, + "grad_norm": 1.664143868034185, + "language_loss": 0.70208037, + "learning_rate": 1.9526848203853927e-06, + "loss": 0.72351515, + "num_input_tokens_seen": 186610345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 8687, + "time_per_iteration": 2.510512590408325 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02171767, + "balance_loss_mlp": 1.03840709, + "epoch": 0.5223508191793176, + "flos": 12712988615040.0, + "grad_norm": 2.0206883326938407, + "language_loss": 0.82963884, + "learning_rate": 1.9522954686059936e-06, + "loss": 0.85104954, + "num_input_tokens_seen": 186624360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 8688, + "time_per_iteration": 2.4092836380004883 + }, + { + "auxiliary_loss_clip": 0.0110979, + "auxiliary_loss_mlp": 0.01033698, + "balance_loss_clip": 1.02107966, + "balance_loss_mlp": 1.04007506, + "epoch": 0.5224109424319856, + "flos": 15632238752640.0, + "grad_norm": 2.711188417076446, + "language_loss": 0.73736638, + "learning_rate": 1.9519061186355558e-06, + "loss": 0.75880128, + "num_input_tokens_seen": 186638680, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 8689, + "time_per_iteration": 2.4741477966308594 + }, + { + "auxiliary_loss_clip": 0.01109408, + "auxiliary_loss_mlp": 0.01033862, + "balance_loss_clip": 1.02147067, + "balance_loss_mlp": 1.04056704, + "epoch": 0.5224710656846535, + "flos": 15742233175680.0, + "grad_norm": 1.8604688899774438, + "language_loss": 0.82882619, + "learning_rate": 1.9515167704888417e-06, + "loss": 0.85025889, + "num_input_tokens_seen": 186655840, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8690, + "time_per_iteration": 2.4194648265838623 + }, + { + "auxiliary_loss_clip": 0.01110389, + "auxiliary_loss_mlp": 0.01038197, + "balance_loss_clip": 1.02476192, + "balance_loss_mlp": 1.03937626, + "epoch": 0.5225311889373215, + "flos": 26030998938240.0, + "grad_norm": 2.3332187959772246, + "language_loss": 0.79397631, + "learning_rate": 1.9511274241806173e-06, + "loss": 0.81546217, + "num_input_tokens_seen": 186674150, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8691, + "time_per_iteration": 2.52500319480896 + }, + { + "auxiliary_loss_clip": 0.01113798, + "auxiliary_loss_mlp": 0.0104147, + "balance_loss_clip": 1.02794003, + "balance_loss_mlp": 1.04154706, + "epoch": 0.5225913121899894, + "flos": 18369278173440.0, + "grad_norm": 1.8556717943569576, + "language_loss": 0.7679857, + "learning_rate": 1.950738079725646e-06, + "loss": 0.78953838, + "num_input_tokens_seen": 186690675, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.72265625, + "step": 8692, + "time_per_iteration": 2.4420764446258545 + }, + { + "auxiliary_loss_clip": 0.0110865, + "auxiliary_loss_mlp": 0.01032578, + "balance_loss_clip": 1.02139628, + "balance_loss_mlp": 1.04145277, + "epoch": 0.5226514354426575, + "flos": 29273516501760.0, + "grad_norm": 1.6990103355094375, + "language_loss": 0.72441196, + "learning_rate": 1.950348737138691e-06, + "loss": 0.74582422, + "num_input_tokens_seen": 186710380, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 8693, + "time_per_iteration": 2.551316261291504 + }, + { + "auxiliary_loss_clip": 0.01114591, + "auxiliary_loss_mlp": 0.01042549, + "balance_loss_clip": 1.02841115, + "balance_loss_mlp": 1.04073966, + "epoch": 0.5227115586953254, + "flos": 22853299466880.0, + "grad_norm": 1.780524663497215, + "language_loss": 0.81990045, + "learning_rate": 1.949959396434517e-06, + "loss": 0.84147185, + "num_input_tokens_seen": 186729135, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 8694, + "time_per_iteration": 2.4666013717651367 + }, + { + "auxiliary_loss_clip": 0.01036217, + "auxiliary_loss_mlp": 0.01006918, + "balance_loss_clip": 1.00584531, + "balance_loss_mlp": 1.01379716, + "epoch": 0.5227716819479934, + "flos": 57474419022720.0, + "grad_norm": 0.771665075265138, + "language_loss": 0.55743444, + "learning_rate": 1.949570057627888e-06, + "loss": 0.57786584, + "num_input_tokens_seen": 186791115, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8695, + "time_per_iteration": 3.116420269012451 + }, + { + "auxiliary_loss_clip": 0.01111884, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02121711, + "balance_loss_mlp": 1.04176521, + "epoch": 0.5228318052006613, + "flos": 13808264077440.0, + "grad_norm": 1.693403101851131, + "language_loss": 0.7333045, + "learning_rate": 1.9491807207335672e-06, + "loss": 0.75476253, + "num_input_tokens_seen": 186808660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 8696, + "time_per_iteration": 2.437974452972412 + }, + { + "auxiliary_loss_clip": 0.01112043, + "auxiliary_loss_mlp": 0.01030931, + "balance_loss_clip": 1.01840782, + "balance_loss_mlp": 1.04123831, + "epoch": 0.5228919284533293, + "flos": 15596184476160.0, + "grad_norm": 1.6647399718358808, + "language_loss": 0.7097398, + "learning_rate": 1.948791385766319e-06, + "loss": 0.73116946, + "num_input_tokens_seen": 186825900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 8697, + "time_per_iteration": 2.5316948890686035 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.0171392, + "balance_loss_mlp": 1.04016519, + "epoch": 0.5229520517059973, + "flos": 22491499726080.0, + "grad_norm": 1.6518576838111187, + "language_loss": 0.80392116, + "learning_rate": 1.948402052740906e-06, + "loss": 0.82528424, + "num_input_tokens_seen": 186843735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8698, + "time_per_iteration": 2.4515864849090576 + }, + { + "auxiliary_loss_clip": 0.01110863, + "auxiliary_loss_mlp": 0.01034022, + "balance_loss_clip": 1.0218327, + "balance_loss_mlp": 1.04055512, + "epoch": 0.5230121749586653, + "flos": 22090880361600.0, + "grad_norm": 1.702568194733703, + "language_loss": 0.74550211, + "learning_rate": 1.948012721672093e-06, + "loss": 0.76695091, + "num_input_tokens_seen": 186862440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 8699, + "time_per_iteration": 2.508180856704712 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.01700819, + "balance_loss_mlp": 1.04079318, + "epoch": 0.5230722982113333, + "flos": 22127150119680.0, + "grad_norm": 1.4994824070372519, + "language_loss": 0.73465139, + "learning_rate": 1.947623392574642e-06, + "loss": 0.75609958, + "num_input_tokens_seen": 186880940, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 8700, + "time_per_iteration": 2.455620765686035 + }, + { + "auxiliary_loss_clip": 0.01114495, + "auxiliary_loss_mlp": 0.01035916, + "balance_loss_clip": 1.02276719, + "balance_loss_mlp": 1.0418222, + "epoch": 0.5231324214640012, + "flos": 25009268572800.0, + "grad_norm": 1.82733314477648, + "language_loss": 0.66863132, + "learning_rate": 1.947234065463318e-06, + "loss": 0.69013548, + "num_input_tokens_seen": 186900785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8701, + "time_per_iteration": 2.5278706550598145 + }, + { + "auxiliary_loss_clip": 0.01107483, + "auxiliary_loss_mlp": 0.01034421, + "balance_loss_clip": 1.02162433, + "balance_loss_mlp": 1.03844106, + "epoch": 0.5231925447166692, + "flos": 25740517651200.0, + "grad_norm": 2.0326391886622686, + "language_loss": 0.66616488, + "learning_rate": 1.9468447403528826e-06, + "loss": 0.68758386, + "num_input_tokens_seen": 186920895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 8702, + "time_per_iteration": 2.474238872528076 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033297, + "balance_loss_clip": 1.02040434, + "balance_loss_mlp": 1.04128182, + "epoch": 0.5232526679693371, + "flos": 21433930565760.0, + "grad_norm": 1.9248840397651374, + "language_loss": 0.7671175, + "learning_rate": 1.946455417258101e-06, + "loss": 0.78856003, + "num_input_tokens_seen": 186940605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8703, + "time_per_iteration": 2.466836929321289 + }, + { + "auxiliary_loss_clip": 0.01115826, + "auxiliary_loss_mlp": 0.0104125, + "balance_loss_clip": 1.02648616, + "balance_loss_mlp": 1.04065156, + "epoch": 0.5233127912220051, + "flos": 35298393471360.0, + "grad_norm": 2.7352924521395576, + "language_loss": 0.76380461, + "learning_rate": 1.9460660961937348e-06, + "loss": 0.78537536, + "num_input_tokens_seen": 186960820, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75390625, + "step": 8704, + "time_per_iteration": 2.566021680831909 + }, + { + "auxiliary_loss_clip": 0.01109442, + "auxiliary_loss_mlp": 0.01039766, + "balance_loss_clip": 1.0272727, + "balance_loss_mlp": 1.04157901, + "epoch": 0.523372914474673, + "flos": 17051320344960.0, + "grad_norm": 1.6527680542100833, + "language_loss": 0.7804389, + "learning_rate": 1.9456767771745474e-06, + "loss": 0.80193096, + "num_input_tokens_seen": 186976240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8705, + "time_per_iteration": 2.4414021968841553 + }, + { + "auxiliary_loss_clip": 0.01113477, + "auxiliary_loss_mlp": 0.01028501, + "balance_loss_clip": 1.01545918, + "balance_loss_mlp": 1.04121351, + "epoch": 0.5234330377273411, + "flos": 18406302117120.0, + "grad_norm": 1.9173845394592544, + "language_loss": 0.69808084, + "learning_rate": 1.9452874602153027e-06, + "loss": 0.7195006, + "num_input_tokens_seen": 186992855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8706, + "time_per_iteration": 2.4252305030822754 + }, + { + "auxiliary_loss_clip": 0.01033927, + "auxiliary_loss_mlp": 0.00999849, + "balance_loss_clip": 0.99876386, + "balance_loss_mlp": 1.01179016, + "epoch": 0.523493160980009, + "flos": 65850296970240.0, + "grad_norm": 0.6804801593959132, + "language_loss": 0.52532774, + "learning_rate": 1.9448981453307623e-06, + "loss": 0.5456655, + "num_input_tokens_seen": 187051205, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.22167969, + "step": 8707, + "time_per_iteration": 3.142758369445801 + }, + { + "auxiliary_loss_clip": 0.01109991, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02262449, + "balance_loss_mlp": 1.03904724, + "epoch": 0.523553284232677, + "flos": 21872076664320.0, + "grad_norm": 1.7383881327323734, + "language_loss": 0.74716955, + "learning_rate": 1.9445088325356904e-06, + "loss": 0.76862097, + "num_input_tokens_seen": 187070540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8708, + "time_per_iteration": 2.4591562747955322 + }, + { + "auxiliary_loss_clip": 0.01109127, + "auxiliary_loss_mlp": 0.01026089, + "balance_loss_clip": 1.01402545, + "balance_loss_mlp": 1.04014444, + "epoch": 0.5236134074853449, + "flos": 20848191482880.0, + "grad_norm": 1.691977522935515, + "language_loss": 0.77432841, + "learning_rate": 1.944119521844849e-06, + "loss": 0.79568058, + "num_input_tokens_seen": 187089975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8709, + "time_per_iteration": 2.480982780456543 + }, + { + "auxiliary_loss_clip": 0.01114299, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.01927257, + "balance_loss_mlp": 1.03814077, + "epoch": 0.5236735307380129, + "flos": 25520421064320.0, + "grad_norm": 1.9878514646446084, + "language_loss": 0.8357569, + "learning_rate": 1.9437302132730003e-06, + "loss": 0.85724527, + "num_input_tokens_seen": 187108775, + "router_z_loss_clip": 0.15234375, + "router_z_loss_mlp": 0.76171875, + "step": 8710, + "time_per_iteration": 2.4901626110076904 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01026835, + "balance_loss_clip": 1.01440704, + "balance_loss_mlp": 1.03936791, + "epoch": 0.523733653990681, + "flos": 23583112001280.0, + "grad_norm": 1.6699101384293633, + "language_loss": 0.69427162, + "learning_rate": 1.943340906834908e-06, + "loss": 0.71561891, + "num_input_tokens_seen": 187128830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8711, + "time_per_iteration": 2.476573944091797 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01879799, + "balance_loss_mlp": 1.03732038, + "epoch": 0.5237937772433489, + "flos": 21106245767040.0, + "grad_norm": 1.8448951706521464, + "language_loss": 0.83195686, + "learning_rate": 1.9429516025453345e-06, + "loss": 0.85335994, + "num_input_tokens_seen": 187149570, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8712, + "time_per_iteration": 2.4485836029052734 + }, + { + "auxiliary_loss_clip": 0.01111097, + "auxiliary_loss_mlp": 0.01036681, + "balance_loss_clip": 1.02286506, + "balance_loss_mlp": 1.03859973, + "epoch": 0.5238539004960169, + "flos": 19172887200000.0, + "grad_norm": 1.7709353735200277, + "language_loss": 0.69517416, + "learning_rate": 1.9425623004190415e-06, + "loss": 0.71665198, + "num_input_tokens_seen": 187170575, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 8713, + "time_per_iteration": 2.496649980545044 + }, + { + "auxiliary_loss_clip": 0.01112233, + "auxiliary_loss_mlp": 0.01033578, + "balance_loss_clip": 1.01934421, + "balance_loss_mlp": 1.03752589, + "epoch": 0.5239140237486848, + "flos": 17888218300800.0, + "grad_norm": 2.61615049353435, + "language_loss": 0.76978022, + "learning_rate": 1.9421730004707925e-06, + "loss": 0.79123831, + "num_input_tokens_seen": 187187190, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 8714, + "time_per_iteration": 2.42134428024292 + }, + { + "auxiliary_loss_clip": 0.01113875, + "auxiliary_loss_mlp": 0.01030729, + "balance_loss_clip": 1.01703143, + "balance_loss_mlp": 1.04200637, + "epoch": 0.5239741470013528, + "flos": 17930413802880.0, + "grad_norm": 1.883747352805191, + "language_loss": 0.75953126, + "learning_rate": 1.9417837027153483e-06, + "loss": 0.78097725, + "num_input_tokens_seen": 187204350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8715, + "time_per_iteration": 2.453313112258911 + }, + { + "auxiliary_loss_clip": 0.01106451, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01807356, + "balance_loss_mlp": 1.0377413, + "epoch": 0.5240342702540207, + "flos": 30993386584320.0, + "grad_norm": 1.4951701207047352, + "language_loss": 0.7078892, + "learning_rate": 1.9413944071674723e-06, + "loss": 0.72926366, + "num_input_tokens_seen": 187225605, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8716, + "time_per_iteration": 2.536285638809204 + }, + { + "auxiliary_loss_clip": 0.01107976, + "auxiliary_loss_mlp": 0.01035517, + "balance_loss_clip": 1.02394176, + "balance_loss_mlp": 1.03838778, + "epoch": 0.5240943935066887, + "flos": 25005066681600.0, + "grad_norm": 2.055978260271784, + "language_loss": 0.86706465, + "learning_rate": 1.941005113841926e-06, + "loss": 0.88849956, + "num_input_tokens_seen": 187241335, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 8717, + "time_per_iteration": 2.5015134811401367 + }, + { + "auxiliary_loss_clip": 0.01108796, + "auxiliary_loss_mlp": 0.01029411, + "balance_loss_clip": 1.01737654, + "balance_loss_mlp": 1.03882921, + "epoch": 0.5241545167593566, + "flos": 23659099223040.0, + "grad_norm": 1.8178940063432978, + "language_loss": 0.60516441, + "learning_rate": 1.9406158227534723e-06, + "loss": 0.6265465, + "num_input_tokens_seen": 187259925, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 8718, + "time_per_iteration": 4.028836488723755 + }, + { + "auxiliary_loss_clip": 0.01112927, + "auxiliary_loss_mlp": 0.01031382, + "balance_loss_clip": 1.01830447, + "balance_loss_mlp": 1.04012215, + "epoch": 0.5242146400120247, + "flos": 23400398494080.0, + "grad_norm": 1.7437517815053911, + "language_loss": 0.71897364, + "learning_rate": 1.940226533916872e-06, + "loss": 0.74041677, + "num_input_tokens_seen": 187279035, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 8719, + "time_per_iteration": 2.455796003341675 + }, + { + "auxiliary_loss_clip": 0.01106409, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.0163976, + "balance_loss_mlp": 1.03797865, + "epoch": 0.5242747632646926, + "flos": 17749065012480.0, + "grad_norm": 1.705660803101178, + "language_loss": 0.72716737, + "learning_rate": 1.9398372473468877e-06, + "loss": 0.74851096, + "num_input_tokens_seen": 187297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 8720, + "time_per_iteration": 2.445131301879883 + }, + { + "auxiliary_loss_clip": 0.01110289, + "auxiliary_loss_mlp": 0.01032373, + "balance_loss_clip": 1.01948094, + "balance_loss_mlp": 1.04000795, + "epoch": 0.5243348865173606, + "flos": 32597731549440.0, + "grad_norm": 1.6022030744217663, + "language_loss": 0.70251679, + "learning_rate": 1.939447963058281e-06, + "loss": 0.72394347, + "num_input_tokens_seen": 187320265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8721, + "time_per_iteration": 5.4637322425842285 + }, + { + "auxiliary_loss_clip": 0.01106478, + "auxiliary_loss_mlp": 0.01031051, + "balance_loss_clip": 1.01883805, + "balance_loss_mlp": 1.03700781, + "epoch": 0.5243950097700285, + "flos": 25484115392640.0, + "grad_norm": 1.710812698690052, + "language_loss": 0.86623824, + "learning_rate": 1.939058681065813e-06, + "loss": 0.88761353, + "num_input_tokens_seen": 187338045, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 8722, + "time_per_iteration": 2.4582130908966064 + }, + { + "auxiliary_loss_clip": 0.01107687, + "auxiliary_loss_mlp": 0.01032314, + "balance_loss_clip": 1.01850319, + "balance_loss_mlp": 1.03929901, + "epoch": 0.5244551330226965, + "flos": 15268391936640.0, + "grad_norm": 1.6752601944842513, + "language_loss": 0.79654807, + "learning_rate": 1.938669401384247e-06, + "loss": 0.8179481, + "num_input_tokens_seen": 187356040, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.68359375, + "step": 8723, + "time_per_iteration": 2.4436798095703125 + }, + { + "auxiliary_loss_clip": 0.0111223, + "auxiliary_loss_mlp": 0.01035425, + "balance_loss_clip": 1.02165055, + "balance_loss_mlp": 1.04074168, + "epoch": 0.5245152562753645, + "flos": 22237108629120.0, + "grad_norm": 2.2643940307400054, + "language_loss": 0.74980783, + "learning_rate": 1.9382801240283426e-06, + "loss": 0.77128434, + "num_input_tokens_seen": 187374185, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.71484375, + "step": 8724, + "time_per_iteration": 2.4523351192474365 + }, + { + "auxiliary_loss_clip": 0.01114812, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03920281, + "epoch": 0.5245753795280325, + "flos": 29426460612480.0, + "grad_norm": 1.7907307804166401, + "language_loss": 0.70031178, + "learning_rate": 1.9378908490128625e-06, + "loss": 0.72179961, + "num_input_tokens_seen": 187396640, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7578125, + "step": 8725, + "time_per_iteration": 2.548102617263794 + }, + { + "auxiliary_loss_clip": 0.01033499, + "auxiliary_loss_mlp": 0.00998708, + "balance_loss_clip": 0.99755734, + "balance_loss_mlp": 1.01092362, + "epoch": 0.5246355027807005, + "flos": 58834392785280.0, + "grad_norm": 0.7538969042021075, + "language_loss": 0.55637997, + "learning_rate": 1.937501576352568e-06, + "loss": 0.576702, + "num_input_tokens_seen": 187455945, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2265625, + "step": 8726, + "time_per_iteration": 3.055438995361328 + }, + { + "auxiliary_loss_clip": 0.01033831, + "auxiliary_loss_mlp": 0.00998072, + "balance_loss_clip": 0.99698144, + "balance_loss_mlp": 1.01147294, + "epoch": 0.5246956260333684, + "flos": 64526592965760.0, + "grad_norm": 0.8042859023243575, + "language_loss": 0.58400142, + "learning_rate": 1.937112306062219e-06, + "loss": 0.60432053, + "num_input_tokens_seen": 187519975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.22460938, + "step": 8727, + "time_per_iteration": 3.071913719177246 + }, + { + "auxiliary_loss_clip": 0.0111222, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.01701272, + "balance_loss_mlp": 1.03976107, + "epoch": 0.5247557492860364, + "flos": 24533631653760.0, + "grad_norm": 1.3114988788354258, + "language_loss": 0.70559728, + "learning_rate": 1.9367230381565786e-06, + "loss": 0.72702408, + "num_input_tokens_seen": 187541775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 8728, + "time_per_iteration": -0.15050816535949707 + }, + { + "auxiliary_loss_clip": 0.01108011, + "auxiliary_loss_mlp": 0.01026221, + "balance_loss_clip": 1.01421666, + "balance_loss_mlp": 1.03783965, + "epoch": 0.5248158725387043, + "flos": 18806131382400.0, + "grad_norm": 1.5256282262341387, + "language_loss": 0.6966821, + "learning_rate": 1.9363337726504062e-06, + "loss": 0.71802437, + "num_input_tokens_seen": 187560425, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 8729, + "time_per_iteration": 2.470921039581299 + }, + { + "auxiliary_loss_clip": 0.0111289, + "auxiliary_loss_mlp": 0.01031243, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.04002178, + "epoch": 0.5248759957913723, + "flos": 20955851521920.0, + "grad_norm": 1.7430499295764175, + "language_loss": 0.83498538, + "learning_rate": 1.935944509558464e-06, + "loss": 0.85642672, + "num_input_tokens_seen": 187579930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8730, + "time_per_iteration": 2.447209358215332 + }, + { + "auxiliary_loss_clip": 0.01109708, + "auxiliary_loss_mlp": 0.01033665, + "balance_loss_clip": 1.02034974, + "balance_loss_mlp": 1.03944659, + "epoch": 0.5249361190440403, + "flos": 18660980522880.0, + "grad_norm": 2.372255604306618, + "language_loss": 0.79440451, + "learning_rate": 1.9355552488955125e-06, + "loss": 0.81583822, + "num_input_tokens_seen": 187595365, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8731, + "time_per_iteration": 2.4764487743377686 + }, + { + "auxiliary_loss_clip": 0.01104468, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02373886, + "balance_loss_mlp": 1.03691411, + "epoch": 0.5249962422967083, + "flos": 24863327614080.0, + "grad_norm": 1.577877427677953, + "language_loss": 0.83057785, + "learning_rate": 1.935165990676312e-06, + "loss": 0.8519851, + "num_input_tokens_seen": 187614715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 8732, + "time_per_iteration": 2.4856929779052734 + }, + { + "auxiliary_loss_clip": 0.01106984, + "auxiliary_loss_mlp": 0.01032336, + "balance_loss_clip": 1.02020669, + "balance_loss_mlp": 1.03737712, + "epoch": 0.5250563655493762, + "flos": 15262681674240.0, + "grad_norm": 1.6308728168221684, + "language_loss": 0.77874607, + "learning_rate": 1.9347767349156237e-06, + "loss": 0.80013925, + "num_input_tokens_seen": 187630745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8733, + "time_per_iteration": 2.440887212753296 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.01038191, + "balance_loss_clip": 1.02521539, + "balance_loss_mlp": 1.04069221, + "epoch": 0.5251164888020442, + "flos": 18625177641600.0, + "grad_norm": 1.8154235824744323, + "language_loss": 0.81740808, + "learning_rate": 1.934387481628208e-06, + "loss": 0.83892411, + "num_input_tokens_seen": 187648200, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 8734, + "time_per_iteration": 2.4394965171813965 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01828647, + "balance_loss_mlp": 1.03909111, + "epoch": 0.5251766120547121, + "flos": 29710764760320.0, + "grad_norm": 1.3786944232239873, + "language_loss": 0.76792759, + "learning_rate": 1.933998230828826e-06, + "loss": 0.78930354, + "num_input_tokens_seen": 187669205, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8735, + "time_per_iteration": 2.5392351150512695 + }, + { + "auxiliary_loss_clip": 0.01110743, + "auxiliary_loss_mlp": 0.01030467, + "balance_loss_clip": 1.01861119, + "balance_loss_mlp": 1.03907919, + "epoch": 0.5252367353073801, + "flos": 23440295525760.0, + "grad_norm": 1.5767625018953106, + "language_loss": 0.80153042, + "learning_rate": 1.9336089825322376e-06, + "loss": 0.8229425, + "num_input_tokens_seen": 187690890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.71484375, + "step": 8736, + "time_per_iteration": 2.470860242843628 + }, + { + "auxiliary_loss_clip": 0.01110972, + "auxiliary_loss_mlp": 0.01032625, + "balance_loss_clip": 1.01979208, + "balance_loss_mlp": 1.04068267, + "epoch": 0.5252968585600482, + "flos": 30810708990720.0, + "grad_norm": 2.2098484474485716, + "language_loss": 0.69838667, + "learning_rate": 1.9332197367532033e-06, + "loss": 0.71982265, + "num_input_tokens_seen": 187713045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8737, + "time_per_iteration": 2.5947840213775635 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.0103151, + "balance_loss_clip": 1.01885569, + "balance_loss_mlp": 1.0369395, + "epoch": 0.5253569818127161, + "flos": 20628274464000.0, + "grad_norm": 1.4975240773091183, + "language_loss": 0.77464664, + "learning_rate": 1.9328304935064833e-06, + "loss": 0.79602897, + "num_input_tokens_seen": 187733640, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8738, + "time_per_iteration": 2.4910526275634766 + }, + { + "auxiliary_loss_clip": 0.01034294, + "auxiliary_loss_mlp": 0.01014673, + "balance_loss_clip": 1.01349294, + "balance_loss_mlp": 1.01161027, + "epoch": 0.5254171050653841, + "flos": 63428695810560.0, + "grad_norm": 0.7501251002484244, + "language_loss": 0.54472572, + "learning_rate": 1.932441252806837e-06, + "loss": 0.56521541, + "num_input_tokens_seen": 187792930, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2265625, + "step": 8739, + "time_per_iteration": 3.0936102867126465 + }, + { + "auxiliary_loss_clip": 0.01108375, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.02058792, + "balance_loss_mlp": 1.03920436, + "epoch": 0.525477228318052, + "flos": 34670782108800.0, + "grad_norm": 4.076584700627864, + "language_loss": 0.847902, + "learning_rate": 1.9320520146690263e-06, + "loss": 0.86931044, + "num_input_tokens_seen": 187812495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 8740, + "time_per_iteration": 2.5510640144348145 + }, + { + "auxiliary_loss_clip": 0.01107783, + "auxiliary_loss_mlp": 0.01034609, + "balance_loss_clip": 1.02204442, + "balance_loss_mlp": 1.0391773, + "epoch": 0.52553735157072, + "flos": 17930844766080.0, + "grad_norm": 1.9479054855450806, + "language_loss": 0.69464219, + "learning_rate": 1.9316627791078093e-06, + "loss": 0.71606612, + "num_input_tokens_seen": 187829685, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8741, + "time_per_iteration": 2.4474291801452637 + }, + { + "auxiliary_loss_clip": 0.01112521, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01657915, + "balance_loss_mlp": 1.04100168, + "epoch": 0.5255974748233879, + "flos": 9940864584960.0, + "grad_norm": 1.7696604002482594, + "language_loss": 0.6591152, + "learning_rate": 1.931273546137947e-06, + "loss": 0.68053448, + "num_input_tokens_seen": 187846495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 8742, + "time_per_iteration": 2.4151360988616943 + }, + { + "auxiliary_loss_clip": 0.01112065, + "auxiliary_loss_mlp": 0.01035718, + "balance_loss_clip": 1.02191377, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5256575980760559, + "flos": 16868427269760.0, + "grad_norm": 2.337521906395912, + "language_loss": 0.63094312, + "learning_rate": 1.9308843157741983e-06, + "loss": 0.65242094, + "num_input_tokens_seen": 187862010, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 8743, + "time_per_iteration": 2.4369444847106934 + }, + { + "auxiliary_loss_clip": 0.01033192, + "auxiliary_loss_mlp": 0.01006558, + "balance_loss_clip": 1.00549126, + "balance_loss_mlp": 1.01085198, + "epoch": 0.5257177213287239, + "flos": 62386210362240.0, + "grad_norm": 0.7737212884291378, + "language_loss": 0.54199207, + "learning_rate": 1.930495088031323e-06, + "loss": 0.56238955, + "num_input_tokens_seen": 187922730, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.22363281, + "step": 8744, + "time_per_iteration": 3.1759095191955566 + }, + { + "auxiliary_loss_clip": 0.01114357, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.01773655, + "balance_loss_mlp": 1.04095125, + "epoch": 0.5257778445813919, + "flos": 20776908942720.0, + "grad_norm": 2.20739797588364, + "language_loss": 0.75574982, + "learning_rate": 1.9301058629240814e-06, + "loss": 0.77720833, + "num_input_tokens_seen": 187940160, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 8745, + "time_per_iteration": 2.447798728942871 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01035025, + "balance_loss_clip": 1.02291942, + "balance_loss_mlp": 1.03964972, + "epoch": 0.5258379678340598, + "flos": 17018606033280.0, + "grad_norm": 1.9635902719056224, + "language_loss": 0.80408484, + "learning_rate": 1.9297166404672324e-06, + "loss": 0.82552993, + "num_input_tokens_seen": 187958625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 8746, + "time_per_iteration": 2.4415667057037354 + }, + { + "auxiliary_loss_clip": 0.01107231, + "auxiliary_loss_mlp": 0.01032675, + "balance_loss_clip": 1.02002132, + "balance_loss_mlp": 1.03842771, + "epoch": 0.5258980910867278, + "flos": 21068754946560.0, + "grad_norm": 1.8094795225841998, + "language_loss": 0.75289273, + "learning_rate": 1.9293274206755353e-06, + "loss": 0.77429175, + "num_input_tokens_seen": 187977575, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 8747, + "time_per_iteration": 2.4909451007843018 + }, + { + "auxiliary_loss_clip": 0.01103122, + "auxiliary_loss_mlp": 0.01029398, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03701103, + "epoch": 0.5259582143393957, + "flos": 18004461690240.0, + "grad_norm": 2.3964471896172554, + "language_loss": 0.82515085, + "learning_rate": 1.9289382035637505e-06, + "loss": 0.84647602, + "num_input_tokens_seen": 187996650, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 8748, + "time_per_iteration": 2.4266607761383057 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032899, + "balance_loss_clip": 1.01948202, + "balance_loss_mlp": 1.03713202, + "epoch": 0.5260183375920637, + "flos": 22783848520320.0, + "grad_norm": 1.9711847853488498, + "language_loss": 0.80562335, + "learning_rate": 1.9285489891466345e-06, + "loss": 0.82703364, + "num_input_tokens_seen": 188013510, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 8749, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01108885, + "auxiliary_loss_mlp": 0.01033706, + "balance_loss_clip": 1.02076626, + "balance_loss_mlp": 1.04021406, + "epoch": 0.5260784608447318, + "flos": 27052406081280.0, + "grad_norm": 1.712765899743528, + "language_loss": 0.72119522, + "learning_rate": 1.9281597774389487e-06, + "loss": 0.74262118, + "num_input_tokens_seen": 188032085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8750, + "time_per_iteration": 2.5028066635131836 + }, + { + "auxiliary_loss_clip": 0.01105706, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.0166955, + "balance_loss_mlp": 1.03688407, + "epoch": 0.5261385840973997, + "flos": 20662820369280.0, + "grad_norm": 1.3484208983844765, + "language_loss": 0.76440692, + "learning_rate": 1.9277705684554517e-06, + "loss": 0.78575456, + "num_input_tokens_seen": 188050590, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 8751, + "time_per_iteration": 2.49141788482666 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.0103178, + "balance_loss_clip": 1.01973987, + "balance_loss_mlp": 1.03969383, + "epoch": 0.5261987073500677, + "flos": 23622649896960.0, + "grad_norm": 1.3930828226372818, + "language_loss": 0.75950229, + "learning_rate": 1.927381362210902e-06, + "loss": 0.78088653, + "num_input_tokens_seen": 188071620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 8752, + "time_per_iteration": 2.4891488552093506 + }, + { + "auxiliary_loss_clip": 0.01110452, + "auxiliary_loss_mlp": 0.0102755, + "balance_loss_clip": 1.01418078, + "balance_loss_mlp": 1.03927755, + "epoch": 0.5262588306027356, + "flos": 27636241743360.0, + "grad_norm": 1.4497375157025647, + "language_loss": 0.6776315, + "learning_rate": 1.926992158720058e-06, + "loss": 0.69901145, + "num_input_tokens_seen": 188091740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8753, + "time_per_iteration": 2.5364086627960205 + }, + { + "auxiliary_loss_clip": 0.01108994, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.02072024, + "balance_loss_mlp": 1.04052699, + "epoch": 0.5263189538554036, + "flos": 21759711943680.0, + "grad_norm": 1.4822261150811287, + "language_loss": 0.83834231, + "learning_rate": 1.9266029579976785e-06, + "loss": 0.85975981, + "num_input_tokens_seen": 188111165, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 8754, + "time_per_iteration": 2.4782354831695557 + }, + { + "auxiliary_loss_clip": 0.01108303, + "auxiliary_loss_mlp": 0.01031002, + "balance_loss_clip": 1.01821733, + "balance_loss_mlp": 1.03804278, + "epoch": 0.5263790771080715, + "flos": 14276359140480.0, + "grad_norm": 2.116384687985529, + "language_loss": 0.8708753, + "learning_rate": 1.926213760058522e-06, + "loss": 0.8922683, + "num_input_tokens_seen": 188127825, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8755, + "time_per_iteration": 2.426422357559204 + }, + { + "auxiliary_loss_clip": 0.01031717, + "auxiliary_loss_mlp": 0.01000414, + "balance_loss_clip": 0.99934119, + "balance_loss_mlp": 1.0092082, + "epoch": 0.5264392003607395, + "flos": 65806413528960.0, + "grad_norm": 0.7185760813251492, + "language_loss": 0.58853483, + "learning_rate": 1.9258245649173477e-06, + "loss": 0.60885608, + "num_input_tokens_seen": 188194050, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.22460938, + "step": 8756, + "time_per_iteration": 3.1429710388183594 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02052546, + "balance_loss_mlp": 1.03787899, + "epoch": 0.5264993236134075, + "flos": 21032413361280.0, + "grad_norm": 4.297833550953773, + "language_loss": 0.70166421, + "learning_rate": 1.925435372588913e-06, + "loss": 0.72309285, + "num_input_tokens_seen": 188212565, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8757, + "time_per_iteration": 2.4352152347564697 + }, + { + "auxiliary_loss_clip": 0.01108207, + "auxiliary_loss_mlp": 0.01030645, + "balance_loss_clip": 1.01828289, + "balance_loss_mlp": 1.03741014, + "epoch": 0.5265594468660755, + "flos": 16618202150400.0, + "grad_norm": 1.637312529409449, + "language_loss": 0.8773526, + "learning_rate": 1.9250461830879768e-06, + "loss": 0.89874113, + "num_input_tokens_seen": 188229505, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 8758, + "time_per_iteration": 2.4447832107543945 + }, + { + "auxiliary_loss_clip": 0.0110992, + "auxiliary_loss_mlp": 0.01033594, + "balance_loss_clip": 1.02048147, + "balance_loss_mlp": 1.03790975, + "epoch": 0.5266195701187434, + "flos": 24134125610880.0, + "grad_norm": 1.3883962898678874, + "language_loss": 0.76014191, + "learning_rate": 1.9246569964292965e-06, + "loss": 0.78157705, + "num_input_tokens_seen": 188250395, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8759, + "time_per_iteration": 2.4818501472473145 + }, + { + "auxiliary_loss_clip": 0.01102801, + "auxiliary_loss_mlp": 0.01026631, + "balance_loss_clip": 1.01460838, + "balance_loss_mlp": 1.0357269, + "epoch": 0.5266796933714114, + "flos": 15844111125120.0, + "grad_norm": 1.9978294175433113, + "language_loss": 0.71896535, + "learning_rate": 1.9242678126276307e-06, + "loss": 0.74025965, + "num_input_tokens_seen": 188266785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8760, + "time_per_iteration": 3.8544509410858154 + }, + { + "auxiliary_loss_clip": 0.01113013, + "auxiliary_loss_mlp": 0.01034813, + "balance_loss_clip": 1.02161074, + "balance_loss_mlp": 1.03947306, + "epoch": 0.5267398166240793, + "flos": 20951434149120.0, + "grad_norm": 1.9164441807727424, + "language_loss": 0.76221085, + "learning_rate": 1.923878631697736e-06, + "loss": 0.78368914, + "num_input_tokens_seen": 188282525, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 8761, + "time_per_iteration": 2.43031907081604 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.01879597, + "balance_loss_mlp": 1.03958154, + "epoch": 0.5267999398767473, + "flos": 20996394998400.0, + "grad_norm": 1.712095639698782, + "language_loss": 0.70643085, + "learning_rate": 1.923489453654373e-06, + "loss": 0.7278201, + "num_input_tokens_seen": 188301395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8762, + "time_per_iteration": 5.321688652038574 + }, + { + "auxiliary_loss_clip": 0.01031212, + "auxiliary_loss_mlp": 0.0100382, + "balance_loss_clip": 1.00266957, + "balance_loss_mlp": 1.00896931, + "epoch": 0.5268600631294152, + "flos": 66849401767680.0, + "grad_norm": 0.9468786857883086, + "language_loss": 0.65414345, + "learning_rate": 1.9231002785122963e-06, + "loss": 0.67449379, + "num_input_tokens_seen": 188357665, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22265625, + "step": 8763, + "time_per_iteration": 4.360533237457275 + }, + { + "auxiliary_loss_clip": 0.0110798, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.01927209, + "balance_loss_mlp": 1.03798556, + "epoch": 0.5269201863820833, + "flos": 17165552572800.0, + "grad_norm": 1.6073395480000416, + "language_loss": 0.70771408, + "learning_rate": 1.922711106286265e-06, + "loss": 0.72911114, + "num_input_tokens_seen": 188376935, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 8764, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.0110759, + "auxiliary_loss_mlp": 0.01029413, + "balance_loss_clip": 1.01640153, + "balance_loss_mlp": 1.03704798, + "epoch": 0.5269803096347513, + "flos": 20522589672960.0, + "grad_norm": 1.6766716538329436, + "language_loss": 0.74135405, + "learning_rate": 1.9223219369910368e-06, + "loss": 0.76272404, + "num_input_tokens_seen": 188394995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 8765, + "time_per_iteration": 2.4344265460968018 + }, + { + "auxiliary_loss_clip": 0.0110988, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.0194571, + "balance_loss_mlp": 1.03650451, + "epoch": 0.5270404328874192, + "flos": 27230989524480.0, + "grad_norm": 1.4935943977467754, + "language_loss": 0.85193348, + "learning_rate": 1.9219327706413677e-06, + "loss": 0.87336564, + "num_input_tokens_seen": 188415475, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 8766, + "time_per_iteration": 2.52951979637146 + }, + { + "auxiliary_loss_clip": 0.0111099, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0204674, + "balance_loss_mlp": 1.03980124, + "epoch": 0.5271005561400872, + "flos": 23110491824640.0, + "grad_norm": 1.7377061989269131, + "language_loss": 0.79036993, + "learning_rate": 1.921543607252017e-06, + "loss": 0.8118161, + "num_input_tokens_seen": 188435665, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 8767, + "time_per_iteration": 2.4478976726531982 + }, + { + "auxiliary_loss_clip": 0.0110965, + "auxiliary_loss_mlp": 0.01032349, + "balance_loss_clip": 1.01897943, + "balance_loss_mlp": 1.03842282, + "epoch": 0.5271606793927551, + "flos": 22564793427840.0, + "grad_norm": 1.871676480421452, + "language_loss": 0.73691523, + "learning_rate": 1.9211544468377394e-06, + "loss": 0.75833523, + "num_input_tokens_seen": 188455405, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 8768, + "time_per_iteration": 2.464952230453491 + }, + { + "auxiliary_loss_clip": 0.01106727, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.02222896, + "balance_loss_mlp": 1.03777611, + "epoch": 0.5272208026454231, + "flos": 18764259102720.0, + "grad_norm": 3.4895191769574354, + "language_loss": 0.74093413, + "learning_rate": 1.9207652894132933e-06, + "loss": 0.76233703, + "num_input_tokens_seen": 188472940, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.69140625, + "step": 8769, + "time_per_iteration": 2.4464261531829834 + }, + { + "auxiliary_loss_clip": 0.01108124, + "auxiliary_loss_mlp": 0.01036366, + "balance_loss_clip": 1.02372384, + "balance_loss_mlp": 1.03890908, + "epoch": 0.5272809258980911, + "flos": 20412164286720.0, + "grad_norm": 1.6831893733690892, + "language_loss": 0.7382611, + "learning_rate": 1.920376134993436e-06, + "loss": 0.75970602, + "num_input_tokens_seen": 188493035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8770, + "time_per_iteration": 2.4870028495788574 + }, + { + "auxiliary_loss_clip": 0.01110065, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01798415, + "balance_loss_mlp": 1.03966439, + "epoch": 0.5273410491507591, + "flos": 28256742213120.0, + "grad_norm": 1.642757388746556, + "language_loss": 0.68108106, + "learning_rate": 1.9199869835929224e-06, + "loss": 0.70248735, + "num_input_tokens_seen": 188513860, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8771, + "time_per_iteration": 2.5180561542510986 + }, + { + "auxiliary_loss_clip": 0.01106371, + "auxiliary_loss_mlp": 0.0103421, + "balance_loss_clip": 1.02130556, + "balance_loss_mlp": 1.03755426, + "epoch": 0.527401172403427, + "flos": 22455158140800.0, + "grad_norm": 1.8518077177131755, + "language_loss": 0.76476532, + "learning_rate": 1.9195978352265115e-06, + "loss": 0.78617108, + "num_input_tokens_seen": 188533345, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 8772, + "time_per_iteration": 2.491196870803833 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01040318, + "balance_loss_clip": 1.0271337, + "balance_loss_mlp": 1.03862512, + "epoch": 0.527461295656095, + "flos": 21031084558080.0, + "grad_norm": 1.8756798124264933, + "language_loss": 0.65986812, + "learning_rate": 1.9192086899089585e-06, + "loss": 0.68137372, + "num_input_tokens_seen": 188551550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8773, + "time_per_iteration": 2.464393138885498 + }, + { + "auxiliary_loss_clip": 0.01109322, + "auxiliary_loss_mlp": 0.01039476, + "balance_loss_clip": 1.02802014, + "balance_loss_mlp": 1.03791332, + "epoch": 0.5275214189087629, + "flos": 26322018929280.0, + "grad_norm": 1.5758079694219151, + "language_loss": 0.86029238, + "learning_rate": 1.91881954765502e-06, + "loss": 0.88178039, + "num_input_tokens_seen": 188571615, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.71484375, + "step": 8774, + "time_per_iteration": 2.5185675621032715 + }, + { + "auxiliary_loss_clip": 0.01105827, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01860261, + "balance_loss_mlp": 1.03663182, + "epoch": 0.5275815421614309, + "flos": 20047024581120.0, + "grad_norm": 1.5254562165137588, + "language_loss": 0.79877412, + "learning_rate": 1.9184304084794523e-06, + "loss": 0.82013589, + "num_input_tokens_seen": 188591965, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 8775, + "time_per_iteration": 2.454387664794922 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01037755, + "balance_loss_clip": 1.02507758, + "balance_loss_mlp": 1.03681672, + "epoch": 0.5276416654140988, + "flos": 21432206712960.0, + "grad_norm": 1.7390352493983339, + "language_loss": 0.83807105, + "learning_rate": 1.918041272397012e-06, + "loss": 0.85949761, + "num_input_tokens_seen": 188610675, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 8776, + "time_per_iteration": 2.5026144981384277 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01028347, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03759074, + "epoch": 0.5277017886667669, + "flos": 17165085696000.0, + "grad_norm": 1.6658876230443522, + "language_loss": 0.68375832, + "learning_rate": 1.9176521394224547e-06, + "loss": 0.70512283, + "num_input_tokens_seen": 188628235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8777, + "time_per_iteration": 2.417186975479126 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01037656, + "balance_loss_clip": 1.0258069, + "balance_loss_mlp": 1.04009652, + "epoch": 0.5277619119194349, + "flos": 20448146736000.0, + "grad_norm": 2.132165937202497, + "language_loss": 0.82494706, + "learning_rate": 1.9172630095705358e-06, + "loss": 0.84640491, + "num_input_tokens_seen": 188648925, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 8778, + "time_per_iteration": 2.487772226333618 + }, + { + "auxiliary_loss_clip": 0.01110776, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02037513, + "balance_loss_mlp": 1.04014647, + "epoch": 0.5278220351721028, + "flos": 24061083304320.0, + "grad_norm": 2.126071455139116, + "language_loss": 0.79359961, + "learning_rate": 1.916873882856013e-06, + "loss": 0.8150422, + "num_input_tokens_seen": 188668125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 8779, + "time_per_iteration": 2.4676833152770996 + }, + { + "auxiliary_loss_clip": 0.01102313, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01942825, + "balance_loss_mlp": 1.03535295, + "epoch": 0.5278821584247708, + "flos": 24642907804800.0, + "grad_norm": 2.916693496001438, + "language_loss": 0.7667526, + "learning_rate": 1.9164847592936406e-06, + "loss": 0.78808951, + "num_input_tokens_seen": 188684410, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 8780, + "time_per_iteration": 2.489880323410034 + }, + { + "auxiliary_loss_clip": 0.01113237, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01724982, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5279422816774387, + "flos": 35408244240000.0, + "grad_norm": 1.5814481661794648, + "language_loss": 0.69506466, + "learning_rate": 1.916095638898174e-06, + "loss": 0.71650016, + "num_input_tokens_seen": 188706130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 8781, + "time_per_iteration": 2.570308208465576 + }, + { + "auxiliary_loss_clip": 0.01105161, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02130246, + "balance_loss_mlp": 1.03748012, + "epoch": 0.5280024049301068, + "flos": 22967028904320.0, + "grad_norm": 1.5392288400315197, + "language_loss": 0.72434068, + "learning_rate": 1.9157065216843696e-06, + "loss": 0.74571753, + "num_input_tokens_seen": 188725030, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 8782, + "time_per_iteration": 2.4902799129486084 + }, + { + "auxiliary_loss_clip": 0.01104346, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01557565, + "balance_loss_mlp": 1.03629112, + "epoch": 0.5280625281827747, + "flos": 21507619317120.0, + "grad_norm": 1.9147695733655095, + "language_loss": 0.68684381, + "learning_rate": 1.915317407666982e-06, + "loss": 0.70816237, + "num_input_tokens_seen": 188744325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 8783, + "time_per_iteration": 2.4489378929138184 + }, + { + "auxiliary_loss_clip": 0.01115533, + "auxiliary_loss_mlp": 0.01037336, + "balance_loss_clip": 1.02257824, + "balance_loss_mlp": 1.04052663, + "epoch": 0.5281226514354427, + "flos": 31208167958400.0, + "grad_norm": 1.8253305439767769, + "language_loss": 0.69502926, + "learning_rate": 1.9149282968607674e-06, + "loss": 0.71655798, + "num_input_tokens_seen": 188765100, + "router_z_loss_clip": 0.14746094, + "router_z_loss_mlp": 0.75, + "step": 8784, + "time_per_iteration": 2.55877947807312 + }, + { + "auxiliary_loss_clip": 0.0111041, + "auxiliary_loss_mlp": 0.01032831, + "balance_loss_clip": 1.01936626, + "balance_loss_mlp": 1.03718495, + "epoch": 0.5281827746881106, + "flos": 25077821679360.0, + "grad_norm": 2.137542562274274, + "language_loss": 0.75317723, + "learning_rate": 1.91453918928048e-06, + "loss": 0.77460963, + "num_input_tokens_seen": 188783995, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 8785, + "time_per_iteration": 2.5042202472686768 + }, + { + "auxiliary_loss_clip": 0.01109301, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01858115, + "balance_loss_mlp": 1.03923512, + "epoch": 0.5282428979407786, + "flos": 20631255292800.0, + "grad_norm": 1.5356836172740989, + "language_loss": 0.8301636, + "learning_rate": 1.9141500849408745e-06, + "loss": 0.85157377, + "num_input_tokens_seen": 188803120, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 8786, + "time_per_iteration": 2.530207872390747 + }, + { + "auxiliary_loss_clip": 0.01102608, + "auxiliary_loss_mlp": 0.01023798, + "balance_loss_clip": 1.01268828, + "balance_loss_mlp": 1.03662145, + "epoch": 0.5283030211934465, + "flos": 22419391173120.0, + "grad_norm": 6.419117505425037, + "language_loss": 0.8292653, + "learning_rate": 1.9137609838567076e-06, + "loss": 0.85052931, + "num_input_tokens_seen": 188820960, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 8787, + "time_per_iteration": 2.450303792953491 + }, + { + "auxiliary_loss_clip": 0.0110476, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.01601219, + "balance_loss_mlp": 1.03739762, + "epoch": 0.5283631444461145, + "flos": 23615467176960.0, + "grad_norm": 1.657610649379585, + "language_loss": 0.83385652, + "learning_rate": 1.9133718860427316e-06, + "loss": 0.85517776, + "num_input_tokens_seen": 188837165, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8788, + "time_per_iteration": 2.4752538204193115 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01036961, + "balance_loss_clip": 1.02353776, + "balance_loss_mlp": 1.04022217, + "epoch": 0.5284232676987825, + "flos": 32671994918400.0, + "grad_norm": 1.6616469699693164, + "language_loss": 0.7467941, + "learning_rate": 1.9129827915137027e-06, + "loss": 0.76823682, + "num_input_tokens_seen": 188858555, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.671875, + "step": 8789, + "time_per_iteration": 2.5324580669403076 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.01999021, + "balance_loss_mlp": 1.03898668, + "epoch": 0.5284833909514505, + "flos": 26760919213440.0, + "grad_norm": 1.4692396487834778, + "language_loss": 0.69505095, + "learning_rate": 1.9125937002843754e-06, + "loss": 0.71647108, + "num_input_tokens_seen": 188879050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8790, + "time_per_iteration": 2.51625919342041 + }, + { + "auxiliary_loss_clip": 0.01104373, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01602292, + "balance_loss_mlp": 1.03740895, + "epoch": 0.5285435142041185, + "flos": 22090700793600.0, + "grad_norm": 1.5973748463846205, + "language_loss": 0.78992987, + "learning_rate": 1.9122046123695036e-06, + "loss": 0.81125033, + "num_input_tokens_seen": 188898885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 8791, + "time_per_iteration": 2.4552273750305176 + }, + { + "auxiliary_loss_clip": 0.01108186, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01800871, + "balance_loss_mlp": 1.04050541, + "epoch": 0.5286036374567864, + "flos": 20375463565440.0, + "grad_norm": 1.8738977568036352, + "language_loss": 0.66256213, + "learning_rate": 1.9118155277838423e-06, + "loss": 0.68394351, + "num_input_tokens_seen": 188917225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 8792, + "time_per_iteration": 2.485501527786255 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01035011, + "balance_loss_clip": 1.02362621, + "balance_loss_mlp": 1.03610563, + "epoch": 0.5286637607094544, + "flos": 24352175122560.0, + "grad_norm": 2.0158719758485226, + "language_loss": 0.79919344, + "learning_rate": 1.9114264465421443e-06, + "loss": 0.82057893, + "num_input_tokens_seen": 188936120, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8793, + "time_per_iteration": 2.4918789863586426 + }, + { + "auxiliary_loss_clip": 0.01108596, + "auxiliary_loss_mlp": 0.01036682, + "balance_loss_clip": 1.02393866, + "balance_loss_mlp": 1.03883982, + "epoch": 0.5287238839621223, + "flos": 17271165536640.0, + "grad_norm": 1.8030848585204593, + "language_loss": 0.84791529, + "learning_rate": 1.9110373686591645e-06, + "loss": 0.86936802, + "num_input_tokens_seen": 188953405, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8794, + "time_per_iteration": 2.451828718185425 + }, + { + "auxiliary_loss_clip": 0.01110927, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.01894772, + "balance_loss_mlp": 1.03798628, + "epoch": 0.5287840072147904, + "flos": 17566890209280.0, + "grad_norm": 1.927550813134725, + "language_loss": 0.67570889, + "learning_rate": 1.9106482941496564e-06, + "loss": 0.69714004, + "num_input_tokens_seen": 188971150, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 8795, + "time_per_iteration": 2.4460599422454834 + }, + { + "auxiliary_loss_clip": 0.01107843, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03754616, + "epoch": 0.5288441304674583, + "flos": 18552099421440.0, + "grad_norm": 1.883468232968509, + "language_loss": 0.80662012, + "learning_rate": 1.910259223028374e-06, + "loss": 0.82799256, + "num_input_tokens_seen": 188989550, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 8796, + "time_per_iteration": 2.4592626094818115 + }, + { + "auxiliary_loss_clip": 0.01112299, + "auxiliary_loss_mlp": 0.0103268, + "balance_loss_clip": 1.01978111, + "balance_loss_mlp": 1.04186153, + "epoch": 0.5289042537201263, + "flos": 20814507504000.0, + "grad_norm": 1.9732503530858911, + "language_loss": 0.69071984, + "learning_rate": 1.909870155310071e-06, + "loss": 0.71216959, + "num_input_tokens_seen": 189008795, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8797, + "time_per_iteration": 2.4451231956481934 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032584, + "balance_loss_clip": 1.02128911, + "balance_loss_mlp": 1.03739119, + "epoch": 0.5289643769727942, + "flos": 15735265937280.0, + "grad_norm": 1.7017381786261847, + "language_loss": 0.82339096, + "learning_rate": 1.9094810910095005e-06, + "loss": 0.84474969, + "num_input_tokens_seen": 189025540, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 8798, + "time_per_iteration": 2.4694111347198486 + }, + { + "auxiliary_loss_clip": 0.01111092, + "auxiliary_loss_mlp": 0.0103436, + "balance_loss_clip": 1.02102065, + "balance_loss_mlp": 1.03840899, + "epoch": 0.5290245002254622, + "flos": 19537308633600.0, + "grad_norm": 2.0619187329461575, + "language_loss": 0.70591879, + "learning_rate": 1.9090920301414166e-06, + "loss": 0.72737336, + "num_input_tokens_seen": 189044885, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 8799, + "time_per_iteration": 2.456692695617676 + }, + { + "auxiliary_loss_clip": 0.01104599, + "auxiliary_loss_mlp": 0.01032577, + "balance_loss_clip": 1.02132988, + "balance_loss_mlp": 1.03975451, + "epoch": 0.5290846234781301, + "flos": 15815131827840.0, + "grad_norm": 1.8240531153484045, + "language_loss": 0.69601536, + "learning_rate": 1.9087029727205716e-06, + "loss": 0.71738708, + "num_input_tokens_seen": 189061280, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 8800, + "time_per_iteration": 2.490417242050171 + }, + { + "auxiliary_loss_clip": 0.01036269, + "auxiliary_loss_mlp": 0.01012691, + "balance_loss_clip": 1.01148117, + "balance_loss_mlp": 1.01404071, + "epoch": 0.5291447467307981, + "flos": 70057624821120.0, + "grad_norm": 0.998441198923784, + "language_loss": 0.57013941, + "learning_rate": 1.9083139187617193e-06, + "loss": 0.59062898, + "num_input_tokens_seen": 189114775, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.22265625, + "step": 8801, + "time_per_iteration": 4.385375022888184 + }, + { + "auxiliary_loss_clip": 0.01109021, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.02168214, + "balance_loss_mlp": 1.03874719, + "epoch": 0.529204869983466, + "flos": 28364186770560.0, + "grad_norm": 1.5128121202389628, + "language_loss": 0.63942313, + "learning_rate": 1.9079248682796123e-06, + "loss": 0.66085106, + "num_input_tokens_seen": 189134700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 8802, + "time_per_iteration": 2.5486578941345215 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01684964, + "balance_loss_mlp": 1.03677487, + "epoch": 0.5292649932361341, + "flos": 33758830684800.0, + "grad_norm": 1.7172902320691381, + "language_loss": 0.68250531, + "learning_rate": 1.907535821289003e-06, + "loss": 0.70384604, + "num_input_tokens_seen": 189155365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8803, + "time_per_iteration": 2.576460361480713 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02233613, + "balance_loss_mlp": 1.03654003, + "epoch": 0.5293251164888021, + "flos": 20447679859200.0, + "grad_norm": 1.6769492859989101, + "language_loss": 0.76551962, + "learning_rate": 1.9071467778046458e-06, + "loss": 0.78689635, + "num_input_tokens_seen": 189173885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 8804, + "time_per_iteration": 4.018502473831177 + }, + { + "auxiliary_loss_clip": 0.0103564, + "auxiliary_loss_mlp": 0.01005394, + "balance_loss_clip": 1.00417256, + "balance_loss_mlp": 1.01327515, + "epoch": 0.52938523974147, + "flos": 66545312204160.0, + "grad_norm": 0.749734320345171, + "language_loss": 0.53018034, + "learning_rate": 1.906757737841291e-06, + "loss": 0.55059063, + "num_input_tokens_seen": 189236515, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.22363281, + "step": 8805, + "time_per_iteration": 4.599541902542114 + }, + { + "auxiliary_loss_clip": 0.01034831, + "auxiliary_loss_mlp": 0.01001215, + "balance_loss_clip": 0.99995738, + "balance_loss_mlp": 1.0124402, + "epoch": 0.529445362994138, + "flos": 67151734542720.0, + "grad_norm": 0.7381494507925852, + "language_loss": 0.63778675, + "learning_rate": 1.906368701413693e-06, + "loss": 0.65814722, + "num_input_tokens_seen": 189300500, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.22460938, + "step": 8806, + "time_per_iteration": 3.067852735519409 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01034235, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03770947, + "epoch": 0.5295054862468059, + "flos": 17749316407680.0, + "grad_norm": 1.9894097123133165, + "language_loss": 0.72397399, + "learning_rate": 1.9059796685366026e-06, + "loss": 0.74542046, + "num_input_tokens_seen": 189319745, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7265625, + "step": 8807, + "time_per_iteration": 2.4303808212280273 + }, + { + "auxiliary_loss_clip": 0.0110442, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01775241, + "balance_loss_mlp": 1.03735805, + "epoch": 0.529565609499474, + "flos": 11397401084160.0, + "grad_norm": 4.619049711580288, + "language_loss": 0.69640231, + "learning_rate": 1.9055906392247723e-06, + "loss": 0.71773779, + "num_input_tokens_seen": 189334550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 8808, + "time_per_iteration": 2.418649435043335 + }, + { + "auxiliary_loss_clip": 0.01105928, + "auxiliary_loss_mlp": 0.01029731, + "balance_loss_clip": 1.01817942, + "balance_loss_mlp": 1.03796387, + "epoch": 0.5296257327521419, + "flos": 17196363463680.0, + "grad_norm": 1.7756221154666856, + "language_loss": 0.8668943, + "learning_rate": 1.9052016134929554e-06, + "loss": 0.88825089, + "num_input_tokens_seen": 189351735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8809, + "time_per_iteration": 2.413883686065674 + }, + { + "auxiliary_loss_clip": 0.01112398, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.0209322, + "balance_loss_mlp": 1.03908372, + "epoch": 0.5296858560048099, + "flos": 39964086777600.0, + "grad_norm": 1.608353260814621, + "language_loss": 0.64362073, + "learning_rate": 1.9048125913559016e-06, + "loss": 0.66509026, + "num_input_tokens_seen": 189373105, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 8810, + "time_per_iteration": 2.6121585369110107 + }, + { + "auxiliary_loss_clip": 0.0110573, + "auxiliary_loss_mlp": 0.01035568, + "balance_loss_clip": 1.02372456, + "balance_loss_mlp": 1.03820479, + "epoch": 0.5297459792574778, + "flos": 20961418129920.0, + "grad_norm": 1.5055977388002117, + "language_loss": 0.68083066, + "learning_rate": 1.9044235728283646e-06, + "loss": 0.70224369, + "num_input_tokens_seen": 189394615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 8811, + "time_per_iteration": 2.4806406497955322 + }, + { + "auxiliary_loss_clip": 0.010328, + "auxiliary_loss_mlp": 0.00998698, + "balance_loss_clip": 0.99739295, + "balance_loss_mlp": 1.01059103, + "epoch": 0.5298061025101458, + "flos": 66523620389760.0, + "grad_norm": 0.6652461754552681, + "language_loss": 0.53400505, + "learning_rate": 1.9040345579250953e-06, + "loss": 0.5543201, + "num_input_tokens_seen": 189459750, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22265625, + "step": 8812, + "time_per_iteration": 3.175478458404541 + }, + { + "auxiliary_loss_clip": 0.01032825, + "auxiliary_loss_mlp": 0.01000267, + "balance_loss_clip": 0.99906272, + "balance_loss_mlp": 1.01074851, + "epoch": 0.5298662257628137, + "flos": 67662994775040.0, + "grad_norm": 0.7207460213448722, + "language_loss": 0.56372511, + "learning_rate": 1.9036455466608453e-06, + "loss": 0.58405602, + "num_input_tokens_seen": 189527540, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.22070312, + "step": 8813, + "time_per_iteration": 3.1315269470214844 + }, + { + "auxiliary_loss_clip": 0.01102589, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.01751852, + "balance_loss_mlp": 1.03824615, + "epoch": 0.5299263490154817, + "flos": 19646405216640.0, + "grad_norm": 1.5478508872520975, + "language_loss": 0.81618506, + "learning_rate": 1.9032565390503657e-06, + "loss": 0.8375001, + "num_input_tokens_seen": 189546900, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 8814, + "time_per_iteration": 2.431269884109497 + }, + { + "auxiliary_loss_clip": 0.01113436, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01687407, + "balance_loss_mlp": 1.04241931, + "epoch": 0.5299864722681497, + "flos": 22055005653120.0, + "grad_norm": 1.5843849623618003, + "language_loss": 0.84997016, + "learning_rate": 1.9028675351084076e-06, + "loss": 0.8713944, + "num_input_tokens_seen": 189566490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 8815, + "time_per_iteration": 2.531074285507202 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01968288, + "balance_loss_mlp": 1.03940964, + "epoch": 0.5300465955208177, + "flos": 21763698353280.0, + "grad_norm": 2.126267576495584, + "language_loss": 0.66768968, + "learning_rate": 1.9024785348497225e-06, + "loss": 0.68905437, + "num_input_tokens_seen": 189585580, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8816, + "time_per_iteration": 2.525468111038208 + }, + { + "auxiliary_loss_clip": 0.01107527, + "auxiliary_loss_mlp": 0.01033036, + "balance_loss_clip": 1.0210259, + "balance_loss_mlp": 1.03860188, + "epoch": 0.5301067187734857, + "flos": 42996491735040.0, + "grad_norm": 1.7854125043951103, + "language_loss": 0.72206688, + "learning_rate": 1.9020895382890611e-06, + "loss": 0.74347246, + "num_input_tokens_seen": 189608485, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8817, + "time_per_iteration": 2.6937406063079834 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01031476, + "balance_loss_clip": 1.01921499, + "balance_loss_mlp": 1.03620088, + "epoch": 0.5301668420261536, + "flos": 20554298403840.0, + "grad_norm": 1.6863401200151742, + "language_loss": 0.6522249, + "learning_rate": 1.9017005454411743e-06, + "loss": 0.67360961, + "num_input_tokens_seen": 189627815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 8818, + "time_per_iteration": 2.509539842605591 + }, + { + "auxiliary_loss_clip": 0.0110849, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.01462412, + "balance_loss_mlp": 1.0393914, + "epoch": 0.5302269652788216, + "flos": 17486665182720.0, + "grad_norm": 1.999877555758676, + "language_loss": 0.75154972, + "learning_rate": 1.9013115563208126e-06, + "loss": 0.77290833, + "num_input_tokens_seen": 189644850, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8819, + "time_per_iteration": 2.473130702972412 + }, + { + "auxiliary_loss_clip": 0.01110233, + "auxiliary_loss_mlp": 0.0103388, + "balance_loss_clip": 1.0214107, + "balance_loss_mlp": 1.03858495, + "epoch": 0.5302870885314895, + "flos": 14574202715520.0, + "grad_norm": 2.27674417450437, + "language_loss": 0.82333302, + "learning_rate": 1.9009225709427267e-06, + "loss": 0.84477413, + "num_input_tokens_seen": 189660945, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 8820, + "time_per_iteration": 2.4328434467315674 + }, + { + "auxiliary_loss_clip": 0.01106236, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.02030122, + "balance_loss_mlp": 1.03725612, + "epoch": 0.5303472117841576, + "flos": 23438032968960.0, + "grad_norm": 2.049749716635941, + "language_loss": 0.72593045, + "learning_rate": 1.9005335893216667e-06, + "loss": 0.74730772, + "num_input_tokens_seen": 189680425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 8821, + "time_per_iteration": 2.508608102798462 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01028883, + "balance_loss_clip": 1.01779675, + "balance_loss_mlp": 1.0363605, + "epoch": 0.5304073350368255, + "flos": 22709010533760.0, + "grad_norm": 1.3923419148404492, + "language_loss": 0.73939008, + "learning_rate": 1.9001446114723824e-06, + "loss": 0.76070547, + "num_input_tokens_seen": 189700375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 8822, + "time_per_iteration": 2.4427592754364014 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01035958, + "balance_loss_clip": 1.02312553, + "balance_loss_mlp": 1.03773904, + "epoch": 0.5304674582894935, + "flos": 27928554624000.0, + "grad_norm": 1.6902308577802683, + "language_loss": 0.67477053, + "learning_rate": 1.8997556374096257e-06, + "loss": 0.69620097, + "num_input_tokens_seen": 189721225, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8823, + "time_per_iteration": 2.5047175884246826 + }, + { + "auxiliary_loss_clip": 0.0110955, + "auxiliary_loss_mlp": 0.0103452, + "balance_loss_clip": 1.02113247, + "balance_loss_mlp": 1.03756142, + "epoch": 0.5305275815421614, + "flos": 21250642440960.0, + "grad_norm": 1.5189625554392572, + "language_loss": 0.69347805, + "learning_rate": 1.8993666671481444e-06, + "loss": 0.71491873, + "num_input_tokens_seen": 189740170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 8824, + "time_per_iteration": 2.4358925819396973 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01030719, + "balance_loss_clip": 1.01879227, + "balance_loss_mlp": 1.03755724, + "epoch": 0.5305877047948294, + "flos": 17603088140160.0, + "grad_norm": 2.2315847136946956, + "language_loss": 0.75412273, + "learning_rate": 1.898977700702689e-06, + "loss": 0.77547044, + "num_input_tokens_seen": 189757890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 8825, + "time_per_iteration": 2.480656385421753 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01036746, + "balance_loss_clip": 1.02433622, + "balance_loss_mlp": 1.03730893, + "epoch": 0.5306478280474973, + "flos": 15195493284480.0, + "grad_norm": 2.0577399670241125, + "language_loss": 0.85668242, + "learning_rate": 1.8985887380880103e-06, + "loss": 0.87810326, + "num_input_tokens_seen": 189775390, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 8826, + "time_per_iteration": 2.422227621078491 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01760268, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5307079513001653, + "flos": 15341218761600.0, + "grad_norm": 1.3501660325975628, + "language_loss": 0.64042354, + "learning_rate": 1.8981997793188558e-06, + "loss": 0.66176176, + "num_input_tokens_seen": 189793975, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 8827, + "time_per_iteration": 2.461434841156006 + }, + { + "auxiliary_loss_clip": 0.0110958, + "auxiliary_loss_mlp": 0.0103759, + "balance_loss_clip": 1.02452421, + "balance_loss_mlp": 1.03835428, + "epoch": 0.5307680745528333, + "flos": 43544452688640.0, + "grad_norm": 1.5699076783392119, + "language_loss": 0.60028976, + "learning_rate": 1.8978108244099762e-06, + "loss": 0.62176144, + "num_input_tokens_seen": 189817870, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 8828, + "time_per_iteration": 2.621673107147217 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01030803, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03909802, + "epoch": 0.5308281978055013, + "flos": 20048928001920.0, + "grad_norm": 1.7449235888895405, + "language_loss": 0.81386358, + "learning_rate": 1.8974218733761208e-06, + "loss": 0.83527148, + "num_input_tokens_seen": 189837905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 8829, + "time_per_iteration": 2.472055673599243 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01033982, + "balance_loss_clip": 1.02196574, + "balance_loss_mlp": 1.03871477, + "epoch": 0.5308883210581693, + "flos": 20703938463360.0, + "grad_norm": 1.483207387046285, + "language_loss": 0.78292549, + "learning_rate": 1.8970329262320375e-06, + "loss": 0.80433053, + "num_input_tokens_seen": 189856970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 8830, + "time_per_iteration": 2.4544272422790527 + }, + { + "auxiliary_loss_clip": 0.01106311, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01877975, + "balance_loss_mlp": 1.03778768, + "epoch": 0.5309484443108372, + "flos": 14355506759040.0, + "grad_norm": 2.0257257472461525, + "language_loss": 0.80643964, + "learning_rate": 1.8966439829924768e-06, + "loss": 0.82781464, + "num_input_tokens_seen": 189872830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8831, + "time_per_iteration": 2.4307594299316406 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01028528, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.03561974, + "epoch": 0.5310085675635052, + "flos": 20010503427840.0, + "grad_norm": 2.026603228036347, + "language_loss": 0.73146117, + "learning_rate": 1.896255043672186e-06, + "loss": 0.75278628, + "num_input_tokens_seen": 189891635, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 8832, + "time_per_iteration": 2.429567813873291 + }, + { + "auxiliary_loss_clip": 0.01111675, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02162194, + "balance_loss_mlp": 1.04065752, + "epoch": 0.5310686908161731, + "flos": 22127293774080.0, + "grad_norm": 1.9229428073701915, + "language_loss": 0.75382435, + "learning_rate": 1.8958661082859143e-06, + "loss": 0.77528179, + "num_input_tokens_seen": 189909050, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 8833, + "time_per_iteration": 2.4731011390686035 + }, + { + "auxiliary_loss_clip": 0.01107496, + "auxiliary_loss_mlp": 0.01030529, + "balance_loss_clip": 1.01733804, + "balance_loss_mlp": 1.03697777, + "epoch": 0.5311288140688412, + "flos": 24717889445760.0, + "grad_norm": 1.9718581367947616, + "language_loss": 0.73314357, + "learning_rate": 1.8954771768484103e-06, + "loss": 0.75452387, + "num_input_tokens_seen": 189927405, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8834, + "time_per_iteration": 2.476289987564087 + }, + { + "auxiliary_loss_clip": 0.01113252, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.01980758, + "balance_loss_mlp": 1.03958392, + "epoch": 0.5311889373215091, + "flos": 24097712198400.0, + "grad_norm": 2.0084943443028975, + "language_loss": 0.77603996, + "learning_rate": 1.8950882493744226e-06, + "loss": 0.79750997, + "num_input_tokens_seen": 189947740, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 8835, + "time_per_iteration": 2.512998104095459 + }, + { + "auxiliary_loss_clip": 0.01106643, + "auxiliary_loss_mlp": 0.01036561, + "balance_loss_clip": 1.02318025, + "balance_loss_mlp": 1.03647518, + "epoch": 0.5312490605741771, + "flos": 22017012042240.0, + "grad_norm": 1.8374817013403106, + "language_loss": 0.72753531, + "learning_rate": 1.8946993258786985e-06, + "loss": 0.74896735, + "num_input_tokens_seen": 189966495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8836, + "time_per_iteration": 2.4509310722351074 + }, + { + "auxiliary_loss_clip": 0.01108843, + "auxiliary_loss_mlp": 0.01033453, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03784788, + "epoch": 0.531309183826845, + "flos": 19390541662080.0, + "grad_norm": 2.66525227198108, + "language_loss": 0.80936503, + "learning_rate": 1.894310406375987e-06, + "loss": 0.83078802, + "num_input_tokens_seen": 189985325, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.70703125, + "step": 8837, + "time_per_iteration": 2.471662759780884 + }, + { + "auxiliary_loss_clip": 0.0110708, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01615477, + "balance_loss_mlp": 1.03874159, + "epoch": 0.531369307079513, + "flos": 20190056538240.0, + "grad_norm": 1.8452061032611426, + "language_loss": 0.85926068, + "learning_rate": 1.893921490881035e-06, + "loss": 0.88061881, + "num_input_tokens_seen": 190003290, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 8838, + "time_per_iteration": 2.4360713958740234 + }, + { + "auxiliary_loss_clip": 0.011058, + "auxiliary_loss_mlp": 0.01029554, + "balance_loss_clip": 1.01779366, + "balance_loss_mlp": 1.03785229, + "epoch": 0.5314294303321809, + "flos": 18880143356160.0, + "grad_norm": 1.8224224127823847, + "language_loss": 0.7208544, + "learning_rate": 1.8935325794085906e-06, + "loss": 0.74220788, + "num_input_tokens_seen": 190023260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 8839, + "time_per_iteration": 2.4806606769561768 + }, + { + "auxiliary_loss_clip": 0.01106476, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02167034, + "balance_loss_mlp": 1.03606987, + "epoch": 0.531489553584849, + "flos": 23040035297280.0, + "grad_norm": 1.889571361745381, + "language_loss": 0.76674354, + "learning_rate": 1.8931436719734023e-06, + "loss": 0.78814822, + "num_input_tokens_seen": 190042035, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 8840, + "time_per_iteration": 2.47389817237854 + }, + { + "auxiliary_loss_clip": 0.01108751, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.01798964, + "balance_loss_mlp": 1.03678751, + "epoch": 0.5315496768375169, + "flos": 19790478668160.0, + "grad_norm": 1.9758748106511805, + "language_loss": 0.77377498, + "learning_rate": 1.892754768590216e-06, + "loss": 0.79517406, + "num_input_tokens_seen": 190057545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8841, + "time_per_iteration": 2.4235799312591553 + }, + { + "auxiliary_loss_clip": 0.0103176, + "auxiliary_loss_mlp": 0.01011801, + "balance_loss_clip": 1.01060319, + "balance_loss_mlp": 1.00937963, + "epoch": 0.5316098000901849, + "flos": 71023228185600.0, + "grad_norm": 0.6971901974616477, + "language_loss": 0.56793272, + "learning_rate": 1.8923658692737793e-06, + "loss": 0.5883683, + "num_input_tokens_seen": 190123800, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.22363281, + "step": 8842, + "time_per_iteration": 3.1749658584594727 + }, + { + "auxiliary_loss_clip": 0.0111031, + "auxiliary_loss_mlp": 0.01038298, + "balance_loss_clip": 1.02445221, + "balance_loss_mlp": 1.03839254, + "epoch": 0.5316699233428529, + "flos": 16435560470400.0, + "grad_norm": 1.7048374639197847, + "language_loss": 0.73877072, + "learning_rate": 1.8919769740388407e-06, + "loss": 0.76025677, + "num_input_tokens_seen": 190141625, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 8843, + "time_per_iteration": 3.7764668464660645 + }, + { + "auxiliary_loss_clip": 0.01031369, + "auxiliary_loss_mlp": 0.01005783, + "balance_loss_clip": 1.00454903, + "balance_loss_mlp": 1.0092088, + "epoch": 0.5317300465955208, + "flos": 67420814302080.0, + "grad_norm": 0.8754586803272454, + "language_loss": 0.61063367, + "learning_rate": 1.891588082900145e-06, + "loss": 0.63100517, + "num_input_tokens_seen": 190198110, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.22265625, + "step": 8844, + "time_per_iteration": 3.1397178173065186 + }, + { + "auxiliary_loss_clip": 0.01031644, + "auxiliary_loss_mlp": 0.01000918, + "balance_loss_clip": 0.9997676, + "balance_loss_mlp": 1.00950778, + "epoch": 0.5317901698481888, + "flos": 59508075340800.0, + "grad_norm": 0.9433503667086528, + "language_loss": 0.62195891, + "learning_rate": 1.8911991958724411e-06, + "loss": 0.64228451, + "num_input_tokens_seen": 190259950, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.22167969, + "step": 8845, + "time_per_iteration": 3.0431036949157715 + }, + { + "auxiliary_loss_clip": 0.01107979, + "auxiliary_loss_mlp": 0.01032637, + "balance_loss_clip": 1.01908851, + "balance_loss_mlp": 1.0369339, + "epoch": 0.5318502931008567, + "flos": 19129219240320.0, + "grad_norm": 2.021195915673457, + "language_loss": 0.7583214, + "learning_rate": 1.890810312970474e-06, + "loss": 0.77972758, + "num_input_tokens_seen": 190278265, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7109375, + "step": 8846, + "time_per_iteration": 5.309458017349243 + }, + { + "auxiliary_loss_clip": 0.01106825, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01994586, + "balance_loss_mlp": 1.03744686, + "epoch": 0.5319104163535248, + "flos": 24681045070080.0, + "grad_norm": 1.5634287795910362, + "language_loss": 0.75384724, + "learning_rate": 1.8904214342089903e-06, + "loss": 0.775231, + "num_input_tokens_seen": 190298400, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 8847, + "time_per_iteration": 2.4939441680908203 + }, + { + "auxiliary_loss_clip": 0.01104626, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.01720405, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5319705396061927, + "flos": 19385513758080.0, + "grad_norm": 1.798053797011527, + "language_loss": 0.87663037, + "learning_rate": 1.8900325596027378e-06, + "loss": 0.89797276, + "num_input_tokens_seen": 190316235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 8848, + "time_per_iteration": 2.417572498321533 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01035349, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03765666, + "epoch": 0.5320306628588607, + "flos": 18259319664000.0, + "grad_norm": 2.6565378723095834, + "language_loss": 0.74641025, + "learning_rate": 1.8896436891664609e-06, + "loss": 0.76784182, + "num_input_tokens_seen": 190335060, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.703125, + "step": 8849, + "time_per_iteration": 2.4509243965148926 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01655602, + "balance_loss_mlp": 1.03593016, + "epoch": 0.5320907861115286, + "flos": 23732321097600.0, + "grad_norm": 2.164126567755358, + "language_loss": 0.79812169, + "learning_rate": 1.8892548229149066e-06, + "loss": 0.81949031, + "num_input_tokens_seen": 190353265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 8850, + "time_per_iteration": 2.45766544342041 + }, + { + "auxiliary_loss_clip": 0.01104904, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.01615, + "balance_loss_mlp": 1.03538489, + "epoch": 0.5321509093641966, + "flos": 34495251321600.0, + "grad_norm": 1.4483393548737078, + "language_loss": 0.54913849, + "learning_rate": 1.888865960862821e-06, + "loss": 0.57047582, + "num_input_tokens_seen": 190376575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 8851, + "time_per_iteration": 2.607548713684082 + }, + { + "auxiliary_loss_clip": 0.01110841, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.01821876, + "balance_loss_mlp": 1.03916895, + "epoch": 0.5322110326168645, + "flos": 20010934391040.0, + "grad_norm": 1.7052679387317837, + "language_loss": 0.68385565, + "learning_rate": 1.8884771030249484e-06, + "loss": 0.70526993, + "num_input_tokens_seen": 190395185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 8852, + "time_per_iteration": 2.4444568157196045 + }, + { + "auxiliary_loss_clip": 0.01031832, + "auxiliary_loss_mlp": 0.00999979, + "balance_loss_clip": 0.9987337, + "balance_loss_mlp": 1.00941014, + "epoch": 0.5322711558695326, + "flos": 64631164435200.0, + "grad_norm": 0.8061011864926959, + "language_loss": 0.62881088, + "learning_rate": 1.8880882494160357e-06, + "loss": 0.64912903, + "num_input_tokens_seen": 190452595, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.22460938, + "step": 8853, + "time_per_iteration": 3.0409493446350098 + }, + { + "auxiliary_loss_clip": 0.01108315, + "auxiliary_loss_mlp": 0.01029698, + "balance_loss_clip": 1.01691902, + "balance_loss_mlp": 1.03633368, + "epoch": 0.5323312791222005, + "flos": 14939342421120.0, + "grad_norm": 2.2642894326377196, + "language_loss": 0.79002404, + "learning_rate": 1.8876994000508278e-06, + "loss": 0.81140411, + "num_input_tokens_seen": 190469140, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 8854, + "time_per_iteration": 2.4175822734832764 + }, + { + "auxiliary_loss_clip": 0.01103338, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01717186, + "balance_loss_mlp": 1.03635907, + "epoch": 0.5323914023748685, + "flos": 23440834229760.0, + "grad_norm": 1.6616394070358602, + "language_loss": 0.73815715, + "learning_rate": 1.8873105549440698e-06, + "loss": 0.75947511, + "num_input_tokens_seen": 190489015, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 8855, + "time_per_iteration": 2.5298781394958496 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.01029662, + "balance_loss_clip": 1.01806259, + "balance_loss_mlp": 1.03597307, + "epoch": 0.5324515256275365, + "flos": 26286180134400.0, + "grad_norm": 1.9409120124024815, + "language_loss": 0.64495003, + "learning_rate": 1.886921714110507e-06, + "loss": 0.66629064, + "num_input_tokens_seen": 190508065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 8856, + "time_per_iteration": 2.483076333999634 + }, + { + "auxiliary_loss_clip": 0.01111855, + "auxiliary_loss_mlp": 0.01035084, + "balance_loss_clip": 1.02166665, + "balance_loss_mlp": 1.03986931, + "epoch": 0.5325116488802044, + "flos": 26870913636480.0, + "grad_norm": 1.6437419686120303, + "language_loss": 0.77630389, + "learning_rate": 1.8865328775648842e-06, + "loss": 0.79777324, + "num_input_tokens_seen": 190527045, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8857, + "time_per_iteration": 2.534383773803711 + }, + { + "auxiliary_loss_clip": 0.01105473, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02073884, + "balance_loss_mlp": 1.03602767, + "epoch": 0.5325717721328724, + "flos": 25884734757120.0, + "grad_norm": 2.590488147317335, + "language_loss": 0.71136224, + "learning_rate": 1.8861440453219456e-06, + "loss": 0.73275089, + "num_input_tokens_seen": 190544075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8858, + "time_per_iteration": 2.48335862159729 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01033735, + "balance_loss_clip": 1.02001405, + "balance_loss_mlp": 1.03818965, + "epoch": 0.5326318953855403, + "flos": 21799321666560.0, + "grad_norm": 1.5574852735183802, + "language_loss": 0.69423437, + "learning_rate": 1.8857552173964367e-06, + "loss": 0.71565467, + "num_input_tokens_seen": 190566030, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 8859, + "time_per_iteration": 2.5293610095977783 + }, + { + "auxiliary_loss_clip": 0.01104952, + "auxiliary_loss_mlp": 0.01027434, + "balance_loss_clip": 1.01622272, + "balance_loss_mlp": 1.03947163, + "epoch": 0.5326920186382084, + "flos": 20922921728640.0, + "grad_norm": 1.5500879507245162, + "language_loss": 0.69682205, + "learning_rate": 1.8853663938031013e-06, + "loss": 0.71814591, + "num_input_tokens_seen": 190585605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 8860, + "time_per_iteration": 2.453315019607544 + }, + { + "auxiliary_loss_clip": 0.01105883, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.02027583, + "balance_loss_mlp": 1.03789401, + "epoch": 0.5327521418908763, + "flos": 21433427775360.0, + "grad_norm": 1.830505462704671, + "language_loss": 0.78035998, + "learning_rate": 1.884977574556683e-06, + "loss": 0.80173862, + "num_input_tokens_seen": 190604625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8861, + "time_per_iteration": 2.4910025596618652 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.01037922, + "balance_loss_clip": 1.02470744, + "balance_loss_mlp": 1.03778684, + "epoch": 0.5328122651435443, + "flos": 21760250647680.0, + "grad_norm": 3.045684614472066, + "language_loss": 0.85532111, + "learning_rate": 1.8845887596719279e-06, + "loss": 0.87677932, + "num_input_tokens_seen": 190625060, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8862, + "time_per_iteration": 2.4594204425811768 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02181435, + "balance_loss_mlp": 1.03708994, + "epoch": 0.5328723883962122, + "flos": 18296487262080.0, + "grad_norm": 2.155580167277434, + "language_loss": 0.61776686, + "learning_rate": 1.8841999491635778e-06, + "loss": 0.63920593, + "num_input_tokens_seen": 190643150, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7109375, + "step": 8863, + "time_per_iteration": 2.431844472885132 + }, + { + "auxiliary_loss_clip": 0.01107834, + "auxiliary_loss_mlp": 0.01033129, + "balance_loss_clip": 1.02161896, + "balance_loss_mlp": 1.03979647, + "epoch": 0.5329325116488802, + "flos": 25374911068800.0, + "grad_norm": 1.808986842092349, + "language_loss": 0.73174077, + "learning_rate": 1.883811143046377e-06, + "loss": 0.7531504, + "num_input_tokens_seen": 190662725, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8864, + "time_per_iteration": 2.481052875518799 + }, + { + "auxiliary_loss_clip": 0.01106149, + "auxiliary_loss_mlp": 0.01036127, + "balance_loss_clip": 1.02406275, + "balance_loss_mlp": 1.03704095, + "epoch": 0.5329926349015481, + "flos": 25592098654080.0, + "grad_norm": 1.770075213018519, + "language_loss": 0.64782691, + "learning_rate": 1.8834223413350702e-06, + "loss": 0.66924965, + "num_input_tokens_seen": 190683680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 8865, + "time_per_iteration": 2.5422523021698 + }, + { + "auxiliary_loss_clip": 0.01106424, + "auxiliary_loss_mlp": 0.010298, + "balance_loss_clip": 1.01711667, + "balance_loss_mlp": 1.0374155, + "epoch": 0.5330527581542162, + "flos": 22889605138560.0, + "grad_norm": 1.6788966461131323, + "language_loss": 0.78194928, + "learning_rate": 1.8830335440443989e-06, + "loss": 0.80331147, + "num_input_tokens_seen": 190703350, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8866, + "time_per_iteration": 2.4783847332000732 + }, + { + "auxiliary_loss_clip": 0.01106298, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02127266, + "balance_loss_mlp": 1.03756702, + "epoch": 0.5331128814068841, + "flos": 16026752805120.0, + "grad_norm": 2.4645319902700136, + "language_loss": 0.73618174, + "learning_rate": 1.882644751189108e-06, + "loss": 0.75758052, + "num_input_tokens_seen": 190721170, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 8867, + "time_per_iteration": 2.4607431888580322 + }, + { + "auxiliary_loss_clip": 0.01109812, + "auxiliary_loss_mlp": 0.01039021, + "balance_loss_clip": 1.02575922, + "balance_loss_mlp": 1.03957081, + "epoch": 0.5331730046595521, + "flos": 39344699629440.0, + "grad_norm": 1.616723113347984, + "language_loss": 0.72235525, + "learning_rate": 1.88225596278394e-06, + "loss": 0.7438435, + "num_input_tokens_seen": 190743795, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 8868, + "time_per_iteration": 2.6005828380584717 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02044368, + "balance_loss_mlp": 1.03801441, + "epoch": 0.5332331279122201, + "flos": 24024382583040.0, + "grad_norm": 1.8848687711222403, + "language_loss": 0.78688312, + "learning_rate": 1.881867178843637e-06, + "loss": 0.80828476, + "num_input_tokens_seen": 190761560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 8869, + "time_per_iteration": 2.527679681777954 + }, + { + "auxiliary_loss_clip": 0.01112421, + "auxiliary_loss_mlp": 0.0103673, + "balance_loss_clip": 1.02396262, + "balance_loss_mlp": 1.03942657, + "epoch": 0.533293251164888, + "flos": 17129318728320.0, + "grad_norm": 1.8336580730917733, + "language_loss": 0.75656843, + "learning_rate": 1.8814783993829434e-06, + "loss": 0.7780599, + "num_input_tokens_seen": 190778875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 8870, + "time_per_iteration": 2.408651113510132 + }, + { + "auxiliary_loss_clip": 0.01112864, + "auxiliary_loss_mlp": 0.0103788, + "balance_loss_clip": 1.024266, + "balance_loss_mlp": 1.04069293, + "epoch": 0.533353374417556, + "flos": 22126360020480.0, + "grad_norm": 1.8439379115111716, + "language_loss": 0.75255805, + "learning_rate": 1.8810896244165997e-06, + "loss": 0.77406549, + "num_input_tokens_seen": 190799830, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 8871, + "time_per_iteration": 2.501173257827759 + }, + { + "auxiliary_loss_clip": 0.01109454, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02014637, + "balance_loss_mlp": 1.03973055, + "epoch": 0.533413497670224, + "flos": 15011091838080.0, + "grad_norm": 1.7881983016452072, + "language_loss": 0.72249746, + "learning_rate": 1.8807008539593498e-06, + "loss": 0.74391973, + "num_input_tokens_seen": 190817155, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8872, + "time_per_iteration": 2.4058215618133545 + }, + { + "auxiliary_loss_clip": 0.01110293, + "auxiliary_loss_mlp": 0.01038023, + "balance_loss_clip": 1.02498162, + "balance_loss_mlp": 1.04132104, + "epoch": 0.533473620922892, + "flos": 19609955890560.0, + "grad_norm": 1.7441588702127815, + "language_loss": 0.65051317, + "learning_rate": 1.880312088025936e-06, + "loss": 0.67199636, + "num_input_tokens_seen": 190835240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 8873, + "time_per_iteration": 2.4598374366760254 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01037842, + "balance_loss_clip": 1.02549779, + "balance_loss_mlp": 1.03951979, + "epoch": 0.5335337441755599, + "flos": 14282644020480.0, + "grad_norm": 7.037025883542546, + "language_loss": 0.80012232, + "learning_rate": 1.879923326631099e-06, + "loss": 0.82158732, + "num_input_tokens_seen": 190851620, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 8874, + "time_per_iteration": 2.43198299407959 + }, + { + "auxiliary_loss_clip": 0.0110808, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.01874542, + "balance_loss_mlp": 1.03897262, + "epoch": 0.5335938674282279, + "flos": 20814830726400.0, + "grad_norm": 2.558835697133273, + "language_loss": 0.70077014, + "learning_rate": 1.879534569789582e-06, + "loss": 0.72216594, + "num_input_tokens_seen": 190870545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 8875, + "time_per_iteration": 2.4746885299682617 + }, + { + "auxiliary_loss_clip": 0.01033299, + "auxiliary_loss_mlp": 0.01014121, + "balance_loss_clip": 1.01290536, + "balance_loss_mlp": 1.01076412, + "epoch": 0.5336539906808958, + "flos": 71396448451200.0, + "grad_norm": 0.7274620052615154, + "language_loss": 0.59653223, + "learning_rate": 1.879145817516126e-06, + "loss": 0.61700642, + "num_input_tokens_seen": 190931995, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.22460938, + "step": 8876, + "time_per_iteration": 3.1654725074768066 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01031759, + "balance_loss_clip": 1.01971292, + "balance_loss_mlp": 1.0382477, + "epoch": 0.5337141139335638, + "flos": 20152996680960.0, + "grad_norm": 1.894052458703423, + "language_loss": 0.74833322, + "learning_rate": 1.8787570698254727e-06, + "loss": 0.76972401, + "num_input_tokens_seen": 190949890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8877, + "time_per_iteration": 2.4836068153381348 + }, + { + "auxiliary_loss_clip": 0.01032923, + "auxiliary_loss_mlp": 0.0100501, + "balance_loss_clip": 1.00374663, + "balance_loss_mlp": 1.01051378, + "epoch": 0.5337742371862317, + "flos": 67728387484800.0, + "grad_norm": 0.7537185456157387, + "language_loss": 0.57229304, + "learning_rate": 1.8783683267323629e-06, + "loss": 0.59267235, + "num_input_tokens_seen": 191008480, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22460938, + "step": 8878, + "time_per_iteration": 2.9712772369384766 + }, + { + "auxiliary_loss_clip": 0.01111898, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02161908, + "balance_loss_mlp": 1.04023981, + "epoch": 0.5338343604388998, + "flos": 25008909436800.0, + "grad_norm": 1.4246995459674998, + "language_loss": 0.72007561, + "learning_rate": 1.8779795882515395e-06, + "loss": 0.74154353, + "num_input_tokens_seen": 191028995, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 8879, + "time_per_iteration": 2.5073280334472656 + }, + { + "auxiliary_loss_clip": 0.01110375, + "auxiliary_loss_mlp": 0.01029972, + "balance_loss_clip": 1.01706791, + "balance_loss_mlp": 1.03980017, + "epoch": 0.5338944836915677, + "flos": 17601256546560.0, + "grad_norm": 2.331544880776984, + "language_loss": 0.8328526, + "learning_rate": 1.8775908543977416e-06, + "loss": 0.85425603, + "num_input_tokens_seen": 191045285, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 8880, + "time_per_iteration": 2.4154322147369385 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.0103408, + "balance_loss_clip": 1.02200413, + "balance_loss_mlp": 1.03857374, + "epoch": 0.5339546069442357, + "flos": 21724124544000.0, + "grad_norm": 1.3819058164028981, + "language_loss": 0.79567689, + "learning_rate": 1.8772021251857107e-06, + "loss": 0.81707799, + "num_input_tokens_seen": 191066105, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 8881, + "time_per_iteration": 2.4748446941375732 + }, + { + "auxiliary_loss_clip": 0.01032611, + "auxiliary_loss_mlp": 0.00999583, + "balance_loss_clip": 0.99825948, + "balance_loss_mlp": 1.01026177, + "epoch": 0.5340147301969036, + "flos": 69723583315200.0, + "grad_norm": 0.7951386121617492, + "language_loss": 0.59243226, + "learning_rate": 1.8768134006301882e-06, + "loss": 0.61275423, + "num_input_tokens_seen": 191126315, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.22363281, + "step": 8882, + "time_per_iteration": 3.0554563999176025 + }, + { + "auxiliary_loss_clip": 0.01032284, + "auxiliary_loss_mlp": 0.01002778, + "balance_loss_clip": 1.0013417, + "balance_loss_mlp": 1.00965989, + "epoch": 0.5340748534495716, + "flos": 63880701580800.0, + "grad_norm": 0.8657705918333868, + "language_loss": 0.63714904, + "learning_rate": 1.876424680745913e-06, + "loss": 0.65749967, + "num_input_tokens_seen": 191174240, + "router_z_loss_clip": 0.01434326, + "router_z_loss_mlp": 0.2265625, + "step": 8883, + "time_per_iteration": 2.8666210174560547 + }, + { + "auxiliary_loss_clip": 0.01112111, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01528406, + "balance_loss_mlp": 1.04020667, + "epoch": 0.5341349767022396, + "flos": 28694313694080.0, + "grad_norm": 2.5638154038033334, + "language_loss": 0.82000816, + "learning_rate": 1.8760359655476272e-06, + "loss": 0.84141463, + "num_input_tokens_seen": 191193335, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 8884, + "time_per_iteration": 3.910738706588745 + }, + { + "auxiliary_loss_clip": 0.01104995, + "auxiliary_loss_mlp": 0.01028727, + "balance_loss_clip": 1.0165143, + "balance_loss_mlp": 1.03923178, + "epoch": 0.5341950999549075, + "flos": 16289691338880.0, + "grad_norm": 1.647799538914853, + "language_loss": 0.7224586, + "learning_rate": 1.8756472550500695e-06, + "loss": 0.74379575, + "num_input_tokens_seen": 191210900, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 8885, + "time_per_iteration": 2.4330668449401855 + }, + { + "auxiliary_loss_clip": 0.01111824, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01816654, + "balance_loss_mlp": 1.03816104, + "epoch": 0.5342552232075756, + "flos": 14355650413440.0, + "grad_norm": 1.9571098005847307, + "language_loss": 0.78834218, + "learning_rate": 1.87525854926798e-06, + "loss": 0.80977666, + "num_input_tokens_seen": 191226730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8886, + "time_per_iteration": 2.4285924434661865 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01453424, + "balance_loss_mlp": 1.03859282, + "epoch": 0.5343153464602435, + "flos": 30297976300800.0, + "grad_norm": 1.4869737557636773, + "language_loss": 0.74745071, + "learning_rate": 1.8748698482160996e-06, + "loss": 0.76882267, + "num_input_tokens_seen": 191250435, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 8887, + "time_per_iteration": 5.458622932434082 + }, + { + "auxiliary_loss_clip": 0.01106415, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01351762, + "balance_loss_mlp": 1.03839684, + "epoch": 0.5343754697129115, + "flos": 15596292216960.0, + "grad_norm": 1.9580001729257437, + "language_loss": 0.68680072, + "learning_rate": 1.8744811519091663e-06, + "loss": 0.70812452, + "num_input_tokens_seen": 191268315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 8888, + "time_per_iteration": 3.871016263961792 + }, + { + "auxiliary_loss_clip": 0.01115673, + "auxiliary_loss_mlp": 0.01037433, + "balance_loss_clip": 1.02426004, + "balance_loss_mlp": 1.03957748, + "epoch": 0.5344355929655794, + "flos": 16909617191040.0, + "grad_norm": 2.039365083298093, + "language_loss": 0.77427757, + "learning_rate": 1.8740924603619208e-06, + "loss": 0.79580867, + "num_input_tokens_seen": 191287000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.76171875, + "step": 8889, + "time_per_iteration": 2.4321072101593018 + }, + { + "auxiliary_loss_clip": 0.01107574, + "auxiliary_loss_mlp": 0.01036398, + "balance_loss_clip": 1.02382183, + "balance_loss_mlp": 1.03896809, + "epoch": 0.5344957162182474, + "flos": 16798186224000.0, + "grad_norm": 1.7896399215033527, + "language_loss": 0.68882942, + "learning_rate": 1.873703773589102e-06, + "loss": 0.71026921, + "num_input_tokens_seen": 191304565, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 8890, + "time_per_iteration": 2.4512557983398438 + }, + { + "auxiliary_loss_clip": 0.01112757, + "auxiliary_loss_mlp": 0.01039562, + "balance_loss_clip": 1.02532864, + "balance_loss_mlp": 1.03882933, + "epoch": 0.5345558394709153, + "flos": 12705590413440.0, + "grad_norm": 3.075420511300943, + "language_loss": 0.77339637, + "learning_rate": 1.8733150916054483e-06, + "loss": 0.79491955, + "num_input_tokens_seen": 191318300, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.73828125, + "step": 8891, + "time_per_iteration": 2.4134135246276855 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01904237, + "balance_loss_mlp": 1.03873932, + "epoch": 0.5346159627235834, + "flos": 22455050400000.0, + "grad_norm": 1.5298342127178157, + "language_loss": 0.73841035, + "learning_rate": 1.872926414425699e-06, + "loss": 0.75977939, + "num_input_tokens_seen": 191337925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 8892, + "time_per_iteration": 2.4843709468841553 + }, + { + "auxiliary_loss_clip": 0.0110608, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01874948, + "balance_loss_mlp": 1.03663301, + "epoch": 0.5346760859762513, + "flos": 22415763899520.0, + "grad_norm": 1.5614617741562322, + "language_loss": 0.88069522, + "learning_rate": 1.8725377420645932e-06, + "loss": 0.90206861, + "num_input_tokens_seen": 191357120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8893, + "time_per_iteration": 2.445389747619629 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01031179, + "balance_loss_clip": 1.01968718, + "balance_loss_mlp": 1.03617978, + "epoch": 0.5347362092289193, + "flos": 22816131868800.0, + "grad_norm": 1.5898186397759002, + "language_loss": 0.72623652, + "learning_rate": 1.872149074536869e-06, + "loss": 0.74758679, + "num_input_tokens_seen": 191375395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 8894, + "time_per_iteration": 2.475914239883423 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01774812, + "balance_loss_mlp": 1.03794241, + "epoch": 0.5347963324815872, + "flos": 23219480666880.0, + "grad_norm": 2.053516557339631, + "language_loss": 0.74730217, + "learning_rate": 1.8717604118572648e-06, + "loss": 0.7686727, + "num_input_tokens_seen": 191395595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 8895, + "time_per_iteration": 2.4524707794189453 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01558769, + "balance_loss_mlp": 1.03688455, + "epoch": 0.5348564557342552, + "flos": 22601350494720.0, + "grad_norm": 1.7004701648033584, + "language_loss": 0.76999986, + "learning_rate": 1.8713717540405178e-06, + "loss": 0.79134524, + "num_input_tokens_seen": 191413730, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8896, + "time_per_iteration": 2.4727749824523926 + }, + { + "auxiliary_loss_clip": 0.01105321, + "auxiliary_loss_mlp": 0.01024889, + "balance_loss_clip": 1.01200807, + "balance_loss_mlp": 1.03771544, + "epoch": 0.5349165789869232, + "flos": 18002378701440.0, + "grad_norm": 1.674513516034323, + "language_loss": 0.78698516, + "learning_rate": 1.8709831011013676e-06, + "loss": 0.80828726, + "num_input_tokens_seen": 191432400, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 8897, + "time_per_iteration": 2.437924861907959 + }, + { + "auxiliary_loss_clip": 0.01110861, + "auxiliary_loss_mlp": 0.01028076, + "balance_loss_clip": 1.015589, + "balance_loss_mlp": 1.04029751, + "epoch": 0.5349767022395912, + "flos": 17159770483200.0, + "grad_norm": 1.8516386867396797, + "language_loss": 0.75758165, + "learning_rate": 1.8705944530545509e-06, + "loss": 0.77897102, + "num_input_tokens_seen": 191448855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8898, + "time_per_iteration": 2.4490232467651367 + }, + { + "auxiliary_loss_clip": 0.0103315, + "auxiliary_loss_mlp": 0.00997269, + "balance_loss_clip": 0.99616033, + "balance_loss_mlp": 1.01073837, + "epoch": 0.5350368254922592, + "flos": 70992058158720.0, + "grad_norm": 0.8534656988697606, + "language_loss": 0.58027738, + "learning_rate": 1.8702058099148052e-06, + "loss": 0.60058159, + "num_input_tokens_seen": 191519690, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22460938, + "step": 8899, + "time_per_iteration": 3.2222988605499268 + }, + { + "auxiliary_loss_clip": 0.01105996, + "auxiliary_loss_mlp": 0.01028751, + "balance_loss_clip": 1.01625824, + "balance_loss_mlp": 1.03779793, + "epoch": 0.5350969487449271, + "flos": 27417833095680.0, + "grad_norm": 1.754025350675293, + "language_loss": 0.69734174, + "learning_rate": 1.869817171696868e-06, + "loss": 0.7186892, + "num_input_tokens_seen": 191539380, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 8900, + "time_per_iteration": 2.5348854064941406 + }, + { + "auxiliary_loss_clip": 0.01109931, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.01857448, + "balance_loss_mlp": 1.03874683, + "epoch": 0.5351570719975951, + "flos": 19316134638720.0, + "grad_norm": 1.712056344952118, + "language_loss": 0.71436262, + "learning_rate": 1.8694285384154777e-06, + "loss": 0.73577476, + "num_input_tokens_seen": 191557400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8901, + "time_per_iteration": 2.486694097518921 + }, + { + "auxiliary_loss_clip": 0.01108252, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01632655, + "balance_loss_mlp": 1.03779531, + "epoch": 0.535217195250263, + "flos": 19828580019840.0, + "grad_norm": 2.0243685582186477, + "language_loss": 0.77403963, + "learning_rate": 1.8690399100853699e-06, + "loss": 0.79541337, + "num_input_tokens_seen": 191575860, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 8902, + "time_per_iteration": 2.4521291255950928 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01032569, + "balance_loss_clip": 1.02103007, + "balance_loss_mlp": 1.03727639, + "epoch": 0.535277318502931, + "flos": 22127868391680.0, + "grad_norm": 1.5596437382067054, + "language_loss": 0.69763452, + "learning_rate": 1.868651286721281e-06, + "loss": 0.71899128, + "num_input_tokens_seen": 191595775, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 8903, + "time_per_iteration": 2.4639296531677246 + }, + { + "auxiliary_loss_clip": 0.01111291, + "auxiliary_loss_mlp": 0.01038911, + "balance_loss_clip": 1.02613187, + "balance_loss_mlp": 1.03885889, + "epoch": 0.5353374417555989, + "flos": 25045897466880.0, + "grad_norm": 1.4813880450748405, + "language_loss": 0.71867597, + "learning_rate": 1.86826266833795e-06, + "loss": 0.74017799, + "num_input_tokens_seen": 191617785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 8904, + "time_per_iteration": 2.518556833267212 + }, + { + "auxiliary_loss_clip": 0.01109721, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.0223856, + "balance_loss_mlp": 1.03955388, + "epoch": 0.535397565008267, + "flos": 19388710068480.0, + "grad_norm": 1.7385404274740348, + "language_loss": 0.73125184, + "learning_rate": 1.8678740549501103e-06, + "loss": 0.75270438, + "num_input_tokens_seen": 191636900, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8905, + "time_per_iteration": 2.481398582458496 + }, + { + "auxiliary_loss_clip": 0.01103053, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02244139, + "balance_loss_mlp": 1.03704035, + "epoch": 0.5354576882609349, + "flos": 21471205904640.0, + "grad_norm": 1.4036286343955833, + "language_loss": 0.83569062, + "learning_rate": 1.8674854465725005e-06, + "loss": 0.85705423, + "num_input_tokens_seen": 191656720, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 8906, + "time_per_iteration": 2.4822022914886475 + }, + { + "auxiliary_loss_clip": 0.01110585, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.02053666, + "balance_loss_mlp": 1.03906655, + "epoch": 0.5355178115136029, + "flos": 20777519473920.0, + "grad_norm": 3.1110381495397688, + "language_loss": 0.74120319, + "learning_rate": 1.8670968432198563e-06, + "loss": 0.76264668, + "num_input_tokens_seen": 191674445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 8907, + "time_per_iteration": 2.4488067626953125 + }, + { + "auxiliary_loss_clip": 0.01109051, + "auxiliary_loss_mlp": 0.01028908, + "balance_loss_clip": 1.01639736, + "balance_loss_mlp": 1.03933167, + "epoch": 0.5355779347662708, + "flos": 23514020190720.0, + "grad_norm": 1.8326240405987804, + "language_loss": 0.77272546, + "learning_rate": 1.866708244906912e-06, + "loss": 0.79410505, + "num_input_tokens_seen": 191695000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 8908, + "time_per_iteration": 2.5009818077087402 + }, + { + "auxiliary_loss_clip": 0.01111027, + "auxiliary_loss_mlp": 0.01035847, + "balance_loss_clip": 1.02252579, + "balance_loss_mlp": 1.039222, + "epoch": 0.5356380580189388, + "flos": 20303211358080.0, + "grad_norm": 9.969716540759343, + "language_loss": 0.7407465, + "learning_rate": 1.8663196516484055e-06, + "loss": 0.7622152, + "num_input_tokens_seen": 191713295, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 8909, + "time_per_iteration": 2.4272916316986084 + }, + { + "auxiliary_loss_clip": 0.01110397, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02267265, + "balance_loss_mlp": 1.04071856, + "epoch": 0.5356981812716068, + "flos": 21361642444800.0, + "grad_norm": 1.9518435489791055, + "language_loss": 0.841941, + "learning_rate": 1.8659310634590702e-06, + "loss": 0.86339062, + "num_input_tokens_seen": 191732725, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 8910, + "time_per_iteration": 2.4678404331207275 + }, + { + "auxiliary_loss_clip": 0.01109272, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.0175302, + "balance_loss_mlp": 1.03802073, + "epoch": 0.5357583045242748, + "flos": 23111246010240.0, + "grad_norm": 1.5065365564315203, + "language_loss": 0.81728303, + "learning_rate": 1.8655424803536427e-06, + "loss": 0.83868158, + "num_input_tokens_seen": 191753765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 8911, + "time_per_iteration": 2.482515335083008 + }, + { + "auxiliary_loss_clip": 0.01107773, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02217102, + "balance_loss_mlp": 1.03894281, + "epoch": 0.5358184277769428, + "flos": 21141761339520.0, + "grad_norm": 1.8795354415042287, + "language_loss": 0.6902765, + "learning_rate": 1.8651539023468585e-06, + "loss": 0.71169335, + "num_input_tokens_seen": 191773560, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 8912, + "time_per_iteration": 2.489625930786133 + }, + { + "auxiliary_loss_clip": 0.01110703, + "auxiliary_loss_mlp": 0.01035561, + "balance_loss_clip": 1.02269232, + "balance_loss_mlp": 1.04099894, + "epoch": 0.5358785510296107, + "flos": 16282400878080.0, + "grad_norm": 1.778457710383864, + "language_loss": 0.71355128, + "learning_rate": 1.8647653294534509e-06, + "loss": 0.73501396, + "num_input_tokens_seen": 191791255, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8913, + "time_per_iteration": 2.4120781421661377 + }, + { + "auxiliary_loss_clip": 0.01114215, + "auxiliary_loss_mlp": 0.01036322, + "balance_loss_clip": 1.02322149, + "balance_loss_mlp": 1.04114628, + "epoch": 0.5359386742822787, + "flos": 16976877408000.0, + "grad_norm": 1.8082872891744106, + "language_loss": 0.72335684, + "learning_rate": 1.864376761688156e-06, + "loss": 0.7448622, + "num_input_tokens_seen": 191809325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.734375, + "step": 8914, + "time_per_iteration": 2.466946840286255 + }, + { + "auxiliary_loss_clip": 0.01115012, + "auxiliary_loss_mlp": 0.01039705, + "balance_loss_clip": 1.02528632, + "balance_loss_mlp": 1.04084253, + "epoch": 0.5359987975349466, + "flos": 20812927305600.0, + "grad_norm": 2.2402764225711915, + "language_loss": 0.70448041, + "learning_rate": 1.8639881990657079e-06, + "loss": 0.72602755, + "num_input_tokens_seen": 191829795, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.7421875, + "step": 8915, + "time_per_iteration": 2.5281713008880615 + }, + { + "auxiliary_loss_clip": 0.01108649, + "auxiliary_loss_mlp": 0.01036619, + "balance_loss_clip": 1.02335119, + "balance_loss_mlp": 1.03934813, + "epoch": 0.5360589207876146, + "flos": 22199941031040.0, + "grad_norm": 4.884439280571106, + "language_loss": 0.75188339, + "learning_rate": 1.8635996416008408e-06, + "loss": 0.77333617, + "num_input_tokens_seen": 191850840, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 8916, + "time_per_iteration": 2.4901540279388428 + }, + { + "auxiliary_loss_clip": 0.01110696, + "auxiliary_loss_mlp": 0.01029597, + "balance_loss_clip": 1.01685333, + "balance_loss_mlp": 1.03908181, + "epoch": 0.5361190440402825, + "flos": 31394365084800.0, + "grad_norm": 2.001008974250462, + "language_loss": 0.72230595, + "learning_rate": 1.863211089308289e-06, + "loss": 0.74370885, + "num_input_tokens_seen": 191869520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 8917, + "time_per_iteration": 2.5355899333953857 + }, + { + "auxiliary_loss_clip": 0.01109638, + "auxiliary_loss_mlp": 0.01037576, + "balance_loss_clip": 1.02460611, + "balance_loss_mlp": 1.04033589, + "epoch": 0.5361791672929506, + "flos": 16069882060800.0, + "grad_norm": 2.185479233449534, + "language_loss": 0.71158117, + "learning_rate": 1.8628225422027865e-06, + "loss": 0.73305333, + "num_input_tokens_seen": 191887240, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 8918, + "time_per_iteration": 2.497854709625244 + }, + { + "auxiliary_loss_clip": 0.011106, + "auxiliary_loss_mlp": 0.01036514, + "balance_loss_clip": 1.02387154, + "balance_loss_mlp": 1.04111099, + "epoch": 0.5362392905456185, + "flos": 20740926493440.0, + "grad_norm": 1.4281907235735687, + "language_loss": 0.75156265, + "learning_rate": 1.862434000299067e-06, + "loss": 0.7730338, + "num_input_tokens_seen": 191905690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 8919, + "time_per_iteration": 2.4522061347961426 + }, + { + "auxiliary_loss_clip": 0.01109131, + "auxiliary_loss_mlp": 0.01031322, + "balance_loss_clip": 1.0192163, + "balance_loss_mlp": 1.0374527, + "epoch": 0.5362994137982865, + "flos": 17340077779200.0, + "grad_norm": 1.9146697385716565, + "language_loss": 0.71194351, + "learning_rate": 1.862045463611864e-06, + "loss": 0.73334807, + "num_input_tokens_seen": 191920725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 8920, + "time_per_iteration": 2.4363694190979004 + }, + { + "auxiliary_loss_clip": 0.01106889, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01886892, + "balance_loss_mlp": 1.03738046, + "epoch": 0.5363595370509544, + "flos": 42813957795840.0, + "grad_norm": 1.417495166440162, + "language_loss": 0.68572164, + "learning_rate": 1.8616569321559105e-06, + "loss": 0.7071088, + "num_input_tokens_seen": 191944645, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8921, + "time_per_iteration": 2.659815788269043 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01036076, + "balance_loss_clip": 1.02357066, + "balance_loss_mlp": 1.04096341, + "epoch": 0.5364196603036224, + "flos": 19171953446400.0, + "grad_norm": 1.806007791508249, + "language_loss": 0.81778204, + "learning_rate": 1.86126840594594e-06, + "loss": 0.83925164, + "num_input_tokens_seen": 191962265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 8922, + "time_per_iteration": 2.4896881580352783 + }, + { + "auxiliary_loss_clip": 0.01109712, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01601934, + "balance_loss_mlp": 1.03847456, + "epoch": 0.5364797835562904, + "flos": 17931060247680.0, + "grad_norm": 1.9048762186543056, + "language_loss": 0.76640022, + "learning_rate": 1.860879884996686e-06, + "loss": 0.78778023, + "num_input_tokens_seen": 191978850, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 8923, + "time_per_iteration": 2.46250319480896 + }, + { + "auxiliary_loss_clip": 0.01112498, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02061963, + "balance_loss_mlp": 1.04007745, + "epoch": 0.5365399068089584, + "flos": 30228058477440.0, + "grad_norm": 1.372230243923659, + "language_loss": 0.70459902, + "learning_rate": 1.8604913693228804e-06, + "loss": 0.72606242, + "num_input_tokens_seen": 192002000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 8924, + "time_per_iteration": 2.5744879245758057 + }, + { + "auxiliary_loss_clip": 0.0111402, + "auxiliary_loss_mlp": 0.01036175, + "balance_loss_clip": 1.02251387, + "balance_loss_mlp": 1.04109585, + "epoch": 0.5366000300616264, + "flos": 24891696380160.0, + "grad_norm": 1.82023886715655, + "language_loss": 0.86756319, + "learning_rate": 1.8601028589392558e-06, + "loss": 0.88906515, + "num_input_tokens_seen": 192019100, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7265625, + "step": 8925, + "time_per_iteration": 2.4910149574279785 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.01847553, + "balance_loss_mlp": 1.03855276, + "epoch": 0.5366601533142943, + "flos": 29826649013760.0, + "grad_norm": 1.7557992545857284, + "language_loss": 0.77842706, + "learning_rate": 1.8597143538605455e-06, + "loss": 0.79984611, + "num_input_tokens_seen": 192041660, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 8926, + "time_per_iteration": 3.935426950454712 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.0103378, + "balance_loss_clip": 1.02207375, + "balance_loss_mlp": 1.04045248, + "epoch": 0.5367202765669623, + "flos": 27199352620800.0, + "grad_norm": 1.9312965019913735, + "language_loss": 0.66655087, + "learning_rate": 1.85932585410148e-06, + "loss": 0.68797243, + "num_input_tokens_seen": 192063540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 8927, + "time_per_iteration": 2.547527313232422 + }, + { + "auxiliary_loss_clip": 0.01109886, + "auxiliary_loss_mlp": 0.01028352, + "balance_loss_clip": 1.01575708, + "balance_loss_mlp": 1.03839135, + "epoch": 0.5367803998196302, + "flos": 20229953569920.0, + "grad_norm": 1.6954569855299475, + "language_loss": 0.73241496, + "learning_rate": 1.8589373596767929e-06, + "loss": 0.75379729, + "num_input_tokens_seen": 192081760, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 8928, + "time_per_iteration": 2.432772636413574 + }, + { + "auxiliary_loss_clip": 0.01109785, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01908278, + "balance_loss_mlp": 1.03883481, + "epoch": 0.5368405230722982, + "flos": 32154629374080.0, + "grad_norm": 1.7056756537874223, + "language_loss": 0.62998128, + "learning_rate": 1.8585488706012154e-06, + "loss": 0.65139198, + "num_input_tokens_seen": 192101620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 8929, + "time_per_iteration": 5.517207145690918 + }, + { + "auxiliary_loss_clip": 0.01109689, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01881528, + "balance_loss_mlp": 1.03864491, + "epoch": 0.5369006463249661, + "flos": 26247935128320.0, + "grad_norm": 1.7096435666181475, + "language_loss": 0.65986609, + "learning_rate": 1.8581603868894781e-06, + "loss": 0.68127799, + "num_input_tokens_seen": 192121805, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 8930, + "time_per_iteration": 4.042668581008911 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01029425, + "balance_loss_clip": 1.01673484, + "balance_loss_mlp": 1.03648782, + "epoch": 0.5369607695776342, + "flos": 26211306234240.0, + "grad_norm": 1.4058068619041801, + "language_loss": 0.66875708, + "learning_rate": 1.8577719085563136e-06, + "loss": 0.69010699, + "num_input_tokens_seen": 192141765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8931, + "time_per_iteration": 2.4965057373046875 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01032988, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.04157209, + "epoch": 0.5370208928303021, + "flos": 25009017177600.0, + "grad_norm": 1.7390938861026815, + "language_loss": 0.75847304, + "learning_rate": 1.8573834356164525e-06, + "loss": 0.77991474, + "num_input_tokens_seen": 192161560, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 8932, + "time_per_iteration": 2.4885287284851074 + }, + { + "auxiliary_loss_clip": 0.01110784, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.01999855, + "balance_loss_mlp": 1.04103768, + "epoch": 0.5370810160829701, + "flos": 31792147274880.0, + "grad_norm": 1.8276755120836934, + "language_loss": 0.66255939, + "learning_rate": 1.8569949680846261e-06, + "loss": 0.68400073, + "num_input_tokens_seen": 192180190, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8933, + "time_per_iteration": 2.545335292816162 + }, + { + "auxiliary_loss_clip": 0.01106255, + "auxiliary_loss_mlp": 0.01032805, + "balance_loss_clip": 1.02077079, + "balance_loss_mlp": 1.03900647, + "epoch": 0.537141139335638, + "flos": 23842602829440.0, + "grad_norm": 1.6337429593741761, + "language_loss": 0.82865143, + "learning_rate": 1.856606505975565e-06, + "loss": 0.85004205, + "num_input_tokens_seen": 192198855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 8934, + "time_per_iteration": 2.503974437713623 + }, + { + "auxiliary_loss_clip": 0.0110502, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02293336, + "balance_loss_mlp": 1.03738618, + "epoch": 0.537201262588306, + "flos": 18508826511360.0, + "grad_norm": 1.7935675007471827, + "language_loss": 0.79473621, + "learning_rate": 1.856218049303999e-06, + "loss": 0.81614518, + "num_input_tokens_seen": 192216555, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 8935, + "time_per_iteration": 2.4432904720306396 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01037309, + "balance_loss_clip": 1.02450609, + "balance_loss_mlp": 1.03854251, + "epoch": 0.537261385840974, + "flos": 25662950231040.0, + "grad_norm": 1.6092738011459846, + "language_loss": 0.83558774, + "learning_rate": 1.855829598084659e-06, + "loss": 0.857054, + "num_input_tokens_seen": 192236910, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 8936, + "time_per_iteration": 2.5320403575897217 + }, + { + "auxiliary_loss_clip": 0.01106939, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.0173173, + "balance_loss_mlp": 1.03860474, + "epoch": 0.537321509093642, + "flos": 40735017406080.0, + "grad_norm": 1.2642552304862777, + "language_loss": 0.72749949, + "learning_rate": 1.8554411523322754e-06, + "loss": 0.74886072, + "num_input_tokens_seen": 192260790, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 8937, + "time_per_iteration": 2.6381869316101074 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.01589561, + "balance_loss_mlp": 1.03737688, + "epoch": 0.53738163234631, + "flos": 17238487138560.0, + "grad_norm": 2.79948851304012, + "language_loss": 0.81773913, + "learning_rate": 1.8550527120615778e-06, + "loss": 0.83913368, + "num_input_tokens_seen": 192277230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 8938, + "time_per_iteration": 2.4865500926971436 + }, + { + "auxiliary_loss_clip": 0.01115105, + "auxiliary_loss_mlp": 0.01035683, + "balance_loss_clip": 1.0231539, + "balance_loss_mlp": 1.04058433, + "epoch": 0.5374417555989779, + "flos": 12821977457280.0, + "grad_norm": 2.3721010649860403, + "language_loss": 0.80348092, + "learning_rate": 1.8546642772872957e-06, + "loss": 0.82498878, + "num_input_tokens_seen": 192292840, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7421875, + "step": 8939, + "time_per_iteration": 2.4440550804138184 + }, + { + "auxiliary_loss_clip": 0.01034483, + "auxiliary_loss_mlp": 0.01002274, + "balance_loss_clip": 1.00117719, + "balance_loss_mlp": 1.01246023, + "epoch": 0.5375018788516459, + "flos": 67256018703360.0, + "grad_norm": 0.7105496368182959, + "language_loss": 0.52484262, + "learning_rate": 1.8542758480241589e-06, + "loss": 0.54521012, + "num_input_tokens_seen": 192358240, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.22070312, + "step": 8940, + "time_per_iteration": 3.091242790222168 + }, + { + "auxiliary_loss_clip": 0.01107473, + "auxiliary_loss_mlp": 0.01029266, + "balance_loss_clip": 1.01732159, + "balance_loss_mlp": 1.03880298, + "epoch": 0.5375620021043138, + "flos": 18114168804480.0, + "grad_norm": 1.7538523818266185, + "language_loss": 0.71252179, + "learning_rate": 1.8538874242868965e-06, + "loss": 0.73388922, + "num_input_tokens_seen": 192377370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 8941, + "time_per_iteration": 2.497748613357544 + }, + { + "auxiliary_loss_clip": 0.01106467, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01807404, + "balance_loss_mlp": 1.03906739, + "epoch": 0.5376221253569818, + "flos": 23149383275520.0, + "grad_norm": 1.7257322220940274, + "language_loss": 0.7928313, + "learning_rate": 1.853499006090237e-06, + "loss": 0.81420016, + "num_input_tokens_seen": 192396450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 8942, + "time_per_iteration": 2.5012340545654297 + }, + { + "auxiliary_loss_clip": 0.01113441, + "auxiliary_loss_mlp": 0.01036513, + "balance_loss_clip": 1.02305436, + "balance_loss_mlp": 1.04004788, + "epoch": 0.5376822486096497, + "flos": 29972302663680.0, + "grad_norm": 1.6646036710876846, + "language_loss": 0.69918364, + "learning_rate": 1.853110593448911e-06, + "loss": 0.72068322, + "num_input_tokens_seen": 192417390, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 8943, + "time_per_iteration": 2.5815587043762207 + }, + { + "auxiliary_loss_clip": 0.01032313, + "auxiliary_loss_mlp": 0.0099905, + "balance_loss_clip": 0.99804258, + "balance_loss_mlp": 1.01022053, + "epoch": 0.5377423718623178, + "flos": 54168950874240.0, + "grad_norm": 0.8193486791235207, + "language_loss": 0.59579939, + "learning_rate": 1.852722186377645e-06, + "loss": 0.61611301, + "num_input_tokens_seen": 192478060, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.22070312, + "step": 8944, + "time_per_iteration": 3.0560412406921387 + }, + { + "auxiliary_loss_clip": 0.01117959, + "auxiliary_loss_mlp": 0.01036857, + "balance_loss_clip": 1.02264094, + "balance_loss_mlp": 1.0415678, + "epoch": 0.5378024951149857, + "flos": 23257079228160.0, + "grad_norm": 2.048508714437824, + "language_loss": 0.77503264, + "learning_rate": 1.852333784891169e-06, + "loss": 0.79658085, + "num_input_tokens_seen": 192495985, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.765625, + "step": 8945, + "time_per_iteration": 2.4893672466278076 + }, + { + "auxiliary_loss_clip": 0.01109506, + "auxiliary_loss_mlp": 0.01034395, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03820658, + "epoch": 0.5378626183676537, + "flos": 24024095274240.0, + "grad_norm": 1.7269314210534699, + "language_loss": 0.68465722, + "learning_rate": 1.8519453890042112e-06, + "loss": 0.70609617, + "num_input_tokens_seen": 192515445, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8946, + "time_per_iteration": 2.4605491161346436 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.0104377, + "balance_loss_clip": 1.03090715, + "balance_loss_mlp": 1.03953493, + "epoch": 0.5379227416203216, + "flos": 27161789973120.0, + "grad_norm": 1.7416668567009066, + "language_loss": 0.76750016, + "learning_rate": 1.851556998731498e-06, + "loss": 0.78902936, + "num_input_tokens_seen": 192536530, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8947, + "time_per_iteration": 2.547470808029175 + }, + { + "auxiliary_loss_clip": 0.01108626, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01731312, + "balance_loss_mlp": 1.03834343, + "epoch": 0.5379828648729896, + "flos": 24681619687680.0, + "grad_norm": 1.559080956726188, + "language_loss": 0.60268521, + "learning_rate": 1.8511686140877592e-06, + "loss": 0.62406987, + "num_input_tokens_seen": 192556075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 8948, + "time_per_iteration": 2.486721992492676 + }, + { + "auxiliary_loss_clip": 0.01112593, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.0221529, + "balance_loss_mlp": 1.04152977, + "epoch": 0.5380429881256577, + "flos": 22523280284160.0, + "grad_norm": 1.6883046071040144, + "language_loss": 0.7951721, + "learning_rate": 1.8507802350877205e-06, + "loss": 0.816643, + "num_input_tokens_seen": 192575535, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 8949, + "time_per_iteration": 2.504025936126709 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.01955473, + "balance_loss_mlp": 1.03890014, + "epoch": 0.5381031113783256, + "flos": 26979543342720.0, + "grad_norm": 1.5394027339965872, + "language_loss": 0.77871096, + "learning_rate": 1.850391861746111e-06, + "loss": 0.80011374, + "num_input_tokens_seen": 192594490, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 8950, + "time_per_iteration": 2.4836034774780273 + }, + { + "auxiliary_loss_clip": 0.01108112, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.04001009, + "epoch": 0.5381632346309936, + "flos": 24754087376640.0, + "grad_norm": 1.7709921726317892, + "language_loss": 0.72630781, + "learning_rate": 1.8500034940776573e-06, + "loss": 0.74770463, + "num_input_tokens_seen": 192615650, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 8951, + "time_per_iteration": 2.5027382373809814 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.0102749, + "balance_loss_clip": 1.01503229, + "balance_loss_mlp": 1.03817379, + "epoch": 0.5382233578836615, + "flos": 15560058372480.0, + "grad_norm": 1.739294207658579, + "language_loss": 0.75148916, + "learning_rate": 1.849615132097085e-06, + "loss": 0.7728591, + "num_input_tokens_seen": 192633840, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 8952, + "time_per_iteration": 2.423635244369507 + }, + { + "auxiliary_loss_clip": 0.01109041, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01504064, + "balance_loss_mlp": 1.03914118, + "epoch": 0.5382834811363295, + "flos": 25084501608960.0, + "grad_norm": 1.5972619646266322, + "language_loss": 0.79724902, + "learning_rate": 1.8492267758191228e-06, + "loss": 0.81862247, + "num_input_tokens_seen": 192655890, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 8953, + "time_per_iteration": 2.532107353210449 + }, + { + "auxiliary_loss_clip": 0.01106301, + "auxiliary_loss_mlp": 0.01033247, + "balance_loss_clip": 1.01993775, + "balance_loss_mlp": 1.03857923, + "epoch": 0.5383436043889974, + "flos": 13297901685120.0, + "grad_norm": 2.0280242140271336, + "language_loss": 0.80724108, + "learning_rate": 1.8488384252584964e-06, + "loss": 0.82863653, + "num_input_tokens_seen": 192673025, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.67578125, + "step": 8954, + "time_per_iteration": 2.404942512512207 + }, + { + "auxiliary_loss_clip": 0.01112016, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.04119825, + "epoch": 0.5384037276416654, + "flos": 23039388852480.0, + "grad_norm": 2.327007095214437, + "language_loss": 0.76461661, + "learning_rate": 1.8484500804299318e-06, + "loss": 0.78604227, + "num_input_tokens_seen": 192692190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 8955, + "time_per_iteration": 2.511826992034912 + }, + { + "auxiliary_loss_clip": 0.01110374, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02414417, + "balance_loss_mlp": 1.04121125, + "epoch": 0.5384638508943334, + "flos": 20631147552000.0, + "grad_norm": 1.5710344626373696, + "language_loss": 0.7823422, + "learning_rate": 1.8480617413481557e-06, + "loss": 0.80381584, + "num_input_tokens_seen": 192710380, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 8956, + "time_per_iteration": 2.484722375869751 + }, + { + "auxiliary_loss_clip": 0.0103322, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00186145, + "balance_loss_mlp": 1.01120663, + "epoch": 0.5385239741470014, + "flos": 66737683491840.0, + "grad_norm": 0.8559223539778376, + "language_loss": 0.63550651, + "learning_rate": 1.8476734080278932e-06, + "loss": 0.65586865, + "num_input_tokens_seen": 192768995, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.22070312, + "step": 8957, + "time_per_iteration": 3.065546751022339 + }, + { + "auxiliary_loss_clip": 0.01032349, + "auxiliary_loss_mlp": 0.01008296, + "balance_loss_clip": 1.00706863, + "balance_loss_mlp": 1.01029825, + "epoch": 0.5385840973996693, + "flos": 64716058229760.0, + "grad_norm": 0.7038941855074313, + "language_loss": 0.5158186, + "learning_rate": 1.8472850804838705e-06, + "loss": 0.53622508, + "num_input_tokens_seen": 192825585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8958, + "time_per_iteration": 3.0705761909484863 + }, + { + "auxiliary_loss_clip": 0.01115886, + "auxiliary_loss_mlp": 0.01030672, + "balance_loss_clip": 1.01678383, + "balance_loss_mlp": 1.04319501, + "epoch": 0.5386442206523373, + "flos": 26141783460480.0, + "grad_norm": 1.5948521762422991, + "language_loss": 0.77216792, + "learning_rate": 1.8468967587308128e-06, + "loss": 0.79363346, + "num_input_tokens_seen": 192847335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8959, + "time_per_iteration": 2.4907429218292236 + }, + { + "auxiliary_loss_clip": 0.01109786, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0190165, + "balance_loss_mlp": 1.03810203, + "epoch": 0.5387043439050052, + "flos": 18251849635200.0, + "grad_norm": 2.0946376118717493, + "language_loss": 0.83630693, + "learning_rate": 1.8465084427834455e-06, + "loss": 0.85772204, + "num_input_tokens_seen": 192862205, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 8960, + "time_per_iteration": 2.4251809120178223 + }, + { + "auxiliary_loss_clip": 0.01112347, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01780403, + "balance_loss_mlp": 1.0417726, + "epoch": 0.5387644671576732, + "flos": 29788296266880.0, + "grad_norm": 1.575363596920687, + "language_loss": 0.78489578, + "learning_rate": 1.8461201326564933e-06, + "loss": 0.80632377, + "num_input_tokens_seen": 192883695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 8961, + "time_per_iteration": 2.5358235836029053 + }, + { + "auxiliary_loss_clip": 0.01110674, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01921666, + "balance_loss_mlp": 1.04004741, + "epoch": 0.5388245904103413, + "flos": 22374466237440.0, + "grad_norm": 1.7764783659945997, + "language_loss": 0.84602159, + "learning_rate": 1.845731828364681e-06, + "loss": 0.86745036, + "num_input_tokens_seen": 192900190, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 8962, + "time_per_iteration": 2.462369918823242 + }, + { + "auxiliary_loss_clip": 0.01032215, + "auxiliary_loss_mlp": 0.00998189, + "balance_loss_clip": 0.99696141, + "balance_loss_mlp": 1.01020229, + "epoch": 0.5388847136630092, + "flos": 69807794751360.0, + "grad_norm": 0.7323858189394533, + "language_loss": 0.54189092, + "learning_rate": 1.8453435299227333e-06, + "loss": 0.56219494, + "num_input_tokens_seen": 192958675, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.22070312, + "step": 8963, + "time_per_iteration": 3.000844717025757 + }, + { + "auxiliary_loss_clip": 0.01031141, + "auxiliary_loss_mlp": 0.00998281, + "balance_loss_clip": 0.99717277, + "balance_loss_mlp": 1.00911307, + "epoch": 0.5389448369156772, + "flos": 69822303845760.0, + "grad_norm": 0.8055122078658323, + "language_loss": 0.63433194, + "learning_rate": 1.8449552373453744e-06, + "loss": 0.65462613, + "num_input_tokens_seen": 193033135, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.22070312, + "step": 8964, + "time_per_iteration": 3.241182565689087 + }, + { + "auxiliary_loss_clip": 0.01112883, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01782298, + "balance_loss_mlp": 1.03918004, + "epoch": 0.5390049601683451, + "flos": 31722444933120.0, + "grad_norm": 1.532843563745025, + "language_loss": 0.69958258, + "learning_rate": 1.8445669506473287e-06, + "loss": 0.72101814, + "num_input_tokens_seen": 193055570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73828125, + "step": 8965, + "time_per_iteration": 2.524223804473877 + }, + { + "auxiliary_loss_clip": 0.01114315, + "auxiliary_loss_mlp": 0.01035135, + "balance_loss_clip": 1.02103257, + "balance_loss_mlp": 1.04133582, + "epoch": 0.5390650834210131, + "flos": 18113486446080.0, + "grad_norm": 2.362623955664157, + "language_loss": 0.81848061, + "learning_rate": 1.8441786698433192e-06, + "loss": 0.83997512, + "num_input_tokens_seen": 193073120, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 8966, + "time_per_iteration": 2.477625608444214 + }, + { + "auxiliary_loss_clip": 0.01110928, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.01913619, + "balance_loss_mlp": 1.04063606, + "epoch": 0.539125206673681, + "flos": 17416711445760.0, + "grad_norm": 1.8348280049509287, + "language_loss": 0.72713602, + "learning_rate": 1.8437903949480706e-06, + "loss": 0.74856687, + "num_input_tokens_seen": 193090105, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8967, + "time_per_iteration": 2.419088125228882 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.01884913, + "balance_loss_mlp": 1.03676677, + "epoch": 0.539185329926349, + "flos": 22198935450240.0, + "grad_norm": 1.8042691798262989, + "language_loss": 0.81596529, + "learning_rate": 1.8434021259763065e-06, + "loss": 0.83735478, + "num_input_tokens_seen": 193109325, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 8968, + "time_per_iteration": 3.8650004863739014 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0168612, + "balance_loss_mlp": 1.0391978, + "epoch": 0.539245453179017, + "flos": 21434397442560.0, + "grad_norm": 1.5993373110169542, + "language_loss": 0.73938435, + "learning_rate": 1.8430138629427484e-06, + "loss": 0.76080179, + "num_input_tokens_seen": 193130595, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 8969, + "time_per_iteration": 2.485146999359131 + }, + { + "auxiliary_loss_clip": 0.01111919, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.01886833, + "balance_loss_mlp": 1.03785658, + "epoch": 0.539305576431685, + "flos": 20735000749440.0, + "grad_norm": 2.3553854013154907, + "language_loss": 0.82165599, + "learning_rate": 1.8426256058621205e-06, + "loss": 0.84310412, + "num_input_tokens_seen": 193148930, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 8970, + "time_per_iteration": 2.4504613876342773 + }, + { + "auxiliary_loss_clip": 0.01109668, + "auxiliary_loss_mlp": 0.01032979, + "balance_loss_clip": 1.02005112, + "balance_loss_mlp": 1.03989851, + "epoch": 0.5393656996843529, + "flos": 30920452018560.0, + "grad_norm": 1.5328161731771237, + "language_loss": 0.75619417, + "learning_rate": 1.842237354749146e-06, + "loss": 0.77762067, + "num_input_tokens_seen": 193170140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 8971, + "time_per_iteration": 5.434189558029175 + }, + { + "auxiliary_loss_clip": 0.01030677, + "auxiliary_loss_mlp": 0.00999826, + "balance_loss_clip": 0.99856228, + "balance_loss_mlp": 1.00854254, + "epoch": 0.5394258229370209, + "flos": 50317781351040.0, + "grad_norm": 0.8757990223887638, + "language_loss": 0.60310632, + "learning_rate": 1.8418491096185465e-06, + "loss": 0.62341136, + "num_input_tokens_seen": 193227235, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.22167969, + "step": 8972, + "time_per_iteration": 3.070239782333374 + }, + { + "auxiliary_loss_clip": 0.01109336, + "auxiliary_loss_mlp": 0.01044193, + "balance_loss_clip": 1.03085351, + "balance_loss_mlp": 1.0389235, + "epoch": 0.5394859461896888, + "flos": 25411935012480.0, + "grad_norm": 1.4916710753135305, + "language_loss": 0.78427428, + "learning_rate": 1.841460870485045e-06, + "loss": 0.80580956, + "num_input_tokens_seen": 193248435, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 8973, + "time_per_iteration": 2.4841833114624023 + }, + { + "auxiliary_loss_clip": 0.01116334, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02265668, + "balance_loss_mlp": 1.03959453, + "epoch": 0.5395460694423568, + "flos": 25478476957440.0, + "grad_norm": 2.2712479958365304, + "language_loss": 0.73893452, + "learning_rate": 1.8410726373633623e-06, + "loss": 0.76046824, + "num_input_tokens_seen": 193267490, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.765625, + "step": 8974, + "time_per_iteration": 2.5056395530700684 + }, + { + "auxiliary_loss_clip": 0.01029707, + "auxiliary_loss_mlp": 0.01005081, + "balance_loss_clip": 1.00388896, + "balance_loss_mlp": 1.00777423, + "epoch": 0.5396061926950249, + "flos": 53249493507840.0, + "grad_norm": 0.7339193766969773, + "language_loss": 0.51197326, + "learning_rate": 1.8406844102682215e-06, + "loss": 0.53232116, + "num_input_tokens_seen": 193326050, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21972656, + "step": 8975, + "time_per_iteration": 3.0552287101745605 + }, + { + "auxiliary_loss_clip": 0.01110098, + "auxiliary_loss_mlp": 0.01040418, + "balance_loss_clip": 1.02723336, + "balance_loss_mlp": 1.03983927, + "epoch": 0.5396663159476928, + "flos": 26725080418560.0, + "grad_norm": 1.5397959415241314, + "language_loss": 0.71919322, + "learning_rate": 1.840296189214344e-06, + "loss": 0.74069834, + "num_input_tokens_seen": 193348785, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 8976, + "time_per_iteration": 2.5368118286132812 + }, + { + "auxiliary_loss_clip": 0.01112047, + "auxiliary_loss_mlp": 0.01035595, + "balance_loss_clip": 1.02300107, + "balance_loss_mlp": 1.03994215, + "epoch": 0.5397264392003608, + "flos": 23253380127360.0, + "grad_norm": 2.148603673983975, + "language_loss": 0.70274073, + "learning_rate": 1.8399079742164509e-06, + "loss": 0.72421718, + "num_input_tokens_seen": 193367080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 8977, + "time_per_iteration": 2.4685816764831543 + }, + { + "auxiliary_loss_clip": 0.01113255, + "auxiliary_loss_mlp": 0.01034111, + "balance_loss_clip": 1.02102757, + "balance_loss_mlp": 1.04169548, + "epoch": 0.5397865624530287, + "flos": 18294188791680.0, + "grad_norm": 1.656094242871676, + "language_loss": 0.7241326, + "learning_rate": 1.8395197652892636e-06, + "loss": 0.7456063, + "num_input_tokens_seen": 193383715, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 8978, + "time_per_iteration": 2.4495601654052734 + }, + { + "auxiliary_loss_clip": 0.01118429, + "auxiliary_loss_mlp": 0.01032682, + "balance_loss_clip": 1.01778078, + "balance_loss_mlp": 1.04137743, + "epoch": 0.5398466857056967, + "flos": 15297514888320.0, + "grad_norm": 2.582100330429111, + "language_loss": 0.73947239, + "learning_rate": 1.8391315624475028e-06, + "loss": 0.76098353, + "num_input_tokens_seen": 193400560, + "router_z_loss_clip": 0.1484375, + "router_z_loss_mlp": 0.7734375, + "step": 8979, + "time_per_iteration": 2.467693328857422 + }, + { + "auxiliary_loss_clip": 0.0111825, + "auxiliary_loss_mlp": 0.01049486, + "balance_loss_clip": 1.03538978, + "balance_loss_mlp": 1.04216337, + "epoch": 0.5399068089583646, + "flos": 17821748183040.0, + "grad_norm": 2.0456901795615656, + "language_loss": 0.76959479, + "learning_rate": 1.8387433657058892e-06, + "loss": 0.79127216, + "num_input_tokens_seen": 193418680, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.76171875, + "step": 8980, + "time_per_iteration": 2.5299665927886963 + }, + { + "auxiliary_loss_clip": 0.01111255, + "auxiliary_loss_mlp": 0.01035786, + "balance_loss_clip": 1.02332902, + "balance_loss_mlp": 1.0388093, + "epoch": 0.5399669322110326, + "flos": 27381635164800.0, + "grad_norm": 1.6658662418671077, + "language_loss": 0.81773221, + "learning_rate": 1.8383551750791431e-06, + "loss": 0.83920264, + "num_input_tokens_seen": 193439310, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 8981, + "time_per_iteration": 2.593594789505005 + }, + { + "auxiliary_loss_clip": 0.01113866, + "auxiliary_loss_mlp": 0.01032362, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03922904, + "epoch": 0.5400270554637006, + "flos": 20449116403200.0, + "grad_norm": 1.7978808319720327, + "language_loss": 0.66842318, + "learning_rate": 1.8379669905819857e-06, + "loss": 0.68988544, + "num_input_tokens_seen": 193458115, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.74609375, + "step": 8982, + "time_per_iteration": 2.5118813514709473 + }, + { + "auxiliary_loss_clip": 0.01110986, + "auxiliary_loss_mlp": 0.01039664, + "balance_loss_clip": 1.02715898, + "balance_loss_mlp": 1.03987551, + "epoch": 0.5400871787163686, + "flos": 21689578638720.0, + "grad_norm": 1.4560866330096367, + "language_loss": 0.82442951, + "learning_rate": 1.8375788122291358e-06, + "loss": 0.84593606, + "num_input_tokens_seen": 193477365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 8983, + "time_per_iteration": 2.457221269607544 + }, + { + "auxiliary_loss_clip": 0.01110015, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02204108, + "balance_loss_mlp": 1.03799057, + "epoch": 0.5401473019690365, + "flos": 19204739585280.0, + "grad_norm": 1.7289170608138429, + "language_loss": 0.7078771, + "learning_rate": 1.8371906400353138e-06, + "loss": 0.72933447, + "num_input_tokens_seen": 193495595, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 8984, + "time_per_iteration": 2.4523980617523193 + }, + { + "auxiliary_loss_clip": 0.01115801, + "auxiliary_loss_mlp": 0.01034543, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.04127955, + "epoch": 0.5402074252217045, + "flos": 20627376624000.0, + "grad_norm": 1.7555929792269789, + "language_loss": 0.80110276, + "learning_rate": 1.8368024740152386e-06, + "loss": 0.82260621, + "num_input_tokens_seen": 193514035, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7421875, + "step": 8985, + "time_per_iteration": 2.446753740310669 + }, + { + "auxiliary_loss_clip": 0.01104654, + "auxiliary_loss_mlp": 0.0102882, + "balance_loss_clip": 1.01560616, + "balance_loss_mlp": 1.03796721, + "epoch": 0.5402675484743724, + "flos": 24973465691520.0, + "grad_norm": 2.3719765019392844, + "language_loss": 0.78840292, + "learning_rate": 1.83641431418363e-06, + "loss": 0.80973768, + "num_input_tokens_seen": 193535445, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.66796875, + "step": 8986, + "time_per_iteration": 2.5318102836608887 + }, + { + "auxiliary_loss_clip": 0.01109855, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01879263, + "balance_loss_mlp": 1.03847885, + "epoch": 0.5403276717270404, + "flos": 19459022941440.0, + "grad_norm": 1.6989773263518806, + "language_loss": 0.77060419, + "learning_rate": 1.8360261605552075e-06, + "loss": 0.79201555, + "num_input_tokens_seen": 193554780, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 8987, + "time_per_iteration": 2.524240732192993 + }, + { + "auxiliary_loss_clip": 0.01109666, + "auxiliary_loss_mlp": 0.01031219, + "balance_loss_clip": 1.0178858, + "balance_loss_mlp": 1.03889561, + "epoch": 0.5403877949797083, + "flos": 18442140912000.0, + "grad_norm": 2.580263640738581, + "language_loss": 0.71292162, + "learning_rate": 1.8356380131446887e-06, + "loss": 0.73433048, + "num_input_tokens_seen": 193573580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 8988, + "time_per_iteration": 2.4638671875 + }, + { + "auxiliary_loss_clip": 0.01110225, + "auxiliary_loss_mlp": 0.01036979, + "balance_loss_clip": 1.0228405, + "balance_loss_mlp": 1.03822088, + "epoch": 0.5404479182323764, + "flos": 28292868316800.0, + "grad_norm": 2.2630612952232827, + "language_loss": 0.67666376, + "learning_rate": 1.8352498719667934e-06, + "loss": 0.69813585, + "num_input_tokens_seen": 193590490, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.71875, + "step": 8989, + "time_per_iteration": 2.508855104446411 + }, + { + "auxiliary_loss_clip": 0.01111455, + "auxiliary_loss_mlp": 0.01037396, + "balance_loss_clip": 1.02386594, + "balance_loss_mlp": 1.03881633, + "epoch": 0.5405080414850444, + "flos": 23367325046400.0, + "grad_norm": 1.5798861838358007, + "language_loss": 0.77628905, + "learning_rate": 1.8348617370362399e-06, + "loss": 0.79777759, + "num_input_tokens_seen": 193609900, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 8990, + "time_per_iteration": 2.489483118057251 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01028032, + "balance_loss_clip": 1.01594377, + "balance_loss_mlp": 1.03673029, + "epoch": 0.5405681647377123, + "flos": 21106425335040.0, + "grad_norm": 1.5931818725193578, + "language_loss": 0.69039345, + "learning_rate": 1.834473608367745e-06, + "loss": 0.71173859, + "num_input_tokens_seen": 193629775, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 8991, + "time_per_iteration": 2.4418294429779053 + }, + { + "auxiliary_loss_clip": 0.01109673, + "auxiliary_loss_mlp": 0.01035539, + "balance_loss_clip": 1.02171683, + "balance_loss_mlp": 1.03739381, + "epoch": 0.5406282879903803, + "flos": 20449188230400.0, + "grad_norm": 1.7624988623501092, + "language_loss": 0.7614572, + "learning_rate": 1.8340854859760277e-06, + "loss": 0.78290933, + "num_input_tokens_seen": 193648070, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 8992, + "time_per_iteration": 2.4845540523529053 + }, + { + "auxiliary_loss_clip": 0.01110684, + "auxiliary_loss_mlp": 0.01032261, + "balance_loss_clip": 1.01963115, + "balance_loss_mlp": 1.03731656, + "epoch": 0.5406884112430482, + "flos": 14209493973120.0, + "grad_norm": 2.6314606707027304, + "language_loss": 0.76393229, + "learning_rate": 1.8336973698758056e-06, + "loss": 0.78536171, + "num_input_tokens_seen": 193665060, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.734375, + "step": 8993, + "time_per_iteration": 2.4074175357818604 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01033943, + "balance_loss_clip": 1.02129519, + "balance_loss_mlp": 1.03785443, + "epoch": 0.5407485344957162, + "flos": 23875568536320.0, + "grad_norm": 1.6731423627794038, + "language_loss": 0.70444834, + "learning_rate": 1.8333092600817959e-06, + "loss": 0.72585857, + "num_input_tokens_seen": 193683620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 8994, + "time_per_iteration": 2.5207760334014893 + }, + { + "auxiliary_loss_clip": 0.01110631, + "auxiliary_loss_mlp": 0.01031119, + "balance_loss_clip": 1.01729715, + "balance_loss_mlp": 1.03817177, + "epoch": 0.5408086577483842, + "flos": 23148485435520.0, + "grad_norm": 1.7966588085871025, + "language_loss": 0.74846065, + "learning_rate": 1.8329211566087157e-06, + "loss": 0.76987815, + "num_input_tokens_seen": 193702990, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 8995, + "time_per_iteration": 2.468820095062256 + }, + { + "auxiliary_loss_clip": 0.01107091, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02315211, + "balance_loss_mlp": 1.0381844, + "epoch": 0.5408687810010522, + "flos": 18771046773120.0, + "grad_norm": 1.845320286189123, + "language_loss": 0.73867524, + "learning_rate": 1.832533059471282e-06, + "loss": 0.7600975, + "num_input_tokens_seen": 193721785, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 8996, + "time_per_iteration": 2.4607067108154297 + }, + { + "auxiliary_loss_clip": 0.01105028, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02183414, + "balance_loss_mlp": 1.03760076, + "epoch": 0.5409289042537201, + "flos": 13881557779200.0, + "grad_norm": 1.7779086932858201, + "language_loss": 0.73281908, + "learning_rate": 1.8321449686842115e-06, + "loss": 0.75420916, + "num_input_tokens_seen": 193740315, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 8997, + "time_per_iteration": 2.433438301086426 + }, + { + "auxiliary_loss_clip": 0.01109644, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02052116, + "balance_loss_mlp": 1.03904319, + "epoch": 0.5409890275063881, + "flos": 14465357527680.0, + "grad_norm": 2.01233035965423, + "language_loss": 0.71775877, + "learning_rate": 1.8317568842622207e-06, + "loss": 0.73919159, + "num_input_tokens_seen": 193757580, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 8998, + "time_per_iteration": 2.4791901111602783 + }, + { + "auxiliary_loss_clip": 0.01107126, + "auxiliary_loss_mlp": 0.01037885, + "balance_loss_clip": 1.02471876, + "balance_loss_mlp": 1.03724909, + "epoch": 0.541049150759056, + "flos": 48977449349760.0, + "grad_norm": 1.596226887866337, + "language_loss": 0.70601052, + "learning_rate": 1.8313688062200256e-06, + "loss": 0.72746068, + "num_input_tokens_seen": 193780965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 8999, + "time_per_iteration": 2.6774816513061523 + }, + { + "auxiliary_loss_clip": 0.01106234, + "auxiliary_loss_mlp": 0.01035678, + "balance_loss_clip": 1.0222373, + "balance_loss_mlp": 1.03789854, + "epoch": 0.541109274011724, + "flos": 18147601388160.0, + "grad_norm": 2.5727427903087716, + "language_loss": 0.80433559, + "learning_rate": 1.8309807345723422e-06, + "loss": 0.8257547, + "num_input_tokens_seen": 193797855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.68359375, + "step": 9000, + "time_per_iteration": 2.4608795642852783 + }, + { + "auxiliary_loss_clip": 0.0110639, + "auxiliary_loss_mlp": 0.01029527, + "balance_loss_clip": 1.01646805, + "balance_loss_mlp": 1.03770971, + "epoch": 0.541169397264392, + "flos": 20522553759360.0, + "grad_norm": 1.4688376580267075, + "language_loss": 0.72885478, + "learning_rate": 1.8305926693338863e-06, + "loss": 0.75021398, + "num_input_tokens_seen": 193817375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 9001, + "time_per_iteration": 2.469433069229126 + }, + { + "auxiliary_loss_clip": 0.01112566, + "auxiliary_loss_mlp": 0.01035385, + "balance_loss_clip": 1.0213902, + "balance_loss_mlp": 1.03844023, + "epoch": 0.54122952051706, + "flos": 20044043752320.0, + "grad_norm": 2.257759724972284, + "language_loss": 0.85127461, + "learning_rate": 1.8302046105193734e-06, + "loss": 0.87275422, + "num_input_tokens_seen": 193832205, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.7421875, + "step": 9002, + "time_per_iteration": 2.4405739307403564 + }, + { + "auxiliary_loss_clip": 0.01107037, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.02020574, + "balance_loss_mlp": 1.0384078, + "epoch": 0.541289643769728, + "flos": 19062246332160.0, + "grad_norm": 1.7125809204353786, + "language_loss": 0.77755821, + "learning_rate": 1.8298165581435183e-06, + "loss": 0.79894257, + "num_input_tokens_seen": 193849830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 9003, + "time_per_iteration": 2.451507806777954 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01029055, + "balance_loss_clip": 1.01557827, + "balance_loss_mlp": 1.03640234, + "epoch": 0.5413497670223959, + "flos": 22382295402240.0, + "grad_norm": 2.168361582224207, + "language_loss": 0.69784325, + "learning_rate": 1.8294285122210372e-06, + "loss": 0.71918762, + "num_input_tokens_seen": 193869945, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 9004, + "time_per_iteration": 2.613961935043335 + }, + { + "auxiliary_loss_clip": 0.01028073, + "auxiliary_loss_mlp": 0.01010119, + "balance_loss_clip": 1.00899816, + "balance_loss_mlp": 1.00624812, + "epoch": 0.5414098902750639, + "flos": 70031734093440.0, + "grad_norm": 0.9677352946959291, + "language_loss": 0.59124619, + "learning_rate": 1.8290404727666434e-06, + "loss": 0.61162812, + "num_input_tokens_seen": 193930860, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21875, + "step": 9005, + "time_per_iteration": 3.175964832305908 + }, + { + "auxiliary_loss_clip": 0.01110665, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02183771, + "balance_loss_mlp": 1.03938627, + "epoch": 0.5414700135277318, + "flos": 21798962530560.0, + "grad_norm": 1.6968329328942213, + "language_loss": 0.77685302, + "learning_rate": 1.8286524397950517e-06, + "loss": 0.79829788, + "num_input_tokens_seen": 193949075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 9006, + "time_per_iteration": 2.455742359161377 + }, + { + "auxiliary_loss_clip": 0.01104494, + "auxiliary_loss_mlp": 0.01032845, + "balance_loss_clip": 1.02205062, + "balance_loss_mlp": 1.03625751, + "epoch": 0.5415301367803999, + "flos": 16907929251840.0, + "grad_norm": 1.624690870596759, + "language_loss": 0.82998371, + "learning_rate": 1.8282644133209777e-06, + "loss": 0.8513571, + "num_input_tokens_seen": 193967630, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.68359375, + "step": 9007, + "time_per_iteration": 2.4356093406677246 + }, + { + "auxiliary_loss_clip": 0.01107937, + "auxiliary_loss_mlp": 0.01030224, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03761423, + "epoch": 0.5415902600330678, + "flos": 25704176065920.0, + "grad_norm": 2.1377427178959434, + "language_loss": 0.67209023, + "learning_rate": 1.8278763933591334e-06, + "loss": 0.69347185, + "num_input_tokens_seen": 193988730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9008, + "time_per_iteration": 2.5489509105682373 + }, + { + "auxiliary_loss_clip": 0.01111879, + "auxiliary_loss_mlp": 0.01031733, + "balance_loss_clip": 1.01810145, + "balance_loss_mlp": 1.03802204, + "epoch": 0.5416503832857358, + "flos": 19208151377280.0, + "grad_norm": 2.189253604566193, + "language_loss": 0.74129766, + "learning_rate": 1.827488379924234e-06, + "loss": 0.76273382, + "num_input_tokens_seen": 194005160, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7421875, + "step": 9009, + "time_per_iteration": 3.8252077102661133 + }, + { + "auxiliary_loss_clip": 0.01110449, + "auxiliary_loss_mlp": 0.01034408, + "balance_loss_clip": 1.02109861, + "balance_loss_mlp": 1.03791738, + "epoch": 0.5417105065384037, + "flos": 12713706887040.0, + "grad_norm": 2.141173328238238, + "language_loss": 0.87482637, + "learning_rate": 1.8271003730309923e-06, + "loss": 0.89627492, + "num_input_tokens_seen": 194021700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 9010, + "time_per_iteration": 2.4628190994262695 + }, + { + "auxiliary_loss_clip": 0.01106778, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02007151, + "balance_loss_mlp": 1.03684556, + "epoch": 0.5417706297910717, + "flos": 30335933998080.0, + "grad_norm": 1.9800903494769417, + "language_loss": 0.64830345, + "learning_rate": 1.826712372694122e-06, + "loss": 0.66969872, + "num_input_tokens_seen": 194042620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9011, + "time_per_iteration": 2.530303955078125 + }, + { + "auxiliary_loss_clip": 0.01109504, + "auxiliary_loss_mlp": 0.01036995, + "balance_loss_clip": 1.02463341, + "balance_loss_mlp": 1.03945065, + "epoch": 0.5418307530437396, + "flos": 29020992912000.0, + "grad_norm": 3.61342010762258, + "language_loss": 0.79000378, + "learning_rate": 1.8263243789283362e-06, + "loss": 0.81146884, + "num_input_tokens_seen": 194061800, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9012, + "time_per_iteration": 5.477705240249634 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01720369, + "balance_loss_mlp": 1.0364089, + "epoch": 0.5418908762964076, + "flos": 16873455173760.0, + "grad_norm": 1.7419259634167055, + "language_loss": 0.74031919, + "learning_rate": 1.8259363917483466e-06, + "loss": 0.76168299, + "num_input_tokens_seen": 194079890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9013, + "time_per_iteration": 3.8720171451568604 + }, + { + "auxiliary_loss_clip": 0.01109547, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.01657844, + "balance_loss_mlp": 1.0367403, + "epoch": 0.5419509995490756, + "flos": 18949702043520.0, + "grad_norm": 2.040050456437719, + "language_loss": 0.72289932, + "learning_rate": 1.8255484111688667e-06, + "loss": 0.74429148, + "num_input_tokens_seen": 194097625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 9014, + "time_per_iteration": 2.436251640319824 + }, + { + "auxiliary_loss_clip": 0.01108382, + "auxiliary_loss_mlp": 0.01031413, + "balance_loss_clip": 1.01889062, + "balance_loss_mlp": 1.03802454, + "epoch": 0.5420111228017436, + "flos": 18077719478400.0, + "grad_norm": 1.601636110073364, + "language_loss": 0.80585766, + "learning_rate": 1.8251604372046085e-06, + "loss": 0.82725561, + "num_input_tokens_seen": 194116055, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9015, + "time_per_iteration": 2.4523091316223145 + }, + { + "auxiliary_loss_clip": 0.01112438, + "auxiliary_loss_mlp": 0.0103619, + "balance_loss_clip": 1.02298188, + "balance_loss_mlp": 1.03929543, + "epoch": 0.5420712460544116, + "flos": 19061779455360.0, + "grad_norm": 3.6814275573944717, + "language_loss": 0.81413746, + "learning_rate": 1.8247724698702843e-06, + "loss": 0.83562374, + "num_input_tokens_seen": 194130365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9016, + "time_per_iteration": 2.4310686588287354 + }, + { + "auxiliary_loss_clip": 0.01107219, + "auxiliary_loss_mlp": 0.01030151, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.03753281, + "epoch": 0.5421313693070795, + "flos": 18187103370240.0, + "grad_norm": 2.1017981350927646, + "language_loss": 0.81103092, + "learning_rate": 1.8243845091806053e-06, + "loss": 0.83240461, + "num_input_tokens_seen": 194148975, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9017, + "time_per_iteration": 2.427536725997925 + }, + { + "auxiliary_loss_clip": 0.01104389, + "auxiliary_loss_mlp": 0.01029579, + "balance_loss_clip": 1.01719928, + "balance_loss_mlp": 1.03666961, + "epoch": 0.5421914925597475, + "flos": 13005947940480.0, + "grad_norm": 1.7397815948262747, + "language_loss": 0.77372575, + "learning_rate": 1.8239965551502837e-06, + "loss": 0.79506552, + "num_input_tokens_seen": 194167185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9018, + "time_per_iteration": 2.4533066749572754 + }, + { + "auxiliary_loss_clip": 0.01107196, + "auxiliary_loss_mlp": 0.01037507, + "balance_loss_clip": 1.02436996, + "balance_loss_mlp": 1.03481603, + "epoch": 0.5422516158124154, + "flos": 46758457831680.0, + "grad_norm": 1.448924926163926, + "language_loss": 0.66352963, + "learning_rate": 1.8236086077940303e-06, + "loss": 0.68497658, + "num_input_tokens_seen": 194192840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9019, + "time_per_iteration": 2.6830832958221436 + }, + { + "auxiliary_loss_clip": 0.01103655, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.0157038, + "balance_loss_mlp": 1.03604794, + "epoch": 0.5423117390650835, + "flos": 31758642864000.0, + "grad_norm": 1.5485094933207573, + "language_loss": 0.69635725, + "learning_rate": 1.8232206671265555e-06, + "loss": 0.71766162, + "num_input_tokens_seen": 194213150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9020, + "time_per_iteration": 2.5516250133514404 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01036657, + "balance_loss_clip": 1.02415812, + "balance_loss_mlp": 1.03544152, + "epoch": 0.5423718623177514, + "flos": 27201974313600.0, + "grad_norm": 1.4647880942088878, + "language_loss": 0.80443847, + "learning_rate": 1.8228327331625717e-06, + "loss": 0.825822, + "num_input_tokens_seen": 194234665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 9021, + "time_per_iteration": 2.52411150932312 + }, + { + "auxiliary_loss_clip": 0.01107355, + "auxiliary_loss_mlp": 0.01033488, + "balance_loss_clip": 1.02107835, + "balance_loss_mlp": 1.03812504, + "epoch": 0.5424319855704194, + "flos": 23546447193600.0, + "grad_norm": 1.483970922248673, + "language_loss": 0.78272343, + "learning_rate": 1.822444805916788e-06, + "loss": 0.80413187, + "num_input_tokens_seen": 194253790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9022, + "time_per_iteration": 2.4745841026306152 + }, + { + "auxiliary_loss_clip": 0.01104936, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02170706, + "balance_loss_mlp": 1.03559494, + "epoch": 0.5424921088230873, + "flos": 26615624699520.0, + "grad_norm": 1.6624827413591161, + "language_loss": 0.82107073, + "learning_rate": 1.822056885403915e-06, + "loss": 0.84245884, + "num_input_tokens_seen": 194274950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9023, + "time_per_iteration": 2.4953298568725586 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01670718, + "balance_loss_mlp": 1.03815961, + "epoch": 0.5425522320757553, + "flos": 23586811102080.0, + "grad_norm": 1.8210142178846183, + "language_loss": 0.71515894, + "learning_rate": 1.8216689716386627e-06, + "loss": 0.73651719, + "num_input_tokens_seen": 194296155, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9024, + "time_per_iteration": 2.512596368789673 + }, + { + "auxiliary_loss_clip": 0.01107389, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01878023, + "balance_loss_mlp": 1.03640127, + "epoch": 0.5426123553284232, + "flos": 30592264429440.0, + "grad_norm": 1.659326462636006, + "language_loss": 0.64976329, + "learning_rate": 1.8212810646357405e-06, + "loss": 0.67114621, + "num_input_tokens_seen": 194318025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9025, + "time_per_iteration": 2.512734889984131 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01776159, + "balance_loss_mlp": 1.0378685, + "epoch": 0.5426724785810912, + "flos": 12495118671360.0, + "grad_norm": 6.402510966233504, + "language_loss": 0.74099922, + "learning_rate": 1.8208931644098591e-06, + "loss": 0.76238489, + "num_input_tokens_seen": 194336150, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9026, + "time_per_iteration": 2.42434024810791 + }, + { + "auxiliary_loss_clip": 0.01107364, + "auxiliary_loss_mlp": 0.01040251, + "balance_loss_clip": 1.02587438, + "balance_loss_mlp": 1.03585124, + "epoch": 0.5427326018337592, + "flos": 26064611089920.0, + "grad_norm": 1.637995325273745, + "language_loss": 0.78638506, + "learning_rate": 1.8205052709757265e-06, + "loss": 0.80786121, + "num_input_tokens_seen": 194355980, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.71484375, + "step": 9027, + "time_per_iteration": 2.488490104675293 + }, + { + "auxiliary_loss_clip": 0.01029187, + "auxiliary_loss_mlp": 0.01006045, + "balance_loss_clip": 1.00479341, + "balance_loss_mlp": 1.00745916, + "epoch": 0.5427927250864272, + "flos": 65984745576960.0, + "grad_norm": 0.7366554152868067, + "language_loss": 0.56548405, + "learning_rate": 1.8201173843480515e-06, + "loss": 0.58583641, + "num_input_tokens_seen": 194422660, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21679688, + "step": 9028, + "time_per_iteration": 3.0799479484558105 + }, + { + "auxiliary_loss_clip": 0.01108987, + "auxiliary_loss_mlp": 0.01030483, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03760409, + "epoch": 0.5428528483390952, + "flos": 19975382904960.0, + "grad_norm": 2.289578054979344, + "language_loss": 0.7793408, + "learning_rate": 1.8197295045415442e-06, + "loss": 0.80073547, + "num_input_tokens_seen": 194438545, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71484375, + "step": 9029, + "time_per_iteration": 2.454566478729248 + }, + { + "auxiliary_loss_clip": 0.01105649, + "auxiliary_loss_mlp": 0.01027546, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03734791, + "epoch": 0.5429129715917631, + "flos": 21832323287040.0, + "grad_norm": 1.5369423730734595, + "language_loss": 0.83306921, + "learning_rate": 1.8193416315709112e-06, + "loss": 0.85440123, + "num_input_tokens_seen": 194458060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 9030, + "time_per_iteration": 2.4675095081329346 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01028416, + "balance_loss_clip": 1.01676893, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5429730948444311, + "flos": 27782685492480.0, + "grad_norm": 1.5422544284751551, + "language_loss": 0.74720484, + "learning_rate": 1.8189537654508623e-06, + "loss": 0.76854396, + "num_input_tokens_seen": 194477405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9031, + "time_per_iteration": 2.4871413707733154 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01883435, + "balance_loss_mlp": 1.03710687, + "epoch": 0.543033218097099, + "flos": 26760452336640.0, + "grad_norm": 1.9031998711979703, + "language_loss": 0.85544586, + "learning_rate": 1.8185659061961045e-06, + "loss": 0.87678427, + "num_input_tokens_seen": 194497085, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9032, + "time_per_iteration": 2.492750406265259 + }, + { + "auxiliary_loss_clip": 0.01110136, + "auxiliary_loss_mlp": 0.01029381, + "balance_loss_clip": 1.01670289, + "balance_loss_mlp": 1.03757548, + "epoch": 0.5430933413497671, + "flos": 22675254727680.0, + "grad_norm": 1.71218946587007, + "language_loss": 0.73568988, + "learning_rate": 1.8181780538213457e-06, + "loss": 0.75708508, + "num_input_tokens_seen": 194516785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9033, + "time_per_iteration": 2.458281993865967 + }, + { + "auxiliary_loss_clip": 0.01106249, + "auxiliary_loss_mlp": 0.01033307, + "balance_loss_clip": 1.02057564, + "balance_loss_mlp": 1.03709424, + "epoch": 0.543153464602435, + "flos": 24607499973120.0, + "grad_norm": 1.6976408638259588, + "language_loss": 0.75797909, + "learning_rate": 1.8177902083412935e-06, + "loss": 0.77937472, + "num_input_tokens_seen": 194536475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9034, + "time_per_iteration": 2.491690158843994 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01031202, + "balance_loss_clip": 1.01932836, + "balance_loss_mlp": 1.03710067, + "epoch": 0.543213587855103, + "flos": 19025725178880.0, + "grad_norm": 1.7098309272106547, + "language_loss": 0.84488094, + "learning_rate": 1.817402369770655e-06, + "loss": 0.86624634, + "num_input_tokens_seen": 194554495, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9035, + "time_per_iteration": 2.4352262020111084 + }, + { + "auxiliary_loss_clip": 0.01028064, + "auxiliary_loss_mlp": 0.01007827, + "balance_loss_clip": 1.00669503, + "balance_loss_mlp": 1.00628209, + "epoch": 0.5432737111077709, + "flos": 65686435125120.0, + "grad_norm": 0.7231810753813949, + "language_loss": 0.55908412, + "learning_rate": 1.8170145381241364e-06, + "loss": 0.57944304, + "num_input_tokens_seen": 194617620, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.21777344, + "step": 9036, + "time_per_iteration": 3.041694402694702 + }, + { + "auxiliary_loss_clip": 0.01108199, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02147961, + "balance_loss_mlp": 1.03686309, + "epoch": 0.5433338343604389, + "flos": 22091670460800.0, + "grad_norm": 1.5099374695532384, + "language_loss": 0.75264686, + "learning_rate": 1.8166267134164451e-06, + "loss": 0.77407253, + "num_input_tokens_seen": 194637690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9037, + "time_per_iteration": 2.4950051307678223 + }, + { + "auxiliary_loss_clip": 0.01106194, + "auxiliary_loss_mlp": 0.01035411, + "balance_loss_clip": 1.02301288, + "balance_loss_mlp": 1.03557479, + "epoch": 0.5433939576131068, + "flos": 34672649616000.0, + "grad_norm": 1.5216693219084618, + "language_loss": 0.66438931, + "learning_rate": 1.8162388956622875e-06, + "loss": 0.68580532, + "num_input_tokens_seen": 194659520, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9038, + "time_per_iteration": 2.559807777404785 + }, + { + "auxiliary_loss_clip": 0.01103453, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.0184598, + "balance_loss_mlp": 1.03513312, + "epoch": 0.5434540808657748, + "flos": 20303355012480.0, + "grad_norm": 1.8787316560909988, + "language_loss": 0.78100199, + "learning_rate": 1.8158510848763692e-06, + "loss": 0.80233729, + "num_input_tokens_seen": 194677645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9039, + "time_per_iteration": 2.4654388427734375 + }, + { + "auxiliary_loss_clip": 0.01106931, + "auxiliary_loss_mlp": 0.01032936, + "balance_loss_clip": 1.02066386, + "balance_loss_mlp": 1.03744531, + "epoch": 0.5435142041184428, + "flos": 23112790295040.0, + "grad_norm": 1.8309305249268624, + "language_loss": 0.76449573, + "learning_rate": 1.8154632810733962e-06, + "loss": 0.78589433, + "num_input_tokens_seen": 194697400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9040, + "time_per_iteration": 2.527614116668701 + }, + { + "auxiliary_loss_clip": 0.0102829, + "auxiliary_loss_mlp": 0.01001895, + "balance_loss_clip": 1.00074422, + "balance_loss_mlp": 1.0065496, + "epoch": 0.5435743273711108, + "flos": 64012746954240.0, + "grad_norm": 0.6649082596858222, + "language_loss": 0.52501261, + "learning_rate": 1.815075484268074e-06, + "loss": 0.54531443, + "num_input_tokens_seen": 194761205, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.21777344, + "step": 9041, + "time_per_iteration": 3.0513055324554443 + }, + { + "auxiliary_loss_clip": 0.01105303, + "auxiliary_loss_mlp": 0.01036545, + "balance_loss_clip": 1.02383089, + "balance_loss_mlp": 1.03610432, + "epoch": 0.5436344506237788, + "flos": 25118903859840.0, + "grad_norm": 1.5670483715805776, + "language_loss": 0.76206207, + "learning_rate": 1.8146876944751078e-06, + "loss": 0.78348053, + "num_input_tokens_seen": 194782445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9042, + "time_per_iteration": 2.4679293632507324 + }, + { + "auxiliary_loss_clip": 0.01102475, + "auxiliary_loss_mlp": 0.01031619, + "balance_loss_clip": 1.02001429, + "balance_loss_mlp": 1.03483939, + "epoch": 0.5436945738764467, + "flos": 19572967860480.0, + "grad_norm": 1.637929025007711, + "language_loss": 0.67479855, + "learning_rate": 1.8142999117092033e-06, + "loss": 0.69613945, + "num_input_tokens_seen": 194800325, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 9043, + "time_per_iteration": 2.469393730163574 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.01031755, + "balance_loss_clip": 1.019876, + "balance_loss_mlp": 1.03556848, + "epoch": 0.5437546971291147, + "flos": 21142515525120.0, + "grad_norm": 1.6229792564391676, + "language_loss": 0.8417449, + "learning_rate": 1.8139121359850644e-06, + "loss": 0.86307919, + "num_input_tokens_seen": 194818675, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9044, + "time_per_iteration": 2.4827311038970947 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01026732, + "balance_loss_clip": 1.01375592, + "balance_loss_mlp": 1.03744245, + "epoch": 0.5438148203817826, + "flos": 25118688378240.0, + "grad_norm": 4.385221285903045, + "language_loss": 0.6211096, + "learning_rate": 1.8135243673173956e-06, + "loss": 0.6424917, + "num_input_tokens_seen": 194836595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7421875, + "step": 9045, + "time_per_iteration": 2.5340473651885986 + }, + { + "auxiliary_loss_clip": 0.01108322, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01814771, + "balance_loss_mlp": 1.03780746, + "epoch": 0.5438749436344507, + "flos": 23002939526400.0, + "grad_norm": 1.4286240482824728, + "language_loss": 0.69942701, + "learning_rate": 1.8131366057209023e-06, + "loss": 0.72081935, + "num_input_tokens_seen": 194857520, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9046, + "time_per_iteration": 2.4620296955108643 + }, + { + "auxiliary_loss_clip": 0.01104565, + "auxiliary_loss_mlp": 0.01027743, + "balance_loss_clip": 1.01592338, + "balance_loss_mlp": 1.03681147, + "epoch": 0.5439350668871186, + "flos": 15487016065920.0, + "grad_norm": 2.1944623143587667, + "language_loss": 0.77171725, + "learning_rate": 1.8127488512102868e-06, + "loss": 0.79304034, + "num_input_tokens_seen": 194876020, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9047, + "time_per_iteration": 2.4618160724639893 + }, + { + "auxiliary_loss_clip": 0.01107988, + "auxiliary_loss_mlp": 0.0103533, + "balance_loss_clip": 1.0232358, + "balance_loss_mlp": 1.03817999, + "epoch": 0.5439951901397866, + "flos": 17238415311360.0, + "grad_norm": 1.7709524835714412, + "language_loss": 0.72530591, + "learning_rate": 1.8123611038002547e-06, + "loss": 0.74673903, + "num_input_tokens_seen": 194894650, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9048, + "time_per_iteration": 2.43306827545166 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0103279, + "balance_loss_clip": 1.01999831, + "balance_loss_mlp": 1.03979266, + "epoch": 0.5440553133924545, + "flos": 18661016436480.0, + "grad_norm": 2.1212679973875805, + "language_loss": 0.93380594, + "learning_rate": 1.8119733635055076e-06, + "loss": 0.95521486, + "num_input_tokens_seen": 194911935, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 9049, + "time_per_iteration": 2.4344465732574463 + }, + { + "auxiliary_loss_clip": 0.01102747, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01810968, + "balance_loss_mlp": 1.0347991, + "epoch": 0.5441154366451225, + "flos": 27122934435840.0, + "grad_norm": 1.8375314287256255, + "language_loss": 0.73678643, + "learning_rate": 1.8115856303407492e-06, + "loss": 0.75810736, + "num_input_tokens_seen": 194931620, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9050, + "time_per_iteration": 2.491992473602295 + }, + { + "auxiliary_loss_clip": 0.01109361, + "auxiliary_loss_mlp": 0.01028722, + "balance_loss_clip": 1.01630008, + "balance_loss_mlp": 1.0390985, + "epoch": 0.5441755598977904, + "flos": 25993867253760.0, + "grad_norm": 1.7129729573051025, + "language_loss": 0.67238903, + "learning_rate": 1.8111979043206832e-06, + "loss": 0.69376987, + "num_input_tokens_seen": 194952560, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9051, + "time_per_iteration": 3.862109661102295 + }, + { + "auxiliary_loss_clip": 0.01104183, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.0174253, + "balance_loss_mlp": 1.03553367, + "epoch": 0.5442356831504584, + "flos": 32380041173760.0, + "grad_norm": 1.6461015999412698, + "language_loss": 0.67748392, + "learning_rate": 1.810810185460011e-06, + "loss": 0.6988188, + "num_input_tokens_seen": 194973915, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9052, + "time_per_iteration": 2.5398967266082764 + }, + { + "auxiliary_loss_clip": 0.01108274, + "auxiliary_loss_mlp": 0.010316, + "balance_loss_clip": 1.01914227, + "balance_loss_mlp": 1.03725493, + "epoch": 0.5442958064031264, + "flos": 24164290056960.0, + "grad_norm": 1.7506645402052365, + "language_loss": 0.92625535, + "learning_rate": 1.810422473773436e-06, + "loss": 0.94765407, + "num_input_tokens_seen": 194990170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9053, + "time_per_iteration": 2.4675142765045166 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02233112, + "balance_loss_mlp": 1.03685415, + "epoch": 0.5443559296557944, + "flos": 18764690065920.0, + "grad_norm": 2.7890591975918206, + "language_loss": 0.83447516, + "learning_rate": 1.8100347692756595e-06, + "loss": 0.85590339, + "num_input_tokens_seen": 195006395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9054, + "time_per_iteration": 5.314599275588989 + }, + { + "auxiliary_loss_clip": 0.01109858, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02034652, + "balance_loss_mlp": 1.04010189, + "epoch": 0.5444160529084624, + "flos": 22632556435200.0, + "grad_norm": 2.3459133888285564, + "language_loss": 0.68981498, + "learning_rate": 1.8096470719813836e-06, + "loss": 0.71124029, + "num_input_tokens_seen": 195025080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9055, + "time_per_iteration": 3.926511287689209 + }, + { + "auxiliary_loss_clip": 0.01028465, + "auxiliary_loss_mlp": 0.00999723, + "balance_loss_clip": 0.99868602, + "balance_loss_mlp": 1.00688159, + "epoch": 0.5444761761611303, + "flos": 69671909600640.0, + "grad_norm": 0.7309752042107527, + "language_loss": 0.57659, + "learning_rate": 1.80925938190531e-06, + "loss": 0.59687185, + "num_input_tokens_seen": 195085725, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.21582031, + "step": 9056, + "time_per_iteration": 3.0622963905334473 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01665783, + "balance_loss_mlp": 1.03565168, + "epoch": 0.5445362994137983, + "flos": 14278442129280.0, + "grad_norm": 1.7313106745452744, + "language_loss": 0.69337952, + "learning_rate": 1.8088716990621395e-06, + "loss": 0.71474266, + "num_input_tokens_seen": 195102585, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9057, + "time_per_iteration": 2.4510855674743652 + }, + { + "auxiliary_loss_clip": 0.01106022, + "auxiliary_loss_mlp": 0.01035977, + "balance_loss_clip": 1.02320337, + "balance_loss_mlp": 1.03730392, + "epoch": 0.5445964226664662, + "flos": 28986195611520.0, + "grad_norm": 2.1714933584662615, + "language_loss": 0.7508406, + "learning_rate": 1.8084840234665738e-06, + "loss": 0.77226055, + "num_input_tokens_seen": 195120055, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9058, + "time_per_iteration": 2.526362419128418 + }, + { + "auxiliary_loss_clip": 0.01028725, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00100732, + "balance_loss_mlp": 1.00713301, + "epoch": 0.5446565459191343, + "flos": 68620230270720.0, + "grad_norm": 0.7971345769694276, + "language_loss": 0.62662959, + "learning_rate": 1.808096355133312e-06, + "loss": 0.64693761, + "num_input_tokens_seen": 195181045, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.21582031, + "step": 9059, + "time_per_iteration": 3.1505026817321777 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.01862383, + "balance_loss_mlp": 1.03710485, + "epoch": 0.5447166691718022, + "flos": 16216469464320.0, + "grad_norm": 1.9373576881408119, + "language_loss": 0.791785, + "learning_rate": 1.8077086940770572e-06, + "loss": 0.81314969, + "num_input_tokens_seen": 195198840, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9060, + "time_per_iteration": 2.4754552841186523 + }, + { + "auxiliary_loss_clip": 0.01106659, + "auxiliary_loss_mlp": 0.01033204, + "balance_loss_clip": 1.02058554, + "balance_loss_mlp": 1.03625464, + "epoch": 0.5447767924244702, + "flos": 25849039616640.0, + "grad_norm": 1.604299719110434, + "language_loss": 0.7939564, + "learning_rate": 1.8073210403125072e-06, + "loss": 0.81535506, + "num_input_tokens_seen": 195218720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9061, + "time_per_iteration": 2.556467056274414 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01026453, + "balance_loss_clip": 1.0152173, + "balance_loss_mlp": 1.03701198, + "epoch": 0.5448369156771381, + "flos": 19677718897920.0, + "grad_norm": 1.7809339372629867, + "language_loss": 0.87091219, + "learning_rate": 1.8069333938543627e-06, + "loss": 0.89222574, + "num_input_tokens_seen": 195235770, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9062, + "time_per_iteration": 2.4758143424987793 + }, + { + "auxiliary_loss_clip": 0.01111266, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.01959074, + "balance_loss_mlp": 1.03804517, + "epoch": 0.5448970389298061, + "flos": 19281804215040.0, + "grad_norm": 1.9589069040824287, + "language_loss": 0.82366961, + "learning_rate": 1.8065457547173233e-06, + "loss": 0.84511185, + "num_input_tokens_seen": 195254870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 9063, + "time_per_iteration": 2.4351277351379395 + }, + { + "auxiliary_loss_clip": 0.01106592, + "auxiliary_loss_mlp": 0.01028534, + "balance_loss_clip": 1.01580811, + "balance_loss_mlp": 1.0372479, + "epoch": 0.544957162182474, + "flos": 20991690316800.0, + "grad_norm": 1.809751627458355, + "language_loss": 0.63477433, + "learning_rate": 1.8061581229160878e-06, + "loss": 0.65612566, + "num_input_tokens_seen": 195273390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9064, + "time_per_iteration": 2.5002574920654297 + }, + { + "auxiliary_loss_clip": 0.01108938, + "auxiliary_loss_mlp": 0.01031086, + "balance_loss_clip": 1.01844406, + "balance_loss_mlp": 1.0378474, + "epoch": 0.545017285435142, + "flos": 25374587846400.0, + "grad_norm": 1.5950372697964212, + "language_loss": 0.79787326, + "learning_rate": 1.8057704984653566e-06, + "loss": 0.81927347, + "num_input_tokens_seen": 195295635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9065, + "time_per_iteration": 2.485886335372925 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01029589, + "balance_loss_clip": 1.01893747, + "balance_loss_mlp": 1.03695667, + "epoch": 0.54507740868781, + "flos": 19134749934720.0, + "grad_norm": 1.9866274876050938, + "language_loss": 0.78143919, + "learning_rate": 1.805382881379827e-06, + "loss": 0.80277526, + "num_input_tokens_seen": 195312545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9066, + "time_per_iteration": 2.4608097076416016 + }, + { + "auxiliary_loss_clip": 0.0110724, + "auxiliary_loss_mlp": 0.0102858, + "balance_loss_clip": 1.0161345, + "balance_loss_mlp": 1.03510523, + "epoch": 0.545137531940478, + "flos": 26249802635520.0, + "grad_norm": 1.7709941680506742, + "language_loss": 0.75842655, + "learning_rate": 1.8049952716741975e-06, + "loss": 0.7797848, + "num_input_tokens_seen": 195332955, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9067, + "time_per_iteration": 2.4940598011016846 + }, + { + "auxiliary_loss_clip": 0.01114286, + "auxiliary_loss_mlp": 0.01035797, + "balance_loss_clip": 1.02152777, + "balance_loss_mlp": 1.0393995, + "epoch": 0.545197655193146, + "flos": 37555629995520.0, + "grad_norm": 2.2574843156274, + "language_loss": 0.63637972, + "learning_rate": 1.8046076693631682e-06, + "loss": 0.65788054, + "num_input_tokens_seen": 195355930, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.75, + "step": 9068, + "time_per_iteration": 2.570791244506836 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02378917, + "balance_loss_mlp": 1.03860283, + "epoch": 0.5452577784458139, + "flos": 26031250333440.0, + "grad_norm": 1.608624941379858, + "language_loss": 0.7232843, + "learning_rate": 1.8042200744614343e-06, + "loss": 0.74469984, + "num_input_tokens_seen": 195376445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9069, + "time_per_iteration": 2.49194073677063 + }, + { + "auxiliary_loss_clip": 0.01105915, + "auxiliary_loss_mlp": 0.01029861, + "balance_loss_clip": 1.01882815, + "balance_loss_mlp": 1.03988457, + "epoch": 0.5453179016984819, + "flos": 17639034675840.0, + "grad_norm": 1.7038570560603954, + "language_loss": 0.74060583, + "learning_rate": 1.8038324869836957e-06, + "loss": 0.76196355, + "num_input_tokens_seen": 195393725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9070, + "time_per_iteration": 2.4085381031036377 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02016675, + "balance_loss_mlp": 1.035869, + "epoch": 0.5453780249511498, + "flos": 23216679406080.0, + "grad_norm": 1.9518916968876514, + "language_loss": 0.60487843, + "learning_rate": 1.8034449069446489e-06, + "loss": 0.62623858, + "num_input_tokens_seen": 195411380, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9071, + "time_per_iteration": 2.4736368656158447 + }, + { + "auxiliary_loss_clip": 0.01029891, + "auxiliary_loss_mlp": 0.01009543, + "balance_loss_clip": 1.00851762, + "balance_loss_mlp": 1.00855255, + "epoch": 0.5454381482038179, + "flos": 68696504801280.0, + "grad_norm": 0.702361481728272, + "language_loss": 0.57095647, + "learning_rate": 1.80305733435899e-06, + "loss": 0.59135079, + "num_input_tokens_seen": 195482015, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21386719, + "step": 9072, + "time_per_iteration": 3.1778738498687744 + }, + { + "auxiliary_loss_clip": 0.01104044, + "auxiliary_loss_mlp": 0.01029766, + "balance_loss_clip": 1.01834023, + "balance_loss_mlp": 1.03754437, + "epoch": 0.5454982714564858, + "flos": 13260626346240.0, + "grad_norm": 1.6497532443668452, + "language_loss": 0.69947577, + "learning_rate": 1.8026697692414174e-06, + "loss": 0.72081387, + "num_input_tokens_seen": 195500440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 9073, + "time_per_iteration": 2.414483070373535 + }, + { + "auxiliary_loss_clip": 0.01102116, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.02133226, + "balance_loss_mlp": 1.03575385, + "epoch": 0.5455583947091538, + "flos": 21835878733440.0, + "grad_norm": 1.7860657423568516, + "language_loss": 0.71207851, + "learning_rate": 1.802282211606627e-06, + "loss": 0.73342335, + "num_input_tokens_seen": 195520860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 9074, + "time_per_iteration": 2.5126519203186035 + }, + { + "auxiliary_loss_clip": 0.01105462, + "auxiliary_loss_mlp": 0.01037255, + "balance_loss_clip": 1.02541733, + "balance_loss_mlp": 1.03713095, + "epoch": 0.5456185179618217, + "flos": 17817438551040.0, + "grad_norm": 1.7043380827263428, + "language_loss": 0.68845975, + "learning_rate": 1.8018946614693148e-06, + "loss": 0.70988691, + "num_input_tokens_seen": 195538615, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 9075, + "time_per_iteration": 2.4271233081817627 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01904845, + "balance_loss_mlp": 1.03828716, + "epoch": 0.5456786412144897, + "flos": 21069401391360.0, + "grad_norm": 2.0277857780736155, + "language_loss": 0.804497, + "learning_rate": 1.8015071188441768e-06, + "loss": 0.82584435, + "num_input_tokens_seen": 195557460, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66796875, + "step": 9076, + "time_per_iteration": 2.5117785930633545 + }, + { + "auxiliary_loss_clip": 0.01105415, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01892447, + "balance_loss_mlp": 1.03663969, + "epoch": 0.5457387644671576, + "flos": 23294965098240.0, + "grad_norm": 1.583996751680831, + "language_loss": 0.80426413, + "learning_rate": 1.8011195837459089e-06, + "loss": 0.82562208, + "num_input_tokens_seen": 195577985, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9077, + "time_per_iteration": 2.4544837474823 + }, + { + "auxiliary_loss_clip": 0.0110649, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01880729, + "balance_loss_mlp": 1.03688538, + "epoch": 0.5457988877198257, + "flos": 21617039122560.0, + "grad_norm": 1.9788210228225505, + "language_loss": 0.67737269, + "learning_rate": 1.8007320561892064e-06, + "loss": 0.69873917, + "num_input_tokens_seen": 195597620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9078, + "time_per_iteration": 2.5323657989501953 + }, + { + "auxiliary_loss_clip": 0.01107395, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02072752, + "balance_loss_mlp": 1.03703523, + "epoch": 0.5458590109724936, + "flos": 23762485543680.0, + "grad_norm": 1.8696943679753917, + "language_loss": 0.80740905, + "learning_rate": 1.800344536188764e-06, + "loss": 0.82881159, + "num_input_tokens_seen": 195615910, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9079, + "time_per_iteration": 2.458158493041992 + }, + { + "auxiliary_loss_clip": 0.01110307, + "auxiliary_loss_mlp": 0.01032502, + "balance_loss_clip": 1.01966298, + "balance_loss_mlp": 1.03775454, + "epoch": 0.5459191342251616, + "flos": 24424283675520.0, + "grad_norm": 1.6840905516778153, + "language_loss": 0.75812018, + "learning_rate": 1.799957023759277e-06, + "loss": 0.77954829, + "num_input_tokens_seen": 195635620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9080, + "time_per_iteration": 2.4955971240997314 + }, + { + "auxiliary_loss_clip": 0.01108419, + "auxiliary_loss_mlp": 0.01033024, + "balance_loss_clip": 1.02007222, + "balance_loss_mlp": 1.03805685, + "epoch": 0.5459792574778296, + "flos": 23623009032960.0, + "grad_norm": 2.4851521305720627, + "language_loss": 0.83080792, + "learning_rate": 1.7995695189154392e-06, + "loss": 0.85222232, + "num_input_tokens_seen": 195652495, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9081, + "time_per_iteration": 2.4580602645874023 + }, + { + "auxiliary_loss_clip": 0.01110385, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01722193, + "balance_loss_mlp": 1.03842843, + "epoch": 0.5460393807304975, + "flos": 19135540033920.0, + "grad_norm": 1.5408403844848193, + "language_loss": 0.69658768, + "learning_rate": 1.7991820216719461e-06, + "loss": 0.71798551, + "num_input_tokens_seen": 195671965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9082, + "time_per_iteration": 2.472858428955078 + }, + { + "auxiliary_loss_clip": 0.01102277, + "auxiliary_loss_mlp": 0.01026377, + "balance_loss_clip": 1.01434886, + "balance_loss_mlp": 1.03546321, + "epoch": 0.5460995039831655, + "flos": 35918534805120.0, + "grad_norm": 1.7415454834760362, + "language_loss": 0.66599333, + "learning_rate": 1.7987945320434906e-06, + "loss": 0.68727982, + "num_input_tokens_seen": 195694725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 9083, + "time_per_iteration": 2.5756945610046387 + }, + { + "auxiliary_loss_clip": 0.01104147, + "auxiliary_loss_mlp": 0.01029182, + "balance_loss_clip": 1.01772594, + "balance_loss_mlp": 1.03678334, + "epoch": 0.5461596272358334, + "flos": 26759231274240.0, + "grad_norm": 1.6516896910486423, + "language_loss": 0.78909004, + "learning_rate": 1.798407050044766e-06, + "loss": 0.81042337, + "num_input_tokens_seen": 195714090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 9084, + "time_per_iteration": 2.5361523628234863 + }, + { + "auxiliary_loss_clip": 0.01107894, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02093244, + "balance_loss_mlp": 1.03781819, + "epoch": 0.5462197504885015, + "flos": 20886580143360.0, + "grad_norm": 2.0163372032767826, + "language_loss": 0.74970639, + "learning_rate": 1.7980195756904675e-06, + "loss": 0.77111256, + "num_input_tokens_seen": 195733585, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9085, + "time_per_iteration": 2.461916208267212 + }, + { + "auxiliary_loss_clip": 0.01107723, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01702785, + "balance_loss_mlp": 1.03705621, + "epoch": 0.5462798737411694, + "flos": 25804976607360.0, + "grad_norm": 1.6682732441654566, + "language_loss": 0.74792248, + "learning_rate": 1.7976321089952857e-06, + "loss": 0.76929021, + "num_input_tokens_seen": 195752820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9086, + "time_per_iteration": 2.530505657196045 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01707602, + "balance_loss_mlp": 1.03592753, + "epoch": 0.5463399969938374, + "flos": 25775027642880.0, + "grad_norm": 1.5861549378759865, + "language_loss": 0.76987553, + "learning_rate": 1.7972446499739155e-06, + "loss": 0.79121786, + "num_input_tokens_seen": 195773740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9087, + "time_per_iteration": 2.4786858558654785 + }, + { + "auxiliary_loss_clip": 0.01110207, + "auxiliary_loss_mlp": 0.01035533, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.03895903, + "epoch": 0.5464001202465053, + "flos": 18843298980480.0, + "grad_norm": 1.736831801992395, + "language_loss": 0.77471095, + "learning_rate": 1.7968571986410484e-06, + "loss": 0.79616833, + "num_input_tokens_seen": 195792125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9088, + "time_per_iteration": 2.450409173965454 + }, + { + "auxiliary_loss_clip": 0.01030156, + "auxiliary_loss_mlp": 0.01001999, + "balance_loss_clip": 1.0009743, + "balance_loss_mlp": 1.0086112, + "epoch": 0.5464602434991733, + "flos": 69049541623680.0, + "grad_norm": 0.7273835392783513, + "language_loss": 0.57771385, + "learning_rate": 1.7964697550113758e-06, + "loss": 0.59803545, + "num_input_tokens_seen": 195854935, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.21484375, + "step": 9089, + "time_per_iteration": 3.1002800464630127 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01030942, + "balance_loss_clip": 1.01875257, + "balance_loss_mlp": 1.03710759, + "epoch": 0.5465203667518412, + "flos": 27560039040000.0, + "grad_norm": 1.6935215277859987, + "language_loss": 0.76448178, + "learning_rate": 1.7960823190995918e-06, + "loss": 0.78586286, + "num_input_tokens_seen": 195874715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9090, + "time_per_iteration": 2.5178091526031494 + }, + { + "auxiliary_loss_clip": 0.0110913, + "auxiliary_loss_mlp": 0.01035309, + "balance_loss_clip": 1.02226758, + "balance_loss_mlp": 1.0362854, + "epoch": 0.5465804900045093, + "flos": 21210206705280.0, + "grad_norm": 2.128546091443876, + "language_loss": 0.73422724, + "learning_rate": 1.7956948909203855e-06, + "loss": 0.75567162, + "num_input_tokens_seen": 195892610, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9091, + "time_per_iteration": 2.4523463249206543 + }, + { + "auxiliary_loss_clip": 0.0110893, + "auxiliary_loss_mlp": 0.01035178, + "balance_loss_clip": 1.02313828, + "balance_loss_mlp": 1.03835773, + "epoch": 0.5466406132571772, + "flos": 22488949860480.0, + "grad_norm": 1.850730557544026, + "language_loss": 0.77855682, + "learning_rate": 1.7953074704884498e-06, + "loss": 0.79999787, + "num_input_tokens_seen": 195911085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9092, + "time_per_iteration": 2.463998556137085 + }, + { + "auxiliary_loss_clip": 0.01110185, + "auxiliary_loss_mlp": 0.01032754, + "balance_loss_clip": 1.01975393, + "balance_loss_mlp": 1.03879404, + "epoch": 0.5467007365098452, + "flos": 17675843137920.0, + "grad_norm": 1.992080116269468, + "language_loss": 0.74526983, + "learning_rate": 1.794920057818476e-06, + "loss": 0.76669919, + "num_input_tokens_seen": 195929845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 9093, + "time_per_iteration": 3.8121659755706787 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01037849, + "balance_loss_clip": 1.02420545, + "balance_loss_mlp": 1.03643596, + "epoch": 0.5467608597625132, + "flos": 15698852524800.0, + "grad_norm": 1.8684331289519012, + "language_loss": 0.69012475, + "learning_rate": 1.7945326529251533e-06, + "loss": 0.71159303, + "num_input_tokens_seen": 195946350, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9094, + "time_per_iteration": 2.406708240509033 + }, + { + "auxiliary_loss_clip": 0.0110964, + "auxiliary_loss_mlp": 0.01035906, + "balance_loss_clip": 1.02463508, + "balance_loss_mlp": 1.0408746, + "epoch": 0.5468209830151811, + "flos": 24312816794880.0, + "grad_norm": 3.1943674750228426, + "language_loss": 0.68355155, + "learning_rate": 1.7941452558231731e-06, + "loss": 0.70500696, + "num_input_tokens_seen": 195959840, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6875, + "step": 9095, + "time_per_iteration": 2.4663615226745605 + }, + { + "auxiliary_loss_clip": 0.0110876, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.0244838, + "balance_loss_mlp": 1.04013026, + "epoch": 0.5468811062678491, + "flos": 29166323339520.0, + "grad_norm": 1.544968347193232, + "language_loss": 0.66645032, + "learning_rate": 1.7937578665272256e-06, + "loss": 0.6878978, + "num_input_tokens_seen": 195981125, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9096, + "time_per_iteration": 5.378362417221069 + }, + { + "auxiliary_loss_clip": 0.01030132, + "auxiliary_loss_mlp": 0.00998409, + "balance_loss_clip": 0.99731266, + "balance_loss_mlp": 1.00865221, + "epoch": 0.546941229520517, + "flos": 67867037982720.0, + "grad_norm": 0.7389922300516351, + "language_loss": 0.57573926, + "learning_rate": 1.7933704850520007e-06, + "loss": 0.59602463, + "num_input_tokens_seen": 196038880, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.21484375, + "step": 9097, + "time_per_iteration": 3.168614387512207 + }, + { + "auxiliary_loss_clip": 0.01030189, + "auxiliary_loss_mlp": 0.01002061, + "balance_loss_clip": 1.00105369, + "balance_loss_mlp": 1.00863671, + "epoch": 0.5470013527731851, + "flos": 58270306625280.0, + "grad_norm": 0.9052213801384115, + "language_loss": 0.64790761, + "learning_rate": 1.7929831114121868e-06, + "loss": 0.66823018, + "num_input_tokens_seen": 196099215, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.21484375, + "step": 9098, + "time_per_iteration": 3.01711106300354 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02399004, + "balance_loss_mlp": 1.03762555, + "epoch": 0.547061476025853, + "flos": 22965915582720.0, + "grad_norm": 1.9907442514686344, + "language_loss": 0.73179287, + "learning_rate": 1.7925957456224753e-06, + "loss": 0.75324905, + "num_input_tokens_seen": 196120370, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9099, + "time_per_iteration": 2.50752592086792 + }, + { + "auxiliary_loss_clip": 0.01105594, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03749669, + "epoch": 0.547121599278521, + "flos": 29968244426880.0, + "grad_norm": 1.9036037415187144, + "language_loss": 0.72414565, + "learning_rate": 1.7922083876975537e-06, + "loss": 0.74548817, + "num_input_tokens_seen": 196139075, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 9100, + "time_per_iteration": 2.5455925464630127 + }, + { + "auxiliary_loss_clip": 0.01105887, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01381898, + "balance_loss_mlp": 1.03679228, + "epoch": 0.5471817225311889, + "flos": 36535443914880.0, + "grad_norm": 1.608228209483335, + "language_loss": 0.67675304, + "learning_rate": 1.7918210376521102e-06, + "loss": 0.69807637, + "num_input_tokens_seen": 196159990, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9101, + "time_per_iteration": 2.638460397720337 + }, + { + "auxiliary_loss_clip": 0.01108046, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01515532, + "balance_loss_mlp": 1.03816807, + "epoch": 0.5472418457838569, + "flos": 25775243124480.0, + "grad_norm": 1.6461027740418694, + "language_loss": 0.78004694, + "learning_rate": 1.7914336955008343e-06, + "loss": 0.80140156, + "num_input_tokens_seen": 196180570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9102, + "time_per_iteration": 2.515669822692871 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01036821, + "balance_loss_clip": 1.02434635, + "balance_loss_mlp": 1.04091179, + "epoch": 0.5473019690365248, + "flos": 27887687925120.0, + "grad_norm": 1.641023318874669, + "language_loss": 0.72358656, + "learning_rate": 1.791046361258413e-06, + "loss": 0.74505031, + "num_input_tokens_seen": 196200300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9103, + "time_per_iteration": 2.516160249710083 + }, + { + "auxiliary_loss_clip": 0.0110583, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01571035, + "balance_loss_mlp": 1.03704, + "epoch": 0.5473620922891929, + "flos": 57631490219520.0, + "grad_norm": 1.3192542299458547, + "language_loss": 0.65333968, + "learning_rate": 1.7906590349395356e-06, + "loss": 0.674676, + "num_input_tokens_seen": 196228525, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9104, + "time_per_iteration": 2.8076846599578857 + }, + { + "auxiliary_loss_clip": 0.01110613, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.0188477, + "balance_loss_mlp": 1.03879666, + "epoch": 0.5474222155418608, + "flos": 19354056422400.0, + "grad_norm": 1.7582225342351636, + "language_loss": 0.81346989, + "learning_rate": 1.790271716558888e-06, + "loss": 0.83489728, + "num_input_tokens_seen": 196247690, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9105, + "time_per_iteration": 2.4436333179473877 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01029973, + "balance_loss_clip": 1.01836777, + "balance_loss_mlp": 1.03727031, + "epoch": 0.5474823387945288, + "flos": 25120448144640.0, + "grad_norm": 1.5498107295674015, + "language_loss": 0.80534816, + "learning_rate": 1.7898844061311575e-06, + "loss": 0.82670921, + "num_input_tokens_seen": 196268555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9106, + "time_per_iteration": 2.5293564796447754 + }, + { + "auxiliary_loss_clip": 0.01108965, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.02120996, + "balance_loss_mlp": 1.03986609, + "epoch": 0.5475424620471967, + "flos": 18004174381440.0, + "grad_norm": 1.7454593746340303, + "language_loss": 0.69378364, + "learning_rate": 1.7894971036710322e-06, + "loss": 0.71519959, + "num_input_tokens_seen": 196285585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9107, + "time_per_iteration": 2.423023223876953 + }, + { + "auxiliary_loss_clip": 0.01110146, + "auxiliary_loss_mlp": 0.01029027, + "balance_loss_clip": 1.01680255, + "balance_loss_mlp": 1.03831339, + "epoch": 0.5476025852998647, + "flos": 22309324922880.0, + "grad_norm": 1.6483473327352183, + "language_loss": 0.63088882, + "learning_rate": 1.789109809193197e-06, + "loss": 0.65228057, + "num_input_tokens_seen": 196305085, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 9108, + "time_per_iteration": 2.4629247188568115 + }, + { + "auxiliary_loss_clip": 0.01106827, + "auxiliary_loss_mlp": 0.0102654, + "balance_loss_clip": 1.01526904, + "balance_loss_mlp": 1.03832912, + "epoch": 0.5476627085525327, + "flos": 20120497850880.0, + "grad_norm": 1.6809972098624877, + "language_loss": 0.74894333, + "learning_rate": 1.7887225227123396e-06, + "loss": 0.77027702, + "num_input_tokens_seen": 196323945, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 9109, + "time_per_iteration": 2.445711851119995 + }, + { + "auxiliary_loss_clip": 0.01105646, + "auxiliary_loss_mlp": 0.01033842, + "balance_loss_clip": 1.02130747, + "balance_loss_mlp": 1.03783536, + "epoch": 0.5477228318052006, + "flos": 17712579772800.0, + "grad_norm": 1.9460400321268034, + "language_loss": 0.77668434, + "learning_rate": 1.7883352442431457e-06, + "loss": 0.79807919, + "num_input_tokens_seen": 196342200, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 9110, + "time_per_iteration": 2.4724810123443604 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01033075, + "balance_loss_clip": 1.02193475, + "balance_loss_mlp": 1.03772378, + "epoch": 0.5477829550578687, + "flos": 25848895962240.0, + "grad_norm": 1.7745449116751173, + "language_loss": 0.71189445, + "learning_rate": 1.7879479738002993e-06, + "loss": 0.73327577, + "num_input_tokens_seen": 196362940, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9111, + "time_per_iteration": 2.5220110416412354 + }, + { + "auxiliary_loss_clip": 0.01108238, + "auxiliary_loss_mlp": 0.01036998, + "balance_loss_clip": 1.02525544, + "balance_loss_mlp": 1.03890049, + "epoch": 0.5478430783105366, + "flos": 23039676161280.0, + "grad_norm": 1.5754245119869974, + "language_loss": 0.71029758, + "learning_rate": 1.7875607113984876e-06, + "loss": 0.73174989, + "num_input_tokens_seen": 196383070, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 9112, + "time_per_iteration": 2.4876022338867188 + }, + { + "auxiliary_loss_clip": 0.01108992, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.0176518, + "balance_loss_mlp": 1.03795052, + "epoch": 0.5479032015632046, + "flos": 16071210864000.0, + "grad_norm": 2.4321144529101946, + "language_loss": 0.88027447, + "learning_rate": 1.7871734570523953e-06, + "loss": 0.90165925, + "num_input_tokens_seen": 196398485, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.7109375, + "step": 9113, + "time_per_iteration": 2.4495129585266113 + }, + { + "auxiliary_loss_clip": 0.01110892, + "auxiliary_loss_mlp": 0.01031433, + "balance_loss_clip": 1.01863575, + "balance_loss_mlp": 1.04015231, + "epoch": 0.5479633248158725, + "flos": 24278701852800.0, + "grad_norm": 1.4380357531145453, + "language_loss": 0.73040199, + "learning_rate": 1.7867862107767067e-06, + "loss": 0.75182521, + "num_input_tokens_seen": 196417725, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9114, + "time_per_iteration": 2.49124813079834 + }, + { + "auxiliary_loss_clip": 0.0110468, + "auxiliary_loss_mlp": 0.01031545, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03658402, + "epoch": 0.5480234480685405, + "flos": 26358216860160.0, + "grad_norm": 1.7175878836105734, + "language_loss": 0.72105908, + "learning_rate": 1.7863989725861066e-06, + "loss": 0.74242127, + "num_input_tokens_seen": 196437840, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9115, + "time_per_iteration": 2.4818665981292725 + }, + { + "auxiliary_loss_clip": 0.01109482, + "auxiliary_loss_mlp": 0.01032226, + "balance_loss_clip": 1.01915491, + "balance_loss_mlp": 1.03801298, + "epoch": 0.5480835713212084, + "flos": 22055077480320.0, + "grad_norm": 1.8153830213846445, + "language_loss": 0.7222048, + "learning_rate": 1.7860117424952781e-06, + "loss": 0.74362183, + "num_input_tokens_seen": 196457300, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9116, + "time_per_iteration": 2.4857382774353027 + }, + { + "auxiliary_loss_clip": 0.01108168, + "auxiliary_loss_mlp": 0.01038569, + "balance_loss_clip": 1.02634406, + "balance_loss_mlp": 1.03931904, + "epoch": 0.5481436945738765, + "flos": 25301042749440.0, + "grad_norm": 2.1442712779415025, + "language_loss": 0.76391387, + "learning_rate": 1.7856245205189063e-06, + "loss": 0.78538126, + "num_input_tokens_seen": 196476720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9117, + "time_per_iteration": 2.481539726257324 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01032027, + "balance_loss_clip": 1.02069592, + "balance_loss_mlp": 1.03559899, + "epoch": 0.5482038178265444, + "flos": 33580857772800.0, + "grad_norm": 1.6184993035700161, + "language_loss": 0.62667149, + "learning_rate": 1.785237306671674e-06, + "loss": 0.64801455, + "num_input_tokens_seen": 196496765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9118, + "time_per_iteration": 2.582087516784668 + }, + { + "auxiliary_loss_clip": 0.01112715, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01705241, + "balance_loss_mlp": 1.04148602, + "epoch": 0.5482639410792124, + "flos": 19026192055680.0, + "grad_norm": 2.080656601028848, + "language_loss": 0.79054701, + "learning_rate": 1.7848501009682646e-06, + "loss": 0.81197661, + "num_input_tokens_seen": 196516220, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9119, + "time_per_iteration": 2.431641101837158 + }, + { + "auxiliary_loss_clip": 0.01106769, + "auxiliary_loss_mlp": 0.01032074, + "balance_loss_clip": 1.02143443, + "balance_loss_mlp": 1.0393101, + "epoch": 0.5483240643318803, + "flos": 25410318900480.0, + "grad_norm": 1.6818671426073972, + "language_loss": 0.82585561, + "learning_rate": 1.7844629034233604e-06, + "loss": 0.84724402, + "num_input_tokens_seen": 196533860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 9120, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01110356, + "auxiliary_loss_mlp": 0.01038217, + "balance_loss_clip": 1.02538443, + "balance_loss_mlp": 1.03979588, + "epoch": 0.5483841875845483, + "flos": 21466896272640.0, + "grad_norm": 1.7397757233914666, + "language_loss": 0.80841327, + "learning_rate": 1.7840757140516455e-06, + "loss": 0.82989895, + "num_input_tokens_seen": 196551305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9121, + "time_per_iteration": 2.449951171875 + }, + { + "auxiliary_loss_clip": 0.01108531, + "auxiliary_loss_mlp": 0.0103453, + "balance_loss_clip": 1.02164376, + "balance_loss_mlp": 1.03663361, + "epoch": 0.5484443108372163, + "flos": 24747263792640.0, + "grad_norm": 2.0253856212842662, + "language_loss": 0.61077833, + "learning_rate": 1.7836885328678008e-06, + "loss": 0.63220894, + "num_input_tokens_seen": 196569420, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9122, + "time_per_iteration": 2.4943363666534424 + }, + { + "auxiliary_loss_clip": 0.01107335, + "auxiliary_loss_mlp": 0.01031907, + "balance_loss_clip": 1.02135706, + "balance_loss_mlp": 1.03908038, + "epoch": 0.5485044340898843, + "flos": 25375377945600.0, + "grad_norm": 1.7986157880414966, + "language_loss": 0.71862841, + "learning_rate": 1.7833013598865084e-06, + "loss": 0.74002087, + "num_input_tokens_seen": 196590610, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.68359375, + "step": 9123, + "time_per_iteration": 2.4815285205841064 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.01702476, + "balance_loss_mlp": 1.03875828, + "epoch": 0.5485645573425523, + "flos": 12641167370880.0, + "grad_norm": 1.9471016807647592, + "language_loss": 0.83393133, + "learning_rate": 1.7829141951224505e-06, + "loss": 0.8552959, + "num_input_tokens_seen": 196606495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9124, + "time_per_iteration": 2.442490816116333 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02254486, + "balance_loss_mlp": 1.04040182, + "epoch": 0.5486246805952202, + "flos": 28329425383680.0, + "grad_norm": 1.9388864941150135, + "language_loss": 0.79954362, + "learning_rate": 1.7825270385903075e-06, + "loss": 0.82098156, + "num_input_tokens_seen": 196626365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9125, + "time_per_iteration": 2.5117273330688477 + }, + { + "auxiliary_loss_clip": 0.01109363, + "auxiliary_loss_mlp": 0.01030791, + "balance_loss_clip": 1.01844716, + "balance_loss_mlp": 1.03870225, + "epoch": 0.5486848038478882, + "flos": 16800017817600.0, + "grad_norm": 2.35248102892353, + "language_loss": 0.74499249, + "learning_rate": 1.7821398903047617e-06, + "loss": 0.76639402, + "num_input_tokens_seen": 196644465, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9126, + "time_per_iteration": 2.481576442718506 + }, + { + "auxiliary_loss_clip": 0.01110687, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.01794803, + "balance_loss_mlp": 1.03789783, + "epoch": 0.5487449271005561, + "flos": 17236224581760.0, + "grad_norm": 2.4816786154583212, + "language_loss": 0.66715956, + "learning_rate": 1.7817527502804928e-06, + "loss": 0.68857968, + "num_input_tokens_seen": 196659160, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9127, + "time_per_iteration": 2.462186574935913 + }, + { + "auxiliary_loss_clip": 0.01106989, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02183664, + "balance_loss_mlp": 1.03737557, + "epoch": 0.5488050503532241, + "flos": 17340867878400.0, + "grad_norm": 1.7392555793748137, + "language_loss": 0.83598024, + "learning_rate": 1.781365618532181e-06, + "loss": 0.85740006, + "num_input_tokens_seen": 196677410, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9128, + "time_per_iteration": 2.4559218883514404 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01948333, + "balance_loss_mlp": 1.03735828, + "epoch": 0.548865173605892, + "flos": 17239169496960.0, + "grad_norm": 1.8252742071628254, + "language_loss": 0.74370325, + "learning_rate": 1.7809784950745078e-06, + "loss": 0.76509559, + "num_input_tokens_seen": 196696765, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 9129, + "time_per_iteration": 2.443394422531128 + }, + { + "auxiliary_loss_clip": 0.01111598, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.0391345, + "epoch": 0.5489252968585601, + "flos": 17456716218240.0, + "grad_norm": 2.8843985474075557, + "language_loss": 0.6325981, + "learning_rate": 1.7805913799221511e-06, + "loss": 0.65401739, + "num_input_tokens_seen": 196714895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9130, + "time_per_iteration": 2.424933433532715 + }, + { + "auxiliary_loss_clip": 0.01109538, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02319455, + "balance_loss_mlp": 1.03782725, + "epoch": 0.548985420111228, + "flos": 26323383646080.0, + "grad_norm": 2.1259011139704804, + "language_loss": 0.62936115, + "learning_rate": 1.7802042730897915e-06, + "loss": 0.65081537, + "num_input_tokens_seen": 196735510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9131, + "time_per_iteration": 2.508054256439209 + }, + { + "auxiliary_loss_clip": 0.01109907, + "auxiliary_loss_mlp": 0.01032434, + "balance_loss_clip": 1.01925564, + "balance_loss_mlp": 1.03880227, + "epoch": 0.549045543363896, + "flos": 18693730748160.0, + "grad_norm": 1.7299030045344002, + "language_loss": 0.74452615, + "learning_rate": 1.7798171745921084e-06, + "loss": 0.76594955, + "num_input_tokens_seen": 196752855, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9132, + "time_per_iteration": 2.456127166748047 + }, + { + "auxiliary_loss_clip": 0.0110607, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.01656091, + "balance_loss_mlp": 1.03589082, + "epoch": 0.5491056666165639, + "flos": 24717386655360.0, + "grad_norm": 1.6111198761107228, + "language_loss": 0.8129831, + "learning_rate": 1.7794300844437795e-06, + "loss": 0.83432209, + "num_input_tokens_seen": 196772230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.703125, + "step": 9133, + "time_per_iteration": 2.490236759185791 + }, + { + "auxiliary_loss_clip": 0.01106997, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02131128, + "balance_loss_mlp": 1.03802598, + "epoch": 0.5491657898692319, + "flos": 21576926609280.0, + "grad_norm": 1.7268592344479874, + "language_loss": 0.70094633, + "learning_rate": 1.7790430026594841e-06, + "loss": 0.72235036, + "num_input_tokens_seen": 196790405, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 9134, + "time_per_iteration": 3.827064275741577 + }, + { + "auxiliary_loss_clip": 0.01110087, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02014494, + "balance_loss_mlp": 1.03806603, + "epoch": 0.5492259131219, + "flos": 50476432746240.0, + "grad_norm": 1.744868024388231, + "language_loss": 0.61109304, + "learning_rate": 1.7786559292539004e-06, + "loss": 0.63251662, + "num_input_tokens_seen": 196813785, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 9135, + "time_per_iteration": 2.730273723602295 + }, + { + "auxiliary_loss_clip": 0.01110668, + "auxiliary_loss_mlp": 0.01034696, + "balance_loss_clip": 1.02089787, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5492860363745679, + "flos": 25119262995840.0, + "grad_norm": 1.7368953039767876, + "language_loss": 0.72582811, + "learning_rate": 1.7782688642417058e-06, + "loss": 0.74728173, + "num_input_tokens_seen": 196834390, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9136, + "time_per_iteration": 2.483704090118408 + }, + { + "auxiliary_loss_clip": 0.01111013, + "auxiliary_loss_mlp": 0.01036009, + "balance_loss_clip": 1.02256799, + "balance_loss_mlp": 1.03636873, + "epoch": 0.5493461596272359, + "flos": 22633777497600.0, + "grad_norm": 3.852349726597511, + "language_loss": 0.68771708, + "learning_rate": 1.7778818076375781e-06, + "loss": 0.70918733, + "num_input_tokens_seen": 196853290, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.74609375, + "step": 9137, + "time_per_iteration": 5.456461191177368 + }, + { + "auxiliary_loss_clip": 0.01031834, + "auxiliary_loss_mlp": 0.01007044, + "balance_loss_clip": 1.00602436, + "balance_loss_mlp": 1.01015878, + "epoch": 0.5494062828799038, + "flos": 66151800754560.0, + "grad_norm": 0.9040496486989937, + "language_loss": 0.6527245, + "learning_rate": 1.7774947594561947e-06, + "loss": 0.67311323, + "num_input_tokens_seen": 196913120, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.21679688, + "step": 9138, + "time_per_iteration": 4.559895753860474 + }, + { + "auxiliary_loss_clip": 0.01109871, + "auxiliary_loss_mlp": 0.01030983, + "balance_loss_clip": 1.01828778, + "balance_loss_mlp": 1.03911173, + "epoch": 0.5494664061325718, + "flos": 21105958458240.0, + "grad_norm": 1.6793798945838962, + "language_loss": 0.74981934, + "learning_rate": 1.7771077197122321e-06, + "loss": 0.7712279, + "num_input_tokens_seen": 196931530, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9139, + "time_per_iteration": 2.4897236824035645 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01029596, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03827238, + "epoch": 0.5495265293852397, + "flos": 14392566616320.0, + "grad_norm": 1.7331605634368676, + "language_loss": 0.71274745, + "learning_rate": 1.7767206884203672e-06, + "loss": 0.73412126, + "num_input_tokens_seen": 196949430, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9140, + "time_per_iteration": 2.416760206222534 + }, + { + "auxiliary_loss_clip": 0.01105846, + "auxiliary_loss_mlp": 0.01033348, + "balance_loss_clip": 1.02035391, + "balance_loss_mlp": 1.03625703, + "epoch": 0.5495866526379077, + "flos": 25549148966400.0, + "grad_norm": 1.6373657351429003, + "language_loss": 0.76304853, + "learning_rate": 1.7763336655952762e-06, + "loss": 0.78444046, + "num_input_tokens_seen": 196968265, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9141, + "time_per_iteration": 2.495957612991333 + }, + { + "auxiliary_loss_clip": 0.01104653, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01651192, + "balance_loss_mlp": 1.03816998, + "epoch": 0.5496467758905756, + "flos": 21317256213120.0, + "grad_norm": 1.8000642859490852, + "language_loss": 0.74711812, + "learning_rate": 1.7759466512516346e-06, + "loss": 0.76845098, + "num_input_tokens_seen": 196984930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 9142, + "time_per_iteration": 2.476701259613037 + }, + { + "auxiliary_loss_clip": 0.01112649, + "auxiliary_loss_mlp": 0.01033014, + "balance_loss_clip": 1.01920366, + "balance_loss_mlp": 1.04044414, + "epoch": 0.5497068991432437, + "flos": 22233086305920.0, + "grad_norm": 3.087747357168804, + "language_loss": 0.76516807, + "learning_rate": 1.7755596454041192e-06, + "loss": 0.78662473, + "num_input_tokens_seen": 197002320, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 9143, + "time_per_iteration": 2.4777820110321045 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.0103083, + "balance_loss_clip": 1.0188787, + "balance_loss_mlp": 1.03639066, + "epoch": 0.5497670223959116, + "flos": 18479093028480.0, + "grad_norm": 4.124964872446098, + "language_loss": 0.79934669, + "learning_rate": 1.7751726480674044e-06, + "loss": 0.82070994, + "num_input_tokens_seen": 197020825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 9144, + "time_per_iteration": 2.470946788787842 + }, + { + "auxiliary_loss_clip": 0.01109215, + "auxiliary_loss_mlp": 0.01028797, + "balance_loss_clip": 1.0163275, + "balance_loss_mlp": 1.03886819, + "epoch": 0.5498271456485796, + "flos": 29205107049600.0, + "grad_norm": 2.259125962742438, + "language_loss": 0.71273595, + "learning_rate": 1.7747856592561645e-06, + "loss": 0.73411608, + "num_input_tokens_seen": 197040450, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9145, + "time_per_iteration": 2.5155293941497803 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01027624, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03797007, + "epoch": 0.5498872689012475, + "flos": 34824372664320.0, + "grad_norm": 1.760392083970442, + "language_loss": 0.70398986, + "learning_rate": 1.774398678985076e-06, + "loss": 0.72534567, + "num_input_tokens_seen": 197063930, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 9146, + "time_per_iteration": 2.5837745666503906 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030506, + "balance_loss_clip": 1.01897275, + "balance_loss_mlp": 1.03747129, + "epoch": 0.5499473921539155, + "flos": 25921938268800.0, + "grad_norm": 1.7328002119898687, + "language_loss": 0.6403445, + "learning_rate": 1.7740117072688113e-06, + "loss": 0.66168791, + "num_input_tokens_seen": 197082660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 9147, + "time_per_iteration": 2.5004754066467285 + }, + { + "auxiliary_loss_clip": 0.01110115, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01714349, + "balance_loss_mlp": 1.04033351, + "epoch": 0.5500075154065835, + "flos": 22273701609600.0, + "grad_norm": 2.3129813772985854, + "language_loss": 0.80632472, + "learning_rate": 1.7736247441220458e-06, + "loss": 0.82771873, + "num_input_tokens_seen": 197100675, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9148, + "time_per_iteration": 2.4941914081573486 + }, + { + "auxiliary_loss_clip": 0.01109987, + "auxiliary_loss_mlp": 0.01034322, + "balance_loss_clip": 1.0224669, + "balance_loss_mlp": 1.04013515, + "epoch": 0.5500676386592515, + "flos": 28037507552640.0, + "grad_norm": 1.5952381042001647, + "language_loss": 0.78739786, + "learning_rate": 1.773237789559453e-06, + "loss": 0.80884099, + "num_input_tokens_seen": 197121320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69921875, + "step": 9149, + "time_per_iteration": 2.5276949405670166 + }, + { + "auxiliary_loss_clip": 0.01108964, + "auxiliary_loss_mlp": 0.01029124, + "balance_loss_clip": 1.01695323, + "balance_loss_mlp": 1.03880644, + "epoch": 0.5501277619119195, + "flos": 23914819123200.0, + "grad_norm": 2.0296810240639847, + "language_loss": 0.72119236, + "learning_rate": 1.7728508435957052e-06, + "loss": 0.74257326, + "num_input_tokens_seen": 197138965, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9150, + "time_per_iteration": 2.4646284580230713 + }, + { + "auxiliary_loss_clip": 0.01110946, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01450694, + "balance_loss_mlp": 1.03812099, + "epoch": 0.5501878851645874, + "flos": 20923783655040.0, + "grad_norm": 1.6901514106805953, + "language_loss": 0.74800563, + "learning_rate": 1.772463906245477e-06, + "loss": 0.76939499, + "num_input_tokens_seen": 197156460, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9151, + "time_per_iteration": 2.4528467655181885 + }, + { + "auxiliary_loss_clip": 0.01109486, + "auxiliary_loss_mlp": 0.01027197, + "balance_loss_clip": 1.01572907, + "balance_loss_mlp": 1.03945291, + "epoch": 0.5502480084172554, + "flos": 20665298407680.0, + "grad_norm": 1.835684303690663, + "language_loss": 0.76049578, + "learning_rate": 1.7720769775234394e-06, + "loss": 0.78186262, + "num_input_tokens_seen": 197175140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9152, + "time_per_iteration": 2.4587628841400146 + }, + { + "auxiliary_loss_clip": 0.011054, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.01691318, + "balance_loss_mlp": 1.03700173, + "epoch": 0.5503081316699233, + "flos": 26432552056320.0, + "grad_norm": 1.7890824738540096, + "language_loss": 0.82162666, + "learning_rate": 1.7716900574442662e-06, + "loss": 0.84296966, + "num_input_tokens_seen": 197194345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 9153, + "time_per_iteration": 2.490391492843628 + }, + { + "auxiliary_loss_clip": 0.01107152, + "auxiliary_loss_mlp": 0.01032594, + "balance_loss_clip": 1.02004111, + "balance_loss_mlp": 1.03787208, + "epoch": 0.5503682549225913, + "flos": 30629144718720.0, + "grad_norm": 1.7732052023343188, + "language_loss": 0.74143934, + "learning_rate": 1.7713031460226294e-06, + "loss": 0.76283687, + "num_input_tokens_seen": 197215535, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9154, + "time_per_iteration": 2.5304152965545654 + }, + { + "auxiliary_loss_clip": 0.01113689, + "auxiliary_loss_mlp": 0.01035044, + "balance_loss_clip": 1.02184761, + "balance_loss_mlp": 1.04016376, + "epoch": 0.5504283781752592, + "flos": 22565439872640.0, + "grad_norm": 1.4983591953206352, + "language_loss": 0.7257731, + "learning_rate": 1.770916243273199e-06, + "loss": 0.74726045, + "num_input_tokens_seen": 197234945, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9155, + "time_per_iteration": 2.4642586708068848 + }, + { + "auxiliary_loss_clip": 0.01033812, + "auxiliary_loss_mlp": 0.01001849, + "balance_loss_clip": 1.00080609, + "balance_loss_mlp": 1.01202416, + "epoch": 0.5504885014279273, + "flos": 67901009270400.0, + "grad_norm": 0.7480439065154532, + "language_loss": 0.55414248, + "learning_rate": 1.7705293492106483e-06, + "loss": 0.57449913, + "num_input_tokens_seen": 197302285, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.21777344, + "step": 9156, + "time_per_iteration": 3.184554100036621 + }, + { + "auxiliary_loss_clip": 0.0110658, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01741338, + "balance_loss_mlp": 1.0373919, + "epoch": 0.5505486246805952, + "flos": 22450058409600.0, + "grad_norm": 1.690497670143624, + "language_loss": 0.82608092, + "learning_rate": 1.7701424638496475e-06, + "loss": 0.84744143, + "num_input_tokens_seen": 197321575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9157, + "time_per_iteration": 2.4718377590179443 + }, + { + "auxiliary_loss_clip": 0.01115009, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01764512, + "balance_loss_mlp": 1.04140961, + "epoch": 0.5506087479332632, + "flos": 26906896085760.0, + "grad_norm": 2.5846917450647138, + "language_loss": 0.75262648, + "learning_rate": 1.7697555872048677e-06, + "loss": 0.77409017, + "num_input_tokens_seen": 197340255, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9158, + "time_per_iteration": 2.483400583267212 + }, + { + "auxiliary_loss_clip": 0.01105976, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01863742, + "balance_loss_mlp": 1.0392096, + "epoch": 0.5506688711859311, + "flos": 22930256355840.0, + "grad_norm": 1.6248211907364027, + "language_loss": 0.69624805, + "learning_rate": 1.769368719290979e-06, + "loss": 0.71761608, + "num_input_tokens_seen": 197360360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 9159, + "time_per_iteration": 2.5159049034118652 + }, + { + "auxiliary_loss_clip": 0.01110817, + "auxiliary_loss_mlp": 0.01032003, + "balance_loss_clip": 1.01913416, + "balance_loss_mlp": 1.03923249, + "epoch": 0.5507289944385991, + "flos": 29606408772480.0, + "grad_norm": 1.7392637683079002, + "language_loss": 0.67766821, + "learning_rate": 1.7689818601226516e-06, + "loss": 0.69909644, + "num_input_tokens_seen": 197381905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9160, + "time_per_iteration": 2.5915122032165527 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01032828, + "balance_loss_clip": 1.02106166, + "balance_loss_mlp": 1.03855252, + "epoch": 0.5507891176912671, + "flos": 15334431091200.0, + "grad_norm": 1.9414097965551829, + "language_loss": 0.71404171, + "learning_rate": 1.7685950097145552e-06, + "loss": 0.7354309, + "num_input_tokens_seen": 197398555, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9161, + "time_per_iteration": 2.4698691368103027 + }, + { + "auxiliary_loss_clip": 0.0110819, + "auxiliary_loss_mlp": 0.01035993, + "balance_loss_clip": 1.02365494, + "balance_loss_mlp": 1.03864145, + "epoch": 0.5508492409439351, + "flos": 26578313447040.0, + "grad_norm": 2.0077015754602985, + "language_loss": 0.69346386, + "learning_rate": 1.768208168081359e-06, + "loss": 0.71490568, + "num_input_tokens_seen": 197419630, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9162, + "time_per_iteration": 2.514615297317505 + }, + { + "auxiliary_loss_clip": 0.01107873, + "auxiliary_loss_mlp": 0.01037948, + "balance_loss_clip": 1.02538323, + "balance_loss_mlp": 1.03850245, + "epoch": 0.5509093641966031, + "flos": 25443428261760.0, + "grad_norm": 1.6272332912595904, + "language_loss": 0.8531208, + "learning_rate": 1.767821335237733e-06, + "loss": 0.87457901, + "num_input_tokens_seen": 197438480, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 9163, + "time_per_iteration": 2.55450439453125 + }, + { + "auxiliary_loss_clip": 0.01107861, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02065635, + "balance_loss_mlp": 1.0394969, + "epoch": 0.550969487449271, + "flos": 18698543170560.0, + "grad_norm": 1.5452929110279412, + "language_loss": 0.8063103, + "learning_rate": 1.7674345111983441e-06, + "loss": 0.8277117, + "num_input_tokens_seen": 197456755, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9164, + "time_per_iteration": 2.477283239364624 + }, + { + "auxiliary_loss_clip": 0.01112735, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0152092, + "balance_loss_mlp": 1.04160368, + "epoch": 0.551029610701939, + "flos": 22708723224960.0, + "grad_norm": 1.8276675469309818, + "language_loss": 0.73409986, + "learning_rate": 1.767047695977863e-06, + "loss": 0.75550359, + "num_input_tokens_seen": 197475530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9165, + "time_per_iteration": 2.4870002269744873 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01700497, + "balance_loss_mlp": 1.03732443, + "epoch": 0.5510897339546069, + "flos": 12420496166400.0, + "grad_norm": 1.8849650051461906, + "language_loss": 0.79019225, + "learning_rate": 1.7666608895909563e-06, + "loss": 0.81153595, + "num_input_tokens_seen": 197490835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9166, + "time_per_iteration": 2.435049295425415 + }, + { + "auxiliary_loss_clip": 0.01108748, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.01596665, + "balance_loss_mlp": 1.03822398, + "epoch": 0.5511498572072749, + "flos": 18770579896320.0, + "grad_norm": 2.033929506473001, + "language_loss": 0.76165509, + "learning_rate": 1.7662740920522913e-06, + "loss": 0.78302646, + "num_input_tokens_seen": 197508770, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9167, + "time_per_iteration": 2.474677562713623 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01631832, + "balance_loss_mlp": 1.03744709, + "epoch": 0.5512099804599428, + "flos": 19573326996480.0, + "grad_norm": 2.261050601267758, + "language_loss": 0.79845661, + "learning_rate": 1.7658873033765374e-06, + "loss": 0.81980425, + "num_input_tokens_seen": 197527340, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9168, + "time_per_iteration": 2.484435796737671 + }, + { + "auxiliary_loss_clip": 0.01110227, + "auxiliary_loss_mlp": 0.0103542, + "balance_loss_clip": 1.0231235, + "balance_loss_mlp": 1.03901529, + "epoch": 0.5512701037126109, + "flos": 26245600744320.0, + "grad_norm": 1.641322965099804, + "language_loss": 0.68934894, + "learning_rate": 1.7655005235783591e-06, + "loss": 0.71080542, + "num_input_tokens_seen": 197547280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9169, + "time_per_iteration": 2.5206069946289062 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.01025884, + "balance_loss_clip": 1.014714, + "balance_loss_mlp": 1.03545678, + "epoch": 0.5513302269652788, + "flos": 21945406279680.0, + "grad_norm": 2.0185216192280553, + "language_loss": 0.85350084, + "learning_rate": 1.7651137526724251e-06, + "loss": 0.87478477, + "num_input_tokens_seen": 197565045, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 9170, + "time_per_iteration": 2.4762823581695557 + }, + { + "auxiliary_loss_clip": 0.01031617, + "auxiliary_loss_mlp": 0.01002445, + "balance_loss_clip": 1.00143194, + "balance_loss_mlp": 1.00984073, + "epoch": 0.5513903502179468, + "flos": 68235948616320.0, + "grad_norm": 0.7807167648980764, + "language_loss": 0.5990442, + "learning_rate": 1.7647269906734017e-06, + "loss": 0.61938488, + "num_input_tokens_seen": 197625005, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21777344, + "step": 9171, + "time_per_iteration": 3.0934739112854004 + }, + { + "auxiliary_loss_clip": 0.01106302, + "auxiliary_loss_mlp": 0.01032525, + "balance_loss_clip": 1.02024603, + "balance_loss_mlp": 1.03768301, + "epoch": 0.5514504734706147, + "flos": 18734238311040.0, + "grad_norm": 1.4242208217777272, + "language_loss": 0.701002, + "learning_rate": 1.7643402375959533e-06, + "loss": 0.72239029, + "num_input_tokens_seen": 197645050, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9172, + "time_per_iteration": 2.482672929763794 + }, + { + "auxiliary_loss_clip": 0.01104259, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.0176115, + "balance_loss_mlp": 1.03602123, + "epoch": 0.5515105967232827, + "flos": 22270972176000.0, + "grad_norm": 1.708440744181033, + "language_loss": 0.75790203, + "learning_rate": 1.7639534934547474e-06, + "loss": 0.77924281, + "num_input_tokens_seen": 197663910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9173, + "time_per_iteration": 2.476710557937622 + }, + { + "auxiliary_loss_clip": 0.01104019, + "auxiliary_loss_mlp": 0.01032582, + "balance_loss_clip": 1.02019644, + "balance_loss_mlp": 1.0371182, + "epoch": 0.5515707199759508, + "flos": 22557682535040.0, + "grad_norm": 1.5740431144983165, + "language_loss": 0.74457419, + "learning_rate": 1.7635667582644484e-06, + "loss": 0.76594019, + "num_input_tokens_seen": 197681580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 9174, + "time_per_iteration": 2.4599406719207764 + }, + { + "auxiliary_loss_clip": 0.01108196, + "auxiliary_loss_mlp": 0.01029333, + "balance_loss_clip": 1.0173409, + "balance_loss_mlp": 1.03827941, + "epoch": 0.5516308432286187, + "flos": 28291072636800.0, + "grad_norm": 1.784111045924148, + "language_loss": 0.72615731, + "learning_rate": 1.7631800320397217e-06, + "loss": 0.74753261, + "num_input_tokens_seen": 197702095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9175, + "time_per_iteration": 2.5028982162475586 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02403331, + "balance_loss_mlp": 1.0378927, + "epoch": 0.5516909664812867, + "flos": 18764474584320.0, + "grad_norm": 1.8209397746213287, + "language_loss": 0.69452918, + "learning_rate": 1.7627933147952318e-06, + "loss": 0.71596849, + "num_input_tokens_seen": 197720720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9176, + "time_per_iteration": 3.852022171020508 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01612973, + "balance_loss_mlp": 1.03734601, + "epoch": 0.5517510897339546, + "flos": 27740346336000.0, + "grad_norm": 1.7630507090786165, + "language_loss": 0.70797551, + "learning_rate": 1.7624066065456435e-06, + "loss": 0.7293011, + "num_input_tokens_seen": 197741820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9177, + "time_per_iteration": 2.507990837097168 + }, + { + "auxiliary_loss_clip": 0.01109377, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01644588, + "balance_loss_mlp": 1.03980064, + "epoch": 0.5518112129866226, + "flos": 18404470523520.0, + "grad_norm": 1.556329351454275, + "language_loss": 0.80197215, + "learning_rate": 1.7620199073056204e-06, + "loss": 0.82334423, + "num_input_tokens_seen": 197759160, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 9178, + "time_per_iteration": 2.4645802974700928 + }, + { + "auxiliary_loss_clip": 0.01110368, + "auxiliary_loss_mlp": 0.01040375, + "balance_loss_clip": 1.02744687, + "balance_loss_mlp": 1.03942454, + "epoch": 0.5518713362392905, + "flos": 25082670015360.0, + "grad_norm": 1.5358645892565401, + "language_loss": 0.74621391, + "learning_rate": 1.761633217089826e-06, + "loss": 0.7677213, + "num_input_tokens_seen": 197779760, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9179, + "time_per_iteration": 4.023995399475098 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01034445, + "balance_loss_clip": 1.02213681, + "balance_loss_mlp": 1.0385108, + "epoch": 0.5519314594919585, + "flos": 36538999361280.0, + "grad_norm": 1.8924336027697886, + "language_loss": 0.70433038, + "learning_rate": 1.761246535912924e-06, + "loss": 0.72574437, + "num_input_tokens_seen": 197801545, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 9180, + "time_per_iteration": 4.060170650482178 + }, + { + "auxiliary_loss_clip": 0.0110796, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.02197158, + "balance_loss_mlp": 1.03808069, + "epoch": 0.5519915827446265, + "flos": 20448613612800.0, + "grad_norm": 1.9150410275355574, + "language_loss": 0.66870642, + "learning_rate": 1.7608598637895776e-06, + "loss": 0.69012666, + "num_input_tokens_seen": 197820760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9181, + "time_per_iteration": 2.4741644859313965 + }, + { + "auxiliary_loss_clip": 0.01109873, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01514149, + "balance_loss_mlp": 1.03774214, + "epoch": 0.5520517059972945, + "flos": 23768052151680.0, + "grad_norm": 1.9118124234638791, + "language_loss": 0.79398257, + "learning_rate": 1.7604732007344486e-06, + "loss": 0.81536245, + "num_input_tokens_seen": 197840195, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9182, + "time_per_iteration": 2.4744672775268555 + }, + { + "auxiliary_loss_clip": 0.01107607, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.0145787, + "balance_loss_mlp": 1.03817368, + "epoch": 0.5521118292499624, + "flos": 22196457411840.0, + "grad_norm": 1.7815316362256517, + "language_loss": 0.82710314, + "learning_rate": 1.7600865467622003e-06, + "loss": 0.84845054, + "num_input_tokens_seen": 197859475, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9183, + "time_per_iteration": 2.4999542236328125 + }, + { + "auxiliary_loss_clip": 0.01106614, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01474106, + "balance_loss_mlp": 1.03841662, + "epoch": 0.5521719525026304, + "flos": 23583291569280.0, + "grad_norm": 1.3300741669264389, + "language_loss": 0.67200708, + "learning_rate": 1.7596999018874936e-06, + "loss": 0.69333941, + "num_input_tokens_seen": 197879395, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9184, + "time_per_iteration": 2.4747231006622314 + }, + { + "auxiliary_loss_clip": 0.01107758, + "auxiliary_loss_mlp": 0.01025737, + "balance_loss_clip": 1.01336932, + "balance_loss_mlp": 1.03818047, + "epoch": 0.5522320757552983, + "flos": 26137617482880.0, + "grad_norm": 1.521307728440283, + "language_loss": 0.76197934, + "learning_rate": 1.7593132661249917e-06, + "loss": 0.78331435, + "num_input_tokens_seen": 197900815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9185, + "time_per_iteration": 2.534573793411255 + }, + { + "auxiliary_loss_clip": 0.01109207, + "auxiliary_loss_mlp": 0.01034245, + "balance_loss_clip": 1.02194285, + "balance_loss_mlp": 1.0396924, + "epoch": 0.5522921990079663, + "flos": 24676160820480.0, + "grad_norm": 1.6519250451143856, + "language_loss": 0.7376985, + "learning_rate": 1.7589266394893536e-06, + "loss": 0.75913298, + "num_input_tokens_seen": 197918985, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9186, + "time_per_iteration": 2.5148305892944336 + }, + { + "auxiliary_loss_clip": 0.01111442, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.02137351, + "balance_loss_mlp": 1.04041481, + "epoch": 0.5523523222606344, + "flos": 22748153379840.0, + "grad_norm": 2.3297788732806275, + "language_loss": 0.6611231, + "learning_rate": 1.7585400219952421e-06, + "loss": 0.68256783, + "num_input_tokens_seen": 197937725, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.7109375, + "step": 9187, + "time_per_iteration": 2.4953529834747314 + }, + { + "auxiliary_loss_clip": 0.0110884, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01550388, + "balance_loss_mlp": 1.0389905, + "epoch": 0.5524124455133023, + "flos": 19755825022080.0, + "grad_norm": 1.699111440652827, + "language_loss": 0.77629888, + "learning_rate": 1.758153413657318e-06, + "loss": 0.79766524, + "num_input_tokens_seen": 197955635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9188, + "time_per_iteration": 2.4593770503997803 + }, + { + "auxiliary_loss_clip": 0.01105648, + "auxiliary_loss_mlp": 0.01030962, + "balance_loss_clip": 1.01829576, + "balance_loss_mlp": 1.03729725, + "epoch": 0.5524725687659703, + "flos": 23294821443840.0, + "grad_norm": 1.837373875573988, + "language_loss": 0.81666493, + "learning_rate": 1.7577668144902394e-06, + "loss": 0.83803099, + "num_input_tokens_seen": 197974490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 9189, + "time_per_iteration": 2.514223098754883 + }, + { + "auxiliary_loss_clip": 0.01105635, + "auxiliary_loss_mlp": 0.01028064, + "balance_loss_clip": 1.0153625, + "balance_loss_mlp": 1.03796136, + "epoch": 0.5525326920186382, + "flos": 24862178378880.0, + "grad_norm": 1.3687672594772107, + "language_loss": 0.76419669, + "learning_rate": 1.7573802245086684e-06, + "loss": 0.78553367, + "num_input_tokens_seen": 197995735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 9190, + "time_per_iteration": 2.4991939067840576 + }, + { + "auxiliary_loss_clip": 0.01111398, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.01837981, + "balance_loss_mlp": 1.03823757, + "epoch": 0.5525928152713062, + "flos": 13735580906880.0, + "grad_norm": 3.1168017297152484, + "language_loss": 0.78959441, + "learning_rate": 1.7569936437272627e-06, + "loss": 0.81102753, + "num_input_tokens_seen": 198009685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9191, + "time_per_iteration": 2.447239875793457 + }, + { + "auxiliary_loss_clip": 0.01106392, + "auxiliary_loss_mlp": 0.01030056, + "balance_loss_clip": 1.01799178, + "balance_loss_mlp": 1.03781414, + "epoch": 0.5526529385239741, + "flos": 13071592045440.0, + "grad_norm": 2.1697062429363427, + "language_loss": 0.68734175, + "learning_rate": 1.7566070721606829e-06, + "loss": 0.70870626, + "num_input_tokens_seen": 198026845, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 9192, + "time_per_iteration": 2.424194812774658 + }, + { + "auxiliary_loss_clip": 0.01104657, + "auxiliary_loss_mlp": 0.01031796, + "balance_loss_clip": 1.0210079, + "balance_loss_mlp": 1.03741503, + "epoch": 0.5527130617766421, + "flos": 23148377694720.0, + "grad_norm": 1.580245881596358, + "language_loss": 0.77429307, + "learning_rate": 1.756220509823588e-06, + "loss": 0.79565763, + "num_input_tokens_seen": 198045275, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 9193, + "time_per_iteration": 2.486544370651245 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02199149, + "balance_loss_mlp": 1.03775311, + "epoch": 0.55277318502931, + "flos": 21285547482240.0, + "grad_norm": 1.6936547327162281, + "language_loss": 0.78554469, + "learning_rate": 1.7558339567306344e-06, + "loss": 0.80694956, + "num_input_tokens_seen": 198065760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9194, + "time_per_iteration": 2.446010112762451 + }, + { + "auxiliary_loss_clip": 0.01111391, + "auxiliary_loss_mlp": 0.01032697, + "balance_loss_clip": 1.01982856, + "balance_loss_mlp": 1.03737998, + "epoch": 0.5528333082819781, + "flos": 38324549462400.0, + "grad_norm": 1.6547854303314034, + "language_loss": 0.69580936, + "learning_rate": 1.7554474128964825e-06, + "loss": 0.71725023, + "num_input_tokens_seen": 198087595, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73828125, + "step": 9195, + "time_per_iteration": 2.633622407913208 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.01029834, + "balance_loss_clip": 1.01669717, + "balance_loss_mlp": 1.0401336, + "epoch": 0.552893431534646, + "flos": 13553621585280.0, + "grad_norm": 2.085899367605988, + "language_loss": 0.73877811, + "learning_rate": 1.7550608783357887e-06, + "loss": 0.76022422, + "num_input_tokens_seen": 198104620, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.74609375, + "step": 9196, + "time_per_iteration": 2.4477953910827637 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.0202986, + "balance_loss_mlp": 1.03845131, + "epoch": 0.552953554787314, + "flos": 21939408708480.0, + "grad_norm": 1.5760086547957552, + "language_loss": 0.76767844, + "learning_rate": 1.7546743530632115e-06, + "loss": 0.78907609, + "num_input_tokens_seen": 198123565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9197, + "time_per_iteration": 2.4946064949035645 + }, + { + "auxiliary_loss_clip": 0.01104392, + "auxiliary_loss_mlp": 0.0102516, + "balance_loss_clip": 1.01429963, + "balance_loss_mlp": 1.03566051, + "epoch": 0.5530136780399819, + "flos": 43658002558080.0, + "grad_norm": 1.6045583807501234, + "language_loss": 0.76419538, + "learning_rate": 1.754287837093407e-06, + "loss": 0.78549087, + "num_input_tokens_seen": 198148270, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6875, + "step": 9198, + "time_per_iteration": 2.7027511596679688 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.0140028, + "balance_loss_mlp": 1.03652, + "epoch": 0.5530738012926499, + "flos": 25045502417280.0, + "grad_norm": 1.7911524754161214, + "language_loss": 0.79089695, + "learning_rate": 1.7539013304410327e-06, + "loss": 0.81220573, + "num_input_tokens_seen": 198168810, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 9199, + "time_per_iteration": 2.5071682929992676 + }, + { + "auxiliary_loss_clip": 0.01106031, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03667951, + "epoch": 0.553133924545318, + "flos": 16472081623680.0, + "grad_norm": 1.789754163992573, + "language_loss": 0.64116317, + "learning_rate": 1.7535148331207443e-06, + "loss": 0.66252816, + "num_input_tokens_seen": 198186200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9200, + "time_per_iteration": 2.453810214996338 + }, + { + "auxiliary_loss_clip": 0.01112463, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.01444292, + "balance_loss_mlp": 1.03949916, + "epoch": 0.5531940477979859, + "flos": 24606207083520.0, + "grad_norm": 1.54627322023295, + "language_loss": 0.66172588, + "learning_rate": 1.7531283451471978e-06, + "loss": 0.6831286, + "num_input_tokens_seen": 198207050, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9201, + "time_per_iteration": 2.5050048828125 + }, + { + "auxiliary_loss_clip": 0.01110041, + "auxiliary_loss_mlp": 0.01032711, + "balance_loss_clip": 1.02000964, + "balance_loss_mlp": 1.04039264, + "epoch": 0.5532541710506539, + "flos": 22159577122560.0, + "grad_norm": 2.1300156031813624, + "language_loss": 0.60931027, + "learning_rate": 1.7527418665350502e-06, + "loss": 0.63073778, + "num_input_tokens_seen": 198224565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9202, + "time_per_iteration": 2.454374074935913 + }, + { + "auxiliary_loss_clip": 0.01105546, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.01677179, + "balance_loss_mlp": 1.0374378, + "epoch": 0.5533142943033218, + "flos": 21397265758080.0, + "grad_norm": 1.6333926311503897, + "language_loss": 0.64007318, + "learning_rate": 1.7523553972989548e-06, + "loss": 0.66141224, + "num_input_tokens_seen": 198244790, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9203, + "time_per_iteration": 2.520813226699829 + }, + { + "auxiliary_loss_clip": 0.01106796, + "auxiliary_loss_mlp": 0.0102947, + "balance_loss_clip": 1.01762629, + "balance_loss_mlp": 1.03710103, + "epoch": 0.5533744175559898, + "flos": 23550541344000.0, + "grad_norm": 1.5876710884236471, + "language_loss": 0.63839149, + "learning_rate": 1.7519689374535683e-06, + "loss": 0.65975416, + "num_input_tokens_seen": 198264375, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9204, + "time_per_iteration": 2.519796371459961 + }, + { + "auxiliary_loss_clip": 0.01103569, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01617515, + "balance_loss_mlp": 1.0357914, + "epoch": 0.5534345408086577, + "flos": 24061514267520.0, + "grad_norm": 1.7042490030554438, + "language_loss": 0.77431834, + "learning_rate": 1.7515824870135445e-06, + "loss": 0.79562324, + "num_input_tokens_seen": 198283895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 9205, + "time_per_iteration": 2.5149800777435303 + }, + { + "auxiliary_loss_clip": 0.01105223, + "auxiliary_loss_mlp": 0.01029733, + "balance_loss_clip": 1.01799703, + "balance_loss_mlp": 1.03753543, + "epoch": 0.5534946640613257, + "flos": 33771831408000.0, + "grad_norm": 1.5447277527142993, + "language_loss": 0.72338134, + "learning_rate": 1.751196045993537e-06, + "loss": 0.74473095, + "num_input_tokens_seen": 198310035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 9206, + "time_per_iteration": 2.6088132858276367 + }, + { + "auxiliary_loss_clip": 0.01107088, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01891243, + "balance_loss_mlp": 1.03847539, + "epoch": 0.5535547873139937, + "flos": 15159223526400.0, + "grad_norm": 1.9679878300179545, + "language_loss": 0.75601065, + "learning_rate": 1.7508096144082012e-06, + "loss": 0.77738333, + "num_input_tokens_seen": 198327810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 9207, + "time_per_iteration": 2.4550647735595703 + }, + { + "auxiliary_loss_clip": 0.01112139, + "auxiliary_loss_mlp": 0.01031947, + "balance_loss_clip": 1.01861894, + "balance_loss_mlp": 1.03909707, + "epoch": 0.5536149105666617, + "flos": 16980863817600.0, + "grad_norm": 2.4900859433120055, + "language_loss": 0.61790574, + "learning_rate": 1.750423192272189e-06, + "loss": 0.6393466, + "num_input_tokens_seen": 198343150, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73046875, + "step": 9208, + "time_per_iteration": 2.4474070072174072 + }, + { + "auxiliary_loss_clip": 0.01109576, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01908827, + "balance_loss_mlp": 1.03917742, + "epoch": 0.5536750338193296, + "flos": 18149935772160.0, + "grad_norm": 2.138498398763569, + "language_loss": 0.64059991, + "learning_rate": 1.7500367796001547e-06, + "loss": 0.66200066, + "num_input_tokens_seen": 198360925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9209, + "time_per_iteration": 2.49118709564209 + }, + { + "auxiliary_loss_clip": 0.01106938, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02030945, + "balance_loss_mlp": 1.03779769, + "epoch": 0.5537351570719976, + "flos": 22747794243840.0, + "grad_norm": 1.9091325066097349, + "language_loss": 0.8244276, + "learning_rate": 1.7496503764067513e-06, + "loss": 0.84582424, + "num_input_tokens_seen": 198379265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9210, + "time_per_iteration": 2.479508876800537 + }, + { + "auxiliary_loss_clip": 0.01104462, + "auxiliary_loss_mlp": 0.01027145, + "balance_loss_clip": 1.01554608, + "balance_loss_mlp": 1.03640354, + "epoch": 0.5537952803246655, + "flos": 26356026130560.0, + "grad_norm": 1.9903415105614328, + "language_loss": 0.72810864, + "learning_rate": 1.74926398270663e-06, + "loss": 0.74942476, + "num_input_tokens_seen": 198399490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9211, + "time_per_iteration": 2.56174635887146 + }, + { + "auxiliary_loss_clip": 0.01109862, + "auxiliary_loss_mlp": 0.01034269, + "balance_loss_clip": 1.02101886, + "balance_loss_mlp": 1.03795481, + "epoch": 0.5538554035773335, + "flos": 18037427397120.0, + "grad_norm": 1.687820261734967, + "language_loss": 0.66492426, + "learning_rate": 1.7488775985144437e-06, + "loss": 0.68636549, + "num_input_tokens_seen": 198419110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9212, + "time_per_iteration": 2.4493961334228516 + }, + { + "auxiliary_loss_clip": 0.01107628, + "auxiliary_loss_mlp": 0.01032485, + "balance_loss_clip": 1.01846039, + "balance_loss_mlp": 1.03564453, + "epoch": 0.5539155268300014, + "flos": 31686247002240.0, + "grad_norm": 1.478127311181698, + "language_loss": 0.51676697, + "learning_rate": 1.7484912238448443e-06, + "loss": 0.53816813, + "num_input_tokens_seen": 198441360, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.71875, + "step": 9213, + "time_per_iteration": 2.5872037410736084 + }, + { + "auxiliary_loss_clip": 0.01111386, + "auxiliary_loss_mlp": 0.01031335, + "balance_loss_clip": 1.01868105, + "balance_loss_mlp": 1.03979373, + "epoch": 0.5539756500826695, + "flos": 15193769431680.0, + "grad_norm": 1.9151587743929102, + "language_loss": 0.8548407, + "learning_rate": 1.7481048587124827e-06, + "loss": 0.87626791, + "num_input_tokens_seen": 198459835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9214, + "time_per_iteration": 2.4696502685546875 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01027132, + "balance_loss_clip": 1.01570582, + "balance_loss_mlp": 1.03970075, + "epoch": 0.5540357733353375, + "flos": 26353117128960.0, + "grad_norm": 1.700191688942819, + "language_loss": 0.70016778, + "learning_rate": 1.7477185031320108e-06, + "loss": 0.72152174, + "num_input_tokens_seen": 198478955, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9215, + "time_per_iteration": 2.50022029876709 + }, + { + "auxiliary_loss_clip": 0.01109258, + "auxiliary_loss_mlp": 0.01029951, + "balance_loss_clip": 1.01724386, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5540958965880054, + "flos": 21323684747520.0, + "grad_norm": 1.5266679061001223, + "language_loss": 0.73124695, + "learning_rate": 1.7473321571180773e-06, + "loss": 0.75263906, + "num_input_tokens_seen": 198499030, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9216, + "time_per_iteration": 2.4683403968811035 + }, + { + "auxiliary_loss_clip": 0.01105693, + "auxiliary_loss_mlp": 0.01031419, + "balance_loss_clip": 1.01916385, + "balance_loss_mlp": 1.03830385, + "epoch": 0.5541560198406734, + "flos": 25666828899840.0, + "grad_norm": 1.9596921442179602, + "language_loss": 0.71501839, + "learning_rate": 1.7469458206853345e-06, + "loss": 0.73638952, + "num_input_tokens_seen": 198520265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 9217, + "time_per_iteration": 2.542431592941284 + }, + { + "auxiliary_loss_clip": 0.01103432, + "auxiliary_loss_mlp": 0.010249, + "balance_loss_clip": 1.01331282, + "balance_loss_mlp": 1.03553486, + "epoch": 0.5542161430933413, + "flos": 21939624190080.0, + "grad_norm": 1.8113809838055568, + "language_loss": 0.7838676, + "learning_rate": 1.7465594938484315e-06, + "loss": 0.80515093, + "num_input_tokens_seen": 198539645, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9218, + "time_per_iteration": 3.8476054668426514 + }, + { + "auxiliary_loss_clip": 0.01108339, + "auxiliary_loss_mlp": 0.01034768, + "balance_loss_clip": 1.02095163, + "balance_loss_mlp": 1.03540277, + "epoch": 0.5542762663460093, + "flos": 19571459489280.0, + "grad_norm": 2.0355993872839675, + "language_loss": 0.72591358, + "learning_rate": 1.7461731766220176e-06, + "loss": 0.74734467, + "num_input_tokens_seen": 198558710, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9219, + "time_per_iteration": 2.4924545288085938 + }, + { + "auxiliary_loss_clip": 0.01111531, + "auxiliary_loss_mlp": 0.01039554, + "balance_loss_clip": 1.02701962, + "balance_loss_mlp": 1.03986812, + "epoch": 0.5543363895986773, + "flos": 19499063627520.0, + "grad_norm": 1.546677051774663, + "language_loss": 0.71403503, + "learning_rate": 1.7457868690207426e-06, + "loss": 0.73554587, + "num_input_tokens_seen": 198577050, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 9220, + "time_per_iteration": 2.4362480640411377 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.01424217, + "balance_loss_mlp": 1.03777957, + "epoch": 0.5543965128513453, + "flos": 22635609091200.0, + "grad_norm": 1.6357699921116782, + "language_loss": 0.79294407, + "learning_rate": 1.7454005710592547e-06, + "loss": 0.81426674, + "num_input_tokens_seen": 198595290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9221, + "time_per_iteration": 5.3692920207977295 + }, + { + "auxiliary_loss_clip": 0.01107012, + "auxiliary_loss_mlp": 0.01030177, + "balance_loss_clip": 1.01745725, + "balance_loss_mlp": 1.03750253, + "epoch": 0.5544566361040132, + "flos": 25989952671360.0, + "grad_norm": 1.7434924477802918, + "language_loss": 0.83865321, + "learning_rate": 1.7450142827522027e-06, + "loss": 0.86002505, + "num_input_tokens_seen": 198614110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9222, + "time_per_iteration": 2.5054023265838623 + }, + { + "auxiliary_loss_clip": 0.0111308, + "auxiliary_loss_mlp": 0.01034265, + "balance_loss_clip": 1.02092493, + "balance_loss_mlp": 1.04003119, + "epoch": 0.5545167593566812, + "flos": 28257568225920.0, + "grad_norm": 1.7723513069494143, + "language_loss": 0.75498754, + "learning_rate": 1.7446280041142344e-06, + "loss": 0.77646095, + "num_input_tokens_seen": 198633880, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9223, + "time_per_iteration": 2.5140554904937744 + }, + { + "auxiliary_loss_clip": 0.01108507, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.0155921, + "balance_loss_mlp": 1.03917074, + "epoch": 0.5545768826093491, + "flos": 28476551491200.0, + "grad_norm": 1.798104527740367, + "language_loss": 0.81975842, + "learning_rate": 1.7442417351599986e-06, + "loss": 0.84112704, + "num_input_tokens_seen": 198653505, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9224, + "time_per_iteration": 2.5273303985595703 + }, + { + "auxiliary_loss_clip": 0.01110718, + "auxiliary_loss_mlp": 0.01040562, + "balance_loss_clip": 1.02769315, + "balance_loss_mlp": 1.0393647, + "epoch": 0.5546370058620171, + "flos": 18478051534080.0, + "grad_norm": 2.764116317399656, + "language_loss": 0.5700891, + "learning_rate": 1.743855475904141e-06, + "loss": 0.59160185, + "num_input_tokens_seen": 198671890, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9225, + "time_per_iteration": 2.4379100799560547 + }, + { + "auxiliary_loss_clip": 0.01110187, + "auxiliary_loss_mlp": 0.01036326, + "balance_loss_clip": 1.02342129, + "balance_loss_mlp": 1.03836024, + "epoch": 0.554697129114685, + "flos": 22930507751040.0, + "grad_norm": 1.5085866030732613, + "language_loss": 0.67495418, + "learning_rate": 1.7434692263613098e-06, + "loss": 0.69641924, + "num_input_tokens_seen": 198691995, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9226, + "time_per_iteration": 2.4891088008880615 + }, + { + "auxiliary_loss_clip": 0.01107189, + "auxiliary_loss_mlp": 0.01032344, + "balance_loss_clip": 1.01961827, + "balance_loss_mlp": 1.03644681, + "epoch": 0.5547572523673531, + "flos": 21797166850560.0, + "grad_norm": 1.4051697234065024, + "language_loss": 0.74315172, + "learning_rate": 1.7430829865461518e-06, + "loss": 0.76454705, + "num_input_tokens_seen": 198712440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9227, + "time_per_iteration": 2.4678173065185547 + }, + { + "auxiliary_loss_clip": 0.01114145, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.01826084, + "balance_loss_mlp": 1.04228091, + "epoch": 0.5548173756200211, + "flos": 22342829333760.0, + "grad_norm": 2.5448731753452405, + "language_loss": 0.73452151, + "learning_rate": 1.7426967564733118e-06, + "loss": 0.75597215, + "num_input_tokens_seen": 198731515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9228, + "time_per_iteration": 2.4851813316345215 + }, + { + "auxiliary_loss_clip": 0.01110082, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01803231, + "balance_loss_mlp": 1.03902888, + "epoch": 0.554877498872689, + "flos": 17858736213120.0, + "grad_norm": 2.153919283771507, + "language_loss": 0.76069826, + "learning_rate": 1.7423105361574373e-06, + "loss": 0.7821005, + "num_input_tokens_seen": 198749750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 9229, + "time_per_iteration": 2.4682509899139404 + }, + { + "auxiliary_loss_clip": 0.01110192, + "auxiliary_loss_mlp": 0.01039645, + "balance_loss_clip": 1.02623343, + "balance_loss_mlp": 1.03956127, + "epoch": 0.554937622125357, + "flos": 17238343484160.0, + "grad_norm": 1.3529022003633056, + "language_loss": 0.68695533, + "learning_rate": 1.741924325613172e-06, + "loss": 0.70845366, + "num_input_tokens_seen": 198768320, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 9230, + "time_per_iteration": 2.4558916091918945 + }, + { + "auxiliary_loss_clip": 0.01110086, + "auxiliary_loss_mlp": 0.01033326, + "balance_loss_clip": 1.02054107, + "balance_loss_mlp": 1.03759503, + "epoch": 0.5549977453780249, + "flos": 25368087484800.0, + "grad_norm": 2.0513203800368327, + "language_loss": 0.67574155, + "learning_rate": 1.741538124855163e-06, + "loss": 0.69717568, + "num_input_tokens_seen": 198787230, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9231, + "time_per_iteration": 2.4816246032714844 + }, + { + "auxiliary_loss_clip": 0.01113542, + "auxiliary_loss_mlp": 0.01035024, + "balance_loss_clip": 1.02160072, + "balance_loss_mlp": 1.03941798, + "epoch": 0.555057868630693, + "flos": 25079114568960.0, + "grad_norm": 1.5458592279354035, + "language_loss": 0.77953124, + "learning_rate": 1.7411519338980548e-06, + "loss": 0.80101693, + "num_input_tokens_seen": 198806720, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9232, + "time_per_iteration": 2.5161256790161133 + }, + { + "auxiliary_loss_clip": 0.01106102, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02622199, + "balance_loss_mlp": 1.03777027, + "epoch": 0.5551179918833609, + "flos": 26104220812800.0, + "grad_norm": 1.5305081634070101, + "language_loss": 0.82585824, + "learning_rate": 1.7407657527564898e-06, + "loss": 0.84729433, + "num_input_tokens_seen": 198826235, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 9233, + "time_per_iteration": 2.513498306274414 + }, + { + "auxiliary_loss_clip": 0.01112184, + "auxiliary_loss_mlp": 0.01039099, + "balance_loss_clip": 1.02717805, + "balance_loss_mlp": 1.03902006, + "epoch": 0.5551781151360289, + "flos": 19384759572480.0, + "grad_norm": 2.1768956460608053, + "language_loss": 0.75171268, + "learning_rate": 1.7403795814451142e-06, + "loss": 0.77322543, + "num_input_tokens_seen": 198842655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.734375, + "step": 9234, + "time_per_iteration": 2.4618585109710693 + }, + { + "auxiliary_loss_clip": 0.01105123, + "auxiliary_loss_mlp": 0.01028385, + "balance_loss_clip": 1.01663136, + "balance_loss_mlp": 1.03685272, + "epoch": 0.5552382383886968, + "flos": 21725956137600.0, + "grad_norm": 2.1362991517660146, + "language_loss": 0.64992738, + "learning_rate": 1.7399934199785706e-06, + "loss": 0.6712625, + "num_input_tokens_seen": 198861210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9235, + "time_per_iteration": 2.4449851512908936 + }, + { + "auxiliary_loss_clip": 0.01108941, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.01977587, + "balance_loss_mlp": 1.03794515, + "epoch": 0.5552983616413648, + "flos": 14356189117440.0, + "grad_norm": 1.8479272776295672, + "language_loss": 0.67863953, + "learning_rate": 1.7396072683715029e-06, + "loss": 0.70005023, + "num_input_tokens_seen": 198880045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 9236, + "time_per_iteration": 2.4798662662506104 + }, + { + "auxiliary_loss_clip": 0.01104311, + "auxiliary_loss_mlp": 0.01024908, + "balance_loss_clip": 1.0127244, + "balance_loss_mlp": 1.03731084, + "epoch": 0.5553584848940327, + "flos": 25478548784640.0, + "grad_norm": 3.129052058582791, + "language_loss": 0.86174095, + "learning_rate": 1.7392211266385536e-06, + "loss": 0.88303316, + "num_input_tokens_seen": 198900210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9237, + "time_per_iteration": 2.4789483547210693 + }, + { + "auxiliary_loss_clip": 0.01104495, + "auxiliary_loss_mlp": 0.01032906, + "balance_loss_clip": 1.02062178, + "balance_loss_mlp": 1.03669763, + "epoch": 0.5554186081467007, + "flos": 22163850840960.0, + "grad_norm": 1.712591160520522, + "language_loss": 0.73281908, + "learning_rate": 1.7388349947943652e-06, + "loss": 0.75419307, + "num_input_tokens_seen": 198919055, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9238, + "time_per_iteration": 2.4812166690826416 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.0103234, + "balance_loss_clip": 1.01997221, + "balance_loss_mlp": 1.03750467, + "epoch": 0.5554787313993687, + "flos": 49746656125440.0, + "grad_norm": 1.5735650405734192, + "language_loss": 0.78268331, + "learning_rate": 1.73844887285358e-06, + "loss": 0.80410492, + "num_input_tokens_seen": 198943505, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9239, + "time_per_iteration": 2.6846883296966553 + }, + { + "auxiliary_loss_clip": 0.01107901, + "auxiliary_loss_mlp": 0.0102797, + "balance_loss_clip": 1.01580429, + "balance_loss_mlp": 1.03730011, + "epoch": 0.5555388546520367, + "flos": 22127365601280.0, + "grad_norm": 1.4802036052022307, + "language_loss": 0.79760826, + "learning_rate": 1.7380627608308393e-06, + "loss": 0.81896698, + "num_input_tokens_seen": 198963590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9240, + "time_per_iteration": 2.4733242988586426 + }, + { + "auxiliary_loss_clip": 0.01107185, + "auxiliary_loss_mlp": 0.01032369, + "balance_loss_clip": 1.02035236, + "balance_loss_mlp": 1.0374887, + "epoch": 0.5555989779047047, + "flos": 24682122478080.0, + "grad_norm": 1.5810234034759716, + "language_loss": 0.6520583, + "learning_rate": 1.737676658740786e-06, + "loss": 0.67345387, + "num_input_tokens_seen": 198982680, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9241, + "time_per_iteration": 2.4733994007110596 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.01974106, + "balance_loss_mlp": 1.03843307, + "epoch": 0.5556591011573726, + "flos": 16106510954880.0, + "grad_norm": 1.9354963557050642, + "language_loss": 0.72742647, + "learning_rate": 1.7372905665980594e-06, + "loss": 0.74883944, + "num_input_tokens_seen": 199000185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9242, + "time_per_iteration": 2.439195394515991 + }, + { + "auxiliary_loss_clip": 0.01109113, + "auxiliary_loss_mlp": 0.0103746, + "balance_loss_clip": 1.02429366, + "balance_loss_mlp": 1.03737354, + "epoch": 0.5557192244100406, + "flos": 12933695733120.0, + "grad_norm": 1.6615305539564786, + "language_loss": 0.63989079, + "learning_rate": 1.7369044844173012e-06, + "loss": 0.66135651, + "num_input_tokens_seen": 199018380, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9243, + "time_per_iteration": 2.5009653568267822 + }, + { + "auxiliary_loss_clip": 0.01109943, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01894689, + "balance_loss_mlp": 1.03998828, + "epoch": 0.5557793476627085, + "flos": 23111712887040.0, + "grad_norm": 1.8112849174534187, + "language_loss": 0.75149089, + "learning_rate": 1.7365184122131509e-06, + "loss": 0.77290273, + "num_input_tokens_seen": 199037115, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9244, + "time_per_iteration": 2.475520610809326 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.01693511, + "balance_loss_mlp": 1.03605533, + "epoch": 0.5558394709153766, + "flos": 21428040735360.0, + "grad_norm": 2.1432873648263473, + "language_loss": 0.74578094, + "learning_rate": 1.7361323500002486e-06, + "loss": 0.76708734, + "num_input_tokens_seen": 199053375, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9245, + "time_per_iteration": 2.45875883102417 + }, + { + "auxiliary_loss_clip": 0.01111156, + "auxiliary_loss_mlp": 0.01031213, + "balance_loss_clip": 1.01832068, + "balance_loss_mlp": 1.03885865, + "epoch": 0.5558995941680445, + "flos": 25078324469760.0, + "grad_norm": 2.0585608296199, + "language_loss": 0.79468071, + "learning_rate": 1.7357462977932348e-06, + "loss": 0.81610441, + "num_input_tokens_seen": 199070930, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 9246, + "time_per_iteration": 2.5065393447875977 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01032421, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03894639, + "epoch": 0.5559597174207125, + "flos": 20011149872640.0, + "grad_norm": 1.99088564820557, + "language_loss": 0.73864704, + "learning_rate": 1.7353602556067471e-06, + "loss": 0.76005793, + "num_input_tokens_seen": 199088675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9247, + "time_per_iteration": 2.535578489303589 + }, + { + "auxiliary_loss_clip": 0.01108657, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.02047944, + "balance_loss_mlp": 1.03822637, + "epoch": 0.5560198406733804, + "flos": 16835677044480.0, + "grad_norm": 3.9448346084731214, + "language_loss": 0.76161623, + "learning_rate": 1.7349742234554254e-06, + "loss": 0.78303373, + "num_input_tokens_seen": 199103075, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9248, + "time_per_iteration": 2.4247324466705322 + }, + { + "auxiliary_loss_clip": 0.01031453, + "auxiliary_loss_mlp": 0.01002871, + "balance_loss_clip": 1.00163698, + "balance_loss_mlp": 1.00995636, + "epoch": 0.5560799639260484, + "flos": 70697051758080.0, + "grad_norm": 0.8418132845618771, + "language_loss": 0.59482312, + "learning_rate": 1.7345882013539081e-06, + "loss": 0.61516631, + "num_input_tokens_seen": 199160325, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21484375, + "step": 9249, + "time_per_iteration": 3.1760778427124023 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01027363, + "balance_loss_clip": 1.01514411, + "balance_loss_mlp": 1.03505003, + "epoch": 0.5561400871787163, + "flos": 23148593176320.0, + "grad_norm": 1.8510226601540976, + "language_loss": 0.79942709, + "learning_rate": 1.734202189316832e-06, + "loss": 0.82074124, + "num_input_tokens_seen": 199179760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9250, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01107715, + "auxiliary_loss_mlp": 0.01032139, + "balance_loss_clip": 1.01952708, + "balance_loss_mlp": 1.03654897, + "epoch": 0.5562002104313843, + "flos": 17566423332480.0, + "grad_norm": 2.627943398678235, + "language_loss": 0.68456143, + "learning_rate": 1.733816187358836e-06, + "loss": 0.70596004, + "num_input_tokens_seen": 199196695, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9251, + "time_per_iteration": 2.4627792835235596 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01753998, + "balance_loss_mlp": 1.03680301, + "epoch": 0.5562603336840523, + "flos": 25045430590080.0, + "grad_norm": 1.9270315036455492, + "language_loss": 0.75472188, + "learning_rate": 1.7334301954945569e-06, + "loss": 0.77608371, + "num_input_tokens_seen": 199217845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9252, + "time_per_iteration": 2.5131149291992188 + }, + { + "auxiliary_loss_clip": 0.01109989, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02020955, + "balance_loss_mlp": 1.0379473, + "epoch": 0.5563204569367203, + "flos": 29059022436480.0, + "grad_norm": 1.5243167641625328, + "language_loss": 0.72841972, + "learning_rate": 1.7330442137386313e-06, + "loss": 0.74984354, + "num_input_tokens_seen": 199239250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9253, + "time_per_iteration": 2.545469045639038 + }, + { + "auxiliary_loss_clip": 0.01108615, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01835763, + "balance_loss_mlp": 1.03873754, + "epoch": 0.5563805801893883, + "flos": 22090449398400.0, + "grad_norm": 1.7630844010149394, + "language_loss": 0.8319999, + "learning_rate": 1.7326582421056965e-06, + "loss": 0.85338461, + "num_input_tokens_seen": 199258320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 9254, + "time_per_iteration": 2.4762439727783203 + }, + { + "auxiliary_loss_clip": 0.01028463, + "auxiliary_loss_mlp": 0.00998119, + "balance_loss_clip": 0.99699229, + "balance_loss_mlp": 1.00661826, + "epoch": 0.5564407034420562, + "flos": 58636128689280.0, + "grad_norm": 0.880020971367601, + "language_loss": 0.64831799, + "learning_rate": 1.732272280610387e-06, + "loss": 0.66858381, + "num_input_tokens_seen": 199314840, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21875, + "step": 9255, + "time_per_iteration": 2.894592523574829 + }, + { + "auxiliary_loss_clip": 0.01108855, + "auxiliary_loss_mlp": 0.01034937, + "balance_loss_clip": 1.02330816, + "balance_loss_mlp": 1.04103208, + "epoch": 0.5565008266947242, + "flos": 23112323418240.0, + "grad_norm": 1.9305562864951415, + "language_loss": 0.69224131, + "learning_rate": 1.7318863292673399e-06, + "loss": 0.71367919, + "num_input_tokens_seen": 199335405, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9256, + "time_per_iteration": 2.489379644393921 + }, + { + "auxiliary_loss_clip": 0.01102517, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01805019, + "balance_loss_mlp": 1.03555584, + "epoch": 0.5565609499473921, + "flos": 21578399066880.0, + "grad_norm": 1.531147439374393, + "language_loss": 0.75793779, + "learning_rate": 1.73150038809119e-06, + "loss": 0.77925408, + "num_input_tokens_seen": 199354345, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 9257, + "time_per_iteration": 2.484574794769287 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02477169, + "balance_loss_mlp": 1.03559875, + "epoch": 0.5566210732000602, + "flos": 18369637309440.0, + "grad_norm": 4.5210433992726635, + "language_loss": 0.61403644, + "learning_rate": 1.7311144570965724e-06, + "loss": 0.63546175, + "num_input_tokens_seen": 199372250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.70703125, + "step": 9258, + "time_per_iteration": 2.4358863830566406 + }, + { + "auxiliary_loss_clip": 0.0110731, + "auxiliary_loss_mlp": 0.01032708, + "balance_loss_clip": 1.01988161, + "balance_loss_mlp": 1.0372082, + "epoch": 0.5566811964527281, + "flos": 25703350053120.0, + "grad_norm": 1.630618195357818, + "language_loss": 0.79231477, + "learning_rate": 1.7307285362981215e-06, + "loss": 0.81371492, + "num_input_tokens_seen": 199392815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9259, + "time_per_iteration": 3.931269884109497 + }, + { + "auxiliary_loss_clip": 0.01106185, + "auxiliary_loss_mlp": 0.01031095, + "balance_loss_clip": 1.01859045, + "balance_loss_mlp": 1.03665948, + "epoch": 0.5567413197053961, + "flos": 26943991856640.0, + "grad_norm": 1.9981692343252953, + "language_loss": 0.81332636, + "learning_rate": 1.7303426257104712e-06, + "loss": 0.83469915, + "num_input_tokens_seen": 199412375, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9260, + "time_per_iteration": 2.5092766284942627 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02450144, + "balance_loss_mlp": 1.03862071, + "epoch": 0.556801442958064, + "flos": 20850597694080.0, + "grad_norm": 1.4782542821591422, + "language_loss": 0.68771613, + "learning_rate": 1.729956725348256e-06, + "loss": 0.70917082, + "num_input_tokens_seen": 199431490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 9261, + "time_per_iteration": 2.5739381313323975 + }, + { + "auxiliary_loss_clip": 0.01027391, + "auxiliary_loss_mlp": 0.01004087, + "balance_loss_clip": 1.00296021, + "balance_loss_mlp": 1.00587916, + "epoch": 0.556861566210732, + "flos": 70498213044480.0, + "grad_norm": 0.7282105219345391, + "language_loss": 0.61132908, + "learning_rate": 1.729570835226108e-06, + "loss": 0.63164389, + "num_input_tokens_seen": 199495855, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21484375, + "step": 9262, + "time_per_iteration": 5.870652675628662 + }, + { + "auxiliary_loss_clip": 0.01108355, + "auxiliary_loss_mlp": 0.01033188, + "balance_loss_clip": 1.02145159, + "balance_loss_mlp": 1.0379622, + "epoch": 0.5569216894633999, + "flos": 25337276593920.0, + "grad_norm": 1.6754840031905727, + "language_loss": 0.64504874, + "learning_rate": 1.7291849553586622e-06, + "loss": 0.66646421, + "num_input_tokens_seen": 199515870, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9263, + "time_per_iteration": 3.9533426761627197 + }, + { + "auxiliary_loss_clip": 0.01107431, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02047563, + "balance_loss_mlp": 1.03795195, + "epoch": 0.556981812716068, + "flos": 22638733574400.0, + "grad_norm": 2.058460487271679, + "language_loss": 0.73137188, + "learning_rate": 1.7287990857605497e-06, + "loss": 0.75277007, + "num_input_tokens_seen": 199535745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9264, + "time_per_iteration": 2.493511199951172 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01031978, + "balance_loss_clip": 1.02008092, + "balance_loss_mlp": 1.04015422, + "epoch": 0.5570419359687359, + "flos": 11035852738560.0, + "grad_norm": 1.9025948017547305, + "language_loss": 0.75953865, + "learning_rate": 1.7284132264464022e-06, + "loss": 0.78095955, + "num_input_tokens_seen": 199554035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9265, + "time_per_iteration": 2.4533309936523438 + }, + { + "auxiliary_loss_clip": 0.01103692, + "auxiliary_loss_mlp": 0.01030017, + "balance_loss_clip": 1.01909113, + "balance_loss_mlp": 1.03774786, + "epoch": 0.5571020592214039, + "flos": 22823135020800.0, + "grad_norm": 1.366142740242795, + "language_loss": 0.7096293, + "learning_rate": 1.7280273774308536e-06, + "loss": 0.73096645, + "num_input_tokens_seen": 199576120, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 9266, + "time_per_iteration": 2.5045597553253174 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01033928, + "balance_loss_clip": 1.02204871, + "balance_loss_mlp": 1.03720617, + "epoch": 0.5571621824740719, + "flos": 22927778317440.0, + "grad_norm": 1.7291111077620351, + "language_loss": 0.681355, + "learning_rate": 1.727641538728533e-06, + "loss": 0.7027576, + "num_input_tokens_seen": 199593780, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9267, + "time_per_iteration": 2.5197811126708984 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02367473, + "balance_loss_mlp": 1.03763127, + "epoch": 0.5572223057267398, + "flos": 22966705681920.0, + "grad_norm": 1.9159467095237732, + "language_loss": 0.74278724, + "learning_rate": 1.7272557103540736e-06, + "loss": 0.76417124, + "num_input_tokens_seen": 199613220, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 9268, + "time_per_iteration": 2.489957332611084 + }, + { + "auxiliary_loss_clip": 0.01105844, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.0188365, + "balance_loss_mlp": 1.03773642, + "epoch": 0.5572824289794078, + "flos": 20960053413120.0, + "grad_norm": 2.490438410193009, + "language_loss": 0.7539283, + "learning_rate": 1.726869892322104e-06, + "loss": 0.77528816, + "num_input_tokens_seen": 199632085, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 9269, + "time_per_iteration": 2.5165016651153564 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.01847041, + "balance_loss_mlp": 1.0366416, + "epoch": 0.5573425522320757, + "flos": 25042413847680.0, + "grad_norm": 1.5593232015543566, + "language_loss": 0.82527506, + "learning_rate": 1.726484084647256e-06, + "loss": 0.84663379, + "num_input_tokens_seen": 199649295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 9270, + "time_per_iteration": 2.495546579360962 + }, + { + "auxiliary_loss_clip": 0.01107465, + "auxiliary_loss_mlp": 0.01031385, + "balance_loss_clip": 1.01927948, + "balance_loss_mlp": 1.03695226, + "epoch": 0.5574026754847438, + "flos": 23659637927040.0, + "grad_norm": 2.4402155421947485, + "language_loss": 0.79217434, + "learning_rate": 1.7260982873441591e-06, + "loss": 0.81356287, + "num_input_tokens_seen": 199668870, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9271, + "time_per_iteration": 2.5050055980682373 + }, + { + "auxiliary_loss_clip": 0.01107417, + "auxiliary_loss_mlp": 0.01031316, + "balance_loss_clip": 1.01938963, + "balance_loss_mlp": 1.03778744, + "epoch": 0.5574627987374117, + "flos": 24782240661120.0, + "grad_norm": 1.994384891359262, + "language_loss": 0.90424085, + "learning_rate": 1.725712500427442e-06, + "loss": 0.92562819, + "num_input_tokens_seen": 199684870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9272, + "time_per_iteration": 2.455949068069458 + }, + { + "auxiliary_loss_clip": 0.0110516, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01864076, + "balance_loss_mlp": 1.03754234, + "epoch": 0.5575229219900797, + "flos": 21834944979840.0, + "grad_norm": 1.979276269767202, + "language_loss": 0.83862162, + "learning_rate": 1.7253267239117347e-06, + "loss": 0.85997909, + "num_input_tokens_seen": 199701975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9273, + "time_per_iteration": 2.4802021980285645 + }, + { + "auxiliary_loss_clip": 0.01108902, + "auxiliary_loss_mlp": 0.01041124, + "balance_loss_clip": 1.02752197, + "balance_loss_mlp": 1.03908944, + "epoch": 0.5575830452427476, + "flos": 27815148408960.0, + "grad_norm": 2.0454885443684905, + "language_loss": 0.73996758, + "learning_rate": 1.7249409578116655e-06, + "loss": 0.76146781, + "num_input_tokens_seen": 199721865, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 9274, + "time_per_iteration": 2.4761173725128174 + }, + { + "auxiliary_loss_clip": 0.01116526, + "auxiliary_loss_mlp": 0.01034454, + "balance_loss_clip": 1.02121544, + "balance_loss_mlp": 1.04015088, + "epoch": 0.5576431684954156, + "flos": 17812805696640.0, + "grad_norm": 2.9773966002159824, + "language_loss": 0.78126067, + "learning_rate": 1.7245552021418629e-06, + "loss": 0.8027705, + "num_input_tokens_seen": 199736455, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.765625, + "step": 9275, + "time_per_iteration": 2.4496877193450928 + }, + { + "auxiliary_loss_clip": 0.01106389, + "auxiliary_loss_mlp": 0.01029467, + "balance_loss_clip": 1.01745057, + "balance_loss_mlp": 1.03767419, + "epoch": 0.5577032917480835, + "flos": 15486872411520.0, + "grad_norm": 1.6885485925360224, + "language_loss": 0.74829316, + "learning_rate": 1.7241694569169546e-06, + "loss": 0.76965177, + "num_input_tokens_seen": 199753125, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9276, + "time_per_iteration": 2.413726806640625 + }, + { + "auxiliary_loss_clip": 0.0110324, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01978803, + "balance_loss_mlp": 1.03508329, + "epoch": 0.5577634150007516, + "flos": 21579763783680.0, + "grad_norm": 1.7672131346084554, + "language_loss": 0.75013113, + "learning_rate": 1.7237837221515678e-06, + "loss": 0.77147532, + "num_input_tokens_seen": 199771365, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9277, + "time_per_iteration": 2.4982142448425293 + }, + { + "auxiliary_loss_clip": 0.01102538, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02155328, + "balance_loss_mlp": 1.03504467, + "epoch": 0.5578235382534195, + "flos": 21139750177920.0, + "grad_norm": 1.8714980055762023, + "language_loss": 0.71817064, + "learning_rate": 1.7233979978603304e-06, + "loss": 0.73952222, + "num_input_tokens_seen": 199790035, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 9278, + "time_per_iteration": 2.4389007091522217 + }, + { + "auxiliary_loss_clip": 0.01109043, + "auxiliary_loss_mlp": 0.01034833, + "balance_loss_clip": 1.02185118, + "balance_loss_mlp": 1.0372287, + "epoch": 0.5578836615060875, + "flos": 26505199313280.0, + "grad_norm": 1.6538282955120047, + "language_loss": 0.75750679, + "learning_rate": 1.723012284057868e-06, + "loss": 0.77894545, + "num_input_tokens_seen": 199811125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71875, + "step": 9279, + "time_per_iteration": 2.5255484580993652 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029428, + "balance_loss_clip": 1.01767397, + "balance_loss_mlp": 1.03544426, + "epoch": 0.5579437847587555, + "flos": 20153786780160.0, + "grad_norm": 2.2545627368714034, + "language_loss": 0.67431748, + "learning_rate": 1.7226265807588082e-06, + "loss": 0.69566512, + "num_input_tokens_seen": 199829915, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9280, + "time_per_iteration": 2.5258350372314453 + }, + { + "auxiliary_loss_clip": 0.01107802, + "auxiliary_loss_mlp": 0.01037985, + "balance_loss_clip": 1.02595139, + "balance_loss_mlp": 1.03626418, + "epoch": 0.5580039080114234, + "flos": 26102281478400.0, + "grad_norm": 1.676674952402485, + "language_loss": 0.72964156, + "learning_rate": 1.7222408879777763e-06, + "loss": 0.75109941, + "num_input_tokens_seen": 199850670, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 9281, + "time_per_iteration": 2.505610466003418 + }, + { + "auxiliary_loss_clip": 0.01106676, + "auxiliary_loss_mlp": 0.01030885, + "balance_loss_clip": 1.01922011, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5580640312640914, + "flos": 13771671096960.0, + "grad_norm": 2.9649443100281627, + "language_loss": 0.75254506, + "learning_rate": 1.7218552057293974e-06, + "loss": 0.77392066, + "num_input_tokens_seen": 199867645, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9282, + "time_per_iteration": 2.444455623626709 + }, + { + "auxiliary_loss_clip": 0.01104903, + "auxiliary_loss_mlp": 0.01026903, + "balance_loss_clip": 1.01507115, + "balance_loss_mlp": 1.03695285, + "epoch": 0.5581241545167593, + "flos": 17675986792320.0, + "grad_norm": 1.6849195839549764, + "language_loss": 0.66588777, + "learning_rate": 1.721469534028297e-06, + "loss": 0.68720585, + "num_input_tokens_seen": 199886320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9283, + "time_per_iteration": 2.4668378829956055 + }, + { + "auxiliary_loss_clip": 0.01105958, + "auxiliary_loss_mlp": 0.01025736, + "balance_loss_clip": 1.01500154, + "balance_loss_mlp": 1.03703356, + "epoch": 0.5581842777694274, + "flos": 19569161018880.0, + "grad_norm": 2.7565054625366305, + "language_loss": 0.8290503, + "learning_rate": 1.7210838728890994e-06, + "loss": 0.85036725, + "num_input_tokens_seen": 199904895, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9284, + "time_per_iteration": 2.430774688720703 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01029119, + "balance_loss_clip": 1.01653099, + "balance_loss_mlp": 1.03554368, + "epoch": 0.5582444010220953, + "flos": 20595165102720.0, + "grad_norm": 2.3933521300057836, + "language_loss": 0.85047686, + "learning_rate": 1.7206982223264304e-06, + "loss": 0.87182522, + "num_input_tokens_seen": 199921090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9285, + "time_per_iteration": 2.4788479804992676 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031556, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03531575, + "epoch": 0.5583045242747633, + "flos": 19135504120320.0, + "grad_norm": 3.198131799092361, + "language_loss": 0.73653531, + "learning_rate": 1.720312582354912e-06, + "loss": 0.75790572, + "num_input_tokens_seen": 199939925, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.703125, + "step": 9286, + "time_per_iteration": 2.439715623855591 + }, + { + "auxiliary_loss_clip": 0.01107925, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01946068, + "balance_loss_mlp": 1.03781044, + "epoch": 0.5583646475274312, + "flos": 27454569730560.0, + "grad_norm": 1.684452503968906, + "language_loss": 0.74169838, + "learning_rate": 1.7199269529891684e-06, + "loss": 0.76308966, + "num_input_tokens_seen": 199960015, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 9287, + "time_per_iteration": 2.534813642501831 + }, + { + "auxiliary_loss_clip": 0.01112227, + "auxiliary_loss_mlp": 0.01030459, + "balance_loss_clip": 1.01780486, + "balance_loss_mlp": 1.03982437, + "epoch": 0.5584247707800992, + "flos": 23653784010240.0, + "grad_norm": 2.339953652318452, + "language_loss": 0.75018406, + "learning_rate": 1.7195413342438233e-06, + "loss": 0.77161086, + "num_input_tokens_seen": 199980505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9288, + "time_per_iteration": 2.470242977142334 + }, + { + "auxiliary_loss_clip": 0.01109468, + "auxiliary_loss_mlp": 0.01037482, + "balance_loss_clip": 1.0241785, + "balance_loss_mlp": 1.03922033, + "epoch": 0.5584848940327671, + "flos": 13698880185600.0, + "grad_norm": 1.8804248151935914, + "language_loss": 0.77241838, + "learning_rate": 1.7191557261334984e-06, + "loss": 0.79388785, + "num_input_tokens_seen": 199999020, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 9289, + "time_per_iteration": 2.5357422828674316 + }, + { + "auxiliary_loss_clip": 0.01112615, + "auxiliary_loss_mlp": 0.01031708, + "balance_loss_clip": 1.01918483, + "balance_loss_mlp": 1.03802335, + "epoch": 0.5585450172854352, + "flos": 27016208150400.0, + "grad_norm": 1.7341259817318901, + "language_loss": 0.61310709, + "learning_rate": 1.718770128672817e-06, + "loss": 0.63455033, + "num_input_tokens_seen": 200019020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.74609375, + "step": 9290, + "time_per_iteration": 2.479149580001831 + }, + { + "auxiliary_loss_clip": 0.01107208, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01678467, + "balance_loss_mlp": 1.03602409, + "epoch": 0.5586051405381031, + "flos": 23185653033600.0, + "grad_norm": 1.9512495779204855, + "language_loss": 0.67988908, + "learning_rate": 1.7183845418764e-06, + "loss": 0.70124876, + "num_input_tokens_seen": 200038110, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9291, + "time_per_iteration": 2.4684019088745117 + }, + { + "auxiliary_loss_clip": 0.01108099, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.0363071, + "epoch": 0.5586652637907711, + "flos": 20775544225920.0, + "grad_norm": 2.2522167745355524, + "language_loss": 0.83802187, + "learning_rate": 1.7179989657588698e-06, + "loss": 0.85943532, + "num_input_tokens_seen": 200056210, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9292, + "time_per_iteration": 2.550994873046875 + }, + { + "auxiliary_loss_clip": 0.01104675, + "auxiliary_loss_mlp": 0.01033633, + "balance_loss_clip": 1.02180171, + "balance_loss_mlp": 1.03674221, + "epoch": 0.5587253870434391, + "flos": 28219897837440.0, + "grad_norm": 1.8368239448999808, + "language_loss": 0.73363894, + "learning_rate": 1.7176134003348476e-06, + "loss": 0.75502205, + "num_input_tokens_seen": 200075620, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9293, + "time_per_iteration": 2.5334718227386475 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.01031653, + "balance_loss_clip": 1.02023864, + "balance_loss_mlp": 1.03715324, + "epoch": 0.558785510296107, + "flos": 26615732440320.0, + "grad_norm": 1.6770372644425844, + "language_loss": 0.7251429, + "learning_rate": 1.7172278456189523e-06, + "loss": 0.7465046, + "num_input_tokens_seen": 200095945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9294, + "time_per_iteration": 2.4782567024230957 + }, + { + "auxiliary_loss_clip": 0.01107679, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01867914, + "balance_loss_mlp": 1.03769052, + "epoch": 0.558845633548775, + "flos": 20156767608960.0, + "grad_norm": 2.2895769976939437, + "language_loss": 0.68138099, + "learning_rate": 1.716842301625806e-06, + "loss": 0.70276403, + "num_input_tokens_seen": 200114185, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 9295, + "time_per_iteration": 2.433671474456787 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103174, + "balance_loss_clip": 1.01949131, + "balance_loss_mlp": 1.03873825, + "epoch": 0.5589057568014429, + "flos": 24350774492160.0, + "grad_norm": 1.7275865639530346, + "language_loss": 0.80619705, + "learning_rate": 1.7164567683700281e-06, + "loss": 0.82760113, + "num_input_tokens_seen": 200135030, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 9296, + "time_per_iteration": 2.4831361770629883 + }, + { + "auxiliary_loss_clip": 0.01106832, + "auxiliary_loss_mlp": 0.01031695, + "balance_loss_clip": 1.0200243, + "balance_loss_mlp": 1.03788233, + "epoch": 0.558965880054111, + "flos": 21105168359040.0, + "grad_norm": 1.8948732644892212, + "language_loss": 0.65465128, + "learning_rate": 1.7160712458662379e-06, + "loss": 0.67603648, + "num_input_tokens_seen": 200154290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 9297, + "time_per_iteration": 2.4711036682128906 + }, + { + "auxiliary_loss_clip": 0.01109853, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02202153, + "balance_loss_mlp": 1.03785491, + "epoch": 0.5590260033067789, + "flos": 18436071513600.0, + "grad_norm": 1.6800872146948855, + "language_loss": 0.7513994, + "learning_rate": 1.7156857341290544e-06, + "loss": 0.77284867, + "num_input_tokens_seen": 200171555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9298, + "time_per_iteration": 2.421066999435425 + }, + { + "auxiliary_loss_clip": 0.01031879, + "auxiliary_loss_mlp": 0.01018081, + "balance_loss_clip": 1.01695406, + "balance_loss_mlp": 1.01014686, + "epoch": 0.5590861265594469, + "flos": 70577432490240.0, + "grad_norm": 0.6830476030131911, + "language_loss": 0.52463478, + "learning_rate": 1.7153002331730967e-06, + "loss": 0.54513437, + "num_input_tokens_seen": 200237010, + "router_z_loss_clip": 0.0112915, + "router_z_loss_mlp": 0.21777344, + "step": 9299, + "time_per_iteration": 3.096731424331665 + }, + { + "auxiliary_loss_clip": 0.0110307, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01799822, + "balance_loss_mlp": 1.03608131, + "epoch": 0.5591462498121148, + "flos": 30664408896000.0, + "grad_norm": 1.8758260689947703, + "language_loss": 0.68378884, + "learning_rate": 1.7149147430129824e-06, + "loss": 0.70511478, + "num_input_tokens_seen": 200260820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9300, + "time_per_iteration": 2.5355281829833984 + }, + { + "auxiliary_loss_clip": 0.01107824, + "auxiliary_loss_mlp": 0.01040798, + "balance_loss_clip": 1.02798903, + "balance_loss_mlp": 1.0372839, + "epoch": 0.5592063730647828, + "flos": 18150438562560.0, + "grad_norm": 1.868740801794004, + "language_loss": 0.81233132, + "learning_rate": 1.7145292636633293e-06, + "loss": 0.83381754, + "num_input_tokens_seen": 200278035, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 9301, + "time_per_iteration": 3.9131312370300293 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01026194, + "balance_loss_clip": 1.01370668, + "balance_loss_mlp": 1.03488898, + "epoch": 0.5592664963174507, + "flos": 24060400945920.0, + "grad_norm": 2.564037719481304, + "language_loss": 0.67297423, + "learning_rate": 1.714143795138756e-06, + "loss": 0.69427967, + "num_input_tokens_seen": 200297255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9302, + "time_per_iteration": 2.484609365463257 + }, + { + "auxiliary_loss_clip": 0.011096, + "auxiliary_loss_mlp": 0.01024968, + "balance_loss_clip": 1.01249897, + "balance_loss_mlp": 1.03721702, + "epoch": 0.5593266195701188, + "flos": 19827897661440.0, + "grad_norm": 2.803806869845176, + "language_loss": 0.70999819, + "learning_rate": 1.713758337453878e-06, + "loss": 0.73134387, + "num_input_tokens_seen": 200317505, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9303, + "time_per_iteration": 2.442859649658203 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.0102866, + "balance_loss_clip": 1.01791978, + "balance_loss_mlp": 1.03930676, + "epoch": 0.5593867428227867, + "flos": 25300755440640.0, + "grad_norm": 1.540239070281283, + "language_loss": 0.72772652, + "learning_rate": 1.7133728906233124e-06, + "loss": 0.74906886, + "num_input_tokens_seen": 200338350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 9304, + "time_per_iteration": 5.429321765899658 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01613426, + "balance_loss_mlp": 1.03523278, + "epoch": 0.5594468660754547, + "flos": 12933013374720.0, + "grad_norm": 1.8535856395803625, + "language_loss": 0.77888674, + "learning_rate": 1.7129874546616763e-06, + "loss": 0.80021197, + "num_input_tokens_seen": 200353965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9305, + "time_per_iteration": 3.8705790042877197 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.01390815, + "balance_loss_mlp": 1.03657615, + "epoch": 0.5595069893281227, + "flos": 19062713208960.0, + "grad_norm": 1.7045399129758072, + "language_loss": 0.69334519, + "learning_rate": 1.7126020295835836e-06, + "loss": 0.7146163, + "num_input_tokens_seen": 200373595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 9306, + "time_per_iteration": 2.4669442176818848 + }, + { + "auxiliary_loss_clip": 0.01030152, + "auxiliary_loss_mlp": 0.01003605, + "balance_loss_clip": 1.0025028, + "balance_loss_mlp": 1.00838459, + "epoch": 0.5595671125807906, + "flos": 70273375862400.0, + "grad_norm": 0.9104128938879268, + "language_loss": 0.60324359, + "learning_rate": 1.7122166154036518e-06, + "loss": 0.62358117, + "num_input_tokens_seen": 200429155, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9307, + "time_per_iteration": 3.167161703109741 + }, + { + "auxiliary_loss_clip": 0.01105033, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01972127, + "balance_loss_mlp": 1.03697395, + "epoch": 0.5596272358334586, + "flos": 20665513889280.0, + "grad_norm": 1.9188877301503315, + "language_loss": 0.73981357, + "learning_rate": 1.7118312121364943e-06, + "loss": 0.76117194, + "num_input_tokens_seen": 200448290, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 9308, + "time_per_iteration": 2.544931650161743 + }, + { + "auxiliary_loss_clip": 0.01107282, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01833069, + "balance_loss_mlp": 1.03571653, + "epoch": 0.5596873590861265, + "flos": 25041013217280.0, + "grad_norm": 1.8987333438245737, + "language_loss": 0.69393057, + "learning_rate": 1.7114458197967257e-06, + "loss": 0.71531588, + "num_input_tokens_seen": 200466555, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9309, + "time_per_iteration": 2.5008022785186768 + }, + { + "auxiliary_loss_clip": 0.01108803, + "auxiliary_loss_mlp": 0.01031964, + "balance_loss_clip": 1.01787376, + "balance_loss_mlp": 1.03872681, + "epoch": 0.5597474823387946, + "flos": 25958387594880.0, + "grad_norm": 2.0715816525821458, + "language_loss": 0.75254035, + "learning_rate": 1.7110604383989613e-06, + "loss": 0.77394807, + "num_input_tokens_seen": 200485980, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.703125, + "step": 9310, + "time_per_iteration": 2.5096590518951416 + }, + { + "auxiliary_loss_clip": 0.01111521, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.01953197, + "balance_loss_mlp": 1.03922331, + "epoch": 0.5598076055914625, + "flos": 26177442687360.0, + "grad_norm": 4.006602699764322, + "language_loss": 0.69449794, + "learning_rate": 1.7106750679578133e-06, + "loss": 0.71593851, + "num_input_tokens_seen": 200504555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9311, + "time_per_iteration": 2.5238418579101562 + }, + { + "auxiliary_loss_clip": 0.01103209, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.01616526, + "balance_loss_mlp": 1.03474474, + "epoch": 0.5598677288441305, + "flos": 11655778590720.0, + "grad_norm": 1.8631623558730779, + "language_loss": 0.72497612, + "learning_rate": 1.7102897084878962e-06, + "loss": 0.74628901, + "num_input_tokens_seen": 200522700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9312, + "time_per_iteration": 2.4980969429016113 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01871908, + "balance_loss_mlp": 1.03834271, + "epoch": 0.5599278520967984, + "flos": 22966597941120.0, + "grad_norm": 1.9916809517025356, + "language_loss": 0.89106059, + "learning_rate": 1.709904360003822e-06, + "loss": 0.91243219, + "num_input_tokens_seen": 200541910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9313, + "time_per_iteration": 2.43849515914917 + }, + { + "auxiliary_loss_clip": 0.01107396, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.03886163, + "epoch": 0.5599879753494664, + "flos": 21215557831680.0, + "grad_norm": 1.848557040479868, + "language_loss": 0.77809632, + "learning_rate": 1.709519022520204e-06, + "loss": 0.79951894, + "num_input_tokens_seen": 200562600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 9314, + "time_per_iteration": 2.4745004177093506 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01027591, + "balance_loss_clip": 1.0153954, + "balance_loss_mlp": 1.03497362, + "epoch": 0.5600480986021343, + "flos": 31903219105920.0, + "grad_norm": 1.6135281246099127, + "language_loss": 0.7005592, + "learning_rate": 1.7091336960516537e-06, + "loss": 0.72187185, + "num_input_tokens_seen": 200584795, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9315, + "time_per_iteration": 2.523815631866455 + }, + { + "auxiliary_loss_clip": 0.0110827, + "auxiliary_loss_mlp": 0.01034837, + "balance_loss_clip": 1.02225423, + "balance_loss_mlp": 1.03666615, + "epoch": 0.5601082218548024, + "flos": 28476048700800.0, + "grad_norm": 2.163442884097896, + "language_loss": 0.66467899, + "learning_rate": 1.7087483806127824e-06, + "loss": 0.68611002, + "num_input_tokens_seen": 200606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9316, + "time_per_iteration": 2.530667304992676 + }, + { + "auxiliary_loss_clip": 0.01106878, + "auxiliary_loss_mlp": 0.01031394, + "balance_loss_clip": 1.01796496, + "balance_loss_mlp": 1.03770351, + "epoch": 0.5601683451074703, + "flos": 24097173494400.0, + "grad_norm": 2.3805446029838624, + "language_loss": 0.86762506, + "learning_rate": 1.7083630762182022e-06, + "loss": 0.88900781, + "num_input_tokens_seen": 200626340, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69140625, + "step": 9317, + "time_per_iteration": 2.469134569168091 + }, + { + "auxiliary_loss_clip": 0.01108894, + "auxiliary_loss_mlp": 0.01035341, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03657329, + "epoch": 0.5602284683601383, + "flos": 26356205698560.0, + "grad_norm": 1.7151693589962669, + "language_loss": 0.77363193, + "learning_rate": 1.7079777828825233e-06, + "loss": 0.79507434, + "num_input_tokens_seen": 200644520, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9318, + "time_per_iteration": 2.4952752590179443 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01035081, + "balance_loss_clip": 1.02351773, + "balance_loss_mlp": 1.03302336, + "epoch": 0.5602885916128063, + "flos": 24496392228480.0, + "grad_norm": 1.698102214619228, + "language_loss": 0.75956237, + "learning_rate": 1.7075925006203558e-06, + "loss": 0.7809301, + "num_input_tokens_seen": 200664845, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9319, + "time_per_iteration": 2.479919910430908 + }, + { + "auxiliary_loss_clip": 0.01104648, + "auxiliary_loss_mlp": 0.01034126, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03689611, + "epoch": 0.5603487148654742, + "flos": 27345006270720.0, + "grad_norm": 1.554434910389292, + "language_loss": 0.85508537, + "learning_rate": 1.7072072294463101e-06, + "loss": 0.87647313, + "num_input_tokens_seen": 200686535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9320, + "time_per_iteration": 2.511880874633789 + }, + { + "auxiliary_loss_clip": 0.01030962, + "auxiliary_loss_mlp": 0.00999706, + "balance_loss_clip": 0.99860352, + "balance_loss_mlp": 1.00918674, + "epoch": 0.5604088381181422, + "flos": 54087756180480.0, + "grad_norm": 0.7458732992694707, + "language_loss": 0.52630556, + "learning_rate": 1.706821969374996e-06, + "loss": 0.54661226, + "num_input_tokens_seen": 200736965, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.21777344, + "step": 9321, + "time_per_iteration": 2.8576598167419434 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.01031081, + "balance_loss_clip": 1.01938033, + "balance_loss_mlp": 1.03744757, + "epoch": 0.5604689613708101, + "flos": 22236390357120.0, + "grad_norm": 1.4865751697326912, + "language_loss": 0.74422431, + "learning_rate": 1.7064367204210216e-06, + "loss": 0.76557928, + "num_input_tokens_seen": 200757420, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9322, + "time_per_iteration": 2.480198383331299 + }, + { + "auxiliary_loss_clip": 0.01105934, + "auxiliary_loss_mlp": 0.01033008, + "balance_loss_clip": 1.01982379, + "balance_loss_mlp": 1.03641856, + "epoch": 0.5605290846234782, + "flos": 35297782940160.0, + "grad_norm": 1.8343710411867171, + "language_loss": 0.73661906, + "learning_rate": 1.7060514825989963e-06, + "loss": 0.75800848, + "num_input_tokens_seen": 200779520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9323, + "time_per_iteration": 2.5517938137054443 + }, + { + "auxiliary_loss_clip": 0.01109096, + "auxiliary_loss_mlp": 0.01026708, + "balance_loss_clip": 1.01386333, + "balance_loss_mlp": 1.03797293, + "epoch": 0.5605892078761461, + "flos": 20263314326400.0, + "grad_norm": 1.5108510359489868, + "language_loss": 0.61287946, + "learning_rate": 1.7056662559235286e-06, + "loss": 0.63423753, + "num_input_tokens_seen": 200799485, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9324, + "time_per_iteration": 2.4675137996673584 + }, + { + "auxiliary_loss_clip": 0.01106981, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.01650345, + "balance_loss_mlp": 1.03693414, + "epoch": 0.5606493311288141, + "flos": 17308333134720.0, + "grad_norm": 2.2169286979326768, + "language_loss": 0.87785721, + "learning_rate": 1.705281040409226e-06, + "loss": 0.89921808, + "num_input_tokens_seen": 200817540, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9325, + "time_per_iteration": 2.4160819053649902 + }, + { + "auxiliary_loss_clip": 0.01108623, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01805806, + "balance_loss_mlp": 1.03765607, + "epoch": 0.560709454381482, + "flos": 21652985658240.0, + "grad_norm": 1.6383695475184654, + "language_loss": 0.74048722, + "learning_rate": 1.7048958360706952e-06, + "loss": 0.76188105, + "num_input_tokens_seen": 200838380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9326, + "time_per_iteration": 2.463094711303711 + }, + { + "auxiliary_loss_clip": 0.01112046, + "auxiliary_loss_mlp": 0.01029376, + "balance_loss_clip": 1.01620328, + "balance_loss_mlp": 1.0386548, + "epoch": 0.56076957763415, + "flos": 20303355012480.0, + "grad_norm": 3.3443611641012674, + "language_loss": 0.78365433, + "learning_rate": 1.7045106429225447e-06, + "loss": 0.80506855, + "num_input_tokens_seen": 200855640, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9327, + "time_per_iteration": 2.445756673812866 + }, + { + "auxiliary_loss_clip": 0.01108683, + "auxiliary_loss_mlp": 0.01031541, + "balance_loss_clip": 1.01842213, + "balance_loss_mlp": 1.03914046, + "epoch": 0.5608297008868179, + "flos": 25045897466880.0, + "grad_norm": 2.5559440694427478, + "language_loss": 0.78508025, + "learning_rate": 1.7041254609793795e-06, + "loss": 0.80648255, + "num_input_tokens_seen": 200876585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 9328, + "time_per_iteration": 2.5156970024108887 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01594675, + "balance_loss_mlp": 1.03623605, + "epoch": 0.560889824139486, + "flos": 19866825025920.0, + "grad_norm": 1.528557811702872, + "language_loss": 0.73765361, + "learning_rate": 1.7037402902558066e-06, + "loss": 0.7589978, + "num_input_tokens_seen": 200898175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9329, + "time_per_iteration": 2.4843335151672363 + }, + { + "auxiliary_loss_clip": 0.01110712, + "auxiliary_loss_mlp": 0.01036618, + "balance_loss_clip": 1.02325511, + "balance_loss_mlp": 1.03798938, + "epoch": 0.5609499473921539, + "flos": 22929394429440.0, + "grad_norm": 1.6466003553704387, + "language_loss": 0.83545572, + "learning_rate": 1.7033551307664324e-06, + "loss": 0.85692906, + "num_input_tokens_seen": 200917515, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7265625, + "step": 9330, + "time_per_iteration": 2.482752561569214 + }, + { + "auxiliary_loss_clip": 0.01031116, + "auxiliary_loss_mlp": 0.01002487, + "balance_loss_clip": 1.00147378, + "balance_loss_mlp": 1.0092634, + "epoch": 0.5610100706448219, + "flos": 53035825455360.0, + "grad_norm": 0.7161961657295335, + "language_loss": 0.57873559, + "learning_rate": 1.7029699825258603e-06, + "loss": 0.59907156, + "num_input_tokens_seen": 200978615, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21875, + "step": 9331, + "time_per_iteration": 3.063901662826538 + }, + { + "auxiliary_loss_clip": 0.01108686, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03850377, + "epoch": 0.5610701938974898, + "flos": 21834944979840.0, + "grad_norm": 1.694841283599879, + "language_loss": 0.82141155, + "learning_rate": 1.7025848455486971e-06, + "loss": 0.84282017, + "num_input_tokens_seen": 200997745, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 9332, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01113328, + "auxiliary_loss_mlp": 0.01036451, + "balance_loss_clip": 1.02233052, + "balance_loss_mlp": 1.03915834, + "epoch": 0.5611303171501578, + "flos": 17457183095040.0, + "grad_norm": 1.7394490434662164, + "language_loss": 0.8172127, + "learning_rate": 1.7021997198495454e-06, + "loss": 0.83871055, + "num_input_tokens_seen": 201016370, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9333, + "time_per_iteration": 2.4251558780670166 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01027346, + "balance_loss_clip": 1.01541877, + "balance_loss_mlp": 1.03641915, + "epoch": 0.5611904404028258, + "flos": 22637799820800.0, + "grad_norm": 1.5456564302164297, + "language_loss": 0.73111224, + "learning_rate": 1.7018146054430108e-06, + "loss": 0.7524507, + "num_input_tokens_seen": 201034310, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9334, + "time_per_iteration": 2.5241355895996094 + }, + { + "auxiliary_loss_clip": 0.01108924, + "auxiliary_loss_mlp": 0.01037845, + "balance_loss_clip": 1.02525675, + "balance_loss_mlp": 1.03886223, + "epoch": 0.5612505636554938, + "flos": 14316327999360.0, + "grad_norm": 1.7664531017043277, + "language_loss": 0.71317977, + "learning_rate": 1.7014295023436961e-06, + "loss": 0.73464751, + "num_input_tokens_seen": 201052030, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9335, + "time_per_iteration": 2.4215545654296875 + }, + { + "auxiliary_loss_clip": 0.01109063, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01659274, + "balance_loss_mlp": 1.0381881, + "epoch": 0.5613106869081618, + "flos": 16508279554560.0, + "grad_norm": 1.7059405915097856, + "language_loss": 0.76673937, + "learning_rate": 1.701044410566205e-06, + "loss": 0.78812212, + "num_input_tokens_seen": 201068445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9336, + "time_per_iteration": 2.456911087036133 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032175, + "balance_loss_clip": 1.0203793, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5613708101608297, + "flos": 24058569352320.0, + "grad_norm": 2.253598480453168, + "language_loss": 0.644315, + "learning_rate": 1.7006593301251393e-06, + "loss": 0.66570294, + "num_input_tokens_seen": 201082140, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 9337, + "time_per_iteration": 2.4435572624206543 + }, + { + "auxiliary_loss_clip": 0.01029918, + "auxiliary_loss_mlp": 0.01004848, + "balance_loss_clip": 1.00367343, + "balance_loss_mlp": 1.00804543, + "epoch": 0.5614309334134977, + "flos": 64905735997440.0, + "grad_norm": 0.9905116764848269, + "language_loss": 0.62572861, + "learning_rate": 1.700274261035102e-06, + "loss": 0.64607626, + "num_input_tokens_seen": 201137245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21875, + "step": 9338, + "time_per_iteration": 3.039401054382324 + }, + { + "auxiliary_loss_clip": 0.01110236, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01862049, + "balance_loss_mlp": 1.03832674, + "epoch": 0.5614910566661656, + "flos": 32919849740160.0, + "grad_norm": 1.7660421922814409, + "language_loss": 0.65246809, + "learning_rate": 1.6998892033106946e-06, + "loss": 0.67388076, + "num_input_tokens_seen": 201157270, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9339, + "time_per_iteration": 2.5356857776641846 + }, + { + "auxiliary_loss_clip": 0.01106617, + "auxiliary_loss_mlp": 0.01032872, + "balance_loss_clip": 1.0203191, + "balance_loss_mlp": 1.03761101, + "epoch": 0.5615511799188336, + "flos": 18588871969920.0, + "grad_norm": 3.5768294087083317, + "language_loss": 0.69863123, + "learning_rate": 1.6995041569665184e-06, + "loss": 0.72002614, + "num_input_tokens_seen": 201174530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9340, + "time_per_iteration": 2.4699902534484863 + }, + { + "auxiliary_loss_clip": 0.01105107, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01596177, + "balance_loss_mlp": 1.03900409, + "epoch": 0.5616113031715015, + "flos": 22820010537600.0, + "grad_norm": 1.8075752300654697, + "language_loss": 0.77621818, + "learning_rate": 1.6991191220171756e-06, + "loss": 0.7975471, + "num_input_tokens_seen": 201194905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9341, + "time_per_iteration": 2.456268072128296 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01030586, + "balance_loss_clip": 1.01759195, + "balance_loss_mlp": 1.03572893, + "epoch": 0.5616714264241696, + "flos": 22345702421760.0, + "grad_norm": 1.9728763199974049, + "language_loss": 0.79315615, + "learning_rate": 1.6987340984772653e-06, + "loss": 0.81452906, + "num_input_tokens_seen": 201213715, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9342, + "time_per_iteration": 2.4534597396850586 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01735401, + "balance_loss_mlp": 1.03851485, + "epoch": 0.5617315496768375, + "flos": 18807783408000.0, + "grad_norm": 2.593835689079262, + "language_loss": 0.76322573, + "learning_rate": 1.6983490863613882e-06, + "loss": 0.78464609, + "num_input_tokens_seen": 201231415, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 9343, + "time_per_iteration": 3.8814024925231934 + }, + { + "auxiliary_loss_clip": 0.01109994, + "auxiliary_loss_mlp": 0.01037634, + "balance_loss_clip": 1.0245204, + "balance_loss_mlp": 1.03978682, + "epoch": 0.5617916729295055, + "flos": 18369314087040.0, + "grad_norm": 1.5945215839270617, + "language_loss": 0.68185151, + "learning_rate": 1.6979640856841442e-06, + "loss": 0.70332778, + "num_input_tokens_seen": 201249625, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9344, + "time_per_iteration": 2.4659440517425537 + }, + { + "auxiliary_loss_clip": 0.01109593, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.02364254, + "balance_loss_mlp": 1.0381155, + "epoch": 0.5618517961821734, + "flos": 28179964892160.0, + "grad_norm": 2.2863999357797202, + "language_loss": 0.66754413, + "learning_rate": 1.6975790964601318e-06, + "loss": 0.68900704, + "num_input_tokens_seen": 201271205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9345, + "time_per_iteration": 2.5232093334198 + }, + { + "auxiliary_loss_clip": 0.01109525, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01317143, + "balance_loss_mlp": 1.03883803, + "epoch": 0.5619119194348414, + "flos": 15486872411520.0, + "grad_norm": 1.8616054032141576, + "language_loss": 0.87347126, + "learning_rate": 1.6971941187039512e-06, + "loss": 0.89481902, + "num_input_tokens_seen": 201287700, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.70703125, + "step": 9346, + "time_per_iteration": 3.9651877880096436 + }, + { + "auxiliary_loss_clip": 0.0110623, + "auxiliary_loss_mlp": 0.01035149, + "balance_loss_clip": 1.02200019, + "balance_loss_mlp": 1.03657687, + "epoch": 0.5619720426875094, + "flos": 29128652951040.0, + "grad_norm": 2.36966351637476, + "language_loss": 0.59370089, + "learning_rate": 1.6968091524301993e-06, + "loss": 0.61511469, + "num_input_tokens_seen": 201307530, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9347, + "time_per_iteration": 3.9802420139312744 + }, + { + "auxiliary_loss_clip": 0.01108812, + "auxiliary_loss_mlp": 0.01037423, + "balance_loss_clip": 1.02319539, + "balance_loss_mlp": 1.03742838, + "epoch": 0.5620321659401774, + "flos": 18003743418240.0, + "grad_norm": 2.4273405009541107, + "language_loss": 0.68972194, + "learning_rate": 1.6964241976534745e-06, + "loss": 0.71118426, + "num_input_tokens_seen": 201326210, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7109375, + "step": 9348, + "time_per_iteration": 2.4413368701934814 + }, + { + "auxiliary_loss_clip": 0.01111452, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.01292634, + "balance_loss_mlp": 1.03695107, + "epoch": 0.5620922891928454, + "flos": 20594518657920.0, + "grad_norm": 1.9093659081457641, + "language_loss": 0.79040921, + "learning_rate": 1.6960392543883754e-06, + "loss": 0.81179428, + "num_input_tokens_seen": 201346120, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9349, + "time_per_iteration": 2.4354894161224365 + }, + { + "auxiliary_loss_clip": 0.01110239, + "auxiliary_loss_mlp": 0.01028142, + "balance_loss_clip": 1.01527977, + "balance_loss_mlp": 1.03902698, + "epoch": 0.5621524124455133, + "flos": 26287006147200.0, + "grad_norm": 2.4504118343525207, + "language_loss": 0.67282045, + "learning_rate": 1.6956543226494975e-06, + "loss": 0.69420421, + "num_input_tokens_seen": 201365700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9350, + "time_per_iteration": 2.548351287841797 + }, + { + "auxiliary_loss_clip": 0.01110364, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02408016, + "balance_loss_mlp": 1.03830576, + "epoch": 0.5622125356981813, + "flos": 12750299867520.0, + "grad_norm": 2.1113714103165884, + "language_loss": 0.78716242, + "learning_rate": 1.6952694024514381e-06, + "loss": 0.80863774, + "num_input_tokens_seen": 201382795, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9351, + "time_per_iteration": 2.4350974559783936 + }, + { + "auxiliary_loss_clip": 0.01112089, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01989186, + "balance_loss_mlp": 1.03818786, + "epoch": 0.5622726589508492, + "flos": 23805327490560.0, + "grad_norm": 1.498970106789848, + "language_loss": 0.58875829, + "learning_rate": 1.6948844938087945e-06, + "loss": 0.6102035, + "num_input_tokens_seen": 201402780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73828125, + "step": 9352, + "time_per_iteration": 2.4637343883514404 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01988828, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5623327822035172, + "flos": 24718212668160.0, + "grad_norm": 1.2149782460758531, + "language_loss": 0.71828997, + "learning_rate": 1.6944995967361604e-06, + "loss": 0.73964, + "num_input_tokens_seen": 201424140, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 9353, + "time_per_iteration": 2.4747259616851807 + }, + { + "auxiliary_loss_clip": 0.01110024, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01584542, + "balance_loss_mlp": 1.03763878, + "epoch": 0.5623929054561851, + "flos": 14019274523520.0, + "grad_norm": 5.092816610198626, + "language_loss": 0.75717902, + "learning_rate": 1.6941147112481327e-06, + "loss": 0.77856535, + "num_input_tokens_seen": 201439645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9354, + "time_per_iteration": 2.412938356399536 + }, + { + "auxiliary_loss_clip": 0.01111475, + "auxiliary_loss_mlp": 0.01033305, + "balance_loss_clip": 1.02066851, + "balance_loss_mlp": 1.03783214, + "epoch": 0.5624530287088532, + "flos": 20704405340160.0, + "grad_norm": 2.4650169046981434, + "language_loss": 0.72549778, + "learning_rate": 1.6937298373593056e-06, + "loss": 0.74694556, + "num_input_tokens_seen": 201459970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.734375, + "step": 9355, + "time_per_iteration": 2.484099864959717 + }, + { + "auxiliary_loss_clip": 0.01108801, + "auxiliary_loss_mlp": 0.01030058, + "balance_loss_clip": 1.01700521, + "balance_loss_mlp": 1.03818929, + "epoch": 0.5625131519615211, + "flos": 21470918595840.0, + "grad_norm": 1.8617046290731056, + "language_loss": 0.73371327, + "learning_rate": 1.693344975084274e-06, + "loss": 0.75510186, + "num_input_tokens_seen": 201480055, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9356, + "time_per_iteration": 2.465129852294922 + }, + { + "auxiliary_loss_clip": 0.0110695, + "auxiliary_loss_mlp": 0.01035155, + "balance_loss_clip": 1.02280545, + "balance_loss_mlp": 1.03822494, + "epoch": 0.5625732752141891, + "flos": 18698004466560.0, + "grad_norm": 1.9991704999969526, + "language_loss": 0.82985485, + "learning_rate": 1.6929601244376318e-06, + "loss": 0.85127592, + "num_input_tokens_seen": 201497645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9357, + "time_per_iteration": 2.41115665435791 + }, + { + "auxiliary_loss_clip": 0.01107392, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01797318, + "balance_loss_mlp": 1.03697777, + "epoch": 0.562633398466857, + "flos": 16216900427520.0, + "grad_norm": 1.9946457873090748, + "language_loss": 0.720213, + "learning_rate": 1.6925752854339722e-06, + "loss": 0.74158716, + "num_input_tokens_seen": 201515455, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9358, + "time_per_iteration": 2.4276978969573975 + }, + { + "auxiliary_loss_clip": 0.01105818, + "auxiliary_loss_mlp": 0.0103922, + "balance_loss_clip": 1.0264169, + "balance_loss_mlp": 1.03677905, + "epoch": 0.562693521719525, + "flos": 22491930689280.0, + "grad_norm": 2.1174896987661755, + "language_loss": 0.77650487, + "learning_rate": 1.6921904580878885e-06, + "loss": 0.79795527, + "num_input_tokens_seen": 201534500, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9359, + "time_per_iteration": 2.5595555305480957 + }, + { + "auxiliary_loss_clip": 0.01108374, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.0177722, + "balance_loss_mlp": 1.03723145, + "epoch": 0.562753644972193, + "flos": 25331171281920.0, + "grad_norm": 1.6788321894876823, + "language_loss": 0.70193481, + "learning_rate": 1.6918056424139736e-06, + "loss": 0.7233184, + "num_input_tokens_seen": 201553280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9360, + "time_per_iteration": 2.485053062438965 + }, + { + "auxiliary_loss_clip": 0.01030911, + "auxiliary_loss_mlp": 0.01001933, + "balance_loss_clip": 1.0007472, + "balance_loss_mlp": 1.00916827, + "epoch": 0.562813768224861, + "flos": 67392622126080.0, + "grad_norm": 0.7762895856423075, + "language_loss": 0.55579072, + "learning_rate": 1.6914208384268197e-06, + "loss": 0.57611912, + "num_input_tokens_seen": 201610030, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21679688, + "step": 9361, + "time_per_iteration": 3.025913953781128 + }, + { + "auxiliary_loss_clip": 0.01105882, + "auxiliary_loss_mlp": 0.01033189, + "balance_loss_clip": 1.02153039, + "balance_loss_mlp": 1.03833425, + "epoch": 0.562873891477529, + "flos": 23331163029120.0, + "grad_norm": 1.3888397041491727, + "language_loss": 0.8183462, + "learning_rate": 1.691036046141018e-06, + "loss": 0.83973688, + "num_input_tokens_seen": 201628370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9362, + "time_per_iteration": 2.5037269592285156 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.0103495, + "balance_loss_clip": 1.02248108, + "balance_loss_mlp": 1.03707612, + "epoch": 0.5629340147301969, + "flos": 38472824805120.0, + "grad_norm": 1.5280416781125297, + "language_loss": 0.74536633, + "learning_rate": 1.6906512655711614e-06, + "loss": 0.7667737, + "num_input_tokens_seen": 201649790, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9363, + "time_per_iteration": 2.617192268371582 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032322, + "balance_loss_clip": 1.01944757, + "balance_loss_mlp": 1.03815794, + "epoch": 0.5629941379828649, + "flos": 29242023252480.0, + "grad_norm": 1.6569550766143035, + "language_loss": 0.83350259, + "learning_rate": 1.690266496731839e-06, + "loss": 0.85492432, + "num_input_tokens_seen": 201669175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9364, + "time_per_iteration": 2.5304059982299805 + }, + { + "auxiliary_loss_clip": 0.01107314, + "auxiliary_loss_mlp": 0.01034102, + "balance_loss_clip": 1.0222224, + "balance_loss_mlp": 1.03869832, + "epoch": 0.5630542612355328, + "flos": 19420885676160.0, + "grad_norm": 2.211298310091642, + "language_loss": 0.64659059, + "learning_rate": 1.689881739637642e-06, + "loss": 0.66800475, + "num_input_tokens_seen": 201687000, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9365, + "time_per_iteration": 2.4514007568359375 + }, + { + "auxiliary_loss_clip": 0.01114055, + "auxiliary_loss_mlp": 0.01036393, + "balance_loss_clip": 1.02264261, + "balance_loss_mlp": 1.03817499, + "epoch": 0.5631143844882008, + "flos": 22266303408000.0, + "grad_norm": 3.047674915648226, + "language_loss": 0.81461316, + "learning_rate": 1.6894969943031611e-06, + "loss": 0.83611768, + "num_input_tokens_seen": 201703335, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7578125, + "step": 9366, + "time_per_iteration": 2.4486207962036133 + }, + { + "auxiliary_loss_clip": 0.01107023, + "auxiliary_loss_mlp": 0.01032651, + "balance_loss_clip": 1.02089667, + "balance_loss_mlp": 1.03850698, + "epoch": 0.5631745077408687, + "flos": 22965305051520.0, + "grad_norm": 1.4263654905382444, + "language_loss": 0.73047578, + "learning_rate": 1.6891122607429845e-06, + "loss": 0.75187254, + "num_input_tokens_seen": 201723495, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 9367, + "time_per_iteration": 2.4800310134887695 + }, + { + "auxiliary_loss_clip": 0.01030227, + "auxiliary_loss_mlp": 0.01002627, + "balance_loss_clip": 1.00138175, + "balance_loss_mlp": 1.00840044, + "epoch": 0.5632346309935368, + "flos": 65080515576960.0, + "grad_norm": 0.6249011108272925, + "language_loss": 0.5348472, + "learning_rate": 1.6887275389717028e-06, + "loss": 0.55517572, + "num_input_tokens_seen": 201792615, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21875, + "step": 9368, + "time_per_iteration": 3.1797282695770264 + }, + { + "auxiliary_loss_clip": 0.01108664, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02317202, + "balance_loss_mlp": 1.03974152, + "epoch": 0.5632947542462047, + "flos": 23002903612800.0, + "grad_norm": 1.7643271699947485, + "language_loss": 0.69015235, + "learning_rate": 1.6883428290039046e-06, + "loss": 0.71159542, + "num_input_tokens_seen": 201812520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9369, + "time_per_iteration": 2.4736390113830566 + }, + { + "auxiliary_loss_clip": 0.01105862, + "auxiliary_loss_mlp": 0.01033878, + "balance_loss_clip": 1.02091432, + "balance_loss_mlp": 1.03527367, + "epoch": 0.5633548774988727, + "flos": 30482593228800.0, + "grad_norm": 1.7859826045223857, + "language_loss": 0.7540313, + "learning_rate": 1.6879581308541763e-06, + "loss": 0.77542865, + "num_input_tokens_seen": 201834185, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9370, + "time_per_iteration": 2.5553858280181885 + }, + { + "auxiliary_loss_clip": 0.01109895, + "auxiliary_loss_mlp": 0.01033197, + "balance_loss_clip": 1.01930332, + "balance_loss_mlp": 1.0373863, + "epoch": 0.5634150007515406, + "flos": 18515039564160.0, + "grad_norm": 3.078957924920332, + "language_loss": 0.75699127, + "learning_rate": 1.687573444537108e-06, + "loss": 0.77842218, + "num_input_tokens_seen": 201851305, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9371, + "time_per_iteration": 2.4327011108398438 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.01035962, + "balance_loss_clip": 1.02386189, + "balance_loss_mlp": 1.03729022, + "epoch": 0.5634751240042086, + "flos": 19244672530560.0, + "grad_norm": 2.3308389897051702, + "language_loss": 0.76292467, + "learning_rate": 1.687188770067285e-06, + "loss": 0.7843473, + "num_input_tokens_seen": 201870350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 9372, + "time_per_iteration": 2.447720766067505 + }, + { + "auxiliary_loss_clip": 0.01106021, + "auxiliary_loss_mlp": 0.01032443, + "balance_loss_clip": 1.02006888, + "balance_loss_mlp": 1.03829265, + "epoch": 0.5635352472568766, + "flos": 12020630987520.0, + "grad_norm": 2.0572116747420224, + "language_loss": 0.72010261, + "learning_rate": 1.6868041074592956e-06, + "loss": 0.74148726, + "num_input_tokens_seen": 201886800, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 9373, + "time_per_iteration": 2.4268109798431396 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01839924, + "balance_loss_mlp": 1.03994441, + "epoch": 0.5635953705095446, + "flos": 21871645701120.0, + "grad_norm": 2.3770492627250617, + "language_loss": 0.82499874, + "learning_rate": 1.6864194567277264e-06, + "loss": 0.84642255, + "num_input_tokens_seen": 201904730, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9374, + "time_per_iteration": 2.49582576751709 + }, + { + "auxiliary_loss_clip": 0.0110343, + "auxiliary_loss_mlp": 0.01026872, + "balance_loss_clip": 1.0145762, + "balance_loss_mlp": 1.03463507, + "epoch": 0.5636554937622126, + "flos": 27126166659840.0, + "grad_norm": 1.5156995265370945, + "language_loss": 0.66020733, + "learning_rate": 1.6860348178871618e-06, + "loss": 0.68151033, + "num_input_tokens_seen": 201924850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9375, + "time_per_iteration": 2.516523599624634 + }, + { + "auxiliary_loss_clip": 0.01109185, + "auxiliary_loss_mlp": 0.01036661, + "balance_loss_clip": 1.02434063, + "balance_loss_mlp": 1.03792977, + "epoch": 0.5637156170148805, + "flos": 12926405272320.0, + "grad_norm": 5.168267369431286, + "language_loss": 0.80860347, + "learning_rate": 1.6856501909521889e-06, + "loss": 0.83006191, + "num_input_tokens_seen": 201939500, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 9376, + "time_per_iteration": 2.4961087703704834 + }, + { + "auxiliary_loss_clip": 0.01110113, + "auxiliary_loss_mlp": 0.01033851, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03650188, + "epoch": 0.5637757402675485, + "flos": 45551033130240.0, + "grad_norm": 1.331404975713729, + "language_loss": 0.69354665, + "learning_rate": 1.6852655759373925e-06, + "loss": 0.71498632, + "num_input_tokens_seen": 201963000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9377, + "time_per_iteration": 2.6732125282287598 + }, + { + "auxiliary_loss_clip": 0.01104228, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01828349, + "balance_loss_mlp": 1.03818166, + "epoch": 0.5638358635202164, + "flos": 20886041439360.0, + "grad_norm": 1.3430474289029712, + "language_loss": 0.74622703, + "learning_rate": 1.6848809728573565e-06, + "loss": 0.76757109, + "num_input_tokens_seen": 201983145, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 9378, + "time_per_iteration": 2.4836812019348145 + }, + { + "auxiliary_loss_clip": 0.01111215, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03538918, + "epoch": 0.5638959867728844, + "flos": 18806562345600.0, + "grad_norm": 2.4002466182561366, + "language_loss": 0.81976169, + "learning_rate": 1.6844963817266656e-06, + "loss": 0.84122968, + "num_input_tokens_seen": 202000335, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9379, + "time_per_iteration": 2.4185829162597656 + }, + { + "auxiliary_loss_clip": 0.01106862, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.01691699, + "balance_loss_mlp": 1.03549135, + "epoch": 0.5639561100255523, + "flos": 27490336698240.0, + "grad_norm": 2.697413775835763, + "language_loss": 0.71534967, + "learning_rate": 1.6841118025599042e-06, + "loss": 0.73671436, + "num_input_tokens_seen": 202018275, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9380, + "time_per_iteration": 2.5077950954437256 + }, + { + "auxiliary_loss_clip": 0.01110271, + "auxiliary_loss_mlp": 0.01034366, + "balance_loss_clip": 1.02068686, + "balance_loss_mlp": 1.03794408, + "epoch": 0.5640162332782204, + "flos": 18076570243200.0, + "grad_norm": 3.2105212283898905, + "language_loss": 0.74216485, + "learning_rate": 1.6837272353716542e-06, + "loss": 0.7636112, + "num_input_tokens_seen": 202034330, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9381, + "time_per_iteration": 2.4029319286346436 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01963091, + "balance_loss_mlp": 1.03806376, + "epoch": 0.5640763565308883, + "flos": 20884856290560.0, + "grad_norm": 3.316310717009383, + "language_loss": 0.72300208, + "learning_rate": 1.683342680176499e-06, + "loss": 0.7444247, + "num_input_tokens_seen": 202053100, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.72265625, + "step": 9382, + "time_per_iteration": 2.501958131790161 + }, + { + "auxiliary_loss_clip": 0.01028829, + "auxiliary_loss_mlp": 0.00999503, + "balance_loss_clip": 0.99848998, + "balance_loss_mlp": 1.00756264, + "epoch": 0.5641364797835563, + "flos": 64447912224000.0, + "grad_norm": 0.7363360341332579, + "language_loss": 0.54461426, + "learning_rate": 1.682958136989022e-06, + "loss": 0.5648976, + "num_input_tokens_seen": 202120125, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.21289062, + "step": 9383, + "time_per_iteration": 3.2148938179016113 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01627028, + "balance_loss_mlp": 1.03699017, + "epoch": 0.5641966030362242, + "flos": 18660944609280.0, + "grad_norm": 1.8140556963544339, + "language_loss": 0.71018171, + "learning_rate": 1.6825736058238033e-06, + "loss": 0.73159087, + "num_input_tokens_seen": 202138030, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.73828125, + "step": 9384, + "time_per_iteration": 2.442484140396118 + }, + { + "auxiliary_loss_clip": 0.0110745, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.01751578, + "balance_loss_mlp": 1.03652072, + "epoch": 0.5642567262888922, + "flos": 22492325738880.0, + "grad_norm": 7.95557819766849, + "language_loss": 0.76225626, + "learning_rate": 1.6821890866954263e-06, + "loss": 0.78363794, + "num_input_tokens_seen": 202155580, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9385, + "time_per_iteration": 3.928744316101074 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02080739, + "balance_loss_mlp": 1.0359602, + "epoch": 0.5643168495415603, + "flos": 13003972692480.0, + "grad_norm": 2.157193633028955, + "language_loss": 0.82184142, + "learning_rate": 1.6818045796184703e-06, + "loss": 0.84322798, + "num_input_tokens_seen": 202170365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9386, + "time_per_iteration": 2.397623062133789 + }, + { + "auxiliary_loss_clip": 0.01112226, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.0220114, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5643769727942282, + "flos": 18588297352320.0, + "grad_norm": 2.006582014999343, + "language_loss": 0.6989364, + "learning_rate": 1.681420084607516e-06, + "loss": 0.72041589, + "num_input_tokens_seen": 202189095, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 9387, + "time_per_iteration": 5.281404733657837 + }, + { + "auxiliary_loss_clip": 0.01110413, + "auxiliary_loss_mlp": 0.01033865, + "balance_loss_clip": 1.02143192, + "balance_loss_mlp": 1.03790522, + "epoch": 0.5644370960468962, + "flos": 33806269572480.0, + "grad_norm": 1.551891117692425, + "language_loss": 0.74553275, + "learning_rate": 1.6810356016771452e-06, + "loss": 0.76697552, + "num_input_tokens_seen": 202213500, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9388, + "time_per_iteration": 4.091272830963135 + }, + { + "auxiliary_loss_clip": 0.01103254, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.01970327, + "balance_loss_mlp": 1.03551602, + "epoch": 0.5644972192995641, + "flos": 21214911386880.0, + "grad_norm": 1.6063296237871756, + "language_loss": 0.82072294, + "learning_rate": 1.6806511308419353e-06, + "loss": 0.8420645, + "num_input_tokens_seen": 202231920, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 9389, + "time_per_iteration": 2.4588046073913574 + }, + { + "auxiliary_loss_clip": 0.01110191, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.01995528, + "balance_loss_mlp": 1.03775918, + "epoch": 0.5645573425522321, + "flos": 18587722734720.0, + "grad_norm": 1.8781979731175902, + "language_loss": 0.64145517, + "learning_rate": 1.680266672116467e-06, + "loss": 0.66289902, + "num_input_tokens_seen": 202247600, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9390, + "time_per_iteration": 2.4152185916900635 + }, + { + "auxiliary_loss_clip": 0.01108689, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.01928711, + "balance_loss_mlp": 1.0396266, + "epoch": 0.5646174658049, + "flos": 18113809668480.0, + "grad_norm": 1.6485981004433565, + "language_loss": 0.91899133, + "learning_rate": 1.6798822255153192e-06, + "loss": 0.94038832, + "num_input_tokens_seen": 202265350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9391, + "time_per_iteration": 2.4316937923431396 + }, + { + "auxiliary_loss_clip": 0.01113898, + "auxiliary_loss_mlp": 0.01036386, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.03941607, + "epoch": 0.564677589057568, + "flos": 28329964087680.0, + "grad_norm": 1.8545056387285421, + "language_loss": 0.60528994, + "learning_rate": 1.6794977910530684e-06, + "loss": 0.62679285, + "num_input_tokens_seen": 202284285, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7421875, + "step": 9392, + "time_per_iteration": 2.524616003036499 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.01027429, + "balance_loss_clip": 1.01412547, + "balance_loss_mlp": 1.03683674, + "epoch": 0.564737712310236, + "flos": 22163743100160.0, + "grad_norm": 1.8891326454378248, + "language_loss": 0.81002814, + "learning_rate": 1.6791133687442937e-06, + "loss": 0.83136976, + "num_input_tokens_seen": 202303450, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 9393, + "time_per_iteration": 2.5394442081451416 + }, + { + "auxiliary_loss_clip": 0.01109875, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.0175252, + "balance_loss_mlp": 1.03945863, + "epoch": 0.564797835562904, + "flos": 20959011918720.0, + "grad_norm": 1.6361233529041357, + "language_loss": 0.87129962, + "learning_rate": 1.6787289586035725e-06, + "loss": 0.89269751, + "num_input_tokens_seen": 202322315, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9394, + "time_per_iteration": 2.4735207557678223 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.04019666, + "epoch": 0.5648579588155719, + "flos": 17420302805760.0, + "grad_norm": 2.1407868955990232, + "language_loss": 0.84850395, + "learning_rate": 1.6783445606454814e-06, + "loss": 0.8699013, + "num_input_tokens_seen": 202339905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 9395, + "time_per_iteration": 2.457840919494629 + }, + { + "auxiliary_loss_clip": 0.01029319, + "auxiliary_loss_mlp": 0.00999952, + "balance_loss_clip": 0.99876004, + "balance_loss_mlp": 1.00789344, + "epoch": 0.5649180820682399, + "flos": 69929568835200.0, + "grad_norm": 0.857023745969297, + "language_loss": 0.58308172, + "learning_rate": 1.677960174884597e-06, + "loss": 0.60337436, + "num_input_tokens_seen": 202397320, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.21484375, + "step": 9396, + "time_per_iteration": 3.073537588119507 + }, + { + "auxiliary_loss_clip": 0.01110535, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01818371, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5649782053209078, + "flos": 24973070641920.0, + "grad_norm": 2.248812637940723, + "language_loss": 0.70105237, + "learning_rate": 1.6775758013354943e-06, + "loss": 0.72246206, + "num_input_tokens_seen": 202416865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.72265625, + "step": 9397, + "time_per_iteration": 2.4962973594665527 + }, + { + "auxiliary_loss_clip": 0.01109847, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02008653, + "balance_loss_mlp": 1.03723562, + "epoch": 0.5650383285735758, + "flos": 21726602582400.0, + "grad_norm": 1.751232513493423, + "language_loss": 0.66376907, + "learning_rate": 1.67719144001275e-06, + "loss": 0.68519312, + "num_input_tokens_seen": 202436210, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 9398, + "time_per_iteration": 2.4747612476348877 + }, + { + "auxiliary_loss_clip": 0.01027927, + "auxiliary_loss_mlp": 0.0100079, + "balance_loss_clip": 0.99962217, + "balance_loss_mlp": 1.00642622, + "epoch": 0.5650984518262439, + "flos": 65904484636800.0, + "grad_norm": 0.8050196413226386, + "language_loss": 0.58135325, + "learning_rate": 1.6768070909309386e-06, + "loss": 0.60164046, + "num_input_tokens_seen": 202492925, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9399, + "time_per_iteration": 3.043860912322998 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.01034107, + "balance_loss_clip": 1.01959336, + "balance_loss_mlp": 1.03663015, + "epoch": 0.5651585750789118, + "flos": 21032592929280.0, + "grad_norm": 1.8022721102148394, + "language_loss": 0.72654182, + "learning_rate": 1.6764227541046347e-06, + "loss": 0.74797827, + "num_input_tokens_seen": 202511905, + "router_z_loss_clip": 0.14550781, + "router_z_loss_mlp": 0.7265625, + "step": 9400, + "time_per_iteration": 2.46345853805542 + }, + { + "auxiliary_loss_clip": 0.01112209, + "auxiliary_loss_mlp": 0.01036399, + "balance_loss_clip": 1.02223074, + "balance_loss_mlp": 1.03858781, + "epoch": 0.5652186983315798, + "flos": 18551919853440.0, + "grad_norm": 2.2275961694321254, + "language_loss": 0.61034292, + "learning_rate": 1.676038429548412e-06, + "loss": 0.63182896, + "num_input_tokens_seen": 202529815, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.734375, + "step": 9401, + "time_per_iteration": 2.4518327713012695 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.01590967, + "balance_loss_mlp": 1.03578329, + "epoch": 0.5652788215842477, + "flos": 18478662065280.0, + "grad_norm": 1.8211208041554372, + "language_loss": 0.81334603, + "learning_rate": 1.6756541172768453e-06, + "loss": 0.8346827, + "num_input_tokens_seen": 202547710, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9402, + "time_per_iteration": 2.4201457500457764 + }, + { + "auxiliary_loss_clip": 0.0110456, + "auxiliary_loss_mlp": 0.01033217, + "balance_loss_clip": 1.02154684, + "balance_loss_mlp": 1.03594768, + "epoch": 0.5653389448369157, + "flos": 30044052080640.0, + "grad_norm": 1.4814077209882908, + "language_loss": 0.77969164, + "learning_rate": 1.6752698173045068e-06, + "loss": 0.80106944, + "num_input_tokens_seen": 202568835, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9403, + "time_per_iteration": 2.5353829860687256 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01558244, + "balance_loss_mlp": 1.03666544, + "epoch": 0.5653990680895836, + "flos": 16727550128640.0, + "grad_norm": 1.6092170779922605, + "language_loss": 0.68699729, + "learning_rate": 1.6748855296459685e-06, + "loss": 0.70834613, + "num_input_tokens_seen": 202587385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9404, + "time_per_iteration": 2.4321181774139404 + }, + { + "auxiliary_loss_clip": 0.01102774, + "auxiliary_loss_mlp": 0.0103085, + "balance_loss_clip": 1.01951897, + "balance_loss_mlp": 1.03503776, + "epoch": 0.5654591913422516, + "flos": 14538256179840.0, + "grad_norm": 2.484491546437136, + "language_loss": 0.66842878, + "learning_rate": 1.6745012543158045e-06, + "loss": 0.68976498, + "num_input_tokens_seen": 202604815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 9405, + "time_per_iteration": 2.440232992172241 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.02001476, + "balance_loss_mlp": 1.03823268, + "epoch": 0.5655193145949196, + "flos": 26209905603840.0, + "grad_norm": 1.9824391842040467, + "language_loss": 0.74238181, + "learning_rate": 1.6741169913285852e-06, + "loss": 0.76374286, + "num_input_tokens_seen": 202623775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 9406, + "time_per_iteration": 2.4748172760009766 + }, + { + "auxiliary_loss_clip": 0.0110835, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.02006197, + "balance_loss_mlp": 1.03640151, + "epoch": 0.5655794378475876, + "flos": 25046579825280.0, + "grad_norm": 1.7875183280919196, + "language_loss": 0.79345733, + "learning_rate": 1.673732740698882e-06, + "loss": 0.81487745, + "num_input_tokens_seen": 202643375, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9407, + "time_per_iteration": 2.507815361022949 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.0192194, + "balance_loss_mlp": 1.03815317, + "epoch": 0.5656395611002555, + "flos": 31032852652800.0, + "grad_norm": 1.520930632215419, + "language_loss": 0.70626116, + "learning_rate": 1.6733485024412666e-06, + "loss": 0.7276209, + "num_input_tokens_seen": 202668400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 9408, + "time_per_iteration": 2.62674880027771 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01032436, + "balance_loss_clip": 1.02018738, + "balance_loss_mlp": 1.03758848, + "epoch": 0.5656996843529235, + "flos": 20229522606720.0, + "grad_norm": 2.0177540820880377, + "language_loss": 0.81701803, + "learning_rate": 1.672964276570308e-06, + "loss": 0.83840877, + "num_input_tokens_seen": 202685125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9409, + "time_per_iteration": 2.4532053470611572 + }, + { + "auxiliary_loss_clip": 0.01105936, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01446199, + "balance_loss_mlp": 1.03632855, + "epoch": 0.5657598076055914, + "flos": 20996251344000.0, + "grad_norm": 1.7583452820695855, + "language_loss": 0.77886415, + "learning_rate": 1.6725800631005776e-06, + "loss": 0.80018914, + "num_input_tokens_seen": 202703830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 9410, + "time_per_iteration": 2.441938877105713 + }, + { + "auxiliary_loss_clip": 0.01107661, + "auxiliary_loss_mlp": 0.01033936, + "balance_loss_clip": 1.02188444, + "balance_loss_mlp": 1.0371294, + "epoch": 0.5658199308582594, + "flos": 11545999649280.0, + "grad_norm": 2.4716186369957405, + "language_loss": 0.83512276, + "learning_rate": 1.6721958620466432e-06, + "loss": 0.85653877, + "num_input_tokens_seen": 202719835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 9411, + "time_per_iteration": 2.4718945026397705 + }, + { + "auxiliary_loss_clip": 0.01111478, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01870787, + "balance_loss_mlp": 1.03809881, + "epoch": 0.5658800541109275, + "flos": 14172146807040.0, + "grad_norm": 2.235812012909735, + "language_loss": 0.67052126, + "learning_rate": 1.6718116734230749e-06, + "loss": 0.69195151, + "num_input_tokens_seen": 202736795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9412, + "time_per_iteration": 2.4114651679992676 + }, + { + "auxiliary_loss_clip": 0.01102875, + "auxiliary_loss_mlp": 0.01027937, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03637409, + "epoch": 0.5659401773635954, + "flos": 27305073325440.0, + "grad_norm": 1.4642683426161254, + "language_loss": 0.58723432, + "learning_rate": 1.6714274972444413e-06, + "loss": 0.60854244, + "num_input_tokens_seen": 202756900, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6640625, + "step": 9413, + "time_per_iteration": 2.5274460315704346 + }, + { + "auxiliary_loss_clip": 0.01102994, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03515315, + "epoch": 0.5660003006162634, + "flos": 16728196573440.0, + "grad_norm": 1.4689493119012975, + "language_loss": 0.69065028, + "learning_rate": 1.6710433335253092e-06, + "loss": 0.71196759, + "num_input_tokens_seen": 202775145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9414, + "time_per_iteration": 2.4249722957611084 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.0162462, + "balance_loss_mlp": 1.03464198, + "epoch": 0.5660604238689313, + "flos": 21653452535040.0, + "grad_norm": 2.330719071721026, + "language_loss": 0.78351963, + "learning_rate": 1.670659182280247e-06, + "loss": 0.80479658, + "num_input_tokens_seen": 202794505, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 9415, + "time_per_iteration": 2.4853508472442627 + }, + { + "auxiliary_loss_clip": 0.01027693, + "auxiliary_loss_mlp": 0.01002734, + "balance_loss_clip": 1.00167274, + "balance_loss_mlp": 1.00642896, + "epoch": 0.5661205471215993, + "flos": 68824022083200.0, + "grad_norm": 0.686572948711127, + "language_loss": 0.49232727, + "learning_rate": 1.670275043523822e-06, + "loss": 0.51263154, + "num_input_tokens_seen": 202858580, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.21289062, + "step": 9416, + "time_per_iteration": 3.1817550659179688 + }, + { + "auxiliary_loss_clip": 0.01106414, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.02036452, + "balance_loss_mlp": 1.03713977, + "epoch": 0.5661806703742672, + "flos": 28621774177920.0, + "grad_norm": 1.6874553076405654, + "language_loss": 0.62577593, + "learning_rate": 1.6698909172706e-06, + "loss": 0.6471678, + "num_input_tokens_seen": 202878565, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 9417, + "time_per_iteration": 2.5856666564941406 + }, + { + "auxiliary_loss_clip": 0.01107822, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03606224, + "epoch": 0.5662407936269352, + "flos": 21397948116480.0, + "grad_norm": 1.797784660701456, + "language_loss": 0.68931323, + "learning_rate": 1.6695068035351479e-06, + "loss": 0.71070051, + "num_input_tokens_seen": 202897350, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71875, + "step": 9418, + "time_per_iteration": 2.4920060634613037 + }, + { + "auxiliary_loss_clip": 0.01105804, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.01873779, + "balance_loss_mlp": 1.035465, + "epoch": 0.5663009168796032, + "flos": 25660005315840.0, + "grad_norm": 1.9782803688051387, + "language_loss": 0.64613676, + "learning_rate": 1.6691227023320304e-06, + "loss": 0.66751719, + "num_input_tokens_seen": 202916745, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9419, + "time_per_iteration": 2.5130629539489746 + }, + { + "auxiliary_loss_clip": 0.01028877, + "auxiliary_loss_mlp": 0.01005663, + "balance_loss_clip": 1.00455463, + "balance_loss_mlp": 1.00721812, + "epoch": 0.5663610401322712, + "flos": 67930458422400.0, + "grad_norm": 0.7373486000439856, + "language_loss": 0.59778821, + "learning_rate": 1.6687386136758135e-06, + "loss": 0.61813354, + "num_input_tokens_seen": 202982375, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.21679688, + "step": 9420, + "time_per_iteration": 3.1712303161621094 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.01874661, + "balance_loss_mlp": 1.03477347, + "epoch": 0.5664211633849391, + "flos": 24609367480320.0, + "grad_norm": 1.7745364781392496, + "language_loss": 0.74103463, + "learning_rate": 1.6683545375810618e-06, + "loss": 0.76235008, + "num_input_tokens_seen": 203002430, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 9421, + "time_per_iteration": 2.4926223754882812 + }, + { + "auxiliary_loss_clip": 0.01109317, + "auxiliary_loss_mlp": 0.01035488, + "balance_loss_clip": 1.02292371, + "balance_loss_mlp": 1.03705812, + "epoch": 0.5664812866376071, + "flos": 11648811352320.0, + "grad_norm": 1.8540803425049197, + "language_loss": 0.72345394, + "learning_rate": 1.6679704740623389e-06, + "loss": 0.74490201, + "num_input_tokens_seen": 203019425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9422, + "time_per_iteration": 2.4081509113311768 + }, + { + "auxiliary_loss_clip": 0.01103997, + "auxiliary_loss_mlp": 0.01034779, + "balance_loss_clip": 1.02378821, + "balance_loss_mlp": 1.03694618, + "epoch": 0.566541409890275, + "flos": 24643985212800.0, + "grad_norm": 1.7305682094853587, + "language_loss": 0.81321973, + "learning_rate": 1.6675864231342085e-06, + "loss": 0.83460754, + "num_input_tokens_seen": 203039035, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 9423, + "time_per_iteration": 2.4871041774749756 + }, + { + "auxiliary_loss_clip": 0.01102932, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.0210824, + "balance_loss_mlp": 1.0354147, + "epoch": 0.566601533142943, + "flos": 22270577126400.0, + "grad_norm": 1.656660590859511, + "language_loss": 0.8069616, + "learning_rate": 1.6672023848112353e-06, + "loss": 0.82832569, + "num_input_tokens_seen": 203059320, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 9424, + "time_per_iteration": 2.4634275436401367 + }, + { + "auxiliary_loss_clip": 0.01111676, + "auxiliary_loss_mlp": 0.01032203, + "balance_loss_clip": 1.01844072, + "balance_loss_mlp": 1.03887486, + "epoch": 0.5666616563956111, + "flos": 29971656218880.0, + "grad_norm": 1.8161233698436283, + "language_loss": 0.78745866, + "learning_rate": 1.6668183591079805e-06, + "loss": 0.80889738, + "num_input_tokens_seen": 203078490, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 9425, + "time_per_iteration": 2.5064780712127686 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01028946, + "balance_loss_clip": 1.01658988, + "balance_loss_mlp": 1.03674626, + "epoch": 0.566721779648279, + "flos": 17781456101760.0, + "grad_norm": 1.8642193992685885, + "language_loss": 0.5897873, + "learning_rate": 1.6664343460390064e-06, + "loss": 0.61113673, + "num_input_tokens_seen": 203096065, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 9426, + "time_per_iteration": 2.4720263481140137 + }, + { + "auxiliary_loss_clip": 0.01110856, + "auxiliary_loss_mlp": 0.01030111, + "balance_loss_clip": 1.01804113, + "balance_loss_mlp": 1.03823078, + "epoch": 0.566781902900947, + "flos": 21033490769280.0, + "grad_norm": 2.0557394177022768, + "language_loss": 0.81685758, + "learning_rate": 1.6660503456188764e-06, + "loss": 0.83826721, + "num_input_tokens_seen": 203115270, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7265625, + "step": 9427, + "time_per_iteration": 3.872758388519287 + }, + { + "auxiliary_loss_clip": 0.01104828, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02394485, + "balance_loss_mlp": 1.03744185, + "epoch": 0.5668420261536149, + "flos": 23148593176320.0, + "grad_norm": 1.8776390907485432, + "language_loss": 0.86198628, + "learning_rate": 1.6656663578621498e-06, + "loss": 0.88339949, + "num_input_tokens_seen": 203134290, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9428, + "time_per_iteration": 2.4911303520202637 + }, + { + "auxiliary_loss_clip": 0.01112998, + "auxiliary_loss_mlp": 0.01036692, + "balance_loss_clip": 1.02427602, + "balance_loss_mlp": 1.04080331, + "epoch": 0.5669021494062829, + "flos": 22601601889920.0, + "grad_norm": 2.1518083513194552, + "language_loss": 0.74125421, + "learning_rate": 1.6652823827833886e-06, + "loss": 0.7627511, + "num_input_tokens_seen": 203152935, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9429, + "time_per_iteration": 3.9635231494903564 + }, + { + "auxiliary_loss_clip": 0.01109434, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.02127612, + "balance_loss_mlp": 1.03756118, + "epoch": 0.5669622726589508, + "flos": 17381231786880.0, + "grad_norm": 1.7976574461964, + "language_loss": 0.7496838, + "learning_rate": 1.6648984203971538e-06, + "loss": 0.77112365, + "num_input_tokens_seen": 203170110, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9430, + "time_per_iteration": 3.8817877769470215 + }, + { + "auxiliary_loss_clip": 0.01106735, + "auxiliary_loss_mlp": 0.01033765, + "balance_loss_clip": 1.02152157, + "balance_loss_mlp": 1.03621042, + "epoch": 0.5670223959116188, + "flos": 18763253521920.0, + "grad_norm": 2.3751678803775285, + "language_loss": 0.7272107, + "learning_rate": 1.6645144707180032e-06, + "loss": 0.74861568, + "num_input_tokens_seen": 203188825, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9431, + "time_per_iteration": 2.51401948928833 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01810944, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5670825191642868, + "flos": 13553334276480.0, + "grad_norm": 1.9291254540879526, + "language_loss": 0.73248518, + "learning_rate": 1.6641305337604984e-06, + "loss": 0.75378448, + "num_input_tokens_seen": 203206860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 9432, + "time_per_iteration": 2.4319839477539062 + }, + { + "auxiliary_loss_clip": 0.01107311, + "auxiliary_loss_mlp": 0.01032729, + "balance_loss_clip": 1.02087343, + "balance_loss_mlp": 1.03681755, + "epoch": 0.5671426424169548, + "flos": 22054035985920.0, + "grad_norm": 1.5888571716641233, + "language_loss": 0.77957594, + "learning_rate": 1.663746609539197e-06, + "loss": 0.80097634, + "num_input_tokens_seen": 203225625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 9433, + "time_per_iteration": 2.5169765949249268 + }, + { + "auxiliary_loss_clip": 0.01111851, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01645875, + "balance_loss_mlp": 1.03870261, + "epoch": 0.5672027656696227, + "flos": 21323972056320.0, + "grad_norm": 1.7704673621088174, + "language_loss": 0.63839334, + "learning_rate": 1.6633626980686582e-06, + "loss": 0.65982234, + "num_input_tokens_seen": 203242920, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.73046875, + "step": 9434, + "time_per_iteration": 2.4372098445892334 + }, + { + "auxiliary_loss_clip": 0.01102835, + "auxiliary_loss_mlp": 0.01026729, + "balance_loss_clip": 1.01495695, + "balance_loss_mlp": 1.03529072, + "epoch": 0.5672628889222907, + "flos": 23514056104320.0, + "grad_norm": 1.879777953851778, + "language_loss": 0.66724491, + "learning_rate": 1.6629787993634399e-06, + "loss": 0.68854052, + "num_input_tokens_seen": 203261995, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 9435, + "time_per_iteration": 2.5156021118164062 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.01032518, + "balance_loss_clip": 1.02028716, + "balance_loss_mlp": 1.03599691, + "epoch": 0.5673230121749586, + "flos": 27121928855040.0, + "grad_norm": 1.3893571871291595, + "language_loss": 0.71398699, + "learning_rate": 1.6625949134380984e-06, + "loss": 0.73535293, + "num_input_tokens_seen": 203280670, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 9436, + "time_per_iteration": 2.4815714359283447 + }, + { + "auxiliary_loss_clip": 0.01109121, + "auxiliary_loss_mlp": 0.01029795, + "balance_loss_clip": 1.01723647, + "balance_loss_mlp": 1.03756368, + "epoch": 0.5673831354276266, + "flos": 31141985149440.0, + "grad_norm": 1.6654091498260946, + "language_loss": 0.73988926, + "learning_rate": 1.6622110403071921e-06, + "loss": 0.76127845, + "num_input_tokens_seen": 203304800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9437, + "time_per_iteration": 2.5965943336486816 + }, + { + "auxiliary_loss_clip": 0.01112439, + "auxiliary_loss_mlp": 0.01032397, + "balance_loss_clip": 1.01942062, + "balance_loss_mlp": 1.04159832, + "epoch": 0.5674432586802945, + "flos": 27673193859840.0, + "grad_norm": 2.439390833366172, + "language_loss": 0.60905057, + "learning_rate": 1.661827179985277e-06, + "loss": 0.63049889, + "num_input_tokens_seen": 203324060, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 9438, + "time_per_iteration": 2.512578010559082 + }, + { + "auxiliary_loss_clip": 0.01105416, + "auxiliary_loss_mlp": 0.01028895, + "balance_loss_clip": 1.01714146, + "balance_loss_mlp": 1.03543329, + "epoch": 0.5675033819329626, + "flos": 26615157822720.0, + "grad_norm": 1.6600048607148805, + "language_loss": 0.75087392, + "learning_rate": 1.661443332486909e-06, + "loss": 0.77221704, + "num_input_tokens_seen": 203344360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9439, + "time_per_iteration": 2.531489133834839 + }, + { + "auxiliary_loss_clip": 0.01107772, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0159471, + "balance_loss_mlp": 1.03828883, + "epoch": 0.5675635051856306, + "flos": 19098372435840.0, + "grad_norm": 1.8930047517001285, + "language_loss": 0.8361944, + "learning_rate": 1.6610594978266438e-06, + "loss": 0.857566, + "num_input_tokens_seen": 203362115, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 9440, + "time_per_iteration": 2.4386231899261475 + }, + { + "auxiliary_loss_clip": 0.01111147, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.02123022, + "balance_loss_mlp": 1.03704751, + "epoch": 0.5676236284382985, + "flos": 17566315591680.0, + "grad_norm": 2.0023123091206467, + "language_loss": 0.7550447, + "learning_rate": 1.6606756760190365e-06, + "loss": 0.77650005, + "num_input_tokens_seen": 203380550, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9441, + "time_per_iteration": 2.4788920879364014 + }, + { + "auxiliary_loss_clip": 0.01108262, + "auxiliary_loss_mlp": 0.01032728, + "balance_loss_clip": 1.02022874, + "balance_loss_mlp": 1.0381217, + "epoch": 0.5676837516909665, + "flos": 15954069634560.0, + "grad_norm": 2.003106565766755, + "language_loss": 0.83199525, + "learning_rate": 1.6602918670786413e-06, + "loss": 0.85340512, + "num_input_tokens_seen": 203396590, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9442, + "time_per_iteration": 2.4066359996795654 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.01030609, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.0388906, + "epoch": 0.5677438749436344, + "flos": 18295912644480.0, + "grad_norm": 2.099488848818881, + "language_loss": 0.74606907, + "learning_rate": 1.6599080710200126e-06, + "loss": 0.76741344, + "num_input_tokens_seen": 203414280, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 9443, + "time_per_iteration": 2.4699020385742188 + }, + { + "auxiliary_loss_clip": 0.0110959, + "auxiliary_loss_mlp": 0.01034472, + "balance_loss_clip": 1.02184737, + "balance_loss_mlp": 1.03892851, + "epoch": 0.5678039981963025, + "flos": 17931311642880.0, + "grad_norm": 1.9353911334921245, + "language_loss": 0.77443373, + "learning_rate": 1.6595242878577046e-06, + "loss": 0.79587436, + "num_input_tokens_seen": 203433280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9444, + "time_per_iteration": 2.418164014816284 + }, + { + "auxiliary_loss_clip": 0.01110885, + "auxiliary_loss_mlp": 0.01037563, + "balance_loss_clip": 1.02498603, + "balance_loss_mlp": 1.03886068, + "epoch": 0.5678641214489704, + "flos": 19316350120320.0, + "grad_norm": 1.6369546772732781, + "language_loss": 0.80673003, + "learning_rate": 1.6591405176062687e-06, + "loss": 0.82821453, + "num_input_tokens_seen": 203449935, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71875, + "step": 9445, + "time_per_iteration": 2.4474682807922363 + }, + { + "auxiliary_loss_clip": 0.01105393, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.0146122, + "balance_loss_mlp": 1.03579414, + "epoch": 0.5679242447016384, + "flos": 27751084502400.0, + "grad_norm": 1.310891415120181, + "language_loss": 0.70843911, + "learning_rate": 1.658756760280259e-06, + "loss": 0.72976023, + "num_input_tokens_seen": 203473025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9446, + "time_per_iteration": 2.5338428020477295 + }, + { + "auxiliary_loss_clip": 0.01111342, + "auxiliary_loss_mlp": 0.01031244, + "balance_loss_clip": 1.018489, + "balance_loss_mlp": 1.03815663, + "epoch": 0.5679843679543063, + "flos": 23769093646080.0, + "grad_norm": 1.8305308972685952, + "language_loss": 0.7354359, + "learning_rate": 1.6583730158942276e-06, + "loss": 0.75686181, + "num_input_tokens_seen": 203492895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 9447, + "time_per_iteration": 2.5152740478515625 + }, + { + "auxiliary_loss_clip": 0.01110587, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.01963568, + "balance_loss_mlp": 1.0382061, + "epoch": 0.5680444912069743, + "flos": 25591883172480.0, + "grad_norm": 2.262443693729548, + "language_loss": 0.74931812, + "learning_rate": 1.657989284462725e-06, + "loss": 0.77074468, + "num_input_tokens_seen": 203513710, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.72265625, + "step": 9448, + "time_per_iteration": 2.468688488006592 + }, + { + "auxiliary_loss_clip": 0.0111579, + "auxiliary_loss_mlp": 0.01035922, + "balance_loss_clip": 1.0227201, + "balance_loss_mlp": 1.04175234, + "epoch": 0.5681046144596422, + "flos": 23695799944320.0, + "grad_norm": 2.1518179799978356, + "language_loss": 0.76137841, + "learning_rate": 1.6576055660003038e-06, + "loss": 0.78289551, + "num_input_tokens_seen": 203531630, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9449, + "time_per_iteration": 2.510693311691284 + }, + { + "auxiliary_loss_clip": 0.01110533, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02046347, + "balance_loss_mlp": 1.03867984, + "epoch": 0.5681647377123102, + "flos": 28000770917760.0, + "grad_norm": 1.6592475910366993, + "language_loss": 0.74742198, + "learning_rate": 1.6572218605215128e-06, + "loss": 0.76886022, + "num_input_tokens_seen": 203551885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9450, + "time_per_iteration": 2.5034866333007812 + }, + { + "auxiliary_loss_clip": 0.01112382, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02404845, + "balance_loss_mlp": 1.04081213, + "epoch": 0.5682248609649782, + "flos": 22747758330240.0, + "grad_norm": 3.8340234675809017, + "language_loss": 0.67216206, + "learning_rate": 1.6568381680409038e-06, + "loss": 0.69364059, + "num_input_tokens_seen": 203572250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.71875, + "step": 9451, + "time_per_iteration": 2.503805637359619 + }, + { + "auxiliary_loss_clip": 0.01115025, + "auxiliary_loss_mlp": 0.01031671, + "balance_loss_clip": 1.01743114, + "balance_loss_mlp": 1.03788531, + "epoch": 0.5682849842176462, + "flos": 21288600138240.0, + "grad_norm": 1.8009184427821863, + "language_loss": 0.71697223, + "learning_rate": 1.656454488573026e-06, + "loss": 0.7384392, + "num_input_tokens_seen": 203590605, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7734375, + "step": 9452, + "time_per_iteration": 2.4519643783569336 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01947021, + "balance_loss_mlp": 1.03679395, + "epoch": 0.5683451074703142, + "flos": 21141689512320.0, + "grad_norm": 1.6525298490216664, + "language_loss": 0.70272237, + "learning_rate": 1.656070822132428e-06, + "loss": 0.72409141, + "num_input_tokens_seen": 203610080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9453, + "time_per_iteration": 2.5260796546936035 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01034031, + "balance_loss_clip": 1.02190745, + "balance_loss_mlp": 1.03889799, + "epoch": 0.5684052307229821, + "flos": 22344481359360.0, + "grad_norm": 2.2860746429720833, + "language_loss": 0.69546616, + "learning_rate": 1.6556871687336592e-06, + "loss": 0.71690989, + "num_input_tokens_seen": 203630060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9454, + "time_per_iteration": 2.457736015319824 + }, + { + "auxiliary_loss_clip": 0.01103936, + "auxiliary_loss_mlp": 0.01029774, + "balance_loss_clip": 1.01837158, + "balance_loss_mlp": 1.03616297, + "epoch": 0.5684653539756501, + "flos": 21798639308160.0, + "grad_norm": 1.8998375571155763, + "language_loss": 0.60430771, + "learning_rate": 1.6553035283912671e-06, + "loss": 0.6256448, + "num_input_tokens_seen": 203649065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9455, + "time_per_iteration": 2.506091594696045 + }, + { + "auxiliary_loss_clip": 0.01116555, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01808953, + "balance_loss_mlp": 1.0424788, + "epoch": 0.568525477228318, + "flos": 22999635475200.0, + "grad_norm": 2.102932497256003, + "language_loss": 0.72914851, + "learning_rate": 1.6549199011198e-06, + "loss": 0.75062263, + "num_input_tokens_seen": 203667545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9456, + "time_per_iteration": 2.439221143722534 + }, + { + "auxiliary_loss_clip": 0.01108291, + "auxiliary_loss_mlp": 0.01032712, + "balance_loss_clip": 1.02125049, + "balance_loss_mlp": 1.03915823, + "epoch": 0.568585600480986, + "flos": 21392489249280.0, + "grad_norm": 1.5692423529190727, + "language_loss": 0.76402628, + "learning_rate": 1.6545362869338048e-06, + "loss": 0.78543633, + "num_input_tokens_seen": 203686025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.69140625, + "step": 9457, + "time_per_iteration": 2.475327491760254 + }, + { + "auxiliary_loss_clip": 0.01110625, + "auxiliary_loss_mlp": 0.01036596, + "balance_loss_clip": 1.02338171, + "balance_loss_mlp": 1.03828931, + "epoch": 0.568645723733654, + "flos": 30007351359360.0, + "grad_norm": 1.8808926225586853, + "language_loss": 0.66305089, + "learning_rate": 1.6541526858478285e-06, + "loss": 0.68452305, + "num_input_tokens_seen": 203705540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9458, + "time_per_iteration": 2.5271642208099365 + }, + { + "auxiliary_loss_clip": 0.01111416, + "auxiliary_loss_mlp": 0.01027286, + "balance_loss_clip": 1.01424456, + "balance_loss_mlp": 1.03845215, + "epoch": 0.568705846986322, + "flos": 20412667077120.0, + "grad_norm": 2.21557799175144, + "language_loss": 0.67912495, + "learning_rate": 1.6537690978764167e-06, + "loss": 0.70051199, + "num_input_tokens_seen": 203723670, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.73046875, + "step": 9459, + "time_per_iteration": 2.534374237060547 + }, + { + "auxiliary_loss_clip": 0.0111268, + "auxiliary_loss_mlp": 0.01032195, + "balance_loss_clip": 1.01942194, + "balance_loss_mlp": 1.04046702, + "epoch": 0.5687659702389899, + "flos": 17456752131840.0, + "grad_norm": 3.4353012744759335, + "language_loss": 0.77999187, + "learning_rate": 1.6533855230341155e-06, + "loss": 0.8014406, + "num_input_tokens_seen": 203739705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9460, + "time_per_iteration": 2.434570789337158 + }, + { + "auxiliary_loss_clip": 0.01109374, + "auxiliary_loss_mlp": 0.01035426, + "balance_loss_clip": 1.02221131, + "balance_loss_mlp": 1.03767824, + "epoch": 0.5688260934916579, + "flos": 25406081095680.0, + "grad_norm": 1.7026913094631195, + "language_loss": 0.71950358, + "learning_rate": 1.65300196133547e-06, + "loss": 0.74095166, + "num_input_tokens_seen": 203759000, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9461, + "time_per_iteration": 2.5527231693267822 + }, + { + "auxiliary_loss_clip": 0.01109281, + "auxiliary_loss_mlp": 0.01030442, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.03814745, + "epoch": 0.5688862167443258, + "flos": 21608024808960.0, + "grad_norm": 2.8717094069028617, + "language_loss": 0.72976351, + "learning_rate": 1.6526184127950249e-06, + "loss": 0.75116074, + "num_input_tokens_seen": 203774295, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9462, + "time_per_iteration": 2.422624111175537 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01026733, + "balance_loss_clip": 1.01590848, + "balance_loss_mlp": 1.03640223, + "epoch": 0.5689463399969938, + "flos": 22418996123520.0, + "grad_norm": 1.8933127595424433, + "language_loss": 0.7326529, + "learning_rate": 1.6522348774273246e-06, + "loss": 0.75395983, + "num_input_tokens_seen": 203792710, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 9463, + "time_per_iteration": 2.466491460800171 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030565, + "balance_loss_clip": 1.01810765, + "balance_loss_mlp": 1.03583968, + "epoch": 0.5690064632496618, + "flos": 18296810484480.0, + "grad_norm": 1.7491308846328846, + "language_loss": 0.74368691, + "learning_rate": 1.6518513552469123e-06, + "loss": 0.76505989, + "num_input_tokens_seen": 203811645, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9464, + "time_per_iteration": 2.406031370162964 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01036268, + "balance_loss_clip": 1.02382255, + "balance_loss_mlp": 1.03892159, + "epoch": 0.5690665865023298, + "flos": 21579260993280.0, + "grad_norm": 1.714079864723851, + "language_loss": 0.84333247, + "learning_rate": 1.6514678462683312e-06, + "loss": 0.86480176, + "num_input_tokens_seen": 203830040, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9465, + "time_per_iteration": 2.514777183532715 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01029517, + "balance_loss_clip": 1.01757169, + "balance_loss_mlp": 1.03546405, + "epoch": 0.5691267097549978, + "flos": 24421446501120.0, + "grad_norm": 1.8589721720108319, + "language_loss": 0.7226572, + "learning_rate": 1.651084350506125e-06, + "loss": 0.74398845, + "num_input_tokens_seen": 203851245, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 9466, + "time_per_iteration": 2.475188732147217 + }, + { + "auxiliary_loss_clip": 0.01029497, + "auxiliary_loss_mlp": 0.01005385, + "balance_loss_clip": 1.00427043, + "balance_loss_mlp": 1.0077517, + "epoch": 0.5691868330076657, + "flos": 61657906199040.0, + "grad_norm": 0.7081654133828948, + "language_loss": 0.55354679, + "learning_rate": 1.6507008679748343e-06, + "loss": 0.57389557, + "num_input_tokens_seen": 203916400, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21777344, + "step": 9467, + "time_per_iteration": 3.185729742050171 + }, + { + "auxiliary_loss_clip": 0.01111718, + "auxiliary_loss_mlp": 0.01032639, + "balance_loss_clip": 1.01861966, + "balance_loss_mlp": 1.03861189, + "epoch": 0.5692469562603337, + "flos": 21325193118720.0, + "grad_norm": 2.2495356407271854, + "language_loss": 0.63680357, + "learning_rate": 1.6503173986890023e-06, + "loss": 0.65824717, + "num_input_tokens_seen": 203935870, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9468, + "time_per_iteration": 2.4373323917388916 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01587772, + "balance_loss_mlp": 1.03801632, + "epoch": 0.5693070795130016, + "flos": 23367899664000.0, + "grad_norm": 1.8525378978069993, + "language_loss": 0.79367, + "learning_rate": 1.64993394266317e-06, + "loss": 0.81503832, + "num_input_tokens_seen": 203954950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9469, + "time_per_iteration": 3.8166728019714355 + }, + { + "auxiliary_loss_clip": 0.0111246, + "auxiliary_loss_mlp": 0.01041609, + "balance_loss_clip": 1.02810884, + "balance_loss_mlp": 1.03860152, + "epoch": 0.5693672027656697, + "flos": 18697250280960.0, + "grad_norm": 1.9923541987272968, + "language_loss": 0.69606256, + "learning_rate": 1.6495504999118769e-06, + "loss": 0.71760333, + "num_input_tokens_seen": 203972715, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9470, + "time_per_iteration": 2.4572556018829346 + }, + { + "auxiliary_loss_clip": 0.01107845, + "auxiliary_loss_mlp": 0.01031625, + "balance_loss_clip": 1.01882184, + "balance_loss_mlp": 1.03729832, + "epoch": 0.5694273260183376, + "flos": 20449188230400.0, + "grad_norm": 1.5518202279497855, + "language_loss": 0.74791551, + "learning_rate": 1.6491670704496644e-06, + "loss": 0.76931024, + "num_input_tokens_seen": 203990775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9471, + "time_per_iteration": 3.926091432571411 + }, + { + "auxiliary_loss_clip": 0.01108882, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02006447, + "balance_loss_mlp": 1.03928542, + "epoch": 0.5694874492710056, + "flos": 17603195880960.0, + "grad_norm": 1.9616270612820847, + "language_loss": 0.57270539, + "learning_rate": 1.6487836542910716e-06, + "loss": 0.59412026, + "num_input_tokens_seen": 204008845, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9472, + "time_per_iteration": 3.8452813625335693 + }, + { + "auxiliary_loss_clip": 0.01103976, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01886702, + "balance_loss_mlp": 1.03722382, + "epoch": 0.5695475725236735, + "flos": 13370836250880.0, + "grad_norm": 1.803122156723958, + "language_loss": 0.73615265, + "learning_rate": 1.648400251450638e-06, + "loss": 0.75750041, + "num_input_tokens_seen": 204023755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 9473, + "time_per_iteration": 2.4637346267700195 + }, + { + "auxiliary_loss_clip": 0.01028797, + "auxiliary_loss_mlp": 0.01004803, + "balance_loss_clip": 1.00359905, + "balance_loss_mlp": 1.00722575, + "epoch": 0.5696076957763415, + "flos": 68174398661760.0, + "grad_norm": 0.6476817486149063, + "language_loss": 0.57596511, + "learning_rate": 1.6480168619429023e-06, + "loss": 0.59630114, + "num_input_tokens_seen": 204091255, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21679688, + "step": 9474, + "time_per_iteration": 3.09342622756958 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02095199, + "balance_loss_mlp": 1.03955841, + "epoch": 0.5696678190290094, + "flos": 33838301525760.0, + "grad_norm": 1.7127367690076127, + "language_loss": 0.53624213, + "learning_rate": 1.6476334857824017e-06, + "loss": 0.55767071, + "num_input_tokens_seen": 204113285, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6875, + "step": 9475, + "time_per_iteration": 2.6103556156158447 + }, + { + "auxiliary_loss_clip": 0.01110194, + "auxiliary_loss_mlp": 0.01032608, + "balance_loss_clip": 1.01969719, + "balance_loss_mlp": 1.03914022, + "epoch": 0.5697279422816774, + "flos": 26356600748160.0, + "grad_norm": 1.5220537573313933, + "language_loss": 0.79891974, + "learning_rate": 1.647250122983675e-06, + "loss": 0.82034773, + "num_input_tokens_seen": 204133045, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9476, + "time_per_iteration": 2.5519871711730957 + }, + { + "auxiliary_loss_clip": 0.01113361, + "auxiliary_loss_mlp": 0.0103459, + "balance_loss_clip": 1.02248454, + "balance_loss_mlp": 1.04071283, + "epoch": 0.5697880655343454, + "flos": 22930507751040.0, + "grad_norm": 2.93922823935367, + "language_loss": 0.66361278, + "learning_rate": 1.6468667735612592e-06, + "loss": 0.68509227, + "num_input_tokens_seen": 204152590, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7265625, + "step": 9477, + "time_per_iteration": 2.556461811065674 + }, + { + "auxiliary_loss_clip": 0.01107946, + "auxiliary_loss_mlp": 0.01030235, + "balance_loss_clip": 1.01737881, + "balance_loss_mlp": 1.03697014, + "epoch": 0.5698481887870134, + "flos": 26761314263040.0, + "grad_norm": 1.8188873629652118, + "language_loss": 0.70921832, + "learning_rate": 1.6464834375296906e-06, + "loss": 0.73060012, + "num_input_tokens_seen": 204171815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9478, + "time_per_iteration": 2.5022385120391846 + }, + { + "auxiliary_loss_clip": 0.01104521, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.01615286, + "balance_loss_mlp": 1.03824937, + "epoch": 0.5699083120396814, + "flos": 15742269089280.0, + "grad_norm": 1.5933810632151244, + "language_loss": 0.69647413, + "learning_rate": 1.6461001149035055e-06, + "loss": 0.71779716, + "num_input_tokens_seen": 204188535, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9479, + "time_per_iteration": 2.544422149658203 + }, + { + "auxiliary_loss_clip": 0.01103959, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01729965, + "balance_loss_mlp": 1.03753138, + "epoch": 0.5699684352923493, + "flos": 19537272720000.0, + "grad_norm": 1.4338626650619826, + "language_loss": 0.71364439, + "learning_rate": 1.6457168056972392e-06, + "loss": 0.7349726, + "num_input_tokens_seen": 204208365, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 9480, + "time_per_iteration": 2.5680878162384033 + }, + { + "auxiliary_loss_clip": 0.01106665, + "auxiliary_loss_mlp": 0.01029171, + "balance_loss_clip": 1.01615977, + "balance_loss_mlp": 1.03689599, + "epoch": 0.5700285585450173, + "flos": 16253349753600.0, + "grad_norm": 2.894404055389402, + "language_loss": 0.71927261, + "learning_rate": 1.6453335099254276e-06, + "loss": 0.74063098, + "num_input_tokens_seen": 204226560, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9481, + "time_per_iteration": 2.4576737880706787 + }, + { + "auxiliary_loss_clip": 0.01108109, + "auxiliary_loss_mlp": 0.01030771, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.03819919, + "epoch": 0.5700886817976852, + "flos": 19864993432320.0, + "grad_norm": 1.6819252466037764, + "language_loss": 0.78134334, + "learning_rate": 1.6449502276026041e-06, + "loss": 0.80273211, + "num_input_tokens_seen": 204245410, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9482, + "time_per_iteration": 2.4244532585144043 + }, + { + "auxiliary_loss_clip": 0.01107032, + "auxiliary_loss_mlp": 0.01026772, + "balance_loss_clip": 1.01462436, + "balance_loss_mlp": 1.0372206, + "epoch": 0.5701488050503533, + "flos": 23841704989440.0, + "grad_norm": 2.1918431398286686, + "language_loss": 0.77641654, + "learning_rate": 1.6445669587433043e-06, + "loss": 0.79775453, + "num_input_tokens_seen": 204264840, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9483, + "time_per_iteration": 2.4840755462646484 + }, + { + "auxiliary_loss_clip": 0.01106594, + "auxiliary_loss_mlp": 0.01033667, + "balance_loss_clip": 1.0217644, + "balance_loss_mlp": 1.037377, + "epoch": 0.5702089283030212, + "flos": 23659673840640.0, + "grad_norm": 2.4281256207615702, + "language_loss": 0.8098467, + "learning_rate": 1.6441837033620612e-06, + "loss": 0.8312493, + "num_input_tokens_seen": 204284335, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9484, + "time_per_iteration": 2.4726784229278564 + }, + { + "auxiliary_loss_clip": 0.01107682, + "auxiliary_loss_mlp": 0.01030904, + "balance_loss_clip": 1.01798165, + "balance_loss_mlp": 1.03656316, + "epoch": 0.5702690515556892, + "flos": 27891171544320.0, + "grad_norm": 9.175896769478262, + "language_loss": 0.60516417, + "learning_rate": 1.6438004614734073e-06, + "loss": 0.62655002, + "num_input_tokens_seen": 204302590, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9485, + "time_per_iteration": 2.5423014163970947 + }, + { + "auxiliary_loss_clip": 0.01107039, + "auxiliary_loss_mlp": 0.01033529, + "balance_loss_clip": 1.02155399, + "balance_loss_mlp": 1.03619039, + "epoch": 0.5703291748083571, + "flos": 24023951619840.0, + "grad_norm": 1.6367482229195742, + "language_loss": 0.65350515, + "learning_rate": 1.6434172330918757e-06, + "loss": 0.67491084, + "num_input_tokens_seen": 204323055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9486, + "time_per_iteration": 2.4597506523132324 + }, + { + "auxiliary_loss_clip": 0.01028731, + "auxiliary_loss_mlp": 0.01001408, + "balance_loss_clip": 1.00001299, + "balance_loss_mlp": 1.0072422, + "epoch": 0.5703892980610251, + "flos": 57023382919680.0, + "grad_norm": 0.6639559744347447, + "language_loss": 0.48005819, + "learning_rate": 1.6430340182319978e-06, + "loss": 0.50035954, + "num_input_tokens_seen": 204386160, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21484375, + "step": 9487, + "time_per_iteration": 3.139495849609375 + }, + { + "auxiliary_loss_clip": 0.01107717, + "auxiliary_loss_mlp": 0.01034452, + "balance_loss_clip": 1.02199435, + "balance_loss_mlp": 1.03726935, + "epoch": 0.570449421313693, + "flos": 24351025887360.0, + "grad_norm": 3.049670437576873, + "language_loss": 0.86058694, + "learning_rate": 1.6426508169083067e-06, + "loss": 0.88200867, + "num_input_tokens_seen": 204406315, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9488, + "time_per_iteration": 2.474616289138794 + }, + { + "auxiliary_loss_clip": 0.01111409, + "auxiliary_loss_mlp": 0.01033222, + "balance_loss_clip": 1.02065694, + "balance_loss_mlp": 1.03814459, + "epoch": 0.570509544566361, + "flos": 24828566227200.0, + "grad_norm": 1.4447763000600118, + "language_loss": 0.79057854, + "learning_rate": 1.6422676291353314e-06, + "loss": 0.81202483, + "num_input_tokens_seen": 204427645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.73046875, + "step": 9489, + "time_per_iteration": 2.5065059661865234 + }, + { + "auxiliary_loss_clip": 0.01109061, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01978409, + "balance_loss_mlp": 1.03869939, + "epoch": 0.570569667819029, + "flos": 21397301671680.0, + "grad_norm": 1.7186115243718623, + "language_loss": 0.69906354, + "learning_rate": 1.641884454927604e-06, + "loss": 0.72046351, + "num_input_tokens_seen": 204445910, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.703125, + "step": 9490, + "time_per_iteration": 2.431102752685547 + }, + { + "auxiliary_loss_clip": 0.01107746, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03836775, + "epoch": 0.570629791071697, + "flos": 23216751233280.0, + "grad_norm": 1.5472180668734579, + "language_loss": 0.76222062, + "learning_rate": 1.6415012942996548e-06, + "loss": 0.78358686, + "num_input_tokens_seen": 204464680, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9491, + "time_per_iteration": 2.4962759017944336 + }, + { + "auxiliary_loss_clip": 0.01028502, + "auxiliary_loss_mlp": 0.01004058, + "balance_loss_clip": 1.00276494, + "balance_loss_mlp": 1.00699997, + "epoch": 0.570689914324365, + "flos": 65284666525440.0, + "grad_norm": 0.7944597612251223, + "language_loss": 0.57379556, + "learning_rate": 1.641118147266011e-06, + "loss": 0.59412122, + "num_input_tokens_seen": 204525580, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.21484375, + "step": 9492, + "time_per_iteration": 3.0417838096618652 + }, + { + "auxiliary_loss_clip": 0.01108126, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02009118, + "balance_loss_mlp": 1.03813028, + "epoch": 0.5707500375770329, + "flos": 21141904993920.0, + "grad_norm": 1.7217254573804663, + "language_loss": 0.71475661, + "learning_rate": 1.6407350138412035e-06, + "loss": 0.73617041, + "num_input_tokens_seen": 204541320, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69921875, + "step": 9493, + "time_per_iteration": 2.4304161071777344 + }, + { + "auxiliary_loss_clip": 0.01111414, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01807487, + "balance_loss_mlp": 1.0384568, + "epoch": 0.5708101608297009, + "flos": 20812747737600.0, + "grad_norm": 1.5364295350921338, + "language_loss": 0.77778745, + "learning_rate": 1.6403518940397606e-06, + "loss": 0.7992059, + "num_input_tokens_seen": 204560275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.73046875, + "step": 9494, + "time_per_iteration": 2.495940923690796 + }, + { + "auxiliary_loss_clip": 0.01111436, + "auxiliary_loss_mlp": 0.01033742, + "balance_loss_clip": 1.02015769, + "balance_loss_mlp": 1.03685784, + "epoch": 0.5708702840823688, + "flos": 25812338895360.0, + "grad_norm": 2.275602748234112, + "language_loss": 0.80153453, + "learning_rate": 1.6399687878762096e-06, + "loss": 0.82298625, + "num_input_tokens_seen": 204579430, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.74609375, + "step": 9495, + "time_per_iteration": 2.464423418045044 + }, + { + "auxiliary_loss_clip": 0.01117033, + "auxiliary_loss_mlp": 0.01039006, + "balance_loss_clip": 1.02393782, + "balance_loss_mlp": 1.04061937, + "epoch": 0.5709304073350369, + "flos": 23651916503040.0, + "grad_norm": 3.463558707959815, + "language_loss": 0.66745138, + "learning_rate": 1.6395856953650784e-06, + "loss": 0.68901181, + "num_input_tokens_seen": 204597710, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.765625, + "step": 9496, + "time_per_iteration": 2.460413694381714 + }, + { + "auxiliary_loss_clip": 0.01113845, + "auxiliary_loss_mlp": 0.01037224, + "balance_loss_clip": 1.02361047, + "balance_loss_mlp": 1.03911281, + "epoch": 0.5709905305877048, + "flos": 16107552449280.0, + "grad_norm": 2.3847499053839067, + "language_loss": 0.6960094, + "learning_rate": 1.6392026165208938e-06, + "loss": 0.71752012, + "num_input_tokens_seen": 204616140, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.74609375, + "step": 9497, + "time_per_iteration": 2.4051928520202637 + }, + { + "auxiliary_loss_clip": 0.01111626, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01712704, + "balance_loss_mlp": 1.03815341, + "epoch": 0.5710506538403728, + "flos": 24750819239040.0, + "grad_norm": 1.8796088723274103, + "language_loss": 0.81200778, + "learning_rate": 1.638819551358182e-06, + "loss": 0.83343083, + "num_input_tokens_seen": 204636470, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.734375, + "step": 9498, + "time_per_iteration": 2.4764246940612793 + }, + { + "auxiliary_loss_clip": 0.01111235, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02085817, + "balance_loss_mlp": 1.03874803, + "epoch": 0.5711107770930407, + "flos": 21982250655360.0, + "grad_norm": 1.7968018947144153, + "language_loss": 0.66237068, + "learning_rate": 1.638436499891469e-06, + "loss": 0.68383479, + "num_input_tokens_seen": 204656640, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.72265625, + "step": 9499, + "time_per_iteration": 2.4842209815979004 + }, + { + "auxiliary_loss_clip": 0.01109681, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01994455, + "balance_loss_mlp": 1.03881264, + "epoch": 0.5711709003457087, + "flos": 19574009354880.0, + "grad_norm": 2.341189176641991, + "language_loss": 0.71659786, + "learning_rate": 1.6380534621352805e-06, + "loss": 0.73802078, + "num_input_tokens_seen": 204675475, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9500, + "time_per_iteration": 2.474968671798706 + }, + { + "auxiliary_loss_clip": 0.01113264, + "auxiliary_loss_mlp": 0.01032452, + "balance_loss_clip": 1.01911259, + "balance_loss_mlp": 1.03896177, + "epoch": 0.5712310235983766, + "flos": 24242683489920.0, + "grad_norm": 1.7510176581013566, + "language_loss": 0.76148939, + "learning_rate": 1.6376704381041407e-06, + "loss": 0.78294659, + "num_input_tokens_seen": 204695385, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7421875, + "step": 9501, + "time_per_iteration": 2.481982707977295 + }, + { + "auxiliary_loss_clip": 0.01112022, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.03827071, + "epoch": 0.5712911468510447, + "flos": 20996143603200.0, + "grad_norm": 1.6683693962706503, + "language_loss": 0.75252867, + "learning_rate": 1.6372874278125742e-06, + "loss": 0.7739566, + "num_input_tokens_seen": 204714730, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.73828125, + "step": 9502, + "time_per_iteration": 2.4645891189575195 + }, + { + "auxiliary_loss_clip": 0.01108222, + "auxiliary_loss_mlp": 0.01026372, + "balance_loss_clip": 1.01413548, + "balance_loss_mlp": 1.03776038, + "epoch": 0.5713512701037126, + "flos": 18916987731840.0, + "grad_norm": 3.8399261830524076, + "language_loss": 0.82397389, + "learning_rate": 1.636904431275105e-06, + "loss": 0.84531981, + "num_input_tokens_seen": 204735025, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9503, + "time_per_iteration": 2.4945871829986572 + }, + { + "auxiliary_loss_clip": 0.01108893, + "auxiliary_loss_mlp": 0.01035138, + "balance_loss_clip": 1.02267456, + "balance_loss_mlp": 1.03824139, + "epoch": 0.5714113933563806, + "flos": 17413443308160.0, + "grad_norm": 2.09557851646671, + "language_loss": 0.85872537, + "learning_rate": 1.6365214485062553e-06, + "loss": 0.8801657, + "num_input_tokens_seen": 204751365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9504, + "time_per_iteration": 2.3861567974090576 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.0102491, + "balance_loss_clip": 1.01232708, + "balance_loss_mlp": 1.03753018, + "epoch": 0.5714715166090486, + "flos": 20193360589440.0, + "grad_norm": 1.9315555303189194, + "language_loss": 0.75182885, + "learning_rate": 1.6361384795205496e-06, + "loss": 0.7731415, + "num_input_tokens_seen": 204768980, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9505, + "time_per_iteration": 2.462536573410034 + }, + { + "auxiliary_loss_clip": 0.01108197, + "auxiliary_loss_mlp": 0.01031273, + "balance_loss_clip": 1.01970994, + "balance_loss_mlp": 1.03717351, + "epoch": 0.5715316398617165, + "flos": 18551668458240.0, + "grad_norm": 1.6115496885789637, + "language_loss": 0.81918782, + "learning_rate": 1.635755524332509e-06, + "loss": 0.84058261, + "num_input_tokens_seen": 204788110, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.7109375, + "step": 9506, + "time_per_iteration": 2.467022180557251 + }, + { + "auxiliary_loss_clip": 0.01106598, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01546264, + "balance_loss_mlp": 1.03684521, + "epoch": 0.5715917631143845, + "flos": 18478195188480.0, + "grad_norm": 1.6660041805363315, + "language_loss": 0.77144134, + "learning_rate": 1.6353725829566552e-06, + "loss": 0.79278708, + "num_input_tokens_seen": 204807240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9507, + "time_per_iteration": 2.4672694206237793 + }, + { + "auxiliary_loss_clip": 0.01110344, + "auxiliary_loss_mlp": 0.01035951, + "balance_loss_clip": 1.02186108, + "balance_loss_mlp": 1.03726792, + "epoch": 0.5716518863670524, + "flos": 24020037037440.0, + "grad_norm": 2.45367934924197, + "language_loss": 0.68435538, + "learning_rate": 1.63498965540751e-06, + "loss": 0.7058183, + "num_input_tokens_seen": 204826415, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.73046875, + "step": 9508, + "time_per_iteration": 2.464097261428833 + }, + { + "auxiliary_loss_clip": 0.01110426, + "auxiliary_loss_mlp": 0.01029096, + "balance_loss_clip": 1.01629877, + "balance_loss_mlp": 1.03722239, + "epoch": 0.5717120096197205, + "flos": 17819485626240.0, + "grad_norm": 2.0052906721639836, + "language_loss": 0.79419613, + "learning_rate": 1.634606741699593e-06, + "loss": 0.81559134, + "num_input_tokens_seen": 204844305, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 9509, + "time_per_iteration": 2.504023551940918 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.02089834, + "balance_loss_mlp": 1.03664279, + "epoch": 0.5717721328723884, + "flos": 21866043179520.0, + "grad_norm": 1.839099502620817, + "language_loss": 0.7265448, + "learning_rate": 1.6342238418474255e-06, + "loss": 0.74793911, + "num_input_tokens_seen": 204861765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 9510, + "time_per_iteration": 3.815577507019043 + }, + { + "auxiliary_loss_clip": 0.01107423, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01810002, + "balance_loss_mlp": 1.03668678, + "epoch": 0.5718322561250564, + "flos": 28437624126720.0, + "grad_norm": 1.3819155223826083, + "language_loss": 0.69395494, + "learning_rate": 1.6338409558655264e-06, + "loss": 0.71533018, + "num_input_tokens_seen": 204882505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9511, + "time_per_iteration": 2.5445902347564697 + }, + { + "auxiliary_loss_clip": 0.0110843, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02426028, + "balance_loss_mlp": 1.03781009, + "epoch": 0.5718923793777243, + "flos": 13551825905280.0, + "grad_norm": 1.8672218842214499, + "language_loss": 0.61565816, + "learning_rate": 1.6334580837684152e-06, + "loss": 0.63710779, + "num_input_tokens_seen": 204899830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9512, + "time_per_iteration": 3.8341665267944336 + }, + { + "auxiliary_loss_clip": 0.01106641, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.0164628, + "balance_loss_mlp": 1.03667331, + "epoch": 0.5719525026303923, + "flos": 17822035491840.0, + "grad_norm": 4.170405845803043, + "language_loss": 0.7586627, + "learning_rate": 1.6330752255706104e-06, + "loss": 0.78001529, + "num_input_tokens_seen": 204918100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9513, + "time_per_iteration": 5.299789667129517 + }, + { + "auxiliary_loss_clip": 0.01028017, + "auxiliary_loss_mlp": 0.00999308, + "balance_loss_clip": 0.99809855, + "balance_loss_mlp": 1.00645494, + "epoch": 0.5720126258830602, + "flos": 61298042814720.0, + "grad_norm": 0.8876641821203675, + "language_loss": 0.6684342, + "learning_rate": 1.6326923812866288e-06, + "loss": 0.68870747, + "num_input_tokens_seen": 204972925, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21582031, + "step": 9514, + "time_per_iteration": 3.0201942920684814 + }, + { + "auxiliary_loss_clip": 0.01114776, + "auxiliary_loss_mlp": 0.0104014, + "balance_loss_clip": 1.02696776, + "balance_loss_mlp": 1.04034257, + "epoch": 0.5720727491357283, + "flos": 23988040997760.0, + "grad_norm": 2.046774799271973, + "language_loss": 0.81059563, + "learning_rate": 1.63230955093099e-06, + "loss": 0.8321448, + "num_input_tokens_seen": 204990910, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.74609375, + "step": 9515, + "time_per_iteration": 2.440838575363159 + }, + { + "auxiliary_loss_clip": 0.01104804, + "auxiliary_loss_mlp": 0.01027026, + "balance_loss_clip": 1.01469994, + "balance_loss_mlp": 1.03602076, + "epoch": 0.5721328723883962, + "flos": 23405426398080.0, + "grad_norm": 1.8601231206296425, + "language_loss": 0.86125237, + "learning_rate": 1.6319267345182092e-06, + "loss": 0.88257068, + "num_input_tokens_seen": 205010500, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9516, + "time_per_iteration": 2.477764368057251 + }, + { + "auxiliary_loss_clip": 0.01104974, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01654696, + "balance_loss_mlp": 1.03561044, + "epoch": 0.5721929956410642, + "flos": 18804910320000.0, + "grad_norm": 1.8026555789133811, + "language_loss": 0.87531322, + "learning_rate": 1.6315439320628038e-06, + "loss": 0.89665627, + "num_input_tokens_seen": 205028560, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9517, + "time_per_iteration": 2.425889253616333 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.0103103, + "balance_loss_clip": 1.01804841, + "balance_loss_mlp": 1.03662252, + "epoch": 0.5722531188937322, + "flos": 27196659100800.0, + "grad_norm": 1.765867586501473, + "language_loss": 0.8479656, + "learning_rate": 1.6311611435792893e-06, + "loss": 0.86934435, + "num_input_tokens_seen": 205048650, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9518, + "time_per_iteration": 2.515908718109131 + }, + { + "auxiliary_loss_clip": 0.01102718, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01909649, + "balance_loss_mlp": 1.03518391, + "epoch": 0.5723132421464001, + "flos": 15195672852480.0, + "grad_norm": 1.8620909672026127, + "language_loss": 0.7880826, + "learning_rate": 1.6307783690821812e-06, + "loss": 0.80942279, + "num_input_tokens_seen": 205066480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 9519, + "time_per_iteration": 2.400693893432617 + }, + { + "auxiliary_loss_clip": 0.01105893, + "auxiliary_loss_mlp": 0.01030213, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.03658307, + "epoch": 0.5723733653990681, + "flos": 27599433281280.0, + "grad_norm": 1.5438950427184228, + "language_loss": 0.82970679, + "learning_rate": 1.6303956085859944e-06, + "loss": 0.85106778, + "num_input_tokens_seen": 205087475, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 9520, + "time_per_iteration": 2.5011074542999268 + }, + { + "auxiliary_loss_clip": 0.01110791, + "auxiliary_loss_mlp": 0.01039958, + "balance_loss_clip": 1.0268625, + "balance_loss_mlp": 1.03927732, + "epoch": 0.572433488651736, + "flos": 18222870337920.0, + "grad_norm": 2.123220131944119, + "language_loss": 0.71853209, + "learning_rate": 1.630012862105243e-06, + "loss": 0.74003959, + "num_input_tokens_seen": 205106495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9521, + "time_per_iteration": 2.4156429767608643 + }, + { + "auxiliary_loss_clip": 0.01106899, + "auxiliary_loss_mlp": 0.01033537, + "balance_loss_clip": 1.02117443, + "balance_loss_mlp": 1.0362848, + "epoch": 0.5724936119044041, + "flos": 31249106484480.0, + "grad_norm": 1.6921576366095024, + "language_loss": 0.77830148, + "learning_rate": 1.6296301296544415e-06, + "loss": 0.79970586, + "num_input_tokens_seen": 205128285, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9522, + "time_per_iteration": 2.5682153701782227 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.02081728, + "balance_loss_mlp": 1.03628266, + "epoch": 0.572553735157072, + "flos": 19202189719680.0, + "grad_norm": 1.565759699688635, + "language_loss": 0.71671265, + "learning_rate": 1.629247411248102e-06, + "loss": 0.73805845, + "num_input_tokens_seen": 205146595, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9523, + "time_per_iteration": 2.402622938156128 + }, + { + "auxiliary_loss_clip": 0.01104927, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01511574, + "balance_loss_mlp": 1.03639328, + "epoch": 0.57261385840974, + "flos": 21214911386880.0, + "grad_norm": 1.6537237547017787, + "language_loss": 0.70046443, + "learning_rate": 1.628864706900738e-06, + "loss": 0.72177982, + "num_input_tokens_seen": 205164295, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9524, + "time_per_iteration": 2.478745698928833 + }, + { + "auxiliary_loss_clip": 0.01107047, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.01944458, + "balance_loss_mlp": 1.03783476, + "epoch": 0.5726739816624079, + "flos": 33984529793280.0, + "grad_norm": 1.431879051430598, + "language_loss": 0.65079439, + "learning_rate": 1.6284820166268615e-06, + "loss": 0.67217362, + "num_input_tokens_seen": 205185380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 9525, + "time_per_iteration": 2.5722320079803467 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01029624, + "balance_loss_clip": 1.01825702, + "balance_loss_mlp": 1.03385937, + "epoch": 0.5727341049150759, + "flos": 24275972419200.0, + "grad_norm": 1.7621674355193322, + "language_loss": 0.72353703, + "learning_rate": 1.628099340440984e-06, + "loss": 0.74485326, + "num_input_tokens_seen": 205204895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 9526, + "time_per_iteration": 2.5182504653930664 + }, + { + "auxiliary_loss_clip": 0.01102827, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02022004, + "balance_loss_mlp": 1.03617597, + "epoch": 0.5727942281677438, + "flos": 28400564269440.0, + "grad_norm": 1.6243804380597333, + "language_loss": 0.80131519, + "learning_rate": 1.6277166783576176e-06, + "loss": 0.8226589, + "num_input_tokens_seen": 205223440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9527, + "time_per_iteration": 2.556168556213379 + }, + { + "auxiliary_loss_clip": 0.01104789, + "auxiliary_loss_mlp": 0.01036832, + "balance_loss_clip": 1.02399302, + "balance_loss_mlp": 1.03633451, + "epoch": 0.5728543514204119, + "flos": 19536769929600.0, + "grad_norm": 1.8731920412295517, + "language_loss": 0.71818352, + "learning_rate": 1.6273340303912713e-06, + "loss": 0.7395997, + "num_input_tokens_seen": 205242800, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 9528, + "time_per_iteration": 2.502045154571533 + }, + { + "auxiliary_loss_clip": 0.01107269, + "auxiliary_loss_mlp": 0.01033482, + "balance_loss_clip": 1.02113199, + "balance_loss_mlp": 1.03742957, + "epoch": 0.5729144746730798, + "flos": 21506757390720.0, + "grad_norm": 1.9532280974694858, + "language_loss": 0.853854, + "learning_rate": 1.6269513965564557e-06, + "loss": 0.87526155, + "num_input_tokens_seen": 205259465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 9529, + "time_per_iteration": 2.468146324157715 + }, + { + "auxiliary_loss_clip": 0.01028852, + "auxiliary_loss_mlp": 0.0100185, + "balance_loss_clip": 1.0006398, + "balance_loss_mlp": 1.00712085, + "epoch": 0.5729745979257478, + "flos": 58681628242560.0, + "grad_norm": 0.7632636876236247, + "language_loss": 0.56091511, + "learning_rate": 1.6265687768676813e-06, + "loss": 0.58122212, + "num_input_tokens_seen": 205314100, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21777344, + "step": 9530, + "time_per_iteration": 2.955796003341675 + }, + { + "auxiliary_loss_clip": 0.01109966, + "auxiliary_loss_mlp": 0.01023962, + "balance_loss_clip": 1.01241684, + "balance_loss_mlp": 1.03820443, + "epoch": 0.5730347211784158, + "flos": 18552099421440.0, + "grad_norm": 2.605800582107851, + "language_loss": 0.66667211, + "learning_rate": 1.6261861713394553e-06, + "loss": 0.68801141, + "num_input_tokens_seen": 205333420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.71875, + "step": 9531, + "time_per_iteration": 2.4874041080474854 + }, + { + "auxiliary_loss_clip": 0.01107074, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.0189929, + "balance_loss_mlp": 1.0362972, + "epoch": 0.5730948444310837, + "flos": 38031482396160.0, + "grad_norm": 2.577990064326961, + "language_loss": 0.75677073, + "learning_rate": 1.6258035799862876e-06, + "loss": 0.77815616, + "num_input_tokens_seen": 205350995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9532, + "time_per_iteration": 2.653745651245117 + }, + { + "auxiliary_loss_clip": 0.01105987, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01779997, + "balance_loss_mlp": 1.03636467, + "epoch": 0.5731549676837517, + "flos": 25227066689280.0, + "grad_norm": 3.4857041080787696, + "language_loss": 0.78726482, + "learning_rate": 1.625421002822686e-06, + "loss": 0.80862474, + "num_input_tokens_seen": 205372675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9533, + "time_per_iteration": 2.5444183349609375 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02156746, + "balance_loss_mlp": 1.03771889, + "epoch": 0.5732150909364196, + "flos": 23368222886400.0, + "grad_norm": 2.5155449858561036, + "language_loss": 0.8564285, + "learning_rate": 1.6250384398631574e-06, + "loss": 0.87781423, + "num_input_tokens_seen": 205392590, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9534, + "time_per_iteration": 2.611769199371338 + }, + { + "auxiliary_loss_clip": 0.01108602, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02241731, + "balance_loss_mlp": 1.03833961, + "epoch": 0.5732752141890877, + "flos": 23079357711360.0, + "grad_norm": 1.7913378128419626, + "language_loss": 0.74880809, + "learning_rate": 1.6246558911222085e-06, + "loss": 0.7702536, + "num_input_tokens_seen": 205414885, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 9535, + "time_per_iteration": 2.5294063091278076 + }, + { + "auxiliary_loss_clip": 0.01113223, + "auxiliary_loss_mlp": 0.010319, + "balance_loss_clip": 1.01927602, + "balance_loss_mlp": 1.04021287, + "epoch": 0.5733353374417556, + "flos": 24352282863360.0, + "grad_norm": 1.60935564318513, + "language_loss": 0.70712042, + "learning_rate": 1.624273356614346e-06, + "loss": 0.72857165, + "num_input_tokens_seen": 205434440, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73046875, + "step": 9536, + "time_per_iteration": 2.5115044116973877 + }, + { + "auxiliary_loss_clip": 0.01104773, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.01741457, + "balance_loss_mlp": 1.03604972, + "epoch": 0.5733954606944236, + "flos": 27198849830400.0, + "grad_norm": 1.9605571924010112, + "language_loss": 0.69843078, + "learning_rate": 1.6238908363540755e-06, + "loss": 0.71977001, + "num_input_tokens_seen": 205454225, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 9537, + "time_per_iteration": 2.485203266143799 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.01033894, + "balance_loss_clip": 1.02179384, + "balance_loss_mlp": 1.03693986, + "epoch": 0.5734555839470915, + "flos": 28765129357440.0, + "grad_norm": 1.9885156073739136, + "language_loss": 0.6257112, + "learning_rate": 1.623508330355902e-06, + "loss": 0.64711761, + "num_input_tokens_seen": 205474750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9538, + "time_per_iteration": 2.5242531299591064 + }, + { + "auxiliary_loss_clip": 0.01106895, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02131939, + "balance_loss_mlp": 1.03750122, + "epoch": 0.5735157071997595, + "flos": 22966813422720.0, + "grad_norm": 1.847251631174476, + "language_loss": 0.83067656, + "learning_rate": 1.6231258386343306e-06, + "loss": 0.85208571, + "num_input_tokens_seen": 205495495, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9539, + "time_per_iteration": 2.4557297229766846 + }, + { + "auxiliary_loss_clip": 0.01108422, + "auxiliary_loss_mlp": 0.01034021, + "balance_loss_clip": 1.02155805, + "balance_loss_mlp": 1.03672779, + "epoch": 0.5735758304524274, + "flos": 18989455420800.0, + "grad_norm": 1.9303873756935568, + "language_loss": 0.73266071, + "learning_rate": 1.6227433612038647e-06, + "loss": 0.75408518, + "num_input_tokens_seen": 205510070, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9540, + "time_per_iteration": 2.449195384979248 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01025926, + "balance_loss_clip": 1.01486361, + "balance_loss_mlp": 1.03386962, + "epoch": 0.5736359537050955, + "flos": 28397942576640.0, + "grad_norm": 1.7719156274309316, + "language_loss": 0.80036277, + "learning_rate": 1.6223608980790089e-06, + "loss": 0.82164454, + "num_input_tokens_seen": 205530190, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 9541, + "time_per_iteration": 2.4807605743408203 + }, + { + "auxiliary_loss_clip": 0.01109647, + "auxiliary_loss_mlp": 0.01035157, + "balance_loss_clip": 1.02247286, + "balance_loss_mlp": 1.03748846, + "epoch": 0.5736960769577634, + "flos": 15627210848640.0, + "grad_norm": 2.3537030152809817, + "language_loss": 0.64358872, + "learning_rate": 1.6219784492742654e-06, + "loss": 0.66503674, + "num_input_tokens_seen": 205547380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 9542, + "time_per_iteration": 2.417178153991699 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01681268, + "balance_loss_mlp": 1.03586972, + "epoch": 0.5737562002104314, + "flos": 18003994813440.0, + "grad_norm": 2.222303069950764, + "language_loss": 0.82983625, + "learning_rate": 1.6215960148041365e-06, + "loss": 0.85118151, + "num_input_tokens_seen": 205566540, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 9543, + "time_per_iteration": 2.4162886142730713 + }, + { + "auxiliary_loss_clip": 0.01111645, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01729405, + "balance_loss_mlp": 1.0378089, + "epoch": 0.5738163234630994, + "flos": 20698192287360.0, + "grad_norm": 2.297441344794182, + "language_loss": 0.73850191, + "learning_rate": 1.6212135946831257e-06, + "loss": 0.75992632, + "num_input_tokens_seen": 205584200, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9544, + "time_per_iteration": 2.4531123638153076 + }, + { + "auxiliary_loss_clip": 0.01110237, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.01687646, + "balance_loss_mlp": 1.03741252, + "epoch": 0.5738764467157673, + "flos": 23149311448320.0, + "grad_norm": 2.106910148542404, + "language_loss": 0.75869375, + "learning_rate": 1.620831188925733e-06, + "loss": 0.78009301, + "num_input_tokens_seen": 205604675, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 9545, + "time_per_iteration": 2.446340799331665 + }, + { + "auxiliary_loss_clip": 0.01109663, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02086437, + "balance_loss_mlp": 1.03903508, + "epoch": 0.5739365699684353, + "flos": 29492930730240.0, + "grad_norm": 1.6841481616941998, + "language_loss": 0.56267381, + "learning_rate": 1.620448797546459e-06, + "loss": 0.58410275, + "num_input_tokens_seen": 205624680, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 9546, + "time_per_iteration": 2.5431458950042725 + }, + { + "auxiliary_loss_clip": 0.01109256, + "auxiliary_loss_mlp": 0.01032725, + "balance_loss_clip": 1.02027345, + "balance_loss_mlp": 1.0375458, + "epoch": 0.5739966932211032, + "flos": 14027247342720.0, + "grad_norm": 2.2354008467729236, + "language_loss": 0.76396316, + "learning_rate": 1.6200664205598055e-06, + "loss": 0.78538299, + "num_input_tokens_seen": 205641950, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9547, + "time_per_iteration": 2.399355173110962 + }, + { + "auxiliary_loss_clip": 0.01108464, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01847458, + "balance_loss_mlp": 1.03692102, + "epoch": 0.5740568164737713, + "flos": 19062030850560.0, + "grad_norm": 3.5736288481687457, + "language_loss": 0.74030554, + "learning_rate": 1.6196840579802704e-06, + "loss": 0.76169997, + "num_input_tokens_seen": 205660130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9548, + "time_per_iteration": 2.438188314437866 + }, + { + "auxiliary_loss_clip": 0.01107619, + "auxiliary_loss_mlp": 0.01033978, + "balance_loss_clip": 1.02162778, + "balance_loss_mlp": 1.03630018, + "epoch": 0.5741169397264392, + "flos": 22127832478080.0, + "grad_norm": 2.070673757769185, + "language_loss": 0.6898725, + "learning_rate": 1.619301709822355e-06, + "loss": 0.71128839, + "num_input_tokens_seen": 205678895, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.71484375, + "step": 9549, + "time_per_iteration": 2.4443182945251465 + }, + { + "auxiliary_loss_clip": 0.01109324, + "auxiliary_loss_mlp": 0.01029147, + "balance_loss_clip": 1.01756024, + "balance_loss_mlp": 1.0398941, + "epoch": 0.5741770629791072, + "flos": 24936836797440.0, + "grad_norm": 1.5143454441571018, + "language_loss": 0.79360747, + "learning_rate": 1.6189193761005564e-06, + "loss": 0.81499219, + "num_input_tokens_seen": 205698450, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6953125, + "step": 9550, + "time_per_iteration": 2.570117473602295 + }, + { + "auxiliary_loss_clip": 0.0111024, + "auxiliary_loss_mlp": 0.01030597, + "balance_loss_clip": 1.01844966, + "balance_loss_mlp": 1.03862011, + "epoch": 0.5742371862317751, + "flos": 18801462614400.0, + "grad_norm": 1.8121895379081407, + "language_loss": 0.67906272, + "learning_rate": 1.6185370568293727e-06, + "loss": 0.70047116, + "num_input_tokens_seen": 205714870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71484375, + "step": 9551, + "time_per_iteration": 2.423403024673462 + }, + { + "auxiliary_loss_clip": 0.01109924, + "auxiliary_loss_mlp": 0.01036266, + "balance_loss_clip": 1.02370107, + "balance_loss_mlp": 1.03743887, + "epoch": 0.5742973094844431, + "flos": 24460661174400.0, + "grad_norm": 1.628701607162486, + "language_loss": 0.71362531, + "learning_rate": 1.6181547520233031e-06, + "loss": 0.73508722, + "num_input_tokens_seen": 205736045, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.72265625, + "step": 9552, + "time_per_iteration": 3.886622428894043 + }, + { + "auxiliary_loss_clip": 0.01109635, + "auxiliary_loss_mlp": 0.01031692, + "balance_loss_clip": 1.01972914, + "balance_loss_mlp": 1.03975332, + "epoch": 0.574357432737111, + "flos": 21652770176640.0, + "grad_norm": 1.7228318188262413, + "language_loss": 0.79922652, + "learning_rate": 1.617772461696843e-06, + "loss": 0.82063985, + "num_input_tokens_seen": 205754445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9553, + "time_per_iteration": 2.431051731109619 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01030986, + "balance_loss_clip": 1.01900589, + "balance_loss_mlp": 1.03611398, + "epoch": 0.5744175559897791, + "flos": 16544728880640.0, + "grad_norm": 2.015136287210995, + "language_loss": 0.83396381, + "learning_rate": 1.6173901858644895e-06, + "loss": 0.85536349, + "num_input_tokens_seen": 205770595, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.73046875, + "step": 9554, + "time_per_iteration": 3.823064088821411 + }, + { + "auxiliary_loss_clip": 0.0111382, + "auxiliary_loss_mlp": 0.01036712, + "balance_loss_clip": 1.02347982, + "balance_loss_mlp": 1.04021072, + "epoch": 0.574477679242447, + "flos": 24207598880640.0, + "grad_norm": 1.4846822756962552, + "language_loss": 0.70777845, + "learning_rate": 1.6170079245407385e-06, + "loss": 0.72928381, + "num_input_tokens_seen": 205791935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9555, + "time_per_iteration": 5.333508491516113 + }, + { + "auxiliary_loss_clip": 0.01109263, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01494122, + "balance_loss_mlp": 1.03861225, + "epoch": 0.574537802495115, + "flos": 14903000835840.0, + "grad_norm": 2.115239569910986, + "language_loss": 0.72206348, + "learning_rate": 1.6166256777400853e-06, + "loss": 0.7434299, + "num_input_tokens_seen": 205807260, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9556, + "time_per_iteration": 2.4479689598083496 + }, + { + "auxiliary_loss_clip": 0.01109212, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.02174449, + "balance_loss_mlp": 1.03852749, + "epoch": 0.5745979257477829, + "flos": 24934969290240.0, + "grad_norm": 1.5580789907924004, + "language_loss": 0.73779786, + "learning_rate": 1.6162434454770248e-06, + "loss": 0.75923818, + "num_input_tokens_seen": 205826885, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 9557, + "time_per_iteration": 2.53330397605896 + }, + { + "auxiliary_loss_clip": 0.01108535, + "auxiliary_loss_mlp": 0.01033277, + "balance_loss_clip": 1.02114749, + "balance_loss_mlp": 1.03805625, + "epoch": 0.5746580490004509, + "flos": 17235757704960.0, + "grad_norm": 1.551535187819687, + "language_loss": 0.67825913, + "learning_rate": 1.6158612277660514e-06, + "loss": 0.69967735, + "num_input_tokens_seen": 205844630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9558, + "time_per_iteration": 2.4345078468322754 + }, + { + "auxiliary_loss_clip": 0.01115654, + "auxiliary_loss_mlp": 0.01039699, + "balance_loss_clip": 1.02509618, + "balance_loss_mlp": 1.03993464, + "epoch": 0.5747181722531189, + "flos": 13187871348480.0, + "grad_norm": 2.018077791857229, + "language_loss": 0.71494532, + "learning_rate": 1.615479024621659e-06, + "loss": 0.73649883, + "num_input_tokens_seen": 205860960, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 9559, + "time_per_iteration": 2.4112660884857178 + }, + { + "auxiliary_loss_clip": 0.01109449, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01856709, + "balance_loss_mlp": 1.03951454, + "epoch": 0.5747782955057869, + "flos": 22963006581120.0, + "grad_norm": 1.8277860809166269, + "language_loss": 0.79002881, + "learning_rate": 1.6150968360583398e-06, + "loss": 0.81141782, + "num_input_tokens_seen": 205880675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.69921875, + "step": 9560, + "time_per_iteration": 2.461737871170044 + }, + { + "auxiliary_loss_clip": 0.01110078, + "auxiliary_loss_mlp": 0.01029167, + "balance_loss_clip": 1.01649547, + "balance_loss_mlp": 1.03796887, + "epoch": 0.5748384187584549, + "flos": 23403235668480.0, + "grad_norm": 2.312922307701609, + "language_loss": 0.64114952, + "learning_rate": 1.614714662090588e-06, + "loss": 0.66254199, + "num_input_tokens_seen": 205900050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9561, + "time_per_iteration": 2.4589121341705322 + }, + { + "auxiliary_loss_clip": 0.01116817, + "auxiliary_loss_mlp": 0.01037364, + "balance_loss_clip": 1.02403021, + "balance_loss_mlp": 1.04126084, + "epoch": 0.5748985420111228, + "flos": 17785514338560.0, + "grad_norm": 1.619271715020599, + "language_loss": 0.71404445, + "learning_rate": 1.6143325027328945e-06, + "loss": 0.73558629, + "num_input_tokens_seen": 205918855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7578125, + "step": 9562, + "time_per_iteration": 2.4472360610961914 + }, + { + "auxiliary_loss_clip": 0.01108014, + "auxiliary_loss_mlp": 0.01036964, + "balance_loss_clip": 1.0256269, + "balance_loss_mlp": 1.03870499, + "epoch": 0.5749586652637908, + "flos": 19866250408320.0, + "grad_norm": 1.47664891140277, + "language_loss": 0.84212148, + "learning_rate": 1.613950357999751e-06, + "loss": 0.86357129, + "num_input_tokens_seen": 205936970, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6953125, + "step": 9563, + "time_per_iteration": 2.448540449142456 + }, + { + "auxiliary_loss_clip": 0.01112952, + "auxiliary_loss_mlp": 0.01035939, + "balance_loss_clip": 1.02251637, + "balance_loss_mlp": 1.03915787, + "epoch": 0.5750187885164587, + "flos": 21287235421440.0, + "grad_norm": 2.1518785584706266, + "language_loss": 0.57469738, + "learning_rate": 1.6135682279056488e-06, + "loss": 0.59618628, + "num_input_tokens_seen": 205954630, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73828125, + "step": 9564, + "time_per_iteration": 2.455137252807617 + }, + { + "auxiliary_loss_clip": 0.01104038, + "auxiliary_loss_mlp": 0.0102815, + "balance_loss_clip": 1.01645529, + "balance_loss_mlp": 1.03663075, + "epoch": 0.5750789117691267, + "flos": 18804658924800.0, + "grad_norm": 1.7205024550895016, + "language_loss": 0.75828826, + "learning_rate": 1.613186112465078e-06, + "loss": 0.7796101, + "num_input_tokens_seen": 205971510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9565, + "time_per_iteration": 2.4293572902679443 + }, + { + "auxiliary_loss_clip": 0.01030195, + "auxiliary_loss_mlp": 0.01000571, + "balance_loss_clip": 0.9991762, + "balance_loss_mlp": 1.00864065, + "epoch": 0.5751390350217946, + "flos": 70663224124800.0, + "grad_norm": 0.7426631899706556, + "language_loss": 0.60724127, + "learning_rate": 1.6128040116925287e-06, + "loss": 0.62754893, + "num_input_tokens_seen": 206035125, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.21582031, + "step": 9566, + "time_per_iteration": 3.156651496887207 + }, + { + "auxiliary_loss_clip": 0.01109259, + "auxiliary_loss_mlp": 0.01033183, + "balance_loss_clip": 1.02127385, + "balance_loss_mlp": 1.03952003, + "epoch": 0.5751991582744627, + "flos": 14246338348800.0, + "grad_norm": 1.8230299531471923, + "language_loss": 0.7537874, + "learning_rate": 1.6124219256024901e-06, + "loss": 0.77521175, + "num_input_tokens_seen": 206052075, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9567, + "time_per_iteration": 2.414881706237793 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01692748, + "balance_loss_mlp": 1.03808224, + "epoch": 0.5752592815271306, + "flos": 18328160079360.0, + "grad_norm": 1.5717614086198337, + "language_loss": 0.74559051, + "learning_rate": 1.6120398542094504e-06, + "loss": 0.76697284, + "num_input_tokens_seen": 206069970, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 9568, + "time_per_iteration": 2.458827495574951 + }, + { + "auxiliary_loss_clip": 0.0111112, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.0394876, + "epoch": 0.5753194047797986, + "flos": 20922742160640.0, + "grad_norm": 1.7630953099139652, + "language_loss": 0.70951653, + "learning_rate": 1.6116577975278994e-06, + "loss": 0.73092568, + "num_input_tokens_seen": 206088950, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 9569, + "time_per_iteration": 2.4545505046844482 + }, + { + "auxiliary_loss_clip": 0.01112247, + "auxiliary_loss_mlp": 0.01040682, + "balance_loss_clip": 1.02746797, + "balance_loss_mlp": 1.04058015, + "epoch": 0.5753795280324665, + "flos": 19281804215040.0, + "grad_norm": 1.9393871177420576, + "language_loss": 0.55699342, + "learning_rate": 1.6112757555723223e-06, + "loss": 0.57852268, + "num_input_tokens_seen": 206107780, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9570, + "time_per_iteration": 2.478793144226074 + }, + { + "auxiliary_loss_clip": 0.01106131, + "auxiliary_loss_mlp": 0.01038048, + "balance_loss_clip": 1.02648425, + "balance_loss_mlp": 1.03744042, + "epoch": 0.5754396512851345, + "flos": 21652877917440.0, + "grad_norm": 1.6217673569741213, + "language_loss": 0.64154774, + "learning_rate": 1.6108937283572082e-06, + "loss": 0.6629895, + "num_input_tokens_seen": 206127445, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9571, + "time_per_iteration": 2.4446957111358643 + }, + { + "auxiliary_loss_clip": 0.01108191, + "auxiliary_loss_mlp": 0.0103505, + "balance_loss_clip": 1.02213967, + "balance_loss_mlp": 1.03693449, + "epoch": 0.5754997745378025, + "flos": 51021700179840.0, + "grad_norm": 1.5404037339802243, + "language_loss": 0.67144608, + "learning_rate": 1.6105117158970434e-06, + "loss": 0.69287848, + "num_input_tokens_seen": 206152005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9572, + "time_per_iteration": 2.739871025085449 + }, + { + "auxiliary_loss_clip": 0.0110922, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.01920414, + "balance_loss_mlp": 1.03968024, + "epoch": 0.5755598977904705, + "flos": 22856890826880.0, + "grad_norm": 2.3042557910685897, + "language_loss": 0.72336781, + "learning_rate": 1.6101297182063123e-06, + "loss": 0.74477673, + "num_input_tokens_seen": 206169875, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9573, + "time_per_iteration": 2.446484088897705 + }, + { + "auxiliary_loss_clip": 0.01105342, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01808691, + "balance_loss_mlp": 1.03999066, + "epoch": 0.5756200210431385, + "flos": 38472824805120.0, + "grad_norm": 1.9447567655956284, + "language_loss": 0.76657987, + "learning_rate": 1.6097477352995022e-06, + "loss": 0.78792316, + "num_input_tokens_seen": 206192635, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 9574, + "time_per_iteration": 2.620338201522827 + }, + { + "auxiliary_loss_clip": 0.01113268, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01712155, + "balance_loss_mlp": 1.03815711, + "epoch": 0.5756801442958064, + "flos": 23910006700800.0, + "grad_norm": 2.450005891087765, + "language_loss": 0.66523874, + "learning_rate": 1.6093657671910968e-06, + "loss": 0.6866771, + "num_input_tokens_seen": 206211485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75, + "step": 9575, + "time_per_iteration": 2.4487204551696777 + }, + { + "auxiliary_loss_clip": 0.01106224, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02086747, + "balance_loss_mlp": 1.03883016, + "epoch": 0.5757402675484744, + "flos": 21105276099840.0, + "grad_norm": 1.5135571903226765, + "language_loss": 0.79637057, + "learning_rate": 1.6089838138955804e-06, + "loss": 0.81775701, + "num_input_tokens_seen": 206231740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 9576, + "time_per_iteration": 2.499525547027588 + }, + { + "auxiliary_loss_clip": 0.01106499, + "auxiliary_loss_mlp": 0.0102964, + "balance_loss_clip": 1.0181545, + "balance_loss_mlp": 1.038414, + "epoch": 0.5758003908011423, + "flos": 20559110826240.0, + "grad_norm": 1.624550594516776, + "language_loss": 0.69612324, + "learning_rate": 1.6086018754274372e-06, + "loss": 0.71748459, + "num_input_tokens_seen": 206250975, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 9577, + "time_per_iteration": 2.4342739582061768 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01032786, + "balance_loss_clip": 1.02076983, + "balance_loss_mlp": 1.03889465, + "epoch": 0.5758605140538103, + "flos": 16473015377280.0, + "grad_norm": 1.7262479676640925, + "language_loss": 0.66394711, + "learning_rate": 1.6082199518011504e-06, + "loss": 0.68538755, + "num_input_tokens_seen": 206268800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7265625, + "step": 9578, + "time_per_iteration": 2.452836513519287 + }, + { + "auxiliary_loss_clip": 0.01104785, + "auxiliary_loss_mlp": 0.01028747, + "balance_loss_clip": 1.01713598, + "balance_loss_mlp": 1.03683639, + "epoch": 0.5759206373064782, + "flos": 21287558643840.0, + "grad_norm": 1.5955641210398863, + "language_loss": 0.72130096, + "learning_rate": 1.6078380430312016e-06, + "loss": 0.74263626, + "num_input_tokens_seen": 206287190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 9579, + "time_per_iteration": 2.4709668159484863 + }, + { + "auxiliary_loss_clip": 0.01113888, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.01880956, + "balance_loss_mlp": 1.03966045, + "epoch": 0.5759807605591463, + "flos": 26067879227520.0, + "grad_norm": 2.099656741464949, + "language_loss": 0.64655066, + "learning_rate": 1.6074561491320742e-06, + "loss": 0.66800898, + "num_input_tokens_seen": 206307020, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9580, + "time_per_iteration": 2.5071680545806885 + }, + { + "auxiliary_loss_clip": 0.01108728, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02024293, + "balance_loss_mlp": 1.03776896, + "epoch": 0.5760408838118142, + "flos": 18873068376960.0, + "grad_norm": 1.9172914104456789, + "language_loss": 0.8563143, + "learning_rate": 1.6070742701182486e-06, + "loss": 0.87773246, + "num_input_tokens_seen": 206324095, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9581, + "time_per_iteration": 2.459761142730713 + }, + { + "auxiliary_loss_clip": 0.01117292, + "auxiliary_loss_mlp": 0.01040765, + "balance_loss_clip": 1.02792597, + "balance_loss_mlp": 1.04308629, + "epoch": 0.5761010070644822, + "flos": 15378134964480.0, + "grad_norm": 2.0860755056974627, + "language_loss": 0.67691463, + "learning_rate": 1.6066924060042057e-06, + "loss": 0.69849521, + "num_input_tokens_seen": 206343210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7421875, + "step": 9582, + "time_per_iteration": 2.461245536804199 + }, + { + "auxiliary_loss_clip": 0.01030428, + "auxiliary_loss_mlp": 0.01000716, + "balance_loss_clip": 0.99950552, + "balance_loss_mlp": 1.00893497, + "epoch": 0.5761611303171501, + "flos": 71471932882560.0, + "grad_norm": 0.6389163922736963, + "language_loss": 0.57233906, + "learning_rate": 1.6063105568044271e-06, + "loss": 0.59265041, + "num_input_tokens_seen": 206415935, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.21484375, + "step": 9583, + "time_per_iteration": 3.212454080581665 + }, + { + "auxiliary_loss_clip": 0.01108245, + "auxiliary_loss_mlp": 0.01029211, + "balance_loss_clip": 1.01740384, + "balance_loss_mlp": 1.0381434, + "epoch": 0.5762212535698181, + "flos": 16246167033600.0, + "grad_norm": 1.8641226876424317, + "language_loss": 0.82294947, + "learning_rate": 1.6059287225333912e-06, + "loss": 0.84432399, + "num_input_tokens_seen": 206431900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 9584, + "time_per_iteration": 2.445197582244873 + }, + { + "auxiliary_loss_clip": 0.0102928, + "auxiliary_loss_mlp": 0.01000964, + "balance_loss_clip": 0.99972469, + "balance_loss_mlp": 1.00788319, + "epoch": 0.5762813768224861, + "flos": 70185504216960.0, + "grad_norm": 0.6211358186522926, + "language_loss": 0.49536344, + "learning_rate": 1.6055469032055773e-06, + "loss": 0.51566589, + "num_input_tokens_seen": 206501200, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9585, + "time_per_iteration": 3.1135380268096924 + }, + { + "auxiliary_loss_clip": 0.01103387, + "auxiliary_loss_mlp": 0.01026782, + "balance_loss_clip": 1.01523662, + "balance_loss_mlp": 1.0356468, + "epoch": 0.5763415000751541, + "flos": 20518028645760.0, + "grad_norm": 2.0469276219055037, + "language_loss": 0.84745687, + "learning_rate": 1.605165098835465e-06, + "loss": 0.86875856, + "num_input_tokens_seen": 206520575, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9586, + "time_per_iteration": 2.4322049617767334 + }, + { + "auxiliary_loss_clip": 0.01107042, + "auxiliary_loss_mlp": 0.01033774, + "balance_loss_clip": 1.02099502, + "balance_loss_mlp": 1.0371176, + "epoch": 0.5764016233278221, + "flos": 15815526877440.0, + "grad_norm": 1.708349469848261, + "language_loss": 0.79935288, + "learning_rate": 1.6047833094375308e-06, + "loss": 0.82076108, + "num_input_tokens_seen": 206538060, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9587, + "time_per_iteration": 2.420388698577881 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01030504, + "balance_loss_clip": 1.01772523, + "balance_loss_mlp": 1.03791797, + "epoch": 0.57646174658049, + "flos": 20772312001920.0, + "grad_norm": 1.476870264659234, + "language_loss": 0.65978181, + "learning_rate": 1.6044015350262542e-06, + "loss": 0.68115664, + "num_input_tokens_seen": 206557320, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9588, + "time_per_iteration": 2.470181941986084 + }, + { + "auxiliary_loss_clip": 0.0110785, + "auxiliary_loss_mlp": 0.0103402, + "balance_loss_clip": 1.02095485, + "balance_loss_mlp": 1.03747165, + "epoch": 0.576521869833158, + "flos": 23549930812800.0, + "grad_norm": 1.7939970430826904, + "language_loss": 0.78344554, + "learning_rate": 1.6040197756161104e-06, + "loss": 0.80486423, + "num_input_tokens_seen": 206575780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 9589, + "time_per_iteration": 2.4622039794921875 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01023642, + "balance_loss_clip": 1.01255536, + "balance_loss_mlp": 1.03513849, + "epoch": 0.5765819930858259, + "flos": 20266582464000.0, + "grad_norm": 1.899286870644745, + "language_loss": 0.79484087, + "learning_rate": 1.6036380312215762e-06, + "loss": 0.81609809, + "num_input_tokens_seen": 206594100, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 9590, + "time_per_iteration": 2.4738223552703857 + }, + { + "auxiliary_loss_clip": 0.01107337, + "auxiliary_loss_mlp": 0.0102776, + "balance_loss_clip": 1.01693606, + "balance_loss_mlp": 1.03926742, + "epoch": 0.5766421163384939, + "flos": 23148772744320.0, + "grad_norm": 1.6468651932641252, + "language_loss": 0.63016611, + "learning_rate": 1.6032563018571283e-06, + "loss": 0.65151715, + "num_input_tokens_seen": 206613325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9591, + "time_per_iteration": 2.4630722999572754 + }, + { + "auxiliary_loss_clip": 0.0110984, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02349293, + "balance_loss_mlp": 1.03998208, + "epoch": 0.5767022395911618, + "flos": 25848895962240.0, + "grad_norm": 1.6611744555405081, + "language_loss": 0.77684325, + "learning_rate": 1.6028745875372406e-06, + "loss": 0.7982983, + "num_input_tokens_seen": 206634265, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9592, + "time_per_iteration": 2.4990251064300537 + }, + { + "auxiliary_loss_clip": 0.01029258, + "auxiliary_loss_mlp": 0.01002299, + "balance_loss_clip": 1.00114298, + "balance_loss_mlp": 1.00790858, + "epoch": 0.5767623628438299, + "flos": 68293299657600.0, + "grad_norm": 0.7302836874791289, + "language_loss": 0.59611464, + "learning_rate": 1.6024928882763885e-06, + "loss": 0.61643022, + "num_input_tokens_seen": 206696990, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21386719, + "step": 9593, + "time_per_iteration": 3.1885087490081787 + }, + { + "auxiliary_loss_clip": 0.01110729, + "auxiliary_loss_mlp": 0.01039747, + "balance_loss_clip": 1.0265801, + "balance_loss_mlp": 1.03883052, + "epoch": 0.5768224860964978, + "flos": 30188448754560.0, + "grad_norm": 2.3535875138052806, + "language_loss": 0.7131753, + "learning_rate": 1.6021112040890463e-06, + "loss": 0.73468006, + "num_input_tokens_seen": 206717815, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9594, + "time_per_iteration": 3.89677357673645 + }, + { + "auxiliary_loss_clip": 0.01107394, + "auxiliary_loss_mlp": 0.01032136, + "balance_loss_clip": 1.02087677, + "balance_loss_mlp": 1.03755784, + "epoch": 0.5768826093491658, + "flos": 17895041884800.0, + "grad_norm": 1.9084853230861274, + "language_loss": 0.71146429, + "learning_rate": 1.6017295349896863e-06, + "loss": 0.73285961, + "num_input_tokens_seen": 206735985, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.69921875, + "step": 9595, + "time_per_iteration": 2.438798666000366 + }, + { + "auxiliary_loss_clip": 0.01106901, + "auxiliary_loss_mlp": 0.01029125, + "balance_loss_clip": 1.01726389, + "balance_loss_mlp": 1.03756046, + "epoch": 0.5769427326018337, + "flos": 17457183095040.0, + "grad_norm": 2.7843520689138646, + "language_loss": 0.69750065, + "learning_rate": 1.6013478809927828e-06, + "loss": 0.71886092, + "num_input_tokens_seen": 206753370, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9596, + "time_per_iteration": 3.8589518070220947 + }, + { + "auxiliary_loss_clip": 0.01111865, + "auxiliary_loss_mlp": 0.01036248, + "balance_loss_clip": 1.02235997, + "balance_loss_mlp": 1.03845882, + "epoch": 0.5770028558545017, + "flos": 39421728345600.0, + "grad_norm": 2.3208716765708974, + "language_loss": 0.67437601, + "learning_rate": 1.6009662421128074e-06, + "loss": 0.69585705, + "num_input_tokens_seen": 206777645, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.734375, + "step": 9597, + "time_per_iteration": 4.025861501693726 + }, + { + "auxiliary_loss_clip": 0.01107063, + "auxiliary_loss_mlp": 0.01033781, + "balance_loss_clip": 1.02220011, + "balance_loss_mlp": 1.03775668, + "epoch": 0.5770629791071697, + "flos": 21536383132800.0, + "grad_norm": 2.263151487781109, + "language_loss": 0.81492549, + "learning_rate": 1.6005846183642323e-06, + "loss": 0.83633393, + "num_input_tokens_seen": 206794865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.69140625, + "step": 9598, + "time_per_iteration": 2.4457364082336426 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01787877, + "balance_loss_mlp": 1.03758776, + "epoch": 0.5771231023598377, + "flos": 20886795624960.0, + "grad_norm": 1.482456402920166, + "language_loss": 0.72767603, + "learning_rate": 1.6002030097615277e-06, + "loss": 0.74906087, + "num_input_tokens_seen": 206814095, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9599, + "time_per_iteration": 2.440633773803711 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.01029145, + "balance_loss_clip": 1.0178082, + "balance_loss_mlp": 1.03569376, + "epoch": 0.5771832256125057, + "flos": 18077216688000.0, + "grad_norm": 1.8193310631715605, + "language_loss": 0.77990794, + "learning_rate": 1.5998214163191663e-06, + "loss": 0.80122316, + "num_input_tokens_seen": 206832245, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 9600, + "time_per_iteration": 2.4627256393432617 + }, + { + "auxiliary_loss_clip": 0.01108817, + "auxiliary_loss_mlp": 0.01033376, + "balance_loss_clip": 1.0210135, + "balance_loss_mlp": 1.03849137, + "epoch": 0.5772433488651736, + "flos": 26359078786560.0, + "grad_norm": 1.5552976085447456, + "language_loss": 0.72505343, + "learning_rate": 1.5994398380516163e-06, + "loss": 0.74647534, + "num_input_tokens_seen": 206851535, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9601, + "time_per_iteration": 2.5040857791900635 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01033902, + "balance_loss_clip": 1.02177262, + "balance_loss_mlp": 1.03861833, + "epoch": 0.5773034721178416, + "flos": 19680987035520.0, + "grad_norm": 1.6061208919603027, + "language_loss": 0.68449026, + "learning_rate": 1.599058274973348e-06, + "loss": 0.7058996, + "num_input_tokens_seen": 206870595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 9602, + "time_per_iteration": 2.4730873107910156 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03666043, + "epoch": 0.5773635953705095, + "flos": 25082885496960.0, + "grad_norm": 1.4427131087039327, + "language_loss": 0.72969544, + "learning_rate": 1.5986767270988297e-06, + "loss": 0.75102556, + "num_input_tokens_seen": 206892320, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9603, + "time_per_iteration": 2.4821383953094482 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01787269, + "balance_loss_mlp": 1.03815305, + "epoch": 0.5774237186231775, + "flos": 21032987978880.0, + "grad_norm": 1.760798848795816, + "language_loss": 0.76811421, + "learning_rate": 1.5982951944425298e-06, + "loss": 0.78947246, + "num_input_tokens_seen": 206912485, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9604, + "time_per_iteration": 2.4963274002075195 + }, + { + "auxiliary_loss_clip": 0.01108714, + "auxiliary_loss_mlp": 0.01033384, + "balance_loss_clip": 1.01986611, + "balance_loss_mlp": 1.03805828, + "epoch": 0.5774838418758454, + "flos": 15231727128960.0, + "grad_norm": 1.8255502953236893, + "language_loss": 0.83589303, + "learning_rate": 1.5979136770189174e-06, + "loss": 0.85731399, + "num_input_tokens_seen": 206929100, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9605, + "time_per_iteration": 2.420722484588623 + }, + { + "auxiliary_loss_clip": 0.01115788, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.01826096, + "balance_loss_mlp": 1.041394, + "epoch": 0.5775439651285135, + "flos": 23582609210880.0, + "grad_norm": 1.6448412923605056, + "language_loss": 0.78043878, + "learning_rate": 1.5975321748424581e-06, + "loss": 0.80192173, + "num_input_tokens_seen": 206947020, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.74609375, + "step": 9606, + "time_per_iteration": 2.47755765914917 + }, + { + "auxiliary_loss_clip": 0.01105815, + "auxiliary_loss_mlp": 0.01033593, + "balance_loss_clip": 1.02192283, + "balance_loss_mlp": 1.03780627, + "epoch": 0.5776040883811814, + "flos": 18040515966720.0, + "grad_norm": 1.6466821062116115, + "language_loss": 0.74067813, + "learning_rate": 1.597150687927619e-06, + "loss": 0.76207221, + "num_input_tokens_seen": 206964065, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9607, + "time_per_iteration": 2.473158597946167 + }, + { + "auxiliary_loss_clip": 0.01107935, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01862538, + "balance_loss_mlp": 1.03809416, + "epoch": 0.5776642116338494, + "flos": 18624638937600.0, + "grad_norm": 1.6703318324983303, + "language_loss": 0.69666326, + "learning_rate": 1.5967692162888664e-06, + "loss": 0.71804941, + "num_input_tokens_seen": 206981940, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9608, + "time_per_iteration": 2.457597255706787 + }, + { + "auxiliary_loss_clip": 0.01109603, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01841307, + "balance_loss_mlp": 1.03859639, + "epoch": 0.5777243348865173, + "flos": 28402539517440.0, + "grad_norm": 1.7239529426914375, + "language_loss": 0.76340568, + "learning_rate": 1.596387759940665e-06, + "loss": 0.78481352, + "num_input_tokens_seen": 207002365, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9609, + "time_per_iteration": 2.478379964828491 + }, + { + "auxiliary_loss_clip": 0.01106636, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01672637, + "balance_loss_mlp": 1.03600001, + "epoch": 0.5777844581391853, + "flos": 24024705805440.0, + "grad_norm": 1.8185868001057917, + "language_loss": 0.77262604, + "learning_rate": 1.5960063188974808e-06, + "loss": 0.79397655, + "num_input_tokens_seen": 207021195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.70703125, + "step": 9610, + "time_per_iteration": 2.4817564487457275 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01526141, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5778445813918534, + "flos": 17777361951360.0, + "grad_norm": 2.0354514470011327, + "language_loss": 0.68514067, + "learning_rate": 1.5956248931737777e-06, + "loss": 0.70650387, + "num_input_tokens_seen": 207037465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 9611, + "time_per_iteration": 2.401411771774292 + }, + { + "auxiliary_loss_clip": 0.01104847, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01431727, + "balance_loss_mlp": 1.03594267, + "epoch": 0.5779047046445213, + "flos": 22233194046720.0, + "grad_norm": 1.8201815228945446, + "language_loss": 0.82796168, + "learning_rate": 1.5952434827840185e-06, + "loss": 0.84927702, + "num_input_tokens_seen": 207054230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 9612, + "time_per_iteration": 2.4473085403442383 + }, + { + "auxiliary_loss_clip": 0.0110712, + "auxiliary_loss_mlp": 0.01031451, + "balance_loss_clip": 1.01915455, + "balance_loss_mlp": 1.0376699, + "epoch": 0.5779648278971893, + "flos": 21434361528960.0, + "grad_norm": 1.6350469107350603, + "language_loss": 0.79244345, + "learning_rate": 1.594862087742667e-06, + "loss": 0.81382918, + "num_input_tokens_seen": 207073150, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9613, + "time_per_iteration": 2.427710771560669 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01034281, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03584552, + "epoch": 0.5780249511498572, + "flos": 19026120228480.0, + "grad_norm": 1.8237036529741348, + "language_loss": 0.77103758, + "learning_rate": 1.5944807080641863e-06, + "loss": 0.79242271, + "num_input_tokens_seen": 207090375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 9614, + "time_per_iteration": 2.44856595993042 + }, + { + "auxiliary_loss_clip": 0.01108473, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01954651, + "balance_loss_mlp": 1.03704453, + "epoch": 0.5780850744025252, + "flos": 12124663752960.0, + "grad_norm": 2.4290592896418093, + "language_loss": 0.8083241, + "learning_rate": 1.5940993437630375e-06, + "loss": 0.829723, + "num_input_tokens_seen": 207106030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.71484375, + "step": 9615, + "time_per_iteration": 2.387230396270752 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01032204, + "balance_loss_clip": 1.01979423, + "balance_loss_mlp": 1.0372864, + "epoch": 0.5781451976551931, + "flos": 25044425009280.0, + "grad_norm": 1.467111790124014, + "language_loss": 0.67172909, + "learning_rate": 1.5937179948536825e-06, + "loss": 0.69313097, + "num_input_tokens_seen": 207125435, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9616, + "time_per_iteration": 2.5091681480407715 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01031765, + "balance_loss_clip": 1.01983774, + "balance_loss_mlp": 1.03701568, + "epoch": 0.5782053209078611, + "flos": 19245606284160.0, + "grad_norm": 1.7373937933185963, + "language_loss": 0.77820861, + "learning_rate": 1.5933366613505812e-06, + "loss": 0.79956603, + "num_input_tokens_seen": 207145095, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9617, + "time_per_iteration": 2.434692144393921 + }, + { + "auxiliary_loss_clip": 0.01105528, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01911426, + "balance_loss_mlp": 1.03798401, + "epoch": 0.578265444160529, + "flos": 25993831340160.0, + "grad_norm": 1.4913926039582375, + "language_loss": 0.75064909, + "learning_rate": 1.5929553432681947e-06, + "loss": 0.77201837, + "num_input_tokens_seen": 207166045, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9618, + "time_per_iteration": 2.5143377780914307 + }, + { + "auxiliary_loss_clip": 0.01103572, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01855421, + "balance_loss_mlp": 1.03614712, + "epoch": 0.5783255674131971, + "flos": 21798603394560.0, + "grad_norm": 1.5244275331123438, + "language_loss": 0.81895173, + "learning_rate": 1.5925740406209826e-06, + "loss": 0.84028757, + "num_input_tokens_seen": 207185290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 9619, + "time_per_iteration": 2.436741828918457 + }, + { + "auxiliary_loss_clip": 0.01106581, + "auxiliary_loss_mlp": 0.01035226, + "balance_loss_clip": 1.02319741, + "balance_loss_mlp": 1.03689742, + "epoch": 0.578385690665865, + "flos": 24789746603520.0, + "grad_norm": 2.8855702259785874, + "language_loss": 0.7266885, + "learning_rate": 1.5921927534234039e-06, + "loss": 0.7481066, + "num_input_tokens_seen": 207205505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9620, + "time_per_iteration": 2.508638858795166 + }, + { + "auxiliary_loss_clip": 0.01106937, + "auxiliary_loss_mlp": 0.01031542, + "balance_loss_clip": 1.01994872, + "balance_loss_mlp": 1.0379591, + "epoch": 0.578445813918533, + "flos": 21212864311680.0, + "grad_norm": 1.4901469929607327, + "language_loss": 0.77143538, + "learning_rate": 1.591811481689916e-06, + "loss": 0.79282016, + "num_input_tokens_seen": 207225315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9621, + "time_per_iteration": 2.4620673656463623 + }, + { + "auxiliary_loss_clip": 0.01106096, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01862931, + "balance_loss_mlp": 1.03550279, + "epoch": 0.5785059371712009, + "flos": 25046795306880.0, + "grad_norm": 1.5105026325174375, + "language_loss": 0.70597667, + "learning_rate": 1.5914302254349787e-06, + "loss": 0.72735131, + "num_input_tokens_seen": 207247690, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9622, + "time_per_iteration": 2.509505033493042 + }, + { + "auxiliary_loss_clip": 0.01028849, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.00028539, + "balance_loss_mlp": 1.007653, + "epoch": 0.5785660604238689, + "flos": 70843172284800.0, + "grad_norm": 0.7726155153830789, + "language_loss": 0.55941814, + "learning_rate": 1.5910489846730476e-06, + "loss": 0.57972187, + "num_input_tokens_seen": 207301735, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21191406, + "step": 9623, + "time_per_iteration": 3.0823814868927 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01036509, + "balance_loss_clip": 1.02344918, + "balance_loss_mlp": 1.03692317, + "epoch": 0.578626183676537, + "flos": 31649977244160.0, + "grad_norm": 2.2221143081246373, + "language_loss": 0.71056175, + "learning_rate": 1.5906677594185799e-06, + "loss": 0.73201978, + "num_input_tokens_seen": 207321240, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 9624, + "time_per_iteration": 2.5265705585479736 + }, + { + "auxiliary_loss_clip": 0.01107503, + "auxiliary_loss_mlp": 0.01037993, + "balance_loss_clip": 1.02552414, + "balance_loss_mlp": 1.03862953, + "epoch": 0.5786863069292049, + "flos": 21865181253120.0, + "grad_norm": 2.222167937534436, + "language_loss": 0.82642812, + "learning_rate": 1.5902865496860322e-06, + "loss": 0.84788311, + "num_input_tokens_seen": 207339540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9625, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.02198672, + "balance_loss_mlp": 1.03701115, + "epoch": 0.5787464301818729, + "flos": 23364954748800.0, + "grad_norm": 1.455235974234194, + "language_loss": 0.69956779, + "learning_rate": 1.5899053554898591e-06, + "loss": 0.72096288, + "num_input_tokens_seen": 207360470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9626, + "time_per_iteration": 2.4975287914276123 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01036492, + "balance_loss_clip": 1.02480352, + "balance_loss_mlp": 1.03568482, + "epoch": 0.5788065534345408, + "flos": 30004011394560.0, + "grad_norm": 1.93553238886208, + "language_loss": 0.71862161, + "learning_rate": 1.5895241768445166e-06, + "loss": 0.7400226, + "num_input_tokens_seen": 207383080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9627, + "time_per_iteration": 2.5138702392578125 + }, + { + "auxiliary_loss_clip": 0.01104177, + "auxiliary_loss_mlp": 0.01028958, + "balance_loss_clip": 1.01737726, + "balance_loss_mlp": 1.03599048, + "epoch": 0.5788666766872088, + "flos": 24527849564160.0, + "grad_norm": 1.727007676436273, + "language_loss": 0.8414377, + "learning_rate": 1.589143013764458e-06, + "loss": 0.86276901, + "num_input_tokens_seen": 207401000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 9628, + "time_per_iteration": 2.4851796627044678 + }, + { + "auxiliary_loss_clip": 0.01103695, + "auxiliary_loss_mlp": 0.01025516, + "balance_loss_clip": 1.01394033, + "balance_loss_mlp": 1.03516388, + "epoch": 0.5789267999398767, + "flos": 23732823888000.0, + "grad_norm": 1.6873428245402236, + "language_loss": 0.71942705, + "learning_rate": 1.5887618662641376e-06, + "loss": 0.74071914, + "num_input_tokens_seen": 207419230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9629, + "time_per_iteration": 2.520517587661743 + }, + { + "auxiliary_loss_clip": 0.01108734, + "auxiliary_loss_mlp": 0.01034367, + "balance_loss_clip": 1.02181387, + "balance_loss_mlp": 1.03963637, + "epoch": 0.5789869231925447, + "flos": 21135045496320.0, + "grad_norm": 1.9628574132847711, + "language_loss": 0.74576336, + "learning_rate": 1.5883807343580087e-06, + "loss": 0.76719439, + "num_input_tokens_seen": 207437615, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9630, + "time_per_iteration": 2.454810380935669 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01028168, + "balance_loss_clip": 1.01682508, + "balance_loss_mlp": 1.03553247, + "epoch": 0.5790470464452127, + "flos": 21209632087680.0, + "grad_norm": 1.6371763310429226, + "language_loss": 0.79325604, + "learning_rate": 1.587999618060523e-06, + "loss": 0.814556, + "num_input_tokens_seen": 207457270, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 9631, + "time_per_iteration": 2.440864324569702 + }, + { + "auxiliary_loss_clip": 0.01104911, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01800966, + "balance_loss_mlp": 1.03596497, + "epoch": 0.5791071696978807, + "flos": 23404384903680.0, + "grad_norm": 1.6037309933130668, + "language_loss": 0.75137591, + "learning_rate": 1.5876185173861333e-06, + "loss": 0.77272546, + "num_input_tokens_seen": 207477890, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9632, + "time_per_iteration": 2.4771668910980225 + }, + { + "auxiliary_loss_clip": 0.01106006, + "auxiliary_loss_mlp": 0.01027741, + "balance_loss_clip": 1.01517081, + "balance_loss_mlp": 1.03731871, + "epoch": 0.5791672929505486, + "flos": 24206521472640.0, + "grad_norm": 2.4626986888140716, + "language_loss": 0.79077435, + "learning_rate": 1.5872374323492915e-06, + "loss": 0.81211185, + "num_input_tokens_seen": 207497670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 9633, + "time_per_iteration": 2.448436737060547 + }, + { + "auxiliary_loss_clip": 0.01115908, + "auxiliary_loss_mlp": 0.0103724, + "balance_loss_clip": 1.02447283, + "balance_loss_mlp": 1.04036343, + "epoch": 0.5792274162032166, + "flos": 24348871071360.0, + "grad_norm": 1.7086543878642706, + "language_loss": 0.77430606, + "learning_rate": 1.5868563629644464e-06, + "loss": 0.79583752, + "num_input_tokens_seen": 207516105, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.75390625, + "step": 9634, + "time_per_iteration": 2.4811017513275146 + }, + { + "auxiliary_loss_clip": 0.01108474, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02580237, + "balance_loss_mlp": 1.03722477, + "epoch": 0.5792875394558845, + "flos": 20449403712000.0, + "grad_norm": 2.1301414361920843, + "language_loss": 0.63183784, + "learning_rate": 1.5864753092460502e-06, + "loss": 0.65330267, + "num_input_tokens_seen": 207533685, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9635, + "time_per_iteration": 3.8360743522644043 + }, + { + "auxiliary_loss_clip": 0.01105747, + "auxiliary_loss_mlp": 0.01036, + "balance_loss_clip": 1.02431154, + "balance_loss_mlp": 1.03854156, + "epoch": 0.5793476627085525, + "flos": 24060329118720.0, + "grad_norm": 1.5921207664968484, + "language_loss": 0.76923883, + "learning_rate": 1.5860942712085516e-06, + "loss": 0.79065627, + "num_input_tokens_seen": 207552840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 9636, + "time_per_iteration": 2.4524970054626465 + }, + { + "auxiliary_loss_clip": 0.01101976, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.020854, + "balance_loss_mlp": 1.03643167, + "epoch": 0.5794077859612206, + "flos": 22054287381120.0, + "grad_norm": 1.6428369167222547, + "language_loss": 0.68367255, + "learning_rate": 1.5857132488663998e-06, + "loss": 0.70501596, + "num_input_tokens_seen": 207572095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 9637, + "time_per_iteration": 3.9001073837280273 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.0181725, + "balance_loss_mlp": 1.03622174, + "epoch": 0.5794679092138885, + "flos": 11434855991040.0, + "grad_norm": 2.3860817889930326, + "language_loss": 0.72291076, + "learning_rate": 1.585332242234043e-06, + "loss": 0.74429405, + "num_input_tokens_seen": 207587495, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71484375, + "step": 9638, + "time_per_iteration": 3.8099658489227295 + }, + { + "auxiliary_loss_clip": 0.01106068, + "auxiliary_loss_mlp": 0.01031021, + "balance_loss_clip": 1.01981568, + "balance_loss_mlp": 1.03809261, + "epoch": 0.5795280324665565, + "flos": 18880215183360.0, + "grad_norm": 2.0300843650533387, + "language_loss": 0.72111142, + "learning_rate": 1.5849512513259291e-06, + "loss": 0.7424823, + "num_input_tokens_seen": 207606795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 9639, + "time_per_iteration": 3.9071426391601562 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01034902, + "balance_loss_clip": 1.02291572, + "balance_loss_mlp": 1.03860509, + "epoch": 0.5795881557192244, + "flos": 13005947940480.0, + "grad_norm": 2.0103274032155163, + "language_loss": 0.69715077, + "learning_rate": 1.5845702761565054e-06, + "loss": 0.71857667, + "num_input_tokens_seen": 207623620, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9640, + "time_per_iteration": 2.433104991912842 + }, + { + "auxiliary_loss_clip": 0.01113005, + "auxiliary_loss_mlp": 0.01038437, + "balance_loss_clip": 1.02583635, + "balance_loss_mlp": 1.03887677, + "epoch": 0.5796482789718924, + "flos": 19932397303680.0, + "grad_norm": 2.7872404958031884, + "language_loss": 0.77623034, + "learning_rate": 1.5841893167402183e-06, + "loss": 0.79774475, + "num_input_tokens_seen": 207639380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7421875, + "step": 9641, + "time_per_iteration": 2.4301722049713135 + }, + { + "auxiliary_loss_clip": 0.01106272, + "auxiliary_loss_mlp": 0.01030792, + "balance_loss_clip": 1.01930058, + "balance_loss_mlp": 1.0378499, + "epoch": 0.5797084022245603, + "flos": 21650794928640.0, + "grad_norm": 1.8500908876117999, + "language_loss": 0.73673463, + "learning_rate": 1.5838083730915143e-06, + "loss": 0.75810528, + "num_input_tokens_seen": 207657915, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 9642, + "time_per_iteration": 2.49660325050354 + }, + { + "auxiliary_loss_clip": 0.01104964, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.01718903, + "balance_loss_mlp": 1.03625488, + "epoch": 0.5797685254772283, + "flos": 26031573555840.0, + "grad_norm": 1.696347443177098, + "language_loss": 0.73574042, + "learning_rate": 1.5834274452248378e-06, + "loss": 0.75708383, + "num_input_tokens_seen": 207678620, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9643, + "time_per_iteration": 2.485637903213501 + }, + { + "auxiliary_loss_clip": 0.01108659, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.03768921, + "epoch": 0.5798286487298963, + "flos": 22705167778560.0, + "grad_norm": 1.9990943096580656, + "language_loss": 0.67527819, + "learning_rate": 1.5830465331546352e-06, + "loss": 0.69667518, + "num_input_tokens_seen": 207696980, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9644, + "time_per_iteration": 2.487901449203491 + }, + { + "auxiliary_loss_clip": 0.01112826, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01664853, + "balance_loss_mlp": 1.03988528, + "epoch": 0.5798887719825643, + "flos": 23148988225920.0, + "grad_norm": 2.232135453826953, + "language_loss": 0.85353506, + "learning_rate": 1.5826656368953496e-06, + "loss": 0.87495703, + "num_input_tokens_seen": 207714065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7265625, + "step": 9645, + "time_per_iteration": 2.4591071605682373 + }, + { + "auxiliary_loss_clip": 0.01108849, + "auxiliary_loss_mlp": 0.01029438, + "balance_loss_clip": 1.01782739, + "balance_loss_mlp": 1.03902066, + "epoch": 0.5799488952352322, + "flos": 24426043441920.0, + "grad_norm": 1.87513340954769, + "language_loss": 0.7528075, + "learning_rate": 1.5822847564614244e-06, + "loss": 0.77419043, + "num_input_tokens_seen": 207734720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9646, + "time_per_iteration": 2.5096170902252197 + }, + { + "auxiliary_loss_clip": 0.01111341, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02068663, + "balance_loss_mlp": 1.03949249, + "epoch": 0.5800090184879002, + "flos": 38395903829760.0, + "grad_norm": 1.666102030467492, + "language_loss": 0.5938943, + "learning_rate": 1.5819038918673038e-06, + "loss": 0.61534685, + "num_input_tokens_seen": 207755435, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 9647, + "time_per_iteration": 2.5928401947021484 + }, + { + "auxiliary_loss_clip": 0.01109539, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.0388217, + "epoch": 0.5800691417405681, + "flos": 19784840232960.0, + "grad_norm": 1.5329184941218248, + "language_loss": 0.84261942, + "learning_rate": 1.5815230431274288e-06, + "loss": 0.86405849, + "num_input_tokens_seen": 207773570, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9648, + "time_per_iteration": 2.460245132446289 + }, + { + "auxiliary_loss_clip": 0.01027507, + "auxiliary_loss_mlp": 0.00998956, + "balance_loss_clip": 0.99765694, + "balance_loss_mlp": 1.00610447, + "epoch": 0.5801292649932361, + "flos": 70314565783680.0, + "grad_norm": 0.8404119708733213, + "language_loss": 0.62959844, + "learning_rate": 1.581142210256242e-06, + "loss": 0.64986312, + "num_input_tokens_seen": 207830095, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9649, + "time_per_iteration": 3.1300153732299805 + }, + { + "auxiliary_loss_clip": 0.01103333, + "auxiliary_loss_mlp": 0.01031569, + "balance_loss_clip": 1.02015436, + "balance_loss_mlp": 1.03649998, + "epoch": 0.5801893882459042, + "flos": 18734812928640.0, + "grad_norm": 2.3310983541006434, + "language_loss": 0.82039601, + "learning_rate": 1.5807613932681857e-06, + "loss": 0.84174502, + "num_input_tokens_seen": 207848555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 9650, + "time_per_iteration": 2.4216153621673584 + }, + { + "auxiliary_loss_clip": 0.0111056, + "auxiliary_loss_mlp": 0.01032759, + "balance_loss_clip": 1.02018833, + "balance_loss_mlp": 1.0376749, + "epoch": 0.5802495114985721, + "flos": 15596507698560.0, + "grad_norm": 2.3176650701334442, + "language_loss": 0.77372313, + "learning_rate": 1.580380592177698e-06, + "loss": 0.79515636, + "num_input_tokens_seen": 207867060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9651, + "time_per_iteration": 2.4731314182281494 + }, + { + "auxiliary_loss_clip": 0.01110796, + "auxiliary_loss_mlp": 0.01036307, + "balance_loss_clip": 1.02309239, + "balance_loss_mlp": 1.03978133, + "epoch": 0.5803096347512401, + "flos": 18255405081600.0, + "grad_norm": 2.0034024707617575, + "language_loss": 0.74143803, + "learning_rate": 1.5799998069992213e-06, + "loss": 0.76290905, + "num_input_tokens_seen": 207884520, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 9652, + "time_per_iteration": 2.426095485687256 + }, + { + "auxiliary_loss_clip": 0.01106661, + "auxiliary_loss_mlp": 0.01031597, + "balance_loss_clip": 1.01887703, + "balance_loss_mlp": 1.03536129, + "epoch": 0.580369758003908, + "flos": 22893160584960.0, + "grad_norm": 1.9100146686462136, + "language_loss": 0.76669693, + "learning_rate": 1.579619037747193e-06, + "loss": 0.78807956, + "num_input_tokens_seen": 207905370, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9653, + "time_per_iteration": 2.479843854904175 + }, + { + "auxiliary_loss_clip": 0.01107407, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01702428, + "balance_loss_mlp": 1.03746295, + "epoch": 0.580429881256576, + "flos": 18697681244160.0, + "grad_norm": 2.3557465918911578, + "language_loss": 0.74466497, + "learning_rate": 1.5792382844360534e-06, + "loss": 0.76604843, + "num_input_tokens_seen": 207923790, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.69921875, + "step": 9654, + "time_per_iteration": 2.4389872550964355 + }, + { + "auxiliary_loss_clip": 0.01105384, + "auxiliary_loss_mlp": 0.01033574, + "balance_loss_clip": 1.02185535, + "balance_loss_mlp": 1.0386194, + "epoch": 0.5804900045092439, + "flos": 24681978823680.0, + "grad_norm": 1.67229579578488, + "language_loss": 0.70335853, + "learning_rate": 1.5788575470802408e-06, + "loss": 0.72474813, + "num_input_tokens_seen": 207942335, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9655, + "time_per_iteration": 2.4667346477508545 + }, + { + "auxiliary_loss_clip": 0.01112207, + "auxiliary_loss_mlp": 0.01038295, + "balance_loss_clip": 1.025087, + "balance_loss_mlp": 1.03787553, + "epoch": 0.580550127761912, + "flos": 23112790295040.0, + "grad_norm": 3.1924669760277666, + "language_loss": 0.69441068, + "learning_rate": 1.5784768256941915e-06, + "loss": 0.71591568, + "num_input_tokens_seen": 207961975, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9656, + "time_per_iteration": 2.47267746925354 + }, + { + "auxiliary_loss_clip": 0.01104052, + "auxiliary_loss_mlp": 0.01031775, + "balance_loss_clip": 1.02040219, + "balance_loss_mlp": 1.0376507, + "epoch": 0.5806102510145799, + "flos": 18475681236480.0, + "grad_norm": 1.8802574367017126, + "language_loss": 0.71315479, + "learning_rate": 1.5780961202923433e-06, + "loss": 0.73451304, + "num_input_tokens_seen": 207979520, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 9657, + "time_per_iteration": 2.411862850189209 + }, + { + "auxiliary_loss_clip": 0.01110384, + "auxiliary_loss_mlp": 0.01033397, + "balance_loss_clip": 1.02014136, + "balance_loss_mlp": 1.03748548, + "epoch": 0.5806703742672479, + "flos": 23915645136000.0, + "grad_norm": 2.139189937245848, + "language_loss": 0.70763719, + "learning_rate": 1.5777154308891328e-06, + "loss": 0.72907501, + "num_input_tokens_seen": 207998375, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9658, + "time_per_iteration": 2.4618098735809326 + }, + { + "auxiliary_loss_clip": 0.01029117, + "auxiliary_loss_mlp": 0.01007613, + "balance_loss_clip": 1.00649261, + "balance_loss_mlp": 1.00762427, + "epoch": 0.5807304975199158, + "flos": 66311999412480.0, + "grad_norm": 0.6568503671216013, + "language_loss": 0.53557444, + "learning_rate": 1.5773347574989953e-06, + "loss": 0.5559417, + "num_input_tokens_seen": 208060605, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21484375, + "step": 9659, + "time_per_iteration": 3.081292152404785 + }, + { + "auxiliary_loss_clip": 0.01109597, + "auxiliary_loss_mlp": 0.01038179, + "balance_loss_clip": 1.02564979, + "balance_loss_mlp": 1.0386076, + "epoch": 0.5807906207725838, + "flos": 31722444933120.0, + "grad_norm": 2.325531986819307, + "language_loss": 0.62134814, + "learning_rate": 1.576954100136366e-06, + "loss": 0.6428259, + "num_input_tokens_seen": 208080320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 9660, + "time_per_iteration": 2.5101215839385986 + }, + { + "auxiliary_loss_clip": 0.01107552, + "auxiliary_loss_mlp": 0.01033971, + "balance_loss_clip": 1.02121592, + "balance_loss_mlp": 1.03510964, + "epoch": 0.5808507440252517, + "flos": 23801161512960.0, + "grad_norm": 1.644077336412447, + "language_loss": 0.65339613, + "learning_rate": 1.5765734588156797e-06, + "loss": 0.67481142, + "num_input_tokens_seen": 208099305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 9661, + "time_per_iteration": 2.495326042175293 + }, + { + "auxiliary_loss_clip": 0.01101624, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.01473665, + "balance_loss_mlp": 1.03630924, + "epoch": 0.5809108672779197, + "flos": 13698449222400.0, + "grad_norm": 1.4453410326473544, + "language_loss": 0.74667752, + "learning_rate": 1.5761928335513704e-06, + "loss": 0.76795, + "num_input_tokens_seen": 208116960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 9662, + "time_per_iteration": 2.4072024822235107 + }, + { + "auxiliary_loss_clip": 0.0102818, + "auxiliary_loss_mlp": 0.01003249, + "balance_loss_clip": 1.0020808, + "balance_loss_mlp": 1.00680053, + "epoch": 0.5809709905305876, + "flos": 69134866381440.0, + "grad_norm": 0.8844058515803096, + "language_loss": 0.58421201, + "learning_rate": 1.5758122243578709e-06, + "loss": 0.60452628, + "num_input_tokens_seen": 208182190, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 9663, + "time_per_iteration": 3.128176689147949 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.01032948, + "balance_loss_clip": 1.02058566, + "balance_loss_mlp": 1.03855336, + "epoch": 0.5810311137832557, + "flos": 19827538525440.0, + "grad_norm": 2.2307426037080558, + "language_loss": 0.82198572, + "learning_rate": 1.5754316312496152e-06, + "loss": 0.84339249, + "num_input_tokens_seen": 208197015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 9664, + "time_per_iteration": 2.4268438816070557 + }, + { + "auxiliary_loss_clip": 0.01106716, + "auxiliary_loss_mlp": 0.01024753, + "balance_loss_clip": 1.0119977, + "balance_loss_mlp": 1.03471017, + "epoch": 0.5810912370359237, + "flos": 29238503719680.0, + "grad_norm": 1.6499573770914204, + "language_loss": 0.81283242, + "learning_rate": 1.5750510542410337e-06, + "loss": 0.8341471, + "num_input_tokens_seen": 208215795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 9665, + "time_per_iteration": 2.539750337600708 + }, + { + "auxiliary_loss_clip": 0.01113083, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.0180558, + "balance_loss_mlp": 1.03968716, + "epoch": 0.5811513602885916, + "flos": 22785572373120.0, + "grad_norm": 1.6493862237198238, + "language_loss": 0.81106472, + "learning_rate": 1.5746704933465599e-06, + "loss": 0.83251882, + "num_input_tokens_seen": 208234655, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.734375, + "step": 9666, + "time_per_iteration": 2.4637341499328613 + }, + { + "auxiliary_loss_clip": 0.01105376, + "auxiliary_loss_mlp": 0.01032179, + "balance_loss_clip": 1.02047861, + "balance_loss_mlp": 1.03734851, + "epoch": 0.5812114835412596, + "flos": 18734346051840.0, + "grad_norm": 1.772076851837157, + "language_loss": 0.79902422, + "learning_rate": 1.5742899485806227e-06, + "loss": 0.82039976, + "num_input_tokens_seen": 208251300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9667, + "time_per_iteration": 2.4630167484283447 + }, + { + "auxiliary_loss_clip": 0.01113135, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01935887, + "balance_loss_mlp": 1.03786182, + "epoch": 0.5812716067939275, + "flos": 26431295080320.0, + "grad_norm": 1.5126376316707284, + "language_loss": 0.78524494, + "learning_rate": 1.573909419957653e-06, + "loss": 0.80671084, + "num_input_tokens_seen": 208272685, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.75, + "step": 9668, + "time_per_iteration": 2.4933431148529053 + }, + { + "auxiliary_loss_clip": 0.01109741, + "auxiliary_loss_mlp": 0.01031977, + "balance_loss_clip": 1.01976347, + "balance_loss_mlp": 1.03882718, + "epoch": 0.5813317300465956, + "flos": 43397865285120.0, + "grad_norm": 2.2917193824708395, + "language_loss": 0.6405921, + "learning_rate": 1.5735289074920819e-06, + "loss": 0.66200924, + "num_input_tokens_seen": 208294315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9669, + "time_per_iteration": 2.711413860321045 + }, + { + "auxiliary_loss_clip": 0.01109059, + "auxiliary_loss_mlp": 0.01034536, + "balance_loss_clip": 1.02185786, + "balance_loss_mlp": 1.03847837, + "epoch": 0.5813918532992635, + "flos": 24785472885120.0, + "grad_norm": 1.7201818199144705, + "language_loss": 0.73401237, + "learning_rate": 1.5731484111983363e-06, + "loss": 0.75544822, + "num_input_tokens_seen": 208315610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9670, + "time_per_iteration": 2.481351375579834 + }, + { + "auxiliary_loss_clip": 0.01108134, + "auxiliary_loss_mlp": 0.01035647, + "balance_loss_clip": 1.02327895, + "balance_loss_mlp": 1.03665125, + "epoch": 0.5814519765519315, + "flos": 22857357703680.0, + "grad_norm": 2.1547601144280693, + "language_loss": 0.79159272, + "learning_rate": 1.5727679310908464e-06, + "loss": 0.81303054, + "num_input_tokens_seen": 208334725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71484375, + "step": 9671, + "time_per_iteration": 2.481765031814575 + }, + { + "auxiliary_loss_clip": 0.01113516, + "auxiliary_loss_mlp": 0.01036309, + "balance_loss_clip": 1.02253985, + "balance_loss_mlp": 1.04052281, + "epoch": 0.5815120998045994, + "flos": 24060831909120.0, + "grad_norm": 1.8667318330129747, + "language_loss": 0.60387075, + "learning_rate": 1.5723874671840399e-06, + "loss": 0.62536901, + "num_input_tokens_seen": 208353825, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9672, + "time_per_iteration": 2.4585747718811035 + }, + { + "auxiliary_loss_clip": 0.01106042, + "auxiliary_loss_mlp": 0.01027644, + "balance_loss_clip": 1.01597953, + "balance_loss_mlp": 1.03804862, + "epoch": 0.5815722230572674, + "flos": 24279491952000.0, + "grad_norm": 1.9986212138203583, + "language_loss": 0.81078732, + "learning_rate": 1.572007019492342e-06, + "loss": 0.83212423, + "num_input_tokens_seen": 208374160, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 9673, + "time_per_iteration": 2.4950785636901855 + }, + { + "auxiliary_loss_clip": 0.01113708, + "auxiliary_loss_mlp": 0.01035096, + "balance_loss_clip": 1.02148843, + "balance_loss_mlp": 1.03956604, + "epoch": 0.5816323463099353, + "flos": 22200371994240.0, + "grad_norm": 1.7057299891387632, + "language_loss": 0.87750065, + "learning_rate": 1.5716265880301817e-06, + "loss": 0.89898866, + "num_input_tokens_seen": 208392105, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7421875, + "step": 9674, + "time_per_iteration": 2.440136432647705 + }, + { + "auxiliary_loss_clip": 0.01108901, + "auxiliary_loss_mlp": 0.01033088, + "balance_loss_clip": 1.02156651, + "balance_loss_mlp": 1.03789747, + "epoch": 0.5816924695626033, + "flos": 24134448833280.0, + "grad_norm": 1.5021502044615473, + "language_loss": 0.78512001, + "learning_rate": 1.571246172811984e-06, + "loss": 0.80653995, + "num_input_tokens_seen": 208411755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.7109375, + "step": 9675, + "time_per_iteration": 2.474719285964966 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.010293, + "balance_loss_clip": 1.0162822, + "balance_loss_mlp": 1.03912115, + "epoch": 0.5817525928152713, + "flos": 21324223451520.0, + "grad_norm": 2.1292944862371486, + "language_loss": 0.70189106, + "learning_rate": 1.5708657738521748e-06, + "loss": 0.72326887, + "num_input_tokens_seen": 208429995, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 9676, + "time_per_iteration": 2.435563325881958 + }, + { + "auxiliary_loss_clip": 0.01108816, + "auxiliary_loss_mlp": 0.01030513, + "balance_loss_clip": 1.01728702, + "balance_loss_mlp": 1.03810883, + "epoch": 0.5818127160679393, + "flos": 26934510666240.0, + "grad_norm": 2.2453262518267216, + "language_loss": 0.63408953, + "learning_rate": 1.5704853911651779e-06, + "loss": 0.65548283, + "num_input_tokens_seen": 208443655, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9677, + "time_per_iteration": 3.852684736251831 + }, + { + "auxiliary_loss_clip": 0.01029913, + "auxiliary_loss_mlp": 0.01005476, + "balance_loss_clip": 1.00418842, + "balance_loss_mlp": 1.00840485, + "epoch": 0.5818728393206073, + "flos": 63918626342400.0, + "grad_norm": 0.8082693819649737, + "language_loss": 0.54284507, + "learning_rate": 1.5701050247654182e-06, + "loss": 0.56319892, + "num_input_tokens_seen": 208498405, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21484375, + "step": 9678, + "time_per_iteration": 3.1727702617645264 + }, + { + "auxiliary_loss_clip": 0.01029364, + "auxiliary_loss_mlp": 0.01, + "balance_loss_clip": 0.99879593, + "balance_loss_mlp": 1.00781882, + "epoch": 0.5819329625732752, + "flos": 64954108638720.0, + "grad_norm": 0.7323225743115229, + "language_loss": 0.56212348, + "learning_rate": 1.569724674667319e-06, + "loss": 0.58241719, + "num_input_tokens_seen": 208559075, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.21484375, + "step": 9679, + "time_per_iteration": 4.407592296600342 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01719165, + "balance_loss_mlp": 1.03636777, + "epoch": 0.5819930858259432, + "flos": 21215270522880.0, + "grad_norm": 1.5677269140843855, + "language_loss": 0.65393043, + "learning_rate": 1.5693443408853032e-06, + "loss": 0.67527747, + "num_input_tokens_seen": 208577770, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6953125, + "step": 9680, + "time_per_iteration": 3.854875087738037 + }, + { + "auxiliary_loss_clip": 0.01106852, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01755846, + "balance_loss_mlp": 1.0371331, + "epoch": 0.5820532090786111, + "flos": 19458520151040.0, + "grad_norm": 1.7974099210270778, + "language_loss": 0.83398807, + "learning_rate": 1.5689640234337933e-06, + "loss": 0.85535139, + "num_input_tokens_seen": 208595110, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9681, + "time_per_iteration": 3.906952381134033 + }, + { + "auxiliary_loss_clip": 0.01107734, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01704884, + "balance_loss_mlp": 1.03765953, + "epoch": 0.5821133323312792, + "flos": 17712615686400.0, + "grad_norm": 1.7009206287297167, + "language_loss": 0.75691867, + "learning_rate": 1.5685837223272109e-06, + "loss": 0.77829218, + "num_input_tokens_seen": 208612080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9682, + "time_per_iteration": 2.4177029132843018 + }, + { + "auxiliary_loss_clip": 0.01109999, + "auxiliary_loss_mlp": 0.01029335, + "balance_loss_clip": 1.01696062, + "balance_loss_mlp": 1.03816795, + "epoch": 0.5821734555839471, + "flos": 24571804832640.0, + "grad_norm": 2.1225270667604, + "language_loss": 0.75228214, + "learning_rate": 1.568203437579977e-06, + "loss": 0.77367556, + "num_input_tokens_seen": 208630235, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 9683, + "time_per_iteration": 2.483633279800415 + }, + { + "auxiliary_loss_clip": 0.01110877, + "auxiliary_loss_mlp": 0.01029498, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03809631, + "epoch": 0.5822335788366151, + "flos": 22382259488640.0, + "grad_norm": 1.7411447986789845, + "language_loss": 0.74026191, + "learning_rate": 1.5678231692065116e-06, + "loss": 0.76166564, + "num_input_tokens_seen": 208647925, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 9684, + "time_per_iteration": 2.445389986038208 + }, + { + "auxiliary_loss_clip": 0.0111011, + "auxiliary_loss_mlp": 0.01036105, + "balance_loss_clip": 1.02327847, + "balance_loss_mlp": 1.03914332, + "epoch": 0.582293702089283, + "flos": 26722494639360.0, + "grad_norm": 2.480778861643935, + "language_loss": 0.77930081, + "learning_rate": 1.5674429172212348e-06, + "loss": 0.80076301, + "num_input_tokens_seen": 208666180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9685, + "time_per_iteration": 2.4822564125061035 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01037765, + "balance_loss_clip": 1.02525425, + "balance_loss_mlp": 1.0376507, + "epoch": 0.582353825341951, + "flos": 17348661129600.0, + "grad_norm": 1.6531366373498986, + "language_loss": 0.75214118, + "learning_rate": 1.5670626816385667e-06, + "loss": 0.77360046, + "num_input_tokens_seen": 208684240, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9686, + "time_per_iteration": 2.441162109375 + }, + { + "auxiliary_loss_clip": 0.0102947, + "auxiliary_loss_mlp": 0.01008506, + "balance_loss_clip": 1.00720644, + "balance_loss_mlp": 1.00800455, + "epoch": 0.5824139485946189, + "flos": 55473261534720.0, + "grad_norm": 0.8335448804232356, + "language_loss": 0.57427585, + "learning_rate": 1.5666824624729244e-06, + "loss": 0.59465551, + "num_input_tokens_seen": 208736090, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.21484375, + "step": 9687, + "time_per_iteration": 2.887495279312134 + }, + { + "auxiliary_loss_clip": 0.01106071, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01534319, + "balance_loss_mlp": 1.03597438, + "epoch": 0.582474071847287, + "flos": 20303031790080.0, + "grad_norm": 1.808127013520305, + "language_loss": 0.69851446, + "learning_rate": 1.566302259738727e-06, + "loss": 0.7198627, + "num_input_tokens_seen": 208754600, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 9688, + "time_per_iteration": 2.475397825241089 + }, + { + "auxiliary_loss_clip": 0.01108083, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.03770781, + "epoch": 0.5825341950999549, + "flos": 23878010661120.0, + "grad_norm": 2.8185672100752224, + "language_loss": 0.65197223, + "learning_rate": 1.5659220734503918e-06, + "loss": 0.67338014, + "num_input_tokens_seen": 208773140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9689, + "time_per_iteration": 2.461808204650879 + }, + { + "auxiliary_loss_clip": 0.01107982, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.0186801, + "balance_loss_mlp": 1.03977919, + "epoch": 0.5825943183526229, + "flos": 23113041690240.0, + "grad_norm": 1.5648827403998262, + "language_loss": 0.73213816, + "learning_rate": 1.5655419036223341e-06, + "loss": 0.75352979, + "num_input_tokens_seen": 208793410, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 9690, + "time_per_iteration": 2.459392786026001 + }, + { + "auxiliary_loss_clip": 0.01110714, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01889586, + "balance_loss_mlp": 1.03849721, + "epoch": 0.5826544416052909, + "flos": 22857429530880.0, + "grad_norm": 1.9110650477929338, + "language_loss": 0.76118016, + "learning_rate": 1.5651617502689717e-06, + "loss": 0.78261399, + "num_input_tokens_seen": 208811920, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9691, + "time_per_iteration": 2.454533338546753 + }, + { + "auxiliary_loss_clip": 0.01107915, + "auxiliary_loss_mlp": 0.01033477, + "balance_loss_clip": 1.02115119, + "balance_loss_mlp": 1.03619492, + "epoch": 0.5827145648579588, + "flos": 31501845555840.0, + "grad_norm": 1.7126808977143095, + "language_loss": 0.80746913, + "learning_rate": 1.5647816134047184e-06, + "loss": 0.82888305, + "num_input_tokens_seen": 208834720, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 9692, + "time_per_iteration": 2.501497268676758 + }, + { + "auxiliary_loss_clip": 0.01027994, + "auxiliary_loss_mlp": 0.01002254, + "balance_loss_clip": 1.00103843, + "balance_loss_mlp": 1.0067246, + "epoch": 0.5827746881106268, + "flos": 69811817074560.0, + "grad_norm": 0.7602984909294345, + "language_loss": 0.56910902, + "learning_rate": 1.5644014930439907e-06, + "loss": 0.5894115, + "num_input_tokens_seen": 208898415, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.21289062, + "step": 9693, + "time_per_iteration": 3.0237975120544434 + }, + { + "auxiliary_loss_clip": 0.01106474, + "auxiliary_loss_mlp": 0.01033695, + "balance_loss_clip": 1.02250707, + "balance_loss_mlp": 1.03660345, + "epoch": 0.5828348113632947, + "flos": 23112395245440.0, + "grad_norm": 2.266427213008104, + "language_loss": 0.79537672, + "learning_rate": 1.5640213892012025e-06, + "loss": 0.81677842, + "num_input_tokens_seen": 208919045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69921875, + "step": 9694, + "time_per_iteration": 2.4761908054351807 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01033564, + "balance_loss_clip": 1.02250743, + "balance_loss_mlp": 1.03815889, + "epoch": 0.5828949346159628, + "flos": 21873082245120.0, + "grad_norm": 1.3946621855299897, + "language_loss": 0.75905991, + "learning_rate": 1.5636413018907656e-06, + "loss": 0.7804361, + "num_input_tokens_seen": 208939375, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 9695, + "time_per_iteration": 2.4863994121551514 + }, + { + "auxiliary_loss_clip": 0.01028568, + "auxiliary_loss_mlp": 0.01000024, + "balance_loss_clip": 0.99865955, + "balance_loss_mlp": 1.00692177, + "epoch": 0.5829550578686307, + "flos": 65962553950080.0, + "grad_norm": 0.7688369043614423, + "language_loss": 0.54971713, + "learning_rate": 1.563261231127095e-06, + "loss": 0.57000303, + "num_input_tokens_seen": 209004760, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.21679688, + "step": 9696, + "time_per_iteration": 3.1397409439086914 + }, + { + "auxiliary_loss_clip": 0.01108342, + "auxiliary_loss_mlp": 0.0102802, + "balance_loss_clip": 1.01588464, + "balance_loss_mlp": 1.03907263, + "epoch": 0.5830151811212987, + "flos": 16289799079680.0, + "grad_norm": 2.461981122956424, + "language_loss": 0.7641257, + "learning_rate": 1.5628811769246021e-06, + "loss": 0.78548938, + "num_input_tokens_seen": 209022930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 9697, + "time_per_iteration": 2.4391984939575195 + }, + { + "auxiliary_loss_clip": 0.01110278, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.01940477, + "balance_loss_mlp": 1.03790259, + "epoch": 0.5830753043739666, + "flos": 24168851084160.0, + "grad_norm": 1.5880971870479619, + "language_loss": 0.77744102, + "learning_rate": 1.5625011392976991e-06, + "loss": 0.79886687, + "num_input_tokens_seen": 209043740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9698, + "time_per_iteration": 2.5576770305633545 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01037348, + "balance_loss_clip": 1.02412117, + "balance_loss_mlp": 1.03847361, + "epoch": 0.5831354276266346, + "flos": 27059050097280.0, + "grad_norm": 1.8122014087406897, + "language_loss": 0.83381891, + "learning_rate": 1.5621211182607966e-06, + "loss": 0.85527027, + "num_input_tokens_seen": 209068885, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 9699, + "time_per_iteration": 2.5637032985687256 + }, + { + "auxiliary_loss_clip": 0.01108462, + "auxiliary_loss_mlp": 0.01029094, + "balance_loss_clip": 1.01663673, + "balance_loss_mlp": 1.03769052, + "epoch": 0.5831955508793025, + "flos": 23623475909760.0, + "grad_norm": 2.315377539273772, + "language_loss": 0.66859722, + "learning_rate": 1.561741113828305e-06, + "loss": 0.68997276, + "num_input_tokens_seen": 209087340, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.70703125, + "step": 9700, + "time_per_iteration": 2.471012592315674 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.0199858, + "balance_loss_mlp": 1.03591251, + "epoch": 0.5832556741319705, + "flos": 24973250209920.0, + "grad_norm": 1.5256356872175616, + "language_loss": 0.713889, + "learning_rate": 1.5613611260146344e-06, + "loss": 0.73527479, + "num_input_tokens_seen": 209108840, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9701, + "time_per_iteration": 2.4697649478912354 + }, + { + "auxiliary_loss_clip": 0.01104917, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01984477, + "balance_loss_mlp": 1.03625238, + "epoch": 0.5833157973846385, + "flos": 23221563655680.0, + "grad_norm": 1.810379708827147, + "language_loss": 0.85387969, + "learning_rate": 1.5609811548341936e-06, + "loss": 0.87524706, + "num_input_tokens_seen": 209127985, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 9702, + "time_per_iteration": 2.481027841567993 + }, + { + "auxiliary_loss_clip": 0.01100783, + "auxiliary_loss_mlp": 0.0103365, + "balance_loss_clip": 1.02206278, + "balance_loss_mlp": 1.0346241, + "epoch": 0.5833759206373065, + "flos": 21977941023360.0, + "grad_norm": 1.4628982512923412, + "language_loss": 0.77776694, + "learning_rate": 1.560601200301392e-06, + "loss": 0.79911131, + "num_input_tokens_seen": 209146885, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9703, + "time_per_iteration": 2.435124397277832 + }, + { + "auxiliary_loss_clip": 0.01110145, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01736951, + "balance_loss_mlp": 1.03907001, + "epoch": 0.5834360438899745, + "flos": 21762405463680.0, + "grad_norm": 1.7159930715569567, + "language_loss": 0.71405482, + "learning_rate": 1.5602212624306366e-06, + "loss": 0.73546076, + "num_input_tokens_seen": 209166130, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9704, + "time_per_iteration": 2.4737584590911865 + }, + { + "auxiliary_loss_clip": 0.01107118, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02001369, + "balance_loss_mlp": 1.03844225, + "epoch": 0.5834961671426424, + "flos": 15992566035840.0, + "grad_norm": 2.155391395554278, + "language_loss": 0.814731, + "learning_rate": 1.559841341236335e-06, + "loss": 0.83611786, + "num_input_tokens_seen": 209183350, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9705, + "time_per_iteration": 2.456681966781616 + }, + { + "auxiliary_loss_clip": 0.01105829, + "auxiliary_loss_mlp": 0.01029323, + "balance_loss_clip": 1.01780725, + "balance_loss_mlp": 1.03706515, + "epoch": 0.5835562903953104, + "flos": 22818322598400.0, + "grad_norm": 2.7067870421451805, + "language_loss": 0.80659604, + "learning_rate": 1.5594614367328937e-06, + "loss": 0.82794762, + "num_input_tokens_seen": 209203945, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 9706, + "time_per_iteration": 2.497509717941284 + }, + { + "auxiliary_loss_clip": 0.01104424, + "auxiliary_loss_mlp": 0.01031781, + "balance_loss_clip": 1.01860809, + "balance_loss_mlp": 1.03667164, + "epoch": 0.5836164136479783, + "flos": 48468056624640.0, + "grad_norm": 2.0481497339382084, + "language_loss": 0.74599034, + "learning_rate": 1.5590815489347187e-06, + "loss": 0.7673524, + "num_input_tokens_seen": 209227080, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6796875, + "step": 9707, + "time_per_iteration": 2.6745028495788574 + }, + { + "auxiliary_loss_clip": 0.01103427, + "auxiliary_loss_mlp": 0.01030508, + "balance_loss_clip": 1.01876628, + "balance_loss_mlp": 1.03624749, + "epoch": 0.5836765369006464, + "flos": 26905998245760.0, + "grad_norm": 1.608372812838098, + "language_loss": 0.81249726, + "learning_rate": 1.5587016778562163e-06, + "loss": 0.83383656, + "num_input_tokens_seen": 209248170, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 9708, + "time_per_iteration": 2.492741584777832 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01027852, + "balance_loss_clip": 1.01569307, + "balance_loss_mlp": 1.03903604, + "epoch": 0.5837366601533143, + "flos": 20084048524800.0, + "grad_norm": 1.7521527331614153, + "language_loss": 0.78249604, + "learning_rate": 1.5583218235117896e-06, + "loss": 0.80383801, + "num_input_tokens_seen": 209267730, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 9709, + "time_per_iteration": 2.476956844329834 + }, + { + "auxiliary_loss_clip": 0.01027997, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00083506, + "balance_loss_mlp": 1.0065155, + "epoch": 0.5837967834059823, + "flos": 65363885971200.0, + "grad_norm": 0.7691792257321526, + "language_loss": 0.56582153, + "learning_rate": 1.557941985915844e-06, + "loss": 0.58612299, + "num_input_tokens_seen": 209332510, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.21484375, + "step": 9710, + "time_per_iteration": 3.0814101696014404 + }, + { + "auxiliary_loss_clip": 0.0110345, + "auxiliary_loss_mlp": 0.01032732, + "balance_loss_clip": 1.0211035, + "balance_loss_mlp": 1.03715682, + "epoch": 0.5838569066586502, + "flos": 25338641310720.0, + "grad_norm": 1.5515305439757483, + "language_loss": 0.65762496, + "learning_rate": 1.5575621650827833e-06, + "loss": 0.67898679, + "num_input_tokens_seen": 209353355, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 9711, + "time_per_iteration": 2.4872825145721436 + }, + { + "auxiliary_loss_clip": 0.01112071, + "auxiliary_loss_mlp": 0.01034684, + "balance_loss_clip": 1.02147532, + "balance_loss_mlp": 1.03822017, + "epoch": 0.5839170299113182, + "flos": 22229243550720.0, + "grad_norm": 1.6429842517443687, + "language_loss": 0.78599298, + "learning_rate": 1.5571823610270085e-06, + "loss": 0.80746061, + "num_input_tokens_seen": 209370960, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73828125, + "step": 9712, + "time_per_iteration": 2.442077398300171 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01025498, + "balance_loss_clip": 1.01343966, + "balance_loss_mlp": 1.03646183, + "epoch": 0.5839771531639861, + "flos": 22200012858240.0, + "grad_norm": 1.7240347174541215, + "language_loss": 0.73268932, + "learning_rate": 1.5568025737629234e-06, + "loss": 0.7539975, + "num_input_tokens_seen": 209390955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 9713, + "time_per_iteration": 2.459120750427246 + }, + { + "auxiliary_loss_clip": 0.01110691, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01647365, + "balance_loss_mlp": 1.03805757, + "epoch": 0.5840372764166541, + "flos": 22419355259520.0, + "grad_norm": 1.8470967199163717, + "language_loss": 0.69391453, + "learning_rate": 1.5564228033049292e-06, + "loss": 0.71531737, + "num_input_tokens_seen": 209410260, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 9714, + "time_per_iteration": 2.4558205604553223 + }, + { + "auxiliary_loss_clip": 0.01106219, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01737761, + "balance_loss_mlp": 1.03574395, + "epoch": 0.5840973996693221, + "flos": 19828256797440.0, + "grad_norm": 1.7342681115417722, + "language_loss": 0.79977894, + "learning_rate": 1.5560430496674268e-06, + "loss": 0.82114303, + "num_input_tokens_seen": 209429920, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 9715, + "time_per_iteration": 2.426506757736206 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01666617, + "balance_loss_mlp": 1.037099, + "epoch": 0.5841575229219901, + "flos": 21142982401920.0, + "grad_norm": 4.9488403812071535, + "language_loss": 0.72778314, + "learning_rate": 1.5556633128648167e-06, + "loss": 0.74913943, + "num_input_tokens_seen": 209449470, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9716, + "time_per_iteration": 2.44687819480896 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01027677, + "balance_loss_clip": 1.01595879, + "balance_loss_mlp": 1.03716838, + "epoch": 0.5842176461746581, + "flos": 24640322025600.0, + "grad_norm": 1.6127648254863816, + "language_loss": 0.74810076, + "learning_rate": 1.5552835929114976e-06, + "loss": 0.76941431, + "num_input_tokens_seen": 209467695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9717, + "time_per_iteration": 2.460857629776001 + }, + { + "auxiliary_loss_clip": 0.01105902, + "auxiliary_loss_mlp": 0.0103646, + "balance_loss_clip": 1.02414012, + "balance_loss_mlp": 1.03733993, + "epoch": 0.584277769427326, + "flos": 19131158574720.0, + "grad_norm": 2.202005488151785, + "language_loss": 0.7997486, + "learning_rate": 1.5549038898218697e-06, + "loss": 0.82117224, + "num_input_tokens_seen": 209484250, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9718, + "time_per_iteration": 2.4178881645202637 + }, + { + "auxiliary_loss_clip": 0.01106549, + "auxiliary_loss_mlp": 0.01032036, + "balance_loss_clip": 1.01891065, + "balance_loss_mlp": 1.03846669, + "epoch": 0.584337892679994, + "flos": 22675111073280.0, + "grad_norm": 1.4800218219438264, + "language_loss": 0.67422116, + "learning_rate": 1.5545242036103306e-06, + "loss": 0.69560701, + "num_input_tokens_seen": 209502830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 9719, + "time_per_iteration": 3.8449153900146484 + }, + { + "auxiliary_loss_clip": 0.01107677, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.01631081, + "balance_loss_mlp": 1.03717732, + "epoch": 0.5843980159326619, + "flos": 31284083352960.0, + "grad_norm": 2.1638863024999484, + "language_loss": 0.75937355, + "learning_rate": 1.5541445342912786e-06, + "loss": 0.78073382, + "num_input_tokens_seen": 209525995, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9720, + "time_per_iteration": 2.521005630493164 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02579594, + "balance_loss_mlp": 1.03623533, + "epoch": 0.58445813918533, + "flos": 22748117466240.0, + "grad_norm": 1.5774446570210707, + "language_loss": 0.83079016, + "learning_rate": 1.5537648818791105e-06, + "loss": 0.85223192, + "num_input_tokens_seen": 209545895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 9721, + "time_per_iteration": 3.9998085498809814 + }, + { + "auxiliary_loss_clip": 0.01030301, + "auxiliary_loss_mlp": 0.01007637, + "balance_loss_clip": 1.00636697, + "balance_loss_mlp": 1.00867438, + "epoch": 0.5845182624379979, + "flos": 60686556658560.0, + "grad_norm": 0.9369686939257119, + "language_loss": 0.71297473, + "learning_rate": 1.5533852463882226e-06, + "loss": 0.73335409, + "num_input_tokens_seen": 209602315, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21679688, + "step": 9722, + "time_per_iteration": 4.55988335609436 + }, + { + "auxiliary_loss_clip": 0.01104254, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.0219183, + "balance_loss_mlp": 1.03621197, + "epoch": 0.5845783856906659, + "flos": 16362446336640.0, + "grad_norm": 2.3592007880272097, + "language_loss": 0.89236099, + "learning_rate": 1.5530056278330113e-06, + "loss": 0.91374058, + "num_input_tokens_seen": 209617615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9723, + "time_per_iteration": 3.8671655654907227 + }, + { + "auxiliary_loss_clip": 0.01106969, + "auxiliary_loss_mlp": 0.01031836, + "balance_loss_clip": 1.02042723, + "balance_loss_mlp": 1.03859067, + "epoch": 0.5846385089433338, + "flos": 20083402080000.0, + "grad_norm": 1.4227647539631216, + "language_loss": 0.68610382, + "learning_rate": 1.5526260262278709e-06, + "loss": 0.70749187, + "num_input_tokens_seen": 209637005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 9724, + "time_per_iteration": 2.428325653076172 + }, + { + "auxiliary_loss_clip": 0.01114065, + "auxiliary_loss_mlp": 0.01034629, + "balance_loss_clip": 1.0221715, + "balance_loss_mlp": 1.04199743, + "epoch": 0.5846986321960018, + "flos": 17311062568320.0, + "grad_norm": 1.8750713541003288, + "language_loss": 0.86348903, + "learning_rate": 1.552246441587197e-06, + "loss": 0.88497603, + "num_input_tokens_seen": 209653170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.71875, + "step": 9725, + "time_per_iteration": 2.4113223552703857 + }, + { + "auxiliary_loss_clip": 0.01112675, + "auxiliary_loss_mlp": 0.01038738, + "balance_loss_clip": 1.02615535, + "balance_loss_mlp": 1.04008734, + "epoch": 0.5847587554486697, + "flos": 17197907748480.0, + "grad_norm": 1.9888550356442254, + "language_loss": 0.82856494, + "learning_rate": 1.5518668739253821e-06, + "loss": 0.85007912, + "num_input_tokens_seen": 209671275, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 9726, + "time_per_iteration": 2.4277760982513428 + }, + { + "auxiliary_loss_clip": 0.01108752, + "auxiliary_loss_mlp": 0.01036993, + "balance_loss_clip": 1.02550149, + "balance_loss_mlp": 1.03925705, + "epoch": 0.5848188787013378, + "flos": 24529106540160.0, + "grad_norm": 1.8720162128796731, + "language_loss": 0.66911906, + "learning_rate": 1.5514873232568206e-06, + "loss": 0.69057649, + "num_input_tokens_seen": 209690380, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 9727, + "time_per_iteration": 2.4941296577453613 + }, + { + "auxiliary_loss_clip": 0.011109, + "auxiliary_loss_mlp": 0.01042126, + "balance_loss_clip": 1.02927577, + "balance_loss_mlp": 1.04078412, + "epoch": 0.5848790019540057, + "flos": 20628382204800.0, + "grad_norm": 1.755089310778911, + "language_loss": 0.81880605, + "learning_rate": 1.5511077895959055e-06, + "loss": 0.84033632, + "num_input_tokens_seen": 209708845, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9728, + "time_per_iteration": 2.504457950592041 + }, + { + "auxiliary_loss_clip": 0.01105423, + "auxiliary_loss_mlp": 0.01036783, + "balance_loss_clip": 1.02519631, + "balance_loss_mlp": 1.03857303, + "epoch": 0.5849391252066737, + "flos": 22418852469120.0, + "grad_norm": 1.9458365932895556, + "language_loss": 0.78459418, + "learning_rate": 1.550728272957027e-06, + "loss": 0.80601627, + "num_input_tokens_seen": 209729000, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 9729, + "time_per_iteration": 2.4906978607177734 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01029316, + "balance_loss_clip": 1.01629853, + "balance_loss_mlp": 1.03705525, + "epoch": 0.5849992484593417, + "flos": 25410929431680.0, + "grad_norm": 2.2265789157985205, + "language_loss": 0.70611644, + "learning_rate": 1.5503487733545782e-06, + "loss": 0.72748184, + "num_input_tokens_seen": 209747435, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.703125, + "step": 9730, + "time_per_iteration": 2.5273194313049316 + }, + { + "auxiliary_loss_clip": 0.01113418, + "auxiliary_loss_mlp": 0.0103557, + "balance_loss_clip": 1.02182508, + "balance_loss_mlp": 1.04057014, + "epoch": 0.5850593717120096, + "flos": 21065163586560.0, + "grad_norm": 2.222037907468424, + "language_loss": 0.78473902, + "learning_rate": 1.5499692908029482e-06, + "loss": 0.80622888, + "num_input_tokens_seen": 209764910, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.7265625, + "step": 9731, + "time_per_iteration": 2.4710583686828613 + }, + { + "auxiliary_loss_clip": 0.0110815, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03908777, + "epoch": 0.5851194949646776, + "flos": 25301545539840.0, + "grad_norm": 1.7845208257427057, + "language_loss": 0.69966131, + "learning_rate": 1.549589825316528e-06, + "loss": 0.72106874, + "num_input_tokens_seen": 209786115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 9732, + "time_per_iteration": 2.4975006580352783 + }, + { + "auxiliary_loss_clip": 0.01113456, + "auxiliary_loss_mlp": 0.01033051, + "balance_loss_clip": 1.01913929, + "balance_loss_mlp": 1.04045916, + "epoch": 0.5851796182173455, + "flos": 23587242065280.0, + "grad_norm": 1.73190032828597, + "language_loss": 0.52698147, + "learning_rate": 1.5492103769097075e-06, + "loss": 0.54844654, + "num_input_tokens_seen": 209806095, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.73046875, + "step": 9733, + "time_per_iteration": 2.485399007797241 + }, + { + "auxiliary_loss_clip": 0.01111159, + "auxiliary_loss_mlp": 0.01030285, + "balance_loss_clip": 1.01741672, + "balance_loss_mlp": 1.04071164, + "epoch": 0.5852397414700136, + "flos": 24822712310400.0, + "grad_norm": 6.263677136925273, + "language_loss": 0.87694037, + "learning_rate": 1.5488309455968739e-06, + "loss": 0.89835489, + "num_input_tokens_seen": 209823650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9734, + "time_per_iteration": 2.472288131713867 + }, + { + "auxiliary_loss_clip": 0.01103403, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02119839, + "balance_loss_mlp": 1.03833449, + "epoch": 0.5852998647226815, + "flos": 19937784343680.0, + "grad_norm": 1.513447931139509, + "language_loss": 0.72063559, + "learning_rate": 1.5484515313924163e-06, + "loss": 0.7419939, + "num_input_tokens_seen": 209843220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 9735, + "time_per_iteration": 2.4491236209869385 + }, + { + "auxiliary_loss_clip": 0.01113365, + "auxiliary_loss_mlp": 0.01041862, + "balance_loss_clip": 1.02809381, + "balance_loss_mlp": 1.04022026, + "epoch": 0.5853599879753495, + "flos": 16720367408640.0, + "grad_norm": 2.443961120173282, + "language_loss": 0.74189854, + "learning_rate": 1.5480721343107217e-06, + "loss": 0.76345086, + "num_input_tokens_seen": 209854880, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.73046875, + "step": 9736, + "time_per_iteration": 2.419142961502075 + }, + { + "auxiliary_loss_clip": 0.01106138, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.0379591, + "epoch": 0.5854201112280174, + "flos": 44456583680640.0, + "grad_norm": 2.2236691167379083, + "language_loss": 0.70181298, + "learning_rate": 1.5476927543661772e-06, + "loss": 0.72319508, + "num_input_tokens_seen": 209877870, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9737, + "time_per_iteration": 2.6583194732666016 + }, + { + "auxiliary_loss_clip": 0.01106196, + "auxiliary_loss_mlp": 0.0103613, + "balance_loss_clip": 1.02428091, + "balance_loss_mlp": 1.03835154, + "epoch": 0.5854802344806854, + "flos": 20339193807360.0, + "grad_norm": 1.7203982017599655, + "language_loss": 0.82579291, + "learning_rate": 1.547313391573169e-06, + "loss": 0.84721613, + "num_input_tokens_seen": 209896690, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 9738, + "time_per_iteration": 2.4531257152557373 + }, + { + "auxiliary_loss_clip": 0.01113247, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02323246, + "balance_loss_mlp": 1.04034615, + "epoch": 0.5855403577333533, + "flos": 20921054221440.0, + "grad_norm": 1.7945048569600959, + "language_loss": 0.68588519, + "learning_rate": 1.546934045946082e-06, + "loss": 0.70738328, + "num_input_tokens_seen": 209914640, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 9739, + "time_per_iteration": 2.456914186477661 + }, + { + "auxiliary_loss_clip": 0.01108939, + "auxiliary_loss_mlp": 0.0102703, + "balance_loss_clip": 1.01416099, + "balance_loss_mlp": 1.03718436, + "epoch": 0.5856004809860214, + "flos": 20448649526400.0, + "grad_norm": 3.661868392990544, + "language_loss": 0.58782631, + "learning_rate": 1.5465547174993017e-06, + "loss": 0.60918605, + "num_input_tokens_seen": 209933375, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9740, + "time_per_iteration": 2.4507863521575928 + }, + { + "auxiliary_loss_clip": 0.01106066, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01462674, + "balance_loss_mlp": 1.03621328, + "epoch": 0.5856606042386893, + "flos": 19640766781440.0, + "grad_norm": 2.5503677599504138, + "language_loss": 0.74937272, + "learning_rate": 1.5461754062472113e-06, + "loss": 0.77070647, + "num_input_tokens_seen": 209952055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9741, + "time_per_iteration": 2.4589905738830566 + }, + { + "auxiliary_loss_clip": 0.01110252, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01856065, + "balance_loss_mlp": 1.04028082, + "epoch": 0.5857207274913573, + "flos": 21686166846720.0, + "grad_norm": 5.17192355324585, + "language_loss": 0.75760782, + "learning_rate": 1.5457961122041959e-06, + "loss": 0.77901655, + "num_input_tokens_seen": 209971190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 9742, + "time_per_iteration": 2.4604122638702393 + }, + { + "auxiliary_loss_clip": 0.01106761, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03765917, + "epoch": 0.5857808507440253, + "flos": 23182708118400.0, + "grad_norm": 1.843175426453247, + "language_loss": 0.74955082, + "learning_rate": 1.5454168353846369e-06, + "loss": 0.77090788, + "num_input_tokens_seen": 209990695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 9743, + "time_per_iteration": 2.4604763984680176 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01028717, + "balance_loss_clip": 1.0171833, + "balance_loss_mlp": 1.03878045, + "epoch": 0.5858409739966932, + "flos": 27235299156480.0, + "grad_norm": 1.7092789137699793, + "language_loss": 0.81049299, + "learning_rate": 1.5450375758029172e-06, + "loss": 0.83184063, + "num_input_tokens_seen": 210010210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 9744, + "time_per_iteration": 2.516517162322998 + }, + { + "auxiliary_loss_clip": 0.0111328, + "auxiliary_loss_mlp": 0.01029291, + "balance_loss_clip": 1.01756728, + "balance_loss_mlp": 1.04009771, + "epoch": 0.5859010972493612, + "flos": 27855512317440.0, + "grad_norm": 1.7947324983718902, + "language_loss": 0.71260583, + "learning_rate": 1.5446583334734183e-06, + "loss": 0.73403156, + "num_input_tokens_seen": 210030030, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.73046875, + "step": 9745, + "time_per_iteration": 2.5095736980438232 + }, + { + "auxiliary_loss_clip": 0.01029472, + "auxiliary_loss_mlp": 0.01001042, + "balance_loss_clip": 0.99980211, + "balance_loss_mlp": 1.00798225, + "epoch": 0.5859612205020291, + "flos": 70007064428160.0, + "grad_norm": 0.7288291603374486, + "language_loss": 0.5328598, + "learning_rate": 1.5442791084105204e-06, + "loss": 0.55316496, + "num_input_tokens_seen": 210094840, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21484375, + "step": 9746, + "time_per_iteration": 3.1588006019592285 + }, + { + "auxiliary_loss_clip": 0.01111789, + "auxiliary_loss_mlp": 0.01028882, + "balance_loss_clip": 1.01581621, + "balance_loss_mlp": 1.04034877, + "epoch": 0.5860213437546972, + "flos": 24056019486720.0, + "grad_norm": 2.1076565833563743, + "language_loss": 0.73041242, + "learning_rate": 1.5438999006286054e-06, + "loss": 0.75181913, + "num_input_tokens_seen": 210114660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9747, + "time_per_iteration": 2.529571533203125 + }, + { + "auxiliary_loss_clip": 0.01110161, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.02153921, + "balance_loss_mlp": 1.03954244, + "epoch": 0.5860814670073651, + "flos": 18947583141120.0, + "grad_norm": 2.1114805581962934, + "language_loss": 0.81232262, + "learning_rate": 1.543520710142051e-06, + "loss": 0.83376622, + "num_input_tokens_seen": 210132770, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9748, + "time_per_iteration": 2.4205257892608643 + }, + { + "auxiliary_loss_clip": 0.01108981, + "auxiliary_loss_mlp": 0.01031425, + "balance_loss_clip": 1.01904488, + "balance_loss_mlp": 1.03803837, + "epoch": 0.5861415902600331, + "flos": 22561848512640.0, + "grad_norm": 1.6594717662282998, + "language_loss": 0.71928638, + "learning_rate": 1.5431415369652375e-06, + "loss": 0.74069047, + "num_input_tokens_seen": 210151895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 9749, + "time_per_iteration": 2.4881033897399902 + }, + { + "auxiliary_loss_clip": 0.0110821, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01869583, + "balance_loss_mlp": 1.04076529, + "epoch": 0.586201713512701, + "flos": 14392027912320.0, + "grad_norm": 2.0326510096801056, + "language_loss": 0.7436285, + "learning_rate": 1.5427623811125428e-06, + "loss": 0.76502097, + "num_input_tokens_seen": 210168040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 9750, + "time_per_iteration": 2.414621353149414 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.02279603, + "balance_loss_mlp": 1.03921914, + "epoch": 0.586261836765369, + "flos": 19498560837120.0, + "grad_norm": 1.743949260258008, + "language_loss": 0.71048808, + "learning_rate": 1.542383242598344e-06, + "loss": 0.73192453, + "num_input_tokens_seen": 210187720, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 9751, + "time_per_iteration": 2.4829182624816895 + }, + { + "auxiliary_loss_clip": 0.01112341, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.02050161, + "balance_loss_mlp": 1.04000425, + "epoch": 0.5863219600180369, + "flos": 20701819560960.0, + "grad_norm": 1.8642101544605258, + "language_loss": 0.74632239, + "learning_rate": 1.5420041214370184e-06, + "loss": 0.76778823, + "num_input_tokens_seen": 210206080, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.72265625, + "step": 9752, + "time_per_iteration": 2.4715142250061035 + }, + { + "auxiliary_loss_clip": 0.01107296, + "auxiliary_loss_mlp": 0.01031223, + "balance_loss_clip": 1.01895666, + "balance_loss_mlp": 1.0386945, + "epoch": 0.586382083270705, + "flos": 19792130693760.0, + "grad_norm": 1.7856678678755609, + "language_loss": 0.77179754, + "learning_rate": 1.541625017642943e-06, + "loss": 0.79318273, + "num_input_tokens_seen": 210225660, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 9753, + "time_per_iteration": 2.443422794342041 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01026457, + "balance_loss_clip": 1.01546574, + "balance_loss_mlp": 1.03864121, + "epoch": 0.5864422065233729, + "flos": 16500558130560.0, + "grad_norm": 1.9587413882718219, + "language_loss": 0.70530736, + "learning_rate": 1.5412459312304927e-06, + "loss": 0.72662538, + "num_input_tokens_seen": 210242725, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9754, + "time_per_iteration": 2.409973621368408 + }, + { + "auxiliary_loss_clip": 0.01107928, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03827429, + "epoch": 0.5865023297760409, + "flos": 20413277608320.0, + "grad_norm": 1.747136336565704, + "language_loss": 0.72055626, + "learning_rate": 1.540866862214043e-06, + "loss": 0.74194646, + "num_input_tokens_seen": 210263225, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9755, + "time_per_iteration": 2.4600584506988525 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00203705, + "balance_loss_mlp": 1.00899053, + "epoch": 0.5865624530287089, + "flos": 63350769254400.0, + "grad_norm": 0.7394274912640315, + "language_loss": 0.5697751, + "learning_rate": 1.540487810607967e-06, + "loss": 0.59011161, + "num_input_tokens_seen": 210322310, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21484375, + "step": 9756, + "time_per_iteration": 3.0282156467437744 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114117, + "balance_loss_mlp": 1.03774321, + "epoch": 0.5866225762813768, + "flos": 27016279977600.0, + "grad_norm": 1.7702895540430315, + "language_loss": 0.76155764, + "learning_rate": 1.5401087764266396e-06, + "loss": 0.78293204, + "num_input_tokens_seen": 210340845, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 9757, + "time_per_iteration": 2.5391111373901367 + }, + { + "auxiliary_loss_clip": 0.01030392, + "auxiliary_loss_mlp": 0.01004494, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00899124, + "epoch": 0.5866826995340448, + "flos": 72987038507520.0, + "grad_norm": 0.8655305518018972, + "language_loss": 0.60531819, + "learning_rate": 1.5397297596844337e-06, + "loss": 0.62566704, + "num_input_tokens_seen": 210397815, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.21484375, + "step": 9758, + "time_per_iteration": 3.0623366832733154 + }, + { + "auxiliary_loss_clip": 0.01113209, + "auxiliary_loss_mlp": 0.01030767, + "balance_loss_clip": 1.01773787, + "balance_loss_mlp": 1.03982747, + "epoch": 0.5867428227867127, + "flos": 21285727050240.0, + "grad_norm": 2.3357598656034897, + "language_loss": 0.71766979, + "learning_rate": 1.5393507603957212e-06, + "loss": 0.73910952, + "num_input_tokens_seen": 210413900, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 9759, + "time_per_iteration": 2.474400043487549 + }, + { + "auxiliary_loss_clip": 0.0111074, + "auxiliary_loss_mlp": 0.01032361, + "balance_loss_clip": 1.0208931, + "balance_loss_mlp": 1.04039979, + "epoch": 0.5868029460393808, + "flos": 33468852188160.0, + "grad_norm": 1.5007272591007914, + "language_loss": 0.73244017, + "learning_rate": 1.5389717785748742e-06, + "loss": 0.7538712, + "num_input_tokens_seen": 210434110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.703125, + "step": 9760, + "time_per_iteration": 4.081261396408081 + }, + { + "auxiliary_loss_clip": 0.01106401, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01556969, + "balance_loss_mlp": 1.03715563, + "epoch": 0.5868630692920487, + "flos": 17889475276800.0, + "grad_norm": 1.8805423527385174, + "language_loss": 0.72491598, + "learning_rate": 1.5385928142362637e-06, + "loss": 0.74625897, + "num_input_tokens_seen": 210451685, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9761, + "time_per_iteration": 2.42621111869812 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.0167706, + "balance_loss_mlp": 1.03563881, + "epoch": 0.5869231925447167, + "flos": 21035035054080.0, + "grad_norm": 1.837534804487864, + "language_loss": 0.74821299, + "learning_rate": 1.5382138673942597e-06, + "loss": 0.76959074, + "num_input_tokens_seen": 210470825, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 9762, + "time_per_iteration": 3.899322032928467 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01029073, + "balance_loss_clip": 1.01706839, + "balance_loss_mlp": 1.03918064, + "epoch": 0.5869833157973846, + "flos": 74738219293440.0, + "grad_norm": 1.367882310541282, + "language_loss": 0.72223246, + "learning_rate": 1.5378349380632317e-06, + "loss": 0.74359524, + "num_input_tokens_seen": 210500075, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9763, + "time_per_iteration": 4.356280326843262 + }, + { + "auxiliary_loss_clip": 0.01105096, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01809907, + "balance_loss_mlp": 1.03675938, + "epoch": 0.5870434390500526, + "flos": 17638998762240.0, + "grad_norm": 1.4976833867772195, + "language_loss": 0.79729784, + "learning_rate": 1.53745602625755e-06, + "loss": 0.81864572, + "num_input_tokens_seen": 210518150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 9764, + "time_per_iteration": 3.9194676876068115 + }, + { + "auxiliary_loss_clip": 0.01108839, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.0202508, + "balance_loss_mlp": 1.03856993, + "epoch": 0.5871035623027205, + "flos": 21506146859520.0, + "grad_norm": 2.0111563944475908, + "language_loss": 0.78612924, + "learning_rate": 1.5370771319915819e-06, + "loss": 0.80754542, + "num_input_tokens_seen": 210537760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9765, + "time_per_iteration": 2.53273344039917 + }, + { + "auxiliary_loss_clip": 0.01106605, + "auxiliary_loss_mlp": 0.01029586, + "balance_loss_clip": 1.01712823, + "balance_loss_mlp": 1.03891206, + "epoch": 0.5871636855553886, + "flos": 13551861818880.0, + "grad_norm": 1.8843759319265088, + "language_loss": 0.83718032, + "learning_rate": 1.5366982552796947e-06, + "loss": 0.8585422, + "num_input_tokens_seen": 210555515, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 9766, + "time_per_iteration": 2.467556953430176 + }, + { + "auxiliary_loss_clip": 0.01110103, + "auxiliary_loss_mlp": 0.01032223, + "balance_loss_clip": 1.02024257, + "balance_loss_mlp": 1.03847504, + "epoch": 0.5872238088080565, + "flos": 26212922346240.0, + "grad_norm": 2.6418409503909674, + "language_loss": 0.69825381, + "learning_rate": 1.536319396136257e-06, + "loss": 0.71967709, + "num_input_tokens_seen": 210575000, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.71875, + "step": 9767, + "time_per_iteration": 2.514695405960083 + }, + { + "auxiliary_loss_clip": 0.01108076, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02162123, + "balance_loss_mlp": 1.03721809, + "epoch": 0.5872839320607245, + "flos": 30665198995200.0, + "grad_norm": 1.7100990150928812, + "language_loss": 0.6345011, + "learning_rate": 1.5359405545756336e-06, + "loss": 0.65592575, + "num_input_tokens_seen": 210595185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 9768, + "time_per_iteration": 2.510586738586426 + }, + { + "auxiliary_loss_clip": 0.01029111, + "auxiliary_loss_mlp": 0.00999867, + "balance_loss_clip": 0.9985916, + "balance_loss_mlp": 1.00760961, + "epoch": 0.5873440553133924, + "flos": 60303570871680.0, + "grad_norm": 0.7128870586180143, + "language_loss": 0.53924322, + "learning_rate": 1.5355617306121914e-06, + "loss": 0.559533, + "num_input_tokens_seen": 210653210, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21484375, + "step": 9769, + "time_per_iteration": 3.0710904598236084 + }, + { + "auxiliary_loss_clip": 0.01104834, + "auxiliary_loss_mlp": 0.01033468, + "balance_loss_clip": 1.02148712, + "balance_loss_mlp": 1.03672135, + "epoch": 0.5874041785660604, + "flos": 21539292134400.0, + "grad_norm": 1.4641633186547043, + "language_loss": 0.70532131, + "learning_rate": 1.5351829242602945e-06, + "loss": 0.7267043, + "num_input_tokens_seen": 210673750, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 9770, + "time_per_iteration": 2.516707420349121 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01031999, + "balance_loss_clip": 1.0193336, + "balance_loss_mlp": 1.03782773, + "epoch": 0.5874643018187284, + "flos": 24388947671040.0, + "grad_norm": 3.691664094278214, + "language_loss": 0.67488074, + "learning_rate": 1.5348041355343077e-06, + "loss": 0.69626534, + "num_input_tokens_seen": 210692960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9771, + "time_per_iteration": 2.4816172122955322 + }, + { + "auxiliary_loss_clip": 0.01107891, + "auxiliary_loss_mlp": 0.0103358, + "balance_loss_clip": 1.02041984, + "balance_loss_mlp": 1.03628254, + "epoch": 0.5875244250713964, + "flos": 28147717457280.0, + "grad_norm": 1.6051808895674682, + "language_loss": 0.65752995, + "learning_rate": 1.5344253644485954e-06, + "loss": 0.67894471, + "num_input_tokens_seen": 210714040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9772, + "time_per_iteration": 2.5371270179748535 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038697, + "balance_loss_clip": 1.02478576, + "balance_loss_mlp": 1.03915095, + "epoch": 0.5875845483240644, + "flos": 25812410722560.0, + "grad_norm": 1.7393863773768459, + "language_loss": 0.74272907, + "learning_rate": 1.534046611017519e-06, + "loss": 0.7642293, + "num_input_tokens_seen": 210733710, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 9773, + "time_per_iteration": 2.4879984855651855 + }, + { + "auxiliary_loss_clip": 0.01108784, + "auxiliary_loss_mlp": 0.01037956, + "balance_loss_clip": 1.02513528, + "balance_loss_mlp": 1.03829455, + "epoch": 0.5876446715767323, + "flos": 26906572863360.0, + "grad_norm": 2.707979121748391, + "language_loss": 0.53293657, + "learning_rate": 1.5336678752554421e-06, + "loss": 0.55440396, + "num_input_tokens_seen": 210753580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9774, + "time_per_iteration": 2.5072500705718994 + }, + { + "auxiliary_loss_clip": 0.01109655, + "auxiliary_loss_mlp": 0.01035615, + "balance_loss_clip": 1.02257991, + "balance_loss_mlp": 1.03880942, + "epoch": 0.5877047948294003, + "flos": 36684832579200.0, + "grad_norm": 2.48971225310605, + "language_loss": 0.65312964, + "learning_rate": 1.5332891571767264e-06, + "loss": 0.6745823, + "num_input_tokens_seen": 210773495, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9775, + "time_per_iteration": 2.5655953884124756 + }, + { + "auxiliary_loss_clip": 0.01106711, + "auxiliary_loss_mlp": 0.01033817, + "balance_loss_clip": 1.02168775, + "balance_loss_mlp": 1.03676975, + "epoch": 0.5877649180820682, + "flos": 26724721282560.0, + "grad_norm": 1.785458151895031, + "language_loss": 0.73554152, + "learning_rate": 1.5329104567957326e-06, + "loss": 0.7569468, + "num_input_tokens_seen": 210793645, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 9776, + "time_per_iteration": 2.54707932472229 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033226, + "balance_loss_clip": 1.02136469, + "balance_loss_mlp": 1.0373795, + "epoch": 0.5878250413347362, + "flos": 21032197879680.0, + "grad_norm": 2.328878154900185, + "language_loss": 0.74400878, + "learning_rate": 1.532531774126821e-06, + "loss": 0.76541013, + "num_input_tokens_seen": 210813415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9777, + "time_per_iteration": 2.5013017654418945 + }, + { + "auxiliary_loss_clip": 0.01103004, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03745651, + "epoch": 0.5878851645874041, + "flos": 25484259047040.0, + "grad_norm": 1.542678345734907, + "language_loss": 0.74238187, + "learning_rate": 1.5321531091843512e-06, + "loss": 0.76371026, + "num_input_tokens_seen": 210833850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 9778, + "time_per_iteration": 2.548445224761963 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029504, + "balance_loss_clip": 1.01765513, + "balance_loss_mlp": 1.03588045, + "epoch": 0.5879452878400722, + "flos": 23769129559680.0, + "grad_norm": 1.8670942886874708, + "language_loss": 0.70107329, + "learning_rate": 1.5317744619826824e-06, + "loss": 0.72240877, + "num_input_tokens_seen": 210853115, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9779, + "time_per_iteration": 2.440385341644287 + }, + { + "auxiliary_loss_clip": 0.01109422, + "auxiliary_loss_mlp": 0.01032703, + "balance_loss_clip": 1.02029324, + "balance_loss_mlp": 1.03690886, + "epoch": 0.5880054110927401, + "flos": 17824513530240.0, + "grad_norm": 1.8860885981569304, + "language_loss": 0.67181754, + "learning_rate": 1.5313958325361727e-06, + "loss": 0.69323874, + "num_input_tokens_seen": 210872090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 9780, + "time_per_iteration": 2.5105738639831543 + }, + { + "auxiliary_loss_clip": 0.01108309, + "auxiliary_loss_mlp": 0.01035836, + "balance_loss_clip": 1.02308023, + "balance_loss_mlp": 1.03872418, + "epoch": 0.5880655343454081, + "flos": 19463404400640.0, + "grad_norm": 3.148071574180809, + "language_loss": 0.72608495, + "learning_rate": 1.5310172208591807e-06, + "loss": 0.74752629, + "num_input_tokens_seen": 210888490, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 9781, + "time_per_iteration": 2.4174652099609375 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.0103161, + "balance_loss_clip": 1.01946235, + "balance_loss_mlp": 1.03562713, + "epoch": 0.588125657598076, + "flos": 21397588980480.0, + "grad_norm": 1.4505377017032317, + "language_loss": 0.70405555, + "learning_rate": 1.5306386269660622e-06, + "loss": 0.72542155, + "num_input_tokens_seen": 210908220, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9782, + "time_per_iteration": 2.4488813877105713 + }, + { + "auxiliary_loss_clip": 0.01108141, + "auxiliary_loss_mlp": 0.01032909, + "balance_loss_clip": 1.02064204, + "balance_loss_mlp": 1.03547001, + "epoch": 0.588185780850744, + "flos": 16034653797120.0, + "grad_norm": 3.528130932430564, + "language_loss": 0.70414114, + "learning_rate": 1.5302600508711741e-06, + "loss": 0.72555161, + "num_input_tokens_seen": 210923945, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 9783, + "time_per_iteration": 2.411940813064575 + }, + { + "auxiliary_loss_clip": 0.01109132, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.0186553, + "balance_loss_mlp": 1.03764033, + "epoch": 0.588245904103412, + "flos": 23728226947200.0, + "grad_norm": 2.8122189742296952, + "language_loss": 0.6903708, + "learning_rate": 1.5298814925888719e-06, + "loss": 0.71178293, + "num_input_tokens_seen": 210941955, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71484375, + "step": 9784, + "time_per_iteration": 2.4809060096740723 + }, + { + "auxiliary_loss_clip": 0.01107726, + "auxiliary_loss_mlp": 0.01034991, + "balance_loss_clip": 1.02227104, + "balance_loss_mlp": 1.03585327, + "epoch": 0.58830602735608, + "flos": 33802534558080.0, + "grad_norm": 1.976987554101205, + "language_loss": 0.69485259, + "learning_rate": 1.5295029521335102e-06, + "loss": 0.71627975, + "num_input_tokens_seen": 210963105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9785, + "time_per_iteration": 2.5458383560180664 + }, + { + "auxiliary_loss_clip": 0.0110444, + "auxiliary_loss_mlp": 0.01026297, + "balance_loss_clip": 1.01477504, + "balance_loss_mlp": 1.03624511, + "epoch": 0.588366150608748, + "flos": 17090714586240.0, + "grad_norm": 2.0068567513814375, + "language_loss": 0.77542102, + "learning_rate": 1.5291244295194448e-06, + "loss": 0.79672837, + "num_input_tokens_seen": 210978720, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 9786, + "time_per_iteration": 2.4269275665283203 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02173829, + "balance_loss_mlp": 1.03681958, + "epoch": 0.5884262738614159, + "flos": 22127186033280.0, + "grad_norm": 1.4388452349288328, + "language_loss": 0.79175329, + "learning_rate": 1.5287459247610276e-06, + "loss": 0.81317246, + "num_input_tokens_seen": 210998750, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.7109375, + "step": 9787, + "time_per_iteration": 2.441265344619751 + }, + { + "auxiliary_loss_clip": 0.01106621, + "auxiliary_loss_mlp": 0.01031149, + "balance_loss_clip": 1.01953244, + "balance_loss_mlp": 1.03677058, + "epoch": 0.5884863971140839, + "flos": 21031838743680.0, + "grad_norm": 1.596428038291934, + "language_loss": 0.66514194, + "learning_rate": 1.5283674378726116e-06, + "loss": 0.68651974, + "num_input_tokens_seen": 211017550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 9788, + "time_per_iteration": 2.4632344245910645 + }, + { + "auxiliary_loss_clip": 0.01106001, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01877332, + "balance_loss_mlp": 1.03787911, + "epoch": 0.5885465203667518, + "flos": 23805112008960.0, + "grad_norm": 2.066265402471891, + "language_loss": 0.79951847, + "learning_rate": 1.5279889688685506e-06, + "loss": 0.82088816, + "num_input_tokens_seen": 211034135, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 9789, + "time_per_iteration": 2.4486775398254395 + }, + { + "auxiliary_loss_clip": 0.01105301, + "auxiliary_loss_mlp": 0.01027594, + "balance_loss_clip": 1.01579237, + "balance_loss_mlp": 1.03722358, + "epoch": 0.5886066436194198, + "flos": 18880574319360.0, + "grad_norm": 1.510117689081276, + "language_loss": 0.70817208, + "learning_rate": 1.5276105177631944e-06, + "loss": 0.72950107, + "num_input_tokens_seen": 211053850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 9790, + "time_per_iteration": 2.474634885787964 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01033293, + "balance_loss_clip": 1.02120566, + "balance_loss_mlp": 1.0374043, + "epoch": 0.5886667668720877, + "flos": 24790141653120.0, + "grad_norm": 1.9043586619327855, + "language_loss": 0.83184004, + "learning_rate": 1.527232084570895e-06, + "loss": 0.85322857, + "num_input_tokens_seen": 211072165, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9791, + "time_per_iteration": 2.4930591583251953 + }, + { + "auxiliary_loss_clip": 0.01110799, + "auxiliary_loss_mlp": 0.01034535, + "balance_loss_clip": 1.02189827, + "balance_loss_mlp": 1.04020619, + "epoch": 0.5887268901247558, + "flos": 21614381516160.0, + "grad_norm": 1.5964011084944127, + "language_loss": 0.76287472, + "learning_rate": 1.5268536693060026e-06, + "loss": 0.78432798, + "num_input_tokens_seen": 211089630, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9792, + "time_per_iteration": 2.572164297103882 + }, + { + "auxiliary_loss_clip": 0.01110663, + "auxiliary_loss_mlp": 0.01030996, + "balance_loss_clip": 1.01878858, + "balance_loss_mlp": 1.0383172, + "epoch": 0.5887870133774237, + "flos": 20481722974080.0, + "grad_norm": 1.954465265842666, + "language_loss": 0.69085598, + "learning_rate": 1.5264752719828662e-06, + "loss": 0.71227252, + "num_input_tokens_seen": 211106120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.72265625, + "step": 9793, + "time_per_iteration": 2.440532684326172 + }, + { + "auxiliary_loss_clip": 0.01105715, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02001381, + "balance_loss_mlp": 1.03754866, + "epoch": 0.5888471366300917, + "flos": 19206283870080.0, + "grad_norm": 2.2945820531528547, + "language_loss": 0.60200524, + "learning_rate": 1.5260968926158353e-06, + "loss": 0.6233902, + "num_input_tokens_seen": 211122450, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9794, + "time_per_iteration": 2.4281349182128906 + }, + { + "auxiliary_loss_clip": 0.01107792, + "auxiliary_loss_mlp": 0.01035699, + "balance_loss_clip": 1.02265191, + "balance_loss_mlp": 1.03800488, + "epoch": 0.5889072598827596, + "flos": 19972904866560.0, + "grad_norm": 1.8105141483242522, + "language_loss": 0.65209466, + "learning_rate": 1.525718531219257e-06, + "loss": 0.67352962, + "num_input_tokens_seen": 211141765, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9795, + "time_per_iteration": 2.4471983909606934 + }, + { + "auxiliary_loss_clip": 0.01105789, + "auxiliary_loss_mlp": 0.01036936, + "balance_loss_clip": 1.02589679, + "balance_loss_mlp": 1.03751063, + "epoch": 0.5889673831354276, + "flos": 20741249715840.0, + "grad_norm": 1.6472816848345888, + "language_loss": 0.74171197, + "learning_rate": 1.5253401878074801e-06, + "loss": 0.76313925, + "num_input_tokens_seen": 211160475, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 9796, + "time_per_iteration": 2.4404211044311523 + }, + { + "auxiliary_loss_clip": 0.0110878, + "auxiliary_loss_mlp": 0.01029178, + "balance_loss_clip": 1.01761484, + "balance_loss_mlp": 1.04002237, + "epoch": 0.5890275063880956, + "flos": 25300935008640.0, + "grad_norm": 1.4898681844876358, + "language_loss": 0.83064574, + "learning_rate": 1.5249618623948507e-06, + "loss": 0.85202533, + "num_input_tokens_seen": 211180480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 9797, + "time_per_iteration": 2.487971544265747 + }, + { + "auxiliary_loss_clip": 0.01104148, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03718829, + "epoch": 0.5890876296407636, + "flos": 11765377964160.0, + "grad_norm": 1.804693100831568, + "language_loss": 0.78741366, + "learning_rate": 1.5245835549957152e-06, + "loss": 0.80876774, + "num_input_tokens_seen": 211198000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 9798, + "time_per_iteration": 2.4391119480133057 + }, + { + "auxiliary_loss_clip": 0.01104678, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.01994312, + "balance_loss_mlp": 1.03718722, + "epoch": 0.5891477528934316, + "flos": 13589460380160.0, + "grad_norm": 2.097614269824193, + "language_loss": 0.74100447, + "learning_rate": 1.5242052656244186e-06, + "loss": 0.76236397, + "num_input_tokens_seen": 211214765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9799, + "time_per_iteration": 2.444185972213745 + }, + { + "auxiliary_loss_clip": 0.01110656, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01852775, + "balance_loss_mlp": 1.03889656, + "epoch": 0.5892078761460995, + "flos": 15049193189760.0, + "grad_norm": 1.9705578864506654, + "language_loss": 0.76078779, + "learning_rate": 1.5238269942953064e-06, + "loss": 0.78221321, + "num_input_tokens_seen": 211232335, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.71875, + "step": 9800, + "time_per_iteration": 2.4564571380615234 + }, + { + "auxiliary_loss_clip": 0.01108184, + "auxiliary_loss_mlp": 0.01039976, + "balance_loss_clip": 1.02804899, + "balance_loss_mlp": 1.03771484, + "epoch": 0.5892679993987675, + "flos": 15778215624960.0, + "grad_norm": 1.9698106702703237, + "language_loss": 0.78824806, + "learning_rate": 1.523448741022722e-06, + "loss": 0.8097297, + "num_input_tokens_seen": 211249985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9801, + "time_per_iteration": 2.439195156097412 + }, + { + "auxiliary_loss_clip": 0.01109337, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.01721966, + "balance_loss_mlp": 1.03768528, + "epoch": 0.5893281226514354, + "flos": 25265203954560.0, + "grad_norm": 2.596016426383407, + "language_loss": 0.65912932, + "learning_rate": 1.5230705058210088e-06, + "loss": 0.68051648, + "num_input_tokens_seen": 211268425, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 9802, + "time_per_iteration": 3.8562896251678467 + }, + { + "auxiliary_loss_clip": 0.01106914, + "auxiliary_loss_mlp": 0.01027404, + "balance_loss_clip": 1.01552522, + "balance_loss_mlp": 1.03888416, + "epoch": 0.5893882459041034, + "flos": 19458232842240.0, + "grad_norm": 1.5756682227023782, + "language_loss": 0.78167737, + "learning_rate": 1.5226922887045108e-06, + "loss": 0.8030206, + "num_input_tokens_seen": 211286680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 9803, + "time_per_iteration": 2.4531607627868652 + }, + { + "auxiliary_loss_clip": 0.01108754, + "auxiliary_loss_mlp": 0.01035578, + "balance_loss_clip": 1.02300191, + "balance_loss_mlp": 1.03849792, + "epoch": 0.5894483691567713, + "flos": 20634056553600.0, + "grad_norm": 1.5070835087317231, + "language_loss": 0.7292577, + "learning_rate": 1.5223140896875686e-06, + "loss": 0.75070107, + "num_input_tokens_seen": 211307700, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9804, + "time_per_iteration": 3.909280776977539 + }, + { + "auxiliary_loss_clip": 0.01108266, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03996158, + "epoch": 0.5895084924094394, + "flos": 17778223877760.0, + "grad_norm": 1.9252543926260512, + "language_loss": 0.7480545, + "learning_rate": 1.5219359087845234e-06, + "loss": 0.76943576, + "num_input_tokens_seen": 211324835, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 9805, + "time_per_iteration": 3.92484712600708 + }, + { + "auxiliary_loss_clip": 0.01113176, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01807201, + "balance_loss_mlp": 1.03880858, + "epoch": 0.5895686156621073, + "flos": 20121072468480.0, + "grad_norm": 2.2161041024358736, + "language_loss": 0.7798723, + "learning_rate": 1.5215577460097174e-06, + "loss": 0.8013162, + "num_input_tokens_seen": 211344130, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7421875, + "step": 9806, + "time_per_iteration": 3.958747625350952 + }, + { + "auxiliary_loss_clip": 0.01106773, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.01821566, + "balance_loss_mlp": 1.03678048, + "epoch": 0.5896287389147753, + "flos": 20850058990080.0, + "grad_norm": 2.028844636014754, + "language_loss": 0.77013928, + "learning_rate": 1.5211796013774887e-06, + "loss": 0.79151416, + "num_input_tokens_seen": 211362915, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 9807, + "time_per_iteration": 2.437091827392578 + }, + { + "auxiliary_loss_clip": 0.01111522, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01834261, + "balance_loss_mlp": 1.040411, + "epoch": 0.5896888621674432, + "flos": 14537897043840.0, + "grad_norm": 2.123691808114849, + "language_loss": 0.74406278, + "learning_rate": 1.5208014749021786e-06, + "loss": 0.76549083, + "num_input_tokens_seen": 211380700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9808, + "time_per_iteration": 2.4456939697265625 + }, + { + "auxiliary_loss_clip": 0.01111351, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.01794887, + "balance_loss_mlp": 1.03927052, + "epoch": 0.5897489854201112, + "flos": 20886759711360.0, + "grad_norm": 1.9040797268830973, + "language_loss": 0.71715617, + "learning_rate": 1.5204233665981236e-06, + "loss": 0.73858464, + "num_input_tokens_seen": 211400095, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.71875, + "step": 9809, + "time_per_iteration": 2.4555907249450684 + }, + { + "auxiliary_loss_clip": 0.01111034, + "auxiliary_loss_mlp": 0.0103364, + "balance_loss_clip": 1.02066374, + "balance_loss_mlp": 1.03881156, + "epoch": 0.5898091086727792, + "flos": 20011149872640.0, + "grad_norm": 2.6575599068105262, + "language_loss": 0.81872356, + "learning_rate": 1.5200452764796627e-06, + "loss": 0.84017026, + "num_input_tokens_seen": 211417810, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 9810, + "time_per_iteration": 2.546018600463867 + }, + { + "auxiliary_loss_clip": 0.01105843, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.01815283, + "balance_loss_mlp": 1.03850091, + "epoch": 0.5898692319254472, + "flos": 16253242012800.0, + "grad_norm": 1.679981614097192, + "language_loss": 0.8076582, + "learning_rate": 1.5196672045611336e-06, + "loss": 0.8290174, + "num_input_tokens_seen": 211436020, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 9811, + "time_per_iteration": 2.432685613632202 + }, + { + "auxiliary_loss_clip": 0.01110453, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01449776, + "balance_loss_mlp": 1.03924918, + "epoch": 0.5899293551781152, + "flos": 20448541785600.0, + "grad_norm": 1.903117615206719, + "language_loss": 0.76666933, + "learning_rate": 1.5192891508568715e-06, + "loss": 0.78804982, + "num_input_tokens_seen": 211454335, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9812, + "time_per_iteration": 2.45906138420105 + }, + { + "auxiliary_loss_clip": 0.01107232, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01794147, + "balance_loss_mlp": 1.03932881, + "epoch": 0.5899894784307831, + "flos": 13881701433600.0, + "grad_norm": 3.543593991514859, + "language_loss": 0.70407474, + "learning_rate": 1.5189111153812133e-06, + "loss": 0.72543478, + "num_input_tokens_seen": 211472775, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 9813, + "time_per_iteration": 2.417073965072632 + }, + { + "auxiliary_loss_clip": 0.0110801, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01969576, + "balance_loss_mlp": 1.03846037, + "epoch": 0.5900496016834511, + "flos": 20083797129600.0, + "grad_norm": 1.496524946754694, + "language_loss": 0.72230315, + "learning_rate": 1.518533098148494e-06, + "loss": 0.74370211, + "num_input_tokens_seen": 211492195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 9814, + "time_per_iteration": 2.527130365371704 + }, + { + "auxiliary_loss_clip": 0.0110797, + "auxiliary_loss_mlp": 0.01029956, + "balance_loss_clip": 1.01768374, + "balance_loss_mlp": 1.03837872, + "epoch": 0.590109724936119, + "flos": 20259148348800.0, + "grad_norm": 1.8734717265521494, + "language_loss": 0.78583348, + "learning_rate": 1.5181550991730476e-06, + "loss": 0.80721277, + "num_input_tokens_seen": 211510220, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9815, + "time_per_iteration": 2.4397730827331543 + }, + { + "auxiliary_loss_clip": 0.01114156, + "auxiliary_loss_mlp": 0.01035445, + "balance_loss_clip": 1.02197468, + "balance_loss_mlp": 1.03963876, + "epoch": 0.590169848188787, + "flos": 24235069806720.0, + "grad_norm": 2.0868241481245415, + "language_loss": 0.7557171, + "learning_rate": 1.5177771184692083e-06, + "loss": 0.7772131, + "num_input_tokens_seen": 211526260, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7421875, + "step": 9816, + "time_per_iteration": 2.443861484527588 + }, + { + "auxiliary_loss_clip": 0.01110119, + "auxiliary_loss_mlp": 0.01033143, + "balance_loss_clip": 1.02063835, + "balance_loss_mlp": 1.04108596, + "epoch": 0.590229971441455, + "flos": 17784724239360.0, + "grad_norm": 2.234392841889587, + "language_loss": 0.81303239, + "learning_rate": 1.517399156051309e-06, + "loss": 0.83446503, + "num_input_tokens_seen": 211542890, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9817, + "time_per_iteration": 2.4248719215393066 + }, + { + "auxiliary_loss_clip": 0.01109425, + "auxiliary_loss_mlp": 0.01033156, + "balance_loss_clip": 1.02112818, + "balance_loss_mlp": 1.03941548, + "epoch": 0.590290094694123, + "flos": 22236893147520.0, + "grad_norm": 1.5738429375950187, + "language_loss": 0.76401961, + "learning_rate": 1.517021211933682e-06, + "loss": 0.78544545, + "num_input_tokens_seen": 211562685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 9818, + "time_per_iteration": 2.445507526397705 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.01030314, + "balance_loss_clip": 1.01861358, + "balance_loss_mlp": 1.03634679, + "epoch": 0.5903502179467909, + "flos": 19098623831040.0, + "grad_norm": 1.8418500679377416, + "language_loss": 0.66351467, + "learning_rate": 1.5166432861306592e-06, + "loss": 0.68486011, + "num_input_tokens_seen": 211579960, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 9819, + "time_per_iteration": 2.4585890769958496 + }, + { + "auxiliary_loss_clip": 0.01109622, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01972878, + "balance_loss_mlp": 1.03955185, + "epoch": 0.5904103411994589, + "flos": 24235500769920.0, + "grad_norm": 1.5583203498776486, + "language_loss": 0.77830237, + "learning_rate": 1.5162653786565714e-06, + "loss": 0.79972136, + "num_input_tokens_seen": 211599310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9820, + "time_per_iteration": 2.452444314956665 + }, + { + "auxiliary_loss_clip": 0.01033068, + "auxiliary_loss_mlp": 0.01003995, + "balance_loss_clip": 1.00268924, + "balance_loss_mlp": 1.01099396, + "epoch": 0.5904704644521268, + "flos": 64876613045760.0, + "grad_norm": 0.9230258023741272, + "language_loss": 0.65167463, + "learning_rate": 1.5158874895257487e-06, + "loss": 0.67204523, + "num_input_tokens_seen": 211658790, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.22070312, + "step": 9821, + "time_per_iteration": 3.0410289764404297 + }, + { + "auxiliary_loss_clip": 0.01106857, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.0159936, + "balance_loss_mlp": 1.03887093, + "epoch": 0.5905305877047948, + "flos": 19609991804160.0, + "grad_norm": 1.8405567429237777, + "language_loss": 0.61040848, + "learning_rate": 1.515509618752521e-06, + "loss": 0.63175792, + "num_input_tokens_seen": 211677240, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9822, + "time_per_iteration": 2.4597485065460205 + }, + { + "auxiliary_loss_clip": 0.01110158, + "auxiliary_loss_mlp": 0.01038511, + "balance_loss_clip": 1.02598214, + "balance_loss_mlp": 1.03878164, + "epoch": 0.5905907109574628, + "flos": 18989634988800.0, + "grad_norm": 1.8163106241475082, + "language_loss": 0.82910824, + "learning_rate": 1.5151317663512173e-06, + "loss": 0.850595, + "num_input_tokens_seen": 211695485, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 9823, + "time_per_iteration": 2.4342074394226074 + }, + { + "auxiliary_loss_clip": 0.01106822, + "auxiliary_loss_mlp": 0.01032649, + "balance_loss_clip": 1.02025676, + "balance_loss_mlp": 1.03823602, + "epoch": 0.5906508342101308, + "flos": 22200407907840.0, + "grad_norm": 1.9061097186750977, + "language_loss": 0.73051912, + "learning_rate": 1.514753932336165e-06, + "loss": 0.75191379, + "num_input_tokens_seen": 211713090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 9824, + "time_per_iteration": 2.474583387374878 + }, + { + "auxiliary_loss_clip": 0.01115754, + "auxiliary_loss_mlp": 0.0103547, + "balance_loss_clip": 1.02118862, + "balance_loss_mlp": 1.03907609, + "epoch": 0.5907109574627988, + "flos": 20886687884160.0, + "grad_norm": 2.117093757339989, + "language_loss": 0.82486725, + "learning_rate": 1.514376116721693e-06, + "loss": 0.84637952, + "num_input_tokens_seen": 211732510, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 9825, + "time_per_iteration": 2.4499030113220215 + }, + { + "auxiliary_loss_clip": 0.01104731, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01781034, + "balance_loss_mlp": 1.03812122, + "epoch": 0.5907710807154667, + "flos": 21506649649920.0, + "grad_norm": 1.7674632389005596, + "language_loss": 0.77194965, + "learning_rate": 1.5139983195221272e-06, + "loss": 0.79328513, + "num_input_tokens_seen": 211748695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 9826, + "time_per_iteration": 2.490628480911255 + }, + { + "auxiliary_loss_clip": 0.01106346, + "auxiliary_loss_mlp": 0.01026697, + "balance_loss_clip": 1.01523519, + "balance_loss_mlp": 1.03757071, + "epoch": 0.5908312039681347, + "flos": 22018376759040.0, + "grad_norm": 1.8211120400501501, + "language_loss": 0.72350824, + "learning_rate": 1.513620540751793e-06, + "loss": 0.74483871, + "num_input_tokens_seen": 211768545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 9827, + "time_per_iteration": 2.496574640274048 + }, + { + "auxiliary_loss_clip": 0.01107742, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01782858, + "balance_loss_mlp": 1.0374589, + "epoch": 0.5908913272208026, + "flos": 18479523991680.0, + "grad_norm": 1.7932913826709562, + "language_loss": 0.79741728, + "learning_rate": 1.5132427804250178e-06, + "loss": 0.81878424, + "num_input_tokens_seen": 211786665, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.703125, + "step": 9828, + "time_per_iteration": 2.51045298576355 + }, + { + "auxiliary_loss_clip": 0.01111624, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02125204, + "balance_loss_mlp": 1.03958178, + "epoch": 0.5909514504734706, + "flos": 12312189682560.0, + "grad_norm": 2.271428998540672, + "language_loss": 0.88056707, + "learning_rate": 1.5128650385561241e-06, + "loss": 0.90202534, + "num_input_tokens_seen": 211801215, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9829, + "time_per_iteration": 2.4169514179229736 + }, + { + "auxiliary_loss_clip": 0.01031439, + "auxiliary_loss_mlp": 0.00999905, + "balance_loss_clip": 0.99870729, + "balance_loss_mlp": 1.00956726, + "epoch": 0.5910115737261386, + "flos": 70213262451840.0, + "grad_norm": 0.7537251091943264, + "language_loss": 0.57855141, + "learning_rate": 1.5124873151594376e-06, + "loss": 0.59886479, + "num_input_tokens_seen": 211857005, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.21875, + "step": 9830, + "time_per_iteration": 2.996295928955078 + }, + { + "auxiliary_loss_clip": 0.01117401, + "auxiliary_loss_mlp": 0.01032419, + "balance_loss_clip": 1.01852536, + "balance_loss_mlp": 1.04140687, + "epoch": 0.5910716969788066, + "flos": 22017766227840.0, + "grad_norm": 2.0665850759749813, + "language_loss": 0.76163888, + "learning_rate": 1.5121096102492812e-06, + "loss": 0.78313708, + "num_input_tokens_seen": 211876675, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7578125, + "step": 9831, + "time_per_iteration": 2.461068868637085 + }, + { + "auxiliary_loss_clip": 0.01105452, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01565278, + "balance_loss_mlp": 1.03923118, + "epoch": 0.5911318202314745, + "flos": 21251648021760.0, + "grad_norm": 1.602158251769988, + "language_loss": 0.7790612, + "learning_rate": 1.5117319238399767e-06, + "loss": 0.80039072, + "num_input_tokens_seen": 211895725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 9832, + "time_per_iteration": 2.4806432723999023 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.01554728, + "balance_loss_mlp": 1.03533232, + "epoch": 0.5911919434841425, + "flos": 17821604528640.0, + "grad_norm": 1.7748958571682212, + "language_loss": 0.83552635, + "learning_rate": 1.511354255945847e-06, + "loss": 0.85685176, + "num_input_tokens_seen": 211913860, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 9833, + "time_per_iteration": 2.436558961868286 + }, + { + "auxiliary_loss_clip": 0.01108125, + "auxiliary_loss_mlp": 0.01032413, + "balance_loss_clip": 1.02002692, + "balance_loss_mlp": 1.03818607, + "epoch": 0.5912520667368104, + "flos": 20374781207040.0, + "grad_norm": 1.512608687160236, + "language_loss": 0.74505258, + "learning_rate": 1.5109766065812123e-06, + "loss": 0.76645797, + "num_input_tokens_seen": 211932880, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 9834, + "time_per_iteration": 2.497488260269165 + }, + { + "auxiliary_loss_clip": 0.01107604, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.01680338, + "balance_loss_mlp": 1.03707302, + "epoch": 0.5913121899894784, + "flos": 17930557457280.0, + "grad_norm": 2.15246332260658, + "language_loss": 0.78111219, + "learning_rate": 1.5105989757603942e-06, + "loss": 0.8024776, + "num_input_tokens_seen": 211948625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 9835, + "time_per_iteration": 2.428570032119751 + }, + { + "auxiliary_loss_clip": 0.01108371, + "auxiliary_loss_mlp": 0.0103274, + "balance_loss_clip": 1.02080131, + "balance_loss_mlp": 1.03782153, + "epoch": 0.5913723132421465, + "flos": 22126934638080.0, + "grad_norm": 2.790579015547894, + "language_loss": 0.74016017, + "learning_rate": 1.5102213634977117e-06, + "loss": 0.76157123, + "num_input_tokens_seen": 211965355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9836, + "time_per_iteration": 2.4571895599365234 + }, + { + "auxiliary_loss_clip": 0.01108454, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01332974, + "balance_loss_mlp": 1.03816915, + "epoch": 0.5914324364948144, + "flos": 15697918771200.0, + "grad_norm": 2.0887710674316335, + "language_loss": 0.81834614, + "learning_rate": 1.5098437698074841e-06, + "loss": 0.83968431, + "num_input_tokens_seen": 211982245, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9837, + "time_per_iteration": 2.425869941711426 + }, + { + "auxiliary_loss_clip": 0.01109463, + "auxiliary_loss_mlp": 0.01030556, + "balance_loss_clip": 1.01760364, + "balance_loss_mlp": 1.03828216, + "epoch": 0.5914925597474824, + "flos": 22747327367040.0, + "grad_norm": 1.6633412669476784, + "language_loss": 0.79169023, + "learning_rate": 1.5094661947040304e-06, + "loss": 0.81309044, + "num_input_tokens_seen": 212000250, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9838, + "time_per_iteration": 2.480945348739624 + }, + { + "auxiliary_loss_clip": 0.01109443, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03814876, + "epoch": 0.5915526830001503, + "flos": 18292788161280.0, + "grad_norm": 1.9639883281700399, + "language_loss": 0.6955409, + "learning_rate": 1.5090886382016673e-06, + "loss": 0.7169646, + "num_input_tokens_seen": 212017505, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 9839, + "time_per_iteration": 2.445032835006714 + }, + { + "auxiliary_loss_clip": 0.01109116, + "auxiliary_loss_mlp": 0.01040629, + "balance_loss_clip": 1.02804112, + "balance_loss_mlp": 1.03763521, + "epoch": 0.5916128062528183, + "flos": 17019072910080.0, + "grad_norm": 2.156057098485451, + "language_loss": 0.65970773, + "learning_rate": 1.5087111003147124e-06, + "loss": 0.68120515, + "num_input_tokens_seen": 212034595, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9840, + "time_per_iteration": 2.4208333492279053 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01031815, + "balance_loss_clip": 1.01920867, + "balance_loss_mlp": 1.03765261, + "epoch": 0.5916729295054862, + "flos": 24754231031040.0, + "grad_norm": 1.6889823147578333, + "language_loss": 0.81775278, + "learning_rate": 1.5083335810574813e-06, + "loss": 0.83916378, + "num_input_tokens_seen": 212055775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9841, + "time_per_iteration": 2.485783576965332 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01028956, + "balance_loss_clip": 1.0175122, + "balance_loss_mlp": 1.03609967, + "epoch": 0.5917330527581542, + "flos": 15958199698560.0, + "grad_norm": 1.5545668932192243, + "language_loss": 0.68891448, + "learning_rate": 1.507956080444291e-06, + "loss": 0.71024531, + "num_input_tokens_seen": 212074000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 9842, + "time_per_iteration": 2.4090652465820312 + }, + { + "auxiliary_loss_clip": 0.01108304, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02031779, + "balance_loss_mlp": 1.03697038, + "epoch": 0.5917931760108222, + "flos": 23800730549760.0, + "grad_norm": 1.8995177421561278, + "language_loss": 0.8258518, + "learning_rate": 1.5075785984894549e-06, + "loss": 0.84725767, + "num_input_tokens_seen": 212091415, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.7109375, + "step": 9843, + "time_per_iteration": 2.456085443496704 + }, + { + "auxiliary_loss_clip": 0.01107968, + "auxiliary_loss_mlp": 0.01031114, + "balance_loss_clip": 1.01810205, + "balance_loss_mlp": 1.03701758, + "epoch": 0.5918532992634902, + "flos": 23249609199360.0, + "grad_norm": 2.3414678440212953, + "language_loss": 0.81883448, + "learning_rate": 1.5072011352072875e-06, + "loss": 0.84022528, + "num_input_tokens_seen": 212105255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9844, + "time_per_iteration": 3.834216833114624 + }, + { + "auxiliary_loss_clip": 0.01111099, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01842773, + "balance_loss_mlp": 1.04004455, + "epoch": 0.5919134225161581, + "flos": 19499853726720.0, + "grad_norm": 1.8185302816606077, + "language_loss": 0.74449736, + "learning_rate": 1.5068236906121032e-06, + "loss": 0.76591957, + "num_input_tokens_seen": 212122765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9845, + "time_per_iteration": 2.409029960632324 + }, + { + "auxiliary_loss_clip": 0.01108139, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01324248, + "balance_loss_mlp": 1.03682494, + "epoch": 0.5919735457688261, + "flos": 38800940567040.0, + "grad_norm": 2.2228008907542027, + "language_loss": 0.63848257, + "learning_rate": 1.506446264718213e-06, + "loss": 0.65982717, + "num_input_tokens_seen": 212143960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 9846, + "time_per_iteration": 3.994704246520996 + }, + { + "auxiliary_loss_clip": 0.01100388, + "auxiliary_loss_mlp": 0.01026228, + "balance_loss_clip": 1.01529002, + "balance_loss_mlp": 1.03501678, + "epoch": 0.592033669021494, + "flos": 22163994495360.0, + "grad_norm": 1.7549171077463366, + "language_loss": 0.76315683, + "learning_rate": 1.506068857539931e-06, + "loss": 0.78442299, + "num_input_tokens_seen": 212162005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 9847, + "time_per_iteration": 3.815723419189453 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01892579, + "balance_loss_mlp": 1.03723776, + "epoch": 0.592093792274162, + "flos": 22710985781760.0, + "grad_norm": 1.7391013556086516, + "language_loss": 0.6229955, + "learning_rate": 1.5056914690915667e-06, + "loss": 0.6443814, + "num_input_tokens_seen": 212181635, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9848, + "time_per_iteration": 3.9868550300598145 + }, + { + "auxiliary_loss_clip": 0.01108795, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02532125, + "balance_loss_mlp": 1.03819513, + "epoch": 0.59215391552683, + "flos": 22528954632960.0, + "grad_norm": 2.784596822173483, + "language_loss": 0.75762534, + "learning_rate": 1.5053140993874312e-06, + "loss": 0.77908659, + "num_input_tokens_seen": 212201615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 9849, + "time_per_iteration": 2.4613027572631836 + }, + { + "auxiliary_loss_clip": 0.01108412, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02006471, + "balance_loss_mlp": 1.0370928, + "epoch": 0.592214038779498, + "flos": 24499013921280.0, + "grad_norm": 1.6562680086624124, + "language_loss": 0.75594199, + "learning_rate": 1.5049367484418353e-06, + "loss": 0.77735424, + "num_input_tokens_seen": 212219355, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9850, + "time_per_iteration": 2.5371382236480713 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01676035, + "balance_loss_mlp": 1.03672051, + "epoch": 0.592274162032166, + "flos": 21831353619840.0, + "grad_norm": 1.7347218503083297, + "language_loss": 0.7573396, + "learning_rate": 1.5045594162690868e-06, + "loss": 0.7786963, + "num_input_tokens_seen": 212236710, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 9851, + "time_per_iteration": 2.4500503540039062 + }, + { + "auxiliary_loss_clip": 0.01106705, + "auxiliary_loss_mlp": 0.0103027, + "balance_loss_clip": 1.0179739, + "balance_loss_mlp": 1.03609896, + "epoch": 0.5923342852848339, + "flos": 24608146417920.0, + "grad_norm": 1.818113501506117, + "language_loss": 0.70232719, + "learning_rate": 1.5041821028834954e-06, + "loss": 0.72369695, + "num_input_tokens_seen": 212256195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 9852, + "time_per_iteration": 2.50327205657959 + }, + { + "auxiliary_loss_clip": 0.01112321, + "auxiliary_loss_mlp": 0.01040222, + "balance_loss_clip": 1.02710271, + "balance_loss_mlp": 1.03861785, + "epoch": 0.5923944085375019, + "flos": 19938143479680.0, + "grad_norm": 38.24844963287624, + "language_loss": 0.8025564, + "learning_rate": 1.5038048082993685e-06, + "loss": 0.82408178, + "num_input_tokens_seen": 212274085, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73828125, + "step": 9853, + "time_per_iteration": 2.443661689758301 + }, + { + "auxiliary_loss_clip": 0.01103448, + "auxiliary_loss_mlp": 0.01025904, + "balance_loss_clip": 1.01480556, + "balance_loss_mlp": 1.03603673, + "epoch": 0.5924545317901698, + "flos": 28658510812800.0, + "grad_norm": 1.502563314800498, + "language_loss": 0.67641807, + "learning_rate": 1.5034275325310124e-06, + "loss": 0.69771153, + "num_input_tokens_seen": 212295530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 9854, + "time_per_iteration": 2.5323755741119385 + }, + { + "auxiliary_loss_clip": 0.01105063, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01371408, + "balance_loss_mlp": 1.03610444, + "epoch": 0.5925146550428378, + "flos": 19864885691520.0, + "grad_norm": 1.6522001385368033, + "language_loss": 0.88777542, + "learning_rate": 1.5030502755927344e-06, + "loss": 0.90908301, + "num_input_tokens_seen": 212313770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9855, + "time_per_iteration": 2.4309167861938477 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.01030174, + "balance_loss_clip": 1.01936722, + "balance_loss_mlp": 1.03590918, + "epoch": 0.5925747782955058, + "flos": 15122989681920.0, + "grad_norm": 1.7115668008760792, + "language_loss": 0.86635554, + "learning_rate": 1.5026730374988397e-06, + "loss": 0.88768005, + "num_input_tokens_seen": 212331525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 9856, + "time_per_iteration": 2.464066743850708 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02256656, + "balance_loss_mlp": 1.03562045, + "epoch": 0.5926349015481738, + "flos": 18405440190720.0, + "grad_norm": 2.1473398743532153, + "language_loss": 0.77584958, + "learning_rate": 1.5022958182636332e-06, + "loss": 0.79724526, + "num_input_tokens_seen": 212347295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69921875, + "step": 9857, + "time_per_iteration": 2.4102070331573486 + }, + { + "auxiliary_loss_clip": 0.01109396, + "auxiliary_loss_mlp": 0.01033385, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03954232, + "epoch": 0.5926950248008417, + "flos": 23111138269440.0, + "grad_norm": 1.9751188115052367, + "language_loss": 0.64351666, + "learning_rate": 1.501918617901419e-06, + "loss": 0.66494453, + "num_input_tokens_seen": 212365750, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 9858, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01884377, + "balance_loss_mlp": 1.03700852, + "epoch": 0.5927551480535097, + "flos": 28033916192640.0, + "grad_norm": 1.9049315760209506, + "language_loss": 0.77045393, + "learning_rate": 1.501541436426501e-06, + "loss": 0.79180634, + "num_input_tokens_seen": 212385300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 9859, + "time_per_iteration": 2.478782892227173 + }, + { + "auxiliary_loss_clip": 0.01110235, + "auxiliary_loss_mlp": 0.01033746, + "balance_loss_clip": 1.02082372, + "balance_loss_mlp": 1.03882456, + "epoch": 0.5928152713061776, + "flos": 21798675221760.0, + "grad_norm": 2.1565186381803194, + "language_loss": 0.75153667, + "learning_rate": 1.5011642738531818e-06, + "loss": 0.77297652, + "num_input_tokens_seen": 212402140, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 9860, + "time_per_iteration": 2.4513912200927734 + }, + { + "auxiliary_loss_clip": 0.01106266, + "auxiliary_loss_mlp": 0.01033749, + "balance_loss_clip": 1.02277529, + "balance_loss_mlp": 1.03840578, + "epoch": 0.5928753945588456, + "flos": 24316839118080.0, + "grad_norm": 1.6305970530500205, + "language_loss": 0.76227921, + "learning_rate": 1.500787130195763e-06, + "loss": 0.78367937, + "num_input_tokens_seen": 212421790, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 9861, + "time_per_iteration": 2.474095344543457 + }, + { + "auxiliary_loss_clip": 0.01103657, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.0131923, + "balance_loss_mlp": 1.03595328, + "epoch": 0.5929355178115137, + "flos": 26464619923200.0, + "grad_norm": 1.8413108938997076, + "language_loss": 0.70368218, + "learning_rate": 1.5004100054685465e-06, + "loss": 0.72496319, + "num_input_tokens_seen": 212442115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 9862, + "time_per_iteration": 2.539903402328491 + }, + { + "auxiliary_loss_clip": 0.0110657, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01868796, + "balance_loss_mlp": 1.03706694, + "epoch": 0.5929956410641816, + "flos": 24965995662720.0, + "grad_norm": 1.8355876983877193, + "language_loss": 0.77771485, + "learning_rate": 1.500032899685832e-06, + "loss": 0.7990849, + "num_input_tokens_seen": 212459535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9863, + "time_per_iteration": 2.4712796211242676 + }, + { + "auxiliary_loss_clip": 0.01106967, + "auxiliary_loss_mlp": 0.01038141, + "balance_loss_clip": 1.02583861, + "balance_loss_mlp": 1.03730559, + "epoch": 0.5930557643168496, + "flos": 26208325405440.0, + "grad_norm": 1.8648903136261632, + "language_loss": 0.70763469, + "learning_rate": 1.499655812861921e-06, + "loss": 0.72908574, + "num_input_tokens_seen": 212479385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9864, + "time_per_iteration": 2.52478289604187 + }, + { + "auxiliary_loss_clip": 0.01107547, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.02201343, + "balance_loss_mlp": 1.03711009, + "epoch": 0.5931158875695175, + "flos": 27854937699840.0, + "grad_norm": 2.2141122969684655, + "language_loss": 0.67234761, + "learning_rate": 1.4992787450111112e-06, + "loss": 0.69376296, + "num_input_tokens_seen": 212500060, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 9865, + "time_per_iteration": 2.4957449436187744 + }, + { + "auxiliary_loss_clip": 0.0110929, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01892328, + "balance_loss_mlp": 1.03758049, + "epoch": 0.5931760108221855, + "flos": 15413650536960.0, + "grad_norm": 1.8936144812420768, + "language_loss": 0.78334385, + "learning_rate": 1.4989016961477015e-06, + "loss": 0.8047536, + "num_input_tokens_seen": 212518590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 9866, + "time_per_iteration": 2.4394681453704834 + }, + { + "auxiliary_loss_clip": 0.01105609, + "auxiliary_loss_mlp": 0.01030002, + "balance_loss_clip": 1.01867127, + "balance_loss_mlp": 1.03786838, + "epoch": 0.5932361340748534, + "flos": 30188520581760.0, + "grad_norm": 1.98454003485575, + "language_loss": 0.72037029, + "learning_rate": 1.4985246662859903e-06, + "loss": 0.7417264, + "num_input_tokens_seen": 212538190, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 9867, + "time_per_iteration": 2.5107383728027344 + }, + { + "auxiliary_loss_clip": 0.01107812, + "auxiliary_loss_mlp": 0.01030449, + "balance_loss_clip": 1.01795018, + "balance_loss_mlp": 1.03910947, + "epoch": 0.5932962573275214, + "flos": 20157557708160.0, + "grad_norm": 1.538584883762445, + "language_loss": 0.66726553, + "learning_rate": 1.4981476554402732e-06, + "loss": 0.68864822, + "num_input_tokens_seen": 212557820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 9868, + "time_per_iteration": 2.5143752098083496 + }, + { + "auxiliary_loss_clip": 0.01107645, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.02100754, + "balance_loss_mlp": 1.03726101, + "epoch": 0.5933563805801894, + "flos": 25445906300160.0, + "grad_norm": 1.5720110660148519, + "language_loss": 0.75083476, + "learning_rate": 1.4977706636248478e-06, + "loss": 0.77224427, + "num_input_tokens_seen": 212577645, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9869, + "time_per_iteration": 2.4784231185913086 + }, + { + "auxiliary_loss_clip": 0.01110477, + "auxiliary_loss_mlp": 0.0103956, + "balance_loss_clip": 1.02690041, + "balance_loss_mlp": 1.0391326, + "epoch": 0.5934165038328574, + "flos": 59995740337920.0, + "grad_norm": 1.6442009630814416, + "language_loss": 0.74131197, + "learning_rate": 1.4973936908540091e-06, + "loss": 0.76281238, + "num_input_tokens_seen": 212603430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9870, + "time_per_iteration": 2.8396053314208984 + }, + { + "auxiliary_loss_clip": 0.0111113, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.04010868, + "epoch": 0.5934766270855253, + "flos": 24420548661120.0, + "grad_norm": 1.9765481299651093, + "language_loss": 0.71421361, + "learning_rate": 1.4970167371420517e-06, + "loss": 0.7355839, + "num_input_tokens_seen": 212620730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9871, + "time_per_iteration": 2.460695505142212 + }, + { + "auxiliary_loss_clip": 0.01110046, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.0198555, + "balance_loss_mlp": 1.03879905, + "epoch": 0.5935367503381933, + "flos": 23513158264320.0, + "grad_norm": 1.9723601672672642, + "language_loss": 0.74131697, + "learning_rate": 1.496639802503271e-06, + "loss": 0.76274526, + "num_input_tokens_seen": 212639745, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9872, + "time_per_iteration": 2.4877848625183105 + }, + { + "auxiliary_loss_clip": 0.01111497, + "auxiliary_loss_mlp": 0.01036942, + "balance_loss_clip": 1.02359688, + "balance_loss_mlp": 1.03926826, + "epoch": 0.5935968735908612, + "flos": 18948337326720.0, + "grad_norm": 2.142318153174813, + "language_loss": 0.78675568, + "learning_rate": 1.4962628869519583e-06, + "loss": 0.80824012, + "num_input_tokens_seen": 212655915, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.72265625, + "step": 9873, + "time_per_iteration": 2.4480934143066406 + }, + { + "auxiliary_loss_clip": 0.01109102, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02197838, + "balance_loss_mlp": 1.03843832, + "epoch": 0.5936569968435292, + "flos": 25483433034240.0, + "grad_norm": 1.5306423792742176, + "language_loss": 0.85011673, + "learning_rate": 1.4958859905024078e-06, + "loss": 0.87155473, + "num_input_tokens_seen": 212676115, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 9874, + "time_per_iteration": 2.5098774433135986 + }, + { + "auxiliary_loss_clip": 0.01030749, + "auxiliary_loss_mlp": 0.01001619, + "balance_loss_clip": 1.00044489, + "balance_loss_mlp": 1.00908446, + "epoch": 0.5937171200961973, + "flos": 66378361789440.0, + "grad_norm": 0.6973173617166174, + "language_loss": 0.60004687, + "learning_rate": 1.4955091131689115e-06, + "loss": 0.62037057, + "num_input_tokens_seen": 212737560, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21679688, + "step": 9875, + "time_per_iteration": 3.1099135875701904 + }, + { + "auxiliary_loss_clip": 0.01110933, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.01980412, + "balance_loss_mlp": 1.0373013, + "epoch": 0.5937772433488652, + "flos": 14903467712640.0, + "grad_norm": 2.0699471238582943, + "language_loss": 0.77501059, + "learning_rate": 1.4951322549657594e-06, + "loss": 0.7964499, + "num_input_tokens_seen": 212755365, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 9876, + "time_per_iteration": 2.466031551361084 + }, + { + "auxiliary_loss_clip": 0.01103172, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01652348, + "balance_loss_mlp": 1.03654408, + "epoch": 0.5938373666015332, + "flos": 22561489376640.0, + "grad_norm": 1.5589386174362272, + "language_loss": 0.75830436, + "learning_rate": 1.494755415907243e-06, + "loss": 0.77961862, + "num_input_tokens_seen": 212773875, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9877, + "time_per_iteration": 2.4772722721099854 + }, + { + "auxiliary_loss_clip": 0.01108511, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.01892304, + "balance_loss_mlp": 1.03673589, + "epoch": 0.5938974898542011, + "flos": 18440883936000.0, + "grad_norm": 4.77912842405454, + "language_loss": 0.81212896, + "learning_rate": 1.4943785960076522e-06, + "loss": 0.83353043, + "num_input_tokens_seen": 212790590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 9878, + "time_per_iteration": 2.511408805847168 + }, + { + "auxiliary_loss_clip": 0.01108341, + "auxiliary_loss_mlp": 0.01037126, + "balance_loss_clip": 1.02462077, + "balance_loss_mlp": 1.0378468, + "epoch": 0.5939576131068691, + "flos": 45586728270720.0, + "grad_norm": 1.7027842827521733, + "language_loss": 0.71123505, + "learning_rate": 1.4940017952812754e-06, + "loss": 0.73268974, + "num_input_tokens_seen": 212812265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9879, + "time_per_iteration": 2.6537530422210693 + }, + { + "auxiliary_loss_clip": 0.01107077, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01973653, + "balance_loss_mlp": 1.03814936, + "epoch": 0.594017736359537, + "flos": 23587708942080.0, + "grad_norm": 1.4837097454893722, + "language_loss": 0.5739696, + "learning_rate": 1.493625013742401e-06, + "loss": 0.59536058, + "num_input_tokens_seen": 212831915, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 9880, + "time_per_iteration": 2.487082004547119 + }, + { + "auxiliary_loss_clip": 0.01107055, + "auxiliary_loss_mlp": 0.01037129, + "balance_loss_clip": 1.02435601, + "balance_loss_mlp": 1.03724837, + "epoch": 0.594077859612205, + "flos": 29457235589760.0, + "grad_norm": 1.7845732450958962, + "language_loss": 0.76980609, + "learning_rate": 1.4932482514053177e-06, + "loss": 0.79124796, + "num_input_tokens_seen": 212851350, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 9881, + "time_per_iteration": 2.5019240379333496 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01437569, + "balance_loss_mlp": 1.03524506, + "epoch": 0.594137982864873, + "flos": 16800089644800.0, + "grad_norm": 2.214394269583833, + "language_loss": 0.82820934, + "learning_rate": 1.4928715082843112e-06, + "loss": 0.84953332, + "num_input_tokens_seen": 212867995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 9882, + "time_per_iteration": 2.4258036613464355 + }, + { + "auxiliary_loss_clip": 0.01106542, + "auxiliary_loss_mlp": 0.01035086, + "balance_loss_clip": 1.02321863, + "balance_loss_mlp": 1.03781402, + "epoch": 0.594198106117541, + "flos": 12750263953920.0, + "grad_norm": 2.5324902309588855, + "language_loss": 0.79348171, + "learning_rate": 1.492494784393667e-06, + "loss": 0.81489801, + "num_input_tokens_seen": 212885220, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9883, + "time_per_iteration": 2.4191815853118896 + }, + { + "auxiliary_loss_clip": 0.01112982, + "auxiliary_loss_mlp": 0.01034942, + "balance_loss_clip": 1.0214777, + "balance_loss_mlp": 1.03999424, + "epoch": 0.5942582293702089, + "flos": 20996538652800.0, + "grad_norm": 1.7967272432241739, + "language_loss": 0.74134135, + "learning_rate": 1.4921180797476725e-06, + "loss": 0.7628206, + "num_input_tokens_seen": 212903195, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.73046875, + "step": 9884, + "time_per_iteration": 2.4599032402038574 + }, + { + "auxiliary_loss_clip": 0.01112156, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.0181067, + "balance_loss_mlp": 1.04232001, + "epoch": 0.5943183526228769, + "flos": 28291431772800.0, + "grad_norm": 3.4474311080183964, + "language_loss": 0.6639331, + "learning_rate": 1.4917413943606106e-06, + "loss": 0.68535531, + "num_input_tokens_seen": 212923340, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 9885, + "time_per_iteration": 3.940159797668457 + }, + { + "auxiliary_loss_clip": 0.01107325, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02392602, + "balance_loss_mlp": 1.03891098, + "epoch": 0.5943784758755448, + "flos": 26614619118720.0, + "grad_norm": 2.562196250157405, + "language_loss": 0.77456462, + "learning_rate": 1.4913647282467667e-06, + "loss": 0.79600191, + "num_input_tokens_seen": 212942755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9886, + "time_per_iteration": 2.4958837032318115 + }, + { + "auxiliary_loss_clip": 0.01029578, + "auxiliary_loss_mlp": 0.009997, + "balance_loss_clip": 0.99845427, + "balance_loss_mlp": 1.00789237, + "epoch": 0.5944385991282128, + "flos": 64190935347840.0, + "grad_norm": 0.8479500751523403, + "language_loss": 0.64580774, + "learning_rate": 1.490988081420423e-06, + "loss": 0.6661005, + "num_input_tokens_seen": 212999355, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21679688, + "step": 9887, + "time_per_iteration": 4.312393426895142 + }, + { + "auxiliary_loss_clip": 0.01106228, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01911473, + "balance_loss_mlp": 1.03743696, + "epoch": 0.5944987223808808, + "flos": 19571998193280.0, + "grad_norm": 1.9767325567336362, + "language_loss": 0.69172513, + "learning_rate": 1.4906114538958615e-06, + "loss": 0.71310121, + "num_input_tokens_seen": 213018570, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9888, + "time_per_iteration": 3.8631362915039062 + }, + { + "auxiliary_loss_clip": 0.01108213, + "auxiliary_loss_mlp": 0.01030472, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03916407, + "epoch": 0.5945588456335488, + "flos": 26177586341760.0, + "grad_norm": 1.5956528037649322, + "language_loss": 0.79466522, + "learning_rate": 1.490234845687366e-06, + "loss": 0.81605208, + "num_input_tokens_seen": 213037735, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 9889, + "time_per_iteration": 4.0321431159973145 + }, + { + "auxiliary_loss_clip": 0.01105021, + "auxiliary_loss_mlp": 0.01031056, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03607225, + "epoch": 0.5946189688862168, + "flos": 20446494710400.0, + "grad_norm": 1.529319229595301, + "language_loss": 0.70732993, + "learning_rate": 1.4898582568092154e-06, + "loss": 0.72869068, + "num_input_tokens_seen": 213057160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9890, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01110328, + "auxiliary_loss_mlp": 0.01032793, + "balance_loss_clip": 1.01994216, + "balance_loss_mlp": 1.03921902, + "epoch": 0.5946790921388847, + "flos": 13437521850240.0, + "grad_norm": 2.2570879506032933, + "language_loss": 0.69334114, + "learning_rate": 1.489481687275691e-06, + "loss": 0.71477234, + "num_input_tokens_seen": 213073630, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 9891, + "time_per_iteration": 2.4280505180358887 + }, + { + "auxiliary_loss_clip": 0.01106776, + "auxiliary_loss_mlp": 0.01036094, + "balance_loss_clip": 1.02376795, + "balance_loss_mlp": 1.03809762, + "epoch": 0.5947392153915527, + "flos": 20412272027520.0, + "grad_norm": 1.752140694177181, + "language_loss": 0.53531826, + "learning_rate": 1.4891051371010726e-06, + "loss": 0.55674696, + "num_input_tokens_seen": 213092450, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 9892, + "time_per_iteration": 2.4815757274627686 + }, + { + "auxiliary_loss_clip": 0.01030384, + "auxiliary_loss_mlp": 0.0100208, + "balance_loss_clip": 1.00095928, + "balance_loss_mlp": 1.00874603, + "epoch": 0.5947993386442206, + "flos": 65619138994560.0, + "grad_norm": 0.6588951163028871, + "language_loss": 0.54535234, + "learning_rate": 1.4887286062996375e-06, + "loss": 0.56567693, + "num_input_tokens_seen": 213155465, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21679688, + "step": 9893, + "time_per_iteration": 3.1101529598236084 + }, + { + "auxiliary_loss_clip": 0.01106079, + "auxiliary_loss_mlp": 0.01030799, + "balance_loss_clip": 1.01892543, + "balance_loss_mlp": 1.03811431, + "epoch": 0.5948594618968887, + "flos": 23183103168000.0, + "grad_norm": 1.588107459430707, + "language_loss": 0.74231315, + "learning_rate": 1.4883520948856658e-06, + "loss": 0.76368201, + "num_input_tokens_seen": 213174875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 9894, + "time_per_iteration": 2.4519400596618652 + }, + { + "auxiliary_loss_clip": 0.01106074, + "auxiliary_loss_mlp": 0.01032085, + "balance_loss_clip": 1.02005649, + "balance_loss_mlp": 1.03685939, + "epoch": 0.5949195851495566, + "flos": 13626771632640.0, + "grad_norm": 1.6911288792838162, + "language_loss": 0.77848423, + "learning_rate": 1.487975602873434e-06, + "loss": 0.79986584, + "num_input_tokens_seen": 213192695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9895, + "time_per_iteration": 2.524150848388672 + }, + { + "auxiliary_loss_clip": 0.01110174, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.01923883, + "balance_loss_mlp": 1.0391717, + "epoch": 0.5949797084022246, + "flos": 19751012599680.0, + "grad_norm": 1.6627914614590094, + "language_loss": 0.79355633, + "learning_rate": 1.4875991302772182e-06, + "loss": 0.814978, + "num_input_tokens_seen": 213211195, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9896, + "time_per_iteration": 2.450514078140259 + }, + { + "auxiliary_loss_clip": 0.01108131, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.0379312, + "epoch": 0.5950398316548925, + "flos": 25773878407680.0, + "grad_norm": 1.56691412182982, + "language_loss": 0.83697438, + "learning_rate": 1.4872226771112954e-06, + "loss": 0.8583793, + "num_input_tokens_seen": 213231975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.703125, + "step": 9897, + "time_per_iteration": 2.499427556991577 + }, + { + "auxiliary_loss_clip": 0.0111126, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02043986, + "balance_loss_mlp": 1.04021525, + "epoch": 0.5950999549075605, + "flos": 23039029716480.0, + "grad_norm": 1.7628400615055348, + "language_loss": 0.70908117, + "learning_rate": 1.486846243389939e-06, + "loss": 0.7305249, + "num_input_tokens_seen": 213249760, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9898, + "time_per_iteration": 2.450711488723755 + }, + { + "auxiliary_loss_clip": 0.01114075, + "auxiliary_loss_mlp": 0.01038747, + "balance_loss_clip": 1.02481782, + "balance_loss_mlp": 1.03905582, + "epoch": 0.5951600781602284, + "flos": 32446367637120.0, + "grad_norm": 2.840239375448059, + "language_loss": 0.64112437, + "learning_rate": 1.4864698291274251e-06, + "loss": 0.66265255, + "num_input_tokens_seen": 213269890, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.75, + "step": 9899, + "time_per_iteration": 2.5394744873046875 + }, + { + "auxiliary_loss_clip": 0.01109128, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01592183, + "balance_loss_mlp": 1.04008675, + "epoch": 0.5952202014128964, + "flos": 23800874204160.0, + "grad_norm": 1.879978941191363, + "language_loss": 0.71715653, + "learning_rate": 1.4860934343380267e-06, + "loss": 0.73851436, + "num_input_tokens_seen": 213289400, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6875, + "step": 9900, + "time_per_iteration": 2.4623067378997803 + }, + { + "auxiliary_loss_clip": 0.01107194, + "auxiliary_loss_mlp": 0.01029325, + "balance_loss_clip": 1.01654577, + "balance_loss_mlp": 1.03926349, + "epoch": 0.5952803246655644, + "flos": 22492182084480.0, + "grad_norm": 1.9859766918367532, + "language_loss": 0.84489024, + "learning_rate": 1.4857170590360169e-06, + "loss": 0.86625552, + "num_input_tokens_seen": 213308040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 9901, + "time_per_iteration": 2.4463791847229004 + }, + { + "auxiliary_loss_clip": 0.01028301, + "auxiliary_loss_mlp": 0.01003723, + "balance_loss_clip": 1.00249529, + "balance_loss_mlp": 1.00672269, + "epoch": 0.5953404479182324, + "flos": 51234688851840.0, + "grad_norm": 0.8098587011957621, + "language_loss": 0.58273184, + "learning_rate": 1.4853407032356674e-06, + "loss": 0.60305208, + "num_input_tokens_seen": 213358585, + "router_z_loss_clip": 0.01226807, + "router_z_loss_mlp": 0.21582031, + "step": 9902, + "time_per_iteration": 2.9000015258789062 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01876402, + "balance_loss_mlp": 1.03859127, + "epoch": 0.5954005711709004, + "flos": 23112682554240.0, + "grad_norm": 3.08671627053405, + "language_loss": 0.77136552, + "learning_rate": 1.4849643669512503e-06, + "loss": 0.79277885, + "num_input_tokens_seen": 213379585, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9903, + "time_per_iteration": 2.5076375007629395 + }, + { + "auxiliary_loss_clip": 0.01111406, + "auxiliary_loss_mlp": 0.01036008, + "balance_loss_clip": 1.02430773, + "balance_loss_mlp": 1.04097402, + "epoch": 0.5954606944235683, + "flos": 35954732736000.0, + "grad_norm": 1.7111155417857251, + "language_loss": 0.77616894, + "learning_rate": 1.4845880501970362e-06, + "loss": 0.79764313, + "num_input_tokens_seen": 213401465, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.703125, + "step": 9904, + "time_per_iteration": 2.5716845989227295 + }, + { + "auxiliary_loss_clip": 0.01110151, + "auxiliary_loss_mlp": 0.01036445, + "balance_loss_clip": 1.02405953, + "balance_loss_mlp": 1.03790653, + "epoch": 0.5955208176762363, + "flos": 30443665864320.0, + "grad_norm": 2.2036474032145192, + "language_loss": 0.72382712, + "learning_rate": 1.4842117529872942e-06, + "loss": 0.74529308, + "num_input_tokens_seen": 213422720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.72265625, + "step": 9905, + "time_per_iteration": 2.5354321002960205 + }, + { + "auxiliary_loss_clip": 0.01109645, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01789069, + "balance_loss_mlp": 1.03853083, + "epoch": 0.5955809409289042, + "flos": 17640112083840.0, + "grad_norm": 1.6203597758298474, + "language_loss": 0.69817066, + "learning_rate": 1.483835475336295e-06, + "loss": 0.71957242, + "num_input_tokens_seen": 213439480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9906, + "time_per_iteration": 2.4373247623443604 + }, + { + "auxiliary_loss_clip": 0.01110789, + "auxiliary_loss_mlp": 0.01035293, + "balance_loss_clip": 1.02259731, + "balance_loss_mlp": 1.03987217, + "epoch": 0.5956410641815723, + "flos": 24279887001600.0, + "grad_norm": 1.782354761153575, + "language_loss": 0.7491982, + "learning_rate": 1.4834592172583057e-06, + "loss": 0.77065903, + "num_input_tokens_seen": 213458895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 9907, + "time_per_iteration": 2.5548195838928223 + }, + { + "auxiliary_loss_clip": 0.01109413, + "auxiliary_loss_mlp": 0.01035553, + "balance_loss_clip": 1.02353668, + "balance_loss_mlp": 1.0388813, + "epoch": 0.5957011874342402, + "flos": 35734277013120.0, + "grad_norm": 1.601142913290667, + "language_loss": 0.67155874, + "learning_rate": 1.483082978767595e-06, + "loss": 0.69300842, + "num_input_tokens_seen": 213481730, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 9908, + "time_per_iteration": 2.5727956295013428 + }, + { + "auxiliary_loss_clip": 0.01108392, + "auxiliary_loss_mlp": 0.01029453, + "balance_loss_clip": 1.01753211, + "balance_loss_mlp": 1.03904438, + "epoch": 0.5957613106869082, + "flos": 21245004005760.0, + "grad_norm": 5.1100613292928365, + "language_loss": 0.76492268, + "learning_rate": 1.4827067598784298e-06, + "loss": 0.78630114, + "num_input_tokens_seen": 213497225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 9909, + "time_per_iteration": 2.459608554840088 + }, + { + "auxiliary_loss_clip": 0.01028544, + "auxiliary_loss_mlp": 0.01005303, + "balance_loss_clip": 1.00416493, + "balance_loss_mlp": 1.00715542, + "epoch": 0.5958214339395761, + "flos": 65940969876480.0, + "grad_norm": 0.9275868367088792, + "language_loss": 0.73427647, + "learning_rate": 1.4823305606050753e-06, + "loss": 0.75461495, + "num_input_tokens_seen": 213556890, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21386719, + "step": 9910, + "time_per_iteration": 3.1051745414733887 + }, + { + "auxiliary_loss_clip": 0.01108818, + "auxiliary_loss_mlp": 0.01032907, + "balance_loss_clip": 1.01981187, + "balance_loss_mlp": 1.03741884, + "epoch": 0.5958815571922441, + "flos": 23218690567680.0, + "grad_norm": 1.6458105124951614, + "language_loss": 0.69844317, + "learning_rate": 1.481954380961799e-06, + "loss": 0.71986043, + "num_input_tokens_seen": 213575800, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71484375, + "step": 9911, + "time_per_iteration": 2.4647021293640137 + }, + { + "auxiliary_loss_clip": 0.01116428, + "auxiliary_loss_mlp": 0.01033668, + "balance_loss_clip": 1.02031708, + "balance_loss_mlp": 1.04145718, + "epoch": 0.595941680444912, + "flos": 16538623568640.0, + "grad_norm": 1.8630263408862686, + "language_loss": 0.65476716, + "learning_rate": 1.4815782209628631e-06, + "loss": 0.6762681, + "num_input_tokens_seen": 213592740, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.75, + "step": 9912, + "time_per_iteration": 2.4077272415161133 + }, + { + "auxiliary_loss_clip": 0.01108551, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02385449, + "balance_loss_mlp": 1.03806984, + "epoch": 0.59600180369758, + "flos": 27818883423360.0, + "grad_norm": 2.0476871057930772, + "language_loss": 0.73610109, + "learning_rate": 1.4812020806225337e-06, + "loss": 0.75755352, + "num_input_tokens_seen": 213611970, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9913, + "time_per_iteration": 2.5155045986175537 + }, + { + "auxiliary_loss_clip": 0.01113961, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.0178144, + "balance_loss_mlp": 1.03791463, + "epoch": 0.596061926950248, + "flos": 29491566013440.0, + "grad_norm": 2.0765652786465885, + "language_loss": 0.79696703, + "learning_rate": 1.4808259599550738e-06, + "loss": 0.81841141, + "num_input_tokens_seen": 213632230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.76171875, + "step": 9914, + "time_per_iteration": 2.4950027465820312 + }, + { + "auxiliary_loss_clip": 0.01107633, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.0189786, + "balance_loss_mlp": 1.03856075, + "epoch": 0.596122050202916, + "flos": 16836790366080.0, + "grad_norm": 1.9745402695948293, + "language_loss": 0.67218065, + "learning_rate": 1.4804498589747448e-06, + "loss": 0.69356596, + "num_input_tokens_seen": 213649645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 9915, + "time_per_iteration": 2.43723726272583 + }, + { + "auxiliary_loss_clip": 0.01107786, + "auxiliary_loss_mlp": 0.01035427, + "balance_loss_clip": 1.02319074, + "balance_loss_mlp": 1.03634763, + "epoch": 0.596182173455584, + "flos": 20996646393600.0, + "grad_norm": 1.613453800947639, + "language_loss": 0.78928566, + "learning_rate": 1.4800737776958095e-06, + "loss": 0.81071782, + "num_input_tokens_seen": 213668850, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 9916, + "time_per_iteration": 2.456350088119507 + }, + { + "auxiliary_loss_clip": 0.01108915, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.01933253, + "balance_loss_mlp": 1.03744936, + "epoch": 0.5962422967082519, + "flos": 16065680169600.0, + "grad_norm": 1.7690461818627004, + "language_loss": 0.82394695, + "learning_rate": 1.4796977161325286e-06, + "loss": 0.84535682, + "num_input_tokens_seen": 213685695, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71484375, + "step": 9917, + "time_per_iteration": 2.469238758087158 + }, + { + "auxiliary_loss_clip": 0.01108059, + "auxiliary_loss_mlp": 0.01035556, + "balance_loss_clip": 1.02383804, + "balance_loss_mlp": 1.03837276, + "epoch": 0.5963024199609199, + "flos": 12166966995840.0, + "grad_norm": 1.817824058021054, + "language_loss": 0.77982944, + "learning_rate": 1.4793216742991625e-06, + "loss": 0.8012656, + "num_input_tokens_seen": 213703515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 9918, + "time_per_iteration": 2.4436004161834717 + }, + { + "auxiliary_loss_clip": 0.01109399, + "auxiliary_loss_mlp": 0.01034518, + "balance_loss_clip": 1.02182257, + "balance_loss_mlp": 1.0390811, + "epoch": 0.5963625432135878, + "flos": 28074280101120.0, + "grad_norm": 1.422582146168897, + "language_loss": 0.78566158, + "learning_rate": 1.4789456522099707e-06, + "loss": 0.80710077, + "num_input_tokens_seen": 213724170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9919, + "time_per_iteration": 2.5787289142608643 + }, + { + "auxiliary_loss_clip": 0.01107781, + "auxiliary_loss_mlp": 0.01034192, + "balance_loss_clip": 1.02094173, + "balance_loss_mlp": 1.0381664, + "epoch": 0.5964226664662559, + "flos": 19860324664320.0, + "grad_norm": 1.9239790966111896, + "language_loss": 0.77425951, + "learning_rate": 1.4785696498792122e-06, + "loss": 0.79567927, + "num_input_tokens_seen": 213740620, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 9920, + "time_per_iteration": 2.4440083503723145 + }, + { + "auxiliary_loss_clip": 0.01113744, + "auxiliary_loss_mlp": 0.01030569, + "balance_loss_clip": 1.01843953, + "balance_loss_mlp": 1.04212332, + "epoch": 0.5964827897189238, + "flos": 12932618325120.0, + "grad_norm": 2.2435260632361733, + "language_loss": 0.82452321, + "learning_rate": 1.4781936673211446e-06, + "loss": 0.84596634, + "num_input_tokens_seen": 213755390, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71484375, + "step": 9921, + "time_per_iteration": 2.456138849258423 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01032844, + "balance_loss_clip": 1.02021408, + "balance_loss_mlp": 1.0373764, + "epoch": 0.5965429129715918, + "flos": 18150797698560.0, + "grad_norm": 1.9967408520895134, + "language_loss": 0.80682462, + "learning_rate": 1.4778177045500252e-06, + "loss": 0.82823324, + "num_input_tokens_seen": 213773225, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 9922, + "time_per_iteration": 2.4144599437713623 + }, + { + "auxiliary_loss_clip": 0.0110795, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01693821, + "balance_loss_mlp": 1.03790641, + "epoch": 0.5966030362242597, + "flos": 21763231476480.0, + "grad_norm": 1.7485306495183626, + "language_loss": 0.77080536, + "learning_rate": 1.477441761580111e-06, + "loss": 0.79218084, + "num_input_tokens_seen": 213791860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 9923, + "time_per_iteration": 2.489145517349243 + }, + { + "auxiliary_loss_clip": 0.01115253, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02174497, + "balance_loss_mlp": 1.04084301, + "epoch": 0.5966631594769277, + "flos": 18807208790400.0, + "grad_norm": 1.7680593419575392, + "language_loss": 0.75725371, + "learning_rate": 1.4770658384256573e-06, + "loss": 0.77876449, + "num_input_tokens_seen": 213809455, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7421875, + "step": 9924, + "time_per_iteration": 2.4216740131378174 + }, + { + "auxiliary_loss_clip": 0.01105499, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.0190742, + "balance_loss_mlp": 1.03832626, + "epoch": 0.5967232827295956, + "flos": 14064163545600.0, + "grad_norm": 3.198852886281723, + "language_loss": 0.6646719, + "learning_rate": 1.4766899351009204e-06, + "loss": 0.68604732, + "num_input_tokens_seen": 213826615, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 9925, + "time_per_iteration": 2.4475882053375244 + }, + { + "auxiliary_loss_clip": 0.01109319, + "auxiliary_loss_mlp": 0.01032439, + "balance_loss_clip": 1.01986837, + "balance_loss_mlp": 1.04157531, + "epoch": 0.5967834059822636, + "flos": 17238235743360.0, + "grad_norm": 2.375187864026988, + "language_loss": 0.71979719, + "learning_rate": 1.4763140516201528e-06, + "loss": 0.74121475, + "num_input_tokens_seen": 213844495, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 9926, + "time_per_iteration": 2.4132394790649414 + }, + { + "auxiliary_loss_clip": 0.01111749, + "auxiliary_loss_mlp": 0.01033269, + "balance_loss_clip": 1.02014971, + "balance_loss_mlp": 1.03978753, + "epoch": 0.5968435292349316, + "flos": 42520244284800.0, + "grad_norm": 1.812838696961727, + "language_loss": 0.70522958, + "learning_rate": 1.4759381879976088e-06, + "loss": 0.7266798, + "num_input_tokens_seen": 213869125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9927, + "time_per_iteration": 4.071920156478882 + }, + { + "auxiliary_loss_clip": 0.01112228, + "auxiliary_loss_mlp": 0.01031173, + "balance_loss_clip": 1.01779175, + "balance_loss_mlp": 1.03788543, + "epoch": 0.5969036524875996, + "flos": 37630898945280.0, + "grad_norm": 1.756068652476383, + "language_loss": 0.63428164, + "learning_rate": 1.4755623442475415e-06, + "loss": 0.65571564, + "num_input_tokens_seen": 213891115, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7421875, + "step": 9928, + "time_per_iteration": 2.616556406021118 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029748, + "balance_loss_clip": 1.01774395, + "balance_loss_mlp": 1.0362494, + "epoch": 0.5969637757402676, + "flos": 23148377694720.0, + "grad_norm": 1.5985801618436777, + "language_loss": 0.69484866, + "learning_rate": 1.4751865203842022e-06, + "loss": 0.71619892, + "num_input_tokens_seen": 213911925, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 9929, + "time_per_iteration": 3.929401397705078 + }, + { + "auxiliary_loss_clip": 0.01106506, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.02314126, + "balance_loss_mlp": 1.0390749, + "epoch": 0.5970238989929355, + "flos": 24020934877440.0, + "grad_norm": 1.8723634053132125, + "language_loss": 0.7651577, + "learning_rate": 1.4748107164218431e-06, + "loss": 0.78656977, + "num_input_tokens_seen": 213930715, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 9930, + "time_per_iteration": 3.9201314449310303 + }, + { + "auxiliary_loss_clip": 0.01114181, + "auxiliary_loss_mlp": 0.01032152, + "balance_loss_clip": 1.01845503, + "balance_loss_mlp": 1.04086351, + "epoch": 0.5970840222456035, + "flos": 19426883247360.0, + "grad_norm": 1.7493285690141849, + "language_loss": 0.69032001, + "learning_rate": 1.4744349323747146e-06, + "loss": 0.71178329, + "num_input_tokens_seen": 213950015, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.734375, + "step": 9931, + "time_per_iteration": 3.879492998123169 + }, + { + "auxiliary_loss_clip": 0.01027027, + "auxiliary_loss_mlp": 0.00997139, + "balance_loss_clip": 0.99597675, + "balance_loss_mlp": 1.00581264, + "epoch": 0.5971441454982714, + "flos": 62976615235200.0, + "grad_norm": 0.8633082810339764, + "language_loss": 0.64247859, + "learning_rate": 1.474059168257065e-06, + "loss": 0.6627202, + "num_input_tokens_seen": 214003330, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21289062, + "step": 9932, + "time_per_iteration": 2.985929489135742 + }, + { + "auxiliary_loss_clip": 0.01109379, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01604247, + "balance_loss_mlp": 1.03876853, + "epoch": 0.5972042687509395, + "flos": 20266223328000.0, + "grad_norm": 1.8784919283093424, + "language_loss": 0.74257267, + "learning_rate": 1.4736834240831454e-06, + "loss": 0.76396132, + "num_input_tokens_seen": 214021680, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.70703125, + "step": 9933, + "time_per_iteration": 2.4789366722106934 + }, + { + "auxiliary_loss_clip": 0.01027236, + "auxiliary_loss_mlp": 0.00998624, + "balance_loss_clip": 0.997509, + "balance_loss_mlp": 1.00592136, + "epoch": 0.5972643920036074, + "flos": 71652383832960.0, + "grad_norm": 0.6667374312128803, + "language_loss": 0.51967168, + "learning_rate": 1.473307699867203e-06, + "loss": 0.53993034, + "num_input_tokens_seen": 214090265, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.21289062, + "step": 9934, + "time_per_iteration": 3.181849956512451 + }, + { + "auxiliary_loss_clip": 0.01027661, + "auxiliary_loss_mlp": 0.00997349, + "balance_loss_clip": 0.99616891, + "balance_loss_mlp": 1.00641167, + "epoch": 0.5973245152562754, + "flos": 56892702263040.0, + "grad_norm": 0.8444164965298677, + "language_loss": 0.54164159, + "learning_rate": 1.4729319956234849e-06, + "loss": 0.56189167, + "num_input_tokens_seen": 214146375, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.21289062, + "step": 9935, + "time_per_iteration": 2.997821807861328 + }, + { + "auxiliary_loss_clip": 0.01108103, + "auxiliary_loss_mlp": 0.01034314, + "balance_loss_clip": 1.02102828, + "balance_loss_mlp": 1.03731823, + "epoch": 0.5973846385089433, + "flos": 24164361884160.0, + "grad_norm": 1.5699606989571269, + "language_loss": 0.65541828, + "learning_rate": 1.4725563113662394e-06, + "loss": 0.67684245, + "num_input_tokens_seen": 214165340, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 9936, + "time_per_iteration": 2.533317804336548 + }, + { + "auxiliary_loss_clip": 0.01110253, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.02026367, + "balance_loss_mlp": 1.03937888, + "epoch": 0.5974447617616113, + "flos": 17670599752320.0, + "grad_norm": 2.0123537966767797, + "language_loss": 0.67731905, + "learning_rate": 1.4721806471097103e-06, + "loss": 0.69874215, + "num_input_tokens_seen": 214181360, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.7109375, + "step": 9937, + "time_per_iteration": 2.4379465579986572 + }, + { + "auxiliary_loss_clip": 0.01112093, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.02101064, + "balance_loss_mlp": 1.03899479, + "epoch": 0.5975048850142792, + "flos": 22892514140160.0, + "grad_norm": 3.133342754143776, + "language_loss": 0.77174151, + "learning_rate": 1.4718050028681442e-06, + "loss": 0.79320574, + "num_input_tokens_seen": 214198525, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9938, + "time_per_iteration": 2.470590114593506 + }, + { + "auxiliary_loss_clip": 0.01110044, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01708603, + "balance_loss_mlp": 1.03813004, + "epoch": 0.5975650082669473, + "flos": 24353108876160.0, + "grad_norm": 1.6192850653818303, + "language_loss": 0.75987661, + "learning_rate": 1.4714293786557855e-06, + "loss": 0.78127742, + "num_input_tokens_seen": 214218710, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 9939, + "time_per_iteration": 2.477731227874756 + }, + { + "auxiliary_loss_clip": 0.01113496, + "auxiliary_loss_mlp": 0.01030328, + "balance_loss_clip": 1.01565337, + "balance_loss_mlp": 1.03811717, + "epoch": 0.5976251315196152, + "flos": 20923352691840.0, + "grad_norm": 2.2637964874634124, + "language_loss": 0.6840167, + "learning_rate": 1.471053774486878e-06, + "loss": 0.70545495, + "num_input_tokens_seen": 214237800, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.75390625, + "step": 9940, + "time_per_iteration": 2.4641294479370117 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01033159, + "balance_loss_clip": 1.02150035, + "balance_loss_mlp": 1.03630126, + "epoch": 0.5976852547722832, + "flos": 35844594658560.0, + "grad_norm": 1.3031499437689418, + "language_loss": 0.70227146, + "learning_rate": 1.470678190375664e-06, + "loss": 0.72364092, + "num_input_tokens_seen": 214260355, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 9941, + "time_per_iteration": 2.644956111907959 + }, + { + "auxiliary_loss_clip": 0.01103617, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02042711, + "balance_loss_mlp": 1.0345757, + "epoch": 0.5977453780249512, + "flos": 12855948744960.0, + "grad_norm": 2.0310172288776456, + "language_loss": 0.77255404, + "learning_rate": 1.470302626336386e-06, + "loss": 0.79392433, + "num_input_tokens_seen": 214277120, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 9942, + "time_per_iteration": 2.4575772285461426 + }, + { + "auxiliary_loss_clip": 0.01108233, + "auxiliary_loss_mlp": 0.01040799, + "balance_loss_clip": 1.02815676, + "balance_loss_mlp": 1.03664815, + "epoch": 0.5978055012776191, + "flos": 20959155573120.0, + "grad_norm": 1.8744137632140625, + "language_loss": 0.7585178, + "learning_rate": 1.4699270823832857e-06, + "loss": 0.78000808, + "num_input_tokens_seen": 214295300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 9943, + "time_per_iteration": 2.4413061141967773 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01030161, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.03699136, + "epoch": 0.5978656245302871, + "flos": 34058003063040.0, + "grad_norm": 1.7396443017276344, + "language_loss": 0.61821425, + "learning_rate": 1.4695515585306032e-06, + "loss": 0.63956803, + "num_input_tokens_seen": 214317050, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 9944, + "time_per_iteration": 2.569403886795044 + }, + { + "auxiliary_loss_clip": 0.01110079, + "auxiliary_loss_mlp": 0.01035221, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0391618, + "epoch": 0.597925747782955, + "flos": 37373275624320.0, + "grad_norm": 1.6935047887113677, + "language_loss": 0.72621685, + "learning_rate": 1.4691760547925795e-06, + "loss": 0.74766988, + "num_input_tokens_seen": 214337470, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9945, + "time_per_iteration": 2.5811283588409424 + }, + { + "auxiliary_loss_clip": 0.0110514, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02017426, + "balance_loss_mlp": 1.03536916, + "epoch": 0.5979858710356231, + "flos": 25374803328000.0, + "grad_norm": 2.0883326121528443, + "language_loss": 0.67156124, + "learning_rate": 1.4688005711834522e-06, + "loss": 0.69294119, + "num_input_tokens_seen": 214357975, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 9946, + "time_per_iteration": 2.513643503189087 + }, + { + "auxiliary_loss_clip": 0.01111839, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.02280676, + "balance_loss_mlp": 1.03886974, + "epoch": 0.598045994288291, + "flos": 13698413308800.0, + "grad_norm": 2.0799446912413386, + "language_loss": 0.88996196, + "learning_rate": 1.468425107717461e-06, + "loss": 0.91144222, + "num_input_tokens_seen": 214374125, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.73046875, + "step": 9947, + "time_per_iteration": 2.4069466590881348 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01035974, + "balance_loss_clip": 1.02501893, + "balance_loss_mlp": 1.03634834, + "epoch": 0.598106117540959, + "flos": 21981352815360.0, + "grad_norm": 1.664735448435926, + "language_loss": 0.72050726, + "learning_rate": 1.4680496644088432e-06, + "loss": 0.74189186, + "num_input_tokens_seen": 214393395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 9948, + "time_per_iteration": 2.474961280822754 + }, + { + "auxiliary_loss_clip": 0.01107668, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01749587, + "balance_loss_mlp": 1.03676891, + "epoch": 0.5981662407936269, + "flos": 20559362221440.0, + "grad_norm": 1.8018456141940389, + "language_loss": 0.89439249, + "learning_rate": 1.4676742412718347e-06, + "loss": 0.91578257, + "num_input_tokens_seen": 214411550, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 9949, + "time_per_iteration": 2.455151319503784 + }, + { + "auxiliary_loss_clip": 0.0110613, + "auxiliary_loss_mlp": 0.01026216, + "balance_loss_clip": 1.01458669, + "balance_loss_mlp": 1.03746963, + "epoch": 0.5982263640462949, + "flos": 14063840323200.0, + "grad_norm": 1.9594093526491967, + "language_loss": 0.70425475, + "learning_rate": 1.467298838320673e-06, + "loss": 0.72557819, + "num_input_tokens_seen": 214429780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 9950, + "time_per_iteration": 2.479177474975586 + }, + { + "auxiliary_loss_clip": 0.01106992, + "auxiliary_loss_mlp": 0.01030869, + "balance_loss_clip": 1.01816094, + "balance_loss_mlp": 1.03653646, + "epoch": 0.5982864872989628, + "flos": 17707228646400.0, + "grad_norm": 1.7839667170115563, + "language_loss": 0.78153586, + "learning_rate": 1.4669234555695921e-06, + "loss": 0.8029145, + "num_input_tokens_seen": 214447775, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9951, + "time_per_iteration": 2.4318583011627197 + }, + { + "auxiliary_loss_clip": 0.01108258, + "auxiliary_loss_mlp": 0.01042077, + "balance_loss_clip": 1.02885103, + "balance_loss_mlp": 1.03666139, + "epoch": 0.5983466105516309, + "flos": 16764789553920.0, + "grad_norm": 6.7296631151691235, + "language_loss": 0.73816681, + "learning_rate": 1.4665480930328275e-06, + "loss": 0.75967014, + "num_input_tokens_seen": 214467245, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 9952, + "time_per_iteration": 2.4669008255004883 + }, + { + "auxiliary_loss_clip": 0.01109291, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01705003, + "balance_loss_mlp": 1.03699803, + "epoch": 0.5984067338042988, + "flos": 20042714949120.0, + "grad_norm": 2.1100044837404264, + "language_loss": 0.78595901, + "learning_rate": 1.466172750724613e-06, + "loss": 0.8073597, + "num_input_tokens_seen": 214484385, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 9953, + "time_per_iteration": 2.432607650756836 + }, + { + "auxiliary_loss_clip": 0.01106295, + "auxiliary_loss_mlp": 0.01030481, + "balance_loss_clip": 1.01883411, + "balance_loss_mlp": 1.03698087, + "epoch": 0.5984668570569668, + "flos": 26319900026880.0, + "grad_norm": 1.6558066102502929, + "language_loss": 0.69747621, + "learning_rate": 1.4657974286591807e-06, + "loss": 0.71884394, + "num_input_tokens_seen": 214503465, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 9954, + "time_per_iteration": 2.5316383838653564 + }, + { + "auxiliary_loss_clip": 0.01106341, + "auxiliary_loss_mlp": 0.01031118, + "balance_loss_clip": 1.01923835, + "balance_loss_mlp": 1.03664923, + "epoch": 0.5985269803096348, + "flos": 20593728558720.0, + "grad_norm": 1.7741106098423227, + "language_loss": 0.73212743, + "learning_rate": 1.4654221268507637e-06, + "loss": 0.75350201, + "num_input_tokens_seen": 214520725, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 9955, + "time_per_iteration": 2.457697629928589 + }, + { + "auxiliary_loss_clip": 0.01107558, + "auxiliary_loss_mlp": 0.01030352, + "balance_loss_clip": 1.01816237, + "balance_loss_mlp": 1.03694773, + "epoch": 0.5985871035623027, + "flos": 26865382942080.0, + "grad_norm": 1.8276717412391432, + "language_loss": 0.68681955, + "learning_rate": 1.4650468453135934e-06, + "loss": 0.70819867, + "num_input_tokens_seen": 214540675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9956, + "time_per_iteration": 2.5265135765075684 + }, + { + "auxiliary_loss_clip": 0.01109542, + "auxiliary_loss_mlp": 0.01031362, + "balance_loss_clip": 1.0191431, + "balance_loss_mlp": 1.03873038, + "epoch": 0.5986472268149707, + "flos": 19609704495360.0, + "grad_norm": 2.224432093074028, + "language_loss": 0.73662853, + "learning_rate": 1.4646715840618999e-06, + "loss": 0.75803757, + "num_input_tokens_seen": 214559910, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 9957, + "time_per_iteration": 2.4384164810180664 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.0102626, + "balance_loss_clip": 1.01433289, + "balance_loss_mlp": 1.03838789, + "epoch": 0.5987073500676386, + "flos": 21794616984960.0, + "grad_norm": 1.875022862600817, + "language_loss": 0.84732842, + "learning_rate": 1.4642963431099138e-06, + "loss": 0.86864293, + "num_input_tokens_seen": 214575960, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 9958, + "time_per_iteration": 2.501417636871338 + }, + { + "auxiliary_loss_clip": 0.01109112, + "auxiliary_loss_mlp": 0.01037074, + "balance_loss_clip": 1.02396715, + "balance_loss_mlp": 1.03740525, + "epoch": 0.5987674733203067, + "flos": 24314361079680.0, + "grad_norm": 2.024494152709453, + "language_loss": 0.66685295, + "learning_rate": 1.463921122471864e-06, + "loss": 0.6883148, + "num_input_tokens_seen": 214594230, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 9959, + "time_per_iteration": 2.471848726272583 + }, + { + "auxiliary_loss_clip": 0.01108718, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.0389334, + "epoch": 0.5988275965729746, + "flos": 21320201128320.0, + "grad_norm": 1.6260957561310903, + "language_loss": 0.83360457, + "learning_rate": 1.4635459221619796e-06, + "loss": 0.85498953, + "num_input_tokens_seen": 214613130, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 9960, + "time_per_iteration": 2.4651761054992676 + }, + { + "auxiliary_loss_clip": 0.01106018, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01716197, + "balance_loss_mlp": 1.03686321, + "epoch": 0.5988877198256426, + "flos": 25118041933440.0, + "grad_norm": 1.466008615140069, + "language_loss": 0.79505813, + "learning_rate": 1.4631707421944868e-06, + "loss": 0.81641018, + "num_input_tokens_seen": 214634470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 9961, + "time_per_iteration": 2.475454568862915 + }, + { + "auxiliary_loss_clip": 0.01106184, + "auxiliary_loss_mlp": 0.01030716, + "balance_loss_clip": 1.01849759, + "balance_loss_mlp": 1.03730237, + "epoch": 0.5989478430783105, + "flos": 26429104350720.0, + "grad_norm": 1.756927001005791, + "language_loss": 0.67329001, + "learning_rate": 1.4627955825836136e-06, + "loss": 0.69465899, + "num_input_tokens_seen": 214654030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 9962, + "time_per_iteration": 2.489084005355835 + }, + { + "auxiliary_loss_clip": 0.01107167, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02303684, + "balance_loss_mlp": 1.03722596, + "epoch": 0.5990079663309785, + "flos": 25778439434880.0, + "grad_norm": 1.365980621399165, + "language_loss": 0.74311382, + "learning_rate": 1.4624204433435857e-06, + "loss": 0.76453781, + "num_input_tokens_seen": 214676985, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 9963, + "time_per_iteration": 2.4947874546051025 + }, + { + "auxiliary_loss_clip": 0.01105091, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.02042198, + "balance_loss_mlp": 1.03652799, + "epoch": 0.5990680895836464, + "flos": 36831779118720.0, + "grad_norm": 2.111691032145124, + "language_loss": 0.68214118, + "learning_rate": 1.4620453244886281e-06, + "loss": 0.70352018, + "num_input_tokens_seen": 214700105, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 9964, + "time_per_iteration": 2.595745086669922 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01635242, + "balance_loss_mlp": 1.03745115, + "epoch": 0.5991282128363145, + "flos": 24133550993280.0, + "grad_norm": 1.9069133835925212, + "language_loss": 0.77044344, + "learning_rate": 1.4616702260329662e-06, + "loss": 0.79177749, + "num_input_tokens_seen": 214717885, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 9965, + "time_per_iteration": 2.447580337524414 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030278, + "balance_loss_clip": 1.01833928, + "balance_loss_mlp": 1.03651989, + "epoch": 0.5991883360889824, + "flos": 10304064956160.0, + "grad_norm": 1.8284726106569544, + "language_loss": 0.77189291, + "learning_rate": 1.4612951479908229e-06, + "loss": 0.79326117, + "num_input_tokens_seen": 214733680, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 9966, + "time_per_iteration": 2.450202226638794 + }, + { + "auxiliary_loss_clip": 0.01106883, + "auxiliary_loss_mlp": 0.01029207, + "balance_loss_clip": 1.01775706, + "balance_loss_mlp": 1.03827262, + "epoch": 0.5992484593416504, + "flos": 23951196622080.0, + "grad_norm": 1.4816211966309663, + "language_loss": 0.73338163, + "learning_rate": 1.460920090376422e-06, + "loss": 0.7547425, + "num_input_tokens_seen": 214753285, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 9967, + "time_per_iteration": 2.5361080169677734 + }, + { + "auxiliary_loss_clip": 0.01113043, + "auxiliary_loss_mlp": 0.01036039, + "balance_loss_clip": 1.0229435, + "balance_loss_mlp": 1.03907526, + "epoch": 0.5993085825943184, + "flos": 11944105061760.0, + "grad_norm": 1.98552880835617, + "language_loss": 0.68667233, + "learning_rate": 1.4605450532039847e-06, + "loss": 0.70816314, + "num_input_tokens_seen": 214767810, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9968, + "time_per_iteration": 2.4201669692993164 + }, + { + "auxiliary_loss_clip": 0.01107383, + "auxiliary_loss_mlp": 0.01034585, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03702521, + "epoch": 0.5993687058469863, + "flos": 19026838500480.0, + "grad_norm": 1.5069000727815525, + "language_loss": 0.79169899, + "learning_rate": 1.4601700364877334e-06, + "loss": 0.8131187, + "num_input_tokens_seen": 214786040, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9969, + "time_per_iteration": 3.9278953075408936 + }, + { + "auxiliary_loss_clip": 0.01106356, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.01999974, + "balance_loss_mlp": 1.03598189, + "epoch": 0.5994288290996543, + "flos": 14282967242880.0, + "grad_norm": 2.0663897132059588, + "language_loss": 0.81023246, + "learning_rate": 1.4597950402418889e-06, + "loss": 0.83162344, + "num_input_tokens_seen": 214803110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 9970, + "time_per_iteration": 2.4416465759277344 + }, + { + "auxiliary_loss_clip": 0.01109867, + "auxiliary_loss_mlp": 0.01039566, + "balance_loss_clip": 1.02511787, + "balance_loss_mlp": 1.0377593, + "epoch": 0.5994889523523222, + "flos": 19206643006080.0, + "grad_norm": 1.8664927797599988, + "language_loss": 0.62176776, + "learning_rate": 1.4594200644806697e-06, + "loss": 0.64326209, + "num_input_tokens_seen": 214819945, + "router_z_loss_clip": 0.14453125, + "router_z_loss_mlp": 0.71875, + "step": 9971, + "time_per_iteration": 3.8846518993377686 + }, + { + "auxiliary_loss_clip": 0.01102408, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01776624, + "balance_loss_mlp": 1.03571367, + "epoch": 0.5995490756049903, + "flos": 28037040675840.0, + "grad_norm": 1.8563043542024344, + "language_loss": 0.79314888, + "learning_rate": 1.4590451092182962e-06, + "loss": 0.81446773, + "num_input_tokens_seen": 214838810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 9972, + "time_per_iteration": 3.901256561279297 + }, + { + "auxiliary_loss_clip": 0.01112588, + "auxiliary_loss_mlp": 0.01034647, + "balance_loss_clip": 1.02152252, + "balance_loss_mlp": 1.03817391, + "epoch": 0.5996091988576582, + "flos": 29052953038080.0, + "grad_norm": 2.1896098539024176, + "language_loss": 0.76205128, + "learning_rate": 1.4586701744689864e-06, + "loss": 0.78352362, + "num_input_tokens_seen": 214857040, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7421875, + "step": 9973, + "time_per_iteration": 3.9424259662628174 + }, + { + "auxiliary_loss_clip": 0.01106987, + "auxiliary_loss_mlp": 0.01033225, + "balance_loss_clip": 1.02021337, + "balance_loss_mlp": 1.0362227, + "epoch": 0.5996693221103262, + "flos": 20813968800000.0, + "grad_norm": 2.3034108647788933, + "language_loss": 0.64969486, + "learning_rate": 1.4582952602469578e-06, + "loss": 0.67109704, + "num_input_tokens_seen": 214873375, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 9974, + "time_per_iteration": 2.4875805377960205 + }, + { + "auxiliary_loss_clip": 0.01107343, + "auxiliary_loss_mlp": 0.0103502, + "balance_loss_clip": 1.02270579, + "balance_loss_mlp": 1.03728855, + "epoch": 0.5997294453629941, + "flos": 23768914078080.0, + "grad_norm": 1.4500461001521425, + "language_loss": 0.74434048, + "learning_rate": 1.457920366566428e-06, + "loss": 0.76576418, + "num_input_tokens_seen": 214893900, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 9975, + "time_per_iteration": 2.4895670413970947 + }, + { + "auxiliary_loss_clip": 0.01107892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01572204, + "balance_loss_mlp": 1.03760493, + "epoch": 0.5997895686156621, + "flos": 20960017499520.0, + "grad_norm": 1.7933529759094704, + "language_loss": 0.76735765, + "learning_rate": 1.457545493441611e-06, + "loss": 0.78872299, + "num_input_tokens_seen": 214912110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 9976, + "time_per_iteration": 2.5056304931640625 + }, + { + "auxiliary_loss_clip": 0.01107614, + "auxiliary_loss_mlp": 0.01039313, + "balance_loss_clip": 1.02620029, + "balance_loss_mlp": 1.03780508, + "epoch": 0.59984969186833, + "flos": 28365443746560.0, + "grad_norm": 2.4460752586196857, + "language_loss": 0.74817264, + "learning_rate": 1.4571706408867237e-06, + "loss": 0.76964188, + "num_input_tokens_seen": 214930140, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 9977, + "time_per_iteration": 2.496149778366089 + }, + { + "auxiliary_loss_clip": 0.01107436, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01962721, + "balance_loss_mlp": 1.03684258, + "epoch": 0.5999098151209981, + "flos": 22565906749440.0, + "grad_norm": 1.6882301956293941, + "language_loss": 0.68553925, + "learning_rate": 1.4567958089159802e-06, + "loss": 0.70693398, + "num_input_tokens_seen": 214949200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 9978, + "time_per_iteration": 2.483567714691162 + }, + { + "auxiliary_loss_clip": 0.01113427, + "auxiliary_loss_mlp": 0.01033778, + "balance_loss_clip": 1.02087975, + "balance_loss_mlp": 1.04072738, + "epoch": 0.599969938373666, + "flos": 18768712389120.0, + "grad_norm": 2.78777966355448, + "language_loss": 0.81153774, + "learning_rate": 1.456420997543594e-06, + "loss": 0.83300972, + "num_input_tokens_seen": 214965775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 9979, + "time_per_iteration": 2.413935899734497 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032239, + "balance_loss_clip": 1.02026439, + "balance_loss_mlp": 1.03630424, + "epoch": 0.600030061626334, + "flos": 11327231865600.0, + "grad_norm": 1.7401896529481804, + "language_loss": 0.6957618, + "learning_rate": 1.4560462067837782e-06, + "loss": 0.71711338, + "num_input_tokens_seen": 214982480, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 9980, + "time_per_iteration": 2.4312682151794434 + }, + { + "auxiliary_loss_clip": 0.01110498, + "auxiliary_loss_mlp": 0.01032669, + "balance_loss_clip": 1.01947856, + "balance_loss_mlp": 1.03764093, + "epoch": 0.600090184879002, + "flos": 16578664254720.0, + "grad_norm": 3.8237519537086238, + "language_loss": 0.68642873, + "learning_rate": 1.4556714366507445e-06, + "loss": 0.70786041, + "num_input_tokens_seen": 214998110, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.73046875, + "step": 9981, + "time_per_iteration": 2.4180452823638916 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01035605, + "balance_loss_clip": 1.02439916, + "balance_loss_mlp": 1.03752363, + "epoch": 0.6001503081316699, + "flos": 23618627573760.0, + "grad_norm": 3.017374403618408, + "language_loss": 0.78579712, + "learning_rate": 1.4552966871587048e-06, + "loss": 0.80722106, + "num_input_tokens_seen": 215017995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 9982, + "time_per_iteration": 2.5378241539001465 + }, + { + "auxiliary_loss_clip": 0.01107415, + "auxiliary_loss_mlp": 0.01034564, + "balance_loss_clip": 1.02182055, + "balance_loss_mlp": 1.03862381, + "epoch": 0.6002104313843379, + "flos": 20667668705280.0, + "grad_norm": 1.4959053225865697, + "language_loss": 0.72973263, + "learning_rate": 1.4549219583218686e-06, + "loss": 0.7511524, + "num_input_tokens_seen": 215038285, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 9983, + "time_per_iteration": 2.4516336917877197 + }, + { + "auxiliary_loss_clip": 0.01105736, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.01893497, + "balance_loss_mlp": 1.03546536, + "epoch": 0.6002705546370058, + "flos": 22455229968000.0, + "grad_norm": 2.0437339372279775, + "language_loss": 0.77803969, + "learning_rate": 1.454547250154447e-06, + "loss": 0.79941273, + "num_input_tokens_seen": 215057825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 9984, + "time_per_iteration": 2.4639358520507812 + }, + { + "auxiliary_loss_clip": 0.0110781, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02200651, + "balance_loss_mlp": 1.03833842, + "epoch": 0.6003306778896739, + "flos": 25191982080000.0, + "grad_norm": 1.564540000062254, + "language_loss": 0.83254963, + "learning_rate": 1.4541725626706485e-06, + "loss": 0.85397083, + "num_input_tokens_seen": 215077790, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 9985, + "time_per_iteration": 2.584782361984253 + }, + { + "auxiliary_loss_clip": 0.01107675, + "auxiliary_loss_mlp": 0.01037251, + "balance_loss_clip": 1.02535367, + "balance_loss_mlp": 1.03886271, + "epoch": 0.6003908011423418, + "flos": 26687733252480.0, + "grad_norm": 1.8232812965365295, + "language_loss": 0.71257466, + "learning_rate": 1.4537978958846809e-06, + "loss": 0.73402393, + "num_input_tokens_seen": 215097650, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 9986, + "time_per_iteration": 2.5054030418395996 + }, + { + "auxiliary_loss_clip": 0.01110337, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.01824546, + "balance_loss_mlp": 1.04022861, + "epoch": 0.6004509243950098, + "flos": 22565080736640.0, + "grad_norm": 2.2582190453585653, + "language_loss": 0.71791571, + "learning_rate": 1.4534232498107514e-06, + "loss": 0.73933005, + "num_input_tokens_seen": 215118235, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 9987, + "time_per_iteration": 2.4961001873016357 + }, + { + "auxiliary_loss_clip": 0.01106291, + "auxiliary_loss_mlp": 0.01034497, + "balance_loss_clip": 1.02245712, + "balance_loss_mlp": 1.03697586, + "epoch": 0.6005110476476777, + "flos": 19719303868800.0, + "grad_norm": 1.6101111043143586, + "language_loss": 0.84407473, + "learning_rate": 1.4530486244630673e-06, + "loss": 0.86548263, + "num_input_tokens_seen": 215136755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 9988, + "time_per_iteration": 2.435049533843994 + }, + { + "auxiliary_loss_clip": 0.01105215, + "auxiliary_loss_mlp": 0.01033666, + "balance_loss_clip": 1.02113748, + "balance_loss_mlp": 1.03617096, + "epoch": 0.6005711709003457, + "flos": 17712543859200.0, + "grad_norm": 1.6559701651537184, + "language_loss": 0.65416402, + "learning_rate": 1.4526740198558346e-06, + "loss": 0.67555285, + "num_input_tokens_seen": 215155225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 9989, + "time_per_iteration": 2.4359869956970215 + }, + { + "auxiliary_loss_clip": 0.01105185, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01960373, + "balance_loss_mlp": 1.03680921, + "epoch": 0.6006312941530136, + "flos": 18514464946560.0, + "grad_norm": 1.811706113820645, + "language_loss": 0.80521321, + "learning_rate": 1.452299436003257e-06, + "loss": 0.82657802, + "num_input_tokens_seen": 215174815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 9990, + "time_per_iteration": 2.44775128364563 + }, + { + "auxiliary_loss_clip": 0.01108983, + "auxiliary_loss_mlp": 0.01034602, + "balance_loss_clip": 1.02215016, + "balance_loss_mlp": 1.03804195, + "epoch": 0.6006914174056817, + "flos": 21390837223680.0, + "grad_norm": 1.6786296180827829, + "language_loss": 0.82789129, + "learning_rate": 1.4519248729195403e-06, + "loss": 0.84932715, + "num_input_tokens_seen": 215192045, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 9991, + "time_per_iteration": 2.464409112930298 + }, + { + "auxiliary_loss_clip": 0.01103829, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02240098, + "balance_loss_mlp": 1.03611255, + "epoch": 0.6007515406583496, + "flos": 12750515349120.0, + "grad_norm": 2.5638990933503587, + "language_loss": 0.82719564, + "learning_rate": 1.4515503306188878e-06, + "loss": 0.84857893, + "num_input_tokens_seen": 215209885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 9992, + "time_per_iteration": 2.4012389183044434 + }, + { + "auxiliary_loss_clip": 0.01105302, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02181458, + "balance_loss_mlp": 1.03721142, + "epoch": 0.6008116639110176, + "flos": 19206894401280.0, + "grad_norm": 2.724325433103902, + "language_loss": 0.6668725, + "learning_rate": 1.4511758091155008e-06, + "loss": 0.6882664, + "num_input_tokens_seen": 215228150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 9993, + "time_per_iteration": 2.431534767150879 + }, + { + "auxiliary_loss_clip": 0.01105757, + "auxiliary_loss_mlp": 0.01031875, + "balance_loss_clip": 1.01941192, + "balance_loss_mlp": 1.03631759, + "epoch": 0.6008717871636855, + "flos": 17055342668160.0, + "grad_norm": 2.313639381360734, + "language_loss": 0.81478924, + "learning_rate": 1.4508013084235826e-06, + "loss": 0.83616555, + "num_input_tokens_seen": 215243755, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 9994, + "time_per_iteration": 2.410637140274048 + }, + { + "auxiliary_loss_clip": 0.01102128, + "auxiliary_loss_mlp": 0.01025937, + "balance_loss_clip": 1.01506472, + "balance_loss_mlp": 1.03755724, + "epoch": 0.6009319104163535, + "flos": 20298686244480.0, + "grad_norm": 1.8133737963871297, + "language_loss": 0.72619045, + "learning_rate": 1.4504268285573337e-06, + "loss": 0.74747109, + "num_input_tokens_seen": 215262130, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 9995, + "time_per_iteration": 2.462024450302124 + }, + { + "auxiliary_loss_clip": 0.01106573, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02083576, + "balance_loss_mlp": 1.03584194, + "epoch": 0.6009920336690215, + "flos": 21836776573440.0, + "grad_norm": 2.19390066880666, + "language_loss": 0.80974549, + "learning_rate": 1.4500523695309546e-06, + "loss": 0.83114165, + "num_input_tokens_seen": 215281785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 9996, + "time_per_iteration": 2.4826295375823975 + }, + { + "auxiliary_loss_clip": 0.01106517, + "auxiliary_loss_mlp": 0.01037102, + "balance_loss_clip": 1.02458513, + "balance_loss_mlp": 1.03807008, + "epoch": 0.6010521569216895, + "flos": 22596107109120.0, + "grad_norm": 3.1537087962017814, + "language_loss": 0.78669906, + "learning_rate": 1.4496779313586447e-06, + "loss": 0.80813521, + "num_input_tokens_seen": 215297550, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 9997, + "time_per_iteration": 2.4731595516204834 + }, + { + "auxiliary_loss_clip": 0.01107621, + "auxiliary_loss_mlp": 0.01030423, + "balance_loss_clip": 1.01708388, + "balance_loss_mlp": 1.03646445, + "epoch": 0.6011122801743575, + "flos": 19171702051200.0, + "grad_norm": 3.7238695953955263, + "language_loss": 0.73005414, + "learning_rate": 1.4493035140546028e-06, + "loss": 0.75143456, + "num_input_tokens_seen": 215316360, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 9998, + "time_per_iteration": 2.4839541912078857 + }, + { + "auxiliary_loss_clip": 0.01103199, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.0148679, + "balance_loss_mlp": 1.03565955, + "epoch": 0.6011724034270254, + "flos": 25010022758400.0, + "grad_norm": 1.5405076955909784, + "language_loss": 0.721259, + "learning_rate": 1.448929117633027e-06, + "loss": 0.74255872, + "num_input_tokens_seen": 215336405, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 9999, + "time_per_iteration": 2.5177793502807617 + }, + { + "auxiliary_loss_clip": 0.01108153, + "auxiliary_loss_mlp": 0.01035498, + "balance_loss_clip": 1.02320766, + "balance_loss_mlp": 1.03617668, + "epoch": 0.6012325266796934, + "flos": 21797669640960.0, + "grad_norm": 14.582740501304201, + "language_loss": 0.78332782, + "learning_rate": 1.4485547421081142e-06, + "loss": 0.80476433, + "num_input_tokens_seen": 215356590, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 10000, + "time_per_iteration": 2.5176899433135986 + }, + { + "auxiliary_loss_clip": 0.01111103, + "auxiliary_loss_mlp": 0.01033566, + "balance_loss_clip": 1.02002978, + "balance_loss_mlp": 1.03898025, + "epoch": 0.6012926499323613, + "flos": 19573003774080.0, + "grad_norm": 1.9333747533908545, + "language_loss": 0.77681154, + "learning_rate": 1.4481803874940608e-06, + "loss": 0.79825819, + "num_input_tokens_seen": 215374295, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10001, + "time_per_iteration": 2.4608781337738037 + }, + { + "auxiliary_loss_clip": 0.01109986, + "auxiliary_loss_mlp": 0.01031166, + "balance_loss_clip": 1.01821423, + "balance_loss_mlp": 1.03775978, + "epoch": 0.6013527731850293, + "flos": 34860786076800.0, + "grad_norm": 2.0376201380828642, + "language_loss": 0.58534205, + "learning_rate": 1.4478060538050624e-06, + "loss": 0.60675359, + "num_input_tokens_seen": 215394535, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.72265625, + "step": 10002, + "time_per_iteration": 2.573974847793579 + }, + { + "auxiliary_loss_clip": 0.01110624, + "auxiliary_loss_mlp": 0.01035664, + "balance_loss_clip": 1.02163339, + "balance_loss_mlp": 1.0399766, + "epoch": 0.6014128964376972, + "flos": 23291948355840.0, + "grad_norm": 1.4763500532767482, + "language_loss": 0.77651924, + "learning_rate": 1.447431741055314e-06, + "loss": 0.7979821, + "num_input_tokens_seen": 215414355, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.70703125, + "step": 10003, + "time_per_iteration": 2.507904291152954 + }, + { + "auxiliary_loss_clip": 0.01109401, + "auxiliary_loss_mlp": 0.01028384, + "balance_loss_clip": 1.01595616, + "balance_loss_mlp": 1.03869998, + "epoch": 0.6014730196903653, + "flos": 24820916630400.0, + "grad_norm": 2.341725474955548, + "language_loss": 0.77185351, + "learning_rate": 1.4470574492590091e-06, + "loss": 0.79323137, + "num_input_tokens_seen": 215428280, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 10004, + "time_per_iteration": 2.4672906398773193 + }, + { + "auxiliary_loss_clip": 0.01106632, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.01697397, + "balance_loss_mlp": 1.03765237, + "epoch": 0.6015331429430332, + "flos": 23112359331840.0, + "grad_norm": 1.6533707293679005, + "language_loss": 0.72357887, + "learning_rate": 1.4466831784303408e-06, + "loss": 0.74493855, + "num_input_tokens_seen": 215448970, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10005, + "time_per_iteration": 2.481327533721924 + }, + { + "auxiliary_loss_clip": 0.01103683, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01724029, + "balance_loss_mlp": 1.03719342, + "epoch": 0.6015932661957012, + "flos": 19201363706880.0, + "grad_norm": 1.9903847661444378, + "language_loss": 0.74641156, + "learning_rate": 1.4463089285835026e-06, + "loss": 0.76774085, + "num_input_tokens_seen": 215465260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 10006, + "time_per_iteration": 2.4176204204559326 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.01036759, + "balance_loss_clip": 1.02387798, + "balance_loss_mlp": 1.03541553, + "epoch": 0.6016533894483691, + "flos": 18113630100480.0, + "grad_norm": 2.3154709076008726, + "language_loss": 0.73940712, + "learning_rate": 1.445934699732685e-06, + "loss": 0.76082402, + "num_input_tokens_seen": 215482725, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 10007, + "time_per_iteration": 2.4568898677825928 + }, + { + "auxiliary_loss_clip": 0.01105567, + "auxiliary_loss_mlp": 0.01026535, + "balance_loss_clip": 1.01488209, + "balance_loss_mlp": 1.03767657, + "epoch": 0.6017135127010371, + "flos": 16216900427520.0, + "grad_norm": 2.0163179080147065, + "language_loss": 0.70129442, + "learning_rate": 1.4455604918920785e-06, + "loss": 0.72261548, + "num_input_tokens_seen": 215500420, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10008, + "time_per_iteration": 2.4591152667999268 + }, + { + "auxiliary_loss_clip": 0.011063, + "auxiliary_loss_mlp": 0.01025901, + "balance_loss_clip": 1.01420045, + "balance_loss_mlp": 1.0375886, + "epoch": 0.6017736359537051, + "flos": 23444246021760.0, + "grad_norm": 1.5735106118568272, + "language_loss": 0.76055562, + "learning_rate": 1.4451863050758748e-06, + "loss": 0.78187764, + "num_input_tokens_seen": 215522260, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 10009, + "time_per_iteration": 2.5413200855255127 + }, + { + "auxiliary_loss_clip": 0.01106971, + "auxiliary_loss_mlp": 0.01033511, + "balance_loss_clip": 1.02157235, + "balance_loss_mlp": 1.03784704, + "epoch": 0.601833759206373, + "flos": 23514056104320.0, + "grad_norm": 2.2862690220983257, + "language_loss": 0.74194181, + "learning_rate": 1.4448121392982608e-06, + "loss": 0.76334661, + "num_input_tokens_seen": 215541715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10010, + "time_per_iteration": 3.888418436050415 + }, + { + "auxiliary_loss_clip": 0.01029006, + "auxiliary_loss_mlp": 0.00995965, + "balance_loss_clip": 0.99489832, + "balance_loss_mlp": 1.00768209, + "epoch": 0.6018938824590411, + "flos": 63991668648960.0, + "grad_norm": 0.7964241921308365, + "language_loss": 0.55079472, + "learning_rate": 1.4444379945734268e-06, + "loss": 0.57104445, + "num_input_tokens_seen": 215603020, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.21289062, + "step": 10011, + "time_per_iteration": 3.125993251800537 + }, + { + "auxiliary_loss_clip": 0.01106744, + "auxiliary_loss_mlp": 0.01035464, + "balance_loss_clip": 1.02382302, + "balance_loss_mlp": 1.03751755, + "epoch": 0.601954005711709, + "flos": 34640007131520.0, + "grad_norm": 1.3952150015846279, + "language_loss": 0.62033314, + "learning_rate": 1.44406387091556e-06, + "loss": 0.64175516, + "num_input_tokens_seen": 215625115, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10012, + "time_per_iteration": 3.9947257041931152 + }, + { + "auxiliary_loss_clip": 0.01107114, + "auxiliary_loss_mlp": 0.0102335, + "balance_loss_clip": 1.01210856, + "balance_loss_mlp": 1.03870738, + "epoch": 0.602014128964377, + "flos": 19427062815360.0, + "grad_norm": 1.6026031648611754, + "language_loss": 0.74765098, + "learning_rate": 1.4436897683388462e-06, + "loss": 0.76895565, + "num_input_tokens_seen": 215643730, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 10013, + "time_per_iteration": 3.9350314140319824 + }, + { + "auxiliary_loss_clip": 0.01100697, + "auxiliary_loss_mlp": 0.01027976, + "balance_loss_clip": 1.01671076, + "balance_loss_mlp": 1.03607368, + "epoch": 0.6020742522170449, + "flos": 28329389470080.0, + "grad_norm": 1.7871112945652055, + "language_loss": 0.81346315, + "learning_rate": 1.4433156868574732e-06, + "loss": 0.83474994, + "num_input_tokens_seen": 215664425, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10014, + "time_per_iteration": 3.929865837097168 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01025273, + "balance_loss_clip": 1.01355481, + "balance_loss_mlp": 1.03631175, + "epoch": 0.6021343754697129, + "flos": 22747040058240.0, + "grad_norm": 1.3916523900358202, + "language_loss": 0.72577333, + "learning_rate": 1.442941626485624e-06, + "loss": 0.74703777, + "num_input_tokens_seen": 215684280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10015, + "time_per_iteration": 2.487917184829712 + }, + { + "auxiliary_loss_clip": 0.0102817, + "auxiliary_loss_mlp": 0.0100004, + "balance_loss_clip": 0.99888366, + "balance_loss_mlp": 1.00701785, + "epoch": 0.6021944987223808, + "flos": 65752007402880.0, + "grad_norm": 0.8145782570930438, + "language_loss": 0.54800987, + "learning_rate": 1.4425675872374848e-06, + "loss": 0.5682919, + "num_input_tokens_seen": 215739780, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.2109375, + "step": 10016, + "time_per_iteration": 2.952225923538208 + }, + { + "auxiliary_loss_clip": 0.01105304, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01721644, + "balance_loss_mlp": 1.03722167, + "epoch": 0.6022546219750489, + "flos": 16105182151680.0, + "grad_norm": 1.4974922822650143, + "language_loss": 0.82952374, + "learning_rate": 1.4421935691272381e-06, + "loss": 0.85086936, + "num_input_tokens_seen": 215757885, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10017, + "time_per_iteration": 2.4482316970825195 + }, + { + "auxiliary_loss_clip": 0.01105754, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.01691723, + "balance_loss_mlp": 1.03885603, + "epoch": 0.6023147452277168, + "flos": 25512555985920.0, + "grad_norm": 1.7894712759587756, + "language_loss": 0.83787656, + "learning_rate": 1.4418195721690677e-06, + "loss": 0.85921562, + "num_input_tokens_seen": 215776415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10018, + "time_per_iteration": 2.570969820022583 + }, + { + "auxiliary_loss_clip": 0.01109615, + "auxiliary_loss_mlp": 0.01036282, + "balance_loss_clip": 1.02348518, + "balance_loss_mlp": 1.03740263, + "epoch": 0.6023748684803848, + "flos": 22636075968000.0, + "grad_norm": 1.651779624626633, + "language_loss": 0.78134441, + "learning_rate": 1.4414455963771549e-06, + "loss": 0.80280334, + "num_input_tokens_seen": 215794865, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 10019, + "time_per_iteration": 2.4765312671661377 + }, + { + "auxiliary_loss_clip": 0.01103799, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01586819, + "balance_loss_mlp": 1.03516555, + "epoch": 0.6024349917330527, + "flos": 26210444307840.0, + "grad_norm": 1.523816764872001, + "language_loss": 0.73855495, + "learning_rate": 1.441071641765681e-06, + "loss": 0.75986886, + "num_input_tokens_seen": 215816840, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 10020, + "time_per_iteration": 2.530351161956787 + }, + { + "auxiliary_loss_clip": 0.01106179, + "auxiliary_loss_mlp": 0.01033215, + "balance_loss_clip": 1.02080584, + "balance_loss_mlp": 1.03670871, + "epoch": 0.6024951149857207, + "flos": 21251755762560.0, + "grad_norm": 1.5471183793037282, + "language_loss": 0.64036959, + "learning_rate": 1.4406977083488264e-06, + "loss": 0.66176355, + "num_input_tokens_seen": 215836100, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10021, + "time_per_iteration": 2.491334915161133 + }, + { + "auxiliary_loss_clip": 0.01103767, + "auxiliary_loss_mlp": 0.01031861, + "balance_loss_clip": 1.01892638, + "balance_loss_mlp": 1.03523266, + "epoch": 0.6025552382383887, + "flos": 26943453152640.0, + "grad_norm": 1.4551090911481597, + "language_loss": 0.80527318, + "learning_rate": 1.4403237961407704e-06, + "loss": 0.8266294, + "num_input_tokens_seen": 215858480, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 10022, + "time_per_iteration": 2.504343032836914 + }, + { + "auxiliary_loss_clip": 0.01110275, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.01720786, + "balance_loss_mlp": 1.03836441, + "epoch": 0.6026153614910567, + "flos": 31684379495040.0, + "grad_norm": 1.6380547321516945, + "language_loss": 0.66718352, + "learning_rate": 1.439949905155693e-06, + "loss": 0.68858099, + "num_input_tokens_seen": 215879950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71875, + "step": 10023, + "time_per_iteration": 2.550156593322754 + }, + { + "auxiliary_loss_clip": 0.01106872, + "auxiliary_loss_mlp": 0.01030774, + "balance_loss_clip": 1.01878142, + "balance_loss_mlp": 1.03709006, + "epoch": 0.6026754847437247, + "flos": 29312731175040.0, + "grad_norm": 3.9256623345472397, + "language_loss": 0.74829918, + "learning_rate": 1.4395760354077707e-06, + "loss": 0.76967561, + "num_input_tokens_seen": 215899830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10024, + "time_per_iteration": 2.5556838512420654 + }, + { + "auxiliary_loss_clip": 0.01106267, + "auxiliary_loss_mlp": 0.0103271, + "balance_loss_clip": 1.02027631, + "balance_loss_mlp": 1.03824794, + "epoch": 0.6027356079963926, + "flos": 23586775188480.0, + "grad_norm": 1.6728401649111677, + "language_loss": 0.7330395, + "learning_rate": 1.4392021869111815e-06, + "loss": 0.75442922, + "num_input_tokens_seen": 215920440, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 10025, + "time_per_iteration": 2.513984441757202 + }, + { + "auxiliary_loss_clip": 0.01110825, + "auxiliary_loss_mlp": 0.01034309, + "balance_loss_clip": 1.02113652, + "balance_loss_mlp": 1.03738081, + "epoch": 0.6027957312490606, + "flos": 20813753318400.0, + "grad_norm": 2.650368099581338, + "language_loss": 0.67278063, + "learning_rate": 1.4388283596801016e-06, + "loss": 0.69423193, + "num_input_tokens_seen": 215940535, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 10026, + "time_per_iteration": 2.542365550994873 + }, + { + "auxiliary_loss_clip": 0.01098285, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.0200423, + "balance_loss_mlp": 1.03320062, + "epoch": 0.6028558545017285, + "flos": 19935773182080.0, + "grad_norm": 2.2752496975382908, + "language_loss": 0.80318093, + "learning_rate": 1.4384545537287061e-06, + "loss": 0.82447666, + "num_input_tokens_seen": 215958045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10027, + "time_per_iteration": 2.444352626800537 + }, + { + "auxiliary_loss_clip": 0.01109574, + "auxiliary_loss_mlp": 0.01034466, + "balance_loss_clip": 1.02199113, + "balance_loss_mlp": 1.03832877, + "epoch": 0.6029159777543965, + "flos": 22820836550400.0, + "grad_norm": 2.211735765604233, + "language_loss": 0.71043503, + "learning_rate": 1.438080769071171e-06, + "loss": 0.73187542, + "num_input_tokens_seen": 215977330, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 10028, + "time_per_iteration": 2.479518413543701 + }, + { + "auxiliary_loss_clip": 0.01108344, + "auxiliary_loss_mlp": 0.01037039, + "balance_loss_clip": 1.02431321, + "balance_loss_mlp": 1.03661895, + "epoch": 0.6029761010070644, + "flos": 23587242065280.0, + "grad_norm": 1.6910926571719251, + "language_loss": 0.8391934, + "learning_rate": 1.437707005721669e-06, + "loss": 0.8606472, + "num_input_tokens_seen": 215997865, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 10029, + "time_per_iteration": 2.4701409339904785 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01035147, + "balance_loss_clip": 1.02357185, + "balance_loss_mlp": 1.03613794, + "epoch": 0.6030362242597325, + "flos": 13662430859520.0, + "grad_norm": 1.6909986386379736, + "language_loss": 0.7958231, + "learning_rate": 1.437333263694373e-06, + "loss": 0.81720573, + "num_input_tokens_seen": 216016230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10030, + "time_per_iteration": 2.4561784267425537 + }, + { + "auxiliary_loss_clip": 0.01105406, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.0201565, + "balance_loss_mlp": 1.03732789, + "epoch": 0.6030963475124004, + "flos": 24422883045120.0, + "grad_norm": 1.5628951432606517, + "language_loss": 0.71363872, + "learning_rate": 1.4369595430034572e-06, + "loss": 0.73501384, + "num_input_tokens_seen": 216035785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10031, + "time_per_iteration": 2.512300729751587 + }, + { + "auxiliary_loss_clip": 0.011108, + "auxiliary_loss_mlp": 0.01032792, + "balance_loss_clip": 1.01967287, + "balance_loss_mlp": 1.03754997, + "epoch": 0.6031564707650684, + "flos": 29644043247360.0, + "grad_norm": 1.6597240808951284, + "language_loss": 0.73467577, + "learning_rate": 1.4365858436630912e-06, + "loss": 0.75611174, + "num_input_tokens_seen": 216059555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 10032, + "time_per_iteration": 2.566749334335327 + }, + { + "auxiliary_loss_clip": 0.01111115, + "auxiliary_loss_mlp": 0.01032658, + "balance_loss_clip": 1.02004528, + "balance_loss_mlp": 1.04087365, + "epoch": 0.6032165940177363, + "flos": 16618776768000.0, + "grad_norm": 1.6790483076068066, + "language_loss": 0.68394065, + "learning_rate": 1.4362121656874465e-06, + "loss": 0.70537835, + "num_input_tokens_seen": 216077235, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10033, + "time_per_iteration": 2.4334018230438232 + }, + { + "auxiliary_loss_clip": 0.01108457, + "auxiliary_loss_mlp": 0.01032938, + "balance_loss_clip": 1.02034903, + "balance_loss_mlp": 1.03930712, + "epoch": 0.6032767172704043, + "flos": 17488173553920.0, + "grad_norm": 1.9672909213981986, + "language_loss": 0.76032668, + "learning_rate": 1.4358385090906934e-06, + "loss": 0.78174067, + "num_input_tokens_seen": 216094985, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10034, + "time_per_iteration": 2.430638074874878 + }, + { + "auxiliary_loss_clip": 0.01108661, + "auxiliary_loss_mlp": 0.01030537, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03813863, + "epoch": 0.6033368405230723, + "flos": 26832955939200.0, + "grad_norm": 2.463845452157716, + "language_loss": 0.74406719, + "learning_rate": 1.4354648738870004e-06, + "loss": 0.76545924, + "num_input_tokens_seen": 216115905, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10035, + "time_per_iteration": 2.4784040451049805 + }, + { + "auxiliary_loss_clip": 0.01104517, + "auxiliary_loss_mlp": 0.0102907, + "balance_loss_clip": 1.01751912, + "balance_loss_mlp": 1.03727365, + "epoch": 0.6033969637757403, + "flos": 16909904499840.0, + "grad_norm": 1.5741870761115437, + "language_loss": 0.86713034, + "learning_rate": 1.435091260090536e-06, + "loss": 0.88846624, + "num_input_tokens_seen": 216132420, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10036, + "time_per_iteration": 2.4385178089141846 + }, + { + "auxiliary_loss_clip": 0.01107298, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.02077413, + "balance_loss_mlp": 1.0369339, + "epoch": 0.6034570870284083, + "flos": 22930076787840.0, + "grad_norm": 2.0234995174732067, + "language_loss": 0.69894731, + "learning_rate": 1.4347176677154676e-06, + "loss": 0.72035396, + "num_input_tokens_seen": 216149800, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10037, + "time_per_iteration": 2.4603824615478516 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.01800978, + "balance_loss_mlp": 1.03922844, + "epoch": 0.6035172102810762, + "flos": 23366319465600.0, + "grad_norm": 1.7516523293698103, + "language_loss": 0.85487103, + "learning_rate": 1.4343440967759616e-06, + "loss": 0.87623459, + "num_input_tokens_seen": 216168200, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 10038, + "time_per_iteration": 2.478269100189209 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01035469, + "balance_loss_clip": 1.02303505, + "balance_loss_mlp": 1.03736269, + "epoch": 0.6035773335337442, + "flos": 20887082933760.0, + "grad_norm": 1.859562825285256, + "language_loss": 0.76468384, + "learning_rate": 1.4339705472861846e-06, + "loss": 0.78612161, + "num_input_tokens_seen": 216187105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10039, + "time_per_iteration": 2.4567699432373047 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.01032125, + "balance_loss_clip": 1.02047873, + "balance_loss_mlp": 1.03606224, + "epoch": 0.6036374567864121, + "flos": 24936298093440.0, + "grad_norm": 1.5744012931929299, + "language_loss": 0.70843172, + "learning_rate": 1.433597019260301e-06, + "loss": 0.72979593, + "num_input_tokens_seen": 216205440, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10040, + "time_per_iteration": 2.491757392883301 + }, + { + "auxiliary_loss_clip": 0.01112027, + "auxiliary_loss_mlp": 0.01031343, + "balance_loss_clip": 1.01729393, + "balance_loss_mlp": 1.03952897, + "epoch": 0.6036975800390801, + "flos": 23148269953920.0, + "grad_norm": 2.4316928211832045, + "language_loss": 0.78400159, + "learning_rate": 1.433223512712475e-06, + "loss": 0.80543524, + "num_input_tokens_seen": 216223130, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7265625, + "step": 10041, + "time_per_iteration": 2.452766180038452 + }, + { + "auxiliary_loss_clip": 0.0110643, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01892138, + "balance_loss_mlp": 1.03821898, + "epoch": 0.603757703291748, + "flos": 18660729127680.0, + "grad_norm": 1.6317318935059701, + "language_loss": 0.75574881, + "learning_rate": 1.4328500276568704e-06, + "loss": 0.77711999, + "num_input_tokens_seen": 216240260, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 10042, + "time_per_iteration": 2.421757459640503 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.01626205, + "balance_loss_mlp": 1.03584445, + "epoch": 0.6038178265444161, + "flos": 19682603147520.0, + "grad_norm": 2.3271703550981138, + "language_loss": 0.84446549, + "learning_rate": 1.4324765641076498e-06, + "loss": 0.86578321, + "num_input_tokens_seen": 216258510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10043, + "time_per_iteration": 2.5310654640197754 + }, + { + "auxiliary_loss_clip": 0.01108903, + "auxiliary_loss_mlp": 0.01039945, + "balance_loss_clip": 1.02648067, + "balance_loss_mlp": 1.03705609, + "epoch": 0.603877949797084, + "flos": 22638230784000.0, + "grad_norm": 1.9621351051557316, + "language_loss": 0.69924289, + "learning_rate": 1.432103122078974e-06, + "loss": 0.72073138, + "num_input_tokens_seen": 216277550, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 10044, + "time_per_iteration": 2.4903266429901123 + }, + { + "auxiliary_loss_clip": 0.01110997, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0184021, + "balance_loss_mlp": 1.03954315, + "epoch": 0.603938073049752, + "flos": 25447881548160.0, + "grad_norm": 2.0335535035690557, + "language_loss": 0.77986026, + "learning_rate": 1.4317297015850057e-06, + "loss": 0.80128312, + "num_input_tokens_seen": 216296690, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71484375, + "step": 10045, + "time_per_iteration": 2.4881081581115723 + }, + { + "auxiliary_loss_clip": 0.0110549, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.01878381, + "balance_loss_mlp": 1.03781128, + "epoch": 0.6039981963024199, + "flos": 22340135813760.0, + "grad_norm": 1.5706793221026767, + "language_loss": 0.76730686, + "learning_rate": 1.4313563026399036e-06, + "loss": 0.7886765, + "num_input_tokens_seen": 216316110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 10046, + "time_per_iteration": 2.4508702754974365 + }, + { + "auxiliary_loss_clip": 0.01104935, + "auxiliary_loss_mlp": 0.01030001, + "balance_loss_clip": 1.01866424, + "balance_loss_mlp": 1.03633487, + "epoch": 0.6040583195550879, + "flos": 20703148364160.0, + "grad_norm": 1.5559732700373865, + "language_loss": 0.86937988, + "learning_rate": 1.430982925257827e-06, + "loss": 0.89072925, + "num_input_tokens_seen": 216333855, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10047, + "time_per_iteration": 2.465775489807129 + }, + { + "auxiliary_loss_clip": 0.01105881, + "auxiliary_loss_mlp": 0.0102976, + "balance_loss_clip": 1.01808965, + "balance_loss_mlp": 1.03915882, + "epoch": 0.604118442807756, + "flos": 27163118776320.0, + "grad_norm": 1.5346026168560238, + "language_loss": 0.75463951, + "learning_rate": 1.4306095694529358e-06, + "loss": 0.77599597, + "num_input_tokens_seen": 216354890, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 10048, + "time_per_iteration": 2.5098941326141357 + }, + { + "auxiliary_loss_clip": 0.01115671, + "auxiliary_loss_mlp": 0.01039349, + "balance_loss_clip": 1.0247221, + "balance_loss_mlp": 1.03979802, + "epoch": 0.6041785660604239, + "flos": 30881524654080.0, + "grad_norm": 2.285441895193273, + "language_loss": 0.66271615, + "learning_rate": 1.430236235239386e-06, + "loss": 0.68426633, + "num_input_tokens_seen": 216376055, + "router_z_loss_clip": 0.14648438, + "router_z_loss_mlp": 0.7578125, + "step": 10049, + "time_per_iteration": 2.537810802459717 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.01034046, + "balance_loss_clip": 1.02244711, + "balance_loss_mlp": 1.03769147, + "epoch": 0.6042386893130919, + "flos": 19938215306880.0, + "grad_norm": 1.5404607265151984, + "language_loss": 0.66999722, + "learning_rate": 1.429862922631336e-06, + "loss": 0.69139338, + "num_input_tokens_seen": 216396295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10050, + "time_per_iteration": 2.5025947093963623 + }, + { + "auxiliary_loss_clip": 0.01108275, + "auxiliary_loss_mlp": 0.01032262, + "balance_loss_clip": 1.01961958, + "balance_loss_mlp": 1.03837466, + "epoch": 0.6042988125657598, + "flos": 32415915882240.0, + "grad_norm": 2.5982455651349325, + "language_loss": 0.69730866, + "learning_rate": 1.4294896316429408e-06, + "loss": 0.718714, + "num_input_tokens_seen": 216416605, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10051, + "time_per_iteration": 2.5584428310394287 + }, + { + "auxiliary_loss_clip": 0.01103115, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.02025664, + "balance_loss_mlp": 1.03470123, + "epoch": 0.6043589358184278, + "flos": 17420805596160.0, + "grad_norm": 1.883115508781388, + "language_loss": 0.64664817, + "learning_rate": 1.4291163622883553e-06, + "loss": 0.66800326, + "num_input_tokens_seen": 216435130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10052, + "time_per_iteration": 3.8776209354400635 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01035509, + "balance_loss_clip": 1.02243757, + "balance_loss_mlp": 1.03725076, + "epoch": 0.6044190590710957, + "flos": 27672834723840.0, + "grad_norm": 1.6187947947661157, + "language_loss": 0.68885666, + "learning_rate": 1.4287431145817358e-06, + "loss": 0.71027684, + "num_input_tokens_seen": 216455640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 10053, + "time_per_iteration": 3.8864493370056152 + }, + { + "auxiliary_loss_clip": 0.0102793, + "auxiliary_loss_mlp": 0.01006986, + "balance_loss_clip": 1.00581133, + "balance_loss_mlp": 1.00684035, + "epoch": 0.6044791823237637, + "flos": 65316267515520.0, + "grad_norm": 0.7454166517190239, + "language_loss": 0.6043961, + "learning_rate": 1.4283698885372336e-06, + "loss": 0.62474525, + "num_input_tokens_seen": 216518130, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.2109375, + "step": 10054, + "time_per_iteration": 4.507344961166382 + }, + { + "auxiliary_loss_clip": 0.01104586, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.01997089, + "balance_loss_mlp": 1.03684747, + "epoch": 0.6045393055764317, + "flos": 24492369905280.0, + "grad_norm": 1.6844086395494355, + "language_loss": 0.85636723, + "learning_rate": 1.4279966841690027e-06, + "loss": 0.87774247, + "num_input_tokens_seen": 216536845, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 10055, + "time_per_iteration": 3.930811643600464 + }, + { + "auxiliary_loss_clip": 0.01110141, + "auxiliary_loss_mlp": 0.01039632, + "balance_loss_clip": 1.02585101, + "balance_loss_mlp": 1.04008687, + "epoch": 0.6045994288290997, + "flos": 19054345340160.0, + "grad_norm": 2.2914523857580353, + "language_loss": 0.73531651, + "learning_rate": 1.4276235014911952e-06, + "loss": 0.75681424, + "num_input_tokens_seen": 216551860, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.69921875, + "step": 10056, + "time_per_iteration": 2.424492835998535 + }, + { + "auxiliary_loss_clip": 0.01105735, + "auxiliary_loss_mlp": 0.01033852, + "balance_loss_clip": 1.02206218, + "balance_loss_mlp": 1.03815937, + "epoch": 0.6046595520817676, + "flos": 26576697335040.0, + "grad_norm": 1.6647683047258863, + "language_loss": 0.80205089, + "learning_rate": 1.4272503405179616e-06, + "loss": 0.82344675, + "num_input_tokens_seen": 216574775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 10057, + "time_per_iteration": 2.4988396167755127 + }, + { + "auxiliary_loss_clip": 0.01104511, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01725817, + "balance_loss_mlp": 1.0369792, + "epoch": 0.6047196753344356, + "flos": 13582277660160.0, + "grad_norm": 2.656202002056598, + "language_loss": 0.75172931, + "learning_rate": 1.4268772012634527e-06, + "loss": 0.7730782, + "num_input_tokens_seen": 216590100, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 10058, + "time_per_iteration": 2.4108166694641113 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.0102824, + "balance_loss_clip": 1.01627767, + "balance_loss_mlp": 1.03582406, + "epoch": 0.6047797985871035, + "flos": 25520456977920.0, + "grad_norm": 1.75224691919055, + "language_loss": 0.71103948, + "learning_rate": 1.4265040837418176e-06, + "loss": 0.73235136, + "num_input_tokens_seen": 216610145, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10059, + "time_per_iteration": 2.4859349727630615 + }, + { + "auxiliary_loss_clip": 0.01105606, + "auxiliary_loss_mlp": 0.01029263, + "balance_loss_clip": 1.01686525, + "balance_loss_mlp": 1.03741932, + "epoch": 0.6048399218397715, + "flos": 20520147548160.0, + "grad_norm": 1.587856969701262, + "language_loss": 0.76134253, + "learning_rate": 1.4261309879672054e-06, + "loss": 0.78269112, + "num_input_tokens_seen": 216630625, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10060, + "time_per_iteration": 2.473043918609619 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02149105, + "balance_loss_mlp": 1.03757381, + "epoch": 0.6049000450924396, + "flos": 20408788408320.0, + "grad_norm": 2.1588277388437276, + "language_loss": 0.73414183, + "learning_rate": 1.4257579139537628e-06, + "loss": 0.75552368, + "num_input_tokens_seen": 216649255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10061, + "time_per_iteration": 2.440943956375122 + }, + { + "auxiliary_loss_clip": 0.01105712, + "auxiliary_loss_mlp": 0.01029083, + "balance_loss_clip": 1.01750207, + "balance_loss_mlp": 1.03634655, + "epoch": 0.6049601683451075, + "flos": 20741357456640.0, + "grad_norm": 2.0041380833930145, + "language_loss": 0.67225152, + "learning_rate": 1.425384861715639e-06, + "loss": 0.69359946, + "num_input_tokens_seen": 216668100, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 10062, + "time_per_iteration": 2.4789950847625732 + }, + { + "auxiliary_loss_clip": 0.01105607, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.02592254, + "balance_loss_mlp": 1.03717685, + "epoch": 0.6050202915977755, + "flos": 20083114771200.0, + "grad_norm": 2.163401547344872, + "language_loss": 0.71361917, + "learning_rate": 1.425011831266978e-06, + "loss": 0.73505676, + "num_input_tokens_seen": 216686125, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10063, + "time_per_iteration": 2.43302321434021 + }, + { + "auxiliary_loss_clip": 0.01102028, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.01968336, + "balance_loss_mlp": 1.03561401, + "epoch": 0.6050804148504434, + "flos": 15960821391360.0, + "grad_norm": 1.6164006934985269, + "language_loss": 0.84802878, + "learning_rate": 1.424638822621926e-06, + "loss": 0.86936641, + "num_input_tokens_seen": 216704265, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10064, + "time_per_iteration": 2.447003126144409 + }, + { + "auxiliary_loss_clip": 0.0110348, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02206945, + "balance_loss_mlp": 1.0354557, + "epoch": 0.6051405381031114, + "flos": 17456644391040.0, + "grad_norm": 2.2435880628396587, + "language_loss": 0.79335666, + "learning_rate": 1.4242658357946278e-06, + "loss": 0.81472868, + "num_input_tokens_seen": 216721765, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10065, + "time_per_iteration": 2.437286376953125 + }, + { + "auxiliary_loss_clip": 0.01111102, + "auxiliary_loss_mlp": 0.01031949, + "balance_loss_clip": 1.01874626, + "balance_loss_mlp": 1.03979814, + "epoch": 0.6052006613557793, + "flos": 11400130517760.0, + "grad_norm": 1.9931239622384858, + "language_loss": 0.78788042, + "learning_rate": 1.423892870799226e-06, + "loss": 0.80931091, + "num_input_tokens_seen": 216738295, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7109375, + "step": 10066, + "time_per_iteration": 2.4346959590911865 + }, + { + "auxiliary_loss_clip": 0.01104198, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01523578, + "balance_loss_mlp": 1.03645194, + "epoch": 0.6052607846084473, + "flos": 24750998807040.0, + "grad_norm": 1.5823653049215993, + "language_loss": 0.73320723, + "learning_rate": 1.4235199276498655e-06, + "loss": 0.75452518, + "num_input_tokens_seen": 216759875, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 10067, + "time_per_iteration": 2.5625689029693604 + }, + { + "auxiliary_loss_clip": 0.01107587, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.01783991, + "balance_loss_mlp": 1.03971481, + "epoch": 0.6053209078611153, + "flos": 20741141975040.0, + "grad_norm": 1.6116431503881068, + "language_loss": 0.68952775, + "learning_rate": 1.4231470063606863e-06, + "loss": 0.7109015, + "num_input_tokens_seen": 216780705, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10068, + "time_per_iteration": 2.5137228965759277 + }, + { + "auxiliary_loss_clip": 0.01103779, + "auxiliary_loss_mlp": 0.01030656, + "balance_loss_clip": 1.01877117, + "balance_loss_mlp": 1.03444147, + "epoch": 0.6053810311137833, + "flos": 18953149749120.0, + "grad_norm": 2.4473752710004586, + "language_loss": 0.86667287, + "learning_rate": 1.4227741069458303e-06, + "loss": 0.8880173, + "num_input_tokens_seen": 216797625, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10069, + "time_per_iteration": 2.4172072410583496 + }, + { + "auxiliary_loss_clip": 0.01103834, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01405478, + "balance_loss_mlp": 1.03583956, + "epoch": 0.6054411543664512, + "flos": 23951124794880.0, + "grad_norm": 1.4672457121748899, + "language_loss": 0.83270586, + "learning_rate": 1.4224012294194387e-06, + "loss": 0.85400122, + "num_input_tokens_seen": 216817610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10070, + "time_per_iteration": 2.464062452316284 + }, + { + "auxiliary_loss_clip": 0.01106279, + "auxiliary_loss_mlp": 0.01034036, + "balance_loss_clip": 1.02120876, + "balance_loss_mlp": 1.03630137, + "epoch": 0.6055012776191192, + "flos": 20593979953920.0, + "grad_norm": 1.5142081514734282, + "language_loss": 0.86056209, + "learning_rate": 1.4220283737956496e-06, + "loss": 0.88196522, + "num_input_tokens_seen": 216836835, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10071, + "time_per_iteration": 2.435492515563965 + }, + { + "auxiliary_loss_clip": 0.01108912, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.02138042, + "balance_loss_mlp": 1.03817403, + "epoch": 0.6055614008717871, + "flos": 30298191782400.0, + "grad_norm": 1.7615317101181058, + "language_loss": 0.7703979, + "learning_rate": 1.421655540088603e-06, + "loss": 0.79183424, + "num_input_tokens_seen": 216856760, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 10072, + "time_per_iteration": 2.5326199531555176 + }, + { + "auxiliary_loss_clip": 0.01104713, + "auxiliary_loss_mlp": 0.01026829, + "balance_loss_clip": 1.01362085, + "balance_loss_mlp": 1.03505397, + "epoch": 0.6056215241244551, + "flos": 27125017424640.0, + "grad_norm": 1.81020475903248, + "language_loss": 0.74383593, + "learning_rate": 1.4212827283124367e-06, + "loss": 0.76515132, + "num_input_tokens_seen": 216878795, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6953125, + "step": 10073, + "time_per_iteration": 2.4809958934783936 + }, + { + "auxiliary_loss_clip": 0.01026997, + "auxiliary_loss_mlp": 0.01002422, + "balance_loss_clip": 1.00124216, + "balance_loss_mlp": 1.005988, + "epoch": 0.6056816473771232, + "flos": 56007323925120.0, + "grad_norm": 0.7588463064410728, + "language_loss": 0.55220222, + "learning_rate": 1.4209099384812863e-06, + "loss": 0.57249641, + "num_input_tokens_seen": 216937800, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.2109375, + "step": 10074, + "time_per_iteration": 3.101125717163086 + }, + { + "auxiliary_loss_clip": 0.01105722, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01752663, + "balance_loss_mlp": 1.03776407, + "epoch": 0.6057417706297911, + "flos": 23549499849600.0, + "grad_norm": 1.8033827655021575, + "language_loss": 0.81893396, + "learning_rate": 1.4205371706092894e-06, + "loss": 0.84028631, + "num_input_tokens_seen": 216955280, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10075, + "time_per_iteration": 2.468269109725952 + }, + { + "auxiliary_loss_clip": 0.01105409, + "auxiliary_loss_mlp": 0.01024158, + "balance_loss_clip": 1.01174855, + "balance_loss_mlp": 1.03608966, + "epoch": 0.6058018938824591, + "flos": 27744296832000.0, + "grad_norm": 2.0602815760014392, + "language_loss": 0.78272569, + "learning_rate": 1.4201644247105813e-06, + "loss": 0.80402136, + "num_input_tokens_seen": 216976950, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10076, + "time_per_iteration": 2.4932310581207275 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02113056, + "balance_loss_mlp": 1.0365119, + "epoch": 0.605862017135127, + "flos": 22783381643520.0, + "grad_norm": 1.7482408044671829, + "language_loss": 0.72032154, + "learning_rate": 1.4197917007992964e-06, + "loss": 0.74172914, + "num_input_tokens_seen": 216996945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10077, + "time_per_iteration": 2.4521970748901367 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01030175, + "balance_loss_clip": 1.01777112, + "balance_loss_mlp": 1.03759694, + "epoch": 0.605922140387795, + "flos": 21215019127680.0, + "grad_norm": 2.2939968580618215, + "language_loss": 0.55467492, + "learning_rate": 1.4194189988895682e-06, + "loss": 0.57604587, + "num_input_tokens_seen": 217016580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10078, + "time_per_iteration": 2.4789669513702393 + }, + { + "auxiliary_loss_clip": 0.01106991, + "auxiliary_loss_mlp": 0.01032923, + "balance_loss_clip": 1.02026904, + "balance_loss_mlp": 1.0364964, + "epoch": 0.6059822636404629, + "flos": 27268372604160.0, + "grad_norm": 2.206511673730914, + "language_loss": 0.70283198, + "learning_rate": 1.4190463189955297e-06, + "loss": 0.72423112, + "num_input_tokens_seen": 217037300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10079, + "time_per_iteration": 2.494340181350708 + }, + { + "auxiliary_loss_clip": 0.01105474, + "auxiliary_loss_mlp": 0.01038174, + "balance_loss_clip": 1.02605653, + "balance_loss_mlp": 1.03662014, + "epoch": 0.606042386893131, + "flos": 20631327120000.0, + "grad_norm": 1.7147155998392456, + "language_loss": 0.62479711, + "learning_rate": 1.4186736611313131e-06, + "loss": 0.64623356, + "num_input_tokens_seen": 217055805, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10080, + "time_per_iteration": 2.4511730670928955 + }, + { + "auxiliary_loss_clip": 0.01107796, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.01679373, + "balance_loss_mlp": 1.03799117, + "epoch": 0.6061025101457989, + "flos": 23002293081600.0, + "grad_norm": 1.8405271272242842, + "language_loss": 0.71136117, + "learning_rate": 1.4183010253110492e-06, + "loss": 0.73273432, + "num_input_tokens_seen": 217074175, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10081, + "time_per_iteration": 2.455698251724243 + }, + { + "auxiliary_loss_clip": 0.01105313, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01634336, + "balance_loss_mlp": 1.03703296, + "epoch": 0.6061626333984669, + "flos": 29898937134720.0, + "grad_norm": 1.6262436392400634, + "language_loss": 0.69449544, + "learning_rate": 1.4179284115488691e-06, + "loss": 0.71583533, + "num_input_tokens_seen": 217095695, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10082, + "time_per_iteration": 2.52297043800354 + }, + { + "auxiliary_loss_clip": 0.01106177, + "auxiliary_loss_mlp": 0.01029148, + "balance_loss_clip": 1.01712012, + "balance_loss_mlp": 1.03799009, + "epoch": 0.6062227566511348, + "flos": 25009196745600.0, + "grad_norm": 1.4063428250351147, + "language_loss": 0.65709507, + "learning_rate": 1.4175558198589015e-06, + "loss": 0.67844832, + "num_input_tokens_seen": 217116260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10083, + "time_per_iteration": 2.464259147644043 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01031191, + "balance_loss_clip": 1.01852512, + "balance_loss_mlp": 1.03483891, + "epoch": 0.6062828799038028, + "flos": 19463943104640.0, + "grad_norm": 2.2500443264419423, + "language_loss": 0.74058753, + "learning_rate": 1.4171832502552764e-06, + "loss": 0.76193094, + "num_input_tokens_seen": 217134465, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 10084, + "time_per_iteration": 2.4634742736816406 + }, + { + "auxiliary_loss_clip": 0.01105676, + "auxiliary_loss_mlp": 0.01033057, + "balance_loss_clip": 1.02070093, + "balance_loss_mlp": 1.03634107, + "epoch": 0.6063430031564707, + "flos": 13589568120960.0, + "grad_norm": 2.43129197416672, + "language_loss": 0.72011673, + "learning_rate": 1.4168107027521204e-06, + "loss": 0.74150407, + "num_input_tokens_seen": 217149920, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 10085, + "time_per_iteration": 2.4218525886535645 + }, + { + "auxiliary_loss_clip": 0.01104669, + "auxiliary_loss_mlp": 0.01036153, + "balance_loss_clip": 1.02473903, + "balance_loss_mlp": 1.03681493, + "epoch": 0.6064031264091387, + "flos": 23255499029760.0, + "grad_norm": 2.1595465216971834, + "language_loss": 0.76514173, + "learning_rate": 1.4164381773635605e-06, + "loss": 0.78655005, + "num_input_tokens_seen": 217168165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10086, + "time_per_iteration": 2.466749429702759 + }, + { + "auxiliary_loss_clip": 0.01103719, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.01887345, + "balance_loss_mlp": 1.03720832, + "epoch": 0.6064632496618068, + "flos": 22458462192000.0, + "grad_norm": 1.355452455492161, + "language_loss": 0.72577417, + "learning_rate": 1.4160656741037246e-06, + "loss": 0.74712074, + "num_input_tokens_seen": 217190070, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10087, + "time_per_iteration": 2.465503692626953 + }, + { + "auxiliary_loss_clip": 0.01101232, + "auxiliary_loss_mlp": 0.01029891, + "balance_loss_clip": 1.01915646, + "balance_loss_mlp": 1.03517973, + "epoch": 0.6065233729144747, + "flos": 25118652464640.0, + "grad_norm": 1.707331111485516, + "language_loss": 0.83679116, + "learning_rate": 1.4156931929867355e-06, + "loss": 0.85810244, + "num_input_tokens_seen": 217209370, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 10088, + "time_per_iteration": 2.490476369857788 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01027327, + "balance_loss_clip": 1.01454818, + "balance_loss_mlp": 1.03563654, + "epoch": 0.6065834961671427, + "flos": 23477355383040.0, + "grad_norm": 2.1730607876548924, + "language_loss": 0.7139647, + "learning_rate": 1.4153207340267201e-06, + "loss": 0.73526812, + "num_input_tokens_seen": 217226990, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 10089, + "time_per_iteration": 2.4656596183776855 + }, + { + "auxiliary_loss_clip": 0.0110663, + "auxiliary_loss_mlp": 0.01036262, + "balance_loss_clip": 1.0252049, + "balance_loss_mlp": 1.0383575, + "epoch": 0.6066436194198106, + "flos": 17019396132480.0, + "grad_norm": 1.8527545733498374, + "language_loss": 0.82743609, + "learning_rate": 1.4149482972378009e-06, + "loss": 0.84886503, + "num_input_tokens_seen": 217244585, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.68359375, + "step": 10090, + "time_per_iteration": 2.4523448944091797 + }, + { + "auxiliary_loss_clip": 0.01112391, + "auxiliary_loss_mlp": 0.01036313, + "balance_loss_clip": 1.0230329, + "balance_loss_mlp": 1.03768897, + "epoch": 0.6067037426724786, + "flos": 18514752255360.0, + "grad_norm": 2.0611786286574514, + "language_loss": 0.75486428, + "learning_rate": 1.4145758826341e-06, + "loss": 0.77635133, + "num_input_tokens_seen": 217263435, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.74609375, + "step": 10091, + "time_per_iteration": 2.412745475769043 + }, + { + "auxiliary_loss_clip": 0.01103456, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02224016, + "balance_loss_mlp": 1.03655899, + "epoch": 0.6067638659251465, + "flos": 22345989730560.0, + "grad_norm": 2.008159335053083, + "language_loss": 0.79580414, + "learning_rate": 1.4142034902297415e-06, + "loss": 0.81718373, + "num_input_tokens_seen": 217283725, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 10092, + "time_per_iteration": 2.4787280559539795 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01036056, + "balance_loss_clip": 1.02313423, + "balance_loss_mlp": 1.03692424, + "epoch": 0.6068239891778145, + "flos": 12451019748480.0, + "grad_norm": 1.8882550633479742, + "language_loss": 0.76085305, + "learning_rate": 1.4138311200388444e-06, + "loss": 0.78229213, + "num_input_tokens_seen": 217301120, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10093, + "time_per_iteration": 3.8885409832000732 + }, + { + "auxiliary_loss_clip": 0.01103337, + "auxiliary_loss_mlp": 0.01033728, + "balance_loss_clip": 1.02215874, + "balance_loss_mlp": 1.03691947, + "epoch": 0.6068841124304825, + "flos": 23185868515200.0, + "grad_norm": 2.3186576779301387, + "language_loss": 0.87448221, + "learning_rate": 1.4134587720755304e-06, + "loss": 0.89585286, + "num_input_tokens_seen": 217319585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 10094, + "time_per_iteration": 2.4714174270629883 + }, + { + "auxiliary_loss_clip": 0.01105151, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.01992464, + "balance_loss_mlp": 1.03669322, + "epoch": 0.6069442356831505, + "flos": 18587902302720.0, + "grad_norm": 1.5923423583427312, + "language_loss": 0.71694756, + "learning_rate": 1.413086446353919e-06, + "loss": 0.73832405, + "num_input_tokens_seen": 217338880, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10095, + "time_per_iteration": 3.852285861968994 + }, + { + "auxiliary_loss_clip": 0.01105359, + "auxiliary_loss_mlp": 0.01028972, + "balance_loss_clip": 1.01727819, + "balance_loss_mlp": 1.036134, + "epoch": 0.6070043589358184, + "flos": 20960340721920.0, + "grad_norm": 1.6817389817846544, + "language_loss": 0.76919025, + "learning_rate": 1.4127141428881273e-06, + "loss": 0.7905336, + "num_input_tokens_seen": 217357480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 10096, + "time_per_iteration": 3.9708244800567627 + }, + { + "auxiliary_loss_clip": 0.01107233, + "auxiliary_loss_mlp": 0.0104091, + "balance_loss_clip": 1.0291853, + "balance_loss_mlp": 1.03734136, + "epoch": 0.6070644821884864, + "flos": 11692443398400.0, + "grad_norm": 1.7249712415107992, + "language_loss": 0.79864824, + "learning_rate": 1.4123418616922749e-06, + "loss": 0.82012963, + "num_input_tokens_seen": 217374575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69921875, + "step": 10097, + "time_per_iteration": 2.4229838848114014 + }, + { + "auxiliary_loss_clip": 0.01102947, + "auxiliary_loss_mlp": 0.01030901, + "balance_loss_clip": 1.01920676, + "balance_loss_mlp": 1.03555632, + "epoch": 0.6071246054411543, + "flos": 19310568030720.0, + "grad_norm": 1.4260099040951442, + "language_loss": 0.67338455, + "learning_rate": 1.411969602780478e-06, + "loss": 0.69472301, + "num_input_tokens_seen": 217392950, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10098, + "time_per_iteration": 3.9603915214538574 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01031482, + "balance_loss_clip": 1.01989472, + "balance_loss_mlp": 1.03617251, + "epoch": 0.6071847286938223, + "flos": 17749029098880.0, + "grad_norm": 1.8973033677095168, + "language_loss": 0.80694121, + "learning_rate": 1.4115973661668523e-06, + "loss": 0.82829416, + "num_input_tokens_seen": 217412145, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 10099, + "time_per_iteration": 2.409189462661743 + }, + { + "auxiliary_loss_clip": 0.01108088, + "auxiliary_loss_mlp": 0.01034923, + "balance_loss_clip": 1.02195358, + "balance_loss_mlp": 1.03531229, + "epoch": 0.6072448519464904, + "flos": 22637512512000.0, + "grad_norm": 2.230451803545553, + "language_loss": 0.70439708, + "learning_rate": 1.4112251518655133e-06, + "loss": 0.72582722, + "num_input_tokens_seen": 217432080, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10100, + "time_per_iteration": 2.484339952468872 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02344251, + "balance_loss_mlp": 1.03890038, + "epoch": 0.6073049751991583, + "flos": 19537308633600.0, + "grad_norm": 1.5791187964785582, + "language_loss": 0.70447475, + "learning_rate": 1.4108529598905764e-06, + "loss": 0.72591841, + "num_input_tokens_seen": 217450945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10101, + "time_per_iteration": 2.4309775829315186 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.02154672, + "balance_loss_mlp": 1.03490043, + "epoch": 0.6073650984518263, + "flos": 28294233033600.0, + "grad_norm": 1.6995748618566444, + "language_loss": 0.69606161, + "learning_rate": 1.410480790256154e-06, + "loss": 0.71741861, + "num_input_tokens_seen": 217473105, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10102, + "time_per_iteration": 2.524376630783081 + }, + { + "auxiliary_loss_clip": 0.01107251, + "auxiliary_loss_mlp": 0.01033826, + "balance_loss_clip": 1.02220285, + "balance_loss_mlp": 1.0382359, + "epoch": 0.6074252217044942, + "flos": 25664422688640.0, + "grad_norm": 1.7952265928760782, + "language_loss": 0.73694891, + "learning_rate": 1.4101086429763589e-06, + "loss": 0.75835967, + "num_input_tokens_seen": 217491780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10103, + "time_per_iteration": 2.4625236988067627 + }, + { + "auxiliary_loss_clip": 0.01110432, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.02295542, + "balance_loss_mlp": 1.03862011, + "epoch": 0.6074853449571622, + "flos": 22857106308480.0, + "grad_norm": 1.6961753672547197, + "language_loss": 0.76819229, + "learning_rate": 1.4097365180653032e-06, + "loss": 0.7896592, + "num_input_tokens_seen": 217510605, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 10104, + "time_per_iteration": 2.467879056930542 + }, + { + "auxiliary_loss_clip": 0.01028848, + "auxiliary_loss_mlp": 0.01014471, + "balance_loss_clip": 1.01331425, + "balance_loss_mlp": 1.00746071, + "epoch": 0.6075454682098301, + "flos": 67111406547840.0, + "grad_norm": 0.7111703190831327, + "language_loss": 0.56059039, + "learning_rate": 1.4093644155370977e-06, + "loss": 0.58102357, + "num_input_tokens_seen": 217574815, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.21484375, + "step": 10105, + "time_per_iteration": 3.066772222518921 + }, + { + "auxiliary_loss_clip": 0.01028964, + "auxiliary_loss_mlp": 0.01012366, + "balance_loss_clip": 1.01119196, + "balance_loss_mlp": 1.00750494, + "epoch": 0.6076055914624982, + "flos": 70712024751360.0, + "grad_norm": 0.7555703523663572, + "language_loss": 0.56791615, + "learning_rate": 1.4089923354058533e-06, + "loss": 0.58832943, + "num_input_tokens_seen": 217632375, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21484375, + "step": 10106, + "time_per_iteration": 3.0346710681915283 + }, + { + "auxiliary_loss_clip": 0.01103036, + "auxiliary_loss_mlp": 0.01033262, + "balance_loss_clip": 1.02136517, + "balance_loss_mlp": 1.03558111, + "epoch": 0.6076657147151661, + "flos": 28364545906560.0, + "grad_norm": 1.556971911912289, + "language_loss": 0.68647003, + "learning_rate": 1.4086202776856784e-06, + "loss": 0.70783293, + "num_input_tokens_seen": 217653055, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10107, + "time_per_iteration": 2.5070221424102783 + }, + { + "auxiliary_loss_clip": 0.0110868, + "auxiliary_loss_mlp": 0.01030388, + "balance_loss_clip": 1.01801395, + "balance_loss_mlp": 1.03806663, + "epoch": 0.6077258379678341, + "flos": 15049767807360.0, + "grad_norm": 2.0591355858624594, + "language_loss": 0.81006205, + "learning_rate": 1.4082482423906815e-06, + "loss": 0.83145273, + "num_input_tokens_seen": 217671520, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 10108, + "time_per_iteration": 2.449876070022583 + }, + { + "auxiliary_loss_clip": 0.01109814, + "auxiliary_loss_mlp": 0.01031426, + "balance_loss_clip": 1.01798475, + "balance_loss_mlp": 1.03772831, + "epoch": 0.607785961220502, + "flos": 36167251553280.0, + "grad_norm": 1.6885620074685026, + "language_loss": 0.70979893, + "learning_rate": 1.4078762295349714e-06, + "loss": 0.7312113, + "num_input_tokens_seen": 217691880, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10109, + "time_per_iteration": 2.569441318511963 + }, + { + "auxiliary_loss_clip": 0.01101619, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01598346, + "balance_loss_mlp": 1.0354414, + "epoch": 0.60784608447317, + "flos": 22524249951360.0, + "grad_norm": 1.5138210455097567, + "language_loss": 0.80043399, + "learning_rate": 1.407504239132653e-06, + "loss": 0.82172269, + "num_input_tokens_seen": 217710530, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10110, + "time_per_iteration": 2.5667614936828613 + }, + { + "auxiliary_loss_clip": 0.01107667, + "auxiliary_loss_mlp": 0.01029434, + "balance_loss_clip": 1.01691723, + "balance_loss_mlp": 1.03725386, + "epoch": 0.6079062077258379, + "flos": 23841166285440.0, + "grad_norm": 2.0834448443085463, + "language_loss": 0.7047748, + "learning_rate": 1.4071322711978338e-06, + "loss": 0.72614574, + "num_input_tokens_seen": 217728650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 10111, + "time_per_iteration": 2.449047565460205 + }, + { + "auxiliary_loss_clip": 0.01107266, + "auxiliary_loss_mlp": 0.01030401, + "balance_loss_clip": 1.01765776, + "balance_loss_mlp": 1.03687668, + "epoch": 0.6079663309785059, + "flos": 23367037737600.0, + "grad_norm": 1.8731958384235612, + "language_loss": 0.65437806, + "learning_rate": 1.4067603257446186e-06, + "loss": 0.67575473, + "num_input_tokens_seen": 217747135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10112, + "time_per_iteration": 2.5956103801727295 + }, + { + "auxiliary_loss_clip": 0.01028267, + "auxiliary_loss_mlp": 0.00997544, + "balance_loss_clip": 0.99637622, + "balance_loss_mlp": 1.00686228, + "epoch": 0.6080264542311739, + "flos": 71382873110400.0, + "grad_norm": 0.6257418493150695, + "language_loss": 0.49600247, + "learning_rate": 1.4063884027871105e-06, + "loss": 0.51626056, + "num_input_tokens_seen": 217811860, + "router_z_loss_clip": 0.01165771, + "router_z_loss_mlp": 0.21484375, + "step": 10113, + "time_per_iteration": 3.0929043292999268 + }, + { + "auxiliary_loss_clip": 0.01027496, + "auxiliary_loss_mlp": 0.01000577, + "balance_loss_clip": 0.99939102, + "balance_loss_mlp": 1.0062747, + "epoch": 0.6080865774838419, + "flos": 66529833442560.0, + "grad_norm": 0.8371205862323671, + "language_loss": 0.56964719, + "learning_rate": 1.4060165023394147e-06, + "loss": 0.58992791, + "num_input_tokens_seen": 217866510, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21289062, + "step": 10114, + "time_per_iteration": 2.9712812900543213 + }, + { + "auxiliary_loss_clip": 0.01109587, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.01581991, + "balance_loss_mlp": 1.03810143, + "epoch": 0.6081467007365099, + "flos": 19207935895680.0, + "grad_norm": 1.7367632173905274, + "language_loss": 0.69756359, + "learning_rate": 1.4056446244156317e-06, + "loss": 0.71895409, + "num_input_tokens_seen": 217885650, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 10115, + "time_per_iteration": 2.4941470623016357 + }, + { + "auxiliary_loss_clip": 0.01106631, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.01550055, + "balance_loss_mlp": 1.03715134, + "epoch": 0.6082068239891778, + "flos": 24167737762560.0, + "grad_norm": 18.577098805589706, + "language_loss": 0.72356099, + "learning_rate": 1.4052727690298642e-06, + "loss": 0.74490488, + "num_input_tokens_seen": 217905300, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10116, + "time_per_iteration": 2.448673725128174 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01034138, + "balance_loss_clip": 1.02089953, + "balance_loss_mlp": 1.03622699, + "epoch": 0.6082669472418458, + "flos": 37413316310400.0, + "grad_norm": 1.8751462040451332, + "language_loss": 0.53553987, + "learning_rate": 1.4049009361962138e-06, + "loss": 0.55695611, + "num_input_tokens_seen": 217927845, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10117, + "time_per_iteration": 2.592958927154541 + }, + { + "auxiliary_loss_clip": 0.01106561, + "auxiliary_loss_mlp": 0.01025434, + "balance_loss_clip": 1.01340544, + "balance_loss_mlp": 1.03709269, + "epoch": 0.6083270704945137, + "flos": 15085534775040.0, + "grad_norm": 1.965088318697828, + "language_loss": 0.69835466, + "learning_rate": 1.4045291259287786e-06, + "loss": 0.71967459, + "num_input_tokens_seen": 217946145, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10118, + "time_per_iteration": 2.4184305667877197 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.01545143, + "balance_loss_mlp": 1.03855991, + "epoch": 0.6083871937471818, + "flos": 20668458804480.0, + "grad_norm": 1.4929706938116498, + "language_loss": 0.74641609, + "learning_rate": 1.4041573382416588e-06, + "loss": 0.7677654, + "num_input_tokens_seen": 217965190, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10119, + "time_per_iteration": 2.4534857273101807 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01033724, + "balance_loss_clip": 1.02189887, + "balance_loss_mlp": 1.03641152, + "epoch": 0.6084473169998497, + "flos": 21506901045120.0, + "grad_norm": 1.5799518634527623, + "language_loss": 0.67427665, + "learning_rate": 1.4037855731489525e-06, + "loss": 0.69565779, + "num_input_tokens_seen": 217983625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10120, + "time_per_iteration": 2.439384937286377 + }, + { + "auxiliary_loss_clip": 0.01109214, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.01977515, + "balance_loss_mlp": 1.03793478, + "epoch": 0.6085074402525177, + "flos": 26870051710080.0, + "grad_norm": 1.74219428879995, + "language_loss": 0.74141055, + "learning_rate": 1.4034138306647571e-06, + "loss": 0.76282924, + "num_input_tokens_seen": 218006005, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10121, + "time_per_iteration": 2.506490707397461 + }, + { + "auxiliary_loss_clip": 0.01103145, + "auxiliary_loss_mlp": 0.01026601, + "balance_loss_clip": 1.01530576, + "balance_loss_mlp": 1.03512359, + "epoch": 0.6085675635051856, + "flos": 10889839952640.0, + "grad_norm": 1.7909457152882267, + "language_loss": 0.80599827, + "learning_rate": 1.4030421108031685e-06, + "loss": 0.82729572, + "num_input_tokens_seen": 218024195, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10122, + "time_per_iteration": 2.422988176345825 + }, + { + "auxiliary_loss_clip": 0.01107244, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01846766, + "balance_loss_mlp": 1.03843355, + "epoch": 0.6086276867578536, + "flos": 34862186707200.0, + "grad_norm": 1.4671658127927028, + "language_loss": 0.55411458, + "learning_rate": 1.402670413578284e-06, + "loss": 0.57549489, + "num_input_tokens_seen": 218047190, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10123, + "time_per_iteration": 2.6203012466430664 + }, + { + "auxiliary_loss_clip": 0.01104564, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.01975298, + "balance_loss_mlp": 1.03711987, + "epoch": 0.6086878100105215, + "flos": 20047706939520.0, + "grad_norm": 1.7982570079112092, + "language_loss": 0.73612612, + "learning_rate": 1.4022987390041965e-06, + "loss": 0.75749022, + "num_input_tokens_seen": 218065945, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10124, + "time_per_iteration": 2.465306282043457 + }, + { + "auxiliary_loss_clip": 0.01105892, + "auxiliary_loss_mlp": 0.01030108, + "balance_loss_clip": 1.01791847, + "balance_loss_mlp": 1.03691709, + "epoch": 0.6087479332631895, + "flos": 18332469711360.0, + "grad_norm": 3.6424543705255648, + "language_loss": 0.66014802, + "learning_rate": 1.4019270870950006e-06, + "loss": 0.681508, + "num_input_tokens_seen": 218085285, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10125, + "time_per_iteration": 2.4767675399780273 + }, + { + "auxiliary_loss_clip": 0.01105208, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.01943719, + "balance_loss_mlp": 1.03736734, + "epoch": 0.6088080565158575, + "flos": 24493411399680.0, + "grad_norm": 2.3623427434848066, + "language_loss": 0.76202977, + "learning_rate": 1.40155545786479e-06, + "loss": 0.78339827, + "num_input_tokens_seen": 218104735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10126, + "time_per_iteration": 2.469557046890259 + }, + { + "auxiliary_loss_clip": 0.01109286, + "auxiliary_loss_mlp": 0.01028819, + "balance_loss_clip": 1.01583672, + "balance_loss_mlp": 1.03710127, + "epoch": 0.6088681797685255, + "flos": 10269016260480.0, + "grad_norm": 2.6026801218546036, + "language_loss": 0.71315622, + "learning_rate": 1.4011838513276558e-06, + "loss": 0.73453724, + "num_input_tokens_seen": 218121855, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.72265625, + "step": 10127, + "time_per_iteration": 2.463219404220581 + }, + { + "auxiliary_loss_clip": 0.01110954, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02028704, + "balance_loss_mlp": 1.03909373, + "epoch": 0.6089283030211935, + "flos": 21973703218560.0, + "grad_norm": 2.879650268865683, + "language_loss": 0.72776711, + "learning_rate": 1.400812267497691e-06, + "loss": 0.74921077, + "num_input_tokens_seen": 218137325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 10128, + "time_per_iteration": 2.4591028690338135 + }, + { + "auxiliary_loss_clip": 0.01105059, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01816392, + "balance_loss_mlp": 1.03707957, + "epoch": 0.6089884262738614, + "flos": 17785191116160.0, + "grad_norm": 4.4407298106903585, + "language_loss": 0.73322678, + "learning_rate": 1.4004407063889842e-06, + "loss": 0.75457883, + "num_input_tokens_seen": 218155530, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 10129, + "time_per_iteration": 2.463595151901245 + }, + { + "auxiliary_loss_clip": 0.01104701, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.02273047, + "balance_loss_mlp": 1.03612173, + "epoch": 0.6090485495265294, + "flos": 36910423946880.0, + "grad_norm": 1.3648179669909797, + "language_loss": 0.65579844, + "learning_rate": 1.400069168015626e-06, + "loss": 0.67719507, + "num_input_tokens_seen": 218182535, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10130, + "time_per_iteration": 2.638197183609009 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.01360381, + "balance_loss_mlp": 1.03529549, + "epoch": 0.6091086727791973, + "flos": 19899036547200.0, + "grad_norm": 1.5719208851669182, + "language_loss": 0.77160382, + "learning_rate": 1.3996976523917054e-06, + "loss": 0.79286647, + "num_input_tokens_seen": 218201740, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10131, + "time_per_iteration": 2.4989805221557617 + }, + { + "auxiliary_loss_clip": 0.01104899, + "auxiliary_loss_mlp": 0.01030421, + "balance_loss_clip": 1.01976359, + "balance_loss_mlp": 1.03746176, + "epoch": 0.6091687960318654, + "flos": 22163635359360.0, + "grad_norm": 1.8809693968510182, + "language_loss": 0.76772207, + "learning_rate": 1.3993261595313093e-06, + "loss": 0.78907526, + "num_input_tokens_seen": 218219800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.67578125, + "step": 10132, + "time_per_iteration": 2.4471144676208496 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.0103227, + "balance_loss_clip": 1.02116609, + "balance_loss_mlp": 1.035465, + "epoch": 0.6092289192845333, + "flos": 21465280160640.0, + "grad_norm": 1.8031513435586903, + "language_loss": 0.75461888, + "learning_rate": 1.3989546894485261e-06, + "loss": 0.77594435, + "num_input_tokens_seen": 218237585, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 10133, + "time_per_iteration": 2.4543044567108154 + }, + { + "auxiliary_loss_clip": 0.01104667, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01921129, + "balance_loss_mlp": 1.03661132, + "epoch": 0.6092890425372013, + "flos": 28694924225280.0, + "grad_norm": 1.617219095446177, + "language_loss": 0.63404942, + "learning_rate": 1.3985832421574414e-06, + "loss": 0.65541649, + "num_input_tokens_seen": 218258700, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10134, + "time_per_iteration": 2.4968786239624023 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01722288, + "balance_loss_mlp": 1.03555775, + "epoch": 0.6093491657898692, + "flos": 20813178700800.0, + "grad_norm": 1.7042888689612277, + "language_loss": 0.78689611, + "learning_rate": 1.3982118176721397e-06, + "loss": 0.80820185, + "num_input_tokens_seen": 218275655, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10135, + "time_per_iteration": 3.8730435371398926 + }, + { + "auxiliary_loss_clip": 0.01105216, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01840401, + "balance_loss_mlp": 1.03660417, + "epoch": 0.6094092890425372, + "flos": 25446983708160.0, + "grad_norm": 2.4676554523748115, + "language_loss": 0.72265971, + "learning_rate": 1.3978404160067069e-06, + "loss": 0.7440083, + "num_input_tokens_seen": 218295720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10136, + "time_per_iteration": 2.721339464187622 + }, + { + "auxiliary_loss_clip": 0.01108039, + "auxiliary_loss_mlp": 0.0102914, + "balance_loss_clip": 1.01704586, + "balance_loss_mlp": 1.0386939, + "epoch": 0.6094694122952051, + "flos": 35621265847680.0, + "grad_norm": 2.10435735907629, + "language_loss": 0.74540055, + "learning_rate": 1.3974690371752253e-06, + "loss": 0.76677233, + "num_input_tokens_seen": 218316745, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10137, + "time_per_iteration": 3.969383716583252 + }, + { + "auxiliary_loss_clip": 0.01106599, + "auxiliary_loss_mlp": 0.01040095, + "balance_loss_clip": 1.027542, + "balance_loss_mlp": 1.03668833, + "epoch": 0.6095295355478731, + "flos": 24456962073600.0, + "grad_norm": 1.7200645743924223, + "language_loss": 0.80628771, + "learning_rate": 1.3970976811917785e-06, + "loss": 0.82775462, + "num_input_tokens_seen": 218335385, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10138, + "time_per_iteration": 3.9027063846588135 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01837766, + "balance_loss_mlp": 1.03564954, + "epoch": 0.6095896588005411, + "flos": 15633208419840.0, + "grad_norm": 1.5498019522052684, + "language_loss": 0.80843186, + "learning_rate": 1.3967263480704481e-06, + "loss": 0.82972997, + "num_input_tokens_seen": 218353320, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 10139, + "time_per_iteration": 3.9400634765625 + }, + { + "auxiliary_loss_clip": 0.01108872, + "auxiliary_loss_mlp": 0.01034626, + "balance_loss_clip": 1.02206182, + "balance_loss_mlp": 1.03840351, + "epoch": 0.6096497820532091, + "flos": 15550577182080.0, + "grad_norm": 2.0925165633907254, + "language_loss": 0.8375181, + "learning_rate": 1.396355037825315e-06, + "loss": 0.85895312, + "num_input_tokens_seen": 218365620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10140, + "time_per_iteration": 2.4656758308410645 + }, + { + "auxiliary_loss_clip": 0.01105652, + "auxiliary_loss_mlp": 0.01035103, + "balance_loss_clip": 1.02315855, + "balance_loss_mlp": 1.03600419, + "epoch": 0.6097099053058771, + "flos": 24204474397440.0, + "grad_norm": 2.1792852747623557, + "language_loss": 0.75585604, + "learning_rate": 1.3959837504704592e-06, + "loss": 0.77726358, + "num_input_tokens_seen": 218383785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10141, + "time_per_iteration": 2.574366331100464 + }, + { + "auxiliary_loss_clip": 0.01104603, + "auxiliary_loss_mlp": 0.01026989, + "balance_loss_clip": 1.01502669, + "balance_loss_mlp": 1.03598619, + "epoch": 0.609770028558545, + "flos": 19570238426880.0, + "grad_norm": 1.9409433083757806, + "language_loss": 0.76637286, + "learning_rate": 1.3956124860199603e-06, + "loss": 0.78768879, + "num_input_tokens_seen": 218399055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10142, + "time_per_iteration": 2.4868385791778564 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.01032927, + "balance_loss_clip": 1.02058291, + "balance_loss_mlp": 1.03676569, + "epoch": 0.609830151811213, + "flos": 23949185460480.0, + "grad_norm": 1.745652179186059, + "language_loss": 0.76381373, + "learning_rate": 1.3952412444878964e-06, + "loss": 0.7851907, + "num_input_tokens_seen": 218419120, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 10143, + "time_per_iteration": 2.5635735988616943 + }, + { + "auxiliary_loss_clip": 0.01106393, + "auxiliary_loss_mlp": 0.01030422, + "balance_loss_clip": 1.01801276, + "balance_loss_mlp": 1.03715992, + "epoch": 0.6098902750638809, + "flos": 16179732829440.0, + "grad_norm": 1.7435526117723426, + "language_loss": 0.74993449, + "learning_rate": 1.3948700258883448e-06, + "loss": 0.77130264, + "num_input_tokens_seen": 218435290, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 10144, + "time_per_iteration": 2.4298861026763916 + }, + { + "auxiliary_loss_clip": 0.01107837, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01527548, + "balance_loss_mlp": 1.03741479, + "epoch": 0.609950398316549, + "flos": 44526393763200.0, + "grad_norm": 2.307147766408813, + "language_loss": 0.72727025, + "learning_rate": 1.394498830235383e-06, + "loss": 0.74862915, + "num_input_tokens_seen": 218457880, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10145, + "time_per_iteration": 2.694578170776367 + }, + { + "auxiliary_loss_clip": 0.01104204, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01882768, + "balance_loss_mlp": 1.036484, + "epoch": 0.6100105215692169, + "flos": 23221743223680.0, + "grad_norm": 7.584582797643419, + "language_loss": 0.69428813, + "learning_rate": 1.3941276575430862e-06, + "loss": 0.71563041, + "num_input_tokens_seen": 218475930, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6796875, + "step": 10146, + "time_per_iteration": 2.4656052589416504 + }, + { + "auxiliary_loss_clip": 0.01102864, + "auxiliary_loss_mlp": 0.0102747, + "balance_loss_clip": 1.01665735, + "balance_loss_mlp": 1.03688705, + "epoch": 0.6100706448218849, + "flos": 15012564295680.0, + "grad_norm": 1.601297479877826, + "language_loss": 0.76745832, + "learning_rate": 1.3937565078255289e-06, + "loss": 0.78876168, + "num_input_tokens_seen": 218493675, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 10147, + "time_per_iteration": 2.5520474910736084 + }, + { + "auxiliary_loss_clip": 0.01103536, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.01436126, + "balance_loss_mlp": 1.03525686, + "epoch": 0.6101307680745528, + "flos": 19639976682240.0, + "grad_norm": 1.740411663388647, + "language_loss": 0.78028274, + "learning_rate": 1.393385381096786e-06, + "loss": 0.80158353, + "num_input_tokens_seen": 218511780, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10148, + "time_per_iteration": 2.4648149013519287 + }, + { + "auxiliary_loss_clip": 0.01110722, + "auxiliary_loss_mlp": 0.0103437, + "balance_loss_clip": 1.02107859, + "balance_loss_mlp": 1.03736377, + "epoch": 0.6101908913272208, + "flos": 29935566028800.0, + "grad_norm": 2.1220511331050758, + "language_loss": 0.53903639, + "learning_rate": 1.39301427737093e-06, + "loss": 0.56048727, + "num_input_tokens_seen": 218531850, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.734375, + "step": 10149, + "time_per_iteration": 2.566124200820923 + }, + { + "auxiliary_loss_clip": 0.01101762, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01953661, + "balance_loss_mlp": 1.03660202, + "epoch": 0.6102510145798887, + "flos": 21798639308160.0, + "grad_norm": 1.8365676346298867, + "language_loss": 0.80172944, + "learning_rate": 1.3926431966620333e-06, + "loss": 0.82305747, + "num_input_tokens_seen": 218551245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 10150, + "time_per_iteration": 2.5030646324157715 + }, + { + "auxiliary_loss_clip": 0.01108008, + "auxiliary_loss_mlp": 0.01035252, + "balance_loss_clip": 1.02272308, + "balance_loss_mlp": 1.0384438, + "epoch": 0.6103111378325567, + "flos": 20706129192960.0, + "grad_norm": 1.5453703107618904, + "language_loss": 0.69006532, + "learning_rate": 1.3922721389841684e-06, + "loss": 0.7114979, + "num_input_tokens_seen": 218571365, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10151, + "time_per_iteration": 2.5013327598571777 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01028825, + "balance_loss_clip": 1.01780438, + "balance_loss_mlp": 1.0351758, + "epoch": 0.6103712610852247, + "flos": 29381643417600.0, + "grad_norm": 1.64819750933, + "language_loss": 0.70659781, + "learning_rate": 1.3919011043514036e-06, + "loss": 0.7279191, + "num_input_tokens_seen": 218588315, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6796875, + "step": 10152, + "time_per_iteration": 2.519719362258911 + }, + { + "auxiliary_loss_clip": 0.01107575, + "auxiliary_loss_mlp": 0.01032635, + "balance_loss_clip": 1.02031481, + "balance_loss_mlp": 1.03778815, + "epoch": 0.6104313843378927, + "flos": 20813035046400.0, + "grad_norm": 2.061001889975494, + "language_loss": 0.77937526, + "learning_rate": 1.391530092777811e-06, + "loss": 0.80077732, + "num_input_tokens_seen": 218605940, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10153, + "time_per_iteration": 2.4679317474365234 + }, + { + "auxiliary_loss_clip": 0.01106601, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.01775157, + "balance_loss_mlp": 1.03693819, + "epoch": 0.6104915075905607, + "flos": 26578457101440.0, + "grad_norm": 1.6071348715593325, + "language_loss": 0.79040915, + "learning_rate": 1.3911591042774573e-06, + "loss": 0.8117736, + "num_input_tokens_seen": 218626100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10154, + "time_per_iteration": 2.4811360836029053 + }, + { + "auxiliary_loss_clip": 0.01102999, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 1.03598225, + "epoch": 0.6105516308432286, + "flos": 23915788790400.0, + "grad_norm": 1.696167937827746, + "language_loss": 0.70110655, + "learning_rate": 1.3907881388644116e-06, + "loss": 0.72242928, + "num_input_tokens_seen": 218645060, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 10155, + "time_per_iteration": 2.4926087856292725 + }, + { + "auxiliary_loss_clip": 0.01107481, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01622927, + "balance_loss_mlp": 1.03898025, + "epoch": 0.6106117540958966, + "flos": 31577365900800.0, + "grad_norm": 1.5701440613704458, + "language_loss": 0.7118175, + "learning_rate": 1.3904171965527413e-06, + "loss": 0.73317862, + "num_input_tokens_seen": 218667690, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 10156, + "time_per_iteration": 2.529212236404419 + }, + { + "auxiliary_loss_clip": 0.01103012, + "auxiliary_loss_mlp": 0.01029909, + "balance_loss_clip": 1.01777911, + "balance_loss_mlp": 1.0372014, + "epoch": 0.6106718773485645, + "flos": 19608160210560.0, + "grad_norm": 1.5875405214127527, + "language_loss": 0.67776453, + "learning_rate": 1.3900462773565114e-06, + "loss": 0.69909376, + "num_input_tokens_seen": 218687505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 10157, + "time_per_iteration": 2.4632043838500977 + }, + { + "auxiliary_loss_clip": 0.01103689, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.01682067, + "balance_loss_mlp": 1.03470659, + "epoch": 0.6107320006012326, + "flos": 17123895774720.0, + "grad_norm": 1.8568219075391552, + "language_loss": 0.72478032, + "learning_rate": 1.3896753812897877e-06, + "loss": 0.74610317, + "num_input_tokens_seen": 218705315, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10158, + "time_per_iteration": 2.419174909591675 + }, + { + "auxiliary_loss_clip": 0.01106137, + "auxiliary_loss_mlp": 0.01032049, + "balance_loss_clip": 1.02036619, + "balance_loss_mlp": 1.03673482, + "epoch": 0.6107921238539005, + "flos": 30148228500480.0, + "grad_norm": 1.8610687942781703, + "language_loss": 0.69770175, + "learning_rate": 1.389304508366635e-06, + "loss": 0.71908361, + "num_input_tokens_seen": 218725735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 10159, + "time_per_iteration": 2.5595028400421143 + }, + { + "auxiliary_loss_clip": 0.01106993, + "auxiliary_loss_mlp": 0.01031058, + "balance_loss_clip": 1.01866579, + "balance_loss_mlp": 1.03715146, + "epoch": 0.6108522471065685, + "flos": 18440273404800.0, + "grad_norm": 1.8623845683480673, + "language_loss": 0.79084963, + "learning_rate": 1.3889336586011167e-06, + "loss": 0.81223011, + "num_input_tokens_seen": 218743215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10160, + "time_per_iteration": 2.4194223880767822 + }, + { + "auxiliary_loss_clip": 0.01029586, + "auxiliary_loss_mlp": 0.0100036, + "balance_loss_clip": 0.99904329, + "balance_loss_mlp": 1.00828457, + "epoch": 0.6109123703592364, + "flos": 64135454791680.0, + "grad_norm": 0.8176802836469281, + "language_loss": 0.61464268, + "learning_rate": 1.388562832007295e-06, + "loss": 0.63494217, + "num_input_tokens_seen": 218806440, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.21289062, + "step": 10161, + "time_per_iteration": 3.204864501953125 + }, + { + "auxiliary_loss_clip": 0.01106906, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.02142394, + "balance_loss_mlp": 1.03706193, + "epoch": 0.6109724936119044, + "flos": 20667848273280.0, + "grad_norm": 1.7743481380342319, + "language_loss": 0.76395631, + "learning_rate": 1.3881920285992324e-06, + "loss": 0.78536499, + "num_input_tokens_seen": 218825720, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 10162, + "time_per_iteration": 2.4414381980895996 + }, + { + "auxiliary_loss_clip": 0.01104818, + "auxiliary_loss_mlp": 0.01030642, + "balance_loss_clip": 1.01866198, + "balance_loss_mlp": 1.0372498, + "epoch": 0.6110326168645723, + "flos": 31351882273920.0, + "grad_norm": 2.0274139033268077, + "language_loss": 0.71609962, + "learning_rate": 1.3878212483909888e-06, + "loss": 0.73745424, + "num_input_tokens_seen": 218847735, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10163, + "time_per_iteration": 2.541321039199829 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01029191, + "balance_loss_clip": 1.01797926, + "balance_loss_mlp": 1.03580725, + "epoch": 0.6110927401172404, + "flos": 25003378742400.0, + "grad_norm": 14.54042933705356, + "language_loss": 0.59390211, + "learning_rate": 1.387450491396625e-06, + "loss": 0.61521178, + "num_input_tokens_seen": 218866585, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 10164, + "time_per_iteration": 2.4755120277404785 + }, + { + "auxiliary_loss_clip": 0.01103552, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.01823997, + "balance_loss_mlp": 1.03602457, + "epoch": 0.6111528633699083, + "flos": 26248078782720.0, + "grad_norm": 1.7214680551340567, + "language_loss": 0.75950801, + "learning_rate": 1.3870797576302003e-06, + "loss": 0.7808392, + "num_input_tokens_seen": 218885560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10165, + "time_per_iteration": 2.491528034210205 + }, + { + "auxiliary_loss_clip": 0.01105154, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01582241, + "balance_loss_mlp": 1.03982759, + "epoch": 0.6112129866225763, + "flos": 22382474970240.0, + "grad_norm": 1.4553973070214548, + "language_loss": 0.78996694, + "learning_rate": 1.3867090471057719e-06, + "loss": 0.81129807, + "num_input_tokens_seen": 218905055, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 10166, + "time_per_iteration": 2.4699227809906006 + }, + { + "auxiliary_loss_clip": 0.01105985, + "auxiliary_loss_mlp": 0.01029373, + "balance_loss_clip": 1.01734531, + "balance_loss_mlp": 1.03734827, + "epoch": 0.6112731098752443, + "flos": 25227892702080.0, + "grad_norm": 3.097252625024806, + "language_loss": 0.67920876, + "learning_rate": 1.3863383598373987e-06, + "loss": 0.70056236, + "num_input_tokens_seen": 218924030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 10167, + "time_per_iteration": 2.5190818309783936 + }, + { + "auxiliary_loss_clip": 0.01104165, + "auxiliary_loss_mlp": 0.01029568, + "balance_loss_clip": 1.01872551, + "balance_loss_mlp": 1.03759277, + "epoch": 0.6113332331279122, + "flos": 22893160584960.0, + "grad_norm": 1.7954202202348515, + "language_loss": 0.78805661, + "learning_rate": 1.3859676958391364e-06, + "loss": 0.80939388, + "num_input_tokens_seen": 218943750, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 10168, + "time_per_iteration": 2.4622983932495117 + }, + { + "auxiliary_loss_clip": 0.01110572, + "auxiliary_loss_mlp": 0.01034407, + "balance_loss_clip": 1.02078724, + "balance_loss_mlp": 1.03739679, + "epoch": 0.6113933563805802, + "flos": 18620329305600.0, + "grad_norm": 4.090256272371363, + "language_loss": 0.85369581, + "learning_rate": 1.3855970551250398e-06, + "loss": 0.87514555, + "num_input_tokens_seen": 218957585, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 10169, + "time_per_iteration": 2.4625487327575684 + }, + { + "auxiliary_loss_clip": 0.01101901, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.01722717, + "balance_loss_mlp": 1.03553629, + "epoch": 0.6114534796332481, + "flos": 41866275317760.0, + "grad_norm": 2.5520669740881727, + "language_loss": 0.78887564, + "learning_rate": 1.3852264377091652e-06, + "loss": 0.81017315, + "num_input_tokens_seen": 218980025, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10170, + "time_per_iteration": 2.6308984756469727 + }, + { + "auxiliary_loss_clip": 0.01108241, + "auxiliary_loss_mlp": 0.01035541, + "balance_loss_clip": 1.02221966, + "balance_loss_mlp": 1.03567076, + "epoch": 0.6115136028859162, + "flos": 21908454163200.0, + "grad_norm": 1.8675504682209607, + "language_loss": 0.69072127, + "learning_rate": 1.3848558436055651e-06, + "loss": 0.71215916, + "num_input_tokens_seen": 218998200, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 10171, + "time_per_iteration": 2.4951138496398926 + }, + { + "auxiliary_loss_clip": 0.01106531, + "auxiliary_loss_mlp": 0.01035651, + "balance_loss_clip": 1.02225816, + "balance_loss_mlp": 1.03609705, + "epoch": 0.6115737261385841, + "flos": 28804846821120.0, + "grad_norm": 1.5834424948906107, + "language_loss": 0.78990817, + "learning_rate": 1.3844852728282934e-06, + "loss": 0.81132996, + "num_input_tokens_seen": 219017910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 10172, + "time_per_iteration": 2.512971878051758 + }, + { + "auxiliary_loss_clip": 0.01110708, + "auxiliary_loss_mlp": 0.01033185, + "balance_loss_clip": 1.02060866, + "balance_loss_mlp": 1.03796673, + "epoch": 0.6116338493912521, + "flos": 21251468453760.0, + "grad_norm": 1.895061103662262, + "language_loss": 0.66887462, + "learning_rate": 1.3841147253914022e-06, + "loss": 0.69031352, + "num_input_tokens_seen": 219037730, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7265625, + "step": 10173, + "time_per_iteration": 2.450739860534668 + }, + { + "auxiliary_loss_clip": 0.01107875, + "auxiliary_loss_mlp": 0.01033311, + "balance_loss_clip": 1.02124667, + "balance_loss_mlp": 1.03863525, + "epoch": 0.61169397264392, + "flos": 17530189488000.0, + "grad_norm": 1.9715957300151092, + "language_loss": 0.5560292, + "learning_rate": 1.3837442013089416e-06, + "loss": 0.57744104, + "num_input_tokens_seen": 219056755, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 10174, + "time_per_iteration": 2.4200756549835205 + }, + { + "auxiliary_loss_clip": 0.01111305, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.01991677, + "balance_loss_mlp": 1.04081392, + "epoch": 0.611754095896588, + "flos": 23951555758080.0, + "grad_norm": 1.8852329096028353, + "language_loss": 0.66003776, + "learning_rate": 1.3833737005949628e-06, + "loss": 0.68147486, + "num_input_tokens_seen": 219076985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10175, + "time_per_iteration": 2.4889590740203857 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01023113, + "balance_loss_clip": 1.01236653, + "balance_loss_mlp": 1.03501439, + "epoch": 0.6118142191492559, + "flos": 25994872834560.0, + "grad_norm": 2.092985999457116, + "language_loss": 0.82515383, + "learning_rate": 1.3830032232635154e-06, + "loss": 0.84641147, + "num_input_tokens_seen": 219096050, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.67578125, + "step": 10176, + "time_per_iteration": 2.506054639816284 + }, + { + "auxiliary_loss_clip": 0.01107676, + "auxiliary_loss_mlp": 0.01036678, + "balance_loss_clip": 1.02346945, + "balance_loss_mlp": 1.03832841, + "epoch": 0.611874342401924, + "flos": 24603190341120.0, + "grad_norm": 4.162493341668284, + "language_loss": 0.76968575, + "learning_rate": 1.3826327693286474e-06, + "loss": 0.79112923, + "num_input_tokens_seen": 219112665, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 10177, + "time_per_iteration": 3.941509962081909 + }, + { + "auxiliary_loss_clip": 0.01104435, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.02179456, + "balance_loss_mlp": 1.03604686, + "epoch": 0.6119344656545919, + "flos": 15887132640000.0, + "grad_norm": 2.082789690638706, + "language_loss": 0.75353473, + "learning_rate": 1.3822623388044065e-06, + "loss": 0.77491367, + "num_input_tokens_seen": 219129120, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 10178, + "time_per_iteration": 3.827141523361206 + }, + { + "auxiliary_loss_clip": 0.01107456, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01814103, + "balance_loss_mlp": 1.03866005, + "epoch": 0.6119945889072599, + "flos": 21652877917440.0, + "grad_norm": 1.6048823215389816, + "language_loss": 0.6671313, + "learning_rate": 1.3818919317048402e-06, + "loss": 0.68850946, + "num_input_tokens_seen": 219148950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 10179, + "time_per_iteration": 2.467815637588501 + }, + { + "auxiliary_loss_clip": 0.01107829, + "auxiliary_loss_mlp": 0.01032233, + "balance_loss_clip": 1.02061653, + "balance_loss_mlp": 1.03923988, + "epoch": 0.6120547121599279, + "flos": 13772533023360.0, + "grad_norm": 1.8410866190884951, + "language_loss": 0.84216881, + "learning_rate": 1.3815215480439933e-06, + "loss": 0.86356938, + "num_input_tokens_seen": 219165585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 10180, + "time_per_iteration": 5.375430583953857 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01756501, + "balance_loss_mlp": 1.03854799, + "epoch": 0.6121148354125958, + "flos": 20079164275200.0, + "grad_norm": 1.5429296840981428, + "language_loss": 0.77451497, + "learning_rate": 1.3811511878359113e-06, + "loss": 0.79587466, + "num_input_tokens_seen": 219183280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 10181, + "time_per_iteration": 2.493150234222412 + }, + { + "auxiliary_loss_clip": 0.01106153, + "auxiliary_loss_mlp": 0.01027555, + "balance_loss_clip": 1.01597941, + "balance_loss_mlp": 1.03749657, + "epoch": 0.6121749586652638, + "flos": 13471313569920.0, + "grad_norm": 1.8534348182131113, + "language_loss": 0.80704159, + "learning_rate": 1.3807808510946384e-06, + "loss": 0.82837868, + "num_input_tokens_seen": 219197200, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 10182, + "time_per_iteration": 2.5022473335266113 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01557016, + "balance_loss_mlp": 1.03581071, + "epoch": 0.6122350819179317, + "flos": 20120533764480.0, + "grad_norm": 1.6380700202040888, + "language_loss": 0.83158624, + "learning_rate": 1.3804105378342177e-06, + "loss": 0.85283822, + "num_input_tokens_seen": 219216825, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.640625, + "step": 10183, + "time_per_iteration": 2.489943265914917 + }, + { + "auxiliary_loss_clip": 0.01029447, + "auxiliary_loss_mlp": 0.01003231, + "balance_loss_clip": 1.00200963, + "balance_loss_mlp": 1.00785327, + "epoch": 0.6122952051705998, + "flos": 65429242767360.0, + "grad_norm": 0.7013408754852208, + "language_loss": 0.62862837, + "learning_rate": 1.3800402480686914e-06, + "loss": 0.64895517, + "num_input_tokens_seen": 219283795, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.21582031, + "step": 10184, + "time_per_iteration": 3.1942267417907715 + }, + { + "auxiliary_loss_clip": 0.01105776, + "auxiliary_loss_mlp": 0.01026861, + "balance_loss_clip": 1.01558959, + "balance_loss_mlp": 1.03836298, + "epoch": 0.6123553284232677, + "flos": 20376253664640.0, + "grad_norm": 2.39384281866501, + "language_loss": 0.82134175, + "learning_rate": 1.379669981812101e-06, + "loss": 0.84266812, + "num_input_tokens_seen": 219302385, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 10185, + "time_per_iteration": 2.441663980484009 + }, + { + "auxiliary_loss_clip": 0.01111146, + "auxiliary_loss_mlp": 0.01029457, + "balance_loss_clip": 1.0174942, + "balance_loss_mlp": 1.03989947, + "epoch": 0.6124154516759357, + "flos": 23987645948160.0, + "grad_norm": 5.230764283030459, + "language_loss": 0.74637246, + "learning_rate": 1.3792997390784868e-06, + "loss": 0.76777852, + "num_input_tokens_seen": 219319765, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.7109375, + "step": 10186, + "time_per_iteration": 2.494351387023926 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01786828, + "balance_loss_mlp": 1.0364244, + "epoch": 0.6124755749286036, + "flos": 21468799693440.0, + "grad_norm": 1.5640192087821545, + "language_loss": 0.78181458, + "learning_rate": 1.3789295198818895e-06, + "loss": 0.80312312, + "num_input_tokens_seen": 219337440, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 10187, + "time_per_iteration": 2.4529902935028076 + }, + { + "auxiliary_loss_clip": 0.01101994, + "auxiliary_loss_mlp": 0.01028569, + "balance_loss_clip": 1.01668978, + "balance_loss_mlp": 1.03424489, + "epoch": 0.6125356981812716, + "flos": 23879195809920.0, + "grad_norm": 1.5585408172838955, + "language_loss": 0.82932627, + "learning_rate": 1.3785593242363462e-06, + "loss": 0.85063195, + "num_input_tokens_seen": 219357525, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10188, + "time_per_iteration": 2.4779062271118164 + }, + { + "auxiliary_loss_clip": 0.0110417, + "auxiliary_loss_mlp": 0.01028272, + "balance_loss_clip": 1.0162673, + "balance_loss_mlp": 1.03603601, + "epoch": 0.6125958214339395, + "flos": 14425604150400.0, + "grad_norm": 2.027411293701354, + "language_loss": 0.75284189, + "learning_rate": 1.378189152155896e-06, + "loss": 0.77416623, + "num_input_tokens_seen": 219374855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10189, + "time_per_iteration": 2.4187629222869873 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01031301, + "balance_loss_clip": 1.0194819, + "balance_loss_mlp": 1.03746915, + "epoch": 0.6126559446866076, + "flos": 23259090389760.0, + "grad_norm": 1.8604795542963726, + "language_loss": 0.74147457, + "learning_rate": 1.3778190036545758e-06, + "loss": 0.76284146, + "num_input_tokens_seen": 219394740, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10190, + "time_per_iteration": 2.4838945865631104 + }, + { + "auxiliary_loss_clip": 0.01105194, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.01819181, + "balance_loss_mlp": 1.03696406, + "epoch": 0.6127160679392755, + "flos": 26864808324480.0, + "grad_norm": 1.6214214182316076, + "language_loss": 0.68505728, + "learning_rate": 1.3774488787464207e-06, + "loss": 0.70641267, + "num_input_tokens_seen": 219413755, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10191, + "time_per_iteration": 2.4871902465820312 + }, + { + "auxiliary_loss_clip": 0.01105112, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01925695, + "balance_loss_mlp": 1.03581357, + "epoch": 0.6127761911919435, + "flos": 26396425952640.0, + "grad_norm": 2.21006786046543, + "language_loss": 0.73561746, + "learning_rate": 1.377078777445467e-06, + "loss": 0.75698042, + "num_input_tokens_seen": 219433560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10192, + "time_per_iteration": 2.491898536682129 + }, + { + "auxiliary_loss_clip": 0.0110379, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01606202, + "balance_loss_mlp": 1.03735423, + "epoch": 0.6128363144446115, + "flos": 22634747164800.0, + "grad_norm": 1.8299896919962644, + "language_loss": 0.83299625, + "learning_rate": 1.3767086997657478e-06, + "loss": 0.85430956, + "num_input_tokens_seen": 219452640, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 10193, + "time_per_iteration": 2.459218740463257 + }, + { + "auxiliary_loss_clip": 0.0110509, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01823401, + "balance_loss_mlp": 1.03667831, + "epoch": 0.6128964376972794, + "flos": 26759051706240.0, + "grad_norm": 2.3362331987729554, + "language_loss": 0.69596869, + "learning_rate": 1.3763386457212979e-06, + "loss": 0.71732187, + "num_input_tokens_seen": 219468585, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 10194, + "time_per_iteration": 2.49104642868042 + }, + { + "auxiliary_loss_clip": 0.01026973, + "auxiliary_loss_mlp": 0.01003435, + "balance_loss_clip": 1.00231493, + "balance_loss_mlp": 1.0056808, + "epoch": 0.6129565609499474, + "flos": 65567929178880.0, + "grad_norm": 0.9308202048927251, + "language_loss": 0.58683991, + "learning_rate": 1.375968615326149e-06, + "loss": 0.607144, + "num_input_tokens_seen": 219523015, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.21289062, + "step": 10195, + "time_per_iteration": 2.8671669960021973 + }, + { + "auxiliary_loss_clip": 0.01105637, + "auxiliary_loss_mlp": 0.01035062, + "balance_loss_clip": 1.02292037, + "balance_loss_mlp": 1.03803897, + "epoch": 0.6130166842026153, + "flos": 16362087200640.0, + "grad_norm": 1.927442212334356, + "language_loss": 0.69738579, + "learning_rate": 1.3755986085943324e-06, + "loss": 0.71879274, + "num_input_tokens_seen": 219539980, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10196, + "time_per_iteration": 2.4702036380767822 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.0103889, + "balance_loss_clip": 1.02748811, + "balance_loss_mlp": 1.0374887, + "epoch": 0.6130768074552834, + "flos": 23652455207040.0, + "grad_norm": 2.920952429136396, + "language_loss": 0.71311784, + "learning_rate": 1.3752286255398788e-06, + "loss": 0.73454869, + "num_input_tokens_seen": 219556980, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10197, + "time_per_iteration": 2.5032567977905273 + }, + { + "auxiliary_loss_clip": 0.01106358, + "auxiliary_loss_mlp": 0.01038142, + "balance_loss_clip": 1.02622151, + "balance_loss_mlp": 1.03691006, + "epoch": 0.6131369307079513, + "flos": 20047455544320.0, + "grad_norm": 1.885953700600687, + "language_loss": 0.78852749, + "learning_rate": 1.3748586661768191e-06, + "loss": 0.80997241, + "num_input_tokens_seen": 219576410, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 10198, + "time_per_iteration": 2.460963010787964 + }, + { + "auxiliary_loss_clip": 0.01107653, + "auxiliary_loss_mlp": 0.01033569, + "balance_loss_clip": 1.02179098, + "balance_loss_mlp": 1.037503, + "epoch": 0.6131970539606193, + "flos": 22672166158080.0, + "grad_norm": 1.406384953747787, + "language_loss": 0.7426461, + "learning_rate": 1.374488730519181e-06, + "loss": 0.76405835, + "num_input_tokens_seen": 219597180, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10199, + "time_per_iteration": 2.4598445892333984 + }, + { + "auxiliary_loss_clip": 0.01108284, + "auxiliary_loss_mlp": 0.01038465, + "balance_loss_clip": 1.02553713, + "balance_loss_mlp": 1.03748035, + "epoch": 0.6132571772132872, + "flos": 26870913636480.0, + "grad_norm": 1.5460485143525171, + "language_loss": 0.62069702, + "learning_rate": 1.374118818580993e-06, + "loss": 0.64216447, + "num_input_tokens_seen": 219617630, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.70703125, + "step": 10200, + "time_per_iteration": 2.509960651397705 + }, + { + "auxiliary_loss_clip": 0.01104748, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01977301, + "balance_loss_mlp": 1.03736472, + "epoch": 0.6133173004659552, + "flos": 22892657794560.0, + "grad_norm": 1.743695857232765, + "language_loss": 0.68367881, + "learning_rate": 1.3737489303762822e-06, + "loss": 0.70503902, + "num_input_tokens_seen": 219637025, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 10201, + "time_per_iteration": 2.451493740081787 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01027984, + "balance_loss_clip": 1.01627123, + "balance_loss_mlp": 1.03434098, + "epoch": 0.6133774237186231, + "flos": 20485098852480.0, + "grad_norm": 2.0127297199841747, + "language_loss": 0.83613813, + "learning_rate": 1.3733790659190746e-06, + "loss": 0.8574273, + "num_input_tokens_seen": 219656625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10202, + "time_per_iteration": 2.469893217086792 + }, + { + "auxiliary_loss_clip": 0.01027559, + "auxiliary_loss_mlp": 0.01002547, + "balance_loss_clip": 1.00130165, + "balance_loss_mlp": 1.00619066, + "epoch": 0.6134375469712912, + "flos": 69413065217280.0, + "grad_norm": 1.0897383842290518, + "language_loss": 0.67103815, + "learning_rate": 1.3730092252233953e-06, + "loss": 0.69133925, + "num_input_tokens_seen": 219718090, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.21386719, + "step": 10203, + "time_per_iteration": 3.1407535076141357 + }, + { + "auxiliary_loss_clip": 0.01104451, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.01798368, + "balance_loss_mlp": 1.03650403, + "epoch": 0.6134976702239591, + "flos": 41281541815680.0, + "grad_norm": 2.800089510822399, + "language_loss": 0.61266363, + "learning_rate": 1.37263940830327e-06, + "loss": 0.63400525, + "num_input_tokens_seen": 219740100, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10204, + "time_per_iteration": 2.683048963546753 + }, + { + "auxiliary_loss_clip": 0.01100362, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.0147351, + "balance_loss_mlp": 1.03410578, + "epoch": 0.6135577934766271, + "flos": 22346600261760.0, + "grad_norm": 1.8212112064426345, + "language_loss": 0.72582424, + "learning_rate": 1.3722696151727204e-06, + "loss": 0.74708724, + "num_input_tokens_seen": 219761225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10205, + "time_per_iteration": 2.5465259552001953 + }, + { + "auxiliary_loss_clip": 0.01100873, + "auxiliary_loss_mlp": 0.01023206, + "balance_loss_clip": 1.01134467, + "balance_loss_mlp": 1.03527784, + "epoch": 0.6136179167292951, + "flos": 23728155120000.0, + "grad_norm": 1.5866781109951742, + "language_loss": 0.75862819, + "learning_rate": 1.3718998458457701e-06, + "loss": 0.77986902, + "num_input_tokens_seen": 219780085, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 10206, + "time_per_iteration": 2.484109401702881 + }, + { + "auxiliary_loss_clip": 0.01105453, + "auxiliary_loss_mlp": 0.01029497, + "balance_loss_clip": 1.01753414, + "balance_loss_mlp": 1.03659487, + "epoch": 0.613678039981963, + "flos": 26024678144640.0, + "grad_norm": 1.9470428402611015, + "language_loss": 0.75471091, + "learning_rate": 1.3715301003364407e-06, + "loss": 0.77606046, + "num_input_tokens_seen": 219797895, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10207, + "time_per_iteration": 2.4940414428710938 + }, + { + "auxiliary_loss_clip": 0.01105401, + "auxiliary_loss_mlp": 0.01035368, + "balance_loss_clip": 1.02424574, + "balance_loss_mlp": 1.03734899, + "epoch": 0.613738163234631, + "flos": 9859957200000.0, + "grad_norm": 2.0213582004112336, + "language_loss": 0.82293832, + "learning_rate": 1.3711603786587525e-06, + "loss": 0.84434605, + "num_input_tokens_seen": 219811295, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10208, + "time_per_iteration": 2.4401795864105225 + }, + { + "auxiliary_loss_clip": 0.01109978, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01718831, + "balance_loss_mlp": 1.03974009, + "epoch": 0.613798286487299, + "flos": 33182070001920.0, + "grad_norm": 2.3284175830302365, + "language_loss": 0.72680509, + "learning_rate": 1.3707906808267265e-06, + "loss": 0.74820334, + "num_input_tokens_seen": 219832735, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10209, + "time_per_iteration": 2.5886876583099365 + }, + { + "auxiliary_loss_clip": 0.01104268, + "auxiliary_loss_mlp": 0.01035755, + "balance_loss_clip": 1.02388787, + "balance_loss_mlp": 1.0384059, + "epoch": 0.613858409739967, + "flos": 25627901535360.0, + "grad_norm": 1.6658761229718997, + "language_loss": 0.74108303, + "learning_rate": 1.37042100685438e-06, + "loss": 0.76248324, + "num_input_tokens_seen": 219852755, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 10210, + "time_per_iteration": 2.521304130554199 + }, + { + "auxiliary_loss_clip": 0.01027276, + "auxiliary_loss_mlp": 0.01003672, + "balance_loss_clip": 1.00248551, + "balance_loss_mlp": 1.00609028, + "epoch": 0.6139185329926349, + "flos": 67192313932800.0, + "grad_norm": 0.8595111756805056, + "language_loss": 0.65022087, + "learning_rate": 1.3700513567557325e-06, + "loss": 0.67053032, + "num_input_tokens_seen": 219922785, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21191406, + "step": 10211, + "time_per_iteration": 3.2215003967285156 + }, + { + "auxiliary_loss_clip": 0.01104002, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02478802, + "balance_loss_mlp": 1.03655624, + "epoch": 0.6139786562453029, + "flos": 21543637680000.0, + "grad_norm": 1.6436955201310604, + "language_loss": 0.75708187, + "learning_rate": 1.369681730544801e-06, + "loss": 0.77849603, + "num_input_tokens_seen": 219942215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 10212, + "time_per_iteration": 2.4642996788024902 + }, + { + "auxiliary_loss_clip": 0.01106038, + "auxiliary_loss_mlp": 0.01032718, + "balance_loss_clip": 1.0206902, + "balance_loss_mlp": 1.03837156, + "epoch": 0.6140387794979708, + "flos": 26068489758720.0, + "grad_norm": 1.5692336608665938, + "language_loss": 0.74044585, + "learning_rate": 1.3693121282356009e-06, + "loss": 0.76183337, + "num_input_tokens_seen": 219963830, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10213, + "time_per_iteration": 2.5178582668304443 + }, + { + "auxiliary_loss_clip": 0.01109398, + "auxiliary_loss_mlp": 0.01032231, + "balance_loss_clip": 1.01943398, + "balance_loss_mlp": 1.03825283, + "epoch": 0.6140989027506388, + "flos": 23694614795520.0, + "grad_norm": 1.5485308182437552, + "language_loss": 0.73049855, + "learning_rate": 1.3689425498421483e-06, + "loss": 0.75191492, + "num_input_tokens_seen": 219983815, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10214, + "time_per_iteration": 2.4716460704803467 + }, + { + "auxiliary_loss_clip": 0.01106578, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.01949024, + "balance_loss_mlp": 1.03701019, + "epoch": 0.6141590260033067, + "flos": 22231721589120.0, + "grad_norm": 1.7742338479763222, + "language_loss": 0.74487185, + "learning_rate": 1.3685729953784572e-06, + "loss": 0.76626021, + "num_input_tokens_seen": 220003165, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10215, + "time_per_iteration": 2.507734537124634 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01032178, + "balance_loss_clip": 1.02031636, + "balance_loss_mlp": 1.03673744, + "epoch": 0.6142191492559748, + "flos": 23871653953920.0, + "grad_norm": 1.8655230442391189, + "language_loss": 0.78393024, + "learning_rate": 1.368203464858542e-06, + "loss": 0.80528927, + "num_input_tokens_seen": 220021015, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 10216, + "time_per_iteration": 2.479534864425659 + }, + { + "auxiliary_loss_clip": 0.01104623, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01762557, + "balance_loss_mlp": 1.0373491, + "epoch": 0.6142792725086427, + "flos": 15042513260160.0, + "grad_norm": 2.5637363675830254, + "language_loss": 0.80079889, + "learning_rate": 1.3678339582964147e-06, + "loss": 0.82214725, + "num_input_tokens_seen": 220035780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 10217, + "time_per_iteration": 2.4395620822906494 + }, + { + "auxiliary_loss_clip": 0.01105204, + "auxiliary_loss_mlp": 0.01026292, + "balance_loss_clip": 1.01397753, + "balance_loss_mlp": 1.0361073, + "epoch": 0.6143393957613107, + "flos": 23330947547520.0, + "grad_norm": 2.424574581231863, + "language_loss": 0.78246987, + "learning_rate": 1.3674644757060865e-06, + "loss": 0.80378485, + "num_input_tokens_seen": 220054280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 10218, + "time_per_iteration": 3.851706027984619 + }, + { + "auxiliary_loss_clip": 0.01105535, + "auxiliary_loss_mlp": 0.0103101, + "balance_loss_clip": 1.01911902, + "balance_loss_mlp": 1.038481, + "epoch": 0.6143995190139786, + "flos": 20117086058880.0, + "grad_norm": 1.517262895370751, + "language_loss": 0.81908238, + "learning_rate": 1.367095017101569e-06, + "loss": 0.84044778, + "num_input_tokens_seen": 220074120, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10219, + "time_per_iteration": 2.5016467571258545 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01028106, + "balance_loss_clip": 1.01553547, + "balance_loss_mlp": 1.03468263, + "epoch": 0.6144596422666466, + "flos": 42303559489920.0, + "grad_norm": 1.8306132213683777, + "language_loss": 0.66681564, + "learning_rate": 1.3667255824968717e-06, + "loss": 0.6881398, + "num_input_tokens_seen": 220096320, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10220, + "time_per_iteration": 4.022945404052734 + }, + { + "auxiliary_loss_clip": 0.01101764, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.01547968, + "balance_loss_mlp": 1.03572094, + "epoch": 0.6145197655193146, + "flos": 21573622558080.0, + "grad_norm": 1.9547432893761034, + "language_loss": 0.71545637, + "learning_rate": 1.3663561719060041e-06, + "loss": 0.73674214, + "num_input_tokens_seen": 220114850, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 10221, + "time_per_iteration": 3.984619617462158 + }, + { + "auxiliary_loss_clip": 0.01102691, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01677299, + "balance_loss_mlp": 1.03571272, + "epoch": 0.6145798887719826, + "flos": 21471098163840.0, + "grad_norm": 1.6401613716258656, + "language_loss": 0.79416037, + "learning_rate": 1.3659867853429735e-06, + "loss": 0.81546843, + "num_input_tokens_seen": 220133395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10222, + "time_per_iteration": 2.482626438140869 + }, + { + "auxiliary_loss_clip": 0.01107554, + "auxiliary_loss_mlp": 0.01030696, + "balance_loss_clip": 1.01896548, + "balance_loss_mlp": 1.03836894, + "epoch": 0.6146400120246506, + "flos": 20777016683520.0, + "grad_norm": 4.215986026899438, + "language_loss": 0.76034737, + "learning_rate": 1.365617422821788e-06, + "loss": 0.78172994, + "num_input_tokens_seen": 220152790, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 10223, + "time_per_iteration": 3.9831442832946777 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01629829, + "balance_loss_mlp": 1.03779078, + "epoch": 0.6147001352773185, + "flos": 13881306384000.0, + "grad_norm": 2.127618755426409, + "language_loss": 0.78459811, + "learning_rate": 1.3652480843564535e-06, + "loss": 0.80590385, + "num_input_tokens_seen": 220169535, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10224, + "time_per_iteration": 2.43497896194458 + }, + { + "auxiliary_loss_clip": 0.01100015, + "auxiliary_loss_mlp": 0.01025021, + "balance_loss_clip": 1.01433933, + "balance_loss_mlp": 1.03477085, + "epoch": 0.6147602585299865, + "flos": 56641791807360.0, + "grad_norm": 1.3024527007974456, + "language_loss": 0.66392958, + "learning_rate": 1.3648787699609746e-06, + "loss": 0.68517995, + "num_input_tokens_seen": 220195305, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 10225, + "time_per_iteration": 2.845883369445801 + }, + { + "auxiliary_loss_clip": 0.01107388, + "auxiliary_loss_mlp": 0.01028986, + "balance_loss_clip": 1.01713014, + "balance_loss_mlp": 1.03713298, + "epoch": 0.6148203817826544, + "flos": 32817217605120.0, + "grad_norm": 2.1585029045138415, + "language_loss": 0.63199341, + "learning_rate": 1.364509479649357e-06, + "loss": 0.65335715, + "num_input_tokens_seen": 220215040, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 10226, + "time_per_iteration": 2.555772304534912 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.01831996, + "balance_loss_mlp": 1.03651762, + "epoch": 0.6148805050353224, + "flos": 18332038748160.0, + "grad_norm": 1.704417913895937, + "language_loss": 0.75513506, + "learning_rate": 1.3641402134356037e-06, + "loss": 0.77648973, + "num_input_tokens_seen": 220234205, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 10227, + "time_per_iteration": 2.536123514175415 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01036804, + "balance_loss_clip": 1.022928, + "balance_loss_mlp": 1.03667367, + "epoch": 0.6149406282879903, + "flos": 14063983977600.0, + "grad_norm": 1.8551652476106548, + "language_loss": 0.61097801, + "learning_rate": 1.3637709713337164e-06, + "loss": 0.63240612, + "num_input_tokens_seen": 220252730, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.69140625, + "step": 10228, + "time_per_iteration": 2.419962167739868 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01029831, + "balance_loss_clip": 1.01778531, + "balance_loss_mlp": 1.03672791, + "epoch": 0.6150007515406584, + "flos": 25190186400000.0, + "grad_norm": 1.3329712414655954, + "language_loss": 0.74049234, + "learning_rate": 1.3634017533576985e-06, + "loss": 0.76182348, + "num_input_tokens_seen": 220273345, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10229, + "time_per_iteration": 2.506852626800537 + }, + { + "auxiliary_loss_clip": 0.01106333, + "auxiliary_loss_mlp": 0.01032652, + "balance_loss_clip": 1.0202601, + "balance_loss_mlp": 1.03880942, + "epoch": 0.6150608747933263, + "flos": 21945262625280.0, + "grad_norm": 1.7175132302354088, + "language_loss": 0.77996862, + "learning_rate": 1.3630325595215493e-06, + "loss": 0.80135846, + "num_input_tokens_seen": 220293845, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 10230, + "time_per_iteration": 2.477675199508667 + }, + { + "auxiliary_loss_clip": 0.0110355, + "auxiliary_loss_mlp": 0.0102496, + "balance_loss_clip": 1.01371837, + "balance_loss_mlp": 1.03570461, + "epoch": 0.6151209980459943, + "flos": 30117453523200.0, + "grad_norm": 1.6621971423553226, + "language_loss": 0.72935748, + "learning_rate": 1.36266338983927e-06, + "loss": 0.75064254, + "num_input_tokens_seen": 220316070, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 10231, + "time_per_iteration": 2.561504602432251 + }, + { + "auxiliary_loss_clip": 0.01105925, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03801215, + "epoch": 0.6151811212986622, + "flos": 30008356940160.0, + "grad_norm": 1.5773460676573843, + "language_loss": 0.6960876, + "learning_rate": 1.362294244324858e-06, + "loss": 0.71744496, + "num_input_tokens_seen": 220335695, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 10232, + "time_per_iteration": 2.5435595512390137 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.0169704, + "balance_loss_mlp": 1.03564286, + "epoch": 0.6152412445513302, + "flos": 18872888808960.0, + "grad_norm": 1.9335513310183938, + "language_loss": 0.91684914, + "learning_rate": 1.3619251229923126e-06, + "loss": 0.9381339, + "num_input_tokens_seen": 220353720, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 10233, + "time_per_iteration": 2.464128017425537 + }, + { + "auxiliary_loss_clip": 0.01104077, + "auxiliary_loss_mlp": 0.01033089, + "balance_loss_clip": 1.02258682, + "balance_loss_mlp": 1.03727007, + "epoch": 0.6153013678039982, + "flos": 25703601448320.0, + "grad_norm": 1.693429608694219, + "language_loss": 0.71381217, + "learning_rate": 1.3615560258556306e-06, + "loss": 0.73518384, + "num_input_tokens_seen": 220372515, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.66796875, + "step": 10234, + "time_per_iteration": 2.484847068786621 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.01031373, + "balance_loss_clip": 1.01926732, + "balance_loss_mlp": 1.03558159, + "epoch": 0.6153614910566662, + "flos": 28510271383680.0, + "grad_norm": 1.9863568991468559, + "language_loss": 0.66966361, + "learning_rate": 1.3611869529288077e-06, + "loss": 0.69102716, + "num_input_tokens_seen": 220393490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10235, + "time_per_iteration": 2.499189853668213 + }, + { + "auxiliary_loss_clip": 0.01106455, + "auxiliary_loss_mlp": 0.01029276, + "balance_loss_clip": 1.01746273, + "balance_loss_mlp": 1.0364213, + "epoch": 0.6154216143093342, + "flos": 23549787158400.0, + "grad_norm": 2.269392311324668, + "language_loss": 0.81321824, + "learning_rate": 1.3608179042258398e-06, + "loss": 0.83457547, + "num_input_tokens_seen": 220412855, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10236, + "time_per_iteration": 2.467374086380005 + }, + { + "auxiliary_loss_clip": 0.01106752, + "auxiliary_loss_mlp": 0.01029525, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.03654408, + "epoch": 0.6154817375620021, + "flos": 22748081552640.0, + "grad_norm": 1.4348801753525875, + "language_loss": 0.80595863, + "learning_rate": 1.360448879760721e-06, + "loss": 0.82732141, + "num_input_tokens_seen": 220433440, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.703125, + "step": 10237, + "time_per_iteration": 2.4921953678131104 + }, + { + "auxiliary_loss_clip": 0.01103597, + "auxiliary_loss_mlp": 0.01036802, + "balance_loss_clip": 1.02533984, + "balance_loss_mlp": 1.03659725, + "epoch": 0.6155418608146701, + "flos": 27162975121920.0, + "grad_norm": 1.8067050747817437, + "language_loss": 0.7606861, + "learning_rate": 1.3600798795474449e-06, + "loss": 0.78209013, + "num_input_tokens_seen": 220453445, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10238, + "time_per_iteration": 2.5077149868011475 + }, + { + "auxiliary_loss_clip": 0.01027367, + "auxiliary_loss_mlp": 0.01004239, + "balance_loss_clip": 1.00305295, + "balance_loss_mlp": 1.00621736, + "epoch": 0.615601984067338, + "flos": 68811165014400.0, + "grad_norm": 0.7636645723592903, + "language_loss": 0.57658124, + "learning_rate": 1.3597109036000036e-06, + "loss": 0.5968973, + "num_input_tokens_seen": 220509730, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.21191406, + "step": 10239, + "time_per_iteration": 3.0781197547912598 + }, + { + "auxiliary_loss_clip": 0.01106458, + "auxiliary_loss_mlp": 0.01034804, + "balance_loss_clip": 1.02284706, + "balance_loss_mlp": 1.03747571, + "epoch": 0.615662107320006, + "flos": 15517144598400.0, + "grad_norm": 2.10217205787335, + "language_loss": 0.77644312, + "learning_rate": 1.3593419519323892e-06, + "loss": 0.79785573, + "num_input_tokens_seen": 220527295, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10240, + "time_per_iteration": 2.4440581798553467 + }, + { + "auxiliary_loss_clip": 0.01107517, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02069855, + "balance_loss_mlp": 1.03847337, + "epoch": 0.615722230572674, + "flos": 21063691128960.0, + "grad_norm": 2.3418662553679495, + "language_loss": 0.72875106, + "learning_rate": 1.3589730245585922e-06, + "loss": 0.75015438, + "num_input_tokens_seen": 220542730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10241, + "time_per_iteration": 2.440458059310913 + }, + { + "auxiliary_loss_clip": 0.01102041, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.01599121, + "balance_loss_mlp": 1.03596628, + "epoch": 0.615782353825342, + "flos": 23256791919360.0, + "grad_norm": 1.629664240741642, + "language_loss": 0.71536696, + "learning_rate": 1.3586041214926018e-06, + "loss": 0.73665738, + "num_input_tokens_seen": 220562995, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 10242, + "time_per_iteration": 2.465280771255493 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01028948, + "balance_loss_clip": 1.01760554, + "balance_loss_mlp": 1.03812838, + "epoch": 0.6158424770780099, + "flos": 21103911383040.0, + "grad_norm": 1.7806476568458218, + "language_loss": 0.72179866, + "learning_rate": 1.3582352427484086e-06, + "loss": 0.74313986, + "num_input_tokens_seen": 220581775, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10243, + "time_per_iteration": 2.4706227779388428 + }, + { + "auxiliary_loss_clip": 0.01026424, + "auxiliary_loss_mlp": 0.01003264, + "balance_loss_clip": 1.00196481, + "balance_loss_mlp": 1.00526905, + "epoch": 0.6159026003306779, + "flos": 70333276769280.0, + "grad_norm": 0.7683330535495017, + "language_loss": 0.5684256, + "learning_rate": 1.3578663883399984e-06, + "loss": 0.58872247, + "num_input_tokens_seen": 220646395, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.2109375, + "step": 10244, + "time_per_iteration": 3.108367919921875 + }, + { + "auxiliary_loss_clip": 0.01104886, + "auxiliary_loss_mlp": 0.01027895, + "balance_loss_clip": 1.01563978, + "balance_loss_mlp": 1.03710341, + "epoch": 0.6159627235833458, + "flos": 33874355802240.0, + "grad_norm": 1.7714653532708287, + "language_loss": 0.63837689, + "learning_rate": 1.3574975582813593e-06, + "loss": 0.65970469, + "num_input_tokens_seen": 220668335, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 10245, + "time_per_iteration": 2.5604476928710938 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01028094, + "balance_loss_clip": 1.01664448, + "balance_loss_mlp": 1.03589809, + "epoch": 0.6160228468360138, + "flos": 26575440359040.0, + "grad_norm": 1.7050556240908794, + "language_loss": 0.78958333, + "learning_rate": 1.3571287525864771e-06, + "loss": 0.81088758, + "num_input_tokens_seen": 220688915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10246, + "time_per_iteration": 2.6499507427215576 + }, + { + "auxiliary_loss_clip": 0.01109766, + "auxiliary_loss_mlp": 0.01044472, + "balance_loss_clip": 1.03114414, + "balance_loss_mlp": 1.03871059, + "epoch": 0.6160829700886818, + "flos": 17193274894080.0, + "grad_norm": 2.2268806206076586, + "language_loss": 0.87346923, + "learning_rate": 1.3567599712693368e-06, + "loss": 0.89501166, + "num_input_tokens_seen": 220703465, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10247, + "time_per_iteration": 2.53155517578125 + }, + { + "auxiliary_loss_clip": 0.01108151, + "auxiliary_loss_mlp": 0.01031566, + "balance_loss_clip": 1.01996708, + "balance_loss_mlp": 1.03957379, + "epoch": 0.6161430933413498, + "flos": 23623547736960.0, + "grad_norm": 2.019293099257412, + "language_loss": 0.80015755, + "learning_rate": 1.3563912143439235e-06, + "loss": 0.82155472, + "num_input_tokens_seen": 220722090, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 10248, + "time_per_iteration": 2.565202236175537 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01029592, + "balance_loss_clip": 1.01873195, + "balance_loss_mlp": 1.03529978, + "epoch": 0.6162032165940178, + "flos": 23002436736000.0, + "grad_norm": 1.9553906460889976, + "language_loss": 0.8661859, + "learning_rate": 1.3560224818242191e-06, + "loss": 0.88749832, + "num_input_tokens_seen": 220741075, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 10249, + "time_per_iteration": 2.5155153274536133 + }, + { + "auxiliary_loss_clip": 0.01104366, + "auxiliary_loss_mlp": 0.01026878, + "balance_loss_clip": 1.01438522, + "balance_loss_mlp": 1.03663516, + "epoch": 0.6162633398466857, + "flos": 39421979740800.0, + "grad_norm": 1.9896565394121724, + "language_loss": 0.6859656, + "learning_rate": 1.3556537737242072e-06, + "loss": 0.70727801, + "num_input_tokens_seen": 220763395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 10250, + "time_per_iteration": 2.6529786586761475 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01025601, + "balance_loss_clip": 1.01488972, + "balance_loss_mlp": 1.03547108, + "epoch": 0.6163234630993537, + "flos": 19244672530560.0, + "grad_norm": 1.9258007321652242, + "language_loss": 0.74149621, + "learning_rate": 1.3552850900578692e-06, + "loss": 0.76273632, + "num_input_tokens_seen": 220780640, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 10251, + "time_per_iteration": 2.5420565605163574 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.01518464, + "balance_loss_mlp": 1.03652811, + "epoch": 0.6163835863520216, + "flos": 15961791058560.0, + "grad_norm": 2.78922632869985, + "language_loss": 0.68291706, + "learning_rate": 1.3549164308391844e-06, + "loss": 0.70423007, + "num_input_tokens_seen": 220797960, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 10252, + "time_per_iteration": 2.5236093997955322 + }, + { + "auxiliary_loss_clip": 0.01026564, + "auxiliary_loss_mlp": 0.01001879, + "balance_loss_clip": 1.0006336, + "balance_loss_mlp": 1.00562644, + "epoch": 0.6164437096046896, + "flos": 68103834393600.0, + "grad_norm": 0.8837133823521999, + "language_loss": 0.57868779, + "learning_rate": 1.3545477960821333e-06, + "loss": 0.5989722, + "num_input_tokens_seen": 220856930, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.20898438, + "step": 10253, + "time_per_iteration": 3.103968858718872 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01609635, + "balance_loss_mlp": 1.03543723, + "epoch": 0.6165038328573575, + "flos": 21361211481600.0, + "grad_norm": 1.4349605702857906, + "language_loss": 0.79628026, + "learning_rate": 1.3541791858006946e-06, + "loss": 0.81759632, + "num_input_tokens_seen": 220877595, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10254, + "time_per_iteration": 2.4770078659057617 + }, + { + "auxiliary_loss_clip": 0.01107997, + "auxiliary_loss_mlp": 0.01028587, + "balance_loss_clip": 1.0167551, + "balance_loss_mlp": 1.03706634, + "epoch": 0.6165639561100256, + "flos": 21101972048640.0, + "grad_norm": 1.765232531729237, + "language_loss": 0.80340689, + "learning_rate": 1.353810600008846e-06, + "loss": 0.82477272, + "num_input_tokens_seen": 220896880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.7109375, + "step": 10255, + "time_per_iteration": 2.4666266441345215 + }, + { + "auxiliary_loss_clip": 0.01108694, + "auxiliary_loss_mlp": 0.01032846, + "balance_loss_clip": 1.0197928, + "balance_loss_mlp": 1.03867257, + "epoch": 0.6166240793626935, + "flos": 25338533569920.0, + "grad_norm": 1.7468186030679946, + "language_loss": 0.65269709, + "learning_rate": 1.3534420387205646e-06, + "loss": 0.6741125, + "num_input_tokens_seen": 220916425, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 10256, + "time_per_iteration": 2.514446973800659 + }, + { + "auxiliary_loss_clip": 0.01103556, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.0371418, + "epoch": 0.6166842026153615, + "flos": 19682639061120.0, + "grad_norm": 1.5636561949397187, + "language_loss": 0.71758097, + "learning_rate": 1.353073501949825e-06, + "loss": 0.73889816, + "num_input_tokens_seen": 220935050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10257, + "time_per_iteration": 2.4575183391571045 + }, + { + "auxiliary_loss_clip": 0.01108721, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01625657, + "balance_loss_mlp": 1.03909421, + "epoch": 0.6167443258680294, + "flos": 19318361281920.0, + "grad_norm": 2.0856421908029192, + "language_loss": 0.72058862, + "learning_rate": 1.3527049897106034e-06, + "loss": 0.74196231, + "num_input_tokens_seen": 220953085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10258, + "time_per_iteration": 2.4590466022491455 + }, + { + "auxiliary_loss_clip": 0.01105581, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01888824, + "balance_loss_mlp": 1.03705239, + "epoch": 0.6168044491206974, + "flos": 25265239868160.0, + "grad_norm": 2.864696001888572, + "language_loss": 0.63946176, + "learning_rate": 1.3523365020168735e-06, + "loss": 0.66082585, + "num_input_tokens_seen": 220969050, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10259, + "time_per_iteration": 2.472621202468872 + }, + { + "auxiliary_loss_clip": 0.01103568, + "auxiliary_loss_mlp": 0.0103251, + "balance_loss_clip": 1.02020216, + "balance_loss_mlp": 1.03760934, + "epoch": 0.6168645723733654, + "flos": 13219903301760.0, + "grad_norm": 1.8983508996193146, + "language_loss": 0.71194589, + "learning_rate": 1.3519680388826084e-06, + "loss": 0.73330671, + "num_input_tokens_seen": 220985825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 10260, + "time_per_iteration": 3.8351244926452637 + }, + { + "auxiliary_loss_clip": 0.01112265, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.0169692, + "balance_loss_mlp": 1.04087448, + "epoch": 0.6169246956260334, + "flos": 26652038112000.0, + "grad_norm": 1.8640894588611543, + "language_loss": 0.68213212, + "learning_rate": 1.3515996003217803e-06, + "loss": 0.70356077, + "num_input_tokens_seen": 221004465, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 10261, + "time_per_iteration": 2.4846863746643066 + }, + { + "auxiliary_loss_clip": 0.01103737, + "auxiliary_loss_mlp": 0.01037916, + "balance_loss_clip": 1.02671063, + "balance_loss_mlp": 1.03602839, + "epoch": 0.6169848188787014, + "flos": 23148413608320.0, + "grad_norm": 1.7606752411550333, + "language_loss": 0.71393299, + "learning_rate": 1.3512311863483602e-06, + "loss": 0.73534954, + "num_input_tokens_seen": 221023260, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 10262, + "time_per_iteration": 3.8463478088378906 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.01940775, + "balance_loss_mlp": 1.03685451, + "epoch": 0.6170449421313693, + "flos": 23331917214720.0, + "grad_norm": 1.9300485767677382, + "language_loss": 0.70171946, + "learning_rate": 1.3508627969763188e-06, + "loss": 0.72308946, + "num_input_tokens_seen": 221043090, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10263, + "time_per_iteration": 3.8719136714935303 + }, + { + "auxiliary_loss_clip": 0.01106014, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.01676631, + "balance_loss_mlp": 1.03678763, + "epoch": 0.6171050653840373, + "flos": 15851617067520.0, + "grad_norm": 8.265893448617778, + "language_loss": 0.75888687, + "learning_rate": 1.3504944322196244e-06, + "loss": 0.78023094, + "num_input_tokens_seen": 221061435, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10264, + "time_per_iteration": 3.9576797485351562 + }, + { + "auxiliary_loss_clip": 0.01105756, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01682043, + "balance_loss_mlp": 1.03773212, + "epoch": 0.6171651886367052, + "flos": 20045516209920.0, + "grad_norm": 2.621461269637815, + "language_loss": 0.85138124, + "learning_rate": 1.350126092092247e-06, + "loss": 0.87273085, + "num_input_tokens_seen": 221078705, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10265, + "time_per_iteration": 2.4204261302948 + }, + { + "auxiliary_loss_clip": 0.01103728, + "auxiliary_loss_mlp": 0.01032958, + "balance_loss_clip": 1.02098346, + "balance_loss_mlp": 1.03761029, + "epoch": 0.6172253118893732, + "flos": 26432695710720.0, + "grad_norm": 3.6073790517357995, + "language_loss": 0.642869, + "learning_rate": 1.349757776608153e-06, + "loss": 0.66423583, + "num_input_tokens_seen": 221099245, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10266, + "time_per_iteration": 2.5135982036590576 + }, + { + "auxiliary_loss_clip": 0.0110251, + "auxiliary_loss_mlp": 0.01031969, + "balance_loss_clip": 1.02062035, + "balance_loss_mlp": 1.03433692, + "epoch": 0.6172854351420412, + "flos": 22632879657600.0, + "grad_norm": 1.7504973624629372, + "language_loss": 0.75734687, + "learning_rate": 1.3493894857813094e-06, + "loss": 0.77869165, + "num_input_tokens_seen": 221116930, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10267, + "time_per_iteration": 2.4403936862945557 + }, + { + "auxiliary_loss_clip": 0.01107183, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.0146544, + "balance_loss_mlp": 1.0368762, + "epoch": 0.6173455583947092, + "flos": 21212936138880.0, + "grad_norm": 1.5812909664018504, + "language_loss": 0.74722588, + "learning_rate": 1.3490212196256818e-06, + "loss": 0.7685672, + "num_input_tokens_seen": 221137660, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10268, + "time_per_iteration": 2.467622995376587 + }, + { + "auxiliary_loss_clip": 0.01108432, + "auxiliary_loss_mlp": 0.01027035, + "balance_loss_clip": 1.01477456, + "balance_loss_mlp": 1.03709388, + "epoch": 0.6174056816473771, + "flos": 19500284689920.0, + "grad_norm": 1.6692354192517487, + "language_loss": 0.75483018, + "learning_rate": 1.3486529781552342e-06, + "loss": 0.77618486, + "num_input_tokens_seen": 221156225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.71484375, + "step": 10269, + "time_per_iteration": 2.427558660507202 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01025701, + "balance_loss_clip": 1.01379776, + "balance_loss_mlp": 1.03455544, + "epoch": 0.6174658049000451, + "flos": 15997342544640.0, + "grad_norm": 2.2351967956552987, + "language_loss": 0.76565802, + "learning_rate": 1.3482847613839318e-06, + "loss": 0.78693628, + "num_input_tokens_seen": 221173820, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 10270, + "time_per_iteration": 2.441521644592285 + }, + { + "auxiliary_loss_clip": 0.01106104, + "auxiliary_loss_mlp": 0.01026083, + "balance_loss_clip": 1.01432252, + "balance_loss_mlp": 1.03741896, + "epoch": 0.617525928152713, + "flos": 21903893136000.0, + "grad_norm": 1.7948450339640445, + "language_loss": 0.82511967, + "learning_rate": 1.347916569325736e-06, + "loss": 0.84644157, + "num_input_tokens_seen": 221191815, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10271, + "time_per_iteration": 2.427300453186035 + }, + { + "auxiliary_loss_clip": 0.01105866, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01937735, + "balance_loss_mlp": 1.03691125, + "epoch": 0.617586051405381, + "flos": 21105958458240.0, + "grad_norm": 2.1955459228647687, + "language_loss": 0.76878774, + "learning_rate": 1.3475484019946093e-06, + "loss": 0.79016083, + "num_input_tokens_seen": 221211205, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10272, + "time_per_iteration": 2.4983582496643066 + }, + { + "auxiliary_loss_clip": 0.01028751, + "auxiliary_loss_mlp": 0.0099819, + "balance_loss_clip": 0.99684906, + "balance_loss_mlp": 1.00760865, + "epoch": 0.617646174658049, + "flos": 58610776665600.0, + "grad_norm": 0.8101209602428692, + "language_loss": 0.59128773, + "learning_rate": 1.347180259404513e-06, + "loss": 0.61155713, + "num_input_tokens_seen": 221268430, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.21191406, + "step": 10273, + "time_per_iteration": 2.9302847385406494 + }, + { + "auxiliary_loss_clip": 0.01103173, + "auxiliary_loss_mlp": 0.01026931, + "balance_loss_clip": 1.01496243, + "balance_loss_mlp": 1.03603625, + "epoch": 0.617706297910717, + "flos": 13878684691200.0, + "grad_norm": 2.336605024454028, + "language_loss": 0.72963846, + "learning_rate": 1.3468121415694059e-06, + "loss": 0.75093955, + "num_input_tokens_seen": 221281930, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 10274, + "time_per_iteration": 2.4481325149536133 + }, + { + "auxiliary_loss_clip": 0.01104274, + "auxiliary_loss_mlp": 0.01027771, + "balance_loss_clip": 1.01643395, + "balance_loss_mlp": 1.03684974, + "epoch": 0.617766421163385, + "flos": 19208438686080.0, + "grad_norm": 3.0133252214936372, + "language_loss": 0.77358514, + "learning_rate": 1.3464440485032484e-06, + "loss": 0.79490566, + "num_input_tokens_seen": 221301605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10275, + "time_per_iteration": 2.4196648597717285 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01027767, + "balance_loss_clip": 1.01589358, + "balance_loss_mlp": 1.03650546, + "epoch": 0.6178265444160529, + "flos": 22565978576640.0, + "grad_norm": 1.6767450105474386, + "language_loss": 0.79291052, + "learning_rate": 1.346075980219998e-06, + "loss": 0.81423116, + "num_input_tokens_seen": 221320105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10276, + "time_per_iteration": 2.5229239463806152 + }, + { + "auxiliary_loss_clip": 0.01107984, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02068734, + "balance_loss_mlp": 1.0383606, + "epoch": 0.6178866676687209, + "flos": 11984289402240.0, + "grad_norm": 2.1695107159415525, + "language_loss": 0.8092519, + "learning_rate": 1.345707936733612e-06, + "loss": 0.83066452, + "num_input_tokens_seen": 221335915, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10277, + "time_per_iteration": 2.419820547103882 + }, + { + "auxiliary_loss_clip": 0.01107683, + "auxiliary_loss_mlp": 0.01031431, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03688812, + "epoch": 0.6179467909213888, + "flos": 20991510748800.0, + "grad_norm": 1.6341046500403578, + "language_loss": 0.81401992, + "learning_rate": 1.3453399180580466e-06, + "loss": 0.83541107, + "num_input_tokens_seen": 221353965, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10278, + "time_per_iteration": 2.451904058456421 + }, + { + "auxiliary_loss_clip": 0.01104247, + "auxiliary_loss_mlp": 0.01031724, + "balance_loss_clip": 1.02002394, + "balance_loss_mlp": 1.03586221, + "epoch": 0.6180069141740568, + "flos": 25338102606720.0, + "grad_norm": 1.4680885836846245, + "language_loss": 0.73827434, + "learning_rate": 1.3449719242072567e-06, + "loss": 0.75963408, + "num_input_tokens_seen": 221374080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10279, + "time_per_iteration": 2.4702413082122803 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01637602, + "balance_loss_mlp": 1.03415704, + "epoch": 0.6180670374267248, + "flos": 19645722858240.0, + "grad_norm": 1.5792662413822172, + "language_loss": 0.7052443, + "learning_rate": 1.3446039551951975e-06, + "loss": 0.72653878, + "num_input_tokens_seen": 221392910, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10280, + "time_per_iteration": 2.439377784729004 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01032737, + "balance_loss_clip": 1.02039266, + "balance_loss_mlp": 1.03673506, + "epoch": 0.6181271606793928, + "flos": 19464876858240.0, + "grad_norm": 1.433650263791477, + "language_loss": 0.72634661, + "learning_rate": 1.3442360110358215e-06, + "loss": 0.74772483, + "num_input_tokens_seen": 221410990, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10281, + "time_per_iteration": 2.4201571941375732 + }, + { + "auxiliary_loss_clip": 0.01100944, + "auxiliary_loss_mlp": 0.01030541, + "balance_loss_clip": 1.01990116, + "balance_loss_mlp": 1.0367198, + "epoch": 0.6181872839320607, + "flos": 25594289383680.0, + "grad_norm": 1.5669625672401193, + "language_loss": 0.76539791, + "learning_rate": 1.3438680917430827e-06, + "loss": 0.78671277, + "num_input_tokens_seen": 221431020, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 10282, + "time_per_iteration": 2.4729509353637695 + }, + { + "auxiliary_loss_clip": 0.01105858, + "auxiliary_loss_mlp": 0.01032009, + "balance_loss_clip": 1.01784086, + "balance_loss_mlp": 1.03611851, + "epoch": 0.6182474071847287, + "flos": 25551806572800.0, + "grad_norm": 1.557918367732971, + "language_loss": 0.69140053, + "learning_rate": 1.343500197330931e-06, + "loss": 0.71277922, + "num_input_tokens_seen": 221453235, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.69921875, + "step": 10283, + "time_per_iteration": 2.4644439220428467 + }, + { + "auxiliary_loss_clip": 0.01110819, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.01680923, + "balance_loss_mlp": 1.03751874, + "epoch": 0.6183075304373966, + "flos": 22123738327680.0, + "grad_norm": 1.5819420485757947, + "language_loss": 0.74983263, + "learning_rate": 1.3431323278133176e-06, + "loss": 0.77124047, + "num_input_tokens_seen": 221472560, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 10284, + "time_per_iteration": 2.4563488960266113 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01034089, + "balance_loss_clip": 1.02219248, + "balance_loss_mlp": 1.03690124, + "epoch": 0.6183676536900646, + "flos": 22455589104000.0, + "grad_norm": 1.4660610214457293, + "language_loss": 0.75491369, + "learning_rate": 1.3427644832041922e-06, + "loss": 0.77626395, + "num_input_tokens_seen": 221492835, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 10285, + "time_per_iteration": 2.4554288387298584 + }, + { + "auxiliary_loss_clip": 0.01103991, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.0199194, + "balance_loss_mlp": 1.03520298, + "epoch": 0.6184277769427327, + "flos": 23364128736000.0, + "grad_norm": 1.5161367182822474, + "language_loss": 0.7299751, + "learning_rate": 1.342396663517503e-06, + "loss": 0.751333, + "num_input_tokens_seen": 221511870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 10286, + "time_per_iteration": 2.487755060195923 + }, + { + "auxiliary_loss_clip": 0.01102671, + "auxiliary_loss_mlp": 0.01025604, + "balance_loss_clip": 1.01424325, + "balance_loss_mlp": 1.03537941, + "epoch": 0.6184879001954006, + "flos": 22711057608960.0, + "grad_norm": 2.03959974890174, + "language_loss": 0.75874734, + "learning_rate": 1.342028868767199e-06, + "loss": 0.78003013, + "num_input_tokens_seen": 221529915, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10287, + "time_per_iteration": 2.4449198246002197 + }, + { + "auxiliary_loss_clip": 0.0110312, + "auxiliary_loss_mlp": 0.01031317, + "balance_loss_clip": 1.01973581, + "balance_loss_mlp": 1.03618407, + "epoch": 0.6185480234480686, + "flos": 23841920471040.0, + "grad_norm": 1.6506833358218813, + "language_loss": 0.72823429, + "learning_rate": 1.3416610989672262e-06, + "loss": 0.74957871, + "num_input_tokens_seen": 221549745, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10288, + "time_per_iteration": 2.469217538833618 + }, + { + "auxiliary_loss_clip": 0.01099107, + "auxiliary_loss_mlp": 0.01032543, + "balance_loss_clip": 1.02139127, + "balance_loss_mlp": 1.03515327, + "epoch": 0.6186081467007365, + "flos": 45477595774080.0, + "grad_norm": 1.4866118467097145, + "language_loss": 0.72703552, + "learning_rate": 1.3412933541315296e-06, + "loss": 0.74835199, + "num_input_tokens_seen": 221572455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 10289, + "time_per_iteration": 2.6342008113861084 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.01030634, + "balance_loss_clip": 1.01854038, + "balance_loss_mlp": 1.03557706, + "epoch": 0.6186682699534045, + "flos": 23550864566400.0, + "grad_norm": 1.5657368356700847, + "language_loss": 0.79090887, + "learning_rate": 1.340925634274056e-06, + "loss": 0.81226832, + "num_input_tokens_seen": 221591325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10290, + "time_per_iteration": 2.4762990474700928 + }, + { + "auxiliary_loss_clip": 0.01106885, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.01883626, + "balance_loss_mlp": 1.03720856, + "epoch": 0.6187283932060724, + "flos": 25774201630080.0, + "grad_norm": 1.6315677183830801, + "language_loss": 0.81586653, + "learning_rate": 1.3405579394087475e-06, + "loss": 0.83724689, + "num_input_tokens_seen": 221611640, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10291, + "time_per_iteration": 2.46706223487854 + }, + { + "auxiliary_loss_clip": 0.01103179, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01962161, + "balance_loss_mlp": 1.0360167, + "epoch": 0.6187885164587404, + "flos": 25265203954560.0, + "grad_norm": 1.907685541449211, + "language_loss": 0.77654225, + "learning_rate": 1.3401902695495487e-06, + "loss": 0.7978884, + "num_input_tokens_seen": 221631225, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10292, + "time_per_iteration": 2.4810614585876465 + }, + { + "auxiliary_loss_clip": 0.01111234, + "auxiliary_loss_mlp": 0.01038447, + "balance_loss_clip": 1.02459502, + "balance_loss_mlp": 1.03891051, + "epoch": 0.6188486397114084, + "flos": 26250772302720.0, + "grad_norm": 1.9028504578301217, + "language_loss": 0.737167, + "learning_rate": 1.339822624710401e-06, + "loss": 0.75866383, + "num_input_tokens_seen": 221651035, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 10293, + "time_per_iteration": 2.516528844833374 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01032743, + "balance_loss_clip": 1.02110207, + "balance_loss_mlp": 1.03902757, + "epoch": 0.6189087629640764, + "flos": 20923388605440.0, + "grad_norm": 2.0122354574742602, + "language_loss": 0.83089775, + "learning_rate": 1.3394550049052454e-06, + "loss": 0.85229266, + "num_input_tokens_seen": 221671300, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 10294, + "time_per_iteration": 2.499441623687744 + }, + { + "auxiliary_loss_clip": 0.01106207, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.01833987, + "balance_loss_mlp": 1.03719449, + "epoch": 0.6189688862167443, + "flos": 14829814874880.0, + "grad_norm": 2.183209160789612, + "language_loss": 0.70951724, + "learning_rate": 1.3390874101480225e-06, + "loss": 0.73088086, + "num_input_tokens_seen": 221687320, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10295, + "time_per_iteration": 2.4442856311798096 + }, + { + "auxiliary_loss_clip": 0.01105622, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.02100849, + "balance_loss_mlp": 1.03787184, + "epoch": 0.6190290094694123, + "flos": 24285058560000.0, + "grad_norm": 1.6245756110977043, + "language_loss": 0.70113528, + "learning_rate": 1.3387198404526705e-06, + "loss": 0.72252154, + "num_input_tokens_seen": 221710175, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10296, + "time_per_iteration": 2.636453866958618 + }, + { + "auxiliary_loss_clip": 0.01108503, + "auxiliary_loss_mlp": 0.01033341, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.03864932, + "epoch": 0.6190891327220802, + "flos": 22529457423360.0, + "grad_norm": 2.076478179664887, + "language_loss": 0.71677291, + "learning_rate": 1.3383522958331287e-06, + "loss": 0.73819137, + "num_input_tokens_seen": 221728145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 10297, + "time_per_iteration": 2.487703800201416 + }, + { + "auxiliary_loss_clip": 0.01028294, + "auxiliary_loss_mlp": 0.00997518, + "balance_loss_clip": 0.9962309, + "balance_loss_mlp": 1.00701296, + "epoch": 0.6191492559747482, + "flos": 67729357152000.0, + "grad_norm": 0.8802858185205813, + "language_loss": 0.64150029, + "learning_rate": 1.3379847763033345e-06, + "loss": 0.66175842, + "num_input_tokens_seen": 221786100, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21289062, + "step": 10298, + "time_per_iteration": 2.959296226501465 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.0202961, + "balance_loss_mlp": 1.03661466, + "epoch": 0.6192093792274163, + "flos": 22346672088960.0, + "grad_norm": 1.6984885948159927, + "language_loss": 0.74105954, + "learning_rate": 1.3376172818772236e-06, + "loss": 0.76244044, + "num_input_tokens_seen": 221806450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10299, + "time_per_iteration": 2.450899124145508 + }, + { + "auxiliary_loss_clip": 0.01109628, + "auxiliary_loss_mlp": 0.01032817, + "balance_loss_clip": 1.02075887, + "balance_loss_mlp": 1.0376761, + "epoch": 0.6192695024800842, + "flos": 13553944807680.0, + "grad_norm": 1.8344519767478165, + "language_loss": 0.68278986, + "learning_rate": 1.337249812568732e-06, + "loss": 0.70421433, + "num_input_tokens_seen": 221823330, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 10300, + "time_per_iteration": 2.4547624588012695 + }, + { + "auxiliary_loss_clip": 0.01108413, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.0215776, + "balance_loss_mlp": 1.03889441, + "epoch": 0.6193296257327522, + "flos": 17415310815360.0, + "grad_norm": 1.8244494071351975, + "language_loss": 0.66936946, + "learning_rate": 1.3368823683917939e-06, + "loss": 0.69078887, + "num_input_tokens_seen": 221839360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10301, + "time_per_iteration": 2.467451810836792 + }, + { + "auxiliary_loss_clip": 0.01104043, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.018013, + "balance_loss_mlp": 1.03542924, + "epoch": 0.6193897489854201, + "flos": 31101118450560.0, + "grad_norm": 2.0193419698977317, + "language_loss": 0.73042768, + "learning_rate": 1.3365149493603424e-06, + "loss": 0.75176305, + "num_input_tokens_seen": 221859465, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10302, + "time_per_iteration": 4.012500762939453 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.01923048, + "balance_loss_mlp": 1.03690219, + "epoch": 0.6194498722380881, + "flos": 19134031662720.0, + "grad_norm": 1.8664060987198585, + "language_loss": 0.80371857, + "learning_rate": 1.3361475554883107e-06, + "loss": 0.82508844, + "num_input_tokens_seen": 221878555, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10303, + "time_per_iteration": 2.437244176864624 + }, + { + "auxiliary_loss_clip": 0.01107499, + "auxiliary_loss_mlp": 0.01031441, + "balance_loss_clip": 1.01827395, + "balance_loss_mlp": 1.03684223, + "epoch": 0.619509995490756, + "flos": 21835088634240.0, + "grad_norm": 1.5617333087985545, + "language_loss": 0.76300073, + "learning_rate": 1.3357801867896307e-06, + "loss": 0.78439015, + "num_input_tokens_seen": 221898790, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10304, + "time_per_iteration": 3.8231778144836426 + }, + { + "auxiliary_loss_clip": 0.01110648, + "auxiliary_loss_mlp": 0.01034085, + "balance_loss_clip": 1.02169371, + "balance_loss_mlp": 1.03864741, + "epoch": 0.619570118743424, + "flos": 23806548552960.0, + "grad_norm": 2.062841626901626, + "language_loss": 0.77207863, + "learning_rate": 1.3354128432782324e-06, + "loss": 0.79352599, + "num_input_tokens_seen": 221918875, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.71875, + "step": 10305, + "time_per_iteration": 5.318151473999023 + }, + { + "auxiliary_loss_clip": 0.01111243, + "auxiliary_loss_mlp": 0.01032824, + "balance_loss_clip": 1.01918018, + "balance_loss_mlp": 1.03931832, + "epoch": 0.619630241996092, + "flos": 21101612912640.0, + "grad_norm": 1.6773478766205938, + "language_loss": 0.78826416, + "learning_rate": 1.335045524968045e-06, + "loss": 0.80970484, + "num_input_tokens_seen": 221937895, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 10306, + "time_per_iteration": 2.4717702865600586 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01026237, + "balance_loss_clip": 1.01576495, + "balance_loss_mlp": 1.03520381, + "epoch": 0.61969036524876, + "flos": 27308269635840.0, + "grad_norm": 1.579957954838489, + "language_loss": 0.79917157, + "learning_rate": 1.3346782318729988e-06, + "loss": 0.82043117, + "num_input_tokens_seen": 221955920, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 10307, + "time_per_iteration": 2.51257586479187 + }, + { + "auxiliary_loss_clip": 0.01027759, + "auxiliary_loss_mlp": 0.00997846, + "balance_loss_clip": 0.99666041, + "balance_loss_mlp": 1.00661421, + "epoch": 0.6197504885014279, + "flos": 51648955384320.0, + "grad_norm": 0.8254095728079679, + "language_loss": 0.59419918, + "learning_rate": 1.3343109640070203e-06, + "loss": 0.61445522, + "num_input_tokens_seen": 222011405, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.2109375, + "step": 10308, + "time_per_iteration": 3.087841510772705 + }, + { + "auxiliary_loss_clip": 0.01102523, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01634467, + "balance_loss_mlp": 1.0360744, + "epoch": 0.6198106117540959, + "flos": 30557107992960.0, + "grad_norm": 1.8503774737615284, + "language_loss": 0.67855436, + "learning_rate": 1.333943721384037e-06, + "loss": 0.69984901, + "num_input_tokens_seen": 222034545, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 10309, + "time_per_iteration": 2.516601800918579 + }, + { + "auxiliary_loss_clip": 0.01105412, + "auxiliary_loss_mlp": 0.01031211, + "balance_loss_clip": 1.01924789, + "balance_loss_mlp": 1.03811872, + "epoch": 0.6198707350067638, + "flos": 18909733184640.0, + "grad_norm": 1.5770368221477629, + "language_loss": 0.71985435, + "learning_rate": 1.3335765040179746e-06, + "loss": 0.74122059, + "num_input_tokens_seen": 222052690, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 10310, + "time_per_iteration": 2.4543659687042236 + }, + { + "auxiliary_loss_clip": 0.01109202, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01870525, + "balance_loss_mlp": 1.03908801, + "epoch": 0.6199308582594318, + "flos": 21433858738560.0, + "grad_norm": 1.8624693813193853, + "language_loss": 0.78939658, + "learning_rate": 1.3332093119227573e-06, + "loss": 0.81080884, + "num_input_tokens_seen": 222069095, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10311, + "time_per_iteration": 2.4637980461120605 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01032184, + "balance_loss_clip": 1.01957762, + "balance_loss_mlp": 1.03495574, + "epoch": 0.6199909815120999, + "flos": 18407379525120.0, + "grad_norm": 1.9506851073512315, + "language_loss": 0.72994781, + "learning_rate": 1.3328421451123105e-06, + "loss": 0.75132203, + "num_input_tokens_seen": 222087360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10312, + "time_per_iteration": 2.4388468265533447 + }, + { + "auxiliary_loss_clip": 0.0110771, + "auxiliary_loss_mlp": 0.01035173, + "balance_loss_clip": 1.02284074, + "balance_loss_mlp": 1.0381484, + "epoch": 0.6200511047647678, + "flos": 21466860359040.0, + "grad_norm": 2.1707252036738502, + "language_loss": 0.71927798, + "learning_rate": 1.3324750036005557e-06, + "loss": 0.7407068, + "num_input_tokens_seen": 222106130, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10313, + "time_per_iteration": 2.4896764755249023 + }, + { + "auxiliary_loss_clip": 0.01108842, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01780641, + "balance_loss_mlp": 1.03789592, + "epoch": 0.6201112280174358, + "flos": 18215903099520.0, + "grad_norm": 1.8177190018334353, + "language_loss": 0.78071815, + "learning_rate": 1.332107887401416e-06, + "loss": 0.80211347, + "num_input_tokens_seen": 222123125, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10314, + "time_per_iteration": 2.4607138633728027 + }, + { + "auxiliary_loss_clip": 0.01105035, + "auxiliary_loss_mlp": 0.01033382, + "balance_loss_clip": 1.02113914, + "balance_loss_mlp": 1.03498077, + "epoch": 0.6201713512701037, + "flos": 20011185786240.0, + "grad_norm": 1.7685018834569248, + "language_loss": 0.78155088, + "learning_rate": 1.331740796528812e-06, + "loss": 0.80293512, + "num_input_tokens_seen": 222140655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 10315, + "time_per_iteration": 2.428445816040039 + }, + { + "auxiliary_loss_clip": 0.01109232, + "auxiliary_loss_mlp": 0.01033411, + "balance_loss_clip": 1.02145982, + "balance_loss_mlp": 1.03922391, + "epoch": 0.6202314745227717, + "flos": 22487692884480.0, + "grad_norm": 2.596321726125175, + "language_loss": 0.76265639, + "learning_rate": 1.3313737309966641e-06, + "loss": 0.78408277, + "num_input_tokens_seen": 222160450, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 10316, + "time_per_iteration": 2.463766098022461 + }, + { + "auxiliary_loss_clip": 0.01105873, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.01823497, + "balance_loss_mlp": 1.0344758, + "epoch": 0.6202915977754396, + "flos": 26828682220800.0, + "grad_norm": 1.9163692596467958, + "language_loss": 0.77438551, + "learning_rate": 1.3310066908188915e-06, + "loss": 0.79575109, + "num_input_tokens_seen": 222179170, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 10317, + "time_per_iteration": 2.468884229660034 + }, + { + "auxiliary_loss_clip": 0.01027239, + "auxiliary_loss_mlp": 0.00999035, + "balance_loss_clip": 0.99786037, + "balance_loss_mlp": 1.00593257, + "epoch": 0.6203517210281076, + "flos": 62742694890240.0, + "grad_norm": 0.6919425802260456, + "language_loss": 0.59057474, + "learning_rate": 1.3306396760094122e-06, + "loss": 0.61083746, + "num_input_tokens_seen": 222242660, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.21289062, + "step": 10318, + "time_per_iteration": 3.090552568435669 + }, + { + "auxiliary_loss_clip": 0.0110802, + "auxiliary_loss_mlp": 0.01034141, + "balance_loss_clip": 1.02163601, + "balance_loss_mlp": 1.03937101, + "epoch": 0.6204118442807756, + "flos": 23404277162880.0, + "grad_norm": 1.6841357417658411, + "language_loss": 0.77685571, + "learning_rate": 1.330272686582143e-06, + "loss": 0.79827732, + "num_input_tokens_seen": 222262170, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10319, + "time_per_iteration": 2.4693212509155273 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.0103113, + "balance_loss_clip": 1.01963234, + "balance_loss_mlp": 1.03732896, + "epoch": 0.6204719675334436, + "flos": 20193647898240.0, + "grad_norm": 2.3469109769721377, + "language_loss": 0.66256416, + "learning_rate": 1.3299057225510013e-06, + "loss": 0.68392456, + "num_input_tokens_seen": 222280375, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 10320, + "time_per_iteration": 2.447006940841675 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.01918244, + "balance_loss_mlp": 1.03645897, + "epoch": 0.6205320907861115, + "flos": 13188050916480.0, + "grad_norm": 1.6640363363170714, + "language_loss": 0.76396954, + "learning_rate": 1.3295387839299013e-06, + "loss": 0.78529894, + "num_input_tokens_seen": 222297325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10321, + "time_per_iteration": 2.439819574356079 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.01653743, + "balance_loss_mlp": 1.03596795, + "epoch": 0.6205922140387795, + "flos": 20668386977280.0, + "grad_norm": 1.7446721342838176, + "language_loss": 0.73165452, + "learning_rate": 1.329171870732758e-06, + "loss": 0.75296265, + "num_input_tokens_seen": 222317095, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10322, + "time_per_iteration": 2.4455277919769287 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.01024456, + "balance_loss_clip": 1.01309574, + "balance_loss_mlp": 1.03739095, + "epoch": 0.6206523372914474, + "flos": 23877831093120.0, + "grad_norm": 2.5506684456453157, + "language_loss": 0.73217744, + "learning_rate": 1.3288049829734845e-06, + "loss": 0.75347304, + "num_input_tokens_seen": 222337055, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10323, + "time_per_iteration": 2.4893054962158203 + }, + { + "auxiliary_loss_clip": 0.01114414, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02086651, + "balance_loss_mlp": 1.04062796, + "epoch": 0.6207124605441154, + "flos": 13406603218560.0, + "grad_norm": 2.3064550645164354, + "language_loss": 0.58989835, + "learning_rate": 1.3284381206659933e-06, + "loss": 0.61137784, + "num_input_tokens_seen": 222354515, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.73828125, + "step": 10324, + "time_per_iteration": 2.4318976402282715 + }, + { + "auxiliary_loss_clip": 0.01108806, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.01746607, + "balance_loss_mlp": 1.03886914, + "epoch": 0.6207725837967835, + "flos": 18916341287040.0, + "grad_norm": 2.054520538169497, + "language_loss": 0.76530892, + "learning_rate": 1.3280712838241956e-06, + "loss": 0.78669918, + "num_input_tokens_seen": 222372755, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 10325, + "time_per_iteration": 2.4457478523254395 + }, + { + "auxiliary_loss_clip": 0.01107557, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01502085, + "balance_loss_mlp": 1.03696799, + "epoch": 0.6208327070494514, + "flos": 23980211832960.0, + "grad_norm": 1.7674606629656198, + "language_loss": 0.72749656, + "learning_rate": 1.327704472462003e-06, + "loss": 0.74884826, + "num_input_tokens_seen": 222391380, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 10326, + "time_per_iteration": 2.469116687774658 + }, + { + "auxiliary_loss_clip": 0.01108969, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02126646, + "balance_loss_mlp": 1.03798246, + "epoch": 0.6208928303021194, + "flos": 22820405587200.0, + "grad_norm": 3.158836515239834, + "language_loss": 0.73515177, + "learning_rate": 1.3273376865933234e-06, + "loss": 0.75657719, + "num_input_tokens_seen": 222411165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10327, + "time_per_iteration": 2.4524545669555664 + }, + { + "auxiliary_loss_clip": 0.01109109, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01832175, + "balance_loss_mlp": 1.03808546, + "epoch": 0.6209529535547873, + "flos": 17564519911680.0, + "grad_norm": 2.016240551650266, + "language_loss": 0.7945962, + "learning_rate": 1.326970926232066e-06, + "loss": 0.81599987, + "num_input_tokens_seen": 222428110, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 10328, + "time_per_iteration": 2.4385621547698975 + }, + { + "auxiliary_loss_clip": 0.01108206, + "auxiliary_loss_mlp": 0.01036726, + "balance_loss_clip": 1.02380335, + "balance_loss_mlp": 1.03790045, + "epoch": 0.6210130768074553, + "flos": 22011912311040.0, + "grad_norm": 1.9358397907066565, + "language_loss": 0.77753472, + "learning_rate": 1.3266041913921396e-06, + "loss": 0.79898405, + "num_input_tokens_seen": 222446385, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10329, + "time_per_iteration": 2.462999105453491 + }, + { + "auxiliary_loss_clip": 0.01028614, + "auxiliary_loss_mlp": 0.01005403, + "balance_loss_clip": 1.00426447, + "balance_loss_mlp": 1.00714183, + "epoch": 0.6210732000601232, + "flos": 63676873854720.0, + "grad_norm": 0.8271913018767197, + "language_loss": 0.62140441, + "learning_rate": 1.3262374820874484e-06, + "loss": 0.64174461, + "num_input_tokens_seen": 222502150, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.21484375, + "step": 10330, + "time_per_iteration": 3.0160677433013916 + }, + { + "auxiliary_loss_clip": 0.01111605, + "auxiliary_loss_mlp": 0.0103314, + "balance_loss_clip": 1.02014053, + "balance_loss_mlp": 1.03902602, + "epoch": 0.6211333233127913, + "flos": 24243365848320.0, + "grad_norm": 2.119882521955809, + "language_loss": 0.77734917, + "learning_rate": 1.3258707983319002e-06, + "loss": 0.79879665, + "num_input_tokens_seen": 222519880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10331, + "time_per_iteration": 2.489560842514038 + }, + { + "auxiliary_loss_clip": 0.01110147, + "auxiliary_loss_mlp": 0.01034338, + "balance_loss_clip": 1.0218091, + "balance_loss_mlp": 1.0385623, + "epoch": 0.6211934465654592, + "flos": 16943803960320.0, + "grad_norm": 2.1826239313183486, + "language_loss": 0.67408252, + "learning_rate": 1.3255041401393992e-06, + "loss": 0.69552743, + "num_input_tokens_seen": 222538545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 10332, + "time_per_iteration": 2.425645112991333 + }, + { + "auxiliary_loss_clip": 0.0110723, + "auxiliary_loss_mlp": 0.01027428, + "balance_loss_clip": 1.01532817, + "balance_loss_mlp": 1.03766382, + "epoch": 0.6212535698181272, + "flos": 15267386355840.0, + "grad_norm": 1.6359189592805878, + "language_loss": 0.76677281, + "learning_rate": 1.3251375075238476e-06, + "loss": 0.78811944, + "num_input_tokens_seen": 222556935, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10333, + "time_per_iteration": 2.4364230632781982 + }, + { + "auxiliary_loss_clip": 0.01105905, + "auxiliary_loss_mlp": 0.01028787, + "balance_loss_clip": 1.01689601, + "balance_loss_mlp": 1.03827369, + "epoch": 0.6213136930707951, + "flos": 13443950384640.0, + "grad_norm": 2.0485781293514793, + "language_loss": 0.69575661, + "learning_rate": 1.3247709004991507e-06, + "loss": 0.71710348, + "num_input_tokens_seen": 222574035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 10334, + "time_per_iteration": 2.4257168769836426 + }, + { + "auxiliary_loss_clip": 0.01107391, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01766801, + "balance_loss_mlp": 1.03944373, + "epoch": 0.6213738163234631, + "flos": 18111223889280.0, + "grad_norm": 2.0078352045306507, + "language_loss": 0.70201457, + "learning_rate": 1.3244043190792078e-06, + "loss": 0.72337818, + "num_input_tokens_seen": 222592290, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 10335, + "time_per_iteration": 2.47383451461792 + }, + { + "auxiliary_loss_clip": 0.01102603, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.01889277, + "balance_loss_mlp": 1.03563762, + "epoch": 0.621433939576131, + "flos": 25337348421120.0, + "grad_norm": 1.47099412595651, + "language_loss": 0.80045199, + "learning_rate": 1.3240377632779213e-06, + "loss": 0.82178366, + "num_input_tokens_seen": 222612805, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 10336, + "time_per_iteration": 2.476863145828247 + }, + { + "auxiliary_loss_clip": 0.01103545, + "auxiliary_loss_mlp": 0.01027933, + "balance_loss_clip": 1.01593423, + "balance_loss_mlp": 1.03639817, + "epoch": 0.621494062828799, + "flos": 22565619440640.0, + "grad_norm": 1.8768203229000895, + "language_loss": 0.73504305, + "learning_rate": 1.3236712331091907e-06, + "loss": 0.75635779, + "num_input_tokens_seen": 222632260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10337, + "time_per_iteration": 2.4732797145843506 + }, + { + "auxiliary_loss_clip": 0.01109544, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01764417, + "balance_loss_mlp": 1.03801644, + "epoch": 0.621554186081467, + "flos": 27417976750080.0, + "grad_norm": 1.8614452301224431, + "language_loss": 0.63164204, + "learning_rate": 1.3233047285869145e-06, + "loss": 0.65304667, + "num_input_tokens_seen": 222653570, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 10338, + "time_per_iteration": 2.4973182678222656 + }, + { + "auxiliary_loss_clip": 0.01106095, + "auxiliary_loss_mlp": 0.01029305, + "balance_loss_clip": 1.01755667, + "balance_loss_mlp": 1.03789639, + "epoch": 0.621614309334135, + "flos": 22346815743360.0, + "grad_norm": 2.390170977530988, + "language_loss": 0.71337169, + "learning_rate": 1.322938249724991e-06, + "loss": 0.73472571, + "num_input_tokens_seen": 222672480, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 10339, + "time_per_iteration": 2.47871994972229 + }, + { + "auxiliary_loss_clip": 0.01103361, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.01571906, + "balance_loss_mlp": 1.03734398, + "epoch": 0.621674432586803, + "flos": 19281229597440.0, + "grad_norm": 1.5831202152699189, + "language_loss": 0.69323343, + "learning_rate": 1.3225717965373166e-06, + "loss": 0.71454197, + "num_input_tokens_seen": 222691200, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 10340, + "time_per_iteration": 2.445570707321167 + }, + { + "auxiliary_loss_clip": 0.0110187, + "auxiliary_loss_mlp": 0.01027129, + "balance_loss_clip": 1.0154407, + "balance_loss_mlp": 1.03529525, + "epoch": 0.6217345558394709, + "flos": 21609533180160.0, + "grad_norm": 3.3727615102843513, + "language_loss": 0.68661916, + "learning_rate": 1.322205369037788e-06, + "loss": 0.70790917, + "num_input_tokens_seen": 222709975, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 10341, + "time_per_iteration": 2.439035415649414 + }, + { + "auxiliary_loss_clip": 0.01107972, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01605105, + "balance_loss_mlp": 1.03783154, + "epoch": 0.6217946790921389, + "flos": 18004102554240.0, + "grad_norm": 2.06494623621423, + "language_loss": 0.81278366, + "learning_rate": 1.321838967240299e-06, + "loss": 0.83415759, + "num_input_tokens_seen": 222729005, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10342, + "time_per_iteration": 2.445607900619507 + }, + { + "auxiliary_loss_clip": 0.01027883, + "auxiliary_loss_mlp": 0.01003385, + "balance_loss_clip": 1.00214529, + "balance_loss_mlp": 1.00662279, + "epoch": 0.6218548023448068, + "flos": 61973631768960.0, + "grad_norm": 0.7785995287469357, + "language_loss": 0.57325292, + "learning_rate": 1.3214725911587452e-06, + "loss": 0.59356558, + "num_input_tokens_seen": 222786090, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.21289062, + "step": 10343, + "time_per_iteration": 4.364051342010498 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01023841, + "balance_loss_clip": 1.01308846, + "balance_loss_mlp": 1.03629875, + "epoch": 0.6219149255974749, + "flos": 25739152934400.0, + "grad_norm": 1.780281281905301, + "language_loss": 0.72907692, + "learning_rate": 1.3211062408070184e-06, + "loss": 0.75033712, + "num_input_tokens_seen": 222806100, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 10344, + "time_per_iteration": 2.4766275882720947 + }, + { + "auxiliary_loss_clip": 0.01107045, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02209353, + "balance_loss_mlp": 1.03881705, + "epoch": 0.6219750488501428, + "flos": 25411073086080.0, + "grad_norm": 1.738872083076136, + "language_loss": 0.59990644, + "learning_rate": 1.3207399161990105e-06, + "loss": 0.62131059, + "num_input_tokens_seen": 222826575, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 10345, + "time_per_iteration": 2.541123390197754 + }, + { + "auxiliary_loss_clip": 0.01104933, + "auxiliary_loss_mlp": 0.01031607, + "balance_loss_clip": 1.01948929, + "balance_loss_mlp": 1.0357126, + "epoch": 0.6220351721028108, + "flos": 20047383717120.0, + "grad_norm": 1.9219019260210024, + "language_loss": 0.78273392, + "learning_rate": 1.320373617348614e-06, + "loss": 0.80409932, + "num_input_tokens_seen": 222845285, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10346, + "time_per_iteration": 5.315351724624634 + }, + { + "auxiliary_loss_clip": 0.01106477, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01602221, + "balance_loss_mlp": 1.03580999, + "epoch": 0.6220952953554787, + "flos": 27488397363840.0, + "grad_norm": 1.6418210301478282, + "language_loss": 0.71802652, + "learning_rate": 1.3200073442697171e-06, + "loss": 0.73937929, + "num_input_tokens_seen": 222864575, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 10347, + "time_per_iteration": 2.497929334640503 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.01028399, + "balance_loss_clip": 1.01597118, + "balance_loss_mlp": 1.03503013, + "epoch": 0.6221554186081467, + "flos": 19207612673280.0, + "grad_norm": 1.625266135857152, + "language_loss": 0.71975756, + "learning_rate": 1.3196410969762108e-06, + "loss": 0.74106789, + "num_input_tokens_seen": 222884420, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 10348, + "time_per_iteration": 3.9235568046569824 + }, + { + "auxiliary_loss_clip": 0.01026634, + "auxiliary_loss_mlp": 0.01006199, + "balance_loss_clip": 1.00494766, + "balance_loss_mlp": 1.00541496, + "epoch": 0.6222155418608146, + "flos": 62950939989120.0, + "grad_norm": 0.8371335682612564, + "language_loss": 0.54224485, + "learning_rate": 1.3192748754819815e-06, + "loss": 0.56257325, + "num_input_tokens_seen": 222944690, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.21289062, + "step": 10349, + "time_per_iteration": 3.0496747493743896 + }, + { + "auxiliary_loss_clip": 0.0110532, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.01496863, + "balance_loss_mlp": 1.03663087, + "epoch": 0.6222756651134826, + "flos": 22601099099520.0, + "grad_norm": 2.1582584328539594, + "language_loss": 0.69793445, + "learning_rate": 1.3189086798009173e-06, + "loss": 0.71925557, + "num_input_tokens_seen": 222962990, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10350, + "time_per_iteration": 2.470149278640747 + }, + { + "auxiliary_loss_clip": 0.0110629, + "auxiliary_loss_mlp": 0.0103389, + "balance_loss_clip": 1.02183747, + "balance_loss_mlp": 1.03684473, + "epoch": 0.6223357883661506, + "flos": 21142228216320.0, + "grad_norm": 1.9147448057982832, + "language_loss": 0.56816912, + "learning_rate": 1.3185425099469046e-06, + "loss": 0.58957094, + "num_input_tokens_seen": 222980715, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10351, + "time_per_iteration": 2.505211114883423 + }, + { + "auxiliary_loss_clip": 0.01026374, + "auxiliary_loss_mlp": 0.00993206, + "balance_loss_clip": 0.99188894, + "balance_loss_mlp": 1.00534272, + "epoch": 0.6223959116188186, + "flos": 63765071700480.0, + "grad_norm": 0.8115156894720258, + "language_loss": 0.61159444, + "learning_rate": 1.3181763659338276e-06, + "loss": 0.63179016, + "num_input_tokens_seen": 223040685, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.2109375, + "step": 10352, + "time_per_iteration": 3.021286725997925 + }, + { + "auxiliary_loss_clip": 0.01101568, + "auxiliary_loss_mlp": 0.01029379, + "balance_loss_clip": 1.017488, + "balance_loss_mlp": 1.0351944, + "epoch": 0.6224560348714866, + "flos": 22565727181440.0, + "grad_norm": 2.081556495777929, + "language_loss": 0.81940329, + "learning_rate": 1.3178102477755714e-06, + "loss": 0.84071267, + "num_input_tokens_seen": 223059000, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 10353, + "time_per_iteration": 2.496713638305664 + }, + { + "auxiliary_loss_clip": 0.010991, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.01638615, + "balance_loss_mlp": 1.03455448, + "epoch": 0.6225161581241545, + "flos": 24097748112000.0, + "grad_norm": 1.5710771766627751, + "language_loss": 0.7576375, + "learning_rate": 1.3174441554860195e-06, + "loss": 0.77890158, + "num_input_tokens_seen": 223079345, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 10354, + "time_per_iteration": 2.4855527877807617 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01028461, + "balance_loss_clip": 1.01658213, + "balance_loss_mlp": 1.03609419, + "epoch": 0.6225762813768225, + "flos": 20443513881600.0, + "grad_norm": 1.4655004554762274, + "language_loss": 0.78727663, + "learning_rate": 1.3170780890790528e-06, + "loss": 0.80859846, + "num_input_tokens_seen": 223097880, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10355, + "time_per_iteration": 2.445819616317749 + }, + { + "auxiliary_loss_clip": 0.01106453, + "auxiliary_loss_mlp": 0.01030134, + "balance_loss_clip": 1.01859486, + "balance_loss_mlp": 1.03856397, + "epoch": 0.6226364046294904, + "flos": 27198131558400.0, + "grad_norm": 1.5925757296601037, + "language_loss": 0.78048426, + "learning_rate": 1.3167120485685538e-06, + "loss": 0.80185014, + "num_input_tokens_seen": 223118185, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 10356, + "time_per_iteration": 2.4893651008605957 + }, + { + "auxiliary_loss_clip": 0.01110459, + "auxiliary_loss_mlp": 0.01031899, + "balance_loss_clip": 1.01882744, + "balance_loss_mlp": 1.0377419, + "epoch": 0.6226965278821585, + "flos": 20445776438400.0, + "grad_norm": 2.1577973787104923, + "language_loss": 0.67252231, + "learning_rate": 1.3163460339684024e-06, + "loss": 0.69394588, + "num_input_tokens_seen": 223137600, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 10357, + "time_per_iteration": 2.4467334747314453 + }, + { + "auxiliary_loss_clip": 0.01111299, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.01744306, + "balance_loss_mlp": 1.03907299, + "epoch": 0.6227566511348264, + "flos": 22162737519360.0, + "grad_norm": 2.813144519953157, + "language_loss": 0.75561357, + "learning_rate": 1.3159800452924778e-06, + "loss": 0.77703738, + "num_input_tokens_seen": 223154360, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 10358, + "time_per_iteration": 2.516791343688965 + }, + { + "auxiliary_loss_clip": 0.01104161, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.01916623, + "balance_loss_mlp": 1.03473985, + "epoch": 0.6228167743874944, + "flos": 18040875102720.0, + "grad_norm": 2.219435804709828, + "language_loss": 0.82639635, + "learning_rate": 1.3156140825546588e-06, + "loss": 0.84775025, + "num_input_tokens_seen": 223172255, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 10359, + "time_per_iteration": 2.4310834407806396 + }, + { + "auxiliary_loss_clip": 0.01102353, + "auxiliary_loss_mlp": 0.01039222, + "balance_loss_clip": 1.02763474, + "balance_loss_mlp": 1.03537011, + "epoch": 0.6228768976401623, + "flos": 17742851959680.0, + "grad_norm": 2.303439975038256, + "language_loss": 0.73551476, + "learning_rate": 1.315248145768822e-06, + "loss": 0.75693059, + "num_input_tokens_seen": 223186965, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 10360, + "time_per_iteration": 2.4032440185546875 + }, + { + "auxiliary_loss_clip": 0.01104376, + "auxiliary_loss_mlp": 0.01033974, + "balance_loss_clip": 1.02152276, + "balance_loss_mlp": 1.03514135, + "epoch": 0.6229370208928303, + "flos": 17894934144000.0, + "grad_norm": 2.1872491258589877, + "language_loss": 0.78007793, + "learning_rate": 1.3148822349488442e-06, + "loss": 0.8014614, + "num_input_tokens_seen": 223206045, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 10361, + "time_per_iteration": 2.432612419128418 + }, + { + "auxiliary_loss_clip": 0.01105247, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.01694417, + "balance_loss_mlp": 1.03777361, + "epoch": 0.6229971441454982, + "flos": 17347763289600.0, + "grad_norm": 2.0406207393391322, + "language_loss": 0.67669165, + "learning_rate": 1.3145163501086005e-06, + "loss": 0.69802934, + "num_input_tokens_seen": 223224820, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10362, + "time_per_iteration": 2.4279119968414307 + }, + { + "auxiliary_loss_clip": 0.01105655, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01772738, + "balance_loss_mlp": 1.03628147, + "epoch": 0.6230572673981662, + "flos": 29241376807680.0, + "grad_norm": 1.866995951195316, + "language_loss": 0.67914844, + "learning_rate": 1.3141504912619658e-06, + "loss": 0.70050412, + "num_input_tokens_seen": 223243205, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10363, + "time_per_iteration": 2.5570461750030518 + }, + { + "auxiliary_loss_clip": 0.01107735, + "auxiliary_loss_mlp": 0.01034747, + "balance_loss_clip": 1.02156806, + "balance_loss_mlp": 1.03598118, + "epoch": 0.6231173906508342, + "flos": 16325961096960.0, + "grad_norm": 1.8313003501061587, + "language_loss": 0.86500871, + "learning_rate": 1.3137846584228127e-06, + "loss": 0.88643348, + "num_input_tokens_seen": 223261370, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.71875, + "step": 10364, + "time_per_iteration": 2.4293837547302246 + }, + { + "auxiliary_loss_clip": 0.01025186, + "auxiliary_loss_mlp": 0.01006976, + "balance_loss_clip": 1.00571883, + "balance_loss_mlp": 1.00405002, + "epoch": 0.6231775139035022, + "flos": 68702032517760.0, + "grad_norm": 0.884662336082659, + "language_loss": 0.60777593, + "learning_rate": 1.313418851605015e-06, + "loss": 0.62809759, + "num_input_tokens_seen": 223315050, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.2109375, + "step": 10365, + "time_per_iteration": 3.0822458267211914 + }, + { + "auxiliary_loss_clip": 0.01111747, + "auxiliary_loss_mlp": 0.01039491, + "balance_loss_clip": 1.02530479, + "balance_loss_mlp": 1.03808904, + "epoch": 0.6232376371561702, + "flos": 19821038163840.0, + "grad_norm": 2.2798464083102576, + "language_loss": 0.75205708, + "learning_rate": 1.3130530708224427e-06, + "loss": 0.77356946, + "num_input_tokens_seen": 223332130, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73828125, + "step": 10366, + "time_per_iteration": 2.441955804824829 + }, + { + "auxiliary_loss_clip": 0.01108704, + "auxiliary_loss_mlp": 0.01040835, + "balance_loss_clip": 1.02833033, + "balance_loss_mlp": 1.03776455, + "epoch": 0.6232977604088381, + "flos": 23258264376960.0, + "grad_norm": 2.0199414320321725, + "language_loss": 0.76469356, + "learning_rate": 1.3126873160889665e-06, + "loss": 0.78618896, + "num_input_tokens_seen": 223351605, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.7109375, + "step": 10367, + "time_per_iteration": 2.477055072784424 + }, + { + "auxiliary_loss_clip": 0.01105026, + "auxiliary_loss_mlp": 0.01034491, + "balance_loss_clip": 1.02271938, + "balance_loss_mlp": 1.03831315, + "epoch": 0.6233578836615061, + "flos": 21106425335040.0, + "grad_norm": 1.4367646696128493, + "language_loss": 0.78561807, + "learning_rate": 1.312321587418457e-06, + "loss": 0.80701321, + "num_input_tokens_seen": 223372090, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 10368, + "time_per_iteration": 2.4565787315368652 + }, + { + "auxiliary_loss_clip": 0.01108362, + "auxiliary_loss_mlp": 0.01031753, + "balance_loss_clip": 1.01959956, + "balance_loss_mlp": 1.03783059, + "epoch": 0.623418006914174, + "flos": 23769416868480.0, + "grad_norm": 1.854629496919494, + "language_loss": 0.68463397, + "learning_rate": 1.3119558848247811e-06, + "loss": 0.70603514, + "num_input_tokens_seen": 223390110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 10369, + "time_per_iteration": 2.495943069458008 + }, + { + "auxiliary_loss_clip": 0.01107955, + "auxiliary_loss_mlp": 0.0103761, + "balance_loss_clip": 1.02470601, + "balance_loss_mlp": 1.03846693, + "epoch": 0.6234781301668421, + "flos": 17890480857600.0, + "grad_norm": 2.0672458586121922, + "language_loss": 0.87758917, + "learning_rate": 1.3115902083218072e-06, + "loss": 0.89904487, + "num_input_tokens_seen": 223404205, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 10370, + "time_per_iteration": 2.4028708934783936 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.01450515, + "balance_loss_mlp": 1.03551197, + "epoch": 0.62353825341951, + "flos": 26175503352960.0, + "grad_norm": 1.4687473894600929, + "language_loss": 0.65925562, + "learning_rate": 1.311224557923402e-06, + "loss": 0.68054819, + "num_input_tokens_seen": 223424855, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 10371, + "time_per_iteration": 2.4908487796783447 + }, + { + "auxiliary_loss_clip": 0.01099208, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01929259, + "balance_loss_mlp": 1.03462815, + "epoch": 0.623598376672178, + "flos": 31139902160640.0, + "grad_norm": 1.308988821713543, + "language_loss": 0.77547729, + "learning_rate": 1.3108589336434298e-06, + "loss": 0.79676664, + "num_input_tokens_seen": 223447225, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6484375, + "step": 10372, + "time_per_iteration": 2.5180232524871826 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01769924, + "balance_loss_mlp": 1.03540146, + "epoch": 0.6236584999248459, + "flos": 23730202195200.0, + "grad_norm": 1.565588018128666, + "language_loss": 0.77423698, + "learning_rate": 1.3104933354957568e-06, + "loss": 0.79559469, + "num_input_tokens_seen": 223467520, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10373, + "time_per_iteration": 2.4661612510681152 + }, + { + "auxiliary_loss_clip": 0.01101212, + "auxiliary_loss_mlp": 0.01025569, + "balance_loss_clip": 1.01429188, + "balance_loss_mlp": 1.03523397, + "epoch": 0.6237186231775139, + "flos": 21762764599680.0, + "grad_norm": 1.4815417355827754, + "language_loss": 0.69228935, + "learning_rate": 1.3101277634942448e-06, + "loss": 0.71355724, + "num_input_tokens_seen": 223488130, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10374, + "time_per_iteration": 2.473937511444092 + }, + { + "auxiliary_loss_clip": 0.0110711, + "auxiliary_loss_mlp": 0.01031094, + "balance_loss_clip": 1.01916742, + "balance_loss_mlp": 1.03731394, + "epoch": 0.6237787464301818, + "flos": 14939486075520.0, + "grad_norm": 1.723426878177341, + "language_loss": 0.77033317, + "learning_rate": 1.3097622176527577e-06, + "loss": 0.79171526, + "num_input_tokens_seen": 223505105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 10375, + "time_per_iteration": 2.437490463256836 + }, + { + "auxiliary_loss_clip": 0.01104528, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.0170275, + "balance_loss_mlp": 1.0379982, + "epoch": 0.6238388696828499, + "flos": 35590311302400.0, + "grad_norm": 1.4731613062232216, + "language_loss": 0.70344281, + "learning_rate": 1.3093966979851566e-06, + "loss": 0.72477418, + "num_input_tokens_seen": 223528065, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10376, + "time_per_iteration": 2.6377809047698975 + }, + { + "auxiliary_loss_clip": 0.01108576, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01712978, + "balance_loss_mlp": 1.03811753, + "epoch": 0.6238989929355178, + "flos": 23623511823360.0, + "grad_norm": 2.3241172647924837, + "language_loss": 0.76568282, + "learning_rate": 1.309031204505301e-06, + "loss": 0.78706658, + "num_input_tokens_seen": 223547305, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10377, + "time_per_iteration": 2.479133367538452 + }, + { + "auxiliary_loss_clip": 0.01106151, + "auxiliary_loss_mlp": 0.01029223, + "balance_loss_clip": 1.01860189, + "balance_loss_mlp": 1.03780174, + "epoch": 0.6239591161881858, + "flos": 22087468569600.0, + "grad_norm": 1.547563238627933, + "language_loss": 0.67949808, + "learning_rate": 1.308665737227052e-06, + "loss": 0.7008518, + "num_input_tokens_seen": 223567205, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.68359375, + "step": 10378, + "time_per_iteration": 2.4531919956207275 + }, + { + "auxiliary_loss_clip": 0.01104298, + "auxiliary_loss_mlp": 0.01030974, + "balance_loss_clip": 1.01901162, + "balance_loss_mlp": 1.03584397, + "epoch": 0.6240192394408538, + "flos": 24535930124160.0, + "grad_norm": 1.7868825896573544, + "language_loss": 0.76539075, + "learning_rate": 1.3083002961642675e-06, + "loss": 0.78674352, + "num_input_tokens_seen": 223586560, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10379, + "time_per_iteration": 2.489495277404785 + }, + { + "auxiliary_loss_clip": 0.01102881, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01567876, + "balance_loss_mlp": 1.0352664, + "epoch": 0.6240793626935217, + "flos": 27931930502400.0, + "grad_norm": 1.3567066837187596, + "language_loss": 0.79495847, + "learning_rate": 1.3079348813308051e-06, + "loss": 0.81626451, + "num_input_tokens_seen": 223610595, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10380, + "time_per_iteration": 2.513836145401001 + }, + { + "auxiliary_loss_clip": 0.01105137, + "auxiliary_loss_mlp": 0.01028452, + "balance_loss_clip": 1.01738906, + "balance_loss_mlp": 1.03878844, + "epoch": 0.6241394859461897, + "flos": 22892514140160.0, + "grad_norm": 1.5683522336983957, + "language_loss": 0.79919797, + "learning_rate": 1.3075694927405207e-06, + "loss": 0.82053387, + "num_input_tokens_seen": 223630230, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10381, + "time_per_iteration": 2.4719154834747314 + }, + { + "auxiliary_loss_clip": 0.01104983, + "auxiliary_loss_mlp": 0.01032235, + "balance_loss_clip": 1.02026606, + "balance_loss_mlp": 1.03598738, + "epoch": 0.6241996091988576, + "flos": 12750766744320.0, + "grad_norm": 2.2093057050572606, + "language_loss": 0.74530953, + "learning_rate": 1.3072041304072718e-06, + "loss": 0.76668167, + "num_input_tokens_seen": 223648360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 10382, + "time_per_iteration": 2.4555060863494873 + }, + { + "auxiliary_loss_clip": 0.01102662, + "auxiliary_loss_mlp": 0.01025503, + "balance_loss_clip": 1.01423788, + "balance_loss_mlp": 1.03613257, + "epoch": 0.6242597324515257, + "flos": 25851302173440.0, + "grad_norm": 1.3920284041280475, + "language_loss": 0.78429455, + "learning_rate": 1.306838794344911e-06, + "loss": 0.80557621, + "num_input_tokens_seen": 223671255, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10383, + "time_per_iteration": 2.5131173133850098 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.0102853, + "balance_loss_clip": 1.01732409, + "balance_loss_mlp": 1.03612638, + "epoch": 0.6243198557041936, + "flos": 19937712516480.0, + "grad_norm": 2.28937629159475, + "language_loss": 0.7478832, + "learning_rate": 1.3064734845672925e-06, + "loss": 0.76920247, + "num_input_tokens_seen": 223689860, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10384, + "time_per_iteration": 2.441364049911499 + }, + { + "auxiliary_loss_clip": 0.01107606, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.01817775, + "balance_loss_mlp": 1.03742898, + "epoch": 0.6243799789568616, + "flos": 18406194376320.0, + "grad_norm": 2.8855056380065993, + "language_loss": 0.66313016, + "learning_rate": 1.3061082010882694e-06, + "loss": 0.68451071, + "num_input_tokens_seen": 223707835, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 10385, + "time_per_iteration": 3.859321117401123 + }, + { + "auxiliary_loss_clip": 0.01027145, + "auxiliary_loss_mlp": 0.01001461, + "balance_loss_clip": 1.00013185, + "balance_loss_mlp": 1.0058732, + "epoch": 0.6244401022095295, + "flos": 66027587523840.0, + "grad_norm": 0.7546932463540804, + "language_loss": 0.62028766, + "learning_rate": 1.305742943921692e-06, + "loss": 0.64057362, + "num_input_tokens_seen": 223771875, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.21289062, + "step": 10386, + "time_per_iteration": 3.106778860092163 + }, + { + "auxiliary_loss_clip": 0.01105241, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.01933956, + "balance_loss_mlp": 1.03560019, + "epoch": 0.6245002254621975, + "flos": 24571266128640.0, + "grad_norm": 2.5221123793522247, + "language_loss": 0.7170524, + "learning_rate": 1.3053777130814128e-06, + "loss": 0.73842406, + "num_input_tokens_seen": 223788895, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10387, + "time_per_iteration": 2.471496105194092 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01038535, + "balance_loss_clip": 1.02493882, + "balance_loss_mlp": 1.03753424, + "epoch": 0.6245603487148654, + "flos": 29168837291520.0, + "grad_norm": 2.0526196711418345, + "language_loss": 0.65366501, + "learning_rate": 1.3050125085812798e-06, + "loss": 0.67515868, + "num_input_tokens_seen": 223810385, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 10388, + "time_per_iteration": 5.378544330596924 + }, + { + "auxiliary_loss_clip": 0.01104574, + "auxiliary_loss_mlp": 0.01027126, + "balance_loss_clip": 1.01566386, + "balance_loss_mlp": 1.03606319, + "epoch": 0.6246204719675335, + "flos": 14790097411200.0, + "grad_norm": 1.6446610432064326, + "language_loss": 0.79204857, + "learning_rate": 1.3046473304351417e-06, + "loss": 0.81336558, + "num_input_tokens_seen": 223826040, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10389, + "time_per_iteration": 3.85504150390625 + }, + { + "auxiliary_loss_clip": 0.01103741, + "auxiliary_loss_mlp": 0.01032365, + "balance_loss_clip": 1.02053928, + "balance_loss_mlp": 1.03604019, + "epoch": 0.6246805952202014, + "flos": 12493538472960.0, + "grad_norm": 1.9237323307273804, + "language_loss": 0.60423774, + "learning_rate": 1.3042821786568475e-06, + "loss": 0.62559879, + "num_input_tokens_seen": 223842300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 10390, + "time_per_iteration": 2.4648008346557617 + }, + { + "auxiliary_loss_clip": 0.01107504, + "auxiliary_loss_mlp": 0.01033109, + "balance_loss_clip": 1.02080107, + "balance_loss_mlp": 1.03688002, + "epoch": 0.6247407184728694, + "flos": 12786677366400.0, + "grad_norm": 1.88087186985586, + "language_loss": 0.77647173, + "learning_rate": 1.3039170532602416e-06, + "loss": 0.79787791, + "num_input_tokens_seen": 223858320, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 10391, + "time_per_iteration": 2.4204020500183105 + }, + { + "auxiliary_loss_clip": 0.01107712, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.01849914, + "balance_loss_mlp": 1.03854263, + "epoch": 0.6248008417255374, + "flos": 40629188960640.0, + "grad_norm": 1.9064599500175736, + "language_loss": 0.64700288, + "learning_rate": 1.3035519542591718e-06, + "loss": 0.6683929, + "num_input_tokens_seen": 223883545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 10392, + "time_per_iteration": 2.6868064403533936 + }, + { + "auxiliary_loss_clip": 0.01108711, + "auxiliary_loss_mlp": 0.010312, + "balance_loss_clip": 1.0189693, + "balance_loss_mlp": 1.03795576, + "epoch": 0.6248609649782053, + "flos": 19902017376000.0, + "grad_norm": 1.715075150061653, + "language_loss": 0.76449108, + "learning_rate": 1.3031868816674819e-06, + "loss": 0.78589016, + "num_input_tokens_seen": 223901445, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.70703125, + "step": 10393, + "time_per_iteration": 2.5002684593200684 + }, + { + "auxiliary_loss_clip": 0.01109321, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.02361488, + "balance_loss_mlp": 1.03849423, + "epoch": 0.6249210882308733, + "flos": 19682746801920.0, + "grad_norm": 1.7032519811811655, + "language_loss": 0.82738161, + "learning_rate": 1.3028218354990142e-06, + "loss": 0.84883797, + "num_input_tokens_seen": 223920170, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 10394, + "time_per_iteration": 2.5074119567871094 + }, + { + "auxiliary_loss_clip": 0.01108744, + "auxiliary_loss_mlp": 0.01032457, + "balance_loss_clip": 1.01968956, + "balance_loss_mlp": 1.03777504, + "epoch": 0.6249812114835412, + "flos": 13990726189440.0, + "grad_norm": 1.7635560366961225, + "language_loss": 0.75053072, + "learning_rate": 1.3024568157676128e-06, + "loss": 0.77194268, + "num_input_tokens_seen": 223936495, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10395, + "time_per_iteration": 2.4207139015197754 + }, + { + "auxiliary_loss_clip": 0.01106696, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.01774108, + "balance_loss_mlp": 1.03590536, + "epoch": 0.6250413347362093, + "flos": 14530031965440.0, + "grad_norm": 2.116778231139036, + "language_loss": 0.72623551, + "learning_rate": 1.302091822487119e-06, + "loss": 0.74760246, + "num_input_tokens_seen": 223950070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.70703125, + "step": 10396, + "time_per_iteration": 2.4098753929138184 + }, + { + "auxiliary_loss_clip": 0.01106314, + "auxiliary_loss_mlp": 0.01035035, + "balance_loss_clip": 1.02295291, + "balance_loss_mlp": 1.03761959, + "epoch": 0.6251014579888772, + "flos": 22963006581120.0, + "grad_norm": 1.639382305953213, + "language_loss": 0.75850725, + "learning_rate": 1.3017268556713732e-06, + "loss": 0.7799207, + "num_input_tokens_seen": 223970065, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 10397, + "time_per_iteration": 2.437908887863159 + }, + { + "auxiliary_loss_clip": 0.01104633, + "auxiliary_loss_mlp": 0.01031448, + "balance_loss_clip": 1.0192287, + "balance_loss_mlp": 1.03570378, + "epoch": 0.6251615812415452, + "flos": 28111232217600.0, + "grad_norm": 2.1037822697926667, + "language_loss": 0.74630761, + "learning_rate": 1.3013619153342154e-06, + "loss": 0.76766837, + "num_input_tokens_seen": 223990315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10398, + "time_per_iteration": 2.5268969535827637 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01031651, + "balance_loss_clip": 1.0180074, + "balance_loss_mlp": 1.03535593, + "epoch": 0.6252217044942131, + "flos": 26724469887360.0, + "grad_norm": 1.7918693005970583, + "language_loss": 0.74092543, + "learning_rate": 1.300997001489483e-06, + "loss": 0.7623167, + "num_input_tokens_seen": 224009960, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 10399, + "time_per_iteration": 2.4791572093963623 + }, + { + "auxiliary_loss_clip": 0.01107905, + "auxiliary_loss_mlp": 0.01032268, + "balance_loss_clip": 1.01990008, + "balance_loss_mlp": 1.03819537, + "epoch": 0.6252818277468811, + "flos": 20006768413440.0, + "grad_norm": 1.731383234573371, + "language_loss": 0.74527764, + "learning_rate": 1.3006321141510147e-06, + "loss": 0.76667941, + "num_input_tokens_seen": 224028870, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 10400, + "time_per_iteration": 2.473951816558838 + }, + { + "auxiliary_loss_clip": 0.01026565, + "auxiliary_loss_mlp": 0.01000492, + "balance_loss_clip": 0.99915105, + "balance_loss_mlp": 1.00554299, + "epoch": 0.625341950999549, + "flos": 59278285059840.0, + "grad_norm": 0.8444247043206139, + "language_loss": 0.5648914, + "learning_rate": 1.3002672533326465e-06, + "loss": 0.58516198, + "num_input_tokens_seen": 224094140, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.20996094, + "step": 10401, + "time_per_iteration": 3.129333019256592 + }, + { + "auxiliary_loss_clip": 0.01107201, + "auxiliary_loss_mlp": 0.01033139, + "balance_loss_clip": 1.0204252, + "balance_loss_mlp": 1.03666401, + "epoch": 0.625402074252217, + "flos": 20157090831360.0, + "grad_norm": 2.0092602513975977, + "language_loss": 0.82945538, + "learning_rate": 1.2999024190482146e-06, + "loss": 0.85085875, + "num_input_tokens_seen": 224113235, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10402, + "time_per_iteration": 2.460231304168701 + }, + { + "auxiliary_loss_clip": 0.01104333, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01907122, + "balance_loss_mlp": 1.03590369, + "epoch": 0.625462197504885, + "flos": 29132531619840.0, + "grad_norm": 1.9961648351421997, + "language_loss": 0.69392562, + "learning_rate": 1.2995376113115527e-06, + "loss": 0.71528035, + "num_input_tokens_seen": 224134530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10403, + "time_per_iteration": 2.512580156326294 + }, + { + "auxiliary_loss_clip": 0.01107476, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01649678, + "balance_loss_mlp": 1.03631687, + "epoch": 0.625522320757553, + "flos": 26104436294400.0, + "grad_norm": 1.605243006168547, + "language_loss": 0.71813661, + "learning_rate": 1.2991728301364954e-06, + "loss": 0.73950982, + "num_input_tokens_seen": 224154170, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.7109375, + "step": 10404, + "time_per_iteration": 2.5337743759155273 + }, + { + "auxiliary_loss_clip": 0.0110666, + "auxiliary_loss_mlp": 0.01036242, + "balance_loss_clip": 1.02410626, + "balance_loss_mlp": 1.03739667, + "epoch": 0.625582444010221, + "flos": 20630967984000.0, + "grad_norm": 2.1209903153707637, + "language_loss": 0.69724202, + "learning_rate": 1.2988080755368742e-06, + "loss": 0.71867102, + "num_input_tokens_seen": 224172730, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10405, + "time_per_iteration": 2.429565191268921 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01031388, + "balance_loss_clip": 1.01888895, + "balance_loss_mlp": 1.03722537, + "epoch": 0.6256425672628889, + "flos": 20521512264960.0, + "grad_norm": 1.5758155671533136, + "language_loss": 0.79004002, + "learning_rate": 1.2984433475265207e-06, + "loss": 0.81141788, + "num_input_tokens_seen": 224192620, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 10406, + "time_per_iteration": 2.468031167984009 + }, + { + "auxiliary_loss_clip": 0.01107697, + "auxiliary_loss_mlp": 0.01033032, + "balance_loss_clip": 1.0206759, + "balance_loss_mlp": 1.03848672, + "epoch": 0.6257026905155569, + "flos": 29529200488320.0, + "grad_norm": 2.3254582384945546, + "language_loss": 0.68920648, + "learning_rate": 1.2980786461192666e-06, + "loss": 0.71061373, + "num_input_tokens_seen": 224214660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 10407, + "time_per_iteration": 2.527899742126465 + }, + { + "auxiliary_loss_clip": 0.01103441, + "auxiliary_loss_mlp": 0.01027988, + "balance_loss_clip": 1.0164783, + "balance_loss_mlp": 1.03711939, + "epoch": 0.6257628137682248, + "flos": 24024885373440.0, + "grad_norm": 1.6489273629254082, + "language_loss": 0.85259062, + "learning_rate": 1.2977139713289398e-06, + "loss": 0.87390488, + "num_input_tokens_seen": 224234170, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10408, + "time_per_iteration": 2.5326271057128906 + }, + { + "auxiliary_loss_clip": 0.01103218, + "auxiliary_loss_mlp": 0.01032655, + "balance_loss_clip": 1.02121651, + "balance_loss_mlp": 1.03541374, + "epoch": 0.6258229370208929, + "flos": 20850956830080.0, + "grad_norm": 1.6409440677958231, + "language_loss": 0.79910547, + "learning_rate": 1.2973493231693699e-06, + "loss": 0.82046419, + "num_input_tokens_seen": 224253115, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10409, + "time_per_iteration": 2.442823886871338 + }, + { + "auxiliary_loss_clip": 0.01102769, + "auxiliary_loss_mlp": 0.01029631, + "balance_loss_clip": 1.01786542, + "balance_loss_mlp": 1.03510618, + "epoch": 0.6258830602735608, + "flos": 22231542021120.0, + "grad_norm": 2.1270589511309, + "language_loss": 0.69238424, + "learning_rate": 1.2969847016543845e-06, + "loss": 0.71370828, + "num_input_tokens_seen": 224271375, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10410, + "time_per_iteration": 2.5218586921691895 + }, + { + "auxiliary_loss_clip": 0.01102703, + "auxiliary_loss_mlp": 0.01027941, + "balance_loss_clip": 1.01665211, + "balance_loss_mlp": 1.03720927, + "epoch": 0.6259431835262288, + "flos": 25076887925760.0, + "grad_norm": 1.7979777871745755, + "language_loss": 0.67414671, + "learning_rate": 1.2966201067978086e-06, + "loss": 0.69545317, + "num_input_tokens_seen": 224290315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10411, + "time_per_iteration": 2.4738645553588867 + }, + { + "auxiliary_loss_clip": 0.0110494, + "auxiliary_loss_mlp": 0.01035981, + "balance_loss_clip": 1.02416134, + "balance_loss_mlp": 1.03532887, + "epoch": 0.6260033067788967, + "flos": 28252288926720.0, + "grad_norm": 1.6084905019023508, + "language_loss": 0.69372767, + "learning_rate": 1.2962555386134702e-06, + "loss": 0.71513689, + "num_input_tokens_seen": 224310545, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10412, + "time_per_iteration": 2.5545077323913574 + }, + { + "auxiliary_loss_clip": 0.01102021, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.02027464, + "balance_loss_mlp": 1.03490543, + "epoch": 0.6260634300315647, + "flos": 23367432787200.0, + "grad_norm": 1.551813331878434, + "language_loss": 0.69730282, + "learning_rate": 1.2958909971151908e-06, + "loss": 0.718638, + "num_input_tokens_seen": 224331115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10413, + "time_per_iteration": 2.4613993167877197 + }, + { + "auxiliary_loss_clip": 0.01107528, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.01831901, + "balance_loss_mlp": 1.03475976, + "epoch": 0.6261235532842326, + "flos": 18035308494720.0, + "grad_norm": 2.3187128472961347, + "language_loss": 0.80297446, + "learning_rate": 1.295526482316796e-06, + "loss": 0.82436854, + "num_input_tokens_seen": 224347525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.7265625, + "step": 10414, + "time_per_iteration": 2.4762308597564697 + }, + { + "auxiliary_loss_clip": 0.01106139, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.02244806, + "balance_loss_mlp": 1.03826272, + "epoch": 0.6261836765369007, + "flos": 22011265866240.0, + "grad_norm": 1.6885486405610761, + "language_loss": 0.74565107, + "learning_rate": 1.2951619942321083e-06, + "loss": 0.76705372, + "num_input_tokens_seen": 224367045, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10415, + "time_per_iteration": 2.469125270843506 + }, + { + "auxiliary_loss_clip": 0.01103919, + "auxiliary_loss_mlp": 0.01027621, + "balance_loss_clip": 1.01612878, + "balance_loss_mlp": 1.03637624, + "epoch": 0.6262437997895686, + "flos": 24936010784640.0, + "grad_norm": 1.6561914595998568, + "language_loss": 0.74751735, + "learning_rate": 1.2947975328749472e-06, + "loss": 0.7688328, + "num_input_tokens_seen": 224388860, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 10416, + "time_per_iteration": 2.5993549823760986 + }, + { + "auxiliary_loss_clip": 0.0110123, + "auxiliary_loss_mlp": 0.01029477, + "balance_loss_clip": 1.01813984, + "balance_loss_mlp": 1.03624392, + "epoch": 0.6263039230422366, + "flos": 31608428186880.0, + "grad_norm": 1.5562931530598996, + "language_loss": 0.84521848, + "learning_rate": 1.2944330982591352e-06, + "loss": 0.86652553, + "num_input_tokens_seen": 224409645, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 10417, + "time_per_iteration": 2.555704355239868 + }, + { + "auxiliary_loss_clip": 0.01105248, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.01628423, + "balance_loss_mlp": 1.03636765, + "epoch": 0.6263640462949046, + "flos": 17639465639040.0, + "grad_norm": 2.453683898924351, + "language_loss": 0.56929493, + "learning_rate": 1.2940686903984904e-06, + "loss": 0.59063208, + "num_input_tokens_seen": 224428530, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10418, + "time_per_iteration": 2.443615198135376 + }, + { + "auxiliary_loss_clip": 0.01108601, + "auxiliary_loss_mlp": 0.01031797, + "balance_loss_clip": 1.01904798, + "balance_loss_mlp": 1.03636181, + "epoch": 0.6264241695475725, + "flos": 19974951941760.0, + "grad_norm": 1.7897891411455675, + "language_loss": 0.84952247, + "learning_rate": 1.2937043093068316e-06, + "loss": 0.8709265, + "num_input_tokens_seen": 224447175, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.72265625, + "step": 10419, + "time_per_iteration": 2.432539224624634 + }, + { + "auxiliary_loss_clip": 0.01108205, + "auxiliary_loss_mlp": 0.01032396, + "balance_loss_clip": 1.02055252, + "balance_loss_mlp": 1.03868783, + "epoch": 0.6264842928002405, + "flos": 27344323912320.0, + "grad_norm": 1.768912237267882, + "language_loss": 0.64837831, + "learning_rate": 1.2933399549979762e-06, + "loss": 0.66978431, + "num_input_tokens_seen": 224469445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10420, + "time_per_iteration": 2.5192198753356934 + }, + { + "auxiliary_loss_clip": 0.01105751, + "auxiliary_loss_mlp": 0.01030399, + "balance_loss_clip": 1.01782894, + "balance_loss_mlp": 1.03548038, + "epoch": 0.6265444160529084, + "flos": 22997265177600.0, + "grad_norm": 1.9559815455742504, + "language_loss": 0.86093545, + "learning_rate": 1.292975627485741e-06, + "loss": 0.88229704, + "num_input_tokens_seen": 224486590, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10421, + "time_per_iteration": 2.454472303390503 + }, + { + "auxiliary_loss_clip": 0.01106789, + "auxiliary_loss_mlp": 0.01031721, + "balance_loss_clip": 1.02009797, + "balance_loss_mlp": 1.03760505, + "epoch": 0.6266045393055765, + "flos": 19938323047680.0, + "grad_norm": 2.5701422758472687, + "language_loss": 0.79219615, + "learning_rate": 1.2926113267839403e-06, + "loss": 0.81358123, + "num_input_tokens_seen": 224502795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 10422, + "time_per_iteration": 2.4565389156341553 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01024803, + "balance_loss_clip": 1.01235723, + "balance_loss_mlp": 1.03458548, + "epoch": 0.6266646625582444, + "flos": 24389091325440.0, + "grad_norm": 2.6493252664986784, + "language_loss": 0.74391955, + "learning_rate": 1.292247052906389e-06, + "loss": 0.76519012, + "num_input_tokens_seen": 224522300, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 10423, + "time_per_iteration": 2.4744317531585693 + }, + { + "auxiliary_loss_clip": 0.01102071, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.0154779, + "balance_loss_mlp": 1.03445518, + "epoch": 0.6267247858109124, + "flos": 14683802088960.0, + "grad_norm": 1.8573410403622042, + "language_loss": 0.77685475, + "learning_rate": 1.2918828058669004e-06, + "loss": 0.79814792, + "num_input_tokens_seen": 224538260, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10424, + "time_per_iteration": 2.459156036376953 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.01028249, + "balance_loss_clip": 1.01498699, + "balance_loss_mlp": 1.03587162, + "epoch": 0.6267849090635803, + "flos": 24929977299840.0, + "grad_norm": 1.722847581119462, + "language_loss": 0.6881507, + "learning_rate": 1.2915185856792868e-06, + "loss": 0.70946336, + "num_input_tokens_seen": 224559155, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.671875, + "step": 10425, + "time_per_iteration": 2.4837486743927 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.0183543, + "balance_loss_mlp": 1.0359807, + "epoch": 0.6268450323162483, + "flos": 25337851211520.0, + "grad_norm": 1.5803855338986545, + "language_loss": 0.7465167, + "learning_rate": 1.2911543923573598e-06, + "loss": 0.76780665, + "num_input_tokens_seen": 224578660, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 10426, + "time_per_iteration": 2.514317274093628 + }, + { + "auxiliary_loss_clip": 0.01105959, + "auxiliary_loss_mlp": 0.01032051, + "balance_loss_clip": 1.01989794, + "balance_loss_mlp": 1.03667617, + "epoch": 0.6269051555689162, + "flos": 26177299032960.0, + "grad_norm": 1.372305042134179, + "language_loss": 0.80499035, + "learning_rate": 1.290790225914929e-06, + "loss": 0.82637042, + "num_input_tokens_seen": 224599080, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10427, + "time_per_iteration": 3.906360149383545 + }, + { + "auxiliary_loss_clip": 0.01106724, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.01931798, + "balance_loss_mlp": 1.03726578, + "epoch": 0.6269652788215843, + "flos": 18256877539200.0, + "grad_norm": 1.7157059050483638, + "language_loss": 0.68742979, + "learning_rate": 1.2904260863658034e-06, + "loss": 0.70881307, + "num_input_tokens_seen": 224614225, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 10428, + "time_per_iteration": 2.4357380867004395 + }, + { + "auxiliary_loss_clip": 0.01105018, + "auxiliary_loss_mlp": 0.01032275, + "balance_loss_clip": 1.02134943, + "balance_loss_mlp": 1.03779614, + "epoch": 0.6270254020742522, + "flos": 11765413877760.0, + "grad_norm": 1.9089213874225204, + "language_loss": 0.71640742, + "learning_rate": 1.2900619737237928e-06, + "loss": 0.73778033, + "num_input_tokens_seen": 224632365, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 10429, + "time_per_iteration": 3.8758704662323 + }, + { + "auxiliary_loss_clip": 0.01108797, + "auxiliary_loss_mlp": 0.01033731, + "balance_loss_clip": 1.02144098, + "balance_loss_mlp": 1.03881693, + "epoch": 0.6270855253269202, + "flos": 23475631530240.0, + "grad_norm": 1.5765983769123613, + "language_loss": 0.79904956, + "learning_rate": 1.2896978880027023e-06, + "loss": 0.82047486, + "num_input_tokens_seen": 224651125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10430, + "time_per_iteration": 5.480989217758179 + }, + { + "auxiliary_loss_clip": 0.01027432, + "auxiliary_loss_mlp": 0.01010431, + "balance_loss_clip": 1.00904214, + "balance_loss_mlp": 1.00618088, + "epoch": 0.6271456485795882, + "flos": 70064520232320.0, + "grad_norm": 0.7689165216290166, + "language_loss": 0.59162331, + "learning_rate": 1.2893338292163393e-06, + "loss": 0.6120019, + "num_input_tokens_seen": 224716115, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.21289062, + "step": 10431, + "time_per_iteration": 3.1698784828186035 + }, + { + "auxiliary_loss_clip": 0.0102736, + "auxiliary_loss_mlp": 0.01007095, + "balance_loss_clip": 1.00575376, + "balance_loss_mlp": 1.00630832, + "epoch": 0.6272057718322561, + "flos": 65156718280320.0, + "grad_norm": 0.8815125854573025, + "language_loss": 0.63825411, + "learning_rate": 1.2889697973785095e-06, + "loss": 0.6585986, + "num_input_tokens_seen": 224782930, + "router_z_loss_clip": 0.01342773, + "router_z_loss_mlp": 0.2109375, + "step": 10432, + "time_per_iteration": 3.1316046714782715 + }, + { + "auxiliary_loss_clip": 0.01101622, + "auxiliary_loss_mlp": 0.0103135, + "balance_loss_clip": 1.02075207, + "balance_loss_mlp": 1.03523922, + "epoch": 0.6272658950849241, + "flos": 24389342720640.0, + "grad_norm": 1.6684665767860385, + "language_loss": 0.6480633, + "learning_rate": 1.2886057925030153e-06, + "loss": 0.66939294, + "num_input_tokens_seen": 224802010, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 10433, + "time_per_iteration": 2.530367851257324 + }, + { + "auxiliary_loss_clip": 0.01109549, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02095246, + "balance_loss_mlp": 1.03838599, + "epoch": 0.627326018337592, + "flos": 17966001202560.0, + "grad_norm": 1.999112171650009, + "language_loss": 0.61930764, + "learning_rate": 1.2882418146036612e-06, + "loss": 0.64073694, + "num_input_tokens_seen": 224818875, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 10434, + "time_per_iteration": 2.4613072872161865 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01025848, + "balance_loss_clip": 1.01420164, + "balance_loss_mlp": 1.03523064, + "epoch": 0.6273861415902601, + "flos": 20230097224320.0, + "grad_norm": 1.7052209762713233, + "language_loss": 0.84669697, + "learning_rate": 1.2878778636942484e-06, + "loss": 0.86799175, + "num_input_tokens_seen": 224837790, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 10435, + "time_per_iteration": 2.516956090927124 + }, + { + "auxiliary_loss_clip": 0.01027112, + "auxiliary_loss_mlp": 0.01006345, + "balance_loss_clip": 1.00503409, + "balance_loss_mlp": 1.00594997, + "epoch": 0.627446264842928, + "flos": 64953210798720.0, + "grad_norm": 0.7299742143913254, + "language_loss": 0.61572838, + "learning_rate": 1.2875139397885786e-06, + "loss": 0.63606298, + "num_input_tokens_seen": 224899685, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.2109375, + "step": 10436, + "time_per_iteration": 3.1023128032684326 + }, + { + "auxiliary_loss_clip": 0.01107216, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02201223, + "balance_loss_mlp": 1.03899169, + "epoch": 0.627506388095596, + "flos": 23584261236480.0, + "grad_norm": 1.5188433692104768, + "language_loss": 0.77361041, + "learning_rate": 1.2871500429004523e-06, + "loss": 0.79503125, + "num_input_tokens_seen": 224918650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 10437, + "time_per_iteration": 2.5252764225006104 + }, + { + "auxiliary_loss_clip": 0.01026138, + "auxiliary_loss_mlp": 0.01003989, + "balance_loss_clip": 1.0027318, + "balance_loss_mlp": 1.00493383, + "epoch": 0.6275665113482639, + "flos": 67583631674880.0, + "grad_norm": 0.7219307652334395, + "language_loss": 0.5436241, + "learning_rate": 1.2867861730436667e-06, + "loss": 0.56392533, + "num_input_tokens_seen": 224981575, + "router_z_loss_clip": 0.01257324, + "router_z_loss_mlp": 0.21289062, + "step": 10438, + "time_per_iteration": 3.043013572692871 + }, + { + "auxiliary_loss_clip": 0.01102529, + "auxiliary_loss_mlp": 0.01041098, + "balance_loss_clip": 1.02895069, + "balance_loss_mlp": 1.03441381, + "epoch": 0.6276266346009319, + "flos": 27636924101760.0, + "grad_norm": 2.0343389960160163, + "language_loss": 0.84072959, + "learning_rate": 1.2864223302320214e-06, + "loss": 0.86216581, + "num_input_tokens_seen": 225000820, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10439, + "time_per_iteration": 2.5371646881103516 + }, + { + "auxiliary_loss_clip": 0.0110542, + "auxiliary_loss_mlp": 0.01040087, + "balance_loss_clip": 1.0274682, + "balance_loss_mlp": 1.03541088, + "epoch": 0.6276867578535998, + "flos": 22746142218240.0, + "grad_norm": 2.0182472461440057, + "language_loss": 0.8041876, + "learning_rate": 1.2860585144793128e-06, + "loss": 0.8256427, + "num_input_tokens_seen": 225017585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10440, + "time_per_iteration": 2.4601192474365234 + }, + { + "auxiliary_loss_clip": 0.01099453, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.01833498, + "balance_loss_mlp": 1.03509974, + "epoch": 0.6277468811062679, + "flos": 24644200694400.0, + "grad_norm": 1.4716906489338055, + "language_loss": 0.74504089, + "learning_rate": 1.285694725799337e-06, + "loss": 0.76632255, + "num_input_tokens_seen": 225039085, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 10441, + "time_per_iteration": 2.5412392616271973 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0175643, + "balance_loss_mlp": 1.03450918, + "epoch": 0.6278070043589358, + "flos": 19678975873920.0, + "grad_norm": 1.707965956451768, + "language_loss": 0.72134054, + "learning_rate": 1.2853309642058884e-06, + "loss": 0.74265343, + "num_input_tokens_seen": 225058105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 10442, + "time_per_iteration": 2.537446975708008 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01029473, + "balance_loss_clip": 1.01785576, + "balance_loss_mlp": 1.03555417, + "epoch": 0.6278671276116038, + "flos": 22121834906880.0, + "grad_norm": 1.5665674956365474, + "language_loss": 0.71364504, + "learning_rate": 1.284967229712762e-06, + "loss": 0.73497498, + "num_input_tokens_seen": 225077605, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10443, + "time_per_iteration": 2.49980092048645 + }, + { + "auxiliary_loss_clip": 0.01103341, + "auxiliary_loss_mlp": 0.01025519, + "balance_loss_clip": 1.01374125, + "balance_loss_mlp": 1.03619695, + "epoch": 0.6279272508642717, + "flos": 23038562839680.0, + "grad_norm": 1.9169292083366938, + "language_loss": 0.72973317, + "learning_rate": 1.2846035223337492e-06, + "loss": 0.75102174, + "num_input_tokens_seen": 225097775, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 10444, + "time_per_iteration": 2.474400520324707 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.0155499, + "balance_loss_mlp": 1.03607392, + "epoch": 0.6279873741169397, + "flos": 19824090819840.0, + "grad_norm": 1.8659138317245392, + "language_loss": 0.72426593, + "learning_rate": 1.2842398420826423e-06, + "loss": 0.74556732, + "num_input_tokens_seen": 225115585, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 10445, + "time_per_iteration": 2.4486618041992188 + }, + { + "auxiliary_loss_clip": 0.01101674, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.03417051, + "epoch": 0.6280474973696077, + "flos": 23915393740800.0, + "grad_norm": 1.6334831062955149, + "language_loss": 0.69040692, + "learning_rate": 1.2838761889732331e-06, + "loss": 0.71172386, + "num_input_tokens_seen": 225135575, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10446, + "time_per_iteration": 2.4619648456573486 + }, + { + "auxiliary_loss_clip": 0.01108513, + "auxiliary_loss_mlp": 0.01030919, + "balance_loss_clip": 1.01822352, + "balance_loss_mlp": 1.03651023, + "epoch": 0.6281076206222757, + "flos": 17967976450560.0, + "grad_norm": 1.946229669067864, + "language_loss": 0.74025476, + "learning_rate": 1.2835125630193102e-06, + "loss": 0.76164913, + "num_input_tokens_seen": 225154230, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 10447, + "time_per_iteration": 2.449399948120117 + }, + { + "auxiliary_loss_clip": 0.01025063, + "auxiliary_loss_mlp": 0.00998572, + "balance_loss_clip": 0.99728459, + "balance_loss_mlp": 1.00378299, + "epoch": 0.6281677438749437, + "flos": 66778370622720.0, + "grad_norm": 0.6772794879542157, + "language_loss": 0.52363139, + "learning_rate": 1.2831489642346626e-06, + "loss": 0.54386771, + "num_input_tokens_seen": 225213650, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.21289062, + "step": 10448, + "time_per_iteration": 2.9426791667938232 + }, + { + "auxiliary_loss_clip": 0.01106244, + "auxiliary_loss_mlp": 0.01041172, + "balance_loss_clip": 1.02860117, + "balance_loss_mlp": 1.03656423, + "epoch": 0.6282278671276116, + "flos": 11656173640320.0, + "grad_norm": 2.1057349931562275, + "language_loss": 0.91307616, + "learning_rate": 1.282785392633079e-06, + "loss": 0.93455029, + "num_input_tokens_seen": 225230135, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10449, + "time_per_iteration": 2.4679763317108154 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.03486931, + "epoch": 0.6282879903802796, + "flos": 42741597847680.0, + "grad_norm": 1.5272379639764508, + "language_loss": 0.60454214, + "learning_rate": 1.2824218482283438e-06, + "loss": 0.62585294, + "num_input_tokens_seen": 225253520, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10450, + "time_per_iteration": 2.665226459503174 + }, + { + "auxiliary_loss_clip": 0.01101695, + "auxiliary_loss_mlp": 0.01030714, + "balance_loss_clip": 1.01926398, + "balance_loss_mlp": 1.03620005, + "epoch": 0.6283481136329475, + "flos": 20009210538240.0, + "grad_norm": 1.5565304478998412, + "language_loss": 0.76683152, + "learning_rate": 1.2820583310342452e-06, + "loss": 0.78815556, + "num_input_tokens_seen": 225272460, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 10451, + "time_per_iteration": 2.4581120014190674 + }, + { + "auxiliary_loss_clip": 0.01105178, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.0352962, + "epoch": 0.6284082368856155, + "flos": 21904431840000.0, + "grad_norm": 1.577387753245048, + "language_loss": 0.77243423, + "learning_rate": 1.281694841064566e-06, + "loss": 0.79380023, + "num_input_tokens_seen": 225291700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10452, + "time_per_iteration": 2.4569571018218994 + }, + { + "auxiliary_loss_clip": 0.01105275, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.01977849, + "balance_loss_mlp": 1.03737903, + "epoch": 0.6284683601382834, + "flos": 25484187219840.0, + "grad_norm": 1.9445051684642027, + "language_loss": 0.72382963, + "learning_rate": 1.2813313783330904e-06, + "loss": 0.74520093, + "num_input_tokens_seen": 225311470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 10453, + "time_per_iteration": 2.4979004859924316 + }, + { + "auxiliary_loss_clip": 0.01102123, + "auxiliary_loss_mlp": 0.0102867, + "balance_loss_clip": 1.01643896, + "balance_loss_mlp": 1.03324366, + "epoch": 0.6285284833909515, + "flos": 16538695395840.0, + "grad_norm": 1.6809278534400005, + "language_loss": 0.80429286, + "learning_rate": 1.2809679428536013e-06, + "loss": 0.82560074, + "num_input_tokens_seen": 225328385, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10454, + "time_per_iteration": 2.409714937210083 + }, + { + "auxiliary_loss_clip": 0.01102175, + "auxiliary_loss_mlp": 0.01029803, + "balance_loss_clip": 1.01829922, + "balance_loss_mlp": 1.03586721, + "epoch": 0.6285886066436194, + "flos": 22820692896000.0, + "grad_norm": 1.824800115863982, + "language_loss": 0.82303673, + "learning_rate": 1.2806045346398792e-06, + "loss": 0.84435654, + "num_input_tokens_seen": 225348415, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10455, + "time_per_iteration": 2.4712390899658203 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01029287, + "balance_loss_clip": 1.01754522, + "balance_loss_mlp": 1.03569484, + "epoch": 0.6286487298962874, + "flos": 24715734629760.0, + "grad_norm": 3.44693783643537, + "language_loss": 0.81578875, + "learning_rate": 1.280241153705706e-06, + "loss": 0.83711159, + "num_input_tokens_seen": 225367740, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10456, + "time_per_iteration": 2.48745059967041 + }, + { + "auxiliary_loss_clip": 0.01107634, + "auxiliary_loss_mlp": 0.01030014, + "balance_loss_clip": 1.01755691, + "balance_loss_mlp": 1.03793502, + "epoch": 0.6287088531489553, + "flos": 20740818752640.0, + "grad_norm": 1.5367705166393795, + "language_loss": 0.72127652, + "learning_rate": 1.27987780006486e-06, + "loss": 0.74265301, + "num_input_tokens_seen": 225388405, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10457, + "time_per_iteration": 2.451204776763916 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01816964, + "balance_loss_mlp": 1.03497529, + "epoch": 0.6287689764016233, + "flos": 23070630706560.0, + "grad_norm": 2.138119380312756, + "language_loss": 0.79647571, + "learning_rate": 1.2795144737311202e-06, + "loss": 0.81785357, + "num_input_tokens_seen": 225408360, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.72265625, + "step": 10458, + "time_per_iteration": 2.4522323608398438 + }, + { + "auxiliary_loss_clip": 0.01107535, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01934147, + "balance_loss_mlp": 1.03738856, + "epoch": 0.6288290996542913, + "flos": 32233669251840.0, + "grad_norm": 1.5072940054720605, + "language_loss": 0.60961497, + "learning_rate": 1.2791511747182635e-06, + "loss": 0.63100201, + "num_input_tokens_seen": 225431310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 10459, + "time_per_iteration": 2.5262553691864014 + }, + { + "auxiliary_loss_clip": 0.01105348, + "auxiliary_loss_mlp": 0.01029341, + "balance_loss_clip": 1.01796818, + "balance_loss_mlp": 1.03684652, + "epoch": 0.6288892229069593, + "flos": 24641327606400.0, + "grad_norm": 1.7541268062536184, + "language_loss": 0.7885046, + "learning_rate": 1.2787879030400666e-06, + "loss": 0.80985153, + "num_input_tokens_seen": 225450385, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 10460, + "time_per_iteration": 2.4601290225982666 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01025947, + "balance_loss_clip": 1.01438367, + "balance_loss_mlp": 1.03575253, + "epoch": 0.6289493461596273, + "flos": 17858341163520.0, + "grad_norm": 1.7189888907813877, + "language_loss": 0.73800498, + "learning_rate": 1.2784246587103047e-06, + "loss": 0.75929219, + "num_input_tokens_seen": 225467325, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 10461, + "time_per_iteration": 2.4365780353546143 + }, + { + "auxiliary_loss_clip": 0.01100652, + "auxiliary_loss_mlp": 0.01033338, + "balance_loss_clip": 1.02188754, + "balance_loss_mlp": 1.03492045, + "epoch": 0.6290094694122952, + "flos": 22345379199360.0, + "grad_norm": 1.7518850371883825, + "language_loss": 0.70340359, + "learning_rate": 1.2780614417427523e-06, + "loss": 0.72474349, + "num_input_tokens_seen": 225487370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 10462, + "time_per_iteration": 2.4497246742248535 + }, + { + "auxiliary_loss_clip": 0.01098069, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01851106, + "balance_loss_mlp": 1.03555751, + "epoch": 0.6290695926649632, + "flos": 28402431776640.0, + "grad_norm": 1.8426896444846728, + "language_loss": 0.71998221, + "learning_rate": 1.2776982521511821e-06, + "loss": 0.74125123, + "num_input_tokens_seen": 225506915, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 10463, + "time_per_iteration": 2.519118070602417 + }, + { + "auxiliary_loss_clip": 0.01104354, + "auxiliary_loss_mlp": 0.01035813, + "balance_loss_clip": 1.02407098, + "balance_loss_mlp": 1.03894711, + "epoch": 0.6291297159176311, + "flos": 21505464501120.0, + "grad_norm": 2.0251276075815507, + "language_loss": 0.72917801, + "learning_rate": 1.2773350899493665e-06, + "loss": 0.75057971, + "num_input_tokens_seen": 225525670, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10464, + "time_per_iteration": 2.4394619464874268 + }, + { + "auxiliary_loss_clip": 0.01101197, + "auxiliary_loss_mlp": 0.01028137, + "balance_loss_clip": 1.01696074, + "balance_loss_mlp": 1.03590441, + "epoch": 0.6291898391702991, + "flos": 12203308581120.0, + "grad_norm": 1.8688314099913752, + "language_loss": 0.69353777, + "learning_rate": 1.2769719551510768e-06, + "loss": 0.71483117, + "num_input_tokens_seen": 225542235, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 10465, + "time_per_iteration": 2.420706033706665 + }, + { + "auxiliary_loss_clip": 0.01025681, + "auxiliary_loss_mlp": 0.01003212, + "balance_loss_clip": 1.00197816, + "balance_loss_mlp": 1.00449264, + "epoch": 0.629249962422967, + "flos": 69299479434240.0, + "grad_norm": 0.6783887533402703, + "language_loss": 0.59743875, + "learning_rate": 1.2766088477700832e-06, + "loss": 0.6177277, + "num_input_tokens_seen": 225607185, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.21191406, + "step": 10466, + "time_per_iteration": 3.1529486179351807 + }, + { + "auxiliary_loss_clip": 0.01098875, + "auxiliary_loss_mlp": 0.01028989, + "balance_loss_clip": 1.01821828, + "balance_loss_mlp": 1.03199136, + "epoch": 0.6293100856756351, + "flos": 40077888042240.0, + "grad_norm": 1.895578491152679, + "language_loss": 0.64383173, + "learning_rate": 1.276245767820154e-06, + "loss": 0.66511035, + "num_input_tokens_seen": 225628785, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 10467, + "time_per_iteration": 2.596909761428833 + }, + { + "auxiliary_loss_clip": 0.01025676, + "auxiliary_loss_mlp": 0.00999758, + "balance_loss_clip": 0.9984706, + "balance_loss_mlp": 1.00462031, + "epoch": 0.629370208928303, + "flos": 67501108177920.0, + "grad_norm": 0.7946860251086647, + "language_loss": 0.569076, + "learning_rate": 1.2758827153150586e-06, + "loss": 0.58933038, + "num_input_tokens_seen": 225678980, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.2109375, + "step": 10468, + "time_per_iteration": 4.298036336898804 + }, + { + "auxiliary_loss_clip": 0.01025761, + "auxiliary_loss_mlp": 0.01000379, + "balance_loss_clip": 0.99905533, + "balance_loss_mlp": 1.00460362, + "epoch": 0.629430332180971, + "flos": 60660450449280.0, + "grad_norm": 0.7346247861969195, + "language_loss": 0.580616, + "learning_rate": 1.2755196902685626e-06, + "loss": 0.6008774, + "num_input_tokens_seen": 225740295, + "router_z_loss_clip": 0.01324463, + "router_z_loss_mlp": 0.2109375, + "step": 10469, + "time_per_iteration": 3.013350009918213 + }, + { + "auxiliary_loss_clip": 0.01026242, + "auxiliary_loss_mlp": 0.0100094, + "balance_loss_clip": 0.99966449, + "balance_loss_mlp": 1.00510228, + "epoch": 0.6294904554336389, + "flos": 66869764778880.0, + "grad_norm": 0.6786572594163077, + "language_loss": 0.5214479, + "learning_rate": 1.2751566926944329e-06, + "loss": 0.54171979, + "num_input_tokens_seen": 225805615, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.21191406, + "step": 10470, + "time_per_iteration": 3.1025776863098145 + }, + { + "auxiliary_loss_clip": 0.01101792, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01853275, + "balance_loss_mlp": 1.03531003, + "epoch": 0.6295505786863069, + "flos": 42522794150400.0, + "grad_norm": 1.7821374773378207, + "language_loss": 0.7444669, + "learning_rate": 1.2747937226064342e-06, + "loss": 0.76578748, + "num_input_tokens_seen": 225826585, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10471, + "time_per_iteration": 5.750757455825806 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.0102689, + "balance_loss_clip": 1.0161432, + "balance_loss_mlp": 1.03594935, + "epoch": 0.629610701938975, + "flos": 17384140788480.0, + "grad_norm": 1.928248423372208, + "language_loss": 0.62892604, + "learning_rate": 1.2744307800183297e-06, + "loss": 0.65023524, + "num_input_tokens_seen": 225844095, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 10472, + "time_per_iteration": 2.4507625102996826 + }, + { + "auxiliary_loss_clip": 0.01108224, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.02030277, + "balance_loss_mlp": 1.03887987, + "epoch": 0.6296708251916429, + "flos": 24242934885120.0, + "grad_norm": 1.6696696732656569, + "language_loss": 0.69374871, + "learning_rate": 1.2740678649438828e-06, + "loss": 0.71515167, + "num_input_tokens_seen": 225864310, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 10473, + "time_per_iteration": 3.954071283340454 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.0102561, + "balance_loss_clip": 1.01492906, + "balance_loss_mlp": 1.03493738, + "epoch": 0.6297309484443109, + "flos": 19278536077440.0, + "grad_norm": 1.5555016558834316, + "language_loss": 0.74785316, + "learning_rate": 1.2737049773968554e-06, + "loss": 0.76912427, + "num_input_tokens_seen": 225883830, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10474, + "time_per_iteration": 2.4985709190368652 + }, + { + "auxiliary_loss_clip": 0.0110251, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01681423, + "balance_loss_mlp": 1.03494573, + "epoch": 0.6297910716969788, + "flos": 30662685043200.0, + "grad_norm": 1.565073448719141, + "language_loss": 0.66372955, + "learning_rate": 1.2733421173910081e-06, + "loss": 0.68503714, + "num_input_tokens_seen": 225905755, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 10475, + "time_per_iteration": 2.511357307434082 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.03441048, + "epoch": 0.6298511949496468, + "flos": 14423018371200.0, + "grad_norm": 2.080975026928719, + "language_loss": 0.9029789, + "learning_rate": 1.272979284940101e-06, + "loss": 0.92426246, + "num_input_tokens_seen": 225922155, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 10476, + "time_per_iteration": 2.4218876361846924 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01035009, + "balance_loss_clip": 1.02379751, + "balance_loss_mlp": 1.03476787, + "epoch": 0.6299113182023147, + "flos": 23514163845120.0, + "grad_norm": 1.6697359788083987, + "language_loss": 0.75050914, + "learning_rate": 1.2726164800578913e-06, + "loss": 0.771873, + "num_input_tokens_seen": 225941060, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10477, + "time_per_iteration": 2.455409049987793 + }, + { + "auxiliary_loss_clip": 0.01101367, + "auxiliary_loss_mlp": 0.01026543, + "balance_loss_clip": 1.01468766, + "balance_loss_mlp": 1.0337708, + "epoch": 0.6299714414549827, + "flos": 22674500542080.0, + "grad_norm": 1.9554844868820769, + "language_loss": 0.70427382, + "learning_rate": 1.272253702758138e-06, + "loss": 0.72555292, + "num_input_tokens_seen": 225960870, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10478, + "time_per_iteration": 2.448185443878174 + }, + { + "auxiliary_loss_clip": 0.01107518, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01893711, + "balance_loss_mlp": 1.03713453, + "epoch": 0.6300315647076506, + "flos": 14501735026560.0, + "grad_norm": 2.8380864968685287, + "language_loss": 0.67054832, + "learning_rate": 1.2718909530545974e-06, + "loss": 0.69193918, + "num_input_tokens_seen": 225977895, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10479, + "time_per_iteration": 2.4200356006622314 + }, + { + "auxiliary_loss_clip": 0.01103494, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.0211798, + "balance_loss_mlp": 1.03659678, + "epoch": 0.6300916879603187, + "flos": 21871681614720.0, + "grad_norm": 3.6551699512461067, + "language_loss": 0.73471272, + "learning_rate": 1.2715282309610245e-06, + "loss": 0.75608122, + "num_input_tokens_seen": 225997835, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 10480, + "time_per_iteration": 2.4555039405822754 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01031551, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.03487301, + "epoch": 0.6301518112129866, + "flos": 21834047139840.0, + "grad_norm": 2.336956908643113, + "language_loss": 0.78874803, + "learning_rate": 1.2711655364911744e-06, + "loss": 0.81010389, + "num_input_tokens_seen": 226017620, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 10481, + "time_per_iteration": 2.4346415996551514 + }, + { + "auxiliary_loss_clip": 0.01026096, + "auxiliary_loss_mlp": 0.01005078, + "balance_loss_clip": 1.00391531, + "balance_loss_mlp": 1.0049262, + "epoch": 0.6302119344656546, + "flos": 44334237957120.0, + "grad_norm": 0.9177955201810194, + "language_loss": 0.61818945, + "learning_rate": 1.2708028696588e-06, + "loss": 0.63850117, + "num_input_tokens_seen": 226068755, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.21191406, + "step": 10482, + "time_per_iteration": 2.812809705734253 + }, + { + "auxiliary_loss_clip": 0.01108769, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.01766491, + "balance_loss_mlp": 1.03617549, + "epoch": 0.6302720577183225, + "flos": 11217919800960.0, + "grad_norm": 2.913772314034849, + "language_loss": 0.83037972, + "learning_rate": 1.2704402304776541e-06, + "loss": 0.85177374, + "num_input_tokens_seen": 226084395, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7265625, + "step": 10483, + "time_per_iteration": 2.401224374771118 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01031323, + "balance_loss_clip": 1.02013469, + "balance_loss_mlp": 1.03428078, + "epoch": 0.6303321809709905, + "flos": 27964932122880.0, + "grad_norm": 1.5146236246766749, + "language_loss": 0.72939026, + "learning_rate": 1.270077618961487e-06, + "loss": 0.75068009, + "num_input_tokens_seen": 226105890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 10484, + "time_per_iteration": 2.5125913619995117 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01026821, + "balance_loss_clip": 1.01511419, + "balance_loss_mlp": 1.03390932, + "epoch": 0.6303923042236586, + "flos": 28220759763840.0, + "grad_norm": 1.8710303376286184, + "language_loss": 0.74698818, + "learning_rate": 1.2697150351240506e-06, + "loss": 0.7682761, + "num_input_tokens_seen": 226126760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10485, + "time_per_iteration": 2.4874563217163086 + }, + { + "auxiliary_loss_clip": 0.01107856, + "auxiliary_loss_mlp": 0.01031188, + "balance_loss_clip": 1.01911783, + "balance_loss_mlp": 1.03676295, + "epoch": 0.6304524274763265, + "flos": 27631034271360.0, + "grad_norm": 1.9819800910053105, + "language_loss": 0.81547624, + "learning_rate": 1.269352478979093e-06, + "loss": 0.83686674, + "num_input_tokens_seen": 226147315, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.7109375, + "step": 10486, + "time_per_iteration": 2.4926888942718506 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.01036434, + "balance_loss_clip": 1.0249896, + "balance_loss_mlp": 1.03641152, + "epoch": 0.6305125507289945, + "flos": 17311313963520.0, + "grad_norm": 2.1821850164901675, + "language_loss": 0.63638449, + "learning_rate": 1.2689899505403628e-06, + "loss": 0.65777874, + "num_input_tokens_seen": 226165935, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10487, + "time_per_iteration": 2.408770799636841 + }, + { + "auxiliary_loss_clip": 0.01103897, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02434158, + "balance_loss_mlp": 1.03714716, + "epoch": 0.6305726739816624, + "flos": 25808280658560.0, + "grad_norm": 1.4517629521514586, + "language_loss": 0.67256761, + "learning_rate": 1.2686274498216065e-06, + "loss": 0.69396502, + "num_input_tokens_seen": 226186890, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 10488, + "time_per_iteration": 2.484377861022949 + }, + { + "auxiliary_loss_clip": 0.01105074, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01726294, + "balance_loss_mlp": 1.03574753, + "epoch": 0.6306327972343304, + "flos": 21797454159360.0, + "grad_norm": 1.9513019958263491, + "language_loss": 0.67263639, + "learning_rate": 1.2682649768365706e-06, + "loss": 0.69397372, + "num_input_tokens_seen": 226206710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.69140625, + "step": 10489, + "time_per_iteration": 2.4636588096618652 + }, + { + "auxiliary_loss_clip": 0.01110064, + "auxiliary_loss_mlp": 0.01029894, + "balance_loss_clip": 1.0174365, + "balance_loss_mlp": 1.03723645, + "epoch": 0.6306929204869983, + "flos": 20777375819520.0, + "grad_norm": 1.966397981441809, + "language_loss": 0.69455999, + "learning_rate": 1.2679025315990007e-06, + "loss": 0.71595961, + "num_input_tokens_seen": 226225565, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 10490, + "time_per_iteration": 2.4483461380004883 + }, + { + "auxiliary_loss_clip": 0.0110581, + "auxiliary_loss_mlp": 0.01033287, + "balance_loss_clip": 1.02133048, + "balance_loss_mlp": 1.03623903, + "epoch": 0.6307530437396663, + "flos": 23654214973440.0, + "grad_norm": 2.505536440046342, + "language_loss": 0.78477776, + "learning_rate": 1.2675401141226393e-06, + "loss": 0.80616874, + "num_input_tokens_seen": 226243680, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10491, + "time_per_iteration": 2.4928994178771973 + }, + { + "auxiliary_loss_clip": 0.01104065, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02038789, + "balance_loss_mlp": 1.03679323, + "epoch": 0.6308131669923343, + "flos": 24719002767360.0, + "grad_norm": 1.9616523750971206, + "language_loss": 0.55806887, + "learning_rate": 1.2671777244212308e-06, + "loss": 0.57942659, + "num_input_tokens_seen": 226264345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10492, + "time_per_iteration": 2.5040977001190186 + }, + { + "auxiliary_loss_clip": 0.01105591, + "auxiliary_loss_mlp": 0.01035776, + "balance_loss_clip": 1.023283, + "balance_loss_mlp": 1.03620148, + "epoch": 0.6308732902450023, + "flos": 22565403959040.0, + "grad_norm": 2.2691030779407693, + "language_loss": 0.63968873, + "learning_rate": 1.2668153625085168e-06, + "loss": 0.66110241, + "num_input_tokens_seen": 226283165, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10493, + "time_per_iteration": 2.501648187637329 + }, + { + "auxiliary_loss_clip": 0.01103602, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.03536439, + "epoch": 0.6309334134976702, + "flos": 24644200694400.0, + "grad_norm": 1.6404154470274028, + "language_loss": 0.82711017, + "learning_rate": 1.2664530283982367e-06, + "loss": 0.84844351, + "num_input_tokens_seen": 226304080, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 10494, + "time_per_iteration": 2.488478183746338 + }, + { + "auxiliary_loss_clip": 0.01106273, + "auxiliary_loss_mlp": 0.010314, + "balance_loss_clip": 1.01908565, + "balance_loss_mlp": 1.03702521, + "epoch": 0.6309935367503382, + "flos": 41427949651200.0, + "grad_norm": 1.6064300635789628, + "language_loss": 0.792678, + "learning_rate": 1.2660907221041317e-06, + "loss": 0.81405473, + "num_input_tokens_seen": 226325925, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 10495, + "time_per_iteration": 2.607936143875122 + }, + { + "auxiliary_loss_clip": 0.01104478, + "auxiliary_loss_mlp": 0.01032474, + "balance_loss_clip": 1.01999319, + "balance_loss_mlp": 1.0356319, + "epoch": 0.6310536600030061, + "flos": 15118931445120.0, + "grad_norm": 1.9868473037750025, + "language_loss": 0.69977289, + "learning_rate": 1.2657284436399403e-06, + "loss": 0.72114241, + "num_input_tokens_seen": 226344190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10496, + "time_per_iteration": 2.4172658920288086 + }, + { + "auxiliary_loss_clip": 0.01106703, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.03729558, + "epoch": 0.6311137832556741, + "flos": 15231619388160.0, + "grad_norm": 2.5454831155818307, + "language_loss": 0.80091369, + "learning_rate": 1.2653661930193997e-06, + "loss": 0.82230574, + "num_input_tokens_seen": 226361520, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10497, + "time_per_iteration": 2.417558193206787 + }, + { + "auxiliary_loss_clip": 0.01101019, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01958823, + "balance_loss_mlp": 1.03501368, + "epoch": 0.6311739065083422, + "flos": 22018664067840.0, + "grad_norm": 1.8690299301257927, + "language_loss": 0.74428982, + "learning_rate": 1.265003970256247e-06, + "loss": 0.76560622, + "num_input_tokens_seen": 226381920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 10498, + "time_per_iteration": 2.452404737472534 + }, + { + "auxiliary_loss_clip": 0.01104382, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.01755679, + "balance_loss_mlp": 1.03578484, + "epoch": 0.6312340297610101, + "flos": 22710770300160.0, + "grad_norm": 1.8689991492998164, + "language_loss": 0.69558024, + "learning_rate": 1.264641775364217e-06, + "loss": 0.71691775, + "num_input_tokens_seen": 226400035, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10499, + "time_per_iteration": 2.4273722171783447 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01036719, + "balance_loss_clip": 1.02513218, + "balance_loss_mlp": 1.03703976, + "epoch": 0.6312941530136781, + "flos": 24280102483200.0, + "grad_norm": 2.6400614385639294, + "language_loss": 0.70014846, + "learning_rate": 1.2642796083570448e-06, + "loss": 0.72153533, + "num_input_tokens_seen": 226418280, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 10500, + "time_per_iteration": 2.4538466930389404 + }, + { + "auxiliary_loss_clip": 0.01106013, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02079344, + "balance_loss_mlp": 1.03783047, + "epoch": 0.631354276266346, + "flos": 21725956137600.0, + "grad_norm": 3.0415450485464937, + "language_loss": 0.74062467, + "learning_rate": 1.2639174692484634e-06, + "loss": 0.76200593, + "num_input_tokens_seen": 226436650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10501, + "time_per_iteration": 2.4303436279296875 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01901984, + "balance_loss_mlp": 1.03665447, + "epoch": 0.631414399519014, + "flos": 24025100855040.0, + "grad_norm": 1.6794546939174708, + "language_loss": 0.75353241, + "learning_rate": 1.2635553580522053e-06, + "loss": 0.77488828, + "num_input_tokens_seen": 226456275, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10502, + "time_per_iteration": 2.4563441276550293 + }, + { + "auxiliary_loss_clip": 0.01109898, + "auxiliary_loss_mlp": 0.01044466, + "balance_loss_clip": 1.03212154, + "balance_loss_mlp": 1.03856277, + "epoch": 0.6314745227716819, + "flos": 24315797623680.0, + "grad_norm": 2.067886001099209, + "language_loss": 0.85457253, + "learning_rate": 1.2631932747820022e-06, + "loss": 0.87611616, + "num_input_tokens_seen": 226473610, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.7109375, + "step": 10503, + "time_per_iteration": 2.454007148742676 + }, + { + "auxiliary_loss_clip": 0.01104787, + "auxiliary_loss_mlp": 0.0103047, + "balance_loss_clip": 1.01844788, + "balance_loss_mlp": 1.0356003, + "epoch": 0.6315346460243499, + "flos": 23366391292800.0, + "grad_norm": 1.7756005126280807, + "language_loss": 0.86549926, + "learning_rate": 1.2628312194515838e-06, + "loss": 0.88685179, + "num_input_tokens_seen": 226493665, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 10504, + "time_per_iteration": 2.452439546585083 + }, + { + "auxiliary_loss_clip": 0.01110828, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.02137482, + "balance_loss_mlp": 1.03827763, + "epoch": 0.6315947692770179, + "flos": 20260333497600.0, + "grad_norm": 1.5631411561519288, + "language_loss": 0.76411223, + "learning_rate": 1.2624691920746793e-06, + "loss": 0.78556228, + "num_input_tokens_seen": 226511625, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 10505, + "time_per_iteration": 2.4167821407318115 + }, + { + "auxiliary_loss_clip": 0.01107106, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.01840425, + "balance_loss_mlp": 1.03718579, + "epoch": 0.6316548925296859, + "flos": 25265850399360.0, + "grad_norm": 1.80507675782724, + "language_loss": 0.81566548, + "learning_rate": 1.2621071926650166e-06, + "loss": 0.83704925, + "num_input_tokens_seen": 226530085, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10506, + "time_per_iteration": 2.475015163421631 + }, + { + "auxiliary_loss_clip": 0.01108071, + "auxiliary_loss_mlp": 0.01029424, + "balance_loss_clip": 1.0174973, + "balance_loss_mlp": 1.03848529, + "epoch": 0.6317150157823538, + "flos": 22930579578240.0, + "grad_norm": 1.7792905066974667, + "language_loss": 0.74235427, + "learning_rate": 1.2617452212363238e-06, + "loss": 0.76372921, + "num_input_tokens_seen": 226548115, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 10507, + "time_per_iteration": 2.4495646953582764 + }, + { + "auxiliary_loss_clip": 0.01109877, + "auxiliary_loss_mlp": 0.01034985, + "balance_loss_clip": 1.02270663, + "balance_loss_mlp": 1.03861022, + "epoch": 0.6317751390350218, + "flos": 22527051212160.0, + "grad_norm": 1.7094804545962832, + "language_loss": 0.6781255, + "learning_rate": 1.2613832778023258e-06, + "loss": 0.69957411, + "num_input_tokens_seen": 226567955, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10508, + "time_per_iteration": 2.4817588329315186 + }, + { + "auxiliary_loss_clip": 0.01105487, + "auxiliary_loss_mlp": 0.01028244, + "balance_loss_clip": 1.01696706, + "balance_loss_mlp": 1.03691339, + "epoch": 0.6318352622876897, + "flos": 23294749616640.0, + "grad_norm": 1.6822434485138316, + "language_loss": 0.70602268, + "learning_rate": 1.2610213623767478e-06, + "loss": 0.72736001, + "num_input_tokens_seen": 226588205, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.68359375, + "step": 10509, + "time_per_iteration": 2.511807680130005 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01027474, + "balance_loss_clip": 1.01614881, + "balance_loss_mlp": 1.03634882, + "epoch": 0.6318953855403577, + "flos": 20704082117760.0, + "grad_norm": 1.6779333049559604, + "language_loss": 0.79419941, + "learning_rate": 1.2606594749733143e-06, + "loss": 0.81551743, + "num_input_tokens_seen": 226606965, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10510, + "time_per_iteration": 3.890570640563965 + }, + { + "auxiliary_loss_clip": 0.01107002, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.01917839, + "balance_loss_mlp": 1.03689122, + "epoch": 0.6319555087930258, + "flos": 22820046451200.0, + "grad_norm": 1.4507580648571856, + "language_loss": 0.70762742, + "learning_rate": 1.2602976156057469e-06, + "loss": 0.72901082, + "num_input_tokens_seen": 226627845, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.703125, + "step": 10511, + "time_per_iteration": 2.502631902694702 + }, + { + "auxiliary_loss_clip": 0.01102983, + "auxiliary_loss_mlp": 0.01032047, + "balance_loss_clip": 1.02093077, + "balance_loss_mlp": 1.03624094, + "epoch": 0.6320156320456937, + "flos": 19970929618560.0, + "grad_norm": 1.7129276808255165, + "language_loss": 0.80193913, + "learning_rate": 1.2599357842877684e-06, + "loss": 0.82328945, + "num_input_tokens_seen": 226645855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 10512, + "time_per_iteration": 2.4500255584716797 + }, + { + "auxiliary_loss_clip": 0.01108015, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01859283, + "balance_loss_mlp": 1.03887498, + "epoch": 0.6320757552983617, + "flos": 27013406889600.0, + "grad_norm": 1.9936938479118853, + "language_loss": 0.70610952, + "learning_rate": 1.2595739810330994e-06, + "loss": 0.72750223, + "num_input_tokens_seen": 226665375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 10513, + "time_per_iteration": 5.2415876388549805 + }, + { + "auxiliary_loss_clip": 0.01110907, + "auxiliary_loss_mlp": 0.01029996, + "balance_loss_clip": 1.01782441, + "balance_loss_mlp": 1.03923917, + "epoch": 0.6321358785510296, + "flos": 23695943598720.0, + "grad_norm": 1.9330841856618928, + "language_loss": 0.66179729, + "learning_rate": 1.259212205855459e-06, + "loss": 0.68320632, + "num_input_tokens_seen": 226685270, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.71875, + "step": 10514, + "time_per_iteration": 3.9086010456085205 + }, + { + "auxiliary_loss_clip": 0.01102729, + "auxiliary_loss_mlp": 0.01030448, + "balance_loss_clip": 1.01901603, + "balance_loss_mlp": 1.0355525, + "epoch": 0.6321960018036976, + "flos": 25995231970560.0, + "grad_norm": 1.657544375063904, + "language_loss": 0.74582148, + "learning_rate": 1.2588504587685663e-06, + "loss": 0.76715326, + "num_input_tokens_seen": 226705325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10515, + "time_per_iteration": 2.4754388332366943 + }, + { + "auxiliary_loss_clip": 0.01103002, + "auxiliary_loss_mlp": 0.01026215, + "balance_loss_clip": 1.01485467, + "balance_loss_mlp": 1.03710318, + "epoch": 0.6322561250563655, + "flos": 22821016118400.0, + "grad_norm": 1.8087331085143223, + "language_loss": 0.89853811, + "learning_rate": 1.2584887397861379e-06, + "loss": 0.91983026, + "num_input_tokens_seen": 226723815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 10516, + "time_per_iteration": 2.431255578994751 + }, + { + "auxiliary_loss_clip": 0.01113899, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.01684928, + "balance_loss_mlp": 1.04077578, + "epoch": 0.6323162483090335, + "flos": 18988413926400.0, + "grad_norm": 1.8110008690321133, + "language_loss": 0.81904936, + "learning_rate": 1.2581270489218911e-06, + "loss": 0.84049344, + "num_input_tokens_seen": 226741550, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.73046875, + "step": 10517, + "time_per_iteration": 2.418457508087158 + }, + { + "auxiliary_loss_clip": 0.01105413, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02295017, + "balance_loss_mlp": 1.03746212, + "epoch": 0.6323763715617015, + "flos": 19865173000320.0, + "grad_norm": 1.9810559885321721, + "language_loss": 0.77525067, + "learning_rate": 1.257765386189541e-06, + "loss": 0.7966513, + "num_input_tokens_seen": 226761115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10518, + "time_per_iteration": 2.480358839035034 + }, + { + "auxiliary_loss_clip": 0.01102761, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.0210377, + "balance_loss_mlp": 1.03653479, + "epoch": 0.6324364948143695, + "flos": 22782699285120.0, + "grad_norm": 1.4836154875686243, + "language_loss": 0.85232532, + "learning_rate": 1.2574037516028018e-06, + "loss": 0.87367767, + "num_input_tokens_seen": 226782225, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10519, + "time_per_iteration": 2.539891242980957 + }, + { + "auxiliary_loss_clip": 0.01101558, + "auxiliary_loss_mlp": 0.01032383, + "balance_loss_clip": 1.02114785, + "balance_loss_mlp": 1.03623748, + "epoch": 0.6324966180670374, + "flos": 22235923480320.0, + "grad_norm": 1.6381683069265482, + "language_loss": 0.71834314, + "learning_rate": 1.2570421451753867e-06, + "loss": 0.73968256, + "num_input_tokens_seen": 226802375, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10520, + "time_per_iteration": 2.4911139011383057 + }, + { + "auxiliary_loss_clip": 0.0110337, + "auxiliary_loss_mlp": 0.01030762, + "balance_loss_clip": 1.01956213, + "balance_loss_mlp": 1.03599596, + "epoch": 0.6325567413197054, + "flos": 21689183589120.0, + "grad_norm": 1.7138425612253112, + "language_loss": 0.7110256, + "learning_rate": 1.2566805669210081e-06, + "loss": 0.73236692, + "num_input_tokens_seen": 226822165, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10521, + "time_per_iteration": 2.42466402053833 + }, + { + "auxiliary_loss_clip": 0.01107506, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.01895845, + "balance_loss_mlp": 1.03792214, + "epoch": 0.6326168645723733, + "flos": 19937137898880.0, + "grad_norm": 1.6701833516110784, + "language_loss": 0.71829087, + "learning_rate": 1.256319016853377e-06, + "loss": 0.7396822, + "num_input_tokens_seen": 226841645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10522, + "time_per_iteration": 2.456470012664795 + }, + { + "auxiliary_loss_clip": 0.01105444, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01849365, + "balance_loss_mlp": 1.03691065, + "epoch": 0.6326769878250413, + "flos": 20230348619520.0, + "grad_norm": 1.752428604035476, + "language_loss": 0.81730425, + "learning_rate": 1.2559574949862023e-06, + "loss": 0.83866215, + "num_input_tokens_seen": 226860355, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 10523, + "time_per_iteration": 2.4390153884887695 + }, + { + "auxiliary_loss_clip": 0.01104755, + "auxiliary_loss_mlp": 0.01025919, + "balance_loss_clip": 1.01428986, + "balance_loss_mlp": 1.03734088, + "epoch": 0.6327371110777094, + "flos": 20775759707520.0, + "grad_norm": 2.695654876532073, + "language_loss": 0.73930323, + "learning_rate": 1.255596001333195e-06, + "loss": 0.76060998, + "num_input_tokens_seen": 226878390, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10524, + "time_per_iteration": 2.4376304149627686 + }, + { + "auxiliary_loss_clip": 0.01111218, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.02231896, + "balance_loss_mlp": 1.03718793, + "epoch": 0.6327972343303773, + "flos": 30336544529280.0, + "grad_norm": 4.405789883496385, + "language_loss": 0.84463608, + "learning_rate": 1.2552345359080615e-06, + "loss": 0.86610419, + "num_input_tokens_seen": 226898420, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.73828125, + "step": 10525, + "time_per_iteration": 2.4973292350769043 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.01023105, + "balance_loss_clip": 1.01178622, + "balance_loss_mlp": 1.03544807, + "epoch": 0.6328573575830453, + "flos": 17092258871040.0, + "grad_norm": 3.1585625796827212, + "language_loss": 0.66817802, + "learning_rate": 1.2548730987245093e-06, + "loss": 0.68943405, + "num_input_tokens_seen": 226916305, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10526, + "time_per_iteration": 2.431757688522339 + }, + { + "auxiliary_loss_clip": 0.01111651, + "auxiliary_loss_mlp": 0.01036373, + "balance_loss_clip": 1.02308118, + "balance_loss_mlp": 1.03971434, + "epoch": 0.6329174808357132, + "flos": 25047154442880.0, + "grad_norm": 2.135799005467542, + "language_loss": 0.7367599, + "learning_rate": 1.254511689796244e-06, + "loss": 0.75824016, + "num_input_tokens_seen": 226937705, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 10527, + "time_per_iteration": 2.473468065261841 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.01803195, + "balance_loss_mlp": 1.03822732, + "epoch": 0.6329776040883812, + "flos": 16836826279680.0, + "grad_norm": 1.98632215188849, + "language_loss": 0.71867841, + "learning_rate": 1.2541503091369693e-06, + "loss": 0.74001735, + "num_input_tokens_seen": 226954880, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 10528, + "time_per_iteration": 2.428516387939453 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01028378, + "balance_loss_clip": 1.01593244, + "balance_loss_mlp": 1.03575611, + "epoch": 0.6330377273410491, + "flos": 13516705382400.0, + "grad_norm": 2.649115399957431, + "language_loss": 0.66042399, + "learning_rate": 1.2537889567603905e-06, + "loss": 0.68174052, + "num_input_tokens_seen": 226972595, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 10529, + "time_per_iteration": 2.4110963344573975 + }, + { + "auxiliary_loss_clip": 0.01109156, + "auxiliary_loss_mlp": 0.01031761, + "balance_loss_clip": 1.01853514, + "balance_loss_mlp": 1.03828883, + "epoch": 0.6330978505937171, + "flos": 21538825257600.0, + "grad_norm": 2.3567719196586134, + "language_loss": 0.75553149, + "learning_rate": 1.2534276326802092e-06, + "loss": 0.7769407, + "num_input_tokens_seen": 226991910, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 10530, + "time_per_iteration": 2.47843074798584 + }, + { + "auxiliary_loss_clip": 0.01109816, + "auxiliary_loss_mlp": 0.01029253, + "balance_loss_clip": 1.01745164, + "balance_loss_mlp": 1.04016328, + "epoch": 0.6331579738463851, + "flos": 25009484054400.0, + "grad_norm": 2.740073625004777, + "language_loss": 0.73872888, + "learning_rate": 1.2530663369101259e-06, + "loss": 0.76011956, + "num_input_tokens_seen": 227010175, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10531, + "time_per_iteration": 2.4678969383239746 + }, + { + "auxiliary_loss_clip": 0.01102476, + "auxiliary_loss_mlp": 0.01028951, + "balance_loss_clip": 1.01666689, + "balance_loss_mlp": 1.03636086, + "epoch": 0.6332180970990531, + "flos": 14976007228800.0, + "grad_norm": 2.9880072875831147, + "language_loss": 0.79408121, + "learning_rate": 1.2527050694638432e-06, + "loss": 0.81539547, + "num_input_tokens_seen": 227025540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 10532, + "time_per_iteration": 2.481036901473999 + }, + { + "auxiliary_loss_clip": 0.01102051, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02170849, + "balance_loss_mlp": 1.03580236, + "epoch": 0.633278220351721, + "flos": 22706963458560.0, + "grad_norm": 2.7000401748576817, + "language_loss": 0.74374038, + "learning_rate": 1.2523438303550582e-06, + "loss": 0.76508451, + "num_input_tokens_seen": 227045520, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 10533, + "time_per_iteration": 2.4607644081115723 + }, + { + "auxiliary_loss_clip": 0.01110909, + "auxiliary_loss_mlp": 0.01034703, + "balance_loss_clip": 1.02163792, + "balance_loss_mlp": 1.03844595, + "epoch": 0.633338343604389, + "flos": 12602922364800.0, + "grad_norm": 2.750255656428334, + "language_loss": 0.76894259, + "learning_rate": 1.2519826195974706e-06, + "loss": 0.79039878, + "num_input_tokens_seen": 227059420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 10534, + "time_per_iteration": 2.4279823303222656 + }, + { + "auxiliary_loss_clip": 0.01106846, + "auxiliary_loss_mlp": 0.0103469, + "balance_loss_clip": 1.02312016, + "balance_loss_mlp": 1.03899598, + "epoch": 0.6333984668570569, + "flos": 25960111447680.0, + "grad_norm": 1.5411023230298349, + "language_loss": 0.85583681, + "learning_rate": 1.251621437204777e-06, + "loss": 0.8772521, + "num_input_tokens_seen": 227081310, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 10535, + "time_per_iteration": 2.4824087619781494 + }, + { + "auxiliary_loss_clip": 0.01106839, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01961398, + "balance_loss_mlp": 1.03782022, + "epoch": 0.6334585901097249, + "flos": 23659242877440.0, + "grad_norm": 2.0534992057606285, + "language_loss": 0.76360321, + "learning_rate": 1.2512602831906733e-06, + "loss": 0.78498983, + "num_input_tokens_seen": 227100365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 10536, + "time_per_iteration": 2.530451774597168 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.01741064, + "balance_loss_mlp": 1.03990674, + "epoch": 0.633518713362393, + "flos": 28760496503040.0, + "grad_norm": 1.9627877064999752, + "language_loss": 0.60015184, + "learning_rate": 1.250899157568855e-06, + "loss": 0.62151325, + "num_input_tokens_seen": 227119680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10537, + "time_per_iteration": 2.5151615142822266 + }, + { + "auxiliary_loss_clip": 0.01026622, + "auxiliary_loss_mlp": 0.01001054, + "balance_loss_clip": 0.999695, + "balance_loss_mlp": 1.00554442, + "epoch": 0.6335788366150609, + "flos": 70420322401920.0, + "grad_norm": 0.7708037183825521, + "language_loss": 0.52472723, + "learning_rate": 1.2505380603530155e-06, + "loss": 0.54500401, + "num_input_tokens_seen": 227184465, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.2109375, + "step": 10538, + "time_per_iteration": 3.165985584259033 + }, + { + "auxiliary_loss_clip": 0.01108701, + "auxiliary_loss_mlp": 0.01033325, + "balance_loss_clip": 1.0205456, + "balance_loss_mlp": 1.0376327, + "epoch": 0.6336389598677289, + "flos": 23732069702400.0, + "grad_norm": 1.8519204835949576, + "language_loss": 0.83039713, + "learning_rate": 1.250176991556848e-06, + "loss": 0.85181737, + "num_input_tokens_seen": 227202185, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 10539, + "time_per_iteration": 2.4390335083007812 + }, + { + "auxiliary_loss_clip": 0.01107427, + "auxiliary_loss_mlp": 0.01028431, + "balance_loss_clip": 1.01580071, + "balance_loss_mlp": 1.03738523, + "epoch": 0.6336990831203968, + "flos": 29276676898560.0, + "grad_norm": 1.637138612539208, + "language_loss": 0.86837506, + "learning_rate": 1.2498159511940438e-06, + "loss": 0.88973361, + "num_input_tokens_seen": 227222020, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 10540, + "time_per_iteration": 2.4831221103668213 + }, + { + "auxiliary_loss_clip": 0.01100728, + "auxiliary_loss_mlp": 0.01027815, + "balance_loss_clip": 1.01740217, + "balance_loss_mlp": 1.03550363, + "epoch": 0.6337592063730648, + "flos": 29096836479360.0, + "grad_norm": 1.5901447763785947, + "language_loss": 0.7268725, + "learning_rate": 1.2494549392782943e-06, + "loss": 0.74815792, + "num_input_tokens_seen": 227240885, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65234375, + "step": 10541, + "time_per_iteration": 2.479461908340454 + }, + { + "auxiliary_loss_clip": 0.01109283, + "auxiliary_loss_mlp": 0.0103142, + "balance_loss_clip": 1.01819396, + "balance_loss_mlp": 1.03717303, + "epoch": 0.6338193296257327, + "flos": 34706477249280.0, + "grad_norm": 2.6143323692331166, + "language_loss": 0.84712064, + "learning_rate": 1.2490939558232887e-06, + "loss": 0.86852765, + "num_input_tokens_seen": 227257880, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 10542, + "time_per_iteration": 2.54823899269104 + }, + { + "auxiliary_loss_clip": 0.01105497, + "auxiliary_loss_mlp": 0.01030798, + "balance_loss_clip": 1.01760697, + "balance_loss_mlp": 1.03709495, + "epoch": 0.6338794528784008, + "flos": 16687581269760.0, + "grad_norm": 1.6553786281241991, + "language_loss": 0.77977955, + "learning_rate": 1.2487330008427153e-06, + "loss": 0.80114251, + "num_input_tokens_seen": 227274840, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.68359375, + "step": 10543, + "time_per_iteration": 2.3880414962768555 + }, + { + "auxiliary_loss_clip": 0.0110064, + "auxiliary_loss_mlp": 0.01034533, + "balance_loss_clip": 1.02324414, + "balance_loss_mlp": 1.03599632, + "epoch": 0.6339395761310687, + "flos": 22346600261760.0, + "grad_norm": 1.6753324610621851, + "language_loss": 0.73382592, + "learning_rate": 1.2483720743502618e-06, + "loss": 0.75517762, + "num_input_tokens_seen": 227294835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 10544, + "time_per_iteration": 2.4576821327209473 + }, + { + "auxiliary_loss_clip": 0.01108095, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.02458596, + "balance_loss_mlp": 1.0366528, + "epoch": 0.6339996993837367, + "flos": 18551812112640.0, + "grad_norm": 2.0297826320587844, + "language_loss": 0.68563735, + "learning_rate": 1.2480111763596144e-06, + "loss": 0.70708686, + "num_input_tokens_seen": 227314935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71484375, + "step": 10545, + "time_per_iteration": 2.4281883239746094 + }, + { + "auxiliary_loss_clip": 0.01102093, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01794934, + "balance_loss_mlp": 1.03507733, + "epoch": 0.6340598226364046, + "flos": 12969498614400.0, + "grad_norm": 2.000384025401953, + "language_loss": 0.71141988, + "learning_rate": 1.2476503068844592e-06, + "loss": 0.73274392, + "num_input_tokens_seen": 227332905, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 10546, + "time_per_iteration": 2.4097115993499756 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01031824, + "balance_loss_clip": 1.02085662, + "balance_loss_mlp": 1.03665507, + "epoch": 0.6341199458890726, + "flos": 26687984647680.0, + "grad_norm": 1.3382755401261122, + "language_loss": 0.77992189, + "learning_rate": 1.2472894659384792e-06, + "loss": 0.80125231, + "num_input_tokens_seen": 227354915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 10547, + "time_per_iteration": 2.4647274017333984 + }, + { + "auxiliary_loss_clip": 0.01107664, + "auxiliary_loss_mlp": 0.01032127, + "balance_loss_clip": 1.02003956, + "balance_loss_mlp": 1.03658104, + "epoch": 0.6341800691417405, + "flos": 18734274224640.0, + "grad_norm": 1.7405007308500737, + "language_loss": 0.63246721, + "learning_rate": 1.2469286535353578e-06, + "loss": 0.6538651, + "num_input_tokens_seen": 227372990, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 10548, + "time_per_iteration": 2.4153497219085693 + }, + { + "auxiliary_loss_clip": 0.01103941, + "auxiliary_loss_mlp": 0.01027195, + "balance_loss_clip": 1.01604867, + "balance_loss_mlp": 1.03657913, + "epoch": 0.6342401923944085, + "flos": 26249443499520.0, + "grad_norm": 1.5666269720045418, + "language_loss": 0.61767489, + "learning_rate": 1.2465678696887785e-06, + "loss": 0.63898623, + "num_input_tokens_seen": 227393270, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 10549, + "time_per_iteration": 2.4682185649871826 + }, + { + "auxiliary_loss_clip": 0.01102967, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01777053, + "balance_loss_mlp": 1.03553009, + "epoch": 0.6343003156470765, + "flos": 24680937329280.0, + "grad_norm": 1.7174833177104423, + "language_loss": 0.73910511, + "learning_rate": 1.2462071144124197e-06, + "loss": 0.76041675, + "num_input_tokens_seen": 227413630, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.671875, + "step": 10550, + "time_per_iteration": 2.484928607940674 + }, + { + "auxiliary_loss_clip": 0.01026139, + "auxiliary_loss_mlp": 0.00996982, + "balance_loss_clip": 0.99562275, + "balance_loss_mlp": 1.00515223, + "epoch": 0.6343604388997445, + "flos": 69805352626560.0, + "grad_norm": 0.6918927993882659, + "language_loss": 0.57716167, + "learning_rate": 1.2458463877199638e-06, + "loss": 0.59739286, + "num_input_tokens_seen": 227476630, + "router_z_loss_clip": 0.01361084, + "router_z_loss_mlp": 0.20996094, + "step": 10551, + "time_per_iteration": 3.0650179386138916 + }, + { + "auxiliary_loss_clip": 0.01103158, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.01508236, + "balance_loss_mlp": 1.03589559, + "epoch": 0.6344205621524125, + "flos": 21982430223360.0, + "grad_norm": 1.7335763595284734, + "language_loss": 0.67098165, + "learning_rate": 1.2454856896250881e-06, + "loss": 0.69227403, + "num_input_tokens_seen": 227496060, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10552, + "time_per_iteration": 3.8182289600372314 + }, + { + "auxiliary_loss_clip": 0.01105164, + "auxiliary_loss_mlp": 0.01028535, + "balance_loss_clip": 1.01642919, + "balance_loss_mlp": 1.03475296, + "epoch": 0.6344806854050804, + "flos": 20448865008000.0, + "grad_norm": 1.910004275661171, + "language_loss": 0.8218025, + "learning_rate": 1.24512502014147e-06, + "loss": 0.84313941, + "num_input_tokens_seen": 227513440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10553, + "time_per_iteration": 2.449106216430664 + }, + { + "auxiliary_loss_clip": 0.0110533, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02142692, + "balance_loss_mlp": 1.03581154, + "epoch": 0.6345408086577484, + "flos": 40510611187200.0, + "grad_norm": 1.776645744912539, + "language_loss": 0.5519408, + "learning_rate": 1.2447643792827879e-06, + "loss": 0.5733304, + "num_input_tokens_seen": 227535395, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 10554, + "time_per_iteration": 4.085347652435303 + }, + { + "auxiliary_loss_clip": 0.01105981, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.01777518, + "balance_loss_mlp": 1.03750849, + "epoch": 0.6346009319104163, + "flos": 21361319222400.0, + "grad_norm": 1.7092991458226663, + "language_loss": 0.70511019, + "learning_rate": 1.2444037670627153e-06, + "loss": 0.72646892, + "num_input_tokens_seen": 227554545, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 10555, + "time_per_iteration": 3.8290207386016846 + }, + { + "auxiliary_loss_clip": 0.01026207, + "auxiliary_loss_mlp": 0.01000287, + "balance_loss_clip": 0.99898165, + "balance_loss_mlp": 1.00531995, + "epoch": 0.6346610551630844, + "flos": 71365419100800.0, + "grad_norm": 0.8961338606309752, + "language_loss": 0.55477089, + "learning_rate": 1.2440431834949276e-06, + "loss": 0.57503581, + "num_input_tokens_seen": 227608575, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.20898438, + "step": 10556, + "time_per_iteration": 4.450624227523804 + }, + { + "auxiliary_loss_clip": 0.01105056, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.01931942, + "balance_loss_mlp": 1.03497851, + "epoch": 0.6347211784157523, + "flos": 25411504049280.0, + "grad_norm": 1.9064112571580962, + "language_loss": 0.68177021, + "learning_rate": 1.2436826285930985e-06, + "loss": 0.70314467, + "num_input_tokens_seen": 227628175, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 10557, + "time_per_iteration": 2.486895799636841 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01031026, + "balance_loss_clip": 1.01968968, + "balance_loss_mlp": 1.03706682, + "epoch": 0.6347813016684203, + "flos": 15742735966080.0, + "grad_norm": 1.9232562930576766, + "language_loss": 0.70448172, + "learning_rate": 1.2433221023709002e-06, + "loss": 0.72583079, + "num_input_tokens_seen": 227645330, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 10558, + "time_per_iteration": 2.40922212600708 + }, + { + "auxiliary_loss_clip": 0.0110235, + "auxiliary_loss_mlp": 0.01031008, + "balance_loss_clip": 1.01924789, + "balance_loss_mlp": 1.03492951, + "epoch": 0.6348414249210882, + "flos": 21464777370240.0, + "grad_norm": 1.4566517765841722, + "language_loss": 0.78202355, + "learning_rate": 1.2429616048420031e-06, + "loss": 0.80335712, + "num_input_tokens_seen": 227665250, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 10559, + "time_per_iteration": 2.44706130027771 + }, + { + "auxiliary_loss_clip": 0.01108267, + "auxiliary_loss_mlp": 0.01037326, + "balance_loss_clip": 1.02512479, + "balance_loss_mlp": 1.03806639, + "epoch": 0.6349015481737562, + "flos": 21653057485440.0, + "grad_norm": 2.1761339392195467, + "language_loss": 0.68320858, + "learning_rate": 1.242601136020078e-06, + "loss": 0.70466453, + "num_input_tokens_seen": 227685070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 10560, + "time_per_iteration": 2.4409596920013428 + }, + { + "auxiliary_loss_clip": 0.01104015, + "auxiliary_loss_mlp": 0.01034754, + "balance_loss_clip": 1.02302957, + "balance_loss_mlp": 1.03592563, + "epoch": 0.6349616714264241, + "flos": 22194984954240.0, + "grad_norm": 1.67836467156634, + "language_loss": 0.7699995, + "learning_rate": 1.2422406959187939e-06, + "loss": 0.7913872, + "num_input_tokens_seen": 227704430, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10561, + "time_per_iteration": 2.5039145946502686 + }, + { + "auxiliary_loss_clip": 0.01104347, + "auxiliary_loss_mlp": 0.01032147, + "balance_loss_clip": 1.01994574, + "balance_loss_mlp": 1.0352478, + "epoch": 0.6350217946790921, + "flos": 25410354814080.0, + "grad_norm": 4.009586317175133, + "language_loss": 0.72008455, + "learning_rate": 1.2418802845518178e-06, + "loss": 0.74144948, + "num_input_tokens_seen": 227724920, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 10562, + "time_per_iteration": 2.472137212753296 + }, + { + "auxiliary_loss_clip": 0.01107214, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.01749265, + "balance_loss_mlp": 1.03718257, + "epoch": 0.63508191793176, + "flos": 19718944732800.0, + "grad_norm": 2.5105421382487267, + "language_loss": 0.80683196, + "learning_rate": 1.2415199019328185e-06, + "loss": 0.82820606, + "num_input_tokens_seen": 227743400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 10563, + "time_per_iteration": 2.4413557052612305 + }, + { + "auxiliary_loss_clip": 0.01108821, + "auxiliary_loss_mlp": 0.01036739, + "balance_loss_clip": 1.02480066, + "balance_loss_mlp": 1.03883505, + "epoch": 0.6351420411844281, + "flos": 18186923802240.0, + "grad_norm": 2.110536240738381, + "language_loss": 0.80818796, + "learning_rate": 1.2411595480754597e-06, + "loss": 0.82964349, + "num_input_tokens_seen": 227759990, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 10564, + "time_per_iteration": 2.4266111850738525 + }, + { + "auxiliary_loss_clip": 0.01107128, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.02003133, + "balance_loss_mlp": 1.03857136, + "epoch": 0.6352021644370961, + "flos": 33726511422720.0, + "grad_norm": 1.6172553063068438, + "language_loss": 0.72285914, + "learning_rate": 1.240799222993407e-06, + "loss": 0.74424613, + "num_input_tokens_seen": 227780835, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10565, + "time_per_iteration": 2.534834623336792 + }, + { + "auxiliary_loss_clip": 0.01106685, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01762271, + "balance_loss_mlp": 1.03696799, + "epoch": 0.635262287689764, + "flos": 20374781207040.0, + "grad_norm": 2.0506297866150467, + "language_loss": 0.69144678, + "learning_rate": 1.240438926700324e-06, + "loss": 0.71281761, + "num_input_tokens_seen": 227798580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 10566, + "time_per_iteration": 2.411491632461548 + }, + { + "auxiliary_loss_clip": 0.01102305, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01837826, + "balance_loss_mlp": 1.03648448, + "epoch": 0.635322410942432, + "flos": 27525421307520.0, + "grad_norm": 1.5548948412040506, + "language_loss": 0.69706547, + "learning_rate": 1.2400786592098725e-06, + "loss": 0.71838397, + "num_input_tokens_seen": 227819210, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 10567, + "time_per_iteration": 2.48917293548584 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.01032656, + "balance_loss_clip": 1.02129579, + "balance_loss_mlp": 1.03807187, + "epoch": 0.6353825341950999, + "flos": 21543601766400.0, + "grad_norm": 2.2646303551803753, + "language_loss": 0.84620178, + "learning_rate": 1.2397184205357154e-06, + "loss": 0.86756414, + "num_input_tokens_seen": 227838340, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10568, + "time_per_iteration": 2.4403724670410156 + }, + { + "auxiliary_loss_clip": 0.0110714, + "auxiliary_loss_mlp": 0.01038435, + "balance_loss_clip": 1.02603149, + "balance_loss_mlp": 1.03773642, + "epoch": 0.635442657447768, + "flos": 31759756185600.0, + "grad_norm": 1.746273347856982, + "language_loss": 0.83601934, + "learning_rate": 1.2393582106915113e-06, + "loss": 0.8574751, + "num_input_tokens_seen": 227859170, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 10569, + "time_per_iteration": 2.5299484729766846 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01027469, + "balance_loss_clip": 1.01576304, + "balance_loss_mlp": 1.03676128, + "epoch": 0.6355027807004359, + "flos": 19828831415040.0, + "grad_norm": 1.7093099643488843, + "language_loss": 0.69269961, + "learning_rate": 1.2389980296909198e-06, + "loss": 0.71400905, + "num_input_tokens_seen": 227878545, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 10570, + "time_per_iteration": 2.4609997272491455 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01033824, + "balance_loss_clip": 1.02142024, + "balance_loss_mlp": 1.03430879, + "epoch": 0.6355629039531039, + "flos": 30372383324160.0, + "grad_norm": 1.6697776111718718, + "language_loss": 0.65798032, + "learning_rate": 1.2386378775476e-06, + "loss": 0.67937338, + "num_input_tokens_seen": 227898875, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10571, + "time_per_iteration": 2.5261099338531494 + }, + { + "auxiliary_loss_clip": 0.01110578, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.01701999, + "balance_loss_mlp": 1.03919911, + "epoch": 0.6356230272057718, + "flos": 17932065828480.0, + "grad_norm": 1.7838042943408632, + "language_loss": 0.71219468, + "learning_rate": 1.2382777542752074e-06, + "loss": 0.73359203, + "num_input_tokens_seen": 227917130, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 10572, + "time_per_iteration": 2.4292333126068115 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.02092671, + "balance_loss_mlp": 1.03459537, + "epoch": 0.6356831504584398, + "flos": 25375844822400.0, + "grad_norm": 2.8044296408111657, + "language_loss": 0.81269503, + "learning_rate": 1.2379176598873992e-06, + "loss": 0.83402801, + "num_input_tokens_seen": 227939550, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 10573, + "time_per_iteration": 2.5012412071228027 + }, + { + "auxiliary_loss_clip": 0.01106251, + "auxiliary_loss_mlp": 0.01030673, + "balance_loss_clip": 1.01897848, + "balance_loss_mlp": 1.0366838, + "epoch": 0.6357432737111077, + "flos": 46500331720320.0, + "grad_norm": 1.5014218063812952, + "language_loss": 0.68932259, + "learning_rate": 1.2375575943978303e-06, + "loss": 0.71069181, + "num_input_tokens_seen": 227962200, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69921875, + "step": 10574, + "time_per_iteration": 2.668290853500366 + }, + { + "auxiliary_loss_clip": 0.01104073, + "auxiliary_loss_mlp": 0.01027616, + "balance_loss_clip": 1.01580226, + "balance_loss_mlp": 1.03717065, + "epoch": 0.6358033969637757, + "flos": 17274361847040.0, + "grad_norm": 2.2372840416556476, + "language_loss": 0.86855853, + "learning_rate": 1.2371975578201525e-06, + "loss": 0.88987547, + "num_input_tokens_seen": 227979270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10575, + "time_per_iteration": 2.4198617935180664 + }, + { + "auxiliary_loss_clip": 0.01104492, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.02124405, + "balance_loss_mlp": 1.03752255, + "epoch": 0.6358635202164437, + "flos": 27125520215040.0, + "grad_norm": 1.7217722573852687, + "language_loss": 0.72000861, + "learning_rate": 1.2368375501680204e-06, + "loss": 0.74137974, + "num_input_tokens_seen": 228000550, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66796875, + "step": 10576, + "time_per_iteration": 2.4883639812469482 + }, + { + "auxiliary_loss_clip": 0.01106159, + "auxiliary_loss_mlp": 0.01028082, + "balance_loss_clip": 1.01569581, + "balance_loss_mlp": 1.03626978, + "epoch": 0.6359236434691117, + "flos": 27525205825920.0, + "grad_norm": 1.7446831979165325, + "language_loss": 0.69537437, + "learning_rate": 1.236477571455085e-06, + "loss": 0.71671677, + "num_input_tokens_seen": 228022005, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10577, + "time_per_iteration": 2.4888103008270264 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01029672, + "balance_loss_clip": 1.01802576, + "balance_loss_mlp": 1.03562689, + "epoch": 0.6359837667217797, + "flos": 39348290989440.0, + "grad_norm": 1.9869814787183224, + "language_loss": 0.72090602, + "learning_rate": 1.2361176216949964e-06, + "loss": 0.74223644, + "num_input_tokens_seen": 228043770, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10578, + "time_per_iteration": 2.5746970176696777 + }, + { + "auxiliary_loss_clip": 0.01025564, + "auxiliary_loss_mlp": 0.0100215, + "balance_loss_clip": 1.00076127, + "balance_loss_mlp": 1.00480723, + "epoch": 0.6360438899744476, + "flos": 56413797206400.0, + "grad_norm": 0.7033646347458022, + "language_loss": 0.54444003, + "learning_rate": 1.2357577009014044e-06, + "loss": 0.56471717, + "num_input_tokens_seen": 228104985, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.20800781, + "step": 10579, + "time_per_iteration": 3.1232736110687256 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01027914, + "balance_loss_clip": 1.01612449, + "balance_loss_mlp": 1.0368464, + "epoch": 0.6361040132271156, + "flos": 24973106555520.0, + "grad_norm": 1.7171447811267215, + "language_loss": 0.77475232, + "learning_rate": 1.2353978090879568e-06, + "loss": 0.79608917, + "num_input_tokens_seen": 228125620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6875, + "step": 10580, + "time_per_iteration": 2.461869239807129 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.0138669, + "balance_loss_mlp": 1.03540814, + "epoch": 0.6361641364797835, + "flos": 23259198130560.0, + "grad_norm": 2.098056730123376, + "language_loss": 0.67005563, + "learning_rate": 1.235037946268301e-06, + "loss": 0.69134021, + "num_input_tokens_seen": 228143495, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10581, + "time_per_iteration": 2.4425008296966553 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01030749, + "balance_loss_clip": 1.01949584, + "balance_loss_mlp": 1.03480268, + "epoch": 0.6362242597324516, + "flos": 25994513698560.0, + "grad_norm": 1.3074505079001684, + "language_loss": 0.68299043, + "learning_rate": 1.2346781124560828e-06, + "loss": 0.70432162, + "num_input_tokens_seen": 228166500, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 10582, + "time_per_iteration": 2.4763622283935547 + }, + { + "auxiliary_loss_clip": 0.01106848, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.02355933, + "balance_loss_mlp": 1.03695726, + "epoch": 0.6362843829851195, + "flos": 25703242312320.0, + "grad_norm": 2.267802402035549, + "language_loss": 0.84247005, + "learning_rate": 1.2343183076649473e-06, + "loss": 0.8638941, + "num_input_tokens_seen": 228185325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69921875, + "step": 10583, + "time_per_iteration": 2.4797277450561523 + }, + { + "auxiliary_loss_clip": 0.0110538, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03860188, + "epoch": 0.6363445062377875, + "flos": 20522912895360.0, + "grad_norm": 1.5650473008286672, + "language_loss": 0.7515592, + "learning_rate": 1.233958531908538e-06, + "loss": 0.77290452, + "num_input_tokens_seen": 228204050, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 10584, + "time_per_iteration": 2.430316209793091 + }, + { + "auxiliary_loss_clip": 0.01106996, + "auxiliary_loss_mlp": 0.01037982, + "balance_loss_clip": 1.02420747, + "balance_loss_mlp": 1.03688443, + "epoch": 0.6364046294904554, + "flos": 19463799450240.0, + "grad_norm": 1.9066305180241776, + "language_loss": 0.72856915, + "learning_rate": 1.2335987852004985e-06, + "loss": 0.75001895, + "num_input_tokens_seen": 228222430, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.703125, + "step": 10585, + "time_per_iteration": 2.4419803619384766 + }, + { + "auxiliary_loss_clip": 0.01105577, + "auxiliary_loss_mlp": 0.01028793, + "balance_loss_clip": 1.01756358, + "balance_loss_mlp": 1.03718138, + "epoch": 0.6364647527431234, + "flos": 20995892208000.0, + "grad_norm": 1.8332276657421747, + "language_loss": 0.82785809, + "learning_rate": 1.2332390675544697e-06, + "loss": 0.8492018, + "num_input_tokens_seen": 228241925, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.68359375, + "step": 10586, + "time_per_iteration": 2.421600341796875 + }, + { + "auxiliary_loss_clip": 0.01103874, + "auxiliary_loss_mlp": 0.0102664, + "balance_loss_clip": 1.01523161, + "balance_loss_mlp": 1.03603029, + "epoch": 0.6365248759957913, + "flos": 25770789838080.0, + "grad_norm": 4.704421092048837, + "language_loss": 0.72570878, + "learning_rate": 1.2328793789840918e-06, + "loss": 0.74701393, + "num_input_tokens_seen": 228262535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 10587, + "time_per_iteration": 2.472022533416748 + }, + { + "auxiliary_loss_clip": 0.0110564, + "auxiliary_loss_mlp": 0.01025045, + "balance_loss_clip": 1.01379192, + "balance_loss_mlp": 1.03764784, + "epoch": 0.6365849992484593, + "flos": 22455589104000.0, + "grad_norm": 1.7915085469286844, + "language_loss": 0.76668859, + "learning_rate": 1.2325197195030058e-06, + "loss": 0.7879954, + "num_input_tokens_seen": 228281340, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 10588, + "time_per_iteration": 2.4190168380737305 + }, + { + "auxiliary_loss_clip": 0.01102746, + "auxiliary_loss_mlp": 0.01027364, + "balance_loss_clip": 1.01552689, + "balance_loss_mlp": 1.03755879, + "epoch": 0.6366451225011273, + "flos": 19025689265280.0, + "grad_norm": 1.3970993827847034, + "language_loss": 0.79966116, + "learning_rate": 1.2321600891248478e-06, + "loss": 0.82096231, + "num_input_tokens_seen": 228300865, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 10589, + "time_per_iteration": 2.4743268489837646 + }, + { + "auxiliary_loss_clip": 0.01104028, + "auxiliary_loss_mlp": 0.01028134, + "balance_loss_clip": 1.01623118, + "balance_loss_mlp": 1.03771806, + "epoch": 0.6367052457537953, + "flos": 25228395492480.0, + "grad_norm": 2.311775126826065, + "language_loss": 0.67541653, + "learning_rate": 1.231800487863257e-06, + "loss": 0.69673812, + "num_input_tokens_seen": 228320815, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 10590, + "time_per_iteration": 2.450011730194092 + }, + { + "auxiliary_loss_clip": 0.0111127, + "auxiliary_loss_mlp": 0.01031485, + "balance_loss_clip": 1.01871789, + "balance_loss_mlp": 1.03779423, + "epoch": 0.6367653690064633, + "flos": 19208438686080.0, + "grad_norm": 1.6364871188688683, + "language_loss": 0.79574269, + "learning_rate": 1.2314409157318685e-06, + "loss": 0.8171702, + "num_input_tokens_seen": 228339065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 10591, + "time_per_iteration": 2.4351706504821777 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01026707, + "balance_loss_clip": 1.0155071, + "balance_loss_mlp": 1.03807092, + "epoch": 0.6368254922591312, + "flos": 23546806329600.0, + "grad_norm": 1.6582489812189014, + "language_loss": 0.8898353, + "learning_rate": 1.231081372744317e-06, + "loss": 0.91114426, + "num_input_tokens_seen": 228359210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 10592, + "time_per_iteration": 2.4826667308807373 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01025871, + "balance_loss_clip": 1.01484966, + "balance_loss_mlp": 1.03570986, + "epoch": 0.6368856155117992, + "flos": 26467313443200.0, + "grad_norm": 1.2873763192716858, + "language_loss": 0.68307251, + "learning_rate": 1.2307218589142376e-06, + "loss": 0.70434421, + "num_input_tokens_seen": 228379630, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 10593, + "time_per_iteration": 3.870232105255127 + }, + { + "auxiliary_loss_clip": 0.01101916, + "auxiliary_loss_mlp": 0.01030234, + "balance_loss_clip": 1.01887894, + "balance_loss_mlp": 1.03454375, + "epoch": 0.6369457387644671, + "flos": 33692432394240.0, + "grad_norm": 1.9223941478023494, + "language_loss": 0.63311636, + "learning_rate": 1.2303623742552618e-06, + "loss": 0.6544379, + "num_input_tokens_seen": 228401410, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 10594, + "time_per_iteration": 2.5456788539886475 + }, + { + "auxiliary_loss_clip": 0.01026012, + "auxiliary_loss_mlp": 0.00997701, + "balance_loss_clip": 0.99638408, + "balance_loss_mlp": 1.0052495, + "epoch": 0.6370058620171352, + "flos": 70908600908160.0, + "grad_norm": 0.7601242064241133, + "language_loss": 0.54636633, + "learning_rate": 1.230002918781022e-06, + "loss": 0.56660342, + "num_input_tokens_seen": 228470335, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20703125, + "step": 10595, + "time_per_iteration": 3.1794607639312744 + }, + { + "auxiliary_loss_clip": 0.01107322, + "auxiliary_loss_mlp": 0.01033931, + "balance_loss_clip": 1.02145016, + "balance_loss_mlp": 1.03732097, + "epoch": 0.6370659852698031, + "flos": 21141940907520.0, + "grad_norm": 1.701168717319966, + "language_loss": 0.6690321, + "learning_rate": 1.2296434925051493e-06, + "loss": 0.69044465, + "num_input_tokens_seen": 228490765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 10596, + "time_per_iteration": 5.259617328643799 + }, + { + "auxiliary_loss_clip": 0.01105102, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.0173285, + "balance_loss_mlp": 1.03692877, + "epoch": 0.6371261085224711, + "flos": 20193288762240.0, + "grad_norm": 2.3148419368361686, + "language_loss": 0.78864521, + "learning_rate": 1.2292840954412718e-06, + "loss": 0.80998278, + "num_input_tokens_seen": 228509700, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 10597, + "time_per_iteration": 3.8967549800872803 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.01746917, + "balance_loss_mlp": 1.03798401, + "epoch": 0.637186231775139, + "flos": 19683536901120.0, + "grad_norm": 1.7226875807463897, + "language_loss": 0.7490381, + "learning_rate": 1.2289247276030189e-06, + "loss": 0.77037644, + "num_input_tokens_seen": 228529050, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 10598, + "time_per_iteration": 2.426950693130493 + }, + { + "auxiliary_loss_clip": 0.01105339, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.02002072, + "balance_loss_mlp": 1.03712225, + "epoch": 0.637246355027807, + "flos": 13071196995840.0, + "grad_norm": 1.827624008719727, + "language_loss": 0.68324673, + "learning_rate": 1.2285653890040176e-06, + "loss": 0.70461518, + "num_input_tokens_seen": 228544665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 10599, + "time_per_iteration": 2.3905580043792725 + }, + { + "auxiliary_loss_clip": 0.01106294, + "auxiliary_loss_mlp": 0.0103191, + "balance_loss_clip": 1.0198344, + "balance_loss_mlp": 1.03601742, + "epoch": 0.6373064782804749, + "flos": 18222654856320.0, + "grad_norm": 2.01568733519361, + "language_loss": 0.80380464, + "learning_rate": 1.2282060796578942e-06, + "loss": 0.82518673, + "num_input_tokens_seen": 228562060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10600, + "time_per_iteration": 2.390493631362915 + }, + { + "auxiliary_loss_clip": 0.01102518, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.01701963, + "balance_loss_mlp": 1.03515601, + "epoch": 0.637366601533143, + "flos": 24498475217280.0, + "grad_norm": 1.447681041520347, + "language_loss": 0.79922855, + "learning_rate": 1.2278467995782732e-06, + "loss": 0.82053661, + "num_input_tokens_seen": 228582550, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 10601, + "time_per_iteration": 2.4929754734039307 + }, + { + "auxiliary_loss_clip": 0.01105771, + "auxiliary_loss_mlp": 0.01026276, + "balance_loss_clip": 1.01460528, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6374267247858109, + "flos": 26359042872960.0, + "grad_norm": 1.837610857942547, + "language_loss": 0.66878605, + "learning_rate": 1.2274875487787797e-06, + "loss": 0.69010651, + "num_input_tokens_seen": 228604960, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6953125, + "step": 10602, + "time_per_iteration": 2.467132091522217 + }, + { + "auxiliary_loss_clip": 0.01101843, + "auxiliary_loss_mlp": 0.01022562, + "balance_loss_clip": 1.01127887, + "balance_loss_mlp": 1.03530014, + "epoch": 0.6374868480384789, + "flos": 20371728551040.0, + "grad_norm": 1.9668253771039714, + "language_loss": 0.79456556, + "learning_rate": 1.2271283272730354e-06, + "loss": 0.81580961, + "num_input_tokens_seen": 228622195, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 10603, + "time_per_iteration": 2.439401149749756 + }, + { + "auxiliary_loss_clip": 0.01103337, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01447129, + "balance_loss_mlp": 1.03550994, + "epoch": 0.6375469712911469, + "flos": 20996251344000.0, + "grad_norm": 2.0023670291582034, + "language_loss": 0.76751029, + "learning_rate": 1.2267691350746621e-06, + "loss": 0.78881085, + "num_input_tokens_seen": 228639735, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 10604, + "time_per_iteration": 2.4105138778686523 + }, + { + "auxiliary_loss_clip": 0.01108604, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.02013552, + "balance_loss_mlp": 1.03714681, + "epoch": 0.6376070945438148, + "flos": 19715748422400.0, + "grad_norm": 1.780209303316883, + "language_loss": 0.77448142, + "learning_rate": 1.226409972197281e-06, + "loss": 0.79589069, + "num_input_tokens_seen": 228658195, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.71484375, + "step": 10605, + "time_per_iteration": 2.4292843341827393 + }, + { + "auxiliary_loss_clip": 0.01105536, + "auxiliary_loss_mlp": 0.01026657, + "balance_loss_clip": 1.01408625, + "balance_loss_mlp": 1.03642416, + "epoch": 0.6376672177964828, + "flos": 21506757390720.0, + "grad_norm": 1.9363320912621251, + "language_loss": 0.65341508, + "learning_rate": 1.2260508386545106e-06, + "loss": 0.67473698, + "num_input_tokens_seen": 228677415, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 10606, + "time_per_iteration": 2.427497625350952 + }, + { + "auxiliary_loss_clip": 0.01101905, + "auxiliary_loss_mlp": 0.01033123, + "balance_loss_clip": 1.02189994, + "balance_loss_mlp": 1.03686523, + "epoch": 0.6377273410491507, + "flos": 18843873598080.0, + "grad_norm": 1.731945960339434, + "language_loss": 0.75044298, + "learning_rate": 1.225691734459971e-06, + "loss": 0.77179325, + "num_input_tokens_seen": 228696450, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10607, + "time_per_iteration": 2.446707248687744 + }, + { + "auxiliary_loss_clip": 0.01106141, + "auxiliary_loss_mlp": 0.0103654, + "balance_loss_clip": 1.02503061, + "balance_loss_mlp": 1.03733909, + "epoch": 0.6377874643018188, + "flos": 53062970181120.0, + "grad_norm": 1.7077896003554156, + "language_loss": 0.65732801, + "learning_rate": 1.225332659627278e-06, + "loss": 0.67875481, + "num_input_tokens_seen": 228721600, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 10608, + "time_per_iteration": 2.7172274589538574 + }, + { + "auxiliary_loss_clip": 0.01026098, + "auxiliary_loss_mlp": 0.01010909, + "balance_loss_clip": 1.00953197, + "balance_loss_mlp": 1.00546312, + "epoch": 0.6378475875544867, + "flos": 65135026465920.0, + "grad_norm": 0.7342720172803939, + "language_loss": 0.51933324, + "learning_rate": 1.2249736141700475e-06, + "loss": 0.53970337, + "num_input_tokens_seen": 228784535, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.20703125, + "step": 10609, + "time_per_iteration": 3.038902759552002 + }, + { + "auxiliary_loss_clip": 0.01099294, + "auxiliary_loss_mlp": 0.01023726, + "balance_loss_clip": 1.01322937, + "balance_loss_mlp": 1.03415811, + "epoch": 0.6379077108071547, + "flos": 23002759958400.0, + "grad_norm": 1.5171992119631734, + "language_loss": 0.74632645, + "learning_rate": 1.2246145981018965e-06, + "loss": 0.76755667, + "num_input_tokens_seen": 228804110, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65234375, + "step": 10610, + "time_per_iteration": 2.472832202911377 + }, + { + "auxiliary_loss_clip": 0.0102568, + "auxiliary_loss_mlp": 0.01003259, + "balance_loss_clip": 1.00187004, + "balance_loss_mlp": 1.00496507, + "epoch": 0.6379678340598226, + "flos": 67601947610880.0, + "grad_norm": 0.8614298288544585, + "language_loss": 0.63198531, + "learning_rate": 1.2242556114364364e-06, + "loss": 0.65227467, + "num_input_tokens_seen": 228867705, + "router_z_loss_clip": 0.01391602, + "router_z_loss_mlp": 0.20703125, + "step": 10611, + "time_per_iteration": 3.118346691131592 + }, + { + "auxiliary_loss_clip": 0.01104297, + "auxiliary_loss_mlp": 0.01035566, + "balance_loss_clip": 1.02351391, + "balance_loss_mlp": 1.03604293, + "epoch": 0.6380279573124906, + "flos": 29680061610240.0, + "grad_norm": 2.12180371585039, + "language_loss": 0.72335958, + "learning_rate": 1.223896654187282e-06, + "loss": 0.74475813, + "num_input_tokens_seen": 228889215, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 10612, + "time_per_iteration": 2.5017549991607666 + }, + { + "auxiliary_loss_clip": 0.01025775, + "auxiliary_loss_mlp": 0.01000915, + "balance_loss_clip": 0.99957991, + "balance_loss_mlp": 1.00507379, + "epoch": 0.6380880805651585, + "flos": 66484046580480.0, + "grad_norm": 0.7184948556551517, + "language_loss": 0.57873541, + "learning_rate": 1.2235377263680446e-06, + "loss": 0.5990023, + "num_input_tokens_seen": 228948465, + "router_z_loss_clip": 0.0133667, + "router_z_loss_mlp": 0.20703125, + "step": 10613, + "time_per_iteration": 2.9799587726593018 + }, + { + "auxiliary_loss_clip": 0.01107464, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.01890481, + "balance_loss_mlp": 1.03777075, + "epoch": 0.6381482038178266, + "flos": 23914998691200.0, + "grad_norm": 1.7105248760789145, + "language_loss": 0.75128651, + "learning_rate": 1.2231788279923334e-06, + "loss": 0.77267975, + "num_input_tokens_seen": 228967955, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10614, + "time_per_iteration": 2.491565465927124 + }, + { + "auxiliary_loss_clip": 0.01106638, + "auxiliary_loss_mlp": 0.01034933, + "balance_loss_clip": 1.02311897, + "balance_loss_mlp": 1.03886855, + "epoch": 0.6382083270704945, + "flos": 24243042625920.0, + "grad_norm": 1.84751826433944, + "language_loss": 0.79666638, + "learning_rate": 1.2228199590737599e-06, + "loss": 0.81808209, + "num_input_tokens_seen": 228985495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10615, + "time_per_iteration": 2.492230176925659 + }, + { + "auxiliary_loss_clip": 0.0102549, + "auxiliary_loss_mlp": 0.01004342, + "balance_loss_clip": 1.00296533, + "balance_loss_mlp": 1.00477338, + "epoch": 0.6382684503231625, + "flos": 70775552931840.0, + "grad_norm": 0.6538614969335592, + "language_loss": 0.55591351, + "learning_rate": 1.2224611196259305e-06, + "loss": 0.57621187, + "num_input_tokens_seen": 229052995, + "router_z_loss_clip": 0.01379395, + "router_z_loss_mlp": 0.20703125, + "step": 10616, + "time_per_iteration": 3.1426796913146973 + }, + { + "auxiliary_loss_clip": 0.01103937, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.01841819, + "balance_loss_mlp": 1.03616679, + "epoch": 0.6383285735758305, + "flos": 16544836621440.0, + "grad_norm": 1.9069966042725246, + "language_loss": 0.83733106, + "learning_rate": 1.2221023096624538e-06, + "loss": 0.85867131, + "num_input_tokens_seen": 229071030, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10617, + "time_per_iteration": 2.4153995513916016 + }, + { + "auxiliary_loss_clip": 0.01104997, + "auxiliary_loss_mlp": 0.01034225, + "balance_loss_clip": 1.0218631, + "balance_loss_mlp": 1.03582323, + "epoch": 0.6383886968284984, + "flos": 14427651225600.0, + "grad_norm": 1.8815450583884574, + "language_loss": 0.87111914, + "learning_rate": 1.221743529196936e-06, + "loss": 0.89251137, + "num_input_tokens_seen": 229088275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69140625, + "step": 10618, + "time_per_iteration": 2.4547295570373535 + }, + { + "auxiliary_loss_clip": 0.01108351, + "auxiliary_loss_mlp": 0.01033496, + "balance_loss_clip": 1.02284503, + "balance_loss_mlp": 1.03887093, + "epoch": 0.6384488200811664, + "flos": 17929659617280.0, + "grad_norm": 1.8631596079726758, + "language_loss": 0.73287827, + "learning_rate": 1.2213847782429806e-06, + "loss": 0.75429678, + "num_input_tokens_seen": 229105190, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6953125, + "step": 10619, + "time_per_iteration": 2.4028847217559814 + }, + { + "auxiliary_loss_clip": 0.01110376, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02090454, + "balance_loss_mlp": 1.03807545, + "epoch": 0.6385089433338343, + "flos": 18515578268160.0, + "grad_norm": 1.9227827130097541, + "language_loss": 0.76158774, + "learning_rate": 1.221026056814193e-06, + "loss": 0.78302789, + "num_input_tokens_seen": 229122290, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.72265625, + "step": 10620, + "time_per_iteration": 2.4420766830444336 + }, + { + "auxiliary_loss_clip": 0.01104115, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.01349616, + "balance_loss_mlp": 1.03697598, + "epoch": 0.6385690665865024, + "flos": 24753620499840.0, + "grad_norm": 2.4243704084161806, + "language_loss": 0.70476806, + "learning_rate": 1.2206673649241752e-06, + "loss": 0.7260617, + "num_input_tokens_seen": 229141620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 10621, + "time_per_iteration": 2.474518299102783 + }, + { + "auxiliary_loss_clip": 0.01098545, + "auxiliary_loss_mlp": 0.0102422, + "balance_loss_clip": 1.01336575, + "balance_loss_mlp": 1.03505826, + "epoch": 0.6386291898391703, + "flos": 20120569678080.0, + "grad_norm": 1.540938795838808, + "language_loss": 0.77551067, + "learning_rate": 1.220308702586529e-06, + "loss": 0.79673827, + "num_input_tokens_seen": 229161570, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 10622, + "time_per_iteration": 2.4603724479675293 + }, + { + "auxiliary_loss_clip": 0.01102358, + "auxiliary_loss_mlp": 0.01027326, + "balance_loss_clip": 1.0161562, + "balance_loss_mlp": 1.0359875, + "epoch": 0.6386893130918383, + "flos": 16867278034560.0, + "grad_norm": 1.7317763854255814, + "language_loss": 0.7494216, + "learning_rate": 1.2199500698148546e-06, + "loss": 0.77071846, + "num_input_tokens_seen": 229178465, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10623, + "time_per_iteration": 2.4120795726776123 + }, + { + "auxiliary_loss_clip": 0.01098287, + "auxiliary_loss_mlp": 0.01029397, + "balance_loss_clip": 1.01879287, + "balance_loss_mlp": 1.03354859, + "epoch": 0.6387494363445062, + "flos": 22966274718720.0, + "grad_norm": 1.6666297183957082, + "language_loss": 0.76487082, + "learning_rate": 1.2195914666227527e-06, + "loss": 0.78614771, + "num_input_tokens_seen": 229198975, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 10624, + "time_per_iteration": 2.4929676055908203 + }, + { + "auxiliary_loss_clip": 0.0110372, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01597917, + "balance_loss_mlp": 1.03606153, + "epoch": 0.6388095595971742, + "flos": 22857716839680.0, + "grad_norm": 1.639287347980187, + "language_loss": 0.80685896, + "learning_rate": 1.21923289302382e-06, + "loss": 0.82816517, + "num_input_tokens_seen": 229218825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 10625, + "time_per_iteration": 2.4569015502929688 + }, + { + "auxiliary_loss_clip": 0.01105914, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01917887, + "balance_loss_mlp": 1.03810406, + "epoch": 0.6388696828498421, + "flos": 17311529445120.0, + "grad_norm": 1.744297621070212, + "language_loss": 0.72630143, + "learning_rate": 1.218874349031654e-06, + "loss": 0.74767131, + "num_input_tokens_seen": 229236060, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10626, + "time_per_iteration": 2.441058397293091 + }, + { + "auxiliary_loss_clip": 0.01104529, + "auxiliary_loss_mlp": 0.01029322, + "balance_loss_clip": 1.01703739, + "balance_loss_mlp": 1.03571403, + "epoch": 0.6389298061025102, + "flos": 17128636369920.0, + "grad_norm": 1.7246902612727075, + "language_loss": 0.72518885, + "learning_rate": 1.2185158346598517e-06, + "loss": 0.74652737, + "num_input_tokens_seen": 229255160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10627, + "time_per_iteration": 2.40901780128479 + }, + { + "auxiliary_loss_clip": 0.01108886, + "auxiliary_loss_mlp": 0.01029992, + "balance_loss_clip": 1.0163188, + "balance_loss_mlp": 1.03729248, + "epoch": 0.6389899293551781, + "flos": 27710971989120.0, + "grad_norm": 2.244776770999307, + "language_loss": 0.67281765, + "learning_rate": 1.2181573499220064e-06, + "loss": 0.69420648, + "num_input_tokens_seen": 229278705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71875, + "step": 10628, + "time_per_iteration": 2.5263736248016357 + }, + { + "auxiliary_loss_clip": 0.01100861, + "auxiliary_loss_mlp": 0.01026385, + "balance_loss_clip": 1.01536465, + "balance_loss_mlp": 1.03674936, + "epoch": 0.6390500526078461, + "flos": 21215701486080.0, + "grad_norm": 1.8036287880835562, + "language_loss": 0.67833781, + "learning_rate": 1.2177988948317135e-06, + "loss": 0.69961035, + "num_input_tokens_seen": 229299990, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 10629, + "time_per_iteration": 2.477262258529663 + }, + { + "auxiliary_loss_clip": 0.01110269, + "auxiliary_loss_mlp": 0.01040382, + "balance_loss_clip": 1.02683949, + "balance_loss_mlp": 1.03733897, + "epoch": 0.6391101758605141, + "flos": 21581056673280.0, + "grad_norm": 1.4737896174832923, + "language_loss": 0.75127286, + "learning_rate": 1.2174404694025646e-06, + "loss": 0.77277935, + "num_input_tokens_seen": 229319230, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.73046875, + "step": 10630, + "time_per_iteration": 2.4760096073150635 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01030181, + "balance_loss_clip": 1.01984, + "balance_loss_mlp": 1.03617334, + "epoch": 0.639170299113182, + "flos": 19900473091200.0, + "grad_norm": 1.5423208876827523, + "language_loss": 0.70398533, + "learning_rate": 1.2170820736481511e-06, + "loss": 0.7253077, + "num_input_tokens_seen": 229338600, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 10631, + "time_per_iteration": 2.452275514602661 + }, + { + "auxiliary_loss_clip": 0.01023775, + "auxiliary_loss_mlp": 0.00996899, + "balance_loss_clip": 0.99556983, + "balance_loss_mlp": 1.00307584, + "epoch": 0.63923042236585, + "flos": 69877604833920.0, + "grad_norm": 0.7719101864922713, + "language_loss": 0.63005149, + "learning_rate": 1.2167237075820646e-06, + "loss": 0.6502583, + "num_input_tokens_seen": 229402420, + "router_z_loss_clip": 0.01330566, + "router_z_loss_mlp": 0.20703125, + "step": 10632, + "time_per_iteration": 3.1005401611328125 + }, + { + "auxiliary_loss_clip": 0.01101477, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.01642942, + "balance_loss_mlp": 1.03553295, + "epoch": 0.639290545618518, + "flos": 22674823764480.0, + "grad_norm": 2.062081508069593, + "language_loss": 0.66411757, + "learning_rate": 1.216365371217893e-06, + "loss": 0.68541509, + "num_input_tokens_seen": 229419185, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 10633, + "time_per_iteration": 2.4561798572540283 + }, + { + "auxiliary_loss_clip": 0.0110405, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.03670645, + "epoch": 0.639350668871186, + "flos": 19829190551040.0, + "grad_norm": 2.980251338642478, + "language_loss": 0.81779587, + "learning_rate": 1.216007064569225e-06, + "loss": 0.8391099, + "num_input_tokens_seen": 229436735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 10634, + "time_per_iteration": 2.4740054607391357 + }, + { + "auxiliary_loss_clip": 0.01104597, + "auxiliary_loss_mlp": 0.01030359, + "balance_loss_clip": 1.01801491, + "balance_loss_mlp": 1.03732753, + "epoch": 0.6394107921238539, + "flos": 20553328736640.0, + "grad_norm": 1.7668249879195463, + "language_loss": 0.75268984, + "learning_rate": 1.2156487876496483e-06, + "loss": 0.77403939, + "num_input_tokens_seen": 229455595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 10635, + "time_per_iteration": 3.8579487800598145 + }, + { + "auxiliary_loss_clip": 0.01103838, + "auxiliary_loss_mlp": 0.01030566, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.03555012, + "epoch": 0.6394709153765219, + "flos": 25774991729280.0, + "grad_norm": 1.8856871240472837, + "language_loss": 0.71665233, + "learning_rate": 1.2152905404727475e-06, + "loss": 0.73799634, + "num_input_tokens_seen": 229476230, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 10636, + "time_per_iteration": 2.4976108074188232 + }, + { + "auxiliary_loss_clip": 0.01106058, + "auxiliary_loss_mlp": 0.01030245, + "balance_loss_clip": 1.01808596, + "balance_loss_mlp": 1.03683591, + "epoch": 0.6395310386291898, + "flos": 17530153574400.0, + "grad_norm": 4.067899624402538, + "language_loss": 0.7341159, + "learning_rate": 1.2149323230521085e-06, + "loss": 0.75547898, + "num_input_tokens_seen": 229494300, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 10637, + "time_per_iteration": 2.4985272884368896 + }, + { + "auxiliary_loss_clip": 0.01105341, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.0159924, + "balance_loss_mlp": 1.03592014, + "epoch": 0.6395911618818578, + "flos": 18588225525120.0, + "grad_norm": 1.8415469934331217, + "language_loss": 0.77680337, + "learning_rate": 1.2145741354013143e-06, + "loss": 0.79814142, + "num_input_tokens_seen": 229512985, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 10638, + "time_per_iteration": 5.310981035232544 + }, + { + "auxiliary_loss_clip": 0.01102761, + "auxiliary_loss_mlp": 0.01028447, + "balance_loss_clip": 1.016675, + "balance_loss_mlp": 1.0358299, + "epoch": 0.6396512851345257, + "flos": 28366557068160.0, + "grad_norm": 3.6995147498561636, + "language_loss": 0.81817627, + "learning_rate": 1.2142159775339478e-06, + "loss": 0.83948827, + "num_input_tokens_seen": 229534270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 10639, + "time_per_iteration": 3.956713914871216 + }, + { + "auxiliary_loss_clip": 0.01024264, + "auxiliary_loss_mlp": 0.0100149, + "balance_loss_clip": 1.00012457, + "balance_loss_mlp": 1.00365281, + "epoch": 0.6397114083871938, + "flos": 70724307202560.0, + "grad_norm": 0.8122323276395823, + "language_loss": 0.59012806, + "learning_rate": 1.21385784946359e-06, + "loss": 0.61038566, + "num_input_tokens_seen": 229596455, + "router_z_loss_clip": 0.01367188, + "router_z_loss_mlp": 0.20605469, + "step": 10640, + "time_per_iteration": 3.01208758354187 + }, + { + "auxiliary_loss_clip": 0.01100429, + "auxiliary_loss_mlp": 0.01025452, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.03550696, + "epoch": 0.6397715316398617, + "flos": 18142537570560.0, + "grad_norm": 1.7939599084799007, + "language_loss": 0.78193939, + "learning_rate": 1.2134997512038215e-06, + "loss": 0.80319822, + "num_input_tokens_seen": 229612860, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 10641, + "time_per_iteration": 2.399609327316284 + }, + { + "auxiliary_loss_clip": 0.01108702, + "auxiliary_loss_mlp": 0.01034799, + "balance_loss_clip": 1.02274656, + "balance_loss_mlp": 1.03676474, + "epoch": 0.6398316548925297, + "flos": 25739512070400.0, + "grad_norm": 22.013815914762134, + "language_loss": 0.63092768, + "learning_rate": 1.2131416827682209e-06, + "loss": 0.65236264, + "num_input_tokens_seen": 229633960, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.71875, + "step": 10642, + "time_per_iteration": 2.4959514141082764 + }, + { + "auxiliary_loss_clip": 0.01024704, + "auxiliary_loss_mlp": 0.01003714, + "balance_loss_clip": 1.00239646, + "balance_loss_mlp": 1.00392115, + "epoch": 0.6398917781451977, + "flos": 71214234756480.0, + "grad_norm": 0.944530378795617, + "language_loss": 0.55960983, + "learning_rate": 1.2127836441703667e-06, + "loss": 0.57989401, + "num_input_tokens_seen": 229686730, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20800781, + "step": 10643, + "time_per_iteration": 2.9914019107818604 + }, + { + "auxiliary_loss_clip": 0.01108117, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01332903, + "balance_loss_mlp": 1.03745127, + "epoch": 0.6399519013978656, + "flos": 20521835487360.0, + "grad_norm": 2.5171801924474764, + "language_loss": 0.77069736, + "learning_rate": 1.2124256354238358e-06, + "loss": 0.79202974, + "num_input_tokens_seen": 229704800, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.70703125, + "step": 10644, + "time_per_iteration": 2.437391996383667 + }, + { + "auxiliary_loss_clip": 0.01104463, + "auxiliary_loss_mlp": 0.01031384, + "balance_loss_clip": 1.01893854, + "balance_loss_mlp": 1.03780031, + "epoch": 0.6400120246505336, + "flos": 24460840742400.0, + "grad_norm": 1.4086380930188218, + "language_loss": 0.82438183, + "learning_rate": 1.212067656542203e-06, + "loss": 0.84574032, + "num_input_tokens_seen": 229725265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 10645, + "time_per_iteration": 2.4806745052337646 + }, + { + "auxiliary_loss_clip": 0.01108703, + "auxiliary_loss_mlp": 0.01034822, + "balance_loss_clip": 1.0219593, + "balance_loss_mlp": 1.03747869, + "epoch": 0.6400721479032015, + "flos": 28366090191360.0, + "grad_norm": 1.670748165032705, + "language_loss": 0.73261863, + "learning_rate": 1.2117097075390447e-06, + "loss": 0.75405383, + "num_input_tokens_seen": 229744840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 10646, + "time_per_iteration": 2.515089988708496 + }, + { + "auxiliary_loss_clip": 0.01105459, + "auxiliary_loss_mlp": 0.01031116, + "balance_loss_clip": 1.01905167, + "balance_loss_mlp": 1.03657353, + "epoch": 0.6401322711558696, + "flos": 17816540711040.0, + "grad_norm": 2.6918825179848747, + "language_loss": 0.79892278, + "learning_rate": 1.2113517884279327e-06, + "loss": 0.82028854, + "num_input_tokens_seen": 229759095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69140625, + "step": 10647, + "time_per_iteration": 2.423576593399048 + }, + { + "auxiliary_loss_clip": 0.01102623, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01801419, + "balance_loss_mlp": 1.03732038, + "epoch": 0.6401923944085375, + "flos": 26030855283840.0, + "grad_norm": 2.4485135437848724, + "language_loss": 0.75737441, + "learning_rate": 1.2109938992224399e-06, + "loss": 0.77869105, + "num_input_tokens_seen": 229777750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 10648, + "time_per_iteration": 2.528726100921631 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01029506, + "balance_loss_clip": 1.01779962, + "balance_loss_mlp": 1.03479123, + "epoch": 0.6402525176612055, + "flos": 23586451966080.0, + "grad_norm": 1.7767786509202286, + "language_loss": 0.78653902, + "learning_rate": 1.210636039936138e-06, + "loss": 0.80786711, + "num_input_tokens_seen": 229796785, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10649, + "time_per_iteration": 2.4528145790100098 + }, + { + "auxiliary_loss_clip": 0.01103744, + "auxiliary_loss_mlp": 0.0103342, + "balance_loss_clip": 1.02100444, + "balance_loss_mlp": 1.03651512, + "epoch": 0.6403126409138734, + "flos": 18041413806720.0, + "grad_norm": 1.6464773742271148, + "language_loss": 0.75819784, + "learning_rate": 1.2102782105825956e-06, + "loss": 0.77956951, + "num_input_tokens_seen": 229815425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 10650, + "time_per_iteration": 2.4333670139312744 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01030825, + "balance_loss_clip": 1.01803422, + "balance_loss_mlp": 1.0351758, + "epoch": 0.6403727641665414, + "flos": 21979485308160.0, + "grad_norm": 1.4678123102603653, + "language_loss": 0.70750296, + "learning_rate": 1.2099204111753833e-06, + "loss": 0.72883749, + "num_input_tokens_seen": 229834545, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 10651, + "time_per_iteration": 2.4399240016937256 + }, + { + "auxiliary_loss_clip": 0.01104316, + "auxiliary_loss_mlp": 0.01037837, + "balance_loss_clip": 1.02509403, + "balance_loss_mlp": 1.03600538, + "epoch": 0.6404328874192093, + "flos": 24895539135360.0, + "grad_norm": 2.264038346674132, + "language_loss": 0.63932753, + "learning_rate": 1.2095626417280684e-06, + "loss": 0.66074908, + "num_input_tokens_seen": 229849175, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 10652, + "time_per_iteration": 2.4656026363372803 + }, + { + "auxiliary_loss_clip": 0.01104729, + "auxiliary_loss_mlp": 0.01029772, + "balance_loss_clip": 1.01769614, + "balance_loss_mlp": 1.03726971, + "epoch": 0.6404930106718774, + "flos": 17597198309760.0, + "grad_norm": 2.2063618593971586, + "language_loss": 0.79597425, + "learning_rate": 1.2092049022542168e-06, + "loss": 0.81731927, + "num_input_tokens_seen": 229865400, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10653, + "time_per_iteration": 2.4099206924438477 + }, + { + "auxiliary_loss_clip": 0.01113277, + "auxiliary_loss_mlp": 0.01045693, + "balance_loss_clip": 1.03203726, + "balance_loss_mlp": 1.03744364, + "epoch": 0.6405531339245453, + "flos": 20157880930560.0, + "grad_norm": 2.172692455677744, + "language_loss": 0.69950652, + "learning_rate": 1.2088471927673952e-06, + "loss": 0.72109628, + "num_input_tokens_seen": 229882945, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.7578125, + "step": 10654, + "time_per_iteration": 2.451249122619629 + }, + { + "auxiliary_loss_clip": 0.01108717, + "auxiliary_loss_mlp": 0.01036203, + "balance_loss_clip": 1.02347147, + "balance_loss_mlp": 1.03717566, + "epoch": 0.6406132571772133, + "flos": 21942281796480.0, + "grad_norm": 1.7648347923503578, + "language_loss": 0.72763705, + "learning_rate": 1.2084895132811666e-06, + "loss": 0.74908626, + "num_input_tokens_seen": 229901590, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.71875, + "step": 10655, + "time_per_iteration": 2.4311604499816895 + }, + { + "auxiliary_loss_clip": 0.01106611, + "auxiliary_loss_mlp": 0.0103335, + "balance_loss_clip": 1.02160144, + "balance_loss_mlp": 1.0368948, + "epoch": 0.6406733804298813, + "flos": 28768002445440.0, + "grad_norm": 1.5980795641640981, + "language_loss": 0.83070755, + "learning_rate": 1.2081318638090952e-06, + "loss": 0.85210717, + "num_input_tokens_seen": 229922535, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69921875, + "step": 10656, + "time_per_iteration": 2.5178308486938477 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.02264667, + "balance_loss_mlp": 1.03502929, + "epoch": 0.6407335036825492, + "flos": 17457183095040.0, + "grad_norm": 2.258129795094631, + "language_loss": 0.72108161, + "learning_rate": 1.2077742443647433e-06, + "loss": 0.74245739, + "num_input_tokens_seen": 229939575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 10657, + "time_per_iteration": 2.422863483428955 + }, + { + "auxiliary_loss_clip": 0.01103006, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.02086604, + "balance_loss_mlp": 1.03499269, + "epoch": 0.6407936269352172, + "flos": 22125282612480.0, + "grad_norm": 2.427174353089587, + "language_loss": 0.7728945, + "learning_rate": 1.2074166549616707e-06, + "loss": 0.79424977, + "num_input_tokens_seen": 229958840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 10658, + "time_per_iteration": 2.449277877807617 + }, + { + "auxiliary_loss_clip": 0.01108084, + "auxiliary_loss_mlp": 0.01039076, + "balance_loss_clip": 1.02629066, + "balance_loss_mlp": 1.03781724, + "epoch": 0.6408537501878852, + "flos": 23110635479040.0, + "grad_norm": 1.5608188078670746, + "language_loss": 0.7607885, + "learning_rate": 1.2070590956134386e-06, + "loss": 0.78226012, + "num_input_tokens_seen": 229979680, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10659, + "time_per_iteration": 2.4464104175567627 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.0103387, + "balance_loss_clip": 1.02172303, + "balance_loss_mlp": 1.03568363, + "epoch": 0.6409138734405532, + "flos": 16472440759680.0, + "grad_norm": 1.6810966877518245, + "language_loss": 0.78276753, + "learning_rate": 1.2067015663336046e-06, + "loss": 0.80414379, + "num_input_tokens_seen": 229996830, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 10660, + "time_per_iteration": 2.463932752609253 + }, + { + "auxiliary_loss_clip": 0.01110744, + "auxiliary_loss_mlp": 0.01037101, + "balance_loss_clip": 1.023785, + "balance_loss_mlp": 1.03830671, + "epoch": 0.6409739966932211, + "flos": 22777922776320.0, + "grad_norm": 2.1049933789165727, + "language_loss": 0.68227595, + "learning_rate": 1.206344067135727e-06, + "loss": 0.70375443, + "num_input_tokens_seen": 230015115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7265625, + "step": 10661, + "time_per_iteration": 2.4437673091888428 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01036782, + "balance_loss_clip": 1.02566016, + "balance_loss_mlp": 1.0374167, + "epoch": 0.6410341199458891, + "flos": 25152049134720.0, + "grad_norm": 1.4944389143541703, + "language_loss": 0.75839317, + "learning_rate": 1.205986598033362e-06, + "loss": 0.77979672, + "num_input_tokens_seen": 230035515, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 10662, + "time_per_iteration": 2.4985625743865967 + }, + { + "auxiliary_loss_clip": 0.01102338, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.01905084, + "balance_loss_mlp": 1.03421175, + "epoch": 0.641094243198557, + "flos": 27046193028480.0, + "grad_norm": 1.8768391350540305, + "language_loss": 0.69502836, + "learning_rate": 1.2056291590400644e-06, + "loss": 0.71635342, + "num_input_tokens_seen": 230054355, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10663, + "time_per_iteration": 2.479556083679199 + }, + { + "auxiliary_loss_clip": 0.01106696, + "auxiliary_loss_mlp": 0.01040197, + "balance_loss_clip": 1.02654743, + "balance_loss_mlp": 1.03724718, + "epoch": 0.641154366451225, + "flos": 25374551932800.0, + "grad_norm": 2.235560561918587, + "language_loss": 0.68056524, + "learning_rate": 1.205271750169389e-06, + "loss": 0.70203424, + "num_input_tokens_seen": 230074605, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 10664, + "time_per_iteration": 2.490736484527588 + }, + { + "auxiliary_loss_clip": 0.01101883, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01797128, + "balance_loss_mlp": 1.03587985, + "epoch": 0.6412144897038929, + "flos": 25153342024320.0, + "grad_norm": 1.8443375686405623, + "language_loss": 0.66447258, + "learning_rate": 1.2049143714348881e-06, + "loss": 0.68578362, + "num_input_tokens_seen": 230093820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 10665, + "time_per_iteration": 2.4581611156463623 + }, + { + "auxiliary_loss_clip": 0.01102013, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.01801181, + "balance_loss_mlp": 1.03565812, + "epoch": 0.641274612956561, + "flos": 23440762402560.0, + "grad_norm": 1.9911859706917303, + "language_loss": 0.64523447, + "learning_rate": 1.2045570228501145e-06, + "loss": 0.66655302, + "num_input_tokens_seen": 230114285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10666, + "time_per_iteration": 2.4770736694335938 + }, + { + "auxiliary_loss_clip": 0.01105742, + "auxiliary_loss_mlp": 0.01031432, + "balance_loss_clip": 1.01933265, + "balance_loss_mlp": 1.03609776, + "epoch": 0.6413347362092289, + "flos": 19427493778560.0, + "grad_norm": 1.666384333420834, + "language_loss": 0.7067616, + "learning_rate": 1.2041997044286176e-06, + "loss": 0.72813338, + "num_input_tokens_seen": 230132760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10667, + "time_per_iteration": 2.407938003540039 + }, + { + "auxiliary_loss_clip": 0.01114508, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.02077127, + "balance_loss_mlp": 1.03901672, + "epoch": 0.6413948594618969, + "flos": 17196578945280.0, + "grad_norm": 2.2700946721922874, + "language_loss": 0.77413416, + "learning_rate": 1.2038424161839484e-06, + "loss": 0.79562223, + "num_input_tokens_seen": 230149690, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.75390625, + "step": 10668, + "time_per_iteration": 2.421332836151123 + }, + { + "auxiliary_loss_clip": 0.01105498, + "auxiliary_loss_mlp": 0.01034123, + "balance_loss_clip": 1.02199399, + "balance_loss_mlp": 1.0376507, + "epoch": 0.6414549827145648, + "flos": 22269787027200.0, + "grad_norm": 1.6100109548180268, + "language_loss": 0.67520595, + "learning_rate": 1.2034851581296544e-06, + "loss": 0.69660217, + "num_input_tokens_seen": 230166950, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 10669, + "time_per_iteration": 2.426586866378784 + }, + { + "auxiliary_loss_clip": 0.01112518, + "auxiliary_loss_mlp": 0.01037501, + "balance_loss_clip": 1.02510333, + "balance_loss_mlp": 1.03997803, + "epoch": 0.6415151059672328, + "flos": 19640192163840.0, + "grad_norm": 1.7319389151723867, + "language_loss": 0.78258085, + "learning_rate": 1.2031279302792825e-06, + "loss": 0.80408102, + "num_input_tokens_seen": 230184785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7265625, + "step": 10670, + "time_per_iteration": 2.469668388366699 + }, + { + "auxiliary_loss_clip": 0.01108443, + "auxiliary_loss_mlp": 0.01034621, + "balance_loss_clip": 1.02200222, + "balance_loss_mlp": 1.03697228, + "epoch": 0.6415752292199008, + "flos": 14865833237760.0, + "grad_norm": 2.2181025019747445, + "language_loss": 0.88322049, + "learning_rate": 1.20277073264638e-06, + "loss": 0.90465117, + "num_input_tokens_seen": 230201385, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.71484375, + "step": 10671, + "time_per_iteration": 2.391927480697632 + }, + { + "auxiliary_loss_clip": 0.01103513, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.01708126, + "balance_loss_mlp": 1.03752613, + "epoch": 0.6416353524725688, + "flos": 13735580906880.0, + "grad_norm": 1.4861712883005815, + "language_loss": 0.69451904, + "learning_rate": 1.2024135652444907e-06, + "loss": 0.71583843, + "num_input_tokens_seen": 230220380, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 10672, + "time_per_iteration": 2.4214959144592285 + }, + { + "auxiliary_loss_clip": 0.01109224, + "auxiliary_loss_mlp": 0.0102904, + "balance_loss_clip": 1.01554513, + "balance_loss_mlp": 1.03705025, + "epoch": 0.6416954757252368, + "flos": 24534924543360.0, + "grad_norm": 1.748656888764651, + "language_loss": 0.7392627, + "learning_rate": 1.2020564280871593e-06, + "loss": 0.76064527, + "num_input_tokens_seen": 230239845, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.72265625, + "step": 10673, + "time_per_iteration": 2.4611282348632812 + }, + { + "auxiliary_loss_clip": 0.01104131, + "auxiliary_loss_mlp": 0.01034077, + "balance_loss_clip": 1.0213933, + "balance_loss_mlp": 1.03559685, + "epoch": 0.6417555989779047, + "flos": 27710002321920.0, + "grad_norm": 1.56139787015984, + "language_loss": 0.69352114, + "learning_rate": 1.2016993211879283e-06, + "loss": 0.71490324, + "num_input_tokens_seen": 230262420, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 10674, + "time_per_iteration": 2.5161702632904053 + }, + { + "auxiliary_loss_clip": 0.01109387, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01650524, + "balance_loss_mlp": 1.03618658, + "epoch": 0.6418157222305727, + "flos": 20556632787840.0, + "grad_norm": 1.8510668186633226, + "language_loss": 0.66126549, + "learning_rate": 1.201342244560338e-06, + "loss": 0.68265229, + "num_input_tokens_seen": 230279950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.73046875, + "step": 10675, + "time_per_iteration": 2.4155290126800537 + }, + { + "auxiliary_loss_clip": 0.01106276, + "auxiliary_loss_mlp": 0.01034914, + "balance_loss_clip": 1.02304077, + "balance_loss_mlp": 1.03823316, + "epoch": 0.6418758454832406, + "flos": 22601530062720.0, + "grad_norm": 2.2027244466364486, + "language_loss": 0.66607732, + "learning_rate": 1.2009851982179307e-06, + "loss": 0.68748927, + "num_input_tokens_seen": 230299705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10676, + "time_per_iteration": 2.490659713745117 + }, + { + "auxiliary_loss_clip": 0.01108966, + "auxiliary_loss_mlp": 0.01030084, + "balance_loss_clip": 1.01685786, + "balance_loss_mlp": 1.03876162, + "epoch": 0.6419359687359086, + "flos": 27375098889600.0, + "grad_norm": 2.097581634404412, + "language_loss": 0.75956476, + "learning_rate": 1.2006281821742446e-06, + "loss": 0.7809552, + "num_input_tokens_seen": 230320030, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 10677, + "time_per_iteration": 3.9567973613739014 + }, + { + "auxiliary_loss_clip": 0.01026179, + "auxiliary_loss_mlp": 0.00997901, + "balance_loss_clip": 0.99666101, + "balance_loss_mlp": 1.00533533, + "epoch": 0.6419960919885765, + "flos": 67251924552960.0, + "grad_norm": 0.8065212839738138, + "language_loss": 0.60730147, + "learning_rate": 1.200271196442818e-06, + "loss": 0.62754226, + "num_input_tokens_seen": 230381495, + "router_z_loss_clip": 0.01239014, + "router_z_loss_mlp": 0.20898438, + "step": 10678, + "time_per_iteration": 3.13420033454895 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.01033582, + "balance_loss_clip": 1.02209604, + "balance_loss_mlp": 1.03742027, + "epoch": 0.6420562152412446, + "flos": 19901873721600.0, + "grad_norm": 1.6963549464247227, + "language_loss": 0.67299467, + "learning_rate": 1.1999142410371875e-06, + "loss": 0.69436979, + "num_input_tokens_seen": 230401385, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 10679, + "time_per_iteration": 3.808528423309326 + }, + { + "auxiliary_loss_clip": 0.01108558, + "auxiliary_loss_mlp": 0.01029627, + "balance_loss_clip": 1.01666307, + "balance_loss_mlp": 1.03855729, + "epoch": 0.6421163384939125, + "flos": 24790177566720.0, + "grad_norm": 1.6996500318605585, + "language_loss": 0.72910142, + "learning_rate": 1.1995573159708897e-06, + "loss": 0.75048327, + "num_input_tokens_seen": 230421340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 10680, + "time_per_iteration": 3.8477213382720947 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01026838, + "balance_loss_clip": 1.01612723, + "balance_loss_mlp": 1.03545952, + "epoch": 0.6421764617465805, + "flos": 25592816926080.0, + "grad_norm": 1.7403495519820134, + "language_loss": 0.67876667, + "learning_rate": 1.1992004212574582e-06, + "loss": 0.70006758, + "num_input_tokens_seen": 230441270, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6796875, + "step": 10681, + "time_per_iteration": 3.919956922531128 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01029189, + "balance_loss_clip": 1.01748252, + "balance_loss_mlp": 1.03434682, + "epoch": 0.6422365849992484, + "flos": 14134727813760.0, + "grad_norm": 1.5976000618825759, + "language_loss": 0.74644732, + "learning_rate": 1.198843556910427e-06, + "loss": 0.76775151, + "num_input_tokens_seen": 230457455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10682, + "time_per_iteration": 2.4222958087921143 + }, + { + "auxiliary_loss_clip": 0.01099045, + "auxiliary_loss_mlp": 0.01030599, + "balance_loss_clip": 1.01960802, + "balance_loss_mlp": 1.0343014, + "epoch": 0.6422967082519164, + "flos": 22383911514240.0, + "grad_norm": 1.48329541818395, + "language_loss": 0.79282379, + "learning_rate": 1.1984867229433287e-06, + "loss": 0.81412017, + "num_input_tokens_seen": 230478955, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 10683, + "time_per_iteration": 2.4635698795318604 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01036635, + "balance_loss_clip": 1.0240351, + "balance_loss_mlp": 1.03679943, + "epoch": 0.6423568315045844, + "flos": 14647927380480.0, + "grad_norm": 1.6292181520500175, + "language_loss": 0.67376101, + "learning_rate": 1.1981299193696941e-06, + "loss": 0.69518769, + "num_input_tokens_seen": 230496425, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 10684, + "time_per_iteration": 2.41907000541687 + }, + { + "auxiliary_loss_clip": 0.01104447, + "auxiliary_loss_mlp": 0.01028123, + "balance_loss_clip": 1.01595724, + "balance_loss_mlp": 1.03616428, + "epoch": 0.6424169547572524, + "flos": 26833925606400.0, + "grad_norm": 2.2028301911766976, + "language_loss": 0.71436971, + "learning_rate": 1.1977731462030533e-06, + "loss": 0.73569536, + "num_input_tokens_seen": 230516245, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 10685, + "time_per_iteration": 2.471905469894409 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.0210917, + "balance_loss_mlp": 1.0360837, + "epoch": 0.6424770780099204, + "flos": 22707430335360.0, + "grad_norm": 1.599317002960078, + "language_loss": 0.75343961, + "learning_rate": 1.197416403456935e-06, + "loss": 0.77478087, + "num_input_tokens_seen": 230534745, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 10686, + "time_per_iteration": 2.4540653228759766 + }, + { + "auxiliary_loss_clip": 0.0110856, + "auxiliary_loss_mlp": 0.01033102, + "balance_loss_clip": 1.02034068, + "balance_loss_mlp": 1.03813434, + "epoch": 0.6425372012625883, + "flos": 28469512425600.0, + "grad_norm": 2.1016215045747684, + "language_loss": 0.6875909, + "learning_rate": 1.197059691144867e-06, + "loss": 0.70900756, + "num_input_tokens_seen": 230555895, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 10687, + "time_per_iteration": 2.4797768592834473 + }, + { + "auxiliary_loss_clip": 0.01107085, + "auxiliary_loss_mlp": 0.01030589, + "balance_loss_clip": 1.01875103, + "balance_loss_mlp": 1.03763437, + "epoch": 0.6425973245152563, + "flos": 29351694453120.0, + "grad_norm": 2.024359307432863, + "language_loss": 0.66338682, + "learning_rate": 1.1967030092803767e-06, + "loss": 0.68476355, + "num_input_tokens_seen": 230577460, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6953125, + "step": 10688, + "time_per_iteration": 2.515491247177124 + }, + { + "auxiliary_loss_clip": 0.01103677, + "auxiliary_loss_mlp": 0.01029681, + "balance_loss_clip": 1.01751018, + "balance_loss_mlp": 1.03563595, + "epoch": 0.6426574477679242, + "flos": 16430388912000.0, + "grad_norm": 1.8678327137671962, + "language_loss": 0.73044169, + "learning_rate": 1.1963463578769876e-06, + "loss": 0.75177526, + "num_input_tokens_seen": 230595030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10689, + "time_per_iteration": 2.442413806915283 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.01890218, + "balance_loss_mlp": 1.03588271, + "epoch": 0.6427175710205922, + "flos": 21835914647040.0, + "grad_norm": 2.3454318131191485, + "language_loss": 0.72232103, + "learning_rate": 1.195989736948226e-06, + "loss": 0.74363381, + "num_input_tokens_seen": 230615135, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.66015625, + "step": 10690, + "time_per_iteration": 2.471299648284912 + }, + { + "auxiliary_loss_clip": 0.01102076, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01679587, + "balance_loss_mlp": 1.03589702, + "epoch": 0.6427776942732601, + "flos": 17786627660160.0, + "grad_norm": 1.751175955717072, + "language_loss": 0.77973688, + "learning_rate": 1.1956331465076143e-06, + "loss": 0.80104017, + "num_input_tokens_seen": 230631965, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 10691, + "time_per_iteration": 2.405625581741333 + }, + { + "auxiliary_loss_clip": 0.01107533, + "auxiliary_loss_mlp": 0.01036737, + "balance_loss_clip": 1.02516222, + "balance_loss_mlp": 1.03734851, + "epoch": 0.6428378175259282, + "flos": 15085893911040.0, + "grad_norm": 1.7365524827328973, + "language_loss": 0.74180853, + "learning_rate": 1.1952765865686738e-06, + "loss": 0.76325125, + "num_input_tokens_seen": 230649565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 10692, + "time_per_iteration": 2.4545161724090576 + }, + { + "auxiliary_loss_clip": 0.01105895, + "auxiliary_loss_mlp": 0.01032234, + "balance_loss_clip": 1.02066517, + "balance_loss_mlp": 1.03752697, + "epoch": 0.6428979407785961, + "flos": 23841776816640.0, + "grad_norm": 1.783950417735838, + "language_loss": 0.61135745, + "learning_rate": 1.1949200571449263e-06, + "loss": 0.63273877, + "num_input_tokens_seen": 230669265, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 10693, + "time_per_iteration": 2.443671464920044 + }, + { + "auxiliary_loss_clip": 0.01107731, + "auxiliary_loss_mlp": 0.010259, + "balance_loss_clip": 1.01348996, + "balance_loss_mlp": 1.03660131, + "epoch": 0.6429580640312641, + "flos": 32926852892160.0, + "grad_norm": 1.580411610275865, + "language_loss": 0.59667271, + "learning_rate": 1.1945635582498903e-06, + "loss": 0.61800897, + "num_input_tokens_seen": 230690575, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10694, + "time_per_iteration": 2.539658308029175 + }, + { + "auxiliary_loss_clip": 0.01107722, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.0150162, + "balance_loss_mlp": 1.03852546, + "epoch": 0.643018187283932, + "flos": 21068359896960.0, + "grad_norm": 1.3391279253609552, + "language_loss": 0.79716361, + "learning_rate": 1.1942070898970853e-06, + "loss": 0.81850976, + "num_input_tokens_seen": 230709420, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 10695, + "time_per_iteration": 2.4294402599334717 + }, + { + "auxiliary_loss_clip": 0.01105962, + "auxiliary_loss_mlp": 0.01036859, + "balance_loss_clip": 1.02474165, + "balance_loss_mlp": 1.03641272, + "epoch": 0.6430783105366, + "flos": 26724649455360.0, + "grad_norm": 2.3258756947072112, + "language_loss": 0.73518264, + "learning_rate": 1.1938506521000285e-06, + "loss": 0.75661093, + "num_input_tokens_seen": 230729350, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 10696, + "time_per_iteration": 2.502713441848755 + }, + { + "auxiliary_loss_clip": 0.01102941, + "auxiliary_loss_mlp": 0.01026977, + "balance_loss_clip": 1.01575983, + "balance_loss_mlp": 1.03764093, + "epoch": 0.643138433789268, + "flos": 23696841438720.0, + "grad_norm": 2.7414253907465636, + "language_loss": 0.7579782, + "learning_rate": 1.1934942448722347e-06, + "loss": 0.77927744, + "num_input_tokens_seen": 230749220, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 10697, + "time_per_iteration": 2.4447250366210938 + }, + { + "auxiliary_loss_clip": 0.01102432, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01867306, + "balance_loss_mlp": 1.03607345, + "epoch": 0.643198557041936, + "flos": 34202184255360.0, + "grad_norm": 1.4042502284177218, + "language_loss": 0.6627214, + "learning_rate": 1.1931378682272208e-06, + "loss": 0.68404424, + "num_input_tokens_seen": 230770245, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10698, + "time_per_iteration": 2.585150718688965 + }, + { + "auxiliary_loss_clip": 0.01025803, + "auxiliary_loss_mlp": 0.01004446, + "balance_loss_clip": 1.00322425, + "balance_loss_mlp": 1.00514603, + "epoch": 0.643258680294604, + "flos": 67626473621760.0, + "grad_norm": 0.8344250970478979, + "language_loss": 0.63460743, + "learning_rate": 1.1927815221784996e-06, + "loss": 0.65490991, + "num_input_tokens_seen": 230837030, + "router_z_loss_clip": 0.01220703, + "router_z_loss_mlp": 0.20703125, + "step": 10699, + "time_per_iteration": 3.024700403213501 + }, + { + "auxiliary_loss_clip": 0.01103086, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.01535618, + "balance_loss_mlp": 1.03705359, + "epoch": 0.6433188035472719, + "flos": 25185984508800.0, + "grad_norm": 1.912981795070525, + "language_loss": 0.6912387, + "learning_rate": 1.1924252067395838e-06, + "loss": 0.71253234, + "num_input_tokens_seen": 230856845, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 10700, + "time_per_iteration": 2.4683825969696045 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01447868, + "balance_loss_mlp": 1.03590679, + "epoch": 0.6433789267999399, + "flos": 24973573432320.0, + "grad_norm": 1.7070737124865907, + "language_loss": 0.73354918, + "learning_rate": 1.1920689219239855e-06, + "loss": 0.75484824, + "num_input_tokens_seen": 230878785, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 10701, + "time_per_iteration": 2.4831302165985107 + }, + { + "auxiliary_loss_clip": 0.01105062, + "auxiliary_loss_mlp": 0.01028305, + "balance_loss_clip": 1.01542449, + "balance_loss_mlp": 1.03474069, + "epoch": 0.6434390500526078, + "flos": 17566028282880.0, + "grad_norm": 1.878097796503538, + "language_loss": 0.81941777, + "learning_rate": 1.1917126677452144e-06, + "loss": 0.84075147, + "num_input_tokens_seen": 230895445, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 10702, + "time_per_iteration": 2.468240261077881 + }, + { + "auxiliary_loss_clip": 0.01101591, + "auxiliary_loss_mlp": 0.01033917, + "balance_loss_clip": 1.02235985, + "balance_loss_mlp": 1.03552771, + "epoch": 0.6434991733052758, + "flos": 20843594542080.0, + "grad_norm": 1.8640854274416083, + "language_loss": 0.74179298, + "learning_rate": 1.1913564442167798e-06, + "loss": 0.76314807, + "num_input_tokens_seen": 230911375, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 10703, + "time_per_iteration": 2.413569688796997 + }, + { + "auxiliary_loss_clip": 0.01025343, + "auxiliary_loss_mlp": 0.00998028, + "balance_loss_clip": 0.99697268, + "balance_loss_mlp": 1.0046978, + "epoch": 0.6435592965579437, + "flos": 66094596345600.0, + "grad_norm": 0.6508795205779913, + "language_loss": 0.54642779, + "learning_rate": 1.1910002513521898e-06, + "loss": 0.56666148, + "num_input_tokens_seen": 230975990, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20703125, + "step": 10704, + "time_per_iteration": 3.0236172676086426 + }, + { + "auxiliary_loss_clip": 0.01102168, + "auxiliary_loss_mlp": 0.01022828, + "balance_loss_clip": 1.01258826, + "balance_loss_mlp": 1.03476024, + "epoch": 0.6436194198106118, + "flos": 23768842250880.0, + "grad_norm": 1.5730519252717787, + "language_loss": 0.76976264, + "learning_rate": 1.1906440891649519e-06, + "loss": 0.79101259, + "num_input_tokens_seen": 230997110, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.67578125, + "step": 10705, + "time_per_iteration": 2.455488443374634 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01036328, + "balance_loss_clip": 1.02475905, + "balance_loss_mlp": 1.0358727, + "epoch": 0.6436795430632797, + "flos": 20230312705920.0, + "grad_norm": 1.7440813911831818, + "language_loss": 0.7908684, + "learning_rate": 1.1902879576685708e-06, + "loss": 0.81227219, + "num_input_tokens_seen": 231015590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 10706, + "time_per_iteration": 2.449542760848999 + }, + { + "auxiliary_loss_clip": 0.01103828, + "auxiliary_loss_mlp": 0.0103175, + "balance_loss_clip": 1.01926923, + "balance_loss_mlp": 1.0355916, + "epoch": 0.6437396663159477, + "flos": 20301846641280.0, + "grad_norm": 2.1755935090023164, + "language_loss": 0.80497181, + "learning_rate": 1.1899318568765518e-06, + "loss": 0.82632756, + "num_input_tokens_seen": 231033800, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 10707, + "time_per_iteration": 2.416238784790039 + }, + { + "auxiliary_loss_clip": 0.01103614, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.01691961, + "balance_loss_mlp": 1.03542554, + "epoch": 0.6437997895686156, + "flos": 23878585278720.0, + "grad_norm": 1.7933979371525552, + "language_loss": 0.85400867, + "learning_rate": 1.1895757868023978e-06, + "loss": 0.87533092, + "num_input_tokens_seen": 231053160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10708, + "time_per_iteration": 2.4596221446990967 + }, + { + "auxiliary_loss_clip": 0.01113539, + "auxiliary_loss_mlp": 0.01039123, + "balance_loss_clip": 1.02577186, + "balance_loss_mlp": 1.03982544, + "epoch": 0.6438599128212836, + "flos": 18989275852800.0, + "grad_norm": 2.314624765830387, + "language_loss": 0.65632617, + "learning_rate": 1.1892197474596106e-06, + "loss": 0.67785281, + "num_input_tokens_seen": 231069470, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.734375, + "step": 10709, + "time_per_iteration": 2.414792776107788 + }, + { + "auxiliary_loss_clip": 0.01101587, + "auxiliary_loss_mlp": 0.01027315, + "balance_loss_clip": 1.01617527, + "balance_loss_mlp": 1.03474462, + "epoch": 0.6439200360739517, + "flos": 24096347481600.0, + "grad_norm": 2.1321707309255196, + "language_loss": 0.80428755, + "learning_rate": 1.1888637388616929e-06, + "loss": 0.8255766, + "num_input_tokens_seen": 231088205, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 10710, + "time_per_iteration": 2.475790500640869 + }, + { + "auxiliary_loss_clip": 0.01101022, + "auxiliary_loss_mlp": 0.01027519, + "balance_loss_clip": 1.01573479, + "balance_loss_mlp": 1.03475547, + "epoch": 0.6439801593266196, + "flos": 31902141697920.0, + "grad_norm": 1.6745994206662376, + "language_loss": 0.66166174, + "learning_rate": 1.1885077610221425e-06, + "loss": 0.68294716, + "num_input_tokens_seen": 231107850, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10711, + "time_per_iteration": 2.502237319946289 + }, + { + "auxiliary_loss_clip": 0.0110763, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.0150764, + "balance_loss_mlp": 1.03871155, + "epoch": 0.6440402825792876, + "flos": 27125879351040.0, + "grad_norm": 1.56251052314253, + "language_loss": 0.78744113, + "learning_rate": 1.1881518139544597e-06, + "loss": 0.80879122, + "num_input_tokens_seen": 231127200, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 10712, + "time_per_iteration": 2.4865529537200928 + }, + { + "auxiliary_loss_clip": 0.01106973, + "auxiliary_loss_mlp": 0.01033455, + "balance_loss_clip": 1.0215044, + "balance_loss_mlp": 1.03622448, + "epoch": 0.6441004058319555, + "flos": 20667704618880.0, + "grad_norm": 1.5577972768959576, + "language_loss": 0.82686722, + "learning_rate": 1.1877958976721417e-06, + "loss": 0.84827155, + "num_input_tokens_seen": 231146360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.70703125, + "step": 10713, + "time_per_iteration": 2.4358584880828857 + }, + { + "auxiliary_loss_clip": 0.01101375, + "auxiliary_loss_mlp": 0.01035112, + "balance_loss_clip": 1.02368593, + "balance_loss_mlp": 1.03669071, + "epoch": 0.6441605290846235, + "flos": 26026006947840.0, + "grad_norm": 1.4453495865190145, + "language_loss": 0.78343773, + "learning_rate": 1.187440012188684e-06, + "loss": 0.80480266, + "num_input_tokens_seen": 231168350, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10714, + "time_per_iteration": 2.4839279651641846 + }, + { + "auxiliary_loss_clip": 0.01102157, + "auxiliary_loss_mlp": 0.01031268, + "balance_loss_clip": 1.02021146, + "balance_loss_mlp": 1.03580499, + "epoch": 0.6442206523372914, + "flos": 24899489631360.0, + "grad_norm": 1.5121330908882218, + "language_loss": 0.81442875, + "learning_rate": 1.187084157517583e-06, + "loss": 0.83576298, + "num_input_tokens_seen": 231188385, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10715, + "time_per_iteration": 2.4751946926116943 + }, + { + "auxiliary_loss_clip": 0.01103061, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01841354, + "balance_loss_mlp": 1.03416896, + "epoch": 0.6442807755899594, + "flos": 25156322853120.0, + "grad_norm": 1.858940461069926, + "language_loss": 0.81107575, + "learning_rate": 1.186728333672332e-06, + "loss": 0.83240604, + "num_input_tokens_seen": 231209880, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10716, + "time_per_iteration": 2.506404161453247 + }, + { + "auxiliary_loss_clip": 0.01106307, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02118576, + "balance_loss_mlp": 1.03650761, + "epoch": 0.6443408988426274, + "flos": 27344503480320.0, + "grad_norm": 1.7227597977263103, + "language_loss": 0.77839047, + "learning_rate": 1.186372540666424e-06, + "loss": 0.79979855, + "num_input_tokens_seen": 231230765, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 10717, + "time_per_iteration": 2.4654810428619385 + }, + { + "auxiliary_loss_clip": 0.01102271, + "auxiliary_loss_mlp": 0.01028495, + "balance_loss_clip": 1.01759315, + "balance_loss_mlp": 1.03718793, + "epoch": 0.6444010220952954, + "flos": 27928339142400.0, + "grad_norm": 1.6109335148111539, + "language_loss": 0.68141425, + "learning_rate": 1.1860167785133513e-06, + "loss": 0.70272195, + "num_input_tokens_seen": 231252350, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 10718, + "time_per_iteration": 3.9740819931030273 + }, + { + "auxiliary_loss_clip": 0.0102484, + "auxiliary_loss_mlp": 0.01004792, + "balance_loss_clip": 1.00373709, + "balance_loss_mlp": 1.00415778, + "epoch": 0.6444611453479633, + "flos": 71215024855680.0, + "grad_norm": 0.7588040526175028, + "language_loss": 0.49665093, + "learning_rate": 1.185661047226603e-06, + "loss": 0.51694727, + "num_input_tokens_seen": 231313865, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20703125, + "step": 10719, + "time_per_iteration": 3.2171850204467773 + }, + { + "auxiliary_loss_clip": 0.01108486, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0216434, + "balance_loss_mlp": 1.03927541, + "epoch": 0.6445212686006313, + "flos": 22705131864960.0, + "grad_norm": 2.0805005783182415, + "language_loss": 0.78263915, + "learning_rate": 1.18530534681967e-06, + "loss": 0.80406547, + "num_input_tokens_seen": 231331710, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 10720, + "time_per_iteration": 2.434246301651001 + }, + { + "auxiliary_loss_clip": 0.01102308, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.02056015, + "balance_loss_mlp": 1.03513026, + "epoch": 0.6445813918532992, + "flos": 21178821196800.0, + "grad_norm": 1.6971626147342385, + "language_loss": 0.76729137, + "learning_rate": 1.18494967730604e-06, + "loss": 0.78864217, + "num_input_tokens_seen": 231350705, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 10721, + "time_per_iteration": 5.301208734512329 + }, + { + "auxiliary_loss_clip": 0.01102301, + "auxiliary_loss_mlp": 0.01031144, + "balance_loss_clip": 1.01885331, + "balance_loss_mlp": 1.03417397, + "epoch": 0.6446415151059672, + "flos": 25191910252800.0, + "grad_norm": 2.4666147768058, + "language_loss": 0.73236001, + "learning_rate": 1.1845940386991995e-06, + "loss": 0.75369453, + "num_input_tokens_seen": 231369550, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 10722, + "time_per_iteration": 3.918328046798706 + }, + { + "auxiliary_loss_clip": 0.01101304, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01602662, + "balance_loss_mlp": 1.03587341, + "epoch": 0.6447016383586353, + "flos": 25302227898240.0, + "grad_norm": 2.1714179391362074, + "language_loss": 0.78181046, + "learning_rate": 1.184238431012635e-06, + "loss": 0.80309272, + "num_input_tokens_seen": 231389285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 10723, + "time_per_iteration": 2.485879421234131 + }, + { + "auxiliary_loss_clip": 0.01108308, + "auxiliary_loss_mlp": 0.01030662, + "balance_loss_clip": 1.01842475, + "balance_loss_mlp": 1.03774381, + "epoch": 0.6447617616113032, + "flos": 27703142824320.0, + "grad_norm": 1.8069876028647023, + "language_loss": 0.58755672, + "learning_rate": 1.1838828542598312e-06, + "loss": 0.60894638, + "num_input_tokens_seen": 231408820, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.703125, + "step": 10724, + "time_per_iteration": 2.478766679763794 + }, + { + "auxiliary_loss_clip": 0.0110091, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.02043772, + "balance_loss_mlp": 1.03629243, + "epoch": 0.6448218848639712, + "flos": 23039101543680.0, + "grad_norm": 1.6726755912827203, + "language_loss": 0.83442616, + "learning_rate": 1.183527308454271e-06, + "loss": 0.85574543, + "num_input_tokens_seen": 231428100, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 10725, + "time_per_iteration": 2.4473166465759277 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02134895, + "balance_loss_mlp": 1.03365588, + "epoch": 0.6448820081166391, + "flos": 24496104919680.0, + "grad_norm": 1.7120227863307491, + "language_loss": 0.82104886, + "learning_rate": 1.1831717936094368e-06, + "loss": 0.84239936, + "num_input_tokens_seen": 231445810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 10726, + "time_per_iteration": 2.4571003913879395 + }, + { + "auxiliary_loss_clip": 0.01106369, + "auxiliary_loss_mlp": 0.01031146, + "balance_loss_clip": 1.01877189, + "balance_loss_mlp": 1.03662455, + "epoch": 0.6449421313693071, + "flos": 22419283432320.0, + "grad_norm": 3.203326603634113, + "language_loss": 0.80919254, + "learning_rate": 1.1828163097388108e-06, + "loss": 0.83056766, + "num_input_tokens_seen": 231463570, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10727, + "time_per_iteration": 2.4684529304504395 + }, + { + "auxiliary_loss_clip": 0.01109129, + "auxiliary_loss_mlp": 0.0103185, + "balance_loss_clip": 1.01939309, + "balance_loss_mlp": 1.03661084, + "epoch": 0.645002254621975, + "flos": 20225715765120.0, + "grad_norm": 2.8311253143889514, + "language_loss": 0.7950902, + "learning_rate": 1.1824608568558717e-06, + "loss": 0.81649995, + "num_input_tokens_seen": 231482155, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7265625, + "step": 10728, + "time_per_iteration": 2.420926094055176 + }, + { + "auxiliary_loss_clip": 0.0110447, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02040493, + "balance_loss_mlp": 1.03509378, + "epoch": 0.645062377874643, + "flos": 27855440490240.0, + "grad_norm": 1.688837212564324, + "language_loss": 0.74242163, + "learning_rate": 1.1821054349740988e-06, + "loss": 0.76379651, + "num_input_tokens_seen": 231502465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10729, + "time_per_iteration": 2.5284883975982666 + }, + { + "auxiliary_loss_clip": 0.01105519, + "auxiliary_loss_mlp": 0.01032833, + "balance_loss_clip": 1.02004814, + "balance_loss_mlp": 1.03606546, + "epoch": 0.645122501127311, + "flos": 25301509626240.0, + "grad_norm": 1.7461235371462989, + "language_loss": 0.66486406, + "learning_rate": 1.1817500441069706e-06, + "loss": 0.68624759, + "num_input_tokens_seen": 231522740, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10730, + "time_per_iteration": 2.472608804702759 + }, + { + "auxiliary_loss_clip": 0.01105349, + "auxiliary_loss_mlp": 0.0103343, + "balance_loss_clip": 1.02041864, + "balance_loss_mlp": 1.03703654, + "epoch": 0.645182624379979, + "flos": 18807352444800.0, + "grad_norm": 1.5067900334591022, + "language_loss": 0.63581085, + "learning_rate": 1.1813946842679614e-06, + "loss": 0.65719867, + "num_input_tokens_seen": 231542050, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 10731, + "time_per_iteration": 2.446270704269409 + }, + { + "auxiliary_loss_clip": 0.01103309, + "auxiliary_loss_mlp": 0.01035418, + "balance_loss_clip": 1.0236578, + "balance_loss_mlp": 1.03637123, + "epoch": 0.6452427476326469, + "flos": 18332182402560.0, + "grad_norm": 1.5914748736963724, + "language_loss": 0.67864686, + "learning_rate": 1.1810393554705492e-06, + "loss": 0.70003414, + "num_input_tokens_seen": 231560380, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10732, + "time_per_iteration": 2.4132513999938965 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02102399, + "balance_loss_mlp": 1.03576565, + "epoch": 0.6453028708853149, + "flos": 22784746360320.0, + "grad_norm": 2.9402611085528685, + "language_loss": 0.75528163, + "learning_rate": 1.1806840577282055e-06, + "loss": 0.77662778, + "num_input_tokens_seen": 231580810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 10733, + "time_per_iteration": 2.481633186340332 + }, + { + "auxiliary_loss_clip": 0.01109224, + "auxiliary_loss_mlp": 0.01038115, + "balance_loss_clip": 1.02552605, + "balance_loss_mlp": 1.03813672, + "epoch": 0.6453629941379828, + "flos": 23945989150080.0, + "grad_norm": 1.733255021176503, + "language_loss": 0.65421891, + "learning_rate": 1.1803287910544048e-06, + "loss": 0.67569232, + "num_input_tokens_seen": 231600585, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 10734, + "time_per_iteration": 2.458852529525757 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01040084, + "balance_loss_clip": 1.02868783, + "balance_loss_mlp": 1.03828883, + "epoch": 0.6454231173906508, + "flos": 17676381841920.0, + "grad_norm": 2.35360500847906, + "language_loss": 0.7390331, + "learning_rate": 1.1799735554626191e-06, + "loss": 0.76045489, + "num_input_tokens_seen": 231618765, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 10735, + "time_per_iteration": 2.4310169219970703 + }, + { + "auxiliary_loss_clip": 0.01106342, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.02140272, + "balance_loss_mlp": 1.0381664, + "epoch": 0.6454832406433189, + "flos": 23292774368640.0, + "grad_norm": 1.7357542776809323, + "language_loss": 0.74936789, + "learning_rate": 1.1796183509663176e-06, + "loss": 0.77076226, + "num_input_tokens_seen": 231638525, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 10736, + "time_per_iteration": 2.4535531997680664 + }, + { + "auxiliary_loss_clip": 0.01108598, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.01834321, + "balance_loss_mlp": 1.03880417, + "epoch": 0.6455433638959868, + "flos": 20157198572160.0, + "grad_norm": 7.331374953548985, + "language_loss": 0.70983565, + "learning_rate": 1.1792631775789708e-06, + "loss": 0.73123091, + "num_input_tokens_seen": 231656785, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 10737, + "time_per_iteration": 2.455932855606079 + }, + { + "auxiliary_loss_clip": 0.01024539, + "auxiliary_loss_mlp": 0.0100647, + "balance_loss_clip": 1.00536776, + "balance_loss_mlp": 1.0038693, + "epoch": 0.6456034871486548, + "flos": 66532922012160.0, + "grad_norm": 0.7756134851395411, + "language_loss": 0.58466899, + "learning_rate": 1.1789080353140464e-06, + "loss": 0.6049791, + "num_input_tokens_seen": 231719075, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20703125, + "step": 10738, + "time_per_iteration": 3.11362624168396 + }, + { + "auxiliary_loss_clip": 0.01101864, + "auxiliary_loss_mlp": 0.0103012, + "balance_loss_clip": 1.01832366, + "balance_loss_mlp": 1.03569365, + "epoch": 0.6456636104013227, + "flos": 24206090509440.0, + "grad_norm": 1.6796977264879835, + "language_loss": 0.7432248, + "learning_rate": 1.1785529241850118e-06, + "loss": 0.76454461, + "num_input_tokens_seen": 231737810, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 10739, + "time_per_iteration": 2.575263261795044 + }, + { + "auxiliary_loss_clip": 0.01106876, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.01638103, + "balance_loss_mlp": 1.03678012, + "epoch": 0.6457237336539907, + "flos": 23624086440960.0, + "grad_norm": 1.7287512893442607, + "language_loss": 0.71253389, + "learning_rate": 1.1781978442053324e-06, + "loss": 0.7338922, + "num_input_tokens_seen": 231756140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10740, + "time_per_iteration": 2.4456567764282227 + }, + { + "auxiliary_loss_clip": 0.01024391, + "auxiliary_loss_mlp": 0.01001433, + "balance_loss_clip": 1.00019932, + "balance_loss_mlp": 1.00384283, + "epoch": 0.6457838569066586, + "flos": 65846023251840.0, + "grad_norm": 0.6634055191842134, + "language_loss": 0.55304271, + "learning_rate": 1.1778427953884733e-06, + "loss": 0.57330096, + "num_input_tokens_seen": 231823665, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20507812, + "step": 10741, + "time_per_iteration": 3.084655284881592 + }, + { + "auxiliary_loss_clip": 0.01100994, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.01909518, + "balance_loss_mlp": 1.03560328, + "epoch": 0.6458439801593266, + "flos": 22381972179840.0, + "grad_norm": 4.469668504909254, + "language_loss": 0.80574667, + "learning_rate": 1.1774877777478977e-06, + "loss": 0.82705534, + "num_input_tokens_seen": 231844500, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 10742, + "time_per_iteration": 2.4683938026428223 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.01656711, + "balance_loss_mlp": 1.03513217, + "epoch": 0.6459041034119946, + "flos": 24789243813120.0, + "grad_norm": 1.5091720275231448, + "language_loss": 0.81898236, + "learning_rate": 1.1771327912970678e-06, + "loss": 0.84026313, + "num_input_tokens_seen": 231864510, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10743, + "time_per_iteration": 2.4860422611236572 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01716936, + "balance_loss_mlp": 1.03643143, + "epoch": 0.6459642266646626, + "flos": 18325358818560.0, + "grad_norm": 1.8283751590876323, + "language_loss": 0.72072589, + "learning_rate": 1.1767778360494453e-06, + "loss": 0.74204403, + "num_input_tokens_seen": 231881555, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 10744, + "time_per_iteration": 2.401154041290283 + }, + { + "auxiliary_loss_clip": 0.01103143, + "auxiliary_loss_mlp": 0.01024823, + "balance_loss_clip": 1.01339674, + "balance_loss_mlp": 1.0362134, + "epoch": 0.6460243499173305, + "flos": 43581368891520.0, + "grad_norm": 1.6832996887385467, + "language_loss": 0.66680956, + "learning_rate": 1.1764229120184896e-06, + "loss": 0.68808925, + "num_input_tokens_seen": 231905945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 10745, + "time_per_iteration": 2.648923635482788 + }, + { + "auxiliary_loss_clip": 0.01104749, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01873684, + "balance_loss_mlp": 1.03738117, + "epoch": 0.6460844731699985, + "flos": 19244026085760.0, + "grad_norm": 2.3663753891536206, + "language_loss": 0.7367624, + "learning_rate": 1.1760680192176597e-06, + "loss": 0.75811714, + "num_input_tokens_seen": 231922535, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10746, + "time_per_iteration": 2.414886713027954 + }, + { + "auxiliary_loss_clip": 0.01106121, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.02040744, + "balance_loss_mlp": 1.03723145, + "epoch": 0.6461445964226664, + "flos": 27453348668160.0, + "grad_norm": 1.4238954510434034, + "language_loss": 0.66682059, + "learning_rate": 1.175713157660413e-06, + "loss": 0.6881963, + "num_input_tokens_seen": 231944800, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6875, + "step": 10747, + "time_per_iteration": 2.5016472339630127 + }, + { + "auxiliary_loss_clip": 0.01103964, + "auxiliary_loss_mlp": 0.01036697, + "balance_loss_clip": 1.02568781, + "balance_loss_mlp": 1.03684711, + "epoch": 0.6462047196753344, + "flos": 20295489934080.0, + "grad_norm": 1.577013961139599, + "language_loss": 0.66913009, + "learning_rate": 1.1753583273602056e-06, + "loss": 0.69053674, + "num_input_tokens_seen": 231962970, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10748, + "time_per_iteration": 2.442237615585327 + }, + { + "auxiliary_loss_clip": 0.0110688, + "auxiliary_loss_mlp": 0.01039659, + "balance_loss_clip": 1.0270108, + "balance_loss_mlp": 1.03662395, + "epoch": 0.6462648429280025, + "flos": 22018340845440.0, + "grad_norm": 1.8120464443443396, + "language_loss": 0.76339692, + "learning_rate": 1.1750035283304937e-06, + "loss": 0.78486234, + "num_input_tokens_seen": 231981195, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10749, + "time_per_iteration": 2.4924192428588867 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.010303, + "balance_loss_clip": 1.01845694, + "balance_loss_mlp": 1.03520691, + "epoch": 0.6463249661806704, + "flos": 27781141207680.0, + "grad_norm": 1.7469795758698337, + "language_loss": 0.77112448, + "learning_rate": 1.17464876058473e-06, + "loss": 0.79247028, + "num_input_tokens_seen": 232001735, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 10750, + "time_per_iteration": 2.4771273136138916 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.01831079, + "balance_loss_mlp": 1.0382905, + "epoch": 0.6463850894333384, + "flos": 22050588280320.0, + "grad_norm": 2.0857387723701817, + "language_loss": 0.68225217, + "learning_rate": 1.1742940241363683e-06, + "loss": 0.70365262, + "num_input_tokens_seen": 232019830, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 10751, + "time_per_iteration": 2.5023088455200195 + }, + { + "auxiliary_loss_clip": 0.01103858, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.01730776, + "balance_loss_mlp": 1.03535843, + "epoch": 0.6464452126860063, + "flos": 21106245767040.0, + "grad_norm": 1.6570772228110922, + "language_loss": 0.70823848, + "learning_rate": 1.1739393189988604e-06, + "loss": 0.72957194, + "num_input_tokens_seen": 232039625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 10752, + "time_per_iteration": 2.4542946815490723 + }, + { + "auxiliary_loss_clip": 0.01106954, + "auxiliary_loss_mlp": 0.01034427, + "balance_loss_clip": 1.02100945, + "balance_loss_mlp": 1.03708041, + "epoch": 0.6465053359386743, + "flos": 16028045694720.0, + "grad_norm": 1.7443402746921521, + "language_loss": 0.7799257, + "learning_rate": 1.1735846451856554e-06, + "loss": 0.80133951, + "num_input_tokens_seen": 232055855, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 10753, + "time_per_iteration": 2.414531707763672 + }, + { + "auxiliary_loss_clip": 0.01102943, + "auxiliary_loss_mlp": 0.01038015, + "balance_loss_clip": 1.02636194, + "balance_loss_mlp": 1.03694177, + "epoch": 0.6465654591913422, + "flos": 23398674641280.0, + "grad_norm": 1.5945794385803833, + "language_loss": 0.85284775, + "learning_rate": 1.1732300027102041e-06, + "loss": 0.87425733, + "num_input_tokens_seen": 232073475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 10754, + "time_per_iteration": 2.4596917629241943 + }, + { + "auxiliary_loss_clip": 0.01102766, + "auxiliary_loss_mlp": 0.01033015, + "balance_loss_clip": 1.02160048, + "balance_loss_mlp": 1.03613544, + "epoch": 0.6466255824440102, + "flos": 15377273038080.0, + "grad_norm": 1.9678569539088453, + "language_loss": 0.59384984, + "learning_rate": 1.1728753915859541e-06, + "loss": 0.61520755, + "num_input_tokens_seen": 232091090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10755, + "time_per_iteration": 2.439668893814087 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01030046, + "balance_loss_clip": 1.01808381, + "balance_loss_mlp": 1.03532171, + "epoch": 0.6466857056966782, + "flos": 16252846963200.0, + "grad_norm": 5.126423165663523, + "language_loss": 0.67684507, + "learning_rate": 1.1725208118263518e-06, + "loss": 0.69817215, + "num_input_tokens_seen": 232107320, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 10756, + "time_per_iteration": 2.405700206756592 + }, + { + "auxiliary_loss_clip": 0.01110332, + "auxiliary_loss_mlp": 0.01031544, + "balance_loss_clip": 1.019063, + "balance_loss_mlp": 1.03889596, + "epoch": 0.6467458289493462, + "flos": 21178246579200.0, + "grad_norm": 3.0387860554111574, + "language_loss": 0.74348402, + "learning_rate": 1.172166263444844e-06, + "loss": 0.76490277, + "num_input_tokens_seen": 232123930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71484375, + "step": 10757, + "time_per_iteration": 2.4515702724456787 + }, + { + "auxiliary_loss_clip": 0.01102078, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01911271, + "balance_loss_mlp": 1.03616095, + "epoch": 0.6468059522020141, + "flos": 17968299672960.0, + "grad_norm": 1.6276488646407918, + "language_loss": 0.74483991, + "learning_rate": 1.1718117464548734e-06, + "loss": 0.76616573, + "num_input_tokens_seen": 232142905, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10758, + "time_per_iteration": 2.4118669033050537 + }, + { + "auxiliary_loss_clip": 0.01104769, + "auxiliary_loss_mlp": 0.01029801, + "balance_loss_clip": 1.01735008, + "balance_loss_mlp": 1.03648281, + "epoch": 0.6468660754546821, + "flos": 17890157635200.0, + "grad_norm": 1.54772879655888, + "language_loss": 0.67891282, + "learning_rate": 1.1714572608698845e-06, + "loss": 0.70025849, + "num_input_tokens_seen": 232162230, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 10759, + "time_per_iteration": 2.487632989883423 + }, + { + "auxiliary_loss_clip": 0.01106799, + "auxiliary_loss_mlp": 0.01030763, + "balance_loss_clip": 1.01868701, + "balance_loss_mlp": 1.03644943, + "epoch": 0.64692619870735, + "flos": 22600991358720.0, + "grad_norm": 1.881795853492405, + "language_loss": 0.75285017, + "learning_rate": 1.1711028067033197e-06, + "loss": 0.77422583, + "num_input_tokens_seen": 232182700, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10760, + "time_per_iteration": 3.9272162914276123 + }, + { + "auxiliary_loss_clip": 0.01100222, + "auxiliary_loss_mlp": 0.01026563, + "balance_loss_clip": 1.01540494, + "balance_loss_mlp": 1.03383064, + "epoch": 0.646986321960018, + "flos": 49600786993920.0, + "grad_norm": 1.5052354500877283, + "language_loss": 0.65392292, + "learning_rate": 1.1707483839686194e-06, + "loss": 0.67519075, + "num_input_tokens_seen": 232208235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 10761, + "time_per_iteration": 2.6993539333343506 + }, + { + "auxiliary_loss_clip": 0.01106456, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01751542, + "balance_loss_mlp": 1.03747368, + "epoch": 0.6470464452126861, + "flos": 21908454163200.0, + "grad_norm": 2.1055667385281316, + "language_loss": 0.69732755, + "learning_rate": 1.1703939926792235e-06, + "loss": 0.71869099, + "num_input_tokens_seen": 232228720, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10762, + "time_per_iteration": 2.4523587226867676 + }, + { + "auxiliary_loss_clip": 0.0110606, + "auxiliary_loss_mlp": 0.01032417, + "balance_loss_clip": 1.02048469, + "balance_loss_mlp": 1.03625226, + "epoch": 0.647106568465354, + "flos": 18106124158080.0, + "grad_norm": 2.1633807412884343, + "language_loss": 0.82723743, + "learning_rate": 1.1700396328485705e-06, + "loss": 0.8486222, + "num_input_tokens_seen": 232244655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 10763, + "time_per_iteration": 5.290219306945801 + }, + { + "auxiliary_loss_clip": 0.01024866, + "auxiliary_loss_mlp": 0.01000313, + "balance_loss_clip": 0.99911511, + "balance_loss_mlp": 1.00423336, + "epoch": 0.647166691718022, + "flos": 69480038125440.0, + "grad_norm": 0.7101546065504528, + "language_loss": 0.57767004, + "learning_rate": 1.1696853044900978e-06, + "loss": 0.59792185, + "num_input_tokens_seen": 232308685, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 10764, + "time_per_iteration": 4.603821277618408 + }, + { + "auxiliary_loss_clip": 0.01102286, + "auxiliary_loss_mlp": 0.01034651, + "balance_loss_clip": 1.02277756, + "balance_loss_mlp": 1.03570485, + "epoch": 0.6472268149706899, + "flos": 34095170661120.0, + "grad_norm": 1.8570193979841765, + "language_loss": 0.60458118, + "learning_rate": 1.1693310076172413e-06, + "loss": 0.62595057, + "num_input_tokens_seen": 232327520, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 10765, + "time_per_iteration": 2.545964002609253 + }, + { + "auxiliary_loss_clip": 0.01102593, + "auxiliary_loss_mlp": 0.01026242, + "balance_loss_clip": 1.01510835, + "balance_loss_mlp": 1.03606391, + "epoch": 0.6472869382233579, + "flos": 28111232217600.0, + "grad_norm": 1.9785388674295172, + "language_loss": 0.63237435, + "learning_rate": 1.168976742243437e-06, + "loss": 0.65366268, + "num_input_tokens_seen": 232349025, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10766, + "time_per_iteration": 2.4889070987701416 + }, + { + "auxiliary_loss_clip": 0.01103393, + "auxiliary_loss_mlp": 0.01029515, + "balance_loss_clip": 1.0176481, + "balance_loss_mlp": 1.03667796, + "epoch": 0.6473470614760258, + "flos": 22492146170880.0, + "grad_norm": 1.6243256535427835, + "language_loss": 0.75656283, + "learning_rate": 1.1686225083821174e-06, + "loss": 0.77789199, + "num_input_tokens_seen": 232367835, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10767, + "time_per_iteration": 2.506972551345825 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01031252, + "balance_loss_clip": 1.01970649, + "balance_loss_mlp": 1.03562641, + "epoch": 0.6474071847286939, + "flos": 14538938538240.0, + "grad_norm": 2.0284924931052406, + "language_loss": 0.77826148, + "learning_rate": 1.1682683060467153e-06, + "loss": 0.79959053, + "num_input_tokens_seen": 232385840, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 10768, + "time_per_iteration": 2.4127895832061768 + }, + { + "auxiliary_loss_clip": 0.01102155, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.01648641, + "balance_loss_mlp": 1.03510022, + "epoch": 0.6474673079813618, + "flos": 24098214988800.0, + "grad_norm": 1.6952390655728202, + "language_loss": 0.71920127, + "learning_rate": 1.167914135250663e-06, + "loss": 0.74050355, + "num_input_tokens_seen": 232406205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10769, + "time_per_iteration": 2.4743292331695557 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01034947, + "balance_loss_clip": 1.02368212, + "balance_loss_mlp": 1.03668594, + "epoch": 0.6475274312340298, + "flos": 14976186796800.0, + "grad_norm": 1.9257555417687353, + "language_loss": 0.71907532, + "learning_rate": 1.1675599960073895e-06, + "loss": 0.74043512, + "num_input_tokens_seen": 232424995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 10770, + "time_per_iteration": 2.423251152038574 + }, + { + "auxiliary_loss_clip": 0.01107379, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.01506996, + "balance_loss_mlp": 1.03676248, + "epoch": 0.6475875544866977, + "flos": 25045322849280.0, + "grad_norm": 1.7207965836379309, + "language_loss": 0.73562384, + "learning_rate": 1.167205888330325e-06, + "loss": 0.75697601, + "num_input_tokens_seen": 232445870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 10771, + "time_per_iteration": 2.498911142349243 + }, + { + "auxiliary_loss_clip": 0.01104798, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.01807988, + "balance_loss_mlp": 1.03799939, + "epoch": 0.6476476777393657, + "flos": 16472153450880.0, + "grad_norm": 1.8994664849870517, + "language_loss": 0.7373805, + "learning_rate": 1.1668518122328958e-06, + "loss": 0.75872564, + "num_input_tokens_seen": 232464285, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 10772, + "time_per_iteration": 2.4090960025787354 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.0196557, + "balance_loss_mlp": 1.03508711, + "epoch": 0.6477078009920336, + "flos": 25812267068160.0, + "grad_norm": 1.4911839819427335, + "language_loss": 0.83115339, + "learning_rate": 1.1664977677285305e-06, + "loss": 0.85245723, + "num_input_tokens_seen": 232485815, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65625, + "step": 10773, + "time_per_iteration": 2.4857256412506104 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.0102739, + "balance_loss_clip": 1.01634526, + "balance_loss_mlp": 1.03509998, + "epoch": 0.6477679242447016, + "flos": 17676130446720.0, + "grad_norm": 1.4644145421555252, + "language_loss": 0.78116065, + "learning_rate": 1.1661437548306524e-06, + "loss": 0.80243969, + "num_input_tokens_seen": 232504875, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 10774, + "time_per_iteration": 2.4285647869110107 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01036946, + "balance_loss_clip": 1.02481055, + "balance_loss_mlp": 1.0360589, + "epoch": 0.6478280474973696, + "flos": 21032305620480.0, + "grad_norm": 2.0390391270124986, + "language_loss": 0.68541199, + "learning_rate": 1.1657897735526867e-06, + "loss": 0.70683241, + "num_input_tokens_seen": 232521945, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10775, + "time_per_iteration": 2.451076030731201 + }, + { + "auxiliary_loss_clip": 0.01106496, + "auxiliary_loss_mlp": 0.01028548, + "balance_loss_clip": 1.01700842, + "balance_loss_mlp": 1.03669178, + "epoch": 0.6478881707500376, + "flos": 21616931381760.0, + "grad_norm": 1.740664481421832, + "language_loss": 0.65512002, + "learning_rate": 1.1654358239080574e-06, + "loss": 0.67647052, + "num_input_tokens_seen": 232541500, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 10776, + "time_per_iteration": 2.457409381866455 + }, + { + "auxiliary_loss_clip": 0.01105388, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.02126646, + "balance_loss_mlp": 1.03623533, + "epoch": 0.6479482940027056, + "flos": 18442571875200.0, + "grad_norm": 2.790324273409248, + "language_loss": 0.78897285, + "learning_rate": 1.1650819059101839e-06, + "loss": 0.81036025, + "num_input_tokens_seen": 232559720, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10777, + "time_per_iteration": 2.425045967102051 + }, + { + "auxiliary_loss_clip": 0.01105443, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01792097, + "balance_loss_mlp": 1.03808999, + "epoch": 0.6480084172553735, + "flos": 22164066322560.0, + "grad_norm": 2.190301315300799, + "language_loss": 0.73786491, + "learning_rate": 1.1647280195724896e-06, + "loss": 0.75921857, + "num_input_tokens_seen": 232579370, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 10778, + "time_per_iteration": 2.459921360015869 + }, + { + "auxiliary_loss_clip": 0.01099736, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.01919854, + "balance_loss_mlp": 1.033885, + "epoch": 0.6480685405080415, + "flos": 24316228586880.0, + "grad_norm": 1.4021781865585379, + "language_loss": 0.77758849, + "learning_rate": 1.1643741649083923e-06, + "loss": 0.79889071, + "num_input_tokens_seen": 232600495, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10779, + "time_per_iteration": 2.4944956302642822 + }, + { + "auxiliary_loss_clip": 0.0102552, + "auxiliary_loss_mlp": 0.0100081, + "balance_loss_clip": 0.99959451, + "balance_loss_mlp": 1.00497544, + "epoch": 0.6481286637607094, + "flos": 59891207760000.0, + "grad_norm": 0.7236484274239682, + "language_loss": 0.59404081, + "learning_rate": 1.1640203419313095e-06, + "loss": 0.61430413, + "num_input_tokens_seen": 232663165, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20507812, + "step": 10780, + "time_per_iteration": 3.0612237453460693 + }, + { + "auxiliary_loss_clip": 0.01101259, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01804209, + "balance_loss_mlp": 1.03493273, + "epoch": 0.6481887870133775, + "flos": 25484187219840.0, + "grad_norm": 1.958027941262836, + "language_loss": 0.79607379, + "learning_rate": 1.1636665506546599e-06, + "loss": 0.81737804, + "num_input_tokens_seen": 232683385, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10781, + "time_per_iteration": 2.5239641666412354 + }, + { + "auxiliary_loss_clip": 0.01107534, + "auxiliary_loss_mlp": 0.01033388, + "balance_loss_clip": 1.01997757, + "balance_loss_mlp": 1.03791904, + "epoch": 0.6482489102660454, + "flos": 19930206574080.0, + "grad_norm": 1.9679764489100238, + "language_loss": 0.78864902, + "learning_rate": 1.1633127910918578e-06, + "loss": 0.81005824, + "num_input_tokens_seen": 232699095, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 10782, + "time_per_iteration": 2.4253900051116943 + }, + { + "auxiliary_loss_clip": 0.01106515, + "auxiliary_loss_mlp": 0.0103164, + "balance_loss_clip": 1.01932609, + "balance_loss_mlp": 1.03778386, + "epoch": 0.6483090335187134, + "flos": 26979471515520.0, + "grad_norm": 2.985749633483, + "language_loss": 0.63785768, + "learning_rate": 1.1629590632563187e-06, + "loss": 0.65923923, + "num_input_tokens_seen": 232717920, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 10783, + "time_per_iteration": 2.5159454345703125 + }, + { + "auxiliary_loss_clip": 0.01108311, + "auxiliary_loss_mlp": 0.01032692, + "balance_loss_clip": 1.01933455, + "balance_loss_mlp": 1.03791237, + "epoch": 0.6483691567713813, + "flos": 25077965333760.0, + "grad_norm": 2.3442009274857387, + "language_loss": 0.88642716, + "learning_rate": 1.1626053671614561e-06, + "loss": 0.90783715, + "num_input_tokens_seen": 232737605, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 10784, + "time_per_iteration": 2.4753408432006836 + }, + { + "auxiliary_loss_clip": 0.01102388, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.01830864, + "balance_loss_mlp": 1.03565788, + "epoch": 0.6484292800240493, + "flos": 16105972250880.0, + "grad_norm": 12.15646159907571, + "language_loss": 0.73281801, + "learning_rate": 1.1622517028206815e-06, + "loss": 0.75415385, + "num_input_tokens_seen": 232755110, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 10785, + "time_per_iteration": 2.4413681030273438 + }, + { + "auxiliary_loss_clip": 0.01101717, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.01844823, + "balance_loss_mlp": 1.03633511, + "epoch": 0.6484894032767172, + "flos": 28840398307200.0, + "grad_norm": 1.367601959382758, + "language_loss": 0.69167411, + "learning_rate": 1.1618980702474071e-06, + "loss": 0.71298921, + "num_input_tokens_seen": 232779040, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 10786, + "time_per_iteration": 2.524073362350464 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01922917, + "balance_loss_mlp": 1.03379738, + "epoch": 0.6485495265293852, + "flos": 30227052896640.0, + "grad_norm": 1.7579718485158407, + "language_loss": 0.71124583, + "learning_rate": 1.161544469455041e-06, + "loss": 0.73256522, + "num_input_tokens_seen": 232800515, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10787, + "time_per_iteration": 2.5158114433288574 + }, + { + "auxiliary_loss_clip": 0.01106869, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.017735, + "balance_loss_mlp": 1.03644013, + "epoch": 0.6486096497820532, + "flos": 20082181017600.0, + "grad_norm": 2.051362245275849, + "language_loss": 0.84114212, + "learning_rate": 1.1611909004569934e-06, + "loss": 0.86250919, + "num_input_tokens_seen": 232818450, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.703125, + "step": 10788, + "time_per_iteration": 2.4534499645233154 + }, + { + "auxiliary_loss_clip": 0.01105049, + "auxiliary_loss_mlp": 0.01028079, + "balance_loss_clip": 1.01589584, + "balance_loss_mlp": 1.03690362, + "epoch": 0.6486697730347212, + "flos": 17129067333120.0, + "grad_norm": 1.7919339269161743, + "language_loss": 0.76950663, + "learning_rate": 1.1608373632666708e-06, + "loss": 0.79083782, + "num_input_tokens_seen": 232834785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 10789, + "time_per_iteration": 2.483477830886841 + }, + { + "auxiliary_loss_clip": 0.01100294, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01661515, + "balance_loss_mlp": 1.03485107, + "epoch": 0.6487298962873892, + "flos": 38911940570880.0, + "grad_norm": 1.6011584419095646, + "language_loss": 0.76170266, + "learning_rate": 1.160483857897479e-06, + "loss": 0.78298742, + "num_input_tokens_seen": 232856050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 10790, + "time_per_iteration": 2.589041233062744 + }, + { + "auxiliary_loss_clip": 0.01106166, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.02138782, + "balance_loss_mlp": 1.03979826, + "epoch": 0.6487900195400571, + "flos": 11947840076160.0, + "grad_norm": 2.041315075509779, + "language_loss": 0.59891582, + "learning_rate": 1.160130384362823e-06, + "loss": 0.6202994, + "num_input_tokens_seen": 232873945, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 10791, + "time_per_iteration": 2.432832956314087 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.03552938, + "epoch": 0.6488501427927251, + "flos": 22344445445760.0, + "grad_norm": 1.6472225462276555, + "language_loss": 0.86154032, + "learning_rate": 1.1597769426761082e-06, + "loss": 0.88286591, + "num_input_tokens_seen": 232892160, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 10792, + "time_per_iteration": 2.446188449859619 + }, + { + "auxiliary_loss_clip": 0.01108514, + "auxiliary_loss_mlp": 0.01037084, + "balance_loss_clip": 1.02510905, + "balance_loss_mlp": 1.03797722, + "epoch": 0.648910266045393, + "flos": 22236282616320.0, + "grad_norm": 2.3897847361162396, + "language_loss": 0.78055567, + "learning_rate": 1.159423532850735e-06, + "loss": 0.80201161, + "num_input_tokens_seen": 232911725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 10793, + "time_per_iteration": 2.5302352905273438 + }, + { + "auxiliary_loss_clip": 0.01108826, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.01562476, + "balance_loss_mlp": 1.03950644, + "epoch": 0.6489703892980611, + "flos": 25301258231040.0, + "grad_norm": 1.9288429134844602, + "language_loss": 0.75000489, + "learning_rate": 1.1590701549001055e-06, + "loss": 0.77136773, + "num_input_tokens_seen": 232929085, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 10794, + "time_per_iteration": 2.487550735473633 + }, + { + "auxiliary_loss_clip": 0.01102282, + "auxiliary_loss_mlp": 0.01030687, + "balance_loss_clip": 1.01906371, + "balance_loss_mlp": 1.03439832, + "epoch": 0.649030512550729, + "flos": 24571912573440.0, + "grad_norm": 1.7036979096858527, + "language_loss": 0.70159793, + "learning_rate": 1.158716808837621e-06, + "loss": 0.72292763, + "num_input_tokens_seen": 232949455, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10795, + "time_per_iteration": 2.5075082778930664 + }, + { + "auxiliary_loss_clip": 0.01107904, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.0199964, + "balance_loss_mlp": 1.03854239, + "epoch": 0.649090635803397, + "flos": 26244702904320.0, + "grad_norm": 1.7755045878876892, + "language_loss": 0.54152012, + "learning_rate": 1.158363494676679e-06, + "loss": 0.56292963, + "num_input_tokens_seen": 232969445, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 10796, + "time_per_iteration": 2.4778566360473633 + }, + { + "auxiliary_loss_clip": 0.01104118, + "auxiliary_loss_mlp": 0.01027194, + "balance_loss_clip": 1.01583314, + "balance_loss_mlp": 1.03535151, + "epoch": 0.6491507590560649, + "flos": 24937375501440.0, + "grad_norm": 2.2433372918176917, + "language_loss": 0.77806747, + "learning_rate": 1.1580102124306775e-06, + "loss": 0.79938054, + "num_input_tokens_seen": 232988900, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6875, + "step": 10797, + "time_per_iteration": 2.4779365062713623 + }, + { + "auxiliary_loss_clip": 0.01101065, + "auxiliary_loss_mlp": 0.0102751, + "balance_loss_clip": 1.0163343, + "balance_loss_mlp": 1.03683209, + "epoch": 0.6492108823087329, + "flos": 19499781899520.0, + "grad_norm": 2.205335755673093, + "language_loss": 0.70565605, + "learning_rate": 1.1576569621130134e-06, + "loss": 0.72694176, + "num_input_tokens_seen": 233005060, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 10798, + "time_per_iteration": 2.4684252738952637 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01698995, + "balance_loss_mlp": 1.03464842, + "epoch": 0.6492710055614008, + "flos": 19719303868800.0, + "grad_norm": 1.6813115922747512, + "language_loss": 0.76955473, + "learning_rate": 1.1573037437370811e-06, + "loss": 0.79084826, + "num_input_tokens_seen": 233023375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 10799, + "time_per_iteration": 2.5210940837860107 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01032808, + "balance_loss_clip": 1.01997542, + "balance_loss_mlp": 1.03435063, + "epoch": 0.6493311288140688, + "flos": 24317018686080.0, + "grad_norm": 1.8153395402518349, + "language_loss": 0.7160871, + "learning_rate": 1.1569505573162755e-06, + "loss": 0.7374649, + "num_input_tokens_seen": 233043130, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 10800, + "time_per_iteration": 2.482504367828369 + }, + { + "auxiliary_loss_clip": 0.01026126, + "auxiliary_loss_mlp": 0.01002417, + "balance_loss_clip": 1.00117147, + "balance_loss_mlp": 1.00504756, + "epoch": 0.6493912520667368, + "flos": 70934635290240.0, + "grad_norm": 0.7657069555877785, + "language_loss": 0.60286164, + "learning_rate": 1.1565974028639897e-06, + "loss": 0.62314713, + "num_input_tokens_seen": 233110560, + "router_z_loss_clip": 0.01245117, + "router_z_loss_mlp": 0.2109375, + "step": 10801, + "time_per_iteration": 3.226260185241699 + }, + { + "auxiliary_loss_clip": 0.01110608, + "auxiliary_loss_mlp": 0.01036145, + "balance_loss_clip": 1.02356291, + "balance_loss_mlp": 1.04023898, + "epoch": 0.6494513753194048, + "flos": 25337779384320.0, + "grad_norm": 1.8073883235159445, + "language_loss": 0.78302824, + "learning_rate": 1.156244280393614e-06, + "loss": 0.80449581, + "num_input_tokens_seen": 233130080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 10802, + "time_per_iteration": 3.919212579727173 + }, + { + "auxiliary_loss_clip": 0.01103206, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02334428, + "balance_loss_mlp": 1.03446245, + "epoch": 0.6495114985720728, + "flos": 24681978823680.0, + "grad_norm": 1.6305174461496863, + "language_loss": 0.74483562, + "learning_rate": 1.155891189918541e-06, + "loss": 0.76622605, + "num_input_tokens_seen": 233150235, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 10803, + "time_per_iteration": 2.4627156257629395 + }, + { + "auxiliary_loss_clip": 0.01104558, + "auxiliary_loss_mlp": 0.01031041, + "balance_loss_clip": 1.01895285, + "balance_loss_mlp": 1.03586698, + "epoch": 0.6495716218247407, + "flos": 23651162317440.0, + "grad_norm": 2.1376614082682104, + "language_loss": 0.70056975, + "learning_rate": 1.1555381314521578e-06, + "loss": 0.72192574, + "num_input_tokens_seen": 233166710, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 10804, + "time_per_iteration": 3.848759889602661 + }, + { + "auxiliary_loss_clip": 0.01103321, + "auxiliary_loss_mlp": 0.01028784, + "balance_loss_clip": 1.01596284, + "balance_loss_mlp": 1.03562534, + "epoch": 0.6496317450774087, + "flos": 22346169298560.0, + "grad_norm": 1.6605919162215552, + "language_loss": 0.72852522, + "learning_rate": 1.1551851050078537e-06, + "loss": 0.74984628, + "num_input_tokens_seen": 233185445, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10805, + "time_per_iteration": 3.8869080543518066 + }, + { + "auxiliary_loss_clip": 0.01103949, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.0191083, + "balance_loss_mlp": 1.03534186, + "epoch": 0.6496918683300766, + "flos": 30518647505280.0, + "grad_norm": 2.4377517316486816, + "language_loss": 0.66010499, + "learning_rate": 1.1548321105990155e-06, + "loss": 0.68145156, + "num_input_tokens_seen": 233205805, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 10806, + "time_per_iteration": 3.955326557159424 + }, + { + "auxiliary_loss_clip": 0.01104962, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01891875, + "balance_loss_mlp": 1.0347352, + "epoch": 0.6497519915827447, + "flos": 12458992567680.0, + "grad_norm": 2.0043448276690743, + "language_loss": 0.79282916, + "learning_rate": 1.1544791482390275e-06, + "loss": 0.81419313, + "num_input_tokens_seen": 233224215, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 10807, + "time_per_iteration": 2.39217209815979 + }, + { + "auxiliary_loss_clip": 0.01025408, + "auxiliary_loss_mlp": 0.0099987, + "balance_loss_clip": 0.99852294, + "balance_loss_mlp": 1.0043627, + "epoch": 0.6498121148354126, + "flos": 69093748287360.0, + "grad_norm": 0.8116161107359111, + "language_loss": 0.58930409, + "learning_rate": 1.1541262179412745e-06, + "loss": 0.60955691, + "num_input_tokens_seen": 233294440, + "router_z_loss_clip": 0.01348877, + "router_z_loss_mlp": 0.2109375, + "step": 10808, + "time_per_iteration": 3.230355739593506 + }, + { + "auxiliary_loss_clip": 0.0110383, + "auxiliary_loss_mlp": 0.01027961, + "balance_loss_clip": 1.01633191, + "balance_loss_mlp": 1.03880036, + "epoch": 0.6498722380880806, + "flos": 36897135914880.0, + "grad_norm": 1.7314499567585588, + "language_loss": 0.63442683, + "learning_rate": 1.1537733197191415e-06, + "loss": 0.65574473, + "num_input_tokens_seen": 233316125, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 10809, + "time_per_iteration": 2.5621047019958496 + }, + { + "auxiliary_loss_clip": 0.01103232, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01940036, + "balance_loss_mlp": 1.03731823, + "epoch": 0.6499323613407485, + "flos": 29017760688000.0, + "grad_norm": 1.7915412750630062, + "language_loss": 0.81444794, + "learning_rate": 1.153420453586008e-06, + "loss": 0.83578873, + "num_input_tokens_seen": 233336140, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 10810, + "time_per_iteration": 2.504213571548462 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01032614, + "balance_loss_clip": 1.02178955, + "balance_loss_mlp": 1.03596044, + "epoch": 0.6499924845934165, + "flos": 20119240874880.0, + "grad_norm": 1.6107612285139954, + "language_loss": 0.71639317, + "learning_rate": 1.1530676195552561e-06, + "loss": 0.73773706, + "num_input_tokens_seen": 233356095, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 10811, + "time_per_iteration": 2.4460504055023193 + }, + { + "auxiliary_loss_clip": 0.01102886, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01604342, + "balance_loss_mlp": 1.03864026, + "epoch": 0.6500526078460844, + "flos": 24421338760320.0, + "grad_norm": 1.696628622759694, + "language_loss": 0.78028226, + "learning_rate": 1.1527148176402649e-06, + "loss": 0.80158031, + "num_input_tokens_seen": 233376830, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 10812, + "time_per_iteration": 2.4838054180145264 + }, + { + "auxiliary_loss_clip": 0.01103233, + "auxiliary_loss_mlp": 0.01036542, + "balance_loss_clip": 1.02414393, + "balance_loss_mlp": 1.03522503, + "epoch": 0.6501127310987524, + "flos": 23331019374720.0, + "grad_norm": 1.7227870996833219, + "language_loss": 0.85212648, + "learning_rate": 1.152362047854413e-06, + "loss": 0.87352425, + "num_input_tokens_seen": 233395275, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10813, + "time_per_iteration": 2.4507973194122314 + }, + { + "auxiliary_loss_clip": 0.01103984, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.03622413, + "epoch": 0.6501728543514204, + "flos": 18697824898560.0, + "grad_norm": 1.630969137195917, + "language_loss": 0.80210257, + "learning_rate": 1.1520093102110764e-06, + "loss": 0.82343483, + "num_input_tokens_seen": 233413345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10814, + "time_per_iteration": 2.4843356609344482 + }, + { + "auxiliary_loss_clip": 0.0110736, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.02207136, + "balance_loss_mlp": 1.03762007, + "epoch": 0.6502329776040884, + "flos": 44199858199680.0, + "grad_norm": 1.5728804424803877, + "language_loss": 0.65147841, + "learning_rate": 1.1516566047236328e-06, + "loss": 0.67289424, + "num_input_tokens_seen": 233436105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 10815, + "time_per_iteration": 2.6453187465667725 + }, + { + "auxiliary_loss_clip": 0.01107853, + "auxiliary_loss_mlp": 0.01034444, + "balance_loss_clip": 1.02031219, + "balance_loss_mlp": 1.0368166, + "epoch": 0.6502931008567564, + "flos": 14574741419520.0, + "grad_norm": 2.1717658748812925, + "language_loss": 0.75344497, + "learning_rate": 1.1513039314054546e-06, + "loss": 0.77486801, + "num_input_tokens_seen": 233452320, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.7109375, + "step": 10816, + "time_per_iteration": 2.4386065006256104 + }, + { + "auxiliary_loss_clip": 0.01102422, + "auxiliary_loss_mlp": 0.01032135, + "balance_loss_clip": 1.0204587, + "balance_loss_mlp": 1.0362556, + "epoch": 0.6503532241094243, + "flos": 21395003201280.0, + "grad_norm": 1.7229503928288044, + "language_loss": 0.7330451, + "learning_rate": 1.1509512902699174e-06, + "loss": 0.75439066, + "num_input_tokens_seen": 233469920, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 10817, + "time_per_iteration": 2.4583981037139893 + }, + { + "auxiliary_loss_clip": 0.01103563, + "auxiliary_loss_mlp": 0.01036237, + "balance_loss_clip": 1.02363098, + "balance_loss_mlp": 1.03521729, + "epoch": 0.6504133473620923, + "flos": 74740840986240.0, + "grad_norm": 1.4667825090725979, + "language_loss": 0.71944672, + "learning_rate": 1.1505986813303916e-06, + "loss": 0.74084473, + "num_input_tokens_seen": 233499780, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 10818, + "time_per_iteration": 2.862744092941284 + }, + { + "auxiliary_loss_clip": 0.01107713, + "auxiliary_loss_mlp": 0.01028855, + "balance_loss_clip": 1.01682127, + "balance_loss_mlp": 1.03837013, + "epoch": 0.6504734706147602, + "flos": 19713270384000.0, + "grad_norm": 1.8855888512315708, + "language_loss": 0.65002698, + "learning_rate": 1.150246104600249e-06, + "loss": 0.67139268, + "num_input_tokens_seen": 233518235, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 10819, + "time_per_iteration": 2.500066041946411 + }, + { + "auxiliary_loss_clip": 0.01105945, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.01954389, + "balance_loss_mlp": 1.03696644, + "epoch": 0.6505335938674283, + "flos": 25556870390400.0, + "grad_norm": 1.9280601319833375, + "language_loss": 0.83383453, + "learning_rate": 1.14989356009286e-06, + "loss": 0.85521388, + "num_input_tokens_seen": 233535215, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 10820, + "time_per_iteration": 2.5053653717041016 + }, + { + "auxiliary_loss_clip": 0.01105855, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01781201, + "balance_loss_mlp": 1.03561532, + "epoch": 0.6505937171200962, + "flos": 17821424960640.0, + "grad_norm": 2.4467285300705166, + "language_loss": 0.78197402, + "learning_rate": 1.1495410478215914e-06, + "loss": 0.80333835, + "num_input_tokens_seen": 233552775, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 10821, + "time_per_iteration": 2.431373357772827 + }, + { + "auxiliary_loss_clip": 0.01101047, + "auxiliary_loss_mlp": 0.01029296, + "balance_loss_clip": 1.01875234, + "balance_loss_mlp": 1.03584325, + "epoch": 0.6506538403727642, + "flos": 20668135582080.0, + "grad_norm": 1.457845041613161, + "language_loss": 0.80133367, + "learning_rate": 1.1491885677998126e-06, + "loss": 0.82263708, + "num_input_tokens_seen": 233572080, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65234375, + "step": 10822, + "time_per_iteration": 2.460176706314087 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.01029965, + "balance_loss_clip": 1.01815748, + "balance_loss_mlp": 1.0353092, + "epoch": 0.6507139636254321, + "flos": 11721422695680.0, + "grad_norm": 1.750587835143927, + "language_loss": 0.87001264, + "learning_rate": 1.1488361200408883e-06, + "loss": 0.89133477, + "num_input_tokens_seen": 233589155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 10823, + "time_per_iteration": 2.4293131828308105 + }, + { + "auxiliary_loss_clip": 0.01105612, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01709723, + "balance_loss_mlp": 1.03624296, + "epoch": 0.6507740868781001, + "flos": 26761745226240.0, + "grad_norm": 1.6365898296789787, + "language_loss": 0.66641533, + "learning_rate": 1.148483704558183e-06, + "loss": 0.68776393, + "num_input_tokens_seen": 233608180, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 10824, + "time_per_iteration": 2.4835896492004395 + }, + { + "auxiliary_loss_clip": 0.01106218, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01628542, + "balance_loss_mlp": 1.03520238, + "epoch": 0.650834210130768, + "flos": 16471722487680.0, + "grad_norm": 4.8089783891514974, + "language_loss": 0.87194103, + "learning_rate": 1.1481313213650607e-06, + "loss": 0.89328843, + "num_input_tokens_seen": 233625750, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.7109375, + "step": 10825, + "time_per_iteration": 2.4161195755004883 + }, + { + "auxiliary_loss_clip": 0.01106101, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.01528192, + "balance_loss_mlp": 1.03501797, + "epoch": 0.650894333383436, + "flos": 17128672283520.0, + "grad_norm": 2.113023109439822, + "language_loss": 0.72701895, + "learning_rate": 1.147778970474885e-06, + "loss": 0.74836403, + "num_input_tokens_seen": 233644235, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 10826, + "time_per_iteration": 2.4384891986846924 + }, + { + "auxiliary_loss_clip": 0.01103778, + "auxiliary_loss_mlp": 0.01029196, + "balance_loss_clip": 1.01812768, + "balance_loss_mlp": 1.03663278, + "epoch": 0.650954456636104, + "flos": 18734238311040.0, + "grad_norm": 1.8815234967356322, + "language_loss": 0.69047898, + "learning_rate": 1.1474266519010157e-06, + "loss": 0.71180868, + "num_input_tokens_seen": 233662845, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 10827, + "time_per_iteration": 2.4236016273498535 + }, + { + "auxiliary_loss_clip": 0.01103468, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.01466322, + "balance_loss_mlp": 1.03472352, + "epoch": 0.651014579888772, + "flos": 24528244613760.0, + "grad_norm": 1.912124303976498, + "language_loss": 0.76917899, + "learning_rate": 1.1470743656568136e-06, + "loss": 0.79047537, + "num_input_tokens_seen": 233681990, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6875, + "step": 10828, + "time_per_iteration": 2.501492500305176 + }, + { + "auxiliary_loss_clip": 0.01103546, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.01449549, + "balance_loss_mlp": 1.03721857, + "epoch": 0.65107470314144, + "flos": 24061083304320.0, + "grad_norm": 1.7405898865071652, + "language_loss": 0.89106113, + "learning_rate": 1.1467221117556362e-06, + "loss": 0.91235244, + "num_input_tokens_seen": 233698930, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 10829, + "time_per_iteration": 2.4867043495178223 + }, + { + "auxiliary_loss_clip": 0.01025679, + "auxiliary_loss_mlp": 0.01006089, + "balance_loss_clip": 1.004879, + "balance_loss_mlp": 1.00477648, + "epoch": 0.6511348263941079, + "flos": 72480734352000.0, + "grad_norm": 0.638409366999194, + "language_loss": 0.5535605, + "learning_rate": 1.1463698902108428e-06, + "loss": 0.57387817, + "num_input_tokens_seen": 233769825, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20898438, + "step": 10830, + "time_per_iteration": 3.2332394123077393 + }, + { + "auxiliary_loss_clip": 0.01105984, + "auxiliary_loss_mlp": 0.01032022, + "balance_loss_clip": 1.01880741, + "balance_loss_mlp": 1.0351963, + "epoch": 0.6511949496467759, + "flos": 23367684182400.0, + "grad_norm": 1.8294925765604486, + "language_loss": 0.74714524, + "learning_rate": 1.1460177010357878e-06, + "loss": 0.76852524, + "num_input_tokens_seen": 233787095, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.70703125, + "step": 10831, + "time_per_iteration": 2.4678196907043457 + }, + { + "auxiliary_loss_clip": 0.01026675, + "auxiliary_loss_mlp": 0.01000885, + "balance_loss_clip": 0.99961585, + "balance_loss_mlp": 1.0056181, + "epoch": 0.6512550728994438, + "flos": 67333191073920.0, + "grad_norm": 0.6414585196656494, + "language_loss": 0.51052123, + "learning_rate": 1.145665544243828e-06, + "loss": 0.53079689, + "num_input_tokens_seen": 233853050, + "router_z_loss_clip": 0.01269531, + "router_z_loss_mlp": 0.2109375, + "step": 10832, + "time_per_iteration": 3.188751697540283 + }, + { + "auxiliary_loss_clip": 0.01105483, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.01886725, + "balance_loss_mlp": 1.03423023, + "epoch": 0.6513151961521119, + "flos": 21141689512320.0, + "grad_norm": 2.076228287586058, + "language_loss": 0.83391213, + "learning_rate": 1.145313419848316e-06, + "loss": 0.85527885, + "num_input_tokens_seen": 233871385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 10833, + "time_per_iteration": 2.462529182434082 + }, + { + "auxiliary_loss_clip": 0.01106556, + "auxiliary_loss_mlp": 0.01030981, + "balance_loss_clip": 1.01900601, + "balance_loss_mlp": 1.03788352, + "epoch": 0.6513753194047798, + "flos": 15158828476800.0, + "grad_norm": 2.0595405323959817, + "language_loss": 0.83691829, + "learning_rate": 1.1449613278626049e-06, + "loss": 0.85829365, + "num_input_tokens_seen": 233888175, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 10834, + "time_per_iteration": 2.4130232334136963 + }, + { + "auxiliary_loss_clip": 0.01105953, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02089036, + "balance_loss_mlp": 1.03688574, + "epoch": 0.6514354426574478, + "flos": 30226621933440.0, + "grad_norm": 1.593058398275777, + "language_loss": 0.76863015, + "learning_rate": 1.1446092683000455e-06, + "loss": 0.79001933, + "num_input_tokens_seen": 233911470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69140625, + "step": 10835, + "time_per_iteration": 2.562690019607544 + }, + { + "auxiliary_loss_clip": 0.01107145, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02025032, + "balance_loss_mlp": 1.03849971, + "epoch": 0.6514955659101157, + "flos": 24205587719040.0, + "grad_norm": 1.513196810995274, + "language_loss": 0.7734859, + "learning_rate": 1.1442572411739882e-06, + "loss": 0.79488003, + "num_input_tokens_seen": 233932135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 10836, + "time_per_iteration": 2.4830451011657715 + }, + { + "auxiliary_loss_clip": 0.01106015, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01907551, + "balance_loss_mlp": 1.03746104, + "epoch": 0.6515556891627837, + "flos": 12377761960320.0, + "grad_norm": 3.377184093609282, + "language_loss": 0.82293916, + "learning_rate": 1.143905246497783e-06, + "loss": 0.84430802, + "num_input_tokens_seen": 233947880, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 10837, + "time_per_iteration": 2.449313163757324 + }, + { + "auxiliary_loss_clip": 0.01101636, + "auxiliary_loss_mlp": 0.01031992, + "balance_loss_clip": 1.01881361, + "balance_loss_mlp": 1.0366106, + "epoch": 0.6516158124154516, + "flos": 49601217957120.0, + "grad_norm": 1.879635988028464, + "language_loss": 0.59214962, + "learning_rate": 1.1435532842847758e-06, + "loss": 0.61348593, + "num_input_tokens_seen": 233971475, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.65234375, + "step": 10838, + "time_per_iteration": 2.7190330028533936 + }, + { + "auxiliary_loss_clip": 0.01025807, + "auxiliary_loss_mlp": 0.01001457, + "balance_loss_clip": 1.00031853, + "balance_loss_mlp": 1.00479698, + "epoch": 0.6516759356681197, + "flos": 59702748076800.0, + "grad_norm": 0.7299756161535264, + "language_loss": 0.60843396, + "learning_rate": 1.1432013545483147e-06, + "loss": 0.62870657, + "num_input_tokens_seen": 234030690, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.2109375, + "step": 10839, + "time_per_iteration": 3.0971086025238037 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01025474, + "balance_loss_clip": 1.01444113, + "balance_loss_mlp": 1.03583503, + "epoch": 0.6517360589207876, + "flos": 37450807130880.0, + "grad_norm": 1.8264384192259977, + "language_loss": 0.68170393, + "learning_rate": 1.1428494573017439e-06, + "loss": 0.70297927, + "num_input_tokens_seen": 234052470, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10840, + "time_per_iteration": 2.5938761234283447 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.01783288, + "balance_loss_mlp": 1.03470433, + "epoch": 0.6517961821734556, + "flos": 25374911068800.0, + "grad_norm": 2.0940212881125433, + "language_loss": 0.73375624, + "learning_rate": 1.1424975925584071e-06, + "loss": 0.75506657, + "num_input_tokens_seen": 234071495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.67578125, + "step": 10841, + "time_per_iteration": 2.5096652507781982 + }, + { + "auxiliary_loss_clip": 0.01104442, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.02016115, + "balance_loss_mlp": 1.03598571, + "epoch": 0.6518563054261236, + "flos": 28766996864640.0, + "grad_norm": 1.444320911302732, + "language_loss": 0.6237874, + "learning_rate": 1.142145760331648e-06, + "loss": 0.64515036, + "num_input_tokens_seen": 234092325, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 10842, + "time_per_iteration": 2.4958693981170654 + }, + { + "auxiliary_loss_clip": 0.01026129, + "auxiliary_loss_mlp": 0.00998688, + "balance_loss_clip": 0.99753761, + "balance_loss_mlp": 1.00511324, + "epoch": 0.6519164286787915, + "flos": 68924750797440.0, + "grad_norm": 0.8080147467318853, + "language_loss": 0.56082183, + "learning_rate": 1.141793960634807e-06, + "loss": 0.58107001, + "num_input_tokens_seen": 234148005, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.2109375, + "step": 10843, + "time_per_iteration": 4.309800863265991 + }, + { + "auxiliary_loss_clip": 0.01107299, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02050614, + "balance_loss_mlp": 1.03615665, + "epoch": 0.6519765519314595, + "flos": 20441933683200.0, + "grad_norm": 1.64501007109248, + "language_loss": 0.82562542, + "learning_rate": 1.1414421934812253e-06, + "loss": 0.84702992, + "num_input_tokens_seen": 234164280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 10844, + "time_per_iteration": 2.4669365882873535 + }, + { + "auxiliary_loss_clip": 0.01103507, + "auxiliary_loss_mlp": 0.01026932, + "balance_loss_clip": 1.0143199, + "balance_loss_mlp": 1.03550506, + "epoch": 0.6520366751841274, + "flos": 28402970480640.0, + "grad_norm": 2.063344534700721, + "language_loss": 0.60069621, + "learning_rate": 1.1410904588842421e-06, + "loss": 0.62200063, + "num_input_tokens_seen": 234185090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 10845, + "time_per_iteration": 2.5032777786254883 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01482248, + "balance_loss_mlp": 1.03591549, + "epoch": 0.6520967984367955, + "flos": 22273414300800.0, + "grad_norm": 2.814601439051778, + "language_loss": 0.79261941, + "learning_rate": 1.140738756857194e-06, + "loss": 0.81391656, + "num_input_tokens_seen": 234204050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 10846, + "time_per_iteration": 5.227022171020508 + }, + { + "auxiliary_loss_clip": 0.01025994, + "auxiliary_loss_mlp": 0.01001258, + "balance_loss_clip": 1.00005949, + "balance_loss_mlp": 1.00516367, + "epoch": 0.6521569216894634, + "flos": 68917140092160.0, + "grad_norm": 0.7222516480670771, + "language_loss": 0.60183281, + "learning_rate": 1.1403870874134192e-06, + "loss": 0.6221053, + "num_input_tokens_seen": 234269790, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20898438, + "step": 10847, + "time_per_iteration": 3.1712331771850586 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01036888, + "balance_loss_clip": 1.0252527, + "balance_loss_mlp": 1.03767812, + "epoch": 0.6522170449421314, + "flos": 29130520458240.0, + "grad_norm": 1.5760338552649935, + "language_loss": 0.81001323, + "learning_rate": 1.1400354505662514e-06, + "loss": 0.83146203, + "num_input_tokens_seen": 234290135, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.703125, + "step": 10848, + "time_per_iteration": 3.9554522037506104 + }, + { + "auxiliary_loss_clip": 0.01102504, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02334166, + "balance_loss_mlp": 1.03565013, + "epoch": 0.6522771681947993, + "flos": 26651930371200.0, + "grad_norm": 2.095194559726116, + "language_loss": 0.75025082, + "learning_rate": 1.1396838463290263e-06, + "loss": 0.77162468, + "num_input_tokens_seen": 234309535, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 10849, + "time_per_iteration": 2.4771504402160645 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.01028459, + "balance_loss_clip": 1.01685405, + "balance_loss_mlp": 1.03644204, + "epoch": 0.6523372914474673, + "flos": 25739763465600.0, + "grad_norm": 1.5413673094352514, + "language_loss": 0.68062961, + "learning_rate": 1.1393322747150752e-06, + "loss": 0.70193124, + "num_input_tokens_seen": 234328755, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 10850, + "time_per_iteration": 2.5665318965911865 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.01025486, + "balance_loss_clip": 1.01378012, + "balance_loss_mlp": 1.03723216, + "epoch": 0.6523974147001352, + "flos": 24827345164800.0, + "grad_norm": 3.429236792588671, + "language_loss": 0.66494656, + "learning_rate": 1.1389807357377313e-06, + "loss": 0.68622386, + "num_input_tokens_seen": 234348655, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 10851, + "time_per_iteration": 2.4702751636505127 + }, + { + "auxiliary_loss_clip": 0.01106489, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01643169, + "balance_loss_mlp": 1.03662848, + "epoch": 0.6524575379528033, + "flos": 26317637470080.0, + "grad_norm": 2.15849365590988, + "language_loss": 0.74028027, + "learning_rate": 1.1386292294103235e-06, + "loss": 0.76162481, + "num_input_tokens_seen": 234367445, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.69921875, + "step": 10852, + "time_per_iteration": 2.509229898452759 + }, + { + "auxiliary_loss_clip": 0.01108111, + "auxiliary_loss_mlp": 0.01028926, + "balance_loss_clip": 1.01606905, + "balance_loss_mlp": 1.03742135, + "epoch": 0.6525176612054712, + "flos": 19494143464320.0, + "grad_norm": 7.224145946580318, + "language_loss": 0.66702747, + "learning_rate": 1.1382777557461812e-06, + "loss": 0.68839788, + "num_input_tokens_seen": 234384825, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 10853, + "time_per_iteration": 2.4382283687591553 + }, + { + "auxiliary_loss_clip": 0.01026122, + "auxiliary_loss_mlp": 0.00996827, + "balance_loss_clip": 0.99562865, + "balance_loss_mlp": 1.00521636, + "epoch": 0.6525777844581392, + "flos": 71706894721920.0, + "grad_norm": 0.7308751423910714, + "language_loss": 0.62970364, + "learning_rate": 1.137926314758634e-06, + "loss": 0.64993316, + "num_input_tokens_seen": 234450630, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20898438, + "step": 10854, + "time_per_iteration": 3.1691970825195312 + }, + { + "auxiliary_loss_clip": 0.01104802, + "auxiliary_loss_mlp": 0.01030611, + "balance_loss_clip": 1.01792085, + "balance_loss_mlp": 1.03625202, + "epoch": 0.6526379077108072, + "flos": 26653115520000.0, + "grad_norm": 1.8459663187588897, + "language_loss": 0.77826589, + "learning_rate": 1.1375749064610072e-06, + "loss": 0.79962003, + "num_input_tokens_seen": 234473505, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 10855, + "time_per_iteration": 2.5133306980133057 + }, + { + "auxiliary_loss_clip": 0.01099784, + "auxiliary_loss_mlp": 0.01026139, + "balance_loss_clip": 1.01462901, + "balance_loss_mlp": 1.03466463, + "epoch": 0.6526980309634751, + "flos": 22820369673600.0, + "grad_norm": 1.7863182329630984, + "language_loss": 0.79166549, + "learning_rate": 1.1372235308666256e-06, + "loss": 0.81292474, + "num_input_tokens_seen": 234492485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 10856, + "time_per_iteration": 2.521003007888794 + }, + { + "auxiliary_loss_clip": 0.01103089, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01608038, + "balance_loss_mlp": 1.03572774, + "epoch": 0.6527581542161431, + "flos": 28365048696960.0, + "grad_norm": 1.7280049220035325, + "language_loss": 0.73561788, + "learning_rate": 1.136872187988815e-06, + "loss": 0.75694042, + "num_input_tokens_seen": 234512645, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 10857, + "time_per_iteration": 2.524388074874878 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01941204, + "balance_loss_mlp": 1.03619289, + "epoch": 0.652818277468811, + "flos": 18369206346240.0, + "grad_norm": 2.287513574647506, + "language_loss": 0.62553668, + "learning_rate": 1.1365208778408965e-06, + "loss": 0.64687705, + "num_input_tokens_seen": 234529310, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.67578125, + "step": 10858, + "time_per_iteration": 2.484292507171631 + }, + { + "auxiliary_loss_clip": 0.01100147, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.0192945, + "balance_loss_mlp": 1.03388333, + "epoch": 0.6528784007214791, + "flos": 18036170421120.0, + "grad_norm": 1.70957243248878, + "language_loss": 0.78181291, + "learning_rate": 1.1361696004361939e-06, + "loss": 0.80311966, + "num_input_tokens_seen": 234546685, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 10859, + "time_per_iteration": 2.4208006858825684 + }, + { + "auxiliary_loss_clip": 0.011057, + "auxiliary_loss_mlp": 0.01030498, + "balance_loss_clip": 1.01820755, + "balance_loss_mlp": 1.03562379, + "epoch": 0.652938523974147, + "flos": 22382008093440.0, + "grad_norm": 1.5618141301411743, + "language_loss": 0.67899007, + "learning_rate": 1.1358183557880256e-06, + "loss": 0.70035207, + "num_input_tokens_seen": 234566255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10860, + "time_per_iteration": 2.516052722930908 + }, + { + "auxiliary_loss_clip": 0.01106777, + "auxiliary_loss_mlp": 0.01026586, + "balance_loss_clip": 1.01489735, + "balance_loss_mlp": 1.03654599, + "epoch": 0.652998647226815, + "flos": 16764035368320.0, + "grad_norm": 2.1862353937135732, + "language_loss": 0.66182673, + "learning_rate": 1.135467143909712e-06, + "loss": 0.68316036, + "num_input_tokens_seen": 234585405, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.703125, + "step": 10861, + "time_per_iteration": 2.4207851886749268 + }, + { + "auxiliary_loss_clip": 0.01105314, + "auxiliary_loss_mlp": 0.0103272, + "balance_loss_clip": 1.01948178, + "balance_loss_mlp": 1.03619254, + "epoch": 0.6530587704794829, + "flos": 35772522019200.0, + "grad_norm": 1.7782678366068123, + "language_loss": 0.6507051, + "learning_rate": 1.135115964814572e-06, + "loss": 0.67208546, + "num_input_tokens_seen": 234608095, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 10862, + "time_per_iteration": 2.5804362297058105 + }, + { + "auxiliary_loss_clip": 0.0110242, + "auxiliary_loss_mlp": 0.0103057, + "balance_loss_clip": 1.01891708, + "balance_loss_mlp": 1.03588247, + "epoch": 0.6531188937321509, + "flos": 19316134638720.0, + "grad_norm": 1.5241362686221158, + "language_loss": 0.77193171, + "learning_rate": 1.13476481851592e-06, + "loss": 0.79326159, + "num_input_tokens_seen": 234627335, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 10863, + "time_per_iteration": 2.438044309616089 + }, + { + "auxiliary_loss_clip": 0.01103508, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.0157485, + "balance_loss_mlp": 1.03619623, + "epoch": 0.6531790169848188, + "flos": 22893771116160.0, + "grad_norm": 1.8164803813000403, + "language_loss": 0.7466498, + "learning_rate": 1.1344137050270739e-06, + "loss": 0.76795435, + "num_input_tokens_seen": 234646540, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10864, + "time_per_iteration": 2.4771134853363037 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.02253413, + "balance_loss_mlp": 1.03580821, + "epoch": 0.6532391402374869, + "flos": 29563530912000.0, + "grad_norm": 1.7514895317957062, + "language_loss": 0.8600319, + "learning_rate": 1.1340626243613458e-06, + "loss": 0.88139296, + "num_input_tokens_seen": 234665470, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 10865, + "time_per_iteration": 2.5002095699310303 + }, + { + "auxiliary_loss_clip": 0.01108321, + "auxiliary_loss_mlp": 0.01036311, + "balance_loss_clip": 1.02430654, + "balance_loss_mlp": 1.03760266, + "epoch": 0.6532992634901548, + "flos": 23105463920640.0, + "grad_norm": 3.5499069425062832, + "language_loss": 0.81403613, + "learning_rate": 1.133711576532051e-06, + "loss": 0.83548248, + "num_input_tokens_seen": 234683955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.70703125, + "step": 10866, + "time_per_iteration": 2.546633005142212 + }, + { + "auxiliary_loss_clip": 0.01102409, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01524687, + "balance_loss_mlp": 1.03626192, + "epoch": 0.6533593867428228, + "flos": 26067340523520.0, + "grad_norm": 1.4960309400225926, + "language_loss": 0.82321596, + "learning_rate": 1.1333605615524995e-06, + "loss": 0.8445099, + "num_input_tokens_seen": 234704595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 10867, + "time_per_iteration": 2.4704959392547607 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.01578307, + "balance_loss_mlp": 1.03344285, + "epoch": 0.6534195099954908, + "flos": 21212469262080.0, + "grad_norm": 1.873401062188488, + "language_loss": 0.81152415, + "learning_rate": 1.1330095794360016e-06, + "loss": 0.8328166, + "num_input_tokens_seen": 234724090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10868, + "time_per_iteration": 2.462496519088745 + }, + { + "auxiliary_loss_clip": 0.01106253, + "auxiliary_loss_mlp": 0.01028186, + "balance_loss_clip": 1.0159198, + "balance_loss_mlp": 1.03690481, + "epoch": 0.6534796332481587, + "flos": 19646584784640.0, + "grad_norm": 1.9591239016591335, + "language_loss": 0.79279351, + "learning_rate": 1.1326586301958675e-06, + "loss": 0.81413788, + "num_input_tokens_seen": 234742560, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 10869, + "time_per_iteration": 2.4351487159729004 + }, + { + "auxiliary_loss_clip": 0.0110718, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02241778, + "balance_loss_mlp": 1.03880501, + "epoch": 0.6535397565008267, + "flos": 24022479162240.0, + "grad_norm": 2.040320648065678, + "language_loss": 0.71729898, + "learning_rate": 1.1323077138454063e-06, + "loss": 0.73871845, + "num_input_tokens_seen": 234762315, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 10870, + "time_per_iteration": 2.5223138332366943 + }, + { + "auxiliary_loss_clip": 0.01104928, + "auxiliary_loss_mlp": 0.01035056, + "balance_loss_clip": 1.02319491, + "balance_loss_mlp": 1.0377295, + "epoch": 0.6535998797534947, + "flos": 24602759377920.0, + "grad_norm": 2.147023101303994, + "language_loss": 0.74992102, + "learning_rate": 1.1319568303979221e-06, + "loss": 0.77132088, + "num_input_tokens_seen": 234781300, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 10871, + "time_per_iteration": 2.469367265701294 + }, + { + "auxiliary_loss_clip": 0.01099729, + "auxiliary_loss_mlp": 0.01029902, + "balance_loss_clip": 1.01870823, + "balance_loss_mlp": 1.03503919, + "epoch": 0.6536600030061627, + "flos": 23364164649600.0, + "grad_norm": 1.7849990892484822, + "language_loss": 0.55615103, + "learning_rate": 1.1316059798667227e-06, + "loss": 0.5774473, + "num_input_tokens_seen": 234801040, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 10872, + "time_per_iteration": 2.4673538208007812 + }, + { + "auxiliary_loss_clip": 0.0110205, + "auxiliary_loss_mlp": 0.01034353, + "balance_loss_clip": 1.0230695, + "balance_loss_mlp": 1.03632164, + "epoch": 0.6537201262588306, + "flos": 23878477537920.0, + "grad_norm": 1.8219619398900448, + "language_loss": 0.75073338, + "learning_rate": 1.1312551622651112e-06, + "loss": 0.77209741, + "num_input_tokens_seen": 234821415, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 10873, + "time_per_iteration": 2.4655163288116455 + }, + { + "auxiliary_loss_clip": 0.01104694, + "auxiliary_loss_mlp": 0.01028464, + "balance_loss_clip": 1.0166738, + "balance_loss_mlp": 1.03662491, + "epoch": 0.6537802495114986, + "flos": 24354760901760.0, + "grad_norm": 1.5897958047644043, + "language_loss": 0.75623226, + "learning_rate": 1.1309043776063917e-06, + "loss": 0.77756387, + "num_input_tokens_seen": 234843795, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10874, + "time_per_iteration": 2.5224883556365967 + }, + { + "auxiliary_loss_clip": 0.01103714, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.0198009, + "balance_loss_mlp": 1.03682685, + "epoch": 0.6538403727641665, + "flos": 27996892248960.0, + "grad_norm": 2.7439070637520064, + "language_loss": 0.81423092, + "learning_rate": 1.1305536259038642e-06, + "loss": 0.83558643, + "num_input_tokens_seen": 234862350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 10875, + "time_per_iteration": 2.4869980812072754 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.0103789, + "balance_loss_clip": 1.02635086, + "balance_loss_mlp": 1.03504491, + "epoch": 0.6539004960168345, + "flos": 27563594486400.0, + "grad_norm": 1.6810546720157804, + "language_loss": 0.70045686, + "learning_rate": 1.1302029071708314e-06, + "loss": 0.72186041, + "num_input_tokens_seen": 234881790, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10876, + "time_per_iteration": 2.5040364265441895 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01034782, + "balance_loss_clip": 1.02289736, + "balance_loss_mlp": 1.03575683, + "epoch": 0.6539606192695024, + "flos": 14530067879040.0, + "grad_norm": 1.8217122109555075, + "language_loss": 0.7932229, + "learning_rate": 1.1298522214205908e-06, + "loss": 0.81460166, + "num_input_tokens_seen": 234897775, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 10877, + "time_per_iteration": 2.402308940887451 + }, + { + "auxiliary_loss_clip": 0.01103576, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01635098, + "balance_loss_mlp": 1.0359726, + "epoch": 0.6540207425221705, + "flos": 21616356764160.0, + "grad_norm": 2.189241924086369, + "language_loss": 0.7987535, + "learning_rate": 1.1295015686664408e-06, + "loss": 0.82006603, + "num_input_tokens_seen": 234918395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10878, + "time_per_iteration": 2.4780471324920654 + }, + { + "auxiliary_loss_clip": 0.01100458, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.0154438, + "balance_loss_mlp": 1.03370023, + "epoch": 0.6540808657748384, + "flos": 17668983640320.0, + "grad_norm": 1.8666542226247762, + "language_loss": 0.84453034, + "learning_rate": 1.1291509489216797e-06, + "loss": 0.86581039, + "num_input_tokens_seen": 234936260, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 10879, + "time_per_iteration": 2.4143741130828857 + }, + { + "auxiliary_loss_clip": 0.01104945, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01845217, + "balance_loss_mlp": 1.03493488, + "epoch": 0.6541409890275064, + "flos": 14538292093440.0, + "grad_norm": 2.8543762869506004, + "language_loss": 0.71946406, + "learning_rate": 1.128800362199601e-06, + "loss": 0.74081963, + "num_input_tokens_seen": 234952110, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 10880, + "time_per_iteration": 2.430192708969116 + }, + { + "auxiliary_loss_clip": 0.01100358, + "auxiliary_loss_mlp": 0.01030494, + "balance_loss_clip": 1.01899612, + "balance_loss_mlp": 1.03472471, + "epoch": 0.6542011122801744, + "flos": 17165301177600.0, + "grad_norm": 1.7514865003733433, + "language_loss": 0.84385759, + "learning_rate": 1.1284498085135005e-06, + "loss": 0.86516607, + "num_input_tokens_seen": 234970810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 10881, + "time_per_iteration": 2.4801900386810303 + }, + { + "auxiliary_loss_clip": 0.01105434, + "auxiliary_loss_mlp": 0.0103233, + "balance_loss_clip": 1.01909828, + "balance_loss_mlp": 1.03612447, + "epoch": 0.6542612355328423, + "flos": 18186600579840.0, + "grad_norm": 1.8305344772437837, + "language_loss": 0.77706677, + "learning_rate": 1.1280992878766699e-06, + "loss": 0.79844439, + "num_input_tokens_seen": 234989565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 10882, + "time_per_iteration": 2.4523637294769287 + }, + { + "auxiliary_loss_clip": 0.01106717, + "auxiliary_loss_mlp": 0.01029129, + "balance_loss_clip": 1.01632619, + "balance_loss_mlp": 1.03733766, + "epoch": 0.6543213587855103, + "flos": 19792453916160.0, + "grad_norm": 1.6779149142362604, + "language_loss": 0.82394373, + "learning_rate": 1.1277488003024024e-06, + "loss": 0.84530222, + "num_input_tokens_seen": 235007955, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 10883, + "time_per_iteration": 2.4265058040618896 + }, + { + "auxiliary_loss_clip": 0.01108268, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.02163935, + "balance_loss_mlp": 1.0390712, + "epoch": 0.6543814820381783, + "flos": 21105096531840.0, + "grad_norm": 2.382020741579914, + "language_loss": 0.85506725, + "learning_rate": 1.127398345803988e-06, + "loss": 0.87649274, + "num_input_tokens_seen": 235024860, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10884, + "time_per_iteration": 2.4697301387786865 + }, + { + "auxiliary_loss_clip": 0.0110574, + "auxiliary_loss_mlp": 0.01036191, + "balance_loss_clip": 1.02454472, + "balance_loss_mlp": 1.03698301, + "epoch": 0.6544416052908463, + "flos": 20194042947840.0, + "grad_norm": 10.527351582586146, + "language_loss": 0.80486369, + "learning_rate": 1.127047924394715e-06, + "loss": 0.82628304, + "num_input_tokens_seen": 235043815, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 10885, + "time_per_iteration": 3.9415979385375977 + }, + { + "auxiliary_loss_clip": 0.01103256, + "auxiliary_loss_mlp": 0.01026985, + "balance_loss_clip": 1.01527846, + "balance_loss_mlp": 1.03751159, + "epoch": 0.6545017285435142, + "flos": 23368258800000.0, + "grad_norm": 1.8132591830137343, + "language_loss": 0.72155404, + "learning_rate": 1.1266975360878722e-06, + "loss": 0.74285644, + "num_input_tokens_seen": 235062985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10886, + "time_per_iteration": 2.474519729614258 + }, + { + "auxiliary_loss_clip": 0.01101162, + "auxiliary_loss_mlp": 0.01029474, + "balance_loss_clip": 1.01850116, + "balance_loss_mlp": 1.03500915, + "epoch": 0.6545618517961822, + "flos": 19134714021120.0, + "grad_norm": 1.738538225206424, + "language_loss": 0.78089505, + "learning_rate": 1.1263471808967468e-06, + "loss": 0.80220145, + "num_input_tokens_seen": 235081670, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 10887, + "time_per_iteration": 2.4567511081695557 + }, + { + "auxiliary_loss_clip": 0.01102786, + "auxiliary_loss_mlp": 0.01030643, + "balance_loss_clip": 1.01913893, + "balance_loss_mlp": 1.03559566, + "epoch": 0.6546219750488501, + "flos": 14938624149120.0, + "grad_norm": 4.496679975000023, + "language_loss": 0.78967035, + "learning_rate": 1.1259968588346234e-06, + "loss": 0.81100464, + "num_input_tokens_seen": 235098510, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 10888, + "time_per_iteration": 5.19985818862915 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01028783, + "balance_loss_clip": 1.01808405, + "balance_loss_mlp": 1.03421295, + "epoch": 0.6546820983015181, + "flos": 36320518886400.0, + "grad_norm": 1.5919708412571818, + "language_loss": 0.66247272, + "learning_rate": 1.1256465699147874e-06, + "loss": 0.68375087, + "num_input_tokens_seen": 235119990, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 10889, + "time_per_iteration": 2.5679409503936768 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.01675916, + "balance_loss_mlp": 1.03473425, + "epoch": 0.654742221554186, + "flos": 20411446014720.0, + "grad_norm": 1.4966214179852624, + "language_loss": 0.79874986, + "learning_rate": 1.1252963141505203e-06, + "loss": 0.82006663, + "num_input_tokens_seen": 235139255, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 10890, + "time_per_iteration": 3.9007346630096436 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01683879, + "balance_loss_mlp": 1.03386474, + "epoch": 0.6548023448068541, + "flos": 24863650836480.0, + "grad_norm": 2.4806412573813494, + "language_loss": 0.65136874, + "learning_rate": 1.1249460915551052e-06, + "loss": 0.67268395, + "num_input_tokens_seen": 235158455, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 10891, + "time_per_iteration": 2.4762353897094727 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.01963139, + "balance_loss_mlp": 1.03584743, + "epoch": 0.654862468059522, + "flos": 21427573858560.0, + "grad_norm": 1.713176232540202, + "language_loss": 0.79329646, + "learning_rate": 1.1245959021418214e-06, + "loss": 0.81463599, + "num_input_tokens_seen": 235177350, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.67578125, + "step": 10892, + "time_per_iteration": 2.483430862426758 + }, + { + "auxiliary_loss_clip": 0.01108627, + "auxiliary_loss_mlp": 0.01034578, + "balance_loss_clip": 1.02300262, + "balance_loss_mlp": 1.03826213, + "epoch": 0.65492259131219, + "flos": 26577846570240.0, + "grad_norm": 1.927118370280093, + "language_loss": 0.77688205, + "learning_rate": 1.1242457459239497e-06, + "loss": 0.79831409, + "num_input_tokens_seen": 235196435, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.703125, + "step": 10893, + "time_per_iteration": 2.468653440475464 + }, + { + "auxiliary_loss_clip": 0.01107027, + "auxiliary_loss_mlp": 0.01026547, + "balance_loss_clip": 1.01425672, + "balance_loss_mlp": 1.03698456, + "epoch": 0.6549827145648579, + "flos": 21501334437120.0, + "grad_norm": 1.6133414191995223, + "language_loss": 0.7036956, + "learning_rate": 1.123895622914766e-06, + "loss": 0.72503132, + "num_input_tokens_seen": 235215430, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 10894, + "time_per_iteration": 2.454615592956543 + }, + { + "auxiliary_loss_clip": 0.01106124, + "auxiliary_loss_mlp": 0.0103336, + "balance_loss_clip": 1.02128386, + "balance_loss_mlp": 1.03594112, + "epoch": 0.6550428378175259, + "flos": 22594275515520.0, + "grad_norm": 4.213583210390945, + "language_loss": 0.63007772, + "learning_rate": 1.123545533127549e-06, + "loss": 0.65147251, + "num_input_tokens_seen": 235232015, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 10895, + "time_per_iteration": 2.4314959049224854 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01881742, + "balance_loss_mlp": 1.03532076, + "epoch": 0.655102961070194, + "flos": 12823809050880.0, + "grad_norm": 3.6304048273042717, + "language_loss": 0.7897135, + "learning_rate": 1.1231954765755722e-06, + "loss": 0.81104541, + "num_input_tokens_seen": 235248115, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 10896, + "time_per_iteration": 2.4550769329071045 + }, + { + "auxiliary_loss_clip": 0.01102279, + "auxiliary_loss_mlp": 0.01031456, + "balance_loss_clip": 1.01995277, + "balance_loss_mlp": 1.03664804, + "epoch": 0.6551630843228619, + "flos": 24791075406720.0, + "grad_norm": 1.4344785444999102, + "language_loss": 0.70384824, + "learning_rate": 1.1228454532721111e-06, + "loss": 0.72518563, + "num_input_tokens_seen": 235270785, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 10897, + "time_per_iteration": 2.478304624557495 + }, + { + "auxiliary_loss_clip": 0.01103619, + "auxiliary_loss_mlp": 0.010288, + "balance_loss_clip": 1.0175761, + "balance_loss_mlp": 1.03478158, + "epoch": 0.6552232075755299, + "flos": 16724461559040.0, + "grad_norm": 1.7387642279992266, + "language_loss": 0.75401318, + "learning_rate": 1.1224954632304391e-06, + "loss": 0.77533734, + "num_input_tokens_seen": 235287905, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10898, + "time_per_iteration": 2.4487948417663574 + }, + { + "auxiliary_loss_clip": 0.01105468, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02299678, + "balance_loss_mlp": 1.03773856, + "epoch": 0.6552833308281978, + "flos": 22016473338240.0, + "grad_norm": 2.1990983943767555, + "language_loss": 0.73518318, + "learning_rate": 1.122145506463827e-06, + "loss": 0.75658637, + "num_input_tokens_seen": 235305525, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 10899, + "time_per_iteration": 2.4304370880126953 + }, + { + "auxiliary_loss_clip": 0.0110359, + "auxiliary_loss_mlp": 0.01026488, + "balance_loss_clip": 1.01528871, + "balance_loss_mlp": 1.0364536, + "epoch": 0.6553434540808658, + "flos": 24863399441280.0, + "grad_norm": 2.1275272720256293, + "language_loss": 0.55958188, + "learning_rate": 1.1217955829855443e-06, + "loss": 0.58088267, + "num_input_tokens_seen": 235324415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 10900, + "time_per_iteration": 2.484473943710327 + }, + { + "auxiliary_loss_clip": 0.01104316, + "auxiliary_loss_mlp": 0.01032088, + "balance_loss_clip": 1.01939833, + "balance_loss_mlp": 1.03653932, + "epoch": 0.6554035773335337, + "flos": 23221060865280.0, + "grad_norm": 1.8846923286778847, + "language_loss": 0.76933706, + "learning_rate": 1.1214456928088622e-06, + "loss": 0.79070109, + "num_input_tokens_seen": 235341595, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 10901, + "time_per_iteration": 2.4382822513580322 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01434255, + "balance_loss_mlp": 1.03516734, + "epoch": 0.6554637005862017, + "flos": 22783597125120.0, + "grad_norm": 1.753856944987035, + "language_loss": 0.73216426, + "learning_rate": 1.1210958359470463e-06, + "loss": 0.75344282, + "num_input_tokens_seen": 235361700, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 10902, + "time_per_iteration": 2.49745774269104 + }, + { + "auxiliary_loss_clip": 0.01102831, + "auxiliary_loss_mlp": 0.01026395, + "balance_loss_clip": 1.01536822, + "balance_loss_mlp": 1.03652823, + "epoch": 0.6555238238388696, + "flos": 21507224267520.0, + "grad_norm": 1.6638199342391367, + "language_loss": 0.67729247, + "learning_rate": 1.1207460124133645e-06, + "loss": 0.69858468, + "num_input_tokens_seen": 235382065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 10903, + "time_per_iteration": 2.457672595977783 + }, + { + "auxiliary_loss_clip": 0.01106344, + "auxiliary_loss_mlp": 0.01034393, + "balance_loss_clip": 1.02201295, + "balance_loss_mlp": 1.03555727, + "epoch": 0.6555839470915377, + "flos": 30519473518080.0, + "grad_norm": 1.8258125512154932, + "language_loss": 0.66961503, + "learning_rate": 1.1203962222210832e-06, + "loss": 0.6910224, + "num_input_tokens_seen": 235402130, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 10904, + "time_per_iteration": 2.544079065322876 + }, + { + "auxiliary_loss_clip": 0.01103937, + "auxiliary_loss_mlp": 0.01035362, + "balance_loss_clip": 1.02261209, + "balance_loss_mlp": 1.03435302, + "epoch": 0.6556440703442056, + "flos": 24642943718400.0, + "grad_norm": 1.9965123681804708, + "language_loss": 0.90475762, + "learning_rate": 1.120046465383464e-06, + "loss": 0.92615068, + "num_input_tokens_seen": 235420435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 10905, + "time_per_iteration": 2.4607133865356445 + }, + { + "auxiliary_loss_clip": 0.01100631, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01922941, + "balance_loss_mlp": 1.03564942, + "epoch": 0.6557041935968736, + "flos": 23732464752000.0, + "grad_norm": 2.060465882995779, + "language_loss": 0.75227022, + "learning_rate": 1.1196967419137721e-06, + "loss": 0.77358085, + "num_input_tokens_seen": 235439960, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10906, + "time_per_iteration": 2.489344358444214 + }, + { + "auxiliary_loss_clip": 0.01108555, + "auxiliary_loss_mlp": 0.01037824, + "balance_loss_clip": 1.02571845, + "balance_loss_mlp": 1.03796065, + "epoch": 0.6557643168495415, + "flos": 11102753819520.0, + "grad_norm": 2.673517900647209, + "language_loss": 0.74337453, + "learning_rate": 1.119347051825267e-06, + "loss": 0.76483834, + "num_input_tokens_seen": 235457495, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.70703125, + "step": 10907, + "time_per_iteration": 2.4216673374176025 + }, + { + "auxiliary_loss_clip": 0.01102218, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.01732743, + "balance_loss_mlp": 1.03423953, + "epoch": 0.6558244401022095, + "flos": 30191034533760.0, + "grad_norm": 1.4101718899089066, + "language_loss": 0.72367519, + "learning_rate": 1.118997395131211e-06, + "loss": 0.74500179, + "num_input_tokens_seen": 235479525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 10908, + "time_per_iteration": 2.531003952026367 + }, + { + "auxiliary_loss_clip": 0.01105598, + "auxiliary_loss_mlp": 0.01032426, + "balance_loss_clip": 1.01962876, + "balance_loss_mlp": 1.03744864, + "epoch": 0.6558845633548775, + "flos": 17931060247680.0, + "grad_norm": 2.1513013799426868, + "language_loss": 0.81017995, + "learning_rate": 1.118647771844861e-06, + "loss": 0.83156013, + "num_input_tokens_seen": 235496305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 10909, + "time_per_iteration": 2.4130208492279053 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01034655, + "balance_loss_clip": 1.02167928, + "balance_loss_mlp": 1.0363667, + "epoch": 0.6559446866075455, + "flos": 21904144531200.0, + "grad_norm": 2.0430689174515098, + "language_loss": 0.63840532, + "learning_rate": 1.1182981819794767e-06, + "loss": 0.65980697, + "num_input_tokens_seen": 235512545, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 10910, + "time_per_iteration": 2.4513769149780273 + }, + { + "auxiliary_loss_clip": 0.01110874, + "auxiliary_loss_mlp": 0.01030758, + "balance_loss_clip": 1.0173471, + "balance_loss_mlp": 1.03761017, + "epoch": 0.6560048098602135, + "flos": 14127976056960.0, + "grad_norm": 3.983049569871041, + "language_loss": 0.76120275, + "learning_rate": 1.117948625548313e-06, + "loss": 0.78261906, + "num_input_tokens_seen": 235526045, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.734375, + "step": 10911, + "time_per_iteration": 2.421567440032959 + }, + { + "auxiliary_loss_clip": 0.01098552, + "auxiliary_loss_mlp": 0.01027915, + "balance_loss_clip": 1.01696563, + "balance_loss_mlp": 1.03389096, + "epoch": 0.6560649331128814, + "flos": 18807567926400.0, + "grad_norm": 2.6100669832011048, + "language_loss": 0.75670731, + "learning_rate": 1.1175991025646265e-06, + "loss": 0.77797198, + "num_input_tokens_seen": 235545285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 10912, + "time_per_iteration": 2.4657318592071533 + }, + { + "auxiliary_loss_clip": 0.01111745, + "auxiliary_loss_mlp": 0.01034368, + "balance_loss_clip": 1.02153468, + "balance_loss_mlp": 1.03876007, + "epoch": 0.6561250563655494, + "flos": 17053618815360.0, + "grad_norm": 1.5787420401710588, + "language_loss": 0.77322382, + "learning_rate": 1.1172496130416697e-06, + "loss": 0.79468495, + "num_input_tokens_seen": 235563150, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.73046875, + "step": 10913, + "time_per_iteration": 2.4153146743774414 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01026757, + "balance_loss_clip": 1.01626611, + "balance_loss_mlp": 1.03441024, + "epoch": 0.6561851796182173, + "flos": 22637656166400.0, + "grad_norm": 1.9167212276506074, + "language_loss": 0.70828009, + "learning_rate": 1.1169001569926961e-06, + "loss": 0.72954357, + "num_input_tokens_seen": 235582535, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65234375, + "step": 10914, + "time_per_iteration": 2.4597549438476562 + }, + { + "auxiliary_loss_clip": 0.01103262, + "auxiliary_loss_mlp": 0.01030743, + "balance_loss_clip": 1.01875639, + "balance_loss_mlp": 1.03628445, + "epoch": 0.6562453028708853, + "flos": 19239213663360.0, + "grad_norm": 1.851270541448462, + "language_loss": 0.73936331, + "learning_rate": 1.116550734430958e-06, + "loss": 0.76070333, + "num_input_tokens_seen": 235601490, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 10915, + "time_per_iteration": 2.4307053089141846 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01032785, + "balance_loss_clip": 1.02053595, + "balance_loss_mlp": 1.034675, + "epoch": 0.6563054261235532, + "flos": 23801305167360.0, + "grad_norm": 1.6584707758046542, + "language_loss": 0.79572797, + "learning_rate": 1.1162013453697042e-06, + "loss": 0.8170594, + "num_input_tokens_seen": 235619165, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 10916, + "time_per_iteration": 2.4956743717193604 + }, + { + "auxiliary_loss_clip": 0.01103152, + "auxiliary_loss_mlp": 0.01034727, + "balance_loss_clip": 1.02357495, + "balance_loss_mlp": 1.03500533, + "epoch": 0.6563655493762213, + "flos": 19240039676160.0, + "grad_norm": 1.9383516308380546, + "language_loss": 0.76153994, + "learning_rate": 1.1158519898221831e-06, + "loss": 0.78291869, + "num_input_tokens_seen": 235637115, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 10917, + "time_per_iteration": 2.4713754653930664 + }, + { + "auxiliary_loss_clip": 0.01101411, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.01656687, + "balance_loss_mlp": 1.03484607, + "epoch": 0.6564256726288892, + "flos": 25556439427200.0, + "grad_norm": 1.8282774447422543, + "language_loss": 0.69401765, + "learning_rate": 1.1155026678016445e-06, + "loss": 0.71531153, + "num_input_tokens_seen": 235656330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10918, + "time_per_iteration": 2.500551462173462 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01037247, + "balance_loss_clip": 1.02611244, + "balance_loss_mlp": 1.03552103, + "epoch": 0.6564857958815572, + "flos": 22200623389440.0, + "grad_norm": 1.7922194863374643, + "language_loss": 0.76487136, + "learning_rate": 1.115153379321332e-06, + "loss": 0.78623426, + "num_input_tokens_seen": 235674510, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 10919, + "time_per_iteration": 2.4698684215545654 + }, + { + "auxiliary_loss_clip": 0.01026665, + "auxiliary_loss_mlp": 0.01002269, + "balance_loss_clip": 1.00099361, + "balance_loss_mlp": 1.00584173, + "epoch": 0.6565459191342251, + "flos": 58123144604160.0, + "grad_norm": 0.7207598722602275, + "language_loss": 0.5307852, + "learning_rate": 1.1148041243944931e-06, + "loss": 0.55107456, + "num_input_tokens_seen": 235735050, + "router_z_loss_clip": 0.01275635, + "router_z_loss_mlp": 0.20898438, + "step": 10920, + "time_per_iteration": 3.0821664333343506 + }, + { + "auxiliary_loss_clip": 0.01101918, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01682281, + "balance_loss_mlp": 1.03579378, + "epoch": 0.6566060423868931, + "flos": 30809631582720.0, + "grad_norm": 1.482616976222016, + "language_loss": 0.65204817, + "learning_rate": 1.1144549030343697e-06, + "loss": 0.6733548, + "num_input_tokens_seen": 235757545, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10921, + "time_per_iteration": 2.515620231628418 + }, + { + "auxiliary_loss_clip": 0.01100913, + "auxiliary_loss_mlp": 0.0103275, + "balance_loss_clip": 1.01986957, + "balance_loss_mlp": 1.03486526, + "epoch": 0.6566661656395612, + "flos": 23367432787200.0, + "grad_norm": 1.8313420178351358, + "language_loss": 0.81071579, + "learning_rate": 1.114105715254205e-06, + "loss": 0.83205247, + "num_input_tokens_seen": 235777265, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66015625, + "step": 10922, + "time_per_iteration": 2.454880714416504 + }, + { + "auxiliary_loss_clip": 0.01105049, + "auxiliary_loss_mlp": 0.01032776, + "balance_loss_clip": 1.02074158, + "balance_loss_mlp": 1.03742886, + "epoch": 0.6567262888922291, + "flos": 25735597488000.0, + "grad_norm": 2.376840972990548, + "language_loss": 0.71632755, + "learning_rate": 1.1137565610672414e-06, + "loss": 0.73770583, + "num_input_tokens_seen": 235796565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 10923, + "time_per_iteration": 2.5216050148010254 + }, + { + "auxiliary_loss_clip": 0.01106548, + "auxiliary_loss_mlp": 0.01030386, + "balance_loss_clip": 1.01902556, + "balance_loss_mlp": 1.03784943, + "epoch": 0.6567864121448971, + "flos": 17123716206720.0, + "grad_norm": 1.9379255151150183, + "language_loss": 0.80668283, + "learning_rate": 1.1134074404867169e-06, + "loss": 0.82805216, + "num_input_tokens_seen": 235814805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 10924, + "time_per_iteration": 2.420976400375366 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01838779, + "balance_loss_mlp": 1.03694773, + "epoch": 0.656846535397565, + "flos": 22419319345920.0, + "grad_norm": 1.6223500631493692, + "language_loss": 0.72360754, + "learning_rate": 1.1130583535258717e-06, + "loss": 0.74493784, + "num_input_tokens_seen": 235833405, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 10925, + "time_per_iteration": 2.48442006111145 + }, + { + "auxiliary_loss_clip": 0.011034, + "auxiliary_loss_mlp": 0.01026622, + "balance_loss_clip": 1.01515365, + "balance_loss_mlp": 1.03553128, + "epoch": 0.656906658650233, + "flos": 17704535126400.0, + "grad_norm": 2.3888033375770266, + "language_loss": 0.72365135, + "learning_rate": 1.112709300197942e-06, + "loss": 0.74495161, + "num_input_tokens_seen": 235848530, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 10926, + "time_per_iteration": 2.426408052444458 + }, + { + "auxiliary_loss_clip": 0.01104746, + "auxiliary_loss_mlp": 0.01030909, + "balance_loss_clip": 1.01850533, + "balance_loss_mlp": 1.03482258, + "epoch": 0.6569667819029009, + "flos": 21175158009600.0, + "grad_norm": 1.681586343154767, + "language_loss": 0.72273743, + "learning_rate": 1.1123602805161656e-06, + "loss": 0.74409401, + "num_input_tokens_seen": 235867225, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 10927, + "time_per_iteration": 3.80648136138916 + }, + { + "auxiliary_loss_clip": 0.01026322, + "auxiliary_loss_mlp": 0.01001587, + "balance_loss_clip": 1.00027585, + "balance_loss_mlp": 1.00539577, + "epoch": 0.6570269051555689, + "flos": 68761897511040.0, + "grad_norm": 0.7330380682962492, + "language_loss": 0.64455849, + "learning_rate": 1.112011294493775e-06, + "loss": 0.66483754, + "num_input_tokens_seen": 235932925, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.20898438, + "step": 10928, + "time_per_iteration": 3.092785120010376 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01669884, + "balance_loss_mlp": 1.03520453, + "epoch": 0.6570870284082369, + "flos": 26319289495680.0, + "grad_norm": 1.7549487521997071, + "language_loss": 0.77955842, + "learning_rate": 1.1116623421440063e-06, + "loss": 0.80085671, + "num_input_tokens_seen": 235952680, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 10929, + "time_per_iteration": 4.023591041564941 + }, + { + "auxiliary_loss_clip": 0.01102136, + "auxiliary_loss_mlp": 0.01031793, + "balance_loss_clip": 1.02030087, + "balance_loss_mlp": 1.0353775, + "epoch": 0.6571471516609049, + "flos": 26174749167360.0, + "grad_norm": 1.6371374390238511, + "language_loss": 0.65487254, + "learning_rate": 1.1113134234800895e-06, + "loss": 0.67621183, + "num_input_tokens_seen": 235972075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 10930, + "time_per_iteration": 3.8790106773376465 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01028907, + "balance_loss_clip": 1.01679564, + "balance_loss_mlp": 1.03432441, + "epoch": 0.6572072749135728, + "flos": 20376253664640.0, + "grad_norm": 1.5199914554797245, + "language_loss": 0.70439506, + "learning_rate": 1.110964538515258e-06, + "loss": 0.72570413, + "num_input_tokens_seen": 235990340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10931, + "time_per_iteration": 3.8428475856781006 + }, + { + "auxiliary_loss_clip": 0.01105703, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02384758, + "balance_loss_mlp": 1.03632128, + "epoch": 0.6572673981662408, + "flos": 17128744110720.0, + "grad_norm": 2.0176400266990147, + "language_loss": 0.68914682, + "learning_rate": 1.1106156872627393e-06, + "loss": 0.71055984, + "num_input_tokens_seen": 236007470, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 10932, + "time_per_iteration": 2.427386999130249 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.01668537, + "balance_loss_mlp": 1.03434443, + "epoch": 0.6573275214189087, + "flos": 41275113281280.0, + "grad_norm": 2.41406977097007, + "language_loss": 0.80051857, + "learning_rate": 1.1102668697357626e-06, + "loss": 0.82180607, + "num_input_tokens_seen": 236029030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10933, + "time_per_iteration": 2.5989818572998047 + }, + { + "auxiliary_loss_clip": 0.01106278, + "auxiliary_loss_mlp": 0.01030089, + "balance_loss_clip": 1.01818609, + "balance_loss_mlp": 1.03827631, + "epoch": 0.6573876446715767, + "flos": 22890143842560.0, + "grad_norm": 1.7962352646576603, + "language_loss": 0.73653376, + "learning_rate": 1.1099180859475571e-06, + "loss": 0.75789738, + "num_input_tokens_seen": 236047160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10934, + "time_per_iteration": 2.513033390045166 + }, + { + "auxiliary_loss_clip": 0.01101364, + "auxiliary_loss_mlp": 0.01031486, + "balance_loss_clip": 1.01991725, + "balance_loss_mlp": 1.03564167, + "epoch": 0.6574477679242448, + "flos": 44018150273280.0, + "grad_norm": 1.5095272560756583, + "language_loss": 0.7590912, + "learning_rate": 1.1095693359113454e-06, + "loss": 0.78041971, + "num_input_tokens_seen": 236069215, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 10935, + "time_per_iteration": 2.7678496837615967 + }, + { + "auxiliary_loss_clip": 0.0110481, + "auxiliary_loss_mlp": 0.0103704, + "balance_loss_clip": 1.02380824, + "balance_loss_mlp": 1.03610992, + "epoch": 0.6575078911769127, + "flos": 24571517523840.0, + "grad_norm": 1.8441545252151383, + "language_loss": 0.78123999, + "learning_rate": 1.1092206196403538e-06, + "loss": 0.8026585, + "num_input_tokens_seen": 236088335, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 10936, + "time_per_iteration": 2.5077192783355713 + }, + { + "auxiliary_loss_clip": 0.01099758, + "auxiliary_loss_mlp": 0.01029165, + "balance_loss_clip": 1.01789331, + "balance_loss_mlp": 1.03462768, + "epoch": 0.6575680144295807, + "flos": 20924035050240.0, + "grad_norm": 2.0488788051519777, + "language_loss": 0.68872631, + "learning_rate": 1.1088719371478056e-06, + "loss": 0.71001554, + "num_input_tokens_seen": 236108540, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 10937, + "time_per_iteration": 2.5001776218414307 + }, + { + "auxiliary_loss_clip": 0.01102833, + "auxiliary_loss_mlp": 0.01027511, + "balance_loss_clip": 1.01570368, + "balance_loss_mlp": 1.03619266, + "epoch": 0.6576281376822486, + "flos": 10925642833920.0, + "grad_norm": 2.29220645619057, + "language_loss": 0.68323117, + "learning_rate": 1.1085232884469236e-06, + "loss": 0.70453459, + "num_input_tokens_seen": 236124495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 10938, + "time_per_iteration": 2.4366493225097656 + }, + { + "auxiliary_loss_clip": 0.01105738, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01890206, + "balance_loss_mlp": 1.03749824, + "epoch": 0.6576882609349166, + "flos": 19281552819840.0, + "grad_norm": 2.075102589417424, + "language_loss": 0.71458369, + "learning_rate": 1.108174673550927e-06, + "loss": 0.73595071, + "num_input_tokens_seen": 236142550, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 10939, + "time_per_iteration": 2.4688596725463867 + }, + { + "auxiliary_loss_clip": 0.0110619, + "auxiliary_loss_mlp": 0.01029227, + "balance_loss_clip": 1.01679969, + "balance_loss_mlp": 1.03710163, + "epoch": 0.6577483841875845, + "flos": 20220544206720.0, + "grad_norm": 2.217107584857945, + "language_loss": 0.77532256, + "learning_rate": 1.107826092473037e-06, + "loss": 0.7966767, + "num_input_tokens_seen": 236156620, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 10940, + "time_per_iteration": 2.425093412399292 + }, + { + "auxiliary_loss_clip": 0.0110778, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01946735, + "balance_loss_mlp": 1.03589988, + "epoch": 0.6578085074402525, + "flos": 34751078962560.0, + "grad_norm": 2.046264853980575, + "language_loss": 0.68482137, + "learning_rate": 1.107477545226471e-06, + "loss": 0.70621532, + "num_input_tokens_seen": 236177095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.71875, + "step": 10941, + "time_per_iteration": 2.5489418506622314 + }, + { + "auxiliary_loss_clip": 0.01100409, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01428187, + "balance_loss_mlp": 1.03322697, + "epoch": 0.6578686306929205, + "flos": 23470998675840.0, + "grad_norm": 1.8711951914026155, + "language_loss": 0.68390548, + "learning_rate": 1.1071290318244448e-06, + "loss": 0.70516968, + "num_input_tokens_seen": 236194695, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 10942, + "time_per_iteration": 2.486746072769165 + }, + { + "auxiliary_loss_clip": 0.0110907, + "auxiliary_loss_mlp": 0.01035043, + "balance_loss_clip": 1.02216208, + "balance_loss_mlp": 1.03639185, + "epoch": 0.6579287539455885, + "flos": 18077073033600.0, + "grad_norm": 2.0678514729005544, + "language_loss": 0.71317995, + "learning_rate": 1.1067805522801753e-06, + "loss": 0.73462105, + "num_input_tokens_seen": 236213885, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7265625, + "step": 10943, + "time_per_iteration": 2.4520316123962402 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.01030183, + "balance_loss_clip": 1.01820219, + "balance_loss_mlp": 1.03616333, + "epoch": 0.6579888771982564, + "flos": 28661383900800.0, + "grad_norm": 1.7679689812851298, + "language_loss": 0.59513438, + "learning_rate": 1.1064321066068778e-06, + "loss": 0.61645675, + "num_input_tokens_seen": 236237315, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 10944, + "time_per_iteration": 2.5190436840057373 + }, + { + "auxiliary_loss_clip": 0.01108265, + "auxiliary_loss_mlp": 0.01035357, + "balance_loss_clip": 1.02253008, + "balance_loss_mlp": 1.03664446, + "epoch": 0.6580490004509244, + "flos": 25046543911680.0, + "grad_norm": 1.558618410146096, + "language_loss": 0.72308242, + "learning_rate": 1.1060836948177646e-06, + "loss": 0.74451864, + "num_input_tokens_seen": 236256345, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 10945, + "time_per_iteration": 2.463829755783081 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.01497078, + "balance_loss_mlp": 1.03548717, + "epoch": 0.6581091237035923, + "flos": 43508793461760.0, + "grad_norm": 1.6917792730430523, + "language_loss": 0.70766807, + "learning_rate": 1.105735316926046e-06, + "loss": 0.7289511, + "num_input_tokens_seen": 236281890, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 10946, + "time_per_iteration": 2.6370081901550293 + }, + { + "auxiliary_loss_clip": 0.01104509, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.0187701, + "balance_loss_mlp": 1.03649974, + "epoch": 0.6581692469562603, + "flos": 22415404763520.0, + "grad_norm": 1.9998217553522297, + "language_loss": 0.81970888, + "learning_rate": 1.105386972944934e-06, + "loss": 0.84106112, + "num_input_tokens_seen": 236298370, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 10947, + "time_per_iteration": 2.44291090965271 + }, + { + "auxiliary_loss_clip": 0.01105119, + "auxiliary_loss_mlp": 0.01028432, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03552985, + "epoch": 0.6582293702089284, + "flos": 24859772167680.0, + "grad_norm": 1.5893547671126769, + "language_loss": 0.77298671, + "learning_rate": 1.1050386628876385e-06, + "loss": 0.79432225, + "num_input_tokens_seen": 236317380, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6953125, + "step": 10948, + "time_per_iteration": 2.52156400680542 + }, + { + "auxiliary_loss_clip": 0.01103491, + "auxiliary_loss_mlp": 0.01024697, + "balance_loss_clip": 1.01318693, + "balance_loss_mlp": 1.03675056, + "epoch": 0.6582894934615963, + "flos": 23039676161280.0, + "grad_norm": 1.5781773720774923, + "language_loss": 0.79309839, + "learning_rate": 1.1046903867673655e-06, + "loss": 0.81438029, + "num_input_tokens_seen": 236336210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 10949, + "time_per_iteration": 2.4466731548309326 + }, + { + "auxiliary_loss_clip": 0.01025722, + "auxiliary_loss_mlp": 0.00999404, + "balance_loss_clip": 0.99824774, + "balance_loss_mlp": 1.0049113, + "epoch": 0.6583496167142643, + "flos": 72551980978560.0, + "grad_norm": 0.7326202101084998, + "language_loss": 0.61823738, + "learning_rate": 1.104342144597323e-06, + "loss": 0.63848865, + "num_input_tokens_seen": 236403090, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.20800781, + "step": 10950, + "time_per_iteration": 3.121711015701294 + }, + { + "auxiliary_loss_clip": 0.01098873, + "auxiliary_loss_mlp": 0.01029132, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.0340389, + "epoch": 0.6584097399669322, + "flos": 13078846592640.0, + "grad_norm": 2.039519263453104, + "language_loss": 0.67086935, + "learning_rate": 1.1039939363907178e-06, + "loss": 0.69214934, + "num_input_tokens_seen": 236420475, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 10951, + "time_per_iteration": 2.4204366207122803 + }, + { + "auxiliary_loss_clip": 0.01103981, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01967382, + "balance_loss_mlp": 1.03702927, + "epoch": 0.6584698632196002, + "flos": 28693164458880.0, + "grad_norm": 1.3948057696634335, + "language_loss": 0.76445824, + "learning_rate": 1.1036457621607504e-06, + "loss": 0.7858094, + "num_input_tokens_seen": 236441915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 10952, + "time_per_iteration": 2.5405352115631104 + }, + { + "auxiliary_loss_clip": 0.01101736, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.0193491, + "balance_loss_mlp": 1.03628421, + "epoch": 0.6585299864722681, + "flos": 14319272914560.0, + "grad_norm": 1.8480440869895376, + "language_loss": 0.73304069, + "learning_rate": 1.1032976219206257e-06, + "loss": 0.75436854, + "num_input_tokens_seen": 236460340, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 10953, + "time_per_iteration": 2.4275546073913574 + }, + { + "auxiliary_loss_clip": 0.01103382, + "auxiliary_loss_mlp": 0.01035829, + "balance_loss_clip": 1.02360368, + "balance_loss_mlp": 1.0364027, + "epoch": 0.6585901097249361, + "flos": 26797907243520.0, + "grad_norm": 2.01659222308535, + "language_loss": 0.78839052, + "learning_rate": 1.102949515683546e-06, + "loss": 0.80978262, + "num_input_tokens_seen": 236478280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 10954, + "time_per_iteration": 2.515486478805542 + }, + { + "auxiliary_loss_clip": 0.01105192, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.01928055, + "balance_loss_mlp": 1.0370729, + "epoch": 0.658650232977604, + "flos": 18733124989440.0, + "grad_norm": 4.542628698192554, + "language_loss": 0.69261253, + "learning_rate": 1.1026014434627096e-06, + "loss": 0.71397316, + "num_input_tokens_seen": 236493225, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 10955, + "time_per_iteration": 2.4162137508392334 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01031102, + "balance_loss_clip": 1.02065945, + "balance_loss_mlp": 1.03447628, + "epoch": 0.6587103562302721, + "flos": 24753440931840.0, + "grad_norm": 1.9435823457200367, + "language_loss": 0.8063699, + "learning_rate": 1.1022534052713172e-06, + "loss": 0.82767057, + "num_input_tokens_seen": 236514420, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 10956, + "time_per_iteration": 2.501207113265991 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.0103857, + "balance_loss_clip": 1.02636909, + "balance_loss_mlp": 1.03677917, + "epoch": 0.65877047948294, + "flos": 22346133384960.0, + "grad_norm": 2.2587354412030365, + "language_loss": 0.8126533, + "learning_rate": 1.1019054011225648e-06, + "loss": 0.83407611, + "num_input_tokens_seen": 236532785, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 10957, + "time_per_iteration": 2.4624950885772705 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.01027592, + "balance_loss_clip": 1.01656473, + "balance_loss_mlp": 1.03620899, + "epoch": 0.658830602735608, + "flos": 45180542298240.0, + "grad_norm": 1.8981628531368988, + "language_loss": 0.76096463, + "learning_rate": 1.1015574310296506e-06, + "loss": 0.78225374, + "num_input_tokens_seen": 236553330, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 10958, + "time_per_iteration": 2.6494197845458984 + }, + { + "auxiliary_loss_clip": 0.01101191, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01818335, + "balance_loss_mlp": 1.03651094, + "epoch": 0.6588907259882759, + "flos": 19901622326400.0, + "grad_norm": 1.5449360693578584, + "language_loss": 0.7480197, + "learning_rate": 1.1012094950057678e-06, + "loss": 0.76932859, + "num_input_tokens_seen": 236572960, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 10959, + "time_per_iteration": 2.427396535873413 + }, + { + "auxiliary_loss_clip": 0.01102895, + "auxiliary_loss_mlp": 0.0102741, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.03627992, + "epoch": 0.6589508492409439, + "flos": 24133766474880.0, + "grad_norm": 1.5048251142631304, + "language_loss": 0.64632499, + "learning_rate": 1.1008615930641107e-06, + "loss": 0.66762793, + "num_input_tokens_seen": 236594090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 10960, + "time_per_iteration": 2.4602410793304443 + }, + { + "auxiliary_loss_clip": 0.01108237, + "auxiliary_loss_mlp": 0.01031743, + "balance_loss_clip": 1.01920843, + "balance_loss_mlp": 1.03767896, + "epoch": 0.659010972493612, + "flos": 18222906251520.0, + "grad_norm": 2.0928832268916064, + "language_loss": 0.81810492, + "learning_rate": 1.1005137252178734e-06, + "loss": 0.83950472, + "num_input_tokens_seen": 236610190, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 10961, + "time_per_iteration": 2.409662961959839 + }, + { + "auxiliary_loss_clip": 0.01105671, + "auxiliary_loss_mlp": 0.01028804, + "balance_loss_clip": 1.01721096, + "balance_loss_mlp": 1.03837204, + "epoch": 0.6590710957462799, + "flos": 27600007898880.0, + "grad_norm": 1.6316286919602636, + "language_loss": 0.73185778, + "learning_rate": 1.1001658914802453e-06, + "loss": 0.7532025, + "num_input_tokens_seen": 236631575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 10962, + "time_per_iteration": 2.5012168884277344 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01027399, + "balance_loss_clip": 1.016325, + "balance_loss_mlp": 1.03553998, + "epoch": 0.6591312189989479, + "flos": 20302959962880.0, + "grad_norm": 2.292666509682468, + "language_loss": 0.7991221, + "learning_rate": 1.0998180918644165e-06, + "loss": 0.8204354, + "num_input_tokens_seen": 236649815, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 10963, + "time_per_iteration": 2.4411072731018066 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01026818, + "balance_loss_clip": 1.01545739, + "balance_loss_mlp": 1.03585351, + "epoch": 0.6591913422516158, + "flos": 12312943868160.0, + "grad_norm": 1.6740266575713383, + "language_loss": 0.78245199, + "learning_rate": 1.0994703263835754e-06, + "loss": 0.8037318, + "num_input_tokens_seen": 236668335, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 10964, + "time_per_iteration": 2.5599732398986816 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.01032339, + "balance_loss_clip": 1.02130592, + "balance_loss_mlp": 1.03435874, + "epoch": 0.6592514655042838, + "flos": 25884591102720.0, + "grad_norm": 1.7118472944354244, + "language_loss": 0.74207413, + "learning_rate": 1.0991225950509106e-06, + "loss": 0.76342809, + "num_input_tokens_seen": 236688945, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6875, + "step": 10965, + "time_per_iteration": 2.471712112426758 + }, + { + "auxiliary_loss_clip": 0.01107005, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.01877689, + "balance_loss_mlp": 1.03634071, + "epoch": 0.6593115887569517, + "flos": 14063624841600.0, + "grad_norm": 1.7378396373661993, + "language_loss": 0.73264408, + "learning_rate": 1.0987748978796067e-06, + "loss": 0.75402158, + "num_input_tokens_seen": 236707055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.70703125, + "step": 10966, + "time_per_iteration": 2.436239004135132 + }, + { + "auxiliary_loss_clip": 0.0110244, + "auxiliary_loss_mlp": 0.01030113, + "balance_loss_clip": 1.01798916, + "balance_loss_mlp": 1.03512931, + "epoch": 0.6593717120096197, + "flos": 24717925359360.0, + "grad_norm": 1.8788551125386406, + "language_loss": 0.77065092, + "learning_rate": 1.0984272348828487e-06, + "loss": 0.79197645, + "num_input_tokens_seen": 236725900, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 10967, + "time_per_iteration": 2.4717586040496826 + }, + { + "auxiliary_loss_clip": 0.01027072, + "auxiliary_loss_mlp": 0.01001789, + "balance_loss_clip": 1.00063896, + "balance_loss_mlp": 1.00624704, + "epoch": 0.6594318352622877, + "flos": 55558083502080.0, + "grad_norm": 0.6916872612313274, + "language_loss": 0.48437804, + "learning_rate": 1.0980796060738221e-06, + "loss": 0.50466669, + "num_input_tokens_seen": 236788415, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20898438, + "step": 10968, + "time_per_iteration": 4.5336384773254395 + }, + { + "auxiliary_loss_clip": 0.01103459, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.01853621, + "balance_loss_mlp": 1.03579104, + "epoch": 0.6594919585149557, + "flos": 17456931699840.0, + "grad_norm": 1.9909395686766433, + "language_loss": 0.79144681, + "learning_rate": 1.0977320114657058e-06, + "loss": 0.81278479, + "num_input_tokens_seen": 236805155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 10969, + "time_per_iteration": 2.4394266605377197 + }, + { + "auxiliary_loss_clip": 0.01101468, + "auxiliary_loss_mlp": 0.01027571, + "balance_loss_clip": 1.0165081, + "balance_loss_mlp": 1.03489542, + "epoch": 0.6595520817676236, + "flos": 18223229473920.0, + "grad_norm": 1.9980021115439661, + "language_loss": 0.65425408, + "learning_rate": 1.0973844510716817e-06, + "loss": 0.6755445, + "num_input_tokens_seen": 236824360, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 10970, + "time_per_iteration": 2.421241521835327 + }, + { + "auxiliary_loss_clip": 0.01103326, + "auxiliary_loss_mlp": 0.0102615, + "balance_loss_clip": 1.01499188, + "balance_loss_mlp": 1.0361867, + "epoch": 0.6596122050202916, + "flos": 22199761463040.0, + "grad_norm": 1.9709453771316594, + "language_loss": 0.76396167, + "learning_rate": 1.0970369249049308e-06, + "loss": 0.78525639, + "num_input_tokens_seen": 236844640, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 10971, + "time_per_iteration": 5.2941343784332275 + }, + { + "auxiliary_loss_clip": 0.01105265, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.02074528, + "balance_loss_mlp": 1.03658032, + "epoch": 0.6596723282729595, + "flos": 14173834746240.0, + "grad_norm": 2.436761152631742, + "language_loss": 0.70031983, + "learning_rate": 1.096689432978629e-06, + "loss": 0.72169238, + "num_input_tokens_seen": 236861160, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6875, + "step": 10972, + "time_per_iteration": 2.434751033782959 + }, + { + "auxiliary_loss_clip": 0.01104063, + "auxiliary_loss_mlp": 0.01026316, + "balance_loss_clip": 1.01401901, + "balance_loss_mlp": 1.03706002, + "epoch": 0.6597324515256275, + "flos": 30553193410560.0, + "grad_norm": 2.0552877724786347, + "language_loss": 0.55426097, + "learning_rate": 1.0963419753059556e-06, + "loss": 0.5755648, + "num_input_tokens_seen": 236880465, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 10973, + "time_per_iteration": 3.9870107173919678 + }, + { + "auxiliary_loss_clip": 0.01108369, + "auxiliary_loss_mlp": 0.01034143, + "balance_loss_clip": 1.0224663, + "balance_loss_mlp": 1.0379895, + "epoch": 0.6597925747782956, + "flos": 17639860688640.0, + "grad_norm": 1.9173473771897223, + "language_loss": 0.78754056, + "learning_rate": 1.0959945519000839e-06, + "loss": 0.80896568, + "num_input_tokens_seen": 236897730, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.703125, + "step": 10974, + "time_per_iteration": 2.413245916366577 + }, + { + "auxiliary_loss_clip": 0.01104385, + "auxiliary_loss_mlp": 0.01031342, + "balance_loss_clip": 1.01999879, + "balance_loss_mlp": 1.03666687, + "epoch": 0.6598526980309635, + "flos": 22819112697600.0, + "grad_norm": 2.1994599169674016, + "language_loss": 0.69061923, + "learning_rate": 1.0956471627741906e-06, + "loss": 0.71197647, + "num_input_tokens_seen": 236917300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 10975, + "time_per_iteration": 2.488288164138794 + }, + { + "auxiliary_loss_clip": 0.01101915, + "auxiliary_loss_mlp": 0.01026336, + "balance_loss_clip": 1.01519537, + "balance_loss_mlp": 1.03476441, + "epoch": 0.6599128212836315, + "flos": 21068036674560.0, + "grad_norm": 1.699075414788055, + "language_loss": 0.7082206, + "learning_rate": 1.0952998079414464e-06, + "loss": 0.72950304, + "num_input_tokens_seen": 236935590, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 10976, + "time_per_iteration": 2.4436802864074707 + }, + { + "auxiliary_loss_clip": 0.01099428, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.01890898, + "balance_loss_mlp": 1.03462744, + "epoch": 0.6599729445362994, + "flos": 22163527618560.0, + "grad_norm": 1.7471383506629494, + "language_loss": 0.6767379, + "learning_rate": 1.0949524874150243e-06, + "loss": 0.69804019, + "num_input_tokens_seen": 236952830, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 10977, + "time_per_iteration": 2.4598448276519775 + }, + { + "auxiliary_loss_clip": 0.01108053, + "auxiliary_loss_mlp": 0.01027322, + "balance_loss_clip": 1.01457834, + "balance_loss_mlp": 1.03748345, + "epoch": 0.6600330677889674, + "flos": 18150079426560.0, + "grad_norm": 2.0162776681697476, + "language_loss": 0.81473112, + "learning_rate": 1.0946052012080952e-06, + "loss": 0.83608478, + "num_input_tokens_seen": 236971930, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 10978, + "time_per_iteration": 2.4228336811065674 + }, + { + "auxiliary_loss_clip": 0.01107046, + "auxiliary_loss_mlp": 0.01037813, + "balance_loss_clip": 1.02570164, + "balance_loss_mlp": 1.03726959, + "epoch": 0.6600931910416353, + "flos": 18150115340160.0, + "grad_norm": 3.1339976235635527, + "language_loss": 0.6725859, + "learning_rate": 1.0942579493338278e-06, + "loss": 0.69403446, + "num_input_tokens_seen": 236989920, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 10979, + "time_per_iteration": 2.450756549835205 + }, + { + "auxiliary_loss_clip": 0.01102975, + "auxiliary_loss_mlp": 0.01023928, + "balance_loss_clip": 1.01215005, + "balance_loss_mlp": 1.0349319, + "epoch": 0.6601533142943034, + "flos": 17420733768960.0, + "grad_norm": 2.827162971921963, + "language_loss": 0.72720212, + "learning_rate": 1.0939107318053889e-06, + "loss": 0.74847114, + "num_input_tokens_seen": 237006570, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 10980, + "time_per_iteration": 2.406029462814331 + }, + { + "auxiliary_loss_clip": 0.01098591, + "auxiliary_loss_mlp": 0.01031279, + "balance_loss_clip": 1.02074146, + "balance_loss_mlp": 1.03450036, + "epoch": 0.6602134375469713, + "flos": 28219574615040.0, + "grad_norm": 1.5521957632844796, + "language_loss": 0.72807193, + "learning_rate": 1.0935635486359459e-06, + "loss": 0.74937057, + "num_input_tokens_seen": 237028415, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 10981, + "time_per_iteration": 2.5201127529144287 + }, + { + "auxiliary_loss_clip": 0.01104778, + "auxiliary_loss_mlp": 0.01032982, + "balance_loss_clip": 1.02119243, + "balance_loss_mlp": 1.03583837, + "epoch": 0.6602735607996393, + "flos": 29418056830080.0, + "grad_norm": 1.966625481577904, + "language_loss": 0.69085824, + "learning_rate": 1.0932163998386647e-06, + "loss": 0.71223581, + "num_input_tokens_seen": 237046595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 10982, + "time_per_iteration": 2.5098371505737305 + }, + { + "auxiliary_loss_clip": 0.01101832, + "auxiliary_loss_mlp": 0.01026165, + "balance_loss_clip": 1.01473927, + "balance_loss_mlp": 1.03688765, + "epoch": 0.6603336840523072, + "flos": 18588045957120.0, + "grad_norm": 1.50117340695368, + "language_loss": 0.69566637, + "learning_rate": 1.0928692854267075e-06, + "loss": 0.71694636, + "num_input_tokens_seen": 237066150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 10983, + "time_per_iteration": 2.4642090797424316 + }, + { + "auxiliary_loss_clip": 0.01103785, + "auxiliary_loss_mlp": 0.01027163, + "balance_loss_clip": 1.01571345, + "balance_loss_mlp": 1.03580856, + "epoch": 0.6603938073049752, + "flos": 33254860913280.0, + "grad_norm": 1.6650782937776725, + "language_loss": 0.70871687, + "learning_rate": 1.092522205413239e-06, + "loss": 0.73002636, + "num_input_tokens_seen": 237087060, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 10984, + "time_per_iteration": 2.545948028564453 + }, + { + "auxiliary_loss_clip": 0.01099312, + "auxiliary_loss_mlp": 0.01032765, + "balance_loss_clip": 1.02120149, + "balance_loss_mlp": 1.03464043, + "epoch": 0.6604539305576431, + "flos": 17384284442880.0, + "grad_norm": 1.583849922965693, + "language_loss": 0.83839536, + "learning_rate": 1.0921751598114193e-06, + "loss": 0.85971612, + "num_input_tokens_seen": 237103825, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 10985, + "time_per_iteration": 2.5026867389678955 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01032357, + "balance_loss_clip": 1.02026308, + "balance_loss_mlp": 1.03746915, + "epoch": 0.6605140538103111, + "flos": 21251145231360.0, + "grad_norm": 2.805092368411813, + "language_loss": 0.73806614, + "learning_rate": 1.0918281486344077e-06, + "loss": 0.75944197, + "num_input_tokens_seen": 237121740, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 10986, + "time_per_iteration": 2.4697890281677246 + }, + { + "auxiliary_loss_clip": 0.01100417, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01450825, + "balance_loss_mlp": 1.03609347, + "epoch": 0.6605741770629792, + "flos": 13881701433600.0, + "grad_norm": 1.6327019217005077, + "language_loss": 0.78796637, + "learning_rate": 1.0914811718953636e-06, + "loss": 0.80923104, + "num_input_tokens_seen": 237139565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 10987, + "time_per_iteration": 2.417971611022949 + }, + { + "auxiliary_loss_clip": 0.01026194, + "auxiliary_loss_mlp": 0.01004542, + "balance_loss_clip": 1.0033257, + "balance_loss_mlp": 1.00560772, + "epoch": 0.6606343003156471, + "flos": 69316215171840.0, + "grad_norm": 0.8165641821952351, + "language_loss": 0.54130733, + "learning_rate": 1.0911342296074454e-06, + "loss": 0.56161469, + "num_input_tokens_seen": 237201055, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20605469, + "step": 10988, + "time_per_iteration": 3.158214807510376 + }, + { + "auxiliary_loss_clip": 0.01103971, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.01902199, + "balance_loss_mlp": 1.03813577, + "epoch": 0.6606944235683151, + "flos": 27272394927360.0, + "grad_norm": 1.5008723881290433, + "language_loss": 0.77463698, + "learning_rate": 1.0907873217838077e-06, + "loss": 0.79597014, + "num_input_tokens_seen": 237221805, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 10989, + "time_per_iteration": 2.531778573989868 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.01032304, + "balance_loss_clip": 1.02086616, + "balance_loss_mlp": 1.0393579, + "epoch": 0.660754546820983, + "flos": 13772820332160.0, + "grad_norm": 1.9100821463598359, + "language_loss": 0.77224958, + "learning_rate": 1.0904404484376064e-06, + "loss": 0.7936244, + "num_input_tokens_seen": 237238270, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 10990, + "time_per_iteration": 2.393866539001465 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01027755, + "balance_loss_clip": 1.0154345, + "balance_loss_mlp": 1.03490543, + "epoch": 0.660814670073651, + "flos": 15705209232000.0, + "grad_norm": 1.959228938394804, + "language_loss": 0.60573477, + "learning_rate": 1.0900936095819937e-06, + "loss": 0.62704802, + "num_input_tokens_seen": 237255400, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 10991, + "time_per_iteration": 2.421860933303833 + }, + { + "auxiliary_loss_clip": 0.01106108, + "auxiliary_loss_mlp": 0.01031939, + "balance_loss_clip": 1.01960719, + "balance_loss_mlp": 1.03634095, + "epoch": 0.6608747933263189, + "flos": 20850023076480.0, + "grad_norm": 2.508745269820261, + "language_loss": 0.68313217, + "learning_rate": 1.0897468052301234e-06, + "loss": 0.70451266, + "num_input_tokens_seen": 237273105, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 10992, + "time_per_iteration": 2.438251495361328 + }, + { + "auxiliary_loss_clip": 0.01105003, + "auxiliary_loss_mlp": 0.01027261, + "balance_loss_clip": 1.0152688, + "balance_loss_mlp": 1.03565395, + "epoch": 0.660934916578987, + "flos": 20632117219200.0, + "grad_norm": 2.0506508317322036, + "language_loss": 0.87773001, + "learning_rate": 1.0894000353951444e-06, + "loss": 0.89905262, + "num_input_tokens_seen": 237292650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 10993, + "time_per_iteration": 2.4813613891601562 + }, + { + "auxiliary_loss_clip": 0.01109842, + "auxiliary_loss_mlp": 0.01029551, + "balance_loss_clip": 1.01642656, + "balance_loss_mlp": 1.03765821, + "epoch": 0.6609950398316549, + "flos": 25113588647040.0, + "grad_norm": 1.679596565938276, + "language_loss": 0.66940713, + "learning_rate": 1.0890533000902078e-06, + "loss": 0.69080102, + "num_input_tokens_seen": 237312865, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 10994, + "time_per_iteration": 2.464946985244751 + }, + { + "auxiliary_loss_clip": 0.0110627, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.01818299, + "balance_loss_mlp": 1.03735578, + "epoch": 0.6610551630843229, + "flos": 18661196004480.0, + "grad_norm": 1.7600806197216516, + "language_loss": 0.76505876, + "learning_rate": 1.0887065993284626e-06, + "loss": 0.78642476, + "num_input_tokens_seen": 237331210, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 10995, + "time_per_iteration": 2.443978786468506 + }, + { + "auxiliary_loss_clip": 0.01103759, + "auxiliary_loss_mlp": 0.01025776, + "balance_loss_clip": 1.01477861, + "balance_loss_mlp": 1.03649068, + "epoch": 0.6611152863369908, + "flos": 23258192549760.0, + "grad_norm": 1.907480349708707, + "language_loss": 0.74543679, + "learning_rate": 1.088359933123053e-06, + "loss": 0.76673216, + "num_input_tokens_seen": 237349455, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.671875, + "step": 10996, + "time_per_iteration": 2.437030076980591 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01031676, + "balance_loss_clip": 1.01963055, + "balance_loss_mlp": 1.03681195, + "epoch": 0.6611754095896588, + "flos": 22159720776960.0, + "grad_norm": 1.9556097783969382, + "language_loss": 0.68673009, + "learning_rate": 1.088013301487126e-06, + "loss": 0.70807999, + "num_input_tokens_seen": 237367100, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 10997, + "time_per_iteration": 2.4747731685638428 + }, + { + "auxiliary_loss_clip": 0.01107479, + "auxiliary_loss_mlp": 0.01026937, + "balance_loss_clip": 1.01583838, + "balance_loss_mlp": 1.03762627, + "epoch": 0.6612355328423267, + "flos": 13991228979840.0, + "grad_norm": 1.9530622490500587, + "language_loss": 0.68974924, + "learning_rate": 1.0876667044338269e-06, + "loss": 0.71109343, + "num_input_tokens_seen": 237384840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.69921875, + "step": 10998, + "time_per_iteration": 2.407527208328247 + }, + { + "auxiliary_loss_clip": 0.01026246, + "auxiliary_loss_mlp": 0.0100257, + "balance_loss_clip": 1.00145519, + "balance_loss_mlp": 1.00553703, + "epoch": 0.6612956560949947, + "flos": 61453716359040.0, + "grad_norm": 0.6545620134591473, + "language_loss": 0.5117774, + "learning_rate": 1.087320141976297e-06, + "loss": 0.53206557, + "num_input_tokens_seen": 237443355, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.20703125, + "step": 10999, + "time_per_iteration": 3.0084383487701416 + }, + { + "auxiliary_loss_clip": 0.01105663, + "auxiliary_loss_mlp": 0.01027993, + "balance_loss_clip": 1.01696038, + "balance_loss_mlp": 1.03627193, + "epoch": 0.6613557793476627, + "flos": 21616644072960.0, + "grad_norm": 2.367912839089916, + "language_loss": 0.71249658, + "learning_rate": 1.086973614127679e-06, + "loss": 0.73383313, + "num_input_tokens_seen": 237459205, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6953125, + "step": 11000, + "time_per_iteration": 2.426126480102539 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.02214289, + "balance_loss_mlp": 1.03528404, + "epoch": 0.6614159026003307, + "flos": 34020117192960.0, + "grad_norm": 1.5935854519622277, + "language_loss": 0.65334332, + "learning_rate": 1.0866271209011133e-06, + "loss": 0.67467409, + "num_input_tokens_seen": 237483580, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11001, + "time_per_iteration": 2.586193323135376 + }, + { + "auxiliary_loss_clip": 0.01103282, + "auxiliary_loss_mlp": 0.01027047, + "balance_loss_clip": 1.01568055, + "balance_loss_mlp": 1.03593278, + "epoch": 0.6614760258529987, + "flos": 24097281235200.0, + "grad_norm": 1.922146655127119, + "language_loss": 0.73242342, + "learning_rate": 1.086280662309739e-06, + "loss": 0.75372672, + "num_input_tokens_seen": 237502860, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 11002, + "time_per_iteration": 2.4588327407836914 + }, + { + "auxiliary_loss_clip": 0.01101069, + "auxiliary_loss_mlp": 0.01032524, + "balance_loss_clip": 1.02064466, + "balance_loss_mlp": 1.03539062, + "epoch": 0.6615361491056666, + "flos": 14903790935040.0, + "grad_norm": 2.0738499312562215, + "language_loss": 0.78606766, + "learning_rate": 1.0859342383666928e-06, + "loss": 0.80740356, + "num_input_tokens_seen": 237521030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11003, + "time_per_iteration": 2.470768928527832 + }, + { + "auxiliary_loss_clip": 0.01105808, + "auxiliary_loss_mlp": 0.01034072, + "balance_loss_clip": 1.02114952, + "balance_loss_mlp": 1.03701353, + "epoch": 0.6615962723583346, + "flos": 15304877176320.0, + "grad_norm": 1.8055156139018678, + "language_loss": 0.68872547, + "learning_rate": 1.0855878490851119e-06, + "loss": 0.71012425, + "num_input_tokens_seen": 237539585, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11004, + "time_per_iteration": 2.4174275398254395 + }, + { + "auxiliary_loss_clip": 0.01105956, + "auxiliary_loss_mlp": 0.01034853, + "balance_loss_clip": 1.02177572, + "balance_loss_mlp": 1.0356009, + "epoch": 0.6616563956110025, + "flos": 18732586285440.0, + "grad_norm": 2.2557237333346687, + "language_loss": 0.69553763, + "learning_rate": 1.085241494478132e-06, + "loss": 0.71694571, + "num_input_tokens_seen": 237557655, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 11005, + "time_per_iteration": 2.482177495956421 + }, + { + "auxiliary_loss_clip": 0.01102487, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01488411, + "balance_loss_mlp": 1.03609776, + "epoch": 0.6617165188636706, + "flos": 24495063425280.0, + "grad_norm": 1.5704694842406037, + "language_loss": 0.78232396, + "learning_rate": 1.0848951745588855e-06, + "loss": 0.80360937, + "num_input_tokens_seen": 237577000, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11006, + "time_per_iteration": 2.4723048210144043 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01031605, + "balance_loss_clip": 1.01923108, + "balance_loss_mlp": 1.03596103, + "epoch": 0.6617766421163385, + "flos": 22379673709440.0, + "grad_norm": 1.5007948972384493, + "language_loss": 0.75993907, + "learning_rate": 1.0845488893405068e-06, + "loss": 0.78127748, + "num_input_tokens_seen": 237597960, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 11007, + "time_per_iteration": 2.4790470600128174 + }, + { + "auxiliary_loss_clip": 0.01105175, + "auxiliary_loss_mlp": 0.0102811, + "balance_loss_clip": 1.01685643, + "balance_loss_mlp": 1.0384593, + "epoch": 0.6618367653690065, + "flos": 20850418126080.0, + "grad_norm": 1.9253644062666073, + "language_loss": 0.78290129, + "learning_rate": 1.0842026388361248e-06, + "loss": 0.80423415, + "num_input_tokens_seen": 237616385, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11008, + "time_per_iteration": 2.4340806007385254 + }, + { + "auxiliary_loss_clip": 0.0110631, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.01736474, + "balance_loss_mlp": 1.03573239, + "epoch": 0.6618968886216744, + "flos": 17712328377600.0, + "grad_norm": 1.8127446377472742, + "language_loss": 0.81780791, + "learning_rate": 1.0838564230588715e-06, + "loss": 0.83917022, + "num_input_tokens_seen": 237634930, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 11009, + "time_per_iteration": 2.4623091220855713 + }, + { + "auxiliary_loss_clip": 0.01026257, + "auxiliary_loss_mlp": 0.01005514, + "balance_loss_clip": 1.0043757, + "balance_loss_mlp": 1.00541437, + "epoch": 0.6619570118743424, + "flos": 67035347498880.0, + "grad_norm": 0.9788733414804485, + "language_loss": 0.67425871, + "learning_rate": 1.0835102420218735e-06, + "loss": 0.69457638, + "num_input_tokens_seen": 237693175, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20898438, + "step": 11010, + "time_per_iteration": 4.397435188293457 + }, + { + "auxiliary_loss_clip": 0.01104702, + "auxiliary_loss_mlp": 0.01028772, + "balance_loss_clip": 1.01645815, + "balance_loss_mlp": 1.03598547, + "epoch": 0.6620171351270103, + "flos": 18660908695680.0, + "grad_norm": 1.7882832526705355, + "language_loss": 0.71199936, + "learning_rate": 1.0831640957382593e-06, + "loss": 0.73333406, + "num_input_tokens_seen": 237713160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11011, + "time_per_iteration": 2.4273481369018555 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.02204967, + "balance_loss_mlp": 1.03780639, + "epoch": 0.6620772583796783, + "flos": 24170503109760.0, + "grad_norm": 1.7492667107704147, + "language_loss": 0.72528613, + "learning_rate": 1.0828179842211557e-06, + "loss": 0.74665654, + "num_input_tokens_seen": 237733600, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11012, + "time_per_iteration": 2.467482566833496 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.02084911, + "balance_loss_mlp": 1.03615665, + "epoch": 0.6621373816323463, + "flos": 23623547736960.0, + "grad_norm": 1.7384195449369746, + "language_loss": 0.795021, + "learning_rate": 1.0824719074836845e-06, + "loss": 0.8163144, + "num_input_tokens_seen": 237752135, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 11013, + "time_per_iteration": 3.923494577407837 + }, + { + "auxiliary_loss_clip": 0.01102996, + "auxiliary_loss_mlp": 0.01029153, + "balance_loss_clip": 1.01767898, + "balance_loss_mlp": 1.03644931, + "epoch": 0.6621975048850143, + "flos": 18442212739200.0, + "grad_norm": 1.886371512022625, + "language_loss": 0.7088536, + "learning_rate": 1.082125865538971e-06, + "loss": 0.73017514, + "num_input_tokens_seen": 237770735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11014, + "time_per_iteration": 2.439049482345581 + }, + { + "auxiliary_loss_clip": 0.01100918, + "auxiliary_loss_mlp": 0.01030718, + "balance_loss_clip": 1.02039468, + "balance_loss_mlp": 1.03656077, + "epoch": 0.6622576281376823, + "flos": 14063876236800.0, + "grad_norm": 2.1131368988088504, + "language_loss": 0.76709092, + "learning_rate": 1.081779858400137e-06, + "loss": 0.78840733, + "num_input_tokens_seen": 237789005, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 11015, + "time_per_iteration": 3.900524616241455 + }, + { + "auxiliary_loss_clip": 0.01101265, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.01506257, + "balance_loss_mlp": 1.03580058, + "epoch": 0.6623177513903502, + "flos": 17018965169280.0, + "grad_norm": 1.7610046970273479, + "language_loss": 0.82307482, + "learning_rate": 1.0814338860803021e-06, + "loss": 0.8443557, + "num_input_tokens_seen": 237807740, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 11016, + "time_per_iteration": 2.4373061656951904 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.01933837, + "balance_loss_mlp": 1.03373432, + "epoch": 0.6623778746430182, + "flos": 17271021882240.0, + "grad_norm": 1.888497767792011, + "language_loss": 0.6969018, + "learning_rate": 1.0810879485925864e-06, + "loss": 0.71824282, + "num_input_tokens_seen": 237826340, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 11017, + "time_per_iteration": 2.4477572441101074 + }, + { + "auxiliary_loss_clip": 0.0110184, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02280068, + "balance_loss_mlp": 1.03520179, + "epoch": 0.6624379978956861, + "flos": 48792688767360.0, + "grad_norm": 1.7526472003474178, + "language_loss": 0.77214134, + "learning_rate": 1.0807420459501084e-06, + "loss": 0.79350269, + "num_input_tokens_seen": 237848305, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11018, + "time_per_iteration": 2.6970436573028564 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01036299, + "balance_loss_clip": 1.0244143, + "balance_loss_mlp": 1.03411186, + "epoch": 0.6624981211483542, + "flos": 18952431477120.0, + "grad_norm": 1.9966965859861308, + "language_loss": 0.83007133, + "learning_rate": 1.0803961781659841e-06, + "loss": 0.85143745, + "num_input_tokens_seen": 237867020, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 11019, + "time_per_iteration": 2.429482936859131 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01789916, + "balance_loss_mlp": 1.0355196, + "epoch": 0.6625582444010221, + "flos": 23256576437760.0, + "grad_norm": 1.956066495989637, + "language_loss": 0.71813512, + "learning_rate": 1.080050345253328e-06, + "loss": 0.73942614, + "num_input_tokens_seen": 237886710, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11020, + "time_per_iteration": 2.4736745357513428 + }, + { + "auxiliary_loss_clip": 0.01107397, + "auxiliary_loss_mlp": 0.0102918, + "balance_loss_clip": 1.01639438, + "balance_loss_mlp": 1.03652906, + "epoch": 0.6626183676536901, + "flos": 21394823633280.0, + "grad_norm": 1.7164682336590185, + "language_loss": 0.72276735, + "learning_rate": 1.0797045472252554e-06, + "loss": 0.74413311, + "num_input_tokens_seen": 237904795, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 11021, + "time_per_iteration": 2.477529525756836 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01032756, + "balance_loss_clip": 1.02115703, + "balance_loss_mlp": 1.03667212, + "epoch": 0.662678490906358, + "flos": 14571293713920.0, + "grad_norm": 2.3400531031028873, + "language_loss": 0.83128953, + "learning_rate": 1.0793587840948793e-06, + "loss": 0.85265589, + "num_input_tokens_seen": 237921320, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11022, + "time_per_iteration": 2.507936716079712 + }, + { + "auxiliary_loss_clip": 0.0110951, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.01896095, + "balance_loss_mlp": 1.03662038, + "epoch": 0.662738614159026, + "flos": 15992350554240.0, + "grad_norm": 2.599884159549939, + "language_loss": 0.73365414, + "learning_rate": 1.0790130558753099e-06, + "loss": 0.75506973, + "num_input_tokens_seen": 237933525, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7265625, + "step": 11023, + "time_per_iteration": 2.4137043952941895 + }, + { + "auxiliary_loss_clip": 0.0110089, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.0178678, + "balance_loss_mlp": 1.03488147, + "epoch": 0.6627987374116939, + "flos": 19536338966400.0, + "grad_norm": 1.7959862106394333, + "language_loss": 0.74551922, + "learning_rate": 1.0786673625796574e-06, + "loss": 0.76681882, + "num_input_tokens_seen": 237953395, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11024, + "time_per_iteration": 2.475996255874634 + }, + { + "auxiliary_loss_clip": 0.01105322, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01788878, + "balance_loss_mlp": 1.0374223, + "epoch": 0.662858860664362, + "flos": 15702838934400.0, + "grad_norm": 2.1748664614868214, + "language_loss": 0.69700897, + "learning_rate": 1.0783217042210306e-06, + "loss": 0.71836132, + "num_input_tokens_seen": 237971445, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 11025, + "time_per_iteration": 2.4363040924072266 + }, + { + "auxiliary_loss_clip": 0.01105179, + "auxiliary_loss_mlp": 0.01035379, + "balance_loss_clip": 1.02345753, + "balance_loss_mlp": 1.03844023, + "epoch": 0.6629189839170299, + "flos": 20154289570560.0, + "grad_norm": 1.7368551034909252, + "language_loss": 0.78647238, + "learning_rate": 1.0779760808125379e-06, + "loss": 0.8078779, + "num_input_tokens_seen": 237989965, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 11026, + "time_per_iteration": 2.4804115295410156 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01807094, + "balance_loss_mlp": 1.03734887, + "epoch": 0.6629791071696979, + "flos": 20915415786240.0, + "grad_norm": 1.6695781674460857, + "language_loss": 0.7642892, + "learning_rate": 1.0776304923672842e-06, + "loss": 0.78561032, + "num_input_tokens_seen": 238006820, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11027, + "time_per_iteration": 2.4259533882141113 + }, + { + "auxiliary_loss_clip": 0.01104358, + "auxiliary_loss_mlp": 0.01035525, + "balance_loss_clip": 1.02346063, + "balance_loss_mlp": 1.03656745, + "epoch": 0.6630392304223659, + "flos": 20846898593280.0, + "grad_norm": 2.1060132685452335, + "language_loss": 0.69903147, + "learning_rate": 1.0772849388983742e-06, + "loss": 0.72043025, + "num_input_tokens_seen": 238022560, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 11028, + "time_per_iteration": 2.4627115726470947 + }, + { + "auxiliary_loss_clip": 0.01102349, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.02125263, + "balance_loss_mlp": 1.03578711, + "epoch": 0.6630993536750338, + "flos": 20995820380800.0, + "grad_norm": 1.8773152280466259, + "language_loss": 0.7926842, + "learning_rate": 1.0769394204189138e-06, + "loss": 0.8140226, + "num_input_tokens_seen": 238041895, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.66796875, + "step": 11029, + "time_per_iteration": 2.4524929523468018 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01028426, + "balance_loss_clip": 1.01583755, + "balance_loss_mlp": 1.03504181, + "epoch": 0.6631594769277018, + "flos": 18259032355200.0, + "grad_norm": 2.11014761642944, + "language_loss": 0.76041275, + "learning_rate": 1.0765939369420012e-06, + "loss": 0.78173411, + "num_input_tokens_seen": 238060445, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11030, + "time_per_iteration": 2.4383111000061035 + }, + { + "auxiliary_loss_clip": 0.01109452, + "auxiliary_loss_mlp": 0.01031592, + "balance_loss_clip": 1.01958811, + "balance_loss_mlp": 1.03813887, + "epoch": 0.6632196001803697, + "flos": 17820491207040.0, + "grad_norm": 2.37714698139957, + "language_loss": 0.74753916, + "learning_rate": 1.0762484884807391e-06, + "loss": 0.76894963, + "num_input_tokens_seen": 238077080, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.71484375, + "step": 11031, + "time_per_iteration": 2.4041976928710938 + }, + { + "auxiliary_loss_clip": 0.01103516, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.01942098, + "balance_loss_mlp": 1.03516042, + "epoch": 0.6632797234330378, + "flos": 12670182581760.0, + "grad_norm": 3.9220695320455494, + "language_loss": 0.74872231, + "learning_rate": 1.075903075048228e-06, + "loss": 0.77007186, + "num_input_tokens_seen": 238091045, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11032, + "time_per_iteration": 2.3847768306732178 + }, + { + "auxiliary_loss_clip": 0.01102597, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01723075, + "balance_loss_mlp": 1.03578007, + "epoch": 0.6633398466857057, + "flos": 23584728113280.0, + "grad_norm": 1.77863211463492, + "language_loss": 0.80295861, + "learning_rate": 1.0755576966575635e-06, + "loss": 0.82426751, + "num_input_tokens_seen": 238110220, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 11033, + "time_per_iteration": 2.4669265747070312 + }, + { + "auxiliary_loss_clip": 0.01101844, + "auxiliary_loss_mlp": 0.01029961, + "balance_loss_clip": 1.01801026, + "balance_loss_mlp": 1.03441966, + "epoch": 0.6633999699383737, + "flos": 20631686256000.0, + "grad_norm": 2.0583190629929957, + "language_loss": 0.80057156, + "learning_rate": 1.0752123533218451e-06, + "loss": 0.82188958, + "num_input_tokens_seen": 238130400, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11034, + "time_per_iteration": 2.4563634395599365 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.0102582, + "balance_loss_clip": 1.01526368, + "balance_loss_mlp": 1.03502083, + "epoch": 0.6634600931910416, + "flos": 21797095023360.0, + "grad_norm": 1.5719715577747368, + "language_loss": 0.75545985, + "learning_rate": 1.074867045054166e-06, + "loss": 0.7767145, + "num_input_tokens_seen": 238148165, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 11035, + "time_per_iteration": 2.513399600982666 + }, + { + "auxiliary_loss_clip": 0.01103249, + "auxiliary_loss_mlp": 0.01027301, + "balance_loss_clip": 1.0156064, + "balance_loss_mlp": 1.0342617, + "epoch": 0.6635202164437096, + "flos": 18732873594240.0, + "grad_norm": 1.7970498153302146, + "language_loss": 0.83235633, + "learning_rate": 1.074521771867622e-06, + "loss": 0.85366178, + "num_input_tokens_seen": 238166360, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 11036, + "time_per_iteration": 2.519704580307007 + }, + { + "auxiliary_loss_clip": 0.01027101, + "auxiliary_loss_mlp": 0.01001243, + "balance_loss_clip": 0.99994338, + "balance_loss_mlp": 1.00646234, + "epoch": 0.6635803396963775, + "flos": 60222771227520.0, + "grad_norm": 0.7769560833184769, + "language_loss": 0.52306348, + "learning_rate": 1.0741765337753044e-06, + "loss": 0.54334688, + "num_input_tokens_seen": 238227630, + "router_z_loss_clip": 0.01300049, + "router_z_loss_mlp": 0.20703125, + "step": 11037, + "time_per_iteration": 3.0515010356903076 + }, + { + "auxiliary_loss_clip": 0.01103588, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02405727, + "balance_loss_mlp": 1.03591716, + "epoch": 0.6636404629490456, + "flos": 29167041611520.0, + "grad_norm": 1.842185877925078, + "language_loss": 0.79099, + "learning_rate": 1.0738313307903052e-06, + "loss": 0.81238985, + "num_input_tokens_seen": 238248435, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 11038, + "time_per_iteration": 2.5139565467834473 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01037399, + "balance_loss_clip": 1.02515566, + "balance_loss_mlp": 1.03648806, + "epoch": 0.6637005862017135, + "flos": 38907702766080.0, + "grad_norm": 1.8255445121908285, + "language_loss": 0.64082795, + "learning_rate": 1.073486162925716e-06, + "loss": 0.66223598, + "num_input_tokens_seen": 238268755, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 11039, + "time_per_iteration": 2.623331308364868 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01030057, + "balance_loss_clip": 1.01841021, + "balance_loss_mlp": 1.03601968, + "epoch": 0.6637607094543815, + "flos": 22783345729920.0, + "grad_norm": 1.7210825984121325, + "language_loss": 0.63687986, + "learning_rate": 1.0731410301946237e-06, + "loss": 0.65823758, + "num_input_tokens_seen": 238290120, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 11040, + "time_per_iteration": 2.472255229949951 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01893258, + "balance_loss_mlp": 1.03372359, + "epoch": 0.6638208327070495, + "flos": 18114096977280.0, + "grad_norm": 1.9713653362611905, + "language_loss": 0.71843195, + "learning_rate": 1.0727959326101161e-06, + "loss": 0.73973382, + "num_input_tokens_seen": 238309290, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11041, + "time_per_iteration": 2.4769115447998047 + }, + { + "auxiliary_loss_clip": 0.01102253, + "auxiliary_loss_mlp": 0.01038804, + "balance_loss_clip": 1.02647161, + "balance_loss_mlp": 1.03540432, + "epoch": 0.6638809559597174, + "flos": 29424880414080.0, + "grad_norm": 8.010243162338005, + "language_loss": 0.61716807, + "learning_rate": 1.0724508701852806e-06, + "loss": 0.63857865, + "num_input_tokens_seen": 238327280, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11042, + "time_per_iteration": 2.50669264793396 + }, + { + "auxiliary_loss_clip": 0.01105298, + "auxiliary_loss_mlp": 0.0102732, + "balance_loss_clip": 1.01413536, + "balance_loss_mlp": 1.03500068, + "epoch": 0.6639410792123854, + "flos": 28072699902720.0, + "grad_norm": 2.00393235647331, + "language_loss": 0.68282115, + "learning_rate": 1.0721058429331998e-06, + "loss": 0.70414734, + "num_input_tokens_seen": 238346330, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 11043, + "time_per_iteration": 2.518275499343872 + }, + { + "auxiliary_loss_clip": 0.01099626, + "auxiliary_loss_mlp": 0.01024503, + "balance_loss_clip": 1.01448393, + "balance_loss_mlp": 1.03639984, + "epoch": 0.6640012024650533, + "flos": 25556367600000.0, + "grad_norm": 1.6123860278714182, + "language_loss": 0.83758092, + "learning_rate": 1.0717608508669587e-06, + "loss": 0.85882223, + "num_input_tokens_seen": 238364650, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6328125, + "step": 11044, + "time_per_iteration": 2.505173444747925 + }, + { + "auxiliary_loss_clip": 0.01102203, + "auxiliary_loss_mlp": 0.01029885, + "balance_loss_clip": 1.01769567, + "balance_loss_mlp": 1.03553414, + "epoch": 0.6640613257177214, + "flos": 14866946559360.0, + "grad_norm": 1.9292668184213282, + "language_loss": 0.69679981, + "learning_rate": 1.0714158939996392e-06, + "loss": 0.71812069, + "num_input_tokens_seen": 238381630, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 11045, + "time_per_iteration": 2.4917290210723877 + }, + { + "auxiliary_loss_clip": 0.01106396, + "auxiliary_loss_mlp": 0.01025419, + "balance_loss_clip": 1.01372421, + "balance_loss_mlp": 1.03785038, + "epoch": 0.6641214489703893, + "flos": 23221096778880.0, + "grad_norm": 1.4259906887756533, + "language_loss": 0.6473543, + "learning_rate": 1.0710709723443235e-06, + "loss": 0.66867244, + "num_input_tokens_seen": 238402595, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11046, + "time_per_iteration": 2.4937326908111572 + }, + { + "auxiliary_loss_clip": 0.01101037, + "auxiliary_loss_mlp": 0.01026442, + "balance_loss_clip": 1.01489711, + "balance_loss_mlp": 1.03506637, + "epoch": 0.6641815722230573, + "flos": 37742617221120.0, + "grad_norm": 1.4622045705244888, + "language_loss": 0.71289897, + "learning_rate": 1.070726085914088e-06, + "loss": 0.73417372, + "num_input_tokens_seen": 238426860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11047, + "time_per_iteration": 2.6626944541931152 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.01031561, + "balance_loss_clip": 1.01909184, + "balance_loss_mlp": 1.03837025, + "epoch": 0.6642416954757252, + "flos": 17931132074880.0, + "grad_norm": 1.803867578656826, + "language_loss": 0.77093923, + "learning_rate": 1.0703812347220126e-06, + "loss": 0.79230267, + "num_input_tokens_seen": 238443990, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 11048, + "time_per_iteration": 2.3982088565826416 + }, + { + "auxiliary_loss_clip": 0.01026262, + "auxiliary_loss_mlp": 0.00999443, + "balance_loss_clip": 0.99813193, + "balance_loss_mlp": 1.00559723, + "epoch": 0.6643018187283932, + "flos": 51995384104320.0, + "grad_norm": 0.8140473421231088, + "language_loss": 0.55041039, + "learning_rate": 1.0700364187811745e-06, + "loss": 0.57066745, + "num_input_tokens_seen": 238503045, + "router_z_loss_clip": 0.01312256, + "router_z_loss_mlp": 0.20703125, + "step": 11049, + "time_per_iteration": 3.0340354442596436 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01654625, + "balance_loss_mlp": 1.03627372, + "epoch": 0.6643619419810611, + "flos": 30226657847040.0, + "grad_norm": 1.7091488805060655, + "language_loss": 0.64489448, + "learning_rate": 1.069691638104648e-06, + "loss": 0.66618788, + "num_input_tokens_seen": 238527320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 11050, + "time_per_iteration": 2.5083260536193848 + }, + { + "auxiliary_loss_clip": 0.01099461, + "auxiliary_loss_mlp": 0.01026918, + "balance_loss_clip": 1.01615942, + "balance_loss_mlp": 1.03481436, + "epoch": 0.6644220652337292, + "flos": 22966131064320.0, + "grad_norm": 2.10593076125299, + "language_loss": 0.78783518, + "learning_rate": 1.0693468927055085e-06, + "loss": 0.80909896, + "num_input_tokens_seen": 238546030, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11051, + "time_per_iteration": 2.462937116622925 + }, + { + "auxiliary_loss_clip": 0.01103355, + "auxiliary_loss_mlp": 0.01031151, + "balance_loss_clip": 1.01975441, + "balance_loss_mlp": 1.03752089, + "epoch": 0.6644821884863971, + "flos": 21142228216320.0, + "grad_norm": 1.6490502352967844, + "language_loss": 0.85132825, + "learning_rate": 1.0690021825968276e-06, + "loss": 0.87267327, + "num_input_tokens_seen": 238564175, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11052, + "time_per_iteration": 3.808241128921509 + }, + { + "auxiliary_loss_clip": 0.01106566, + "auxiliary_loss_mlp": 0.01035204, + "balance_loss_clip": 1.02213919, + "balance_loss_mlp": 1.03723979, + "epoch": 0.6645423117390651, + "flos": 20192821885440.0, + "grad_norm": 2.202728029810485, + "language_loss": 0.75382364, + "learning_rate": 1.0686575077916776e-06, + "loss": 0.77524137, + "num_input_tokens_seen": 238581010, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11053, + "time_per_iteration": 2.4659061431884766 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01024974, + "balance_loss_clip": 1.0138042, + "balance_loss_mlp": 1.03446698, + "epoch": 0.6646024349917331, + "flos": 24351959640960.0, + "grad_norm": 1.6434507479308733, + "language_loss": 0.79397607, + "learning_rate": 1.0683128683031278e-06, + "loss": 0.81521785, + "num_input_tokens_seen": 238601365, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11054, + "time_per_iteration": 2.4667155742645264 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.0186491, + "balance_loss_mlp": 1.03520536, + "epoch": 0.664662558244401, + "flos": 18806706000000.0, + "grad_norm": 1.4981555869580738, + "language_loss": 0.74050117, + "learning_rate": 1.0679682641442472e-06, + "loss": 0.76180458, + "num_input_tokens_seen": 238619850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11055, + "time_per_iteration": 3.8726584911346436 + }, + { + "auxiliary_loss_clip": 0.01104209, + "auxiliary_loss_mlp": 0.01034676, + "balance_loss_clip": 1.02240944, + "balance_loss_mlp": 1.0363059, + "epoch": 0.664722681497069, + "flos": 18952790613120.0, + "grad_norm": 1.7483359396792508, + "language_loss": 0.72639185, + "learning_rate": 1.0676236953281042e-06, + "loss": 0.74778068, + "num_input_tokens_seen": 238637635, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 11056, + "time_per_iteration": 3.913365364074707 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01027564, + "balance_loss_clip": 1.01610804, + "balance_loss_mlp": 1.03553987, + "epoch": 0.6647828047497369, + "flos": 19571279921280.0, + "grad_norm": 2.080468005748717, + "language_loss": 0.69644797, + "learning_rate": 1.0672791618677641e-06, + "loss": 0.71773851, + "num_input_tokens_seen": 238656200, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11057, + "time_per_iteration": 2.4554696083068848 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01761794, + "balance_loss_mlp": 1.0374651, + "epoch": 0.664842928002405, + "flos": 23149455102720.0, + "grad_norm": 2.7208836045736753, + "language_loss": 0.80084372, + "learning_rate": 1.066934663776291e-06, + "loss": 0.8221786, + "num_input_tokens_seen": 238675005, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11058, + "time_per_iteration": 2.4723973274230957 + }, + { + "auxiliary_loss_clip": 0.01026201, + "auxiliary_loss_mlp": 0.00999951, + "balance_loss_clip": 0.99850267, + "balance_loss_mlp": 1.00571644, + "epoch": 0.6649030512550729, + "flos": 65244913148160.0, + "grad_norm": 0.8197408377002003, + "language_loss": 0.62637091, + "learning_rate": 1.0665902010667496e-06, + "loss": 0.64663243, + "num_input_tokens_seen": 238731425, + "router_z_loss_clip": 0.01446533, + "router_z_loss_mlp": 0.20507812, + "step": 11059, + "time_per_iteration": 2.9666504859924316 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01034131, + "balance_loss_clip": 1.0232594, + "balance_loss_mlp": 1.03411603, + "epoch": 0.6649631745077409, + "flos": 20194797133440.0, + "grad_norm": 1.4258342030978963, + "language_loss": 0.78922415, + "learning_rate": 1.0662457737522008e-06, + "loss": 0.81056285, + "num_input_tokens_seen": 238752020, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 11060, + "time_per_iteration": 2.453782796859741 + }, + { + "auxiliary_loss_clip": 0.01105175, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.02221072, + "balance_loss_mlp": 1.03826928, + "epoch": 0.6650232977604088, + "flos": 17238558965760.0, + "grad_norm": 1.8106435880803493, + "language_loss": 0.78883487, + "learning_rate": 1.0659013818457055e-06, + "loss": 0.81023228, + "num_input_tokens_seen": 238769665, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 11061, + "time_per_iteration": 2.4411821365356445 + }, + { + "auxiliary_loss_clip": 0.01103137, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.01606226, + "balance_loss_mlp": 1.03756928, + "epoch": 0.6650834210130768, + "flos": 10006867825920.0, + "grad_norm": 2.176969604984505, + "language_loss": 0.57041669, + "learning_rate": 1.0655570253603243e-06, + "loss": 0.59171724, + "num_input_tokens_seen": 238782180, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 11062, + "time_per_iteration": 2.389374256134033 + }, + { + "auxiliary_loss_clip": 0.0110523, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.01472592, + "balance_loss_mlp": 1.03483319, + "epoch": 0.6651435442657447, + "flos": 10452088903680.0, + "grad_norm": 1.8021007966116196, + "language_loss": 0.75658429, + "learning_rate": 1.0652127043091144e-06, + "loss": 0.77791005, + "num_input_tokens_seen": 238800315, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11063, + "time_per_iteration": 2.4186158180236816 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.01033702, + "balance_loss_clip": 1.02248394, + "balance_loss_mlp": 1.0370208, + "epoch": 0.6652036675184128, + "flos": 22344229964160.0, + "grad_norm": 1.3058140700355754, + "language_loss": 0.7048496, + "learning_rate": 1.0648684187051316e-06, + "loss": 0.72622377, + "num_input_tokens_seen": 238822250, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11064, + "time_per_iteration": 2.5101113319396973 + }, + { + "auxiliary_loss_clip": 0.01024924, + "auxiliary_loss_mlp": 0.00997873, + "balance_loss_clip": 0.9965679, + "balance_loss_mlp": 1.00459087, + "epoch": 0.6652637907710807, + "flos": 52909633998720.0, + "grad_norm": 0.8487322656758325, + "language_loss": 0.63019937, + "learning_rate": 1.0645241685614322e-06, + "loss": 0.65042734, + "num_input_tokens_seen": 238877190, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.203125, + "step": 11065, + "time_per_iteration": 3.006619691848755 + }, + { + "auxiliary_loss_clip": 0.01104037, + "auxiliary_loss_mlp": 0.01026975, + "balance_loss_clip": 1.01580477, + "balance_loss_mlp": 1.03731883, + "epoch": 0.6653239140237487, + "flos": 23104637907840.0, + "grad_norm": 1.6667915109143088, + "language_loss": 0.62019926, + "learning_rate": 1.0641799538910708e-06, + "loss": 0.64150941, + "num_input_tokens_seen": 238896010, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11066, + "time_per_iteration": 2.468318223953247 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.01026785, + "balance_loss_clip": 1.01430988, + "balance_loss_mlp": 1.03479779, + "epoch": 0.6653840372764167, + "flos": 25959393175680.0, + "grad_norm": 1.7106058760764156, + "language_loss": 0.70056629, + "learning_rate": 1.0638357747070985e-06, + "loss": 0.72186363, + "num_input_tokens_seen": 238918990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11067, + "time_per_iteration": 2.524820566177368 + }, + { + "auxiliary_loss_clip": 0.0102549, + "auxiliary_loss_mlp": 0.00996129, + "balance_loss_clip": 0.99466848, + "balance_loss_mlp": 1.00504017, + "epoch": 0.6654441605290846, + "flos": 66041985899520.0, + "grad_norm": 0.9061893644507588, + "language_loss": 0.72102368, + "learning_rate": 1.0634916310225684e-06, + "loss": 0.74123991, + "num_input_tokens_seen": 238975735, + "router_z_loss_clip": 0.0145874, + "router_z_loss_mlp": 0.20507812, + "step": 11068, + "time_per_iteration": 3.0193986892700195 + }, + { + "auxiliary_loss_clip": 0.01025049, + "auxiliary_loss_mlp": 0.00996802, + "balance_loss_clip": 0.99540693, + "balance_loss_mlp": 1.00446737, + "epoch": 0.6655042837817526, + "flos": 65196112521600.0, + "grad_norm": 0.7087489248971819, + "language_loss": 0.57800353, + "learning_rate": 1.0631475228505285e-06, + "loss": 0.59822208, + "num_input_tokens_seen": 239042360, + "router_z_loss_clip": 0.01397705, + "router_z_loss_mlp": 0.20605469, + "step": 11069, + "time_per_iteration": 3.2124764919281006 + }, + { + "auxiliary_loss_clip": 0.0102455, + "auxiliary_loss_mlp": 0.01000321, + "balance_loss_clip": 0.99886698, + "balance_loss_mlp": 1.0041275, + "epoch": 0.6655644070344205, + "flos": 69008746752000.0, + "grad_norm": 0.7763166900295557, + "language_loss": 0.63506204, + "learning_rate": 1.062803450204029e-06, + "loss": 0.65531075, + "num_input_tokens_seen": 239109410, + "router_z_loss_clip": 0.01452637, + "router_z_loss_mlp": 0.20410156, + "step": 11070, + "time_per_iteration": 3.1373214721679688 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.01531315, + "balance_loss_mlp": 1.03435302, + "epoch": 0.6656245302870886, + "flos": 36315562809600.0, + "grad_norm": 1.7781228106405071, + "language_loss": 0.58826381, + "learning_rate": 1.062459413096116e-06, + "loss": 0.60954237, + "num_input_tokens_seen": 239135345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 11071, + "time_per_iteration": 2.5929718017578125 + }, + { + "auxiliary_loss_clip": 0.01105196, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01662362, + "balance_loss_mlp": 1.03853655, + "epoch": 0.6656846535397565, + "flos": 21794832466560.0, + "grad_norm": 2.462730868248946, + "language_loss": 0.72873962, + "learning_rate": 1.0621154115398364e-06, + "loss": 0.75006455, + "num_input_tokens_seen": 239154340, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6640625, + "step": 11072, + "time_per_iteration": 2.457197427749634 + }, + { + "auxiliary_loss_clip": 0.01103868, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01775575, + "balance_loss_mlp": 1.03849804, + "epoch": 0.6657447767924245, + "flos": 37487615592960.0, + "grad_norm": 2.0960284851890183, + "language_loss": 0.70686483, + "learning_rate": 1.0617714455482353e-06, + "loss": 0.72819948, + "num_input_tokens_seen": 239177815, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11073, + "time_per_iteration": 2.621063709259033 + }, + { + "auxiliary_loss_clip": 0.01105664, + "auxiliary_loss_mlp": 0.0103133, + "balance_loss_clip": 1.01962924, + "balance_loss_mlp": 1.03680611, + "epoch": 0.6658049000450924, + "flos": 16837688206080.0, + "grad_norm": 21.254891604302284, + "language_loss": 0.56184697, + "learning_rate": 1.061427515134354e-06, + "loss": 0.58321697, + "num_input_tokens_seen": 239195735, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11074, + "time_per_iteration": 2.417592763900757 + }, + { + "auxiliary_loss_clip": 0.01103413, + "auxiliary_loss_mlp": 0.01029306, + "balance_loss_clip": 1.01795745, + "balance_loss_mlp": 1.03713965, + "epoch": 0.6658650232977604, + "flos": 33510975863040.0, + "grad_norm": 1.4493539029409879, + "language_loss": 0.72269762, + "learning_rate": 1.061083620311235e-06, + "loss": 0.74402475, + "num_input_tokens_seen": 239217535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11075, + "time_per_iteration": 2.5621016025543213 + }, + { + "auxiliary_loss_clip": 0.01099577, + "auxiliary_loss_mlp": 0.01035227, + "balance_loss_clip": 1.02400899, + "balance_loss_mlp": 1.03572047, + "epoch": 0.6659251465504283, + "flos": 37706311549440.0, + "grad_norm": 1.47592254117705, + "language_loss": 0.6616652, + "learning_rate": 1.0607397610919202e-06, + "loss": 0.6830132, + "num_input_tokens_seen": 239241975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11076, + "time_per_iteration": 2.618560552597046 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01034767, + "balance_loss_clip": 1.02297735, + "balance_loss_mlp": 1.0359937, + "epoch": 0.6659852698030964, + "flos": 24893420232960.0, + "grad_norm": 1.613817606590062, + "language_loss": 0.75271714, + "learning_rate": 1.0603959374894468e-06, + "loss": 0.77408653, + "num_input_tokens_seen": 239262025, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11077, + "time_per_iteration": 2.487748146057129 + }, + { + "auxiliary_loss_clip": 0.01102302, + "auxiliary_loss_mlp": 0.01030781, + "balance_loss_clip": 1.01956344, + "balance_loss_mlp": 1.03536868, + "epoch": 0.6660453930557643, + "flos": 24352821567360.0, + "grad_norm": 1.863663819937869, + "language_loss": 0.66703588, + "learning_rate": 1.0600521495168538e-06, + "loss": 0.68836671, + "num_input_tokens_seen": 239282775, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11078, + "time_per_iteration": 2.4835610389709473 + }, + { + "auxiliary_loss_clip": 0.0110385, + "auxiliary_loss_mlp": 0.01029692, + "balance_loss_clip": 1.01786661, + "balance_loss_mlp": 1.03568316, + "epoch": 0.6661055163084323, + "flos": 10597814380800.0, + "grad_norm": 1.962622549544945, + "language_loss": 0.69805777, + "learning_rate": 1.0597083971871783e-06, + "loss": 0.71939325, + "num_input_tokens_seen": 239299775, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 11079, + "time_per_iteration": 2.4517362117767334 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01027325, + "balance_loss_clip": 1.01656055, + "balance_loss_mlp": 1.03579783, + "epoch": 0.6661656395611003, + "flos": 24057491944320.0, + "grad_norm": 1.4504303029583365, + "language_loss": 0.80272287, + "learning_rate": 1.0593646805134544e-06, + "loss": 0.82400304, + "num_input_tokens_seen": 239319660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11080, + "time_per_iteration": 2.496086835861206 + }, + { + "auxiliary_loss_clip": 0.01098572, + "auxiliary_loss_mlp": 0.0102897, + "balance_loss_clip": 1.0184257, + "balance_loss_mlp": 1.03518367, + "epoch": 0.6662257628137682, + "flos": 23036192542080.0, + "grad_norm": 1.7747670262807855, + "language_loss": 0.78175783, + "learning_rate": 1.0590209995087157e-06, + "loss": 0.80303317, + "num_input_tokens_seen": 239339215, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11081, + "time_per_iteration": 2.4947092533111572 + }, + { + "auxiliary_loss_clip": 0.01104079, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.01905608, + "balance_loss_mlp": 1.03641224, + "epoch": 0.6662858860664362, + "flos": 24754446512640.0, + "grad_norm": 1.7096575045073308, + "language_loss": 0.79757982, + "learning_rate": 1.0586773541859946e-06, + "loss": 0.81893063, + "num_input_tokens_seen": 239358545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11082, + "time_per_iteration": 2.496314287185669 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01035918, + "balance_loss_clip": 1.02490342, + "balance_loss_mlp": 1.03489673, + "epoch": 0.6663460093191041, + "flos": 20009066883840.0, + "grad_norm": 1.4408084093775566, + "language_loss": 0.83964407, + "learning_rate": 1.0583337445583234e-06, + "loss": 0.86100918, + "num_input_tokens_seen": 239376665, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11083, + "time_per_iteration": 2.441714286804199 + }, + { + "auxiliary_loss_clip": 0.0110885, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02278709, + "balance_loss_mlp": 1.03879905, + "epoch": 0.6664061325717722, + "flos": 17821389047040.0, + "grad_norm": 2.210335279184582, + "language_loss": 0.85422742, + "learning_rate": 1.057990170638731e-06, + "loss": 0.87566352, + "num_input_tokens_seen": 239394345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 11084, + "time_per_iteration": 2.4179892539978027 + }, + { + "auxiliary_loss_clip": 0.01106729, + "auxiliary_loss_mlp": 0.0102887, + "balance_loss_clip": 1.01672292, + "balance_loss_mlp": 1.03759933, + "epoch": 0.6664662558244401, + "flos": 18076893465600.0, + "grad_norm": 2.2800746471584135, + "language_loss": 0.73236918, + "learning_rate": 1.0576466324402452e-06, + "loss": 0.75372517, + "num_input_tokens_seen": 239410605, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 11085, + "time_per_iteration": 2.4865758419036865 + }, + { + "auxiliary_loss_clip": 0.01102626, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.03617859, + "epoch": 0.6665263790771081, + "flos": 21574197175680.0, + "grad_norm": 1.9088871569878003, + "language_loss": 0.80301607, + "learning_rate": 1.057303129975894e-06, + "loss": 0.82433486, + "num_input_tokens_seen": 239427155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 11086, + "time_per_iteration": 2.444645643234253 + }, + { + "auxiliary_loss_clip": 0.01101849, + "auxiliary_loss_mlp": 0.01029998, + "balance_loss_clip": 1.01799965, + "balance_loss_mlp": 1.03593099, + "epoch": 0.666586502329776, + "flos": 24206629213440.0, + "grad_norm": 2.0449845091934753, + "language_loss": 0.74311554, + "learning_rate": 1.056959663258702e-06, + "loss": 0.7644341, + "num_input_tokens_seen": 239445510, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11087, + "time_per_iteration": 2.483962059020996 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01030474, + "balance_loss_clip": 1.01872563, + "balance_loss_mlp": 1.03587329, + "epoch": 0.666646625582444, + "flos": 22200515648640.0, + "grad_norm": 1.5673899455217954, + "language_loss": 0.64753473, + "learning_rate": 1.0566162323016939e-06, + "loss": 0.66886115, + "num_input_tokens_seen": 239464805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11088, + "time_per_iteration": 2.4562034606933594 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01029085, + "balance_loss_clip": 1.01734924, + "balance_loss_mlp": 1.03735042, + "epoch": 0.6667067488351119, + "flos": 18259930195200.0, + "grad_norm": 1.8332928045753645, + "language_loss": 0.64570332, + "learning_rate": 1.0562728371178928e-06, + "loss": 0.66704261, + "num_input_tokens_seen": 239483890, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11089, + "time_per_iteration": 2.4386065006256104 + }, + { + "auxiliary_loss_clip": 0.01099875, + "auxiliary_loss_mlp": 0.01031668, + "balance_loss_clip": 1.02059364, + "balance_loss_mlp": 1.03447926, + "epoch": 0.66676687208778, + "flos": 17236547804160.0, + "grad_norm": 2.1527148838753236, + "language_loss": 0.80835247, + "learning_rate": 1.0559294777203221e-06, + "loss": 0.82966793, + "num_input_tokens_seen": 239500080, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 11090, + "time_per_iteration": 2.394827365875244 + }, + { + "auxiliary_loss_clip": 0.01105547, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02039266, + "balance_loss_mlp": 1.03684211, + "epoch": 0.6668269953404479, + "flos": 19752197748480.0, + "grad_norm": 3.4302717941928806, + "language_loss": 0.7762655, + "learning_rate": 1.0555861541219984e-06, + "loss": 0.79763907, + "num_input_tokens_seen": 239517335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 11091, + "time_per_iteration": 2.4357736110687256 + }, + { + "auxiliary_loss_clip": 0.01101701, + "auxiliary_loss_mlp": 0.01030674, + "balance_loss_clip": 1.01943851, + "balance_loss_mlp": 1.03544581, + "epoch": 0.6668871185931159, + "flos": 20558428467840.0, + "grad_norm": 1.7415157953091596, + "language_loss": 0.79347867, + "learning_rate": 1.0552428663359425e-06, + "loss": 0.81480247, + "num_input_tokens_seen": 239536240, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 11092, + "time_per_iteration": 2.4493799209594727 + }, + { + "auxiliary_loss_clip": 0.01024657, + "auxiliary_loss_mlp": 0.01010054, + "balance_loss_clip": 1.00873661, + "balance_loss_mlp": 1.00410509, + "epoch": 0.6669472418457839, + "flos": 58088167735680.0, + "grad_norm": 0.7618033983707613, + "language_loss": 0.57674438, + "learning_rate": 1.0548996143751724e-06, + "loss": 0.5970915, + "num_input_tokens_seen": 239598000, + "router_z_loss_clip": 0.01318359, + "router_z_loss_mlp": 0.20507812, + "step": 11093, + "time_per_iteration": 3.060945510864258 + }, + { + "auxiliary_loss_clip": 0.0110198, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01859891, + "balance_loss_mlp": 1.03614676, + "epoch": 0.6670073650984518, + "flos": 26065113880320.0, + "grad_norm": 3.0734338086465733, + "language_loss": 0.76404822, + "learning_rate": 1.054556398252703e-06, + "loss": 0.78536654, + "num_input_tokens_seen": 239617650, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 11094, + "time_per_iteration": 3.8702232837677 + }, + { + "auxiliary_loss_clip": 0.01102539, + "auxiliary_loss_mlp": 0.01033592, + "balance_loss_clip": 1.02148628, + "balance_loss_mlp": 1.03533387, + "epoch": 0.6670674883511198, + "flos": 32416849635840.0, + "grad_norm": 1.7962253370500996, + "language_loss": 0.73604453, + "learning_rate": 1.05421321798155e-06, + "loss": 0.75740582, + "num_input_tokens_seen": 239639825, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11095, + "time_per_iteration": 2.5393593311309814 + }, + { + "auxiliary_loss_clip": 0.01104214, + "auxiliary_loss_mlp": 0.01031091, + "balance_loss_clip": 1.01983809, + "balance_loss_mlp": 1.03839517, + "epoch": 0.6671276116037878, + "flos": 18037786533120.0, + "grad_norm": 2.498006768699264, + "language_loss": 0.73841417, + "learning_rate": 1.053870073574727e-06, + "loss": 0.75976729, + "num_input_tokens_seen": 239656300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11096, + "time_per_iteration": 5.295018672943115 + }, + { + "auxiliary_loss_clip": 0.01101592, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.02000785, + "balance_loss_mlp": 1.03659046, + "epoch": 0.6671877348564558, + "flos": 23767046570880.0, + "grad_norm": 2.1197138558836652, + "language_loss": 0.64377868, + "learning_rate": 1.0535269650452456e-06, + "loss": 0.66510427, + "num_input_tokens_seen": 239676655, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 11097, + "time_per_iteration": 2.4755849838256836 + }, + { + "auxiliary_loss_clip": 0.01105023, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.01810169, + "balance_loss_mlp": 1.03657043, + "epoch": 0.6672478581091237, + "flos": 20918360701440.0, + "grad_norm": 1.8367279267646714, + "language_loss": 0.75293523, + "learning_rate": 1.0531838924061158e-06, + "loss": 0.77428448, + "num_input_tokens_seen": 239695430, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11098, + "time_per_iteration": 3.8889780044555664 + }, + { + "auxiliary_loss_clip": 0.01105898, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.01997876, + "balance_loss_mlp": 1.03809619, + "epoch": 0.6673079813617917, + "flos": 27855799626240.0, + "grad_norm": 1.6239497270406267, + "language_loss": 0.74629354, + "learning_rate": 1.0528408556703476e-06, + "loss": 0.76766318, + "num_input_tokens_seen": 239717070, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 11099, + "time_per_iteration": 2.499155282974243 + }, + { + "auxiliary_loss_clip": 0.01099471, + "auxiliary_loss_mlp": 0.01033794, + "balance_loss_clip": 1.02229047, + "balance_loss_mlp": 1.03467488, + "epoch": 0.6673681046144596, + "flos": 21616859554560.0, + "grad_norm": 1.8612331810201734, + "language_loss": 0.78086853, + "learning_rate": 1.0524978548509502e-06, + "loss": 0.80220115, + "num_input_tokens_seen": 239737105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 11100, + "time_per_iteration": 2.4822754859924316 + }, + { + "auxiliary_loss_clip": 0.01101826, + "auxiliary_loss_mlp": 0.01034027, + "balance_loss_clip": 1.02265465, + "balance_loss_mlp": 1.03608942, + "epoch": 0.6674282278671276, + "flos": 20889884194560.0, + "grad_norm": 2.199541930312583, + "language_loss": 0.60234034, + "learning_rate": 1.0521548899609288e-06, + "loss": 0.62369883, + "num_input_tokens_seen": 239757835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11101, + "time_per_iteration": 2.470005750656128 + }, + { + "auxiliary_loss_clip": 0.0110769, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02202892, + "balance_loss_mlp": 1.03702366, + "epoch": 0.6674883511197955, + "flos": 23624194181760.0, + "grad_norm": 1.6927482018220132, + "language_loss": 0.711254, + "learning_rate": 1.0518119610132884e-06, + "loss": 0.73267794, + "num_input_tokens_seen": 239775425, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11102, + "time_per_iteration": 2.5034313201904297 + }, + { + "auxiliary_loss_clip": 0.01104064, + "auxiliary_loss_mlp": 0.01029221, + "balance_loss_clip": 1.01774669, + "balance_loss_mlp": 1.03638661, + "epoch": 0.6675484743724636, + "flos": 19609668581760.0, + "grad_norm": 1.4777736440637246, + "language_loss": 0.84276104, + "learning_rate": 1.051469068021034e-06, + "loss": 0.8640939, + "num_input_tokens_seen": 239794605, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 11103, + "time_per_iteration": 2.430427074432373 + }, + { + "auxiliary_loss_clip": 0.01104082, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.01338315, + "balance_loss_mlp": 1.03620505, + "epoch": 0.6676085976251315, + "flos": 14319452482560.0, + "grad_norm": 2.187100835254228, + "language_loss": 0.77906835, + "learning_rate": 1.0511262109971668e-06, + "loss": 0.80035502, + "num_input_tokens_seen": 239812135, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 11104, + "time_per_iteration": 2.431415557861328 + }, + { + "auxiliary_loss_clip": 0.01105832, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.01554644, + "balance_loss_mlp": 1.03741312, + "epoch": 0.6676687208777995, + "flos": 38104596529920.0, + "grad_norm": 1.7588653188886298, + "language_loss": 0.58123207, + "learning_rate": 1.0507833899546889e-06, + "loss": 0.60256052, + "num_input_tokens_seen": 239835845, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 11105, + "time_per_iteration": 2.5778300762176514 + }, + { + "auxiliary_loss_clip": 0.01107319, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.01864648, + "balance_loss_mlp": 1.0369904, + "epoch": 0.6677288441304675, + "flos": 23981576549760.0, + "grad_norm": 1.5584285162619382, + "language_loss": 0.73263156, + "learning_rate": 1.0504406049066e-06, + "loss": 0.75401342, + "num_input_tokens_seen": 239853820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 11106, + "time_per_iteration": 2.502669334411621 + }, + { + "auxiliary_loss_clip": 0.01102707, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.01568878, + "balance_loss_mlp": 1.03582263, + "epoch": 0.6677889673831354, + "flos": 24170682677760.0, + "grad_norm": 1.612792210414072, + "language_loss": 0.77103424, + "learning_rate": 1.0500978558659e-06, + "loss": 0.7923367, + "num_input_tokens_seen": 239873365, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11107, + "time_per_iteration": 2.4632906913757324 + }, + { + "auxiliary_loss_clip": 0.01098872, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01636124, + "balance_loss_mlp": 1.03531408, + "epoch": 0.6678490906358034, + "flos": 22309648145280.0, + "grad_norm": 2.2458320549685267, + "language_loss": 0.89908957, + "learning_rate": 1.049755142845583e-06, + "loss": 0.92035359, + "num_input_tokens_seen": 239891215, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 11108, + "time_per_iteration": 2.4730093479156494 + }, + { + "auxiliary_loss_clip": 0.01099015, + "auxiliary_loss_mlp": 0.01022867, + "balance_loss_clip": 1.01253176, + "balance_loss_mlp": 1.03418517, + "epoch": 0.6679092138884714, + "flos": 36898752026880.0, + "grad_norm": 1.3985533807105044, + "language_loss": 0.82679069, + "learning_rate": 1.049412465858646e-06, + "loss": 0.84800953, + "num_input_tokens_seen": 239913490, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 11109, + "time_per_iteration": 2.580944061279297 + }, + { + "auxiliary_loss_clip": 0.01102598, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.01666141, + "balance_loss_mlp": 1.03557515, + "epoch": 0.6679693371411394, + "flos": 18150294908160.0, + "grad_norm": 1.8119039289749856, + "language_loss": 0.69528979, + "learning_rate": 1.0490698249180847e-06, + "loss": 0.71660185, + "num_input_tokens_seen": 239931565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11110, + "time_per_iteration": 2.5149457454681396 + }, + { + "auxiliary_loss_clip": 0.01106029, + "auxiliary_loss_mlp": 0.01032156, + "balance_loss_clip": 1.01944864, + "balance_loss_mlp": 1.03632832, + "epoch": 0.6680294603938073, + "flos": 27198167472000.0, + "grad_norm": 1.7594532626452621, + "language_loss": 0.7338779, + "learning_rate": 1.04872722003689e-06, + "loss": 0.75525975, + "num_input_tokens_seen": 239952395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11111, + "time_per_iteration": 2.481405258178711 + }, + { + "auxiliary_loss_clip": 0.01097972, + "auxiliary_loss_mlp": 0.01026013, + "balance_loss_clip": 1.01508117, + "balance_loss_mlp": 1.03355026, + "epoch": 0.6680895836464753, + "flos": 21725309692800.0, + "grad_norm": 3.2736780286979488, + "language_loss": 0.64989609, + "learning_rate": 1.0483846512280553e-06, + "loss": 0.6711359, + "num_input_tokens_seen": 239968910, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 11112, + "time_per_iteration": 2.452441930770874 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.01734865, + "balance_loss_mlp": 1.03562021, + "epoch": 0.6681497068991432, + "flos": 19646477043840.0, + "grad_norm": 1.7892928589109056, + "language_loss": 0.63786232, + "learning_rate": 1.048042118504569e-06, + "loss": 0.65917462, + "num_input_tokens_seen": 239987680, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11113, + "time_per_iteration": 2.4086506366729736 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.02059317, + "balance_loss_mlp": 1.03552222, + "epoch": 0.6682098301518112, + "flos": 17419153570560.0, + "grad_norm": 1.8981901836856618, + "language_loss": 0.66016996, + "learning_rate": 1.047699621879422e-06, + "loss": 0.6814909, + "num_input_tokens_seen": 240005790, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 11114, + "time_per_iteration": 2.4347803592681885 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.02107906, + "balance_loss_mlp": 1.03480756, + "epoch": 0.6682699534044791, + "flos": 22599016110720.0, + "grad_norm": 1.6451209195544332, + "language_loss": 0.78455061, + "learning_rate": 1.0473571613655998e-06, + "loss": 0.80589175, + "num_input_tokens_seen": 240025895, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11115, + "time_per_iteration": 2.478957414627075 + }, + { + "auxiliary_loss_clip": 0.01101464, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.01758313, + "balance_loss_mlp": 1.03418374, + "epoch": 0.6683300766571472, + "flos": 24863686750080.0, + "grad_norm": 1.607299826888502, + "language_loss": 0.79468185, + "learning_rate": 1.0470147369760896e-06, + "loss": 0.81598711, + "num_input_tokens_seen": 240044880, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11116, + "time_per_iteration": 2.5263917446136475 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.01905489, + "balance_loss_mlp": 1.03642249, + "epoch": 0.6683901999098151, + "flos": 27126633536640.0, + "grad_norm": 1.793058561798458, + "language_loss": 0.79410267, + "learning_rate": 1.0466723487238768e-06, + "loss": 0.81546414, + "num_input_tokens_seen": 240065785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 11117, + "time_per_iteration": 2.4854443073272705 + }, + { + "auxiliary_loss_clip": 0.01105696, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.01769769, + "balance_loss_mlp": 1.03675961, + "epoch": 0.6684503231624831, + "flos": 20739023072640.0, + "grad_norm": 1.507325638356248, + "language_loss": 0.65411663, + "learning_rate": 1.0463299966219441e-06, + "loss": 0.67548382, + "num_input_tokens_seen": 240085130, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 11118, + "time_per_iteration": 2.472377300262451 + }, + { + "auxiliary_loss_clip": 0.01100857, + "auxiliary_loss_mlp": 0.01028801, + "balance_loss_clip": 1.01816726, + "balance_loss_mlp": 1.03583932, + "epoch": 0.668510446415151, + "flos": 21762189982080.0, + "grad_norm": 2.967647334244501, + "language_loss": 0.68711627, + "learning_rate": 1.0459876806832727e-06, + "loss": 0.70841289, + "num_input_tokens_seen": 240105495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11119, + "time_per_iteration": 2.4728288650512695 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01026404, + "balance_loss_clip": 1.01497746, + "balance_loss_mlp": 1.03634501, + "epoch": 0.668570569667819, + "flos": 30191250015360.0, + "grad_norm": 1.5996077334078893, + "language_loss": 0.66828573, + "learning_rate": 1.0456454009208448e-06, + "loss": 0.68957436, + "num_input_tokens_seen": 240125455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11120, + "time_per_iteration": 2.546515941619873 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.01742911, + "balance_loss_mlp": 1.03602421, + "epoch": 0.668630692920487, + "flos": 24170646764160.0, + "grad_norm": 1.762800604873663, + "language_loss": 0.72149706, + "learning_rate": 1.045303157347638e-06, + "loss": 0.7428214, + "num_input_tokens_seen": 240143870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 11121, + "time_per_iteration": 2.477660894393921 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01034095, + "balance_loss_clip": 1.02209687, + "balance_loss_mlp": 1.0351814, + "epoch": 0.668690816173155, + "flos": 17457147181440.0, + "grad_norm": 2.849050741943763, + "language_loss": 0.70147824, + "learning_rate": 1.0449609499766316e-06, + "loss": 0.72285533, + "num_input_tokens_seen": 240161020, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11122, + "time_per_iteration": 2.490941286087036 + }, + { + "auxiliary_loss_clip": 0.01103575, + "auxiliary_loss_mlp": 0.01035456, + "balance_loss_clip": 1.02377343, + "balance_loss_mlp": 1.03655457, + "epoch": 0.668750939425823, + "flos": 25005102595200.0, + "grad_norm": 1.6701786551201399, + "language_loss": 0.71671915, + "learning_rate": 1.0446187788208015e-06, + "loss": 0.73810941, + "num_input_tokens_seen": 240179820, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11123, + "time_per_iteration": 2.4819095134735107 + }, + { + "auxiliary_loss_clip": 0.01108577, + "auxiliary_loss_mlp": 0.01035224, + "balance_loss_clip": 1.02273631, + "balance_loss_mlp": 1.0392499, + "epoch": 0.6688110626784909, + "flos": 24096778444800.0, + "grad_norm": 1.6154595834142065, + "language_loss": 0.79180294, + "learning_rate": 1.0442766438931244e-06, + "loss": 0.81324089, + "num_input_tokens_seen": 240200130, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 11124, + "time_per_iteration": 2.4734344482421875 + }, + { + "auxiliary_loss_clip": 0.0110496, + "auxiliary_loss_mlp": 0.01036149, + "balance_loss_clip": 1.02444232, + "balance_loss_mlp": 1.03757286, + "epoch": 0.6688711859311589, + "flos": 21759532375680.0, + "grad_norm": 1.7495803882819345, + "language_loss": 0.74282473, + "learning_rate": 1.0439345452065716e-06, + "loss": 0.76423579, + "num_input_tokens_seen": 240217945, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11125, + "time_per_iteration": 2.444687843322754 + }, + { + "auxiliary_loss_clip": 0.01105662, + "auxiliary_loss_mlp": 0.01034569, + "balance_loss_clip": 1.0227133, + "balance_loss_mlp": 1.03771114, + "epoch": 0.6689313091838268, + "flos": 22929645824640.0, + "grad_norm": 2.3220485163353035, + "language_loss": 0.66047573, + "learning_rate": 1.043592482774116e-06, + "loss": 0.68187803, + "num_input_tokens_seen": 240237220, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 11126, + "time_per_iteration": 2.508352756500244 + }, + { + "auxiliary_loss_clip": 0.01100528, + "auxiliary_loss_mlp": 0.01023616, + "balance_loss_clip": 1.01235676, + "balance_loss_mlp": 1.03333449, + "epoch": 0.6689914324364948, + "flos": 20886149180160.0, + "grad_norm": 3.2519975932516094, + "language_loss": 0.71248001, + "learning_rate": 1.0432504566087305e-06, + "loss": 0.73372149, + "num_input_tokens_seen": 240256000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11127, + "time_per_iteration": 2.4746527671813965 + }, + { + "auxiliary_loss_clip": 0.01106513, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.01952958, + "balance_loss_mlp": 1.03555894, + "epoch": 0.6690515556891627, + "flos": 22748225207040.0, + "grad_norm": 2.0140192417842235, + "language_loss": 0.80290639, + "learning_rate": 1.0429084667233827e-06, + "loss": 0.82429767, + "num_input_tokens_seen": 240275845, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.7109375, + "step": 11128, + "time_per_iteration": 2.476914644241333 + }, + { + "auxiliary_loss_clip": 0.0110363, + "auxiliary_loss_mlp": 0.01029193, + "balance_loss_clip": 1.01713467, + "balance_loss_mlp": 1.03555393, + "epoch": 0.6691116789418308, + "flos": 23331450337920.0, + "grad_norm": 2.0449515592271967, + "language_loss": 0.81091756, + "learning_rate": 1.0425665131310427e-06, + "loss": 0.83224577, + "num_input_tokens_seen": 240294095, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11129, + "time_per_iteration": 2.457526922225952 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01033459, + "balance_loss_clip": 1.02288556, + "balance_loss_mlp": 1.0350548, + "epoch": 0.6691718021944987, + "flos": 32447014081920.0, + "grad_norm": 1.6204282208074086, + "language_loss": 0.70266747, + "learning_rate": 1.0422245958446762e-06, + "loss": 0.72398651, + "num_input_tokens_seen": 240313460, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11130, + "time_per_iteration": 2.5508627891540527 + }, + { + "auxiliary_loss_clip": 0.01100261, + "auxiliary_loss_mlp": 0.01034692, + "balance_loss_clip": 1.02409458, + "balance_loss_mlp": 1.03609157, + "epoch": 0.6692319254471667, + "flos": 23731602825600.0, + "grad_norm": 1.5850862701658837, + "language_loss": 0.70004213, + "learning_rate": 1.0418827148772486e-06, + "loss": 0.72139168, + "num_input_tokens_seen": 240333540, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11131, + "time_per_iteration": 2.442675828933716 + }, + { + "auxiliary_loss_clip": 0.01103504, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.01562405, + "balance_loss_mlp": 1.03573704, + "epoch": 0.6692920486998346, + "flos": 14427902620800.0, + "grad_norm": 2.456945083607925, + "language_loss": 0.65068108, + "learning_rate": 1.0415408702417243e-06, + "loss": 0.67200017, + "num_input_tokens_seen": 240350085, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 11132, + "time_per_iteration": 2.4112234115600586 + }, + { + "auxiliary_loss_clip": 0.01105597, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.0216949, + "balance_loss_mlp": 1.03693044, + "epoch": 0.6693521719525026, + "flos": 21507475662720.0, + "grad_norm": 1.6075137482523445, + "language_loss": 0.74700105, + "learning_rate": 1.0411990619510661e-06, + "loss": 0.76840317, + "num_input_tokens_seen": 240370015, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11133, + "time_per_iteration": 2.4416236877441406 + }, + { + "auxiliary_loss_clip": 0.01109475, + "auxiliary_loss_mlp": 0.01030464, + "balance_loss_clip": 1.01720238, + "balance_loss_mlp": 1.03926897, + "epoch": 0.6694122952051706, + "flos": 25406943022080.0, + "grad_norm": 2.3633346892670266, + "language_loss": 0.66337103, + "learning_rate": 1.0408572900182363e-06, + "loss": 0.68477046, + "num_input_tokens_seen": 240390770, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 11134, + "time_per_iteration": 2.4672107696533203 + }, + { + "auxiliary_loss_clip": 0.0111221, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.02098703, + "balance_loss_mlp": 1.03965247, + "epoch": 0.6694724184578386, + "flos": 25661729168640.0, + "grad_norm": 1.8392889149756566, + "language_loss": 0.77132189, + "learning_rate": 1.0405155544561943e-06, + "loss": 0.79278213, + "num_input_tokens_seen": 240409590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7265625, + "step": 11135, + "time_per_iteration": 2.4986488819122314 + }, + { + "auxiliary_loss_clip": 0.01101077, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01987875, + "balance_loss_mlp": 1.03573108, + "epoch": 0.6695325417105066, + "flos": 17709311635200.0, + "grad_norm": 1.766175864119674, + "language_loss": 0.74168599, + "learning_rate": 1.040173855277898e-06, + "loss": 0.76301408, + "num_input_tokens_seen": 240428180, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11136, + "time_per_iteration": 3.892975091934204 + }, + { + "auxiliary_loss_clip": 0.01108465, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0182538, + "balance_loss_mlp": 1.03819919, + "epoch": 0.6695926649631745, + "flos": 24460050643200.0, + "grad_norm": 1.743373004526595, + "language_loss": 0.62210536, + "learning_rate": 1.0398321924963061e-06, + "loss": 0.643498, + "num_input_tokens_seen": 240447815, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11137, + "time_per_iteration": 2.4584341049194336 + }, + { + "auxiliary_loss_clip": 0.01102957, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01840353, + "balance_loss_mlp": 1.03640008, + "epoch": 0.6696527882158425, + "flos": 24280138396800.0, + "grad_norm": 2.2042949503897837, + "language_loss": 0.65724766, + "learning_rate": 1.0394905661243724e-06, + "loss": 0.67858124, + "num_input_tokens_seen": 240468635, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11138, + "time_per_iteration": 4.031554460525513 + }, + { + "auxiliary_loss_clip": 0.01099165, + "auxiliary_loss_mlp": 0.01033153, + "balance_loss_clip": 1.02175677, + "balance_loss_mlp": 1.03467035, + "epoch": 0.6697129114685104, + "flos": 23002759958400.0, + "grad_norm": 1.5685975938909107, + "language_loss": 0.73056483, + "learning_rate": 1.039148976175053e-06, + "loss": 0.75188804, + "num_input_tokens_seen": 240488550, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 11139, + "time_per_iteration": 2.490262746810913 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.0102889, + "balance_loss_clip": 1.01842916, + "balance_loss_mlp": 1.0357821, + "epoch": 0.6697730347211784, + "flos": 22638123043200.0, + "grad_norm": 3.192057111781844, + "language_loss": 0.70166105, + "learning_rate": 1.0388074226613016e-06, + "loss": 0.72294366, + "num_input_tokens_seen": 240508330, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 11140, + "time_per_iteration": 3.9318604469299316 + }, + { + "auxiliary_loss_clip": 0.01104563, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.01471996, + "balance_loss_mlp": 1.03500891, + "epoch": 0.6698331579738463, + "flos": 28877242682880.0, + "grad_norm": 3.669311669995305, + "language_loss": 0.75779974, + "learning_rate": 1.0384659055960691e-06, + "loss": 0.77911294, + "num_input_tokens_seen": 240528470, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11141, + "time_per_iteration": 2.516190767288208 + }, + { + "auxiliary_loss_clip": 0.01103882, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.0225563, + "balance_loss_mlp": 1.03589845, + "epoch": 0.6698932812265144, + "flos": 24207096090240.0, + "grad_norm": 1.7275630939402262, + "language_loss": 0.82025433, + "learning_rate": 1.0381244249923052e-06, + "loss": 0.84164113, + "num_input_tokens_seen": 240547815, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11142, + "time_per_iteration": 2.477917432785034 + }, + { + "auxiliary_loss_clip": 0.01099057, + "auxiliary_loss_mlp": 0.0102729, + "balance_loss_clip": 1.01569653, + "balance_loss_mlp": 1.0331465, + "epoch": 0.6699534044791823, + "flos": 22090269830400.0, + "grad_norm": 1.5656493432889642, + "language_loss": 0.70054591, + "learning_rate": 1.037782980862959e-06, + "loss": 0.72180939, + "num_input_tokens_seen": 240567765, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 11143, + "time_per_iteration": 2.496873617172241 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01031347, + "balance_loss_clip": 1.02056444, + "balance_loss_mlp": 1.03546476, + "epoch": 0.6700135277318503, + "flos": 25192377129600.0, + "grad_norm": 1.5042984772488615, + "language_loss": 0.69867527, + "learning_rate": 1.0374415732209796e-06, + "loss": 0.71998119, + "num_input_tokens_seen": 240590750, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 11144, + "time_per_iteration": 2.498004674911499 + }, + { + "auxiliary_loss_clip": 0.01101313, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.01553071, + "balance_loss_mlp": 1.03556204, + "epoch": 0.6700736509845182, + "flos": 23440187784960.0, + "grad_norm": 1.7755943554148508, + "language_loss": 0.74376822, + "learning_rate": 1.0371002020793114e-06, + "loss": 0.76505524, + "num_input_tokens_seen": 240608875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11145, + "time_per_iteration": 2.482536554336548 + }, + { + "auxiliary_loss_clip": 0.01105558, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.01503921, + "balance_loss_mlp": 1.03620577, + "epoch": 0.6701337742371862, + "flos": 24389953251840.0, + "grad_norm": 1.7672711788536422, + "language_loss": 0.70669931, + "learning_rate": 1.0367588674509008e-06, + "loss": 0.72802681, + "num_input_tokens_seen": 240628565, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 11146, + "time_per_iteration": 2.480379819869995 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01029358, + "balance_loss_clip": 1.01786661, + "balance_loss_mlp": 1.03490484, + "epoch": 0.6701938974898543, + "flos": 14793652857600.0, + "grad_norm": 1.8854886897083816, + "language_loss": 0.7791847, + "learning_rate": 1.0364175693486905e-06, + "loss": 0.80045938, + "num_input_tokens_seen": 240646325, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 11147, + "time_per_iteration": 2.4453067779541016 + }, + { + "auxiliary_loss_clip": 0.01104074, + "auxiliary_loss_mlp": 0.01033623, + "balance_loss_clip": 1.02177358, + "balance_loss_mlp": 1.03823161, + "epoch": 0.6702540207425222, + "flos": 20154002261760.0, + "grad_norm": 1.9489637728749547, + "language_loss": 0.70395339, + "learning_rate": 1.0360763077856218e-06, + "loss": 0.72533029, + "num_input_tokens_seen": 240666145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11148, + "time_per_iteration": 2.4539880752563477 + }, + { + "auxiliary_loss_clip": 0.0110278, + "auxiliary_loss_mlp": 0.01032686, + "balance_loss_clip": 1.0209502, + "balance_loss_mlp": 1.03479636, + "epoch": 0.6703141439951902, + "flos": 21214157201280.0, + "grad_norm": 1.6874385150714277, + "language_loss": 0.70091569, + "learning_rate": 1.035735082774636e-06, + "loss": 0.72227037, + "num_input_tokens_seen": 240685570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11149, + "time_per_iteration": 2.5368881225585938 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01025738, + "balance_loss_clip": 1.01511049, + "balance_loss_mlp": 1.03425717, + "epoch": 0.6703742672478581, + "flos": 23112538899840.0, + "grad_norm": 2.0651183620740405, + "language_loss": 0.7356782, + "learning_rate": 1.0353938943286727e-06, + "loss": 0.75695598, + "num_input_tokens_seen": 240706945, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6796875, + "step": 11150, + "time_per_iteration": 2.489635944366455 + }, + { + "auxiliary_loss_clip": 0.01104117, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02237308, + "balance_loss_mlp": 1.03698301, + "epoch": 0.6704343905005261, + "flos": 22528918719360.0, + "grad_norm": 1.8066986470751747, + "language_loss": 0.7880882, + "learning_rate": 1.035052742460671e-06, + "loss": 0.80947053, + "num_input_tokens_seen": 240727990, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11151, + "time_per_iteration": 2.470423698425293 + }, + { + "auxiliary_loss_clip": 0.01028384, + "auxiliary_loss_mlp": 0.01010518, + "balance_loss_clip": 1.00938594, + "balance_loss_mlp": 1.00781679, + "epoch": 0.670494513753194, + "flos": 64793158773120.0, + "grad_norm": 0.8172638110433008, + "language_loss": 0.55524588, + "learning_rate": 1.0347116271835643e-06, + "loss": 0.57563496, + "num_input_tokens_seen": 240790380, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.20507812, + "step": 11152, + "time_per_iteration": 3.123234510421753 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01030614, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.03580236, + "epoch": 0.670554637005862, + "flos": 23511506238720.0, + "grad_norm": 1.6208942555378636, + "language_loss": 0.80739468, + "learning_rate": 1.0343705485102896e-06, + "loss": 0.82873851, + "num_input_tokens_seen": 240811545, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11153, + "time_per_iteration": 2.511383533477783 + }, + { + "auxiliary_loss_clip": 0.01102109, + "auxiliary_loss_mlp": 0.0103123, + "balance_loss_clip": 1.0203166, + "balance_loss_mlp": 1.03519535, + "epoch": 0.67061476025853, + "flos": 19463404400640.0, + "grad_norm": 1.5741743783633508, + "language_loss": 0.76160783, + "learning_rate": 1.0340295064537814e-06, + "loss": 0.78294122, + "num_input_tokens_seen": 240831380, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.671875, + "step": 11154, + "time_per_iteration": 2.453047513961792 + }, + { + "auxiliary_loss_clip": 0.01108949, + "auxiliary_loss_mlp": 0.01032067, + "balance_loss_clip": 1.02030683, + "balance_loss_mlp": 1.03847241, + "epoch": 0.670674883511198, + "flos": 20519967980160.0, + "grad_norm": 1.4962510781515113, + "language_loss": 0.75975895, + "learning_rate": 1.0336885010269702e-06, + "loss": 0.78116906, + "num_input_tokens_seen": 240851855, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.703125, + "step": 11155, + "time_per_iteration": 2.507368564605713 + }, + { + "auxiliary_loss_clip": 0.01105615, + "auxiliary_loss_mlp": 0.01034109, + "balance_loss_clip": 1.02230704, + "balance_loss_mlp": 1.03825569, + "epoch": 0.6707350067638659, + "flos": 25483971738240.0, + "grad_norm": 2.76266123008703, + "language_loss": 0.81881839, + "learning_rate": 1.0333475322427878e-06, + "loss": 0.84021568, + "num_input_tokens_seen": 240869980, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 11156, + "time_per_iteration": 2.467165946960449 + }, + { + "auxiliary_loss_clip": 0.0110068, + "auxiliary_loss_mlp": 0.01025682, + "balance_loss_clip": 1.01425576, + "balance_loss_mlp": 1.03438997, + "epoch": 0.6707951300165339, + "flos": 22273450214400.0, + "grad_norm": 2.4473397037337237, + "language_loss": 0.74570251, + "learning_rate": 1.033006600114165e-06, + "loss": 0.7669661, + "num_input_tokens_seen": 240888680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11157, + "time_per_iteration": 2.4674718379974365 + }, + { + "auxiliary_loss_clip": 0.01105952, + "auxiliary_loss_mlp": 0.01035415, + "balance_loss_clip": 1.02370262, + "balance_loss_mlp": 1.03829253, + "epoch": 0.6708552532692018, + "flos": 23984593292160.0, + "grad_norm": 1.9350697498335474, + "language_loss": 0.7444576, + "learning_rate": 1.0326657046540282e-06, + "loss": 0.76587129, + "num_input_tokens_seen": 240909050, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11158, + "time_per_iteration": 2.4784538745880127 + }, + { + "auxiliary_loss_clip": 0.01108128, + "auxiliary_loss_mlp": 0.01030262, + "balance_loss_clip": 1.0180732, + "balance_loss_mlp": 1.0385921, + "epoch": 0.6709153765218698, + "flos": 24937519155840.0, + "grad_norm": 2.077178366394848, + "language_loss": 0.81668246, + "learning_rate": 1.0323248458753044e-06, + "loss": 0.83806634, + "num_input_tokens_seen": 240930035, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 11159, + "time_per_iteration": 2.476008653640747 + }, + { + "auxiliary_loss_clip": 0.01102735, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.01749814, + "balance_loss_mlp": 1.0353272, + "epoch": 0.6709754997745379, + "flos": 17530225401600.0, + "grad_norm": 1.6091286648822523, + "language_loss": 0.7708782, + "learning_rate": 1.0319840237909193e-06, + "loss": 0.79219836, + "num_input_tokens_seen": 240948895, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 11160, + "time_per_iteration": 2.4390769004821777 + }, + { + "auxiliary_loss_clip": 0.01101773, + "auxiliary_loss_mlp": 0.01026605, + "balance_loss_clip": 1.01520884, + "balance_loss_mlp": 1.03558326, + "epoch": 0.6710356230272058, + "flos": 22090880361600.0, + "grad_norm": 1.9005005299223583, + "language_loss": 0.73766249, + "learning_rate": 1.0316432384137978e-06, + "loss": 0.7589463, + "num_input_tokens_seen": 240967770, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11161, + "time_per_iteration": 2.5078043937683105 + }, + { + "auxiliary_loss_clip": 0.01105932, + "auxiliary_loss_mlp": 0.01035471, + "balance_loss_clip": 1.0230794, + "balance_loss_mlp": 1.03523338, + "epoch": 0.6710957462798738, + "flos": 24206449645440.0, + "grad_norm": 1.6945637244101817, + "language_loss": 0.67987847, + "learning_rate": 1.0313024897568618e-06, + "loss": 0.70129251, + "num_input_tokens_seen": 240988985, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 11162, + "time_per_iteration": 2.5096116065979004 + }, + { + "auxiliary_loss_clip": 0.01102024, + "auxiliary_loss_mlp": 0.01032663, + "balance_loss_clip": 1.02136803, + "balance_loss_mlp": 1.03582597, + "epoch": 0.6711558695325417, + "flos": 19093955063040.0, + "grad_norm": 1.8281474305298504, + "language_loss": 0.70357502, + "learning_rate": 1.030961777833032e-06, + "loss": 0.72492194, + "num_input_tokens_seen": 241005455, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11163, + "time_per_iteration": 2.4419682025909424 + }, + { + "auxiliary_loss_clip": 0.01101063, + "auxiliary_loss_mlp": 0.01029783, + "balance_loss_clip": 1.01849425, + "balance_loss_mlp": 1.03680897, + "epoch": 0.6712159927852097, + "flos": 25557875971200.0, + "grad_norm": 1.5206709527115365, + "language_loss": 0.75686288, + "learning_rate": 1.0306211026552291e-06, + "loss": 0.7781713, + "num_input_tokens_seen": 241026175, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 11164, + "time_per_iteration": 2.540302276611328 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01027109, + "balance_loss_clip": 1.01537251, + "balance_loss_mlp": 1.03613234, + "epoch": 0.6712761160378776, + "flos": 22228812587520.0, + "grad_norm": 2.0013900075408424, + "language_loss": 0.64903474, + "learning_rate": 1.0302804642363704e-06, + "loss": 0.67032778, + "num_input_tokens_seen": 241044040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 11165, + "time_per_iteration": 2.50164532661438 + }, + { + "auxiliary_loss_clip": 0.01101735, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.0188911, + "balance_loss_mlp": 1.03648162, + "epoch": 0.6713362392905456, + "flos": 22455517276800.0, + "grad_norm": 2.824490258261556, + "language_loss": 0.71357495, + "learning_rate": 1.0299398625893738e-06, + "loss": 0.73489726, + "num_input_tokens_seen": 241063615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 11166, + "time_per_iteration": 2.4522786140441895 + }, + { + "auxiliary_loss_clip": 0.01102027, + "auxiliary_loss_mlp": 0.0102352, + "balance_loss_clip": 1.01324987, + "balance_loss_mlp": 1.0378294, + "epoch": 0.6713963625432136, + "flos": 25630200005760.0, + "grad_norm": 1.8136989987191092, + "language_loss": 0.77263552, + "learning_rate": 1.0295992977271546e-06, + "loss": 0.79389095, + "num_input_tokens_seen": 241082520, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.640625, + "step": 11167, + "time_per_iteration": 2.5255751609802246 + }, + { + "auxiliary_loss_clip": 0.01101953, + "auxiliary_loss_mlp": 0.0103477, + "balance_loss_clip": 1.02347469, + "balance_loss_mlp": 1.03458977, + "epoch": 0.6714564857958816, + "flos": 35006475640320.0, + "grad_norm": 5.373120607190098, + "language_loss": 0.69078279, + "learning_rate": 1.029258769662629e-06, + "loss": 0.71215004, + "num_input_tokens_seen": 241103505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 11168, + "time_per_iteration": 2.5593607425689697 + }, + { + "auxiliary_loss_clip": 0.01105965, + "auxiliary_loss_mlp": 0.01038556, + "balance_loss_clip": 1.02552032, + "balance_loss_mlp": 1.036659, + "epoch": 0.6715166090485495, + "flos": 26279931168000.0, + "grad_norm": 1.891897557253962, + "language_loss": 0.73191148, + "learning_rate": 1.0289182784087068e-06, + "loss": 0.7533567, + "num_input_tokens_seen": 241122885, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 11169, + "time_per_iteration": 2.4835712909698486 + }, + { + "auxiliary_loss_clip": 0.01104514, + "auxiliary_loss_mlp": 0.01031792, + "balance_loss_clip": 1.01917934, + "balance_loss_mlp": 1.03605962, + "epoch": 0.6715767323012175, + "flos": 15924156583680.0, + "grad_norm": 2.050492769021052, + "language_loss": 0.76193798, + "learning_rate": 1.0285778239783005e-06, + "loss": 0.78330112, + "num_input_tokens_seen": 241140865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11170, + "time_per_iteration": 2.41772723197937 + }, + { + "auxiliary_loss_clip": 0.01106509, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01618147, + "balance_loss_mlp": 1.03668404, + "epoch": 0.6716368555538854, + "flos": 17491441691520.0, + "grad_norm": 4.365942833040682, + "language_loss": 0.74738538, + "learning_rate": 1.0282374063843212e-06, + "loss": 0.768731, + "num_input_tokens_seen": 241158225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 11171, + "time_per_iteration": 2.443998336791992 + }, + { + "auxiliary_loss_clip": 0.01104887, + "auxiliary_loss_mlp": 0.01037908, + "balance_loss_clip": 1.02605891, + "balance_loss_mlp": 1.03686571, + "epoch": 0.6716969788065534, + "flos": 16761521416320.0, + "grad_norm": 6.401963753530839, + "language_loss": 0.86554527, + "learning_rate": 1.0278970256396762e-06, + "loss": 0.88697314, + "num_input_tokens_seen": 241175215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 11172, + "time_per_iteration": 2.449519395828247 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02077198, + "balance_loss_mlp": 1.03432322, + "epoch": 0.6717571020592215, + "flos": 22709800632960.0, + "grad_norm": 1.5214923385952612, + "language_loss": 0.63705564, + "learning_rate": 1.0275566817572733e-06, + "loss": 0.65839112, + "num_input_tokens_seen": 241195250, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11173, + "time_per_iteration": 2.4728994369506836 + }, + { + "auxiliary_loss_clip": 0.01108562, + "auxiliary_loss_mlp": 0.01035239, + "balance_loss_clip": 1.02201891, + "balance_loss_mlp": 1.03632855, + "epoch": 0.6718172253118894, + "flos": 18734094656640.0, + "grad_norm": 2.8011577390317584, + "language_loss": 0.71934807, + "learning_rate": 1.02721637475002e-06, + "loss": 0.74078608, + "num_input_tokens_seen": 241210720, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 11174, + "time_per_iteration": 2.4150753021240234 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01029318, + "balance_loss_clip": 1.01791573, + "balance_loss_mlp": 1.03507197, + "epoch": 0.6718773485645574, + "flos": 15632526061440.0, + "grad_norm": 1.9034241424773972, + "language_loss": 0.68639195, + "learning_rate": 1.0268761046308178e-06, + "loss": 0.70769107, + "num_input_tokens_seen": 241227395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11175, + "time_per_iteration": 2.4914746284484863 + }, + { + "auxiliary_loss_clip": 0.01101682, + "auxiliary_loss_mlp": 0.0103252, + "balance_loss_clip": 1.02143312, + "balance_loss_mlp": 1.0366466, + "epoch": 0.6719374718172253, + "flos": 19354774694400.0, + "grad_norm": 2.444826411678876, + "language_loss": 0.73786706, + "learning_rate": 1.0265358714125714e-06, + "loss": 0.7592091, + "num_input_tokens_seen": 241246355, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 11176, + "time_per_iteration": 2.4306447505950928 + }, + { + "auxiliary_loss_clip": 0.01104157, + "auxiliary_loss_mlp": 0.01028322, + "balance_loss_clip": 1.01596642, + "balance_loss_mlp": 1.035748, + "epoch": 0.6719975950698933, + "flos": 21981316901760.0, + "grad_norm": 1.6959341450848686, + "language_loss": 0.72810507, + "learning_rate": 1.026195675108182e-06, + "loss": 0.74942982, + "num_input_tokens_seen": 241264180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 11177, + "time_per_iteration": 2.498624086380005 + }, + { + "auxiliary_loss_clip": 0.01103405, + "auxiliary_loss_mlp": 0.010286, + "balance_loss_clip": 1.01617265, + "balance_loss_mlp": 1.0354104, + "epoch": 0.6720577183225612, + "flos": 25228072270080.0, + "grad_norm": 2.080774174197305, + "language_loss": 0.76790631, + "learning_rate": 1.025855515730551e-06, + "loss": 0.78922629, + "num_input_tokens_seen": 241282245, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11178, + "time_per_iteration": 3.880969524383545 + }, + { + "auxiliary_loss_clip": 0.01105896, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.01926351, + "balance_loss_mlp": 1.0375278, + "epoch": 0.6721178415752292, + "flos": 16945886949120.0, + "grad_norm": 1.9975121194491492, + "language_loss": 0.69893503, + "learning_rate": 1.0255153932925766e-06, + "loss": 0.72029757, + "num_input_tokens_seen": 241300745, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.68359375, + "step": 11179, + "time_per_iteration": 2.4223077297210693 + }, + { + "auxiliary_loss_clip": 0.01102153, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.01767302, + "balance_loss_mlp": 1.03676152, + "epoch": 0.6721779648278972, + "flos": 21541375123200.0, + "grad_norm": 1.6665783443252085, + "language_loss": 0.74105644, + "learning_rate": 1.0251753078071557e-06, + "loss": 0.76236838, + "num_input_tokens_seen": 241319320, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11180, + "time_per_iteration": 3.958832263946533 + }, + { + "auxiliary_loss_clip": 0.01102807, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.01544547, + "balance_loss_mlp": 1.03720415, + "epoch": 0.6722380880805652, + "flos": 22605444645120.0, + "grad_norm": 1.5017770160927022, + "language_loss": 0.75209451, + "learning_rate": 1.0248352592871848e-06, + "loss": 0.77339292, + "num_input_tokens_seen": 241342225, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 11181, + "time_per_iteration": 3.970757484436035 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01025138, + "balance_loss_clip": 1.01424241, + "balance_loss_mlp": 1.03628325, + "epoch": 0.6722982113332331, + "flos": 15925269905280.0, + "grad_norm": 1.9826713327422718, + "language_loss": 0.74716818, + "learning_rate": 1.0244952477455585e-06, + "loss": 0.76845884, + "num_input_tokens_seen": 241358240, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.67578125, + "step": 11182, + "time_per_iteration": 2.4164199829101562 + }, + { + "auxiliary_loss_clip": 0.01098753, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.0213666, + "balance_loss_mlp": 1.03483748, + "epoch": 0.6723583345859011, + "flos": 20596170683520.0, + "grad_norm": 1.6492155923055305, + "language_loss": 0.69678056, + "learning_rate": 1.0241552731951699e-06, + "loss": 0.71808994, + "num_input_tokens_seen": 241378420, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11183, + "time_per_iteration": 2.4825363159179688 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.01032724, + "balance_loss_clip": 1.02102327, + "balance_loss_mlp": 1.0350728, + "epoch": 0.672418457838569, + "flos": 21725848396800.0, + "grad_norm": 1.6819294722428546, + "language_loss": 0.77619171, + "learning_rate": 1.0238153356489112e-06, + "loss": 0.79753804, + "num_input_tokens_seen": 241397185, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 11184, + "time_per_iteration": 2.4742484092712402 + }, + { + "auxiliary_loss_clip": 0.01112997, + "auxiliary_loss_mlp": 0.01032631, + "balance_loss_clip": 1.01960111, + "balance_loss_mlp": 1.03978956, + "epoch": 0.672478581091237, + "flos": 21470379891840.0, + "grad_norm": 1.9600702886656058, + "language_loss": 0.65830189, + "learning_rate": 1.0234754351196743e-06, + "loss": 0.67975819, + "num_input_tokens_seen": 241415785, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.734375, + "step": 11185, + "time_per_iteration": 2.6265766620635986 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01555133, + "balance_loss_mlp": 1.03508019, + "epoch": 0.6725387043439051, + "flos": 30846763267200.0, + "grad_norm": 1.6086008561996032, + "language_loss": 0.8077392, + "learning_rate": 1.023135571620345e-06, + "loss": 0.82903898, + "num_input_tokens_seen": 241437390, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11186, + "time_per_iteration": 2.5254018306732178 + }, + { + "auxiliary_loss_clip": 0.0110242, + "auxiliary_loss_mlp": 0.01032446, + "balance_loss_clip": 1.02216411, + "balance_loss_mlp": 1.03798425, + "epoch": 0.672598827596573, + "flos": 24055947659520.0, + "grad_norm": 1.4050560740555764, + "language_loss": 0.8022958, + "learning_rate": 1.022795745163813e-06, + "loss": 0.82364446, + "num_input_tokens_seen": 241458085, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.64453125, + "step": 11187, + "time_per_iteration": 2.492206335067749 + }, + { + "auxiliary_loss_clip": 0.01108961, + "auxiliary_loss_mlp": 0.01035039, + "balance_loss_clip": 1.02235556, + "balance_loss_mlp": 1.03917003, + "epoch": 0.672658950849241, + "flos": 21871861182720.0, + "grad_norm": 2.0955662178616663, + "language_loss": 0.70936477, + "learning_rate": 1.022455955762965e-06, + "loss": 0.73080474, + "num_input_tokens_seen": 241476880, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11188, + "time_per_iteration": 2.4696547985076904 + }, + { + "auxiliary_loss_clip": 0.01100609, + "auxiliary_loss_mlp": 0.01029794, + "balance_loss_clip": 1.01867819, + "balance_loss_mlp": 1.03614163, + "epoch": 0.6727190741019089, + "flos": 23222102359680.0, + "grad_norm": 1.690433236478768, + "language_loss": 0.7567057, + "learning_rate": 1.0221162034306842e-06, + "loss": 0.77800977, + "num_input_tokens_seen": 241496535, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11189, + "time_per_iteration": 2.502394676208496 + }, + { + "auxiliary_loss_clip": 0.01105784, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.01342869, + "balance_loss_mlp": 1.03580916, + "epoch": 0.6727791973545769, + "flos": 15778610674560.0, + "grad_norm": 2.0624308015957666, + "language_loss": 0.75735819, + "learning_rate": 1.0217764881798562e-06, + "loss": 0.7786814, + "num_input_tokens_seen": 241513465, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 11190, + "time_per_iteration": 2.4117863178253174 + }, + { + "auxiliary_loss_clip": 0.01101643, + "auxiliary_loss_mlp": 0.01030612, + "balance_loss_clip": 1.01830935, + "balance_loss_mlp": 1.03503203, + "epoch": 0.6728393206072448, + "flos": 21249852341760.0, + "grad_norm": 1.479637189299754, + "language_loss": 0.77305663, + "learning_rate": 1.0214368100233612e-06, + "loss": 0.79437912, + "num_input_tokens_seen": 241534125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11191, + "time_per_iteration": 2.471383571624756 + }, + { + "auxiliary_loss_clip": 0.01101045, + "auxiliary_loss_mlp": 0.01027006, + "balance_loss_clip": 1.01542521, + "balance_loss_mlp": 1.03620696, + "epoch": 0.6728994438599128, + "flos": 32123279779200.0, + "grad_norm": 1.9484073900919987, + "language_loss": 0.86244619, + "learning_rate": 1.0210971689740802e-06, + "loss": 0.88372666, + "num_input_tokens_seen": 241556340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 11192, + "time_per_iteration": 2.541471481323242 + }, + { + "auxiliary_loss_clip": 0.01105869, + "auxiliary_loss_mlp": 0.01033893, + "balance_loss_clip": 1.02166843, + "balance_loss_mlp": 1.03793001, + "epoch": 0.6729595671125808, + "flos": 23112359331840.0, + "grad_norm": 1.7778605034576032, + "language_loss": 0.76010567, + "learning_rate": 1.0207575650448923e-06, + "loss": 0.78150332, + "num_input_tokens_seen": 241575185, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11193, + "time_per_iteration": 2.4631118774414062 + }, + { + "auxiliary_loss_clip": 0.01104222, + "auxiliary_loss_mlp": 0.01032099, + "balance_loss_clip": 1.0205301, + "balance_loss_mlp": 1.03698504, + "epoch": 0.6730196903652488, + "flos": 14611406227200.0, + "grad_norm": 1.7482449519435526, + "language_loss": 0.78450751, + "learning_rate": 1.0204179982486758e-06, + "loss": 0.80587071, + "num_input_tokens_seen": 241592970, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11194, + "time_per_iteration": 2.4163994789123535 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01602221, + "balance_loss_mlp": 1.03523183, + "epoch": 0.6730798136179167, + "flos": 21105922544640.0, + "grad_norm": 2.439402985037115, + "language_loss": 0.89769554, + "learning_rate": 1.0200784685983075e-06, + "loss": 0.91899562, + "num_input_tokens_seen": 241610245, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 11195, + "time_per_iteration": 2.4890894889831543 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.0103069, + "balance_loss_clip": 1.01964474, + "balance_loss_mlp": 1.03698754, + "epoch": 0.6731399368705847, + "flos": 28986267438720.0, + "grad_norm": 3.8315256645626468, + "language_loss": 0.7259835, + "learning_rate": 1.019738976106662e-06, + "loss": 0.74732834, + "num_input_tokens_seen": 241630350, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 11196, + "time_per_iteration": 2.4961941242218018 + }, + { + "auxiliary_loss_clip": 0.01026268, + "auxiliary_loss_mlp": 0.00997949, + "balance_loss_clip": 0.99669737, + "balance_loss_mlp": 1.00585961, + "epoch": 0.6732000601232526, + "flos": 64743708723840.0, + "grad_norm": 0.7827982838834083, + "language_loss": 0.56530619, + "learning_rate": 1.0193995207866123e-06, + "loss": 0.58554828, + "num_input_tokens_seen": 241692380, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.20410156, + "step": 11197, + "time_per_iteration": 2.9888203144073486 + }, + { + "auxiliary_loss_clip": 0.01103429, + "auxiliary_loss_mlp": 0.01023702, + "balance_loss_clip": 1.01289546, + "balance_loss_mlp": 1.03899539, + "epoch": 0.6732601833759206, + "flos": 17201642762880.0, + "grad_norm": 2.0080706986846635, + "language_loss": 0.75471473, + "learning_rate": 1.0190601026510312e-06, + "loss": 0.77598602, + "num_input_tokens_seen": 241710430, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 11198, + "time_per_iteration": 2.4266445636749268 + }, + { + "auxiliary_loss_clip": 0.01103371, + "auxiliary_loss_mlp": 0.01026973, + "balance_loss_clip": 1.01493251, + "balance_loss_mlp": 1.03564501, + "epoch": 0.6733203066285887, + "flos": 18658861620480.0, + "grad_norm": 2.2277183364076674, + "language_loss": 0.8092168, + "learning_rate": 1.0187207217127892e-06, + "loss": 0.83052027, + "num_input_tokens_seen": 241724775, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 11199, + "time_per_iteration": 2.4250686168670654 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01032091, + "balance_loss_clip": 1.01972258, + "balance_loss_mlp": 1.03520989, + "epoch": 0.6733804298812566, + "flos": 35809330481280.0, + "grad_norm": 1.7815929608142598, + "language_loss": 0.71828485, + "learning_rate": 1.0183813779847552e-06, + "loss": 0.73964423, + "num_input_tokens_seen": 241744440, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 11200, + "time_per_iteration": 2.555952787399292 + }, + { + "auxiliary_loss_clip": 0.01106738, + "auxiliary_loss_mlp": 0.01032004, + "balance_loss_clip": 1.0200175, + "balance_loss_mlp": 1.03832173, + "epoch": 0.6734405531339246, + "flos": 61638833099520.0, + "grad_norm": 1.625800733182769, + "language_loss": 0.6466803, + "learning_rate": 1.0180420714797987e-06, + "loss": 0.66806769, + "num_input_tokens_seen": 241771705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11201, + "time_per_iteration": 2.8149640560150146 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.0103222, + "balance_loss_clip": 1.01998901, + "balance_loss_mlp": 1.03641796, + "epoch": 0.6735006763865925, + "flos": 20522338277760.0, + "grad_norm": 1.7955061796431357, + "language_loss": 0.63162857, + "learning_rate": 1.0177028022107856e-06, + "loss": 0.65301323, + "num_input_tokens_seen": 241790830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 11202, + "time_per_iteration": 2.437077045440674 + }, + { + "auxiliary_loss_clip": 0.01103951, + "auxiliary_loss_mlp": 0.01026684, + "balance_loss_clip": 1.01568675, + "balance_loss_mlp": 1.03587484, + "epoch": 0.6735607996392605, + "flos": 13918869031680.0, + "grad_norm": 1.8620640282713015, + "language_loss": 0.74766082, + "learning_rate": 1.0173635701905796e-06, + "loss": 0.76896715, + "num_input_tokens_seen": 241808165, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6796875, + "step": 11203, + "time_per_iteration": 2.457798719406128 + }, + { + "auxiliary_loss_clip": 0.01109149, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01710498, + "balance_loss_mlp": 1.037606, + "epoch": 0.6736209228919284, + "flos": 18807244704000.0, + "grad_norm": 1.7246428938805878, + "language_loss": 0.67498362, + "learning_rate": 1.0170243754320456e-06, + "loss": 0.69637865, + "num_input_tokens_seen": 241826925, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 11204, + "time_per_iteration": 2.4272255897521973 + }, + { + "auxiliary_loss_clip": 0.01110127, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.01991844, + "balance_loss_mlp": 1.03929329, + "epoch": 0.6736810461445965, + "flos": 20373129181440.0, + "grad_norm": 1.5939578102801788, + "language_loss": 0.7447291, + "learning_rate": 1.0166852179480465e-06, + "loss": 0.76615399, + "num_input_tokens_seen": 241845525, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 11205, + "time_per_iteration": 2.4560165405273438 + }, + { + "auxiliary_loss_clip": 0.0109994, + "auxiliary_loss_mlp": 0.01030681, + "balance_loss_clip": 1.01932693, + "balance_loss_mlp": 1.03492117, + "epoch": 0.6737411693972644, + "flos": 30007530927360.0, + "grad_norm": 1.6470910861724577, + "language_loss": 0.71854442, + "learning_rate": 1.0163460977514416e-06, + "loss": 0.73985064, + "num_input_tokens_seen": 241866815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 11206, + "time_per_iteration": 2.5040676593780518 + }, + { + "auxiliary_loss_clip": 0.01111631, + "auxiliary_loss_mlp": 0.01032573, + "balance_loss_clip": 1.02033639, + "balance_loss_mlp": 1.03923798, + "epoch": 0.6738012926499324, + "flos": 25447342844160.0, + "grad_norm": 6.529945029855453, + "language_loss": 0.67127562, + "learning_rate": 1.016007014855092e-06, + "loss": 0.69271767, + "num_input_tokens_seen": 241887050, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7265625, + "step": 11207, + "time_per_iteration": 2.5161397457122803 + }, + { + "auxiliary_loss_clip": 0.01102629, + "auxiliary_loss_mlp": 0.01029952, + "balance_loss_clip": 1.0182395, + "balance_loss_mlp": 1.03757155, + "epoch": 0.6738614159026003, + "flos": 20776873029120.0, + "grad_norm": 2.4663080715904675, + "language_loss": 0.73317289, + "learning_rate": 1.0156679692718553e-06, + "loss": 0.75449866, + "num_input_tokens_seen": 241904280, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 11208, + "time_per_iteration": 2.4350569248199463 + }, + { + "auxiliary_loss_clip": 0.01104929, + "auxiliary_loss_mlp": 0.01033002, + "balance_loss_clip": 1.019835, + "balance_loss_mlp": 1.03649032, + "epoch": 0.6739215391552683, + "flos": 19566898462080.0, + "grad_norm": 1.8859944640341983, + "language_loss": 0.75882745, + "learning_rate": 1.0153289610145867e-06, + "loss": 0.78020674, + "num_input_tokens_seen": 241919190, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6875, + "step": 11209, + "time_per_iteration": 2.4393579959869385 + }, + { + "auxiliary_loss_clip": 0.01100693, + "auxiliary_loss_mlp": 0.01029713, + "balance_loss_clip": 1.01881683, + "balance_loss_mlp": 1.03629994, + "epoch": 0.6739816624079362, + "flos": 24388193485440.0, + "grad_norm": 1.6804143170759391, + "language_loss": 0.66519487, + "learning_rate": 1.0149899900961428e-06, + "loss": 0.68649894, + "num_input_tokens_seen": 241940525, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 11210, + "time_per_iteration": 2.4730069637298584 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01027448, + "balance_loss_clip": 1.01682603, + "balance_loss_mlp": 1.03569078, + "epoch": 0.6740417856606042, + "flos": 22528164533760.0, + "grad_norm": 2.1720353274754154, + "language_loss": 0.79894733, + "learning_rate": 1.014651056529377e-06, + "loss": 0.82022631, + "num_input_tokens_seen": 241959290, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11211, + "time_per_iteration": 2.468639850616455 + }, + { + "auxiliary_loss_clip": 0.01101219, + "auxiliary_loss_mlp": 0.01030077, + "balance_loss_clip": 1.01841807, + "balance_loss_mlp": 1.03608012, + "epoch": 0.6741019089132723, + "flos": 25775458606080.0, + "grad_norm": 1.549232637169743, + "language_loss": 0.76512897, + "learning_rate": 1.014312160327143e-06, + "loss": 0.78644192, + "num_input_tokens_seen": 241980715, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 11212, + "time_per_iteration": 2.478450059890747 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01728141, + "balance_loss_mlp": 1.03573346, + "epoch": 0.6741620321659402, + "flos": 21105671149440.0, + "grad_norm": 1.6801358890975542, + "language_loss": 0.77888572, + "learning_rate": 1.0139733015022905e-06, + "loss": 0.80022377, + "num_input_tokens_seen": 241999985, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 11213, + "time_per_iteration": 2.4666621685028076 + }, + { + "auxiliary_loss_clip": 0.01107053, + "auxiliary_loss_mlp": 0.01035384, + "balance_loss_clip": 1.0228132, + "balance_loss_mlp": 1.03760529, + "epoch": 0.6742221554186082, + "flos": 20740423703040.0, + "grad_norm": 2.4257892231901765, + "language_loss": 0.67633986, + "learning_rate": 1.0136344800676685e-06, + "loss": 0.69776428, + "num_input_tokens_seen": 242018990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11214, + "time_per_iteration": 2.452108860015869 + }, + { + "auxiliary_loss_clip": 0.01103571, + "auxiliary_loss_mlp": 0.01033418, + "balance_loss_clip": 1.02189624, + "balance_loss_mlp": 1.03610945, + "epoch": 0.6742822786712761, + "flos": 37774146384000.0, + "grad_norm": 1.6441501997597023, + "language_loss": 0.72691011, + "learning_rate": 1.0132956960361263e-06, + "loss": 0.74828005, + "num_input_tokens_seen": 242039340, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11215, + "time_per_iteration": 2.615023374557495 + }, + { + "auxiliary_loss_clip": 0.01105661, + "auxiliary_loss_mlp": 0.01032878, + "balance_loss_clip": 1.02168989, + "balance_loss_mlp": 1.03667545, + "epoch": 0.6743424019239441, + "flos": 37263891732480.0, + "grad_norm": 3.085424201902257, + "language_loss": 0.67325628, + "learning_rate": 1.0129569494205096e-06, + "loss": 0.69464171, + "num_input_tokens_seen": 242062215, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 11216, + "time_per_iteration": 2.567662477493286 + }, + { + "auxiliary_loss_clip": 0.01026395, + "auxiliary_loss_mlp": 0.01001456, + "balance_loss_clip": 1.00016236, + "balance_loss_mlp": 1.00580978, + "epoch": 0.674402525176612, + "flos": 65997746300160.0, + "grad_norm": 0.6744353438462242, + "language_loss": 0.56309336, + "learning_rate": 1.0126182402336646e-06, + "loss": 0.58337194, + "num_input_tokens_seen": 242131130, + "router_z_loss_clip": 0.01293945, + "router_z_loss_mlp": 0.20605469, + "step": 11217, + "time_per_iteration": 3.1818552017211914 + }, + { + "auxiliary_loss_clip": 0.01102202, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.0162183, + "balance_loss_mlp": 1.0352273, + "epoch": 0.67446264842928, + "flos": 26461208131200.0, + "grad_norm": 1.9712085707776, + "language_loss": 0.74490952, + "learning_rate": 1.0122795684884363e-06, + "loss": 0.76621616, + "num_input_tokens_seen": 242149720, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11218, + "time_per_iteration": 2.4742777347564697 + }, + { + "auxiliary_loss_clip": 0.01105482, + "auxiliary_loss_mlp": 0.01042974, + "balance_loss_clip": 1.02953315, + "balance_loss_mlp": 1.03671169, + "epoch": 0.674522771681948, + "flos": 23732392924800.0, + "grad_norm": 1.6873790300129339, + "language_loss": 0.66097057, + "learning_rate": 1.0119409341976639e-06, + "loss": 0.68245506, + "num_input_tokens_seen": 242168875, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 11219, + "time_per_iteration": 3.9712955951690674 + }, + { + "auxiliary_loss_clip": 0.01105197, + "auxiliary_loss_mlp": 0.0103097, + "balance_loss_clip": 1.01901901, + "balance_loss_mlp": 1.03550935, + "epoch": 0.674582894934616, + "flos": 24754338771840.0, + "grad_norm": 1.9354673669636624, + "language_loss": 0.74431932, + "learning_rate": 1.0116023373741904e-06, + "loss": 0.76568097, + "num_input_tokens_seen": 242188465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 11220, + "time_per_iteration": 2.4782400131225586 + }, + { + "auxiliary_loss_clip": 0.01104541, + "auxiliary_loss_mlp": 0.0103107, + "balance_loss_clip": 1.01871443, + "balance_loss_mlp": 1.03673649, + "epoch": 0.6746430181872839, + "flos": 24826626892800.0, + "grad_norm": 1.753572378422806, + "language_loss": 0.70208532, + "learning_rate": 1.0112637780308554e-06, + "loss": 0.72344136, + "num_input_tokens_seen": 242208675, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 11221, + "time_per_iteration": 3.8499643802642822 + }, + { + "auxiliary_loss_clip": 0.01104329, + "auxiliary_loss_mlp": 0.01027391, + "balance_loss_clip": 1.01634061, + "balance_loss_mlp": 1.03750563, + "epoch": 0.6747031414399519, + "flos": 16873491087360.0, + "grad_norm": 2.083478811055199, + "language_loss": 0.58038485, + "learning_rate": 1.010925256180498e-06, + "loss": 0.60170209, + "num_input_tokens_seen": 242227440, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66796875, + "step": 11222, + "time_per_iteration": 3.796449661254883 + }, + { + "auxiliary_loss_clip": 0.01105674, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02047682, + "balance_loss_mlp": 1.03731191, + "epoch": 0.6747632646926198, + "flos": 22784925928320.0, + "grad_norm": 2.9048479494136266, + "language_loss": 0.76680332, + "learning_rate": 1.0105867718359528e-06, + "loss": 0.7881813, + "num_input_tokens_seen": 242245240, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 11223, + "time_per_iteration": 3.932152271270752 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01032299, + "balance_loss_clip": 1.02009189, + "balance_loss_mlp": 1.03799176, + "epoch": 0.6748233879452878, + "flos": 20046090827520.0, + "grad_norm": 1.7350565617477662, + "language_loss": 0.75261784, + "learning_rate": 1.0102483250100574e-06, + "loss": 0.77401286, + "num_input_tokens_seen": 242263435, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11224, + "time_per_iteration": 2.4370362758636475 + }, + { + "auxiliary_loss_clip": 0.0109934, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.01388621, + "balance_loss_mlp": 1.03474987, + "epoch": 0.6748835111979558, + "flos": 23002831785600.0, + "grad_norm": 1.6433655631752735, + "language_loss": 0.63031125, + "learning_rate": 1.0099099157156445e-06, + "loss": 0.6515485, + "num_input_tokens_seen": 242282765, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 11225, + "time_per_iteration": 2.472139835357666 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.02051783, + "balance_loss_mlp": 1.03548217, + "epoch": 0.6749436344506238, + "flos": 12197311009920.0, + "grad_norm": 1.7438523279987848, + "language_loss": 0.64443898, + "learning_rate": 1.0095715439655462e-06, + "loss": 0.66574085, + "num_input_tokens_seen": 242298980, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 11226, + "time_per_iteration": 2.3997251987457275 + }, + { + "auxiliary_loss_clip": 0.01106856, + "auxiliary_loss_mlp": 0.01032206, + "balance_loss_clip": 1.02009439, + "balance_loss_mlp": 1.03833103, + "epoch": 0.6750037577032918, + "flos": 11873720361600.0, + "grad_norm": 2.0520128582030406, + "language_loss": 0.71177256, + "learning_rate": 1.0092332097725945e-06, + "loss": 0.73316324, + "num_input_tokens_seen": 242315420, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11227, + "time_per_iteration": 2.4354188442230225 + }, + { + "auxiliary_loss_clip": 0.01102719, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01566148, + "balance_loss_mlp": 1.03702497, + "epoch": 0.6750638809559597, + "flos": 17019611614080.0, + "grad_norm": 1.9773279438432965, + "language_loss": 0.7113992, + "learning_rate": 1.0088949131496183e-06, + "loss": 0.73270661, + "num_input_tokens_seen": 242332805, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 11228, + "time_per_iteration": 2.4065871238708496 + }, + { + "auxiliary_loss_clip": 0.01026271, + "auxiliary_loss_mlp": 0.01000743, + "balance_loss_clip": 0.99950963, + "balance_loss_mlp": 1.00561559, + "epoch": 0.6751240042086277, + "flos": 70951011891840.0, + "grad_norm": 0.7600669046292114, + "language_loss": 0.53283465, + "learning_rate": 1.0085566541094482e-06, + "loss": 0.55310482, + "num_input_tokens_seen": 242396160, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20703125, + "step": 11229, + "time_per_iteration": 3.113936424255371 + }, + { + "auxiliary_loss_clip": 0.01102392, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01778507, + "balance_loss_mlp": 1.03599358, + "epoch": 0.6751841274612956, + "flos": 22675146986880.0, + "grad_norm": 1.668368696112623, + "language_loss": 0.80301458, + "learning_rate": 1.0082184326649072e-06, + "loss": 0.82433373, + "num_input_tokens_seen": 242414660, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11230, + "time_per_iteration": 2.481586456298828 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01025242, + "balance_loss_clip": 1.01433396, + "balance_loss_mlp": 1.03644145, + "epoch": 0.6752442507139637, + "flos": 21288636051840.0, + "grad_norm": 1.5254643295267571, + "language_loss": 0.66080362, + "learning_rate": 1.0078802488288228e-06, + "loss": 0.68207115, + "num_input_tokens_seen": 242434225, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11231, + "time_per_iteration": 2.4348020553588867 + }, + { + "auxiliary_loss_clip": 0.01109126, + "auxiliary_loss_mlp": 0.01036053, + "balance_loss_clip": 1.02254665, + "balance_loss_mlp": 1.03815401, + "epoch": 0.6753043739666316, + "flos": 28256921781120.0, + "grad_norm": 1.8895861738799862, + "language_loss": 0.66976327, + "learning_rate": 1.0075421026140198e-06, + "loss": 0.69121504, + "num_input_tokens_seen": 242454355, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 11232, + "time_per_iteration": 2.565011501312256 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01529849, + "balance_loss_mlp": 1.03667426, + "epoch": 0.6753644972192996, + "flos": 21360349555200.0, + "grad_norm": 1.7997945281360064, + "language_loss": 0.72617656, + "learning_rate": 1.0072039940333188e-06, + "loss": 0.74746865, + "num_input_tokens_seen": 242474935, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 11233, + "time_per_iteration": 2.451127767562866 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01032225, + "balance_loss_clip": 1.01986313, + "balance_loss_mlp": 1.035604, + "epoch": 0.6754246204719675, + "flos": 26541971861760.0, + "grad_norm": 1.728016441920487, + "language_loss": 0.76981372, + "learning_rate": 1.0068659230995418e-06, + "loss": 0.79116529, + "num_input_tokens_seen": 242495530, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 11234, + "time_per_iteration": 2.560873031616211 + }, + { + "auxiliary_loss_clip": 0.01103068, + "auxiliary_loss_mlp": 0.01028923, + "balance_loss_clip": 1.01688838, + "balance_loss_mlp": 1.03655529, + "epoch": 0.6754847437246355, + "flos": 25556690822400.0, + "grad_norm": 1.5233618386668848, + "language_loss": 0.7516101, + "learning_rate": 1.0065278898255101e-06, + "loss": 0.77292997, + "num_input_tokens_seen": 242514550, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11235, + "time_per_iteration": 2.473658323287964 + }, + { + "auxiliary_loss_clip": 0.01025939, + "auxiliary_loss_mlp": 0.00999916, + "balance_loss_clip": 0.99873585, + "balance_loss_mlp": 1.00544596, + "epoch": 0.6755448669773034, + "flos": 59513318726400.0, + "grad_norm": 0.7849137447209698, + "language_loss": 0.51408035, + "learning_rate": 1.0061898942240387e-06, + "loss": 0.53433889, + "num_input_tokens_seen": 242569200, + "router_z_loss_clip": 0.01177979, + "router_z_loss_mlp": 0.20507812, + "step": 11236, + "time_per_iteration": 2.993544340133667 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01443923, + "balance_loss_mlp": 1.03596473, + "epoch": 0.6756049902299714, + "flos": 23294534135040.0, + "grad_norm": 2.2221952993281335, + "language_loss": 0.75521564, + "learning_rate": 1.0058519363079464e-06, + "loss": 0.77652001, + "num_input_tokens_seen": 242586950, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6640625, + "step": 11237, + "time_per_iteration": 2.4348740577697754 + }, + { + "auxiliary_loss_clip": 0.01105842, + "auxiliary_loss_mlp": 0.01032869, + "balance_loss_clip": 1.02153206, + "balance_loss_mlp": 1.03944969, + "epoch": 0.6756651134826394, + "flos": 31575426566400.0, + "grad_norm": 2.1736628297595466, + "language_loss": 0.77503932, + "learning_rate": 1.0055140160900482e-06, + "loss": 0.79642648, + "num_input_tokens_seen": 242607380, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11238, + "time_per_iteration": 2.526988983154297 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.0188483, + "balance_loss_mlp": 1.03556848, + "epoch": 0.6757252367353074, + "flos": 27272287186560.0, + "grad_norm": 1.9142971498049255, + "language_loss": 0.66731274, + "learning_rate": 1.0051761335831587e-06, + "loss": 0.68868375, + "num_input_tokens_seen": 242628025, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 11239, + "time_per_iteration": 2.4696223735809326 + }, + { + "auxiliary_loss_clip": 0.0110246, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01447082, + "balance_loss_mlp": 1.03745294, + "epoch": 0.6757853599879754, + "flos": 16830900535680.0, + "grad_norm": 2.923743651844225, + "language_loss": 0.82995439, + "learning_rate": 1.0048382888000898e-06, + "loss": 0.85124326, + "num_input_tokens_seen": 242643825, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 11240, + "time_per_iteration": 2.446572780609131 + }, + { + "auxiliary_loss_clip": 0.01111011, + "auxiliary_loss_mlp": 0.01033506, + "balance_loss_clip": 1.01959419, + "balance_loss_mlp": 1.03869963, + "epoch": 0.6758454832406433, + "flos": 23220055284480.0, + "grad_norm": 3.6442496224808933, + "language_loss": 0.74812031, + "learning_rate": 1.0045004817536525e-06, + "loss": 0.76956552, + "num_input_tokens_seen": 242661820, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.72265625, + "step": 11241, + "time_per_iteration": 2.423372268676758 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01032061, + "balance_loss_clip": 1.02011645, + "balance_loss_mlp": 1.03697479, + "epoch": 0.6759056064933113, + "flos": 16289547684480.0, + "grad_norm": 2.14563763168323, + "language_loss": 0.80052149, + "learning_rate": 1.0041627124566572e-06, + "loss": 0.82188863, + "num_input_tokens_seen": 242679890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11242, + "time_per_iteration": 2.434990167617798 + }, + { + "auxiliary_loss_clip": 0.01101563, + "auxiliary_loss_mlp": 0.01028144, + "balance_loss_clip": 1.01693797, + "balance_loss_mlp": 1.034922, + "epoch": 0.6759657297459792, + "flos": 25922297404800.0, + "grad_norm": 1.9802154508142344, + "language_loss": 0.72626722, + "learning_rate": 1.0038249809219109e-06, + "loss": 0.74756432, + "num_input_tokens_seen": 242699495, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 11243, + "time_per_iteration": 2.453474283218384 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02346909, + "balance_loss_mlp": 1.03676426, + "epoch": 0.6760258529986473, + "flos": 23000820624000.0, + "grad_norm": 1.7073695655292809, + "language_loss": 0.72612441, + "learning_rate": 1.003487287162221e-06, + "loss": 0.74750745, + "num_input_tokens_seen": 242719500, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11244, + "time_per_iteration": 2.5105230808258057 + }, + { + "auxiliary_loss_clip": 0.01106398, + "auxiliary_loss_mlp": 0.01038693, + "balance_loss_clip": 1.02668309, + "balance_loss_mlp": 1.03746104, + "epoch": 0.6760859762513152, + "flos": 20959335141120.0, + "grad_norm": 1.8087707557146027, + "language_loss": 0.85335118, + "learning_rate": 1.003149631190393e-06, + "loss": 0.87480211, + "num_input_tokens_seen": 242738325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 11245, + "time_per_iteration": 2.445233106613159 + }, + { + "auxiliary_loss_clip": 0.01108481, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02013278, + "balance_loss_mlp": 1.03733289, + "epoch": 0.6761460995039832, + "flos": 23622937205760.0, + "grad_norm": 2.3183444790940766, + "language_loss": 0.73646373, + "learning_rate": 1.0028120130192327e-06, + "loss": 0.75787258, + "num_input_tokens_seen": 242756620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.7109375, + "step": 11246, + "time_per_iteration": 2.4863364696502686 + }, + { + "auxiliary_loss_clip": 0.01102215, + "auxiliary_loss_mlp": 0.01025917, + "balance_loss_clip": 1.01430011, + "balance_loss_mlp": 1.0346514, + "epoch": 0.6762062227566511, + "flos": 20770875457920.0, + "grad_norm": 2.2448543978250437, + "language_loss": 0.88085318, + "learning_rate": 1.002474432661539e-06, + "loss": 0.90213448, + "num_input_tokens_seen": 242774505, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11247, + "time_per_iteration": 2.4308738708496094 + }, + { + "auxiliary_loss_clip": 0.01025674, + "auxiliary_loss_mlp": 0.01003402, + "balance_loss_clip": 1.00222802, + "balance_loss_mlp": 1.0053699, + "epoch": 0.6762663460093191, + "flos": 52818099166080.0, + "grad_norm": 0.8266217559963673, + "language_loss": 0.54048848, + "learning_rate": 1.002136890130115e-06, + "loss": 0.56077927, + "num_input_tokens_seen": 242828645, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.203125, + "step": 11248, + "time_per_iteration": 3.076478958129883 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01029161, + "balance_loss_clip": 1.01805067, + "balance_loss_mlp": 1.03580928, + "epoch": 0.676326469261987, + "flos": 23696302734720.0, + "grad_norm": 1.69579760819699, + "language_loss": 0.73396099, + "learning_rate": 1.001799385437761e-06, + "loss": 0.75523973, + "num_input_tokens_seen": 242850100, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 11249, + "time_per_iteration": 2.47476863861084 + }, + { + "auxiliary_loss_clip": 0.01103589, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.01811373, + "balance_loss_mlp": 1.03449488, + "epoch": 0.676386592514655, + "flos": 14063732582400.0, + "grad_norm": 2.3334311767034035, + "language_loss": 0.73674285, + "learning_rate": 1.0014619185972732e-06, + "loss": 0.75808907, + "num_input_tokens_seen": 242867775, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 11250, + "time_per_iteration": 2.427795171737671 + }, + { + "auxiliary_loss_clip": 0.01104705, + "auxiliary_loss_mlp": 0.01024882, + "balance_loss_clip": 1.01378322, + "balance_loss_mlp": 1.03724456, + "epoch": 0.676446715767323, + "flos": 20412236113920.0, + "grad_norm": 1.7440150220700932, + "language_loss": 0.75326031, + "learning_rate": 1.0011244896214497e-06, + "loss": 0.77455616, + "num_input_tokens_seen": 242886865, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.67578125, + "step": 11251, + "time_per_iteration": 2.453015089035034 + }, + { + "auxiliary_loss_clip": 0.01103045, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01890135, + "balance_loss_mlp": 1.03677213, + "epoch": 0.676506839019991, + "flos": 21288241002240.0, + "grad_norm": 4.794996819995717, + "language_loss": 0.7030319, + "learning_rate": 1.0007870985230873e-06, + "loss": 0.7243697, + "num_input_tokens_seen": 242906705, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 11252, + "time_per_iteration": 2.507655382156372 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.01872027, + "balance_loss_mlp": 1.03790915, + "epoch": 0.676566962272659, + "flos": 29932477459200.0, + "grad_norm": 1.7295296864842329, + "language_loss": 0.66713816, + "learning_rate": 1.0004497453149765e-06, + "loss": 0.68848813, + "num_input_tokens_seen": 242925215, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 11253, + "time_per_iteration": 2.495661735534668 + }, + { + "auxiliary_loss_clip": 0.01106169, + "auxiliary_loss_mlp": 0.01034304, + "balance_loss_clip": 1.02159083, + "balance_loss_mlp": 1.03755689, + "epoch": 0.6766270855253269, + "flos": 17931203902080.0, + "grad_norm": 1.5712995070705533, + "language_loss": 0.77059627, + "learning_rate": 1.0001124300099115e-06, + "loss": 0.79200101, + "num_input_tokens_seen": 242944750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11254, + "time_per_iteration": 2.5303773880004883 + }, + { + "auxiliary_loss_clip": 0.01103059, + "auxiliary_loss_mlp": 0.0103136, + "balance_loss_clip": 1.01923668, + "balance_loss_mlp": 1.0353651, + "epoch": 0.6766872087779949, + "flos": 23104853389440.0, + "grad_norm": 2.008694221276799, + "language_loss": 0.72041488, + "learning_rate": 9.997751526206835e-07, + "loss": 0.74175906, + "num_input_tokens_seen": 242963860, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11255, + "time_per_iteration": 2.4310834407806396 + }, + { + "auxiliary_loss_clip": 0.01103491, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.02294099, + "balance_loss_mlp": 1.03527474, + "epoch": 0.6767473320306628, + "flos": 26213137827840.0, + "grad_norm": 2.4309429012787533, + "language_loss": 0.75107753, + "learning_rate": 9.994379131600828e-07, + "loss": 0.77246231, + "num_input_tokens_seen": 242983050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 11256, + "time_per_iteration": 2.5040993690490723 + }, + { + "auxiliary_loss_clip": 0.01105082, + "auxiliary_loss_mlp": 0.01030951, + "balance_loss_clip": 1.01898217, + "balance_loss_mlp": 1.03802788, + "epoch": 0.6768074552833309, + "flos": 18368739469440.0, + "grad_norm": 2.256626523492283, + "language_loss": 0.64639592, + "learning_rate": 9.991007116408965e-07, + "loss": 0.6677562, + "num_input_tokens_seen": 243001125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11257, + "time_per_iteration": 2.4259557723999023 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01028121, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.03422582, + "epoch": 0.6768675785359988, + "flos": 23039927556480.0, + "grad_norm": 1.4043820681784667, + "language_loss": 0.75555968, + "learning_rate": 9.987635480759109e-07, + "loss": 0.77681983, + "num_input_tokens_seen": 243021865, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 11258, + "time_per_iteration": 2.4665939807891846 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01028723, + "balance_loss_clip": 1.01757717, + "balance_loss_mlp": 1.03654146, + "epoch": 0.6769277017886668, + "flos": 33036524092800.0, + "grad_norm": 1.6503685315767886, + "language_loss": 0.66716135, + "learning_rate": 9.984264224779127e-07, + "loss": 0.68845475, + "num_input_tokens_seen": 243042970, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 11259, + "time_per_iteration": 2.527073383331299 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01027196, + "balance_loss_clip": 1.0155077, + "balance_loss_mlp": 1.03676665, + "epoch": 0.6769878250413347, + "flos": 20848406964480.0, + "grad_norm": 2.9058137848386902, + "language_loss": 0.85316312, + "learning_rate": 9.980893348596839e-07, + "loss": 0.87447935, + "num_input_tokens_seen": 243058470, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11260, + "time_per_iteration": 2.457331418991089 + }, + { + "auxiliary_loss_clip": 0.01104097, + "auxiliary_loss_mlp": 0.01032449, + "balance_loss_clip": 1.02011061, + "balance_loss_mlp": 1.03481388, + "epoch": 0.6770479482940027, + "flos": 15595968994560.0, + "grad_norm": 1.992894296567027, + "language_loss": 0.77366221, + "learning_rate": 9.977522852340081e-07, + "loss": 0.79502773, + "num_input_tokens_seen": 243076630, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 11261, + "time_per_iteration": 3.8098442554473877 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01033066, + "balance_loss_clip": 1.0210495, + "balance_loss_mlp": 1.03392744, + "epoch": 0.6771080715466706, + "flos": 18621011664000.0, + "grad_norm": 1.8771294723417649, + "language_loss": 0.87785065, + "learning_rate": 9.97415273613666e-07, + "loss": 0.89920282, + "num_input_tokens_seen": 243092260, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11262, + "time_per_iteration": 2.4098682403564453 + }, + { + "auxiliary_loss_clip": 0.0110654, + "auxiliary_loss_mlp": 0.0102957, + "balance_loss_clip": 1.01738644, + "balance_loss_mlp": 1.03773284, + "epoch": 0.6771681947993387, + "flos": 12495441893760.0, + "grad_norm": 2.144843606745404, + "language_loss": 0.73935968, + "learning_rate": 9.97078300011439e-07, + "loss": 0.76072079, + "num_input_tokens_seen": 243109405, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11263, + "time_per_iteration": 3.836534261703491 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01033538, + "balance_loss_clip": 1.020401, + "balance_loss_mlp": 1.03613746, + "epoch": 0.6772283180520066, + "flos": 22236964974720.0, + "grad_norm": 3.0336865802259716, + "language_loss": 0.67681348, + "learning_rate": 9.967413644401016e-07, + "loss": 0.6982075, + "num_input_tokens_seen": 243128135, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11264, + "time_per_iteration": 3.8063998222351074 + }, + { + "auxiliary_loss_clip": 0.01104469, + "auxiliary_loss_mlp": 0.01036584, + "balance_loss_clip": 1.02429914, + "balance_loss_mlp": 1.03774631, + "epoch": 0.6772884413046746, + "flos": 16143139848960.0, + "grad_norm": 1.9309030757319006, + "language_loss": 0.72956276, + "learning_rate": 9.964044669124324e-07, + "loss": 0.75097328, + "num_input_tokens_seen": 243146785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 11265, + "time_per_iteration": 3.905475616455078 + }, + { + "auxiliary_loss_clip": 0.01101535, + "auxiliary_loss_mlp": 0.01031399, + "balance_loss_clip": 1.02002645, + "balance_loss_mlp": 1.03592122, + "epoch": 0.6773485645573426, + "flos": 19135755515520.0, + "grad_norm": 1.5488311970116568, + "language_loss": 0.61298478, + "learning_rate": 9.96067607441207e-07, + "loss": 0.63431406, + "num_input_tokens_seen": 243165275, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11266, + "time_per_iteration": 2.4533629417419434 + }, + { + "auxiliary_loss_clip": 0.01105454, + "auxiliary_loss_mlp": 0.01035839, + "balance_loss_clip": 1.02384639, + "balance_loss_mlp": 1.03653467, + "epoch": 0.6774086878100105, + "flos": 14136918543360.0, + "grad_norm": 2.0018325327863455, + "language_loss": 0.70975608, + "learning_rate": 9.957307860391976e-07, + "loss": 0.73116899, + "num_input_tokens_seen": 243182845, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11267, + "time_per_iteration": 2.4130048751831055 + }, + { + "auxiliary_loss_clip": 0.01102815, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01627374, + "balance_loss_mlp": 1.03553224, + "epoch": 0.6774688110626785, + "flos": 22197067943040.0, + "grad_norm": 1.995940802920633, + "language_loss": 0.71196496, + "learning_rate": 9.953940027191785e-07, + "loss": 0.73327303, + "num_input_tokens_seen": 243201475, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11268, + "time_per_iteration": 2.5001561641693115 + }, + { + "auxiliary_loss_clip": 0.01106446, + "auxiliary_loss_mlp": 0.01028349, + "balance_loss_clip": 1.01621413, + "balance_loss_mlp": 1.03911674, + "epoch": 0.6775289343153464, + "flos": 23039963470080.0, + "grad_norm": 1.4505290648176117, + "language_loss": 0.76658797, + "learning_rate": 9.950572574939194e-07, + "loss": 0.78793591, + "num_input_tokens_seen": 243221850, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11269, + "time_per_iteration": 2.450594902038574 + }, + { + "auxiliary_loss_clip": 0.0110441, + "auxiliary_loss_mlp": 0.01033402, + "balance_loss_clip": 1.02046824, + "balance_loss_mlp": 1.03552103, + "epoch": 0.6775890575680145, + "flos": 18293506433280.0, + "grad_norm": 1.9037033189032353, + "language_loss": 0.74434447, + "learning_rate": 9.94720550376189e-07, + "loss": 0.76572257, + "num_input_tokens_seen": 243239855, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 11270, + "time_per_iteration": 2.4480292797088623 + }, + { + "auxiliary_loss_clip": 0.01105285, + "auxiliary_loss_mlp": 0.01037183, + "balance_loss_clip": 1.02421904, + "balance_loss_mlp": 1.03799176, + "epoch": 0.6776491808206824, + "flos": 25336450581120.0, + "grad_norm": 1.765961836580733, + "language_loss": 0.72747099, + "learning_rate": 9.94383881378756e-07, + "loss": 0.74889576, + "num_input_tokens_seen": 243260085, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 11271, + "time_per_iteration": 2.466099739074707 + }, + { + "auxiliary_loss_clip": 0.01104325, + "auxiliary_loss_mlp": 0.01033539, + "balance_loss_clip": 1.02158785, + "balance_loss_mlp": 1.0367682, + "epoch": 0.6777093040733504, + "flos": 26028233591040.0, + "grad_norm": 1.5327741783409103, + "language_loss": 0.67725623, + "learning_rate": 9.94047250514387e-07, + "loss": 0.69863486, + "num_input_tokens_seen": 243280065, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 11272, + "time_per_iteration": 2.506606340408325 + }, + { + "auxiliary_loss_clip": 0.01107233, + "auxiliary_loss_mlp": 0.01034611, + "balance_loss_clip": 1.02115774, + "balance_loss_mlp": 1.03756714, + "epoch": 0.6777694273260183, + "flos": 18003599763840.0, + "grad_norm": 2.19334323210367, + "language_loss": 0.73699766, + "learning_rate": 9.937106577958481e-07, + "loss": 0.75841612, + "num_input_tokens_seen": 243297775, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 11273, + "time_per_iteration": 2.40608286857605 + }, + { + "auxiliary_loss_clip": 0.01101569, + "auxiliary_loss_mlp": 0.01036562, + "balance_loss_clip": 1.02462888, + "balance_loss_mlp": 1.03617656, + "epoch": 0.6778295505786863, + "flos": 23441085624960.0, + "grad_norm": 2.20814425061036, + "language_loss": 0.70081609, + "learning_rate": 9.933741032359015e-07, + "loss": 0.72219741, + "num_input_tokens_seen": 243315760, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 11274, + "time_per_iteration": 2.476304769515991 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.01921475, + "balance_loss_mlp": 1.03662062, + "epoch": 0.6778896738313542, + "flos": 19098408349440.0, + "grad_norm": 1.6447665363620352, + "language_loss": 0.65597254, + "learning_rate": 9.930375868473093e-07, + "loss": 0.67733622, + "num_input_tokens_seen": 243335715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 11275, + "time_per_iteration": 2.4458420276641846 + }, + { + "auxiliary_loss_clip": 0.01103666, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.02177751, + "balance_loss_mlp": 1.03688347, + "epoch": 0.6779497970840223, + "flos": 26103933504000.0, + "grad_norm": 2.26567322463042, + "language_loss": 0.72724402, + "learning_rate": 9.927011086428335e-07, + "loss": 0.74860573, + "num_input_tokens_seen": 243356935, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 11276, + "time_per_iteration": 2.506394624710083 + }, + { + "auxiliary_loss_clip": 0.01103474, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01724589, + "balance_loss_mlp": 1.03681684, + "epoch": 0.6780099203366902, + "flos": 19719232041600.0, + "grad_norm": 1.7387203972635623, + "language_loss": 0.76835978, + "learning_rate": 9.923646686352317e-07, + "loss": 0.78968847, + "num_input_tokens_seen": 243375625, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 11277, + "time_per_iteration": 2.4156947135925293 + }, + { + "auxiliary_loss_clip": 0.01106329, + "auxiliary_loss_mlp": 0.01027599, + "balance_loss_clip": 1.01580894, + "balance_loss_mlp": 1.03709924, + "epoch": 0.6780700435893582, + "flos": 18214538382720.0, + "grad_norm": 3.843343867942956, + "language_loss": 0.83494425, + "learning_rate": 9.920282668372627e-07, + "loss": 0.85628355, + "num_input_tokens_seen": 243390195, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 11278, + "time_per_iteration": 2.4242331981658936 + }, + { + "auxiliary_loss_clip": 0.01100898, + "auxiliary_loss_mlp": 0.01030557, + "balance_loss_clip": 1.01966131, + "balance_loss_mlp": 1.03655803, + "epoch": 0.6781301668420262, + "flos": 25376239872000.0, + "grad_norm": 1.546828654628467, + "language_loss": 0.70229775, + "learning_rate": 9.916919032616844e-07, + "loss": 0.72361231, + "num_input_tokens_seen": 243411690, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 11279, + "time_per_iteration": 2.4774818420410156 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01030453, + "balance_loss_clip": 1.01785898, + "balance_loss_mlp": 1.03606427, + "epoch": 0.6781902900946941, + "flos": 24020432087040.0, + "grad_norm": 1.8996542277217034, + "language_loss": 0.74191052, + "learning_rate": 9.913555779212485e-07, + "loss": 0.76324993, + "num_input_tokens_seen": 243430280, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 11280, + "time_per_iteration": 2.4954020977020264 + }, + { + "auxiliary_loss_clip": 0.01106782, + "auxiliary_loss_mlp": 0.01029984, + "balance_loss_clip": 1.01768732, + "balance_loss_mlp": 1.03710222, + "epoch": 0.6782504133473621, + "flos": 19646764352640.0, + "grad_norm": 1.8728658209175957, + "language_loss": 0.70118409, + "learning_rate": 9.910192908287104e-07, + "loss": 0.7225517, + "num_input_tokens_seen": 243448690, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 11281, + "time_per_iteration": 2.4171640872955322 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01025939, + "balance_loss_clip": 1.01519203, + "balance_loss_mlp": 1.03611064, + "epoch": 0.67831053660003, + "flos": 24932742647040.0, + "grad_norm": 1.563642265820809, + "language_loss": 0.63874096, + "learning_rate": 9.906830419968217e-07, + "loss": 0.66000628, + "num_input_tokens_seen": 243470695, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11282, + "time_per_iteration": 2.5364012718200684 + }, + { + "auxiliary_loss_clip": 0.0110743, + "auxiliary_loss_mlp": 0.01036912, + "balance_loss_clip": 1.02427554, + "balance_loss_mlp": 1.03683639, + "epoch": 0.6783706598526981, + "flos": 31208383440000.0, + "grad_norm": 1.5622929992593626, + "language_loss": 0.74648255, + "learning_rate": 9.90346831438334e-07, + "loss": 0.76792598, + "num_input_tokens_seen": 243493345, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 11283, + "time_per_iteration": 2.5009424686431885 + }, + { + "auxiliary_loss_clip": 0.01101134, + "auxiliary_loss_mlp": 0.01027647, + "balance_loss_clip": 1.01622117, + "balance_loss_mlp": 1.03523421, + "epoch": 0.678430783105366, + "flos": 35441317687680.0, + "grad_norm": 1.6182405596102953, + "language_loss": 0.5701533, + "learning_rate": 9.900106591659948e-07, + "loss": 0.59144115, + "num_input_tokens_seen": 243515670, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11284, + "time_per_iteration": 2.5896449089050293 + }, + { + "auxiliary_loss_clip": 0.01101588, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01796126, + "balance_loss_mlp": 1.03485477, + "epoch": 0.678490906358034, + "flos": 14428800460800.0, + "grad_norm": 2.4677100655448485, + "language_loss": 0.75404185, + "learning_rate": 9.896745251925535e-07, + "loss": 0.77535391, + "num_input_tokens_seen": 243533625, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 11285, + "time_per_iteration": 2.53873872756958 + }, + { + "auxiliary_loss_clip": 0.01102067, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01661038, + "balance_loss_mlp": 1.03747129, + "epoch": 0.6785510296107019, + "flos": 24311236596480.0, + "grad_norm": 1.8021221276720163, + "language_loss": 0.66290027, + "learning_rate": 9.893384295307557e-07, + "loss": 0.68420148, + "num_input_tokens_seen": 243553040, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 11286, + "time_per_iteration": 2.498288631439209 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01029185, + "balance_loss_clip": 1.01754975, + "balance_loss_mlp": 1.03434348, + "epoch": 0.6786111528633699, + "flos": 26977244872320.0, + "grad_norm": 2.2344222526167083, + "language_loss": 0.52489305, + "learning_rate": 9.890023721933447e-07, + "loss": 0.54620832, + "num_input_tokens_seen": 243572590, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 11287, + "time_per_iteration": 2.470860719680786 + }, + { + "auxiliary_loss_clip": 0.01102428, + "auxiliary_loss_mlp": 0.01029695, + "balance_loss_clip": 1.01842999, + "balance_loss_mlp": 1.0358603, + "epoch": 0.6786712761160378, + "flos": 24317557390080.0, + "grad_norm": 2.2748309661133086, + "language_loss": 0.77437216, + "learning_rate": 9.886663531930655e-07, + "loss": 0.7956934, + "num_input_tokens_seen": 243594140, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11288, + "time_per_iteration": 2.507276773452759 + }, + { + "auxiliary_loss_clip": 0.01105773, + "auxiliary_loss_mlp": 0.01034994, + "balance_loss_clip": 1.02353239, + "balance_loss_mlp": 1.03752971, + "epoch": 0.6787313993687059, + "flos": 22930435923840.0, + "grad_norm": 1.9600358072539563, + "language_loss": 0.73192465, + "learning_rate": 9.883303725426593e-07, + "loss": 0.75333238, + "num_input_tokens_seen": 243615170, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 11289, + "time_per_iteration": 2.466587781906128 + }, + { + "auxiliary_loss_clip": 0.01103364, + "auxiliary_loss_mlp": 0.01035376, + "balance_loss_clip": 1.02300215, + "balance_loss_mlp": 1.0357126, + "epoch": 0.6787915226213738, + "flos": 26868435598080.0, + "grad_norm": 1.567844133932764, + "language_loss": 0.80266666, + "learning_rate": 9.879944302548682e-07, + "loss": 0.82405412, + "num_input_tokens_seen": 243635675, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 11290, + "time_per_iteration": 2.5057084560394287 + }, + { + "auxiliary_loss_clip": 0.01100237, + "auxiliary_loss_mlp": 0.01027997, + "balance_loss_clip": 1.01677918, + "balance_loss_mlp": 1.03600717, + "epoch": 0.6788516458740418, + "flos": 20008851402240.0, + "grad_norm": 2.2351562454410034, + "language_loss": 0.75014412, + "learning_rate": 9.87658526342428e-07, + "loss": 0.77142644, + "num_input_tokens_seen": 243654950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11291, + "time_per_iteration": 2.4530417919158936 + }, + { + "auxiliary_loss_clip": 0.01105979, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.02219784, + "balance_loss_mlp": 1.03691578, + "epoch": 0.6789117691267098, + "flos": 28727099832960.0, + "grad_norm": 1.8299710869537638, + "language_loss": 0.75613016, + "learning_rate": 9.873226608180785e-07, + "loss": 0.77753186, + "num_input_tokens_seen": 243674970, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11292, + "time_per_iteration": 2.560930013656616 + }, + { + "auxiliary_loss_clip": 0.01103978, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01666081, + "balance_loss_mlp": 1.03636706, + "epoch": 0.6789718923793777, + "flos": 23403451150080.0, + "grad_norm": 1.9135755383501691, + "language_loss": 0.83619392, + "learning_rate": 9.869868336945556e-07, + "loss": 0.85752094, + "num_input_tokens_seen": 243693440, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11293, + "time_per_iteration": 2.442145824432373 + }, + { + "auxiliary_loss_clip": 0.01111617, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.02419984, + "balance_loss_mlp": 1.03933525, + "epoch": 0.6790320156320457, + "flos": 20448865008000.0, + "grad_norm": 2.319599838777995, + "language_loss": 0.79377204, + "learning_rate": 9.866510449845929e-07, + "loss": 0.81526375, + "num_input_tokens_seen": 243710055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.72265625, + "step": 11294, + "time_per_iteration": 2.487916946411133 + }, + { + "auxiliary_loss_clip": 0.0110334, + "auxiliary_loss_mlp": 0.01027757, + "balance_loss_clip": 1.0165689, + "balance_loss_mlp": 1.0358336, + "epoch": 0.6790921388847136, + "flos": 24167199058560.0, + "grad_norm": 1.670516322497649, + "language_loss": 0.79154253, + "learning_rate": 9.86315294700924e-07, + "loss": 0.81285346, + "num_input_tokens_seen": 243728635, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 11295, + "time_per_iteration": 2.466892957687378 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.01028607, + "balance_loss_clip": 1.01862347, + "balance_loss_mlp": 1.03505814, + "epoch": 0.6791522621373817, + "flos": 21908095027200.0, + "grad_norm": 1.698673678539366, + "language_loss": 0.71407616, + "learning_rate": 9.859795828562823e-07, + "loss": 0.73535442, + "num_input_tokens_seen": 243748330, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.640625, + "step": 11296, + "time_per_iteration": 2.482555866241455 + }, + { + "auxiliary_loss_clip": 0.01101606, + "auxiliary_loss_mlp": 0.0102908, + "balance_loss_clip": 1.01736212, + "balance_loss_mlp": 1.03510296, + "epoch": 0.6792123853900496, + "flos": 24826519152000.0, + "grad_norm": 1.753920624789111, + "language_loss": 0.70683616, + "learning_rate": 9.856439094633949e-07, + "loss": 0.72814304, + "num_input_tokens_seen": 243769380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11297, + "time_per_iteration": 2.466238021850586 + }, + { + "auxiliary_loss_clip": 0.01106999, + "auxiliary_loss_mlp": 0.01030897, + "balance_loss_clip": 1.01821899, + "balance_loss_mlp": 1.03667176, + "epoch": 0.6792725086427176, + "flos": 17566279678080.0, + "grad_norm": 2.1069890127028974, + "language_loss": 0.66267467, + "learning_rate": 9.853082745349918e-07, + "loss": 0.6840536, + "num_input_tokens_seen": 243785510, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11298, + "time_per_iteration": 2.424710273742676 + }, + { + "auxiliary_loss_clip": 0.01103908, + "auxiliary_loss_mlp": 0.01029396, + "balance_loss_clip": 1.01871479, + "balance_loss_mlp": 1.03633463, + "epoch": 0.6793326318953855, + "flos": 26941837040640.0, + "grad_norm": 1.7026224144439064, + "language_loss": 0.71526003, + "learning_rate": 9.84972678083801e-07, + "loss": 0.73659307, + "num_input_tokens_seen": 243805545, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.67578125, + "step": 11299, + "time_per_iteration": 2.4778668880462646 + }, + { + "auxiliary_loss_clip": 0.01105656, + "auxiliary_loss_mlp": 0.01032112, + "balance_loss_clip": 1.02016139, + "balance_loss_mlp": 1.03812611, + "epoch": 0.6793927551480535, + "flos": 24318275662080.0, + "grad_norm": 1.4081485921140142, + "language_loss": 0.77155232, + "learning_rate": 9.846371201225488e-07, + "loss": 0.79293001, + "num_input_tokens_seen": 243825185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11300, + "time_per_iteration": 2.492253541946411 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01029717, + "balance_loss_clip": 1.01748598, + "balance_loss_mlp": 1.03599048, + "epoch": 0.6794528784007214, + "flos": 11436615757440.0, + "grad_norm": 1.7968797031135182, + "language_loss": 0.62885916, + "learning_rate": 9.843016006639577e-07, + "loss": 0.65018791, + "num_input_tokens_seen": 243841600, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11301, + "time_per_iteration": 2.397135019302368 + }, + { + "auxiliary_loss_clip": 0.01102494, + "auxiliary_loss_mlp": 0.0102808, + "balance_loss_clip": 1.01690459, + "balance_loss_mlp": 1.03594089, + "epoch": 0.6795130016533895, + "flos": 25229688382080.0, + "grad_norm": 1.724284284453245, + "language_loss": 0.82755935, + "learning_rate": 9.839661197207525e-07, + "loss": 0.84886515, + "num_input_tokens_seen": 243862250, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11302, + "time_per_iteration": 2.472766399383545 + }, + { + "auxiliary_loss_clip": 0.01106208, + "auxiliary_loss_mlp": 0.01031247, + "balance_loss_clip": 1.01926029, + "balance_loss_mlp": 1.03716099, + "epoch": 0.6795731249060574, + "flos": 18296415434880.0, + "grad_norm": 2.1762222349963176, + "language_loss": 0.69784915, + "learning_rate": 9.83630677305654e-07, + "loss": 0.71922374, + "num_input_tokens_seen": 243880560, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.69140625, + "step": 11303, + "time_per_iteration": 3.805736780166626 + }, + { + "auxiliary_loss_clip": 0.0110718, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01801562, + "balance_loss_mlp": 1.03717601, + "epoch": 0.6796332481587254, + "flos": 20300374183680.0, + "grad_norm": 2.3868097803445383, + "language_loss": 0.69926792, + "learning_rate": 9.832952734313813e-07, + "loss": 0.72063893, + "num_input_tokens_seen": 243900635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.703125, + "step": 11304, + "time_per_iteration": 2.4878110885620117 + }, + { + "auxiliary_loss_clip": 0.01107692, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.0188539, + "balance_loss_mlp": 1.03924417, + "epoch": 0.6796933714113934, + "flos": 23586847015680.0, + "grad_norm": 2.7487345535411407, + "language_loss": 0.72523355, + "learning_rate": 9.829599081106536e-07, + "loss": 0.74662066, + "num_input_tokens_seen": 243920160, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11305, + "time_per_iteration": 3.969510316848755 + }, + { + "auxiliary_loss_clip": 0.01103346, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.01541042, + "balance_loss_mlp": 1.03585541, + "epoch": 0.6797534946640613, + "flos": 27119917693440.0, + "grad_norm": 1.9643394158396053, + "language_loss": 0.65558803, + "learning_rate": 9.826245813561882e-07, + "loss": 0.67689657, + "num_input_tokens_seen": 243939015, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11306, + "time_per_iteration": 5.400679111480713 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.01027949, + "balance_loss_clip": 1.01505661, + "balance_loss_mlp": 1.03540945, + "epoch": 0.6798136179167293, + "flos": 22127437428480.0, + "grad_norm": 1.6667606428941142, + "language_loss": 0.79942191, + "learning_rate": 9.822892931807021e-07, + "loss": 0.82072073, + "num_input_tokens_seen": 243958470, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 11307, + "time_per_iteration": 2.430248260498047 + }, + { + "auxiliary_loss_clip": 0.01103369, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01866865, + "balance_loss_mlp": 1.03694439, + "epoch": 0.6798737411693972, + "flos": 17488640430720.0, + "grad_norm": 1.5435492505708737, + "language_loss": 0.88790625, + "learning_rate": 9.819540435969066e-07, + "loss": 0.90924048, + "num_input_tokens_seen": 243975450, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 11308, + "time_per_iteration": 2.456007242202759 + }, + { + "auxiliary_loss_clip": 0.01104873, + "auxiliary_loss_mlp": 0.01036527, + "balance_loss_clip": 1.02374792, + "balance_loss_mlp": 1.03597665, + "epoch": 0.6799338644220653, + "flos": 22892262744960.0, + "grad_norm": 2.037595188669874, + "language_loss": 0.71198809, + "learning_rate": 9.816188326175154e-07, + "loss": 0.73340213, + "num_input_tokens_seen": 243994355, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 11309, + "time_per_iteration": 2.444063901901245 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01033962, + "balance_loss_clip": 1.02223217, + "balance_loss_mlp": 1.03636754, + "epoch": 0.6799939876747332, + "flos": 23180409648000.0, + "grad_norm": 2.0611426595675915, + "language_loss": 0.84300488, + "learning_rate": 9.812836602552411e-07, + "loss": 0.86438966, + "num_input_tokens_seen": 244011620, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 11310, + "time_per_iteration": 2.4817349910736084 + }, + { + "auxiliary_loss_clip": 0.01102101, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.01696813, + "balance_loss_mlp": 1.03708959, + "epoch": 0.6800541109274012, + "flos": 19499925553920.0, + "grad_norm": 2.1934331981692963, + "language_loss": 0.82783055, + "learning_rate": 9.80948526522792e-07, + "loss": 0.84913009, + "num_input_tokens_seen": 244029925, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 11311, + "time_per_iteration": 2.4103691577911377 + }, + { + "auxiliary_loss_clip": 0.01107302, + "auxiliary_loss_mlp": 0.01032182, + "balance_loss_clip": 1.01871729, + "balance_loss_mlp": 1.03547812, + "epoch": 0.6801142341800691, + "flos": 22277652105600.0, + "grad_norm": 2.5662813310714268, + "language_loss": 0.76297283, + "learning_rate": 9.806134314328767e-07, + "loss": 0.78436768, + "num_input_tokens_seen": 244051225, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71875, + "step": 11312, + "time_per_iteration": 2.5150935649871826 + }, + { + "auxiliary_loss_clip": 0.01027323, + "auxiliary_loss_mlp": 0.01002804, + "balance_loss_clip": 1.00166547, + "balance_loss_mlp": 1.00670671, + "epoch": 0.6801743574327371, + "flos": 68714817759360.0, + "grad_norm": 0.6868398662733849, + "language_loss": 0.57254708, + "learning_rate": 9.802783749982038e-07, + "loss": 0.59284842, + "num_input_tokens_seen": 244115930, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20605469, + "step": 11313, + "time_per_iteration": 3.1505696773529053 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01572418, + "balance_loss_mlp": 1.03516006, + "epoch": 0.680234480685405, + "flos": 29460467813760.0, + "grad_norm": 1.7918563854588148, + "language_loss": 0.68882596, + "learning_rate": 9.799433572314754e-07, + "loss": 0.71013784, + "num_input_tokens_seen": 244137320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 11314, + "time_per_iteration": 2.5254998207092285 + }, + { + "auxiliary_loss_clip": 0.01099909, + "auxiliary_loss_mlp": 0.01028487, + "balance_loss_clip": 1.01754403, + "balance_loss_mlp": 1.03417087, + "epoch": 0.6802946039380731, + "flos": 15916866122880.0, + "grad_norm": 1.7481645051595534, + "language_loss": 0.81398594, + "learning_rate": 9.796083781453972e-07, + "loss": 0.83526987, + "num_input_tokens_seen": 244152755, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 11315, + "time_per_iteration": 2.453127861022949 + }, + { + "auxiliary_loss_clip": 0.01104752, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.01452708, + "balance_loss_mlp": 1.03766704, + "epoch": 0.680354727190741, + "flos": 22018664067840.0, + "grad_norm": 1.6730986060802988, + "language_loss": 0.69740957, + "learning_rate": 9.792734377526718e-07, + "loss": 0.7187236, + "num_input_tokens_seen": 244171480, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11316, + "time_per_iteration": 2.483550548553467 + }, + { + "auxiliary_loss_clip": 0.01103992, + "auxiliary_loss_mlp": 0.01026651, + "balance_loss_clip": 1.0155412, + "balance_loss_mlp": 1.03765678, + "epoch": 0.680414850443409, + "flos": 18441494467200.0, + "grad_norm": 2.178074033436339, + "language_loss": 0.66859937, + "learning_rate": 9.789385360660003e-07, + "loss": 0.68990576, + "num_input_tokens_seen": 244187920, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11317, + "time_per_iteration": 2.4059898853302 + }, + { + "auxiliary_loss_clip": 0.01106005, + "auxiliary_loss_mlp": 0.0103958, + "balance_loss_clip": 1.02807629, + "balance_loss_mlp": 1.0385282, + "epoch": 0.680474973696077, + "flos": 26358611909760.0, + "grad_norm": 1.4508017405477542, + "language_loss": 0.75009024, + "learning_rate": 9.78603673098082e-07, + "loss": 0.77154613, + "num_input_tokens_seen": 244209565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11318, + "time_per_iteration": 2.499570608139038 + }, + { + "auxiliary_loss_clip": 0.01097899, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.01697898, + "balance_loss_mlp": 1.03418541, + "epoch": 0.6805350969487449, + "flos": 18333116156160.0, + "grad_norm": 2.7236911079158985, + "language_loss": 0.6802513, + "learning_rate": 9.782688488616143e-07, + "loss": 0.7015121, + "num_input_tokens_seen": 244228015, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 11319, + "time_per_iteration": 2.4078075885772705 + }, + { + "auxiliary_loss_clip": 0.01101617, + "auxiliary_loss_mlp": 0.01037234, + "balance_loss_clip": 1.02501535, + "balance_loss_mlp": 1.03571796, + "epoch": 0.6805952202014129, + "flos": 19937497034880.0, + "grad_norm": 1.8193525574873417, + "language_loss": 0.76578677, + "learning_rate": 9.779340633692945e-07, + "loss": 0.7871753, + "num_input_tokens_seen": 244245615, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 11320, + "time_per_iteration": 2.4763078689575195 + }, + { + "auxiliary_loss_clip": 0.011026, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01627135, + "balance_loss_mlp": 1.0357213, + "epoch": 0.6806553434540809, + "flos": 25224301342080.0, + "grad_norm": 2.0578108779297732, + "language_loss": 0.74360389, + "learning_rate": 9.77599316633817e-07, + "loss": 0.76491284, + "num_input_tokens_seen": 244263625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11321, + "time_per_iteration": 2.4495351314544678 + }, + { + "auxiliary_loss_clip": 0.01105692, + "auxiliary_loss_mlp": 0.01034068, + "balance_loss_clip": 1.02243876, + "balance_loss_mlp": 1.03807235, + "epoch": 0.6807154667067489, + "flos": 17785586165760.0, + "grad_norm": 1.8874116924899373, + "language_loss": 0.72533345, + "learning_rate": 9.772646086678758e-07, + "loss": 0.74673104, + "num_input_tokens_seen": 244282745, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11322, + "time_per_iteration": 2.4374794960021973 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01028655, + "balance_loss_clip": 1.01677608, + "balance_loss_mlp": 1.03495407, + "epoch": 0.6807755899594168, + "flos": 22199905117440.0, + "grad_norm": 1.6803003695181602, + "language_loss": 0.78470093, + "learning_rate": 9.769299394841638e-07, + "loss": 0.8060168, + "num_input_tokens_seen": 244303770, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 11323, + "time_per_iteration": 2.4333457946777344 + }, + { + "auxiliary_loss_clip": 0.01027457, + "auxiliary_loss_mlp": 0.01001857, + "balance_loss_clip": 1.00065899, + "balance_loss_mlp": 1.00677872, + "epoch": 0.6808357132120848, + "flos": 68631073200000.0, + "grad_norm": 0.7788248321760284, + "language_loss": 0.57097274, + "learning_rate": 9.765953090953714e-07, + "loss": 0.59126586, + "num_input_tokens_seen": 244355910, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 11324, + "time_per_iteration": 2.87032413482666 + }, + { + "auxiliary_loss_clip": 0.01104753, + "auxiliary_loss_mlp": 0.01034165, + "balance_loss_clip": 1.02192771, + "balance_loss_mlp": 1.03705823, + "epoch": 0.6808958364647527, + "flos": 23843357015040.0, + "grad_norm": 1.797689988899455, + "language_loss": 0.68072367, + "learning_rate": 9.76260717514186e-07, + "loss": 0.70211285, + "num_input_tokens_seen": 244376610, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 11325, + "time_per_iteration": 2.4791805744171143 + }, + { + "auxiliary_loss_clip": 0.01107083, + "auxiliary_loss_mlp": 0.01031616, + "balance_loss_clip": 1.01901543, + "balance_loss_mlp": 1.03593659, + "epoch": 0.6809559597174207, + "flos": 17711717846400.0, + "grad_norm": 9.902559035776392, + "language_loss": 0.7025001, + "learning_rate": 9.759261647532974e-07, + "loss": 0.72388709, + "num_input_tokens_seen": 244393000, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 11326, + "time_per_iteration": 2.411768913269043 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.01868427, + "balance_loss_mlp": 1.03564632, + "epoch": 0.6810160829700886, + "flos": 22491894775680.0, + "grad_norm": 1.7689960274485943, + "language_loss": 0.72761798, + "learning_rate": 9.75591650825392e-07, + "loss": 0.7489562, + "num_input_tokens_seen": 244409515, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11327, + "time_per_iteration": 2.4436709880828857 + }, + { + "auxiliary_loss_clip": 0.01101261, + "auxiliary_loss_mlp": 0.01031062, + "balance_loss_clip": 1.01918912, + "balance_loss_mlp": 1.03561234, + "epoch": 0.6810762062227567, + "flos": 16832875783680.0, + "grad_norm": 2.3861554573552533, + "language_loss": 0.77319372, + "learning_rate": 9.752571757431526e-07, + "loss": 0.79451698, + "num_input_tokens_seen": 244427165, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11328, + "time_per_iteration": 2.427549123764038 + }, + { + "auxiliary_loss_clip": 0.01104861, + "auxiliary_loss_mlp": 0.01029367, + "balance_loss_clip": 1.01756525, + "balance_loss_mlp": 1.03677118, + "epoch": 0.6811363294754246, + "flos": 12714676554240.0, + "grad_norm": 3.828786564380187, + "language_loss": 0.64639735, + "learning_rate": 9.74922739519265e-07, + "loss": 0.66773969, + "num_input_tokens_seen": 244445705, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 11329, + "time_per_iteration": 2.4063379764556885 + }, + { + "auxiliary_loss_clip": 0.01106328, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.01713562, + "balance_loss_mlp": 1.03745294, + "epoch": 0.6811964527280926, + "flos": 17711969241600.0, + "grad_norm": 1.9960449149160304, + "language_loss": 0.79504317, + "learning_rate": 9.745883421664096e-07, + "loss": 0.81639957, + "num_input_tokens_seen": 244460415, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 11330, + "time_per_iteration": 2.4729740619659424 + }, + { + "auxiliary_loss_clip": 0.0110534, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01765263, + "balance_loss_mlp": 1.03767729, + "epoch": 0.6812565759807605, + "flos": 24863471268480.0, + "grad_norm": 3.982985267736798, + "language_loss": 0.63851273, + "learning_rate": 9.742539836972665e-07, + "loss": 0.6598652, + "num_input_tokens_seen": 244480555, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 11331, + "time_per_iteration": 2.4589385986328125 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01034793, + "balance_loss_clip": 1.02241278, + "balance_loss_mlp": 1.03761506, + "epoch": 0.6813166992334285, + "flos": 17166019449600.0, + "grad_norm": 1.9198633310725437, + "language_loss": 0.7197634, + "learning_rate": 9.739196641245148e-07, + "loss": 0.7411564, + "num_input_tokens_seen": 244498540, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 11332, + "time_per_iteration": 2.48699951171875 + }, + { + "auxiliary_loss_clip": 0.01105323, + "auxiliary_loss_mlp": 0.01032093, + "balance_loss_clip": 1.0197432, + "balance_loss_mlp": 1.03659022, + "epoch": 0.6813768224860965, + "flos": 18843550375680.0, + "grad_norm": 1.8849624776188914, + "language_loss": 0.75043106, + "learning_rate": 9.735853834608326e-07, + "loss": 0.77180523, + "num_input_tokens_seen": 244517015, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 11333, + "time_per_iteration": 2.4035282135009766 + }, + { + "auxiliary_loss_clip": 0.01109278, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.01664138, + "balance_loss_mlp": 1.03870749, + "epoch": 0.6814369457387645, + "flos": 24532733813760.0, + "grad_norm": 1.3964934580500172, + "language_loss": 0.71910471, + "learning_rate": 9.732511417188963e-07, + "loss": 0.74048996, + "num_input_tokens_seen": 244537450, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 11334, + "time_per_iteration": 2.514709234237671 + }, + { + "auxiliary_loss_clip": 0.01102183, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01966393, + "balance_loss_mlp": 1.03584528, + "epoch": 0.6814970689914325, + "flos": 18222978078720.0, + "grad_norm": 1.6647407719870675, + "language_loss": 0.85981625, + "learning_rate": 9.729169389113791e-07, + "loss": 0.88114882, + "num_input_tokens_seen": 244555640, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11335, + "time_per_iteration": 2.566171169281006 + }, + { + "auxiliary_loss_clip": 0.0109703, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01659656, + "balance_loss_mlp": 1.03387475, + "epoch": 0.6815571922441004, + "flos": 25228790542080.0, + "grad_norm": 2.956835270100481, + "language_loss": 0.81945407, + "learning_rate": 9.725827750509542e-07, + "loss": 0.84069812, + "num_input_tokens_seen": 244574005, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 11336, + "time_per_iteration": 2.50917911529541 + }, + { + "auxiliary_loss_clip": 0.01100635, + "auxiliary_loss_mlp": 0.01029151, + "balance_loss_clip": 1.01822007, + "balance_loss_mlp": 1.03596747, + "epoch": 0.6816173154967684, + "flos": 19456078026240.0, + "grad_norm": 1.8358807203128344, + "language_loss": 0.81945646, + "learning_rate": 9.72248650150294e-07, + "loss": 0.84075427, + "num_input_tokens_seen": 244591395, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 11337, + "time_per_iteration": 2.448796510696411 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.0102691, + "balance_loss_clip": 1.0160563, + "balance_loss_mlp": 1.03479064, + "epoch": 0.6816774387494363, + "flos": 17931455297280.0, + "grad_norm": 1.722806796595651, + "language_loss": 0.72469616, + "learning_rate": 9.719145642220673e-07, + "loss": 0.74596059, + "num_input_tokens_seen": 244610400, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11338, + "time_per_iteration": 2.517240047454834 + }, + { + "auxiliary_loss_clip": 0.01105227, + "auxiliary_loss_mlp": 0.01031788, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.03771722, + "epoch": 0.6817375620021043, + "flos": 22233014478720.0, + "grad_norm": 1.4508555916130568, + "language_loss": 0.77669561, + "learning_rate": 9.715805172789435e-07, + "loss": 0.79806578, + "num_input_tokens_seen": 244630400, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11339, + "time_per_iteration": 2.436663866043091 + }, + { + "auxiliary_loss_clip": 0.01105389, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.02076244, + "balance_loss_mlp": 1.03804171, + "epoch": 0.6817976852547722, + "flos": 25374408278400.0, + "grad_norm": 2.0295293442554483, + "language_loss": 0.70622659, + "learning_rate": 9.712465093335901e-07, + "loss": 0.72760439, + "num_input_tokens_seen": 244649155, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11340, + "time_per_iteration": 2.5092625617980957 + }, + { + "auxiliary_loss_clip": 0.01108606, + "auxiliary_loss_mlp": 0.01032866, + "balance_loss_clip": 1.02090895, + "balance_loss_mlp": 1.03815854, + "epoch": 0.6818578085074403, + "flos": 22265764704000.0, + "grad_norm": 2.203520540229157, + "language_loss": 0.82961929, + "learning_rate": 9.709125403986722e-07, + "loss": 0.85103399, + "num_input_tokens_seen": 244665470, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.703125, + "step": 11341, + "time_per_iteration": 2.470651626586914 + }, + { + "auxiliary_loss_clip": 0.01106072, + "auxiliary_loss_mlp": 0.01036902, + "balance_loss_clip": 1.02358067, + "balance_loss_mlp": 1.03685653, + "epoch": 0.6819179317601082, + "flos": 19318145800320.0, + "grad_norm": 1.764627541247337, + "language_loss": 0.68348753, + "learning_rate": 9.705786104868531e-07, + "loss": 0.70491731, + "num_input_tokens_seen": 244684390, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 11342, + "time_per_iteration": 2.5127713680267334 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.01706433, + "balance_loss_mlp": 1.03569162, + "epoch": 0.6819780550127762, + "flos": 21104126864640.0, + "grad_norm": 1.7075903323008321, + "language_loss": 0.74946058, + "learning_rate": 9.702447196107963e-07, + "loss": 0.77077055, + "num_input_tokens_seen": 244703370, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 11343, + "time_per_iteration": 2.5146141052246094 + }, + { + "auxiliary_loss_clip": 0.01104891, + "auxiliary_loss_mlp": 0.01033799, + "balance_loss_clip": 1.02227187, + "balance_loss_mlp": 1.0377264, + "epoch": 0.6820381782654441, + "flos": 29716403195520.0, + "grad_norm": 1.6017732799578648, + "language_loss": 0.79690164, + "learning_rate": 9.699108677831639e-07, + "loss": 0.81828856, + "num_input_tokens_seen": 244723325, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11344, + "time_per_iteration": 3.9397521018981934 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01032113, + "balance_loss_clip": 1.0200969, + "balance_loss_mlp": 1.03575659, + "epoch": 0.6820983015181121, + "flos": 29242130993280.0, + "grad_norm": 2.3863241768064416, + "language_loss": 0.66377771, + "learning_rate": 9.695770550166136e-07, + "loss": 0.68512809, + "num_input_tokens_seen": 244745650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11345, + "time_per_iteration": 2.5208473205566406 + }, + { + "auxiliary_loss_clip": 0.01106639, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01854682, + "balance_loss_mlp": 1.03741777, + "epoch": 0.6821584247707801, + "flos": 18871775487360.0, + "grad_norm": 2.4472974915932637, + "language_loss": 0.64573473, + "learning_rate": 9.692432813238054e-07, + "loss": 0.66710401, + "num_input_tokens_seen": 244760270, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6953125, + "step": 11346, + "time_per_iteration": 3.8512396812438965 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01030452, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.03745544, + "epoch": 0.6822185480234481, + "flos": 21324582587520.0, + "grad_norm": 1.5968577060390179, + "language_loss": 0.7844069, + "learning_rate": 9.689095467173952e-07, + "loss": 0.80576706, + "num_input_tokens_seen": 244779565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11347, + "time_per_iteration": 3.8028361797332764 + }, + { + "auxiliary_loss_clip": 0.01026659, + "auxiliary_loss_mlp": 0.01001661, + "balance_loss_clip": 1.00046301, + "balance_loss_mlp": 1.0059818, + "epoch": 0.6822786712761161, + "flos": 63488306430720.0, + "grad_norm": 0.7216727103538496, + "language_loss": 0.5250113, + "learning_rate": 9.685758512100378e-07, + "loss": 0.54529452, + "num_input_tokens_seen": 244838480, + "router_z_loss_clip": 0.01196289, + "router_z_loss_mlp": 0.20703125, + "step": 11348, + "time_per_iteration": 4.506226539611816 + }, + { + "auxiliary_loss_clip": 0.01101236, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02144003, + "balance_loss_mlp": 1.03572845, + "epoch": 0.682338794528784, + "flos": 21068934514560.0, + "grad_norm": 1.7906697802801645, + "language_loss": 0.79596829, + "learning_rate": 9.682421948143873e-07, + "loss": 0.81730622, + "num_input_tokens_seen": 244855265, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11349, + "time_per_iteration": 2.4514377117156982 + }, + { + "auxiliary_loss_clip": 0.01111621, + "auxiliary_loss_mlp": 0.01028663, + "balance_loss_clip": 1.01438189, + "balance_loss_mlp": 1.03865266, + "epoch": 0.682398917781452, + "flos": 36283243547520.0, + "grad_norm": 9.523032245657118, + "language_loss": 0.74000543, + "learning_rate": 9.67908577543096e-07, + "loss": 0.76140821, + "num_input_tokens_seen": 244875555, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.7265625, + "step": 11350, + "time_per_iteration": 2.6128787994384766 + }, + { + "auxiliary_loss_clip": 0.01102473, + "auxiliary_loss_mlp": 0.01028574, + "balance_loss_clip": 1.01656938, + "balance_loss_mlp": 1.03694868, + "epoch": 0.6824590410341199, + "flos": 24859197550080.0, + "grad_norm": 1.583319505093848, + "language_loss": 0.79434985, + "learning_rate": 9.675749994088161e-07, + "loss": 0.81566036, + "num_input_tokens_seen": 244895270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 11351, + "time_per_iteration": 2.4813127517700195 + }, + { + "auxiliary_loss_clip": 0.01102481, + "auxiliary_loss_mlp": 0.01033209, + "balance_loss_clip": 1.02167511, + "balance_loss_mlp": 1.03581142, + "epoch": 0.6825191642867879, + "flos": 22452392793600.0, + "grad_norm": 1.5951575368956712, + "language_loss": 0.73410577, + "learning_rate": 9.672414604241954e-07, + "loss": 0.75546265, + "num_input_tokens_seen": 244914535, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 11352, + "time_per_iteration": 2.471541166305542 + }, + { + "auxiliary_loss_clip": 0.01105327, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.02216315, + "balance_loss_mlp": 1.03617918, + "epoch": 0.6825792875394558, + "flos": 29424377623680.0, + "grad_norm": 1.5725908722190713, + "language_loss": 0.80191058, + "learning_rate": 9.669079606018814e-07, + "loss": 0.8233099, + "num_input_tokens_seen": 244936095, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11353, + "time_per_iteration": 2.5034008026123047 + }, + { + "auxiliary_loss_clip": 0.01103178, + "auxiliary_loss_mlp": 0.01024386, + "balance_loss_clip": 1.01242352, + "balance_loss_mlp": 1.03601313, + "epoch": 0.6826394107921239, + "flos": 18770974945920.0, + "grad_norm": 1.984510532707265, + "language_loss": 0.78228319, + "learning_rate": 9.665744999545218e-07, + "loss": 0.80355877, + "num_input_tokens_seen": 244955290, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11354, + "time_per_iteration": 2.4608607292175293 + }, + { + "auxiliary_loss_clip": 0.01102222, + "auxiliary_loss_mlp": 0.0102752, + "balance_loss_clip": 1.01630878, + "balance_loss_mlp": 1.03619695, + "epoch": 0.6826995340447918, + "flos": 16617591619200.0, + "grad_norm": 2.0028339846466445, + "language_loss": 0.61692381, + "learning_rate": 9.662410784947599e-07, + "loss": 0.63822126, + "num_input_tokens_seen": 244972935, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 11355, + "time_per_iteration": 2.40678071975708 + }, + { + "auxiliary_loss_clip": 0.01101274, + "auxiliary_loss_mlp": 0.01026693, + "balance_loss_clip": 1.0152607, + "balance_loss_mlp": 1.03438973, + "epoch": 0.6827596572974598, + "flos": 20848299223680.0, + "grad_norm": 1.9183183626079316, + "language_loss": 0.81905627, + "learning_rate": 9.659076962352398e-07, + "loss": 0.84033597, + "num_input_tokens_seen": 244989440, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11356, + "time_per_iteration": 2.4604368209838867 + }, + { + "auxiliary_loss_clip": 0.01106625, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.01654649, + "balance_loss_mlp": 1.03872633, + "epoch": 0.6828197805501277, + "flos": 22748081552640.0, + "grad_norm": 2.562660672921637, + "language_loss": 0.78667843, + "learning_rate": 9.655743531886052e-07, + "loss": 0.8080312, + "num_input_tokens_seen": 245007830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11357, + "time_per_iteration": 2.4570956230163574 + }, + { + "auxiliary_loss_clip": 0.01027055, + "auxiliary_loss_mlp": 0.01004376, + "balance_loss_clip": 1.00311232, + "balance_loss_mlp": 1.00636482, + "epoch": 0.6828799038027957, + "flos": 71646565829760.0, + "grad_norm": 0.8170905749226814, + "language_loss": 0.59669131, + "learning_rate": 9.65241049367493e-07, + "loss": 0.61700559, + "num_input_tokens_seen": 245070720, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.20703125, + "step": 11358, + "time_per_iteration": 3.1206090450286865 + }, + { + "auxiliary_loss_clip": 0.01108785, + "auxiliary_loss_mlp": 0.01036705, + "balance_loss_clip": 1.0243305, + "balance_loss_mlp": 1.03812075, + "epoch": 0.6829400270554637, + "flos": 19829154637440.0, + "grad_norm": 1.7308298657289736, + "language_loss": 0.78347307, + "learning_rate": 9.64907784784544e-07, + "loss": 0.804928, + "num_input_tokens_seen": 245089070, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.70703125, + "step": 11359, + "time_per_iteration": 2.4206995964050293 + }, + { + "auxiliary_loss_clip": 0.01102635, + "auxiliary_loss_mlp": 0.0103121, + "balance_loss_clip": 1.01964045, + "balance_loss_mlp": 1.03594446, + "epoch": 0.6830001503081317, + "flos": 21980634543360.0, + "grad_norm": 1.9738432775453243, + "language_loss": 0.81637627, + "learning_rate": 9.645745594523958e-07, + "loss": 0.83771473, + "num_input_tokens_seen": 245106500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 11360, + "time_per_iteration": 2.476433038711548 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.02083063, + "balance_loss_mlp": 1.03856695, + "epoch": 0.6830602735607997, + "flos": 24316767290880.0, + "grad_norm": 1.86444446180785, + "language_loss": 0.75634044, + "learning_rate": 9.642413733836844e-07, + "loss": 0.77774101, + "num_input_tokens_seen": 245125260, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 11361, + "time_per_iteration": 2.4659223556518555 + }, + { + "auxiliary_loss_clip": 0.010268, + "auxiliary_loss_mlp": 0.01003581, + "balance_loss_clip": 1.00227582, + "balance_loss_mlp": 1.00611186, + "epoch": 0.6831203968134676, + "flos": 57690062323200.0, + "grad_norm": 0.8682819030103981, + "language_loss": 0.59711051, + "learning_rate": 9.639082265910437e-07, + "loss": 0.61741436, + "num_input_tokens_seen": 245188730, + "router_z_loss_clip": 0.01306152, + "router_z_loss_mlp": 0.20703125, + "step": 11362, + "time_per_iteration": 3.127232074737549 + }, + { + "auxiliary_loss_clip": 0.01104869, + "auxiliary_loss_mlp": 0.01030633, + "balance_loss_clip": 1.0179255, + "balance_loss_mlp": 1.03573108, + "epoch": 0.6831805200661356, + "flos": 14388436552320.0, + "grad_norm": 2.8459010350172913, + "language_loss": 0.74898708, + "learning_rate": 9.635751190871074e-07, + "loss": 0.77034211, + "num_input_tokens_seen": 245205065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 11363, + "time_per_iteration": 2.4112236499786377 + }, + { + "auxiliary_loss_clip": 0.01102233, + "auxiliary_loss_mlp": 0.01037618, + "balance_loss_clip": 1.02511919, + "balance_loss_mlp": 1.03508842, + "epoch": 0.6832406433188035, + "flos": 22820297846400.0, + "grad_norm": 2.6445368972435976, + "language_loss": 0.89400429, + "learning_rate": 9.632420508845063e-07, + "loss": 0.91540277, + "num_input_tokens_seen": 245224265, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 11364, + "time_per_iteration": 2.4431772232055664 + }, + { + "auxiliary_loss_clip": 0.01101882, + "auxiliary_loss_mlp": 0.01030697, + "balance_loss_clip": 1.01950884, + "balance_loss_mlp": 1.03680646, + "epoch": 0.6833007665714715, + "flos": 17561718650880.0, + "grad_norm": 3.2328498112003503, + "language_loss": 0.88372034, + "learning_rate": 9.629090219958697e-07, + "loss": 0.90504611, + "num_input_tokens_seen": 245243360, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 11365, + "time_per_iteration": 2.4429502487182617 + }, + { + "auxiliary_loss_clip": 0.01110566, + "auxiliary_loss_mlp": 0.01036403, + "balance_loss_clip": 1.02396965, + "balance_loss_mlp": 1.03944576, + "epoch": 0.6833608898241395, + "flos": 22445928345600.0, + "grad_norm": 2.0793788072414734, + "language_loss": 0.81185693, + "learning_rate": 9.625760324338272e-07, + "loss": 0.83332664, + "num_input_tokens_seen": 245256350, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.7109375, + "step": 11366, + "time_per_iteration": 2.472283363342285 + }, + { + "auxiliary_loss_clip": 0.01103514, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.0166235, + "balance_loss_mlp": 1.03517795, + "epoch": 0.6834210130768075, + "flos": 24534637234560.0, + "grad_norm": 1.7001262791469558, + "language_loss": 0.76775587, + "learning_rate": 9.622430822110062e-07, + "loss": 0.789078, + "num_input_tokens_seen": 245277575, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 11367, + "time_per_iteration": 2.4591305255889893 + }, + { + "auxiliary_loss_clip": 0.01105081, + "auxiliary_loss_mlp": 0.01035748, + "balance_loss_clip": 1.0234282, + "balance_loss_mlp": 1.03755784, + "epoch": 0.6834811363294754, + "flos": 20047132321920.0, + "grad_norm": 1.4398909959276744, + "language_loss": 0.68965262, + "learning_rate": 9.619101713400312e-07, + "loss": 0.71106088, + "num_input_tokens_seen": 245296615, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11368, + "time_per_iteration": 2.477160692214966 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.01824105, + "balance_loss_mlp": 1.03536785, + "epoch": 0.6835412595821434, + "flos": 24790752184320.0, + "grad_norm": 1.9865162675168815, + "language_loss": 0.73352474, + "learning_rate": 9.615772998335261e-07, + "loss": 0.7548461, + "num_input_tokens_seen": 245316275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11369, + "time_per_iteration": 2.4527742862701416 + }, + { + "auxiliary_loss_clip": 0.01102773, + "auxiliary_loss_mlp": 0.0102762, + "balance_loss_clip": 1.01549673, + "balance_loss_mlp": 1.03507197, + "epoch": 0.6836013828348113, + "flos": 19500356517120.0, + "grad_norm": 1.995405258990165, + "language_loss": 0.78393018, + "learning_rate": 9.612444677041138e-07, + "loss": 0.80523407, + "num_input_tokens_seen": 245334595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11370, + "time_per_iteration": 2.443544864654541 + }, + { + "auxiliary_loss_clip": 0.01026342, + "auxiliary_loss_mlp": 0.01001936, + "balance_loss_clip": 1.00067234, + "balance_loss_mlp": 1.00567722, + "epoch": 0.6836615060874793, + "flos": 58363999251840.0, + "grad_norm": 0.7476131007411569, + "language_loss": 0.59831941, + "learning_rate": 9.609116749644162e-07, + "loss": 0.61860228, + "num_input_tokens_seen": 245389750, + "router_z_loss_clip": 0.01263428, + "router_z_loss_mlp": 0.20703125, + "step": 11371, + "time_per_iteration": 2.9889161586761475 + }, + { + "auxiliary_loss_clip": 0.01099697, + "auxiliary_loss_mlp": 0.01028049, + "balance_loss_clip": 1.01723075, + "balance_loss_mlp": 1.03550124, + "epoch": 0.6837216293401474, + "flos": 12166895168640.0, + "grad_norm": 1.4552904214885107, + "language_loss": 0.63685644, + "learning_rate": 9.605789216270511e-07, + "loss": 0.65813392, + "num_input_tokens_seen": 245407530, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11372, + "time_per_iteration": 2.424954891204834 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01025041, + "balance_loss_clip": 1.01319766, + "balance_loss_mlp": 1.03525615, + "epoch": 0.6837817525928153, + "flos": 22127581082880.0, + "grad_norm": 1.4781124923613422, + "language_loss": 0.71735704, + "learning_rate": 9.602462077046375e-07, + "loss": 0.73862189, + "num_input_tokens_seen": 245427000, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11373, + "time_per_iteration": 2.474728584289551 + }, + { + "auxiliary_loss_clip": 0.01026667, + "auxiliary_loss_mlp": 0.01000459, + "balance_loss_clip": 0.99917108, + "balance_loss_mlp": 1.00602746, + "epoch": 0.6838418758454833, + "flos": 65005928985600.0, + "grad_norm": 1.2229800972978824, + "language_loss": 0.56697685, + "learning_rate": 9.599135332097935e-07, + "loss": 0.58724803, + "num_input_tokens_seen": 245491620, + "router_z_loss_clip": 0.01287842, + "router_z_loss_mlp": 0.20703125, + "step": 11374, + "time_per_iteration": 3.22890567779541 + }, + { + "auxiliary_loss_clip": 0.01106754, + "auxiliary_loss_mlp": 0.01026655, + "balance_loss_clip": 1.01422763, + "balance_loss_mlp": 1.03807116, + "epoch": 0.6839019990981512, + "flos": 21030833162880.0, + "grad_norm": 1.6218199942773524, + "language_loss": 0.73614061, + "learning_rate": 9.595808981551312e-07, + "loss": 0.75747472, + "num_input_tokens_seen": 245511285, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11375, + "time_per_iteration": 2.461625814437866 + }, + { + "auxiliary_loss_clip": 0.01102376, + "auxiliary_loss_mlp": 0.01031136, + "balance_loss_clip": 1.01968646, + "balance_loss_mlp": 1.036448, + "epoch": 0.6839621223508192, + "flos": 24935543907840.0, + "grad_norm": 1.6159856732267652, + "language_loss": 0.70548576, + "learning_rate": 9.592483025532651e-07, + "loss": 0.72682095, + "num_input_tokens_seen": 245532910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11376, + "time_per_iteration": 2.4842541217803955 + }, + { + "auxiliary_loss_clip": 0.01104932, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02520418, + "balance_loss_mlp": 1.03640866, + "epoch": 0.6840222456034871, + "flos": 26358827391360.0, + "grad_norm": 2.0252780909145756, + "language_loss": 0.7449975, + "learning_rate": 9.58915746416808e-07, + "loss": 0.76642299, + "num_input_tokens_seen": 245550540, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11377, + "time_per_iteration": 2.5335726737976074 + }, + { + "auxiliary_loss_clip": 0.01026236, + "auxiliary_loss_mlp": 0.01001308, + "balance_loss_clip": 1.00019324, + "balance_loss_mlp": 1.00557923, + "epoch": 0.6840823688561551, + "flos": 65988336936960.0, + "grad_norm": 0.7232069780958926, + "language_loss": 0.56829667, + "learning_rate": 9.585832297583707e-07, + "loss": 0.58857214, + "num_input_tokens_seen": 245619570, + "router_z_loss_clip": 0.01116943, + "router_z_loss_mlp": 0.20703125, + "step": 11378, + "time_per_iteration": 3.137204885482788 + }, + { + "auxiliary_loss_clip": 0.01103234, + "auxiliary_loss_mlp": 0.0103223, + "balance_loss_clip": 1.01959991, + "balance_loss_mlp": 1.03537726, + "epoch": 0.684142492108823, + "flos": 21397588980480.0, + "grad_norm": 1.644108790952547, + "language_loss": 0.78129804, + "learning_rate": 9.58250752590561e-07, + "loss": 0.80265266, + "num_input_tokens_seen": 245637980, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11379, + "time_per_iteration": 2.496009349822998 + }, + { + "auxiliary_loss_clip": 0.01098608, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.01664054, + "balance_loss_mlp": 1.03623796, + "epoch": 0.6842026153614911, + "flos": 18801426700800.0, + "grad_norm": 2.007866180272703, + "language_loss": 0.68494868, + "learning_rate": 9.57918314925988e-07, + "loss": 0.70620382, + "num_input_tokens_seen": 245655690, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.625, + "step": 11380, + "time_per_iteration": 2.406384229660034 + }, + { + "auxiliary_loss_clip": 0.01101488, + "auxiliary_loss_mlp": 0.0103156, + "balance_loss_clip": 1.01939452, + "balance_loss_mlp": 1.03453815, + "epoch": 0.684262738614159, + "flos": 19646405216640.0, + "grad_norm": 2.132022624853322, + "language_loss": 0.78171045, + "learning_rate": 9.575859167772568e-07, + "loss": 0.80304098, + "num_input_tokens_seen": 245671525, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 11381, + "time_per_iteration": 2.4570810794830322 + }, + { + "auxiliary_loss_clip": 0.01025143, + "auxiliary_loss_mlp": 0.01003104, + "balance_loss_clip": 1.00188828, + "balance_loss_mlp": 1.00454473, + "epoch": 0.684322861866827, + "flos": 62354462739840.0, + "grad_norm": 0.8747752326004012, + "language_loss": 0.67185926, + "learning_rate": 9.572535581569713e-07, + "loss": 0.69214177, + "num_input_tokens_seen": 245724115, + "router_z_loss_clip": 0.012146, + "router_z_loss_mlp": 0.20605469, + "step": 11382, + "time_per_iteration": 2.90439510345459 + }, + { + "auxiliary_loss_clip": 0.0102608, + "auxiliary_loss_mlp": 0.01001227, + "balance_loss_clip": 1.00005233, + "balance_loss_mlp": 1.00557017, + "epoch": 0.6843829851194949, + "flos": 65805048812160.0, + "grad_norm": 0.8179080284964599, + "language_loss": 0.58123773, + "learning_rate": 9.569212390777356e-07, + "loss": 0.60151082, + "num_input_tokens_seen": 245789245, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.20507812, + "step": 11383, + "time_per_iteration": 3.0904266834259033 + }, + { + "auxiliary_loss_clip": 0.01100892, + "auxiliary_loss_mlp": 0.01025381, + "balance_loss_clip": 1.0144496, + "balance_loss_mlp": 1.03393197, + "epoch": 0.6844431083721629, + "flos": 27855153181440.0, + "grad_norm": 6.398458171268355, + "language_loss": 0.7963292, + "learning_rate": 9.565889595521517e-07, + "loss": 0.81759197, + "num_input_tokens_seen": 245812420, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 11384, + "time_per_iteration": 2.56005859375 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01033859, + "balance_loss_clip": 1.02203345, + "balance_loss_mlp": 1.03471613, + "epoch": 0.684503231624831, + "flos": 18255010032000.0, + "grad_norm": 2.1545219517049135, + "language_loss": 0.7672773, + "learning_rate": 9.562567195928187e-07, + "loss": 0.7886613, + "num_input_tokens_seen": 245829135, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69921875, + "step": 11385, + "time_per_iteration": 2.442094326019287 + }, + { + "auxiliary_loss_clip": 0.0111135, + "auxiliary_loss_mlp": 0.01034562, + "balance_loss_clip": 1.02137756, + "balance_loss_mlp": 1.03792572, + "epoch": 0.6845633548774989, + "flos": 17639681120640.0, + "grad_norm": 2.0113901870570534, + "language_loss": 0.84306657, + "learning_rate": 9.55924519212335e-07, + "loss": 0.86452568, + "num_input_tokens_seen": 245847140, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.734375, + "step": 11386, + "time_per_iteration": 3.9225666522979736 + }, + { + "auxiliary_loss_clip": 0.01105442, + "auxiliary_loss_mlp": 0.01036021, + "balance_loss_clip": 1.02474415, + "balance_loss_mlp": 1.0376749, + "epoch": 0.6846234781301669, + "flos": 20807576179200.0, + "grad_norm": 2.7843660394813035, + "language_loss": 0.83315331, + "learning_rate": 9.555923584232984e-07, + "loss": 0.854568, + "num_input_tokens_seen": 245862855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 11387, + "time_per_iteration": 2.4256067276000977 + }, + { + "auxiliary_loss_clip": 0.01100691, + "auxiliary_loss_mlp": 0.01028881, + "balance_loss_clip": 1.01747251, + "balance_loss_mlp": 1.03419471, + "epoch": 0.6846836013828348, + "flos": 36101176485120.0, + "grad_norm": 1.6307550098034056, + "language_loss": 0.72258627, + "learning_rate": 9.552602372383047e-07, + "loss": 0.74388194, + "num_input_tokens_seen": 245885415, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11388, + "time_per_iteration": 3.991851806640625 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01023785, + "balance_loss_clip": 1.01292491, + "balance_loss_mlp": 1.03534198, + "epoch": 0.6847437246355028, + "flos": 43142468607360.0, + "grad_norm": 1.8327013595289872, + "language_loss": 0.62769783, + "learning_rate": 9.549281556699469e-07, + "loss": 0.64894605, + "num_input_tokens_seen": 245906285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 11389, + "time_per_iteration": 4.031615495681763 + }, + { + "auxiliary_loss_clip": 0.0102484, + "auxiliary_loss_mlp": 0.00998817, + "balance_loss_clip": 0.99756575, + "balance_loss_mlp": 1.00428033, + "epoch": 0.6848038478881707, + "flos": 71663729552640.0, + "grad_norm": 0.7254408078879129, + "language_loss": 0.56007105, + "learning_rate": 9.54596113730818e-07, + "loss": 0.5803076, + "num_input_tokens_seen": 245967620, + "router_z_loss_clip": 0.01251221, + "router_z_loss_mlp": 0.20605469, + "step": 11390, + "time_per_iteration": 4.692908048629761 + }, + { + "auxiliary_loss_clip": 0.01103708, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.02001452, + "balance_loss_mlp": 1.03709829, + "epoch": 0.6848639711408387, + "flos": 19937820257280.0, + "grad_norm": 2.011305237937575, + "language_loss": 0.8772974, + "learning_rate": 9.542641114335109e-07, + "loss": 0.89865273, + "num_input_tokens_seen": 245985075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 11391, + "time_per_iteration": 2.4319207668304443 + }, + { + "auxiliary_loss_clip": 0.01106215, + "auxiliary_loss_mlp": 0.0103464, + "balance_loss_clip": 1.02271295, + "balance_loss_mlp": 1.03695166, + "epoch": 0.6849240943935067, + "flos": 26867501844480.0, + "grad_norm": 1.6650143278886758, + "language_loss": 0.79346359, + "learning_rate": 9.539321487906117e-07, + "loss": 0.81487215, + "num_input_tokens_seen": 246003560, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 11392, + "time_per_iteration": 2.501056671142578 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01027958, + "balance_loss_clip": 1.01641881, + "balance_loss_mlp": 1.03576994, + "epoch": 0.6849842176461747, + "flos": 13735365425280.0, + "grad_norm": 2.2005866152358977, + "language_loss": 0.70957869, + "learning_rate": 9.536002258147104e-07, + "loss": 0.73086905, + "num_input_tokens_seen": 246019600, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 11393, + "time_per_iteration": 2.404430627822876 + }, + { + "auxiliary_loss_clip": 0.0110549, + "auxiliary_loss_mlp": 0.01029919, + "balance_loss_clip": 1.01724112, + "balance_loss_mlp": 1.03636444, + "epoch": 0.6850443408988426, + "flos": 24973070641920.0, + "grad_norm": 1.6151771222215205, + "language_loss": 0.64394313, + "learning_rate": 9.532683425183936e-07, + "loss": 0.66529727, + "num_input_tokens_seen": 246038920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 11394, + "time_per_iteration": 2.4956462383270264 + }, + { + "auxiliary_loss_clip": 0.01105306, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.02175093, + "balance_loss_mlp": 1.03593922, + "epoch": 0.6851044641515106, + "flos": 27744225004800.0, + "grad_norm": 2.3582380826263303, + "language_loss": 0.80521697, + "learning_rate": 9.529364989142468e-07, + "loss": 0.82661504, + "num_input_tokens_seen": 246060490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11395, + "time_per_iteration": 2.510676860809326 + }, + { + "auxiliary_loss_clip": 0.01105245, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01755834, + "balance_loss_mlp": 1.03777242, + "epoch": 0.6851645874041785, + "flos": 24351061800960.0, + "grad_norm": 1.764971527643648, + "language_loss": 0.73285419, + "learning_rate": 9.526046950148527e-07, + "loss": 0.75421178, + "num_input_tokens_seen": 246081465, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.67578125, + "step": 11396, + "time_per_iteration": 2.568514823913574 + }, + { + "auxiliary_loss_clip": 0.01106243, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01480818, + "balance_loss_mlp": 1.03660202, + "epoch": 0.6852247106568465, + "flos": 15077849264640.0, + "grad_norm": 5.148870058421947, + "language_loss": 0.79048425, + "learning_rate": 9.522729308327931e-07, + "loss": 0.81182146, + "num_input_tokens_seen": 246096110, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 11397, + "time_per_iteration": 2.4331774711608887 + }, + { + "auxiliary_loss_clip": 0.01103305, + "auxiliary_loss_mlp": 0.01026103, + "balance_loss_clip": 1.01383626, + "balance_loss_mlp": 1.03412771, + "epoch": 0.6852848339095146, + "flos": 18770005278720.0, + "grad_norm": 2.4689910585067616, + "language_loss": 0.71553206, + "learning_rate": 9.519412063806493e-07, + "loss": 0.73682612, + "num_input_tokens_seen": 246114785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 11398, + "time_per_iteration": 2.5442934036254883 + }, + { + "auxiliary_loss_clip": 0.0110016, + "auxiliary_loss_mlp": 0.01033134, + "balance_loss_clip": 1.02194667, + "balance_loss_mlp": 1.03415036, + "epoch": 0.6853449571621825, + "flos": 27854363082240.0, + "grad_norm": 1.6631285015848603, + "language_loss": 0.70751739, + "learning_rate": 9.516095216709996e-07, + "loss": 0.72885031, + "num_input_tokens_seen": 246136375, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 11399, + "time_per_iteration": 2.4914610385894775 + }, + { + "auxiliary_loss_clip": 0.01104852, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01515758, + "balance_loss_mlp": 1.03707409, + "epoch": 0.6854050804148505, + "flos": 18150510389760.0, + "grad_norm": 1.5329347602462005, + "language_loss": 0.7047379, + "learning_rate": 9.512778767164217e-07, + "loss": 0.72605371, + "num_input_tokens_seen": 246155090, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 11400, + "time_per_iteration": 2.5048537254333496 + }, + { + "auxiliary_loss_clip": 0.01113165, + "auxiliary_loss_mlp": 0.01035214, + "balance_loss_clip": 1.02017021, + "balance_loss_mlp": 1.0384146, + "epoch": 0.6854652036675184, + "flos": 16326212492160.0, + "grad_norm": 1.826720269595169, + "language_loss": 0.78065717, + "learning_rate": 9.509462715294927e-07, + "loss": 0.80214089, + "num_input_tokens_seen": 246172645, + "router_z_loss_clip": 0.15039062, + "router_z_loss_mlp": 0.75, + "step": 11401, + "time_per_iteration": 2.441246747970581 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01028091, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.03616953, + "epoch": 0.6855253269201864, + "flos": 14940814878720.0, + "grad_norm": 2.0377910237961925, + "language_loss": 0.75284612, + "learning_rate": 9.50614706122786e-07, + "loss": 0.77414942, + "num_input_tokens_seen": 246189055, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 11402, + "time_per_iteration": 2.4716646671295166 + }, + { + "auxiliary_loss_clip": 0.01105094, + "auxiliary_loss_mlp": 0.01037038, + "balance_loss_clip": 1.0245446, + "balance_loss_mlp": 1.03575242, + "epoch": 0.6855854501728543, + "flos": 23037736826880.0, + "grad_norm": 1.633024747176301, + "language_loss": 0.7278834, + "learning_rate": 9.502831805088742e-07, + "loss": 0.74930477, + "num_input_tokens_seen": 246207990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11403, + "time_per_iteration": 2.4483251571655273 + }, + { + "auxiliary_loss_clip": 0.01101831, + "auxiliary_loss_mlp": 0.01029557, + "balance_loss_clip": 1.01826835, + "balance_loss_mlp": 1.03608656, + "epoch": 0.6856455734255223, + "flos": 13253623194240.0, + "grad_norm": 2.2661790169676284, + "language_loss": 0.81050408, + "learning_rate": 9.499516947003294e-07, + "loss": 0.83181787, + "num_input_tokens_seen": 246221595, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 11404, + "time_per_iteration": 2.4682669639587402 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01034189, + "balance_loss_clip": 1.02251863, + "balance_loss_mlp": 1.03651369, + "epoch": 0.6857056966781903, + "flos": 23333461499520.0, + "grad_norm": 1.3732837819876964, + "language_loss": 0.77531087, + "learning_rate": 9.496202487097222e-07, + "loss": 0.79668367, + "num_input_tokens_seen": 246242970, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11405, + "time_per_iteration": 2.4672837257385254 + }, + { + "auxiliary_loss_clip": 0.01024197, + "auxiliary_loss_mlp": 0.01001171, + "balance_loss_clip": 1.00008011, + "balance_loss_mlp": 1.00375617, + "epoch": 0.6857658199308583, + "flos": 61852647784320.0, + "grad_norm": 0.7944486320456374, + "language_loss": 0.60998279, + "learning_rate": 9.492888425496199e-07, + "loss": 0.63023651, + "num_input_tokens_seen": 246300405, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11406, + "time_per_iteration": 3.146902084350586 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 1.03420663, + "epoch": 0.6858259431835262, + "flos": 16654543735680.0, + "grad_norm": 1.8632160242742672, + "language_loss": 0.76916838, + "learning_rate": 9.489574762325907e-07, + "loss": 0.79052973, + "num_input_tokens_seen": 246318780, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 11407, + "time_per_iteration": 2.4350507259368896 + }, + { + "auxiliary_loss_clip": 0.01106959, + "auxiliary_loss_mlp": 0.01035298, + "balance_loss_clip": 1.02232862, + "balance_loss_mlp": 1.03708422, + "epoch": 0.6858860664361942, + "flos": 21872974504320.0, + "grad_norm": 2.5660412243788153, + "language_loss": 0.71399796, + "learning_rate": 9.486261497711991e-07, + "loss": 0.73542058, + "num_input_tokens_seen": 246339405, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 11408, + "time_per_iteration": 2.5331506729125977 + }, + { + "auxiliary_loss_clip": 0.01104047, + "auxiliary_loss_mlp": 0.01025559, + "balance_loss_clip": 1.01318479, + "balance_loss_mlp": 1.03469181, + "epoch": 0.6859461896888621, + "flos": 15267637751040.0, + "grad_norm": 1.9585659451981918, + "language_loss": 0.69841951, + "learning_rate": 9.482948631780087e-07, + "loss": 0.7197156, + "num_input_tokens_seen": 246357055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 11409, + "time_per_iteration": 2.406949520111084 + }, + { + "auxiliary_loss_clip": 0.01098382, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01965857, + "balance_loss_mlp": 1.03563976, + "epoch": 0.6860063129415301, + "flos": 18620293392000.0, + "grad_norm": 1.5737480053745323, + "language_loss": 0.78358257, + "learning_rate": 9.479636164655825e-07, + "loss": 0.80487025, + "num_input_tokens_seen": 246374050, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 11410, + "time_per_iteration": 2.5127828121185303 + }, + { + "auxiliary_loss_clip": 0.0110556, + "auxiliary_loss_mlp": 0.01034715, + "balance_loss_clip": 1.02162552, + "balance_loss_mlp": 1.03487253, + "epoch": 0.6860664361941982, + "flos": 23951376190080.0, + "grad_norm": 2.0456589939951852, + "language_loss": 0.71620971, + "learning_rate": 9.476324096464821e-07, + "loss": 0.73761249, + "num_input_tokens_seen": 246392910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.70703125, + "step": 11411, + "time_per_iteration": 2.456273317337036 + }, + { + "auxiliary_loss_clip": 0.01105032, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.01804924, + "balance_loss_mlp": 1.03671002, + "epoch": 0.6861265594468661, + "flos": 20407782827520.0, + "grad_norm": 1.870472752363858, + "language_loss": 0.696311, + "learning_rate": 9.473012427332654e-07, + "loss": 0.7176733, + "num_input_tokens_seen": 246411540, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.68359375, + "step": 11412, + "time_per_iteration": 2.4815471172332764 + }, + { + "auxiliary_loss_clip": 0.0110396, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.01843774, + "balance_loss_mlp": 1.03616846, + "epoch": 0.6861866826995341, + "flos": 11428571111040.0, + "grad_norm": 2.8759639216310364, + "language_loss": 0.72033083, + "learning_rate": 9.469701157384919e-07, + "loss": 0.74167705, + "num_input_tokens_seen": 246423295, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 11413, + "time_per_iteration": 2.3763904571533203 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01031654, + "balance_loss_clip": 1.01989388, + "balance_loss_mlp": 1.03653979, + "epoch": 0.686246805952202, + "flos": 15997593939840.0, + "grad_norm": 1.7019599587889749, + "language_loss": 0.73731822, + "learning_rate": 9.466390286747164e-07, + "loss": 0.75868088, + "num_input_tokens_seen": 246441045, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11414, + "time_per_iteration": 2.4849958419799805 + }, + { + "auxiliary_loss_clip": 0.0110805, + "auxiliary_loss_mlp": 0.01030632, + "balance_loss_clip": 1.01831794, + "balance_loss_mlp": 1.03832841, + "epoch": 0.68630692920487, + "flos": 19826712512640.0, + "grad_norm": 2.1354792795106396, + "language_loss": 0.86471385, + "learning_rate": 9.46307981554495e-07, + "loss": 0.88610065, + "num_input_tokens_seen": 246456905, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 11415, + "time_per_iteration": 2.419379711151123 + }, + { + "auxiliary_loss_clip": 0.01106633, + "auxiliary_loss_mlp": 0.01036667, + "balance_loss_clip": 1.02393508, + "balance_loss_mlp": 1.03672004, + "epoch": 0.6863670524575379, + "flos": 26286216048000.0, + "grad_norm": 1.5997351133047528, + "language_loss": 0.67188251, + "learning_rate": 9.459769743903801e-07, + "loss": 0.69331551, + "num_input_tokens_seen": 246477545, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 11416, + "time_per_iteration": 2.544360876083374 + }, + { + "auxiliary_loss_clip": 0.01101411, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02020359, + "balance_loss_mlp": 1.03366458, + "epoch": 0.686427175710206, + "flos": 19173138595200.0, + "grad_norm": 1.3938350999013296, + "language_loss": 0.75928599, + "learning_rate": 9.456460071949237e-07, + "loss": 0.78062129, + "num_input_tokens_seen": 246496705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 11417, + "time_per_iteration": 2.420132637023926 + }, + { + "auxiliary_loss_clip": 0.01103442, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.01882672, + "balance_loss_mlp": 1.03592944, + "epoch": 0.6864872989628739, + "flos": 18916628595840.0, + "grad_norm": 1.7730588079343717, + "language_loss": 0.77459234, + "learning_rate": 9.45315079980678e-07, + "loss": 0.79593164, + "num_input_tokens_seen": 246514860, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11418, + "time_per_iteration": 2.4872171878814697 + }, + { + "auxiliary_loss_clip": 0.01103813, + "auxiliary_loss_mlp": 0.01026249, + "balance_loss_clip": 1.01471543, + "balance_loss_mlp": 1.03681958, + "epoch": 0.6865474222155419, + "flos": 25956196865280.0, + "grad_norm": 2.2244412162236924, + "language_loss": 0.76546735, + "learning_rate": 9.449841927601887e-07, + "loss": 0.78676796, + "num_input_tokens_seen": 246536145, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11419, + "time_per_iteration": 2.5004422664642334 + }, + { + "auxiliary_loss_clip": 0.01101876, + "auxiliary_loss_mlp": 0.01032873, + "balance_loss_clip": 1.02173305, + "balance_loss_mlp": 1.03602588, + "epoch": 0.6866075454682098, + "flos": 18478087447680.0, + "grad_norm": 1.9820381057917913, + "language_loss": 0.71707082, + "learning_rate": 9.446533455460044e-07, + "loss": 0.73841834, + "num_input_tokens_seen": 246553265, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11420, + "time_per_iteration": 2.480562925338745 + }, + { + "auxiliary_loss_clip": 0.01101218, + "auxiliary_loss_mlp": 0.01023861, + "balance_loss_clip": 1.01320374, + "balance_loss_mlp": 1.03455591, + "epoch": 0.6866676687208778, + "flos": 34239998298240.0, + "grad_norm": 1.3356950077180587, + "language_loss": 0.7420696, + "learning_rate": 9.443225383506712e-07, + "loss": 0.76332039, + "num_input_tokens_seen": 246575130, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 11421, + "time_per_iteration": 2.532064199447632 + }, + { + "auxiliary_loss_clip": 0.01100357, + "auxiliary_loss_mlp": 0.01029807, + "balance_loss_clip": 1.01827979, + "balance_loss_mlp": 1.03495026, + "epoch": 0.6867277919735457, + "flos": 21721754246400.0, + "grad_norm": 1.7634473864986122, + "language_loss": 0.77061129, + "learning_rate": 9.439917711867338e-07, + "loss": 0.79191291, + "num_input_tokens_seen": 246593095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11422, + "time_per_iteration": 2.494222402572632 + }, + { + "auxiliary_loss_clip": 0.01105572, + "auxiliary_loss_mlp": 0.01036083, + "balance_loss_clip": 1.02336359, + "balance_loss_mlp": 1.03689635, + "epoch": 0.6867879152262137, + "flos": 24097999507200.0, + "grad_norm": 2.896334528061073, + "language_loss": 0.77752495, + "learning_rate": 9.436610440667334e-07, + "loss": 0.79894149, + "num_input_tokens_seen": 246612165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11423, + "time_per_iteration": 2.4580142498016357 + }, + { + "auxiliary_loss_clip": 0.01105867, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01655424, + "balance_loss_mlp": 1.03794348, + "epoch": 0.6868480384788818, + "flos": 21615818060160.0, + "grad_norm": 1.4732024211582577, + "language_loss": 0.72956997, + "learning_rate": 9.433303570032129e-07, + "loss": 0.75091726, + "num_input_tokens_seen": 246632065, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 11424, + "time_per_iteration": 2.5055267810821533 + }, + { + "auxiliary_loss_clip": 0.01105305, + "auxiliary_loss_mlp": 0.01028679, + "balance_loss_clip": 1.01724076, + "balance_loss_mlp": 1.03695333, + "epoch": 0.6869081617315497, + "flos": 26286144220800.0, + "grad_norm": 1.7308743196557235, + "language_loss": 0.65175045, + "learning_rate": 9.429997100087112e-07, + "loss": 0.67309034, + "num_input_tokens_seen": 246651245, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 11425, + "time_per_iteration": 2.470486640930176 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01023971, + "balance_loss_clip": 1.01246786, + "balance_loss_mlp": 1.03693807, + "epoch": 0.6869682849842177, + "flos": 21105096531840.0, + "grad_norm": 1.3720059089078416, + "language_loss": 0.71447921, + "learning_rate": 9.426691030957657e-07, + "loss": 0.73574442, + "num_input_tokens_seen": 246672225, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11426, + "time_per_iteration": 2.5032618045806885 + }, + { + "auxiliary_loss_clip": 0.01102828, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.01463187, + "balance_loss_mlp": 1.03570724, + "epoch": 0.6870284082368856, + "flos": 17092653920640.0, + "grad_norm": 2.2242612174106737, + "language_loss": 0.85695207, + "learning_rate": 9.423385362769136e-07, + "loss": 0.8782419, + "num_input_tokens_seen": 246688385, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11427, + "time_per_iteration": 2.4124362468719482 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.01027967, + "balance_loss_clip": 1.01630831, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6870885314895536, + "flos": 27308090067840.0, + "grad_norm": 1.5166850198696897, + "language_loss": 0.75723726, + "learning_rate": 9.420080095646909e-07, + "loss": 0.77854395, + "num_input_tokens_seen": 246710730, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11428, + "time_per_iteration": 3.971212387084961 + }, + { + "auxiliary_loss_clip": 0.0110684, + "auxiliary_loss_mlp": 0.01035789, + "balance_loss_clip": 1.02289069, + "balance_loss_mlp": 1.03649604, + "epoch": 0.6871486547422215, + "flos": 20814543417600.0, + "grad_norm": 2.165798768763756, + "language_loss": 0.73242265, + "learning_rate": 9.4167752297163e-07, + "loss": 0.75384891, + "num_input_tokens_seen": 246730350, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.703125, + "step": 11429, + "time_per_iteration": 2.4732346534729004 + }, + { + "auxiliary_loss_clip": 0.01107151, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01595795, + "balance_loss_mlp": 1.03874505, + "epoch": 0.6872087779948896, + "flos": 30154118330880.0, + "grad_norm": 2.494094152353352, + "language_loss": 0.83109355, + "learning_rate": 9.413470765102643e-07, + "loss": 0.8524434, + "num_input_tokens_seen": 246751700, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 11430, + "time_per_iteration": 3.9374120235443115 + }, + { + "auxiliary_loss_clip": 0.01102176, + "auxiliary_loss_mlp": 0.01032588, + "balance_loss_clip": 1.02065516, + "balance_loss_mlp": 1.03498435, + "epoch": 0.6872689012475575, + "flos": 20704584908160.0, + "grad_norm": 2.0537474499977746, + "language_loss": 0.700809, + "learning_rate": 9.410166701931225e-07, + "loss": 0.72215664, + "num_input_tokens_seen": 246769860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11431, + "time_per_iteration": 5.39936375617981 + }, + { + "auxiliary_loss_clip": 0.0110521, + "auxiliary_loss_mlp": 0.01032138, + "balance_loss_clip": 1.02002013, + "balance_loss_mlp": 1.03624368, + "epoch": 0.6873290245002255, + "flos": 25520852027520.0, + "grad_norm": 1.9154257852528767, + "language_loss": 0.79996437, + "learning_rate": 9.406863040327355e-07, + "loss": 0.82133788, + "num_input_tokens_seen": 246789905, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11432, + "time_per_iteration": 2.5091586112976074 + }, + { + "auxiliary_loss_clip": 0.0110135, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.01545095, + "balance_loss_mlp": 1.03639221, + "epoch": 0.6873891477528934, + "flos": 25191479289600.0, + "grad_norm": 1.5073442194689934, + "language_loss": 0.67916226, + "learning_rate": 9.403559780416295e-07, + "loss": 0.70044488, + "num_input_tokens_seen": 246808815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 11433, + "time_per_iteration": 2.4911651611328125 + }, + { + "auxiliary_loss_clip": 0.0110721, + "auxiliary_loss_mlp": 0.01037163, + "balance_loss_clip": 1.02483046, + "balance_loss_mlp": 1.03957868, + "epoch": 0.6874492710055614, + "flos": 35152380685440.0, + "grad_norm": 1.9834703858650884, + "language_loss": 0.72955799, + "learning_rate": 9.400256922323309e-07, + "loss": 0.75100172, + "num_input_tokens_seen": 246829775, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11434, + "time_per_iteration": 2.601761817932129 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01488328, + "balance_loss_mlp": 1.03820884, + "epoch": 0.6875093942582293, + "flos": 17822215059840.0, + "grad_norm": 1.6345537065528275, + "language_loss": 0.80520904, + "learning_rate": 9.396954466173657e-07, + "loss": 0.82652032, + "num_input_tokens_seen": 246848045, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11435, + "time_per_iteration": 2.4691109657287598 + }, + { + "auxiliary_loss_clip": 0.01105455, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01895833, + "balance_loss_mlp": 1.03661776, + "epoch": 0.6875695175108973, + "flos": 20704548994560.0, + "grad_norm": 2.919181748670558, + "language_loss": 0.8081519, + "learning_rate": 9.393652412092538e-07, + "loss": 0.82952142, + "num_input_tokens_seen": 246866095, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11436, + "time_per_iteration": 2.4831182956695557 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01027936, + "balance_loss_clip": 1.01780939, + "balance_loss_mlp": 1.03531957, + "epoch": 0.6876296407635654, + "flos": 25374013228800.0, + "grad_norm": 2.0171807255350056, + "language_loss": 0.82209235, + "learning_rate": 9.390350760205183e-07, + "loss": 0.84335649, + "num_input_tokens_seen": 246883975, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 11437, + "time_per_iteration": 2.476003646850586 + }, + { + "auxiliary_loss_clip": 0.01111133, + "auxiliary_loss_mlp": 0.01034984, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03871989, + "epoch": 0.6876897640162333, + "flos": 23222317841280.0, + "grad_norm": 2.5574373753550894, + "language_loss": 0.77940321, + "learning_rate": 9.387049510636793e-07, + "loss": 0.80086446, + "num_input_tokens_seen": 246901560, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.72265625, + "step": 11438, + "time_per_iteration": 2.502321720123291 + }, + { + "auxiliary_loss_clip": 0.01098247, + "auxiliary_loss_mlp": 0.01032589, + "balance_loss_clip": 1.02097225, + "balance_loss_mlp": 1.03480375, + "epoch": 0.6877498872689013, + "flos": 27124335066240.0, + "grad_norm": 1.5853093369472568, + "language_loss": 0.72395837, + "learning_rate": 9.383748663512554e-07, + "loss": 0.74526674, + "num_input_tokens_seen": 246922655, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 11439, + "time_per_iteration": 2.4871983528137207 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.0148648, + "balance_loss_mlp": 1.0368948, + "epoch": 0.6878100105215692, + "flos": 11581658876160.0, + "grad_norm": 1.9510407430553642, + "language_loss": 0.75392562, + "learning_rate": 9.380448218957623e-07, + "loss": 0.77523124, + "num_input_tokens_seen": 246940100, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 11440, + "time_per_iteration": 2.444061040878296 + }, + { + "auxiliary_loss_clip": 0.01100078, + "auxiliary_loss_mlp": 0.01032398, + "balance_loss_clip": 1.02096558, + "balance_loss_mlp": 1.03482723, + "epoch": 0.6878701337742372, + "flos": 20303175444480.0, + "grad_norm": 1.5583446762430218, + "language_loss": 0.71741056, + "learning_rate": 9.377148177097167e-07, + "loss": 0.73873532, + "num_input_tokens_seen": 246958545, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 11441, + "time_per_iteration": 2.42561936378479 + }, + { + "auxiliary_loss_clip": 0.01107766, + "auxiliary_loss_mlp": 0.0103068, + "balance_loss_clip": 1.01703668, + "balance_loss_mlp": 1.03738022, + "epoch": 0.6879302570269051, + "flos": 13840080549120.0, + "grad_norm": 1.6223718684669892, + "language_loss": 0.66661596, + "learning_rate": 9.373848538056317e-07, + "loss": 0.68800044, + "num_input_tokens_seen": 246974805, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.703125, + "step": 11442, + "time_per_iteration": 2.527100086212158 + }, + { + "auxiliary_loss_clip": 0.01104807, + "auxiliary_loss_mlp": 0.0103077, + "balance_loss_clip": 1.01938581, + "balance_loss_mlp": 1.03825164, + "epoch": 0.6879903802795732, + "flos": 21324654414720.0, + "grad_norm": 1.9334719769408109, + "language_loss": 0.69233751, + "learning_rate": 9.370549301960189e-07, + "loss": 0.71369326, + "num_input_tokens_seen": 246992505, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 11443, + "time_per_iteration": 2.4346165657043457 + }, + { + "auxiliary_loss_clip": 0.01109303, + "auxiliary_loss_mlp": 0.01033521, + "balance_loss_clip": 1.02084899, + "balance_loss_mlp": 1.04012263, + "epoch": 0.6880505035322411, + "flos": 25152049134720.0, + "grad_norm": 1.4614285926013768, + "language_loss": 0.76507717, + "learning_rate": 9.367250468933893e-07, + "loss": 0.78650534, + "num_input_tokens_seen": 247013370, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 11444, + "time_per_iteration": 2.508368968963623 + }, + { + "auxiliary_loss_clip": 0.01101207, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03577399, + "epoch": 0.6881106267849091, + "flos": 23215530170880.0, + "grad_norm": 1.8080804951596867, + "language_loss": 0.76652426, + "learning_rate": 9.363952039102536e-07, + "loss": 0.78781474, + "num_input_tokens_seen": 247029855, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11445, + "time_per_iteration": 2.4379546642303467 + }, + { + "auxiliary_loss_clip": 0.01025524, + "auxiliary_loss_mlp": 0.01005058, + "balance_loss_clip": 1.00386608, + "balance_loss_mlp": 1.00513721, + "epoch": 0.688170750037577, + "flos": 48484397312640.0, + "grad_norm": 0.8196174893111461, + "language_loss": 0.58379793, + "learning_rate": 9.360654012591183e-07, + "loss": 0.60410374, + "num_input_tokens_seen": 247085030, + "router_z_loss_clip": 0.01190186, + "router_z_loss_mlp": 0.20410156, + "step": 11446, + "time_per_iteration": 3.09559965133667 + }, + { + "auxiliary_loss_clip": 0.01105797, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.01536548, + "balance_loss_mlp": 1.03552115, + "epoch": 0.688230873290245, + "flos": 22783633038720.0, + "grad_norm": 1.5108741045715646, + "language_loss": 0.75743663, + "learning_rate": 9.357356389524886e-07, + "loss": 0.77877045, + "num_input_tokens_seen": 247104840, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 11447, + "time_per_iteration": 2.4388415813446045 + }, + { + "auxiliary_loss_clip": 0.01105525, + "auxiliary_loss_mlp": 0.01034698, + "balance_loss_clip": 1.02256274, + "balance_loss_mlp": 1.035833, + "epoch": 0.6882909965429129, + "flos": 22455660931200.0, + "grad_norm": 2.3058905142845, + "language_loss": 0.73110414, + "learning_rate": 9.354059170028705e-07, + "loss": 0.75250638, + "num_input_tokens_seen": 247121905, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69921875, + "step": 11448, + "time_per_iteration": 2.490492820739746 + }, + { + "auxiliary_loss_clip": 0.01106927, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01841772, + "balance_loss_mlp": 1.03607249, + "epoch": 0.688351119795581, + "flos": 26214143408640.0, + "grad_norm": 1.6148138238236993, + "language_loss": 0.74589622, + "learning_rate": 9.350762354227673e-07, + "loss": 0.76727676, + "num_input_tokens_seen": 247142375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11449, + "time_per_iteration": 2.5052759647369385 + }, + { + "auxiliary_loss_clip": 0.01102717, + "auxiliary_loss_mlp": 0.01033997, + "balance_loss_clip": 1.02249944, + "balance_loss_mlp": 1.03643203, + "epoch": 0.6884112430482489, + "flos": 22565260304640.0, + "grad_norm": 1.8257091472853513, + "language_loss": 0.69832647, + "learning_rate": 9.34746594224679e-07, + "loss": 0.71969366, + "num_input_tokens_seen": 247161095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11450, + "time_per_iteration": 2.4648208618164062 + }, + { + "auxiliary_loss_clip": 0.01108292, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02027011, + "balance_loss_mlp": 1.03613949, + "epoch": 0.6884713663009169, + "flos": 17341047446400.0, + "grad_norm": 2.0456347390181366, + "language_loss": 0.76224291, + "learning_rate": 9.344169934211068e-07, + "loss": 0.78366196, + "num_input_tokens_seen": 247178565, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 11451, + "time_per_iteration": 2.430615186691284 + }, + { + "auxiliary_loss_clip": 0.01106472, + "auxiliary_loss_mlp": 0.01029349, + "balance_loss_clip": 1.01746345, + "balance_loss_mlp": 1.03748226, + "epoch": 0.6885314895535849, + "flos": 26470832976000.0, + "grad_norm": 1.5920883527953233, + "language_loss": 0.69262952, + "learning_rate": 9.340874330245505e-07, + "loss": 0.71398771, + "num_input_tokens_seen": 247202345, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 11452, + "time_per_iteration": 2.5010976791381836 + }, + { + "auxiliary_loss_clip": 0.01103078, + "auxiliary_loss_mlp": 0.01035172, + "balance_loss_clip": 1.02176046, + "balance_loss_mlp": 1.0362519, + "epoch": 0.6885916128062528, + "flos": 20521548178560.0, + "grad_norm": 1.7710041973258575, + "language_loss": 0.72149074, + "learning_rate": 9.337579130475042e-07, + "loss": 0.74287325, + "num_input_tokens_seen": 247219240, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.66796875, + "step": 11453, + "time_per_iteration": 2.450064182281494 + }, + { + "auxiliary_loss_clip": 0.01025423, + "auxiliary_loss_mlp": 0.00997723, + "balance_loss_clip": 0.99648923, + "balance_loss_mlp": 1.0050149, + "epoch": 0.6886517360589208, + "flos": 70715795679360.0, + "grad_norm": 0.7858760559038386, + "language_loss": 0.50753725, + "learning_rate": 9.334284335024644e-07, + "loss": 0.52776867, + "num_input_tokens_seen": 247272010, + "router_z_loss_clip": 0.0123291, + "router_z_loss_mlp": 0.20410156, + "step": 11454, + "time_per_iteration": 2.9117000102996826 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01028661, + "balance_loss_clip": 1.01732993, + "balance_loss_mlp": 1.03662014, + "epoch": 0.6887118593115887, + "flos": 17893533513600.0, + "grad_norm": 1.7206646308115936, + "language_loss": 0.75241423, + "learning_rate": 9.330989944019263e-07, + "loss": 0.77371156, + "num_input_tokens_seen": 247290630, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 11455, + "time_per_iteration": 2.485668897628784 + }, + { + "auxiliary_loss_clip": 0.01106397, + "auxiliary_loss_mlp": 0.01037929, + "balance_loss_clip": 1.02456009, + "balance_loss_mlp": 1.03585863, + "epoch": 0.6887719825642568, + "flos": 17453017117440.0, + "grad_norm": 2.149117194105129, + "language_loss": 0.72609061, + "learning_rate": 9.327695957583803e-07, + "loss": 0.74753392, + "num_input_tokens_seen": 247304800, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 11456, + "time_per_iteration": 2.393894672393799 + }, + { + "auxiliary_loss_clip": 0.01102522, + "auxiliary_loss_mlp": 0.01030403, + "balance_loss_clip": 1.01892924, + "balance_loss_mlp": 1.03732562, + "epoch": 0.6888321058169247, + "flos": 23070199743360.0, + "grad_norm": 1.623007735916198, + "language_loss": 0.80938387, + "learning_rate": 9.32440237584319e-07, + "loss": 0.83071315, + "num_input_tokens_seen": 247323450, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 11457, + "time_per_iteration": 2.495333194732666 + }, + { + "auxiliary_loss_clip": 0.01108692, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01552272, + "balance_loss_mlp": 1.03859973, + "epoch": 0.6888922290695927, + "flos": 23368833417600.0, + "grad_norm": 1.5000729460202227, + "language_loss": 0.76153016, + "learning_rate": 9.321109198922301e-07, + "loss": 0.7828989, + "num_input_tokens_seen": 247343845, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11458, + "time_per_iteration": 2.4778497219085693 + }, + { + "auxiliary_loss_clip": 0.01104516, + "auxiliary_loss_mlp": 0.01029574, + "balance_loss_clip": 1.01787341, + "balance_loss_mlp": 1.03653932, + "epoch": 0.6889523523222606, + "flos": 17631636474240.0, + "grad_norm": 2.658523232455535, + "language_loss": 0.68647993, + "learning_rate": 9.31781642694603e-07, + "loss": 0.70782083, + "num_input_tokens_seen": 247356650, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11459, + "time_per_iteration": 2.418846845626831 + }, + { + "auxiliary_loss_clip": 0.01104672, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.01640558, + "balance_loss_mlp": 1.03759336, + "epoch": 0.6890124755749286, + "flos": 25228144097280.0, + "grad_norm": 1.5707154761187223, + "language_loss": 0.68636, + "learning_rate": 9.314524060039221e-07, + "loss": 0.7076816, + "num_input_tokens_seen": 247377340, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.671875, + "step": 11460, + "time_per_iteration": 2.5109915733337402 + }, + { + "auxiliary_loss_clip": 0.01108621, + "auxiliary_loss_mlp": 0.01033572, + "balance_loss_clip": 1.02050114, + "balance_loss_mlp": 1.03564703, + "epoch": 0.6890725988275965, + "flos": 20230240878720.0, + "grad_norm": 1.7818403559528928, + "language_loss": 0.76981837, + "learning_rate": 9.311232098326731e-07, + "loss": 0.79124033, + "num_input_tokens_seen": 247395805, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.73046875, + "step": 11461, + "time_per_iteration": 2.467684268951416 + }, + { + "auxiliary_loss_clip": 0.01103615, + "auxiliary_loss_mlp": 0.01034219, + "balance_loss_clip": 1.02204204, + "balance_loss_mlp": 1.03618026, + "epoch": 0.6891327220802645, + "flos": 14535311264640.0, + "grad_norm": 1.6409609736690487, + "language_loss": 0.6973418, + "learning_rate": 9.307940541933401e-07, + "loss": 0.71872014, + "num_input_tokens_seen": 247413165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11462, + "time_per_iteration": 2.411785125732422 + }, + { + "auxiliary_loss_clip": 0.01106527, + "auxiliary_loss_mlp": 0.01025599, + "balance_loss_clip": 1.0134095, + "balance_loss_mlp": 1.03735316, + "epoch": 0.6891928453329325, + "flos": 21139139646720.0, + "grad_norm": 1.4661487687088357, + "language_loss": 0.87139171, + "learning_rate": 9.304649390984034e-07, + "loss": 0.89271295, + "num_input_tokens_seen": 247433140, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11463, + "time_per_iteration": 2.4572672843933105 + }, + { + "auxiliary_loss_clip": 0.010984, + "auxiliary_loss_mlp": 0.01027776, + "balance_loss_clip": 1.01702309, + "balance_loss_mlp": 1.03459322, + "epoch": 0.6892529685856005, + "flos": 17858520731520.0, + "grad_norm": 1.8830832637485666, + "language_loss": 0.68394661, + "learning_rate": 9.301358645603428e-07, + "loss": 0.70520842, + "num_input_tokens_seen": 247451265, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 11464, + "time_per_iteration": 2.4330556392669678 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.0103758, + "balance_loss_clip": 1.0248543, + "balance_loss_mlp": 1.0371387, + "epoch": 0.6893130918382685, + "flos": 29934811843200.0, + "grad_norm": 1.8974270807015088, + "language_loss": 0.65594816, + "learning_rate": 9.298068305916373e-07, + "loss": 0.67736936, + "num_input_tokens_seen": 247471645, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 11465, + "time_per_iteration": 2.495144844055176 + }, + { + "auxiliary_loss_clip": 0.01106695, + "auxiliary_loss_mlp": 0.01034637, + "balance_loss_clip": 1.02248406, + "balance_loss_mlp": 1.03674364, + "epoch": 0.6893732150909364, + "flos": 24388516707840.0, + "grad_norm": 1.5240764354372476, + "language_loss": 0.72628653, + "learning_rate": 9.294778372047649e-07, + "loss": 0.74769986, + "num_input_tokens_seen": 247491170, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 11466, + "time_per_iteration": 2.4766881465911865 + }, + { + "auxiliary_loss_clip": 0.0110566, + "auxiliary_loss_mlp": 0.0103001, + "balance_loss_clip": 1.01822066, + "balance_loss_mlp": 1.03696775, + "epoch": 0.6894333383436044, + "flos": 16982874979200.0, + "grad_norm": 1.714596281960388, + "language_loss": 0.71770334, + "learning_rate": 9.291488844121995e-07, + "loss": 0.73905998, + "num_input_tokens_seen": 247509005, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11467, + "time_per_iteration": 2.4112367630004883 + }, + { + "auxiliary_loss_clip": 0.0110697, + "auxiliary_loss_mlp": 0.01033327, + "balance_loss_clip": 1.02008295, + "balance_loss_mlp": 1.0355289, + "epoch": 0.6894934615962723, + "flos": 18985540838400.0, + "grad_norm": 2.163503550286246, + "language_loss": 0.81232512, + "learning_rate": 9.288199722264156e-07, + "loss": 0.83372813, + "num_input_tokens_seen": 247527050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71484375, + "step": 11468, + "time_per_iteration": 2.466501474380493 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02028728, + "balance_loss_mlp": 1.03816724, + "epoch": 0.6895535848489404, + "flos": 34531664734080.0, + "grad_norm": 1.7452296141639345, + "language_loss": 0.65893084, + "learning_rate": 9.284911006598875e-07, + "loss": 0.68033552, + "num_input_tokens_seen": 247547765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 11469, + "time_per_iteration": 3.9587156772613525 + }, + { + "auxiliary_loss_clip": 0.01024995, + "auxiliary_loss_mlp": 0.01001889, + "balance_loss_clip": 1.00072718, + "balance_loss_mlp": 1.0044626, + "epoch": 0.6896137081016083, + "flos": 50075852273280.0, + "grad_norm": 0.79151835418889, + "language_loss": 0.55171818, + "learning_rate": 9.281622697250824e-07, + "loss": 0.57198697, + "num_input_tokens_seen": 247603515, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20507812, + "step": 11470, + "time_per_iteration": 2.9345321655273438 + }, + { + "auxiliary_loss_clip": 0.01101343, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.0206759, + "balance_loss_mlp": 1.03692126, + "epoch": 0.6896738313542763, + "flos": 19938215306880.0, + "grad_norm": 2.531274937243883, + "language_loss": 0.77590048, + "learning_rate": 9.278334794344715e-07, + "loss": 0.79722488, + "num_input_tokens_seen": 247622110, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 11471, + "time_per_iteration": 3.9249086380004883 + }, + { + "auxiliary_loss_clip": 0.0110492, + "auxiliary_loss_mlp": 0.01032326, + "balance_loss_clip": 1.02015519, + "balance_loss_mlp": 1.03743219, + "epoch": 0.6897339546069442, + "flos": 21725489260800.0, + "grad_norm": 1.771316633109537, + "language_loss": 0.78440964, + "learning_rate": 9.275047298005232e-07, + "loss": 0.80578208, + "num_input_tokens_seen": 247641905, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 11472, + "time_per_iteration": 3.877894401550293 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.01826715, + "balance_loss_mlp": 1.03419447, + "epoch": 0.6897940778596122, + "flos": 19826497031040.0, + "grad_norm": 1.5889671799486909, + "language_loss": 0.76273483, + "learning_rate": 9.271760208357024e-07, + "loss": 0.7840333, + "num_input_tokens_seen": 247660945, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 11473, + "time_per_iteration": 3.895129680633545 + }, + { + "auxiliary_loss_clip": 0.01106535, + "auxiliary_loss_mlp": 0.01032871, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.03657973, + "epoch": 0.6898542011122801, + "flos": 17310056987520.0, + "grad_norm": 3.23937327376226, + "language_loss": 0.75285846, + "learning_rate": 9.268473525524751e-07, + "loss": 0.77425253, + "num_input_tokens_seen": 247678395, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 11474, + "time_per_iteration": 2.4117770195007324 + }, + { + "auxiliary_loss_clip": 0.0110508, + "auxiliary_loss_mlp": 0.01032957, + "balance_loss_clip": 1.02095246, + "balance_loss_mlp": 1.03775465, + "epoch": 0.6899143243649482, + "flos": 24754051463040.0, + "grad_norm": 1.4990231187907213, + "language_loss": 0.74082041, + "learning_rate": 9.26518724963303e-07, + "loss": 0.76220077, + "num_input_tokens_seen": 247698380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11475, + "time_per_iteration": 2.5168709754943848 + }, + { + "auxiliary_loss_clip": 0.01105263, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.01619959, + "balance_loss_mlp": 1.03732133, + "epoch": 0.6899744476176161, + "flos": 17234536642560.0, + "grad_norm": 2.0309056655134587, + "language_loss": 0.88638115, + "learning_rate": 9.261901380806491e-07, + "loss": 0.90771919, + "num_input_tokens_seen": 247716370, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 11476, + "time_per_iteration": 2.4443247318267822 + }, + { + "auxiliary_loss_clip": 0.01102042, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.02020097, + "balance_loss_mlp": 1.03498316, + "epoch": 0.6900345708702841, + "flos": 25410678036480.0, + "grad_norm": 1.3153464082970854, + "language_loss": 0.70150822, + "learning_rate": 9.258615919169724e-07, + "loss": 0.72284913, + "num_input_tokens_seen": 247737335, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11477, + "time_per_iteration": 2.5622828006744385 + }, + { + "auxiliary_loss_clip": 0.01109227, + "auxiliary_loss_mlp": 0.01038844, + "balance_loss_clip": 1.02615404, + "balance_loss_mlp": 1.03800416, + "epoch": 0.6900946941229521, + "flos": 23434190213760.0, + "grad_norm": 2.5064065757946925, + "language_loss": 0.68533587, + "learning_rate": 9.255330864847313e-07, + "loss": 0.70681655, + "num_input_tokens_seen": 247756680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 11478, + "time_per_iteration": 2.46543288230896 + }, + { + "auxiliary_loss_clip": 0.01105606, + "auxiliary_loss_mlp": 0.01032476, + "balance_loss_clip": 1.02038169, + "balance_loss_mlp": 1.03681922, + "epoch": 0.69015481737562, + "flos": 17820096157440.0, + "grad_norm": 1.887360413937171, + "language_loss": 0.7609849, + "learning_rate": 9.252046217963843e-07, + "loss": 0.78236568, + "num_input_tokens_seen": 247774265, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 11479, + "time_per_iteration": 2.568270683288574 + }, + { + "auxiliary_loss_clip": 0.01107631, + "auxiliary_loss_mlp": 0.01027498, + "balance_loss_clip": 1.01501036, + "balance_loss_mlp": 1.03848529, + "epoch": 0.690214940628288, + "flos": 17456500736640.0, + "grad_norm": 1.594697323523918, + "language_loss": 0.78643298, + "learning_rate": 9.248761978643856e-07, + "loss": 0.80778426, + "num_input_tokens_seen": 247792395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 11480, + "time_per_iteration": 2.4369962215423584 + }, + { + "auxiliary_loss_clip": 0.01104582, + "auxiliary_loss_mlp": 0.0102684, + "balance_loss_clip": 1.01478815, + "balance_loss_mlp": 1.03820038, + "epoch": 0.6902750638809559, + "flos": 29566691308800.0, + "grad_norm": 1.618219832411148, + "language_loss": 0.75485682, + "learning_rate": 9.245478147011885e-07, + "loss": 0.77617109, + "num_input_tokens_seen": 247811985, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 11481, + "time_per_iteration": 2.5970773696899414 + }, + { + "auxiliary_loss_clip": 0.01103611, + "auxiliary_loss_mlp": 0.01031414, + "balance_loss_clip": 1.01932073, + "balance_loss_mlp": 1.03630292, + "epoch": 0.690335187133624, + "flos": 25557121785600.0, + "grad_norm": 1.6722041595175992, + "language_loss": 0.6924783, + "learning_rate": 9.24219472319246e-07, + "loss": 0.71382856, + "num_input_tokens_seen": 247831880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 11482, + "time_per_iteration": 2.4690396785736084 + }, + { + "auxiliary_loss_clip": 0.01104337, + "auxiliary_loss_mlp": 0.01027796, + "balance_loss_clip": 1.01601243, + "balance_loss_mlp": 1.0365622, + "epoch": 0.6903953103862919, + "flos": 22488447070080.0, + "grad_norm": 1.9391931338657746, + "language_loss": 0.82797402, + "learning_rate": 9.238911707310096e-07, + "loss": 0.84929538, + "num_input_tokens_seen": 247851170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 11483, + "time_per_iteration": 2.479827880859375 + }, + { + "auxiliary_loss_clip": 0.01105727, + "auxiliary_loss_mlp": 0.01028162, + "balance_loss_clip": 1.01693249, + "balance_loss_mlp": 1.03651834, + "epoch": 0.6904554336389599, + "flos": 26100521712000.0, + "grad_norm": 2.252246315768351, + "language_loss": 0.65228778, + "learning_rate": 9.235629099489273e-07, + "loss": 0.67362666, + "num_input_tokens_seen": 247868950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.69140625, + "step": 11484, + "time_per_iteration": 2.4820756912231445 + }, + { + "auxiliary_loss_clip": 0.01100593, + "auxiliary_loss_mlp": 0.01034816, + "balance_loss_clip": 1.0234673, + "balance_loss_mlp": 1.03430891, + "epoch": 0.6905155568916278, + "flos": 31171754545920.0, + "grad_norm": 1.5009595972061287, + "language_loss": 0.73750043, + "learning_rate": 9.232346899854479e-07, + "loss": 0.75885451, + "num_input_tokens_seen": 247889805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11485, + "time_per_iteration": 2.5609304904937744 + }, + { + "auxiliary_loss_clip": 0.0110609, + "auxiliary_loss_mlp": 0.0103398, + "balance_loss_clip": 1.02159464, + "balance_loss_mlp": 1.03691673, + "epoch": 0.6905756801442958, + "flos": 17639681120640.0, + "grad_norm": 1.703754025392432, + "language_loss": 0.85226732, + "learning_rate": 9.22906510853017e-07, + "loss": 0.87366807, + "num_input_tokens_seen": 247908585, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11486, + "time_per_iteration": 2.422380208969116 + }, + { + "auxiliary_loss_clip": 0.01105383, + "auxiliary_loss_mlp": 0.01034252, + "balance_loss_clip": 1.02200925, + "balance_loss_mlp": 1.03721333, + "epoch": 0.6906358033969637, + "flos": 22343691260160.0, + "grad_norm": 1.4802712098189896, + "language_loss": 0.72739094, + "learning_rate": 9.225783725640786e-07, + "loss": 0.74878728, + "num_input_tokens_seen": 247928480, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 11487, + "time_per_iteration": 2.4903013706207275 + }, + { + "auxiliary_loss_clip": 0.01025937, + "auxiliary_loss_mlp": 0.01011443, + "balance_loss_clip": 1.01028049, + "balance_loss_mlp": 1.00546408, + "epoch": 0.6906959266496318, + "flos": 69747789081600.0, + "grad_norm": 0.8945179331036194, + "language_loss": 0.66639161, + "learning_rate": 9.222502751310759e-07, + "loss": 0.68676543, + "num_input_tokens_seen": 247988855, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20507812, + "step": 11488, + "time_per_iteration": 3.0653343200683594 + }, + { + "auxiliary_loss_clip": 0.01107886, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.0193913, + "balance_loss_mlp": 1.03628397, + "epoch": 0.6907560499022997, + "flos": 21434253788160.0, + "grad_norm": 1.7595875611490563, + "language_loss": 0.7471655, + "learning_rate": 9.219222185664519e-07, + "loss": 0.76857275, + "num_input_tokens_seen": 248007685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 11489, + "time_per_iteration": 2.5286636352539062 + }, + { + "auxiliary_loss_clip": 0.01107539, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02164459, + "balance_loss_mlp": 1.03755021, + "epoch": 0.6908161731549677, + "flos": 14392207480320.0, + "grad_norm": 1.9530912954904702, + "language_loss": 0.62219006, + "learning_rate": 9.215942028826445e-07, + "loss": 0.64361048, + "num_input_tokens_seen": 248025145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 11490, + "time_per_iteration": 2.420513868331909 + }, + { + "auxiliary_loss_clip": 0.01104902, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.01746964, + "balance_loss_mlp": 1.03648567, + "epoch": 0.6908762964076357, + "flos": 20010970304640.0, + "grad_norm": 1.749287596246761, + "language_loss": 0.72922885, + "learning_rate": 9.212662280920937e-07, + "loss": 0.75057352, + "num_input_tokens_seen": 248043750, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 11491, + "time_per_iteration": 2.481513738632202 + }, + { + "auxiliary_loss_clip": 0.01101839, + "auxiliary_loss_mlp": 0.01037114, + "balance_loss_clip": 1.02441788, + "balance_loss_mlp": 1.03524041, + "epoch": 0.6909364196603036, + "flos": 28769079853440.0, + "grad_norm": 1.8381710188845477, + "language_loss": 0.7008509, + "learning_rate": 9.20938294207235e-07, + "loss": 0.72224045, + "num_input_tokens_seen": 248065765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 11492, + "time_per_iteration": 2.506946325302124 + }, + { + "auxiliary_loss_clip": 0.01107014, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02075076, + "balance_loss_mlp": 1.03607428, + "epoch": 0.6909965429129716, + "flos": 22528128620160.0, + "grad_norm": 1.9892003988580658, + "language_loss": 0.74623132, + "learning_rate": 9.206104012405049e-07, + "loss": 0.76763535, + "num_input_tokens_seen": 248083810, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 11493, + "time_per_iteration": 2.485933780670166 + }, + { + "auxiliary_loss_clip": 0.011046, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.01704955, + "balance_loss_mlp": 1.03709757, + "epoch": 0.6910566661656395, + "flos": 18405942981120.0, + "grad_norm": 1.750158272708012, + "language_loss": 0.74326122, + "learning_rate": 9.20282549204336e-07, + "loss": 0.76459777, + "num_input_tokens_seen": 248103185, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 11494, + "time_per_iteration": 2.4338111877441406 + }, + { + "auxiliary_loss_clip": 0.01102928, + "auxiliary_loss_mlp": 0.01030264, + "balance_loss_clip": 1.01822948, + "balance_loss_mlp": 1.03593969, + "epoch": 0.6911167894183076, + "flos": 30773972355840.0, + "grad_norm": 1.7715754861715476, + "language_loss": 0.68369365, + "learning_rate": 9.19954738111161e-07, + "loss": 0.70502561, + "num_input_tokens_seen": 248125665, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11495, + "time_per_iteration": 2.5815460681915283 + }, + { + "auxiliary_loss_clip": 0.01103437, + "auxiliary_loss_mlp": 0.01029758, + "balance_loss_clip": 1.01754475, + "balance_loss_mlp": 1.03441787, + "epoch": 0.6911769126709755, + "flos": 13735724561280.0, + "grad_norm": 1.7834368050418072, + "language_loss": 0.73899794, + "learning_rate": 9.196269679734119e-07, + "loss": 0.7603299, + "num_input_tokens_seen": 248142545, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 11496, + "time_per_iteration": 2.4315319061279297 + }, + { + "auxiliary_loss_clip": 0.01102521, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01727474, + "balance_loss_mlp": 1.03553581, + "epoch": 0.6912370359236435, + "flos": 17566854295680.0, + "grad_norm": 1.6258579372444952, + "language_loss": 0.79742873, + "learning_rate": 9.19299238803515e-07, + "loss": 0.81874031, + "num_input_tokens_seen": 248160225, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 11497, + "time_per_iteration": 2.4571430683135986 + }, + { + "auxiliary_loss_clip": 0.01107463, + "auxiliary_loss_mlp": 0.01036367, + "balance_loss_clip": 1.0240463, + "balance_loss_mlp": 1.03682327, + "epoch": 0.6912971591763114, + "flos": 22090772620800.0, + "grad_norm": 1.5194582434001807, + "language_loss": 0.80841976, + "learning_rate": 9.189715506138993e-07, + "loss": 0.82985806, + "num_input_tokens_seen": 248180430, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.70703125, + "step": 11498, + "time_per_iteration": 2.4500298500061035 + }, + { + "auxiliary_loss_clip": 0.01100372, + "auxiliary_loss_mlp": 0.01033064, + "balance_loss_clip": 1.0208571, + "balance_loss_mlp": 1.03421736, + "epoch": 0.6913572824289794, + "flos": 29971476650880.0, + "grad_norm": 1.486917569830455, + "language_loss": 0.86061001, + "learning_rate": 9.186439034169915e-07, + "loss": 0.8819443, + "num_input_tokens_seen": 248202365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 11499, + "time_per_iteration": 2.5612852573394775 + }, + { + "auxiliary_loss_clip": 0.01101921, + "auxiliary_loss_mlp": 0.01027697, + "balance_loss_clip": 1.01606178, + "balance_loss_mlp": 1.03633177, + "epoch": 0.6914174056816473, + "flos": 20448936835200.0, + "grad_norm": 1.5487466201601385, + "language_loss": 0.75228941, + "learning_rate": 9.183162972252145e-07, + "loss": 0.77358556, + "num_input_tokens_seen": 248221750, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11500, + "time_per_iteration": 2.443873405456543 + }, + { + "auxiliary_loss_clip": 0.01106604, + "auxiliary_loss_mlp": 0.01032013, + "balance_loss_clip": 1.0196985, + "balance_loss_mlp": 1.03778219, + "epoch": 0.6914775289343154, + "flos": 21282530739840.0, + "grad_norm": 1.800321839469313, + "language_loss": 0.76985884, + "learning_rate": 9.179887320509921e-07, + "loss": 0.79124504, + "num_input_tokens_seen": 248239535, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11501, + "time_per_iteration": 2.5296645164489746 + }, + { + "auxiliary_loss_clip": 0.01107203, + "auxiliary_loss_mlp": 0.01033206, + "balance_loss_clip": 1.0208025, + "balance_loss_mlp": 1.03735363, + "epoch": 0.6915376521869833, + "flos": 23878118401920.0, + "grad_norm": 1.9287376377715924, + "language_loss": 0.73522556, + "learning_rate": 9.176612079067458e-07, + "loss": 0.75662971, + "num_input_tokens_seen": 248259055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 11502, + "time_per_iteration": 2.476379632949829 + }, + { + "auxiliary_loss_clip": 0.01108675, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.02137995, + "balance_loss_mlp": 1.03706694, + "epoch": 0.6915977754396513, + "flos": 11510268595200.0, + "grad_norm": 1.914547972677127, + "language_loss": 0.73439324, + "learning_rate": 9.173337248048953e-07, + "loss": 0.75582325, + "num_input_tokens_seen": 248276765, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 11503, + "time_per_iteration": 2.477112293243408 + }, + { + "auxiliary_loss_clip": 0.01103095, + "auxiliary_loss_mlp": 0.01030947, + "balance_loss_clip": 1.01900172, + "balance_loss_mlp": 1.03558373, + "epoch": 0.6916578986923193, + "flos": 22601278667520.0, + "grad_norm": 2.2572840313297067, + "language_loss": 0.77144331, + "learning_rate": 9.170062827578575e-07, + "loss": 0.79278374, + "num_input_tokens_seen": 248295310, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 11504, + "time_per_iteration": 2.434324026107788 + }, + { + "auxiliary_loss_clip": 0.01103184, + "auxiliary_loss_mlp": 0.01029313, + "balance_loss_clip": 1.0178746, + "balance_loss_mlp": 1.03509164, + "epoch": 0.6917180219449872, + "flos": 23477355383040.0, + "grad_norm": 1.6879501017402825, + "language_loss": 0.73243099, + "learning_rate": 9.166788817780499e-07, + "loss": 0.75375593, + "num_input_tokens_seen": 248315230, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 11505, + "time_per_iteration": 2.4869065284729004 + }, + { + "auxiliary_loss_clip": 0.01102379, + "auxiliary_loss_mlp": 0.01033942, + "balance_loss_clip": 1.02147281, + "balance_loss_mlp": 1.03443623, + "epoch": 0.6917781451976552, + "flos": 23732536579200.0, + "grad_norm": 1.792057287093782, + "language_loss": 0.87782943, + "learning_rate": 9.163515218778886e-07, + "loss": 0.89919269, + "num_input_tokens_seen": 248332980, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11506, + "time_per_iteration": 2.4522695541381836 + }, + { + "auxiliary_loss_clip": 0.01102604, + "auxiliary_loss_mlp": 0.01026179, + "balance_loss_clip": 1.01465774, + "balance_loss_mlp": 1.03585625, + "epoch": 0.6918382684503231, + "flos": 31466760946560.0, + "grad_norm": 2.803306813867866, + "language_loss": 0.69775116, + "learning_rate": 9.160242030697856e-07, + "loss": 0.71903902, + "num_input_tokens_seen": 248352865, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11507, + "time_per_iteration": 2.5447754859924316 + }, + { + "auxiliary_loss_clip": 0.01106091, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02132535, + "balance_loss_mlp": 1.03596449, + "epoch": 0.6918983917029912, + "flos": 21650471706240.0, + "grad_norm": 2.005563924492128, + "language_loss": 0.76869601, + "learning_rate": 9.156969253661538e-07, + "loss": 0.7900908, + "num_input_tokens_seen": 248371125, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.703125, + "step": 11508, + "time_per_iteration": 2.4350826740264893 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01029985, + "balance_loss_clip": 1.01885688, + "balance_loss_mlp": 1.03575826, + "epoch": 0.6919585149556591, + "flos": 25550082720000.0, + "grad_norm": 3.1306614754136217, + "language_loss": 0.75215411, + "learning_rate": 9.153696887794027e-07, + "loss": 0.77345216, + "num_input_tokens_seen": 248390455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 11509, + "time_per_iteration": 2.51385760307312 + }, + { + "auxiliary_loss_clip": 0.01104564, + "auxiliary_loss_mlp": 0.01032862, + "balance_loss_clip": 1.02108383, + "balance_loss_mlp": 1.03770804, + "epoch": 0.6920186382083271, + "flos": 23659781581440.0, + "grad_norm": 1.4724116040863566, + "language_loss": 0.64134341, + "learning_rate": 9.150424933219425e-07, + "loss": 0.6627177, + "num_input_tokens_seen": 248411305, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11510, + "time_per_iteration": 2.45000958442688 + }, + { + "auxiliary_loss_clip": 0.01109479, + "auxiliary_loss_mlp": 0.01032628, + "balance_loss_clip": 1.01910424, + "balance_loss_mlp": 1.03804469, + "epoch": 0.692078761460995, + "flos": 19061959023360.0, + "grad_norm": 4.327241358216876, + "language_loss": 0.75543642, + "learning_rate": 9.147153390061788e-07, + "loss": 0.7768575, + "num_input_tokens_seen": 248430190, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 11511, + "time_per_iteration": 3.932948350906372 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02240431, + "balance_loss_mlp": 1.03698862, + "epoch": 0.692138884713663, + "flos": 29023291382400.0, + "grad_norm": 2.3102277566791614, + "language_loss": 0.62639916, + "learning_rate": 9.143882258445184e-07, + "loss": 0.64777517, + "num_input_tokens_seen": 248450830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 11512, + "time_per_iteration": 2.50154185295105 + }, + { + "auxiliary_loss_clip": 0.01103911, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01848567, + "balance_loss_mlp": 1.03483152, + "epoch": 0.6921990079663309, + "flos": 14757849976320.0, + "grad_norm": 1.6663402692023492, + "language_loss": 0.8328855, + "learning_rate": 9.140611538493666e-07, + "loss": 0.85422838, + "num_input_tokens_seen": 248468585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 11513, + "time_per_iteration": 3.906061887741089 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01029486, + "balance_loss_clip": 1.01814294, + "balance_loss_mlp": 1.03563786, + "epoch": 0.692259131218999, + "flos": 23841848643840.0, + "grad_norm": 1.4134932862806329, + "language_loss": 0.77965999, + "learning_rate": 9.137341230331233e-07, + "loss": 0.80097437, + "num_input_tokens_seen": 248490535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11514, + "time_per_iteration": 3.914891481399536 + }, + { + "auxiliary_loss_clip": 0.01104612, + "auxiliary_loss_mlp": 0.01034457, + "balance_loss_clip": 1.02256036, + "balance_loss_mlp": 1.03478587, + "epoch": 0.6923192544716669, + "flos": 19135073157120.0, + "grad_norm": 1.8450575688706539, + "language_loss": 0.74720532, + "learning_rate": 9.134071334081907e-07, + "loss": 0.76859605, + "num_input_tokens_seen": 248508575, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69921875, + "step": 11515, + "time_per_iteration": 3.975337505340576 + }, + { + "auxiliary_loss_clip": 0.01101876, + "auxiliary_loss_mlp": 0.01034689, + "balance_loss_clip": 1.02331054, + "balance_loss_mlp": 1.03631759, + "epoch": 0.6923793777243349, + "flos": 28074639237120.0, + "grad_norm": 2.249358111886257, + "language_loss": 0.53926551, + "learning_rate": 9.130801849869694e-07, + "loss": 0.56063116, + "num_input_tokens_seen": 248527025, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 11516, + "time_per_iteration": 2.4912428855895996 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.01989245, + "balance_loss_mlp": 1.03666639, + "epoch": 0.6924395009770029, + "flos": 16581250033920.0, + "grad_norm": 1.6422617041097631, + "language_loss": 0.72871542, + "learning_rate": 9.127532777818557e-07, + "loss": 0.75004637, + "num_input_tokens_seen": 248544275, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 11517, + "time_per_iteration": 2.478013277053833 + }, + { + "auxiliary_loss_clip": 0.01105782, + "auxiliary_loss_mlp": 0.01036586, + "balance_loss_clip": 1.02440846, + "balance_loss_mlp": 1.03657305, + "epoch": 0.6924996242296708, + "flos": 16655297921280.0, + "grad_norm": 1.7574015499880917, + "language_loss": 0.76101017, + "learning_rate": 9.124264118052465e-07, + "loss": 0.78243387, + "num_input_tokens_seen": 248561870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 11518, + "time_per_iteration": 2.4453186988830566 + }, + { + "auxiliary_loss_clip": 0.01108029, + "auxiliary_loss_mlp": 0.01033282, + "balance_loss_clip": 1.02049708, + "balance_loss_mlp": 1.03722334, + "epoch": 0.6925597474823388, + "flos": 34754167532160.0, + "grad_norm": 1.3039874531903892, + "language_loss": 0.64442092, + "learning_rate": 9.120995870695376e-07, + "loss": 0.66583401, + "num_input_tokens_seen": 248588190, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 11519, + "time_per_iteration": 2.6372623443603516 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.02304852, + "balance_loss_mlp": 1.03542209, + "epoch": 0.6926198707350067, + "flos": 21871717528320.0, + "grad_norm": 1.9115708642987976, + "language_loss": 0.6239593, + "learning_rate": 9.117728035871212e-07, + "loss": 0.64534283, + "num_input_tokens_seen": 248606460, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11520, + "time_per_iteration": 2.4893410205841064 + }, + { + "auxiliary_loss_clip": 0.01111126, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.02313781, + "balance_loss_mlp": 1.03751791, + "epoch": 0.6926799939876748, + "flos": 13006271162880.0, + "grad_norm": 1.8081030789169619, + "language_loss": 0.77767199, + "learning_rate": 9.114460613703887e-07, + "loss": 0.79915196, + "num_input_tokens_seen": 248623715, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.734375, + "step": 11521, + "time_per_iteration": 2.4445972442626953 + }, + { + "auxiliary_loss_clip": 0.01107789, + "auxiliary_loss_mlp": 0.01030231, + "balance_loss_clip": 1.01773214, + "balance_loss_mlp": 1.03593922, + "epoch": 0.6927401172403427, + "flos": 16761234107520.0, + "grad_norm": 1.8501694912434254, + "language_loss": 0.81979275, + "learning_rate": 9.111193604317304e-07, + "loss": 0.84117287, + "num_input_tokens_seen": 248640575, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 11522, + "time_per_iteration": 2.423020124435425 + }, + { + "auxiliary_loss_clip": 0.01105276, + "auxiliary_loss_mlp": 0.01030165, + "balance_loss_clip": 1.01828539, + "balance_loss_mlp": 1.03786206, + "epoch": 0.6928002404930107, + "flos": 25705648523520.0, + "grad_norm": 1.3469897873257555, + "language_loss": 0.76728314, + "learning_rate": 9.107927007835361e-07, + "loss": 0.78863752, + "num_input_tokens_seen": 248663535, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 11523, + "time_per_iteration": 2.549304246902466 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.02063847, + "balance_loss_mlp": 1.03536248, + "epoch": 0.6928603637456786, + "flos": 18588261438720.0, + "grad_norm": 2.1482280608330355, + "language_loss": 0.68315476, + "learning_rate": 9.104660824381915e-07, + "loss": 0.70449388, + "num_input_tokens_seen": 248681125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11524, + "time_per_iteration": 2.443089723587036 + }, + { + "auxiliary_loss_clip": 0.0110548, + "auxiliary_loss_mlp": 0.01034425, + "balance_loss_clip": 1.02197385, + "balance_loss_mlp": 1.03614259, + "epoch": 0.6929204869983466, + "flos": 22200874784640.0, + "grad_norm": 1.8981913764440181, + "language_loss": 0.64524782, + "learning_rate": 9.101395054080815e-07, + "loss": 0.66664684, + "num_input_tokens_seen": 248700555, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 11525, + "time_per_iteration": 2.504351854324341 + }, + { + "auxiliary_loss_clip": 0.01106067, + "auxiliary_loss_mlp": 0.01039081, + "balance_loss_clip": 1.02695775, + "balance_loss_mlp": 1.0376687, + "epoch": 0.6929806102510145, + "flos": 17894754576000.0, + "grad_norm": 1.9735788084293737, + "language_loss": 0.70338595, + "learning_rate": 9.098129697055907e-07, + "loss": 0.72483742, + "num_input_tokens_seen": 248716095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11526, + "time_per_iteration": 2.4542391300201416 + }, + { + "auxiliary_loss_clip": 0.01100987, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01756251, + "balance_loss_mlp": 1.03445363, + "epoch": 0.6930407335036826, + "flos": 19755178577280.0, + "grad_norm": 1.4787934463099037, + "language_loss": 0.76685685, + "learning_rate": 9.094864753431022e-07, + "loss": 0.78815675, + "num_input_tokens_seen": 248735330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11527, + "time_per_iteration": 2.510793685913086 + }, + { + "auxiliary_loss_clip": 0.0110112, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01780045, + "balance_loss_mlp": 1.03496742, + "epoch": 0.6931008567563505, + "flos": 21544248211200.0, + "grad_norm": 1.562329187830164, + "language_loss": 0.79614961, + "learning_rate": 9.091600223329952e-07, + "loss": 0.81745368, + "num_input_tokens_seen": 248754530, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 11528, + "time_per_iteration": 2.465226173400879 + }, + { + "auxiliary_loss_clip": 0.01099854, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.01909447, + "balance_loss_mlp": 1.03573501, + "epoch": 0.6931609800090185, + "flos": 26250018117120.0, + "grad_norm": 1.4331100909898178, + "language_loss": 0.76051259, + "learning_rate": 9.088336106876491e-07, + "loss": 0.78181458, + "num_input_tokens_seen": 248775825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11529, + "time_per_iteration": 2.5549967288970947 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.02312326, + "balance_loss_mlp": 1.0351932, + "epoch": 0.6932211032616865, + "flos": 32343376366080.0, + "grad_norm": 1.7201137592726918, + "language_loss": 0.72201979, + "learning_rate": 9.085072404194436e-07, + "loss": 0.74337578, + "num_input_tokens_seen": 248796180, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 11530, + "time_per_iteration": 2.531743049621582 + }, + { + "auxiliary_loss_clip": 0.01111171, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.02228653, + "balance_loss_mlp": 1.0381999, + "epoch": 0.6932812265143544, + "flos": 22049079909120.0, + "grad_norm": 1.645987038290128, + "language_loss": 0.7850855, + "learning_rate": 9.081809115407513e-07, + "loss": 0.80656147, + "num_input_tokens_seen": 248814735, + "router_z_loss_clip": 0.14160156, + "router_z_loss_mlp": 0.73046875, + "step": 11531, + "time_per_iteration": 2.500711679458618 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01030167, + "balance_loss_clip": 1.01911616, + "balance_loss_mlp": 1.0353266, + "epoch": 0.6933413497670224, + "flos": 26256626219520.0, + "grad_norm": 1.5275750432937483, + "language_loss": 0.69725084, + "learning_rate": 9.078546240639484e-07, + "loss": 0.71856636, + "num_input_tokens_seen": 248839140, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 11532, + "time_per_iteration": 2.527376174926758 + }, + { + "auxiliary_loss_clip": 0.01106351, + "auxiliary_loss_mlp": 0.01028424, + "balance_loss_clip": 1.01601446, + "balance_loss_mlp": 1.0371834, + "epoch": 0.6934014730196904, + "flos": 19573003774080.0, + "grad_norm": 1.8600077248097753, + "language_loss": 0.6705901, + "learning_rate": 9.075283780014082e-07, + "loss": 0.69193786, + "num_input_tokens_seen": 248858300, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11533, + "time_per_iteration": 2.518920421600342 + }, + { + "auxiliary_loss_clip": 0.01105686, + "auxiliary_loss_mlp": 0.01032096, + "balance_loss_clip": 1.01975226, + "balance_loss_mlp": 1.03683567, + "epoch": 0.6934615962723584, + "flos": 22119249127680.0, + "grad_norm": 3.0139531095823893, + "language_loss": 0.58712631, + "learning_rate": 9.072021733655007e-07, + "loss": 0.60850418, + "num_input_tokens_seen": 248876310, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 11534, + "time_per_iteration": 2.4710326194763184 + }, + { + "auxiliary_loss_clip": 0.01101215, + "auxiliary_loss_mlp": 0.01029045, + "balance_loss_clip": 1.01689124, + "balance_loss_mlp": 1.03428173, + "epoch": 0.6935217195250263, + "flos": 21360816432000.0, + "grad_norm": 2.05674594042133, + "language_loss": 0.71339464, + "learning_rate": 9.068760101685971e-07, + "loss": 0.73469722, + "num_input_tokens_seen": 248895650, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 11535, + "time_per_iteration": 2.4800782203674316 + }, + { + "auxiliary_loss_clip": 0.01024678, + "auxiliary_loss_mlp": 0.01001067, + "balance_loss_clip": 0.99988097, + "balance_loss_mlp": 1.00423908, + "epoch": 0.6935818427776943, + "flos": 64063813115520.0, + "grad_norm": 0.7160519901112068, + "language_loss": 0.59069979, + "learning_rate": 9.065498884230638e-07, + "loss": 0.61095721, + "num_input_tokens_seen": 248963920, + "router_z_loss_clip": 0.01184082, + "router_z_loss_mlp": 0.20507812, + "step": 11536, + "time_per_iteration": 3.175150156021118 + }, + { + "auxiliary_loss_clip": 0.01107914, + "auxiliary_loss_mlp": 0.01030682, + "balance_loss_clip": 1.01796234, + "balance_loss_mlp": 1.03721535, + "epoch": 0.6936419660303622, + "flos": 20302564913280.0, + "grad_norm": 1.8374101085934587, + "language_loss": 0.72543836, + "learning_rate": 9.062238081412692e-07, + "loss": 0.74682426, + "num_input_tokens_seen": 248983380, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 11537, + "time_per_iteration": 2.4590697288513184 + }, + { + "auxiliary_loss_clip": 0.01024524, + "auxiliary_loss_mlp": 0.00999962, + "balance_loss_clip": 0.99879992, + "balance_loss_mlp": 1.0041244, + "epoch": 0.6937020892830302, + "flos": 67182581347200.0, + "grad_norm": 0.7454400182413451, + "language_loss": 0.55605686, + "learning_rate": 9.058977693355767e-07, + "loss": 0.57630169, + "num_input_tokens_seen": 249044680, + "router_z_loss_clip": 0.01159668, + "router_z_loss_mlp": 0.20410156, + "step": 11538, + "time_per_iteration": 3.05582332611084 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.02029145, + "balance_loss_mlp": 1.03483129, + "epoch": 0.6937622125356981, + "flos": 23878190229120.0, + "grad_norm": 1.5310037982769402, + "language_loss": 0.77299392, + "learning_rate": 9.055717720183505e-07, + "loss": 0.79428679, + "num_input_tokens_seen": 249061060, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 11539, + "time_per_iteration": 2.478339433670044 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.01524878, + "balance_loss_mlp": 1.03527951, + "epoch": 0.6938223357883662, + "flos": 28730619365760.0, + "grad_norm": 1.7857614206632793, + "language_loss": 0.64559513, + "learning_rate": 9.05245816201953e-07, + "loss": 0.66688484, + "num_input_tokens_seen": 249081430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11540, + "time_per_iteration": 2.5308845043182373 + }, + { + "auxiliary_loss_clip": 0.01102212, + "auxiliary_loss_mlp": 0.01028001, + "balance_loss_clip": 1.01658714, + "balance_loss_mlp": 1.03576088, + "epoch": 0.6938824590410341, + "flos": 28655027193600.0, + "grad_norm": 1.5373758397394544, + "language_loss": 0.8667385, + "learning_rate": 9.049199018987437e-07, + "loss": 0.88804066, + "num_input_tokens_seen": 249103020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11541, + "time_per_iteration": 2.5364692211151123 + }, + { + "auxiliary_loss_clip": 0.01103258, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.02015162, + "balance_loss_mlp": 1.03593302, + "epoch": 0.6939425822937021, + "flos": 18983062800000.0, + "grad_norm": 1.7924323447912938, + "language_loss": 0.84049714, + "learning_rate": 9.04594029121081e-07, + "loss": 0.86184859, + "num_input_tokens_seen": 249120810, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11542, + "time_per_iteration": 2.4829962253570557 + }, + { + "auxiliary_loss_clip": 0.01104055, + "auxiliary_loss_mlp": 0.01029315, + "balance_loss_clip": 1.01595759, + "balance_loss_mlp": 1.0352869, + "epoch": 0.6940027055463701, + "flos": 23075838178560.0, + "grad_norm": 1.8414334065280868, + "language_loss": 0.75269711, + "learning_rate": 9.04268197881323e-07, + "loss": 0.77403086, + "num_input_tokens_seen": 249138050, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 11543, + "time_per_iteration": 2.452195167541504 + }, + { + "auxiliary_loss_clip": 0.01102342, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.01960182, + "balance_loss_mlp": 1.03582442, + "epoch": 0.694062828799038, + "flos": 18186564666240.0, + "grad_norm": 1.6661945850864863, + "language_loss": 0.76122248, + "learning_rate": 9.039424081918241e-07, + "loss": 0.78255928, + "num_input_tokens_seen": 249155570, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11544, + "time_per_iteration": 2.461024761199951 + }, + { + "auxiliary_loss_clip": 0.01105964, + "auxiliary_loss_mlp": 0.01033804, + "balance_loss_clip": 1.02216911, + "balance_loss_mlp": 1.03684866, + "epoch": 0.694122952051706, + "flos": 17821532701440.0, + "grad_norm": 1.7008976535157667, + "language_loss": 0.71218264, + "learning_rate": 9.036166600649388e-07, + "loss": 0.73358029, + "num_input_tokens_seen": 249172960, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69140625, + "step": 11545, + "time_per_iteration": 2.4178249835968018 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01646137, + "balance_loss_mlp": 1.03581667, + "epoch": 0.694183075304374, + "flos": 21215306436480.0, + "grad_norm": 1.933857108829042, + "language_loss": 0.79382741, + "learning_rate": 9.0329095351302e-07, + "loss": 0.81509542, + "num_input_tokens_seen": 249192450, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11546, + "time_per_iteration": 2.467369794845581 + }, + { + "auxiliary_loss_clip": 0.01105153, + "auxiliary_loss_mlp": 0.0102905, + "balance_loss_clip": 1.01755857, + "balance_loss_mlp": 1.03803396, + "epoch": 0.694243198557042, + "flos": 24060508686720.0, + "grad_norm": 2.1784420231587562, + "language_loss": 0.78471816, + "learning_rate": 9.029652885484194e-07, + "loss": 0.80606019, + "num_input_tokens_seen": 249214320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11547, + "time_per_iteration": 2.5005674362182617 + }, + { + "auxiliary_loss_clip": 0.01104152, + "auxiliary_loss_mlp": 0.01033664, + "balance_loss_clip": 1.02177894, + "balance_loss_mlp": 1.03765762, + "epoch": 0.6943033218097099, + "flos": 21141869080320.0, + "grad_norm": 2.1600607182563323, + "language_loss": 0.81004536, + "learning_rate": 9.026396651834834e-07, + "loss": 0.83142352, + "num_input_tokens_seen": 249230925, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11548, + "time_per_iteration": 2.467039108276367 + }, + { + "auxiliary_loss_clip": 0.01024313, + "auxiliary_loss_mlp": 0.01001552, + "balance_loss_clip": 1.00046158, + "balance_loss_mlp": 1.003824, + "epoch": 0.6943634450623779, + "flos": 57812015975040.0, + "grad_norm": 0.6998312619688671, + "language_loss": 0.53725159, + "learning_rate": 9.023140834305613e-07, + "loss": 0.55751026, + "num_input_tokens_seen": 249293975, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11549, + "time_per_iteration": 3.049893617630005 + }, + { + "auxiliary_loss_clip": 0.01102026, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01926637, + "balance_loss_mlp": 1.03490329, + "epoch": 0.6944235683150458, + "flos": 30590684231040.0, + "grad_norm": 1.4134834791230244, + "language_loss": 0.7344752, + "learning_rate": 9.01988543302e-07, + "loss": 0.75581068, + "num_input_tokens_seen": 249315285, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11550, + "time_per_iteration": 2.5287935733795166 + }, + { + "auxiliary_loss_clip": 0.01105894, + "auxiliary_loss_mlp": 0.01035534, + "balance_loss_clip": 1.02367878, + "balance_loss_mlp": 1.03701949, + "epoch": 0.6944836915677138, + "flos": 19719447523200.0, + "grad_norm": 1.8969044968976483, + "language_loss": 0.73992145, + "learning_rate": 9.016630448101425e-07, + "loss": 0.76133573, + "num_input_tokens_seen": 249333505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 11551, + "time_per_iteration": 2.4404563903808594 + }, + { + "auxiliary_loss_clip": 0.01104938, + "auxiliary_loss_mlp": 0.01037921, + "balance_loss_clip": 1.02617919, + "balance_loss_mlp": 1.03671432, + "epoch": 0.6945438148203817, + "flos": 24863579009280.0, + "grad_norm": 1.6277950876042102, + "language_loss": 0.84549385, + "learning_rate": 9.01337587967333e-07, + "loss": 0.86692244, + "num_input_tokens_seen": 249354180, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 11552, + "time_per_iteration": 2.498476028442383 + }, + { + "auxiliary_loss_clip": 0.01102767, + "auxiliary_loss_mlp": 0.01034275, + "balance_loss_clip": 1.02255046, + "balance_loss_mlp": 1.03642297, + "epoch": 0.6946039380730498, + "flos": 33326646243840.0, + "grad_norm": 1.5310970869840324, + "language_loss": 0.67400169, + "learning_rate": 9.010121727859117e-07, + "loss": 0.6953721, + "num_input_tokens_seen": 249377035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11553, + "time_per_iteration": 3.92946720123291 + }, + { + "auxiliary_loss_clip": 0.01107649, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.01543725, + "balance_loss_mlp": 1.03727949, + "epoch": 0.6946640613257177, + "flos": 20850956830080.0, + "grad_norm": 1.5363855656738201, + "language_loss": 0.79580885, + "learning_rate": 9.006867992782195e-07, + "loss": 0.8171674, + "num_input_tokens_seen": 249396155, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 11554, + "time_per_iteration": 2.469681978225708 + }, + { + "auxiliary_loss_clip": 0.01103857, + "auxiliary_loss_mlp": 0.01029016, + "balance_loss_clip": 1.0172801, + "balance_loss_mlp": 1.03479338, + "epoch": 0.6947241845783857, + "flos": 19354846521600.0, + "grad_norm": 1.7519879066783155, + "language_loss": 0.72581065, + "learning_rate": 9.003614674565934e-07, + "loss": 0.74713933, + "num_input_tokens_seen": 249414555, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.69140625, + "step": 11555, + "time_per_iteration": 3.862004280090332 + }, + { + "auxiliary_loss_clip": 0.01100586, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.01587296, + "balance_loss_mlp": 1.0338273, + "epoch": 0.6947843078310536, + "flos": 27120240915840.0, + "grad_norm": 1.9852142507231525, + "language_loss": 0.78025049, + "learning_rate": 9.000361773333705e-07, + "loss": 0.8015281, + "num_input_tokens_seen": 249433570, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 11556, + "time_per_iteration": 5.454412937164307 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01034952, + "balance_loss_clip": 1.0232873, + "balance_loss_mlp": 1.03403139, + "epoch": 0.6948444310837216, + "flos": 28585109370240.0, + "grad_norm": 2.79871624128239, + "language_loss": 0.60282063, + "learning_rate": 8.997109289208869e-07, + "loss": 0.62418664, + "num_input_tokens_seen": 249453735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 11557, + "time_per_iteration": 2.5056674480438232 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.01036469, + "balance_loss_clip": 1.02463782, + "balance_loss_mlp": 1.03539312, + "epoch": 0.6949045543363896, + "flos": 15669262696320.0, + "grad_norm": 1.6476789256185396, + "language_loss": 0.8537513, + "learning_rate": 8.993857222314752e-07, + "loss": 0.87512511, + "num_input_tokens_seen": 249470805, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 11558, + "time_per_iteration": 2.456141948699951 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01029415, + "balance_loss_clip": 1.01679111, + "balance_loss_mlp": 1.03618479, + "epoch": 0.6949646775890576, + "flos": 23259413612160.0, + "grad_norm": 1.6025671858040744, + "language_loss": 0.70371419, + "learning_rate": 8.990605572774664e-07, + "loss": 0.72505903, + "num_input_tokens_seen": 249491150, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11559, + "time_per_iteration": 2.5148940086364746 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01674509, + "balance_loss_mlp": 1.03588152, + "epoch": 0.6950248008417256, + "flos": 22382546797440.0, + "grad_norm": 1.5297645646514304, + "language_loss": 0.78975582, + "learning_rate": 8.987354340711921e-07, + "loss": 0.8110559, + "num_input_tokens_seen": 249511560, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11560, + "time_per_iteration": 2.504146099090576 + }, + { + "auxiliary_loss_clip": 0.01101416, + "auxiliary_loss_mlp": 0.01030876, + "balance_loss_clip": 1.01942587, + "balance_loss_mlp": 1.03616834, + "epoch": 0.6950849240943935, + "flos": 23477355383040.0, + "grad_norm": 1.666384056444463, + "language_loss": 0.76987702, + "learning_rate": 8.9841035262498e-07, + "loss": 0.79119992, + "num_input_tokens_seen": 249531910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 11561, + "time_per_iteration": 2.480802536010742 + }, + { + "auxiliary_loss_clip": 0.0109923, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.01806235, + "balance_loss_mlp": 1.03331923, + "epoch": 0.6951450473470615, + "flos": 17420554200960.0, + "grad_norm": 1.7391531347439242, + "language_loss": 0.78634578, + "learning_rate": 8.980853129511577e-07, + "loss": 0.80764008, + "num_input_tokens_seen": 249550300, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 11562, + "time_per_iteration": 2.438997268676758 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01032953, + "balance_loss_clip": 1.02100253, + "balance_loss_mlp": 1.03525412, + "epoch": 0.6952051705997294, + "flos": 20485745297280.0, + "grad_norm": 1.9230268961820236, + "language_loss": 0.69259918, + "learning_rate": 8.977603150620515e-07, + "loss": 0.71396333, + "num_input_tokens_seen": 249567740, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 11563, + "time_per_iteration": 2.4467828273773193 + }, + { + "auxiliary_loss_clip": 0.01097161, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01491845, + "balance_loss_mlp": 1.03383183, + "epoch": 0.6952652938523974, + "flos": 13989541040640.0, + "grad_norm": 2.153945918609724, + "language_loss": 0.73383999, + "learning_rate": 8.974353589699846e-07, + "loss": 0.75506866, + "num_input_tokens_seen": 249582700, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 11564, + "time_per_iteration": 2.4219517707824707 + }, + { + "auxiliary_loss_clip": 0.01117667, + "auxiliary_loss_mlp": 0.01036182, + "balance_loss_clip": 1.02188849, + "balance_loss_mlp": 1.04055667, + "epoch": 0.6953254171050653, + "flos": 30953956429440.0, + "grad_norm": 1.987939257518994, + "language_loss": 0.71758306, + "learning_rate": 8.971104446872785e-07, + "loss": 0.73912156, + "num_input_tokens_seen": 249602920, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.76953125, + "step": 11565, + "time_per_iteration": 2.5249881744384766 + }, + { + "auxiliary_loss_clip": 0.01024476, + "auxiliary_loss_mlp": 0.01001909, + "balance_loss_clip": 1.00083661, + "balance_loss_mlp": 1.00426412, + "epoch": 0.6953855403577334, + "flos": 61670257499520.0, + "grad_norm": 0.9231095353674287, + "language_loss": 0.58470231, + "learning_rate": 8.96785572226255e-07, + "loss": 0.60496616, + "num_input_tokens_seen": 249660400, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20214844, + "step": 11566, + "time_per_iteration": 2.9420695304870605 + }, + { + "auxiliary_loss_clip": 0.0110462, + "auxiliary_loss_mlp": 0.01028714, + "balance_loss_clip": 1.01639366, + "balance_loss_mlp": 1.03440809, + "epoch": 0.6954456636104013, + "flos": 23039029716480.0, + "grad_norm": 1.9048250540658576, + "language_loss": 0.74568522, + "learning_rate": 8.964607415992338e-07, + "loss": 0.76701856, + "num_input_tokens_seen": 249679335, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 11567, + "time_per_iteration": 2.4744651317596436 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.03473878, + "epoch": 0.6955057868630693, + "flos": 23918518224000.0, + "grad_norm": 1.342733224210211, + "language_loss": 0.76978123, + "learning_rate": 8.961359528185313e-07, + "loss": 0.79112065, + "num_input_tokens_seen": 249701805, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 11568, + "time_per_iteration": 2.5342469215393066 + }, + { + "auxiliary_loss_clip": 0.01103163, + "auxiliary_loss_mlp": 0.01033796, + "balance_loss_clip": 1.02255452, + "balance_loss_mlp": 1.03756905, + "epoch": 0.6955659101157372, + "flos": 22594634651520.0, + "grad_norm": 4.390531062594107, + "language_loss": 0.72720057, + "learning_rate": 8.958112058964649e-07, + "loss": 0.74857014, + "num_input_tokens_seen": 249720550, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11569, + "time_per_iteration": 2.44547438621521 + }, + { + "auxiliary_loss_clip": 0.01104961, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.01642609, + "balance_loss_mlp": 1.03668261, + "epoch": 0.6956260333684052, + "flos": 24572523104640.0, + "grad_norm": 2.456023744681467, + "language_loss": 0.77213609, + "learning_rate": 8.954865008453471e-07, + "loss": 0.79346788, + "num_input_tokens_seen": 249740325, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 11570, + "time_per_iteration": 2.539635419845581 + }, + { + "auxiliary_loss_clip": 0.01104303, + "auxiliary_loss_mlp": 0.01033386, + "balance_loss_clip": 1.02089262, + "balance_loss_mlp": 1.03544307, + "epoch": 0.6956861566210732, + "flos": 25846058787840.0, + "grad_norm": 2.0491810853886125, + "language_loss": 0.74309134, + "learning_rate": 8.95161837677493e-07, + "loss": 0.76446825, + "num_input_tokens_seen": 249760570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11571, + "time_per_iteration": 2.5310707092285156 + }, + { + "auxiliary_loss_clip": 0.0109878, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.01863599, + "balance_loss_mlp": 1.03522277, + "epoch": 0.6957462798737412, + "flos": 15301393557120.0, + "grad_norm": 2.2800160570301395, + "language_loss": 0.74539, + "learning_rate": 8.948372164052118e-07, + "loss": 0.76667869, + "num_input_tokens_seen": 249778290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 11572, + "time_per_iteration": 2.454315423965454 + }, + { + "auxiliary_loss_clip": 0.01101105, + "auxiliary_loss_mlp": 0.0102909, + "balance_loss_clip": 1.01692498, + "balance_loss_mlp": 1.03309405, + "epoch": 0.6958064031264092, + "flos": 36246830135040.0, + "grad_norm": 1.919471935586269, + "language_loss": 0.7033447, + "learning_rate": 8.94512637040814e-07, + "loss": 0.72464669, + "num_input_tokens_seen": 249800925, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 11573, + "time_per_iteration": 2.6062417030334473 + }, + { + "auxiliary_loss_clip": 0.01108794, + "auxiliary_loss_mlp": 0.0103278, + "balance_loss_clip": 1.02028072, + "balance_loss_mlp": 1.03887129, + "epoch": 0.6958665263790771, + "flos": 19208725994880.0, + "grad_norm": 1.9750506885077386, + "language_loss": 0.74985647, + "learning_rate": 8.941880995966095e-07, + "loss": 0.77127224, + "num_input_tokens_seen": 249820500, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 11574, + "time_per_iteration": 2.4739365577697754 + }, + { + "auxiliary_loss_clip": 0.01105022, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.01920092, + "balance_loss_mlp": 1.03574729, + "epoch": 0.6959266496317451, + "flos": 21795838047360.0, + "grad_norm": 1.6163956776113584, + "language_loss": 0.74427664, + "learning_rate": 8.938636040849014e-07, + "loss": 0.76563859, + "num_input_tokens_seen": 249839845, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69140625, + "step": 11575, + "time_per_iteration": 2.4526143074035645 + }, + { + "auxiliary_loss_clip": 0.01103541, + "auxiliary_loss_mlp": 0.01031176, + "balance_loss_clip": 1.01851606, + "balance_loss_mlp": 1.03498685, + "epoch": 0.695986772884413, + "flos": 20558248899840.0, + "grad_norm": 2.202817220265, + "language_loss": 0.78680444, + "learning_rate": 8.935391505179966e-07, + "loss": 0.80815148, + "num_input_tokens_seen": 249857400, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11576, + "time_per_iteration": 2.489030122756958 + }, + { + "auxiliary_loss_clip": 0.01104629, + "auxiliary_loss_mlp": 0.01031961, + "balance_loss_clip": 1.0206064, + "balance_loss_mlp": 1.03426623, + "epoch": 0.696046896137081, + "flos": 14936217937920.0, + "grad_norm": 2.167216169901492, + "language_loss": 0.56448716, + "learning_rate": 8.932147389081985e-07, + "loss": 0.5858531, + "num_input_tokens_seen": 249871645, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.703125, + "step": 11577, + "time_per_iteration": 2.402588367462158 + }, + { + "auxiliary_loss_clip": 0.010978, + "auxiliary_loss_mlp": 0.01020474, + "balance_loss_clip": 1.01061571, + "balance_loss_mlp": 1.0344727, + "epoch": 0.696107019389749, + "flos": 30740216549760.0, + "grad_norm": 1.3300447055766056, + "language_loss": 0.76633966, + "learning_rate": 8.928903692678081e-07, + "loss": 0.78752244, + "num_input_tokens_seen": 249894215, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.6328125, + "step": 11578, + "time_per_iteration": 2.5856926441192627 + }, + { + "auxiliary_loss_clip": 0.01103837, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02003193, + "balance_loss_mlp": 1.03707981, + "epoch": 0.696167142642417, + "flos": 20776729374720.0, + "grad_norm": 1.9898977429274547, + "language_loss": 0.7948364, + "learning_rate": 8.925660416091254e-07, + "loss": 0.81619179, + "num_input_tokens_seen": 249912850, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 11579, + "time_per_iteration": 2.4593424797058105 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01026526, + "balance_loss_clip": 1.0148437, + "balance_loss_mlp": 1.03269458, + "epoch": 0.6962272658950849, + "flos": 22565152563840.0, + "grad_norm": 1.7711043261793566, + "language_loss": 0.72253591, + "learning_rate": 8.922417559444502e-07, + "loss": 0.74377942, + "num_input_tokens_seen": 249932650, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 11580, + "time_per_iteration": 2.5214614868164062 + }, + { + "auxiliary_loss_clip": 0.0110553, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01617932, + "balance_loss_mlp": 1.03608978, + "epoch": 0.6962873891477529, + "flos": 22200156512640.0, + "grad_norm": 1.861307366576084, + "language_loss": 0.65531254, + "learning_rate": 8.919175122860787e-07, + "loss": 0.67666024, + "num_input_tokens_seen": 249951205, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 11581, + "time_per_iteration": 2.519068479537964 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01883006, + "balance_loss_mlp": 1.03555655, + "epoch": 0.6963475124004208, + "flos": 12489695717760.0, + "grad_norm": 2.390157722365771, + "language_loss": 0.76223433, + "learning_rate": 8.915933106463056e-07, + "loss": 0.78356332, + "num_input_tokens_seen": 249967045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11582, + "time_per_iteration": 2.444866418838501 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01029814, + "balance_loss_clip": 1.01910901, + "balance_loss_mlp": 1.03478706, + "epoch": 0.6964076356530888, + "flos": 17165085696000.0, + "grad_norm": 1.876033269945707, + "language_loss": 0.69968796, + "learning_rate": 8.91269151037425e-07, + "loss": 0.72099912, + "num_input_tokens_seen": 249984565, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 11583, + "time_per_iteration": 2.430619239807129 + }, + { + "auxiliary_loss_clip": 0.01105097, + "auxiliary_loss_mlp": 0.0103149, + "balance_loss_clip": 1.01950979, + "balance_loss_mlp": 1.03693569, + "epoch": 0.6964677589057569, + "flos": 19937317466880.0, + "grad_norm": 2.37757967168826, + "language_loss": 0.82697153, + "learning_rate": 8.909450334717301e-07, + "loss": 0.84833741, + "num_input_tokens_seen": 250004235, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 11584, + "time_per_iteration": 2.5077664852142334 + }, + { + "auxiliary_loss_clip": 0.01105057, + "auxiliary_loss_mlp": 0.01035873, + "balance_loss_clip": 1.02336848, + "balance_loss_mlp": 1.03613901, + "epoch": 0.6965278821584248, + "flos": 22784064001920.0, + "grad_norm": 2.430393804317416, + "language_loss": 0.79577053, + "learning_rate": 8.906209579615107e-07, + "loss": 0.8171798, + "num_input_tokens_seen": 250017645, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11585, + "time_per_iteration": 2.4488959312438965 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.0103141, + "balance_loss_clip": 1.02046049, + "balance_loss_mlp": 1.03464603, + "epoch": 0.6965880054110928, + "flos": 20047563285120.0, + "grad_norm": 1.5234092919525861, + "language_loss": 0.77759147, + "learning_rate": 8.90296924519055e-07, + "loss": 0.79889989, + "num_input_tokens_seen": 250037640, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 11586, + "time_per_iteration": 2.4705069065093994 + }, + { + "auxiliary_loss_clip": 0.01096075, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.02000952, + "balance_loss_mlp": 1.03367376, + "epoch": 0.6966481286637607, + "flos": 21908238681600.0, + "grad_norm": 1.7766488711687052, + "language_loss": 0.78765887, + "learning_rate": 8.899729331566519e-07, + "loss": 0.80892575, + "num_input_tokens_seen": 250056490, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 11587, + "time_per_iteration": 2.4538965225219727 + }, + { + "auxiliary_loss_clip": 0.01100978, + "auxiliary_loss_mlp": 0.01030866, + "balance_loss_clip": 1.01916003, + "balance_loss_mlp": 1.03608429, + "epoch": 0.6967082519164287, + "flos": 15633172506240.0, + "grad_norm": 2.0434006837874885, + "language_loss": 0.72847271, + "learning_rate": 8.896489838865857e-07, + "loss": 0.74979115, + "num_input_tokens_seen": 250074285, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 11588, + "time_per_iteration": 2.452421188354492 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.0102536, + "balance_loss_clip": 1.01462507, + "balance_loss_mlp": 1.03454709, + "epoch": 0.6967683751690966, + "flos": 24024598064640.0, + "grad_norm": 1.6358395354491653, + "language_loss": 0.75110734, + "learning_rate": 8.893250767211413e-07, + "loss": 0.77237165, + "num_input_tokens_seen": 250093350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 11589, + "time_per_iteration": 2.466801643371582 + }, + { + "auxiliary_loss_clip": 0.01102838, + "auxiliary_loss_mlp": 0.01029576, + "balance_loss_clip": 1.01815021, + "balance_loss_mlp": 1.03571272, + "epoch": 0.6968284984217646, + "flos": 31024700265600.0, + "grad_norm": 1.8223612278895884, + "language_loss": 0.63479555, + "learning_rate": 8.890012116726012e-07, + "loss": 0.6561197, + "num_input_tokens_seen": 250114170, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11590, + "time_per_iteration": 2.547621011734009 + }, + { + "auxiliary_loss_clip": 0.0102506, + "auxiliary_loss_mlp": 0.00999727, + "balance_loss_clip": 0.99851686, + "balance_loss_mlp": 1.00460005, + "epoch": 0.6968886216744326, + "flos": 67622990002560.0, + "grad_norm": 0.7464434837595778, + "language_loss": 0.61278826, + "learning_rate": 8.88677388753248e-07, + "loss": 0.63303614, + "num_input_tokens_seen": 250178250, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20507812, + "step": 11591, + "time_per_iteration": 3.138062000274658 + }, + { + "auxiliary_loss_clip": 0.01106658, + "auxiliary_loss_mlp": 0.010328, + "balance_loss_clip": 1.02039623, + "balance_loss_mlp": 1.03897679, + "epoch": 0.6969487449271006, + "flos": 24863686750080.0, + "grad_norm": 2.149264324135608, + "language_loss": 0.69040775, + "learning_rate": 8.883536079753582e-07, + "loss": 0.7118023, + "num_input_tokens_seen": 250198420, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11592, + "time_per_iteration": 2.4973015785217285 + }, + { + "auxiliary_loss_clip": 0.01102777, + "auxiliary_loss_mlp": 0.01027086, + "balance_loss_clip": 1.01633334, + "balance_loss_mlp": 1.03731585, + "epoch": 0.6970088681797685, + "flos": 28767858791040.0, + "grad_norm": 1.7113840138583603, + "language_loss": 0.62385631, + "learning_rate": 8.880298693512109e-07, + "loss": 0.64515489, + "num_input_tokens_seen": 250220650, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 11593, + "time_per_iteration": 2.5094406604766846 + }, + { + "auxiliary_loss_clip": 0.01098813, + "auxiliary_loss_mlp": 0.0102709, + "balance_loss_clip": 1.01652873, + "balance_loss_mlp": 1.03533387, + "epoch": 0.6970689914324365, + "flos": 27308556944640.0, + "grad_norm": 1.6455172692601516, + "language_loss": 0.54323792, + "learning_rate": 8.877061728930832e-07, + "loss": 0.56449699, + "num_input_tokens_seen": 250241750, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11594, + "time_per_iteration": 3.896481513977051 + }, + { + "auxiliary_loss_clip": 0.01100941, + "auxiliary_loss_mlp": 0.01025932, + "balance_loss_clip": 1.01524472, + "balance_loss_mlp": 1.03542423, + "epoch": 0.6971291146851044, + "flos": 19136258305920.0, + "grad_norm": 2.382773789064297, + "language_loss": 0.77469057, + "learning_rate": 8.87382518613248e-07, + "loss": 0.79595929, + "num_input_tokens_seen": 250259445, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 11595, + "time_per_iteration": 2.4667396545410156 + }, + { + "auxiliary_loss_clip": 0.01107354, + "auxiliary_loss_mlp": 0.01030534, + "balance_loss_clip": 1.01850617, + "balance_loss_mlp": 1.03804874, + "epoch": 0.6971892379377724, + "flos": 14610508387200.0, + "grad_norm": 2.2493761025640957, + "language_loss": 0.71796727, + "learning_rate": 8.870589065239793e-07, + "loss": 0.73934615, + "num_input_tokens_seen": 250275640, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11596, + "time_per_iteration": 3.921229839324951 + }, + { + "auxiliary_loss_clip": 0.01105557, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.0197432, + "balance_loss_mlp": 1.03878427, + "epoch": 0.6972493611904405, + "flos": 22307457415680.0, + "grad_norm": 1.6145547078757287, + "language_loss": 0.76072466, + "learning_rate": 8.867353366375492e-07, + "loss": 0.78209841, + "num_input_tokens_seen": 250296435, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 11597, + "time_per_iteration": 3.8901522159576416 + }, + { + "auxiliary_loss_clip": 0.01101534, + "auxiliary_loss_mlp": 0.01034099, + "balance_loss_clip": 1.02247071, + "balance_loss_mlp": 1.03553581, + "epoch": 0.6973094844431084, + "flos": 17420374632960.0, + "grad_norm": 1.8362035763244782, + "language_loss": 0.74662215, + "learning_rate": 8.864118089662267e-07, + "loss": 0.76797849, + "num_input_tokens_seen": 250314035, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 11598, + "time_per_iteration": 3.8907439708709717 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.0103258, + "balance_loss_clip": 1.02021837, + "balance_loss_mlp": 1.03667629, + "epoch": 0.6973696076957764, + "flos": 27235370983680.0, + "grad_norm": 1.7078147721602885, + "language_loss": 0.89751863, + "learning_rate": 8.860883235222791e-07, + "loss": 0.91890037, + "num_input_tokens_seen": 250332995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 11599, + "time_per_iteration": 2.508460760116577 + }, + { + "auxiliary_loss_clip": 0.01107859, + "auxiliary_loss_mlp": 0.01035136, + "balance_loss_clip": 1.02237415, + "balance_loss_mlp": 1.03705978, + "epoch": 0.6974297309484443, + "flos": 22018089450240.0, + "grad_norm": 2.217668834863667, + "language_loss": 0.69431078, + "learning_rate": 8.85764880317974e-07, + "loss": 0.7157408, + "num_input_tokens_seen": 250352120, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 11600, + "time_per_iteration": 2.4692399501800537 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.01923847, + "balance_loss_mlp": 1.03319108, + "epoch": 0.6974898542011123, + "flos": 28366449327360.0, + "grad_norm": 2.0745134651859853, + "language_loss": 0.76886988, + "learning_rate": 8.854414793655771e-07, + "loss": 0.79018807, + "num_input_tokens_seen": 250371705, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 11601, + "time_per_iteration": 2.5153214931488037 + }, + { + "auxiliary_loss_clip": 0.0109772, + "auxiliary_loss_mlp": 0.0103129, + "balance_loss_clip": 1.02020907, + "balance_loss_mlp": 1.03365159, + "epoch": 0.6975499774537802, + "flos": 15232050351360.0, + "grad_norm": 1.7793101834620162, + "language_loss": 0.72061765, + "learning_rate": 8.851181206773508e-07, + "loss": 0.74190778, + "num_input_tokens_seen": 250390485, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 11602, + "time_per_iteration": 2.4385433197021484 + }, + { + "auxiliary_loss_clip": 0.01102254, + "auxiliary_loss_mlp": 0.010339, + "balance_loss_clip": 1.0228374, + "balance_loss_mlp": 1.0355022, + "epoch": 0.6976101007064482, + "flos": 22157422306560.0, + "grad_norm": 2.295891013382411, + "language_loss": 0.76406467, + "learning_rate": 8.847948042655567e-07, + "loss": 0.78542626, + "num_input_tokens_seen": 250407020, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 11603, + "time_per_iteration": 2.451995611190796 + }, + { + "auxiliary_loss_clip": 0.01102122, + "auxiliary_loss_mlp": 0.01030282, + "balance_loss_clip": 1.01861763, + "balance_loss_mlp": 1.03604972, + "epoch": 0.6976702239591162, + "flos": 22273522041600.0, + "grad_norm": 3.2492511864977476, + "language_loss": 0.62036002, + "learning_rate": 8.844715301424557e-07, + "loss": 0.64168406, + "num_input_tokens_seen": 250425880, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 11604, + "time_per_iteration": 2.4743845462799072 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.02031875, + "balance_loss_mlp": 1.03493071, + "epoch": 0.6977303472117842, + "flos": 25848608653440.0, + "grad_norm": 2.371593906069345, + "language_loss": 0.81601393, + "learning_rate": 8.841482983203057e-07, + "loss": 0.83737808, + "num_input_tokens_seen": 250442925, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 11605, + "time_per_iteration": 2.4963574409484863 + }, + { + "auxiliary_loss_clip": 0.01102471, + "auxiliary_loss_mlp": 0.01029368, + "balance_loss_clip": 1.01794219, + "balance_loss_mlp": 1.03550363, + "epoch": 0.6977904704644521, + "flos": 20959586536320.0, + "grad_norm": 1.5505350039714891, + "language_loss": 0.70039761, + "learning_rate": 8.838251088113638e-07, + "loss": 0.72171599, + "num_input_tokens_seen": 250461220, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11606, + "time_per_iteration": 2.464792490005493 + }, + { + "auxiliary_loss_clip": 0.01105207, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.0165329, + "balance_loss_mlp": 1.03639364, + "epoch": 0.6978505937171201, + "flos": 22055041566720.0, + "grad_norm": 2.811539216798812, + "language_loss": 0.8241694, + "learning_rate": 8.835019616278856e-07, + "loss": 0.84550416, + "num_input_tokens_seen": 250480975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11607, + "time_per_iteration": 2.4532179832458496 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01030785, + "balance_loss_clip": 1.01842856, + "balance_loss_mlp": 1.03726959, + "epoch": 0.697910716969788, + "flos": 20043720529920.0, + "grad_norm": 1.8001657478638917, + "language_loss": 0.7874788, + "learning_rate": 8.831788567821265e-07, + "loss": 0.80886829, + "num_input_tokens_seen": 250497980, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.70703125, + "step": 11608, + "time_per_iteration": 2.47961688041687 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01031974, + "balance_loss_clip": 1.02052379, + "balance_loss_mlp": 1.03606093, + "epoch": 0.697970840222456, + "flos": 15888245961600.0, + "grad_norm": 1.8111202994770392, + "language_loss": 0.89970839, + "learning_rate": 8.828557942863357e-07, + "loss": 0.9210583, + "num_input_tokens_seen": 250511910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11609, + "time_per_iteration": 2.408423900604248 + }, + { + "auxiliary_loss_clip": 0.01104617, + "auxiliary_loss_mlp": 0.01027572, + "balance_loss_clip": 1.01522803, + "balance_loss_mlp": 1.03529525, + "epoch": 0.698030963475124, + "flos": 21215629658880.0, + "grad_norm": 2.1159011349331607, + "language_loss": 0.63904428, + "learning_rate": 8.82532774152765e-07, + "loss": 0.66036618, + "num_input_tokens_seen": 250531090, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 11610, + "time_per_iteration": 2.4653687477111816 + }, + { + "auxiliary_loss_clip": 0.01100567, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.01883924, + "balance_loss_mlp": 1.03393793, + "epoch": 0.698091086727792, + "flos": 33759728524800.0, + "grad_norm": 1.6195278662998478, + "language_loss": 0.84689248, + "learning_rate": 8.822097963936643e-07, + "loss": 0.86819756, + "num_input_tokens_seen": 250551565, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11611, + "time_per_iteration": 2.5322601795196533 + }, + { + "auxiliary_loss_clip": 0.01104506, + "auxiliary_loss_mlp": 0.01030479, + "balance_loss_clip": 1.01864767, + "balance_loss_mlp": 1.03619266, + "epoch": 0.69815120998046, + "flos": 15887850912000.0, + "grad_norm": 1.902997346306539, + "language_loss": 0.71074033, + "learning_rate": 8.818868610212793e-07, + "loss": 0.73209023, + "num_input_tokens_seen": 250569625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 11612, + "time_per_iteration": 2.432530641555786 + }, + { + "auxiliary_loss_clip": 0.01100621, + "auxiliary_loss_mlp": 0.01031074, + "balance_loss_clip": 1.01988053, + "balance_loss_mlp": 1.03486013, + "epoch": 0.6982113332331279, + "flos": 18947044437120.0, + "grad_norm": 1.5615931118386375, + "language_loss": 0.80995202, + "learning_rate": 8.815639680478573e-07, + "loss": 0.83126897, + "num_input_tokens_seen": 250586960, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 11613, + "time_per_iteration": 2.429049253463745 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.01840007, + "balance_loss_mlp": 1.03550696, + "epoch": 0.6982714564857959, + "flos": 24389594115840.0, + "grad_norm": 1.8186173474764362, + "language_loss": 0.75323808, + "learning_rate": 8.812411174856411e-07, + "loss": 0.77453518, + "num_input_tokens_seen": 250605080, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11614, + "time_per_iteration": 2.469871997833252 + }, + { + "auxiliary_loss_clip": 0.01102382, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.01817775, + "balance_loss_mlp": 1.03613019, + "epoch": 0.6983315797384638, + "flos": 20083725302400.0, + "grad_norm": 2.4207105527318125, + "language_loss": 0.77124798, + "learning_rate": 8.809183093468746e-07, + "loss": 0.79257029, + "num_input_tokens_seen": 250623965, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11615, + "time_per_iteration": 2.4482977390289307 + }, + { + "auxiliary_loss_clip": 0.01098585, + "auxiliary_loss_mlp": 0.01025272, + "balance_loss_clip": 1.01378596, + "balance_loss_mlp": 1.03474522, + "epoch": 0.6983917029911318, + "flos": 13512431664000.0, + "grad_norm": 2.152403248821291, + "language_loss": 0.73121244, + "learning_rate": 8.80595543643797e-07, + "loss": 0.752451, + "num_input_tokens_seen": 250640675, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 11616, + "time_per_iteration": 2.4637510776519775 + }, + { + "auxiliary_loss_clip": 0.01102545, + "auxiliary_loss_mlp": 0.01033011, + "balance_loss_clip": 1.02162004, + "balance_loss_mlp": 1.03698003, + "epoch": 0.6984518262437998, + "flos": 22018412672640.0, + "grad_norm": 1.620744160430393, + "language_loss": 0.84509301, + "learning_rate": 8.802728203886487e-07, + "loss": 0.86644858, + "num_input_tokens_seen": 250660295, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 11617, + "time_per_iteration": 2.4850711822509766 + }, + { + "auxiliary_loss_clip": 0.01105897, + "auxiliary_loss_mlp": 0.01035391, + "balance_loss_clip": 1.02358341, + "balance_loss_mlp": 1.03734601, + "epoch": 0.6985119494964678, + "flos": 18770615809920.0, + "grad_norm": 2.8091395621454884, + "language_loss": 0.59596443, + "learning_rate": 8.799501395936682e-07, + "loss": 0.61737734, + "num_input_tokens_seen": 250678155, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 11618, + "time_per_iteration": 2.4457621574401855 + }, + { + "auxiliary_loss_clip": 0.0110188, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.02075553, + "balance_loss_mlp": 1.03564835, + "epoch": 0.6985720727491357, + "flos": 22382834106240.0, + "grad_norm": 1.7259844025825606, + "language_loss": 0.82820493, + "learning_rate": 8.796275012710903e-07, + "loss": 0.84954393, + "num_input_tokens_seen": 250697230, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11619, + "time_per_iteration": 2.4546103477478027 + }, + { + "auxiliary_loss_clip": 0.01097255, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.01878548, + "balance_loss_mlp": 1.0334444, + "epoch": 0.6986321960018037, + "flos": 39567884785920.0, + "grad_norm": 1.7065049310483924, + "language_loss": 0.67252052, + "learning_rate": 8.793049054331494e-07, + "loss": 0.69378352, + "num_input_tokens_seen": 250719865, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 11620, + "time_per_iteration": 2.6086742877960205 + }, + { + "auxiliary_loss_clip": 0.01102039, + "auxiliary_loss_mlp": 0.01028774, + "balance_loss_clip": 1.0171926, + "balance_loss_mlp": 1.03403723, + "epoch": 0.6986923192544716, + "flos": 17967725055360.0, + "grad_norm": 2.0144848908668607, + "language_loss": 0.72543484, + "learning_rate": 8.789823520920794e-07, + "loss": 0.74674302, + "num_input_tokens_seen": 250736565, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 11621, + "time_per_iteration": 2.4109437465667725 + }, + { + "auxiliary_loss_clip": 0.01104286, + "auxiliary_loss_mlp": 0.01033661, + "balance_loss_clip": 1.02206206, + "balance_loss_mlp": 1.03532565, + "epoch": 0.6987524425071396, + "flos": 25594325297280.0, + "grad_norm": 1.8967396853715839, + "language_loss": 0.68434918, + "learning_rate": 8.7865984126011e-07, + "loss": 0.70572865, + "num_input_tokens_seen": 250757235, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 11622, + "time_per_iteration": 2.4823949337005615 + }, + { + "auxiliary_loss_clip": 0.01097003, + "auxiliary_loss_mlp": 0.0102851, + "balance_loss_clip": 1.01771569, + "balance_loss_mlp": 1.03294408, + "epoch": 0.6988125657598077, + "flos": 17530081747200.0, + "grad_norm": 1.7255143974519898, + "language_loss": 0.62549627, + "learning_rate": 8.783373729494721e-07, + "loss": 0.6467514, + "num_input_tokens_seen": 250775585, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 11623, + "time_per_iteration": 2.4188036918640137 + }, + { + "auxiliary_loss_clip": 0.01104383, + "auxiliary_loss_mlp": 0.0102562, + "balance_loss_clip": 1.0135262, + "balance_loss_mlp": 1.03467298, + "epoch": 0.6988726890124756, + "flos": 39165721136640.0, + "grad_norm": 1.7388598441341108, + "language_loss": 0.60939074, + "learning_rate": 8.780149471723932e-07, + "loss": 0.63069075, + "num_input_tokens_seen": 250795725, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.69921875, + "step": 11624, + "time_per_iteration": 2.5913877487182617 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01035945, + "balance_loss_clip": 1.02349353, + "balance_loss_mlp": 1.03341901, + "epoch": 0.6989328122651436, + "flos": 20193468330240.0, + "grad_norm": 1.6753967170861992, + "language_loss": 0.78502715, + "learning_rate": 8.776925639411017e-07, + "loss": 0.80640858, + "num_input_tokens_seen": 250814555, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11625, + "time_per_iteration": 2.4710693359375 + }, + { + "auxiliary_loss_clip": 0.01098526, + "auxiliary_loss_mlp": 0.01029914, + "balance_loss_clip": 1.01916766, + "balance_loss_mlp": 1.03475714, + "epoch": 0.6989929355178115, + "flos": 21834873152640.0, + "grad_norm": 1.9082516770255042, + "language_loss": 0.66193223, + "learning_rate": 8.773702232678188e-07, + "loss": 0.68321669, + "num_input_tokens_seen": 250833105, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 11626, + "time_per_iteration": 2.4523563385009766 + }, + { + "auxiliary_loss_clip": 0.01102348, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.01857281, + "balance_loss_mlp": 1.03522182, + "epoch": 0.6990530587704795, + "flos": 26322880855680.0, + "grad_norm": 1.7406688014675167, + "language_loss": 0.7007491, + "learning_rate": 8.770479251647697e-07, + "loss": 0.72207904, + "num_input_tokens_seen": 250852570, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 11627, + "time_per_iteration": 2.474536895751953 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01025817, + "balance_loss_clip": 1.01572561, + "balance_loss_mlp": 1.03557801, + "epoch": 0.6991131820231474, + "flos": 19828975069440.0, + "grad_norm": 1.7260870632652867, + "language_loss": 0.62484425, + "learning_rate": 8.767256696441768e-07, + "loss": 0.64608836, + "num_input_tokens_seen": 250870500, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62890625, + "step": 11628, + "time_per_iteration": 2.466815710067749 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01031088, + "balance_loss_clip": 1.01934004, + "balance_loss_mlp": 1.03518367, + "epoch": 0.6991733052758154, + "flos": 33984817102080.0, + "grad_norm": 2.3991163930052757, + "language_loss": 0.68365383, + "learning_rate": 8.764034567182581e-07, + "loss": 0.70498693, + "num_input_tokens_seen": 250892745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11629, + "time_per_iteration": 2.5539638996124268 + }, + { + "auxiliary_loss_clip": 0.01102664, + "auxiliary_loss_mlp": 0.01032595, + "balance_loss_clip": 1.02066183, + "balance_loss_mlp": 1.03708851, + "epoch": 0.6992334285284834, + "flos": 15633136592640.0, + "grad_norm": 1.6822972614586869, + "language_loss": 0.73017991, + "learning_rate": 8.760812863992337e-07, + "loss": 0.75153255, + "num_input_tokens_seen": 250910225, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 11630, + "time_per_iteration": 2.4794862270355225 + }, + { + "auxiliary_loss_clip": 0.01100869, + "auxiliary_loss_mlp": 0.01034714, + "balance_loss_clip": 1.0236398, + "balance_loss_mlp": 1.03656542, + "epoch": 0.6992935517811514, + "flos": 21726279360000.0, + "grad_norm": 1.6007473169297173, + "language_loss": 0.7410804, + "learning_rate": 8.757591586993196e-07, + "loss": 0.76243627, + "num_input_tokens_seen": 250929715, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 11631, + "time_per_iteration": 2.4957640171051025 + }, + { + "auxiliary_loss_clip": 0.01106043, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.01692176, + "balance_loss_mlp": 1.03722155, + "epoch": 0.6993536750338193, + "flos": 20115254465280.0, + "grad_norm": 2.1507086916172153, + "language_loss": 0.8977077, + "learning_rate": 8.7543707363073e-07, + "loss": 0.91906154, + "num_input_tokens_seen": 250944230, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11632, + "time_per_iteration": 2.44950008392334 + }, + { + "auxiliary_loss_clip": 0.01105644, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02223396, + "balance_loss_mlp": 1.03784966, + "epoch": 0.6994137982864873, + "flos": 22010547594240.0, + "grad_norm": 1.6745752563732321, + "language_loss": 0.79724801, + "learning_rate": 8.751150312056792e-07, + "loss": 0.81863928, + "num_input_tokens_seen": 250961865, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 11633, + "time_per_iteration": 2.4414355754852295 + }, + { + "auxiliary_loss_clip": 0.0110496, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01837361, + "balance_loss_mlp": 1.03629565, + "epoch": 0.6994739215391552, + "flos": 25519020433920.0, + "grad_norm": 1.8513742632089842, + "language_loss": 0.6695196, + "learning_rate": 8.747930314363794e-07, + "loss": 0.69087964, + "num_input_tokens_seen": 250982025, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 11634, + "time_per_iteration": 2.512799024581909 + }, + { + "auxiliary_loss_clip": 0.01025073, + "auxiliary_loss_mlp": 0.01006178, + "balance_loss_clip": 1.0051055, + "balance_loss_mlp": 1.00443375, + "epoch": 0.6995340447918232, + "flos": 59128357691520.0, + "grad_norm": 0.7055663228963396, + "language_loss": 0.53125268, + "learning_rate": 8.744710743350412e-07, + "loss": 0.55156517, + "num_input_tokens_seen": 251046900, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20703125, + "step": 11635, + "time_per_iteration": 3.1653506755828857 + }, + { + "auxiliary_loss_clip": 0.01100006, + "auxiliary_loss_mlp": 0.01029622, + "balance_loss_clip": 1.01810038, + "balance_loss_mlp": 1.03436577, + "epoch": 0.6995941680444913, + "flos": 17967832796160.0, + "grad_norm": 1.634854939073058, + "language_loss": 0.82167876, + "learning_rate": 8.741491599138726e-07, + "loss": 0.84297502, + "num_input_tokens_seen": 251065050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 11636, + "time_per_iteration": 3.8652594089508057 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01025995, + "balance_loss_clip": 1.01429462, + "balance_loss_mlp": 1.03523159, + "epoch": 0.6996542912971592, + "flos": 21980095839360.0, + "grad_norm": 2.0826416356932764, + "language_loss": 0.83018386, + "learning_rate": 8.738272881850801e-07, + "loss": 0.85146558, + "num_input_tokens_seen": 251083355, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11637, + "time_per_iteration": 2.471907615661621 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.01033162, + "balance_loss_clip": 1.02207518, + "balance_loss_mlp": 1.03530073, + "epoch": 0.6997144145498272, + "flos": 11686158518400.0, + "grad_norm": 2.0103377322341807, + "language_loss": 0.67541957, + "learning_rate": 8.735054591608704e-07, + "loss": 0.69675255, + "num_input_tokens_seen": 251096420, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 11638, + "time_per_iteration": 3.8712992668151855 + }, + { + "auxiliary_loss_clip": 0.01105589, + "auxiliary_loss_mlp": 0.01031049, + "balance_loss_clip": 1.01862764, + "balance_loss_mlp": 1.03554988, + "epoch": 0.6997745378024951, + "flos": 29607162958080.0, + "grad_norm": 3.4273717366145293, + "language_loss": 0.78027046, + "learning_rate": 8.731836728534459e-07, + "loss": 0.80163682, + "num_input_tokens_seen": 251115410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.703125, + "step": 11639, + "time_per_iteration": 4.004430532455444 + }, + { + "auxiliary_loss_clip": 0.01104922, + "auxiliary_loss_mlp": 0.01035722, + "balance_loss_clip": 1.02339602, + "balance_loss_mlp": 1.03788579, + "epoch": 0.6998346610551631, + "flos": 20886616056960.0, + "grad_norm": 2.1417598387130807, + "language_loss": 0.82320189, + "learning_rate": 8.728619292750093e-07, + "loss": 0.84460831, + "num_input_tokens_seen": 251133530, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 11640, + "time_per_iteration": 3.938671588897705 + }, + { + "auxiliary_loss_clip": 0.01099361, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01750684, + "balance_loss_mlp": 1.03294611, + "epoch": 0.699894784307831, + "flos": 27163046949120.0, + "grad_norm": 1.651631828879974, + "language_loss": 0.7513082, + "learning_rate": 8.725402284377619e-07, + "loss": 0.77258819, + "num_input_tokens_seen": 251153985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 11641, + "time_per_iteration": 2.5288925170898438 + }, + { + "auxiliary_loss_clip": 0.01103165, + "auxiliary_loss_mlp": 0.01021586, + "balance_loss_clip": 1.00946224, + "balance_loss_mlp": 1.03693998, + "epoch": 0.699954907560499, + "flos": 20923640000640.0, + "grad_norm": 1.9866198731885556, + "language_loss": 0.78112102, + "learning_rate": 8.722185703539022e-07, + "loss": 0.80236852, + "num_input_tokens_seen": 251173225, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11642, + "time_per_iteration": 2.4836714267730713 + }, + { + "auxiliary_loss_clip": 0.01106745, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.01997852, + "balance_loss_mlp": 1.03653657, + "epoch": 0.700015030813167, + "flos": 28657792540800.0, + "grad_norm": 3.5463939994986524, + "language_loss": 0.75054216, + "learning_rate": 8.718969550356266e-07, + "loss": 0.77194417, + "num_input_tokens_seen": 251192485, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 11643, + "time_per_iteration": 2.5334367752075195 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.01483929, + "balance_loss_mlp": 1.03516173, + "epoch": 0.700075154065835, + "flos": 29205286617600.0, + "grad_norm": 1.4977944271718722, + "language_loss": 0.60428506, + "learning_rate": 8.715753824951315e-07, + "loss": 0.62557411, + "num_input_tokens_seen": 251214965, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11644, + "time_per_iteration": 2.549466609954834 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.01587558, + "balance_loss_mlp": 1.03423524, + "epoch": 0.7001352773185029, + "flos": 23112431159040.0, + "grad_norm": 1.654773912405309, + "language_loss": 0.8168875, + "learning_rate": 8.712538527446119e-07, + "loss": 0.83814859, + "num_input_tokens_seen": 251234500, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 11645, + "time_per_iteration": 2.5374014377593994 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01025535, + "balance_loss_clip": 1.01418638, + "balance_loss_mlp": 1.03470361, + "epoch": 0.7001954005711709, + "flos": 21322858734720.0, + "grad_norm": 1.9559227219413697, + "language_loss": 0.6827392, + "learning_rate": 8.709323657962584e-07, + "loss": 0.70399988, + "num_input_tokens_seen": 251254360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11646, + "time_per_iteration": 2.4721925258636475 + }, + { + "auxiliary_loss_clip": 0.01101074, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.02236462, + "balance_loss_mlp": 1.03534269, + "epoch": 0.7002555238238388, + "flos": 24535822383360.0, + "grad_norm": 1.4678938287912224, + "language_loss": 0.71031594, + "learning_rate": 8.706109216622635e-07, + "loss": 0.73166132, + "num_input_tokens_seen": 251274790, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 11647, + "time_per_iteration": 2.5134873390197754 + }, + { + "auxiliary_loss_clip": 0.01105174, + "auxiliary_loss_mlp": 0.01033483, + "balance_loss_clip": 1.02156842, + "balance_loss_mlp": 1.03716385, + "epoch": 0.7003156470765068, + "flos": 39056552726400.0, + "grad_norm": 1.703178589128687, + "language_loss": 0.71102858, + "learning_rate": 8.702895203548155e-07, + "loss": 0.73241514, + "num_input_tokens_seen": 251296275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 11648, + "time_per_iteration": 2.5937957763671875 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01027726, + "balance_loss_clip": 1.01577532, + "balance_loss_mlp": 1.03368604, + "epoch": 0.7003757703291749, + "flos": 28804092635520.0, + "grad_norm": 1.6329252584498772, + "language_loss": 0.77452666, + "learning_rate": 8.699681618861014e-07, + "loss": 0.79579538, + "num_input_tokens_seen": 251317375, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 11649, + "time_per_iteration": 2.517803907394409 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.01033228, + "balance_loss_clip": 1.02211761, + "balance_loss_mlp": 1.03584242, + "epoch": 0.7004358935818428, + "flos": 15953854152960.0, + "grad_norm": 1.77714876620496, + "language_loss": 0.78475487, + "learning_rate": 8.69646846268308e-07, + "loss": 0.80609971, + "num_input_tokens_seen": 251333570, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11650, + "time_per_iteration": 2.40120530128479 + }, + { + "auxiliary_loss_clip": 0.0109906, + "auxiliary_loss_mlp": 0.01025547, + "balance_loss_clip": 1.01452041, + "balance_loss_mlp": 1.03317046, + "epoch": 0.7004960168345108, + "flos": 20411984718720.0, + "grad_norm": 2.032619640135715, + "language_loss": 0.78585541, + "learning_rate": 8.693255735136194e-07, + "loss": 0.80710149, + "num_input_tokens_seen": 251351070, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 11651, + "time_per_iteration": 2.4667370319366455 + }, + { + "auxiliary_loss_clip": 0.01104452, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.02046514, + "balance_loss_mlp": 1.03640985, + "epoch": 0.7005561400871787, + "flos": 17347547808000.0, + "grad_norm": 1.5029723936879913, + "language_loss": 0.69227219, + "learning_rate": 8.690043436342198e-07, + "loss": 0.71363091, + "num_input_tokens_seen": 251370005, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6796875, + "step": 11652, + "time_per_iteration": 2.4276230335235596 + }, + { + "auxiliary_loss_clip": 0.01102036, + "auxiliary_loss_mlp": 0.01027935, + "balance_loss_clip": 1.01670551, + "balance_loss_mlp": 1.03644037, + "epoch": 0.7006162633398467, + "flos": 25302120157440.0, + "grad_norm": 1.3694191346433118, + "language_loss": 0.74200094, + "learning_rate": 8.686831566422874e-07, + "loss": 0.76330066, + "num_input_tokens_seen": 251391210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11653, + "time_per_iteration": 2.515753984451294 + }, + { + "auxiliary_loss_clip": 0.01102535, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.01641536, + "balance_loss_mlp": 1.03555512, + "epoch": 0.7006763865925146, + "flos": 20668997508480.0, + "grad_norm": 2.227987433936512, + "language_loss": 0.70499587, + "learning_rate": 8.68362012550003e-07, + "loss": 0.72631419, + "num_input_tokens_seen": 251411505, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 11654, + "time_per_iteration": 2.4286937713623047 + }, + { + "auxiliary_loss_clip": 0.01104582, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01751626, + "balance_loss_mlp": 1.03610516, + "epoch": 0.7007365098451827, + "flos": 20046449963520.0, + "grad_norm": 2.4203729950028063, + "language_loss": 0.73474562, + "learning_rate": 8.680409113695453e-07, + "loss": 0.75609636, + "num_input_tokens_seen": 251428975, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 11655, + "time_per_iteration": 2.4598588943481445 + }, + { + "auxiliary_loss_clip": 0.01110167, + "auxiliary_loss_mlp": 0.01036696, + "balance_loss_clip": 1.0236547, + "balance_loss_mlp": 1.03842175, + "epoch": 0.7007966330978506, + "flos": 20777375819520.0, + "grad_norm": 1.832010728467088, + "language_loss": 0.69950438, + "learning_rate": 8.677198531130889e-07, + "loss": 0.72097301, + "num_input_tokens_seen": 251446940, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 11656, + "time_per_iteration": 2.4319212436676025 + }, + { + "auxiliary_loss_clip": 0.01100001, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01765096, + "balance_loss_mlp": 1.03448498, + "epoch": 0.7008567563505186, + "flos": 29638189330560.0, + "grad_norm": 1.5232296652544484, + "language_loss": 0.77772856, + "learning_rate": 8.673988377928092e-07, + "loss": 0.79901063, + "num_input_tokens_seen": 251466205, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65625, + "step": 11657, + "time_per_iteration": 2.5232887268066406 + }, + { + "auxiliary_loss_clip": 0.01107629, + "auxiliary_loss_mlp": 0.01031267, + "balance_loss_clip": 1.01827931, + "balance_loss_mlp": 1.03665113, + "epoch": 0.7009168796031865, + "flos": 17092007475840.0, + "grad_norm": 2.426278289678233, + "language_loss": 0.77859247, + "learning_rate": 8.670778654208797e-07, + "loss": 0.79998142, + "num_input_tokens_seen": 251484820, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 11658, + "time_per_iteration": 2.5308613777160645 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.01329541, + "balance_loss_mlp": 1.03391457, + "epoch": 0.7009770028558545, + "flos": 20448972748800.0, + "grad_norm": 2.3274246978175803, + "language_loss": 0.82637346, + "learning_rate": 8.667569360094713e-07, + "loss": 0.84760237, + "num_input_tokens_seen": 251502670, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11659, + "time_per_iteration": 2.4660232067108154 + }, + { + "auxiliary_loss_clip": 0.01100216, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.01545429, + "balance_loss_mlp": 1.0353384, + "epoch": 0.7010371261085224, + "flos": 19245139407360.0, + "grad_norm": 1.9444226757743717, + "language_loss": 0.69085199, + "learning_rate": 8.664360495707526e-07, + "loss": 0.71211863, + "num_input_tokens_seen": 251521630, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 11660, + "time_per_iteration": 2.425694227218628 + }, + { + "auxiliary_loss_clip": 0.01102737, + "auxiliary_loss_mlp": 0.0103262, + "balance_loss_clip": 1.0202167, + "balance_loss_mlp": 1.03413391, + "epoch": 0.7010972493611904, + "flos": 22127581082880.0, + "grad_norm": 1.7015787806945502, + "language_loss": 0.80871427, + "learning_rate": 8.661152061168924e-07, + "loss": 0.83006787, + "num_input_tokens_seen": 251540105, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 11661, + "time_per_iteration": 2.4829437732696533 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01030814, + "balance_loss_clip": 1.01967359, + "balance_loss_mlp": 1.033602, + "epoch": 0.7011573726138585, + "flos": 31391132860800.0, + "grad_norm": 3.059809361896724, + "language_loss": 0.78862965, + "learning_rate": 8.657944056600579e-07, + "loss": 0.80994064, + "num_input_tokens_seen": 251560530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 11662, + "time_per_iteration": 2.5052289962768555 + }, + { + "auxiliary_loss_clip": 0.01102457, + "auxiliary_loss_mlp": 0.01023605, + "balance_loss_clip": 1.01188052, + "balance_loss_mlp": 1.03489375, + "epoch": 0.7012174958665264, + "flos": 18150582216960.0, + "grad_norm": 1.922970255639485, + "language_loss": 0.8358953, + "learning_rate": 8.654736482124134e-07, + "loss": 0.85715592, + "num_input_tokens_seen": 251577930, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 11663, + "time_per_iteration": 2.4594573974609375 + }, + { + "auxiliary_loss_clip": 0.0102523, + "auxiliary_loss_mlp": 0.01007606, + "balance_loss_clip": 1.00651574, + "balance_loss_mlp": 1.00453377, + "epoch": 0.7012776191191944, + "flos": 60651256567680.0, + "grad_norm": 0.8204387591217913, + "language_loss": 0.53774929, + "learning_rate": 8.651529337861209e-07, + "loss": 0.55807763, + "num_input_tokens_seen": 251638820, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20703125, + "step": 11664, + "time_per_iteration": 3.0331904888153076 + }, + { + "auxiliary_loss_clip": 0.01104897, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01900291, + "balance_loss_mlp": 1.03650737, + "epoch": 0.7013377423718623, + "flos": 27198598435200.0, + "grad_norm": 2.4272507893526143, + "language_loss": 0.78843081, + "learning_rate": 8.64832262393344e-07, + "loss": 0.80978715, + "num_input_tokens_seen": 251658070, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.68359375, + "step": 11665, + "time_per_iteration": 2.4934439659118652 + }, + { + "auxiliary_loss_clip": 0.01099902, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01650083, + "balance_loss_mlp": 1.03361416, + "epoch": 0.7013978656245303, + "flos": 16543543731840.0, + "grad_norm": 2.269849765653923, + "language_loss": 0.77034938, + "learning_rate": 8.645116340462404e-07, + "loss": 0.79162872, + "num_input_tokens_seen": 251671575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11666, + "time_per_iteration": 2.4027786254882812 + }, + { + "auxiliary_loss_clip": 0.01101042, + "auxiliary_loss_mlp": 0.01026786, + "balance_loss_clip": 1.01623607, + "balance_loss_mlp": 1.0356462, + "epoch": 0.7014579888771982, + "flos": 23143780753920.0, + "grad_norm": 1.878568521742783, + "language_loss": 0.81238604, + "learning_rate": 8.641910487569695e-07, + "loss": 0.8336643, + "num_input_tokens_seen": 251689350, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 11667, + "time_per_iteration": 2.4780242443084717 + }, + { + "auxiliary_loss_clip": 0.0110046, + "auxiliary_loss_mlp": 0.01035616, + "balance_loss_clip": 1.0237546, + "balance_loss_mlp": 1.03487873, + "epoch": 0.7015181121298663, + "flos": 25082095397760.0, + "grad_norm": 2.0547760249868685, + "language_loss": 0.65335631, + "learning_rate": 8.638705065376879e-07, + "loss": 0.67471707, + "num_input_tokens_seen": 251704635, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11668, + "time_per_iteration": 2.447939395904541 + }, + { + "auxiliary_loss_clip": 0.01103124, + "auxiliary_loss_mlp": 0.01022731, + "balance_loss_clip": 1.01117384, + "balance_loss_mlp": 1.03469038, + "epoch": 0.7015782353825342, + "flos": 23327894891520.0, + "grad_norm": 2.272329624033439, + "language_loss": 0.76275986, + "learning_rate": 8.635500074005519e-07, + "loss": 0.78401846, + "num_input_tokens_seen": 251723035, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 11669, + "time_per_iteration": 2.4600765705108643 + }, + { + "auxiliary_loss_clip": 0.01025535, + "auxiliary_loss_mlp": 0.01006318, + "balance_loss_clip": 1.00525713, + "balance_loss_mlp": 1.00477183, + "epoch": 0.7016383586352022, + "flos": 70397161107840.0, + "grad_norm": 0.6922095034682588, + "language_loss": 0.54468822, + "learning_rate": 8.632295513577122e-07, + "loss": 0.56500673, + "num_input_tokens_seen": 251791630, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20703125, + "step": 11670, + "time_per_iteration": 3.1504855155944824 + }, + { + "auxiliary_loss_clip": 0.01100438, + "auxiliary_loss_mlp": 0.01031818, + "balance_loss_clip": 1.0203141, + "balance_loss_mlp": 1.03460622, + "epoch": 0.7016984818878701, + "flos": 19792274348160.0, + "grad_norm": 1.9909569240580678, + "language_loss": 0.81605625, + "learning_rate": 8.629091384213218e-07, + "loss": 0.83737886, + "num_input_tokens_seen": 251809840, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11671, + "time_per_iteration": 2.475792169570923 + }, + { + "auxiliary_loss_clip": 0.011038, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.01734638, + "balance_loss_mlp": 1.03691864, + "epoch": 0.7017586051405381, + "flos": 12896923184640.0, + "grad_norm": 2.023044603900928, + "language_loss": 0.75000024, + "learning_rate": 8.625887686035313e-07, + "loss": 0.77132618, + "num_input_tokens_seen": 251827550, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 11672, + "time_per_iteration": 2.4228410720825195 + }, + { + "auxiliary_loss_clip": 0.0110057, + "auxiliary_loss_mlp": 0.01030144, + "balance_loss_clip": 1.01794934, + "balance_loss_mlp": 1.0343281, + "epoch": 0.701818728393206, + "flos": 18332828847360.0, + "grad_norm": 1.708219397381251, + "language_loss": 0.87053084, + "learning_rate": 8.622684419164883e-07, + "loss": 0.89183801, + "num_input_tokens_seen": 251844880, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 11673, + "time_per_iteration": 2.4504873752593994 + }, + { + "auxiliary_loss_clip": 0.01098005, + "auxiliary_loss_mlp": 0.01024449, + "balance_loss_clip": 1.01308239, + "balance_loss_mlp": 1.0342052, + "epoch": 0.701878851645874, + "flos": 17384212615680.0, + "grad_norm": 2.1494737009789935, + "language_loss": 0.72768337, + "learning_rate": 8.619481583723399e-07, + "loss": 0.74890792, + "num_input_tokens_seen": 251861025, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 11674, + "time_per_iteration": 2.397975444793701 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.01708126, + "balance_loss_mlp": 1.03694451, + "epoch": 0.701938974898542, + "flos": 23915501481600.0, + "grad_norm": 1.5674244409742963, + "language_loss": 0.72100163, + "learning_rate": 8.616279179832329e-07, + "loss": 0.74228311, + "num_input_tokens_seen": 251880175, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 11675, + "time_per_iteration": 2.4895689487457275 + }, + { + "auxiliary_loss_clip": 0.01102681, + "auxiliary_loss_mlp": 0.01024344, + "balance_loss_clip": 1.01256597, + "balance_loss_mlp": 1.03593993, + "epoch": 0.70199909815121, + "flos": 21795586652160.0, + "grad_norm": 2.517132712975458, + "language_loss": 0.50993675, + "learning_rate": 8.613077207613078e-07, + "loss": 0.53120697, + "num_input_tokens_seen": 251899005, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11676, + "time_per_iteration": 2.4392223358154297 + }, + { + "auxiliary_loss_clip": 0.01024806, + "auxiliary_loss_mlp": 0.01002084, + "balance_loss_clip": 1.00087988, + "balance_loss_mlp": 1.00406504, + "epoch": 0.702059221403878, + "flos": 71715047109120.0, + "grad_norm": 0.7321379163768023, + "language_loss": 0.59195387, + "learning_rate": 8.609875667187079e-07, + "loss": 0.61222279, + "num_input_tokens_seen": 251966790, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.20703125, + "step": 11677, + "time_per_iteration": 3.125434398651123 + }, + { + "auxiliary_loss_clip": 0.01103207, + "auxiliary_loss_mlp": 0.01026564, + "balance_loss_clip": 1.01498294, + "balance_loss_mlp": 1.03543353, + "epoch": 0.7021193446565459, + "flos": 28111052649600.0, + "grad_norm": 2.2320710813331304, + "language_loss": 0.62693989, + "learning_rate": 8.606674558675737e-07, + "loss": 0.64823759, + "num_input_tokens_seen": 251989315, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 11678, + "time_per_iteration": 4.006704330444336 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01032338, + "balance_loss_clip": 1.02100134, + "balance_loss_mlp": 1.0344584, + "epoch": 0.7021794679092139, + "flos": 22924905229440.0, + "grad_norm": 1.8460467241007361, + "language_loss": 0.79242504, + "learning_rate": 8.603473882200444e-07, + "loss": 0.81375194, + "num_input_tokens_seen": 252006620, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11679, + "time_per_iteration": 2.4555304050445557 + }, + { + "auxiliary_loss_clip": 0.01101096, + "auxiliary_loss_mlp": 0.01035801, + "balance_loss_clip": 1.02535808, + "balance_loss_mlp": 1.03703773, + "epoch": 0.7022395911618818, + "flos": 18077827219200.0, + "grad_norm": 2.331847817004221, + "language_loss": 0.70253718, + "learning_rate": 8.600273637882567e-07, + "loss": 0.7239061, + "num_input_tokens_seen": 252024570, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 11680, + "time_per_iteration": 3.8396050930023193 + }, + { + "auxiliary_loss_clip": 0.01105234, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01941895, + "balance_loss_mlp": 1.03682303, + "epoch": 0.7022997144145499, + "flos": 16034294661120.0, + "grad_norm": 1.6980564631311013, + "language_loss": 0.74690676, + "learning_rate": 8.597073825843446e-07, + "loss": 0.76827282, + "num_input_tokens_seen": 252042775, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 11681, + "time_per_iteration": 5.31316614151001 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.01963735, + "balance_loss_mlp": 1.03458714, + "epoch": 0.7023598376672178, + "flos": 26468678160000.0, + "grad_norm": 1.4988427000417734, + "language_loss": 0.76605582, + "learning_rate": 8.593874446204434e-07, + "loss": 0.78736782, + "num_input_tokens_seen": 252063690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6640625, + "step": 11682, + "time_per_iteration": 2.4792110919952393 + }, + { + "auxiliary_loss_clip": 0.01103891, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.02231503, + "balance_loss_mlp": 1.03589272, + "epoch": 0.7024199609198858, + "flos": 17055917285760.0, + "grad_norm": 1.8311880743600102, + "language_loss": 0.73361951, + "learning_rate": 8.590675499086841e-07, + "loss": 0.75499648, + "num_input_tokens_seen": 252080335, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 11683, + "time_per_iteration": 2.434879779815674 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01028117, + "balance_loss_clip": 1.01577854, + "balance_loss_mlp": 1.03725612, + "epoch": 0.7024800841725537, + "flos": 25849039616640.0, + "grad_norm": 1.7668169003154093, + "language_loss": 0.71169794, + "learning_rate": 8.587476984611976e-07, + "loss": 0.73300993, + "num_input_tokens_seen": 252101075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 11684, + "time_per_iteration": 2.486572742462158 + }, + { + "auxiliary_loss_clip": 0.01101245, + "auxiliary_loss_mlp": 0.01031543, + "balance_loss_clip": 1.01969957, + "balance_loss_mlp": 1.03529143, + "epoch": 0.7025402074252217, + "flos": 23513014609920.0, + "grad_norm": 1.8432235400728463, + "language_loss": 0.72046304, + "learning_rate": 8.584278902901128e-07, + "loss": 0.74179095, + "num_input_tokens_seen": 252120510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 11685, + "time_per_iteration": 2.5009102821350098 + }, + { + "auxiliary_loss_clip": 0.01101202, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.01923084, + "balance_loss_mlp": 1.03449953, + "epoch": 0.7026003306778896, + "flos": 20150985519360.0, + "grad_norm": 1.7057605239318525, + "language_loss": 0.84865069, + "learning_rate": 8.581081254075582e-07, + "loss": 0.86996263, + "num_input_tokens_seen": 252137590, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66796875, + "step": 11686, + "time_per_iteration": 2.447744846343994 + }, + { + "auxiliary_loss_clip": 0.01025709, + "auxiliary_loss_mlp": 0.01003132, + "balance_loss_clip": 1.00203538, + "balance_loss_mlp": 1.00512934, + "epoch": 0.7026604539305576, + "flos": 64772400712320.0, + "grad_norm": 0.988856355007654, + "language_loss": 0.69923353, + "learning_rate": 8.577884038256566e-07, + "loss": 0.71952194, + "num_input_tokens_seen": 252199830, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20605469, + "step": 11687, + "time_per_iteration": 3.1910674571990967 + }, + { + "auxiliary_loss_clip": 0.01103018, + "auxiliary_loss_mlp": 0.01027664, + "balance_loss_clip": 1.0161128, + "balance_loss_mlp": 1.03627849, + "epoch": 0.7027205771832256, + "flos": 21871466133120.0, + "grad_norm": 2.17247822122661, + "language_loss": 0.77656871, + "learning_rate": 8.574687255565329e-07, + "loss": 0.79787552, + "num_input_tokens_seen": 252217200, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11688, + "time_per_iteration": 2.472559928894043 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01031189, + "balance_loss_clip": 1.01951802, + "balance_loss_mlp": 1.0350461, + "epoch": 0.7027807004358936, + "flos": 23367791923200.0, + "grad_norm": 2.0685575537033207, + "language_loss": 0.68521178, + "learning_rate": 8.571490906123107e-07, + "loss": 0.70653796, + "num_input_tokens_seen": 252236105, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11689, + "time_per_iteration": 2.4660775661468506 + }, + { + "auxiliary_loss_clip": 0.01103667, + "auxiliary_loss_mlp": 0.01035824, + "balance_loss_clip": 1.02360475, + "balance_loss_mlp": 1.03517842, + "epoch": 0.7028408236885616, + "flos": 15304266645120.0, + "grad_norm": 2.110320581130951, + "language_loss": 0.79499185, + "learning_rate": 8.568294990051086e-07, + "loss": 0.81638682, + "num_input_tokens_seen": 252253315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11690, + "time_per_iteration": 2.510883331298828 + }, + { + "auxiliary_loss_clip": 0.01102324, + "auxiliary_loss_mlp": 0.01031703, + "balance_loss_clip": 1.01994324, + "balance_loss_mlp": 1.03600478, + "epoch": 0.7029009469412295, + "flos": 22018197191040.0, + "grad_norm": 1.5848883111705174, + "language_loss": 0.76091731, + "learning_rate": 8.56509950747047e-07, + "loss": 0.78225756, + "num_input_tokens_seen": 252272765, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 11691, + "time_per_iteration": 2.4371836185455322 + }, + { + "auxiliary_loss_clip": 0.01102138, + "auxiliary_loss_mlp": 0.01024652, + "balance_loss_clip": 1.01367295, + "balance_loss_mlp": 1.03720069, + "epoch": 0.7029610701938975, + "flos": 21835519597440.0, + "grad_norm": 1.7363845404220049, + "language_loss": 0.81481391, + "learning_rate": 8.561904458502429e-07, + "loss": 0.8360818, + "num_input_tokens_seen": 252290510, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 11692, + "time_per_iteration": 2.501248359680176 + }, + { + "auxiliary_loss_clip": 0.01099945, + "auxiliary_loss_mlp": 0.0102586, + "balance_loss_clip": 1.01414728, + "balance_loss_mlp": 1.03468466, + "epoch": 0.7030211934465654, + "flos": 19135647774720.0, + "grad_norm": 1.5395445178386533, + "language_loss": 0.76162529, + "learning_rate": 8.558709843268111e-07, + "loss": 0.78288329, + "num_input_tokens_seen": 252309365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 11693, + "time_per_iteration": 2.452014923095703 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.01904464, + "balance_loss_mlp": 1.03672361, + "epoch": 0.7030813166992335, + "flos": 38546010766080.0, + "grad_norm": 1.51123653242133, + "language_loss": 0.68433905, + "learning_rate": 8.55551566188866e-07, + "loss": 0.70565528, + "num_input_tokens_seen": 252333010, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 11694, + "time_per_iteration": 2.6905438899993896 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.01921415, + "balance_loss_mlp": 1.03518784, + "epoch": 0.7031414399519014, + "flos": 14720897859840.0, + "grad_norm": 2.685426816457134, + "language_loss": 0.75926757, + "learning_rate": 8.552321914485203e-07, + "loss": 0.78059149, + "num_input_tokens_seen": 252351330, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 11695, + "time_per_iteration": 2.4197287559509277 + }, + { + "auxiliary_loss_clip": 0.01104949, + "auxiliary_loss_mlp": 0.0103903, + "balance_loss_clip": 1.02692449, + "balance_loss_mlp": 1.03704011, + "epoch": 0.7032015632045694, + "flos": 14027247342720.0, + "grad_norm": 2.1895380825721595, + "language_loss": 0.73749006, + "learning_rate": 8.549128601178852e-07, + "loss": 0.75892979, + "num_input_tokens_seen": 252369580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11696, + "time_per_iteration": 2.438162088394165 + }, + { + "auxiliary_loss_clip": 0.01102914, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01507568, + "balance_loss_mlp": 1.03577912, + "epoch": 0.7032616864572373, + "flos": 27637175496960.0, + "grad_norm": 1.6020001034841755, + "language_loss": 0.75352108, + "learning_rate": 8.545935722090693e-07, + "loss": 0.77482289, + "num_input_tokens_seen": 252390525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11697, + "time_per_iteration": 2.50844669342041 + }, + { + "auxiliary_loss_clip": 0.01107405, + "auxiliary_loss_mlp": 0.01033634, + "balance_loss_clip": 1.02019286, + "balance_loss_mlp": 1.03933907, + "epoch": 0.7033218097099053, + "flos": 17967294092160.0, + "grad_norm": 1.763301186005729, + "language_loss": 0.8075971, + "learning_rate": 8.542743277341793e-07, + "loss": 0.82900751, + "num_input_tokens_seen": 252407470, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6796875, + "step": 11698, + "time_per_iteration": 2.4794504642486572 + }, + { + "auxiliary_loss_clip": 0.01102647, + "auxiliary_loss_mlp": 0.01031888, + "balance_loss_clip": 1.01978219, + "balance_loss_mlp": 1.03481781, + "epoch": 0.7033819329625732, + "flos": 19501721233920.0, + "grad_norm": 1.4318828234621686, + "language_loss": 0.84606147, + "learning_rate": 8.539551267053222e-07, + "loss": 0.86740685, + "num_input_tokens_seen": 252427025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 11699, + "time_per_iteration": 2.466271162033081 + }, + { + "auxiliary_loss_clip": 0.01102469, + "auxiliary_loss_mlp": 0.01028448, + "balance_loss_clip": 1.01603246, + "balance_loss_mlp": 1.03670907, + "epoch": 0.7034420562152413, + "flos": 23987645948160.0, + "grad_norm": 2.1706968176821326, + "language_loss": 0.79156339, + "learning_rate": 8.53635969134601e-07, + "loss": 0.81287259, + "num_input_tokens_seen": 252445410, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 11700, + "time_per_iteration": 2.4769561290740967 + }, + { + "auxiliary_loss_clip": 0.01102749, + "auxiliary_loss_mlp": 0.01023696, + "balance_loss_clip": 1.01164412, + "balance_loss_mlp": 1.0352428, + "epoch": 0.7035021794679092, + "flos": 35043427756800.0, + "grad_norm": 1.698709640635861, + "language_loss": 0.74290204, + "learning_rate": 8.533168550341186e-07, + "loss": 0.76416653, + "num_input_tokens_seen": 252463905, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 11701, + "time_per_iteration": 2.5410683155059814 + }, + { + "auxiliary_loss_clip": 0.01105173, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.01449096, + "balance_loss_mlp": 1.03693128, + "epoch": 0.7035623027205772, + "flos": 10997428164480.0, + "grad_norm": 2.241875664618386, + "language_loss": 0.83804989, + "learning_rate": 8.529977844159769e-07, + "loss": 0.8593747, + "num_input_tokens_seen": 252478655, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 11702, + "time_per_iteration": 2.4136838912963867 + }, + { + "auxiliary_loss_clip": 0.01102777, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01974845, + "balance_loss_mlp": 1.03585792, + "epoch": 0.7036224259732452, + "flos": 23623727304960.0, + "grad_norm": 17.73315944125735, + "language_loss": 0.60806382, + "learning_rate": 8.526787572922738e-07, + "loss": 0.62940544, + "num_input_tokens_seen": 252498740, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 11703, + "time_per_iteration": 2.4728925228118896 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.01538706, + "balance_loss_mlp": 1.03344357, + "epoch": 0.7036825492259131, + "flos": 31686175175040.0, + "grad_norm": 1.86622111466138, + "language_loss": 0.60721993, + "learning_rate": 8.523597736751067e-07, + "loss": 0.62849051, + "num_input_tokens_seen": 252517800, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11704, + "time_per_iteration": 2.5538487434387207 + }, + { + "auxiliary_loss_clip": 0.01097343, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.02000296, + "balance_loss_mlp": 1.03398025, + "epoch": 0.7037426724785811, + "flos": 30192866127360.0, + "grad_norm": 1.6367819423893837, + "language_loss": 0.70355535, + "learning_rate": 8.520408335765719e-07, + "loss": 0.72483432, + "num_input_tokens_seen": 252539620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 11705, + "time_per_iteration": 2.5196011066436768 + }, + { + "auxiliary_loss_clip": 0.01100052, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.01636624, + "balance_loss_mlp": 1.03497076, + "epoch": 0.703802795731249, + "flos": 24311523905280.0, + "grad_norm": 2.637724615159266, + "language_loss": 0.61509889, + "learning_rate": 8.517219370087645e-07, + "loss": 0.63637763, + "num_input_tokens_seen": 252557300, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 11706, + "time_per_iteration": 2.4852991104125977 + }, + { + "auxiliary_loss_clip": 0.01102393, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01521683, + "balance_loss_mlp": 1.03553593, + "epoch": 0.7038629189839171, + "flos": 22528954632960.0, + "grad_norm": 2.2484984676875563, + "language_loss": 0.68121183, + "learning_rate": 8.514030839837756e-07, + "loss": 0.70250034, + "num_input_tokens_seen": 252576715, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11707, + "time_per_iteration": 2.4560024738311768 + }, + { + "auxiliary_loss_clip": 0.01097433, + "auxiliary_loss_mlp": 0.01027441, + "balance_loss_clip": 1.0162648, + "balance_loss_mlp": 1.03335011, + "epoch": 0.703923042236585, + "flos": 26250484993920.0, + "grad_norm": 1.7446259905587083, + "language_loss": 0.76487923, + "learning_rate": 8.510842745136974e-07, + "loss": 0.78612804, + "num_input_tokens_seen": 252596190, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 11708, + "time_per_iteration": 2.515327215194702 + }, + { + "auxiliary_loss_clip": 0.01099228, + "auxiliary_loss_mlp": 0.01024966, + "balance_loss_clip": 1.01421952, + "balance_loss_mlp": 1.03512418, + "epoch": 0.703983165489253, + "flos": 19390254353280.0, + "grad_norm": 1.893368388386225, + "language_loss": 0.72055292, + "learning_rate": 8.50765508610619e-07, + "loss": 0.74179482, + "num_input_tokens_seen": 252613410, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 11709, + "time_per_iteration": 2.431182384490967 + }, + { + "auxiliary_loss_clip": 0.01099189, + "auxiliary_loss_mlp": 0.01025216, + "balance_loss_clip": 1.01399827, + "balance_loss_mlp": 1.03375983, + "epoch": 0.7040432887419209, + "flos": 16683630773760.0, + "grad_norm": 2.079430411231168, + "language_loss": 0.79054451, + "learning_rate": 8.504467862866267e-07, + "loss": 0.81178856, + "num_input_tokens_seen": 252629150, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 11710, + "time_per_iteration": 2.3997299671173096 + }, + { + "auxiliary_loss_clip": 0.01104493, + "auxiliary_loss_mlp": 0.01030463, + "balance_loss_clip": 1.01852989, + "balance_loss_mlp": 1.03760147, + "epoch": 0.7041034119945889, + "flos": 21141402203520.0, + "grad_norm": 1.6049139638931622, + "language_loss": 0.77447236, + "learning_rate": 8.501281075538076e-07, + "loss": 0.79582191, + "num_input_tokens_seen": 252648225, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 11711, + "time_per_iteration": 2.4934744834899902 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01024434, + "balance_loss_clip": 1.01442647, + "balance_loss_mlp": 1.03375506, + "epoch": 0.7041635352472568, + "flos": 16910299549440.0, + "grad_norm": 2.37459605810246, + "language_loss": 0.73933756, + "learning_rate": 8.498094724242457e-07, + "loss": 0.76056558, + "num_input_tokens_seen": 252665380, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.64453125, + "step": 11712, + "time_per_iteration": 2.4414384365081787 + }, + { + "auxiliary_loss_clip": 0.01025006, + "auxiliary_loss_mlp": 0.01006413, + "balance_loss_clip": 1.00531662, + "balance_loss_mlp": 1.00448298, + "epoch": 0.7042236584999249, + "flos": 71681219475840.0, + "grad_norm": 0.8819337057085826, + "language_loss": 0.64707136, + "learning_rate": 8.494908809100247e-07, + "loss": 0.66738558, + "num_input_tokens_seen": 252727950, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20507812, + "step": 11713, + "time_per_iteration": 3.1559205055236816 + }, + { + "auxiliary_loss_clip": 0.0109808, + "auxiliary_loss_mlp": 0.01024348, + "balance_loss_clip": 1.01370883, + "balance_loss_mlp": 1.03258777, + "epoch": 0.7042837817525928, + "flos": 28658187590400.0, + "grad_norm": 2.337022160062714, + "language_loss": 0.72537225, + "learning_rate": 8.49172333023225e-07, + "loss": 0.74659657, + "num_input_tokens_seen": 252746770, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11714, + "time_per_iteration": 2.5274534225463867 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.01889992, + "balance_loss_mlp": 1.03500628, + "epoch": 0.7043439050052608, + "flos": 19753562465280.0, + "grad_norm": 1.5791768588768047, + "language_loss": 0.79251838, + "learning_rate": 8.488538287759248e-07, + "loss": 0.81382746, + "num_input_tokens_seen": 252765610, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 11715, + "time_per_iteration": 2.423422336578369 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.01031536, + "balance_loss_clip": 1.02010405, + "balance_loss_mlp": 1.03536105, + "epoch": 0.7044040282579288, + "flos": 11538529620480.0, + "grad_norm": 2.2156697071751204, + "language_loss": 0.71082246, + "learning_rate": 8.485353681802037e-07, + "loss": 0.73216307, + "num_input_tokens_seen": 252781610, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11716, + "time_per_iteration": 2.407350540161133 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.0173167, + "balance_loss_mlp": 1.03666377, + "epoch": 0.7044641515105967, + "flos": 33656126722560.0, + "grad_norm": 1.9148933155218295, + "language_loss": 0.66782308, + "learning_rate": 8.482169512481358e-07, + "loss": 0.68916631, + "num_input_tokens_seen": 252800600, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6875, + "step": 11717, + "time_per_iteration": 2.525740146636963 + }, + { + "auxiliary_loss_clip": 0.01102186, + "auxiliary_loss_mlp": 0.01028377, + "balance_loss_clip": 1.01697445, + "balance_loss_mlp": 1.03591442, + "epoch": 0.7045242747632647, + "flos": 26723859356160.0, + "grad_norm": 1.4782257349417278, + "language_loss": 0.7415244, + "learning_rate": 8.478985779917967e-07, + "loss": 0.76283002, + "num_input_tokens_seen": 252822310, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11718, + "time_per_iteration": 2.5084335803985596 + }, + { + "auxiliary_loss_clip": 0.01100672, + "auxiliary_loss_mlp": 0.01031212, + "balance_loss_clip": 1.02055478, + "balance_loss_mlp": 1.03563166, + "epoch": 0.7045843980159326, + "flos": 26797655848320.0, + "grad_norm": 1.542276447013311, + "language_loss": 0.79529881, + "learning_rate": 8.475802484232606e-07, + "loss": 0.81661767, + "num_input_tokens_seen": 252842355, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 11719, + "time_per_iteration": 3.982532024383545 + }, + { + "auxiliary_loss_clip": 0.011017, + "auxiliary_loss_mlp": 0.01032857, + "balance_loss_clip": 1.02105546, + "balance_loss_mlp": 1.03649902, + "epoch": 0.7046445212686007, + "flos": 41574824363520.0, + "grad_norm": 1.7315117799773545, + "language_loss": 0.65495813, + "learning_rate": 8.472619625545951e-07, + "loss": 0.67630363, + "num_input_tokens_seen": 252866785, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11720, + "time_per_iteration": 2.613939046859741 + }, + { + "auxiliary_loss_clip": 0.01103943, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.01631165, + "balance_loss_mlp": 1.03645182, + "epoch": 0.7047046445212686, + "flos": 15560166113280.0, + "grad_norm": 2.050842345880835, + "language_loss": 0.79890549, + "learning_rate": 8.46943720397872e-07, + "loss": 0.82022321, + "num_input_tokens_seen": 252881870, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 11721, + "time_per_iteration": 3.8472952842712402 + }, + { + "auxiliary_loss_clip": 0.01025354, + "auxiliary_loss_mlp": 0.0100049, + "balance_loss_clip": 0.99931604, + "balance_loss_mlp": 1.00471067, + "epoch": 0.7047647677739366, + "flos": 70410269571840.0, + "grad_norm": 0.7603746797437617, + "language_loss": 0.64777911, + "learning_rate": 8.466255219651582e-07, + "loss": 0.66803753, + "num_input_tokens_seen": 252951300, + "router_z_loss_clip": 0.01171875, + "router_z_loss_mlp": 0.20703125, + "step": 11722, + "time_per_iteration": 4.5988264083862305 + }, + { + "auxiliary_loss_clip": 0.01101223, + "auxiliary_loss_mlp": 0.01031226, + "balance_loss_clip": 1.02053356, + "balance_loss_mlp": 1.03678107, + "epoch": 0.7048248910266045, + "flos": 23660032976640.0, + "grad_norm": 1.538856016334547, + "language_loss": 0.65742815, + "learning_rate": 8.463073672685211e-07, + "loss": 0.67875266, + "num_input_tokens_seen": 252971400, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.64453125, + "step": 11723, + "time_per_iteration": 3.925845146179199 + }, + { + "auxiliary_loss_clip": 0.01103786, + "auxiliary_loss_mlp": 0.01027901, + "balance_loss_clip": 1.01623046, + "balance_loss_mlp": 1.03655779, + "epoch": 0.7048850142792725, + "flos": 21397158017280.0, + "grad_norm": 1.8916483795909507, + "language_loss": 0.81127882, + "learning_rate": 8.459892563200235e-07, + "loss": 0.83259565, + "num_input_tokens_seen": 252989475, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11724, + "time_per_iteration": 2.4720969200134277 + }, + { + "auxiliary_loss_clip": 0.01100772, + "auxiliary_loss_mlp": 0.01034144, + "balance_loss_clip": 1.02252126, + "balance_loss_mlp": 1.03349257, + "epoch": 0.7049451375319404, + "flos": 21648101408640.0, + "grad_norm": 2.093101088286717, + "language_loss": 0.72902447, + "learning_rate": 8.456711891317296e-07, + "loss": 0.75037366, + "num_input_tokens_seen": 253007220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11725, + "time_per_iteration": 2.4452946186065674 + }, + { + "auxiliary_loss_clip": 0.01103396, + "auxiliary_loss_mlp": 0.01029478, + "balance_loss_clip": 1.01771188, + "balance_loss_mlp": 1.03560305, + "epoch": 0.7050052607846085, + "flos": 14866802904960.0, + "grad_norm": 2.2997258543703847, + "language_loss": 0.78231096, + "learning_rate": 8.453531657156998e-07, + "loss": 0.80363971, + "num_input_tokens_seen": 253025410, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 11726, + "time_per_iteration": 2.4585561752319336 + }, + { + "auxiliary_loss_clip": 0.0110172, + "auxiliary_loss_mlp": 0.01028095, + "balance_loss_clip": 1.01683593, + "balance_loss_mlp": 1.0345757, + "epoch": 0.7050653840372764, + "flos": 19241763528960.0, + "grad_norm": 1.8306322081887336, + "language_loss": 0.70494819, + "learning_rate": 8.450351860839931e-07, + "loss": 0.72624636, + "num_input_tokens_seen": 253043305, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11727, + "time_per_iteration": 2.4121358394622803 + }, + { + "auxiliary_loss_clip": 0.01093352, + "auxiliary_loss_mlp": 0.01023523, + "balance_loss_clip": 1.01340246, + "balance_loss_mlp": 1.03211212, + "epoch": 0.7051255072899444, + "flos": 27780422935680.0, + "grad_norm": 1.6678218850336868, + "language_loss": 0.69096273, + "learning_rate": 8.44717250248668e-07, + "loss": 0.7121315, + "num_input_tokens_seen": 253062790, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.61328125, + "step": 11728, + "time_per_iteration": 2.5468525886535645 + }, + { + "auxiliary_loss_clip": 0.0110237, + "auxiliary_loss_mlp": 0.01029013, + "balance_loss_clip": 1.01771235, + "balance_loss_mlp": 1.03713453, + "epoch": 0.7051856305426124, + "flos": 27892033470720.0, + "grad_norm": 3.1019246116397774, + "language_loss": 0.73087037, + "learning_rate": 8.443993582217803e-07, + "loss": 0.75218427, + "num_input_tokens_seen": 253082055, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 11729, + "time_per_iteration": 2.4827933311462402 + }, + { + "auxiliary_loss_clip": 0.01106229, + "auxiliary_loss_mlp": 0.01033191, + "balance_loss_clip": 1.02045989, + "balance_loss_mlp": 1.03594112, + "epoch": 0.7052457537952803, + "flos": 25043563082880.0, + "grad_norm": 1.545567199994104, + "language_loss": 0.77897024, + "learning_rate": 8.440815100153862e-07, + "loss": 0.80036438, + "num_input_tokens_seen": 253102575, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 11730, + "time_per_iteration": 2.493704080581665 + }, + { + "auxiliary_loss_clip": 0.0110104, + "auxiliary_loss_mlp": 0.01030193, + "balance_loss_clip": 1.01871312, + "balance_loss_mlp": 1.03360641, + "epoch": 0.7053058770479483, + "flos": 21871717528320.0, + "grad_norm": 2.123896450725626, + "language_loss": 0.62706244, + "learning_rate": 8.437637056415359e-07, + "loss": 0.64837468, + "num_input_tokens_seen": 253121290, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11731, + "time_per_iteration": 2.459735631942749 + }, + { + "auxiliary_loss_clip": 0.01103723, + "auxiliary_loss_mlp": 0.01025352, + "balance_loss_clip": 1.01358604, + "balance_loss_mlp": 1.03539586, + "epoch": 0.7053660003006162, + "flos": 16398716094720.0, + "grad_norm": 2.3898643418724888, + "language_loss": 0.74733448, + "learning_rate": 8.434459451122815e-07, + "loss": 0.76862514, + "num_input_tokens_seen": 253139720, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 11732, + "time_per_iteration": 2.4383316040039062 + }, + { + "auxiliary_loss_clip": 0.01100804, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.0146327, + "balance_loss_mlp": 1.03631091, + "epoch": 0.7054261235532843, + "flos": 22711560399360.0, + "grad_norm": 1.6140204941030658, + "language_loss": 0.70913476, + "learning_rate": 8.431282284396735e-07, + "loss": 0.73040134, + "num_input_tokens_seen": 253160250, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 11733, + "time_per_iteration": 2.463106632232666 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.01030378, + "balance_loss_clip": 1.01916051, + "balance_loss_mlp": 1.0332557, + "epoch": 0.7054862468059522, + "flos": 13589711775360.0, + "grad_norm": 1.8693202683913837, + "language_loss": 0.73223364, + "learning_rate": 8.428105556357583e-07, + "loss": 0.75351965, + "num_input_tokens_seen": 253178710, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 11734, + "time_per_iteration": 2.44874906539917 + }, + { + "auxiliary_loss_clip": 0.01105433, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02211308, + "balance_loss_mlp": 1.03561354, + "epoch": 0.7055463700586202, + "flos": 15880704105600.0, + "grad_norm": 2.1460182030345423, + "language_loss": 0.69040471, + "learning_rate": 8.424929267125829e-07, + "loss": 0.71180052, + "num_input_tokens_seen": 253194805, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 11735, + "time_per_iteration": 2.3848354816436768 + }, + { + "auxiliary_loss_clip": 0.01103108, + "auxiliary_loss_mlp": 0.01034516, + "balance_loss_clip": 1.02173638, + "balance_loss_mlp": 1.03526986, + "epoch": 0.7056064933112881, + "flos": 23076161400960.0, + "grad_norm": 2.0775841009488105, + "language_loss": 0.72464728, + "learning_rate": 8.421753416821933e-07, + "loss": 0.74602348, + "num_input_tokens_seen": 253213895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 11736, + "time_per_iteration": 2.4738998413085938 + }, + { + "auxiliary_loss_clip": 0.01101906, + "auxiliary_loss_mlp": 0.01022502, + "balance_loss_clip": 1.01198161, + "balance_loss_mlp": 1.03716493, + "epoch": 0.7056666165639561, + "flos": 24057168721920.0, + "grad_norm": 1.8965770447194195, + "language_loss": 0.69242585, + "learning_rate": 8.41857800556629e-07, + "loss": 0.71366996, + "num_input_tokens_seen": 253231620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11737, + "time_per_iteration": 2.486891031265259 + }, + { + "auxiliary_loss_clip": 0.01104553, + "auxiliary_loss_mlp": 0.01036427, + "balance_loss_clip": 1.02426147, + "balance_loss_mlp": 1.03642035, + "epoch": 0.705726739816624, + "flos": 17493237371520.0, + "grad_norm": 3.675344969023003, + "language_loss": 0.6783061, + "learning_rate": 8.415403033479332e-07, + "loss": 0.69971591, + "num_input_tokens_seen": 253249590, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 11738, + "time_per_iteration": 2.553422212600708 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01028831, + "balance_loss_clip": 1.01684439, + "balance_loss_mlp": 1.03525221, + "epoch": 0.7057868630692921, + "flos": 51350426472960.0, + "grad_norm": 1.822626738464323, + "language_loss": 0.75158858, + "learning_rate": 8.41222850068145e-07, + "loss": 0.77288795, + "num_input_tokens_seen": 253273870, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 11739, + "time_per_iteration": 2.7234206199645996 + }, + { + "auxiliary_loss_clip": 0.01096979, + "auxiliary_loss_mlp": 0.01024687, + "balance_loss_clip": 1.01327837, + "balance_loss_mlp": 1.03416896, + "epoch": 0.70584698632196, + "flos": 26102963836800.0, + "grad_norm": 1.6386606118434162, + "language_loss": 0.71622884, + "learning_rate": 8.409054407293032e-07, + "loss": 0.73744547, + "num_input_tokens_seen": 253293720, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 11740, + "time_per_iteration": 2.5212409496307373 + }, + { + "auxiliary_loss_clip": 0.01101026, + "auxiliary_loss_mlp": 0.01025615, + "balance_loss_clip": 1.01523209, + "balance_loss_mlp": 1.03545165, + "epoch": 0.705907109574628, + "flos": 21543134889600.0, + "grad_norm": 1.6725006196923968, + "language_loss": 0.81998235, + "learning_rate": 8.405880753434434e-07, + "loss": 0.84124875, + "num_input_tokens_seen": 253313700, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65625, + "step": 11741, + "time_per_iteration": 2.492769241333008 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.01662874, + "balance_loss_mlp": 1.03408957, + "epoch": 0.705967232827296, + "flos": 22710842127360.0, + "grad_norm": 3.596961466154263, + "language_loss": 0.78171599, + "learning_rate": 8.402707539225993e-07, + "loss": 0.80300617, + "num_input_tokens_seen": 253332425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11742, + "time_per_iteration": 2.4635274410247803 + }, + { + "auxiliary_loss_clip": 0.01105195, + "auxiliary_loss_mlp": 0.01028943, + "balance_loss_clip": 1.01668274, + "balance_loss_mlp": 1.03600883, + "epoch": 0.7060273560799639, + "flos": 28691225124480.0, + "grad_norm": 1.573979132261771, + "language_loss": 0.64315516, + "learning_rate": 8.39953476478805e-07, + "loss": 0.66449654, + "num_input_tokens_seen": 253353620, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 11743, + "time_per_iteration": 2.5026400089263916 + }, + { + "auxiliary_loss_clip": 0.01102792, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.01475716, + "balance_loss_mlp": 1.03465271, + "epoch": 0.7060874793326319, + "flos": 15706178899200.0, + "grad_norm": 2.3718798915613846, + "language_loss": 0.65446359, + "learning_rate": 8.396362430240902e-07, + "loss": 0.67576003, + "num_input_tokens_seen": 253370930, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 11744, + "time_per_iteration": 2.458536386489868 + }, + { + "auxiliary_loss_clip": 0.01099117, + "auxiliary_loss_mlp": 0.01030101, + "balance_loss_clip": 1.0185678, + "balance_loss_mlp": 1.03479218, + "epoch": 0.7061476025852998, + "flos": 21506757390720.0, + "grad_norm": 1.9180320114034342, + "language_loss": 0.6355719, + "learning_rate": 8.393190535704857e-07, + "loss": 0.65686405, + "num_input_tokens_seen": 253389810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 11745, + "time_per_iteration": 2.462301254272461 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.0102864, + "balance_loss_clip": 1.01734483, + "balance_loss_mlp": 1.03486073, + "epoch": 0.7062077258379679, + "flos": 28181832399360.0, + "grad_norm": 1.843467279794647, + "language_loss": 0.71770209, + "learning_rate": 8.390019081300188e-07, + "loss": 0.73900437, + "num_input_tokens_seen": 253408685, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 11746, + "time_per_iteration": 2.528543472290039 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.01030658, + "balance_loss_clip": 1.01882029, + "balance_loss_mlp": 1.03566575, + "epoch": 0.7062678490906358, + "flos": 27853680723840.0, + "grad_norm": 1.4097258428408725, + "language_loss": 0.79373205, + "learning_rate": 8.386848067147175e-07, + "loss": 0.81506121, + "num_input_tokens_seen": 253429685, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11747, + "time_per_iteration": 2.479778528213501 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.01698387, + "balance_loss_mlp": 1.03513098, + "epoch": 0.7063279723433038, + "flos": 23184862934400.0, + "grad_norm": 1.7869226712906443, + "language_loss": 0.65377176, + "learning_rate": 8.383677493366031e-07, + "loss": 0.67504573, + "num_input_tokens_seen": 253448260, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 11748, + "time_per_iteration": 2.4946584701538086 + }, + { + "auxiliary_loss_clip": 0.01101478, + "auxiliary_loss_mlp": 0.01034387, + "balance_loss_clip": 1.02267504, + "balance_loss_mlp": 1.03426147, + "epoch": 0.7063880955959717, + "flos": 20188655907840.0, + "grad_norm": 1.990623957456742, + "language_loss": 0.79503167, + "learning_rate": 8.380507360077003e-07, + "loss": 0.8163904, + "num_input_tokens_seen": 253467725, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 11749, + "time_per_iteration": 2.4612464904785156 + }, + { + "auxiliary_loss_clip": 0.01024671, + "auxiliary_loss_mlp": 0.010023, + "balance_loss_clip": 1.0011971, + "balance_loss_mlp": 1.00396466, + "epoch": 0.7064482188486397, + "flos": 63668182763520.0, + "grad_norm": 0.788003911856545, + "language_loss": 0.54088426, + "learning_rate": 8.377337667400304e-07, + "loss": 0.56115395, + "num_input_tokens_seen": 253526940, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20703125, + "step": 11750, + "time_per_iteration": 2.998089075088501 + }, + { + "auxiliary_loss_clip": 0.01103221, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.01806545, + "balance_loss_mlp": 1.03667092, + "epoch": 0.7065083421013076, + "flos": 25191227894400.0, + "grad_norm": 2.4248797762244725, + "language_loss": 0.7843067, + "learning_rate": 8.37416841545612e-07, + "loss": 0.80563688, + "num_input_tokens_seen": 253546160, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11751, + "time_per_iteration": 2.4795496463775635 + }, + { + "auxiliary_loss_clip": 0.01096512, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01685357, + "balance_loss_mlp": 1.03329563, + "epoch": 0.7065684653539757, + "flos": 22893699288960.0, + "grad_norm": 1.7553518859924266, + "language_loss": 0.67958248, + "learning_rate": 8.370999604364634e-07, + "loss": 0.70082432, + "num_input_tokens_seen": 253565505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 11752, + "time_per_iteration": 2.4724245071411133 + }, + { + "auxiliary_loss_clip": 0.01100964, + "auxiliary_loss_mlp": 0.01034731, + "balance_loss_clip": 1.02317405, + "balance_loss_mlp": 1.03582311, + "epoch": 0.7066285886066436, + "flos": 23550254035200.0, + "grad_norm": 2.8550758527521567, + "language_loss": 0.76533222, + "learning_rate": 8.367831234246025e-07, + "loss": 0.78668916, + "num_input_tokens_seen": 253585125, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 11753, + "time_per_iteration": 2.5033509731292725 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01026978, + "balance_loss_clip": 1.01595759, + "balance_loss_mlp": 1.03566098, + "epoch": 0.7066887118593116, + "flos": 21069293650560.0, + "grad_norm": 1.8063663453491996, + "language_loss": 0.710163, + "learning_rate": 8.364663305220405e-07, + "loss": 0.73142445, + "num_input_tokens_seen": 253604815, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 11754, + "time_per_iteration": 2.47737979888916 + }, + { + "auxiliary_loss_clip": 0.01101217, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.02061772, + "balance_loss_mlp": 1.03515744, + "epoch": 0.7067488351119796, + "flos": 21176307244800.0, + "grad_norm": 1.555791916243094, + "language_loss": 0.89167392, + "learning_rate": 8.361495817407919e-07, + "loss": 0.91301078, + "num_input_tokens_seen": 253622855, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 11755, + "time_per_iteration": 2.4300765991210938 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.0210979, + "balance_loss_mlp": 1.03451729, + "epoch": 0.7068089583646475, + "flos": 20449224144000.0, + "grad_norm": 1.6305430191953068, + "language_loss": 0.79877228, + "learning_rate": 8.358328770928678e-07, + "loss": 0.82009578, + "num_input_tokens_seen": 253642760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 11756, + "time_per_iteration": 2.455738067626953 + }, + { + "auxiliary_loss_clip": 0.01025525, + "auxiliary_loss_mlp": 0.00998571, + "balance_loss_clip": 0.99742049, + "balance_loss_mlp": 1.00500059, + "epoch": 0.7068690816173155, + "flos": 59109179829120.0, + "grad_norm": 0.8167477619249136, + "language_loss": 0.60323715, + "learning_rate": 8.355162165902785e-07, + "loss": 0.62347817, + "num_input_tokens_seen": 253695685, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 11757, + "time_per_iteration": 2.8279542922973633 + }, + { + "auxiliary_loss_clip": 0.01103404, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.0194478, + "balance_loss_mlp": 1.03670585, + "epoch": 0.7069292048699835, + "flos": 16251554073600.0, + "grad_norm": 2.9383193028665335, + "language_loss": 0.80605227, + "learning_rate": 8.351996002450307e-07, + "loss": 0.82738924, + "num_input_tokens_seen": 253713305, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66796875, + "step": 11758, + "time_per_iteration": 2.438985824584961 + }, + { + "auxiliary_loss_clip": 0.01097896, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02111876, + "balance_loss_mlp": 1.03326845, + "epoch": 0.7069893281226515, + "flos": 41172768455040.0, + "grad_norm": 2.302594291056757, + "language_loss": 0.77111626, + "learning_rate": 8.348830280691304e-07, + "loss": 0.79242271, + "num_input_tokens_seen": 253736100, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 11759, + "time_per_iteration": 2.6082146167755127 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.01839471, + "balance_loss_mlp": 1.03407705, + "epoch": 0.7070494513753194, + "flos": 24207275658240.0, + "grad_norm": 1.8203560783968598, + "language_loss": 0.67900372, + "learning_rate": 8.34566500074583e-07, + "loss": 0.70030731, + "num_input_tokens_seen": 253757350, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 11760, + "time_per_iteration": 2.4875950813293457 + }, + { + "auxiliary_loss_clip": 0.01103064, + "auxiliary_loss_mlp": 0.01033498, + "balance_loss_clip": 1.02223873, + "balance_loss_mlp": 1.03625393, + "epoch": 0.7071095746279874, + "flos": 20185675079040.0, + "grad_norm": 1.8036620557159548, + "language_loss": 0.80104721, + "learning_rate": 8.342500162733899e-07, + "loss": 0.82241285, + "num_input_tokens_seen": 253772855, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 11761, + "time_per_iteration": 3.7999839782714844 + }, + { + "auxiliary_loss_clip": 0.01101999, + "auxiliary_loss_mlp": 0.01030339, + "balance_loss_clip": 1.01776791, + "balance_loss_mlp": 1.03520203, + "epoch": 0.7071696978806553, + "flos": 18183045133440.0, + "grad_norm": 2.4050467781095697, + "language_loss": 0.74975789, + "learning_rate": 8.33933576677553e-07, + "loss": 0.77108127, + "num_input_tokens_seen": 253790360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 11762, + "time_per_iteration": 2.408281087875366 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.0102852, + "balance_loss_clip": 1.01743984, + "balance_loss_mlp": 1.03630018, + "epoch": 0.7072298211333233, + "flos": 24131719399680.0, + "grad_norm": 1.750455145965042, + "language_loss": 0.76771009, + "learning_rate": 8.336171812990724e-07, + "loss": 0.78900343, + "num_input_tokens_seen": 253810585, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 11763, + "time_per_iteration": 3.8708484172821045 + }, + { + "auxiliary_loss_clip": 0.01101144, + "auxiliary_loss_mlp": 0.01033004, + "balance_loss_clip": 1.02082658, + "balance_loss_mlp": 1.03537869, + "epoch": 0.7072899443859912, + "flos": 27198418867200.0, + "grad_norm": 2.2813098001672527, + "language_loss": 0.78606045, + "learning_rate": 8.333008301499453e-07, + "loss": 0.8074019, + "num_input_tokens_seen": 253829080, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 11764, + "time_per_iteration": 3.926267623901367 + }, + { + "auxiliary_loss_clip": 0.0110389, + "auxiliary_loss_mlp": 0.01036608, + "balance_loss_clip": 1.02440739, + "balance_loss_mlp": 1.03585315, + "epoch": 0.7073500676386593, + "flos": 16435596384000.0, + "grad_norm": 1.4902481922059967, + "language_loss": 0.79271352, + "learning_rate": 8.32984523242167e-07, + "loss": 0.8141185, + "num_input_tokens_seen": 253846780, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 11765, + "time_per_iteration": 3.9003517627716064 + }, + { + "auxiliary_loss_clip": 0.01097952, + "auxiliary_loss_mlp": 0.01025366, + "balance_loss_clip": 1.01503086, + "balance_loss_mlp": 1.03383851, + "epoch": 0.7074101908913272, + "flos": 27673732563840.0, + "grad_norm": 1.6100965300159724, + "language_loss": 0.68550825, + "learning_rate": 8.326682605877324e-07, + "loss": 0.70674151, + "num_input_tokens_seen": 253867075, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.640625, + "step": 11766, + "time_per_iteration": 2.4833571910858154 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.0103238, + "balance_loss_clip": 1.02058399, + "balance_loss_mlp": 1.03390872, + "epoch": 0.7074703141439952, + "flos": 22238078296320.0, + "grad_norm": 1.8537677939151296, + "language_loss": 0.63282174, + "learning_rate": 8.323520421986352e-07, + "loss": 0.65415275, + "num_input_tokens_seen": 253885790, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 11767, + "time_per_iteration": 2.4963812828063965 + }, + { + "auxiliary_loss_clip": 0.01100427, + "auxiliary_loss_mlp": 0.01029022, + "balance_loss_clip": 1.0175842, + "balance_loss_mlp": 1.03403151, + "epoch": 0.7075304373966632, + "flos": 29643217234560.0, + "grad_norm": 1.4756633405104822, + "language_loss": 0.52592945, + "learning_rate": 8.320358680868646e-07, + "loss": 0.54722404, + "num_input_tokens_seen": 253907070, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11768, + "time_per_iteration": 2.5584144592285156 + }, + { + "auxiliary_loss_clip": 0.01098381, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.01779723, + "balance_loss_mlp": 1.03422117, + "epoch": 0.7075905606493311, + "flos": 19755214490880.0, + "grad_norm": 2.0331888903396296, + "language_loss": 0.75885397, + "learning_rate": 8.317197382644119e-07, + "loss": 0.78012145, + "num_input_tokens_seen": 253927290, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 11769, + "time_per_iteration": 2.474039077758789 + }, + { + "auxiliary_loss_clip": 0.01025061, + "auxiliary_loss_mlp": 0.01004429, + "balance_loss_clip": 1.00333822, + "balance_loss_mlp": 1.00454879, + "epoch": 0.7076506839019991, + "flos": 65716132694400.0, + "grad_norm": 0.8547700200374695, + "language_loss": 0.6197865, + "learning_rate": 8.314036527432637e-07, + "loss": 0.64008141, + "num_input_tokens_seen": 253983440, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20507812, + "step": 11770, + "time_per_iteration": 2.9852561950683594 + }, + { + "auxiliary_loss_clip": 0.01103159, + "auxiliary_loss_mlp": 0.01032983, + "balance_loss_clip": 1.02135992, + "balance_loss_mlp": 1.03515804, + "epoch": 0.707710807154667, + "flos": 23765286804480.0, + "grad_norm": 1.6682974029871904, + "language_loss": 0.76099932, + "learning_rate": 8.310876115354055e-07, + "loss": 0.78236079, + "num_input_tokens_seen": 254003825, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 11771, + "time_per_iteration": 2.4772582054138184 + }, + { + "auxiliary_loss_clip": 0.01096997, + "auxiliary_loss_mlp": 0.01025955, + "balance_loss_clip": 1.01532149, + "balance_loss_mlp": 1.03349578, + "epoch": 0.7077709304073351, + "flos": 21251360712960.0, + "grad_norm": 1.5504616161071019, + "language_loss": 0.71518672, + "learning_rate": 8.307716146528221e-07, + "loss": 0.73641628, + "num_input_tokens_seen": 254023345, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 11772, + "time_per_iteration": 2.443416118621826 + }, + { + "auxiliary_loss_clip": 0.011025, + "auxiliary_loss_mlp": 0.01030203, + "balance_loss_clip": 1.01823425, + "balance_loss_mlp": 1.03437555, + "epoch": 0.707831053660003, + "flos": 20740746925440.0, + "grad_norm": 2.392750359759926, + "language_loss": 0.69805288, + "learning_rate": 8.30455662107496e-07, + "loss": 0.7193799, + "num_input_tokens_seen": 254041815, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 11773, + "time_per_iteration": 2.4619219303131104 + }, + { + "auxiliary_loss_clip": 0.01101421, + "auxiliary_loss_mlp": 0.0103239, + "balance_loss_clip": 1.02130961, + "balance_loss_mlp": 1.03520298, + "epoch": 0.707891176912671, + "flos": 21980993679360.0, + "grad_norm": 1.496714779410967, + "language_loss": 0.70210946, + "learning_rate": 8.301397539114095e-07, + "loss": 0.72344756, + "num_input_tokens_seen": 254062065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 11774, + "time_per_iteration": 2.446824073791504 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01027842, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.03544569, + "epoch": 0.7079513001653389, + "flos": 21068970428160.0, + "grad_norm": 1.5148638748080412, + "language_loss": 0.74460763, + "learning_rate": 8.298238900765407e-07, + "loss": 0.76586962, + "num_input_tokens_seen": 254080605, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62890625, + "step": 11775, + "time_per_iteration": 2.482792854309082 + }, + { + "auxiliary_loss_clip": 0.01102892, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.01527333, + "balance_loss_mlp": 1.03621781, + "epoch": 0.7080114234180069, + "flos": 18040659621120.0, + "grad_norm": 1.8403672382430083, + "language_loss": 0.86566663, + "learning_rate": 8.295080706148665e-07, + "loss": 0.88696229, + "num_input_tokens_seen": 254098710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11776, + "time_per_iteration": 2.425718069076538 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01027548, + "balance_loss_clip": 1.01670027, + "balance_loss_mlp": 1.03438497, + "epoch": 0.7080715466706748, + "flos": 15122271409920.0, + "grad_norm": 1.5328522694355011, + "language_loss": 0.74733853, + "learning_rate": 8.291922955383641e-07, + "loss": 0.76860321, + "num_input_tokens_seen": 254117200, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 11777, + "time_per_iteration": 2.4531426429748535 + }, + { + "auxiliary_loss_clip": 0.01106707, + "auxiliary_loss_mlp": 0.01029139, + "balance_loss_clip": 1.01738548, + "balance_loss_mlp": 1.0374651, + "epoch": 0.7081316699233429, + "flos": 14422802889600.0, + "grad_norm": 2.558875929872249, + "language_loss": 0.82017881, + "learning_rate": 8.288765648590066e-07, + "loss": 0.84153724, + "num_input_tokens_seen": 254132115, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 11778, + "time_per_iteration": 2.4829678535461426 + }, + { + "auxiliary_loss_clip": 0.01097091, + "auxiliary_loss_mlp": 0.01028381, + "balance_loss_clip": 1.0185461, + "balance_loss_mlp": 1.03495932, + "epoch": 0.7081917931760108, + "flos": 23222389668480.0, + "grad_norm": 1.514152254548671, + "language_loss": 0.84892875, + "learning_rate": 8.285608785887673e-07, + "loss": 0.87018347, + "num_input_tokens_seen": 254152285, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62109375, + "step": 11779, + "time_per_iteration": 2.484011173248291 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01037082, + "balance_loss_clip": 1.02578115, + "balance_loss_mlp": 1.03680944, + "epoch": 0.7082519164286788, + "flos": 39308429871360.0, + "grad_norm": 2.0385221770512474, + "language_loss": 0.71657723, + "learning_rate": 8.28245236739618e-07, + "loss": 0.73797828, + "num_input_tokens_seen": 254172805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11780, + "time_per_iteration": 2.5964436531066895 + }, + { + "auxiliary_loss_clip": 0.0110027, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.01806879, + "balance_loss_mlp": 1.03559303, + "epoch": 0.7083120396813467, + "flos": 21651154064640.0, + "grad_norm": 1.4808752741388003, + "language_loss": 0.72866988, + "learning_rate": 8.279296393235256e-07, + "loss": 0.74996495, + "num_input_tokens_seen": 254191890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 11781, + "time_per_iteration": 2.583249807357788 + }, + { + "auxiliary_loss_clip": 0.01100497, + "auxiliary_loss_mlp": 0.01031477, + "balance_loss_clip": 1.02093256, + "balance_loss_mlp": 1.03541338, + "epoch": 0.7083721629340147, + "flos": 17567033863680.0, + "grad_norm": 1.5571808268796947, + "language_loss": 0.77223784, + "learning_rate": 8.276140863524585e-07, + "loss": 0.79355758, + "num_input_tokens_seen": 254210150, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11782, + "time_per_iteration": 2.4219703674316406 + }, + { + "auxiliary_loss_clip": 0.01098336, + "auxiliary_loss_mlp": 0.01025106, + "balance_loss_clip": 1.0149796, + "balance_loss_mlp": 1.03362107, + "epoch": 0.7084322861866827, + "flos": 29350509304320.0, + "grad_norm": 3.8090510781636273, + "language_loss": 0.69602305, + "learning_rate": 8.272985778383828e-07, + "loss": 0.71725744, + "num_input_tokens_seen": 254233015, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 11783, + "time_per_iteration": 2.532317638397217 + }, + { + "auxiliary_loss_clip": 0.01103454, + "auxiliary_loss_mlp": 0.01028711, + "balance_loss_clip": 1.01744008, + "balance_loss_mlp": 1.03593731, + "epoch": 0.7084924094393507, + "flos": 20194294343040.0, + "grad_norm": 1.6689397610891612, + "language_loss": 0.79052562, + "learning_rate": 8.269831137932632e-07, + "loss": 0.81184721, + "num_input_tokens_seen": 254251345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 11784, + "time_per_iteration": 2.443634271621704 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.01027476, + "balance_loss_clip": 1.01634157, + "balance_loss_mlp": 1.03534245, + "epoch": 0.7085525326920187, + "flos": 23477211728640.0, + "grad_norm": 2.217987534439464, + "language_loss": 0.77291393, + "learning_rate": 8.266676942290609e-07, + "loss": 0.79419351, + "num_input_tokens_seen": 254269905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11785, + "time_per_iteration": 2.4818367958068848 + }, + { + "auxiliary_loss_clip": 0.01100759, + "auxiliary_loss_mlp": 0.01029862, + "balance_loss_clip": 1.01869774, + "balance_loss_mlp": 1.03610969, + "epoch": 0.7086126559446866, + "flos": 25958818558080.0, + "grad_norm": 1.6474825992078, + "language_loss": 0.77668089, + "learning_rate": 8.26352319157738e-07, + "loss": 0.7979871, + "num_input_tokens_seen": 254289990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 11786, + "time_per_iteration": 2.4843997955322266 + }, + { + "auxiliary_loss_clip": 0.01103028, + "auxiliary_loss_mlp": 0.01025097, + "balance_loss_clip": 1.01389718, + "balance_loss_mlp": 1.03586793, + "epoch": 0.7086727791973546, + "flos": 26724793109760.0, + "grad_norm": 2.2462918540494865, + "language_loss": 0.78872836, + "learning_rate": 8.260369885912526e-07, + "loss": 0.8100096, + "num_input_tokens_seen": 254309085, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11787, + "time_per_iteration": 2.5082507133483887 + }, + { + "auxiliary_loss_clip": 0.01100945, + "auxiliary_loss_mlp": 0.01027499, + "balance_loss_clip": 1.01635325, + "balance_loss_mlp": 1.03544235, + "epoch": 0.7087329024500225, + "flos": 21683365585920.0, + "grad_norm": 1.6974940078994716, + "language_loss": 0.76277357, + "learning_rate": 8.257217025415615e-07, + "loss": 0.78405803, + "num_input_tokens_seen": 254327045, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 11788, + "time_per_iteration": 2.4395945072174072 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.01028353, + "balance_loss_clip": 1.01596761, + "balance_loss_mlp": 1.03661728, + "epoch": 0.7087930257026905, + "flos": 17931060247680.0, + "grad_norm": 2.1698748278708644, + "language_loss": 0.67896038, + "learning_rate": 8.254064610206212e-07, + "loss": 0.70029634, + "num_input_tokens_seen": 254344585, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 11789, + "time_per_iteration": 2.4851551055908203 + }, + { + "auxiliary_loss_clip": 0.01102295, + "auxiliary_loss_mlp": 0.01027538, + "balance_loss_clip": 1.01540208, + "balance_loss_mlp": 1.0347805, + "epoch": 0.7088531489553584, + "flos": 18911528864640.0, + "grad_norm": 1.6812027162903995, + "language_loss": 0.77360779, + "learning_rate": 8.250912640403858e-07, + "loss": 0.79490614, + "num_input_tokens_seen": 254362470, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11790, + "time_per_iteration": 2.42874813079834 + }, + { + "auxiliary_loss_clip": 0.01105386, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01652074, + "balance_loss_mlp": 1.03555274, + "epoch": 0.7089132722080265, + "flos": 27380880979200.0, + "grad_norm": 2.1572989383917864, + "language_loss": 0.70921314, + "learning_rate": 8.247761116128085e-07, + "loss": 0.73055279, + "num_input_tokens_seen": 254383190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.69921875, + "step": 11791, + "time_per_iteration": 2.5331575870513916 + }, + { + "auxiliary_loss_clip": 0.01101819, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.01735473, + "balance_loss_mlp": 1.03576159, + "epoch": 0.7089733954606944, + "flos": 22162917087360.0, + "grad_norm": 2.1052262710476968, + "language_loss": 0.81886566, + "learning_rate": 8.244610037498376e-07, + "loss": 0.84017277, + "num_input_tokens_seen": 254403115, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11792, + "time_per_iteration": 2.4568569660186768 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.01027492, + "balance_loss_clip": 1.01563632, + "balance_loss_mlp": 1.03356898, + "epoch": 0.7090335187133624, + "flos": 24425827960320.0, + "grad_norm": 1.890918416074432, + "language_loss": 0.64758253, + "learning_rate": 8.241459404634232e-07, + "loss": 0.66888559, + "num_input_tokens_seen": 254421875, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69140625, + "step": 11793, + "time_per_iteration": 2.508790969848633 + }, + { + "auxiliary_loss_clip": 0.011012, + "auxiliary_loss_mlp": 0.01027317, + "balance_loss_clip": 1.01664209, + "balance_loss_mlp": 1.03602946, + "epoch": 0.7090936419660303, + "flos": 21835232288640.0, + "grad_norm": 2.7723759797175505, + "language_loss": 0.70710409, + "learning_rate": 8.238309217655133e-07, + "loss": 0.7283892, + "num_input_tokens_seen": 254440765, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11794, + "time_per_iteration": 2.4677059650421143 + }, + { + "auxiliary_loss_clip": 0.01102435, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01953435, + "balance_loss_mlp": 1.03833604, + "epoch": 0.7091537652186983, + "flos": 20082360585600.0, + "grad_norm": 1.7023757586214014, + "language_loss": 0.75844228, + "learning_rate": 8.23515947668052e-07, + "loss": 0.77976608, + "num_input_tokens_seen": 254459480, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 11795, + "time_per_iteration": 2.451152801513672 + }, + { + "auxiliary_loss_clip": 0.01100363, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01907492, + "balance_loss_mlp": 1.03473902, + "epoch": 0.7092138884713663, + "flos": 13151565676800.0, + "grad_norm": 2.342459713927466, + "language_loss": 0.74982113, + "learning_rate": 8.232010181829838e-07, + "loss": 0.77112198, + "num_input_tokens_seen": 254473985, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11796, + "time_per_iteration": 2.402853012084961 + }, + { + "auxiliary_loss_clip": 0.01106679, + "auxiliary_loss_mlp": 0.01031501, + "balance_loss_clip": 1.0188055, + "balance_loss_mlp": 1.03671682, + "epoch": 0.7092740117240343, + "flos": 21645982506240.0, + "grad_norm": 1.6427166102656843, + "language_loss": 0.74295354, + "learning_rate": 8.228861333222523e-07, + "loss": 0.76433539, + "num_input_tokens_seen": 254492135, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 11797, + "time_per_iteration": 2.4772911071777344 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01778328, + "balance_loss_mlp": 1.03599036, + "epoch": 0.7093341349767023, + "flos": 21032521102080.0, + "grad_norm": 1.5744211833149133, + "language_loss": 0.79336572, + "learning_rate": 8.225712930977953e-07, + "loss": 0.81467617, + "num_input_tokens_seen": 254512865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11798, + "time_per_iteration": 2.470794677734375 + }, + { + "auxiliary_loss_clip": 0.0110133, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.02051234, + "balance_loss_mlp": 1.03513288, + "epoch": 0.7093942582293702, + "flos": 22017658487040.0, + "grad_norm": 1.8971965021381223, + "language_loss": 0.66774857, + "learning_rate": 8.222564975215529e-07, + "loss": 0.68908381, + "num_input_tokens_seen": 254532605, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11799, + "time_per_iteration": 2.4620981216430664 + }, + { + "auxiliary_loss_clip": 0.01102381, + "auxiliary_loss_mlp": 0.01026893, + "balance_loss_clip": 1.01489425, + "balance_loss_mlp": 1.03516233, + "epoch": 0.7094543814820382, + "flos": 27235586465280.0, + "grad_norm": 1.8304913592304672, + "language_loss": 0.81343234, + "learning_rate": 8.219417466054622e-07, + "loss": 0.83472508, + "num_input_tokens_seen": 254553780, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11800, + "time_per_iteration": 2.5046193599700928 + }, + { + "auxiliary_loss_clip": 0.01097772, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01481259, + "balance_loss_mlp": 1.03336954, + "epoch": 0.7095145047347061, + "flos": 12089148180480.0, + "grad_norm": 1.8277069049900614, + "language_loss": 0.8660984, + "learning_rate": 8.21627040361459e-07, + "loss": 0.88733006, + "num_input_tokens_seen": 254567510, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.64453125, + "step": 11801, + "time_per_iteration": 2.4158272743225098 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01031161, + "balance_loss_clip": 1.01996124, + "balance_loss_mlp": 1.03366089, + "epoch": 0.7095746279873741, + "flos": 19383789905280.0, + "grad_norm": 1.7026819201034897, + "language_loss": 0.76157814, + "learning_rate": 8.213123788014758e-07, + "loss": 0.78289014, + "num_input_tokens_seen": 254585565, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 11802, + "time_per_iteration": 2.4612386226654053 + }, + { + "auxiliary_loss_clip": 0.01102987, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.02519536, + "balance_loss_mlp": 1.03526998, + "epoch": 0.709634751240042, + "flos": 21360600950400.0, + "grad_norm": 3.23871820936019, + "language_loss": 0.81726915, + "learning_rate": 8.209977619374462e-07, + "loss": 0.83866572, + "num_input_tokens_seen": 254603465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 11803, + "time_per_iteration": 3.975581407546997 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01027237, + "balance_loss_clip": 1.01518524, + "balance_loss_mlp": 1.03458929, + "epoch": 0.7096948744927101, + "flos": 13917037438080.0, + "grad_norm": 2.0140842961231047, + "language_loss": 0.67451382, + "learning_rate": 8.206831897812995e-07, + "loss": 0.69580579, + "num_input_tokens_seen": 254620500, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 11804, + "time_per_iteration": 2.4457967281341553 + }, + { + "auxiliary_loss_clip": 0.01096545, + "auxiliary_loss_mlp": 0.01024221, + "balance_loss_clip": 1.01398039, + "balance_loss_mlp": 1.03440809, + "epoch": 0.709754997745378, + "flos": 30298335436800.0, + "grad_norm": 1.740193690303794, + "language_loss": 0.78362393, + "learning_rate": 8.203686623449637e-07, + "loss": 0.80483156, + "num_input_tokens_seen": 254638565, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 11805, + "time_per_iteration": 3.905280590057373 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01028601, + "balance_loss_clip": 1.01693034, + "balance_loss_mlp": 1.03327656, + "epoch": 0.709815120998046, + "flos": 18515147304960.0, + "grad_norm": 3.0979433949045125, + "language_loss": 0.78634393, + "learning_rate": 8.200541796403667e-07, + "loss": 0.8076216, + "num_input_tokens_seen": 254657505, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 11806, + "time_per_iteration": 5.279039144515991 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01031855, + "balance_loss_clip": 1.02109098, + "balance_loss_mlp": 1.03536928, + "epoch": 0.7098752442507139, + "flos": 22272588288000.0, + "grad_norm": 2.519109679125039, + "language_loss": 0.56458282, + "learning_rate": 8.197397416794332e-07, + "loss": 0.58590662, + "num_input_tokens_seen": 254674730, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 11807, + "time_per_iteration": 2.4814159870147705 + }, + { + "auxiliary_loss_clip": 0.01104048, + "auxiliary_loss_mlp": 0.01038917, + "balance_loss_clip": 1.02743721, + "balance_loss_mlp": 1.03456068, + "epoch": 0.7099353675033819, + "flos": 19275447507840.0, + "grad_norm": 2.0844100679096407, + "language_loss": 0.68413723, + "learning_rate": 8.194253484740882e-07, + "loss": 0.70556688, + "num_input_tokens_seen": 254691665, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6953125, + "step": 11808, + "time_per_iteration": 2.425276279449463 + }, + { + "auxiliary_loss_clip": 0.01102073, + "auxiliary_loss_mlp": 0.01029901, + "balance_loss_clip": 1.01882625, + "balance_loss_mlp": 1.03456879, + "epoch": 0.70999549075605, + "flos": 21908525990400.0, + "grad_norm": 1.9066636961835672, + "language_loss": 0.71175826, + "learning_rate": 8.191110000362513e-07, + "loss": 0.733078, + "num_input_tokens_seen": 254711610, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.67578125, + "step": 11809, + "time_per_iteration": 2.4811971187591553 + }, + { + "auxiliary_loss_clip": 0.01025844, + "auxiliary_loss_mlp": 0.00998682, + "balance_loss_clip": 0.99747771, + "balance_loss_mlp": 1.00508428, + "epoch": 0.7100556140087179, + "flos": 70456053456000.0, + "grad_norm": 0.7498079844660932, + "language_loss": 0.59492218, + "learning_rate": 8.187966963778435e-07, + "loss": 0.61516744, + "num_input_tokens_seen": 254772615, + "router_z_loss_clip": 0.01202393, + "router_z_loss_mlp": 0.20703125, + "step": 11810, + "time_per_iteration": 3.1407463550567627 + }, + { + "auxiliary_loss_clip": 0.01102477, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.02154016, + "balance_loss_mlp": 1.03702438, + "epoch": 0.7101157372613859, + "flos": 23039568420480.0, + "grad_norm": 1.5762923305466447, + "language_loss": 0.73988348, + "learning_rate": 8.18482437510784e-07, + "loss": 0.76122749, + "num_input_tokens_seen": 254791375, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 11811, + "time_per_iteration": 2.4921576976776123 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.01024065, + "balance_loss_clip": 1.0132947, + "balance_loss_mlp": 1.03462029, + "epoch": 0.7101758605140538, + "flos": 23185329811200.0, + "grad_norm": 1.6755141879364293, + "language_loss": 0.83260751, + "learning_rate": 8.181682234469882e-07, + "loss": 0.85382551, + "num_input_tokens_seen": 254809300, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 11812, + "time_per_iteration": 2.4486024379730225 + }, + { + "auxiliary_loss_clip": 0.0110213, + "auxiliary_loss_mlp": 0.01025057, + "balance_loss_clip": 1.01317763, + "balance_loss_mlp": 1.0353713, + "epoch": 0.7102359837667218, + "flos": 23696123166720.0, + "grad_norm": 1.6424398905568702, + "language_loss": 0.69810915, + "learning_rate": 8.178540541983716e-07, + "loss": 0.71938103, + "num_input_tokens_seen": 254829325, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 11813, + "time_per_iteration": 2.4982481002807617 + }, + { + "auxiliary_loss_clip": 0.01096572, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01479709, + "balance_loss_mlp": 1.03272831, + "epoch": 0.7102961070193897, + "flos": 19391116279680.0, + "grad_norm": 1.8324166675871492, + "language_loss": 0.81685358, + "learning_rate": 8.175399297768495e-07, + "loss": 0.83807397, + "num_input_tokens_seen": 254847690, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 11814, + "time_per_iteration": 2.4432296752929688 + }, + { + "auxiliary_loss_clip": 0.01100828, + "auxiliary_loss_mlp": 0.01026562, + "balance_loss_clip": 1.01498675, + "balance_loss_mlp": 1.03533602, + "epoch": 0.7103562302720577, + "flos": 21507511576320.0, + "grad_norm": 2.0936967568296594, + "language_loss": 0.75861955, + "learning_rate": 8.172258501943301e-07, + "loss": 0.77989352, + "num_input_tokens_seen": 254865960, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 11815, + "time_per_iteration": 2.49507474899292 + }, + { + "auxiliary_loss_clip": 0.01098556, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.0192579, + "balance_loss_mlp": 1.03366482, + "epoch": 0.7104163535247257, + "flos": 14535059869440.0, + "grad_norm": 1.6038639171669453, + "language_loss": 0.78608739, + "learning_rate": 8.16911815462725e-07, + "loss": 0.80737698, + "num_input_tokens_seen": 254882815, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 11816, + "time_per_iteration": 2.415172815322876 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01035584, + "balance_loss_clip": 1.02450991, + "balance_loss_mlp": 1.03593814, + "epoch": 0.7104764767773937, + "flos": 11400310085760.0, + "grad_norm": 1.8614231241085628, + "language_loss": 0.8662678, + "learning_rate": 8.165978255939426e-07, + "loss": 0.88764292, + "num_input_tokens_seen": 254898705, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 11817, + "time_per_iteration": 2.4507339000701904 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.01028346, + "balance_loss_clip": 1.01768219, + "balance_loss_mlp": 1.03457141, + "epoch": 0.7105366000300616, + "flos": 11690432236800.0, + "grad_norm": 3.9427784620989437, + "language_loss": 0.84360695, + "learning_rate": 8.162838805998897e-07, + "loss": 0.86488771, + "num_input_tokens_seen": 254913665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 11818, + "time_per_iteration": 2.451037883758545 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.01027387, + "balance_loss_clip": 1.01552582, + "balance_loss_mlp": 1.03239679, + "epoch": 0.7105967232827296, + "flos": 19354020508800.0, + "grad_norm": 2.103555241678178, + "language_loss": 0.75971746, + "learning_rate": 8.159699804924709e-07, + "loss": 0.78098345, + "num_input_tokens_seen": 254932140, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 11819, + "time_per_iteration": 2.4669997692108154 + }, + { + "auxiliary_loss_clip": 0.01101813, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.01483393, + "balance_loss_mlp": 1.03531337, + "epoch": 0.7106568465353975, + "flos": 22930400010240.0, + "grad_norm": 1.7430720805927078, + "language_loss": 0.70564902, + "learning_rate": 8.156561252835883e-07, + "loss": 0.7269485, + "num_input_tokens_seen": 254951580, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6640625, + "step": 11820, + "time_per_iteration": 2.454805612564087 + }, + { + "auxiliary_loss_clip": 0.01100228, + "auxiliary_loss_mlp": 0.01026521, + "balance_loss_clip": 1.01536262, + "balance_loss_mlp": 1.03519297, + "epoch": 0.7107169697880655, + "flos": 19099665325440.0, + "grad_norm": 1.9533750259905485, + "language_loss": 0.75224185, + "learning_rate": 8.153423149851449e-07, + "loss": 0.77350932, + "num_input_tokens_seen": 254969425, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 11821, + "time_per_iteration": 2.4534716606140137 + }, + { + "auxiliary_loss_clip": 0.01025147, + "auxiliary_loss_mlp": 0.01000031, + "balance_loss_clip": 0.99898189, + "balance_loss_mlp": 1.00464201, + "epoch": 0.7107770930407336, + "flos": 63638054231040.0, + "grad_norm": 0.7907699295335275, + "language_loss": 0.55060166, + "learning_rate": 8.150285496090388e-07, + "loss": 0.57085341, + "num_input_tokens_seen": 255032680, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20507812, + "step": 11822, + "time_per_iteration": 3.0831096172332764 + }, + { + "auxiliary_loss_clip": 0.01095485, + "auxiliary_loss_mlp": 0.01025121, + "balance_loss_clip": 1.01383758, + "balance_loss_mlp": 1.03307807, + "epoch": 0.7108372162934015, + "flos": 22054466949120.0, + "grad_norm": 1.9650661666731581, + "language_loss": 0.60139519, + "learning_rate": 8.147148291671688e-07, + "loss": 0.62260121, + "num_input_tokens_seen": 255054400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.625, + "step": 11823, + "time_per_iteration": 2.5066399574279785 + }, + { + "auxiliary_loss_clip": 0.01100805, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01848853, + "balance_loss_mlp": 1.03523636, + "epoch": 0.7108973395460695, + "flos": 19135144984320.0, + "grad_norm": 2.216168272824083, + "language_loss": 0.71333873, + "learning_rate": 8.144011536714322e-07, + "loss": 0.73463774, + "num_input_tokens_seen": 255072785, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 11824, + "time_per_iteration": 2.4382858276367188 + }, + { + "auxiliary_loss_clip": 0.01095465, + "auxiliary_loss_mlp": 0.01028765, + "balance_loss_clip": 1.01859653, + "balance_loss_mlp": 1.03347003, + "epoch": 0.7109574627987374, + "flos": 17894431353600.0, + "grad_norm": 1.655325791752312, + "language_loss": 0.7270785, + "learning_rate": 8.140875231337223e-07, + "loss": 0.74832082, + "num_input_tokens_seen": 255091820, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.62109375, + "step": 11825, + "time_per_iteration": 2.46207332611084 + }, + { + "auxiliary_loss_clip": 0.01102509, + "auxiliary_loss_mlp": 0.01030137, + "balance_loss_clip": 1.01849043, + "balance_loss_mlp": 1.0350585, + "epoch": 0.7110175860514054, + "flos": 28979623422720.0, + "grad_norm": 1.7037190958225141, + "language_loss": 0.79228491, + "learning_rate": 8.137739375659321e-07, + "loss": 0.81361139, + "num_input_tokens_seen": 255111720, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11826, + "time_per_iteration": 2.4977200031280518 + }, + { + "auxiliary_loss_clip": 0.01097466, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.02055109, + "balance_loss_mlp": 1.03329957, + "epoch": 0.7110777093040733, + "flos": 26173312623360.0, + "grad_norm": 1.8095370005527254, + "language_loss": 0.83191311, + "learning_rate": 8.134603969799527e-07, + "loss": 0.8531996, + "num_input_tokens_seen": 255133495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11827, + "time_per_iteration": 2.5329458713531494 + }, + { + "auxiliary_loss_clip": 0.01100333, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01757264, + "balance_loss_mlp": 1.03426528, + "epoch": 0.7111378325567413, + "flos": 26869943969280.0, + "grad_norm": 27.265917209893804, + "language_loss": 0.62289751, + "learning_rate": 8.131469013876748e-07, + "loss": 0.64419734, + "num_input_tokens_seen": 255156880, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11828, + "time_per_iteration": 2.601370096206665 + }, + { + "auxiliary_loss_clip": 0.01099233, + "auxiliary_loss_mlp": 0.01031042, + "balance_loss_clip": 1.01993763, + "balance_loss_mlp": 1.03395164, + "epoch": 0.7111979558094093, + "flos": 27271820309760.0, + "grad_norm": 1.4399488675180274, + "language_loss": 0.72070241, + "learning_rate": 8.128334508009846e-07, + "loss": 0.74200517, + "num_input_tokens_seen": 255178920, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 11829, + "time_per_iteration": 2.534470796585083 + }, + { + "auxiliary_loss_clip": 0.01098293, + "auxiliary_loss_mlp": 0.01030259, + "balance_loss_clip": 1.01942253, + "balance_loss_mlp": 1.0337075, + "epoch": 0.7112580790620773, + "flos": 25046938961280.0, + "grad_norm": 1.7046572375419429, + "language_loss": 0.80539268, + "learning_rate": 8.125200452317697e-07, + "loss": 0.82667816, + "num_input_tokens_seen": 255198095, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11830, + "time_per_iteration": 2.4941787719726562 + }, + { + "auxiliary_loss_clip": 0.01100972, + "auxiliary_loss_mlp": 0.01032941, + "balance_loss_clip": 1.02192593, + "balance_loss_mlp": 1.03516912, + "epoch": 0.7113182023147452, + "flos": 21646628951040.0, + "grad_norm": 1.6897013308211777, + "language_loss": 0.84117299, + "learning_rate": 8.122066846919138e-07, + "loss": 0.86251217, + "num_input_tokens_seen": 255215860, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 11831, + "time_per_iteration": 2.4908971786499023 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.01750183, + "balance_loss_mlp": 1.03430891, + "epoch": 0.7113783255674132, + "flos": 20996287257600.0, + "grad_norm": 2.068922809184691, + "language_loss": 0.76956964, + "learning_rate": 8.118933691932985e-07, + "loss": 0.79086405, + "num_input_tokens_seen": 255235425, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 11832, + "time_per_iteration": 2.4407291412353516 + }, + { + "auxiliary_loss_clip": 0.01024653, + "auxiliary_loss_mlp": 0.0100495, + "balance_loss_clip": 1.00390673, + "balance_loss_mlp": 1.00420582, + "epoch": 0.7114384488200811, + "flos": 66771080161920.0, + "grad_norm": 0.7451484693360029, + "language_loss": 0.56659162, + "learning_rate": 8.115800987478059e-07, + "loss": 0.58688766, + "num_input_tokens_seen": 255291680, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 11833, + "time_per_iteration": 2.9816091060638428 + }, + { + "auxiliary_loss_clip": 0.01097454, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.02226698, + "balance_loss_mlp": 1.0331707, + "epoch": 0.7114985720727491, + "flos": 25010058672000.0, + "grad_norm": 1.6073221071178434, + "language_loss": 0.70877647, + "learning_rate": 8.11266873367315e-07, + "loss": 0.7300818, + "num_input_tokens_seen": 255313880, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11834, + "time_per_iteration": 2.478980541229248 + }, + { + "auxiliary_loss_clip": 0.01103011, + "auxiliary_loss_mlp": 0.0102967, + "balance_loss_clip": 1.0181601, + "balance_loss_mlp": 1.03596425, + "epoch": 0.7115586953254172, + "flos": 21470128496640.0, + "grad_norm": 1.9914740179798254, + "language_loss": 0.79722375, + "learning_rate": 8.10953693063704e-07, + "loss": 0.81855053, + "num_input_tokens_seen": 255332390, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11835, + "time_per_iteration": 2.479388952255249 + }, + { + "auxiliary_loss_clip": 0.01096967, + "auxiliary_loss_mlp": 0.01025588, + "balance_loss_clip": 1.0149014, + "balance_loss_mlp": 1.03320408, + "epoch": 0.7116188185780851, + "flos": 28622600190720.0, + "grad_norm": 1.6571407536951757, + "language_loss": 0.7602039, + "learning_rate": 8.10640557848848e-07, + "loss": 0.78142941, + "num_input_tokens_seen": 255354025, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 11836, + "time_per_iteration": 2.4998624324798584 + }, + { + "auxiliary_loss_clip": 0.01098563, + "auxiliary_loss_mlp": 0.01030382, + "balance_loss_clip": 1.01905715, + "balance_loss_mlp": 1.03251767, + "epoch": 0.7116789418307531, + "flos": 25293608634240.0, + "grad_norm": 1.7551754985161803, + "language_loss": 0.70438159, + "learning_rate": 8.103274677346208e-07, + "loss": 0.72567105, + "num_input_tokens_seen": 255371400, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11837, + "time_per_iteration": 2.4985547065734863 + }, + { + "auxiliary_loss_clip": 0.01103208, + "auxiliary_loss_mlp": 0.01032547, + "balance_loss_clip": 1.02022099, + "balance_loss_mlp": 1.03518689, + "epoch": 0.711739065083421, + "flos": 25557301353600.0, + "grad_norm": 1.8053810542915782, + "language_loss": 0.61668026, + "learning_rate": 8.100144227328958e-07, + "loss": 0.63803786, + "num_input_tokens_seen": 255390710, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 11838, + "time_per_iteration": 2.4703662395477295 + }, + { + "auxiliary_loss_clip": 0.01101169, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01982808, + "balance_loss_mlp": 1.03559279, + "epoch": 0.711799188336089, + "flos": 26140993361280.0, + "grad_norm": 2.6637536928847556, + "language_loss": 0.67472559, + "learning_rate": 8.097014228555426e-07, + "loss": 0.69604766, + "num_input_tokens_seen": 255408790, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 11839, + "time_per_iteration": 2.535466194152832 + }, + { + "auxiliary_loss_clip": 0.01101981, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01871097, + "balance_loss_mlp": 1.03578651, + "epoch": 0.7118593115887569, + "flos": 21140648017920.0, + "grad_norm": 1.8263370197913231, + "language_loss": 0.84035689, + "learning_rate": 8.093884681144305e-07, + "loss": 0.86167389, + "num_input_tokens_seen": 255426280, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 11840, + "time_per_iteration": 2.4370462894439697 + }, + { + "auxiliary_loss_clip": 0.01104281, + "auxiliary_loss_mlp": 0.01028702, + "balance_loss_clip": 1.01743066, + "balance_loss_mlp": 1.03657627, + "epoch": 0.711919434841425, + "flos": 14975684006400.0, + "grad_norm": 2.0089671894900243, + "language_loss": 0.76980072, + "learning_rate": 8.090755585214277e-07, + "loss": 0.79113054, + "num_input_tokens_seen": 255442935, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 11841, + "time_per_iteration": 2.4408881664276123 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.01027846, + "balance_loss_clip": 1.01616335, + "balance_loss_mlp": 1.03546906, + "epoch": 0.7119795580940929, + "flos": 16508997826560.0, + "grad_norm": 2.1117001145117595, + "language_loss": 0.74941587, + "learning_rate": 8.087626940883994e-07, + "loss": 0.77071977, + "num_input_tokens_seen": 255460925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11842, + "time_per_iteration": 2.4360697269439697 + }, + { + "auxiliary_loss_clip": 0.01024411, + "auxiliary_loss_mlp": 0.01002483, + "balance_loss_clip": 1.00127351, + "balance_loss_mlp": 1.00402236, + "epoch": 0.7120396813467609, + "flos": 66570736055040.0, + "grad_norm": 0.784591330387751, + "language_loss": 0.61587965, + "learning_rate": 8.084498748272082e-07, + "loss": 0.63614863, + "num_input_tokens_seen": 255521360, + "router_z_loss_clip": 0.01208496, + "router_z_loss_mlp": 0.20410156, + "step": 11843, + "time_per_iteration": 3.0296053886413574 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01027745, + "balance_loss_clip": 1.01671815, + "balance_loss_mlp": 1.03480315, + "epoch": 0.7120998045994288, + "flos": 26432731624320.0, + "grad_norm": 1.734640870802516, + "language_loss": 0.80089492, + "learning_rate": 8.081371007497171e-07, + "loss": 0.82216763, + "num_input_tokens_seen": 255541435, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 11844, + "time_per_iteration": 3.887108325958252 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01030686, + "balance_loss_clip": 1.01913476, + "balance_loss_mlp": 1.03288889, + "epoch": 0.7121599278520968, + "flos": 16427982700800.0, + "grad_norm": 2.1905334361731326, + "language_loss": 0.78714418, + "learning_rate": 8.078243718677873e-07, + "loss": 0.80843902, + "num_input_tokens_seen": 255558505, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11845, + "time_per_iteration": 2.410975456237793 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01031851, + "balance_loss_clip": 1.02026939, + "balance_loss_mlp": 1.03620291, + "epoch": 0.7122200511047647, + "flos": 28949889939840.0, + "grad_norm": 1.893878343442594, + "language_loss": 0.76888061, + "learning_rate": 8.075116881932762e-07, + "loss": 0.79019481, + "num_input_tokens_seen": 255577815, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 11846, + "time_per_iteration": 3.931493043899536 + }, + { + "auxiliary_loss_clip": 0.01102634, + "auxiliary_loss_mlp": 0.01028937, + "balance_loss_clip": 1.0170877, + "balance_loss_mlp": 1.03620863, + "epoch": 0.7122801743574327, + "flos": 16471866142080.0, + "grad_norm": 1.9372499520787854, + "language_loss": 0.58303821, + "learning_rate": 8.071990497380421e-07, + "loss": 0.6043539, + "num_input_tokens_seen": 255595885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11847, + "time_per_iteration": 3.8361809253692627 + }, + { + "auxiliary_loss_clip": 0.01097288, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01974046, + "balance_loss_mlp": 1.03439856, + "epoch": 0.7123402976101008, + "flos": 20631039811200.0, + "grad_norm": 1.4312853577961298, + "language_loss": 0.71475565, + "learning_rate": 8.068864565139395e-07, + "loss": 0.7360397, + "num_input_tokens_seen": 255616750, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.62890625, + "step": 11848, + "time_per_iteration": 3.985182523727417 + }, + { + "auxiliary_loss_clip": 0.01025097, + "auxiliary_loss_mlp": 0.01002394, + "balance_loss_clip": 1.00141037, + "balance_loss_mlp": 1.00462532, + "epoch": 0.7124004208627687, + "flos": 62325734837760.0, + "grad_norm": 0.8575731984951991, + "language_loss": 0.63123107, + "learning_rate": 8.065739085328211e-07, + "loss": 0.65150595, + "num_input_tokens_seen": 255677900, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 11849, + "time_per_iteration": 3.0350046157836914 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01031998, + "balance_loss_clip": 1.02040517, + "balance_loss_mlp": 1.03554058, + "epoch": 0.7124605441154367, + "flos": 39675975788160.0, + "grad_norm": 1.4965357236983527, + "language_loss": 0.63742816, + "learning_rate": 8.0626140580654e-07, + "loss": 0.65877146, + "num_input_tokens_seen": 255699140, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 11850, + "time_per_iteration": 2.6502671241760254 + }, + { + "auxiliary_loss_clip": 0.01101252, + "auxiliary_loss_mlp": 0.0103008, + "balance_loss_clip": 1.0185765, + "balance_loss_mlp": 1.03538823, + "epoch": 0.7125206673681046, + "flos": 28181868312960.0, + "grad_norm": 1.4672764564322482, + "language_loss": 0.69679284, + "learning_rate": 8.05948948346946e-07, + "loss": 0.71810615, + "num_input_tokens_seen": 255719640, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 11851, + "time_per_iteration": 2.495501756668091 + }, + { + "auxiliary_loss_clip": 0.01100247, + "auxiliary_loss_mlp": 0.01031406, + "balance_loss_clip": 1.02083778, + "balance_loss_mlp": 1.03549206, + "epoch": 0.7125807906207726, + "flos": 26176939896960.0, + "grad_norm": 1.4895655159302474, + "language_loss": 0.83113164, + "learning_rate": 8.056365361658882e-07, + "loss": 0.85244817, + "num_input_tokens_seen": 255740450, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 11852, + "time_per_iteration": 2.510340929031372 + }, + { + "auxiliary_loss_clip": 0.01103516, + "auxiliary_loss_mlp": 0.0103163, + "balance_loss_clip": 1.0193572, + "balance_loss_mlp": 1.03595805, + "epoch": 0.7126409138734405, + "flos": 17157328358400.0, + "grad_norm": 2.258616053920704, + "language_loss": 0.73188543, + "learning_rate": 8.053241692752126e-07, + "loss": 0.75323689, + "num_input_tokens_seen": 255758070, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 11853, + "time_per_iteration": 2.4003355503082275 + }, + { + "auxiliary_loss_clip": 0.01096006, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.0173521, + "balance_loss_mlp": 1.03375578, + "epoch": 0.7127010371261085, + "flos": 18769933451520.0, + "grad_norm": 1.9420602082674068, + "language_loss": 0.92091542, + "learning_rate": 8.050118476867635e-07, + "loss": 0.94214988, + "num_input_tokens_seen": 255775685, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 11854, + "time_per_iteration": 2.4623403549194336 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.01030318, + "balance_loss_clip": 1.01910615, + "balance_loss_mlp": 1.0353142, + "epoch": 0.7127611603787765, + "flos": 20376433232640.0, + "grad_norm": 2.0934387752470403, + "language_loss": 0.79594553, + "learning_rate": 8.046995714123856e-07, + "loss": 0.81724572, + "num_input_tokens_seen": 255794750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11855, + "time_per_iteration": 2.442281484603882 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01033118, + "balance_loss_clip": 1.02083373, + "balance_loss_mlp": 1.0347116, + "epoch": 0.7128212836314445, + "flos": 20449008662400.0, + "grad_norm": 1.6650252891937876, + "language_loss": 0.72577047, + "learning_rate": 8.043873404639192e-07, + "loss": 0.74711072, + "num_input_tokens_seen": 255813325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66015625, + "step": 11856, + "time_per_iteration": 2.47229266166687 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01880825, + "balance_loss_mlp": 1.03564286, + "epoch": 0.7128814068841124, + "flos": 23440834229760.0, + "grad_norm": 1.6411446267606922, + "language_loss": 0.70082289, + "learning_rate": 8.040751548532046e-07, + "loss": 0.72215885, + "num_input_tokens_seen": 255832470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 11857, + "time_per_iteration": 2.4524147510528564 + }, + { + "auxiliary_loss_clip": 0.01098237, + "auxiliary_loss_mlp": 0.01029295, + "balance_loss_clip": 1.01744556, + "balance_loss_mlp": 1.03391576, + "epoch": 0.7129415301367804, + "flos": 18222942165120.0, + "grad_norm": 2.116428788246258, + "language_loss": 0.85496008, + "learning_rate": 8.03763014592081e-07, + "loss": 0.87623537, + "num_input_tokens_seen": 255849740, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 11858, + "time_per_iteration": 2.4527347087860107 + }, + { + "auxiliary_loss_clip": 0.01104991, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01795506, + "balance_loss_mlp": 1.03623009, + "epoch": 0.7130016533894483, + "flos": 15523896355200.0, + "grad_norm": 1.608889007430339, + "language_loss": 0.80293894, + "learning_rate": 8.034509196923829e-07, + "loss": 0.82428539, + "num_input_tokens_seen": 255866975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 11859, + "time_per_iteration": 2.4199166297912598 + }, + { + "auxiliary_loss_clip": 0.010985, + "auxiliary_loss_mlp": 0.01031707, + "balance_loss_clip": 1.02047169, + "balance_loss_mlp": 1.03418899, + "epoch": 0.7130617766421163, + "flos": 57115668960000.0, + "grad_norm": 1.1635938409015476, + "language_loss": 0.68921995, + "learning_rate": 8.031388701659456e-07, + "loss": 0.710522, + "num_input_tokens_seen": 255892915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 11860, + "time_per_iteration": 2.779348373413086 + }, + { + "auxiliary_loss_clip": 0.01101605, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.01661134, + "balance_loss_mlp": 1.03528762, + "epoch": 0.7131218998947844, + "flos": 19788252024960.0, + "grad_norm": 1.9453238784757083, + "language_loss": 0.64468431, + "learning_rate": 8.028268660246023e-07, + "loss": 0.66598678, + "num_input_tokens_seen": 255911480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 11861, + "time_per_iteration": 2.4438693523406982 + }, + { + "auxiliary_loss_clip": 0.01106949, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01536191, + "balance_loss_mlp": 1.03813672, + "epoch": 0.7131820231474523, + "flos": 26651894457600.0, + "grad_norm": 1.665544522358975, + "language_loss": 0.67246974, + "learning_rate": 8.025149072801849e-07, + "loss": 0.69381201, + "num_input_tokens_seen": 255931140, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 11862, + "time_per_iteration": 2.507708787918091 + }, + { + "auxiliary_loss_clip": 0.01099005, + "auxiliary_loss_mlp": 0.01033992, + "balance_loss_clip": 1.02357876, + "balance_loss_mlp": 1.03554285, + "epoch": 0.7132421464001203, + "flos": 29205609840000.0, + "grad_norm": 2.1581150638153117, + "language_loss": 0.66787547, + "learning_rate": 8.022029939445214e-07, + "loss": 0.68920541, + "num_input_tokens_seen": 255951665, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 11863, + "time_per_iteration": 2.508451223373413 + }, + { + "auxiliary_loss_clip": 0.01107413, + "auxiliary_loss_mlp": 0.01035848, + "balance_loss_clip": 1.02361131, + "balance_loss_mlp": 1.03781486, + "epoch": 0.7133022696527882, + "flos": 23073611535360.0, + "grad_norm": 9.155363012323315, + "language_loss": 0.65499818, + "learning_rate": 8.018911260294414e-07, + "loss": 0.67643076, + "num_input_tokens_seen": 255970055, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6953125, + "step": 11864, + "time_per_iteration": 2.4946515560150146 + }, + { + "auxiliary_loss_clip": 0.0110368, + "auxiliary_loss_mlp": 0.01028226, + "balance_loss_clip": 1.01631117, + "balance_loss_mlp": 1.03640735, + "epoch": 0.7133623929054562, + "flos": 17457111267840.0, + "grad_norm": 1.87343338578939, + "language_loss": 0.85730636, + "learning_rate": 8.015793035467697e-07, + "loss": 0.87862539, + "num_input_tokens_seen": 255987720, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 11865, + "time_per_iteration": 2.42283296585083 + }, + { + "auxiliary_loss_clip": 0.01100738, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.01554251, + "balance_loss_mlp": 1.03419256, + "epoch": 0.7134225161581241, + "flos": 19536554448000.0, + "grad_norm": 1.8472790526640706, + "language_loss": 0.74752319, + "learning_rate": 8.012675265083304e-07, + "loss": 0.76880735, + "num_input_tokens_seen": 256005490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 11866, + "time_per_iteration": 2.4545392990112305 + }, + { + "auxiliary_loss_clip": 0.01104452, + "auxiliary_loss_mlp": 0.01034488, + "balance_loss_clip": 1.02232265, + "balance_loss_mlp": 1.03757143, + "epoch": 0.7134826394107922, + "flos": 26250089944320.0, + "grad_norm": 2.6643205457919477, + "language_loss": 0.70109868, + "learning_rate": 8.009557949259464e-07, + "loss": 0.72248805, + "num_input_tokens_seen": 256026030, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 11867, + "time_per_iteration": 2.487058639526367 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.0102599, + "balance_loss_clip": 1.01518393, + "balance_loss_mlp": 1.03465641, + "epoch": 0.7135427626634601, + "flos": 15815311395840.0, + "grad_norm": 6.705448377548921, + "language_loss": 0.71701014, + "learning_rate": 8.006441088114397e-07, + "loss": 0.73825878, + "num_input_tokens_seen": 256043680, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 11868, + "time_per_iteration": 2.4669320583343506 + }, + { + "auxiliary_loss_clip": 0.01105504, + "auxiliary_loss_mlp": 0.01027422, + "balance_loss_clip": 1.01509547, + "balance_loss_mlp": 1.03705835, + "epoch": 0.7136028859161281, + "flos": 18223409041920.0, + "grad_norm": 2.2157289852805278, + "language_loss": 0.65810573, + "learning_rate": 8.003324681766286e-07, + "loss": 0.67943501, + "num_input_tokens_seen": 256059705, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 11869, + "time_per_iteration": 2.452075242996216 + }, + { + "auxiliary_loss_clip": 0.01100077, + "auxiliary_loss_mlp": 0.01024311, + "balance_loss_clip": 1.01321864, + "balance_loss_mlp": 1.03367877, + "epoch": 0.713663009168796, + "flos": 24314827956480.0, + "grad_norm": 1.5172430207890026, + "language_loss": 0.77797884, + "learning_rate": 8.000208730333298e-07, + "loss": 0.79922271, + "num_input_tokens_seen": 256079785, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 11870, + "time_per_iteration": 2.497041940689087 + }, + { + "auxiliary_loss_clip": 0.01101931, + "auxiliary_loss_mlp": 0.01029796, + "balance_loss_clip": 1.0176903, + "balance_loss_mlp": 1.03650808, + "epoch": 0.713723132421464, + "flos": 26538488242560.0, + "grad_norm": 1.6309688506128002, + "language_loss": 0.80767673, + "learning_rate": 7.997093233933597e-07, + "loss": 0.82899404, + "num_input_tokens_seen": 256099000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 11871, + "time_per_iteration": 2.519364595413208 + }, + { + "auxiliary_loss_clip": 0.01102602, + "auxiliary_loss_mlp": 0.01036614, + "balance_loss_clip": 1.02396536, + "balance_loss_mlp": 1.03430688, + "epoch": 0.7137832556741319, + "flos": 19865675790720.0, + "grad_norm": 1.5882335500802451, + "language_loss": 0.78899664, + "learning_rate": 7.993978192685331e-07, + "loss": 0.8103888, + "num_input_tokens_seen": 256117985, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 11872, + "time_per_iteration": 2.4607558250427246 + }, + { + "auxiliary_loss_clip": 0.01102685, + "auxiliary_loss_mlp": 0.01026379, + "balance_loss_clip": 1.01414764, + "balance_loss_mlp": 1.035676, + "epoch": 0.7138433789267999, + "flos": 21688932193920.0, + "grad_norm": 2.27961967349627, + "language_loss": 0.84102201, + "learning_rate": 7.990863606706606e-07, + "loss": 0.86231267, + "num_input_tokens_seen": 256134350, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 11873, + "time_per_iteration": 2.4343557357788086 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.0102625, + "balance_loss_clip": 1.01584864, + "balance_loss_mlp": 1.03362751, + "epoch": 0.713903502179468, + "flos": 17602729004160.0, + "grad_norm": 1.9049541609511427, + "language_loss": 0.86355829, + "learning_rate": 7.987749476115539e-07, + "loss": 0.88479608, + "num_input_tokens_seen": 256150610, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 11874, + "time_per_iteration": 2.4541850090026855 + }, + { + "auxiliary_loss_clip": 0.01102173, + "auxiliary_loss_mlp": 0.01026012, + "balance_loss_clip": 1.01449037, + "balance_loss_mlp": 1.0344789, + "epoch": 0.7139636254321359, + "flos": 18040336398720.0, + "grad_norm": 1.8939539946065194, + "language_loss": 0.82938111, + "learning_rate": 7.984635801030228e-07, + "loss": 0.85066295, + "num_input_tokens_seen": 256168620, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 11875, + "time_per_iteration": 2.4051244258880615 + }, + { + "auxiliary_loss_clip": 0.01106903, + "auxiliary_loss_mlp": 0.01031292, + "balance_loss_clip": 1.01805401, + "balance_loss_mlp": 1.03582454, + "epoch": 0.7140237486848039, + "flos": 23331127115520.0, + "grad_norm": 1.8810853083413022, + "language_loss": 0.69459707, + "learning_rate": 7.981522581568721e-07, + "loss": 0.71597898, + "num_input_tokens_seen": 256186700, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 11876, + "time_per_iteration": 2.461815595626831 + }, + { + "auxiliary_loss_clip": 0.0110347, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01702094, + "balance_loss_mlp": 1.0361371, + "epoch": 0.7140838719374718, + "flos": 16837077674880.0, + "grad_norm": 1.9368833564249184, + "language_loss": 0.78070778, + "learning_rate": 7.978409817849079e-07, + "loss": 0.80203062, + "num_input_tokens_seen": 256205390, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 11877, + "time_per_iteration": 2.420319080352783 + }, + { + "auxiliary_loss_clip": 0.01100487, + "auxiliary_loss_mlp": 0.01031738, + "balance_loss_clip": 1.02102709, + "balance_loss_mlp": 1.0355581, + "epoch": 0.7141439951901398, + "flos": 21142012734720.0, + "grad_norm": 6.763182431425842, + "language_loss": 0.69534928, + "learning_rate": 7.97529750998934e-07, + "loss": 0.71667153, + "num_input_tokens_seen": 256224575, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 11878, + "time_per_iteration": 2.544290781021118 + }, + { + "auxiliary_loss_clip": 0.01100118, + "auxiliary_loss_mlp": 0.01033007, + "balance_loss_clip": 1.02254677, + "balance_loss_mlp": 1.03579926, + "epoch": 0.7142041184428077, + "flos": 24717709877760.0, + "grad_norm": 1.7269032775367334, + "language_loss": 0.679344, + "learning_rate": 7.972185658107535e-07, + "loss": 0.70067525, + "num_input_tokens_seen": 256242130, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 11879, + "time_per_iteration": 2.4966022968292236 + }, + { + "auxiliary_loss_clip": 0.0110079, + "auxiliary_loss_mlp": 0.01031125, + "balance_loss_clip": 1.01867926, + "balance_loss_mlp": 1.03534412, + "epoch": 0.7142642416954758, + "flos": 21908202768000.0, + "grad_norm": 1.8974430539108489, + "language_loss": 0.68789601, + "learning_rate": 7.969074262321646e-07, + "loss": 0.70921516, + "num_input_tokens_seen": 256261920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 11880, + "time_per_iteration": 2.502960205078125 + }, + { + "auxiliary_loss_clip": 0.01101747, + "auxiliary_loss_mlp": 0.01035023, + "balance_loss_clip": 1.02314401, + "balance_loss_mlp": 1.03362322, + "epoch": 0.7143243649481437, + "flos": 20805636844800.0, + "grad_norm": 2.4282585669500105, + "language_loss": 0.80370951, + "learning_rate": 7.965963322749674e-07, + "loss": 0.82507718, + "num_input_tokens_seen": 256277970, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 11881, + "time_per_iteration": 2.470723867416382 + }, + { + "auxiliary_loss_clip": 0.01100316, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01772344, + "balance_loss_mlp": 1.03443766, + "epoch": 0.7143844882008117, + "flos": 27235011847680.0, + "grad_norm": 1.561021120261205, + "language_loss": 0.63214886, + "learning_rate": 7.962852839509579e-07, + "loss": 0.65343523, + "num_input_tokens_seen": 256298205, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.66015625, + "step": 11882, + "time_per_iteration": 2.509657859802246 + }, + { + "auxiliary_loss_clip": 0.01104591, + "auxiliary_loss_mlp": 0.01027179, + "balance_loss_clip": 1.01627707, + "balance_loss_mlp": 1.03739905, + "epoch": 0.7144446114534796, + "flos": 17929623703680.0, + "grad_norm": 2.019106640227393, + "language_loss": 0.68898022, + "learning_rate": 7.959742812719304e-07, + "loss": 0.71029788, + "num_input_tokens_seen": 256316685, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.671875, + "step": 11883, + "time_per_iteration": 2.443070650100708 + }, + { + "auxiliary_loss_clip": 0.01101954, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.02256155, + "balance_loss_mlp": 1.03674269, + "epoch": 0.7145047347061476, + "flos": 20740962407040.0, + "grad_norm": 1.8254173167373133, + "language_loss": 0.77734333, + "learning_rate": 7.956633242496788e-07, + "loss": 0.7987051, + "num_input_tokens_seen": 256334205, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 11884, + "time_per_iteration": 2.498660087585449 + }, + { + "auxiliary_loss_clip": 0.01107297, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01801157, + "balance_loss_mlp": 1.03647792, + "epoch": 0.7145648579588155, + "flos": 21178605715200.0, + "grad_norm": 2.2601581794211456, + "language_loss": 0.73881954, + "learning_rate": 7.953524128959954e-07, + "loss": 0.76020128, + "num_input_tokens_seen": 256353340, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 11885, + "time_per_iteration": 2.4516425132751465 + }, + { + "auxiliary_loss_clip": 0.01024577, + "auxiliary_loss_mlp": 0.01003775, + "balance_loss_clip": 1.00262451, + "balance_loss_mlp": 1.00405157, + "epoch": 0.7146249812114835, + "flos": 64784539509120.0, + "grad_norm": 0.8858821646270937, + "language_loss": 0.66354322, + "learning_rate": 7.95041547222669e-07, + "loss": 0.68382668, + "num_input_tokens_seen": 256411550, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 11886, + "time_per_iteration": 4.428006649017334 + }, + { + "auxiliary_loss_clip": 0.01101529, + "auxiliary_loss_mlp": 0.01029616, + "balance_loss_clip": 1.01769543, + "balance_loss_mlp": 1.03508115, + "epoch": 0.7146851044641516, + "flos": 18113881495680.0, + "grad_norm": 2.6640943514117006, + "language_loss": 0.75138283, + "learning_rate": 7.947307272414874e-07, + "loss": 0.77269423, + "num_input_tokens_seen": 256430360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11887, + "time_per_iteration": 2.449885129928589 + }, + { + "auxiliary_loss_clip": 0.01102615, + "auxiliary_loss_mlp": 0.01025326, + "balance_loss_clip": 1.01408505, + "balance_loss_mlp": 1.03539872, + "epoch": 0.7147452277168195, + "flos": 19243846517760.0, + "grad_norm": 1.6754616856197402, + "language_loss": 0.71326733, + "learning_rate": 7.944199529642372e-07, + "loss": 0.73454678, + "num_input_tokens_seen": 256449750, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 11888, + "time_per_iteration": 3.880155086517334 + }, + { + "auxiliary_loss_clip": 0.01103487, + "auxiliary_loss_mlp": 0.01031134, + "balance_loss_clip": 1.01867044, + "balance_loss_mlp": 1.03440201, + "epoch": 0.7148053509694875, + "flos": 23764712186880.0, + "grad_norm": 1.7956471800089868, + "language_loss": 0.84206235, + "learning_rate": 7.941092244027041e-07, + "loss": 0.86340851, + "num_input_tokens_seen": 256467330, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69140625, + "step": 11889, + "time_per_iteration": 3.8910415172576904 + }, + { + "auxiliary_loss_clip": 0.01102924, + "auxiliary_loss_mlp": 0.01024297, + "balance_loss_clip": 1.0128237, + "balance_loss_mlp": 1.0358007, + "epoch": 0.7148654742221554, + "flos": 22485322586880.0, + "grad_norm": 2.5861869043572994, + "language_loss": 0.75895607, + "learning_rate": 7.937985415686695e-07, + "loss": 0.78022826, + "num_input_tokens_seen": 256485705, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 11890, + "time_per_iteration": 3.942615270614624 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.01834106, + "balance_loss_mlp": 1.03469455, + "epoch": 0.7149255974748234, + "flos": 24679213476480.0, + "grad_norm": 1.4697874617816058, + "language_loss": 0.74033976, + "learning_rate": 7.934879044739147e-07, + "loss": 0.76163059, + "num_input_tokens_seen": 256504755, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 11891, + "time_per_iteration": 2.5003371238708496 + }, + { + "auxiliary_loss_clip": 0.01101426, + "auxiliary_loss_mlp": 0.0103332, + "balance_loss_clip": 1.02201867, + "balance_loss_mlp": 1.03495193, + "epoch": 0.7149857207274913, + "flos": 18405583845120.0, + "grad_norm": 2.2548483440838676, + "language_loss": 0.68382698, + "learning_rate": 7.931773131302211e-07, + "loss": 0.70517445, + "num_input_tokens_seen": 256523670, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 11892, + "time_per_iteration": 2.431938409805298 + }, + { + "auxiliary_loss_clip": 0.0110488, + "auxiliary_loss_mlp": 0.01033708, + "balance_loss_clip": 1.02009463, + "balance_loss_mlp": 1.03543699, + "epoch": 0.7150458439801594, + "flos": 24969515195520.0, + "grad_norm": 2.391594593507675, + "language_loss": 0.73810261, + "learning_rate": 7.928667675493632e-07, + "loss": 0.75948846, + "num_input_tokens_seen": 256542225, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6953125, + "step": 11893, + "time_per_iteration": 2.487308979034424 + }, + { + "auxiliary_loss_clip": 0.01103932, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.01721692, + "balance_loss_mlp": 1.03571689, + "epoch": 0.7151059672328273, + "flos": 16690777580160.0, + "grad_norm": 2.3568611580959016, + "language_loss": 0.65677148, + "learning_rate": 7.925562677431185e-07, + "loss": 0.6781069, + "num_input_tokens_seen": 256560730, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11894, + "time_per_iteration": 2.4283459186553955 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01771545, + "balance_loss_mlp": 1.03522325, + "epoch": 0.7151660904854953, + "flos": 27271820309760.0, + "grad_norm": 1.6791953890758138, + "language_loss": 0.77629852, + "learning_rate": 7.922458137232613e-07, + "loss": 0.79762185, + "num_input_tokens_seen": 256580505, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 11895, + "time_per_iteration": 2.478421926498413 + }, + { + "auxiliary_loss_clip": 0.01103559, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01749945, + "balance_loss_mlp": 1.03565729, + "epoch": 0.7152262137381632, + "flos": 18332254229760.0, + "grad_norm": 2.101834953638121, + "language_loss": 0.69718951, + "learning_rate": 7.919354055015643e-07, + "loss": 0.71852922, + "num_input_tokens_seen": 256597330, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 11896, + "time_per_iteration": 2.4343297481536865 + }, + { + "auxiliary_loss_clip": 0.01102918, + "auxiliary_loss_mlp": 0.01042714, + "balance_loss_clip": 1.03019083, + "balance_loss_mlp": 1.03482461, + "epoch": 0.7152863369908312, + "flos": 21799285752960.0, + "grad_norm": 2.363966655291517, + "language_loss": 0.86399305, + "learning_rate": 7.91625043089798e-07, + "loss": 0.88544941, + "num_input_tokens_seen": 256616030, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11897, + "time_per_iteration": 2.4417433738708496 + }, + { + "auxiliary_loss_clip": 0.01100281, + "auxiliary_loss_mlp": 0.01032292, + "balance_loss_clip": 1.02097273, + "balance_loss_mlp": 1.0358789, + "epoch": 0.7153464602434991, + "flos": 22158427887360.0, + "grad_norm": 2.1825882164427015, + "language_loss": 0.77925879, + "learning_rate": 7.913147264997304e-07, + "loss": 0.8005845, + "num_input_tokens_seen": 256635570, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 11898, + "time_per_iteration": 2.4770331382751465 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01695776, + "balance_loss_mlp": 1.03606868, + "epoch": 0.7154065834961671, + "flos": 24716057852160.0, + "grad_norm": 1.8319920355445916, + "language_loss": 0.73037088, + "learning_rate": 7.910044557431302e-07, + "loss": 0.75171709, + "num_input_tokens_seen": 256655290, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 11899, + "time_per_iteration": 2.4661285877227783 + }, + { + "auxiliary_loss_clip": 0.01100403, + "auxiliary_loss_mlp": 0.01033704, + "balance_loss_clip": 1.02177763, + "balance_loss_mlp": 1.03431213, + "epoch": 0.7154667067488351, + "flos": 22601494149120.0, + "grad_norm": 3.247812809543318, + "language_loss": 0.76076663, + "learning_rate": 7.906942308317614e-07, + "loss": 0.78210765, + "num_input_tokens_seen": 256671605, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 11900, + "time_per_iteration": 2.4811995029449463 + }, + { + "auxiliary_loss_clip": 0.01103689, + "auxiliary_loss_mlp": 0.01027857, + "balance_loss_clip": 1.01656795, + "balance_loss_mlp": 1.03645658, + "epoch": 0.7155268300015031, + "flos": 18771154513920.0, + "grad_norm": 1.955266248567226, + "language_loss": 0.80275625, + "learning_rate": 7.903840517773886e-07, + "loss": 0.82407176, + "num_input_tokens_seen": 256689680, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 11901, + "time_per_iteration": 2.423145294189453 + }, + { + "auxiliary_loss_clip": 0.01105274, + "auxiliary_loss_mlp": 0.01029008, + "balance_loss_clip": 1.01728368, + "balance_loss_mlp": 1.0356729, + "epoch": 0.7155869532541711, + "flos": 18296343607680.0, + "grad_norm": 2.026904555565968, + "language_loss": 0.81071323, + "learning_rate": 7.900739185917744e-07, + "loss": 0.83205605, + "num_input_tokens_seen": 256707760, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 11902, + "time_per_iteration": 2.459885835647583 + }, + { + "auxiliary_loss_clip": 0.01101351, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01437926, + "balance_loss_mlp": 1.03461826, + "epoch": 0.715647076506839, + "flos": 11980805783040.0, + "grad_norm": 1.7500024281838862, + "language_loss": 0.68114519, + "learning_rate": 7.897638312866785e-07, + "loss": 0.70241332, + "num_input_tokens_seen": 256724150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 11903, + "time_per_iteration": 2.407540798187256 + }, + { + "auxiliary_loss_clip": 0.01098245, + "auxiliary_loss_mlp": 0.010278, + "balance_loss_clip": 1.01664829, + "balance_loss_mlp": 1.03362346, + "epoch": 0.715707199759507, + "flos": 18951641377920.0, + "grad_norm": 1.6395674800408413, + "language_loss": 0.76098162, + "learning_rate": 7.894537898738589e-07, + "loss": 0.78224206, + "num_input_tokens_seen": 256742780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 11904, + "time_per_iteration": 2.4763503074645996 + }, + { + "auxiliary_loss_clip": 0.01102193, + "auxiliary_loss_mlp": 0.01036941, + "balance_loss_clip": 1.02438211, + "balance_loss_mlp": 1.03558111, + "epoch": 0.7157673230121749, + "flos": 15304410299520.0, + "grad_norm": 2.193780720610546, + "language_loss": 0.72085339, + "learning_rate": 7.891437943650727e-07, + "loss": 0.74224472, + "num_input_tokens_seen": 256761355, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 11905, + "time_per_iteration": 2.42999267578125 + }, + { + "auxiliary_loss_clip": 0.01099839, + "auxiliary_loss_mlp": 0.01029451, + "balance_loss_clip": 1.01815557, + "balance_loss_mlp": 1.03396761, + "epoch": 0.715827446264843, + "flos": 23221850964480.0, + "grad_norm": 1.8001319449198983, + "language_loss": 0.78033888, + "learning_rate": 7.88833844772076e-07, + "loss": 0.80163181, + "num_input_tokens_seen": 256781335, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 11906, + "time_per_iteration": 2.483344078063965 + }, + { + "auxiliary_loss_clip": 0.01024215, + "auxiliary_loss_mlp": 0.01002687, + "balance_loss_clip": 1.0016793, + "balance_loss_mlp": 1.00366879, + "epoch": 0.7158875695175109, + "flos": 60975421833600.0, + "grad_norm": 0.733638122069069, + "language_loss": 0.55290663, + "learning_rate": 7.885239411066205e-07, + "loss": 0.57317567, + "num_input_tokens_seen": 256838890, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20507812, + "step": 11907, + "time_per_iteration": 2.9801692962646484 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01029943, + "balance_loss_clip": 1.01846945, + "balance_loss_mlp": 1.03456974, + "epoch": 0.7159476927701789, + "flos": 17128780024320.0, + "grad_norm": 1.7110812642484816, + "language_loss": 0.69928622, + "learning_rate": 7.882140833804593e-07, + "loss": 0.72059584, + "num_input_tokens_seen": 256858145, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 11908, + "time_per_iteration": 2.4816782474517822 + }, + { + "auxiliary_loss_clip": 0.01102562, + "auxiliary_loss_mlp": 0.01031677, + "balance_loss_clip": 1.01918399, + "balance_loss_mlp": 1.03589427, + "epoch": 0.7160078160228468, + "flos": 22490601886080.0, + "grad_norm": 1.7432604153438784, + "language_loss": 0.71158898, + "learning_rate": 7.879042716053415e-07, + "loss": 0.73293138, + "num_input_tokens_seen": 256878545, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66796875, + "step": 11909, + "time_per_iteration": 2.463728189468384 + }, + { + "auxiliary_loss_clip": 0.01102467, + "auxiliary_loss_mlp": 0.01030839, + "balance_loss_clip": 1.01932335, + "balance_loss_mlp": 1.0351603, + "epoch": 0.7160679392755148, + "flos": 30590935626240.0, + "grad_norm": 2.4467362846605014, + "language_loss": 0.75301147, + "learning_rate": 7.875945057930144e-07, + "loss": 0.7743445, + "num_input_tokens_seen": 256899920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 11910, + "time_per_iteration": 2.552417755126953 + }, + { + "auxiliary_loss_clip": 0.01101078, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.02067399, + "balance_loss_mlp": 1.03550065, + "epoch": 0.7161280625281827, + "flos": 21323648833920.0, + "grad_norm": 1.495993401769944, + "language_loss": 0.7667104, + "learning_rate": 7.872847859552251e-07, + "loss": 0.78802884, + "num_input_tokens_seen": 256918460, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.65625, + "step": 11911, + "time_per_iteration": 2.441070079803467 + }, + { + "auxiliary_loss_clip": 0.01101901, + "auxiliary_loss_mlp": 0.01028053, + "balance_loss_clip": 1.01610255, + "balance_loss_mlp": 1.03523242, + "epoch": 0.7161881857808508, + "flos": 61860078921600.0, + "grad_norm": 1.748429659384578, + "language_loss": 0.58908474, + "learning_rate": 7.869751121037192e-07, + "loss": 0.61038435, + "num_input_tokens_seen": 256942015, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 11912, + "time_per_iteration": 2.857440948486328 + }, + { + "auxiliary_loss_clip": 0.01101647, + "auxiliary_loss_mlp": 0.01032599, + "balance_loss_clip": 1.02072561, + "balance_loss_mlp": 1.03633833, + "epoch": 0.7162483090335187, + "flos": 20812101292800.0, + "grad_norm": 2.5901065267477907, + "language_loss": 0.77851343, + "learning_rate": 7.866654842502376e-07, + "loss": 0.79985595, + "num_input_tokens_seen": 256961065, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11913, + "time_per_iteration": 2.4704270362854004 + }, + { + "auxiliary_loss_clip": 0.01097344, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.01518047, + "balance_loss_mlp": 1.03362048, + "epoch": 0.7163084322861867, + "flos": 24097532630400.0, + "grad_norm": 1.6674872832299297, + "language_loss": 0.7374261, + "learning_rate": 7.863559024065234e-07, + "loss": 0.75865406, + "num_input_tokens_seen": 256982165, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 11914, + "time_per_iteration": 2.4930355548858643 + }, + { + "auxiliary_loss_clip": 0.01097032, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01763296, + "balance_loss_mlp": 1.03384876, + "epoch": 0.7163685555388547, + "flos": 20080888128000.0, + "grad_norm": 1.6897507669283607, + "language_loss": 0.74089867, + "learning_rate": 7.860463665843143e-07, + "loss": 0.76215488, + "num_input_tokens_seen": 256999825, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 11915, + "time_per_iteration": 2.4409830570220947 + }, + { + "auxiliary_loss_clip": 0.01101198, + "auxiliary_loss_mlp": 0.01026687, + "balance_loss_clip": 1.01569629, + "balance_loss_mlp": 1.03444886, + "epoch": 0.7164286787915226, + "flos": 17456967613440.0, + "grad_norm": 1.8754792377471143, + "language_loss": 0.81102198, + "learning_rate": 7.85736876795349e-07, + "loss": 0.83230084, + "num_input_tokens_seen": 257017450, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11916, + "time_per_iteration": 2.459618330001831 + }, + { + "auxiliary_loss_clip": 0.01101693, + "auxiliary_loss_mlp": 0.01030601, + "balance_loss_clip": 1.01945496, + "balance_loss_mlp": 1.03565669, + "epoch": 0.7164888020441906, + "flos": 19718908819200.0, + "grad_norm": 1.9464707558133532, + "language_loss": 0.68163168, + "learning_rate": 7.854274330513626e-07, + "loss": 0.70295465, + "num_input_tokens_seen": 257035465, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 11917, + "time_per_iteration": 2.4127745628356934 + }, + { + "auxiliary_loss_clip": 0.01101517, + "auxiliary_loss_mlp": 0.01028462, + "balance_loss_clip": 1.01660061, + "balance_loss_mlp": 1.0357312, + "epoch": 0.7165489252968585, + "flos": 21470523546240.0, + "grad_norm": 1.6865560164096236, + "language_loss": 0.75851363, + "learning_rate": 7.851180353640896e-07, + "loss": 0.77981341, + "num_input_tokens_seen": 257053750, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 11918, + "time_per_iteration": 2.4734885692596436 + }, + { + "auxiliary_loss_clip": 0.01024332, + "auxiliary_loss_mlp": 0.00998276, + "balance_loss_clip": 0.99721545, + "balance_loss_mlp": 1.00387406, + "epoch": 0.7166090485495266, + "flos": 69928060464000.0, + "grad_norm": 0.6281271868389183, + "language_loss": 0.53900385, + "learning_rate": 7.848086837452639e-07, + "loss": 0.55922985, + "num_input_tokens_seen": 257121215, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20507812, + "step": 11919, + "time_per_iteration": 3.0739991664886475 + }, + { + "auxiliary_loss_clip": 0.01103551, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.0166924, + "balance_loss_mlp": 1.03664875, + "epoch": 0.7166691718021945, + "flos": 27343892949120.0, + "grad_norm": 1.814886397013554, + "language_loss": 0.69109583, + "learning_rate": 7.844993782066132e-07, + "loss": 0.71240735, + "num_input_tokens_seen": 257143370, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 11920, + "time_per_iteration": 2.544965982437134 + }, + { + "auxiliary_loss_clip": 0.01101615, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.02296519, + "balance_loss_mlp": 1.03518677, + "epoch": 0.7167292950548625, + "flos": 30408868563840.0, + "grad_norm": 2.316743559144869, + "language_loss": 0.74621791, + "learning_rate": 7.841901187598678e-07, + "loss": 0.76758158, + "num_input_tokens_seen": 257162160, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11921, + "time_per_iteration": 2.526437282562256 + }, + { + "auxiliary_loss_clip": 0.01105899, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.01924133, + "balance_loss_mlp": 1.0359118, + "epoch": 0.7167894183075304, + "flos": 14571257800320.0, + "grad_norm": 2.001999520631163, + "language_loss": 0.75461966, + "learning_rate": 7.83880905416755e-07, + "loss": 0.77600539, + "num_input_tokens_seen": 257179300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 11922, + "time_per_iteration": 2.4796934127807617 + }, + { + "auxiliary_loss_clip": 0.0102356, + "auxiliary_loss_mlp": 0.01004637, + "balance_loss_clip": 1.0036118, + "balance_loss_mlp": 1.00325036, + "epoch": 0.7168495415601984, + "flos": 64110674407680.0, + "grad_norm": 0.7529363745673505, + "language_loss": 0.55118704, + "learning_rate": 7.83571738189001e-07, + "loss": 0.57146901, + "num_input_tokens_seen": 257235470, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.203125, + "step": 11923, + "time_per_iteration": 2.8653676509857178 + }, + { + "auxiliary_loss_clip": 0.01101474, + "auxiliary_loss_mlp": 0.01034043, + "balance_loss_clip": 1.02191389, + "balance_loss_mlp": 1.03463423, + "epoch": 0.7169096648128663, + "flos": 24681440119680.0, + "grad_norm": 1.5657552163313224, + "language_loss": 0.7707153, + "learning_rate": 7.832626170883279e-07, + "loss": 0.79207051, + "num_input_tokens_seen": 257255850, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 11924, + "time_per_iteration": 2.4798498153686523 + }, + { + "auxiliary_loss_clip": 0.01099287, + "auxiliary_loss_mlp": 0.01026383, + "balance_loss_clip": 1.01563597, + "balance_loss_mlp": 1.03447676, + "epoch": 0.7169697880655344, + "flos": 20667525050880.0, + "grad_norm": 1.8554693193395075, + "language_loss": 0.68279767, + "learning_rate": 7.829535421264588e-07, + "loss": 0.70405436, + "num_input_tokens_seen": 257275425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 11925, + "time_per_iteration": 2.456970453262329 + }, + { + "auxiliary_loss_clip": 0.01094381, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.0189774, + "balance_loss_mlp": 1.03209913, + "epoch": 0.7170299113182023, + "flos": 21032700670080.0, + "grad_norm": 1.4556850136555692, + "language_loss": 0.77406371, + "learning_rate": 7.826445133151133e-07, + "loss": 0.79530406, + "num_input_tokens_seen": 257295740, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62109375, + "step": 11926, + "time_per_iteration": 2.47904109954834 + }, + { + "auxiliary_loss_clip": 0.01104854, + "auxiliary_loss_mlp": 0.01029406, + "balance_loss_clip": 1.01756239, + "balance_loss_mlp": 1.03482664, + "epoch": 0.7170900345708703, + "flos": 22893304239360.0, + "grad_norm": 1.9978148890029475, + "language_loss": 0.77397847, + "learning_rate": 7.823355306660093e-07, + "loss": 0.79532105, + "num_input_tokens_seen": 257315970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.703125, + "step": 11927, + "time_per_iteration": 2.4695799350738525 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01029701, + "balance_loss_clip": 1.01752985, + "balance_loss_mlp": 1.03606367, + "epoch": 0.7171501578235383, + "flos": 15518688883200.0, + "grad_norm": 1.633304495033459, + "language_loss": 0.69208646, + "learning_rate": 7.820265941908642e-07, + "loss": 0.71338403, + "num_input_tokens_seen": 257334230, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.640625, + "step": 11928, + "time_per_iteration": 3.8939363956451416 + }, + { + "auxiliary_loss_clip": 0.01097285, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.01755297, + "balance_loss_mlp": 1.03416717, + "epoch": 0.7172102810762062, + "flos": 26104292640000.0, + "grad_norm": 1.8722089290497335, + "language_loss": 0.65309197, + "learning_rate": 7.817177039013931e-07, + "loss": 0.67434746, + "num_input_tokens_seen": 257352145, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 11929, + "time_per_iteration": 2.6483962535858154 + }, + { + "auxiliary_loss_clip": 0.01101349, + "auxiliary_loss_mlp": 0.01027062, + "balance_loss_clip": 1.01543295, + "balance_loss_mlp": 1.03426468, + "epoch": 0.7172704043288742, + "flos": 21506649649920.0, + "grad_norm": 1.9043937603193066, + "language_loss": 0.69810534, + "learning_rate": 7.81408859809308e-07, + "loss": 0.71938944, + "num_input_tokens_seen": 257371460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 11930, + "time_per_iteration": 3.861077308654785 + }, + { + "auxiliary_loss_clip": 0.01099761, + "auxiliary_loss_mlp": 0.01027844, + "balance_loss_clip": 1.01666808, + "balance_loss_mlp": 1.0326252, + "epoch": 0.7173305275815421, + "flos": 18770939032320.0, + "grad_norm": 1.6949604037705792, + "language_loss": 0.80755305, + "learning_rate": 7.811000619263219e-07, + "loss": 0.82882911, + "num_input_tokens_seen": 257390800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 11931, + "time_per_iteration": 5.3606438636779785 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01031463, + "balance_loss_clip": 1.02060318, + "balance_loss_mlp": 1.03377175, + "epoch": 0.7173906508342102, + "flos": 16179876483840.0, + "grad_norm": 2.0368865181542843, + "language_loss": 0.78136313, + "learning_rate": 7.80791310264143e-07, + "loss": 0.8026641, + "num_input_tokens_seen": 257407495, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11932, + "time_per_iteration": 2.4471938610076904 + }, + { + "auxiliary_loss_clip": 0.01098455, + "auxiliary_loss_mlp": 0.01027853, + "balance_loss_clip": 1.01697493, + "balance_loss_mlp": 1.03404713, + "epoch": 0.7174507740868781, + "flos": 26613864933120.0, + "grad_norm": 2.4237059069381446, + "language_loss": 0.75071502, + "learning_rate": 7.804826048344803e-07, + "loss": 0.77197808, + "num_input_tokens_seen": 257429675, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.64453125, + "step": 11933, + "time_per_iteration": 2.5671815872192383 + }, + { + "auxiliary_loss_clip": 0.01107402, + "auxiliary_loss_mlp": 0.01034606, + "balance_loss_clip": 1.02070642, + "balance_loss_mlp": 1.03681624, + "epoch": 0.7175108973395461, + "flos": 18432911116800.0, + "grad_norm": 2.920076286079433, + "language_loss": 0.69595957, + "learning_rate": 7.801739456490388e-07, + "loss": 0.71737969, + "num_input_tokens_seen": 257442765, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.703125, + "step": 11934, + "time_per_iteration": 2.4200711250305176 + }, + { + "auxiliary_loss_clip": 0.0109937, + "auxiliary_loss_mlp": 0.01033447, + "balance_loss_clip": 1.02228904, + "balance_loss_mlp": 1.03382134, + "epoch": 0.717571020592214, + "flos": 23914962777600.0, + "grad_norm": 2.1353095308292858, + "language_loss": 0.86605275, + "learning_rate": 7.798653327195237e-07, + "loss": 0.8873809, + "num_input_tokens_seen": 257459310, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 11935, + "time_per_iteration": 2.4989066123962402 + }, + { + "auxiliary_loss_clip": 0.0110051, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01663518, + "balance_loss_mlp": 1.03355277, + "epoch": 0.717631143844882, + "flos": 38256930109440.0, + "grad_norm": 1.5482941622525788, + "language_loss": 0.73668665, + "learning_rate": 7.795567660576388e-07, + "loss": 0.75797164, + "num_input_tokens_seen": 257484750, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 11936, + "time_per_iteration": 2.5941200256347656 + }, + { + "auxiliary_loss_clip": 0.01023485, + "auxiliary_loss_mlp": 0.00998978, + "balance_loss_clip": 0.99795878, + "balance_loss_mlp": 1.00320697, + "epoch": 0.7176912670975499, + "flos": 65515896328320.0, + "grad_norm": 0.7612162175951352, + "language_loss": 0.5594666, + "learning_rate": 7.79248245675082e-07, + "loss": 0.57969117, + "num_input_tokens_seen": 257543110, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.203125, + "step": 11937, + "time_per_iteration": 3.0358333587646484 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.0194416, + "balance_loss_mlp": 1.03557646, + "epoch": 0.717751390350218, + "flos": 31281066610560.0, + "grad_norm": 1.9834308333096748, + "language_loss": 0.54777831, + "learning_rate": 7.789397715835542e-07, + "loss": 0.56913126, + "num_input_tokens_seen": 257567410, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 11938, + "time_per_iteration": 2.498337984085083 + }, + { + "auxiliary_loss_clip": 0.01096235, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01791584, + "balance_loss_mlp": 1.03201962, + "epoch": 0.7178115136028859, + "flos": 19859031774720.0, + "grad_norm": 1.6763116198702877, + "language_loss": 0.76891506, + "learning_rate": 7.786313437947527e-07, + "loss": 0.79016298, + "num_input_tokens_seen": 257586270, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 11939, + "time_per_iteration": 2.4648613929748535 + }, + { + "auxiliary_loss_clip": 0.0102339, + "auxiliary_loss_mlp": 0.01004556, + "balance_loss_clip": 1.00347769, + "balance_loss_mlp": 1.003003, + "epoch": 0.7178716368555539, + "flos": 64348655967360.0, + "grad_norm": 0.7581492176008457, + "language_loss": 0.61391574, + "learning_rate": 7.783229623203738e-07, + "loss": 0.63419521, + "num_input_tokens_seen": 257647415, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.20410156, + "step": 11940, + "time_per_iteration": 3.0383803844451904 + }, + { + "auxiliary_loss_clip": 0.0109722, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.01903307, + "balance_loss_mlp": 1.03327632, + "epoch": 0.7179317601082219, + "flos": 26762607152640.0, + "grad_norm": 1.5272164711726817, + "language_loss": 0.58784437, + "learning_rate": 7.780146271721097e-07, + "loss": 0.60911918, + "num_input_tokens_seen": 257669795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 11941, + "time_per_iteration": 2.5290164947509766 + }, + { + "auxiliary_loss_clip": 0.01100557, + "auxiliary_loss_mlp": 0.01029246, + "balance_loss_clip": 1.0178616, + "balance_loss_mlp": 1.03522658, + "epoch": 0.7179918833608898, + "flos": 23513804709120.0, + "grad_norm": 1.9189885732421792, + "language_loss": 0.79849315, + "learning_rate": 7.777063383616543e-07, + "loss": 0.81979108, + "num_input_tokens_seen": 257687415, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 11942, + "time_per_iteration": 2.5065739154815674 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.0103798, + "balance_loss_clip": 1.02638674, + "balance_loss_mlp": 1.03522158, + "epoch": 0.7180520066135578, + "flos": 17165588486400.0, + "grad_norm": 2.0597149659122636, + "language_loss": 0.66328835, + "learning_rate": 7.773980959006968e-07, + "loss": 0.6846866, + "num_input_tokens_seen": 257706215, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 11943, + "time_per_iteration": 2.4654104709625244 + }, + { + "auxiliary_loss_clip": 0.01097892, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01696157, + "balance_loss_mlp": 1.03440082, + "epoch": 0.7181121298662257, + "flos": 17566638814080.0, + "grad_norm": 1.9764370465475432, + "language_loss": 0.79013598, + "learning_rate": 7.770898998009254e-07, + "loss": 0.81140125, + "num_input_tokens_seen": 257724740, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6328125, + "step": 11944, + "time_per_iteration": 2.445718765258789 + }, + { + "auxiliary_loss_clip": 0.01102899, + "auxiliary_loss_mlp": 0.01036625, + "balance_loss_clip": 1.02368522, + "balance_loss_mlp": 1.03503132, + "epoch": 0.7181722531188938, + "flos": 11947660508160.0, + "grad_norm": 2.260846776642364, + "language_loss": 0.62923992, + "learning_rate": 7.767817500740277e-07, + "loss": 0.65063506, + "num_input_tokens_seen": 257742060, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 11945, + "time_per_iteration": 2.4455084800720215 + }, + { + "auxiliary_loss_clip": 0.01023274, + "auxiliary_loss_mlp": 0.01000772, + "balance_loss_clip": 0.99966967, + "balance_loss_mlp": 1.00287986, + "epoch": 0.7182323763715617, + "flos": 65503649790720.0, + "grad_norm": 0.7012511616617018, + "language_loss": 0.51091176, + "learning_rate": 7.76473646731689e-07, + "loss": 0.53115225, + "num_input_tokens_seen": 257802250, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20410156, + "step": 11946, + "time_per_iteration": 2.993520498275757 + }, + { + "auxiliary_loss_clip": 0.0110474, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.0228188, + "balance_loss_mlp": 1.03633344, + "epoch": 0.7182924996242297, + "flos": 20630932070400.0, + "grad_norm": 1.5511387132101104, + "language_loss": 0.74426639, + "learning_rate": 7.761655897855925e-07, + "loss": 0.76567119, + "num_input_tokens_seen": 257821155, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 11947, + "time_per_iteration": 2.5280697345733643 + }, + { + "auxiliary_loss_clip": 0.01098111, + "auxiliary_loss_mlp": 0.01028479, + "balance_loss_clip": 1.01682568, + "balance_loss_mlp": 1.03252912, + "epoch": 0.7183526228768976, + "flos": 16216433550720.0, + "grad_norm": 1.7377460165223417, + "language_loss": 0.72264934, + "learning_rate": 7.758575792474187e-07, + "loss": 0.74391532, + "num_input_tokens_seen": 257839905, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 11948, + "time_per_iteration": 2.404911994934082 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01038628, + "balance_loss_clip": 1.0260042, + "balance_loss_mlp": 1.0358839, + "epoch": 0.7184127461295656, + "flos": 22232655342720.0, + "grad_norm": 1.5225277290119825, + "language_loss": 0.71613109, + "learning_rate": 7.755496151288483e-07, + "loss": 0.73755664, + "num_input_tokens_seen": 257860055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11949, + "time_per_iteration": 2.4918761253356934 + }, + { + "auxiliary_loss_clip": 0.01099737, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01983905, + "balance_loss_mlp": 1.03520155, + "epoch": 0.7184728693822335, + "flos": 27344503480320.0, + "grad_norm": 1.917874476636917, + "language_loss": 0.75913876, + "learning_rate": 7.752416974415598e-07, + "loss": 0.78044307, + "num_input_tokens_seen": 257879315, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 11950, + "time_per_iteration": 2.4783732891082764 + }, + { + "auxiliary_loss_clip": 0.01104047, + "auxiliary_loss_mlp": 0.01029869, + "balance_loss_clip": 1.01741719, + "balance_loss_mlp": 1.03651667, + "epoch": 0.7185329926349016, + "flos": 16508530949760.0, + "grad_norm": 2.3664494047814872, + "language_loss": 0.67457062, + "learning_rate": 7.749338261972282e-07, + "loss": 0.69590974, + "num_input_tokens_seen": 257896570, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 11951, + "time_per_iteration": 2.4524526596069336 + }, + { + "auxiliary_loss_clip": 0.01106378, + "auxiliary_loss_mlp": 0.01029859, + "balance_loss_clip": 1.01682329, + "balance_loss_mlp": 1.03777874, + "epoch": 0.7185931158875695, + "flos": 23951052967680.0, + "grad_norm": 1.7288194945229958, + "language_loss": 0.78023463, + "learning_rate": 7.746260014075286e-07, + "loss": 0.80159694, + "num_input_tokens_seen": 257916855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 11952, + "time_per_iteration": 2.49094295501709 + }, + { + "auxiliary_loss_clip": 0.01103687, + "auxiliary_loss_mlp": 0.01033038, + "balance_loss_clip": 1.02052677, + "balance_loss_mlp": 1.03563547, + "epoch": 0.7186532391402375, + "flos": 26542007775360.0, + "grad_norm": 1.7793096783925773, + "language_loss": 0.74963003, + "learning_rate": 7.743182230841352e-07, + "loss": 0.77099729, + "num_input_tokens_seen": 257937140, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 11953, + "time_per_iteration": 2.500009298324585 + }, + { + "auxiliary_loss_clip": 0.01102038, + "auxiliary_loss_mlp": 0.01028598, + "balance_loss_clip": 1.0169332, + "balance_loss_mlp": 1.03495383, + "epoch": 0.7187133623929055, + "flos": 22383049587840.0, + "grad_norm": 1.7832624252992626, + "language_loss": 0.72971594, + "learning_rate": 7.740104912387164e-07, + "loss": 0.75102234, + "num_input_tokens_seen": 257956785, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 11954, + "time_per_iteration": 2.4608652591705322 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.01034266, + "balance_loss_clip": 1.02251804, + "balance_loss_mlp": 1.03668714, + "epoch": 0.7187734856455734, + "flos": 15779580341760.0, + "grad_norm": 1.601255234350909, + "language_loss": 0.74186033, + "learning_rate": 7.737028058829425e-07, + "loss": 0.7632345, + "num_input_tokens_seen": 257975455, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 11955, + "time_per_iteration": 2.474217176437378 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.01877582, + "balance_loss_mlp": 1.0353359, + "epoch": 0.7188336088982414, + "flos": 31759612531200.0, + "grad_norm": 1.6751832358498482, + "language_loss": 0.73376679, + "learning_rate": 7.733951670284817e-07, + "loss": 0.75508881, + "num_input_tokens_seen": 257996850, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11956, + "time_per_iteration": 2.5315232276916504 + }, + { + "auxiliary_loss_clip": 0.01101581, + "auxiliary_loss_mlp": 0.01028517, + "balance_loss_clip": 1.01688766, + "balance_loss_mlp": 1.03342509, + "epoch": 0.7188937321509093, + "flos": 21465208333440.0, + "grad_norm": 2.7995163806109407, + "language_loss": 0.7065621, + "learning_rate": 7.730875746869987e-07, + "loss": 0.72786307, + "num_input_tokens_seen": 258016145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 11957, + "time_per_iteration": 2.479146957397461 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01036925, + "balance_loss_clip": 1.02427661, + "balance_loss_mlp": 1.03408146, + "epoch": 0.7189538554035774, + "flos": 27271497087360.0, + "grad_norm": 1.9581401117139001, + "language_loss": 0.73586559, + "learning_rate": 7.727800288701582e-07, + "loss": 0.75725639, + "num_input_tokens_seen": 258035420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 11958, + "time_per_iteration": 2.50201416015625 + }, + { + "auxiliary_loss_clip": 0.01099164, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.0223763, + "balance_loss_mlp": 1.03451216, + "epoch": 0.7190139786562453, + "flos": 21580625710080.0, + "grad_norm": 1.602205422840009, + "language_loss": 0.84252381, + "learning_rate": 7.724725295896215e-07, + "loss": 0.86385846, + "num_input_tokens_seen": 258053520, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 11959, + "time_per_iteration": 2.4619383811950684 + }, + { + "auxiliary_loss_clip": 0.01107021, + "auxiliary_loss_mlp": 0.01029979, + "balance_loss_clip": 1.01745617, + "balance_loss_mlp": 1.0378958, + "epoch": 0.7190741019089133, + "flos": 26721237663360.0, + "grad_norm": 1.9033832243828488, + "language_loss": 0.81933033, + "learning_rate": 7.7216507685705e-07, + "loss": 0.84070033, + "num_input_tokens_seen": 258073020, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 11960, + "time_per_iteration": 2.4611432552337646 + }, + { + "auxiliary_loss_clip": 0.01100369, + "auxiliary_loss_mlp": 0.01031508, + "balance_loss_clip": 1.01926565, + "balance_loss_mlp": 1.03601289, + "epoch": 0.7191342251615812, + "flos": 26104759516800.0, + "grad_norm": 1.6005750914484573, + "language_loss": 0.77382779, + "learning_rate": 7.718576706841013e-07, + "loss": 0.79514658, + "num_input_tokens_seen": 258093155, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.64453125, + "step": 11961, + "time_per_iteration": 2.490257978439331 + }, + { + "auxiliary_loss_clip": 0.01096696, + "auxiliary_loss_mlp": 0.01030381, + "balance_loss_clip": 1.01970601, + "balance_loss_mlp": 1.03359604, + "epoch": 0.7191943484142492, + "flos": 22967028904320.0, + "grad_norm": 1.493885754938081, + "language_loss": 0.75197971, + "learning_rate": 7.715503110824326e-07, + "loss": 0.7732504, + "num_input_tokens_seen": 258113905, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 11962, + "time_per_iteration": 2.444990873336792 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01031338, + "balance_loss_clip": 1.01888692, + "balance_loss_mlp": 1.03441834, + "epoch": 0.7192544716669171, + "flos": 22565332131840.0, + "grad_norm": 1.6695078722173347, + "language_loss": 0.75041807, + "learning_rate": 7.712429980637001e-07, + "loss": 0.7717514, + "num_input_tokens_seen": 258132820, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 11963, + "time_per_iteration": 2.4661693572998047 + }, + { + "auxiliary_loss_clip": 0.01105424, + "auxiliary_loss_mlp": 0.01033709, + "balance_loss_clip": 1.02045298, + "balance_loss_mlp": 1.03614235, + "epoch": 0.7193145949195852, + "flos": 18982200873600.0, + "grad_norm": 8.488875605489067, + "language_loss": 0.80680382, + "learning_rate": 7.709357316395564e-07, + "loss": 0.82819521, + "num_input_tokens_seen": 258148055, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 11964, + "time_per_iteration": 2.400843620300293 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.01819539, + "balance_loss_mlp": 1.03335524, + "epoch": 0.7193747181722531, + "flos": 18004246208640.0, + "grad_norm": 1.6851421500357613, + "language_loss": 0.74987501, + "learning_rate": 7.70628511821652e-07, + "loss": 0.77116108, + "num_input_tokens_seen": 258165995, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 11965, + "time_per_iteration": 2.455549955368042 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01032188, + "balance_loss_clip": 1.01950407, + "balance_loss_mlp": 1.03589249, + "epoch": 0.7194348414249211, + "flos": 24389414547840.0, + "grad_norm": 1.6225024257282918, + "language_loss": 0.77548587, + "learning_rate": 7.703213386216377e-07, + "loss": 0.79684699, + "num_input_tokens_seen": 258186165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 11966, + "time_per_iteration": 2.4651193618774414 + }, + { + "auxiliary_loss_clip": 0.01101346, + "auxiliary_loss_mlp": 0.01029248, + "balance_loss_clip": 1.01745796, + "balance_loss_mlp": 1.03470814, + "epoch": 0.7194949646775891, + "flos": 22163455791360.0, + "grad_norm": 1.9626871533411263, + "language_loss": 0.72638512, + "learning_rate": 7.700142120511619e-07, + "loss": 0.74769109, + "num_input_tokens_seen": 258204595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 11967, + "time_per_iteration": 2.4732322692871094 + }, + { + "auxiliary_loss_clip": 0.01098168, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01812136, + "balance_loss_mlp": 1.03623199, + "epoch": 0.719555087930257, + "flos": 20266366982400.0, + "grad_norm": 1.8100027522509434, + "language_loss": 0.81220973, + "learning_rate": 7.6970713212187e-07, + "loss": 0.83347309, + "num_input_tokens_seen": 258223110, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 11968, + "time_per_iteration": 2.4276745319366455 + }, + { + "auxiliary_loss_clip": 0.01098632, + "auxiliary_loss_mlp": 0.01027926, + "balance_loss_clip": 1.01651216, + "balance_loss_mlp": 1.03366137, + "epoch": 0.719615211182925, + "flos": 24716309247360.0, + "grad_norm": 2.0102886054893268, + "language_loss": 0.76459819, + "learning_rate": 7.69400098845407e-07, + "loss": 0.78586376, + "num_input_tokens_seen": 258242660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 11969, + "time_per_iteration": 3.861771821975708 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01030429, + "balance_loss_clip": 1.01781702, + "balance_loss_mlp": 1.03329253, + "epoch": 0.719675334435593, + "flos": 20009641501440.0, + "grad_norm": 1.7792544853917616, + "language_loss": 0.70936543, + "learning_rate": 7.69093112233417e-07, + "loss": 0.73067832, + "num_input_tokens_seen": 258261850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 11970, + "time_per_iteration": 2.421149253845215 + }, + { + "auxiliary_loss_clip": 0.01023909, + "auxiliary_loss_mlp": 0.00997715, + "balance_loss_clip": 0.99661201, + "balance_loss_mlp": 1.00355303, + "epoch": 0.719735457688261, + "flos": 44199861177600.0, + "grad_norm": 0.9239284754087862, + "language_loss": 0.60847962, + "learning_rate": 7.68786172297538e-07, + "loss": 0.62869585, + "num_input_tokens_seen": 258312570, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.203125, + "step": 11971, + "time_per_iteration": 4.394974231719971 + }, + { + "auxiliary_loss_clip": 0.01108101, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.02012718, + "balance_loss_mlp": 1.03647828, + "epoch": 0.7197955809409289, + "flos": 16802890905600.0, + "grad_norm": 2.2193219685375647, + "language_loss": 0.79842031, + "learning_rate": 7.684792790494105e-07, + "loss": 0.8198278, + "num_input_tokens_seen": 258331600, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.71875, + "step": 11972, + "time_per_iteration": 3.8465628623962402 + }, + { + "auxiliary_loss_clip": 0.01104665, + "auxiliary_loss_mlp": 0.01036669, + "balance_loss_clip": 1.02426565, + "balance_loss_mlp": 1.0365268, + "epoch": 0.7198557041935969, + "flos": 24535391420160.0, + "grad_norm": 1.5934226335424646, + "language_loss": 0.75385857, + "learning_rate": 7.681724325006733e-07, + "loss": 0.77527189, + "num_input_tokens_seen": 258351785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 11973, + "time_per_iteration": 3.967134475708008 + }, + { + "auxiliary_loss_clip": 0.0102351, + "auxiliary_loss_mlp": 0.00997992, + "balance_loss_clip": 0.9969967, + "balance_loss_mlp": 1.00313878, + "epoch": 0.7199158274462648, + "flos": 70710839602560.0, + "grad_norm": 0.8568599946371717, + "language_loss": 0.57251143, + "learning_rate": 7.6786563266296e-07, + "loss": 0.59272635, + "num_input_tokens_seen": 258404035, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20410156, + "step": 11974, + "time_per_iteration": 2.9041314125061035 + }, + { + "auxiliary_loss_clip": 0.0110113, + "auxiliary_loss_mlp": 0.01032835, + "balance_loss_clip": 1.02024031, + "balance_loss_mlp": 1.03228343, + "epoch": 0.7199759506989328, + "flos": 29347995352320.0, + "grad_norm": 2.0540036125086623, + "language_loss": 0.61555636, + "learning_rate": 7.675588795479062e-07, + "loss": 0.63689601, + "num_input_tokens_seen": 258424850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 11975, + "time_per_iteration": 2.5565595626831055 + }, + { + "auxiliary_loss_clip": 0.0110015, + "auxiliary_loss_mlp": 0.01031171, + "balance_loss_clip": 1.01964951, + "balance_loss_mlp": 1.03378308, + "epoch": 0.7200360739516007, + "flos": 24640465680000.0, + "grad_norm": 1.7485061825333017, + "language_loss": 0.67644596, + "learning_rate": 7.672521731671425e-07, + "loss": 0.69775921, + "num_input_tokens_seen": 258445485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 11976, + "time_per_iteration": 2.4791998863220215 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.0175333, + "balance_loss_mlp": 1.03462696, + "epoch": 0.7200961972042688, + "flos": 20812855478400.0, + "grad_norm": 1.9984197913928563, + "language_loss": 0.67032665, + "learning_rate": 7.669455135323004e-07, + "loss": 0.69162977, + "num_input_tokens_seen": 258464505, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 11977, + "time_per_iteration": 2.4562158584594727 + }, + { + "auxiliary_loss_clip": 0.01105574, + "auxiliary_loss_mlp": 0.01030978, + "balance_loss_clip": 1.01912272, + "balance_loss_mlp": 1.03690076, + "epoch": 0.7201563204569367, + "flos": 31245910174080.0, + "grad_norm": 1.7897602101317545, + "language_loss": 0.75156534, + "learning_rate": 7.666389006550074e-07, + "loss": 0.77293086, + "num_input_tokens_seen": 258487190, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 11978, + "time_per_iteration": 2.6318418979644775 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01031575, + "balance_loss_clip": 1.01953471, + "balance_loss_mlp": 1.03316569, + "epoch": 0.7202164437096047, + "flos": 26651391667200.0, + "grad_norm": 2.125023403243126, + "language_loss": 0.78794968, + "learning_rate": 7.663323345468908e-07, + "loss": 0.80925471, + "num_input_tokens_seen": 258503790, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 11979, + "time_per_iteration": 2.4805469512939453 + }, + { + "auxiliary_loss_clip": 0.01103342, + "auxiliary_loss_mlp": 0.01026884, + "balance_loss_clip": 1.0148797, + "balance_loss_mlp": 1.03659976, + "epoch": 0.7202765669622727, + "flos": 25959608657280.0, + "grad_norm": 1.7429736369489133, + "language_loss": 0.65073323, + "learning_rate": 7.660258152195767e-07, + "loss": 0.67203552, + "num_input_tokens_seen": 258527335, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 11980, + "time_per_iteration": 2.530036211013794 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01035093, + "balance_loss_clip": 1.02152729, + "balance_loss_mlp": 1.03610325, + "epoch": 0.7203366902149406, + "flos": 28512354372480.0, + "grad_norm": 1.8302790091648973, + "language_loss": 0.67421222, + "learning_rate": 7.657193426846871e-07, + "loss": 0.69560248, + "num_input_tokens_seen": 258546690, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6796875, + "step": 11981, + "time_per_iteration": 2.5009641647338867 + }, + { + "auxiliary_loss_clip": 0.01103608, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02081633, + "balance_loss_mlp": 1.03605318, + "epoch": 0.7203968134676086, + "flos": 21106030285440.0, + "grad_norm": 1.9266732225953629, + "language_loss": 0.73759854, + "learning_rate": 7.65412916953843e-07, + "loss": 0.75896388, + "num_input_tokens_seen": 258566340, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 11982, + "time_per_iteration": 2.4776506423950195 + }, + { + "auxiliary_loss_clip": 0.01101459, + "auxiliary_loss_mlp": 0.01037116, + "balance_loss_clip": 1.02592814, + "balance_loss_mlp": 1.03360009, + "epoch": 0.7204569367202766, + "flos": 18332146488960.0, + "grad_norm": 1.8065417430122819, + "language_loss": 0.66113031, + "learning_rate": 7.65106538038665e-07, + "loss": 0.68251604, + "num_input_tokens_seen": 258584455, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6796875, + "step": 11983, + "time_per_iteration": 2.441363573074341 + }, + { + "auxiliary_loss_clip": 0.01103087, + "auxiliary_loss_mlp": 0.01031533, + "balance_loss_clip": 1.0197134, + "balance_loss_mlp": 1.03643811, + "epoch": 0.7205170599729446, + "flos": 23255103980160.0, + "grad_norm": 1.5519388922028943, + "language_loss": 0.66470373, + "learning_rate": 7.648002059507715e-07, + "loss": 0.68604994, + "num_input_tokens_seen": 258604725, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 11984, + "time_per_iteration": 2.4713308811187744 + }, + { + "auxiliary_loss_clip": 0.01107357, + "auxiliary_loss_mlp": 0.01035242, + "balance_loss_clip": 1.02291024, + "balance_loss_mlp": 1.03795314, + "epoch": 0.7205771832256125, + "flos": 20120892900480.0, + "grad_norm": 1.7856287402136095, + "language_loss": 0.73836136, + "learning_rate": 7.644939207017771e-07, + "loss": 0.75978738, + "num_input_tokens_seen": 258622885, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 11985, + "time_per_iteration": 2.4582014083862305 + }, + { + "auxiliary_loss_clip": 0.01101196, + "auxiliary_loss_mlp": 0.01028264, + "balance_loss_clip": 1.01717734, + "balance_loss_mlp": 1.03589368, + "epoch": 0.7206373064782805, + "flos": 27703250565120.0, + "grad_norm": 1.7243225094685473, + "language_loss": 0.62891448, + "learning_rate": 7.641876823032977e-07, + "loss": 0.65020913, + "num_input_tokens_seen": 258644305, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 11986, + "time_per_iteration": 2.525557279586792 + }, + { + "auxiliary_loss_clip": 0.01104045, + "auxiliary_loss_mlp": 0.01034059, + "balance_loss_clip": 1.020702, + "balance_loss_mlp": 1.03693676, + "epoch": 0.7206974297309484, + "flos": 17968156018560.0, + "grad_norm": 1.5922220046222206, + "language_loss": 0.72103626, + "learning_rate": 7.638814907669455e-07, + "loss": 0.74241722, + "num_input_tokens_seen": 258661775, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.671875, + "step": 11987, + "time_per_iteration": 2.4586973190307617 + }, + { + "auxiliary_loss_clip": 0.01104181, + "auxiliary_loss_mlp": 0.01030178, + "balance_loss_clip": 1.01796532, + "balance_loss_mlp": 1.03563333, + "epoch": 0.7207575529836164, + "flos": 16983162288000.0, + "grad_norm": 1.7226788638874178, + "language_loss": 0.78616083, + "learning_rate": 7.635753461043301e-07, + "loss": 0.80750442, + "num_input_tokens_seen": 258679830, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 11988, + "time_per_iteration": 2.425905227661133 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01888466, + "balance_loss_mlp": 1.03406453, + "epoch": 0.7208176762362843, + "flos": 18727594295040.0, + "grad_norm": 3.553932459688601, + "language_loss": 0.78784275, + "learning_rate": 7.632692483270618e-07, + "loss": 0.80915058, + "num_input_tokens_seen": 258697415, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 11989, + "time_per_iteration": 2.58890700340271 + }, + { + "auxiliary_loss_clip": 0.01100086, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.01997149, + "balance_loss_mlp": 1.03511739, + "epoch": 0.7208777994889524, + "flos": 18734489706240.0, + "grad_norm": 6.030003130937093, + "language_loss": 0.82572663, + "learning_rate": 7.629631974467481e-07, + "loss": 0.84704268, + "num_input_tokens_seen": 258716755, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 11990, + "time_per_iteration": 2.422929048538208 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01035975, + "balance_loss_clip": 1.02463794, + "balance_loss_mlp": 1.03581941, + "epoch": 0.7209379227416203, + "flos": 14793437376000.0, + "grad_norm": 2.2646719383287746, + "language_loss": 0.76148689, + "learning_rate": 7.626571934749931e-07, + "loss": 0.78286314, + "num_input_tokens_seen": 258733270, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 11991, + "time_per_iteration": 2.439966917037964 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01027379, + "balance_loss_clip": 1.0163815, + "balance_loss_mlp": 1.03555298, + "epoch": 0.7209980459942883, + "flos": 29636860527360.0, + "grad_norm": 2.0383069832544263, + "language_loss": 0.72644949, + "learning_rate": 7.623512364234022e-07, + "loss": 0.74772066, + "num_input_tokens_seen": 258755270, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 11992, + "time_per_iteration": 2.508730173110962 + }, + { + "auxiliary_loss_clip": 0.01103062, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01668572, + "balance_loss_mlp": 1.0353663, + "epoch": 0.7210581692469563, + "flos": 23477175815040.0, + "grad_norm": 1.8344706583489365, + "language_loss": 0.66479945, + "learning_rate": 7.620453263035755e-07, + "loss": 0.68611324, + "num_input_tokens_seen": 258775340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 11993, + "time_per_iteration": 2.496220350265503 + }, + { + "auxiliary_loss_clip": 0.01101133, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01959491, + "balance_loss_mlp": 1.03491402, + "epoch": 0.7211182924996242, + "flos": 26099839353600.0, + "grad_norm": 2.3726873705189786, + "language_loss": 0.65635949, + "learning_rate": 7.61739463127115e-07, + "loss": 0.67768013, + "num_input_tokens_seen": 258794580, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 11994, + "time_per_iteration": 2.481267213821411 + }, + { + "auxiliary_loss_clip": 0.01102846, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01626778, + "balance_loss_mlp": 1.03604794, + "epoch": 0.7211784157522922, + "flos": 17712076982400.0, + "grad_norm": 1.7186394121352693, + "language_loss": 0.66596985, + "learning_rate": 7.614336469056172e-07, + "loss": 0.68728906, + "num_input_tokens_seen": 258812330, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 11995, + "time_per_iteration": 2.4427177906036377 + }, + { + "auxiliary_loss_clip": 0.01099622, + "auxiliary_loss_mlp": 0.01029445, + "balance_loss_clip": 1.01687467, + "balance_loss_mlp": 1.0355916, + "epoch": 0.7212385390049602, + "flos": 24423637230720.0, + "grad_norm": 1.85436447909986, + "language_loss": 0.79713655, + "learning_rate": 7.6112787765068e-07, + "loss": 0.81842726, + "num_input_tokens_seen": 258831770, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.640625, + "step": 11996, + "time_per_iteration": 2.459115505218506 + }, + { + "auxiliary_loss_clip": 0.01103225, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.0208993, + "balance_loss_mlp": 1.03676772, + "epoch": 0.7212986622576282, + "flos": 28147250580480.0, + "grad_norm": 5.051284745258933, + "language_loss": 0.81384039, + "learning_rate": 7.60822155373899e-07, + "loss": 0.83519638, + "num_input_tokens_seen": 258849090, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 11997, + "time_per_iteration": 2.5205626487731934 + }, + { + "auxiliary_loss_clip": 0.01103756, + "auxiliary_loss_mlp": 0.01034784, + "balance_loss_clip": 1.02231431, + "balance_loss_mlp": 1.03483105, + "epoch": 0.7213587855102961, + "flos": 21835770992640.0, + "grad_norm": 1.8313827039335897, + "language_loss": 0.67091608, + "learning_rate": 7.605164800868646e-07, + "loss": 0.69230151, + "num_input_tokens_seen": 258868230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 11998, + "time_per_iteration": 2.431267023086548 + }, + { + "auxiliary_loss_clip": 0.01102391, + "auxiliary_loss_mlp": 0.01031921, + "balance_loss_clip": 1.02113891, + "balance_loss_mlp": 1.03637111, + "epoch": 0.7214189087629641, + "flos": 14611549881600.0, + "grad_norm": 1.8599790081910679, + "language_loss": 0.72658986, + "learning_rate": 7.602108518011696e-07, + "loss": 0.74793291, + "num_input_tokens_seen": 258885525, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66015625, + "step": 11999, + "time_per_iteration": 2.434900999069214 + }, + { + "auxiliary_loss_clip": 0.01103894, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.01531434, + "balance_loss_mlp": 1.03644443, + "epoch": 0.721479032015632, + "flos": 19390864884480.0, + "grad_norm": 2.3549521640831843, + "language_loss": 0.83203346, + "learning_rate": 7.599052705284039e-07, + "loss": 0.85334623, + "num_input_tokens_seen": 258903245, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12000, + "time_per_iteration": 2.4203250408172607 + }, + { + "auxiliary_loss_clip": 0.011045, + "auxiliary_loss_mlp": 0.01034844, + "balance_loss_clip": 1.02320933, + "balance_loss_mlp": 1.03663993, + "epoch": 0.7215391552683, + "flos": 18512884748160.0, + "grad_norm": 1.6620327129342116, + "language_loss": 0.77455056, + "learning_rate": 7.59599736280154e-07, + "loss": 0.79594404, + "num_input_tokens_seen": 258921245, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12001, + "time_per_iteration": 2.446817636489868 + }, + { + "auxiliary_loss_clip": 0.01103076, + "auxiliary_loss_mlp": 0.01034762, + "balance_loss_clip": 1.02331209, + "balance_loss_mlp": 1.0377841, + "epoch": 0.721599278520968, + "flos": 23258731253760.0, + "grad_norm": 1.7518200734535594, + "language_loss": 0.81436306, + "learning_rate": 7.592942490680066e-07, + "loss": 0.83574152, + "num_input_tokens_seen": 258939425, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12002, + "time_per_iteration": 2.4679903984069824 + }, + { + "auxiliary_loss_clip": 0.01104088, + "auxiliary_loss_mlp": 0.01027156, + "balance_loss_clip": 1.01510406, + "balance_loss_mlp": 1.03641772, + "epoch": 0.721659401773636, + "flos": 39199045979520.0, + "grad_norm": 2.283155803599373, + "language_loss": 0.62498772, + "learning_rate": 7.589888089035462e-07, + "loss": 0.6463002, + "num_input_tokens_seen": 258960710, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12003, + "time_per_iteration": 2.58776593208313 + }, + { + "auxiliary_loss_clip": 0.01102937, + "auxiliary_loss_mlp": 0.01032279, + "balance_loss_clip": 1.02022064, + "balance_loss_mlp": 1.03539622, + "epoch": 0.7217195250263039, + "flos": 14939917038720.0, + "grad_norm": 2.560985107334089, + "language_loss": 0.68500596, + "learning_rate": 7.586834157983544e-07, + "loss": 0.70635808, + "num_input_tokens_seen": 258978475, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12004, + "time_per_iteration": 2.3969027996063232 + }, + { + "auxiliary_loss_clip": 0.01025027, + "auxiliary_loss_mlp": 0.0099804, + "balance_loss_clip": 0.99700272, + "balance_loss_mlp": 1.00448203, + "epoch": 0.7217796482789719, + "flos": 70869206666880.0, + "grad_norm": 0.8643975392958543, + "language_loss": 0.54278243, + "learning_rate": 7.583780697640112e-07, + "loss": 0.56301308, + "num_input_tokens_seen": 259037520, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20507812, + "step": 12005, + "time_per_iteration": 2.9869492053985596 + }, + { + "auxiliary_loss_clip": 0.01102163, + "auxiliary_loss_mlp": 0.01032861, + "balance_loss_clip": 1.02052271, + "balance_loss_mlp": 1.03582788, + "epoch": 0.7218397715316398, + "flos": 37451525402880.0, + "grad_norm": 1.4997790369746062, + "language_loss": 0.62904799, + "learning_rate": 7.580727708120962e-07, + "loss": 0.65039825, + "num_input_tokens_seen": 259061325, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 12006, + "time_per_iteration": 2.6116576194763184 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01031585, + "balance_loss_clip": 1.02048635, + "balance_loss_mlp": 1.0356214, + "epoch": 0.7218998947843078, + "flos": 22710662559360.0, + "grad_norm": 1.702113645244825, + "language_loss": 0.92155731, + "learning_rate": 7.577675189541865e-07, + "loss": 0.94289511, + "num_input_tokens_seen": 259078135, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12007, + "time_per_iteration": 2.4609286785125732 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01030059, + "balance_loss_clip": 1.0176847, + "balance_loss_mlp": 1.03450811, + "epoch": 0.7219600180369758, + "flos": 12167182477440.0, + "grad_norm": 2.0030110165156088, + "language_loss": 0.64172041, + "learning_rate": 7.574623142018568e-07, + "loss": 0.66304755, + "num_input_tokens_seen": 259095910, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12008, + "time_per_iteration": 2.4176084995269775 + }, + { + "auxiliary_loss_clip": 0.0110518, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.01998281, + "balance_loss_mlp": 1.03712559, + "epoch": 0.7220201412896438, + "flos": 22596573985920.0, + "grad_norm": 1.9142767312180562, + "language_loss": 0.78281379, + "learning_rate": 7.57157156566681e-07, + "loss": 0.80419028, + "num_input_tokens_seen": 259114225, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12009, + "time_per_iteration": 2.486860752105713 + }, + { + "auxiliary_loss_clip": 0.01105579, + "auxiliary_loss_mlp": 0.01034559, + "balance_loss_clip": 1.0216608, + "balance_loss_mlp": 1.03696656, + "epoch": 0.7220802645423118, + "flos": 26718651884160.0, + "grad_norm": 1.8228551130543398, + "language_loss": 0.63638747, + "learning_rate": 7.568520460602297e-07, + "loss": 0.65778881, + "num_input_tokens_seen": 259134660, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 12010, + "time_per_iteration": 2.4727206230163574 + }, + { + "auxiliary_loss_clip": 0.01102553, + "auxiliary_loss_mlp": 0.0102775, + "balance_loss_clip": 1.0161804, + "balance_loss_mlp": 1.03521693, + "epoch": 0.7221403877949797, + "flos": 24420548661120.0, + "grad_norm": 2.6555622208181195, + "language_loss": 0.77546549, + "learning_rate": 7.565469826940742e-07, + "loss": 0.79676855, + "num_input_tokens_seen": 259153300, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 12011, + "time_per_iteration": 3.9832870960235596 + }, + { + "auxiliary_loss_clip": 0.01101603, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.02180326, + "balance_loss_mlp": 1.03652728, + "epoch": 0.7222005110476477, + "flos": 23514379326720.0, + "grad_norm": 1.6788129204959028, + "language_loss": 0.79040414, + "learning_rate": 7.56241966479781e-07, + "loss": 0.81174862, + "num_input_tokens_seen": 259172115, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12012, + "time_per_iteration": 2.5008320808410645 + }, + { + "auxiliary_loss_clip": 0.01105391, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01893425, + "balance_loss_mlp": 1.03754234, + "epoch": 0.7222606343003156, + "flos": 23112538899840.0, + "grad_norm": 1.7808047508810358, + "language_loss": 0.75740772, + "learning_rate": 7.559369974289171e-07, + "loss": 0.77876568, + "num_input_tokens_seen": 259191345, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 12013, + "time_per_iteration": 3.9566152095794678 + }, + { + "auxiliary_loss_clip": 0.01102634, + "auxiliary_loss_mlp": 0.01025299, + "balance_loss_clip": 1.01445651, + "balance_loss_mlp": 1.03621209, + "epoch": 0.7223207575529836, + "flos": 24351169541760.0, + "grad_norm": 1.471281729007001, + "language_loss": 0.75965142, + "learning_rate": 7.556320755530484e-07, + "loss": 0.78093076, + "num_input_tokens_seen": 259211700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 12014, + "time_per_iteration": 3.9748001098632812 + }, + { + "auxiliary_loss_clip": 0.01104429, + "auxiliary_loss_mlp": 0.01032084, + "balance_loss_clip": 1.02027655, + "balance_loss_mlp": 1.03614628, + "epoch": 0.7223808808056515, + "flos": 28330179569280.0, + "grad_norm": 1.614960921439624, + "language_loss": 0.86782753, + "learning_rate": 7.553272008637346e-07, + "loss": 0.8891927, + "num_input_tokens_seen": 259233825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 12015, + "time_per_iteration": 3.9988059997558594 + }, + { + "auxiliary_loss_clip": 0.01100793, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02386725, + "balance_loss_mlp": 1.03534532, + "epoch": 0.7224410040583196, + "flos": 21069437304960.0, + "grad_norm": 1.879880951075302, + "language_loss": 0.77969182, + "learning_rate": 7.55022373372538e-07, + "loss": 0.80105108, + "num_input_tokens_seen": 259253055, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12016, + "time_per_iteration": 2.45281982421875 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01033938, + "balance_loss_clip": 1.02245855, + "balance_loss_mlp": 1.03527737, + "epoch": 0.7225011273109875, + "flos": 26795429205120.0, + "grad_norm": 1.444882690983208, + "language_loss": 0.77545393, + "learning_rate": 7.547175930910186e-07, + "loss": 0.79679, + "num_input_tokens_seen": 259273420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12017, + "time_per_iteration": 2.4577410221099854 + }, + { + "auxiliary_loss_clip": 0.01098758, + "auxiliary_loss_mlp": 0.01025484, + "balance_loss_clip": 1.01503491, + "balance_loss_mlp": 1.03520453, + "epoch": 0.7225612505636555, + "flos": 23583578878080.0, + "grad_norm": 2.637627355867151, + "language_loss": 0.73314553, + "learning_rate": 7.54412860030732e-07, + "loss": 0.75438797, + "num_input_tokens_seen": 259291000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12018, + "time_per_iteration": 2.4559662342071533 + }, + { + "auxiliary_loss_clip": 0.01099343, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.02060056, + "balance_loss_mlp": 1.03665912, + "epoch": 0.7226213738163234, + "flos": 20777627214720.0, + "grad_norm": 2.5981107828035026, + "language_loss": 0.77910566, + "learning_rate": 7.541081742032347e-07, + "loss": 0.80041099, + "num_input_tokens_seen": 259312390, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 12019, + "time_per_iteration": 2.4371070861816406 + }, + { + "auxiliary_loss_clip": 0.01100393, + "auxiliary_loss_mlp": 0.01027432, + "balance_loss_clip": 1.01560664, + "balance_loss_mlp": 1.0350244, + "epoch": 0.7226814970689914, + "flos": 32635832901120.0, + "grad_norm": 1.6489444745204735, + "language_loss": 0.73905075, + "learning_rate": 7.53803535620081e-07, + "loss": 0.76032901, + "num_input_tokens_seen": 259332645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12020, + "time_per_iteration": 2.5431694984436035 + }, + { + "auxiliary_loss_clip": 0.01103343, + "auxiliary_loss_mlp": 0.01032271, + "balance_loss_clip": 1.02115512, + "balance_loss_mlp": 1.03456461, + "epoch": 0.7227416203216595, + "flos": 22454368041600.0, + "grad_norm": 1.6675263064788628, + "language_loss": 0.77169615, + "learning_rate": 7.534989442928219e-07, + "loss": 0.79305232, + "num_input_tokens_seen": 259353810, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6875, + "step": 12021, + "time_per_iteration": 2.483078718185425 + }, + { + "auxiliary_loss_clip": 0.0110063, + "auxiliary_loss_mlp": 0.01031983, + "balance_loss_clip": 1.0206815, + "balance_loss_mlp": 1.03491306, + "epoch": 0.7228017435743274, + "flos": 21652303299840.0, + "grad_norm": 2.1826099193920374, + "language_loss": 0.68331528, + "learning_rate": 7.531944002330073e-07, + "loss": 0.70464146, + "num_input_tokens_seen": 259372460, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12022, + "time_per_iteration": 2.454972982406616 + }, + { + "auxiliary_loss_clip": 0.01101398, + "auxiliary_loss_mlp": 0.01032038, + "balance_loss_clip": 1.01967645, + "balance_loss_mlp": 1.03452194, + "epoch": 0.7228618668269954, + "flos": 29533474206720.0, + "grad_norm": 1.7453912487460392, + "language_loss": 0.69111204, + "learning_rate": 7.528899034521858e-07, + "loss": 0.71244639, + "num_input_tokens_seen": 259393275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 12023, + "time_per_iteration": 2.4790570735931396 + }, + { + "auxiliary_loss_clip": 0.01098672, + "auxiliary_loss_mlp": 0.01028619, + "balance_loss_clip": 1.0168705, + "balance_loss_mlp": 1.03245616, + "epoch": 0.7229219900796633, + "flos": 27453815544960.0, + "grad_norm": 1.6879551293116275, + "language_loss": 0.71159554, + "learning_rate": 7.525854539619052e-07, + "loss": 0.73286849, + "num_input_tokens_seen": 259416205, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12024, + "time_per_iteration": 2.5031228065490723 + }, + { + "auxiliary_loss_clip": 0.0110197, + "auxiliary_loss_mlp": 0.01035131, + "balance_loss_clip": 1.02403879, + "balance_loss_mlp": 1.03651297, + "epoch": 0.7229821133323313, + "flos": 16289368116480.0, + "grad_norm": 1.7827113324832673, + "language_loss": 0.75502241, + "learning_rate": 7.522810517737089e-07, + "loss": 0.77639341, + "num_input_tokens_seen": 259433115, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 12025, + "time_per_iteration": 2.540117025375366 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.01867485, + "balance_loss_mlp": 1.03641152, + "epoch": 0.7230422365849992, + "flos": 20412343854720.0, + "grad_norm": 2.1646639083011, + "language_loss": 0.7686342, + "learning_rate": 7.519766968991395e-07, + "loss": 0.78994411, + "num_input_tokens_seen": 259450475, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12026, + "time_per_iteration": 2.619121551513672 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.0103796, + "balance_loss_clip": 1.02681398, + "balance_loss_mlp": 1.0340333, + "epoch": 0.7231023598376672, + "flos": 25593499284480.0, + "grad_norm": 1.96713718815872, + "language_loss": 0.67575908, + "learning_rate": 7.516723893497388e-07, + "loss": 0.69715375, + "num_input_tokens_seen": 259469355, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 12027, + "time_per_iteration": 2.5705184936523438 + }, + { + "auxiliary_loss_clip": 0.01105426, + "auxiliary_loss_mlp": 0.01030564, + "balance_loss_clip": 1.01864338, + "balance_loss_mlp": 1.03727841, + "epoch": 0.7231624830903352, + "flos": 25149607009920.0, + "grad_norm": 2.266596078102469, + "language_loss": 0.78860784, + "learning_rate": 7.513681291370469e-07, + "loss": 0.8099677, + "num_input_tokens_seen": 259486565, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12028, + "time_per_iteration": 2.521543502807617 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01025931, + "balance_loss_clip": 1.0140934, + "balance_loss_mlp": 1.03353393, + "epoch": 0.7232226063430032, + "flos": 21725740656000.0, + "grad_norm": 1.7215623884299298, + "language_loss": 0.81997663, + "learning_rate": 7.510639162726e-07, + "loss": 0.84123576, + "num_input_tokens_seen": 259505070, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12029, + "time_per_iteration": 2.518493890762329 + }, + { + "auxiliary_loss_clip": 0.01024828, + "auxiliary_loss_mlp": 0.01005824, + "balance_loss_clip": 1.00497139, + "balance_loss_mlp": 1.00435281, + "epoch": 0.7232827295956711, + "flos": 68436798491520.0, + "grad_norm": 0.8108297905714709, + "language_loss": 0.61798579, + "learning_rate": 7.507597507679347e-07, + "loss": 0.63829231, + "num_input_tokens_seen": 259569135, + "router_z_loss_clip": 0.00854492, + "router_z_loss_mlp": 0.20507812, + "step": 12030, + "time_per_iteration": 3.3008005619049072 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01027221, + "balance_loss_clip": 1.01557982, + "balance_loss_mlp": 1.0335412, + "epoch": 0.7233428528483391, + "flos": 20192642317440.0, + "grad_norm": 1.9017157177210717, + "language_loss": 0.78060263, + "learning_rate": 7.504556326345859e-07, + "loss": 0.80185157, + "num_input_tokens_seen": 259587035, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12031, + "time_per_iteration": 2.410015106201172 + }, + { + "auxiliary_loss_clip": 0.01103629, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01835394, + "balance_loss_mlp": 1.03571391, + "epoch": 0.723402976101007, + "flos": 23949472769280.0, + "grad_norm": 2.6817131275089614, + "language_loss": 0.81817293, + "learning_rate": 7.501515618840834e-07, + "loss": 0.83951116, + "num_input_tokens_seen": 259606140, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12032, + "time_per_iteration": 2.4944539070129395 + }, + { + "auxiliary_loss_clip": 0.01105541, + "auxiliary_loss_mlp": 0.01032741, + "balance_loss_clip": 1.0208137, + "balance_loss_mlp": 1.03620064, + "epoch": 0.723463099353675, + "flos": 20813394182400.0, + "grad_norm": 1.8666102600772807, + "language_loss": 0.74966335, + "learning_rate": 7.498475385279592e-07, + "loss": 0.77104622, + "num_input_tokens_seen": 259624275, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 12033, + "time_per_iteration": 2.4195306301116943 + }, + { + "auxiliary_loss_clip": 0.01098927, + "auxiliary_loss_mlp": 0.01027603, + "balance_loss_clip": 1.01661193, + "balance_loss_mlp": 1.03378749, + "epoch": 0.723523222606343, + "flos": 19098013299840.0, + "grad_norm": 1.53895157270623, + "language_loss": 0.74960071, + "learning_rate": 7.495435625777423e-07, + "loss": 0.77086604, + "num_input_tokens_seen": 259643465, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65234375, + "step": 12034, + "time_per_iteration": 2.4611551761627197 + }, + { + "auxiliary_loss_clip": 0.01099874, + "auxiliary_loss_mlp": 0.01026544, + "balance_loss_clip": 1.01580346, + "balance_loss_mlp": 1.03402519, + "epoch": 0.723583345859011, + "flos": 26506994993280.0, + "grad_norm": 1.7101429729597608, + "language_loss": 0.80541229, + "learning_rate": 7.492396340449578e-07, + "loss": 0.82667649, + "num_input_tokens_seen": 259662500, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.66015625, + "step": 12035, + "time_per_iteration": 2.4735255241394043 + }, + { + "auxiliary_loss_clip": 0.01102988, + "auxiliary_loss_mlp": 0.0103086, + "balance_loss_clip": 1.01914811, + "balance_loss_mlp": 1.03593981, + "epoch": 0.723643469111679, + "flos": 16033863697920.0, + "grad_norm": 1.6708890033016828, + "language_loss": 0.60718334, + "learning_rate": 7.489357529411326e-07, + "loss": 0.6285218, + "num_input_tokens_seen": 259680140, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12036, + "time_per_iteration": 2.4652183055877686 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01029749, + "balance_loss_clip": 1.01916969, + "balance_loss_mlp": 1.03397477, + "epoch": 0.7237035923643469, + "flos": 21945549934080.0, + "grad_norm": 1.8946488922724685, + "language_loss": 0.67484653, + "learning_rate": 7.486319192777883e-07, + "loss": 0.69612211, + "num_input_tokens_seen": 259700160, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 12037, + "time_per_iteration": 2.439401388168335 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01035034, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.03511119, + "epoch": 0.7237637156170149, + "flos": 23583112001280.0, + "grad_norm": 1.8161270520180812, + "language_loss": 0.72444439, + "learning_rate": 7.483281330664479e-07, + "loss": 0.74580336, + "num_input_tokens_seen": 259720525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12038, + "time_per_iteration": 2.498206853866577 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01029521, + "balance_loss_clip": 1.01683688, + "balance_loss_mlp": 1.0365181, + "epoch": 0.7238238388696828, + "flos": 20594698225920.0, + "grad_norm": 1.9105264736762722, + "language_loss": 0.72119117, + "learning_rate": 7.480243943186293e-07, + "loss": 0.74251521, + "num_input_tokens_seen": 259738680, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 12039, + "time_per_iteration": 2.427929401397705 + }, + { + "auxiliary_loss_clip": 0.01105168, + "auxiliary_loss_mlp": 0.01034722, + "balance_loss_clip": 1.02339756, + "balance_loss_mlp": 1.03659403, + "epoch": 0.7238839621223508, + "flos": 24207024263040.0, + "grad_norm": 2.0387115182112567, + "language_loss": 0.75838852, + "learning_rate": 7.477207030458513e-07, + "loss": 0.77978736, + "num_input_tokens_seen": 259758790, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 12040, + "time_per_iteration": 2.4932591915130615 + }, + { + "auxiliary_loss_clip": 0.01100807, + "auxiliary_loss_mlp": 0.01033028, + "balance_loss_clip": 1.02060628, + "balance_loss_mlp": 1.03361833, + "epoch": 0.7239440853750188, + "flos": 14209745368320.0, + "grad_norm": 1.913740912847533, + "language_loss": 0.76230586, + "learning_rate": 7.474170592596301e-07, + "loss": 0.7836442, + "num_input_tokens_seen": 259777370, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12041, + "time_per_iteration": 2.393092393875122 + }, + { + "auxiliary_loss_clip": 0.01101216, + "auxiliary_loss_mlp": 0.01028073, + "balance_loss_clip": 1.01658726, + "balance_loss_mlp": 1.03313875, + "epoch": 0.7240042086276868, + "flos": 21614812479360.0, + "grad_norm": 2.0516689414632348, + "language_loss": 0.63410985, + "learning_rate": 7.471134629714797e-07, + "loss": 0.65540266, + "num_input_tokens_seen": 259794665, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 12042, + "time_per_iteration": 2.4641988277435303 + }, + { + "auxiliary_loss_clip": 0.01105282, + "auxiliary_loss_mlp": 0.01033601, + "balance_loss_clip": 1.02106047, + "balance_loss_mlp": 1.03651488, + "epoch": 0.7240643318803547, + "flos": 23331450337920.0, + "grad_norm": 2.5235443155533486, + "language_loss": 0.83237529, + "learning_rate": 7.468099141929116e-07, + "loss": 0.85376412, + "num_input_tokens_seen": 259811110, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12043, + "time_per_iteration": 2.433598041534424 + }, + { + "auxiliary_loss_clip": 0.0110258, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.01696599, + "balance_loss_mlp": 1.03478646, + "epoch": 0.7241244551330227, + "flos": 24024849459840.0, + "grad_norm": 1.7620410881767092, + "language_loss": 0.64035821, + "learning_rate": 7.465064129354379e-07, + "loss": 0.66168237, + "num_input_tokens_seen": 259831080, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12044, + "time_per_iteration": 2.4627864360809326 + }, + { + "auxiliary_loss_clip": 0.0110401, + "auxiliary_loss_mlp": 0.01031559, + "balance_loss_clip": 1.01967335, + "balance_loss_mlp": 1.03717875, + "epoch": 0.7241845783856906, + "flos": 18730323728640.0, + "grad_norm": 1.4978020202204398, + "language_loss": 0.81621009, + "learning_rate": 7.462029592105658e-07, + "loss": 0.83756578, + "num_input_tokens_seen": 259850135, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12045, + "time_per_iteration": 2.4192216396331787 + }, + { + "auxiliary_loss_clip": 0.01098967, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0214541, + "balance_loss_mlp": 1.0345459, + "epoch": 0.7242447016383586, + "flos": 19498668577920.0, + "grad_norm": 1.5204011665835366, + "language_loss": 0.71989012, + "learning_rate": 7.458995530298034e-07, + "loss": 0.74121284, + "num_input_tokens_seen": 259868185, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 12046, + "time_per_iteration": 2.4425227642059326 + }, + { + "auxiliary_loss_clip": 0.01101516, + "auxiliary_loss_mlp": 0.01028832, + "balance_loss_clip": 1.01617825, + "balance_loss_mlp": 1.03457832, + "epoch": 0.7243048248910267, + "flos": 22163491704960.0, + "grad_norm": 1.7863177787262001, + "language_loss": 0.71125013, + "learning_rate": 7.455961944046553e-07, + "loss": 0.7325536, + "num_input_tokens_seen": 259887055, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12047, + "time_per_iteration": 2.4461426734924316 + }, + { + "auxiliary_loss_clip": 0.01108161, + "auxiliary_loss_mlp": 0.01034212, + "balance_loss_clip": 1.02217817, + "balance_loss_mlp": 1.03864622, + "epoch": 0.7243649481436946, + "flos": 27672762896640.0, + "grad_norm": 1.6964330206566038, + "language_loss": 0.69839394, + "learning_rate": 7.45292883346627e-07, + "loss": 0.71981764, + "num_input_tokens_seen": 259908295, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 12048, + "time_per_iteration": 2.500828981399536 + }, + { + "auxiliary_loss_clip": 0.01024144, + "auxiliary_loss_mlp": 0.01003374, + "balance_loss_clip": 1.00239074, + "balance_loss_mlp": 1.00373721, + "epoch": 0.7244250713963626, + "flos": 63244545759360.0, + "grad_norm": 0.8243567714089579, + "language_loss": 0.5377422, + "learning_rate": 7.449896198672168e-07, + "loss": 0.55801743, + "num_input_tokens_seen": 259968475, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20410156, + "step": 12049, + "time_per_iteration": 3.04441499710083 + }, + { + "auxiliary_loss_clip": 0.01107642, + "auxiliary_loss_mlp": 0.01030185, + "balance_loss_clip": 1.01611245, + "balance_loss_mlp": 1.0363996, + "epoch": 0.7244851946490305, + "flos": 17967114524160.0, + "grad_norm": 3.707915690527614, + "language_loss": 0.59357387, + "learning_rate": 7.446864039779258e-07, + "loss": 0.61495221, + "num_input_tokens_seen": 259984865, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12050, + "time_per_iteration": 2.4253971576690674 + }, + { + "auxiliary_loss_clip": 0.01024067, + "auxiliary_loss_mlp": 0.01001921, + "balance_loss_clip": 1.00082481, + "balance_loss_mlp": 1.00360942, + "epoch": 0.7245453179016985, + "flos": 70943649603840.0, + "grad_norm": 0.7294493469822053, + "language_loss": 0.53312981, + "learning_rate": 7.443832356902528e-07, + "loss": 0.55338979, + "num_input_tokens_seen": 260046735, + "router_z_loss_clip": 0.01098633, + "router_z_loss_mlp": 0.20507812, + "step": 12051, + "time_per_iteration": 3.049221992492676 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01030999, + "balance_loss_clip": 1.01975131, + "balance_loss_mlp": 1.03494263, + "epoch": 0.7246054411543664, + "flos": 24568464867840.0, + "grad_norm": 1.5472193855827432, + "language_loss": 0.72156775, + "learning_rate": 7.440801150156927e-07, + "loss": 0.74287981, + "num_input_tokens_seen": 260067950, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 12052, + "time_per_iteration": 2.46797776222229 + }, + { + "auxiliary_loss_clip": 0.01102918, + "auxiliary_loss_mlp": 0.01029028, + "balance_loss_clip": 1.01620138, + "balance_loss_mlp": 1.03667867, + "epoch": 0.7246655644070344, + "flos": 32338312548480.0, + "grad_norm": 2.0462685374624088, + "language_loss": 0.74402982, + "learning_rate": 7.437770419657415e-07, + "loss": 0.76534927, + "num_input_tokens_seen": 260087730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 12053, + "time_per_iteration": 3.902531862258911 + }, + { + "auxiliary_loss_clip": 0.01102096, + "auxiliary_loss_mlp": 0.01029811, + "balance_loss_clip": 1.01761603, + "balance_loss_mlp": 1.03548145, + "epoch": 0.7247256876597024, + "flos": 21872471713920.0, + "grad_norm": 2.1030984660426792, + "language_loss": 0.78042889, + "learning_rate": 7.434740165518898e-07, + "loss": 0.80174804, + "num_input_tokens_seen": 260107760, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 12054, + "time_per_iteration": 2.4352877140045166 + }, + { + "auxiliary_loss_clip": 0.01101331, + "auxiliary_loss_mlp": 0.01033632, + "balance_loss_clip": 1.02155614, + "balance_loss_mlp": 1.03527296, + "epoch": 0.7247858109123704, + "flos": 16213093585920.0, + "grad_norm": 2.826077293282499, + "language_loss": 0.68607175, + "learning_rate": 7.431710387856301e-07, + "loss": 0.70742142, + "num_input_tokens_seen": 260123660, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12055, + "time_per_iteration": 3.8767430782318115 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.0167743, + "balance_loss_mlp": 1.03378785, + "epoch": 0.7248459341650383, + "flos": 20850705434880.0, + "grad_norm": 1.7289479887024157, + "language_loss": 0.73999792, + "learning_rate": 7.428681086784496e-07, + "loss": 0.76127023, + "num_input_tokens_seen": 260142690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12056, + "time_per_iteration": 5.455943822860718 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.01225948, + "balance_loss_mlp": 1.03432655, + "epoch": 0.7249060574177063, + "flos": 25921794614400.0, + "grad_norm": 1.6012339855962578, + "language_loss": 0.70800096, + "learning_rate": 7.425652262418368e-07, + "loss": 0.72922009, + "num_input_tokens_seen": 260162590, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 12057, + "time_per_iteration": 2.5277090072631836 + }, + { + "auxiliary_loss_clip": 0.01104249, + "auxiliary_loss_mlp": 0.01033995, + "balance_loss_clip": 1.02146614, + "balance_loss_mlp": 1.03651786, + "epoch": 0.7249661806703742, + "flos": 17345536646400.0, + "grad_norm": 1.9271030457531089, + "language_loss": 0.6256361, + "learning_rate": 7.42262391487277e-07, + "loss": 0.64701855, + "num_input_tokens_seen": 260181065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12058, + "time_per_iteration": 2.4183826446533203 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.01710391, + "balance_loss_mlp": 1.03729975, + "epoch": 0.7250263039230422, + "flos": 19574153009280.0, + "grad_norm": 1.9655611905409667, + "language_loss": 0.74991, + "learning_rate": 7.419596044262535e-07, + "loss": 0.7712611, + "num_input_tokens_seen": 260200330, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 12059, + "time_per_iteration": 2.4240307807922363 + }, + { + "auxiliary_loss_clip": 0.01098542, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.01989508, + "balance_loss_mlp": 1.03418756, + "epoch": 0.7250864271757103, + "flos": 21976648133760.0, + "grad_norm": 1.73148336462866, + "language_loss": 0.79305416, + "learning_rate": 7.416568650702472e-07, + "loss": 0.81434691, + "num_input_tokens_seen": 260219975, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12060, + "time_per_iteration": 2.493459463119507 + }, + { + "auxiliary_loss_clip": 0.0110292, + "auxiliary_loss_mlp": 0.01028467, + "balance_loss_clip": 1.01606321, + "balance_loss_mlp": 1.03522062, + "epoch": 0.7251465504283782, + "flos": 25012608537600.0, + "grad_norm": 2.354515481339918, + "language_loss": 0.76317465, + "learning_rate": 7.413541734307393e-07, + "loss": 0.78448856, + "num_input_tokens_seen": 260242025, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12061, + "time_per_iteration": 2.4897234439849854 + }, + { + "auxiliary_loss_clip": 0.01100914, + "auxiliary_loss_mlp": 0.01028273, + "balance_loss_clip": 1.01707315, + "balance_loss_mlp": 1.03607178, + "epoch": 0.7252066736810462, + "flos": 16690131135360.0, + "grad_norm": 1.707041727604455, + "language_loss": 0.81039721, + "learning_rate": 7.410515295192068e-07, + "loss": 0.83168906, + "num_input_tokens_seen": 260260015, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12062, + "time_per_iteration": 2.4312822818756104 + }, + { + "auxiliary_loss_clip": 0.01106743, + "auxiliary_loss_mlp": 0.0103005, + "balance_loss_clip": 1.01713991, + "balance_loss_mlp": 1.03735328, + "epoch": 0.7252667969337141, + "flos": 25703026830720.0, + "grad_norm": 1.9940387151474506, + "language_loss": 0.68844217, + "learning_rate": 7.407489333471262e-07, + "loss": 0.70981008, + "num_input_tokens_seen": 260278635, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 12063, + "time_per_iteration": 2.5078516006469727 + }, + { + "auxiliary_loss_clip": 0.01099308, + "auxiliary_loss_mlp": 0.01029715, + "balance_loss_clip": 1.01788342, + "balance_loss_mlp": 1.03523588, + "epoch": 0.7253269201863821, + "flos": 18259930195200.0, + "grad_norm": 1.3500136523009691, + "language_loss": 0.69967401, + "learning_rate": 7.40446384925973e-07, + "loss": 0.72096425, + "num_input_tokens_seen": 260298510, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 12064, + "time_per_iteration": 2.4525294303894043 + }, + { + "auxiliary_loss_clip": 0.01103585, + "auxiliary_loss_mlp": 0.010308, + "balance_loss_clip": 1.0188607, + "balance_loss_mlp": 1.03744543, + "epoch": 0.72538704343905, + "flos": 20411805150720.0, + "grad_norm": 2.2336703023596716, + "language_loss": 0.90039599, + "learning_rate": 7.401438842672192e-07, + "loss": 0.92173982, + "num_input_tokens_seen": 260317405, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12065, + "time_per_iteration": 2.4503257274627686 + }, + { + "auxiliary_loss_clip": 0.01023945, + "auxiliary_loss_mlp": 0.00999171, + "balance_loss_clip": 0.99806815, + "balance_loss_mlp": 1.00351691, + "epoch": 0.725447166691718, + "flos": 70151209706880.0, + "grad_norm": 0.6543765045930707, + "language_loss": 0.56138921, + "learning_rate": 7.398414313823349e-07, + "loss": 0.58162034, + "num_input_tokens_seen": 260388085, + "router_z_loss_clip": 0.01104736, + "router_z_loss_mlp": 0.20507812, + "step": 12066, + "time_per_iteration": 3.203951120376587 + }, + { + "auxiliary_loss_clip": 0.01100204, + "auxiliary_loss_mlp": 0.01027787, + "balance_loss_clip": 1.01663494, + "balance_loss_mlp": 1.03434396, + "epoch": 0.725507289944386, + "flos": 27052334254080.0, + "grad_norm": 1.9431934533116317, + "language_loss": 0.76573753, + "learning_rate": 7.395390262827897e-07, + "loss": 0.78701746, + "num_input_tokens_seen": 260406165, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12067, + "time_per_iteration": 2.5001325607299805 + }, + { + "auxiliary_loss_clip": 0.01024325, + "auxiliary_loss_mlp": 0.01000445, + "balance_loss_clip": 0.99928838, + "balance_loss_mlp": 1.00393391, + "epoch": 0.725567413197054, + "flos": 62921924778240.0, + "grad_norm": 0.7268496108336204, + "language_loss": 0.57092577, + "learning_rate": 7.392366689800515e-07, + "loss": 0.59117347, + "num_input_tokens_seen": 260461365, + "router_z_loss_clip": 0.01153564, + "router_z_loss_mlp": 0.20410156, + "step": 12068, + "time_per_iteration": 2.961564779281616 + }, + { + "auxiliary_loss_clip": 0.01023519, + "auxiliary_loss_mlp": 0.00997832, + "balance_loss_clip": 0.99668139, + "balance_loss_mlp": 1.00306845, + "epoch": 0.7256275364497219, + "flos": 60295957188480.0, + "grad_norm": 0.6592626191043454, + "language_loss": 0.55426753, + "learning_rate": 7.389343594855848e-07, + "loss": 0.57448101, + "num_input_tokens_seen": 260523795, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 12069, + "time_per_iteration": 3.111906051635742 + }, + { + "auxiliary_loss_clip": 0.01098503, + "auxiliary_loss_mlp": 0.01026099, + "balance_loss_clip": 1.015275, + "balance_loss_mlp": 1.03479362, + "epoch": 0.7256876597023899, + "flos": 24498511130880.0, + "grad_norm": 1.8188254561357684, + "language_loss": 0.79876685, + "learning_rate": 7.38632097810854e-07, + "loss": 0.82001287, + "num_input_tokens_seen": 260544765, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 12070, + "time_per_iteration": 2.4814393520355225 + }, + { + "auxiliary_loss_clip": 0.0109711, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.01860952, + "balance_loss_mlp": 1.03523922, + "epoch": 0.7257477829550578, + "flos": 24352749740160.0, + "grad_norm": 2.135024516193614, + "language_loss": 0.72267014, + "learning_rate": 7.383298839673197e-07, + "loss": 0.74394208, + "num_input_tokens_seen": 260564340, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62109375, + "step": 12071, + "time_per_iteration": 2.5080463886260986 + }, + { + "auxiliary_loss_clip": 0.01099686, + "auxiliary_loss_mlp": 0.01034521, + "balance_loss_clip": 1.02348769, + "balance_loss_mlp": 1.03501189, + "epoch": 0.7258079062077258, + "flos": 17202217380480.0, + "grad_norm": 1.7654284044796786, + "language_loss": 0.6994983, + "learning_rate": 7.380277179664436e-07, + "loss": 0.72084033, + "num_input_tokens_seen": 260582565, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 12072, + "time_per_iteration": 2.430056571960449 + }, + { + "auxiliary_loss_clip": 0.01103966, + "auxiliary_loss_mlp": 0.01029251, + "balance_loss_clip": 1.01702607, + "balance_loss_mlp": 1.03472924, + "epoch": 0.7258680294603939, + "flos": 21580338401280.0, + "grad_norm": 1.7824187520349677, + "language_loss": 0.78317153, + "learning_rate": 7.377255998196821e-07, + "loss": 0.80450368, + "num_input_tokens_seen": 260601700, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12073, + "time_per_iteration": 2.479287624359131 + }, + { + "auxiliary_loss_clip": 0.01100141, + "auxiliary_loss_mlp": 0.01026691, + "balance_loss_clip": 1.01472855, + "balance_loss_mlp": 1.03557312, + "epoch": 0.7259281527130618, + "flos": 34855399036800.0, + "grad_norm": 1.6619094478292162, + "language_loss": 0.70389605, + "learning_rate": 7.374235295384923e-07, + "loss": 0.72516435, + "num_input_tokens_seen": 260623040, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 12074, + "time_per_iteration": 2.674909830093384 + }, + { + "auxiliary_loss_clip": 0.01103212, + "auxiliary_loss_mlp": 0.01027211, + "balance_loss_clip": 1.01514673, + "balance_loss_mlp": 1.03562987, + "epoch": 0.7259882759657298, + "flos": 25404644551680.0, + "grad_norm": 1.6427266790682502, + "language_loss": 0.7405411, + "learning_rate": 7.371215071343302e-07, + "loss": 0.76184535, + "num_input_tokens_seen": 260642735, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 12075, + "time_per_iteration": 2.4879863262176514 + }, + { + "auxiliary_loss_clip": 0.01102234, + "auxiliary_loss_mlp": 0.01030684, + "balance_loss_clip": 1.01842904, + "balance_loss_mlp": 1.03551388, + "epoch": 0.7260483992183977, + "flos": 62953630531200.0, + "grad_norm": 1.5060189576698704, + "language_loss": 0.635382, + "learning_rate": 7.368195326186458e-07, + "loss": 0.65671116, + "num_input_tokens_seen": 260669935, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 12076, + "time_per_iteration": 2.817375659942627 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01025872, + "balance_loss_clip": 1.0139389, + "balance_loss_mlp": 1.03412342, + "epoch": 0.7261085224710657, + "flos": 26467528924800.0, + "grad_norm": 2.5366204857105332, + "language_loss": 0.79249585, + "learning_rate": 7.365176060028912e-07, + "loss": 0.81376213, + "num_input_tokens_seen": 260689605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 12077, + "time_per_iteration": 2.513556480407715 + }, + { + "auxiliary_loss_clip": 0.01023637, + "auxiliary_loss_mlp": 0.01002866, + "balance_loss_clip": 1.00172806, + "balance_loss_mlp": 1.00323439, + "epoch": 0.7261686457237336, + "flos": 66772732187520.0, + "grad_norm": 0.8858624910390671, + "language_loss": 0.64977288, + "learning_rate": 7.362157272985163e-07, + "loss": 0.67003787, + "num_input_tokens_seen": 260748265, + "router_z_loss_clip": 0.01141357, + "router_z_loss_mlp": 0.20507812, + "step": 12078, + "time_per_iteration": 3.0679736137390137 + }, + { + "auxiliary_loss_clip": 0.01023707, + "auxiliary_loss_mlp": 0.01000415, + "balance_loss_clip": 0.99934798, + "balance_loss_mlp": 1.0032717, + "epoch": 0.7262287689764017, + "flos": 69999594399360.0, + "grad_norm": 0.7121161567572437, + "language_loss": 0.59267461, + "learning_rate": 7.359138965169671e-07, + "loss": 0.61291581, + "num_input_tokens_seen": 260816715, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20507812, + "step": 12079, + "time_per_iteration": 3.201369524002075 + }, + { + "auxiliary_loss_clip": 0.01099969, + "auxiliary_loss_mlp": 0.01030168, + "balance_loss_clip": 1.01752567, + "balance_loss_mlp": 1.03409278, + "epoch": 0.7262888922290696, + "flos": 23805435231360.0, + "grad_norm": 1.8820513707228834, + "language_loss": 0.65003538, + "learning_rate": 7.356121136696895e-07, + "loss": 0.67133677, + "num_input_tokens_seen": 260836765, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 12080, + "time_per_iteration": 2.4735429286956787 + }, + { + "auxiliary_loss_clip": 0.01101349, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.01415968, + "balance_loss_mlp": 1.0338223, + "epoch": 0.7263490154817376, + "flos": 19500320603520.0, + "grad_norm": 5.946673694238332, + "language_loss": 0.699211, + "learning_rate": 7.35310378768128e-07, + "loss": 0.72049093, + "num_input_tokens_seen": 260854610, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12081, + "time_per_iteration": 2.4283978939056396 + }, + { + "auxiliary_loss_clip": 0.01104797, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01758754, + "balance_loss_mlp": 1.03677154, + "epoch": 0.7264091387344055, + "flos": 16286243633280.0, + "grad_norm": 4.042667093911173, + "language_loss": 0.81073087, + "learning_rate": 7.350086918237237e-07, + "loss": 0.83206874, + "num_input_tokens_seen": 260871620, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12082, + "time_per_iteration": 2.4518401622772217 + }, + { + "auxiliary_loss_clip": 0.01107339, + "auxiliary_loss_mlp": 0.01033829, + "balance_loss_clip": 1.02072203, + "balance_loss_mlp": 1.03555846, + "epoch": 0.7264692619870735, + "flos": 24352031468160.0, + "grad_norm": 1.773588814829077, + "language_loss": 0.76834166, + "learning_rate": 7.347070528479158e-07, + "loss": 0.78975332, + "num_input_tokens_seen": 260890490, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.71875, + "step": 12083, + "time_per_iteration": 2.4874460697174072 + }, + { + "auxiliary_loss_clip": 0.01106226, + "auxiliary_loss_mlp": 0.01031622, + "balance_loss_clip": 1.01915908, + "balance_loss_mlp": 1.03815079, + "epoch": 0.7265293852397414, + "flos": 25119478477440.0, + "grad_norm": 1.6288025457613526, + "language_loss": 0.72911334, + "learning_rate": 7.344054618521433e-07, + "loss": 0.75049186, + "num_input_tokens_seen": 260909700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12084, + "time_per_iteration": 2.4936935901641846 + }, + { + "auxiliary_loss_clip": 0.01104738, + "auxiliary_loss_mlp": 0.01031261, + "balance_loss_clip": 1.01855981, + "balance_loss_mlp": 1.03661275, + "epoch": 0.7265895084924094, + "flos": 22638230784000.0, + "grad_norm": 1.683298254553577, + "language_loss": 0.77603686, + "learning_rate": 7.34103918847843e-07, + "loss": 0.79739684, + "num_input_tokens_seen": 260929090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 12085, + "time_per_iteration": 2.461860418319702 + }, + { + "auxiliary_loss_clip": 0.01101384, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02104557, + "balance_loss_mlp": 1.03391504, + "epoch": 0.7266496317450775, + "flos": 23368222886400.0, + "grad_norm": 1.8314526850775286, + "language_loss": 0.72461057, + "learning_rate": 7.338024238464493e-07, + "loss": 0.74595284, + "num_input_tokens_seen": 260946615, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12086, + "time_per_iteration": 2.4804890155792236 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.02163863, + "balance_loss_mlp": 1.03661227, + "epoch": 0.7267097549977454, + "flos": 28074603323520.0, + "grad_norm": 2.0882270871339492, + "language_loss": 0.69382304, + "learning_rate": 7.335009768593938e-07, + "loss": 0.71518683, + "num_input_tokens_seen": 260968515, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12087, + "time_per_iteration": 2.472632884979248 + }, + { + "auxiliary_loss_clip": 0.01105347, + "auxiliary_loss_mlp": 0.01034009, + "balance_loss_clip": 1.02104521, + "balance_loss_mlp": 1.03732419, + "epoch": 0.7267698782504134, + "flos": 22195523658240.0, + "grad_norm": 2.250412175179094, + "language_loss": 0.79011619, + "learning_rate": 7.331995778981088e-07, + "loss": 0.81150979, + "num_input_tokens_seen": 260986790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12088, + "time_per_iteration": 2.455793857574463 + }, + { + "auxiliary_loss_clip": 0.01103631, + "auxiliary_loss_mlp": 0.01035978, + "balance_loss_clip": 1.02433753, + "balance_loss_mlp": 1.03484094, + "epoch": 0.7268300015030813, + "flos": 18514859996160.0, + "grad_norm": 1.695956180050093, + "language_loss": 0.73965418, + "learning_rate": 7.328982269740221e-07, + "loss": 0.76105028, + "num_input_tokens_seen": 261004925, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 12089, + "time_per_iteration": 2.4252777099609375 + }, + { + "auxiliary_loss_clip": 0.01103186, + "auxiliary_loss_mlp": 0.01033658, + "balance_loss_clip": 1.0215764, + "balance_loss_mlp": 1.03553808, + "epoch": 0.7268901247557493, + "flos": 23986029836160.0, + "grad_norm": 1.809103044869338, + "language_loss": 0.70920813, + "learning_rate": 7.325969240985616e-07, + "loss": 0.73057657, + "num_input_tokens_seen": 261023895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12090, + "time_per_iteration": 2.500497817993164 + }, + { + "auxiliary_loss_clip": 0.01103253, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.01645172, + "balance_loss_mlp": 1.03472519, + "epoch": 0.7269502480084172, + "flos": 32088087429120.0, + "grad_norm": 1.7365025485289893, + "language_loss": 0.7741468, + "learning_rate": 7.322956692831528e-07, + "loss": 0.79547042, + "num_input_tokens_seen": 261045445, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 12091, + "time_per_iteration": 2.5417003631591797 + }, + { + "auxiliary_loss_clip": 0.01100865, + "auxiliary_loss_mlp": 0.01028025, + "balance_loss_clip": 1.01566255, + "balance_loss_mlp": 1.03411698, + "epoch": 0.7270103712610853, + "flos": 19062785036160.0, + "grad_norm": 3.1465600327537304, + "language_loss": 0.71302813, + "learning_rate": 7.319944625392205e-07, + "loss": 0.73431706, + "num_input_tokens_seen": 261064275, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 12092, + "time_per_iteration": 2.4790890216827393 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.01029757, + "balance_loss_clip": 1.01770473, + "balance_loss_mlp": 1.03515983, + "epoch": 0.7270704945137532, + "flos": 34532921710080.0, + "grad_norm": 1.8134968044947444, + "language_loss": 0.6129632, + "learning_rate": 7.31693303878184e-07, + "loss": 0.63427377, + "num_input_tokens_seen": 261083310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 12093, + "time_per_iteration": 2.531416654586792 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01923263, + "balance_loss_mlp": 1.03584278, + "epoch": 0.7271306177664212, + "flos": 21507583403520.0, + "grad_norm": 1.5414395566200807, + "language_loss": 0.75677824, + "learning_rate": 7.313921933114644e-07, + "loss": 0.77811199, + "num_input_tokens_seen": 261103460, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12094, + "time_per_iteration": 3.885373592376709 + }, + { + "auxiliary_loss_clip": 0.01099162, + "auxiliary_loss_mlp": 0.0103038, + "balance_loss_clip": 1.01925766, + "balance_loss_mlp": 1.03378463, + "epoch": 0.7271907410190891, + "flos": 22272444633600.0, + "grad_norm": 1.9126635522388606, + "language_loss": 0.84773397, + "learning_rate": 7.310911308504808e-07, + "loss": 0.8690294, + "num_input_tokens_seen": 261121375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12095, + "time_per_iteration": 2.429746150970459 + }, + { + "auxiliary_loss_clip": 0.01101056, + "auxiliary_loss_mlp": 0.01033976, + "balance_loss_clip": 1.02127481, + "balance_loss_mlp": 1.03374481, + "epoch": 0.7272508642717571, + "flos": 22893124671360.0, + "grad_norm": 1.7505444036152586, + "language_loss": 0.78038371, + "learning_rate": 7.307901165066479e-07, + "loss": 0.80173397, + "num_input_tokens_seen": 261141105, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12096, + "time_per_iteration": 3.8615665435791016 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.0200038, + "balance_loss_mlp": 1.03728688, + "epoch": 0.727310987524425, + "flos": 11655886331520.0, + "grad_norm": 2.3221692333246655, + "language_loss": 0.7232452, + "learning_rate": 7.30489150291381e-07, + "loss": 0.74459803, + "num_input_tokens_seen": 261159255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12097, + "time_per_iteration": 3.8505306243896484 + }, + { + "auxiliary_loss_clip": 0.01104342, + "auxiliary_loss_mlp": 0.01034863, + "balance_loss_clip": 1.02190495, + "balance_loss_mlp": 1.03669655, + "epoch": 0.727371110777093, + "flos": 24535319592960.0, + "grad_norm": 2.177278264782312, + "language_loss": 0.7672922, + "learning_rate": 7.301882322160935e-07, + "loss": 0.78868425, + "num_input_tokens_seen": 261177960, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6796875, + "step": 12098, + "time_per_iteration": 4.021664142608643 + }, + { + "auxiliary_loss_clip": 0.01102665, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.01946902, + "balance_loss_mlp": 1.03345513, + "epoch": 0.7274312340297611, + "flos": 74739835405440.0, + "grad_norm": 1.8124975199898956, + "language_loss": 0.6742186, + "learning_rate": 7.298873622921952e-07, + "loss": 0.69556803, + "num_input_tokens_seen": 261205660, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 12099, + "time_per_iteration": 2.8312809467315674 + }, + { + "auxiliary_loss_clip": 0.01106918, + "auxiliary_loss_mlp": 0.01032961, + "balance_loss_clip": 1.02005613, + "balance_loss_mlp": 1.0350759, + "epoch": 0.727491357282429, + "flos": 22342865247360.0, + "grad_norm": 4.666251767932542, + "language_loss": 0.72614902, + "learning_rate": 7.29586540531095e-07, + "loss": 0.74754786, + "num_input_tokens_seen": 261225185, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.71875, + "step": 12100, + "time_per_iteration": 2.48777437210083 + }, + { + "auxiliary_loss_clip": 0.01103782, + "auxiliary_loss_mlp": 0.01033607, + "balance_loss_clip": 1.02218103, + "balance_loss_mlp": 1.03623843, + "epoch": 0.727551480535097, + "flos": 23297550877440.0, + "grad_norm": 1.4296037667662786, + "language_loss": 0.74749982, + "learning_rate": 7.292857669442005e-07, + "loss": 0.76887369, + "num_input_tokens_seen": 261247965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12101, + "time_per_iteration": 2.460813045501709 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01030746, + "balance_loss_clip": 1.01962399, + "balance_loss_mlp": 1.03687561, + "epoch": 0.7276116037877649, + "flos": 21470559459840.0, + "grad_norm": 1.6471267556293203, + "language_loss": 0.82180774, + "learning_rate": 7.289850415429177e-07, + "loss": 0.84314322, + "num_input_tokens_seen": 261267585, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 12102, + "time_per_iteration": 2.486891031265259 + }, + { + "auxiliary_loss_clip": 0.01101993, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02160573, + "balance_loss_mlp": 1.03577983, + "epoch": 0.7276717270404329, + "flos": 21464059098240.0, + "grad_norm": 2.238789262926412, + "language_loss": 0.81434906, + "learning_rate": 7.286843643386495e-07, + "loss": 0.8356986, + "num_input_tokens_seen": 261285200, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12103, + "time_per_iteration": 2.414008855819702 + }, + { + "auxiliary_loss_clip": 0.01102157, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.01403213, + "balance_loss_mlp": 1.03556037, + "epoch": 0.7277318502931008, + "flos": 16837221329280.0, + "grad_norm": 2.300581534767291, + "language_loss": 0.66380107, + "learning_rate": 7.283837353427968e-07, + "loss": 0.68508548, + "num_input_tokens_seen": 261303645, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 12104, + "time_per_iteration": 2.4741268157958984 + }, + { + "auxiliary_loss_clip": 0.01099619, + "auxiliary_loss_mlp": 0.01028412, + "balance_loss_clip": 1.01674151, + "balance_loss_mlp": 1.03588009, + "epoch": 0.7277919735457689, + "flos": 33400550476800.0, + "grad_norm": 1.8448719986078481, + "language_loss": 0.65691745, + "learning_rate": 7.280831545667611e-07, + "loss": 0.67819774, + "num_input_tokens_seen": 261323265, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 12105, + "time_per_iteration": 2.5147173404693604 + }, + { + "auxiliary_loss_clip": 0.01103458, + "auxiliary_loss_mlp": 0.01032555, + "balance_loss_clip": 1.02052665, + "balance_loss_mlp": 1.03698063, + "epoch": 0.7278520967984368, + "flos": 19206499351680.0, + "grad_norm": 2.269554332821791, + "language_loss": 0.75712693, + "learning_rate": 7.27782622021939e-07, + "loss": 0.77848709, + "num_input_tokens_seen": 261339745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12106, + "time_per_iteration": 2.435525417327881 + }, + { + "auxiliary_loss_clip": 0.01105516, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.01580417, + "balance_loss_mlp": 1.03651524, + "epoch": 0.7279122200511048, + "flos": 34094667870720.0, + "grad_norm": 2.027947954090959, + "language_loss": 0.70116639, + "learning_rate": 7.274821377197273e-07, + "loss": 0.72250462, + "num_input_tokens_seen": 261359310, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12107, + "time_per_iteration": 2.5302398204803467 + }, + { + "auxiliary_loss_clip": 0.01101241, + "auxiliary_loss_mlp": 0.0103217, + "balance_loss_clip": 1.02056551, + "balance_loss_mlp": 1.03459477, + "epoch": 0.7279723433037727, + "flos": 54599049348480.0, + "grad_norm": 1.520569075146339, + "language_loss": 0.75155759, + "learning_rate": 7.271817016715205e-07, + "loss": 0.77289176, + "num_input_tokens_seen": 261384640, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12108, + "time_per_iteration": 2.7630767822265625 + }, + { + "auxiliary_loss_clip": 0.01102209, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01658893, + "balance_loss_mlp": 1.03495109, + "epoch": 0.7280324665564407, + "flos": 36137482156800.0, + "grad_norm": 1.5886355104574046, + "language_loss": 0.66785181, + "learning_rate": 7.268813138887124e-07, + "loss": 0.68916261, + "num_input_tokens_seen": 261405290, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 12109, + "time_per_iteration": 2.5576727390289307 + }, + { + "auxiliary_loss_clip": 0.01102344, + "auxiliary_loss_mlp": 0.01030828, + "balance_loss_clip": 1.01853728, + "balance_loss_mlp": 1.03609085, + "epoch": 0.7280925898091086, + "flos": 11618539165440.0, + "grad_norm": 1.9794357831275327, + "language_loss": 0.62950575, + "learning_rate": 7.265809743826912e-07, + "loss": 0.65083742, + "num_input_tokens_seen": 261419710, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 12110, + "time_per_iteration": 2.44002366065979 + }, + { + "auxiliary_loss_clip": 0.01102169, + "auxiliary_loss_mlp": 0.01026996, + "balance_loss_clip": 1.01408529, + "balance_loss_mlp": 1.03304601, + "epoch": 0.7281527130617766, + "flos": 34277094069120.0, + "grad_norm": 1.7658774771753212, + "language_loss": 0.58043802, + "learning_rate": 7.26280683164847e-07, + "loss": 0.60172975, + "num_input_tokens_seen": 261442385, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 12111, + "time_per_iteration": 2.6210787296295166 + }, + { + "auxiliary_loss_clip": 0.01106335, + "auxiliary_loss_mlp": 0.01030725, + "balance_loss_clip": 1.01827931, + "balance_loss_mlp": 1.03801906, + "epoch": 0.7282128363144446, + "flos": 13918043018880.0, + "grad_norm": 1.9352527589955661, + "language_loss": 0.73992717, + "learning_rate": 7.259804402465677e-07, + "loss": 0.76129776, + "num_input_tokens_seen": 261459805, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 12112, + "time_per_iteration": 2.4524636268615723 + }, + { + "auxiliary_loss_clip": 0.01099679, + "auxiliary_loss_mlp": 0.01029493, + "balance_loss_clip": 1.01777458, + "balance_loss_mlp": 1.03403258, + "epoch": 0.7282729595671126, + "flos": 20777627214720.0, + "grad_norm": 2.0053906619330006, + "language_loss": 0.67298758, + "learning_rate": 7.25680245639237e-07, + "loss": 0.69427931, + "num_input_tokens_seen": 261477175, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12113, + "time_per_iteration": 2.4597878456115723 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01030726, + "balance_loss_clip": 1.01829863, + "balance_loss_mlp": 1.03391302, + "epoch": 0.7283330828197806, + "flos": 16325422392960.0, + "grad_norm": 1.6626035833227917, + "language_loss": 0.73243928, + "learning_rate": 7.253800993542399e-07, + "loss": 0.75376785, + "num_input_tokens_seen": 261494990, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 12114, + "time_per_iteration": 2.4250495433807373 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.01029231, + "balance_loss_clip": 1.01685691, + "balance_loss_mlp": 1.03370285, + "epoch": 0.7283932060724485, + "flos": 27490193043840.0, + "grad_norm": 2.0029156408767714, + "language_loss": 0.68175685, + "learning_rate": 7.250800014029564e-07, + "loss": 0.70304716, + "num_input_tokens_seen": 261514445, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12115, + "time_per_iteration": 2.4954171180725098 + }, + { + "auxiliary_loss_clip": 0.01103561, + "auxiliary_loss_mlp": 0.01027892, + "balance_loss_clip": 1.01567912, + "balance_loss_mlp": 1.03449523, + "epoch": 0.7284533293251165, + "flos": 18367877543040.0, + "grad_norm": 1.5749182133229294, + "language_loss": 0.59722745, + "learning_rate": 7.247799517967674e-07, + "loss": 0.61854202, + "num_input_tokens_seen": 261533565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12116, + "time_per_iteration": 2.5029101371765137 + }, + { + "auxiliary_loss_clip": 0.01100013, + "auxiliary_loss_mlp": 0.01028036, + "balance_loss_clip": 1.01579905, + "balance_loss_mlp": 1.03508806, + "epoch": 0.7285134525777844, + "flos": 21725525174400.0, + "grad_norm": 1.7186518000931694, + "language_loss": 0.72523415, + "learning_rate": 7.2447995054705e-07, + "loss": 0.74651456, + "num_input_tokens_seen": 261553795, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 12117, + "time_per_iteration": 2.4426584243774414 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01024568, + "balance_loss_clip": 1.01234937, + "balance_loss_mlp": 1.03475642, + "epoch": 0.7285735758304525, + "flos": 20741357456640.0, + "grad_norm": 2.143264936763247, + "language_loss": 0.69296616, + "learning_rate": 7.241799976651807e-07, + "loss": 0.71423018, + "num_input_tokens_seen": 261572565, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12118, + "time_per_iteration": 2.5339369773864746 + }, + { + "auxiliary_loss_clip": 0.01097686, + "auxiliary_loss_mlp": 0.0103422, + "balance_loss_clip": 1.02279413, + "balance_loss_mlp": 1.03442514, + "epoch": 0.7286336990831204, + "flos": 17310954827520.0, + "grad_norm": 1.6909309126085614, + "language_loss": 0.84203392, + "learning_rate": 7.238800931625346e-07, + "loss": 0.86335295, + "num_input_tokens_seen": 261590910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 12119, + "time_per_iteration": 2.3954200744628906 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01027674, + "balance_loss_clip": 1.01645637, + "balance_loss_mlp": 1.03579891, + "epoch": 0.7286938223357884, + "flos": 19787390098560.0, + "grad_norm": 2.2822251390786312, + "language_loss": 0.82164419, + "learning_rate": 7.235802370504831e-07, + "loss": 0.84294862, + "num_input_tokens_seen": 261606005, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12120, + "time_per_iteration": 2.4175772666931152 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02358377, + "balance_loss_mlp": 1.03648496, + "epoch": 0.7287539455884563, + "flos": 15340859625600.0, + "grad_norm": 1.8056895427232635, + "language_loss": 0.78642154, + "learning_rate": 7.232804293403963e-07, + "loss": 0.80782175, + "num_input_tokens_seen": 261622305, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 12121, + "time_per_iteration": 2.406684160232544 + }, + { + "auxiliary_loss_clip": 0.01100839, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01706123, + "balance_loss_mlp": 1.03222573, + "epoch": 0.7288140688411243, + "flos": 25192484870400.0, + "grad_norm": 1.5367306608153926, + "language_loss": 0.6915673, + "learning_rate": 7.229806700436441e-07, + "loss": 0.71286988, + "num_input_tokens_seen": 261642465, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 12122, + "time_per_iteration": 2.533647060394287 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01029883, + "balance_loss_clip": 1.01871347, + "balance_loss_mlp": 1.03240955, + "epoch": 0.7288741920937922, + "flos": 23984162328960.0, + "grad_norm": 1.8487795313278665, + "language_loss": 0.8722074, + "learning_rate": 7.226809591715923e-07, + "loss": 0.89347732, + "num_input_tokens_seen": 261661420, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12123, + "time_per_iteration": 2.4654133319854736 + }, + { + "auxiliary_loss_clip": 0.01099535, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.01967263, + "balance_loss_mlp": 1.03390992, + "epoch": 0.7289343153464602, + "flos": 22744921155840.0, + "grad_norm": 2.1267005511199604, + "language_loss": 0.8275702, + "learning_rate": 7.223812967356065e-07, + "loss": 0.84887826, + "num_input_tokens_seen": 261680865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12124, + "time_per_iteration": 2.5298664569854736 + }, + { + "auxiliary_loss_clip": 0.01100083, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01730633, + "balance_loss_mlp": 1.0351851, + "epoch": 0.7289944385991282, + "flos": 24900028335360.0, + "grad_norm": 1.8446613007140906, + "language_loss": 0.67240703, + "learning_rate": 7.220816827470499e-07, + "loss": 0.69369495, + "num_input_tokens_seen": 261701455, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12125, + "time_per_iteration": 2.4683637619018555 + }, + { + "auxiliary_loss_clip": 0.01105338, + "auxiliary_loss_mlp": 0.01030214, + "balance_loss_clip": 1.01760745, + "balance_loss_mlp": 1.03575897, + "epoch": 0.7290545618517962, + "flos": 22967064817920.0, + "grad_norm": 1.8041889285235344, + "language_loss": 0.74976206, + "learning_rate": 7.217821172172855e-07, + "loss": 0.77111757, + "num_input_tokens_seen": 261721260, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12126, + "time_per_iteration": 2.4857234954833984 + }, + { + "auxiliary_loss_clip": 0.01023798, + "auxiliary_loss_mlp": 0.01004495, + "balance_loss_clip": 1.00342834, + "balance_loss_mlp": 1.0033108, + "epoch": 0.7291146851044642, + "flos": 61901523216000.0, + "grad_norm": 0.8154544542721714, + "language_loss": 0.58675981, + "learning_rate": 7.2148260015767e-07, + "loss": 0.60704273, + "num_input_tokens_seen": 261779370, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20507812, + "step": 12127, + "time_per_iteration": 2.9716975688934326 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01027444, + "balance_loss_clip": 1.0168165, + "balance_loss_mlp": 1.03571177, + "epoch": 0.7291748083571321, + "flos": 23330947547520.0, + "grad_norm": 1.9593385701209045, + "language_loss": 0.69048452, + "learning_rate": 7.21183131579562e-07, + "loss": 0.71176225, + "num_input_tokens_seen": 261798050, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 12128, + "time_per_iteration": 2.5162582397460938 + }, + { + "auxiliary_loss_clip": 0.0110308, + "auxiliary_loss_mlp": 0.01032469, + "balance_loss_clip": 1.02043474, + "balance_loss_mlp": 1.03561521, + "epoch": 0.7292349316098001, + "flos": 28330000001280.0, + "grad_norm": 2.0485847355558953, + "language_loss": 0.65249133, + "learning_rate": 7.20883711494319e-07, + "loss": 0.67384678, + "num_input_tokens_seen": 261817660, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12129, + "time_per_iteration": 2.487868547439575 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01426673, + "balance_loss_mlp": 1.03446507, + "epoch": 0.729295054862468, + "flos": 24132222190080.0, + "grad_norm": 2.5377483717802485, + "language_loss": 0.74676943, + "learning_rate": 7.205843399132927e-07, + "loss": 0.76801908, + "num_input_tokens_seen": 261837935, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 12130, + "time_per_iteration": 2.5030577182769775 + }, + { + "auxiliary_loss_clip": 0.01100647, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.0347085, + "epoch": 0.7293551781151361, + "flos": 22816239609600.0, + "grad_norm": 1.617355369468953, + "language_loss": 0.6962043, + "learning_rate": 7.202850168478374e-07, + "loss": 0.71750402, + "num_input_tokens_seen": 261857575, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12131, + "time_per_iteration": 2.4352428913116455 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01771474, + "balance_loss_mlp": 1.03647351, + "epoch": 0.729415301367804, + "flos": 22126683242880.0, + "grad_norm": 1.4743863900351697, + "language_loss": 0.77282, + "learning_rate": 7.199857423093025e-07, + "loss": 0.79412544, + "num_input_tokens_seen": 261877265, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 12132, + "time_per_iteration": 2.495375156402588 + }, + { + "auxiliary_loss_clip": 0.0110199, + "auxiliary_loss_mlp": 0.01032463, + "balance_loss_clip": 1.02124524, + "balance_loss_mlp": 1.03552151, + "epoch": 0.729475424620472, + "flos": 12349608675840.0, + "grad_norm": 2.217572112042413, + "language_loss": 0.79134017, + "learning_rate": 7.196865163090358e-07, + "loss": 0.81268471, + "num_input_tokens_seen": 261893695, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 12133, + "time_per_iteration": 2.403266668319702 + }, + { + "auxiliary_loss_clip": 0.01100314, + "auxiliary_loss_mlp": 0.01030932, + "balance_loss_clip": 1.01922512, + "balance_loss_mlp": 1.03376698, + "epoch": 0.7295355478731399, + "flos": 22195308176640.0, + "grad_norm": 1.8655920091949136, + "language_loss": 0.7224102, + "learning_rate": 7.193873388583846e-07, + "loss": 0.74372262, + "num_input_tokens_seen": 261911825, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12134, + "time_per_iteration": 2.510369300842285 + }, + { + "auxiliary_loss_clip": 0.01103467, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02342796, + "balance_loss_mlp": 1.03683078, + "epoch": 0.7295956711258079, + "flos": 23222030532480.0, + "grad_norm": 1.8815102601218348, + "language_loss": 0.71485353, + "learning_rate": 7.190882099686939e-07, + "loss": 0.73624468, + "num_input_tokens_seen": 261931190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12135, + "time_per_iteration": 2.4513211250305176 + }, + { + "auxiliary_loss_clip": 0.01102275, + "auxiliary_loss_mlp": 0.01032318, + "balance_loss_clip": 1.02063513, + "balance_loss_mlp": 1.03478527, + "epoch": 0.7296557943784758, + "flos": 31869104163840.0, + "grad_norm": 2.3479540644405645, + "language_loss": 0.62245309, + "learning_rate": 7.187891296513075e-07, + "loss": 0.64379901, + "num_input_tokens_seen": 261951240, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 12136, + "time_per_iteration": 3.9409608840942383 + }, + { + "auxiliary_loss_clip": 0.01099061, + "auxiliary_loss_mlp": 0.01034881, + "balance_loss_clip": 1.02353823, + "balance_loss_mlp": 1.03336811, + "epoch": 0.7297159176311439, + "flos": 26651714889600.0, + "grad_norm": 1.8075029483736118, + "language_loss": 0.74606574, + "learning_rate": 7.184900979175654e-07, + "loss": 0.76740515, + "num_input_tokens_seen": 261971605, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12137, + "time_per_iteration": 2.536616086959839 + }, + { + "auxiliary_loss_clip": 0.01104966, + "auxiliary_loss_mlp": 0.01033839, + "balance_loss_clip": 1.02242422, + "balance_loss_mlp": 1.03774345, + "epoch": 0.7297760408838118, + "flos": 24749562263040.0, + "grad_norm": 1.6283862626280647, + "language_loss": 0.74377739, + "learning_rate": 7.181911147788069e-07, + "loss": 0.76516545, + "num_input_tokens_seen": 261990830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12138, + "time_per_iteration": 3.9735019207000732 + }, + { + "auxiliary_loss_clip": 0.01097337, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01875067, + "balance_loss_mlp": 1.03234982, + "epoch": 0.7298361641364798, + "flos": 18073768982400.0, + "grad_norm": 2.062700649659985, + "language_loss": 0.71971607, + "learning_rate": 7.178921802463702e-07, + "loss": 0.74098563, + "num_input_tokens_seen": 262008190, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12139, + "time_per_iteration": 4.020869731903076 + }, + { + "auxiliary_loss_clip": 0.01097707, + "auxiliary_loss_mlp": 0.01025679, + "balance_loss_clip": 1.01508093, + "balance_loss_mlp": 1.03471375, + "epoch": 0.7298962873891478, + "flos": 29895597169920.0, + "grad_norm": 1.3852703912405009, + "language_loss": 0.73432374, + "learning_rate": 7.175932943315898e-07, + "loss": 0.75555754, + "num_input_tokens_seen": 262030460, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62890625, + "step": 12140, + "time_per_iteration": 4.02800989151001 + }, + { + "auxiliary_loss_clip": 0.01101201, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.01883268, + "balance_loss_mlp": 1.03433836, + "epoch": 0.7299564106418157, + "flos": 32266096254720.0, + "grad_norm": 1.6478138849846053, + "language_loss": 0.55289412, + "learning_rate": 7.172944570458003e-07, + "loss": 0.57421893, + "num_input_tokens_seen": 262050830, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12141, + "time_per_iteration": 2.7540974617004395 + }, + { + "auxiliary_loss_clip": 0.01098698, + "auxiliary_loss_mlp": 0.01024438, + "balance_loss_clip": 1.01330972, + "balance_loss_mlp": 1.0348109, + "epoch": 0.7300165338944837, + "flos": 22930292269440.0, + "grad_norm": 1.4560422495968448, + "language_loss": 0.72527927, + "learning_rate": 7.169956684003342e-07, + "loss": 0.74651062, + "num_input_tokens_seen": 262071245, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 12142, + "time_per_iteration": 2.5032155513763428 + }, + { + "auxiliary_loss_clip": 0.01100592, + "auxiliary_loss_mlp": 0.01031929, + "balance_loss_clip": 1.0208838, + "balance_loss_mlp": 1.03534031, + "epoch": 0.7300766571471516, + "flos": 19828795501440.0, + "grad_norm": 1.7431177644397007, + "language_loss": 0.73784506, + "learning_rate": 7.16696928406521e-07, + "loss": 0.75917029, + "num_input_tokens_seen": 262087525, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12143, + "time_per_iteration": 2.4508650302886963 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.01959443, + "balance_loss_mlp": 1.03553247, + "epoch": 0.7301367803998197, + "flos": 24347829576960.0, + "grad_norm": 2.3241470315915786, + "language_loss": 0.66688013, + "learning_rate": 7.163982370756882e-07, + "loss": 0.68821418, + "num_input_tokens_seen": 262107355, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12144, + "time_per_iteration": 2.5108768939971924 + }, + { + "auxiliary_loss_clip": 0.01101867, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.01570094, + "balance_loss_mlp": 1.03569078, + "epoch": 0.7301969036524876, + "flos": 15304518040320.0, + "grad_norm": 1.6911946286278683, + "language_loss": 0.79302132, + "learning_rate": 7.160995944191627e-07, + "loss": 0.81431764, + "num_input_tokens_seen": 262125645, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12145, + "time_per_iteration": 2.4418389797210693 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.01028599, + "balance_loss_clip": 1.0172739, + "balance_loss_mlp": 1.03604698, + "epoch": 0.7302570269051556, + "flos": 23507268433920.0, + "grad_norm": 1.6533125281544103, + "language_loss": 0.91145337, + "learning_rate": 7.158010004482702e-07, + "loss": 0.93274218, + "num_input_tokens_seen": 262144075, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12146, + "time_per_iteration": 2.4392800331115723 + }, + { + "auxiliary_loss_clip": 0.01098845, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.01512456, + "balance_loss_mlp": 1.03589582, + "epoch": 0.7303171501578235, + "flos": 20523056549760.0, + "grad_norm": 3.9977008079887275, + "language_loss": 0.61903286, + "learning_rate": 7.155024551743316e-07, + "loss": 0.64027882, + "num_input_tokens_seen": 262165940, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 12147, + "time_per_iteration": 2.4647200107574463 + }, + { + "auxiliary_loss_clip": 0.01102166, + "auxiliary_loss_mlp": 0.01035594, + "balance_loss_clip": 1.02376306, + "balance_loss_mlp": 1.03584671, + "epoch": 0.7303772734104915, + "flos": 18332613365760.0, + "grad_norm": 1.8253831896260186, + "language_loss": 0.75063682, + "learning_rate": 7.152039586086693e-07, + "loss": 0.7720145, + "num_input_tokens_seen": 262184520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12148, + "time_per_iteration": 2.4266207218170166 + }, + { + "auxiliary_loss_clip": 0.01024253, + "auxiliary_loss_mlp": 0.01006124, + "balance_loss_clip": 1.00514054, + "balance_loss_mlp": 1.0036819, + "epoch": 0.7304373966631594, + "flos": 60654776100480.0, + "grad_norm": 0.6830351523119454, + "language_loss": 0.56657213, + "learning_rate": 7.149055107626017e-07, + "loss": 0.58687592, + "num_input_tokens_seen": 262247070, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 12149, + "time_per_iteration": 3.027615785598755 + }, + { + "auxiliary_loss_clip": 0.01101256, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.03406572, + "epoch": 0.7304975199158275, + "flos": 19828077229440.0, + "grad_norm": 1.6835156550315518, + "language_loss": 0.73653138, + "learning_rate": 7.146071116474451e-07, + "loss": 0.75784624, + "num_input_tokens_seen": 262266605, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12150, + "time_per_iteration": 2.4099485874176025 + }, + { + "auxiliary_loss_clip": 0.01103316, + "auxiliary_loss_mlp": 0.01027257, + "balance_loss_clip": 1.01468682, + "balance_loss_mlp": 1.03478301, + "epoch": 0.7305576431684954, + "flos": 13223997452160.0, + "grad_norm": 1.944560081629452, + "language_loss": 0.84078568, + "learning_rate": 7.143087612745158e-07, + "loss": 0.86209142, + "num_input_tokens_seen": 262283880, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12151, + "time_per_iteration": 2.4708986282348633 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.0103432, + "balance_loss_clip": 1.0218451, + "balance_loss_mlp": 1.0358156, + "epoch": 0.7306177664211634, + "flos": 24060472773120.0, + "grad_norm": 1.670544008969589, + "language_loss": 0.77620661, + "learning_rate": 7.14010459655127e-07, + "loss": 0.79757774, + "num_input_tokens_seen": 262304155, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12152, + "time_per_iteration": 2.4695539474487305 + }, + { + "auxiliary_loss_clip": 0.01103894, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.01692247, + "balance_loss_mlp": 1.03786087, + "epoch": 0.7306778896738314, + "flos": 27089106802560.0, + "grad_norm": 1.5663619490166691, + "language_loss": 0.79568756, + "learning_rate": 7.137122068005919e-07, + "loss": 0.81701493, + "num_input_tokens_seen": 262325660, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12153, + "time_per_iteration": 2.533879280090332 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.0184778, + "balance_loss_mlp": 1.03624892, + "epoch": 0.7307380129264993, + "flos": 16690669839360.0, + "grad_norm": 1.621227897072943, + "language_loss": 0.67485428, + "learning_rate": 7.134140027222173e-07, + "loss": 0.69620812, + "num_input_tokens_seen": 262344075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 12154, + "time_per_iteration": 2.418184995651245 + }, + { + "auxiliary_loss_clip": 0.01102596, + "auxiliary_loss_mlp": 0.01029751, + "balance_loss_clip": 1.01756167, + "balance_loss_mlp": 1.03488839, + "epoch": 0.7307981361791673, + "flos": 21725740656000.0, + "grad_norm": 1.9151300415152432, + "language_loss": 0.65747088, + "learning_rate": 7.131158474313128e-07, + "loss": 0.67879438, + "num_input_tokens_seen": 262363305, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 12155, + "time_per_iteration": 2.4923956394195557 + }, + { + "auxiliary_loss_clip": 0.01096922, + "auxiliary_loss_mlp": 0.01030411, + "balance_loss_clip": 1.01884151, + "balance_loss_mlp": 1.03208816, + "epoch": 0.7308582594318352, + "flos": 18040659621120.0, + "grad_norm": 1.6880646162483905, + "language_loss": 0.81661636, + "learning_rate": 7.128177409391851e-07, + "loss": 0.83788967, + "num_input_tokens_seen": 262380730, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 12156, + "time_per_iteration": 2.4129483699798584 + }, + { + "auxiliary_loss_clip": 0.01098675, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.0193615, + "balance_loss_mlp": 1.03432953, + "epoch": 0.7309183826845033, + "flos": 13844964798720.0, + "grad_norm": 2.405459413664416, + "language_loss": 0.75240982, + "learning_rate": 7.125196832571367e-07, + "loss": 0.77369863, + "num_input_tokens_seen": 262395480, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12157, + "time_per_iteration": 2.4383459091186523 + }, + { + "auxiliary_loss_clip": 0.0109587, + "auxiliary_loss_mlp": 0.0102862, + "balance_loss_clip": 1.01831448, + "balance_loss_mlp": 1.03320694, + "epoch": 0.7309785059371712, + "flos": 17019216564480.0, + "grad_norm": 2.0421552799554457, + "language_loss": 0.72894901, + "learning_rate": 7.122216743964713e-07, + "loss": 0.75019395, + "num_input_tokens_seen": 262413340, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62890625, + "step": 12158, + "time_per_iteration": 2.409529209136963 + }, + { + "auxiliary_loss_clip": 0.01103494, + "auxiliary_loss_mlp": 0.01030392, + "balance_loss_clip": 1.01861429, + "balance_loss_mlp": 1.03654337, + "epoch": 0.7310386291898392, + "flos": 26502398052480.0, + "grad_norm": 1.5794059929341078, + "language_loss": 0.85767531, + "learning_rate": 7.119237143684896e-07, + "loss": 0.87901425, + "num_input_tokens_seen": 262433455, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 12159, + "time_per_iteration": 2.5144267082214355 + }, + { + "auxiliary_loss_clip": 0.01104084, + "auxiliary_loss_mlp": 0.01029593, + "balance_loss_clip": 1.01700473, + "balance_loss_mlp": 1.03464055, + "epoch": 0.7310987524425071, + "flos": 16945922862720.0, + "grad_norm": 2.076806919622798, + "language_loss": 0.73464298, + "learning_rate": 7.116258031844895e-07, + "loss": 0.75597978, + "num_input_tokens_seen": 262450335, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12160, + "time_per_iteration": 2.405029535293579 + }, + { + "auxiliary_loss_clip": 0.0110368, + "auxiliary_loss_mlp": 0.01029782, + "balance_loss_clip": 1.01743793, + "balance_loss_mlp": 1.0356549, + "epoch": 0.7311588756951751, + "flos": 13845288021120.0, + "grad_norm": 1.9196235781681743, + "language_loss": 0.72528148, + "learning_rate": 7.113279408557675e-07, + "loss": 0.74661607, + "num_input_tokens_seen": 262468240, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12161, + "time_per_iteration": 2.4075698852539062 + }, + { + "auxiliary_loss_clip": 0.01107154, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.01682591, + "balance_loss_mlp": 1.03725171, + "epoch": 0.731218998947843, + "flos": 28767894704640.0, + "grad_norm": 5.707259461998225, + "language_loss": 0.69178545, + "learning_rate": 7.110301273936192e-07, + "loss": 0.71315575, + "num_input_tokens_seen": 262487045, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 12162, + "time_per_iteration": 2.5137577056884766 + }, + { + "auxiliary_loss_clip": 0.01103934, + "auxiliary_loss_mlp": 0.01030313, + "balance_loss_clip": 1.01783824, + "balance_loss_mlp": 1.03625202, + "epoch": 0.7312791222005111, + "flos": 27088783580160.0, + "grad_norm": 1.8703565147701806, + "language_loss": 0.66851526, + "learning_rate": 7.107323628093382e-07, + "loss": 0.68985772, + "num_input_tokens_seen": 262504855, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12163, + "time_per_iteration": 2.4703001976013184 + }, + { + "auxiliary_loss_clip": 0.0110019, + "auxiliary_loss_mlp": 0.01030251, + "balance_loss_clip": 1.01822889, + "balance_loss_mlp": 1.03375793, + "epoch": 0.731339245453179, + "flos": 20924035050240.0, + "grad_norm": 1.4832431428317139, + "language_loss": 0.68488622, + "learning_rate": 7.104346471142153e-07, + "loss": 0.70619065, + "num_input_tokens_seen": 262524920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12164, + "time_per_iteration": 2.4578616619110107 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.0206039, + "balance_loss_mlp": 1.0372684, + "epoch": 0.731399368705847, + "flos": 23075694524160.0, + "grad_norm": 1.4717257929564707, + "language_loss": 0.72854477, + "learning_rate": 7.101369803195391e-07, + "loss": 0.74985963, + "num_input_tokens_seen": 262545725, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 12165, + "time_per_iteration": 2.451599359512329 + }, + { + "auxiliary_loss_clip": 0.01102834, + "auxiliary_loss_mlp": 0.01033628, + "balance_loss_clip": 1.0217309, + "balance_loss_mlp": 1.03535652, + "epoch": 0.731459491958515, + "flos": 23582681038080.0, + "grad_norm": 1.8716087020467311, + "language_loss": 0.76773065, + "learning_rate": 7.098393624365988e-07, + "loss": 0.78909522, + "num_input_tokens_seen": 262565480, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12166, + "time_per_iteration": 2.4635097980499268 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01032086, + "balance_loss_clip": 1.02040911, + "balance_loss_mlp": 1.03687727, + "epoch": 0.7315196152111829, + "flos": 22379278659840.0, + "grad_norm": 2.0545527072080945, + "language_loss": 0.79531485, + "learning_rate": 7.095417934766781e-07, + "loss": 0.81665695, + "num_input_tokens_seen": 262584145, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 12167, + "time_per_iteration": 2.46749210357666 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01038084, + "balance_loss_clip": 1.02647865, + "balance_loss_mlp": 1.03602624, + "epoch": 0.7315797384638509, + "flos": 26177047637760.0, + "grad_norm": 1.668118675469295, + "language_loss": 0.76923746, + "learning_rate": 7.092442734510622e-07, + "loss": 0.79063153, + "num_input_tokens_seen": 262604045, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12168, + "time_per_iteration": 2.5427803993225098 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01033018, + "balance_loss_clip": 1.02010727, + "balance_loss_mlp": 1.03531849, + "epoch": 0.7316398617165188, + "flos": 21506326427520.0, + "grad_norm": 1.6642312861866588, + "language_loss": 0.81803644, + "learning_rate": 7.089468023710326e-07, + "loss": 0.83939904, + "num_input_tokens_seen": 262624540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12169, + "time_per_iteration": 2.4575917720794678 + }, + { + "auxiliary_loss_clip": 0.0110358, + "auxiliary_loss_mlp": 0.01035859, + "balance_loss_clip": 1.024194, + "balance_loss_mlp": 1.03600168, + "epoch": 0.7316999849691869, + "flos": 30482557315200.0, + "grad_norm": 1.6489053706369026, + "language_loss": 0.69867074, + "learning_rate": 7.08649380247871e-07, + "loss": 0.72006512, + "num_input_tokens_seen": 262644545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12170, + "time_per_iteration": 2.5548336505889893 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.01778316, + "balance_loss_mlp": 1.03440404, + "epoch": 0.7317601082218548, + "flos": 21543781334400.0, + "grad_norm": 2.8957173811976022, + "language_loss": 0.69379872, + "learning_rate": 7.083520070928533e-07, + "loss": 0.71510202, + "num_input_tokens_seen": 262662570, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 12171, + "time_per_iteration": 2.4312360286712646 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01033722, + "balance_loss_clip": 1.02222395, + "balance_loss_mlp": 1.03613734, + "epoch": 0.7318202314745228, + "flos": 33251592775680.0, + "grad_norm": 3.1521599416176582, + "language_loss": 0.65645874, + "learning_rate": 7.080546829172564e-07, + "loss": 0.67781472, + "num_input_tokens_seen": 262683245, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12172, + "time_per_iteration": 2.5476059913635254 + }, + { + "auxiliary_loss_clip": 0.01103925, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01507545, + "balance_loss_mlp": 1.03686643, + "epoch": 0.7318803547271907, + "flos": 20157054917760.0, + "grad_norm": 2.237216797653005, + "language_loss": 0.6100843, + "learning_rate": 7.077574077323564e-07, + "loss": 0.63139474, + "num_input_tokens_seen": 262701585, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12173, + "time_per_iteration": 2.4594876766204834 + }, + { + "auxiliary_loss_clip": 0.0110106, + "auxiliary_loss_mlp": 0.01025966, + "balance_loss_clip": 1.0147481, + "balance_loss_mlp": 1.03543413, + "epoch": 0.7319404779798587, + "flos": 20558536208640.0, + "grad_norm": 1.8253545093146943, + "language_loss": 0.73704946, + "learning_rate": 7.074601815494243e-07, + "loss": 0.75831974, + "num_input_tokens_seen": 262719295, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12174, + "time_per_iteration": 2.515566349029541 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01025641, + "balance_loss_clip": 1.01454306, + "balance_loss_mlp": 1.03585482, + "epoch": 0.7320006012325266, + "flos": 28695391102080.0, + "grad_norm": 1.5591268998445824, + "language_loss": 0.80786538, + "learning_rate": 7.071630043797317e-07, + "loss": 0.82911384, + "num_input_tokens_seen": 262739995, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 12175, + "time_per_iteration": 2.5457139015197754 + }, + { + "auxiliary_loss_clip": 0.01101358, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.01556993, + "balance_loss_mlp": 1.03506994, + "epoch": 0.7320607244851947, + "flos": 16362697731840.0, + "grad_norm": 1.8633750273009067, + "language_loss": 0.76524568, + "learning_rate": 7.068658762345488e-07, + "loss": 0.78653067, + "num_input_tokens_seen": 262757680, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 12176, + "time_per_iteration": 2.4949843883514404 + }, + { + "auxiliary_loss_clip": 0.01101151, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.01911664, + "balance_loss_mlp": 1.03668857, + "epoch": 0.7321208477378626, + "flos": 20955097336320.0, + "grad_norm": 2.0429703759451074, + "language_loss": 0.76661092, + "learning_rate": 7.065687971251399e-07, + "loss": 0.78792465, + "num_input_tokens_seen": 262776990, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 12177, + "time_per_iteration": 2.5137908458709717 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.01034623, + "balance_loss_clip": 1.02391255, + "balance_loss_mlp": 1.03224051, + "epoch": 0.7321809709905306, + "flos": 13845072539520.0, + "grad_norm": 2.015813751432838, + "language_loss": 0.74164724, + "learning_rate": 7.06271767062772e-07, + "loss": 0.76296735, + "num_input_tokens_seen": 262795440, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 12178, + "time_per_iteration": 3.7930397987365723 + }, + { + "auxiliary_loss_clip": 0.01102574, + "auxiliary_loss_mlp": 0.01029204, + "balance_loss_clip": 1.01727104, + "balance_loss_mlp": 1.03461027, + "epoch": 0.7322410942431986, + "flos": 26979938392320.0, + "grad_norm": 3.902615906398373, + "language_loss": 0.82204944, + "learning_rate": 7.059747860587084e-07, + "loss": 0.84336722, + "num_input_tokens_seen": 262816385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12179, + "time_per_iteration": 2.4926083087921143 + }, + { + "auxiliary_loss_clip": 0.01096766, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.0208199, + "balance_loss_mlp": 1.03491974, + "epoch": 0.7323012174958665, + "flos": 17639717034240.0, + "grad_norm": 1.7358162194967635, + "language_loss": 0.74350899, + "learning_rate": 7.056778541242115e-07, + "loss": 0.76479512, + "num_input_tokens_seen": 262834955, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 12180, + "time_per_iteration": 3.9542806148529053 + }, + { + "auxiliary_loss_clip": 0.01102785, + "auxiliary_loss_mlp": 0.01028317, + "balance_loss_clip": 1.01565659, + "balance_loss_mlp": 1.03372073, + "epoch": 0.7323613407485345, + "flos": 32342765834880.0, + "grad_norm": 1.8090406286045437, + "language_loss": 0.78966725, + "learning_rate": 7.053809712705396e-07, + "loss": 0.81097823, + "num_input_tokens_seen": 262853555, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 12181, + "time_per_iteration": 5.370461940765381 + }, + { + "auxiliary_loss_clip": 0.01104053, + "auxiliary_loss_mlp": 0.01031532, + "balance_loss_clip": 1.01984382, + "balance_loss_mlp": 1.03627169, + "epoch": 0.7324214640012024, + "flos": 18362777811840.0, + "grad_norm": 1.6926303414905466, + "language_loss": 0.71438134, + "learning_rate": 7.050841375089506e-07, + "loss": 0.7357372, + "num_input_tokens_seen": 262870975, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 12182, + "time_per_iteration": 2.395366668701172 + }, + { + "auxiliary_loss_clip": 0.01104144, + "auxiliary_loss_mlp": 0.01033063, + "balance_loss_clip": 1.02144599, + "balance_loss_mlp": 1.03678739, + "epoch": 0.7324815872538705, + "flos": 30812289189120.0, + "grad_norm": 1.516043869338468, + "language_loss": 0.71126986, + "learning_rate": 7.047873528507015e-07, + "loss": 0.73264194, + "num_input_tokens_seen": 262892635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12183, + "time_per_iteration": 2.5406055450439453 + }, + { + "auxiliary_loss_clip": 0.0110482, + "auxiliary_loss_mlp": 0.01036116, + "balance_loss_clip": 1.02371871, + "balance_loss_mlp": 1.03739989, + "epoch": 0.7325417105065384, + "flos": 21505069451520.0, + "grad_norm": 1.782462638135082, + "language_loss": 0.72453171, + "learning_rate": 7.04490617307045e-07, + "loss": 0.74594104, + "num_input_tokens_seen": 262910725, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12184, + "time_per_iteration": 2.4203481674194336 + }, + { + "auxiliary_loss_clip": 0.01023657, + "auxiliary_loss_mlp": 0.01014002, + "balance_loss_clip": 1.01300097, + "balance_loss_mlp": 1.00301158, + "epoch": 0.7326018337592064, + "flos": 67257742556160.0, + "grad_norm": 0.763876847553094, + "language_loss": 0.65218687, + "learning_rate": 7.041939308892344e-07, + "loss": 0.67256343, + "num_input_tokens_seen": 262974150, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20703125, + "step": 12185, + "time_per_iteration": 3.0270133018493652 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.01026623, + "balance_loss_clip": 1.01434445, + "balance_loss_mlp": 1.03290069, + "epoch": 0.7326619570118743, + "flos": 22857070394880.0, + "grad_norm": 1.938744837028, + "language_loss": 0.807504, + "learning_rate": 7.038972936085197e-07, + "loss": 0.82877648, + "num_input_tokens_seen": 262993370, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12186, + "time_per_iteration": 2.4389822483062744 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01031355, + "balance_loss_clip": 1.01886177, + "balance_loss_mlp": 1.03473353, + "epoch": 0.7327220802645423, + "flos": 23327499841920.0, + "grad_norm": 1.9074219827171814, + "language_loss": 0.73762989, + "learning_rate": 7.036007054761508e-07, + "loss": 0.75896305, + "num_input_tokens_seen": 263012665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 12187, + "time_per_iteration": 2.4973368644714355 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01033786, + "balance_loss_clip": 1.02201378, + "balance_loss_mlp": 1.03718829, + "epoch": 0.7327822035172102, + "flos": 23180661043200.0, + "grad_norm": 1.717563808128471, + "language_loss": 0.88947159, + "learning_rate": 7.033041665033716e-07, + "loss": 0.91085368, + "num_input_tokens_seen": 263031475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12188, + "time_per_iteration": 2.4411849975585938 + }, + { + "auxiliary_loss_clip": 0.01103922, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01875281, + "balance_loss_mlp": 1.03507185, + "epoch": 0.7328423267698783, + "flos": 21066600130560.0, + "grad_norm": 1.8794202002209792, + "language_loss": 0.7421574, + "learning_rate": 7.030076767014284e-07, + "loss": 0.76350546, + "num_input_tokens_seen": 263051445, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12189, + "time_per_iteration": 2.4856882095336914 + }, + { + "auxiliary_loss_clip": 0.0110238, + "auxiliary_loss_mlp": 0.0102823, + "balance_loss_clip": 1.01568341, + "balance_loss_mlp": 1.03474796, + "epoch": 0.7329024500225462, + "flos": 21689578638720.0, + "grad_norm": 1.5825056379011793, + "language_loss": 0.82314098, + "learning_rate": 7.027112360815648e-07, + "loss": 0.84444714, + "num_input_tokens_seen": 263070835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12190, + "time_per_iteration": 2.456019878387451 + }, + { + "auxiliary_loss_clip": 0.01102905, + "auxiliary_loss_mlp": 0.01034193, + "balance_loss_clip": 1.02160442, + "balance_loss_mlp": 1.03589582, + "epoch": 0.7329625732752142, + "flos": 24164038661760.0, + "grad_norm": 1.732792680094222, + "language_loss": 0.71868473, + "learning_rate": 7.024148446550204e-07, + "loss": 0.74005568, + "num_input_tokens_seen": 263090070, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 12191, + "time_per_iteration": 2.483551502227783 + }, + { + "auxiliary_loss_clip": 0.01103846, + "auxiliary_loss_mlp": 0.01033545, + "balance_loss_clip": 1.02112985, + "balance_loss_mlp": 1.03651261, + "epoch": 0.7330226965278822, + "flos": 30077915627520.0, + "grad_norm": 1.5577440951602006, + "language_loss": 0.69461203, + "learning_rate": 7.021185024330361e-07, + "loss": 0.71598595, + "num_input_tokens_seen": 263110030, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12192, + "time_per_iteration": 2.509345531463623 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01683998, + "balance_loss_mlp": 1.03492808, + "epoch": 0.7330828197805501, + "flos": 23368294713600.0, + "grad_norm": 1.567853507336265, + "language_loss": 0.73125577, + "learning_rate": 7.01822209426848e-07, + "loss": 0.75254017, + "num_input_tokens_seen": 263129735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12193, + "time_per_iteration": 2.5061562061309814 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01027657, + "balance_loss_clip": 1.01551533, + "balance_loss_mlp": 1.03417039, + "epoch": 0.7331429430332181, + "flos": 21032808410880.0, + "grad_norm": 4.194654550291271, + "language_loss": 0.76709831, + "learning_rate": 7.015259656476911e-07, + "loss": 0.78839254, + "num_input_tokens_seen": 263149100, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 12194, + "time_per_iteration": 2.429858446121216 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01026816, + "balance_loss_clip": 1.01485932, + "balance_loss_mlp": 1.03564095, + "epoch": 0.733203066285886, + "flos": 14647891466880.0, + "grad_norm": 1.8657268793695219, + "language_loss": 0.70426142, + "learning_rate": 7.012297711067998e-07, + "loss": 0.72554034, + "num_input_tokens_seen": 263166620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12195, + "time_per_iteration": 2.47605299949646 + }, + { + "auxiliary_loss_clip": 0.01103283, + "auxiliary_loss_mlp": 0.01036918, + "balance_loss_clip": 1.02542019, + "balance_loss_mlp": 1.0363059, + "epoch": 0.7332631895385541, + "flos": 17165301177600.0, + "grad_norm": 1.8748815414700573, + "language_loss": 0.72009385, + "learning_rate": 7.009336258154057e-07, + "loss": 0.74149585, + "num_input_tokens_seen": 263184780, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12196, + "time_per_iteration": 2.4170355796813965 + }, + { + "auxiliary_loss_clip": 0.01100598, + "auxiliary_loss_mlp": 0.0102618, + "balance_loss_clip": 1.01400244, + "balance_loss_mlp": 1.03541434, + "epoch": 0.733323312791222, + "flos": 28658151676800.0, + "grad_norm": 1.6057850533210987, + "language_loss": 0.71647477, + "learning_rate": 7.006375297847394e-07, + "loss": 0.73774254, + "num_input_tokens_seen": 263204625, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 12197, + "time_per_iteration": 2.5049266815185547 + }, + { + "auxiliary_loss_clip": 0.01103625, + "auxiliary_loss_mlp": 0.01038235, + "balance_loss_clip": 1.0253787, + "balance_loss_mlp": 1.03410459, + "epoch": 0.73338343604389, + "flos": 16618417632000.0, + "grad_norm": 1.8231283851018831, + "language_loss": 0.78448522, + "learning_rate": 7.003414830260282e-07, + "loss": 0.80590379, + "num_input_tokens_seen": 263221565, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 12198, + "time_per_iteration": 2.4223878383636475 + }, + { + "auxiliary_loss_clip": 0.0110209, + "auxiliary_loss_mlp": 0.01030266, + "balance_loss_clip": 1.01910758, + "balance_loss_mlp": 1.03584075, + "epoch": 0.7334435592965579, + "flos": 21142084561920.0, + "grad_norm": 1.9413444885935378, + "language_loss": 0.74405611, + "learning_rate": 7.000454855504974e-07, + "loss": 0.76537967, + "num_input_tokens_seen": 263240620, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12199, + "time_per_iteration": 2.503514528274536 + }, + { + "auxiliary_loss_clip": 0.01106436, + "auxiliary_loss_mlp": 0.01034584, + "balance_loss_clip": 1.02240086, + "balance_loss_mlp": 1.03749204, + "epoch": 0.7335036825492259, + "flos": 17125332318720.0, + "grad_norm": 2.410343838162529, + "language_loss": 0.76916027, + "learning_rate": 6.997495373693729e-07, + "loss": 0.79057044, + "num_input_tokens_seen": 263254365, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 12200, + "time_per_iteration": 2.385646104812622 + }, + { + "auxiliary_loss_clip": 0.0110137, + "auxiliary_loss_mlp": 0.01031712, + "balance_loss_clip": 1.02033889, + "balance_loss_mlp": 1.03535485, + "epoch": 0.7335638058018938, + "flos": 23731818307200.0, + "grad_norm": 1.9712263454849892, + "language_loss": 0.61337197, + "learning_rate": 6.994536384938754e-07, + "loss": 0.63470274, + "num_input_tokens_seen": 263275880, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 12201, + "time_per_iteration": 2.494711399078369 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01022943, + "balance_loss_clip": 1.01186204, + "balance_loss_mlp": 1.03445053, + "epoch": 0.7336239290545619, + "flos": 34933289679360.0, + "grad_norm": 1.770832212268843, + "language_loss": 0.52208602, + "learning_rate": 6.991577889352264e-07, + "loss": 0.54330868, + "num_input_tokens_seen": 263298315, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12202, + "time_per_iteration": 2.5508878231048584 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01026727, + "balance_loss_clip": 1.01555753, + "balance_loss_mlp": 1.03535819, + "epoch": 0.7336840523072298, + "flos": 21103049456640.0, + "grad_norm": 1.8712183341846977, + "language_loss": 0.68450284, + "learning_rate": 6.98861988704645e-07, + "loss": 0.70577991, + "num_input_tokens_seen": 263318615, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 12203, + "time_per_iteration": 2.455225944519043 + }, + { + "auxiliary_loss_clip": 0.01104999, + "auxiliary_loss_mlp": 0.01037444, + "balance_loss_clip": 1.02551746, + "balance_loss_mlp": 1.03558648, + "epoch": 0.7337441755598978, + "flos": 24024418496640.0, + "grad_norm": 2.0115937343101176, + "language_loss": 0.66122192, + "learning_rate": 6.985662378133474e-07, + "loss": 0.68264639, + "num_input_tokens_seen": 263336705, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 12204, + "time_per_iteration": 2.4275307655334473 + }, + { + "auxiliary_loss_clip": 0.01100701, + "auxiliary_loss_mlp": 0.0102869, + "balance_loss_clip": 1.01770449, + "balance_loss_mlp": 1.036098, + "epoch": 0.7338042988125658, + "flos": 22711309004160.0, + "grad_norm": 2.1044017909422434, + "language_loss": 0.77165949, + "learning_rate": 6.982705362725479e-07, + "loss": 0.79295337, + "num_input_tokens_seen": 263355065, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 12205, + "time_per_iteration": 2.465723752975464 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01028561, + "balance_loss_clip": 1.01719475, + "balance_loss_mlp": 1.03765106, + "epoch": 0.7338644220652337, + "flos": 21360996000000.0, + "grad_norm": 1.633398371679339, + "language_loss": 0.79663754, + "learning_rate": 6.979748840934601e-07, + "loss": 0.81794107, + "num_input_tokens_seen": 263374460, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12206, + "time_per_iteration": 2.4295356273651123 + }, + { + "auxiliary_loss_clip": 0.01101572, + "auxiliary_loss_mlp": 0.01027593, + "balance_loss_clip": 1.01490951, + "balance_loss_mlp": 1.03436399, + "epoch": 0.7339245453179017, + "flos": 30920236536960.0, + "grad_norm": 1.938197948270063, + "language_loss": 0.71248126, + "learning_rate": 6.976792812872958e-07, + "loss": 0.73377299, + "num_input_tokens_seen": 263393610, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12207, + "time_per_iteration": 2.533963918685913 + }, + { + "auxiliary_loss_clip": 0.01023391, + "auxiliary_loss_mlp": 0.01000694, + "balance_loss_clip": 0.99954408, + "balance_loss_mlp": 1.0029676, + "epoch": 0.7339846685705697, + "flos": 67899429072000.0, + "grad_norm": 0.7861729617868648, + "language_loss": 0.54826534, + "learning_rate": 6.97383727865263e-07, + "loss": 0.56850618, + "num_input_tokens_seen": 263450340, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20507812, + "step": 12208, + "time_per_iteration": 3.1204357147216797 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01026824, + "balance_loss_clip": 1.01652431, + "balance_loss_mlp": 1.03539574, + "epoch": 0.7340447918232377, + "flos": 22236749493120.0, + "grad_norm": 1.435103992793476, + "language_loss": 0.80251199, + "learning_rate": 6.970882238385703e-07, + "loss": 0.82378662, + "num_input_tokens_seen": 263471735, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65234375, + "step": 12209, + "time_per_iteration": 2.4724159240722656 + }, + { + "auxiliary_loss_clip": 0.01097718, + "auxiliary_loss_mlp": 0.01027545, + "balance_loss_clip": 1.01657128, + "balance_loss_mlp": 1.03298545, + "epoch": 0.7341049150759056, + "flos": 23764784014080.0, + "grad_norm": 1.8625549043469913, + "language_loss": 0.78958344, + "learning_rate": 6.96792769218423e-07, + "loss": 0.81083614, + "num_input_tokens_seen": 263493245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12210, + "time_per_iteration": 2.455946445465088 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.01407206, + "balance_loss_mlp": 1.03534794, + "epoch": 0.7341650383285736, + "flos": 17236547804160.0, + "grad_norm": 1.6735159974751206, + "language_loss": 0.7608707, + "learning_rate": 6.964973640160236e-07, + "loss": 0.78213215, + "num_input_tokens_seen": 263511660, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12211, + "time_per_iteration": 2.4627277851104736 + }, + { + "auxiliary_loss_clip": 0.01102174, + "auxiliary_loss_mlp": 0.01025987, + "balance_loss_clip": 1.01444197, + "balance_loss_mlp": 1.03521109, + "epoch": 0.7342251615812415, + "flos": 23403953940480.0, + "grad_norm": 1.9025360413194936, + "language_loss": 0.71490365, + "learning_rate": 6.962020082425748e-07, + "loss": 0.73618519, + "num_input_tokens_seen": 263530875, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12212, + "time_per_iteration": 2.446685552597046 + }, + { + "auxiliary_loss_clip": 0.01103728, + "auxiliary_loss_mlp": 0.01031481, + "balance_loss_clip": 1.01991129, + "balance_loss_mlp": 1.03784096, + "epoch": 0.7342852848339095, + "flos": 22747183712640.0, + "grad_norm": 1.9034635106886582, + "language_loss": 0.68719161, + "learning_rate": 6.959067019092766e-07, + "loss": 0.70854366, + "num_input_tokens_seen": 263551585, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12213, + "time_per_iteration": 2.4991095066070557 + }, + { + "auxiliary_loss_clip": 0.01023626, + "auxiliary_loss_mlp": 0.01002854, + "balance_loss_clip": 1.00172174, + "balance_loss_mlp": 1.00317287, + "epoch": 0.7343454080865774, + "flos": 53942353925760.0, + "grad_norm": 0.7248810226626392, + "language_loss": 0.54344672, + "learning_rate": 6.956114450273276e-07, + "loss": 0.56371152, + "num_input_tokens_seen": 263609545, + "router_z_loss_clip": 0.01135254, + "router_z_loss_mlp": 0.20507812, + "step": 12214, + "time_per_iteration": 2.920579433441162 + }, + { + "auxiliary_loss_clip": 0.01103211, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01514435, + "balance_loss_mlp": 1.03471541, + "epoch": 0.7344055313392455, + "flos": 12166859255040.0, + "grad_norm": 1.9617721107193735, + "language_loss": 0.70233238, + "learning_rate": 6.953162376079233e-07, + "loss": 0.72362781, + "num_input_tokens_seen": 263627880, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.68359375, + "step": 12215, + "time_per_iteration": 2.4825196266174316 + }, + { + "auxiliary_loss_clip": 0.01098919, + "auxiliary_loss_mlp": 0.01027103, + "balance_loss_clip": 1.01648164, + "balance_loss_mlp": 1.03576207, + "epoch": 0.7344656545919134, + "flos": 18550052346240.0, + "grad_norm": 1.7008791597621735, + "language_loss": 0.72984588, + "learning_rate": 6.950210796622573e-07, + "loss": 0.75110614, + "num_input_tokens_seen": 263645665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 12216, + "time_per_iteration": 2.419165849685669 + }, + { + "auxiliary_loss_clip": 0.01106239, + "auxiliary_loss_mlp": 0.01035051, + "balance_loss_clip": 1.02124095, + "balance_loss_mlp": 1.03503752, + "epoch": 0.7345257778445814, + "flos": 23661649088640.0, + "grad_norm": 1.6841898563593931, + "language_loss": 0.7813915, + "learning_rate": 6.947259712015236e-07, + "loss": 0.80280441, + "num_input_tokens_seen": 263668170, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7109375, + "step": 12217, + "time_per_iteration": 2.519476890563965 + }, + { + "auxiliary_loss_clip": 0.01097824, + "auxiliary_loss_mlp": 0.01025415, + "balance_loss_clip": 1.01500201, + "balance_loss_mlp": 1.03322065, + "epoch": 0.7345859010972494, + "flos": 13808659127040.0, + "grad_norm": 2.0430723318586814, + "language_loss": 0.77478087, + "learning_rate": 6.94430912236911e-07, + "loss": 0.7960133, + "num_input_tokens_seen": 263684190, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.64453125, + "step": 12218, + "time_per_iteration": 2.4323973655700684 + }, + { + "auxiliary_loss_clip": 0.0109922, + "auxiliary_loss_mlp": 0.01029029, + "balance_loss_clip": 1.0175488, + "balance_loss_mlp": 1.03478718, + "epoch": 0.7346460243499173, + "flos": 22272731942400.0, + "grad_norm": 2.4653490702635223, + "language_loss": 0.72245163, + "learning_rate": 6.941359027796092e-07, + "loss": 0.74373412, + "num_input_tokens_seen": 263702095, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12219, + "time_per_iteration": 3.851811408996582 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01675463, + "balance_loss_mlp": 1.03373814, + "epoch": 0.7347061476025853, + "flos": 23255247634560.0, + "grad_norm": 1.7840681188410097, + "language_loss": 0.7480529, + "learning_rate": 6.938409428408061e-07, + "loss": 0.76930463, + "num_input_tokens_seen": 263721385, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 12220, + "time_per_iteration": 2.450587511062622 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01934493, + "balance_loss_mlp": 1.03515816, + "epoch": 0.7347662708552533, + "flos": 15267565923840.0, + "grad_norm": 1.5828657801363317, + "language_loss": 0.65927309, + "learning_rate": 6.93546032431684e-07, + "loss": 0.68060255, + "num_input_tokens_seen": 263737835, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12221, + "time_per_iteration": 3.9862098693847656 + }, + { + "auxiliary_loss_clip": 0.01100484, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01736093, + "balance_loss_mlp": 1.03518033, + "epoch": 0.7348263941079213, + "flos": 24859987649280.0, + "grad_norm": 1.690484446007973, + "language_loss": 0.69146597, + "learning_rate": 6.932511715634273e-07, + "loss": 0.71275526, + "num_input_tokens_seen": 263756480, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 12222, + "time_per_iteration": 3.9009041786193848 + }, + { + "auxiliary_loss_clip": 0.01099444, + "auxiliary_loss_mlp": 0.01027554, + "balance_loss_clip": 1.01703954, + "balance_loss_mlp": 1.03489995, + "epoch": 0.7348865173605892, + "flos": 24352103295360.0, + "grad_norm": 1.6021663525354104, + "language_loss": 0.65751356, + "learning_rate": 6.92956360247217e-07, + "loss": 0.67878354, + "num_input_tokens_seen": 263776440, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 12223, + "time_per_iteration": 3.9320757389068604 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01027965, + "balance_loss_clip": 1.01641989, + "balance_loss_mlp": 1.03491271, + "epoch": 0.7349466406132572, + "flos": 20004613597440.0, + "grad_norm": 1.7805598542267875, + "language_loss": 0.72150576, + "learning_rate": 6.926615984942332e-07, + "loss": 0.74280441, + "num_input_tokens_seen": 263793700, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 12224, + "time_per_iteration": 2.424764394760132 + }, + { + "auxiliary_loss_clip": 0.01102425, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.01862597, + "balance_loss_mlp": 1.0356946, + "epoch": 0.7350067638659251, + "flos": 29825068815360.0, + "grad_norm": 1.667305857067153, + "language_loss": 0.72422898, + "learning_rate": 6.92366886315652e-07, + "loss": 0.74555409, + "num_input_tokens_seen": 263814620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 12225, + "time_per_iteration": 2.555699110031128 + }, + { + "auxiliary_loss_clip": 0.0110455, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.01911092, + "balance_loss_mlp": 1.03528094, + "epoch": 0.7350668871185931, + "flos": 21866150920320.0, + "grad_norm": 1.6574802149125882, + "language_loss": 0.76740652, + "learning_rate": 6.920722237226501e-07, + "loss": 0.78877175, + "num_input_tokens_seen": 263832725, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 12226, + "time_per_iteration": 2.417281150817871 + }, + { + "auxiliary_loss_clip": 0.0110041, + "auxiliary_loss_mlp": 0.01028316, + "balance_loss_clip": 1.01671648, + "balance_loss_mlp": 1.03516448, + "epoch": 0.735127010371261, + "flos": 22566122231040.0, + "grad_norm": 1.6412947887343436, + "language_loss": 0.66742253, + "learning_rate": 6.917776107264008e-07, + "loss": 0.68870974, + "num_input_tokens_seen": 263853850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12227, + "time_per_iteration": 2.493746280670166 + }, + { + "auxiliary_loss_clip": 0.01101958, + "auxiliary_loss_mlp": 0.01034032, + "balance_loss_clip": 1.02254581, + "balance_loss_mlp": 1.03482342, + "epoch": 0.7351871336239291, + "flos": 25884339707520.0, + "grad_norm": 1.3969319271399194, + "language_loss": 0.63719964, + "learning_rate": 6.914830473380749e-07, + "loss": 0.65855956, + "num_input_tokens_seen": 263874760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12228, + "time_per_iteration": 2.4691944122314453 + }, + { + "auxiliary_loss_clip": 0.01099398, + "auxiliary_loss_mlp": 0.01033625, + "balance_loss_clip": 1.02261627, + "balance_loss_mlp": 1.03371692, + "epoch": 0.735247256876597, + "flos": 17932173569280.0, + "grad_norm": 2.005632249261944, + "language_loss": 0.63364494, + "learning_rate": 6.911885335688427e-07, + "loss": 0.65497524, + "num_input_tokens_seen": 263893390, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 12229, + "time_per_iteration": 2.44689679145813 + }, + { + "auxiliary_loss_clip": 0.01104076, + "auxiliary_loss_mlp": 0.0103478, + "balance_loss_clip": 1.02215624, + "balance_loss_mlp": 1.03622568, + "epoch": 0.735307380129265, + "flos": 28875159694080.0, + "grad_norm": 1.6720920493620766, + "language_loss": 0.73554301, + "learning_rate": 6.908940694298726e-07, + "loss": 0.7569316, + "num_input_tokens_seen": 263911180, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12230, + "time_per_iteration": 2.471467971801758 + }, + { + "auxiliary_loss_clip": 0.01102648, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.01664519, + "balance_loss_mlp": 1.03582287, + "epoch": 0.7353675033819329, + "flos": 13625658311040.0, + "grad_norm": 1.9806878096831662, + "language_loss": 0.71668804, + "learning_rate": 6.90599654932332e-07, + "loss": 0.73800141, + "num_input_tokens_seen": 263928975, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 12231, + "time_per_iteration": 2.473133087158203 + }, + { + "auxiliary_loss_clip": 0.01104217, + "auxiliary_loss_mlp": 0.01036138, + "balance_loss_clip": 1.02288222, + "balance_loss_mlp": 1.03647971, + "epoch": 0.7354276266346009, + "flos": 19463081178240.0, + "grad_norm": 3.941316401522165, + "language_loss": 0.64094537, + "learning_rate": 6.903052900873823e-07, + "loss": 0.66234899, + "num_input_tokens_seen": 263944495, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 12232, + "time_per_iteration": 2.4203951358795166 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01851058, + "balance_loss_mlp": 1.03487468, + "epoch": 0.735487749887269, + "flos": 15771858917760.0, + "grad_norm": 1.8455770572081356, + "language_loss": 0.75458562, + "learning_rate": 6.900109749061874e-07, + "loss": 0.77591407, + "num_input_tokens_seen": 263961325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12233, + "time_per_iteration": 2.4624409675598145 + }, + { + "auxiliary_loss_clip": 0.01101376, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.01549006, + "balance_loss_mlp": 1.03507233, + "epoch": 0.7355478731399369, + "flos": 18260648467200.0, + "grad_norm": 1.614964377536134, + "language_loss": 0.73402774, + "learning_rate": 6.897167093999079e-07, + "loss": 0.75531423, + "num_input_tokens_seen": 263980445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12234, + "time_per_iteration": 2.4193742275238037 + }, + { + "auxiliary_loss_clip": 0.01103947, + "auxiliary_loss_mlp": 0.01030505, + "balance_loss_clip": 1.01924026, + "balance_loss_mlp": 1.03720987, + "epoch": 0.7356079963926049, + "flos": 26542043688960.0, + "grad_norm": 4.5713288626894455, + "language_loss": 0.59835577, + "learning_rate": 6.894224935797017e-07, + "loss": 0.61970031, + "num_input_tokens_seen": 263999330, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66796875, + "step": 12235, + "time_per_iteration": 2.5044472217559814 + }, + { + "auxiliary_loss_clip": 0.01101342, + "auxiliary_loss_mlp": 0.01026652, + "balance_loss_clip": 1.01521957, + "balance_loss_mlp": 1.03657699, + "epoch": 0.7356681196452728, + "flos": 10778624467200.0, + "grad_norm": 2.0497651121742115, + "language_loss": 0.8565346, + "learning_rate": 6.891283274567259e-07, + "loss": 0.87781453, + "num_input_tokens_seen": 264014150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12236, + "time_per_iteration": 2.3936641216278076 + }, + { + "auxiliary_loss_clip": 0.01102811, + "auxiliary_loss_mlp": 0.01028119, + "balance_loss_clip": 1.01669908, + "balance_loss_mlp": 1.03538775, + "epoch": 0.7357282428979408, + "flos": 19718693337600.0, + "grad_norm": 1.8090519272371215, + "language_loss": 0.69331872, + "learning_rate": 6.888342110421364e-07, + "loss": 0.71462798, + "num_input_tokens_seen": 264033140, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12237, + "time_per_iteration": 2.473252296447754 + }, + { + "auxiliary_loss_clip": 0.01101452, + "auxiliary_loss_mlp": 0.01025644, + "balance_loss_clip": 1.01471233, + "balance_loss_mlp": 1.03477573, + "epoch": 0.7357883661506087, + "flos": 19464014931840.0, + "grad_norm": 1.6472611180309946, + "language_loss": 0.72134531, + "learning_rate": 6.885401443470839e-07, + "loss": 0.7426163, + "num_input_tokens_seen": 264052105, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 12238, + "time_per_iteration": 2.423517942428589 + }, + { + "auxiliary_loss_clip": 0.01106028, + "auxiliary_loss_mlp": 0.01029179, + "balance_loss_clip": 1.01703119, + "balance_loss_mlp": 1.03515995, + "epoch": 0.7358484894032767, + "flos": 27123006263040.0, + "grad_norm": 1.7391094576956916, + "language_loss": 0.72675085, + "learning_rate": 6.882461273827205e-07, + "loss": 0.7481029, + "num_input_tokens_seen": 264070690, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 12239, + "time_per_iteration": 2.523238182067871 + }, + { + "auxiliary_loss_clip": 0.01098843, + "auxiliary_loss_mlp": 0.01029765, + "balance_loss_clip": 1.01849365, + "balance_loss_mlp": 1.03532851, + "epoch": 0.7359086126559446, + "flos": 24502282058880.0, + "grad_norm": 1.5041553602452318, + "language_loss": 0.78892875, + "learning_rate": 6.879521601601954e-07, + "loss": 0.81021476, + "num_input_tokens_seen": 264094225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 12240, + "time_per_iteration": 2.4987194538116455 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01033295, + "balance_loss_clip": 1.02145731, + "balance_loss_mlp": 1.03596234, + "epoch": 0.7359687359086127, + "flos": 23331270769920.0, + "grad_norm": 1.7320565425934242, + "language_loss": 0.83208013, + "learning_rate": 6.876582426906565e-07, + "loss": 0.85342342, + "num_input_tokens_seen": 264113190, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 12241, + "time_per_iteration": 2.499547004699707 + }, + { + "auxiliary_loss_clip": 0.01099431, + "auxiliary_loss_mlp": 0.01026793, + "balance_loss_clip": 1.01507461, + "balance_loss_mlp": 1.03403616, + "epoch": 0.7360288591612806, + "flos": 20193396503040.0, + "grad_norm": 1.8298064214189858, + "language_loss": 0.78645867, + "learning_rate": 6.873643749852484e-07, + "loss": 0.8077209, + "num_input_tokens_seen": 264132050, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12242, + "time_per_iteration": 2.4207592010498047 + }, + { + "auxiliary_loss_clip": 0.01102156, + "auxiliary_loss_mlp": 0.01029374, + "balance_loss_clip": 1.0180552, + "balance_loss_mlp": 1.0359714, + "epoch": 0.7360889824139486, + "flos": 24972783333120.0, + "grad_norm": 1.9546604159013963, + "language_loss": 0.79385024, + "learning_rate": 6.870705570551145e-07, + "loss": 0.81516558, + "num_input_tokens_seen": 264152800, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12243, + "time_per_iteration": 2.51019024848938 + }, + { + "auxiliary_loss_clip": 0.01102378, + "auxiliary_loss_mlp": 0.01032839, + "balance_loss_clip": 1.02083445, + "balance_loss_mlp": 1.03466713, + "epoch": 0.7361491056666165, + "flos": 15012312900480.0, + "grad_norm": 1.9125543259943414, + "language_loss": 0.74100977, + "learning_rate": 6.867767889113969e-07, + "loss": 0.76236194, + "num_input_tokens_seen": 264169650, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12244, + "time_per_iteration": 2.4030749797821045 + }, + { + "auxiliary_loss_clip": 0.01101314, + "auxiliary_loss_mlp": 0.01029345, + "balance_loss_clip": 1.01773405, + "balance_loss_mlp": 1.03416705, + "epoch": 0.7362092289192845, + "flos": 22930400010240.0, + "grad_norm": 1.7798055097675247, + "language_loss": 0.6942178, + "learning_rate": 6.864830705652347e-07, + "loss": 0.71552444, + "num_input_tokens_seen": 264190530, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12245, + "time_per_iteration": 2.4875071048736572 + }, + { + "auxiliary_loss_clip": 0.01098192, + "auxiliary_loss_mlp": 0.01031115, + "balance_loss_clip": 1.01933169, + "balance_loss_mlp": 1.03475428, + "epoch": 0.7362693521719526, + "flos": 20702681487360.0, + "grad_norm": 1.5087221257099204, + "language_loss": 0.73185629, + "learning_rate": 6.861894020277658e-07, + "loss": 0.75314939, + "num_input_tokens_seen": 264210820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.63671875, + "step": 12246, + "time_per_iteration": 2.4394288063049316 + }, + { + "auxiliary_loss_clip": 0.01096401, + "auxiliary_loss_mlp": 0.01025823, + "balance_loss_clip": 1.01489758, + "balance_loss_mlp": 1.0334698, + "epoch": 0.7363294754246205, + "flos": 13111381336320.0, + "grad_norm": 2.1784937379902787, + "language_loss": 0.73557955, + "learning_rate": 6.858957833101266e-07, + "loss": 0.75680184, + "num_input_tokens_seen": 264227430, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 12247, + "time_per_iteration": 2.4587297439575195 + }, + { + "auxiliary_loss_clip": 0.01101638, + "auxiliary_loss_mlp": 0.01027969, + "balance_loss_clip": 1.01730013, + "balance_loss_mlp": 1.03827024, + "epoch": 0.7363895986772885, + "flos": 14027426910720.0, + "grad_norm": 1.48643381660021, + "language_loss": 0.7409212, + "learning_rate": 6.856022144234526e-07, + "loss": 0.76221728, + "num_input_tokens_seen": 264245230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 12248, + "time_per_iteration": 2.4140796661376953 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01034256, + "balance_loss_clip": 1.02237701, + "balance_loss_mlp": 1.03480268, + "epoch": 0.7364497219299564, + "flos": 19719986227200.0, + "grad_norm": 4.381127457761843, + "language_loss": 0.72677851, + "learning_rate": 6.853086953788727e-07, + "loss": 0.74813205, + "num_input_tokens_seen": 264263945, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12249, + "time_per_iteration": 2.4724795818328857 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01030157, + "balance_loss_clip": 1.01859331, + "balance_loss_mlp": 1.03676438, + "epoch": 0.7365098451826244, + "flos": 21361391049600.0, + "grad_norm": 1.708422030858321, + "language_loss": 0.77026933, + "learning_rate": 6.850152261875189e-07, + "loss": 0.79160416, + "num_input_tokens_seen": 264281500, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12250, + "time_per_iteration": 2.4324309825897217 + }, + { + "auxiliary_loss_clip": 0.01102594, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.01680052, + "balance_loss_mlp": 1.0353688, + "epoch": 0.7365699684352923, + "flos": 23368222886400.0, + "grad_norm": 2.1441444373175687, + "language_loss": 0.71412712, + "learning_rate": 6.8472180686052e-07, + "loss": 0.7354399, + "num_input_tokens_seen": 264301625, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12251, + "time_per_iteration": 2.4759652614593506 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01828933, + "balance_loss_mlp": 1.03470254, + "epoch": 0.7366300916879603, + "flos": 59524879927680.0, + "grad_norm": 1.4418314268019194, + "language_loss": 0.65489835, + "learning_rate": 6.844284374090015e-07, + "loss": 0.67619503, + "num_input_tokens_seen": 264323975, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12252, + "time_per_iteration": 2.8028664588928223 + }, + { + "auxiliary_loss_clip": 0.01105105, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01736534, + "balance_loss_mlp": 1.03739333, + "epoch": 0.7366902149406283, + "flos": 20923137210240.0, + "grad_norm": 1.657771200645772, + "language_loss": 0.79182792, + "learning_rate": 6.841351178440884e-07, + "loss": 0.8131668, + "num_input_tokens_seen": 264343785, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12253, + "time_per_iteration": 2.472512722015381 + }, + { + "auxiliary_loss_clip": 0.01096622, + "auxiliary_loss_mlp": 0.010276, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 1.03384531, + "epoch": 0.7367503381932963, + "flos": 17348158339200.0, + "grad_norm": 2.145672565702914, + "language_loss": 0.75874883, + "learning_rate": 6.83841848176905e-07, + "loss": 0.77999103, + "num_input_tokens_seen": 264361130, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 12254, + "time_per_iteration": 2.419156074523926 + }, + { + "auxiliary_loss_clip": 0.01101466, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.02074361, + "balance_loss_mlp": 1.03581631, + "epoch": 0.7368104614459642, + "flos": 17821317219840.0, + "grad_norm": 2.333279522964119, + "language_loss": 0.68892902, + "learning_rate": 6.835486284185692e-07, + "loss": 0.71026909, + "num_input_tokens_seen": 264376965, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12255, + "time_per_iteration": 2.456407308578491 + }, + { + "auxiliary_loss_clip": 0.01101847, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01738834, + "balance_loss_mlp": 1.03577256, + "epoch": 0.7368705846986322, + "flos": 24606099342720.0, + "grad_norm": 2.0115502306535404, + "language_loss": 0.7508868, + "learning_rate": 6.832554585802012e-07, + "loss": 0.77219987, + "num_input_tokens_seen": 264396310, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12256, + "time_per_iteration": 2.4806578159332275 + }, + { + "auxiliary_loss_clip": 0.01103736, + "auxiliary_loss_mlp": 0.01029113, + "balance_loss_clip": 1.01691759, + "balance_loss_mlp": 1.0363915, + "epoch": 0.7369307079513001, + "flos": 34970169968640.0, + "grad_norm": 1.5936534045043864, + "language_loss": 0.73533136, + "learning_rate": 6.829623386729182e-07, + "loss": 0.75665981, + "num_input_tokens_seen": 264418085, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12257, + "time_per_iteration": 2.5967447757720947 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01034318, + "balance_loss_clip": 1.02328479, + "balance_loss_mlp": 1.0344913, + "epoch": 0.7369908312039681, + "flos": 21214588164480.0, + "grad_norm": 1.4666060569830273, + "language_loss": 0.78067857, + "learning_rate": 6.826692687078362e-07, + "loss": 0.80202311, + "num_input_tokens_seen": 264437595, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 12258, + "time_per_iteration": 2.454329252243042 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01029651, + "balance_loss_clip": 1.0180105, + "balance_loss_mlp": 1.03685117, + "epoch": 0.7370509544566362, + "flos": 23623655477760.0, + "grad_norm": 1.3867663760940814, + "language_loss": 0.66167754, + "learning_rate": 6.823762486960674e-07, + "loss": 0.68302274, + "num_input_tokens_seen": 264457385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12259, + "time_per_iteration": 2.517813205718994 + }, + { + "auxiliary_loss_clip": 0.0110186, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.02026582, + "balance_loss_mlp": 1.03576601, + "epoch": 0.7371110777093041, + "flos": 24827704300800.0, + "grad_norm": 1.584231595020614, + "language_loss": 0.73625088, + "learning_rate": 6.820832786487225e-07, + "loss": 0.75759482, + "num_input_tokens_seen": 264477205, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 12260, + "time_per_iteration": 2.5023396015167236 + }, + { + "auxiliary_loss_clip": 0.0110407, + "auxiliary_loss_mlp": 0.01028376, + "balance_loss_clip": 1.01717019, + "balance_loss_mlp": 1.03662717, + "epoch": 0.7371712009619721, + "flos": 23149491016320.0, + "grad_norm": 1.604192195943769, + "language_loss": 0.73533583, + "learning_rate": 6.817903585769125e-07, + "loss": 0.75666034, + "num_input_tokens_seen": 264497195, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 12261, + "time_per_iteration": 3.906297445297241 + }, + { + "auxiliary_loss_clip": 0.01105085, + "auxiliary_loss_mlp": 0.01034539, + "balance_loss_clip": 1.02218294, + "balance_loss_mlp": 1.03563118, + "epoch": 0.73723132421464, + "flos": 23112898035840.0, + "grad_norm": 2.303167962152087, + "language_loss": 0.66901404, + "learning_rate": 6.814974884917438e-07, + "loss": 0.69041032, + "num_input_tokens_seen": 264516950, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12262, + "time_per_iteration": 2.4535868167877197 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01726305, + "balance_loss_mlp": 1.03487778, + "epoch": 0.737291447467308, + "flos": 19273328605440.0, + "grad_norm": 1.8236008971372257, + "language_loss": 0.88766813, + "learning_rate": 6.81204668404322e-07, + "loss": 0.90899056, + "num_input_tokens_seen": 264532675, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12263, + "time_per_iteration": 4.029206037521362 + }, + { + "auxiliary_loss_clip": 0.01096266, + "auxiliary_loss_mlp": 0.01026445, + "balance_loss_clip": 1.01636577, + "balance_loss_mlp": 1.03449428, + "epoch": 0.7373515707199759, + "flos": 25118257415040.0, + "grad_norm": 2.309256872894793, + "language_loss": 0.67259324, + "learning_rate": 6.809118983257522e-07, + "loss": 0.69382036, + "num_input_tokens_seen": 264555635, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6171875, + "step": 12264, + "time_per_iteration": 3.8689637184143066 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.01562762, + "balance_loss_mlp": 1.03491688, + "epoch": 0.737411693972644, + "flos": 32408481767040.0, + "grad_norm": 2.4971579087814066, + "language_loss": 0.80039012, + "learning_rate": 6.806191782671356e-07, + "loss": 0.82164693, + "num_input_tokens_seen": 264573140, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12265, + "time_per_iteration": 4.065499782562256 + }, + { + "auxiliary_loss_clip": 0.01103678, + "auxiliary_loss_mlp": 0.01030021, + "balance_loss_clip": 1.01788533, + "balance_loss_mlp": 1.03421259, + "epoch": 0.7374718172253119, + "flos": 24315797623680.0, + "grad_norm": 1.6219065104687562, + "language_loss": 0.74228191, + "learning_rate": 6.803265082395711e-07, + "loss": 0.76361895, + "num_input_tokens_seen": 264591610, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 12266, + "time_per_iteration": 2.469236373901367 + }, + { + "auxiliary_loss_clip": 0.01103845, + "auxiliary_loss_mlp": 0.01034958, + "balance_loss_clip": 1.02281022, + "balance_loss_mlp": 1.03720498, + "epoch": 0.7375319404779799, + "flos": 27156115624320.0, + "grad_norm": 1.5661834210732133, + "language_loss": 0.73517638, + "learning_rate": 6.800338882541576e-07, + "loss": 0.75656438, + "num_input_tokens_seen": 264611170, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12267, + "time_per_iteration": 2.504617214202881 + }, + { + "auxiliary_loss_clip": 0.01100734, + "auxiliary_loss_mlp": 0.01032173, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.03550386, + "epoch": 0.7375920637306478, + "flos": 18879999701760.0, + "grad_norm": 1.9413990473639766, + "language_loss": 0.82913959, + "learning_rate": 6.797413183219923e-07, + "loss": 0.85046864, + "num_input_tokens_seen": 264629365, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 12268, + "time_per_iteration": 2.4835684299468994 + }, + { + "auxiliary_loss_clip": 0.01098968, + "auxiliary_loss_mlp": 0.01036468, + "balance_loss_clip": 1.02494073, + "balance_loss_mlp": 1.034657, + "epoch": 0.7376521869833158, + "flos": 15669765486720.0, + "grad_norm": 1.7133544019503224, + "language_loss": 0.7298789, + "learning_rate": 6.794487984541677e-07, + "loss": 0.75123322, + "num_input_tokens_seen": 264647915, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 12269, + "time_per_iteration": 2.467454195022583 + }, + { + "auxiliary_loss_clip": 0.01104784, + "auxiliary_loss_mlp": 0.01032211, + "balance_loss_clip": 1.01989651, + "balance_loss_mlp": 1.03631639, + "epoch": 0.7377123102359837, + "flos": 36971973901440.0, + "grad_norm": 2.1055066962392095, + "language_loss": 0.69917566, + "learning_rate": 6.791563286617776e-07, + "loss": 0.72054565, + "num_input_tokens_seen": 264669620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 12270, + "time_per_iteration": 2.5774502754211426 + }, + { + "auxiliary_loss_clip": 0.0110007, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.01778626, + "balance_loss_mlp": 1.03567266, + "epoch": 0.7377724334886517, + "flos": 24496284487680.0, + "grad_norm": 1.7971813672192163, + "language_loss": 0.69534814, + "learning_rate": 6.788639089559119e-07, + "loss": 0.71663284, + "num_input_tokens_seen": 264689345, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 12271, + "time_per_iteration": 2.6254172325134277 + }, + { + "auxiliary_loss_clip": 0.01102484, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.01770949, + "balance_loss_mlp": 1.03490114, + "epoch": 0.7378325567413198, + "flos": 24390025079040.0, + "grad_norm": 1.9993430148747984, + "language_loss": 0.68443513, + "learning_rate": 6.785715393476586e-07, + "loss": 0.70575643, + "num_input_tokens_seen": 264707625, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12272, + "time_per_iteration": 2.514380693435669 + }, + { + "auxiliary_loss_clip": 0.0109964, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01809931, + "balance_loss_mlp": 1.03528929, + "epoch": 0.7378926799939877, + "flos": 17416388223360.0, + "grad_norm": 1.683058960031114, + "language_loss": 0.77877617, + "learning_rate": 6.782792198481049e-07, + "loss": 0.80006814, + "num_input_tokens_seen": 264725575, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 12273, + "time_per_iteration": 2.4802489280700684 + }, + { + "auxiliary_loss_clip": 0.01098973, + "auxiliary_loss_mlp": 0.01031366, + "balance_loss_clip": 1.01958835, + "balance_loss_mlp": 1.03365088, + "epoch": 0.7379528032466557, + "flos": 18474208778880.0, + "grad_norm": 1.8227934716103082, + "language_loss": 0.83283198, + "learning_rate": 6.779869504683355e-07, + "loss": 0.85413539, + "num_input_tokens_seen": 264742855, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 12274, + "time_per_iteration": 2.4196221828460693 + }, + { + "auxiliary_loss_clip": 0.01106787, + "auxiliary_loss_mlp": 0.01026617, + "balance_loss_clip": 1.01393938, + "balance_loss_mlp": 1.03611016, + "epoch": 0.7380129264993236, + "flos": 17821999578240.0, + "grad_norm": 1.788699432283416, + "language_loss": 0.7346586, + "learning_rate": 6.776947312194341e-07, + "loss": 0.75599259, + "num_input_tokens_seen": 264761155, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 12275, + "time_per_iteration": 2.4947471618652344 + }, + { + "auxiliary_loss_clip": 0.01106269, + "auxiliary_loss_mlp": 0.01039319, + "balance_loss_clip": 1.02698684, + "balance_loss_mlp": 1.03702235, + "epoch": 0.7380730497519916, + "flos": 22997372918400.0, + "grad_norm": 1.805676108210034, + "language_loss": 0.73670596, + "learning_rate": 6.774025621124813e-07, + "loss": 0.75816184, + "num_input_tokens_seen": 264780660, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12276, + "time_per_iteration": 2.460041046142578 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01028638, + "balance_loss_clip": 1.01706874, + "balance_loss_mlp": 1.03511322, + "epoch": 0.7381331730046595, + "flos": 20266259241600.0, + "grad_norm": 2.2438661310985544, + "language_loss": 0.77184784, + "learning_rate": 6.771104431585551e-07, + "loss": 0.79316336, + "num_input_tokens_seen": 264798850, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 12277, + "time_per_iteration": 2.5230605602264404 + }, + { + "auxiliary_loss_clip": 0.01101926, + "auxiliary_loss_mlp": 0.0103486, + "balance_loss_clip": 1.02326107, + "balance_loss_mlp": 1.03710866, + "epoch": 0.7381932962573275, + "flos": 19754532132480.0, + "grad_norm": 1.8274458620386211, + "language_loss": 0.78436172, + "learning_rate": 6.768183743687338e-07, + "loss": 0.80572963, + "num_input_tokens_seen": 264816795, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 12278, + "time_per_iteration": 2.4361507892608643 + }, + { + "auxiliary_loss_clip": 0.0110302, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.01922441, + "balance_loss_mlp": 1.03554451, + "epoch": 0.7382534195099955, + "flos": 17305316392320.0, + "grad_norm": 2.0940191805387722, + "language_loss": 0.72178644, + "learning_rate": 6.765263557540921e-07, + "loss": 0.74312687, + "num_input_tokens_seen": 264834105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 12279, + "time_per_iteration": 2.454338312149048 + }, + { + "auxiliary_loss_clip": 0.01102728, + "auxiliary_loss_mlp": 0.01033973, + "balance_loss_clip": 1.02146792, + "balance_loss_mlp": 1.03468275, + "epoch": 0.7383135427626635, + "flos": 18697358021760.0, + "grad_norm": 2.207094607312378, + "language_loss": 0.85757834, + "learning_rate": 6.762343873257034e-07, + "loss": 0.87894535, + "num_input_tokens_seen": 264850895, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12280, + "time_per_iteration": 2.4340832233428955 + }, + { + "auxiliary_loss_clip": 0.01103222, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.016675, + "balance_loss_mlp": 1.03586102, + "epoch": 0.7383736660153314, + "flos": 20881300844160.0, + "grad_norm": 2.186067036515089, + "language_loss": 0.72367251, + "learning_rate": 6.759424690946408e-07, + "loss": 0.74499011, + "num_input_tokens_seen": 264869505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12281, + "time_per_iteration": 2.4844117164611816 + }, + { + "auxiliary_loss_clip": 0.01102088, + "auxiliary_loss_mlp": 0.01033571, + "balance_loss_clip": 1.02173972, + "balance_loss_mlp": 1.03446507, + "epoch": 0.7384337892679994, + "flos": 20663215418880.0, + "grad_norm": 1.9159466937607454, + "language_loss": 0.6074115, + "learning_rate": 6.756506010719711e-07, + "loss": 0.62876809, + "num_input_tokens_seen": 264886915, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 12282, + "time_per_iteration": 2.4337880611419678 + }, + { + "auxiliary_loss_clip": 0.01104133, + "auxiliary_loss_mlp": 0.01031061, + "balance_loss_clip": 1.01902103, + "balance_loss_mlp": 1.03598022, + "epoch": 0.7384939125206673, + "flos": 29169627390720.0, + "grad_norm": 2.224847577186844, + "language_loss": 0.67914271, + "learning_rate": 6.753587832687632e-07, + "loss": 0.70049471, + "num_input_tokens_seen": 264910350, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12283, + "time_per_iteration": 2.5443530082702637 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01036343, + "balance_loss_clip": 1.02461326, + "balance_loss_mlp": 1.03717303, + "epoch": 0.7385540357733353, + "flos": 36312833376000.0, + "grad_norm": 1.587417277679554, + "language_loss": 0.76002008, + "learning_rate": 6.750670156960832e-07, + "loss": 0.78141761, + "num_input_tokens_seen": 264930705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12284, + "time_per_iteration": 2.561150312423706 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.01742673, + "balance_loss_mlp": 1.03535485, + "epoch": 0.7386141590260034, + "flos": 20302600826880.0, + "grad_norm": 1.8705632629894415, + "language_loss": 0.69351077, + "learning_rate": 6.747752983649954e-07, + "loss": 0.71484059, + "num_input_tokens_seen": 264946975, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 12285, + "time_per_iteration": 2.5044779777526855 + }, + { + "auxiliary_loss_clip": 0.01105253, + "auxiliary_loss_mlp": 0.01030637, + "balance_loss_clip": 1.01810813, + "balance_loss_mlp": 1.03483808, + "epoch": 0.7386742822786713, + "flos": 25483792170240.0, + "grad_norm": 2.818148859522571, + "language_loss": 0.79595774, + "learning_rate": 6.744836312865602e-07, + "loss": 0.81731659, + "num_input_tokens_seen": 264967665, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 12286, + "time_per_iteration": 2.462742328643799 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01026401, + "balance_loss_clip": 1.01436138, + "balance_loss_mlp": 1.03468239, + "epoch": 0.7387344055313393, + "flos": 13771958405760.0, + "grad_norm": 2.0998689615756616, + "language_loss": 0.65484864, + "learning_rate": 6.741920144718396e-07, + "loss": 0.67611259, + "num_input_tokens_seen": 264985480, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 12287, + "time_per_iteration": 2.5399184226989746 + }, + { + "auxiliary_loss_clip": 0.01097159, + "auxiliary_loss_mlp": 0.01026905, + "balance_loss_clip": 1.01564598, + "balance_loss_mlp": 1.03362429, + "epoch": 0.7387945287840072, + "flos": 27855189095040.0, + "grad_norm": 1.862112231817168, + "language_loss": 0.76542664, + "learning_rate": 6.739004479318903e-07, + "loss": 0.78666735, + "num_input_tokens_seen": 265004790, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 12288, + "time_per_iteration": 2.483729362487793 + }, + { + "auxiliary_loss_clip": 0.01107844, + "auxiliary_loss_mlp": 0.01031728, + "balance_loss_clip": 1.0192709, + "balance_loss_mlp": 1.03781092, + "epoch": 0.7388546520366752, + "flos": 44233039388160.0, + "grad_norm": 1.6167864576536901, + "language_loss": 0.58242345, + "learning_rate": 6.736089316777684e-07, + "loss": 0.60381913, + "num_input_tokens_seen": 265028790, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 12289, + "time_per_iteration": 2.653754472732544 + }, + { + "auxiliary_loss_clip": 0.01027818, + "auxiliary_loss_mlp": 0.010066, + "balance_loss_clip": 1.00555122, + "balance_loss_mlp": 1.00722313, + "epoch": 0.7389147752893431, + "flos": 70680890638080.0, + "grad_norm": 0.657884434351233, + "language_loss": 0.49320006, + "learning_rate": 6.733174657205287e-07, + "loss": 0.5135442, + "num_input_tokens_seen": 265096660, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20605469, + "step": 12290, + "time_per_iteration": 3.161417007446289 + }, + { + "auxiliary_loss_clip": 0.01104102, + "auxiliary_loss_mlp": 0.01030184, + "balance_loss_clip": 1.01758409, + "balance_loss_mlp": 1.03600287, + "epoch": 0.7389748985420111, + "flos": 25994980575360.0, + "grad_norm": 1.8618109210971494, + "language_loss": 0.66936404, + "learning_rate": 6.730260500712237e-07, + "loss": 0.69070697, + "num_input_tokens_seen": 265116375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12291, + "time_per_iteration": 2.504171371459961 + }, + { + "auxiliary_loss_clip": 0.01026631, + "auxiliary_loss_mlp": 0.01008045, + "balance_loss_clip": 1.00700212, + "balance_loss_mlp": 1.00617576, + "epoch": 0.7390350217946791, + "flos": 54403661318400.0, + "grad_norm": 0.9921278078436683, + "language_loss": 0.60870874, + "learning_rate": 6.727346847409052e-07, + "loss": 0.6290555, + "num_input_tokens_seen": 265161230, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12292, + "time_per_iteration": 2.740140676498413 + }, + { + "auxiliary_loss_clip": 0.0110263, + "auxiliary_loss_mlp": 0.01030054, + "balance_loss_clip": 1.0190208, + "balance_loss_mlp": 1.03666413, + "epoch": 0.7390951450473471, + "flos": 32196968530560.0, + "grad_norm": 2.0283775750990447, + "language_loss": 0.67287552, + "learning_rate": 6.724433697406191e-07, + "loss": 0.6942023, + "num_input_tokens_seen": 265182515, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 12293, + "time_per_iteration": 2.5637433528900146 + }, + { + "auxiliary_loss_clip": 0.01101914, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01897979, + "balance_loss_mlp": 1.03533363, + "epoch": 0.739155268300015, + "flos": 16684241304960.0, + "grad_norm": 1.7680717845070275, + "language_loss": 0.83443105, + "learning_rate": 6.721521050814134e-07, + "loss": 0.85575891, + "num_input_tokens_seen": 265198160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12294, + "time_per_iteration": 2.452796697616577 + }, + { + "auxiliary_loss_clip": 0.0109896, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01865005, + "balance_loss_mlp": 1.03435683, + "epoch": 0.739215391552683, + "flos": 31649761762560.0, + "grad_norm": 1.704234892939925, + "language_loss": 0.72765625, + "learning_rate": 6.718608907743337e-07, + "loss": 0.74895406, + "num_input_tokens_seen": 265218480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 12295, + "time_per_iteration": 2.532444953918457 + }, + { + "auxiliary_loss_clip": 0.01099527, + "auxiliary_loss_mlp": 0.01036709, + "balance_loss_clip": 1.0250864, + "balance_loss_mlp": 1.03585625, + "epoch": 0.7392755148053509, + "flos": 29718522097920.0, + "grad_norm": 1.6789172360591735, + "language_loss": 0.78772449, + "learning_rate": 6.715697268304215e-07, + "loss": 0.8090868, + "num_input_tokens_seen": 265240165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 12296, + "time_per_iteration": 2.5699706077575684 + }, + { + "auxiliary_loss_clip": 0.01102686, + "auxiliary_loss_mlp": 0.01031782, + "balance_loss_clip": 1.01921093, + "balance_loss_mlp": 1.03617287, + "epoch": 0.7393356380580189, + "flos": 37050475075200.0, + "grad_norm": 1.8636543361899776, + "language_loss": 0.66520232, + "learning_rate": 6.712786132607182e-07, + "loss": 0.68654692, + "num_input_tokens_seen": 265263295, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 12297, + "time_per_iteration": 2.5840320587158203 + }, + { + "auxiliary_loss_clip": 0.01103197, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.0230639, + "balance_loss_mlp": 1.03605783, + "epoch": 0.739395761310687, + "flos": 19719627091200.0, + "grad_norm": 2.2038505631105054, + "language_loss": 0.68769479, + "learning_rate": 6.709875500762645e-07, + "loss": 0.70908344, + "num_input_tokens_seen": 265282740, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 12298, + "time_per_iteration": 2.4649643898010254 + }, + { + "auxiliary_loss_clip": 0.01102459, + "auxiliary_loss_mlp": 0.01029321, + "balance_loss_clip": 1.01767373, + "balance_loss_mlp": 1.0349468, + "epoch": 0.7394558845633549, + "flos": 11801504067840.0, + "grad_norm": 1.7869505814548332, + "language_loss": 0.74577737, + "learning_rate": 6.706965372880946e-07, + "loss": 0.76709521, + "num_input_tokens_seen": 265300175, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12299, + "time_per_iteration": 2.4275574684143066 + }, + { + "auxiliary_loss_clip": 0.01025983, + "auxiliary_loss_mlp": 0.01002146, + "balance_loss_clip": 1.00116849, + "balance_loss_mlp": 1.00569797, + "epoch": 0.7395160078160229, + "flos": 66195827850240.0, + "grad_norm": 0.7180686194551699, + "language_loss": 0.60861343, + "learning_rate": 6.704055749072455e-07, + "loss": 0.62889469, + "num_input_tokens_seen": 265363275, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 12300, + "time_per_iteration": 3.1263675689697266 + }, + { + "auxiliary_loss_clip": 0.01102982, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.01645398, + "balance_loss_mlp": 1.03720665, + "epoch": 0.7395761310686908, + "flos": 21249708687360.0, + "grad_norm": 1.4253075505979764, + "language_loss": 0.80278659, + "learning_rate": 6.7011466294475e-07, + "loss": 0.82410145, + "num_input_tokens_seen": 265382935, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 12301, + "time_per_iteration": 2.46708345413208 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.01028329, + "balance_loss_clip": 1.01725399, + "balance_loss_mlp": 1.0343194, + "epoch": 0.7396362543213588, + "flos": 25955299025280.0, + "grad_norm": 1.5951843205733178, + "language_loss": 0.73313689, + "learning_rate": 6.698238014116406e-07, + "loss": 0.75442266, + "num_input_tokens_seen": 265403245, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 12302, + "time_per_iteration": 2.532886028289795 + }, + { + "auxiliary_loss_clip": 0.01102922, + "auxiliary_loss_mlp": 0.01036251, + "balance_loss_clip": 1.02475905, + "balance_loss_mlp": 1.03542805, + "epoch": 0.7396963775740267, + "flos": 27377936064000.0, + "grad_norm": 1.7925873497266347, + "language_loss": 0.7409184, + "learning_rate": 6.695329903189451e-07, + "loss": 0.76231015, + "num_input_tokens_seen": 265423105, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 12303, + "time_per_iteration": 3.842045545578003 + }, + { + "auxiliary_loss_clip": 0.01098981, + "auxiliary_loss_mlp": 0.01026474, + "balance_loss_clip": 1.01557863, + "balance_loss_mlp": 1.03380299, + "epoch": 0.7397565008266948, + "flos": 25520133755520.0, + "grad_norm": 1.7395112572263238, + "language_loss": 0.54232901, + "learning_rate": 6.692422296776927e-07, + "loss": 0.56358361, + "num_input_tokens_seen": 265443445, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12304, + "time_per_iteration": 2.5310745239257812 + }, + { + "auxiliary_loss_clip": 0.01102065, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01740789, + "balance_loss_mlp": 1.03500128, + "epoch": 0.7398166240793627, + "flos": 23727760070400.0, + "grad_norm": 1.9555871557250795, + "language_loss": 0.841694, + "learning_rate": 6.689515194989084e-07, + "loss": 0.86300987, + "num_input_tokens_seen": 265462085, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12305, + "time_per_iteration": 3.97141170501709 + }, + { + "auxiliary_loss_clip": 0.01025514, + "auxiliary_loss_mlp": 0.00998213, + "balance_loss_clip": 0.99718779, + "balance_loss_mlp": 1.0049659, + "epoch": 0.7398767473320307, + "flos": 67267582882560.0, + "grad_norm": 0.8695449825144963, + "language_loss": 0.57674229, + "learning_rate": 6.68660859793615e-07, + "loss": 0.59697956, + "num_input_tokens_seen": 265521190, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20507812, + "step": 12306, + "time_per_iteration": 4.480564117431641 + }, + { + "auxiliary_loss_clip": 0.01105578, + "auxiliary_loss_mlp": 0.01031897, + "balance_loss_clip": 1.01955891, + "balance_loss_mlp": 1.03752124, + "epoch": 0.7399368705846986, + "flos": 22018699981440.0, + "grad_norm": 1.94634660943293, + "language_loss": 0.81800246, + "learning_rate": 6.683702505728355e-07, + "loss": 0.83937716, + "num_input_tokens_seen": 265539705, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12307, + "time_per_iteration": 3.964345932006836 + }, + { + "auxiliary_loss_clip": 0.01099571, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.01743591, + "balance_loss_mlp": 1.03615248, + "epoch": 0.7399969938373666, + "flos": 14173870659840.0, + "grad_norm": 1.7625756479889783, + "language_loss": 0.69852555, + "learning_rate": 6.680796918475893e-07, + "loss": 0.71980846, + "num_input_tokens_seen": 265555855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 12308, + "time_per_iteration": 2.426374912261963 + }, + { + "auxiliary_loss_clip": 0.01097458, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01394367, + "balance_loss_mlp": 1.03327668, + "epoch": 0.7400571170900345, + "flos": 25301473712640.0, + "grad_norm": 1.8311869299558743, + "language_loss": 0.81359291, + "learning_rate": 6.67789183628896e-07, + "loss": 0.83482039, + "num_input_tokens_seen": 265575455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12309, + "time_per_iteration": 2.47933292388916 + }, + { + "auxiliary_loss_clip": 0.01102906, + "auxiliary_loss_mlp": 0.01033716, + "balance_loss_clip": 1.02118754, + "balance_loss_mlp": 1.03444481, + "epoch": 0.7401172403427025, + "flos": 22711344917760.0, + "grad_norm": 1.7272186323130432, + "language_loss": 0.72933966, + "learning_rate": 6.674987259277692e-07, + "loss": 0.7507059, + "num_input_tokens_seen": 265595250, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 12310, + "time_per_iteration": 2.458360195159912 + }, + { + "auxiliary_loss_clip": 0.01105362, + "auxiliary_loss_mlp": 0.01037153, + "balance_loss_clip": 1.02455902, + "balance_loss_mlp": 1.03706884, + "epoch": 0.7401773635953706, + "flos": 18067448188800.0, + "grad_norm": 2.8138497569314165, + "language_loss": 0.8816393, + "learning_rate": 6.672083187552239e-07, + "loss": 0.90306449, + "num_input_tokens_seen": 265606945, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 12311, + "time_per_iteration": 2.4193923473358154 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01026797, + "balance_loss_clip": 1.01557398, + "balance_loss_mlp": 1.0338285, + "epoch": 0.7402374868480385, + "flos": 22712135016960.0, + "grad_norm": 1.5281974655269193, + "language_loss": 0.80203426, + "learning_rate": 6.669179621222738e-07, + "loss": 0.82330477, + "num_input_tokens_seen": 265626115, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 12312, + "time_per_iteration": 2.5062949657440186 + }, + { + "auxiliary_loss_clip": 0.01100667, + "auxiliary_loss_mlp": 0.01026723, + "balance_loss_clip": 1.0153985, + "balance_loss_mlp": 1.03547597, + "epoch": 0.7402976101007065, + "flos": 22856675345280.0, + "grad_norm": 2.0496860461073676, + "language_loss": 0.7839551, + "learning_rate": 6.666276560399273e-07, + "loss": 0.80522901, + "num_input_tokens_seen": 265646520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 12313, + "time_per_iteration": 2.4662246704101562 + }, + { + "auxiliary_loss_clip": 0.01102693, + "auxiliary_loss_mlp": 0.01036243, + "balance_loss_clip": 1.02358902, + "balance_loss_mlp": 1.03396571, + "epoch": 0.7403577333533744, + "flos": 12345801834240.0, + "grad_norm": 2.00903442682859, + "language_loss": 0.78872943, + "learning_rate": 6.663374005191937e-07, + "loss": 0.81011879, + "num_input_tokens_seen": 265661875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12314, + "time_per_iteration": 2.446385622024536 + }, + { + "auxiliary_loss_clip": 0.01024604, + "auxiliary_loss_mlp": 0.01003964, + "balance_loss_clip": 1.00296831, + "balance_loss_mlp": 1.00410616, + "epoch": 0.7404178566060424, + "flos": 60327270869760.0, + "grad_norm": 0.8412651667201435, + "language_loss": 0.55169189, + "learning_rate": 6.660471955710809e-07, + "loss": 0.57197762, + "num_input_tokens_seen": 265721255, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20507812, + "step": 12315, + "time_per_iteration": 3.0314457416534424 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01031672, + "balance_loss_clip": 1.02031732, + "balance_loss_mlp": 1.03454709, + "epoch": 0.7404779798587103, + "flos": 32014650072960.0, + "grad_norm": 1.5280075701489741, + "language_loss": 0.79192966, + "learning_rate": 6.65757041206591e-07, + "loss": 0.81322497, + "num_input_tokens_seen": 265743970, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 12316, + "time_per_iteration": 2.5997025966644287 + }, + { + "auxiliary_loss_clip": 0.0109893, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.01758349, + "balance_loss_mlp": 1.03257847, + "epoch": 0.7405381031113784, + "flos": 12889704551040.0, + "grad_norm": 1.6312870183183517, + "language_loss": 0.74777615, + "learning_rate": 6.654669374367275e-07, + "loss": 0.76906157, + "num_input_tokens_seen": 265760890, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12317, + "time_per_iteration": 2.409041404724121 + }, + { + "auxiliary_loss_clip": 0.01097259, + "auxiliary_loss_mlp": 0.01034106, + "balance_loss_clip": 1.02258456, + "balance_loss_mlp": 1.03415799, + "epoch": 0.7405982263640463, + "flos": 20229127557120.0, + "grad_norm": 1.5381739579945533, + "language_loss": 0.81140697, + "learning_rate": 6.651768842724917e-07, + "loss": 0.83272064, + "num_input_tokens_seen": 265779600, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.62890625, + "step": 12318, + "time_per_iteration": 2.483341932296753 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01801968, + "balance_loss_mlp": 1.03532875, + "epoch": 0.7406583496167143, + "flos": 17567213431680.0, + "grad_norm": 2.10976565284071, + "language_loss": 0.76717627, + "learning_rate": 6.648868817248827e-07, + "loss": 0.78850329, + "num_input_tokens_seen": 265797030, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12319, + "time_per_iteration": 2.4090797901153564 + }, + { + "auxiliary_loss_clip": 0.01100157, + "auxiliary_loss_mlp": 0.01031224, + "balance_loss_clip": 1.02052474, + "balance_loss_mlp": 1.03510928, + "epoch": 0.7407184728693822, + "flos": 18295733076480.0, + "grad_norm": 2.728045021553726, + "language_loss": 0.64247096, + "learning_rate": 6.64596929804897e-07, + "loss": 0.6637848, + "num_input_tokens_seen": 265815055, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 12320, + "time_per_iteration": 2.4777369499206543 + }, + { + "auxiliary_loss_clip": 0.01104796, + "auxiliary_loss_mlp": 0.01034808, + "balance_loss_clip": 1.02257681, + "balance_loss_mlp": 1.03554249, + "epoch": 0.7407785961220502, + "flos": 16690562098560.0, + "grad_norm": 2.5603662317591307, + "language_loss": 0.83399361, + "learning_rate": 6.643070285235288e-07, + "loss": 0.8553896, + "num_input_tokens_seen": 265828480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12321, + "time_per_iteration": 2.5069942474365234 + }, + { + "auxiliary_loss_clip": 0.01106734, + "auxiliary_loss_mlp": 0.01043827, + "balance_loss_clip": 1.03052354, + "balance_loss_mlp": 1.03583789, + "epoch": 0.7408387193747181, + "flos": 22088330496000.0, + "grad_norm": 1.897257666550991, + "language_loss": 0.71964365, + "learning_rate": 6.640171778917727e-07, + "loss": 0.74114925, + "num_input_tokens_seen": 265845825, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.7109375, + "step": 12322, + "time_per_iteration": 2.4930129051208496 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.0103281, + "balance_loss_clip": 1.02137196, + "balance_loss_mlp": 1.03622496, + "epoch": 0.7408988426273861, + "flos": 24236721832320.0, + "grad_norm": 1.870315243792337, + "language_loss": 0.64078039, + "learning_rate": 6.637273779206183e-07, + "loss": 0.66212809, + "num_input_tokens_seen": 265866335, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12323, + "time_per_iteration": 2.4777188301086426 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.0141499, + "balance_loss_mlp": 1.03480208, + "epoch": 0.7409589658800542, + "flos": 29023004073600.0, + "grad_norm": 1.4950637015537451, + "language_loss": 0.75935167, + "learning_rate": 6.634376286210559e-07, + "loss": 0.78064305, + "num_input_tokens_seen": 265888945, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 12324, + "time_per_iteration": 2.5902748107910156 + }, + { + "auxiliary_loss_clip": 0.01099826, + "auxiliary_loss_mlp": 0.01023896, + "balance_loss_clip": 1.01248217, + "balance_loss_mlp": 1.0326978, + "epoch": 0.7410190891327221, + "flos": 19351362902400.0, + "grad_norm": 1.7779845069008868, + "language_loss": 0.74595994, + "learning_rate": 6.63147930004073e-07, + "loss": 0.76719713, + "num_input_tokens_seen": 265908030, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 12325, + "time_per_iteration": 2.428908586502075 + }, + { + "auxiliary_loss_clip": 0.01105539, + "auxiliary_loss_mlp": 0.01032307, + "balance_loss_clip": 1.01949763, + "balance_loss_mlp": 1.03505337, + "epoch": 0.7410792123853901, + "flos": 22747650589440.0, + "grad_norm": 1.8169030049946526, + "language_loss": 0.68363488, + "learning_rate": 6.628582820806545e-07, + "loss": 0.70501333, + "num_input_tokens_seen": 265927030, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.703125, + "step": 12326, + "time_per_iteration": 2.4834694862365723 + }, + { + "auxiliary_loss_clip": 0.01101938, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.0159893, + "balance_loss_mlp": 1.03513253, + "epoch": 0.741139335638058, + "flos": 25372433030400.0, + "grad_norm": 2.058459084269704, + "language_loss": 0.89730138, + "learning_rate": 6.625686848617835e-07, + "loss": 0.91859686, + "num_input_tokens_seen": 265945490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 12327, + "time_per_iteration": 2.4705865383148193 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.01925874, + "balance_loss_mlp": 1.03504896, + "epoch": 0.741199458890726, + "flos": 18585639745920.0, + "grad_norm": 1.6496511439188377, + "language_loss": 0.85582221, + "learning_rate": 6.62279138358442e-07, + "loss": 0.87715065, + "num_input_tokens_seen": 265963265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12328, + "time_per_iteration": 2.440108060836792 + }, + { + "auxiliary_loss_clip": 0.0109826, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01708829, + "balance_loss_mlp": 1.03355885, + "epoch": 0.7412595821433939, + "flos": 22127078292480.0, + "grad_norm": 1.676741332984265, + "language_loss": 0.66687691, + "learning_rate": 6.619896425816103e-07, + "loss": 0.68815577, + "num_input_tokens_seen": 265982270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 12329, + "time_per_iteration": 2.433601140975952 + }, + { + "auxiliary_loss_clip": 0.01105654, + "auxiliary_loss_mlp": 0.01029158, + "balance_loss_clip": 1.01754093, + "balance_loss_mlp": 1.03583872, + "epoch": 0.741319705396062, + "flos": 29169699217920.0, + "grad_norm": 1.8984380479185268, + "language_loss": 0.66488492, + "learning_rate": 6.617001975422647e-07, + "loss": 0.68623304, + "num_input_tokens_seen": 266003835, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.69921875, + "step": 12330, + "time_per_iteration": 2.5116231441497803 + }, + { + "auxiliary_loss_clip": 0.01108565, + "auxiliary_loss_mlp": 0.0103297, + "balance_loss_clip": 1.01889706, + "balance_loss_mlp": 1.03731847, + "epoch": 0.7413798286487299, + "flos": 20667489137280.0, + "grad_norm": 1.9345159720147296, + "language_loss": 0.85613048, + "learning_rate": 6.614108032513823e-07, + "loss": 0.87754583, + "num_input_tokens_seen": 266021595, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12331, + "time_per_iteration": 2.4270429611206055 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01030968, + "balance_loss_clip": 1.01889229, + "balance_loss_mlp": 1.03435421, + "epoch": 0.7414399519013979, + "flos": 16398895662720.0, + "grad_norm": 1.9091499126857316, + "language_loss": 0.69466591, + "learning_rate": 6.611214597199364e-07, + "loss": 0.7160027, + "num_input_tokens_seen": 266039860, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12332, + "time_per_iteration": 2.422391176223755 + }, + { + "auxiliary_loss_clip": 0.01102435, + "auxiliary_loss_mlp": 0.01035735, + "balance_loss_clip": 1.02290845, + "balance_loss_mlp": 1.03556943, + "epoch": 0.7415000751540658, + "flos": 25630235919360.0, + "grad_norm": 2.2157206056702097, + "language_loss": 0.63370979, + "learning_rate": 6.608321669588984e-07, + "loss": 0.65509146, + "num_input_tokens_seen": 266058050, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 12333, + "time_per_iteration": 2.505436420440674 + }, + { + "auxiliary_loss_clip": 0.01099765, + "auxiliary_loss_mlp": 0.01033254, + "balance_loss_clip": 1.02141094, + "balance_loss_mlp": 1.03644109, + "epoch": 0.7415601984067338, + "flos": 24499732193280.0, + "grad_norm": 1.6374577716994534, + "language_loss": 0.71271133, + "learning_rate": 6.605429249792387e-07, + "loss": 0.73404145, + "num_input_tokens_seen": 266078060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 12334, + "time_per_iteration": 2.5002856254577637 + }, + { + "auxiliary_loss_clip": 0.01101856, + "auxiliary_loss_mlp": 0.01027153, + "balance_loss_clip": 1.01598334, + "balance_loss_mlp": 1.03537202, + "epoch": 0.7416203216594017, + "flos": 20887154760960.0, + "grad_norm": 1.9057001714532567, + "language_loss": 0.82662481, + "learning_rate": 6.602537337919257e-07, + "loss": 0.84791493, + "num_input_tokens_seen": 266097110, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 12335, + "time_per_iteration": 2.420285701751709 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01031092, + "balance_loss_clip": 1.01862848, + "balance_loss_mlp": 1.03514791, + "epoch": 0.7416804449120697, + "flos": 15624265933440.0, + "grad_norm": 2.6318734852412082, + "language_loss": 0.74709713, + "learning_rate": 6.599645934079259e-07, + "loss": 0.76842761, + "num_input_tokens_seen": 266110870, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12336, + "time_per_iteration": 2.395914077758789 + }, + { + "auxiliary_loss_clip": 0.01105209, + "auxiliary_loss_mlp": 0.0102888, + "balance_loss_clip": 1.01698947, + "balance_loss_mlp": 1.03675711, + "epoch": 0.7417405681647377, + "flos": 17120483982720.0, + "grad_norm": 2.0074082890204803, + "language_loss": 0.73073846, + "learning_rate": 6.596755038382029e-07, + "loss": 0.75207937, + "num_input_tokens_seen": 266127845, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 12337, + "time_per_iteration": 2.4017410278320312 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02384233, + "balance_loss_mlp": 1.0375526, + "epoch": 0.7418006914174057, + "flos": 18880322924160.0, + "grad_norm": 1.582069295944861, + "language_loss": 0.76476055, + "learning_rate": 6.593864650937186e-07, + "loss": 0.78612792, + "num_input_tokens_seen": 266145400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 12338, + "time_per_iteration": 2.469158172607422 + }, + { + "auxiliary_loss_clip": 0.01098771, + "auxiliary_loss_mlp": 0.01027623, + "balance_loss_clip": 1.01728797, + "balance_loss_mlp": 1.03412902, + "epoch": 0.7418608146700737, + "flos": 21580733450880.0, + "grad_norm": 1.7521644726075343, + "language_loss": 0.73067641, + "learning_rate": 6.590974771854345e-07, + "loss": 0.75194031, + "num_input_tokens_seen": 266164430, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 12339, + "time_per_iteration": 2.4999265670776367 + }, + { + "auxiliary_loss_clip": 0.01103048, + "auxiliary_loss_mlp": 0.01027293, + "balance_loss_clip": 1.01544917, + "balance_loss_mlp": 1.03630698, + "epoch": 0.7419209379227416, + "flos": 22340459036160.0, + "grad_norm": 1.733265242117768, + "language_loss": 0.79821277, + "learning_rate": 6.588085401243077e-07, + "loss": 0.81951618, + "num_input_tokens_seen": 266183855, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 12340, + "time_per_iteration": 2.5067059993743896 + }, + { + "auxiliary_loss_clip": 0.0110211, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02174389, + "balance_loss_mlp": 1.0347414, + "epoch": 0.7419810611754096, + "flos": 16762275601920.0, + "grad_norm": 1.853046258672694, + "language_loss": 0.75634474, + "learning_rate": 6.585196539212958e-07, + "loss": 0.77770519, + "num_input_tokens_seen": 266202085, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12341, + "time_per_iteration": 2.419905662536621 + }, + { + "auxiliary_loss_clip": 0.01095271, + "auxiliary_loss_mlp": 0.0103099, + "balance_loss_clip": 1.01980829, + "balance_loss_mlp": 1.03472114, + "epoch": 0.7420411844280775, + "flos": 26212958259840.0, + "grad_norm": 1.6930413865654552, + "language_loss": 0.80139267, + "learning_rate": 6.582308185873535e-07, + "loss": 0.82265526, + "num_input_tokens_seen": 266223445, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.60546875, + "step": 12342, + "time_per_iteration": 2.5155606269836426 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.01028992, + "balance_loss_clip": 1.01748824, + "balance_loss_mlp": 1.03512716, + "epoch": 0.7421013076807456, + "flos": 68529371840640.0, + "grad_norm": 1.6721865826322508, + "language_loss": 0.77694213, + "learning_rate": 6.57942034133433e-07, + "loss": 0.79823846, + "num_input_tokens_seen": 266246575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12343, + "time_per_iteration": 2.8234310150146484 + }, + { + "auxiliary_loss_clip": 0.01097938, + "auxiliary_loss_mlp": 0.01031813, + "balance_loss_clip": 1.02027309, + "balance_loss_mlp": 1.03249693, + "epoch": 0.7421614309334135, + "flos": 24425325169920.0, + "grad_norm": 1.7204142149055508, + "language_loss": 0.67798221, + "learning_rate": 6.576533005704843e-07, + "loss": 0.69927979, + "num_input_tokens_seen": 266266055, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12344, + "time_per_iteration": 3.9860341548919678 + }, + { + "auxiliary_loss_clip": 0.01102936, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0204283, + "balance_loss_mlp": 1.03553951, + "epoch": 0.7422215541860815, + "flos": 12311076360960.0, + "grad_norm": 2.3379030417701423, + "language_loss": 0.81033051, + "learning_rate": 6.573646179094572e-07, + "loss": 0.83169097, + "num_input_tokens_seen": 266282240, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 12345, + "time_per_iteration": 2.457531213760376 + }, + { + "auxiliary_loss_clip": 0.01102706, + "auxiliary_loss_mlp": 0.01036384, + "balance_loss_clip": 1.02450442, + "balance_loss_mlp": 1.0354228, + "epoch": 0.7422816774387494, + "flos": 19645579203840.0, + "grad_norm": 1.9598348009853668, + "language_loss": 0.71018803, + "learning_rate": 6.570759861612988e-07, + "loss": 0.73157895, + "num_input_tokens_seen": 266300980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12346, + "time_per_iteration": 3.8033220767974854 + }, + { + "auxiliary_loss_clip": 0.01102695, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01851344, + "balance_loss_mlp": 1.03597689, + "epoch": 0.7423418006914174, + "flos": 32015978876160.0, + "grad_norm": 1.5893772785658562, + "language_loss": 0.73678845, + "learning_rate": 6.56787405336953e-07, + "loss": 0.75811887, + "num_input_tokens_seen": 266322215, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12347, + "time_per_iteration": 3.922349691390991 + }, + { + "auxiliary_loss_clip": 0.0110556, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.01888108, + "balance_loss_mlp": 1.03616238, + "epoch": 0.7424019239440853, + "flos": 18916951818240.0, + "grad_norm": 1.7507272785973695, + "language_loss": 0.80773383, + "learning_rate": 6.564988754473642e-07, + "loss": 0.82909453, + "num_input_tokens_seen": 266341600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6953125, + "step": 12348, + "time_per_iteration": 3.8946139812469482 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01028617, + "balance_loss_clip": 1.01724422, + "balance_loss_mlp": 1.03434706, + "epoch": 0.7424620471967533, + "flos": 35876518871040.0, + "grad_norm": 1.9451806865791765, + "language_loss": 0.72609961, + "learning_rate": 6.562103965034724e-07, + "loss": 0.74738705, + "num_input_tokens_seen": 266362895, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 12349, + "time_per_iteration": 2.6297786235809326 + }, + { + "auxiliary_loss_clip": 0.01105402, + "auxiliary_loss_mlp": 0.01032044, + "balance_loss_clip": 1.01891899, + "balance_loss_mlp": 1.03512514, + "epoch": 0.7425221704494213, + "flos": 27016603200000.0, + "grad_norm": 1.884291217596135, + "language_loss": 0.78724527, + "learning_rate": 6.559219685162165e-07, + "loss": 0.80861974, + "num_input_tokens_seen": 266384015, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 12350, + "time_per_iteration": 2.500523567199707 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01034078, + "balance_loss_clip": 1.02290213, + "balance_loss_mlp": 1.03446043, + "epoch": 0.7425822937020893, + "flos": 34167135559680.0, + "grad_norm": 1.7431994876148182, + "language_loss": 0.74992573, + "learning_rate": 6.556335914965343e-07, + "loss": 0.7712701, + "num_input_tokens_seen": 266405990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 12351, + "time_per_iteration": 2.570344924926758 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01024897, + "balance_loss_clip": 1.01363814, + "balance_loss_mlp": 1.03487992, + "epoch": 0.7426424169547573, + "flos": 21283572234240.0, + "grad_norm": 1.8775764813546454, + "language_loss": 0.81292212, + "learning_rate": 6.553452654553611e-07, + "loss": 0.83417821, + "num_input_tokens_seen": 266424260, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12352, + "time_per_iteration": 2.4442734718322754 + }, + { + "auxiliary_loss_clip": 0.01103269, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02641034, + "balance_loss_mlp": 1.0369432, + "epoch": 0.7427025402074252, + "flos": 22448442297600.0, + "grad_norm": 1.9024946732776964, + "language_loss": 0.71716195, + "learning_rate": 6.550569904036307e-07, + "loss": 0.73857349, + "num_input_tokens_seen": 266444580, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12353, + "time_per_iteration": 2.4757235050201416 + }, + { + "auxiliary_loss_clip": 0.01102245, + "auxiliary_loss_mlp": 0.0103172, + "balance_loss_clip": 1.02075243, + "balance_loss_mlp": 1.03749537, + "epoch": 0.7427626634600932, + "flos": 22524609087360.0, + "grad_norm": 1.5592881493961996, + "language_loss": 0.72042692, + "learning_rate": 6.547687663522739e-07, + "loss": 0.74176657, + "num_input_tokens_seen": 266465640, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12354, + "time_per_iteration": 2.4892525672912598 + }, + { + "auxiliary_loss_clip": 0.01024379, + "auxiliary_loss_mlp": 0.01002171, + "balance_loss_clip": 1.00115824, + "balance_loss_mlp": 1.0041914, + "epoch": 0.7428227867127611, + "flos": 67209477655680.0, + "grad_norm": 0.7195367720859078, + "language_loss": 0.595505, + "learning_rate": 6.544805933122199e-07, + "loss": 0.61577046, + "num_input_tokens_seen": 266531950, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 12355, + "time_per_iteration": 3.1565847396850586 + }, + { + "auxiliary_loss_clip": 0.01101716, + "auxiliary_loss_mlp": 0.01029249, + "balance_loss_clip": 1.01746547, + "balance_loss_mlp": 1.03509939, + "epoch": 0.7428829099654292, + "flos": 14721221082240.0, + "grad_norm": 1.5856742175038152, + "language_loss": 0.67546952, + "learning_rate": 6.541924712943971e-07, + "loss": 0.69677925, + "num_input_tokens_seen": 266550665, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12356, + "time_per_iteration": 2.4489800930023193 + }, + { + "auxiliary_loss_clip": 0.01102658, + "auxiliary_loss_mlp": 0.01035169, + "balance_loss_clip": 1.02305126, + "balance_loss_mlp": 1.03400218, + "epoch": 0.7429430332180971, + "flos": 48646496413440.0, + "grad_norm": 2.760673613642481, + "language_loss": 0.72485077, + "learning_rate": 6.539044003097301e-07, + "loss": 0.74622905, + "num_input_tokens_seen": 266572455, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 12357, + "time_per_iteration": 2.721644401550293 + }, + { + "auxiliary_loss_clip": 0.01098694, + "auxiliary_loss_mlp": 0.01028573, + "balance_loss_clip": 1.0175041, + "balance_loss_mlp": 1.03629148, + "epoch": 0.7430031564707651, + "flos": 16764071281920.0, + "grad_norm": 2.0039134107579395, + "language_loss": 0.65105826, + "learning_rate": 6.53616380369143e-07, + "loss": 0.67233098, + "num_input_tokens_seen": 266590895, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 12358, + "time_per_iteration": 2.4294605255126953 + }, + { + "auxiliary_loss_clip": 0.01104584, + "auxiliary_loss_mlp": 0.01035558, + "balance_loss_clip": 1.02243936, + "balance_loss_mlp": 1.03652191, + "epoch": 0.743063279723433, + "flos": 23870576545920.0, + "grad_norm": 1.8081229014020102, + "language_loss": 0.80658948, + "learning_rate": 6.533284114835591e-07, + "loss": 0.82799089, + "num_input_tokens_seen": 266607660, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 12359, + "time_per_iteration": 2.4662840366363525 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01027181, + "balance_loss_clip": 1.01539159, + "balance_loss_mlp": 1.03399527, + "epoch": 0.743123402976101, + "flos": 14391704689920.0, + "grad_norm": 1.9929370638459747, + "language_loss": 0.68443716, + "learning_rate": 6.530404936638956e-07, + "loss": 0.7057122, + "num_input_tokens_seen": 266624260, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12360, + "time_per_iteration": 2.39972186088562 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01028769, + "balance_loss_clip": 1.01742589, + "balance_loss_mlp": 1.03408909, + "epoch": 0.7431835262287689, + "flos": 27454318335360.0, + "grad_norm": 1.6105929709695739, + "language_loss": 0.72354007, + "learning_rate": 6.527526269210715e-07, + "loss": 0.74482894, + "num_input_tokens_seen": 266644210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12361, + "time_per_iteration": 2.53438663482666 + }, + { + "auxiliary_loss_clip": 0.01103295, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01991367, + "balance_loss_mlp": 1.03592443, + "epoch": 0.743243649481437, + "flos": 20959514709120.0, + "grad_norm": 1.9313349058571254, + "language_loss": 0.55937529, + "learning_rate": 6.524648112660027e-07, + "loss": 0.58072412, + "num_input_tokens_seen": 266664230, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12362, + "time_per_iteration": 2.44446063041687 + }, + { + "auxiliary_loss_clip": 0.01103216, + "auxiliary_loss_mlp": 0.01029843, + "balance_loss_clip": 1.01801753, + "balance_loss_mlp": 1.03700173, + "epoch": 0.7433037727341049, + "flos": 22783166161920.0, + "grad_norm": 1.6965020963152944, + "language_loss": 0.77103531, + "learning_rate": 6.521770467096039e-07, + "loss": 0.79236591, + "num_input_tokens_seen": 266683270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12363, + "time_per_iteration": 2.4665377140045166 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01030156, + "balance_loss_clip": 1.01916444, + "balance_loss_mlp": 1.03546381, + "epoch": 0.7433638959867729, + "flos": 22196708807040.0, + "grad_norm": 1.5848696782031413, + "language_loss": 0.781322, + "learning_rate": 6.518893332627862e-07, + "loss": 0.80262709, + "num_input_tokens_seen": 266701235, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12364, + "time_per_iteration": 2.4513514041900635 + }, + { + "auxiliary_loss_clip": 0.01099072, + "auxiliary_loss_mlp": 0.0103225, + "balance_loss_clip": 1.02075863, + "balance_loss_mlp": 1.03311908, + "epoch": 0.7434240192394409, + "flos": 23296760778240.0, + "grad_norm": 1.566466537213553, + "language_loss": 0.78534245, + "learning_rate": 6.516016709364604e-07, + "loss": 0.80665576, + "num_input_tokens_seen": 266721495, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12365, + "time_per_iteration": 2.5116143226623535 + }, + { + "auxiliary_loss_clip": 0.01103544, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01939702, + "balance_loss_mlp": 1.03469706, + "epoch": 0.7434841424921088, + "flos": 54009575251200.0, + "grad_norm": 1.5212918722481565, + "language_loss": 0.76719224, + "learning_rate": 6.513140597415346e-07, + "loss": 0.78854513, + "num_input_tokens_seen": 266747400, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 12366, + "time_per_iteration": 2.714674711227417 + }, + { + "auxiliary_loss_clip": 0.0109921, + "auxiliary_loss_mlp": 0.01030031, + "balance_loss_clip": 1.01957047, + "balance_loss_mlp": 1.03603196, + "epoch": 0.7435442657447768, + "flos": 21433966479360.0, + "grad_norm": 1.8098497154463502, + "language_loss": 0.7116037, + "learning_rate": 6.510264996889141e-07, + "loss": 0.73289615, + "num_input_tokens_seen": 266767630, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 12367, + "time_per_iteration": 2.4605956077575684 + }, + { + "auxiliary_loss_clip": 0.01104307, + "auxiliary_loss_mlp": 0.01034707, + "balance_loss_clip": 1.02303672, + "balance_loss_mlp": 1.03570354, + "epoch": 0.7436043889974447, + "flos": 24499408970880.0, + "grad_norm": 1.5537878615409826, + "language_loss": 0.74737108, + "learning_rate": 6.507389907895038e-07, + "loss": 0.76876128, + "num_input_tokens_seen": 266788015, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 12368, + "time_per_iteration": 2.4532225131988525 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01032106, + "balance_loss_clip": 1.02200878, + "balance_loss_mlp": 1.03582263, + "epoch": 0.7436645122501128, + "flos": 40698388512000.0, + "grad_norm": 2.6437968867522397, + "language_loss": 0.69177192, + "learning_rate": 6.50451533054207e-07, + "loss": 0.7130875, + "num_input_tokens_seen": 266809010, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.63671875, + "step": 12369, + "time_per_iteration": 2.6095521450042725 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01569033, + "balance_loss_mlp": 1.03491139, + "epoch": 0.7437246355027807, + "flos": 18908835344640.0, + "grad_norm": 1.8225441721973505, + "language_loss": 0.75607926, + "learning_rate": 6.501641264939233e-07, + "loss": 0.77735746, + "num_input_tokens_seen": 266825390, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12370, + "time_per_iteration": 2.3974015712738037 + }, + { + "auxiliary_loss_clip": 0.01101812, + "auxiliary_loss_mlp": 0.01033435, + "balance_loss_clip": 1.02188921, + "balance_loss_mlp": 1.03682232, + "epoch": 0.7437847587554487, + "flos": 21543817248000.0, + "grad_norm": 1.5003725500414622, + "language_loss": 0.78235525, + "learning_rate": 6.498767711195503e-07, + "loss": 0.80370772, + "num_input_tokens_seen": 266844675, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 12371, + "time_per_iteration": 2.587583303451538 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01024934, + "balance_loss_clip": 1.01351357, + "balance_loss_mlp": 1.03449976, + "epoch": 0.7438448820081166, + "flos": 27782470010880.0, + "grad_norm": 1.5904858963552928, + "language_loss": 0.69456738, + "learning_rate": 6.495894669419857e-07, + "loss": 0.71582228, + "num_input_tokens_seen": 266865160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12372, + "time_per_iteration": 2.46589732170105 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.0103015, + "balance_loss_clip": 1.01876593, + "balance_loss_mlp": 1.03523791, + "epoch": 0.7439050052607846, + "flos": 17967832796160.0, + "grad_norm": 2.0303622627769, + "language_loss": 0.74881828, + "learning_rate": 6.493022139721245e-07, + "loss": 0.77012408, + "num_input_tokens_seen": 266883285, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12373, + "time_per_iteration": 2.429455518722534 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.01918495, + "balance_loss_mlp": 1.03517175, + "epoch": 0.7439651285134525, + "flos": 22958696949120.0, + "grad_norm": 1.8905423318011396, + "language_loss": 0.77127612, + "learning_rate": 6.49015012220858e-07, + "loss": 0.79263097, + "num_input_tokens_seen": 266900960, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 12374, + "time_per_iteration": 2.467027425765991 + }, + { + "auxiliary_loss_clip": 0.01101807, + "auxiliary_loss_mlp": 0.0103348, + "balance_loss_clip": 1.02173197, + "balance_loss_mlp": 1.03450108, + "epoch": 0.7440252517661206, + "flos": 18806777827200.0, + "grad_norm": 2.0275286605601903, + "language_loss": 0.76452887, + "learning_rate": 6.487278616990774e-07, + "loss": 0.7858817, + "num_input_tokens_seen": 266917710, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12375, + "time_per_iteration": 2.4504282474517822 + }, + { + "auxiliary_loss_clip": 0.01098205, + "auxiliary_loss_mlp": 0.01029678, + "balance_loss_clip": 1.01930046, + "balance_loss_mlp": 1.03446364, + "epoch": 0.7440853750187885, + "flos": 20266295155200.0, + "grad_norm": 1.8957308287031664, + "language_loss": 0.77052188, + "learning_rate": 6.484407624176733e-07, + "loss": 0.79180074, + "num_input_tokens_seen": 266934220, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.63671875, + "step": 12376, + "time_per_iteration": 2.426997423171997 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01026205, + "balance_loss_clip": 1.01435566, + "balance_loss_mlp": 1.03490746, + "epoch": 0.7441454982714565, + "flos": 25337276593920.0, + "grad_norm": 1.648771332644217, + "language_loss": 0.79147625, + "learning_rate": 6.481537143875296e-07, + "loss": 0.81276488, + "num_input_tokens_seen": 266955210, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 12377, + "time_per_iteration": 2.5062367916107178 + }, + { + "auxiliary_loss_clip": 0.01104221, + "auxiliary_loss_mlp": 0.01028645, + "balance_loss_clip": 1.01639605, + "balance_loss_mlp": 1.03595889, + "epoch": 0.7442056215241245, + "flos": 64480910866560.0, + "grad_norm": 1.8728399382870544, + "language_loss": 0.67017269, + "learning_rate": 6.478667176195322e-07, + "loss": 0.69150138, + "num_input_tokens_seen": 266976555, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 12378, + "time_per_iteration": 2.81579327583313 + }, + { + "auxiliary_loss_clip": 0.01103565, + "auxiliary_loss_mlp": 0.01034825, + "balance_loss_clip": 1.02170622, + "balance_loss_mlp": 1.0356729, + "epoch": 0.7442657447767924, + "flos": 31285376242560.0, + "grad_norm": 1.6381441755645296, + "language_loss": 0.71693718, + "learning_rate": 6.475797721245648e-07, + "loss": 0.73832107, + "num_input_tokens_seen": 266997640, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 12379, + "time_per_iteration": 2.5361573696136475 + }, + { + "auxiliary_loss_clip": 0.0110076, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.02015245, + "balance_loss_mlp": 1.0342983, + "epoch": 0.7443258680294604, + "flos": 20807899401600.0, + "grad_norm": 1.779117116222904, + "language_loss": 0.6545527, + "learning_rate": 6.472928779135085e-07, + "loss": 0.67588407, + "num_input_tokens_seen": 267016165, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 12380, + "time_per_iteration": 2.4233927726745605 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01030324, + "balance_loss_clip": 1.0180037, + "balance_loss_mlp": 1.0361979, + "epoch": 0.7443859912821283, + "flos": 22199833290240.0, + "grad_norm": 1.8649656788405269, + "language_loss": 0.78407371, + "learning_rate": 6.470060349972411e-07, + "loss": 0.80541134, + "num_input_tokens_seen": 267034075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 12381, + "time_per_iteration": 2.4858570098876953 + }, + { + "auxiliary_loss_clip": 0.01105177, + "auxiliary_loss_mlp": 0.01030856, + "balance_loss_clip": 1.01844049, + "balance_loss_mlp": 1.03706956, + "epoch": 0.7444461145347964, + "flos": 22017838055040.0, + "grad_norm": 2.020102032989411, + "language_loss": 0.726803, + "learning_rate": 6.467192433866411e-07, + "loss": 0.74816334, + "num_input_tokens_seen": 267053645, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 12382, + "time_per_iteration": 2.4412596225738525 + }, + { + "auxiliary_loss_clip": 0.01025583, + "auxiliary_loss_mlp": 0.0100093, + "balance_loss_clip": 0.99986947, + "balance_loss_mlp": 1.00515175, + "epoch": 0.7445062377874643, + "flos": 70559047704960.0, + "grad_norm": 0.6497921539673587, + "language_loss": 0.5464738, + "learning_rate": 6.464325030925831e-07, + "loss": 0.56673896, + "num_input_tokens_seen": 267121830, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20507812, + "step": 12383, + "time_per_iteration": 3.222402811050415 + }, + { + "auxiliary_loss_clip": 0.01100878, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.0151608, + "balance_loss_mlp": 1.03370833, + "epoch": 0.7445663610401323, + "flos": 22164425458560.0, + "grad_norm": 1.9786543947489503, + "language_loss": 0.76230276, + "learning_rate": 6.461458141259395e-07, + "loss": 0.78357792, + "num_input_tokens_seen": 267141145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 12384, + "time_per_iteration": 2.455353021621704 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.01782155, + "balance_loss_mlp": 1.03452992, + "epoch": 0.7446264842928002, + "flos": 24170251714560.0, + "grad_norm": 2.0782969884363816, + "language_loss": 0.79298764, + "learning_rate": 6.458591764975823e-07, + "loss": 0.81428415, + "num_input_tokens_seen": 267159280, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 12385, + "time_per_iteration": 2.4987757205963135 + }, + { + "auxiliary_loss_clip": 0.01106725, + "auxiliary_loss_mlp": 0.01032618, + "balance_loss_clip": 1.01921868, + "balance_loss_mlp": 1.03626704, + "epoch": 0.7446866075454682, + "flos": 24134556574080.0, + "grad_norm": 1.6771558108044815, + "language_loss": 0.8143934, + "learning_rate": 6.455725902183813e-07, + "loss": 0.83578682, + "num_input_tokens_seen": 267179390, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.703125, + "step": 12386, + "time_per_iteration": 3.998560667037964 + }, + { + "auxiliary_loss_clip": 0.01099591, + "auxiliary_loss_mlp": 0.01026498, + "balance_loss_clip": 1.0154351, + "balance_loss_mlp": 1.03524506, + "epoch": 0.7447467307981361, + "flos": 23548063305600.0, + "grad_norm": 1.7576352250203031, + "language_loss": 0.71226764, + "learning_rate": 6.452860552992037e-07, + "loss": 0.73352849, + "num_input_tokens_seen": 267198165, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 12387, + "time_per_iteration": 2.4593608379364014 + }, + { + "auxiliary_loss_clip": 0.01101935, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.01612639, + "balance_loss_mlp": 1.03501618, + "epoch": 0.7448068540508042, + "flos": 19567832215680.0, + "grad_norm": 2.162095578178006, + "language_loss": 0.7053076, + "learning_rate": 6.449995717509138e-07, + "loss": 0.72660351, + "num_input_tokens_seen": 267214520, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12388, + "time_per_iteration": 3.8914287090301514 + }, + { + "auxiliary_loss_clip": 0.01102008, + "auxiliary_loss_mlp": 0.01030206, + "balance_loss_clip": 1.01879215, + "balance_loss_mlp": 1.03539407, + "epoch": 0.7448669773034721, + "flos": 21839721488640.0, + "grad_norm": 1.5805660577109513, + "language_loss": 0.84949243, + "learning_rate": 6.447131395843761e-07, + "loss": 0.87081456, + "num_input_tokens_seen": 267236555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12389, + "time_per_iteration": 4.039583683013916 + }, + { + "auxiliary_loss_clip": 0.01100859, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.01907206, + "balance_loss_mlp": 1.03446209, + "epoch": 0.7449271005561401, + "flos": 25155389099520.0, + "grad_norm": 1.992620566185106, + "language_loss": 0.79385233, + "learning_rate": 6.444267588104526e-07, + "loss": 0.8151679, + "num_input_tokens_seen": 267254800, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12390, + "time_per_iteration": 3.9466896057128906 + }, + { + "auxiliary_loss_clip": 0.01104503, + "auxiliary_loss_mlp": 0.0102899, + "balance_loss_clip": 1.01669955, + "balance_loss_mlp": 1.03669739, + "epoch": 0.7449872238088081, + "flos": 22273342473600.0, + "grad_norm": 1.730347550558291, + "language_loss": 0.84698212, + "learning_rate": 6.441404294400014e-07, + "loss": 0.86831707, + "num_input_tokens_seen": 267274610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12391, + "time_per_iteration": 2.493415117263794 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01029608, + "balance_loss_clip": 1.01861715, + "balance_loss_mlp": 1.03483033, + "epoch": 0.745047347061476, + "flos": 20594805966720.0, + "grad_norm": 1.8306369594039993, + "language_loss": 0.73786843, + "learning_rate": 6.438541514838811e-07, + "loss": 0.75916982, + "num_input_tokens_seen": 267292600, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 12392, + "time_per_iteration": 2.431533098220825 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01033054, + "balance_loss_clip": 1.02171087, + "balance_loss_mlp": 1.03509498, + "epoch": 0.745107470314144, + "flos": 22127545169280.0, + "grad_norm": 1.6456666698641875, + "language_loss": 0.76718521, + "learning_rate": 6.435679249529487e-07, + "loss": 0.78850538, + "num_input_tokens_seen": 267311295, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 12393, + "time_per_iteration": 2.473604917526245 + }, + { + "auxiliary_loss_clip": 0.01102478, + "auxiliary_loss_mlp": 0.01033816, + "balance_loss_clip": 1.02133441, + "balance_loss_mlp": 1.03579187, + "epoch": 0.745167593566812, + "flos": 22236498097920.0, + "grad_norm": 1.8111060695117658, + "language_loss": 0.72828883, + "learning_rate": 6.432817498580552e-07, + "loss": 0.74965185, + "num_input_tokens_seen": 267328390, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 12394, + "time_per_iteration": 2.4453284740448 + }, + { + "auxiliary_loss_clip": 0.01103102, + "auxiliary_loss_mlp": 0.01035289, + "balance_loss_clip": 1.02332675, + "balance_loss_mlp": 1.03558517, + "epoch": 0.74522771681948, + "flos": 20666232161280.0, + "grad_norm": 1.668528755901744, + "language_loss": 0.81820607, + "learning_rate": 6.429956262100535e-07, + "loss": 0.83958995, + "num_input_tokens_seen": 267348185, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12395, + "time_per_iteration": 2.4907712936401367 + }, + { + "auxiliary_loss_clip": 0.01104977, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.02062798, + "balance_loss_mlp": 1.03574276, + "epoch": 0.7452878400721479, + "flos": 21106999952640.0, + "grad_norm": 5.4481505993838475, + "language_loss": 0.70923871, + "learning_rate": 6.427095540197937e-07, + "loss": 0.73061752, + "num_input_tokens_seen": 267367010, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12396, + "time_per_iteration": 2.5307369232177734 + }, + { + "auxiliary_loss_clip": 0.01103961, + "auxiliary_loss_mlp": 0.01029197, + "balance_loss_clip": 1.0171988, + "balance_loss_mlp": 1.03555429, + "epoch": 0.7453479633248159, + "flos": 26688056474880.0, + "grad_norm": 1.799312565551718, + "language_loss": 0.6829254, + "learning_rate": 6.424235332981245e-07, + "loss": 0.70425701, + "num_input_tokens_seen": 267386605, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 12397, + "time_per_iteration": 2.5126614570617676 + }, + { + "auxiliary_loss_clip": 0.01101329, + "auxiliary_loss_mlp": 0.01040843, + "balance_loss_clip": 1.02871311, + "balance_loss_mlp": 1.03490043, + "epoch": 0.7454080865774838, + "flos": 17016056167680.0, + "grad_norm": 2.004729126431997, + "language_loss": 0.76321107, + "learning_rate": 6.421375640558908e-07, + "loss": 0.7846328, + "num_input_tokens_seen": 267404135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12398, + "time_per_iteration": 2.4119622707366943 + }, + { + "auxiliary_loss_clip": 0.01098608, + "auxiliary_loss_mlp": 0.01026539, + "balance_loss_clip": 1.01461804, + "balance_loss_mlp": 1.03464854, + "epoch": 0.7454682098301518, + "flos": 21323900229120.0, + "grad_norm": 1.6814125292484552, + "language_loss": 0.77809334, + "learning_rate": 6.418516463039363e-07, + "loss": 0.79934478, + "num_input_tokens_seen": 267423120, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 12399, + "time_per_iteration": 2.4987549781799316 + }, + { + "auxiliary_loss_clip": 0.01097189, + "auxiliary_loss_mlp": 0.0103442, + "balance_loss_clip": 1.02345836, + "balance_loss_mlp": 1.03396916, + "epoch": 0.7455283330828197, + "flos": 17858341163520.0, + "grad_norm": 1.9741218645460363, + "language_loss": 0.73963678, + "learning_rate": 6.415657800531038e-07, + "loss": 0.76095283, + "num_input_tokens_seen": 267441250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 12400, + "time_per_iteration": 2.4242513179779053 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.01808882, + "balance_loss_mlp": 1.03357267, + "epoch": 0.7455884563354878, + "flos": 30774259664640.0, + "grad_norm": 1.8638807707826066, + "language_loss": 0.81975746, + "learning_rate": 6.412799653142327e-07, + "loss": 0.84103811, + "num_input_tokens_seen": 267462820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12401, + "time_per_iteration": 2.5451955795288086 + }, + { + "auxiliary_loss_clip": 0.01100279, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02252901, + "balance_loss_mlp": 1.03501511, + "epoch": 0.7456485795881557, + "flos": 23185545292800.0, + "grad_norm": 1.845084112452823, + "language_loss": 0.65197337, + "learning_rate": 6.409942020981611e-07, + "loss": 0.67331183, + "num_input_tokens_seen": 267483065, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12402, + "time_per_iteration": 2.4577367305755615 + }, + { + "auxiliary_loss_clip": 0.01099262, + "auxiliary_loss_mlp": 0.01028921, + "balance_loss_clip": 1.01831102, + "balance_loss_mlp": 1.0342567, + "epoch": 0.7457087028408237, + "flos": 38727144074880.0, + "grad_norm": 1.6576964620220311, + "language_loss": 0.73214388, + "learning_rate": 6.407084904157265e-07, + "loss": 0.75342572, + "num_input_tokens_seen": 267504825, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 12403, + "time_per_iteration": 2.620654821395874 + }, + { + "auxiliary_loss_clip": 0.01024907, + "auxiliary_loss_mlp": 0.01005223, + "balance_loss_clip": 1.00420368, + "balance_loss_mlp": 1.00436723, + "epoch": 0.7457688260934917, + "flos": 56043737337600.0, + "grad_norm": 0.8255474672184773, + "language_loss": 0.58760434, + "learning_rate": 6.404228302777621e-07, + "loss": 0.60790563, + "num_input_tokens_seen": 267559260, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20507812, + "step": 12404, + "time_per_iteration": 2.8954858779907227 + }, + { + "auxiliary_loss_clip": 0.0110003, + "auxiliary_loss_mlp": 0.01034771, + "balance_loss_clip": 1.02357709, + "balance_loss_mlp": 1.03306055, + "epoch": 0.7458289493461596, + "flos": 20116152305280.0, + "grad_norm": 1.6032592804273305, + "language_loss": 0.77657819, + "learning_rate": 6.401372216950995e-07, + "loss": 0.79792619, + "num_input_tokens_seen": 267578720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 12405, + "time_per_iteration": 2.470407247543335 + }, + { + "auxiliary_loss_clip": 0.01100531, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.02069402, + "balance_loss_mlp": 1.03543913, + "epoch": 0.7458890725988276, + "flos": 20193073280640.0, + "grad_norm": 1.5461856417653022, + "language_loss": 0.69148755, + "learning_rate": 6.398516646785698e-07, + "loss": 0.71281415, + "num_input_tokens_seen": 267598250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12406, + "time_per_iteration": 2.4450039863586426 + }, + { + "auxiliary_loss_clip": 0.01107001, + "auxiliary_loss_mlp": 0.01034039, + "balance_loss_clip": 1.02102149, + "balance_loss_mlp": 1.03617549, + "epoch": 0.7459491958514956, + "flos": 17018749687680.0, + "grad_norm": 1.505466725953553, + "language_loss": 0.64742386, + "learning_rate": 6.39566159239002e-07, + "loss": 0.66883421, + "num_input_tokens_seen": 267615430, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.7109375, + "step": 12407, + "time_per_iteration": 2.4332051277160645 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01944911, + "balance_loss_mlp": 1.03494692, + "epoch": 0.7460093191041636, + "flos": 25078719519360.0, + "grad_norm": 1.652287891377431, + "language_loss": 0.72460616, + "learning_rate": 6.392807053872212e-07, + "loss": 0.74595749, + "num_input_tokens_seen": 267635075, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 12408, + "time_per_iteration": 2.4836978912353516 + }, + { + "auxiliary_loss_clip": 0.01106452, + "auxiliary_loss_mlp": 0.01034146, + "balance_loss_clip": 1.02128339, + "balance_loss_mlp": 1.03751123, + "epoch": 0.7460694423568315, + "flos": 21908525990400.0, + "grad_norm": 1.7143768507331778, + "language_loss": 0.72858518, + "learning_rate": 6.38995303134053e-07, + "loss": 0.74999118, + "num_input_tokens_seen": 267654105, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 12409, + "time_per_iteration": 2.515709400177002 + }, + { + "auxiliary_loss_clip": 0.01098264, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.02187228, + "balance_loss_mlp": 1.03468859, + "epoch": 0.7461295656094995, + "flos": 21215737399680.0, + "grad_norm": 1.710421587761424, + "language_loss": 0.6618892, + "learning_rate": 6.38709952490319e-07, + "loss": 0.68319571, + "num_input_tokens_seen": 267673090, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 12410, + "time_per_iteration": 2.449406147003174 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.0103109, + "balance_loss_clip": 1.01910925, + "balance_loss_mlp": 1.034163, + "epoch": 0.7461896888621674, + "flos": 22346851656960.0, + "grad_norm": 2.213506116293379, + "language_loss": 0.84104359, + "learning_rate": 6.384246534668396e-07, + "loss": 0.86234152, + "num_input_tokens_seen": 267690605, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 12411, + "time_per_iteration": 2.4625163078308105 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01740742, + "balance_loss_mlp": 1.03515697, + "epoch": 0.7462498121148354, + "flos": 25482930243840.0, + "grad_norm": 1.6692936053556306, + "language_loss": 0.7766965, + "learning_rate": 6.381394060744339e-07, + "loss": 0.79802704, + "num_input_tokens_seen": 267710540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12412, + "time_per_iteration": 2.4557554721832275 + }, + { + "auxiliary_loss_clip": 0.01102723, + "auxiliary_loss_mlp": 0.01035248, + "balance_loss_clip": 1.02409029, + "balance_loss_mlp": 1.03520751, + "epoch": 0.7463099353675033, + "flos": 33947936812800.0, + "grad_norm": 1.834679176534713, + "language_loss": 0.6225034, + "learning_rate": 6.378542103239188e-07, + "loss": 0.64388311, + "num_input_tokens_seen": 267730780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 12413, + "time_per_iteration": 2.559657573699951 + }, + { + "auxiliary_loss_clip": 0.01024964, + "auxiliary_loss_mlp": 0.01002262, + "balance_loss_clip": 1.00121295, + "balance_loss_mlp": 1.00439072, + "epoch": 0.7463700586201714, + "flos": 62767723691520.0, + "grad_norm": 0.7203793484361629, + "language_loss": 0.54924321, + "learning_rate": 6.375690662261082e-07, + "loss": 0.56951547, + "num_input_tokens_seen": 267794240, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20605469, + "step": 12414, + "time_per_iteration": 3.0637338161468506 + }, + { + "auxiliary_loss_clip": 0.01101199, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01846027, + "balance_loss_mlp": 1.03334022, + "epoch": 0.7464301818728393, + "flos": 33432654257280.0, + "grad_norm": 1.860182659182016, + "language_loss": 0.54804456, + "learning_rate": 6.372839737918154e-07, + "loss": 0.56936157, + "num_input_tokens_seen": 267817190, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 12415, + "time_per_iteration": 2.5465588569641113 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01032607, + "balance_loss_clip": 1.02022743, + "balance_loss_mlp": 1.0359658, + "epoch": 0.7464903051255073, + "flos": 26869872142080.0, + "grad_norm": 1.6660939393048266, + "language_loss": 0.74985796, + "learning_rate": 6.369989330318506e-07, + "loss": 0.77120394, + "num_input_tokens_seen": 267836245, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 12416, + "time_per_iteration": 2.514845132827759 + }, + { + "auxiliary_loss_clip": 0.01101809, + "auxiliary_loss_mlp": 0.01034557, + "balance_loss_clip": 1.02302361, + "balance_loss_mlp": 1.03556323, + "epoch": 0.7465504283781753, + "flos": 44086954775040.0, + "grad_norm": 1.4814223642956346, + "language_loss": 0.69489551, + "learning_rate": 6.367139439570233e-07, + "loss": 0.71625924, + "num_input_tokens_seen": 267858310, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 12417, + "time_per_iteration": 2.6574227809906006 + }, + { + "auxiliary_loss_clip": 0.01104674, + "auxiliary_loss_mlp": 0.01030343, + "balance_loss_clip": 1.01790345, + "balance_loss_mlp": 1.03659248, + "epoch": 0.7466105516308432, + "flos": 19676102785920.0, + "grad_norm": 1.767590849665872, + "language_loss": 0.73728597, + "learning_rate": 6.364290065781392e-07, + "loss": 0.75863612, + "num_input_tokens_seen": 267876345, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 12418, + "time_per_iteration": 2.462244987487793 + }, + { + "auxiliary_loss_clip": 0.01103226, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01586497, + "balance_loss_mlp": 1.03675175, + "epoch": 0.7466706748835112, + "flos": 20520722165760.0, + "grad_norm": 1.574966460677448, + "language_loss": 0.69369054, + "learning_rate": 6.361441209060039e-07, + "loss": 0.71499598, + "num_input_tokens_seen": 267896740, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12419, + "time_per_iteration": 2.4568960666656494 + }, + { + "auxiliary_loss_clip": 0.01095857, + "auxiliary_loss_mlp": 0.01032572, + "balance_loss_clip": 1.02151561, + "balance_loss_mlp": 1.03342533, + "epoch": 0.7467307981361792, + "flos": 21690260997120.0, + "grad_norm": 1.6640874245133943, + "language_loss": 0.74578714, + "learning_rate": 6.358592869514216e-07, + "loss": 0.76707137, + "num_input_tokens_seen": 267914765, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 12420, + "time_per_iteration": 2.5238821506500244 + }, + { + "auxiliary_loss_clip": 0.01104196, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01943922, + "balance_loss_mlp": 1.03683901, + "epoch": 0.7467909213888472, + "flos": 19573686132480.0, + "grad_norm": 1.6177707150337377, + "language_loss": 0.67195189, + "learning_rate": 6.355745047251904e-07, + "loss": 0.69331217, + "num_input_tokens_seen": 267934085, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12421, + "time_per_iteration": 2.4293341636657715 + }, + { + "auxiliary_loss_clip": 0.01104487, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.01912296, + "balance_loss_mlp": 1.03556955, + "epoch": 0.7468510446415151, + "flos": 23695225326720.0, + "grad_norm": 1.5639142011030407, + "language_loss": 0.72440511, + "learning_rate": 6.352897742381107e-07, + "loss": 0.74577618, + "num_input_tokens_seen": 267955170, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 12422, + "time_per_iteration": 2.509237766265869 + }, + { + "auxiliary_loss_clip": 0.01100612, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.01877022, + "balance_loss_mlp": 1.03514779, + "epoch": 0.7469111678941831, + "flos": 29315783831040.0, + "grad_norm": 9.98591332499941, + "language_loss": 0.74842906, + "learning_rate": 6.350050955009796e-07, + "loss": 0.76974201, + "num_input_tokens_seen": 267974980, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12423, + "time_per_iteration": 2.5110628604888916 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.0102642, + "balance_loss_clip": 1.01536298, + "balance_loss_mlp": 1.03383863, + "epoch": 0.746971291146851, + "flos": 21798639308160.0, + "grad_norm": 1.296938244989713, + "language_loss": 0.67754054, + "learning_rate": 6.347204685245929e-07, + "loss": 0.6987977, + "num_input_tokens_seen": 267994985, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 12424, + "time_per_iteration": 2.4905362129211426 + }, + { + "auxiliary_loss_clip": 0.01105568, + "auxiliary_loss_mlp": 0.0103483, + "balance_loss_clip": 1.02293932, + "balance_loss_mlp": 1.03707027, + "epoch": 0.747031414399519, + "flos": 36245070368640.0, + "grad_norm": 1.7754548837213033, + "language_loss": 0.74119371, + "learning_rate": 6.344358933197418e-07, + "loss": 0.76259774, + "num_input_tokens_seen": 268014985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 12425, + "time_per_iteration": 2.5686028003692627 + }, + { + "auxiliary_loss_clip": 0.01101237, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01754081, + "balance_loss_mlp": 1.0353744, + "epoch": 0.7470915376521869, + "flos": 19974916028160.0, + "grad_norm": 2.326605643233434, + "language_loss": 0.69533008, + "learning_rate": 6.341513698972194e-07, + "loss": 0.71663666, + "num_input_tokens_seen": 268034395, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12426, + "time_per_iteration": 2.4671969413757324 + }, + { + "auxiliary_loss_clip": 0.01097868, + "auxiliary_loss_mlp": 0.01036163, + "balance_loss_clip": 1.024755, + "balance_loss_mlp": 1.03396261, + "epoch": 0.747151660904855, + "flos": 20084299920000.0, + "grad_norm": 1.6460733379816328, + "language_loss": 0.65486181, + "learning_rate": 6.338668982678139e-07, + "loss": 0.67620206, + "num_input_tokens_seen": 268054485, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 12427, + "time_per_iteration": 2.459057092666626 + }, + { + "auxiliary_loss_clip": 0.01102337, + "auxiliary_loss_mlp": 0.01027971, + "balance_loss_clip": 1.01555538, + "balance_loss_mlp": 1.03570294, + "epoch": 0.7472117841575229, + "flos": 16290373697280.0, + "grad_norm": 1.7506429909383225, + "language_loss": 0.74639595, + "learning_rate": 6.335824784423118e-07, + "loss": 0.767699, + "num_input_tokens_seen": 268072250, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12428, + "time_per_iteration": 3.809513807296753 + }, + { + "auxiliary_loss_clip": 0.0110597, + "auxiliary_loss_mlp": 0.0103051, + "balance_loss_clip": 1.01710534, + "balance_loss_mlp": 1.0359993, + "epoch": 0.7472719074101909, + "flos": 21389939383680.0, + "grad_norm": 2.159964503285926, + "language_loss": 0.58328772, + "learning_rate": 6.33298110431499e-07, + "loss": 0.60465252, + "num_input_tokens_seen": 268089840, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.69921875, + "step": 12429, + "time_per_iteration": 2.420081377029419 + }, + { + "auxiliary_loss_clip": 0.01105592, + "auxiliary_loss_mlp": 0.0103229, + "balance_loss_clip": 1.02002382, + "balance_loss_mlp": 1.03655839, + "epoch": 0.7473320306628589, + "flos": 29643289061760.0, + "grad_norm": 1.8822181590488856, + "language_loss": 0.60539925, + "learning_rate": 6.330137942461595e-07, + "loss": 0.62677801, + "num_input_tokens_seen": 268109360, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 12430, + "time_per_iteration": 3.917961359024048 + }, + { + "auxiliary_loss_clip": 0.01102089, + "auxiliary_loss_mlp": 0.01030145, + "balance_loss_clip": 1.01848626, + "balance_loss_mlp": 1.0366466, + "epoch": 0.7473921539155268, + "flos": 24136100858880.0, + "grad_norm": 1.4375442916697652, + "language_loss": 0.75408334, + "learning_rate": 6.327295298970734e-07, + "loss": 0.77540565, + "num_input_tokens_seen": 268131840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12431, + "time_per_iteration": 3.8775863647460938 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01829195, + "balance_loss_mlp": 1.03413606, + "epoch": 0.7474522771681948, + "flos": 17487958072320.0, + "grad_norm": 1.7750987800998057, + "language_loss": 0.75931549, + "learning_rate": 6.32445317395021e-07, + "loss": 0.78063631, + "num_input_tokens_seen": 268148300, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 12432, + "time_per_iteration": 2.4008095264434814 + }, + { + "auxiliary_loss_clip": 0.01105995, + "auxiliary_loss_mlp": 0.01036957, + "balance_loss_clip": 1.02375436, + "balance_loss_mlp": 1.03559935, + "epoch": 0.7475124004208628, + "flos": 16727298733440.0, + "grad_norm": 4.600278612020183, + "language_loss": 0.69874978, + "learning_rate": 6.321611567507787e-07, + "loss": 0.72017932, + "num_input_tokens_seen": 268166450, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 12433, + "time_per_iteration": 3.879322052001953 + }, + { + "auxiliary_loss_clip": 0.01103347, + "auxiliary_loss_mlp": 0.01031143, + "balance_loss_clip": 1.01847744, + "balance_loss_mlp": 1.03535938, + "epoch": 0.7475725236735308, + "flos": 19720237622400.0, + "grad_norm": 1.4972431185118094, + "language_loss": 0.67169416, + "learning_rate": 6.318770479751232e-07, + "loss": 0.69303912, + "num_input_tokens_seen": 268186165, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 12434, + "time_per_iteration": 2.45617938041687 + }, + { + "auxiliary_loss_clip": 0.01096539, + "auxiliary_loss_mlp": 0.01028681, + "balance_loss_clip": 1.01809549, + "balance_loss_mlp": 1.03466129, + "epoch": 0.7476326469261987, + "flos": 26286000566400.0, + "grad_norm": 1.5115766265302155, + "language_loss": 0.7984153, + "learning_rate": 6.315929910788263e-07, + "loss": 0.81966752, + "num_input_tokens_seen": 268208145, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 12435, + "time_per_iteration": 2.4689295291900635 + }, + { + "auxiliary_loss_clip": 0.01103643, + "auxiliary_loss_mlp": 0.01028444, + "balance_loss_clip": 1.0165174, + "balance_loss_mlp": 1.03551531, + "epoch": 0.7476927701788667, + "flos": 31831828824960.0, + "grad_norm": 1.9192190166141703, + "language_loss": 0.685781, + "learning_rate": 6.313089860726604e-07, + "loss": 0.70710182, + "num_input_tokens_seen": 268228345, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 12436, + "time_per_iteration": 2.5397560596466064 + }, + { + "auxiliary_loss_clip": 0.01105286, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02078748, + "balance_loss_mlp": 1.0353477, + "epoch": 0.7477528934315346, + "flos": 31795487239680.0, + "grad_norm": 2.523256251254823, + "language_loss": 0.70543289, + "learning_rate": 6.31025032967396e-07, + "loss": 0.72681236, + "num_input_tokens_seen": 268250260, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.703125, + "step": 12437, + "time_per_iteration": 2.5217578411102295 + }, + { + "auxiliary_loss_clip": 0.01096987, + "auxiliary_loss_mlp": 0.01028586, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.0336585, + "epoch": 0.7478130166842026, + "flos": 20371979946240.0, + "grad_norm": 1.7258668993948156, + "language_loss": 0.6710937, + "learning_rate": 6.307411317737986e-07, + "loss": 0.69234937, + "num_input_tokens_seen": 268268440, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 12438, + "time_per_iteration": 2.4754526615142822 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.01032026, + "balance_loss_clip": 1.02005768, + "balance_loss_mlp": 1.03440166, + "epoch": 0.7478731399368705, + "flos": 18148930191360.0, + "grad_norm": 1.6057176452605648, + "language_loss": 0.80471182, + "learning_rate": 6.304572825026344e-07, + "loss": 0.82604539, + "num_input_tokens_seen": 268285765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12439, + "time_per_iteration": 2.4217841625213623 + }, + { + "auxiliary_loss_clip": 0.01100863, + "auxiliary_loss_mlp": 0.01035132, + "balance_loss_clip": 1.02369952, + "balance_loss_mlp": 1.034688, + "epoch": 0.7479332631895386, + "flos": 15267889146240.0, + "grad_norm": 4.3324890021257065, + "language_loss": 0.70790303, + "learning_rate": 6.301734851646674e-07, + "loss": 0.72926295, + "num_input_tokens_seen": 268304015, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12440, + "time_per_iteration": 2.4390249252319336 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01028215, + "balance_loss_clip": 1.01672912, + "balance_loss_mlp": 1.03678477, + "epoch": 0.7479933864422065, + "flos": 21142515525120.0, + "grad_norm": 1.6196156366406493, + "language_loss": 0.74209476, + "learning_rate": 6.298897397706597e-07, + "loss": 0.76338577, + "num_input_tokens_seen": 268323290, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12441, + "time_per_iteration": 2.452240467071533 + }, + { + "auxiliary_loss_clip": 0.01104655, + "auxiliary_loss_mlp": 0.01035515, + "balance_loss_clip": 1.02294469, + "balance_loss_mlp": 1.0354284, + "epoch": 0.7480535096948745, + "flos": 14392027912320.0, + "grad_norm": 2.0647572412884223, + "language_loss": 0.82613641, + "learning_rate": 6.296060463313698e-07, + "loss": 0.84753811, + "num_input_tokens_seen": 268339490, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 12442, + "time_per_iteration": 2.4386143684387207 + }, + { + "auxiliary_loss_clip": 0.01104883, + "auxiliary_loss_mlp": 0.01030375, + "balance_loss_clip": 1.01779294, + "balance_loss_mlp": 1.03697157, + "epoch": 0.7481136329475425, + "flos": 27344683048320.0, + "grad_norm": 1.8278548482074275, + "language_loss": 0.62552464, + "learning_rate": 6.293224048575565e-07, + "loss": 0.64687717, + "num_input_tokens_seen": 268359865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12443, + "time_per_iteration": 2.501383066177368 + }, + { + "auxiliary_loss_clip": 0.01099555, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.01601326, + "balance_loss_mlp": 1.03445029, + "epoch": 0.7481737562002104, + "flos": 19531454716800.0, + "grad_norm": 2.2374686087677365, + "language_loss": 0.71498984, + "learning_rate": 6.29038815359975e-07, + "loss": 0.73625755, + "num_input_tokens_seen": 268377065, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 12444, + "time_per_iteration": 2.4533753395080566 + }, + { + "auxiliary_loss_clip": 0.01101788, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01813221, + "balance_loss_mlp": 1.03564715, + "epoch": 0.7482338794528784, + "flos": 21760035166080.0, + "grad_norm": 1.421192180726323, + "language_loss": 0.68887877, + "learning_rate": 6.287552778493786e-07, + "loss": 0.71019721, + "num_input_tokens_seen": 268396935, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12445, + "time_per_iteration": 2.4437148571014404 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.01025898, + "balance_loss_clip": 1.01420927, + "balance_loss_mlp": 1.0338124, + "epoch": 0.7482940027055464, + "flos": 18697358021760.0, + "grad_norm": 1.6018226461169682, + "language_loss": 0.73926389, + "learning_rate": 6.28471792336519e-07, + "loss": 0.76051313, + "num_input_tokens_seen": 268414460, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 12446, + "time_per_iteration": 2.4290761947631836 + }, + { + "auxiliary_loss_clip": 0.01107586, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01841235, + "balance_loss_mlp": 1.03757131, + "epoch": 0.7483541259582144, + "flos": 15998024903040.0, + "grad_norm": 1.8678016899713992, + "language_loss": 0.73009384, + "learning_rate": 6.281883588321475e-07, + "loss": 0.75148046, + "num_input_tokens_seen": 268432225, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 12447, + "time_per_iteration": 2.4282591342926025 + }, + { + "auxiliary_loss_clip": 0.01100481, + "auxiliary_loss_mlp": 0.01030723, + "balance_loss_clip": 1.01952291, + "balance_loss_mlp": 1.03436816, + "epoch": 0.7484142492108823, + "flos": 25556295772800.0, + "grad_norm": 2.453147122317507, + "language_loss": 0.71330941, + "learning_rate": 6.279049773470109e-07, + "loss": 0.73462141, + "num_input_tokens_seen": 268449270, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 12448, + "time_per_iteration": 2.492389678955078 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01035837, + "balance_loss_clip": 1.02396417, + "balance_loss_mlp": 1.03592634, + "epoch": 0.7484743724635503, + "flos": 22887737631360.0, + "grad_norm": 1.681801443430281, + "language_loss": 0.73694456, + "learning_rate": 6.276216478918543e-07, + "loss": 0.75834262, + "num_input_tokens_seen": 268467250, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12449, + "time_per_iteration": 2.458009958267212 + }, + { + "auxiliary_loss_clip": 0.01107992, + "auxiliary_loss_mlp": 0.0103785, + "balance_loss_clip": 1.02537513, + "balance_loss_mlp": 1.03796268, + "epoch": 0.7485344957162182, + "flos": 25300288563840.0, + "grad_norm": 4.253717763971936, + "language_loss": 0.6114825, + "learning_rate": 6.273383704774225e-07, + "loss": 0.63294089, + "num_input_tokens_seen": 268487270, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 12450, + "time_per_iteration": 2.50168776512146 + }, + { + "auxiliary_loss_clip": 0.01095887, + "auxiliary_loss_mlp": 0.01024791, + "balance_loss_clip": 1.01391912, + "balance_loss_mlp": 1.03296888, + "epoch": 0.7485946189688862, + "flos": 27053016612480.0, + "grad_norm": 2.2078562652579445, + "language_loss": 0.70491904, + "learning_rate": 6.270551451144577e-07, + "loss": 0.72612584, + "num_input_tokens_seen": 268508020, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 12451, + "time_per_iteration": 2.4641804695129395 + }, + { + "auxiliary_loss_clip": 0.0110528, + "auxiliary_loss_mlp": 0.01029724, + "balance_loss_clip": 1.01757646, + "balance_loss_mlp": 1.03587961, + "epoch": 0.7486547422215541, + "flos": 26906752431360.0, + "grad_norm": 1.9404174586148812, + "language_loss": 0.80036032, + "learning_rate": 6.267719718136988e-07, + "loss": 0.82171035, + "num_input_tokens_seen": 268527375, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 12452, + "time_per_iteration": 2.472050189971924 + }, + { + "auxiliary_loss_clip": 0.0111029, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.0199945, + "balance_loss_mlp": 1.03968, + "epoch": 0.7487148654742222, + "flos": 22346277039360.0, + "grad_norm": 1.9353512881851993, + "language_loss": 0.71305573, + "learning_rate": 6.264888505858843e-07, + "loss": 0.73448426, + "num_input_tokens_seen": 268544870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 12453, + "time_per_iteration": 2.4257922172546387 + }, + { + "auxiliary_loss_clip": 0.01104414, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.01754212, + "balance_loss_mlp": 1.03703308, + "epoch": 0.7487749887268901, + "flos": 23038814234880.0, + "grad_norm": 1.4891462542899447, + "language_loss": 0.74149597, + "learning_rate": 6.262057814417517e-07, + "loss": 0.76283646, + "num_input_tokens_seen": 268564580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12454, + "time_per_iteration": 2.468405246734619 + }, + { + "auxiliary_loss_clip": 0.01025662, + "auxiliary_loss_mlp": 0.01006028, + "balance_loss_clip": 1.00498486, + "balance_loss_mlp": 1.00516868, + "epoch": 0.7488351119795581, + "flos": 71525294536320.0, + "grad_norm": 0.7310384566009501, + "language_loss": 0.59401155, + "learning_rate": 6.259227643920322e-07, + "loss": 0.61432838, + "num_input_tokens_seen": 268629550, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12455, + "time_per_iteration": 3.1971945762634277 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01029187, + "balance_loss_clip": 1.01709366, + "balance_loss_mlp": 1.03604209, + "epoch": 0.748895235232226, + "flos": 17196255722880.0, + "grad_norm": 4.934936184310134, + "language_loss": 0.79615253, + "learning_rate": 6.256397994474592e-07, + "loss": 0.81745934, + "num_input_tokens_seen": 268646645, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 12456, + "time_per_iteration": 2.4296135902404785 + }, + { + "auxiliary_loss_clip": 0.0102509, + "auxiliary_loss_mlp": 0.01006564, + "balance_loss_clip": 1.00547349, + "balance_loss_mlp": 1.00471401, + "epoch": 0.748955358484894, + "flos": 58979256336000.0, + "grad_norm": 0.83989134398578, + "language_loss": 0.61468804, + "learning_rate": 6.25356886618763e-07, + "loss": 0.63500464, + "num_input_tokens_seen": 268702275, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20410156, + "step": 12457, + "time_per_iteration": 2.974139928817749 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01888442, + "balance_loss_mlp": 1.03782308, + "epoch": 0.749015481737562, + "flos": 11360413054080.0, + "grad_norm": 2.8899809171397686, + "language_loss": 0.6718834, + "learning_rate": 6.250740259166711e-07, + "loss": 0.69325089, + "num_input_tokens_seen": 268716265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 12458, + "time_per_iteration": 2.439760684967041 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.01029117, + "balance_loss_clip": 1.01747072, + "balance_loss_mlp": 1.03471386, + "epoch": 0.74907560499023, + "flos": 21106497162240.0, + "grad_norm": 4.815239058798898, + "language_loss": 0.79521596, + "learning_rate": 6.247912173519106e-07, + "loss": 0.81650984, + "num_input_tokens_seen": 268734330, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 12459, + "time_per_iteration": 2.4311044216156006 + }, + { + "auxiliary_loss_clip": 0.01099542, + "auxiliary_loss_mlp": 0.01031519, + "balance_loss_clip": 1.01964533, + "balance_loss_mlp": 1.03522098, + "epoch": 0.749135728242898, + "flos": 22268027260800.0, + "grad_norm": 1.5166660138964414, + "language_loss": 0.80542082, + "learning_rate": 6.245084609352043e-07, + "loss": 0.82673144, + "num_input_tokens_seen": 268753500, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 12460, + "time_per_iteration": 2.467636823654175 + }, + { + "auxiliary_loss_clip": 0.01102889, + "auxiliary_loss_mlp": 0.01030071, + "balance_loss_clip": 1.01772666, + "balance_loss_mlp": 1.03595591, + "epoch": 0.7491958514955659, + "flos": 24057527857920.0, + "grad_norm": 1.8187946605999095, + "language_loss": 0.8621248, + "learning_rate": 6.242257566772755e-07, + "loss": 0.88345432, + "num_input_tokens_seen": 268772055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 12461, + "time_per_iteration": 2.445946216583252 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01030727, + "balance_loss_clip": 1.01919913, + "balance_loss_mlp": 1.03504705, + "epoch": 0.7492559747482339, + "flos": 24492118510080.0, + "grad_norm": 4.4069049168427235, + "language_loss": 0.69474328, + "learning_rate": 6.239431045888435e-07, + "loss": 0.71604145, + "num_input_tokens_seen": 268792265, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12462, + "time_per_iteration": 2.4715051651000977 + }, + { + "auxiliary_loss_clip": 0.01101104, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.01925659, + "balance_loss_mlp": 1.03515553, + "epoch": 0.7493160980009018, + "flos": 27745338326400.0, + "grad_norm": 2.161569960012567, + "language_loss": 0.70565915, + "learning_rate": 6.236605046806267e-07, + "loss": 0.72698486, + "num_input_tokens_seen": 268812735, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 12463, + "time_per_iteration": 2.4890224933624268 + }, + { + "auxiliary_loss_clip": 0.01103139, + "auxiliary_loss_mlp": 0.0103002, + "balance_loss_clip": 1.01886845, + "balance_loss_mlp": 1.03757596, + "epoch": 0.7493762212535698, + "flos": 30226190970240.0, + "grad_norm": 1.815437502169393, + "language_loss": 0.77414626, + "learning_rate": 6.233779569633419e-07, + "loss": 0.79547787, + "num_input_tokens_seen": 268833090, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12464, + "time_per_iteration": 2.5218935012817383 + }, + { + "auxiliary_loss_clip": 0.01097602, + "auxiliary_loss_mlp": 0.01026218, + "balance_loss_clip": 1.01478601, + "balance_loss_mlp": 1.03183138, + "epoch": 0.7494363445062378, + "flos": 21944472526080.0, + "grad_norm": 1.6191901183341268, + "language_loss": 0.78242761, + "learning_rate": 6.230954614477034e-07, + "loss": 0.80366582, + "num_input_tokens_seen": 268851880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12465, + "time_per_iteration": 2.438852071762085 + }, + { + "auxiliary_loss_clip": 0.0111071, + "auxiliary_loss_mlp": 0.01034499, + "balance_loss_clip": 1.02102232, + "balance_loss_mlp": 1.03788424, + "epoch": 0.7494964677589058, + "flos": 12490342162560.0, + "grad_norm": 2.367319558994289, + "language_loss": 0.73263687, + "learning_rate": 6.22813018144422e-07, + "loss": 0.75408894, + "num_input_tokens_seen": 268867910, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7265625, + "step": 12466, + "time_per_iteration": 2.4159023761749268 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01032506, + "balance_loss_clip": 1.02088344, + "balance_loss_mlp": 1.03596592, + "epoch": 0.7495565910115737, + "flos": 21653057485440.0, + "grad_norm": 2.1301146092024004, + "language_loss": 0.66439664, + "learning_rate": 6.22530627064209e-07, + "loss": 0.6857549, + "num_input_tokens_seen": 268887260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12467, + "time_per_iteration": 2.476149320602417 + }, + { + "auxiliary_loss_clip": 0.0110248, + "auxiliary_loss_mlp": 0.01031367, + "balance_loss_clip": 1.01941586, + "balance_loss_mlp": 1.03570294, + "epoch": 0.7496167142642417, + "flos": 15268535591040.0, + "grad_norm": 2.3152910875520982, + "language_loss": 0.76111352, + "learning_rate": 6.222482882177735e-07, + "loss": 0.78245205, + "num_input_tokens_seen": 268902520, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12468, + "time_per_iteration": 2.536062717437744 + }, + { + "auxiliary_loss_clip": 0.01101389, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01648641, + "balance_loss_mlp": 1.03586531, + "epoch": 0.7496768375169096, + "flos": 22054933825920.0, + "grad_norm": 2.6980590171523238, + "language_loss": 0.69451874, + "learning_rate": 6.219660016158201e-07, + "loss": 0.71582359, + "num_input_tokens_seen": 268920970, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 12469, + "time_per_iteration": 3.8304295539855957 + }, + { + "auxiliary_loss_clip": 0.01102636, + "auxiliary_loss_mlp": 0.0103201, + "balance_loss_clip": 1.01960647, + "balance_loss_mlp": 1.03584695, + "epoch": 0.7497369607695776, + "flos": 19057038860160.0, + "grad_norm": 1.8066582872371235, + "language_loss": 0.68950933, + "learning_rate": 6.216837672690543e-07, + "loss": 0.71085578, + "num_input_tokens_seen": 268936600, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 12470, + "time_per_iteration": 2.478144645690918 + }, + { + "auxiliary_loss_clip": 0.01105048, + "auxiliary_loss_mlp": 0.01031368, + "balance_loss_clip": 1.0178256, + "balance_loss_mlp": 1.03487074, + "epoch": 0.7497970840222457, + "flos": 21617434172160.0, + "grad_norm": 2.8963816737460606, + "language_loss": 0.74823713, + "learning_rate": 6.214015851881793e-07, + "loss": 0.76960123, + "num_input_tokens_seen": 268956560, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69921875, + "step": 12471, + "time_per_iteration": 3.9513978958129883 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.01027975, + "balance_loss_clip": 1.01577377, + "balance_loss_mlp": 1.03611624, + "epoch": 0.7498572072749136, + "flos": 13735580906880.0, + "grad_norm": 1.9482854997068855, + "language_loss": 0.76652914, + "learning_rate": 6.211194553838929e-07, + "loss": 0.78784305, + "num_input_tokens_seen": 268973945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12472, + "time_per_iteration": 3.9247841835021973 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.01829672, + "balance_loss_mlp": 1.03378856, + "epoch": 0.7499173305275816, + "flos": 22966526113920.0, + "grad_norm": 1.4581749540086286, + "language_loss": 0.84420872, + "learning_rate": 6.208373778668951e-07, + "loss": 0.86550772, + "num_input_tokens_seen": 268993245, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 12473, + "time_per_iteration": 2.460721492767334 + }, + { + "auxiliary_loss_clip": 0.01107651, + "auxiliary_loss_mlp": 0.01034983, + "balance_loss_clip": 1.0219121, + "balance_loss_mlp": 1.03714895, + "epoch": 0.7499774537802495, + "flos": 22740467869440.0, + "grad_norm": 1.9225859728755545, + "language_loss": 0.73670536, + "learning_rate": 6.205553526478829e-07, + "loss": 0.75813174, + "num_input_tokens_seen": 269012125, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 12474, + "time_per_iteration": 3.8605411052703857 + }, + { + "auxiliary_loss_clip": 0.01106384, + "auxiliary_loss_mlp": 0.01033692, + "balance_loss_clip": 1.02101982, + "balance_loss_mlp": 1.03537238, + "epoch": 0.7500375770329175, + "flos": 18296559089280.0, + "grad_norm": 1.6563775017925497, + "language_loss": 0.74591839, + "learning_rate": 6.202733797375492e-07, + "loss": 0.7673192, + "num_input_tokens_seen": 269030545, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 12475, + "time_per_iteration": 2.42132830619812 + }, + { + "auxiliary_loss_clip": 0.01108313, + "auxiliary_loss_mlp": 0.01039073, + "balance_loss_clip": 1.02606773, + "balance_loss_mlp": 1.0368228, + "epoch": 0.7500977002855854, + "flos": 19169978198400.0, + "grad_norm": 3.53790302868858, + "language_loss": 0.80186552, + "learning_rate": 6.199914591465878e-07, + "loss": 0.82333934, + "num_input_tokens_seen": 269048180, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.71484375, + "step": 12476, + "time_per_iteration": 2.4238805770874023 + }, + { + "auxiliary_loss_clip": 0.01101438, + "auxiliary_loss_mlp": 0.01030238, + "balance_loss_clip": 1.01843047, + "balance_loss_mlp": 1.03465772, + "epoch": 0.7501578235382534, + "flos": 22163886754560.0, + "grad_norm": 1.8885808312532115, + "language_loss": 0.77860969, + "learning_rate": 6.19709590885688e-07, + "loss": 0.79992652, + "num_input_tokens_seen": 269068600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12477, + "time_per_iteration": 2.4582700729370117 + }, + { + "auxiliary_loss_clip": 0.01025103, + "auxiliary_loss_mlp": 0.01003277, + "balance_loss_clip": 1.00223351, + "balance_loss_mlp": 1.00471592, + "epoch": 0.7502179467909214, + "flos": 64465040033280.0, + "grad_norm": 0.8084596961185327, + "language_loss": 0.54396832, + "learning_rate": 6.194277749655394e-07, + "loss": 0.56425214, + "num_input_tokens_seen": 269119045, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20410156, + "step": 12478, + "time_per_iteration": 3.0614583492279053 + }, + { + "auxiliary_loss_clip": 0.01100592, + "auxiliary_loss_mlp": 0.01032025, + "balance_loss_clip": 1.02021098, + "balance_loss_mlp": 1.0357542, + "epoch": 0.7502780700435894, + "flos": 20478275268480.0, + "grad_norm": 1.732296797104268, + "language_loss": 0.80400872, + "learning_rate": 6.191460113968272e-07, + "loss": 0.82533485, + "num_input_tokens_seen": 269136755, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12479, + "time_per_iteration": 2.4517574310302734 + }, + { + "auxiliary_loss_clip": 0.01106016, + "auxiliary_loss_mlp": 0.01035802, + "balance_loss_clip": 1.0231421, + "balance_loss_mlp": 1.03617644, + "epoch": 0.7503381932962573, + "flos": 20445273648000.0, + "grad_norm": 2.2068384473951386, + "language_loss": 0.62537003, + "learning_rate": 6.188643001902369e-07, + "loss": 0.64678824, + "num_input_tokens_seen": 269156120, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 12480, + "time_per_iteration": 2.464008092880249 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01033195, + "balance_loss_clip": 1.02187634, + "balance_loss_mlp": 1.03546023, + "epoch": 0.7503983165489253, + "flos": 22381936266240.0, + "grad_norm": 1.8758461375908144, + "language_loss": 0.77756959, + "learning_rate": 6.185826413564512e-07, + "loss": 0.79889536, + "num_input_tokens_seen": 269175650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12481, + "time_per_iteration": 2.457960367202759 + }, + { + "auxiliary_loss_clip": 0.01103553, + "auxiliary_loss_mlp": 0.01030844, + "balance_loss_clip": 1.01870799, + "balance_loss_mlp": 1.03513408, + "epoch": 0.7504584398015932, + "flos": 24899453717760.0, + "grad_norm": 1.6027939437318084, + "language_loss": 0.70975888, + "learning_rate": 6.183010349061501e-07, + "loss": 0.73110282, + "num_input_tokens_seen": 269197080, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12482, + "time_per_iteration": 2.505486011505127 + }, + { + "auxiliary_loss_clip": 0.01103914, + "auxiliary_loss_mlp": 0.01031643, + "balance_loss_clip": 1.02004409, + "balance_loss_mlp": 1.03608012, + "epoch": 0.7505185630542612, + "flos": 25885237547520.0, + "grad_norm": 1.6593432935882615, + "language_loss": 0.70126545, + "learning_rate": 6.180194808500118e-07, + "loss": 0.72262096, + "num_input_tokens_seen": 269218600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12483, + "time_per_iteration": 2.5372493267059326 + }, + { + "auxiliary_loss_clip": 0.01101463, + "auxiliary_loss_mlp": 0.01025502, + "balance_loss_clip": 1.01488626, + "balance_loss_mlp": 1.03527784, + "epoch": 0.7505786863069293, + "flos": 23143852581120.0, + "grad_norm": 1.8314217473162897, + "language_loss": 0.74355495, + "learning_rate": 6.177379791987131e-07, + "loss": 0.76482463, + "num_input_tokens_seen": 269239245, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6640625, + "step": 12484, + "time_per_iteration": 2.482421636581421 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01026576, + "balance_loss_clip": 1.01453543, + "balance_loss_mlp": 1.03498316, + "epoch": 0.7506388095595972, + "flos": 16983377769600.0, + "grad_norm": 2.0535325266367153, + "language_loss": 0.84864926, + "learning_rate": 6.174565299629295e-07, + "loss": 0.86992133, + "num_input_tokens_seen": 269258520, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 12485, + "time_per_iteration": 2.446956157684326 + }, + { + "auxiliary_loss_clip": 0.0110043, + "auxiliary_loss_mlp": 0.01030894, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03467631, + "epoch": 0.7506989328122652, + "flos": 22344984149760.0, + "grad_norm": 1.4660860594284646, + "language_loss": 0.77995837, + "learning_rate": 6.171751331533323e-07, + "loss": 0.80127156, + "num_input_tokens_seen": 269278320, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12486, + "time_per_iteration": 2.517058849334717 + }, + { + "auxiliary_loss_clip": 0.01106308, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01999187, + "balance_loss_mlp": 1.03714168, + "epoch": 0.7507590560649331, + "flos": 25776069137280.0, + "grad_norm": 1.8190391114760833, + "language_loss": 0.72836137, + "learning_rate": 6.168937887805932e-07, + "loss": 0.74975049, + "num_input_tokens_seen": 269298025, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 12487, + "time_per_iteration": 2.5011062622070312 + }, + { + "auxiliary_loss_clip": 0.01102568, + "auxiliary_loss_mlp": 0.01029737, + "balance_loss_clip": 1.01789975, + "balance_loss_mlp": 1.03470707, + "epoch": 0.7508191793176011, + "flos": 24279420124800.0, + "grad_norm": 1.9101645594404746, + "language_loss": 0.67258334, + "learning_rate": 6.166124968553801e-07, + "loss": 0.69390637, + "num_input_tokens_seen": 269316770, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12488, + "time_per_iteration": 2.4733595848083496 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01030971, + "balance_loss_clip": 1.01859736, + "balance_loss_mlp": 1.03543329, + "epoch": 0.750879302570269, + "flos": 19899575251200.0, + "grad_norm": 1.5954829957007908, + "language_loss": 0.77207714, + "learning_rate": 6.163312573883592e-07, + "loss": 0.79340684, + "num_input_tokens_seen": 269334755, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 12489, + "time_per_iteration": 2.41869854927063 + }, + { + "auxiliary_loss_clip": 0.01100051, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.01907802, + "balance_loss_mlp": 1.03533888, + "epoch": 0.750939425822937, + "flos": 29205681667200.0, + "grad_norm": 1.8920646114871729, + "language_loss": 0.75356829, + "learning_rate": 6.160500703901956e-07, + "loss": 0.77486563, + "num_input_tokens_seen": 269353810, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 12490, + "time_per_iteration": 2.530346155166626 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01029526, + "balance_loss_clip": 1.01762867, + "balance_loss_mlp": 1.03632128, + "epoch": 0.750999549075605, + "flos": 21142300043520.0, + "grad_norm": 1.6040694673861557, + "language_loss": 0.78232539, + "learning_rate": 6.157689358715527e-07, + "loss": 0.8036443, + "num_input_tokens_seen": 269372910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12491, + "time_per_iteration": 2.445436954498291 + }, + { + "auxiliary_loss_clip": 0.01097554, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01876545, + "balance_loss_mlp": 1.03334594, + "epoch": 0.751059672328273, + "flos": 23547740083200.0, + "grad_norm": 2.0707908886127813, + "language_loss": 0.76477361, + "learning_rate": 6.154878538430899e-07, + "loss": 0.7860415, + "num_input_tokens_seen": 269391545, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 12492, + "time_per_iteration": 2.4592933654785156 + }, + { + "auxiliary_loss_clip": 0.01098246, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01735926, + "balance_loss_mlp": 1.03225935, + "epoch": 0.7511197955809409, + "flos": 18989742729600.0, + "grad_norm": 2.019943812075004, + "language_loss": 0.71320605, + "learning_rate": 6.152068243154671e-07, + "loss": 0.73447198, + "num_input_tokens_seen": 269408530, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 12493, + "time_per_iteration": 2.420647621154785 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01421213, + "balance_loss_mlp": 1.03696609, + "epoch": 0.7511799188336089, + "flos": 22046961006720.0, + "grad_norm": 1.620130382276632, + "language_loss": 0.80576169, + "learning_rate": 6.149258472993395e-07, + "loss": 0.82705963, + "num_input_tokens_seen": 269425930, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12494, + "time_per_iteration": 2.4511101245880127 + }, + { + "auxiliary_loss_clip": 0.01102931, + "auxiliary_loss_mlp": 0.01029107, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03543544, + "epoch": 0.7512400420862768, + "flos": 16467125546880.0, + "grad_norm": 2.1793596151447208, + "language_loss": 0.78629243, + "learning_rate": 6.146449228053634e-07, + "loss": 0.80761278, + "num_input_tokens_seen": 269443945, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12495, + "time_per_iteration": 2.4220409393310547 + }, + { + "auxiliary_loss_clip": 0.01101733, + "auxiliary_loss_mlp": 0.0103546, + "balance_loss_clip": 1.02359903, + "balance_loss_mlp": 1.0354476, + "epoch": 0.7513001653389448, + "flos": 20448326304000.0, + "grad_norm": 2.0360130649256183, + "language_loss": 0.70592833, + "learning_rate": 6.143640508441898e-07, + "loss": 0.72730023, + "num_input_tokens_seen": 269463625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12496, + "time_per_iteration": 2.4752755165100098 + }, + { + "auxiliary_loss_clip": 0.01102064, + "auxiliary_loss_mlp": 0.01030349, + "balance_loss_clip": 1.01929259, + "balance_loss_mlp": 1.03579581, + "epoch": 0.7513602885916129, + "flos": 23476816679040.0, + "grad_norm": 1.644722371980129, + "language_loss": 0.77970195, + "learning_rate": 6.140832314264705e-07, + "loss": 0.80102611, + "num_input_tokens_seen": 269483415, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 12497, + "time_per_iteration": 2.4557857513427734 + }, + { + "auxiliary_loss_clip": 0.01102933, + "auxiliary_loss_mlp": 0.01033562, + "balance_loss_clip": 1.02162886, + "balance_loss_mlp": 1.03516352, + "epoch": 0.7514204118442808, + "flos": 26797224885120.0, + "grad_norm": 1.5625953994029207, + "language_loss": 0.7667886, + "learning_rate": 6.13802464562855e-07, + "loss": 0.78815353, + "num_input_tokens_seen": 269504635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12498, + "time_per_iteration": 2.4923367500305176 + }, + { + "auxiliary_loss_clip": 0.01100471, + "auxiliary_loss_mlp": 0.01031296, + "balance_loss_clip": 1.02007866, + "balance_loss_mlp": 1.03681421, + "epoch": 0.7514805350969488, + "flos": 19865639877120.0, + "grad_norm": 1.712775881225065, + "language_loss": 0.74015152, + "learning_rate": 6.135217502639878e-07, + "loss": 0.76146924, + "num_input_tokens_seen": 269523955, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 12499, + "time_per_iteration": 2.42573618888855 + }, + { + "auxiliary_loss_clip": 0.0109767, + "auxiliary_loss_mlp": 0.01027517, + "balance_loss_clip": 1.01657331, + "balance_loss_mlp": 1.03243327, + "epoch": 0.7515406583496167, + "flos": 24571553437440.0, + "grad_norm": 2.6175707927072787, + "language_loss": 0.7927863, + "learning_rate": 6.132410885405148e-07, + "loss": 0.81403816, + "num_input_tokens_seen": 269544410, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 12500, + "time_per_iteration": 2.4984662532806396 + }, + { + "auxiliary_loss_clip": 0.01109495, + "auxiliary_loss_mlp": 0.01033563, + "balance_loss_clip": 1.01993728, + "balance_loss_mlp": 1.03732872, + "epoch": 0.7516007816022847, + "flos": 20120246455680.0, + "grad_norm": 2.410232320418393, + "language_loss": 0.73039198, + "learning_rate": 6.129604794030794e-07, + "loss": 0.75182259, + "num_input_tokens_seen": 269563315, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.72265625, + "step": 12501, + "time_per_iteration": 2.4204771518707275 + }, + { + "auxiliary_loss_clip": 0.01098599, + "auxiliary_loss_mlp": 0.01025182, + "balance_loss_clip": 1.01324964, + "balance_loss_mlp": 1.0327723, + "epoch": 0.7516609048549526, + "flos": 22784638619520.0, + "grad_norm": 1.6630444702124707, + "language_loss": 0.7825129, + "learning_rate": 6.126799228623207e-07, + "loss": 0.80375075, + "num_input_tokens_seen": 269583950, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12502, + "time_per_iteration": 2.4997878074645996 + }, + { + "auxiliary_loss_clip": 0.01102781, + "auxiliary_loss_mlp": 0.01032155, + "balance_loss_clip": 1.01995397, + "balance_loss_mlp": 1.03561115, + "epoch": 0.7517210281076206, + "flos": 10634012311680.0, + "grad_norm": 2.7088747693103663, + "language_loss": 0.70608878, + "learning_rate": 6.123994189288786e-07, + "loss": 0.72743809, + "num_input_tokens_seen": 269600120, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12503, + "time_per_iteration": 2.407897472381592 + }, + { + "auxiliary_loss_clip": 0.01024599, + "auxiliary_loss_mlp": 0.01000364, + "balance_loss_clip": 0.99929094, + "balance_loss_mlp": 1.00410652, + "epoch": 0.7517811513602886, + "flos": 66052221275520.0, + "grad_norm": 0.9807627668089319, + "language_loss": 0.63942432, + "learning_rate": 6.121189676133903e-07, + "loss": 0.65967393, + "num_input_tokens_seen": 269659815, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20507812, + "step": 12504, + "time_per_iteration": 2.995584726333618 + }, + { + "auxiliary_loss_clip": 0.01096816, + "auxiliary_loss_mlp": 0.01029954, + "balance_loss_clip": 1.01842678, + "balance_loss_mlp": 1.03316665, + "epoch": 0.7518412746129566, + "flos": 37268345018880.0, + "grad_norm": 2.135704139669575, + "language_loss": 0.68474889, + "learning_rate": 6.118385689264896e-07, + "loss": 0.70601666, + "num_input_tokens_seen": 269684565, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 12505, + "time_per_iteration": 2.5871872901916504 + }, + { + "auxiliary_loss_clip": 0.01024908, + "auxiliary_loss_mlp": 0.00998595, + "balance_loss_clip": 0.99765915, + "balance_loss_mlp": 1.00445008, + "epoch": 0.7519013978656245, + "flos": 60518567727360.0, + "grad_norm": 0.6625472273588794, + "language_loss": 0.5508914, + "learning_rate": 6.11558222878809e-07, + "loss": 0.57112646, + "num_input_tokens_seen": 269752325, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20507812, + "step": 12506, + "time_per_iteration": 3.1377921104431152 + }, + { + "auxiliary_loss_clip": 0.01105218, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02154922, + "balance_loss_mlp": 1.03739369, + "epoch": 0.7519615211182925, + "flos": 18806885568000.0, + "grad_norm": 2.061903152831647, + "language_loss": 0.78302479, + "learning_rate": 6.112779294809796e-07, + "loss": 0.80440837, + "num_input_tokens_seen": 269770630, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 12507, + "time_per_iteration": 2.4135823249816895 + }, + { + "auxiliary_loss_clip": 0.01100841, + "auxiliary_loss_mlp": 0.01029111, + "balance_loss_clip": 1.01779842, + "balance_loss_mlp": 1.03669238, + "epoch": 0.7520216443709604, + "flos": 14575244209920.0, + "grad_norm": 1.6731769986850884, + "language_loss": 0.71181047, + "learning_rate": 6.10997688743631e-07, + "loss": 0.73311001, + "num_input_tokens_seen": 269787280, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12508, + "time_per_iteration": 2.4572551250457764 + }, + { + "auxiliary_loss_clip": 0.01099119, + "auxiliary_loss_mlp": 0.01028584, + "balance_loss_clip": 1.01687193, + "balance_loss_mlp": 1.03434396, + "epoch": 0.7520817676236284, + "flos": 17056599644160.0, + "grad_norm": 1.7139417588852437, + "language_loss": 0.71999872, + "learning_rate": 6.107175006773885e-07, + "loss": 0.74127567, + "num_input_tokens_seen": 269805205, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 12509, + "time_per_iteration": 2.432441473007202 + }, + { + "auxiliary_loss_clip": 0.01107542, + "auxiliary_loss_mlp": 0.01036341, + "balance_loss_clip": 1.02306163, + "balance_loss_mlp": 1.03668177, + "epoch": 0.7521418908762965, + "flos": 25666397936640.0, + "grad_norm": 1.5641902395179517, + "language_loss": 0.61837184, + "learning_rate": 6.104373652928785e-07, + "loss": 0.63981068, + "num_input_tokens_seen": 269824820, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 12510, + "time_per_iteration": 2.483800172805786 + }, + { + "auxiliary_loss_clip": 0.01098004, + "auxiliary_loss_mlp": 0.01030469, + "balance_loss_clip": 1.01876235, + "balance_loss_mlp": 1.03506911, + "epoch": 0.7522020141289644, + "flos": 20886759711360.0, + "grad_norm": 1.6552475399559823, + "language_loss": 0.81871247, + "learning_rate": 6.10157282600722e-07, + "loss": 0.83999723, + "num_input_tokens_seen": 269842825, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 12511, + "time_per_iteration": 3.882760524749756 + }, + { + "auxiliary_loss_clip": 0.01105136, + "auxiliary_loss_mlp": 0.01033984, + "balance_loss_clip": 1.02165818, + "balance_loss_mlp": 1.03586888, + "epoch": 0.7522621373816324, + "flos": 12640305444480.0, + "grad_norm": 1.8295208531594718, + "language_loss": 0.7603333, + "learning_rate": 6.098772526115412e-07, + "loss": 0.78172445, + "num_input_tokens_seen": 269859000, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12512, + "time_per_iteration": 2.38800048828125 + }, + { + "auxiliary_loss_clip": 0.01094203, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.01646113, + "balance_loss_mlp": 1.03219318, + "epoch": 0.7523222606343003, + "flos": 25626141768960.0, + "grad_norm": 1.6286622984961852, + "language_loss": 0.82186234, + "learning_rate": 6.095972753359537e-07, + "loss": 0.84307897, + "num_input_tokens_seen": 269878895, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 12513, + "time_per_iteration": 3.94989013671875 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01033273, + "balance_loss_clip": 1.02108955, + "balance_loss_mlp": 1.03550029, + "epoch": 0.7523823838869683, + "flos": 20448900921600.0, + "grad_norm": 1.990000011048308, + "language_loss": 0.75192893, + "learning_rate": 6.093173507845771e-07, + "loss": 0.77329987, + "num_input_tokens_seen": 269897280, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 12514, + "time_per_iteration": 3.8526079654693604 + }, + { + "auxiliary_loss_clip": 0.01096596, + "auxiliary_loss_mlp": 0.01029324, + "balance_loss_clip": 1.01869035, + "balance_loss_mlp": 1.03373909, + "epoch": 0.7524425071396362, + "flos": 14720610551040.0, + "grad_norm": 1.7973618299480842, + "language_loss": 0.68311769, + "learning_rate": 6.090374789680271e-07, + "loss": 0.70437688, + "num_input_tokens_seen": 269914640, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 12515, + "time_per_iteration": 2.394958257675171 + }, + { + "auxiliary_loss_clip": 0.01101823, + "auxiliary_loss_mlp": 0.01031956, + "balance_loss_clip": 1.0206188, + "balance_loss_mlp": 1.03523326, + "epoch": 0.7525026303923043, + "flos": 30592048947840.0, + "grad_norm": 2.066116424023424, + "language_loss": 0.70559716, + "learning_rate": 6.087576598969137e-07, + "loss": 0.72693491, + "num_input_tokens_seen": 269934960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12516, + "time_per_iteration": 3.9556925296783447 + }, + { + "auxiliary_loss_clip": 0.01099405, + "auxiliary_loss_mlp": 0.01031087, + "balance_loss_clip": 1.01887429, + "balance_loss_mlp": 1.0354656, + "epoch": 0.7525627536449722, + "flos": 24791757765120.0, + "grad_norm": 2.2302621688638764, + "language_loss": 0.8934896, + "learning_rate": 6.084778935818495e-07, + "loss": 0.91479456, + "num_input_tokens_seen": 269956655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.640625, + "step": 12517, + "time_per_iteration": 2.47792387008667 + }, + { + "auxiliary_loss_clip": 0.01103304, + "auxiliary_loss_mlp": 0.01034057, + "balance_loss_clip": 1.02259493, + "balance_loss_mlp": 1.03562522, + "epoch": 0.7526228768976402, + "flos": 20779782030720.0, + "grad_norm": 1.6178525628265004, + "language_loss": 0.74129748, + "learning_rate": 6.081981800334437e-07, + "loss": 0.76267111, + "num_input_tokens_seen": 269976835, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.67578125, + "step": 12518, + "time_per_iteration": 2.462576150894165 + }, + { + "auxiliary_loss_clip": 0.01024303, + "auxiliary_loss_mlp": 0.00999013, + "balance_loss_clip": 0.99803591, + "balance_loss_mlp": 1.00396061, + "epoch": 0.7526830001503081, + "flos": 66559243703040.0, + "grad_norm": 0.7063379492670796, + "language_loss": 0.55728912, + "learning_rate": 6.079185192623017e-07, + "loss": 0.57752228, + "num_input_tokens_seen": 270040630, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 12519, + "time_per_iteration": 3.1375198364257812 + }, + { + "auxiliary_loss_clip": 0.01099253, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.02270663, + "balance_loss_mlp": 1.03384554, + "epoch": 0.7527431234029761, + "flos": 23477894087040.0, + "grad_norm": 1.4310986441379439, + "language_loss": 0.7804352, + "learning_rate": 6.07638911279029e-07, + "loss": 0.80175972, + "num_input_tokens_seen": 270059695, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.65625, + "step": 12520, + "time_per_iteration": 2.456511974334717 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01034351, + "balance_loss_clip": 1.02335954, + "balance_loss_mlp": 1.03329098, + "epoch": 0.752803246655644, + "flos": 22049546785920.0, + "grad_norm": 4.550524012485904, + "language_loss": 0.74427485, + "learning_rate": 6.07359356094229e-07, + "loss": 0.76560634, + "num_input_tokens_seen": 270078420, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 12521, + "time_per_iteration": 2.492000102996826 + }, + { + "auxiliary_loss_clip": 0.01106943, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01836157, + "balance_loss_mlp": 1.03684282, + "epoch": 0.752863369908312, + "flos": 30153795108480.0, + "grad_norm": 1.9335055849585505, + "language_loss": 0.67128062, + "learning_rate": 6.070798537185016e-07, + "loss": 0.6926614, + "num_input_tokens_seen": 270097040, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 12522, + "time_per_iteration": 2.4961695671081543 + }, + { + "auxiliary_loss_clip": 0.01105031, + "auxiliary_loss_mlp": 0.01035216, + "balance_loss_clip": 1.02371216, + "balance_loss_mlp": 1.03653431, + "epoch": 0.7529234931609801, + "flos": 24567638855040.0, + "grad_norm": 2.7941692603753565, + "language_loss": 0.78211427, + "learning_rate": 6.068004041624453e-07, + "loss": 0.80351675, + "num_input_tokens_seen": 270116365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 12523, + "time_per_iteration": 2.507122039794922 + }, + { + "auxiliary_loss_clip": 0.01100044, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01675415, + "balance_loss_mlp": 1.03509927, + "epoch": 0.752983616413648, + "flos": 23112395245440.0, + "grad_norm": 2.0548195739736603, + "language_loss": 0.80642009, + "learning_rate": 6.065210074366571e-07, + "loss": 0.82770348, + "num_input_tokens_seen": 270135395, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 12524, + "time_per_iteration": 2.470827579498291 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.01935172, + "balance_loss_mlp": 1.0362587, + "epoch": 0.753043739666316, + "flos": 24316946858880.0, + "grad_norm": 1.5342669106186173, + "language_loss": 0.7387985, + "learning_rate": 6.062416635517326e-07, + "loss": 0.76010329, + "num_input_tokens_seen": 270156425, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12525, + "time_per_iteration": 2.506251335144043 + }, + { + "auxiliary_loss_clip": 0.01100462, + "auxiliary_loss_mlp": 0.01028627, + "balance_loss_clip": 1.01732588, + "balance_loss_mlp": 1.03503311, + "epoch": 0.7531038629189839, + "flos": 24243294021120.0, + "grad_norm": 1.881783434485301, + "language_loss": 0.71693766, + "learning_rate": 6.059623725182641e-07, + "loss": 0.73822856, + "num_input_tokens_seen": 270176905, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12526, + "time_per_iteration": 2.4697048664093018 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01025174, + "balance_loss_clip": 1.01402175, + "balance_loss_mlp": 1.0336082, + "epoch": 0.7531639861716519, + "flos": 30188807890560.0, + "grad_norm": 2.5161959473083675, + "language_loss": 0.71867061, + "learning_rate": 6.056831343468414e-07, + "loss": 0.73990887, + "num_input_tokens_seen": 270196640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12527, + "time_per_iteration": 2.544797658920288 + }, + { + "auxiliary_loss_clip": 0.01099923, + "auxiliary_loss_mlp": 0.01025133, + "balance_loss_clip": 1.01430297, + "balance_loss_mlp": 1.03523958, + "epoch": 0.7532241094243198, + "flos": 18223193560320.0, + "grad_norm": 1.8815008802332143, + "language_loss": 0.80829144, + "learning_rate": 6.054039490480539e-07, + "loss": 0.82954198, + "num_input_tokens_seen": 270213905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12528, + "time_per_iteration": 2.4095561504364014 + }, + { + "auxiliary_loss_clip": 0.01100721, + "auxiliary_loss_mlp": 0.01033151, + "balance_loss_clip": 1.0207355, + "balance_loss_mlp": 1.03425789, + "epoch": 0.7532842326769879, + "flos": 20881049448960.0, + "grad_norm": 1.941676529480235, + "language_loss": 0.84620762, + "learning_rate": 6.051248166324892e-07, + "loss": 0.86754632, + "num_input_tokens_seen": 270231995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 12529, + "time_per_iteration": 2.4949631690979004 + }, + { + "auxiliary_loss_clip": 0.01105602, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.0194391, + "balance_loss_mlp": 1.03682232, + "epoch": 0.7533443559296558, + "flos": 18078689145600.0, + "grad_norm": 1.741456594396521, + "language_loss": 0.73868054, + "learning_rate": 6.048457371107303e-07, + "loss": 0.76005387, + "num_input_tokens_seen": 270251480, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 12530, + "time_per_iteration": 2.414186954498291 + }, + { + "auxiliary_loss_clip": 0.01024012, + "auxiliary_loss_mlp": 0.01001757, + "balance_loss_clip": 1.00077367, + "balance_loss_mlp": 1.00382376, + "epoch": 0.7534044791823238, + "flos": 50254830766080.0, + "grad_norm": 0.8225360852867398, + "language_loss": 0.63598192, + "learning_rate": 6.045667104933612e-07, + "loss": 0.65623963, + "num_input_tokens_seen": 270306480, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 12531, + "time_per_iteration": 2.9014906883239746 + }, + { + "auxiliary_loss_clip": 0.0110411, + "auxiliary_loss_mlp": 0.0102764, + "balance_loss_clip": 1.0154748, + "balance_loss_mlp": 1.03552723, + "epoch": 0.7534646024349917, + "flos": 20850274471680.0, + "grad_norm": 2.4431425943596876, + "language_loss": 0.69780314, + "learning_rate": 6.042877367909633e-07, + "loss": 0.71912062, + "num_input_tokens_seen": 270324595, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12532, + "time_per_iteration": 2.4260380268096924 + }, + { + "auxiliary_loss_clip": 0.01097275, + "auxiliary_loss_mlp": 0.01028653, + "balance_loss_clip": 1.01846051, + "balance_loss_mlp": 1.03496122, + "epoch": 0.7535247256876597, + "flos": 23071779941760.0, + "grad_norm": 1.5569948577505761, + "language_loss": 0.77583849, + "learning_rate": 6.040088160141132e-07, + "loss": 0.79709774, + "num_input_tokens_seen": 270344375, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62109375, + "step": 12533, + "time_per_iteration": 2.454207181930542 + }, + { + "auxiliary_loss_clip": 0.01024523, + "auxiliary_loss_mlp": 0.01002703, + "balance_loss_clip": 1.00167179, + "balance_loss_mlp": 1.00402224, + "epoch": 0.7535848489403276, + "flos": 58623418252800.0, + "grad_norm": 0.7822513714763298, + "language_loss": 0.57376039, + "learning_rate": 6.037299481733886e-07, + "loss": 0.59403265, + "num_input_tokens_seen": 270405235, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20507812, + "step": 12534, + "time_per_iteration": 3.077544927597046 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.0102723, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03252482, + "epoch": 0.7536449721929956, + "flos": 26577882483840.0, + "grad_norm": 1.4171340268037091, + "language_loss": 0.71380311, + "learning_rate": 6.03451133279365e-07, + "loss": 0.73505425, + "num_input_tokens_seen": 270425820, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12535, + "time_per_iteration": 2.526242971420288 + }, + { + "auxiliary_loss_clip": 0.01100548, + "auxiliary_loss_mlp": 0.01028567, + "balance_loss_clip": 1.01628292, + "balance_loss_mlp": 1.03258336, + "epoch": 0.7537050954456637, + "flos": 25735992537600.0, + "grad_norm": 1.6321998046367074, + "language_loss": 0.80901384, + "learning_rate": 6.031723713426135e-07, + "loss": 0.83030498, + "num_input_tokens_seen": 270447120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 12536, + "time_per_iteration": 2.472864866256714 + }, + { + "auxiliary_loss_clip": 0.01096541, + "auxiliary_loss_mlp": 0.01024408, + "balance_loss_clip": 1.01334548, + "balance_loss_mlp": 1.03342223, + "epoch": 0.7537652186983316, + "flos": 30224431203840.0, + "grad_norm": 1.9374714714672148, + "language_loss": 0.74261057, + "learning_rate": 6.028936623737067e-07, + "loss": 0.76382011, + "num_input_tokens_seen": 270468680, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 12537, + "time_per_iteration": 2.5162243843078613 + }, + { + "auxiliary_loss_clip": 0.01101972, + "auxiliary_loss_mlp": 0.01031017, + "balance_loss_clip": 1.01921487, + "balance_loss_mlp": 1.03531504, + "epoch": 0.7538253419509996, + "flos": 12641239198080.0, + "grad_norm": 1.6037731039814345, + "language_loss": 0.74178267, + "learning_rate": 6.026150063832111e-07, + "loss": 0.76311255, + "num_input_tokens_seen": 270486310, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12538, + "time_per_iteration": 2.3771462440490723 + }, + { + "auxiliary_loss_clip": 0.01102251, + "auxiliary_loss_mlp": 0.01030969, + "balance_loss_clip": 1.01944685, + "balance_loss_mlp": 1.03522778, + "epoch": 0.7538854652036675, + "flos": 23185976256000.0, + "grad_norm": 1.599430575608072, + "language_loss": 0.6738885, + "learning_rate": 6.023364033816956e-07, + "loss": 0.69522071, + "num_input_tokens_seen": 270507210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12539, + "time_per_iteration": 2.4771296977996826 + }, + { + "auxiliary_loss_clip": 0.01099836, + "auxiliary_loss_mlp": 0.01026603, + "balance_loss_clip": 1.01467586, + "balance_loss_mlp": 1.03530288, + "epoch": 0.7539455884563355, + "flos": 23186227651200.0, + "grad_norm": 1.910954039527726, + "language_loss": 0.74824083, + "learning_rate": 6.020578533797229e-07, + "loss": 0.7695052, + "num_input_tokens_seen": 270525250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 12540, + "time_per_iteration": 2.4341037273406982 + }, + { + "auxiliary_loss_clip": 0.01102106, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.01606107, + "balance_loss_mlp": 1.03418863, + "epoch": 0.7540057117090034, + "flos": 13181155505280.0, + "grad_norm": 1.9945629348385325, + "language_loss": 0.72719324, + "learning_rate": 6.017793563878566e-07, + "loss": 0.74849451, + "num_input_tokens_seen": 270539295, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 12541, + "time_per_iteration": 2.393623113632202 + }, + { + "auxiliary_loss_clip": 0.01100227, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01907074, + "balance_loss_mlp": 1.03469777, + "epoch": 0.7540658349616715, + "flos": 45478134478080.0, + "grad_norm": 2.0115318030709277, + "language_loss": 0.72047889, + "learning_rate": 6.015009124166576e-07, + "loss": 0.74178648, + "num_input_tokens_seen": 270562815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 12542, + "time_per_iteration": 2.635145902633667 + }, + { + "auxiliary_loss_clip": 0.01098214, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.01399565, + "balance_loss_mlp": 1.03344584, + "epoch": 0.7541259582143394, + "flos": 19930817105280.0, + "grad_norm": 1.9065173152707051, + "language_loss": 0.84603345, + "learning_rate": 6.012225214766844e-07, + "loss": 0.86726964, + "num_input_tokens_seen": 270579055, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12543, + "time_per_iteration": 2.428612232208252 + }, + { + "auxiliary_loss_clip": 0.01104276, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01840305, + "balance_loss_mlp": 1.03965712, + "epoch": 0.7541860814670074, + "flos": 27198239299200.0, + "grad_norm": 2.1119731634282766, + "language_loss": 0.73896754, + "learning_rate": 6.009441835784927e-07, + "loss": 0.76030856, + "num_input_tokens_seen": 270599080, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 12544, + "time_per_iteration": 2.4670307636260986 + }, + { + "auxiliary_loss_clip": 0.01102346, + "auxiliary_loss_mlp": 0.01029394, + "balance_loss_clip": 1.0182302, + "balance_loss_mlp": 1.03597724, + "epoch": 0.7542462047196753, + "flos": 21324151624320.0, + "grad_norm": 2.101942602107972, + "language_loss": 0.6828922, + "learning_rate": 6.006658987326383e-07, + "loss": 0.70420957, + "num_input_tokens_seen": 270618715, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 12545, + "time_per_iteration": 2.459852933883667 + }, + { + "auxiliary_loss_clip": 0.01100275, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01913524, + "balance_loss_mlp": 1.0337708, + "epoch": 0.7543063279723433, + "flos": 11940944664960.0, + "grad_norm": 1.8429570719628683, + "language_loss": 0.68578523, + "learning_rate": 6.003876669496728e-07, + "loss": 0.70709527, + "num_input_tokens_seen": 270635695, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12546, + "time_per_iteration": 2.420004367828369 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01032925, + "balance_loss_clip": 1.0202651, + "balance_loss_mlp": 1.03451025, + "epoch": 0.7543664512250112, + "flos": 22819974624000.0, + "grad_norm": 2.2369205909253917, + "language_loss": 0.73266494, + "learning_rate": 6.00109488240147e-07, + "loss": 0.75400406, + "num_input_tokens_seen": 270654325, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 12547, + "time_per_iteration": 2.4736859798431396 + }, + { + "auxiliary_loss_clip": 0.01101024, + "auxiliary_loss_mlp": 0.01024925, + "balance_loss_clip": 1.01246762, + "balance_loss_mlp": 1.03465009, + "epoch": 0.7544265744776792, + "flos": 20923855482240.0, + "grad_norm": 1.7870453962384887, + "language_loss": 0.67817152, + "learning_rate": 5.998313626146099e-07, + "loss": 0.699431, + "num_input_tokens_seen": 270674260, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 12548, + "time_per_iteration": 2.443042755126953 + }, + { + "auxiliary_loss_clip": 0.01103041, + "auxiliary_loss_mlp": 0.01032802, + "balance_loss_clip": 1.02093458, + "balance_loss_mlp": 1.03505886, + "epoch": 0.7544866977303473, + "flos": 15195493284480.0, + "grad_norm": 1.7833036384787766, + "language_loss": 0.87229598, + "learning_rate": 5.995532900836088e-07, + "loss": 0.89365441, + "num_input_tokens_seen": 270692200, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12549, + "time_per_iteration": 2.4908969402313232 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01035232, + "balance_loss_clip": 1.02423525, + "balance_loss_mlp": 1.0338217, + "epoch": 0.7545468209830152, + "flos": 27083683848960.0, + "grad_norm": 1.9918391310756007, + "language_loss": 0.76892895, + "learning_rate": 5.992752706576865e-07, + "loss": 0.79025269, + "num_input_tokens_seen": 270709675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 12550, + "time_per_iteration": 2.5220580101013184 + }, + { + "auxiliary_loss_clip": 0.01101116, + "auxiliary_loss_mlp": 0.01025163, + "balance_loss_clip": 1.01411855, + "balance_loss_mlp": 1.0339551, + "epoch": 0.7546069442356832, + "flos": 26871703735680.0, + "grad_norm": 1.4369467492375085, + "language_loss": 0.69346207, + "learning_rate": 5.98997304347386e-07, + "loss": 0.7147249, + "num_input_tokens_seen": 270733055, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 12551, + "time_per_iteration": 2.517190933227539 + }, + { + "auxiliary_loss_clip": 0.0110311, + "auxiliary_loss_mlp": 0.01026388, + "balance_loss_clip": 1.01450872, + "balance_loss_mlp": 1.03722537, + "epoch": 0.7546670674883511, + "flos": 15743131015680.0, + "grad_norm": 1.8744654131641019, + "language_loss": 0.86030054, + "learning_rate": 5.987193911632487e-07, + "loss": 0.88159549, + "num_input_tokens_seen": 270749275, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 12552, + "time_per_iteration": 2.402366876602173 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.0102913, + "balance_loss_clip": 1.01788902, + "balance_loss_mlp": 1.03502691, + "epoch": 0.7547271907410191, + "flos": 23477714519040.0, + "grad_norm": 2.4665346108502533, + "language_loss": 0.78498495, + "learning_rate": 5.98441531115812e-07, + "loss": 0.80629647, + "num_input_tokens_seen": 270768230, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 12553, + "time_per_iteration": 3.900495767593384 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01032799, + "balance_loss_clip": 1.02062142, + "balance_loss_mlp": 1.03664863, + "epoch": 0.754787313993687, + "flos": 31722804069120.0, + "grad_norm": 2.2168137149261518, + "language_loss": 0.62832999, + "learning_rate": 5.981637242156135e-07, + "loss": 0.64969027, + "num_input_tokens_seen": 270786285, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12554, + "time_per_iteration": 2.517960786819458 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.01027707, + "balance_loss_clip": 1.01661491, + "balance_loss_mlp": 1.03377116, + "epoch": 0.7548474372463551, + "flos": 27563055782400.0, + "grad_norm": 1.582375661492136, + "language_loss": 0.73297715, + "learning_rate": 5.978859704731864e-07, + "loss": 0.75425136, + "num_input_tokens_seen": 270805505, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 12555, + "time_per_iteration": 3.861729145050049 + }, + { + "auxiliary_loss_clip": 0.01105045, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.01683104, + "balance_loss_mlp": 1.03707051, + "epoch": 0.754907560499023, + "flos": 19318576763520.0, + "grad_norm": 2.3601676718523956, + "language_loss": 0.78618932, + "learning_rate": 5.976082698990645e-07, + "loss": 0.80752885, + "num_input_tokens_seen": 270824610, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12556, + "time_per_iteration": 3.837012529373169 + }, + { + "auxiliary_loss_clip": 0.01024032, + "auxiliary_loss_mlp": 0.0100246, + "balance_loss_clip": 1.00142884, + "balance_loss_mlp": 1.00368142, + "epoch": 0.754967683751691, + "flos": 69744628684800.0, + "grad_norm": 0.708139285400587, + "language_loss": 0.50455654, + "learning_rate": 5.973306225037769e-07, + "loss": 0.52482152, + "num_input_tokens_seen": 270886155, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 12557, + "time_per_iteration": 4.464947462081909 + }, + { + "auxiliary_loss_clip": 0.01105013, + "auxiliary_loss_mlp": 0.01027804, + "balance_loss_clip": 1.01530528, + "balance_loss_mlp": 1.03735423, + "epoch": 0.7550278070043589, + "flos": 24421913377920.0, + "grad_norm": 1.6820502805276656, + "language_loss": 0.71426684, + "learning_rate": 5.970530282978525e-07, + "loss": 0.73559499, + "num_input_tokens_seen": 270905325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12558, + "time_per_iteration": 2.4628171920776367 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01027296, + "balance_loss_clip": 1.01590014, + "balance_loss_mlp": 1.0340848, + "epoch": 0.7550879302570269, + "flos": 32634611838720.0, + "grad_norm": 1.7073621929136382, + "language_loss": 0.80198216, + "learning_rate": 5.967754872918187e-07, + "loss": 0.82324797, + "num_input_tokens_seen": 270927535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12559, + "time_per_iteration": 2.5296967029571533 + }, + { + "auxiliary_loss_clip": 0.01102511, + "auxiliary_loss_mlp": 0.01027699, + "balance_loss_clip": 1.01577234, + "balance_loss_mlp": 1.03483188, + "epoch": 0.7551480535096948, + "flos": 21795550738560.0, + "grad_norm": 1.6276492932782158, + "language_loss": 0.78893793, + "learning_rate": 5.96497999496199e-07, + "loss": 0.81024003, + "num_input_tokens_seen": 270946920, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12560, + "time_per_iteration": 2.5170834064483643 + }, + { + "auxiliary_loss_clip": 0.01098646, + "auxiliary_loss_mlp": 0.01033819, + "balance_loss_clip": 1.02261329, + "balance_loss_mlp": 1.03458691, + "epoch": 0.7552081767623628, + "flos": 18515111391360.0, + "grad_norm": 1.7060183642703433, + "language_loss": 0.70997584, + "learning_rate": 5.96220564921515e-07, + "loss": 0.73130047, + "num_input_tokens_seen": 270965705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12561, + "time_per_iteration": 2.491224765777588 + }, + { + "auxiliary_loss_clip": 0.01099644, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01652765, + "balance_loss_mlp": 1.03315794, + "epoch": 0.7552683000150308, + "flos": 27634805199360.0, + "grad_norm": 1.5670310978935318, + "language_loss": 0.75664687, + "learning_rate": 5.959431835782889e-07, + "loss": 0.77792597, + "num_input_tokens_seen": 270986550, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12562, + "time_per_iteration": 2.5043649673461914 + }, + { + "auxiliary_loss_clip": 0.0110067, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.01738989, + "balance_loss_mlp": 1.03472567, + "epoch": 0.7553284232676988, + "flos": 20302924049280.0, + "grad_norm": 2.5989481487272426, + "language_loss": 0.75632036, + "learning_rate": 5.956658554770371e-07, + "loss": 0.77762067, + "num_input_tokens_seen": 271006250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12563, + "time_per_iteration": 2.461552143096924 + }, + { + "auxiliary_loss_clip": 0.0110889, + "auxiliary_loss_mlp": 0.01034511, + "balance_loss_clip": 1.02043235, + "balance_loss_mlp": 1.03629291, + "epoch": 0.7553885465203668, + "flos": 33255471444480.0, + "grad_norm": 2.463791742652493, + "language_loss": 0.67465413, + "learning_rate": 5.953885806282768e-07, + "loss": 0.69608808, + "num_input_tokens_seen": 271025575, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7265625, + "step": 12564, + "time_per_iteration": 2.518521785736084 + }, + { + "auxiliary_loss_clip": 0.01104188, + "auxiliary_loss_mlp": 0.01032716, + "balance_loss_clip": 1.02057433, + "balance_loss_mlp": 1.03584766, + "epoch": 0.7554486697730347, + "flos": 21616249023360.0, + "grad_norm": 2.2259446193296943, + "language_loss": 0.68585801, + "learning_rate": 5.951113590425228e-07, + "loss": 0.70722699, + "num_input_tokens_seen": 271045805, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 12565, + "time_per_iteration": 2.473606586456299 + }, + { + "auxiliary_loss_clip": 0.01103455, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.01519513, + "balance_loss_mlp": 1.0340724, + "epoch": 0.7555087930257027, + "flos": 27632973605760.0, + "grad_norm": 1.6339568808166163, + "language_loss": 0.7538799, + "learning_rate": 5.94834190730287e-07, + "loss": 0.77519131, + "num_input_tokens_seen": 271066065, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 12566, + "time_per_iteration": 2.4602677822113037 + }, + { + "auxiliary_loss_clip": 0.01105793, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.01922655, + "balance_loss_mlp": 1.03676701, + "epoch": 0.7555689162783706, + "flos": 23621644316160.0, + "grad_norm": 2.446271815399535, + "language_loss": 0.73930967, + "learning_rate": 5.945570757020789e-07, + "loss": 0.76069355, + "num_input_tokens_seen": 271085870, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 12567, + "time_per_iteration": 2.482639789581299 + }, + { + "auxiliary_loss_clip": 0.01100485, + "auxiliary_loss_mlp": 0.01028822, + "balance_loss_clip": 1.01764655, + "balance_loss_mlp": 1.03495455, + "epoch": 0.7556290395310387, + "flos": 24863076218880.0, + "grad_norm": 1.8407945721596504, + "language_loss": 0.62615836, + "learning_rate": 5.942800139684073e-07, + "loss": 0.6474514, + "num_input_tokens_seen": 271104260, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 12568, + "time_per_iteration": 2.5483205318450928 + }, + { + "auxiliary_loss_clip": 0.01101205, + "auxiliary_loss_mlp": 0.01030606, + "balance_loss_clip": 1.01934648, + "balance_loss_mlp": 1.03582668, + "epoch": 0.7556891627837066, + "flos": 43543770330240.0, + "grad_norm": 1.9963818018777864, + "language_loss": 0.66748881, + "learning_rate": 5.940030055397789e-07, + "loss": 0.68880689, + "num_input_tokens_seen": 271125745, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12569, + "time_per_iteration": 2.659467935562134 + }, + { + "auxiliary_loss_clip": 0.01105651, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.01936173, + "balance_loss_mlp": 1.03600824, + "epoch": 0.7557492860363746, + "flos": 26650924790400.0, + "grad_norm": 1.6607243680943589, + "language_loss": 0.67248321, + "learning_rate": 5.93726050426697e-07, + "loss": 0.69386601, + "num_input_tokens_seen": 271147145, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 12570, + "time_per_iteration": 2.4708566665649414 + }, + { + "auxiliary_loss_clip": 0.01102793, + "auxiliary_loss_mlp": 0.0103299, + "balance_loss_clip": 1.02071762, + "balance_loss_mlp": 1.0357399, + "epoch": 0.7558094092890425, + "flos": 55182885010560.0, + "grad_norm": 1.8220604458329166, + "language_loss": 0.7152952, + "learning_rate": 5.934491486396647e-07, + "loss": 0.73665303, + "num_input_tokens_seen": 271170865, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12571, + "time_per_iteration": 2.7677295207977295 + }, + { + "auxiliary_loss_clip": 0.01102851, + "auxiliary_loss_mlp": 0.01029018, + "balance_loss_clip": 1.01681685, + "balance_loss_mlp": 1.03468394, + "epoch": 0.7558695325417105, + "flos": 23988292392960.0, + "grad_norm": 1.6120967066403376, + "language_loss": 0.73383725, + "learning_rate": 5.931723001891811e-07, + "loss": 0.75515598, + "num_input_tokens_seen": 271191450, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 12572, + "time_per_iteration": 2.457766056060791 + }, + { + "auxiliary_loss_clip": 0.01104212, + "auxiliary_loss_mlp": 0.01032566, + "balance_loss_clip": 1.02069271, + "balance_loss_mlp": 1.03641462, + "epoch": 0.7559296557943784, + "flos": 14611262572800.0, + "grad_norm": 1.9236315061860603, + "language_loss": 0.76293039, + "learning_rate": 5.928955050857456e-07, + "loss": 0.78429818, + "num_input_tokens_seen": 271207335, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12573, + "time_per_iteration": 2.419971466064453 + }, + { + "auxiliary_loss_clip": 0.01105728, + "auxiliary_loss_mlp": 0.01033526, + "balance_loss_clip": 1.02172422, + "balance_loss_mlp": 1.03609872, + "epoch": 0.7559897790470465, + "flos": 18550483309440.0, + "grad_norm": 1.703385006523425, + "language_loss": 0.69107687, + "learning_rate": 5.926187633398527e-07, + "loss": 0.7124694, + "num_input_tokens_seen": 271226895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6953125, + "step": 12574, + "time_per_iteration": 2.4180386066436768 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01033214, + "balance_loss_clip": 1.02082789, + "balance_loss_mlp": 1.03441286, + "epoch": 0.7560499022997144, + "flos": 17967868709760.0, + "grad_norm": 2.2423644939518423, + "language_loss": 0.7207917, + "learning_rate": 5.923420749619974e-07, + "loss": 0.74212122, + "num_input_tokens_seen": 271244375, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 12575, + "time_per_iteration": 2.455258846282959 + }, + { + "auxiliary_loss_clip": 0.01098947, + "auxiliary_loss_mlp": 0.01033415, + "balance_loss_clip": 1.02219176, + "balance_loss_mlp": 1.03365374, + "epoch": 0.7561100255523824, + "flos": 15737815802880.0, + "grad_norm": 2.02730026321769, + "language_loss": 0.72025073, + "learning_rate": 5.92065439962673e-07, + "loss": 0.74157435, + "num_input_tokens_seen": 271259530, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 12576, + "time_per_iteration": 2.4121248722076416 + }, + { + "auxiliary_loss_clip": 0.01101081, + "auxiliary_loss_mlp": 0.01033414, + "balance_loss_clip": 1.0213263, + "balance_loss_mlp": 1.03535473, + "epoch": 0.7561701488050504, + "flos": 15888102307200.0, + "grad_norm": 1.8488663808999763, + "language_loss": 0.67365032, + "learning_rate": 5.917888583523669e-07, + "loss": 0.69499528, + "num_input_tokens_seen": 271276835, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 12577, + "time_per_iteration": 2.4330592155456543 + }, + { + "auxiliary_loss_clip": 0.0110105, + "auxiliary_loss_mlp": 0.01031737, + "balance_loss_clip": 1.02031672, + "balance_loss_mlp": 1.03520989, + "epoch": 0.7562302720577183, + "flos": 20339157893760.0, + "grad_norm": 1.669663040088463, + "language_loss": 0.78626776, + "learning_rate": 5.915123301415685e-07, + "loss": 0.80759561, + "num_input_tokens_seen": 271296275, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 12578, + "time_per_iteration": 2.4133589267730713 + }, + { + "auxiliary_loss_clip": 0.01101874, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.01851618, + "balance_loss_mlp": 1.03413773, + "epoch": 0.7562903953103863, + "flos": 20812209033600.0, + "grad_norm": 1.4105288225039079, + "language_loss": 0.75553155, + "learning_rate": 5.912358553407641e-07, + "loss": 0.77685523, + "num_input_tokens_seen": 271315685, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12579, + "time_per_iteration": 2.465855836868286 + }, + { + "auxiliary_loss_clip": 0.01107073, + "auxiliary_loss_mlp": 0.0103106, + "balance_loss_clip": 1.01840019, + "balance_loss_mlp": 1.03599763, + "epoch": 0.7563505185630542, + "flos": 37596999484800.0, + "grad_norm": 1.9246022226121349, + "language_loss": 0.62678003, + "learning_rate": 5.90959433960437e-07, + "loss": 0.64816135, + "num_input_tokens_seen": 271336790, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 12580, + "time_per_iteration": 2.5613341331481934 + }, + { + "auxiliary_loss_clip": 0.01102863, + "auxiliary_loss_mlp": 0.0103228, + "balance_loss_clip": 1.02075255, + "balance_loss_mlp": 1.03644252, + "epoch": 0.7564106418157223, + "flos": 20230995064320.0, + "grad_norm": 1.7113026290728908, + "language_loss": 0.74942124, + "learning_rate": 5.906830660110691e-07, + "loss": 0.7707727, + "num_input_tokens_seen": 271355470, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12581, + "time_per_iteration": 2.4502360820770264 + }, + { + "auxiliary_loss_clip": 0.01101422, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.01828623, + "balance_loss_mlp": 1.03389621, + "epoch": 0.7564707650683902, + "flos": 24754877475840.0, + "grad_norm": 2.005641504780856, + "language_loss": 0.6295954, + "learning_rate": 5.904067515031412e-07, + "loss": 0.6509093, + "num_input_tokens_seen": 271375810, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 12582, + "time_per_iteration": 2.4572439193725586 + }, + { + "auxiliary_loss_clip": 0.01023883, + "auxiliary_loss_mlp": 0.01000227, + "balance_loss_clip": 0.99921417, + "balance_loss_mlp": 1.00362778, + "epoch": 0.7565308883210582, + "flos": 48530076433920.0, + "grad_norm": 0.9810901823792554, + "language_loss": 0.60704458, + "learning_rate": 5.901304904471307e-07, + "loss": 0.6272856, + "num_input_tokens_seen": 271424775, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.203125, + "step": 12583, + "time_per_iteration": 2.7996931076049805 + }, + { + "auxiliary_loss_clip": 0.01103397, + "auxiliary_loss_mlp": 0.01035964, + "balance_loss_clip": 1.02372694, + "balance_loss_mlp": 1.03601849, + "epoch": 0.7565910115737261, + "flos": 12495082757760.0, + "grad_norm": 2.0250696621760413, + "language_loss": 0.78582263, + "learning_rate": 5.898542828535125e-07, + "loss": 0.80721629, + "num_input_tokens_seen": 271440500, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12584, + "time_per_iteration": 2.400280475616455 + }, + { + "auxiliary_loss_clip": 0.01099872, + "auxiliary_loss_mlp": 0.01027615, + "balance_loss_clip": 1.01651096, + "balance_loss_mlp": 1.03562188, + "epoch": 0.7566511348263941, + "flos": 21173003193600.0, + "grad_norm": 2.69321954136788, + "language_loss": 0.77584487, + "learning_rate": 5.895781287327612e-07, + "loss": 0.79711974, + "num_input_tokens_seen": 271458180, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 12585, + "time_per_iteration": 2.4472086429595947 + }, + { + "auxiliary_loss_clip": 0.01108261, + "auxiliary_loss_mlp": 0.01034728, + "balance_loss_clip": 1.02196097, + "balance_loss_mlp": 1.03907382, + "epoch": 0.756711258079062, + "flos": 21754827694080.0, + "grad_norm": 1.6081546851080741, + "language_loss": 0.82765162, + "learning_rate": 5.893020280953493e-07, + "loss": 0.84908152, + "num_input_tokens_seen": 271475730, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 12586, + "time_per_iteration": 2.4276626110076904 + }, + { + "auxiliary_loss_clip": 0.01106519, + "auxiliary_loss_mlp": 0.01030607, + "balance_loss_clip": 1.01965213, + "balance_loss_mlp": 1.03753841, + "epoch": 0.75677138133173, + "flos": 22382905933440.0, + "grad_norm": 1.873036053279186, + "language_loss": 0.83275306, + "learning_rate": 5.890259809517459e-07, + "loss": 0.85412443, + "num_input_tokens_seen": 271495030, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.69140625, + "step": 12587, + "time_per_iteration": 2.4600062370300293 + }, + { + "auxiliary_loss_clip": 0.01100482, + "auxiliary_loss_mlp": 0.01029229, + "balance_loss_clip": 1.01739788, + "balance_loss_mlp": 1.03461528, + "epoch": 0.756831504584398, + "flos": 22708974620160.0, + "grad_norm": 1.6080398539976855, + "language_loss": 0.71293926, + "learning_rate": 5.88749987312418e-07, + "loss": 0.73423636, + "num_input_tokens_seen": 271515355, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12588, + "time_per_iteration": 2.460441827774048 + }, + { + "auxiliary_loss_clip": 0.01105135, + "auxiliary_loss_mlp": 0.01028941, + "balance_loss_clip": 1.01644826, + "balance_loss_mlp": 1.03631103, + "epoch": 0.756891627837066, + "flos": 24098358643200.0, + "grad_norm": 1.7772907750031848, + "language_loss": 0.68223751, + "learning_rate": 5.884740471878327e-07, + "loss": 0.70357823, + "num_input_tokens_seen": 271535090, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12589, + "time_per_iteration": 2.4796125888824463 + }, + { + "auxiliary_loss_clip": 0.0110204, + "auxiliary_loss_mlp": 0.01029865, + "balance_loss_clip": 1.01805186, + "balance_loss_mlp": 1.0352689, + "epoch": 0.756951751089734, + "flos": 19749001438080.0, + "grad_norm": 2.5553015061472326, + "language_loss": 0.91916406, + "learning_rate": 5.881981605884522e-07, + "loss": 0.94048315, + "num_input_tokens_seen": 271551075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12590, + "time_per_iteration": 2.4198997020721436 + }, + { + "auxiliary_loss_clip": 0.01098826, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01576114, + "balance_loss_mlp": 1.03452909, + "epoch": 0.7570118743424019, + "flos": 35079266551680.0, + "grad_norm": 1.7917701509519888, + "language_loss": 0.65428317, + "learning_rate": 5.879223275247391e-07, + "loss": 0.67554283, + "num_input_tokens_seen": 271571035, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 12591, + "time_per_iteration": 2.56341814994812 + }, + { + "auxiliary_loss_clip": 0.01102228, + "auxiliary_loss_mlp": 0.01026308, + "balance_loss_clip": 1.01525116, + "balance_loss_mlp": 1.03707504, + "epoch": 0.7570719975950699, + "flos": 25594540778880.0, + "grad_norm": 1.511094647527582, + "language_loss": 0.73406184, + "learning_rate": 5.876465480071528e-07, + "loss": 0.75534725, + "num_input_tokens_seen": 271592950, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12592, + "time_per_iteration": 2.474759340286255 + }, + { + "auxiliary_loss_clip": 0.01102216, + "auxiliary_loss_mlp": 0.01035375, + "balance_loss_clip": 1.02331686, + "balance_loss_mlp": 1.03412235, + "epoch": 0.7571321208477378, + "flos": 10816223028480.0, + "grad_norm": 2.1319710484730074, + "language_loss": 0.7111423, + "learning_rate": 5.873708220461522e-07, + "loss": 0.7325182, + "num_input_tokens_seen": 271608835, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12593, + "time_per_iteration": 2.4273533821105957 + }, + { + "auxiliary_loss_clip": 0.01104658, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.01837587, + "balance_loss_mlp": 1.03637433, + "epoch": 0.7571922441004059, + "flos": 18260109763200.0, + "grad_norm": 1.845375608838855, + "language_loss": 0.66037387, + "learning_rate": 5.870951496521903e-07, + "loss": 0.68172151, + "num_input_tokens_seen": 271627730, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 12594, + "time_per_iteration": 3.838972568511963 + }, + { + "auxiliary_loss_clip": 0.01104515, + "auxiliary_loss_mlp": 0.01032453, + "balance_loss_clip": 1.02056789, + "balance_loss_mlp": 1.03537512, + "epoch": 0.7572523673530738, + "flos": 22890502978560.0, + "grad_norm": 1.5603399133411295, + "language_loss": 0.80766582, + "learning_rate": 5.86819530835722e-07, + "loss": 0.82903558, + "num_input_tokens_seen": 271646415, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 12595, + "time_per_iteration": 2.4764091968536377 + }, + { + "auxiliary_loss_clip": 0.01102369, + "auxiliary_loss_mlp": 0.01031107, + "balance_loss_clip": 1.01978159, + "balance_loss_mlp": 1.03633952, + "epoch": 0.7573124906057418, + "flos": 20996323171200.0, + "grad_norm": 1.8048420186435026, + "language_loss": 0.71071315, + "learning_rate": 5.865439656071993e-07, + "loss": 0.73204786, + "num_input_tokens_seen": 271666240, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 12596, + "time_per_iteration": 3.9183623790740967 + }, + { + "auxiliary_loss_clip": 0.01100386, + "auxiliary_loss_mlp": 0.01030232, + "balance_loss_clip": 1.01918721, + "balance_loss_mlp": 1.0357343, + "epoch": 0.7573726138584097, + "flos": 20886292834560.0, + "grad_norm": 1.5321566367759303, + "language_loss": 0.80469054, + "learning_rate": 5.862684539770706e-07, + "loss": 0.82599676, + "num_input_tokens_seen": 271686370, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12597, + "time_per_iteration": 3.962346076965332 + }, + { + "auxiliary_loss_clip": 0.01108273, + "auxiliary_loss_mlp": 0.01030578, + "balance_loss_clip": 1.01784658, + "balance_loss_mlp": 1.03885663, + "epoch": 0.7574327371110777, + "flos": 24530507170560.0, + "grad_norm": 1.9840297783183698, + "language_loss": 0.83408284, + "learning_rate": 5.859929959557835e-07, + "loss": 0.85547137, + "num_input_tokens_seen": 271705050, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 12598, + "time_per_iteration": 2.4496231079101562 + }, + { + "auxiliary_loss_clip": 0.01101103, + "auxiliary_loss_mlp": 0.01024155, + "balance_loss_clip": 1.01322365, + "balance_loss_mlp": 1.03596723, + "epoch": 0.7574928603637456, + "flos": 23364523785600.0, + "grad_norm": 1.806795486884082, + "language_loss": 0.62383306, + "learning_rate": 5.857175915537845e-07, + "loss": 0.64508563, + "num_input_tokens_seen": 271724915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12599, + "time_per_iteration": 3.9129881858825684 + }, + { + "auxiliary_loss_clip": 0.01106489, + "auxiliary_loss_mlp": 0.01033726, + "balance_loss_clip": 1.02027953, + "balance_loss_mlp": 1.03697991, + "epoch": 0.7575529836164137, + "flos": 13516274419200.0, + "grad_norm": 2.7350879991531523, + "language_loss": 0.62593752, + "learning_rate": 5.854422407815161e-07, + "loss": 0.6473397, + "num_input_tokens_seen": 271742410, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 12600, + "time_per_iteration": 2.3905975818634033 + }, + { + "auxiliary_loss_clip": 0.01100395, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01535034, + "balance_loss_mlp": 1.03529775, + "epoch": 0.7576131068690816, + "flos": 19646584784640.0, + "grad_norm": 1.9463870297593193, + "language_loss": 0.66116518, + "learning_rate": 5.851669436494191e-07, + "loss": 0.68244064, + "num_input_tokens_seen": 271761425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12601, + "time_per_iteration": 2.4491307735443115 + }, + { + "auxiliary_loss_clip": 0.01099051, + "auxiliary_loss_mlp": 0.01031238, + "balance_loss_clip": 1.02029419, + "balance_loss_mlp": 1.03474712, + "epoch": 0.7576732301217496, + "flos": 20048245643520.0, + "grad_norm": 1.5554220634885219, + "language_loss": 0.67926621, + "learning_rate": 5.848917001679335e-07, + "loss": 0.70056915, + "num_input_tokens_seen": 271780875, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 12602, + "time_per_iteration": 2.421680450439453 + }, + { + "auxiliary_loss_clip": 0.01103764, + "auxiliary_loss_mlp": 0.01032798, + "balance_loss_clip": 1.0201081, + "balance_loss_mlp": 1.03649664, + "epoch": 0.7577333533744176, + "flos": 15377093470080.0, + "grad_norm": 1.7612852584963323, + "language_loss": 0.67052841, + "learning_rate": 5.846165103474967e-07, + "loss": 0.69189405, + "num_input_tokens_seen": 271799490, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 12603, + "time_per_iteration": 2.4140625 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03334355, + "epoch": 0.7577934766270855, + "flos": 17894862316800.0, + "grad_norm": 1.9150574683213546, + "language_loss": 0.61476982, + "learning_rate": 5.843413741985439e-07, + "loss": 0.63605225, + "num_input_tokens_seen": 271817040, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 12604, + "time_per_iteration": 2.4143993854522705 + }, + { + "auxiliary_loss_clip": 0.0110333, + "auxiliary_loss_mlp": 0.01032559, + "balance_loss_clip": 1.02098989, + "balance_loss_mlp": 1.03802633, + "epoch": 0.7578535998797535, + "flos": 21613770984960.0, + "grad_norm": 1.9881999977626783, + "language_loss": 0.80013704, + "learning_rate": 5.840662917315076e-07, + "loss": 0.82149595, + "num_input_tokens_seen": 271835480, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12605, + "time_per_iteration": 2.4703023433685303 + }, + { + "auxiliary_loss_clip": 0.01105019, + "auxiliary_loss_mlp": 0.01028637, + "balance_loss_clip": 1.01616216, + "balance_loss_mlp": 1.03563833, + "epoch": 0.7579137231324214, + "flos": 18478374756480.0, + "grad_norm": 2.509488145051598, + "language_loss": 0.78940737, + "learning_rate": 5.837912629568198e-07, + "loss": 0.81074387, + "num_input_tokens_seen": 271849835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 12606, + "time_per_iteration": 2.4461817741394043 + }, + { + "auxiliary_loss_clip": 0.01098445, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.01545978, + "balance_loss_mlp": 1.03513336, + "epoch": 0.7579738463850895, + "flos": 23255032152960.0, + "grad_norm": 1.3687592276329898, + "language_loss": 0.73185945, + "learning_rate": 5.835162878849087e-07, + "loss": 0.75309968, + "num_input_tokens_seen": 271869560, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6328125, + "step": 12607, + "time_per_iteration": 2.4908721446990967 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.01622605, + "balance_loss_mlp": 1.03433669, + "epoch": 0.7580339696377574, + "flos": 14027031861120.0, + "grad_norm": 2.4968443331698635, + "language_loss": 0.75006789, + "learning_rate": 5.83241366526202e-07, + "loss": 0.7713939, + "num_input_tokens_seen": 271887950, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6953125, + "step": 12608, + "time_per_iteration": 2.408450126647949 + }, + { + "auxiliary_loss_clip": 0.01100229, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.02053344, + "balance_loss_mlp": 1.03477442, + "epoch": 0.7580940928904254, + "flos": 25082777756160.0, + "grad_norm": 1.5859201905014537, + "language_loss": 0.71409112, + "learning_rate": 5.829664988911245e-07, + "loss": 0.73541617, + "num_input_tokens_seen": 271907700, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12609, + "time_per_iteration": 2.498211622238159 + }, + { + "auxiliary_loss_clip": 0.01102343, + "auxiliary_loss_mlp": 0.01029809, + "balance_loss_clip": 1.01692224, + "balance_loss_mlp": 1.03438187, + "epoch": 0.7581542161430933, + "flos": 23836425690240.0, + "grad_norm": 2.844859157672467, + "language_loss": 0.81682944, + "learning_rate": 5.826916849901007e-07, + "loss": 0.83815098, + "num_input_tokens_seen": 271926840, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 12610, + "time_per_iteration": 2.432453155517578 + }, + { + "auxiliary_loss_clip": 0.01105711, + "auxiliary_loss_mlp": 0.01030053, + "balance_loss_clip": 1.01812613, + "balance_loss_mlp": 1.03694248, + "epoch": 0.7582143393957613, + "flos": 22237000888320.0, + "grad_norm": 1.6924171050782333, + "language_loss": 0.70433235, + "learning_rate": 5.824169248335488e-07, + "loss": 0.72569001, + "num_input_tokens_seen": 271946465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 12611, + "time_per_iteration": 2.500880479812622 + }, + { + "auxiliary_loss_clip": 0.01102293, + "auxiliary_loss_mlp": 0.01027381, + "balance_loss_clip": 1.01576972, + "balance_loss_mlp": 1.03632438, + "epoch": 0.7582744626484292, + "flos": 21106389421440.0, + "grad_norm": 1.4523660094894448, + "language_loss": 0.70939386, + "learning_rate": 5.821422184318893e-07, + "loss": 0.7306906, + "num_input_tokens_seen": 271967295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 12612, + "time_per_iteration": 2.4539196491241455 + }, + { + "auxiliary_loss_clip": 0.01104666, + "auxiliary_loss_mlp": 0.01035871, + "balance_loss_clip": 1.02454031, + "balance_loss_mlp": 1.03628385, + "epoch": 0.7583345859010973, + "flos": 24604770539520.0, + "grad_norm": 1.557484420274363, + "language_loss": 0.59628952, + "learning_rate": 5.818675657955397e-07, + "loss": 0.61769485, + "num_input_tokens_seen": 271987960, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.68359375, + "step": 12613, + "time_per_iteration": 2.5192790031433105 + }, + { + "auxiliary_loss_clip": 0.01102507, + "auxiliary_loss_mlp": 0.0103357, + "balance_loss_clip": 1.02141631, + "balance_loss_mlp": 1.03547192, + "epoch": 0.7583947091537652, + "flos": 33546814657920.0, + "grad_norm": 1.5699815827869172, + "language_loss": 0.59917688, + "learning_rate": 5.815929669349135e-07, + "loss": 0.62053764, + "num_input_tokens_seen": 272011780, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12614, + "time_per_iteration": 2.5326051712036133 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01026271, + "balance_loss_clip": 1.01423693, + "balance_loss_mlp": 1.03418064, + "epoch": 0.7584548324064332, + "flos": 20121000641280.0, + "grad_norm": 1.965283793201321, + "language_loss": 0.73299825, + "learning_rate": 5.813184218604246e-07, + "loss": 0.75428724, + "num_input_tokens_seen": 272030825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 12615, + "time_per_iteration": 2.4653162956237793 + }, + { + "auxiliary_loss_clip": 0.01023549, + "auxiliary_loss_mlp": 0.01002988, + "balance_loss_clip": 1.00207007, + "balance_loss_mlp": 1.00344896, + "epoch": 0.7585149556591012, + "flos": 70402584061440.0, + "grad_norm": 0.8444154589468232, + "language_loss": 0.67707115, + "learning_rate": 5.810439305824828e-07, + "loss": 0.69733649, + "num_input_tokens_seen": 272095825, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.20117188, + "step": 12616, + "time_per_iteration": 3.0754714012145996 + }, + { + "auxiliary_loss_clip": 0.01105053, + "auxiliary_loss_mlp": 0.01035378, + "balance_loss_clip": 1.02318311, + "balance_loss_mlp": 1.03608048, + "epoch": 0.7585750789117691, + "flos": 16143786293760.0, + "grad_norm": 1.7978643606873037, + "language_loss": 0.84971976, + "learning_rate": 5.807694931114979e-07, + "loss": 0.87112409, + "num_input_tokens_seen": 272113950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 12617, + "time_per_iteration": 2.448288917541504 + }, + { + "auxiliary_loss_clip": 0.01103847, + "auxiliary_loss_mlp": 0.01035021, + "balance_loss_clip": 1.02366078, + "balance_loss_mlp": 1.03641772, + "epoch": 0.7586352021644371, + "flos": 17493165544320.0, + "grad_norm": 2.68696985331022, + "language_loss": 0.75189435, + "learning_rate": 5.804951094578757e-07, + "loss": 0.77328306, + "num_input_tokens_seen": 272130315, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 12618, + "time_per_iteration": 2.3945305347442627 + }, + { + "auxiliary_loss_clip": 0.01106053, + "auxiliary_loss_mlp": 0.01033655, + "balance_loss_clip": 1.02109587, + "balance_loss_mlp": 1.03541541, + "epoch": 0.758695325417105, + "flos": 17275187859840.0, + "grad_norm": 2.6724320695855646, + "language_loss": 0.77528578, + "learning_rate": 5.802207796320209e-07, + "loss": 0.79668283, + "num_input_tokens_seen": 272149080, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 12619, + "time_per_iteration": 2.5116357803344727 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.0217483, + "balance_loss_mlp": 1.03520966, + "epoch": 0.7587554486697731, + "flos": 29495660163840.0, + "grad_norm": 1.9430951948294126, + "language_loss": 0.8248623, + "learning_rate": 5.79946503644337e-07, + "loss": 0.84621245, + "num_input_tokens_seen": 272168285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12620, + "time_per_iteration": 2.4870126247406006 + }, + { + "auxiliary_loss_clip": 0.0110498, + "auxiliary_loss_mlp": 0.0103625, + "balance_loss_clip": 1.02316654, + "balance_loss_mlp": 1.03550339, + "epoch": 0.758815571922441, + "flos": 16100800692480.0, + "grad_norm": 2.128247483649562, + "language_loss": 0.82510465, + "learning_rate": 5.796722815052242e-07, + "loss": 0.84651691, + "num_input_tokens_seen": 272184585, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 12621, + "time_per_iteration": 2.407888412475586 + }, + { + "auxiliary_loss_clip": 0.01103126, + "auxiliary_loss_mlp": 0.01031415, + "balance_loss_clip": 1.01944077, + "balance_loss_mlp": 1.03546882, + "epoch": 0.758875695175109, + "flos": 16143714466560.0, + "grad_norm": 1.905238128524311, + "language_loss": 0.73415148, + "learning_rate": 5.7939811322508e-07, + "loss": 0.75549692, + "num_input_tokens_seen": 272200205, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12622, + "time_per_iteration": 2.392918348312378 + }, + { + "auxiliary_loss_clip": 0.01023365, + "auxiliary_loss_mlp": 0.01001846, + "balance_loss_clip": 1.00088048, + "balance_loss_mlp": 1.00314832, + "epoch": 0.7589358184277769, + "flos": 68462006860800.0, + "grad_norm": 0.8354315652196721, + "language_loss": 0.60838234, + "learning_rate": 5.791239988143024e-07, + "loss": 0.62863445, + "num_input_tokens_seen": 272259670, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20214844, + "step": 12623, + "time_per_iteration": 3.0560390949249268 + }, + { + "auxiliary_loss_clip": 0.0110073, + "auxiliary_loss_mlp": 0.01033721, + "balance_loss_clip": 1.02278328, + "balance_loss_mlp": 1.03662705, + "epoch": 0.7589959416804449, + "flos": 20047311889920.0, + "grad_norm": 3.5930861717067653, + "language_loss": 0.66990733, + "learning_rate": 5.788499382832847e-07, + "loss": 0.69125187, + "num_input_tokens_seen": 272277925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 12624, + "time_per_iteration": 2.41662335395813 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01025695, + "balance_loss_clip": 1.01397121, + "balance_loss_mlp": 1.03495693, + "epoch": 0.7590560649331128, + "flos": 18771800958720.0, + "grad_norm": 1.9214697173160005, + "language_loss": 0.75980389, + "learning_rate": 5.785759316424196e-07, + "loss": 0.78105658, + "num_input_tokens_seen": 272296010, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 12625, + "time_per_iteration": 2.430710792541504 + }, + { + "auxiliary_loss_clip": 0.01102125, + "auxiliary_loss_mlp": 0.01033951, + "balance_loss_clip": 1.02228665, + "balance_loss_mlp": 1.0369575, + "epoch": 0.7591161881857809, + "flos": 29825284296960.0, + "grad_norm": 1.779846333652066, + "language_loss": 0.6279074, + "learning_rate": 5.783019789020977e-07, + "loss": 0.64926815, + "num_input_tokens_seen": 272318330, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 12626, + "time_per_iteration": 2.504363775253296 + }, + { + "auxiliary_loss_clip": 0.01104327, + "auxiliary_loss_mlp": 0.01037762, + "balance_loss_clip": 1.02497673, + "balance_loss_mlp": 1.03691292, + "epoch": 0.7591763114384488, + "flos": 20302708567680.0, + "grad_norm": 2.3505107376172782, + "language_loss": 0.73657954, + "learning_rate": 5.780280800727084e-07, + "loss": 0.75800049, + "num_input_tokens_seen": 272335265, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 12627, + "time_per_iteration": 2.4584648609161377 + }, + { + "auxiliary_loss_clip": 0.01104059, + "auxiliary_loss_mlp": 0.01025855, + "balance_loss_clip": 1.01399338, + "balance_loss_mlp": 1.03618145, + "epoch": 0.7592364346911168, + "flos": 20813609664000.0, + "grad_norm": 1.9976061083215328, + "language_loss": 0.68754119, + "learning_rate": 5.777542351646356e-07, + "loss": 0.70884025, + "num_input_tokens_seen": 272354795, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12628, + "time_per_iteration": 2.430168390274048 + }, + { + "auxiliary_loss_clip": 0.01111325, + "auxiliary_loss_mlp": 0.01038199, + "balance_loss_clip": 1.02543187, + "balance_loss_mlp": 1.03944075, + "epoch": 0.7592965579437848, + "flos": 21251504367360.0, + "grad_norm": 1.8845310767470707, + "language_loss": 0.63146746, + "learning_rate": 5.774804441882648e-07, + "loss": 0.65296274, + "num_input_tokens_seen": 272372875, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71875, + "step": 12629, + "time_per_iteration": 2.4647164344787598 + }, + { + "auxiliary_loss_clip": 0.01096357, + "auxiliary_loss_mlp": 0.01029567, + "balance_loss_clip": 1.01772952, + "balance_loss_mlp": 1.03295267, + "epoch": 0.7593566811964527, + "flos": 26213604704640.0, + "grad_norm": 1.5320581360916075, + "language_loss": 0.77814519, + "learning_rate": 5.772067071539786e-07, + "loss": 0.79940444, + "num_input_tokens_seen": 272394715, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 12630, + "time_per_iteration": 2.4695019721984863 + }, + { + "auxiliary_loss_clip": 0.01023993, + "auxiliary_loss_mlp": 0.01002903, + "balance_loss_clip": 1.00193775, + "balance_loss_mlp": 1.00382364, + "epoch": 0.7594168044491207, + "flos": 71237255374080.0, + "grad_norm": 0.8096499014530706, + "language_loss": 0.61483628, + "learning_rate": 5.769330240721562e-07, + "loss": 0.63510519, + "num_input_tokens_seen": 272458775, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 12631, + "time_per_iteration": 3.0936625003814697 + }, + { + "auxiliary_loss_clip": 0.01109676, + "auxiliary_loss_mlp": 0.01034458, + "balance_loss_clip": 1.02038503, + "balance_loss_mlp": 1.0382787, + "epoch": 0.7594769277017887, + "flos": 26613326229120.0, + "grad_norm": 1.648732197394605, + "language_loss": 0.73976278, + "learning_rate": 5.766593949531767e-07, + "loss": 0.76120412, + "num_input_tokens_seen": 272479355, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.7109375, + "step": 12632, + "time_per_iteration": 2.480149745941162 + }, + { + "auxiliary_loss_clip": 0.01104237, + "auxiliary_loss_mlp": 0.01030871, + "balance_loss_clip": 1.01905131, + "balance_loss_mlp": 1.03713107, + "epoch": 0.7595370509544567, + "flos": 17595941333760.0, + "grad_norm": 1.9673738745547358, + "language_loss": 0.74681813, + "learning_rate": 5.763858198074154e-07, + "loss": 0.76816922, + "num_input_tokens_seen": 272493555, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12633, + "time_per_iteration": 2.4051129817962646 + }, + { + "auxiliary_loss_clip": 0.01102602, + "auxiliary_loss_mlp": 0.0102895, + "balance_loss_clip": 1.0182507, + "balance_loss_mlp": 1.03637874, + "epoch": 0.7595971742071246, + "flos": 18002953319040.0, + "grad_norm": 1.9622807663436381, + "language_loss": 0.73751974, + "learning_rate": 5.76112298645246e-07, + "loss": 0.75883526, + "num_input_tokens_seen": 272508925, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 12634, + "time_per_iteration": 2.4096055030822754 + }, + { + "auxiliary_loss_clip": 0.01105344, + "auxiliary_loss_mlp": 0.01032122, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03842199, + "epoch": 0.7596572974597926, + "flos": 28840326480000.0, + "grad_norm": 1.6454406828041275, + "language_loss": 0.64365327, + "learning_rate": 5.758388314770408e-07, + "loss": 0.66502792, + "num_input_tokens_seen": 272528805, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 12635, + "time_per_iteration": 2.50323224067688 + }, + { + "auxiliary_loss_clip": 0.01105903, + "auxiliary_loss_mlp": 0.01030647, + "balance_loss_clip": 1.01841593, + "balance_loss_mlp": 1.03627133, + "epoch": 0.7597174207124605, + "flos": 14282823588480.0, + "grad_norm": 1.7052959170016264, + "language_loss": 0.68446481, + "learning_rate": 5.7556541831317e-07, + "loss": 0.70583028, + "num_input_tokens_seen": 272546655, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 12636, + "time_per_iteration": 3.86566424369812 + }, + { + "auxiliary_loss_clip": 0.01106502, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.02063966, + "balance_loss_mlp": 1.03834045, + "epoch": 0.7597775439651285, + "flos": 21688932193920.0, + "grad_norm": 1.977358934255135, + "language_loss": 0.81089514, + "learning_rate": 5.752920591640018e-07, + "loss": 0.83228207, + "num_input_tokens_seen": 272564010, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 12637, + "time_per_iteration": 2.4373815059661865 + }, + { + "auxiliary_loss_clip": 0.01100493, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.02041268, + "balance_loss_mlp": 1.0336833, + "epoch": 0.7598376672177964, + "flos": 36101248312320.0, + "grad_norm": 1.8305503551265345, + "language_loss": 0.66367668, + "learning_rate": 5.750187540399017e-07, + "loss": 0.68500262, + "num_input_tokens_seen": 272585840, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 12638, + "time_per_iteration": 3.9780218601226807 + }, + { + "auxiliary_loss_clip": 0.01104273, + "auxiliary_loss_mlp": 0.01033993, + "balance_loss_clip": 1.02106452, + "balance_loss_mlp": 1.03667748, + "epoch": 0.7598977904704645, + "flos": 18332326056960.0, + "grad_norm": 2.213704137729046, + "language_loss": 0.65462083, + "learning_rate": 5.747455029512323e-07, + "loss": 0.67600346, + "num_input_tokens_seen": 272602300, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 12639, + "time_per_iteration": 3.9062952995300293 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01027427, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03570402, + "epoch": 0.7599579137231324, + "flos": 20192642317440.0, + "grad_norm": 2.267632288408512, + "language_loss": 0.6999557, + "learning_rate": 5.744723059083572e-07, + "loss": 0.72124958, + "num_input_tokens_seen": 272619595, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12640, + "time_per_iteration": 2.4175524711608887 + }, + { + "auxiliary_loss_clip": 0.01105941, + "auxiliary_loss_mlp": 0.01032991, + "balance_loss_clip": 1.02036071, + "balance_loss_mlp": 1.03658712, + "epoch": 0.7600180369758004, + "flos": 24024849459840.0, + "grad_norm": 1.786840701662577, + "language_loss": 0.6698308, + "learning_rate": 5.741991629216343e-07, + "loss": 0.69122016, + "num_input_tokens_seen": 272638825, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12641, + "time_per_iteration": 3.921182632446289 + }, + { + "auxiliary_loss_clip": 0.01102198, + "auxiliary_loss_mlp": 0.01033642, + "balance_loss_clip": 1.02081525, + "balance_loss_mlp": 1.03358555, + "epoch": 0.7600781602284684, + "flos": 18989527248000.0, + "grad_norm": 2.0392329057559433, + "language_loss": 0.66791224, + "learning_rate": 5.73926074001422e-07, + "loss": 0.68927062, + "num_input_tokens_seen": 272657240, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 12642, + "time_per_iteration": 2.437264919281006 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01028646, + "balance_loss_clip": 1.01733255, + "balance_loss_mlp": 1.0378716, + "epoch": 0.7601382834811363, + "flos": 26067520091520.0, + "grad_norm": 1.817654182769989, + "language_loss": 0.75470227, + "learning_rate": 5.736530391580765e-07, + "loss": 0.77600896, + "num_input_tokens_seen": 272677520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 12643, + "time_per_iteration": 2.454752206802368 + }, + { + "auxiliary_loss_clip": 0.01104004, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.01981854, + "balance_loss_mlp": 1.03661776, + "epoch": 0.7601984067338043, + "flos": 18844232734080.0, + "grad_norm": 1.71435715776806, + "language_loss": 0.78663039, + "learning_rate": 5.733800584019508e-07, + "loss": 0.80799764, + "num_input_tokens_seen": 272696770, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12644, + "time_per_iteration": 2.464467763900757 + }, + { + "auxiliary_loss_clip": 0.01102086, + "auxiliary_loss_mlp": 0.01026874, + "balance_loss_clip": 1.01553774, + "balance_loss_mlp": 1.03507113, + "epoch": 0.7602585299864723, + "flos": 24646391424000.0, + "grad_norm": 1.487007417540331, + "language_loss": 0.80469275, + "learning_rate": 5.731071317433957e-07, + "loss": 0.82598233, + "num_input_tokens_seen": 272718340, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 12645, + "time_per_iteration": 2.46242094039917 + }, + { + "auxiliary_loss_clip": 0.01106779, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01639724, + "balance_loss_mlp": 1.03778565, + "epoch": 0.7603186532391403, + "flos": 23842100039040.0, + "grad_norm": 1.4295948404829946, + "language_loss": 0.72978055, + "learning_rate": 5.728342591927611e-07, + "loss": 0.75113386, + "num_input_tokens_seen": 272739575, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.69140625, + "step": 12646, + "time_per_iteration": 2.471769332885742 + }, + { + "auxiliary_loss_clip": 0.01100614, + "auxiliary_loss_mlp": 0.01032121, + "balance_loss_clip": 1.02084398, + "balance_loss_mlp": 1.0357635, + "epoch": 0.7603787764918082, + "flos": 22199905117440.0, + "grad_norm": 2.0171184972904426, + "language_loss": 0.67350507, + "learning_rate": 5.725614407603949e-07, + "loss": 0.69483244, + "num_input_tokens_seen": 272758710, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 12647, + "time_per_iteration": 2.4212889671325684 + }, + { + "auxiliary_loss_clip": 0.01023895, + "auxiliary_loss_mlp": 0.01003551, + "balance_loss_clip": 1.00256717, + "balance_loss_mlp": 1.00363588, + "epoch": 0.7604388997444762, + "flos": 54086894254080.0, + "grad_norm": 0.6700081607219286, + "language_loss": 0.48957998, + "learning_rate": 5.722886764566415e-07, + "loss": 0.50985444, + "num_input_tokens_seen": 272814855, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.203125, + "step": 12648, + "time_per_iteration": 2.992032766342163 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01032507, + "balance_loss_clip": 1.02124202, + "balance_loss_mlp": 1.03481627, + "epoch": 0.7604990229971441, + "flos": 19681920789120.0, + "grad_norm": 1.457089881735221, + "language_loss": 0.76486385, + "learning_rate": 5.720159662918451e-07, + "loss": 0.78617918, + "num_input_tokens_seen": 272834400, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 12649, + "time_per_iteration": 2.4250268936157227 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01029511, + "balance_loss_clip": 1.0177089, + "balance_loss_mlp": 1.03462923, + "epoch": 0.7605591462498121, + "flos": 25228036356480.0, + "grad_norm": 1.4982970493787315, + "language_loss": 0.68732083, + "learning_rate": 5.717433102763462e-07, + "loss": 0.70861167, + "num_input_tokens_seen": 272854760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 12650, + "time_per_iteration": 2.488598585128784 + }, + { + "auxiliary_loss_clip": 0.0102378, + "auxiliary_loss_mlp": 0.00999701, + "balance_loss_clip": 0.99867612, + "balance_loss_mlp": 1.00336099, + "epoch": 0.76061926950248, + "flos": 66783757662720.0, + "grad_norm": 0.7616307552749029, + "language_loss": 0.62742424, + "learning_rate": 5.714707084204838e-07, + "loss": 0.64765906, + "num_input_tokens_seen": 272919030, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20410156, + "step": 12651, + "time_per_iteration": 3.0423130989074707 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01032274, + "balance_loss_clip": 1.02096677, + "balance_loss_mlp": 1.03473544, + "epoch": 0.7606793927551481, + "flos": 25338354001920.0, + "grad_norm": 1.3759590164717375, + "language_loss": 0.71249425, + "learning_rate": 5.711981607345951e-07, + "loss": 0.73381495, + "num_input_tokens_seen": 272938925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 12652, + "time_per_iteration": 2.4702324867248535 + }, + { + "auxiliary_loss_clip": 0.01103881, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.0212419, + "balance_loss_mlp": 1.03609121, + "epoch": 0.760739516007816, + "flos": 18223624523520.0, + "grad_norm": 2.2736870535871354, + "language_loss": 0.80135083, + "learning_rate": 5.709256672290152e-07, + "loss": 0.82272291, + "num_input_tokens_seen": 272954945, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 12653, + "time_per_iteration": 2.452436685562134 + }, + { + "auxiliary_loss_clip": 0.01106986, + "auxiliary_loss_mlp": 0.01030648, + "balance_loss_clip": 1.01928127, + "balance_loss_mlp": 1.03704405, + "epoch": 0.760799639260484, + "flos": 22559119079040.0, + "grad_norm": 1.5498044874704002, + "language_loss": 0.80112356, + "learning_rate": 5.706532279140785e-07, + "loss": 0.82249987, + "num_input_tokens_seen": 272972855, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.69921875, + "step": 12654, + "time_per_iteration": 2.48616886138916 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.01034864, + "balance_loss_clip": 1.02255547, + "balance_loss_mlp": 1.03588712, + "epoch": 0.760859762513152, + "flos": 22309324922880.0, + "grad_norm": 2.061909970432495, + "language_loss": 0.79397112, + "learning_rate": 5.703808428001136e-07, + "loss": 0.81536764, + "num_input_tokens_seen": 272989895, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12655, + "time_per_iteration": 2.421402931213379 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.01025594, + "balance_loss_clip": 1.01572394, + "balance_loss_mlp": 1.03430891, + "epoch": 0.7609198857658199, + "flos": 24863902231680.0, + "grad_norm": 1.6410708258422424, + "language_loss": 0.68456256, + "learning_rate": 5.701085118974505e-07, + "loss": 0.70580149, + "num_input_tokens_seen": 273011695, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.640625, + "step": 12656, + "time_per_iteration": 2.5130324363708496 + }, + { + "auxiliary_loss_clip": 0.01103079, + "auxiliary_loss_mlp": 0.0102861, + "balance_loss_clip": 1.01629603, + "balance_loss_mlp": 1.03264689, + "epoch": 0.760980009018488, + "flos": 16836790366080.0, + "grad_norm": 1.9462034213744268, + "language_loss": 0.73116565, + "learning_rate": 5.698362352164164e-07, + "loss": 0.75248253, + "num_input_tokens_seen": 273028815, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.703125, + "step": 12657, + "time_per_iteration": 2.400148391723633 + }, + { + "auxiliary_loss_clip": 0.01024109, + "auxiliary_loss_mlp": 0.01000104, + "balance_loss_clip": 0.99908441, + "balance_loss_mlp": 1.0036025, + "epoch": 0.7610401322711559, + "flos": 61230603029760.0, + "grad_norm": 0.8561186291133048, + "language_loss": 0.64938498, + "learning_rate": 5.695640127673347e-07, + "loss": 0.66962707, + "num_input_tokens_seen": 273084080, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20507812, + "step": 12658, + "time_per_iteration": 3.001168727874756 + }, + { + "auxiliary_loss_clip": 0.01098421, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.01878452, + "balance_loss_mlp": 1.03460932, + "epoch": 0.7611002555238239, + "flos": 19640730867840.0, + "grad_norm": 1.8302909281614124, + "language_loss": 0.79259527, + "learning_rate": 5.692918445605293e-07, + "loss": 0.8138839, + "num_input_tokens_seen": 273102295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12659, + "time_per_iteration": 2.4172587394714355 + }, + { + "auxiliary_loss_clip": 0.01099076, + "auxiliary_loss_mlp": 0.01025102, + "balance_loss_clip": 1.01360416, + "balance_loss_mlp": 1.03339934, + "epoch": 0.7611603787764918, + "flos": 26872206526080.0, + "grad_norm": 1.5401617612635332, + "language_loss": 0.68613267, + "learning_rate": 5.690197306063209e-07, + "loss": 0.70737445, + "num_input_tokens_seen": 273123400, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12660, + "time_per_iteration": 2.486931085586548 + }, + { + "auxiliary_loss_clip": 0.01102403, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01855946, + "balance_loss_mlp": 1.03502679, + "epoch": 0.7612205020291598, + "flos": 27344252085120.0, + "grad_norm": 1.63464824040793, + "language_loss": 0.70508969, + "learning_rate": 5.687476709150281e-07, + "loss": 0.72641468, + "num_input_tokens_seen": 273145150, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12661, + "time_per_iteration": 2.5559232234954834 + }, + { + "auxiliary_loss_clip": 0.01099871, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01578677, + "balance_loss_mlp": 1.03281772, + "epoch": 0.7612806252818277, + "flos": 29314598682240.0, + "grad_norm": 1.568031869440725, + "language_loss": 0.8346833, + "learning_rate": 5.68475665496966e-07, + "loss": 0.85595322, + "num_input_tokens_seen": 273165180, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 12662, + "time_per_iteration": 2.5182721614837646 + }, + { + "auxiliary_loss_clip": 0.01101806, + "auxiliary_loss_mlp": 0.01040729, + "balance_loss_clip": 1.02870047, + "balance_loss_mlp": 1.03437781, + "epoch": 0.7613407485344957, + "flos": 19026048401280.0, + "grad_norm": 1.7160561629790159, + "language_loss": 0.68380648, + "learning_rate": 5.682037143624505e-07, + "loss": 0.70523185, + "num_input_tokens_seen": 273184005, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 12663, + "time_per_iteration": 2.415670871734619 + }, + { + "auxiliary_loss_clip": 0.0110108, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.01368248, + "balance_loss_mlp": 1.03619945, + "epoch": 0.7614008717871636, + "flos": 23256037733760.0, + "grad_norm": 1.8370977086816516, + "language_loss": 0.70325685, + "learning_rate": 5.67931817521794e-07, + "loss": 0.72451836, + "num_input_tokens_seen": 273203565, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12664, + "time_per_iteration": 2.4670281410217285 + }, + { + "auxiliary_loss_clip": 0.01107046, + "auxiliary_loss_mlp": 0.01038009, + "balance_loss_clip": 1.02551007, + "balance_loss_mlp": 1.03717303, + "epoch": 0.7614609950398317, + "flos": 21579907438080.0, + "grad_norm": 2.4295435457248575, + "language_loss": 0.79482126, + "learning_rate": 5.676599749853066e-07, + "loss": 0.81627178, + "num_input_tokens_seen": 273221645, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 12665, + "time_per_iteration": 2.4276509284973145 + }, + { + "auxiliary_loss_clip": 0.01101745, + "auxiliary_loss_mlp": 0.01031097, + "balance_loss_clip": 1.02022529, + "balance_loss_mlp": 1.03754544, + "epoch": 0.7615211182924996, + "flos": 29277897960960.0, + "grad_norm": 1.6635534140237522, + "language_loss": 0.88047594, + "learning_rate": 5.673881867632959e-07, + "loss": 0.90180439, + "num_input_tokens_seen": 273242040, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 12666, + "time_per_iteration": 2.51179575920105 + }, + { + "auxiliary_loss_clip": 0.0110239, + "auxiliary_loss_mlp": 0.0103263, + "balance_loss_clip": 1.0205127, + "balance_loss_mlp": 1.03515267, + "epoch": 0.7615812415451676, + "flos": 13261129136640.0, + "grad_norm": 1.9417407111979526, + "language_loss": 0.8323909, + "learning_rate": 5.671164528660693e-07, + "loss": 0.85374105, + "num_input_tokens_seen": 273257365, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 12667, + "time_per_iteration": 2.4148190021514893 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01035947, + "balance_loss_clip": 1.02489078, + "balance_loss_mlp": 1.03628147, + "epoch": 0.7616413647978356, + "flos": 18584741905920.0, + "grad_norm": 1.6916218117768351, + "language_loss": 0.78259969, + "learning_rate": 5.668447733039296e-07, + "loss": 0.80396825, + "num_input_tokens_seen": 273274710, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 12668, + "time_per_iteration": 2.4754624366760254 + }, + { + "auxiliary_loss_clip": 0.01100404, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.01928806, + "balance_loss_mlp": 1.0345788, + "epoch": 0.7617014880505035, + "flos": 18516188799360.0, + "grad_norm": 1.7878935447004587, + "language_loss": 0.63670552, + "learning_rate": 5.6657314808718e-07, + "loss": 0.65801817, + "num_input_tokens_seen": 273292870, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12669, + "time_per_iteration": 2.406334638595581 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.01035702, + "balance_loss_clip": 1.023, + "balance_loss_mlp": 1.03439915, + "epoch": 0.7617616113031715, + "flos": 24973178382720.0, + "grad_norm": 1.8779652791388421, + "language_loss": 0.66191423, + "learning_rate": 5.663015772261202e-07, + "loss": 0.68330294, + "num_input_tokens_seen": 273312375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12670, + "time_per_iteration": 2.479275703430176 + }, + { + "auxiliary_loss_clip": 0.01103769, + "auxiliary_loss_mlp": 0.01032723, + "balance_loss_clip": 1.02112961, + "balance_loss_mlp": 1.0352459, + "epoch": 0.7618217345558395, + "flos": 23295036925440.0, + "grad_norm": 1.5352589226081985, + "language_loss": 0.73205262, + "learning_rate": 5.660300607310493e-07, + "loss": 0.75341749, + "num_input_tokens_seen": 273332590, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 12671, + "time_per_iteration": 2.43534517288208 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01777697, + "balance_loss_mlp": 1.03416443, + "epoch": 0.7618818578085075, + "flos": 25482894330240.0, + "grad_norm": 2.4136368104172607, + "language_loss": 0.73309898, + "learning_rate": 5.657585986122613e-07, + "loss": 0.75437379, + "num_input_tokens_seen": 273352885, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12672, + "time_per_iteration": 2.4863340854644775 + }, + { + "auxiliary_loss_clip": 0.01023134, + "auxiliary_loss_mlp": 0.01002705, + "balance_loss_clip": 1.00170374, + "balance_loss_mlp": 1.00292134, + "epoch": 0.7619419810611754, + "flos": 61151994115200.0, + "grad_norm": 0.7636907167661546, + "language_loss": 0.56764495, + "learning_rate": 5.654871908800506e-07, + "loss": 0.58790326, + "num_input_tokens_seen": 273411730, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20214844, + "step": 12673, + "time_per_iteration": 3.0046093463897705 + }, + { + "auxiliary_loss_clip": 0.01103698, + "auxiliary_loss_mlp": 0.01027688, + "balance_loss_clip": 1.01493824, + "balance_loss_mlp": 1.03571641, + "epoch": 0.7620021043138434, + "flos": 23258659426560.0, + "grad_norm": 1.9214444027294126, + "language_loss": 0.74586606, + "learning_rate": 5.652158375447102e-07, + "loss": 0.76717991, + "num_input_tokens_seen": 273430020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 12674, + "time_per_iteration": 2.4860613346099854 + }, + { + "auxiliary_loss_clip": 0.01099933, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.01634872, + "balance_loss_mlp": 1.03547001, + "epoch": 0.7620622275665113, + "flos": 25082490447360.0, + "grad_norm": 1.9445116324740603, + "language_loss": 0.72109187, + "learning_rate": 5.649445386165286e-07, + "loss": 0.74236214, + "num_input_tokens_seen": 273448690, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 12675, + "time_per_iteration": 2.4733452796936035 + }, + { + "auxiliary_loss_clip": 0.01099705, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01858139, + "balance_loss_mlp": 1.03579307, + "epoch": 0.7621223508191793, + "flos": 20155007842560.0, + "grad_norm": 2.3582627114091417, + "language_loss": 0.72836524, + "learning_rate": 5.646732941057936e-07, + "loss": 0.74966055, + "num_input_tokens_seen": 273465190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12676, + "time_per_iteration": 2.464700698852539 + }, + { + "auxiliary_loss_clip": 0.01108509, + "auxiliary_loss_mlp": 0.01030827, + "balance_loss_clip": 1.01853633, + "balance_loss_mlp": 1.0366993, + "epoch": 0.7621824740718472, + "flos": 18000187971840.0, + "grad_norm": 2.4246183918605055, + "language_loss": 0.54263771, + "learning_rate": 5.644021040227927e-07, + "loss": 0.56403106, + "num_input_tokens_seen": 273478620, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.71875, + "step": 12677, + "time_per_iteration": 2.3858957290649414 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.0166893, + "balance_loss_mlp": 1.03496563, + "epoch": 0.7622425973245153, + "flos": 21725668828800.0, + "grad_norm": 2.7484196878623104, + "language_loss": 0.78978539, + "learning_rate": 5.641309683778064e-07, + "loss": 0.81108367, + "num_input_tokens_seen": 273497635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12678, + "time_per_iteration": 3.8235199451446533 + }, + { + "auxiliary_loss_clip": 0.01103703, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01842141, + "balance_loss_mlp": 1.0358417, + "epoch": 0.7623027205771832, + "flos": 19718549683200.0, + "grad_norm": 1.880562321588857, + "language_loss": 0.7751689, + "learning_rate": 5.638598871811175e-07, + "loss": 0.79651058, + "num_input_tokens_seen": 273513955, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 12679, + "time_per_iteration": 2.406036615371704 + }, + { + "auxiliary_loss_clip": 0.01102397, + "auxiliary_loss_mlp": 0.01024752, + "balance_loss_clip": 1.01288462, + "balance_loss_mlp": 1.03522229, + "epoch": 0.7623628438298512, + "flos": 23988831096960.0, + "grad_norm": 1.3855129030202036, + "language_loss": 0.79996926, + "learning_rate": 5.635888604430059e-07, + "loss": 0.82124078, + "num_input_tokens_seen": 273533970, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 12680, + "time_per_iteration": 3.8292644023895264 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01027152, + "balance_loss_clip": 1.01448607, + "balance_loss_mlp": 1.03598523, + "epoch": 0.7624229670825191, + "flos": 22345702421760.0, + "grad_norm": 1.8104724953691376, + "language_loss": 0.62750268, + "learning_rate": 5.633178881737493e-07, + "loss": 0.64880306, + "num_input_tokens_seen": 273553090, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 12681, + "time_per_iteration": 3.884755849838257 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01962304, + "balance_loss_mlp": 1.03471422, + "epoch": 0.7624830903351871, + "flos": 22711775880960.0, + "grad_norm": 2.0185008739532946, + "language_loss": 0.76076877, + "learning_rate": 5.63046970383622e-07, + "loss": 0.78207386, + "num_input_tokens_seen": 273572460, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12682, + "time_per_iteration": 3.9090828895568848 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.01027158, + "balance_loss_clip": 1.01630437, + "balance_loss_mlp": 1.0342561, + "epoch": 0.7625432135878552, + "flos": 25593714766080.0, + "grad_norm": 1.71259737430395, + "language_loss": 0.68134248, + "learning_rate": 5.627761070828974e-07, + "loss": 0.70260167, + "num_input_tokens_seen": 273592815, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12683, + "time_per_iteration": 2.4623308181762695 + }, + { + "auxiliary_loss_clip": 0.01103084, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.01777172, + "balance_loss_mlp": 1.03596735, + "epoch": 0.7626033368405231, + "flos": 23987645948160.0, + "grad_norm": 2.1249879118259285, + "language_loss": 0.83107448, + "learning_rate": 5.625052982818472e-07, + "loss": 0.85240012, + "num_input_tokens_seen": 273611790, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12684, + "time_per_iteration": 2.4951984882354736 + }, + { + "auxiliary_loss_clip": 0.01103097, + "auxiliary_loss_mlp": 0.01037574, + "balance_loss_clip": 1.02493775, + "balance_loss_mlp": 1.03559566, + "epoch": 0.7626634600931911, + "flos": 12599115523200.0, + "grad_norm": 1.7953521206718834, + "language_loss": 0.82664561, + "learning_rate": 5.622345439907396e-07, + "loss": 0.84805232, + "num_input_tokens_seen": 273628340, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 12685, + "time_per_iteration": 2.397047519683838 + }, + { + "auxiliary_loss_clip": 0.01104402, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01402688, + "balance_loss_mlp": 1.03638494, + "epoch": 0.762723583345859, + "flos": 26322593546880.0, + "grad_norm": 1.8540410766605766, + "language_loss": 0.77068198, + "learning_rate": 5.619638442198422e-07, + "loss": 0.79198045, + "num_input_tokens_seen": 273646585, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12686, + "time_per_iteration": 2.484248399734497 + }, + { + "auxiliary_loss_clip": 0.01104273, + "auxiliary_loss_mlp": 0.01038595, + "balance_loss_clip": 1.02545214, + "balance_loss_mlp": 1.03546059, + "epoch": 0.762783706598527, + "flos": 21907053532800.0, + "grad_norm": 1.6147280683220673, + "language_loss": 0.71894288, + "learning_rate": 5.616931989794198e-07, + "loss": 0.74037153, + "num_input_tokens_seen": 273665410, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 12687, + "time_per_iteration": 2.438870668411255 + }, + { + "auxiliary_loss_clip": 0.01101986, + "auxiliary_loss_mlp": 0.01037451, + "balance_loss_clip": 1.02494013, + "balance_loss_mlp": 1.03586364, + "epoch": 0.7628438298511949, + "flos": 15339782217600.0, + "grad_norm": 1.7893122270206685, + "language_loss": 0.64678234, + "learning_rate": 5.614226082797369e-07, + "loss": 0.66817671, + "num_input_tokens_seen": 273683035, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 12688, + "time_per_iteration": 2.473334550857544 + }, + { + "auxiliary_loss_clip": 0.01100145, + "auxiliary_loss_mlp": 0.01027183, + "balance_loss_clip": 1.01613188, + "balance_loss_mlp": 1.03599501, + "epoch": 0.7629039531038629, + "flos": 13006307076480.0, + "grad_norm": 1.824140660658097, + "language_loss": 0.70988876, + "learning_rate": 5.611520721310515e-07, + "loss": 0.73116207, + "num_input_tokens_seen": 273700130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12689, + "time_per_iteration": 2.389702320098877 + }, + { + "auxiliary_loss_clip": 0.01106966, + "auxiliary_loss_mlp": 0.01035843, + "balance_loss_clip": 1.02357674, + "balance_loss_mlp": 1.03706014, + "epoch": 0.7629640763565309, + "flos": 26171660597760.0, + "grad_norm": 1.6778934175859046, + "language_loss": 0.69599509, + "learning_rate": 5.608815905436238e-07, + "loss": 0.7174232, + "num_input_tokens_seen": 273720310, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69921875, + "step": 12690, + "time_per_iteration": 2.4964652061462402 + }, + { + "auxiliary_loss_clip": 0.01103144, + "auxiliary_loss_mlp": 0.0102945, + "balance_loss_clip": 1.01791096, + "balance_loss_mlp": 1.03643334, + "epoch": 0.7630241996091989, + "flos": 36793713680640.0, + "grad_norm": 1.627452026729889, + "language_loss": 0.69135779, + "learning_rate": 5.606111635277109e-07, + "loss": 0.71268374, + "num_input_tokens_seen": 273744475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12691, + "time_per_iteration": 2.577179431915283 + }, + { + "auxiliary_loss_clip": 0.0109925, + "auxiliary_loss_mlp": 0.01032646, + "balance_loss_clip": 1.02197647, + "balance_loss_mlp": 1.03412747, + "epoch": 0.7630843228618668, + "flos": 21835160461440.0, + "grad_norm": 1.5885842386967668, + "language_loss": 0.81694877, + "learning_rate": 5.603407910935662e-07, + "loss": 0.83826768, + "num_input_tokens_seen": 273764635, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 12692, + "time_per_iteration": 2.4633901119232178 + }, + { + "auxiliary_loss_clip": 0.01107736, + "auxiliary_loss_mlp": 0.01031236, + "balance_loss_clip": 1.02030492, + "balance_loss_mlp": 1.03841257, + "epoch": 0.7631444461145348, + "flos": 12640520926080.0, + "grad_norm": 2.217828968535983, + "language_loss": 0.76950878, + "learning_rate": 5.600704732514438e-07, + "loss": 0.79089856, + "num_input_tokens_seen": 273780115, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6953125, + "step": 12693, + "time_per_iteration": 2.4009978771209717 + }, + { + "auxiliary_loss_clip": 0.01103157, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01835942, + "balance_loss_mlp": 1.03572786, + "epoch": 0.7632045693672027, + "flos": 16836610798080.0, + "grad_norm": 2.173871462173048, + "language_loss": 0.73079503, + "learning_rate": 5.598002100115933e-07, + "loss": 0.75213093, + "num_input_tokens_seen": 273796605, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12694, + "time_per_iteration": 2.462535858154297 + }, + { + "auxiliary_loss_clip": 0.01098607, + "auxiliary_loss_mlp": 0.01027527, + "balance_loss_clip": 1.01585007, + "balance_loss_mlp": 1.03326893, + "epoch": 0.7632646926198707, + "flos": 22017335264640.0, + "grad_norm": 1.6266641771767514, + "language_loss": 0.70343757, + "learning_rate": 5.595300013842625e-07, + "loss": 0.7246989, + "num_input_tokens_seen": 273816515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12695, + "time_per_iteration": 2.436309576034546 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01029968, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.03454077, + "epoch": 0.7633248158725388, + "flos": 23114011357440.0, + "grad_norm": 1.4700012541303298, + "language_loss": 0.72275102, + "learning_rate": 5.592598473796985e-07, + "loss": 0.74406242, + "num_input_tokens_seen": 273837060, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 12696, + "time_per_iteration": 2.473132371902466 + }, + { + "auxiliary_loss_clip": 0.01101016, + "auxiliary_loss_mlp": 0.01033584, + "balance_loss_clip": 1.02113307, + "balance_loss_mlp": 1.03426933, + "epoch": 0.7633849391252067, + "flos": 10889839952640.0, + "grad_norm": 2.376546359844648, + "language_loss": 0.71416759, + "learning_rate": 5.589897480081453e-07, + "loss": 0.73551357, + "num_input_tokens_seen": 273853365, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66796875, + "step": 12697, + "time_per_iteration": 2.397484064102173 + }, + { + "auxiliary_loss_clip": 0.01103465, + "auxiliary_loss_mlp": 0.01030875, + "balance_loss_clip": 1.0194068, + "balance_loss_mlp": 1.03697562, + "epoch": 0.7634450623778747, + "flos": 20994168355200.0, + "grad_norm": 1.8987571133249672, + "language_loss": 0.66587389, + "learning_rate": 5.587197032798461e-07, + "loss": 0.6872173, + "num_input_tokens_seen": 273870750, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12698, + "time_per_iteration": 2.4368910789489746 + }, + { + "auxiliary_loss_clip": 0.01099721, + "auxiliary_loss_mlp": 0.01028437, + "balance_loss_clip": 1.01665354, + "balance_loss_mlp": 1.03326559, + "epoch": 0.7635051856305426, + "flos": 18882046776960.0, + "grad_norm": 1.574933939339682, + "language_loss": 0.72529495, + "learning_rate": 5.5844971320504e-07, + "loss": 0.74657655, + "num_input_tokens_seen": 273890890, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12699, + "time_per_iteration": 2.449216842651367 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.02182543, + "balance_loss_mlp": 1.03466082, + "epoch": 0.7635653088832106, + "flos": 34786989584640.0, + "grad_norm": 1.9214661095744658, + "language_loss": 0.73283732, + "learning_rate": 5.581797777939648e-07, + "loss": 0.75416017, + "num_input_tokens_seen": 273914015, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 12700, + "time_per_iteration": 2.614281177520752 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.01029995, + "balance_loss_clip": 1.01834226, + "balance_loss_mlp": 1.03391504, + "epoch": 0.7636254321358785, + "flos": 23178434400000.0, + "grad_norm": 3.347573177390183, + "language_loss": 0.68935323, + "learning_rate": 5.579098970568574e-07, + "loss": 0.71066546, + "num_input_tokens_seen": 273927415, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 12701, + "time_per_iteration": 2.407780170440674 + }, + { + "auxiliary_loss_clip": 0.01102757, + "auxiliary_loss_mlp": 0.01030007, + "balance_loss_clip": 1.01846802, + "balance_loss_mlp": 1.0361433, + "epoch": 0.7636855553885465, + "flos": 21325229032320.0, + "grad_norm": 2.4924220366961145, + "language_loss": 0.64379907, + "learning_rate": 5.576400710039508e-07, + "loss": 0.66512668, + "num_input_tokens_seen": 273946690, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12702, + "time_per_iteration": 2.444377899169922 + }, + { + "auxiliary_loss_clip": 0.01101798, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.02000129, + "balance_loss_mlp": 1.0348711, + "epoch": 0.7637456786412145, + "flos": 28658079849600.0, + "grad_norm": 1.9476964019276684, + "language_loss": 0.65595478, + "learning_rate": 5.57370299645477e-07, + "loss": 0.67728704, + "num_input_tokens_seen": 273966870, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 12703, + "time_per_iteration": 2.4628920555114746 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01023498, + "balance_loss_clip": 1.01204157, + "balance_loss_mlp": 1.03720379, + "epoch": 0.7638058018938825, + "flos": 21907269014400.0, + "grad_norm": 1.7669844217588608, + "language_loss": 0.83665591, + "learning_rate": 5.571005829916668e-07, + "loss": 0.85792065, + "num_input_tokens_seen": 273986360, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 12704, + "time_per_iteration": 2.448728561401367 + }, + { + "auxiliary_loss_clip": 0.01104257, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.01903248, + "balance_loss_mlp": 1.03712642, + "epoch": 0.7638659251465504, + "flos": 29643899592960.0, + "grad_norm": 1.3938959354870066, + "language_loss": 0.67689544, + "learning_rate": 5.568309210527469e-07, + "loss": 0.69824535, + "num_input_tokens_seen": 274009745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 12705, + "time_per_iteration": 2.4803051948547363 + }, + { + "auxiliary_loss_clip": 0.01100722, + "auxiliary_loss_mlp": 0.0102598, + "balance_loss_clip": 1.01429725, + "balance_loss_mlp": 1.03554821, + "epoch": 0.7639260483992184, + "flos": 26141172929280.0, + "grad_norm": 1.691823975978675, + "language_loss": 0.74275041, + "learning_rate": 5.565613138389427e-07, + "loss": 0.7640174, + "num_input_tokens_seen": 274028775, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12706, + "time_per_iteration": 2.4732961654663086 + }, + { + "auxiliary_loss_clip": 0.01102547, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.01650715, + "balance_loss_mlp": 1.03575993, + "epoch": 0.7639861716518863, + "flos": 20156695781760.0, + "grad_norm": 1.8728916449529083, + "language_loss": 0.7829448, + "learning_rate": 5.562917613604781e-07, + "loss": 0.80425096, + "num_input_tokens_seen": 274047520, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 12707, + "time_per_iteration": 2.44852352142334 + }, + { + "auxiliary_loss_clip": 0.01100823, + "auxiliary_loss_mlp": 0.01025401, + "balance_loss_clip": 1.01379025, + "balance_loss_mlp": 1.03446913, + "epoch": 0.7640462949045543, + "flos": 18583125793920.0, + "grad_norm": 1.8180584415058063, + "language_loss": 0.79873604, + "learning_rate": 5.560222636275751e-07, + "loss": 0.81999826, + "num_input_tokens_seen": 274065350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12708, + "time_per_iteration": 2.427623987197876 + }, + { + "auxiliary_loss_clip": 0.01024337, + "auxiliary_loss_mlp": 0.01003138, + "balance_loss_clip": 1.00209475, + "balance_loss_mlp": 1.00414193, + "epoch": 0.7641064181572224, + "flos": 68321991646080.0, + "grad_norm": 0.8188309305581064, + "language_loss": 0.56423205, + "learning_rate": 5.557528206504521e-07, + "loss": 0.58450681, + "num_input_tokens_seen": 274122315, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20214844, + "step": 12709, + "time_per_iteration": 3.0471227169036865 + }, + { + "auxiliary_loss_clip": 0.01106792, + "auxiliary_loss_mlp": 0.01031993, + "balance_loss_clip": 1.01926708, + "balance_loss_mlp": 1.03640699, + "epoch": 0.7641665414098903, + "flos": 17968982031360.0, + "grad_norm": 1.7746105056549126, + "language_loss": 0.63412935, + "learning_rate": 5.554834324393271e-07, + "loss": 0.65551722, + "num_input_tokens_seen": 274140555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 12710, + "time_per_iteration": 2.436523199081421 + }, + { + "auxiliary_loss_clip": 0.01104937, + "auxiliary_loss_mlp": 0.01030515, + "balance_loss_clip": 1.01748586, + "balance_loss_mlp": 1.03611827, + "epoch": 0.7642266646625583, + "flos": 21252078984960.0, + "grad_norm": 2.4696813182375994, + "language_loss": 0.64710927, + "learning_rate": 5.552140990044154e-07, + "loss": 0.66846383, + "num_input_tokens_seen": 274161125, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12711, + "time_per_iteration": 2.413130760192871 + }, + { + "auxiliary_loss_clip": 0.01100872, + "auxiliary_loss_mlp": 0.01032613, + "balance_loss_clip": 1.02089453, + "balance_loss_mlp": 1.03438199, + "epoch": 0.7642867879152262, + "flos": 22747794243840.0, + "grad_norm": 1.5961757403151435, + "language_loss": 0.72854543, + "learning_rate": 5.549448203559293e-07, + "loss": 0.74988031, + "num_input_tokens_seen": 274180835, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12712, + "time_per_iteration": 2.4923083782196045 + }, + { + "auxiliary_loss_clip": 0.01100743, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.01644421, + "balance_loss_mlp": 1.03588057, + "epoch": 0.7643469111678942, + "flos": 23332132696320.0, + "grad_norm": 1.512862256571613, + "language_loss": 0.8010205, + "learning_rate": 5.546755965040804e-07, + "loss": 0.82230103, + "num_input_tokens_seen": 274201190, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 12713, + "time_per_iteration": 2.4570553302764893 + }, + { + "auxiliary_loss_clip": 0.01104482, + "auxiliary_loss_mlp": 0.01029802, + "balance_loss_clip": 1.01756477, + "balance_loss_mlp": 1.03663445, + "epoch": 0.7644070344205621, + "flos": 19857092440320.0, + "grad_norm": 2.127063992718731, + "language_loss": 0.83558553, + "learning_rate": 5.544064274590776e-07, + "loss": 0.85692835, + "num_input_tokens_seen": 274217595, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 12714, + "time_per_iteration": 2.4317142963409424 + }, + { + "auxiliary_loss_clip": 0.01105545, + "auxiliary_loss_mlp": 0.01038361, + "balance_loss_clip": 1.0259099, + "balance_loss_mlp": 1.03701067, + "epoch": 0.7644671576732301, + "flos": 22090628966400.0, + "grad_norm": 1.592380808570538, + "language_loss": 0.72868395, + "learning_rate": 5.541373132311287e-07, + "loss": 0.75012302, + "num_input_tokens_seen": 274237885, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 12715, + "time_per_iteration": 2.43247389793396 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.0102946, + "balance_loss_clip": 1.01739025, + "balance_loss_mlp": 1.03394234, + "epoch": 0.7645272809258981, + "flos": 25481421872640.0, + "grad_norm": 1.7023765879093384, + "language_loss": 0.63293636, + "learning_rate": 5.538682538304376e-07, + "loss": 0.65423584, + "num_input_tokens_seen": 274258820, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 12716, + "time_per_iteration": 2.519078016281128 + }, + { + "auxiliary_loss_clip": 0.0110555, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.02227485, + "balance_loss_mlp": 1.03597593, + "epoch": 0.7645874041785661, + "flos": 21541877913600.0, + "grad_norm": 1.4875164699453862, + "language_loss": 0.79791009, + "learning_rate": 5.535992492672068e-07, + "loss": 0.81931472, + "num_input_tokens_seen": 274278835, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 12717, + "time_per_iteration": 2.429151773452759 + }, + { + "auxiliary_loss_clip": 0.01102712, + "auxiliary_loss_mlp": 0.01033337, + "balance_loss_clip": 1.0218451, + "balance_loss_mlp": 1.03612757, + "epoch": 0.764647527431234, + "flos": 20630896156800.0, + "grad_norm": 2.2673772679539486, + "language_loss": 0.66456509, + "learning_rate": 5.53330299551638e-07, + "loss": 0.6859256, + "num_input_tokens_seen": 274297110, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 12718, + "time_per_iteration": 2.5357375144958496 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01031904, + "balance_loss_clip": 1.02124047, + "balance_loss_mlp": 1.03456593, + "epoch": 0.764707650683902, + "flos": 21434074220160.0, + "grad_norm": 1.8849716661729419, + "language_loss": 0.77913976, + "learning_rate": 5.530614046939286e-07, + "loss": 0.8004452, + "num_input_tokens_seen": 274315610, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12719, + "time_per_iteration": 3.9749484062194824 + }, + { + "auxiliary_loss_clip": 0.01102309, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01369143, + "balance_loss_mlp": 1.03523517, + "epoch": 0.7647677739365699, + "flos": 22711201263360.0, + "grad_norm": 1.683095995258743, + "language_loss": 0.69655412, + "learning_rate": 5.527925647042754e-07, + "loss": 0.71783549, + "num_input_tokens_seen": 274333975, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 12720, + "time_per_iteration": 2.568380117416382 + }, + { + "auxiliary_loss_clip": 0.01102352, + "auxiliary_loss_mlp": 0.01031499, + "balance_loss_clip": 1.01990581, + "balance_loss_mlp": 1.03623235, + "epoch": 0.7648278971892379, + "flos": 21324115710720.0, + "grad_norm": 1.6712048084567594, + "language_loss": 0.73724437, + "learning_rate": 5.52523779592875e-07, + "loss": 0.75858283, + "num_input_tokens_seen": 274353695, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12721, + "time_per_iteration": 3.8811776638031006 + }, + { + "auxiliary_loss_clip": 0.01101929, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01676273, + "balance_loss_mlp": 1.03572047, + "epoch": 0.764888020441906, + "flos": 20667345482880.0, + "grad_norm": 1.8878016684824361, + "language_loss": 0.73512298, + "learning_rate": 5.522550493699163e-07, + "loss": 0.75642979, + "num_input_tokens_seen": 274371120, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 12722, + "time_per_iteration": 3.989180564880371 + }, + { + "auxiliary_loss_clip": 0.01101721, + "auxiliary_loss_mlp": 0.0103481, + "balance_loss_clip": 1.02355647, + "balance_loss_mlp": 1.03481197, + "epoch": 0.7649481436945739, + "flos": 25082526360960.0, + "grad_norm": 1.7865929213753133, + "language_loss": 0.7357918, + "learning_rate": 5.519863740455912e-07, + "loss": 0.75715715, + "num_input_tokens_seen": 274389665, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 12723, + "time_per_iteration": 2.5361814498901367 + }, + { + "auxiliary_loss_clip": 0.01101913, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.01642966, + "balance_loss_mlp": 1.03334272, + "epoch": 0.7650082669472419, + "flos": 24900890261760.0, + "grad_norm": 1.812040578255397, + "language_loss": 0.73211122, + "learning_rate": 5.517177536300881e-07, + "loss": 0.7534129, + "num_input_tokens_seen": 274408750, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 12724, + "time_per_iteration": 3.9343338012695312 + }, + { + "auxiliary_loss_clip": 0.01099657, + "auxiliary_loss_mlp": 0.01024123, + "balance_loss_clip": 1.01270843, + "balance_loss_mlp": 1.03587949, + "epoch": 0.7650683901999098, + "flos": 14647388676480.0, + "grad_norm": 1.9420758894203383, + "language_loss": 0.8370254, + "learning_rate": 5.514491881335935e-07, + "loss": 0.85826313, + "num_input_tokens_seen": 274424600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 12725, + "time_per_iteration": 2.378312587738037 + }, + { + "auxiliary_loss_clip": 0.01102, + "auxiliary_loss_mlp": 0.01032852, + "balance_loss_clip": 1.02064514, + "balance_loss_mlp": 1.03584003, + "epoch": 0.7651285134525778, + "flos": 26352434770560.0, + "grad_norm": 1.7077077280444313, + "language_loss": 0.77513289, + "learning_rate": 5.511806775662901e-07, + "loss": 0.79648137, + "num_input_tokens_seen": 274443075, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 12726, + "time_per_iteration": 2.489109992980957 + }, + { + "auxiliary_loss_clip": 0.01103068, + "auxiliary_loss_mlp": 0.0103319, + "balance_loss_clip": 1.02161503, + "balance_loss_mlp": 1.03531957, + "epoch": 0.7651886367052457, + "flos": 26646866553600.0, + "grad_norm": 1.5743856699934278, + "language_loss": 0.7073437, + "learning_rate": 5.509122219383615e-07, + "loss": 0.7287063, + "num_input_tokens_seen": 274463240, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6796875, + "step": 12727, + "time_per_iteration": 2.4679818153381348 + }, + { + "auxiliary_loss_clip": 0.01097102, + "auxiliary_loss_mlp": 0.01024446, + "balance_loss_clip": 1.01324618, + "balance_loss_mlp": 1.03295493, + "epoch": 0.7652487599579137, + "flos": 25702847262720.0, + "grad_norm": 1.683634898596646, + "language_loss": 0.79648662, + "learning_rate": 5.506438212599864e-07, + "loss": 0.81770217, + "num_input_tokens_seen": 274482750, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 12728, + "time_per_iteration": 2.5594372749328613 + }, + { + "auxiliary_loss_clip": 0.01104269, + "auxiliary_loss_mlp": 0.01029906, + "balance_loss_clip": 1.01803839, + "balance_loss_mlp": 1.03638935, + "epoch": 0.7653088832105817, + "flos": 28585576247040.0, + "grad_norm": 1.9251474152175339, + "language_loss": 0.55158925, + "learning_rate": 5.503754755413424e-07, + "loss": 0.57293093, + "num_input_tokens_seen": 274503545, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 12729, + "time_per_iteration": 2.4821853637695312 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01029418, + "balance_loss_clip": 1.01739013, + "balance_loss_mlp": 1.03482783, + "epoch": 0.7653690064632497, + "flos": 23366750428800.0, + "grad_norm": 2.177670439939341, + "language_loss": 0.77752316, + "learning_rate": 5.501071847926055e-07, + "loss": 0.79883277, + "num_input_tokens_seen": 274523825, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 12730, + "time_per_iteration": 2.4903311729431152 + }, + { + "auxiliary_loss_clip": 0.01107568, + "auxiliary_loss_mlp": 0.01037906, + "balance_loss_clip": 1.02540636, + "balance_loss_mlp": 1.03940296, + "epoch": 0.7654291297159176, + "flos": 15773905992960.0, + "grad_norm": 2.6215650166042854, + "language_loss": 0.68980086, + "learning_rate": 5.498389490239495e-07, + "loss": 0.71125555, + "num_input_tokens_seen": 274541625, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 12731, + "time_per_iteration": 2.4075534343719482 + }, + { + "auxiliary_loss_clip": 0.01103331, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.01775599, + "balance_loss_mlp": 1.03526866, + "epoch": 0.7654892529685856, + "flos": 18033800123520.0, + "grad_norm": 2.0647561779987598, + "language_loss": 0.69921666, + "learning_rate": 5.495707682455471e-07, + "loss": 0.72054529, + "num_input_tokens_seen": 274557580, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 12732, + "time_per_iteration": 2.4208905696868896 + }, + { + "auxiliary_loss_clip": 0.01103869, + "auxiliary_loss_mlp": 0.01029535, + "balance_loss_clip": 1.01700664, + "balance_loss_mlp": 1.03542268, + "epoch": 0.7655493762212535, + "flos": 27236017428480.0, + "grad_norm": 1.4500082329987547, + "language_loss": 0.78334171, + "learning_rate": 5.493026424675653e-07, + "loss": 0.8046757, + "num_input_tokens_seen": 274578135, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 12733, + "time_per_iteration": 2.4912784099578857 + }, + { + "auxiliary_loss_clip": 0.01101688, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.02012134, + "balance_loss_mlp": 1.03670192, + "epoch": 0.7656094994739215, + "flos": 20773964027520.0, + "grad_norm": 1.7158100423573102, + "language_loss": 0.77660191, + "learning_rate": 5.490345717001726e-07, + "loss": 0.79793251, + "num_input_tokens_seen": 274595655, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 12734, + "time_per_iteration": 2.500473737716675 + }, + { + "auxiliary_loss_clip": 0.01105167, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01641846, + "balance_loss_mlp": 1.03554702, + "epoch": 0.7656696227265896, + "flos": 23039245198080.0, + "grad_norm": 1.8249591988641765, + "language_loss": 0.72925597, + "learning_rate": 5.48766555953535e-07, + "loss": 0.7505976, + "num_input_tokens_seen": 274616305, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 12735, + "time_per_iteration": 2.477151870727539 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01031337, + "balance_loss_clip": 1.01950526, + "balance_loss_mlp": 1.03448582, + "epoch": 0.7657297459792575, + "flos": 27525636789120.0, + "grad_norm": 1.762755938447221, + "language_loss": 0.72515297, + "learning_rate": 5.484985952378145e-07, + "loss": 0.7464757, + "num_input_tokens_seen": 274638110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12736, + "time_per_iteration": 2.5486631393432617 + }, + { + "auxiliary_loss_clip": 0.01107585, + "auxiliary_loss_mlp": 0.01036525, + "balance_loss_clip": 1.02288127, + "balance_loss_mlp": 1.03783011, + "epoch": 0.7657898692319255, + "flos": 17128456801920.0, + "grad_norm": 1.7485103277952745, + "language_loss": 0.77891874, + "learning_rate": 5.482306895631728e-07, + "loss": 0.80035985, + "num_input_tokens_seen": 274656565, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69921875, + "step": 12737, + "time_per_iteration": 2.4112277030944824 + }, + { + "auxiliary_loss_clip": 0.01101521, + "auxiliary_loss_mlp": 0.01028185, + "balance_loss_clip": 1.01596594, + "balance_loss_mlp": 1.0340569, + "epoch": 0.7658499924845934, + "flos": 21465747037440.0, + "grad_norm": 1.6956859838498979, + "language_loss": 0.76673079, + "learning_rate": 5.479628389397699e-07, + "loss": 0.78802776, + "num_input_tokens_seen": 274674215, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 12738, + "time_per_iteration": 2.4841501712799072 + }, + { + "auxiliary_loss_clip": 0.01104744, + "auxiliary_loss_mlp": 0.01029004, + "balance_loss_clip": 1.01677346, + "balance_loss_mlp": 1.03617144, + "epoch": 0.7659101157372614, + "flos": 29496665744640.0, + "grad_norm": 1.8494809749417094, + "language_loss": 0.62757778, + "learning_rate": 5.476950433777603e-07, + "loss": 0.64891523, + "num_input_tokens_seen": 274693445, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 12739, + "time_per_iteration": 2.5342459678649902 + }, + { + "auxiliary_loss_clip": 0.01104187, + "auxiliary_loss_mlp": 0.01034147, + "balance_loss_clip": 1.02121282, + "balance_loss_mlp": 1.03702021, + "epoch": 0.7659702389899293, + "flos": 18551812112640.0, + "grad_norm": 1.9457756189181725, + "language_loss": 0.79532218, + "learning_rate": 5.474273028873004e-07, + "loss": 0.81670547, + "num_input_tokens_seen": 274712815, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12740, + "time_per_iteration": 2.464242458343506 + }, + { + "auxiliary_loss_clip": 0.01101878, + "auxiliary_loss_mlp": 0.0102987, + "balance_loss_clip": 1.01806879, + "balance_loss_mlp": 1.03497076, + "epoch": 0.7660303622425974, + "flos": 23549176627200.0, + "grad_norm": 1.8538704286256995, + "language_loss": 0.65541816, + "learning_rate": 5.471596174785429e-07, + "loss": 0.67673558, + "num_input_tokens_seen": 274732690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 12741, + "time_per_iteration": 2.6027071475982666 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.0102683, + "balance_loss_clip": 1.01482606, + "balance_loss_mlp": 1.03617609, + "epoch": 0.7660904854952653, + "flos": 18916736336640.0, + "grad_norm": 1.883849175475749, + "language_loss": 0.75741291, + "learning_rate": 5.468919871616386e-07, + "loss": 0.77869809, + "num_input_tokens_seen": 274752460, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 12742, + "time_per_iteration": 2.483158588409424 + }, + { + "auxiliary_loss_clip": 0.0109981, + "auxiliary_loss_mlp": 0.01028671, + "balance_loss_clip": 1.01749492, + "balance_loss_mlp": 1.03572869, + "epoch": 0.7661506087479333, + "flos": 23147515768320.0, + "grad_norm": 1.3603011041168136, + "language_loss": 0.76397032, + "learning_rate": 5.46624411946736e-07, + "loss": 0.78525507, + "num_input_tokens_seen": 274773070, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 12743, + "time_per_iteration": 2.4432547092437744 + }, + { + "auxiliary_loss_clip": 0.01100504, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.0345211, + "epoch": 0.7662107320006012, + "flos": 17565776887680.0, + "grad_norm": 1.9126072304780652, + "language_loss": 0.749053, + "learning_rate": 5.463568918439805e-07, + "loss": 0.77033567, + "num_input_tokens_seen": 274790220, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12744, + "time_per_iteration": 2.4553682804107666 + }, + { + "auxiliary_loss_clip": 0.01103322, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.01541495, + "balance_loss_mlp": 1.03609204, + "epoch": 0.7662708552532692, + "flos": 22303075956480.0, + "grad_norm": 2.243657219575693, + "language_loss": 0.70895386, + "learning_rate": 5.460894268635181e-07, + "loss": 0.73026311, + "num_input_tokens_seen": 274805095, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 12745, + "time_per_iteration": 2.4222021102905273 + }, + { + "auxiliary_loss_clip": 0.01102421, + "auxiliary_loss_mlp": 0.0103259, + "balance_loss_clip": 1.02016246, + "balance_loss_mlp": 1.03536963, + "epoch": 0.7663309785059371, + "flos": 15742053607680.0, + "grad_norm": 2.2580014777322264, + "language_loss": 0.7671814, + "learning_rate": 5.458220170154896e-07, + "loss": 0.78853154, + "num_input_tokens_seen": 274821800, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 12746, + "time_per_iteration": 2.4328715801239014 + }, + { + "auxiliary_loss_clip": 0.01024805, + "auxiliary_loss_mlp": 0.01002921, + "balance_loss_clip": 1.00194991, + "balance_loss_mlp": 1.00455523, + "epoch": 0.7663911017586051, + "flos": 62163312514560.0, + "grad_norm": 0.6617058093404249, + "language_loss": 0.56800187, + "learning_rate": 5.455546623100362e-07, + "loss": 0.58827913, + "num_input_tokens_seen": 274886970, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20214844, + "step": 12747, + "time_per_iteration": 3.0698306560516357 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.01804721, + "balance_loss_mlp": 1.03344798, + "epoch": 0.7664512250112732, + "flos": 26506025326080.0, + "grad_norm": 1.7111315539475358, + "language_loss": 0.72324377, + "learning_rate": 5.452873627572956e-07, + "loss": 0.74450713, + "num_input_tokens_seen": 274907240, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 12748, + "time_per_iteration": 2.506683588027954 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01763535, + "balance_loss_mlp": 1.03435397, + "epoch": 0.7665113482639411, + "flos": 16249542912000.0, + "grad_norm": 3.145698976514515, + "language_loss": 0.6893121, + "learning_rate": 5.450201183674052e-07, + "loss": 0.71061194, + "num_input_tokens_seen": 274924650, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 12749, + "time_per_iteration": 2.389932155609131 + }, + { + "auxiliary_loss_clip": 0.01101373, + "auxiliary_loss_mlp": 0.01026386, + "balance_loss_clip": 1.01423216, + "balance_loss_mlp": 1.034747, + "epoch": 0.7665714715166091, + "flos": 27197880163200.0, + "grad_norm": 1.5718921115117155, + "language_loss": 0.73633575, + "learning_rate": 5.447529291504967e-07, + "loss": 0.75761336, + "num_input_tokens_seen": 274944550, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12750, + "time_per_iteration": 2.5167572498321533 + }, + { + "auxiliary_loss_clip": 0.01098567, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.01658773, + "balance_loss_mlp": 1.0340786, + "epoch": 0.766631594769277, + "flos": 21067785279360.0, + "grad_norm": 3.4547507974534937, + "language_loss": 0.75537312, + "learning_rate": 5.444857951167026e-07, + "loss": 0.77663291, + "num_input_tokens_seen": 274961330, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12751, + "time_per_iteration": 2.476710081100464 + }, + { + "auxiliary_loss_clip": 0.01103164, + "auxiliary_loss_mlp": 0.01036055, + "balance_loss_clip": 1.0242238, + "balance_loss_mlp": 1.03732014, + "epoch": 0.766691718021945, + "flos": 24097963593600.0, + "grad_norm": 2.104179028478291, + "language_loss": 0.61111033, + "learning_rate": 5.442187162761537e-07, + "loss": 0.6325025, + "num_input_tokens_seen": 274981655, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 12752, + "time_per_iteration": 2.483185291290283 + }, + { + "auxiliary_loss_clip": 0.01103162, + "auxiliary_loss_mlp": 0.01033869, + "balance_loss_clip": 1.0219605, + "balance_loss_mlp": 1.03612447, + "epoch": 0.7667518412746129, + "flos": 23440654661760.0, + "grad_norm": 1.7425308356363913, + "language_loss": 0.69364887, + "learning_rate": 5.439516926389767e-07, + "loss": 0.71501917, + "num_input_tokens_seen": 274999970, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12753, + "time_per_iteration": 2.462432384490967 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01036325, + "balance_loss_clip": 1.02463651, + "balance_loss_mlp": 1.03598034, + "epoch": 0.766811964527281, + "flos": 18148786536960.0, + "grad_norm": 2.935870889400166, + "language_loss": 0.62185645, + "learning_rate": 5.436847242152971e-07, + "loss": 0.64324296, + "num_input_tokens_seen": 275015805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12754, + "time_per_iteration": 2.518746852874756 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01027471, + "balance_loss_clip": 1.01619387, + "balance_loss_mlp": 1.03773046, + "epoch": 0.7668720877799489, + "flos": 19536051657600.0, + "grad_norm": 2.3055221195996065, + "language_loss": 0.79792452, + "learning_rate": 5.434178110152401e-07, + "loss": 0.81923139, + "num_input_tokens_seen": 275031810, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 12755, + "time_per_iteration": 2.4429805278778076 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.01758885, + "balance_loss_mlp": 1.03660679, + "epoch": 0.7669322110326169, + "flos": 22674320974080.0, + "grad_norm": 1.7360812888518318, + "language_loss": 0.70129168, + "learning_rate": 5.431509530489242e-07, + "loss": 0.7225951, + "num_input_tokens_seen": 275049325, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 12756, + "time_per_iteration": 2.4959518909454346 + }, + { + "auxiliary_loss_clip": 0.01101968, + "auxiliary_loss_mlp": 0.01034868, + "balance_loss_clip": 1.02353144, + "balance_loss_mlp": 1.03610778, + "epoch": 0.7669923342852848, + "flos": 26469396432000.0, + "grad_norm": 2.2706673014793766, + "language_loss": 0.70277941, + "learning_rate": 5.428841503264706e-07, + "loss": 0.7241478, + "num_input_tokens_seen": 275070865, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 12757, + "time_per_iteration": 2.503958225250244 + }, + { + "auxiliary_loss_clip": 0.01103409, + "auxiliary_loss_mlp": 0.01033303, + "balance_loss_clip": 1.02101266, + "balance_loss_mlp": 1.03675115, + "epoch": 0.7670524575379528, + "flos": 22856136641280.0, + "grad_norm": 1.9695287063261235, + "language_loss": 0.75929737, + "learning_rate": 5.426174028579955e-07, + "loss": 0.78066456, + "num_input_tokens_seen": 275088015, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12758, + "time_per_iteration": 2.490203857421875 + }, + { + "auxiliary_loss_clip": 0.01098808, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.02119648, + "balance_loss_mlp": 1.03469872, + "epoch": 0.7671125807906207, + "flos": 22452141398400.0, + "grad_norm": 1.6224114327929111, + "language_loss": 0.76120728, + "learning_rate": 5.423507106536156e-07, + "loss": 0.7825197, + "num_input_tokens_seen": 275106975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 12759, + "time_per_iteration": 2.462779998779297 + }, + { + "auxiliary_loss_clip": 0.0109933, + "auxiliary_loss_mlp": 0.01026965, + "balance_loss_clip": 1.01611102, + "balance_loss_mlp": 1.03285909, + "epoch": 0.7671727040432887, + "flos": 35371543518720.0, + "grad_norm": 2.0831597822945738, + "language_loss": 0.68447405, + "learning_rate": 5.420840737234425e-07, + "loss": 0.70573699, + "num_input_tokens_seen": 275129560, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 12760, + "time_per_iteration": 2.66218900680542 + }, + { + "auxiliary_loss_clip": 0.01102506, + "auxiliary_loss_mlp": 0.01029066, + "balance_loss_clip": 1.01719248, + "balance_loss_mlp": 1.03628325, + "epoch": 0.7672328272959568, + "flos": 22494947431680.0, + "grad_norm": 1.530930371771359, + "language_loss": 0.79041481, + "learning_rate": 5.418174920775871e-07, + "loss": 0.8117305, + "num_input_tokens_seen": 275151180, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12761, + "time_per_iteration": 3.9318642616271973 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.0102753, + "balance_loss_clip": 1.0160147, + "balance_loss_mlp": 1.03551531, + "epoch": 0.7672929505486247, + "flos": 22815557251200.0, + "grad_norm": 1.7398225752644456, + "language_loss": 0.66273689, + "learning_rate": 5.415509657261589e-07, + "loss": 0.68400806, + "num_input_tokens_seen": 275170605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12762, + "time_per_iteration": 2.423274040222168 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01513004, + "balance_loss_mlp": 1.03505349, + "epoch": 0.7673530738012927, + "flos": 20338834671360.0, + "grad_norm": 1.6795407868504282, + "language_loss": 0.73981798, + "learning_rate": 5.412844946792639e-07, + "loss": 0.76111412, + "num_input_tokens_seen": 275188750, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12763, + "time_per_iteration": 3.845613718032837 + }, + { + "auxiliary_loss_clip": 0.01102131, + "auxiliary_loss_mlp": 0.01032406, + "balance_loss_clip": 1.02071738, + "balance_loss_mlp": 1.0367074, + "epoch": 0.7674131970539606, + "flos": 34933576988160.0, + "grad_norm": 1.585918390915768, + "language_loss": 0.70586705, + "learning_rate": 5.410180789470067e-07, + "loss": 0.72721243, + "num_input_tokens_seen": 275211365, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12764, + "time_per_iteration": 3.981903314590454 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.01529598, + "balance_loss_mlp": 1.03549826, + "epoch": 0.7674733203066286, + "flos": 28328850766080.0, + "grad_norm": 1.533836649562743, + "language_loss": 0.69619727, + "learning_rate": 5.40751718539491e-07, + "loss": 0.71747363, + "num_input_tokens_seen": 275231670, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 12765, + "time_per_iteration": 2.4988484382629395 + }, + { + "auxiliary_loss_clip": 0.01097446, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.02000558, + "balance_loss_mlp": 1.03249931, + "epoch": 0.7675334435592965, + "flos": 16289727252480.0, + "grad_norm": 1.7341921361954618, + "language_loss": 0.60877311, + "learning_rate": 5.404854134668162e-07, + "loss": 0.63005078, + "num_input_tokens_seen": 275249425, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.65234375, + "step": 12766, + "time_per_iteration": 3.856095790863037 + }, + { + "auxiliary_loss_clip": 0.01024204, + "auxiliary_loss_mlp": 0.01001208, + "balance_loss_clip": 1.00024879, + "balance_loss_mlp": 1.00405478, + "epoch": 0.7675935668119646, + "flos": 64826232220800.0, + "grad_norm": 0.7388978362538794, + "language_loss": 0.60806286, + "learning_rate": 5.402191637390803e-07, + "loss": 0.628317, + "num_input_tokens_seen": 275312485, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 12767, + "time_per_iteration": 3.1863934993743896 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.01363397, + "balance_loss_mlp": 1.03486204, + "epoch": 0.7676536900646325, + "flos": 22675398382080.0, + "grad_norm": 1.9841724465329964, + "language_loss": 0.69505453, + "learning_rate": 5.399529693663801e-07, + "loss": 0.71628356, + "num_input_tokens_seen": 275331680, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 12768, + "time_per_iteration": 2.423121452331543 + }, + { + "auxiliary_loss_clip": 0.01107565, + "auxiliary_loss_mlp": 0.01034083, + "balance_loss_clip": 1.02206123, + "balance_loss_mlp": 1.03830612, + "epoch": 0.7677138133173005, + "flos": 26939682224640.0, + "grad_norm": 1.9774662095092985, + "language_loss": 0.70799577, + "learning_rate": 5.3968683035881e-07, + "loss": 0.7294122, + "num_input_tokens_seen": 275351615, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6953125, + "step": 12769, + "time_per_iteration": 2.4947516918182373 + }, + { + "auxiliary_loss_clip": 0.01103148, + "auxiliary_loss_mlp": 0.01026944, + "balance_loss_clip": 1.0148201, + "balance_loss_mlp": 1.03540611, + "epoch": 0.7677739365699684, + "flos": 23799545400960.0, + "grad_norm": 1.823298760542139, + "language_loss": 0.80289495, + "learning_rate": 5.394207467264611e-07, + "loss": 0.82419586, + "num_input_tokens_seen": 275368815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 12770, + "time_per_iteration": 2.4479711055755615 + }, + { + "auxiliary_loss_clip": 0.01098048, + "auxiliary_loss_mlp": 0.01030309, + "balance_loss_clip": 1.01986027, + "balance_loss_mlp": 1.0342977, + "epoch": 0.7678340598226364, + "flos": 34455497944320.0, + "grad_norm": 1.520087647586923, + "language_loss": 0.78579485, + "learning_rate": 5.391547184794245e-07, + "loss": 0.80707848, + "num_input_tokens_seen": 275389345, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12771, + "time_per_iteration": 2.5589637756347656 + }, + { + "auxiliary_loss_clip": 0.01100406, + "auxiliary_loss_mlp": 0.01027097, + "balance_loss_clip": 1.01595068, + "balance_loss_mlp": 1.03527427, + "epoch": 0.7678941830753043, + "flos": 23841740903040.0, + "grad_norm": 1.305591039584481, + "language_loss": 0.68094563, + "learning_rate": 5.388887456277876e-07, + "loss": 0.70222068, + "num_input_tokens_seen": 275411240, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 12772, + "time_per_iteration": 2.443350076675415 + }, + { + "auxiliary_loss_clip": 0.01097286, + "auxiliary_loss_mlp": 0.01023769, + "balance_loss_clip": 1.01294541, + "balance_loss_mlp": 1.03465271, + "epoch": 0.7679543063279723, + "flos": 25410929431680.0, + "grad_norm": 1.6667227683698287, + "language_loss": 0.73345917, + "learning_rate": 5.386228281816349e-07, + "loss": 0.75466973, + "num_input_tokens_seen": 275432010, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 12773, + "time_per_iteration": 2.5177359580993652 + }, + { + "auxiliary_loss_clip": 0.01097604, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01624548, + "balance_loss_mlp": 1.03416824, + "epoch": 0.7680144295806404, + "flos": 27962382257280.0, + "grad_norm": 1.8287819749313907, + "language_loss": 0.8077029, + "learning_rate": 5.383569661510512e-07, + "loss": 0.82894701, + "num_input_tokens_seen": 275453710, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 12774, + "time_per_iteration": 2.4638662338256836 + }, + { + "auxiliary_loss_clip": 0.01102122, + "auxiliary_loss_mlp": 0.01030311, + "balance_loss_clip": 1.01865244, + "balance_loss_mlp": 1.03650451, + "epoch": 0.7680745528333083, + "flos": 20412810731520.0, + "grad_norm": 1.5141235793881351, + "language_loss": 0.6951592, + "learning_rate": 5.380911595461177e-07, + "loss": 0.71648353, + "num_input_tokens_seen": 275472915, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 12775, + "time_per_iteration": 2.529325246810913 + }, + { + "auxiliary_loss_clip": 0.01024296, + "auxiliary_loss_mlp": 0.0099915, + "balance_loss_clip": 0.9981491, + "balance_loss_mlp": 1.00394726, + "epoch": 0.7681346760859763, + "flos": 68401103351040.0, + "grad_norm": 0.6956565563059588, + "language_loss": 0.56836295, + "learning_rate": 5.378254083769147e-07, + "loss": 0.58859742, + "num_input_tokens_seen": 275534785, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.203125, + "step": 12776, + "time_per_iteration": 3.10646915435791 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.01031577, + "balance_loss_clip": 1.02058029, + "balance_loss_mlp": 1.03510964, + "epoch": 0.7681947993386442, + "flos": 21251468453760.0, + "grad_norm": 1.821518021735027, + "language_loss": 0.74034452, + "learning_rate": 5.375597126535188e-07, + "loss": 0.76166189, + "num_input_tokens_seen": 275553205, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 12777, + "time_per_iteration": 2.4750425815582275 + }, + { + "auxiliary_loss_clip": 0.01100992, + "auxiliary_loss_mlp": 0.01033586, + "balance_loss_clip": 1.02257133, + "balance_loss_mlp": 1.03636885, + "epoch": 0.7682549225913122, + "flos": 21397696721280.0, + "grad_norm": 2.340152185552387, + "language_loss": 0.70033187, + "learning_rate": 5.372940723860043e-07, + "loss": 0.72167766, + "num_input_tokens_seen": 275571490, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 12778, + "time_per_iteration": 2.4316253662109375 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01028306, + "balance_loss_clip": 1.01741004, + "balance_loss_mlp": 1.03619504, + "epoch": 0.7683150458439801, + "flos": 23038921975680.0, + "grad_norm": 1.7229591710828633, + "language_loss": 0.70021391, + "learning_rate": 5.37028487584446e-07, + "loss": 0.72150636, + "num_input_tokens_seen": 275589665, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 12779, + "time_per_iteration": 2.4962258338928223 + }, + { + "auxiliary_loss_clip": 0.01102633, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.01549852, + "balance_loss_mlp": 1.03702402, + "epoch": 0.7683751690966482, + "flos": 67332397996800.0, + "grad_norm": 1.5025489085425099, + "language_loss": 0.58335769, + "learning_rate": 5.367629582589133e-07, + "loss": 0.60465509, + "num_input_tokens_seen": 275615605, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 12780, + "time_per_iteration": 2.827277898788452 + }, + { + "auxiliary_loss_clip": 0.0110525, + "auxiliary_loss_mlp": 0.01036743, + "balance_loss_clip": 1.02303374, + "balance_loss_mlp": 1.03533888, + "epoch": 0.7684352923493161, + "flos": 21798890703360.0, + "grad_norm": 1.7175154048047394, + "language_loss": 0.68096447, + "learning_rate": 5.364974844194759e-07, + "loss": 0.70238441, + "num_input_tokens_seen": 275634965, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.69921875, + "step": 12781, + "time_per_iteration": 2.450493574142456 + }, + { + "auxiliary_loss_clip": 0.01099247, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.0181365, + "balance_loss_mlp": 1.03411829, + "epoch": 0.7684954156019841, + "flos": 25847603072640.0, + "grad_norm": 1.4930277529018858, + "language_loss": 0.79351133, + "learning_rate": 5.362320660762016e-07, + "loss": 0.814798, + "num_input_tokens_seen": 275655785, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 12782, + "time_per_iteration": 2.473785638809204 + }, + { + "auxiliary_loss_clip": 0.01101943, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.01648557, + "balance_loss_mlp": 1.03457451, + "epoch": 0.768555538854652, + "flos": 25447378757760.0, + "grad_norm": 3.89329070185187, + "language_loss": 0.6701203, + "learning_rate": 5.35966703239153e-07, + "loss": 0.6914283, + "num_input_tokens_seen": 275676160, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12783, + "time_per_iteration": 2.496005058288574 + }, + { + "auxiliary_loss_clip": 0.0110336, + "auxiliary_loss_mlp": 0.01032843, + "balance_loss_clip": 1.02068949, + "balance_loss_mlp": 1.0368228, + "epoch": 0.76861566210732, + "flos": 19646369303040.0, + "grad_norm": 2.317173566412315, + "language_loss": 0.68567002, + "learning_rate": 5.357013959183938e-07, + "loss": 0.70703208, + "num_input_tokens_seen": 275695660, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12784, + "time_per_iteration": 2.4193952083587646 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01027175, + "balance_loss_clip": 1.01677442, + "balance_loss_mlp": 1.03561044, + "epoch": 0.7686757853599879, + "flos": 22419032037120.0, + "grad_norm": 2.4397788203349546, + "language_loss": 0.80600178, + "learning_rate": 5.354361441239843e-07, + "loss": 0.82728577, + "num_input_tokens_seen": 275714025, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 12785, + "time_per_iteration": 2.4642157554626465 + }, + { + "auxiliary_loss_clip": 0.01102953, + "auxiliary_loss_mlp": 0.0103025, + "balance_loss_clip": 1.01812065, + "balance_loss_mlp": 1.03647351, + "epoch": 0.768735908612656, + "flos": 47774262453120.0, + "grad_norm": 1.5675219455195206, + "language_loss": 0.77255261, + "learning_rate": 5.351709478659836e-07, + "loss": 0.79388458, + "num_input_tokens_seen": 275737300, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 12786, + "time_per_iteration": 2.6608307361602783 + }, + { + "auxiliary_loss_clip": 0.0109997, + "auxiliary_loss_mlp": 0.01029084, + "balance_loss_clip": 1.01797938, + "balance_loss_mlp": 1.03441632, + "epoch": 0.7687960318653239, + "flos": 30263179000320.0, + "grad_norm": 2.029037446974208, + "language_loss": 0.58857298, + "learning_rate": 5.349058071544468e-07, + "loss": 0.60986358, + "num_input_tokens_seen": 275757895, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 12787, + "time_per_iteration": 2.5195324420928955 + }, + { + "auxiliary_loss_clip": 0.01098338, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01488662, + "balance_loss_mlp": 1.03323674, + "epoch": 0.7688561551179919, + "flos": 19573434737280.0, + "grad_norm": 1.5842728148921028, + "language_loss": 0.75863254, + "learning_rate": 5.346407219994292e-07, + "loss": 0.77988023, + "num_input_tokens_seen": 275776745, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 12788, + "time_per_iteration": 2.427560567855835 + }, + { + "auxiliary_loss_clip": 0.01103523, + "auxiliary_loss_mlp": 0.01037331, + "balance_loss_clip": 1.02540421, + "balance_loss_mlp": 1.03627038, + "epoch": 0.7689162783706599, + "flos": 22783776693120.0, + "grad_norm": 1.6525125671595142, + "language_loss": 0.66358525, + "learning_rate": 5.343756924109821e-07, + "loss": 0.6849938, + "num_input_tokens_seen": 275797205, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12789, + "time_per_iteration": 2.484055280685425 + }, + { + "auxiliary_loss_clip": 0.01103699, + "auxiliary_loss_mlp": 0.01033442, + "balance_loss_clip": 1.02053142, + "balance_loss_mlp": 1.03660512, + "epoch": 0.7689764016233278, + "flos": 34204195416960.0, + "grad_norm": 1.730155675117843, + "language_loss": 0.68648386, + "learning_rate": 5.341107183991553e-07, + "loss": 0.70785522, + "num_input_tokens_seen": 275817935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 12790, + "time_per_iteration": 2.5284645557403564 + }, + { + "auxiliary_loss_clip": 0.01101213, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02015567, + "balance_loss_mlp": 1.03384793, + "epoch": 0.7690365248759958, + "flos": 17274469587840.0, + "grad_norm": 1.6904473195565226, + "language_loss": 0.68665707, + "learning_rate": 5.338457999739969e-07, + "loss": 0.70798862, + "num_input_tokens_seen": 275837145, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 12791, + "time_per_iteration": 2.4484951496124268 + }, + { + "auxiliary_loss_clip": 0.0109967, + "auxiliary_loss_mlp": 0.01032034, + "balance_loss_clip": 1.02082801, + "balance_loss_mlp": 1.03512239, + "epoch": 0.7690966481286637, + "flos": 18223157646720.0, + "grad_norm": 1.7979814428541672, + "language_loss": 0.79704869, + "learning_rate": 5.335809371455526e-07, + "loss": 0.81836575, + "num_input_tokens_seen": 275855705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 12792, + "time_per_iteration": 2.397611141204834 + }, + { + "auxiliary_loss_clip": 0.01109838, + "auxiliary_loss_mlp": 0.01030341, + "balance_loss_clip": 1.01751399, + "balance_loss_mlp": 1.04006386, + "epoch": 0.7691567713813318, + "flos": 21537568281600.0, + "grad_norm": 1.8065104235700298, + "language_loss": 0.72902393, + "learning_rate": 5.333161299238673e-07, + "loss": 0.7504257, + "num_input_tokens_seen": 275873930, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69921875, + "step": 12793, + "time_per_iteration": 2.445250988006592 + }, + { + "auxiliary_loss_clip": 0.01102182, + "auxiliary_loss_mlp": 0.01033309, + "balance_loss_clip": 1.02147722, + "balance_loss_mlp": 1.0359565, + "epoch": 0.7692168946339997, + "flos": 39379999720320.0, + "grad_norm": 1.7477925933074476, + "language_loss": 0.63753021, + "learning_rate": 5.330513783189803e-07, + "loss": 0.65888512, + "num_input_tokens_seen": 275895895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12794, + "time_per_iteration": 2.609574317932129 + }, + { + "auxiliary_loss_clip": 0.01107559, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.01955318, + "balance_loss_mlp": 1.03873158, + "epoch": 0.7692770178866677, + "flos": 25009950931200.0, + "grad_norm": 1.4386826522149643, + "language_loss": 0.76442081, + "learning_rate": 5.327866823409319e-07, + "loss": 0.78581011, + "num_input_tokens_seen": 275917825, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 12795, + "time_per_iteration": 2.491729736328125 + }, + { + "auxiliary_loss_clip": 0.01101903, + "auxiliary_loss_mlp": 0.01026935, + "balance_loss_clip": 1.01503158, + "balance_loss_mlp": 1.03450465, + "epoch": 0.7693371411393356, + "flos": 24716273333760.0, + "grad_norm": 1.5564929317372034, + "language_loss": 0.71727788, + "learning_rate": 5.325220419997601e-07, + "loss": 0.73856628, + "num_input_tokens_seen": 275937890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 12796, + "time_per_iteration": 2.4555909633636475 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.01027607, + "balance_loss_clip": 1.01607311, + "balance_loss_mlp": 1.03496242, + "epoch": 0.7693972643920036, + "flos": 15924803028480.0, + "grad_norm": 2.1139443574880574, + "language_loss": 0.65011704, + "learning_rate": 5.32257457305499e-07, + "loss": 0.671413, + "num_input_tokens_seen": 275954495, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12797, + "time_per_iteration": 2.4375650882720947 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01032493, + "balance_loss_clip": 1.01997042, + "balance_loss_mlp": 1.03497744, + "epoch": 0.7694573876446715, + "flos": 25405901527680.0, + "grad_norm": 1.7406268375676737, + "language_loss": 0.91516721, + "learning_rate": 5.319929282681823e-07, + "loss": 0.93651593, + "num_input_tokens_seen": 275972395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 12798, + "time_per_iteration": 2.4546101093292236 + }, + { + "auxiliary_loss_clip": 0.0110163, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01569748, + "balance_loss_mlp": 1.03515077, + "epoch": 0.7695175108973396, + "flos": 16654220513280.0, + "grad_norm": 1.9252292535695115, + "language_loss": 0.82239765, + "learning_rate": 5.317284548978418e-07, + "loss": 0.84368521, + "num_input_tokens_seen": 275989020, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 12799, + "time_per_iteration": 2.44386625289917 + }, + { + "auxiliary_loss_clip": 0.01103323, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.01621604, + "balance_loss_mlp": 1.03646576, + "epoch": 0.7695776341500075, + "flos": 13626520237440.0, + "grad_norm": 2.0094364967525262, + "language_loss": 0.77591789, + "learning_rate": 5.314640372045045e-07, + "loss": 0.79723239, + "num_input_tokens_seen": 276006525, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12800, + "time_per_iteration": 2.397705316543579 + }, + { + "auxiliary_loss_clip": 0.0110665, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01594234, + "balance_loss_mlp": 1.03569245, + "epoch": 0.7696377574026755, + "flos": 24276690691200.0, + "grad_norm": 1.660947128359647, + "language_loss": 0.83736777, + "learning_rate": 5.31199675198198e-07, + "loss": 0.85871899, + "num_input_tokens_seen": 276027130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.7109375, + "step": 12801, + "time_per_iteration": 2.4850664138793945 + }, + { + "auxiliary_loss_clip": 0.01101531, + "auxiliary_loss_mlp": 0.01030973, + "balance_loss_clip": 1.01908183, + "balance_loss_mlp": 1.03610682, + "epoch": 0.7696978806553435, + "flos": 20923137210240.0, + "grad_norm": 1.968794932529363, + "language_loss": 0.72192085, + "learning_rate": 5.30935368888947e-07, + "loss": 0.7432459, + "num_input_tokens_seen": 276045715, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12802, + "time_per_iteration": 2.423994779586792 + }, + { + "auxiliary_loss_clip": 0.01101064, + "auxiliary_loss_mlp": 0.0102928, + "balance_loss_clip": 1.01767504, + "balance_loss_mlp": 1.03590822, + "epoch": 0.7697580039080114, + "flos": 22929609911040.0, + "grad_norm": 1.7968472672418645, + "language_loss": 0.75812244, + "learning_rate": 5.306711182867747e-07, + "loss": 0.77942592, + "num_input_tokens_seen": 276065375, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 12803, + "time_per_iteration": 3.8298709392547607 + }, + { + "auxiliary_loss_clip": 0.0102415, + "auxiliary_loss_mlp": 0.00999256, + "balance_loss_clip": 0.99821299, + "balance_loss_mlp": 1.00390053, + "epoch": 0.7698181271606794, + "flos": 68717654933760.0, + "grad_norm": 0.7302044850934681, + "language_loss": 0.55831051, + "learning_rate": 5.304069234017001e-07, + "loss": 0.57854456, + "num_input_tokens_seen": 276131405, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 12804, + "time_per_iteration": 3.058547258377075 + }, + { + "auxiliary_loss_clip": 0.01024727, + "auxiliary_loss_mlp": 0.01002741, + "balance_loss_clip": 1.00166178, + "balance_loss_mlp": 1.00439858, + "epoch": 0.7698782504133473, + "flos": 67409716999680.0, + "grad_norm": 0.9747386199890918, + "language_loss": 0.54020375, + "learning_rate": 5.301427842437429e-07, + "loss": 0.56047845, + "num_input_tokens_seen": 276200755, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.203125, + "step": 12805, + "time_per_iteration": 4.5421671867370605 + }, + { + "auxiliary_loss_clip": 0.01105608, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.02145565, + "balance_loss_mlp": 1.03835249, + "epoch": 0.7699383736660154, + "flos": 22488842119680.0, + "grad_norm": 2.1701782975166, + "language_loss": 0.72961175, + "learning_rate": 5.298787008229187e-07, + "loss": 0.75100303, + "num_input_tokens_seen": 276217880, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 12806, + "time_per_iteration": 3.833503246307373 + }, + { + "auxiliary_loss_clip": 0.01102326, + "auxiliary_loss_mlp": 0.01035726, + "balance_loss_clip": 1.02383482, + "balance_loss_mlp": 1.03555238, + "epoch": 0.7699984969186833, + "flos": 21539723097600.0, + "grad_norm": 3.0939147131077878, + "language_loss": 0.75202084, + "learning_rate": 5.296146731492408e-07, + "loss": 0.77340138, + "num_input_tokens_seen": 276234810, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 12807, + "time_per_iteration": 2.453640937805176 + }, + { + "auxiliary_loss_clip": 0.01107207, + "auxiliary_loss_mlp": 0.01033913, + "balance_loss_clip": 1.02156925, + "balance_loss_mlp": 1.03789043, + "epoch": 0.7700586201713513, + "flos": 21719096640000.0, + "grad_norm": 2.25264240922501, + "language_loss": 0.79834819, + "learning_rate": 5.293507012327218e-07, + "loss": 0.81975937, + "num_input_tokens_seen": 276252850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 12808, + "time_per_iteration": 3.863776206970215 + }, + { + "auxiliary_loss_clip": 0.01106296, + "auxiliary_loss_mlp": 0.01035203, + "balance_loss_clip": 1.02278161, + "balance_loss_mlp": 1.03690052, + "epoch": 0.7701187434240192, + "flos": 27856015107840.0, + "grad_norm": 1.7718685431414871, + "language_loss": 0.79037017, + "learning_rate": 5.290867850833718e-07, + "loss": 0.81178522, + "num_input_tokens_seen": 276272525, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 12809, + "time_per_iteration": 2.50119948387146 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.01414251, + "balance_loss_mlp": 1.03431511, + "epoch": 0.7701788666766872, + "flos": 28621307301120.0, + "grad_norm": 1.5273739274998572, + "language_loss": 0.70192695, + "learning_rate": 5.288229247111993e-07, + "loss": 0.72315288, + "num_input_tokens_seen": 276294210, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 12810, + "time_per_iteration": 2.4800918102264404 + }, + { + "auxiliary_loss_clip": 0.01104583, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01865101, + "balance_loss_mlp": 1.03556144, + "epoch": 0.7702389899293551, + "flos": 14246446089600.0, + "grad_norm": 2.2614131210478465, + "language_loss": 0.78612316, + "learning_rate": 5.285591201262079e-07, + "loss": 0.80748516, + "num_input_tokens_seen": 276310290, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12811, + "time_per_iteration": 2.404200792312622 + }, + { + "auxiliary_loss_clip": 0.01024644, + "auxiliary_loss_mlp": 0.01001291, + "balance_loss_clip": 1.00025964, + "balance_loss_mlp": 1.00433743, + "epoch": 0.7702991131820232, + "flos": 70574128439040.0, + "grad_norm": 0.8119263300614926, + "language_loss": 0.56688583, + "learning_rate": 5.28295371338402e-07, + "loss": 0.58714521, + "num_input_tokens_seen": 276371715, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 12812, + "time_per_iteration": 3.1152541637420654 + }, + { + "auxiliary_loss_clip": 0.0110341, + "auxiliary_loss_mlp": 0.01034375, + "balance_loss_clip": 1.0224545, + "balance_loss_mlp": 1.0352596, + "epoch": 0.7703592364346911, + "flos": 25480021242240.0, + "grad_norm": 1.6865104586503614, + "language_loss": 0.7190448, + "learning_rate": 5.280316783577836e-07, + "loss": 0.74042261, + "num_input_tokens_seen": 276389895, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12813, + "time_per_iteration": 2.4738786220550537 + }, + { + "auxiliary_loss_clip": 0.01102625, + "auxiliary_loss_mlp": 0.01029531, + "balance_loss_clip": 1.01718116, + "balance_loss_mlp": 1.03553629, + "epoch": 0.7704193596873591, + "flos": 19280906375040.0, + "grad_norm": 1.808315927971449, + "language_loss": 0.66342986, + "learning_rate": 5.27768041194351e-07, + "loss": 0.68475139, + "num_input_tokens_seen": 276408990, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 12814, + "time_per_iteration": 2.454023599624634 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01031859, + "balance_loss_clip": 1.02005756, + "balance_loss_mlp": 1.03535891, + "epoch": 0.7704794829400271, + "flos": 23658452778240.0, + "grad_norm": 1.9935067754667941, + "language_loss": 0.65677094, + "learning_rate": 5.275044598581018e-07, + "loss": 0.67810559, + "num_input_tokens_seen": 276428190, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12815, + "time_per_iteration": 2.453657627105713 + }, + { + "auxiliary_loss_clip": 0.01101554, + "auxiliary_loss_mlp": 0.01031447, + "balance_loss_clip": 1.01935291, + "balance_loss_mlp": 1.03516507, + "epoch": 0.770539606192695, + "flos": 18989311766400.0, + "grad_norm": 2.1548232448255566, + "language_loss": 0.6524539, + "learning_rate": 5.272409343590322e-07, + "loss": 0.6737839, + "num_input_tokens_seen": 276446855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 12816, + "time_per_iteration": 2.407606840133667 + }, + { + "auxiliary_loss_clip": 0.01104205, + "auxiliary_loss_mlp": 0.0103424, + "balance_loss_clip": 1.02282572, + "balance_loss_mlp": 1.03735924, + "epoch": 0.770599729445363, + "flos": 11830160142720.0, + "grad_norm": 2.105850100227776, + "language_loss": 0.71998227, + "learning_rate": 5.26977464707133e-07, + "loss": 0.74136674, + "num_input_tokens_seen": 276462000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 12817, + "time_per_iteration": 2.4196791648864746 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01032617, + "balance_loss_clip": 1.02102351, + "balance_loss_mlp": 1.03574193, + "epoch": 0.770659852698031, + "flos": 17822610109440.0, + "grad_norm": 1.9485299894899226, + "language_loss": 0.61153173, + "learning_rate": 5.267140509123957e-07, + "loss": 0.63287747, + "num_input_tokens_seen": 276481190, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 12818, + "time_per_iteration": 2.422590970993042 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.01726627, + "balance_loss_mlp": 1.036057, + "epoch": 0.770719975950699, + "flos": 21871968923520.0, + "grad_norm": 1.7189181201095014, + "language_loss": 0.67140901, + "learning_rate": 5.264506929848093e-07, + "loss": 0.69269538, + "num_input_tokens_seen": 276499520, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6484375, + "step": 12819, + "time_per_iteration": 2.445463180541992 + }, + { + "auxiliary_loss_clip": 0.01103433, + "auxiliary_loss_mlp": 0.01026789, + "balance_loss_clip": 1.0150826, + "balance_loss_mlp": 1.03642428, + "epoch": 0.7707800992033669, + "flos": 21325049464320.0, + "grad_norm": 1.8084191100945337, + "language_loss": 0.57428622, + "learning_rate": 5.261873909343608e-07, + "loss": 0.59558845, + "num_input_tokens_seen": 276519110, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12820, + "time_per_iteration": 2.4313409328460693 + }, + { + "auxiliary_loss_clip": 0.01101387, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01476407, + "balance_loss_mlp": 1.03471613, + "epoch": 0.7708402224560349, + "flos": 28179426188160.0, + "grad_norm": 1.656188868997019, + "language_loss": 0.80691266, + "learning_rate": 5.259241447710343e-07, + "loss": 0.82819176, + "num_input_tokens_seen": 276538805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12821, + "time_per_iteration": 2.495997190475464 + }, + { + "auxiliary_loss_clip": 0.01102331, + "auxiliary_loss_mlp": 0.01029131, + "balance_loss_clip": 1.01740658, + "balance_loss_mlp": 1.0356462, + "epoch": 0.7709003457087028, + "flos": 15377057556480.0, + "grad_norm": 2.1643932163706388, + "language_loss": 0.68480009, + "learning_rate": 5.256609545048114e-07, + "loss": 0.70611471, + "num_input_tokens_seen": 276554770, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 12822, + "time_per_iteration": 2.390167236328125 + }, + { + "auxiliary_loss_clip": 0.01101193, + "auxiliary_loss_mlp": 0.01032018, + "balance_loss_clip": 1.02059174, + "balance_loss_mlp": 1.03602922, + "epoch": 0.7709604689613708, + "flos": 30621854257920.0, + "grad_norm": 1.6982430970073337, + "language_loss": 0.72335845, + "learning_rate": 5.253978201456733e-07, + "loss": 0.74469054, + "num_input_tokens_seen": 276574535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 12823, + "time_per_iteration": 2.492733955383301 + }, + { + "auxiliary_loss_clip": 0.01108942, + "auxiliary_loss_mlp": 0.01037818, + "balance_loss_clip": 1.02433515, + "balance_loss_mlp": 1.03756452, + "epoch": 0.7710205922140387, + "flos": 20301272023680.0, + "grad_norm": 1.8295063286437603, + "language_loss": 0.76613212, + "learning_rate": 5.251347417035969e-07, + "loss": 0.78759968, + "num_input_tokens_seen": 276592925, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.71484375, + "step": 12824, + "time_per_iteration": 2.4176483154296875 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.01542997, + "balance_loss_mlp": 1.03651464, + "epoch": 0.7710807154667068, + "flos": 19644214487040.0, + "grad_norm": 3.0696602507520603, + "language_loss": 0.72657233, + "learning_rate": 5.248717191885592e-07, + "loss": 0.74786729, + "num_input_tokens_seen": 276610540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 12825, + "time_per_iteration": 2.451836109161377 + }, + { + "auxiliary_loss_clip": 0.0109918, + "auxiliary_loss_mlp": 0.01032495, + "balance_loss_clip": 1.02191544, + "balance_loss_mlp": 1.03549349, + "epoch": 0.7711408387193747, + "flos": 20006337450240.0, + "grad_norm": 1.391969266660785, + "language_loss": 0.73613906, + "learning_rate": 5.246087526105343e-07, + "loss": 0.75745583, + "num_input_tokens_seen": 276629200, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 12826, + "time_per_iteration": 2.4155168533325195 + }, + { + "auxiliary_loss_clip": 0.01102055, + "auxiliary_loss_mlp": 0.01031129, + "balance_loss_clip": 1.01887417, + "balance_loss_mlp": 1.03364134, + "epoch": 0.7712009619720427, + "flos": 24971131307520.0, + "grad_norm": 1.6262733051040712, + "language_loss": 0.81322646, + "learning_rate": 5.243458419794933e-07, + "loss": 0.83455837, + "num_input_tokens_seen": 276648655, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 12827, + "time_per_iteration": 2.459195852279663 + }, + { + "auxiliary_loss_clip": 0.01024065, + "auxiliary_loss_mlp": 0.00999839, + "balance_loss_clip": 0.99881953, + "balance_loss_mlp": 1.00367689, + "epoch": 0.7712610852247107, + "flos": 63249681404160.0, + "grad_norm": 0.8804510230026851, + "language_loss": 0.55191517, + "learning_rate": 5.240829873054051e-07, + "loss": 0.57215428, + "num_input_tokens_seen": 276716500, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20410156, + "step": 12828, + "time_per_iteration": 3.203558921813965 + }, + { + "auxiliary_loss_clip": 0.01099176, + "auxiliary_loss_mlp": 0.01026642, + "balance_loss_clip": 1.0158478, + "balance_loss_mlp": 1.03485942, + "epoch": 0.7713212084773786, + "flos": 18697860812160.0, + "grad_norm": 1.7353204568908176, + "language_loss": 0.69503725, + "learning_rate": 5.23820188598238e-07, + "loss": 0.71629542, + "num_input_tokens_seen": 276733535, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 12829, + "time_per_iteration": 2.447021722793579 + }, + { + "auxiliary_loss_clip": 0.01105724, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.01863086, + "balance_loss_mlp": 1.036901, + "epoch": 0.7713813317300466, + "flos": 14173367869440.0, + "grad_norm": 4.262950048849265, + "language_loss": 0.79446471, + "learning_rate": 5.235574458679579e-07, + "loss": 0.8158378, + "num_input_tokens_seen": 276749575, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 12830, + "time_per_iteration": 2.3964903354644775 + }, + { + "auxiliary_loss_clip": 0.01106244, + "auxiliary_loss_mlp": 0.01031562, + "balance_loss_clip": 1.0183301, + "balance_loss_mlp": 1.03630996, + "epoch": 0.7714414549827145, + "flos": 25703960584320.0, + "grad_norm": 1.6021673475847413, + "language_loss": 0.78127801, + "learning_rate": 5.232947591245269e-07, + "loss": 0.80265611, + "num_input_tokens_seen": 276769460, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 12831, + "time_per_iteration": 2.5234055519104004 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01025829, + "balance_loss_clip": 1.01434898, + "balance_loss_mlp": 1.03424239, + "epoch": 0.7715015782353826, + "flos": 30555312312960.0, + "grad_norm": 1.5450896985633467, + "language_loss": 0.60894483, + "learning_rate": 5.230321283779071e-07, + "loss": 0.63020408, + "num_input_tokens_seen": 276790820, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 12832, + "time_per_iteration": 2.492701530456543 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01032842, + "balance_loss_clip": 1.02072477, + "balance_loss_mlp": 1.03454924, + "epoch": 0.7715617014880505, + "flos": 20229343038720.0, + "grad_norm": 1.7425232320118673, + "language_loss": 0.79137206, + "learning_rate": 5.227695536380572e-07, + "loss": 0.81273079, + "num_input_tokens_seen": 276811345, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 12833, + "time_per_iteration": 2.497288942337036 + }, + { + "auxiliary_loss_clip": 0.01024056, + "auxiliary_loss_mlp": 0.01003026, + "balance_loss_clip": 1.00204265, + "balance_loss_mlp": 1.00360727, + "epoch": 0.7716218247407185, + "flos": 63664770971520.0, + "grad_norm": 0.8686662344719275, + "language_loss": 0.55410403, + "learning_rate": 5.22507034914933e-07, + "loss": 0.57437485, + "num_input_tokens_seen": 276870950, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20507812, + "step": 12834, + "time_per_iteration": 3.03043532371521 + }, + { + "auxiliary_loss_clip": 0.01103044, + "auxiliary_loss_mlp": 0.0102846, + "balance_loss_clip": 1.01647997, + "balance_loss_mlp": 1.03643119, + "epoch": 0.7716819479933864, + "flos": 19791807471360.0, + "grad_norm": 2.117345370793711, + "language_loss": 0.72845638, + "learning_rate": 5.222445722184903e-07, + "loss": 0.74977142, + "num_input_tokens_seen": 276890760, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 12835, + "time_per_iteration": 2.446268320083618 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01036937, + "balance_loss_clip": 1.02434242, + "balance_loss_mlp": 1.03385723, + "epoch": 0.7717420712460544, + "flos": 18442176825600.0, + "grad_norm": 1.6490070086393855, + "language_loss": 0.70007384, + "learning_rate": 5.219821655586814e-07, + "loss": 0.7214548, + "num_input_tokens_seen": 276909625, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 12836, + "time_per_iteration": 2.4494271278381348 + }, + { + "auxiliary_loss_clip": 0.01100539, + "auxiliary_loss_mlp": 0.01030913, + "balance_loss_clip": 1.01912892, + "balance_loss_mlp": 1.03515959, + "epoch": 0.7718021944987223, + "flos": 35189476456320.0, + "grad_norm": 1.6293860419166157, + "language_loss": 0.59337658, + "learning_rate": 5.217198149454575e-07, + "loss": 0.61469114, + "num_input_tokens_seen": 276930760, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 12837, + "time_per_iteration": 2.5418989658355713 + }, + { + "auxiliary_loss_clip": 0.01023613, + "auxiliary_loss_mlp": 0.00999355, + "balance_loss_clip": 0.99835348, + "balance_loss_mlp": 1.00311017, + "epoch": 0.7718623177513904, + "flos": 67923167961600.0, + "grad_norm": 0.8631972633412854, + "language_loss": 0.5581463, + "learning_rate": 5.214575203887666e-07, + "loss": 0.578376, + "num_input_tokens_seen": 276989580, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20507812, + "step": 12838, + "time_per_iteration": 3.0269720554351807 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.01025399, + "balance_loss_clip": 1.01461673, + "balance_loss_mlp": 1.03597295, + "epoch": 0.7719224410040583, + "flos": 18581401941120.0, + "grad_norm": 2.4471669974150347, + "language_loss": 0.69294447, + "learning_rate": 5.211952818985538e-07, + "loss": 0.71420795, + "num_input_tokens_seen": 277005450, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 12839, + "time_per_iteration": 2.4177730083465576 + }, + { + "auxiliary_loss_clip": 0.01099889, + "auxiliary_loss_mlp": 0.01025095, + "balance_loss_clip": 1.01409793, + "balance_loss_mlp": 1.03574765, + "epoch": 0.7719825642567263, + "flos": 23075802264960.0, + "grad_norm": 1.7653669822284475, + "language_loss": 0.79856348, + "learning_rate": 5.209330994847647e-07, + "loss": 0.81981325, + "num_input_tokens_seen": 277023055, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 12840, + "time_per_iteration": 2.5179991722106934 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01843691, + "balance_loss_mlp": 1.0361371, + "epoch": 0.7720426875093943, + "flos": 20339086066560.0, + "grad_norm": 1.7784222568456114, + "language_loss": 0.79938293, + "learning_rate": 5.206709731573402e-07, + "loss": 0.82070708, + "num_input_tokens_seen": 277041150, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 12841, + "time_per_iteration": 2.5245449542999268 + }, + { + "auxiliary_loss_clip": 0.01102151, + "auxiliary_loss_mlp": 0.01029086, + "balance_loss_clip": 1.01720667, + "balance_loss_mlp": 1.03537869, + "epoch": 0.7721028107620622, + "flos": 23880704181120.0, + "grad_norm": 1.4922109541948092, + "language_loss": 0.76314819, + "learning_rate": 5.204089029262208e-07, + "loss": 0.7844606, + "num_input_tokens_seen": 277063895, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 12842, + "time_per_iteration": 2.5023560523986816 + }, + { + "auxiliary_loss_clip": 0.01104825, + "auxiliary_loss_mlp": 0.01034383, + "balance_loss_clip": 1.02228308, + "balance_loss_mlp": 1.03711128, + "epoch": 0.7721629340147302, + "flos": 26651571235200.0, + "grad_norm": 2.1043616353717525, + "language_loss": 0.68631554, + "learning_rate": 5.201468888013445e-07, + "loss": 0.70770752, + "num_input_tokens_seen": 277084045, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 12843, + "time_per_iteration": 2.493771553039551 + }, + { + "auxiliary_loss_clip": 0.01103415, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.01814365, + "balance_loss_mlp": 1.03442502, + "epoch": 0.7722230572673981, + "flos": 21178857110400.0, + "grad_norm": 2.427096295958664, + "language_loss": 0.73946643, + "learning_rate": 5.198849307926465e-07, + "loss": 0.76079392, + "num_input_tokens_seen": 277102625, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6875, + "step": 12844, + "time_per_iteration": 3.8521201610565186 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01028689, + "balance_loss_clip": 1.01715553, + "balance_loss_mlp": 1.03452098, + "epoch": 0.7722831805200662, + "flos": 27964644814080.0, + "grad_norm": 1.3727417180259405, + "language_loss": 0.7147876, + "learning_rate": 5.196230289100596e-07, + "loss": 0.73607367, + "num_input_tokens_seen": 277123210, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 12845, + "time_per_iteration": 2.480782985687256 + }, + { + "auxiliary_loss_clip": 0.01098431, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01647639, + "balance_loss_mlp": 1.03456306, + "epoch": 0.7723433037727341, + "flos": 33875576864640.0, + "grad_norm": 1.8692274529253097, + "language_loss": 0.64329362, + "learning_rate": 5.193611831635159e-07, + "loss": 0.66455245, + "num_input_tokens_seen": 277144895, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 12846, + "time_per_iteration": 3.939861297607422 + }, + { + "auxiliary_loss_clip": 0.01024017, + "auxiliary_loss_mlp": 0.00999429, + "balance_loss_clip": 0.99847573, + "balance_loss_mlp": 1.00368702, + "epoch": 0.7724034270254021, + "flos": 62848271940480.0, + "grad_norm": 0.7797260608055787, + "language_loss": 0.61791992, + "learning_rate": 5.19099393562945e-07, + "loss": 0.63815439, + "num_input_tokens_seen": 277205160, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 12847, + "time_per_iteration": 4.373151063919067 + }, + { + "auxiliary_loss_clip": 0.01099082, + "auxiliary_loss_mlp": 0.01024766, + "balance_loss_clip": 1.01329207, + "balance_loss_mlp": 1.03237033, + "epoch": 0.77246355027807, + "flos": 23295467888640.0, + "grad_norm": 1.8104305553743092, + "language_loss": 0.78874886, + "learning_rate": 5.188376601182732e-07, + "loss": 0.80998737, + "num_input_tokens_seen": 277223005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 12848, + "time_per_iteration": 2.4621658325195312 + }, + { + "auxiliary_loss_clip": 0.01104725, + "auxiliary_loss_mlp": 0.01031435, + "balance_loss_clip": 1.02015185, + "balance_loss_mlp": 1.03665447, + "epoch": 0.772523673530738, + "flos": 20121287950080.0, + "grad_norm": 1.8870380118998122, + "language_loss": 0.73187292, + "learning_rate": 5.185759828394261e-07, + "loss": 0.75323451, + "num_input_tokens_seen": 277241785, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6796875, + "step": 12849, + "time_per_iteration": 2.423586368560791 + }, + { + "auxiliary_loss_clip": 0.01099584, + "auxiliary_loss_mlp": 0.01029656, + "balance_loss_clip": 1.01813483, + "balance_loss_mlp": 1.03409362, + "epoch": 0.7725837967834059, + "flos": 17820096157440.0, + "grad_norm": 1.7816955634054865, + "language_loss": 0.78761244, + "learning_rate": 5.183143617363261e-07, + "loss": 0.80890489, + "num_input_tokens_seen": 277259050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 12850, + "time_per_iteration": 3.8340566158294678 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.02208018, + "balance_loss_mlp": 1.03316355, + "epoch": 0.772643920036074, + "flos": 27198921657600.0, + "grad_norm": 1.5411131818733386, + "language_loss": 0.79572296, + "learning_rate": 5.180527968188935e-07, + "loss": 0.81707186, + "num_input_tokens_seen": 277278235, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 12851, + "time_per_iteration": 2.4925901889801025 + }, + { + "auxiliary_loss_clip": 0.01100454, + "auxiliary_loss_mlp": 0.0103028, + "balance_loss_clip": 1.01792979, + "balance_loss_mlp": 1.03538489, + "epoch": 0.7727040432887419, + "flos": 21579512388480.0, + "grad_norm": 1.50632412923142, + "language_loss": 0.73631006, + "learning_rate": 5.177912880970474e-07, + "loss": 0.75761741, + "num_input_tokens_seen": 277298355, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6484375, + "step": 12852, + "time_per_iteration": 2.4682977199554443 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.01033845, + "balance_loss_clip": 1.02231097, + "balance_loss_mlp": 1.03296447, + "epoch": 0.7727641665414099, + "flos": 22236641752320.0, + "grad_norm": 1.8447801118424108, + "language_loss": 0.8239882, + "learning_rate": 5.17529835580704e-07, + "loss": 0.84530675, + "num_input_tokens_seen": 277316095, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 12853, + "time_per_iteration": 2.4569756984710693 + }, + { + "auxiliary_loss_clip": 0.01023792, + "auxiliary_loss_mlp": 0.0099718, + "balance_loss_clip": 0.99619693, + "balance_loss_mlp": 1.00358176, + "epoch": 0.7728242897940779, + "flos": 54832221463680.0, + "grad_norm": 0.9862475584721329, + "language_loss": 0.54506302, + "learning_rate": 5.172684392797786e-07, + "loss": 0.56527275, + "num_input_tokens_seen": 277380130, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 12854, + "time_per_iteration": 3.091365098953247 + }, + { + "auxiliary_loss_clip": 0.01103537, + "auxiliary_loss_mlp": 0.01028551, + "balance_loss_clip": 1.01575994, + "balance_loss_mlp": 1.0352025, + "epoch": 0.7728844130467458, + "flos": 34461962392320.0, + "grad_norm": 1.475002899268902, + "language_loss": 0.71589357, + "learning_rate": 5.170070992041826e-07, + "loss": 0.73721445, + "num_input_tokens_seen": 277404015, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 12855, + "time_per_iteration": 2.563339948654175 + }, + { + "auxiliary_loss_clip": 0.01100584, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.01721025, + "balance_loss_mlp": 1.03491831, + "epoch": 0.7729445362994138, + "flos": 18916341287040.0, + "grad_norm": 1.6853102907434419, + "language_loss": 0.67508936, + "learning_rate": 5.167458153638254e-07, + "loss": 0.69639283, + "num_input_tokens_seen": 277421375, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 12856, + "time_per_iteration": 2.4246950149536133 + }, + { + "auxiliary_loss_clip": 0.01102261, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.0153966, + "balance_loss_mlp": 1.03492117, + "epoch": 0.7730046595520818, + "flos": 22200048771840.0, + "grad_norm": 1.6664497759881594, + "language_loss": 0.78636038, + "learning_rate": 5.164845877686162e-07, + "loss": 0.8076548, + "num_input_tokens_seen": 277440170, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 12857, + "time_per_iteration": 2.4259722232818604 + }, + { + "auxiliary_loss_clip": 0.01099797, + "auxiliary_loss_mlp": 0.01029002, + "balance_loss_clip": 1.0170691, + "balance_loss_mlp": 1.03505707, + "epoch": 0.7730647828047498, + "flos": 13552328695680.0, + "grad_norm": 2.4693745762825627, + "language_loss": 0.78503597, + "learning_rate": 5.162234164284591e-07, + "loss": 0.80632401, + "num_input_tokens_seen": 277456880, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 12858, + "time_per_iteration": 2.414808988571167 + }, + { + "auxiliary_loss_clip": 0.0110013, + "auxiliary_loss_mlp": 0.01030418, + "balance_loss_clip": 1.01887894, + "balance_loss_mlp": 1.0332911, + "epoch": 0.7731249060574177, + "flos": 21976037602560.0, + "grad_norm": 2.1506807950165716, + "language_loss": 0.76832533, + "learning_rate": 5.159623013532591e-07, + "loss": 0.78963083, + "num_input_tokens_seen": 277475365, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12859, + "time_per_iteration": 2.4226794242858887 + }, + { + "auxiliary_loss_clip": 0.01098676, + "auxiliary_loss_mlp": 0.01027748, + "balance_loss_clip": 1.01765668, + "balance_loss_mlp": 1.03636694, + "epoch": 0.7731850293100857, + "flos": 22601817371520.0, + "grad_norm": 1.3976193464700644, + "language_loss": 0.67598879, + "learning_rate": 5.157012425529186e-07, + "loss": 0.69725305, + "num_input_tokens_seen": 277494975, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62109375, + "step": 12860, + "time_per_iteration": 2.4838390350341797 + }, + { + "auxiliary_loss_clip": 0.01102762, + "auxiliary_loss_mlp": 0.01036693, + "balance_loss_clip": 1.02416456, + "balance_loss_mlp": 1.03352654, + "epoch": 0.7732451525627536, + "flos": 14098422142080.0, + "grad_norm": 2.447865183826217, + "language_loss": 0.7403549, + "learning_rate": 5.154402400373343e-07, + "loss": 0.76174939, + "num_input_tokens_seen": 277510520, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 12861, + "time_per_iteration": 2.4177722930908203 + }, + { + "auxiliary_loss_clip": 0.01105061, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01744413, + "balance_loss_mlp": 1.03674173, + "epoch": 0.7733052758154216, + "flos": 21470020755840.0, + "grad_norm": 1.5943042288451297, + "language_loss": 0.74818659, + "learning_rate": 5.15179293816405e-07, + "loss": 0.76953417, + "num_input_tokens_seen": 277530505, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 12862, + "time_per_iteration": 2.502509832382202 + }, + { + "auxiliary_loss_clip": 0.01098685, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01839459, + "balance_loss_mlp": 1.03460789, + "epoch": 0.7733653990680895, + "flos": 21394284929280.0, + "grad_norm": 1.605143243310102, + "language_loss": 0.82941031, + "learning_rate": 5.149184039000256e-07, + "loss": 0.85068727, + "num_input_tokens_seen": 277550810, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12863, + "time_per_iteration": 2.435492753982544 + }, + { + "auxiliary_loss_clip": 0.01099256, + "auxiliary_loss_mlp": 0.01029825, + "balance_loss_clip": 1.01841044, + "balance_loss_mlp": 1.03421164, + "epoch": 0.7734255223207576, + "flos": 17676058619520.0, + "grad_norm": 1.686286227621035, + "language_loss": 0.73311162, + "learning_rate": 5.146575702980898e-07, + "loss": 0.7544024, + "num_input_tokens_seen": 277567680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 12864, + "time_per_iteration": 2.4345412254333496 + }, + { + "auxiliary_loss_clip": 0.01100211, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.02182722, + "balance_loss_mlp": 1.03336382, + "epoch": 0.7734856455734255, + "flos": 25230837617280.0, + "grad_norm": 1.7236073313381683, + "language_loss": 0.82668412, + "learning_rate": 5.143967930204871e-07, + "loss": 0.84801233, + "num_input_tokens_seen": 277588970, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 12865, + "time_per_iteration": 2.489175796508789 + }, + { + "auxiliary_loss_clip": 0.01106204, + "auxiliary_loss_mlp": 0.01031833, + "balance_loss_clip": 1.01882756, + "balance_loss_mlp": 1.03688681, + "epoch": 0.7735457688260935, + "flos": 23433112805760.0, + "grad_norm": 1.9919400131202358, + "language_loss": 0.71579105, + "learning_rate": 5.141360720771077e-07, + "loss": 0.73717141, + "num_input_tokens_seen": 277605450, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 12866, + "time_per_iteration": 2.4729628562927246 + }, + { + "auxiliary_loss_clip": 0.01103336, + "auxiliary_loss_mlp": 0.01026596, + "balance_loss_clip": 1.01476479, + "balance_loss_mlp": 1.03699803, + "epoch": 0.7736058920787615, + "flos": 18729246320640.0, + "grad_norm": 2.21518020983948, + "language_loss": 0.64429164, + "learning_rate": 5.138754074778371e-07, + "loss": 0.66559094, + "num_input_tokens_seen": 277622530, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 12867, + "time_per_iteration": 2.3936469554901123 + }, + { + "auxiliary_loss_clip": 0.01098589, + "auxiliary_loss_mlp": 0.01033636, + "balance_loss_clip": 1.02214408, + "balance_loss_mlp": 1.03422713, + "epoch": 0.7736660153314294, + "flos": 22893304239360.0, + "grad_norm": 1.4977465030205475, + "language_loss": 0.70845938, + "learning_rate": 5.136147992325595e-07, + "loss": 0.72978157, + "num_input_tokens_seen": 277642700, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 12868, + "time_per_iteration": 2.5017075538635254 + }, + { + "auxiliary_loss_clip": 0.01103278, + "auxiliary_loss_mlp": 0.01027324, + "balance_loss_clip": 1.0157187, + "balance_loss_mlp": 1.03648961, + "epoch": 0.7737261385840974, + "flos": 13800901789440.0, + "grad_norm": 2.3483431493436506, + "language_loss": 0.78185302, + "learning_rate": 5.133542473511578e-07, + "loss": 0.803159, + "num_input_tokens_seen": 277660005, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 12869, + "time_per_iteration": 2.4156572818756104 + }, + { + "auxiliary_loss_clip": 0.01095592, + "auxiliary_loss_mlp": 0.01027488, + "balance_loss_clip": 1.01597238, + "balance_loss_mlp": 1.0325917, + "epoch": 0.7737862618367654, + "flos": 28730727106560.0, + "grad_norm": 2.073469705859901, + "language_loss": 0.73596758, + "learning_rate": 5.130937518435124e-07, + "loss": 0.75719839, + "num_input_tokens_seen": 277682890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 12870, + "time_per_iteration": 2.517237663269043 + }, + { + "auxiliary_loss_clip": 0.01101602, + "auxiliary_loss_mlp": 0.0103016, + "balance_loss_clip": 1.01848328, + "balance_loss_mlp": 1.03500986, + "epoch": 0.7738463850894334, + "flos": 17018570119680.0, + "grad_norm": 2.174151142441679, + "language_loss": 0.75611806, + "learning_rate": 5.12833312719501e-07, + "loss": 0.77743572, + "num_input_tokens_seen": 277699330, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 12871, + "time_per_iteration": 2.400402069091797 + }, + { + "auxiliary_loss_clip": 0.01099358, + "auxiliary_loss_mlp": 0.01030488, + "balance_loss_clip": 1.0195806, + "balance_loss_mlp": 1.03400016, + "epoch": 0.7739065083421013, + "flos": 20704010290560.0, + "grad_norm": 1.515902079714309, + "language_loss": 0.69396317, + "learning_rate": 5.12572929988999e-07, + "loss": 0.71526158, + "num_input_tokens_seen": 277718750, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 12872, + "time_per_iteration": 2.454831838607788 + }, + { + "auxiliary_loss_clip": 0.01101254, + "auxiliary_loss_mlp": 0.01031553, + "balance_loss_clip": 1.01894081, + "balance_loss_mlp": 1.03436494, + "epoch": 0.7739666315947693, + "flos": 20697222620160.0, + "grad_norm": 2.1128263848604303, + "language_loss": 0.85076445, + "learning_rate": 5.123126036618804e-07, + "loss": 0.87209249, + "num_input_tokens_seen": 277734645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 12873, + "time_per_iteration": 2.413208484649658 + }, + { + "auxiliary_loss_clip": 0.01103416, + "auxiliary_loss_mlp": 0.0103242, + "balance_loss_clip": 1.02108884, + "balance_loss_mlp": 1.03663659, + "epoch": 0.7740267548474372, + "flos": 29570677718400.0, + "grad_norm": 2.3833664106096357, + "language_loss": 0.65228915, + "learning_rate": 5.120523337480174e-07, + "loss": 0.67364746, + "num_input_tokens_seen": 277755535, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12874, + "time_per_iteration": 2.511897563934326 + }, + { + "auxiliary_loss_clip": 0.01101804, + "auxiliary_loss_mlp": 0.01029077, + "balance_loss_clip": 1.01711988, + "balance_loss_mlp": 1.03627491, + "epoch": 0.7740868781001052, + "flos": 23659099223040.0, + "grad_norm": 1.672939756784885, + "language_loss": 0.62344849, + "learning_rate": 5.117921202572785e-07, + "loss": 0.64475727, + "num_input_tokens_seen": 277775585, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12875, + "time_per_iteration": 2.4547970294952393 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.01926494, + "balance_loss_mlp": 1.03329086, + "epoch": 0.7741470013527731, + "flos": 24717314828160.0, + "grad_norm": 1.7114118176893034, + "language_loss": 0.65592134, + "learning_rate": 5.115319631995318e-07, + "loss": 0.67723036, + "num_input_tokens_seen": 277794795, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12876, + "time_per_iteration": 2.507066011428833 + }, + { + "auxiliary_loss_clip": 0.01097976, + "auxiliary_loss_mlp": 0.01029435, + "balance_loss_clip": 1.01795566, + "balance_loss_mlp": 1.03334641, + "epoch": 0.7742071246054412, + "flos": 21871645701120.0, + "grad_norm": 2.056913252626623, + "language_loss": 0.71540773, + "learning_rate": 5.112718625846433e-07, + "loss": 0.73668182, + "num_input_tokens_seen": 277813235, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 12877, + "time_per_iteration": 2.4201643466949463 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01031249, + "balance_loss_clip": 1.0190177, + "balance_loss_mlp": 1.03517962, + "epoch": 0.7742672478581091, + "flos": 22674249146880.0, + "grad_norm": 1.8044293280530723, + "language_loss": 0.82859612, + "learning_rate": 5.110118184224736e-07, + "loss": 0.84995025, + "num_input_tokens_seen": 277832560, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6875, + "step": 12878, + "time_per_iteration": 2.4779839515686035 + }, + { + "auxiliary_loss_clip": 0.01101355, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01875401, + "balance_loss_mlp": 1.03469586, + "epoch": 0.7743273711107771, + "flos": 18840892769280.0, + "grad_norm": 1.7446777293969558, + "language_loss": 0.73307019, + "learning_rate": 5.10751830722885e-07, + "loss": 0.75439632, + "num_input_tokens_seen": 277850120, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 12879, + "time_per_iteration": 2.4160289764404297 + }, + { + "auxiliary_loss_clip": 0.01095247, + "auxiliary_loss_mlp": 0.01026524, + "balance_loss_clip": 1.01504445, + "balance_loss_mlp": 1.03218174, + "epoch": 0.7743874943634451, + "flos": 28729326476160.0, + "grad_norm": 2.0530344125877824, + "language_loss": 0.79587936, + "learning_rate": 5.104918994957364e-07, + "loss": 0.81709713, + "num_input_tokens_seen": 277871020, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 12880, + "time_per_iteration": 2.5343987941741943 + }, + { + "auxiliary_loss_clip": 0.01098931, + "auxiliary_loss_mlp": 0.01032432, + "balance_loss_clip": 1.02097631, + "balance_loss_mlp": 1.03499806, + "epoch": 0.774447617616113, + "flos": 21909639312000.0, + "grad_norm": 1.5022230028348473, + "language_loss": 0.69992185, + "learning_rate": 5.102320247508847e-07, + "loss": 0.72123551, + "num_input_tokens_seen": 277891525, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 12881, + "time_per_iteration": 2.4520153999328613 + }, + { + "auxiliary_loss_clip": 0.01105007, + "auxiliary_loss_mlp": 0.01035783, + "balance_loss_clip": 1.02330136, + "balance_loss_mlp": 1.0357368, + "epoch": 0.774507740868781, + "flos": 19500643825920.0, + "grad_norm": 2.221505140298077, + "language_loss": 0.84215307, + "learning_rate": 5.099722064981832e-07, + "loss": 0.86356097, + "num_input_tokens_seen": 277910425, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 12882, + "time_per_iteration": 2.4450690746307373 + }, + { + "auxiliary_loss_clip": 0.01023891, + "auxiliary_loss_mlp": 0.01012882, + "balance_loss_clip": 1.01180887, + "balance_loss_mlp": 1.00356591, + "epoch": 0.774567864121449, + "flos": 59426560402560.0, + "grad_norm": 0.8021199290846766, + "language_loss": 0.6040681, + "learning_rate": 5.097124447474858e-07, + "loss": 0.62443578, + "num_input_tokens_seen": 277972795, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.203125, + "step": 12883, + "time_per_iteration": 3.0097620487213135 + }, + { + "auxiliary_loss_clip": 0.01103604, + "auxiliary_loss_mlp": 0.01032342, + "balance_loss_clip": 1.02008712, + "balance_loss_mlp": 1.03575671, + "epoch": 0.774627987374117, + "flos": 13225326255360.0, + "grad_norm": 1.7162492869747636, + "language_loss": 0.72789645, + "learning_rate": 5.094527395086416e-07, + "loss": 0.7492559, + "num_input_tokens_seen": 277990675, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 12884, + "time_per_iteration": 2.4377074241638184 + }, + { + "auxiliary_loss_clip": 0.01100524, + "auxiliary_loss_mlp": 0.01030847, + "balance_loss_clip": 1.02003515, + "balance_loss_mlp": 1.0354799, + "epoch": 0.7746881106267849, + "flos": 21394033534080.0, + "grad_norm": 3.230363758289503, + "language_loss": 0.80970025, + "learning_rate": 5.091930907914986e-07, + "loss": 0.83101392, + "num_input_tokens_seen": 278010050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 12885, + "time_per_iteration": 2.4225785732269287 + }, + { + "auxiliary_loss_clip": 0.01098684, + "auxiliary_loss_mlp": 0.01033412, + "balance_loss_clip": 1.02272451, + "balance_loss_mlp": 1.03376412, + "epoch": 0.7747482338794529, + "flos": 25629338079360.0, + "grad_norm": 1.8035422481179095, + "language_loss": 0.64108509, + "learning_rate": 5.089334986059029e-07, + "loss": 0.66240609, + "num_input_tokens_seen": 278030660, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 12886, + "time_per_iteration": 3.857712507247925 + }, + { + "auxiliary_loss_clip": 0.01099608, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.0177393, + "balance_loss_mlp": 1.03219748, + "epoch": 0.7748083571321208, + "flos": 11546933402880.0, + "grad_norm": 2.0473331213234327, + "language_loss": 0.69581932, + "learning_rate": 5.086739629616987e-07, + "loss": 0.71709955, + "num_input_tokens_seen": 278047645, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.671875, + "step": 12887, + "time_per_iteration": 2.423344373703003 + }, + { + "auxiliary_loss_clip": 0.01097443, + "auxiliary_loss_mlp": 0.01028368, + "balance_loss_clip": 1.01763892, + "balance_loss_mlp": 1.03330386, + "epoch": 0.7748684803847888, + "flos": 19062425900160.0, + "grad_norm": 1.7264815005579048, + "language_loss": 0.70614457, + "learning_rate": 5.084144838687275e-07, + "loss": 0.72740269, + "num_input_tokens_seen": 278066170, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 12888, + "time_per_iteration": 3.8539748191833496 + }, + { + "auxiliary_loss_clip": 0.01101208, + "auxiliary_loss_mlp": 0.01029001, + "balance_loss_clip": 1.01708579, + "balance_loss_mlp": 1.03361416, + "epoch": 0.7749286036374567, + "flos": 22273162905600.0, + "grad_norm": 2.628922406260807, + "language_loss": 0.81764227, + "learning_rate": 5.081550613368279e-07, + "loss": 0.83894438, + "num_input_tokens_seen": 278085545, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12889, + "time_per_iteration": 3.9081172943115234 + }, + { + "auxiliary_loss_clip": 0.01100926, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01818752, + "balance_loss_mlp": 1.0352304, + "epoch": 0.7749887268901248, + "flos": 20192462749440.0, + "grad_norm": 1.7934757747385575, + "language_loss": 0.79690224, + "learning_rate": 5.07895695375838e-07, + "loss": 0.81820571, + "num_input_tokens_seen": 278102995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12890, + "time_per_iteration": 2.4259889125823975 + }, + { + "auxiliary_loss_clip": 0.0110576, + "auxiliary_loss_mlp": 0.01031786, + "balance_loss_clip": 1.02038956, + "balance_loss_mlp": 1.03786349, + "epoch": 0.7750488501427927, + "flos": 20337541781760.0, + "grad_norm": 1.8078298047405903, + "language_loss": 0.6619277, + "learning_rate": 5.076363859955932e-07, + "loss": 0.68330312, + "num_input_tokens_seen": 278121460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 12891, + "time_per_iteration": 2.421792984008789 + }, + { + "auxiliary_loss_clip": 0.01100105, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.01623988, + "balance_loss_mlp": 1.03472996, + "epoch": 0.7751089733954607, + "flos": 28364043116160.0, + "grad_norm": 1.4457356185681014, + "language_loss": 0.78705311, + "learning_rate": 5.073771332059257e-07, + "loss": 0.80833197, + "num_input_tokens_seen": 278143905, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 12892, + "time_per_iteration": 4.022496223449707 + }, + { + "auxiliary_loss_clip": 0.0110464, + "auxiliary_loss_mlp": 0.01025963, + "balance_loss_clip": 1.01451278, + "balance_loss_mlp": 1.03709579, + "epoch": 0.7751690966481286, + "flos": 16943803960320.0, + "grad_norm": 1.95553815104522, + "language_loss": 0.6747188, + "learning_rate": 5.071179370166669e-07, + "loss": 0.69602484, + "num_input_tokens_seen": 278160850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 12893, + "time_per_iteration": 2.4064764976501465 + }, + { + "auxiliary_loss_clip": 0.010241, + "auxiliary_loss_mlp": 0.01003293, + "balance_loss_clip": 1.0022974, + "balance_loss_mlp": 1.00361943, + "epoch": 0.7752292199007966, + "flos": 65668050339840.0, + "grad_norm": 0.8057156528399092, + "language_loss": 0.58470869, + "learning_rate": 5.068587974376468e-07, + "loss": 0.60498261, + "num_input_tokens_seen": 278219950, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20507812, + "step": 12894, + "time_per_iteration": 3.0993287563323975 + }, + { + "auxiliary_loss_clip": 0.01103557, + "auxiliary_loss_mlp": 0.01030628, + "balance_loss_clip": 1.01871347, + "balance_loss_mlp": 1.03571117, + "epoch": 0.7752893431534646, + "flos": 20594662312320.0, + "grad_norm": 1.980811218300561, + "language_loss": 0.78687382, + "learning_rate": 5.065997144786895e-07, + "loss": 0.80821562, + "num_input_tokens_seen": 278237805, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 12895, + "time_per_iteration": 2.4280591011047363 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01026712, + "balance_loss_clip": 1.01484489, + "balance_loss_mlp": 1.03593993, + "epoch": 0.7753494664061326, + "flos": 20485350247680.0, + "grad_norm": 1.9018795725509905, + "language_loss": 0.67731452, + "learning_rate": 5.063406881496209e-07, + "loss": 0.69859904, + "num_input_tokens_seen": 278257660, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 12896, + "time_per_iteration": 2.461527109146118 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01033427, + "balance_loss_clip": 1.0224123, + "balance_loss_mlp": 1.0342598, + "epoch": 0.7754095896588006, + "flos": 20265900105600.0, + "grad_norm": 1.7046546419810793, + "language_loss": 0.69181269, + "learning_rate": 5.060817184602629e-07, + "loss": 0.71314216, + "num_input_tokens_seen": 278275110, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 12897, + "time_per_iteration": 2.4287121295928955 + }, + { + "auxiliary_loss_clip": 0.01103573, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02193165, + "balance_loss_mlp": 1.03643906, + "epoch": 0.7754697129114685, + "flos": 23331091201920.0, + "grad_norm": 1.6934570873388384, + "language_loss": 0.75021553, + "learning_rate": 5.058228054204364e-07, + "loss": 0.77159327, + "num_input_tokens_seen": 278293035, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 12898, + "time_per_iteration": 2.476008415222168 + }, + { + "auxiliary_loss_clip": 0.01101597, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01574945, + "balance_loss_mlp": 1.03492308, + "epoch": 0.7755298361641365, + "flos": 17347619635200.0, + "grad_norm": 2.107133651932301, + "language_loss": 0.70084441, + "learning_rate": 5.055639490399588e-07, + "loss": 0.72214341, + "num_input_tokens_seen": 278311010, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66796875, + "step": 12899, + "time_per_iteration": 2.4085845947265625 + }, + { + "auxiliary_loss_clip": 0.01100599, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01829982, + "balance_loss_mlp": 1.03514957, + "epoch": 0.7755899594168044, + "flos": 19645866512640.0, + "grad_norm": 1.8299634820170116, + "language_loss": 0.74540645, + "learning_rate": 5.053051493286453e-07, + "loss": 0.76671344, + "num_input_tokens_seen": 278329900, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 12900, + "time_per_iteration": 2.463158369064331 + }, + { + "auxiliary_loss_clip": 0.01097147, + "auxiliary_loss_mlp": 0.010342, + "balance_loss_clip": 1.02339911, + "balance_loss_mlp": 1.03308296, + "epoch": 0.7756500826694724, + "flos": 27414457217280.0, + "grad_norm": 1.7671979453264242, + "language_loss": 0.77766836, + "learning_rate": 5.050464062963113e-07, + "loss": 0.79898179, + "num_input_tokens_seen": 278349980, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 12901, + "time_per_iteration": 2.487149715423584 + }, + { + "auxiliary_loss_clip": 0.01103314, + "auxiliary_loss_mlp": 0.01027981, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03825235, + "epoch": 0.7757102059221404, + "flos": 28730511624960.0, + "grad_norm": 1.6889669978659576, + "language_loss": 0.77270627, + "learning_rate": 5.047877199527666e-07, + "loss": 0.79401928, + "num_input_tokens_seen": 278372485, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 12902, + "time_per_iteration": 2.522047758102417 + }, + { + "auxiliary_loss_clip": 0.01099422, + "auxiliary_loss_mlp": 0.01028642, + "balance_loss_clip": 1.01794279, + "balance_loss_mlp": 1.03434253, + "epoch": 0.7757703291748084, + "flos": 22486795044480.0, + "grad_norm": 1.6966870042115003, + "language_loss": 0.73324692, + "learning_rate": 5.045290903078215e-07, + "loss": 0.75452751, + "num_input_tokens_seen": 278391660, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 12903, + "time_per_iteration": 2.4301648139953613 + }, + { + "auxiliary_loss_clip": 0.01101778, + "auxiliary_loss_mlp": 0.01025962, + "balance_loss_clip": 1.01404119, + "balance_loss_mlp": 1.03656173, + "epoch": 0.7758304524274763, + "flos": 21430159637760.0, + "grad_norm": 2.1229192794074025, + "language_loss": 0.76073396, + "learning_rate": 5.042705173712835e-07, + "loss": 0.78201139, + "num_input_tokens_seen": 278409125, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 12904, + "time_per_iteration": 2.4397873878479004 + }, + { + "auxiliary_loss_clip": 0.01096338, + "auxiliary_loss_mlp": 0.01023985, + "balance_loss_clip": 1.01313651, + "balance_loss_mlp": 1.03401458, + "epoch": 0.7758905756801443, + "flos": 23659242877440.0, + "grad_norm": 2.282889081568611, + "language_loss": 0.68131924, + "learning_rate": 5.040120011529576e-07, + "loss": 0.70252246, + "num_input_tokens_seen": 278429450, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 12905, + "time_per_iteration": 2.444009780883789 + }, + { + "auxiliary_loss_clip": 0.01098458, + "auxiliary_loss_mlp": 0.01027316, + "balance_loss_clip": 1.01636112, + "balance_loss_mlp": 1.03590798, + "epoch": 0.7759506989328122, + "flos": 28365479660160.0, + "grad_norm": 1.6520534873626833, + "language_loss": 0.67321658, + "learning_rate": 5.037535416626459e-07, + "loss": 0.69447428, + "num_input_tokens_seen": 278449925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 12906, + "time_per_iteration": 2.5024046897888184 + }, + { + "auxiliary_loss_clip": 0.01101029, + "auxiliary_loss_mlp": 0.01031572, + "balance_loss_clip": 1.02074146, + "balance_loss_mlp": 1.03560805, + "epoch": 0.7760108221854802, + "flos": 14902785354240.0, + "grad_norm": 3.183876280395432, + "language_loss": 0.81314665, + "learning_rate": 5.034951389101498e-07, + "loss": 0.83447266, + "num_input_tokens_seen": 278467255, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 12907, + "time_per_iteration": 2.3983490467071533 + }, + { + "auxiliary_loss_clip": 0.01098064, + "auxiliary_loss_mlp": 0.01030989, + "balance_loss_clip": 1.02001524, + "balance_loss_mlp": 1.0352093, + "epoch": 0.7760709454381483, + "flos": 14792503622400.0, + "grad_norm": 2.1955762882014604, + "language_loss": 0.67891413, + "learning_rate": 5.032367929052685e-07, + "loss": 0.70020467, + "num_input_tokens_seen": 278484250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 12908, + "time_per_iteration": 2.4205586910247803 + }, + { + "auxiliary_loss_clip": 0.0110402, + "auxiliary_loss_mlp": 0.01037159, + "balance_loss_clip": 1.02561998, + "balance_loss_mlp": 1.0367105, + "epoch": 0.7761310686908162, + "flos": 17379831156480.0, + "grad_norm": 1.5072254351199776, + "language_loss": 0.70509684, + "learning_rate": 5.029785036577976e-07, + "loss": 0.72650868, + "num_input_tokens_seen": 278502740, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12909, + "time_per_iteration": 2.411200523376465 + }, + { + "auxiliary_loss_clip": 0.01098463, + "auxiliary_loss_mlp": 0.01031862, + "balance_loss_clip": 1.02122903, + "balance_loss_mlp": 1.03443766, + "epoch": 0.7761911919434842, + "flos": 25556547168000.0, + "grad_norm": 1.8009791603999328, + "language_loss": 0.677131, + "learning_rate": 5.027202711775324e-07, + "loss": 0.69843423, + "num_input_tokens_seen": 278523890, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 12910, + "time_per_iteration": 2.4990389347076416 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.01030888, + "balance_loss_clip": 1.01995671, + "balance_loss_mlp": 1.03720117, + "epoch": 0.7762513151961521, + "flos": 23179763203200.0, + "grad_norm": 1.6715228881797681, + "language_loss": 0.71815217, + "learning_rate": 5.024620954742646e-07, + "loss": 0.73948646, + "num_input_tokens_seen": 278543185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 12911, + "time_per_iteration": 2.4534413814544678 + }, + { + "auxiliary_loss_clip": 0.01105044, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.01966429, + "balance_loss_mlp": 1.03769255, + "epoch": 0.7763114384488201, + "flos": 21689614552320.0, + "grad_norm": 2.415333717110697, + "language_loss": 0.63629675, + "learning_rate": 5.022039765577836e-07, + "loss": 0.65766907, + "num_input_tokens_seen": 278559220, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 12912, + "time_per_iteration": 2.485800266265869 + }, + { + "auxiliary_loss_clip": 0.01024108, + "auxiliary_loss_mlp": 0.00998178, + "balance_loss_clip": 0.99713534, + "balance_loss_mlp": 1.00357115, + "epoch": 0.776371561701488, + "flos": 69025554316800.0, + "grad_norm": 1.0865465621016743, + "language_loss": 0.53211093, + "learning_rate": 5.019459144378779e-07, + "loss": 0.55233377, + "num_input_tokens_seen": 278618185, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20507812, + "step": 12913, + "time_per_iteration": 3.1158273220062256 + }, + { + "auxiliary_loss_clip": 0.01102849, + "auxiliary_loss_mlp": 0.01031884, + "balance_loss_clip": 1.02007604, + "balance_loss_mlp": 1.03618884, + "epoch": 0.776431684954156, + "flos": 22893914770560.0, + "grad_norm": 2.955130949159741, + "language_loss": 0.62075317, + "learning_rate": 5.016879091243338e-07, + "loss": 0.64210051, + "num_input_tokens_seen": 278636210, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 12914, + "time_per_iteration": 2.4749767780303955 + }, + { + "auxiliary_loss_clip": 0.01099375, + "auxiliary_loss_mlp": 0.0103267, + "balance_loss_clip": 1.02070785, + "balance_loss_mlp": 1.03413486, + "epoch": 0.776491808206824, + "flos": 20261554560000.0, + "grad_norm": 1.8057060992355358, + "language_loss": 0.82471168, + "learning_rate": 5.014299606269339e-07, + "loss": 0.84603214, + "num_input_tokens_seen": 278653305, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65234375, + "step": 12915, + "time_per_iteration": 2.439039468765259 + }, + { + "auxiliary_loss_clip": 0.01103501, + "auxiliary_loss_mlp": 0.01031701, + "balance_loss_clip": 1.01918375, + "balance_loss_mlp": 1.03486073, + "epoch": 0.776551931459492, + "flos": 26759051706240.0, + "grad_norm": 1.6623901678084019, + "language_loss": 0.7471149, + "learning_rate": 5.011720689554603e-07, + "loss": 0.76846689, + "num_input_tokens_seen": 278671850, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 12916, + "time_per_iteration": 2.494717836380005 + }, + { + "auxiliary_loss_clip": 0.01099429, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.01588905, + "balance_loss_mlp": 1.03332853, + "epoch": 0.7766120547121599, + "flos": 52665080250240.0, + "grad_norm": 1.7727217475878263, + "language_loss": 0.65696949, + "learning_rate": 5.009142341196919e-07, + "loss": 0.67823803, + "num_input_tokens_seen": 278697860, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 12917, + "time_per_iteration": 2.718024969100952 + }, + { + "auxiliary_loss_clip": 0.01100019, + "auxiliary_loss_mlp": 0.01034508, + "balance_loss_clip": 1.02317166, + "balance_loss_mlp": 1.03343606, + "epoch": 0.7766721779648279, + "flos": 25156215112320.0, + "grad_norm": 1.709981739113561, + "language_loss": 0.64356208, + "learning_rate": 5.006564561294065e-07, + "loss": 0.66490734, + "num_input_tokens_seen": 278720655, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 12918, + "time_per_iteration": 2.5265743732452393 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.02063048, + "balance_loss_mlp": 1.03485835, + "epoch": 0.7767323012174958, + "flos": 23760761690880.0, + "grad_norm": 2.1453981037999386, + "language_loss": 0.73354542, + "learning_rate": 5.003987349943777e-07, + "loss": 0.75485885, + "num_input_tokens_seen": 278737375, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12919, + "time_per_iteration": 2.436053514480591 + }, + { + "auxiliary_loss_clip": 0.01102045, + "auxiliary_loss_mlp": 0.01031685, + "balance_loss_clip": 1.01979434, + "balance_loss_mlp": 1.03540444, + "epoch": 0.7767924244701638, + "flos": 22086642556800.0, + "grad_norm": 2.524282476401475, + "language_loss": 0.79217321, + "learning_rate": 5.001410707243792e-07, + "loss": 0.81351054, + "num_input_tokens_seen": 278756510, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12920, + "time_per_iteration": 2.4638402462005615 + }, + { + "auxiliary_loss_clip": 0.01101955, + "auxiliary_loss_mlp": 0.01030068, + "balance_loss_clip": 1.01851654, + "balance_loss_mlp": 1.03587747, + "epoch": 0.7768525477228319, + "flos": 21981640124160.0, + "grad_norm": 1.5839883144130948, + "language_loss": 0.70594597, + "learning_rate": 4.998834633291829e-07, + "loss": 0.72726625, + "num_input_tokens_seen": 278775410, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 12921, + "time_per_iteration": 2.4318997859954834 + }, + { + "auxiliary_loss_clip": 0.01103624, + "auxiliary_loss_mlp": 0.01027271, + "balance_loss_clip": 1.01492715, + "balance_loss_mlp": 1.03501809, + "epoch": 0.7769126709754998, + "flos": 21794581071360.0, + "grad_norm": 1.7058717810568553, + "language_loss": 0.76330459, + "learning_rate": 4.996259128185547e-07, + "loss": 0.78461355, + "num_input_tokens_seen": 278794260, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 12922, + "time_per_iteration": 2.470374345779419 + }, + { + "auxiliary_loss_clip": 0.01102145, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.01991987, + "balance_loss_mlp": 1.03619885, + "epoch": 0.7769727942281678, + "flos": 20047994248320.0, + "grad_norm": 1.882909865169764, + "language_loss": 0.80363363, + "learning_rate": 4.993684192022625e-07, + "loss": 0.82497096, + "num_input_tokens_seen": 278813290, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 12923, + "time_per_iteration": 2.488701343536377 + }, + { + "auxiliary_loss_clip": 0.01102496, + "auxiliary_loss_mlp": 0.01031608, + "balance_loss_clip": 1.02036023, + "balance_loss_mlp": 1.03716397, + "epoch": 0.7770329174808357, + "flos": 21686777377920.0, + "grad_norm": 1.9867390382218033, + "language_loss": 0.92483282, + "learning_rate": 4.991109824900699e-07, + "loss": 0.94617379, + "num_input_tokens_seen": 278830610, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 12924, + "time_per_iteration": 2.452601194381714 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01029134, + "balance_loss_clip": 1.01750469, + "balance_loss_mlp": 1.03356338, + "epoch": 0.7770930407335037, + "flos": 25849255098240.0, + "grad_norm": 1.980221846763212, + "language_loss": 0.65940827, + "learning_rate": 4.988536026917401e-07, + "loss": 0.68069565, + "num_input_tokens_seen": 278849530, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 12925, + "time_per_iteration": 2.4850525856018066 + }, + { + "auxiliary_loss_clip": 0.01103083, + "auxiliary_loss_mlp": 0.01035406, + "balance_loss_clip": 1.02330625, + "balance_loss_mlp": 1.03621173, + "epoch": 0.7771531639861716, + "flos": 24347865490560.0, + "grad_norm": 1.7141356167818045, + "language_loss": 0.71911299, + "learning_rate": 4.985962798170314e-07, + "loss": 0.74049789, + "num_input_tokens_seen": 278869005, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 12926, + "time_per_iteration": 2.4577598571777344 + }, + { + "auxiliary_loss_clip": 0.01103729, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.01420105, + "balance_loss_mlp": 1.03604841, + "epoch": 0.7772132872388396, + "flos": 25629948610560.0, + "grad_norm": 1.8312057216887105, + "language_loss": 0.65467525, + "learning_rate": 4.983390138757027e-07, + "loss": 0.67598033, + "num_input_tokens_seen": 278888790, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 12927, + "time_per_iteration": 2.4614973068237305 + }, + { + "auxiliary_loss_clip": 0.01101116, + "auxiliary_loss_mlp": 0.01036421, + "balance_loss_clip": 1.02413607, + "balance_loss_mlp": 1.03512836, + "epoch": 0.7772734104915076, + "flos": 26067412350720.0, + "grad_norm": 1.745612038393379, + "language_loss": 0.72182518, + "learning_rate": 4.980818048775093e-07, + "loss": 0.74320054, + "num_input_tokens_seen": 278908150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 12928, + "time_per_iteration": 3.8557302951812744 + }, + { + "auxiliary_loss_clip": 0.01097726, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01759386, + "balance_loss_mlp": 1.03366995, + "epoch": 0.7773335337441756, + "flos": 22925048883840.0, + "grad_norm": 1.6060667874854504, + "language_loss": 0.73954302, + "learning_rate": 4.978246528322036e-07, + "loss": 0.76081246, + "num_input_tokens_seen": 278927425, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 12929, + "time_per_iteration": 2.4402310848236084 + }, + { + "auxiliary_loss_clip": 0.01101677, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.01716232, + "balance_loss_mlp": 1.03536963, + "epoch": 0.7773936569968435, + "flos": 20776765288320.0, + "grad_norm": 1.8904453576029416, + "language_loss": 0.77982825, + "learning_rate": 4.975675577495377e-07, + "loss": 0.80113542, + "num_input_tokens_seen": 278946475, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 12930, + "time_per_iteration": 3.86580491065979 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01030849, + "balance_loss_clip": 1.01922631, + "balance_loss_mlp": 1.03665566, + "epoch": 0.7774537802495115, + "flos": 20372267255040.0, + "grad_norm": 1.7459832422973112, + "language_loss": 0.79347777, + "learning_rate": 4.973105196392613e-07, + "loss": 0.81480014, + "num_input_tokens_seen": 278964345, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 12931, + "time_per_iteration": 3.8444814682006836 + }, + { + "auxiliary_loss_clip": 0.01023847, + "auxiliary_loss_mlp": 0.01000444, + "balance_loss_clip": 0.99943125, + "balance_loss_mlp": 1.00351429, + "epoch": 0.7775139035021794, + "flos": 53912081738880.0, + "grad_norm": 0.8066088266331831, + "language_loss": 0.59735709, + "learning_rate": 4.970535385111199e-07, + "loss": 0.61760002, + "num_input_tokens_seen": 279022380, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.203125, + "step": 12932, + "time_per_iteration": 3.025099039077759 + }, + { + "auxiliary_loss_clip": 0.01102256, + "auxiliary_loss_mlp": 0.0102973, + "balance_loss_clip": 1.01822686, + "balance_loss_mlp": 1.03569841, + "epoch": 0.7775740267548474, + "flos": 28842481296000.0, + "grad_norm": 1.4815322595088087, + "language_loss": 0.76235545, + "learning_rate": 4.967966143748595e-07, + "loss": 0.78367525, + "num_input_tokens_seen": 279044275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 12933, + "time_per_iteration": 4.019074440002441 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01032552, + "balance_loss_clip": 1.02023768, + "balance_loss_mlp": 1.03603268, + "epoch": 0.7776341500075155, + "flos": 21872471713920.0, + "grad_norm": 2.0481953339666026, + "language_loss": 0.73607898, + "learning_rate": 4.965397472402215e-07, + "loss": 0.7574268, + "num_input_tokens_seen": 279063375, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 12934, + "time_per_iteration": 2.4444801807403564 + }, + { + "auxiliary_loss_clip": 0.01103168, + "auxiliary_loss_mlp": 0.0102577, + "balance_loss_clip": 1.01344395, + "balance_loss_mlp": 1.03648293, + "epoch": 0.7776942732601834, + "flos": 20229845829120.0, + "grad_norm": 1.8918830226491183, + "language_loss": 0.70461309, + "learning_rate": 4.962829371169475e-07, + "loss": 0.72590244, + "num_input_tokens_seen": 279082680, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12935, + "time_per_iteration": 2.461881637573242 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01037697, + "balance_loss_clip": 1.02545476, + "balance_loss_mlp": 1.03640771, + "epoch": 0.7777543965128514, + "flos": 22231829329920.0, + "grad_norm": 1.5340308714380857, + "language_loss": 0.83742738, + "learning_rate": 4.960261840147746e-07, + "loss": 0.85883445, + "num_input_tokens_seen": 279099805, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 12936, + "time_per_iteration": 2.4495856761932373 + }, + { + "auxiliary_loss_clip": 0.01103271, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.01662064, + "balance_loss_mlp": 1.03480935, + "epoch": 0.7778145197655193, + "flos": 14501950508160.0, + "grad_norm": 2.0135584494243255, + "language_loss": 0.67168462, + "learning_rate": 4.957694879434397e-07, + "loss": 0.69298995, + "num_input_tokens_seen": 279117975, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6875, + "step": 12937, + "time_per_iteration": 2.4478330612182617 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01624966, + "balance_loss_mlp": 1.03470469, + "epoch": 0.7778746430181873, + "flos": 21140288881920.0, + "grad_norm": 1.648957424958238, + "language_loss": 0.868128, + "learning_rate": 4.955128489126777e-07, + "loss": 0.88942349, + "num_input_tokens_seen": 279137255, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 12938, + "time_per_iteration": 2.4823062419891357 + }, + { + "auxiliary_loss_clip": 0.01101697, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.01571488, + "balance_loss_mlp": 1.03527653, + "epoch": 0.7779347662708552, + "flos": 20266366982400.0, + "grad_norm": 1.8176002406557528, + "language_loss": 0.85162985, + "learning_rate": 4.95256266932218e-07, + "loss": 0.8729248, + "num_input_tokens_seen": 279154500, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 12939, + "time_per_iteration": 2.465057611465454 + }, + { + "auxiliary_loss_clip": 0.01097955, + "auxiliary_loss_mlp": 0.01031617, + "balance_loss_clip": 1.0204885, + "balance_loss_mlp": 1.03464723, + "epoch": 0.7779948895235232, + "flos": 19209013303680.0, + "grad_norm": 1.9198356417092663, + "language_loss": 0.68793273, + "learning_rate": 4.949997420117915e-07, + "loss": 0.70922846, + "num_input_tokens_seen": 279173635, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 12940, + "time_per_iteration": 2.4191107749938965 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01024954, + "balance_loss_clip": 1.01418972, + "balance_loss_mlp": 1.03387284, + "epoch": 0.7780550127761912, + "flos": 23914711382400.0, + "grad_norm": 1.6186124498470607, + "language_loss": 0.77783638, + "learning_rate": 4.947432741611255e-07, + "loss": 0.79909098, + "num_input_tokens_seen": 279194430, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 12941, + "time_per_iteration": 2.5182301998138428 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01032841, + "balance_loss_clip": 1.02010357, + "balance_loss_mlp": 1.03493428, + "epoch": 0.7781151360288592, + "flos": 32415951795840.0, + "grad_norm": 2.252237972252455, + "language_loss": 0.73223758, + "learning_rate": 4.944868633899462e-07, + "loss": 0.75360417, + "num_input_tokens_seen": 279212920, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 12942, + "time_per_iteration": 2.5156443119049072 + }, + { + "auxiliary_loss_clip": 0.01097922, + "auxiliary_loss_mlp": 0.01031145, + "balance_loss_clip": 1.02004635, + "balance_loss_mlp": 1.03366685, + "epoch": 0.7781752592815271, + "flos": 22346384780160.0, + "grad_norm": 3.1295555400179653, + "language_loss": 0.6771059, + "learning_rate": 4.942305097079751e-07, + "loss": 0.69839656, + "num_input_tokens_seen": 279232310, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 12943, + "time_per_iteration": 2.4742066860198975 + }, + { + "auxiliary_loss_clip": 0.01023917, + "auxiliary_loss_mlp": 0.00999519, + "balance_loss_clip": 0.99852365, + "balance_loss_mlp": 1.00365448, + "epoch": 0.7782353825341951, + "flos": 70460183520000.0, + "grad_norm": 0.7816270653723761, + "language_loss": 0.5855267, + "learning_rate": 4.939742131249347e-07, + "loss": 0.60576105, + "num_input_tokens_seen": 279295375, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 12944, + "time_per_iteration": 3.1933257579803467 + }, + { + "auxiliary_loss_clip": 0.01103658, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.0226059, + "balance_loss_mlp": 1.03550398, + "epoch": 0.778295505786863, + "flos": 19062569554560.0, + "grad_norm": 2.222805879365814, + "language_loss": 0.6770618, + "learning_rate": 4.937179736505428e-07, + "loss": 0.69845027, + "num_input_tokens_seen": 279313660, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 12945, + "time_per_iteration": 2.4619064331054688 + }, + { + "auxiliary_loss_clip": 0.01101979, + "auxiliary_loss_mlp": 0.01031112, + "balance_loss_clip": 1.01970923, + "balance_loss_mlp": 1.03608465, + "epoch": 0.778355629039531, + "flos": 20999734963200.0, + "grad_norm": 1.9340302005475807, + "language_loss": 0.69121152, + "learning_rate": 4.93461791294516e-07, + "loss": 0.71254241, + "num_input_tokens_seen": 279334495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 12946, + "time_per_iteration": 2.459763526916504 + }, + { + "auxiliary_loss_clip": 0.0110194, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01551938, + "balance_loss_mlp": 1.03543854, + "epoch": 0.7784157522921991, + "flos": 21398091770880.0, + "grad_norm": 2.351828874315234, + "language_loss": 0.65289766, + "learning_rate": 4.932056660665689e-07, + "loss": 0.67419076, + "num_input_tokens_seen": 279352985, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 12947, + "time_per_iteration": 2.4477789402008057 + }, + { + "auxiliary_loss_clip": 0.01100664, + "auxiliary_loss_mlp": 0.01033296, + "balance_loss_clip": 1.02189326, + "balance_loss_mlp": 1.0360136, + "epoch": 0.778475875544867, + "flos": 20813861059200.0, + "grad_norm": 2.161531176276814, + "language_loss": 0.65099561, + "learning_rate": 4.929495979764147e-07, + "loss": 0.67233521, + "num_input_tokens_seen": 279371360, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 12948, + "time_per_iteration": 2.4290242195129395 + }, + { + "auxiliary_loss_clip": 0.01100958, + "auxiliary_loss_mlp": 0.01030087, + "balance_loss_clip": 1.01845825, + "balance_loss_mlp": 1.03465629, + "epoch": 0.778535998797535, + "flos": 14355363104640.0, + "grad_norm": 1.6859142998281702, + "language_loss": 0.74930477, + "learning_rate": 4.926935870337625e-07, + "loss": 0.77061522, + "num_input_tokens_seen": 279389400, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 12949, + "time_per_iteration": 2.4495837688446045 + }, + { + "auxiliary_loss_clip": 0.01105998, + "auxiliary_loss_mlp": 0.01032538, + "balance_loss_clip": 1.02037239, + "balance_loss_mlp": 1.03724563, + "epoch": 0.7785961220502029, + "flos": 19209552007680.0, + "grad_norm": 1.519597637019559, + "language_loss": 0.68952882, + "learning_rate": 4.924376332483202e-07, + "loss": 0.71091413, + "num_input_tokens_seen": 279409715, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 12950, + "time_per_iteration": 2.4255573749542236 + }, + { + "auxiliary_loss_clip": 0.01099665, + "auxiliary_loss_mlp": 0.01028219, + "balance_loss_clip": 1.01623845, + "balance_loss_mlp": 1.0328927, + "epoch": 0.7786562453028709, + "flos": 25738757884800.0, + "grad_norm": 1.6317845562293505, + "language_loss": 0.71912777, + "learning_rate": 4.921817366297938e-07, + "loss": 0.74040663, + "num_input_tokens_seen": 279427705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 12951, + "time_per_iteration": 2.481668710708618 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.0102935, + "balance_loss_clip": 1.017977, + "balance_loss_mlp": 1.03646922, + "epoch": 0.7787163685555388, + "flos": 25739440243200.0, + "grad_norm": 1.6634043770166038, + "language_loss": 0.65471166, + "learning_rate": 4.919258971878877e-07, + "loss": 0.67600083, + "num_input_tokens_seen": 279448215, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 12952, + "time_per_iteration": 2.4531540870666504 + }, + { + "auxiliary_loss_clip": 0.01093756, + "auxiliary_loss_mlp": 0.01026755, + "balance_loss_clip": 1.01577616, + "balance_loss_mlp": 1.03269386, + "epoch": 0.7787764918082068, + "flos": 22747722416640.0, + "grad_norm": 1.5845487757509182, + "language_loss": 0.81134123, + "learning_rate": 4.916701149323022e-07, + "loss": 0.83254635, + "num_input_tokens_seen": 279466260, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.609375, + "step": 12953, + "time_per_iteration": 2.463089942932129 + }, + { + "auxiliary_loss_clip": 0.01106842, + "auxiliary_loss_mlp": 0.01031184, + "balance_loss_clip": 1.01938844, + "balance_loss_mlp": 1.03845859, + "epoch": 0.7788366150608748, + "flos": 15190860430080.0, + "grad_norm": 3.927672519957359, + "language_loss": 0.77081442, + "learning_rate": 4.91414389872737e-07, + "loss": 0.79219466, + "num_input_tokens_seen": 279484520, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 12954, + "time_per_iteration": 2.407898187637329 + }, + { + "auxiliary_loss_clip": 0.01103106, + "auxiliary_loss_mlp": 0.0102569, + "balance_loss_clip": 1.01457942, + "balance_loss_mlp": 1.03563237, + "epoch": 0.7788967383135428, + "flos": 21210242618880.0, + "grad_norm": 1.509444537933962, + "language_loss": 0.72937489, + "learning_rate": 4.911587220188905e-07, + "loss": 0.7506628, + "num_input_tokens_seen": 279503130, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 12955, + "time_per_iteration": 2.4522764682769775 + }, + { + "auxiliary_loss_clip": 0.01100775, + "auxiliary_loss_mlp": 0.01028796, + "balance_loss_clip": 1.0172863, + "balance_loss_mlp": 1.03384817, + "epoch": 0.7789568615662107, + "flos": 21682970536320.0, + "grad_norm": 1.439262912645897, + "language_loss": 0.68722045, + "learning_rate": 4.909031113804551e-07, + "loss": 0.70851612, + "num_input_tokens_seen": 279521930, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 12956, + "time_per_iteration": 2.4333713054656982 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01028981, + "balance_loss_clip": 1.01781666, + "balance_loss_mlp": 1.03511453, + "epoch": 0.7790169848188787, + "flos": 26360371676160.0, + "grad_norm": 1.517896090927025, + "language_loss": 0.76230508, + "learning_rate": 4.906475579671252e-07, + "loss": 0.78359848, + "num_input_tokens_seen": 279542375, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 12957, + "time_per_iteration": 2.503735065460205 + }, + { + "auxiliary_loss_clip": 0.01100381, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.01647925, + "balance_loss_mlp": 1.03468633, + "epoch": 0.7790771080715466, + "flos": 25516183259520.0, + "grad_norm": 1.5979731248356082, + "language_loss": 0.77661026, + "learning_rate": 4.903920617885917e-07, + "loss": 0.79789662, + "num_input_tokens_seen": 279561885, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 12958, + "time_per_iteration": 2.470494270324707 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01037597, + "balance_loss_clip": 1.02521682, + "balance_loss_mlp": 1.03665078, + "epoch": 0.7791372313242146, + "flos": 16034186920320.0, + "grad_norm": 1.8919094933835359, + "language_loss": 0.71729428, + "learning_rate": 4.901366228545418e-07, + "loss": 0.73870701, + "num_input_tokens_seen": 279579965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 12959, + "time_per_iteration": 2.4404170513153076 + }, + { + "auxiliary_loss_clip": 0.01099647, + "auxiliary_loss_mlp": 0.01031546, + "balance_loss_clip": 1.02000022, + "balance_loss_mlp": 1.03415179, + "epoch": 0.7791973545768827, + "flos": 23842207779840.0, + "grad_norm": 2.165413341103088, + "language_loss": 0.7770282, + "learning_rate": 4.898812411746632e-07, + "loss": 0.79834014, + "num_input_tokens_seen": 279599030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 12960, + "time_per_iteration": 2.4568068981170654 + }, + { + "auxiliary_loss_clip": 0.01103395, + "auxiliary_loss_mlp": 0.01034613, + "balance_loss_clip": 1.02233458, + "balance_loss_mlp": 1.03674865, + "epoch": 0.7792574778295506, + "flos": 24168384207360.0, + "grad_norm": 1.9020069613466535, + "language_loss": 0.75351453, + "learning_rate": 4.896259167586385e-07, + "loss": 0.7748946, + "num_input_tokens_seen": 279614400, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 12961, + "time_per_iteration": 2.4992313385009766 + }, + { + "auxiliary_loss_clip": 0.0109806, + "auxiliary_loss_mlp": 0.01035181, + "balance_loss_clip": 1.02412987, + "balance_loss_mlp": 1.03634429, + "epoch": 0.7793176010822186, + "flos": 21464921024640.0, + "grad_norm": 1.602325654578752, + "language_loss": 0.73415077, + "learning_rate": 4.893706496161511e-07, + "loss": 0.75548315, + "num_input_tokens_seen": 279633745, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 12962, + "time_per_iteration": 2.4623515605926514 + }, + { + "auxiliary_loss_clip": 0.01100833, + "auxiliary_loss_mlp": 0.01027912, + "balance_loss_clip": 1.01642597, + "balance_loss_mlp": 1.03580284, + "epoch": 0.7793777243348865, + "flos": 20666699038080.0, + "grad_norm": 4.864590611193701, + "language_loss": 0.6971066, + "learning_rate": 4.891154397568795e-07, + "loss": 0.71839404, + "num_input_tokens_seen": 279651165, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 12963, + "time_per_iteration": 2.4501214027404785 + }, + { + "auxiliary_loss_clip": 0.01102284, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.01936793, + "balance_loss_mlp": 1.0372932, + "epoch": 0.7794378475875545, + "flos": 27125771610240.0, + "grad_norm": 1.8027321276281432, + "language_loss": 0.63654995, + "learning_rate": 4.888602871905019e-07, + "loss": 0.65788043, + "num_input_tokens_seen": 279671175, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 12964, + "time_per_iteration": 2.491323709487915 + }, + { + "auxiliary_loss_clip": 0.01102129, + "auxiliary_loss_mlp": 0.01031652, + "balance_loss_clip": 1.02017188, + "balance_loss_mlp": 1.03510189, + "epoch": 0.7794979708402224, + "flos": 28074136446720.0, + "grad_norm": 1.518939457679847, + "language_loss": 0.7682904, + "learning_rate": 4.88605191926694e-07, + "loss": 0.78962815, + "num_input_tokens_seen": 279688675, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 12965, + "time_per_iteration": 2.528763771057129 + }, + { + "auxiliary_loss_clip": 0.01094543, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.01648712, + "balance_loss_mlp": 1.03374982, + "epoch": 0.7795580940928905, + "flos": 26869548919680.0, + "grad_norm": 2.0628769649637273, + "language_loss": 0.73018187, + "learning_rate": 4.883501539751289e-07, + "loss": 0.75139767, + "num_input_tokens_seen": 279710245, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.609375, + "step": 12966, + "time_per_iteration": 2.484900951385498 + }, + { + "auxiliary_loss_clip": 0.01098331, + "auxiliary_loss_mlp": 0.01025441, + "balance_loss_clip": 1.01554668, + "balance_loss_mlp": 1.03585887, + "epoch": 0.7796182173455584, + "flos": 23835384195840.0, + "grad_norm": 1.5008219463106178, + "language_loss": 0.73900837, + "learning_rate": 4.880951733454768e-07, + "loss": 0.76024604, + "num_input_tokens_seen": 279729045, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.625, + "step": 12967, + "time_per_iteration": 2.478590250015259 + }, + { + "auxiliary_loss_clip": 0.01102816, + "auxiliary_loss_mlp": 0.01025264, + "balance_loss_clip": 1.01353419, + "balance_loss_mlp": 1.03645122, + "epoch": 0.7796783405982264, + "flos": 19792238434560.0, + "grad_norm": 2.127947897129968, + "language_loss": 0.72439355, + "learning_rate": 4.878402500474073e-07, + "loss": 0.74567437, + "num_input_tokens_seen": 279748350, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 12968, + "time_per_iteration": 2.4800057411193848 + }, + { + "auxiliary_loss_clip": 0.01099689, + "auxiliary_loss_mlp": 0.01034695, + "balance_loss_clip": 1.02313745, + "balance_loss_mlp": 1.0356847, + "epoch": 0.7797384638508943, + "flos": 15450207603840.0, + "grad_norm": 2.4882382801625114, + "language_loss": 0.6027385, + "learning_rate": 4.875853840905874e-07, + "loss": 0.62408233, + "num_input_tokens_seen": 279765620, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12969, + "time_per_iteration": 3.8256025314331055 + }, + { + "auxiliary_loss_clip": 0.01095828, + "auxiliary_loss_mlp": 0.01027516, + "balance_loss_clip": 1.01716805, + "balance_loss_mlp": 1.03350222, + "epoch": 0.7797985871035623, + "flos": 20922742160640.0, + "grad_norm": 1.7218656768380223, + "language_loss": 0.70345086, + "learning_rate": 4.873305754846811e-07, + "loss": 0.7246843, + "num_input_tokens_seen": 279782485, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62109375, + "step": 12970, + "time_per_iteration": 2.4424326419830322 + }, + { + "auxiliary_loss_clip": 0.01102147, + "auxiliary_loss_mlp": 0.01031819, + "balance_loss_clip": 1.01964164, + "balance_loss_mlp": 1.03676975, + "epoch": 0.7798587103562302, + "flos": 36937212514560.0, + "grad_norm": 1.5981872425492996, + "language_loss": 0.72214878, + "learning_rate": 4.870758242393507e-07, + "loss": 0.74348849, + "num_input_tokens_seen": 279804170, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 12971, + "time_per_iteration": 4.000694990158081 + }, + { + "auxiliary_loss_clip": 0.01105251, + "auxiliary_loss_mlp": 0.01031706, + "balance_loss_clip": 1.01978493, + "balance_loss_mlp": 1.03616154, + "epoch": 0.7799188336088982, + "flos": 22419283432320.0, + "grad_norm": 1.9065262783110748, + "language_loss": 0.74722421, + "learning_rate": 4.868211303642578e-07, + "loss": 0.76859379, + "num_input_tokens_seen": 279823730, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 12972, + "time_per_iteration": 3.843189001083374 + }, + { + "auxiliary_loss_clip": 0.01099808, + "auxiliary_loss_mlp": 0.01024082, + "balance_loss_clip": 1.01216161, + "balance_loss_mlp": 1.03417563, + "epoch": 0.7799789568615663, + "flos": 18880466578560.0, + "grad_norm": 2.146033088576411, + "language_loss": 0.71397805, + "learning_rate": 4.865664938690584e-07, + "loss": 0.73521698, + "num_input_tokens_seen": 279843035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 12973, + "time_per_iteration": 2.4355766773223877 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0102725, + "balance_loss_clip": 1.01677775, + "balance_loss_mlp": 1.03420782, + "epoch": 0.7800390801142342, + "flos": 20262272832000.0, + "grad_norm": 2.289500877533027, + "language_loss": 0.77711248, + "learning_rate": 4.863119147634089e-07, + "loss": 0.79836202, + "num_input_tokens_seen": 279861450, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 12974, + "time_per_iteration": 2.4445388317108154 + }, + { + "auxiliary_loss_clip": 0.010981, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.01833785, + "balance_loss_mlp": 1.03401935, + "epoch": 0.7800992033669022, + "flos": 16690310703360.0, + "grad_norm": 1.507070733985586, + "language_loss": 0.69106656, + "learning_rate": 4.86057393056964e-07, + "loss": 0.71234584, + "num_input_tokens_seen": 279878660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 12975, + "time_per_iteration": 4.026258230209351 + }, + { + "auxiliary_loss_clip": 0.01098461, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01754749, + "balance_loss_mlp": 1.03443432, + "epoch": 0.7801593266195701, + "flos": 18585208782720.0, + "grad_norm": 3.483605083933044, + "language_loss": 0.81612706, + "learning_rate": 4.858029287593739e-07, + "loss": 0.83739734, + "num_input_tokens_seen": 279895685, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 12976, + "time_per_iteration": 2.39786696434021 + }, + { + "auxiliary_loss_clip": 0.01102312, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.01467419, + "balance_loss_mlp": 1.03479075, + "epoch": 0.7802194498722381, + "flos": 25484941405440.0, + "grad_norm": 1.6747970494866666, + "language_loss": 0.6597501, + "learning_rate": 4.85548521880289e-07, + "loss": 0.68103826, + "num_input_tokens_seen": 279917240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 12977, + "time_per_iteration": 2.509279489517212 + }, + { + "auxiliary_loss_clip": 0.01097395, + "auxiliary_loss_mlp": 0.01028658, + "balance_loss_clip": 1.01837647, + "balance_loss_mlp": 1.03446436, + "epoch": 0.780279573124906, + "flos": 31176315573120.0, + "grad_norm": 2.0224689564916236, + "language_loss": 0.74458158, + "learning_rate": 4.852941724293554e-07, + "loss": 0.76584208, + "num_input_tokens_seen": 279938665, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 12978, + "time_per_iteration": 2.5191776752471924 + }, + { + "auxiliary_loss_clip": 0.01103093, + "auxiliary_loss_mlp": 0.01029874, + "balance_loss_clip": 1.01786351, + "balance_loss_mlp": 1.03624713, + "epoch": 0.780339696377574, + "flos": 26944027770240.0, + "grad_norm": 2.0922083765089523, + "language_loss": 0.62049854, + "learning_rate": 4.85039880416219e-07, + "loss": 0.64182818, + "num_input_tokens_seen": 279957965, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 12979, + "time_per_iteration": 2.5099925994873047 + }, + { + "auxiliary_loss_clip": 0.01099974, + "auxiliary_loss_mlp": 0.01027414, + "balance_loss_clip": 1.01567745, + "balance_loss_mlp": 1.03531623, + "epoch": 0.780399819630242, + "flos": 27957426180480.0, + "grad_norm": 1.9372520913604323, + "language_loss": 0.77348953, + "learning_rate": 4.847856458505217e-07, + "loss": 0.79476345, + "num_input_tokens_seen": 279977490, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 12980, + "time_per_iteration": 2.4801688194274902 + }, + { + "auxiliary_loss_clip": 0.01102229, + "auxiliary_loss_mlp": 0.01031284, + "balance_loss_clip": 1.02032888, + "balance_loss_mlp": 1.03540671, + "epoch": 0.78045994288291, + "flos": 22486795044480.0, + "grad_norm": 3.6673789740050484, + "language_loss": 0.78181487, + "learning_rate": 4.845314687419046e-07, + "loss": 0.80315006, + "num_input_tokens_seen": 279994220, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 12981, + "time_per_iteration": 2.4743919372558594 + }, + { + "auxiliary_loss_clip": 0.01104292, + "auxiliary_loss_mlp": 0.0103052, + "balance_loss_clip": 1.01932621, + "balance_loss_mlp": 1.03766203, + "epoch": 0.7805200661355779, + "flos": 20850849089280.0, + "grad_norm": 1.7572805466494936, + "language_loss": 0.7283631, + "learning_rate": 4.842773491000067e-07, + "loss": 0.74971128, + "num_input_tokens_seen": 280012590, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 12982, + "time_per_iteration": 2.464043140411377 + }, + { + "auxiliary_loss_clip": 0.0109892, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.01735628, + "balance_loss_mlp": 1.03321373, + "epoch": 0.7805801893882459, + "flos": 25665966973440.0, + "grad_norm": 1.447832651307714, + "language_loss": 0.73497742, + "learning_rate": 4.840232869344636e-07, + "loss": 0.75624776, + "num_input_tokens_seen": 280033700, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66015625, + "step": 12983, + "time_per_iteration": 2.5320849418640137 + }, + { + "auxiliary_loss_clip": 0.01098957, + "auxiliary_loss_mlp": 0.01029332, + "balance_loss_clip": 1.01825762, + "balance_loss_mlp": 1.03431869, + "epoch": 0.7806403126409138, + "flos": 11327806483200.0, + "grad_norm": 1.994731335047155, + "language_loss": 0.7493751, + "learning_rate": 4.837692822549086e-07, + "loss": 0.77065802, + "num_input_tokens_seen": 280052215, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 12984, + "time_per_iteration": 2.4252982139587402 + }, + { + "auxiliary_loss_clip": 0.01098022, + "auxiliary_loss_mlp": 0.01031539, + "balance_loss_clip": 1.02072072, + "balance_loss_mlp": 1.03346229, + "epoch": 0.7807004358935818, + "flos": 19573362910080.0, + "grad_norm": 1.7760899084313728, + "language_loss": 0.81298089, + "learning_rate": 4.835153350709746e-07, + "loss": 0.83427656, + "num_input_tokens_seen": 280070525, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 12985, + "time_per_iteration": 2.442458391189575 + }, + { + "auxiliary_loss_clip": 0.01100867, + "auxiliary_loss_mlp": 0.01030708, + "balance_loss_clip": 1.01956177, + "balance_loss_mlp": 1.03591645, + "epoch": 0.7807605591462499, + "flos": 19135827342720.0, + "grad_norm": 1.639777449127703, + "language_loss": 0.77087915, + "learning_rate": 4.832614453922915e-07, + "loss": 0.79219496, + "num_input_tokens_seen": 280089855, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 12986, + "time_per_iteration": 2.4363291263580322 + }, + { + "auxiliary_loss_clip": 0.01100757, + "auxiliary_loss_mlp": 0.01031618, + "balance_loss_clip": 1.02032864, + "balance_loss_mlp": 1.03434944, + "epoch": 0.7808206823989178, + "flos": 32374654133760.0, + "grad_norm": 2.42025665629093, + "language_loss": 0.73686159, + "learning_rate": 4.830076132284859e-07, + "loss": 0.75818527, + "num_input_tokens_seen": 280109960, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 12987, + "time_per_iteration": 2.542191505432129 + }, + { + "auxiliary_loss_clip": 0.01023759, + "auxiliary_loss_mlp": 0.01001114, + "balance_loss_clip": 1.00014293, + "balance_loss_mlp": 1.0034368, + "epoch": 0.7808808056515858, + "flos": 55050235061760.0, + "grad_norm": 0.7329422119144833, + "language_loss": 0.55088633, + "learning_rate": 4.82753838589184e-07, + "loss": 0.57113504, + "num_input_tokens_seen": 280169805, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.203125, + "step": 12988, + "time_per_iteration": 3.1061744689941406 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01034542, + "balance_loss_clip": 1.02383065, + "balance_loss_mlp": 1.03418314, + "epoch": 0.7809409289042537, + "flos": 12859468277760.0, + "grad_norm": 2.47954830996045, + "language_loss": 0.80945504, + "learning_rate": 4.82500121484009e-07, + "loss": 0.83076429, + "num_input_tokens_seen": 280184630, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 12989, + "time_per_iteration": 2.4808456897735596 + }, + { + "auxiliary_loss_clip": 0.01096337, + "auxiliary_loss_mlp": 0.01028263, + "balance_loss_clip": 1.01711679, + "balance_loss_mlp": 1.03300154, + "epoch": 0.7810010521569217, + "flos": 21687244254720.0, + "grad_norm": 1.5469006395559106, + "language_loss": 0.70564306, + "learning_rate": 4.822464619225806e-07, + "loss": 0.72688901, + "num_input_tokens_seen": 280203880, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 12990, + "time_per_iteration": 2.443657636642456 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.01631021, + "balance_loss_mlp": 1.03604221, + "epoch": 0.7810611754095896, + "flos": 16757068129920.0, + "grad_norm": 1.8688564219914294, + "language_loss": 0.77437395, + "learning_rate": 4.819928599145184e-07, + "loss": 0.79567397, + "num_input_tokens_seen": 280220460, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 12991, + "time_per_iteration": 2.4350147247314453 + }, + { + "auxiliary_loss_clip": 0.01098523, + "auxiliary_loss_mlp": 0.01033078, + "balance_loss_clip": 1.0213058, + "balance_loss_mlp": 1.0333643, + "epoch": 0.7811212986622577, + "flos": 43507464658560.0, + "grad_norm": 1.6335805671408214, + "language_loss": 0.66026002, + "learning_rate": 4.817393154694398e-07, + "loss": 0.68157601, + "num_input_tokens_seen": 280242680, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 12992, + "time_per_iteration": 2.689131259918213 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.01868546, + "balance_loss_mlp": 1.03544474, + "epoch": 0.7811814219149256, + "flos": 21757700782080.0, + "grad_norm": 1.671791427999923, + "language_loss": 0.6139763, + "learning_rate": 4.814858285969578e-07, + "loss": 0.63529098, + "num_input_tokens_seen": 280260655, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 12993, + "time_per_iteration": 2.4541869163513184 + }, + { + "auxiliary_loss_clip": 0.01098832, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01672089, + "balance_loss_mlp": 1.03474307, + "epoch": 0.7812415451675936, + "flos": 24061514267520.0, + "grad_norm": 1.5259935915170835, + "language_loss": 0.68686914, + "learning_rate": 4.812323993066862e-07, + "loss": 0.70814335, + "num_input_tokens_seen": 280281185, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 12994, + "time_per_iteration": 2.4765658378601074 + }, + { + "auxiliary_loss_clip": 0.01098133, + "auxiliary_loss_mlp": 0.01025809, + "balance_loss_clip": 1.01448953, + "balance_loss_mlp": 1.03380036, + "epoch": 0.7813016684202615, + "flos": 18989706816000.0, + "grad_norm": 1.852574283053805, + "language_loss": 0.68799579, + "learning_rate": 4.809790276082335e-07, + "loss": 0.70923519, + "num_input_tokens_seen": 280298255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 12995, + "time_per_iteration": 2.4536421298980713 + }, + { + "auxiliary_loss_clip": 0.01095783, + "auxiliary_loss_mlp": 0.01026788, + "balance_loss_clip": 1.01633954, + "balance_loss_mlp": 1.03263307, + "epoch": 0.7813617916729295, + "flos": 25260786581760.0, + "grad_norm": 1.6880507432835572, + "language_loss": 0.74965352, + "learning_rate": 4.807257135112088e-07, + "loss": 0.77087927, + "num_input_tokens_seen": 280319000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 12996, + "time_per_iteration": 2.5054454803466797 + }, + { + "auxiliary_loss_clip": 0.0110413, + "auxiliary_loss_mlp": 0.01031779, + "balance_loss_clip": 1.01969695, + "balance_loss_mlp": 1.03568673, + "epoch": 0.7814219149255974, + "flos": 17966037116160.0, + "grad_norm": 2.5743234501120424, + "language_loss": 0.6912725, + "learning_rate": 4.804724570252167e-07, + "loss": 0.71263158, + "num_input_tokens_seen": 280336375, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 12997, + "time_per_iteration": 2.4369044303894043 + }, + { + "auxiliary_loss_clip": 0.01104469, + "auxiliary_loss_mlp": 0.01031264, + "balance_loss_clip": 1.01893187, + "balance_loss_mlp": 1.03557801, + "epoch": 0.7814820381782654, + "flos": 25776176878080.0, + "grad_norm": 1.8652008126435036, + "language_loss": 0.82176995, + "learning_rate": 4.802192581598614e-07, + "loss": 0.84312725, + "num_input_tokens_seen": 280358760, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69140625, + "step": 12998, + "time_per_iteration": 2.486489772796631 + }, + { + "auxiliary_loss_clip": 0.01099451, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.01885128, + "balance_loss_mlp": 1.03346038, + "epoch": 0.7815421614309335, + "flos": 20519572930560.0, + "grad_norm": 1.8319036090536944, + "language_loss": 0.74508494, + "learning_rate": 4.799661169247453e-07, + "loss": 0.76638746, + "num_input_tokens_seen": 280377085, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 12999, + "time_per_iteration": 2.4737162590026855 + }, + { + "auxiliary_loss_clip": 0.01103401, + "auxiliary_loss_mlp": 0.01033986, + "balance_loss_clip": 1.02182698, + "balance_loss_mlp": 1.03589118, + "epoch": 0.7816022846836014, + "flos": 21287666384640.0, + "grad_norm": 1.563923642471339, + "language_loss": 0.84530002, + "learning_rate": 4.797130333294652e-07, + "loss": 0.86667389, + "num_input_tokens_seen": 280395465, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 13000, + "time_per_iteration": 2.4414126873016357 + }, + { + "auxiliary_loss_clip": 0.01101696, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.01921082, + "balance_loss_mlp": 1.03525925, + "epoch": 0.7816624079362694, + "flos": 19208402772480.0, + "grad_norm": 1.979765622408292, + "language_loss": 0.65926194, + "learning_rate": 4.794600073836192e-07, + "loss": 0.68058491, + "num_input_tokens_seen": 280412775, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 13001, + "time_per_iteration": 2.459602117538452 + }, + { + "auxiliary_loss_clip": 0.01100637, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.02068496, + "balance_loss_mlp": 1.0349071, + "epoch": 0.7817225311889373, + "flos": 26104687689600.0, + "grad_norm": 1.7956850599557053, + "language_loss": 0.6699869, + "learning_rate": 4.792070390968027e-07, + "loss": 0.69131166, + "num_input_tokens_seen": 280432905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13002, + "time_per_iteration": 2.476304769515991 + }, + { + "auxiliary_loss_clip": 0.01104712, + "auxiliary_loss_mlp": 0.01035704, + "balance_loss_clip": 1.02302575, + "balance_loss_mlp": 1.0376792, + "epoch": 0.7817826544416053, + "flos": 21250929749760.0, + "grad_norm": 2.585481392916345, + "language_loss": 0.7332117, + "learning_rate": 4.78954128478607e-07, + "loss": 0.75461578, + "num_input_tokens_seen": 280450785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 13003, + "time_per_iteration": 2.4720077514648438 + }, + { + "auxiliary_loss_clip": 0.01103208, + "auxiliary_loss_mlp": 0.01031396, + "balance_loss_clip": 1.0201664, + "balance_loss_mlp": 1.03717935, + "epoch": 0.7818427776942732, + "flos": 19932181822080.0, + "grad_norm": 1.569897666611527, + "language_loss": 0.62077022, + "learning_rate": 4.787012755386233e-07, + "loss": 0.64211631, + "num_input_tokens_seen": 280468400, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 13004, + "time_per_iteration": 2.441561222076416 + }, + { + "auxiliary_loss_clip": 0.01094018, + "auxiliary_loss_mlp": 0.01029156, + "balance_loss_clip": 1.01888061, + "balance_loss_mlp": 1.03251433, + "epoch": 0.7819029009469413, + "flos": 11363753018880.0, + "grad_norm": 2.201816457690377, + "language_loss": 0.82857859, + "learning_rate": 4.784484802864403e-07, + "loss": 0.84981036, + "num_input_tokens_seen": 280483930, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6171875, + "step": 13005, + "time_per_iteration": 2.463477373123169 + }, + { + "auxiliary_loss_clip": 0.01098144, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.01770329, + "balance_loss_mlp": 1.033494, + "epoch": 0.7819630241996092, + "flos": 24279276470400.0, + "grad_norm": 1.897683871126404, + "language_loss": 0.72580653, + "learning_rate": 4.781957427316432e-07, + "loss": 0.7470839, + "num_input_tokens_seen": 280503465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 13006, + "time_per_iteration": 2.465083122253418 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.01830435, + "balance_loss_mlp": 1.03508401, + "epoch": 0.7820231474522772, + "flos": 22708902792960.0, + "grad_norm": 1.6366399269872012, + "language_loss": 0.7201829, + "learning_rate": 4.779430628838157e-07, + "loss": 0.74150085, + "num_input_tokens_seen": 280523375, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13007, + "time_per_iteration": 2.459080934524536 + }, + { + "auxiliary_loss_clip": 0.01100835, + "auxiliary_loss_mlp": 0.01027314, + "balance_loss_clip": 1.0149343, + "balance_loss_mlp": 1.03300202, + "epoch": 0.7820832707049451, + "flos": 20047419630720.0, + "grad_norm": 2.036752007618824, + "language_loss": 0.68872929, + "learning_rate": 4.776904407525397e-07, + "loss": 0.71001077, + "num_input_tokens_seen": 280542920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13008, + "time_per_iteration": 2.4224483966827393 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.01692426, + "balance_loss_mlp": 1.03345668, + "epoch": 0.7821433939576131, + "flos": 27162795553920.0, + "grad_norm": 1.640160857289297, + "language_loss": 0.69686973, + "learning_rate": 4.774378763473954e-07, + "loss": 0.71814674, + "num_input_tokens_seen": 280561700, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6484375, + "step": 13009, + "time_per_iteration": 2.5025076866149902 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01588941, + "balance_loss_mlp": 1.03301144, + "epoch": 0.782203517210281, + "flos": 22602068766720.0, + "grad_norm": 1.5960610923342113, + "language_loss": 0.81570321, + "learning_rate": 4.771853696779586e-07, + "loss": 0.83694947, + "num_input_tokens_seen": 280580605, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 13010, + "time_per_iteration": 2.4285366535186768 + }, + { + "auxiliary_loss_clip": 0.01096868, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02153432, + "balance_loss_mlp": 1.03357911, + "epoch": 0.782263640462949, + "flos": 29059812535680.0, + "grad_norm": 1.6142519346757356, + "language_loss": 0.62225044, + "learning_rate": 4.76932920753806e-07, + "loss": 0.64354062, + "num_input_tokens_seen": 280601495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 13011, + "time_per_iteration": 3.926089286804199 + }, + { + "auxiliary_loss_clip": 0.01099415, + "auxiliary_loss_mlp": 0.01024998, + "balance_loss_clip": 1.01533031, + "balance_loss_mlp": 1.03547144, + "epoch": 0.782323763715617, + "flos": 25299498464640.0, + "grad_norm": 1.6368138696323526, + "language_loss": 0.6998511, + "learning_rate": 4.7668052958450913e-07, + "loss": 0.72109526, + "num_input_tokens_seen": 280622760, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.640625, + "step": 13012, + "time_per_iteration": 2.4826955795288086 + }, + { + "auxiliary_loss_clip": 0.01023537, + "auxiliary_loss_mlp": 0.01008113, + "balance_loss_clip": 1.00711727, + "balance_loss_mlp": 1.00321245, + "epoch": 0.782383886968285, + "flos": 65194388668800.0, + "grad_norm": 0.7065375253302547, + "language_loss": 0.55039519, + "learning_rate": 4.764281961796395e-07, + "loss": 0.57071167, + "num_input_tokens_seen": 280687115, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 13013, + "time_per_iteration": 4.645312786102295 + }, + { + "auxiliary_loss_clip": 0.01104842, + "auxiliary_loss_mlp": 0.01032528, + "balance_loss_clip": 1.02101803, + "balance_loss_mlp": 1.03746831, + "epoch": 0.782444010220953, + "flos": 18405440190720.0, + "grad_norm": 1.6705985916443649, + "language_loss": 0.65102112, + "learning_rate": 4.76175920548765e-07, + "loss": 0.67239481, + "num_input_tokens_seen": 280705000, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 13014, + "time_per_iteration": 3.8477301597595215 + }, + { + "auxiliary_loss_clip": 0.01023801, + "auxiliary_loss_mlp": 0.01001816, + "balance_loss_clip": 1.00088012, + "balance_loss_mlp": 1.00337434, + "epoch": 0.7825041334736209, + "flos": 63955003841280.0, + "grad_norm": 0.727505311889394, + "language_loss": 0.58472216, + "learning_rate": 4.759237027014524e-07, + "loss": 0.60497832, + "num_input_tokens_seen": 280773525, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20507812, + "step": 13015, + "time_per_iteration": 3.1371023654937744 + }, + { + "auxiliary_loss_clip": 0.01098459, + "auxiliary_loss_mlp": 0.01028458, + "balance_loss_clip": 1.01759779, + "balance_loss_mlp": 1.03401864, + "epoch": 0.7825642567262889, + "flos": 20339373375360.0, + "grad_norm": 1.8961534099857338, + "language_loss": 0.7447719, + "learning_rate": 4.756715426472666e-07, + "loss": 0.76604104, + "num_input_tokens_seen": 280791915, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 13016, + "time_per_iteration": 2.434140682220459 + }, + { + "auxiliary_loss_clip": 0.01101959, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01715088, + "balance_loss_mlp": 1.03527248, + "epoch": 0.7826243799789568, + "flos": 20262955190400.0, + "grad_norm": 1.7784650318460415, + "language_loss": 0.75034481, + "learning_rate": 4.7541944039576766e-07, + "loss": 0.77166569, + "num_input_tokens_seen": 280811460, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6640625, + "step": 13017, + "time_per_iteration": 3.9943692684173584 + }, + { + "auxiliary_loss_clip": 0.01099632, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01680803, + "balance_loss_mlp": 1.03296256, + "epoch": 0.7826845032316249, + "flos": 21132926593920.0, + "grad_norm": 1.8349392879241557, + "language_loss": 0.75123864, + "learning_rate": 4.7516739595651636e-07, + "loss": 0.77252591, + "num_input_tokens_seen": 280825415, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13018, + "time_per_iteration": 2.4067063331604004 + }, + { + "auxiliary_loss_clip": 0.01098611, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.01444817, + "balance_loss_mlp": 1.03329933, + "epoch": 0.7827446264842928, + "flos": 22492253911680.0, + "grad_norm": 1.4416632846342243, + "language_loss": 0.77156466, + "learning_rate": 4.749154093390708e-07, + "loss": 0.79281342, + "num_input_tokens_seen": 280845335, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13019, + "time_per_iteration": 2.463075876235962 + }, + { + "auxiliary_loss_clip": 0.01097045, + "auxiliary_loss_mlp": 0.01026432, + "balance_loss_clip": 1.01522064, + "balance_loss_mlp": 1.03263474, + "epoch": 0.7828047497369608, + "flos": 28840649702400.0, + "grad_norm": 1.5659008205546523, + "language_loss": 0.67608422, + "learning_rate": 4.746634805529852e-07, + "loss": 0.69731897, + "num_input_tokens_seen": 280867145, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13020, + "time_per_iteration": 2.4952075481414795 + }, + { + "auxiliary_loss_clip": 0.01100425, + "auxiliary_loss_mlp": 0.01028675, + "balance_loss_clip": 1.01770771, + "balance_loss_mlp": 1.03600883, + "epoch": 0.7828648729896287, + "flos": 23257689759360.0, + "grad_norm": 2.0993447559615905, + "language_loss": 0.6252991, + "learning_rate": 4.7441160960781325e-07, + "loss": 0.64659011, + "num_input_tokens_seen": 280886185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 13021, + "time_per_iteration": 2.4579381942749023 + }, + { + "auxiliary_loss_clip": 0.01097567, + "auxiliary_loss_mlp": 0.01030619, + "balance_loss_clip": 1.01984227, + "balance_loss_mlp": 1.03425419, + "epoch": 0.7829249962422967, + "flos": 25265670831360.0, + "grad_norm": 1.6887151004822496, + "language_loss": 0.69123161, + "learning_rate": 4.7415979651310636e-07, + "loss": 0.71251345, + "num_input_tokens_seen": 280907665, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 13022, + "time_per_iteration": 2.4774861335754395 + }, + { + "auxiliary_loss_clip": 0.01023146, + "auxiliary_loss_mlp": 0.01003513, + "balance_loss_clip": 1.00248182, + "balance_loss_mlp": 1.00289679, + "epoch": 0.7829851194949646, + "flos": 70722044645760.0, + "grad_norm": 0.6410994514398879, + "language_loss": 0.56181228, + "learning_rate": 4.739080412784131e-07, + "loss": 0.58207887, + "num_input_tokens_seen": 280971405, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.203125, + "step": 13023, + "time_per_iteration": 3.216150999069214 + }, + { + "auxiliary_loss_clip": 0.01092363, + "auxiliary_loss_mlp": 0.01026003, + "balance_loss_clip": 1.01569128, + "balance_loss_mlp": 1.03068089, + "epoch": 0.7830452427476327, + "flos": 25660795415040.0, + "grad_norm": 1.5988888518402644, + "language_loss": 0.67096663, + "learning_rate": 4.736563439132792e-07, + "loss": 0.69215035, + "num_input_tokens_seen": 280989615, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6171875, + "step": 13024, + "time_per_iteration": 2.4942939281463623 + }, + { + "auxiliary_loss_clip": 0.01101952, + "auxiliary_loss_mlp": 0.01026503, + "balance_loss_clip": 1.01470125, + "balance_loss_mlp": 1.0346812, + "epoch": 0.7831053660003006, + "flos": 22784315397120.0, + "grad_norm": 1.8279349963305433, + "language_loss": 0.77768403, + "learning_rate": 4.734047044272498e-07, + "loss": 0.79896855, + "num_input_tokens_seen": 281009450, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 13025, + "time_per_iteration": 2.4907360076904297 + }, + { + "auxiliary_loss_clip": 0.01100969, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.01925731, + "balance_loss_mlp": 1.0364809, + "epoch": 0.7831654892529686, + "flos": 25812267068160.0, + "grad_norm": 1.6346779993689489, + "language_loss": 0.78158247, + "learning_rate": 4.731531228298673e-07, + "loss": 0.80289435, + "num_input_tokens_seen": 281028120, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13026, + "time_per_iteration": 2.510455846786499 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01344812, + "balance_loss_mlp": 1.03539133, + "epoch": 0.7832256125056366, + "flos": 20771557816320.0, + "grad_norm": 1.8424561314636239, + "language_loss": 0.75538385, + "learning_rate": 4.729015991306715e-07, + "loss": 0.77662009, + "num_input_tokens_seen": 281042130, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13027, + "time_per_iteration": 2.4143946170806885 + }, + { + "auxiliary_loss_clip": 0.01100205, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.01554847, + "balance_loss_mlp": 1.03557467, + "epoch": 0.7832857357583045, + "flos": 21506541909120.0, + "grad_norm": 1.6598203189142682, + "language_loss": 0.70306528, + "learning_rate": 4.726501333391997e-07, + "loss": 0.72433376, + "num_input_tokens_seen": 281060945, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13028, + "time_per_iteration": 2.4640142917633057 + }, + { + "auxiliary_loss_clip": 0.01103224, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.0228107, + "balance_loss_mlp": 1.03549385, + "epoch": 0.7833458590109725, + "flos": 18077791305600.0, + "grad_norm": 1.953273334391897, + "language_loss": 0.69041282, + "learning_rate": 4.7239872546498774e-07, + "loss": 0.71179456, + "num_input_tokens_seen": 281079270, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13029, + "time_per_iteration": 2.4038736820220947 + }, + { + "auxiliary_loss_clip": 0.01101772, + "auxiliary_loss_mlp": 0.01026447, + "balance_loss_clip": 1.01434183, + "balance_loss_mlp": 1.03425694, + "epoch": 0.7834059822636404, + "flos": 28288738252800.0, + "grad_norm": 1.7164794542717685, + "language_loss": 0.81022191, + "learning_rate": 4.721473755175698e-07, + "loss": 0.83150411, + "num_input_tokens_seen": 281099500, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13030, + "time_per_iteration": 2.5112462043762207 + }, + { + "auxiliary_loss_clip": 0.01102526, + "auxiliary_loss_mlp": 0.01029393, + "balance_loss_clip": 1.01789546, + "balance_loss_mlp": 1.03459156, + "epoch": 0.7834661055163085, + "flos": 31686211088640.0, + "grad_norm": 1.6569423927401024, + "language_loss": 0.70443982, + "learning_rate": 4.71896083506476e-07, + "loss": 0.72575903, + "num_input_tokens_seen": 281121250, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 13031, + "time_per_iteration": 2.5177314281463623 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.0103291, + "balance_loss_clip": 1.02162719, + "balance_loss_mlp": 1.03390551, + "epoch": 0.7835262287689764, + "flos": 12933192942720.0, + "grad_norm": 2.080929287511114, + "language_loss": 0.78692496, + "learning_rate": 4.7164484944123574e-07, + "loss": 0.80826724, + "num_input_tokens_seen": 281138760, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.671875, + "step": 13032, + "time_per_iteration": 2.438286066055298 + }, + { + "auxiliary_loss_clip": 0.01104134, + "auxiliary_loss_mlp": 0.01034155, + "balance_loss_clip": 1.02238297, + "balance_loss_mlp": 1.03637064, + "epoch": 0.7835863520216444, + "flos": 16143211676160.0, + "grad_norm": 2.498040083098191, + "language_loss": 0.62467206, + "learning_rate": 4.7139367333137726e-07, + "loss": 0.64605498, + "num_input_tokens_seen": 281157420, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13033, + "time_per_iteration": 2.4075143337249756 + }, + { + "auxiliary_loss_clip": 0.01099306, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01454878, + "balance_loss_mlp": 1.03466129, + "epoch": 0.7836464752743123, + "flos": 11509909459200.0, + "grad_norm": 1.5229312558567987, + "language_loss": 0.71800756, + "learning_rate": 4.7114255518642255e-07, + "loss": 0.7392652, + "num_input_tokens_seen": 281174620, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 13034, + "time_per_iteration": 2.426010847091675 + }, + { + "auxiliary_loss_clip": 0.01102657, + "auxiliary_loss_mlp": 0.01029907, + "balance_loss_clip": 1.01777768, + "balance_loss_mlp": 1.03581548, + "epoch": 0.7837065985269803, + "flos": 18223696350720.0, + "grad_norm": 1.6809698816895169, + "language_loss": 0.72046518, + "learning_rate": 4.7089149501589555e-07, + "loss": 0.74179089, + "num_input_tokens_seen": 281193865, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13035, + "time_per_iteration": 2.417221784591675 + }, + { + "auxiliary_loss_clip": 0.0110217, + "auxiliary_loss_mlp": 0.01031304, + "balance_loss_clip": 1.01936555, + "balance_loss_mlp": 1.0355823, + "epoch": 0.7837667217796482, + "flos": 24754410599040.0, + "grad_norm": 1.9215035774038787, + "language_loss": 0.66247499, + "learning_rate": 4.7064049282931664e-07, + "loss": 0.6838097, + "num_input_tokens_seen": 281212250, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13036, + "time_per_iteration": 2.4644551277160645 + }, + { + "auxiliary_loss_clip": 0.01105291, + "auxiliary_loss_mlp": 0.01032277, + "balance_loss_clip": 1.01995683, + "balance_loss_mlp": 1.03585243, + "epoch": 0.7838268450323163, + "flos": 22383121415040.0, + "grad_norm": 2.2777930341142945, + "language_loss": 0.72937357, + "learning_rate": 4.703895486362031e-07, + "loss": 0.75074923, + "num_input_tokens_seen": 281230850, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6953125, + "step": 13037, + "time_per_iteration": 2.449385404586792 + }, + { + "auxiliary_loss_clip": 0.01097375, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.0175482, + "balance_loss_mlp": 1.03236222, + "epoch": 0.7838869682849842, + "flos": 19500284689920.0, + "grad_norm": 2.4737781125187808, + "language_loss": 0.60029399, + "learning_rate": 4.701386624460717e-07, + "loss": 0.62156355, + "num_input_tokens_seen": 281249810, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 13038, + "time_per_iteration": 2.467207193374634 + }, + { + "auxiliary_loss_clip": 0.010977, + "auxiliary_loss_mlp": 0.01027526, + "balance_loss_clip": 1.01651084, + "balance_loss_mlp": 1.03378868, + "epoch": 0.7839470915376522, + "flos": 32892845690880.0, + "grad_norm": 1.8286159549617163, + "language_loss": 0.68401051, + "learning_rate": 4.698878342684349e-07, + "loss": 0.70526278, + "num_input_tokens_seen": 281273730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13039, + "time_per_iteration": 2.576012372970581 + }, + { + "auxiliary_loss_clip": 0.01095371, + "auxiliary_loss_mlp": 0.01021071, + "balance_loss_clip": 1.01055706, + "balance_loss_mlp": 1.03193581, + "epoch": 0.7840072147903202, + "flos": 29676003373440.0, + "grad_norm": 1.8627494716028734, + "language_loss": 0.68923277, + "learning_rate": 4.6963706411280537e-07, + "loss": 0.71039724, + "num_input_tokens_seen": 281293670, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13040, + "time_per_iteration": 2.5061099529266357 + }, + { + "auxiliary_loss_clip": 0.01100843, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.01966667, + "balance_loss_mlp": 1.03439748, + "epoch": 0.7840673380429881, + "flos": 18186744234240.0, + "grad_norm": 1.5445420563280179, + "language_loss": 0.67223978, + "learning_rate": 4.6938635198869116e-07, + "loss": 0.6935609, + "num_input_tokens_seen": 281313070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13041, + "time_per_iteration": 2.4612159729003906 + }, + { + "auxiliary_loss_clip": 0.01023594, + "auxiliary_loss_mlp": 0.01001116, + "balance_loss_clip": 1.00019228, + "balance_loss_mlp": 1.00344205, + "epoch": 0.7841274612956561, + "flos": 66346006613760.0, + "grad_norm": 0.6599910887916006, + "language_loss": 0.57391232, + "learning_rate": 4.691356979055998e-07, + "loss": 0.59415942, + "num_input_tokens_seen": 281374880, + "router_z_loss_clip": 0.00921631, + "router_z_loss_mlp": 0.20117188, + "step": 13042, + "time_per_iteration": 3.0452370643615723 + }, + { + "auxiliary_loss_clip": 0.0110195, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.01545572, + "balance_loss_mlp": 1.03551662, + "epoch": 0.784187584548324, + "flos": 26648482665600.0, + "grad_norm": 2.3220034153225235, + "language_loss": 0.83760583, + "learning_rate": 4.688851018730369e-07, + "loss": 0.85889781, + "num_input_tokens_seen": 281392620, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13043, + "time_per_iteration": 2.4752867221832275 + }, + { + "auxiliary_loss_clip": 0.01097987, + "auxiliary_loss_mlp": 0.01025032, + "balance_loss_clip": 1.01391542, + "balance_loss_mlp": 1.03412688, + "epoch": 0.7842477078009921, + "flos": 25740158515200.0, + "grad_norm": 1.3727755929331091, + "language_loss": 0.88437784, + "learning_rate": 4.6863456390050425e-07, + "loss": 0.905608, + "num_input_tokens_seen": 281413140, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13044, + "time_per_iteration": 2.4991369247436523 + }, + { + "auxiliary_loss_clip": 0.01104783, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01857805, + "balance_loss_mlp": 1.03586638, + "epoch": 0.78430783105366, + "flos": 21980957765760.0, + "grad_norm": 2.298673788206572, + "language_loss": 0.79098254, + "learning_rate": 4.6838408399750195e-07, + "loss": 0.81233072, + "num_input_tokens_seen": 281430860, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 13045, + "time_per_iteration": 2.4472832679748535 + }, + { + "auxiliary_loss_clip": 0.01098057, + "auxiliary_loss_mlp": 0.01027036, + "balance_loss_clip": 1.0161643, + "balance_loss_mlp": 1.03325605, + "epoch": 0.784367954306328, + "flos": 23842279607040.0, + "grad_norm": 1.3934452663009353, + "language_loss": 0.72286654, + "learning_rate": 4.6813366217352925e-07, + "loss": 0.7441175, + "num_input_tokens_seen": 281451385, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13046, + "time_per_iteration": 2.487778425216675 + }, + { + "auxiliary_loss_clip": 0.01098961, + "auxiliary_loss_mlp": 0.0103391, + "balance_loss_clip": 1.0218997, + "balance_loss_mlp": 1.03507853, + "epoch": 0.7844280775589959, + "flos": 24826662806400.0, + "grad_norm": 1.566633263646869, + "language_loss": 0.63192189, + "learning_rate": 4.678832984380809e-07, + "loss": 0.65325058, + "num_input_tokens_seen": 281472255, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 13047, + "time_per_iteration": 2.5349674224853516 + }, + { + "auxiliary_loss_clip": 0.01098768, + "auxiliary_loss_mlp": 0.01024033, + "balance_loss_clip": 1.01313078, + "balance_loss_mlp": 1.03501678, + "epoch": 0.7844882008116639, + "flos": 22455660931200.0, + "grad_norm": 1.5581126874211093, + "language_loss": 0.73077911, + "learning_rate": 4.676329928006515e-07, + "loss": 0.75200713, + "num_input_tokens_seen": 281492860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 13048, + "time_per_iteration": 2.4880495071411133 + }, + { + "auxiliary_loss_clip": 0.01105114, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01921093, + "balance_loss_mlp": 1.03758121, + "epoch": 0.7845483240643318, + "flos": 26104041244800.0, + "grad_norm": 2.6312152451554587, + "language_loss": 0.74826312, + "learning_rate": 4.6738274527073243e-07, + "loss": 0.76962638, + "num_input_tokens_seen": 281511815, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13049, + "time_per_iteration": 2.477346658706665 + }, + { + "auxiliary_loss_clip": 0.01103242, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01565409, + "balance_loss_mlp": 1.0343411, + "epoch": 0.7846084473169999, + "flos": 19354307817600.0, + "grad_norm": 1.741709533193149, + "language_loss": 0.72563767, + "learning_rate": 4.6713255585781454e-07, + "loss": 0.746952, + "num_input_tokens_seen": 281530090, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 13050, + "time_per_iteration": 2.4637062549591064 + }, + { + "auxiliary_loss_clip": 0.01099539, + "auxiliary_loss_mlp": 0.01033602, + "balance_loss_clip": 1.0217284, + "balance_loss_mlp": 1.03509378, + "epoch": 0.7846685705696678, + "flos": 23325811902720.0, + "grad_norm": 2.325466593852248, + "language_loss": 0.73197848, + "learning_rate": 4.668824245713825e-07, + "loss": 0.75330985, + "num_input_tokens_seen": 281547075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13051, + "time_per_iteration": 2.4410598278045654 + }, + { + "auxiliary_loss_clip": 0.01102687, + "auxiliary_loss_mlp": 0.0103361, + "balance_loss_clip": 1.02142096, + "balance_loss_mlp": 1.03567302, + "epoch": 0.7847286938223358, + "flos": 35809545962880.0, + "grad_norm": 2.1693731979967965, + "language_loss": 0.72507489, + "learning_rate": 4.666323514209227e-07, + "loss": 0.74643779, + "num_input_tokens_seen": 281568080, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 13052, + "time_per_iteration": 2.580509901046753 + }, + { + "auxiliary_loss_clip": 0.01096936, + "auxiliary_loss_mlp": 0.01029725, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.0346005, + "epoch": 0.7847888170750038, + "flos": 18478159274880.0, + "grad_norm": 1.7569531144927393, + "language_loss": 0.69126081, + "learning_rate": 4.663823364159183e-07, + "loss": 0.71252745, + "num_input_tokens_seen": 281586925, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.625, + "step": 13053, + "time_per_iteration": 3.805539131164551 + }, + { + "auxiliary_loss_clip": 0.01098051, + "auxiliary_loss_mlp": 0.01027333, + "balance_loss_clip": 1.01637769, + "balance_loss_mlp": 1.03426385, + "epoch": 0.7848489403276717, + "flos": 25119155255040.0, + "grad_norm": 2.052215222925797, + "language_loss": 0.70214486, + "learning_rate": 4.6613237956584893e-07, + "loss": 0.72339875, + "num_input_tokens_seen": 281603915, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13054, + "time_per_iteration": 2.4813599586486816 + }, + { + "auxiliary_loss_clip": 0.01103116, + "auxiliary_loss_mlp": 0.01034346, + "balance_loss_clip": 1.02268767, + "balance_loss_mlp": 1.03524971, + "epoch": 0.7849090635803397, + "flos": 26502433966080.0, + "grad_norm": 1.891443504325583, + "language_loss": 0.75708246, + "learning_rate": 4.658824808801938e-07, + "loss": 0.77845711, + "num_input_tokens_seen": 281624220, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 13055, + "time_per_iteration": 3.9307680130004883 + }, + { + "auxiliary_loss_clip": 0.01106616, + "auxiliary_loss_mlp": 0.01028557, + "balance_loss_clip": 1.01664162, + "balance_loss_mlp": 1.03725183, + "epoch": 0.7849691868330076, + "flos": 20959658363520.0, + "grad_norm": 6.321454082407856, + "language_loss": 0.74865484, + "learning_rate": 4.656326403684283e-07, + "loss": 0.77000654, + "num_input_tokens_seen": 281642325, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.69140625, + "step": 13056, + "time_per_iteration": 4.0152342319488525 + }, + { + "auxiliary_loss_clip": 0.01101822, + "auxiliary_loss_mlp": 0.01027242, + "balance_loss_clip": 1.01566076, + "balance_loss_mlp": 1.03655851, + "epoch": 0.7850293100856757, + "flos": 26067484177920.0, + "grad_norm": 1.5631013098906712, + "language_loss": 0.70461977, + "learning_rate": 4.6538285804002744e-07, + "loss": 0.72591043, + "num_input_tokens_seen": 281663065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13057, + "time_per_iteration": 2.5022852420806885 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.01031025, + "balance_loss_clip": 1.01983142, + "balance_loss_mlp": 1.03427744, + "epoch": 0.7850894333383436, + "flos": 22491894775680.0, + "grad_norm": 2.087059911869826, + "language_loss": 0.7686438, + "learning_rate": 4.6513313390446175e-07, + "loss": 0.78996599, + "num_input_tokens_seen": 281681005, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 13058, + "time_per_iteration": 3.913203716278076 + }, + { + "auxiliary_loss_clip": 0.01101711, + "auxiliary_loss_mlp": 0.01029686, + "balance_loss_clip": 1.01822972, + "balance_loss_mlp": 1.03652596, + "epoch": 0.7851495565910116, + "flos": 20558643949440.0, + "grad_norm": 1.620822282702505, + "language_loss": 0.70728242, + "learning_rate": 4.6488346797120146e-07, + "loss": 0.72859639, + "num_input_tokens_seen": 281697965, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 13059, + "time_per_iteration": 2.4571406841278076 + }, + { + "auxiliary_loss_clip": 0.01104562, + "auxiliary_loss_mlp": 0.01038767, + "balance_loss_clip": 1.02604127, + "balance_loss_mlp": 1.03527403, + "epoch": 0.7852096798436795, + "flos": 15924838942080.0, + "grad_norm": 1.7516949433985336, + "language_loss": 0.76551163, + "learning_rate": 4.646338602497144e-07, + "loss": 0.78694499, + "num_input_tokens_seen": 281716035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 13060, + "time_per_iteration": 2.452622413635254 + }, + { + "auxiliary_loss_clip": 0.0110109, + "auxiliary_loss_mlp": 0.01030961, + "balance_loss_clip": 1.01882577, + "balance_loss_mlp": 1.03516376, + "epoch": 0.7852698030963475, + "flos": 19062282245760.0, + "grad_norm": 2.1122245234180923, + "language_loss": 0.77249229, + "learning_rate": 4.643843107494654e-07, + "loss": 0.79381275, + "num_input_tokens_seen": 281732815, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 13061, + "time_per_iteration": 2.4392404556274414 + }, + { + "auxiliary_loss_clip": 0.01100348, + "auxiliary_loss_mlp": 0.01029308, + "balance_loss_clip": 1.01744044, + "balance_loss_mlp": 1.03367698, + "epoch": 0.7853299263490154, + "flos": 24644380262400.0, + "grad_norm": 2.075148531111265, + "language_loss": 0.73844373, + "learning_rate": 4.641348194799164e-07, + "loss": 0.75974035, + "num_input_tokens_seen": 281751980, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13062, + "time_per_iteration": 2.542872428894043 + }, + { + "auxiliary_loss_clip": 0.01097942, + "auxiliary_loss_mlp": 0.01029163, + "balance_loss_clip": 1.01824331, + "balance_loss_mlp": 1.03418064, + "epoch": 0.7853900496016835, + "flos": 22017981709440.0, + "grad_norm": 1.4437360757682784, + "language_loss": 0.68408203, + "learning_rate": 4.638853864505297e-07, + "loss": 0.70535302, + "num_input_tokens_seen": 281772670, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13063, + "time_per_iteration": 2.468757390975952 + }, + { + "auxiliary_loss_clip": 0.01102772, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.02163374, + "balance_loss_mlp": 1.03934288, + "epoch": 0.7854501728543514, + "flos": 30227412032640.0, + "grad_norm": 2.216322061173653, + "language_loss": 0.7278775, + "learning_rate": 4.636360116707625e-07, + "loss": 0.74923611, + "num_input_tokens_seen": 281792930, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13064, + "time_per_iteration": 2.629014730453491 + }, + { + "auxiliary_loss_clip": 0.01101508, + "auxiliary_loss_mlp": 0.01031412, + "balance_loss_clip": 1.01990271, + "balance_loss_mlp": 1.03406608, + "epoch": 0.7855102961070194, + "flos": 18843694030080.0, + "grad_norm": 1.7428353830367498, + "language_loss": 0.67990673, + "learning_rate": 4.633866951500718e-07, + "loss": 0.70123595, + "num_input_tokens_seen": 281811805, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13065, + "time_per_iteration": 2.440537691116333 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01030538, + "balance_loss_clip": 1.01917148, + "balance_loss_mlp": 1.03686762, + "epoch": 0.7855704193596874, + "flos": 22309971367680.0, + "grad_norm": 1.9043114354962565, + "language_loss": 0.76035756, + "learning_rate": 4.6313743689791196e-07, + "loss": 0.78167951, + "num_input_tokens_seen": 281831885, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13066, + "time_per_iteration": 2.4779815673828125 + }, + { + "auxiliary_loss_clip": 0.01023361, + "auxiliary_loss_mlp": 0.0100262, + "balance_loss_clip": 1.00158274, + "balance_loss_mlp": 1.00318313, + "epoch": 0.7856305426123553, + "flos": 60004434407040.0, + "grad_norm": 0.7064057313548338, + "language_loss": 0.53389549, + "learning_rate": 4.628882369237346e-07, + "loss": 0.55415535, + "num_input_tokens_seen": 281900310, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20214844, + "step": 13067, + "time_per_iteration": 3.158377170562744 + }, + { + "auxiliary_loss_clip": 0.01099339, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01784921, + "balance_loss_mlp": 1.03333259, + "epoch": 0.7856906658650233, + "flos": 21868593045120.0, + "grad_norm": 1.7780609677400445, + "language_loss": 0.67590213, + "learning_rate": 4.62639095236989e-07, + "loss": 0.69719583, + "num_input_tokens_seen": 281918870, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66015625, + "step": 13068, + "time_per_iteration": 2.4604732990264893 + }, + { + "auxiliary_loss_clip": 0.01099845, + "auxiliary_loss_mlp": 0.01030575, + "balance_loss_clip": 1.01966739, + "balance_loss_mlp": 1.03644729, + "epoch": 0.7857507891176913, + "flos": 23622937205760.0, + "grad_norm": 1.9961392096486945, + "language_loss": 0.67999709, + "learning_rate": 4.6239001184712267e-07, + "loss": 0.70130128, + "num_input_tokens_seen": 281936905, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 13069, + "time_per_iteration": 2.470776319503784 + }, + { + "auxiliary_loss_clip": 0.01102413, + "auxiliary_loss_mlp": 0.01030592, + "balance_loss_clip": 1.01887965, + "balance_loss_mlp": 1.03625858, + "epoch": 0.7858109123703593, + "flos": 25520061928320.0, + "grad_norm": 1.6342789712373722, + "language_loss": 0.76993471, + "learning_rate": 4.6214098676358195e-07, + "loss": 0.79126477, + "num_input_tokens_seen": 281955625, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 13070, + "time_per_iteration": 2.4821813106536865 + }, + { + "auxiliary_loss_clip": 0.01097348, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01948428, + "balance_loss_mlp": 1.0329771, + "epoch": 0.7858710356230272, + "flos": 17457398576640.0, + "grad_norm": 1.5497406441787502, + "language_loss": 0.65501463, + "learning_rate": 4.618920199958083e-07, + "loss": 0.67629051, + "num_input_tokens_seen": 281973285, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 13071, + "time_per_iteration": 2.4392311573028564 + }, + { + "auxiliary_loss_clip": 0.01099716, + "auxiliary_loss_mlp": 0.01031474, + "balance_loss_clip": 1.02051842, + "balance_loss_mlp": 1.03337324, + "epoch": 0.7859311588756952, + "flos": 24679680353280.0, + "grad_norm": 1.7465471589650208, + "language_loss": 0.74096799, + "learning_rate": 4.616431115532442e-07, + "loss": 0.76227987, + "num_input_tokens_seen": 281991410, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13072, + "time_per_iteration": 2.4858996868133545 + }, + { + "auxiliary_loss_clip": 0.0110303, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01730585, + "balance_loss_mlp": 1.03666794, + "epoch": 0.7859912821283631, + "flos": 21799142098560.0, + "grad_norm": 2.0042152052909206, + "language_loss": 0.71074873, + "learning_rate": 4.613942614453268e-07, + "loss": 0.73207319, + "num_input_tokens_seen": 282010845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13073, + "time_per_iteration": 2.454535961151123 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.0103377, + "balance_loss_clip": 1.0218128, + "balance_loss_mlp": 1.03427434, + "epoch": 0.7860514053810311, + "flos": 20847293642880.0, + "grad_norm": 1.677206170034674, + "language_loss": 0.76719201, + "learning_rate": 4.611454696814938e-07, + "loss": 0.78852749, + "num_input_tokens_seen": 282029635, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 13074, + "time_per_iteration": 2.4688336849212646 + }, + { + "auxiliary_loss_clip": 0.01097672, + "auxiliary_loss_mlp": 0.01030189, + "balance_loss_clip": 1.01888216, + "balance_loss_mlp": 1.03478217, + "epoch": 0.786111528633699, + "flos": 24315689882880.0, + "grad_norm": 1.626029190410932, + "language_loss": 0.74981356, + "learning_rate": 4.608967362711782e-07, + "loss": 0.77109224, + "num_input_tokens_seen": 282050285, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 13075, + "time_per_iteration": 2.4762327671051025 + }, + { + "auxiliary_loss_clip": 0.01100533, + "auxiliary_loss_mlp": 0.01024172, + "balance_loss_clip": 1.01356792, + "balance_loss_mlp": 1.03545177, + "epoch": 0.7861716518863671, + "flos": 24353180703360.0, + "grad_norm": 1.7567110977428382, + "language_loss": 0.6898433, + "learning_rate": 4.6064806122381283e-07, + "loss": 0.71109033, + "num_input_tokens_seen": 282071040, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65234375, + "step": 13076, + "time_per_iteration": 2.5244879722595215 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01026682, + "balance_loss_clip": 1.01502383, + "balance_loss_mlp": 1.0347321, + "epoch": 0.786231775139035, + "flos": 14022399006720.0, + "grad_norm": 2.2025280596790395, + "language_loss": 0.80192757, + "learning_rate": 4.603994445488282e-07, + "loss": 0.8231774, + "num_input_tokens_seen": 282086610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 13077, + "time_per_iteration": 2.491744041442871 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01032144, + "balance_loss_clip": 1.01986599, + "balance_loss_mlp": 1.03536844, + "epoch": 0.786291898391703, + "flos": 33724248865920.0, + "grad_norm": 1.490748661053691, + "language_loss": 0.70515674, + "learning_rate": 4.6015088625564956e-07, + "loss": 0.72648406, + "num_input_tokens_seen": 282107440, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 13078, + "time_per_iteration": 2.555865526199341 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.01030772, + "balance_loss_clip": 1.01984668, + "balance_loss_mlp": 1.0353632, + "epoch": 0.786352021644371, + "flos": 25811476968960.0, + "grad_norm": 1.565975595125152, + "language_loss": 0.81306797, + "learning_rate": 4.599023863537039e-07, + "loss": 0.83437216, + "num_input_tokens_seen": 282127290, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13079, + "time_per_iteration": 2.49438738822937 + }, + { + "auxiliary_loss_clip": 0.01096305, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01586151, + "balance_loss_mlp": 1.03352332, + "epoch": 0.7864121448970389, + "flos": 28910818920960.0, + "grad_norm": 1.6630658201399222, + "language_loss": 0.68445063, + "learning_rate": 4.596539448524146e-07, + "loss": 0.70568061, + "num_input_tokens_seen": 282147505, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 13080, + "time_per_iteration": 2.5388312339782715 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01031122, + "balance_loss_clip": 1.01981521, + "balance_loss_mlp": 1.03463578, + "epoch": 0.7864722681497069, + "flos": 19208833735680.0, + "grad_norm": 1.6317908200800284, + "language_loss": 0.69513613, + "learning_rate": 4.594055617612016e-07, + "loss": 0.71644235, + "num_input_tokens_seen": 282166450, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13081, + "time_per_iteration": 2.470564842224121 + }, + { + "auxiliary_loss_clip": 0.01100243, + "auxiliary_loss_mlp": 0.01035049, + "balance_loss_clip": 1.02367032, + "balance_loss_mlp": 1.03415251, + "epoch": 0.7865323914023749, + "flos": 21871573873920.0, + "grad_norm": 1.6215934459039671, + "language_loss": 0.68073553, + "learning_rate": 4.591572370894838e-07, + "loss": 0.70208842, + "num_input_tokens_seen": 282186465, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 13082, + "time_per_iteration": 2.47454833984375 + }, + { + "auxiliary_loss_clip": 0.01099019, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.02276242, + "balance_loss_mlp": 1.03449476, + "epoch": 0.7865925146550429, + "flos": 25520313323520.0, + "grad_norm": 1.8334733344878817, + "language_loss": 0.66071731, + "learning_rate": 4.589089708466789e-07, + "loss": 0.68205309, + "num_input_tokens_seen": 282207180, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 13083, + "time_per_iteration": 2.4937517642974854 + }, + { + "auxiliary_loss_clip": 0.01103443, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01840496, + "balance_loss_mlp": 1.03549075, + "epoch": 0.7866526379077108, + "flos": 19097366855040.0, + "grad_norm": 2.042540926509675, + "language_loss": 0.74778521, + "learning_rate": 4.5866076304220015e-07, + "loss": 0.76912796, + "num_input_tokens_seen": 282225865, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 13084, + "time_per_iteration": 2.4672179222106934 + }, + { + "auxiliary_loss_clip": 0.01098876, + "auxiliary_loss_mlp": 0.01028809, + "balance_loss_clip": 1.01814008, + "balance_loss_mlp": 1.03493166, + "epoch": 0.7867127611603788, + "flos": 16173771171840.0, + "grad_norm": 2.928531982319309, + "language_loss": 0.70411515, + "learning_rate": 4.584126136854591e-07, + "loss": 0.72539198, + "num_input_tokens_seen": 282242895, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13085, + "time_per_iteration": 2.427304267883301 + }, + { + "auxiliary_loss_clip": 0.01103417, + "auxiliary_loss_mlp": 0.01028061, + "balance_loss_clip": 1.01565087, + "balance_loss_mlp": 1.03474259, + "epoch": 0.7867728844130467, + "flos": 20773640805120.0, + "grad_norm": 1.8136957772733184, + "language_loss": 0.72376126, + "learning_rate": 4.5816452278586617e-07, + "loss": 0.74507606, + "num_input_tokens_seen": 282260425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 13086, + "time_per_iteration": 2.480523109436035 + }, + { + "auxiliary_loss_clip": 0.01097734, + "auxiliary_loss_mlp": 0.0102774, + "balance_loss_clip": 1.01654005, + "balance_loss_mlp": 1.03270912, + "epoch": 0.7868330076657147, + "flos": 21760106993280.0, + "grad_norm": 1.9014411249537477, + "language_loss": 0.74928933, + "learning_rate": 4.5791649035282965e-07, + "loss": 0.77054405, + "num_input_tokens_seen": 282279335, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13087, + "time_per_iteration": 2.469919204711914 + }, + { + "auxiliary_loss_clip": 0.01097848, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.02146316, + "balance_loss_mlp": 1.03391075, + "epoch": 0.7868931309183826, + "flos": 25700692446720.0, + "grad_norm": 3.8678035141678913, + "language_loss": 0.71336555, + "learning_rate": 4.5766851639575456e-07, + "loss": 0.73466659, + "num_input_tokens_seen": 282299905, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13088, + "time_per_iteration": 2.5597689151763916 + }, + { + "auxiliary_loss_clip": 0.01023649, + "auxiliary_loss_mlp": 0.01006009, + "balance_loss_clip": 1.00502574, + "balance_loss_mlp": 1.00346375, + "epoch": 0.7869532541710507, + "flos": 64644883430400.0, + "grad_norm": 0.6844618253743016, + "language_loss": 0.55505019, + "learning_rate": 4.574206009240431e-07, + "loss": 0.57534683, + "num_input_tokens_seen": 282367620, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 13089, + "time_per_iteration": 3.174372673034668 + }, + { + "auxiliary_loss_clip": 0.01023353, + "auxiliary_loss_mlp": 0.01001352, + "balance_loss_clip": 1.00036299, + "balance_loss_mlp": 1.00316393, + "epoch": 0.7870133774237186, + "flos": 67453600440960.0, + "grad_norm": 0.7253731939477448, + "language_loss": 0.49957851, + "learning_rate": 4.571727439470976e-07, + "loss": 0.51982558, + "num_input_tokens_seen": 282435695, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20214844, + "step": 13090, + "time_per_iteration": 3.1464152336120605 + }, + { + "auxiliary_loss_clip": 0.01097486, + "auxiliary_loss_mlp": 0.01026378, + "balance_loss_clip": 1.01597738, + "balance_loss_mlp": 1.03442216, + "epoch": 0.7870735006763866, + "flos": 26068310190720.0, + "grad_norm": 2.0009020702147624, + "language_loss": 0.83693981, + "learning_rate": 4.5692494547431583e-07, + "loss": 0.8581785, + "num_input_tokens_seen": 282456025, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 13091, + "time_per_iteration": 2.5320253372192383 + }, + { + "auxiliary_loss_clip": 0.01023736, + "auxiliary_loss_mlp": 0.01003239, + "balance_loss_clip": 1.00224388, + "balance_loss_mlp": 1.00338745, + "epoch": 0.7871336239290546, + "flos": 70289572896000.0, + "grad_norm": 0.7117957030218485, + "language_loss": 0.63994247, + "learning_rate": 4.566772055150947e-07, + "loss": 0.66021222, + "num_input_tokens_seen": 282520995, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.203125, + "step": 13092, + "time_per_iteration": 3.083390474319458 + }, + { + "auxiliary_loss_clip": 0.01102492, + "auxiliary_loss_mlp": 0.01031507, + "balance_loss_clip": 1.01996171, + "balance_loss_mlp": 1.03640008, + "epoch": 0.7871937471817225, + "flos": 15778574760960.0, + "grad_norm": 3.478229156670452, + "language_loss": 0.79910231, + "learning_rate": 4.564295240788285e-07, + "loss": 0.82044232, + "num_input_tokens_seen": 282539355, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13093, + "time_per_iteration": 2.4508519172668457 + }, + { + "auxiliary_loss_clip": 0.01097319, + "auxiliary_loss_mlp": 0.01027817, + "balance_loss_clip": 1.01696348, + "balance_loss_mlp": 1.03387761, + "epoch": 0.7872538704343905, + "flos": 20485242506880.0, + "grad_norm": 2.289206273735693, + "language_loss": 0.7536335, + "learning_rate": 4.561819011749106e-07, + "loss": 0.77488482, + "num_input_tokens_seen": 282555735, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13094, + "time_per_iteration": 2.423264980316162 + }, + { + "auxiliary_loss_clip": 0.01101607, + "auxiliary_loss_mlp": 0.01036216, + "balance_loss_clip": 1.02510548, + "balance_loss_mlp": 1.03562438, + "epoch": 0.7873139936870585, + "flos": 25082670015360.0, + "grad_norm": 1.6408632577371567, + "language_loss": 0.79475707, + "learning_rate": 4.5593433681272884e-07, + "loss": 0.81613529, + "num_input_tokens_seen": 282574550, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 13095, + "time_per_iteration": 3.9224746227264404 + }, + { + "auxiliary_loss_clip": 0.01099901, + "auxiliary_loss_mlp": 0.01030817, + "balance_loss_clip": 1.01915216, + "balance_loss_mlp": 1.03335738, + "epoch": 0.7873741169397265, + "flos": 30883176679680.0, + "grad_norm": 2.020167585783757, + "language_loss": 0.67747319, + "learning_rate": 4.556868310016715e-07, + "loss": 0.69878036, + "num_input_tokens_seen": 282596520, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13096, + "time_per_iteration": 4.006121635437012 + }, + { + "auxiliary_loss_clip": 0.01093799, + "auxiliary_loss_mlp": 0.01025076, + "balance_loss_clip": 1.01535416, + "balance_loss_mlp": 1.03172147, + "epoch": 0.7874342401923944, + "flos": 46791962242560.0, + "grad_norm": 1.5298468077201632, + "language_loss": 0.70352769, + "learning_rate": 4.55439383751125e-07, + "loss": 0.72471642, + "num_input_tokens_seen": 282620560, + "router_z_loss_clip": 0.09716797, + "router_z_loss_mlp": 0.625, + "step": 13097, + "time_per_iteration": 4.101962327957153 + }, + { + "auxiliary_loss_clip": 0.01102049, + "auxiliary_loss_mlp": 0.01031353, + "balance_loss_clip": 1.02018285, + "balance_loss_mlp": 1.0361073, + "epoch": 0.7874943634450624, + "flos": 23584548545280.0, + "grad_norm": 1.6655151068519558, + "language_loss": 0.80427504, + "learning_rate": 4.5519199507047126e-07, + "loss": 0.82560897, + "num_input_tokens_seen": 282639830, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13098, + "time_per_iteration": 2.46547532081604 + }, + { + "auxiliary_loss_clip": 0.01098922, + "auxiliary_loss_mlp": 0.01029007, + "balance_loss_clip": 1.01834953, + "balance_loss_mlp": 1.03521609, + "epoch": 0.7875544866977303, + "flos": 20191169859840.0, + "grad_norm": 1.645167890556634, + "language_loss": 0.74275064, + "learning_rate": 4.5494466496909177e-07, + "loss": 0.76402998, + "num_input_tokens_seen": 282660130, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13099, + "time_per_iteration": 2.485710620880127 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.01486731, + "balance_loss_mlp": 1.03532815, + "epoch": 0.7876146099503983, + "flos": 22602571557120.0, + "grad_norm": 1.60096052488611, + "language_loss": 0.78410721, + "learning_rate": 4.5469739345636603e-07, + "loss": 0.80536783, + "num_input_tokens_seen": 282681125, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13100, + "time_per_iteration": 4.035876750946045 + }, + { + "auxiliary_loss_clip": 0.01106007, + "auxiliary_loss_mlp": 0.01028363, + "balance_loss_clip": 1.01570272, + "balance_loss_mlp": 1.03587461, + "epoch": 0.7876747332030662, + "flos": 10705833555840.0, + "grad_norm": 2.2959557681189895, + "language_loss": 0.66067588, + "learning_rate": 4.5445018054167007e-07, + "loss": 0.68201947, + "num_input_tokens_seen": 282696690, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 13101, + "time_per_iteration": 2.4304044246673584 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01027717, + "balance_loss_clip": 1.01638031, + "balance_loss_mlp": 1.03366089, + "epoch": 0.7877348564557343, + "flos": 38399315621760.0, + "grad_norm": 1.576742328174997, + "language_loss": 0.7767005, + "learning_rate": 4.5420302623437745e-07, + "loss": 0.79796594, + "num_input_tokens_seen": 282721210, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13102, + "time_per_iteration": 2.587104320526123 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.01033598, + "balance_loss_clip": 1.02300668, + "balance_loss_mlp": 1.03498983, + "epoch": 0.7877949797084022, + "flos": 18329524796160.0, + "grad_norm": 2.03801984289661, + "language_loss": 0.82200575, + "learning_rate": 4.5395593054386093e-07, + "loss": 0.84333879, + "num_input_tokens_seen": 282738505, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 13103, + "time_per_iteration": 2.4504380226135254 + }, + { + "auxiliary_loss_clip": 0.01102423, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02108872, + "balance_loss_mlp": 1.03538537, + "epoch": 0.7878551029610702, + "flos": 25806736373760.0, + "grad_norm": 1.9382935639553287, + "language_loss": 0.80800354, + "learning_rate": 4.537088934794913e-07, + "loss": 0.82936251, + "num_input_tokens_seen": 282756895, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13104, + "time_per_iteration": 2.4761226177215576 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01032376, + "balance_loss_clip": 1.02072978, + "balance_loss_mlp": 1.03486192, + "epoch": 0.7879152262137382, + "flos": 22342685679360.0, + "grad_norm": 1.5580110951181336, + "language_loss": 0.74400711, + "learning_rate": 4.5346191505063515e-07, + "loss": 0.76534271, + "num_input_tokens_seen": 282774955, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13105, + "time_per_iteration": 2.458893060684204 + }, + { + "auxiliary_loss_clip": 0.01102329, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.0220865, + "balance_loss_mlp": 1.03494358, + "epoch": 0.7879753494664061, + "flos": 24785329230720.0, + "grad_norm": 1.6914912151610795, + "language_loss": 0.75718057, + "learning_rate": 4.5321499526665776e-07, + "loss": 0.77854228, + "num_input_tokens_seen": 282793165, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 13106, + "time_per_iteration": 2.4740750789642334 + }, + { + "auxiliary_loss_clip": 0.01101506, + "auxiliary_loss_mlp": 0.01032404, + "balance_loss_clip": 1.02129924, + "balance_loss_mlp": 1.03471053, + "epoch": 0.7880354727190741, + "flos": 16909078487040.0, + "grad_norm": 2.2970900789620767, + "language_loss": 0.73269242, + "learning_rate": 4.5296813413692337e-07, + "loss": 0.75403154, + "num_input_tokens_seen": 282809820, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 13107, + "time_per_iteration": 2.465049982070923 + }, + { + "auxiliary_loss_clip": 0.01098914, + "auxiliary_loss_mlp": 0.0103322, + "balance_loss_clip": 1.02140641, + "balance_loss_mlp": 1.03424203, + "epoch": 0.7880955959717421, + "flos": 22230500526720.0, + "grad_norm": 1.8872299288056482, + "language_loss": 0.73182052, + "learning_rate": 4.5272133167079165e-07, + "loss": 0.75314188, + "num_input_tokens_seen": 282828600, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13108, + "time_per_iteration": 2.4711079597473145 + }, + { + "auxiliary_loss_clip": 0.01023267, + "auxiliary_loss_mlp": 0.01002041, + "balance_loss_clip": 1.00098598, + "balance_loss_mlp": 1.00313234, + "epoch": 0.7881557192244101, + "flos": 69183200131200.0, + "grad_norm": 1.661709536618796, + "language_loss": 0.60381085, + "learning_rate": 4.5247458787762216e-07, + "loss": 0.62406397, + "num_input_tokens_seen": 282882775, + "router_z_loss_clip": 0.01055908, + "router_z_loss_mlp": 0.20117188, + "step": 13109, + "time_per_iteration": 3.0089924335479736 + }, + { + "auxiliary_loss_clip": 0.01097142, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.01739979, + "balance_loss_mlp": 1.03491497, + "epoch": 0.788215842477078, + "flos": 24935436167040.0, + "grad_norm": 1.5824275736461375, + "language_loss": 0.71883583, + "learning_rate": 4.5222790276677126e-07, + "loss": 0.7400893, + "num_input_tokens_seen": 282902680, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.62109375, + "step": 13110, + "time_per_iteration": 2.465576171875 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.01026783, + "balance_loss_clip": 1.01631093, + "balance_loss_mlp": 1.03485966, + "epoch": 0.788275965729746, + "flos": 26106483369600.0, + "grad_norm": 1.3860317758339384, + "language_loss": 0.75074577, + "learning_rate": 4.5198127634759455e-07, + "loss": 0.77199543, + "num_input_tokens_seen": 282923625, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 13111, + "time_per_iteration": 2.4993157386779785 + }, + { + "auxiliary_loss_clip": 0.01098161, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.02001154, + "balance_loss_mlp": 1.03351355, + "epoch": 0.7883360889824139, + "flos": 21214803646080.0, + "grad_norm": 1.94564104551391, + "language_loss": 0.61333418, + "learning_rate": 4.5173470862944206e-07, + "loss": 0.63463187, + "num_input_tokens_seen": 282941955, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13112, + "time_per_iteration": 2.43581485748291 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01515722, + "balance_loss_mlp": 1.0338614, + "epoch": 0.7883962122350819, + "flos": 21142551438720.0, + "grad_norm": 1.7382192958818077, + "language_loss": 0.67246455, + "learning_rate": 4.514881996216644e-07, + "loss": 0.69373184, + "num_input_tokens_seen": 282961280, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13113, + "time_per_iteration": 2.4511425495147705 + }, + { + "auxiliary_loss_clip": 0.01098431, + "auxiliary_loss_mlp": 0.01027803, + "balance_loss_clip": 1.0168004, + "balance_loss_mlp": 1.03448272, + "epoch": 0.7884563354877498, + "flos": 15302901928320.0, + "grad_norm": 12.027787303417453, + "language_loss": 0.58199584, + "learning_rate": 4.5124174933361e-07, + "loss": 0.60325825, + "num_input_tokens_seen": 282978210, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13114, + "time_per_iteration": 2.3941895961761475 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01028509, + "balance_loss_clip": 1.01636708, + "balance_loss_mlp": 1.03487444, + "epoch": 0.7885164587404179, + "flos": 24388301226240.0, + "grad_norm": 1.6461122480026786, + "language_loss": 0.66887224, + "learning_rate": 4.5099535777462306e-07, + "loss": 0.69017321, + "num_input_tokens_seen": 282998845, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 13115, + "time_per_iteration": 2.4768731594085693 + }, + { + "auxiliary_loss_clip": 0.01099861, + "auxiliary_loss_mlp": 0.01025915, + "balance_loss_clip": 1.01442361, + "balance_loss_mlp": 1.03510892, + "epoch": 0.7885765819930858, + "flos": 14385886686720.0, + "grad_norm": 1.909649629635062, + "language_loss": 0.8859247, + "learning_rate": 4.50749024954048e-07, + "loss": 0.90718246, + "num_input_tokens_seen": 283015200, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13116, + "time_per_iteration": 2.4047675132751465 + }, + { + "auxiliary_loss_clip": 0.01106955, + "auxiliary_loss_mlp": 0.01031924, + "balance_loss_clip": 1.01909757, + "balance_loss_mlp": 1.0356214, + "epoch": 0.7886367052457538, + "flos": 18259930195200.0, + "grad_norm": 1.7003920490690876, + "language_loss": 0.72708535, + "learning_rate": 4.505027508812245e-07, + "loss": 0.74847412, + "num_input_tokens_seen": 283033680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.7109375, + "step": 13117, + "time_per_iteration": 2.4341704845428467 + }, + { + "auxiliary_loss_clip": 0.01097792, + "auxiliary_loss_mlp": 0.01023058, + "balance_loss_clip": 1.01247823, + "balance_loss_mlp": 1.03483558, + "epoch": 0.7886968284984217, + "flos": 15305092657920.0, + "grad_norm": 1.4826682639516906, + "language_loss": 0.79875678, + "learning_rate": 4.502565355654926e-07, + "loss": 0.81996524, + "num_input_tokens_seen": 283050620, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 13118, + "time_per_iteration": 2.394805431365967 + }, + { + "auxiliary_loss_clip": 0.01099322, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.01643777, + "balance_loss_mlp": 1.03507447, + "epoch": 0.7887569517510897, + "flos": 21215450090880.0, + "grad_norm": 1.7945164673922278, + "language_loss": 0.73091543, + "learning_rate": 4.500103790161878e-07, + "loss": 0.75218379, + "num_input_tokens_seen": 283070215, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13119, + "time_per_iteration": 2.460057258605957 + }, + { + "auxiliary_loss_clip": 0.01101447, + "auxiliary_loss_mlp": 0.01023623, + "balance_loss_clip": 1.01194072, + "balance_loss_mlp": 1.03509176, + "epoch": 0.7888170750037578, + "flos": 22711237176960.0, + "grad_norm": 1.261657596478895, + "language_loss": 0.71529341, + "learning_rate": 4.4976428124264454e-07, + "loss": 0.73654413, + "num_input_tokens_seen": 283091485, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13120, + "time_per_iteration": 2.455064058303833 + }, + { + "auxiliary_loss_clip": 0.01100545, + "auxiliary_loss_mlp": 0.01031312, + "balance_loss_clip": 1.01978469, + "balance_loss_mlp": 1.03517127, + "epoch": 0.7888771982564257, + "flos": 36429148592640.0, + "grad_norm": 1.4332103532117941, + "language_loss": 0.78814548, + "learning_rate": 4.4951824225419564e-07, + "loss": 0.8094641, + "num_input_tokens_seen": 283115040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13121, + "time_per_iteration": 2.599400281906128 + }, + { + "auxiliary_loss_clip": 0.01098409, + "auxiliary_loss_mlp": 0.01026067, + "balance_loss_clip": 1.01478994, + "balance_loss_mlp": 1.03450656, + "epoch": 0.7889373215090937, + "flos": 27309993488640.0, + "grad_norm": 1.3967660183368626, + "language_loss": 0.80094564, + "learning_rate": 4.4927226206017057e-07, + "loss": 0.8221904, + "num_input_tokens_seen": 283136925, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13122, + "time_per_iteration": 2.4992713928222656 + }, + { + "auxiliary_loss_clip": 0.0110103, + "auxiliary_loss_mlp": 0.01022634, + "balance_loss_clip": 1.01157165, + "balance_loss_mlp": 1.03481627, + "epoch": 0.7889974447617616, + "flos": 19829010983040.0, + "grad_norm": 2.145985677381676, + "language_loss": 0.77920961, + "learning_rate": 4.4902634066989597e-07, + "loss": 0.80044621, + "num_input_tokens_seen": 283155725, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13123, + "time_per_iteration": 2.4735960960388184 + }, + { + "auxiliary_loss_clip": 0.01104198, + "auxiliary_loss_mlp": 0.01029642, + "balance_loss_clip": 1.01790643, + "balance_loss_mlp": 1.0362196, + "epoch": 0.7890575680144296, + "flos": 17271201450240.0, + "grad_norm": 1.856299947344871, + "language_loss": 0.6726073, + "learning_rate": 4.487804780926985e-07, + "loss": 0.69394577, + "num_input_tokens_seen": 283173845, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13124, + "time_per_iteration": 2.4079813957214355 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01025655, + "balance_loss_clip": 1.01391327, + "balance_loss_mlp": 1.03546476, + "epoch": 0.7891176912670975, + "flos": 27600151553280.0, + "grad_norm": 2.605711353354914, + "language_loss": 0.72957736, + "learning_rate": 4.4853467433790036e-07, + "loss": 0.75086713, + "num_input_tokens_seen": 283191985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13125, + "time_per_iteration": 2.5052480697631836 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.0155673, + "balance_loss_mlp": 1.03235054, + "epoch": 0.7891778145197655, + "flos": 22711668140160.0, + "grad_norm": 2.154516730399549, + "language_loss": 0.72528452, + "learning_rate": 4.4828892941482267e-07, + "loss": 0.74655998, + "num_input_tokens_seen": 283210855, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 13126, + "time_per_iteration": 2.4527993202209473 + }, + { + "auxiliary_loss_clip": 0.01102896, + "auxiliary_loss_mlp": 0.01026431, + "balance_loss_clip": 1.01474881, + "balance_loss_mlp": 1.03575099, + "epoch": 0.7892379377724335, + "flos": 17310775259520.0, + "grad_norm": 2.0791406277804567, + "language_loss": 0.76886559, + "learning_rate": 4.480432433327845e-07, + "loss": 0.79015887, + "num_input_tokens_seen": 283229665, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 13127, + "time_per_iteration": 2.4405977725982666 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01026719, + "balance_loss_clip": 1.0155077, + "balance_loss_mlp": 1.03493667, + "epoch": 0.7892980610251015, + "flos": 25775674087680.0, + "grad_norm": 1.7461753139665992, + "language_loss": 0.85763645, + "learning_rate": 4.47797616101103e-07, + "loss": 0.87888473, + "num_input_tokens_seen": 283248615, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 13128, + "time_per_iteration": 2.474844455718994 + }, + { + "auxiliary_loss_clip": 0.01098818, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.02045906, + "balance_loss_mlp": 1.03425086, + "epoch": 0.7893581842777694, + "flos": 21579943351680.0, + "grad_norm": 2.0767433694769175, + "language_loss": 0.68800604, + "learning_rate": 4.475520477290904e-07, + "loss": 0.70930469, + "num_input_tokens_seen": 283267135, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 13129, + "time_per_iteration": 2.5359485149383545 + }, + { + "auxiliary_loss_clip": 0.01022991, + "auxiliary_loss_mlp": 0.01001965, + "balance_loss_clip": 1.00090396, + "balance_loss_mlp": 1.00285482, + "epoch": 0.7894183075304374, + "flos": 69016468176000.0, + "grad_norm": 0.7130558400515205, + "language_loss": 0.61589611, + "learning_rate": 4.473065382260597e-07, + "loss": 0.63614571, + "num_input_tokens_seen": 283328940, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 13130, + "time_per_iteration": 3.0489916801452637 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01027204, + "balance_loss_clip": 1.01583779, + "balance_loss_mlp": 1.03717756, + "epoch": 0.7894784307831053, + "flos": 24243258107520.0, + "grad_norm": 1.6182422451860332, + "language_loss": 0.73774695, + "learning_rate": 4.4706108760132124e-07, + "loss": 0.7590515, + "num_input_tokens_seen": 283350000, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13131, + "time_per_iteration": 2.475581169128418 + }, + { + "auxiliary_loss_clip": 0.01108004, + "auxiliary_loss_mlp": 0.01025009, + "balance_loss_clip": 1.01216388, + "balance_loss_mlp": 1.034796, + "epoch": 0.7895385540357733, + "flos": 20266546550400.0, + "grad_norm": 2.199372765286003, + "language_loss": 0.68987596, + "learning_rate": 4.4681569586418153e-07, + "loss": 0.71120608, + "num_input_tokens_seen": 283368020, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.734375, + "step": 13132, + "time_per_iteration": 2.4295406341552734 + }, + { + "auxiliary_loss_clip": 0.01102436, + "auxiliary_loss_mlp": 0.01033148, + "balance_loss_clip": 1.02129269, + "balance_loss_mlp": 1.03545117, + "epoch": 0.7895986772884414, + "flos": 20996574566400.0, + "grad_norm": 2.1121460507768406, + "language_loss": 0.62110436, + "learning_rate": 4.465703630239468e-07, + "loss": 0.64246017, + "num_input_tokens_seen": 283387030, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13133, + "time_per_iteration": 2.483172655105591 + }, + { + "auxiliary_loss_clip": 0.01105396, + "auxiliary_loss_mlp": 0.0103757, + "balance_loss_clip": 1.02418268, + "balance_loss_mlp": 1.03652048, + "epoch": 0.7896588005411093, + "flos": 18657999694080.0, + "grad_norm": 2.3671306381438817, + "language_loss": 0.79635763, + "learning_rate": 4.463250890899195e-07, + "loss": 0.81778735, + "num_input_tokens_seen": 283402090, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 13134, + "time_per_iteration": 2.4047813415527344 + }, + { + "auxiliary_loss_clip": 0.01100888, + "auxiliary_loss_mlp": 0.01027831, + "balance_loss_clip": 1.01651824, + "balance_loss_mlp": 1.03489256, + "epoch": 0.7897189237937773, + "flos": 18405907067520.0, + "grad_norm": 1.729726812184161, + "language_loss": 0.79917061, + "learning_rate": 4.460798740713998e-07, + "loss": 0.82045782, + "num_input_tokens_seen": 283421035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 13135, + "time_per_iteration": 2.4462645053863525 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01028879, + "balance_loss_clip": 1.01702976, + "balance_loss_mlp": 1.03459549, + "epoch": 0.7897790470464452, + "flos": 23731602825600.0, + "grad_norm": 1.7066786377957706, + "language_loss": 0.72467506, + "learning_rate": 4.4583471797768733e-07, + "loss": 0.74595881, + "num_input_tokens_seen": 283441830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13136, + "time_per_iteration": 3.8541600704193115 + }, + { + "auxiliary_loss_clip": 0.01107278, + "auxiliary_loss_mlp": 0.01033045, + "balance_loss_clip": 1.02079642, + "balance_loss_mlp": 1.03614569, + "epoch": 0.7898391702991132, + "flos": 15918949111680.0, + "grad_norm": 1.8157038606560463, + "language_loss": 0.70418733, + "learning_rate": 4.455896208180778e-07, + "loss": 0.72559059, + "num_input_tokens_seen": 283459540, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.7109375, + "step": 13137, + "time_per_iteration": 2.451396942138672 + }, + { + "auxiliary_loss_clip": 0.01099987, + "auxiliary_loss_mlp": 0.01033834, + "balance_loss_clip": 1.02095389, + "balance_loss_mlp": 1.03527665, + "epoch": 0.7898992935517811, + "flos": 19829046896640.0, + "grad_norm": 1.748688408488967, + "language_loss": 0.74126804, + "learning_rate": 4.4534458260186645e-07, + "loss": 0.7626062, + "num_input_tokens_seen": 283478790, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6484375, + "step": 13138, + "time_per_iteration": 3.8486387729644775 + }, + { + "auxiliary_loss_clip": 0.01099719, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01726758, + "balance_loss_mlp": 1.03461611, + "epoch": 0.7899594168044491, + "flos": 16216253982720.0, + "grad_norm": 2.0347678051570046, + "language_loss": 0.68777812, + "learning_rate": 4.4509960333834426e-07, + "loss": 0.70906031, + "num_input_tokens_seen": 283495720, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13139, + "time_per_iteration": 3.8628947734832764 + }, + { + "auxiliary_loss_clip": 0.01022998, + "auxiliary_loss_mlp": 0.01001993, + "balance_loss_clip": 1.00090218, + "balance_loss_mlp": 1.00276661, + "epoch": 0.790019540057117, + "flos": 68331005959680.0, + "grad_norm": 0.8639772352746394, + "language_loss": 0.60299456, + "learning_rate": 4.448546830368003e-07, + "loss": 0.62324452, + "num_input_tokens_seen": 283558795, + "router_z_loss_clip": 0.01092529, + "router_z_loss_mlp": 0.20214844, + "step": 13140, + "time_per_iteration": 3.12382435798645 + }, + { + "auxiliary_loss_clip": 0.01101176, + "auxiliary_loss_mlp": 0.01031932, + "balance_loss_clip": 1.01973701, + "balance_loss_mlp": 1.03487992, + "epoch": 0.7900796633097851, + "flos": 30332773601280.0, + "grad_norm": 1.6042755472834633, + "language_loss": 0.7596916, + "learning_rate": 4.4460982170652304e-07, + "loss": 0.78102267, + "num_input_tokens_seen": 283579305, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13141, + "time_per_iteration": 2.5595388412475586 + }, + { + "auxiliary_loss_clip": 0.0110272, + "auxiliary_loss_mlp": 0.01032935, + "balance_loss_clip": 1.02109766, + "balance_loss_mlp": 1.0354681, + "epoch": 0.790139786562453, + "flos": 22126790983680.0, + "grad_norm": 2.061867815111243, + "language_loss": 0.68504715, + "learning_rate": 4.4436501935679694e-07, + "loss": 0.70640367, + "num_input_tokens_seen": 283597840, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13142, + "time_per_iteration": 3.9543938636779785 + }, + { + "auxiliary_loss_clip": 0.0102319, + "auxiliary_loss_mlp": 0.01000022, + "balance_loss_clip": 0.99900836, + "balance_loss_mlp": 1.00304079, + "epoch": 0.790199909815121, + "flos": 58207284213120.0, + "grad_norm": 0.8198553177673825, + "language_loss": 0.60004789, + "learning_rate": 4.441202759969049e-07, + "loss": 0.62028003, + "num_input_tokens_seen": 283647950, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20117188, + "step": 13143, + "time_per_iteration": 2.863976001739502 + }, + { + "auxiliary_loss_clip": 0.01103929, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.01715136, + "balance_loss_mlp": 1.03638124, + "epoch": 0.7902600330677889, + "flos": 34533316759680.0, + "grad_norm": 1.589507938557268, + "language_loss": 0.74556917, + "learning_rate": 4.4387559163612875e-07, + "loss": 0.76690018, + "num_input_tokens_seen": 283670645, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13144, + "time_per_iteration": 2.5839059352874756 + }, + { + "auxiliary_loss_clip": 0.01103839, + "auxiliary_loss_mlp": 0.01032647, + "balance_loss_clip": 1.02020764, + "balance_loss_mlp": 1.03596044, + "epoch": 0.7903201563204569, + "flos": 22346384780160.0, + "grad_norm": 1.7274125688221094, + "language_loss": 0.83230376, + "learning_rate": 4.4363096628374605e-07, + "loss": 0.85366857, + "num_input_tokens_seen": 283688830, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13145, + "time_per_iteration": 2.468961000442505 + }, + { + "auxiliary_loss_clip": 0.01094904, + "auxiliary_loss_mlp": 0.01029429, + "balance_loss_clip": 1.01889074, + "balance_loss_mlp": 1.03252196, + "epoch": 0.790380279573125, + "flos": 22053533195520.0, + "grad_norm": 1.7706663213688858, + "language_loss": 0.72783786, + "learning_rate": 4.4338639994903235e-07, + "loss": 0.74908125, + "num_input_tokens_seen": 283708625, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 13146, + "time_per_iteration": 2.483905076980591 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.01676106, + "balance_loss_mlp": 1.03308654, + "epoch": 0.7904404028257929, + "flos": 20302600826880.0, + "grad_norm": 1.9329251437189798, + "language_loss": 0.75868392, + "learning_rate": 4.4314189264126246e-07, + "loss": 0.77997577, + "num_input_tokens_seen": 283725710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6796875, + "step": 13147, + "time_per_iteration": 2.444445848464966 + }, + { + "auxiliary_loss_clip": 0.01098948, + "auxiliary_loss_mlp": 0.0103655, + "balance_loss_clip": 1.02420568, + "balance_loss_mlp": 1.03389215, + "epoch": 0.7905005260784609, + "flos": 20008923229440.0, + "grad_norm": 1.8432803916429288, + "language_loss": 0.71830833, + "learning_rate": 4.428974443697087e-07, + "loss": 0.7396633, + "num_input_tokens_seen": 283744150, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 13148, + "time_per_iteration": 2.4763596057891846 + }, + { + "auxiliary_loss_clip": 0.01099876, + "auxiliary_loss_mlp": 0.01029272, + "balance_loss_clip": 1.0174942, + "balance_loss_mlp": 1.03280914, + "epoch": 0.7905606493311288, + "flos": 26905926418560.0, + "grad_norm": 2.2200316445748536, + "language_loss": 0.71857107, + "learning_rate": 4.4265305514363913e-07, + "loss": 0.73986256, + "num_input_tokens_seen": 283764170, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 13149, + "time_per_iteration": 2.5340046882629395 + }, + { + "auxiliary_loss_clip": 0.01103652, + "auxiliary_loss_mlp": 0.01030142, + "balance_loss_clip": 1.01735091, + "balance_loss_mlp": 1.03590095, + "epoch": 0.7906207725837968, + "flos": 23696230907520.0, + "grad_norm": 2.727710817995862, + "language_loss": 0.65459621, + "learning_rate": 4.424087249723225e-07, + "loss": 0.67593414, + "num_input_tokens_seen": 283784305, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 13150, + "time_per_iteration": 2.4871621131896973 + }, + { + "auxiliary_loss_clip": 0.01098617, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01808667, + "balance_loss_mlp": 1.03340101, + "epoch": 0.7906808958364647, + "flos": 20848837927680.0, + "grad_norm": 2.2316729864145035, + "language_loss": 0.69869459, + "learning_rate": 4.421644538650231e-07, + "loss": 0.71997708, + "num_input_tokens_seen": 283804040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13151, + "time_per_iteration": 2.429283857345581 + }, + { + "auxiliary_loss_clip": 0.01102592, + "auxiliary_loss_mlp": 0.01035884, + "balance_loss_clip": 1.02360559, + "balance_loss_mlp": 1.03463364, + "epoch": 0.7907410190891327, + "flos": 40735196974080.0, + "grad_norm": 1.3770531341531196, + "language_loss": 0.70089221, + "learning_rate": 4.4192024183100306e-07, + "loss": 0.72227693, + "num_input_tokens_seen": 283827120, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13152, + "time_per_iteration": 2.6216795444488525 + }, + { + "auxiliary_loss_clip": 0.01099523, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.01488543, + "balance_loss_mlp": 1.03391027, + "epoch": 0.7908011423418007, + "flos": 13261165050240.0, + "grad_norm": 1.7997753431488441, + "language_loss": 0.72821844, + "learning_rate": 4.4167608887952367e-07, + "loss": 0.74947822, + "num_input_tokens_seen": 283844820, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13153, + "time_per_iteration": 2.432175636291504 + }, + { + "auxiliary_loss_clip": 0.01098332, + "auxiliary_loss_mlp": 0.01024691, + "balance_loss_clip": 1.01356864, + "balance_loss_mlp": 1.03256023, + "epoch": 0.7908612655944687, + "flos": 19754747614080.0, + "grad_norm": 1.8282420637025174, + "language_loss": 0.78883809, + "learning_rate": 4.4143199501984306e-07, + "loss": 0.81006831, + "num_input_tokens_seen": 283862870, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13154, + "time_per_iteration": 2.466029167175293 + }, + { + "auxiliary_loss_clip": 0.01105447, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01705313, + "balance_loss_mlp": 1.03479743, + "epoch": 0.7909213888471366, + "flos": 21287738211840.0, + "grad_norm": 1.8238344908904138, + "language_loss": 0.70285016, + "learning_rate": 4.411879602612185e-07, + "loss": 0.72420764, + "num_input_tokens_seen": 283882405, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.70703125, + "step": 13155, + "time_per_iteration": 2.446547746658325 + }, + { + "auxiliary_loss_clip": 0.01100095, + "auxiliary_loss_mlp": 0.01027373, + "balance_loss_clip": 1.01582754, + "balance_loss_mlp": 1.03381193, + "epoch": 0.7909815120998046, + "flos": 22528882805760.0, + "grad_norm": 2.6081718094801003, + "language_loss": 0.7679953, + "learning_rate": 4.4094398461290174e-07, + "loss": 0.78926998, + "num_input_tokens_seen": 283902070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 13156, + "time_per_iteration": 2.475921154022217 + }, + { + "auxiliary_loss_clip": 0.01099115, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01664567, + "balance_loss_mlp": 1.03353715, + "epoch": 0.7910416353524725, + "flos": 26727702111360.0, + "grad_norm": 1.636282955731654, + "language_loss": 0.65013611, + "learning_rate": 4.4070006808414526e-07, + "loss": 0.67141205, + "num_input_tokens_seen": 283924100, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13157, + "time_per_iteration": 2.504150867462158 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.01937079, + "balance_loss_mlp": 1.03502417, + "epoch": 0.7911017586051405, + "flos": 24644847139200.0, + "grad_norm": 1.6940743634270539, + "language_loss": 0.73872387, + "learning_rate": 4.4045621068419894e-07, + "loss": 0.76006109, + "num_input_tokens_seen": 283944955, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 13158, + "time_per_iteration": 2.4976253509521484 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.01028906, + "balance_loss_clip": 1.018332, + "balance_loss_mlp": 1.03334785, + "epoch": 0.7911618818578086, + "flos": 17565489578880.0, + "grad_norm": 1.9043976356667784, + "language_loss": 0.6686908, + "learning_rate": 4.40212412422309e-07, + "loss": 0.68994868, + "num_input_tokens_seen": 283963125, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 13159, + "time_per_iteration": 2.4071156978607178 + }, + { + "auxiliary_loss_clip": 0.01098959, + "auxiliary_loss_mlp": 0.01028336, + "balance_loss_clip": 1.01733327, + "balance_loss_mlp": 1.03454971, + "epoch": 0.7912220051104765, + "flos": 16721660298240.0, + "grad_norm": 1.8560150384461531, + "language_loss": 0.67281532, + "learning_rate": 4.399686733077206e-07, + "loss": 0.69408834, + "num_input_tokens_seen": 283982850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13160, + "time_per_iteration": 2.4779374599456787 + }, + { + "auxiliary_loss_clip": 0.01092943, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.01608515, + "balance_loss_mlp": 1.03147316, + "epoch": 0.7912821283631445, + "flos": 13698736531200.0, + "grad_norm": 1.960219382824367, + "language_loss": 0.72932816, + "learning_rate": 4.3972499334967694e-07, + "loss": 0.75051731, + "num_input_tokens_seen": 283998275, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.61328125, + "step": 13161, + "time_per_iteration": 2.393747091293335 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01512957, + "balance_loss_mlp": 1.03505635, + "epoch": 0.7913422516158124, + "flos": 23769021818880.0, + "grad_norm": 2.030740934223021, + "language_loss": 0.73477876, + "learning_rate": 4.39481372557418e-07, + "loss": 0.75603908, + "num_input_tokens_seen": 284018750, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13162, + "time_per_iteration": 2.476824998855591 + }, + { + "auxiliary_loss_clip": 0.01102454, + "auxiliary_loss_mlp": 0.01030138, + "balance_loss_clip": 1.01868761, + "balance_loss_mlp": 1.03506005, + "epoch": 0.7914023748684804, + "flos": 19938251220480.0, + "grad_norm": 1.6298606745864626, + "language_loss": 0.72000325, + "learning_rate": 4.392378109401811e-07, + "loss": 0.74132919, + "num_input_tokens_seen": 284037850, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13163, + "time_per_iteration": 2.4319183826446533 + }, + { + "auxiliary_loss_clip": 0.01101866, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.01945353, + "balance_loss_mlp": 1.03639102, + "epoch": 0.7914624981211483, + "flos": 20594805966720.0, + "grad_norm": 1.9265161616003688, + "language_loss": 0.69604623, + "learning_rate": 4.3899430850720296e-07, + "loss": 0.71738136, + "num_input_tokens_seen": 284056380, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13164, + "time_per_iteration": 2.4604907035827637 + }, + { + "auxiliary_loss_clip": 0.01098403, + "auxiliary_loss_mlp": 0.01029886, + "balance_loss_clip": 1.01869857, + "balance_loss_mlp": 1.03331554, + "epoch": 0.7915226213738163, + "flos": 21799465320960.0, + "grad_norm": 1.9521377640863393, + "language_loss": 0.66389132, + "learning_rate": 4.387508652677177e-07, + "loss": 0.68517423, + "num_input_tokens_seen": 284074945, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13165, + "time_per_iteration": 2.4393765926361084 + }, + { + "auxiliary_loss_clip": 0.01093623, + "auxiliary_loss_mlp": 0.01024396, + "balance_loss_clip": 1.01379871, + "balance_loss_mlp": 1.03140879, + "epoch": 0.7915827446264843, + "flos": 16288362535680.0, + "grad_norm": 1.870206675725358, + "language_loss": 0.72397065, + "learning_rate": 4.385074812309557e-07, + "loss": 0.74515086, + "num_input_tokens_seen": 284092070, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62109375, + "step": 13166, + "time_per_iteration": 2.42858624458313 + }, + { + "auxiliary_loss_clip": 0.01098429, + "auxiliary_loss_mlp": 0.01029233, + "balance_loss_clip": 1.01669192, + "balance_loss_mlp": 1.03284669, + "epoch": 0.7916428678791523, + "flos": 25702595867520.0, + "grad_norm": 1.6243880882538562, + "language_loss": 0.77239472, + "learning_rate": 4.382641564061462e-07, + "loss": 0.79367137, + "num_input_tokens_seen": 284112255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 13167, + "time_per_iteration": 2.4857194423675537 + }, + { + "auxiliary_loss_clip": 0.0109987, + "auxiliary_loss_mlp": 0.01029913, + "balance_loss_clip": 1.0192678, + "balance_loss_mlp": 1.03484404, + "epoch": 0.7917029911318202, + "flos": 23878513451520.0, + "grad_norm": 1.6932683776062956, + "language_loss": 0.84575874, + "learning_rate": 4.3802089080251713e-07, + "loss": 0.86705655, + "num_input_tokens_seen": 284132330, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13168, + "time_per_iteration": 2.5257365703582764 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01027494, + "balance_loss_clip": 1.01633573, + "balance_loss_mlp": 1.03501356, + "epoch": 0.7917631143844882, + "flos": 21646593037440.0, + "grad_norm": 1.7075722391650643, + "language_loss": 0.72710097, + "learning_rate": 4.3777768442929155e-07, + "loss": 0.74838775, + "num_input_tokens_seen": 284150640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 13169, + "time_per_iteration": 2.4436428546905518 + }, + { + "auxiliary_loss_clip": 0.01102971, + "auxiliary_loss_mlp": 0.01032186, + "balance_loss_clip": 1.02028275, + "balance_loss_mlp": 1.03484845, + "epoch": 0.7918232376371561, + "flos": 38874198355200.0, + "grad_norm": 1.8243232954035, + "language_loss": 0.67037463, + "learning_rate": 4.3753453729569287e-07, + "loss": 0.69172621, + "num_input_tokens_seen": 284171910, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13170, + "time_per_iteration": 2.624098777770996 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.01022631, + "balance_loss_clip": 1.01188445, + "balance_loss_mlp": 1.03370655, + "epoch": 0.7918833608898241, + "flos": 20775544225920.0, + "grad_norm": 2.145643776900154, + "language_loss": 0.70821196, + "learning_rate": 4.372914494109412e-07, + "loss": 0.72943711, + "num_input_tokens_seen": 284191340, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 13171, + "time_per_iteration": 2.4759225845336914 + }, + { + "auxiliary_loss_clip": 0.01097813, + "auxiliary_loss_mlp": 0.01026979, + "balance_loss_clip": 1.01555896, + "balance_loss_mlp": 1.03287041, + "epoch": 0.7919434841424922, + "flos": 33910122769920.0, + "grad_norm": 1.7808114898510692, + "language_loss": 0.66749847, + "learning_rate": 4.370484207842553e-07, + "loss": 0.68874633, + "num_input_tokens_seen": 284212495, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13172, + "time_per_iteration": 2.5700619220733643 + }, + { + "auxiliary_loss_clip": 0.0110086, + "auxiliary_loss_mlp": 0.01030852, + "balance_loss_clip": 1.01951575, + "balance_loss_mlp": 1.03532124, + "epoch": 0.7920036073951601, + "flos": 21064660796160.0, + "grad_norm": 1.881471397827846, + "language_loss": 0.79114199, + "learning_rate": 4.3680545142484893e-07, + "loss": 0.81245905, + "num_input_tokens_seen": 284230825, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13173, + "time_per_iteration": 2.4757769107818604 + }, + { + "auxiliary_loss_clip": 0.01098601, + "auxiliary_loss_mlp": 0.01026298, + "balance_loss_clip": 1.01604629, + "balance_loss_mlp": 1.03356767, + "epoch": 0.7920637306478281, + "flos": 23655974739840.0, + "grad_norm": 1.8257169099577297, + "language_loss": 0.7678805, + "learning_rate": 4.365625413419365e-07, + "loss": 0.7891295, + "num_input_tokens_seen": 284250365, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 13174, + "time_per_iteration": 2.478116989135742 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01031423, + "balance_loss_clip": 1.02046227, + "balance_loss_mlp": 1.03321493, + "epoch": 0.792123853900496, + "flos": 27195438038400.0, + "grad_norm": 1.6179511988960908, + "language_loss": 0.71719491, + "learning_rate": 4.363196905447297e-07, + "loss": 0.73848224, + "num_input_tokens_seen": 284269635, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13175, + "time_per_iteration": 2.528700590133667 + }, + { + "auxiliary_loss_clip": 0.01099648, + "auxiliary_loss_mlp": 0.01027473, + "balance_loss_clip": 1.01570737, + "balance_loss_mlp": 1.03435004, + "epoch": 0.792183977153164, + "flos": 19098659744640.0, + "grad_norm": 1.9378539521552467, + "language_loss": 0.59763598, + "learning_rate": 4.360768990424364e-07, + "loss": 0.61890721, + "num_input_tokens_seen": 284288380, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 13176, + "time_per_iteration": 2.4653594493865967 + }, + { + "auxiliary_loss_clip": 0.01101303, + "auxiliary_loss_mlp": 0.01030579, + "balance_loss_clip": 1.01922417, + "balance_loss_mlp": 1.03675985, + "epoch": 0.7922441004058319, + "flos": 17128851851520.0, + "grad_norm": 1.8690026492537037, + "language_loss": 0.73695058, + "learning_rate": 4.3583416684426376e-07, + "loss": 0.75826943, + "num_input_tokens_seen": 284306920, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 13177, + "time_per_iteration": 2.439019203186035 + }, + { + "auxiliary_loss_clip": 0.01098632, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.02159739, + "balance_loss_mlp": 1.0353229, + "epoch": 0.7923042236585, + "flos": 17821640442240.0, + "grad_norm": 2.5597015980871656, + "language_loss": 0.63997006, + "learning_rate": 4.355914939594174e-07, + "loss": 0.66128141, + "num_input_tokens_seen": 284324700, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13178, + "time_per_iteration": 3.8768224716186523 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.0197531, + "balance_loss_mlp": 1.03276765, + "epoch": 0.7923643469111679, + "flos": 29935206892800.0, + "grad_norm": 1.4086658766608762, + "language_loss": 0.68400067, + "learning_rate": 4.3534888039709726e-07, + "loss": 0.70528185, + "num_input_tokens_seen": 284345985, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.65625, + "step": 13179, + "time_per_iteration": 2.5326123237609863 + }, + { + "auxiliary_loss_clip": 0.01099366, + "auxiliary_loss_mlp": 0.01029023, + "balance_loss_clip": 1.01749516, + "balance_loss_mlp": 1.03448081, + "epoch": 0.7924244701638359, + "flos": 22674716023680.0, + "grad_norm": 3.8313461513968408, + "language_loss": 0.74134624, + "learning_rate": 4.3510632616650444e-07, + "loss": 0.76263011, + "num_input_tokens_seen": 284364475, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 13180, + "time_per_iteration": 3.892685651779175 + }, + { + "auxiliary_loss_clip": 0.01104124, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.02059281, + "balance_loss_mlp": 1.03637862, + "epoch": 0.7924845934165038, + "flos": 17968156018560.0, + "grad_norm": 2.6414763504058936, + "language_loss": 0.81435031, + "learning_rate": 4.3486383127683646e-07, + "loss": 0.8357203, + "num_input_tokens_seen": 284382125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13181, + "time_per_iteration": 3.8623433113098145 + }, + { + "auxiliary_loss_clip": 0.01098541, + "auxiliary_loss_mlp": 0.01032498, + "balance_loss_clip": 1.02029681, + "balance_loss_mlp": 1.03413761, + "epoch": 0.7925447166691718, + "flos": 23476960333440.0, + "grad_norm": 1.7875723421609098, + "language_loss": 0.77434945, + "learning_rate": 4.346213957372895e-07, + "loss": 0.7956599, + "num_input_tokens_seen": 284401585, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 13182, + "time_per_iteration": 2.4663844108581543 + }, + { + "auxiliary_loss_clip": 0.01103982, + "auxiliary_loss_mlp": 0.01032586, + "balance_loss_clip": 1.01979494, + "balance_loss_mlp": 1.03470898, + "epoch": 0.7926048399218397, + "flos": 20447572118400.0, + "grad_norm": 2.7996855635820013, + "language_loss": 0.74354494, + "learning_rate": 4.34379019557056e-07, + "loss": 0.7649107, + "num_input_tokens_seen": 284419125, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 13183, + "time_per_iteration": 2.490994930267334 + }, + { + "auxiliary_loss_clip": 0.0109888, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.01273239, + "balance_loss_mlp": 1.03439891, + "epoch": 0.7926649631745077, + "flos": 37160038535040.0, + "grad_norm": 1.6595627925509142, + "language_loss": 0.68164527, + "learning_rate": 4.341367027453264e-07, + "loss": 0.70288026, + "num_input_tokens_seen": 284440445, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 13184, + "time_per_iteration": 4.066596508026123 + }, + { + "auxiliary_loss_clip": 0.01102689, + "auxiliary_loss_mlp": 0.01028631, + "balance_loss_clip": 1.01719308, + "balance_loss_mlp": 1.03515947, + "epoch": 0.7927250864271758, + "flos": 17018606033280.0, + "grad_norm": 1.6953007662822652, + "language_loss": 0.70649928, + "learning_rate": 4.338944453112907e-07, + "loss": 0.72781253, + "num_input_tokens_seen": 284459370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 13185, + "time_per_iteration": 2.5168697834014893 + }, + { + "auxiliary_loss_clip": 0.01101927, + "auxiliary_loss_mlp": 0.01027341, + "balance_loss_clip": 1.01530719, + "balance_loss_mlp": 1.03461063, + "epoch": 0.7927852096798437, + "flos": 17749208666880.0, + "grad_norm": 2.0010064491427335, + "language_loss": 0.65568876, + "learning_rate": 4.3365224726413375e-07, + "loss": 0.67698145, + "num_input_tokens_seen": 284477525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13186, + "time_per_iteration": 2.4313526153564453 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01028869, + "balance_loss_clip": 1.01786041, + "balance_loss_mlp": 1.03488398, + "epoch": 0.7928453329325117, + "flos": 23838436851840.0, + "grad_norm": 2.458790452958655, + "language_loss": 0.76782525, + "learning_rate": 4.334101086130408e-07, + "loss": 0.78911316, + "num_input_tokens_seen": 284496590, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 13187, + "time_per_iteration": 2.4705545902252197 + }, + { + "auxiliary_loss_clip": 0.01097825, + "auxiliary_loss_mlp": 0.01026522, + "balance_loss_clip": 1.01525056, + "balance_loss_mlp": 1.03388309, + "epoch": 0.7929054561851796, + "flos": 17454920538240.0, + "grad_norm": 2.052216881515836, + "language_loss": 0.72776371, + "learning_rate": 4.3316802936719334e-07, + "loss": 0.74900717, + "num_input_tokens_seen": 284511470, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13188, + "time_per_iteration": 2.473217010498047 + }, + { + "auxiliary_loss_clip": 0.01102244, + "auxiliary_loss_mlp": 0.01036567, + "balance_loss_clip": 1.02387083, + "balance_loss_mlp": 1.03462553, + "epoch": 0.7929655794378476, + "flos": 21981280988160.0, + "grad_norm": 2.027455817824797, + "language_loss": 0.62665582, + "learning_rate": 4.329260095357725e-07, + "loss": 0.64804399, + "num_input_tokens_seen": 284531125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13189, + "time_per_iteration": 2.442365884780884 + }, + { + "auxiliary_loss_clip": 0.01098917, + "auxiliary_loss_mlp": 0.01028169, + "balance_loss_clip": 1.01705313, + "balance_loss_mlp": 1.03361034, + "epoch": 0.7930257026905155, + "flos": 17273930883840.0, + "grad_norm": 2.5304062276018793, + "language_loss": 0.72505867, + "learning_rate": 4.3268404912795307e-07, + "loss": 0.74632961, + "num_input_tokens_seen": 284549340, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13190, + "time_per_iteration": 2.5228397846221924 + }, + { + "auxiliary_loss_clip": 0.01096381, + "auxiliary_loss_mlp": 0.01027089, + "balance_loss_clip": 1.01708758, + "balance_loss_mlp": 1.03499353, + "epoch": 0.7930858259431836, + "flos": 27300584125440.0, + "grad_norm": 1.8037952214110713, + "language_loss": 0.73300159, + "learning_rate": 4.3244214815291166e-07, + "loss": 0.75423628, + "num_input_tokens_seen": 284567060, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6171875, + "step": 13191, + "time_per_iteration": 2.5402090549468994 + }, + { + "auxiliary_loss_clip": 0.01099659, + "auxiliary_loss_mlp": 0.01036425, + "balance_loss_clip": 1.02452767, + "balance_loss_mlp": 1.03368807, + "epoch": 0.7931459491958515, + "flos": 19863736456320.0, + "grad_norm": 1.9478523410400206, + "language_loss": 0.69033474, + "learning_rate": 4.322003066198219e-07, + "loss": 0.71169555, + "num_input_tokens_seen": 284586600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 13192, + "time_per_iteration": 2.488039970397949 + }, + { + "auxiliary_loss_clip": 0.01100062, + "auxiliary_loss_mlp": 0.010329, + "balance_loss_clip": 1.02173603, + "balance_loss_mlp": 1.03413558, + "epoch": 0.7932060724485195, + "flos": 23147120718720.0, + "grad_norm": 1.5635403333357274, + "language_loss": 0.75213289, + "learning_rate": 4.3195852453785274e-07, + "loss": 0.77346253, + "num_input_tokens_seen": 284605715, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13193, + "time_per_iteration": 2.464966297149658 + }, + { + "auxiliary_loss_clip": 0.01100043, + "auxiliary_loss_mlp": 0.01033879, + "balance_loss_clip": 1.02102232, + "balance_loss_mlp": 1.03474998, + "epoch": 0.7932661957011874, + "flos": 29934847756800.0, + "grad_norm": 1.8781856147923044, + "language_loss": 0.72225535, + "learning_rate": 4.317168019161741e-07, + "loss": 0.74359465, + "num_input_tokens_seen": 284628540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.65234375, + "step": 13194, + "time_per_iteration": 2.55106520652771 + }, + { + "auxiliary_loss_clip": 0.01104878, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01911819, + "balance_loss_mlp": 1.03578103, + "epoch": 0.7933263189538554, + "flos": 22559119079040.0, + "grad_norm": 1.9952958516123638, + "language_loss": 0.69781977, + "learning_rate": 4.314751387639517e-07, + "loss": 0.71917635, + "num_input_tokens_seen": 284646040, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.69140625, + "step": 13195, + "time_per_iteration": 2.4327144622802734 + }, + { + "auxiliary_loss_clip": 0.01100264, + "auxiliary_loss_mlp": 0.01025694, + "balance_loss_clip": 1.0142858, + "balance_loss_mlp": 1.03533435, + "epoch": 0.7933864422065233, + "flos": 25479051575040.0, + "grad_norm": 1.5235615459382654, + "language_loss": 0.77706164, + "learning_rate": 4.3123353509034844e-07, + "loss": 0.79832125, + "num_input_tokens_seen": 284665110, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13196, + "time_per_iteration": 2.4901678562164307 + }, + { + "auxiliary_loss_clip": 0.01103725, + "auxiliary_loss_mlp": 0.01034187, + "balance_loss_clip": 1.02258193, + "balance_loss_mlp": 1.03656614, + "epoch": 0.7934465654591913, + "flos": 33583156243200.0, + "grad_norm": 1.803068943631605, + "language_loss": 0.68970078, + "learning_rate": 4.309919909045268e-07, + "loss": 0.71107984, + "num_input_tokens_seen": 284686515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 13197, + "time_per_iteration": 2.5378594398498535 + }, + { + "auxiliary_loss_clip": 0.01098819, + "auxiliary_loss_mlp": 0.0102988, + "balance_loss_clip": 1.01860309, + "balance_loss_mlp": 1.03417861, + "epoch": 0.7935066887118594, + "flos": 31432538263680.0, + "grad_norm": 1.7643596771229297, + "language_loss": 0.64804506, + "learning_rate": 4.30750506215646e-07, + "loss": 0.66933215, + "num_input_tokens_seen": 284707300, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13198, + "time_per_iteration": 2.534534215927124 + }, + { + "auxiliary_loss_clip": 0.01103865, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01787245, + "balance_loss_mlp": 1.03533065, + "epoch": 0.7935668119645273, + "flos": 14682616940160.0, + "grad_norm": 2.0561177660493453, + "language_loss": 0.72203559, + "learning_rate": 4.30509081032864e-07, + "loss": 0.743379, + "num_input_tokens_seen": 284723545, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 13199, + "time_per_iteration": 2.409954071044922 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01794064, + "balance_loss_mlp": 1.03514385, + "epoch": 0.7936269352171953, + "flos": 18004246208640.0, + "grad_norm": 2.5680157152450933, + "language_loss": 0.80811197, + "learning_rate": 4.302677153653349e-07, + "loss": 0.82940185, + "num_input_tokens_seen": 284742650, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 13200, + "time_per_iteration": 2.4604108333587646 + }, + { + "auxiliary_loss_clip": 0.01098579, + "auxiliary_loss_mlp": 0.01028539, + "balance_loss_clip": 1.01745248, + "balance_loss_mlp": 1.0353868, + "epoch": 0.7936870584698632, + "flos": 18880215183360.0, + "grad_norm": 1.627584700503655, + "language_loss": 0.77191329, + "learning_rate": 4.3002640922221077e-07, + "loss": 0.7931844, + "num_input_tokens_seen": 284760955, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 13201, + "time_per_iteration": 2.428744077682495 + }, + { + "auxiliary_loss_clip": 0.01098306, + "auxiliary_loss_mlp": 0.01028208, + "balance_loss_clip": 1.01721644, + "balance_loss_mlp": 1.03374922, + "epoch": 0.7937471817225312, + "flos": 23367001824000.0, + "grad_norm": 1.4615967760668465, + "language_loss": 0.67071187, + "learning_rate": 4.2978516261264296e-07, + "loss": 0.69197702, + "num_input_tokens_seen": 284780745, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13202, + "time_per_iteration": 2.4896771907806396 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.02063727, + "balance_loss_mlp": 1.03468037, + "epoch": 0.7938073049751991, + "flos": 22674428714880.0, + "grad_norm": 1.816192931663621, + "language_loss": 0.74804997, + "learning_rate": 4.2954397554577884e-07, + "loss": 0.7693783, + "num_input_tokens_seen": 284799000, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13203, + "time_per_iteration": 2.451380729675293 + }, + { + "auxiliary_loss_clip": 0.01100879, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01872849, + "balance_loss_mlp": 1.03399134, + "epoch": 0.7938674282278672, + "flos": 22851431959680.0, + "grad_norm": 2.0709813366174807, + "language_loss": 0.6622262, + "learning_rate": 4.293028480307643e-07, + "loss": 0.68353653, + "num_input_tokens_seen": 284817450, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13204, + "time_per_iteration": 2.4800636768341064 + }, + { + "auxiliary_loss_clip": 0.01097898, + "auxiliary_loss_mlp": 0.01029539, + "balance_loss_clip": 1.01835084, + "balance_loss_mlp": 1.03296351, + "epoch": 0.7939275514805351, + "flos": 27012509049600.0, + "grad_norm": 1.3281882721679232, + "language_loss": 0.7925297, + "learning_rate": 4.290617800767438e-07, + "loss": 0.81380415, + "num_input_tokens_seen": 284838865, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 13205, + "time_per_iteration": 2.4787778854370117 + }, + { + "auxiliary_loss_clip": 0.01096536, + "auxiliary_loss_mlp": 0.01026398, + "balance_loss_clip": 1.01493573, + "balance_loss_mlp": 1.03291297, + "epoch": 0.7939876747332031, + "flos": 21142838747520.0, + "grad_norm": 1.7942191439670012, + "language_loss": 0.77874231, + "learning_rate": 4.28820771692858e-07, + "loss": 0.7999717, + "num_input_tokens_seen": 284857975, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13206, + "time_per_iteration": 2.499706983566284 + }, + { + "auxiliary_loss_clip": 0.01104173, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.0172863, + "balance_loss_mlp": 1.03587031, + "epoch": 0.794047797985871, + "flos": 23289075267840.0, + "grad_norm": 1.8397672987802902, + "language_loss": 0.79237318, + "learning_rate": 4.285798228882456e-07, + "loss": 0.81370986, + "num_input_tokens_seen": 284877145, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 13207, + "time_per_iteration": 2.4636006355285645 + }, + { + "auxiliary_loss_clip": 0.01099783, + "auxiliary_loss_mlp": 0.01032098, + "balance_loss_clip": 1.02048755, + "balance_loss_mlp": 1.03468966, + "epoch": 0.794107921238539, + "flos": 24608074590720.0, + "grad_norm": 1.9530235791320048, + "language_loss": 0.84002006, + "learning_rate": 4.2833893367204375e-07, + "loss": 0.86133885, + "num_input_tokens_seen": 284895560, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13208, + "time_per_iteration": 2.5083847045898438 + }, + { + "auxiliary_loss_clip": 0.01022967, + "auxiliary_loss_mlp": 0.00999733, + "balance_loss_clip": 0.99883288, + "balance_loss_mlp": 1.00283718, + "epoch": 0.7941680444912069, + "flos": 64093690252800.0, + "grad_norm": 0.7192767006915639, + "language_loss": 0.58359563, + "learning_rate": 4.280981040533875e-07, + "loss": 0.60382259, + "num_input_tokens_seen": 284963135, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20117188, + "step": 13209, + "time_per_iteration": 3.1166725158691406 + }, + { + "auxiliary_loss_clip": 0.01105651, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01602447, + "balance_loss_mlp": 1.03716731, + "epoch": 0.794228167743875, + "flos": 24388839930240.0, + "grad_norm": 6.276461119543849, + "language_loss": 0.62636811, + "learning_rate": 4.2785733404140825e-07, + "loss": 0.64770591, + "num_input_tokens_seen": 284981755, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 13210, + "time_per_iteration": 2.5011911392211914 + }, + { + "auxiliary_loss_clip": 0.01100308, + "auxiliary_loss_mlp": 0.010306, + "balance_loss_clip": 1.01959693, + "balance_loss_mlp": 1.03402996, + "epoch": 0.794288290996543, + "flos": 28512498026880.0, + "grad_norm": 1.5861692556571285, + "language_loss": 0.68948948, + "learning_rate": 4.2761662364523676e-07, + "loss": 0.71079856, + "num_input_tokens_seen": 285003060, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 13211, + "time_per_iteration": 2.5030434131622314 + }, + { + "auxiliary_loss_clip": 0.01102809, + "auxiliary_loss_mlp": 0.01036253, + "balance_loss_clip": 1.02349782, + "balance_loss_mlp": 1.03480554, + "epoch": 0.7943484142492109, + "flos": 25922117836800.0, + "grad_norm": 1.5459525414339919, + "language_loss": 0.72359824, + "learning_rate": 4.2737597287400074e-07, + "loss": 0.7449888, + "num_input_tokens_seen": 285021640, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 13212, + "time_per_iteration": 2.513190984725952 + }, + { + "auxiliary_loss_clip": 0.01098106, + "auxiliary_loss_mlp": 0.01025052, + "balance_loss_clip": 1.01388764, + "balance_loss_mlp": 1.03500962, + "epoch": 0.7944085375018789, + "flos": 23915286000000.0, + "grad_norm": 1.663831013986619, + "language_loss": 0.80758727, + "learning_rate": 4.271353817368246e-07, + "loss": 0.82881892, + "num_input_tokens_seen": 285040490, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13213, + "time_per_iteration": 2.4620864391326904 + }, + { + "auxiliary_loss_clip": 0.01106094, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01679885, + "balance_loss_mlp": 1.03663135, + "epoch": 0.7944686607545468, + "flos": 20229953569920.0, + "grad_norm": 2.2825802582203476, + "language_loss": 0.68319535, + "learning_rate": 4.268948502428327e-07, + "loss": 0.70455045, + "num_input_tokens_seen": 285059270, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 13214, + "time_per_iteration": 2.4502992630004883 + }, + { + "auxiliary_loss_clip": 0.0109771, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01793051, + "balance_loss_mlp": 1.03391325, + "epoch": 0.7945287840072148, + "flos": 21980993679360.0, + "grad_norm": 1.8169772357963099, + "language_loss": 0.72712231, + "learning_rate": 4.2665437840114535e-07, + "loss": 0.74838698, + "num_input_tokens_seen": 285075390, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 13215, + "time_per_iteration": 2.4472222328186035 + }, + { + "auxiliary_loss_clip": 0.01101234, + "auxiliary_loss_mlp": 0.01028514, + "balance_loss_clip": 1.01751041, + "balance_loss_mlp": 1.03695011, + "epoch": 0.7945889072598827, + "flos": 26397718842240.0, + "grad_norm": 1.5004674854133762, + "language_loss": 0.78918624, + "learning_rate": 4.2641396622088253e-07, + "loss": 0.81048369, + "num_input_tokens_seen": 285096290, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13216, + "time_per_iteration": 2.5075128078460693 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.01029594, + "balance_loss_clip": 1.01874018, + "balance_loss_mlp": 1.03463197, + "epoch": 0.7946490305125508, + "flos": 25810255906560.0, + "grad_norm": 1.6163941804337032, + "language_loss": 0.73908085, + "learning_rate": 4.261736137111598e-07, + "loss": 0.76038563, + "num_input_tokens_seen": 285116020, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 13217, + "time_per_iteration": 2.4624104499816895 + }, + { + "auxiliary_loss_clip": 0.01097689, + "auxiliary_loss_mlp": 0.01034045, + "balance_loss_clip": 1.02317882, + "balance_loss_mlp": 1.03437877, + "epoch": 0.7947091537652187, + "flos": 15960965045760.0, + "grad_norm": 1.7536489091121308, + "language_loss": 0.74128562, + "learning_rate": 4.259333208810907e-07, + "loss": 0.76260298, + "num_input_tokens_seen": 285133510, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 13218, + "time_per_iteration": 2.4378395080566406 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.0103414, + "balance_loss_clip": 1.02162278, + "balance_loss_mlp": 1.03341603, + "epoch": 0.7947692770178867, + "flos": 18587866389120.0, + "grad_norm": 1.8944799290168057, + "language_loss": 0.83180892, + "learning_rate": 4.2569308773978817e-07, + "loss": 0.85316575, + "num_input_tokens_seen": 285151690, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13219, + "time_per_iteration": 2.4046013355255127 + }, + { + "auxiliary_loss_clip": 0.01104407, + "auxiliary_loss_mlp": 0.01033611, + "balance_loss_clip": 1.02093291, + "balance_loss_mlp": 1.03578758, + "epoch": 0.7948294002705546, + "flos": 20442220992000.0, + "grad_norm": 1.8955600034556859, + "language_loss": 0.7588414, + "learning_rate": 4.2545291429636123e-07, + "loss": 0.78022164, + "num_input_tokens_seen": 285170485, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 13220, + "time_per_iteration": 3.8154995441436768 + }, + { + "auxiliary_loss_clip": 0.01104021, + "auxiliary_loss_mlp": 0.01033248, + "balance_loss_clip": 1.02123189, + "balance_loss_mlp": 1.03558075, + "epoch": 0.7948895235232226, + "flos": 38181194282880.0, + "grad_norm": 1.997206331366737, + "language_loss": 0.72682828, + "learning_rate": 4.252128005599176e-07, + "loss": 0.74820095, + "num_input_tokens_seen": 285191050, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 13221, + "time_per_iteration": 4.05722188949585 + }, + { + "auxiliary_loss_clip": 0.0109725, + "auxiliary_loss_mlp": 0.01026356, + "balance_loss_clip": 1.01540709, + "balance_loss_mlp": 1.03402424, + "epoch": 0.7949496467758905, + "flos": 15559806977280.0, + "grad_norm": 1.8234441442382394, + "language_loss": 0.7454437, + "learning_rate": 4.249727465395634e-07, + "loss": 0.76667982, + "num_input_tokens_seen": 285208750, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13222, + "time_per_iteration": 3.82381534576416 + }, + { + "auxiliary_loss_clip": 0.01023305, + "auxiliary_loss_mlp": 0.01001588, + "balance_loss_clip": 1.00058103, + "balance_loss_mlp": 1.00324297, + "epoch": 0.7950097700285585, + "flos": 70897036728960.0, + "grad_norm": 0.7682356775746639, + "language_loss": 0.67054129, + "learning_rate": 4.247327522443993e-07, + "loss": 0.6907903, + "num_input_tokens_seen": 285264605, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 13223, + "time_per_iteration": 2.910489797592163 + }, + { + "auxiliary_loss_clip": 0.01098555, + "auxiliary_loss_mlp": 0.0102863, + "balance_loss_clip": 1.01635742, + "balance_loss_mlp": 1.03264594, + "epoch": 0.7950698932812266, + "flos": 23951627585280.0, + "grad_norm": 1.726476210042691, + "language_loss": 0.7146225, + "learning_rate": 4.2449281768352717e-07, + "loss": 0.73589438, + "num_input_tokens_seen": 285283940, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13224, + "time_per_iteration": 2.4960734844207764 + }, + { + "auxiliary_loss_clip": 0.01023049, + "auxiliary_loss_mlp": 0.00999614, + "balance_loss_clip": 0.9985711, + "balance_loss_mlp": 1.00314784, + "epoch": 0.7951300165338945, + "flos": 60282561415680.0, + "grad_norm": 0.6952095607048513, + "language_loss": 0.55011863, + "learning_rate": 4.2425294286604527e-07, + "loss": 0.57034522, + "num_input_tokens_seen": 285349525, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.19921875, + "step": 13225, + "time_per_iteration": 4.49747109413147 + }, + { + "auxiliary_loss_clip": 0.01097582, + "auxiliary_loss_mlp": 0.01023614, + "balance_loss_clip": 1.01296818, + "balance_loss_mlp": 1.03373742, + "epoch": 0.7951901397865625, + "flos": 22819004956800.0, + "grad_norm": 1.932116603626369, + "language_loss": 0.64920199, + "learning_rate": 4.2401312780105034e-07, + "loss": 0.67041391, + "num_input_tokens_seen": 285367355, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13226, + "time_per_iteration": 2.492919921875 + }, + { + "auxiliary_loss_clip": 0.01102867, + "auxiliary_loss_mlp": 0.01036554, + "balance_loss_clip": 1.02517581, + "balance_loss_mlp": 1.03584349, + "epoch": 0.7952502630392304, + "flos": 35695672871040.0, + "grad_norm": 3.097889886505811, + "language_loss": 0.70084739, + "learning_rate": 4.237733724976349e-07, + "loss": 0.72224164, + "num_input_tokens_seen": 285386190, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 13227, + "time_per_iteration": 2.55519700050354 + }, + { + "auxiliary_loss_clip": 0.01096905, + "auxiliary_loss_mlp": 0.01027043, + "balance_loss_clip": 1.01701736, + "balance_loss_mlp": 1.03388405, + "epoch": 0.7953103862918984, + "flos": 25629840869760.0, + "grad_norm": 1.6685312506793168, + "language_loss": 0.69431317, + "learning_rate": 4.2353367696489184e-07, + "loss": 0.71555269, + "num_input_tokens_seen": 285406150, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62890625, + "step": 13228, + "time_per_iteration": 2.5069961547851562 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01032604, + "balance_loss_clip": 1.02095747, + "balance_loss_mlp": 1.03423619, + "epoch": 0.7953705095445663, + "flos": 40551980676480.0, + "grad_norm": 1.445985556067254, + "language_loss": 0.70922631, + "learning_rate": 4.232940412119095e-07, + "loss": 0.73056132, + "num_input_tokens_seen": 285429900, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 13229, + "time_per_iteration": 2.6479508876800537 + }, + { + "auxiliary_loss_clip": 0.01106737, + "auxiliary_loss_mlp": 0.01030695, + "balance_loss_clip": 1.01903689, + "balance_loss_mlp": 1.03793633, + "epoch": 0.7954306327972344, + "flos": 27636672706560.0, + "grad_norm": 1.7589665184565293, + "language_loss": 0.71889889, + "learning_rate": 4.2305446524777457e-07, + "loss": 0.74027318, + "num_input_tokens_seen": 285452555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 13230, + "time_per_iteration": 2.5062637329101562 + }, + { + "auxiliary_loss_clip": 0.01022715, + "auxiliary_loss_mlp": 0.01000194, + "balance_loss_clip": 0.9992041, + "balance_loss_mlp": 1.00273073, + "epoch": 0.7954907560499023, + "flos": 59504055995520.0, + "grad_norm": 0.8959170552781407, + "language_loss": 0.63557678, + "learning_rate": 4.2281494908157247e-07, + "loss": 0.65580589, + "num_input_tokens_seen": 285515700, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 13231, + "time_per_iteration": 3.082951784133911 + }, + { + "auxiliary_loss_clip": 0.01099992, + "auxiliary_loss_mlp": 0.01025832, + "balance_loss_clip": 1.01489472, + "balance_loss_mlp": 1.03479195, + "epoch": 0.7955508793025703, + "flos": 20120533764480.0, + "grad_norm": 1.513210283199707, + "language_loss": 0.69656473, + "learning_rate": 4.2257549272238566e-07, + "loss": 0.71782291, + "num_input_tokens_seen": 285533910, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 13232, + "time_per_iteration": 2.440912961959839 + }, + { + "auxiliary_loss_clip": 0.01099299, + "auxiliary_loss_mlp": 0.01027268, + "balance_loss_clip": 1.01572859, + "balance_loss_mlp": 1.03366399, + "epoch": 0.7956110025552382, + "flos": 26505378881280.0, + "grad_norm": 1.5695652916232832, + "language_loss": 0.77775937, + "learning_rate": 4.223360961792952e-07, + "loss": 0.79902506, + "num_input_tokens_seen": 285554080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13233, + "time_per_iteration": 2.5125248432159424 + }, + { + "auxiliary_loss_clip": 0.01100048, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.01780808, + "balance_loss_mlp": 1.03384972, + "epoch": 0.7956711258079062, + "flos": 22565475786240.0, + "grad_norm": 2.4242376989153183, + "language_loss": 0.78652054, + "learning_rate": 4.220967594613769e-07, + "loss": 0.80781317, + "num_input_tokens_seen": 285572325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 13234, + "time_per_iteration": 2.468038558959961 + }, + { + "auxiliary_loss_clip": 0.0109831, + "auxiliary_loss_mlp": 0.01023983, + "balance_loss_clip": 1.01374316, + "balance_loss_mlp": 1.03441608, + "epoch": 0.7957312490605741, + "flos": 17379005143680.0, + "grad_norm": 1.5905892668664205, + "language_loss": 0.70050478, + "learning_rate": 4.218574825777077e-07, + "loss": 0.72172773, + "num_input_tokens_seen": 285589770, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 13235, + "time_per_iteration": 2.493274450302124 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.01027765, + "balance_loss_clip": 1.01598716, + "balance_loss_mlp": 1.03456783, + "epoch": 0.7957913723132422, + "flos": 22491427898880.0, + "grad_norm": 1.4327288828899616, + "language_loss": 0.6766414, + "learning_rate": 4.2161826553736145e-07, + "loss": 0.69792509, + "num_input_tokens_seen": 285610065, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13236, + "time_per_iteration": 2.484380006790161 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01026353, + "balance_loss_clip": 1.01505828, + "balance_loss_mlp": 1.03377748, + "epoch": 0.7958514955659101, + "flos": 22638087129600.0, + "grad_norm": 1.7411950179861415, + "language_loss": 0.75172085, + "learning_rate": 4.2137910834940826e-07, + "loss": 0.77296317, + "num_input_tokens_seen": 285628480, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13237, + "time_per_iteration": 2.4766552448272705 + }, + { + "auxiliary_loss_clip": 0.01101102, + "auxiliary_loss_mlp": 0.01032287, + "balance_loss_clip": 1.01983571, + "balance_loss_mlp": 1.03548527, + "epoch": 0.7959116188185781, + "flos": 20704225772160.0, + "grad_norm": 1.9189361259680966, + "language_loss": 0.71440208, + "learning_rate": 4.211400110229175e-07, + "loss": 0.73573601, + "num_input_tokens_seen": 285647805, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 13238, + "time_per_iteration": 2.456925392150879 + }, + { + "auxiliary_loss_clip": 0.01099911, + "auxiliary_loss_mlp": 0.01026443, + "balance_loss_clip": 1.01485622, + "balance_loss_mlp": 1.0334146, + "epoch": 0.7959717420712461, + "flos": 19024683684480.0, + "grad_norm": 1.8980651664510928, + "language_loss": 0.73918056, + "learning_rate": 4.2090097356695684e-07, + "loss": 0.76044405, + "num_input_tokens_seen": 285665505, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13239, + "time_per_iteration": 2.443584680557251 + }, + { + "auxiliary_loss_clip": 0.01103286, + "auxiliary_loss_mlp": 0.01032954, + "balance_loss_clip": 1.02093184, + "balance_loss_mlp": 1.03495479, + "epoch": 0.796031865323914, + "flos": 26356636661760.0, + "grad_norm": 1.6947466268706028, + "language_loss": 0.69046456, + "learning_rate": 4.2066199599058814e-07, + "loss": 0.71182698, + "num_input_tokens_seen": 285685855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 13240, + "time_per_iteration": 2.4764912128448486 + }, + { + "auxiliary_loss_clip": 0.01022946, + "auxiliary_loss_mlp": 0.00999029, + "balance_loss_clip": 0.99800378, + "balance_loss_mlp": 1.00302553, + "epoch": 0.796091988576582, + "flos": 62069440320000.0, + "grad_norm": 0.8878157964624488, + "language_loss": 0.58645731, + "learning_rate": 4.2042307830287526e-07, + "loss": 0.60667706, + "num_input_tokens_seen": 285735710, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.19921875, + "step": 13241, + "time_per_iteration": 2.843022108078003 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01028324, + "balance_loss_clip": 1.01764321, + "balance_loss_mlp": 1.03436446, + "epoch": 0.7961521118292499, + "flos": 39020103400320.0, + "grad_norm": 1.8253771110110306, + "language_loss": 0.64276886, + "learning_rate": 4.201842205128772e-07, + "loss": 0.66404617, + "num_input_tokens_seen": 285757045, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13242, + "time_per_iteration": 2.689807653427124 + }, + { + "auxiliary_loss_clip": 0.01100947, + "auxiliary_loss_mlp": 0.01031927, + "balance_loss_clip": 1.01986313, + "balance_loss_mlp": 1.03429365, + "epoch": 0.796212235081918, + "flos": 21762836426880.0, + "grad_norm": 2.1932509816632235, + "language_loss": 0.75971556, + "learning_rate": 4.199454226296526e-07, + "loss": 0.78104436, + "num_input_tokens_seen": 285776050, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 13243, + "time_per_iteration": 2.48710298538208 + }, + { + "auxiliary_loss_clip": 0.01101282, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.01662683, + "balance_loss_mlp": 1.03448629, + "epoch": 0.7962723583345859, + "flos": 21178857110400.0, + "grad_norm": 1.748658102628615, + "language_loss": 0.7998516, + "learning_rate": 4.1970668466225565e-07, + "loss": 0.8211475, + "num_input_tokens_seen": 285796830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 13244, + "time_per_iteration": 2.475694179534912 + }, + { + "auxiliary_loss_clip": 0.01103079, + "auxiliary_loss_mlp": 0.01028494, + "balance_loss_clip": 1.01651323, + "balance_loss_mlp": 1.03422666, + "epoch": 0.7963324815872539, + "flos": 17128636369920.0, + "grad_norm": 1.9995558497633756, + "language_loss": 0.67953658, + "learning_rate": 4.1946800661973934e-07, + "loss": 0.70085227, + "num_input_tokens_seen": 285814755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 13245, + "time_per_iteration": 2.4532089233398438 + }, + { + "auxiliary_loss_clip": 0.01101276, + "auxiliary_loss_mlp": 0.01031868, + "balance_loss_clip": 1.02030444, + "balance_loss_mlp": 1.03515017, + "epoch": 0.7963926048399218, + "flos": 21397481239680.0, + "grad_norm": 1.3612442472486292, + "language_loss": 0.78971922, + "learning_rate": 4.192293885111549e-07, + "loss": 0.81105065, + "num_input_tokens_seen": 285834255, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13246, + "time_per_iteration": 2.440587282180786 + }, + { + "auxiliary_loss_clip": 0.01101796, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.0149411, + "balance_loss_mlp": 1.0336647, + "epoch": 0.7964527280925898, + "flos": 25184188828800.0, + "grad_norm": 1.6847390016039745, + "language_loss": 0.66190958, + "learning_rate": 4.1899083034555007e-07, + "loss": 0.68319428, + "num_input_tokens_seen": 285853540, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13247, + "time_per_iteration": 2.487718343734741 + }, + { + "auxiliary_loss_clip": 0.01097373, + "auxiliary_loss_mlp": 0.01028984, + "balance_loss_clip": 1.01829112, + "balance_loss_mlp": 1.03314734, + "epoch": 0.7965128513452577, + "flos": 27015884928000.0, + "grad_norm": 2.8639636552336234, + "language_loss": 0.71457285, + "learning_rate": 4.1875233213197123e-07, + "loss": 0.73583645, + "num_input_tokens_seen": 285872705, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 13248, + "time_per_iteration": 2.474893093109131 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01695275, + "balance_loss_mlp": 1.03439724, + "epoch": 0.7965729745979258, + "flos": 24419578993920.0, + "grad_norm": 2.0200427415060416, + "language_loss": 0.7616542, + "learning_rate": 4.1851389387946255e-07, + "loss": 0.78296602, + "num_input_tokens_seen": 285890290, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13249, + "time_per_iteration": 2.48595929145813 + }, + { + "auxiliary_loss_clip": 0.01099446, + "auxiliary_loss_mlp": 0.01031046, + "balance_loss_clip": 1.01975679, + "balance_loss_mlp": 1.03507221, + "epoch": 0.7966330978505937, + "flos": 18840389978880.0, + "grad_norm": 2.126182284443467, + "language_loss": 0.61335742, + "learning_rate": 4.1827551559706674e-07, + "loss": 0.63466233, + "num_input_tokens_seen": 285909190, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13250, + "time_per_iteration": 2.4277217388153076 + }, + { + "auxiliary_loss_clip": 0.01101542, + "auxiliary_loss_mlp": 0.01025247, + "balance_loss_clip": 1.01375568, + "balance_loss_mlp": 1.03543615, + "epoch": 0.7966932211032617, + "flos": 13152319862400.0, + "grad_norm": 2.322541545659239, + "language_loss": 0.72526091, + "learning_rate": 4.180371972938206e-07, + "loss": 0.7465288, + "num_input_tokens_seen": 285927570, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 13251, + "time_per_iteration": 2.4575724601745605 + }, + { + "auxiliary_loss_clip": 0.01103859, + "auxiliary_loss_mlp": 0.01027801, + "balance_loss_clip": 1.0152247, + "balance_loss_mlp": 1.03521776, + "epoch": 0.7967533443559297, + "flos": 23949760078080.0, + "grad_norm": 1.8469414013396577, + "language_loss": 0.72915018, + "learning_rate": 4.177989389787624e-07, + "loss": 0.75046682, + "num_input_tokens_seen": 285945810, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 13252, + "time_per_iteration": 2.4559550285339355 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01027003, + "balance_loss_clip": 1.01549911, + "balance_loss_mlp": 1.03332281, + "epoch": 0.7968134676085976, + "flos": 30368791964160.0, + "grad_norm": 1.6873706589511155, + "language_loss": 0.66239917, + "learning_rate": 4.175607406609278e-07, + "loss": 0.68363321, + "num_input_tokens_seen": 285964235, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13253, + "time_per_iteration": 2.538630962371826 + }, + { + "auxiliary_loss_clip": 0.01105101, + "auxiliary_loss_mlp": 0.01035714, + "balance_loss_clip": 1.02353644, + "balance_loss_mlp": 1.03758895, + "epoch": 0.7968735908612656, + "flos": 23075048079360.0, + "grad_norm": 1.5649254820848235, + "language_loss": 0.67826599, + "learning_rate": 4.1732260234934767e-07, + "loss": 0.69967413, + "num_input_tokens_seen": 285983710, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 13254, + "time_per_iteration": 2.4423506259918213 + }, + { + "auxiliary_loss_clip": 0.01098639, + "auxiliary_loss_mlp": 0.01035877, + "balance_loss_clip": 1.02485597, + "balance_loss_mlp": 1.03396869, + "epoch": 0.7969337141139335, + "flos": 23582250074880.0, + "grad_norm": 1.8439634807377834, + "language_loss": 0.69335532, + "learning_rate": 4.1708452405305314e-07, + "loss": 0.71470052, + "num_input_tokens_seen": 286003425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13255, + "time_per_iteration": 2.4770302772521973 + }, + { + "auxiliary_loss_clip": 0.01098301, + "auxiliary_loss_mlp": 0.0103042, + "balance_loss_clip": 1.01932132, + "balance_loss_mlp": 1.03357673, + "epoch": 0.7969938373666016, + "flos": 19755860935680.0, + "grad_norm": 2.0839299199597576, + "language_loss": 0.79384631, + "learning_rate": 4.168465057810733e-07, + "loss": 0.81513351, + "num_input_tokens_seen": 286020130, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13256, + "time_per_iteration": 2.435065507888794 + }, + { + "auxiliary_loss_clip": 0.01100559, + "auxiliary_loss_mlp": 0.01025356, + "balance_loss_clip": 1.01379228, + "balance_loss_mlp": 1.03476715, + "epoch": 0.7970539606192695, + "flos": 24134089697280.0, + "grad_norm": 1.7632548016359857, + "language_loss": 0.65341133, + "learning_rate": 4.166085475424315e-07, + "loss": 0.67467046, + "num_input_tokens_seen": 286040230, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13257, + "time_per_iteration": 2.4952993392944336 + }, + { + "auxiliary_loss_clip": 0.01106098, + "auxiliary_loss_mlp": 0.01033925, + "balance_loss_clip": 1.02211094, + "balance_loss_mlp": 1.03727162, + "epoch": 0.7971140838719375, + "flos": 17968622895360.0, + "grad_norm": 4.269740157114163, + "language_loss": 0.72265047, + "learning_rate": 4.163706493461523e-07, + "loss": 0.74405068, + "num_input_tokens_seen": 286059475, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 13258, + "time_per_iteration": 2.422609806060791 + }, + { + "auxiliary_loss_clip": 0.01103566, + "auxiliary_loss_mlp": 0.01030154, + "balance_loss_clip": 1.01795912, + "balance_loss_mlp": 1.03580558, + "epoch": 0.7971742071246054, + "flos": 19169547235200.0, + "grad_norm": 1.7787889345265135, + "language_loss": 0.68876815, + "learning_rate": 4.1613281120125655e-07, + "loss": 0.7101053, + "num_input_tokens_seen": 286077820, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 13259, + "time_per_iteration": 2.454787015914917 + }, + { + "auxiliary_loss_clip": 0.01098869, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01853442, + "balance_loss_mlp": 1.03478527, + "epoch": 0.7972343303772734, + "flos": 27125951178240.0, + "grad_norm": 1.6665251005798685, + "language_loss": 0.73773205, + "learning_rate": 4.158950331167641e-07, + "loss": 0.75901318, + "num_input_tokens_seen": 286097285, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13260, + "time_per_iteration": 2.491205930709839 + }, + { + "auxiliary_loss_clip": 0.01097155, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.01763427, + "balance_loss_mlp": 1.03306603, + "epoch": 0.7972944536299413, + "flos": 20996646393600.0, + "grad_norm": 1.7740121958206554, + "language_loss": 0.78436148, + "learning_rate": 4.1565731510169065e-07, + "loss": 0.80561745, + "num_input_tokens_seen": 286116000, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13261, + "time_per_iteration": 3.8501453399658203 + }, + { + "auxiliary_loss_clip": 0.01097148, + "auxiliary_loss_mlp": 0.01030088, + "balance_loss_clip": 1.02008629, + "balance_loss_mlp": 1.035748, + "epoch": 0.7973545768826094, + "flos": 21580015178880.0, + "grad_norm": 1.4361813599632072, + "language_loss": 0.75999635, + "learning_rate": 4.154196571650501e-07, + "loss": 0.78126872, + "num_input_tokens_seen": 286135110, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.61328125, + "step": 13262, + "time_per_iteration": 2.4577090740203857 + }, + { + "auxiliary_loss_clip": 0.01107624, + "auxiliary_loss_mlp": 0.0102757, + "balance_loss_clip": 1.01496387, + "balance_loss_mlp": 1.03712356, + "epoch": 0.7974147001352773, + "flos": 20558536208640.0, + "grad_norm": 2.1431092546500103, + "language_loss": 0.7052893, + "learning_rate": 4.1518205931585524e-07, + "loss": 0.7266413, + "num_input_tokens_seen": 286152835, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.703125, + "step": 13263, + "time_per_iteration": 3.8635799884796143 + }, + { + "auxiliary_loss_clip": 0.01106881, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.021523, + "balance_loss_mlp": 1.03610015, + "epoch": 0.7974748233879453, + "flos": 20996790048000.0, + "grad_norm": 1.9663243641140786, + "language_loss": 0.71254778, + "learning_rate": 4.149445215631153e-07, + "loss": 0.73395979, + "num_input_tokens_seen": 286171785, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.70703125, + "step": 13264, + "time_per_iteration": 3.8191962242126465 + }, + { + "auxiliary_loss_clip": 0.01098223, + "auxiliary_loss_mlp": 0.01033387, + "balance_loss_clip": 1.0220921, + "balance_loss_mlp": 1.03471494, + "epoch": 0.7975349466406133, + "flos": 22565188477440.0, + "grad_norm": 1.6219090858782177, + "language_loss": 0.76819849, + "learning_rate": 4.1470704391583776e-07, + "loss": 0.78951454, + "num_input_tokens_seen": 286190420, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 13265, + "time_per_iteration": 2.4498677253723145 + }, + { + "auxiliary_loss_clip": 0.0110135, + "auxiliary_loss_mlp": 0.01027942, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03420353, + "epoch": 0.7975950698932812, + "flos": 21689542725120.0, + "grad_norm": 1.8502756325316978, + "language_loss": 0.75627744, + "learning_rate": 4.144696263830285e-07, + "loss": 0.77757037, + "num_input_tokens_seen": 286210105, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 13266, + "time_per_iteration": 2.4424939155578613 + }, + { + "auxiliary_loss_clip": 0.01097761, + "auxiliary_loss_mlp": 0.01027384, + "balance_loss_clip": 1.01613104, + "balance_loss_mlp": 1.03291893, + "epoch": 0.7976551931459492, + "flos": 19604568850560.0, + "grad_norm": 1.5381451690373484, + "language_loss": 0.83917278, + "learning_rate": 4.1423226897369015e-07, + "loss": 0.86042428, + "num_input_tokens_seen": 286228180, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13267, + "time_per_iteration": 4.030280113220215 + }, + { + "auxiliary_loss_clip": 0.01099973, + "auxiliary_loss_mlp": 0.01032115, + "balance_loss_clip": 1.02045047, + "balance_loss_mlp": 1.03457189, + "epoch": 0.7977153163986171, + "flos": 21687603390720.0, + "grad_norm": 1.7026811780981197, + "language_loss": 0.75749743, + "learning_rate": 4.139949716968223e-07, + "loss": 0.77881831, + "num_input_tokens_seen": 286247305, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13268, + "time_per_iteration": 2.4395506381988525 + }, + { + "auxiliary_loss_clip": 0.01101025, + "auxiliary_loss_mlp": 0.01027573, + "balance_loss_clip": 1.01612282, + "balance_loss_mlp": 1.0355351, + "epoch": 0.7977754396512852, + "flos": 23476780765440.0, + "grad_norm": 1.5399567563780987, + "language_loss": 0.77794158, + "learning_rate": 4.1375773456142403e-07, + "loss": 0.7992276, + "num_input_tokens_seen": 286268145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13269, + "time_per_iteration": 2.4894964694976807 + }, + { + "auxiliary_loss_clip": 0.01096838, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.01972592, + "balance_loss_mlp": 1.03359246, + "epoch": 0.7978355629039531, + "flos": 22382223575040.0, + "grad_norm": 1.7083868858848195, + "language_loss": 0.82055652, + "learning_rate": 4.135205575764922e-07, + "loss": 0.84182805, + "num_input_tokens_seen": 286286775, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13270, + "time_per_iteration": 2.4561750888824463 + }, + { + "auxiliary_loss_clip": 0.01101524, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.0165925, + "balance_loss_mlp": 1.03613377, + "epoch": 0.7978956861566211, + "flos": 20266331068800.0, + "grad_norm": 1.6705229084811413, + "language_loss": 0.595366, + "learning_rate": 4.1328344075101905e-07, + "loss": 0.61666214, + "num_input_tokens_seen": 286305590, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13271, + "time_per_iteration": 2.4954357147216797 + }, + { + "auxiliary_loss_clip": 0.01104573, + "auxiliary_loss_mlp": 0.01031498, + "balance_loss_clip": 1.01961303, + "balance_loss_mlp": 1.03635263, + "epoch": 0.797955809409289, + "flos": 28112417366400.0, + "grad_norm": 1.5850933882113063, + "language_loss": 0.73206866, + "learning_rate": 4.130463840939975e-07, + "loss": 0.75342935, + "num_input_tokens_seen": 286328050, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 13272, + "time_per_iteration": 2.509640693664551 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.01630759, + "balance_loss_mlp": 1.03711224, + "epoch": 0.798015932661957, + "flos": 15559591495680.0, + "grad_norm": 2.009910797942707, + "language_loss": 0.71586084, + "learning_rate": 4.128093876144161e-07, + "loss": 0.73716193, + "num_input_tokens_seen": 286345265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13273, + "time_per_iteration": 2.488239049911499 + }, + { + "auxiliary_loss_clip": 0.01103696, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01880777, + "balance_loss_mlp": 1.03583157, + "epoch": 0.7980760559146249, + "flos": 23951196622080.0, + "grad_norm": 1.887561029731992, + "language_loss": 0.7577731, + "learning_rate": 4.1257245132126117e-07, + "loss": 0.77911627, + "num_input_tokens_seen": 286364465, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 13274, + "time_per_iteration": 2.462188482284546 + }, + { + "auxiliary_loss_clip": 0.01095507, + "auxiliary_loss_mlp": 0.01028761, + "balance_loss_clip": 1.01827025, + "balance_loss_mlp": 1.03438878, + "epoch": 0.798136179167293, + "flos": 28038082170240.0, + "grad_norm": 1.3212411504254853, + "language_loss": 0.77607358, + "learning_rate": 4.12335575223518e-07, + "loss": 0.79731625, + "num_input_tokens_seen": 286385565, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.61328125, + "step": 13275, + "time_per_iteration": 2.514090061187744 + }, + { + "auxiliary_loss_clip": 0.0110285, + "auxiliary_loss_mlp": 0.01031831, + "balance_loss_clip": 1.01921892, + "balance_loss_mlp": 1.0345515, + "epoch": 0.7981963024199609, + "flos": 35984538046080.0, + "grad_norm": 1.829043802525264, + "language_loss": 0.64052433, + "learning_rate": 4.1209875933016877e-07, + "loss": 0.66187114, + "num_input_tokens_seen": 286403950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 13276, + "time_per_iteration": 2.5371670722961426 + }, + { + "auxiliary_loss_clip": 0.01097788, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.01740062, + "balance_loss_mlp": 1.03425479, + "epoch": 0.7982564256726289, + "flos": 25884914325120.0, + "grad_norm": 1.5295363534394828, + "language_loss": 0.60448158, + "learning_rate": 4.118620036501945e-07, + "loss": 0.62574387, + "num_input_tokens_seen": 286426160, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 13277, + "time_per_iteration": 2.4880197048187256 + }, + { + "auxiliary_loss_clip": 0.01105128, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01911783, + "balance_loss_mlp": 1.0375464, + "epoch": 0.7983165489252969, + "flos": 25739152934400.0, + "grad_norm": 2.008631814369184, + "language_loss": 0.79715037, + "learning_rate": 4.1162530819257227e-07, + "loss": 0.81850541, + "num_input_tokens_seen": 286446610, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 13278, + "time_per_iteration": 2.4780664443969727 + }, + { + "auxiliary_loss_clip": 0.01103768, + "auxiliary_loss_mlp": 0.01037367, + "balance_loss_clip": 1.02485633, + "balance_loss_mlp": 1.03559947, + "epoch": 0.7983766721779648, + "flos": 21908202768000.0, + "grad_norm": 1.9965492403610876, + "language_loss": 0.6323722, + "learning_rate": 4.113886729662768e-07, + "loss": 0.65378356, + "num_input_tokens_seen": 286465460, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13279, + "time_per_iteration": 2.4683034420013428 + }, + { + "auxiliary_loss_clip": 0.01095285, + "auxiliary_loss_mlp": 0.01026468, + "balance_loss_clip": 1.01601911, + "balance_loss_mlp": 1.03389192, + "epoch": 0.7984367954306328, + "flos": 29347420734720.0, + "grad_norm": 1.6504787755208947, + "language_loss": 0.70773625, + "learning_rate": 4.111520979802825e-07, + "loss": 0.72895384, + "num_input_tokens_seen": 286485720, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.61328125, + "step": 13280, + "time_per_iteration": 2.4923903942108154 + }, + { + "auxiliary_loss_clip": 0.01103118, + "auxiliary_loss_mlp": 0.01031209, + "balance_loss_clip": 1.01919234, + "balance_loss_mlp": 1.03547907, + "epoch": 0.7984969186833007, + "flos": 31357772104320.0, + "grad_norm": 1.6234618647236767, + "language_loss": 0.62751859, + "learning_rate": 4.1091558324355955e-07, + "loss": 0.64886189, + "num_input_tokens_seen": 286507465, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13281, + "time_per_iteration": 2.5414252281188965 + }, + { + "auxiliary_loss_clip": 0.01104951, + "auxiliary_loss_mlp": 0.01033082, + "balance_loss_clip": 1.02107787, + "balance_loss_mlp": 1.03481054, + "epoch": 0.7985570419359688, + "flos": 24312924535680.0, + "grad_norm": 1.7029379552600752, + "language_loss": 0.80491292, + "learning_rate": 4.1067912876507683e-07, + "loss": 0.82629329, + "num_input_tokens_seen": 286526345, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.703125, + "step": 13282, + "time_per_iteration": 2.4520959854125977 + }, + { + "auxiliary_loss_clip": 0.01101884, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01801062, + "balance_loss_mlp": 1.03421319, + "epoch": 0.7986171651886367, + "flos": 15742233175680.0, + "grad_norm": 1.8947522031030082, + "language_loss": 0.7154727, + "learning_rate": 4.10442734553802e-07, + "loss": 0.73679316, + "num_input_tokens_seen": 286544095, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 13283, + "time_per_iteration": 2.4246160984039307 + }, + { + "auxiliary_loss_clip": 0.01098743, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01833189, + "balance_loss_mlp": 1.03302252, + "epoch": 0.7986772884413047, + "flos": 11619401091840.0, + "grad_norm": 1.8968441964994822, + "language_loss": 0.7347362, + "learning_rate": 4.102064006186967e-07, + "loss": 0.75601751, + "num_input_tokens_seen": 286560960, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13284, + "time_per_iteration": 2.402165174484253 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01030516, + "balance_loss_clip": 1.02016854, + "balance_loss_mlp": 1.03526652, + "epoch": 0.7987374116939726, + "flos": 22091059929600.0, + "grad_norm": 1.5742258488227296, + "language_loss": 0.70226932, + "learning_rate": 4.0997012696872415e-07, + "loss": 0.72357547, + "num_input_tokens_seen": 286579865, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 13285, + "time_per_iteration": 2.5729713439941406 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01027172, + "balance_loss_clip": 1.01627612, + "balance_loss_mlp": 1.03425968, + "epoch": 0.7987975349466406, + "flos": 17890696339200.0, + "grad_norm": 1.5373042942121937, + "language_loss": 0.73492497, + "learning_rate": 4.097339136128437e-07, + "loss": 0.75618953, + "num_input_tokens_seen": 286597295, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 13286, + "time_per_iteration": 2.435335874557495 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.02102304, + "balance_loss_mlp": 1.03493309, + "epoch": 0.7988576581993085, + "flos": 19719232041600.0, + "grad_norm": 2.2146294120164876, + "language_loss": 0.74433863, + "learning_rate": 4.0949776056001296e-07, + "loss": 0.76566875, + "num_input_tokens_seen": 286616270, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13287, + "time_per_iteration": 2.4583966732025146 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.0171752, + "balance_loss_mlp": 1.03461981, + "epoch": 0.7989177814519766, + "flos": 28036358317440.0, + "grad_norm": 2.4603095156491457, + "language_loss": 0.61630833, + "learning_rate": 4.092616678191863e-07, + "loss": 0.63758349, + "num_input_tokens_seen": 286638315, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 13288, + "time_per_iteration": 2.5024874210357666 + }, + { + "auxiliary_loss_clip": 0.01100282, + "auxiliary_loss_mlp": 0.0102498, + "balance_loss_clip": 1.0142808, + "balance_loss_mlp": 1.03618968, + "epoch": 0.7989779047046445, + "flos": 28871029630080.0, + "grad_norm": 2.398551145532932, + "language_loss": 0.70419228, + "learning_rate": 4.090256353993169e-07, + "loss": 0.72544491, + "num_input_tokens_seen": 286658630, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 13289, + "time_per_iteration": 2.5227341651916504 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.01975262, + "balance_loss_mlp": 1.03570962, + "epoch": 0.7990380279573125, + "flos": 18186887888640.0, + "grad_norm": 3.476010785150094, + "language_loss": 0.62750173, + "learning_rate": 4.0878966330935506e-07, + "loss": 0.64881819, + "num_input_tokens_seen": 286676870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.63671875, + "step": 13290, + "time_per_iteration": 2.413945436477661 + }, + { + "auxiliary_loss_clip": 0.01103234, + "auxiliary_loss_mlp": 0.0102787, + "balance_loss_clip": 1.0152936, + "balance_loss_mlp": 1.03642523, + "epoch": 0.7990981512099805, + "flos": 20879936127360.0, + "grad_norm": 3.9151132007409513, + "language_loss": 0.71637499, + "learning_rate": 4.08553751558248e-07, + "loss": 0.73768604, + "num_input_tokens_seen": 286694300, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 13291, + "time_per_iteration": 2.4885568618774414 + }, + { + "auxiliary_loss_clip": 0.01097167, + "auxiliary_loss_mlp": 0.01025771, + "balance_loss_clip": 1.01506627, + "balance_loss_mlp": 1.03270483, + "epoch": 0.7991582744626484, + "flos": 26099911180800.0, + "grad_norm": 1.5485118073746154, + "language_loss": 0.6335237, + "learning_rate": 4.083179001549422e-07, + "loss": 0.65475303, + "num_input_tokens_seen": 286714545, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13292, + "time_per_iteration": 2.4616239070892334 + }, + { + "auxiliary_loss_clip": 0.0109979, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.0185678, + "balance_loss_mlp": 1.0349102, + "epoch": 0.7992183977153164, + "flos": 35295843605760.0, + "grad_norm": 1.555240655733236, + "language_loss": 0.56249213, + "learning_rate": 4.0808210910838105e-07, + "loss": 0.58378512, + "num_input_tokens_seen": 286734525, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13293, + "time_per_iteration": 2.5668938159942627 + }, + { + "auxiliary_loss_clip": 0.01101281, + "auxiliary_loss_mlp": 0.01032816, + "balance_loss_clip": 1.02145505, + "balance_loss_mlp": 1.03606391, + "epoch": 0.7992785209679844, + "flos": 51853426577280.0, + "grad_norm": 2.987312394872763, + "language_loss": 0.71444452, + "learning_rate": 4.0784637842750704e-07, + "loss": 0.73578554, + "num_input_tokens_seen": 286753430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13294, + "time_per_iteration": 2.693946361541748 + }, + { + "auxiliary_loss_clip": 0.01101257, + "auxiliary_loss_mlp": 0.01030873, + "balance_loss_clip": 1.0196135, + "balance_loss_mlp": 1.03554058, + "epoch": 0.7993386442206524, + "flos": 22565116650240.0, + "grad_norm": 1.7329593206167035, + "language_loss": 0.72202832, + "learning_rate": 4.0761070812125675e-07, + "loss": 0.74334961, + "num_input_tokens_seen": 286771915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13295, + "time_per_iteration": 2.440544605255127 + }, + { + "auxiliary_loss_clip": 0.01096658, + "auxiliary_loss_mlp": 0.01030621, + "balance_loss_clip": 1.01995802, + "balance_loss_mlp": 1.03398633, + "epoch": 0.7993987674733203, + "flos": 18800277465600.0, + "grad_norm": 2.1702200839393395, + "language_loss": 0.76480281, + "learning_rate": 4.0737509819856797e-07, + "loss": 0.78607565, + "num_input_tokens_seen": 286789835, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 13296, + "time_per_iteration": 2.4405605792999268 + }, + { + "auxiliary_loss_clip": 0.01023152, + "auxiliary_loss_mlp": 0.01002637, + "balance_loss_clip": 1.001755, + "balance_loss_mlp": 1.00325012, + "epoch": 0.7994588907259883, + "flos": 69421720394880.0, + "grad_norm": 0.6861737124330846, + "language_loss": 0.60802543, + "learning_rate": 4.0713954866837573e-07, + "loss": 0.62828332, + "num_input_tokens_seen": 286855580, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.19921875, + "step": 13297, + "time_per_iteration": 3.11775541305542 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01029569, + "balance_loss_clip": 1.01829767, + "balance_loss_mlp": 1.03441751, + "epoch": 0.7995190139786562, + "flos": 13480327883520.0, + "grad_norm": 1.8885665209520346, + "language_loss": 0.70239675, + "learning_rate": 4.0690405953961073e-07, + "loss": 0.72368801, + "num_input_tokens_seen": 286874360, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 13298, + "time_per_iteration": 2.4225876331329346 + }, + { + "auxiliary_loss_clip": 0.01103672, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.02003491, + "balance_loss_mlp": 1.0351696, + "epoch": 0.7995791372313242, + "flos": 21652842003840.0, + "grad_norm": 1.9550250872317747, + "language_loss": 0.75762308, + "learning_rate": 4.066686308212037e-07, + "loss": 0.77898747, + "num_input_tokens_seen": 286891950, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 13299, + "time_per_iteration": 2.4788718223571777 + }, + { + "auxiliary_loss_clip": 0.01098072, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.01662207, + "balance_loss_mlp": 1.03498912, + "epoch": 0.7996392604839921, + "flos": 26068130622720.0, + "grad_norm": 1.7719100438283584, + "language_loss": 0.77760887, + "learning_rate": 4.064332625220828e-07, + "loss": 0.79886186, + "num_input_tokens_seen": 286911725, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 13300, + "time_per_iteration": 2.4796881675720215 + }, + { + "auxiliary_loss_clip": 0.01101744, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.01584315, + "balance_loss_mlp": 1.03424072, + "epoch": 0.7996993837366602, + "flos": 24606889441920.0, + "grad_norm": 2.002040406516657, + "language_loss": 0.63432777, + "learning_rate": 4.0619795465117115e-07, + "loss": 0.65562272, + "num_input_tokens_seen": 286931400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 13301, + "time_per_iteration": 2.4858558177948 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01033841, + "balance_loss_clip": 1.02241445, + "balance_loss_mlp": 1.0356431, + "epoch": 0.7997595069893281, + "flos": 20992049452800.0, + "grad_norm": 1.6279257021355094, + "language_loss": 0.71502745, + "learning_rate": 4.059627072173928e-07, + "loss": 0.73635173, + "num_input_tokens_seen": 286949795, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 13302, + "time_per_iteration": 2.4388864040374756 + }, + { + "auxiliary_loss_clip": 0.01104001, + "auxiliary_loss_mlp": 0.01030712, + "balance_loss_clip": 1.01885068, + "balance_loss_mlp": 1.03659701, + "epoch": 0.7998196302419961, + "flos": 24426510318720.0, + "grad_norm": 1.833344875316907, + "language_loss": 0.83622801, + "learning_rate": 4.057275202296684e-07, + "loss": 0.85757518, + "num_input_tokens_seen": 286968805, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13303, + "time_per_iteration": 3.86017107963562 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01030765, + "balance_loss_clip": 1.02020907, + "balance_loss_mlp": 1.03435075, + "epoch": 0.7998797534946641, + "flos": 30264651457920.0, + "grad_norm": 1.6100512541022713, + "language_loss": 0.5873881, + "learning_rate": 4.054923936969166e-07, + "loss": 0.60867614, + "num_input_tokens_seen": 286990235, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 13304, + "time_per_iteration": 2.5343167781829834 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.01525831, + "balance_loss_mlp": 1.03274465, + "epoch": 0.799939876747332, + "flos": 23513984277120.0, + "grad_norm": 1.7664004546927765, + "language_loss": 0.69075799, + "learning_rate": 4.0525732762805265e-07, + "loss": 0.71204102, + "num_input_tokens_seen": 287011060, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 13305, + "time_per_iteration": 3.846991777420044 + }, + { + "auxiliary_loss_clip": 0.01097982, + "auxiliary_loss_mlp": 0.01026107, + "balance_loss_clip": 1.0154916, + "balance_loss_mlp": 1.03421581, + "epoch": 0.8, + "flos": 19318109886720.0, + "grad_norm": 1.5410766724401597, + "language_loss": 0.6923117, + "learning_rate": 4.0502232203199107e-07, + "loss": 0.71355259, + "num_input_tokens_seen": 287029215, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13306, + "time_per_iteration": 3.824300527572632 + }, + { + "auxiliary_loss_clip": 0.01101652, + "auxiliary_loss_mlp": 0.01033892, + "balance_loss_clip": 1.02248406, + "balance_loss_mlp": 1.03599465, + "epoch": 0.800060123252668, + "flos": 32412432263040.0, + "grad_norm": 1.5349326427116308, + "language_loss": 0.69361722, + "learning_rate": 4.0478737691764286e-07, + "loss": 0.71497267, + "num_input_tokens_seen": 287050855, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13307, + "time_per_iteration": 2.641338348388672 + }, + { + "auxiliary_loss_clip": 0.01100663, + "auxiliary_loss_mlp": 0.01029461, + "balance_loss_clip": 1.01857102, + "balance_loss_mlp": 1.03444958, + "epoch": 0.800120246505336, + "flos": 20010611168640.0, + "grad_norm": 1.9021997746458712, + "language_loss": 0.76933712, + "learning_rate": 4.0455249229391677e-07, + "loss": 0.79063845, + "num_input_tokens_seen": 287069915, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 13308, + "time_per_iteration": 2.449411630630493 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01708674, + "balance_loss_mlp": 1.03450274, + "epoch": 0.8001803697580039, + "flos": 31868278151040.0, + "grad_norm": 1.9120896372435958, + "language_loss": 0.78702182, + "learning_rate": 4.0431766816972e-07, + "loss": 0.80834526, + "num_input_tokens_seen": 287091450, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 13309, + "time_per_iteration": 4.006925106048584 + }, + { + "auxiliary_loss_clip": 0.01023336, + "auxiliary_loss_mlp": 0.0100397, + "balance_loss_clip": 1.00295115, + "balance_loss_mlp": 1.00320923, + "epoch": 0.8002404930106719, + "flos": 63392066916480.0, + "grad_norm": 0.9338382930636256, + "language_loss": 0.64702326, + "learning_rate": 4.040829045539571e-07, + "loss": 0.66729629, + "num_input_tokens_seen": 287148365, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 13310, + "time_per_iteration": 2.975738525390625 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.01034343, + "balance_loss_clip": 1.02319705, + "balance_loss_mlp": 1.03539252, + "epoch": 0.8003006162633398, + "flos": 27855476403840.0, + "grad_norm": 4.652395781854749, + "language_loss": 0.82905459, + "learning_rate": 4.0384820145553156e-07, + "loss": 0.85040295, + "num_input_tokens_seen": 287168280, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13311, + "time_per_iteration": 2.5010745525360107 + }, + { + "auxiliary_loss_clip": 0.01100391, + "auxiliary_loss_mlp": 0.0103207, + "balance_loss_clip": 1.0205307, + "balance_loss_mlp": 1.03499472, + "epoch": 0.8003607395160078, + "flos": 18223337214720.0, + "grad_norm": 2.136696844503174, + "language_loss": 0.6653198, + "learning_rate": 4.0361355888334116e-07, + "loss": 0.68664443, + "num_input_tokens_seen": 287185980, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13312, + "time_per_iteration": 2.413475275039673 + }, + { + "auxiliary_loss_clip": 0.01104828, + "auxiliary_loss_mlp": 0.01030863, + "balance_loss_clip": 1.01855493, + "balance_loss_mlp": 1.03699255, + "epoch": 0.8004208627686757, + "flos": 20886975192960.0, + "grad_norm": 1.6185384671425953, + "language_loss": 0.75226915, + "learning_rate": 4.033789768462843e-07, + "loss": 0.77362603, + "num_input_tokens_seen": 287203875, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 13313, + "time_per_iteration": 2.4858338832855225 + }, + { + "auxiliary_loss_clip": 0.01099877, + "auxiliary_loss_mlp": 0.01029079, + "balance_loss_clip": 1.0179745, + "balance_loss_mlp": 1.03416753, + "epoch": 0.8004809860213438, + "flos": 26436143416320.0, + "grad_norm": 1.3383617423886183, + "language_loss": 0.75627804, + "learning_rate": 4.031444553532575e-07, + "loss": 0.77756763, + "num_input_tokens_seen": 287226445, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13314, + "time_per_iteration": 2.544503927230835 + }, + { + "auxiliary_loss_clip": 0.01023163, + "auxiliary_loss_mlp": 0.00998551, + "balance_loss_clip": 0.99763316, + "balance_loss_mlp": 1.00305307, + "epoch": 0.8005411092740117, + "flos": 63648612829440.0, + "grad_norm": 0.8171555714712136, + "language_loss": 0.53831571, + "learning_rate": 4.029099944131522e-07, + "loss": 0.55853283, + "num_input_tokens_seen": 287286240, + "router_z_loss_clip": 0.00915527, + "router_z_loss_mlp": 0.20117188, + "step": 13315, + "time_per_iteration": 2.9481279850006104 + }, + { + "auxiliary_loss_clip": 0.01099698, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.0170964, + "balance_loss_mlp": 1.03518033, + "epoch": 0.8006012325266797, + "flos": 36138056774400.0, + "grad_norm": 2.5476348031673157, + "language_loss": 0.71353263, + "learning_rate": 4.026755940348603e-07, + "loss": 0.73481071, + "num_input_tokens_seen": 287310265, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13316, + "time_per_iteration": 2.573376417160034 + }, + { + "auxiliary_loss_clip": 0.011043, + "auxiliary_loss_mlp": 0.01031222, + "balance_loss_clip": 1.01980805, + "balance_loss_mlp": 1.0358212, + "epoch": 0.8006613557793477, + "flos": 33838947970560.0, + "grad_norm": 1.707434135100754, + "language_loss": 0.64464766, + "learning_rate": 4.024412542272706e-07, + "loss": 0.66600287, + "num_input_tokens_seen": 287331610, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6875, + "step": 13317, + "time_per_iteration": 2.5424327850341797 + }, + { + "auxiliary_loss_clip": 0.01023338, + "auxiliary_loss_mlp": 0.00999, + "balance_loss_clip": 0.99798673, + "balance_loss_mlp": 1.00323308, + "epoch": 0.8007214790320156, + "flos": 67348310699520.0, + "grad_norm": 0.7645299039687239, + "language_loss": 0.59047085, + "learning_rate": 4.0220697499926783e-07, + "loss": 0.61069423, + "num_input_tokens_seen": 287394795, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20117188, + "step": 13318, + "time_per_iteration": 3.1595919132232666 + }, + { + "auxiliary_loss_clip": 0.01097271, + "auxiliary_loss_mlp": 0.01022623, + "balance_loss_clip": 1.0112803, + "balance_loss_mlp": 1.03338647, + "epoch": 0.8007816022846836, + "flos": 23185653033600.0, + "grad_norm": 1.5349381284748576, + "language_loss": 0.66329014, + "learning_rate": 4.019727563597366e-07, + "loss": 0.68448913, + "num_input_tokens_seen": 287414595, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13319, + "time_per_iteration": 2.4969546794891357 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01034852, + "balance_loss_clip": 1.02248383, + "balance_loss_mlp": 1.0337888, + "epoch": 0.8008417255373516, + "flos": 21981388728960.0, + "grad_norm": 1.7719196822913061, + "language_loss": 0.74056709, + "learning_rate": 4.0173859831755873e-07, + "loss": 0.76193058, + "num_input_tokens_seen": 287434395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 13320, + "time_per_iteration": 2.4454445838928223 + }, + { + "auxiliary_loss_clip": 0.01101301, + "auxiliary_loss_mlp": 0.01025658, + "balance_loss_clip": 1.01393962, + "balance_loss_mlp": 1.03455615, + "epoch": 0.8009018487900196, + "flos": 16727334647040.0, + "grad_norm": 2.1051785916089485, + "language_loss": 0.80298382, + "learning_rate": 4.015045008816138e-07, + "loss": 0.82425332, + "num_input_tokens_seen": 287450590, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 13321, + "time_per_iteration": 2.4167563915252686 + }, + { + "auxiliary_loss_clip": 0.01094949, + "auxiliary_loss_mlp": 0.01029655, + "balance_loss_clip": 1.01849723, + "balance_loss_mlp": 1.03163719, + "epoch": 0.8009619720426875, + "flos": 20813609664000.0, + "grad_norm": 1.9091600224392815, + "language_loss": 0.65907997, + "learning_rate": 4.0127046406077825e-07, + "loss": 0.68032598, + "num_input_tokens_seen": 287468455, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 13322, + "time_per_iteration": 2.4598209857940674 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.01025985, + "balance_loss_clip": 1.01448131, + "balance_loss_mlp": 1.03424549, + "epoch": 0.8010220952953555, + "flos": 17931096161280.0, + "grad_norm": 1.7799462276417908, + "language_loss": 0.78038085, + "learning_rate": 4.010364878639265e-07, + "loss": 0.80163848, + "num_input_tokens_seen": 287486485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13323, + "time_per_iteration": 2.4523849487304688 + }, + { + "auxiliary_loss_clip": 0.01100974, + "auxiliary_loss_mlp": 0.01026925, + "balance_loss_clip": 1.01565957, + "balance_loss_mlp": 1.03384399, + "epoch": 0.8010822185480234, + "flos": 24572235795840.0, + "grad_norm": 2.287441188670043, + "language_loss": 0.7207495, + "learning_rate": 4.00802572299932e-07, + "loss": 0.74202847, + "num_input_tokens_seen": 287503940, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 13324, + "time_per_iteration": 2.4756648540496826 + }, + { + "auxiliary_loss_clip": 0.01101897, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.01729989, + "balance_loss_mlp": 1.03378117, + "epoch": 0.8011423418006914, + "flos": 21829988903040.0, + "grad_norm": 1.7124158785666046, + "language_loss": 0.76591057, + "learning_rate": 4.005687173776635e-07, + "loss": 0.78722167, + "num_input_tokens_seen": 287521660, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13325, + "time_per_iteration": 2.4179391860961914 + }, + { + "auxiliary_loss_clip": 0.01093003, + "auxiliary_loss_mlp": 0.01024591, + "balance_loss_clip": 1.01423728, + "balance_loss_mlp": 1.03209794, + "epoch": 0.8012024650533593, + "flos": 23915178259200.0, + "grad_norm": 1.4883986752450347, + "language_loss": 0.79431766, + "learning_rate": 4.003349231059898e-07, + "loss": 0.81549358, + "num_input_tokens_seen": 287541505, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.609375, + "step": 13326, + "time_per_iteration": 2.5107691287994385 + }, + { + "auxiliary_loss_clip": 0.01098238, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.02084196, + "balance_loss_mlp": 1.03452349, + "epoch": 0.8012625883060274, + "flos": 23587062497280.0, + "grad_norm": 1.8420056555036817, + "language_loss": 0.66117686, + "learning_rate": 4.001011894937765e-07, + "loss": 0.68247426, + "num_input_tokens_seen": 287560015, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 13327, + "time_per_iteration": 2.4520139694213867 + }, + { + "auxiliary_loss_clip": 0.01095786, + "auxiliary_loss_mlp": 0.01026094, + "balance_loss_clip": 1.01527607, + "balance_loss_mlp": 1.03323984, + "epoch": 0.8013227115586953, + "flos": 20813932886400.0, + "grad_norm": 2.067352275023529, + "language_loss": 0.73374075, + "learning_rate": 3.9986751654988636e-07, + "loss": 0.75495958, + "num_input_tokens_seen": 287579150, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 13328, + "time_per_iteration": 2.4723997116088867 + }, + { + "auxiliary_loss_clip": 0.01101687, + "auxiliary_loss_mlp": 0.0103048, + "balance_loss_clip": 1.01789129, + "balance_loss_mlp": 1.03389966, + "epoch": 0.8013828348113633, + "flos": 15888317788800.0, + "grad_norm": 2.2657788983381573, + "language_loss": 0.73454827, + "learning_rate": 3.996339042831798e-07, + "loss": 0.75586998, + "num_input_tokens_seen": 287597420, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 13329, + "time_per_iteration": 2.3919262886047363 + }, + { + "auxiliary_loss_clip": 0.01023092, + "auxiliary_loss_mlp": 0.01000506, + "balance_loss_clip": 0.99962944, + "balance_loss_mlp": 1.00292683, + "epoch": 0.8014429580640313, + "flos": 71062981562880.0, + "grad_norm": 0.7030773083035402, + "language_loss": 0.52944195, + "learning_rate": 3.9940035270251605e-07, + "loss": 0.54967791, + "num_input_tokens_seen": 287667280, + "router_z_loss_clip": 0.00878906, + "router_z_loss_mlp": 0.20117188, + "step": 13330, + "time_per_iteration": 3.1469671726226807 + }, + { + "auxiliary_loss_clip": 0.0110212, + "auxiliary_loss_mlp": 0.01033988, + "balance_loss_clip": 1.02126801, + "balance_loss_mlp": 1.03404713, + "epoch": 0.8015030813166992, + "flos": 23076340968960.0, + "grad_norm": 1.7749874535506867, + "language_loss": 0.72585219, + "learning_rate": 3.991668618167519e-07, + "loss": 0.74721324, + "num_input_tokens_seen": 287687375, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 13331, + "time_per_iteration": 2.4748101234436035 + }, + { + "auxiliary_loss_clip": 0.01097861, + "auxiliary_loss_mlp": 0.01025208, + "balance_loss_clip": 1.01499796, + "balance_loss_mlp": 1.03351557, + "epoch": 0.8015632045693672, + "flos": 21872328059520.0, + "grad_norm": 1.8459945911210676, + "language_loss": 0.77300894, + "learning_rate": 3.989334316347401e-07, + "loss": 0.79423964, + "num_input_tokens_seen": 287707895, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.64453125, + "step": 13332, + "time_per_iteration": 2.531803846359253 + }, + { + "auxiliary_loss_clip": 0.01100359, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01504815, + "balance_loss_mlp": 1.03493059, + "epoch": 0.8016233278220352, + "flos": 23656728925440.0, + "grad_norm": 2.098330097489523, + "language_loss": 0.83255255, + "learning_rate": 3.987000621653338e-07, + "loss": 0.85382015, + "num_input_tokens_seen": 287723990, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13333, + "time_per_iteration": 2.463542938232422 + }, + { + "auxiliary_loss_clip": 0.01099481, + "auxiliary_loss_mlp": 0.01025625, + "balance_loss_clip": 1.01358485, + "balance_loss_mlp": 1.03295958, + "epoch": 0.8016834510747032, + "flos": 16253170185600.0, + "grad_norm": 1.517795063726895, + "language_loss": 0.73388004, + "learning_rate": 3.9846675341738133e-07, + "loss": 0.75513119, + "num_input_tokens_seen": 287742380, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 13334, + "time_per_iteration": 2.433274984359741 + }, + { + "auxiliary_loss_clip": 0.01098378, + "auxiliary_loss_mlp": 0.01026638, + "balance_loss_clip": 1.01558757, + "balance_loss_mlp": 1.03512001, + "epoch": 0.8017435743273711, + "flos": 12276027665280.0, + "grad_norm": 1.9072349752607467, + "language_loss": 0.7468729, + "learning_rate": 3.9823350539972967e-07, + "loss": 0.76812309, + "num_input_tokens_seen": 287760130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 13335, + "time_per_iteration": 2.42297625541687 + }, + { + "auxiliary_loss_clip": 0.01096886, + "auxiliary_loss_mlp": 0.01025738, + "balance_loss_clip": 1.01403213, + "balance_loss_mlp": 1.03252053, + "epoch": 0.8018036975800391, + "flos": 17196112068480.0, + "grad_norm": 3.6392795716902553, + "language_loss": 0.75419021, + "learning_rate": 3.9800031812122416e-07, + "loss": 0.77541637, + "num_input_tokens_seen": 287777565, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13336, + "time_per_iteration": 2.520874261856079 + }, + { + "auxiliary_loss_clip": 0.01106538, + "auxiliary_loss_mlp": 0.01034989, + "balance_loss_clip": 1.02240658, + "balance_loss_mlp": 1.03661346, + "epoch": 0.801863820832707, + "flos": 20631865824000.0, + "grad_norm": 2.0097115847090556, + "language_loss": 0.74682361, + "learning_rate": 3.977671915907068e-07, + "loss": 0.76823884, + "num_input_tokens_seen": 287796310, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69921875, + "step": 13337, + "time_per_iteration": 2.4630508422851562 + }, + { + "auxiliary_loss_clip": 0.0110342, + "auxiliary_loss_mlp": 0.01031638, + "balance_loss_clip": 1.01966929, + "balance_loss_mlp": 1.03606987, + "epoch": 0.801923944085375, + "flos": 30445569285120.0, + "grad_norm": 1.6741859292029853, + "language_loss": 0.80250359, + "learning_rate": 3.9753412581701883e-07, + "loss": 0.82385421, + "num_input_tokens_seen": 287817330, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 13338, + "time_per_iteration": 2.527026414871216 + }, + { + "auxiliary_loss_clip": 0.01102243, + "auxiliary_loss_mlp": 0.01029555, + "balance_loss_clip": 1.01716948, + "balance_loss_mlp": 1.03478169, + "epoch": 0.801984067338043, + "flos": 20010575255040.0, + "grad_norm": 1.7639189487021163, + "language_loss": 0.74894798, + "learning_rate": 3.9730112080899733e-07, + "loss": 0.77026594, + "num_input_tokens_seen": 287835095, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13339, + "time_per_iteration": 2.4820516109466553 + }, + { + "auxiliary_loss_clip": 0.01096664, + "auxiliary_loss_mlp": 0.0102455, + "balance_loss_clip": 1.01394606, + "balance_loss_mlp": 1.03399611, + "epoch": 0.802044190590711, + "flos": 22784028088320.0, + "grad_norm": 1.6460943176335554, + "language_loss": 0.7905581, + "learning_rate": 3.970681765754775e-07, + "loss": 0.81177026, + "num_input_tokens_seen": 287854595, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 13340, + "time_per_iteration": 2.500736951828003 + }, + { + "auxiliary_loss_clip": 0.01098703, + "auxiliary_loss_mlp": 0.01027823, + "balance_loss_clip": 1.01727295, + "balance_loss_mlp": 1.0331881, + "epoch": 0.8021043138433789, + "flos": 27600115639680.0, + "grad_norm": 2.2084467799286736, + "language_loss": 0.68105626, + "learning_rate": 3.968352931252936e-07, + "loss": 0.70232147, + "num_input_tokens_seen": 287876960, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 13341, + "time_per_iteration": 2.5179948806762695 + }, + { + "auxiliary_loss_clip": 0.01023459, + "auxiliary_loss_mlp": 0.00999082, + "balance_loss_clip": 0.99812281, + "balance_loss_mlp": 1.00333941, + "epoch": 0.8021644370960469, + "flos": 62063730057600.0, + "grad_norm": 0.8369751293594621, + "language_loss": 0.6160937, + "learning_rate": 3.9660247046727547e-07, + "loss": 0.6363191, + "num_input_tokens_seen": 287936530, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 13342, + "time_per_iteration": 3.000945806503296 + }, + { + "auxiliary_loss_clip": 0.01101839, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.01944876, + "balance_loss_mlp": 1.03650236, + "epoch": 0.8022245603487148, + "flos": 23361794352000.0, + "grad_norm": 1.7496140842245578, + "language_loss": 0.63761848, + "learning_rate": 3.963697086102522e-07, + "loss": 0.65895188, + "num_input_tokens_seen": 287954285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 13343, + "time_per_iteration": 2.4751808643341064 + }, + { + "auxiliary_loss_clip": 0.01095544, + "auxiliary_loss_mlp": 0.01025293, + "balance_loss_clip": 1.01477909, + "balance_loss_mlp": 1.03363252, + "epoch": 0.8022846836013828, + "flos": 10853354712960.0, + "grad_norm": 1.8534538865060244, + "language_loss": 0.68717116, + "learning_rate": 3.96137007563051e-07, + "loss": 0.70837951, + "num_input_tokens_seen": 287971595, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6171875, + "step": 13344, + "time_per_iteration": 3.879085063934326 + }, + { + "auxiliary_loss_clip": 0.01100911, + "auxiliary_loss_mlp": 0.01026246, + "balance_loss_clip": 1.01449227, + "balance_loss_mlp": 1.03534698, + "epoch": 0.8023448068540509, + "flos": 29240443054080.0, + "grad_norm": 1.4750491175243907, + "language_loss": 0.70234525, + "learning_rate": 3.9590436733449506e-07, + "loss": 0.72361684, + "num_input_tokens_seen": 287992540, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 13345, + "time_per_iteration": 2.526529550552368 + }, + { + "auxiliary_loss_clip": 0.01023092, + "auxiliary_loss_mlp": 0.00996047, + "balance_loss_clip": 0.99513531, + "balance_loss_mlp": 1.00295401, + "epoch": 0.8024049301067188, + "flos": 64153588181760.0, + "grad_norm": 0.8675779351996153, + "language_loss": 0.62968004, + "learning_rate": 3.956717879334059e-07, + "loss": 0.64987135, + "num_input_tokens_seen": 288052810, + "router_z_loss_clip": 0.00909424, + "router_z_loss_mlp": 0.20117188, + "step": 13346, + "time_per_iteration": 4.479830980300903 + }, + { + "auxiliary_loss_clip": 0.01099962, + "auxiliary_loss_mlp": 0.01029683, + "balance_loss_clip": 1.01858425, + "balance_loss_mlp": 1.03673768, + "epoch": 0.8024650533593868, + "flos": 28585360765440.0, + "grad_norm": 1.5938834644000792, + "language_loss": 0.72352123, + "learning_rate": 3.9543926936860327e-07, + "loss": 0.74481773, + "num_input_tokens_seen": 288073045, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13347, + "time_per_iteration": 3.90059232711792 + }, + { + "auxiliary_loss_clip": 0.01101618, + "auxiliary_loss_mlp": 0.01028439, + "balance_loss_clip": 1.01647007, + "balance_loss_mlp": 1.03545678, + "epoch": 0.8025251766120547, + "flos": 16982264448000.0, + "grad_norm": 1.9020244610460446, + "language_loss": 0.72658664, + "learning_rate": 3.9520681164890493e-07, + "loss": 0.74788725, + "num_input_tokens_seen": 288091165, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 13348, + "time_per_iteration": 2.4068453311920166 + }, + { + "auxiliary_loss_clip": 0.0109995, + "auxiliary_loss_mlp": 0.01028163, + "balance_loss_clip": 1.01632524, + "balance_loss_mlp": 1.03549361, + "epoch": 0.8025852998647227, + "flos": 22163671272960.0, + "grad_norm": 1.8226385974634305, + "language_loss": 0.75890076, + "learning_rate": 3.9497441478312444e-07, + "loss": 0.78018188, + "num_input_tokens_seen": 288110595, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13349, + "time_per_iteration": 2.4651827812194824 + }, + { + "auxiliary_loss_clip": 0.01102119, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.02177215, + "balance_loss_mlp": 1.0367322, + "epoch": 0.8026454231173906, + "flos": 22017012042240.0, + "grad_norm": 1.9845904046749108, + "language_loss": 0.83774155, + "learning_rate": 3.947420787800755e-07, + "loss": 0.85908747, + "num_input_tokens_seen": 288128995, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 13350, + "time_per_iteration": 3.9758567810058594 + }, + { + "auxiliary_loss_clip": 0.01102317, + "auxiliary_loss_mlp": 0.01032473, + "balance_loss_clip": 1.02151191, + "balance_loss_mlp": 1.03663123, + "epoch": 0.8027055463700586, + "flos": 22491320158080.0, + "grad_norm": 1.6434071857139758, + "language_loss": 0.71458006, + "learning_rate": 3.945098036485679e-07, + "loss": 0.73592794, + "num_input_tokens_seen": 288149265, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 13351, + "time_per_iteration": 2.456387519836426 + }, + { + "auxiliary_loss_clip": 0.01100958, + "auxiliary_loss_mlp": 0.01025905, + "balance_loss_clip": 1.0146575, + "balance_loss_mlp": 1.03703523, + "epoch": 0.8027656696227266, + "flos": 28912901909760.0, + "grad_norm": 1.569231510448569, + "language_loss": 0.6175856, + "learning_rate": 3.9427758939740885e-07, + "loss": 0.63885427, + "num_input_tokens_seen": 288170745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13352, + "time_per_iteration": 2.5027875900268555 + }, + { + "auxiliary_loss_clip": 0.01099513, + "auxiliary_loss_mlp": 0.01033953, + "balance_loss_clip": 1.02271748, + "balance_loss_mlp": 1.03542566, + "epoch": 0.8028257928753946, + "flos": 18589374760320.0, + "grad_norm": 1.9674575511392012, + "language_loss": 0.76736349, + "learning_rate": 3.940454360354046e-07, + "loss": 0.78869814, + "num_input_tokens_seen": 288189415, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13353, + "time_per_iteration": 2.4299416542053223 + }, + { + "auxiliary_loss_clip": 0.01107387, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01592684, + "balance_loss_mlp": 1.03595626, + "epoch": 0.8028859161280625, + "flos": 19130009339520.0, + "grad_norm": 4.591297164367042, + "language_loss": 0.73969984, + "learning_rate": 3.938133435713582e-07, + "loss": 0.76106119, + "num_input_tokens_seen": 288206900, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.71484375, + "step": 13354, + "time_per_iteration": 2.4425058364868164 + }, + { + "auxiliary_loss_clip": 0.01100936, + "auxiliary_loss_mlp": 0.0103158, + "balance_loss_clip": 1.02040386, + "balance_loss_mlp": 1.03437519, + "epoch": 0.8029460393807305, + "flos": 20229881742720.0, + "grad_norm": 2.057342776688467, + "language_loss": 0.65729123, + "learning_rate": 3.935813120140714e-07, + "loss": 0.6786164, + "num_input_tokens_seen": 288224800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 13355, + "time_per_iteration": 2.4422569274902344 + }, + { + "auxiliary_loss_clip": 0.01103198, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01824093, + "balance_loss_mlp": 1.03504288, + "epoch": 0.8030061626333984, + "flos": 49783320933120.0, + "grad_norm": 1.8283444100924588, + "language_loss": 0.68699443, + "learning_rate": 3.9334934137234235e-07, + "loss": 0.70833278, + "num_input_tokens_seen": 288249400, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 13356, + "time_per_iteration": 2.7776293754577637 + }, + { + "auxiliary_loss_clip": 0.01100058, + "auxiliary_loss_mlp": 0.01027162, + "balance_loss_clip": 1.01561069, + "balance_loss_mlp": 1.03510857, + "epoch": 0.8030662858860664, + "flos": 21615243442560.0, + "grad_norm": 1.6343755338343116, + "language_loss": 0.77451766, + "learning_rate": 3.931174316549666e-07, + "loss": 0.79578984, + "num_input_tokens_seen": 288268780, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 13357, + "time_per_iteration": 2.4800233840942383 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.0102911, + "balance_loss_clip": 1.01638436, + "balance_loss_mlp": 1.03227997, + "epoch": 0.8031264091387345, + "flos": 25630056351360.0, + "grad_norm": 1.418419476215859, + "language_loss": 0.76987123, + "learning_rate": 3.9288558287073937e-07, + "loss": 0.79116321, + "num_input_tokens_seen": 288290830, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 13358, + "time_per_iteration": 2.500126361846924 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.01029617, + "balance_loss_clip": 1.01845884, + "balance_loss_mlp": 1.03335798, + "epoch": 0.8031865323914024, + "flos": 19646225648640.0, + "grad_norm": 1.4766521740684297, + "language_loss": 0.84945107, + "learning_rate": 3.9265379502845143e-07, + "loss": 0.87071896, + "num_input_tokens_seen": 288308865, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13359, + "time_per_iteration": 2.467322826385498 + }, + { + "auxiliary_loss_clip": 0.01098755, + "auxiliary_loss_mlp": 0.01025457, + "balance_loss_clip": 1.0148114, + "balance_loss_mlp": 1.0347209, + "epoch": 0.8032466556440704, + "flos": 26169110732160.0, + "grad_norm": 5.478885485071422, + "language_loss": 0.73389184, + "learning_rate": 3.924220681368928e-07, + "loss": 0.75513393, + "num_input_tokens_seen": 288327325, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13360, + "time_per_iteration": 2.4644036293029785 + }, + { + "auxiliary_loss_clip": 0.011011, + "auxiliary_loss_mlp": 0.01025712, + "balance_loss_clip": 1.01467943, + "balance_loss_mlp": 1.03494728, + "epoch": 0.8033067788967383, + "flos": 25520026014720.0, + "grad_norm": 1.658575264566963, + "language_loss": 0.69541776, + "learning_rate": 3.921904022048512e-07, + "loss": 0.71668589, + "num_input_tokens_seen": 288347285, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13361, + "time_per_iteration": 2.4854345321655273 + }, + { + "auxiliary_loss_clip": 0.01102908, + "auxiliary_loss_mlp": 0.01034224, + "balance_loss_clip": 1.02245271, + "balance_loss_mlp": 1.03511453, + "epoch": 0.8033669021494063, + "flos": 24024274842240.0, + "grad_norm": 1.5325871265997801, + "language_loss": 0.6999588, + "learning_rate": 3.919587972411098e-07, + "loss": 0.72133017, + "num_input_tokens_seen": 288367785, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 13362, + "time_per_iteration": 2.488231658935547 + }, + { + "auxiliary_loss_clip": 0.01107256, + "auxiliary_loss_mlp": 0.01037143, + "balance_loss_clip": 1.02425015, + "balance_loss_mlp": 1.03642321, + "epoch": 0.8034270254020742, + "flos": 13588059749760.0, + "grad_norm": 3.0174413399707363, + "language_loss": 0.78229916, + "learning_rate": 3.91727253254452e-07, + "loss": 0.80374312, + "num_input_tokens_seen": 288384135, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 13363, + "time_per_iteration": 2.418231964111328 + }, + { + "auxiliary_loss_clip": 0.01099372, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01533544, + "balance_loss_mlp": 1.03315711, + "epoch": 0.8034871486547422, + "flos": 27412661537280.0, + "grad_norm": 2.2119044430692725, + "language_loss": 0.7432642, + "learning_rate": 3.9149577025365787e-07, + "loss": 0.76453155, + "num_input_tokens_seen": 288403805, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 13364, + "time_per_iteration": 2.5197720527648926 + }, + { + "auxiliary_loss_clip": 0.01103376, + "auxiliary_loss_mlp": 0.01028234, + "balance_loss_clip": 1.01715374, + "balance_loss_mlp": 1.0384289, + "epoch": 0.8035472719074102, + "flos": 32598593475840.0, + "grad_norm": 1.8638691080959722, + "language_loss": 0.60079575, + "learning_rate": 3.9126434824750596e-07, + "loss": 0.62211186, + "num_input_tokens_seen": 288424895, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13365, + "time_per_iteration": 2.5367517471313477 + }, + { + "auxiliary_loss_clip": 0.01102557, + "auxiliary_loss_mlp": 0.010325, + "balance_loss_clip": 1.02088296, + "balance_loss_mlp": 1.03595102, + "epoch": 0.8036073951600782, + "flos": 21287989607040.0, + "grad_norm": 1.770848682899336, + "language_loss": 0.66261953, + "learning_rate": 3.910329872447706e-07, + "loss": 0.68397009, + "num_input_tokens_seen": 288443865, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13366, + "time_per_iteration": 2.4602773189544678 + }, + { + "auxiliary_loss_clip": 0.01097083, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01548815, + "balance_loss_mlp": 1.03327024, + "epoch": 0.8036675184127461, + "flos": 18113845582080.0, + "grad_norm": 2.111763944733339, + "language_loss": 0.75102258, + "learning_rate": 3.908016872542259e-07, + "loss": 0.77226043, + "num_input_tokens_seen": 288461065, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 13367, + "time_per_iteration": 2.41711163520813 + }, + { + "auxiliary_loss_clip": 0.0109804, + "auxiliary_loss_mlp": 0.01024389, + "balance_loss_clip": 1.01354659, + "balance_loss_mlp": 1.0338217, + "epoch": 0.8037276416654141, + "flos": 26030280666240.0, + "grad_norm": 1.54219979673165, + "language_loss": 0.73962986, + "learning_rate": 3.905704482846428e-07, + "loss": 0.76085418, + "num_input_tokens_seen": 288481865, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 13368, + "time_per_iteration": 2.501863956451416 + }, + { + "auxiliary_loss_clip": 0.01102548, + "auxiliary_loss_mlp": 0.01033536, + "balance_loss_clip": 1.02176404, + "balance_loss_mlp": 1.03486526, + "epoch": 0.803787764918082, + "flos": 18802180886400.0, + "grad_norm": 1.9813333443993375, + "language_loss": 0.69734561, + "learning_rate": 3.90339270344789e-07, + "loss": 0.71870649, + "num_input_tokens_seen": 288499345, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 13369, + "time_per_iteration": 2.456852674484253 + }, + { + "auxiliary_loss_clip": 0.01096676, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01927543, + "balance_loss_mlp": 1.03303206, + "epoch": 0.80384788817075, + "flos": 20225787592320.0, + "grad_norm": 1.6987132641471832, + "language_loss": 0.74007034, + "learning_rate": 3.901081534434312e-07, + "loss": 0.76133567, + "num_input_tokens_seen": 288517660, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 13370, + "time_per_iteration": 2.447308301925659 + }, + { + "auxiliary_loss_clip": 0.0110293, + "auxiliary_loss_mlp": 0.01034302, + "balance_loss_clip": 1.02192795, + "balance_loss_mlp": 1.03425717, + "epoch": 0.8039080114234181, + "flos": 18515290959360.0, + "grad_norm": 2.381680078158343, + "language_loss": 0.87296432, + "learning_rate": 3.898770975893342e-07, + "loss": 0.89433664, + "num_input_tokens_seen": 288534180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 13371, + "time_per_iteration": 2.4829249382019043 + }, + { + "auxiliary_loss_clip": 0.01103599, + "auxiliary_loss_mlp": 0.01032892, + "balance_loss_clip": 1.02035713, + "balance_loss_mlp": 1.03440571, + "epoch": 0.803968134676086, + "flos": 22382510883840.0, + "grad_norm": 1.7658004045692555, + "language_loss": 0.74599552, + "learning_rate": 3.89646102791259e-07, + "loss": 0.76736039, + "num_input_tokens_seen": 288553350, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 13372, + "time_per_iteration": 2.442962169647217 + }, + { + "auxiliary_loss_clip": 0.01098823, + "auxiliary_loss_mlp": 0.0102821, + "balance_loss_clip": 1.01571703, + "balance_loss_mlp": 1.03405392, + "epoch": 0.804028257928754, + "flos": 23842566915840.0, + "grad_norm": 2.4174801447044807, + "language_loss": 0.79491466, + "learning_rate": 3.894151690579646e-07, + "loss": 0.816185, + "num_input_tokens_seen": 288571325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 13373, + "time_per_iteration": 2.453648090362549 + }, + { + "auxiliary_loss_clip": 0.01097395, + "auxiliary_loss_mlp": 0.01030701, + "balance_loss_clip": 1.01992464, + "balance_loss_mlp": 1.03413832, + "epoch": 0.8040883811814219, + "flos": 23550720912000.0, + "grad_norm": 2.1161878691178244, + "language_loss": 0.74199659, + "learning_rate": 3.8918429639820815e-07, + "loss": 0.76327753, + "num_input_tokens_seen": 288592100, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 13374, + "time_per_iteration": 2.4652347564697266 + }, + { + "auxiliary_loss_clip": 0.01101497, + "auxiliary_loss_mlp": 0.01028296, + "balance_loss_clip": 1.01642847, + "balance_loss_mlp": 1.03416824, + "epoch": 0.8041485044340899, + "flos": 19026263882880.0, + "grad_norm": 1.8766085876406744, + "language_loss": 0.68541491, + "learning_rate": 3.889534848207452e-07, + "loss": 0.70671284, + "num_input_tokens_seen": 288612305, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 13375, + "time_per_iteration": 2.4260308742523193 + }, + { + "auxiliary_loss_clip": 0.0102349, + "auxiliary_loss_mlp": 0.01009026, + "balance_loss_clip": 1.00806642, + "balance_loss_mlp": 1.00329185, + "epoch": 0.8042086276867578, + "flos": 70005663797760.0, + "grad_norm": 0.726849315788399, + "language_loss": 0.55648947, + "learning_rate": 3.887227343343271e-07, + "loss": 0.57681465, + "num_input_tokens_seen": 288676015, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 13376, + "time_per_iteration": 3.1642260551452637 + }, + { + "auxiliary_loss_clip": 0.0110284, + "auxiliary_loss_mlp": 0.01028344, + "balance_loss_clip": 1.01679826, + "balance_loss_mlp": 1.03498244, + "epoch": 0.8042687509394258, + "flos": 21872435800320.0, + "grad_norm": 1.6135536468641871, + "language_loss": 0.72961086, + "learning_rate": 3.8849204494770425e-07, + "loss": 0.75092268, + "num_input_tokens_seen": 288696455, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 13377, + "time_per_iteration": 2.4822981357574463 + }, + { + "auxiliary_loss_clip": 0.01098741, + "auxiliary_loss_mlp": 0.01026807, + "balance_loss_clip": 1.01541018, + "balance_loss_mlp": 1.03249967, + "epoch": 0.8043288741920938, + "flos": 26614870513920.0, + "grad_norm": 1.981270631697856, + "language_loss": 0.69881338, + "learning_rate": 3.8826141666962567e-07, + "loss": 0.72006881, + "num_input_tokens_seen": 288715560, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13378, + "time_per_iteration": 2.5707526206970215 + }, + { + "auxiliary_loss_clip": 0.01100528, + "auxiliary_loss_mlp": 0.01024589, + "balance_loss_clip": 1.01306105, + "balance_loss_mlp": 1.03421903, + "epoch": 0.8043889974447618, + "flos": 33403387651200.0, + "grad_norm": 1.411246611707562, + "language_loss": 0.69285733, + "learning_rate": 3.880308495088347e-07, + "loss": 0.71410847, + "num_input_tokens_seen": 288739485, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 13379, + "time_per_iteration": 2.5638318061828613 + }, + { + "auxiliary_loss_clip": 0.01104566, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01733994, + "balance_loss_mlp": 1.03576803, + "epoch": 0.8044491206974297, + "flos": 20375966355840.0, + "grad_norm": 1.6928321520732015, + "language_loss": 0.76132649, + "learning_rate": 3.8780034347407533e-07, + "loss": 0.7826739, + "num_input_tokens_seen": 288757420, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 13380, + "time_per_iteration": 2.4436728954315186 + }, + { + "auxiliary_loss_clip": 0.0109777, + "auxiliary_loss_mlp": 0.01024309, + "balance_loss_clip": 1.01334739, + "balance_loss_mlp": 1.03278756, + "epoch": 0.8045092439500977, + "flos": 23403810286080.0, + "grad_norm": 1.762890691541343, + "language_loss": 0.68871969, + "learning_rate": 3.875698985740887e-07, + "loss": 0.70994055, + "num_input_tokens_seen": 288775535, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13381, + "time_per_iteration": 2.4540696144104004 + }, + { + "auxiliary_loss_clip": 0.01102189, + "auxiliary_loss_mlp": 0.01032677, + "balance_loss_clip": 1.02096522, + "balance_loss_mlp": 1.03659916, + "epoch": 0.8045693672027656, + "flos": 24097245321600.0, + "grad_norm": 2.070272344972077, + "language_loss": 0.63770294, + "learning_rate": 3.873395148176135e-07, + "loss": 0.65905166, + "num_input_tokens_seen": 288795035, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13382, + "time_per_iteration": 2.476844549179077 + }, + { + "auxiliary_loss_clip": 0.01099779, + "auxiliary_loss_mlp": 0.01031953, + "balance_loss_clip": 1.02138495, + "balance_loss_mlp": 1.03501844, + "epoch": 0.8046294904554336, + "flos": 27707165147520.0, + "grad_norm": 1.9560619657883067, + "language_loss": 0.76228422, + "learning_rate": 3.8710919221338487e-07, + "loss": 0.78360152, + "num_input_tokens_seen": 288816270, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 13383, + "time_per_iteration": 2.5023083686828613 + }, + { + "auxiliary_loss_clip": 0.0110036, + "auxiliary_loss_mlp": 0.01033085, + "balance_loss_clip": 1.0221895, + "balance_loss_mlp": 1.03507757, + "epoch": 0.8046896137081017, + "flos": 24972998814720.0, + "grad_norm": 1.704803693538242, + "language_loss": 0.69562024, + "learning_rate": 3.868789307701381e-07, + "loss": 0.71695471, + "num_input_tokens_seen": 288836050, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 13384, + "time_per_iteration": 2.4720067977905273 + }, + { + "auxiliary_loss_clip": 0.0110185, + "auxiliary_loss_mlp": 0.01032882, + "balance_loss_clip": 1.02033544, + "balance_loss_mlp": 1.0335815, + "epoch": 0.8047497369607696, + "flos": 17675484001920.0, + "grad_norm": 2.241572450545315, + "language_loss": 0.79350901, + "learning_rate": 3.8664873049660375e-07, + "loss": 0.81485635, + "num_input_tokens_seen": 288852900, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 13385, + "time_per_iteration": 2.439087152481079 + }, + { + "auxiliary_loss_clip": 0.01099604, + "auxiliary_loss_mlp": 0.01029903, + "balance_loss_clip": 1.01754057, + "balance_loss_mlp": 1.03369808, + "epoch": 0.8048098602134376, + "flos": 22382079920640.0, + "grad_norm": 1.6118027909043755, + "language_loss": 0.72186625, + "learning_rate": 3.864185914015108e-07, + "loss": 0.74316132, + "num_input_tokens_seen": 288872625, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 13386, + "time_per_iteration": 3.8713440895080566 + }, + { + "auxiliary_loss_clip": 0.01024358, + "auxiliary_loss_mlp": 0.01001397, + "balance_loss_clip": 1.00044346, + "balance_loss_mlp": 1.00412393, + "epoch": 0.8048699834661055, + "flos": 71200949702400.0, + "grad_norm": 0.6667783047012105, + "language_loss": 0.5129301, + "learning_rate": 3.861885134935865e-07, + "loss": 0.53318763, + "num_input_tokens_seen": 288939180, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 13387, + "time_per_iteration": 4.501964330673218 + }, + { + "auxiliary_loss_clip": 0.01099839, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.01966643, + "balance_loss_mlp": 1.03382778, + "epoch": 0.8049301067187735, + "flos": 23660320285440.0, + "grad_norm": 1.6706301828643437, + "language_loss": 0.73789018, + "learning_rate": 3.859584967815559e-07, + "loss": 0.75920987, + "num_input_tokens_seen": 288958925, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66015625, + "step": 13388, + "time_per_iteration": 3.843517780303955 + }, + { + "auxiliary_loss_clip": 0.0109843, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01682854, + "balance_loss_mlp": 1.03459811, + "epoch": 0.8049902299714414, + "flos": 24426330750720.0, + "grad_norm": 1.8701026926914783, + "language_loss": 0.71383917, + "learning_rate": 3.857285412741411e-07, + "loss": 0.73510081, + "num_input_tokens_seen": 288980935, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 13389, + "time_per_iteration": 2.5054638385772705 + }, + { + "auxiliary_loss_clip": 0.0110175, + "auxiliary_loss_mlp": 0.01033044, + "balance_loss_clip": 1.02133179, + "balance_loss_mlp": 1.03612518, + "epoch": 0.8050503532241094, + "flos": 17492626840320.0, + "grad_norm": 3.088475766365905, + "language_loss": 0.82746458, + "learning_rate": 3.8549864698006097e-07, + "loss": 0.84881252, + "num_input_tokens_seen": 288996780, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13390, + "time_per_iteration": 2.423768997192383 + }, + { + "auxiliary_loss_clip": 0.01023418, + "auxiliary_loss_mlp": 0.01000717, + "balance_loss_clip": 0.99975187, + "balance_loss_mlp": 1.00324726, + "epoch": 0.8051104764767774, + "flos": 57658030369920.0, + "grad_norm": 0.7770155709179203, + "language_loss": 0.55552375, + "learning_rate": 3.8526881390803424e-07, + "loss": 0.57576513, + "num_input_tokens_seen": 289057590, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20214844, + "step": 13391, + "time_per_iteration": 3.0361075401306152 + }, + { + "auxiliary_loss_clip": 0.01096866, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.01784241, + "balance_loss_mlp": 1.03412104, + "epoch": 0.8051705997294454, + "flos": 18003456109440.0, + "grad_norm": 1.5537305479409738, + "language_loss": 0.84568977, + "learning_rate": 3.850390420667762e-07, + "loss": 0.8669492, + "num_input_tokens_seen": 289076285, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.625, + "step": 13392, + "time_per_iteration": 3.88423490524292 + }, + { + "auxiliary_loss_clip": 0.01099202, + "auxiliary_loss_mlp": 0.01029176, + "balance_loss_clip": 1.01805949, + "balance_loss_mlp": 1.03294063, + "epoch": 0.8052307229821133, + "flos": 26397754755840.0, + "grad_norm": 1.4560269094808058, + "language_loss": 0.70109689, + "learning_rate": 3.8480933146499914e-07, + "loss": 0.72238064, + "num_input_tokens_seen": 289097585, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13393, + "time_per_iteration": 2.503424644470215 + }, + { + "auxiliary_loss_clip": 0.01101229, + "auxiliary_loss_mlp": 0.01028713, + "balance_loss_clip": 1.01657152, + "balance_loss_mlp": 1.03488159, + "epoch": 0.8052908462347813, + "flos": 21757018423680.0, + "grad_norm": 3.6975183577727937, + "language_loss": 0.76589131, + "learning_rate": 3.84579682111414e-07, + "loss": 0.78719074, + "num_input_tokens_seen": 289116890, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 13394, + "time_per_iteration": 2.50583553314209 + }, + { + "auxiliary_loss_clip": 0.01103543, + "auxiliary_loss_mlp": 0.01032691, + "balance_loss_clip": 1.02136636, + "balance_loss_mlp": 1.03756762, + "epoch": 0.8053509694874492, + "flos": 25442279026560.0, + "grad_norm": 1.5906129807299372, + "language_loss": 0.64856386, + "learning_rate": 3.843500940147304e-07, + "loss": 0.66992623, + "num_input_tokens_seen": 289136670, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 13395, + "time_per_iteration": 2.482172966003418 + }, + { + "auxiliary_loss_clip": 0.01023951, + "auxiliary_loss_mlp": 0.01003964, + "balance_loss_clip": 1.00302815, + "balance_loss_mlp": 1.00378847, + "epoch": 0.8054110927401172, + "flos": 57668122091520.0, + "grad_norm": 0.7505786157423635, + "language_loss": 0.57311893, + "learning_rate": 3.8412056718365206e-07, + "loss": 0.59339797, + "num_input_tokens_seen": 289200150, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 13396, + "time_per_iteration": 3.1897172927856445 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01033808, + "balance_loss_clip": 1.02165508, + "balance_loss_mlp": 1.03515673, + "epoch": 0.8054712159927853, + "flos": 19276201693440.0, + "grad_norm": 1.5955257474289526, + "language_loss": 0.77324402, + "learning_rate": 3.8389110162688353e-07, + "loss": 0.79459095, + "num_input_tokens_seen": 289218125, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 13397, + "time_per_iteration": 2.457589626312256 + }, + { + "auxiliary_loss_clip": 0.01100616, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.01415157, + "balance_loss_mlp": 1.03589237, + "epoch": 0.8055313392454532, + "flos": 17967617314560.0, + "grad_norm": 1.4619231404406883, + "language_loss": 0.70318341, + "learning_rate": 3.836616973531266e-07, + "loss": 0.72444201, + "num_input_tokens_seen": 289237115, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13398, + "time_per_iteration": 2.434720754623413 + }, + { + "auxiliary_loss_clip": 0.01098688, + "auxiliary_loss_mlp": 0.01028967, + "balance_loss_clip": 1.01820254, + "balance_loss_mlp": 1.0339874, + "epoch": 0.8055914624981212, + "flos": 13478352635520.0, + "grad_norm": 2.5249609811736993, + "language_loss": 0.68945122, + "learning_rate": 3.834323543710805e-07, + "loss": 0.71072781, + "num_input_tokens_seen": 289253635, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 13399, + "time_per_iteration": 2.4369473457336426 + }, + { + "auxiliary_loss_clip": 0.01100707, + "auxiliary_loss_mlp": 0.01032742, + "balance_loss_clip": 1.02185178, + "balance_loss_mlp": 1.03535557, + "epoch": 0.8056515857507891, + "flos": 13224787551360.0, + "grad_norm": 2.741407095084056, + "language_loss": 0.72130084, + "learning_rate": 3.8320307268944153e-07, + "loss": 0.74263531, + "num_input_tokens_seen": 289270085, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 13400, + "time_per_iteration": 2.45465350151062 + }, + { + "auxiliary_loss_clip": 0.01095975, + "auxiliary_loss_mlp": 0.01029235, + "balance_loss_clip": 1.01795244, + "balance_loss_mlp": 1.03169787, + "epoch": 0.8057117090034571, + "flos": 23878190229120.0, + "grad_norm": 1.762424898680502, + "language_loss": 0.6360321, + "learning_rate": 3.829738523169037e-07, + "loss": 0.6572842, + "num_input_tokens_seen": 289289645, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 13401, + "time_per_iteration": 2.46634840965271 + }, + { + "auxiliary_loss_clip": 0.01101558, + "auxiliary_loss_mlp": 0.01026627, + "balance_loss_clip": 1.01541567, + "balance_loss_mlp": 1.03495288, + "epoch": 0.805771832256125, + "flos": 21214300855680.0, + "grad_norm": 2.0776999843399215, + "language_loss": 0.83539009, + "learning_rate": 3.8274469326215985e-07, + "loss": 0.85667193, + "num_input_tokens_seen": 289306630, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 13402, + "time_per_iteration": 2.4731264114379883 + }, + { + "auxiliary_loss_clip": 0.01104077, + "auxiliary_loss_mlp": 0.01029892, + "balance_loss_clip": 1.01843047, + "balance_loss_mlp": 1.03683734, + "epoch": 0.805831955508793, + "flos": 17566818382080.0, + "grad_norm": 1.7791761836453988, + "language_loss": 0.67880774, + "learning_rate": 3.8251559553389876e-07, + "loss": 0.70014745, + "num_input_tokens_seen": 289324960, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 13403, + "time_per_iteration": 2.4434680938720703 + }, + { + "auxiliary_loss_clip": 0.01100505, + "auxiliary_loss_mlp": 0.01034064, + "balance_loss_clip": 1.02343667, + "balance_loss_mlp": 1.03693151, + "epoch": 0.805892078761461, + "flos": 26907542530560.0, + "grad_norm": 1.6082788038338753, + "language_loss": 0.84920156, + "learning_rate": 3.822865591408084e-07, + "loss": 0.87054729, + "num_input_tokens_seen": 289344980, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.63671875, + "step": 13404, + "time_per_iteration": 2.531658172607422 + }, + { + "auxiliary_loss_clip": 0.01094833, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.01671946, + "balance_loss_mlp": 1.03220367, + "epoch": 0.805952202014129, + "flos": 31506442496640.0, + "grad_norm": 1.3891013109645525, + "language_loss": 0.70190167, + "learning_rate": 3.820575840915743e-07, + "loss": 0.72312385, + "num_input_tokens_seen": 289367500, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 13405, + "time_per_iteration": 2.6014316082000732 + }, + { + "auxiliary_loss_clip": 0.01098677, + "auxiliary_loss_mlp": 0.01023373, + "balance_loss_clip": 1.01291847, + "balance_loss_mlp": 1.0349884, + "epoch": 0.8060123252667969, + "flos": 24389953251840.0, + "grad_norm": 4.539530104716245, + "language_loss": 0.75637108, + "learning_rate": 3.818286703948788e-07, + "loss": 0.77759159, + "num_input_tokens_seen": 289385930, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 13406, + "time_per_iteration": 2.5324466228485107 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.02026033, + "balance_loss_mlp": 1.03502393, + "epoch": 0.8060724485194649, + "flos": 23479941162240.0, + "grad_norm": 1.4950577353974586, + "language_loss": 0.76435769, + "learning_rate": 3.815998180594018e-07, + "loss": 0.78569061, + "num_input_tokens_seen": 289408025, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 13407, + "time_per_iteration": 2.477576971054077 + }, + { + "auxiliary_loss_clip": 0.01099093, + "auxiliary_loss_mlp": 0.01031259, + "balance_loss_clip": 1.01961851, + "balance_loss_mlp": 1.03387368, + "epoch": 0.8061325717721328, + "flos": 18624495283200.0, + "grad_norm": 1.6921113118450146, + "language_loss": 0.73480356, + "learning_rate": 3.81371027093822e-07, + "loss": 0.75610703, + "num_input_tokens_seen": 289426575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13408, + "time_per_iteration": 2.4257562160491943 + }, + { + "auxiliary_loss_clip": 0.01099181, + "auxiliary_loss_mlp": 0.0102687, + "balance_loss_clip": 1.01519299, + "balance_loss_mlp": 1.03426397, + "epoch": 0.8061926950248008, + "flos": 23582752865280.0, + "grad_norm": 1.9196860165505316, + "language_loss": 0.70829517, + "learning_rate": 3.8114229750681523e-07, + "loss": 0.72955573, + "num_input_tokens_seen": 289447760, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 13409, + "time_per_iteration": 2.463796854019165 + }, + { + "auxiliary_loss_clip": 0.01099235, + "auxiliary_loss_mlp": 0.01025918, + "balance_loss_clip": 1.01424718, + "balance_loss_mlp": 1.03369415, + "epoch": 0.8062528182774689, + "flos": 11143333209600.0, + "grad_norm": 1.9740457439669723, + "language_loss": 0.76695901, + "learning_rate": 3.809136293070545e-07, + "loss": 0.78821057, + "num_input_tokens_seen": 289463920, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13410, + "time_per_iteration": 2.4198813438415527 + }, + { + "auxiliary_loss_clip": 0.01100494, + "auxiliary_loss_mlp": 0.01033141, + "balance_loss_clip": 1.02152979, + "balance_loss_mlp": 1.03608918, + "epoch": 0.8063129415301368, + "flos": 22346815743360.0, + "grad_norm": 1.7996163634950662, + "language_loss": 0.68654764, + "learning_rate": 3.806850225032117e-07, + "loss": 0.70788395, + "num_input_tokens_seen": 289482635, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13411, + "time_per_iteration": 2.4347574710845947 + }, + { + "auxiliary_loss_clip": 0.01097282, + "auxiliary_loss_mlp": 0.01026428, + "balance_loss_clip": 1.01528811, + "balance_loss_mlp": 1.03363693, + "epoch": 0.8063730647828048, + "flos": 23988400133760.0, + "grad_norm": 1.6674107139859142, + "language_loss": 0.68204069, + "learning_rate": 3.804564771039551e-07, + "loss": 0.70327783, + "num_input_tokens_seen": 289502040, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13412, + "time_per_iteration": 2.4598028659820557 + }, + { + "auxiliary_loss_clip": 0.01105517, + "auxiliary_loss_mlp": 0.01031169, + "balance_loss_clip": 1.01803827, + "balance_loss_mlp": 1.03777528, + "epoch": 0.8064331880354727, + "flos": 21321494017920.0, + "grad_norm": 1.6536300325901656, + "language_loss": 0.81038213, + "learning_rate": 3.8022799311795064e-07, + "loss": 0.83174896, + "num_input_tokens_seen": 289520740, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.6796875, + "step": 13413, + "time_per_iteration": 2.4321577548980713 + }, + { + "auxiliary_loss_clip": 0.01098188, + "auxiliary_loss_mlp": 0.01031064, + "balance_loss_clip": 1.01991224, + "balance_loss_mlp": 1.0338769, + "epoch": 0.8064933112881407, + "flos": 19682890456320.0, + "grad_norm": 1.984977186812749, + "language_loss": 0.84957677, + "learning_rate": 3.7999957055386303e-07, + "loss": 0.87086928, + "num_input_tokens_seen": 289535840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13414, + "time_per_iteration": 2.4548234939575195 + }, + { + "auxiliary_loss_clip": 0.01096994, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.02094603, + "balance_loss_mlp": 1.03234887, + "epoch": 0.8065534345408086, + "flos": 19279721226240.0, + "grad_norm": 2.244506083720548, + "language_loss": 0.67268044, + "learning_rate": 3.7977120942035467e-07, + "loss": 0.69396722, + "num_input_tokens_seen": 289555205, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 13415, + "time_per_iteration": 2.425309181213379 + }, + { + "auxiliary_loss_clip": 0.01096685, + "auxiliary_loss_mlp": 0.01022432, + "balance_loss_clip": 1.01168513, + "balance_loss_mlp": 1.03403616, + "epoch": 0.8066135577934767, + "flos": 19677718897920.0, + "grad_norm": 1.573239663974263, + "language_loss": 0.76294547, + "learning_rate": 3.7954290972608383e-07, + "loss": 0.78413665, + "num_input_tokens_seen": 289573000, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 13416, + "time_per_iteration": 2.4303643703460693 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01030702, + "balance_loss_clip": 1.01951385, + "balance_loss_mlp": 1.03429639, + "epoch": 0.8066736810461446, + "flos": 21143592933120.0, + "grad_norm": 1.567703379933568, + "language_loss": 0.65159631, + "learning_rate": 3.793146714797086e-07, + "loss": 0.67292631, + "num_input_tokens_seen": 289592625, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6796875, + "step": 13417, + "time_per_iteration": 2.4838106632232666 + }, + { + "auxiliary_loss_clip": 0.0110189, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.02290344, + "balance_loss_mlp": 1.03483796, + "epoch": 0.8067338042988126, + "flos": 22598261925120.0, + "grad_norm": 1.7972598852590256, + "language_loss": 0.80653781, + "learning_rate": 3.7908649468988306e-07, + "loss": 0.82789946, + "num_input_tokens_seen": 289610780, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13418, + "time_per_iteration": 2.4769530296325684 + }, + { + "auxiliary_loss_clip": 0.01102946, + "auxiliary_loss_mlp": 0.01029292, + "balance_loss_clip": 1.01751471, + "balance_loss_mlp": 1.03614044, + "epoch": 0.8067939275514805, + "flos": 16508423208960.0, + "grad_norm": 1.5261451461665583, + "language_loss": 0.845676, + "learning_rate": 3.7885837936526066e-07, + "loss": 0.86699843, + "num_input_tokens_seen": 289628890, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13419, + "time_per_iteration": 2.4280943870544434 + }, + { + "auxiliary_loss_clip": 0.01101257, + "auxiliary_loss_mlp": 0.01029456, + "balance_loss_clip": 1.01786304, + "balance_loss_mlp": 1.03399837, + "epoch": 0.8068540508041485, + "flos": 28541836460160.0, + "grad_norm": 1.6210776482059308, + "language_loss": 0.75624955, + "learning_rate": 3.7863032551449047e-07, + "loss": 0.77755666, + "num_input_tokens_seen": 289647220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 13420, + "time_per_iteration": 2.489564895629883 + }, + { + "auxiliary_loss_clip": 0.01096685, + "auxiliary_loss_mlp": 0.01026556, + "balance_loss_clip": 1.01652443, + "balance_loss_mlp": 1.03300762, + "epoch": 0.8069141740568164, + "flos": 21652482867840.0, + "grad_norm": 1.6825094643098477, + "language_loss": 0.78326774, + "learning_rate": 3.784023331462207e-07, + "loss": 0.8045001, + "num_input_tokens_seen": 289665800, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.63671875, + "step": 13421, + "time_per_iteration": 2.4398117065429688 + }, + { + "auxiliary_loss_clip": 0.01102139, + "auxiliary_loss_mlp": 0.0102348, + "balance_loss_clip": 1.01245332, + "balance_loss_mlp": 1.03592634, + "epoch": 0.8069742973094844, + "flos": 17529327561600.0, + "grad_norm": 1.6379524455499936, + "language_loss": 0.79461509, + "learning_rate": 3.78174402269098e-07, + "loss": 0.81587136, + "num_input_tokens_seen": 289682705, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 13422, + "time_per_iteration": 2.415855646133423 + }, + { + "auxiliary_loss_clip": 0.01098682, + "auxiliary_loss_mlp": 0.010287, + "balance_loss_clip": 1.01785183, + "balance_loss_mlp": 1.03383088, + "epoch": 0.8070344205621525, + "flos": 23367037737600.0, + "grad_norm": 1.5228776054135154, + "language_loss": 0.67973536, + "learning_rate": 3.7794653289176347e-07, + "loss": 0.70100921, + "num_input_tokens_seen": 289702920, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13423, + "time_per_iteration": 2.4429931640625 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.01035247, + "balance_loss_clip": 1.02315295, + "balance_loss_mlp": 1.0344708, + "epoch": 0.8070945438148204, + "flos": 22930184528640.0, + "grad_norm": 1.6344709162264897, + "language_loss": 0.80310905, + "learning_rate": 3.7771872502285904e-07, + "loss": 0.82447577, + "num_input_tokens_seen": 289723280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13424, + "time_per_iteration": 2.46093487739563 + }, + { + "auxiliary_loss_clip": 0.01100202, + "auxiliary_loss_mlp": 0.01025321, + "balance_loss_clip": 1.01418114, + "balance_loss_mlp": 1.03266358, + "epoch": 0.8071546670674884, + "flos": 25300683613440.0, + "grad_norm": 1.4436773740307707, + "language_loss": 0.79038882, + "learning_rate": 3.774909786710232e-07, + "loss": 0.81164408, + "num_input_tokens_seen": 289743475, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.67578125, + "step": 13425, + "time_per_iteration": 2.4803316593170166 + }, + { + "auxiliary_loss_clip": 0.01097262, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.01813531, + "balance_loss_mlp": 1.03308177, + "epoch": 0.8072147903201563, + "flos": 18113701927680.0, + "grad_norm": 2.7525970280950185, + "language_loss": 0.75375247, + "learning_rate": 3.772632938448923e-07, + "loss": 0.77501363, + "num_input_tokens_seen": 289761400, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13426, + "time_per_iteration": 2.450507164001465 + }, + { + "auxiliary_loss_clip": 0.01099759, + "auxiliary_loss_mlp": 0.01023019, + "balance_loss_clip": 1.01242161, + "balance_loss_mlp": 1.0346477, + "epoch": 0.8072749135728243, + "flos": 26688164215680.0, + "grad_norm": 1.7578530732787132, + "language_loss": 0.72718084, + "learning_rate": 3.770356705530997e-07, + "loss": 0.74840856, + "num_input_tokens_seen": 289781025, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65234375, + "step": 13427, + "time_per_iteration": 2.503213405609131 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.01037925, + "balance_loss_clip": 1.02561057, + "balance_loss_mlp": 1.03508282, + "epoch": 0.8073350368254922, + "flos": 19240291071360.0, + "grad_norm": 1.5806827862127526, + "language_loss": 0.69905955, + "learning_rate": 3.768081088042774e-07, + "loss": 0.72044694, + "num_input_tokens_seen": 289798380, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 13428, + "time_per_iteration": 3.9154212474823 + }, + { + "auxiliary_loss_clip": 0.01100554, + "auxiliary_loss_mlp": 0.01028315, + "balance_loss_clip": 1.01791382, + "balance_loss_mlp": 1.03464985, + "epoch": 0.8073951600781603, + "flos": 13334530579200.0, + "grad_norm": 1.7256897581307475, + "language_loss": 0.74537814, + "learning_rate": 3.765806086070544e-07, + "loss": 0.76666689, + "num_input_tokens_seen": 289814515, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.65625, + "step": 13429, + "time_per_iteration": 3.8353562355041504 + }, + { + "auxiliary_loss_clip": 0.01096625, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01703608, + "balance_loss_mlp": 1.03373289, + "epoch": 0.8074552833308282, + "flos": 22853191726080.0, + "grad_norm": 1.6508081444527534, + "language_loss": 0.66780758, + "learning_rate": 3.763531699700568e-07, + "loss": 0.68905354, + "num_input_tokens_seen": 289834315, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 13430, + "time_per_iteration": 3.8374648094177246 + }, + { + "auxiliary_loss_clip": 0.01099608, + "auxiliary_loss_mlp": 0.01026452, + "balance_loss_clip": 1.01557398, + "balance_loss_mlp": 1.03463328, + "epoch": 0.8075154065834962, + "flos": 20339409288960.0, + "grad_norm": 1.7401371599211086, + "language_loss": 0.80040669, + "learning_rate": 3.7612579290190994e-07, + "loss": 0.82166731, + "num_input_tokens_seen": 289853770, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 13431, + "time_per_iteration": 2.4570858478546143 + }, + { + "auxiliary_loss_clip": 0.0109803, + "auxiliary_loss_mlp": 0.01026665, + "balance_loss_clip": 1.01483989, + "balance_loss_mlp": 1.03383279, + "epoch": 0.8075755298361641, + "flos": 21908059113600.0, + "grad_norm": 1.7148074756954637, + "language_loss": 0.80367452, + "learning_rate": 3.7589847741123593e-07, + "loss": 0.82492149, + "num_input_tokens_seen": 289870480, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 13432, + "time_per_iteration": 2.436596155166626 + }, + { + "auxiliary_loss_clip": 0.0110653, + "auxiliary_loss_mlp": 0.01029872, + "balance_loss_clip": 1.01817179, + "balance_loss_mlp": 1.03757977, + "epoch": 0.8076356530888321, + "flos": 15669298609920.0, + "grad_norm": 2.2592964465029524, + "language_loss": 0.70442599, + "learning_rate": 3.7567122350665415e-07, + "loss": 0.72579002, + "num_input_tokens_seen": 289888275, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 13433, + "time_per_iteration": 3.920189142227173 + }, + { + "auxiliary_loss_clip": 0.01099536, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.01615393, + "balance_loss_mlp": 1.03418195, + "epoch": 0.8076957763415, + "flos": 37777414521600.0, + "grad_norm": 1.542675300330219, + "language_loss": 0.72662854, + "learning_rate": 3.754440311967828e-07, + "loss": 0.7478978, + "num_input_tokens_seen": 289911495, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13434, + "time_per_iteration": 2.579868793487549 + }, + { + "auxiliary_loss_clip": 0.01102649, + "auxiliary_loss_mlp": 0.01027174, + "balance_loss_clip": 1.0162847, + "balance_loss_mlp": 1.03770304, + "epoch": 0.807755899594168, + "flos": 19610781903360.0, + "grad_norm": 2.005086491420573, + "language_loss": 0.68262374, + "learning_rate": 3.752169004902361e-07, + "loss": 0.70392191, + "num_input_tokens_seen": 289930045, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13435, + "time_per_iteration": 2.4222404956817627 + }, + { + "auxiliary_loss_clip": 0.01103995, + "auxiliary_loss_mlp": 0.01032794, + "balance_loss_clip": 1.0194422, + "balance_loss_mlp": 1.03674603, + "epoch": 0.8078160228468361, + "flos": 23294893271040.0, + "grad_norm": 3.3656711098835967, + "language_loss": 0.75132048, + "learning_rate": 3.749898313956279e-07, + "loss": 0.77268833, + "num_input_tokens_seen": 289950815, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.671875, + "step": 13436, + "time_per_iteration": 2.4514195919036865 + }, + { + "auxiliary_loss_clip": 0.0109426, + "auxiliary_loss_mlp": 0.01025813, + "balance_loss_clip": 1.0144937, + "balance_loss_mlp": 1.03109729, + "epoch": 0.807876146099504, + "flos": 27162651899520.0, + "grad_norm": 1.6647863446224558, + "language_loss": 0.70325077, + "learning_rate": 3.747628239215674e-07, + "loss": 0.72445142, + "num_input_tokens_seen": 289971730, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 13437, + "time_per_iteration": 2.486090660095215 + }, + { + "auxiliary_loss_clip": 0.01100581, + "auxiliary_loss_mlp": 0.01028394, + "balance_loss_clip": 1.01785624, + "balance_loss_mlp": 1.03698409, + "epoch": 0.807936269352172, + "flos": 27160030206720.0, + "grad_norm": 1.6509450108109354, + "language_loss": 0.73176312, + "learning_rate": 3.745358780766636e-07, + "loss": 0.75305283, + "num_input_tokens_seen": 289992995, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 13438, + "time_per_iteration": 2.50380277633667 + }, + { + "auxiliary_loss_clip": 0.01097642, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01776934, + "balance_loss_mlp": 1.03364897, + "epoch": 0.8079963926048399, + "flos": 20740423703040.0, + "grad_norm": 2.024260239106251, + "language_loss": 0.77098519, + "learning_rate": 3.7430899386952344e-07, + "loss": 0.79224879, + "num_input_tokens_seen": 290009405, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13439, + "time_per_iteration": 2.4301347732543945 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01032366, + "balance_loss_clip": 1.02110648, + "balance_loss_mlp": 1.03528643, + "epoch": 0.8080565158575079, + "flos": 25009663622400.0, + "grad_norm": 1.5099931355487166, + "language_loss": 0.78758991, + "learning_rate": 3.7408217130874786e-07, + "loss": 0.80891412, + "num_input_tokens_seen": 290031085, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 13440, + "time_per_iteration": 2.5148396492004395 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.0156852, + "balance_loss_mlp": 1.03371847, + "epoch": 0.8081166391101758, + "flos": 18698076293760.0, + "grad_norm": 2.4238426069690138, + "language_loss": 0.59080982, + "learning_rate": 3.7385541040293946e-07, + "loss": 0.61209911, + "num_input_tokens_seen": 290048670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 13441, + "time_per_iteration": 2.4081990718841553 + }, + { + "auxiliary_loss_clip": 0.01099503, + "auxiliary_loss_mlp": 0.01029636, + "balance_loss_clip": 1.01740527, + "balance_loss_mlp": 1.0348506, + "epoch": 0.8081767623628439, + "flos": 19828651847040.0, + "grad_norm": 2.0479440790186696, + "language_loss": 0.76248497, + "learning_rate": 3.7362871116069684e-07, + "loss": 0.7837764, + "num_input_tokens_seen": 290064085, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 13442, + "time_per_iteration": 2.4318795204162598 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.01610458, + "balance_loss_mlp": 1.03397799, + "epoch": 0.8082368856155118, + "flos": 35772952982400.0, + "grad_norm": 1.4112567306342216, + "language_loss": 0.7047745, + "learning_rate": 3.734020735906169e-07, + "loss": 0.72603905, + "num_input_tokens_seen": 290086255, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13443, + "time_per_iteration": 2.59014892578125 + }, + { + "auxiliary_loss_clip": 0.01098748, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.02450943, + "balance_loss_mlp": 1.03480315, + "epoch": 0.8082970088681798, + "flos": 17198015489280.0, + "grad_norm": 1.822282396232332, + "language_loss": 0.82413107, + "learning_rate": 3.7317549770129286e-07, + "loss": 0.84547687, + "num_input_tokens_seen": 290103995, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13444, + "time_per_iteration": 2.4474878311157227 + }, + { + "auxiliary_loss_clip": 0.01023449, + "auxiliary_loss_mlp": 0.01001686, + "balance_loss_clip": 1.00072086, + "balance_loss_mlp": 1.00341463, + "epoch": 0.8083571321208477, + "flos": 63555207511680.0, + "grad_norm": 0.8424351195501338, + "language_loss": 0.53699923, + "learning_rate": 3.7294898350131754e-07, + "loss": 0.55725062, + "num_input_tokens_seen": 290157245, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 13445, + "time_per_iteration": 2.893291473388672 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.01029106, + "balance_loss_clip": 1.0172683, + "balance_loss_mlp": 1.03552246, + "epoch": 0.8084172553735157, + "flos": 17930701111680.0, + "grad_norm": 3.4905942687321514, + "language_loss": 0.72271657, + "learning_rate": 3.7272253099927964e-07, + "loss": 0.7440058, + "num_input_tokens_seen": 290174970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 13446, + "time_per_iteration": 2.444681167602539 + }, + { + "auxiliary_loss_clip": 0.01104016, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.01638484, + "balance_loss_mlp": 1.03635025, + "epoch": 0.8084773786261836, + "flos": 24097999507200.0, + "grad_norm": 1.848843080578613, + "language_loss": 0.71273375, + "learning_rate": 3.7249614020376606e-07, + "loss": 0.73406231, + "num_input_tokens_seen": 290194395, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 13447, + "time_per_iteration": 2.4645798206329346 + }, + { + "auxiliary_loss_clip": 0.01101801, + "auxiliary_loss_mlp": 0.01030026, + "balance_loss_clip": 1.01762831, + "balance_loss_mlp": 1.03445625, + "epoch": 0.8085375018788516, + "flos": 15588211656960.0, + "grad_norm": 2.247658895940983, + "language_loss": 0.75123751, + "learning_rate": 3.7226981112336197e-07, + "loss": 0.77255571, + "num_input_tokens_seen": 290209200, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 13448, + "time_per_iteration": 2.393450975418091 + }, + { + "auxiliary_loss_clip": 0.01023124, + "auxiliary_loss_mlp": 0.01004466, + "balance_loss_clip": 1.00351226, + "balance_loss_mlp": 1.00315809, + "epoch": 0.8085976251315197, + "flos": 67561296393600.0, + "grad_norm": 0.7387933827172105, + "language_loss": 0.63826883, + "learning_rate": 3.7204354376665024e-07, + "loss": 0.65854478, + "num_input_tokens_seen": 290274565, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 13449, + "time_per_iteration": 3.089714288711548 + }, + { + "auxiliary_loss_clip": 0.01100909, + "auxiliary_loss_mlp": 0.01025967, + "balance_loss_clip": 1.01415896, + "balance_loss_mlp": 1.03577614, + "epoch": 0.8086577483841876, + "flos": 22561453463040.0, + "grad_norm": 1.6804570803632504, + "language_loss": 0.73693436, + "learning_rate": 3.718173381422105e-07, + "loss": 0.75820303, + "num_input_tokens_seen": 290293630, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13450, + "time_per_iteration": 2.4564168453216553 + }, + { + "auxiliary_loss_clip": 0.01098501, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.01766062, + "balance_loss_mlp": 1.03300965, + "epoch": 0.8087178716368556, + "flos": 17968084191360.0, + "grad_norm": 1.9099167962984258, + "language_loss": 0.73742312, + "learning_rate": 3.7159119425861986e-07, + "loss": 0.75869507, + "num_input_tokens_seen": 290311450, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13451, + "time_per_iteration": 2.4158482551574707 + }, + { + "auxiliary_loss_clip": 0.01104266, + "auxiliary_loss_mlp": 0.01027862, + "balance_loss_clip": 1.01489806, + "balance_loss_mlp": 1.03483427, + "epoch": 0.8087779948895235, + "flos": 21719527603200.0, + "grad_norm": 1.7227484700125357, + "language_loss": 0.80100703, + "learning_rate": 3.713651121244543e-07, + "loss": 0.82232833, + "num_input_tokens_seen": 290330165, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 13452, + "time_per_iteration": 2.4718620777130127 + }, + { + "auxiliary_loss_clip": 0.01101927, + "auxiliary_loss_mlp": 0.01034298, + "balance_loss_clip": 1.02291918, + "balance_loss_mlp": 1.03578424, + "epoch": 0.8088381181421915, + "flos": 29092885983360.0, + "grad_norm": 3.0126577683381246, + "language_loss": 0.78564459, + "learning_rate": 3.711390917482875e-07, + "loss": 0.80700684, + "num_input_tokens_seen": 290350815, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 13453, + "time_per_iteration": 2.4844601154327393 + }, + { + "auxiliary_loss_clip": 0.01098121, + "auxiliary_loss_mlp": 0.01027888, + "balance_loss_clip": 1.01561522, + "balance_loss_mlp": 1.03227544, + "epoch": 0.8088982413948594, + "flos": 22198432659840.0, + "grad_norm": 3.8296608208980762, + "language_loss": 0.77353287, + "learning_rate": 3.709131331386892e-07, + "loss": 0.79479295, + "num_input_tokens_seen": 290367380, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13454, + "time_per_iteration": 2.4381799697875977 + }, + { + "auxiliary_loss_clip": 0.01097801, + "auxiliary_loss_mlp": 0.01030092, + "balance_loss_clip": 1.01824236, + "balance_loss_mlp": 1.03329492, + "epoch": 0.8089583646475275, + "flos": 28036717453440.0, + "grad_norm": 1.6657620401651272, + "language_loss": 0.7656436, + "learning_rate": 3.7068723630422795e-07, + "loss": 0.78692257, + "num_input_tokens_seen": 290387965, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 13455, + "time_per_iteration": 2.489542007446289 + }, + { + "auxiliary_loss_clip": 0.01099517, + "auxiliary_loss_mlp": 0.01026999, + "balance_loss_clip": 1.01534009, + "balance_loss_mlp": 1.03383231, + "epoch": 0.8090184879001954, + "flos": 16617735273600.0, + "grad_norm": 1.6998136101737356, + "language_loss": 0.78545928, + "learning_rate": 3.70461401253471e-07, + "loss": 0.80672443, + "num_input_tokens_seen": 290404150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13456, + "time_per_iteration": 2.493177890777588 + }, + { + "auxiliary_loss_clip": 0.0110144, + "auxiliary_loss_mlp": 0.01033934, + "balance_loss_clip": 1.02302623, + "balance_loss_mlp": 1.03721011, + "epoch": 0.8090786111528634, + "flos": 27340804379520.0, + "grad_norm": 2.0933677582295265, + "language_loss": 0.71244174, + "learning_rate": 3.702356279949801e-07, + "loss": 0.7337954, + "num_input_tokens_seen": 290422370, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 13457, + "time_per_iteration": 2.50264835357666 + }, + { + "auxiliary_loss_clip": 0.01099652, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.01955891, + "balance_loss_mlp": 1.03506947, + "epoch": 0.8091387344055313, + "flos": 21105742976640.0, + "grad_norm": 1.8174801771969786, + "language_loss": 0.72725999, + "learning_rate": 3.700099165373176e-07, + "loss": 0.74855614, + "num_input_tokens_seen": 290442645, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.64453125, + "step": 13458, + "time_per_iteration": 2.4687604904174805 + }, + { + "auxiliary_loss_clip": 0.01100692, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.0188396, + "balance_loss_mlp": 1.03538537, + "epoch": 0.8091988576581993, + "flos": 11655060318720.0, + "grad_norm": 9.489100593414795, + "language_loss": 0.78715897, + "learning_rate": 3.6978426688904275e-07, + "loss": 0.80847281, + "num_input_tokens_seen": 290458520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13459, + "time_per_iteration": 2.459733724594116 + }, + { + "auxiliary_loss_clip": 0.01101626, + "auxiliary_loss_mlp": 0.01029005, + "balance_loss_clip": 1.0167737, + "balance_loss_mlp": 1.03463078, + "epoch": 0.8092589809108672, + "flos": 22963329803520.0, + "grad_norm": 2.336644313106872, + "language_loss": 0.80171156, + "learning_rate": 3.695586790587113e-07, + "loss": 0.82301795, + "num_input_tokens_seen": 290474465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 13460, + "time_per_iteration": 2.4446218013763428 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01032325, + "balance_loss_clip": 1.01998675, + "balance_loss_mlp": 1.03367543, + "epoch": 0.8093191041635353, + "flos": 13260985482240.0, + "grad_norm": 1.8186095835503757, + "language_loss": 0.84812057, + "learning_rate": 3.693331530548789e-07, + "loss": 0.86945391, + "num_input_tokens_seen": 290492060, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.67578125, + "step": 13461, + "time_per_iteration": 2.400993824005127 + }, + { + "auxiliary_loss_clip": 0.0110315, + "auxiliary_loss_mlp": 0.01036215, + "balance_loss_clip": 1.02418709, + "balance_loss_mlp": 1.03562999, + "epoch": 0.8093792274162032, + "flos": 25516003691520.0, + "grad_norm": 1.848257188475226, + "language_loss": 0.76413333, + "learning_rate": 3.69107688886096e-07, + "loss": 0.78552705, + "num_input_tokens_seen": 290511510, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 13462, + "time_per_iteration": 2.476879358291626 + }, + { + "auxiliary_loss_clip": 0.01102421, + "auxiliary_loss_mlp": 0.01033505, + "balance_loss_clip": 1.02077329, + "balance_loss_mlp": 1.03630662, + "epoch": 0.8094393506688712, + "flos": 23546483107200.0, + "grad_norm": 2.0825422363355948, + "language_loss": 0.82803857, + "learning_rate": 3.6888228656091357e-07, + "loss": 0.84939778, + "num_input_tokens_seen": 290530035, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 13463, + "time_per_iteration": 2.4521071910858154 + }, + { + "auxiliary_loss_clip": 0.01099095, + "auxiliary_loss_mlp": 0.01032745, + "balance_loss_clip": 1.0219866, + "balance_loss_mlp": 1.03470421, + "epoch": 0.8094994739215392, + "flos": 17055917285760.0, + "grad_norm": 5.844094604109171, + "language_loss": 0.62201041, + "learning_rate": 3.686569460878779e-07, + "loss": 0.64332885, + "num_input_tokens_seen": 290548245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 13464, + "time_per_iteration": 2.424069404602051 + }, + { + "auxiliary_loss_clip": 0.01097475, + "auxiliary_loss_mlp": 0.01028202, + "balance_loss_clip": 1.01769936, + "balance_loss_mlp": 1.03367341, + "epoch": 0.8095595971742071, + "flos": 23551223702400.0, + "grad_norm": 1.5079882222781815, + "language_loss": 0.61727977, + "learning_rate": 3.684316674755341e-07, + "loss": 0.63853657, + "num_input_tokens_seen": 290568625, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 13465, + "time_per_iteration": 2.4566633701324463 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01034182, + "balance_loss_clip": 1.02232695, + "balance_loss_mlp": 1.03666687, + "epoch": 0.8096197204268751, + "flos": 20373201008640.0, + "grad_norm": 1.6661852596704285, + "language_loss": 0.81980264, + "learning_rate": 3.682064507324256e-07, + "loss": 0.84115314, + "num_input_tokens_seen": 290586575, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 13466, + "time_per_iteration": 2.4338531494140625 + }, + { + "auxiliary_loss_clip": 0.01103687, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.02221847, + "balance_loss_mlp": 1.0364902, + "epoch": 0.809679843679543, + "flos": 27818775682560.0, + "grad_norm": 1.9379402602159286, + "language_loss": 0.76123488, + "learning_rate": 3.6798129586709204e-07, + "loss": 0.78260958, + "num_input_tokens_seen": 290606790, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 13467, + "time_per_iteration": 2.5050792694091797 + }, + { + "auxiliary_loss_clip": 0.01096837, + "auxiliary_loss_mlp": 0.01024677, + "balance_loss_clip": 1.01335227, + "balance_loss_mlp": 1.03137767, + "epoch": 0.8097399669322111, + "flos": 22014103040640.0, + "grad_norm": 1.827082362684537, + "language_loss": 0.79509449, + "learning_rate": 3.6775620288807073e-07, + "loss": 0.81630957, + "num_input_tokens_seen": 290625525, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13468, + "time_per_iteration": 2.4531302452087402 + }, + { + "auxiliary_loss_clip": 0.01095055, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01831162, + "balance_loss_mlp": 1.03250098, + "epoch": 0.809800090184879, + "flos": 18988988544000.0, + "grad_norm": 2.16796452248889, + "language_loss": 0.67542112, + "learning_rate": 3.675311718038978e-07, + "loss": 0.69666153, + "num_input_tokens_seen": 290644935, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 13469, + "time_per_iteration": 2.4108262062072754 + }, + { + "auxiliary_loss_clip": 0.01022711, + "auxiliary_loss_mlp": 0.01002994, + "balance_loss_clip": 1.00204587, + "balance_loss_mlp": 1.00256538, + "epoch": 0.809860213437547, + "flos": 66099516508800.0, + "grad_norm": 0.6937103683167268, + "language_loss": 0.54675603, + "learning_rate": 3.6730620262310683e-07, + "loss": 0.56701303, + "num_input_tokens_seen": 290710735, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20117188, + "step": 13470, + "time_per_iteration": 4.479866027832031 + }, + { + "auxiliary_loss_clip": 0.01098507, + "auxiliary_loss_mlp": 0.01029422, + "balance_loss_clip": 1.01866305, + "balance_loss_mlp": 1.03306627, + "epoch": 0.8099203366902149, + "flos": 20882485992960.0, + "grad_norm": 1.8860877389353394, + "language_loss": 0.69289327, + "learning_rate": 3.670812953542279e-07, + "loss": 0.71417254, + "num_input_tokens_seen": 290729565, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 13471, + "time_per_iteration": 3.869608163833618 + }, + { + "auxiliary_loss_clip": 0.01099092, + "auxiliary_loss_mlp": 0.01024651, + "balance_loss_clip": 1.01318324, + "balance_loss_mlp": 1.03399885, + "epoch": 0.8099804599428829, + "flos": 26030927111040.0, + "grad_norm": 1.6760618774214828, + "language_loss": 0.79667246, + "learning_rate": 3.6685645000579003e-07, + "loss": 0.8179099, + "num_input_tokens_seen": 290749360, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13472, + "time_per_iteration": 3.874138355255127 + }, + { + "auxiliary_loss_clip": 0.01022918, + "auxiliary_loss_mlp": 0.01002051, + "balance_loss_clip": 1.00103176, + "balance_loss_mlp": 1.00270104, + "epoch": 0.8100405831955508, + "flos": 69303573584640.0, + "grad_norm": 0.7480344788925887, + "language_loss": 0.57732165, + "learning_rate": 3.666316665863201e-07, + "loss": 0.59757125, + "num_input_tokens_seen": 290812145, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20214844, + "step": 13473, + "time_per_iteration": 2.9958584308624268 + }, + { + "auxiliary_loss_clip": 0.01101746, + "auxiliary_loss_mlp": 0.01028502, + "balance_loss_clip": 1.01672435, + "balance_loss_mlp": 1.03484774, + "epoch": 0.8101007064482189, + "flos": 15012492468480.0, + "grad_norm": 2.1302900400638727, + "language_loss": 0.73930925, + "learning_rate": 3.664069451043399e-07, + "loss": 0.76061177, + "num_input_tokens_seen": 290829845, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 13474, + "time_per_iteration": 2.4078030586242676 + }, + { + "auxiliary_loss_clip": 0.01103776, + "auxiliary_loss_mlp": 0.01032761, + "balance_loss_clip": 1.02123356, + "balance_loss_mlp": 1.03630209, + "epoch": 0.8101608297008868, + "flos": 21067210661760.0, + "grad_norm": 1.7301806591727757, + "language_loss": 0.79092455, + "learning_rate": 3.661822855683723e-07, + "loss": 0.81228995, + "num_input_tokens_seen": 290848815, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13475, + "time_per_iteration": 3.9835152626037598 + }, + { + "auxiliary_loss_clip": 0.01098463, + "auxiliary_loss_mlp": 0.01034903, + "balance_loss_clip": 1.02390599, + "balance_loss_mlp": 1.0341773, + "epoch": 0.8102209529535548, + "flos": 23731279603200.0, + "grad_norm": 1.536909800209771, + "language_loss": 0.75346851, + "learning_rate": 3.659576879869364e-07, + "loss": 0.77480221, + "num_input_tokens_seen": 290868580, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 13476, + "time_per_iteration": 2.462615728378296 + }, + { + "auxiliary_loss_clip": 0.01102891, + "auxiliary_loss_mlp": 0.01035167, + "balance_loss_clip": 1.02225113, + "balance_loss_mlp": 1.03499579, + "epoch": 0.8102810762062228, + "flos": 10955879107200.0, + "grad_norm": 2.160073073181854, + "language_loss": 0.73751932, + "learning_rate": 3.657331523685485e-07, + "loss": 0.75889993, + "num_input_tokens_seen": 290883540, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 13477, + "time_per_iteration": 2.396301031112671 + }, + { + "auxiliary_loss_clip": 0.01098807, + "auxiliary_loss_mlp": 0.01031967, + "balance_loss_clip": 1.02095175, + "balance_loss_mlp": 1.03388894, + "epoch": 0.8103411994588907, + "flos": 14648825220480.0, + "grad_norm": 2.052818200341471, + "language_loss": 0.69685113, + "learning_rate": 3.6550867872172365e-07, + "loss": 0.71815884, + "num_input_tokens_seen": 290901560, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13478, + "time_per_iteration": 2.429624557495117 + }, + { + "auxiliary_loss_clip": 0.01022393, + "auxiliary_loss_mlp": 0.00998048, + "balance_loss_clip": 0.99711275, + "balance_loss_mlp": 1.00228572, + "epoch": 0.8104013227115587, + "flos": 59153314665600.0, + "grad_norm": 0.6817621273337255, + "language_loss": 0.52143216, + "learning_rate": 3.6528426705497293e-07, + "loss": 0.54163659, + "num_input_tokens_seen": 290959185, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 13479, + "time_per_iteration": 2.9901397228240967 + }, + { + "auxiliary_loss_clip": 0.01099068, + "auxiliary_loss_mlp": 0.01030237, + "balance_loss_clip": 1.0192523, + "balance_loss_mlp": 1.03441501, + "epoch": 0.8104614459642266, + "flos": 19828687760640.0, + "grad_norm": 1.7169368988258746, + "language_loss": 0.71180439, + "learning_rate": 3.650599173768072e-07, + "loss": 0.73309743, + "num_input_tokens_seen": 290979585, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 13480, + "time_per_iteration": 2.455625295639038 + }, + { + "auxiliary_loss_clip": 0.01101048, + "auxiliary_loss_mlp": 0.01030896, + "balance_loss_clip": 1.01992917, + "balance_loss_mlp": 1.03478885, + "epoch": 0.8105215692168947, + "flos": 25374264624000.0, + "grad_norm": 1.8294691640440002, + "language_loss": 0.79820704, + "learning_rate": 3.648356296957327e-07, + "loss": 0.81952655, + "num_input_tokens_seen": 291000865, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13481, + "time_per_iteration": 2.4734697341918945 + }, + { + "auxiliary_loss_clip": 0.01098939, + "auxiliary_loss_mlp": 0.01030269, + "balance_loss_clip": 1.01901519, + "balance_loss_mlp": 1.03369451, + "epoch": 0.8105816924695626, + "flos": 20481722974080.0, + "grad_norm": 1.7697725439272614, + "language_loss": 0.72478992, + "learning_rate": 3.646114040202548e-07, + "loss": 0.74608201, + "num_input_tokens_seen": 291018285, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13482, + "time_per_iteration": 2.439736843109131 + }, + { + "auxiliary_loss_clip": 0.01100486, + "auxiliary_loss_mlp": 0.01025923, + "balance_loss_clip": 1.01400197, + "balance_loss_mlp": 1.03284776, + "epoch": 0.8106418157222306, + "flos": 14538687143040.0, + "grad_norm": 2.090719044205904, + "language_loss": 0.65953445, + "learning_rate": 3.6438724035887705e-07, + "loss": 0.68079859, + "num_input_tokens_seen": 291035745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13483, + "time_per_iteration": 2.444854259490967 + }, + { + "auxiliary_loss_clip": 0.01099291, + "auxiliary_loss_mlp": 0.01027053, + "balance_loss_clip": 1.01493549, + "balance_loss_mlp": 1.0334847, + "epoch": 0.8107019389748985, + "flos": 22564470205440.0, + "grad_norm": 1.6031206001682317, + "language_loss": 0.76335526, + "learning_rate": 3.641631387200992e-07, + "loss": 0.78461868, + "num_input_tokens_seen": 291053280, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 13484, + "time_per_iteration": 2.464406728744507 + }, + { + "auxiliary_loss_clip": 0.0110487, + "auxiliary_loss_mlp": 0.01034132, + "balance_loss_clip": 1.02137613, + "balance_loss_mlp": 1.03535843, + "epoch": 0.8107620622275665, + "flos": 19609560840960.0, + "grad_norm": 1.6053200724727246, + "language_loss": 0.72207975, + "learning_rate": 3.639390991124183e-07, + "loss": 0.74346977, + "num_input_tokens_seen": 291072855, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 13485, + "time_per_iteration": 2.4401731491088867 + }, + { + "auxiliary_loss_clip": 0.01096529, + "auxiliary_loss_mlp": 0.01026674, + "balance_loss_clip": 1.01576591, + "balance_loss_mlp": 1.03370321, + "epoch": 0.8108221854802344, + "flos": 16143498984960.0, + "grad_norm": 1.8011758886581477, + "language_loss": 0.75758684, + "learning_rate": 3.637151215443308e-07, + "loss": 0.77881885, + "num_input_tokens_seen": 291090285, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 13486, + "time_per_iteration": 2.4450883865356445 + }, + { + "auxiliary_loss_clip": 0.01102508, + "auxiliary_loss_mlp": 0.01029573, + "balance_loss_clip": 1.01787281, + "balance_loss_mlp": 1.03437519, + "epoch": 0.8108823087329025, + "flos": 21106209853440.0, + "grad_norm": 2.036003416638632, + "language_loss": 0.72445893, + "learning_rate": 3.6349120602433045e-07, + "loss": 0.74577975, + "num_input_tokens_seen": 291107675, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13487, + "time_per_iteration": 2.4479668140411377 + }, + { + "auxiliary_loss_clip": 0.01097974, + "auxiliary_loss_mlp": 0.01030011, + "balance_loss_clip": 1.01929998, + "balance_loss_mlp": 1.03596091, + "epoch": 0.8109424319855704, + "flos": 29199648182400.0, + "grad_norm": 1.6462408552196026, + "language_loss": 0.84215033, + "learning_rate": 3.6326735256090715e-07, + "loss": 0.86343014, + "num_input_tokens_seen": 291126900, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 13488, + "time_per_iteration": 2.4955086708068848 + }, + { + "auxiliary_loss_clip": 0.01102332, + "auxiliary_loss_mlp": 0.01031462, + "balance_loss_clip": 1.01954138, + "balance_loss_mlp": 1.03592181, + "epoch": 0.8110025552382384, + "flos": 23111856541440.0, + "grad_norm": 1.790341978719953, + "language_loss": 0.73587167, + "learning_rate": 3.630435611625502e-07, + "loss": 0.75720966, + "num_input_tokens_seen": 291145285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13489, + "time_per_iteration": 2.4345548152923584 + }, + { + "auxiliary_loss_clip": 0.0109709, + "auxiliary_loss_mlp": 0.01028913, + "balance_loss_clip": 1.01749909, + "balance_loss_mlp": 1.03397191, + "epoch": 0.8110626784909064, + "flos": 22379961018240.0, + "grad_norm": 2.056957564654556, + "language_loss": 0.71371531, + "learning_rate": 3.628198318377453e-07, + "loss": 0.73497528, + "num_input_tokens_seen": 291163485, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 13490, + "time_per_iteration": 2.458850622177124 + }, + { + "auxiliary_loss_clip": 0.01103063, + "auxiliary_loss_mlp": 0.01038299, + "balance_loss_clip": 1.02602684, + "balance_loss_mlp": 1.03627634, + "epoch": 0.8111228017435743, + "flos": 23368043318400.0, + "grad_norm": 2.627624655824894, + "language_loss": 0.72095811, + "learning_rate": 3.625961645949762e-07, + "loss": 0.74237174, + "num_input_tokens_seen": 291182215, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13491, + "time_per_iteration": 2.435976266860962 + }, + { + "auxiliary_loss_clip": 0.01099382, + "auxiliary_loss_mlp": 0.01029337, + "balance_loss_clip": 1.01806605, + "balance_loss_mlp": 1.03369725, + "epoch": 0.8111829249962423, + "flos": 21286553063040.0, + "grad_norm": 1.475540339254428, + "language_loss": 0.67907929, + "learning_rate": 3.623725594427245e-07, + "loss": 0.70036656, + "num_input_tokens_seen": 291203145, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13492, + "time_per_iteration": 2.4556221961975098 + }, + { + "auxiliary_loss_clip": 0.0110221, + "auxiliary_loss_mlp": 0.01029201, + "balance_loss_clip": 1.01752472, + "balance_loss_mlp": 1.03487253, + "epoch": 0.8112430482489102, + "flos": 22345558767360.0, + "grad_norm": 2.049238778723241, + "language_loss": 0.72220272, + "learning_rate": 3.6214901638947006e-07, + "loss": 0.7435168, + "num_input_tokens_seen": 291220600, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 13493, + "time_per_iteration": 2.447108030319214 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01036233, + "balance_loss_clip": 1.02388942, + "balance_loss_mlp": 1.03342462, + "epoch": 0.8113031715015783, + "flos": 31138321962240.0, + "grad_norm": 1.6962255282126324, + "language_loss": 0.70346606, + "learning_rate": 3.619255354436885e-07, + "loss": 0.72482872, + "num_input_tokens_seen": 291241195, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 13494, + "time_per_iteration": 2.503356456756592 + }, + { + "auxiliary_loss_clip": 0.01104239, + "auxiliary_loss_mlp": 0.01033492, + "balance_loss_clip": 1.02064085, + "balance_loss_mlp": 1.03645778, + "epoch": 0.8113632947542462, + "flos": 25335445000320.0, + "grad_norm": 2.1915991847762966, + "language_loss": 0.76373303, + "learning_rate": 3.6170211661385543e-07, + "loss": 0.78511035, + "num_input_tokens_seen": 291258715, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 13495, + "time_per_iteration": 2.4522132873535156 + }, + { + "auxiliary_loss_clip": 0.01100729, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.0229249, + "balance_loss_mlp": 1.03444338, + "epoch": 0.8114234180069142, + "flos": 28439168411520.0, + "grad_norm": 1.9013360274745676, + "language_loss": 0.80117953, + "learning_rate": 3.614787599084417e-07, + "loss": 0.82253182, + "num_input_tokens_seen": 291278030, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13496, + "time_per_iteration": 2.4913132190704346 + }, + { + "auxiliary_loss_clip": 0.01100021, + "auxiliary_loss_mlp": 0.01033487, + "balance_loss_clip": 1.02057636, + "balance_loss_mlp": 1.03446186, + "epoch": 0.8114835412595821, + "flos": 20338870584960.0, + "grad_norm": 1.6264176986100232, + "language_loss": 0.70963192, + "learning_rate": 3.6125546533591787e-07, + "loss": 0.73096704, + "num_input_tokens_seen": 291296740, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.65625, + "step": 13497, + "time_per_iteration": 2.4535224437713623 + }, + { + "auxiliary_loss_clip": 0.01101999, + "auxiliary_loss_mlp": 0.01028348, + "balance_loss_clip": 1.01754749, + "balance_loss_mlp": 1.03544033, + "epoch": 0.8115436645122501, + "flos": 22490889194880.0, + "grad_norm": 1.6440600929050224, + "language_loss": 0.76892304, + "learning_rate": 3.610322329047508e-07, + "loss": 0.79022652, + "num_input_tokens_seen": 291318730, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 13498, + "time_per_iteration": 2.4672887325286865 + }, + { + "auxiliary_loss_clip": 0.01099408, + "auxiliary_loss_mlp": 0.01036188, + "balance_loss_clip": 1.02421904, + "balance_loss_mlp": 1.03345525, + "epoch": 0.811603787764918, + "flos": 13845288021120.0, + "grad_norm": 1.8169104035593646, + "language_loss": 0.83573735, + "learning_rate": 3.608090626234055e-07, + "loss": 0.85709327, + "num_input_tokens_seen": 291336755, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 13499, + "time_per_iteration": 2.436964273452759 + }, + { + "auxiliary_loss_clip": 0.01098883, + "auxiliary_loss_mlp": 0.01028492, + "balance_loss_clip": 1.01583743, + "balance_loss_mlp": 1.0345273, + "epoch": 0.8116639110175861, + "flos": 21614632911360.0, + "grad_norm": 1.6399516291980092, + "language_loss": 0.7623418, + "learning_rate": 3.6058595450034603e-07, + "loss": 0.78361559, + "num_input_tokens_seen": 291356795, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.640625, + "step": 13500, + "time_per_iteration": 2.4722161293029785 + }, + { + "auxiliary_loss_clip": 0.01022943, + "auxiliary_loss_mlp": 0.01002875, + "balance_loss_clip": 1.00186753, + "balance_loss_mlp": 1.0028348, + "epoch": 0.811724034270254, + "flos": 64459799625600.0, + "grad_norm": 0.8054655192024942, + "language_loss": 0.59980321, + "learning_rate": 3.603629085440303e-07, + "loss": 0.62006134, + "num_input_tokens_seen": 291416005, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 13501, + "time_per_iteration": 3.075920820236206 + }, + { + "auxiliary_loss_clip": 0.01096394, + "auxiliary_loss_mlp": 0.01025554, + "balance_loss_clip": 1.01445556, + "balance_loss_mlp": 1.03376746, + "epoch": 0.811784157522922, + "flos": 24754123290240.0, + "grad_norm": 1.494419100022629, + "language_loss": 0.7909618, + "learning_rate": 3.6013992476291753e-07, + "loss": 0.81218129, + "num_input_tokens_seen": 291434870, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.625, + "step": 13502, + "time_per_iteration": 2.4842851161956787 + }, + { + "auxiliary_loss_clip": 0.01099167, + "auxiliary_loss_mlp": 0.01030482, + "balance_loss_clip": 1.01933587, + "balance_loss_mlp": 1.03465641, + "epoch": 0.81184428077559, + "flos": 12167146563840.0, + "grad_norm": 1.8524419382640553, + "language_loss": 0.71067178, + "learning_rate": 3.599170031654635e-07, + "loss": 0.73196828, + "num_input_tokens_seen": 291452230, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 13503, + "time_per_iteration": 2.4225172996520996 + }, + { + "auxiliary_loss_clip": 0.01100085, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.01456833, + "balance_loss_mlp": 1.0341773, + "epoch": 0.8119044040282579, + "flos": 44422037775360.0, + "grad_norm": 1.453799987643089, + "language_loss": 0.67700541, + "learning_rate": 3.5969414376012065e-07, + "loss": 0.69828039, + "num_input_tokens_seen": 291477425, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66015625, + "step": 13504, + "time_per_iteration": 2.6496918201446533 + }, + { + "auxiliary_loss_clip": 0.01098923, + "auxiliary_loss_mlp": 0.0102621, + "balance_loss_clip": 1.01401496, + "balance_loss_mlp": 1.03196406, + "epoch": 0.8119645272809259, + "flos": 52155507957120.0, + "grad_norm": 1.9644950813990756, + "language_loss": 0.7421549, + "learning_rate": 3.594713465553403e-07, + "loss": 0.76340622, + "num_input_tokens_seen": 291501070, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 13505, + "time_per_iteration": 2.7024779319763184 + }, + { + "auxiliary_loss_clip": 0.01101045, + "auxiliary_loss_mlp": 0.01026961, + "balance_loss_clip": 1.01418757, + "balance_loss_mlp": 1.03452218, + "epoch": 0.8120246505335939, + "flos": 30232978640640.0, + "grad_norm": 2.0008882636590863, + "language_loss": 0.72537345, + "learning_rate": 3.5924861155957123e-07, + "loss": 0.7466535, + "num_input_tokens_seen": 291524945, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 13506, + "time_per_iteration": 2.6178457736968994 + }, + { + "auxiliary_loss_clip": 0.01103788, + "auxiliary_loss_mlp": 0.01029775, + "balance_loss_clip": 1.01809824, + "balance_loss_mlp": 1.03516591, + "epoch": 0.8120847737862619, + "flos": 22127652910080.0, + "grad_norm": 2.171112313487914, + "language_loss": 0.76039851, + "learning_rate": 3.590259387812593e-07, + "loss": 0.78173417, + "num_input_tokens_seen": 291544605, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 13507, + "time_per_iteration": 2.4627292156219482 + }, + { + "auxiliary_loss_clip": 0.01100119, + "auxiliary_loss_mlp": 0.01027115, + "balance_loss_clip": 1.01539683, + "balance_loss_mlp": 1.03240228, + "epoch": 0.8121448970389298, + "flos": 23295180579840.0, + "grad_norm": 1.60963129447103, + "language_loss": 0.70528185, + "learning_rate": 3.5880332822884783e-07, + "loss": 0.72655416, + "num_input_tokens_seen": 291563850, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13508, + "time_per_iteration": 2.5088045597076416 + }, + { + "auxiliary_loss_clip": 0.01098821, + "auxiliary_loss_mlp": 0.01029098, + "balance_loss_clip": 1.01806569, + "balance_loss_mlp": 1.03429413, + "epoch": 0.8122050202915978, + "flos": 22164138149760.0, + "grad_norm": 1.6081819995650735, + "language_loss": 0.75921357, + "learning_rate": 3.585807799107785e-07, + "loss": 0.78049272, + "num_input_tokens_seen": 291581730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13509, + "time_per_iteration": 2.5896267890930176 + }, + { + "auxiliary_loss_clip": 0.0110263, + "auxiliary_loss_mlp": 0.01031713, + "balance_loss_clip": 1.01973832, + "balance_loss_mlp": 1.03542197, + "epoch": 0.8122651435442657, + "flos": 23258946735360.0, + "grad_norm": 2.5116531958585377, + "language_loss": 0.76849926, + "learning_rate": 3.58358293835491e-07, + "loss": 0.78984267, + "num_input_tokens_seen": 291601225, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 13510, + "time_per_iteration": 2.5139570236206055 + }, + { + "auxiliary_loss_clip": 0.01102069, + "auxiliary_loss_mlp": 0.01031746, + "balance_loss_clip": 1.01926446, + "balance_loss_mlp": 1.03460789, + "epoch": 0.8123252667969337, + "flos": 16140015365760.0, + "grad_norm": 1.7922346850114963, + "language_loss": 0.69833112, + "learning_rate": 3.581358700114212e-07, + "loss": 0.71966922, + "num_input_tokens_seen": 291616995, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 13511, + "time_per_iteration": 3.824244737625122 + }, + { + "auxiliary_loss_clip": 0.01102581, + "auxiliary_loss_mlp": 0.01035583, + "balance_loss_clip": 1.02370417, + "balance_loss_mlp": 1.03556788, + "epoch": 0.8123853900496016, + "flos": 21245399055360.0, + "grad_norm": 1.6910823817880791, + "language_loss": 0.79742736, + "learning_rate": 3.57913508447004e-07, + "loss": 0.81880891, + "num_input_tokens_seen": 291636145, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 13512, + "time_per_iteration": 3.9224977493286133 + }, + { + "auxiliary_loss_clip": 0.01096955, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.01904023, + "balance_loss_mlp": 1.03273702, + "epoch": 0.8124455133022697, + "flos": 64377596373120.0, + "grad_norm": 1.6257879810595937, + "language_loss": 0.63466936, + "learning_rate": 3.5769120915067076e-07, + "loss": 0.65594023, + "num_input_tokens_seen": 291662440, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 13513, + "time_per_iteration": 4.236290454864502 + }, + { + "auxiliary_loss_clip": 0.01102479, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.02036476, + "balance_loss_mlp": 1.03472722, + "epoch": 0.8125056365549376, + "flos": 23842207779840.0, + "grad_norm": 1.7631319357597248, + "language_loss": 0.71392423, + "learning_rate": 3.5746897213085194e-07, + "loss": 0.73526937, + "num_input_tokens_seen": 291680950, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 13514, + "time_per_iteration": 2.4483985900878906 + }, + { + "auxiliary_loss_clip": 0.01096174, + "auxiliary_loss_mlp": 0.01027979, + "balance_loss_clip": 1.016523, + "balance_loss_mlp": 1.03252888, + "epoch": 0.8125657598076056, + "flos": 23550325862400.0, + "grad_norm": 1.5280686394731957, + "language_loss": 0.62873226, + "learning_rate": 3.5724679739597364e-07, + "loss": 0.64997381, + "num_input_tokens_seen": 291702395, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13515, + "time_per_iteration": 2.4698777198791504 + }, + { + "auxiliary_loss_clip": 0.01093097, + "auxiliary_loss_mlp": 0.01027085, + "balance_loss_clip": 1.01553404, + "balance_loss_mlp": 1.03191626, + "epoch": 0.8126258830602736, + "flos": 20704225772160.0, + "grad_norm": 2.3397384138654482, + "language_loss": 0.7533434, + "learning_rate": 3.570246849544616e-07, + "loss": 0.77454519, + "num_input_tokens_seen": 291721135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.609375, + "step": 13516, + "time_per_iteration": 2.437889337539673 + }, + { + "auxiliary_loss_clip": 0.01101307, + "auxiliary_loss_mlp": 0.01031935, + "balance_loss_clip": 1.02078295, + "balance_loss_mlp": 1.03491974, + "epoch": 0.8126860063129415, + "flos": 23618160696960.0, + "grad_norm": 1.450755820656369, + "language_loss": 0.91134322, + "learning_rate": 3.5680263481473907e-07, + "loss": 0.9326756, + "num_input_tokens_seen": 291741235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 13517, + "time_per_iteration": 3.8920905590057373 + }, + { + "auxiliary_loss_clip": 0.01103043, + "auxiliary_loss_mlp": 0.01031105, + "balance_loss_clip": 1.01977992, + "balance_loss_mlp": 1.03670573, + "epoch": 0.8127461295656095, + "flos": 25007149670400.0, + "grad_norm": 1.4138276648329293, + "language_loss": 0.78618169, + "learning_rate": 3.565806469852244e-07, + "loss": 0.80752319, + "num_input_tokens_seen": 291761430, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 13518, + "time_per_iteration": 2.4696271419525146 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01029067, + "balance_loss_clip": 1.01901174, + "balance_loss_mlp": 1.03644419, + "epoch": 0.8128062528182775, + "flos": 27342169096320.0, + "grad_norm": 1.5355148444526316, + "language_loss": 0.7910862, + "learning_rate": 3.56358721474336e-07, + "loss": 0.81238753, + "num_input_tokens_seen": 291781755, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.64453125, + "step": 13519, + "time_per_iteration": 2.4910452365875244 + }, + { + "auxiliary_loss_clip": 0.0109989, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02146196, + "balance_loss_mlp": 1.03325295, + "epoch": 0.8128663760709455, + "flos": 26506312634880.0, + "grad_norm": 1.5645727915672079, + "language_loss": 0.70513344, + "learning_rate": 3.561368582904905e-07, + "loss": 0.72646499, + "num_input_tokens_seen": 291804410, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13520, + "time_per_iteration": 2.485353708267212 + }, + { + "auxiliary_loss_clip": 0.01101276, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01775169, + "balance_loss_mlp": 1.03453207, + "epoch": 0.8129264993236134, + "flos": 17931239815680.0, + "grad_norm": 1.408947951983829, + "language_loss": 0.72724366, + "learning_rate": 3.5591505744209925e-07, + "loss": 0.74854898, + "num_input_tokens_seen": 291823285, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 13521, + "time_per_iteration": 2.450223684310913 + }, + { + "auxiliary_loss_clip": 0.01100366, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.01657629, + "balance_loss_mlp": 1.03304505, + "epoch": 0.8129866225762814, + "flos": 26177694082560.0, + "grad_norm": 1.5863649174489216, + "language_loss": 0.70147657, + "learning_rate": 3.5569331893757394e-07, + "loss": 0.7227633, + "num_input_tokens_seen": 291845305, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 13522, + "time_per_iteration": 2.5076496601104736 + }, + { + "auxiliary_loss_clip": 0.01096847, + "auxiliary_loss_mlp": 0.01029918, + "balance_loss_clip": 1.01915908, + "balance_loss_mlp": 1.03422368, + "epoch": 0.8130467458289493, + "flos": 21032197879680.0, + "grad_norm": 1.5124690207001534, + "language_loss": 0.70565176, + "learning_rate": 3.554716427853233e-07, + "loss": 0.72691941, + "num_input_tokens_seen": 291863715, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 13523, + "time_per_iteration": 2.44834566116333 + }, + { + "auxiliary_loss_clip": 0.01098014, + "auxiliary_loss_mlp": 0.01029649, + "balance_loss_clip": 1.01813328, + "balance_loss_mlp": 1.03282428, + "epoch": 0.8131068690816173, + "flos": 15487051979520.0, + "grad_norm": 2.3974878608066192, + "language_loss": 0.71435654, + "learning_rate": 3.5525002899375256e-07, + "loss": 0.73563313, + "num_input_tokens_seen": 291880735, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 13524, + "time_per_iteration": 2.3961422443389893 + }, + { + "auxiliary_loss_clip": 0.01099359, + "auxiliary_loss_mlp": 0.01029777, + "balance_loss_clip": 1.01874423, + "balance_loss_mlp": 1.03370309, + "epoch": 0.8131669923342852, + "flos": 29351227576320.0, + "grad_norm": 1.672589680001, + "language_loss": 0.62591136, + "learning_rate": 3.550284775712653e-07, + "loss": 0.64720273, + "num_input_tokens_seen": 291900535, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 13525, + "time_per_iteration": 2.5271544456481934 + }, + { + "auxiliary_loss_clip": 0.01098837, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.02077389, + "balance_loss_mlp": 1.03405976, + "epoch": 0.8132271155869533, + "flos": 35256162055680.0, + "grad_norm": 1.6477512621572448, + "language_loss": 0.65588397, + "learning_rate": 3.548069885262628e-07, + "loss": 0.67719084, + "num_input_tokens_seen": 291919760, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13526, + "time_per_iteration": 2.540858745574951 + }, + { + "auxiliary_loss_clip": 0.0109667, + "auxiliary_loss_mlp": 0.01027456, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03244853, + "epoch": 0.8132872388396212, + "flos": 27781895393280.0, + "grad_norm": 1.5159907981039755, + "language_loss": 0.74966121, + "learning_rate": 3.5458556186714473e-07, + "loss": 0.77090245, + "num_input_tokens_seen": 291938915, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 13527, + "time_per_iteration": 2.5109777450561523 + }, + { + "auxiliary_loss_clip": 0.01097482, + "auxiliary_loss_mlp": 0.01024208, + "balance_loss_clip": 1.01289546, + "balance_loss_mlp": 1.0329324, + "epoch": 0.8133473620922892, + "flos": 27819601695360.0, + "grad_norm": 2.97186527291202, + "language_loss": 0.7050457, + "learning_rate": 3.5436419760230706e-07, + "loss": 0.72626257, + "num_input_tokens_seen": 291958145, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 13528, + "time_per_iteration": 2.485001564025879 + }, + { + "auxiliary_loss_clip": 0.01101089, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01688933, + "balance_loss_mlp": 1.03395605, + "epoch": 0.8134074853449572, + "flos": 18989527248000.0, + "grad_norm": 1.7819953771263581, + "language_loss": 0.68812644, + "learning_rate": 3.5414289574014357e-07, + "loss": 0.70941776, + "num_input_tokens_seen": 291976860, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.671875, + "step": 13529, + "time_per_iteration": 2.422464370727539 + }, + { + "auxiliary_loss_clip": 0.01095559, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.01693559, + "balance_loss_mlp": 1.03227735, + "epoch": 0.8134676085976251, + "flos": 24242863057920.0, + "grad_norm": 1.3700358938115667, + "language_loss": 0.77336764, + "learning_rate": 3.5392165628904635e-07, + "loss": 0.79460108, + "num_input_tokens_seen": 291998085, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13530, + "time_per_iteration": 2.4621317386627197 + }, + { + "auxiliary_loss_clip": 0.01100258, + "auxiliary_loss_mlp": 0.0102878, + "balance_loss_clip": 1.01672745, + "balance_loss_mlp": 1.0348835, + "epoch": 0.8135277318502931, + "flos": 19062389986560.0, + "grad_norm": 1.7581333634439877, + "language_loss": 0.82189894, + "learning_rate": 3.537004792574052e-07, + "loss": 0.8431893, + "num_input_tokens_seen": 292016585, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 13531, + "time_per_iteration": 2.427777051925659 + }, + { + "auxiliary_loss_clip": 0.01100801, + "auxiliary_loss_mlp": 0.01029841, + "balance_loss_clip": 1.01733541, + "balance_loss_mlp": 1.03366113, + "epoch": 0.813587855102961, + "flos": 17269728992640.0, + "grad_norm": 3.204591794551847, + "language_loss": 0.71781331, + "learning_rate": 3.534793646536065e-07, + "loss": 0.73911971, + "num_input_tokens_seen": 292033255, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13532, + "time_per_iteration": 2.3813064098358154 + }, + { + "auxiliary_loss_clip": 0.01098279, + "auxiliary_loss_mlp": 0.01025257, + "balance_loss_clip": 1.0142777, + "balance_loss_mlp": 1.03366661, + "epoch": 0.8136479783556291, + "flos": 20157593621760.0, + "grad_norm": 1.7506531009723905, + "language_loss": 0.76182723, + "learning_rate": 3.5325831248603533e-07, + "loss": 0.78306258, + "num_input_tokens_seen": 292051800, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 13533, + "time_per_iteration": 2.438998222351074 + }, + { + "auxiliary_loss_clip": 0.0110237, + "auxiliary_loss_mlp": 0.0103567, + "balance_loss_clip": 1.02322412, + "balance_loss_mlp": 1.03353691, + "epoch": 0.813708101608297, + "flos": 22052348046720.0, + "grad_norm": 2.51446757453604, + "language_loss": 0.7628231, + "learning_rate": 3.5303732276307495e-07, + "loss": 0.78420353, + "num_input_tokens_seen": 292072215, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 13534, + "time_per_iteration": 2.441894292831421 + }, + { + "auxiliary_loss_clip": 0.01099028, + "auxiliary_loss_mlp": 0.01024057, + "balance_loss_clip": 1.01414454, + "balance_loss_mlp": 1.03443563, + "epoch": 0.813768224860965, + "flos": 16173412035840.0, + "grad_norm": 2.0308122816810448, + "language_loss": 0.92820883, + "learning_rate": 3.5281639549310336e-07, + "loss": 0.94943964, + "num_input_tokens_seen": 292088830, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.6484375, + "step": 13535, + "time_per_iteration": 2.4160375595092773 + }, + { + "auxiliary_loss_clip": 0.01097801, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01700234, + "balance_loss_mlp": 1.03462958, + "epoch": 0.8138283481136329, + "flos": 24352318776960.0, + "grad_norm": 1.8424678375947172, + "language_loss": 0.70300984, + "learning_rate": 3.52595530684499e-07, + "loss": 0.72426724, + "num_input_tokens_seen": 292109225, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13536, + "time_per_iteration": 2.456167459487915 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.01872778, + "balance_loss_mlp": 1.03421807, + "epoch": 0.8138884713663009, + "flos": 25516362827520.0, + "grad_norm": 1.5997718183114498, + "language_loss": 0.7515735, + "learning_rate": 3.5237472834563775e-07, + "loss": 0.77286726, + "num_input_tokens_seen": 292129660, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 13537, + "time_per_iteration": 2.465872287750244 + }, + { + "auxiliary_loss_clip": 0.01098101, + "auxiliary_loss_mlp": 0.01025651, + "balance_loss_clip": 1.01411164, + "balance_loss_mlp": 1.03465509, + "epoch": 0.8139485946189688, + "flos": 22454368041600.0, + "grad_norm": 1.4929419897063716, + "language_loss": 0.76306385, + "learning_rate": 3.5215398848489163e-07, + "loss": 0.78430134, + "num_input_tokens_seen": 292149090, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 13538, + "time_per_iteration": 2.436523914337158 + }, + { + "auxiliary_loss_clip": 0.0109732, + "auxiliary_loss_mlp": 0.0102977, + "balance_loss_clip": 1.01836777, + "balance_loss_mlp": 1.03199041, + "epoch": 0.8140087178716369, + "flos": 21250391045760.0, + "grad_norm": 1.552319087544461, + "language_loss": 0.77843738, + "learning_rate": 3.5193331111063176e-07, + "loss": 0.79970831, + "num_input_tokens_seen": 292169260, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 13539, + "time_per_iteration": 2.45881986618042 + }, + { + "auxiliary_loss_clip": 0.01098918, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.02183998, + "balance_loss_mlp": 1.03521299, + "epoch": 0.8140688411243048, + "flos": 39415730774400.0, + "grad_norm": 2.7567444964119603, + "language_loss": 0.66205287, + "learning_rate": 3.5171269623122533e-07, + "loss": 0.68337071, + "num_input_tokens_seen": 292188145, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 13540, + "time_per_iteration": 2.566528797149658 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01032367, + "balance_loss_clip": 1.02160239, + "balance_loss_mlp": 1.03553224, + "epoch": 0.8141289643769728, + "flos": 25415885508480.0, + "grad_norm": 1.5718398133314953, + "language_loss": 0.67359984, + "learning_rate": 3.5149214385503913e-07, + "loss": 0.69493288, + "num_input_tokens_seen": 292212135, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 13541, + "time_per_iteration": 2.5164880752563477 + }, + { + "auxiliary_loss_clip": 0.01098261, + "auxiliary_loss_mlp": 0.01031047, + "balance_loss_clip": 1.01922774, + "balance_loss_mlp": 1.03353024, + "epoch": 0.8141890876296408, + "flos": 12568053237120.0, + "grad_norm": 2.2070467934510534, + "language_loss": 0.6900422, + "learning_rate": 3.512716539904355e-07, + "loss": 0.7113353, + "num_input_tokens_seen": 292230645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13542, + "time_per_iteration": 2.4056601524353027 + }, + { + "auxiliary_loss_clip": 0.01103316, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01861072, + "balance_loss_mlp": 1.03395188, + "epoch": 0.8142492108823087, + "flos": 14967172483200.0, + "grad_norm": 2.5090271200774317, + "language_loss": 0.79490924, + "learning_rate": 3.5105122664577613e-07, + "loss": 0.81625485, + "num_input_tokens_seen": 292243540, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 13543, + "time_per_iteration": 2.40470814704895 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01037918, + "balance_loss_clip": 1.02544892, + "balance_loss_mlp": 1.03483176, + "epoch": 0.8143093341349767, + "flos": 12422004537600.0, + "grad_norm": 4.984235141468566, + "language_loss": 0.77592224, + "learning_rate": 3.5083086182942003e-07, + "loss": 0.79733926, + "num_input_tokens_seen": 292261715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 13544, + "time_per_iteration": 2.400831699371338 + }, + { + "auxiliary_loss_clip": 0.01107998, + "auxiliary_loss_mlp": 0.01030568, + "balance_loss_clip": 1.01703811, + "balance_loss_mlp": 1.03668332, + "epoch": 0.8143694573876447, + "flos": 11910564737280.0, + "grad_norm": 2.959736733098292, + "language_loss": 0.73320651, + "learning_rate": 3.5061055954972264e-07, + "loss": 0.75459218, + "num_input_tokens_seen": 292275080, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 13545, + "time_per_iteration": 2.4131081104278564 + }, + { + "auxiliary_loss_clip": 0.01097302, + "auxiliary_loss_mlp": 0.01028993, + "balance_loss_clip": 1.01790035, + "balance_loss_mlp": 1.03349757, + "epoch": 0.8144295806403127, + "flos": 21212900225280.0, + "grad_norm": 3.066983178080017, + "language_loss": 0.76798058, + "learning_rate": 3.5039031981503776e-07, + "loss": 0.78924346, + "num_input_tokens_seen": 292294635, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 13546, + "time_per_iteration": 2.468132495880127 + }, + { + "auxiliary_loss_clip": 0.01103793, + "auxiliary_loss_mlp": 0.01027092, + "balance_loss_clip": 1.01630902, + "balance_loss_mlp": 1.03670955, + "epoch": 0.8144897038929806, + "flos": 19865280741120.0, + "grad_norm": 3.13218822549319, + "language_loss": 0.70365715, + "learning_rate": 3.501701426337178e-07, + "loss": 0.72496605, + "num_input_tokens_seen": 292312695, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 13547, + "time_per_iteration": 2.4655685424804688 + }, + { + "auxiliary_loss_clip": 0.01103958, + "auxiliary_loss_mlp": 0.01034207, + "balance_loss_clip": 1.02147591, + "balance_loss_mlp": 1.03629994, + "epoch": 0.8145498271456486, + "flos": 24571733005440.0, + "grad_norm": 2.7390285588234913, + "language_loss": 0.70459747, + "learning_rate": 3.49950028014111e-07, + "loss": 0.72597909, + "num_input_tokens_seen": 292332005, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13548, + "time_per_iteration": 2.470452070236206 + }, + { + "auxiliary_loss_clip": 0.01104253, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.02054405, + "balance_loss_mlp": 1.03680122, + "epoch": 0.8146099503983165, + "flos": 20193037367040.0, + "grad_norm": 2.3353911072651794, + "language_loss": 0.76804066, + "learning_rate": 3.4972997596456444e-07, + "loss": 0.7894153, + "num_input_tokens_seen": 292348365, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13549, + "time_per_iteration": 2.4691059589385986 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01030286, + "balance_loss_clip": 1.01863968, + "balance_loss_mlp": 1.03536999, + "epoch": 0.8146700736509845, + "flos": 19536949497600.0, + "grad_norm": 1.9549625457918085, + "language_loss": 0.71548051, + "learning_rate": 3.4950998649342233e-07, + "loss": 0.73679399, + "num_input_tokens_seen": 292368050, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13550, + "time_per_iteration": 2.422795057296753 + }, + { + "auxiliary_loss_clip": 0.01096222, + "auxiliary_loss_mlp": 0.01025099, + "balance_loss_clip": 1.01402998, + "balance_loss_mlp": 1.03409493, + "epoch": 0.8147301969036524, + "flos": 18041341979520.0, + "grad_norm": 1.9969484148682712, + "language_loss": 0.71753186, + "learning_rate": 3.4929005960902826e-07, + "loss": 0.73874509, + "num_input_tokens_seen": 292385315, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62109375, + "step": 13551, + "time_per_iteration": 2.409451723098755 + }, + { + "auxiliary_loss_clip": 0.01106922, + "auxiliary_loss_mlp": 0.0103091, + "balance_loss_clip": 1.01781511, + "balance_loss_mlp": 1.03745246, + "epoch": 0.8147903201563205, + "flos": 18004713085440.0, + "grad_norm": 1.9327416746380717, + "language_loss": 0.68366426, + "learning_rate": 3.4907019531971926e-07, + "loss": 0.7050426, + "num_input_tokens_seen": 292403375, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 13552, + "time_per_iteration": 2.406599760055542 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.01037716, + "balance_loss_clip": 1.02616453, + "balance_loss_mlp": 1.03343797, + "epoch": 0.8148504434089884, + "flos": 20259327916800.0, + "grad_norm": 2.3058908997285377, + "language_loss": 0.82212341, + "learning_rate": 3.4885039363383407e-07, + "loss": 0.84348869, + "num_input_tokens_seen": 292419260, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 13553, + "time_per_iteration": 3.8188424110412598 + }, + { + "auxiliary_loss_clip": 0.0109982, + "auxiliary_loss_mlp": 0.01025878, + "balance_loss_clip": 1.01406431, + "balance_loss_mlp": 1.03445435, + "epoch": 0.8149105666616564, + "flos": 12494723621760.0, + "grad_norm": 1.7247482274823256, + "language_loss": 0.68057621, + "learning_rate": 3.4863065455970795e-07, + "loss": 0.70183313, + "num_input_tokens_seen": 292436095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13554, + "time_per_iteration": 3.834584951400757 + }, + { + "auxiliary_loss_clip": 0.0110393, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01726937, + "balance_loss_mlp": 1.03757811, + "epoch": 0.8149706899143244, + "flos": 32523683662080.0, + "grad_norm": 1.6822328630436465, + "language_loss": 0.66322923, + "learning_rate": 3.484109781056723e-07, + "loss": 0.68456268, + "num_input_tokens_seen": 292457190, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 13555, + "time_per_iteration": 3.930266857147217 + }, + { + "auxiliary_loss_clip": 0.0110298, + "auxiliary_loss_mlp": 0.01033513, + "balance_loss_clip": 1.02141881, + "balance_loss_mlp": 1.03490579, + "epoch": 0.8150308131669923, + "flos": 19386088375680.0, + "grad_norm": 1.8062720841760551, + "language_loss": 0.73134083, + "learning_rate": 3.4819136428005844e-07, + "loss": 0.75270575, + "num_input_tokens_seen": 292474300, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 13556, + "time_per_iteration": 2.4044859409332275 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.01595473, + "balance_loss_mlp": 1.03617573, + "epoch": 0.8150909364196604, + "flos": 17421380213760.0, + "grad_norm": 1.547127180086827, + "language_loss": 0.80460906, + "learning_rate": 3.4797181309119307e-07, + "loss": 0.8258785, + "num_input_tokens_seen": 292492420, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 13557, + "time_per_iteration": 2.415175199508667 + }, + { + "auxiliary_loss_clip": 0.01104379, + "auxiliary_loss_mlp": 0.01031154, + "balance_loss_clip": 1.01966846, + "balance_loss_mlp": 1.03623772, + "epoch": 0.8151510596723283, + "flos": 27162795553920.0, + "grad_norm": 3.4496613594864227, + "language_loss": 0.65522265, + "learning_rate": 3.4775232454740255e-07, + "loss": 0.67657804, + "num_input_tokens_seen": 292512895, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 13558, + "time_per_iteration": 3.919435977935791 + }, + { + "auxiliary_loss_clip": 0.01022856, + "auxiliary_loss_mlp": 0.01007035, + "balance_loss_clip": 1.00606906, + "balance_loss_mlp": 1.00276268, + "epoch": 0.8152111829249963, + "flos": 64219052718720.0, + "grad_norm": 0.9540585535167397, + "language_loss": 0.568519, + "learning_rate": 3.4753289865700896e-07, + "loss": 0.58881789, + "num_input_tokens_seen": 292566580, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 13559, + "time_per_iteration": 2.9688994884490967 + }, + { + "auxiliary_loss_clip": 0.01023096, + "auxiliary_loss_mlp": 0.0100422, + "balance_loss_clip": 1.00320113, + "balance_loss_mlp": 1.0028497, + "epoch": 0.8152713061776642, + "flos": 67072012306560.0, + "grad_norm": 0.6777740106717581, + "language_loss": 0.5530026, + "learning_rate": 3.473135354283334e-07, + "loss": 0.57327569, + "num_input_tokens_seen": 292621490, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.203125, + "step": 13560, + "time_per_iteration": 2.901609182357788 + }, + { + "auxiliary_loss_clip": 0.01098355, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.01828361, + "balance_loss_mlp": 1.03364336, + "epoch": 0.8153314294303322, + "flos": 14391130072320.0, + "grad_norm": 1.715897445640704, + "language_loss": 0.67507559, + "learning_rate": 3.470942348696948e-07, + "loss": 0.69635296, + "num_input_tokens_seen": 292638660, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13561, + "time_per_iteration": 2.423055648803711 + }, + { + "auxiliary_loss_clip": 0.01104045, + "auxiliary_loss_mlp": 0.01030653, + "balance_loss_clip": 1.01900613, + "balance_loss_mlp": 1.03551221, + "epoch": 0.8153915526830001, + "flos": 25623520076160.0, + "grad_norm": 1.570606711113296, + "language_loss": 0.81580901, + "learning_rate": 3.468749969894085e-07, + "loss": 0.83715606, + "num_input_tokens_seen": 292658545, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 13562, + "time_per_iteration": 2.458662271499634 + }, + { + "auxiliary_loss_clip": 0.0109998, + "auxiliary_loss_mlp": 0.01030188, + "balance_loss_clip": 1.0185647, + "balance_loss_mlp": 1.03420377, + "epoch": 0.8154516759356681, + "flos": 23369156640000.0, + "grad_norm": 1.823972235032081, + "language_loss": 0.72110701, + "learning_rate": 3.4665582179578734e-07, + "loss": 0.74240875, + "num_input_tokens_seen": 292678460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13563, + "time_per_iteration": 2.4745733737945557 + }, + { + "auxiliary_loss_clip": 0.01099418, + "auxiliary_loss_mlp": 0.01029075, + "balance_loss_clip": 1.01654577, + "balance_loss_mlp": 1.03244758, + "epoch": 0.815511799188336, + "flos": 28149189914880.0, + "grad_norm": 1.563191815862559, + "language_loss": 0.70054388, + "learning_rate": 3.4643670929714387e-07, + "loss": 0.72182882, + "num_input_tokens_seen": 292699815, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13564, + "time_per_iteration": 2.4845049381256104 + }, + { + "auxiliary_loss_clip": 0.01101322, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01533651, + "balance_loss_mlp": 1.0358603, + "epoch": 0.8155719224410041, + "flos": 16983413683200.0, + "grad_norm": 1.9946584729028405, + "language_loss": 0.70459116, + "learning_rate": 3.462176595017854e-07, + "loss": 0.72587204, + "num_input_tokens_seen": 292717370, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13565, + "time_per_iteration": 2.4145777225494385 + }, + { + "auxiliary_loss_clip": 0.0109904, + "auxiliary_loss_mlp": 0.01034196, + "balance_loss_clip": 1.02238798, + "balance_loss_mlp": 1.03453624, + "epoch": 0.815632045693672, + "flos": 24681727428480.0, + "grad_norm": 1.6936331976057795, + "language_loss": 0.78862619, + "learning_rate": 3.459986724180188e-07, + "loss": 0.80995858, + "num_input_tokens_seen": 292737110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 13566, + "time_per_iteration": 2.4679157733917236 + }, + { + "auxiliary_loss_clip": 0.01099231, + "auxiliary_loss_mlp": 0.01030299, + "balance_loss_clip": 1.01951647, + "balance_loss_mlp": 1.03541529, + "epoch": 0.81569216894634, + "flos": 19938323047680.0, + "grad_norm": 1.7251861145582532, + "language_loss": 0.82568282, + "learning_rate": 3.457797480541491e-07, + "loss": 0.84697807, + "num_input_tokens_seen": 292756510, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13567, + "time_per_iteration": 2.4739766120910645 + }, + { + "auxiliary_loss_clip": 0.01097184, + "auxiliary_loss_mlp": 0.01025183, + "balance_loss_clip": 1.01482391, + "balance_loss_mlp": 1.03362584, + "epoch": 0.8157522921990079, + "flos": 21799393493760.0, + "grad_norm": 2.02909207934991, + "language_loss": 0.7959435, + "learning_rate": 3.455608864184771e-07, + "loss": 0.81716716, + "num_input_tokens_seen": 292776710, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.63671875, + "step": 13568, + "time_per_iteration": 2.456554889678955 + }, + { + "auxiliary_loss_clip": 0.01095954, + "auxiliary_loss_mlp": 0.01027921, + "balance_loss_clip": 1.01694787, + "balance_loss_mlp": 1.03310943, + "epoch": 0.8158124154516759, + "flos": 18508323720960.0, + "grad_norm": 1.8416659352028353, + "language_loss": 0.77024674, + "learning_rate": 3.453420875193016e-07, + "loss": 0.79148549, + "num_input_tokens_seen": 292794350, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 13569, + "time_per_iteration": 2.476374626159668 + }, + { + "auxiliary_loss_clip": 0.01098074, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02225435, + "balance_loss_mlp": 1.03394771, + "epoch": 0.815872538704344, + "flos": 26830801123200.0, + "grad_norm": 2.224542693134122, + "language_loss": 0.58551776, + "learning_rate": 3.451233513649199e-07, + "loss": 0.60682887, + "num_input_tokens_seen": 292814005, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 13570, + "time_per_iteration": 2.5146484375 + }, + { + "auxiliary_loss_clip": 0.01103281, + "auxiliary_loss_mlp": 0.01034724, + "balance_loss_clip": 1.02286851, + "balance_loss_mlp": 1.03557253, + "epoch": 0.8159326619570119, + "flos": 21725704742400.0, + "grad_norm": 2.075350535427022, + "language_loss": 0.82674634, + "learning_rate": 3.4490467796362687e-07, + "loss": 0.84812641, + "num_input_tokens_seen": 292833485, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 13571, + "time_per_iteration": 2.4438490867614746 + }, + { + "auxiliary_loss_clip": 0.01100306, + "auxiliary_loss_mlp": 0.01036489, + "balance_loss_clip": 1.02435327, + "balance_loss_mlp": 1.03504193, + "epoch": 0.8159927852096799, + "flos": 13840726993920.0, + "grad_norm": 2.3390171300430223, + "language_loss": 0.78043985, + "learning_rate": 3.446860673237142e-07, + "loss": 0.80180776, + "num_input_tokens_seen": 292848045, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 13572, + "time_per_iteration": 2.3682100772857666 + }, + { + "auxiliary_loss_clip": 0.01101131, + "auxiliary_loss_mlp": 0.01034215, + "balance_loss_clip": 1.02277076, + "balance_loss_mlp": 1.03415191, + "epoch": 0.8160529084623478, + "flos": 24499516711680.0, + "grad_norm": 1.653683230852661, + "language_loss": 0.64836442, + "learning_rate": 3.4446751945347186e-07, + "loss": 0.66971791, + "num_input_tokens_seen": 292869965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13573, + "time_per_iteration": 2.4918434619903564 + }, + { + "auxiliary_loss_clip": 0.01099065, + "auxiliary_loss_mlp": 0.0102872, + "balance_loss_clip": 1.01802051, + "balance_loss_mlp": 1.03432262, + "epoch": 0.8161130317150158, + "flos": 24826339584000.0, + "grad_norm": 1.5818326048732438, + "language_loss": 0.75434422, + "learning_rate": 3.442490343611868e-07, + "loss": 0.77562207, + "num_input_tokens_seen": 292889680, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 13574, + "time_per_iteration": 2.4578306674957275 + }, + { + "auxiliary_loss_clip": 0.01101338, + "auxiliary_loss_mlp": 0.01032163, + "balance_loss_clip": 1.0202359, + "balance_loss_mlp": 1.03471351, + "epoch": 0.8161731549676837, + "flos": 30956542208640.0, + "grad_norm": 2.0897739522455345, + "language_loss": 0.59801751, + "learning_rate": 3.4403061205514485e-07, + "loss": 0.61935258, + "num_input_tokens_seen": 292912360, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 13575, + "time_per_iteration": 2.5205721855163574 + }, + { + "auxiliary_loss_clip": 0.01100012, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01865935, + "balance_loss_mlp": 1.03423405, + "epoch": 0.8162332782203517, + "flos": 18551991680640.0, + "grad_norm": 1.797663124908432, + "language_loss": 0.7433396, + "learning_rate": 3.4381225254362736e-07, + "loss": 0.76464796, + "num_input_tokens_seen": 292928325, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 13576, + "time_per_iteration": 2.407588243484497 + }, + { + "auxiliary_loss_clip": 0.01022867, + "auxiliary_loss_mlp": 0.01000366, + "balance_loss_clip": 0.99935305, + "balance_loss_mlp": 1.00269318, + "epoch": 0.8162934014730197, + "flos": 70386853904640.0, + "grad_norm": 0.8261597243794896, + "language_loss": 0.58621252, + "learning_rate": 3.435939558349155e-07, + "loss": 0.60644484, + "num_input_tokens_seen": 292992795, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 13577, + "time_per_iteration": 3.03220534324646 + }, + { + "auxiliary_loss_clip": 0.01096665, + "auxiliary_loss_mlp": 0.01028017, + "balance_loss_clip": 1.01725233, + "balance_loss_mlp": 1.03460181, + "epoch": 0.8163535247256877, + "flos": 21214839559680.0, + "grad_norm": 1.6864707181189702, + "language_loss": 0.71403098, + "learning_rate": 3.4337572193728747e-07, + "loss": 0.73527777, + "num_input_tokens_seen": 293011950, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62109375, + "step": 13578, + "time_per_iteration": 2.424729585647583 + }, + { + "auxiliary_loss_clip": 0.01099052, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.01565957, + "balance_loss_mlp": 1.03452241, + "epoch": 0.8164136479783556, + "flos": 21098847565440.0, + "grad_norm": 1.7521763142513538, + "language_loss": 0.73644769, + "learning_rate": 3.431575508590172e-07, + "loss": 0.75770712, + "num_input_tokens_seen": 293030175, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 13579, + "time_per_iteration": 2.449126720428467 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01587224, + "balance_loss_mlp": 1.03481781, + "epoch": 0.8164737712310236, + "flos": 21720640924800.0, + "grad_norm": 1.8185977038329606, + "language_loss": 0.78892076, + "learning_rate": 3.4293944260837873e-07, + "loss": 0.81021571, + "num_input_tokens_seen": 293047980, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13580, + "time_per_iteration": 2.425795555114746 + }, + { + "auxiliary_loss_clip": 0.0109879, + "auxiliary_loss_mlp": 0.01033251, + "balance_loss_clip": 1.02177167, + "balance_loss_mlp": 1.0351367, + "epoch": 0.8165338944836915, + "flos": 19536805843200.0, + "grad_norm": 1.6552984035314777, + "language_loss": 0.68889928, + "learning_rate": 3.4272139719364314e-07, + "loss": 0.71021968, + "num_input_tokens_seen": 293067030, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13581, + "time_per_iteration": 2.4383456707000732 + }, + { + "auxiliary_loss_clip": 0.01099114, + "auxiliary_loss_mlp": 0.01026331, + "balance_loss_clip": 1.01510167, + "balance_loss_mlp": 1.03388476, + "epoch": 0.8165940177363595, + "flos": 22928568416640.0, + "grad_norm": 1.7746948932772684, + "language_loss": 0.59413254, + "learning_rate": 3.4250341462307786e-07, + "loss": 0.61538696, + "num_input_tokens_seen": 293085575, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 13582, + "time_per_iteration": 2.446333169937134 + }, + { + "auxiliary_loss_clip": 0.01095885, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.01781857, + "balance_loss_mlp": 1.0341419, + "epoch": 0.8166541409890276, + "flos": 23370377702400.0, + "grad_norm": 1.3768473846138, + "language_loss": 0.82010365, + "learning_rate": 3.4228549490494897e-07, + "loss": 0.84135062, + "num_input_tokens_seen": 293108200, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6171875, + "step": 13583, + "time_per_iteration": 2.5130319595336914 + }, + { + "auxiliary_loss_clip": 0.01100945, + "auxiliary_loss_mlp": 0.01026345, + "balance_loss_clip": 1.01556253, + "balance_loss_mlp": 1.03554404, + "epoch": 0.8167142642416955, + "flos": 18441997257600.0, + "grad_norm": 1.826376141415004, + "language_loss": 0.7425015, + "learning_rate": 3.4206763804752093e-07, + "loss": 0.76377439, + "num_input_tokens_seen": 293126020, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65625, + "step": 13584, + "time_per_iteration": 2.409642457962036 + }, + { + "auxiliary_loss_clip": 0.01102581, + "auxiliary_loss_mlp": 0.01028328, + "balance_loss_clip": 1.01652598, + "balance_loss_mlp": 1.03693473, + "epoch": 0.8167743874943635, + "flos": 21214983214080.0, + "grad_norm": 4.717780389008525, + "language_loss": 0.74340463, + "learning_rate": 3.4184984405905405e-07, + "loss": 0.76471376, + "num_input_tokens_seen": 293144620, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 13585, + "time_per_iteration": 2.4532628059387207 + }, + { + "auxiliary_loss_clip": 0.01101272, + "auxiliary_loss_mlp": 0.01035762, + "balance_loss_clip": 1.02410972, + "balance_loss_mlp": 1.03578067, + "epoch": 0.8168345107470314, + "flos": 18697681244160.0, + "grad_norm": 1.6545337025021891, + "language_loss": 0.69145906, + "learning_rate": 3.416321129478068e-07, + "loss": 0.71282941, + "num_input_tokens_seen": 293162850, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13586, + "time_per_iteration": 2.4178428649902344 + }, + { + "auxiliary_loss_clip": 0.01100971, + "auxiliary_loss_mlp": 0.01029383, + "balance_loss_clip": 1.01897573, + "balance_loss_mlp": 1.03632236, + "epoch": 0.8168946339996994, + "flos": 16253098358400.0, + "grad_norm": 1.5123353842035532, + "language_loss": 0.60895872, + "learning_rate": 3.4141444472203594e-07, + "loss": 0.63026226, + "num_input_tokens_seen": 293181620, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6484375, + "step": 13587, + "time_per_iteration": 2.4606432914733887 + }, + { + "auxiliary_loss_clip": 0.01102914, + "auxiliary_loss_mlp": 0.0103289, + "balance_loss_clip": 1.02086794, + "balance_loss_mlp": 1.03446078, + "epoch": 0.8169547572523673, + "flos": 26941585645440.0, + "grad_norm": 2.2917834660377534, + "language_loss": 0.6959576, + "learning_rate": 3.4119683938999624e-07, + "loss": 0.71731567, + "num_input_tokens_seen": 293200270, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 13588, + "time_per_iteration": 2.4705071449279785 + }, + { + "auxiliary_loss_clip": 0.011024, + "auxiliary_loss_mlp": 0.01032191, + "balance_loss_clip": 1.01979363, + "balance_loss_mlp": 1.03581333, + "epoch": 0.8170148805050353, + "flos": 18952323736320.0, + "grad_norm": 1.5447018635848075, + "language_loss": 0.73065209, + "learning_rate": 3.4097929695993854e-07, + "loss": 0.75199795, + "num_input_tokens_seen": 293218960, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 13589, + "time_per_iteration": 2.45959210395813 + }, + { + "auxiliary_loss_clip": 0.01097376, + "auxiliary_loss_mlp": 0.01031015, + "balance_loss_clip": 1.01933265, + "balance_loss_mlp": 1.03389359, + "epoch": 0.8170750037577033, + "flos": 21834909066240.0, + "grad_norm": 1.7225863483804695, + "language_loss": 0.72977889, + "learning_rate": 3.4076181744011166e-07, + "loss": 0.75106275, + "num_input_tokens_seen": 293236450, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 13590, + "time_per_iteration": 2.4645774364471436 + }, + { + "auxiliary_loss_clip": 0.01104182, + "auxiliary_loss_mlp": 0.01031011, + "balance_loss_clip": 1.01877403, + "balance_loss_mlp": 1.03568554, + "epoch": 0.8171351270103713, + "flos": 33507169021440.0, + "grad_norm": 1.9829017797155066, + "language_loss": 0.65020001, + "learning_rate": 3.4054440083876345e-07, + "loss": 0.67155194, + "num_input_tokens_seen": 293256480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 13591, + "time_per_iteration": 2.536670207977295 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.01033923, + "balance_loss_clip": 1.02190661, + "balance_loss_mlp": 1.03379011, + "epoch": 0.8171952502630392, + "flos": 22708184520960.0, + "grad_norm": 1.7852703265171805, + "language_loss": 0.68164837, + "learning_rate": 3.403270471641373e-07, + "loss": 0.70300251, + "num_input_tokens_seen": 293274960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 13592, + "time_per_iteration": 2.4266905784606934 + }, + { + "auxiliary_loss_clip": 0.0110127, + "auxiliary_loss_mlp": 0.01025018, + "balance_loss_clip": 1.01322818, + "balance_loss_mlp": 1.03527188, + "epoch": 0.8172553735157072, + "flos": 26723715701760.0, + "grad_norm": 1.9676579409127355, + "language_loss": 0.66395956, + "learning_rate": 3.401097564244759e-07, + "loss": 0.68522245, + "num_input_tokens_seen": 293295945, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 13593, + "time_per_iteration": 2.4653220176696777 + }, + { + "auxiliary_loss_clip": 0.01098248, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.01953697, + "balance_loss_mlp": 1.0330466, + "epoch": 0.8173154967683751, + "flos": 15961072786560.0, + "grad_norm": 2.1589991907260826, + "language_loss": 0.69275898, + "learning_rate": 3.398925286280188e-07, + "loss": 0.71404564, + "num_input_tokens_seen": 293313300, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 13594, + "time_per_iteration": 2.441347122192383 + }, + { + "auxiliary_loss_clip": 0.01101079, + "auxiliary_loss_mlp": 0.01029785, + "balance_loss_clip": 1.01838267, + "balance_loss_mlp": 1.03391123, + "epoch": 0.8173756200210431, + "flos": 25986720447360.0, + "grad_norm": 1.7250973589133012, + "language_loss": 0.65802509, + "learning_rate": 3.3967536378300456e-07, + "loss": 0.67933369, + "num_input_tokens_seen": 293333085, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 13595, + "time_per_iteration": 3.834423542022705 + }, + { + "auxiliary_loss_clip": 0.01103171, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.01491535, + "balance_loss_mlp": 1.03471351, + "epoch": 0.8174357432737112, + "flos": 25664422688640.0, + "grad_norm": 1.6805151919740065, + "language_loss": 0.78552544, + "learning_rate": 3.394582618976658e-07, + "loss": 0.80682206, + "num_input_tokens_seen": 293351895, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.68359375, + "step": 13596, + "time_per_iteration": 3.870290517807007 + }, + { + "auxiliary_loss_clip": 0.01096898, + "auxiliary_loss_mlp": 0.0102693, + "balance_loss_clip": 1.01530743, + "balance_loss_mlp": 1.03267086, + "epoch": 0.8174958665263791, + "flos": 21835088634240.0, + "grad_norm": 3.056945403146244, + "language_loss": 0.58674574, + "learning_rate": 3.392412229802362e-07, + "loss": 0.60798407, + "num_input_tokens_seen": 293371165, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 13597, + "time_per_iteration": 3.83126163482666 + }, + { + "auxiliary_loss_clip": 0.01096843, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.0189966, + "balance_loss_mlp": 1.03306866, + "epoch": 0.8175559897790471, + "flos": 22455517276800.0, + "grad_norm": 2.0152722987790117, + "language_loss": 0.82239521, + "learning_rate": 3.390242470389462e-07, + "loss": 0.84366918, + "num_input_tokens_seen": 293391150, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.63671875, + "step": 13598, + "time_per_iteration": 2.461413621902466 + }, + { + "auxiliary_loss_clip": 0.01101172, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01804388, + "balance_loss_mlp": 1.03485656, + "epoch": 0.817616113031715, + "flos": 23615790399360.0, + "grad_norm": 1.6794406280078336, + "language_loss": 0.8206194, + "learning_rate": 3.3880733408202277e-07, + "loss": 0.84192085, + "num_input_tokens_seen": 293409440, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 13599, + "time_per_iteration": 2.4389727115631104 + }, + { + "auxiliary_loss_clip": 0.01098228, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.01945949, + "balance_loss_mlp": 1.03399026, + "epoch": 0.817676236284383, + "flos": 27672260106240.0, + "grad_norm": 1.8119321548666836, + "language_loss": 0.83470106, + "learning_rate": 3.3859048411769186e-07, + "loss": 0.85599005, + "num_input_tokens_seen": 293428995, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 13600, + "time_per_iteration": 3.8930134773254395 + }, + { + "auxiliary_loss_clip": 0.01100628, + "auxiliary_loss_mlp": 0.01029833, + "balance_loss_clip": 1.01807261, + "balance_loss_mlp": 1.0340941, + "epoch": 0.8177363595370509, + "flos": 24681009156480.0, + "grad_norm": 1.8552295525617326, + "language_loss": 0.74228668, + "learning_rate": 3.383736971541766e-07, + "loss": 0.76359135, + "num_input_tokens_seen": 293449155, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 13601, + "time_per_iteration": 2.4926183223724365 + }, + { + "auxiliary_loss_clip": 0.01103435, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.01916885, + "balance_loss_mlp": 1.03450692, + "epoch": 0.817796482789719, + "flos": 17346326745600.0, + "grad_norm": 2.233954620070709, + "language_loss": 0.67695427, + "learning_rate": 3.3815697319969737e-07, + "loss": 0.69830108, + "num_input_tokens_seen": 293466125, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 13602, + "time_per_iteration": 2.4101457595825195 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01025934, + "balance_loss_clip": 1.01494884, + "balance_loss_mlp": 1.03410912, + "epoch": 0.8178566060423869, + "flos": 17778475272960.0, + "grad_norm": 2.2019513074937596, + "language_loss": 0.83764672, + "learning_rate": 3.379403122624718e-07, + "loss": 0.85888481, + "num_input_tokens_seen": 293481345, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.63671875, + "step": 13603, + "time_per_iteration": 2.4125587940216064 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.01027642, + "balance_loss_clip": 1.01626372, + "balance_loss_mlp": 1.03461123, + "epoch": 0.8179167292950549, + "flos": 24973250209920.0, + "grad_norm": 1.6014751924938777, + "language_loss": 0.69117272, + "learning_rate": 3.377237143507159e-07, + "loss": 0.71245372, + "num_input_tokens_seen": 293502330, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 13604, + "time_per_iteration": 2.5184106826782227 + }, + { + "auxiliary_loss_clip": 0.01102038, + "auxiliary_loss_mlp": 0.01032838, + "balance_loss_clip": 1.02132845, + "balance_loss_mlp": 1.03665352, + "epoch": 0.8179768525477228, + "flos": 22856783086080.0, + "grad_norm": 1.681681436468054, + "language_loss": 0.742558, + "learning_rate": 3.3750717947264406e-07, + "loss": 0.76390678, + "num_input_tokens_seen": 293521415, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13605, + "time_per_iteration": 2.676684856414795 + }, + { + "auxiliary_loss_clip": 0.01099847, + "auxiliary_loss_mlp": 0.01033888, + "balance_loss_clip": 1.02215791, + "balance_loss_mlp": 1.03632402, + "epoch": 0.8180369758003908, + "flos": 18515147304960.0, + "grad_norm": 1.8459986304630236, + "language_loss": 0.74292308, + "learning_rate": 3.372907076364666e-07, + "loss": 0.76426041, + "num_input_tokens_seen": 293539245, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 13606, + "time_per_iteration": 2.4154725074768066 + }, + { + "auxiliary_loss_clip": 0.01099557, + "auxiliary_loss_mlp": 0.01030629, + "balance_loss_clip": 1.01939964, + "balance_loss_mlp": 1.03485346, + "epoch": 0.8180970990530587, + "flos": 33182105915520.0, + "grad_norm": 1.9141153013042538, + "language_loss": 0.65221226, + "learning_rate": 3.370742988503916e-07, + "loss": 0.67351413, + "num_input_tokens_seen": 293560640, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 13607, + "time_per_iteration": 2.547203302383423 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01030417, + "balance_loss_clip": 1.0187819, + "balance_loss_mlp": 1.0351789, + "epoch": 0.8181572223057267, + "flos": 25010022758400.0, + "grad_norm": 1.6236111572449494, + "language_loss": 0.70368075, + "learning_rate": 3.3685795312262634e-07, + "loss": 0.72499502, + "num_input_tokens_seen": 293579465, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13608, + "time_per_iteration": 2.498760461807251 + }, + { + "auxiliary_loss_clip": 0.01097872, + "auxiliary_loss_mlp": 0.01033201, + "balance_loss_clip": 1.02187049, + "balance_loss_mlp": 1.03291011, + "epoch": 0.8182173455583948, + "flos": 28548731871360.0, + "grad_norm": 1.7153740974469267, + "language_loss": 0.79468846, + "learning_rate": 3.366416704613735e-07, + "loss": 0.81599921, + "num_input_tokens_seen": 293600540, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13609, + "time_per_iteration": 2.5317742824554443 + }, + { + "auxiliary_loss_clip": 0.01023454, + "auxiliary_loss_mlp": 0.01002374, + "balance_loss_clip": 1.00132453, + "balance_loss_mlp": 1.00311923, + "epoch": 0.8182774688110627, + "flos": 72028043245440.0, + "grad_norm": 0.749863599431258, + "language_loss": 0.5588702, + "learning_rate": 3.3642545087483544e-07, + "loss": 0.5791285, + "num_input_tokens_seen": 293665160, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.203125, + "step": 13610, + "time_per_iteration": 3.150242567062378 + }, + { + "auxiliary_loss_clip": 0.01096143, + "auxiliary_loss_mlp": 0.01025663, + "balance_loss_clip": 1.01499379, + "balance_loss_mlp": 1.03376782, + "epoch": 0.8183375920637307, + "flos": 19755358145280.0, + "grad_norm": 1.6981510303220106, + "language_loss": 0.77559108, + "learning_rate": 3.362092943712107e-07, + "loss": 0.79680908, + "num_input_tokens_seen": 293683995, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 13611, + "time_per_iteration": 2.4540553092956543 + }, + { + "auxiliary_loss_clip": 0.01104443, + "auxiliary_loss_mlp": 0.0103386, + "balance_loss_clip": 1.02056813, + "balance_loss_mlp": 1.0341984, + "epoch": 0.8183977153163986, + "flos": 22341895580160.0, + "grad_norm": 1.7496159183254283, + "language_loss": 0.770311, + "learning_rate": 3.3599320095869745e-07, + "loss": 0.79169405, + "num_input_tokens_seen": 293704115, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 13612, + "time_per_iteration": 2.481358766555786 + }, + { + "auxiliary_loss_clip": 0.01096746, + "auxiliary_loss_mlp": 0.01027079, + "balance_loss_clip": 1.01580763, + "balance_loss_mlp": 1.03323674, + "epoch": 0.8184578385690666, + "flos": 17712472032000.0, + "grad_norm": 1.8750318483309736, + "language_loss": 0.86217213, + "learning_rate": 3.3577717064548793e-07, + "loss": 0.88341039, + "num_input_tokens_seen": 293722225, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 13613, + "time_per_iteration": 2.4214251041412354 + }, + { + "auxiliary_loss_clip": 0.01102179, + "auxiliary_loss_mlp": 0.01042499, + "balance_loss_clip": 1.03140044, + "balance_loss_mlp": 1.0373559, + "epoch": 0.8185179618217345, + "flos": 25701159323520.0, + "grad_norm": 1.775880767921388, + "language_loss": 0.72751027, + "learning_rate": 3.355612034397746e-07, + "loss": 0.74895704, + "num_input_tokens_seen": 293743995, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 13614, + "time_per_iteration": 2.480942487716675 + }, + { + "auxiliary_loss_clip": 0.01100997, + "auxiliary_loss_mlp": 0.01034589, + "balance_loss_clip": 1.02239406, + "balance_loss_mlp": 1.03468037, + "epoch": 0.8185780850744026, + "flos": 25960326929280.0, + "grad_norm": 1.580717147390374, + "language_loss": 0.81211054, + "learning_rate": 3.353452993497479e-07, + "loss": 0.83346641, + "num_input_tokens_seen": 293764935, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13615, + "time_per_iteration": 2.512587308883667 + }, + { + "auxiliary_loss_clip": 0.01098725, + "auxiliary_loss_mlp": 0.01029727, + "balance_loss_clip": 1.0171864, + "balance_loss_mlp": 1.03305793, + "epoch": 0.8186382083270705, + "flos": 25228431406080.0, + "grad_norm": 1.9650123259608059, + "language_loss": 0.75749093, + "learning_rate": 3.3512945838359375e-07, + "loss": 0.77877545, + "num_input_tokens_seen": 293784035, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 13616, + "time_per_iteration": 2.4874508380889893 + }, + { + "auxiliary_loss_clip": 0.01096039, + "auxiliary_loss_mlp": 0.01031242, + "balance_loss_clip": 1.01892185, + "balance_loss_mlp": 1.03211665, + "epoch": 0.8186983315797385, + "flos": 22415009713920.0, + "grad_norm": 1.6631144614054594, + "language_loss": 0.75075936, + "learning_rate": 3.349136805494979e-07, + "loss": 0.77203214, + "num_input_tokens_seen": 293803360, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.640625, + "step": 13617, + "time_per_iteration": 2.4808571338653564 + }, + { + "auxiliary_loss_clip": 0.01094749, + "auxiliary_loss_mlp": 0.01028857, + "balance_loss_clip": 1.0177412, + "balance_loss_mlp": 1.03146482, + "epoch": 0.8187584548324064, + "flos": 22018017623040.0, + "grad_norm": 2.3105129174320362, + "language_loss": 0.68536007, + "learning_rate": 3.346979658556415e-07, + "loss": 0.70659614, + "num_input_tokens_seen": 293821325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 13618, + "time_per_iteration": 2.44240665435791 + }, + { + "auxiliary_loss_clip": 0.01103541, + "auxiliary_loss_mlp": 0.01031881, + "balance_loss_clip": 1.01982939, + "balance_loss_mlp": 1.03484082, + "epoch": 0.8188185780850744, + "flos": 29241664116480.0, + "grad_norm": 1.820443995166382, + "language_loss": 0.70164716, + "learning_rate": 3.344823143102058e-07, + "loss": 0.72300136, + "num_input_tokens_seen": 293840315, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6875, + "step": 13619, + "time_per_iteration": 2.519563913345337 + }, + { + "auxiliary_loss_clip": 0.01105113, + "auxiliary_loss_mlp": 0.01026554, + "balance_loss_clip": 1.01481771, + "balance_loss_mlp": 1.03768373, + "epoch": 0.8188787013377423, + "flos": 20696504348160.0, + "grad_norm": 1.7568931118240076, + "language_loss": 0.73624021, + "learning_rate": 3.3426672592136694e-07, + "loss": 0.75755692, + "num_input_tokens_seen": 293855685, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 13620, + "time_per_iteration": 2.4257287979125977 + }, + { + "auxiliary_loss_clip": 0.01097016, + "auxiliary_loss_mlp": 0.01027413, + "balance_loss_clip": 1.01578975, + "balance_loss_mlp": 1.03350806, + "epoch": 0.8189388245904103, + "flos": 23732967542400.0, + "grad_norm": 1.561934577342782, + "language_loss": 0.760234, + "learning_rate": 3.340512006973011e-07, + "loss": 0.78147829, + "num_input_tokens_seen": 293875540, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 13621, + "time_per_iteration": 2.474597454071045 + }, + { + "auxiliary_loss_clip": 0.01098691, + "auxiliary_loss_mlp": 0.01025725, + "balance_loss_clip": 1.01428652, + "balance_loss_mlp": 1.03371286, + "epoch": 0.8189989478430784, + "flos": 28255090187520.0, + "grad_norm": 2.221894221463605, + "language_loss": 0.65659404, + "learning_rate": 3.3383573864618076e-07, + "loss": 0.67783821, + "num_input_tokens_seen": 293896570, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 13622, + "time_per_iteration": 2.4912831783294678 + }, + { + "auxiliary_loss_clip": 0.01103867, + "auxiliary_loss_mlp": 0.01026423, + "balance_loss_clip": 1.01354301, + "balance_loss_mlp": 1.03722382, + "epoch": 0.8190590710957463, + "flos": 21397696721280.0, + "grad_norm": 1.9628662622267186, + "language_loss": 0.74750388, + "learning_rate": 3.3362033977617653e-07, + "loss": 0.76880676, + "num_input_tokens_seen": 293914680, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 13623, + "time_per_iteration": 2.464510440826416 + }, + { + "auxiliary_loss_clip": 0.01101923, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02379024, + "balance_loss_mlp": 1.03451788, + "epoch": 0.8191191943484143, + "flos": 38796451367040.0, + "grad_norm": 2.130613473950277, + "language_loss": 0.63448161, + "learning_rate": 3.3340500409545527e-07, + "loss": 0.6558584, + "num_input_tokens_seen": 293936480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13624, + "time_per_iteration": 2.597849130630493 + }, + { + "auxiliary_loss_clip": 0.01097755, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.0206188, + "balance_loss_mlp": 1.03409612, + "epoch": 0.8191793176010822, + "flos": 25446516831360.0, + "grad_norm": 1.5840449954467926, + "language_loss": 0.78271246, + "learning_rate": 3.3318973161218386e-07, + "loss": 0.80401015, + "num_input_tokens_seen": 293957815, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 13625, + "time_per_iteration": 2.5138373374938965 + }, + { + "auxiliary_loss_clip": 0.01104522, + "auxiliary_loss_mlp": 0.01029595, + "balance_loss_clip": 1.01713765, + "balance_loss_mlp": 1.034271, + "epoch": 0.8192394408537502, + "flos": 25083029151360.0, + "grad_norm": 1.8933767850684242, + "language_loss": 0.7606883, + "learning_rate": 3.329745223345244e-07, + "loss": 0.78202951, + "num_input_tokens_seen": 293975440, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 13626, + "time_per_iteration": 2.4585258960723877 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01031587, + "balance_loss_clip": 1.02102554, + "balance_loss_mlp": 1.03533244, + "epoch": 0.8192995641064181, + "flos": 27673732563840.0, + "grad_norm": 1.454259930167211, + "language_loss": 0.73434258, + "learning_rate": 3.3275937627063823e-07, + "loss": 0.7556566, + "num_input_tokens_seen": 293997540, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 13627, + "time_per_iteration": 2.5106747150421143 + }, + { + "auxiliary_loss_clip": 0.01099718, + "auxiliary_loss_mlp": 0.01032208, + "balance_loss_clip": 1.02029932, + "balance_loss_mlp": 1.03390169, + "epoch": 0.8193596873590862, + "flos": 21288492397440.0, + "grad_norm": 2.370107674869554, + "language_loss": 0.6889739, + "learning_rate": 3.3254429342868353e-07, + "loss": 0.71029305, + "num_input_tokens_seen": 294017030, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 13628, + "time_per_iteration": 2.472726345062256 + }, + { + "auxiliary_loss_clip": 0.01105355, + "auxiliary_loss_mlp": 0.01030355, + "balance_loss_clip": 1.01854777, + "balance_loss_mlp": 1.03616786, + "epoch": 0.8194198106117541, + "flos": 17492626840320.0, + "grad_norm": 1.5708816521142615, + "language_loss": 0.85466886, + "learning_rate": 3.323292738168171e-07, + "loss": 0.87602592, + "num_input_tokens_seen": 294035700, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.69140625, + "step": 13629, + "time_per_iteration": 2.4741504192352295 + }, + { + "auxiliary_loss_clip": 0.01099126, + "auxiliary_loss_mlp": 0.01024154, + "balance_loss_clip": 1.01312077, + "balance_loss_mlp": 1.03430605, + "epoch": 0.8194799338644221, + "flos": 15267925059840.0, + "grad_norm": 2.023051880199768, + "language_loss": 0.73787737, + "learning_rate": 3.3211431744319084e-07, + "loss": 0.75911021, + "num_input_tokens_seen": 294049730, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13630, + "time_per_iteration": 2.440484046936035 + }, + { + "auxiliary_loss_clip": 0.01102576, + "auxiliary_loss_mlp": 0.01027871, + "balance_loss_clip": 1.01646876, + "balance_loss_mlp": 1.03574312, + "epoch": 0.81954005711709, + "flos": 14718814871040.0, + "grad_norm": 2.5690554004253507, + "language_loss": 0.71959084, + "learning_rate": 3.31899424315957e-07, + "loss": 0.74089527, + "num_input_tokens_seen": 294066545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13631, + "time_per_iteration": 2.459568738937378 + }, + { + "auxiliary_loss_clip": 0.01099537, + "auxiliary_loss_mlp": 0.01033114, + "balance_loss_clip": 1.02199757, + "balance_loss_mlp": 1.03329933, + "epoch": 0.819600180369758, + "flos": 23074042498560.0, + "grad_norm": 1.5024755031479913, + "language_loss": 0.76703703, + "learning_rate": 3.3168459444326447e-07, + "loss": 0.78836352, + "num_input_tokens_seen": 294087455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13632, + "time_per_iteration": 2.4642910957336426 + }, + { + "auxiliary_loss_clip": 0.01098389, + "auxiliary_loss_mlp": 0.01026774, + "balance_loss_clip": 1.01664722, + "balance_loss_mlp": 1.03358042, + "epoch": 0.8196603036224259, + "flos": 27599792417280.0, + "grad_norm": 1.6696759158330585, + "language_loss": 0.6536504, + "learning_rate": 3.314698278332588e-07, + "loss": 0.67490202, + "num_input_tokens_seen": 294107480, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 13633, + "time_per_iteration": 2.4936697483062744 + }, + { + "auxiliary_loss_clip": 0.01097253, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02256322, + "balance_loss_mlp": 1.0340724, + "epoch": 0.8197204268750939, + "flos": 28582020800640.0, + "grad_norm": 1.6333258290671406, + "language_loss": 0.75608504, + "learning_rate": 3.3125512449408513e-07, + "loss": 0.77739, + "num_input_tokens_seen": 294130115, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13634, + "time_per_iteration": 2.490642547607422 + }, + { + "auxiliary_loss_clip": 0.01096629, + "auxiliary_loss_mlp": 0.01027045, + "balance_loss_clip": 1.01635802, + "balance_loss_mlp": 1.03337777, + "epoch": 0.819780550127762, + "flos": 23258300290560.0, + "grad_norm": 1.9576833355326961, + "language_loss": 0.81758225, + "learning_rate": 3.310404844338841e-07, + "loss": 0.83881891, + "num_input_tokens_seen": 294148495, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13635, + "time_per_iteration": 2.4764068126678467 + }, + { + "auxiliary_loss_clip": 0.01099631, + "auxiliary_loss_mlp": 0.01029582, + "balance_loss_clip": 1.01730359, + "balance_loss_mlp": 1.03370953, + "epoch": 0.8198406733804299, + "flos": 26685255214080.0, + "grad_norm": 1.8160936687392844, + "language_loss": 0.75971925, + "learning_rate": 3.308259076607949e-07, + "loss": 0.78101134, + "num_input_tokens_seen": 294169595, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 13636, + "time_per_iteration": 3.8742868900299072 + }, + { + "auxiliary_loss_clip": 0.0109741, + "auxiliary_loss_mlp": 0.01030281, + "balance_loss_clip": 1.01852703, + "balance_loss_mlp": 1.03244948, + "epoch": 0.8199007966330979, + "flos": 20084084438400.0, + "grad_norm": 2.523932105098669, + "language_loss": 0.8138752, + "learning_rate": 3.3061139418295445e-07, + "loss": 0.83515203, + "num_input_tokens_seen": 294183885, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 13637, + "time_per_iteration": 2.444077491760254 + }, + { + "auxiliary_loss_clip": 0.01100022, + "auxiliary_loss_mlp": 0.01030226, + "balance_loss_clip": 1.01866293, + "balance_loss_mlp": 1.03494465, + "epoch": 0.8199609198857658, + "flos": 31902788142720.0, + "grad_norm": 2.1971126807385617, + "language_loss": 0.71151501, + "learning_rate": 3.3039694400849725e-07, + "loss": 0.73281747, + "num_input_tokens_seen": 294200150, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 13638, + "time_per_iteration": 5.341911554336548 + }, + { + "auxiliary_loss_clip": 0.0110428, + "auxiliary_loss_mlp": 0.01032887, + "balance_loss_clip": 1.0195235, + "balance_loss_mlp": 1.03576005, + "epoch": 0.8200210431384338, + "flos": 26470150617600.0, + "grad_norm": 1.87843772598682, + "language_loss": 0.79670238, + "learning_rate": 3.3018255714555564e-07, + "loss": 0.81807411, + "num_input_tokens_seen": 294220385, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 13639, + "time_per_iteration": 2.508781909942627 + }, + { + "auxiliary_loss_clip": 0.01099222, + "auxiliary_loss_mlp": 0.01027252, + "balance_loss_clip": 1.0162971, + "balance_loss_mlp": 1.03431201, + "epoch": 0.8200811663911017, + "flos": 22091454979200.0, + "grad_norm": 1.6458792475114847, + "language_loss": 0.7922225, + "learning_rate": 3.299682336022589e-07, + "loss": 0.81348717, + "num_input_tokens_seen": 294239355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13640, + "time_per_iteration": 2.4696903228759766 + }, + { + "auxiliary_loss_clip": 0.01104141, + "auxiliary_loss_mlp": 0.01031614, + "balance_loss_clip": 1.01967573, + "balance_loss_mlp": 1.03490579, + "epoch": 0.8201412896437698, + "flos": 37593659520000.0, + "grad_norm": 2.5540262458401086, + "language_loss": 0.63221669, + "learning_rate": 3.297539733867336e-07, + "loss": 0.65357423, + "num_input_tokens_seen": 294259395, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6953125, + "step": 13641, + "time_per_iteration": 2.6128504276275635 + }, + { + "auxiliary_loss_clip": 0.01099233, + "auxiliary_loss_mlp": 0.0102772, + "balance_loss_clip": 1.01587081, + "balance_loss_mlp": 1.03426635, + "epoch": 0.8202014128964377, + "flos": 19646333389440.0, + "grad_norm": 1.7946426536258016, + "language_loss": 0.73509145, + "learning_rate": 3.295397765071055e-07, + "loss": 0.75636101, + "num_input_tokens_seen": 294277365, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13642, + "time_per_iteration": 3.9330053329467773 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01031235, + "balance_loss_clip": 1.0196774, + "balance_loss_mlp": 1.03578949, + "epoch": 0.8202615361491057, + "flos": 31467335564160.0, + "grad_norm": 1.7449860338382779, + "language_loss": 0.703394, + "learning_rate": 3.2932564297149615e-07, + "loss": 0.72471195, + "num_input_tokens_seen": 294297555, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 13643, + "time_per_iteration": 2.5429534912109375 + }, + { + "auxiliary_loss_clip": 0.01099353, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.02020276, + "balance_loss_mlp": 1.03523421, + "epoch": 0.8203216594017736, + "flos": 24715555061760.0, + "grad_norm": 1.648171035996549, + "language_loss": 0.65431941, + "learning_rate": 3.291115727880256e-07, + "loss": 0.67563128, + "num_input_tokens_seen": 294317600, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 13644, + "time_per_iteration": 2.469975709915161 + }, + { + "auxiliary_loss_clip": 0.01102103, + "auxiliary_loss_mlp": 0.01033753, + "balance_loss_clip": 1.02192736, + "balance_loss_mlp": 1.03459549, + "epoch": 0.8203817826544416, + "flos": 26031824951040.0, + "grad_norm": 1.4857161465071853, + "language_loss": 0.70731783, + "learning_rate": 3.2889756596481234e-07, + "loss": 0.72867638, + "num_input_tokens_seen": 294340215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 13645, + "time_per_iteration": 2.507760763168335 + }, + { + "auxiliary_loss_clip": 0.01097751, + "auxiliary_loss_mlp": 0.01028891, + "balance_loss_clip": 1.01763785, + "balance_loss_mlp": 1.03391588, + "epoch": 0.8204419059071095, + "flos": 25954544839680.0, + "grad_norm": 1.7507779511305261, + "language_loss": 0.71368539, + "learning_rate": 3.286836225099707e-07, + "loss": 0.73495179, + "num_input_tokens_seen": 294358590, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 13646, + "time_per_iteration": 2.4842605590820312 + }, + { + "auxiliary_loss_clip": 0.01103715, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.01968813, + "balance_loss_mlp": 1.0362978, + "epoch": 0.8205020291597775, + "flos": 23580059345280.0, + "grad_norm": 2.2469307057913124, + "language_loss": 0.78236741, + "learning_rate": 3.284697424316132e-07, + "loss": 0.80371881, + "num_input_tokens_seen": 294375825, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 13647, + "time_per_iteration": 2.460960626602173 + }, + { + "auxiliary_loss_clip": 0.01097333, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.01906538, + "balance_loss_mlp": 1.03564429, + "epoch": 0.8205621524124456, + "flos": 26799164219520.0, + "grad_norm": 2.4969141122611855, + "language_loss": 0.67900592, + "learning_rate": 3.2825592573785034e-07, + "loss": 0.7002781, + "num_input_tokens_seen": 294398500, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 13648, + "time_per_iteration": 2.4923458099365234 + }, + { + "auxiliary_loss_clip": 0.01099146, + "auxiliary_loss_mlp": 0.01028588, + "balance_loss_clip": 1.01675045, + "balance_loss_mlp": 1.03283572, + "epoch": 0.8206222756651135, + "flos": 27527863432320.0, + "grad_norm": 1.752469920851942, + "language_loss": 0.80176151, + "learning_rate": 3.28042172436791e-07, + "loss": 0.82303882, + "num_input_tokens_seen": 294418840, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13649, + "time_per_iteration": 2.499342441558838 + }, + { + "auxiliary_loss_clip": 0.01104146, + "auxiliary_loss_mlp": 0.01034331, + "balance_loss_clip": 1.02167666, + "balance_loss_mlp": 1.03822005, + "epoch": 0.8206823989177815, + "flos": 21178605715200.0, + "grad_norm": 1.5497313587872559, + "language_loss": 0.68704414, + "learning_rate": 3.278284825365396e-07, + "loss": 0.70842898, + "num_input_tokens_seen": 294438215, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 13650, + "time_per_iteration": 2.542250871658325 + }, + { + "auxiliary_loss_clip": 0.0110322, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.01717925, + "balance_loss_mlp": 1.03656495, + "epoch": 0.8207425221704494, + "flos": 11509622150400.0, + "grad_norm": 1.992917875581952, + "language_loss": 0.60637325, + "learning_rate": 3.276148560452001e-07, + "loss": 0.62769902, + "num_input_tokens_seen": 294455260, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 13651, + "time_per_iteration": 2.427485227584839 + }, + { + "auxiliary_loss_clip": 0.01104379, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.01947021, + "balance_loss_mlp": 1.03686225, + "epoch": 0.8208026454231174, + "flos": 19791987039360.0, + "grad_norm": 1.8723669979316186, + "language_loss": 0.72488928, + "learning_rate": 3.2740129297087293e-07, + "loss": 0.74624991, + "num_input_tokens_seen": 294473205, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 13652, + "time_per_iteration": 2.4548513889312744 + }, + { + "auxiliary_loss_clip": 0.0109596, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01649594, + "balance_loss_mlp": 1.0346185, + "epoch": 0.8208627686757853, + "flos": 15667538843520.0, + "grad_norm": 2.0134123084835185, + "language_loss": 0.72847176, + "learning_rate": 3.271877933216558e-07, + "loss": 0.7496984, + "num_input_tokens_seen": 294490645, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.61328125, + "step": 13653, + "time_per_iteration": 2.496058702468872 + }, + { + "auxiliary_loss_clip": 0.01106157, + "auxiliary_loss_mlp": 0.01033652, + "balance_loss_clip": 1.02056861, + "balance_loss_mlp": 1.03688347, + "epoch": 0.8209228919284534, + "flos": 37482659516160.0, + "grad_norm": 1.9633450823694507, + "language_loss": 0.62664771, + "learning_rate": 3.269743571056451e-07, + "loss": 0.64804584, + "num_input_tokens_seen": 294513500, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69140625, + "step": 13654, + "time_per_iteration": 2.659797430038452 + }, + { + "auxiliary_loss_clip": 0.01099475, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.01414406, + "balance_loss_mlp": 1.0323689, + "epoch": 0.8209830151811213, + "flos": 23112969863040.0, + "grad_norm": 1.437038379365976, + "language_loss": 0.70098144, + "learning_rate": 3.2676098433093447e-07, + "loss": 0.72223151, + "num_input_tokens_seen": 294535710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 13655, + "time_per_iteration": 2.528794527053833 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.0198307, + "balance_loss_mlp": 1.03530574, + "epoch": 0.8210431384337893, + "flos": 21288169175040.0, + "grad_norm": 2.2263788010004673, + "language_loss": 0.82174385, + "learning_rate": 3.265476750056162e-07, + "loss": 0.84305441, + "num_input_tokens_seen": 294554055, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 13656, + "time_per_iteration": 2.4395313262939453 + }, + { + "auxiliary_loss_clip": 0.01097914, + "auxiliary_loss_mlp": 0.01030496, + "balance_loss_clip": 1.01891446, + "balance_loss_mlp": 1.03546, + "epoch": 0.8211032616864572, + "flos": 11502403516800.0, + "grad_norm": 2.174318286029315, + "language_loss": 0.74104166, + "learning_rate": 3.2633442913777654e-07, + "loss": 0.76232576, + "num_input_tokens_seen": 294570390, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.625, + "step": 13657, + "time_per_iteration": 2.421473979949951 + }, + { + "auxiliary_loss_clip": 0.01099474, + "auxiliary_loss_mlp": 0.01028953, + "balance_loss_clip": 1.01778913, + "balance_loss_mlp": 1.03430414, + "epoch": 0.8211633849391252, + "flos": 29821477455360.0, + "grad_norm": 1.6389881975455622, + "language_loss": 0.55704254, + "learning_rate": 3.2612124673550325e-07, + "loss": 0.57832676, + "num_input_tokens_seen": 294593050, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13658, + "time_per_iteration": 2.515174150466919 + }, + { + "auxiliary_loss_clip": 0.01099693, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.01874256, + "balance_loss_mlp": 1.03353977, + "epoch": 0.8212235081917931, + "flos": 13115439573120.0, + "grad_norm": 2.0858801212680804, + "language_loss": 0.7889123, + "learning_rate": 3.259081278068805e-07, + "loss": 0.81021535, + "num_input_tokens_seen": 294608550, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 13659, + "time_per_iteration": 2.4070687294006348 + }, + { + "auxiliary_loss_clip": 0.01092982, + "auxiliary_loss_mlp": 0.01025586, + "balance_loss_clip": 1.01550722, + "balance_loss_mlp": 1.03138757, + "epoch": 0.8212836314444611, + "flos": 40515351782400.0, + "grad_norm": 1.4937211500780294, + "language_loss": 0.59556949, + "learning_rate": 3.256950723599887e-07, + "loss": 0.61675525, + "num_input_tokens_seen": 294630380, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.6171875, + "step": 13660, + "time_per_iteration": 2.5837912559509277 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.02019513, + "balance_loss_mlp": 1.03470123, + "epoch": 0.8213437546971292, + "flos": 18770543982720.0, + "grad_norm": 2.5721121812428285, + "language_loss": 0.72652888, + "learning_rate": 3.254820804029075e-07, + "loss": 0.74787605, + "num_input_tokens_seen": 294648655, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13661, + "time_per_iteration": 2.4341530799865723 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01031263, + "balance_loss_clip": 1.01971793, + "balance_loss_mlp": 1.03330231, + "epoch": 0.8214038779497971, + "flos": 19682279925120.0, + "grad_norm": 1.950667378612405, + "language_loss": 0.74900603, + "learning_rate": 3.252691519437143e-07, + "loss": 0.77032924, + "num_input_tokens_seen": 294666915, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 13662, + "time_per_iteration": 2.4195828437805176 + }, + { + "auxiliary_loss_clip": 0.01023814, + "auxiliary_loss_mlp": 0.01002606, + "balance_loss_clip": 1.00160432, + "balance_loss_mlp": 1.00386024, + "epoch": 0.8214640012024651, + "flos": 71602969697280.0, + "grad_norm": 0.745371582589168, + "language_loss": 0.54059064, + "learning_rate": 3.250562869904825e-07, + "loss": 0.56085479, + "num_input_tokens_seen": 294731545, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.19921875, + "step": 13663, + "time_per_iteration": 3.190706729888916 + }, + { + "auxiliary_loss_clip": 0.01099277, + "auxiliary_loss_mlp": 0.01031253, + "balance_loss_clip": 1.01952863, + "balance_loss_mlp": 1.0335598, + "epoch": 0.821524124455133, + "flos": 14757203531520.0, + "grad_norm": 2.126901212447461, + "language_loss": 0.65428329, + "learning_rate": 3.248434855512838e-07, + "loss": 0.67558861, + "num_input_tokens_seen": 294748745, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13664, + "time_per_iteration": 2.437980890274048 + }, + { + "auxiliary_loss_clip": 0.01098267, + "auxiliary_loss_mlp": 0.01028172, + "balance_loss_clip": 1.01769304, + "balance_loss_mlp": 1.03531146, + "epoch": 0.821584247707801, + "flos": 25082274965760.0, + "grad_norm": 1.764912039303068, + "language_loss": 0.75243938, + "learning_rate": 3.246307476341881e-07, + "loss": 0.77370375, + "num_input_tokens_seen": 294768955, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 13665, + "time_per_iteration": 2.487011194229126 + }, + { + "auxiliary_loss_clip": 0.01100929, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.01756454, + "balance_loss_mlp": 1.03510714, + "epoch": 0.8216443709604689, + "flos": 36830701710720.0, + "grad_norm": 4.066044981617709, + "language_loss": 0.65191346, + "learning_rate": 3.2441807324726256e-07, + "loss": 0.67321193, + "num_input_tokens_seen": 294789250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13666, + "time_per_iteration": 2.570254325866699 + }, + { + "auxiliary_loss_clip": 0.01099902, + "auxiliary_loss_mlp": 0.01030196, + "balance_loss_clip": 1.0189724, + "balance_loss_mlp": 1.03564048, + "epoch": 0.821704494213137, + "flos": 25081808088960.0, + "grad_norm": 1.6810733916735099, + "language_loss": 0.76897776, + "learning_rate": 3.2420546239857174e-07, + "loss": 0.79027867, + "num_input_tokens_seen": 294809760, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13667, + "time_per_iteration": 2.493004322052002 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01030551, + "balance_loss_clip": 1.01841545, + "balance_loss_mlp": 1.03575373, + "epoch": 0.8217646174658049, + "flos": 14356117290240.0, + "grad_norm": 1.809243684883085, + "language_loss": 0.77085578, + "learning_rate": 3.239929150961773e-07, + "loss": 0.79218972, + "num_input_tokens_seen": 294826495, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13668, + "time_per_iteration": 2.4280850887298584 + }, + { + "auxiliary_loss_clip": 0.01098761, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.01847649, + "balance_loss_mlp": 1.03395486, + "epoch": 0.8218247407184729, + "flos": 22090557139200.0, + "grad_norm": 2.0672708102339894, + "language_loss": 0.73729622, + "learning_rate": 3.2378043134813984e-07, + "loss": 0.75858533, + "num_input_tokens_seen": 294845370, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 13669, + "time_per_iteration": 2.4670791625976562 + }, + { + "auxiliary_loss_clip": 0.01098476, + "auxiliary_loss_mlp": 0.01026323, + "balance_loss_clip": 1.01525986, + "balance_loss_mlp": 1.03419805, + "epoch": 0.8218848639711408, + "flos": 16764035368320.0, + "grad_norm": 1.6347371555380708, + "language_loss": 0.78685886, + "learning_rate": 3.235680111625161e-07, + "loss": 0.80810678, + "num_input_tokens_seen": 294863740, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 13670, + "time_per_iteration": 2.4091546535491943 + }, + { + "auxiliary_loss_clip": 0.01105531, + "auxiliary_loss_mlp": 0.01036754, + "balance_loss_clip": 1.0243504, + "balance_loss_mlp": 1.03711927, + "epoch": 0.8219449872238088, + "flos": 25994801007360.0, + "grad_norm": 2.4869445787160616, + "language_loss": 0.74846464, + "learning_rate": 3.2335565454736123e-07, + "loss": 0.76988751, + "num_input_tokens_seen": 294882815, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 13671, + "time_per_iteration": 2.4942846298217773 + }, + { + "auxiliary_loss_clip": 0.01105717, + "auxiliary_loss_mlp": 0.01029969, + "balance_loss_clip": 1.01770282, + "balance_loss_mlp": 1.0356338, + "epoch": 0.8220051104764767, + "flos": 20778094091520.0, + "grad_norm": 7.475778377778618, + "language_loss": 0.76535976, + "learning_rate": 3.23143361510728e-07, + "loss": 0.7867167, + "num_input_tokens_seen": 294901985, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.69921875, + "step": 13672, + "time_per_iteration": 2.4521193504333496 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01033533, + "balance_loss_clip": 1.02047908, + "balance_loss_mlp": 1.03531826, + "epoch": 0.8220652337291448, + "flos": 14574849160320.0, + "grad_norm": 2.6550856342382088, + "language_loss": 0.74858975, + "learning_rate": 3.2293113206066733e-07, + "loss": 0.76993799, + "num_input_tokens_seen": 294919705, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.66015625, + "step": 13673, + "time_per_iteration": 2.4321844577789307 + }, + { + "auxiliary_loss_clip": 0.01101553, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01893675, + "balance_loss_mlp": 1.03496301, + "epoch": 0.8221253569818128, + "flos": 23805866194560.0, + "grad_norm": 1.7367122005490172, + "language_loss": 0.79398859, + "learning_rate": 3.227189662052254e-07, + "loss": 0.81531632, + "num_input_tokens_seen": 294939900, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 13674, + "time_per_iteration": 2.452885150909424 + }, + { + "auxiliary_loss_clip": 0.01099597, + "auxiliary_loss_mlp": 0.01030577, + "balance_loss_clip": 1.01925182, + "balance_loss_mlp": 1.03428209, + "epoch": 0.8221854802344807, + "flos": 21288241002240.0, + "grad_norm": 2.1133835706017403, + "language_loss": 0.70256555, + "learning_rate": 3.225068639524484e-07, + "loss": 0.7238673, + "num_input_tokens_seen": 294959110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13675, + "time_per_iteration": 2.456704616546631 + }, + { + "auxiliary_loss_clip": 0.01098649, + "auxiliary_loss_mlp": 0.01033846, + "balance_loss_clip": 1.02252114, + "balance_loss_mlp": 1.03533232, + "epoch": 0.8222456034871487, + "flos": 20956785275520.0, + "grad_norm": 1.9196064294997741, + "language_loss": 0.74139565, + "learning_rate": 3.2229482531037965e-07, + "loss": 0.76272058, + "num_input_tokens_seen": 294978660, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 13676, + "time_per_iteration": 2.4562814235687256 + }, + { + "auxiliary_loss_clip": 0.01099141, + "auxiliary_loss_mlp": 0.01029688, + "balance_loss_clip": 1.01866674, + "balance_loss_mlp": 1.0337007, + "epoch": 0.8223057267398166, + "flos": 21397517153280.0, + "grad_norm": 1.9554879616181895, + "language_loss": 0.80535352, + "learning_rate": 3.2208285028705893e-07, + "loss": 0.8266418, + "num_input_tokens_seen": 294998075, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 13677, + "time_per_iteration": 2.4471378326416016 + }, + { + "auxiliary_loss_clip": 0.01102612, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.01983237, + "balance_loss_mlp": 1.03584671, + "epoch": 0.8223658499924846, + "flos": 15268212368640.0, + "grad_norm": 1.7803528403739162, + "language_loss": 0.6982736, + "learning_rate": 3.218709388905245e-07, + "loss": 0.71961451, + "num_input_tokens_seen": 295015950, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 13678, + "time_per_iteration": 3.784363269805908 + }, + { + "auxiliary_loss_clip": 0.01097789, + "auxiliary_loss_mlp": 0.01034524, + "balance_loss_clip": 1.02262068, + "balance_loss_mlp": 1.03327274, + "epoch": 0.8224259732451525, + "flos": 31249537447680.0, + "grad_norm": 1.7357616205251198, + "language_loss": 0.71496773, + "learning_rate": 3.216590911288133e-07, + "loss": 0.73629081, + "num_input_tokens_seen": 295036800, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 13679, + "time_per_iteration": 2.5227231979370117 + }, + { + "auxiliary_loss_clip": 0.01097414, + "auxiliary_loss_mlp": 0.01025772, + "balance_loss_clip": 1.01444733, + "balance_loss_mlp": 1.03310823, + "epoch": 0.8224860964978206, + "flos": 21574628138880.0, + "grad_norm": 1.9577304134380913, + "language_loss": 0.70049226, + "learning_rate": 3.214473070099564e-07, + "loss": 0.72172415, + "num_input_tokens_seen": 295055300, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13680, + "time_per_iteration": 5.352876901626587 + }, + { + "auxiliary_loss_clip": 0.01101662, + "auxiliary_loss_mlp": 0.01028207, + "balance_loss_clip": 1.01710844, + "balance_loss_mlp": 1.03647804, + "epoch": 0.8225462197504885, + "flos": 25483217552640.0, + "grad_norm": 1.9159882812306386, + "language_loss": 0.59767008, + "learning_rate": 3.21235586541986e-07, + "loss": 0.61896878, + "num_input_tokens_seen": 295076420, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 13681, + "time_per_iteration": 2.490726947784424 + }, + { + "auxiliary_loss_clip": 0.01104078, + "auxiliary_loss_mlp": 0.01033867, + "balance_loss_clip": 1.0217855, + "balance_loss_mlp": 1.03521645, + "epoch": 0.8226063430031565, + "flos": 39385458587520.0, + "grad_norm": 1.6390397647807602, + "language_loss": 0.69242489, + "learning_rate": 3.2102392973293047e-07, + "loss": 0.7138043, + "num_input_tokens_seen": 295100540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 13682, + "time_per_iteration": 2.602713108062744 + }, + { + "auxiliary_loss_clip": 0.01102237, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.01805353, + "balance_loss_mlp": 1.03474033, + "epoch": 0.8226664662558244, + "flos": 22815269942400.0, + "grad_norm": 1.8183335657077608, + "language_loss": 0.79319465, + "learning_rate": 3.20812336590816e-07, + "loss": 0.81452876, + "num_input_tokens_seen": 295120180, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 13683, + "time_per_iteration": 2.473590850830078 + }, + { + "auxiliary_loss_clip": 0.01094969, + "auxiliary_loss_mlp": 0.01025176, + "balance_loss_clip": 1.01506758, + "balance_loss_mlp": 1.0332613, + "epoch": 0.8227265895084924, + "flos": 25665607837440.0, + "grad_norm": 1.9785950303413915, + "language_loss": 0.86425269, + "learning_rate": 3.206008071236661e-07, + "loss": 0.88545412, + "num_input_tokens_seen": 295138530, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.6171875, + "step": 13684, + "time_per_iteration": 3.9735918045043945 + }, + { + "auxiliary_loss_clip": 0.01096791, + "auxiliary_loss_mlp": 0.01027071, + "balance_loss_clip": 1.01606178, + "balance_loss_mlp": 1.03397489, + "epoch": 0.8227867127611603, + "flos": 26179274280960.0, + "grad_norm": 1.4566743169469651, + "language_loss": 0.7976135, + "learning_rate": 3.2038934133950157e-07, + "loss": 0.81885219, + "num_input_tokens_seen": 295160260, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62890625, + "step": 13685, + "time_per_iteration": 2.5093915462493896 + }, + { + "auxiliary_loss_clip": 0.01099427, + "auxiliary_loss_mlp": 0.01029254, + "balance_loss_clip": 1.01740456, + "balance_loss_mlp": 1.03454077, + "epoch": 0.8228468360138284, + "flos": 22018053536640.0, + "grad_norm": 1.5897457656815852, + "language_loss": 0.68847555, + "learning_rate": 3.2017793924634194e-07, + "loss": 0.70976239, + "num_input_tokens_seen": 295177055, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 13686, + "time_per_iteration": 2.485407829284668 + }, + { + "auxiliary_loss_clip": 0.01100256, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.01950085, + "balance_loss_mlp": 1.03301668, + "epoch": 0.8229069592664963, + "flos": 14903359971840.0, + "grad_norm": 2.097008209143573, + "language_loss": 0.77891821, + "learning_rate": 3.1996660085220263e-07, + "loss": 0.80023664, + "num_input_tokens_seen": 295193870, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 13687, + "time_per_iteration": 2.436474323272705 + }, + { + "auxiliary_loss_clip": 0.0109971, + "auxiliary_loss_mlp": 0.01028497, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03359497, + "epoch": 0.8229670825191643, + "flos": 15669478177920.0, + "grad_norm": 1.8388949681321325, + "language_loss": 0.72154832, + "learning_rate": 3.1975532616509825e-07, + "loss": 0.7428304, + "num_input_tokens_seen": 295211040, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 13688, + "time_per_iteration": 2.492417097091675 + }, + { + "auxiliary_loss_clip": 0.01100514, + "auxiliary_loss_mlp": 0.01032428, + "balance_loss_clip": 1.02074492, + "balance_loss_mlp": 1.03449452, + "epoch": 0.8230272057718323, + "flos": 23183498217600.0, + "grad_norm": 1.8251885380821353, + "language_loss": 0.73366064, + "learning_rate": 3.1954411519304025e-07, + "loss": 0.75499004, + "num_input_tokens_seen": 295231300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 13689, + "time_per_iteration": 2.5088613033294678 + }, + { + "auxiliary_loss_clip": 0.01100436, + "auxiliary_loss_mlp": 0.01032784, + "balance_loss_clip": 1.02097631, + "balance_loss_mlp": 1.03380239, + "epoch": 0.8230873290245002, + "flos": 21032413361280.0, + "grad_norm": 1.8248664958327294, + "language_loss": 0.689372, + "learning_rate": 3.1933296794403887e-07, + "loss": 0.71070421, + "num_input_tokens_seen": 295251045, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13690, + "time_per_iteration": 2.492891550064087 + }, + { + "auxiliary_loss_clip": 0.01099884, + "auxiliary_loss_mlp": 0.01032076, + "balance_loss_clip": 1.02075148, + "balance_loss_mlp": 1.03438592, + "epoch": 0.8231474522771682, + "flos": 21250139650560.0, + "grad_norm": 1.9670817251189323, + "language_loss": 0.85403329, + "learning_rate": 3.191218844260988e-07, + "loss": 0.87535292, + "num_input_tokens_seen": 295270225, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13691, + "time_per_iteration": 2.5006744861602783 + }, + { + "auxiliary_loss_clip": 0.01101672, + "auxiliary_loss_mlp": 0.01031634, + "balance_loss_clip": 1.0202024, + "balance_loss_mlp": 1.03484964, + "epoch": 0.8232075755298361, + "flos": 23842028211840.0, + "grad_norm": 1.942309655074723, + "language_loss": 0.76985818, + "learning_rate": 3.189108646472252e-07, + "loss": 0.79119122, + "num_input_tokens_seen": 295288950, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13692, + "time_per_iteration": 2.5435726642608643 + }, + { + "auxiliary_loss_clip": 0.01098631, + "auxiliary_loss_mlp": 0.01027484, + "balance_loss_clip": 1.01581907, + "balance_loss_mlp": 1.03404903, + "epoch": 0.8232676987825042, + "flos": 21653955325440.0, + "grad_norm": 1.5338263277775153, + "language_loss": 0.71625656, + "learning_rate": 3.186999086154205e-07, + "loss": 0.73751771, + "num_input_tokens_seen": 295309405, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.64453125, + "step": 13693, + "time_per_iteration": 2.462322473526001 + }, + { + "auxiliary_loss_clip": 0.01095198, + "auxiliary_loss_mlp": 0.01032132, + "balance_loss_clip": 1.02157629, + "balance_loss_mlp": 1.03228104, + "epoch": 0.8233278220351721, + "flos": 26322701287680.0, + "grad_norm": 1.484575932799216, + "language_loss": 0.83818102, + "learning_rate": 3.1848901633868355e-07, + "loss": 0.85945427, + "num_input_tokens_seen": 295331115, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13694, + "time_per_iteration": 2.530102491378784 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.01793063, + "balance_loss_mlp": 1.03319895, + "epoch": 0.8233879452878401, + "flos": 21725812483200.0, + "grad_norm": 1.5670764630001808, + "language_loss": 0.76820183, + "learning_rate": 3.182781878250118e-07, + "loss": 0.78948903, + "num_input_tokens_seen": 295350495, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 13695, + "time_per_iteration": 2.4689533710479736 + }, + { + "auxiliary_loss_clip": 0.01101143, + "auxiliary_loss_mlp": 0.01032503, + "balance_loss_clip": 1.02139306, + "balance_loss_mlp": 1.03628111, + "epoch": 0.823448068540508, + "flos": 20557746109440.0, + "grad_norm": 1.716943951342175, + "language_loss": 0.80500603, + "learning_rate": 3.1806742308239985e-07, + "loss": 0.82634246, + "num_input_tokens_seen": 295368225, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13696, + "time_per_iteration": 2.4298195838928223 + }, + { + "auxiliary_loss_clip": 0.01023101, + "auxiliary_loss_mlp": 0.01002114, + "balance_loss_clip": 1.00116658, + "balance_loss_mlp": 1.00284874, + "epoch": 0.823508191793176, + "flos": 67273688194560.0, + "grad_norm": 0.735830017685578, + "language_loss": 0.63844752, + "learning_rate": 3.178567221188393e-07, + "loss": 0.65869963, + "num_input_tokens_seen": 295430035, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.203125, + "step": 13697, + "time_per_iteration": 3.1228291988372803 + }, + { + "auxiliary_loss_clip": 0.01094179, + "auxiliary_loss_mlp": 0.01024098, + "balance_loss_clip": 1.01381028, + "balance_loss_mlp": 1.03191829, + "epoch": 0.8235683150458439, + "flos": 17928402641280.0, + "grad_norm": 1.8408059577999478, + "language_loss": 0.73020118, + "learning_rate": 3.1764608494232037e-07, + "loss": 0.7513839, + "num_input_tokens_seen": 295447765, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62109375, + "step": 13698, + "time_per_iteration": 2.4486424922943115 + }, + { + "auxiliary_loss_clip": 0.01100262, + "auxiliary_loss_mlp": 0.01027014, + "balance_loss_clip": 1.01482511, + "balance_loss_mlp": 1.03423512, + "epoch": 0.823628438298512, + "flos": 18916089891840.0, + "grad_norm": 1.8648644268579129, + "language_loss": 0.7192139, + "learning_rate": 3.174355115608305e-07, + "loss": 0.74048668, + "num_input_tokens_seen": 295464810, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 13699, + "time_per_iteration": 2.4568445682525635 + }, + { + "auxiliary_loss_clip": 0.0109784, + "auxiliary_loss_mlp": 0.0102711, + "balance_loss_clip": 1.01595223, + "balance_loss_mlp": 1.03365541, + "epoch": 0.8236885615511799, + "flos": 18696460181760.0, + "grad_norm": 3.252304717758055, + "language_loss": 0.8196072, + "learning_rate": 3.1722500198235526e-07, + "loss": 0.84085667, + "num_input_tokens_seen": 295482605, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13700, + "time_per_iteration": 2.417517900466919 + }, + { + "auxiliary_loss_clip": 0.01100609, + "auxiliary_loss_mlp": 0.01032273, + "balance_loss_clip": 1.02116919, + "balance_loss_mlp": 1.03394866, + "epoch": 0.8237486848038479, + "flos": 23695009845120.0, + "grad_norm": 1.5761573110612463, + "language_loss": 0.72924078, + "learning_rate": 3.170145562148763e-07, + "loss": 0.75056958, + "num_input_tokens_seen": 295503780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66796875, + "step": 13701, + "time_per_iteration": 2.467587947845459 + }, + { + "auxiliary_loss_clip": 0.01099008, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01987123, + "balance_loss_mlp": 1.03201449, + "epoch": 0.8238088080565159, + "flos": 23441301106560.0, + "grad_norm": 1.687494382676569, + "language_loss": 0.69314957, + "learning_rate": 3.1680417426637384e-07, + "loss": 0.71445858, + "num_input_tokens_seen": 295522035, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13702, + "time_per_iteration": 2.455983877182007 + }, + { + "auxiliary_loss_clip": 0.01099997, + "auxiliary_loss_mlp": 0.01029664, + "balance_loss_clip": 1.01816654, + "balance_loss_mlp": 1.03495264, + "epoch": 0.8238689313091838, + "flos": 22746537267840.0, + "grad_norm": 1.7567821119392977, + "language_loss": 0.74843061, + "learning_rate": 3.1659385614482603e-07, + "loss": 0.76972723, + "num_input_tokens_seen": 295541190, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13703, + "time_per_iteration": 2.46303129196167 + }, + { + "auxiliary_loss_clip": 0.0110475, + "auxiliary_loss_mlp": 0.01037304, + "balance_loss_clip": 1.02411962, + "balance_loss_mlp": 1.03483748, + "epoch": 0.8239290545618518, + "flos": 25630092264960.0, + "grad_norm": 1.6727554816785857, + "language_loss": 0.697137, + "learning_rate": 3.1638360185820755e-07, + "loss": 0.71855754, + "num_input_tokens_seen": 295558860, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.703125, + "step": 13704, + "time_per_iteration": 2.4747629165649414 + }, + { + "auxiliary_loss_clip": 0.01097133, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.01810384, + "balance_loss_mlp": 1.03255594, + "epoch": 0.8239891778145197, + "flos": 26026473824640.0, + "grad_norm": 1.5661062329236886, + "language_loss": 0.63864183, + "learning_rate": 3.161734114144916e-07, + "loss": 0.65990615, + "num_input_tokens_seen": 295578155, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13705, + "time_per_iteration": 2.487370014190674 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01027202, + "balance_loss_clip": 1.01513791, + "balance_loss_mlp": 1.03334999, + "epoch": 0.8240493010671878, + "flos": 21833257040640.0, + "grad_norm": 5.668073340397448, + "language_loss": 0.69304025, + "learning_rate": 3.1596328482164915e-07, + "loss": 0.71431744, + "num_input_tokens_seen": 295599170, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 13706, + "time_per_iteration": 2.452761173248291 + }, + { + "auxiliary_loss_clip": 0.01103005, + "auxiliary_loss_mlp": 0.01031795, + "balance_loss_clip": 1.0199219, + "balance_loss_mlp": 1.0368948, + "epoch": 0.8241094243198557, + "flos": 18551919853440.0, + "grad_norm": 1.7444306759307577, + "language_loss": 0.69689429, + "learning_rate": 3.157532220876475e-07, + "loss": 0.71824229, + "num_input_tokens_seen": 295617465, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 13707, + "time_per_iteration": 2.442892551422119 + }, + { + "auxiliary_loss_clip": 0.0110125, + "auxiliary_loss_mlp": 0.01030414, + "balance_loss_clip": 1.01842141, + "balance_loss_mlp": 1.03448296, + "epoch": 0.8241695475725237, + "flos": 25447163276160.0, + "grad_norm": 1.6784140276510164, + "language_loss": 0.79208684, + "learning_rate": 3.1554322322045226e-07, + "loss": 0.81340349, + "num_input_tokens_seen": 295634960, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 13708, + "time_per_iteration": 2.520395517349243 + }, + { + "auxiliary_loss_clip": 0.01099959, + "auxiliary_loss_mlp": 0.01028512, + "balance_loss_clip": 1.01653779, + "balance_loss_mlp": 1.03384626, + "epoch": 0.8242296708251916, + "flos": 18989670902400.0, + "grad_norm": 2.2364535014485996, + "language_loss": 0.68625695, + "learning_rate": 3.1533328822802664e-07, + "loss": 0.70754164, + "num_input_tokens_seen": 295652725, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 13709, + "time_per_iteration": 2.433765172958374 + }, + { + "auxiliary_loss_clip": 0.01099313, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.02135551, + "balance_loss_mlp": 1.03391647, + "epoch": 0.8242897940778596, + "flos": 22600883617920.0, + "grad_norm": 1.7389997143099616, + "language_loss": 0.82326722, + "learning_rate": 3.151234171183319e-07, + "loss": 0.84458363, + "num_input_tokens_seen": 295671195, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 13710, + "time_per_iteration": 2.448054075241089 + }, + { + "auxiliary_loss_clip": 0.01100024, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.01690805, + "balance_loss_mlp": 1.0341599, + "epoch": 0.8243499173305275, + "flos": 21468153248640.0, + "grad_norm": 2.108497711538903, + "language_loss": 0.78206408, + "learning_rate": 3.149136098993257e-07, + "loss": 0.80335355, + "num_input_tokens_seen": 295689130, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 13711, + "time_per_iteration": 2.4674675464630127 + }, + { + "auxiliary_loss_clip": 0.01098795, + "auxiliary_loss_mlp": 0.01028415, + "balance_loss_clip": 1.01678646, + "balance_loss_mlp": 1.03360128, + "epoch": 0.8244100405831956, + "flos": 20010359773440.0, + "grad_norm": 1.7882059263039318, + "language_loss": 0.65825897, + "learning_rate": 3.1470386657896473e-07, + "loss": 0.67953104, + "num_input_tokens_seen": 295706385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 13712, + "time_per_iteration": 2.436894655227661 + }, + { + "auxiliary_loss_clip": 0.01099114, + "auxiliary_loss_mlp": 0.01026198, + "balance_loss_clip": 1.01524234, + "balance_loss_mlp": 1.03398395, + "epoch": 0.8244701638358635, + "flos": 26430684549120.0, + "grad_norm": 1.522082623848733, + "language_loss": 0.73938203, + "learning_rate": 3.14494187165202e-07, + "loss": 0.76063514, + "num_input_tokens_seen": 295727925, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 13713, + "time_per_iteration": 2.4973256587982178 + }, + { + "auxiliary_loss_clip": 0.01098726, + "auxiliary_loss_mlp": 0.01025141, + "balance_loss_clip": 1.01357198, + "balance_loss_mlp": 1.03296065, + "epoch": 0.8245302870885315, + "flos": 17640004343040.0, + "grad_norm": 1.9600831331302564, + "language_loss": 0.81260616, + "learning_rate": 3.1428457166598833e-07, + "loss": 0.83384484, + "num_input_tokens_seen": 295744420, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 13714, + "time_per_iteration": 2.3998029232025146 + }, + { + "auxiliary_loss_clip": 0.01101569, + "auxiliary_loss_mlp": 0.01031928, + "balance_loss_clip": 1.01970863, + "balance_loss_mlp": 1.03630209, + "epoch": 0.8245904103411995, + "flos": 26209510554240.0, + "grad_norm": 1.8246610478563798, + "language_loss": 0.65964639, + "learning_rate": 3.1407502008927235e-07, + "loss": 0.68098134, + "num_input_tokens_seen": 295765105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 13715, + "time_per_iteration": 2.508072853088379 + }, + { + "auxiliary_loss_clip": 0.01102798, + "auxiliary_loss_mlp": 0.01029868, + "balance_loss_clip": 1.01794744, + "balance_loss_mlp": 1.03492641, + "epoch": 0.8246505335938674, + "flos": 24205084928640.0, + "grad_norm": 1.7553847724499971, + "language_loss": 0.75059605, + "learning_rate": 3.1386553244300086e-07, + "loss": 0.77192277, + "num_input_tokens_seen": 295784200, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13716, + "time_per_iteration": 2.443873405456543 + }, + { + "auxiliary_loss_clip": 0.01022766, + "auxiliary_loss_mlp": 0.01002387, + "balance_loss_clip": 1.00139761, + "balance_loss_mlp": 1.00258684, + "epoch": 0.8247106568465354, + "flos": 67092195749760.0, + "grad_norm": 0.7319645588496716, + "language_loss": 0.58983648, + "learning_rate": 3.136561087351175e-07, + "loss": 0.61008805, + "num_input_tokens_seen": 295846555, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 13717, + "time_per_iteration": 3.165395975112915 + }, + { + "auxiliary_loss_clip": 0.01100792, + "auxiliary_loss_mlp": 0.01027838, + "balance_loss_clip": 1.01752055, + "balance_loss_mlp": 1.03656578, + "epoch": 0.8247707800992033, + "flos": 12568232805120.0, + "grad_norm": 1.9648404876687129, + "language_loss": 0.79825944, + "learning_rate": 3.1344674897356373e-07, + "loss": 0.81954575, + "num_input_tokens_seen": 295863425, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.640625, + "step": 13718, + "time_per_iteration": 2.4321165084838867 + }, + { + "auxiliary_loss_clip": 0.01096221, + "auxiliary_loss_mlp": 0.0103075, + "balance_loss_clip": 1.01941967, + "balance_loss_mlp": 1.03296947, + "epoch": 0.8248309033518714, + "flos": 15923617879680.0, + "grad_norm": 1.5600124582727455, + "language_loss": 0.69004935, + "learning_rate": 3.132374531662778e-07, + "loss": 0.71131909, + "num_input_tokens_seen": 295880925, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 13719, + "time_per_iteration": 2.4147088527679443 + }, + { + "auxiliary_loss_clip": 0.01099942, + "auxiliary_loss_mlp": 0.01028482, + "balance_loss_clip": 1.01622105, + "balance_loss_mlp": 1.03254807, + "epoch": 0.8248910266045393, + "flos": 17564735393280.0, + "grad_norm": 2.151484715276455, + "language_loss": 0.69623858, + "learning_rate": 3.13028221321197e-07, + "loss": 0.71752286, + "num_input_tokens_seen": 295898205, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 13720, + "time_per_iteration": 3.8741157054901123 + }, + { + "auxiliary_loss_clip": 0.01101452, + "auxiliary_loss_mlp": 0.01028839, + "balance_loss_clip": 1.01701367, + "balance_loss_mlp": 1.03437507, + "epoch": 0.8249511498572073, + "flos": 28619655275520.0, + "grad_norm": 2.0074943971532013, + "language_loss": 0.75765574, + "learning_rate": 3.1281905344625467e-07, + "loss": 0.77895862, + "num_input_tokens_seen": 295918130, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 13721, + "time_per_iteration": 2.4852468967437744 + }, + { + "auxiliary_loss_clip": 0.01097082, + "auxiliary_loss_mlp": 0.01024921, + "balance_loss_clip": 1.01376283, + "balance_loss_mlp": 1.03250015, + "epoch": 0.8250112731098752, + "flos": 25556583081600.0, + "grad_norm": 1.8887788155393048, + "language_loss": 0.77601635, + "learning_rate": 3.1260994954938305e-07, + "loss": 0.79723638, + "num_input_tokens_seen": 295937760, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 13722, + "time_per_iteration": 3.9625513553619385 + }, + { + "auxiliary_loss_clip": 0.01098838, + "auxiliary_loss_mlp": 0.01029739, + "balance_loss_clip": 1.01853311, + "balance_loss_mlp": 1.03513193, + "epoch": 0.8250713963625432, + "flos": 27746164339200.0, + "grad_norm": 1.6348425441853751, + "language_loss": 0.63200963, + "learning_rate": 3.1240090963851205e-07, + "loss": 0.65329552, + "num_input_tokens_seen": 295957585, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 13723, + "time_per_iteration": 2.5051403045654297 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.01031711, + "balance_loss_clip": 1.02004635, + "balance_loss_mlp": 1.03461981, + "epoch": 0.8251315196152111, + "flos": 21610610588160.0, + "grad_norm": 1.4987844407336721, + "language_loss": 0.73996544, + "learning_rate": 3.121919337215666e-07, + "loss": 0.76128417, + "num_input_tokens_seen": 295977135, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13724, + "time_per_iteration": 2.446829319000244 + }, + { + "auxiliary_loss_clip": 0.01102165, + "auxiliary_loss_mlp": 0.01032146, + "balance_loss_clip": 1.0198431, + "balance_loss_mlp": 1.03561938, + "epoch": 0.8251916428678792, + "flos": 28579363194240.0, + "grad_norm": 1.943405215857899, + "language_loss": 0.64098012, + "learning_rate": 3.1198302180647253e-07, + "loss": 0.6623233, + "num_input_tokens_seen": 295996265, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13725, + "time_per_iteration": 3.9392764568328857 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01027974, + "balance_loss_clip": 1.01672626, + "balance_loss_mlp": 1.03262889, + "epoch": 0.8252517661205471, + "flos": 23075191733760.0, + "grad_norm": 1.4837980675924767, + "language_loss": 0.81744307, + "learning_rate": 3.1177417390115125e-07, + "loss": 0.83869004, + "num_input_tokens_seen": 296014745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13726, + "time_per_iteration": 2.468735456466675 + }, + { + "auxiliary_loss_clip": 0.01093251, + "auxiliary_loss_mlp": 0.01029432, + "balance_loss_clip": 1.01873279, + "balance_loss_mlp": 1.03116345, + "epoch": 0.8253118893732151, + "flos": 31759576617600.0, + "grad_norm": 1.6663240339178054, + "language_loss": 0.70314664, + "learning_rate": 3.1156539001352286e-07, + "loss": 0.72437346, + "num_input_tokens_seen": 296036960, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 13727, + "time_per_iteration": 2.5540802478790283 + }, + { + "auxiliary_loss_clip": 0.01101407, + "auxiliary_loss_mlp": 0.01031517, + "balance_loss_clip": 1.01923275, + "balance_loss_mlp": 1.03472996, + "epoch": 0.8253720126258831, + "flos": 18296415434880.0, + "grad_norm": 1.6751326547454333, + "language_loss": 0.62385333, + "learning_rate": 3.113566701515036e-07, + "loss": 0.64518249, + "num_input_tokens_seen": 296056540, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 13728, + "time_per_iteration": 2.4579248428344727 + }, + { + "auxiliary_loss_clip": 0.01105596, + "auxiliary_loss_mlp": 0.010299, + "balance_loss_clip": 1.01763988, + "balance_loss_mlp": 1.03654742, + "epoch": 0.825432135878551, + "flos": 26797332625920.0, + "grad_norm": 1.9717282075375915, + "language_loss": 0.71042085, + "learning_rate": 3.111480143230092e-07, + "loss": 0.73177588, + "num_input_tokens_seen": 296077950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.69140625, + "step": 13729, + "time_per_iteration": 2.5197970867156982 + }, + { + "auxiliary_loss_clip": 0.01022956, + "auxiliary_loss_mlp": 0.01004312, + "balance_loss_clip": 1.00337625, + "balance_loss_mlp": 1.0028019, + "epoch": 0.825492259131219, + "flos": 54219116217600.0, + "grad_norm": 0.8572305037082905, + "language_loss": 0.6273154, + "learning_rate": 3.109394225359514e-07, + "loss": 0.64758813, + "num_input_tokens_seen": 296127060, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 13730, + "time_per_iteration": 2.863248825073242 + }, + { + "auxiliary_loss_clip": 0.01100543, + "auxiliary_loss_mlp": 0.01030106, + "balance_loss_clip": 1.0182085, + "balance_loss_mlp": 1.03533745, + "epoch": 0.825552382383887, + "flos": 43756145493120.0, + "grad_norm": 2.0536756687084847, + "language_loss": 0.63441122, + "learning_rate": 3.1073089479823945e-07, + "loss": 0.65571773, + "num_input_tokens_seen": 296147775, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 13731, + "time_per_iteration": 2.6466689109802246 + }, + { + "auxiliary_loss_clip": 0.0110256, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.01711965, + "balance_loss_mlp": 1.03353024, + "epoch": 0.825612505636555, + "flos": 12602814624000.0, + "grad_norm": 4.369732122043469, + "language_loss": 0.69833827, + "learning_rate": 3.105224311177812e-07, + "loss": 0.71965206, + "num_input_tokens_seen": 296163560, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 13732, + "time_per_iteration": 2.431699514389038 + }, + { + "auxiliary_loss_clip": 0.01102382, + "auxiliary_loss_mlp": 0.01032659, + "balance_loss_clip": 1.02082729, + "balance_loss_mlp": 1.03338408, + "epoch": 0.8256726288892229, + "flos": 17595618111360.0, + "grad_norm": 9.416973865724984, + "language_loss": 0.70556611, + "learning_rate": 3.103140315024817e-07, + "loss": 0.72691655, + "num_input_tokens_seen": 296178730, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 13733, + "time_per_iteration": 2.4896795749664307 + }, + { + "auxiliary_loss_clip": 0.01095159, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.01482248, + "balance_loss_mlp": 1.03152645, + "epoch": 0.8257327521418909, + "flos": 23805794367360.0, + "grad_norm": 1.4905280980303643, + "language_loss": 0.82499802, + "learning_rate": 3.1010569596024437e-07, + "loss": 0.84621727, + "num_input_tokens_seen": 296200175, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.63671875, + "step": 13734, + "time_per_iteration": 2.4829580783843994 + }, + { + "auxiliary_loss_clip": 0.01096383, + "auxiliary_loss_mlp": 0.01030721, + "balance_loss_clip": 1.01891303, + "balance_loss_mlp": 1.03280544, + "epoch": 0.8257928753945588, + "flos": 19281121856640.0, + "grad_norm": 1.7089910014133873, + "language_loss": 0.82727551, + "learning_rate": 3.098974244989676e-07, + "loss": 0.84854656, + "num_input_tokens_seen": 296219305, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.63671875, + "step": 13735, + "time_per_iteration": 2.447176456451416 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01026899, + "balance_loss_clip": 1.01627731, + "balance_loss_mlp": 1.03607154, + "epoch": 0.8258529986472268, + "flos": 18478841633280.0, + "grad_norm": 1.741963108109938, + "language_loss": 0.70721442, + "learning_rate": 3.096892171265497e-07, + "loss": 0.72849238, + "num_input_tokens_seen": 296236945, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13736, + "time_per_iteration": 2.410585403442383 + }, + { + "auxiliary_loss_clip": 0.01022898, + "auxiliary_loss_mlp": 0.00996896, + "balance_loss_clip": 0.99593621, + "balance_loss_mlp": 1.00281882, + "epoch": 0.8259131218998947, + "flos": 62137957512960.0, + "grad_norm": 0.8456109831997218, + "language_loss": 0.67959881, + "learning_rate": 3.0948107385088665e-07, + "loss": 0.6997968, + "num_input_tokens_seen": 296294685, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 13737, + "time_per_iteration": 3.0272867679595947 + }, + { + "auxiliary_loss_clip": 0.01101345, + "auxiliary_loss_mlp": 0.01032726, + "balance_loss_clip": 1.0216099, + "balance_loss_mlp": 1.03449416, + "epoch": 0.8259732451525628, + "flos": 22159038418560.0, + "grad_norm": 2.073993143521232, + "language_loss": 0.6973623, + "learning_rate": 3.0927299467987e-07, + "loss": 0.71870303, + "num_input_tokens_seen": 296314790, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 13738, + "time_per_iteration": 2.4943878650665283 + }, + { + "auxiliary_loss_clip": 0.01103797, + "auxiliary_loss_mlp": 0.01031248, + "balance_loss_clip": 1.0177238, + "balance_loss_mlp": 1.03640378, + "epoch": 0.8260333684052307, + "flos": 38361645233280.0, + "grad_norm": 2.1714689759020263, + "language_loss": 0.63835168, + "learning_rate": 3.090649796213911e-07, + "loss": 0.65970206, + "num_input_tokens_seen": 296335355, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.671875, + "step": 13739, + "time_per_iteration": 2.5875649452209473 + }, + { + "auxiliary_loss_clip": 0.01022875, + "auxiliary_loss_mlp": 0.01001493, + "balance_loss_clip": 1.00052786, + "balance_loss_mlp": 1.00276709, + "epoch": 0.8260934916578987, + "flos": 62185611882240.0, + "grad_norm": 0.8181807720389914, + "language_loss": 0.59289646, + "learning_rate": 3.0885702868333853e-07, + "loss": 0.61314023, + "num_input_tokens_seen": 296399885, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 13740, + "time_per_iteration": 3.1312007904052734 + }, + { + "auxiliary_loss_clip": 0.01104538, + "auxiliary_loss_mlp": 0.01030141, + "balance_loss_clip": 1.01754618, + "balance_loss_mlp": 1.03531182, + "epoch": 0.8261536149105667, + "flos": 22565475786240.0, + "grad_norm": 1.7686192119989161, + "language_loss": 0.74968207, + "learning_rate": 3.086491418735959e-07, + "loss": 0.77102888, + "num_input_tokens_seen": 296417660, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 13741, + "time_per_iteration": 2.4407901763916016 + }, + { + "auxiliary_loss_clip": 0.01098345, + "auxiliary_loss_mlp": 0.01030197, + "balance_loss_clip": 1.01872921, + "balance_loss_mlp": 1.03380883, + "epoch": 0.8262137381632346, + "flos": 32525479342080.0, + "grad_norm": 1.9409815336260887, + "language_loss": 0.62311375, + "learning_rate": 3.0844131920004726e-07, + "loss": 0.64439917, + "num_input_tokens_seen": 296438255, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 13742, + "time_per_iteration": 2.5456202030181885 + }, + { + "auxiliary_loss_clip": 0.01105188, + "auxiliary_loss_mlp": 0.01034876, + "balance_loss_clip": 1.02133405, + "balance_loss_mlp": 1.03567028, + "epoch": 0.8262738614159026, + "flos": 14136451666560.0, + "grad_norm": 2.5756916838702324, + "language_loss": 0.65460289, + "learning_rate": 3.0823356067057327e-07, + "loss": 0.67600346, + "num_input_tokens_seen": 296454485, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 13743, + "time_per_iteration": 2.4249460697174072 + }, + { + "auxiliary_loss_clip": 0.01102194, + "auxiliary_loss_mlp": 0.01032775, + "balance_loss_clip": 1.02113438, + "balance_loss_mlp": 1.03607357, + "epoch": 0.8263339846685706, + "flos": 19825347795840.0, + "grad_norm": 1.9730878260340954, + "language_loss": 0.6655553, + "learning_rate": 3.0802586629305283e-07, + "loss": 0.68690503, + "num_input_tokens_seen": 296473740, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13744, + "time_per_iteration": 2.441387414932251 + }, + { + "auxiliary_loss_clip": 0.01100364, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01736081, + "balance_loss_mlp": 1.03470361, + "epoch": 0.8263941079212386, + "flos": 22745962650240.0, + "grad_norm": 1.755399157939641, + "language_loss": 0.75241995, + "learning_rate": 3.078182360753612e-07, + "loss": 0.77371031, + "num_input_tokens_seen": 296493355, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 13745, + "time_per_iteration": 2.4459309577941895 + }, + { + "auxiliary_loss_clip": 0.01095928, + "auxiliary_loss_mlp": 0.01030217, + "balance_loss_clip": 1.02005458, + "balance_loss_mlp": 1.03274918, + "epoch": 0.8264542311739065, + "flos": 20120641505280.0, + "grad_norm": 1.8118192863065001, + "language_loss": 0.79148436, + "learning_rate": 3.076106700253709e-07, + "loss": 0.81274581, + "num_input_tokens_seen": 296510520, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 13746, + "time_per_iteration": 2.4544129371643066 + }, + { + "auxiliary_loss_clip": 0.01104486, + "auxiliary_loss_mlp": 0.01032904, + "balance_loss_clip": 1.02051806, + "balance_loss_mlp": 1.03626895, + "epoch": 0.8265143544265745, + "flos": 16837149502080.0, + "grad_norm": 1.948876431747442, + "language_loss": 0.68665206, + "learning_rate": 3.0740316815095415e-07, + "loss": 0.70802593, + "num_input_tokens_seen": 296528265, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 13747, + "time_per_iteration": 2.4420480728149414 + }, + { + "auxiliary_loss_clip": 0.01101378, + "auxiliary_loss_mlp": 0.01031327, + "balance_loss_clip": 1.01921535, + "balance_loss_mlp": 1.03463233, + "epoch": 0.8265744776792424, + "flos": 22018592240640.0, + "grad_norm": 2.2816524530159756, + "language_loss": 0.75179929, + "learning_rate": 3.0719573045997835e-07, + "loss": 0.7731263, + "num_input_tokens_seen": 296547810, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 13748, + "time_per_iteration": 2.4481050968170166 + }, + { + "auxiliary_loss_clip": 0.01096027, + "auxiliary_loss_mlp": 0.01030654, + "balance_loss_clip": 1.02046204, + "balance_loss_mlp": 1.03393388, + "epoch": 0.8266346009319104, + "flos": 19244852098560.0, + "grad_norm": 5.161325368268591, + "language_loss": 0.63943124, + "learning_rate": 3.069883569603102e-07, + "loss": 0.66069806, + "num_input_tokens_seen": 296565940, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62109375, + "step": 13749, + "time_per_iteration": 2.44142746925354 + }, + { + "auxiliary_loss_clip": 0.01095615, + "auxiliary_loss_mlp": 0.01025699, + "balance_loss_clip": 1.01466656, + "balance_loss_mlp": 1.03140473, + "epoch": 0.8266947241845783, + "flos": 24166768095360.0, + "grad_norm": 1.8454134649317644, + "language_loss": 0.73651314, + "learning_rate": 3.067810476598132e-07, + "loss": 0.75772631, + "num_input_tokens_seen": 296585090, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13750, + "time_per_iteration": 2.472975969314575 + }, + { + "auxiliary_loss_clip": 0.0110087, + "auxiliary_loss_mlp": 0.01033301, + "balance_loss_clip": 1.02129698, + "balance_loss_mlp": 1.03503251, + "epoch": 0.8267548474372464, + "flos": 21105814803840.0, + "grad_norm": 1.831070973907418, + "language_loss": 0.65703225, + "learning_rate": 3.065738025663496e-07, + "loss": 0.67837399, + "num_input_tokens_seen": 296604950, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 13751, + "time_per_iteration": 2.460341215133667 + }, + { + "auxiliary_loss_clip": 0.01096828, + "auxiliary_loss_mlp": 0.01026688, + "balance_loss_clip": 1.01563787, + "balance_loss_mlp": 1.03284085, + "epoch": 0.8268149706899143, + "flos": 39968288668800.0, + "grad_norm": 1.4782058761360306, + "language_loss": 0.60665822, + "learning_rate": 3.0636662168777607e-07, + "loss": 0.62789339, + "num_input_tokens_seen": 296627780, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13752, + "time_per_iteration": 2.6207900047302246 + }, + { + "auxiliary_loss_clip": 0.01022684, + "auxiliary_loss_mlp": 0.01002368, + "balance_loss_clip": 1.00148606, + "balance_loss_mlp": 1.00261712, + "epoch": 0.8268750939425823, + "flos": 65782423244160.0, + "grad_norm": 0.7696499438638109, + "language_loss": 0.57472384, + "learning_rate": 3.0615950503194986e-07, + "loss": 0.59497434, + "num_input_tokens_seen": 296683850, + "router_z_loss_clip": 0.0088501, + "router_z_loss_mlp": 0.20117188, + "step": 13753, + "time_per_iteration": 3.1323916912078857 + }, + { + "auxiliary_loss_clip": 0.01022837, + "auxiliary_loss_mlp": 0.00999424, + "balance_loss_clip": 0.99852353, + "balance_loss_mlp": 1.0027312, + "epoch": 0.8269352171952503, + "flos": 52981455242880.0, + "grad_norm": 0.6991010254330118, + "language_loss": 0.54898673, + "learning_rate": 3.0595245260672563e-07, + "loss": 0.56920928, + "num_input_tokens_seen": 296741420, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20117188, + "step": 13754, + "time_per_iteration": 3.19143009185791 + }, + { + "auxiliary_loss_clip": 0.01095849, + "auxiliary_loss_mlp": 0.01028538, + "balance_loss_clip": 1.01835752, + "balance_loss_mlp": 1.03289616, + "epoch": 0.8269953404479182, + "flos": 23076125487360.0, + "grad_norm": 1.901395420853525, + "language_loss": 0.68808734, + "learning_rate": 3.0574546441995354e-07, + "loss": 0.70933127, + "num_input_tokens_seen": 296759620, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.62890625, + "step": 13755, + "time_per_iteration": 2.563145160675049 + }, + { + "auxiliary_loss_clip": 0.01097596, + "auxiliary_loss_mlp": 0.01025322, + "balance_loss_clip": 1.01480806, + "balance_loss_mlp": 1.03408217, + "epoch": 0.8270554637005862, + "flos": 14209996763520.0, + "grad_norm": 2.2342864185465454, + "language_loss": 0.69950449, + "learning_rate": 3.0553854047948324e-07, + "loss": 0.7207337, + "num_input_tokens_seen": 296777275, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 13756, + "time_per_iteration": 2.5362203121185303 + }, + { + "auxiliary_loss_clip": 0.01102655, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.01820481, + "balance_loss_mlp": 1.03683186, + "epoch": 0.8271155869532542, + "flos": 21762046327680.0, + "grad_norm": 1.935436215095936, + "language_loss": 0.71919167, + "learning_rate": 3.053316807931623e-07, + "loss": 0.74051744, + "num_input_tokens_seen": 296796655, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13757, + "time_per_iteration": 2.4298806190490723 + }, + { + "auxiliary_loss_clip": 0.01102829, + "auxiliary_loss_mlp": 0.0103455, + "balance_loss_clip": 1.02090025, + "balance_loss_mlp": 1.03461754, + "epoch": 0.8271757102059222, + "flos": 15120475729920.0, + "grad_norm": 2.0681881017111734, + "language_loss": 0.6859889, + "learning_rate": 3.0512488536883283e-07, + "loss": 0.70736271, + "num_input_tokens_seen": 296813705, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6796875, + "step": 13758, + "time_per_iteration": 2.450861692428589 + }, + { + "auxiliary_loss_clip": 0.01094703, + "auxiliary_loss_mlp": 0.01028428, + "balance_loss_clip": 1.01730585, + "balance_loss_mlp": 1.03196728, + "epoch": 0.8272358334585901, + "flos": 24133730561280.0, + "grad_norm": 1.58450668225913, + "language_loss": 0.69761419, + "learning_rate": 3.0491815421433775e-07, + "loss": 0.71884549, + "num_input_tokens_seen": 296833985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.625, + "step": 13759, + "time_per_iteration": 2.4828851222991943 + }, + { + "auxiliary_loss_clip": 0.01097875, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.01631212, + "balance_loss_mlp": 1.03378034, + "epoch": 0.8272959567112581, + "flos": 18990712396800.0, + "grad_norm": 1.7473581768937994, + "language_loss": 0.70969361, + "learning_rate": 3.047114873375161e-07, + "loss": 0.73095572, + "num_input_tokens_seen": 296850150, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 13760, + "time_per_iteration": 2.4519858360290527 + }, + { + "auxiliary_loss_clip": 0.01098548, + "auxiliary_loss_mlp": 0.01026282, + "balance_loss_clip": 1.01542258, + "balance_loss_mlp": 1.03550053, + "epoch": 0.827356079963926, + "flos": 20631614428800.0, + "grad_norm": 1.664740297618068, + "language_loss": 0.77527195, + "learning_rate": 3.0450488474620505e-07, + "loss": 0.79652023, + "num_input_tokens_seen": 296869585, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13761, + "time_per_iteration": 3.890357255935669 + }, + { + "auxiliary_loss_clip": 0.01095154, + "auxiliary_loss_mlp": 0.01026948, + "balance_loss_clip": 1.01645768, + "balance_loss_mlp": 1.03343439, + "epoch": 0.827416203216594, + "flos": 22416625825920.0, + "grad_norm": 1.6562788022934103, + "language_loss": 0.69847965, + "learning_rate": 3.042983464482387e-07, + "loss": 0.71970069, + "num_input_tokens_seen": 296887710, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6171875, + "step": 13762, + "time_per_iteration": 2.4446358680725098 + }, + { + "auxiliary_loss_clip": 0.01097413, + "auxiliary_loss_mlp": 0.01021229, + "balance_loss_clip": 1.01057184, + "balance_loss_mlp": 1.03246927, + "epoch": 0.827476326469262, + "flos": 19026192055680.0, + "grad_norm": 1.8662073459015955, + "language_loss": 0.70074844, + "learning_rate": 3.0409187245144853e-07, + "loss": 0.72193485, + "num_input_tokens_seen": 296906265, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 13763, + "time_per_iteration": 5.2924604415893555 + }, + { + "auxiliary_loss_clip": 0.01022864, + "auxiliary_loss_mlp": 0.01000148, + "balance_loss_clip": 0.99919397, + "balance_loss_mlp": 1.0026294, + "epoch": 0.82753644972193, + "flos": 68500575089280.0, + "grad_norm": 0.8373654937197863, + "language_loss": 0.65168589, + "learning_rate": 3.038854627636651e-07, + "loss": 0.67191601, + "num_input_tokens_seen": 296971290, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.203125, + "step": 13764, + "time_per_iteration": 3.095142126083374 + }, + { + "auxiliary_loss_clip": 0.01102775, + "auxiliary_loss_mlp": 0.01033107, + "balance_loss_clip": 1.02101338, + "balance_loss_mlp": 1.03624713, + "epoch": 0.8275965729745979, + "flos": 18405404277120.0, + "grad_norm": 2.1372557336076032, + "language_loss": 0.77729869, + "learning_rate": 3.0367911739271423e-07, + "loss": 0.79865754, + "num_input_tokens_seen": 296989060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13765, + "time_per_iteration": 2.415915012359619 + }, + { + "auxiliary_loss_clip": 0.01102127, + "auxiliary_loss_mlp": 0.01030033, + "balance_loss_clip": 1.01768899, + "balance_loss_mlp": 1.03454924, + "epoch": 0.8276566962272659, + "flos": 28512067063680.0, + "grad_norm": 1.6221712528738066, + "language_loss": 0.62191451, + "learning_rate": 3.034728363464214e-07, + "loss": 0.64323616, + "num_input_tokens_seen": 297011300, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 13766, + "time_per_iteration": 2.5222461223602295 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01703894, + "balance_loss_mlp": 1.03382564, + "epoch": 0.8277168194799339, + "flos": 20230240878720.0, + "grad_norm": 2.733674200734292, + "language_loss": 0.82816303, + "learning_rate": 3.03266619632609e-07, + "loss": 0.84944153, + "num_input_tokens_seen": 297030350, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 13767, + "time_per_iteration": 3.9097790718078613 + }, + { + "auxiliary_loss_clip": 0.01102172, + "auxiliary_loss_mlp": 0.01026301, + "balance_loss_clip": 1.01461267, + "balance_loss_mlp": 1.03601408, + "epoch": 0.8277769427326018, + "flos": 28476623318400.0, + "grad_norm": 1.8123727599294597, + "language_loss": 0.69225526, + "learning_rate": 3.030604672590964e-07, + "loss": 0.71353996, + "num_input_tokens_seen": 297049710, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 13768, + "time_per_iteration": 2.5282175540924072 + }, + { + "auxiliary_loss_clip": 0.0109497, + "auxiliary_loss_mlp": 0.01025867, + "balance_loss_clip": 1.01526916, + "balance_loss_mlp": 1.03135371, + "epoch": 0.8278370659852698, + "flos": 27197628768000.0, + "grad_norm": 1.7692402287789066, + "language_loss": 0.74371201, + "learning_rate": 3.028543792337006e-07, + "loss": 0.76492047, + "num_input_tokens_seen": 297070510, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.63671875, + "step": 13769, + "time_per_iteration": 2.501898765563965 + }, + { + "auxiliary_loss_clip": 0.01098728, + "auxiliary_loss_mlp": 0.01027102, + "balance_loss_clip": 1.01539564, + "balance_loss_mlp": 1.03283179, + "epoch": 0.8278971892379378, + "flos": 37816126404480.0, + "grad_norm": 2.1253743254502977, + "language_loss": 0.74551117, + "learning_rate": 3.0264835556423675e-07, + "loss": 0.76676941, + "num_input_tokens_seen": 297092585, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 13770, + "time_per_iteration": 2.597078323364258 + }, + { + "auxiliary_loss_clip": 0.01099272, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01858819, + "balance_loss_mlp": 1.03407049, + "epoch": 0.8279573124906058, + "flos": 22560160573440.0, + "grad_norm": 1.6790850310045173, + "language_loss": 0.75939202, + "learning_rate": 3.0244239625851785e-07, + "loss": 0.78069258, + "num_input_tokens_seen": 297110055, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 13771, + "time_per_iteration": 2.4581122398376465 + }, + { + "auxiliary_loss_clip": 0.01099759, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01814413, + "balance_loss_mlp": 1.03393221, + "epoch": 0.8280174357432737, + "flos": 36064619418240.0, + "grad_norm": 1.6138100145294163, + "language_loss": 0.72420895, + "learning_rate": 3.0223650132435284e-07, + "loss": 0.7455008, + "num_input_tokens_seen": 297132170, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 13772, + "time_per_iteration": 2.573108673095703 + }, + { + "auxiliary_loss_clip": 0.01098054, + "auxiliary_loss_mlp": 0.01029734, + "balance_loss_clip": 1.01759243, + "balance_loss_mlp": 1.03461206, + "epoch": 0.8280775589959417, + "flos": 22961067246720.0, + "grad_norm": 2.2283903024413547, + "language_loss": 0.74291146, + "learning_rate": 3.0203067076955035e-07, + "loss": 0.76418936, + "num_input_tokens_seen": 297149515, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6328125, + "step": 13773, + "time_per_iteration": 2.429135322570801 + }, + { + "auxiliary_loss_clip": 0.01098833, + "auxiliary_loss_mlp": 0.01031828, + "balance_loss_clip": 1.02056837, + "balance_loss_mlp": 1.03527474, + "epoch": 0.8281376822486096, + "flos": 26063282286720.0, + "grad_norm": 1.72037457005478, + "language_loss": 0.75935221, + "learning_rate": 3.01824904601915e-07, + "loss": 0.78065884, + "num_input_tokens_seen": 297170320, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 13774, + "time_per_iteration": 2.475358486175537 + }, + { + "auxiliary_loss_clip": 0.01104314, + "auxiliary_loss_mlp": 0.01025788, + "balance_loss_clip": 1.01416469, + "balance_loss_mlp": 1.0359422, + "epoch": 0.8281978055012776, + "flos": 20667776446080.0, + "grad_norm": 1.6086264049463133, + "language_loss": 0.75185502, + "learning_rate": 3.01619202829249e-07, + "loss": 0.77315605, + "num_input_tokens_seen": 297189935, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 13775, + "time_per_iteration": 2.43330717086792 + }, + { + "auxiliary_loss_clip": 0.01103717, + "auxiliary_loss_mlp": 0.01027612, + "balance_loss_clip": 1.01517248, + "balance_loss_mlp": 1.03436899, + "epoch": 0.8282579287539455, + "flos": 29315281040640.0, + "grad_norm": 1.9945621965975238, + "language_loss": 0.73318064, + "learning_rate": 3.01413565459353e-07, + "loss": 0.75449395, + "num_input_tokens_seen": 297210885, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6953125, + "step": 13776, + "time_per_iteration": 2.5265419483184814 + }, + { + "auxiliary_loss_clip": 0.01099687, + "auxiliary_loss_mlp": 0.01023413, + "balance_loss_clip": 1.01215982, + "balance_loss_mlp": 1.03306937, + "epoch": 0.8283180520066136, + "flos": 15706178899200.0, + "grad_norm": 2.0087174485094943, + "language_loss": 0.77922744, + "learning_rate": 3.0120799250002483e-07, + "loss": 0.80045843, + "num_input_tokens_seen": 297228500, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 13777, + "time_per_iteration": 2.438504934310913 + }, + { + "auxiliary_loss_clip": 0.01097696, + "auxiliary_loss_mlp": 0.01029126, + "balance_loss_clip": 1.01874244, + "balance_loss_mlp": 1.03520977, + "epoch": 0.8283781752592815, + "flos": 24791470456320.0, + "grad_norm": 1.4952468649101947, + "language_loss": 0.82470471, + "learning_rate": 3.010024839590604e-07, + "loss": 0.8459729, + "num_input_tokens_seen": 297249470, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.625, + "step": 13778, + "time_per_iteration": 2.4861180782318115 + }, + { + "auxiliary_loss_clip": 0.0109427, + "auxiliary_loss_mlp": 0.0102475, + "balance_loss_clip": 1.01334167, + "balance_loss_mlp": 1.03251445, + "epoch": 0.8284382985119495, + "flos": 18982811404800.0, + "grad_norm": 1.8580201526843971, + "language_loss": 0.74507427, + "learning_rate": 3.0079703984425187e-07, + "loss": 0.7662645, + "num_input_tokens_seen": 297265970, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6171875, + "step": 13779, + "time_per_iteration": 2.413422107696533 + }, + { + "auxiliary_loss_clip": 0.01022695, + "auxiliary_loss_mlp": 0.01001477, + "balance_loss_clip": 1.00053501, + "balance_loss_mlp": 1.00272822, + "epoch": 0.8284984217646175, + "flos": 61034460814080.0, + "grad_norm": 0.8243655994188706, + "language_loss": 0.56794745, + "learning_rate": 3.0059166016338954e-07, + "loss": 0.58818918, + "num_input_tokens_seen": 297325525, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 13780, + "time_per_iteration": 3.070969343185425 + }, + { + "auxiliary_loss_clip": 0.01099038, + "auxiliary_loss_mlp": 0.01026153, + "balance_loss_clip": 1.01467896, + "balance_loss_mlp": 1.03370905, + "epoch": 0.8285585450172854, + "flos": 19714635100800.0, + "grad_norm": 1.663013046822846, + "language_loss": 0.80247319, + "learning_rate": 3.0038634492426205e-07, + "loss": 0.82372504, + "num_input_tokens_seen": 297345025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 13781, + "time_per_iteration": 2.4518861770629883 + }, + { + "auxiliary_loss_clip": 0.01102768, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.01744401, + "balance_loss_mlp": 1.03644729, + "epoch": 0.8286186682699535, + "flos": 21688896280320.0, + "grad_norm": 3.0303524050285557, + "language_loss": 0.75560725, + "learning_rate": 3.001810941346543e-07, + "loss": 0.77693129, + "num_input_tokens_seen": 297363570, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 13782, + "time_per_iteration": 2.504301071166992 + }, + { + "auxiliary_loss_clip": 0.01097103, + "auxiliary_loss_mlp": 0.01026716, + "balance_loss_clip": 1.01530743, + "balance_loss_mlp": 1.03192592, + "epoch": 0.8286787915226214, + "flos": 25775566346880.0, + "grad_norm": 1.6551546330725684, + "language_loss": 0.75982195, + "learning_rate": 2.9997590780234983e-07, + "loss": 0.78106016, + "num_input_tokens_seen": 297385385, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 13783, + "time_per_iteration": 2.520042657852173 + }, + { + "auxiliary_loss_clip": 0.01098829, + "auxiliary_loss_mlp": 0.01027701, + "balance_loss_clip": 1.0163579, + "balance_loss_mlp": 1.03363252, + "epoch": 0.8287389147752894, + "flos": 21288348743040.0, + "grad_norm": 1.6853866319611193, + "language_loss": 0.73697698, + "learning_rate": 2.997707859351304e-07, + "loss": 0.75824231, + "num_input_tokens_seen": 297403950, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13784, + "time_per_iteration": 2.462013006210327 + }, + { + "auxiliary_loss_clip": 0.01102405, + "auxiliary_loss_mlp": 0.01034205, + "balance_loss_clip": 1.02154541, + "balance_loss_mlp": 1.03321636, + "epoch": 0.8287990380279573, + "flos": 33544875323520.0, + "grad_norm": 1.4282761232574668, + "language_loss": 0.70307374, + "learning_rate": 2.99565728540772e-07, + "loss": 0.72443986, + "num_input_tokens_seen": 297424565, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 13785, + "time_per_iteration": 2.577817678451538 + }, + { + "auxiliary_loss_clip": 0.01101293, + "auxiliary_loss_mlp": 0.01031549, + "balance_loss_clip": 1.01994348, + "balance_loss_mlp": 1.03573847, + "epoch": 0.8288591612806253, + "flos": 22966346545920.0, + "grad_norm": 1.578728182977374, + "language_loss": 0.68448269, + "learning_rate": 2.993607356270516e-07, + "loss": 0.70581114, + "num_input_tokens_seen": 297445180, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 13786, + "time_per_iteration": 2.4532599449157715 + }, + { + "auxiliary_loss_clip": 0.01102631, + "auxiliary_loss_mlp": 0.01033324, + "balance_loss_clip": 1.0212723, + "balance_loss_mlp": 1.03444433, + "epoch": 0.8289192845332932, + "flos": 18588979710720.0, + "grad_norm": 1.9713140427276798, + "language_loss": 0.7668817, + "learning_rate": 2.991558072017426e-07, + "loss": 0.78824121, + "num_input_tokens_seen": 297463790, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 13787, + "time_per_iteration": 2.4399027824401855 + }, + { + "auxiliary_loss_clip": 0.01098907, + "auxiliary_loss_mlp": 0.01031403, + "balance_loss_clip": 1.02052522, + "balance_loss_mlp": 1.03484738, + "epoch": 0.8289794077859612, + "flos": 15450423085440.0, + "grad_norm": 1.81084531479829, + "language_loss": 0.80682862, + "learning_rate": 2.989509432726163e-07, + "loss": 0.82813168, + "num_input_tokens_seen": 297480100, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 13788, + "time_per_iteration": 2.4278197288513184 + }, + { + "auxiliary_loss_clip": 0.0109796, + "auxiliary_loss_mlp": 0.0102936, + "balance_loss_clip": 1.01862526, + "balance_loss_mlp": 1.03381896, + "epoch": 0.8290395310386292, + "flos": 28877853214080.0, + "grad_norm": 1.9710434276893554, + "language_loss": 0.71272284, + "learning_rate": 2.9874614384744014e-07, + "loss": 0.73399603, + "num_input_tokens_seen": 297499890, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13789, + "time_per_iteration": 2.5227370262145996 + }, + { + "auxiliary_loss_clip": 0.01100536, + "auxiliary_loss_mlp": 0.01028676, + "balance_loss_clip": 1.01702929, + "balance_loss_mlp": 1.03298402, + "epoch": 0.8290996542912972, + "flos": 36576274700160.0, + "grad_norm": 2.3533279169078614, + "language_loss": 0.68549865, + "learning_rate": 2.985414089339813e-07, + "loss": 0.7067908, + "num_input_tokens_seen": 297521440, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 13790, + "time_per_iteration": 2.585700511932373 + }, + { + "auxiliary_loss_clip": 0.01101161, + "auxiliary_loss_mlp": 0.01030646, + "balance_loss_clip": 1.01763999, + "balance_loss_mlp": 1.0343529, + "epoch": 0.8291597775439651, + "flos": 23623009032960.0, + "grad_norm": 4.149168157668411, + "language_loss": 0.77716172, + "learning_rate": 2.9833673854000265e-07, + "loss": 0.7984798, + "num_input_tokens_seen": 297539920, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 13791, + "time_per_iteration": 2.507520914077759 + }, + { + "auxiliary_loss_clip": 0.0109744, + "auxiliary_loss_mlp": 0.0102597, + "balance_loss_clip": 1.01394224, + "balance_loss_mlp": 1.03434205, + "epoch": 0.8292199007966331, + "flos": 21397481239680.0, + "grad_norm": 1.3726823147791687, + "language_loss": 0.69920421, + "learning_rate": 2.981321326732651e-07, + "loss": 0.72043836, + "num_input_tokens_seen": 297560000, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6328125, + "step": 13792, + "time_per_iteration": 2.4654951095581055 + }, + { + "auxiliary_loss_clip": 0.01099831, + "auxiliary_loss_mlp": 0.01031375, + "balance_loss_clip": 1.01934695, + "balance_loss_mlp": 1.03244185, + "epoch": 0.829280024049301, + "flos": 28767607395840.0, + "grad_norm": 1.877850305327316, + "language_loss": 0.65054023, + "learning_rate": 2.9792759134152736e-07, + "loss": 0.67185235, + "num_input_tokens_seen": 297579300, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 13793, + "time_per_iteration": 2.544015884399414 + }, + { + "auxiliary_loss_clip": 0.01102087, + "auxiliary_loss_mlp": 0.01028709, + "balance_loss_clip": 1.01646078, + "balance_loss_mlp": 1.03449655, + "epoch": 0.829340147301969, + "flos": 19938071652480.0, + "grad_norm": 1.673378626627257, + "language_loss": 0.66431141, + "learning_rate": 2.977231145525461e-07, + "loss": 0.68561947, + "num_input_tokens_seen": 297598095, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 13794, + "time_per_iteration": 2.431690216064453 + }, + { + "auxiliary_loss_clip": 0.0109811, + "auxiliary_loss_mlp": 0.01032069, + "balance_loss_clip": 1.01981437, + "balance_loss_mlp": 1.03217447, + "epoch": 0.829400270554637, + "flos": 25228575060480.0, + "grad_norm": 1.7723681327467673, + "language_loss": 0.65998554, + "learning_rate": 2.975187023140757e-07, + "loss": 0.68128735, + "num_input_tokens_seen": 297615955, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 13795, + "time_per_iteration": 2.4870991706848145 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.0102856, + "balance_loss_clip": 1.01775384, + "balance_loss_mlp": 1.03428745, + "epoch": 0.829460393807305, + "flos": 24463570176000.0, + "grad_norm": 1.9001575198437086, + "language_loss": 0.66477525, + "learning_rate": 2.973143546338661e-07, + "loss": 0.68601817, + "num_input_tokens_seen": 297636285, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.61328125, + "step": 13796, + "time_per_iteration": 2.4674506187438965 + }, + { + "auxiliary_loss_clip": 0.01097029, + "auxiliary_loss_mlp": 0.01029082, + "balance_loss_clip": 1.01781702, + "balance_loss_mlp": 1.03385639, + "epoch": 0.829520517059973, + "flos": 15122486891520.0, + "grad_norm": 1.5598295545744347, + "language_loss": 0.71532977, + "learning_rate": 2.971100715196666e-07, + "loss": 0.73659086, + "num_input_tokens_seen": 297653315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 13797, + "time_per_iteration": 2.4169154167175293 + }, + { + "auxiliary_loss_clip": 0.01101059, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.01906002, + "balance_loss_mlp": 1.03453267, + "epoch": 0.8295806403126409, + "flos": 21579979265280.0, + "grad_norm": 1.750517911189691, + "language_loss": 0.72251916, + "learning_rate": 2.969058529792243e-07, + "loss": 0.74383044, + "num_input_tokens_seen": 297673480, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 13798, + "time_per_iteration": 2.4415347576141357 + }, + { + "auxiliary_loss_clip": 0.01094228, + "auxiliary_loss_mlp": 0.01029041, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03278971, + "epoch": 0.8296407635653089, + "flos": 21726566668800.0, + "grad_norm": 1.5603064729869331, + "language_loss": 0.76201189, + "learning_rate": 2.967016990202822e-07, + "loss": 0.78324461, + "num_input_tokens_seen": 297693250, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.61328125, + "step": 13799, + "time_per_iteration": 2.463636636734009 + }, + { + "auxiliary_loss_clip": 0.0109965, + "auxiliary_loss_mlp": 0.01030435, + "balance_loss_clip": 1.01907468, + "balance_loss_mlp": 1.0355351, + "epoch": 0.8297008868179768, + "flos": 11181147252480.0, + "grad_norm": 2.1935741637351174, + "language_loss": 0.67862946, + "learning_rate": 2.9649760965058245e-07, + "loss": 0.69993031, + "num_input_tokens_seen": 297710975, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 13800, + "time_per_iteration": 2.405726909637451 + }, + { + "auxiliary_loss_clip": 0.01105563, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02096081, + "balance_loss_mlp": 1.03694606, + "epoch": 0.8297610100706448, + "flos": 20664041431680.0, + "grad_norm": 1.7039230196700386, + "language_loss": 0.74584657, + "learning_rate": 2.9629358487786515e-07, + "loss": 0.76724076, + "num_input_tokens_seen": 297730860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 13801, + "time_per_iteration": 2.4256935119628906 + }, + { + "auxiliary_loss_clip": 0.01100874, + "auxiliary_loss_mlp": 0.01028372, + "balance_loss_clip": 1.01740479, + "balance_loss_mlp": 1.03498697, + "epoch": 0.8298211333233128, + "flos": 20376325491840.0, + "grad_norm": 1.4930307504184834, + "language_loss": 0.73669171, + "learning_rate": 2.9608962470986476e-07, + "loss": 0.75798416, + "num_input_tokens_seen": 297749765, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 13802, + "time_per_iteration": 2.432267427444458 + }, + { + "auxiliary_loss_clip": 0.01100091, + "auxiliary_loss_mlp": 0.0102982, + "balance_loss_clip": 1.01868546, + "balance_loss_mlp": 1.03392529, + "epoch": 0.8298812565759808, + "flos": 21508696725120.0, + "grad_norm": 1.6288424480381258, + "language_loss": 0.74650079, + "learning_rate": 2.9588572915431644e-07, + "loss": 0.76779985, + "num_input_tokens_seen": 297770380, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13803, + "time_per_iteration": 3.8569204807281494 + }, + { + "auxiliary_loss_clip": 0.01100884, + "auxiliary_loss_mlp": 0.01029893, + "balance_loss_clip": 1.01874709, + "balance_loss_mlp": 1.03565145, + "epoch": 0.8299413798286487, + "flos": 22818681734400.0, + "grad_norm": 1.579848035372401, + "language_loss": 0.79086143, + "learning_rate": 2.9568189821895215e-07, + "loss": 0.81216919, + "num_input_tokens_seen": 297789440, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 13804, + "time_per_iteration": 2.4629805088043213 + }, + { + "auxiliary_loss_clip": 0.01098393, + "auxiliary_loss_mlp": 0.0103112, + "balance_loss_clip": 1.02012861, + "balance_loss_mlp": 1.03375435, + "epoch": 0.8300015030813167, + "flos": 29679199683840.0, + "grad_norm": 1.7198477765468532, + "language_loss": 0.73292375, + "learning_rate": 2.954781319115016e-07, + "loss": 0.75421888, + "num_input_tokens_seen": 297810425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13805, + "time_per_iteration": 5.304149389266968 + }, + { + "auxiliary_loss_clip": 0.01100424, + "auxiliary_loss_mlp": 0.01029387, + "balance_loss_clip": 1.017735, + "balance_loss_mlp": 1.03412557, + "epoch": 0.8300616263339846, + "flos": 19719483436800.0, + "grad_norm": 1.9936443323476183, + "language_loss": 0.7744779, + "learning_rate": 2.952744302396906e-07, + "loss": 0.79577601, + "num_input_tokens_seen": 297827680, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 13806, + "time_per_iteration": 2.467191219329834 + }, + { + "auxiliary_loss_clip": 0.01103109, + "auxiliary_loss_mlp": 0.0103205, + "balance_loss_clip": 1.01940179, + "balance_loss_mlp": 1.03535724, + "epoch": 0.8301217495866526, + "flos": 19901945548800.0, + "grad_norm": 1.69387753242653, + "language_loss": 0.63385892, + "learning_rate": 2.950707932112444e-07, + "loss": 0.65521049, + "num_input_tokens_seen": 297848005, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.67578125, + "step": 13807, + "time_per_iteration": 2.4374656677246094 + }, + { + "auxiliary_loss_clip": 0.0110126, + "auxiliary_loss_mlp": 0.0102519, + "balance_loss_clip": 1.01403785, + "balance_loss_mlp": 1.03610516, + "epoch": 0.8301818728393207, + "flos": 19715784336000.0, + "grad_norm": 1.711706373511074, + "language_loss": 0.73087573, + "learning_rate": 2.948672208338847e-07, + "loss": 0.75214028, + "num_input_tokens_seen": 297866730, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 13808, + "time_per_iteration": 2.4465322494506836 + }, + { + "auxiliary_loss_clip": 0.0110707, + "auxiliary_loss_mlp": 0.01035806, + "balance_loss_clip": 1.02331913, + "balance_loss_mlp": 1.03772509, + "epoch": 0.8302419960919886, + "flos": 28293658416000.0, + "grad_norm": 1.786221226003615, + "language_loss": 0.66342396, + "learning_rate": 2.9466371311533046e-07, + "loss": 0.68485272, + "num_input_tokens_seen": 297886390, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 13809, + "time_per_iteration": 3.953455686569214 + }, + { + "auxiliary_loss_clip": 0.01100579, + "auxiliary_loss_mlp": 0.01023454, + "balance_loss_clip": 1.0124923, + "balance_loss_mlp": 1.03412163, + "epoch": 0.8303021193446566, + "flos": 18223444955520.0, + "grad_norm": 1.9422535896346522, + "language_loss": 0.73977947, + "learning_rate": 2.9446027006329896e-07, + "loss": 0.76101977, + "num_input_tokens_seen": 297905110, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 13810, + "time_per_iteration": 2.4113035202026367 + }, + { + "auxiliary_loss_clip": 0.01098267, + "auxiliary_loss_mlp": 0.0103445, + "balance_loss_clip": 1.02415669, + "balance_loss_mlp": 1.03506088, + "epoch": 0.8303622425973245, + "flos": 23111425578240.0, + "grad_norm": 1.5397227407858767, + "language_loss": 0.81322253, + "learning_rate": 2.94256891685505e-07, + "loss": 0.83454967, + "num_input_tokens_seen": 297925460, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6328125, + "step": 13811, + "time_per_iteration": 2.5003349781036377 + }, + { + "auxiliary_loss_clip": 0.01102763, + "auxiliary_loss_mlp": 0.01035668, + "balance_loss_clip": 1.02403903, + "balance_loss_mlp": 1.03681374, + "epoch": 0.8304223658499925, + "flos": 19572860119680.0, + "grad_norm": 1.83757541907444, + "language_loss": 0.73298693, + "learning_rate": 2.9405357798966156e-07, + "loss": 0.75437129, + "num_input_tokens_seen": 297941760, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 13812, + "time_per_iteration": 2.40651798248291 + }, + { + "auxiliary_loss_clip": 0.01097578, + "auxiliary_loss_mlp": 0.01026036, + "balance_loss_clip": 1.01416874, + "balance_loss_mlp": 1.03434229, + "epoch": 0.8304824891026604, + "flos": 24426115269120.0, + "grad_norm": 1.6568210885633572, + "language_loss": 0.78265715, + "learning_rate": 2.9385032898347664e-07, + "loss": 0.80389333, + "num_input_tokens_seen": 297959745, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6328125, + "step": 13813, + "time_per_iteration": 2.4902334213256836 + }, + { + "auxiliary_loss_clip": 0.01099825, + "auxiliary_loss_mlp": 0.01025617, + "balance_loss_clip": 1.01365399, + "balance_loss_mlp": 1.03272295, + "epoch": 0.8305426123553284, + "flos": 22381792611840.0, + "grad_norm": 2.334398224953377, + "language_loss": 0.71084231, + "learning_rate": 2.93647144674658e-07, + "loss": 0.73209673, + "num_input_tokens_seen": 297977665, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 13814, + "time_per_iteration": 2.4226105213165283 + }, + { + "auxiliary_loss_clip": 0.01106932, + "auxiliary_loss_mlp": 0.01039965, + "balance_loss_clip": 1.02605355, + "balance_loss_mlp": 1.03508019, + "epoch": 0.8306027356079964, + "flos": 14903575453440.0, + "grad_norm": 2.038226017442531, + "language_loss": 0.68133175, + "learning_rate": 2.9344402507091116e-07, + "loss": 0.70280063, + "num_input_tokens_seen": 297993525, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.71875, + "step": 13815, + "time_per_iteration": 2.434314250946045 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01028614, + "balance_loss_clip": 1.01693177, + "balance_loss_mlp": 1.03570247, + "epoch": 0.8306628588606644, + "flos": 19644573623040.0, + "grad_norm": 1.8066947340112232, + "language_loss": 0.75933707, + "learning_rate": 2.9324097017993745e-07, + "loss": 0.78063387, + "num_input_tokens_seen": 298012920, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 13816, + "time_per_iteration": 2.442898750305176 + }, + { + "auxiliary_loss_clip": 0.01098329, + "auxiliary_loss_mlp": 0.01029317, + "balance_loss_clip": 1.01845741, + "balance_loss_mlp": 1.03341258, + "epoch": 0.8307229821133323, + "flos": 24389737770240.0, + "grad_norm": 1.7055249793285534, + "language_loss": 0.81462383, + "learning_rate": 2.930379800094371e-07, + "loss": 0.83590031, + "num_input_tokens_seen": 298033310, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 13817, + "time_per_iteration": 2.481618881225586 + }, + { + "auxiliary_loss_clip": 0.01102205, + "auxiliary_loss_mlp": 0.01034587, + "balance_loss_clip": 1.02211809, + "balance_loss_mlp": 1.03496206, + "epoch": 0.8307831053660003, + "flos": 20996933702400.0, + "grad_norm": 1.6134967348632454, + "language_loss": 0.78043187, + "learning_rate": 2.9283505456710875e-07, + "loss": 0.80179971, + "num_input_tokens_seen": 298053530, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 13818, + "time_per_iteration": 2.456963539123535 + }, + { + "auxiliary_loss_clip": 0.01102673, + "auxiliary_loss_mlp": 0.01033908, + "balance_loss_clip": 1.02229691, + "balance_loss_mlp": 1.03631234, + "epoch": 0.8308432286186682, + "flos": 21397301671680.0, + "grad_norm": 1.7992893057901929, + "language_loss": 0.82007933, + "learning_rate": 2.926321938606453e-07, + "loss": 0.84144515, + "num_input_tokens_seen": 298069305, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13819, + "time_per_iteration": 2.4531350135803223 + }, + { + "auxiliary_loss_clip": 0.01023094, + "auxiliary_loss_mlp": 0.01001492, + "balance_loss_clip": 1.00050247, + "balance_loss_mlp": 1.00293744, + "epoch": 0.8309033518713362, + "flos": 62533656714240.0, + "grad_norm": 0.7540383320597264, + "language_loss": 0.56269968, + "learning_rate": 2.924293978977399e-07, + "loss": 0.58294547, + "num_input_tokens_seen": 298125830, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 13820, + "time_per_iteration": 3.07944917678833 + }, + { + "auxiliary_loss_clip": 0.01096009, + "auxiliary_loss_mlp": 0.01021975, + "balance_loss_clip": 1.01050711, + "balance_loss_mlp": 1.03318071, + "epoch": 0.8309634751240043, + "flos": 16979104051200.0, + "grad_norm": 1.8012504635990183, + "language_loss": 0.68316829, + "learning_rate": 2.922266666860831e-07, + "loss": 0.70434809, + "num_input_tokens_seen": 298142320, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62890625, + "step": 13821, + "time_per_iteration": 2.4352285861968994 + }, + { + "auxiliary_loss_clip": 0.01101835, + "auxiliary_loss_mlp": 0.01029369, + "balance_loss_clip": 1.01752543, + "balance_loss_mlp": 1.03386974, + "epoch": 0.8310235983766722, + "flos": 22674464628480.0, + "grad_norm": 2.554596650493425, + "language_loss": 0.68782902, + "learning_rate": 2.920240002333625e-07, + "loss": 0.70914102, + "num_input_tokens_seen": 298161845, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 13822, + "time_per_iteration": 2.4268665313720703 + }, + { + "auxiliary_loss_clip": 0.01097449, + "auxiliary_loss_mlp": 0.01033673, + "balance_loss_clip": 1.02247906, + "balance_loss_mlp": 1.0342207, + "epoch": 0.8310837216293402, + "flos": 30811463176320.0, + "grad_norm": 1.735748965364619, + "language_loss": 0.62030697, + "learning_rate": 2.918213985472631e-07, + "loss": 0.64161813, + "num_input_tokens_seen": 298184165, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 13823, + "time_per_iteration": 2.5616307258605957 + }, + { + "auxiliary_loss_clip": 0.01022918, + "auxiliary_loss_mlp": 0.01006024, + "balance_loss_clip": 1.00502288, + "balance_loss_mlp": 1.00274229, + "epoch": 0.8311438448820081, + "flos": 71276074997760.0, + "grad_norm": 1.0144184118719066, + "language_loss": 0.61859858, + "learning_rate": 2.916188616354669e-07, + "loss": 0.638888, + "num_input_tokens_seen": 298251720, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20214844, + "step": 13824, + "time_per_iteration": 3.1341047286987305 + }, + { + "auxiliary_loss_clip": 0.01099745, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.01800573, + "balance_loss_mlp": 1.03508544, + "epoch": 0.8312039681346761, + "flos": 20887082933760.0, + "grad_norm": 1.9203244405099236, + "language_loss": 0.7410804, + "learning_rate": 2.914163895056552e-07, + "loss": 0.7623716, + "num_input_tokens_seen": 298271910, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13825, + "time_per_iteration": 2.4680562019348145 + }, + { + "auxiliary_loss_clip": 0.01100678, + "auxiliary_loss_mlp": 0.01031495, + "balance_loss_clip": 1.01961005, + "balance_loss_mlp": 1.03404522, + "epoch": 0.831264091387344, + "flos": 17017528625280.0, + "grad_norm": 2.6831981740546804, + "language_loss": 0.80436289, + "learning_rate": 2.9121398216550486e-07, + "loss": 0.82568467, + "num_input_tokens_seen": 298288105, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 13826, + "time_per_iteration": 2.3986868858337402 + }, + { + "auxiliary_loss_clip": 0.01099812, + "auxiliary_loss_mlp": 0.01031012, + "balance_loss_clip": 1.01911521, + "balance_loss_mlp": 1.03373289, + "epoch": 0.831324214640012, + "flos": 24419578993920.0, + "grad_norm": 1.6899698204069646, + "language_loss": 0.67370605, + "learning_rate": 2.910116396226914e-07, + "loss": 0.6950143, + "num_input_tokens_seen": 298307600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 13827, + "time_per_iteration": 2.556210517883301 + }, + { + "auxiliary_loss_clip": 0.01097618, + "auxiliary_loss_mlp": 0.01026907, + "balance_loss_clip": 1.01631558, + "balance_loss_mlp": 1.03311372, + "epoch": 0.83138433789268, + "flos": 13545576938880.0, + "grad_norm": 1.8718519853122935, + "language_loss": 0.73761111, + "learning_rate": 2.9080936188488834e-07, + "loss": 0.75885636, + "num_input_tokens_seen": 298323055, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.64453125, + "step": 13828, + "time_per_iteration": 2.402517557144165 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02159595, + "balance_loss_mlp": 1.03244913, + "epoch": 0.831444461145348, + "flos": 44492386561920.0, + "grad_norm": 5.897206901344671, + "language_loss": 0.67066121, + "learning_rate": 2.906071489597657e-07, + "loss": 0.69198495, + "num_input_tokens_seen": 298346950, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 13829, + "time_per_iteration": 2.6429443359375 + }, + { + "auxiliary_loss_clip": 0.01102016, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01768827, + "balance_loss_mlp": 1.03440702, + "epoch": 0.8315045843980159, + "flos": 22705024124160.0, + "grad_norm": 3.810275572521135, + "language_loss": 0.82567447, + "learning_rate": 2.9040500085499054e-07, + "loss": 0.84699351, + "num_input_tokens_seen": 298366315, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 13830, + "time_per_iteration": 2.4443130493164062 + }, + { + "auxiliary_loss_clip": 0.01098309, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.0173049, + "balance_loss_mlp": 1.03295636, + "epoch": 0.8315647076506839, + "flos": 16873491087360.0, + "grad_norm": 1.9438123973766057, + "language_loss": 0.74598849, + "learning_rate": 2.9020291757822925e-07, + "loss": 0.76726484, + "num_input_tokens_seen": 298385185, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 13831, + "time_per_iteration": 2.4210164546966553 + }, + { + "auxiliary_loss_clip": 0.01100189, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.01938343, + "balance_loss_mlp": 1.03486538, + "epoch": 0.8316248309033518, + "flos": 13808730954240.0, + "grad_norm": 1.6326296110145166, + "language_loss": 0.71145892, + "learning_rate": 2.9000089913714523e-07, + "loss": 0.73277545, + "num_input_tokens_seen": 298402335, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 13832, + "time_per_iteration": 2.394432306289673 + }, + { + "auxiliary_loss_clip": 0.01097955, + "auxiliary_loss_mlp": 0.0102979, + "balance_loss_clip": 1.01818514, + "balance_loss_mlp": 1.03306818, + "epoch": 0.8316849541560198, + "flos": 23512511819520.0, + "grad_norm": 1.671736140785639, + "language_loss": 0.84483445, + "learning_rate": 2.897989455393979e-07, + "loss": 0.86611187, + "num_input_tokens_seen": 298423370, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 13833, + "time_per_iteration": 2.504239797592163 + }, + { + "auxiliary_loss_clip": 0.0110191, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.02147341, + "balance_loss_mlp": 1.03484035, + "epoch": 0.8317450774086879, + "flos": 23771356202880.0, + "grad_norm": 1.54787905183348, + "language_loss": 0.7613343, + "learning_rate": 2.8959705679264625e-07, + "loss": 0.7826916, + "num_input_tokens_seen": 298444835, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13834, + "time_per_iteration": 2.4704878330230713 + }, + { + "auxiliary_loss_clip": 0.01095699, + "auxiliary_loss_mlp": 0.01028887, + "balance_loss_clip": 1.01751471, + "balance_loss_mlp": 1.0322547, + "epoch": 0.8318052006613558, + "flos": 16215535710720.0, + "grad_norm": 2.046199004722401, + "language_loss": 0.79697442, + "learning_rate": 2.893952329045459e-07, + "loss": 0.81822026, + "num_input_tokens_seen": 298461845, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 13835, + "time_per_iteration": 2.40838623046875 + }, + { + "auxiliary_loss_clip": 0.01106083, + "auxiliary_loss_mlp": 0.0103428, + "balance_loss_clip": 1.02129853, + "balance_loss_mlp": 1.03730321, + "epoch": 0.8318653239140238, + "flos": 19974556892160.0, + "grad_norm": 1.7818971205631189, + "language_loss": 0.80744654, + "learning_rate": 2.8919347388274905e-07, + "loss": 0.82885015, + "num_input_tokens_seen": 298479095, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 13836, + "time_per_iteration": 2.4172093868255615 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01028555, + "balance_loss_clip": 1.01750398, + "balance_loss_mlp": 1.03422546, + "epoch": 0.8319254471666917, + "flos": 17704714694400.0, + "grad_norm": 1.9244544867437152, + "language_loss": 0.77690089, + "learning_rate": 2.8899177973490727e-07, + "loss": 0.79817367, + "num_input_tokens_seen": 298494475, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13837, + "time_per_iteration": 2.4101178646087646 + }, + { + "auxiliary_loss_clip": 0.01103421, + "auxiliary_loss_mlp": 0.01029609, + "balance_loss_clip": 1.01641822, + "balance_loss_mlp": 1.03414893, + "epoch": 0.8319855704193597, + "flos": 19536554448000.0, + "grad_norm": 1.6930309583903163, + "language_loss": 0.8365382, + "learning_rate": 2.887901504686685e-07, + "loss": 0.85786849, + "num_input_tokens_seen": 298513185, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69140625, + "step": 13838, + "time_per_iteration": 2.4097585678100586 + }, + { + "auxiliary_loss_clip": 0.01098918, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.01960075, + "balance_loss_mlp": 1.03409493, + "epoch": 0.8320456936720276, + "flos": 21178067011200.0, + "grad_norm": 2.1542389806886266, + "language_loss": 0.74221098, + "learning_rate": 2.885885860916795e-07, + "loss": 0.76351881, + "num_input_tokens_seen": 298531885, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 13839, + "time_per_iteration": 2.4400813579559326 + }, + { + "auxiliary_loss_clip": 0.01101691, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.01941919, + "balance_loss_mlp": 1.03564334, + "epoch": 0.8321058169246957, + "flos": 33250874503680.0, + "grad_norm": 1.4115307011587832, + "language_loss": 0.67430389, + "learning_rate": 2.8838708661158253e-07, + "loss": 0.69563287, + "num_input_tokens_seen": 298554905, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 13840, + "time_per_iteration": 2.5372142791748047 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.01557088, + "balance_loss_mlp": 1.0327549, + "epoch": 0.8321659401773636, + "flos": 14208129256320.0, + "grad_norm": 1.8532994012873067, + "language_loss": 0.79538697, + "learning_rate": 2.8818565203601843e-07, + "loss": 0.81665289, + "num_input_tokens_seen": 298571185, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13841, + "time_per_iteration": 2.406419277191162 + }, + { + "auxiliary_loss_clip": 0.01098521, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.01594353, + "balance_loss_mlp": 1.03416336, + "epoch": 0.8322260634300316, + "flos": 15158253859200.0, + "grad_norm": 1.9703155224282078, + "language_loss": 0.68665957, + "learning_rate": 2.879842823726262e-07, + "loss": 0.70792133, + "num_input_tokens_seen": 298588505, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 13842, + "time_per_iteration": 2.4088361263275146 + }, + { + "auxiliary_loss_clip": 0.01099099, + "auxiliary_loss_mlp": 0.01025711, + "balance_loss_clip": 1.01326537, + "balance_loss_mlp": 1.03484809, + "epoch": 0.8322861866826995, + "flos": 25300827267840.0, + "grad_norm": 2.252583895579188, + "language_loss": 0.73118508, + "learning_rate": 2.8778297762904124e-07, + "loss": 0.75243318, + "num_input_tokens_seen": 298609295, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.64453125, + "step": 13843, + "time_per_iteration": 2.4760208129882812 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.0103056, + "balance_loss_clip": 1.01875806, + "balance_loss_mlp": 1.03649437, + "epoch": 0.8323463099353675, + "flos": 17019360218880.0, + "grad_norm": 1.8153804396647903, + "language_loss": 0.77374804, + "learning_rate": 2.875817378128975e-07, + "loss": 0.79505873, + "num_input_tokens_seen": 298625765, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 13844, + "time_per_iteration": 2.3891868591308594 + }, + { + "auxiliary_loss_clip": 0.01022573, + "auxiliary_loss_mlp": 0.00999494, + "balance_loss_clip": 0.99842119, + "balance_loss_mlp": 1.00265634, + "epoch": 0.8324064331880354, + "flos": 55607889709440.0, + "grad_norm": 0.8581285826544858, + "language_loss": 0.55275869, + "learning_rate": 2.8738056293182624e-07, + "loss": 0.57297933, + "num_input_tokens_seen": 298683005, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.19921875, + "step": 13845, + "time_per_iteration": 4.351477861404419 + }, + { + "auxiliary_loss_clip": 0.011024, + "auxiliary_loss_mlp": 0.01043293, + "balance_loss_clip": 1.03099656, + "balance_loss_mlp": 1.03529, + "epoch": 0.8324665564407034, + "flos": 26138623063680.0, + "grad_norm": 2.3644606259383782, + "language_loss": 0.75436401, + "learning_rate": 2.871794529934555e-07, + "loss": 0.77582097, + "num_input_tokens_seen": 298703060, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13846, + "time_per_iteration": 2.4772729873657227 + }, + { + "auxiliary_loss_clip": 0.0110107, + "auxiliary_loss_mlp": 0.01026629, + "balance_loss_clip": 1.01367104, + "balance_loss_mlp": 1.03235412, + "epoch": 0.8325266796933715, + "flos": 22049187649920.0, + "grad_norm": 1.56529249468272, + "language_loss": 0.78832293, + "learning_rate": 2.8697840800541115e-07, + "loss": 0.80959988, + "num_input_tokens_seen": 298721765, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 13847, + "time_per_iteration": 3.8999733924865723 + }, + { + "auxiliary_loss_clip": 0.01099196, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.01703548, + "balance_loss_mlp": 1.03460264, + "epoch": 0.8325868029460394, + "flos": 22816634659200.0, + "grad_norm": 1.5773716692897488, + "language_loss": 0.74506044, + "learning_rate": 2.867774279753175e-07, + "loss": 0.76633298, + "num_input_tokens_seen": 298740825, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 13848, + "time_per_iteration": 2.4543795585632324 + }, + { + "auxiliary_loss_clip": 0.01099371, + "auxiliary_loss_mlp": 0.01028794, + "balance_loss_clip": 1.01737964, + "balance_loss_mlp": 1.03426051, + "epoch": 0.8326469261987074, + "flos": 14757454926720.0, + "grad_norm": 2.274626323524008, + "language_loss": 0.63361812, + "learning_rate": 2.8657651291079554e-07, + "loss": 0.65489972, + "num_input_tokens_seen": 298758515, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 13849, + "time_per_iteration": 2.4305379390716553 + }, + { + "auxiliary_loss_clip": 0.01101578, + "auxiliary_loss_mlp": 0.01030731, + "balance_loss_clip": 1.01899529, + "balance_loss_mlp": 1.03342891, + "epoch": 0.8327070494513753, + "flos": 22926126291840.0, + "grad_norm": 2.028058790500836, + "language_loss": 0.79350019, + "learning_rate": 2.863756628194638e-07, + "loss": 0.81482327, + "num_input_tokens_seen": 298776375, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13850, + "time_per_iteration": 3.875143527984619 + }, + { + "auxiliary_loss_clip": 0.01095589, + "auxiliary_loss_mlp": 0.01031181, + "balance_loss_clip": 1.02058375, + "balance_loss_mlp": 1.03317165, + "epoch": 0.8327671727040433, + "flos": 20665334321280.0, + "grad_norm": 1.619138170366384, + "language_loss": 0.7828756, + "learning_rate": 2.8617487770893877e-07, + "loss": 0.80414331, + "num_input_tokens_seen": 298795135, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 13851, + "time_per_iteration": 2.435199499130249 + }, + { + "auxiliary_loss_clip": 0.01022819, + "auxiliary_loss_mlp": 0.00999775, + "balance_loss_clip": 0.99877405, + "balance_loss_mlp": 1.00280309, + "epoch": 0.8328272959567112, + "flos": 56060760384000.0, + "grad_norm": 0.7623097192798404, + "language_loss": 0.55791199, + "learning_rate": 2.859741575868344e-07, + "loss": 0.57813787, + "num_input_tokens_seen": 298855475, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 13852, + "time_per_iteration": 3.025131940841675 + }, + { + "auxiliary_loss_clip": 0.01098525, + "auxiliary_loss_mlp": 0.01027782, + "balance_loss_clip": 1.01639128, + "balance_loss_mlp": 1.034675, + "epoch": 0.8328874192093793, + "flos": 32303084284800.0, + "grad_norm": 2.229501971998781, + "language_loss": 0.67093384, + "learning_rate": 2.8577350246076125e-07, + "loss": 0.69219691, + "num_input_tokens_seen": 298875875, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 13853, + "time_per_iteration": 2.560558319091797 + }, + { + "auxiliary_loss_clip": 0.01101824, + "auxiliary_loss_mlp": 0.01030236, + "balance_loss_clip": 1.01892853, + "balance_loss_mlp": 1.03612375, + "epoch": 0.8329475424620472, + "flos": 23512691387520.0, + "grad_norm": 1.5146603164888313, + "language_loss": 0.78381944, + "learning_rate": 2.855729123383286e-07, + "loss": 0.80514002, + "num_input_tokens_seen": 298895950, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 13854, + "time_per_iteration": 2.4560956954956055 + }, + { + "auxiliary_loss_clip": 0.01022713, + "auxiliary_loss_mlp": 0.01004861, + "balance_loss_clip": 1.0038538, + "balance_loss_mlp": 1.00278306, + "epoch": 0.8330076657147152, + "flos": 67840680378240.0, + "grad_norm": 0.7920815382427507, + "language_loss": 0.58700705, + "learning_rate": 2.8537238722714295e-07, + "loss": 0.60728288, + "num_input_tokens_seen": 298955770, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 13855, + "time_per_iteration": 2.948824644088745 + }, + { + "auxiliary_loss_clip": 0.01099595, + "auxiliary_loss_mlp": 0.0102813, + "balance_loss_clip": 1.01669788, + "balance_loss_mlp": 1.03500164, + "epoch": 0.8330677889673831, + "flos": 22892801448960.0, + "grad_norm": 1.6173618311844826, + "language_loss": 0.71731192, + "learning_rate": 2.8517192713480853e-07, + "loss": 0.73858917, + "num_input_tokens_seen": 298976545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 13856, + "time_per_iteration": 2.4495739936828613 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01028183, + "balance_loss_clip": 1.01712024, + "balance_loss_mlp": 1.03420722, + "epoch": 0.8331279122200511, + "flos": 27345042184320.0, + "grad_norm": 1.5263372259770802, + "language_loss": 0.7549566, + "learning_rate": 2.8497153206892677e-07, + "loss": 0.77622676, + "num_input_tokens_seen": 298996750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13857, + "time_per_iteration": 2.48952317237854 + }, + { + "auxiliary_loss_clip": 0.01096823, + "auxiliary_loss_mlp": 0.01024887, + "balance_loss_clip": 1.01476622, + "balance_loss_mlp": 1.03515077, + "epoch": 0.833188035472719, + "flos": 19938179393280.0, + "grad_norm": 1.5444412658086444, + "language_loss": 0.7369523, + "learning_rate": 2.847712020370958e-07, + "loss": 0.75816941, + "num_input_tokens_seen": 299014895, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6171875, + "step": 13858, + "time_per_iteration": 2.415557622909546 + }, + { + "auxiliary_loss_clip": 0.01102993, + "auxiliary_loss_mlp": 0.01034018, + "balance_loss_clip": 1.02144766, + "balance_loss_mlp": 1.03377628, + "epoch": 0.833248158725387, + "flos": 15232624968960.0, + "grad_norm": 2.882415759044888, + "language_loss": 0.73106527, + "learning_rate": 2.8457093704691316e-07, + "loss": 0.75243539, + "num_input_tokens_seen": 299032855, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 13859, + "time_per_iteration": 2.4257359504699707 + }, + { + "auxiliary_loss_clip": 0.01095625, + "auxiliary_loss_mlp": 0.01026152, + "balance_loss_clip": 1.01519704, + "balance_loss_mlp": 1.03375316, + "epoch": 0.8333082819780551, + "flos": 24535535074560.0, + "grad_norm": 1.5687647819077657, + "language_loss": 0.79128706, + "learning_rate": 2.8437073710597205e-07, + "loss": 0.81250489, + "num_input_tokens_seen": 299052055, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6171875, + "step": 13860, + "time_per_iteration": 2.4974732398986816 + }, + { + "auxiliary_loss_clip": 0.01098794, + "auxiliary_loss_mlp": 0.01031344, + "balance_loss_clip": 1.0195545, + "balance_loss_mlp": 1.03446364, + "epoch": 0.833368405230723, + "flos": 31467407391360.0, + "grad_norm": 1.3446987096188727, + "language_loss": 0.82059264, + "learning_rate": 2.841706022218644e-07, + "loss": 0.84189403, + "num_input_tokens_seen": 299075285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 13861, + "time_per_iteration": 2.5322492122650146 + }, + { + "auxiliary_loss_clip": 0.01103178, + "auxiliary_loss_mlp": 0.01032821, + "balance_loss_clip": 1.02115631, + "balance_loss_mlp": 1.03720117, + "epoch": 0.833428528483391, + "flos": 14902713527040.0, + "grad_norm": 1.926026515251472, + "language_loss": 0.78863573, + "learning_rate": 2.839705324021806e-07, + "loss": 0.80999571, + "num_input_tokens_seen": 299092520, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 13862, + "time_per_iteration": 2.4110610485076904 + }, + { + "auxiliary_loss_clip": 0.01099524, + "auxiliary_loss_mlp": 0.01035001, + "balance_loss_clip": 1.02301443, + "balance_loss_mlp": 1.03292191, + "epoch": 0.8334886517360589, + "flos": 22199833290240.0, + "grad_norm": 1.865354968291114, + "language_loss": 0.75375336, + "learning_rate": 2.83770527654505e-07, + "loss": 0.77509862, + "num_input_tokens_seen": 299109450, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 13863, + "time_per_iteration": 2.45611572265625 + }, + { + "auxiliary_loss_clip": 0.01098316, + "auxiliary_loss_mlp": 0.01028979, + "balance_loss_clip": 1.01801133, + "balance_loss_mlp": 1.03540921, + "epoch": 0.8335487749887269, + "flos": 30372562892160.0, + "grad_norm": 2.920899657352717, + "language_loss": 0.74782169, + "learning_rate": 2.835705879864232e-07, + "loss": 0.76909465, + "num_input_tokens_seen": 299129540, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 13864, + "time_per_iteration": 2.531675100326538 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.01032767, + "balance_loss_clip": 1.02045846, + "balance_loss_mlp": 1.03386116, + "epoch": 0.8336088982413948, + "flos": 24681152810880.0, + "grad_norm": 1.9906253642830378, + "language_loss": 0.69348955, + "learning_rate": 2.833707134055168e-07, + "loss": 0.71481282, + "num_input_tokens_seen": 299148670, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 13865, + "time_per_iteration": 2.471926689147949 + }, + { + "auxiliary_loss_clip": 0.01100902, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01927531, + "balance_loss_mlp": 1.03523529, + "epoch": 0.8336690214940629, + "flos": 38177207873280.0, + "grad_norm": 1.8969671678573263, + "language_loss": 0.7543878, + "learning_rate": 2.831709039193653e-07, + "loss": 0.77570498, + "num_input_tokens_seen": 299169330, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 13866, + "time_per_iteration": 2.574395179748535 + }, + { + "auxiliary_loss_clip": 0.01022788, + "auxiliary_loss_mlp": 0.01000732, + "balance_loss_clip": 0.99971908, + "balance_loss_mlp": 1.00291204, + "epoch": 0.8337291447467308, + "flos": 55565119589760.0, + "grad_norm": 0.8706720724584954, + "language_loss": 0.63136578, + "learning_rate": 2.8297115953554465e-07, + "loss": 0.65160096, + "num_input_tokens_seen": 299220980, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 13867, + "time_per_iteration": 2.934981107711792 + }, + { + "auxiliary_loss_clip": 0.01096579, + "auxiliary_loss_mlp": 0.01032744, + "balance_loss_clip": 1.02190161, + "balance_loss_mlp": 1.03341174, + "epoch": 0.8337892679993988, + "flos": 24133550993280.0, + "grad_norm": 1.5625976784768958, + "language_loss": 0.71867061, + "learning_rate": 2.827714802616301e-07, + "loss": 0.73996377, + "num_input_tokens_seen": 299240130, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13868, + "time_per_iteration": 2.4451518058776855 + }, + { + "auxiliary_loss_clip": 0.01101584, + "auxiliary_loss_mlp": 0.01030698, + "balance_loss_clip": 1.01893783, + "balance_loss_mlp": 1.03663313, + "epoch": 0.8338493912520667, + "flos": 28183915388160.0, + "grad_norm": 1.355395480855395, + "language_loss": 0.80121469, + "learning_rate": 2.8257186610519325e-07, + "loss": 0.82253754, + "num_input_tokens_seen": 299260705, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 13869, + "time_per_iteration": 2.543779134750366 + }, + { + "auxiliary_loss_clip": 0.01100171, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.01947522, + "balance_loss_mlp": 1.03504825, + "epoch": 0.8339095145047347, + "flos": 22158356060160.0, + "grad_norm": 2.3494726430887423, + "language_loss": 0.82560599, + "learning_rate": 2.823723170738028e-07, + "loss": 0.84692031, + "num_input_tokens_seen": 299278925, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 13870, + "time_per_iteration": 2.461456775665283 + }, + { + "auxiliary_loss_clip": 0.01100457, + "auxiliary_loss_mlp": 0.01026188, + "balance_loss_clip": 1.01389122, + "balance_loss_mlp": 1.03311634, + "epoch": 0.8339696377574026, + "flos": 17307112072320.0, + "grad_norm": 2.2704467550194503, + "language_loss": 0.70611966, + "learning_rate": 2.821728331750264e-07, + "loss": 0.72738612, + "num_input_tokens_seen": 299291580, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13871, + "time_per_iteration": 2.422774076461792 + }, + { + "auxiliary_loss_clip": 0.01099073, + "auxiliary_loss_mlp": 0.01034652, + "balance_loss_clip": 1.02315414, + "balance_loss_mlp": 1.03536725, + "epoch": 0.8340297610100706, + "flos": 20668351063680.0, + "grad_norm": 2.1800938394857257, + "language_loss": 0.68849045, + "learning_rate": 2.8197341441642853e-07, + "loss": 0.70982766, + "num_input_tokens_seen": 299310385, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 13872, + "time_per_iteration": 2.4503636360168457 + }, + { + "auxiliary_loss_clip": 0.01099674, + "auxiliary_loss_mlp": 0.01024709, + "balance_loss_clip": 1.01330113, + "balance_loss_mlp": 1.03414083, + "epoch": 0.8340898842627387, + "flos": 20515442866560.0, + "grad_norm": 2.0378209067910906, + "language_loss": 0.73376065, + "learning_rate": 2.817740608055712e-07, + "loss": 0.75500453, + "num_input_tokens_seen": 299327660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 13873, + "time_per_iteration": 2.446756601333618 + }, + { + "auxiliary_loss_clip": 0.01101733, + "auxiliary_loss_mlp": 0.01032433, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.03445745, + "epoch": 0.8341500075154066, + "flos": 21425850005760.0, + "grad_norm": 2.1330772201747354, + "language_loss": 0.75205374, + "learning_rate": 2.81574772350013e-07, + "loss": 0.77339536, + "num_input_tokens_seen": 299343685, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.671875, + "step": 13874, + "time_per_iteration": 2.462092638015747 + }, + { + "auxiliary_loss_clip": 0.01097984, + "auxiliary_loss_mlp": 0.01024849, + "balance_loss_clip": 1.01381576, + "balance_loss_mlp": 1.03387237, + "epoch": 0.8342101307680746, + "flos": 22090988102400.0, + "grad_norm": 1.868204921667949, + "language_loss": 0.65978831, + "learning_rate": 2.813755490573118e-07, + "loss": 0.68101668, + "num_input_tokens_seen": 299363305, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13875, + "time_per_iteration": 2.5084481239318848 + }, + { + "auxiliary_loss_clip": 0.01100848, + "auxiliary_loss_mlp": 0.01035626, + "balance_loss_clip": 1.02417028, + "balance_loss_mlp": 1.0363009, + "epoch": 0.8342702540207425, + "flos": 21871466133120.0, + "grad_norm": 1.702367531378977, + "language_loss": 0.79506415, + "learning_rate": 2.8117639093502243e-07, + "loss": 0.8164289, + "num_input_tokens_seen": 299382630, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 13876, + "time_per_iteration": 2.468416213989258 + }, + { + "auxiliary_loss_clip": 0.01098276, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.01830101, + "balance_loss_mlp": 1.03345704, + "epoch": 0.8343303772734105, + "flos": 22528487756160.0, + "grad_norm": 2.017532835470735, + "language_loss": 0.87241477, + "learning_rate": 2.8097729799069615e-07, + "loss": 0.89370072, + "num_input_tokens_seen": 299402385, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 13877, + "time_per_iteration": 2.4826865196228027 + }, + { + "auxiliary_loss_clip": 0.01100446, + "auxiliary_loss_mlp": 0.01026775, + "balance_loss_clip": 1.01605773, + "balance_loss_mlp": 1.03529167, + "epoch": 0.8343905005260784, + "flos": 14939773384320.0, + "grad_norm": 1.8556989537670767, + "language_loss": 0.6919421, + "learning_rate": 2.807782702318828e-07, + "loss": 0.71321428, + "num_input_tokens_seen": 299419820, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 13878, + "time_per_iteration": 2.4149510860443115 + }, + { + "auxiliary_loss_clip": 0.01097301, + "auxiliary_loss_mlp": 0.01028652, + "balance_loss_clip": 1.01760721, + "balance_loss_mlp": 1.03321266, + "epoch": 0.8344506237787465, + "flos": 15012456554880.0, + "grad_norm": 2.151087200806259, + "language_loss": 0.79375225, + "learning_rate": 2.805793076661309e-07, + "loss": 0.81501174, + "num_input_tokens_seen": 299436265, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 13879, + "time_per_iteration": 2.4393885135650635 + }, + { + "auxiliary_loss_clip": 0.01098676, + "auxiliary_loss_mlp": 0.01030826, + "balance_loss_clip": 1.02046657, + "balance_loss_mlp": 1.03434122, + "epoch": 0.8345107470314144, + "flos": 17560389847680.0, + "grad_norm": 1.9366754289118486, + "language_loss": 0.83347481, + "learning_rate": 2.803804103009828e-07, + "loss": 0.85476983, + "num_input_tokens_seen": 299451660, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 13880, + "time_per_iteration": 2.4007508754730225 + }, + { + "auxiliary_loss_clip": 0.01102102, + "auxiliary_loss_mlp": 0.01028434, + "balance_loss_clip": 1.01709139, + "balance_loss_mlp": 1.03468037, + "epoch": 0.8345708702840824, + "flos": 25187277398400.0, + "grad_norm": 1.5189652772607405, + "language_loss": 0.78158617, + "learning_rate": 2.80181578143982e-07, + "loss": 0.80289149, + "num_input_tokens_seen": 299472070, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 13881, + "time_per_iteration": 2.486856698989868 + }, + { + "auxiliary_loss_clip": 0.01094738, + "auxiliary_loss_mlp": 0.01023668, + "balance_loss_clip": 1.01344538, + "balance_loss_mlp": 1.03388488, + "epoch": 0.8346309935367503, + "flos": 15083559527040.0, + "grad_norm": 2.7414546532232285, + "language_loss": 0.78763664, + "learning_rate": 2.7998281120266807e-07, + "loss": 0.80882066, + "num_input_tokens_seen": 299486725, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.609375, + "step": 13882, + "time_per_iteration": 2.383542060852051 + }, + { + "auxiliary_loss_clip": 0.01102782, + "auxiliary_loss_mlp": 0.01038346, + "balance_loss_clip": 1.02697921, + "balance_loss_mlp": 1.03675485, + "epoch": 0.8346911167894183, + "flos": 22930615491840.0, + "grad_norm": 1.7318453310688504, + "language_loss": 0.80458236, + "learning_rate": 2.79784109484579e-07, + "loss": 0.82599366, + "num_input_tokens_seen": 299505435, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 13883, + "time_per_iteration": 2.4578230381011963 + }, + { + "auxiliary_loss_clip": 0.01100881, + "auxiliary_loss_mlp": 0.01030733, + "balance_loss_clip": 1.01879406, + "balance_loss_mlp": 1.03334713, + "epoch": 0.8347512400420862, + "flos": 20193037367040.0, + "grad_norm": 2.0429837151334795, + "language_loss": 0.74506301, + "learning_rate": 2.795854729972482e-07, + "loss": 0.76637912, + "num_input_tokens_seen": 299523555, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 13884, + "time_per_iteration": 2.4351799488067627 + }, + { + "auxiliary_loss_clip": 0.01107845, + "auxiliary_loss_mlp": 0.01034733, + "balance_loss_clip": 1.02128029, + "balance_loss_mlp": 1.03687263, + "epoch": 0.8348113632947542, + "flos": 25954832148480.0, + "grad_norm": 1.6804118695495678, + "language_loss": 0.70060503, + "learning_rate": 2.7938690174820913e-07, + "loss": 0.72203082, + "num_input_tokens_seen": 299541660, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.7109375, + "step": 13885, + "time_per_iteration": 2.4579498767852783 + }, + { + "auxiliary_loss_clip": 0.01101392, + "auxiliary_loss_mlp": 0.01030833, + "balance_loss_clip": 1.01899576, + "balance_loss_mlp": 1.03498375, + "epoch": 0.8348714865474223, + "flos": 34204554552960.0, + "grad_norm": 1.6036472675967848, + "language_loss": 0.69851661, + "learning_rate": 2.791883957449912e-07, + "loss": 0.7198388, + "num_input_tokens_seen": 299562465, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 13886, + "time_per_iteration": 2.5490212440490723 + }, + { + "auxiliary_loss_clip": 0.01099122, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.01703906, + "balance_loss_mlp": 1.03454471, + "epoch": 0.8349316098000902, + "flos": 24390132819840.0, + "grad_norm": 1.8702911188252411, + "language_loss": 0.79043454, + "learning_rate": 2.7898995499512134e-07, + "loss": 0.81171501, + "num_input_tokens_seen": 299582700, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 13887, + "time_per_iteration": 3.8734936714172363 + }, + { + "auxiliary_loss_clip": 0.01105837, + "auxiliary_loss_mlp": 0.0103061, + "balance_loss_clip": 1.01753235, + "balance_loss_mlp": 1.03689051, + "epoch": 0.8349917330527582, + "flos": 23032744836480.0, + "grad_norm": 2.4281394961126277, + "language_loss": 0.64525139, + "learning_rate": 2.7879157950612467e-07, + "loss": 0.66661584, + "num_input_tokens_seen": 299600310, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69140625, + "step": 13888, + "time_per_iteration": 5.29352068901062 + }, + { + "auxiliary_loss_clip": 0.01102274, + "auxiliary_loss_mlp": 0.01026091, + "balance_loss_clip": 1.01442051, + "balance_loss_mlp": 1.03387511, + "epoch": 0.8350518563054261, + "flos": 13625873792640.0, + "grad_norm": 2.253647584717518, + "language_loss": 0.6737141, + "learning_rate": 2.785932692855244e-07, + "loss": 0.69499779, + "num_input_tokens_seen": 299617025, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 13889, + "time_per_iteration": 2.4108006954193115 + }, + { + "auxiliary_loss_clip": 0.01096996, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01737046, + "balance_loss_mlp": 1.03261387, + "epoch": 0.8351119795580941, + "flos": 21579799697280.0, + "grad_norm": 2.354736247882719, + "language_loss": 0.68670756, + "learning_rate": 2.783950243408399e-07, + "loss": 0.70796412, + "num_input_tokens_seen": 299633050, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13890, + "time_per_iteration": 2.4558017253875732 + }, + { + "auxiliary_loss_clip": 0.01100731, + "auxiliary_loss_mlp": 0.01031277, + "balance_loss_clip": 1.01932621, + "balance_loss_mlp": 1.03489208, + "epoch": 0.835172102810762, + "flos": 20038297576320.0, + "grad_norm": 2.3411484544759187, + "language_loss": 0.58889383, + "learning_rate": 2.7819684467958817e-07, + "loss": 0.61021388, + "num_input_tokens_seen": 299646445, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 13891, + "time_per_iteration": 2.4097304344177246 + }, + { + "auxiliary_loss_clip": 0.01100517, + "auxiliary_loss_mlp": 0.01031516, + "balance_loss_clip": 1.02039945, + "balance_loss_mlp": 1.03453255, + "epoch": 0.8352322260634301, + "flos": 25111577485440.0, + "grad_norm": 1.7015467014644545, + "language_loss": 0.71564895, + "learning_rate": 2.779987303092846e-07, + "loss": 0.73696935, + "num_input_tokens_seen": 299662665, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13892, + "time_per_iteration": 3.8961503505706787 + }, + { + "auxiliary_loss_clip": 0.01096459, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01702511, + "balance_loss_mlp": 1.03270864, + "epoch": 0.835292349316098, + "flos": 24863758577280.0, + "grad_norm": 1.5839366576450844, + "language_loss": 0.66044503, + "learning_rate": 2.7780068123744207e-07, + "loss": 0.68169999, + "num_input_tokens_seen": 299683585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 13893, + "time_per_iteration": 2.5053062438964844 + }, + { + "auxiliary_loss_clip": 0.01097033, + "auxiliary_loss_mlp": 0.01025409, + "balance_loss_clip": 1.01406002, + "balance_loss_mlp": 1.03166842, + "epoch": 0.835352472568766, + "flos": 19865568049920.0, + "grad_norm": 1.9915669403341283, + "language_loss": 0.78155309, + "learning_rate": 2.7760269747156996e-07, + "loss": 0.80277747, + "num_input_tokens_seen": 299702680, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 13894, + "time_per_iteration": 2.472388505935669 + }, + { + "auxiliary_loss_clip": 0.01096943, + "auxiliary_loss_mlp": 0.01025896, + "balance_loss_clip": 1.01453519, + "balance_loss_mlp": 1.03513694, + "epoch": 0.8354125958214339, + "flos": 22054754257920.0, + "grad_norm": 1.6322720137686266, + "language_loss": 0.72857749, + "learning_rate": 2.7740477901917625e-07, + "loss": 0.74980593, + "num_input_tokens_seen": 299721050, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6171875, + "step": 13895, + "time_per_iteration": 2.440232515335083 + }, + { + "auxiliary_loss_clip": 0.011002, + "auxiliary_loss_mlp": 0.01040296, + "balance_loss_clip": 1.02728426, + "balance_loss_mlp": 1.03335416, + "epoch": 0.8354727190741019, + "flos": 21397804462080.0, + "grad_norm": 2.155535332480292, + "language_loss": 0.71964091, + "learning_rate": 2.772069258877667e-07, + "loss": 0.74104589, + "num_input_tokens_seen": 299738255, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.66796875, + "step": 13896, + "time_per_iteration": 2.4436943531036377 + }, + { + "auxiliary_loss_clip": 0.01097879, + "auxiliary_loss_mlp": 0.01026397, + "balance_loss_clip": 1.01523876, + "balance_loss_mlp": 1.03357804, + "epoch": 0.8355328423267698, + "flos": 50840997834240.0, + "grad_norm": 2.3454754721795763, + "language_loss": 0.58714581, + "learning_rate": 2.770091380848423e-07, + "loss": 0.6083886, + "num_input_tokens_seen": 299761315, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13897, + "time_per_iteration": 2.700792074203491 + }, + { + "auxiliary_loss_clip": 0.01022191, + "auxiliary_loss_mlp": 0.01000308, + "balance_loss_clip": 0.9993543, + "balance_loss_mlp": 1.00201261, + "epoch": 0.8355929655794379, + "flos": 65551052764800.0, + "grad_norm": 0.6979475154433069, + "language_loss": 0.57681328, + "learning_rate": 2.7681141561790423e-07, + "loss": 0.59703827, + "num_input_tokens_seen": 299828735, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 13898, + "time_per_iteration": 3.0732853412628174 + }, + { + "auxiliary_loss_clip": 0.01101202, + "auxiliary_loss_mlp": 0.01031023, + "balance_loss_clip": 1.01861334, + "balance_loss_mlp": 1.03426445, + "epoch": 0.8356530888321058, + "flos": 19170516902400.0, + "grad_norm": 1.7459873042181069, + "language_loss": 0.79868174, + "learning_rate": 2.7661375849444967e-07, + "loss": 0.82000399, + "num_input_tokens_seen": 299848395, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 13899, + "time_per_iteration": 2.432739734649658 + }, + { + "auxiliary_loss_clip": 0.01100718, + "auxiliary_loss_mlp": 0.01029966, + "balance_loss_clip": 1.01889777, + "balance_loss_mlp": 1.03398609, + "epoch": 0.8357132120847738, + "flos": 44126672238720.0, + "grad_norm": 1.8371741608705614, + "language_loss": 0.68867636, + "learning_rate": 2.764161667219749e-07, + "loss": 0.70998323, + "num_input_tokens_seen": 299871665, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 13900, + "time_per_iteration": 2.623135805130005 + }, + { + "auxiliary_loss_clip": 0.01099818, + "auxiliary_loss_mlp": 0.01032209, + "balance_loss_clip": 1.02103853, + "balance_loss_mlp": 1.03531981, + "epoch": 0.8357733353374418, + "flos": 24389701856640.0, + "grad_norm": 1.5620121464910832, + "language_loss": 0.71323341, + "learning_rate": 2.762186403079716e-07, + "loss": 0.73455364, + "num_input_tokens_seen": 299891960, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.64453125, + "step": 13901, + "time_per_iteration": 2.4815425872802734 + }, + { + "auxiliary_loss_clip": 0.01104099, + "auxiliary_loss_mlp": 0.01034926, + "balance_loss_clip": 1.02242041, + "balance_loss_mlp": 1.03513288, + "epoch": 0.8358334585901097, + "flos": 20916313626240.0, + "grad_norm": 2.012713035482953, + "language_loss": 0.80224025, + "learning_rate": 2.7602117925992963e-07, + "loss": 0.82363057, + "num_input_tokens_seen": 299905070, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 13902, + "time_per_iteration": 2.397468328475952 + }, + { + "auxiliary_loss_clip": 0.01097387, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.01762605, + "balance_loss_mlp": 1.03423131, + "epoch": 0.8358935818427777, + "flos": 19244169740160.0, + "grad_norm": 1.5609736285597, + "language_loss": 0.62570262, + "learning_rate": 2.758237835853379e-07, + "loss": 0.64696753, + "num_input_tokens_seen": 299925130, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 13903, + "time_per_iteration": 2.4553894996643066 + }, + { + "auxiliary_loss_clip": 0.0110081, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.02035093, + "balance_loss_mlp": 1.03525472, + "epoch": 0.8359537050954456, + "flos": 24134053783680.0, + "grad_norm": 1.9013104570600536, + "language_loss": 0.74193108, + "learning_rate": 2.7562645329168054e-07, + "loss": 0.76325393, + "num_input_tokens_seen": 299943845, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13904, + "time_per_iteration": 2.442950487136841 + }, + { + "auxiliary_loss_clip": 0.01095719, + "auxiliary_loss_mlp": 0.01031465, + "balance_loss_clip": 1.01960409, + "balance_loss_mlp": 1.03244185, + "epoch": 0.8360138283481137, + "flos": 16180415187840.0, + "grad_norm": 1.7117779805733213, + "language_loss": 0.72669482, + "learning_rate": 2.7542918838644104e-07, + "loss": 0.74796671, + "num_input_tokens_seen": 299961620, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6328125, + "step": 13905, + "time_per_iteration": 2.4159255027770996 + }, + { + "auxiliary_loss_clip": 0.01100321, + "auxiliary_loss_mlp": 0.01036401, + "balance_loss_clip": 1.02577984, + "balance_loss_mlp": 1.0364629, + "epoch": 0.8360739516007816, + "flos": 22198899536640.0, + "grad_norm": 1.860066119718017, + "language_loss": 0.66428232, + "learning_rate": 2.752319888771e-07, + "loss": 0.68564951, + "num_input_tokens_seen": 299982170, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 13906, + "time_per_iteration": 2.5206921100616455 + }, + { + "auxiliary_loss_clip": 0.01099727, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01541066, + "balance_loss_mlp": 1.03409457, + "epoch": 0.8361340748534496, + "flos": 20923137210240.0, + "grad_norm": 1.450963295791905, + "language_loss": 0.74274147, + "learning_rate": 2.7503485477113475e-07, + "loss": 0.76400983, + "num_input_tokens_seen": 300001330, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 13907, + "time_per_iteration": 2.4509541988372803 + }, + { + "auxiliary_loss_clip": 0.0110021, + "auxiliary_loss_mlp": 0.01034987, + "balance_loss_clip": 1.023036, + "balance_loss_mlp": 1.03234959, + "epoch": 0.8361941981061175, + "flos": 26173599932160.0, + "grad_norm": 1.7096278514940075, + "language_loss": 0.75336194, + "learning_rate": 2.7483778607602005e-07, + "loss": 0.77471387, + "num_input_tokens_seen": 300020645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13908, + "time_per_iteration": 2.473710060119629 + }, + { + "auxiliary_loss_clip": 0.01101414, + "auxiliary_loss_mlp": 0.01031459, + "balance_loss_clip": 1.01834011, + "balance_loss_mlp": 1.03433633, + "epoch": 0.8362543213587855, + "flos": 24419363512320.0, + "grad_norm": 2.0866374581819676, + "language_loss": 0.70907331, + "learning_rate": 2.7464078279922964e-07, + "loss": 0.73040199, + "num_input_tokens_seen": 300039945, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 13909, + "time_per_iteration": 2.4959874153137207 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.0103805, + "balance_loss_clip": 1.02625394, + "balance_loss_mlp": 1.0341723, + "epoch": 0.8363144446114534, + "flos": 17202396948480.0, + "grad_norm": 1.9337905516009115, + "language_loss": 0.73345798, + "learning_rate": 2.744438449482338e-07, + "loss": 0.75486064, + "num_input_tokens_seen": 300058260, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 13910, + "time_per_iteration": 2.4417479038238525 + }, + { + "auxiliary_loss_clip": 0.01101132, + "auxiliary_loss_mlp": 0.01031168, + "balance_loss_clip": 1.02002239, + "balance_loss_mlp": 1.03492677, + "epoch": 0.8363745678641215, + "flos": 19279398003840.0, + "grad_norm": 1.739738236911797, + "language_loss": 0.73179841, + "learning_rate": 2.742469725305001e-07, + "loss": 0.75312144, + "num_input_tokens_seen": 300076720, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13911, + "time_per_iteration": 2.4407854080200195 + }, + { + "auxiliary_loss_clip": 0.01103906, + "auxiliary_loss_mlp": 0.01039441, + "balance_loss_clip": 1.02751374, + "balance_loss_mlp": 1.03597605, + "epoch": 0.8364346911167894, + "flos": 11874869596800.0, + "grad_norm": 1.935636032044244, + "language_loss": 0.7883411, + "learning_rate": 2.740501655534946e-07, + "loss": 0.80977458, + "num_input_tokens_seen": 300092950, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 13912, + "time_per_iteration": 2.4071462154388428 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01029404, + "balance_loss_clip": 1.0182991, + "balance_loss_mlp": 1.0349431, + "epoch": 0.8364948143694574, + "flos": 20225212974720.0, + "grad_norm": 1.6944232118607583, + "language_loss": 0.78812778, + "learning_rate": 2.738534240246797e-07, + "loss": 0.80942535, + "num_input_tokens_seen": 300110950, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 13913, + "time_per_iteration": 2.4921114444732666 + }, + { + "auxiliary_loss_clip": 0.0109927, + "auxiliary_loss_mlp": 0.01028996, + "balance_loss_clip": 1.01644349, + "balance_loss_mlp": 1.03274608, + "epoch": 0.8365549376221254, + "flos": 21612909058560.0, + "grad_norm": 3.2282140586828243, + "language_loss": 0.73658252, + "learning_rate": 2.736567479515153e-07, + "loss": 0.75786519, + "num_input_tokens_seen": 300128705, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 13914, + "time_per_iteration": 2.4990034103393555 + }, + { + "auxiliary_loss_clip": 0.01100583, + "auxiliary_loss_mlp": 0.01036368, + "balance_loss_clip": 1.02429831, + "balance_loss_mlp": 1.03516841, + "epoch": 0.8366150608747933, + "flos": 23294210912640.0, + "grad_norm": 1.639775835358494, + "language_loss": 0.7142942, + "learning_rate": 2.7346013734146025e-07, + "loss": 0.73566371, + "num_input_tokens_seen": 300148635, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 13915, + "time_per_iteration": 2.5020627975463867 + }, + { + "auxiliary_loss_clip": 0.01100305, + "auxiliary_loss_mlp": 0.01030346, + "balance_loss_clip": 1.01923013, + "balance_loss_mlp": 1.03396535, + "epoch": 0.8366751841274613, + "flos": 15267673664640.0, + "grad_norm": 1.83818917702025, + "language_loss": 0.72230256, + "learning_rate": 2.7326359220197035e-07, + "loss": 0.74360901, + "num_input_tokens_seen": 300165490, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 13916, + "time_per_iteration": 2.4200260639190674 + }, + { + "auxiliary_loss_clip": 0.01101017, + "auxiliary_loss_mlp": 0.01027576, + "balance_loss_clip": 1.01569629, + "balance_loss_mlp": 1.03474307, + "epoch": 0.8367353073801292, + "flos": 13224931205760.0, + "grad_norm": 2.1106769436504336, + "language_loss": 0.74262899, + "learning_rate": 2.7306711254049755e-07, + "loss": 0.76391494, + "num_input_tokens_seen": 300182130, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 13917, + "time_per_iteration": 2.4369187355041504 + }, + { + "auxiliary_loss_clip": 0.01097995, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.02054524, + "balance_loss_mlp": 1.03640127, + "epoch": 0.8367954306327973, + "flos": 24205084928640.0, + "grad_norm": 1.8755136087020403, + "language_loss": 0.79014456, + "learning_rate": 2.728706983644933e-07, + "loss": 0.81144106, + "num_input_tokens_seen": 300203050, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.61328125, + "step": 13918, + "time_per_iteration": 2.4455313682556152 + }, + { + "auxiliary_loss_clip": 0.01101776, + "auxiliary_loss_mlp": 0.01034264, + "balance_loss_clip": 1.02224147, + "balance_loss_mlp": 1.03523874, + "epoch": 0.8368555538854652, + "flos": 24534744975360.0, + "grad_norm": 1.477025515229524, + "language_loss": 0.67901552, + "learning_rate": 2.7267434968140457e-07, + "loss": 0.70037591, + "num_input_tokens_seen": 300224380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 13919, + "time_per_iteration": 2.4661288261413574 + }, + { + "auxiliary_loss_clip": 0.01098166, + "auxiliary_loss_mlp": 0.01028175, + "balance_loss_clip": 1.01685655, + "balance_loss_mlp": 1.03343344, + "epoch": 0.8369156771381332, + "flos": 20259363830400.0, + "grad_norm": 3.3051256361077685, + "language_loss": 0.73841083, + "learning_rate": 2.7247806649867835e-07, + "loss": 0.75967425, + "num_input_tokens_seen": 300242915, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13920, + "time_per_iteration": 2.4106199741363525 + }, + { + "auxiliary_loss_clip": 0.01100689, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01916862, + "balance_loss_mlp": 1.03469241, + "epoch": 0.8369758003908011, + "flos": 21835555511040.0, + "grad_norm": 1.6224503256845468, + "language_loss": 0.68769908, + "learning_rate": 2.722818488237566e-07, + "loss": 0.70901674, + "num_input_tokens_seen": 300261905, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 13921, + "time_per_iteration": 2.442763090133667 + }, + { + "auxiliary_loss_clip": 0.01103103, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.02022922, + "balance_loss_mlp": 1.03511822, + "epoch": 0.8370359236434691, + "flos": 21719312121600.0, + "grad_norm": 1.934901742851694, + "language_loss": 0.85668844, + "learning_rate": 2.720856966640801e-07, + "loss": 0.87803936, + "num_input_tokens_seen": 300281145, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 13922, + "time_per_iteration": 2.4856491088867188 + }, + { + "auxiliary_loss_clip": 0.0109526, + "auxiliary_loss_mlp": 0.01028959, + "balance_loss_clip": 1.01823044, + "balance_loss_mlp": 1.03202581, + "epoch": 0.837096046896137, + "flos": 23148880485120.0, + "grad_norm": 1.6289815235404943, + "language_loss": 0.71753758, + "learning_rate": 2.71889610027088e-07, + "loss": 0.73877978, + "num_input_tokens_seen": 300301610, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 13923, + "time_per_iteration": 2.433978319168091 + }, + { + "auxiliary_loss_clip": 0.01098083, + "auxiliary_loss_mlp": 0.01025149, + "balance_loss_clip": 1.01288259, + "balance_loss_mlp": 1.03416324, + "epoch": 0.8371561701488051, + "flos": 24492872695680.0, + "grad_norm": 1.756856954459112, + "language_loss": 0.76217532, + "learning_rate": 2.7169358892021433e-07, + "loss": 0.78340769, + "num_input_tokens_seen": 300319420, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.640625, + "step": 13924, + "time_per_iteration": 2.4699859619140625 + }, + { + "auxiliary_loss_clip": 0.01098831, + "auxiliary_loss_mlp": 0.01027408, + "balance_loss_clip": 1.01606488, + "balance_loss_mlp": 1.03462529, + "epoch": 0.837216293401473, + "flos": 29206723161600.0, + "grad_norm": 1.572870754789481, + "language_loss": 0.64186335, + "learning_rate": 2.7149763335089293e-07, + "loss": 0.66312575, + "num_input_tokens_seen": 300341325, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13925, + "time_per_iteration": 2.501033067703247 + }, + { + "auxiliary_loss_clip": 0.01102375, + "auxiliary_loss_mlp": 0.01030794, + "balance_loss_clip": 1.01933801, + "balance_loss_mlp": 1.03535914, + "epoch": 0.837276416654141, + "flos": 25265275781760.0, + "grad_norm": 2.4566320285291625, + "language_loss": 0.74334025, + "learning_rate": 2.713017433265543e-07, + "loss": 0.76467204, + "num_input_tokens_seen": 300361620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66796875, + "step": 13926, + "time_per_iteration": 2.47856068611145 + }, + { + "auxiliary_loss_clip": 0.01102753, + "auxiliary_loss_mlp": 0.01035641, + "balance_loss_clip": 1.02346361, + "balance_loss_mlp": 1.03702879, + "epoch": 0.837336539906809, + "flos": 13882024656000.0, + "grad_norm": 1.6715120452559071, + "language_loss": 0.71465194, + "learning_rate": 2.711059188546274e-07, + "loss": 0.73603582, + "num_input_tokens_seen": 300378675, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13927, + "time_per_iteration": 2.411735773086548 + }, + { + "auxiliary_loss_clip": 0.01022785, + "auxiliary_loss_mlp": 0.01002585, + "balance_loss_clip": 1.00157166, + "balance_loss_mlp": 1.00261497, + "epoch": 0.8373966631594769, + "flos": 68870599044480.0, + "grad_norm": 0.8374200555730595, + "language_loss": 0.58843565, + "learning_rate": 2.7091015994253695e-07, + "loss": 0.60868931, + "num_input_tokens_seen": 300449740, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20214844, + "step": 13928, + "time_per_iteration": 4.569923639297485 + }, + { + "auxiliary_loss_clip": 0.01103713, + "auxiliary_loss_mlp": 0.01031215, + "balance_loss_clip": 1.01919854, + "balance_loss_mlp": 1.0377028, + "epoch": 0.8374567864121449, + "flos": 20448972748800.0, + "grad_norm": 1.6670643605711446, + "language_loss": 0.69916427, + "learning_rate": 2.707144665977068e-07, + "loss": 0.72051352, + "num_input_tokens_seen": 300470000, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 13929, + "time_per_iteration": 2.450941801071167 + }, + { + "auxiliary_loss_clip": 0.01103074, + "auxiliary_loss_mlp": 0.01027744, + "balance_loss_clip": 1.01532209, + "balance_loss_mlp": 1.0351336, + "epoch": 0.8375169096648128, + "flos": 41904197101440.0, + "grad_norm": 1.4407137482124839, + "language_loss": 0.6694839, + "learning_rate": 2.705188388275574e-07, + "loss": 0.69079208, + "num_input_tokens_seen": 300494975, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13930, + "time_per_iteration": 4.168683052062988 + }, + { + "auxiliary_loss_clip": 0.01100625, + "auxiliary_loss_mlp": 0.01028404, + "balance_loss_clip": 1.01687002, + "balance_loss_mlp": 1.03649974, + "epoch": 0.8375770329174809, + "flos": 20009354192640.0, + "grad_norm": 1.9250443938302013, + "language_loss": 0.71341848, + "learning_rate": 2.703232766395067e-07, + "loss": 0.73470879, + "num_input_tokens_seen": 300513175, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 13931, + "time_per_iteration": 2.4318478107452393 + }, + { + "auxiliary_loss_clip": 0.0109844, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.01900148, + "balance_loss_mlp": 1.03423381, + "epoch": 0.8376371561701488, + "flos": 22783597125120.0, + "grad_norm": 1.6115485766829456, + "language_loss": 0.71996433, + "learning_rate": 2.701277800409705e-07, + "loss": 0.7412523, + "num_input_tokens_seen": 300533770, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 13932, + "time_per_iteration": 2.4718666076660156 + }, + { + "auxiliary_loss_clip": 0.01097692, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01924706, + "balance_loss_mlp": 1.03308678, + "epoch": 0.8376972794228168, + "flos": 23914459987200.0, + "grad_norm": 2.572463429726218, + "language_loss": 0.66981155, + "learning_rate": 2.699323490393628e-07, + "loss": 0.69108832, + "num_input_tokens_seen": 300552995, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 13933, + "time_per_iteration": 2.462989091873169 + }, + { + "auxiliary_loss_clip": 0.01099376, + "auxiliary_loss_mlp": 0.01035521, + "balance_loss_clip": 1.02439857, + "balance_loss_mlp": 1.03577971, + "epoch": 0.8377574026754847, + "flos": 13734718980480.0, + "grad_norm": 1.9819338387703926, + "language_loss": 0.76037461, + "learning_rate": 2.697369836420933e-07, + "loss": 0.78172362, + "num_input_tokens_seen": 300570275, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 13934, + "time_per_iteration": 3.8440794944763184 + }, + { + "auxiliary_loss_clip": 0.01101761, + "auxiliary_loss_mlp": 0.01028154, + "balance_loss_clip": 1.01658523, + "balance_loss_mlp": 1.03747773, + "epoch": 0.8378175259281527, + "flos": 21651333632640.0, + "grad_norm": 2.4023414494461206, + "language_loss": 0.77042425, + "learning_rate": 2.6954168385657115e-07, + "loss": 0.79172337, + "num_input_tokens_seen": 300590875, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 13935, + "time_per_iteration": 2.4580769538879395 + }, + { + "auxiliary_loss_clip": 0.01099502, + "auxiliary_loss_mlp": 0.01031487, + "balance_loss_clip": 1.01969159, + "balance_loss_mlp": 1.0334816, + "epoch": 0.8378776491808206, + "flos": 15448806973440.0, + "grad_norm": 2.54000512074222, + "language_loss": 0.55758452, + "learning_rate": 2.6934644969020135e-07, + "loss": 0.57889438, + "num_input_tokens_seen": 300607490, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 13936, + "time_per_iteration": 2.3995320796966553 + }, + { + "auxiliary_loss_clip": 0.01097268, + "auxiliary_loss_mlp": 0.01028442, + "balance_loss_clip": 1.01752198, + "balance_loss_mlp": 1.03285074, + "epoch": 0.8379377724334887, + "flos": 14720395069440.0, + "grad_norm": 1.7997670475804433, + "language_loss": 0.89385533, + "learning_rate": 2.691512811503882e-07, + "loss": 0.91511238, + "num_input_tokens_seen": 300623635, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 13937, + "time_per_iteration": 2.3957390785217285 + }, + { + "auxiliary_loss_clip": 0.0110113, + "auxiliary_loss_mlp": 0.01028341, + "balance_loss_clip": 1.01721907, + "balance_loss_mlp": 1.03569484, + "epoch": 0.8379978956861566, + "flos": 24535247765760.0, + "grad_norm": 1.9085284321860068, + "language_loss": 0.81626403, + "learning_rate": 2.689561782445313e-07, + "loss": 0.83755875, + "num_input_tokens_seen": 300643835, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 13938, + "time_per_iteration": 2.462914228439331 + }, + { + "auxiliary_loss_clip": 0.01101992, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.01986718, + "balance_loss_mlp": 1.03539038, + "epoch": 0.8380580189388246, + "flos": 18952611045120.0, + "grad_norm": 1.9830063454594962, + "language_loss": 0.70170665, + "learning_rate": 2.6876114098002965e-07, + "loss": 0.72304463, + "num_input_tokens_seen": 300662500, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 13939, + "time_per_iteration": 2.419306516647339 + }, + { + "auxiliary_loss_clip": 0.01103831, + "auxiliary_loss_mlp": 0.0103631, + "balance_loss_clip": 1.02409744, + "balance_loss_mlp": 1.03695917, + "epoch": 0.8381181421914926, + "flos": 26540283922560.0, + "grad_norm": 2.042221419683719, + "language_loss": 0.76166761, + "learning_rate": 2.6856616936428e-07, + "loss": 0.78306901, + "num_input_tokens_seen": 300681480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 13940, + "time_per_iteration": 2.465965509414673 + }, + { + "auxiliary_loss_clip": 0.01098815, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.01821637, + "balance_loss_mlp": 1.03480721, + "epoch": 0.8381782654441605, + "flos": 23291481479040.0, + "grad_norm": 1.701308985819195, + "language_loss": 0.76258647, + "learning_rate": 2.6837126340467374e-07, + "loss": 0.78387022, + "num_input_tokens_seen": 300699165, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 13941, + "time_per_iteration": 2.471020221710205 + }, + { + "auxiliary_loss_clip": 0.01101967, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01569009, + "balance_loss_mlp": 1.03386378, + "epoch": 0.8382383886968285, + "flos": 26758800311040.0, + "grad_norm": 2.8349668095914025, + "language_loss": 0.73475212, + "learning_rate": 2.6817642310860276e-07, + "loss": 0.75605369, + "num_input_tokens_seen": 300714615, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 13942, + "time_per_iteration": 2.468085527420044 + }, + { + "auxiliary_loss_clip": 0.01107356, + "auxiliary_loss_mlp": 0.01035548, + "balance_loss_clip": 1.02313805, + "balance_loss_mlp": 1.03645301, + "epoch": 0.8382985119494964, + "flos": 26104544035200.0, + "grad_norm": 1.5830946628416007, + "language_loss": 0.7929855, + "learning_rate": 2.679816484834554e-07, + "loss": 0.81441456, + "num_input_tokens_seen": 300734860, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.7109375, + "step": 13943, + "time_per_iteration": 2.46358323097229 + }, + { + "auxiliary_loss_clip": 0.01098177, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.01794708, + "balance_loss_mlp": 1.03353262, + "epoch": 0.8383586352021645, + "flos": 16435129507200.0, + "grad_norm": 1.9364854402368852, + "language_loss": 0.85158527, + "learning_rate": 2.6778693953661766e-07, + "loss": 0.87285936, + "num_input_tokens_seen": 300752735, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13944, + "time_per_iteration": 2.442012310028076 + }, + { + "auxiliary_loss_clip": 0.01022937, + "auxiliary_loss_mlp": 0.01002153, + "balance_loss_clip": 1.00108051, + "balance_loss_mlp": 1.00304651, + "epoch": 0.8384187584548324, + "flos": 64195532288640.0, + "grad_norm": 0.6210896800170687, + "language_loss": 0.50280273, + "learning_rate": 2.6759229627547263e-07, + "loss": 0.52305365, + "num_input_tokens_seen": 300820760, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.19921875, + "step": 13945, + "time_per_iteration": 3.166820526123047 + }, + { + "auxiliary_loss_clip": 0.01098094, + "auxiliary_loss_mlp": 0.01029063, + "balance_loss_clip": 1.0179286, + "balance_loss_mlp": 1.03397751, + "epoch": 0.8384788817075004, + "flos": 22382905933440.0, + "grad_norm": 1.8419185707683658, + "language_loss": 0.6506319, + "learning_rate": 2.673977187074017e-07, + "loss": 0.67190349, + "num_input_tokens_seen": 300840025, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 13946, + "time_per_iteration": 2.4629406929016113 + }, + { + "auxiliary_loss_clip": 0.0109974, + "auxiliary_loss_mlp": 0.01031185, + "balance_loss_clip": 1.01890647, + "balance_loss_mlp": 1.03315461, + "epoch": 0.8385390049601683, + "flos": 29496845312640.0, + "grad_norm": 1.5727213172282053, + "language_loss": 0.67289019, + "learning_rate": 2.672032068397829e-07, + "loss": 0.69419944, + "num_input_tokens_seen": 300860380, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 13947, + "time_per_iteration": 2.4871394634246826 + }, + { + "auxiliary_loss_clip": 0.01101642, + "auxiliary_loss_mlp": 0.01028612, + "balance_loss_clip": 1.01655436, + "balance_loss_mlp": 1.03533816, + "epoch": 0.8385991282128363, + "flos": 32707797799680.0, + "grad_norm": 1.4208449303436252, + "language_loss": 0.69888943, + "learning_rate": 2.6700876067999176e-07, + "loss": 0.72019202, + "num_input_tokens_seen": 300881895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 13948, + "time_per_iteration": 2.5325706005096436 + }, + { + "auxiliary_loss_clip": 0.01096897, + "auxiliary_loss_mlp": 0.0103067, + "balance_loss_clip": 1.01995289, + "balance_loss_mlp": 1.03378117, + "epoch": 0.8386592514655042, + "flos": 25441022050560.0, + "grad_norm": 2.1940873483336927, + "language_loss": 0.84753001, + "learning_rate": 2.6681438023540194e-07, + "loss": 0.86880571, + "num_input_tokens_seen": 300901575, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13949, + "time_per_iteration": 2.4774601459503174 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01026375, + "balance_loss_clip": 1.01491952, + "balance_loss_mlp": 1.03441536, + "epoch": 0.8387193747181723, + "flos": 22015898720640.0, + "grad_norm": 1.849770284110971, + "language_loss": 0.70397264, + "learning_rate": 2.66620065513385e-07, + "loss": 0.72521639, + "num_input_tokens_seen": 300919735, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 13950, + "time_per_iteration": 2.4515769481658936 + }, + { + "auxiliary_loss_clip": 0.01098204, + "auxiliary_loss_mlp": 0.0102659, + "balance_loss_clip": 1.014979, + "balance_loss_mlp": 1.03368354, + "epoch": 0.8387794979708402, + "flos": 18150223080960.0, + "grad_norm": 1.91426323205629, + "language_loss": 0.64385873, + "learning_rate": 2.6642581652130913e-07, + "loss": 0.66510665, + "num_input_tokens_seen": 300939150, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13951, + "time_per_iteration": 2.413670539855957 + }, + { + "auxiliary_loss_clip": 0.01100218, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.0197382, + "balance_loss_mlp": 1.03516793, + "epoch": 0.8388396212235082, + "flos": 25411216740480.0, + "grad_norm": 1.4096665039754765, + "language_loss": 0.69953537, + "learning_rate": 2.662316332665393e-07, + "loss": 0.72084689, + "num_input_tokens_seen": 300959730, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 13952, + "time_per_iteration": 2.4785561561584473 + }, + { + "auxiliary_loss_clip": 0.01098001, + "auxiliary_loss_mlp": 0.01025504, + "balance_loss_clip": 1.01420259, + "balance_loss_mlp": 1.03371167, + "epoch": 0.8388997444761762, + "flos": 22273055164800.0, + "grad_norm": 2.562220143199556, + "language_loss": 0.72693485, + "learning_rate": 2.6603751575643987e-07, + "loss": 0.7481699, + "num_input_tokens_seen": 300976120, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 13953, + "time_per_iteration": 2.4456255435943604 + }, + { + "auxiliary_loss_clip": 0.01097183, + "auxiliary_loss_mlp": 0.01025707, + "balance_loss_clip": 1.01448941, + "balance_loss_mlp": 1.03371382, + "epoch": 0.8389598677288441, + "flos": 19573219255680.0, + "grad_norm": 2.571526992442188, + "language_loss": 0.68295968, + "learning_rate": 2.6584346399837176e-07, + "loss": 0.70418859, + "num_input_tokens_seen": 300995080, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 13954, + "time_per_iteration": 2.4475159645080566 + }, + { + "auxiliary_loss_clip": 0.01102027, + "auxiliary_loss_mlp": 0.01033062, + "balance_loss_clip": 1.02216661, + "balance_loss_mlp": 1.03607535, + "epoch": 0.8390199909815121, + "flos": 17384715406080.0, + "grad_norm": 1.8685810637104039, + "language_loss": 0.72950685, + "learning_rate": 2.656494779996932e-07, + "loss": 0.75085771, + "num_input_tokens_seen": 301012920, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 13955, + "time_per_iteration": 2.4151742458343506 + }, + { + "auxiliary_loss_clip": 0.01100167, + "auxiliary_loss_mlp": 0.01027037, + "balance_loss_clip": 1.01495552, + "balance_loss_mlp": 1.03408492, + "epoch": 0.83908011423418, + "flos": 24639639667200.0, + "grad_norm": 2.2257720048014145, + "language_loss": 0.66271257, + "learning_rate": 2.6545555776775995e-07, + "loss": 0.68398464, + "num_input_tokens_seen": 301028875, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 13956, + "time_per_iteration": 2.472264528274536 + }, + { + "auxiliary_loss_clip": 0.0110128, + "auxiliary_loss_mlp": 0.01029711, + "balance_loss_clip": 1.01721215, + "balance_loss_mlp": 1.03407979, + "epoch": 0.8391402374868481, + "flos": 24718356322560.0, + "grad_norm": 1.6130326255768752, + "language_loss": 0.79156423, + "learning_rate": 2.6526170330992667e-07, + "loss": 0.8128742, + "num_input_tokens_seen": 301050115, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 13957, + "time_per_iteration": 2.5556459426879883 + }, + { + "auxiliary_loss_clip": 0.01022919, + "auxiliary_loss_mlp": 0.00998362, + "balance_loss_clip": 0.99737281, + "balance_loss_mlp": 1.00284195, + "epoch": 0.839200360739516, + "flos": 56871695784960.0, + "grad_norm": 0.7468906710364033, + "language_loss": 0.53393608, + "learning_rate": 2.6506791463354283e-07, + "loss": 0.55414885, + "num_input_tokens_seen": 301114155, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 13958, + "time_per_iteration": 3.1345131397247314 + }, + { + "auxiliary_loss_clip": 0.01098889, + "auxiliary_loss_mlp": 0.01030882, + "balance_loss_clip": 1.01889539, + "balance_loss_mlp": 1.03390813, + "epoch": 0.839260483992184, + "flos": 18332792933760.0, + "grad_norm": 1.8191948509907279, + "language_loss": 0.73426306, + "learning_rate": 2.648741917459574e-07, + "loss": 0.75556076, + "num_input_tokens_seen": 301133150, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 13959, + "time_per_iteration": 2.422290802001953 + }, + { + "auxiliary_loss_clip": 0.01096696, + "auxiliary_loss_mlp": 0.01028323, + "balance_loss_clip": 1.01730168, + "balance_loss_mlp": 1.03430462, + "epoch": 0.8393206072448519, + "flos": 27087921653760.0, + "grad_norm": 1.7739336308149691, + "language_loss": 0.55481756, + "learning_rate": 2.646805346545169e-07, + "loss": 0.57606781, + "num_input_tokens_seen": 301153600, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 13960, + "time_per_iteration": 2.48284649848938 + }, + { + "auxiliary_loss_clip": 0.01023374, + "auxiliary_loss_mlp": 0.00998479, + "balance_loss_clip": 0.99740618, + "balance_loss_mlp": 1.00327396, + "epoch": 0.8393807304975199, + "flos": 61521192057600.0, + "grad_norm": 0.7837966547782983, + "language_loss": 0.60692465, + "learning_rate": 2.6448694336656397e-07, + "loss": 0.6271432, + "num_input_tokens_seen": 301214335, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20117188, + "step": 13961, + "time_per_iteration": 3.1125965118408203 + }, + { + "auxiliary_loss_clip": 0.01096869, + "auxiliary_loss_mlp": 0.01028286, + "balance_loss_clip": 1.01716423, + "balance_loss_mlp": 1.03198576, + "epoch": 0.8394408537501878, + "flos": 14894848448640.0, + "grad_norm": 2.2318920850341626, + "language_loss": 0.68340284, + "learning_rate": 2.642934178894405e-07, + "loss": 0.7046544, + "num_input_tokens_seen": 301228960, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 13962, + "time_per_iteration": 2.3924074172973633 + }, + { + "auxiliary_loss_clip": 0.01099112, + "auxiliary_loss_mlp": 0.01029285, + "balance_loss_clip": 1.01776314, + "balance_loss_mlp": 1.03186655, + "epoch": 0.8395009770028559, + "flos": 17412186332160.0, + "grad_norm": 1.8739474188933585, + "language_loss": 0.73263037, + "learning_rate": 2.640999582304841e-07, + "loss": 0.7539143, + "num_input_tokens_seen": 301245875, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 13963, + "time_per_iteration": 2.411219835281372 + }, + { + "auxiliary_loss_clip": 0.01100071, + "auxiliary_loss_mlp": 0.01035213, + "balance_loss_clip": 1.02403665, + "balance_loss_mlp": 1.03410983, + "epoch": 0.8395611002555238, + "flos": 27924747782400.0, + "grad_norm": 1.7340708805723295, + "language_loss": 0.76229376, + "learning_rate": 2.6390656439703173e-07, + "loss": 0.78364658, + "num_input_tokens_seen": 301265550, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 13964, + "time_per_iteration": 2.478710412979126 + }, + { + "auxiliary_loss_clip": 0.01103609, + "auxiliary_loss_mlp": 0.01035903, + "balance_loss_clip": 1.02302241, + "balance_loss_mlp": 1.03555202, + "epoch": 0.8396212235081918, + "flos": 11100922225920.0, + "grad_norm": 1.9577585475000066, + "language_loss": 0.78094041, + "learning_rate": 2.637132363964161e-07, + "loss": 0.8023355, + "num_input_tokens_seen": 301282035, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 13965, + "time_per_iteration": 2.4036173820495605 + }, + { + "auxiliary_loss_clip": 0.0109763, + "auxiliary_loss_mlp": 0.01028028, + "balance_loss_clip": 1.01681042, + "balance_loss_mlp": 1.03346133, + "epoch": 0.8396813467608598, + "flos": 35735641729920.0, + "grad_norm": 2.0786855194651714, + "language_loss": 0.66062534, + "learning_rate": 2.635199742359684e-07, + "loss": 0.68188184, + "num_input_tokens_seen": 301305210, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13966, + "time_per_iteration": 2.558805465698242 + }, + { + "auxiliary_loss_clip": 0.0109852, + "auxiliary_loss_mlp": 0.01031584, + "balance_loss_clip": 1.02015817, + "balance_loss_mlp": 1.03405714, + "epoch": 0.8397414700135277, + "flos": 26176724415360.0, + "grad_norm": 1.664869225278249, + "language_loss": 0.74680585, + "learning_rate": 2.633267779230177e-07, + "loss": 0.76810688, + "num_input_tokens_seen": 301324885, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 13967, + "time_per_iteration": 2.4877943992614746 + }, + { + "auxiliary_loss_clip": 0.01099282, + "auxiliary_loss_mlp": 0.01029925, + "balance_loss_clip": 1.01832557, + "balance_loss_mlp": 1.03487408, + "epoch": 0.8398015932661957, + "flos": 18333116156160.0, + "grad_norm": 2.2246871986534464, + "language_loss": 0.83141935, + "learning_rate": 2.6313364746488974e-07, + "loss": 0.85271138, + "num_input_tokens_seen": 301343070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 13968, + "time_per_iteration": 2.4127590656280518 + }, + { + "auxiliary_loss_clip": 0.01101548, + "auxiliary_loss_mlp": 0.01032272, + "balance_loss_clip": 1.02051806, + "balance_loss_mlp": 1.03508413, + "epoch": 0.8398617165188637, + "flos": 17379507934080.0, + "grad_norm": 2.3055427477830177, + "language_loss": 0.77584493, + "learning_rate": 2.629405828689075e-07, + "loss": 0.7971831, + "num_input_tokens_seen": 301359280, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 13969, + "time_per_iteration": 2.531064033508301 + }, + { + "auxiliary_loss_clip": 0.01101785, + "auxiliary_loss_mlp": 0.01027649, + "balance_loss_clip": 1.01506102, + "balance_loss_mlp": 1.03373933, + "epoch": 0.8399218397715317, + "flos": 22929681738240.0, + "grad_norm": 2.0172098119566026, + "language_loss": 0.77522105, + "learning_rate": 2.627475841423923e-07, + "loss": 0.79651541, + "num_input_tokens_seen": 301376465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 13970, + "time_per_iteration": 3.822666645050049 + }, + { + "auxiliary_loss_clip": 0.01099108, + "auxiliary_loss_mlp": 0.01035452, + "balance_loss_clip": 1.02421093, + "balance_loss_mlp": 1.03340948, + "epoch": 0.8399819630241996, + "flos": 23149562843520.0, + "grad_norm": 2.67779469297833, + "language_loss": 0.72165084, + "learning_rate": 2.625546512926633e-07, + "loss": 0.74299651, + "num_input_tokens_seen": 301396000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 13971, + "time_per_iteration": 2.4577131271362305 + }, + { + "auxiliary_loss_clip": 0.01098585, + "auxiliary_loss_mlp": 0.01027489, + "balance_loss_clip": 1.01532924, + "balance_loss_mlp": 1.03278506, + "epoch": 0.8400420862768676, + "flos": 16397423205120.0, + "grad_norm": 1.789878655946985, + "language_loss": 0.77530694, + "learning_rate": 2.623617843270358e-07, + "loss": 0.79656768, + "num_input_tokens_seen": 301413160, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 13972, + "time_per_iteration": 3.873415231704712 + }, + { + "auxiliary_loss_clip": 0.01097007, + "auxiliary_loss_mlp": 0.01032465, + "balance_loss_clip": 1.02161694, + "balance_loss_mlp": 1.03390861, + "epoch": 0.8401022095295355, + "flos": 21287486816640.0, + "grad_norm": 1.3193607521155475, + "language_loss": 0.68169355, + "learning_rate": 2.6216898325282333e-07, + "loss": 0.70298827, + "num_input_tokens_seen": 301433325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 13973, + "time_per_iteration": 2.4711923599243164 + }, + { + "auxiliary_loss_clip": 0.01101565, + "auxiliary_loss_mlp": 0.01026266, + "balance_loss_clip": 1.01435125, + "balance_loss_mlp": 1.03448188, + "epoch": 0.8401623327822035, + "flos": 17311313963520.0, + "grad_norm": 1.7672136732051997, + "language_loss": 0.78160721, + "learning_rate": 2.619762480773382e-07, + "loss": 0.80288553, + "num_input_tokens_seen": 301450265, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 13974, + "time_per_iteration": 2.471079111099243 + }, + { + "auxiliary_loss_clip": 0.010996, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.01623559, + "balance_loss_mlp": 1.03378248, + "epoch": 0.8402224560348714, + "flos": 22236677665920.0, + "grad_norm": 1.5307501789030493, + "language_loss": 0.72512347, + "learning_rate": 2.617835788078868e-07, + "loss": 0.74639237, + "num_input_tokens_seen": 301470760, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 13975, + "time_per_iteration": 3.9028189182281494 + }, + { + "auxiliary_loss_clip": 0.01097964, + "auxiliary_loss_mlp": 0.01025214, + "balance_loss_clip": 1.01351976, + "balance_loss_mlp": 1.03358281, + "epoch": 0.8402825792875395, + "flos": 20229953569920.0, + "grad_norm": 1.6696381164550365, + "language_loss": 0.72594655, + "learning_rate": 2.6159097545177645e-07, + "loss": 0.74717832, + "num_input_tokens_seen": 301489425, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 13976, + "time_per_iteration": 2.5141208171844482 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01650858, + "balance_loss_mlp": 1.0321219, + "epoch": 0.8403427025402074, + "flos": 23289973107840.0, + "grad_norm": 1.6819185940654011, + "language_loss": 0.72135288, + "learning_rate": 2.61398438016311e-07, + "loss": 0.7425878, + "num_input_tokens_seen": 301508885, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 13977, + "time_per_iteration": 2.50201153755188 + }, + { + "auxiliary_loss_clip": 0.0109805, + "auxiliary_loss_mlp": 0.01027928, + "balance_loss_clip": 1.01634645, + "balance_loss_mlp": 1.03184259, + "epoch": 0.8404028257928754, + "flos": 32675586278400.0, + "grad_norm": 1.3910921422626445, + "language_loss": 0.68459249, + "learning_rate": 2.6120596650879043e-07, + "loss": 0.70585227, + "num_input_tokens_seen": 301533780, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 13978, + "time_per_iteration": 2.5799074172973633 + }, + { + "auxiliary_loss_clip": 0.01094581, + "auxiliary_loss_mlp": 0.01027033, + "balance_loss_clip": 1.0161258, + "balance_loss_mlp": 1.03286028, + "epoch": 0.8404629490455434, + "flos": 16180522928640.0, + "grad_norm": 1.6895591936246208, + "language_loss": 0.77976441, + "learning_rate": 2.610135609365145e-07, + "loss": 0.80098057, + "num_input_tokens_seen": 301551775, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6171875, + "step": 13979, + "time_per_iteration": 2.4045827388763428 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01028111, + "balance_loss_clip": 1.01674402, + "balance_loss_mlp": 1.03574336, + "epoch": 0.8405230722982113, + "flos": 15194451790080.0, + "grad_norm": 1.8475240517602953, + "language_loss": 0.77947694, + "learning_rate": 2.60821221306778e-07, + "loss": 0.80076307, + "num_input_tokens_seen": 301570495, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 13980, + "time_per_iteration": 2.4739646911621094 + }, + { + "auxiliary_loss_clip": 0.01098895, + "auxiliary_loss_mlp": 0.01026959, + "balance_loss_clip": 1.01647449, + "balance_loss_mlp": 1.0354929, + "epoch": 0.8405831955508793, + "flos": 27812418975360.0, + "grad_norm": 1.5275513384227286, + "language_loss": 0.86409223, + "learning_rate": 2.606289476268757e-07, + "loss": 0.88535082, + "num_input_tokens_seen": 301591705, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 13981, + "time_per_iteration": 2.4817118644714355 + }, + { + "auxiliary_loss_clip": 0.01100072, + "auxiliary_loss_mlp": 0.01028176, + "balance_loss_clip": 1.01689255, + "balance_loss_mlp": 1.03534365, + "epoch": 0.8406433188035473, + "flos": 23769452782080.0, + "grad_norm": 2.1699458590209955, + "language_loss": 0.67915559, + "learning_rate": 2.6043673990409745e-07, + "loss": 0.70043814, + "num_input_tokens_seen": 301611670, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 13982, + "time_per_iteration": 2.4667885303497314 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01033317, + "balance_loss_clip": 1.02090764, + "balance_loss_mlp": 1.03581178, + "epoch": 0.8407034420562153, + "flos": 29205681667200.0, + "grad_norm": 1.6697203722458216, + "language_loss": 0.68169171, + "learning_rate": 2.602445981457324e-07, + "loss": 0.70303488, + "num_input_tokens_seen": 301632540, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65234375, + "step": 13983, + "time_per_iteration": 2.5006182193756104 + }, + { + "auxiliary_loss_clip": 0.01098671, + "auxiliary_loss_mlp": 0.01026021, + "balance_loss_clip": 1.01460695, + "balance_loss_mlp": 1.03199232, + "epoch": 0.8407635653088832, + "flos": 26360084367360.0, + "grad_norm": 1.7762423730618389, + "language_loss": 0.78527683, + "learning_rate": 2.6005252235906684e-07, + "loss": 0.8065238, + "num_input_tokens_seen": 301651480, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 13984, + "time_per_iteration": 2.489741086959839 + }, + { + "auxiliary_loss_clip": 0.01095303, + "auxiliary_loss_mlp": 0.01032124, + "balance_loss_clip": 1.02112675, + "balance_loss_mlp": 1.03048182, + "epoch": 0.8408236885615512, + "flos": 21468799693440.0, + "grad_norm": 1.933267070972905, + "language_loss": 0.60296601, + "learning_rate": 2.598605125513842e-07, + "loss": 0.62424028, + "num_input_tokens_seen": 301670010, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 13985, + "time_per_iteration": 2.4298417568206787 + }, + { + "auxiliary_loss_clip": 0.01101526, + "auxiliary_loss_mlp": 0.01027355, + "balance_loss_clip": 1.01508272, + "balance_loss_mlp": 1.03452444, + "epoch": 0.8408838118142191, + "flos": 22963724853120.0, + "grad_norm": 1.5581250113254055, + "language_loss": 0.8171947, + "learning_rate": 2.5966856872996467e-07, + "loss": 0.83848357, + "num_input_tokens_seen": 301689785, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 13986, + "time_per_iteration": 2.475343942642212 + }, + { + "auxiliary_loss_clip": 0.01099857, + "auxiliary_loss_mlp": 0.01028406, + "balance_loss_clip": 1.01718867, + "balance_loss_mlp": 1.03571963, + "epoch": 0.8409439350668871, + "flos": 26800026145920.0, + "grad_norm": 1.6939805128572716, + "language_loss": 0.65535557, + "learning_rate": 2.5947669090208755e-07, + "loss": 0.67663825, + "num_input_tokens_seen": 301712225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 13987, + "time_per_iteration": 2.5180106163024902 + }, + { + "auxiliary_loss_clip": 0.01099856, + "auxiliary_loss_mlp": 0.01032928, + "balance_loss_clip": 1.02157354, + "balance_loss_mlp": 1.03494871, + "epoch": 0.841004058319555, + "flos": 26578672583040.0, + "grad_norm": 1.8313027359728804, + "language_loss": 0.67391479, + "learning_rate": 2.5928487907502906e-07, + "loss": 0.69524264, + "num_input_tokens_seen": 301730955, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 13988, + "time_per_iteration": 2.536297559738159 + }, + { + "auxiliary_loss_clip": 0.01105067, + "auxiliary_loss_mlp": 0.01035229, + "balance_loss_clip": 1.022843, + "balance_loss_mlp": 1.03692877, + "epoch": 0.8410641815722231, + "flos": 14501878680960.0, + "grad_norm": 2.1164994758777573, + "language_loss": 0.80786854, + "learning_rate": 2.590931332560622e-07, + "loss": 0.82927155, + "num_input_tokens_seen": 301746930, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 13989, + "time_per_iteration": 2.415370225906372 + }, + { + "auxiliary_loss_clip": 0.01100037, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01601958, + "balance_loss_mlp": 1.03373408, + "epoch": 0.841124304824891, + "flos": 29166682475520.0, + "grad_norm": 1.624389596887663, + "language_loss": 0.75334507, + "learning_rate": 2.5890145345245826e-07, + "loss": 0.7746222, + "num_input_tokens_seen": 301766945, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 13990, + "time_per_iteration": 2.53814959526062 + }, + { + "auxiliary_loss_clip": 0.01092936, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.02082646, + "balance_loss_mlp": 1.03118992, + "epoch": 0.841184428077559, + "flos": 22412028885120.0, + "grad_norm": 1.6301250053203777, + "language_loss": 0.80746663, + "learning_rate": 2.5870983967148597e-07, + "loss": 0.82871455, + "num_input_tokens_seen": 301785460, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 13991, + "time_per_iteration": 2.450252056121826 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.02135789, + "balance_loss_mlp": 1.03337455, + "epoch": 0.841244551330227, + "flos": 22962791099520.0, + "grad_norm": 1.8798484826886184, + "language_loss": 0.70560163, + "learning_rate": 2.585182919204105e-07, + "loss": 0.72689867, + "num_input_tokens_seen": 301804180, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 13992, + "time_per_iteration": 2.4831645488739014 + }, + { + "auxiliary_loss_clip": 0.01099899, + "auxiliary_loss_mlp": 0.01023689, + "balance_loss_clip": 1.01287127, + "balance_loss_mlp": 1.03403449, + "epoch": 0.8413046745828949, + "flos": 21032736583680.0, + "grad_norm": 1.6703212978167075, + "language_loss": 0.76615024, + "learning_rate": 2.583268102064959e-07, + "loss": 0.78738606, + "num_input_tokens_seen": 301823670, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 13993, + "time_per_iteration": 2.4382317066192627 + }, + { + "auxiliary_loss_clip": 0.01104675, + "auxiliary_loss_mlp": 0.01033694, + "balance_loss_clip": 1.02032471, + "balance_loss_mlp": 1.03401446, + "epoch": 0.841364797835563, + "flos": 27052082858880.0, + "grad_norm": 1.8832197605446068, + "language_loss": 0.74138421, + "learning_rate": 2.5813539453700393e-07, + "loss": 0.76276791, + "num_input_tokens_seen": 301845890, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.70703125, + "step": 13994, + "time_per_iteration": 2.5059263706207275 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.01743138, + "balance_loss_mlp": 1.03507328, + "epoch": 0.8414249210882309, + "flos": 17895688329600.0, + "grad_norm": 1.4974162052212234, + "language_loss": 0.59372008, + "learning_rate": 2.5794404491919163e-07, + "loss": 0.61498266, + "num_input_tokens_seen": 301863985, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 13995, + "time_per_iteration": 2.43625545501709 + }, + { + "auxiliary_loss_clip": 0.01098487, + "auxiliary_loss_mlp": 0.01030117, + "balance_loss_clip": 1.01815403, + "balance_loss_mlp": 1.03378701, + "epoch": 0.8414850443408989, + "flos": 25441201618560.0, + "grad_norm": 1.8212710126456297, + "language_loss": 0.72060537, + "learning_rate": 2.577527613603163e-07, + "loss": 0.74189138, + "num_input_tokens_seen": 301882765, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 13996, + "time_per_iteration": 2.503129482269287 + }, + { + "auxiliary_loss_clip": 0.01099556, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01885498, + "balance_loss_mlp": 1.03361964, + "epoch": 0.8415451675935668, + "flos": 23220055284480.0, + "grad_norm": 1.64892494823158, + "language_loss": 0.64126182, + "learning_rate": 2.5756154386763017e-07, + "loss": 0.66255158, + "num_input_tokens_seen": 301902720, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 13997, + "time_per_iteration": 2.450742244720459 + }, + { + "auxiliary_loss_clip": 0.01102161, + "auxiliary_loss_mlp": 0.01033597, + "balance_loss_clip": 1.02135468, + "balance_loss_mlp": 1.0345906, + "epoch": 0.8416052908462348, + "flos": 18546496899840.0, + "grad_norm": 2.9556557520305535, + "language_loss": 0.82418084, + "learning_rate": 2.5737039244838565e-07, + "loss": 0.84553838, + "num_input_tokens_seen": 301921245, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 13998, + "time_per_iteration": 2.4852468967437744 + }, + { + "auxiliary_loss_clip": 0.01100506, + "auxiliary_loss_mlp": 0.01032329, + "balance_loss_clip": 1.02012753, + "balance_loss_mlp": 1.03476977, + "epoch": 0.8416654140989027, + "flos": 26105190480000.0, + "grad_norm": 1.6783013419756503, + "language_loss": 0.80256122, + "learning_rate": 2.5717930710982984e-07, + "loss": 0.82388961, + "num_input_tokens_seen": 301942320, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 13999, + "time_per_iteration": 2.4772074222564697 + }, + { + "auxiliary_loss_clip": 0.01102624, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.02082753, + "balance_loss_mlp": 1.03548217, + "epoch": 0.8417255373515707, + "flos": 26433270328320.0, + "grad_norm": 1.8918548509901008, + "language_loss": 0.6631999, + "learning_rate": 2.569882878592096e-07, + "loss": 0.68455726, + "num_input_tokens_seen": 301963110, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 14000, + "time_per_iteration": 2.4916574954986572 + }, + { + "auxiliary_loss_clip": 0.0110369, + "auxiliary_loss_mlp": 0.01028593, + "balance_loss_clip": 1.0162369, + "balance_loss_mlp": 1.0360285, + "epoch": 0.8417856606042387, + "flos": 24717745791360.0, + "grad_norm": 1.439269404890624, + "language_loss": 0.79670191, + "learning_rate": 2.5679733470376885e-07, + "loss": 0.81802464, + "num_input_tokens_seen": 301984915, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 14001, + "time_per_iteration": 2.506103754043579 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01026653, + "balance_loss_clip": 1.01570415, + "balance_loss_mlp": 1.03333926, + "epoch": 0.8418457838569067, + "flos": 20850849089280.0, + "grad_norm": 1.8424460389803186, + "language_loss": 0.78693283, + "learning_rate": 2.5660644765074703e-07, + "loss": 0.80818045, + "num_input_tokens_seen": 302004095, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 14002, + "time_per_iteration": 2.518280267715454 + }, + { + "auxiliary_loss_clip": 0.01098226, + "auxiliary_loss_mlp": 0.01027702, + "balance_loss_clip": 1.01548314, + "balance_loss_mlp": 1.03327775, + "epoch": 0.8419059071095746, + "flos": 28660629715200.0, + "grad_norm": 1.4332439734479316, + "language_loss": 0.77908051, + "learning_rate": 2.5641562670738334e-07, + "loss": 0.80033976, + "num_input_tokens_seen": 302027250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 14003, + "time_per_iteration": 2.537221908569336 + }, + { + "auxiliary_loss_clip": 0.01098857, + "auxiliary_loss_mlp": 0.01025752, + "balance_loss_clip": 1.01390839, + "balance_loss_mlp": 1.03347921, + "epoch": 0.8419660303622426, + "flos": 21653596189440.0, + "grad_norm": 1.6966804421866966, + "language_loss": 0.65271151, + "learning_rate": 2.5622487188091436e-07, + "loss": 0.67395759, + "num_input_tokens_seen": 302046950, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 14004, + "time_per_iteration": 2.5128061771392822 + }, + { + "auxiliary_loss_clip": 0.01102661, + "auxiliary_loss_mlp": 0.01029562, + "balance_loss_clip": 1.01708663, + "balance_loss_mlp": 1.03558087, + "epoch": 0.8420261536149106, + "flos": 25301114576640.0, + "grad_norm": 1.9855294576572216, + "language_loss": 0.75816196, + "learning_rate": 2.560341831785724e-07, + "loss": 0.77948421, + "num_input_tokens_seen": 302065470, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14005, + "time_per_iteration": 2.52797794342041 + }, + { + "auxiliary_loss_clip": 0.01099832, + "auxiliary_loss_mlp": 0.01030415, + "balance_loss_clip": 1.01821995, + "balance_loss_mlp": 1.03331256, + "epoch": 0.8420862768675785, + "flos": 18763397176320.0, + "grad_norm": 1.6128094911025277, + "language_loss": 0.77796531, + "learning_rate": 2.5584356060758906e-07, + "loss": 0.79926783, + "num_input_tokens_seen": 302083190, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 14006, + "time_per_iteration": 2.4496371746063232 + }, + { + "auxiliary_loss_clip": 0.01099985, + "auxiliary_loss_mlp": 0.01033097, + "balance_loss_clip": 1.02158761, + "balance_loss_mlp": 1.03485298, + "epoch": 0.8421464001202466, + "flos": 18328052338560.0, + "grad_norm": 1.7960476944276447, + "language_loss": 0.76950121, + "learning_rate": 2.556530041751932e-07, + "loss": 0.79083204, + "num_input_tokens_seen": 302098820, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14007, + "time_per_iteration": 2.422987222671509 + }, + { + "auxiliary_loss_clip": 0.01100017, + "auxiliary_loss_mlp": 0.01028707, + "balance_loss_clip": 1.01662517, + "balance_loss_mlp": 1.03380418, + "epoch": 0.8422065233729145, + "flos": 31537181560320.0, + "grad_norm": 1.9383988075415828, + "language_loss": 0.65885502, + "learning_rate": 2.554625138886102e-07, + "loss": 0.68014228, + "num_input_tokens_seen": 302117075, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 14008, + "time_per_iteration": 2.5793111324310303 + }, + { + "auxiliary_loss_clip": 0.01022812, + "auxiliary_loss_mlp": 0.01000595, + "balance_loss_clip": 0.99958724, + "balance_loss_mlp": 1.00263965, + "epoch": 0.8422666466255825, + "flos": 64298128510080.0, + "grad_norm": 0.7113984001609904, + "language_loss": 0.56948996, + "learning_rate": 2.552720897550631e-07, + "loss": 0.589724, + "num_input_tokens_seen": 302179735, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20214844, + "step": 14009, + "time_per_iteration": 3.0907700061798096 + }, + { + "auxiliary_loss_clip": 0.01095048, + "auxiliary_loss_mlp": 0.01029218, + "balance_loss_clip": 1.01879275, + "balance_loss_mlp": 1.03142929, + "epoch": 0.8423267698782504, + "flos": 24316731377280.0, + "grad_norm": 1.8875851862493795, + "language_loss": 0.77928913, + "learning_rate": 2.5508173178177304e-07, + "loss": 0.80053174, + "num_input_tokens_seen": 302202055, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.63671875, + "step": 14010, + "time_per_iteration": 2.529472589492798 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01035285, + "balance_loss_clip": 1.02301264, + "balance_loss_mlp": 1.03620028, + "epoch": 0.8423868931309184, + "flos": 18296092212480.0, + "grad_norm": 1.6401509883189613, + "language_loss": 0.72421598, + "learning_rate": 2.548914399759592e-07, + "loss": 0.74560767, + "num_input_tokens_seen": 302221360, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 14011, + "time_per_iteration": 2.447643280029297 + }, + { + "auxiliary_loss_clip": 0.01098963, + "auxiliary_loss_mlp": 0.01034293, + "balance_loss_clip": 1.0230099, + "balance_loss_mlp": 1.03326452, + "epoch": 0.8424470163835863, + "flos": 23550218121600.0, + "grad_norm": 1.9531405231766128, + "language_loss": 0.84154844, + "learning_rate": 2.5470121434483636e-07, + "loss": 0.86288095, + "num_input_tokens_seen": 302240715, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 14012, + "time_per_iteration": 3.9144818782806396 + }, + { + "auxiliary_loss_clip": 0.01091927, + "auxiliary_loss_mlp": 0.01028828, + "balance_loss_clip": 1.0187788, + "balance_loss_mlp": 1.03218663, + "epoch": 0.8425071396362543, + "flos": 23769488695680.0, + "grad_norm": 1.723607660424782, + "language_loss": 0.6789465, + "learning_rate": 2.5451105489561884e-07, + "loss": 0.70015401, + "num_input_tokens_seen": 302260950, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.59765625, + "step": 14013, + "time_per_iteration": 5.320711612701416 + }, + { + "auxiliary_loss_clip": 0.01103875, + "auxiliary_loss_mlp": 0.01029273, + "balance_loss_clip": 1.01710737, + "balance_loss_mlp": 1.03486574, + "epoch": 0.8425672628889223, + "flos": 16178906816640.0, + "grad_norm": 2.2453340608922003, + "language_loss": 0.78587079, + "learning_rate": 2.5432096163551644e-07, + "loss": 0.80720234, + "num_input_tokens_seen": 302277500, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 14014, + "time_per_iteration": 2.436648368835449 + }, + { + "auxiliary_loss_clip": 0.01098868, + "auxiliary_loss_mlp": 0.0102707, + "balance_loss_clip": 1.0159297, + "balance_loss_mlp": 1.03430867, + "epoch": 0.8426273861415903, + "flos": 23149131880320.0, + "grad_norm": 1.6871681799127176, + "language_loss": 0.67591381, + "learning_rate": 2.5413093457173884e-07, + "loss": 0.69717318, + "num_input_tokens_seen": 302297930, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 14015, + "time_per_iteration": 2.471731185913086 + }, + { + "auxiliary_loss_clip": 0.01098465, + "auxiliary_loss_mlp": 0.01028558, + "balance_loss_clip": 1.01645255, + "balance_loss_mlp": 1.0342679, + "epoch": 0.8426875093942582, + "flos": 17457757712640.0, + "grad_norm": 2.6823702306015687, + "language_loss": 0.75894105, + "learning_rate": 2.5394097371149036e-07, + "loss": 0.78021133, + "num_input_tokens_seen": 302315735, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 14016, + "time_per_iteration": 2.5013680458068848 + }, + { + "auxiliary_loss_clip": 0.0109948, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.01901984, + "balance_loss_mlp": 1.03484464, + "epoch": 0.8427476326469262, + "flos": 19640551299840.0, + "grad_norm": 2.2337330694664264, + "language_loss": 0.79515624, + "learning_rate": 2.5375107906197544e-07, + "loss": 0.81645346, + "num_input_tokens_seen": 302332790, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14017, + "time_per_iteration": 3.9724068641662598 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.01912832, + "balance_loss_mlp": 1.03382218, + "epoch": 0.8428077558995941, + "flos": 11941160146560.0, + "grad_norm": 2.091031083104406, + "language_loss": 0.62672061, + "learning_rate": 2.5356125063039525e-07, + "loss": 0.64800781, + "num_input_tokens_seen": 302346490, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14018, + "time_per_iteration": 2.411105155944824 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.01033729, + "balance_loss_clip": 1.02257061, + "balance_loss_mlp": 1.03304076, + "epoch": 0.8428678791522621, + "flos": 10451729767680.0, + "grad_norm": 1.7561486170770395, + "language_loss": 0.79493165, + "learning_rate": 2.5337148842394687e-07, + "loss": 0.81625891, + "num_input_tokens_seen": 302363235, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14019, + "time_per_iteration": 2.4422781467437744 + }, + { + "auxiliary_loss_clip": 0.01100268, + "auxiliary_loss_mlp": 0.01028189, + "balance_loss_clip": 1.01616716, + "balance_loss_mlp": 1.03396749, + "epoch": 0.8429280024049302, + "flos": 28767248259840.0, + "grad_norm": 1.7295630345102972, + "language_loss": 0.78420174, + "learning_rate": 2.531817924498265e-07, + "loss": 0.80548632, + "num_input_tokens_seen": 302383270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 14020, + "time_per_iteration": 2.504492998123169 + }, + { + "auxiliary_loss_clip": 0.01101741, + "auxiliary_loss_mlp": 0.01026853, + "balance_loss_clip": 1.0155102, + "balance_loss_mlp": 1.03599119, + "epoch": 0.8429881256575981, + "flos": 19537093152000.0, + "grad_norm": 1.740406918389935, + "language_loss": 0.71201503, + "learning_rate": 2.5299216271522805e-07, + "loss": 0.73330098, + "num_input_tokens_seen": 302401355, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14021, + "time_per_iteration": 2.4488563537597656 + }, + { + "auxiliary_loss_clip": 0.01101185, + "auxiliary_loss_mlp": 0.01035627, + "balance_loss_clip": 1.02362275, + "balance_loss_mlp": 1.03446221, + "epoch": 0.8430482489102661, + "flos": 24790931752320.0, + "grad_norm": 1.60894104728434, + "language_loss": 0.69625163, + "learning_rate": 2.5280259922734125e-07, + "loss": 0.71761978, + "num_input_tokens_seen": 302419515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14022, + "time_per_iteration": 2.4879534244537354 + }, + { + "auxiliary_loss_clip": 0.01103852, + "auxiliary_loss_mlp": 0.01033331, + "balance_loss_clip": 1.02093339, + "balance_loss_mlp": 1.03596544, + "epoch": 0.843108372162934, + "flos": 21544248211200.0, + "grad_norm": 3.722986578619458, + "language_loss": 0.72199565, + "learning_rate": 2.526131019933553e-07, + "loss": 0.74336749, + "num_input_tokens_seen": 302438280, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14023, + "time_per_iteration": 2.440702199935913 + }, + { + "auxiliary_loss_clip": 0.01099861, + "auxiliary_loss_mlp": 0.01033807, + "balance_loss_clip": 1.02184463, + "balance_loss_mlp": 1.03477669, + "epoch": 0.843168495415602, + "flos": 24608792862720.0, + "grad_norm": 1.3509908984580676, + "language_loss": 0.66908002, + "learning_rate": 2.524236710204559e-07, + "loss": 0.69041669, + "num_input_tokens_seen": 302460860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 14024, + "time_per_iteration": 2.5194430351257324 + }, + { + "auxiliary_loss_clip": 0.01098712, + "auxiliary_loss_mlp": 0.01030558, + "balance_loss_clip": 1.01882803, + "balance_loss_mlp": 1.03412294, + "epoch": 0.8432286186682699, + "flos": 15122738286720.0, + "grad_norm": 1.8914070683276865, + "language_loss": 0.80512542, + "learning_rate": 2.522343063158261e-07, + "loss": 0.82641816, + "num_input_tokens_seen": 302476980, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 14025, + "time_per_iteration": 2.418902635574341 + }, + { + "auxiliary_loss_clip": 0.01096552, + "auxiliary_loss_mlp": 0.01029052, + "balance_loss_clip": 1.01918721, + "balance_loss_mlp": 1.03351688, + "epoch": 0.843288741920938, + "flos": 20301882554880.0, + "grad_norm": 1.4865758896664674, + "language_loss": 0.77659529, + "learning_rate": 2.5204500788664606e-07, + "loss": 0.79785132, + "num_input_tokens_seen": 302496380, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62890625, + "step": 14026, + "time_per_iteration": 2.4354312419891357 + }, + { + "auxiliary_loss_clip": 0.01099258, + "auxiliary_loss_mlp": 0.01031113, + "balance_loss_clip": 1.01970434, + "balance_loss_mlp": 1.03534245, + "epoch": 0.8433488651736059, + "flos": 23332096782720.0, + "grad_norm": 1.3796664446051232, + "language_loss": 0.82750577, + "learning_rate": 2.518557757400945e-07, + "loss": 0.84880948, + "num_input_tokens_seen": 302516845, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 14027, + "time_per_iteration": 2.5538077354431152 + }, + { + "auxiliary_loss_clip": 0.01098136, + "auxiliary_loss_mlp": 0.01031411, + "balance_loss_clip": 1.02059913, + "balance_loss_mlp": 1.0331111, + "epoch": 0.8434089884262739, + "flos": 39458105844480.0, + "grad_norm": 1.5166229721837947, + "language_loss": 0.56329119, + "learning_rate": 2.5166660988334754e-07, + "loss": 0.58458668, + "num_input_tokens_seen": 302538865, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 14028, + "time_per_iteration": 2.616518020629883 + }, + { + "auxiliary_loss_clip": 0.01098938, + "auxiliary_loss_mlp": 0.01026458, + "balance_loss_clip": 1.01574659, + "balance_loss_mlp": 1.03393281, + "epoch": 0.8434691116789418, + "flos": 23768842250880.0, + "grad_norm": 2.148426968087737, + "language_loss": 0.6371001, + "learning_rate": 2.51477510323578e-07, + "loss": 0.65835404, + "num_input_tokens_seen": 302557970, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 14029, + "time_per_iteration": 2.473363161087036 + }, + { + "auxiliary_loss_clip": 0.01096698, + "auxiliary_loss_mlp": 0.01028098, + "balance_loss_clip": 1.01784623, + "balance_loss_mlp": 1.03464794, + "epoch": 0.8435292349316098, + "flos": 22671411972480.0, + "grad_norm": 1.5285969366660268, + "language_loss": 0.75408536, + "learning_rate": 2.51288477067956e-07, + "loss": 0.77533334, + "num_input_tokens_seen": 302578915, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 14030, + "time_per_iteration": 2.454810380935669 + }, + { + "auxiliary_loss_clip": 0.01098147, + "auxiliary_loss_mlp": 0.01032966, + "balance_loss_clip": 1.02121234, + "balance_loss_mlp": 1.03436208, + "epoch": 0.8435893581842777, + "flos": 18843622202880.0, + "grad_norm": 1.7934754806619189, + "language_loss": 0.82908231, + "learning_rate": 2.510995101236502e-07, + "loss": 0.85039353, + "num_input_tokens_seen": 302596300, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 14031, + "time_per_iteration": 2.4392600059509277 + }, + { + "auxiliary_loss_clip": 0.01096568, + "auxiliary_loss_mlp": 0.01027224, + "balance_loss_clip": 1.01611948, + "balance_loss_mlp": 1.03294599, + "epoch": 0.8436494814369457, + "flos": 20704225772160.0, + "grad_norm": 4.6225832312305135, + "language_loss": 0.79887378, + "learning_rate": 2.509106094978266e-07, + "loss": 0.82011175, + "num_input_tokens_seen": 302614975, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 14032, + "time_per_iteration": 2.4791696071624756 + }, + { + "auxiliary_loss_clip": 0.01099257, + "auxiliary_loss_mlp": 0.01032483, + "balance_loss_clip": 1.019454, + "balance_loss_mlp": 1.03245103, + "epoch": 0.8437096046896138, + "flos": 22674177319680.0, + "grad_norm": 1.3665627856205167, + "language_loss": 0.75488985, + "learning_rate": 2.507217751976478e-07, + "loss": 0.77620721, + "num_input_tokens_seen": 302636415, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.66796875, + "step": 14033, + "time_per_iteration": 2.4676787853240967 + }, + { + "auxiliary_loss_clip": 0.01098, + "auxiliary_loss_mlp": 0.01032734, + "balance_loss_clip": 1.02204669, + "balance_loss_mlp": 1.03290701, + "epoch": 0.8437697279422817, + "flos": 16180127879040.0, + "grad_norm": 1.9639820469891438, + "language_loss": 0.83208501, + "learning_rate": 2.505330072302743e-07, + "loss": 0.85339236, + "num_input_tokens_seen": 302653605, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 14034, + "time_per_iteration": 2.469835042953491 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01023747, + "balance_loss_clip": 1.01229727, + "balance_loss_mlp": 1.03501594, + "epoch": 0.8438298511949497, + "flos": 28765847629440.0, + "grad_norm": 1.4504915159037657, + "language_loss": 0.7833904, + "learning_rate": 2.503443056028656e-07, + "loss": 0.80463088, + "num_input_tokens_seen": 302673965, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14035, + "time_per_iteration": 2.50130558013916 + }, + { + "auxiliary_loss_clip": 0.0109866, + "auxiliary_loss_mlp": 0.01028359, + "balance_loss_clip": 1.01701045, + "balance_loss_mlp": 1.03443027, + "epoch": 0.8438899744476176, + "flos": 33724284779520.0, + "grad_norm": 1.3198614132572242, + "language_loss": 0.72175288, + "learning_rate": 2.501556703225751e-07, + "loss": 0.74302304, + "num_input_tokens_seen": 302695560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14036, + "time_per_iteration": 2.5673165321350098 + }, + { + "auxiliary_loss_clip": 0.01094598, + "auxiliary_loss_mlp": 0.01025043, + "balance_loss_clip": 1.01530933, + "balance_loss_mlp": 1.03373456, + "epoch": 0.8439500977002856, + "flos": 25110787386240.0, + "grad_norm": 2.270504860744628, + "language_loss": 0.69560575, + "learning_rate": 2.49967101396557e-07, + "loss": 0.71680212, + "num_input_tokens_seen": 302713480, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.609375, + "step": 14037, + "time_per_iteration": 2.462125062942505 + }, + { + "auxiliary_loss_clip": 0.0109787, + "auxiliary_loss_mlp": 0.01023197, + "balance_loss_clip": 1.01211691, + "balance_loss_mlp": 1.03348804, + "epoch": 0.8440102209529535, + "flos": 32850362880000.0, + "grad_norm": 1.5784558103110167, + "language_loss": 0.68976426, + "learning_rate": 2.4977859883196227e-07, + "loss": 0.71097493, + "num_input_tokens_seen": 302736860, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 14038, + "time_per_iteration": 2.5513498783111572 + }, + { + "auxiliary_loss_clip": 0.01099747, + "auxiliary_loss_mlp": 0.01034477, + "balance_loss_clip": 1.02248418, + "balance_loss_mlp": 1.03365922, + "epoch": 0.8440703442056215, + "flos": 23730202195200.0, + "grad_norm": 1.6561315096706188, + "language_loss": 0.76345998, + "learning_rate": 2.49590162635938e-07, + "loss": 0.7848022, + "num_input_tokens_seen": 302757745, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 14039, + "time_per_iteration": 2.481635093688965 + }, + { + "auxiliary_loss_clip": 0.01104423, + "auxiliary_loss_mlp": 0.01025998, + "balance_loss_clip": 1.01443481, + "balance_loss_mlp": 1.03612375, + "epoch": 0.8441304674582895, + "flos": 20193719725440.0, + "grad_norm": 2.016141716862511, + "language_loss": 0.79202807, + "learning_rate": 2.4940179281563046e-07, + "loss": 0.81333232, + "num_input_tokens_seen": 302774885, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 14040, + "time_per_iteration": 2.501422166824341 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01031429, + "balance_loss_clip": 1.01974607, + "balance_loss_mlp": 1.03618717, + "epoch": 0.8441905907109575, + "flos": 20219897761920.0, + "grad_norm": 2.0550149763476093, + "language_loss": 0.69268221, + "learning_rate": 2.492134893781821e-07, + "loss": 0.71401882, + "num_input_tokens_seen": 302791035, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 14041, + "time_per_iteration": 2.4294750690460205 + }, + { + "auxiliary_loss_clip": 0.01102612, + "auxiliary_loss_mlp": 0.0102986, + "balance_loss_clip": 1.0185411, + "balance_loss_mlp": 1.03519189, + "epoch": 0.8442507139636254, + "flos": 13516453987200.0, + "grad_norm": 1.7597415592284222, + "language_loss": 0.69147003, + "learning_rate": 2.490252523307341e-07, + "loss": 0.71279472, + "num_input_tokens_seen": 302808650, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 14042, + "time_per_iteration": 2.442840814590454 + }, + { + "auxiliary_loss_clip": 0.01097842, + "auxiliary_loss_mlp": 0.01031745, + "balance_loss_clip": 1.02081347, + "balance_loss_mlp": 1.03461182, + "epoch": 0.8443108372162934, + "flos": 18220212731520.0, + "grad_norm": 1.6402432311205208, + "language_loss": 0.74725193, + "learning_rate": 2.4883708168042373e-07, + "loss": 0.76854777, + "num_input_tokens_seen": 302824605, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14043, + "time_per_iteration": 2.5701467990875244 + }, + { + "auxiliary_loss_clip": 0.01098174, + "auxiliary_loss_mlp": 0.01026595, + "balance_loss_clip": 1.01558638, + "balance_loss_mlp": 1.03430986, + "epoch": 0.8443709604689613, + "flos": 16105110324480.0, + "grad_norm": 2.0792624601455127, + "language_loss": 0.71829164, + "learning_rate": 2.486489774343865e-07, + "loss": 0.73953938, + "num_input_tokens_seen": 302840170, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 14044, + "time_per_iteration": 2.4172301292419434 + }, + { + "auxiliary_loss_clip": 0.01095955, + "auxiliary_loss_mlp": 0.01028701, + "balance_loss_clip": 1.01734626, + "balance_loss_mlp": 1.03243351, + "epoch": 0.8444310837216293, + "flos": 18512130562560.0, + "grad_norm": 1.5189186579429734, + "language_loss": 0.74687707, + "learning_rate": 2.484609395997559e-07, + "loss": 0.76812357, + "num_input_tokens_seen": 302858320, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 14045, + "time_per_iteration": 2.427867889404297 + }, + { + "auxiliary_loss_clip": 0.01098274, + "auxiliary_loss_mlp": 0.01030475, + "balance_loss_clip": 1.01913869, + "balance_loss_mlp": 1.03317916, + "epoch": 0.8444912069742974, + "flos": 14939845211520.0, + "grad_norm": 1.6309917453055534, + "language_loss": 0.78394771, + "learning_rate": 2.4827296818366216e-07, + "loss": 0.80523521, + "num_input_tokens_seen": 302875255, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14046, + "time_per_iteration": 2.4088222980499268 + }, + { + "auxiliary_loss_clip": 0.01101869, + "auxiliary_loss_mlp": 0.01030527, + "balance_loss_clip": 1.01861823, + "balance_loss_mlp": 1.03469396, + "epoch": 0.8445513302269653, + "flos": 20120318282880.0, + "grad_norm": 1.9427954061525838, + "language_loss": 0.7794674, + "learning_rate": 2.4808506319323255e-07, + "loss": 0.80079138, + "num_input_tokens_seen": 302894690, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 14047, + "time_per_iteration": 2.4330637454986572 + }, + { + "auxiliary_loss_clip": 0.01100445, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.01727068, + "balance_loss_mlp": 1.03626013, + "epoch": 0.8446114534796333, + "flos": 31170928533120.0, + "grad_norm": 1.713262783377482, + "language_loss": 0.71858978, + "learning_rate": 2.478972246355935e-07, + "loss": 0.73988628, + "num_input_tokens_seen": 302912405, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 14048, + "time_per_iteration": 2.5051729679107666 + }, + { + "auxiliary_loss_clip": 0.01099733, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01973367, + "balance_loss_mlp": 1.03443313, + "epoch": 0.8446715767323012, + "flos": 23948323534080.0, + "grad_norm": 1.9354286067009534, + "language_loss": 0.73582602, + "learning_rate": 2.477094525178667e-07, + "loss": 0.75713634, + "num_input_tokens_seen": 302932525, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 14049, + "time_per_iteration": 2.4543259143829346 + }, + { + "auxiliary_loss_clip": 0.01022781, + "auxiliary_loss_mlp": 0.00999339, + "balance_loss_clip": 0.99834388, + "balance_loss_mlp": 1.00275576, + "epoch": 0.8447316999849692, + "flos": 67984897484160.0, + "grad_norm": 0.8078250011122586, + "language_loss": 0.60653841, + "learning_rate": 2.475217468471729e-07, + "loss": 0.62675965, + "num_input_tokens_seen": 302991285, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20019531, + "step": 14050, + "time_per_iteration": 2.9804251194000244 + }, + { + "auxiliary_loss_clip": 0.01097821, + "auxiliary_loss_mlp": 0.01029422, + "balance_loss_clip": 1.01737618, + "balance_loss_mlp": 1.03271341, + "epoch": 0.8447918232376371, + "flos": 22418924296320.0, + "grad_norm": 3.49479551702144, + "language_loss": 0.72012359, + "learning_rate": 2.473341076306303e-07, + "loss": 0.74139607, + "num_input_tokens_seen": 303009515, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 14051, + "time_per_iteration": 2.484506368637085 + }, + { + "auxiliary_loss_clip": 0.01097541, + "auxiliary_loss_mlp": 0.01024468, + "balance_loss_clip": 1.01300573, + "balance_loss_mlp": 1.03342485, + "epoch": 0.8448519464903052, + "flos": 23694147918720.0, + "grad_norm": 1.9860085309394724, + "language_loss": 0.74646604, + "learning_rate": 2.471465348753547e-07, + "loss": 0.76768613, + "num_input_tokens_seen": 303026905, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 14052, + "time_per_iteration": 2.480921506881714 + }, + { + "auxiliary_loss_clip": 0.01092244, + "auxiliary_loss_mlp": 0.01026184, + "balance_loss_clip": 1.01610494, + "balance_loss_mlp": 1.03236473, + "epoch": 0.8449120697429731, + "flos": 13735904129280.0, + "grad_norm": 1.7502333612228071, + "language_loss": 0.7411198, + "learning_rate": 2.469590285884575e-07, + "loss": 0.76230407, + "num_input_tokens_seen": 303045245, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.59765625, + "step": 14053, + "time_per_iteration": 3.8867318630218506 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.01445961, + "balance_loss_mlp": 1.03528714, + "epoch": 0.8449721929956411, + "flos": 20886795624960.0, + "grad_norm": 1.6433817636003443, + "language_loss": 0.74101913, + "learning_rate": 2.467715887770494e-07, + "loss": 0.76226318, + "num_input_tokens_seen": 303065205, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14054, + "time_per_iteration": 2.4648666381835938 + }, + { + "auxiliary_loss_clip": 0.01103393, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01794338, + "balance_loss_mlp": 1.03570211, + "epoch": 0.845032316248309, + "flos": 33216939129600.0, + "grad_norm": 1.4033979981207616, + "language_loss": 0.78469646, + "learning_rate": 2.4658421544823895e-07, + "loss": 0.80602658, + "num_input_tokens_seen": 303088250, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 14055, + "time_per_iteration": 5.422392845153809 + }, + { + "auxiliary_loss_clip": 0.01097429, + "auxiliary_loss_mlp": 0.01026244, + "balance_loss_clip": 1.01544404, + "balance_loss_mlp": 1.0341053, + "epoch": 0.845092439500977, + "flos": 23585230903680.0, + "grad_norm": 1.6864328278526126, + "language_loss": 0.72890306, + "learning_rate": 2.463969086091302e-07, + "loss": 0.75013983, + "num_input_tokens_seen": 303109280, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 14056, + "time_per_iteration": 2.477959394454956 + }, + { + "auxiliary_loss_clip": 0.01105764, + "auxiliary_loss_mlp": 0.01032313, + "balance_loss_clip": 1.02083945, + "balance_loss_mlp": 1.03714287, + "epoch": 0.8451525627536449, + "flos": 13333920048000.0, + "grad_norm": 2.258325426074488, + "language_loss": 0.67587829, + "learning_rate": 2.4620966826682686e-07, + "loss": 0.69725907, + "num_input_tokens_seen": 303126075, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6875, + "step": 14057, + "time_per_iteration": 2.414092779159546 + }, + { + "auxiliary_loss_clip": 0.01101571, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.01612806, + "balance_loss_mlp": 1.03506553, + "epoch": 0.8452126860063129, + "flos": 27817985583360.0, + "grad_norm": 1.7994228407078163, + "language_loss": 0.77547145, + "learning_rate": 2.460224944284284e-07, + "loss": 0.79676348, + "num_input_tokens_seen": 303146920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 14058, + "time_per_iteration": 2.4836339950561523 + }, + { + "auxiliary_loss_clip": 0.01099526, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.0203855, + "balance_loss_mlp": 1.03322566, + "epoch": 0.845272809258981, + "flos": 27124694202240.0, + "grad_norm": 1.4871834521741678, + "language_loss": 0.69746482, + "learning_rate": 2.45835387101033e-07, + "loss": 0.71877742, + "num_input_tokens_seen": 303167885, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14059, + "time_per_iteration": 3.935227155685425 + }, + { + "auxiliary_loss_clip": 0.01103214, + "auxiliary_loss_mlp": 0.01034961, + "balance_loss_clip": 1.02245557, + "balance_loss_mlp": 1.03492641, + "epoch": 0.8453329325116489, + "flos": 18332577452160.0, + "grad_norm": 1.763536728638446, + "language_loss": 0.57535338, + "learning_rate": 2.4564834629173516e-07, + "loss": 0.59673512, + "num_input_tokens_seen": 303185000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 14060, + "time_per_iteration": 2.472986936569214 + }, + { + "auxiliary_loss_clip": 0.01101496, + "auxiliary_loss_mlp": 0.010352, + "balance_loss_clip": 1.02237284, + "balance_loss_mlp": 1.03304076, + "epoch": 0.8453930557643169, + "flos": 22675254727680.0, + "grad_norm": 1.5294645823993187, + "language_loss": 0.75755733, + "learning_rate": 2.454613720076277e-07, + "loss": 0.77892435, + "num_input_tokens_seen": 303205210, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 14061, + "time_per_iteration": 2.459678888320923 + }, + { + "auxiliary_loss_clip": 0.01101612, + "auxiliary_loss_mlp": 0.01027088, + "balance_loss_clip": 1.01516151, + "balance_loss_mlp": 1.03408587, + "epoch": 0.8454531790169848, + "flos": 22487261921280.0, + "grad_norm": 2.0519400058397066, + "language_loss": 0.7084868, + "learning_rate": 2.452744642558013e-07, + "loss": 0.72977388, + "num_input_tokens_seen": 303224655, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 14062, + "time_per_iteration": 2.4579243659973145 + }, + { + "auxiliary_loss_clip": 0.01022787, + "auxiliary_loss_mlp": 0.01001721, + "balance_loss_clip": 1.00071383, + "balance_loss_mlp": 1.00271332, + "epoch": 0.8455133022696528, + "flos": 58277848481280.0, + "grad_norm": 0.6326686900336163, + "language_loss": 0.52631342, + "learning_rate": 2.450876230433432e-07, + "loss": 0.5465585, + "num_input_tokens_seen": 303289645, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 14063, + "time_per_iteration": 3.0987555980682373 + }, + { + "auxiliary_loss_clip": 0.01096616, + "auxiliary_loss_mlp": 0.01023156, + "balance_loss_clip": 1.01282668, + "balance_loss_mlp": 1.03490078, + "epoch": 0.8455734255223207, + "flos": 21361283308800.0, + "grad_norm": 1.7407026281004632, + "language_loss": 0.81590897, + "learning_rate": 2.449008483773378e-07, + "loss": 0.8371067, + "num_input_tokens_seen": 303308350, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6171875, + "step": 14064, + "time_per_iteration": 2.430516481399536 + }, + { + "auxiliary_loss_clip": 0.01103129, + "auxiliary_loss_mlp": 0.01030903, + "balance_loss_clip": 1.01860666, + "balance_loss_mlp": 1.0363518, + "epoch": 0.8456335487749888, + "flos": 20449260057600.0, + "grad_norm": 1.8631632297123397, + "language_loss": 0.72349954, + "learning_rate": 2.447141402648685e-07, + "loss": 0.74483991, + "num_input_tokens_seen": 303325230, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 14065, + "time_per_iteration": 2.4640002250671387 + }, + { + "auxiliary_loss_clip": 0.01098256, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.0159936, + "balance_loss_mlp": 1.03512609, + "epoch": 0.8456936720276567, + "flos": 28840901097600.0, + "grad_norm": 1.498676898240102, + "language_loss": 0.77308834, + "learning_rate": 2.445274987130146e-07, + "loss": 0.79434228, + "num_input_tokens_seen": 303345810, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 14066, + "time_per_iteration": 2.506878614425659 + }, + { + "auxiliary_loss_clip": 0.01101195, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.01635194, + "balance_loss_mlp": 1.03654039, + "epoch": 0.8457537952803247, + "flos": 22672884430080.0, + "grad_norm": 1.821364194037934, + "language_loss": 0.70122147, + "learning_rate": 2.4434092372885363e-07, + "loss": 0.72251445, + "num_input_tokens_seen": 303365140, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 14067, + "time_per_iteration": 2.476407051086426 + }, + { + "auxiliary_loss_clip": 0.01096849, + "auxiliary_loss_mlp": 0.01025297, + "balance_loss_clip": 1.01409197, + "balance_loss_mlp": 1.03197587, + "epoch": 0.8458139185329926, + "flos": 33802929607680.0, + "grad_norm": 6.126876803000313, + "language_loss": 0.7123543, + "learning_rate": 2.4415441531946144e-07, + "loss": 0.7335757, + "num_input_tokens_seen": 303386150, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 14068, + "time_per_iteration": 2.541780948638916 + }, + { + "auxiliary_loss_clip": 0.01022903, + "auxiliary_loss_mlp": 0.00998547, + "balance_loss_clip": 0.997522, + "balance_loss_mlp": 1.00272095, + "epoch": 0.8458740417856606, + "flos": 70295929603200.0, + "grad_norm": 0.6989192180305637, + "language_loss": 0.60597819, + "learning_rate": 2.4396797349190976e-07, + "loss": 0.62619269, + "num_input_tokens_seen": 303453770, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20214844, + "step": 14069, + "time_per_iteration": 3.1510071754455566 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.01657152, + "balance_loss_mlp": 1.03431201, + "epoch": 0.8459341650383285, + "flos": 24170862245760.0, + "grad_norm": 1.677018602985038, + "language_loss": 0.74419677, + "learning_rate": 2.4378159825326804e-07, + "loss": 0.76547223, + "num_input_tokens_seen": 303474520, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 14070, + "time_per_iteration": 2.4841506481170654 + }, + { + "auxiliary_loss_clip": 0.01099817, + "auxiliary_loss_mlp": 0.01033098, + "balance_loss_clip": 1.0215106, + "balance_loss_mlp": 1.03530526, + "epoch": 0.8459942882909965, + "flos": 38181158369280.0, + "grad_norm": 1.6241169837210538, + "language_loss": 0.66860032, + "learning_rate": 2.435952896106039e-07, + "loss": 0.68992949, + "num_input_tokens_seen": 303497345, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 14071, + "time_per_iteration": 2.594825267791748 + }, + { + "auxiliary_loss_clip": 0.01022635, + "auxiliary_loss_mlp": 0.0099954, + "balance_loss_clip": 0.99856204, + "balance_loss_mlp": 1.00254095, + "epoch": 0.8460544115436646, + "flos": 64118252177280.0, + "grad_norm": 0.7322891811634097, + "language_loss": 0.60995638, + "learning_rate": 2.4340904757098313e-07, + "loss": 0.63017821, + "num_input_tokens_seen": 303554890, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.20117188, + "step": 14072, + "time_per_iteration": 2.906951427459717 + }, + { + "auxiliary_loss_clip": 0.01101338, + "auxiliary_loss_mlp": 0.01031787, + "balance_loss_clip": 1.01863873, + "balance_loss_mlp": 1.03404236, + "epoch": 0.8461145347963325, + "flos": 24170826332160.0, + "grad_norm": 1.7449520639436589, + "language_loss": 0.72158128, + "learning_rate": 2.4322287214146664e-07, + "loss": 0.74291253, + "num_input_tokens_seen": 303574380, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 14073, + "time_per_iteration": 2.491899013519287 + }, + { + "auxiliary_loss_clip": 0.011067, + "auxiliary_loss_mlp": 0.01034402, + "balance_loss_clip": 1.02137196, + "balance_loss_mlp": 1.03658199, + "epoch": 0.8461746580490005, + "flos": 34893787697280.0, + "grad_norm": 2.211703240876086, + "language_loss": 0.78310221, + "learning_rate": 2.430367633291155e-07, + "loss": 0.80451322, + "num_input_tokens_seen": 303594910, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 14074, + "time_per_iteration": 2.5973291397094727 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.01030857, + "balance_loss_clip": 1.01937723, + "balance_loss_mlp": 1.03654218, + "epoch": 0.8462347813016684, + "flos": 25557014044800.0, + "grad_norm": 1.9799370549513835, + "language_loss": 0.75153923, + "learning_rate": 2.4285072114098583e-07, + "loss": 0.77286798, + "num_input_tokens_seen": 303613520, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 14075, + "time_per_iteration": 2.5073180198669434 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01028304, + "balance_loss_clip": 1.01677096, + "balance_loss_mlp": 1.03368163, + "epoch": 0.8462949045543364, + "flos": 21325336773120.0, + "grad_norm": 3.0035376812832966, + "language_loss": 0.73357224, + "learning_rate": 2.4266474558413355e-07, + "loss": 0.7548393, + "num_input_tokens_seen": 303631225, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14076, + "time_per_iteration": 2.493821859359741 + }, + { + "auxiliary_loss_clip": 0.0110366, + "auxiliary_loss_mlp": 0.01033318, + "balance_loss_clip": 1.02139723, + "balance_loss_mlp": 1.03518665, + "epoch": 0.8463550278070043, + "flos": 22637440684800.0, + "grad_norm": 1.8749402802311503, + "language_loss": 0.77490556, + "learning_rate": 2.4247883666560945e-07, + "loss": 0.79627538, + "num_input_tokens_seen": 303649175, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 14077, + "time_per_iteration": 2.4984679222106934 + }, + { + "auxiliary_loss_clip": 0.01104786, + "auxiliary_loss_mlp": 0.01033509, + "balance_loss_clip": 1.02190399, + "balance_loss_mlp": 1.03648067, + "epoch": 0.8464151510596724, + "flos": 13005588804480.0, + "grad_norm": 2.450375908672133, + "language_loss": 0.75225329, + "learning_rate": 2.422929943924643e-07, + "loss": 0.77363622, + "num_input_tokens_seen": 303665915, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.68359375, + "step": 14078, + "time_per_iteration": 2.474865436553955 + }, + { + "auxiliary_loss_clip": 0.01097121, + "auxiliary_loss_mlp": 0.01024557, + "balance_loss_clip": 1.01263642, + "balance_loss_mlp": 1.03324652, + "epoch": 0.8464752743123403, + "flos": 15704921923200.0, + "grad_norm": 2.129652796655016, + "language_loss": 0.85099643, + "learning_rate": 2.4210721877174565e-07, + "loss": 0.87221324, + "num_input_tokens_seen": 303679985, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 14079, + "time_per_iteration": 2.448030471801758 + }, + { + "auxiliary_loss_clip": 0.0110791, + "auxiliary_loss_mlp": 0.01034332, + "balance_loss_clip": 1.02166057, + "balance_loss_mlp": 1.03643155, + "epoch": 0.8465353975650083, + "flos": 21653955325440.0, + "grad_norm": 2.083587711277689, + "language_loss": 0.58946401, + "learning_rate": 2.419215098104965e-07, + "loss": 0.61088645, + "num_input_tokens_seen": 303698470, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.7109375, + "step": 14080, + "time_per_iteration": 2.477292776107788 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.01030543, + "balance_loss_clip": 1.01849711, + "balance_loss_mlp": 1.03358364, + "epoch": 0.8465955208176762, + "flos": 18515650095360.0, + "grad_norm": 2.2918439013364615, + "language_loss": 0.66583252, + "learning_rate": 2.4173586751576014e-07, + "loss": 0.68715435, + "num_input_tokens_seen": 303716415, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 14081, + "time_per_iteration": 2.496119976043701 + }, + { + "auxiliary_loss_clip": 0.011022, + "auxiliary_loss_mlp": 0.01027784, + "balance_loss_clip": 1.01699603, + "balance_loss_mlp": 1.03520298, + "epoch": 0.8466556440703442, + "flos": 24200559815040.0, + "grad_norm": 1.6481433281292062, + "language_loss": 0.73019934, + "learning_rate": 2.41550291894576e-07, + "loss": 0.75149918, + "num_input_tokens_seen": 303734490, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.671875, + "step": 14082, + "time_per_iteration": 2.516955614089966 + }, + { + "auxiliary_loss_clip": 0.01100527, + "auxiliary_loss_mlp": 0.01026426, + "balance_loss_clip": 1.01518404, + "balance_loss_mlp": 1.0332799, + "epoch": 0.8467157673230121, + "flos": 20375894528640.0, + "grad_norm": 1.8611445286872557, + "language_loss": 0.75691915, + "learning_rate": 2.413647829539809e-07, + "loss": 0.77818871, + "num_input_tokens_seen": 303752310, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 14083, + "time_per_iteration": 2.443368673324585 + }, + { + "auxiliary_loss_clip": 0.01103347, + "auxiliary_loss_mlp": 0.01032333, + "balance_loss_clip": 1.01958406, + "balance_loss_mlp": 1.03421068, + "epoch": 0.8467758905756801, + "flos": 28473642489600.0, + "grad_norm": 1.9018584016547608, + "language_loss": 0.66331363, + "learning_rate": 2.411793407010092e-07, + "loss": 0.68467045, + "num_input_tokens_seen": 303776065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 14084, + "time_per_iteration": 2.5169622898101807 + }, + { + "auxiliary_loss_clip": 0.01103562, + "auxiliary_loss_mlp": 0.01030247, + "balance_loss_clip": 1.01891565, + "balance_loss_mlp": 1.03716993, + "epoch": 0.8468360138283482, + "flos": 11692551139200.0, + "grad_norm": 1.8997830806116998, + "language_loss": 0.69932806, + "learning_rate": 2.409939651426938e-07, + "loss": 0.72066617, + "num_input_tokens_seen": 303793500, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14085, + "time_per_iteration": 2.4265129566192627 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01743662, + "balance_loss_mlp": 1.03297186, + "epoch": 0.8468961370810161, + "flos": 24607859109120.0, + "grad_norm": 1.5697280005670382, + "language_loss": 0.71030748, + "learning_rate": 2.408086562860634e-07, + "loss": 0.73158979, + "num_input_tokens_seen": 303814835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 14086, + "time_per_iteration": 2.5099709033966064 + }, + { + "auxiliary_loss_clip": 0.01099375, + "auxiliary_loss_mlp": 0.0102745, + "balance_loss_clip": 1.01607704, + "balance_loss_mlp": 1.034688, + "epoch": 0.8469562603336841, + "flos": 19609812236160.0, + "grad_norm": 1.65755050150048, + "language_loss": 0.75040638, + "learning_rate": 2.4062341413814445e-07, + "loss": 0.77167463, + "num_input_tokens_seen": 303834505, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 14087, + "time_per_iteration": 2.508406639099121 + }, + { + "auxiliary_loss_clip": 0.01099258, + "auxiliary_loss_mlp": 0.01023821, + "balance_loss_clip": 1.01230574, + "balance_loss_mlp": 1.03506553, + "epoch": 0.847016383586352, + "flos": 22638949056000.0, + "grad_norm": 1.3368514111731342, + "language_loss": 0.7384972, + "learning_rate": 2.4043823870596227e-07, + "loss": 0.75972795, + "num_input_tokens_seen": 303855050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14088, + "time_per_iteration": 2.532632827758789 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01034515, + "balance_loss_clip": 1.02231431, + "balance_loss_mlp": 1.03481674, + "epoch": 0.84707650683902, + "flos": 20960161153920.0, + "grad_norm": 1.8296611825235667, + "language_loss": 0.7228626, + "learning_rate": 2.402531299965387e-07, + "loss": 0.74422771, + "num_input_tokens_seen": 303875635, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 14089, + "time_per_iteration": 2.451185703277588 + }, + { + "auxiliary_loss_clip": 0.01097339, + "auxiliary_loss_mlp": 0.0102571, + "balance_loss_clip": 1.0147543, + "balance_loss_mlp": 1.03443742, + "epoch": 0.8471366300916879, + "flos": 24093007516800.0, + "grad_norm": 1.409397008066916, + "language_loss": 0.79183906, + "learning_rate": 2.400680880168928e-07, + "loss": 0.81306958, + "num_input_tokens_seen": 303896750, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 14090, + "time_per_iteration": 2.4930338859558105 + }, + { + "auxiliary_loss_clip": 0.01101917, + "auxiliary_loss_mlp": 0.01038153, + "balance_loss_clip": 1.02512336, + "balance_loss_mlp": 1.03417242, + "epoch": 0.847196753344356, + "flos": 18332900674560.0, + "grad_norm": 1.8243760305256211, + "language_loss": 0.7671752, + "learning_rate": 2.3988311277404085e-07, + "loss": 0.78857589, + "num_input_tokens_seen": 303915435, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 14091, + "time_per_iteration": 2.4504411220550537 + }, + { + "auxiliary_loss_clip": 0.01022805, + "auxiliary_loss_mlp": 0.01001176, + "balance_loss_clip": 1.00027013, + "balance_loss_mlp": 1.0028131, + "epoch": 0.8472568765970239, + "flos": 49567536956160.0, + "grad_norm": 0.8882896911860697, + "language_loss": 0.5941655, + "learning_rate": 2.396982042749982e-07, + "loss": 0.61440521, + "num_input_tokens_seen": 303977245, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20019531, + "step": 14092, + "time_per_iteration": 3.120185613632202 + }, + { + "auxiliary_loss_clip": 0.0109951, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.02101243, + "balance_loss_mlp": 1.03303409, + "epoch": 0.8473169998496919, + "flos": 19279074781440.0, + "grad_norm": 1.9515587052463406, + "language_loss": 0.70222908, + "learning_rate": 2.395133625267756e-07, + "loss": 0.72355801, + "num_input_tokens_seen": 303996055, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 14093, + "time_per_iteration": 2.437931537628174 + }, + { + "auxiliary_loss_clip": 0.01095773, + "auxiliary_loss_mlp": 0.0102644, + "balance_loss_clip": 1.01558602, + "balance_loss_mlp": 1.0323596, + "epoch": 0.8473771231023598, + "flos": 17675555829120.0, + "grad_norm": 2.0380428061341176, + "language_loss": 0.83106399, + "learning_rate": 2.3932858753638263e-07, + "loss": 0.8522861, + "num_input_tokens_seen": 304012205, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 14094, + "time_per_iteration": 2.4864912033081055 + }, + { + "auxiliary_loss_clip": 0.01096593, + "auxiliary_loss_mlp": 0.01030533, + "balance_loss_clip": 1.01932764, + "balance_loss_mlp": 1.0341723, + "epoch": 0.8474372463550278, + "flos": 26359761144960.0, + "grad_norm": 1.819291360487763, + "language_loss": 0.71216273, + "learning_rate": 2.3914387931082626e-07, + "loss": 0.73343396, + "num_input_tokens_seen": 304033475, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.625, + "step": 14095, + "time_per_iteration": 3.993910074234009 + }, + { + "auxiliary_loss_clip": 0.01097533, + "auxiliary_loss_mlp": 0.01032238, + "balance_loss_clip": 1.02124119, + "balance_loss_mlp": 1.03403068, + "epoch": 0.8474973696076957, + "flos": 23402050519680.0, + "grad_norm": 1.825062729979651, + "language_loss": 0.81036246, + "learning_rate": 2.3895923785711105e-07, + "loss": 0.83166021, + "num_input_tokens_seen": 304051845, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14096, + "time_per_iteration": 2.4629123210906982 + }, + { + "auxiliary_loss_clip": 0.01102093, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.018502, + "balance_loss_mlp": 1.03426218, + "epoch": 0.8475574928603637, + "flos": 25075666863360.0, + "grad_norm": 1.7173304527210933, + "language_loss": 0.77041292, + "learning_rate": 2.387746631822374e-07, + "loss": 0.79174817, + "num_input_tokens_seen": 304069965, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 14097, + "time_per_iteration": 3.9817750453948975 + }, + { + "auxiliary_loss_clip": 0.01099142, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.01546907, + "balance_loss_mlp": 1.03521109, + "epoch": 0.8476176161130318, + "flos": 19966691813760.0, + "grad_norm": 1.704153380132331, + "language_loss": 0.8026402, + "learning_rate": 2.385901552932048e-07, + "loss": 0.82389653, + "num_input_tokens_seen": 304086805, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14098, + "time_per_iteration": 2.429412603378296 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01887143, + "balance_loss_mlp": 1.03510809, + "epoch": 0.8476777393656997, + "flos": 21285834791040.0, + "grad_norm": 1.8292147668757888, + "language_loss": 0.71778166, + "learning_rate": 2.3840571419701062e-07, + "loss": 0.73908365, + "num_input_tokens_seen": 304105865, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.64453125, + "step": 14099, + "time_per_iteration": 2.4910991191864014 + }, + { + "auxiliary_loss_clip": 0.01099079, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.01675868, + "balance_loss_mlp": 1.03380799, + "epoch": 0.8477378626183677, + "flos": 29971476650880.0, + "grad_norm": 1.9479832989077739, + "language_loss": 0.63951457, + "learning_rate": 2.3822133990064787e-07, + "loss": 0.6608004, + "num_input_tokens_seen": 304128300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.65234375, + "step": 14100, + "time_per_iteration": 3.935777425765991 + }, + { + "auxiliary_loss_clip": 0.01102751, + "auxiliary_loss_mlp": 0.01030884, + "balance_loss_clip": 1.0190171, + "balance_loss_mlp": 1.03465199, + "epoch": 0.8477979858710356, + "flos": 24237727413120.0, + "grad_norm": 1.8887642813622785, + "language_loss": 0.73411292, + "learning_rate": 2.380370324111085e-07, + "loss": 0.75544924, + "num_input_tokens_seen": 304143695, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 14101, + "time_per_iteration": 2.4780516624450684 + }, + { + "auxiliary_loss_clip": 0.01099179, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.01664448, + "balance_loss_mlp": 1.03323436, + "epoch": 0.8478581091237036, + "flos": 25593678852480.0, + "grad_norm": 1.5929891026522867, + "language_loss": 0.71019483, + "learning_rate": 2.3785279173538163e-07, + "loss": 0.73146498, + "num_input_tokens_seen": 304165800, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14102, + "time_per_iteration": 2.487032651901245 + }, + { + "auxiliary_loss_clip": 0.01104118, + "auxiliary_loss_mlp": 0.01030554, + "balance_loss_clip": 1.01804352, + "balance_loss_mlp": 1.03573275, + "epoch": 0.8479182323763715, + "flos": 12057116227200.0, + "grad_norm": 2.358108614100406, + "language_loss": 0.81502283, + "learning_rate": 2.3766861788045366e-07, + "loss": 0.83636951, + "num_input_tokens_seen": 304182910, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 14103, + "time_per_iteration": 2.421996831893921 + }, + { + "auxiliary_loss_clip": 0.01100518, + "auxiliary_loss_mlp": 0.01028635, + "balance_loss_clip": 1.01750135, + "balance_loss_mlp": 1.03581166, + "epoch": 0.8479783556290396, + "flos": 21433391861760.0, + "grad_norm": 5.145058817930484, + "language_loss": 0.78646743, + "learning_rate": 2.374845108533079e-07, + "loss": 0.80775893, + "num_input_tokens_seen": 304200175, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 14104, + "time_per_iteration": 2.439422607421875 + }, + { + "auxiliary_loss_clip": 0.01103668, + "auxiliary_loss_mlp": 0.01034723, + "balance_loss_clip": 1.0222832, + "balance_loss_mlp": 1.03649271, + "epoch": 0.8480384788817075, + "flos": 19642634288640.0, + "grad_norm": 2.160459351440593, + "language_loss": 0.78862703, + "learning_rate": 2.3730047066092607e-07, + "loss": 0.81001097, + "num_input_tokens_seen": 304217775, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14105, + "time_per_iteration": 2.460575580596924 + }, + { + "auxiliary_loss_clip": 0.0110572, + "auxiliary_loss_mlp": 0.01028511, + "balance_loss_clip": 1.01601195, + "balance_loss_mlp": 1.03611135, + "epoch": 0.8480986021343755, + "flos": 22489201255680.0, + "grad_norm": 2.266475836383515, + "language_loss": 0.50339055, + "learning_rate": 2.3711649731028749e-07, + "loss": 0.52473295, + "num_input_tokens_seen": 304235760, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 14106, + "time_per_iteration": 2.4719579219818115 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.01833999, + "balance_loss_mlp": 1.03461707, + "epoch": 0.8481587253870434, + "flos": 22090557139200.0, + "grad_norm": 3.1041228706875006, + "language_loss": 0.75183088, + "learning_rate": 2.3693259080836792e-07, + "loss": 0.77313209, + "num_input_tokens_seen": 304253985, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14107, + "time_per_iteration": 2.4628798961639404 + }, + { + "auxiliary_loss_clip": 0.01099676, + "auxiliary_loss_mlp": 0.01025265, + "balance_loss_clip": 1.01383924, + "balance_loss_mlp": 1.03401649, + "epoch": 0.8482188486397114, + "flos": 33582689366400.0, + "grad_norm": 1.5739400840020021, + "language_loss": 0.73535973, + "learning_rate": 2.3674875116214087e-07, + "loss": 0.75660914, + "num_input_tokens_seen": 304276785, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14108, + "time_per_iteration": 2.5669243335723877 + }, + { + "auxiliary_loss_clip": 0.01096623, + "auxiliary_loss_mlp": 0.01024357, + "balance_loss_clip": 1.01170897, + "balance_loss_mlp": 1.03377414, + "epoch": 0.8482789718923793, + "flos": 20919402195840.0, + "grad_norm": 1.5922504607484824, + "language_loss": 0.72592628, + "learning_rate": 2.3656497837857836e-07, + "loss": 0.74713612, + "num_input_tokens_seen": 304296310, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.62890625, + "step": 14109, + "time_per_iteration": 2.5060198307037354 + }, + { + "auxiliary_loss_clip": 0.01098271, + "auxiliary_loss_mlp": 0.01031455, + "balance_loss_clip": 1.01955819, + "balance_loss_mlp": 1.03390074, + "epoch": 0.8483390951450474, + "flos": 12896204912640.0, + "grad_norm": 3.767796644059149, + "language_loss": 0.73706329, + "learning_rate": 2.3638127246464811e-07, + "loss": 0.75836062, + "num_input_tokens_seen": 304311715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 14110, + "time_per_iteration": 2.4130804538726807 + }, + { + "auxiliary_loss_clip": 0.01100273, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.01855469, + "balance_loss_mlp": 1.0350523, + "epoch": 0.8483992183977154, + "flos": 25081628520960.0, + "grad_norm": 1.6650304644780962, + "language_loss": 0.76256633, + "learning_rate": 2.3619763342731658e-07, + "loss": 0.78386492, + "num_input_tokens_seen": 304331910, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 14111, + "time_per_iteration": 2.5073182582855225 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.01026896, + "balance_loss_clip": 1.0163821, + "balance_loss_mlp": 1.03553009, + "epoch": 0.8484593416503833, + "flos": 25557445008000.0, + "grad_norm": 2.146588473142915, + "language_loss": 0.67528129, + "learning_rate": 2.3601406127354772e-07, + "loss": 0.69654226, + "num_input_tokens_seen": 304351405, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 14112, + "time_per_iteration": 2.54217267036438 + }, + { + "auxiliary_loss_clip": 0.01099478, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.01754332, + "balance_loss_mlp": 1.03301573, + "epoch": 0.8485194649030513, + "flos": 27198454780800.0, + "grad_norm": 1.4581760668083692, + "language_loss": 0.73855281, + "learning_rate": 2.3583055601030312e-07, + "loss": 0.75983667, + "num_input_tokens_seen": 304372935, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14113, + "time_per_iteration": 2.5123813152313232 + }, + { + "auxiliary_loss_clip": 0.01099678, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.02015352, + "balance_loss_mlp": 1.03446484, + "epoch": 0.8485795881557192, + "flos": 24205910941440.0, + "grad_norm": 2.114125636315057, + "language_loss": 0.66483456, + "learning_rate": 2.3564711764454003e-07, + "loss": 0.68614984, + "num_input_tokens_seen": 304393070, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14114, + "time_per_iteration": 2.4642868041992188 + }, + { + "auxiliary_loss_clip": 0.01102052, + "auxiliary_loss_mlp": 0.01031511, + "balance_loss_clip": 1.01924419, + "balance_loss_mlp": 1.03529406, + "epoch": 0.8486397114083872, + "flos": 21141653598720.0, + "grad_norm": 1.7472449430374317, + "language_loss": 0.78489804, + "learning_rate": 2.3546374618321495e-07, + "loss": 0.80623364, + "num_input_tokens_seen": 304411195, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 14115, + "time_per_iteration": 2.4579203128814697 + }, + { + "auxiliary_loss_clip": 0.01101492, + "auxiliary_loss_mlp": 0.01031038, + "balance_loss_clip": 1.02001727, + "balance_loss_mlp": 1.03533506, + "epoch": 0.8486998346610551, + "flos": 19974772373760.0, + "grad_norm": 1.999613584500072, + "language_loss": 0.7920562, + "learning_rate": 2.3528044163328187e-07, + "loss": 0.81338149, + "num_input_tokens_seen": 304429425, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 14116, + "time_per_iteration": 2.429086446762085 + }, + { + "auxiliary_loss_clip": 0.01101969, + "auxiliary_loss_mlp": 0.01029409, + "balance_loss_clip": 1.01742232, + "balance_loss_mlp": 1.03365159, + "epoch": 0.8487599579137232, + "flos": 19792310261760.0, + "grad_norm": 1.7431190713062676, + "language_loss": 0.6832031, + "learning_rate": 2.3509720400169076e-07, + "loss": 0.70451689, + "num_input_tokens_seen": 304447460, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 14117, + "time_per_iteration": 2.450892448425293 + }, + { + "auxiliary_loss_clip": 0.01101464, + "auxiliary_loss_mlp": 0.01027274, + "balance_loss_clip": 1.01512623, + "balance_loss_mlp": 1.03329229, + "epoch": 0.8488200811663911, + "flos": 26396030903040.0, + "grad_norm": 2.202306499476065, + "language_loss": 0.64843965, + "learning_rate": 2.3491403329539096e-07, + "loss": 0.66972697, + "num_input_tokens_seen": 304468230, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 14118, + "time_per_iteration": 2.4827311038970947 + }, + { + "auxiliary_loss_clip": 0.01098527, + "auxiliary_loss_mlp": 0.01029479, + "balance_loss_clip": 1.01851785, + "balance_loss_mlp": 1.03415918, + "epoch": 0.8488802044190591, + "flos": 16359285939840.0, + "grad_norm": 1.5391821500879839, + "language_loss": 0.73291403, + "learning_rate": 2.3473092952132757e-07, + "loss": 0.75419414, + "num_input_tokens_seen": 304484860, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14119, + "time_per_iteration": 2.4333455562591553 + }, + { + "auxiliary_loss_clip": 0.01101713, + "auxiliary_loss_mlp": 0.01027791, + "balance_loss_clip": 1.01544058, + "balance_loss_mlp": 1.03465796, + "epoch": 0.848940327671727, + "flos": 19208869649280.0, + "grad_norm": 1.8345474405737905, + "language_loss": 0.7795918, + "learning_rate": 2.345478926864446e-07, + "loss": 0.80088687, + "num_input_tokens_seen": 304503575, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 14120, + "time_per_iteration": 2.438777446746826 + }, + { + "auxiliary_loss_clip": 0.01101394, + "auxiliary_loss_mlp": 0.01028536, + "balance_loss_clip": 1.01621592, + "balance_loss_mlp": 1.03509915, + "epoch": 0.849000450924395, + "flos": 21871178824320.0, + "grad_norm": 5.405751180834322, + "language_loss": 0.75322181, + "learning_rate": 2.3436492279768227e-07, + "loss": 0.77452111, + "num_input_tokens_seen": 304525005, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 14121, + "time_per_iteration": 2.4752485752105713 + }, + { + "auxiliary_loss_clip": 0.01022863, + "auxiliary_loss_mlp": 0.01000803, + "balance_loss_clip": 0.9997595, + "balance_loss_mlp": 1.00264168, + "epoch": 0.8490605741770629, + "flos": 71166475624320.0, + "grad_norm": 0.8037296020367628, + "language_loss": 0.60112953, + "learning_rate": 2.3418201986197883e-07, + "loss": 0.6213662, + "num_input_tokens_seen": 304585220, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 14122, + "time_per_iteration": 3.0530099868774414 + }, + { + "auxiliary_loss_clip": 0.01101962, + "auxiliary_loss_mlp": 0.01030351, + "balance_loss_clip": 1.01896024, + "balance_loss_mlp": 1.03606272, + "epoch": 0.849120697429731, + "flos": 24973357950720.0, + "grad_norm": 1.8836683151356197, + "language_loss": 0.79854351, + "learning_rate": 2.3399918388627048e-07, + "loss": 0.81986666, + "num_input_tokens_seen": 304604665, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 14123, + "time_per_iteration": 2.481076240539551 + }, + { + "auxiliary_loss_clip": 0.01097323, + "auxiliary_loss_mlp": 0.010265, + "balance_loss_clip": 1.01523471, + "balance_loss_mlp": 1.03398323, + "epoch": 0.8491808206823989, + "flos": 23032277959680.0, + "grad_norm": 2.239097832743251, + "language_loss": 0.83009315, + "learning_rate": 2.3381641487749016e-07, + "loss": 0.85133135, + "num_input_tokens_seen": 304620600, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 14124, + "time_per_iteration": 2.4361042976379395 + }, + { + "auxiliary_loss_clip": 0.01102837, + "auxiliary_loss_mlp": 0.01030836, + "balance_loss_clip": 1.01883113, + "balance_loss_mlp": 1.03712273, + "epoch": 0.8492409439350669, + "flos": 23878549365120.0, + "grad_norm": 1.9193916566663176, + "language_loss": 0.7154206, + "learning_rate": 2.3363371284256805e-07, + "loss": 0.7367574, + "num_input_tokens_seen": 304639540, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 14125, + "time_per_iteration": 2.4565751552581787 + }, + { + "auxiliary_loss_clip": 0.01104988, + "auxiliary_loss_mlp": 0.01034732, + "balance_loss_clip": 1.02196431, + "balance_loss_mlp": 1.03545964, + "epoch": 0.8493010671877349, + "flos": 22419893963520.0, + "grad_norm": 1.5814217786789184, + "language_loss": 0.73540419, + "learning_rate": 2.3345107778843288e-07, + "loss": 0.75680137, + "num_input_tokens_seen": 304660595, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 14126, + "time_per_iteration": 2.4653918743133545 + }, + { + "auxiliary_loss_clip": 0.01099143, + "auxiliary_loss_mlp": 0.01029819, + "balance_loss_clip": 1.01827955, + "balance_loss_mlp": 1.03460646, + "epoch": 0.8493611904404028, + "flos": 17529435302400.0, + "grad_norm": 1.447182138612943, + "language_loss": 0.67323148, + "learning_rate": 2.3326850972200928e-07, + "loss": 0.69452107, + "num_input_tokens_seen": 304679580, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14127, + "time_per_iteration": 2.4421885013580322 + }, + { + "auxiliary_loss_clip": 0.01102144, + "auxiliary_loss_mlp": 0.01026876, + "balance_loss_clip": 1.01496696, + "balance_loss_mlp": 1.03487849, + "epoch": 0.8494213136930708, + "flos": 19462937523840.0, + "grad_norm": 1.7486949630521547, + "language_loss": 0.69433224, + "learning_rate": 2.330860086502211e-07, + "loss": 0.71562243, + "num_input_tokens_seen": 304698385, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 14128, + "time_per_iteration": 2.447857618331909 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01029257, + "balance_loss_clip": 1.01758027, + "balance_loss_mlp": 1.03517187, + "epoch": 0.8494814369457387, + "flos": 18770292587520.0, + "grad_norm": 4.278782473141161, + "language_loss": 0.77867216, + "learning_rate": 2.3290357457998855e-07, + "loss": 0.79995894, + "num_input_tokens_seen": 304715430, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 14129, + "time_per_iteration": 2.444333076477051 + }, + { + "auxiliary_loss_clip": 0.01101533, + "auxiliary_loss_mlp": 0.01028606, + "balance_loss_clip": 1.01731718, + "balance_loss_mlp": 1.03556049, + "epoch": 0.8495415601984068, + "flos": 23331486251520.0, + "grad_norm": 2.0594648435116234, + "language_loss": 0.68019104, + "learning_rate": 2.3272120751823031e-07, + "loss": 0.70149243, + "num_input_tokens_seen": 304734345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 14130, + "time_per_iteration": 2.4651596546173096 + }, + { + "auxiliary_loss_clip": 0.01100363, + "auxiliary_loss_mlp": 0.01028314, + "balance_loss_clip": 1.01677465, + "balance_loss_mlp": 1.03450692, + "epoch": 0.8496016834510747, + "flos": 26612859352320.0, + "grad_norm": 2.677757108055573, + "language_loss": 0.70964313, + "learning_rate": 2.3253890747186e-07, + "loss": 0.73092985, + "num_input_tokens_seen": 304755030, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 14131, + "time_per_iteration": 2.4959685802459717 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01026281, + "balance_loss_clip": 1.0147171, + "balance_loss_mlp": 1.03148651, + "epoch": 0.8496618067037427, + "flos": 25480380378240.0, + "grad_norm": 1.9992354327212007, + "language_loss": 0.68285507, + "learning_rate": 2.3235667444779162e-07, + "loss": 0.70409632, + "num_input_tokens_seen": 304774320, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 14132, + "time_per_iteration": 2.4957363605499268 + }, + { + "auxiliary_loss_clip": 0.01096388, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02295697, + "balance_loss_mlp": 1.0323298, + "epoch": 0.8497219299564106, + "flos": 25374587846400.0, + "grad_norm": 1.648812223395765, + "language_loss": 0.70260388, + "learning_rate": 2.3217450845293564e-07, + "loss": 0.72390628, + "num_input_tokens_seen": 304795355, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 14133, + "time_per_iteration": 2.5129992961883545 + }, + { + "auxiliary_loss_clip": 0.01022484, + "auxiliary_loss_mlp": 0.01003624, + "balance_loss_clip": 1.00258112, + "balance_loss_mlp": 1.00221777, + "epoch": 0.8497820532090786, + "flos": 67780279658880.0, + "grad_norm": 0.7241616741405683, + "language_loss": 0.57608092, + "learning_rate": 2.3199240949419918e-07, + "loss": 0.59634197, + "num_input_tokens_seen": 304863915, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.203125, + "step": 14134, + "time_per_iteration": 3.173593282699585 + }, + { + "auxiliary_loss_clip": 0.01103223, + "auxiliary_loss_mlp": 0.01028585, + "balance_loss_clip": 1.01674151, + "balance_loss_mlp": 1.03537357, + "epoch": 0.8498421764617465, + "flos": 23440546920960.0, + "grad_norm": 1.97452600255058, + "language_loss": 0.78879797, + "learning_rate": 2.3181037757848787e-07, + "loss": 0.81011605, + "num_input_tokens_seen": 304881555, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 14135, + "time_per_iteration": 2.465132236480713 + }, + { + "auxiliary_loss_clip": 0.01101655, + "auxiliary_loss_mlp": 0.01031405, + "balance_loss_clip": 1.01925802, + "balance_loss_mlp": 1.03432846, + "epoch": 0.8499022997144146, + "flos": 17712615686400.0, + "grad_norm": 1.8051952520241694, + "language_loss": 0.63200223, + "learning_rate": 2.316284127127044e-07, + "loss": 0.65333283, + "num_input_tokens_seen": 304898760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 14136, + "time_per_iteration": 2.4003732204437256 + }, + { + "auxiliary_loss_clip": 0.01104726, + "auxiliary_loss_mlp": 0.0103221, + "balance_loss_clip": 1.01959145, + "balance_loss_mlp": 1.03692389, + "epoch": 0.8499624229670825, + "flos": 18588512833920.0, + "grad_norm": 1.8110537198368084, + "language_loss": 0.83839071, + "learning_rate": 2.3144651490374835e-07, + "loss": 0.85976005, + "num_input_tokens_seen": 304915465, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 14137, + "time_per_iteration": 3.834845542907715 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01026857, + "balance_loss_clip": 1.016343, + "balance_loss_mlp": 1.03378332, + "epoch": 0.8500225462197505, + "flos": 24345854328960.0, + "grad_norm": 2.01797350925578, + "language_loss": 0.78820533, + "learning_rate": 2.3126468415851773e-07, + "loss": 0.80944908, + "num_input_tokens_seen": 304933190, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 14138, + "time_per_iteration": 5.281948566436768 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01026275, + "balance_loss_clip": 1.01498008, + "balance_loss_mlp": 1.0361867, + "epoch": 0.8500826694724185, + "flos": 16545518979840.0, + "grad_norm": 1.7959695492088958, + "language_loss": 0.64545155, + "learning_rate": 2.310829204839073e-07, + "loss": 0.66673458, + "num_input_tokens_seen": 304951110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14139, + "time_per_iteration": 2.4514920711517334 + }, + { + "auxiliary_loss_clip": 0.01098312, + "auxiliary_loss_mlp": 0.01031602, + "balance_loss_clip": 1.02062881, + "balance_loss_mlp": 1.03338087, + "epoch": 0.8501427927250864, + "flos": 16289404030080.0, + "grad_norm": 1.4663932183968211, + "language_loss": 0.70549941, + "learning_rate": 2.3090122388681043e-07, + "loss": 0.72679853, + "num_input_tokens_seen": 304969095, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 14140, + "time_per_iteration": 2.456413745880127 + }, + { + "auxiliary_loss_clip": 0.01101673, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.0160358, + "balance_loss_mlp": 1.03334641, + "epoch": 0.8502029159777544, + "flos": 26687912820480.0, + "grad_norm": 3.118994023074249, + "language_loss": 0.64317191, + "learning_rate": 2.3071959437411648e-07, + "loss": 0.66446924, + "num_input_tokens_seen": 304989315, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 14141, + "time_per_iteration": 2.4837236404418945 + }, + { + "auxiliary_loss_clip": 0.01102087, + "auxiliary_loss_mlp": 0.01030129, + "balance_loss_clip": 1.01862574, + "balance_loss_mlp": 1.03575301, + "epoch": 0.8502630392304223, + "flos": 35590778179200.0, + "grad_norm": 1.485418549582861, + "language_loss": 0.7077021, + "learning_rate": 2.3053803195271214e-07, + "loss": 0.72902429, + "num_input_tokens_seen": 305011020, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 14142, + "time_per_iteration": 4.026219129562378 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.0102499, + "balance_loss_clip": 1.01399326, + "balance_loss_mlp": 1.03333116, + "epoch": 0.8503231624830904, + "flos": 21649466125440.0, + "grad_norm": 1.5471529006166378, + "language_loss": 0.65363872, + "learning_rate": 2.3035653662948375e-07, + "loss": 0.67488462, + "num_input_tokens_seen": 305033550, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 14143, + "time_per_iteration": 2.522714138031006 + }, + { + "auxiliary_loss_clip": 0.01103022, + "auxiliary_loss_mlp": 0.01031915, + "balance_loss_clip": 1.02001226, + "balance_loss_mlp": 1.03437936, + "epoch": 0.8503832857357583, + "flos": 22417451838720.0, + "grad_norm": 2.272347066205258, + "language_loss": 0.67796141, + "learning_rate": 2.3017510841131216e-07, + "loss": 0.69931078, + "num_input_tokens_seen": 305052885, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 14144, + "time_per_iteration": 2.464179277420044 + }, + { + "auxiliary_loss_clip": 0.01096846, + "auxiliary_loss_mlp": 0.01029953, + "balance_loss_clip": 1.01843166, + "balance_loss_mlp": 1.03374457, + "epoch": 0.8504434089884263, + "flos": 18697968552960.0, + "grad_norm": 2.0058736259336913, + "language_loss": 0.65126836, + "learning_rate": 2.299937473050777e-07, + "loss": 0.67253637, + "num_input_tokens_seen": 305071995, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14145, + "time_per_iteration": 2.4138495922088623 + }, + { + "auxiliary_loss_clip": 0.01099661, + "auxiliary_loss_mlp": 0.01031309, + "balance_loss_clip": 1.01923287, + "balance_loss_mlp": 1.03460836, + "epoch": 0.8505035322410942, + "flos": 20007989475840.0, + "grad_norm": 1.9741854800625371, + "language_loss": 0.85892701, + "learning_rate": 2.2981245331765842e-07, + "loss": 0.88023674, + "num_input_tokens_seen": 305090190, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 14146, + "time_per_iteration": 2.4394941329956055 + }, + { + "auxiliary_loss_clip": 0.01097854, + "auxiliary_loss_mlp": 0.01024442, + "balance_loss_clip": 1.01284277, + "balance_loss_mlp": 1.03269684, + "epoch": 0.8505636554937622, + "flos": 20812173120000.0, + "grad_norm": 4.7433845551584115, + "language_loss": 0.83587158, + "learning_rate": 2.2963122645592814e-07, + "loss": 0.85709453, + "num_input_tokens_seen": 305109355, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14147, + "time_per_iteration": 2.4312021732330322 + }, + { + "auxiliary_loss_clip": 0.01103794, + "auxiliary_loss_mlp": 0.01028362, + "balance_loss_clip": 1.01654828, + "balance_loss_mlp": 1.03492069, + "epoch": 0.8506237787464301, + "flos": 14174445277440.0, + "grad_norm": 2.524544245155263, + "language_loss": 0.85632455, + "learning_rate": 2.2945006672675894e-07, + "loss": 0.87764609, + "num_input_tokens_seen": 305124165, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 14148, + "time_per_iteration": 2.408759355545044 + }, + { + "auxiliary_loss_clip": 0.01099695, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.01789546, + "balance_loss_mlp": 1.03512621, + "epoch": 0.8506839019990982, + "flos": 23258372117760.0, + "grad_norm": 1.583969514237289, + "language_loss": 0.72040755, + "learning_rate": 2.292689741370204e-07, + "loss": 0.74170214, + "num_input_tokens_seen": 305143940, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 14149, + "time_per_iteration": 2.4647998809814453 + }, + { + "auxiliary_loss_clip": 0.01101741, + "auxiliary_loss_mlp": 0.01025822, + "balance_loss_clip": 1.01436639, + "balance_loss_mlp": 1.03563142, + "epoch": 0.8507440252517661, + "flos": 23659206963840.0, + "grad_norm": 1.7435264384090372, + "language_loss": 0.76055348, + "learning_rate": 2.290879486935804e-07, + "loss": 0.78182906, + "num_input_tokens_seen": 305163505, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14150, + "time_per_iteration": 2.4704911708831787 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01030406, + "balance_loss_clip": 1.01903915, + "balance_loss_mlp": 1.03694618, + "epoch": 0.8508041485044341, + "flos": 18661339658880.0, + "grad_norm": 1.6165351455884314, + "language_loss": 0.72317696, + "learning_rate": 2.2890699040330231e-07, + "loss": 0.74448776, + "num_input_tokens_seen": 305182325, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 14151, + "time_per_iteration": 2.4117584228515625 + }, + { + "auxiliary_loss_clip": 0.01022519, + "auxiliary_loss_mlp": 0.01005156, + "balance_loss_clip": 1.00400531, + "balance_loss_mlp": 1.00236225, + "epoch": 0.8508642717571021, + "flos": 52510918055040.0, + "grad_norm": 0.8888130234597027, + "language_loss": 0.59599686, + "learning_rate": 2.2872609927304909e-07, + "loss": 0.61627358, + "num_input_tokens_seen": 305230775, + "router_z_loss_clip": 0.01147461, + "router_z_loss_mlp": 0.20117188, + "step": 14152, + "time_per_iteration": 2.8257334232330322 + }, + { + "auxiliary_loss_clip": 0.01022311, + "auxiliary_loss_mlp": 0.01002793, + "balance_loss_clip": 1.00180936, + "balance_loss_mlp": 1.00222039, + "epoch": 0.85092439500977, + "flos": 69297145050240.0, + "grad_norm": 0.694800012848997, + "language_loss": 0.61128682, + "learning_rate": 2.285452753096797e-07, + "loss": 0.63153785, + "num_input_tokens_seen": 305296000, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 14153, + "time_per_iteration": 3.0687737464904785 + }, + { + "auxiliary_loss_clip": 0.01101332, + "auxiliary_loss_mlp": 0.01028277, + "balance_loss_clip": 1.01596856, + "balance_loss_mlp": 1.03580403, + "epoch": 0.850984518262438, + "flos": 24389737770240.0, + "grad_norm": 1.6380875980431746, + "language_loss": 0.80774456, + "learning_rate": 2.2836451852005067e-07, + "loss": 0.82904065, + "num_input_tokens_seen": 305314705, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 14154, + "time_per_iteration": 2.4598207473754883 + }, + { + "auxiliary_loss_clip": 0.01092813, + "auxiliary_loss_mlp": 0.01029637, + "balance_loss_clip": 1.01902699, + "balance_loss_mlp": 1.03123856, + "epoch": 0.851044641515106, + "flos": 23294821443840.0, + "grad_norm": 1.6273243802969442, + "language_loss": 0.79549897, + "learning_rate": 2.281838289110165e-07, + "loss": 0.81672347, + "num_input_tokens_seen": 305333870, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 14155, + "time_per_iteration": 2.472735643386841 + }, + { + "auxiliary_loss_clip": 0.01100526, + "auxiliary_loss_mlp": 0.01028093, + "balance_loss_clip": 1.01641667, + "balance_loss_mlp": 1.03228521, + "epoch": 0.851104764767774, + "flos": 22050085489920.0, + "grad_norm": 1.6278681013298135, + "language_loss": 0.70760596, + "learning_rate": 2.2800320648942904e-07, + "loss": 0.72889221, + "num_input_tokens_seen": 305352780, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.68359375, + "step": 14156, + "time_per_iteration": 2.4720516204833984 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01030433, + "balance_loss_clip": 1.01863134, + "balance_loss_mlp": 1.03398883, + "epoch": 0.8511648880204419, + "flos": 20704728562560.0, + "grad_norm": 1.945584806630902, + "language_loss": 0.73951316, + "learning_rate": 2.278226512621386e-07, + "loss": 0.76078814, + "num_input_tokens_seen": 305371370, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 14157, + "time_per_iteration": 2.445727586746216 + }, + { + "auxiliary_loss_clip": 0.01096578, + "auxiliary_loss_mlp": 0.01023751, + "balance_loss_clip": 1.01308143, + "balance_loss_mlp": 1.03393173, + "epoch": 0.8512250112731099, + "flos": 24024669891840.0, + "grad_norm": 2.049881321854855, + "language_loss": 0.79299182, + "learning_rate": 2.2764216323598995e-07, + "loss": 0.8141951, + "num_input_tokens_seen": 305387955, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 14158, + "time_per_iteration": 2.4651825428009033 + }, + { + "auxiliary_loss_clip": 0.01100355, + "auxiliary_loss_mlp": 0.01030404, + "balance_loss_clip": 1.01841724, + "balance_loss_mlp": 1.03510904, + "epoch": 0.8512851345257778, + "flos": 22015467757440.0, + "grad_norm": 2.0779394687943, + "language_loss": 0.7930764, + "learning_rate": 2.27461742417828e-07, + "loss": 0.81438398, + "num_input_tokens_seen": 305406285, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 14159, + "time_per_iteration": 2.430978775024414 + }, + { + "auxiliary_loss_clip": 0.01101719, + "auxiliary_loss_mlp": 0.01033534, + "balance_loss_clip": 1.02200091, + "balance_loss_mlp": 1.03555334, + "epoch": 0.8513452577784458, + "flos": 14830209924480.0, + "grad_norm": 2.5681034640558433, + "language_loss": 0.71410954, + "learning_rate": 2.2728138881449488e-07, + "loss": 0.73546207, + "num_input_tokens_seen": 305424500, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 14160, + "time_per_iteration": 2.4289729595184326 + }, + { + "auxiliary_loss_clip": 0.01106194, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.0195365, + "balance_loss_mlp": 1.03638792, + "epoch": 0.8514053810311137, + "flos": 33035662166400.0, + "grad_norm": 1.9043333827354807, + "language_loss": 0.70242059, + "learning_rate": 2.2710110243282866e-07, + "loss": 0.72380352, + "num_input_tokens_seen": 305442990, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 14161, + "time_per_iteration": 2.5416407585144043 + }, + { + "auxiliary_loss_clip": 0.01099976, + "auxiliary_loss_mlp": 0.01030032, + "balance_loss_clip": 1.01886177, + "balance_loss_mlp": 1.03187084, + "epoch": 0.8514655042837818, + "flos": 27564456412800.0, + "grad_norm": 2.9260794515175017, + "language_loss": 0.78138113, + "learning_rate": 2.2692088327966653e-07, + "loss": 0.80268127, + "num_input_tokens_seen": 305463065, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 14162, + "time_per_iteration": 2.5057663917541504 + }, + { + "auxiliary_loss_clip": 0.01100522, + "auxiliary_loss_mlp": 0.01032359, + "balance_loss_clip": 1.02059305, + "balance_loss_mlp": 1.03527033, + "epoch": 0.8515256275364497, + "flos": 35556052705920.0, + "grad_norm": 2.7045213694102292, + "language_loss": 0.76977819, + "learning_rate": 2.2674073136184235e-07, + "loss": 0.79110706, + "num_input_tokens_seen": 305489070, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 14163, + "time_per_iteration": 2.576014280319214 + }, + { + "auxiliary_loss_clip": 0.01022488, + "auxiliary_loss_mlp": 0.01004361, + "balance_loss_clip": 1.00332379, + "balance_loss_mlp": 1.00239372, + "epoch": 0.8515857507891177, + "flos": 70207372621440.0, + "grad_norm": 0.6917600034361476, + "language_loss": 0.55013472, + "learning_rate": 2.2656064668618735e-07, + "loss": 0.57040328, + "num_input_tokens_seen": 305551490, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20117188, + "step": 14164, + "time_per_iteration": 3.0798745155334473 + }, + { + "auxiliary_loss_clip": 0.0109938, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.02049756, + "balance_loss_mlp": 1.03448367, + "epoch": 0.8516458740417857, + "flos": 22675290641280.0, + "grad_norm": 1.8565116591235626, + "language_loss": 0.72916138, + "learning_rate": 2.2638062925953005e-07, + "loss": 0.75047463, + "num_input_tokens_seen": 305570535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14165, + "time_per_iteration": 2.4583141803741455 + }, + { + "auxiliary_loss_clip": 0.01096948, + "auxiliary_loss_mlp": 0.01028375, + "balance_loss_clip": 1.01683545, + "balance_loss_mlp": 1.03286302, + "epoch": 0.8517059972944536, + "flos": 22747435107840.0, + "grad_norm": 1.53357605773611, + "language_loss": 0.67339641, + "learning_rate": 2.26200679088697e-07, + "loss": 0.69464964, + "num_input_tokens_seen": 305590800, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14166, + "time_per_iteration": 2.4731404781341553 + }, + { + "auxiliary_loss_clip": 0.01098945, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01718283, + "balance_loss_mlp": 1.03358221, + "epoch": 0.8517661205471216, + "flos": 21689147675520.0, + "grad_norm": 2.1743229619980284, + "language_loss": 0.73408175, + "learning_rate": 2.260207961805125e-07, + "loss": 0.75535411, + "num_input_tokens_seen": 305609495, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14167, + "time_per_iteration": 2.424105167388916 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01027185, + "balance_loss_clip": 1.01569331, + "balance_loss_mlp": 1.03490484, + "epoch": 0.8518262437997896, + "flos": 25374839241600.0, + "grad_norm": 1.5567552814604415, + "language_loss": 0.80538321, + "learning_rate": 2.258409805417969e-07, + "loss": 0.82665563, + "num_input_tokens_seen": 305629420, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 14168, + "time_per_iteration": 2.501282215118408 + }, + { + "auxiliary_loss_clip": 0.01098651, + "auxiliary_loss_mlp": 0.01025085, + "balance_loss_clip": 1.01371276, + "balance_loss_mlp": 1.03329349, + "epoch": 0.8518863670524576, + "flos": 27235406897280.0, + "grad_norm": 1.8118683841885685, + "language_loss": 0.76072329, + "learning_rate": 2.2566123217936893e-07, + "loss": 0.78196067, + "num_input_tokens_seen": 305649835, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14169, + "time_per_iteration": 2.466012716293335 + }, + { + "auxiliary_loss_clip": 0.01103905, + "auxiliary_loss_mlp": 0.01031734, + "balance_loss_clip": 1.01977158, + "balance_loss_mlp": 1.03580987, + "epoch": 0.8519464903051255, + "flos": 20959514709120.0, + "grad_norm": 2.198130292070983, + "language_loss": 0.63613892, + "learning_rate": 2.254815511000452e-07, + "loss": 0.65749532, + "num_input_tokens_seen": 305668840, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 14170, + "time_per_iteration": 2.4390439987182617 + }, + { + "auxiliary_loss_clip": 0.01096568, + "auxiliary_loss_mlp": 0.01025636, + "balance_loss_clip": 1.01424527, + "balance_loss_mlp": 1.03146672, + "epoch": 0.8520066135577935, + "flos": 18441745862400.0, + "grad_norm": 2.336302348660875, + "language_loss": 0.86398733, + "learning_rate": 2.253019373106384e-07, + "loss": 0.88520932, + "num_input_tokens_seen": 305686955, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 14171, + "time_per_iteration": 2.40663480758667 + }, + { + "auxiliary_loss_clip": 0.01101227, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02092147, + "balance_loss_mlp": 1.03520513, + "epoch": 0.8520667368104614, + "flos": 29130233149440.0, + "grad_norm": 1.8512649886443278, + "language_loss": 0.5462482, + "learning_rate": 2.2512239081796003e-07, + "loss": 0.56758368, + "num_input_tokens_seen": 305706290, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14172, + "time_per_iteration": 2.4998886585235596 + }, + { + "auxiliary_loss_clip": 0.01095976, + "auxiliary_loss_mlp": 0.01025638, + "balance_loss_clip": 1.01607168, + "balance_loss_mlp": 1.03305733, + "epoch": 0.8521268600631294, + "flos": 16034366488320.0, + "grad_norm": 2.086050566493409, + "language_loss": 0.69540936, + "learning_rate": 2.2494291162881862e-07, + "loss": 0.71662551, + "num_input_tokens_seen": 305723835, + "router_z_loss_clip": 0.09570312, + "router_z_loss_mlp": 0.62890625, + "step": 14173, + "time_per_iteration": 2.4107959270477295 + }, + { + "auxiliary_loss_clip": 0.01100817, + "auxiliary_loss_mlp": 0.01028789, + "balance_loss_clip": 1.01621222, + "balance_loss_mlp": 1.0341866, + "epoch": 0.8521869833157973, + "flos": 22454870832000.0, + "grad_norm": 2.2445717873488027, + "language_loss": 0.77038109, + "learning_rate": 2.247634997500205e-07, + "loss": 0.79167712, + "num_input_tokens_seen": 305741655, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14174, + "time_per_iteration": 2.4528019428253174 + }, + { + "auxiliary_loss_clip": 0.0110205, + "auxiliary_loss_mlp": 0.01028815, + "balance_loss_clip": 1.01738858, + "balance_loss_mlp": 1.03537321, + "epoch": 0.8522471065684654, + "flos": 24972029147520.0, + "grad_norm": 1.531960767298018, + "language_loss": 0.81722677, + "learning_rate": 2.245841551883676e-07, + "loss": 0.83853537, + "num_input_tokens_seen": 305761890, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 14175, + "time_per_iteration": 2.4613003730773926 + }, + { + "auxiliary_loss_clip": 0.01103945, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.01919007, + "balance_loss_mlp": 1.03601801, + "epoch": 0.8523072298211333, + "flos": 17710604524800.0, + "grad_norm": 2.6143135080090913, + "language_loss": 0.65842164, + "learning_rate": 2.2440487795066153e-07, + "loss": 0.67977381, + "num_input_tokens_seen": 305779190, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 14176, + "time_per_iteration": 2.4280595779418945 + }, + { + "auxiliary_loss_clip": 0.01098874, + "auxiliary_loss_mlp": 0.01028131, + "balance_loss_clip": 1.01610303, + "balance_loss_mlp": 1.03441358, + "epoch": 0.8523673530738013, + "flos": 25446193608960.0, + "grad_norm": 1.5794340083453389, + "language_loss": 0.78320289, + "learning_rate": 2.2422566804370068e-07, + "loss": 0.80447292, + "num_input_tokens_seen": 305799870, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 14177, + "time_per_iteration": 2.4813055992126465 + }, + { + "auxiliary_loss_clip": 0.01100784, + "auxiliary_loss_mlp": 0.01029671, + "balance_loss_clip": 1.01731539, + "balance_loss_mlp": 1.03469777, + "epoch": 0.8524274763264693, + "flos": 31429593348480.0, + "grad_norm": 1.6547287833476916, + "language_loss": 0.73443151, + "learning_rate": 2.2404652547428026e-07, + "loss": 0.75573605, + "num_input_tokens_seen": 305819695, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66015625, + "step": 14178, + "time_per_iteration": 3.9108073711395264 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.0103325, + "balance_loss_clip": 1.0220679, + "balance_loss_mlp": 1.03550065, + "epoch": 0.8524875995791372, + "flos": 17712651600000.0, + "grad_norm": 1.8377191267924193, + "language_loss": 0.74717975, + "learning_rate": 2.238674502491935e-07, + "loss": 0.76852548, + "num_input_tokens_seen": 305837270, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14179, + "time_per_iteration": 2.4225523471832275 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.01025724, + "balance_loss_clip": 1.01433325, + "balance_loss_mlp": 1.03464127, + "epoch": 0.8525477228318052, + "flos": 21687316081920.0, + "grad_norm": 2.460422297621149, + "language_loss": 0.81496072, + "learning_rate": 2.2368844237523165e-07, + "loss": 0.83620566, + "num_input_tokens_seen": 305855250, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14180, + "time_per_iteration": 5.2744059562683105 + }, + { + "auxiliary_loss_clip": 0.01100192, + "auxiliary_loss_mlp": 0.01030938, + "balance_loss_clip": 1.02003634, + "balance_loss_mlp": 1.03413081, + "epoch": 0.8526078460844732, + "flos": 24827057856000.0, + "grad_norm": 2.369494147996583, + "language_loss": 0.61639541, + "learning_rate": 2.235095018591815e-07, + "loss": 0.63770676, + "num_input_tokens_seen": 305875660, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 14181, + "time_per_iteration": 2.4872968196868896 + }, + { + "auxiliary_loss_clip": 0.01098397, + "auxiliary_loss_mlp": 0.0103076, + "balance_loss_clip": 1.01989412, + "balance_loss_mlp": 1.03492475, + "epoch": 0.8526679693371412, + "flos": 13516418073600.0, + "grad_norm": 2.119628818212838, + "language_loss": 0.72303843, + "learning_rate": 2.2333062870782894e-07, + "loss": 0.74433005, + "num_input_tokens_seen": 305892415, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 14182, + "time_per_iteration": 2.405911922454834 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01031468, + "balance_loss_clip": 1.01984537, + "balance_loss_mlp": 1.03547144, + "epoch": 0.8527280925898091, + "flos": 23514092017920.0, + "grad_norm": 1.877933371145743, + "language_loss": 0.70888335, + "learning_rate": 2.2315182292795697e-07, + "loss": 0.73018968, + "num_input_tokens_seen": 305912665, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 14183, + "time_per_iteration": 2.465843439102173 + }, + { + "auxiliary_loss_clip": 0.01099389, + "auxiliary_loss_mlp": 0.0102998, + "balance_loss_clip": 1.01894093, + "balance_loss_mlp": 1.03608322, + "epoch": 0.8527882158424771, + "flos": 20303031790080.0, + "grad_norm": 2.003256962721328, + "language_loss": 0.72409725, + "learning_rate": 2.2297308452634644e-07, + "loss": 0.74539095, + "num_input_tokens_seen": 305931515, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14184, + "time_per_iteration": 3.8862593173980713 + }, + { + "auxiliary_loss_clip": 0.01101013, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.020082, + "balance_loss_mlp": 1.03550458, + "epoch": 0.852848339095145, + "flos": 17202504689280.0, + "grad_norm": 1.6403093384552982, + "language_loss": 0.76668632, + "learning_rate": 2.2279441350977457e-07, + "loss": 0.78801394, + "num_input_tokens_seen": 305949965, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14185, + "time_per_iteration": 2.43679141998291 + }, + { + "auxiliary_loss_clip": 0.01100786, + "auxiliary_loss_mlp": 0.01025879, + "balance_loss_clip": 1.01407111, + "balance_loss_mlp": 1.0342108, + "epoch": 0.852908462347813, + "flos": 18368990864640.0, + "grad_norm": 1.7633313244076745, + "language_loss": 0.79761022, + "learning_rate": 2.2261580988501637e-07, + "loss": 0.81887686, + "num_input_tokens_seen": 305967820, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14186, + "time_per_iteration": 2.425837755203247 + }, + { + "auxiliary_loss_clip": 0.01098762, + "auxiliary_loss_mlp": 0.01027369, + "balance_loss_clip": 1.01503086, + "balance_loss_mlp": 1.03246689, + "epoch": 0.8529685856004809, + "flos": 18624890332800.0, + "grad_norm": 1.6649429350978724, + "language_loss": 0.62752771, + "learning_rate": 2.224372736588449e-07, + "loss": 0.64878899, + "num_input_tokens_seen": 305985505, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 14187, + "time_per_iteration": 2.4106929302215576 + }, + { + "auxiliary_loss_clip": 0.01101676, + "auxiliary_loss_mlp": 0.01028, + "balance_loss_clip": 1.01542985, + "balance_loss_mlp": 1.03296733, + "epoch": 0.853028708853149, + "flos": 29607665748480.0, + "grad_norm": 1.9637586597140755, + "language_loss": 0.7628786, + "learning_rate": 2.2225880483803005e-07, + "loss": 0.7841754, + "num_input_tokens_seen": 306005220, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 14188, + "time_per_iteration": 2.51119327545166 + }, + { + "auxiliary_loss_clip": 0.01101265, + "auxiliary_loss_mlp": 0.0102843, + "balance_loss_clip": 1.015764, + "balance_loss_mlp": 1.03399968, + "epoch": 0.8530888321058169, + "flos": 26353153042560.0, + "grad_norm": 1.568613881178684, + "language_loss": 0.78370714, + "learning_rate": 2.2208040342933932e-07, + "loss": 0.805004, + "num_input_tokens_seen": 306023785, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 14189, + "time_per_iteration": 2.467890739440918 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.01031251, + "balance_loss_clip": 1.0192889, + "balance_loss_mlp": 1.03368163, + "epoch": 0.8531489553584849, + "flos": 20521979141760.0, + "grad_norm": 1.8423482276123486, + "language_loss": 0.79671139, + "learning_rate": 2.2190206943953793e-07, + "loss": 0.81802857, + "num_input_tokens_seen": 306041600, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 14190, + "time_per_iteration": 2.426774263381958 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01033211, + "balance_loss_clip": 1.02108765, + "balance_loss_mlp": 1.03475523, + "epoch": 0.8532090786111529, + "flos": 20704297599360.0, + "grad_norm": 2.2393724567806537, + "language_loss": 0.76187646, + "learning_rate": 2.2172380287538894e-07, + "loss": 0.78320825, + "num_input_tokens_seen": 306060345, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 14191, + "time_per_iteration": 2.493112802505493 + }, + { + "auxiliary_loss_clip": 0.01099669, + "auxiliary_loss_mlp": 0.01027383, + "balance_loss_clip": 1.01559925, + "balance_loss_mlp": 1.03472078, + "epoch": 0.8532692018638208, + "flos": 19828903242240.0, + "grad_norm": 1.9447286262265506, + "language_loss": 0.6872884, + "learning_rate": 2.2154560374365073e-07, + "loss": 0.70855892, + "num_input_tokens_seen": 306078285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 14192, + "time_per_iteration": 2.460580825805664 + }, + { + "auxiliary_loss_clip": 0.01105512, + "auxiliary_loss_mlp": 0.01034778, + "balance_loss_clip": 1.0211103, + "balance_loss_mlp": 1.03487968, + "epoch": 0.8533293251164888, + "flos": 20996790048000.0, + "grad_norm": 2.13238273698423, + "language_loss": 0.62750936, + "learning_rate": 2.2136747205108164e-07, + "loss": 0.64891225, + "num_input_tokens_seen": 306093760, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.70703125, + "step": 14193, + "time_per_iteration": 2.422989845275879 + }, + { + "auxiliary_loss_clip": 0.01099517, + "auxiliary_loss_mlp": 0.0102952, + "balance_loss_clip": 1.0180825, + "balance_loss_mlp": 1.03407574, + "epoch": 0.8533894483691568, + "flos": 22419606654720.0, + "grad_norm": 2.376253563391597, + "language_loss": 0.7662459, + "learning_rate": 2.211894078044365e-07, + "loss": 0.78753626, + "num_input_tokens_seen": 306112595, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14194, + "time_per_iteration": 2.4911811351776123 + }, + { + "auxiliary_loss_clip": 0.01099442, + "auxiliary_loss_mlp": 0.01027223, + "balance_loss_clip": 1.01629698, + "balance_loss_mlp": 1.03374958, + "epoch": 0.8534495716218248, + "flos": 21616536332160.0, + "grad_norm": 2.870797059633662, + "language_loss": 0.69526476, + "learning_rate": 2.2101141101046705e-07, + "loss": 0.7165314, + "num_input_tokens_seen": 306131800, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14195, + "time_per_iteration": 2.43084454536438 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.01892877, + "balance_loss_mlp": 1.03276181, + "epoch": 0.8535096948744927, + "flos": 22346277039360.0, + "grad_norm": 1.951192155992327, + "language_loss": 0.8569665, + "learning_rate": 2.2083348167592343e-07, + "loss": 0.87827611, + "num_input_tokens_seen": 306150590, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 14196, + "time_per_iteration": 2.450251340866089 + }, + { + "auxiliary_loss_clip": 0.01022416, + "auxiliary_loss_mlp": 0.01004449, + "balance_loss_clip": 1.00341821, + "balance_loss_mlp": 1.00223291, + "epoch": 0.8535698181271607, + "flos": 52762507891200.0, + "grad_norm": 0.7740707034591511, + "language_loss": 0.55144757, + "learning_rate": 2.2065561980755243e-07, + "loss": 0.57171625, + "num_input_tokens_seen": 306205850, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20214844, + "step": 14197, + "time_per_iteration": 3.005002975463867 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01032154, + "balance_loss_clip": 1.02065611, + "balance_loss_mlp": 1.03312826, + "epoch": 0.8536299413798286, + "flos": 19062892776960.0, + "grad_norm": 1.515492209214701, + "language_loss": 0.81483853, + "learning_rate": 2.2047782541209826e-07, + "loss": 0.83612299, + "num_input_tokens_seen": 306225220, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14198, + "time_per_iteration": 2.4649815559387207 + }, + { + "auxiliary_loss_clip": 0.01099001, + "auxiliary_loss_mlp": 0.01026718, + "balance_loss_clip": 1.01638854, + "balance_loss_mlp": 1.03425956, + "epoch": 0.8536900646324966, + "flos": 49344743871360.0, + "grad_norm": 1.4080909116344482, + "language_loss": 0.68194431, + "learning_rate": 2.203000984963035e-07, + "loss": 0.70320153, + "num_input_tokens_seen": 306249865, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 14199, + "time_per_iteration": 2.75403094291687 + }, + { + "auxiliary_loss_clip": 0.01094508, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.0175674, + "balance_loss_mlp": 1.03173518, + "epoch": 0.8537501878851645, + "flos": 21762333636480.0, + "grad_norm": 1.6037725404598313, + "language_loss": 0.86364204, + "learning_rate": 2.201224390669072e-07, + "loss": 0.88487208, + "num_input_tokens_seen": 306270215, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 14200, + "time_per_iteration": 2.4806206226348877 + }, + { + "auxiliary_loss_clip": 0.01099065, + "auxiliary_loss_mlp": 0.01026575, + "balance_loss_clip": 1.0155009, + "balance_loss_mlp": 1.03298926, + "epoch": 0.8538103111378326, + "flos": 22269176496000.0, + "grad_norm": 1.7777053516959667, + "language_loss": 0.77743292, + "learning_rate": 2.1994484713064666e-07, + "loss": 0.79868931, + "num_input_tokens_seen": 306288960, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 14201, + "time_per_iteration": 2.4462931156158447 + }, + { + "auxiliary_loss_clip": 0.01097721, + "auxiliary_loss_mlp": 0.01026351, + "balance_loss_clip": 1.01518703, + "balance_loss_mlp": 1.0337944, + "epoch": 0.8538704343905005, + "flos": 20303929630080.0, + "grad_norm": 1.7109350279161786, + "language_loss": 0.68886614, + "learning_rate": 2.19767322694256e-07, + "loss": 0.71010685, + "num_input_tokens_seen": 306308735, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 14202, + "time_per_iteration": 2.463541030883789 + }, + { + "auxiliary_loss_clip": 0.01099825, + "auxiliary_loss_mlp": 0.01032542, + "balance_loss_clip": 1.02116919, + "balance_loss_mlp": 1.03426242, + "epoch": 0.8539305576431685, + "flos": 24755164784640.0, + "grad_norm": 1.9535951934436555, + "language_loss": 0.80181468, + "learning_rate": 2.195898657644666e-07, + "loss": 0.82313836, + "num_input_tokens_seen": 306329015, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 14203, + "time_per_iteration": 2.5205531120300293 + }, + { + "auxiliary_loss_clip": 0.01101653, + "auxiliary_loss_mlp": 0.01032256, + "balance_loss_clip": 1.01982856, + "balance_loss_mlp": 1.03427434, + "epoch": 0.8539906808958365, + "flos": 26687625511680.0, + "grad_norm": 1.984669795518607, + "language_loss": 0.65570819, + "learning_rate": 2.1941247634800808e-07, + "loss": 0.67704731, + "num_input_tokens_seen": 306349085, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 14204, + "time_per_iteration": 2.4955055713653564 + }, + { + "auxiliary_loss_clip": 0.01101005, + "auxiliary_loss_mlp": 0.01032864, + "balance_loss_clip": 1.02073479, + "balance_loss_mlp": 1.03411329, + "epoch": 0.8540508041485044, + "flos": 13365521038080.0, + "grad_norm": 2.2503266351181885, + "language_loss": 0.59924453, + "learning_rate": 2.1923515445160667e-07, + "loss": 0.62058318, + "num_input_tokens_seen": 306365385, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14205, + "time_per_iteration": 2.451064348220825 + }, + { + "auxiliary_loss_clip": 0.0109883, + "auxiliary_loss_mlp": 0.01026041, + "balance_loss_clip": 1.01386976, + "balance_loss_mlp": 1.03404224, + "epoch": 0.8541109274011724, + "flos": 32780876019840.0, + "grad_norm": 2.00271599179622, + "language_loss": 0.72058553, + "learning_rate": 2.1905790008198655e-07, + "loss": 0.74183416, + "num_input_tokens_seen": 306384585, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6484375, + "step": 14206, + "time_per_iteration": 2.691939115524292 + }, + { + "auxiliary_loss_clip": 0.01102004, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.01611567, + "balance_loss_mlp": 1.03535128, + "epoch": 0.8541710506538404, + "flos": 17639286071040.0, + "grad_norm": 3.093713921060051, + "language_loss": 0.76876032, + "learning_rate": 2.1888071324586987e-07, + "loss": 0.79005724, + "num_input_tokens_seen": 306401565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 14207, + "time_per_iteration": 2.4235453605651855 + }, + { + "auxiliary_loss_clip": 0.01101035, + "auxiliary_loss_mlp": 0.01030298, + "balance_loss_clip": 1.0175786, + "balance_loss_mlp": 1.03433371, + "epoch": 0.8542311739065084, + "flos": 20263062931200.0, + "grad_norm": 1.6417850294728733, + "language_loss": 0.85100585, + "learning_rate": 2.1870359394997485e-07, + "loss": 0.87231922, + "num_input_tokens_seen": 306419995, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 14208, + "time_per_iteration": 2.4407296180725098 + }, + { + "auxiliary_loss_clip": 0.01101124, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02127552, + "balance_loss_mlp": 1.03569424, + "epoch": 0.8542912971591763, + "flos": 17785657992960.0, + "grad_norm": 1.4610514285871214, + "language_loss": 0.65849692, + "learning_rate": 2.1852654220101785e-07, + "loss": 0.67983097, + "num_input_tokens_seen": 306439240, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 14209, + "time_per_iteration": 2.418771266937256 + }, + { + "auxiliary_loss_clip": 0.01099443, + "auxiliary_loss_mlp": 0.01026049, + "balance_loss_clip": 1.01455092, + "balance_loss_mlp": 1.03474426, + "epoch": 0.8543514204118443, + "flos": 26979507429120.0, + "grad_norm": 2.120429439865478, + "language_loss": 0.70436859, + "learning_rate": 2.1834955800571287e-07, + "loss": 0.72562349, + "num_input_tokens_seen": 306458425, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14210, + "time_per_iteration": 2.474961996078491 + }, + { + "auxiliary_loss_clip": 0.01098268, + "auxiliary_loss_mlp": 0.01030207, + "balance_loss_clip": 1.01849508, + "balance_loss_mlp": 1.03316927, + "epoch": 0.8544115436645122, + "flos": 24024598064640.0, + "grad_norm": 1.342038230302634, + "language_loss": 0.70265722, + "learning_rate": 2.1817264137077141e-07, + "loss": 0.72394198, + "num_input_tokens_seen": 306477210, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14211, + "time_per_iteration": 2.4852607250213623 + }, + { + "auxiliary_loss_clip": 0.01100811, + "auxiliary_loss_mlp": 0.01028983, + "balance_loss_clip": 1.01696706, + "balance_loss_mlp": 1.03383327, + "epoch": 0.8544716669171802, + "flos": 16617986668800.0, + "grad_norm": 6.036072710383437, + "language_loss": 0.811239, + "learning_rate": 2.1799579230290166e-07, + "loss": 0.83253694, + "num_input_tokens_seen": 306495820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14212, + "time_per_iteration": 2.428615093231201 + }, + { + "auxiliary_loss_clip": 0.01100834, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.02034807, + "balance_loss_mlp": 1.03400826, + "epoch": 0.8545317901698481, + "flos": 40005779489280.0, + "grad_norm": 2.153668770335833, + "language_loss": 0.66985464, + "learning_rate": 2.178190108088105e-07, + "loss": 0.69119686, + "num_input_tokens_seen": 306516420, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6640625, + "step": 14213, + "time_per_iteration": 2.5968360900878906 + }, + { + "auxiliary_loss_clip": 0.01098117, + "auxiliary_loss_mlp": 0.0102624, + "balance_loss_clip": 1.01437306, + "balance_loss_mlp": 1.0335108, + "epoch": 0.8545919134225162, + "flos": 19902520166400.0, + "grad_norm": 1.5860576713559527, + "language_loss": 0.78203142, + "learning_rate": 2.1764229689520098e-07, + "loss": 0.80327499, + "num_input_tokens_seen": 306534785, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 14214, + "time_per_iteration": 2.437434196472168 + }, + { + "auxiliary_loss_clip": 0.01103195, + "auxiliary_loss_mlp": 0.01028084, + "balance_loss_clip": 1.01534665, + "balance_loss_mlp": 1.0336858, + "epoch": 0.8546520366751841, + "flos": 18952970181120.0, + "grad_norm": 2.3601072733051764, + "language_loss": 0.66634488, + "learning_rate": 2.1746565056877397e-07, + "loss": 0.68765759, + "num_input_tokens_seen": 306552440, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6953125, + "step": 14215, + "time_per_iteration": 2.4206948280334473 + }, + { + "auxiliary_loss_clip": 0.01100459, + "auxiliary_loss_mlp": 0.01026262, + "balance_loss_clip": 1.01484776, + "balance_loss_mlp": 1.03546536, + "epoch": 0.8547121599278521, + "flos": 35621445415680.0, + "grad_norm": 1.6911842377529425, + "language_loss": 0.62753046, + "learning_rate": 2.172890718362279e-07, + "loss": 0.64879763, + "num_input_tokens_seen": 306573600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14216, + "time_per_iteration": 2.573880434036255 + }, + { + "auxiliary_loss_clip": 0.01100323, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01847267, + "balance_loss_mlp": 1.0334928, + "epoch": 0.8547722831805201, + "flos": 16910048154240.0, + "grad_norm": 1.9929110810003072, + "language_loss": 0.65539861, + "learning_rate": 2.17112560704259e-07, + "loss": 0.67669904, + "num_input_tokens_seen": 306592840, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66796875, + "step": 14217, + "time_per_iteration": 2.4209792613983154 + }, + { + "auxiliary_loss_clip": 0.01098009, + "auxiliary_loss_mlp": 0.010284, + "balance_loss_clip": 1.01740897, + "balance_loss_mlp": 1.03479838, + "epoch": 0.854832406433188, + "flos": 23002616304000.0, + "grad_norm": 2.1082100037131184, + "language_loss": 0.64531755, + "learning_rate": 2.1693611717956072e-07, + "loss": 0.66658163, + "num_input_tokens_seen": 306613210, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14218, + "time_per_iteration": 2.4655544757843018 + }, + { + "auxiliary_loss_clip": 0.0110141, + "auxiliary_loss_mlp": 0.01029202, + "balance_loss_clip": 1.01771629, + "balance_loss_mlp": 1.03406596, + "epoch": 0.854892529685856, + "flos": 20412595249920.0, + "grad_norm": 1.7906180605197195, + "language_loss": 0.6969347, + "learning_rate": 2.167597412688238e-07, + "loss": 0.7182408, + "num_input_tokens_seen": 306631620, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 14219, + "time_per_iteration": 2.4618418216705322 + }, + { + "auxiliary_loss_clip": 0.01101273, + "auxiliary_loss_mlp": 0.0103232, + "balance_loss_clip": 1.02023816, + "balance_loss_mlp": 1.03277564, + "epoch": 0.854952652938524, + "flos": 16398716094720.0, + "grad_norm": 2.2477262309948722, + "language_loss": 0.67420268, + "learning_rate": 2.1658343297873549e-07, + "loss": 0.69553864, + "num_input_tokens_seen": 306646695, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.68359375, + "step": 14220, + "time_per_iteration": 3.827411651611328 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.0102881, + "balance_loss_clip": 1.01792645, + "balance_loss_mlp": 1.034024, + "epoch": 0.855012776191192, + "flos": 21178677542400.0, + "grad_norm": 1.977915778436477, + "language_loss": 0.71490705, + "learning_rate": 2.164071923159827e-07, + "loss": 0.73616409, + "num_input_tokens_seen": 306665465, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 14221, + "time_per_iteration": 2.4499704837799072 + }, + { + "auxiliary_loss_clip": 0.01100961, + "auxiliary_loss_mlp": 0.01035966, + "balance_loss_clip": 1.02373493, + "balance_loss_mlp": 1.0342536, + "epoch": 0.8550728994438599, + "flos": 26140993361280.0, + "grad_norm": 1.8036420515199072, + "language_loss": 0.59936148, + "learning_rate": 2.1623101928724763e-07, + "loss": 0.62073076, + "num_input_tokens_seen": 306685950, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 14222, + "time_per_iteration": 3.9379451274871826 + }, + { + "auxiliary_loss_clip": 0.01098725, + "auxiliary_loss_mlp": 0.01031633, + "balance_loss_clip": 1.01986158, + "balance_loss_mlp": 1.03435004, + "epoch": 0.8551330226965279, + "flos": 22786793435520.0, + "grad_norm": 1.8788661721369253, + "language_loss": 0.8384949, + "learning_rate": 2.1605491389921093e-07, + "loss": 0.85979849, + "num_input_tokens_seen": 306705740, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 14223, + "time_per_iteration": 2.4997923374176025 + }, + { + "auxiliary_loss_clip": 0.01099205, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.0179739, + "balance_loss_mlp": 1.03510475, + "epoch": 0.8551931459491958, + "flos": 22419032037120.0, + "grad_norm": 1.9589548379338593, + "language_loss": 0.74226081, + "learning_rate": 2.158788761585515e-07, + "loss": 0.76354808, + "num_input_tokens_seen": 306725065, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14224, + "time_per_iteration": 2.4575846195220947 + }, + { + "auxiliary_loss_clip": 0.01099212, + "auxiliary_loss_mlp": 0.01025746, + "balance_loss_clip": 1.01395071, + "balance_loss_mlp": 1.03435862, + "epoch": 0.8552532692018638, + "flos": 19573183342080.0, + "grad_norm": 2.0140938675703404, + "language_loss": 0.75260413, + "learning_rate": 2.1570290607194307e-07, + "loss": 0.77385372, + "num_input_tokens_seen": 306743630, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 14225, + "time_per_iteration": 3.9161388874053955 + }, + { + "auxiliary_loss_clip": 0.01098731, + "auxiliary_loss_mlp": 0.01034967, + "balance_loss_clip": 1.02398777, + "balance_loss_mlp": 1.03516436, + "epoch": 0.8553133924545318, + "flos": 26432767537920.0, + "grad_norm": 1.522872889346961, + "language_loss": 0.76993561, + "learning_rate": 2.1552700364605925e-07, + "loss": 0.79127258, + "num_input_tokens_seen": 306763105, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14226, + "time_per_iteration": 2.506988286972046 + }, + { + "auxiliary_loss_clip": 0.01102328, + "auxiliary_loss_mlp": 0.01033106, + "balance_loss_clip": 1.0207684, + "balance_loss_mlp": 1.03414547, + "epoch": 0.8553735157071998, + "flos": 16362446336640.0, + "grad_norm": 2.177222664744404, + "language_loss": 0.54483128, + "learning_rate": 2.153511688875702e-07, + "loss": 0.56618559, + "num_input_tokens_seen": 306779875, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 14227, + "time_per_iteration": 2.4459900856018066 + }, + { + "auxiliary_loss_clip": 0.01099961, + "auxiliary_loss_mlp": 0.01027518, + "balance_loss_clip": 1.01631856, + "balance_loss_mlp": 1.03569543, + "epoch": 0.8554336389598677, + "flos": 20887334328960.0, + "grad_norm": 2.2247440152306632, + "language_loss": 0.6510337, + "learning_rate": 2.151754018031442e-07, + "loss": 0.6723085, + "num_input_tokens_seen": 306800015, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14228, + "time_per_iteration": 2.4324324131011963 + }, + { + "auxiliary_loss_clip": 0.01101168, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01824021, + "balance_loss_mlp": 1.03467417, + "epoch": 0.8554937622125357, + "flos": 21284721469440.0, + "grad_norm": 7.206926402956923, + "language_loss": 0.74007577, + "learning_rate": 2.1499970239944542e-07, + "loss": 0.76138902, + "num_input_tokens_seen": 306814160, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 14229, + "time_per_iteration": 2.4285945892333984 + }, + { + "auxiliary_loss_clip": 0.01097864, + "auxiliary_loss_mlp": 0.01025289, + "balance_loss_clip": 1.01422048, + "balance_loss_mlp": 1.0336206, + "epoch": 0.8555538854652037, + "flos": 22413178120320.0, + "grad_norm": 1.6659159348805417, + "language_loss": 0.72572798, + "learning_rate": 2.1482407068313724e-07, + "loss": 0.74695945, + "num_input_tokens_seen": 306833310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14230, + "time_per_iteration": 2.4370429515838623 + }, + { + "auxiliary_loss_clip": 0.01099354, + "auxiliary_loss_mlp": 0.01025787, + "balance_loss_clip": 1.01428294, + "balance_loss_mlp": 1.03463602, + "epoch": 0.8556140087178716, + "flos": 20193719725440.0, + "grad_norm": 1.858724204103366, + "language_loss": 0.82625288, + "learning_rate": 2.1464850666087897e-07, + "loss": 0.84750426, + "num_input_tokens_seen": 306851345, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14231, + "time_per_iteration": 2.416818618774414 + }, + { + "auxiliary_loss_clip": 0.01102308, + "auxiliary_loss_mlp": 0.01033345, + "balance_loss_clip": 1.02078056, + "balance_loss_mlp": 1.03642631, + "epoch": 0.8556741319705397, + "flos": 22638123043200.0, + "grad_norm": 1.8022250670244886, + "language_loss": 0.67731422, + "learning_rate": 2.1447301033932796e-07, + "loss": 0.69867074, + "num_input_tokens_seen": 306871040, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 14232, + "time_per_iteration": 2.4504547119140625 + }, + { + "auxiliary_loss_clip": 0.01102277, + "auxiliary_loss_mlp": 0.01030336, + "balance_loss_clip": 1.0181942, + "balance_loss_mlp": 1.03550363, + "epoch": 0.8557342552232076, + "flos": 23549320281600.0, + "grad_norm": 1.4699321187791279, + "language_loss": 0.66779065, + "learning_rate": 2.1429758172513955e-07, + "loss": 0.68911678, + "num_input_tokens_seen": 306891625, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 14233, + "time_per_iteration": 2.478637933731079 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01031044, + "balance_loss_clip": 1.01955795, + "balance_loss_mlp": 1.032691, + "epoch": 0.8557943784758756, + "flos": 19609884063360.0, + "grad_norm": 1.6274699918849787, + "language_loss": 0.76340926, + "learning_rate": 2.1412222082496556e-07, + "loss": 0.78469753, + "num_input_tokens_seen": 306910020, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 14234, + "time_per_iteration": 2.421801805496216 + }, + { + "auxiliary_loss_clip": 0.01022342, + "auxiliary_loss_mlp": 0.01001271, + "balance_loss_clip": 1.00026405, + "balance_loss_mlp": 1.00228214, + "epoch": 0.8558545017285435, + "flos": 70641891446400.0, + "grad_norm": 0.7533011724258024, + "language_loss": 0.58039862, + "learning_rate": 2.1394692764545684e-07, + "loss": 0.60063475, + "num_input_tokens_seen": 306969505, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 14235, + "time_per_iteration": 3.016435146331787 + }, + { + "auxiliary_loss_clip": 0.01022713, + "auxiliary_loss_mlp": 0.0100235, + "balance_loss_clip": 1.00124168, + "balance_loss_mlp": 1.00244868, + "epoch": 0.8559146249812115, + "flos": 56649983086080.0, + "grad_norm": 0.7884525554635639, + "language_loss": 0.56691235, + "learning_rate": 2.1377170219325858e-07, + "loss": 0.58716297, + "num_input_tokens_seen": 307027710, + "router_z_loss_clip": 0.0111084, + "router_z_loss_mlp": 0.203125, + "step": 14236, + "time_per_iteration": 2.9483742713928223 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02263188, + "balance_loss_mlp": 1.03376889, + "epoch": 0.8559747482338794, + "flos": 22888240421760.0, + "grad_norm": 1.880299009711037, + "language_loss": 0.70168215, + "learning_rate": 2.1359654447501673e-07, + "loss": 0.72303009, + "num_input_tokens_seen": 307045515, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 14237, + "time_per_iteration": 2.4361391067504883 + }, + { + "auxiliary_loss_clip": 0.01097899, + "auxiliary_loss_mlp": 0.01028922, + "balance_loss_clip": 1.01747763, + "balance_loss_mlp": 1.03274059, + "epoch": 0.8560348714865474, + "flos": 22601925112320.0, + "grad_norm": 2.07166971946217, + "language_loss": 0.63688266, + "learning_rate": 2.1342145449737314e-07, + "loss": 0.65815091, + "num_input_tokens_seen": 307064470, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14238, + "time_per_iteration": 2.4701902866363525 + }, + { + "auxiliary_loss_clip": 0.01095366, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.01998234, + "balance_loss_mlp": 1.03308296, + "epoch": 0.8560949947392154, + "flos": 17931455297280.0, + "grad_norm": 1.599452795131822, + "language_loss": 0.69295937, + "learning_rate": 2.1324643226696648e-07, + "loss": 0.71421313, + "num_input_tokens_seen": 307083900, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.625, + "step": 14239, + "time_per_iteration": 2.4972500801086426 + }, + { + "auxiliary_loss_clip": 0.01102946, + "auxiliary_loss_mlp": 0.01031631, + "balance_loss_clip": 1.01977563, + "balance_loss_mlp": 1.0346055, + "epoch": 0.8561551179918834, + "flos": 31026208636800.0, + "grad_norm": 1.9185715345631495, + "language_loss": 0.66292799, + "learning_rate": 2.1307147779043455e-07, + "loss": 0.68427372, + "num_input_tokens_seen": 307104590, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 14240, + "time_per_iteration": 2.552086114883423 + }, + { + "auxiliary_loss_clip": 0.01100896, + "auxiliary_loss_mlp": 0.01031078, + "balance_loss_clip": 1.01841772, + "balance_loss_mlp": 1.03389883, + "epoch": 0.8562152412445513, + "flos": 30665198995200.0, + "grad_norm": 1.5899388107662171, + "language_loss": 0.62232125, + "learning_rate": 2.1289659107441182e-07, + "loss": 0.64364094, + "num_input_tokens_seen": 307125580, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14241, + "time_per_iteration": 2.50657057762146 + }, + { + "auxiliary_loss_clip": 0.01104391, + "auxiliary_loss_mlp": 0.01033043, + "balance_loss_clip": 1.02027607, + "balance_loss_mlp": 1.03487253, + "epoch": 0.8562753644972193, + "flos": 31576144838400.0, + "grad_norm": 1.708079074256712, + "language_loss": 0.74306595, + "learning_rate": 2.1272177212552855e-07, + "loss": 0.76444036, + "num_input_tokens_seen": 307147625, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 14242, + "time_per_iteration": 2.566230297088623 + }, + { + "auxiliary_loss_clip": 0.011037, + "auxiliary_loss_mlp": 0.01037225, + "balance_loss_clip": 1.02588248, + "balance_loss_mlp": 1.0358814, + "epoch": 0.8563354877498872, + "flos": 26213640618240.0, + "grad_norm": 1.936352707521622, + "language_loss": 0.7619487, + "learning_rate": 2.1254702095041498e-07, + "loss": 0.78335792, + "num_input_tokens_seen": 307164665, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 14243, + "time_per_iteration": 2.4888031482696533 + }, + { + "auxiliary_loss_clip": 0.01100916, + "auxiliary_loss_mlp": 0.01032267, + "balance_loss_clip": 1.02029276, + "balance_loss_mlp": 1.03415143, + "epoch": 0.8563956110025552, + "flos": 24134341092480.0, + "grad_norm": 1.9431842479847303, + "language_loss": 0.68101519, + "learning_rate": 2.123723375556974e-07, + "loss": 0.70234704, + "num_input_tokens_seen": 307182530, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14244, + "time_per_iteration": 2.508938789367676 + }, + { + "auxiliary_loss_clip": 0.01022635, + "auxiliary_loss_mlp": 0.01003162, + "balance_loss_clip": 1.00209546, + "balance_loss_mlp": 1.00242233, + "epoch": 0.8564557342552233, + "flos": 56271986311680.0, + "grad_norm": 0.7550815823287989, + "language_loss": 0.584894, + "learning_rate": 2.1219772194800046e-07, + "loss": 0.60515195, + "num_input_tokens_seen": 307241240, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.203125, + "step": 14245, + "time_per_iteration": 2.9439289569854736 + }, + { + "auxiliary_loss_clip": 0.01104025, + "auxiliary_loss_mlp": 0.01029128, + "balance_loss_clip": 1.01670074, + "balance_loss_mlp": 1.03502369, + "epoch": 0.8565158575078912, + "flos": 23440618748160.0, + "grad_norm": 1.6717513194149345, + "language_loss": 0.77544534, + "learning_rate": 2.1202317413394488e-07, + "loss": 0.79677689, + "num_input_tokens_seen": 307261485, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 14246, + "time_per_iteration": 2.499782085418701 + }, + { + "auxiliary_loss_clip": 0.01097341, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.01326489, + "balance_loss_mlp": 1.03187299, + "epoch": 0.8565759807605592, + "flos": 20375930442240.0, + "grad_norm": 1.8912838995768235, + "language_loss": 0.81099033, + "learning_rate": 2.1184869412014938e-07, + "loss": 0.83221304, + "num_input_tokens_seen": 307279160, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14247, + "time_per_iteration": 2.416072130203247 + }, + { + "auxiliary_loss_clip": 0.01100135, + "auxiliary_loss_mlp": 0.01030939, + "balance_loss_clip": 1.01893449, + "balance_loss_mlp": 1.03441513, + "epoch": 0.8566361040132271, + "flos": 18807101049600.0, + "grad_norm": 2.3574465797588506, + "language_loss": 0.77318221, + "learning_rate": 2.1167428191323112e-07, + "loss": 0.79449296, + "num_input_tokens_seen": 307297920, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14248, + "time_per_iteration": 2.427107334136963 + }, + { + "auxiliary_loss_clip": 0.01099982, + "auxiliary_loss_mlp": 0.01028728, + "balance_loss_clip": 1.01661038, + "balance_loss_mlp": 1.03290796, + "epoch": 0.8566962272658951, + "flos": 24535355506560.0, + "grad_norm": 1.848881659370633, + "language_loss": 0.77508557, + "learning_rate": 2.1149993751980278e-07, + "loss": 0.79637265, + "num_input_tokens_seen": 307318320, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14249, + "time_per_iteration": 2.4711170196533203 + }, + { + "auxiliary_loss_clip": 0.01097071, + "auxiliary_loss_mlp": 0.01033579, + "balance_loss_clip": 1.02206969, + "balance_loss_mlp": 1.03354955, + "epoch": 0.856756350518563, + "flos": 23178506227200.0, + "grad_norm": 2.07712303854438, + "language_loss": 0.78380144, + "learning_rate": 2.1132566094647597e-07, + "loss": 0.80510795, + "num_input_tokens_seen": 307336720, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 14250, + "time_per_iteration": 2.4842865467071533 + }, + { + "auxiliary_loss_clip": 0.01096261, + "auxiliary_loss_mlp": 0.01029426, + "balance_loss_clip": 1.01872694, + "balance_loss_mlp": 1.03293228, + "epoch": 0.856816473771231, + "flos": 20808581760000.0, + "grad_norm": 1.6921215208893117, + "language_loss": 0.79659212, + "learning_rate": 2.1115145219985942e-07, + "loss": 0.81784904, + "num_input_tokens_seen": 307354120, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 14251, + "time_per_iteration": 2.4252700805664062 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.01027247, + "balance_loss_clip": 1.0159874, + "balance_loss_mlp": 1.03339255, + "epoch": 0.856876597023899, + "flos": 20228157889920.0, + "grad_norm": 1.9109310863518794, + "language_loss": 0.61741138, + "learning_rate": 2.1097731128656005e-07, + "loss": 0.6386627, + "num_input_tokens_seen": 307373165, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 14252, + "time_per_iteration": 2.4443397521972656 + }, + { + "auxiliary_loss_clip": 0.01104246, + "auxiliary_loss_mlp": 0.01031655, + "balance_loss_clip": 1.01910281, + "balance_loss_mlp": 1.0367614, + "epoch": 0.856936720276567, + "flos": 18296128126080.0, + "grad_norm": 1.805516150747159, + "language_loss": 0.69350702, + "learning_rate": 2.1080323821317924e-07, + "loss": 0.71486604, + "num_input_tokens_seen": 307391000, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 14253, + "time_per_iteration": 2.4410698413848877 + }, + { + "auxiliary_loss_clip": 0.01022365, + "auxiliary_loss_mlp": 0.01004805, + "balance_loss_clip": 1.00375557, + "balance_loss_mlp": 1.00232601, + "epoch": 0.8569968435292349, + "flos": 69878394933120.0, + "grad_norm": 0.8052451552240734, + "language_loss": 0.59255153, + "learning_rate": 2.1062923298631907e-07, + "loss": 0.61282325, + "num_input_tokens_seen": 307452865, + "router_z_loss_clip": 0.01049805, + "router_z_loss_mlp": 0.20117188, + "step": 14254, + "time_per_iteration": 3.0850088596343994 + }, + { + "auxiliary_loss_clip": 0.01097856, + "auxiliary_loss_mlp": 0.01033179, + "balance_loss_clip": 1.02067935, + "balance_loss_mlp": 1.03278446, + "epoch": 0.8570569667819029, + "flos": 25848572739840.0, + "grad_norm": 1.6979396756616612, + "language_loss": 0.80917549, + "learning_rate": 2.1045529561257825e-07, + "loss": 0.83048582, + "num_input_tokens_seen": 307471940, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65234375, + "step": 14255, + "time_per_iteration": 2.4804954528808594 + }, + { + "auxiliary_loss_clip": 0.01098269, + "auxiliary_loss_mlp": 0.01025981, + "balance_loss_clip": 1.01472759, + "balance_loss_mlp": 1.03411806, + "epoch": 0.8571170900345708, + "flos": 23257115141760.0, + "grad_norm": 2.1938924508731823, + "language_loss": 0.67312753, + "learning_rate": 2.1028142609855126e-07, + "loss": 0.69437003, + "num_input_tokens_seen": 307488745, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14256, + "time_per_iteration": 2.4801876544952393 + }, + { + "auxiliary_loss_clip": 0.01101957, + "auxiliary_loss_mlp": 0.01031827, + "balance_loss_clip": 1.02076387, + "balance_loss_mlp": 1.03524375, + "epoch": 0.8571772132872388, + "flos": 18917670090240.0, + "grad_norm": 1.7102679529995346, + "language_loss": 0.69775069, + "learning_rate": 2.1010762445083218e-07, + "loss": 0.71908844, + "num_input_tokens_seen": 307506855, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66796875, + "step": 14257, + "time_per_iteration": 2.431058406829834 + }, + { + "auxiliary_loss_clip": 0.01098652, + "auxiliary_loss_mlp": 0.01028541, + "balance_loss_clip": 1.01653636, + "balance_loss_mlp": 1.03418648, + "epoch": 0.8572373365399069, + "flos": 33250120318080.0, + "grad_norm": 2.0065019145348204, + "language_loss": 0.77076191, + "learning_rate": 2.0993389067601197e-07, + "loss": 0.79203385, + "num_input_tokens_seen": 307526115, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 14258, + "time_per_iteration": 2.534079074859619 + }, + { + "auxiliary_loss_clip": 0.01098475, + "auxiliary_loss_mlp": 0.01028758, + "balance_loss_clip": 1.01732588, + "balance_loss_mlp": 1.03451884, + "epoch": 0.8572974597925748, + "flos": 23327535755520.0, + "grad_norm": 1.5055834428121542, + "language_loss": 0.67819071, + "learning_rate": 2.0976022478067735e-07, + "loss": 0.69946301, + "num_input_tokens_seen": 307545230, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14259, + "time_per_iteration": 2.485220432281494 + }, + { + "auxiliary_loss_clip": 0.01098298, + "auxiliary_loss_mlp": 0.01028367, + "balance_loss_clip": 1.01689315, + "balance_loss_mlp": 1.03273714, + "epoch": 0.8573575830452428, + "flos": 24535858296960.0, + "grad_norm": 1.6998154571909854, + "language_loss": 0.77415329, + "learning_rate": 2.0958662677141437e-07, + "loss": 0.79541999, + "num_input_tokens_seen": 307564900, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 14260, + "time_per_iteration": 2.5281169414520264 + }, + { + "auxiliary_loss_clip": 0.01101383, + "auxiliary_loss_mlp": 0.01028533, + "balance_loss_clip": 1.01651108, + "balance_loss_mlp": 1.03417957, + "epoch": 0.8574177062979107, + "flos": 24165403378560.0, + "grad_norm": 2.4235985456915867, + "language_loss": 0.74327439, + "learning_rate": 2.09413096654806e-07, + "loss": 0.76457351, + "num_input_tokens_seen": 307583500, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 14261, + "time_per_iteration": 2.4572179317474365 + }, + { + "auxiliary_loss_clip": 0.01102872, + "auxiliary_loss_mlp": 0.01030228, + "balance_loss_clip": 1.01750875, + "balance_loss_mlp": 1.03488421, + "epoch": 0.8574778295505787, + "flos": 17930737025280.0, + "grad_norm": 1.7840842625945281, + "language_loss": 0.7859261, + "learning_rate": 2.0923963443743276e-07, + "loss": 0.80725712, + "num_input_tokens_seen": 307601430, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14262, + "time_per_iteration": 3.904160499572754 + }, + { + "auxiliary_loss_clip": 0.01099834, + "auxiliary_loss_mlp": 0.01031195, + "balance_loss_clip": 1.02024531, + "balance_loss_mlp": 1.03637409, + "epoch": 0.8575379528032466, + "flos": 21580697537280.0, + "grad_norm": 1.756622359750573, + "language_loss": 0.67971861, + "learning_rate": 2.0906624012587203e-07, + "loss": 0.70102894, + "num_input_tokens_seen": 307621495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14263, + "time_per_iteration": 5.447778224945068 + }, + { + "auxiliary_loss_clip": 0.01099256, + "auxiliary_loss_mlp": 0.0102962, + "balance_loss_clip": 1.01801479, + "balance_loss_mlp": 1.03373396, + "epoch": 0.8575980760559146, + "flos": 21761579450880.0, + "grad_norm": 1.4777257983100802, + "language_loss": 0.79465747, + "learning_rate": 2.088929137266986e-07, + "loss": 0.81594616, + "num_input_tokens_seen": 307640840, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14264, + "time_per_iteration": 2.500290870666504 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01032748, + "balance_loss_clip": 1.02100539, + "balance_loss_mlp": 1.03500402, + "epoch": 0.8576581993085826, + "flos": 34386442047360.0, + "grad_norm": 1.2906730752175566, + "language_loss": 0.69431353, + "learning_rate": 2.0871965524648582e-07, + "loss": 0.71563625, + "num_input_tokens_seen": 307663820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 14265, + "time_per_iteration": 2.5555310249328613 + }, + { + "auxiliary_loss_clip": 0.0109498, + "auxiliary_loss_mlp": 0.01024602, + "balance_loss_clip": 1.01414716, + "balance_loss_mlp": 1.03292894, + "epoch": 0.8577183225612506, + "flos": 23222497409280.0, + "grad_norm": 2.230270985718821, + "language_loss": 0.66134441, + "learning_rate": 2.085464646918027e-07, + "loss": 0.6825403, + "num_input_tokens_seen": 307682385, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62109375, + "step": 14266, + "time_per_iteration": 2.497089147567749 + }, + { + "auxiliary_loss_clip": 0.01098797, + "auxiliary_loss_mlp": 0.01030539, + "balance_loss_clip": 1.01898146, + "balance_loss_mlp": 1.03485322, + "epoch": 0.8577784458139185, + "flos": 28804164462720.0, + "grad_norm": 2.2520575670699468, + "language_loss": 0.75218296, + "learning_rate": 2.0837334206921731e-07, + "loss": 0.77347636, + "num_input_tokens_seen": 307704680, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14267, + "time_per_iteration": 4.048743963241577 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.0102667, + "balance_loss_clip": 1.01561344, + "balance_loss_mlp": 1.0337677, + "epoch": 0.8578385690665865, + "flos": 19755573626880.0, + "grad_norm": 1.7118804169147943, + "language_loss": 0.87602067, + "learning_rate": 2.082002873852946e-07, + "loss": 0.89726847, + "num_input_tokens_seen": 307723245, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14268, + "time_per_iteration": 2.439980983734131 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.01032301, + "balance_loss_clip": 1.02029121, + "balance_loss_mlp": 1.0352304, + "epoch": 0.8578986923192544, + "flos": 20704082117760.0, + "grad_norm": 1.6901457258620303, + "language_loss": 0.73087263, + "learning_rate": 2.0802730064659667e-07, + "loss": 0.7522046, + "num_input_tokens_seen": 307742510, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14269, + "time_per_iteration": 2.4319350719451904 + }, + { + "auxiliary_loss_clip": 0.01100626, + "auxiliary_loss_mlp": 0.01030094, + "balance_loss_clip": 1.01835752, + "balance_loss_mlp": 1.03426695, + "epoch": 0.8579588155719224, + "flos": 36101715189120.0, + "grad_norm": 1.4429206578235492, + "language_loss": 0.66260904, + "learning_rate": 2.0785438185968252e-07, + "loss": 0.68391621, + "num_input_tokens_seen": 307766030, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 14270, + "time_per_iteration": 2.5631306171417236 + }, + { + "auxiliary_loss_clip": 0.01097057, + "auxiliary_loss_mlp": 0.01027188, + "balance_loss_clip": 1.01599455, + "balance_loss_mlp": 1.03327835, + "epoch": 0.8580189388245905, + "flos": 22853479034880.0, + "grad_norm": 1.5248509444255016, + "language_loss": 0.73964077, + "learning_rate": 2.0768153103110997e-07, + "loss": 0.76088321, + "num_input_tokens_seen": 307785800, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 14271, + "time_per_iteration": 2.4392032623291016 + }, + { + "auxiliary_loss_clip": 0.01022391, + "auxiliary_loss_mlp": 0.01000539, + "balance_loss_clip": 0.99947166, + "balance_loss_mlp": 1.00218976, + "epoch": 0.8580790620772584, + "flos": 69642104290560.0, + "grad_norm": 0.8025264948079197, + "language_loss": 0.59533787, + "learning_rate": 2.0750874816743358e-07, + "loss": 0.61556721, + "num_input_tokens_seen": 307850995, + "router_z_loss_clip": 0.01068115, + "router_z_loss_mlp": 0.20214844, + "step": 14272, + "time_per_iteration": 3.0923521518707275 + }, + { + "auxiliary_loss_clip": 0.01103984, + "auxiliary_loss_mlp": 0.01030555, + "balance_loss_clip": 1.01855707, + "balance_loss_mlp": 1.03503764, + "epoch": 0.8581391853299264, + "flos": 13334243270400.0, + "grad_norm": 1.8233636053410176, + "language_loss": 0.7532993, + "learning_rate": 2.0733603327520499e-07, + "loss": 0.77464467, + "num_input_tokens_seen": 307868585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 14273, + "time_per_iteration": 2.4198429584503174 + }, + { + "auxiliary_loss_clip": 0.01100263, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01861334, + "balance_loss_mlp": 1.03461802, + "epoch": 0.8581993085825943, + "flos": 19645651031040.0, + "grad_norm": 1.8234845594487459, + "language_loss": 0.82452077, + "learning_rate": 2.0716338636097385e-07, + "loss": 0.84582376, + "num_input_tokens_seen": 307886820, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14274, + "time_per_iteration": 2.423983573913574 + }, + { + "auxiliary_loss_clip": 0.01022729, + "auxiliary_loss_mlp": 0.01001298, + "balance_loss_clip": 1.00023675, + "balance_loss_mlp": 1.00258183, + "epoch": 0.8582594318352623, + "flos": 55825077294720.0, + "grad_norm": 0.800704100166407, + "language_loss": 0.60889721, + "learning_rate": 2.0699080743128672e-07, + "loss": 0.62913746, + "num_input_tokens_seen": 307944020, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14275, + "time_per_iteration": 3.0931267738342285 + }, + { + "auxiliary_loss_clip": 0.01100126, + "auxiliary_loss_mlp": 0.01023241, + "balance_loss_clip": 1.01086688, + "balance_loss_mlp": 1.03334188, + "epoch": 0.8583195550879302, + "flos": 24279563779200.0, + "grad_norm": 2.027592784857776, + "language_loss": 0.5901401, + "learning_rate": 2.0681829649268768e-07, + "loss": 0.61137378, + "num_input_tokens_seen": 307961055, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 14276, + "time_per_iteration": 2.4789164066314697 + }, + { + "auxiliary_loss_clip": 0.0109966, + "auxiliary_loss_mlp": 0.0103046, + "balance_loss_clip": 1.01880693, + "balance_loss_mlp": 1.03338301, + "epoch": 0.8583796783405983, + "flos": 13444129952640.0, + "grad_norm": 1.9301473140646082, + "language_loss": 0.76305163, + "learning_rate": 2.0664585355171838e-07, + "loss": 0.78435278, + "num_input_tokens_seen": 307978690, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14277, + "time_per_iteration": 2.4284141063690186 + }, + { + "auxiliary_loss_clip": 0.01099699, + "auxiliary_loss_mlp": 0.0103111, + "balance_loss_clip": 1.01927865, + "balance_loss_mlp": 1.03406823, + "epoch": 0.8584398015932662, + "flos": 16180271533440.0, + "grad_norm": 1.7203854489642774, + "language_loss": 0.83213818, + "learning_rate": 2.0647347861491803e-07, + "loss": 0.8534463, + "num_input_tokens_seen": 307995870, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 14278, + "time_per_iteration": 2.4328911304473877 + }, + { + "auxiliary_loss_clip": 0.01103618, + "auxiliary_loss_mlp": 0.01031741, + "balance_loss_clip": 1.01918292, + "balance_loss_mlp": 1.03535473, + "epoch": 0.8584999248459342, + "flos": 17450431338240.0, + "grad_norm": 1.904094363683198, + "language_loss": 0.74556804, + "learning_rate": 2.0630117168882366e-07, + "loss": 0.76692164, + "num_input_tokens_seen": 308013645, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 14279, + "time_per_iteration": 2.4168436527252197 + }, + { + "auxiliary_loss_clip": 0.01098473, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.02028763, + "balance_loss_mlp": 1.03394675, + "epoch": 0.8585600480986021, + "flos": 23441013797760.0, + "grad_norm": 2.550357980437511, + "language_loss": 0.66932499, + "learning_rate": 2.0612893277996845e-07, + "loss": 0.69062865, + "num_input_tokens_seen": 308032490, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 14280, + "time_per_iteration": 2.4760711193084717 + }, + { + "auxiliary_loss_clip": 0.01097121, + "auxiliary_loss_mlp": 0.01026889, + "balance_loss_clip": 1.0157609, + "balance_loss_mlp": 1.03308606, + "epoch": 0.8586201713512701, + "flos": 19937927998080.0, + "grad_norm": 1.980183019426352, + "language_loss": 0.62603807, + "learning_rate": 2.0595676189488343e-07, + "loss": 0.64727819, + "num_input_tokens_seen": 308052110, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 14281, + "time_per_iteration": 2.4344727993011475 + }, + { + "auxiliary_loss_clip": 0.01097793, + "auxiliary_loss_mlp": 0.01029239, + "balance_loss_clip": 1.01754475, + "balance_loss_mlp": 1.03264332, + "epoch": 0.858680294603938, + "flos": 15304769435520.0, + "grad_norm": 1.8443011587610691, + "language_loss": 0.73088598, + "learning_rate": 2.0578465904009845e-07, + "loss": 0.75215626, + "num_input_tokens_seen": 308070660, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14282, + "time_per_iteration": 2.43040132522583 + }, + { + "auxiliary_loss_clip": 0.01096079, + "auxiliary_loss_mlp": 0.01023811, + "balance_loss_clip": 1.01301026, + "balance_loss_mlp": 1.03145468, + "epoch": 0.858740417856606, + "flos": 22711237176960.0, + "grad_norm": 1.8514176123230044, + "language_loss": 0.75841701, + "learning_rate": 2.0561262422213832e-07, + "loss": 0.77961594, + "num_input_tokens_seen": 308089520, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 14283, + "time_per_iteration": 2.4622461795806885 + }, + { + "auxiliary_loss_clip": 0.01099212, + "auxiliary_loss_mlp": 0.0102773, + "balance_loss_clip": 1.01597571, + "balance_loss_mlp": 1.03318167, + "epoch": 0.8588005411092741, + "flos": 34054303962240.0, + "grad_norm": 1.9075857879793239, + "language_loss": 0.59797478, + "learning_rate": 2.0544065744752736e-07, + "loss": 0.61924422, + "num_input_tokens_seen": 308111545, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 14284, + "time_per_iteration": 2.572291135787964 + }, + { + "auxiliary_loss_clip": 0.01098357, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.01714206, + "balance_loss_mlp": 1.03492391, + "epoch": 0.858860664361942, + "flos": 28913584268160.0, + "grad_norm": 1.7674555640674585, + "language_loss": 0.75632811, + "learning_rate": 2.0526875872278749e-07, + "loss": 0.77759862, + "num_input_tokens_seen": 308129690, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6328125, + "step": 14285, + "time_per_iteration": 2.5170655250549316 + }, + { + "auxiliary_loss_clip": 0.01103712, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.02010632, + "balance_loss_mlp": 1.03761566, + "epoch": 0.85892078761461, + "flos": 19792525743360.0, + "grad_norm": 2.139576760488393, + "language_loss": 0.74618649, + "learning_rate": 2.0509692805443524e-07, + "loss": 0.76754409, + "num_input_tokens_seen": 308147410, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66015625, + "step": 14286, + "time_per_iteration": 2.470148801803589 + }, + { + "auxiliary_loss_clip": 0.01022355, + "auxiliary_loss_mlp": 0.01002252, + "balance_loss_clip": 1.00119138, + "balance_loss_mlp": 1.00219035, + "epoch": 0.8589809108672779, + "flos": 67106630039040.0, + "grad_norm": 0.7838160557380515, + "language_loss": 0.49406371, + "learning_rate": 2.0492516544898718e-07, + "loss": 0.51430982, + "num_input_tokens_seen": 308204875, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14287, + "time_per_iteration": 3.0114223957061768 + }, + { + "auxiliary_loss_clip": 0.01101532, + "auxiliary_loss_mlp": 0.01028987, + "balance_loss_clip": 1.01747108, + "balance_loss_mlp": 1.03548408, + "epoch": 0.8590410341199459, + "flos": 29716259541120.0, + "grad_norm": 1.771130529708156, + "language_loss": 0.79141223, + "learning_rate": 2.0475347091295704e-07, + "loss": 0.81271744, + "num_input_tokens_seen": 308225690, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 14288, + "time_per_iteration": 2.515892505645752 + }, + { + "auxiliary_loss_clip": 0.01101196, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01805007, + "balance_loss_mlp": 1.03466153, + "epoch": 0.8591011573726138, + "flos": 23987430466560.0, + "grad_norm": 1.977299160967245, + "language_loss": 0.80659628, + "learning_rate": 2.045818444528553e-07, + "loss": 0.82791466, + "num_input_tokens_seen": 308245255, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14289, + "time_per_iteration": 2.479477643966675 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01024939, + "balance_loss_clip": 1.01370382, + "balance_loss_mlp": 1.03834021, + "epoch": 0.8591612806252819, + "flos": 14428656806400.0, + "grad_norm": 1.963202375321621, + "language_loss": 0.65073603, + "learning_rate": 2.0441028607518973e-07, + "loss": 0.6720252, + "num_input_tokens_seen": 308261755, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14290, + "time_per_iteration": 2.423489809036255 + }, + { + "auxiliary_loss_clip": 0.0110313, + "auxiliary_loss_mlp": 0.01027625, + "balance_loss_clip": 1.01552534, + "balance_loss_mlp": 1.03614926, + "epoch": 0.8592214038779498, + "flos": 31577150419200.0, + "grad_norm": 1.8450218839619243, + "language_loss": 0.55117351, + "learning_rate": 2.0423879578646642e-07, + "loss": 0.57248104, + "num_input_tokens_seen": 308285145, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14291, + "time_per_iteration": 2.5542116165161133 + }, + { + "auxiliary_loss_clip": 0.011028, + "auxiliary_loss_mlp": 0.01030334, + "balance_loss_clip": 1.01830578, + "balance_loss_mlp": 1.03624988, + "epoch": 0.8592815271306178, + "flos": 17457290835840.0, + "grad_norm": 1.854464069187037, + "language_loss": 0.70960593, + "learning_rate": 2.0406737359318792e-07, + "loss": 0.7309373, + "num_input_tokens_seen": 308304130, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 14292, + "time_per_iteration": 2.443986177444458 + }, + { + "auxiliary_loss_clip": 0.0109989, + "auxiliary_loss_mlp": 0.0103186, + "balance_loss_clip": 1.01981997, + "balance_loss_mlp": 1.0340476, + "epoch": 0.8593416503832857, + "flos": 25411360394880.0, + "grad_norm": 1.5234570799491314, + "language_loss": 0.71305615, + "learning_rate": 2.038960195018542e-07, + "loss": 0.73437369, + "num_input_tokens_seen": 308324670, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 14293, + "time_per_iteration": 2.4947001934051514 + }, + { + "auxiliary_loss_clip": 0.01097643, + "auxiliary_loss_mlp": 0.01027806, + "balance_loss_clip": 1.01639175, + "balance_loss_mlp": 1.03337383, + "epoch": 0.8594017736359537, + "flos": 20996646393600.0, + "grad_norm": 1.4740238785457052, + "language_loss": 0.68373334, + "learning_rate": 2.0372473351896358e-07, + "loss": 0.70498788, + "num_input_tokens_seen": 308344215, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14294, + "time_per_iteration": 2.457850217819214 + }, + { + "auxiliary_loss_clip": 0.01096656, + "auxiliary_loss_mlp": 0.01027361, + "balance_loss_clip": 1.01627481, + "balance_loss_mlp": 1.03254747, + "epoch": 0.8594618968886216, + "flos": 22091059929600.0, + "grad_norm": 2.085676354981098, + "language_loss": 0.77626079, + "learning_rate": 2.0355351565101087e-07, + "loss": 0.79750097, + "num_input_tokens_seen": 308360520, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 14295, + "time_per_iteration": 2.495396614074707 + }, + { + "auxiliary_loss_clip": 0.01104942, + "auxiliary_loss_mlp": 0.01037594, + "balance_loss_clip": 1.02426088, + "balance_loss_mlp": 1.03482556, + "epoch": 0.8595220201412896, + "flos": 11656245467520.0, + "grad_norm": 3.9782673928281707, + "language_loss": 0.68883216, + "learning_rate": 2.0338236590448975e-07, + "loss": 0.71025753, + "num_input_tokens_seen": 308376865, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 14296, + "time_per_iteration": 2.450352907180786 + }, + { + "auxiliary_loss_clip": 0.01100285, + "auxiliary_loss_mlp": 0.01027668, + "balance_loss_clip": 1.0159142, + "balance_loss_mlp": 1.03523743, + "epoch": 0.8595821433939577, + "flos": 25040366772480.0, + "grad_norm": 3.015762114877603, + "language_loss": 0.79009968, + "learning_rate": 2.0321128428588842e-07, + "loss": 0.81137919, + "num_input_tokens_seen": 308395870, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 14297, + "time_per_iteration": 2.496976375579834 + }, + { + "auxiliary_loss_clip": 0.01095598, + "auxiliary_loss_mlp": 0.0102669, + "balance_loss_clip": 1.01620579, + "balance_loss_mlp": 1.03275037, + "epoch": 0.8596422666466256, + "flos": 28511528359680.0, + "grad_norm": 1.5403409682157543, + "language_loss": 0.67909223, + "learning_rate": 2.030402708016954e-07, + "loss": 0.70031512, + "num_input_tokens_seen": 308417250, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.62890625, + "step": 14298, + "time_per_iteration": 2.538550615310669 + }, + { + "auxiliary_loss_clip": 0.01098875, + "auxiliary_loss_mlp": 0.01031363, + "balance_loss_clip": 1.02002037, + "balance_loss_mlp": 1.03526485, + "epoch": 0.8597023898992936, + "flos": 13589137157760.0, + "grad_norm": 1.9417237901120834, + "language_loss": 0.68884093, + "learning_rate": 2.0286932545839576e-07, + "loss": 0.71014321, + "num_input_tokens_seen": 308434565, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 14299, + "time_per_iteration": 2.474458932876587 + }, + { + "auxiliary_loss_clip": 0.01104456, + "auxiliary_loss_mlp": 0.01034924, + "balance_loss_clip": 1.02301526, + "balance_loss_mlp": 1.03598309, + "epoch": 0.8597625131519615, + "flos": 32300821728000.0, + "grad_norm": 2.8413419910831603, + "language_loss": 0.714288, + "learning_rate": 2.0269844826247096e-07, + "loss": 0.73568177, + "num_input_tokens_seen": 308450040, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 14300, + "time_per_iteration": 2.557711362838745 + }, + { + "auxiliary_loss_clip": 0.01098234, + "auxiliary_loss_mlp": 0.01028358, + "balance_loss_clip": 1.01701558, + "balance_loss_mlp": 1.03311145, + "epoch": 0.8598226364046295, + "flos": 28730367970560.0, + "grad_norm": 1.9517620753263947, + "language_loss": 0.68880975, + "learning_rate": 2.0252763922040116e-07, + "loss": 0.71007574, + "num_input_tokens_seen": 308470545, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14301, + "time_per_iteration": 2.508291482925415 + }, + { + "auxiliary_loss_clip": 0.0110057, + "auxiliary_loss_mlp": 0.01028957, + "balance_loss_clip": 1.01737559, + "balance_loss_mlp": 1.03456163, + "epoch": 0.8598827596572974, + "flos": 21871825269120.0, + "grad_norm": 1.7412735046511287, + "language_loss": 0.74149466, + "learning_rate": 2.023568983386641e-07, + "loss": 0.76278991, + "num_input_tokens_seen": 308490020, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 14302, + "time_per_iteration": 2.4727671146392822 + }, + { + "auxiliary_loss_clip": 0.01094464, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01959372, + "balance_loss_mlp": 1.03196108, + "epoch": 0.8599428829099655, + "flos": 23767297966080.0, + "grad_norm": 1.936490583350926, + "language_loss": 0.83610648, + "learning_rate": 2.02186225623733e-07, + "loss": 0.85735059, + "num_input_tokens_seen": 308509065, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 14303, + "time_per_iteration": 3.903238534927368 + }, + { + "auxiliary_loss_clip": 0.01100544, + "auxiliary_loss_mlp": 0.01034312, + "balance_loss_clip": 1.02172899, + "balance_loss_mlp": 1.0343709, + "epoch": 0.8600030061626334, + "flos": 16212770363520.0, + "grad_norm": 2.4591826021535392, + "language_loss": 0.77123845, + "learning_rate": 2.0201562108208025e-07, + "loss": 0.79258698, + "num_input_tokens_seen": 308524725, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66015625, + "step": 14304, + "time_per_iteration": 2.416091203689575 + }, + { + "auxiliary_loss_clip": 0.0110075, + "auxiliary_loss_mlp": 0.01034223, + "balance_loss_clip": 1.02202225, + "balance_loss_mlp": 1.03498769, + "epoch": 0.8600631294153014, + "flos": 15669370437120.0, + "grad_norm": 2.157857805274316, + "language_loss": 0.53831017, + "learning_rate": 2.0184508472017537e-07, + "loss": 0.5596599, + "num_input_tokens_seen": 308543525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14305, + "time_per_iteration": 3.9741735458374023 + }, + { + "auxiliary_loss_clip": 0.01100123, + "auxiliary_loss_mlp": 0.0102697, + "balance_loss_clip": 1.01480484, + "balance_loss_mlp": 1.0354023, + "epoch": 0.8601232526679693, + "flos": 17493093717120.0, + "grad_norm": 1.7690199733302432, + "language_loss": 0.83873999, + "learning_rate": 2.0167461654448558e-07, + "loss": 0.86001092, + "num_input_tokens_seen": 308557995, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6484375, + "step": 14306, + "time_per_iteration": 2.4614713191986084 + }, + { + "auxiliary_loss_clip": 0.01095821, + "auxiliary_loss_mlp": 0.01024556, + "balance_loss_clip": 1.01425052, + "balance_loss_mlp": 1.03247511, + "epoch": 0.8601833759206373, + "flos": 26985935963520.0, + "grad_norm": 1.3609792949742232, + "language_loss": 0.71544546, + "learning_rate": 2.01504216561474e-07, + "loss": 0.73664916, + "num_input_tokens_seen": 308582750, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6328125, + "step": 14307, + "time_per_iteration": 2.544206380844116 + }, + { + "auxiliary_loss_clip": 0.01101911, + "auxiliary_loss_mlp": 0.01038429, + "balance_loss_clip": 1.02579904, + "balance_loss_mlp": 1.03417909, + "epoch": 0.8602434991733052, + "flos": 25229760209280.0, + "grad_norm": 2.1884393758848018, + "language_loss": 0.63564229, + "learning_rate": 2.0133388477760316e-07, + "loss": 0.65704566, + "num_input_tokens_seen": 308603770, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6796875, + "step": 14308, + "time_per_iteration": 2.4838173389434814 + }, + { + "auxiliary_loss_clip": 0.01022727, + "auxiliary_loss_mlp": 0.01000151, + "balance_loss_clip": 0.99907821, + "balance_loss_mlp": 1.00265324, + "epoch": 0.8603036224259732, + "flos": 71015363107200.0, + "grad_norm": 0.6241531755089905, + "language_loss": 0.48517621, + "learning_rate": 2.0116362119933172e-07, + "loss": 0.50540501, + "num_input_tokens_seen": 308667735, + "router_z_loss_clip": 0.01074219, + "router_z_loss_mlp": 0.20117188, + "step": 14309, + "time_per_iteration": 4.55596661567688 + }, + { + "auxiliary_loss_clip": 0.01101397, + "auxiliary_loss_mlp": 0.0103216, + "balance_loss_clip": 1.01948833, + "balance_loss_mlp": 1.03436673, + "epoch": 0.8603637456786413, + "flos": 20300625578880.0, + "grad_norm": 1.614329583296113, + "language_loss": 0.67071158, + "learning_rate": 2.0099342583311563e-07, + "loss": 0.69204712, + "num_input_tokens_seen": 308686300, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 14310, + "time_per_iteration": 2.468801498413086 + }, + { + "auxiliary_loss_clip": 0.01101001, + "auxiliary_loss_mlp": 0.0102965, + "balance_loss_clip": 1.01839685, + "balance_loss_mlp": 1.03405249, + "epoch": 0.8604238689313092, + "flos": 21835842819840.0, + "grad_norm": 1.9370711593736145, + "language_loss": 0.77883255, + "learning_rate": 2.0082329868540905e-07, + "loss": 0.80013907, + "num_input_tokens_seen": 308705825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 14311, + "time_per_iteration": 2.4640908241271973 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01028007, + "balance_loss_clip": 1.01667011, + "balance_loss_mlp": 1.03325152, + "epoch": 0.8604839921839772, + "flos": 18004210295040.0, + "grad_norm": 2.1891429021237627, + "language_loss": 0.71380526, + "learning_rate": 2.006532397626639e-07, + "loss": 0.73506045, + "num_input_tokens_seen": 308723340, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14312, + "time_per_iteration": 2.445478916168213 + }, + { + "auxiliary_loss_clip": 0.01096908, + "auxiliary_loss_mlp": 0.01028725, + "balance_loss_clip": 1.01734078, + "balance_loss_mlp": 1.03194928, + "epoch": 0.8605441154366451, + "flos": 16252164604800.0, + "grad_norm": 3.7836745296831364, + "language_loss": 0.77748859, + "learning_rate": 2.0048324907132797e-07, + "loss": 0.79874492, + "num_input_tokens_seen": 308741280, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14313, + "time_per_iteration": 2.419283866882324 + }, + { + "auxiliary_loss_clip": 0.01100089, + "auxiliary_loss_mlp": 0.01030072, + "balance_loss_clip": 1.01744211, + "balance_loss_mlp": 1.0360148, + "epoch": 0.8606042386893131, + "flos": 32267065921920.0, + "grad_norm": 1.4787616110035209, + "language_loss": 0.72963393, + "learning_rate": 2.003133266178474e-07, + "loss": 0.75093555, + "num_input_tokens_seen": 308762875, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.640625, + "step": 14314, + "time_per_iteration": 2.5891568660736084 + }, + { + "auxiliary_loss_clip": 0.01097226, + "auxiliary_loss_mlp": 0.01028126, + "balance_loss_clip": 1.01601493, + "balance_loss_mlp": 1.03222215, + "epoch": 0.860664361941981, + "flos": 20229774001920.0, + "grad_norm": 1.725480228897019, + "language_loss": 0.69030631, + "learning_rate": 2.001434724086657e-07, + "loss": 0.71155983, + "num_input_tokens_seen": 308780315, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 14315, + "time_per_iteration": 2.43929386138916 + }, + { + "auxiliary_loss_clip": 0.01099964, + "auxiliary_loss_mlp": 0.01035331, + "balance_loss_clip": 1.02407146, + "balance_loss_mlp": 1.03532481, + "epoch": 0.8607244851946491, + "flos": 25191622944000.0, + "grad_norm": 1.720640234403056, + "language_loss": 0.72141165, + "learning_rate": 1.9997368645022418e-07, + "loss": 0.74276459, + "num_input_tokens_seen": 308799435, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 14316, + "time_per_iteration": 2.5051429271698 + }, + { + "auxiliary_loss_clip": 0.01103739, + "auxiliary_loss_mlp": 0.01031834, + "balance_loss_clip": 1.02011001, + "balance_loss_mlp": 1.03745365, + "epoch": 0.860784608447317, + "flos": 20482082110080.0, + "grad_norm": 5.919015136820617, + "language_loss": 0.82782209, + "learning_rate": 1.9980396874896056e-07, + "loss": 0.84917772, + "num_input_tokens_seen": 308817730, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 14317, + "time_per_iteration": 2.430666923522949 + }, + { + "auxiliary_loss_clip": 0.01099259, + "auxiliary_loss_mlp": 0.01030362, + "balance_loss_clip": 1.0189358, + "balance_loss_mlp": 1.03569496, + "epoch": 0.860844731699985, + "flos": 50476037696640.0, + "grad_norm": 1.677931985843384, + "language_loss": 0.67345351, + "learning_rate": 1.996343193113108e-07, + "loss": 0.69474971, + "num_input_tokens_seen": 308841735, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 14318, + "time_per_iteration": 2.736192226409912 + }, + { + "auxiliary_loss_clip": 0.01096419, + "auxiliary_loss_mlp": 0.01026294, + "balance_loss_clip": 1.01545799, + "balance_loss_mlp": 1.03341532, + "epoch": 0.8609048549526529, + "flos": 41172768455040.0, + "grad_norm": 1.8345368571584644, + "language_loss": 0.71489882, + "learning_rate": 1.9946473814370911e-07, + "loss": 0.73612595, + "num_input_tokens_seen": 308865050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 14319, + "time_per_iteration": 2.6309432983398438 + }, + { + "auxiliary_loss_clip": 0.01106052, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.01912153, + "balance_loss_mlp": 1.03829098, + "epoch": 0.8609649782053209, + "flos": 23951196622080.0, + "grad_norm": 3.6611413698943016, + "language_loss": 0.67307162, + "learning_rate": 1.992952252525839e-07, + "loss": 0.69444174, + "num_input_tokens_seen": 308885375, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 14320, + "time_per_iteration": 2.4838664531707764 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01036541, + "balance_loss_clip": 1.02404201, + "balance_loss_mlp": 1.03398824, + "epoch": 0.8610251014579888, + "flos": 23112574813440.0, + "grad_norm": 3.6200178923733457, + "language_loss": 0.80436695, + "learning_rate": 1.9912578064436446e-07, + "loss": 0.82575703, + "num_input_tokens_seen": 308904700, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 14321, + "time_per_iteration": 2.4537878036499023 + }, + { + "auxiliary_loss_clip": 0.01097469, + "auxiliary_loss_mlp": 0.01029444, + "balance_loss_clip": 1.01696324, + "balance_loss_mlp": 1.03430629, + "epoch": 0.8610852247106568, + "flos": 19426811420160.0, + "grad_norm": 2.244134105201333, + "language_loss": 0.71181291, + "learning_rate": 1.9895640432547567e-07, + "loss": 0.73308206, + "num_input_tokens_seen": 308922985, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6328125, + "step": 14322, + "time_per_iteration": 2.4386720657348633 + }, + { + "auxiliary_loss_clip": 0.01105065, + "auxiliary_loss_mlp": 0.0103565, + "balance_loss_clip": 1.0233357, + "balance_loss_mlp": 1.03611112, + "epoch": 0.8611453479633249, + "flos": 19312076401920.0, + "grad_norm": 1.9495478111154043, + "language_loss": 0.55936325, + "learning_rate": 1.9878709630234102e-07, + "loss": 0.58077037, + "num_input_tokens_seen": 308940765, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 14323, + "time_per_iteration": 2.459597587585449 + }, + { + "auxiliary_loss_clip": 0.01098786, + "auxiliary_loss_mlp": 0.01028404, + "balance_loss_clip": 1.016716, + "balance_loss_mlp": 1.03410077, + "epoch": 0.8612054712159928, + "flos": 23253667436160.0, + "grad_norm": 1.5772528129595897, + "language_loss": 0.75499862, + "learning_rate": 1.986178565813801e-07, + "loss": 0.77627051, + "num_input_tokens_seen": 308960110, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14324, + "time_per_iteration": 2.4845757484436035 + }, + { + "auxiliary_loss_clip": 0.01101451, + "auxiliary_loss_mlp": 0.01032685, + "balance_loss_clip": 1.02004886, + "balance_loss_mlp": 1.03591871, + "epoch": 0.8612655944686608, + "flos": 16028440744320.0, + "grad_norm": 2.344168191200211, + "language_loss": 0.66626883, + "learning_rate": 1.9844868516901036e-07, + "loss": 0.68761015, + "num_input_tokens_seen": 308976665, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 14325, + "time_per_iteration": 2.426171064376831 + }, + { + "auxiliary_loss_clip": 0.01101988, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01929832, + "balance_loss_mlp": 1.03482699, + "epoch": 0.8613257177213287, + "flos": 22492720788480.0, + "grad_norm": 1.5550716058490144, + "language_loss": 0.64468634, + "learning_rate": 1.982795820716472e-07, + "loss": 0.6660192, + "num_input_tokens_seen": 308997015, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 14326, + "time_per_iteration": 2.4898109436035156 + }, + { + "auxiliary_loss_clip": 0.01099753, + "auxiliary_loss_mlp": 0.01027672, + "balance_loss_clip": 1.01547134, + "balance_loss_mlp": 1.03399992, + "epoch": 0.8613858409739967, + "flos": 17238056175360.0, + "grad_norm": 1.9452722163445866, + "language_loss": 0.83793277, + "learning_rate": 1.9811054729570253e-07, + "loss": 0.85920697, + "num_input_tokens_seen": 309015250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14327, + "time_per_iteration": 2.468287229537964 + }, + { + "auxiliary_loss_clip": 0.01099492, + "auxiliary_loss_mlp": 0.01027725, + "balance_loss_clip": 1.01605403, + "balance_loss_mlp": 1.03375804, + "epoch": 0.8614459642266646, + "flos": 22821123859200.0, + "grad_norm": 2.187677830204378, + "language_loss": 0.74751425, + "learning_rate": 1.9794158084758661e-07, + "loss": 0.76878637, + "num_input_tokens_seen": 309034140, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14328, + "time_per_iteration": 2.4822041988372803 + }, + { + "auxiliary_loss_clip": 0.01098484, + "auxiliary_loss_mlp": 0.01027826, + "balance_loss_clip": 1.01648343, + "balance_loss_mlp": 1.03398705, + "epoch": 0.8615060874793327, + "flos": 26504301473280.0, + "grad_norm": 2.543685258961499, + "language_loss": 0.80284798, + "learning_rate": 1.9777268273370673e-07, + "loss": 0.8241111, + "num_input_tokens_seen": 309055075, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 14329, + "time_per_iteration": 2.494361400604248 + }, + { + "auxiliary_loss_clip": 0.01098414, + "auxiliary_loss_mlp": 0.01026208, + "balance_loss_clip": 1.0146507, + "balance_loss_mlp": 1.03334963, + "epoch": 0.8615662107320006, + "flos": 24061011477120.0, + "grad_norm": 2.041563581055635, + "language_loss": 0.7741102, + "learning_rate": 1.9760385296046757e-07, + "loss": 0.79535639, + "num_input_tokens_seen": 309074650, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 14330, + "time_per_iteration": 2.4986650943756104 + }, + { + "auxiliary_loss_clip": 0.01099802, + "auxiliary_loss_mlp": 0.01029112, + "balance_loss_clip": 1.01725626, + "balance_loss_mlp": 1.03433192, + "epoch": 0.8616263339846686, + "flos": 24165044242560.0, + "grad_norm": 1.8440018429661427, + "language_loss": 0.64671254, + "learning_rate": 1.974350915342702e-07, + "loss": 0.66800165, + "num_input_tokens_seen": 309094385, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 14331, + "time_per_iteration": 2.4618399143218994 + }, + { + "auxiliary_loss_clip": 0.01097847, + "auxiliary_loss_mlp": 0.01029793, + "balance_loss_clip": 1.01903462, + "balance_loss_mlp": 1.03375757, + "epoch": 0.8616864572373365, + "flos": 21724340025600.0, + "grad_norm": 1.617939309613219, + "language_loss": 0.7562784, + "learning_rate": 1.9726639846151506e-07, + "loss": 0.77755475, + "num_input_tokens_seen": 309111815, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 14332, + "time_per_iteration": 2.4628806114196777 + }, + { + "auxiliary_loss_clip": 0.01102042, + "auxiliary_loss_mlp": 0.01027637, + "balance_loss_clip": 1.0153228, + "balance_loss_mlp": 1.03375912, + "epoch": 0.8617465804900045, + "flos": 23766651521280.0, + "grad_norm": 2.5076238331768623, + "language_loss": 0.67116582, + "learning_rate": 1.9709777374859904e-07, + "loss": 0.69246262, + "num_input_tokens_seen": 309131385, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 14333, + "time_per_iteration": 2.4831488132476807 + }, + { + "auxiliary_loss_clip": 0.01105944, + "auxiliary_loss_mlp": 0.01037056, + "balance_loss_clip": 1.0235616, + "balance_loss_mlp": 1.03613853, + "epoch": 0.8618067037426724, + "flos": 37703941251840.0, + "grad_norm": 1.640443227867146, + "language_loss": 0.62265992, + "learning_rate": 1.969292174019157e-07, + "loss": 0.64408994, + "num_input_tokens_seen": 309155020, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 14334, + "time_per_iteration": 2.6413328647613525 + }, + { + "auxiliary_loss_clip": 0.01104093, + "auxiliary_loss_mlp": 0.01035775, + "balance_loss_clip": 1.02375257, + "balance_loss_mlp": 1.03698754, + "epoch": 0.8618668269953405, + "flos": 21471026336640.0, + "grad_norm": 2.0607406603383387, + "language_loss": 0.69330931, + "learning_rate": 1.967607294278577e-07, + "loss": 0.71470803, + "num_input_tokens_seen": 309172865, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 14335, + "time_per_iteration": 2.488579273223877 + }, + { + "auxiliary_loss_clip": 0.01102454, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.01848936, + "balance_loss_mlp": 1.03566599, + "epoch": 0.8619269502480085, + "flos": 22232691256320.0, + "grad_norm": 1.4915762191068862, + "language_loss": 0.82732737, + "learning_rate": 1.965923098328135e-07, + "loss": 0.84865171, + "num_input_tokens_seen": 309193575, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 14336, + "time_per_iteration": 2.465843677520752 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01033334, + "balance_loss_clip": 1.0215981, + "balance_loss_mlp": 1.03505278, + "epoch": 0.8619870735006764, + "flos": 22710626645760.0, + "grad_norm": 1.9872260251064142, + "language_loss": 0.67640537, + "learning_rate": 1.9642395862316907e-07, + "loss": 0.69777656, + "num_input_tokens_seen": 309212680, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6875, + "step": 14337, + "time_per_iteration": 2.4723286628723145 + }, + { + "auxiliary_loss_clip": 0.01098134, + "auxiliary_loss_mlp": 0.01029072, + "balance_loss_clip": 1.01767564, + "balance_loss_mlp": 1.03330922, + "epoch": 0.8620471967533444, + "flos": 37520293991040.0, + "grad_norm": 1.6812153581439713, + "language_loss": 0.66831827, + "learning_rate": 1.962556758053089e-07, + "loss": 0.68959028, + "num_input_tokens_seen": 309234485, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14338, + "time_per_iteration": 2.7375404834747314 + }, + { + "auxiliary_loss_clip": 0.01101827, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01874149, + "balance_loss_mlp": 1.03572762, + "epoch": 0.8621073200060123, + "flos": 19682459493120.0, + "grad_norm": 1.9113561041240386, + "language_loss": 0.61766338, + "learning_rate": 1.9608746138561448e-07, + "loss": 0.63898253, + "num_input_tokens_seen": 309253630, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 14339, + "time_per_iteration": 2.4824411869049072 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.0103256, + "balance_loss_clip": 1.02059722, + "balance_loss_mlp": 1.0335691, + "epoch": 0.8621674432586803, + "flos": 14536855549440.0, + "grad_norm": 2.224022175139818, + "language_loss": 0.62476075, + "learning_rate": 1.9591931537046458e-07, + "loss": 0.64607006, + "num_input_tokens_seen": 309270950, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 14340, + "time_per_iteration": 2.4229180812835693 + }, + { + "auxiliary_loss_clip": 0.01094892, + "auxiliary_loss_mlp": 0.01022284, + "balance_loss_clip": 1.01154339, + "balance_loss_mlp": 1.034199, + "epoch": 0.8622275665113482, + "flos": 20740100480640.0, + "grad_norm": 6.918273965774928, + "language_loss": 0.80039394, + "learning_rate": 1.9575123776623493e-07, + "loss": 0.82156569, + "num_input_tokens_seen": 309288780, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.60546875, + "step": 14341, + "time_per_iteration": 2.458031177520752 + }, + { + "auxiliary_loss_clip": 0.01097965, + "auxiliary_loss_mlp": 0.01029866, + "balance_loss_clip": 1.01902366, + "balance_loss_mlp": 1.03404796, + "epoch": 0.8622876897640163, + "flos": 24715914197760.0, + "grad_norm": 1.6209236423079696, + "language_loss": 0.74565721, + "learning_rate": 1.9558322857929887e-07, + "loss": 0.76693547, + "num_input_tokens_seen": 309310875, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 14342, + "time_per_iteration": 2.4915459156036377 + }, + { + "auxiliary_loss_clip": 0.01101781, + "auxiliary_loss_mlp": 0.01026411, + "balance_loss_clip": 1.01432872, + "balance_loss_mlp": 1.03483212, + "epoch": 0.8623478130166842, + "flos": 17457362663040.0, + "grad_norm": 1.9153066937981833, + "language_loss": 0.68352622, + "learning_rate": 1.95415287816028e-07, + "loss": 0.70480812, + "num_input_tokens_seen": 309329900, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 14343, + "time_per_iteration": 2.4379770755767822 + }, + { + "auxiliary_loss_clip": 0.01099898, + "auxiliary_loss_mlp": 0.01039302, + "balance_loss_clip": 1.0267911, + "balance_loss_mlp": 1.03367257, + "epoch": 0.8624079362693522, + "flos": 18109176814080.0, + "grad_norm": 2.013946395887745, + "language_loss": 0.67857057, + "learning_rate": 1.9524741548278967e-07, + "loss": 0.69996256, + "num_input_tokens_seen": 309347870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 14344, + "time_per_iteration": 2.4314322471618652 + }, + { + "auxiliary_loss_clip": 0.01101335, + "auxiliary_loss_mlp": 0.01033381, + "balance_loss_clip": 1.02183521, + "balance_loss_mlp": 1.03399456, + "epoch": 0.8624680595220201, + "flos": 30666455971200.0, + "grad_norm": 1.3859624698188922, + "language_loss": 0.81348252, + "learning_rate": 1.9507961158595054e-07, + "loss": 0.83482969, + "num_input_tokens_seen": 309371695, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 14345, + "time_per_iteration": 4.005348205566406 + }, + { + "auxiliary_loss_clip": 0.01102538, + "auxiliary_loss_mlp": 0.01030595, + "balance_loss_clip": 1.01834607, + "balance_loss_mlp": 1.03569663, + "epoch": 0.8625281827746881, + "flos": 37998588516480.0, + "grad_norm": 2.749407337350185, + "language_loss": 0.50631642, + "learning_rate": 1.9491187613187355e-07, + "loss": 0.52764773, + "num_input_tokens_seen": 309394645, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 14346, + "time_per_iteration": 2.592672824859619 + }, + { + "auxiliary_loss_clip": 0.01098677, + "auxiliary_loss_mlp": 0.01030095, + "balance_loss_clip": 1.01834738, + "balance_loss_mlp": 1.0334121, + "epoch": 0.862588306027356, + "flos": 26249730808320.0, + "grad_norm": 1.492868040910136, + "language_loss": 0.75041229, + "learning_rate": 1.9474420912691913e-07, + "loss": 0.77169997, + "num_input_tokens_seen": 309413170, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14347, + "time_per_iteration": 3.972740888595581 + }, + { + "auxiliary_loss_clip": 0.01101781, + "auxiliary_loss_mlp": 0.01027989, + "balance_loss_clip": 1.01587152, + "balance_loss_mlp": 1.03619015, + "epoch": 0.862648429280024, + "flos": 25878809013120.0, + "grad_norm": 1.9072390574317508, + "language_loss": 0.80890203, + "learning_rate": 1.945766105774449e-07, + "loss": 0.83019972, + "num_input_tokens_seen": 309431315, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 14348, + "time_per_iteration": 2.496711015701294 + }, + { + "auxiliary_loss_clip": 0.01095235, + "auxiliary_loss_mlp": 0.01026238, + "balance_loss_clip": 1.01494908, + "balance_loss_mlp": 1.03236437, + "epoch": 0.862708552532692, + "flos": 37816413713280.0, + "grad_norm": 1.6607225101041632, + "language_loss": 0.66021013, + "learning_rate": 1.9440908048980665e-07, + "loss": 0.68142486, + "num_input_tokens_seen": 309453020, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 14349, + "time_per_iteration": 2.5997297763824463 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01037723, + "balance_loss_clip": 1.02595711, + "balance_loss_mlp": 1.03370428, + "epoch": 0.86276867578536, + "flos": 19091800247040.0, + "grad_norm": 2.2216960687680865, + "language_loss": 0.7004388, + "learning_rate": 1.942416188703573e-07, + "loss": 0.72180569, + "num_input_tokens_seen": 309469780, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 14350, + "time_per_iteration": 2.4430317878723145 + }, + { + "auxiliary_loss_clip": 0.01098606, + "auxiliary_loss_mlp": 0.01029753, + "balance_loss_clip": 1.01812983, + "balance_loss_mlp": 1.033795, + "epoch": 0.862828799038028, + "flos": 22164281804160.0, + "grad_norm": 1.7403260075665494, + "language_loss": 0.77165568, + "learning_rate": 1.9407422572544618e-07, + "loss": 0.79293925, + "num_input_tokens_seen": 309489610, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 14351, + "time_per_iteration": 3.8987040519714355 + }, + { + "auxiliary_loss_clip": 0.01099378, + "auxiliary_loss_mlp": 0.01031637, + "balance_loss_clip": 1.02070606, + "balance_loss_mlp": 1.0340271, + "epoch": 0.8628889222906959, + "flos": 23145576433920.0, + "grad_norm": 1.7490870392415556, + "language_loss": 0.8492626, + "learning_rate": 1.9390690106142204e-07, + "loss": 0.87057269, + "num_input_tokens_seen": 309508295, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14352, + "time_per_iteration": 2.4467623233795166 + }, + { + "auxiliary_loss_clip": 0.01022274, + "auxiliary_loss_mlp": 0.00997547, + "balance_loss_clip": 0.99648613, + "balance_loss_mlp": 1.00222087, + "epoch": 0.8629490455433639, + "flos": 57817762151040.0, + "grad_norm": 0.8026714354601334, + "language_loss": 0.61920941, + "learning_rate": 1.9373964488462913e-07, + "loss": 0.63940763, + "num_input_tokens_seen": 309567960, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14353, + "time_per_iteration": 3.0518198013305664 + }, + { + "auxiliary_loss_clip": 0.01100741, + "auxiliary_loss_mlp": 0.01028388, + "balance_loss_clip": 1.01759958, + "balance_loss_mlp": 1.03660202, + "epoch": 0.8630091687960318, + "flos": 15919667383680.0, + "grad_norm": 1.6257849908762414, + "language_loss": 0.82292426, + "learning_rate": 1.9357245720140948e-07, + "loss": 0.84421557, + "num_input_tokens_seen": 309586050, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14354, + "time_per_iteration": 2.454332113265991 + }, + { + "auxiliary_loss_clip": 0.01098964, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01815522, + "balance_loss_mlp": 1.03391898, + "epoch": 0.8630692920486999, + "flos": 17961691570560.0, + "grad_norm": 2.066989456168094, + "language_loss": 0.85694742, + "learning_rate": 1.934053380181031e-07, + "loss": 0.87823325, + "num_input_tokens_seen": 309602910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14355, + "time_per_iteration": 2.4151952266693115 + }, + { + "auxiliary_loss_clip": 0.01100727, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.01879287, + "balance_loss_mlp": 1.03383946, + "epoch": 0.8631294153013678, + "flos": 22455158140800.0, + "grad_norm": 3.5101420065502404, + "language_loss": 0.58819818, + "learning_rate": 1.9323828734104763e-07, + "loss": 0.6095221, + "num_input_tokens_seen": 309621175, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 14356, + "time_per_iteration": 2.435149908065796 + }, + { + "auxiliary_loss_clip": 0.0110249, + "auxiliary_loss_mlp": 0.01032276, + "balance_loss_clip": 1.01964545, + "balance_loss_mlp": 1.0345161, + "epoch": 0.8631895385540358, + "flos": 16837005847680.0, + "grad_norm": 1.6248412634448182, + "language_loss": 0.76978958, + "learning_rate": 1.9307130517657756e-07, + "loss": 0.79113722, + "num_input_tokens_seen": 309639395, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14357, + "time_per_iteration": 2.3934738636016846 + }, + { + "auxiliary_loss_clip": 0.01101033, + "auxiliary_loss_mlp": 0.01031615, + "balance_loss_clip": 1.01962256, + "balance_loss_mlp": 1.03499317, + "epoch": 0.8632496618067037, + "flos": 18697214367360.0, + "grad_norm": 2.3756415536738897, + "language_loss": 0.77549875, + "learning_rate": 1.9290439153102468e-07, + "loss": 0.79682523, + "num_input_tokens_seen": 309657265, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 14358, + "time_per_iteration": 2.4436261653900146 + }, + { + "auxiliary_loss_clip": 0.01100136, + "auxiliary_loss_mlp": 0.01027101, + "balance_loss_clip": 1.01543641, + "balance_loss_mlp": 1.03392935, + "epoch": 0.8633097850593717, + "flos": 24279922915200.0, + "grad_norm": 1.7651797926248376, + "language_loss": 0.7522471, + "learning_rate": 1.9273754641071816e-07, + "loss": 0.77351952, + "num_input_tokens_seen": 309678610, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14359, + "time_per_iteration": 2.4768805503845215 + }, + { + "auxiliary_loss_clip": 0.01094416, + "auxiliary_loss_mlp": 0.01025369, + "balance_loss_clip": 1.0139606, + "balance_loss_mlp": 1.0316956, + "epoch": 0.8633699083120396, + "flos": 21178569801600.0, + "grad_norm": 1.7556957611737163, + "language_loss": 0.70558703, + "learning_rate": 1.9257076982198517e-07, + "loss": 0.72678494, + "num_input_tokens_seen": 309697710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.625, + "step": 14360, + "time_per_iteration": 2.4633822441101074 + }, + { + "auxiliary_loss_clip": 0.01104009, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.01929903, + "balance_loss_mlp": 1.0365603, + "epoch": 0.8634300315647077, + "flos": 19244888012160.0, + "grad_norm": 1.7204964341930526, + "language_loss": 0.76154602, + "learning_rate": 1.9240406177114953e-07, + "loss": 0.78290761, + "num_input_tokens_seen": 309715985, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 14361, + "time_per_iteration": 2.457219123840332 + }, + { + "auxiliary_loss_clip": 0.01022616, + "auxiliary_loss_mlp": 0.00998161, + "balance_loss_clip": 0.99707615, + "balance_loss_mlp": 1.00261354, + "epoch": 0.8634901548173756, + "flos": 66195648282240.0, + "grad_norm": 0.9659533407712392, + "language_loss": 0.58873498, + "learning_rate": 1.922374222645329e-07, + "loss": 0.60894275, + "num_input_tokens_seen": 309779930, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.20019531, + "step": 14362, + "time_per_iteration": 3.0631728172302246 + }, + { + "auxiliary_loss_clip": 0.01101996, + "auxiliary_loss_mlp": 0.0103188, + "balance_loss_clip": 1.01902294, + "balance_loss_mlp": 1.03466797, + "epoch": 0.8635502780700436, + "flos": 24789531121920.0, + "grad_norm": 2.5080162128467394, + "language_loss": 0.8062591, + "learning_rate": 1.9207085130845524e-07, + "loss": 0.82759786, + "num_input_tokens_seen": 309800580, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 14363, + "time_per_iteration": 2.5122597217559814 + }, + { + "auxiliary_loss_clip": 0.01101414, + "auxiliary_loss_mlp": 0.01035658, + "balance_loss_clip": 1.02290881, + "balance_loss_mlp": 1.03377891, + "epoch": 0.8636104013227116, + "flos": 25189970918400.0, + "grad_norm": 2.5438276077077013, + "language_loss": 0.72507155, + "learning_rate": 1.9190434890923112e-07, + "loss": 0.74644232, + "num_input_tokens_seen": 309821725, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14364, + "time_per_iteration": 2.4990742206573486 + }, + { + "auxiliary_loss_clip": 0.01101322, + "auxiliary_loss_mlp": 0.01032806, + "balance_loss_clip": 1.02173114, + "balance_loss_mlp": 1.03382576, + "epoch": 0.8636705245753795, + "flos": 23878441624320.0, + "grad_norm": 1.5907362446785047, + "language_loss": 0.71736836, + "learning_rate": 1.917379150731755e-07, + "loss": 0.73870963, + "num_input_tokens_seen": 309841565, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 14365, + "time_per_iteration": 2.5119576454162598 + }, + { + "auxiliary_loss_clip": 0.01104136, + "auxiliary_loss_mlp": 0.01037185, + "balance_loss_clip": 1.0247215, + "balance_loss_mlp": 1.03606164, + "epoch": 0.8637306478280475, + "flos": 23110455911040.0, + "grad_norm": 2.5037691791086902, + "language_loss": 0.70827854, + "learning_rate": 1.915715498065993e-07, + "loss": 0.72969174, + "num_input_tokens_seen": 309858635, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6796875, + "step": 14366, + "time_per_iteration": 2.442091464996338 + }, + { + "auxiliary_loss_clip": 0.01098479, + "auxiliary_loss_mlp": 0.01026254, + "balance_loss_clip": 1.01578689, + "balance_loss_mlp": 1.03511524, + "epoch": 0.8637907710807154, + "flos": 21906802137600.0, + "grad_norm": 1.5905343541325248, + "language_loss": 0.81950366, + "learning_rate": 1.9140525311581146e-07, + "loss": 0.84075105, + "num_input_tokens_seen": 309877885, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 14367, + "time_per_iteration": 2.4821760654449463 + }, + { + "auxiliary_loss_clip": 0.01102272, + "auxiliary_loss_mlp": 0.01027729, + "balance_loss_clip": 1.01515222, + "balance_loss_mlp": 1.03513527, + "epoch": 0.8638508943333835, + "flos": 23580526222080.0, + "grad_norm": 1.9267157021688266, + "language_loss": 0.61380374, + "learning_rate": 1.9123902500711743e-07, + "loss": 0.63510376, + "num_input_tokens_seen": 309893140, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14368, + "time_per_iteration": 2.4707953929901123 + }, + { + "auxiliary_loss_clip": 0.01100887, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.01909471, + "balance_loss_mlp": 1.03561449, + "epoch": 0.8639110175860514, + "flos": 25775853655680.0, + "grad_norm": 1.945590600384619, + "language_loss": 0.76329541, + "learning_rate": 1.91072865486821e-07, + "loss": 0.78461134, + "num_input_tokens_seen": 309914175, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14369, + "time_per_iteration": 2.5276546478271484 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01031574, + "balance_loss_clip": 1.01921809, + "balance_loss_mlp": 1.03340197, + "epoch": 0.8639711408387194, + "flos": 23369443948800.0, + "grad_norm": 1.737706503573944, + "language_loss": 0.6419906, + "learning_rate": 1.9090677456122294e-07, + "loss": 0.66331857, + "num_input_tokens_seen": 309932395, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 14370, + "time_per_iteration": 2.452861785888672 + }, + { + "auxiliary_loss_clip": 0.0110173, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.02055156, + "balance_loss_mlp": 1.03612161, + "epoch": 0.8640312640913873, + "flos": 22127221946880.0, + "grad_norm": 1.5489158214730672, + "language_loss": 0.6619693, + "learning_rate": 1.907407522366209e-07, + "loss": 0.68330884, + "num_input_tokens_seen": 309951720, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14371, + "time_per_iteration": 2.454864263534546 + }, + { + "auxiliary_loss_clip": 0.01021913, + "auxiliary_loss_mlp": 0.00998207, + "balance_loss_clip": 0.99720526, + "balance_loss_mlp": 1.00191593, + "epoch": 0.8640913873440553, + "flos": 57571735944960.0, + "grad_norm": 0.8716085073446712, + "language_loss": 0.56875324, + "learning_rate": 1.905747985193107e-07, + "loss": 0.58895445, + "num_input_tokens_seen": 310006120, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 14372, + "time_per_iteration": 2.9307870864868164 + }, + { + "auxiliary_loss_clip": 0.01098629, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01812363, + "balance_loss_mlp": 1.03571773, + "epoch": 0.8641515105967232, + "flos": 23987430466560.0, + "grad_norm": 1.7171736097417043, + "language_loss": 0.79384911, + "learning_rate": 1.9040891341558597e-07, + "loss": 0.81513715, + "num_input_tokens_seen": 310026740, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.62890625, + "step": 14373, + "time_per_iteration": 2.493739604949951 + }, + { + "auxiliary_loss_clip": 0.01100672, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01599669, + "balance_loss_mlp": 1.03470039, + "epoch": 0.8642116338493913, + "flos": 19062749122560.0, + "grad_norm": 1.7213637290522288, + "language_loss": 0.63829684, + "learning_rate": 1.9024309693173656e-07, + "loss": 0.65957886, + "num_input_tokens_seen": 310044135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 14374, + "time_per_iteration": 2.42464017868042 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.0102942, + "balance_loss_clip": 1.01809502, + "balance_loss_mlp": 1.03603303, + "epoch": 0.8642717571020592, + "flos": 18254148105600.0, + "grad_norm": 1.9472850722016972, + "language_loss": 0.77497673, + "learning_rate": 1.9007734907404993e-07, + "loss": 0.79627156, + "num_input_tokens_seen": 310061560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14375, + "time_per_iteration": 2.4481828212738037 + }, + { + "auxiliary_loss_clip": 0.0109996, + "auxiliary_loss_mlp": 0.01032779, + "balance_loss_clip": 1.02089977, + "balance_loss_mlp": 1.03409755, + "epoch": 0.8643318803547272, + "flos": 57663270777600.0, + "grad_norm": 1.5689694423673801, + "language_loss": 0.60686284, + "learning_rate": 1.899116698488117e-07, + "loss": 0.62819022, + "num_input_tokens_seen": 310087310, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 14376, + "time_per_iteration": 2.776718854904175 + }, + { + "auxiliary_loss_clip": 0.01097662, + "auxiliary_loss_mlp": 0.01032505, + "balance_loss_clip": 1.0215137, + "balance_loss_mlp": 1.03338146, + "epoch": 0.8643920036073952, + "flos": 19609524927360.0, + "grad_norm": 1.6233773898325343, + "language_loss": 0.66225243, + "learning_rate": 1.8974605926230457e-07, + "loss": 0.68355405, + "num_input_tokens_seen": 310106260, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 14377, + "time_per_iteration": 2.4549734592437744 + }, + { + "auxiliary_loss_clip": 0.01100162, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.02024806, + "balance_loss_mlp": 1.03251028, + "epoch": 0.8644521268600631, + "flos": 20850346298880.0, + "grad_norm": 1.716134367829843, + "language_loss": 0.70389247, + "learning_rate": 1.8958051732080804e-07, + "loss": 0.72521555, + "num_input_tokens_seen": 310125305, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 14378, + "time_per_iteration": 2.4362454414367676 + }, + { + "auxiliary_loss_clip": 0.01022402, + "auxiliary_loss_mlp": 0.00998397, + "balance_loss_clip": 0.99738961, + "balance_loss_mlp": 1.00238144, + "epoch": 0.8645122501127311, + "flos": 66719550101760.0, + "grad_norm": 0.8057206784790626, + "language_loss": 0.60312063, + "learning_rate": 1.894150440305995e-07, + "loss": 0.62332863, + "num_input_tokens_seen": 310189270, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20019531, + "step": 14379, + "time_per_iteration": 3.062391757965088 + }, + { + "auxiliary_loss_clip": 0.01097844, + "auxiliary_loss_mlp": 0.01029334, + "balance_loss_clip": 1.0178901, + "balance_loss_mlp": 1.03340411, + "epoch": 0.864572373365399, + "flos": 21690009601920.0, + "grad_norm": 1.4504200122603432, + "language_loss": 0.74308336, + "learning_rate": 1.8924963939795478e-07, + "loss": 0.76435512, + "num_input_tokens_seen": 310208395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14380, + "time_per_iteration": 2.4468491077423096 + }, + { + "auxiliary_loss_clip": 0.01101324, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01766789, + "balance_loss_mlp": 1.03408456, + "epoch": 0.8646324966180671, + "flos": 20266402896000.0, + "grad_norm": 2.1859120044364206, + "language_loss": 0.74855471, + "learning_rate": 1.8908430342914473e-07, + "loss": 0.76986253, + "num_input_tokens_seen": 310227415, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 14381, + "time_per_iteration": 2.4468581676483154 + }, + { + "auxiliary_loss_clip": 0.01098259, + "auxiliary_loss_mlp": 0.01034372, + "balance_loss_clip": 1.02303529, + "balance_loss_mlp": 1.03415072, + "epoch": 0.864692619870735, + "flos": 11946188050560.0, + "grad_norm": 2.3474051146654538, + "language_loss": 0.84216976, + "learning_rate": 1.8891903613043892e-07, + "loss": 0.86349607, + "num_input_tokens_seen": 310242625, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14382, + "time_per_iteration": 2.4011387825012207 + }, + { + "auxiliary_loss_clip": 0.01101348, + "auxiliary_loss_mlp": 0.0103088, + "balance_loss_clip": 1.01876235, + "balance_loss_mlp": 1.03522801, + "epoch": 0.864752743123403, + "flos": 21470703114240.0, + "grad_norm": 1.8738363294425433, + "language_loss": 0.75711656, + "learning_rate": 1.8875383750810504e-07, + "loss": 0.77843881, + "num_input_tokens_seen": 310260585, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 14383, + "time_per_iteration": 2.45684814453125 + }, + { + "auxiliary_loss_clip": 0.0110184, + "auxiliary_loss_mlp": 0.01030116, + "balance_loss_clip": 1.01837349, + "balance_loss_mlp": 1.03689098, + "epoch": 0.8648128663760709, + "flos": 19530018172800.0, + "grad_norm": 2.9063267883434047, + "language_loss": 0.84982598, + "learning_rate": 1.8858870756840738e-07, + "loss": 0.87114561, + "num_input_tokens_seen": 310277210, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 14384, + "time_per_iteration": 2.4381446838378906 + }, + { + "auxiliary_loss_clip": 0.01096025, + "auxiliary_loss_mlp": 0.01028139, + "balance_loss_clip": 1.01730251, + "balance_loss_mlp": 1.03212559, + "epoch": 0.8648729896287389, + "flos": 21287953693440.0, + "grad_norm": 1.678692131551695, + "language_loss": 0.8082968, + "learning_rate": 1.884236463176072e-07, + "loss": 0.82953846, + "num_input_tokens_seen": 310296610, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 14385, + "time_per_iteration": 2.4595248699188232 + }, + { + "auxiliary_loss_clip": 0.01104363, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01813602, + "balance_loss_mlp": 1.03674197, + "epoch": 0.8649331128814068, + "flos": 24604483230720.0, + "grad_norm": 2.080462579763744, + "language_loss": 0.7260626, + "learning_rate": 1.8825865376196437e-07, + "loss": 0.74740726, + "num_input_tokens_seen": 310316830, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.67578125, + "step": 14386, + "time_per_iteration": 2.464761972427368 + }, + { + "auxiliary_loss_clip": 0.01100332, + "auxiliary_loss_mlp": 0.0103323, + "balance_loss_clip": 1.02241206, + "balance_loss_mlp": 1.03550792, + "epoch": 0.8649932361340749, + "flos": 15377811742080.0, + "grad_norm": 2.2876758841906026, + "language_loss": 0.82462382, + "learning_rate": 1.8809372990773476e-07, + "loss": 0.84595942, + "num_input_tokens_seen": 310334355, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 14387, + "time_per_iteration": 3.922278642654419 + }, + { + "auxiliary_loss_clip": 0.01097667, + "auxiliary_loss_mlp": 0.01026771, + "balance_loss_clip": 1.01570868, + "balance_loss_mlp": 1.03452444, + "epoch": 0.8650533593867428, + "flos": 19901227276800.0, + "grad_norm": 1.9528284009807142, + "language_loss": 0.68743157, + "learning_rate": 1.8792887476117224e-07, + "loss": 0.70867598, + "num_input_tokens_seen": 310352900, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 14388, + "time_per_iteration": 5.270868301391602 + }, + { + "auxiliary_loss_clip": 0.01097314, + "auxiliary_loss_mlp": 0.01033896, + "balance_loss_clip": 1.02323222, + "balance_loss_mlp": 1.03490067, + "epoch": 0.8651134826394108, + "flos": 25626931868160.0, + "grad_norm": 1.6968271505710826, + "language_loss": 0.90530205, + "learning_rate": 1.877640883285283e-07, + "loss": 0.92661411, + "num_input_tokens_seen": 310372855, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 14389, + "time_per_iteration": 2.5115513801574707 + }, + { + "auxiliary_loss_clip": 0.01097489, + "auxiliary_loss_mlp": 0.01028543, + "balance_loss_clip": 1.01771283, + "balance_loss_mlp": 1.03391635, + "epoch": 0.8651736058920788, + "flos": 18734525619840.0, + "grad_norm": 1.4619178104484627, + "language_loss": 0.70866364, + "learning_rate": 1.8759937061605212e-07, + "loss": 0.72992396, + "num_input_tokens_seen": 310391595, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 14390, + "time_per_iteration": 2.5057373046875 + }, + { + "auxiliary_loss_clip": 0.01102072, + "auxiliary_loss_mlp": 0.01033833, + "balance_loss_clip": 1.02169168, + "balance_loss_mlp": 1.03451252, + "epoch": 0.8652337291447467, + "flos": 20776765288320.0, + "grad_norm": 1.7005812844165624, + "language_loss": 0.81973195, + "learning_rate": 1.8743472162998941e-07, + "loss": 0.84109104, + "num_input_tokens_seen": 310410090, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 14391, + "time_per_iteration": 2.4508116245269775 + }, + { + "auxiliary_loss_clip": 0.01022254, + "auxiliary_loss_mlp": 0.01000548, + "balance_loss_clip": 0.99952883, + "balance_loss_mlp": 1.00220108, + "epoch": 0.8652938523974147, + "flos": 64227887464320.0, + "grad_norm": 0.793527060984129, + "language_loss": 0.68029255, + "learning_rate": 1.8727014137658337e-07, + "loss": 0.70052058, + "num_input_tokens_seen": 310470055, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20019531, + "step": 14392, + "time_per_iteration": 4.455903053283691 + }, + { + "auxiliary_loss_clip": 0.01104699, + "auxiliary_loss_mlp": 0.01029502, + "balance_loss_clip": 1.01706886, + "balance_loss_mlp": 1.03512073, + "epoch": 0.8653539756500827, + "flos": 18040587793920.0, + "grad_norm": 2.799808785659983, + "language_loss": 0.75864339, + "learning_rate": 1.8710562986207523e-07, + "loss": 0.77998543, + "num_input_tokens_seen": 310487665, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 14393, + "time_per_iteration": 2.4210777282714844 + }, + { + "auxiliary_loss_clip": 0.0110015, + "auxiliary_loss_mlp": 0.01030711, + "balance_loss_clip": 1.01872993, + "balance_loss_mlp": 1.03299379, + "epoch": 0.8654140989027507, + "flos": 17382416935680.0, + "grad_norm": 2.253323172073062, + "language_loss": 0.73935288, + "learning_rate": 1.8694118709270357e-07, + "loss": 0.76066148, + "num_input_tokens_seen": 310506130, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 14394, + "time_per_iteration": 2.4389302730560303 + }, + { + "auxiliary_loss_clip": 0.01101599, + "auxiliary_loss_mlp": 0.01027809, + "balance_loss_clip": 1.01563203, + "balance_loss_mlp": 1.03460002, + "epoch": 0.8654742221554186, + "flos": 53284862448000.0, + "grad_norm": 1.9628002470844093, + "language_loss": 0.65009511, + "learning_rate": 1.867768130747036e-07, + "loss": 0.6713891, + "num_input_tokens_seen": 310532445, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 14395, + "time_per_iteration": 2.7287087440490723 + }, + { + "auxiliary_loss_clip": 0.01100411, + "auxiliary_loss_mlp": 0.01034276, + "balance_loss_clip": 1.02282023, + "balance_loss_mlp": 1.03583765, + "epoch": 0.8655343454080866, + "flos": 23914711382400.0, + "grad_norm": 1.6931560876966212, + "language_loss": 0.67837584, + "learning_rate": 1.8661250781430838e-07, + "loss": 0.69972277, + "num_input_tokens_seen": 310552300, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 14396, + "time_per_iteration": 2.4718122482299805 + }, + { + "auxiliary_loss_clip": 0.01104393, + "auxiliary_loss_mlp": 0.01034049, + "balance_loss_clip": 1.02227736, + "balance_loss_mlp": 1.03622818, + "epoch": 0.8655944686607545, + "flos": 24097209408000.0, + "grad_norm": 2.2875243864017256, + "language_loss": 0.69540256, + "learning_rate": 1.8644827131774954e-07, + "loss": 0.71678698, + "num_input_tokens_seen": 310572710, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 14397, + "time_per_iteration": 2.487607479095459 + }, + { + "auxiliary_loss_clip": 0.01098278, + "auxiliary_loss_mlp": 0.01027254, + "balance_loss_clip": 1.01615524, + "balance_loss_mlp": 1.03279233, + "epoch": 0.8656545919134225, + "flos": 23112718467840.0, + "grad_norm": 2.9815102788946666, + "language_loss": 0.63472062, + "learning_rate": 1.86284103591253e-07, + "loss": 0.65597594, + "num_input_tokens_seen": 310592460, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 14398, + "time_per_iteration": 2.527153491973877 + }, + { + "auxiliary_loss_clip": 0.01100558, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.02000332, + "balance_loss_mlp": 1.03505087, + "epoch": 0.8657147151660904, + "flos": 21141761339520.0, + "grad_norm": 1.8815607100510332, + "language_loss": 0.76581329, + "learning_rate": 1.8612000464104517e-07, + "loss": 0.78713018, + "num_input_tokens_seen": 310609375, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14399, + "time_per_iteration": 2.460524559020996 + }, + { + "auxiliary_loss_clip": 0.01098284, + "auxiliary_loss_mlp": 0.01027999, + "balance_loss_clip": 1.01721668, + "balance_loss_mlp": 1.03396773, + "epoch": 0.8657748384187585, + "flos": 16289439943680.0, + "grad_norm": 1.9369254428150626, + "language_loss": 0.9345935, + "learning_rate": 1.8595597447334855e-07, + "loss": 0.95585632, + "num_input_tokens_seen": 310627405, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14400, + "time_per_iteration": 2.4347453117370605 + }, + { + "auxiliary_loss_clip": 0.01101332, + "auxiliary_loss_mlp": 0.01030649, + "balance_loss_clip": 1.01947284, + "balance_loss_mlp": 1.03524673, + "epoch": 0.8658349616714264, + "flos": 30843890179200.0, + "grad_norm": 1.8338281378598142, + "language_loss": 0.67362767, + "learning_rate": 1.8579201309438353e-07, + "loss": 0.69494748, + "num_input_tokens_seen": 310649945, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14401, + "time_per_iteration": 2.535815715789795 + }, + { + "auxiliary_loss_clip": 0.01100666, + "auxiliary_loss_mlp": 0.01029464, + "balance_loss_clip": 1.01766849, + "balance_loss_mlp": 1.03385806, + "epoch": 0.8658950849240944, + "flos": 18952862440320.0, + "grad_norm": 1.9734469645233848, + "language_loss": 0.73550159, + "learning_rate": 1.8562812051036714e-07, + "loss": 0.75680286, + "num_input_tokens_seen": 310668285, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 14402, + "time_per_iteration": 2.464496612548828 + }, + { + "auxiliary_loss_clip": 0.01098479, + "auxiliary_loss_mlp": 0.01030801, + "balance_loss_clip": 1.01988101, + "balance_loss_mlp": 1.0342344, + "epoch": 0.8659552081767624, + "flos": 23364344217600.0, + "grad_norm": 1.6389653471656214, + "language_loss": 0.74639928, + "learning_rate": 1.8546429672751397e-07, + "loss": 0.76769209, + "num_input_tokens_seen": 310687015, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14403, + "time_per_iteration": 2.481019973754883 + }, + { + "auxiliary_loss_clip": 0.01101184, + "auxiliary_loss_mlp": 0.0103124, + "balance_loss_clip": 1.01884234, + "balance_loss_mlp": 1.03445876, + "epoch": 0.8660153314294303, + "flos": 23841992298240.0, + "grad_norm": 2.071645757031755, + "language_loss": 0.72956061, + "learning_rate": 1.853005417520368e-07, + "loss": 0.75088489, + "num_input_tokens_seen": 310707580, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 14404, + "time_per_iteration": 2.5011138916015625 + }, + { + "auxiliary_loss_clip": 0.01099247, + "auxiliary_loss_mlp": 0.01031077, + "balance_loss_clip": 1.01935267, + "balance_loss_mlp": 1.03516841, + "epoch": 0.8660754546820983, + "flos": 23112467072640.0, + "grad_norm": 1.7013566949514276, + "language_loss": 0.7065661, + "learning_rate": 1.851368555901447e-07, + "loss": 0.72786927, + "num_input_tokens_seen": 310727300, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 14405, + "time_per_iteration": 2.4559895992279053 + }, + { + "auxiliary_loss_clip": 0.01101166, + "auxiliary_loss_mlp": 0.01031658, + "balance_loss_clip": 1.01983237, + "balance_loss_mlp": 1.0338521, + "epoch": 0.8661355779347663, + "flos": 14391991998720.0, + "grad_norm": 1.7861200215864865, + "language_loss": 0.66381979, + "learning_rate": 1.8497323824804467e-07, + "loss": 0.685148, + "num_input_tokens_seen": 310744935, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 14406, + "time_per_iteration": 2.409515142440796 + }, + { + "auxiliary_loss_clip": 0.0109867, + "auxiliary_loss_mlp": 0.01025585, + "balance_loss_clip": 1.01523805, + "balance_loss_mlp": 1.03394818, + "epoch": 0.8661957011874343, + "flos": 21870137329920.0, + "grad_norm": 1.5358857043199197, + "language_loss": 0.83051056, + "learning_rate": 1.8480968973194177e-07, + "loss": 0.85175312, + "num_input_tokens_seen": 310765085, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6484375, + "step": 14407, + "time_per_iteration": 2.4647037982940674 + }, + { + "auxiliary_loss_clip": 0.01099601, + "auxiliary_loss_mlp": 0.01036617, + "balance_loss_clip": 1.02496433, + "balance_loss_mlp": 1.03503644, + "epoch": 0.8662558244401022, + "flos": 21835160461440.0, + "grad_norm": 1.6444953555030566, + "language_loss": 0.69656581, + "learning_rate": 1.8464621004803748e-07, + "loss": 0.71792799, + "num_input_tokens_seen": 310783260, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 14408, + "time_per_iteration": 2.472282648086548 + }, + { + "auxiliary_loss_clip": 0.01095121, + "auxiliary_loss_mlp": 0.01029858, + "balance_loss_clip": 1.01912928, + "balance_loss_mlp": 1.0326798, + "epoch": 0.8663159476927702, + "flos": 17384104874880.0, + "grad_norm": 1.783525972516364, + "language_loss": 0.77200353, + "learning_rate": 1.844827992025304e-07, + "loss": 0.7932533, + "num_input_tokens_seen": 310801970, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 14409, + "time_per_iteration": 2.4336869716644287 + }, + { + "auxiliary_loss_clip": 0.01104531, + "auxiliary_loss_mlp": 0.01029768, + "balance_loss_clip": 1.0170958, + "balance_loss_mlp": 1.03682649, + "epoch": 0.8663760709454381, + "flos": 22747722416640.0, + "grad_norm": 2.112427414775802, + "language_loss": 0.77122021, + "learning_rate": 1.8431945720161757e-07, + "loss": 0.7925632, + "num_input_tokens_seen": 310822070, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 14410, + "time_per_iteration": 2.4947590827941895 + }, + { + "auxiliary_loss_clip": 0.01101131, + "auxiliary_loss_mlp": 0.01030563, + "balance_loss_clip": 1.01899338, + "balance_loss_mlp": 1.0351249, + "epoch": 0.8664361941981061, + "flos": 17376850327680.0, + "grad_norm": 1.8818128477280998, + "language_loss": 0.7770704, + "learning_rate": 1.8415618405149315e-07, + "loss": 0.79838735, + "num_input_tokens_seen": 310838355, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 14411, + "time_per_iteration": 2.4171528816223145 + }, + { + "auxiliary_loss_clip": 0.0109563, + "auxiliary_loss_mlp": 0.01032813, + "balance_loss_clip": 1.02179205, + "balance_loss_mlp": 1.03105211, + "epoch": 0.866496317450774, + "flos": 16034438315520.0, + "grad_norm": 1.9843804705658286, + "language_loss": 0.73648727, + "learning_rate": 1.8399297975834794e-07, + "loss": 0.75777173, + "num_input_tokens_seen": 310856055, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14412, + "time_per_iteration": 2.409283399581909 + }, + { + "auxiliary_loss_clip": 0.01097266, + "auxiliary_loss_mlp": 0.01026079, + "balance_loss_clip": 1.01570809, + "balance_loss_mlp": 1.03446996, + "epoch": 0.8665564407034421, + "flos": 20814830726400.0, + "grad_norm": 1.9946534304785197, + "language_loss": 0.69103146, + "learning_rate": 1.83829844328371e-07, + "loss": 0.7122649, + "num_input_tokens_seen": 310876695, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 14413, + "time_per_iteration": 2.45535945892334 + }, + { + "auxiliary_loss_clip": 0.01100064, + "auxiliary_loss_mlp": 0.01028753, + "balance_loss_clip": 1.01704669, + "balance_loss_mlp": 1.03485107, + "epoch": 0.86661656395611, + "flos": 15815167741440.0, + "grad_norm": 2.124342813127657, + "language_loss": 0.63079798, + "learning_rate": 1.8366677776774874e-07, + "loss": 0.65208614, + "num_input_tokens_seen": 310893880, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 14414, + "time_per_iteration": 2.426166534423828 + }, + { + "auxiliary_loss_clip": 0.01100431, + "auxiliary_loss_mlp": 0.01029476, + "balance_loss_clip": 1.01786518, + "balance_loss_mlp": 1.03504372, + "epoch": 0.866676687208778, + "flos": 23036910814080.0, + "grad_norm": 5.094417557754872, + "language_loss": 0.64098227, + "learning_rate": 1.8350378008266377e-07, + "loss": 0.66228133, + "num_input_tokens_seen": 310914145, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14415, + "time_per_iteration": 2.505326271057129 + }, + { + "auxiliary_loss_clip": 0.01022563, + "auxiliary_loss_mlp": 0.00999471, + "balance_loss_clip": 0.99847585, + "balance_loss_mlp": 1.00243139, + "epoch": 0.866736810461446, + "flos": 63802275212160.0, + "grad_norm": 0.994856403197853, + "language_loss": 0.6039449, + "learning_rate": 1.8334085127929754e-07, + "loss": 0.6241653, + "num_input_tokens_seen": 310972825, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20117188, + "step": 14416, + "time_per_iteration": 3.1169750690460205 + }, + { + "auxiliary_loss_clip": 0.0110175, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.01969552, + "balance_loss_mlp": 1.03320909, + "epoch": 0.8667969337141139, + "flos": 20449367798400.0, + "grad_norm": 1.709121823786769, + "language_loss": 0.74786484, + "learning_rate": 1.831779913638285e-07, + "loss": 0.76920074, + "num_input_tokens_seen": 310992050, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.68359375, + "step": 14417, + "time_per_iteration": 2.4594523906707764 + }, + { + "auxiliary_loss_clip": 0.01098679, + "auxiliary_loss_mlp": 0.0103671, + "balance_loss_clip": 1.02536714, + "balance_loss_mlp": 1.03374553, + "epoch": 0.866857056966782, + "flos": 21653703930240.0, + "grad_norm": 1.4667054132693154, + "language_loss": 0.74927706, + "learning_rate": 1.830152003424319e-07, + "loss": 0.77063096, + "num_input_tokens_seen": 311011105, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14418, + "time_per_iteration": 2.4421162605285645 + }, + { + "auxiliary_loss_clip": 0.01099218, + "auxiliary_loss_mlp": 0.01034392, + "balance_loss_clip": 1.02282906, + "balance_loss_mlp": 1.03383338, + "epoch": 0.8669171802194499, + "flos": 22852832590080.0, + "grad_norm": 1.7373317907861416, + "language_loss": 0.68308914, + "learning_rate": 1.8285247822128126e-07, + "loss": 0.70442522, + "num_input_tokens_seen": 311032080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 14419, + "time_per_iteration": 2.4572219848632812 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01028303, + "balance_loss_clip": 1.01743746, + "balance_loss_mlp": 1.03343856, + "epoch": 0.8669773034721179, + "flos": 18734166483840.0, + "grad_norm": 1.8427566816345506, + "language_loss": 0.78783178, + "learning_rate": 1.826898250065465e-07, + "loss": 0.80911195, + "num_input_tokens_seen": 311049735, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6640625, + "step": 14420, + "time_per_iteration": 2.431793212890625 + }, + { + "auxiliary_loss_clip": 0.01099106, + "auxiliary_loss_mlp": 0.01026008, + "balance_loss_clip": 1.01507688, + "balance_loss_mlp": 1.03495288, + "epoch": 0.8670374267247858, + "flos": 18916018064640.0, + "grad_norm": 1.5228427156861324, + "language_loss": 0.83455002, + "learning_rate": 1.8252724070439586e-07, + "loss": 0.85580111, + "num_input_tokens_seen": 311067675, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14421, + "time_per_iteration": 2.4668033123016357 + }, + { + "auxiliary_loss_clip": 0.01022879, + "auxiliary_loss_mlp": 0.01001113, + "balance_loss_clip": 1.00009346, + "balance_loss_mlp": 1.00281167, + "epoch": 0.8670975499774538, + "flos": 48814527214080.0, + "grad_norm": 0.6988406736665802, + "language_loss": 0.49181524, + "learning_rate": 1.823647253209941e-07, + "loss": 0.51205516, + "num_input_tokens_seen": 311126605, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 14422, + "time_per_iteration": 3.0614736080169678 + }, + { + "auxiliary_loss_clip": 0.01098788, + "auxiliary_loss_mlp": 0.0102549, + "balance_loss_clip": 1.01436234, + "balance_loss_mlp": 1.03404534, + "epoch": 0.8671576732301217, + "flos": 26136145025280.0, + "grad_norm": 1.6515883006582035, + "language_loss": 0.73396868, + "learning_rate": 1.8220227886250417e-07, + "loss": 0.75521147, + "num_input_tokens_seen": 311147325, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 14423, + "time_per_iteration": 2.5444014072418213 + }, + { + "auxiliary_loss_clip": 0.01095039, + "auxiliary_loss_mlp": 0.01024294, + "balance_loss_clip": 1.01376247, + "balance_loss_mlp": 1.03325748, + "epoch": 0.8672177964827897, + "flos": 18367446579840.0, + "grad_norm": 1.6484964407004838, + "language_loss": 0.76470268, + "learning_rate": 1.8203990133508684e-07, + "loss": 0.785896, + "num_input_tokens_seen": 311165385, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6171875, + "step": 14424, + "time_per_iteration": 2.424851179122925 + }, + { + "auxiliary_loss_clip": 0.01095461, + "auxiliary_loss_mlp": 0.01031583, + "balance_loss_clip": 1.02093172, + "balance_loss_mlp": 1.03368878, + "epoch": 0.8672779197354576, + "flos": 28545355992960.0, + "grad_norm": 1.5813311269546795, + "language_loss": 0.71488333, + "learning_rate": 1.8187759274489767e-07, + "loss": 0.73615384, + "num_input_tokens_seen": 311185860, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6171875, + "step": 14425, + "time_per_iteration": 2.511129379272461 + }, + { + "auxiliary_loss_clip": 0.01100644, + "auxiliary_loss_mlp": 0.01031036, + "balance_loss_clip": 1.0186975, + "balance_loss_mlp": 1.03369915, + "epoch": 0.8673380429881257, + "flos": 22382474970240.0, + "grad_norm": 1.7136647691025457, + "language_loss": 0.68201184, + "learning_rate": 1.817153530980926e-07, + "loss": 0.70332867, + "num_input_tokens_seen": 311205810, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 14426, + "time_per_iteration": 2.45344614982605 + }, + { + "auxiliary_loss_clip": 0.01100039, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.01479053, + "balance_loss_mlp": 1.03466105, + "epoch": 0.8673981662407936, + "flos": 20996430912000.0, + "grad_norm": 2.1655107539362732, + "language_loss": 0.71177137, + "learning_rate": 1.815531824008234e-07, + "loss": 0.73303688, + "num_input_tokens_seen": 311226080, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14427, + "time_per_iteration": 2.4853262901306152 + }, + { + "auxiliary_loss_clip": 0.01100692, + "auxiliary_loss_mlp": 0.01027171, + "balance_loss_clip": 1.01605487, + "balance_loss_mlp": 1.03558111, + "epoch": 0.8674582894934616, + "flos": 24426797627520.0, + "grad_norm": 2.4919308270407967, + "language_loss": 0.67974901, + "learning_rate": 1.8139108065924004e-07, + "loss": 0.70102763, + "num_input_tokens_seen": 311246380, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 14428, + "time_per_iteration": 3.881380558013916 + }, + { + "auxiliary_loss_clip": 0.01099393, + "auxiliary_loss_mlp": 0.01027709, + "balance_loss_clip": 1.01625252, + "balance_loss_mlp": 1.03412676, + "epoch": 0.8675184127461296, + "flos": 20737514701440.0, + "grad_norm": 1.9995749889958705, + "language_loss": 0.70442253, + "learning_rate": 1.812290478794889e-07, + "loss": 0.72569358, + "num_input_tokens_seen": 311266465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14429, + "time_per_iteration": 2.485006093978882 + }, + { + "auxiliary_loss_clip": 0.01099339, + "auxiliary_loss_mlp": 0.01027402, + "balance_loss_clip": 1.01582026, + "balance_loss_mlp": 1.03418374, + "epoch": 0.8675785359987975, + "flos": 19135647774720.0, + "grad_norm": 1.994306556139827, + "language_loss": 0.66704834, + "learning_rate": 1.810670840677151e-07, + "loss": 0.68831575, + "num_input_tokens_seen": 311285075, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14430, + "time_per_iteration": 5.315447092056274 + }, + { + "auxiliary_loss_clip": 0.01102359, + "auxiliary_loss_mlp": 0.01039308, + "balance_loss_clip": 1.02674353, + "balance_loss_mlp": 1.03518546, + "epoch": 0.8676386592514655, + "flos": 22710662559360.0, + "grad_norm": 2.1027224790368373, + "language_loss": 0.6922996, + "learning_rate": 1.8090518923005948e-07, + "loss": 0.71371627, + "num_input_tokens_seen": 311303230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14431, + "time_per_iteration": 2.529296636581421 + }, + { + "auxiliary_loss_clip": 0.0110053, + "auxiliary_loss_mlp": 0.01037421, + "balance_loss_clip": 1.0257802, + "balance_loss_mlp": 1.03467131, + "epoch": 0.8676987825041335, + "flos": 14209853109120.0, + "grad_norm": 2.1202825244865617, + "language_loss": 0.63412476, + "learning_rate": 1.8074336337266116e-07, + "loss": 0.65550429, + "num_input_tokens_seen": 311318070, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 14432, + "time_per_iteration": 2.4086527824401855 + }, + { + "auxiliary_loss_clip": 0.01099173, + "auxiliary_loss_mlp": 0.01034176, + "balance_loss_clip": 1.02346504, + "balance_loss_mlp": 1.03396499, + "epoch": 0.8677589057568015, + "flos": 13589927256960.0, + "grad_norm": 1.8814819068968875, + "language_loss": 0.78167719, + "learning_rate": 1.8058160650165656e-07, + "loss": 0.8030107, + "num_input_tokens_seen": 311334885, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65234375, + "step": 14433, + "time_per_iteration": 2.4273760318756104 + }, + { + "auxiliary_loss_clip": 0.01022423, + "auxiliary_loss_mlp": 0.00999558, + "balance_loss_clip": 0.99843693, + "balance_loss_mlp": 1.00253153, + "epoch": 0.8678190290094694, + "flos": 68933657370240.0, + "grad_norm": 0.7045847422968128, + "language_loss": 0.58498955, + "learning_rate": 1.804199186231805e-07, + "loss": 0.60520935, + "num_input_tokens_seen": 311399780, + "router_z_loss_clip": 0.01123047, + "router_z_loss_mlp": 0.19921875, + "step": 14434, + "time_per_iteration": 4.538690090179443 + }, + { + "auxiliary_loss_clip": 0.01095692, + "auxiliary_loss_mlp": 0.0103128, + "balance_loss_clip": 1.02078414, + "balance_loss_mlp": 1.0331434, + "epoch": 0.8678791522621374, + "flos": 32557726776960.0, + "grad_norm": 1.9536898864428005, + "language_loss": 0.80034566, + "learning_rate": 1.802582997433628e-07, + "loss": 0.8216154, + "num_input_tokens_seen": 311419610, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 14435, + "time_per_iteration": 2.5227813720703125 + }, + { + "auxiliary_loss_clip": 0.01099769, + "auxiliary_loss_mlp": 0.01024614, + "balance_loss_clip": 1.0133307, + "balance_loss_mlp": 1.03240716, + "epoch": 0.8679392755148053, + "flos": 35042637657600.0, + "grad_norm": 1.931795708002661, + "language_loss": 0.62170672, + "learning_rate": 1.8009674986833322e-07, + "loss": 0.64295053, + "num_input_tokens_seen": 311440045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 14436, + "time_per_iteration": 2.5728566646575928 + }, + { + "auxiliary_loss_clip": 0.01100272, + "auxiliary_loss_mlp": 0.01029612, + "balance_loss_clip": 1.01709533, + "balance_loss_mlp": 1.03472376, + "epoch": 0.8679993987674733, + "flos": 18552494471040.0, + "grad_norm": 2.343547123058283, + "language_loss": 0.70253652, + "learning_rate": 1.7993526900421706e-07, + "loss": 0.72383535, + "num_input_tokens_seen": 311456660, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 14437, + "time_per_iteration": 2.4431545734405518 + }, + { + "auxiliary_loss_clip": 0.01099398, + "auxiliary_loss_mlp": 0.01028868, + "balance_loss_clip": 1.01700628, + "balance_loss_mlp": 1.03465986, + "epoch": 0.8680595220201412, + "flos": 27454390162560.0, + "grad_norm": 2.0341554887187527, + "language_loss": 0.80222631, + "learning_rate": 1.797738571571381e-07, + "loss": 0.82350898, + "num_input_tokens_seen": 311475460, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 14438, + "time_per_iteration": 2.494382858276367 + }, + { + "auxiliary_loss_clip": 0.01095122, + "auxiliary_loss_mlp": 0.01025377, + "balance_loss_clip": 1.01420712, + "balance_loss_mlp": 1.03247368, + "epoch": 0.8681196452728093, + "flos": 19208797822080.0, + "grad_norm": 1.989777139729131, + "language_loss": 0.67343026, + "learning_rate": 1.7961251433321656e-07, + "loss": 0.69463527, + "num_input_tokens_seen": 311494575, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 14439, + "time_per_iteration": 2.446855306625366 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01033073, + "balance_loss_clip": 1.02216566, + "balance_loss_mlp": 1.03362823, + "epoch": 0.8681797685254772, + "flos": 37560442417920.0, + "grad_norm": 1.5866507009228823, + "language_loss": 0.63566774, + "learning_rate": 1.7945124053857085e-07, + "loss": 0.65697688, + "num_input_tokens_seen": 311515805, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14440, + "time_per_iteration": 2.598418951034546 + }, + { + "auxiliary_loss_clip": 0.01097534, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.01960874, + "balance_loss_mlp": 1.03475738, + "epoch": 0.8682398917781452, + "flos": 23289937194240.0, + "grad_norm": 1.520964551160635, + "language_loss": 0.65901554, + "learning_rate": 1.7929003577931722e-07, + "loss": 0.68030441, + "num_input_tokens_seen": 311536000, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 14441, + "time_per_iteration": 2.4468817710876465 + }, + { + "auxiliary_loss_clip": 0.01098077, + "auxiliary_loss_mlp": 0.01025314, + "balance_loss_clip": 1.01465619, + "balance_loss_mlp": 1.03496826, + "epoch": 0.8683000150308132, + "flos": 21872794936320.0, + "grad_norm": 1.540549688981214, + "language_loss": 0.66407061, + "learning_rate": 1.7912890006156722e-07, + "loss": 0.68530446, + "num_input_tokens_seen": 311556220, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 14442, + "time_per_iteration": 2.474977493286133 + }, + { + "auxiliary_loss_clip": 0.01101636, + "auxiliary_loss_mlp": 0.01030241, + "balance_loss_clip": 1.01771832, + "balance_loss_mlp": 1.0342052, + "epoch": 0.8683601382834811, + "flos": 14647209108480.0, + "grad_norm": 1.8064516397377273, + "language_loss": 0.72410548, + "learning_rate": 1.7896783339143195e-07, + "loss": 0.74542421, + "num_input_tokens_seen": 311572530, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14443, + "time_per_iteration": 2.411228895187378 + }, + { + "auxiliary_loss_clip": 0.01100605, + "auxiliary_loss_mlp": 0.01028407, + "balance_loss_clip": 1.01658189, + "balance_loss_mlp": 1.0347054, + "epoch": 0.8684202615361492, + "flos": 26359904799360.0, + "grad_norm": 1.608459779685937, + "language_loss": 0.83502007, + "learning_rate": 1.7880683577501877e-07, + "loss": 0.85631013, + "num_input_tokens_seen": 311591105, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 14444, + "time_per_iteration": 2.4930927753448486 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01894629, + "balance_loss_mlp": 1.03413963, + "epoch": 0.8684803847888171, + "flos": 20704010290560.0, + "grad_norm": 2.448486590537531, + "language_loss": 0.77183151, + "learning_rate": 1.7864590721843342e-07, + "loss": 0.79313886, + "num_input_tokens_seen": 311608350, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 14445, + "time_per_iteration": 2.426669120788574 + }, + { + "auxiliary_loss_clip": 0.01103385, + "auxiliary_loss_mlp": 0.0102966, + "balance_loss_clip": 1.01798332, + "balance_loss_mlp": 1.03616834, + "epoch": 0.8685405080414851, + "flos": 22638123043200.0, + "grad_norm": 1.7750211361688581, + "language_loss": 0.67744529, + "learning_rate": 1.7848504772777728e-07, + "loss": 0.69877577, + "num_input_tokens_seen": 311626380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 14446, + "time_per_iteration": 2.4654579162597656 + }, + { + "auxiliary_loss_clip": 0.01099868, + "auxiliary_loss_mlp": 0.01032187, + "balance_loss_clip": 1.02041531, + "balance_loss_mlp": 1.03514719, + "epoch": 0.868600631294153, + "flos": 24822065865600.0, + "grad_norm": 1.7454626226513459, + "language_loss": 0.82879949, + "learning_rate": 1.7832425730915102e-07, + "loss": 0.85012007, + "num_input_tokens_seen": 311644345, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 14447, + "time_per_iteration": 2.493257522583008 + }, + { + "auxiliary_loss_clip": 0.01097872, + "auxiliary_loss_mlp": 0.01026699, + "balance_loss_clip": 1.01551676, + "balance_loss_mlp": 1.03253877, + "epoch": 0.868660754546821, + "flos": 25113983696640.0, + "grad_norm": 1.6479414858635801, + "language_loss": 0.73994362, + "learning_rate": 1.781635359686515e-07, + "loss": 0.76118934, + "num_input_tokens_seen": 311663340, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 14448, + "time_per_iteration": 2.517547607421875 + }, + { + "auxiliary_loss_clip": 0.01099877, + "auxiliary_loss_mlp": 0.01029061, + "balance_loss_clip": 1.01732528, + "balance_loss_mlp": 1.03410125, + "epoch": 0.8687208777994889, + "flos": 12677832178560.0, + "grad_norm": 2.0155069578364815, + "language_loss": 0.80403781, + "learning_rate": 1.7800288371237303e-07, + "loss": 0.82532716, + "num_input_tokens_seen": 311679860, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14449, + "time_per_iteration": 2.4734561443328857 + }, + { + "auxiliary_loss_clip": 0.01022701, + "auxiliary_loss_mlp": 0.01001927, + "balance_loss_clip": 1.00090826, + "balance_loss_mlp": 1.00270224, + "epoch": 0.8687810010521569, + "flos": 65617235573760.0, + "grad_norm": 0.8078709150290084, + "language_loss": 0.60570407, + "learning_rate": 1.7784230054640758e-07, + "loss": 0.62595034, + "num_input_tokens_seen": 311738135, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20019531, + "step": 14450, + "time_per_iteration": 3.00457501411438 + }, + { + "auxiliary_loss_clip": 0.01101764, + "auxiliary_loss_mlp": 0.01025128, + "balance_loss_clip": 1.01391602, + "balance_loss_mlp": 1.03496504, + "epoch": 0.8688411243048249, + "flos": 24244012293120.0, + "grad_norm": 1.6214229272889056, + "language_loss": 0.75951099, + "learning_rate": 1.7768178647684517e-07, + "loss": 0.78077996, + "num_input_tokens_seen": 311756975, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 14451, + "time_per_iteration": 2.466799736022949 + }, + { + "auxiliary_loss_clip": 0.01097261, + "auxiliary_loss_mlp": 0.01026963, + "balance_loss_clip": 1.01535177, + "balance_loss_mlp": 1.03310919, + "epoch": 0.8689012475574929, + "flos": 18221828843520.0, + "grad_norm": 2.7255099911966045, + "language_loss": 0.72161841, + "learning_rate": 1.7752134150977205e-07, + "loss": 0.74286067, + "num_input_tokens_seen": 311771830, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 14452, + "time_per_iteration": 2.4162471294403076 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01031339, + "balance_loss_clip": 1.0188818, + "balance_loss_mlp": 1.03695846, + "epoch": 0.8689613708101608, + "flos": 19646728439040.0, + "grad_norm": 1.4737303556350767, + "language_loss": 0.71964049, + "learning_rate": 1.7736096565127201e-07, + "loss": 0.74099505, + "num_input_tokens_seen": 311790130, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 14453, + "time_per_iteration": 2.4244184494018555 + }, + { + "auxiliary_loss_clip": 0.01101076, + "auxiliary_loss_mlp": 0.01032389, + "balance_loss_clip": 1.0210638, + "balance_loss_mlp": 1.03651702, + "epoch": 0.8690214940628288, + "flos": 11728749070080.0, + "grad_norm": 1.9633097563791408, + "language_loss": 0.7370978, + "learning_rate": 1.7720065890742664e-07, + "loss": 0.75843245, + "num_input_tokens_seen": 311808360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14454, + "time_per_iteration": 2.4440579414367676 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.01540554, + "balance_loss_mlp": 1.03604889, + "epoch": 0.8690816173154968, + "flos": 34936450076160.0, + "grad_norm": 1.857806171796081, + "language_loss": 0.59278631, + "learning_rate": 1.7704042128431552e-07, + "loss": 0.61405551, + "num_input_tokens_seen": 311831325, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14455, + "time_per_iteration": 2.5671346187591553 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01026904, + "balance_loss_clip": 1.01522756, + "balance_loss_mlp": 1.03404677, + "epoch": 0.8691417405681647, + "flos": 11614804151040.0, + "grad_norm": 2.0707431382841746, + "language_loss": 0.79973984, + "learning_rate": 1.7688025278801378e-07, + "loss": 0.8210187, + "num_input_tokens_seen": 311848090, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 14456, + "time_per_iteration": 2.418536424636841 + }, + { + "auxiliary_loss_clip": 0.01104649, + "auxiliary_loss_mlp": 0.01034755, + "balance_loss_clip": 1.02168369, + "balance_loss_mlp": 1.03677177, + "epoch": 0.8692018638208328, + "flos": 24608038677120.0, + "grad_norm": 3.2598349085995713, + "language_loss": 0.74551702, + "learning_rate": 1.7672015342459568e-07, + "loss": 0.76691103, + "num_input_tokens_seen": 311867855, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 14457, + "time_per_iteration": 2.4583218097686768 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01028332, + "balance_loss_clip": 1.01771641, + "balance_loss_mlp": 1.0333178, + "epoch": 0.8692619870735007, + "flos": 25995124229760.0, + "grad_norm": 1.5024493639781449, + "language_loss": 0.78523105, + "learning_rate": 1.765601232001328e-07, + "loss": 0.80647486, + "num_input_tokens_seen": 311888675, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 14458, + "time_per_iteration": 2.4820399284362793 + }, + { + "auxiliary_loss_clip": 0.01099419, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.01935577, + "balance_loss_mlp": 1.03463364, + "epoch": 0.8693221103261687, + "flos": 18041808856320.0, + "grad_norm": 1.644405471391384, + "language_loss": 0.70893437, + "learning_rate": 1.7640016212069187e-07, + "loss": 0.73024642, + "num_input_tokens_seen": 311907310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6484375, + "step": 14459, + "time_per_iteration": 2.4393928050994873 + }, + { + "auxiliary_loss_clip": 0.01093983, + "auxiliary_loss_mlp": 0.01030121, + "balance_loss_clip": 1.01987517, + "balance_loss_mlp": 1.03369355, + "epoch": 0.8693822335788366, + "flos": 27492347859840.0, + "grad_norm": 1.6107626843517802, + "language_loss": 0.73736501, + "learning_rate": 1.762402701923398e-07, + "loss": 0.75860602, + "num_input_tokens_seen": 311929635, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 14460, + "time_per_iteration": 2.503788948059082 + }, + { + "auxiliary_loss_clip": 0.01102012, + "auxiliary_loss_mlp": 0.01030741, + "balance_loss_clip": 1.01904643, + "balance_loss_mlp": 1.03427231, + "epoch": 0.8694423568315046, + "flos": 24097712198400.0, + "grad_norm": 2.041721670778758, + "language_loss": 0.6509198, + "learning_rate": 1.7608044742113947e-07, + "loss": 0.67224729, + "num_input_tokens_seen": 311948800, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.67578125, + "step": 14461, + "time_per_iteration": 2.5010743141174316 + }, + { + "auxiliary_loss_clip": 0.01097505, + "auxiliary_loss_mlp": 0.01033155, + "balance_loss_clip": 1.02073312, + "balance_loss_mlp": 1.0317719, + "epoch": 0.8695024800841725, + "flos": 18362131367040.0, + "grad_norm": 2.2325103269251474, + "language_loss": 0.83019292, + "learning_rate": 1.7592069381315123e-07, + "loss": 0.8514995, + "num_input_tokens_seen": 311964090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 14462, + "time_per_iteration": 2.433208703994751 + }, + { + "auxiliary_loss_clip": 0.01099153, + "auxiliary_loss_mlp": 0.01030132, + "balance_loss_clip": 1.01810336, + "balance_loss_mlp": 1.0335896, + "epoch": 0.8695626033368405, + "flos": 14027750133120.0, + "grad_norm": 1.7962828010788623, + "language_loss": 0.65557456, + "learning_rate": 1.757610093744335e-07, + "loss": 0.67686737, + "num_input_tokens_seen": 311981460, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14463, + "time_per_iteration": 2.4401493072509766 + }, + { + "auxiliary_loss_clip": 0.01105422, + "auxiliary_loss_mlp": 0.01034725, + "balance_loss_clip": 1.02233291, + "balance_loss_mlp": 1.03725314, + "epoch": 0.8696227265895085, + "flos": 16836862193280.0, + "grad_norm": 2.9623729778046357, + "language_loss": 0.66444242, + "learning_rate": 1.7560139411104058e-07, + "loss": 0.68584383, + "num_input_tokens_seen": 312000115, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14464, + "time_per_iteration": 2.436553716659546 + }, + { + "auxiliary_loss_clip": 0.01103351, + "auxiliary_loss_mlp": 0.01030622, + "balance_loss_clip": 1.01875448, + "balance_loss_mlp": 1.03480899, + "epoch": 0.8696828498421765, + "flos": 21799070271360.0, + "grad_norm": 2.4735109448365376, + "language_loss": 0.63112307, + "learning_rate": 1.7544184802902607e-07, + "loss": 0.65246278, + "num_input_tokens_seen": 312020770, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 14465, + "time_per_iteration": 2.4660658836364746 + }, + { + "auxiliary_loss_clip": 0.01094609, + "auxiliary_loss_mlp": 0.01037505, + "balance_loss_clip": 1.02676392, + "balance_loss_mlp": 1.03304648, + "epoch": 0.8697429730948444, + "flos": 22894812610560.0, + "grad_norm": 1.685225165628944, + "language_loss": 0.84644353, + "learning_rate": 1.7528237113443934e-07, + "loss": 0.86776471, + "num_input_tokens_seen": 312041870, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 14466, + "time_per_iteration": 2.480961322784424 + }, + { + "auxiliary_loss_clip": 0.01105582, + "auxiliary_loss_mlp": 0.01038303, + "balance_loss_clip": 1.02561951, + "balance_loss_mlp": 1.03688443, + "epoch": 0.8698030963475124, + "flos": 24717458482560.0, + "grad_norm": 2.282426713545837, + "language_loss": 0.62034947, + "learning_rate": 1.7512296343332779e-07, + "loss": 0.64178836, + "num_input_tokens_seen": 312058210, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 14467, + "time_per_iteration": 2.538341999053955 + }, + { + "auxiliary_loss_clip": 0.01094205, + "auxiliary_loss_mlp": 0.01028408, + "balance_loss_clip": 1.01785159, + "balance_loss_mlp": 1.03268588, + "epoch": 0.8698632196001803, + "flos": 28442221067520.0, + "grad_norm": 1.3777636597290568, + "language_loss": 0.68952703, + "learning_rate": 1.7496362493173655e-07, + "loss": 0.71075314, + "num_input_tokens_seen": 312082665, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6171875, + "step": 14468, + "time_per_iteration": 2.5434041023254395 + }, + { + "auxiliary_loss_clip": 0.0109501, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01774478, + "balance_loss_mlp": 1.03186822, + "epoch": 0.8699233428528483, + "flos": 27636457224960.0, + "grad_norm": 1.4228431843567073, + "language_loss": 0.70863521, + "learning_rate": 1.7480435563570773e-07, + "loss": 0.72987258, + "num_input_tokens_seen": 312101960, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 14469, + "time_per_iteration": 2.4869115352630615 + }, + { + "auxiliary_loss_clip": 0.01094893, + "auxiliary_loss_mlp": 0.01026479, + "balance_loss_clip": 1.01596522, + "balance_loss_mlp": 1.03371549, + "epoch": 0.8699834661055164, + "flos": 20045659864320.0, + "grad_norm": 2.363369045660802, + "language_loss": 0.83497709, + "learning_rate": 1.7464515555128024e-07, + "loss": 0.8561908, + "num_input_tokens_seen": 312117125, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.61328125, + "step": 14470, + "time_per_iteration": 3.7966389656066895 + }, + { + "auxiliary_loss_clip": 0.01100686, + "auxiliary_loss_mlp": 0.01028161, + "balance_loss_clip": 1.01659179, + "balance_loss_mlp": 1.0359199, + "epoch": 0.8700435893581843, + "flos": 23732787974400.0, + "grad_norm": 1.7851623388231517, + "language_loss": 0.729653, + "learning_rate": 1.7448602468449148e-07, + "loss": 0.75094146, + "num_input_tokens_seen": 312135775, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14471, + "time_per_iteration": 2.4429454803466797 + }, + { + "auxiliary_loss_clip": 0.01098438, + "auxiliary_loss_mlp": 0.0102807, + "balance_loss_clip": 1.01722205, + "balance_loss_mlp": 1.03454566, + "epoch": 0.8701037126108523, + "flos": 23548422441600.0, + "grad_norm": 1.5357760114727317, + "language_loss": 0.79059005, + "learning_rate": 1.7432696304137573e-07, + "loss": 0.81185514, + "num_input_tokens_seen": 312156070, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 14472, + "time_per_iteration": 3.921783208847046 + }, + { + "auxiliary_loss_clip": 0.01098113, + "auxiliary_loss_mlp": 0.01025833, + "balance_loss_clip": 1.01443648, + "balance_loss_mlp": 1.03344178, + "epoch": 0.8701638358635202, + "flos": 18843442634880.0, + "grad_norm": 3.2533585330072574, + "language_loss": 0.72799373, + "learning_rate": 1.741679706279644e-07, + "loss": 0.74923319, + "num_input_tokens_seen": 312174380, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14473, + "time_per_iteration": 2.4259376525878906 + }, + { + "auxiliary_loss_clip": 0.01102046, + "auxiliary_loss_mlp": 0.01028818, + "balance_loss_clip": 1.01699805, + "balance_loss_mlp": 1.03496337, + "epoch": 0.8702239591161882, + "flos": 27928339142400.0, + "grad_norm": 1.846236216417208, + "language_loss": 0.72311044, + "learning_rate": 1.7400904745028644e-07, + "loss": 0.7444191, + "num_input_tokens_seen": 312195130, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 14474, + "time_per_iteration": 2.473069190979004 + }, + { + "auxiliary_loss_clip": 0.01097682, + "auxiliary_loss_mlp": 0.01034295, + "balance_loss_clip": 1.02192092, + "balance_loss_mlp": 1.03249419, + "epoch": 0.8702840823688561, + "flos": 17233997938560.0, + "grad_norm": 1.7934338685222735, + "language_loss": 0.67214453, + "learning_rate": 1.7385019351436925e-07, + "loss": 0.69346434, + "num_input_tokens_seen": 312212300, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65234375, + "step": 14475, + "time_per_iteration": 2.4029717445373535 + }, + { + "auxiliary_loss_clip": 0.01098351, + "auxiliary_loss_mlp": 0.0102784, + "balance_loss_clip": 1.0158658, + "balance_loss_mlp": 1.03214025, + "epoch": 0.8703442056215241, + "flos": 19427565605760.0, + "grad_norm": 1.5313783538807326, + "language_loss": 0.7782256, + "learning_rate": 1.736914088262349e-07, + "loss": 0.79948747, + "num_input_tokens_seen": 312231735, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 14476, + "time_per_iteration": 3.8718321323394775 + }, + { + "auxiliary_loss_clip": 0.01097278, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.01576388, + "balance_loss_mlp": 1.03464425, + "epoch": 0.8704043288741921, + "flos": 22273845264000.0, + "grad_norm": 1.4589876491690197, + "language_loss": 0.72287905, + "learning_rate": 1.7353269339190525e-07, + "loss": 0.74411464, + "num_input_tokens_seen": 312253060, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 14477, + "time_per_iteration": 2.467496156692505 + }, + { + "auxiliary_loss_clip": 0.01099819, + "auxiliary_loss_mlp": 0.01026827, + "balance_loss_clip": 1.01542509, + "balance_loss_mlp": 1.03454363, + "epoch": 0.8704644521268601, + "flos": 16648725732480.0, + "grad_norm": 2.2041125468373424, + "language_loss": 0.59643745, + "learning_rate": 1.7337404721739946e-07, + "loss": 0.61770391, + "num_input_tokens_seen": 312269460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 14478, + "time_per_iteration": 2.452988862991333 + }, + { + "auxiliary_loss_clip": 0.01099937, + "auxiliary_loss_mlp": 0.01027655, + "balance_loss_clip": 1.01758146, + "balance_loss_mlp": 1.03778374, + "epoch": 0.870524575379528, + "flos": 24280210224000.0, + "grad_norm": 1.4601612361115293, + "language_loss": 0.71350467, + "learning_rate": 1.732154703087323e-07, + "loss": 0.73478055, + "num_input_tokens_seen": 312289830, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.62109375, + "step": 14479, + "time_per_iteration": 2.4820868968963623 + }, + { + "auxiliary_loss_clip": 0.01099029, + "auxiliary_loss_mlp": 0.01031237, + "balance_loss_clip": 1.01922119, + "balance_loss_mlp": 1.034477, + "epoch": 0.870584698632196, + "flos": 28768684803840.0, + "grad_norm": 1.4445497699948229, + "language_loss": 0.70891637, + "learning_rate": 1.7305696267191805e-07, + "loss": 0.73021901, + "num_input_tokens_seen": 312311320, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 14480, + "time_per_iteration": 2.513767957687378 + }, + { + "auxiliary_loss_clip": 0.01100037, + "auxiliary_loss_mlp": 0.01027419, + "balance_loss_clip": 1.01664293, + "balance_loss_mlp": 1.03382504, + "epoch": 0.8706448218848639, + "flos": 32449635774720.0, + "grad_norm": 1.6114944508469022, + "language_loss": 0.70245749, + "learning_rate": 1.728985243129666e-07, + "loss": 0.72373205, + "num_input_tokens_seen": 312332095, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6640625, + "step": 14481, + "time_per_iteration": 2.554398775100708 + }, + { + "auxiliary_loss_clip": 0.01096608, + "auxiliary_loss_mlp": 0.01027738, + "balance_loss_clip": 1.01684833, + "balance_loss_mlp": 1.03254092, + "epoch": 0.8707049451375319, + "flos": 22748009725440.0, + "grad_norm": 1.964144913348514, + "language_loss": 0.77078795, + "learning_rate": 1.7274015523788643e-07, + "loss": 0.79203141, + "num_input_tokens_seen": 312351225, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 14482, + "time_per_iteration": 2.4815468788146973 + }, + { + "auxiliary_loss_clip": 0.01098791, + "auxiliary_loss_mlp": 0.01030512, + "balance_loss_clip": 1.01902604, + "balance_loss_mlp": 1.03534698, + "epoch": 0.8707650683902, + "flos": 15851976203520.0, + "grad_norm": 1.9700716147504154, + "language_loss": 0.76582003, + "learning_rate": 1.7258185545268234e-07, + "loss": 0.78711307, + "num_input_tokens_seen": 312369730, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14483, + "time_per_iteration": 2.436213254928589 + }, + { + "auxiliary_loss_clip": 0.01104536, + "auxiliary_loss_mlp": 0.01035885, + "balance_loss_clip": 1.02293277, + "balance_loss_mlp": 1.0356437, + "epoch": 0.8708251916428679, + "flos": 16468131127680.0, + "grad_norm": 1.9594179999364503, + "language_loss": 0.61840808, + "learning_rate": 1.7242362496335749e-07, + "loss": 0.63981229, + "num_input_tokens_seen": 312386780, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 14484, + "time_per_iteration": 2.442678213119507 + }, + { + "auxiliary_loss_clip": 0.01099606, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01751947, + "balance_loss_mlp": 1.03592515, + "epoch": 0.8708853148955359, + "flos": 15377847655680.0, + "grad_norm": 1.8902385859017612, + "language_loss": 0.67741799, + "learning_rate": 1.7226546377591222e-07, + "loss": 0.69870055, + "num_input_tokens_seen": 312404875, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 14485, + "time_per_iteration": 2.4474589824676514 + }, + { + "auxiliary_loss_clip": 0.01098241, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.01782358, + "balance_loss_mlp": 1.03395045, + "epoch": 0.8709454381482038, + "flos": 30551325903360.0, + "grad_norm": 1.8241363232690688, + "language_loss": 0.62720448, + "learning_rate": 1.7210737189634373e-07, + "loss": 0.64848542, + "num_input_tokens_seen": 312425280, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 14486, + "time_per_iteration": 2.5260205268859863 + }, + { + "auxiliary_loss_clip": 0.01102715, + "auxiliary_loss_mlp": 0.01034103, + "balance_loss_clip": 1.02160406, + "balance_loss_mlp": 1.0344007, + "epoch": 0.8710055614008718, + "flos": 22601422321920.0, + "grad_norm": 2.140129151217502, + "language_loss": 0.61595458, + "learning_rate": 1.7194934933064653e-07, + "loss": 0.63732278, + "num_input_tokens_seen": 312443835, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 14487, + "time_per_iteration": 2.4534120559692383 + }, + { + "auxiliary_loss_clip": 0.01095929, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01547635, + "balance_loss_mlp": 1.03313756, + "epoch": 0.8710656846535397, + "flos": 18443146492800.0, + "grad_norm": 1.935883465680325, + "language_loss": 0.67719936, + "learning_rate": 1.7179139608481318e-07, + "loss": 0.69841325, + "num_input_tokens_seen": 312460830, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.62890625, + "step": 14488, + "time_per_iteration": 2.4133338928222656 + }, + { + "auxiliary_loss_clip": 0.01101696, + "auxiliary_loss_mlp": 0.01030683, + "balance_loss_clip": 1.01882124, + "balance_loss_mlp": 1.03593814, + "epoch": 0.8711258079062077, + "flos": 16503862181760.0, + "grad_norm": 2.0140303277845635, + "language_loss": 0.85666835, + "learning_rate": 1.716335121648338e-07, + "loss": 0.87799209, + "num_input_tokens_seen": 312477575, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 14489, + "time_per_iteration": 2.447730779647827 + }, + { + "auxiliary_loss_clip": 0.01105324, + "auxiliary_loss_mlp": 0.01031789, + "balance_loss_clip": 1.01909935, + "balance_loss_mlp": 1.03548503, + "epoch": 0.8711859311588757, + "flos": 15663336952320.0, + "grad_norm": 2.3883647349321833, + "language_loss": 0.7595576, + "learning_rate": 1.7147569757669445e-07, + "loss": 0.78092867, + "num_input_tokens_seen": 312492140, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69921875, + "step": 14490, + "time_per_iteration": 2.455131769180298 + }, + { + "auxiliary_loss_clip": 0.01103297, + "auxiliary_loss_mlp": 0.01029879, + "balance_loss_clip": 1.01726079, + "balance_loss_mlp": 1.0360409, + "epoch": 0.8712460544115437, + "flos": 15557544420480.0, + "grad_norm": 2.077474010199246, + "language_loss": 0.76046753, + "learning_rate": 1.7131795232638012e-07, + "loss": 0.78179932, + "num_input_tokens_seen": 312508400, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14491, + "time_per_iteration": 2.444725513458252 + }, + { + "auxiliary_loss_clip": 0.01101932, + "auxiliary_loss_mlp": 0.01026867, + "balance_loss_clip": 1.01542914, + "balance_loss_mlp": 1.03789806, + "epoch": 0.8713061776642116, + "flos": 16763568491520.0, + "grad_norm": 1.5462104355983195, + "language_loss": 0.67157114, + "learning_rate": 1.711602764198723e-07, + "loss": 0.69285911, + "num_input_tokens_seen": 312525915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14492, + "time_per_iteration": 2.456963539123535 + }, + { + "auxiliary_loss_clip": 0.01097894, + "auxiliary_loss_mlp": 0.01028195, + "balance_loss_clip": 1.01746666, + "balance_loss_mlp": 1.0347116, + "epoch": 0.8713663009168796, + "flos": 24279887001600.0, + "grad_norm": 1.9471643685037383, + "language_loss": 0.69513756, + "learning_rate": 1.7100266986314992e-07, + "loss": 0.71639848, + "num_input_tokens_seen": 312544735, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 14493, + "time_per_iteration": 2.486240863800049 + }, + { + "auxiliary_loss_clip": 0.01103732, + "auxiliary_loss_mlp": 0.01032556, + "balance_loss_clip": 1.02045047, + "balance_loss_mlp": 1.03747892, + "epoch": 0.8714264241695475, + "flos": 23795594904960.0, + "grad_norm": 2.373229138100018, + "language_loss": 0.89281845, + "learning_rate": 1.7084513266218936e-07, + "loss": 0.91418135, + "num_input_tokens_seen": 312557910, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 14494, + "time_per_iteration": 2.426887273788452 + }, + { + "auxiliary_loss_clip": 0.01100458, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.01902366, + "balance_loss_mlp": 1.03658473, + "epoch": 0.8714865474222155, + "flos": 37997942071680.0, + "grad_norm": 2.4859036528024387, + "language_loss": 0.59329295, + "learning_rate": 1.7068766482296514e-07, + "loss": 0.61459565, + "num_input_tokens_seen": 312580360, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14495, + "time_per_iteration": 2.609177350997925 + }, + { + "auxiliary_loss_clip": 0.01099752, + "auxiliary_loss_mlp": 0.01032103, + "balance_loss_clip": 1.02050412, + "balance_loss_mlp": 1.03364801, + "epoch": 0.8715466706748836, + "flos": 22455696844800.0, + "grad_norm": 1.8624158816524548, + "language_loss": 0.80186629, + "learning_rate": 1.7053026635144762e-07, + "loss": 0.82318485, + "num_input_tokens_seen": 312597550, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 14496, + "time_per_iteration": 2.48067569732666 + }, + { + "auxiliary_loss_clip": 0.01100552, + "auxiliary_loss_mlp": 0.01034326, + "balance_loss_clip": 1.02149367, + "balance_loss_mlp": 1.03474569, + "epoch": 0.8716067939275515, + "flos": 21215126868480.0, + "grad_norm": 1.9430726479329608, + "language_loss": 0.7876395, + "learning_rate": 1.7037293725360624e-07, + "loss": 0.80898833, + "num_input_tokens_seen": 312616435, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65625, + "step": 14497, + "time_per_iteration": 2.4636995792388916 + }, + { + "auxiliary_loss_clip": 0.01101538, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.01755071, + "balance_loss_mlp": 1.03472698, + "epoch": 0.8716669171802195, + "flos": 22997732054400.0, + "grad_norm": 2.083369518158611, + "language_loss": 0.66958046, + "learning_rate": 1.70215677535406e-07, + "loss": 0.69089335, + "num_input_tokens_seen": 312632770, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 14498, + "time_per_iteration": 2.4272525310516357 + }, + { + "auxiliary_loss_clip": 0.01097343, + "auxiliary_loss_mlp": 0.010302, + "balance_loss_clip": 1.01882744, + "balance_loss_mlp": 1.03298473, + "epoch": 0.8717270404328874, + "flos": 29784058462080.0, + "grad_norm": 1.650352141003142, + "language_loss": 0.57090127, + "learning_rate": 1.700584872028108e-07, + "loss": 0.59217668, + "num_input_tokens_seen": 312651900, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 14499, + "time_per_iteration": 2.5417370796203613 + }, + { + "auxiliary_loss_clip": 0.01100865, + "auxiliary_loss_mlp": 0.01030099, + "balance_loss_clip": 1.01810622, + "balance_loss_mlp": 1.03407812, + "epoch": 0.8717871636855554, + "flos": 22018125363840.0, + "grad_norm": 1.8287657436259406, + "language_loss": 0.79665649, + "learning_rate": 1.6990136626178097e-07, + "loss": 0.81796622, + "num_input_tokens_seen": 312671380, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66796875, + "step": 14500, + "time_per_iteration": 2.4384779930114746 + }, + { + "auxiliary_loss_clip": 0.01099214, + "auxiliary_loss_mlp": 0.01026537, + "balance_loss_clip": 1.01524222, + "balance_loss_mlp": 1.03458488, + "epoch": 0.8718472869382233, + "flos": 16654256426880.0, + "grad_norm": 3.927270356481981, + "language_loss": 0.72778672, + "learning_rate": 1.6974431471827466e-07, + "loss": 0.74904418, + "num_input_tokens_seen": 312689215, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14501, + "time_per_iteration": 2.4204492568969727 + }, + { + "auxiliary_loss_clip": 0.01105269, + "auxiliary_loss_mlp": 0.01027676, + "balance_loss_clip": 1.01549292, + "balance_loss_mlp": 1.03746784, + "epoch": 0.8719074101908914, + "flos": 19495328613120.0, + "grad_norm": 1.8814783525974537, + "language_loss": 0.64179307, + "learning_rate": 1.695873325782482e-07, + "loss": 0.66312253, + "num_input_tokens_seen": 312706400, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 14502, + "time_per_iteration": 2.434103488922119 + }, + { + "auxiliary_loss_clip": 0.01101671, + "auxiliary_loss_mlp": 0.01032355, + "balance_loss_clip": 1.02017736, + "balance_loss_mlp": 1.0345037, + "epoch": 0.8719675334435593, + "flos": 33070890430080.0, + "grad_norm": 1.8726397651262905, + "language_loss": 0.68590897, + "learning_rate": 1.6943041984765262e-07, + "loss": 0.70724928, + "num_input_tokens_seen": 312727985, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 14503, + "time_per_iteration": 2.5533761978149414 + }, + { + "auxiliary_loss_clip": 0.01100258, + "auxiliary_loss_mlp": 0.01027095, + "balance_loss_clip": 1.01556742, + "balance_loss_mlp": 1.03493106, + "epoch": 0.8720276566962273, + "flos": 13626268842240.0, + "grad_norm": 2.557034340222278, + "language_loss": 0.69279027, + "learning_rate": 1.6927357653243912e-07, + "loss": 0.71406382, + "num_input_tokens_seen": 312745025, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 14504, + "time_per_iteration": 2.3973300457000732 + }, + { + "auxiliary_loss_clip": 0.01100515, + "auxiliary_loss_mlp": 0.01023828, + "balance_loss_clip": 1.01218152, + "balance_loss_mlp": 1.03470516, + "epoch": 0.8720877799488952, + "flos": 23514163845120.0, + "grad_norm": 1.7855236968382922, + "language_loss": 0.70064294, + "learning_rate": 1.691168026385552e-07, + "loss": 0.7218864, + "num_input_tokens_seen": 312764170, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14505, + "time_per_iteration": 2.4677655696868896 + }, + { + "auxiliary_loss_clip": 0.01099166, + "auxiliary_loss_mlp": 0.01028441, + "balance_loss_clip": 1.01750898, + "balance_loss_mlp": 1.03490877, + "epoch": 0.8721479032015632, + "flos": 20814148368000.0, + "grad_norm": 1.5833201060926712, + "language_loss": 0.78157365, + "learning_rate": 1.6896009817194545e-07, + "loss": 0.80284977, + "num_input_tokens_seen": 312783830, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 14506, + "time_per_iteration": 2.4351515769958496 + }, + { + "auxiliary_loss_clip": 0.01099602, + "auxiliary_loss_mlp": 0.01028009, + "balance_loss_clip": 1.016541, + "balance_loss_mlp": 1.03255463, + "epoch": 0.8722080264542311, + "flos": 19463655795840.0, + "grad_norm": 2.103988804675608, + "language_loss": 0.73979723, + "learning_rate": 1.6880346313855221e-07, + "loss": 0.76107335, + "num_input_tokens_seen": 312802015, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 14507, + "time_per_iteration": 2.455190420150757 + }, + { + "auxiliary_loss_clip": 0.0110337, + "auxiliary_loss_mlp": 0.01029571, + "balance_loss_clip": 1.01671457, + "balance_loss_mlp": 1.03505719, + "epoch": 0.8722681497068991, + "flos": 21761866759680.0, + "grad_norm": 2.4004535186438694, + "language_loss": 0.72314352, + "learning_rate": 1.686468975443156e-07, + "loss": 0.74447292, + "num_input_tokens_seen": 312820650, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 14508, + "time_per_iteration": 2.429776191711426 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01032368, + "balance_loss_clip": 1.01998162, + "balance_loss_mlp": 1.03545594, + "epoch": 0.8723282729595672, + "flos": 28877134942080.0, + "grad_norm": 1.6736592907716532, + "language_loss": 0.68370092, + "learning_rate": 1.6849040139517202e-07, + "loss": 0.70505971, + "num_input_tokens_seen": 312841310, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14509, + "time_per_iteration": 2.512343168258667 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.02042294, + "balance_loss_mlp": 1.03505254, + "epoch": 0.8723883962122351, + "flos": 26469145036800.0, + "grad_norm": 1.602744668687492, + "language_loss": 0.58099592, + "learning_rate": 1.683339746970558e-07, + "loss": 0.60231388, + "num_input_tokens_seen": 312862100, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 14510, + "time_per_iteration": 2.5066168308258057 + }, + { + "auxiliary_loss_clip": 0.01106, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01718426, + "balance_loss_mlp": 1.03560722, + "epoch": 0.8724485194649031, + "flos": 20521476351360.0, + "grad_norm": 3.01668414840158, + "language_loss": 0.67472696, + "learning_rate": 1.6817761745589865e-07, + "loss": 0.69608998, + "num_input_tokens_seen": 312880220, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 14511, + "time_per_iteration": 2.451087236404419 + }, + { + "auxiliary_loss_clip": 0.01103515, + "auxiliary_loss_mlp": 0.010345, + "balance_loss_clip": 1.02203035, + "balance_loss_mlp": 1.03443432, + "epoch": 0.872508642717571, + "flos": 24353360271360.0, + "grad_norm": 3.2501383047133405, + "language_loss": 0.81799793, + "learning_rate": 1.6802132967763027e-07, + "loss": 0.83937812, + "num_input_tokens_seen": 312900765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69140625, + "step": 14512, + "time_per_iteration": 3.9438862800598145 + }, + { + "auxiliary_loss_clip": 0.01022617, + "auxiliary_loss_mlp": 0.01002239, + "balance_loss_clip": 1.00121391, + "balance_loss_mlp": 1.00251293, + "epoch": 0.872568765970239, + "flos": 61410012485760.0, + "grad_norm": 0.821541974320149, + "language_loss": 0.58620477, + "learning_rate": 1.6786511136817617e-07, + "loss": 0.6064533, + "num_input_tokens_seen": 312955840, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.20117188, + "step": 14513, + "time_per_iteration": 5.803184747695923 + }, + { + "auxiliary_loss_clip": 0.01100291, + "auxiliary_loss_mlp": 0.01027146, + "balance_loss_clip": 1.01511812, + "balance_loss_mlp": 1.035128, + "epoch": 0.8726288892229069, + "flos": 22598046443520.0, + "grad_norm": 1.7723681458845877, + "language_loss": 0.76434934, + "learning_rate": 1.6770896253346112e-07, + "loss": 0.78562373, + "num_input_tokens_seen": 312973565, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 14514, + "time_per_iteration": 2.43147611618042 + }, + { + "auxiliary_loss_clip": 0.01104793, + "auxiliary_loss_mlp": 0.0102735, + "balance_loss_clip": 1.01603079, + "balance_loss_mlp": 1.0365963, + "epoch": 0.872689012475575, + "flos": 25885201633920.0, + "grad_norm": 1.8948516392444266, + "language_loss": 0.6500389, + "learning_rate": 1.675528831794055e-07, + "loss": 0.67136031, + "num_input_tokens_seen": 312994660, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6796875, + "step": 14515, + "time_per_iteration": 2.4740371704101562 + }, + { + "auxiliary_loss_clip": 0.01102787, + "auxiliary_loss_mlp": 0.01034006, + "balance_loss_clip": 1.02158499, + "balance_loss_mlp": 1.03575802, + "epoch": 0.8727491357282429, + "flos": 21506721477120.0, + "grad_norm": 2.2682848027061837, + "language_loss": 0.79204381, + "learning_rate": 1.6739687331192842e-07, + "loss": 0.81341171, + "num_input_tokens_seen": 313009860, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14516, + "time_per_iteration": 2.415546417236328 + }, + { + "auxiliary_loss_clip": 0.01104026, + "auxiliary_loss_mlp": 0.01030356, + "balance_loss_clip": 1.01828611, + "balance_loss_mlp": 1.03575611, + "epoch": 0.8728092589809109, + "flos": 19207504932480.0, + "grad_norm": 1.7654736819824852, + "language_loss": 0.71866733, + "learning_rate": 1.672409329369453e-07, + "loss": 0.74001116, + "num_input_tokens_seen": 313027025, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 14517, + "time_per_iteration": 3.8708460330963135 + }, + { + "auxiliary_loss_clip": 0.01096366, + "auxiliary_loss_mlp": 0.01021966, + "balance_loss_clip": 1.01117766, + "balance_loss_mlp": 1.03266263, + "epoch": 0.8728693822335788, + "flos": 20595308757120.0, + "grad_norm": 1.986670256969538, + "language_loss": 0.72410166, + "learning_rate": 1.6708506206036966e-07, + "loss": 0.74528503, + "num_input_tokens_seen": 313046830, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 14518, + "time_per_iteration": 2.4295384883880615 + }, + { + "auxiliary_loss_clip": 0.01097506, + "auxiliary_loss_mlp": 0.01031272, + "balance_loss_clip": 1.02022767, + "balance_loss_mlp": 1.03408229, + "epoch": 0.8729295054862468, + "flos": 21728613744000.0, + "grad_norm": 1.5057549302495625, + "language_loss": 0.74070251, + "learning_rate": 1.6692926068811275e-07, + "loss": 0.76199031, + "num_input_tokens_seen": 313067715, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14519, + "time_per_iteration": 2.4812824726104736 + }, + { + "auxiliary_loss_clip": 0.01102566, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.0154469, + "balance_loss_mlp": 1.03454578, + "epoch": 0.8729896287389147, + "flos": 17673436926720.0, + "grad_norm": 2.392734857107026, + "language_loss": 0.76474625, + "learning_rate": 1.6677352882608142e-07, + "loss": 0.78605127, + "num_input_tokens_seen": 313082305, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 14520, + "time_per_iteration": 2.395700693130493 + }, + { + "auxiliary_loss_clip": 0.01101699, + "auxiliary_loss_mlp": 0.01033653, + "balance_loss_clip": 1.02091551, + "balance_loss_mlp": 1.03470957, + "epoch": 0.8730497519915827, + "flos": 24571804832640.0, + "grad_norm": 1.7557471092827008, + "language_loss": 0.81959832, + "learning_rate": 1.666178664801816e-07, + "loss": 0.8409518, + "num_input_tokens_seen": 313101190, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 14521, + "time_per_iteration": 2.4852864742279053 + }, + { + "auxiliary_loss_clip": 0.01103057, + "auxiliary_loss_mlp": 0.0103373, + "balance_loss_clip": 1.02129066, + "balance_loss_mlp": 1.03620303, + "epoch": 0.8731098752442508, + "flos": 13443734903040.0, + "grad_norm": 2.742686839679241, + "language_loss": 0.76673812, + "learning_rate": 1.6646227365631616e-07, + "loss": 0.78810602, + "num_input_tokens_seen": 313118965, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14522, + "time_per_iteration": 2.400723457336426 + }, + { + "auxiliary_loss_clip": 0.01096241, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.01647091, + "balance_loss_mlp": 1.03311884, + "epoch": 0.8731699984969187, + "flos": 23474446381440.0, + "grad_norm": 1.7966365150802115, + "language_loss": 0.75488186, + "learning_rate": 1.66306750360385e-07, + "loss": 0.77611631, + "num_input_tokens_seen": 313139280, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 14523, + "time_per_iteration": 2.4636495113372803 + }, + { + "auxiliary_loss_clip": 0.01096575, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01697135, + "balance_loss_mlp": 1.03261673, + "epoch": 0.8732301217495867, + "flos": 17712651600000.0, + "grad_norm": 4.0015763337701715, + "language_loss": 0.78712022, + "learning_rate": 1.6615129659828542e-07, + "loss": 0.80836773, + "num_input_tokens_seen": 313156655, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 14524, + "time_per_iteration": 2.4102303981781006 + }, + { + "auxiliary_loss_clip": 0.01096233, + "auxiliary_loss_mlp": 0.01030493, + "balance_loss_clip": 1.01968098, + "balance_loss_mlp": 1.03349209, + "epoch": 0.8732902450022546, + "flos": 22054359208320.0, + "grad_norm": 1.906291591567106, + "language_loss": 0.77577364, + "learning_rate": 1.6599591237591272e-07, + "loss": 0.79704088, + "num_input_tokens_seen": 313174050, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 14525, + "time_per_iteration": 2.4395744800567627 + }, + { + "auxiliary_loss_clip": 0.01100789, + "auxiliary_loss_mlp": 0.01031139, + "balance_loss_clip": 1.01979661, + "balance_loss_mlp": 1.03422463, + "epoch": 0.8733503682549226, + "flos": 22272983337600.0, + "grad_norm": 1.583399004883713, + "language_loss": 0.68971789, + "learning_rate": 1.6584059769915902e-07, + "loss": 0.71103716, + "num_input_tokens_seen": 313192765, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 14526, + "time_per_iteration": 2.460601329803467 + }, + { + "auxiliary_loss_clip": 0.01104095, + "auxiliary_loss_mlp": 0.0103504, + "balance_loss_clip": 1.02277374, + "balance_loss_mlp": 1.0354166, + "epoch": 0.8734104915075905, + "flos": 23364344217600.0, + "grad_norm": 1.8364570696793545, + "language_loss": 0.61118007, + "learning_rate": 1.6568535257391326e-07, + "loss": 0.6325714, + "num_input_tokens_seen": 313210925, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 14527, + "time_per_iteration": 2.451878547668457 + }, + { + "auxiliary_loss_clip": 0.01107658, + "auxiliary_loss_mlp": 0.01030345, + "balance_loss_clip": 1.01725018, + "balance_loss_mlp": 1.03724599, + "epoch": 0.8734706147602586, + "flos": 17712292464000.0, + "grad_norm": 1.900288046481709, + "language_loss": 0.65428543, + "learning_rate": 1.6553017700606265e-07, + "loss": 0.6756655, + "num_input_tokens_seen": 313228250, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.703125, + "step": 14528, + "time_per_iteration": 2.4043383598327637 + }, + { + "auxiliary_loss_clip": 0.01099208, + "auxiliary_loss_mlp": 0.01027467, + "balance_loss_clip": 1.01605284, + "balance_loss_mlp": 1.03587151, + "epoch": 0.8735307380129265, + "flos": 22049367217920.0, + "grad_norm": 1.7400708711936286, + "language_loss": 0.89542592, + "learning_rate": 1.6537507100149205e-07, + "loss": 0.91669267, + "num_input_tokens_seen": 313247880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 14529, + "time_per_iteration": 2.4536657333374023 + }, + { + "auxiliary_loss_clip": 0.01098594, + "auxiliary_loss_mlp": 0.01026461, + "balance_loss_clip": 1.01464105, + "balance_loss_mlp": 1.03477812, + "epoch": 0.8735908612655945, + "flos": 25338425829120.0, + "grad_norm": 1.8948501189750897, + "language_loss": 0.85129809, + "learning_rate": 1.6522003456608258e-07, + "loss": 0.8725487, + "num_input_tokens_seen": 313266790, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.63671875, + "step": 14530, + "time_per_iteration": 2.4669382572174072 + }, + { + "auxiliary_loss_clip": 0.01099866, + "auxiliary_loss_mlp": 0.0103023, + "balance_loss_clip": 1.01923275, + "balance_loss_mlp": 1.03396988, + "epoch": 0.8736509845182624, + "flos": 21540908246400.0, + "grad_norm": 1.576558377957066, + "language_loss": 0.74252665, + "learning_rate": 1.650650677057128e-07, + "loss": 0.76382756, + "num_input_tokens_seen": 313286805, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.66015625, + "step": 14531, + "time_per_iteration": 2.4694976806640625 + }, + { + "auxiliary_loss_clip": 0.0109496, + "auxiliary_loss_mlp": 0.01028468, + "balance_loss_clip": 1.01751268, + "balance_loss_mlp": 1.03228939, + "epoch": 0.8737111077709304, + "flos": 22017227523840.0, + "grad_norm": 1.9295326178711187, + "language_loss": 0.61642307, + "learning_rate": 1.6491017042625966e-07, + "loss": 0.63765734, + "num_input_tokens_seen": 313305415, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.625, + "step": 14532, + "time_per_iteration": 2.467203140258789 + }, + { + "auxiliary_loss_clip": 0.01022366, + "auxiliary_loss_mlp": 0.01002055, + "balance_loss_clip": 1.00104749, + "balance_loss_mlp": 1.00228763, + "epoch": 0.8737712310235983, + "flos": 70066315912320.0, + "grad_norm": 1.0435222234743866, + "language_loss": 0.58747792, + "learning_rate": 1.6475534273359704e-07, + "loss": 0.60772216, + "num_input_tokens_seen": 313369940, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 14533, + "time_per_iteration": 3.1370933055877686 + }, + { + "auxiliary_loss_clip": 0.01098118, + "auxiliary_loss_mlp": 0.01028866, + "balance_loss_clip": 1.0177381, + "balance_loss_mlp": 1.03478742, + "epoch": 0.8738313542762663, + "flos": 28658331244800.0, + "grad_norm": 1.6991996292719136, + "language_loss": 0.770051, + "learning_rate": 1.646005846335954e-07, + "loss": 0.7913208, + "num_input_tokens_seen": 313390965, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14534, + "time_per_iteration": 2.4806315898895264 + }, + { + "auxiliary_loss_clip": 0.01099042, + "auxiliary_loss_mlp": 0.01027788, + "balance_loss_clip": 1.01656461, + "balance_loss_mlp": 1.03348231, + "epoch": 0.8738914775289344, + "flos": 22346384780160.0, + "grad_norm": 5.211875046106134, + "language_loss": 0.7515831, + "learning_rate": 1.6444589613212357e-07, + "loss": 0.77285141, + "num_input_tokens_seen": 313409680, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14535, + "time_per_iteration": 2.4810731410980225 + }, + { + "auxiliary_loss_clip": 0.01098515, + "auxiliary_loss_mlp": 0.01030127, + "balance_loss_clip": 1.01817584, + "balance_loss_mlp": 1.03306127, + "epoch": 0.8739516007816023, + "flos": 31759648444800.0, + "grad_norm": 1.6829802403654797, + "language_loss": 0.74085766, + "learning_rate": 1.64291277235048e-07, + "loss": 0.76214409, + "num_input_tokens_seen": 313431335, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 14536, + "time_per_iteration": 2.523463010787964 + }, + { + "auxiliary_loss_clip": 0.01097805, + "auxiliary_loss_mlp": 0.01032126, + "balance_loss_clip": 1.02120638, + "balance_loss_mlp": 1.03282738, + "epoch": 0.8740117240342703, + "flos": 21211715076480.0, + "grad_norm": 1.6182033888035987, + "language_loss": 0.6362291, + "learning_rate": 1.641367279482304e-07, + "loss": 0.6575284, + "num_input_tokens_seen": 313449225, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 14537, + "time_per_iteration": 2.4442684650421143 + }, + { + "auxiliary_loss_clip": 0.01098039, + "auxiliary_loss_mlp": 0.01030257, + "balance_loss_clip": 1.01776958, + "balance_loss_mlp": 1.03407526, + "epoch": 0.8740718472869382, + "flos": 25186666867200.0, + "grad_norm": 1.9755012548468744, + "language_loss": 0.58271295, + "learning_rate": 1.6398224827753216e-07, + "loss": 0.60399592, + "num_input_tokens_seen": 313467715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.640625, + "step": 14538, + "time_per_iteration": 2.4764134883880615 + }, + { + "auxiliary_loss_clip": 0.01098568, + "auxiliary_loss_mlp": 0.01026406, + "balance_loss_clip": 1.01509345, + "balance_loss_mlp": 1.03636038, + "epoch": 0.8741319705396062, + "flos": 19500931134720.0, + "grad_norm": 1.8688727440620683, + "language_loss": 0.68641996, + "learning_rate": 1.6382783822881142e-07, + "loss": 0.70766973, + "num_input_tokens_seen": 313486805, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.62109375, + "step": 14539, + "time_per_iteration": 2.5020625591278076 + }, + { + "auxiliary_loss_clip": 0.01101347, + "auxiliary_loss_mlp": 0.01029164, + "balance_loss_clip": 1.01667643, + "balance_loss_mlp": 1.03300726, + "epoch": 0.8741920937922741, + "flos": 14100900180480.0, + "grad_norm": 2.6060863933182126, + "language_loss": 0.74274981, + "learning_rate": 1.6367349780792262e-07, + "loss": 0.76405495, + "num_input_tokens_seen": 313504880, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 14540, + "time_per_iteration": 2.404411792755127 + }, + { + "auxiliary_loss_clip": 0.01100315, + "auxiliary_loss_mlp": 0.01032644, + "balance_loss_clip": 1.02098525, + "balance_loss_mlp": 1.03433895, + "epoch": 0.8742522170449422, + "flos": 27709858667520.0, + "grad_norm": 1.6726829694378176, + "language_loss": 0.78856957, + "learning_rate": 1.635192270207193e-07, + "loss": 0.80989909, + "num_input_tokens_seen": 313524995, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 14541, + "time_per_iteration": 2.503199338912964 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01660323, + "balance_loss_mlp": 1.03575325, + "epoch": 0.8743123402976101, + "flos": 21142587352320.0, + "grad_norm": 2.184797101770986, + "language_loss": 0.66509086, + "learning_rate": 1.6336502587305035e-07, + "loss": 0.68643373, + "num_input_tokens_seen": 313541740, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 14542, + "time_per_iteration": 2.4233803749084473 + }, + { + "auxiliary_loss_clip": 0.0102268, + "auxiliary_loss_mlp": 0.01004544, + "balance_loss_clip": 1.00348306, + "balance_loss_mlp": 1.00264943, + "epoch": 0.8743724635502781, + "flos": 60870024351360.0, + "grad_norm": 0.78036669727378, + "language_loss": 0.54485124, + "learning_rate": 1.632108943707642e-07, + "loss": 0.5651235, + "num_input_tokens_seen": 313593445, + "router_z_loss_clip": 0.01062012, + "router_z_loss_mlp": 0.20117188, + "step": 14543, + "time_per_iteration": 2.86068058013916 + }, + { + "auxiliary_loss_clip": 0.0110231, + "auxiliary_loss_mlp": 0.01031505, + "balance_loss_clip": 1.01938748, + "balance_loss_mlp": 1.03536141, + "epoch": 0.874432586802946, + "flos": 28109292883200.0, + "grad_norm": 2.471051898904442, + "language_loss": 0.69747186, + "learning_rate": 1.6305683251970458e-07, + "loss": 0.71880996, + "num_input_tokens_seen": 313615640, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 14544, + "time_per_iteration": 2.5185115337371826 + }, + { + "auxiliary_loss_clip": 0.0109533, + "auxiliary_loss_mlp": 0.01024688, + "balance_loss_clip": 1.01388133, + "balance_loss_mlp": 1.03356862, + "epoch": 0.874492710055614, + "flos": 23550289948800.0, + "grad_norm": 1.6364457025901016, + "language_loss": 0.75830984, + "learning_rate": 1.62902840325714e-07, + "loss": 0.77951002, + "num_input_tokens_seen": 313635550, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 14545, + "time_per_iteration": 2.4716804027557373 + }, + { + "auxiliary_loss_clip": 0.01100059, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.01906097, + "balance_loss_mlp": 1.03366208, + "epoch": 0.8745528333082819, + "flos": 40915647924480.0, + "grad_norm": 1.6291355552891738, + "language_loss": 0.65811241, + "learning_rate": 1.6274891779463217e-07, + "loss": 0.67943794, + "num_input_tokens_seen": 313659275, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6640625, + "step": 14546, + "time_per_iteration": 2.5942723751068115 + }, + { + "auxiliary_loss_clip": 0.01099717, + "auxiliary_loss_mlp": 0.01028731, + "balance_loss_clip": 1.01728725, + "balance_loss_mlp": 1.03421474, + "epoch": 0.87461295656095, + "flos": 23622901292160.0, + "grad_norm": 1.6089408815054664, + "language_loss": 0.72915637, + "learning_rate": 1.6259506493229536e-07, + "loss": 0.75044084, + "num_input_tokens_seen": 313680595, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14547, + "time_per_iteration": 2.476132869720459 + }, + { + "auxiliary_loss_clip": 0.01106258, + "auxiliary_loss_mlp": 0.01034848, + "balance_loss_clip": 1.02195573, + "balance_loss_mlp": 1.03549254, + "epoch": 0.874673079813618, + "flos": 38794116983040.0, + "grad_norm": 2.091980214733164, + "language_loss": 0.69212079, + "learning_rate": 1.6244128174453752e-07, + "loss": 0.71353185, + "num_input_tokens_seen": 313699730, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.70703125, + "step": 14548, + "time_per_iteration": 2.5924787521362305 + }, + { + "auxiliary_loss_clip": 0.01104345, + "auxiliary_loss_mlp": 0.01031321, + "balance_loss_clip": 1.0189178, + "balance_loss_mlp": 1.03624892, + "epoch": 0.8747332030662859, + "flos": 23696159080320.0, + "grad_norm": 1.7846159993944952, + "language_loss": 0.71013767, + "learning_rate": 1.6228756823719093e-07, + "loss": 0.73149431, + "num_input_tokens_seen": 313720090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 14549, + "time_per_iteration": 2.662411689758301 + }, + { + "auxiliary_loss_clip": 0.01103895, + "auxiliary_loss_mlp": 0.01034303, + "balance_loss_clip": 1.02070725, + "balance_loss_mlp": 1.03421688, + "epoch": 0.8747933263189539, + "flos": 24462456854400.0, + "grad_norm": 2.2861390343765375, + "language_loss": 0.83157504, + "learning_rate": 1.6213392441608352e-07, + "loss": 0.85295701, + "num_input_tokens_seen": 313736795, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.6953125, + "step": 14550, + "time_per_iteration": 2.5321483612060547 + }, + { + "auxiliary_loss_clip": 0.01102064, + "auxiliary_loss_mlp": 0.01034898, + "balance_loss_clip": 1.02321005, + "balance_loss_mlp": 1.03460526, + "epoch": 0.8748534495716218, + "flos": 13809161917440.0, + "grad_norm": 1.7632417241957978, + "language_loss": 0.71897519, + "learning_rate": 1.6198035028704183e-07, + "loss": 0.74034476, + "num_input_tokens_seen": 313754820, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.67578125, + "step": 14551, + "time_per_iteration": 2.4257118701934814 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01029093, + "balance_loss_clip": 1.01725554, + "balance_loss_mlp": 1.03376746, + "epoch": 0.8749135728242898, + "flos": 29862092759040.0, + "grad_norm": 1.8511874506751833, + "language_loss": 0.63747656, + "learning_rate": 1.6182684585588934e-07, + "loss": 0.65874648, + "num_input_tokens_seen": 313775830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 14552, + "time_per_iteration": 2.507741689682007 + }, + { + "auxiliary_loss_clip": 0.01103006, + "auxiliary_loss_mlp": 0.01028493, + "balance_loss_clip": 1.01513553, + "balance_loss_mlp": 1.03501391, + "epoch": 0.8749736960769577, + "flos": 24133479166080.0, + "grad_norm": 2.446923250150636, + "language_loss": 0.79332548, + "learning_rate": 1.616734111284479e-07, + "loss": 0.81464052, + "num_input_tokens_seen": 313795745, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6796875, + "step": 14553, + "time_per_iteration": 3.8591794967651367 + }, + { + "auxiliary_loss_clip": 0.01101263, + "auxiliary_loss_mlp": 0.0102859, + "balance_loss_clip": 1.01679969, + "balance_loss_mlp": 1.03322935, + "epoch": 0.8750338193296258, + "flos": 17202540602880.0, + "grad_norm": 1.9284579962234305, + "language_loss": 0.70292234, + "learning_rate": 1.6152004611053416e-07, + "loss": 0.72422087, + "num_input_tokens_seen": 313813895, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 14554, + "time_per_iteration": 2.425405740737915 + }, + { + "auxiliary_loss_clip": 0.01102552, + "auxiliary_loss_mlp": 0.01024857, + "balance_loss_clip": 1.01338291, + "balance_loss_mlp": 1.03636527, + "epoch": 0.8750939425822937, + "flos": 23733218937600.0, + "grad_norm": 1.403685789536988, + "language_loss": 0.83570188, + "learning_rate": 1.6136675080796457e-07, + "loss": 0.85697597, + "num_input_tokens_seen": 313834225, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 14555, + "time_per_iteration": 5.303139686584473 + }, + { + "auxiliary_loss_clip": 0.0109872, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.0174067, + "balance_loss_mlp": 1.03311133, + "epoch": 0.8751540658349617, + "flos": 26541684552960.0, + "grad_norm": 1.5544926685041807, + "language_loss": 0.71064872, + "learning_rate": 1.6121352522655252e-07, + "loss": 0.73193276, + "num_input_tokens_seen": 313854430, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 14556, + "time_per_iteration": 2.479682207107544 + }, + { + "auxiliary_loss_clip": 0.01102158, + "auxiliary_loss_mlp": 0.01037412, + "balance_loss_clip": 1.02427554, + "balance_loss_mlp": 1.03392434, + "epoch": 0.8752141890876296, + "flos": 19386806647680.0, + "grad_norm": 2.090988488565758, + "language_loss": 0.76491272, + "learning_rate": 1.6106036937210732e-07, + "loss": 0.78630841, + "num_input_tokens_seen": 313871600, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 14557, + "time_per_iteration": 2.4440789222717285 + }, + { + "auxiliary_loss_clip": 0.01104191, + "auxiliary_loss_mlp": 0.01036641, + "balance_loss_clip": 1.02429736, + "balance_loss_mlp": 1.03754401, + "epoch": 0.8752743123402976, + "flos": 25374408278400.0, + "grad_norm": 2.745271516916585, + "language_loss": 0.82856929, + "learning_rate": 1.6090728325043767e-07, + "loss": 0.84997767, + "num_input_tokens_seen": 313891570, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 14558, + "time_per_iteration": 2.4604156017303467 + }, + { + "auxiliary_loss_clip": 0.01022061, + "auxiliary_loss_mlp": 0.0099888, + "balance_loss_clip": 0.99787825, + "balance_loss_mlp": 1.00195646, + "epoch": 0.8753344355929655, + "flos": 59952398578560.0, + "grad_norm": 0.8098972335456046, + "language_loss": 0.56113648, + "learning_rate": 1.6075426686734784e-07, + "loss": 0.58134592, + "num_input_tokens_seen": 313951290, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20117188, + "step": 14559, + "time_per_iteration": 4.470167636871338 + }, + { + "auxiliary_loss_clip": 0.01097721, + "auxiliary_loss_mlp": 0.01032387, + "balance_loss_clip": 1.02115154, + "balance_loss_mlp": 1.03361118, + "epoch": 0.8753945588456336, + "flos": 17894646835200.0, + "grad_norm": 1.753910973149056, + "language_loss": 0.65810168, + "learning_rate": 1.606013202286407e-07, + "loss": 0.67940271, + "num_input_tokens_seen": 313968645, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 14560, + "time_per_iteration": 2.4606189727783203 + }, + { + "auxiliary_loss_clip": 0.01098497, + "auxiliary_loss_mlp": 0.01026014, + "balance_loss_clip": 1.0150826, + "balance_loss_mlp": 1.03399324, + "epoch": 0.8754546820983016, + "flos": 30914885410560.0, + "grad_norm": 1.7971412952154644, + "language_loss": 0.78488302, + "learning_rate": 1.6044844334011541e-07, + "loss": 0.80612814, + "num_input_tokens_seen": 313987580, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 14561, + "time_per_iteration": 2.5178775787353516 + }, + { + "auxiliary_loss_clip": 0.0110177, + "auxiliary_loss_mlp": 0.01032414, + "balance_loss_clip": 1.01950347, + "balance_loss_mlp": 1.03332877, + "epoch": 0.8755148053509695, + "flos": 20631075724800.0, + "grad_norm": 2.052648485503804, + "language_loss": 0.7722398, + "learning_rate": 1.6029563620756982e-07, + "loss": 0.7935816, + "num_input_tokens_seen": 314004460, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 14562, + "time_per_iteration": 2.4635396003723145 + }, + { + "auxiliary_loss_clip": 0.01093542, + "auxiliary_loss_mlp": 0.01027074, + "balance_loss_clip": 1.01629758, + "balance_loss_mlp": 1.03205824, + "epoch": 0.8755749286036375, + "flos": 34969739005440.0, + "grad_norm": 1.5193476217088446, + "language_loss": 0.72028875, + "learning_rate": 1.601428988367981e-07, + "loss": 0.74149489, + "num_input_tokens_seen": 314026855, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.61328125, + "step": 14563, + "time_per_iteration": 2.5659685134887695 + }, + { + "auxiliary_loss_clip": 0.01105043, + "auxiliary_loss_mlp": 0.01031152, + "balance_loss_clip": 1.01930261, + "balance_loss_mlp": 1.03699827, + "epoch": 0.8756350518563054, + "flos": 18186456925440.0, + "grad_norm": 2.447309960034295, + "language_loss": 0.65054131, + "learning_rate": 1.5999023123359235e-07, + "loss": 0.67190331, + "num_input_tokens_seen": 314042830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 14564, + "time_per_iteration": 2.4639861583709717 + }, + { + "auxiliary_loss_clip": 0.01098108, + "auxiliary_loss_mlp": 0.01035336, + "balance_loss_clip": 1.02411819, + "balance_loss_mlp": 1.03273273, + "epoch": 0.8756951751089734, + "flos": 20084012611200.0, + "grad_norm": 1.7263402838064887, + "language_loss": 0.70455498, + "learning_rate": 1.598376334037408e-07, + "loss": 0.72588944, + "num_input_tokens_seen": 314062225, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14565, + "time_per_iteration": 2.4354052543640137 + }, + { + "auxiliary_loss_clip": 0.01105002, + "auxiliary_loss_mlp": 0.01030182, + "balance_loss_clip": 1.01703906, + "balance_loss_mlp": 1.03534722, + "epoch": 0.8757552983616413, + "flos": 27525241739520.0, + "grad_norm": 1.653802479617828, + "language_loss": 0.77780795, + "learning_rate": 1.5968510535303102e-07, + "loss": 0.79915977, + "num_input_tokens_seen": 314082325, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 14566, + "time_per_iteration": 2.545552968978882 + }, + { + "auxiliary_loss_clip": 0.01101511, + "auxiliary_loss_mlp": 0.01029629, + "balance_loss_clip": 1.01816058, + "balance_loss_mlp": 1.03673196, + "epoch": 0.8758154216143094, + "flos": 18073014796800.0, + "grad_norm": 1.9682068715517365, + "language_loss": 0.71192074, + "learning_rate": 1.5953264708724624e-07, + "loss": 0.73323214, + "num_input_tokens_seen": 314100310, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14567, + "time_per_iteration": 2.4351353645324707 + }, + { + "auxiliary_loss_clip": 0.01098933, + "auxiliary_loss_mlp": 0.01031287, + "balance_loss_clip": 1.01959288, + "balance_loss_mlp": 1.03442371, + "epoch": 0.8758755448669773, + "flos": 25045681985280.0, + "grad_norm": 1.8356377960329546, + "language_loss": 0.74325889, + "learning_rate": 1.5938025861216776e-07, + "loss": 0.76456112, + "num_input_tokens_seen": 314121330, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 14568, + "time_per_iteration": 2.5124824047088623 + }, + { + "auxiliary_loss_clip": 0.01097935, + "auxiliary_loss_mlp": 0.01024092, + "balance_loss_clip": 1.0131247, + "balance_loss_mlp": 1.03408551, + "epoch": 0.8759356681196453, + "flos": 22856818999680.0, + "grad_norm": 2.0142792797007067, + "language_loss": 0.86751103, + "learning_rate": 1.5922793993357475e-07, + "loss": 0.88873136, + "num_input_tokens_seen": 314139875, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14569, + "time_per_iteration": 2.4586262702941895 + }, + { + "auxiliary_loss_clip": 0.01099407, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.01596665, + "balance_loss_mlp": 1.03311825, + "epoch": 0.8759957913723132, + "flos": 21032521102080.0, + "grad_norm": 1.7415448650731975, + "language_loss": 0.73872113, + "learning_rate": 1.5907569105724284e-07, + "loss": 0.75998533, + "num_input_tokens_seen": 314157850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 14570, + "time_per_iteration": 2.483851194381714 + }, + { + "auxiliary_loss_clip": 0.01101763, + "auxiliary_loss_mlp": 0.01028074, + "balance_loss_clip": 1.01600981, + "balance_loss_mlp": 1.03438187, + "epoch": 0.8760559146249812, + "flos": 20010467514240.0, + "grad_norm": 2.5331748701208454, + "language_loss": 0.67766106, + "learning_rate": 1.5892351198894472e-07, + "loss": 0.69895947, + "num_input_tokens_seen": 314176720, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 14571, + "time_per_iteration": 2.465486764907837 + }, + { + "auxiliary_loss_clip": 0.01096204, + "auxiliary_loss_mlp": 0.01027816, + "balance_loss_clip": 1.01672387, + "balance_loss_mlp": 1.03270459, + "epoch": 0.8761160378776491, + "flos": 19974161842560.0, + "grad_norm": 1.8497496461068688, + "language_loss": 0.62435377, + "learning_rate": 1.5877140273445156e-07, + "loss": 0.64559394, + "num_input_tokens_seen": 314196645, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 14572, + "time_per_iteration": 2.489522933959961 + }, + { + "auxiliary_loss_clip": 0.01097579, + "auxiliary_loss_mlp": 0.01026518, + "balance_loss_clip": 1.01595557, + "balance_loss_mlp": 1.03467846, + "epoch": 0.8761761611303172, + "flos": 28804415857920.0, + "grad_norm": 1.6601578113112918, + "language_loss": 0.73479891, + "learning_rate": 1.5861936329953162e-07, + "loss": 0.75603998, + "num_input_tokens_seen": 314217430, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 14573, + "time_per_iteration": 2.5072882175445557 + }, + { + "auxiliary_loss_clip": 0.01095801, + "auxiliary_loss_mlp": 0.0102742, + "balance_loss_clip": 1.01690626, + "balance_loss_mlp": 1.03304029, + "epoch": 0.8762362843829851, + "flos": 18332505624960.0, + "grad_norm": 2.044894217816748, + "language_loss": 0.731619, + "learning_rate": 1.5846739368994966e-07, + "loss": 0.75285125, + "num_input_tokens_seen": 314235310, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 14574, + "time_per_iteration": 2.4775447845458984 + }, + { + "auxiliary_loss_clip": 0.01098302, + "auxiliary_loss_mlp": 0.01033356, + "balance_loss_clip": 1.02203143, + "balance_loss_mlp": 1.03358328, + "epoch": 0.8762964076356531, + "flos": 15779149378560.0, + "grad_norm": 1.6149610801903476, + "language_loss": 0.75919485, + "learning_rate": 1.5831549391146903e-07, + "loss": 0.7805115, + "num_input_tokens_seen": 314252355, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14575, + "time_per_iteration": 2.4268925189971924 + }, + { + "auxiliary_loss_clip": 0.01099511, + "auxiliary_loss_mlp": 0.0103512, + "balance_loss_clip": 1.02400935, + "balance_loss_mlp": 1.03539479, + "epoch": 0.8763565308883211, + "flos": 33176754789120.0, + "grad_norm": 1.7878452146905504, + "language_loss": 0.66882926, + "learning_rate": 1.5816366396984916e-07, + "loss": 0.69017559, + "num_input_tokens_seen": 314272755, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 14576, + "time_per_iteration": 2.5595736503601074 + }, + { + "auxiliary_loss_clip": 0.01096684, + "auxiliary_loss_mlp": 0.01030312, + "balance_loss_clip": 1.01937437, + "balance_loss_mlp": 1.03251886, + "epoch": 0.876416654140989, + "flos": 15888102307200.0, + "grad_norm": 1.7208202540038426, + "language_loss": 0.66684705, + "learning_rate": 1.5801190387084806e-07, + "loss": 0.68811697, + "num_input_tokens_seen": 314291365, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14577, + "time_per_iteration": 2.4102935791015625 + }, + { + "auxiliary_loss_clip": 0.01100979, + "auxiliary_loss_mlp": 0.01032508, + "balance_loss_clip": 1.01999068, + "balance_loss_mlp": 1.03496742, + "epoch": 0.876476777393657, + "flos": 25885237547520.0, + "grad_norm": 2.108641527018096, + "language_loss": 0.70767337, + "learning_rate": 1.5786021362021962e-07, + "loss": 0.72900826, + "num_input_tokens_seen": 314310075, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 14578, + "time_per_iteration": 2.5292482376098633 + }, + { + "auxiliary_loss_clip": 0.01100899, + "auxiliary_loss_mlp": 0.01031417, + "balance_loss_clip": 1.01962769, + "balance_loss_mlp": 1.03398395, + "epoch": 0.876536900646325, + "flos": 13589675861760.0, + "grad_norm": 1.9999789208400311, + "language_loss": 0.71355838, + "learning_rate": 1.5770859322371676e-07, + "loss": 0.73488152, + "num_input_tokens_seen": 314325695, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 14579, + "time_per_iteration": 2.4042766094207764 + }, + { + "auxiliary_loss_clip": 0.01096428, + "auxiliary_loss_mlp": 0.01027659, + "balance_loss_clip": 1.01671004, + "balance_loss_mlp": 1.03457344, + "epoch": 0.876597023898993, + "flos": 12203344494720.0, + "grad_norm": 1.902275188035939, + "language_loss": 0.7026614, + "learning_rate": 1.5755704268708912e-07, + "loss": 0.72390223, + "num_input_tokens_seen": 314343605, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6171875, + "step": 14580, + "time_per_iteration": 2.4405770301818848 + }, + { + "auxiliary_loss_clip": 0.01098077, + "auxiliary_loss_mlp": 0.0102868, + "balance_loss_clip": 1.01745093, + "balance_loss_mlp": 1.03479218, + "epoch": 0.8766571471516609, + "flos": 25336773803520.0, + "grad_norm": 1.5728356070217824, + "language_loss": 0.65423614, + "learning_rate": 1.5740556201608256e-07, + "loss": 0.67550373, + "num_input_tokens_seen": 314364275, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 14581, + "time_per_iteration": 2.469336986541748 + }, + { + "auxiliary_loss_clip": 0.01097037, + "auxiliary_loss_mlp": 0.01030022, + "balance_loss_clip": 1.01942396, + "balance_loss_mlp": 1.03387427, + "epoch": 0.8767172704043289, + "flos": 30113287545600.0, + "grad_norm": 1.70098505015339, + "language_loss": 0.73786414, + "learning_rate": 1.572541512164416e-07, + "loss": 0.75913477, + "num_input_tokens_seen": 314385140, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 14582, + "time_per_iteration": 2.543093204498291 + }, + { + "auxiliary_loss_clip": 0.01098192, + "auxiliary_loss_mlp": 0.01029559, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.03288174, + "epoch": 0.8767773936569968, + "flos": 19281157770240.0, + "grad_norm": 2.590977059318644, + "language_loss": 0.67103446, + "learning_rate": 1.5710281029390826e-07, + "loss": 0.692312, + "num_input_tokens_seen": 314403715, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 14583, + "time_per_iteration": 2.432237148284912 + }, + { + "auxiliary_loss_clip": 0.01100486, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01563215, + "balance_loss_mlp": 1.033885, + "epoch": 0.8768375169096648, + "flos": 21247230648960.0, + "grad_norm": 1.8140039658285496, + "language_loss": 0.79140723, + "learning_rate": 1.5695153925422067e-07, + "loss": 0.81268525, + "num_input_tokens_seen": 314421880, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66796875, + "step": 14584, + "time_per_iteration": 2.4755969047546387 + }, + { + "auxiliary_loss_clip": 0.01100277, + "auxiliary_loss_mlp": 0.01031407, + "balance_loss_clip": 1.02005851, + "balance_loss_mlp": 1.03356349, + "epoch": 0.8768976401623327, + "flos": 23295539715840.0, + "grad_norm": 1.5843541811660464, + "language_loss": 0.72366554, + "learning_rate": 1.5680033810311555e-07, + "loss": 0.74498236, + "num_input_tokens_seen": 314441585, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66796875, + "step": 14585, + "time_per_iteration": 2.4504952430725098 + }, + { + "auxiliary_loss_clip": 0.01100354, + "auxiliary_loss_mlp": 0.01030297, + "balance_loss_clip": 1.01835251, + "balance_loss_mlp": 1.03485799, + "epoch": 0.8769577634150008, + "flos": 21361247395200.0, + "grad_norm": 1.8197564931552062, + "language_loss": 0.74027938, + "learning_rate": 1.5664920684632654e-07, + "loss": 0.76158589, + "num_input_tokens_seen": 314459020, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 14586, + "time_per_iteration": 2.476065158843994 + }, + { + "auxiliary_loss_clip": 0.01097699, + "auxiliary_loss_mlp": 0.01028115, + "balance_loss_clip": 1.01619434, + "balance_loss_mlp": 1.03294468, + "epoch": 0.8770178866676687, + "flos": 23514056104320.0, + "grad_norm": 1.8004743742414036, + "language_loss": 0.78392655, + "learning_rate": 1.564981454895844e-07, + "loss": 0.80518472, + "num_input_tokens_seen": 314478935, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14587, + "time_per_iteration": 2.4667489528656006 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.01515913, + "balance_loss_mlp": 1.03499091, + "epoch": 0.8770780099203367, + "flos": 19719052473600.0, + "grad_norm": 1.5237767345911253, + "language_loss": 0.73971182, + "learning_rate": 1.5634715403861697e-07, + "loss": 0.76099527, + "num_input_tokens_seen": 314497635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 14588, + "time_per_iteration": 2.492217779159546 + }, + { + "auxiliary_loss_clip": 0.01098609, + "auxiliary_loss_mlp": 0.01028086, + "balance_loss_clip": 1.0172143, + "balance_loss_mlp": 1.03383243, + "epoch": 0.8771381331730047, + "flos": 21395901041280.0, + "grad_norm": 1.9182743142543304, + "language_loss": 0.66461021, + "learning_rate": 1.5619623249915016e-07, + "loss": 0.68587714, + "num_input_tokens_seen": 314515445, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 14589, + "time_per_iteration": 2.4805967807769775 + }, + { + "auxiliary_loss_clip": 0.010995, + "auxiliary_loss_mlp": 0.01032316, + "balance_loss_clip": 1.02103901, + "balance_loss_mlp": 1.03442669, + "epoch": 0.8771982564256726, + "flos": 20261770041600.0, + "grad_norm": 2.249132588118827, + "language_loss": 0.70547277, + "learning_rate": 1.5604538087690732e-07, + "loss": 0.7267909, + "num_input_tokens_seen": 314533040, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 14590, + "time_per_iteration": 2.457648992538452 + }, + { + "auxiliary_loss_clip": 0.01105657, + "auxiliary_loss_mlp": 0.01035953, + "balance_loss_clip": 1.02332294, + "balance_loss_mlp": 1.03528619, + "epoch": 0.8772583796783406, + "flos": 12489372495360.0, + "grad_norm": 1.974082203683765, + "language_loss": 0.75044048, + "learning_rate": 1.558945991776086e-07, + "loss": 0.77185655, + "num_input_tokens_seen": 314548280, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.703125, + "step": 14591, + "time_per_iteration": 2.405331611633301 + }, + { + "auxiliary_loss_clip": 0.01094641, + "auxiliary_loss_mlp": 0.01024689, + "balance_loss_clip": 1.01393628, + "balance_loss_mlp": 1.03357577, + "epoch": 0.8773185029310085, + "flos": 15921103927680.0, + "grad_norm": 2.031313863319318, + "language_loss": 0.79909766, + "learning_rate": 1.5574388740697096e-07, + "loss": 0.82029092, + "num_input_tokens_seen": 314565345, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.609375, + "step": 14592, + "time_per_iteration": 2.454929828643799 + }, + { + "auxiliary_loss_clip": 0.01095316, + "auxiliary_loss_mlp": 0.01026873, + "balance_loss_clip": 1.01629364, + "balance_loss_mlp": 1.03316784, + "epoch": 0.8773786261836766, + "flos": 21504530747520.0, + "grad_norm": 1.614306440417284, + "language_loss": 0.82640499, + "learning_rate": 1.5559324557071052e-07, + "loss": 0.84762686, + "num_input_tokens_seen": 314584190, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 14593, + "time_per_iteration": 2.4428117275238037 + }, + { + "auxiliary_loss_clip": 0.01098816, + "auxiliary_loss_mlp": 0.01022638, + "balance_loss_clip": 1.01165867, + "balance_loss_mlp": 1.03518867, + "epoch": 0.8774387494363445, + "flos": 26761493831040.0, + "grad_norm": 1.553725648674736, + "language_loss": 0.7587297, + "learning_rate": 1.5544267367453845e-07, + "loss": 0.77994418, + "num_input_tokens_seen": 314605625, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14594, + "time_per_iteration": 2.4924726486206055 + }, + { + "auxiliary_loss_clip": 0.01098996, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01548719, + "balance_loss_mlp": 1.03252506, + "epoch": 0.8774988726890125, + "flos": 18478841633280.0, + "grad_norm": 2.2507371264600082, + "language_loss": 0.77722549, + "learning_rate": 1.552921717241651e-07, + "loss": 0.79848695, + "num_input_tokens_seen": 314622630, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14595, + "time_per_iteration": 3.8839778900146484 + }, + { + "auxiliary_loss_clip": 0.01100028, + "auxiliary_loss_mlp": 0.01032517, + "balance_loss_clip": 1.02081716, + "balance_loss_mlp": 1.03495049, + "epoch": 0.8775589959416804, + "flos": 24426366664320.0, + "grad_norm": 1.6010458814684418, + "language_loss": 0.70719904, + "learning_rate": 1.5514173972529743e-07, + "loss": 0.72852451, + "num_input_tokens_seen": 314642460, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14596, + "time_per_iteration": 2.485382080078125 + }, + { + "auxiliary_loss_clip": 0.01099029, + "auxiliary_loss_mlp": 0.01025253, + "balance_loss_clip": 1.01380861, + "balance_loss_mlp": 1.03495514, + "epoch": 0.8776191191943484, + "flos": 23440151871360.0, + "grad_norm": 1.7430220706670174, + "language_loss": 0.86074364, + "learning_rate": 1.5499137768364067e-07, + "loss": 0.88198644, + "num_input_tokens_seen": 314659875, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14597, + "time_per_iteration": 3.946387529373169 + }, + { + "auxiliary_loss_clip": 0.01098851, + "auxiliary_loss_mlp": 0.01028738, + "balance_loss_clip": 1.0176214, + "balance_loss_mlp": 1.03434682, + "epoch": 0.8776792424470163, + "flos": 26830872950400.0, + "grad_norm": 3.272225714206706, + "language_loss": 0.72833431, + "learning_rate": 1.5484108560489494e-07, + "loss": 0.74961019, + "num_input_tokens_seen": 314680260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 14598, + "time_per_iteration": 2.549870491027832 + }, + { + "auxiliary_loss_clip": 0.01102022, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.01646245, + "balance_loss_mlp": 1.03658414, + "epoch": 0.8777393656996844, + "flos": 15626169354240.0, + "grad_norm": 2.3090049722541095, + "language_loss": 0.77496958, + "learning_rate": 1.5469086349476036e-07, + "loss": 0.79627085, + "num_input_tokens_seen": 314696260, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14599, + "time_per_iteration": 2.409959077835083 + }, + { + "auxiliary_loss_clip": 0.01099573, + "auxiliary_loss_mlp": 0.01029278, + "balance_loss_clip": 1.01803648, + "balance_loss_mlp": 1.03392327, + "epoch": 0.8777994889523523, + "flos": 18879999701760.0, + "grad_norm": 2.428745416701903, + "language_loss": 0.67349386, + "learning_rate": 1.545407113589332e-07, + "loss": 0.69478238, + "num_input_tokens_seen": 314714215, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14600, + "time_per_iteration": 2.474609613418579 + }, + { + "auxiliary_loss_clip": 0.01100255, + "auxiliary_loss_mlp": 0.01035078, + "balance_loss_clip": 1.02363992, + "balance_loss_mlp": 1.03416967, + "epoch": 0.8778596122050203, + "flos": 48826516400640.0, + "grad_norm": 2.092434255676991, + "language_loss": 0.69479287, + "learning_rate": 1.543906292031072e-07, + "loss": 0.71614623, + "num_input_tokens_seen": 314735700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14601, + "time_per_iteration": 4.140269994735718 + }, + { + "auxiliary_loss_clip": 0.01103745, + "auxiliary_loss_mlp": 0.01027151, + "balance_loss_clip": 1.01555753, + "balance_loss_mlp": 1.03566706, + "epoch": 0.8779197354576883, + "flos": 25660184883840.0, + "grad_norm": 1.9032016370859126, + "language_loss": 0.73216182, + "learning_rate": 1.542406170329733e-07, + "loss": 0.75347078, + "num_input_tokens_seen": 314753335, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 14602, + "time_per_iteration": 2.48760986328125 + }, + { + "auxiliary_loss_clip": 0.01098268, + "auxiliary_loss_mlp": 0.01030865, + "balance_loss_clip": 1.02021384, + "balance_loss_mlp": 1.03420591, + "epoch": 0.8779798587103562, + "flos": 18843227153280.0, + "grad_norm": 1.8928349541350598, + "language_loss": 0.71194154, + "learning_rate": 1.5409067485422056e-07, + "loss": 0.73323286, + "num_input_tokens_seen": 314770800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 14603, + "time_per_iteration": 2.4426493644714355 + }, + { + "auxiliary_loss_clip": 0.01022978, + "auxiliary_loss_mlp": 0.01002674, + "balance_loss_clip": 1.00168419, + "balance_loss_mlp": 1.00285864, + "epoch": 0.8780399819630242, + "flos": 68613119377920.0, + "grad_norm": 0.7364725724275261, + "language_loss": 0.54201496, + "learning_rate": 1.539408026725344e-07, + "loss": 0.56227148, + "num_input_tokens_seen": 314837275, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 14604, + "time_per_iteration": 3.0395615100860596 + }, + { + "auxiliary_loss_clip": 0.01022902, + "auxiliary_loss_mlp": 0.01002151, + "balance_loss_clip": 1.0011977, + "balance_loss_mlp": 1.00290179, + "epoch": 0.8781001052156922, + "flos": 65734807766400.0, + "grad_norm": 0.7228284416194825, + "language_loss": 0.59237391, + "learning_rate": 1.537910004935976e-07, + "loss": 0.61262447, + "num_input_tokens_seen": 314902220, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 14605, + "time_per_iteration": 3.035781145095825 + }, + { + "auxiliary_loss_clip": 0.01101512, + "auxiliary_loss_mlp": 0.01033244, + "balance_loss_clip": 1.02125812, + "balance_loss_mlp": 1.03448224, + "epoch": 0.8781602284683602, + "flos": 22049654526720.0, + "grad_norm": 1.5739361881333696, + "language_loss": 0.85203683, + "learning_rate": 1.536412683230912e-07, + "loss": 0.87338436, + "num_input_tokens_seen": 314921645, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 14606, + "time_per_iteration": 2.4679386615753174 + }, + { + "auxiliary_loss_clip": 0.01102154, + "auxiliary_loss_mlp": 0.01027184, + "balance_loss_clip": 1.01512599, + "balance_loss_mlp": 1.03583789, + "epoch": 0.8782203517210281, + "flos": 17562939713280.0, + "grad_norm": 2.14242469072768, + "language_loss": 0.70639741, + "learning_rate": 1.534916061666931e-07, + "loss": 0.72769076, + "num_input_tokens_seen": 314939390, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 14607, + "time_per_iteration": 2.429849147796631 + }, + { + "auxiliary_loss_clip": 0.0109832, + "auxiliary_loss_mlp": 0.01034086, + "balance_loss_clip": 1.02337539, + "balance_loss_mlp": 1.03447664, + "epoch": 0.8782804749736961, + "flos": 25520421064320.0, + "grad_norm": 1.7322326558038397, + "language_loss": 0.71684766, + "learning_rate": 1.533420140300785e-07, + "loss": 0.7381717, + "num_input_tokens_seen": 314959205, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 14608, + "time_per_iteration": 2.4912965297698975 + }, + { + "auxiliary_loss_clip": 0.01103002, + "auxiliary_loss_mlp": 0.01034288, + "balance_loss_clip": 1.02251017, + "balance_loss_mlp": 1.03509843, + "epoch": 0.878340598226364, + "flos": 21798747048960.0, + "grad_norm": 2.337485246966266, + "language_loss": 0.87112725, + "learning_rate": 1.5319249191891936e-07, + "loss": 0.89250016, + "num_input_tokens_seen": 314977485, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 14609, + "time_per_iteration": 2.458808660507202 + }, + { + "auxiliary_loss_clip": 0.01099988, + "auxiliary_loss_mlp": 0.01028854, + "balance_loss_clip": 1.01741576, + "balance_loss_mlp": 1.03460443, + "epoch": 0.878400721479032, + "flos": 21102403011840.0, + "grad_norm": 1.603979145796894, + "language_loss": 0.7021966, + "learning_rate": 1.5304303983888643e-07, + "loss": 0.72348499, + "num_input_tokens_seen": 314997830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14610, + "time_per_iteration": 2.4685328006744385 + }, + { + "auxiliary_loss_clip": 0.01099125, + "auxiliary_loss_mlp": 0.01030436, + "balance_loss_clip": 1.01932585, + "balance_loss_mlp": 1.03569698, + "epoch": 0.8784608447316999, + "flos": 20923532259840.0, + "grad_norm": 4.627089606840685, + "language_loss": 0.80114305, + "learning_rate": 1.5289365779564612e-07, + "loss": 0.82243866, + "num_input_tokens_seen": 315016480, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14611, + "time_per_iteration": 2.442768096923828 + }, + { + "auxiliary_loss_clip": 0.01100873, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.02121592, + "balance_loss_mlp": 1.03429496, + "epoch": 0.878520967984368, + "flos": 23330660238720.0, + "grad_norm": 1.6363638945337065, + "language_loss": 0.76340765, + "learning_rate": 1.5274434579486338e-07, + "loss": 0.78474426, + "num_input_tokens_seen": 315036135, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 14612, + "time_per_iteration": 2.4471793174743652 + }, + { + "auxiliary_loss_clip": 0.01098814, + "auxiliary_loss_mlp": 0.01034029, + "balance_loss_clip": 1.02276969, + "balance_loss_mlp": 1.03435552, + "epoch": 0.8785810912370359, + "flos": 25518984520320.0, + "grad_norm": 1.4298080902663715, + "language_loss": 0.72504056, + "learning_rate": 1.525951038422002e-07, + "loss": 0.746369, + "num_input_tokens_seen": 315057995, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 14613, + "time_per_iteration": 2.4921391010284424 + }, + { + "auxiliary_loss_clip": 0.01022277, + "auxiliary_loss_mlp": 0.01002009, + "balance_loss_clip": 1.00101399, + "balance_loss_mlp": 1.0023061, + "epoch": 0.8786412144897039, + "flos": 61841047691520.0, + "grad_norm": 1.04250108997431, + "language_loss": 0.64641011, + "learning_rate": 1.5244593194331667e-07, + "loss": 0.66665304, + "num_input_tokens_seen": 315104010, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 14614, + "time_per_iteration": 2.8672502040863037 + }, + { + "auxiliary_loss_clip": 0.01022982, + "auxiliary_loss_mlp": 0.01001073, + "balance_loss_clip": 1.0000422, + "balance_loss_mlp": 1.00285435, + "epoch": 0.8787013377423719, + "flos": 70989364638720.0, + "grad_norm": 0.6615602022386656, + "language_loss": 0.58617866, + "learning_rate": 1.5229683010386762e-07, + "loss": 0.60641921, + "num_input_tokens_seen": 315174550, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 14615, + "time_per_iteration": 3.120760917663574 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.01031525, + "balance_loss_clip": 1.02014589, + "balance_loss_mlp": 1.03304863, + "epoch": 0.8787614609950398, + "flos": 17347404153600.0, + "grad_norm": 2.277379368567329, + "language_loss": 0.7279399, + "learning_rate": 1.5214779832950807e-07, + "loss": 0.74924493, + "num_input_tokens_seen": 315191825, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 14616, + "time_per_iteration": 2.4196126461029053 + }, + { + "auxiliary_loss_clip": 0.01022719, + "auxiliary_loss_mlp": 0.01001804, + "balance_loss_clip": 1.00076103, + "balance_loss_mlp": 1.0026381, + "epoch": 0.8788215842477078, + "flos": 72511401588480.0, + "grad_norm": 0.8027403534431957, + "language_loss": 0.57973462, + "learning_rate": 1.5199883662588953e-07, + "loss": 0.59997988, + "num_input_tokens_seen": 315255075, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20117188, + "step": 14617, + "time_per_iteration": 3.1586780548095703 + }, + { + "auxiliary_loss_clip": 0.01096253, + "auxiliary_loss_mlp": 0.0102978, + "balance_loss_clip": 1.01859808, + "balance_loss_mlp": 1.03347445, + "epoch": 0.8788817075003758, + "flos": 24827452905600.0, + "grad_norm": 1.9129594461835326, + "language_loss": 0.83026248, + "learning_rate": 1.5184994499865987e-07, + "loss": 0.8515228, + "num_input_tokens_seen": 315273995, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62890625, + "step": 14618, + "time_per_iteration": 2.5385875701904297 + }, + { + "auxiliary_loss_clip": 0.01094322, + "auxiliary_loss_mlp": 0.0102493, + "balance_loss_clip": 1.01411796, + "balance_loss_mlp": 1.03331971, + "epoch": 0.8789418307530438, + "flos": 22638769488000.0, + "grad_norm": 1.5097664240829207, + "language_loss": 0.69104743, + "learning_rate": 1.5170112345346598e-07, + "loss": 0.71223986, + "num_input_tokens_seen": 315294485, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.609375, + "step": 14619, + "time_per_iteration": 2.459087610244751 + }, + { + "auxiliary_loss_clip": 0.01101429, + "auxiliary_loss_mlp": 0.01031966, + "balance_loss_clip": 1.02081394, + "balance_loss_mlp": 1.03381336, + "epoch": 0.8790019540057117, + "flos": 19785738072960.0, + "grad_norm": 2.822522810502768, + "language_loss": 0.77135247, + "learning_rate": 1.5155237199595016e-07, + "loss": 0.7926864, + "num_input_tokens_seen": 315310420, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6796875, + "step": 14620, + "time_per_iteration": 2.4514245986938477 + }, + { + "auxiliary_loss_clip": 0.01101357, + "auxiliary_loss_mlp": 0.01030492, + "balance_loss_clip": 1.01795721, + "balance_loss_mlp": 1.03538573, + "epoch": 0.8790620772583797, + "flos": 20229774001920.0, + "grad_norm": 1.663844262033778, + "language_loss": 0.79417694, + "learning_rate": 1.514036906317542e-07, + "loss": 0.81549543, + "num_input_tokens_seen": 315330110, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.66015625, + "step": 14621, + "time_per_iteration": 2.4423892498016357 + }, + { + "auxiliary_loss_clip": 0.0110234, + "auxiliary_loss_mlp": 0.01034194, + "balance_loss_clip": 1.02238619, + "balance_loss_mlp": 1.03448009, + "epoch": 0.8791222005110476, + "flos": 24130785646080.0, + "grad_norm": 3.159156225679449, + "language_loss": 0.66855097, + "learning_rate": 1.5125507936651506e-07, + "loss": 0.68991637, + "num_input_tokens_seen": 315350080, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 14622, + "time_per_iteration": 2.476047992706299 + }, + { + "auxiliary_loss_clip": 0.01099562, + "auxiliary_loss_mlp": 0.01033553, + "balance_loss_clip": 1.02223372, + "balance_loss_mlp": 1.03490078, + "epoch": 0.8791823237637156, + "flos": 21614201948160.0, + "grad_norm": 1.928437907767961, + "language_loss": 0.7306127, + "learning_rate": 1.511065382058687e-07, + "loss": 0.75194383, + "num_input_tokens_seen": 315366360, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14623, + "time_per_iteration": 2.4311418533325195 + }, + { + "auxiliary_loss_clip": 0.01094016, + "auxiliary_loss_mlp": 0.01027822, + "balance_loss_clip": 1.01631272, + "balance_loss_mlp": 1.03009653, + "epoch": 0.8792424470163835, + "flos": 24243401761920.0, + "grad_norm": 2.38983757002019, + "language_loss": 0.7877636, + "learning_rate": 1.5095806715544801e-07, + "loss": 0.80898196, + "num_input_tokens_seen": 315385890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14624, + "time_per_iteration": 2.454042911529541 + }, + { + "auxiliary_loss_clip": 0.01101284, + "auxiliary_loss_mlp": 0.01034601, + "balance_loss_clip": 1.02183962, + "balance_loss_mlp": 1.03443372, + "epoch": 0.8793025702690516, + "flos": 24893204751360.0, + "grad_norm": 1.8092429403664327, + "language_loss": 0.79949045, + "learning_rate": 1.5080966622088265e-07, + "loss": 0.82084924, + "num_input_tokens_seen": 315403400, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 14625, + "time_per_iteration": 2.468273162841797 + }, + { + "auxiliary_loss_clip": 0.0109769, + "auxiliary_loss_mlp": 0.01035432, + "balance_loss_clip": 1.02404141, + "balance_loss_mlp": 1.03421533, + "epoch": 0.8793626935217195, + "flos": 25373115388800.0, + "grad_norm": 1.5248039405133302, + "language_loss": 0.74116158, + "learning_rate": 1.5066133540779967e-07, + "loss": 0.76249278, + "num_input_tokens_seen": 315423670, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 14626, + "time_per_iteration": 2.5007894039154053 + }, + { + "auxiliary_loss_clip": 0.01100657, + "auxiliary_loss_mlp": 0.01030005, + "balance_loss_clip": 1.01841187, + "balance_loss_mlp": 1.03324091, + "epoch": 0.8794228167743875, + "flos": 34678000742400.0, + "grad_norm": 1.5472909001044985, + "language_loss": 0.7117843, + "learning_rate": 1.505130747218246e-07, + "loss": 0.73309094, + "num_input_tokens_seen": 315446265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 14627, + "time_per_iteration": 2.572488784790039 + }, + { + "auxiliary_loss_clip": 0.0109865, + "auxiliary_loss_mlp": 0.01026703, + "balance_loss_clip": 1.01455545, + "balance_loss_mlp": 1.0333581, + "epoch": 0.8794829400270555, + "flos": 19464014931840.0, + "grad_norm": 1.7161031145560457, + "language_loss": 0.72222739, + "learning_rate": 1.5036488416857873e-07, + "loss": 0.74348092, + "num_input_tokens_seen": 315464655, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 14628, + "time_per_iteration": 2.5836756229400635 + }, + { + "auxiliary_loss_clip": 0.01099882, + "auxiliary_loss_mlp": 0.0103277, + "balance_loss_clip": 1.02062297, + "balance_loss_mlp": 1.0343008, + "epoch": 0.8795430632797234, + "flos": 15231403906560.0, + "grad_norm": 2.974240277215887, + "language_loss": 0.69140917, + "learning_rate": 1.5021676375368175e-07, + "loss": 0.71273565, + "num_input_tokens_seen": 315481090, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 14629, + "time_per_iteration": 2.5673904418945312 + }, + { + "auxiliary_loss_clip": 0.01095341, + "auxiliary_loss_mlp": 0.01029354, + "balance_loss_clip": 1.01881027, + "balance_loss_mlp": 1.03196287, + "epoch": 0.8796031865323914, + "flos": 27744727795200.0, + "grad_norm": 1.611077120019427, + "language_loss": 0.68476737, + "learning_rate": 1.5006871348275053e-07, + "loss": 0.70601434, + "num_input_tokens_seen": 315502010, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14630, + "time_per_iteration": 2.6186506748199463 + }, + { + "auxiliary_loss_clip": 0.0109541, + "auxiliary_loss_mlp": 0.01032622, + "balance_loss_clip": 1.02080238, + "balance_loss_mlp": 1.03272772, + "epoch": 0.8796633097850594, + "flos": 31285412156160.0, + "grad_norm": 1.5868483817753165, + "language_loss": 0.74161929, + "learning_rate": 1.499207333613999e-07, + "loss": 0.76289958, + "num_input_tokens_seen": 315523040, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.62890625, + "step": 14631, + "time_per_iteration": 2.5261404514312744 + }, + { + "auxiliary_loss_clip": 0.01096064, + "auxiliary_loss_mlp": 0.01031196, + "balance_loss_clip": 1.01977563, + "balance_loss_mlp": 1.03400874, + "epoch": 0.8797234330377274, + "flos": 24243150366720.0, + "grad_norm": 3.2568719611534367, + "language_loss": 0.69245052, + "learning_rate": 1.4977282339523954e-07, + "loss": 0.71372306, + "num_input_tokens_seen": 315541865, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62109375, + "step": 14632, + "time_per_iteration": 2.5176477432250977 + }, + { + "auxiliary_loss_clip": 0.01100067, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.01921034, + "balance_loss_mlp": 1.03524411, + "epoch": 0.8797835562903953, + "flos": 24167414540160.0, + "grad_norm": 1.850853820165369, + "language_loss": 0.64914048, + "learning_rate": 1.4962498358987929e-07, + "loss": 0.67043972, + "num_input_tokens_seen": 315561470, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 14633, + "time_per_iteration": 2.4824862480163574 + }, + { + "auxiliary_loss_clip": 0.01099036, + "auxiliary_loss_mlp": 0.01028782, + "balance_loss_clip": 1.01750481, + "balance_loss_mlp": 1.03487611, + "epoch": 0.8798436795430633, + "flos": 19284677303040.0, + "grad_norm": 1.9427253459793778, + "language_loss": 0.84233886, + "learning_rate": 1.4947721395092528e-07, + "loss": 0.863617, + "num_input_tokens_seen": 315583140, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14634, + "time_per_iteration": 2.507662296295166 + }, + { + "auxiliary_loss_clip": 0.01098176, + "auxiliary_loss_mlp": 0.01033773, + "balance_loss_clip": 1.02172065, + "balance_loss_mlp": 1.03326917, + "epoch": 0.8799038027957312, + "flos": 28179390274560.0, + "grad_norm": 1.624999412109894, + "language_loss": 0.79993856, + "learning_rate": 1.4932951448398056e-07, + "loss": 0.82125807, + "num_input_tokens_seen": 315601935, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 14635, + "time_per_iteration": 2.518354654312134 + }, + { + "auxiliary_loss_clip": 0.01099052, + "auxiliary_loss_mlp": 0.01024128, + "balance_loss_clip": 1.01243329, + "balance_loss_mlp": 1.03359151, + "epoch": 0.8799639260483992, + "flos": 24644703484800.0, + "grad_norm": 1.8321760419089794, + "language_loss": 0.65398335, + "learning_rate": 1.4918188519464648e-07, + "loss": 0.67521518, + "num_input_tokens_seen": 315619995, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14636, + "time_per_iteration": 2.479426860809326 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01034558, + "balance_loss_clip": 1.02273285, + "balance_loss_mlp": 1.03467786, + "epoch": 0.8800240493010671, + "flos": 22200479735040.0, + "grad_norm": 1.6657689764280696, + "language_loss": 0.7029084, + "learning_rate": 1.4903432608852074e-07, + "loss": 0.72426283, + "num_input_tokens_seen": 315637895, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14637, + "time_per_iteration": 3.868614912033081 + }, + { + "auxiliary_loss_clip": 0.01101928, + "auxiliary_loss_mlp": 0.01029648, + "balance_loss_clip": 1.01857972, + "balance_loss_mlp": 1.03736019, + "epoch": 0.8800841725537352, + "flos": 14246086953600.0, + "grad_norm": 2.8684329646632407, + "language_loss": 0.66271627, + "learning_rate": 1.4888683717119843e-07, + "loss": 0.68403208, + "num_input_tokens_seen": 315655520, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14638, + "time_per_iteration": 5.274388551712036 + }, + { + "auxiliary_loss_clip": 0.01101226, + "auxiliary_loss_mlp": 0.01029099, + "balance_loss_clip": 1.01731563, + "balance_loss_mlp": 1.03512883, + "epoch": 0.8801442958064031, + "flos": 37415794348800.0, + "grad_norm": 1.846776062468507, + "language_loss": 0.58106345, + "learning_rate": 1.4873941844827286e-07, + "loss": 0.60236669, + "num_input_tokens_seen": 315678955, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 14639, + "time_per_iteration": 2.5819764137268066 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01037199, + "balance_loss_clip": 1.02529562, + "balance_loss_mlp": 1.03383088, + "epoch": 0.8802044190590711, + "flos": 25047334010880.0, + "grad_norm": 1.3914179808577423, + "language_loss": 0.7458142, + "learning_rate": 1.4859206992533402e-07, + "loss": 0.76718146, + "num_input_tokens_seen": 315700360, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 14640, + "time_per_iteration": 2.521860361099243 + }, + { + "auxiliary_loss_clip": 0.01099653, + "auxiliary_loss_mlp": 0.01036259, + "balance_loss_clip": 1.0246067, + "balance_loss_mlp": 1.03402758, + "epoch": 0.8802645423117391, + "flos": 24133874215680.0, + "grad_norm": 1.9721181093875695, + "language_loss": 0.6971339, + "learning_rate": 1.4844479160796985e-07, + "loss": 0.71849298, + "num_input_tokens_seen": 315719270, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 14641, + "time_per_iteration": 2.465099811553955 + }, + { + "auxiliary_loss_clip": 0.01101581, + "auxiliary_loss_mlp": 0.01025603, + "balance_loss_clip": 1.01319361, + "balance_loss_mlp": 1.03489089, + "epoch": 0.880324665564407, + "flos": 17931203902080.0, + "grad_norm": 2.100901635166447, + "language_loss": 0.84755206, + "learning_rate": 1.4829758350176457e-07, + "loss": 0.86882389, + "num_input_tokens_seen": 315737425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 14642, + "time_per_iteration": 2.4675049781799316 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.0103406, + "balance_loss_clip": 1.02078056, + "balance_loss_mlp": 1.0357337, + "epoch": 0.880384788817075, + "flos": 21287630471040.0, + "grad_norm": 1.6823727601276586, + "language_loss": 0.78799748, + "learning_rate": 1.4815044561230038e-07, + "loss": 0.80934626, + "num_input_tokens_seen": 315755725, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6484375, + "step": 14643, + "time_per_iteration": 3.872709274291992 + }, + { + "auxiliary_loss_clip": 0.01094296, + "auxiliary_loss_mlp": 0.01023704, + "balance_loss_clip": 1.01277804, + "balance_loss_mlp": 1.03229833, + "epoch": 0.880444912069743, + "flos": 12458489777280.0, + "grad_norm": 1.6378114000618107, + "language_loss": 0.73273623, + "learning_rate": 1.4800337794515705e-07, + "loss": 0.75391626, + "num_input_tokens_seen": 315773835, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62109375, + "step": 14644, + "time_per_iteration": 2.4105916023254395 + }, + { + "auxiliary_loss_clip": 0.01102632, + "auxiliary_loss_mlp": 0.01033481, + "balance_loss_clip": 1.02102923, + "balance_loss_mlp": 1.0351516, + "epoch": 0.880505035322411, + "flos": 13625945619840.0, + "grad_norm": 1.7663467808717348, + "language_loss": 0.79154408, + "learning_rate": 1.47856380505911e-07, + "loss": 0.81290519, + "num_input_tokens_seen": 315790615, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.67578125, + "step": 14645, + "time_per_iteration": 2.4135661125183105 + }, + { + "auxiliary_loss_clip": 0.01097489, + "auxiliary_loss_mlp": 0.01032803, + "balance_loss_clip": 1.02140081, + "balance_loss_mlp": 1.03396201, + "epoch": 0.8805651585750789, + "flos": 23183067254400.0, + "grad_norm": 1.8153463800706586, + "language_loss": 0.64348304, + "learning_rate": 1.477094533001364e-07, + "loss": 0.66478598, + "num_input_tokens_seen": 315811010, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 14646, + "time_per_iteration": 2.510627031326294 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.01033554, + "balance_loss_clip": 1.02099562, + "balance_loss_mlp": 1.0349977, + "epoch": 0.8806252818277469, + "flos": 14903000835840.0, + "grad_norm": 2.0987215811216617, + "language_loss": 0.77177233, + "learning_rate": 1.475625963334055e-07, + "loss": 0.79314315, + "num_input_tokens_seen": 315828130, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 14647, + "time_per_iteration": 2.455548048019409 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.01025889, + "balance_loss_clip": 1.01538038, + "balance_loss_mlp": 1.03470957, + "epoch": 0.8806854050804148, + "flos": 17639178330240.0, + "grad_norm": 2.016380726471692, + "language_loss": 0.75440037, + "learning_rate": 1.4741580961128652e-07, + "loss": 0.77563667, + "num_input_tokens_seen": 315844900, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 14648, + "time_per_iteration": 2.4378833770751953 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01028659, + "balance_loss_clip": 1.01756096, + "balance_loss_mlp": 1.03212929, + "epoch": 0.8807455283330828, + "flos": 25332392344320.0, + "grad_norm": 1.800540393972122, + "language_loss": 0.65671074, + "learning_rate": 1.4726909313934522e-07, + "loss": 0.67798209, + "num_input_tokens_seen": 315863745, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 14649, + "time_per_iteration": 2.475167751312256 + }, + { + "auxiliary_loss_clip": 0.01099588, + "auxiliary_loss_mlp": 0.01027763, + "balance_loss_clip": 1.01583624, + "balance_loss_mlp": 1.0346787, + "epoch": 0.8808056515857507, + "flos": 25265168040960.0, + "grad_norm": 1.3656118793180194, + "language_loss": 0.62488627, + "learning_rate": 1.4712244692314578e-07, + "loss": 0.64615977, + "num_input_tokens_seen": 315885765, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14650, + "time_per_iteration": 2.5512006282806396 + }, + { + "auxiliary_loss_clip": 0.01097299, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.0164454, + "balance_loss_mlp": 1.03367639, + "epoch": 0.8808657748384188, + "flos": 26578852151040.0, + "grad_norm": 1.4406035911572534, + "language_loss": 0.72946811, + "learning_rate": 1.4697587096824914e-07, + "loss": 0.75071305, + "num_input_tokens_seen": 315907340, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 14651, + "time_per_iteration": 2.4996044635772705 + }, + { + "auxiliary_loss_clip": 0.01100922, + "auxiliary_loss_mlp": 0.01029237, + "balance_loss_clip": 1.01687539, + "balance_loss_mlp": 1.03476334, + "epoch": 0.8809258980910867, + "flos": 18661231918080.0, + "grad_norm": 1.8309456518372134, + "language_loss": 0.72026336, + "learning_rate": 1.4682936528021284e-07, + "loss": 0.74156499, + "num_input_tokens_seen": 315924935, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66015625, + "step": 14652, + "time_per_iteration": 2.432687282562256 + }, + { + "auxiliary_loss_clip": 0.01097085, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.01948547, + "balance_loss_mlp": 1.0323956, + "epoch": 0.8809860213437547, + "flos": 19792274348160.0, + "grad_norm": 1.9095636573568913, + "language_loss": 0.74354553, + "learning_rate": 1.4668292986459286e-07, + "loss": 0.76482618, + "num_input_tokens_seen": 315943165, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14653, + "time_per_iteration": 2.4471185207366943 + }, + { + "auxiliary_loss_clip": 0.01101564, + "auxiliary_loss_mlp": 0.010261, + "balance_loss_clip": 1.0141021, + "balance_loss_mlp": 1.03399158, + "epoch": 0.8810461445964227, + "flos": 17894467267200.0, + "grad_norm": 1.6655759568557502, + "language_loss": 0.71326327, + "learning_rate": 1.465365647269421e-07, + "loss": 0.73453987, + "num_input_tokens_seen": 315961340, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 14654, + "time_per_iteration": 2.463416576385498 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01034774, + "balance_loss_clip": 1.02197719, + "balance_loss_mlp": 1.03539014, + "epoch": 0.8811062678490906, + "flos": 29163917128320.0, + "grad_norm": 2.5564410851677284, + "language_loss": 0.71378338, + "learning_rate": 1.4639026987281012e-07, + "loss": 0.73513222, + "num_input_tokens_seen": 315981335, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6484375, + "step": 14655, + "time_per_iteration": 2.506082057952881 + }, + { + "auxiliary_loss_clip": 0.01099171, + "auxiliary_loss_mlp": 0.01031333, + "balance_loss_clip": 1.01999068, + "balance_loss_mlp": 1.03464365, + "epoch": 0.8811663911017587, + "flos": 20338834671360.0, + "grad_norm": 1.5670673465427962, + "language_loss": 0.8118304, + "learning_rate": 1.462440453077449e-07, + "loss": 0.83313543, + "num_input_tokens_seen": 316001325, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 14656, + "time_per_iteration": 2.4655163288116455 + }, + { + "auxiliary_loss_clip": 0.01100032, + "auxiliary_loss_mlp": 0.01028991, + "balance_loss_clip": 1.01789308, + "balance_loss_mlp": 1.03457642, + "epoch": 0.8812265143544266, + "flos": 25885704424320.0, + "grad_norm": 1.7915054881037722, + "language_loss": 0.68660492, + "learning_rate": 1.460978910372914e-07, + "loss": 0.70789516, + "num_input_tokens_seen": 316022540, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14657, + "time_per_iteration": 2.478731393814087 + }, + { + "auxiliary_loss_clip": 0.01101157, + "auxiliary_loss_mlp": 0.01031358, + "balance_loss_clip": 1.0200932, + "balance_loss_mlp": 1.03532481, + "epoch": 0.8812866376070946, + "flos": 27195509865600.0, + "grad_norm": 1.9296351990440808, + "language_loss": 0.83915722, + "learning_rate": 1.4595180706699207e-07, + "loss": 0.86048234, + "num_input_tokens_seen": 316037735, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 14658, + "time_per_iteration": 2.4841041564941406 + }, + { + "auxiliary_loss_clip": 0.01106037, + "auxiliary_loss_mlp": 0.01034789, + "balance_loss_clip": 1.02234912, + "balance_loss_mlp": 1.03650165, + "epoch": 0.8813467608597625, + "flos": 23807194997760.0, + "grad_norm": 2.487290321183497, + "language_loss": 0.77357286, + "learning_rate": 1.4580579340238554e-07, + "loss": 0.79498112, + "num_input_tokens_seen": 316058105, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 14659, + "time_per_iteration": 2.461486577987671 + }, + { + "auxiliary_loss_clip": 0.01099162, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.02099383, + "balance_loss_mlp": 1.03384209, + "epoch": 0.8814068841124305, + "flos": 21105455667840.0, + "grad_norm": 1.8826679554051244, + "language_loss": 0.60173553, + "learning_rate": 1.4565985004900894e-07, + "loss": 0.62305564, + "num_input_tokens_seen": 316074415, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65625, + "step": 14660, + "time_per_iteration": 2.464398145675659 + }, + { + "auxiliary_loss_clip": 0.01100447, + "auxiliary_loss_mlp": 0.01038937, + "balance_loss_clip": 1.02622342, + "balance_loss_mlp": 1.0339849, + "epoch": 0.8814670073650984, + "flos": 24716991605760.0, + "grad_norm": 1.7496119170804572, + "language_loss": 0.78005695, + "learning_rate": 1.455139770123972e-07, + "loss": 0.80145085, + "num_input_tokens_seen": 316094405, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6640625, + "step": 14661, + "time_per_iteration": 2.498229742050171 + }, + { + "auxiliary_loss_clip": 0.01102652, + "auxiliary_loss_mlp": 0.01042094, + "balance_loss_clip": 1.02973175, + "balance_loss_mlp": 1.03629279, + "epoch": 0.8815271306177664, + "flos": 22966274718720.0, + "grad_norm": 1.6809136256022188, + "language_loss": 0.76650071, + "learning_rate": 1.45368174298081e-07, + "loss": 0.78794813, + "num_input_tokens_seen": 316113390, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 14662, + "time_per_iteration": 2.4708175659179688 + }, + { + "auxiliary_loss_clip": 0.01097442, + "auxiliary_loss_mlp": 0.01024383, + "balance_loss_clip": 1.01413739, + "balance_loss_mlp": 1.03349352, + "epoch": 0.8815872538704344, + "flos": 19460064435840.0, + "grad_norm": 1.798644895415272, + "language_loss": 0.74030846, + "learning_rate": 1.4522244191158929e-07, + "loss": 0.7615267, + "num_input_tokens_seen": 316131085, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.640625, + "step": 14663, + "time_per_iteration": 2.4278945922851562 + }, + { + "auxiliary_loss_clip": 0.01099102, + "auxiliary_loss_mlp": 0.01032819, + "balance_loss_clip": 1.02133894, + "balance_loss_mlp": 1.03406262, + "epoch": 0.8816473771231024, + "flos": 32156604622080.0, + "grad_norm": 1.5255495118497213, + "language_loss": 0.69844538, + "learning_rate": 1.450767798584489e-07, + "loss": 0.71976459, + "num_input_tokens_seen": 316151440, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14664, + "time_per_iteration": 2.5401558876037598 + }, + { + "auxiliary_loss_clip": 0.01095808, + "auxiliary_loss_mlp": 0.01032478, + "balance_loss_clip": 1.02217281, + "balance_loss_mlp": 1.0326488, + "epoch": 0.8817075003757703, + "flos": 19682279925120.0, + "grad_norm": 1.6618340799820441, + "language_loss": 0.81018615, + "learning_rate": 1.449311881441828e-07, + "loss": 0.83146906, + "num_input_tokens_seen": 316170750, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6328125, + "step": 14665, + "time_per_iteration": 2.433636426925659 + }, + { + "auxiliary_loss_clip": 0.01101369, + "auxiliary_loss_mlp": 0.01030385, + "balance_loss_clip": 1.01962006, + "balance_loss_mlp": 1.03590393, + "epoch": 0.8817676236284383, + "flos": 15668616251520.0, + "grad_norm": 1.9840035014600133, + "language_loss": 0.58445227, + "learning_rate": 1.447856667743117e-07, + "loss": 0.60576975, + "num_input_tokens_seen": 316187265, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 14666, + "time_per_iteration": 2.4269118309020996 + }, + { + "auxiliary_loss_clip": 0.01102004, + "auxiliary_loss_mlp": 0.01031848, + "balance_loss_clip": 1.0194447, + "balance_loss_mlp": 1.03639972, + "epoch": 0.8818277468811063, + "flos": 17895185539200.0, + "grad_norm": 1.9152368357070615, + "language_loss": 0.8380903, + "learning_rate": 1.4464021575435403e-07, + "loss": 0.85942888, + "num_input_tokens_seen": 316206555, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 14667, + "time_per_iteration": 2.4528279304504395 + }, + { + "auxiliary_loss_clip": 0.01098974, + "auxiliary_loss_mlp": 0.01030246, + "balance_loss_clip": 1.01812816, + "balance_loss_mlp": 1.03405619, + "epoch": 0.8818878701337742, + "flos": 18770508069120.0, + "grad_norm": 1.7252025562955478, + "language_loss": 0.62386823, + "learning_rate": 1.4449483508982563e-07, + "loss": 0.64516038, + "num_input_tokens_seen": 316225210, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 14668, + "time_per_iteration": 2.4459190368652344 + }, + { + "auxiliary_loss_clip": 0.01097923, + "auxiliary_loss_mlp": 0.01025692, + "balance_loss_clip": 1.0153687, + "balance_loss_mlp": 1.03508496, + "epoch": 0.8819479933864423, + "flos": 17712292464000.0, + "grad_norm": 2.1655760698377238, + "language_loss": 0.56931686, + "learning_rate": 1.4434952478623918e-07, + "loss": 0.59055305, + "num_input_tokens_seen": 316242685, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.62890625, + "step": 14669, + "time_per_iteration": 2.421549081802368 + }, + { + "auxiliary_loss_clip": 0.0109805, + "auxiliary_loss_mlp": 0.01031814, + "balance_loss_clip": 1.02056646, + "balance_loss_mlp": 1.03313446, + "epoch": 0.8820081166391102, + "flos": 11728749070080.0, + "grad_norm": 2.8416626474645454, + "language_loss": 0.70905107, + "learning_rate": 1.442042848491043e-07, + "loss": 0.73034966, + "num_input_tokens_seen": 316260935, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 14670, + "time_per_iteration": 2.4560811519622803 + }, + { + "auxiliary_loss_clip": 0.0109736, + "auxiliary_loss_mlp": 0.01029663, + "balance_loss_clip": 1.01844513, + "balance_loss_mlp": 1.03294659, + "epoch": 0.8820682398917782, + "flos": 27490372611840.0, + "grad_norm": 2.2004131158768034, + "language_loss": 0.73885584, + "learning_rate": 1.44059115283929e-07, + "loss": 0.76012611, + "num_input_tokens_seen": 316281190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14671, + "time_per_iteration": 2.4802374839782715 + }, + { + "auxiliary_loss_clip": 0.01101545, + "auxiliary_loss_mlp": 0.01026485, + "balance_loss_clip": 1.01405191, + "balance_loss_mlp": 1.03317046, + "epoch": 0.8821283631444461, + "flos": 16873850223360.0, + "grad_norm": 2.5737245171058007, + "language_loss": 0.847103, + "learning_rate": 1.43914016096218e-07, + "loss": 0.86838329, + "num_input_tokens_seen": 316297115, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 14672, + "time_per_iteration": 2.4168829917907715 + }, + { + "auxiliary_loss_clip": 0.01097209, + "auxiliary_loss_mlp": 0.01029095, + "balance_loss_clip": 1.01794291, + "balance_loss_mlp": 1.0340246, + "epoch": 0.8821884863971141, + "flos": 24280964409600.0, + "grad_norm": 1.5306407957172687, + "language_loss": 0.72456914, + "learning_rate": 1.4376898729147336e-07, + "loss": 0.74583215, + "num_input_tokens_seen": 316318235, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14673, + "time_per_iteration": 2.460655689239502 + }, + { + "auxiliary_loss_clip": 0.01021936, + "auxiliary_loss_mlp": 0.01000476, + "balance_loss_clip": 0.99948043, + "balance_loss_mlp": 1.00207949, + "epoch": 0.882248609649782, + "flos": 59432342492160.0, + "grad_norm": 0.8084320527661446, + "language_loss": 0.49390993, + "learning_rate": 1.4362402887519487e-07, + "loss": 0.51413405, + "num_input_tokens_seen": 316384705, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 14674, + "time_per_iteration": 3.162792682647705 + }, + { + "auxiliary_loss_clip": 0.01101068, + "auxiliary_loss_mlp": 0.01028636, + "balance_loss_clip": 1.01710272, + "balance_loss_mlp": 1.0344094, + "epoch": 0.88230873290245, + "flos": 19937784343680.0, + "grad_norm": 1.9780221467506172, + "language_loss": 0.76291561, + "learning_rate": 1.4347914085287971e-07, + "loss": 0.78421265, + "num_input_tokens_seen": 316401165, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 14675, + "time_per_iteration": 2.4549062252044678 + }, + { + "auxiliary_loss_clip": 0.010958, + "auxiliary_loss_mlp": 0.0103147, + "balance_loss_clip": 1.0201273, + "balance_loss_mlp": 1.03307641, + "epoch": 0.882368856155118, + "flos": 16362769559040.0, + "grad_norm": 1.806810226504955, + "language_loss": 0.79589498, + "learning_rate": 1.4333432323002105e-07, + "loss": 0.8171677, + "num_input_tokens_seen": 316418780, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.62890625, + "step": 14676, + "time_per_iteration": 2.4810338020324707 + }, + { + "auxiliary_loss_clip": 0.01022536, + "auxiliary_loss_mlp": 0.01002158, + "balance_loss_clip": 1.00122857, + "balance_loss_mlp": 1.00247622, + "epoch": 0.882428979407786, + "flos": 70594563277440.0, + "grad_norm": 0.6887169643192462, + "language_loss": 0.54792887, + "learning_rate": 1.431895760121109e-07, + "loss": 0.56817579, + "num_input_tokens_seen": 316482030, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20117188, + "step": 14677, + "time_per_iteration": 3.141437530517578 + }, + { + "auxiliary_loss_clip": 0.01095907, + "auxiliary_loss_mlp": 0.01027291, + "balance_loss_clip": 1.01610887, + "balance_loss_mlp": 1.0322262, + "epoch": 0.8824891026604539, + "flos": 18150294908160.0, + "grad_norm": 2.313090025905276, + "language_loss": 0.65397072, + "learning_rate": 1.4304489920463847e-07, + "loss": 0.67520267, + "num_input_tokens_seen": 316499175, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 14678, + "time_per_iteration": 3.9388959407806396 + }, + { + "auxiliary_loss_clip": 0.01102187, + "auxiliary_loss_mlp": 0.01031683, + "balance_loss_clip": 1.02029228, + "balance_loss_mlp": 1.03451753, + "epoch": 0.8825492259131219, + "flos": 27232713377280.0, + "grad_norm": 1.7618183642532588, + "language_loss": 0.71121728, + "learning_rate": 1.4290029281308936e-07, + "loss": 0.73255599, + "num_input_tokens_seen": 316519495, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.67578125, + "step": 14679, + "time_per_iteration": 2.5044422149658203 + }, + { + "auxiliary_loss_clip": 0.01097187, + "auxiliary_loss_mlp": 0.01029682, + "balance_loss_clip": 1.01925075, + "balance_loss_mlp": 1.03367972, + "epoch": 0.8826093491657898, + "flos": 22274419881600.0, + "grad_norm": 1.7004762448338653, + "language_loss": 0.6368348, + "learning_rate": 1.4275575684294694e-07, + "loss": 0.65810347, + "num_input_tokens_seen": 316538180, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 14680, + "time_per_iteration": 3.960117816925049 + }, + { + "auxiliary_loss_clip": 0.01099928, + "auxiliary_loss_mlp": 0.01032293, + "balance_loss_clip": 1.02081347, + "balance_loss_mlp": 1.03534234, + "epoch": 0.8826694724184578, + "flos": 14204753377920.0, + "grad_norm": 2.3286079869069423, + "language_loss": 0.77274716, + "learning_rate": 1.4261129129969328e-07, + "loss": 0.79406941, + "num_input_tokens_seen": 316551750, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14681, + "time_per_iteration": 2.396320343017578 + }, + { + "auxiliary_loss_clip": 0.01101169, + "auxiliary_loss_mlp": 0.01028851, + "balance_loss_clip": 1.01660836, + "balance_loss_mlp": 1.03466046, + "epoch": 0.8827295956711259, + "flos": 20631686256000.0, + "grad_norm": 1.7616668495516699, + "language_loss": 0.72610635, + "learning_rate": 1.424668961888047e-07, + "loss": 0.7474066, + "num_input_tokens_seen": 316570680, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 14682, + "time_per_iteration": 2.455319404602051 + }, + { + "auxiliary_loss_clip": 0.01104116, + "auxiliary_loss_mlp": 0.01031548, + "balance_loss_clip": 1.0181849, + "balance_loss_mlp": 1.0359025, + "epoch": 0.8827897189237938, + "flos": 18513064316160.0, + "grad_norm": 1.8948306470758551, + "language_loss": 0.74149251, + "learning_rate": 1.4232257151575765e-07, + "loss": 0.76284921, + "num_input_tokens_seen": 316588635, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.68359375, + "step": 14683, + "time_per_iteration": 2.4281208515167236 + }, + { + "auxiliary_loss_clip": 0.01100505, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.01920223, + "balance_loss_mlp": 1.03486133, + "epoch": 0.8828498421764618, + "flos": 22747399194240.0, + "grad_norm": 1.7117849118733992, + "language_loss": 0.65447652, + "learning_rate": 1.4217831728602492e-07, + "loss": 0.67579257, + "num_input_tokens_seen": 316607550, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 14684, + "time_per_iteration": 3.894663095474243 + }, + { + "auxiliary_loss_clip": 0.0109682, + "auxiliary_loss_mlp": 0.01025184, + "balance_loss_clip": 1.01400256, + "balance_loss_mlp": 1.03307378, + "epoch": 0.8829099654291297, + "flos": 15012384727680.0, + "grad_norm": 1.6699013852482991, + "language_loss": 0.69357675, + "learning_rate": 1.4203413350507677e-07, + "loss": 0.71479678, + "num_input_tokens_seen": 316624460, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 14685, + "time_per_iteration": 2.3994603157043457 + }, + { + "auxiliary_loss_clip": 0.01102745, + "auxiliary_loss_mlp": 0.01030639, + "balance_loss_clip": 1.01808035, + "balance_loss_mlp": 1.03550434, + "epoch": 0.8829700886817977, + "flos": 16720546976640.0, + "grad_norm": 1.7745914045293507, + "language_loss": 0.74189049, + "learning_rate": 1.418900201783806e-07, + "loss": 0.76322436, + "num_input_tokens_seen": 316640765, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 14686, + "time_per_iteration": 2.4151484966278076 + }, + { + "auxiliary_loss_clip": 0.01096349, + "auxiliary_loss_mlp": 0.01024315, + "balance_loss_clip": 1.01291823, + "balance_loss_mlp": 1.03275704, + "epoch": 0.8830302119344656, + "flos": 15263256291840.0, + "grad_norm": 2.0272125765642732, + "language_loss": 0.63428628, + "learning_rate": 1.417459773114007e-07, + "loss": 0.6554929, + "num_input_tokens_seen": 316656120, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.63671875, + "step": 14687, + "time_per_iteration": 2.4173166751861572 + }, + { + "auxiliary_loss_clip": 0.01100854, + "auxiliary_loss_mlp": 0.01032119, + "balance_loss_clip": 1.0205617, + "balance_loss_mlp": 1.03395879, + "epoch": 0.8830903351871336, + "flos": 28617751854720.0, + "grad_norm": 1.6968934046619368, + "language_loss": 0.68904001, + "learning_rate": 1.4160200490959984e-07, + "loss": 0.71036971, + "num_input_tokens_seen": 316676095, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 14688, + "time_per_iteration": 2.500330686569214 + }, + { + "auxiliary_loss_clip": 0.01096963, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.01433623, + "balance_loss_mlp": 1.033746, + "epoch": 0.8831504584398016, + "flos": 28001632844160.0, + "grad_norm": 1.493679343363792, + "language_loss": 0.67016995, + "learning_rate": 1.4145810297843697e-07, + "loss": 0.69139874, + "num_input_tokens_seen": 316696235, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6328125, + "step": 14689, + "time_per_iteration": 2.4815356731414795 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01028938, + "balance_loss_clip": 1.01803041, + "balance_loss_mlp": 1.03819656, + "epoch": 0.8832105816924696, + "flos": 26579642250240.0, + "grad_norm": 1.5915741944618107, + "language_loss": 0.74574995, + "learning_rate": 1.4131427152336905e-07, + "loss": 0.76705527, + "num_input_tokens_seen": 316719680, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 14690, + "time_per_iteration": 2.514997959136963 + }, + { + "auxiliary_loss_clip": 0.01099856, + "auxiliary_loss_mlp": 0.0103339, + "balance_loss_clip": 1.02113485, + "balance_loss_mlp": 1.03427589, + "epoch": 0.8832707049451375, + "flos": 24898771359360.0, + "grad_norm": 1.438089789703588, + "language_loss": 0.72641426, + "learning_rate": 1.4117051054985018e-07, + "loss": 0.74774671, + "num_input_tokens_seen": 316739830, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 14691, + "time_per_iteration": 2.4770781993865967 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.01801181, + "balance_loss_mlp": 1.03464651, + "epoch": 0.8833308281978055, + "flos": 15451141357440.0, + "grad_norm": 1.838759205957813, + "language_loss": 0.51184076, + "learning_rate": 1.4102682006333243e-07, + "loss": 0.53318036, + "num_input_tokens_seen": 316758105, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 14692, + "time_per_iteration": 2.426839828491211 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.01029759, + "balance_loss_clip": 1.01842189, + "balance_loss_mlp": 1.03562319, + "epoch": 0.8833909514504734, + "flos": 20301523418880.0, + "grad_norm": 2.5343795474068576, + "language_loss": 0.60240692, + "learning_rate": 1.4088320006926346e-07, + "loss": 0.62371796, + "num_input_tokens_seen": 316777455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14693, + "time_per_iteration": 2.427978992462158 + }, + { + "auxiliary_loss_clip": 0.01097522, + "auxiliary_loss_mlp": 0.01026791, + "balance_loss_clip": 1.01653326, + "balance_loss_mlp": 1.03563237, + "epoch": 0.8834510747031414, + "flos": 20374027021440.0, + "grad_norm": 1.4622609213581108, + "language_loss": 0.75340641, + "learning_rate": 1.407396505730898e-07, + "loss": 0.7746495, + "num_input_tokens_seen": 316796300, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62109375, + "step": 14694, + "time_per_iteration": 2.4537601470947266 + }, + { + "auxiliary_loss_clip": 0.01099823, + "auxiliary_loss_mlp": 0.01029601, + "balance_loss_clip": 1.01875305, + "balance_loss_mlp": 1.03203654, + "epoch": 0.8835111979558095, + "flos": 29752026508800.0, + "grad_norm": 1.6951093668851203, + "language_loss": 0.72519171, + "learning_rate": 1.4059617158025527e-07, + "loss": 0.74648589, + "num_input_tokens_seen": 316819090, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6796875, + "step": 14695, + "time_per_iteration": 2.54256010055542 + }, + { + "auxiliary_loss_clip": 0.01094268, + "auxiliary_loss_mlp": 0.01026407, + "balance_loss_clip": 1.01576185, + "balance_loss_mlp": 1.03320861, + "epoch": 0.8835713212084774, + "flos": 24134556574080.0, + "grad_norm": 1.6044220057517486, + "language_loss": 0.80077511, + "learning_rate": 1.404527630961998e-07, + "loss": 0.82198191, + "num_input_tokens_seen": 316839250, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 14696, + "time_per_iteration": 2.476656913757324 + }, + { + "auxiliary_loss_clip": 0.01100456, + "auxiliary_loss_mlp": 0.0102923, + "balance_loss_clip": 1.01835179, + "balance_loss_mlp": 1.034863, + "epoch": 0.8836314444611454, + "flos": 27672331933440.0, + "grad_norm": 1.4070028272927375, + "language_loss": 0.74347401, + "learning_rate": 1.4030942512636236e-07, + "loss": 0.76477087, + "num_input_tokens_seen": 316861315, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65625, + "step": 14697, + "time_per_iteration": 2.5067691802978516 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01033486, + "balance_loss_clip": 1.02203012, + "balance_loss_mlp": 1.03380871, + "epoch": 0.8836915677138133, + "flos": 16836969934080.0, + "grad_norm": 1.9574693884985603, + "language_loss": 0.72150856, + "learning_rate": 1.401661576761779e-07, + "loss": 0.74282926, + "num_input_tokens_seen": 316879325, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14698, + "time_per_iteration": 2.4394617080688477 + }, + { + "auxiliary_loss_clip": 0.01022142, + "auxiliary_loss_mlp": 0.01001525, + "balance_loss_clip": 1.00058353, + "balance_loss_mlp": 1.00201714, + "epoch": 0.8837516909664813, + "flos": 69310540823040.0, + "grad_norm": 0.8043510502151429, + "language_loss": 0.5371002, + "learning_rate": 1.4002296075107856e-07, + "loss": 0.55733687, + "num_input_tokens_seen": 316936425, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20117188, + "step": 14699, + "time_per_iteration": 3.0387063026428223 + }, + { + "auxiliary_loss_clip": 0.01102957, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.01674378, + "balance_loss_mlp": 1.03511429, + "epoch": 0.8838118142191492, + "flos": 21324726241920.0, + "grad_norm": 1.8010601059746882, + "language_loss": 0.76841766, + "learning_rate": 1.3987983435649508e-07, + "loss": 0.78973258, + "num_input_tokens_seen": 316956360, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 14700, + "time_per_iteration": 2.4849624633789062 + }, + { + "auxiliary_loss_clip": 0.01098124, + "auxiliary_loss_mlp": 0.01028892, + "balance_loss_clip": 1.0178647, + "balance_loss_mlp": 1.03467011, + "epoch": 0.8838719374718172, + "flos": 21470559459840.0, + "grad_norm": 1.9788606423374575, + "language_loss": 0.72744364, + "learning_rate": 1.3973677849785494e-07, + "loss": 0.74871373, + "num_input_tokens_seen": 316975295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14701, + "time_per_iteration": 2.439821243286133 + }, + { + "auxiliary_loss_clip": 0.01101947, + "auxiliary_loss_mlp": 0.0103152, + "balance_loss_clip": 1.01981974, + "balance_loss_mlp": 1.03463852, + "epoch": 0.8839320607244852, + "flos": 26468929555200.0, + "grad_norm": 14.71295630109059, + "language_loss": 0.70860976, + "learning_rate": 1.3959379318058262e-07, + "loss": 0.72994447, + "num_input_tokens_seen": 316994520, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 14702, + "time_per_iteration": 2.483827590942383 + }, + { + "auxiliary_loss_clip": 0.01102205, + "auxiliary_loss_mlp": 0.01032007, + "balance_loss_clip": 1.01999676, + "balance_loss_mlp": 1.03664851, + "epoch": 0.8839921839771532, + "flos": 45222270923520.0, + "grad_norm": 2.6898618160604, + "language_loss": 0.71423376, + "learning_rate": 1.3945087841010006e-07, + "loss": 0.73557591, + "num_input_tokens_seen": 317018095, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 14703, + "time_per_iteration": 2.644871950149536 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01025831, + "balance_loss_clip": 1.01494169, + "balance_loss_mlp": 1.03430629, + "epoch": 0.8840523072298211, + "flos": 20006876154240.0, + "grad_norm": 1.7004164598471103, + "language_loss": 0.6647324, + "learning_rate": 1.3930803419182645e-07, + "loss": 0.68596381, + "num_input_tokens_seen": 317035755, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14704, + "time_per_iteration": 2.4443137645721436 + }, + { + "auxiliary_loss_clip": 0.01094574, + "auxiliary_loss_mlp": 0.01024695, + "balance_loss_clip": 1.01381683, + "balance_loss_mlp": 1.03165603, + "epoch": 0.8841124304824891, + "flos": 24426007528320.0, + "grad_norm": 1.6723604475362273, + "language_loss": 0.70644706, + "learning_rate": 1.3916526053117905e-07, + "loss": 0.72763973, + "num_input_tokens_seen": 317055765, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 14705, + "time_per_iteration": 2.469675302505493 + }, + { + "auxiliary_loss_clip": 0.01098911, + "auxiliary_loss_mlp": 0.01031156, + "balance_loss_clip": 1.02092755, + "balance_loss_mlp": 1.0351615, + "epoch": 0.884172553735157, + "flos": 31284622056960.0, + "grad_norm": 1.5306826086983725, + "language_loss": 0.70983511, + "learning_rate": 1.3902255743357104e-07, + "loss": 0.73113579, + "num_input_tokens_seen": 317077955, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 14706, + "time_per_iteration": 2.54547381401062 + }, + { + "auxiliary_loss_clip": 0.01098056, + "auxiliary_loss_mlp": 0.01027398, + "balance_loss_clip": 1.01607299, + "balance_loss_mlp": 1.0337317, + "epoch": 0.884232676987825, + "flos": 21391160446080.0, + "grad_norm": 1.6132809989349575, + "language_loss": 0.7450251, + "learning_rate": 1.3887992490441413e-07, + "loss": 0.76627964, + "num_input_tokens_seen": 317095825, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 14707, + "time_per_iteration": 2.448845624923706 + }, + { + "auxiliary_loss_clip": 0.01021776, + "auxiliary_loss_mlp": 0.01004857, + "balance_loss_clip": 1.00377238, + "balance_loss_mlp": 1.00174427, + "epoch": 0.8842928002404931, + "flos": 57911451799680.0, + "grad_norm": 0.8117028150723945, + "language_loss": 0.60430789, + "learning_rate": 1.387373629491173e-07, + "loss": 0.62457418, + "num_input_tokens_seen": 317152875, + "router_z_loss_clip": 0.01086426, + "router_z_loss_mlp": 0.20019531, + "step": 14708, + "time_per_iteration": 2.9150478839874268 + }, + { + "auxiliary_loss_clip": 0.01093498, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01817834, + "balance_loss_mlp": 1.03259778, + "epoch": 0.884352923493161, + "flos": 41463896186880.0, + "grad_norm": 1.827497004044899, + "language_loss": 0.67355728, + "learning_rate": 1.3859487157308625e-07, + "loss": 0.69477868, + "num_input_tokens_seen": 317176725, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.609375, + "step": 14709, + "time_per_iteration": 2.643474817276001 + }, + { + "auxiliary_loss_clip": 0.01104027, + "auxiliary_loss_mlp": 0.01035315, + "balance_loss_clip": 1.02236271, + "balance_loss_mlp": 1.03464079, + "epoch": 0.884413046745829, + "flos": 46541234332800.0, + "grad_norm": 1.5868272680422912, + "language_loss": 0.62517226, + "learning_rate": 1.3845245078172373e-07, + "loss": 0.64656574, + "num_input_tokens_seen": 317206880, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 14710, + "time_per_iteration": 2.694308042526245 + }, + { + "auxiliary_loss_clip": 0.01097265, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01702642, + "balance_loss_mlp": 1.03435802, + "epoch": 0.8844731699984969, + "flos": 19135324552320.0, + "grad_norm": 2.254524973273371, + "language_loss": 0.63405102, + "learning_rate": 1.38310100580431e-07, + "loss": 0.65530241, + "num_input_tokens_seen": 317224135, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 14711, + "time_per_iteration": 2.454507350921631 + }, + { + "auxiliary_loss_clip": 0.01102557, + "auxiliary_loss_mlp": 0.01033319, + "balance_loss_clip": 1.02133811, + "balance_loss_mlp": 1.03427267, + "epoch": 0.8845332932511649, + "flos": 23260634674560.0, + "grad_norm": 2.2842740849754115, + "language_loss": 0.75539434, + "learning_rate": 1.38167820974606e-07, + "loss": 0.77675307, + "num_input_tokens_seen": 317244505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 14712, + "time_per_iteration": 2.4946510791778564 + }, + { + "auxiliary_loss_clip": 0.01098374, + "auxiliary_loss_mlp": 0.01024835, + "balance_loss_clip": 1.01309907, + "balance_loss_mlp": 1.03245926, + "epoch": 0.8845934165038328, + "flos": 17564591738880.0, + "grad_norm": 3.708605595590302, + "language_loss": 0.81021023, + "learning_rate": 1.3802561196964368e-07, + "loss": 0.83144236, + "num_input_tokens_seen": 317257830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 14713, + "time_per_iteration": 2.479050397872925 + }, + { + "auxiliary_loss_clip": 0.01097877, + "auxiliary_loss_mlp": 0.01027435, + "balance_loss_clip": 1.01575828, + "balance_loss_mlp": 1.03250861, + "epoch": 0.8846535397565009, + "flos": 27485739757440.0, + "grad_norm": 1.4231578752957819, + "language_loss": 0.55540788, + "learning_rate": 1.3788347357093688e-07, + "loss": 0.57666099, + "num_input_tokens_seen": 317278430, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14714, + "time_per_iteration": 2.521453857421875 + }, + { + "auxiliary_loss_clip": 0.01097743, + "auxiliary_loss_mlp": 0.0103307, + "balance_loss_clip": 1.0212388, + "balance_loss_mlp": 1.03320169, + "epoch": 0.8847136630091688, + "flos": 28761430256640.0, + "grad_norm": 1.743794814906259, + "language_loss": 0.73726749, + "learning_rate": 1.377414057838755e-07, + "loss": 0.75857568, + "num_input_tokens_seen": 317295970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 14715, + "time_per_iteration": 2.519960641860962 + }, + { + "auxiliary_loss_clip": 0.0109907, + "auxiliary_loss_mlp": 0.01029788, + "balance_loss_clip": 1.01849318, + "balance_loss_mlp": 1.03362608, + "epoch": 0.8847737862618368, + "flos": 23476924419840.0, + "grad_norm": 2.799687006211767, + "language_loss": 0.75298744, + "learning_rate": 1.375994086138461e-07, + "loss": 0.77427602, + "num_input_tokens_seen": 317316185, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14716, + "time_per_iteration": 2.5302252769470215 + }, + { + "auxiliary_loss_clip": 0.01099052, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02258122, + "balance_loss_mlp": 1.03498149, + "epoch": 0.8848339095145047, + "flos": 18660872782080.0, + "grad_norm": 1.9463895585575124, + "language_loss": 0.71236145, + "learning_rate": 1.3745748206623397e-07, + "loss": 0.73369265, + "num_input_tokens_seen": 317333275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14717, + "time_per_iteration": 2.4483509063720703 + }, + { + "auxiliary_loss_clip": 0.01095292, + "auxiliary_loss_mlp": 0.01030348, + "balance_loss_clip": 1.01952374, + "balance_loss_mlp": 1.03427327, + "epoch": 0.8848940327671727, + "flos": 32270298145920.0, + "grad_norm": 2.2448423833048667, + "language_loss": 0.74712592, + "learning_rate": 1.373156261464208e-07, + "loss": 0.76838231, + "num_input_tokens_seen": 317351245, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.609375, + "step": 14718, + "time_per_iteration": 2.528916597366333 + }, + { + "auxiliary_loss_clip": 0.0109995, + "auxiliary_loss_mlp": 0.01027778, + "balance_loss_clip": 1.01569605, + "balance_loss_mlp": 1.03310704, + "epoch": 0.8849541560198406, + "flos": 24021832717440.0, + "grad_norm": 1.5894409010966428, + "language_loss": 0.7822836, + "learning_rate": 1.3717384085978602e-07, + "loss": 0.80356085, + "num_input_tokens_seen": 317370740, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 14719, + "time_per_iteration": 2.4806060791015625 + }, + { + "auxiliary_loss_clip": 0.01100885, + "auxiliary_loss_mlp": 0.01025821, + "balance_loss_clip": 1.01418066, + "balance_loss_mlp": 1.03466296, + "epoch": 0.8850142792725086, + "flos": 16873060124160.0, + "grad_norm": 1.5765510176535809, + "language_loss": 0.71778101, + "learning_rate": 1.3703212621170579e-07, + "loss": 0.73904806, + "num_input_tokens_seen": 317388370, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14720, + "time_per_iteration": 3.864971160888672 + }, + { + "auxiliary_loss_clip": 0.01101707, + "auxiliary_loss_mlp": 0.01027746, + "balance_loss_clip": 1.01578975, + "balance_loss_mlp": 1.03353, + "epoch": 0.8850744025251767, + "flos": 24024059360640.0, + "grad_norm": 2.5959463277738974, + "language_loss": 0.82530278, + "learning_rate": 1.3689048220755383e-07, + "loss": 0.84659731, + "num_input_tokens_seen": 317407390, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 14721, + "time_per_iteration": 2.4602034091949463 + }, + { + "auxiliary_loss_clip": 0.01098248, + "auxiliary_loss_mlp": 0.01029899, + "balance_loss_clip": 1.01759672, + "balance_loss_mlp": 1.03253555, + "epoch": 0.8851345257778446, + "flos": 47955575329920.0, + "grad_norm": 1.6462057536303287, + "language_loss": 0.6220575, + "learning_rate": 1.3674890885270186e-07, + "loss": 0.64333898, + "num_input_tokens_seen": 317430825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 14722, + "time_per_iteration": 4.181842565536499 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01504993, + "balance_loss_mlp": 1.03351831, + "epoch": 0.8851946490305126, + "flos": 36611000173440.0, + "grad_norm": 1.8405460130697608, + "language_loss": 0.68605506, + "learning_rate": 1.3660740615251754e-07, + "loss": 0.70732802, + "num_input_tokens_seen": 317451905, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 14723, + "time_per_iteration": 2.5939276218414307 + }, + { + "auxiliary_loss_clip": 0.01098926, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.01789427, + "balance_loss_mlp": 1.03490961, + "epoch": 0.8852547722831805, + "flos": 21544248211200.0, + "grad_norm": 1.5982216956650597, + "language_loss": 0.77820933, + "learning_rate": 1.3646597411236703e-07, + "loss": 0.79949278, + "num_input_tokens_seen": 317470030, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 14724, + "time_per_iteration": 2.5080296993255615 + }, + { + "auxiliary_loss_clip": 0.0102205, + "auxiliary_loss_mlp": 0.00996579, + "balance_loss_clip": 0.995673, + "balance_loss_mlp": 1.00202656, + "epoch": 0.8853148955358485, + "flos": 63059246472960.0, + "grad_norm": 0.7976703792304201, + "language_loss": 0.58909416, + "learning_rate": 1.363246127376143e-07, + "loss": 0.60928047, + "num_input_tokens_seen": 317527460, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20019531, + "step": 14725, + "time_per_iteration": 2.942244529724121 + }, + { + "auxiliary_loss_clip": 0.01103081, + "auxiliary_loss_mlp": 0.01037165, + "balance_loss_clip": 1.0250175, + "balance_loss_mlp": 1.03376329, + "epoch": 0.8853750187885164, + "flos": 18149828031360.0, + "grad_norm": 3.7159601069719743, + "language_loss": 0.6908325, + "learning_rate": 1.3618332203361837e-07, + "loss": 0.71223497, + "num_input_tokens_seen": 317544070, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6953125, + "step": 14726, + "time_per_iteration": 3.910275459289551 + }, + { + "auxiliary_loss_clip": 0.01098863, + "auxiliary_loss_mlp": 0.01030877, + "balance_loss_clip": 1.0189023, + "balance_loss_mlp": 1.03549707, + "epoch": 0.8854351420411845, + "flos": 39570542392320.0, + "grad_norm": 1.4685006147064286, + "language_loss": 0.69542432, + "learning_rate": 1.3604210200573785e-07, + "loss": 0.71672177, + "num_input_tokens_seen": 317570275, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6328125, + "step": 14727, + "time_per_iteration": 2.6747992038726807 + }, + { + "auxiliary_loss_clip": 0.0110346, + "auxiliary_loss_mlp": 0.01031311, + "balance_loss_clip": 1.01936054, + "balance_loss_mlp": 1.03817511, + "epoch": 0.8854952652938524, + "flos": 23769309127680.0, + "grad_norm": 1.802344998111036, + "language_loss": 0.70243108, + "learning_rate": 1.3590095265932733e-07, + "loss": 0.72377884, + "num_input_tokens_seen": 317590160, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65234375, + "step": 14728, + "time_per_iteration": 2.4881274700164795 + }, + { + "auxiliary_loss_clip": 0.0109924, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.01676655, + "balance_loss_mlp": 1.0337584, + "epoch": 0.8855553885465204, + "flos": 18290310122880.0, + "grad_norm": 2.1419875680940637, + "language_loss": 0.66392922, + "learning_rate": 1.3575987399973987e-07, + "loss": 0.68519825, + "num_input_tokens_seen": 317608340, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 14729, + "time_per_iteration": 2.422187566757202 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01030434, + "balance_loss_clip": 1.01968694, + "balance_loss_mlp": 1.03564954, + "epoch": 0.8856155117991883, + "flos": 36867402432000.0, + "grad_norm": 1.7117430310189854, + "language_loss": 0.62781358, + "learning_rate": 1.3561886603232453e-07, + "loss": 0.64911354, + "num_input_tokens_seen": 317629910, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 14730, + "time_per_iteration": 2.5803756713867188 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01033705, + "balance_loss_clip": 1.02272558, + "balance_loss_mlp": 1.03250694, + "epoch": 0.8856756350518563, + "flos": 22163886754560.0, + "grad_norm": 1.4792332459345614, + "language_loss": 0.79300416, + "learning_rate": 1.3547792876242904e-07, + "loss": 0.81430167, + "num_input_tokens_seen": 317650265, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14731, + "time_per_iteration": 2.4924111366271973 + }, + { + "auxiliary_loss_clip": 0.01099374, + "auxiliary_loss_mlp": 0.01030243, + "balance_loss_clip": 1.0186559, + "balance_loss_mlp": 1.03311884, + "epoch": 0.8857357583045242, + "flos": 20740962407040.0, + "grad_norm": 1.6116048761510777, + "language_loss": 0.83205569, + "learning_rate": 1.3533706219539708e-07, + "loss": 0.85335195, + "num_input_tokens_seen": 317669045, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 14732, + "time_per_iteration": 2.4561238288879395 + }, + { + "auxiliary_loss_clip": 0.01022084, + "auxiliary_loss_mlp": 0.01000626, + "balance_loss_clip": 0.99961245, + "balance_loss_mlp": 1.00209713, + "epoch": 0.8857958815571922, + "flos": 69892329409920.0, + "grad_norm": 0.913679809419791, + "language_loss": 0.59908044, + "learning_rate": 1.3519626633657045e-07, + "loss": 0.61930752, + "num_input_tokens_seen": 317728065, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20019531, + "step": 14733, + "time_per_iteration": 3.068289041519165 + }, + { + "auxiliary_loss_clip": 0.01100673, + "auxiliary_loss_mlp": 0.01032202, + "balance_loss_clip": 1.02050126, + "balance_loss_mlp": 1.03517413, + "epoch": 0.8858560048098603, + "flos": 15121948187520.0, + "grad_norm": 1.8626138238723922, + "language_loss": 0.66439319, + "learning_rate": 1.3505554119128838e-07, + "loss": 0.68572199, + "num_input_tokens_seen": 317746120, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14734, + "time_per_iteration": 2.4276156425476074 + }, + { + "auxiliary_loss_clip": 0.01099506, + "auxiliary_loss_mlp": 0.01036665, + "balance_loss_clip": 1.02576971, + "balance_loss_mlp": 1.03578985, + "epoch": 0.8859161280625282, + "flos": 16611019430400.0, + "grad_norm": 2.0317348064354213, + "language_loss": 0.75379711, + "learning_rate": 1.3491488676488682e-07, + "loss": 0.77515882, + "num_input_tokens_seen": 317762280, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 14735, + "time_per_iteration": 2.4584195613861084 + }, + { + "auxiliary_loss_clip": 0.01100195, + "auxiliary_loss_mlp": 0.01029168, + "balance_loss_clip": 1.01778936, + "balance_loss_mlp": 1.03425932, + "epoch": 0.8859762513151962, + "flos": 18694484933760.0, + "grad_norm": 1.6977065723830995, + "language_loss": 0.7023108, + "learning_rate": 1.3477430306270066e-07, + "loss": 0.72360444, + "num_input_tokens_seen": 317780615, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 14736, + "time_per_iteration": 2.460245132446289 + }, + { + "auxiliary_loss_clip": 0.01102419, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01685667, + "balance_loss_mlp": 1.03579187, + "epoch": 0.8860363745678641, + "flos": 19536877670400.0, + "grad_norm": 1.8186891549833935, + "language_loss": 0.84355164, + "learning_rate": 1.3463379009005892e-07, + "loss": 0.86486316, + "num_input_tokens_seen": 317798830, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 14737, + "time_per_iteration": 2.451251745223999 + }, + { + "auxiliary_loss_clip": 0.01106985, + "auxiliary_loss_mlp": 0.01031522, + "balance_loss_clip": 1.01867151, + "balance_loss_mlp": 1.03683579, + "epoch": 0.8860964978205321, + "flos": 35954912304000.0, + "grad_norm": 3.2801540845038777, + "language_loss": 0.68354762, + "learning_rate": 1.3449334785229093e-07, + "loss": 0.70493269, + "num_input_tokens_seen": 317819235, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 14738, + "time_per_iteration": 2.5500543117523193 + }, + { + "auxiliary_loss_clip": 0.01103471, + "auxiliary_loss_mlp": 0.01026359, + "balance_loss_clip": 1.01440811, + "balance_loss_mlp": 1.03358066, + "epoch": 0.8861566210732, + "flos": 21212577002880.0, + "grad_norm": 1.7831357907048164, + "language_loss": 0.75100833, + "learning_rate": 1.343529763547222e-07, + "loss": 0.77230668, + "num_input_tokens_seen": 317836785, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.69921875, + "step": 14739, + "time_per_iteration": 2.4511783123016357 + }, + { + "auxiliary_loss_clip": 0.01096933, + "auxiliary_loss_mlp": 0.01028255, + "balance_loss_clip": 1.01746702, + "balance_loss_mlp": 1.03344214, + "epoch": 0.886216744325868, + "flos": 14609071843200.0, + "grad_norm": 1.7409264572632928, + "language_loss": 0.86878449, + "learning_rate": 1.3421267560267559e-07, + "loss": 0.8900364, + "num_input_tokens_seen": 317854225, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 14740, + "time_per_iteration": 2.4006471633911133 + }, + { + "auxiliary_loss_clip": 0.0110013, + "auxiliary_loss_mlp": 0.01031657, + "balance_loss_clip": 1.02006936, + "balance_loss_mlp": 1.03563619, + "epoch": 0.886276867578536, + "flos": 26651643062400.0, + "grad_norm": 1.7595112393939192, + "language_loss": 0.63362885, + "learning_rate": 1.34072445601471e-07, + "loss": 0.65494668, + "num_input_tokens_seen": 317874865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 14741, + "time_per_iteration": 2.4974660873413086 + }, + { + "auxiliary_loss_clip": 0.01099837, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02044952, + "balance_loss_mlp": 1.03457093, + "epoch": 0.886336990831204, + "flos": 16764071281920.0, + "grad_norm": 2.724117158165582, + "language_loss": 0.72620136, + "learning_rate": 1.3393228635642717e-07, + "loss": 0.74752122, + "num_input_tokens_seen": 317892830, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14742, + "time_per_iteration": 2.4552924633026123 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.0103209, + "balance_loss_clip": 1.02099192, + "balance_loss_mlp": 1.03406441, + "epoch": 0.8863971140838719, + "flos": 25265275781760.0, + "grad_norm": 1.890478101266105, + "language_loss": 0.59076136, + "learning_rate": 1.3379219787285733e-07, + "loss": 0.61206806, + "num_input_tokens_seen": 317911780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 14743, + "time_per_iteration": 2.4963841438293457 + }, + { + "auxiliary_loss_clip": 0.01102411, + "auxiliary_loss_mlp": 0.01030911, + "balance_loss_clip": 1.01763701, + "balance_loss_mlp": 1.03440762, + "epoch": 0.8864572373365399, + "flos": 23404313076480.0, + "grad_norm": 1.8739935931766052, + "language_loss": 0.60211849, + "learning_rate": 1.3365218015607437e-07, + "loss": 0.62345171, + "num_input_tokens_seen": 317932855, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 14744, + "time_per_iteration": 2.488271474838257 + }, + { + "auxiliary_loss_clip": 0.01099725, + "auxiliary_loss_mlp": 0.01033783, + "balance_loss_clip": 1.02141476, + "balance_loss_mlp": 1.0342828, + "epoch": 0.8865173605892078, + "flos": 18548759456640.0, + "grad_norm": 1.5337573847338424, + "language_loss": 0.76551473, + "learning_rate": 1.3351223321138762e-07, + "loss": 0.78684986, + "num_input_tokens_seen": 317952090, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 14745, + "time_per_iteration": 2.4542477130889893 + }, + { + "auxiliary_loss_clip": 0.01098813, + "auxiliary_loss_mlp": 0.01034127, + "balance_loss_clip": 1.02296925, + "balance_loss_mlp": 1.0353713, + "epoch": 0.8865774838418758, + "flos": 19025868833280.0, + "grad_norm": 1.7959141313080134, + "language_loss": 0.77085936, + "learning_rate": 1.3337235704410454e-07, + "loss": 0.79218876, + "num_input_tokens_seen": 317970370, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 14746, + "time_per_iteration": 2.4547431468963623 + }, + { + "auxiliary_loss_clip": 0.01104158, + "auxiliary_loss_mlp": 0.01030457, + "balance_loss_clip": 1.01819599, + "balance_loss_mlp": 1.03627443, + "epoch": 0.8866376070945439, + "flos": 22163168482560.0, + "grad_norm": 1.8871990542262298, + "language_loss": 0.76628375, + "learning_rate": 1.3323255165952873e-07, + "loss": 0.78762996, + "num_input_tokens_seen": 317989125, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 14747, + "time_per_iteration": 2.4549856185913086 + }, + { + "auxiliary_loss_clip": 0.01094661, + "auxiliary_loss_mlp": 0.01026241, + "balance_loss_clip": 1.01498127, + "balance_loss_mlp": 1.03204846, + "epoch": 0.8866977303472118, + "flos": 20704261685760.0, + "grad_norm": 1.7887126918220513, + "language_loss": 0.82725775, + "learning_rate": 1.3309281706296127e-07, + "loss": 0.84846675, + "num_input_tokens_seen": 318007820, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.625, + "step": 14748, + "time_per_iteration": 2.4627127647399902 + }, + { + "auxiliary_loss_clip": 0.01100636, + "auxiliary_loss_mlp": 0.01029339, + "balance_loss_clip": 1.01723945, + "balance_loss_mlp": 1.03471351, + "epoch": 0.8867578535998798, + "flos": 48794448533760.0, + "grad_norm": 2.197486200203094, + "language_loss": 0.77274418, + "learning_rate": 1.3295315325970148e-07, + "loss": 0.7940439, + "num_input_tokens_seen": 318030435, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 14749, + "time_per_iteration": 2.6969592571258545 + }, + { + "auxiliary_loss_clip": 0.01100997, + "auxiliary_loss_mlp": 0.01033354, + "balance_loss_clip": 1.02116513, + "balance_loss_mlp": 1.03323364, + "epoch": 0.8868179768525477, + "flos": 21105312013440.0, + "grad_norm": 1.8706744703437488, + "language_loss": 0.69848335, + "learning_rate": 1.328135602550451e-07, + "loss": 0.71982694, + "num_input_tokens_seen": 318049465, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 14750, + "time_per_iteration": 2.4876596927642822 + }, + { + "auxiliary_loss_clip": 0.0109901, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.01825023, + "balance_loss_mlp": 1.03434694, + "epoch": 0.8868781001052157, + "flos": 21830922656640.0, + "grad_norm": 2.01913942737069, + "language_loss": 0.59346163, + "learning_rate": 1.3267403805428546e-07, + "loss": 0.61474878, + "num_input_tokens_seen": 318067760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 14751, + "time_per_iteration": 2.4688756465911865 + }, + { + "auxiliary_loss_clip": 0.01100041, + "auxiliary_loss_mlp": 0.01027394, + "balance_loss_clip": 1.01586664, + "balance_loss_mlp": 1.03530931, + "epoch": 0.8869382233578836, + "flos": 13516418073600.0, + "grad_norm": 2.1999031985254436, + "language_loss": 0.81069493, + "learning_rate": 1.3253458666271344e-07, + "loss": 0.83196926, + "num_input_tokens_seen": 318082785, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14752, + "time_per_iteration": 2.4317078590393066 + }, + { + "auxiliary_loss_clip": 0.01105544, + "auxiliary_loss_mlp": 0.01030661, + "balance_loss_clip": 1.01819205, + "balance_loss_mlp": 1.03595507, + "epoch": 0.8869983466105517, + "flos": 22704988210560.0, + "grad_norm": 1.8695812916150454, + "language_loss": 0.80406618, + "learning_rate": 1.3239520608561793e-07, + "loss": 0.82542825, + "num_input_tokens_seen": 318101925, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.69921875, + "step": 14753, + "time_per_iteration": 2.4720840454101562 + }, + { + "auxiliary_loss_clip": 0.01097069, + "auxiliary_loss_mlp": 0.01030109, + "balance_loss_clip": 1.01882041, + "balance_loss_mlp": 1.03314829, + "epoch": 0.8870584698632196, + "flos": 15340751884800.0, + "grad_norm": 1.9768901088990127, + "language_loss": 0.65004474, + "learning_rate": 1.3225589632828248e-07, + "loss": 0.6713165, + "num_input_tokens_seen": 318119945, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14754, + "time_per_iteration": 2.4193625450134277 + }, + { + "auxiliary_loss_clip": 0.01102106, + "auxiliary_loss_mlp": 0.01031348, + "balance_loss_clip": 1.01968324, + "balance_loss_mlp": 1.03563762, + "epoch": 0.8871185931158876, + "flos": 26615624699520.0, + "grad_norm": 1.8164304969475906, + "language_loss": 0.7455616, + "learning_rate": 1.3211665739599065e-07, + "loss": 0.76689613, + "num_input_tokens_seen": 318139685, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14755, + "time_per_iteration": 2.493603229522705 + }, + { + "auxiliary_loss_clip": 0.01099041, + "auxiliary_loss_mlp": 0.01030604, + "balance_loss_clip": 1.01815271, + "balance_loss_mlp": 1.03300142, + "epoch": 0.8871787163685555, + "flos": 21799034357760.0, + "grad_norm": 1.634586619423876, + "language_loss": 0.77746713, + "learning_rate": 1.3197748929402262e-07, + "loss": 0.79876363, + "num_input_tokens_seen": 318160375, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.66015625, + "step": 14756, + "time_per_iteration": 2.4780538082122803 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.01032077, + "balance_loss_clip": 1.02048993, + "balance_loss_mlp": 1.03467703, + "epoch": 0.8872388396212235, + "flos": 14902964922240.0, + "grad_norm": 1.8588799430656529, + "language_loss": 0.76319844, + "learning_rate": 1.3183839202765535e-07, + "loss": 0.78452736, + "num_input_tokens_seen": 318177995, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 14757, + "time_per_iteration": 2.4160494804382324 + }, + { + "auxiliary_loss_clip": 0.01096707, + "auxiliary_loss_mlp": 0.01032976, + "balance_loss_clip": 1.02180624, + "balance_loss_mlp": 1.03396797, + "epoch": 0.8872989628738914, + "flos": 26432157006720.0, + "grad_norm": 1.8526342000102967, + "language_loss": 0.67985821, + "learning_rate": 1.316993656021632e-07, + "loss": 0.70115507, + "num_input_tokens_seen": 318197030, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.625, + "step": 14758, + "time_per_iteration": 2.512890100479126 + }, + { + "auxiliary_loss_clip": 0.01100758, + "auxiliary_loss_mlp": 0.01031037, + "balance_loss_clip": 1.01833534, + "balance_loss_mlp": 1.03502667, + "epoch": 0.8873590861265594, + "flos": 48142562555520.0, + "grad_norm": 1.5896545360448344, + "language_loss": 0.68797654, + "learning_rate": 1.3156041002281915e-07, + "loss": 0.70929444, + "num_input_tokens_seen": 318221780, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.65625, + "step": 14759, + "time_per_iteration": 2.694464683532715 + }, + { + "auxiliary_loss_clip": 0.01096524, + "auxiliary_loss_mlp": 0.01028283, + "balance_loss_clip": 1.01685667, + "balance_loss_mlp": 1.0320487, + "epoch": 0.8874192093792275, + "flos": 18332972501760.0, + "grad_norm": 1.9328359040343546, + "language_loss": 0.74210972, + "learning_rate": 1.3142152529489092e-07, + "loss": 0.76335776, + "num_input_tokens_seen": 318239710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14760, + "time_per_iteration": 2.450502634048462 + }, + { + "auxiliary_loss_clip": 0.01102656, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01763535, + "balance_loss_mlp": 1.0351845, + "epoch": 0.8874793326318954, + "flos": 17894215872000.0, + "grad_norm": 2.2540100924587434, + "language_loss": 0.75508064, + "learning_rate": 1.3128271142364565e-07, + "loss": 0.77640146, + "num_input_tokens_seen": 318257425, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 14761, + "time_per_iteration": 2.450575351715088 + }, + { + "auxiliary_loss_clip": 0.01100474, + "auxiliary_loss_mlp": 0.01038175, + "balance_loss_clip": 1.02637935, + "balance_loss_mlp": 1.03368759, + "epoch": 0.8875394558845634, + "flos": 31102231772160.0, + "grad_norm": 2.70563758191793, + "language_loss": 0.61649144, + "learning_rate": 1.3114396841434717e-07, + "loss": 0.63787794, + "num_input_tokens_seen": 318278485, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 14762, + "time_per_iteration": 3.9083144664764404 + }, + { + "auxiliary_loss_clip": 0.01099715, + "auxiliary_loss_mlp": 0.01029623, + "balance_loss_clip": 1.01740968, + "balance_loss_mlp": 1.03380537, + "epoch": 0.8875995791372313, + "flos": 21142048648320.0, + "grad_norm": 1.8083411551744764, + "language_loss": 0.64272511, + "learning_rate": 1.3100529627225697e-07, + "loss": 0.66401851, + "num_input_tokens_seen": 318297560, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14763, + "time_per_iteration": 5.301745176315308 + }, + { + "auxiliary_loss_clip": 0.01100472, + "auxiliary_loss_mlp": 0.01030098, + "balance_loss_clip": 1.01755691, + "balance_loss_mlp": 1.03406501, + "epoch": 0.8876597023898993, + "flos": 17455136019840.0, + "grad_norm": 2.1398532909429635, + "language_loss": 0.71166742, + "learning_rate": 1.3086669500263335e-07, + "loss": 0.7329731, + "num_input_tokens_seen": 318313060, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14764, + "time_per_iteration": 2.4461166858673096 + }, + { + "auxiliary_loss_clip": 0.01103023, + "auxiliary_loss_mlp": 0.01031847, + "balance_loss_clip": 1.020015, + "balance_loss_mlp": 1.0344727, + "epoch": 0.8877198256425672, + "flos": 22707933125760.0, + "grad_norm": 2.7429750969240043, + "language_loss": 0.66583252, + "learning_rate": 1.3072816461073166e-07, + "loss": 0.68718123, + "num_input_tokens_seen": 318332030, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 14765, + "time_per_iteration": 2.46746563911438 + }, + { + "auxiliary_loss_clip": 0.01096438, + "auxiliary_loss_mlp": 0.01024105, + "balance_loss_clip": 1.0139488, + "balance_loss_mlp": 1.0340445, + "epoch": 0.8877799488952353, + "flos": 24535104111360.0, + "grad_norm": 1.6144347277628304, + "language_loss": 0.76532453, + "learning_rate": 1.3058970510180568e-07, + "loss": 0.78652996, + "num_input_tokens_seen": 318351090, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.625, + "step": 14766, + "time_per_iteration": 2.511964797973633 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01029078, + "balance_loss_clip": 1.01800895, + "balance_loss_mlp": 1.0334599, + "epoch": 0.8878400721479032, + "flos": 20959191486720.0, + "grad_norm": 2.2390564456183863, + "language_loss": 0.73575568, + "learning_rate": 1.3045131648110496e-07, + "loss": 0.75700963, + "num_input_tokens_seen": 318372000, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62890625, + "step": 14767, + "time_per_iteration": 2.454369306564331 + }, + { + "auxiliary_loss_clip": 0.01095656, + "auxiliary_loss_mlp": 0.01023366, + "balance_loss_clip": 1.01239324, + "balance_loss_mlp": 1.03359067, + "epoch": 0.8879001954005712, + "flos": 25295260659840.0, + "grad_norm": 1.829789485746044, + "language_loss": 0.71202058, + "learning_rate": 1.303129987538778e-07, + "loss": 0.7332108, + "num_input_tokens_seen": 318391530, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 14768, + "time_per_iteration": 2.5051445960998535 + }, + { + "auxiliary_loss_clip": 0.01097532, + "auxiliary_loss_mlp": 0.01027923, + "balance_loss_clip": 1.016711, + "balance_loss_mlp": 1.03297067, + "epoch": 0.8879603186532391, + "flos": 23185329811200.0, + "grad_norm": 1.8019675582043564, + "language_loss": 0.70299733, + "learning_rate": 1.3017475192536932e-07, + "loss": 0.72425187, + "num_input_tokens_seen": 318410690, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 14769, + "time_per_iteration": 4.037384271621704 + }, + { + "auxiliary_loss_clip": 0.01098828, + "auxiliary_loss_mlp": 0.01031996, + "balance_loss_clip": 1.02146339, + "balance_loss_mlp": 1.0355804, + "epoch": 0.8880204419059071, + "flos": 13655427707520.0, + "grad_norm": 2.2333804344383847, + "language_loss": 0.67153198, + "learning_rate": 1.3003657600082174e-07, + "loss": 0.69284022, + "num_input_tokens_seen": 318427380, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14770, + "time_per_iteration": 2.4550540447235107 + }, + { + "auxiliary_loss_clip": 0.01096046, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01742697, + "balance_loss_mlp": 1.03437459, + "epoch": 0.888080565158575, + "flos": 20631865824000.0, + "grad_norm": 1.686571312433287, + "language_loss": 0.65049809, + "learning_rate": 1.2989847098547424e-07, + "loss": 0.6717459, + "num_input_tokens_seen": 318448530, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6171875, + "step": 14771, + "time_per_iteration": 2.506824016571045 + }, + { + "auxiliary_loss_clip": 0.01097555, + "auxiliary_loss_mlp": 0.01026042, + "balance_loss_clip": 1.01502669, + "balance_loss_mlp": 1.0331111, + "epoch": 0.888140688411243, + "flos": 28620014411520.0, + "grad_norm": 1.4703989654139515, + "language_loss": 0.82365024, + "learning_rate": 1.2976043688456396e-07, + "loss": 0.84488624, + "num_input_tokens_seen": 318468655, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 14772, + "time_per_iteration": 2.5196051597595215 + }, + { + "auxiliary_loss_clip": 0.01093264, + "auxiliary_loss_mlp": 0.01022956, + "balance_loss_clip": 1.01254296, + "balance_loss_mlp": 1.03136611, + "epoch": 0.8882008116639111, + "flos": 25520241496320.0, + "grad_norm": 1.5905781508550767, + "language_loss": 0.76286173, + "learning_rate": 1.296224737033258e-07, + "loss": 0.78402388, + "num_input_tokens_seen": 318488740, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6171875, + "step": 14773, + "time_per_iteration": 2.5159168243408203 + }, + { + "auxiliary_loss_clip": 0.01096414, + "auxiliary_loss_mlp": 0.0102585, + "balance_loss_clip": 1.01539564, + "balance_loss_mlp": 1.03436065, + "epoch": 0.888260934916579, + "flos": 27673696650240.0, + "grad_norm": 1.9010559133370122, + "language_loss": 0.74874908, + "learning_rate": 1.294845814469907e-07, + "loss": 0.76997173, + "num_input_tokens_seen": 318508810, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62109375, + "step": 14774, + "time_per_iteration": 2.5161659717559814 + }, + { + "auxiliary_loss_clip": 0.01100538, + "auxiliary_loss_mlp": 0.01031054, + "balance_loss_clip": 1.01881158, + "balance_loss_mlp": 1.03431296, + "epoch": 0.888321058169247, + "flos": 21611077464960.0, + "grad_norm": 2.3755667319162383, + "language_loss": 0.72226775, + "learning_rate": 1.2934676012078783e-07, + "loss": 0.74358368, + "num_input_tokens_seen": 318526860, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 14775, + "time_per_iteration": 2.4795637130737305 + }, + { + "auxiliary_loss_clip": 0.01097248, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.01710868, + "balance_loss_mlp": 1.03339446, + "epoch": 0.8883811814219149, + "flos": 18149109759360.0, + "grad_norm": 1.6483138491279807, + "language_loss": 0.80294418, + "learning_rate": 1.292090097299432e-07, + "loss": 0.82419682, + "num_input_tokens_seen": 318545180, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 14776, + "time_per_iteration": 2.453660726547241 + }, + { + "auxiliary_loss_clip": 0.01101713, + "auxiliary_loss_mlp": 0.01031535, + "balance_loss_clip": 1.01928067, + "balance_loss_mlp": 1.03330636, + "epoch": 0.8884413046745829, + "flos": 28324648874880.0, + "grad_norm": 3.6584424512501976, + "language_loss": 0.69919568, + "learning_rate": 1.290713302796802e-07, + "loss": 0.72052813, + "num_input_tokens_seen": 318564350, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 14777, + "time_per_iteration": 2.537234306335449 + }, + { + "auxiliary_loss_clip": 0.01096023, + "auxiliary_loss_mlp": 0.01034119, + "balance_loss_clip": 1.02306223, + "balance_loss_mlp": 1.03184962, + "epoch": 0.8885014279272508, + "flos": 15158756649600.0, + "grad_norm": 1.735112860349567, + "language_loss": 0.70467377, + "learning_rate": 1.2893372177522e-07, + "loss": 0.72597522, + "num_input_tokens_seen": 318582275, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14778, + "time_per_iteration": 2.467770576477051 + }, + { + "auxiliary_loss_clip": 0.01098895, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.0175935, + "balance_loss_mlp": 1.03429198, + "epoch": 0.8885615511799189, + "flos": 19099593498240.0, + "grad_norm": 1.5537401295663211, + "language_loss": 0.77455193, + "learning_rate": 1.287961842217804e-07, + "loss": 0.7958231, + "num_input_tokens_seen": 318601230, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 14779, + "time_per_iteration": 2.4519615173339844 + }, + { + "auxiliary_loss_clip": 0.01021951, + "auxiliary_loss_mlp": 0.01002391, + "balance_loss_clip": 1.00141323, + "balance_loss_mlp": 1.00185418, + "epoch": 0.8886216744325868, + "flos": 51186567605760.0, + "grad_norm": 0.9099787300950598, + "language_loss": 0.56692004, + "learning_rate": 1.2865871762457747e-07, + "loss": 0.58716345, + "num_input_tokens_seen": 318645595, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.20117188, + "step": 14780, + "time_per_iteration": 2.8395655155181885 + }, + { + "auxiliary_loss_clip": 0.01021748, + "auxiliary_loss_mlp": 0.01001636, + "balance_loss_clip": 1.00069416, + "balance_loss_mlp": 1.00195396, + "epoch": 0.8886817976852548, + "flos": 61612981263360.0, + "grad_norm": 0.7880016601364539, + "language_loss": 0.6246208, + "learning_rate": 1.2852132198882326e-07, + "loss": 0.64485466, + "num_input_tokens_seen": 318707850, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19824219, + "step": 14781, + "time_per_iteration": 3.128025770187378 + }, + { + "auxiliary_loss_clip": 0.01022011, + "auxiliary_loss_mlp": 0.01002356, + "balance_loss_clip": 1.00142026, + "balance_loss_mlp": 1.00189745, + "epoch": 0.8887419209379227, + "flos": 60646946935680.0, + "grad_norm": 0.7913802698138945, + "language_loss": 0.58146596, + "learning_rate": 1.2838399731972805e-07, + "loss": 0.6017096, + "num_input_tokens_seen": 318764915, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 14782, + "time_per_iteration": 2.9118587970733643 + }, + { + "auxiliary_loss_clip": 0.01097314, + "auxiliary_loss_mlp": 0.0102938, + "balance_loss_clip": 1.01874638, + "balance_loss_mlp": 1.03459406, + "epoch": 0.8888020441905907, + "flos": 29205861235200.0, + "grad_norm": 2.667047910128226, + "language_loss": 0.65728068, + "learning_rate": 1.2824674362249922e-07, + "loss": 0.67854762, + "num_input_tokens_seen": 318785660, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 14783, + "time_per_iteration": 2.531919002532959 + }, + { + "auxiliary_loss_clip": 0.01102053, + "auxiliary_loss_mlp": 0.0103183, + "balance_loss_clip": 1.01958156, + "balance_loss_mlp": 1.03455818, + "epoch": 0.8888621674432586, + "flos": 22162701605760.0, + "grad_norm": 1.5498583522187301, + "language_loss": 0.77504814, + "learning_rate": 1.281095609023415e-07, + "loss": 0.79638696, + "num_input_tokens_seen": 318806080, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 14784, + "time_per_iteration": 2.468636989593506 + }, + { + "auxiliary_loss_clip": 0.01101877, + "auxiliary_loss_mlp": 0.01029403, + "balance_loss_clip": 1.01757181, + "balance_loss_mlp": 1.03568482, + "epoch": 0.8889222906959267, + "flos": 27672834723840.0, + "grad_norm": 2.219338510928114, + "language_loss": 0.60414922, + "learning_rate": 1.279724491644565e-07, + "loss": 0.62546206, + "num_input_tokens_seen": 318826445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14785, + "time_per_iteration": 2.525151252746582 + }, + { + "auxiliary_loss_clip": 0.01100607, + "auxiliary_loss_mlp": 0.01030296, + "balance_loss_clip": 1.0186789, + "balance_loss_mlp": 1.03575349, + "epoch": 0.8889824139485947, + "flos": 14168627274240.0, + "grad_norm": 1.7836289092233713, + "language_loss": 0.64846861, + "learning_rate": 1.278354084140445e-07, + "loss": 0.66977763, + "num_input_tokens_seen": 318843915, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 14786, + "time_per_iteration": 2.395446300506592 + }, + { + "auxiliary_loss_clip": 0.01103855, + "auxiliary_loss_mlp": 0.01030306, + "balance_loss_clip": 1.01772904, + "balance_loss_mlp": 1.03465486, + "epoch": 0.8890425372012626, + "flos": 12853003829760.0, + "grad_norm": 2.227669183130591, + "language_loss": 0.85661733, + "learning_rate": 1.276984386563009e-07, + "loss": 0.87795901, + "num_input_tokens_seen": 318859670, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 14787, + "time_per_iteration": 2.432615041732788 + }, + { + "auxiliary_loss_clip": 0.01099197, + "auxiliary_loss_mlp": 0.01029873, + "balance_loss_clip": 1.01820874, + "balance_loss_mlp": 1.03418851, + "epoch": 0.8891026604539306, + "flos": 21689291329920.0, + "grad_norm": 1.834557891315271, + "language_loss": 0.71064335, + "learning_rate": 1.2756153989642027e-07, + "loss": 0.73193407, + "num_input_tokens_seen": 318877855, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14788, + "time_per_iteration": 2.4537904262542725 + }, + { + "auxiliary_loss_clip": 0.01095263, + "auxiliary_loss_mlp": 0.01027034, + "balance_loss_clip": 1.01619184, + "balance_loss_mlp": 1.03322816, + "epoch": 0.8891627837065985, + "flos": 21871430219520.0, + "grad_norm": 1.5923121621741885, + "language_loss": 0.70096779, + "learning_rate": 1.274247121395935e-07, + "loss": 0.72219074, + "num_input_tokens_seen": 318896045, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 14789, + "time_per_iteration": 2.476649284362793 + }, + { + "auxiliary_loss_clip": 0.01099815, + "auxiliary_loss_mlp": 0.01023209, + "balance_loss_clip": 1.01159167, + "balance_loss_mlp": 1.03555179, + "epoch": 0.8892229069592665, + "flos": 21580230660480.0, + "grad_norm": 5.015147056087475, + "language_loss": 0.70436954, + "learning_rate": 1.2728795539100956e-07, + "loss": 0.72559977, + "num_input_tokens_seen": 318915515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 14790, + "time_per_iteration": 2.4777982234954834 + }, + { + "auxiliary_loss_clip": 0.0109958, + "auxiliary_loss_mlp": 0.01025006, + "balance_loss_clip": 1.01421189, + "balance_loss_mlp": 1.03437293, + "epoch": 0.8892830302119344, + "flos": 23075981832960.0, + "grad_norm": 3.521302837326341, + "language_loss": 0.73018265, + "learning_rate": 1.2715126965585387e-07, + "loss": 0.75142848, + "num_input_tokens_seen": 318934305, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 14791, + "time_per_iteration": 2.489640951156616 + }, + { + "auxiliary_loss_clip": 0.01097004, + "auxiliary_loss_mlp": 0.01033084, + "balance_loss_clip": 1.02203906, + "balance_loss_mlp": 1.03449202, + "epoch": 0.8893431534646025, + "flos": 23072139077760.0, + "grad_norm": 2.988212412187788, + "language_loss": 0.74027723, + "learning_rate": 1.2701465493931008e-07, + "loss": 0.76157808, + "num_input_tokens_seen": 318953880, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 14792, + "time_per_iteration": 2.4790284633636475 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01030706, + "balance_loss_clip": 1.01840937, + "balance_loss_mlp": 1.03461504, + "epoch": 0.8894032767172704, + "flos": 22454978572800.0, + "grad_norm": 1.8885669477718985, + "language_loss": 0.65883052, + "learning_rate": 1.2687811124655801e-07, + "loss": 0.68016326, + "num_input_tokens_seen": 318971395, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 14793, + "time_per_iteration": 2.4892311096191406 + }, + { + "auxiliary_loss_clip": 0.01101873, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.01855588, + "balance_loss_mlp": 1.03431058, + "epoch": 0.8894633999699384, + "flos": 25338246261120.0, + "grad_norm": 1.6206939112924994, + "language_loss": 0.71852094, + "learning_rate": 1.2674163858277552e-07, + "loss": 0.73984659, + "num_input_tokens_seen": 318990580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 14794, + "time_per_iteration": 2.5023927688598633 + }, + { + "auxiliary_loss_clip": 0.01104706, + "auxiliary_loss_mlp": 0.0102926, + "balance_loss_clip": 1.01692796, + "balance_loss_mlp": 1.03636956, + "epoch": 0.8895235232226063, + "flos": 20994096528000.0, + "grad_norm": 2.0233145744012853, + "language_loss": 0.75055683, + "learning_rate": 1.2660523695313785e-07, + "loss": 0.77189648, + "num_input_tokens_seen": 319010040, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 14795, + "time_per_iteration": 2.44732403755188 + }, + { + "auxiliary_loss_clip": 0.0102198, + "auxiliary_loss_mlp": 0.01003025, + "balance_loss_clip": 1.00208306, + "balance_loss_mlp": 1.00205803, + "epoch": 0.8895836464752743, + "flos": 69732956764800.0, + "grad_norm": 0.7697530463267467, + "language_loss": 0.56135261, + "learning_rate": 1.2646890636281727e-07, + "loss": 0.58160269, + "num_input_tokens_seen": 319063860, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 14796, + "time_per_iteration": 2.9481120109558105 + }, + { + "auxiliary_loss_clip": 0.01102738, + "auxiliary_loss_mlp": 0.01031908, + "balance_loss_clip": 1.01900983, + "balance_loss_mlp": 1.03571939, + "epoch": 0.8896437697279422, + "flos": 23221815050880.0, + "grad_norm": 1.8360219306966759, + "language_loss": 0.70659775, + "learning_rate": 1.263326468169843e-07, + "loss": 0.7279442, + "num_input_tokens_seen": 319082335, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.671875, + "step": 14797, + "time_per_iteration": 2.4758121967315674 + }, + { + "auxiliary_loss_clip": 0.01021915, + "auxiliary_loss_mlp": 0.01001904, + "balance_loss_clip": 1.00082493, + "balance_loss_mlp": 1.00191402, + "epoch": 0.8897038929806103, + "flos": 70752711882240.0, + "grad_norm": 0.7562531422498101, + "language_loss": 0.58068562, + "learning_rate": 1.2619645832080417e-07, + "loss": 0.60092378, + "num_input_tokens_seen": 319147075, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.19921875, + "step": 14798, + "time_per_iteration": 3.100543260574341 + }, + { + "auxiliary_loss_clip": 0.01099245, + "auxiliary_loss_mlp": 0.01024813, + "balance_loss_clip": 1.01277268, + "balance_loss_mlp": 1.03444302, + "epoch": 0.8897640162332782, + "flos": 19245103493760.0, + "grad_norm": 1.5369346664635186, + "language_loss": 0.79333103, + "learning_rate": 1.2606034087944251e-07, + "loss": 0.81457162, + "num_input_tokens_seen": 319166630, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 14799, + "time_per_iteration": 2.4709973335266113 + }, + { + "auxiliary_loss_clip": 0.01021995, + "auxiliary_loss_mlp": 0.00998421, + "balance_loss_clip": 0.99741381, + "balance_loss_mlp": 1.00197566, + "epoch": 0.8898241394859462, + "flos": 41356275039360.0, + "grad_norm": 0.8801583470464978, + "language_loss": 0.58083129, + "learning_rate": 1.2592429449806053e-07, + "loss": 0.60103536, + "num_input_tokens_seen": 319221865, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20019531, + "step": 14800, + "time_per_iteration": 3.0016472339630127 + }, + { + "auxiliary_loss_clip": 0.01100463, + "auxiliary_loss_mlp": 0.0102968, + "balance_loss_clip": 1.01881397, + "balance_loss_mlp": 1.03615224, + "epoch": 0.8898842627386142, + "flos": 18986295024000.0, + "grad_norm": 1.9195223698734736, + "language_loss": 0.65940589, + "learning_rate": 1.2578831918181698e-07, + "loss": 0.68070734, + "num_input_tokens_seen": 319240710, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 14801, + "time_per_iteration": 2.4564545154571533 + }, + { + "auxiliary_loss_clip": 0.01103883, + "auxiliary_loss_mlp": 0.01033929, + "balance_loss_clip": 1.02052987, + "balance_loss_mlp": 1.03634536, + "epoch": 0.8899443859912821, + "flos": 13217173868160.0, + "grad_norm": 2.3831690088780797, + "language_loss": 0.75702822, + "learning_rate": 1.256524149358682e-07, + "loss": 0.77840638, + "num_input_tokens_seen": 319256495, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.67578125, + "step": 14802, + "time_per_iteration": 2.4362783432006836 + }, + { + "auxiliary_loss_clip": 0.01098284, + "auxiliary_loss_mlp": 0.01029288, + "balance_loss_clip": 1.01845789, + "balance_loss_mlp": 1.03538465, + "epoch": 0.8900045092439501, + "flos": 22674680110080.0, + "grad_norm": 1.7447945915193968, + "language_loss": 0.73556334, + "learning_rate": 1.2551658176536805e-07, + "loss": 0.7568391, + "num_input_tokens_seen": 319273620, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 14803, + "time_per_iteration": 3.843716859817505 + }, + { + "auxiliary_loss_clip": 0.01097556, + "auxiliary_loss_mlp": 0.01031798, + "balance_loss_clip": 1.02025902, + "balance_loss_mlp": 1.0338726, + "epoch": 0.890064632496618, + "flos": 21141617685120.0, + "grad_norm": 2.2342057214139244, + "language_loss": 0.71535265, + "learning_rate": 1.2538081967546664e-07, + "loss": 0.73664618, + "num_input_tokens_seen": 319291720, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.63671875, + "step": 14804, + "time_per_iteration": 2.4600677490234375 + }, + { + "auxiliary_loss_clip": 0.01099154, + "auxiliary_loss_mlp": 0.01030025, + "balance_loss_clip": 1.01805639, + "balance_loss_mlp": 1.03340125, + "epoch": 0.8901247557492861, + "flos": 23397058529280.0, + "grad_norm": 1.679518955949807, + "language_loss": 0.81240398, + "learning_rate": 1.252451286713123e-07, + "loss": 0.83369577, + "num_input_tokens_seen": 319310380, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 14805, + "time_per_iteration": 5.3233935832977295 + }, + { + "auxiliary_loss_clip": 0.01101908, + "auxiliary_loss_mlp": 0.0103062, + "balance_loss_clip": 1.01857388, + "balance_loss_mlp": 1.03456831, + "epoch": 0.890184879001954, + "flos": 29169591477120.0, + "grad_norm": 1.9168109162120714, + "language_loss": 0.67573619, + "learning_rate": 1.251095087580505e-07, + "loss": 0.69706142, + "num_input_tokens_seen": 319331765, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 14806, + "time_per_iteration": 2.516892671585083 + }, + { + "auxiliary_loss_clip": 0.01097771, + "auxiliary_loss_mlp": 0.01029154, + "balance_loss_clip": 1.01762652, + "balance_loss_mlp": 1.03334141, + "epoch": 0.890245002254622, + "flos": 14427830793600.0, + "grad_norm": 1.8087810947787646, + "language_loss": 0.66934985, + "learning_rate": 1.2497395994082438e-07, + "loss": 0.69061911, + "num_input_tokens_seen": 319349135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14807, + "time_per_iteration": 2.4431300163269043 + }, + { + "auxiliary_loss_clip": 0.0109679, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01656318, + "balance_loss_mlp": 1.03291702, + "epoch": 0.8903051255072899, + "flos": 22382187661440.0, + "grad_norm": 1.7392302676531743, + "language_loss": 0.75443882, + "learning_rate": 1.248384822247732e-07, + "loss": 0.77567983, + "num_input_tokens_seen": 319368410, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 14808, + "time_per_iteration": 2.4573440551757812 + }, + { + "auxiliary_loss_clip": 0.01099351, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.02105141, + "balance_loss_mlp": 1.03359127, + "epoch": 0.8903652487599579, + "flos": 20777375819520.0, + "grad_norm": 1.8850733161628792, + "language_loss": 0.81599617, + "learning_rate": 1.2470307561503513e-07, + "loss": 0.83731276, + "num_input_tokens_seen": 319387535, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 14809, + "time_per_iteration": 2.49157452583313 + }, + { + "auxiliary_loss_clip": 0.01099477, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.01664197, + "balance_loss_mlp": 1.03431225, + "epoch": 0.8904253720126258, + "flos": 24424499157120.0, + "grad_norm": 1.7329886476679317, + "language_loss": 0.68297541, + "learning_rate": 1.2456774011674442e-07, + "loss": 0.70424849, + "num_input_tokens_seen": 319407210, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 14810, + "time_per_iteration": 4.0772764682769775 + }, + { + "auxiliary_loss_clip": 0.01099319, + "auxiliary_loss_mlp": 0.01026645, + "balance_loss_clip": 1.01500988, + "balance_loss_mlp": 1.03268421, + "epoch": 0.8904854952652939, + "flos": 19463871277440.0, + "grad_norm": 2.030728566700246, + "language_loss": 0.69870633, + "learning_rate": 1.2443247573503257e-07, + "loss": 0.71996593, + "num_input_tokens_seen": 319425340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 14811, + "time_per_iteration": 2.458737850189209 + }, + { + "auxiliary_loss_clip": 0.01101881, + "auxiliary_loss_mlp": 0.01030049, + "balance_loss_clip": 1.01853955, + "balance_loss_mlp": 1.03482771, + "epoch": 0.8905456185179618, + "flos": 50800741666560.0, + "grad_norm": 6.429512242388682, + "language_loss": 0.6537776, + "learning_rate": 1.2429728247502924e-07, + "loss": 0.67509687, + "num_input_tokens_seen": 319448150, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 14812, + "time_per_iteration": 2.740006685256958 + }, + { + "auxiliary_loss_clip": 0.01097646, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.01789427, + "balance_loss_mlp": 1.03355992, + "epoch": 0.8906057417706298, + "flos": 17784867893760.0, + "grad_norm": 1.6667342349025365, + "language_loss": 0.68745792, + "learning_rate": 1.24162160341861e-07, + "loss": 0.70872366, + "num_input_tokens_seen": 319466115, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 14813, + "time_per_iteration": 2.4327640533447266 + }, + { + "auxiliary_loss_clip": 0.01105069, + "auxiliary_loss_mlp": 0.01033392, + "balance_loss_clip": 1.02008224, + "balance_loss_mlp": 1.03459501, + "epoch": 0.8906658650232978, + "flos": 21944867575680.0, + "grad_norm": 3.7354447140562157, + "language_loss": 0.75532061, + "learning_rate": 1.2402710934065198e-07, + "loss": 0.77670521, + "num_input_tokens_seen": 319485255, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.703125, + "step": 14814, + "time_per_iteration": 2.484541893005371 + }, + { + "auxiliary_loss_clip": 0.01100943, + "auxiliary_loss_mlp": 0.01026811, + "balance_loss_clip": 1.0147295, + "balance_loss_mlp": 1.03317893, + "epoch": 0.8907259882759657, + "flos": 21287810039040.0, + "grad_norm": 2.065309630726402, + "language_loss": 0.74279094, + "learning_rate": 1.2389212947652229e-07, + "loss": 0.76406848, + "num_input_tokens_seen": 319501800, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 14815, + "time_per_iteration": 2.465571165084839 + }, + { + "auxiliary_loss_clip": 0.01097426, + "auxiliary_loss_mlp": 0.01028489, + "balance_loss_clip": 1.01690221, + "balance_loss_mlp": 1.03356385, + "epoch": 0.8907861115286337, + "flos": 20120426023680.0, + "grad_norm": 1.9753473376305755, + "language_loss": 0.75420868, + "learning_rate": 1.237572207545914e-07, + "loss": 0.77546787, + "num_input_tokens_seen": 319520415, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 14816, + "time_per_iteration": 2.4814677238464355 + }, + { + "auxiliary_loss_clip": 0.01098854, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01619875, + "balance_loss_mlp": 1.03312755, + "epoch": 0.8908462347813016, + "flos": 20084156265600.0, + "grad_norm": 2.0793302281151655, + "language_loss": 0.77708268, + "learning_rate": 1.2362238317997476e-07, + "loss": 0.79834437, + "num_input_tokens_seen": 319538410, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14817, + "time_per_iteration": 2.486025333404541 + }, + { + "auxiliary_loss_clip": 0.01021999, + "auxiliary_loss_mlp": 0.00998991, + "balance_loss_clip": 0.99791193, + "balance_loss_mlp": 1.00199425, + "epoch": 0.8909063580339697, + "flos": 65503649790720.0, + "grad_norm": 0.754133836270162, + "language_loss": 0.56543994, + "learning_rate": 1.2348761675778517e-07, + "loss": 0.58564985, + "num_input_tokens_seen": 319602565, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.20019531, + "step": 14818, + "time_per_iteration": 3.1222634315490723 + }, + { + "auxiliary_loss_clip": 0.01099653, + "auxiliary_loss_mlp": 0.01030248, + "balance_loss_clip": 1.01856565, + "balance_loss_mlp": 1.03452563, + "epoch": 0.8909664812866376, + "flos": 29863062426240.0, + "grad_norm": 1.7280404234864395, + "language_loss": 0.64667571, + "learning_rate": 1.2335292149313325e-07, + "loss": 0.66797471, + "num_input_tokens_seen": 319624645, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14819, + "time_per_iteration": 2.5655226707458496 + }, + { + "auxiliary_loss_clip": 0.01101351, + "auxiliary_loss_mlp": 0.01029856, + "balance_loss_clip": 1.01726794, + "balance_loss_mlp": 1.03483844, + "epoch": 0.8910266045393056, + "flos": 25447127362560.0, + "grad_norm": 1.673521506671084, + "language_loss": 0.78504813, + "learning_rate": 1.2321829739112731e-07, + "loss": 0.80636024, + "num_input_tokens_seen": 319644040, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 14820, + "time_per_iteration": 2.4987428188323975 + }, + { + "auxiliary_loss_clip": 0.01100213, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.02140188, + "balance_loss_mlp": 1.03441, + "epoch": 0.8910867277919735, + "flos": 24499121662080.0, + "grad_norm": 1.8625788775928358, + "language_loss": 0.76595819, + "learning_rate": 1.2308374445687087e-07, + "loss": 0.78728414, + "num_input_tokens_seen": 319663930, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 14821, + "time_per_iteration": 2.47625470161438 + }, + { + "auxiliary_loss_clip": 0.01022043, + "auxiliary_loss_mlp": 0.01000344, + "balance_loss_clip": 0.99936658, + "balance_loss_mlp": 1.00216877, + "epoch": 0.8911468510446415, + "flos": 60688136856960.0, + "grad_norm": 0.7883315331563723, + "language_loss": 0.59294641, + "learning_rate": 1.2294926269546712e-07, + "loss": 0.61317027, + "num_input_tokens_seen": 319721245, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.19921875, + "step": 14822, + "time_per_iteration": 2.965127468109131 + }, + { + "auxiliary_loss_clip": 0.01099976, + "auxiliary_loss_mlp": 0.01028436, + "balance_loss_clip": 1.01686049, + "balance_loss_mlp": 1.03401423, + "epoch": 0.8912069742973094, + "flos": 25337492075520.0, + "grad_norm": 1.9811529785013153, + "language_loss": 0.68799651, + "learning_rate": 1.2281485211201515e-07, + "loss": 0.70928061, + "num_input_tokens_seen": 319741200, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 14823, + "time_per_iteration": 2.503521203994751 + }, + { + "auxiliary_loss_clip": 0.01096068, + "auxiliary_loss_mlp": 0.01028171, + "balance_loss_clip": 1.01657248, + "balance_loss_mlp": 1.03248489, + "epoch": 0.8912670975499775, + "flos": 18223516782720.0, + "grad_norm": 1.493967977658863, + "language_loss": 0.69340491, + "learning_rate": 1.2268051271161262e-07, + "loss": 0.71464735, + "num_input_tokens_seen": 319759265, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 14824, + "time_per_iteration": 2.4937334060668945 + }, + { + "auxiliary_loss_clip": 0.01100645, + "auxiliary_loss_mlp": 0.01031923, + "balance_loss_clip": 1.01948369, + "balance_loss_mlp": 1.03348267, + "epoch": 0.8913272208026454, + "flos": 26504481041280.0, + "grad_norm": 1.7886667473291846, + "language_loss": 0.70545679, + "learning_rate": 1.2254624449935303e-07, + "loss": 0.72678244, + "num_input_tokens_seen": 319777560, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14825, + "time_per_iteration": 2.4795172214508057 + }, + { + "auxiliary_loss_clip": 0.01097621, + "auxiliary_loss_mlp": 0.01029336, + "balance_loss_clip": 1.01749253, + "balance_loss_mlp": 1.03321981, + "epoch": 0.8913873440553134, + "flos": 18802324540800.0, + "grad_norm": 1.8753873126161282, + "language_loss": 0.71137297, + "learning_rate": 1.2241204748032786e-07, + "loss": 0.73264253, + "num_input_tokens_seen": 319794125, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 14826, + "time_per_iteration": 2.459636688232422 + }, + { + "auxiliary_loss_clip": 0.01097916, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.01506472, + "balance_loss_mlp": 1.03418994, + "epoch": 0.8914474673079814, + "flos": 20884892204160.0, + "grad_norm": 1.9750957296989986, + "language_loss": 0.74912608, + "learning_rate": 1.2227792165962615e-07, + "loss": 0.77036595, + "num_input_tokens_seen": 319810310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 14827, + "time_per_iteration": 2.429797410964966 + }, + { + "auxiliary_loss_clip": 0.01099273, + "auxiliary_loss_mlp": 0.01027727, + "balance_loss_clip": 1.01571679, + "balance_loss_mlp": 1.03379297, + "epoch": 0.8915075905606493, + "flos": 20952439729920.0, + "grad_norm": 1.6925399195324096, + "language_loss": 0.78210777, + "learning_rate": 1.221438670423336e-07, + "loss": 0.80337775, + "num_input_tokens_seen": 319828505, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 14828, + "time_per_iteration": 2.4611334800720215 + }, + { + "auxiliary_loss_clip": 0.01098983, + "auxiliary_loss_mlp": 0.01030227, + "balance_loss_clip": 1.0185442, + "balance_loss_mlp": 1.03426635, + "epoch": 0.8915677138133173, + "flos": 23076305055360.0, + "grad_norm": 1.608358893281869, + "language_loss": 0.75332123, + "learning_rate": 1.2200988363353392e-07, + "loss": 0.77461332, + "num_input_tokens_seen": 319848680, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 14829, + "time_per_iteration": 2.450355291366577 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01034705, + "balance_loss_clip": 1.0239166, + "balance_loss_mlp": 1.03289604, + "epoch": 0.8916278370659853, + "flos": 23440259612160.0, + "grad_norm": 1.535946632179645, + "language_loss": 0.84532714, + "learning_rate": 1.2187597143830773e-07, + "loss": 0.86666012, + "num_input_tokens_seen": 319868835, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65625, + "step": 14830, + "time_per_iteration": 2.5008816719055176 + }, + { + "auxiliary_loss_clip": 0.01096274, + "auxiliary_loss_mlp": 0.01024693, + "balance_loss_clip": 1.01413131, + "balance_loss_mlp": 1.03340077, + "epoch": 0.8916879603186533, + "flos": 25160488830720.0, + "grad_norm": 1.4505342083014159, + "language_loss": 0.74674547, + "learning_rate": 1.2174213046173299e-07, + "loss": 0.76795518, + "num_input_tokens_seen": 319891585, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 14831, + "time_per_iteration": 2.52681827545166 + }, + { + "auxiliary_loss_clip": 0.01100331, + "auxiliary_loss_mlp": 0.01027206, + "balance_loss_clip": 1.01556492, + "balance_loss_mlp": 1.03387928, + "epoch": 0.8917480835713212, + "flos": 20229845829120.0, + "grad_norm": 1.6745380328604238, + "language_loss": 0.72861183, + "learning_rate": 1.216083607088847e-07, + "loss": 0.74988717, + "num_input_tokens_seen": 319910315, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 14832, + "time_per_iteration": 2.4757449626922607 + }, + { + "auxiliary_loss_clip": 0.01100323, + "auxiliary_loss_mlp": 0.01029449, + "balance_loss_clip": 1.01790977, + "balance_loss_mlp": 1.03276098, + "epoch": 0.8918082068239892, + "flos": 26101922342400.0, + "grad_norm": 1.7806350383888931, + "language_loss": 0.66921455, + "learning_rate": 1.214746621848355e-07, + "loss": 0.6905123, + "num_input_tokens_seen": 319932275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 14833, + "time_per_iteration": 2.486619234085083 + }, + { + "auxiliary_loss_clip": 0.01103899, + "auxiliary_loss_mlp": 0.01030365, + "balance_loss_clip": 1.01791346, + "balance_loss_mlp": 1.03564548, + "epoch": 0.8918683300766571, + "flos": 24831439315200.0, + "grad_norm": 1.6026570762407482, + "language_loss": 0.73980582, + "learning_rate": 1.2134103489465575e-07, + "loss": 0.76114845, + "num_input_tokens_seen": 319955335, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.68359375, + "step": 14834, + "time_per_iteration": 2.5816140174865723 + }, + { + "auxiliary_loss_clip": 0.01099178, + "auxiliary_loss_mlp": 0.01030289, + "balance_loss_clip": 1.01897621, + "balance_loss_mlp": 1.03406143, + "epoch": 0.8919284533293251, + "flos": 22305158945280.0, + "grad_norm": 2.0716513864685107, + "language_loss": 0.78957003, + "learning_rate": 1.2120747884341188e-07, + "loss": 0.81086469, + "num_input_tokens_seen": 319973990, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14835, + "time_per_iteration": 2.4538369178771973 + }, + { + "auxiliary_loss_clip": 0.01095585, + "auxiliary_loss_mlp": 0.0102702, + "balance_loss_clip": 1.01586795, + "balance_loss_mlp": 1.03217602, + "epoch": 0.891988576581993, + "flos": 30373532559360.0, + "grad_norm": 1.3453069661779542, + "language_loss": 0.73707056, + "learning_rate": 1.210739940361689e-07, + "loss": 0.75829661, + "num_input_tokens_seen": 319995555, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 14836, + "time_per_iteration": 2.5771117210388184 + }, + { + "auxiliary_loss_clip": 0.0109794, + "auxiliary_loss_mlp": 0.01031984, + "balance_loss_clip": 1.02038467, + "balance_loss_mlp": 1.03253198, + "epoch": 0.8920486998346611, + "flos": 15552947479680.0, + "grad_norm": 2.043235250771678, + "language_loss": 0.68709385, + "learning_rate": 1.2094058047798838e-07, + "loss": 0.7083931, + "num_input_tokens_seen": 320012385, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14837, + "time_per_iteration": 2.414586305618286 + }, + { + "auxiliary_loss_clip": 0.01103306, + "auxiliary_loss_mlp": 0.01029674, + "balance_loss_clip": 1.01693094, + "balance_loss_mlp": 1.03462231, + "epoch": 0.892108823087329, + "flos": 21214983214080.0, + "grad_norm": 1.9615265471061178, + "language_loss": 0.6747911, + "learning_rate": 1.2080723817392913e-07, + "loss": 0.69612092, + "num_input_tokens_seen": 320032390, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 14838, + "time_per_iteration": 2.4969213008880615 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.01026647, + "balance_loss_clip": 1.01435709, + "balance_loss_mlp": 1.03337884, + "epoch": 0.892168946339997, + "flos": 21978982517760.0, + "grad_norm": 1.977366243331741, + "language_loss": 0.76072603, + "learning_rate": 1.2067396712904777e-07, + "loss": 0.78198999, + "num_input_tokens_seen": 320052885, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 14839, + "time_per_iteration": 2.4536030292510986 + }, + { + "auxiliary_loss_clip": 0.01022037, + "auxiliary_loss_mlp": 0.01000199, + "balance_loss_clip": 0.99922198, + "balance_loss_mlp": 1.0020833, + "epoch": 0.892229069592665, + "flos": 67475289277440.0, + "grad_norm": 0.6822740725500295, + "language_loss": 0.49385339, + "learning_rate": 1.205407673483978e-07, + "loss": 0.51407576, + "num_input_tokens_seen": 320113685, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.19921875, + "step": 14840, + "time_per_iteration": 3.0283010005950928 + }, + { + "auxiliary_loss_clip": 0.01103846, + "auxiliary_loss_mlp": 0.01028258, + "balance_loss_clip": 1.01541281, + "balance_loss_mlp": 1.03384066, + "epoch": 0.8922891928453329, + "flos": 19459561645440.0, + "grad_norm": 2.2683869685699505, + "language_loss": 0.64067227, + "learning_rate": 1.2040763883703074e-07, + "loss": 0.66199327, + "num_input_tokens_seen": 320130810, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69921875, + "step": 14841, + "time_per_iteration": 2.4450442790985107 + }, + { + "auxiliary_loss_clip": 0.01095053, + "auxiliary_loss_mlp": 0.01033419, + "balance_loss_clip": 1.02289844, + "balance_loss_mlp": 1.03297675, + "epoch": 0.8923493160980009, + "flos": 23367396873600.0, + "grad_norm": 1.605514543360149, + "language_loss": 0.686297, + "learning_rate": 1.2027458159999438e-07, + "loss": 0.70758176, + "num_input_tokens_seen": 320152170, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.62109375, + "step": 14842, + "time_per_iteration": 2.5407049655914307 + }, + { + "auxiliary_loss_clip": 0.01097557, + "auxiliary_loss_mlp": 0.01029792, + "balance_loss_clip": 1.01925397, + "balance_loss_mlp": 1.0342983, + "epoch": 0.8924094393506689, + "flos": 26177047637760.0, + "grad_norm": 1.8531743729129386, + "language_loss": 0.79840702, + "learning_rate": 1.2014159564233373e-07, + "loss": 0.81968051, + "num_input_tokens_seen": 320172360, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14843, + "time_per_iteration": 2.482599973678589 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01031096, + "balance_loss_clip": 1.01853776, + "balance_loss_mlp": 1.03437209, + "epoch": 0.8924695626033369, + "flos": 22018520413440.0, + "grad_norm": 1.9570611228190977, + "language_loss": 0.68831146, + "learning_rate": 1.2000868096909257e-07, + "loss": 0.70964074, + "num_input_tokens_seen": 320192130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 14844, + "time_per_iteration": 2.475032091140747 + }, + { + "auxiliary_loss_clip": 0.01101274, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.02129424, + "balance_loss_mlp": 1.03532469, + "epoch": 0.8925296858560048, + "flos": 14793940166400.0, + "grad_norm": 1.8946118282729945, + "language_loss": 0.91013724, + "learning_rate": 1.1987583758531038e-07, + "loss": 0.93147469, + "num_input_tokens_seen": 320207760, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66015625, + "step": 14845, + "time_per_iteration": 3.7998101711273193 + }, + { + "auxiliary_loss_clip": 0.01098517, + "auxiliary_loss_mlp": 0.01025686, + "balance_loss_clip": 1.01481438, + "balance_loss_mlp": 1.03497481, + "epoch": 0.8925898091086728, + "flos": 22346636175360.0, + "grad_norm": 2.0813113286417555, + "language_loss": 0.72576404, + "learning_rate": 1.1974306549602476e-07, + "loss": 0.74700606, + "num_input_tokens_seen": 320225325, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14846, + "time_per_iteration": 2.474081039428711 + }, + { + "auxiliary_loss_clip": 0.01101498, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.02219105, + "balance_loss_mlp": 1.03516674, + "epoch": 0.8926499323613407, + "flos": 45806322067200.0, + "grad_norm": 1.825993599740926, + "language_loss": 0.57318634, + "learning_rate": 1.1961036470627094e-07, + "loss": 0.59454143, + "num_input_tokens_seen": 320247645, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 14847, + "time_per_iteration": 4.101036071777344 + }, + { + "auxiliary_loss_clip": 0.01099025, + "auxiliary_loss_mlp": 0.01030816, + "balance_loss_clip": 1.02008104, + "balance_loss_mlp": 1.03349578, + "epoch": 0.8927100556140087, + "flos": 22127042378880.0, + "grad_norm": 1.8027724109005723, + "language_loss": 0.76794285, + "learning_rate": 1.1947773522108052e-07, + "loss": 0.78924131, + "num_input_tokens_seen": 320266005, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 14848, + "time_per_iteration": 2.5357553958892822 + }, + { + "auxiliary_loss_clip": 0.01098164, + "auxiliary_loss_mlp": 0.01027847, + "balance_loss_clip": 1.01653409, + "balance_loss_mlp": 1.03388548, + "epoch": 0.8927701788666766, + "flos": 28330143655680.0, + "grad_norm": 2.852551904806777, + "language_loss": 0.69231212, + "learning_rate": 1.1934517704548251e-07, + "loss": 0.71357226, + "num_input_tokens_seen": 320285555, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 14849, + "time_per_iteration": 2.5289762020111084 + }, + { + "auxiliary_loss_clip": 0.01102332, + "auxiliary_loss_mlp": 0.01033428, + "balance_loss_clip": 1.02210355, + "balance_loss_mlp": 1.03686213, + "epoch": 0.8928303021193447, + "flos": 25294973351040.0, + "grad_norm": 1.5363403291321316, + "language_loss": 0.80896437, + "learning_rate": 1.1921269018450364e-07, + "loss": 0.83032203, + "num_input_tokens_seen": 320305395, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14850, + "time_per_iteration": 2.539560556411743 + }, + { + "auxiliary_loss_clip": 0.01097951, + "auxiliary_loss_mlp": 0.01033461, + "balance_loss_clip": 1.02183247, + "balance_loss_mlp": 1.03397167, + "epoch": 0.8928904253720126, + "flos": 22236713579520.0, + "grad_norm": 1.4972669302776855, + "language_loss": 0.75046718, + "learning_rate": 1.1908027464316872e-07, + "loss": 0.77178133, + "num_input_tokens_seen": 320324220, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 14851, + "time_per_iteration": 3.946723699569702 + }, + { + "auxiliary_loss_clip": 0.01097886, + "auxiliary_loss_mlp": 0.01028153, + "balance_loss_clip": 1.01645279, + "balance_loss_mlp": 1.03404009, + "epoch": 0.8929505486246806, + "flos": 27092374940160.0, + "grad_norm": 1.6306137064929098, + "language_loss": 0.78424543, + "learning_rate": 1.1894793042649775e-07, + "loss": 0.80550581, + "num_input_tokens_seen": 320347195, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 14852, + "time_per_iteration": 2.588900089263916 + }, + { + "auxiliary_loss_clip": 0.01097941, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01950192, + "balance_loss_mlp": 1.03595543, + "epoch": 0.8930106718773486, + "flos": 23039352938880.0, + "grad_norm": 1.4048830284686333, + "language_loss": 0.69412851, + "learning_rate": 1.1881565753951006e-07, + "loss": 0.71541065, + "num_input_tokens_seen": 320366850, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62109375, + "step": 14853, + "time_per_iteration": 2.492919921875 + }, + { + "auxiliary_loss_clip": 0.01100668, + "auxiliary_loss_mlp": 0.01031135, + "balance_loss_clip": 1.01944065, + "balance_loss_mlp": 1.03522491, + "epoch": 0.8930707951300165, + "flos": 35626652887680.0, + "grad_norm": 1.7034933673051655, + "language_loss": 0.67261219, + "learning_rate": 1.1868345598722118e-07, + "loss": 0.69393027, + "num_input_tokens_seen": 320388895, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 14854, + "time_per_iteration": 2.6161773204803467 + }, + { + "auxiliary_loss_clip": 0.01096124, + "auxiliary_loss_mlp": 0.01028012, + "balance_loss_clip": 1.01752734, + "balance_loss_mlp": 1.03351092, + "epoch": 0.8931309183826845, + "flos": 23039891642880.0, + "grad_norm": 1.452507573496769, + "language_loss": 0.74611282, + "learning_rate": 1.1855132577464399e-07, + "loss": 0.76735425, + "num_input_tokens_seen": 320408520, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.625, + "step": 14855, + "time_per_iteration": 2.473764657974243 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01030131, + "balance_loss_clip": 1.01865709, + "balance_loss_mlp": 1.0337348, + "epoch": 0.8931910416353525, + "flos": 26504624695680.0, + "grad_norm": 1.9027124813935195, + "language_loss": 0.64368689, + "learning_rate": 1.1841926690678893e-07, + "loss": 0.66496962, + "num_input_tokens_seen": 320427400, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.64453125, + "step": 14856, + "time_per_iteration": 2.532707691192627 + }, + { + "auxiliary_loss_clip": 0.01098751, + "auxiliary_loss_mlp": 0.01027284, + "balance_loss_clip": 1.01631689, + "balance_loss_mlp": 1.03341556, + "epoch": 0.8932511648880205, + "flos": 24973609345920.0, + "grad_norm": 1.8696512418627556, + "language_loss": 0.66240281, + "learning_rate": 1.1828727938866378e-07, + "loss": 0.68366313, + "num_input_tokens_seen": 320447570, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14857, + "time_per_iteration": 2.4812355041503906 + }, + { + "auxiliary_loss_clip": 0.01102247, + "auxiliary_loss_mlp": 0.01031555, + "balance_loss_clip": 1.02011704, + "balance_loss_mlp": 1.0357101, + "epoch": 0.8933112881406884, + "flos": 24460733001600.0, + "grad_norm": 2.3462499304119415, + "language_loss": 0.75313234, + "learning_rate": 1.1815536322527408e-07, + "loss": 0.77447033, + "num_input_tokens_seen": 320464405, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 14858, + "time_per_iteration": 2.4967639446258545 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01029184, + "balance_loss_clip": 1.01716173, + "balance_loss_mlp": 1.03381801, + "epoch": 0.8933714113933564, + "flos": 28293083798400.0, + "grad_norm": 1.749677970064563, + "language_loss": 0.69162208, + "learning_rate": 1.1802351842162139e-07, + "loss": 0.71289968, + "num_input_tokens_seen": 320485525, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 14859, + "time_per_iteration": 2.504290819168091 + }, + { + "auxiliary_loss_clip": 0.01093256, + "auxiliary_loss_mlp": 0.01026074, + "balance_loss_clip": 1.01584625, + "balance_loss_mlp": 1.03289175, + "epoch": 0.8934315346460243, + "flos": 21434864319360.0, + "grad_norm": 1.7863861979655313, + "language_loss": 0.75433087, + "learning_rate": 1.1789174498270526e-07, + "loss": 0.77552414, + "num_input_tokens_seen": 320506725, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 14860, + "time_per_iteration": 2.479966402053833 + }, + { + "auxiliary_loss_clip": 0.01102206, + "auxiliary_loss_mlp": 0.0103045, + "balance_loss_clip": 1.0180645, + "balance_loss_mlp": 1.03548205, + "epoch": 0.8934916578986923, + "flos": 23769596436480.0, + "grad_norm": 2.336029575980188, + "language_loss": 0.57421482, + "learning_rate": 1.1776004291352303e-07, + "loss": 0.59554136, + "num_input_tokens_seen": 320525425, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 14861, + "time_per_iteration": 2.4453883171081543 + }, + { + "auxiliary_loss_clip": 0.01097311, + "auxiliary_loss_mlp": 0.01030271, + "balance_loss_clip": 1.0188086, + "balance_loss_mlp": 1.03289747, + "epoch": 0.8935517811513602, + "flos": 18916161719040.0, + "grad_norm": 1.9703362787241279, + "language_loss": 0.63988757, + "learning_rate": 1.176284122190685e-07, + "loss": 0.66116345, + "num_input_tokens_seen": 320543010, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 14862, + "time_per_iteration": 2.4691827297210693 + }, + { + "auxiliary_loss_clip": 0.0109601, + "auxiliary_loss_mlp": 0.01026536, + "balance_loss_clip": 1.01505589, + "balance_loss_mlp": 1.03218484, + "epoch": 0.8936119044040283, + "flos": 24061370613120.0, + "grad_norm": 1.5823454170060147, + "language_loss": 0.77867645, + "learning_rate": 1.1749685290433298e-07, + "loss": 0.7999019, + "num_input_tokens_seen": 320562180, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 14863, + "time_per_iteration": 2.540869951248169 + }, + { + "auxiliary_loss_clip": 0.01094615, + "auxiliary_loss_mlp": 0.01024455, + "balance_loss_clip": 1.01424432, + "balance_loss_mlp": 1.0320065, + "epoch": 0.8936720276566962, + "flos": 21324079797120.0, + "grad_norm": 1.9517641145653177, + "language_loss": 0.70929408, + "learning_rate": 1.1736536497430627e-07, + "loss": 0.73048472, + "num_input_tokens_seen": 320580395, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.62890625, + "step": 14864, + "time_per_iteration": 2.5036158561706543 + }, + { + "auxiliary_loss_clip": 0.01107034, + "auxiliary_loss_mlp": 0.01033263, + "balance_loss_clip": 1.021366, + "balance_loss_mlp": 1.03713703, + "epoch": 0.8937321509093642, + "flos": 18406122549120.0, + "grad_norm": 1.960140962390111, + "language_loss": 0.75742739, + "learning_rate": 1.1723394843397283e-07, + "loss": 0.77883035, + "num_input_tokens_seen": 320599505, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.69921875, + "step": 14865, + "time_per_iteration": 2.442366123199463 + }, + { + "auxiliary_loss_clip": 0.01095846, + "auxiliary_loss_mlp": 0.01027973, + "balance_loss_clip": 1.017483, + "balance_loss_mlp": 1.03252757, + "epoch": 0.8937922741620322, + "flos": 22054754257920.0, + "grad_norm": 1.6471559543699055, + "language_loss": 0.71687293, + "learning_rate": 1.1710260328831668e-07, + "loss": 0.73811114, + "num_input_tokens_seen": 320619825, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 14866, + "time_per_iteration": 2.5246763229370117 + }, + { + "auxiliary_loss_clip": 0.01104023, + "auxiliary_loss_mlp": 0.01028647, + "balance_loss_clip": 1.01588607, + "balance_loss_mlp": 1.03644776, + "epoch": 0.8938523974147001, + "flos": 25664386775040.0, + "grad_norm": 1.8105973277463203, + "language_loss": 0.83971083, + "learning_rate": 1.1697132954231869e-07, + "loss": 0.86103749, + "num_input_tokens_seen": 320638515, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.67578125, + "step": 14867, + "time_per_iteration": 2.4837403297424316 + }, + { + "auxiliary_loss_clip": 0.01098392, + "auxiliary_loss_mlp": 0.01027811, + "balance_loss_clip": 1.01751125, + "balance_loss_mlp": 1.03336859, + "epoch": 0.8939125206673681, + "flos": 25742852035200.0, + "grad_norm": 1.5257308716937024, + "language_loss": 0.80485952, + "learning_rate": 1.168401272009567e-07, + "loss": 0.82612157, + "num_input_tokens_seen": 320659430, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.6484375, + "step": 14868, + "time_per_iteration": 2.539396047592163 + }, + { + "auxiliary_loss_clip": 0.01100509, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01893783, + "balance_loss_mlp": 1.03468442, + "epoch": 0.8939726439200361, + "flos": 27344503480320.0, + "grad_norm": 1.73703480181996, + "language_loss": 0.77222109, + "learning_rate": 1.167089962692056e-07, + "loss": 0.79353189, + "num_input_tokens_seen": 320679295, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 14869, + "time_per_iteration": 2.503376007080078 + }, + { + "auxiliary_loss_clip": 0.01099024, + "auxiliary_loss_mlp": 0.01022264, + "balance_loss_clip": 1.01077819, + "balance_loss_mlp": 1.03436267, + "epoch": 0.8940327671727041, + "flos": 20338834671360.0, + "grad_norm": 1.4728974184814814, + "language_loss": 0.6547929, + "learning_rate": 1.1657793675203853e-07, + "loss": 0.67600584, + "num_input_tokens_seen": 320697535, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 14870, + "time_per_iteration": 2.4696993827819824 + }, + { + "auxiliary_loss_clip": 0.01021955, + "auxiliary_loss_mlp": 0.01003959, + "balance_loss_clip": 1.00302875, + "balance_loss_mlp": 1.00201225, + "epoch": 0.894092890425372, + "flos": 58410573235200.0, + "grad_norm": 0.8434452713885856, + "language_loss": 0.55948913, + "learning_rate": 1.1644694865442461e-07, + "loss": 0.57974827, + "num_input_tokens_seen": 320758635, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.19921875, + "step": 14871, + "time_per_iteration": 3.098759412765503 + }, + { + "auxiliary_loss_clip": 0.01098394, + "auxiliary_loss_mlp": 0.01032324, + "balance_loss_clip": 1.02132666, + "balance_loss_mlp": 1.03488946, + "epoch": 0.89415301367804, + "flos": 19829657427840.0, + "grad_norm": 1.8033147229452833, + "language_loss": 0.76229548, + "learning_rate": 1.16316031981331e-07, + "loss": 0.78360265, + "num_input_tokens_seen": 320777175, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 14872, + "time_per_iteration": 2.4757678508758545 + }, + { + "auxiliary_loss_clip": 0.01095042, + "auxiliary_loss_mlp": 0.01026607, + "balance_loss_clip": 1.0163548, + "balance_loss_mlp": 1.03337288, + "epoch": 0.8942131369307079, + "flos": 25775781828480.0, + "grad_norm": 1.490700807753622, + "language_loss": 0.66794723, + "learning_rate": 1.1618518673772215e-07, + "loss": 0.68916368, + "num_input_tokens_seen": 320797670, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6171875, + "step": 14873, + "time_per_iteration": 2.5868990421295166 + }, + { + "auxiliary_loss_clip": 0.01096304, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.02131701, + "balance_loss_mlp": 1.03331888, + "epoch": 0.8942732601833759, + "flos": 23149024139520.0, + "grad_norm": 1.55215288470212, + "language_loss": 0.59791553, + "learning_rate": 1.1605441292856033e-07, + "loss": 0.61920649, + "num_input_tokens_seen": 320817410, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 14874, + "time_per_iteration": 2.4845948219299316 + }, + { + "auxiliary_loss_clip": 0.01104539, + "auxiliary_loss_mlp": 0.01030928, + "balance_loss_clip": 1.01850021, + "balance_loss_mlp": 1.03692889, + "epoch": 0.8943333834360438, + "flos": 27855548231040.0, + "grad_norm": 1.8608507472011937, + "language_loss": 0.75573874, + "learning_rate": 1.1592371055880356e-07, + "loss": 0.77709341, + "num_input_tokens_seen": 320836745, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 14875, + "time_per_iteration": 2.552445650100708 + }, + { + "auxiliary_loss_clip": 0.01106238, + "auxiliary_loss_mlp": 0.01031424, + "balance_loss_clip": 1.01806641, + "balance_loss_mlp": 1.03644109, + "epoch": 0.8943935066887119, + "flos": 22163958581760.0, + "grad_norm": 1.7553065513486439, + "language_loss": 0.77431512, + "learning_rate": 1.1579307963340857e-07, + "loss": 0.79569167, + "num_input_tokens_seen": 320853305, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69921875, + "step": 14876, + "time_per_iteration": 2.479843854904175 + }, + { + "auxiliary_loss_clip": 0.01097857, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01707911, + "balance_loss_mlp": 1.03393304, + "epoch": 0.8944536299413798, + "flos": 21470056669440.0, + "grad_norm": 1.7034619965415823, + "language_loss": 0.78767753, + "learning_rate": 1.156625201573287e-07, + "loss": 0.80893332, + "num_input_tokens_seen": 320872885, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 14877, + "time_per_iteration": 2.468996524810791 + }, + { + "auxiliary_loss_clip": 0.01099186, + "auxiliary_loss_mlp": 0.01028425, + "balance_loss_clip": 1.01641452, + "balance_loss_mlp": 1.03482389, + "epoch": 0.8945137531940478, + "flos": 17748777703680.0, + "grad_norm": 2.091963059351132, + "language_loss": 0.7505362, + "learning_rate": 1.155320321355151e-07, + "loss": 0.77181232, + "num_input_tokens_seen": 320889755, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 14878, + "time_per_iteration": 2.4732110500335693 + }, + { + "auxiliary_loss_clip": 0.0109821, + "auxiliary_loss_mlp": 0.01027029, + "balance_loss_clip": 1.01513195, + "balance_loss_mlp": 1.03284216, + "epoch": 0.8945738764467158, + "flos": 21142264129920.0, + "grad_norm": 1.5818118960503171, + "language_loss": 0.76242149, + "learning_rate": 1.1540161557291539e-07, + "loss": 0.78367388, + "num_input_tokens_seen": 320907860, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 14879, + "time_per_iteration": 2.5296013355255127 + }, + { + "auxiliary_loss_clip": 0.0110191, + "auxiliary_loss_mlp": 0.01030173, + "balance_loss_clip": 1.01895523, + "balance_loss_mlp": 1.03676414, + "epoch": 0.8946339996993837, + "flos": 14903000835840.0, + "grad_norm": 1.8977007401222414, + "language_loss": 0.7420851, + "learning_rate": 1.1527127047447538e-07, + "loss": 0.76340598, + "num_input_tokens_seen": 320925825, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 14880, + "time_per_iteration": 2.501164197921753 + }, + { + "auxiliary_loss_clip": 0.01099433, + "auxiliary_loss_mlp": 0.01025992, + "balance_loss_clip": 1.01408911, + "balance_loss_mlp": 1.03427339, + "epoch": 0.8946941229520518, + "flos": 27382173868800.0, + "grad_norm": 1.5190919090163466, + "language_loss": 0.82769126, + "learning_rate": 1.1514099684513822e-07, + "loss": 0.8489455, + "num_input_tokens_seen": 320946165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14881, + "time_per_iteration": 2.5418641567230225 + }, + { + "auxiliary_loss_clip": 0.01095788, + "auxiliary_loss_mlp": 0.0102559, + "balance_loss_clip": 1.01472986, + "balance_loss_mlp": 1.03236985, + "epoch": 0.8947542462047197, + "flos": 31796277338880.0, + "grad_norm": 1.649013399005573, + "language_loss": 0.67482835, + "learning_rate": 1.1501079468984287e-07, + "loss": 0.69604212, + "num_input_tokens_seen": 320969330, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14882, + "time_per_iteration": 2.564490795135498 + }, + { + "auxiliary_loss_clip": 0.01104448, + "auxiliary_loss_mlp": 0.01028251, + "balance_loss_clip": 1.01529944, + "balance_loss_mlp": 1.03529155, + "epoch": 0.8948143694573877, + "flos": 20883599314560.0, + "grad_norm": 2.429036760271906, + "language_loss": 0.75044572, + "learning_rate": 1.1488066401352691e-07, + "loss": 0.77177274, + "num_input_tokens_seen": 320985055, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69140625, + "step": 14883, + "time_per_iteration": 2.4705231189727783 + }, + { + "auxiliary_loss_clip": 0.01094799, + "auxiliary_loss_mlp": 0.01030489, + "balance_loss_clip": 1.01948595, + "balance_loss_mlp": 1.0331018, + "epoch": 0.8948744927100556, + "flos": 28215552291840.0, + "grad_norm": 1.5479121548537522, + "language_loss": 0.72337794, + "learning_rate": 1.147506048211253e-07, + "loss": 0.74463081, + "num_input_tokens_seen": 321004720, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 14884, + "time_per_iteration": 2.5401506423950195 + }, + { + "auxiliary_loss_clip": 0.01094217, + "auxiliary_loss_mlp": 0.01025329, + "balance_loss_clip": 1.01457047, + "balance_loss_mlp": 1.03127992, + "epoch": 0.8949346159627236, + "flos": 21902672073600.0, + "grad_norm": 1.5118040441368576, + "language_loss": 0.75339627, + "learning_rate": 1.1462061711756987e-07, + "loss": 0.77459168, + "num_input_tokens_seen": 321022350, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62890625, + "step": 14885, + "time_per_iteration": 2.4566526412963867 + }, + { + "auxiliary_loss_clip": 0.01099303, + "auxiliary_loss_mlp": 0.01028817, + "balance_loss_clip": 1.01721168, + "balance_loss_mlp": 1.03246248, + "epoch": 0.8949947392153915, + "flos": 21359128492800.0, + "grad_norm": 2.3103597790279053, + "language_loss": 0.81585598, + "learning_rate": 1.1449070090778911e-07, + "loss": 0.83713722, + "num_input_tokens_seen": 321040450, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 14886, + "time_per_iteration": 2.4688005447387695 + }, + { + "auxiliary_loss_clip": 0.01099421, + "auxiliary_loss_mlp": 0.01027155, + "balance_loss_clip": 1.01639688, + "balance_loss_mlp": 1.03421152, + "epoch": 0.8950548624680595, + "flos": 52445342799360.0, + "grad_norm": 1.478515652092319, + "language_loss": 0.63619804, + "learning_rate": 1.1436085619671043e-07, + "loss": 0.65746379, + "num_input_tokens_seen": 321063970, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 14887, + "time_per_iteration": 4.133228302001953 + }, + { + "auxiliary_loss_clip": 0.01102604, + "auxiliary_loss_mlp": 0.01032867, + "balance_loss_clip": 1.02122581, + "balance_loss_mlp": 1.03541362, + "epoch": 0.8951149857207275, + "flos": 20121323863680.0, + "grad_norm": 1.7844722249086462, + "language_loss": 0.61070365, + "learning_rate": 1.1423108298925698e-07, + "loss": 0.63205838, + "num_input_tokens_seen": 321083840, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 14888, + "time_per_iteration": 3.9138073921203613 + }, + { + "auxiliary_loss_clip": 0.011003, + "auxiliary_loss_mlp": 0.0102525, + "balance_loss_clip": 1.01431835, + "balance_loss_mlp": 1.03365338, + "epoch": 0.8951751089733955, + "flos": 29862631463040.0, + "grad_norm": 1.7551662985764278, + "language_loss": 0.69682604, + "learning_rate": 1.1410138129034952e-07, + "loss": 0.71808153, + "num_input_tokens_seen": 321104165, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 14889, + "time_per_iteration": 3.9413297176361084 + }, + { + "auxiliary_loss_clip": 0.01102211, + "auxiliary_loss_mlp": 0.0102801, + "balance_loss_clip": 1.01619697, + "balance_loss_mlp": 1.03535187, + "epoch": 0.8952352322260634, + "flos": 15262789415040.0, + "grad_norm": 2.3512086286063614, + "language_loss": 0.70814884, + "learning_rate": 1.1397175110490676e-07, + "loss": 0.72945112, + "num_input_tokens_seen": 321117290, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 14890, + "time_per_iteration": 2.435168743133545 + }, + { + "auxiliary_loss_clip": 0.01098203, + "auxiliary_loss_mlp": 0.01025849, + "balance_loss_clip": 1.01420212, + "balance_loss_mlp": 1.0328474, + "epoch": 0.8952953554787314, + "flos": 26798338206720.0, + "grad_norm": 1.5590643217837603, + "language_loss": 0.75952852, + "learning_rate": 1.1384219243784454e-07, + "loss": 0.78076905, + "num_input_tokens_seen": 321137115, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65234375, + "step": 14891, + "time_per_iteration": 2.477548122406006 + }, + { + "auxiliary_loss_clip": 0.01101231, + "auxiliary_loss_mlp": 0.01031497, + "balance_loss_clip": 1.0199697, + "balance_loss_mlp": 1.03322709, + "epoch": 0.8953554787313994, + "flos": 14137205852160.0, + "grad_norm": 1.8140090200899526, + "language_loss": 0.76758611, + "learning_rate": 1.1371270529407517e-07, + "loss": 0.78891343, + "num_input_tokens_seen": 321154490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 14892, + "time_per_iteration": 2.452353000640869 + }, + { + "auxiliary_loss_clip": 0.01098634, + "auxiliary_loss_mlp": 0.01030455, + "balance_loss_clip": 1.01936281, + "balance_loss_mlp": 1.03434777, + "epoch": 0.8954156019840673, + "flos": 25703314139520.0, + "grad_norm": 1.3207000845430072, + "language_loss": 0.81841969, + "learning_rate": 1.1358328967850895e-07, + "loss": 0.83971059, + "num_input_tokens_seen": 321175625, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 14893, + "time_per_iteration": 2.5077221393585205 + }, + { + "auxiliary_loss_clip": 0.0109668, + "auxiliary_loss_mlp": 0.01028564, + "balance_loss_clip": 1.01751983, + "balance_loss_mlp": 1.03423703, + "epoch": 0.8954757252367354, + "flos": 21907987286400.0, + "grad_norm": 11.57553130306276, + "language_loss": 0.74789113, + "learning_rate": 1.1345394559605348e-07, + "loss": 0.76914358, + "num_input_tokens_seen": 321193895, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 14894, + "time_per_iteration": 3.945729970932007 + }, + { + "auxiliary_loss_clip": 0.01102545, + "auxiliary_loss_mlp": 0.01032674, + "balance_loss_clip": 1.02041912, + "balance_loss_mlp": 1.03604054, + "epoch": 0.8955358484894033, + "flos": 12970396454400.0, + "grad_norm": 1.6244269958664943, + "language_loss": 0.66519237, + "learning_rate": 1.1332467305161352e-07, + "loss": 0.68654454, + "num_input_tokens_seen": 321211610, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 14895, + "time_per_iteration": 2.4295578002929688 + }, + { + "auxiliary_loss_clip": 0.01102129, + "auxiliary_loss_mlp": 0.01029088, + "balance_loss_clip": 1.01665497, + "balance_loss_mlp": 1.03526545, + "epoch": 0.8955959717420713, + "flos": 17273966797440.0, + "grad_norm": 1.8850172467985669, + "language_loss": 0.67215335, + "learning_rate": 1.1319547205009094e-07, + "loss": 0.69346553, + "num_input_tokens_seen": 321229805, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 14896, + "time_per_iteration": 2.441373586654663 + }, + { + "auxiliary_loss_clip": 0.01099805, + "auxiliary_loss_mlp": 0.0103214, + "balance_loss_clip": 1.02071393, + "balance_loss_mlp": 1.03478193, + "epoch": 0.8956560949947392, + "flos": 14793868339200.0, + "grad_norm": 1.7592330291905882, + "language_loss": 0.75651777, + "learning_rate": 1.1306634259638492e-07, + "loss": 0.77783716, + "num_input_tokens_seen": 321247165, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 14897, + "time_per_iteration": 2.45491361618042 + }, + { + "auxiliary_loss_clip": 0.01021895, + "auxiliary_loss_mlp": 0.01003334, + "balance_loss_clip": 1.00235045, + "balance_loss_mlp": 1.00189352, + "epoch": 0.8957162182474072, + "flos": 63607817957760.0, + "grad_norm": 0.7479901222096683, + "language_loss": 0.55332673, + "learning_rate": 1.129372846953931e-07, + "loss": 0.57357907, + "num_input_tokens_seen": 321308425, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20019531, + "step": 14898, + "time_per_iteration": 3.0941059589385986 + }, + { + "auxiliary_loss_clip": 0.01100232, + "auxiliary_loss_mlp": 0.01028995, + "balance_loss_clip": 1.01750898, + "balance_loss_mlp": 1.0343554, + "epoch": 0.8957763415000751, + "flos": 25009843190400.0, + "grad_norm": 1.407731502520021, + "language_loss": 0.7033121, + "learning_rate": 1.12808298352008e-07, + "loss": 0.72460437, + "num_input_tokens_seen": 321329295, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 14899, + "time_per_iteration": 2.500845432281494 + }, + { + "auxiliary_loss_clip": 0.01102543, + "auxiliary_loss_mlp": 0.01036157, + "balance_loss_clip": 1.02335942, + "balance_loss_mlp": 1.03636515, + "epoch": 0.8958364647527431, + "flos": 19828615933440.0, + "grad_norm": 2.007406117160179, + "language_loss": 0.73626882, + "learning_rate": 1.1267938357112106e-07, + "loss": 0.7576558, + "num_input_tokens_seen": 321347580, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66015625, + "step": 14900, + "time_per_iteration": 2.462517738342285 + }, + { + "auxiliary_loss_clip": 0.01022163, + "auxiliary_loss_mlp": 0.0100183, + "balance_loss_clip": 1.00092971, + "balance_loss_mlp": 1.00212479, + "epoch": 0.895896588005411, + "flos": 65537190115200.0, + "grad_norm": 0.7858702190089509, + "language_loss": 0.61846119, + "learning_rate": 1.1255054035762124e-07, + "loss": 0.63870108, + "num_input_tokens_seen": 321407820, + "router_z_loss_clip": 0.00897217, + "router_z_loss_mlp": 0.20019531, + "step": 14901, + "time_per_iteration": 3.0669608116149902 + }, + { + "auxiliary_loss_clip": 0.01099585, + "auxiliary_loss_mlp": 0.01028043, + "balance_loss_clip": 1.01636648, + "balance_loss_mlp": 1.03309727, + "epoch": 0.8959567112580791, + "flos": 25591021246080.0, + "grad_norm": 1.7072266384182382, + "language_loss": 0.70579618, + "learning_rate": 1.1242176871639441e-07, + "loss": 0.72707248, + "num_input_tokens_seen": 321426745, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14902, + "time_per_iteration": 2.4966113567352295 + }, + { + "auxiliary_loss_clip": 0.01095333, + "auxiliary_loss_mlp": 0.0102621, + "balance_loss_clip": 1.01517129, + "balance_loss_mlp": 1.03252649, + "epoch": 0.896016834510747, + "flos": 24201780877440.0, + "grad_norm": 1.666989732507148, + "language_loss": 0.78098989, + "learning_rate": 1.1229306865232313e-07, + "loss": 0.80220532, + "num_input_tokens_seen": 321446165, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62890625, + "step": 14903, + "time_per_iteration": 2.4643096923828125 + }, + { + "auxiliary_loss_clip": 0.01103263, + "auxiliary_loss_mlp": 0.01028383, + "balance_loss_clip": 1.01646781, + "balance_loss_mlp": 1.03564441, + "epoch": 0.896076957763415, + "flos": 23075945919360.0, + "grad_norm": 1.6349395372995028, + "language_loss": 0.72710371, + "learning_rate": 1.121644401702877e-07, + "loss": 0.74842012, + "num_input_tokens_seen": 321465285, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 14904, + "time_per_iteration": 2.476510524749756 + }, + { + "auxiliary_loss_clip": 0.01100675, + "auxiliary_loss_mlp": 0.01025875, + "balance_loss_clip": 1.01292312, + "balance_loss_mlp": 1.03407562, + "epoch": 0.8961370810160829, + "flos": 22236605838720.0, + "grad_norm": 2.003019000801922, + "language_loss": 0.74558008, + "learning_rate": 1.12035883275166e-07, + "loss": 0.76684558, + "num_input_tokens_seen": 321483670, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.66796875, + "step": 14905, + "time_per_iteration": 2.5374937057495117 + }, + { + "auxiliary_loss_clip": 0.01097255, + "auxiliary_loss_mlp": 0.0102924, + "balance_loss_clip": 1.01764691, + "balance_loss_mlp": 1.03354824, + "epoch": 0.8961972042687509, + "flos": 23072318645760.0, + "grad_norm": 1.5757434113327204, + "language_loss": 0.76621282, + "learning_rate": 1.1190739797183279e-07, + "loss": 0.78747779, + "num_input_tokens_seen": 321501190, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 14906, + "time_per_iteration": 2.4911582469940186 + }, + { + "auxiliary_loss_clip": 0.0109995, + "auxiliary_loss_mlp": 0.01030126, + "balance_loss_clip": 1.01864624, + "balance_loss_mlp": 1.03458023, + "epoch": 0.896257327521419, + "flos": 18185882307840.0, + "grad_norm": 1.9866469364584363, + "language_loss": 0.74468869, + "learning_rate": 1.1177898426515996e-07, + "loss": 0.76598948, + "num_input_tokens_seen": 321518540, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 14907, + "time_per_iteration": 2.4224627017974854 + }, + { + "auxiliary_loss_clip": 0.0109858, + "auxiliary_loss_mlp": 0.01032943, + "balance_loss_clip": 1.02196395, + "balance_loss_mlp": 1.03504455, + "epoch": 0.8963174507740869, + "flos": 17895472848000.0, + "grad_norm": 2.42177438179363, + "language_loss": 0.82961619, + "learning_rate": 1.1165064216001785e-07, + "loss": 0.85093141, + "num_input_tokens_seen": 321536555, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.63671875, + "step": 14908, + "time_per_iteration": 2.4348533153533936 + }, + { + "auxiliary_loss_clip": 0.01101575, + "auxiliary_loss_mlp": 0.01028145, + "balance_loss_clip": 1.01572299, + "balance_loss_mlp": 1.03415501, + "epoch": 0.8963775740267549, + "flos": 21032269706880.0, + "grad_norm": 1.7129159855777194, + "language_loss": 0.70255554, + "learning_rate": 1.1152237166127232e-07, + "loss": 0.72385275, + "num_input_tokens_seen": 321557655, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 14909, + "time_per_iteration": 2.458195924758911 + }, + { + "auxiliary_loss_clip": 0.01102257, + "auxiliary_loss_mlp": 0.01032834, + "balance_loss_clip": 1.02075171, + "balance_loss_mlp": 1.03573656, + "epoch": 0.8964376972794228, + "flos": 23179619548800.0, + "grad_norm": 1.709642155357814, + "language_loss": 0.72406387, + "learning_rate": 1.113941727737877e-07, + "loss": 0.74541485, + "num_input_tokens_seen": 321576160, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 14910, + "time_per_iteration": 2.4810500144958496 + }, + { + "auxiliary_loss_clip": 0.0109713, + "auxiliary_loss_mlp": 0.01026641, + "balance_loss_clip": 1.01558399, + "balance_loss_mlp": 1.03219759, + "epoch": 0.8964978205320908, + "flos": 24972998814720.0, + "grad_norm": 2.502367907177224, + "language_loss": 0.63351315, + "learning_rate": 1.1126604550242502e-07, + "loss": 0.65475088, + "num_input_tokens_seen": 321596205, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 14911, + "time_per_iteration": 2.48689866065979 + }, + { + "auxiliary_loss_clip": 0.01102037, + "auxiliary_loss_mlp": 0.01027549, + "balance_loss_clip": 1.01584291, + "balance_loss_mlp": 1.03563142, + "epoch": 0.8965579437847587, + "flos": 19172025273600.0, + "grad_norm": 1.9938612971873675, + "language_loss": 0.74839032, + "learning_rate": 1.111379898520437e-07, + "loss": 0.76968622, + "num_input_tokens_seen": 321614800, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 14912, + "time_per_iteration": 2.4474406242370605 + }, + { + "auxiliary_loss_clip": 0.01099856, + "auxiliary_loss_mlp": 0.01030977, + "balance_loss_clip": 1.01930642, + "balance_loss_mlp": 1.03326905, + "epoch": 0.8966180670374267, + "flos": 24276690691200.0, + "grad_norm": 1.7165060291362908, + "language_loss": 0.81594461, + "learning_rate": 1.1101000582749876e-07, + "loss": 0.83725297, + "num_input_tokens_seen": 321633445, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14913, + "time_per_iteration": 2.4972469806671143 + }, + { + "auxiliary_loss_clip": 0.01103057, + "auxiliary_loss_mlp": 0.01033203, + "balance_loss_clip": 1.0206089, + "balance_loss_mlp": 1.03490567, + "epoch": 0.8966781902900947, + "flos": 13553190622080.0, + "grad_norm": 2.7571386064915555, + "language_loss": 0.61551863, + "learning_rate": 1.1088209343364407e-07, + "loss": 0.63688123, + "num_input_tokens_seen": 321650890, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 14914, + "time_per_iteration": 2.4439823627471924 + }, + { + "auxiliary_loss_clip": 0.01021938, + "auxiliary_loss_mlp": 0.01003898, + "balance_loss_clip": 1.00294387, + "balance_loss_mlp": 1.00187731, + "epoch": 0.8967383135427627, + "flos": 65066114223360.0, + "grad_norm": 0.74229947958434, + "language_loss": 0.55134475, + "learning_rate": 1.1075425267532956e-07, + "loss": 0.57160312, + "num_input_tokens_seen": 321710960, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 14915, + "time_per_iteration": 3.0520334243774414 + }, + { + "auxiliary_loss_clip": 0.0109578, + "auxiliary_loss_mlp": 0.01029121, + "balance_loss_clip": 1.0185411, + "balance_loss_mlp": 1.03272772, + "epoch": 0.8967984367954306, + "flos": 29713027317120.0, + "grad_norm": 1.557041844193811, + "language_loss": 0.71559089, + "learning_rate": 1.1062648355740289e-07, + "loss": 0.73683989, + "num_input_tokens_seen": 321733290, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.62890625, + "step": 14916, + "time_per_iteration": 2.5423061847686768 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.01840854, + "balance_loss_mlp": 1.03454828, + "epoch": 0.8968585600480986, + "flos": 25702488126720.0, + "grad_norm": 1.5928608101669224, + "language_loss": 0.77743876, + "learning_rate": 1.1049878608470931e-07, + "loss": 0.79873246, + "num_input_tokens_seen": 321753120, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 14917, + "time_per_iteration": 2.4978179931640625 + }, + { + "auxiliary_loss_clip": 0.01104708, + "auxiliary_loss_mlp": 0.01035154, + "balance_loss_clip": 1.02280986, + "balance_loss_mlp": 1.03608799, + "epoch": 0.8969186833007665, + "flos": 30044698525440.0, + "grad_norm": 1.9323719767216765, + "language_loss": 0.68000007, + "learning_rate": 1.1037116026209137e-07, + "loss": 0.70139873, + "num_input_tokens_seen": 321772840, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6875, + "step": 14918, + "time_per_iteration": 2.5807528495788574 + }, + { + "auxiliary_loss_clip": 0.01099957, + "auxiliary_loss_mlp": 0.01027656, + "balance_loss_clip": 1.01676011, + "balance_loss_mlp": 1.03353751, + "epoch": 0.8969788065534345, + "flos": 22818143030400.0, + "grad_norm": 1.7530397429371827, + "language_loss": 0.83640873, + "learning_rate": 1.102436060943881e-07, + "loss": 0.85768479, + "num_input_tokens_seen": 321791020, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 14919, + "time_per_iteration": 2.4691712856292725 + }, + { + "auxiliary_loss_clip": 0.01100826, + "auxiliary_loss_mlp": 0.01028742, + "balance_loss_clip": 1.01639748, + "balance_loss_mlp": 1.03384709, + "epoch": 0.8970389298061026, + "flos": 13261488272640.0, + "grad_norm": 2.1041581456995018, + "language_loss": 0.71935117, + "learning_rate": 1.1011612358643696e-07, + "loss": 0.74064684, + "num_input_tokens_seen": 321810075, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 14920, + "time_per_iteration": 2.450545072555542 + }, + { + "auxiliary_loss_clip": 0.0109998, + "auxiliary_loss_mlp": 0.01029714, + "balance_loss_clip": 1.01801395, + "balance_loss_mlp": 1.03451681, + "epoch": 0.8970990530587705, + "flos": 10266071345280.0, + "grad_norm": 2.364040496753844, + "language_loss": 0.90711236, + "learning_rate": 1.0998871274307164e-07, + "loss": 0.92840934, + "num_input_tokens_seen": 321822635, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 14921, + "time_per_iteration": 2.416222095489502 + }, + { + "auxiliary_loss_clip": 0.01103175, + "auxiliary_loss_mlp": 0.01028402, + "balance_loss_clip": 1.01680899, + "balance_loss_mlp": 1.03518486, + "epoch": 0.8971591763114385, + "flos": 20302708567680.0, + "grad_norm": 1.7934445633449958, + "language_loss": 0.73719668, + "learning_rate": 1.0986137356912384e-07, + "loss": 0.7585125, + "num_input_tokens_seen": 321841130, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 14922, + "time_per_iteration": 2.4588661193847656 + }, + { + "auxiliary_loss_clip": 0.01097034, + "auxiliary_loss_mlp": 0.01029912, + "balance_loss_clip": 1.01772904, + "balance_loss_mlp": 1.03221571, + "epoch": 0.8972192995641064, + "flos": 23257043314560.0, + "grad_norm": 2.0889474548738995, + "language_loss": 0.70069325, + "learning_rate": 1.097341060694219e-07, + "loss": 0.72196275, + "num_input_tokens_seen": 321859855, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6484375, + "step": 14923, + "time_per_iteration": 2.453202724456787 + }, + { + "auxiliary_loss_clip": 0.01100903, + "auxiliary_loss_mlp": 0.01027393, + "balance_loss_clip": 1.01448882, + "balance_loss_mlp": 1.03415108, + "epoch": 0.8972794228167744, + "flos": 18369601395840.0, + "grad_norm": 2.37761241543336, + "language_loss": 0.69968379, + "learning_rate": 1.0960691024879221e-07, + "loss": 0.72096676, + "num_input_tokens_seen": 321877990, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66796875, + "step": 14924, + "time_per_iteration": 2.4540958404541016 + }, + { + "auxiliary_loss_clip": 0.01097287, + "auxiliary_loss_mlp": 0.01029877, + "balance_loss_clip": 1.01961923, + "balance_loss_mlp": 1.03243768, + "epoch": 0.8973395460694423, + "flos": 23952058548480.0, + "grad_norm": 1.797667243382315, + "language_loss": 0.72386622, + "learning_rate": 1.0947978611205844e-07, + "loss": 0.74513781, + "num_input_tokens_seen": 321898120, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6484375, + "step": 14925, + "time_per_iteration": 2.484833240509033 + }, + { + "auxiliary_loss_clip": 0.01103822, + "auxiliary_loss_mlp": 0.01027799, + "balance_loss_clip": 1.01577675, + "balance_loss_mlp": 1.03691697, + "epoch": 0.8973996693221103, + "flos": 24970843998720.0, + "grad_norm": 1.7702759423239505, + "language_loss": 0.82245016, + "learning_rate": 1.0935273366404008e-07, + "loss": 0.84376639, + "num_input_tokens_seen": 321918140, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 14926, + "time_per_iteration": 2.492662191390991 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01027525, + "balance_loss_clip": 1.01665902, + "balance_loss_mlp": 1.03260446, + "epoch": 0.8974597925747783, + "flos": 25738937452800.0, + "grad_norm": 1.6352336400713978, + "language_loss": 0.79072952, + "learning_rate": 1.092257529095555e-07, + "loss": 0.81198144, + "num_input_tokens_seen": 321938580, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 14927, + "time_per_iteration": 2.4794511795043945 + }, + { + "auxiliary_loss_clip": 0.01097797, + "auxiliary_loss_mlp": 0.01026085, + "balance_loss_clip": 1.01563644, + "balance_loss_mlp": 1.03343737, + "epoch": 0.8975199158274463, + "flos": 38071918131840.0, + "grad_norm": 1.5303190462919, + "language_loss": 0.66319346, + "learning_rate": 1.0909884385341994e-07, + "loss": 0.68443227, + "num_input_tokens_seen": 321961135, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 14928, + "time_per_iteration": 3.9788811206817627 + }, + { + "auxiliary_loss_clip": 0.01102325, + "auxiliary_loss_mlp": 0.01039454, + "balance_loss_clip": 1.02505386, + "balance_loss_mlp": 1.03454542, + "epoch": 0.8975800390801142, + "flos": 25411683617280.0, + "grad_norm": 1.753595417229725, + "language_loss": 0.70549512, + "learning_rate": 1.0897200650044602e-07, + "loss": 0.72691292, + "num_input_tokens_seen": 321980945, + "router_z_loss_clip": 0.14355469, + "router_z_loss_mlp": 0.67578125, + "step": 14929, + "time_per_iteration": 2.484863519668579 + }, + { + "auxiliary_loss_clip": 0.01101124, + "auxiliary_loss_mlp": 0.01029057, + "balance_loss_clip": 1.01841164, + "balance_loss_mlp": 1.03583241, + "epoch": 0.8976401623327822, + "flos": 21759604202880.0, + "grad_norm": 1.6065357298131178, + "language_loss": 0.67851043, + "learning_rate": 1.0884524085544256e-07, + "loss": 0.69981223, + "num_input_tokens_seen": 322000350, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65234375, + "step": 14930, + "time_per_iteration": 3.8712880611419678 + }, + { + "auxiliary_loss_clip": 0.01096092, + "auxiliary_loss_mlp": 0.0103007, + "balance_loss_clip": 1.01850104, + "balance_loss_mlp": 1.03163958, + "epoch": 0.8977002855854501, + "flos": 13845323934720.0, + "grad_norm": 1.6896064423054011, + "language_loss": 0.7473526, + "learning_rate": 1.0871854692321769e-07, + "loss": 0.76861417, + "num_input_tokens_seen": 322018980, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 14931, + "time_per_iteration": 3.825070858001709 + }, + { + "auxiliary_loss_clip": 0.01099993, + "auxiliary_loss_mlp": 0.01025398, + "balance_loss_clip": 1.01441276, + "balance_loss_mlp": 1.035815, + "epoch": 0.8977604088381181, + "flos": 19427529692160.0, + "grad_norm": 2.2133845278517925, + "language_loss": 0.63313723, + "learning_rate": 1.0859192470857492e-07, + "loss": 0.65439111, + "num_input_tokens_seen": 322037675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 14932, + "time_per_iteration": 2.4451212882995605 + }, + { + "auxiliary_loss_clip": 0.01095296, + "auxiliary_loss_mlp": 0.01026455, + "balance_loss_clip": 1.01614976, + "balance_loss_mlp": 1.03391647, + "epoch": 0.8978205320907862, + "flos": 22742083981440.0, + "grad_norm": 1.5636902244201198, + "language_loss": 0.71594745, + "learning_rate": 1.0846537421631552e-07, + "loss": 0.73716497, + "num_input_tokens_seen": 322055130, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.609375, + "step": 14933, + "time_per_iteration": 2.44564151763916 + }, + { + "auxiliary_loss_clip": 0.01100715, + "auxiliary_loss_mlp": 0.01027865, + "balance_loss_clip": 1.01600361, + "balance_loss_mlp": 1.03398991, + "epoch": 0.8978806553434541, + "flos": 21360529123200.0, + "grad_norm": 1.4921679700135908, + "language_loss": 0.74557078, + "learning_rate": 1.0833889545123898e-07, + "loss": 0.76685655, + "num_input_tokens_seen": 322074850, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 14934, + "time_per_iteration": 2.472975015640259 + }, + { + "auxiliary_loss_clip": 0.01098138, + "auxiliary_loss_mlp": 0.01026091, + "balance_loss_clip": 1.01478958, + "balance_loss_mlp": 1.03404582, + "epoch": 0.8979407785961221, + "flos": 20924178704640.0, + "grad_norm": 1.7760053421270396, + "language_loss": 0.60305613, + "learning_rate": 1.0821248841814123e-07, + "loss": 0.62429839, + "num_input_tokens_seen": 322093315, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 14935, + "time_per_iteration": 3.8992903232574463 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01025165, + "balance_loss_clip": 1.01354182, + "balance_loss_mlp": 1.0332725, + "epoch": 0.89800090184879, + "flos": 25228934196480.0, + "grad_norm": 2.01940463607824, + "language_loss": 0.76901841, + "learning_rate": 1.0808615312181512e-07, + "loss": 0.79023731, + "num_input_tokens_seen": 322112555, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 14936, + "time_per_iteration": 2.510512351989746 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01029658, + "balance_loss_clip": 1.01873279, + "balance_loss_mlp": 1.03417063, + "epoch": 0.898061025101458, + "flos": 22562674525440.0, + "grad_norm": 1.7165054154452661, + "language_loss": 0.7398392, + "learning_rate": 1.0795988956705193e-07, + "loss": 0.76112556, + "num_input_tokens_seen": 322130440, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 14937, + "time_per_iteration": 2.4757297039031982 + }, + { + "auxiliary_loss_clip": 0.0102209, + "auxiliary_loss_mlp": 0.01004521, + "balance_loss_clip": 1.00356126, + "balance_loss_mlp": 1.00203967, + "epoch": 0.8981211483541259, + "flos": 56192551384320.0, + "grad_norm": 0.8567673559760905, + "language_loss": 0.63504851, + "learning_rate": 1.0783369775863915e-07, + "loss": 0.65531462, + "num_input_tokens_seen": 322187295, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 14938, + "time_per_iteration": 2.9755895137786865 + }, + { + "auxiliary_loss_clip": 0.01098564, + "auxiliary_loss_mlp": 0.01029561, + "balance_loss_clip": 1.01804495, + "balance_loss_mlp": 1.03520513, + "epoch": 0.898181271606794, + "flos": 16392718523520.0, + "grad_norm": 2.285609419822284, + "language_loss": 0.80244672, + "learning_rate": 1.0770757770136251e-07, + "loss": 0.82372797, + "num_input_tokens_seen": 322202965, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 14939, + "time_per_iteration": 2.4215035438537598 + }, + { + "auxiliary_loss_clip": 0.0102229, + "auxiliary_loss_mlp": 0.01001638, + "balance_loss_clip": 1.00065446, + "balance_loss_mlp": 1.0022645, + "epoch": 0.8982413948594619, + "flos": 63440259989760.0, + "grad_norm": 0.720038785949184, + "language_loss": 0.52935207, + "learning_rate": 1.0758152940000375e-07, + "loss": 0.5495913, + "num_input_tokens_seen": 322269490, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20019531, + "step": 14940, + "time_per_iteration": 3.20149302482605 + }, + { + "auxiliary_loss_clip": 0.01098494, + "auxiliary_loss_mlp": 0.0103145, + "balance_loss_clip": 1.0192368, + "balance_loss_mlp": 1.03297186, + "epoch": 0.8983015181121299, + "flos": 21835340029440.0, + "grad_norm": 1.693993393171052, + "language_loss": 0.77516085, + "learning_rate": 1.0745555285934327e-07, + "loss": 0.79646027, + "num_input_tokens_seen": 322288060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14941, + "time_per_iteration": 2.434145212173462 + }, + { + "auxiliary_loss_clip": 0.01100191, + "auxiliary_loss_mlp": 0.01030832, + "balance_loss_clip": 1.01858878, + "balance_loss_mlp": 1.03449476, + "epoch": 0.8983616413647978, + "flos": 28949961767040.0, + "grad_norm": 1.878231548634812, + "language_loss": 0.73163295, + "learning_rate": 1.0732964808415834e-07, + "loss": 0.75294316, + "num_input_tokens_seen": 322307930, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65625, + "step": 14942, + "time_per_iteration": 2.5305984020233154 + }, + { + "auxiliary_loss_clip": 0.01101917, + "auxiliary_loss_mlp": 0.01032244, + "balance_loss_clip": 1.02054954, + "balance_loss_mlp": 1.03539038, + "epoch": 0.8984217646174658, + "flos": 17785083375360.0, + "grad_norm": 2.259546174056382, + "language_loss": 0.79731816, + "learning_rate": 1.0720381507922205e-07, + "loss": 0.81865978, + "num_input_tokens_seen": 322326155, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 14943, + "time_per_iteration": 2.404646396636963 + }, + { + "auxiliary_loss_clip": 0.01101262, + "auxiliary_loss_mlp": 0.0103413, + "balance_loss_clip": 1.02152395, + "balance_loss_mlp": 1.03429723, + "epoch": 0.8984818878701337, + "flos": 23404528558080.0, + "grad_norm": 1.4369536003517842, + "language_loss": 0.70990932, + "learning_rate": 1.0707805384930701e-07, + "loss": 0.73126322, + "num_input_tokens_seen": 322345850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.671875, + "step": 14944, + "time_per_iteration": 2.499791383743286 + }, + { + "auxiliary_loss_clip": 0.01105021, + "auxiliary_loss_mlp": 0.0103126, + "balance_loss_clip": 1.01886237, + "balance_loss_mlp": 1.03674543, + "epoch": 0.8985420111228017, + "flos": 22346061557760.0, + "grad_norm": 1.8945410483938205, + "language_loss": 0.75043732, + "learning_rate": 1.0695236439918187e-07, + "loss": 0.77180016, + "num_input_tokens_seen": 322364715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 14945, + "time_per_iteration": 2.4378676414489746 + }, + { + "auxiliary_loss_clip": 0.0110561, + "auxiliary_loss_mlp": 0.01033207, + "balance_loss_clip": 1.02051139, + "balance_loss_mlp": 1.03481817, + "epoch": 0.8986021343754698, + "flos": 21392776558080.0, + "grad_norm": 1.87755765658424, + "language_loss": 0.73487711, + "learning_rate": 1.0682674673361302e-07, + "loss": 0.75626534, + "num_input_tokens_seen": 322383570, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.70703125, + "step": 14946, + "time_per_iteration": 2.4829940795898438 + }, + { + "auxiliary_loss_clip": 0.01098299, + "auxiliary_loss_mlp": 0.01023691, + "balance_loss_clip": 1.0120796, + "balance_loss_mlp": 1.03329158, + "epoch": 0.8986622576281377, + "flos": 21325372686720.0, + "grad_norm": 2.0014358905556855, + "language_loss": 0.64285457, + "learning_rate": 1.0670120085736334e-07, + "loss": 0.66407442, + "num_input_tokens_seen": 322401375, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 14947, + "time_per_iteration": 2.438631296157837 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01032773, + "balance_loss_clip": 1.02115035, + "balance_loss_mlp": 1.03433037, + "epoch": 0.8987223808808057, + "flos": 23988292392960.0, + "grad_norm": 3.1927043706660894, + "language_loss": 0.69610405, + "learning_rate": 1.0657572677519411e-07, + "loss": 0.71742362, + "num_input_tokens_seen": 322421890, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 14948, + "time_per_iteration": 2.4895057678222656 + }, + { + "auxiliary_loss_clip": 0.01100214, + "auxiliary_loss_mlp": 0.01029544, + "balance_loss_clip": 1.01798058, + "balance_loss_mlp": 1.03443384, + "epoch": 0.8987825041334736, + "flos": 41500956044160.0, + "grad_norm": 1.7633000851401956, + "language_loss": 0.74272358, + "learning_rate": 1.0645032449186309e-07, + "loss": 0.76402116, + "num_input_tokens_seen": 322445730, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 14949, + "time_per_iteration": 2.608767509460449 + }, + { + "auxiliary_loss_clip": 0.01101359, + "auxiliary_loss_mlp": 0.0103208, + "balance_loss_clip": 1.01912796, + "balance_loss_mlp": 1.03402793, + "epoch": 0.8988426273861416, + "flos": 27564276844800.0, + "grad_norm": 1.6829290861590958, + "language_loss": 0.75664008, + "learning_rate": 1.0632499401212513e-07, + "loss": 0.77797437, + "num_input_tokens_seen": 322464595, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 14950, + "time_per_iteration": 2.529982328414917 + }, + { + "auxiliary_loss_clip": 0.01100934, + "auxiliary_loss_mlp": 0.01029262, + "balance_loss_clip": 1.01849723, + "balance_loss_mlp": 1.03643644, + "epoch": 0.8989027506388095, + "flos": 17092653920640.0, + "grad_norm": 1.5718302114690097, + "language_loss": 0.66472352, + "learning_rate": 1.0619973534073334e-07, + "loss": 0.6860255, + "num_input_tokens_seen": 322483305, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 14951, + "time_per_iteration": 2.4156932830810547 + }, + { + "auxiliary_loss_clip": 0.01102118, + "auxiliary_loss_mlp": 0.01025294, + "balance_loss_clip": 1.0141542, + "balance_loss_mlp": 1.03302848, + "epoch": 0.8989628738914776, + "flos": 20555124416640.0, + "grad_norm": 5.045918966257551, + "language_loss": 0.73914707, + "learning_rate": 1.0607454848243769e-07, + "loss": 0.76042116, + "num_input_tokens_seen": 322501905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.69140625, + "step": 14952, + "time_per_iteration": 2.4806807041168213 + }, + { + "auxiliary_loss_clip": 0.01099151, + "auxiliary_loss_mlp": 0.01031976, + "balance_loss_clip": 1.0206635, + "balance_loss_mlp": 1.03456223, + "epoch": 0.8990229971441455, + "flos": 16251087196800.0, + "grad_norm": 2.152704477585915, + "language_loss": 0.56480038, + "learning_rate": 1.0594943344198481e-07, + "loss": 0.58611166, + "num_input_tokens_seen": 322518135, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 14953, + "time_per_iteration": 2.399141788482666 + }, + { + "auxiliary_loss_clip": 0.01099987, + "auxiliary_loss_mlp": 0.01033801, + "balance_loss_clip": 1.02200556, + "balance_loss_mlp": 1.03480887, + "epoch": 0.8990831203968135, + "flos": 21981316901760.0, + "grad_norm": 2.012480548312481, + "language_loss": 0.81600904, + "learning_rate": 1.0582439022411915e-07, + "loss": 0.83734691, + "num_input_tokens_seen": 322537905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 14954, + "time_per_iteration": 2.4930219650268555 + }, + { + "auxiliary_loss_clip": 0.01098756, + "auxiliary_loss_mlp": 0.01030553, + "balance_loss_clip": 1.01884711, + "balance_loss_mlp": 1.03465247, + "epoch": 0.8991432436494814, + "flos": 27447171528960.0, + "grad_norm": 3.2641482155182318, + "language_loss": 0.60263079, + "learning_rate": 1.0569941883358224e-07, + "loss": 0.6239239, + "num_input_tokens_seen": 322557945, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 14955, + "time_per_iteration": 2.4798853397369385 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01028865, + "balance_loss_clip": 1.01803517, + "balance_loss_mlp": 1.03440011, + "epoch": 0.8992033669021494, + "flos": 21579835610880.0, + "grad_norm": 2.1065178857932385, + "language_loss": 0.54606581, + "learning_rate": 1.0557451927511341e-07, + "loss": 0.56733239, + "num_input_tokens_seen": 322575765, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 14956, + "time_per_iteration": 2.498508930206299 + }, + { + "auxiliary_loss_clip": 0.01099303, + "auxiliary_loss_mlp": 0.01030831, + "balance_loss_clip": 1.0192976, + "balance_loss_mlp": 1.03451133, + "epoch": 0.8992634901548173, + "flos": 28584211530240.0, + "grad_norm": 1.7585876876511954, + "language_loss": 0.7994734, + "learning_rate": 1.0544969155344863e-07, + "loss": 0.82077473, + "num_input_tokens_seen": 322595665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 14957, + "time_per_iteration": 2.4910168647766113 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01032558, + "balance_loss_clip": 1.02023137, + "balance_loss_mlp": 1.0356164, + "epoch": 0.8993236134074853, + "flos": 19867435557120.0, + "grad_norm": 1.8978719607927441, + "language_loss": 0.78686506, + "learning_rate": 1.0532493567332123e-07, + "loss": 0.80822951, + "num_input_tokens_seen": 322614755, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 14958, + "time_per_iteration": 2.5033602714538574 + }, + { + "auxiliary_loss_clip": 0.01099017, + "auxiliary_loss_mlp": 0.01026673, + "balance_loss_clip": 1.01628995, + "balance_loss_mlp": 1.03592014, + "epoch": 0.8993837366601534, + "flos": 19390649402880.0, + "grad_norm": 1.5628926324551675, + "language_loss": 0.749843, + "learning_rate": 1.0520025163946277e-07, + "loss": 0.77109987, + "num_input_tokens_seen": 322633425, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 14959, + "time_per_iteration": 2.421219825744629 + }, + { + "auxiliary_loss_clip": 0.01094435, + "auxiliary_loss_mlp": 0.01028178, + "balance_loss_clip": 1.01707387, + "balance_loss_mlp": 1.03165054, + "epoch": 0.8994438599128213, + "flos": 18551740285440.0, + "grad_norm": 1.9131887389317577, + "language_loss": 0.68210769, + "learning_rate": 1.0507563945660015e-07, + "loss": 0.7033338, + "num_input_tokens_seen": 322652065, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 14960, + "time_per_iteration": 2.4903640747070312 + }, + { + "auxiliary_loss_clip": 0.01099033, + "auxiliary_loss_mlp": 0.01026859, + "balance_loss_clip": 1.01579618, + "balance_loss_mlp": 1.03471529, + "epoch": 0.8995039831654893, + "flos": 24427587726720.0, + "grad_norm": 1.5401076588305127, + "language_loss": 0.6614114, + "learning_rate": 1.049510991294591e-07, + "loss": 0.68267035, + "num_input_tokens_seen": 322673275, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 14961, + "time_per_iteration": 2.4783365726470947 + }, + { + "auxiliary_loss_clip": 0.01095925, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01650167, + "balance_loss_mlp": 1.03261304, + "epoch": 0.8995641064181572, + "flos": 21251324799360.0, + "grad_norm": 1.6811371424318786, + "language_loss": 0.82988048, + "learning_rate": 1.0482663066276254e-07, + "loss": 0.85111082, + "num_input_tokens_seen": 322693375, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 14962, + "time_per_iteration": 2.471440553665161 + }, + { + "auxiliary_loss_clip": 0.0110458, + "auxiliary_loss_mlp": 0.01027368, + "balance_loss_clip": 1.0148747, + "balance_loss_mlp": 1.03580785, + "epoch": 0.8996242296708252, + "flos": 23513661054720.0, + "grad_norm": 2.1190623548906156, + "language_loss": 0.76490587, + "learning_rate": 1.047022340612298e-07, + "loss": 0.78622532, + "num_input_tokens_seen": 322712615, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 14963, + "time_per_iteration": 2.4583775997161865 + }, + { + "auxiliary_loss_clip": 0.01022033, + "auxiliary_loss_mlp": 0.01001036, + "balance_loss_clip": 1.00004029, + "balance_loss_mlp": 1.00195932, + "epoch": 0.8996843529234931, + "flos": 62403230430720.0, + "grad_norm": 0.8094632900613583, + "language_loss": 0.57510412, + "learning_rate": 1.0457790932957867e-07, + "loss": 0.59533477, + "num_input_tokens_seen": 322766855, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20117188, + "step": 14964, + "time_per_iteration": 2.8906238079071045 + }, + { + "auxiliary_loss_clip": 0.01107029, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.02000546, + "balance_loss_mlp": 1.03737593, + "epoch": 0.8997444761761612, + "flos": 24236829573120.0, + "grad_norm": 2.3957838042157134, + "language_loss": 0.67410362, + "learning_rate": 1.0445365647252269e-07, + "loss": 0.69550008, + "num_input_tokens_seen": 322781130, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 14965, + "time_per_iteration": 2.432751178741455 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.01032532, + "balance_loss_clip": 1.02107596, + "balance_loss_mlp": 1.03458083, + "epoch": 0.8998045994288291, + "flos": 21361103740800.0, + "grad_norm": 1.7764506802390192, + "language_loss": 0.71361762, + "learning_rate": 1.0432947549477433e-07, + "loss": 0.73496038, + "num_input_tokens_seen": 322800310, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.671875, + "step": 14966, + "time_per_iteration": 2.4522528648376465 + }, + { + "auxiliary_loss_clip": 0.01102649, + "auxiliary_loss_mlp": 0.01033604, + "balance_loss_clip": 1.0215342, + "balance_loss_mlp": 1.0366888, + "epoch": 0.8998647226814971, + "flos": 28986159697920.0, + "grad_norm": 1.6727777678710354, + "language_loss": 0.73497134, + "learning_rate": 1.0420536640104205e-07, + "loss": 0.75633389, + "num_input_tokens_seen": 322820955, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 14967, + "time_per_iteration": 2.5017104148864746 + }, + { + "auxiliary_loss_clip": 0.01099365, + "auxiliary_loss_mlp": 0.01028351, + "balance_loss_clip": 1.01702619, + "balance_loss_mlp": 1.03394258, + "epoch": 0.899924845934165, + "flos": 13625909706240.0, + "grad_norm": 1.768165327175364, + "language_loss": 0.719221, + "learning_rate": 1.040813291960323e-07, + "loss": 0.74049813, + "num_input_tokens_seen": 322838780, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 14968, + "time_per_iteration": 2.4704649448394775 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01029102, + "balance_loss_clip": 1.01798606, + "balance_loss_mlp": 1.03511333, + "epoch": 0.899984969186833, + "flos": 20882629647360.0, + "grad_norm": 2.1535153440352324, + "language_loss": 0.71085668, + "learning_rate": 1.0395736388444864e-07, + "loss": 0.73215359, + "num_input_tokens_seen": 322856710, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 14969, + "time_per_iteration": 2.4344112873077393 + }, + { + "auxiliary_loss_clip": 0.01103451, + "auxiliary_loss_mlp": 0.01028419, + "balance_loss_clip": 1.01689792, + "balance_loss_mlp": 1.03636861, + "epoch": 0.9000450924395009, + "flos": 20921808407040.0, + "grad_norm": 1.7775305510277348, + "language_loss": 0.76438725, + "learning_rate": 1.0383347047099201e-07, + "loss": 0.78570598, + "num_input_tokens_seen": 322876070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 14970, + "time_per_iteration": 3.8684945106506348 + }, + { + "auxiliary_loss_clip": 0.01101777, + "auxiliary_loss_mlp": 0.01030114, + "balance_loss_clip": 1.01909924, + "balance_loss_mlp": 1.03452706, + "epoch": 0.900105215692169, + "flos": 17165049782400.0, + "grad_norm": 1.6290735872590396, + "language_loss": 0.73082769, + "learning_rate": 1.0370964896035972e-07, + "loss": 0.7521466, + "num_input_tokens_seen": 322895095, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.671875, + "step": 14971, + "time_per_iteration": 2.413736343383789 + }, + { + "auxiliary_loss_clip": 0.01099182, + "auxiliary_loss_mlp": 0.01026185, + "balance_loss_clip": 1.01400757, + "balance_loss_mlp": 1.03376043, + "epoch": 0.900165338944837, + "flos": 19931930426880.0, + "grad_norm": 1.967175811246173, + "language_loss": 0.81928706, + "learning_rate": 1.035858993572476e-07, + "loss": 0.84054077, + "num_input_tokens_seen": 322911845, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 14972, + "time_per_iteration": 3.8709967136383057 + }, + { + "auxiliary_loss_clip": 0.01102627, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01551533, + "balance_loss_mlp": 1.03478956, + "epoch": 0.9002254621975049, + "flos": 16107085572480.0, + "grad_norm": 2.346023909533121, + "language_loss": 0.81425643, + "learning_rate": 1.0346222166634855e-07, + "loss": 0.83555251, + "num_input_tokens_seen": 322928170, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6796875, + "step": 14973, + "time_per_iteration": 3.868040084838867 + }, + { + "auxiliary_loss_clip": 0.01098554, + "auxiliary_loss_mlp": 0.01034054, + "balance_loss_clip": 1.02212131, + "balance_loss_mlp": 1.03365421, + "epoch": 0.9002855854501729, + "flos": 28476120528000.0, + "grad_norm": 1.7893132294646954, + "language_loss": 0.57785386, + "learning_rate": 1.0333861589235193e-07, + "loss": 0.59917992, + "num_input_tokens_seen": 322948165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 14974, + "time_per_iteration": 2.4945571422576904 + }, + { + "auxiliary_loss_clip": 0.01102563, + "auxiliary_loss_mlp": 0.01031866, + "balance_loss_clip": 1.02042198, + "balance_loss_mlp": 1.03682697, + "epoch": 0.9003457087028408, + "flos": 25630307746560.0, + "grad_norm": 1.6956938729099027, + "language_loss": 0.63379133, + "learning_rate": 1.0321508203994489e-07, + "loss": 0.65513563, + "num_input_tokens_seen": 322968880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 14975, + "time_per_iteration": 2.496723175048828 + }, + { + "auxiliary_loss_clip": 0.01100905, + "auxiliary_loss_mlp": 0.01029003, + "balance_loss_clip": 1.01756501, + "balance_loss_mlp": 1.03466403, + "epoch": 0.9004058319555088, + "flos": 24389414547840.0, + "grad_norm": 1.809232420196502, + "language_loss": 0.7320652, + "learning_rate": 1.0309162011381257e-07, + "loss": 0.75336432, + "num_input_tokens_seen": 322989395, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 14976, + "time_per_iteration": 3.913522243499756 + }, + { + "auxiliary_loss_clip": 0.01101866, + "auxiliary_loss_mlp": 0.01029048, + "balance_loss_clip": 1.01757395, + "balance_loss_mlp": 1.03592968, + "epoch": 0.9004659552081767, + "flos": 29059345658880.0, + "grad_norm": 1.8460911703880327, + "language_loss": 0.69739205, + "learning_rate": 1.0296823011863565e-07, + "loss": 0.71870112, + "num_input_tokens_seen": 323009060, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 14977, + "time_per_iteration": 2.4934232234954834 + }, + { + "auxiliary_loss_clip": 0.01101319, + "auxiliary_loss_mlp": 0.01034067, + "balance_loss_clip": 1.02094841, + "balance_loss_mlp": 1.03397775, + "epoch": 0.9005260784608448, + "flos": 16763855800320.0, + "grad_norm": 2.535072369203501, + "language_loss": 0.65230364, + "learning_rate": 1.0284491205909351e-07, + "loss": 0.67365754, + "num_input_tokens_seen": 323027530, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 14978, + "time_per_iteration": 2.4551024436950684 + }, + { + "auxiliary_loss_clip": 0.01105073, + "auxiliary_loss_mlp": 0.01031199, + "balance_loss_clip": 1.01883125, + "balance_loss_mlp": 1.03654051, + "epoch": 0.9005862017135127, + "flos": 20376002269440.0, + "grad_norm": 1.7787659821570612, + "language_loss": 0.78990376, + "learning_rate": 1.0272166593986286e-07, + "loss": 0.81126642, + "num_input_tokens_seen": 323045370, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 14979, + "time_per_iteration": 2.487516403198242 + }, + { + "auxiliary_loss_clip": 0.01021612, + "auxiliary_loss_mlp": 0.01001783, + "balance_loss_clip": 1.0008111, + "balance_loss_mlp": 1.00152564, + "epoch": 0.9006463249661807, + "flos": 67580255796480.0, + "grad_norm": 0.7257419902012564, + "language_loss": 0.53625673, + "learning_rate": 1.0259849176561642e-07, + "loss": 0.5564906, + "num_input_tokens_seen": 323105660, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20117188, + "step": 14980, + "time_per_iteration": 3.094318389892578 + }, + { + "auxiliary_loss_clip": 0.011041, + "auxiliary_loss_mlp": 0.01036054, + "balance_loss_clip": 1.02391291, + "balance_loss_mlp": 1.03616858, + "epoch": 0.9007064482188486, + "flos": 28293335193600.0, + "grad_norm": 1.8547665659485781, + "language_loss": 0.82101512, + "learning_rate": 1.0247538954102553e-07, + "loss": 0.84241676, + "num_input_tokens_seen": 323126365, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 14981, + "time_per_iteration": 2.4936017990112305 + }, + { + "auxiliary_loss_clip": 0.01097983, + "auxiliary_loss_mlp": 0.01031157, + "balance_loss_clip": 1.01992702, + "balance_loss_mlp": 1.03415704, + "epoch": 0.9007665714715166, + "flos": 21616320850560.0, + "grad_norm": 2.7203563856425985, + "language_loss": 0.81460178, + "learning_rate": 1.0235235927075758e-07, + "loss": 0.83589315, + "num_input_tokens_seen": 323145655, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.63671875, + "step": 14982, + "time_per_iteration": 2.4565653800964355 + }, + { + "auxiliary_loss_clip": 0.01095335, + "auxiliary_loss_mlp": 0.01029882, + "balance_loss_clip": 1.0194211, + "balance_loss_mlp": 1.03379464, + "epoch": 0.9008266947241845, + "flos": 26541864120960.0, + "grad_norm": 1.8323465775845416, + "language_loss": 0.71544576, + "learning_rate": 1.0222940095947885e-07, + "loss": 0.73669791, + "num_input_tokens_seen": 323164540, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6171875, + "step": 14983, + "time_per_iteration": 2.4885804653167725 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.0102521, + "balance_loss_clip": 1.0146662, + "balance_loss_mlp": 1.03483844, + "epoch": 0.9008868179768525, + "flos": 23110527738240.0, + "grad_norm": 1.3467294069488691, + "language_loss": 0.75163013, + "learning_rate": 1.0210651461185115e-07, + "loss": 0.77286458, + "num_input_tokens_seen": 323186960, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 14984, + "time_per_iteration": 2.49717116355896 + }, + { + "auxiliary_loss_clip": 0.01095786, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.02003372, + "balance_loss_mlp": 1.03259993, + "epoch": 0.9009469412295206, + "flos": 19060809788160.0, + "grad_norm": 1.3973917605872446, + "language_loss": 0.70561159, + "learning_rate": 1.0198370023253456e-07, + "loss": 0.72688001, + "num_input_tokens_seen": 323206135, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 14985, + "time_per_iteration": 2.4311397075653076 + }, + { + "auxiliary_loss_clip": 0.01102011, + "auxiliary_loss_mlp": 0.01030572, + "balance_loss_clip": 1.01862133, + "balance_loss_mlp": 1.03354049, + "epoch": 0.9010070644821885, + "flos": 23222281927680.0, + "grad_norm": 1.8726650085306151, + "language_loss": 0.70216691, + "learning_rate": 1.0186095782618643e-07, + "loss": 0.7234928, + "num_input_tokens_seen": 323225980, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.68359375, + "step": 14986, + "time_per_iteration": 2.4868855476379395 + }, + { + "auxiliary_loss_clip": 0.0110024, + "auxiliary_loss_mlp": 0.01030525, + "balance_loss_clip": 1.01909316, + "balance_loss_mlp": 1.03285623, + "epoch": 0.9010671877348565, + "flos": 17384823146880.0, + "grad_norm": 1.6031253163932484, + "language_loss": 0.76845485, + "learning_rate": 1.0173828739746104e-07, + "loss": 0.7897625, + "num_input_tokens_seen": 323243700, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 14987, + "time_per_iteration": 2.440084457397461 + }, + { + "auxiliary_loss_clip": 0.01098211, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.0194118, + "balance_loss_mlp": 1.03421617, + "epoch": 0.9011273109875244, + "flos": 21908166854400.0, + "grad_norm": 2.1006548743325673, + "language_loss": 0.74064976, + "learning_rate": 1.0161568895100981e-07, + "loss": 0.76193821, + "num_input_tokens_seen": 323261535, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 14988, + "time_per_iteration": 2.4963526725769043 + }, + { + "auxiliary_loss_clip": 0.01104597, + "auxiliary_loss_mlp": 0.01031072, + "balance_loss_clip": 1.01829827, + "balance_loss_mlp": 1.0362258, + "epoch": 0.9011874342401924, + "flos": 24060831909120.0, + "grad_norm": 1.8626335922164043, + "language_loss": 0.69308305, + "learning_rate": 1.0149316249148188e-07, + "loss": 0.71443975, + "num_input_tokens_seen": 323281855, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 14989, + "time_per_iteration": 2.4539735317230225 + }, + { + "auxiliary_loss_clip": 0.01100876, + "auxiliary_loss_mlp": 0.01026062, + "balance_loss_clip": 1.01488054, + "balance_loss_mlp": 1.03533888, + "epoch": 0.9012475574928603, + "flos": 16758791982720.0, + "grad_norm": 1.8473333523557436, + "language_loss": 0.79848897, + "learning_rate": 1.0137070802352376e-07, + "loss": 0.8197583, + "num_input_tokens_seen": 323299505, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 14990, + "time_per_iteration": 2.4267704486846924 + }, + { + "auxiliary_loss_clip": 0.01103975, + "auxiliary_loss_mlp": 0.01027054, + "balance_loss_clip": 1.01531839, + "balance_loss_mlp": 1.03590596, + "epoch": 0.9013076807455284, + "flos": 19971109186560.0, + "grad_norm": 2.32674019018607, + "language_loss": 0.77580243, + "learning_rate": 1.0124832555177842e-07, + "loss": 0.79711276, + "num_input_tokens_seen": 323318365, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6796875, + "step": 14991, + "time_per_iteration": 2.4224207401275635 + }, + { + "auxiliary_loss_clip": 0.01021773, + "auxiliary_loss_mlp": 0.01002102, + "balance_loss_clip": 1.00102293, + "balance_loss_mlp": 1.00179267, + "epoch": 0.9013678039981963, + "flos": 65180274624000.0, + "grad_norm": 0.7778537052322216, + "language_loss": 0.60237074, + "learning_rate": 1.0112601508088726e-07, + "loss": 0.6226095, + "num_input_tokens_seen": 323371835, + "router_z_loss_clip": 0.01080322, + "router_z_loss_mlp": 0.19921875, + "step": 14992, + "time_per_iteration": 2.954866886138916 + }, + { + "auxiliary_loss_clip": 0.01098898, + "auxiliary_loss_mlp": 0.01024572, + "balance_loss_clip": 1.01278269, + "balance_loss_mlp": 1.03394318, + "epoch": 0.9014279272508643, + "flos": 20521224956160.0, + "grad_norm": 1.962840452399395, + "language_loss": 0.82822621, + "learning_rate": 1.0100377661548764e-07, + "loss": 0.8494609, + "num_input_tokens_seen": 323388495, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 14993, + "time_per_iteration": 2.443319797515869 + }, + { + "auxiliary_loss_clip": 0.01099362, + "auxiliary_loss_mlp": 0.01030918, + "balance_loss_clip": 1.01925957, + "balance_loss_mlp": 1.03348362, + "epoch": 0.9014880505035322, + "flos": 17309051406720.0, + "grad_norm": 1.9906429893811004, + "language_loss": 0.73098803, + "learning_rate": 1.0088161016021502e-07, + "loss": 0.75229084, + "num_input_tokens_seen": 323405280, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 14994, + "time_per_iteration": 2.410883665084839 + }, + { + "auxiliary_loss_clip": 0.01096276, + "auxiliary_loss_mlp": 0.01027611, + "balance_loss_clip": 1.01672149, + "balance_loss_mlp": 1.03312874, + "epoch": 0.9015481737562002, + "flos": 28402862739840.0, + "grad_norm": 1.724933842876544, + "language_loss": 0.64662391, + "learning_rate": 1.0075951571970187e-07, + "loss": 0.66786277, + "num_input_tokens_seen": 323425310, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 14995, + "time_per_iteration": 2.489705801010132 + }, + { + "auxiliary_loss_clip": 0.01099565, + "auxiliary_loss_mlp": 0.01029987, + "balance_loss_clip": 1.01805377, + "balance_loss_mlp": 1.03272486, + "epoch": 0.9016082970088681, + "flos": 29752672953600.0, + "grad_norm": 1.5339343417289035, + "language_loss": 0.66576183, + "learning_rate": 1.0063749329857873e-07, + "loss": 0.68705738, + "num_input_tokens_seen": 323447805, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 14996, + "time_per_iteration": 2.5253312587738037 + }, + { + "auxiliary_loss_clip": 0.01097858, + "auxiliary_loss_mlp": 0.01029991, + "balance_loss_clip": 1.01918435, + "balance_loss_mlp": 1.03384423, + "epoch": 0.9016684202615362, + "flos": 23513230091520.0, + "grad_norm": 1.737794019911755, + "language_loss": 0.6594162, + "learning_rate": 1.0051554290147168e-07, + "loss": 0.6806947, + "num_input_tokens_seen": 323467150, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 14997, + "time_per_iteration": 2.457235097885132 + }, + { + "auxiliary_loss_clip": 0.01099415, + "auxiliary_loss_mlp": 0.01033041, + "balance_loss_clip": 1.02148402, + "balance_loss_mlp": 1.03402448, + "epoch": 0.9017285435142042, + "flos": 16979247705600.0, + "grad_norm": 1.924410935543244, + "language_loss": 0.77711892, + "learning_rate": 1.0039366453300613e-07, + "loss": 0.79844344, + "num_input_tokens_seen": 323484250, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 14998, + "time_per_iteration": 2.4296000003814697 + }, + { + "auxiliary_loss_clip": 0.01099155, + "auxiliary_loss_mlp": 0.01027859, + "balance_loss_clip": 1.01644444, + "balance_loss_mlp": 1.03295517, + "epoch": 0.9017886667668721, + "flos": 21393351175680.0, + "grad_norm": 1.7165767621933876, + "language_loss": 0.74958098, + "learning_rate": 1.0027185819780281e-07, + "loss": 0.77085114, + "num_input_tokens_seen": 323502910, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 14999, + "time_per_iteration": 2.4602272510528564 + }, + { + "auxiliary_loss_clip": 0.01098364, + "auxiliary_loss_mlp": 0.01028562, + "balance_loss_clip": 1.01653385, + "balance_loss_mlp": 1.033705, + "epoch": 0.9018487900195401, + "flos": 20996574566400.0, + "grad_norm": 2.1701142203140855, + "language_loss": 0.7583977, + "learning_rate": 1.0015012390048117e-07, + "loss": 0.77966702, + "num_input_tokens_seen": 323521820, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 15000, + "time_per_iteration": 2.4680068492889404 + }, + { + "auxiliary_loss_clip": 0.01096199, + "auxiliary_loss_mlp": 0.01023147, + "balance_loss_clip": 1.01187539, + "balance_loss_mlp": 1.03231204, + "epoch": 0.901908913272208, + "flos": 53358443458560.0, + "grad_norm": 2.156487419883931, + "language_loss": 0.8029359, + "learning_rate": 1.0002846164565704e-07, + "loss": 0.82412934, + "num_input_tokens_seen": 323543200, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 15001, + "time_per_iteration": 2.746706485748291 + }, + { + "auxiliary_loss_clip": 0.01099004, + "auxiliary_loss_mlp": 0.01026922, + "balance_loss_clip": 1.01602638, + "balance_loss_mlp": 1.03517938, + "epoch": 0.901969036524876, + "flos": 22089838867200.0, + "grad_norm": 1.5066901043573788, + "language_loss": 0.78355694, + "learning_rate": 9.990687143794407e-08, + "loss": 0.80481625, + "num_input_tokens_seen": 323563075, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 15002, + "time_per_iteration": 2.4581403732299805 + }, + { + "auxiliary_loss_clip": 0.01101993, + "auxiliary_loss_mlp": 0.01031757, + "balance_loss_clip": 1.0189364, + "balance_loss_mlp": 1.03616595, + "epoch": 0.9020291597775439, + "flos": 23835025059840.0, + "grad_norm": 1.9922520479802717, + "language_loss": 0.68118757, + "learning_rate": 9.978535328195347e-08, + "loss": 0.70252508, + "num_input_tokens_seen": 323579065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65625, + "step": 15003, + "time_per_iteration": 2.4612655639648438 + }, + { + "auxiliary_loss_clip": 0.01101641, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02349198, + "balance_loss_mlp": 1.03430641, + "epoch": 0.902089283030212, + "flos": 18326005263360.0, + "grad_norm": 2.2792574133264916, + "language_loss": 0.8624227, + "learning_rate": 9.9663907182292e-08, + "loss": 0.88379425, + "num_input_tokens_seen": 323594835, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15004, + "time_per_iteration": 2.415255308151245 + }, + { + "auxiliary_loss_clip": 0.01100861, + "auxiliary_loss_mlp": 0.01029911, + "balance_loss_clip": 1.01819897, + "balance_loss_mlp": 1.03495574, + "epoch": 0.9021494062828799, + "flos": 24170359455360.0, + "grad_norm": 2.707915413244076, + "language_loss": 0.72780323, + "learning_rate": 9.954253314356575e-08, + "loss": 0.74911094, + "num_input_tokens_seen": 323611475, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 15005, + "time_per_iteration": 2.4494197368621826 + }, + { + "auxiliary_loss_clip": 0.01100686, + "auxiliary_loss_mlp": 0.01027584, + "balance_loss_clip": 1.01526415, + "balance_loss_mlp": 1.03250015, + "epoch": 0.9022095295355479, + "flos": 21616859554560.0, + "grad_norm": 1.9208568535467845, + "language_loss": 0.71333838, + "learning_rate": 9.942123117037748e-08, + "loss": 0.73462105, + "num_input_tokens_seen": 323629730, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 15006, + "time_per_iteration": 2.4571921825408936 + }, + { + "auxiliary_loss_clip": 0.01102333, + "auxiliary_loss_mlp": 0.01027269, + "balance_loss_clip": 1.01600361, + "balance_loss_mlp": 1.03493929, + "epoch": 0.9022696527882158, + "flos": 18726229578240.0, + "grad_norm": 1.8905972356215264, + "language_loss": 0.84425151, + "learning_rate": 9.930000126732618e-08, + "loss": 0.8655476, + "num_input_tokens_seen": 323646000, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 15007, + "time_per_iteration": 2.40256404876709 + }, + { + "auxiliary_loss_clip": 0.01098363, + "auxiliary_loss_mlp": 0.01028058, + "balance_loss_clip": 1.01581562, + "balance_loss_mlp": 1.03405857, + "epoch": 0.9023297760408838, + "flos": 26761206522240.0, + "grad_norm": 1.665209400281313, + "language_loss": 0.78456664, + "learning_rate": 9.917884343900928e-08, + "loss": 0.8058309, + "num_input_tokens_seen": 323667250, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.64453125, + "step": 15008, + "time_per_iteration": 2.5052990913391113 + }, + { + "auxiliary_loss_clip": 0.01094747, + "auxiliary_loss_mlp": 0.01031954, + "balance_loss_clip": 1.02086759, + "balance_loss_mlp": 1.03389013, + "epoch": 0.9023898992935517, + "flos": 20522553759360.0, + "grad_norm": 1.7711717743848543, + "language_loss": 0.73629749, + "learning_rate": 9.905775769002156e-08, + "loss": 0.75756449, + "num_input_tokens_seen": 323687150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.609375, + "step": 15009, + "time_per_iteration": 2.429013252258301 + }, + { + "auxiliary_loss_clip": 0.01097775, + "auxiliary_loss_mlp": 0.01030275, + "balance_loss_clip": 1.01856875, + "balance_loss_mlp": 1.03367591, + "epoch": 0.9024500225462198, + "flos": 17456644391040.0, + "grad_norm": 1.8285057036627976, + "language_loss": 0.73694813, + "learning_rate": 9.893674402495399e-08, + "loss": 0.75822866, + "num_input_tokens_seen": 323703660, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 15010, + "time_per_iteration": 2.474562644958496 + }, + { + "auxiliary_loss_clip": 0.01100669, + "auxiliary_loss_mlp": 0.01027502, + "balance_loss_clip": 1.01529527, + "balance_loss_mlp": 1.03394616, + "epoch": 0.9025101457988878, + "flos": 20813609664000.0, + "grad_norm": 1.9793507655734852, + "language_loss": 0.74131656, + "learning_rate": 9.881580244839538e-08, + "loss": 0.76259828, + "num_input_tokens_seen": 323722060, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15011, + "time_per_iteration": 2.4341142177581787 + }, + { + "auxiliary_loss_clip": 0.01102832, + "auxiliary_loss_mlp": 0.01029835, + "balance_loss_clip": 1.01797962, + "balance_loss_mlp": 1.03446949, + "epoch": 0.9025702690515557, + "flos": 19026371623680.0, + "grad_norm": 1.9745596076843646, + "language_loss": 0.73315668, + "learning_rate": 9.869493296493204e-08, + "loss": 0.75448334, + "num_input_tokens_seen": 323740645, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.68359375, + "step": 15012, + "time_per_iteration": 3.8315765857696533 + }, + { + "auxiliary_loss_clip": 0.01099189, + "auxiliary_loss_mlp": 0.01034355, + "balance_loss_clip": 1.02320898, + "balance_loss_mlp": 1.03451538, + "epoch": 0.9026303923042237, + "flos": 19682818629120.0, + "grad_norm": 1.7128250561733314, + "language_loss": 0.69050443, + "learning_rate": 9.857413557914763e-08, + "loss": 0.71183991, + "num_input_tokens_seen": 323758905, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15013, + "time_per_iteration": 3.9151742458343506 + }, + { + "auxiliary_loss_clip": 0.01095444, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.01933491, + "balance_loss_mlp": 1.03227758, + "epoch": 0.9026905155568916, + "flos": 24608110504320.0, + "grad_norm": 1.3635728352284588, + "language_loss": 0.73009402, + "learning_rate": 9.845341029562249e-08, + "loss": 0.75134999, + "num_input_tokens_seen": 323780595, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15014, + "time_per_iteration": 4.017148971557617 + }, + { + "auxiliary_loss_clip": 0.01098941, + "auxiliary_loss_mlp": 0.01027719, + "balance_loss_clip": 1.01563144, + "balance_loss_mlp": 1.03362596, + "epoch": 0.9027506388095596, + "flos": 20521799573760.0, + "grad_norm": 1.7828669233535654, + "language_loss": 0.72010767, + "learning_rate": 9.833275711893474e-08, + "loss": 0.74137437, + "num_input_tokens_seen": 323798160, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 15015, + "time_per_iteration": 2.4381561279296875 + }, + { + "auxiliary_loss_clip": 0.01100433, + "auxiliary_loss_mlp": 0.01029767, + "balance_loss_clip": 1.01884782, + "balance_loss_mlp": 1.03419542, + "epoch": 0.9028107620622275, + "flos": 22784494965120.0, + "grad_norm": 1.9749885749535356, + "language_loss": 0.68876898, + "learning_rate": 9.821217605365895e-08, + "loss": 0.71007097, + "num_input_tokens_seen": 323816810, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 15016, + "time_per_iteration": 2.4668843746185303 + }, + { + "auxiliary_loss_clip": 0.0109769, + "auxiliary_loss_mlp": 0.01024064, + "balance_loss_clip": 1.01355553, + "balance_loss_mlp": 1.03386807, + "epoch": 0.9028708853148956, + "flos": 25410534382080.0, + "grad_norm": 2.00411116225002, + "language_loss": 0.70883679, + "learning_rate": 9.809166710436855e-08, + "loss": 0.73005426, + "num_input_tokens_seen": 323836900, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 15017, + "time_per_iteration": 2.482185125350952 + }, + { + "auxiliary_loss_clip": 0.01103533, + "auxiliary_loss_mlp": 0.0102941, + "balance_loss_clip": 1.0185684, + "balance_loss_mlp": 1.0386523, + "epoch": 0.9029310085675635, + "flos": 21871322478720.0, + "grad_norm": 1.6650542688248349, + "language_loss": 0.69549167, + "learning_rate": 9.797123027563237e-08, + "loss": 0.71682107, + "num_input_tokens_seen": 323855325, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6484375, + "step": 15018, + "time_per_iteration": 2.4584224224090576 + }, + { + "auxiliary_loss_clip": 0.01101224, + "auxiliary_loss_mlp": 0.01028924, + "balance_loss_clip": 1.01672328, + "balance_loss_mlp": 1.03524184, + "epoch": 0.9029911318202315, + "flos": 26214394803840.0, + "grad_norm": 1.6802563263084365, + "language_loss": 0.68777132, + "learning_rate": 9.785086557201782e-08, + "loss": 0.70907283, + "num_input_tokens_seen": 323875650, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 15019, + "time_per_iteration": 3.9201221466064453 + }, + { + "auxiliary_loss_clip": 0.01096083, + "auxiliary_loss_mlp": 0.01034982, + "balance_loss_clip": 1.02395487, + "balance_loss_mlp": 1.03281188, + "epoch": 0.9030512550728994, + "flos": 15961360095360.0, + "grad_norm": 1.798205753990298, + "language_loss": 0.71837938, + "learning_rate": 9.773057299808951e-08, + "loss": 0.73969001, + "num_input_tokens_seen": 323892920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 15020, + "time_per_iteration": 2.4389073848724365 + }, + { + "auxiliary_loss_clip": 0.01099908, + "auxiliary_loss_mlp": 0.01031951, + "balance_loss_clip": 1.02038217, + "balance_loss_mlp": 1.03332424, + "epoch": 0.9031113783255674, + "flos": 23987610034560.0, + "grad_norm": 1.5503421486801767, + "language_loss": 0.74545062, + "learning_rate": 9.7610352558408e-08, + "loss": 0.76676923, + "num_input_tokens_seen": 323913835, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15021, + "time_per_iteration": 2.4600391387939453 + }, + { + "auxiliary_loss_clip": 0.01103444, + "auxiliary_loss_mlp": 0.01028888, + "balance_loss_clip": 1.01647258, + "balance_loss_mlp": 1.03510642, + "epoch": 0.9031715015782353, + "flos": 22237216369920.0, + "grad_norm": 1.9984076842455942, + "language_loss": 0.72428268, + "learning_rate": 9.749020425753251e-08, + "loss": 0.74560601, + "num_input_tokens_seen": 323933440, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15022, + "time_per_iteration": 2.459587335586548 + }, + { + "auxiliary_loss_clip": 0.01094092, + "auxiliary_loss_mlp": 0.01026663, + "balance_loss_clip": 1.01590419, + "balance_loss_mlp": 1.03282905, + "epoch": 0.9032316248309034, + "flos": 26323168164480.0, + "grad_norm": 1.75835560133373, + "language_loss": 0.72548139, + "learning_rate": 9.737012810001943e-08, + "loss": 0.74668896, + "num_input_tokens_seen": 323954090, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.61328125, + "step": 15023, + "time_per_iteration": 2.482862710952759 + }, + { + "auxiliary_loss_clip": 0.01099722, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.01691282, + "balance_loss_mlp": 1.03500438, + "epoch": 0.9032917480835713, + "flos": 22636686499200.0, + "grad_norm": 1.6389407013469093, + "language_loss": 0.82752883, + "learning_rate": 9.725012409042155e-08, + "loss": 0.84880662, + "num_input_tokens_seen": 323974040, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15024, + "time_per_iteration": 2.4502294063568115 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.01026671, + "balance_loss_clip": 1.01547694, + "balance_loss_mlp": 1.03365493, + "epoch": 0.9033518713362393, + "flos": 23878764846720.0, + "grad_norm": 1.4809591037049192, + "language_loss": 0.69610882, + "learning_rate": 9.713019223328966e-08, + "loss": 0.71737969, + "num_input_tokens_seen": 323996125, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.66796875, + "step": 15025, + "time_per_iteration": 2.4894895553588867 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.01903272, + "balance_loss_mlp": 1.03266263, + "epoch": 0.9034119945889073, + "flos": 26905279973760.0, + "grad_norm": 2.0415991613165314, + "language_loss": 0.76887953, + "learning_rate": 9.70103325331717e-08, + "loss": 0.79015279, + "num_input_tokens_seen": 324017645, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 15026, + "time_per_iteration": 2.4938175678253174 + }, + { + "auxiliary_loss_clip": 0.01099584, + "auxiliary_loss_mlp": 0.01026983, + "balance_loss_clip": 1.01609278, + "balance_loss_mlp": 1.03508067, + "epoch": 0.9034721178415752, + "flos": 20850166730880.0, + "grad_norm": 1.603136682334736, + "language_loss": 0.68016422, + "learning_rate": 9.68905449946129e-08, + "loss": 0.7014299, + "num_input_tokens_seen": 324036875, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 15027, + "time_per_iteration": 2.4476981163024902 + }, + { + "auxiliary_loss_clip": 0.01095957, + "auxiliary_loss_mlp": 0.01028791, + "balance_loss_clip": 1.01779437, + "balance_loss_mlp": 1.03428614, + "epoch": 0.9035322410942432, + "flos": 22234307368320.0, + "grad_norm": 1.6502009028380256, + "language_loss": 0.76070625, + "learning_rate": 9.677082962215477e-08, + "loss": 0.78195375, + "num_input_tokens_seen": 324057045, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6171875, + "step": 15028, + "time_per_iteration": 2.4905800819396973 + }, + { + "auxiliary_loss_clip": 0.01099349, + "auxiliary_loss_mlp": 0.01033987, + "balance_loss_clip": 1.02226925, + "balance_loss_mlp": 1.03465712, + "epoch": 0.9035923643469111, + "flos": 25923410726400.0, + "grad_norm": 1.6442905394106826, + "language_loss": 0.69341254, + "learning_rate": 9.665118642033765e-08, + "loss": 0.71474588, + "num_input_tokens_seen": 324079735, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 15029, + "time_per_iteration": 2.5010592937469482 + }, + { + "auxiliary_loss_clip": 0.01101158, + "auxiliary_loss_mlp": 0.01029375, + "balance_loss_clip": 1.01723909, + "balance_loss_mlp": 1.03424096, + "epoch": 0.9036524875995792, + "flos": 20339804338560.0, + "grad_norm": 1.8286388517310772, + "language_loss": 0.73346627, + "learning_rate": 9.653161539369858e-08, + "loss": 0.75477159, + "num_input_tokens_seen": 324097785, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 15030, + "time_per_iteration": 2.459181308746338 + }, + { + "auxiliary_loss_clip": 0.01101727, + "auxiliary_loss_mlp": 0.01028974, + "balance_loss_clip": 1.01754785, + "balance_loss_mlp": 1.03421438, + "epoch": 0.9037126108522471, + "flos": 40114624677120.0, + "grad_norm": 2.1675373254855548, + "language_loss": 0.68079257, + "learning_rate": 9.641211654677151e-08, + "loss": 0.70209956, + "num_input_tokens_seen": 324121625, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.67578125, + "step": 15031, + "time_per_iteration": 2.5919394493103027 + }, + { + "auxiliary_loss_clip": 0.01097122, + "auxiliary_loss_mlp": 0.01024246, + "balance_loss_clip": 1.01340365, + "balance_loss_mlp": 1.03348684, + "epoch": 0.9037727341049151, + "flos": 23332024955520.0, + "grad_norm": 1.5159455888593432, + "language_loss": 0.76419586, + "learning_rate": 9.629268988408723e-08, + "loss": 0.78540957, + "num_input_tokens_seen": 324142535, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15032, + "time_per_iteration": 2.489576578140259 + }, + { + "auxiliary_loss_clip": 0.01101137, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01882803, + "balance_loss_mlp": 1.03507411, + "epoch": 0.903832857357583, + "flos": 12822659815680.0, + "grad_norm": 2.423158447270867, + "language_loss": 0.74984133, + "learning_rate": 9.617333541017502e-08, + "loss": 0.77115536, + "num_input_tokens_seen": 324159610, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 15033, + "time_per_iteration": 2.4186246395111084 + }, + { + "auxiliary_loss_clip": 0.01101634, + "auxiliary_loss_mlp": 0.01033122, + "balance_loss_clip": 1.02090335, + "balance_loss_mlp": 1.03524292, + "epoch": 0.903892980610251, + "flos": 25703026830720.0, + "grad_norm": 1.9290223402014928, + "language_loss": 0.74161011, + "learning_rate": 9.605405312956105e-08, + "loss": 0.76295769, + "num_input_tokens_seen": 324182510, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 15034, + "time_per_iteration": 2.5180463790893555 + }, + { + "auxiliary_loss_clip": 0.0110122, + "auxiliary_loss_mlp": 0.01031305, + "balance_loss_clip": 1.02005112, + "balance_loss_mlp": 1.0361948, + "epoch": 0.9039531038629189, + "flos": 14684089397760.0, + "grad_norm": 1.5379813145844734, + "language_loss": 0.63320333, + "learning_rate": 9.593484304676791e-08, + "loss": 0.65452856, + "num_input_tokens_seen": 324200555, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15035, + "time_per_iteration": 2.409738540649414 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01030413, + "balance_loss_clip": 1.01796794, + "balance_loss_mlp": 1.03505278, + "epoch": 0.904013227115587, + "flos": 24024921287040.0, + "grad_norm": 1.9696969037347303, + "language_loss": 0.61885166, + "learning_rate": 9.581570516631643e-08, + "loss": 0.64015502, + "num_input_tokens_seen": 324220255, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6484375, + "step": 15036, + "time_per_iteration": 2.470014810562134 + }, + { + "auxiliary_loss_clip": 0.01095707, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01708138, + "balance_loss_mlp": 1.03364372, + "epoch": 0.9040733503682549, + "flos": 22856459863680.0, + "grad_norm": 1.9028952034418158, + "language_loss": 0.82219112, + "learning_rate": 9.569663949272455e-08, + "loss": 0.84343117, + "num_input_tokens_seen": 324237855, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62109375, + "step": 15037, + "time_per_iteration": 2.4479734897613525 + }, + { + "auxiliary_loss_clip": 0.01100932, + "auxiliary_loss_mlp": 0.01026081, + "balance_loss_clip": 1.01457763, + "balance_loss_mlp": 1.03461885, + "epoch": 0.9041334736209229, + "flos": 19974951941760.0, + "grad_norm": 1.906996671867115, + "language_loss": 0.67425549, + "learning_rate": 9.557764603050667e-08, + "loss": 0.69552565, + "num_input_tokens_seen": 324257050, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15038, + "time_per_iteration": 2.451862335205078 + }, + { + "auxiliary_loss_clip": 0.01099559, + "auxiliary_loss_mlp": 0.01034581, + "balance_loss_clip": 1.02312541, + "balance_loss_mlp": 1.03372645, + "epoch": 0.9041935968735909, + "flos": 17530548624000.0, + "grad_norm": 1.957839871915482, + "language_loss": 0.75246155, + "learning_rate": 9.545872478417494e-08, + "loss": 0.77380288, + "num_input_tokens_seen": 324275510, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15039, + "time_per_iteration": 2.4133412837982178 + }, + { + "auxiliary_loss_clip": 0.0109794, + "auxiliary_loss_mlp": 0.0102971, + "balance_loss_clip": 1.01852202, + "balance_loss_mlp": 1.03453422, + "epoch": 0.9042537201262588, + "flos": 22780149419520.0, + "grad_norm": 2.283373021673528, + "language_loss": 0.70320803, + "learning_rate": 9.533987575823977e-08, + "loss": 0.72448456, + "num_input_tokens_seen": 324295150, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 15040, + "time_per_iteration": 2.4517173767089844 + }, + { + "auxiliary_loss_clip": 0.01096888, + "auxiliary_loss_mlp": 0.01028129, + "balance_loss_clip": 1.01703691, + "balance_loss_mlp": 1.03318739, + "epoch": 0.9043138433789268, + "flos": 20595416497920.0, + "grad_norm": 1.9449826459631894, + "language_loss": 0.68166679, + "learning_rate": 9.522109895720709e-08, + "loss": 0.70291698, + "num_input_tokens_seen": 324313855, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 15041, + "time_per_iteration": 2.4308114051818848 + }, + { + "auxiliary_loss_clip": 0.01098669, + "auxiliary_loss_mlp": 0.01028741, + "balance_loss_clip": 1.01715946, + "balance_loss_mlp": 1.03303921, + "epoch": 0.9043739666315948, + "flos": 32962978995840.0, + "grad_norm": 4.567380169873624, + "language_loss": 0.57341689, + "learning_rate": 9.510239438558155e-08, + "loss": 0.59469104, + "num_input_tokens_seen": 324338465, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 15042, + "time_per_iteration": 2.55881404876709 + }, + { + "auxiliary_loss_clip": 0.01021889, + "auxiliary_loss_mlp": 0.00999686, + "balance_loss_clip": 0.99871486, + "balance_loss_mlp": 1.00187063, + "epoch": 0.9044340898842628, + "flos": 67296418525440.0, + "grad_norm": 0.7814772334169297, + "language_loss": 0.56925297, + "learning_rate": 9.498376204786351e-08, + "loss": 0.58946878, + "num_input_tokens_seen": 324398740, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20117188, + "step": 15043, + "time_per_iteration": 3.017444610595703 + }, + { + "auxiliary_loss_clip": 0.01101344, + "auxiliary_loss_mlp": 0.01027245, + "balance_loss_clip": 1.01484108, + "balance_loss_mlp": 1.03475761, + "epoch": 0.9044942131369307, + "flos": 17713154390400.0, + "grad_norm": 1.7111729102686908, + "language_loss": 0.69845128, + "learning_rate": 9.486520194855274e-08, + "loss": 0.71973717, + "num_input_tokens_seen": 324417335, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 15044, + "time_per_iteration": 2.434213876724243 + }, + { + "auxiliary_loss_clip": 0.01101302, + "auxiliary_loss_mlp": 0.01035972, + "balance_loss_clip": 1.02353251, + "balance_loss_mlp": 1.03485394, + "epoch": 0.9045543363895987, + "flos": 17820563034240.0, + "grad_norm": 3.2925488860468506, + "language_loss": 0.69370788, + "learning_rate": 9.474671409214407e-08, + "loss": 0.71508062, + "num_input_tokens_seen": 324433240, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 15045, + "time_per_iteration": 2.3861148357391357 + }, + { + "auxiliary_loss_clip": 0.01103641, + "auxiliary_loss_mlp": 0.01035073, + "balance_loss_clip": 1.02292609, + "balance_loss_mlp": 1.03618574, + "epoch": 0.9046144596422666, + "flos": 21872723109120.0, + "grad_norm": 2.269963118730515, + "language_loss": 0.65502143, + "learning_rate": 9.462829848313081e-08, + "loss": 0.67640865, + "num_input_tokens_seen": 324452675, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 15046, + "time_per_iteration": 2.4733235836029053 + }, + { + "auxiliary_loss_clip": 0.01101419, + "auxiliary_loss_mlp": 0.01031632, + "balance_loss_clip": 1.02004504, + "balance_loss_mlp": 1.03384709, + "epoch": 0.9046745828949346, + "flos": 17672646827520.0, + "grad_norm": 2.0115064560474045, + "language_loss": 0.62080795, + "learning_rate": 9.450995512600379e-08, + "loss": 0.64213848, + "num_input_tokens_seen": 324467865, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.67578125, + "step": 15047, + "time_per_iteration": 2.4283289909362793 + }, + { + "auxiliary_loss_clip": 0.01099821, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.0202632, + "balance_loss_mlp": 1.03540373, + "epoch": 0.9047347061476025, + "flos": 25702559953920.0, + "grad_norm": 1.454198093610582, + "language_loss": 0.71481526, + "learning_rate": 9.439168402525032e-08, + "loss": 0.73612273, + "num_input_tokens_seen": 324490430, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15048, + "time_per_iteration": 2.5019192695617676 + }, + { + "auxiliary_loss_clip": 0.01100982, + "auxiliary_loss_mlp": 0.01032451, + "balance_loss_clip": 1.01958215, + "balance_loss_mlp": 1.0330019, + "epoch": 0.9047948294002706, + "flos": 15158146118400.0, + "grad_norm": 2.3424128392967405, + "language_loss": 0.75003755, + "learning_rate": 9.427348518535483e-08, + "loss": 0.77137184, + "num_input_tokens_seen": 324506620, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 15049, + "time_per_iteration": 2.400944948196411 + }, + { + "auxiliary_loss_clip": 0.01099611, + "auxiliary_loss_mlp": 0.01027604, + "balance_loss_clip": 1.01615953, + "balance_loss_mlp": 1.03538918, + "epoch": 0.9048549526529385, + "flos": 21872292145920.0, + "grad_norm": 1.6894904207673944, + "language_loss": 0.75737369, + "learning_rate": 9.415535861079993e-08, + "loss": 0.77864587, + "num_input_tokens_seen": 324525505, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 15050, + "time_per_iteration": 2.477255344390869 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.0103241, + "balance_loss_clip": 1.02103782, + "balance_loss_mlp": 1.03422129, + "epoch": 0.9049150759056065, + "flos": 23546626761600.0, + "grad_norm": 1.6844009499475698, + "language_loss": 0.81676704, + "learning_rate": 9.403730430606472e-08, + "loss": 0.83809221, + "num_input_tokens_seen": 324544415, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 15051, + "time_per_iteration": 2.4604718685150146 + }, + { + "auxiliary_loss_clip": 0.01099469, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01816452, + "balance_loss_mlp": 1.0342989, + "epoch": 0.9049751991582745, + "flos": 19645902426240.0, + "grad_norm": 2.137215474768832, + "language_loss": 0.88935357, + "learning_rate": 9.391932227562582e-08, + "loss": 0.91063577, + "num_input_tokens_seen": 324562555, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6484375, + "step": 15052, + "time_per_iteration": 2.4719793796539307 + }, + { + "auxiliary_loss_clip": 0.0110445, + "auxiliary_loss_mlp": 0.01031379, + "balance_loss_clip": 1.0199585, + "balance_loss_mlp": 1.0360204, + "epoch": 0.9050353224109424, + "flos": 15596220389760.0, + "grad_norm": 2.0056674164312835, + "language_loss": 0.76978087, + "learning_rate": 9.380141252395724e-08, + "loss": 0.79113925, + "num_input_tokens_seen": 324580865, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 15053, + "time_per_iteration": 3.8475534915924072 + }, + { + "auxiliary_loss_clip": 0.01098067, + "auxiliary_loss_mlp": 0.01033255, + "balance_loss_clip": 1.0216974, + "balance_loss_mlp": 1.03388309, + "epoch": 0.9050954456636104, + "flos": 28183592165760.0, + "grad_norm": 1.68822128465627, + "language_loss": 0.73156083, + "learning_rate": 9.368357505553049e-08, + "loss": 0.75287408, + "num_input_tokens_seen": 324600665, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15054, + "time_per_iteration": 2.504624128341675 + }, + { + "auxiliary_loss_clip": 0.01098343, + "auxiliary_loss_mlp": 0.01028194, + "balance_loss_clip": 1.01721489, + "balance_loss_mlp": 1.03398204, + "epoch": 0.9051555689162784, + "flos": 25731611078400.0, + "grad_norm": 1.5885770670663444, + "language_loss": 0.82941592, + "learning_rate": 9.356580987481333e-08, + "loss": 0.8506813, + "num_input_tokens_seen": 324618145, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 15055, + "time_per_iteration": 3.941993474960327 + }, + { + "auxiliary_loss_clip": 0.01098634, + "auxiliary_loss_mlp": 0.01033029, + "balance_loss_clip": 1.02112579, + "balance_loss_mlp": 1.03452206, + "epoch": 0.9052156921689464, + "flos": 23257258796160.0, + "grad_norm": 1.6252305322329523, + "language_loss": 0.84889591, + "learning_rate": 9.344811698627176e-08, + "loss": 0.87021255, + "num_input_tokens_seen": 324638165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 15056, + "time_per_iteration": 3.8730804920196533 + }, + { + "auxiliary_loss_clip": 0.01098086, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.01563239, + "balance_loss_mlp": 1.03365731, + "epoch": 0.9052758154216143, + "flos": 29564285097600.0, + "grad_norm": 1.901364143448049, + "language_loss": 0.71921766, + "learning_rate": 9.333049639436863e-08, + "loss": 0.74046123, + "num_input_tokens_seen": 324658560, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15057, + "time_per_iteration": 2.5385053157806396 + }, + { + "auxiliary_loss_clip": 0.01097658, + "auxiliary_loss_mlp": 0.01026955, + "balance_loss_clip": 1.01602972, + "balance_loss_mlp": 1.03402162, + "epoch": 0.9053359386742823, + "flos": 22127688823680.0, + "grad_norm": 1.8215014735352213, + "language_loss": 0.80796474, + "learning_rate": 9.321294810356418e-08, + "loss": 0.82921088, + "num_input_tokens_seen": 324679185, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 15058, + "time_per_iteration": 2.4862685203552246 + }, + { + "auxiliary_loss_clip": 0.01021601, + "auxiliary_loss_mlp": 0.00999772, + "balance_loss_clip": 0.99881822, + "balance_loss_mlp": 1.00167274, + "epoch": 0.9053960619269502, + "flos": 67090112760960.0, + "grad_norm": 0.6796033479937826, + "language_loss": 0.51406193, + "learning_rate": 9.309547211831592e-08, + "loss": 0.53427565, + "num_input_tokens_seen": 324744830, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 15059, + "time_per_iteration": 3.1576147079467773 + }, + { + "auxiliary_loss_clip": 0.01098633, + "auxiliary_loss_mlp": 0.01027608, + "balance_loss_clip": 1.01625299, + "balance_loss_mlp": 1.03367758, + "epoch": 0.9054561851796182, + "flos": 15815419136640.0, + "grad_norm": 1.6940162245344546, + "language_loss": 0.6707449, + "learning_rate": 9.297806844307831e-08, + "loss": 0.6920073, + "num_input_tokens_seen": 324762905, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 15060, + "time_per_iteration": 3.8765251636505127 + }, + { + "auxiliary_loss_clip": 0.01102869, + "auxiliary_loss_mlp": 0.01030416, + "balance_loss_clip": 1.01882291, + "balance_loss_mlp": 1.03586102, + "epoch": 0.9055163084322861, + "flos": 17566997950080.0, + "grad_norm": 2.552329280643452, + "language_loss": 0.64600372, + "learning_rate": 9.286073708230357e-08, + "loss": 0.66733658, + "num_input_tokens_seen": 324781905, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15061, + "time_per_iteration": 2.414705514907837 + }, + { + "auxiliary_loss_clip": 0.01101953, + "auxiliary_loss_mlp": 0.0103199, + "balance_loss_clip": 1.02084422, + "balance_loss_mlp": 1.03586221, + "epoch": 0.9055764316849542, + "flos": 17639573379840.0, + "grad_norm": 1.8780376792714681, + "language_loss": 0.71583116, + "learning_rate": 9.274347804044058e-08, + "loss": 0.73717052, + "num_input_tokens_seen": 324799260, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 15062, + "time_per_iteration": 2.453206777572632 + }, + { + "auxiliary_loss_clip": 0.01097235, + "auxiliary_loss_mlp": 0.01030982, + "balance_loss_clip": 1.01993728, + "balance_loss_mlp": 1.03298724, + "epoch": 0.9056365549376221, + "flos": 20120856986880.0, + "grad_norm": 1.594142455041583, + "language_loss": 0.70841157, + "learning_rate": 9.2626291321936e-08, + "loss": 0.72969377, + "num_input_tokens_seen": 324817800, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15063, + "time_per_iteration": 2.4209649562835693 + }, + { + "auxiliary_loss_clip": 0.01095403, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.01830244, + "balance_loss_mlp": 1.03247714, + "epoch": 0.9056966781902901, + "flos": 27598786836480.0, + "grad_norm": 2.2069816413887695, + "language_loss": 0.71933818, + "learning_rate": 9.250917693123406e-08, + "loss": 0.74058586, + "num_input_tokens_seen": 324838445, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62890625, + "step": 15064, + "time_per_iteration": 2.529771566390991 + }, + { + "auxiliary_loss_clip": 0.01099735, + "auxiliary_loss_mlp": 0.0103321, + "balance_loss_clip": 1.02177143, + "balance_loss_mlp": 1.03191257, + "epoch": 0.9057568014429581, + "flos": 25920106675200.0, + "grad_norm": 1.7008106699079477, + "language_loss": 0.69489098, + "learning_rate": 9.23921348727752e-08, + "loss": 0.71622044, + "num_input_tokens_seen": 324859895, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6796875, + "step": 15065, + "time_per_iteration": 2.4695396423339844 + }, + { + "auxiliary_loss_clip": 0.01100826, + "auxiliary_loss_mlp": 0.0103538, + "balance_loss_clip": 1.0242219, + "balance_loss_mlp": 1.036026, + "epoch": 0.905816924695626, + "flos": 22930364096640.0, + "grad_norm": 1.654206312026751, + "language_loss": 0.63057613, + "learning_rate": 9.227516515099743e-08, + "loss": 0.6519382, + "num_input_tokens_seen": 324879580, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15066, + "time_per_iteration": 2.5075199604034424 + }, + { + "auxiliary_loss_clip": 0.01102024, + "auxiliary_loss_mlp": 0.01028293, + "balance_loss_clip": 1.01550758, + "balance_loss_mlp": 1.03313625, + "epoch": 0.905877047948294, + "flos": 22157422306560.0, + "grad_norm": 1.7954074160752378, + "language_loss": 0.80386341, + "learning_rate": 9.215826777033675e-08, + "loss": 0.82516658, + "num_input_tokens_seen": 324898950, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 15067, + "time_per_iteration": 2.4531846046447754 + }, + { + "auxiliary_loss_clip": 0.01101013, + "auxiliary_loss_mlp": 0.01034254, + "balance_loss_clip": 1.02201724, + "balance_loss_mlp": 1.03438091, + "epoch": 0.905937171200962, + "flos": 15304805349120.0, + "grad_norm": 1.6316864228138155, + "language_loss": 0.70004576, + "learning_rate": 9.204144273522563e-08, + "loss": 0.72139847, + "num_input_tokens_seen": 324917455, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 15068, + "time_per_iteration": 2.454280376434326 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.01653373, + "balance_loss_mlp": 1.03272903, + "epoch": 0.90599729445363, + "flos": 19462973437440.0, + "grad_norm": 1.9832765312866232, + "language_loss": 0.85433835, + "learning_rate": 9.19246900500943e-08, + "loss": 0.87558019, + "num_input_tokens_seen": 324934495, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15069, + "time_per_iteration": 2.446370840072632 + }, + { + "auxiliary_loss_clip": 0.0110437, + "auxiliary_loss_mlp": 0.01028245, + "balance_loss_clip": 1.01588929, + "balance_loss_mlp": 1.03456664, + "epoch": 0.9060574177062979, + "flos": 23732967542400.0, + "grad_norm": 1.757997353290004, + "language_loss": 0.58988464, + "learning_rate": 9.180800971936987e-08, + "loss": 0.61121076, + "num_input_tokens_seen": 324953230, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.69921875, + "step": 15070, + "time_per_iteration": 2.4755237102508545 + }, + { + "auxiliary_loss_clip": 0.01102192, + "auxiliary_loss_mlp": 0.01025212, + "balance_loss_clip": 1.01301742, + "balance_loss_mlp": 1.03411961, + "epoch": 0.9061175409589659, + "flos": 17311134395520.0, + "grad_norm": 3.1017762628694516, + "language_loss": 0.81624448, + "learning_rate": 9.169140174747724e-08, + "loss": 0.83751857, + "num_input_tokens_seen": 324969880, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 15071, + "time_per_iteration": 2.4169604778289795 + }, + { + "auxiliary_loss_clip": 0.01102965, + "auxiliary_loss_mlp": 0.01035458, + "balance_loss_clip": 1.02284586, + "balance_loss_mlp": 1.03447402, + "epoch": 0.9061776642116338, + "flos": 17778439359360.0, + "grad_norm": 1.8293370138207419, + "language_loss": 0.62059128, + "learning_rate": 9.157486613883758e-08, + "loss": 0.64197552, + "num_input_tokens_seen": 324987005, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 15072, + "time_per_iteration": 2.4581499099731445 + }, + { + "auxiliary_loss_clip": 0.01098912, + "auxiliary_loss_mlp": 0.01032654, + "balance_loss_clip": 1.02073944, + "balance_loss_mlp": 1.03379536, + "epoch": 0.9062377874643018, + "flos": 42777688037760.0, + "grad_norm": 1.8672804369539069, + "language_loss": 0.73105884, + "learning_rate": 9.145840289787021e-08, + "loss": 0.75237453, + "num_input_tokens_seen": 325010700, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65234375, + "step": 15073, + "time_per_iteration": 2.641338586807251 + }, + { + "auxiliary_loss_clip": 0.01097199, + "auxiliary_loss_mlp": 0.01024734, + "balance_loss_clip": 1.01392126, + "balance_loss_mlp": 1.03387177, + "epoch": 0.9062979107169697, + "flos": 16361620323840.0, + "grad_norm": 1.788469817713978, + "language_loss": 0.80764318, + "learning_rate": 9.134201202899161e-08, + "loss": 0.82886249, + "num_input_tokens_seen": 325028760, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 15074, + "time_per_iteration": 2.460603952407837 + }, + { + "auxiliary_loss_clip": 0.01022009, + "auxiliary_loss_mlp": 0.00998758, + "balance_loss_clip": 0.99781018, + "balance_loss_mlp": 1.00188327, + "epoch": 0.9063580339696378, + "flos": 69313988528640.0, + "grad_norm": 0.7466398026663508, + "language_loss": 0.52349371, + "learning_rate": 9.122569353661513e-08, + "loss": 0.54370141, + "num_input_tokens_seen": 325093545, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20117188, + "step": 15075, + "time_per_iteration": 3.1416776180267334 + }, + { + "auxiliary_loss_clip": 0.01022161, + "auxiliary_loss_mlp": 0.01000705, + "balance_loss_clip": 0.99977523, + "balance_loss_mlp": 1.00210333, + "epoch": 0.9064181572223057, + "flos": 58794747148800.0, + "grad_norm": 0.7362778409223477, + "language_loss": 0.62075734, + "learning_rate": 9.11094474251517e-08, + "loss": 0.64098597, + "num_input_tokens_seen": 325152295, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20117188, + "step": 15076, + "time_per_iteration": 2.9436872005462646 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01034374, + "balance_loss_clip": 1.0230608, + "balance_loss_mlp": 1.03310323, + "epoch": 0.9064782804749737, + "flos": 21762692772480.0, + "grad_norm": 1.8036310285355786, + "language_loss": 0.82249612, + "learning_rate": 9.09932736990091e-08, + "loss": 0.84382766, + "num_input_tokens_seen": 325169705, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15077, + "time_per_iteration": 2.446316719055176 + }, + { + "auxiliary_loss_clip": 0.01095172, + "auxiliary_loss_mlp": 0.01023194, + "balance_loss_clip": 1.01250672, + "balance_loss_mlp": 1.03184319, + "epoch": 0.9065384037276417, + "flos": 21397373498880.0, + "grad_norm": 1.8387781390293605, + "language_loss": 0.83909905, + "learning_rate": 9.08771723625934e-08, + "loss": 0.86028278, + "num_input_tokens_seen": 325189175, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 15078, + "time_per_iteration": 2.4532206058502197 + }, + { + "auxiliary_loss_clip": 0.01095731, + "auxiliary_loss_mlp": 0.01029348, + "balance_loss_clip": 1.01802921, + "balance_loss_mlp": 1.03388798, + "epoch": 0.9065985269803096, + "flos": 38283646849920.0, + "grad_norm": 1.4797980658889718, + "language_loss": 0.65172887, + "learning_rate": 9.076114342030617e-08, + "loss": 0.67297965, + "num_input_tokens_seen": 325211020, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6171875, + "step": 15079, + "time_per_iteration": 2.595116376876831 + }, + { + "auxiliary_loss_clip": 0.01096827, + "auxiliary_loss_mlp": 0.0102679, + "balance_loss_clip": 1.01537549, + "balance_loss_mlp": 1.03209925, + "epoch": 0.9066586502329776, + "flos": 44818562989440.0, + "grad_norm": 1.599991637278185, + "language_loss": 0.70963979, + "learning_rate": 9.064518687654765e-08, + "loss": 0.73087597, + "num_input_tokens_seen": 325236970, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15080, + "time_per_iteration": 2.634389877319336 + }, + { + "auxiliary_loss_clip": 0.01102802, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.01576686, + "balance_loss_mlp": 1.0354439, + "epoch": 0.9067187734856456, + "flos": 18623992492800.0, + "grad_norm": 2.1816197679044773, + "language_loss": 0.7070353, + "learning_rate": 9.052930273571547e-08, + "loss": 0.72834378, + "num_input_tokens_seen": 325252670, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15081, + "time_per_iteration": 2.4671077728271484 + }, + { + "auxiliary_loss_clip": 0.01097302, + "auxiliary_loss_mlp": 0.01032284, + "balance_loss_clip": 1.02084613, + "balance_loss_mlp": 1.03395283, + "epoch": 0.9067788967383136, + "flos": 22747578762240.0, + "grad_norm": 3.2112042935174565, + "language_loss": 0.74457014, + "learning_rate": 9.04134910022032e-08, + "loss": 0.76586604, + "num_input_tokens_seen": 325273860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 15082, + "time_per_iteration": 2.4647719860076904 + }, + { + "auxiliary_loss_clip": 0.01098042, + "auxiliary_loss_mlp": 0.01032237, + "balance_loss_clip": 1.02134085, + "balance_loss_mlp": 1.03468823, + "epoch": 0.9068390199909815, + "flos": 27670787648640.0, + "grad_norm": 2.1491807058491905, + "language_loss": 0.78196669, + "learning_rate": 9.029775168040266e-08, + "loss": 0.80326951, + "num_input_tokens_seen": 325294140, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 15083, + "time_per_iteration": 2.504772424697876 + }, + { + "auxiliary_loss_clip": 0.01096671, + "auxiliary_loss_mlp": 0.01029863, + "balance_loss_clip": 1.01927173, + "balance_loss_mlp": 1.03399706, + "epoch": 0.9068991432436495, + "flos": 24244012293120.0, + "grad_norm": 1.6316555525453107, + "language_loss": 0.69215107, + "learning_rate": 9.01820847747028e-08, + "loss": 0.7134164, + "num_input_tokens_seen": 325313130, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 15084, + "time_per_iteration": 2.4626307487487793 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.01032, + "balance_loss_clip": 1.0205797, + "balance_loss_mlp": 1.03536558, + "epoch": 0.9069592664963174, + "flos": 28033305661440.0, + "grad_norm": 1.5369468428614528, + "language_loss": 0.66654259, + "learning_rate": 9.006649028948965e-08, + "loss": 0.68786484, + "num_input_tokens_seen": 325334880, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15085, + "time_per_iteration": 2.526599645614624 + }, + { + "auxiliary_loss_clip": 0.01021996, + "auxiliary_loss_mlp": 0.00999372, + "balance_loss_clip": 0.99840033, + "balance_loss_mlp": 1.00198984, + "epoch": 0.9070193897489854, + "flos": 68778414789120.0, + "grad_norm": 0.7678247485693439, + "language_loss": 0.61296463, + "learning_rate": 8.995096822914638e-08, + "loss": 0.63317835, + "num_input_tokens_seen": 325394175, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20019531, + "step": 15086, + "time_per_iteration": 3.064495325088501 + }, + { + "auxiliary_loss_clip": 0.01097744, + "auxiliary_loss_mlp": 0.0103656, + "balance_loss_clip": 1.02425778, + "balance_loss_mlp": 1.03301144, + "epoch": 0.9070795130016533, + "flos": 23441624328960.0, + "grad_norm": 1.5347716312449224, + "language_loss": 0.72131354, + "learning_rate": 8.983551859805416e-08, + "loss": 0.74265659, + "num_input_tokens_seen": 325415020, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 15087, + "time_per_iteration": 2.4933295249938965 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01024891, + "balance_loss_clip": 1.01387572, + "balance_loss_mlp": 1.03501546, + "epoch": 0.9071396362543214, + "flos": 18916413114240.0, + "grad_norm": 3.778989757753675, + "language_loss": 0.76889527, + "learning_rate": 8.972014140059058e-08, + "loss": 0.79014421, + "num_input_tokens_seen": 325433595, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 15088, + "time_per_iteration": 2.420175552368164 + }, + { + "auxiliary_loss_clip": 0.01095841, + "auxiliary_loss_mlp": 0.01028108, + "balance_loss_clip": 1.01755214, + "balance_loss_mlp": 1.03368163, + "epoch": 0.9071997595069893, + "flos": 25228646887680.0, + "grad_norm": 1.9194225212557636, + "language_loss": 0.73643494, + "learning_rate": 8.960483664113038e-08, + "loss": 0.7576744, + "num_input_tokens_seen": 325451605, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 15089, + "time_per_iteration": 2.472822904586792 + }, + { + "auxiliary_loss_clip": 0.01093778, + "auxiliary_loss_mlp": 0.01030536, + "balance_loss_clip": 1.02005196, + "balance_loss_mlp": 1.03267169, + "epoch": 0.9072598827596573, + "flos": 24346608514560.0, + "grad_norm": 1.8055084062755367, + "language_loss": 0.75715423, + "learning_rate": 8.948960432404628e-08, + "loss": 0.77839744, + "num_input_tokens_seen": 325470645, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.609375, + "step": 15090, + "time_per_iteration": 2.4550294876098633 + }, + { + "auxiliary_loss_clip": 0.01100863, + "auxiliary_loss_mlp": 0.01027406, + "balance_loss_clip": 1.01507401, + "balance_loss_mlp": 1.03407693, + "epoch": 0.9073200060123253, + "flos": 22674967418880.0, + "grad_norm": 2.1593176290056766, + "language_loss": 0.77432215, + "learning_rate": 8.93744444537079e-08, + "loss": 0.79560483, + "num_input_tokens_seen": 325488070, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 15091, + "time_per_iteration": 2.4498257637023926 + }, + { + "auxiliary_loss_clip": 0.01092067, + "auxiliary_loss_mlp": 0.0102371, + "balance_loss_clip": 1.01344657, + "balance_loss_mlp": 1.03182781, + "epoch": 0.9073801292649932, + "flos": 23695476721920.0, + "grad_norm": 1.6802751346458031, + "language_loss": 0.86002195, + "learning_rate": 8.925935703448217e-08, + "loss": 0.88117981, + "num_input_tokens_seen": 325509285, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 15092, + "time_per_iteration": 2.455930233001709 + }, + { + "auxiliary_loss_clip": 0.01100534, + "auxiliary_loss_mlp": 0.01031045, + "balance_loss_clip": 1.01952982, + "balance_loss_mlp": 1.03627157, + "epoch": 0.9074402525176612, + "flos": 25375413859200.0, + "grad_norm": 1.9030334099331545, + "language_loss": 0.78655577, + "learning_rate": 8.914434207073296e-08, + "loss": 0.80787158, + "num_input_tokens_seen": 325529360, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15093, + "time_per_iteration": 2.4959311485290527 + }, + { + "auxiliary_loss_clip": 0.01021984, + "auxiliary_loss_mlp": 0.00998909, + "balance_loss_clip": 0.9979021, + "balance_loss_mlp": 1.00177145, + "epoch": 0.9075003757703292, + "flos": 67649024384640.0, + "grad_norm": 0.7368950598550581, + "language_loss": 0.57025433, + "learning_rate": 8.902939956682188e-08, + "loss": 0.59046328, + "num_input_tokens_seen": 325583565, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.203125, + "step": 15094, + "time_per_iteration": 2.9761135578155518 + }, + { + "auxiliary_loss_clip": 0.01102196, + "auxiliary_loss_mlp": 0.01031359, + "balance_loss_clip": 1.01930702, + "balance_loss_mlp": 1.03587985, + "epoch": 0.9075604990229972, + "flos": 22453649769600.0, + "grad_norm": 2.2862090497485945, + "language_loss": 0.71629637, + "learning_rate": 8.891452952710742e-08, + "loss": 0.73763192, + "num_input_tokens_seen": 325603690, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 15095, + "time_per_iteration": 3.8516845703125 + }, + { + "auxiliary_loss_clip": 0.01099489, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01822233, + "balance_loss_mlp": 1.03436017, + "epoch": 0.9076206222756651, + "flos": 19536662188800.0, + "grad_norm": 1.6691319696305897, + "language_loss": 0.74130392, + "learning_rate": 8.879973195594526e-08, + "loss": 0.76259142, + "num_input_tokens_seen": 325622255, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 15096, + "time_per_iteration": 2.451765537261963 + }, + { + "auxiliary_loss_clip": 0.01101574, + "auxiliary_loss_mlp": 0.01037614, + "balance_loss_clip": 1.02484095, + "balance_loss_mlp": 1.03521657, + "epoch": 0.9076807455283331, + "flos": 30116914819200.0, + "grad_norm": 1.859736829180508, + "language_loss": 0.57428157, + "learning_rate": 8.868500685768898e-08, + "loss": 0.59567344, + "num_input_tokens_seen": 325640165, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 15097, + "time_per_iteration": 3.881094217300415 + }, + { + "auxiliary_loss_clip": 0.01093901, + "auxiliary_loss_mlp": 0.01022633, + "balance_loss_clip": 1.01194012, + "balance_loss_mlp": 1.03041399, + "epoch": 0.907740868781001, + "flos": 18697537589760.0, + "grad_norm": 1.7073383021734174, + "language_loss": 0.80004597, + "learning_rate": 8.857035423668935e-08, + "loss": 0.82121134, + "num_input_tokens_seen": 325659455, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 15098, + "time_per_iteration": 3.9118173122406006 + }, + { + "auxiliary_loss_clip": 0.01102439, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.01512241, + "balance_loss_mlp": 1.03452134, + "epoch": 0.907800992033669, + "flos": 22638805401600.0, + "grad_norm": 1.7264575763342218, + "language_loss": 0.66292477, + "learning_rate": 8.845577409729266e-08, + "loss": 0.68422097, + "num_input_tokens_seen": 325678095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 15099, + "time_per_iteration": 2.48301100730896 + }, + { + "auxiliary_loss_clip": 0.01101737, + "auxiliary_loss_mlp": 0.01033877, + "balance_loss_clip": 1.02177739, + "balance_loss_mlp": 1.03520679, + "epoch": 0.907861115286337, + "flos": 21287666384640.0, + "grad_norm": 2.2265413383772255, + "language_loss": 0.70710111, + "learning_rate": 8.834126644384477e-08, + "loss": 0.72845727, + "num_input_tokens_seen": 325695825, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15100, + "time_per_iteration": 2.43037486076355 + }, + { + "auxiliary_loss_clip": 0.0102224, + "auxiliary_loss_mlp": 0.01001474, + "balance_loss_clip": 1.00043106, + "balance_loss_mlp": 1.00208926, + "epoch": 0.907921238539005, + "flos": 69739493040000.0, + "grad_norm": 0.62151668648413, + "language_loss": 0.53409314, + "learning_rate": 8.822683128068775e-08, + "loss": 0.55433023, + "num_input_tokens_seen": 325764515, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20117188, + "step": 15101, + "time_per_iteration": 3.1229333877563477 + }, + { + "auxiliary_loss_clip": 0.01099032, + "auxiliary_loss_mlp": 0.01026926, + "balance_loss_clip": 1.01553512, + "balance_loss_mlp": 1.03384113, + "epoch": 0.9079813617916729, + "flos": 23477391296640.0, + "grad_norm": 1.6634584368490581, + "language_loss": 0.6806314, + "learning_rate": 8.811246861216081e-08, + "loss": 0.70189095, + "num_input_tokens_seen": 325783235, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 15102, + "time_per_iteration": 3.915992498397827 + }, + { + "auxiliary_loss_clip": 0.01099332, + "auxiliary_loss_mlp": 0.01028654, + "balance_loss_clip": 1.01760292, + "balance_loss_mlp": 1.03539491, + "epoch": 0.9080414850443409, + "flos": 22929933133440.0, + "grad_norm": 1.866647802447474, + "language_loss": 0.79140002, + "learning_rate": 8.799817844260049e-08, + "loss": 0.81267983, + "num_input_tokens_seen": 325800195, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15103, + "time_per_iteration": 2.4541261196136475 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.0102728, + "balance_loss_clip": 1.01546621, + "balance_loss_mlp": 1.03330398, + "epoch": 0.9081016082970089, + "flos": 26177083551360.0, + "grad_norm": 1.6943626894395103, + "language_loss": 0.71684384, + "learning_rate": 8.78839607763413e-08, + "loss": 0.7381103, + "num_input_tokens_seen": 325820215, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 15104, + "time_per_iteration": 2.482779026031494 + }, + { + "auxiliary_loss_clip": 0.01096986, + "auxiliary_loss_mlp": 0.01026694, + "balance_loss_clip": 1.01615047, + "balance_loss_mlp": 1.03385842, + "epoch": 0.9081617315496768, + "flos": 24462169545600.0, + "grad_norm": 1.8695561249612997, + "language_loss": 0.77266347, + "learning_rate": 8.77698156177138e-08, + "loss": 0.79390025, + "num_input_tokens_seen": 325838415, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 15105, + "time_per_iteration": 2.4875688552856445 + }, + { + "auxiliary_loss_clip": 0.01098253, + "auxiliary_loss_mlp": 0.01033196, + "balance_loss_clip": 1.02169788, + "balance_loss_mlp": 1.03278279, + "epoch": 0.9082218548023449, + "flos": 24746868743040.0, + "grad_norm": 1.8708472286705515, + "language_loss": 0.73539734, + "learning_rate": 8.765574297104628e-08, + "loss": 0.75671178, + "num_input_tokens_seen": 325855580, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 15106, + "time_per_iteration": 2.4679880142211914 + }, + { + "auxiliary_loss_clip": 0.01099508, + "auxiliary_loss_mlp": 0.01031628, + "balance_loss_clip": 1.01973677, + "balance_loss_mlp": 1.03357434, + "epoch": 0.9082819780550128, + "flos": 24421302846720.0, + "grad_norm": 1.6236610377303244, + "language_loss": 0.80442846, + "learning_rate": 8.754174284066462e-08, + "loss": 0.82573986, + "num_input_tokens_seen": 325874890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15107, + "time_per_iteration": 2.4709742069244385 + }, + { + "auxiliary_loss_clip": 0.01021838, + "auxiliary_loss_mlp": 0.01001997, + "balance_loss_clip": 1.00099587, + "balance_loss_mlp": 1.00181663, + "epoch": 0.9083421013076808, + "flos": 59609704872960.0, + "grad_norm": 0.8133672060912172, + "language_loss": 0.59727746, + "learning_rate": 8.742781523089205e-08, + "loss": 0.61751574, + "num_input_tokens_seen": 325935835, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 15108, + "time_per_iteration": 3.0274457931518555 + }, + { + "auxiliary_loss_clip": 0.01098636, + "auxiliary_loss_mlp": 0.01027249, + "balance_loss_clip": 1.01596022, + "balance_loss_mlp": 1.03307295, + "epoch": 0.9084022245603487, + "flos": 33620216100480.0, + "grad_norm": 1.8322800202774943, + "language_loss": 0.73455155, + "learning_rate": 8.73139601460482e-08, + "loss": 0.75581038, + "num_input_tokens_seen": 325958035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15109, + "time_per_iteration": 2.544933557510376 + }, + { + "auxiliary_loss_clip": 0.01096619, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01830721, + "balance_loss_mlp": 1.0329206, + "epoch": 0.9084623478130167, + "flos": 24971705925120.0, + "grad_norm": 4.344593502193457, + "language_loss": 0.71237719, + "learning_rate": 8.720017759045073e-08, + "loss": 0.73363197, + "num_input_tokens_seen": 325979870, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15110, + "time_per_iteration": 2.497110605239868 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01027944, + "balance_loss_clip": 1.01717973, + "balance_loss_mlp": 1.03286374, + "epoch": 0.9085224710656846, + "flos": 31461804869760.0, + "grad_norm": 1.8914312332237635, + "language_loss": 0.68927699, + "learning_rate": 8.708646756841421e-08, + "loss": 0.7105211, + "num_input_tokens_seen": 325998245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 15111, + "time_per_iteration": 2.505744457244873 + }, + { + "auxiliary_loss_clip": 0.01022286, + "auxiliary_loss_mlp": 0.01003787, + "balance_loss_clip": 1.0028336, + "balance_loss_mlp": 1.00213766, + "epoch": 0.9085825943183526, + "flos": 64917012867840.0, + "grad_norm": 0.6884098110857299, + "language_loss": 0.51761699, + "learning_rate": 8.697283008425026e-08, + "loss": 0.53787768, + "num_input_tokens_seen": 326061770, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15112, + "time_per_iteration": 3.103571891784668 + }, + { + "auxiliary_loss_clip": 0.01099285, + "auxiliary_loss_mlp": 0.01033775, + "balance_loss_clip": 1.02190137, + "balance_loss_mlp": 1.03310108, + "epoch": 0.9086427175710206, + "flos": 18953221576320.0, + "grad_norm": 2.8732583943790106, + "language_loss": 0.69663835, + "learning_rate": 8.685926514226837e-08, + "loss": 0.717969, + "num_input_tokens_seen": 326080945, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 15113, + "time_per_iteration": 2.4426491260528564 + }, + { + "auxiliary_loss_clip": 0.01100751, + "auxiliary_loss_mlp": 0.01030585, + "balance_loss_clip": 1.01948047, + "balance_loss_mlp": 1.03551614, + "epoch": 0.9087028408236886, + "flos": 34014873807360.0, + "grad_norm": 2.070851389766841, + "language_loss": 0.79043949, + "learning_rate": 8.674577274677508e-08, + "loss": 0.81175292, + "num_input_tokens_seen": 326100630, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 15114, + "time_per_iteration": 2.5486881732940674 + }, + { + "auxiliary_loss_clip": 0.01104287, + "auxiliary_loss_mlp": 0.01029821, + "balance_loss_clip": 1.01682091, + "balance_loss_mlp": 1.03576267, + "epoch": 0.9087629640763565, + "flos": 21944580266880.0, + "grad_norm": 1.912898599360711, + "language_loss": 0.70125389, + "learning_rate": 8.663235290207405e-08, + "loss": 0.72259498, + "num_input_tokens_seen": 326120145, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 15115, + "time_per_iteration": 2.4841859340667725 + }, + { + "auxiliary_loss_clip": 0.01107149, + "auxiliary_loss_mlp": 0.01027577, + "balance_loss_clip": 1.01529849, + "balance_loss_mlp": 1.03841996, + "epoch": 0.9088230873290245, + "flos": 21762908254080.0, + "grad_norm": 1.5429378275773247, + "language_loss": 0.6537832, + "learning_rate": 8.651900561246561e-08, + "loss": 0.67513043, + "num_input_tokens_seen": 326140715, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 15116, + "time_per_iteration": 2.473940849304199 + }, + { + "auxiliary_loss_clip": 0.01098134, + "auxiliary_loss_mlp": 0.01031, + "balance_loss_clip": 1.01934111, + "balance_loss_mlp": 1.03502166, + "epoch": 0.9088832105816925, + "flos": 21541267382400.0, + "grad_norm": 2.042140958343974, + "language_loss": 0.69371068, + "learning_rate": 8.640573088224812e-08, + "loss": 0.71500206, + "num_input_tokens_seen": 326159130, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6328125, + "step": 15117, + "time_per_iteration": 2.446790933609009 + }, + { + "auxiliary_loss_clip": 0.01097454, + "auxiliary_loss_mlp": 0.01026475, + "balance_loss_clip": 1.01577044, + "balance_loss_mlp": 1.03358901, + "epoch": 0.9089433338343604, + "flos": 25996704428160.0, + "grad_norm": 1.3695155958965473, + "language_loss": 0.74376065, + "learning_rate": 8.629252871571745e-08, + "loss": 0.76499993, + "num_input_tokens_seen": 326181375, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15118, + "time_per_iteration": 2.5014708042144775 + }, + { + "auxiliary_loss_clip": 0.0110317, + "auxiliary_loss_mlp": 0.01036197, + "balance_loss_clip": 1.02320337, + "balance_loss_mlp": 1.03314781, + "epoch": 0.9090034570870285, + "flos": 21178426147200.0, + "grad_norm": 1.9496085860270096, + "language_loss": 0.72797048, + "learning_rate": 8.617939911716554e-08, + "loss": 0.74936414, + "num_input_tokens_seen": 326199740, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 15119, + "time_per_iteration": 2.438727617263794 + }, + { + "auxiliary_loss_clip": 0.01103886, + "auxiliary_loss_mlp": 0.01032043, + "balance_loss_clip": 1.01900196, + "balance_loss_mlp": 1.03510284, + "epoch": 0.9090635803396964, + "flos": 16141811045760.0, + "grad_norm": 2.433261916174762, + "language_loss": 0.71455759, + "learning_rate": 8.60663420908827e-08, + "loss": 0.73591691, + "num_input_tokens_seen": 326214350, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 15120, + "time_per_iteration": 2.4505975246429443 + }, + { + "auxiliary_loss_clip": 0.01100898, + "auxiliary_loss_mlp": 0.01024815, + "balance_loss_clip": 1.01320434, + "balance_loss_mlp": 1.03470683, + "epoch": 0.9091237035923644, + "flos": 20591537829120.0, + "grad_norm": 2.262414929698902, + "language_loss": 0.65746844, + "learning_rate": 8.595335764115596e-08, + "loss": 0.6787256, + "num_input_tokens_seen": 326234580, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15121, + "time_per_iteration": 2.460604667663574 + }, + { + "auxiliary_loss_clip": 0.0110018, + "auxiliary_loss_mlp": 0.01037, + "balance_loss_clip": 1.02520478, + "balance_loss_mlp": 1.03480613, + "epoch": 0.9091838268450323, + "flos": 52227760164480.0, + "grad_norm": 1.7042236575270435, + "language_loss": 0.70428181, + "learning_rate": 8.58404457722699e-08, + "loss": 0.72565359, + "num_input_tokens_seen": 326259080, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15122, + "time_per_iteration": 2.750230550765991 + }, + { + "auxiliary_loss_clip": 0.01095884, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.01950181, + "balance_loss_mlp": 1.03195405, + "epoch": 0.9092439500977003, + "flos": 20559613616640.0, + "grad_norm": 1.642128872010493, + "language_loss": 0.74480474, + "learning_rate": 8.572760648850575e-08, + "loss": 0.7660687, + "num_input_tokens_seen": 326280175, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15123, + "time_per_iteration": 2.4654879570007324 + }, + { + "auxiliary_loss_clip": 0.01096662, + "auxiliary_loss_mlp": 0.01028184, + "balance_loss_clip": 1.01699638, + "balance_loss_mlp": 1.03369832, + "epoch": 0.9093040733503682, + "flos": 28617859595520.0, + "grad_norm": 2.1206798390190262, + "language_loss": 0.75936723, + "learning_rate": 8.561483979414253e-08, + "loss": 0.78061569, + "num_input_tokens_seen": 326297990, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62890625, + "step": 15124, + "time_per_iteration": 2.52099871635437 + }, + { + "auxiliary_loss_clip": 0.01098724, + "auxiliary_loss_mlp": 0.01027911, + "balance_loss_clip": 1.0162288, + "balance_loss_mlp": 1.03414643, + "epoch": 0.9093641966030362, + "flos": 23440187784960.0, + "grad_norm": 1.8194159456969368, + "language_loss": 0.71981823, + "learning_rate": 8.55021456934566e-08, + "loss": 0.74108458, + "num_input_tokens_seen": 326316735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.64453125, + "step": 15125, + "time_per_iteration": 2.4560205936431885 + }, + { + "auxiliary_loss_clip": 0.01100093, + "auxiliary_loss_mlp": 0.01033365, + "balance_loss_clip": 1.02221918, + "balance_loss_mlp": 1.03570986, + "epoch": 0.9094243198557042, + "flos": 16800197385600.0, + "grad_norm": 1.7563994353439563, + "language_loss": 0.79251933, + "learning_rate": 8.538952419072143e-08, + "loss": 0.81385386, + "num_input_tokens_seen": 326334370, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 15126, + "time_per_iteration": 2.453873634338379 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01030363, + "balance_loss_clip": 1.01834726, + "balance_loss_mlp": 1.03453374, + "epoch": 0.9094844431083722, + "flos": 24273278899200.0, + "grad_norm": 1.6858763674197714, + "language_loss": 0.75407279, + "learning_rate": 8.527697529020694e-08, + "loss": 0.77535784, + "num_input_tokens_seen": 326353435, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6328125, + "step": 15127, + "time_per_iteration": 2.4735212326049805 + }, + { + "auxiliary_loss_clip": 0.01099168, + "auxiliary_loss_mlp": 0.01031751, + "balance_loss_clip": 1.0202589, + "balance_loss_mlp": 1.03267837, + "epoch": 0.9095445663610401, + "flos": 21944652094080.0, + "grad_norm": 1.896028987047219, + "language_loss": 0.6233058, + "learning_rate": 8.516449899618173e-08, + "loss": 0.64461499, + "num_input_tokens_seen": 326371810, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15128, + "time_per_iteration": 2.4959044456481934 + }, + { + "auxiliary_loss_clip": 0.01096673, + "auxiliary_loss_mlp": 0.01023024, + "balance_loss_clip": 1.01165116, + "balance_loss_mlp": 1.03295422, + "epoch": 0.9096046896137081, + "flos": 19792848965760.0, + "grad_norm": 1.5525985783311769, + "language_loss": 0.76395273, + "learning_rate": 8.505209531291013e-08, + "loss": 0.78514969, + "num_input_tokens_seen": 326391380, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 15129, + "time_per_iteration": 2.4258792400360107 + }, + { + "auxiliary_loss_clip": 0.01100603, + "auxiliary_loss_mlp": 0.01027213, + "balance_loss_clip": 1.0159955, + "balance_loss_mlp": 1.03430605, + "epoch": 0.909664812866376, + "flos": 22638087129600.0, + "grad_norm": 2.1640381281313377, + "language_loss": 0.83347154, + "learning_rate": 8.49397642446552e-08, + "loss": 0.85474968, + "num_input_tokens_seen": 326408800, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15130, + "time_per_iteration": 2.474724054336548 + }, + { + "auxiliary_loss_clip": 0.01102705, + "auxiliary_loss_mlp": 0.01029426, + "balance_loss_clip": 1.01755881, + "balance_loss_mlp": 1.03607357, + "epoch": 0.909724936119044, + "flos": 39852153020160.0, + "grad_norm": 1.6552952857551617, + "language_loss": 0.7494061, + "learning_rate": 8.482750579567644e-08, + "loss": 0.7707274, + "num_input_tokens_seen": 326431565, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 15131, + "time_per_iteration": 2.592521905899048 + }, + { + "auxiliary_loss_clip": 0.01101645, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.0159862, + "balance_loss_mlp": 1.03632045, + "epoch": 0.9097850593717121, + "flos": 35071616954880.0, + "grad_norm": 2.576621383349304, + "language_loss": 0.5961653, + "learning_rate": 8.471531997023085e-08, + "loss": 0.61745852, + "num_input_tokens_seen": 326451715, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15132, + "time_per_iteration": 2.5617480278015137 + }, + { + "auxiliary_loss_clip": 0.01101277, + "auxiliary_loss_mlp": 0.01028373, + "balance_loss_clip": 1.01764417, + "balance_loss_mlp": 1.03624594, + "epoch": 0.90984518262438, + "flos": 23367468700800.0, + "grad_norm": 1.5851128312655174, + "language_loss": 0.82403994, + "learning_rate": 8.460320677257193e-08, + "loss": 0.84533644, + "num_input_tokens_seen": 326470855, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 15133, + "time_per_iteration": 2.457850456237793 + }, + { + "auxiliary_loss_clip": 0.01099344, + "auxiliary_loss_mlp": 0.01029584, + "balance_loss_clip": 1.01784801, + "balance_loss_mlp": 1.03306341, + "epoch": 0.909905305877048, + "flos": 27523302405120.0, + "grad_norm": 1.8998682827883817, + "language_loss": 0.7366299, + "learning_rate": 8.449116620695118e-08, + "loss": 0.75791919, + "num_input_tokens_seen": 326490480, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 15134, + "time_per_iteration": 2.5147948265075684 + }, + { + "auxiliary_loss_clip": 0.01104628, + "auxiliary_loss_mlp": 0.01032392, + "balance_loss_clip": 1.02057791, + "balance_loss_mlp": 1.03571355, + "epoch": 0.9099654291297159, + "flos": 24347865490560.0, + "grad_norm": 1.4223380746982386, + "language_loss": 0.72740394, + "learning_rate": 8.437919827761786e-08, + "loss": 0.74877417, + "num_input_tokens_seen": 326509445, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 15135, + "time_per_iteration": 2.4703848361968994 + }, + { + "auxiliary_loss_clip": 0.01098783, + "auxiliary_loss_mlp": 0.01030997, + "balance_loss_clip": 1.01942205, + "balance_loss_mlp": 1.03540444, + "epoch": 0.9100255523823839, + "flos": 21215234609280.0, + "grad_norm": 1.7021694614162164, + "language_loss": 0.70180988, + "learning_rate": 8.426730298881702e-08, + "loss": 0.72310776, + "num_input_tokens_seen": 326528380, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 15136, + "time_per_iteration": 2.487475872039795 + }, + { + "auxiliary_loss_clip": 0.01021711, + "auxiliary_loss_mlp": 0.01001642, + "balance_loss_clip": 1.00064075, + "balance_loss_mlp": 1.00169659, + "epoch": 0.9100856756350518, + "flos": 46052276446080.0, + "grad_norm": 0.8227515384333137, + "language_loss": 0.59297395, + "learning_rate": 8.415548034479214e-08, + "loss": 0.61320746, + "num_input_tokens_seen": 326576940, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 15137, + "time_per_iteration": 4.198936700820923 + }, + { + "auxiliary_loss_clip": 0.01100339, + "auxiliary_loss_mlp": 0.010322, + "balance_loss_clip": 1.02104831, + "balance_loss_mlp": 1.03473079, + "epoch": 0.9101457988877198, + "flos": 20229917656320.0, + "grad_norm": 1.6400889092436695, + "language_loss": 0.82225323, + "learning_rate": 8.40437303497834e-08, + "loss": 0.84357858, + "num_input_tokens_seen": 326596100, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15138, + "time_per_iteration": 3.895996570587158 + }, + { + "auxiliary_loss_clip": 0.01095947, + "auxiliary_loss_mlp": 0.01024499, + "balance_loss_clip": 1.01367462, + "balance_loss_mlp": 1.03384078, + "epoch": 0.9102059221403878, + "flos": 26615157822720.0, + "grad_norm": 1.430309258083403, + "language_loss": 0.81232274, + "learning_rate": 8.39320530080283e-08, + "loss": 0.83352715, + "num_input_tokens_seen": 326615700, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62109375, + "step": 15139, + "time_per_iteration": 3.915422201156616 + }, + { + "auxiliary_loss_clip": 0.01099972, + "auxiliary_loss_mlp": 0.01029152, + "balance_loss_clip": 1.01819634, + "balance_loss_mlp": 1.0353173, + "epoch": 0.9102660453930558, + "flos": 21908561904000.0, + "grad_norm": 1.715308850913459, + "language_loss": 0.77420986, + "learning_rate": 8.382044832376167e-08, + "loss": 0.79550105, + "num_input_tokens_seen": 326635905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 15140, + "time_per_iteration": 2.482774019241333 + }, + { + "auxiliary_loss_clip": 0.01098266, + "auxiliary_loss_mlp": 0.0102809, + "balance_loss_clip": 1.01683044, + "balance_loss_mlp": 1.03335404, + "epoch": 0.9103261686457237, + "flos": 36176660916480.0, + "grad_norm": 1.5748509366300032, + "language_loss": 0.66406751, + "learning_rate": 8.370891630121569e-08, + "loss": 0.68533105, + "num_input_tokens_seen": 326661855, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 15141, + "time_per_iteration": 2.627438545227051 + }, + { + "auxiliary_loss_clip": 0.01100489, + "auxiliary_loss_mlp": 0.01034279, + "balance_loss_clip": 1.02277482, + "balance_loss_mlp": 1.03375959, + "epoch": 0.9103862918983917, + "flos": 23878549365120.0, + "grad_norm": 1.8835074175782365, + "language_loss": 0.74966937, + "learning_rate": 8.359745694462005e-08, + "loss": 0.77101701, + "num_input_tokens_seen": 326679320, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 15142, + "time_per_iteration": 2.4914710521698 + }, + { + "auxiliary_loss_clip": 0.01097437, + "auxiliary_loss_mlp": 0.01031506, + "balance_loss_clip": 1.02074134, + "balance_loss_mlp": 1.03283298, + "epoch": 0.9104464151510596, + "flos": 14939521989120.0, + "grad_norm": 1.5812700495772496, + "language_loss": 0.64177603, + "learning_rate": 8.348607025820076e-08, + "loss": 0.66306543, + "num_input_tokens_seen": 326698110, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 15143, + "time_per_iteration": 3.8821182250976562 + }, + { + "auxiliary_loss_clip": 0.01099566, + "auxiliary_loss_mlp": 0.01032309, + "balance_loss_clip": 1.02014375, + "balance_loss_mlp": 1.03280878, + "epoch": 0.9105065384037276, + "flos": 33655803500160.0, + "grad_norm": 2.535789613284141, + "language_loss": 0.61168027, + "learning_rate": 8.337475624618152e-08, + "loss": 0.63299894, + "num_input_tokens_seen": 326718370, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 15144, + "time_per_iteration": 2.569805145263672 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01023873, + "balance_loss_clip": 1.0129174, + "balance_loss_mlp": 1.03423166, + "epoch": 0.9105666616563957, + "flos": 24316695463680.0, + "grad_norm": 1.5502656726328978, + "language_loss": 0.71112603, + "learning_rate": 8.326351491278382e-08, + "loss": 0.7323277, + "num_input_tokens_seen": 326738445, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62109375, + "step": 15145, + "time_per_iteration": 2.4523370265960693 + }, + { + "auxiliary_loss_clip": 0.01095165, + "auxiliary_loss_mlp": 0.01029959, + "balance_loss_clip": 1.01897335, + "balance_loss_mlp": 1.03254509, + "epoch": 0.9106267849090636, + "flos": 29971692132480.0, + "grad_norm": 2.6249529159615514, + "language_loss": 0.70575893, + "learning_rate": 8.315234626222545e-08, + "loss": 0.72701019, + "num_input_tokens_seen": 326758855, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 15146, + "time_per_iteration": 2.532625436782837 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.01028691, + "balance_loss_clip": 1.01795065, + "balance_loss_mlp": 1.03296185, + "epoch": 0.9106869081617316, + "flos": 25337743470720.0, + "grad_norm": 2.0360232738172788, + "language_loss": 0.72551036, + "learning_rate": 8.304125029872233e-08, + "loss": 0.74677646, + "num_input_tokens_seen": 326777140, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 15147, + "time_per_iteration": 2.4464521408081055 + }, + { + "auxiliary_loss_clip": 0.0110162, + "auxiliary_loss_mlp": 0.01025608, + "balance_loss_clip": 1.01392555, + "balance_loss_mlp": 1.03255963, + "epoch": 0.9107470314143995, + "flos": 18187031543040.0, + "grad_norm": 2.0097623075783235, + "language_loss": 0.80071878, + "learning_rate": 8.293022702648711e-08, + "loss": 0.82199109, + "num_input_tokens_seen": 326794070, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.69140625, + "step": 15148, + "time_per_iteration": 2.459246873855591 + }, + { + "auxiliary_loss_clip": 0.01099453, + "auxiliary_loss_mlp": 0.01032166, + "balance_loss_clip": 1.02068663, + "balance_loss_mlp": 1.03310466, + "epoch": 0.9108071546670675, + "flos": 23550828652800.0, + "grad_norm": 2.144518085707252, + "language_loss": 0.68096125, + "learning_rate": 8.281927644972996e-08, + "loss": 0.70227742, + "num_input_tokens_seen": 326814695, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15149, + "time_per_iteration": 2.4559459686279297 + }, + { + "auxiliary_loss_clip": 0.01100315, + "auxiliary_loss_mlp": 0.01025911, + "balance_loss_clip": 1.0136447, + "balance_loss_mlp": 1.03507054, + "epoch": 0.9108672779197354, + "flos": 25630307746560.0, + "grad_norm": 1.4816254135406823, + "language_loss": 0.63344759, + "learning_rate": 8.270839857265776e-08, + "loss": 0.65470982, + "num_input_tokens_seen": 326835295, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.65234375, + "step": 15150, + "time_per_iteration": 2.5041751861572266 + }, + { + "auxiliary_loss_clip": 0.01097831, + "auxiliary_loss_mlp": 0.01031854, + "balance_loss_clip": 1.02024317, + "balance_loss_mlp": 1.03323102, + "epoch": 0.9109274011724035, + "flos": 22339094319360.0, + "grad_norm": 2.0033802871305166, + "language_loss": 0.72777343, + "learning_rate": 8.259759339947514e-08, + "loss": 0.74907029, + "num_input_tokens_seen": 326853350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15151, + "time_per_iteration": 2.4370014667510986 + }, + { + "auxiliary_loss_clip": 0.01099185, + "auxiliary_loss_mlp": 0.01026953, + "balance_loss_clip": 1.01581335, + "balance_loss_mlp": 1.03446126, + "epoch": 0.9109875244250714, + "flos": 26688200129280.0, + "grad_norm": 1.71458072737329, + "language_loss": 0.64443874, + "learning_rate": 8.248686093438429e-08, + "loss": 0.66570008, + "num_input_tokens_seen": 326873425, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15152, + "time_per_iteration": 2.502570867538452 + }, + { + "auxiliary_loss_clip": 0.01100177, + "auxiliary_loss_mlp": 0.01025715, + "balance_loss_clip": 1.01365113, + "balance_loss_mlp": 1.03537905, + "epoch": 0.9110476476777394, + "flos": 22930112701440.0, + "grad_norm": 2.146338606112044, + "language_loss": 0.73740828, + "learning_rate": 8.23762011815834e-08, + "loss": 0.75866711, + "num_input_tokens_seen": 326893455, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 15153, + "time_per_iteration": 2.4459383487701416 + }, + { + "auxiliary_loss_clip": 0.01101084, + "auxiliary_loss_mlp": 0.01029699, + "balance_loss_clip": 1.01843357, + "balance_loss_mlp": 1.03521991, + "epoch": 0.9111077709304073, + "flos": 13472857854720.0, + "grad_norm": 2.0173388878771843, + "language_loss": 0.72387171, + "learning_rate": 8.226561414526956e-08, + "loss": 0.74517953, + "num_input_tokens_seen": 326910210, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 15154, + "time_per_iteration": 2.4318478107452393 + }, + { + "auxiliary_loss_clip": 0.01099774, + "auxiliary_loss_mlp": 0.01029884, + "balance_loss_clip": 1.01911378, + "balance_loss_mlp": 1.03551435, + "epoch": 0.9111678941830753, + "flos": 20850561780480.0, + "grad_norm": 1.8402378708668206, + "language_loss": 0.81793249, + "learning_rate": 8.215509982963564e-08, + "loss": 0.83922905, + "num_input_tokens_seen": 326929350, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 15155, + "time_per_iteration": 2.4388551712036133 + }, + { + "auxiliary_loss_clip": 0.01100349, + "auxiliary_loss_mlp": 0.01027255, + "balance_loss_clip": 1.01544142, + "balance_loss_mlp": 1.03586698, + "epoch": 0.9112280174357432, + "flos": 19682244011520.0, + "grad_norm": 1.9446654713902813, + "language_loss": 0.5985598, + "learning_rate": 8.204465823887252e-08, + "loss": 0.61983585, + "num_input_tokens_seen": 326949060, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 15156, + "time_per_iteration": 2.4715144634246826 + }, + { + "auxiliary_loss_clip": 0.01101793, + "auxiliary_loss_mlp": 0.01028719, + "balance_loss_clip": 1.01614261, + "balance_loss_mlp": 1.03311276, + "epoch": 0.9112881406884112, + "flos": 25447163276160.0, + "grad_norm": 1.8459673274861486, + "language_loss": 0.73944056, + "learning_rate": 8.193428937716796e-08, + "loss": 0.76074564, + "num_input_tokens_seen": 326968950, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6875, + "step": 15157, + "time_per_iteration": 2.477900505065918 + }, + { + "auxiliary_loss_clip": 0.01098535, + "auxiliary_loss_mlp": 0.01028969, + "balance_loss_clip": 1.01842475, + "balance_loss_mlp": 1.03296149, + "epoch": 0.9113482639410793, + "flos": 33066975847680.0, + "grad_norm": 1.9741032786689436, + "language_loss": 0.59582591, + "learning_rate": 8.182399324870747e-08, + "loss": 0.61710095, + "num_input_tokens_seen": 326989455, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 15158, + "time_per_iteration": 2.5578408241271973 + }, + { + "auxiliary_loss_clip": 0.01099182, + "auxiliary_loss_mlp": 0.01033219, + "balance_loss_clip": 1.02256203, + "balance_loss_mlp": 1.0345186, + "epoch": 0.9114083871937472, + "flos": 21835591424640.0, + "grad_norm": 1.5097525180597062, + "language_loss": 0.67755049, + "learning_rate": 8.171376985767375e-08, + "loss": 0.69887447, + "num_input_tokens_seen": 327009640, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 15159, + "time_per_iteration": 2.452134132385254 + }, + { + "auxiliary_loss_clip": 0.01100265, + "auxiliary_loss_mlp": 0.01027178, + "balance_loss_clip": 1.01585364, + "balance_loss_mlp": 1.03467369, + "epoch": 0.9114685104464152, + "flos": 27088999061760.0, + "grad_norm": 1.8061769242302645, + "language_loss": 0.7828775, + "learning_rate": 8.160361920824588e-08, + "loss": 0.80415201, + "num_input_tokens_seen": 327027690, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15160, + "time_per_iteration": 2.4899487495422363 + }, + { + "auxiliary_loss_clip": 0.01101577, + "auxiliary_loss_mlp": 0.01028261, + "balance_loss_clip": 1.01589894, + "balance_loss_mlp": 1.03610229, + "epoch": 0.9115286336990831, + "flos": 17967042696960.0, + "grad_norm": 1.6956442783060977, + "language_loss": 0.69036943, + "learning_rate": 8.149354130460073e-08, + "loss": 0.71166778, + "num_input_tokens_seen": 327045915, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65625, + "step": 15161, + "time_per_iteration": 2.3957245349884033 + }, + { + "auxiliary_loss_clip": 0.01101547, + "auxiliary_loss_mlp": 0.01031821, + "balance_loss_clip": 1.01952457, + "balance_loss_mlp": 1.03551626, + "epoch": 0.9115887569517511, + "flos": 22929861306240.0, + "grad_norm": 1.6456282079841216, + "language_loss": 0.76603878, + "learning_rate": 8.138353615091321e-08, + "loss": 0.78737247, + "num_input_tokens_seen": 327066355, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 15162, + "time_per_iteration": 2.513727903366089 + }, + { + "auxiliary_loss_clip": 0.01099317, + "auxiliary_loss_mlp": 0.01030473, + "balance_loss_clip": 1.01908267, + "balance_loss_mlp": 1.03456116, + "epoch": 0.911648880204419, + "flos": 23988436047360.0, + "grad_norm": 1.8294416135843556, + "language_loss": 0.66720057, + "learning_rate": 8.127360375135395e-08, + "loss": 0.6884985, + "num_input_tokens_seen": 327086735, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15163, + "time_per_iteration": 2.4560275077819824 + }, + { + "auxiliary_loss_clip": 0.01103819, + "auxiliary_loss_mlp": 0.01033242, + "balance_loss_clip": 1.02094579, + "balance_loss_mlp": 1.03538442, + "epoch": 0.911709003457087, + "flos": 17055306754560.0, + "grad_norm": 2.184740599613921, + "language_loss": 0.70217323, + "learning_rate": 8.116374411009186e-08, + "loss": 0.72354388, + "num_input_tokens_seen": 327104035, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 15164, + "time_per_iteration": 2.4589922428131104 + }, + { + "auxiliary_loss_clip": 0.01098923, + "auxiliary_loss_mlp": 0.01031914, + "balance_loss_clip": 1.0209229, + "balance_loss_mlp": 1.03687394, + "epoch": 0.911769126709755, + "flos": 21653344794240.0, + "grad_norm": 1.4332025157922594, + "language_loss": 0.75946969, + "learning_rate": 8.105395723129315e-08, + "loss": 0.78077805, + "num_input_tokens_seen": 327124370, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62109375, + "step": 15165, + "time_per_iteration": 2.459932565689087 + }, + { + "auxiliary_loss_clip": 0.01101128, + "auxiliary_loss_mlp": 0.01030795, + "balance_loss_clip": 1.01934457, + "balance_loss_mlp": 1.03483164, + "epoch": 0.911829249962423, + "flos": 24790321221120.0, + "grad_norm": 2.577228898134307, + "language_loss": 0.72376269, + "learning_rate": 8.094424311912074e-08, + "loss": 0.7450819, + "num_input_tokens_seen": 327140915, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15166, + "time_per_iteration": 2.5008912086486816 + }, + { + "auxiliary_loss_clip": 0.01100156, + "auxiliary_loss_mlp": 0.01033853, + "balance_loss_clip": 1.02166939, + "balance_loss_mlp": 1.03380466, + "epoch": 0.9118893732150909, + "flos": 20959406968320.0, + "grad_norm": 1.790469764052118, + "language_loss": 0.72797149, + "learning_rate": 8.083460177773482e-08, + "loss": 0.74931157, + "num_input_tokens_seen": 327158940, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 15167, + "time_per_iteration": 2.426950216293335 + }, + { + "auxiliary_loss_clip": 0.01022043, + "auxiliary_loss_mlp": 0.01000197, + "balance_loss_clip": 0.9992674, + "balance_loss_mlp": 1.00187171, + "epoch": 0.9119494964677589, + "flos": 67917385872000.0, + "grad_norm": 0.7700712875072107, + "language_loss": 0.65548205, + "learning_rate": 8.072503321129298e-08, + "loss": 0.67570436, + "num_input_tokens_seen": 327217450, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20214844, + "step": 15168, + "time_per_iteration": 3.0059800148010254 + }, + { + "auxiliary_loss_clip": 0.01097014, + "auxiliary_loss_mlp": 0.01027417, + "balance_loss_clip": 1.0165627, + "balance_loss_mlp": 1.03288174, + "epoch": 0.9120096197204268, + "flos": 18551524803840.0, + "grad_norm": 1.934423390877551, + "language_loss": 0.7840631, + "learning_rate": 8.061553742395033e-08, + "loss": 0.80530739, + "num_input_tokens_seen": 327233905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 15169, + "time_per_iteration": 2.3854548931121826 + }, + { + "auxiliary_loss_clip": 0.01098796, + "auxiliary_loss_mlp": 0.01029198, + "balance_loss_clip": 1.01809335, + "balance_loss_mlp": 1.03389323, + "epoch": 0.9120697429730948, + "flos": 19025725178880.0, + "grad_norm": 1.8822447318945712, + "language_loss": 0.8215884, + "learning_rate": 8.05061144198591e-08, + "loss": 0.84286833, + "num_input_tokens_seen": 327252430, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15170, + "time_per_iteration": 2.4406228065490723 + }, + { + "auxiliary_loss_clip": 0.01101631, + "auxiliary_loss_mlp": 0.01031893, + "balance_loss_clip": 1.01971602, + "balance_loss_mlp": 1.03597105, + "epoch": 0.9121298662257629, + "flos": 17163685065600.0, + "grad_norm": 2.184092599163872, + "language_loss": 0.77514195, + "learning_rate": 8.039676420316799e-08, + "loss": 0.79647714, + "num_input_tokens_seen": 327269215, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 15171, + "time_per_iteration": 2.3917133808135986 + }, + { + "auxiliary_loss_clip": 0.01096383, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.0195179, + "balance_loss_mlp": 1.03134727, + "epoch": 0.9121899894784308, + "flos": 19682710888320.0, + "grad_norm": 1.5433702960401063, + "language_loss": 0.66869926, + "learning_rate": 8.02874867780241e-08, + "loss": 0.68997288, + "num_input_tokens_seen": 327290320, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15172, + "time_per_iteration": 2.472642421722412 + }, + { + "auxiliary_loss_clip": 0.01101924, + "auxiliary_loss_mlp": 0.01030191, + "balance_loss_clip": 1.01863933, + "balance_loss_mlp": 1.0358417, + "epoch": 0.9122501127310988, + "flos": 22235743912320.0, + "grad_norm": 1.6574136999814857, + "language_loss": 0.75031823, + "learning_rate": 8.017828214857103e-08, + "loss": 0.77163935, + "num_input_tokens_seen": 327310150, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15173, + "time_per_iteration": 2.4400486946105957 + }, + { + "auxiliary_loss_clip": 0.01106005, + "auxiliary_loss_mlp": 0.01033432, + "balance_loss_clip": 1.01966953, + "balance_loss_mlp": 1.03647041, + "epoch": 0.9123102359837667, + "flos": 15957122290560.0, + "grad_norm": 2.201313503616394, + "language_loss": 0.65935463, + "learning_rate": 8.00691503189499e-08, + "loss": 0.680749, + "num_input_tokens_seen": 327326660, + "router_z_loss_clip": 0.13769531, + "router_z_loss_mlp": 0.6953125, + "step": 15174, + "time_per_iteration": 2.4405062198638916 + }, + { + "auxiliary_loss_clip": 0.01101949, + "auxiliary_loss_mlp": 0.01032545, + "balance_loss_clip": 1.019593, + "balance_loss_mlp": 1.03539705, + "epoch": 0.9123703592364347, + "flos": 25155784149120.0, + "grad_norm": 1.786987037941784, + "language_loss": 0.74865186, + "learning_rate": 7.996009129329894e-08, + "loss": 0.76999688, + "num_input_tokens_seen": 327346700, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 15175, + "time_per_iteration": 2.5217480659484863 + }, + { + "auxiliary_loss_clip": 0.01021805, + "auxiliary_loss_mlp": 0.01000925, + "balance_loss_clip": 0.99997121, + "balance_loss_mlp": 1.00181603, + "epoch": 0.9124304824891026, + "flos": 60801650812800.0, + "grad_norm": 0.9628700747874241, + "language_loss": 0.58435005, + "learning_rate": 7.985110507575421e-08, + "loss": 0.6045773, + "num_input_tokens_seen": 327403050, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 15176, + "time_per_iteration": 3.0988872051239014 + }, + { + "auxiliary_loss_clip": 0.01099776, + "auxiliary_loss_mlp": 0.01032073, + "balance_loss_clip": 1.02064109, + "balance_loss_mlp": 1.03451729, + "epoch": 0.9124906057417707, + "flos": 18150941352960.0, + "grad_norm": 1.8294542789280668, + "language_loss": 0.65551788, + "learning_rate": 7.97421916704475e-08, + "loss": 0.67683637, + "num_input_tokens_seen": 327422225, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15177, + "time_per_iteration": 2.4310319423675537 + }, + { + "auxiliary_loss_clip": 0.01098513, + "auxiliary_loss_mlp": 0.01026355, + "balance_loss_clip": 1.01526892, + "balance_loss_mlp": 1.0335021, + "epoch": 0.9125507289944386, + "flos": 11686769049600.0, + "grad_norm": 1.8886919295946252, + "language_loss": 0.81066203, + "learning_rate": 7.963335108150926e-08, + "loss": 0.83191073, + "num_input_tokens_seen": 327437025, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15178, + "time_per_iteration": 3.8186910152435303 + }, + { + "auxiliary_loss_clip": 0.01099546, + "auxiliary_loss_mlp": 0.01027725, + "balance_loss_clip": 1.01648307, + "balance_loss_mlp": 1.03465581, + "epoch": 0.9126108522471066, + "flos": 17748813617280.0, + "grad_norm": 2.0463085825275034, + "language_loss": 0.78655928, + "learning_rate": 7.952458331306711e-08, + "loss": 0.807832, + "num_input_tokens_seen": 327453915, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15179, + "time_per_iteration": 2.4364078044891357 + }, + { + "auxiliary_loss_clip": 0.01097377, + "auxiliary_loss_mlp": 0.01030818, + "balance_loss_clip": 1.0196358, + "balance_loss_mlp": 1.03346872, + "epoch": 0.9126709754997745, + "flos": 27635738952960.0, + "grad_norm": 2.3074557537975626, + "language_loss": 0.68364185, + "learning_rate": 7.941588836924507e-08, + "loss": 0.70492381, + "num_input_tokens_seen": 327474415, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15180, + "time_per_iteration": 3.9669907093048096 + }, + { + "auxiliary_loss_clip": 0.01096034, + "auxiliary_loss_mlp": 0.01026412, + "balance_loss_clip": 1.01565289, + "balance_loss_mlp": 1.03221154, + "epoch": 0.9127310987524425, + "flos": 15924982596480.0, + "grad_norm": 1.6615442827017741, + "language_loss": 0.75214398, + "learning_rate": 7.930726625416495e-08, + "loss": 0.77336842, + "num_input_tokens_seen": 327492750, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15181, + "time_per_iteration": 3.870901584625244 + }, + { + "auxiliary_loss_clip": 0.01103971, + "auxiliary_loss_mlp": 0.01030083, + "balance_loss_clip": 1.01850748, + "balance_loss_mlp": 1.03598988, + "epoch": 0.9127912220051104, + "flos": 21536885923200.0, + "grad_norm": 2.297286307607851, + "language_loss": 0.74843061, + "learning_rate": 7.919871697194614e-08, + "loss": 0.7697711, + "num_input_tokens_seen": 327509470, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 15182, + "time_per_iteration": 2.4776079654693604 + }, + { + "auxiliary_loss_clip": 0.01100627, + "auxiliary_loss_mlp": 0.01029232, + "balance_loss_clip": 1.01718605, + "balance_loss_mlp": 1.03372836, + "epoch": 0.9128513452577784, + "flos": 24063561342720.0, + "grad_norm": 3.985980312543223, + "language_loss": 0.76413208, + "learning_rate": 7.909024052670421e-08, + "loss": 0.78543067, + "num_input_tokens_seen": 327530520, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 15183, + "time_per_iteration": 2.468374729156494 + }, + { + "auxiliary_loss_clip": 0.01102788, + "auxiliary_loss_mlp": 0.01030705, + "balance_loss_clip": 1.01908207, + "balance_loss_mlp": 1.03510523, + "epoch": 0.9129114685104465, + "flos": 16216469464320.0, + "grad_norm": 2.281455397208263, + "language_loss": 0.76592457, + "learning_rate": 7.898183692255256e-08, + "loss": 0.78725952, + "num_input_tokens_seen": 327546960, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6796875, + "step": 15184, + "time_per_iteration": 2.421410322189331 + }, + { + "auxiliary_loss_clip": 0.01102745, + "auxiliary_loss_mlp": 0.01032849, + "balance_loss_clip": 1.02141702, + "balance_loss_mlp": 1.03666544, + "epoch": 0.9129715917631144, + "flos": 19384364522880.0, + "grad_norm": 1.638011852931889, + "language_loss": 0.74281073, + "learning_rate": 7.887350616360233e-08, + "loss": 0.76416671, + "num_input_tokens_seen": 327564830, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15185, + "time_per_iteration": 3.940502166748047 + }, + { + "auxiliary_loss_clip": 0.01098799, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.01797581, + "balance_loss_mlp": 1.03400433, + "epoch": 0.9130317150157824, + "flos": 20590460421120.0, + "grad_norm": 2.0531657489206188, + "language_loss": 0.68440223, + "learning_rate": 7.876524825396158e-08, + "loss": 0.70568401, + "num_input_tokens_seen": 327583675, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15186, + "time_per_iteration": 2.4343931674957275 + }, + { + "auxiliary_loss_clip": 0.01108195, + "auxiliary_loss_mlp": 0.0103352, + "balance_loss_clip": 1.02074695, + "balance_loss_mlp": 1.03696275, + "epoch": 0.9130918382684503, + "flos": 20189230525440.0, + "grad_norm": 2.6512076231674806, + "language_loss": 0.77220356, + "learning_rate": 7.865706319773502e-08, + "loss": 0.79362077, + "num_input_tokens_seen": 327602280, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.7109375, + "step": 15187, + "time_per_iteration": 2.438368558883667 + }, + { + "auxiliary_loss_clip": 0.01098925, + "auxiliary_loss_mlp": 0.01029611, + "balance_loss_clip": 1.01903725, + "balance_loss_mlp": 1.03280544, + "epoch": 0.9131519615211183, + "flos": 25556870390400.0, + "grad_norm": 2.0471267389391024, + "language_loss": 0.66164011, + "learning_rate": 7.854895099902515e-08, + "loss": 0.68292546, + "num_input_tokens_seen": 327623515, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.66015625, + "step": 15188, + "time_per_iteration": 2.484286069869995 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.0103036, + "balance_loss_clip": 1.01933336, + "balance_loss_mlp": 1.03323078, + "epoch": 0.9132120847737862, + "flos": 17931563038080.0, + "grad_norm": 2.3139671403974824, + "language_loss": 0.76142931, + "learning_rate": 7.844091166193157e-08, + "loss": 0.78270352, + "num_input_tokens_seen": 327642875, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 15189, + "time_per_iteration": 2.4128341674804688 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01028075, + "balance_loss_clip": 1.01762676, + "balance_loss_mlp": 1.03456092, + "epoch": 0.9132722080264543, + "flos": 20047635112320.0, + "grad_norm": 1.6090293232992543, + "language_loss": 0.75407052, + "learning_rate": 7.8332945190551e-08, + "loss": 0.77533972, + "num_input_tokens_seen": 327662450, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15190, + "time_per_iteration": 2.435072660446167 + }, + { + "auxiliary_loss_clip": 0.01021871, + "auxiliary_loss_mlp": 0.01001637, + "balance_loss_clip": 1.00067699, + "balance_loss_mlp": 1.00190675, + "epoch": 0.9133323312791222, + "flos": 70439967141120.0, + "grad_norm": 0.7094888016430416, + "language_loss": 0.57359248, + "learning_rate": 7.822505158897797e-08, + "loss": 0.59382761, + "num_input_tokens_seen": 327723845, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.19921875, + "step": 15191, + "time_per_iteration": 3.087395429611206 + }, + { + "auxiliary_loss_clip": 0.01102347, + "auxiliary_loss_mlp": 0.01034605, + "balance_loss_clip": 1.02263618, + "balance_loss_mlp": 1.03504705, + "epoch": 0.9133924545317902, + "flos": 25483792170240.0, + "grad_norm": 1.6777146532024645, + "language_loss": 0.73936659, + "learning_rate": 7.81172308613034e-08, + "loss": 0.76073611, + "num_input_tokens_seen": 327742590, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15192, + "time_per_iteration": 2.4690206050872803 + }, + { + "auxiliary_loss_clip": 0.01099289, + "auxiliary_loss_mlp": 0.01024504, + "balance_loss_clip": 1.0133816, + "balance_loss_mlp": 1.03559685, + "epoch": 0.9134525777844581, + "flos": 39930690107520.0, + "grad_norm": 1.6421376517617297, + "language_loss": 0.69312721, + "learning_rate": 7.800948301161647e-08, + "loss": 0.71436512, + "num_input_tokens_seen": 327764350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 15193, + "time_per_iteration": 2.6223676204681396 + }, + { + "auxiliary_loss_clip": 0.01096991, + "auxiliary_loss_mlp": 0.01036528, + "balance_loss_clip": 1.02556682, + "balance_loss_mlp": 1.03420997, + "epoch": 0.9135127010371261, + "flos": 20886723797760.0, + "grad_norm": 1.485940174271116, + "language_loss": 0.73231626, + "learning_rate": 7.790180804400215e-08, + "loss": 0.7536515, + "num_input_tokens_seen": 327783120, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 15194, + "time_per_iteration": 2.463771343231201 + }, + { + "auxiliary_loss_clip": 0.01101991, + "auxiliary_loss_mlp": 0.01032601, + "balance_loss_clip": 1.01946473, + "balance_loss_mlp": 1.03339386, + "epoch": 0.913572824289794, + "flos": 20813250528000.0, + "grad_norm": 2.0304511645120455, + "language_loss": 0.61398089, + "learning_rate": 7.779420596254383e-08, + "loss": 0.63532686, + "num_input_tokens_seen": 327801960, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15195, + "time_per_iteration": 2.453814744949341 + }, + { + "auxiliary_loss_clip": 0.01099363, + "auxiliary_loss_mlp": 0.01030955, + "balance_loss_clip": 1.01902771, + "balance_loss_mlp": 1.0335803, + "epoch": 0.913632947542462, + "flos": 25703278225920.0, + "grad_norm": 1.5172036842114138, + "language_loss": 0.7131865, + "learning_rate": 7.768667677132201e-08, + "loss": 0.73448968, + "num_input_tokens_seen": 327823795, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15196, + "time_per_iteration": 2.4799444675445557 + }, + { + "auxiliary_loss_clip": 0.01100065, + "auxiliary_loss_mlp": 0.01029829, + "balance_loss_clip": 1.01855159, + "balance_loss_mlp": 1.03487432, + "epoch": 0.9136930707951301, + "flos": 26286216048000.0, + "grad_norm": 1.4441541156938638, + "language_loss": 0.71125305, + "learning_rate": 7.757922047441411e-08, + "loss": 0.73255193, + "num_input_tokens_seen": 327845175, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 15197, + "time_per_iteration": 2.5215611457824707 + }, + { + "auxiliary_loss_clip": 0.01100431, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.01447809, + "balance_loss_mlp": 1.03330481, + "epoch": 0.913753194047798, + "flos": 22091885942400.0, + "grad_norm": 1.889161439888729, + "language_loss": 0.77785528, + "learning_rate": 7.747183707589489e-08, + "loss": 0.79912317, + "num_input_tokens_seen": 327863150, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15198, + "time_per_iteration": 2.442277431488037 + }, + { + "auxiliary_loss_clip": 0.01096101, + "auxiliary_loss_mlp": 0.01029423, + "balance_loss_clip": 1.01853919, + "balance_loss_mlp": 1.03312182, + "epoch": 0.913813317300466, + "flos": 23587206151680.0, + "grad_norm": 1.3365328248407828, + "language_loss": 0.6804055, + "learning_rate": 7.736452657983616e-08, + "loss": 0.70166075, + "num_input_tokens_seen": 327883445, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 15199, + "time_per_iteration": 2.4880144596099854 + }, + { + "auxiliary_loss_clip": 0.01100948, + "auxiliary_loss_mlp": 0.01033065, + "balance_loss_clip": 1.02183509, + "balance_loss_mlp": 1.03505015, + "epoch": 0.9138734405531339, + "flos": 28876452583680.0, + "grad_norm": 1.6447593727576186, + "language_loss": 0.67633069, + "learning_rate": 7.725728899030714e-08, + "loss": 0.69767076, + "num_input_tokens_seen": 327905745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15200, + "time_per_iteration": 2.5086967945098877 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.0102929, + "balance_loss_clip": 1.01860261, + "balance_loss_mlp": 1.03541744, + "epoch": 0.9139335638058019, + "flos": 22821087945600.0, + "grad_norm": 1.5101416705919046, + "language_loss": 0.71488059, + "learning_rate": 7.715012431137435e-08, + "loss": 0.73615378, + "num_input_tokens_seen": 327925435, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 15201, + "time_per_iteration": 2.4748075008392334 + }, + { + "auxiliary_loss_clip": 0.01098308, + "auxiliary_loss_mlp": 0.01026384, + "balance_loss_clip": 1.01603663, + "balance_loss_mlp": 1.03388548, + "epoch": 0.9139936870584698, + "flos": 18004174381440.0, + "grad_norm": 1.6982723466196472, + "language_loss": 0.70671141, + "learning_rate": 7.704303254710165e-08, + "loss": 0.72795826, + "num_input_tokens_seen": 327944145, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.64453125, + "step": 15202, + "time_per_iteration": 2.438340902328491 + }, + { + "auxiliary_loss_clip": 0.01099499, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.01694536, + "balance_loss_mlp": 1.033728, + "epoch": 0.9140538103111379, + "flos": 15813767111040.0, + "grad_norm": 1.8389143289614247, + "language_loss": 0.66278571, + "learning_rate": 7.693601370155001e-08, + "loss": 0.68406761, + "num_input_tokens_seen": 327960565, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15203, + "time_per_iteration": 2.479570150375366 + }, + { + "auxiliary_loss_clip": 0.01101513, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.01884818, + "balance_loss_mlp": 1.03622997, + "epoch": 0.9141139335638058, + "flos": 23987035416960.0, + "grad_norm": 1.6116143834320078, + "language_loss": 0.68694603, + "learning_rate": 7.682906777877751e-08, + "loss": 0.70827323, + "num_input_tokens_seen": 327981180, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.65234375, + "step": 15204, + "time_per_iteration": 2.4571127891540527 + }, + { + "auxiliary_loss_clip": 0.01097969, + "auxiliary_loss_mlp": 0.0102718, + "balance_loss_clip": 1.01541436, + "balance_loss_mlp": 1.03155589, + "epoch": 0.9141740568164738, + "flos": 24024418496640.0, + "grad_norm": 3.922644674867668, + "language_loss": 0.59307611, + "learning_rate": 7.672219478283915e-08, + "loss": 0.61432767, + "num_input_tokens_seen": 328001500, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 15205, + "time_per_iteration": 2.4621520042419434 + }, + { + "auxiliary_loss_clip": 0.0109613, + "auxiliary_loss_mlp": 0.01032774, + "balance_loss_clip": 1.02129984, + "balance_loss_mlp": 1.03344274, + "epoch": 0.9142341800691417, + "flos": 27018291139200.0, + "grad_norm": 1.8519620978555191, + "language_loss": 0.81337631, + "learning_rate": 7.661539471778811e-08, + "loss": 0.8346653, + "num_input_tokens_seen": 328023025, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.625, + "step": 15206, + "time_per_iteration": 2.4877803325653076 + }, + { + "auxiliary_loss_clip": 0.01098654, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.01667953, + "balance_loss_mlp": 1.03213692, + "epoch": 0.9142943033218097, + "flos": 20412487509120.0, + "grad_norm": 2.0513534201976173, + "language_loss": 0.73153603, + "learning_rate": 7.650866758767382e-08, + "loss": 0.75280422, + "num_input_tokens_seen": 328041410, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 15207, + "time_per_iteration": 2.4546096324920654 + }, + { + "auxiliary_loss_clip": 0.01099504, + "auxiliary_loss_mlp": 0.01033304, + "balance_loss_clip": 1.0210495, + "balance_loss_mlp": 1.03416693, + "epoch": 0.9143544265744776, + "flos": 19755322231680.0, + "grad_norm": 1.7864801619442867, + "language_loss": 0.72906077, + "learning_rate": 7.640201339654373e-08, + "loss": 0.75038886, + "num_input_tokens_seen": 328060495, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 15208, + "time_per_iteration": 2.4418113231658936 + }, + { + "auxiliary_loss_clip": 0.01098905, + "auxiliary_loss_mlp": 0.01027437, + "balance_loss_clip": 1.01648736, + "balance_loss_mlp": 1.03465152, + "epoch": 0.9144145498271457, + "flos": 17165444832000.0, + "grad_norm": 2.1695224454551423, + "language_loss": 0.86409903, + "learning_rate": 7.629543214844237e-08, + "loss": 0.88536251, + "num_input_tokens_seen": 328076905, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 15209, + "time_per_iteration": 2.413148880004883 + }, + { + "auxiliary_loss_clip": 0.01098935, + "auxiliary_loss_mlp": 0.01035579, + "balance_loss_clip": 1.0244627, + "balance_loss_mlp": 1.03434813, + "epoch": 0.9144746730798137, + "flos": 23726072131200.0, + "grad_norm": 1.735664955022414, + "language_loss": 0.75140452, + "learning_rate": 7.618892384741093e-08, + "loss": 0.77274966, + "num_input_tokens_seen": 328096960, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15210, + "time_per_iteration": 2.493086099624634 + }, + { + "auxiliary_loss_clip": 0.01097401, + "auxiliary_loss_mlp": 0.01030994, + "balance_loss_clip": 1.01927555, + "balance_loss_mlp": 1.0315125, + "epoch": 0.9145347963324816, + "flos": 25847854467840.0, + "grad_norm": 1.7521937388781827, + "language_loss": 0.77584058, + "learning_rate": 7.6082488497488e-08, + "loss": 0.79712451, + "num_input_tokens_seen": 328115445, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15211, + "time_per_iteration": 2.4678258895874023 + }, + { + "auxiliary_loss_clip": 0.01100975, + "auxiliary_loss_mlp": 0.01026726, + "balance_loss_clip": 1.01571679, + "balance_loss_mlp": 1.03529775, + "epoch": 0.9145949195851496, + "flos": 19242769109760.0, + "grad_norm": 1.715606657088832, + "language_loss": 0.82844532, + "learning_rate": 7.597612610270986e-08, + "loss": 0.84972233, + "num_input_tokens_seen": 328133965, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15212, + "time_per_iteration": 2.4359114170074463 + }, + { + "auxiliary_loss_clip": 0.01096761, + "auxiliary_loss_mlp": 0.01027305, + "balance_loss_clip": 1.01680315, + "balance_loss_mlp": 1.03358889, + "epoch": 0.9146550428378175, + "flos": 18296379521280.0, + "grad_norm": 1.7611506527071346, + "language_loss": 0.83891743, + "learning_rate": 7.586983666711022e-08, + "loss": 0.86015809, + "num_input_tokens_seen": 328151520, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6328125, + "step": 15213, + "time_per_iteration": 2.4092206954956055 + }, + { + "auxiliary_loss_clip": 0.01099693, + "auxiliary_loss_mlp": 0.01025121, + "balance_loss_clip": 1.01386786, + "balance_loss_mlp": 1.03436995, + "epoch": 0.9147151660904855, + "flos": 20084264006400.0, + "grad_norm": 2.085343861067481, + "language_loss": 0.70816439, + "learning_rate": 7.576362019471894e-08, + "loss": 0.72941256, + "num_input_tokens_seen": 328171275, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15214, + "time_per_iteration": 2.447380781173706 + }, + { + "auxiliary_loss_clip": 0.01103848, + "auxiliary_loss_mlp": 0.01033056, + "balance_loss_clip": 1.02051485, + "balance_loss_mlp": 1.03580141, + "epoch": 0.9147752893431534, + "flos": 24389127239040.0, + "grad_norm": 1.5390932185045392, + "language_loss": 0.62629873, + "learning_rate": 7.565747668956413e-08, + "loss": 0.64766777, + "num_input_tokens_seen": 328192115, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 15215, + "time_per_iteration": 2.461411476135254 + }, + { + "auxiliary_loss_clip": 0.01104348, + "auxiliary_loss_mlp": 0.01029469, + "balance_loss_clip": 1.01738763, + "balance_loss_mlp": 1.03553486, + "epoch": 0.9148354125958215, + "flos": 18150402648960.0, + "grad_norm": 2.721534331397324, + "language_loss": 0.75732076, + "learning_rate": 7.555140615567058e-08, + "loss": 0.77865899, + "num_input_tokens_seen": 328208990, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 15216, + "time_per_iteration": 2.523115873336792 + }, + { + "auxiliary_loss_clip": 0.01100136, + "auxiliary_loss_mlp": 0.01035305, + "balance_loss_clip": 1.02305031, + "balance_loss_mlp": 1.03539312, + "epoch": 0.9148955358484894, + "flos": 23367540528000.0, + "grad_norm": 2.4268000375661773, + "language_loss": 0.68142593, + "learning_rate": 7.544540859706062e-08, + "loss": 0.70278037, + "num_input_tokens_seen": 328227840, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 15217, + "time_per_iteration": 2.4448487758636475 + }, + { + "auxiliary_loss_clip": 0.01098239, + "auxiliary_loss_mlp": 0.0102959, + "balance_loss_clip": 1.01877725, + "balance_loss_mlp": 1.03458583, + "epoch": 0.9149556591011574, + "flos": 18076498416000.0, + "grad_norm": 1.8653431496405544, + "language_loss": 0.79877293, + "learning_rate": 7.533948401775347e-08, + "loss": 0.82005119, + "num_input_tokens_seen": 328246250, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15218, + "time_per_iteration": 2.434863567352295 + }, + { + "auxiliary_loss_clip": 0.01021776, + "auxiliary_loss_mlp": 0.01000225, + "balance_loss_clip": 0.99925387, + "balance_loss_mlp": 1.00182867, + "epoch": 0.9150157823538253, + "flos": 54586374825600.0, + "grad_norm": 0.8896822050183594, + "language_loss": 0.59232152, + "learning_rate": 7.523363242176595e-08, + "loss": 0.61254156, + "num_input_tokens_seen": 328303625, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 15219, + "time_per_iteration": 2.9880809783935547 + }, + { + "auxiliary_loss_clip": 0.01097663, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.01906729, + "balance_loss_mlp": 1.03414941, + "epoch": 0.9150759056064933, + "flos": 17893102550400.0, + "grad_norm": 1.812367414833016, + "language_loss": 0.78320539, + "learning_rate": 7.512785381311216e-08, + "loss": 0.80448759, + "num_input_tokens_seen": 328322135, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 15220, + "time_per_iteration": 3.818652391433716 + }, + { + "auxiliary_loss_clip": 0.01102017, + "auxiliary_loss_mlp": 0.0103102, + "balance_loss_clip": 1.01826453, + "balance_loss_mlp": 1.03302217, + "epoch": 0.9151360288591612, + "flos": 18073517587200.0, + "grad_norm": 1.8962710431659022, + "language_loss": 0.65642536, + "learning_rate": 7.50221481958031e-08, + "loss": 0.67775571, + "num_input_tokens_seen": 328340750, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 15221, + "time_per_iteration": 2.4236178398132324 + }, + { + "auxiliary_loss_clip": 0.01098425, + "auxiliary_loss_mlp": 0.01027849, + "balance_loss_clip": 1.01696539, + "balance_loss_mlp": 1.03305852, + "epoch": 0.9151961521118293, + "flos": 19354523299200.0, + "grad_norm": 1.6737011453646373, + "language_loss": 0.8425433, + "learning_rate": 7.491651557384692e-08, + "loss": 0.86380607, + "num_input_tokens_seen": 328359995, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15222, + "time_per_iteration": 3.9798471927642822 + }, + { + "auxiliary_loss_clip": 0.01022041, + "auxiliary_loss_mlp": 0.01001485, + "balance_loss_clip": 1.00054312, + "balance_loss_mlp": 1.00207162, + "epoch": 0.9152562753644973, + "flos": 72146621018880.0, + "grad_norm": 0.7306316562738401, + "language_loss": 0.49642789, + "learning_rate": 7.481095595124953e-08, + "loss": 0.51666313, + "num_input_tokens_seen": 328426865, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20019531, + "step": 15223, + "time_per_iteration": 4.616261720657349 + }, + { + "auxiliary_loss_clip": 0.01102367, + "auxiliary_loss_mlp": 0.01037801, + "balance_loss_clip": 1.02548087, + "balance_loss_mlp": 1.03582227, + "epoch": 0.9153163986171652, + "flos": 20777016683520.0, + "grad_norm": 1.708315664523414, + "language_loss": 0.7237857, + "learning_rate": 7.470546933201349e-08, + "loss": 0.7451874, + "num_input_tokens_seen": 328445970, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 15224, + "time_per_iteration": 2.4585115909576416 + }, + { + "auxiliary_loss_clip": 0.01097737, + "auxiliary_loss_mlp": 0.01025257, + "balance_loss_clip": 1.0136168, + "balance_loss_mlp": 1.03346014, + "epoch": 0.9153765218698332, + "flos": 23040107124480.0, + "grad_norm": 1.7724637972030735, + "language_loss": 0.81216443, + "learning_rate": 7.460005572013895e-08, + "loss": 0.83339441, + "num_input_tokens_seen": 328464585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15225, + "time_per_iteration": 2.4403467178344727 + }, + { + "auxiliary_loss_clip": 0.01097606, + "auxiliary_loss_mlp": 0.01025522, + "balance_loss_clip": 1.01470423, + "balance_loss_mlp": 1.03278244, + "epoch": 0.9154366451225011, + "flos": 28990900293120.0, + "grad_norm": 1.266236399456709, + "language_loss": 0.71322179, + "learning_rate": 7.44947151196238e-08, + "loss": 0.73445308, + "num_input_tokens_seen": 328490155, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15226, + "time_per_iteration": 2.541335105895996 + }, + { + "auxiliary_loss_clip": 0.01100546, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01837564, + "balance_loss_mlp": 1.03350449, + "epoch": 0.9154967683751691, + "flos": 22309504490880.0, + "grad_norm": 1.9966844593099904, + "language_loss": 0.74624139, + "learning_rate": 7.43894475344613e-08, + "loss": 0.76754689, + "num_input_tokens_seen": 328508275, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15227, + "time_per_iteration": 3.8971879482269287 + }, + { + "auxiliary_loss_clip": 0.01099091, + "auxiliary_loss_mlp": 0.01027789, + "balance_loss_clip": 1.01703024, + "balance_loss_mlp": 1.03465962, + "epoch": 0.915556891627837, + "flos": 24571481610240.0, + "grad_norm": 1.4148019926474746, + "language_loss": 0.73699552, + "learning_rate": 7.428425296864404e-08, + "loss": 0.75826436, + "num_input_tokens_seen": 328529425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 15228, + "time_per_iteration": 2.48069429397583 + }, + { + "auxiliary_loss_clip": 0.01095291, + "auxiliary_loss_mlp": 0.01031336, + "balance_loss_clip": 1.02024984, + "balance_loss_mlp": 1.03221262, + "epoch": 0.9156170148805051, + "flos": 22164676853760.0, + "grad_norm": 1.5305780770473314, + "language_loss": 0.71960795, + "learning_rate": 7.417913142616106e-08, + "loss": 0.74087429, + "num_input_tokens_seen": 328550200, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 15229, + "time_per_iteration": 2.444805145263672 + }, + { + "auxiliary_loss_clip": 0.011021, + "auxiliary_loss_mlp": 0.01032289, + "balance_loss_clip": 1.02018356, + "balance_loss_mlp": 1.03612995, + "epoch": 0.915677138133173, + "flos": 20920659171840.0, + "grad_norm": 2.0470204935534984, + "language_loss": 0.83144408, + "learning_rate": 7.407408291099848e-08, + "loss": 0.85278797, + "num_input_tokens_seen": 328568540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 15230, + "time_per_iteration": 2.432034730911255 + }, + { + "auxiliary_loss_clip": 0.01098842, + "auxiliary_loss_mlp": 0.01030044, + "balance_loss_clip": 1.01935065, + "balance_loss_mlp": 1.03477907, + "epoch": 0.915737261385841, + "flos": 24345136056960.0, + "grad_norm": 1.6269282697769034, + "language_loss": 0.83418006, + "learning_rate": 7.396910742713957e-08, + "loss": 0.85546893, + "num_input_tokens_seen": 328587300, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 15231, + "time_per_iteration": 2.515538215637207 + }, + { + "auxiliary_loss_clip": 0.01095817, + "auxiliary_loss_mlp": 0.01024393, + "balance_loss_clip": 1.01322293, + "balance_loss_mlp": 1.03172684, + "epoch": 0.9157973846385089, + "flos": 26761386090240.0, + "grad_norm": 1.829982250303586, + "language_loss": 0.72207046, + "learning_rate": 7.386420497856516e-08, + "loss": 0.74327254, + "num_input_tokens_seen": 328610055, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15232, + "time_per_iteration": 2.5557878017425537 + }, + { + "auxiliary_loss_clip": 0.01100605, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.01812696, + "balance_loss_mlp": 1.03404856, + "epoch": 0.9158575078911769, + "flos": 18478733892480.0, + "grad_norm": 4.73970403036583, + "language_loss": 0.67340308, + "learning_rate": 7.375937556925338e-08, + "loss": 0.69470394, + "num_input_tokens_seen": 328626815, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 15233, + "time_per_iteration": 2.4151556491851807 + }, + { + "auxiliary_loss_clip": 0.01101483, + "auxiliary_loss_mlp": 0.01029838, + "balance_loss_clip": 1.01797664, + "balance_loss_mlp": 1.03488588, + "epoch": 0.9159176311438448, + "flos": 21798926616960.0, + "grad_norm": 2.2571803490205564, + "language_loss": 0.6969521, + "learning_rate": 7.365461920317861e-08, + "loss": 0.7182653, + "num_input_tokens_seen": 328643995, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66796875, + "step": 15234, + "time_per_iteration": 2.444852828979492 + }, + { + "auxiliary_loss_clip": 0.01100736, + "auxiliary_loss_mlp": 0.01030082, + "balance_loss_clip": 1.01809597, + "balance_loss_mlp": 1.03512609, + "epoch": 0.9159777543965129, + "flos": 24783749032320.0, + "grad_norm": 1.7294981323630823, + "language_loss": 0.87835944, + "learning_rate": 7.354993588431391e-08, + "loss": 0.89966768, + "num_input_tokens_seen": 328659565, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15235, + "time_per_iteration": 2.4612205028533936 + }, + { + "auxiliary_loss_clip": 0.01102081, + "auxiliary_loss_mlp": 0.01030225, + "balance_loss_clip": 1.01839912, + "balance_loss_mlp": 1.03525317, + "epoch": 0.9160378776491809, + "flos": 26868758820480.0, + "grad_norm": 1.5527464257030497, + "language_loss": 0.76839787, + "learning_rate": 7.344532561662853e-08, + "loss": 0.78972089, + "num_input_tokens_seen": 328679045, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 15236, + "time_per_iteration": 2.474457263946533 + }, + { + "auxiliary_loss_clip": 0.01021315, + "auxiliary_loss_mlp": 0.01003153, + "balance_loss_clip": 1.00213361, + "balance_loss_mlp": 1.0013386, + "epoch": 0.9160980009018488, + "flos": 70578222589440.0, + "grad_norm": 0.6788076551857354, + "language_loss": 0.62263203, + "learning_rate": 7.334078840409019e-08, + "loss": 0.64287663, + "num_input_tokens_seen": 328744565, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.19921875, + "step": 15237, + "time_per_iteration": 3.0201759338378906 + }, + { + "auxiliary_loss_clip": 0.01101293, + "auxiliary_loss_mlp": 0.01029169, + "balance_loss_clip": 1.01718867, + "balance_loss_mlp": 1.03470826, + "epoch": 0.9161581241545168, + "flos": 16289332202880.0, + "grad_norm": 3.0659105416988552, + "language_loss": 0.7453984, + "learning_rate": 7.323632425066151e-08, + "loss": 0.76670301, + "num_input_tokens_seen": 328762455, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 15238, + "time_per_iteration": 2.404824733734131 + }, + { + "auxiliary_loss_clip": 0.01101036, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01528561, + "balance_loss_mlp": 1.0344367, + "epoch": 0.9162182474071847, + "flos": 18438154502400.0, + "grad_norm": 1.6853760696818214, + "language_loss": 0.74746668, + "learning_rate": 7.313193316030464e-08, + "loss": 0.76874375, + "num_input_tokens_seen": 328780320, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 15239, + "time_per_iteration": 2.4390740394592285 + }, + { + "auxiliary_loss_clip": 0.01100596, + "auxiliary_loss_mlp": 0.0103313, + "balance_loss_clip": 1.02131677, + "balance_loss_mlp": 1.034127, + "epoch": 0.9162783706598527, + "flos": 19167248764800.0, + "grad_norm": 2.2129519581764496, + "language_loss": 0.63188612, + "learning_rate": 7.302761513697819e-08, + "loss": 0.65322334, + "num_input_tokens_seen": 328797570, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15240, + "time_per_iteration": 2.424992322921753 + }, + { + "auxiliary_loss_clip": 0.01100319, + "auxiliary_loss_mlp": 0.01023378, + "balance_loss_clip": 1.01264906, + "balance_loss_mlp": 1.03647375, + "epoch": 0.9163384939125206, + "flos": 20412990299520.0, + "grad_norm": 1.824123817472358, + "language_loss": 0.76293588, + "learning_rate": 7.292337018463746e-08, + "loss": 0.78417283, + "num_input_tokens_seen": 328814075, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 15241, + "time_per_iteration": 2.453367233276367 + }, + { + "auxiliary_loss_clip": 0.01107929, + "auxiliary_loss_mlp": 0.01028716, + "balance_loss_clip": 1.0154779, + "balance_loss_mlp": 1.03601336, + "epoch": 0.9163986171651887, + "flos": 19645902426240.0, + "grad_norm": 2.7073047066041385, + "language_loss": 0.6746605, + "learning_rate": 7.281919830723549e-08, + "loss": 0.69602692, + "num_input_tokens_seen": 328831990, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.71875, + "step": 15242, + "time_per_iteration": 2.4336512088775635 + }, + { + "auxiliary_loss_clip": 0.01099212, + "auxiliary_loss_mlp": 0.01027991, + "balance_loss_clip": 1.01624286, + "balance_loss_mlp": 1.03331637, + "epoch": 0.9164587404178566, + "flos": 12823054865280.0, + "grad_norm": 1.754571362997044, + "language_loss": 0.80896854, + "learning_rate": 7.271509950872334e-08, + "loss": 0.83024061, + "num_input_tokens_seen": 328849105, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66015625, + "step": 15243, + "time_per_iteration": 2.426079750061035 + }, + { + "auxiliary_loss_clip": 0.01099771, + "auxiliary_loss_mlp": 0.01029297, + "balance_loss_clip": 1.01748323, + "balance_loss_mlp": 1.03221726, + "epoch": 0.9165188636705246, + "flos": 22309396750080.0, + "grad_norm": 1.8762223959588424, + "language_loss": 0.8205328, + "learning_rate": 7.261107379304721e-08, + "loss": 0.84182346, + "num_input_tokens_seen": 328866810, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 15244, + "time_per_iteration": 2.506777286529541 + }, + { + "auxiliary_loss_clip": 0.01104451, + "auxiliary_loss_mlp": 0.01034704, + "balance_loss_clip": 1.02243781, + "balance_loss_mlp": 1.03492165, + "epoch": 0.9165789869231925, + "flos": 18223337214720.0, + "grad_norm": 2.648614204029591, + "language_loss": 0.72213554, + "learning_rate": 7.250712116415214e-08, + "loss": 0.74352717, + "num_input_tokens_seen": 328885325, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6953125, + "step": 15245, + "time_per_iteration": 2.4680283069610596 + }, + { + "auxiliary_loss_clip": 0.01098467, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01930237, + "balance_loss_mlp": 1.03360033, + "epoch": 0.9166391101758605, + "flos": 13691553811200.0, + "grad_norm": 1.6235640253578716, + "language_loss": 0.74646342, + "learning_rate": 7.240324162598033e-08, + "loss": 0.76775181, + "num_input_tokens_seen": 328902655, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15246, + "time_per_iteration": 2.398216485977173 + }, + { + "auxiliary_loss_clip": 0.01099557, + "auxiliary_loss_mlp": 0.01030274, + "balance_loss_clip": 1.01808476, + "balance_loss_mlp": 1.03437448, + "epoch": 0.9166992334285284, + "flos": 17346793622400.0, + "grad_norm": 2.0593596154006355, + "language_loss": 0.75462282, + "learning_rate": 7.229943518247106e-08, + "loss": 0.77592111, + "num_input_tokens_seen": 328918440, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6484375, + "step": 15247, + "time_per_iteration": 2.4263362884521484 + }, + { + "auxiliary_loss_clip": 0.01103714, + "auxiliary_loss_mlp": 0.01027032, + "balance_loss_clip": 1.01507568, + "balance_loss_mlp": 1.03711426, + "epoch": 0.9167593566811965, + "flos": 23731135948800.0, + "grad_norm": 1.6801453014221095, + "language_loss": 0.75884688, + "learning_rate": 7.219570183756052e-08, + "loss": 0.78015435, + "num_input_tokens_seen": 328938055, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15248, + "time_per_iteration": 2.4508020877838135 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01034082, + "balance_loss_clip": 1.02130914, + "balance_loss_mlp": 1.03448176, + "epoch": 0.9168194799338644, + "flos": 27818201064960.0, + "grad_norm": 2.1653803876672733, + "language_loss": 0.72892481, + "learning_rate": 7.209204159518178e-08, + "loss": 0.75027955, + "num_input_tokens_seen": 328957895, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 15249, + "time_per_iteration": 2.5009031295776367 + }, + { + "auxiliary_loss_clip": 0.01101285, + "auxiliary_loss_mlp": 0.01027212, + "balance_loss_clip": 1.01516008, + "balance_loss_mlp": 1.03550017, + "epoch": 0.9168796031865324, + "flos": 21717552355200.0, + "grad_norm": 1.997505884872102, + "language_loss": 0.76246959, + "learning_rate": 7.198845445926616e-08, + "loss": 0.78375459, + "num_input_tokens_seen": 328971365, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 15250, + "time_per_iteration": 2.407268762588501 + }, + { + "auxiliary_loss_clip": 0.01097645, + "auxiliary_loss_mlp": 0.01025557, + "balance_loss_clip": 1.0139761, + "balance_loss_mlp": 1.03325534, + "epoch": 0.9169397264392004, + "flos": 23404420817280.0, + "grad_norm": 1.9158953461140582, + "language_loss": 0.75737274, + "learning_rate": 7.188494043374138e-08, + "loss": 0.77860475, + "num_input_tokens_seen": 328990830, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15251, + "time_per_iteration": 2.449289083480835 + }, + { + "auxiliary_loss_clip": 0.01103639, + "auxiliary_loss_mlp": 0.01030923, + "balance_loss_clip": 1.01801276, + "balance_loss_mlp": 1.03617382, + "epoch": 0.9169998496918683, + "flos": 23950981140480.0, + "grad_norm": 2.01686517651722, + "language_loss": 0.79905111, + "learning_rate": 7.178149952253298e-08, + "loss": 0.82039672, + "num_input_tokens_seen": 329008345, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 15252, + "time_per_iteration": 2.4550137519836426 + }, + { + "auxiliary_loss_clip": 0.01099547, + "auxiliary_loss_mlp": 0.01034299, + "balance_loss_clip": 1.02278996, + "balance_loss_mlp": 1.03405333, + "epoch": 0.9170599729445363, + "flos": 18332469711360.0, + "grad_norm": 1.525633221993305, + "language_loss": 0.7715137, + "learning_rate": 7.167813172956316e-08, + "loss": 0.79285222, + "num_input_tokens_seen": 329027440, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 15253, + "time_per_iteration": 2.4307291507720947 + }, + { + "auxiliary_loss_clip": 0.01102278, + "auxiliary_loss_mlp": 0.01025708, + "balance_loss_clip": 1.01446629, + "balance_loss_mlp": 1.03608871, + "epoch": 0.9171200961972042, + "flos": 22674859678080.0, + "grad_norm": 1.6220223812959684, + "language_loss": 0.73055267, + "learning_rate": 7.157483705875256e-08, + "loss": 0.7518326, + "num_input_tokens_seen": 329046445, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15254, + "time_per_iteration": 2.4426708221435547 + }, + { + "auxiliary_loss_clip": 0.01095915, + "auxiliary_loss_mlp": 0.01024577, + "balance_loss_clip": 1.01363969, + "balance_loss_mlp": 1.03274751, + "epoch": 0.9171802194498723, + "flos": 26719298328960.0, + "grad_norm": 1.4975724788553886, + "language_loss": 0.79085529, + "learning_rate": 7.14716155140167e-08, + "loss": 0.81206024, + "num_input_tokens_seen": 329065555, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 15255, + "time_per_iteration": 2.489227771759033 + }, + { + "auxiliary_loss_clip": 0.01101815, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.02150059, + "balance_loss_mlp": 1.03471398, + "epoch": 0.9172403427025402, + "flos": 37889240538240.0, + "grad_norm": 2.0932584318696197, + "language_loss": 0.68286502, + "learning_rate": 7.136846709927047e-08, + "loss": 0.70421427, + "num_input_tokens_seen": 329087515, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15256, + "time_per_iteration": 2.5796866416931152 + }, + { + "auxiliary_loss_clip": 0.01097785, + "auxiliary_loss_mlp": 0.01032305, + "balance_loss_clip": 1.0215044, + "balance_loss_mlp": 1.03404987, + "epoch": 0.9173004659552082, + "flos": 17055163100160.0, + "grad_norm": 1.7133190759079449, + "language_loss": 0.83820814, + "learning_rate": 7.126539181842561e-08, + "loss": 0.85950905, + "num_input_tokens_seen": 329106820, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15257, + "time_per_iteration": 2.4700734615325928 + }, + { + "auxiliary_loss_clip": 0.0109807, + "auxiliary_loss_mlp": 0.01031988, + "balance_loss_clip": 1.02141452, + "balance_loss_mlp": 1.03438568, + "epoch": 0.9173605892078761, + "flos": 22201593056640.0, + "grad_norm": 1.6589909857452685, + "language_loss": 0.77511317, + "learning_rate": 7.116238967539012e-08, + "loss": 0.79641378, + "num_input_tokens_seen": 329126515, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15258, + "time_per_iteration": 2.4660658836364746 + }, + { + "auxiliary_loss_clip": 0.0110158, + "auxiliary_loss_mlp": 0.01032721, + "balance_loss_clip": 1.02131248, + "balance_loss_mlp": 1.0362736, + "epoch": 0.9174207124605441, + "flos": 16507776764160.0, + "grad_norm": 1.8473842478714684, + "language_loss": 0.78595388, + "learning_rate": 7.105946067406999e-08, + "loss": 0.80729687, + "num_input_tokens_seen": 329142660, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15259, + "time_per_iteration": 2.42170786857605 + }, + { + "auxiliary_loss_clip": 0.01096695, + "auxiliary_loss_mlp": 0.01031068, + "balance_loss_clip": 1.0201838, + "balance_loss_mlp": 1.03308225, + "epoch": 0.917480835713212, + "flos": 24535606901760.0, + "grad_norm": 1.5141201420761963, + "language_loss": 0.75849646, + "learning_rate": 7.095660481836895e-08, + "loss": 0.77977407, + "num_input_tokens_seen": 329162575, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 15260, + "time_per_iteration": 2.4748823642730713 + }, + { + "auxiliary_loss_clip": 0.01096998, + "auxiliary_loss_mlp": 0.01028616, + "balance_loss_clip": 1.01732063, + "balance_loss_mlp": 1.0325402, + "epoch": 0.9175409589658801, + "flos": 20880726226560.0, + "grad_norm": 1.439145182657997, + "language_loss": 0.61105782, + "learning_rate": 7.085382211218637e-08, + "loss": 0.63231397, + "num_input_tokens_seen": 329182090, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 15261, + "time_per_iteration": 2.466932535171509 + }, + { + "auxiliary_loss_clip": 0.01097067, + "auxiliary_loss_mlp": 0.01032128, + "balance_loss_clip": 1.02064824, + "balance_loss_mlp": 1.03346276, + "epoch": 0.917601082218548, + "flos": 14276035918080.0, + "grad_norm": 1.6885035135611821, + "language_loss": 0.7386173, + "learning_rate": 7.075111255942002e-08, + "loss": 0.75990927, + "num_input_tokens_seen": 329196535, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15262, + "time_per_iteration": 3.797211170196533 + }, + { + "auxiliary_loss_clip": 0.01101112, + "auxiliary_loss_mlp": 0.01032488, + "balance_loss_clip": 1.02048969, + "balance_loss_mlp": 1.03242636, + "epoch": 0.917661205471216, + "flos": 19099234362240.0, + "grad_norm": 1.9369196881857367, + "language_loss": 0.7737118, + "learning_rate": 7.064847616396496e-08, + "loss": 0.79504776, + "num_input_tokens_seen": 329215135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 15263, + "time_per_iteration": 3.865194797515869 + }, + { + "auxiliary_loss_clip": 0.01102159, + "auxiliary_loss_mlp": 0.01032945, + "balance_loss_clip": 1.02153111, + "balance_loss_mlp": 1.0338912, + "epoch": 0.917721328723884, + "flos": 21106568989440.0, + "grad_norm": 1.7551595930253303, + "language_loss": 0.75445127, + "learning_rate": 7.054591292971324e-08, + "loss": 0.77580231, + "num_input_tokens_seen": 329235150, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.68359375, + "step": 15264, + "time_per_iteration": 3.919630527496338 + }, + { + "auxiliary_loss_clip": 0.0109944, + "auxiliary_loss_mlp": 0.01035754, + "balance_loss_clip": 1.02476895, + "balance_loss_mlp": 1.03444493, + "epoch": 0.9177814519765519, + "flos": 21943215550080.0, + "grad_norm": 1.7698435079437604, + "language_loss": 0.8347168, + "learning_rate": 7.044342286055394e-08, + "loss": 0.85606873, + "num_input_tokens_seen": 329254365, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 15265, + "time_per_iteration": 2.481045961380005 + }, + { + "auxiliary_loss_clip": 0.01105219, + "auxiliary_loss_mlp": 0.01038332, + "balance_loss_clip": 1.02560663, + "balance_loss_mlp": 1.03556991, + "epoch": 0.9178415752292199, + "flos": 24205982768640.0, + "grad_norm": 1.4754568923998763, + "language_loss": 0.73383772, + "learning_rate": 7.034100596037306e-08, + "loss": 0.75527322, + "num_input_tokens_seen": 329274385, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6953125, + "step": 15266, + "time_per_iteration": 2.4675867557525635 + }, + { + "auxiliary_loss_clip": 0.0109957, + "auxiliary_loss_mlp": 0.01031178, + "balance_loss_clip": 1.02005005, + "balance_loss_mlp": 1.03352594, + "epoch": 0.9179016984818879, + "flos": 20042068504320.0, + "grad_norm": 1.626905867062865, + "language_loss": 0.7739476, + "learning_rate": 7.023866223305486e-08, + "loss": 0.79525506, + "num_input_tokens_seen": 329292160, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 15267, + "time_per_iteration": 2.539769411087036 + }, + { + "auxiliary_loss_clip": 0.01021972, + "auxiliary_loss_mlp": 0.01000807, + "balance_loss_clip": 0.99981195, + "balance_loss_mlp": 1.00176847, + "epoch": 0.9179618217345559, + "flos": 65555901100800.0, + "grad_norm": 0.7378350855044539, + "language_loss": 0.56234527, + "learning_rate": 7.013639168247975e-08, + "loss": 0.58257306, + "num_input_tokens_seen": 329351870, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20214844, + "step": 15268, + "time_per_iteration": 4.508407115936279 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.01027411, + "balance_loss_clip": 1.01537657, + "balance_loss_mlp": 1.03522551, + "epoch": 0.9180219449872238, + "flos": 21324618501120.0, + "grad_norm": 4.464341061130245, + "language_loss": 0.76722169, + "learning_rate": 7.0034194312526e-08, + "loss": 0.78852016, + "num_input_tokens_seen": 329370930, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15269, + "time_per_iteration": 2.4662179946899414 + }, + { + "auxiliary_loss_clip": 0.01099948, + "auxiliary_loss_mlp": 0.01030703, + "balance_loss_clip": 1.01819777, + "balance_loss_mlp": 1.03414619, + "epoch": 0.9180820682398918, + "flos": 41060008684800.0, + "grad_norm": 2.0734442027372633, + "language_loss": 0.7271992, + "learning_rate": 6.993207012706936e-08, + "loss": 0.74850571, + "num_input_tokens_seen": 329391275, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 15270, + "time_per_iteration": 2.628192186355591 + }, + { + "auxiliary_loss_clip": 0.01096334, + "auxiliary_loss_mlp": 0.01030644, + "balance_loss_clip": 1.01874113, + "balance_loss_mlp": 1.03209162, + "epoch": 0.9181421914925597, + "flos": 28072915384320.0, + "grad_norm": 1.5736026721435314, + "language_loss": 0.79696983, + "learning_rate": 6.98300191299821e-08, + "loss": 0.81823957, + "num_input_tokens_seen": 329412775, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 15271, + "time_per_iteration": 2.4931766986846924 + }, + { + "auxiliary_loss_clip": 0.01099187, + "auxiliary_loss_mlp": 0.01030136, + "balance_loss_clip": 1.01856041, + "balance_loss_mlp": 1.03308785, + "epoch": 0.9182023147452277, + "flos": 29169411909120.0, + "grad_norm": 22.73674764658324, + "language_loss": 0.72910154, + "learning_rate": 6.972804132513355e-08, + "loss": 0.75039482, + "num_input_tokens_seen": 329432440, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15272, + "time_per_iteration": 2.5102016925811768 + }, + { + "auxiliary_loss_clip": 0.01098094, + "auxiliary_loss_mlp": 0.01031842, + "balance_loss_clip": 1.02105904, + "balance_loss_mlp": 1.03331065, + "epoch": 0.9182624379978956, + "flos": 24060831909120.0, + "grad_norm": 2.0644570408475404, + "language_loss": 0.72772151, + "learning_rate": 6.962613671639105e-08, + "loss": 0.74902087, + "num_input_tokens_seen": 329450605, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15273, + "time_per_iteration": 2.465676784515381 + }, + { + "auxiliary_loss_clip": 0.01093024, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.01514888, + "balance_loss_mlp": 1.03164101, + "epoch": 0.9183225612505637, + "flos": 23293528554240.0, + "grad_norm": 1.4208540999933033, + "language_loss": 0.74430341, + "learning_rate": 6.952430530761933e-08, + "loss": 0.76548761, + "num_input_tokens_seen": 329470550, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.61328125, + "step": 15274, + "time_per_iteration": 2.4480597972869873 + }, + { + "auxiliary_loss_clip": 0.01099117, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02451599, + "balance_loss_mlp": 1.03252149, + "epoch": 0.9183826845032316, + "flos": 19609237618560.0, + "grad_norm": 1.5689583484539182, + "language_loss": 0.6853776, + "learning_rate": 6.942254710267902e-08, + "loss": 0.70672476, + "num_input_tokens_seen": 329489765, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 15275, + "time_per_iteration": 2.424501895904541 + }, + { + "auxiliary_loss_clip": 0.01096749, + "auxiliary_loss_mlp": 0.01028414, + "balance_loss_clip": 1.0169158, + "balance_loss_mlp": 1.03240776, + "epoch": 0.9184428077558996, + "flos": 18479057114880.0, + "grad_norm": 1.7958542567594675, + "language_loss": 0.72359389, + "learning_rate": 6.932086210542953e-08, + "loss": 0.74484551, + "num_input_tokens_seen": 329507040, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 15276, + "time_per_iteration": 2.4307353496551514 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01027873, + "balance_loss_clip": 1.01707268, + "balance_loss_mlp": 1.03567207, + "epoch": 0.9185029310085676, + "flos": 20741034234240.0, + "grad_norm": 1.745555104125903, + "language_loss": 0.73787761, + "learning_rate": 6.921925031972642e-08, + "loss": 0.75916559, + "num_input_tokens_seen": 329525540, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 15277, + "time_per_iteration": 2.485466718673706 + }, + { + "auxiliary_loss_clip": 0.01021683, + "auxiliary_loss_mlp": 0.01000132, + "balance_loss_clip": 0.99918407, + "balance_loss_mlp": 1.00166464, + "epoch": 0.9185630542612355, + "flos": 68209231875840.0, + "grad_norm": 0.7154549156944336, + "language_loss": 0.59214282, + "learning_rate": 6.91177117494226e-08, + "loss": 0.61236095, + "num_input_tokens_seen": 329592905, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20019531, + "step": 15278, + "time_per_iteration": 3.1485769748687744 + }, + { + "auxiliary_loss_clip": 0.01093924, + "auxiliary_loss_mlp": 0.01025318, + "balance_loss_clip": 1.01504803, + "balance_loss_mlp": 1.03094137, + "epoch": 0.9186231775139035, + "flos": 12239470598400.0, + "grad_norm": 1.8937597400336486, + "language_loss": 0.64184052, + "learning_rate": 6.901624639836879e-08, + "loss": 0.66303289, + "num_input_tokens_seen": 329610150, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 15279, + "time_per_iteration": 2.446822166442871 + }, + { + "auxiliary_loss_clip": 0.01021727, + "auxiliary_loss_mlp": 0.0100203, + "balance_loss_clip": 1.0009985, + "balance_loss_mlp": 1.00168133, + "epoch": 0.9186833007665715, + "flos": 63939237770880.0, + "grad_norm": 1.2569457019920138, + "language_loss": 0.60211283, + "learning_rate": 6.891485427041211e-08, + "loss": 0.62235039, + "num_input_tokens_seen": 329673650, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20019531, + "step": 15280, + "time_per_iteration": 3.04021954536438 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01029657, + "balance_loss_clip": 1.01787281, + "balance_loss_mlp": 1.03354609, + "epoch": 0.9187434240192395, + "flos": 19974700546560.0, + "grad_norm": 3.7758873171427108, + "language_loss": 0.69328892, + "learning_rate": 6.881353536939815e-08, + "loss": 0.71458817, + "num_input_tokens_seen": 329692520, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 15281, + "time_per_iteration": 2.468998432159424 + }, + { + "auxiliary_loss_clip": 0.01101585, + "auxiliary_loss_mlp": 0.01028785, + "balance_loss_clip": 1.01645327, + "balance_loss_mlp": 1.03454149, + "epoch": 0.9188035472719074, + "flos": 25227820874880.0, + "grad_norm": 2.9746368961886867, + "language_loss": 0.84552884, + "learning_rate": 6.871228969916831e-08, + "loss": 0.86683255, + "num_input_tokens_seen": 329713750, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15282, + "time_per_iteration": 2.525195360183716 + }, + { + "auxiliary_loss_clip": 0.01097551, + "auxiliary_loss_mlp": 0.01032254, + "balance_loss_clip": 1.02076828, + "balance_loss_mlp": 1.03387153, + "epoch": 0.9188636705245754, + "flos": 18405547931520.0, + "grad_norm": 1.7713920844445745, + "language_loss": 0.59634107, + "learning_rate": 6.861111726356194e-08, + "loss": 0.61763906, + "num_input_tokens_seen": 329730960, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 15283, + "time_per_iteration": 2.451240062713623 + }, + { + "auxiliary_loss_clip": 0.01103442, + "auxiliary_loss_mlp": 0.01030104, + "balance_loss_clip": 1.01844573, + "balance_loss_mlp": 1.03460884, + "epoch": 0.9189237937772433, + "flos": 23769129559680.0, + "grad_norm": 1.5989024200960449, + "language_loss": 0.65525234, + "learning_rate": 6.851001806641554e-08, + "loss": 0.67658782, + "num_input_tokens_seen": 329750975, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6875, + "step": 15284, + "time_per_iteration": 2.4734537601470947 + }, + { + "auxiliary_loss_clip": 0.01098451, + "auxiliary_loss_mlp": 0.01031303, + "balance_loss_clip": 1.01953125, + "balance_loss_mlp": 1.03279424, + "epoch": 0.9189839170299113, + "flos": 21214624078080.0, + "grad_norm": 1.8968992519509786, + "language_loss": 0.7340166, + "learning_rate": 6.840899211156292e-08, + "loss": 0.75531411, + "num_input_tokens_seen": 329769645, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15285, + "time_per_iteration": 2.475170612335205 + }, + { + "auxiliary_loss_clip": 0.01097989, + "auxiliary_loss_mlp": 0.01029771, + "balance_loss_clip": 1.01809406, + "balance_loss_mlp": 1.03306448, + "epoch": 0.9190440402825792, + "flos": 16727370560640.0, + "grad_norm": 1.9961314364578988, + "language_loss": 0.71681088, + "learning_rate": 6.830803940283458e-08, + "loss": 0.73808849, + "num_input_tokens_seen": 329788185, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 15286, + "time_per_iteration": 2.4291200637817383 + }, + { + "auxiliary_loss_clip": 0.01100256, + "auxiliary_loss_mlp": 0.0103143, + "balance_loss_clip": 1.0195086, + "balance_loss_mlp": 1.03459299, + "epoch": 0.9191041635352473, + "flos": 23441193365760.0, + "grad_norm": 2.070932444160172, + "language_loss": 0.7353276, + "learning_rate": 6.820715994405945e-08, + "loss": 0.75664449, + "num_input_tokens_seen": 329806780, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15287, + "time_per_iteration": 2.4521946907043457 + }, + { + "auxiliary_loss_clip": 0.0110378, + "auxiliary_loss_mlp": 0.01028863, + "balance_loss_clip": 1.01641178, + "balance_loss_mlp": 1.03728819, + "epoch": 0.9191642867879152, + "flos": 18807532012800.0, + "grad_norm": 1.955752098372023, + "language_loss": 0.65609306, + "learning_rate": 6.810635373906226e-08, + "loss": 0.67741948, + "num_input_tokens_seen": 329826350, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 15288, + "time_per_iteration": 2.41819167137146 + }, + { + "auxiliary_loss_clip": 0.01104589, + "auxiliary_loss_mlp": 0.01033275, + "balance_loss_clip": 1.02197957, + "balance_loss_mlp": 1.0382545, + "epoch": 0.9192244100405832, + "flos": 32160950167680.0, + "grad_norm": 1.9658810334985228, + "language_loss": 0.7114042, + "learning_rate": 6.800562079166549e-08, + "loss": 0.73278284, + "num_input_tokens_seen": 329846160, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6640625, + "step": 15289, + "time_per_iteration": 2.5379581451416016 + }, + { + "auxiliary_loss_clip": 0.01101664, + "auxiliary_loss_mlp": 0.01030872, + "balance_loss_clip": 1.01940393, + "balance_loss_mlp": 1.03530157, + "epoch": 0.9192845332932512, + "flos": 16357669827840.0, + "grad_norm": 1.7860154245672653, + "language_loss": 0.74310684, + "learning_rate": 6.790496110568921e-08, + "loss": 0.76443219, + "num_input_tokens_seen": 329862020, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15290, + "time_per_iteration": 2.424091339111328 + }, + { + "auxiliary_loss_clip": 0.01098296, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01781464, + "balance_loss_mlp": 1.03389239, + "epoch": 0.9193446565459191, + "flos": 26614475464320.0, + "grad_norm": 1.8214465731068186, + "language_loss": 0.72021568, + "learning_rate": 6.78043746849506e-08, + "loss": 0.74148518, + "num_input_tokens_seen": 329880185, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 15291, + "time_per_iteration": 2.524446725845337 + }, + { + "auxiliary_loss_clip": 0.01098547, + "auxiliary_loss_mlp": 0.01028062, + "balance_loss_clip": 1.01701772, + "balance_loss_mlp": 1.03402042, + "epoch": 0.9194047797985871, + "flos": 22492182084480.0, + "grad_norm": 1.7029706448967405, + "language_loss": 0.71118617, + "learning_rate": 6.770386153326346e-08, + "loss": 0.73245227, + "num_input_tokens_seen": 329900255, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.64453125, + "step": 15292, + "time_per_iteration": 2.452636957168579 + }, + { + "auxiliary_loss_clip": 0.01099113, + "auxiliary_loss_mlp": 0.01028165, + "balance_loss_clip": 1.01633954, + "balance_loss_mlp": 1.03386974, + "epoch": 0.9194649030512551, + "flos": 25078791346560.0, + "grad_norm": 2.1375406938776416, + "language_loss": 0.73241705, + "learning_rate": 6.760342165443988e-08, + "loss": 0.75368983, + "num_input_tokens_seen": 329919095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15293, + "time_per_iteration": 2.475736141204834 + }, + { + "auxiliary_loss_clip": 0.01098791, + "auxiliary_loss_mlp": 0.01026354, + "balance_loss_clip": 1.01479709, + "balance_loss_mlp": 1.03441787, + "epoch": 0.9195250263039231, + "flos": 11911139354880.0, + "grad_norm": 2.0643296988885456, + "language_loss": 0.7831043, + "learning_rate": 6.750305505228837e-08, + "loss": 0.80435574, + "num_input_tokens_seen": 329936505, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 15294, + "time_per_iteration": 2.47523832321167 + }, + { + "auxiliary_loss_clip": 0.01103169, + "auxiliary_loss_mlp": 0.01031611, + "balance_loss_clip": 1.0187782, + "balance_loss_mlp": 1.03504705, + "epoch": 0.919585149556591, + "flos": 21834154880640.0, + "grad_norm": 1.4970432039566248, + "language_loss": 0.77283525, + "learning_rate": 6.74027617306141e-08, + "loss": 0.79418302, + "num_input_tokens_seen": 329956795, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 15295, + "time_per_iteration": 2.4907798767089844 + }, + { + "auxiliary_loss_clip": 0.01097049, + "auxiliary_loss_mlp": 0.01026614, + "balance_loss_clip": 1.01649904, + "balance_loss_mlp": 1.034621, + "epoch": 0.919645272809259, + "flos": 28184059042560.0, + "grad_norm": 2.4431811448351106, + "language_loss": 0.71476376, + "learning_rate": 6.730254169322114e-08, + "loss": 0.73600036, + "num_input_tokens_seen": 329977195, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.625, + "step": 15296, + "time_per_iteration": 2.4911844730377197 + }, + { + "auxiliary_loss_clip": 0.0109984, + "auxiliary_loss_mlp": 0.0103458, + "balance_loss_clip": 1.0236125, + "balance_loss_mlp": 1.03452754, + "epoch": 0.9197053960619269, + "flos": 18332828847360.0, + "grad_norm": 2.1011046418165704, + "language_loss": 0.75250423, + "learning_rate": 6.720239494390912e-08, + "loss": 0.77384841, + "num_input_tokens_seen": 329992095, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65625, + "step": 15297, + "time_per_iteration": 2.4321935176849365 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01028065, + "balance_loss_clip": 1.01621604, + "balance_loss_mlp": 1.0333879, + "epoch": 0.9197655193145949, + "flos": 28183448511360.0, + "grad_norm": 1.7401856236866056, + "language_loss": 0.73939699, + "learning_rate": 6.710232148647676e-08, + "loss": 0.76065761, + "num_input_tokens_seen": 330011490, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15298, + "time_per_iteration": 2.515803098678589 + }, + { + "auxiliary_loss_clip": 0.01101475, + "auxiliary_loss_mlp": 0.01033872, + "balance_loss_clip": 1.02229095, + "balance_loss_mlp": 1.03466356, + "epoch": 0.9198256425672628, + "flos": 17306321973120.0, + "grad_norm": 2.0840712343344823, + "language_loss": 0.79339898, + "learning_rate": 6.70023213247175e-08, + "loss": 0.8147524, + "num_input_tokens_seen": 330027885, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 15299, + "time_per_iteration": 2.425823450088501 + }, + { + "auxiliary_loss_clip": 0.01098834, + "auxiliary_loss_mlp": 0.01026097, + "balance_loss_clip": 1.01545727, + "balance_loss_mlp": 1.03452611, + "epoch": 0.9198857658199309, + "flos": 17858520731520.0, + "grad_norm": 2.140408614905867, + "language_loss": 0.63948607, + "learning_rate": 6.690239446242385e-08, + "loss": 0.66073537, + "num_input_tokens_seen": 330046230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 15300, + "time_per_iteration": 2.441720724105835 + }, + { + "auxiliary_loss_clip": 0.01094075, + "auxiliary_loss_mlp": 0.0102783, + "balance_loss_clip": 1.01806712, + "balance_loss_mlp": 1.03322458, + "epoch": 0.9199458890725988, + "flos": 22127545169280.0, + "grad_norm": 2.56598231172926, + "language_loss": 0.69634527, + "learning_rate": 6.680254090338545e-08, + "loss": 0.71756434, + "num_input_tokens_seen": 330065535, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.609375, + "step": 15301, + "time_per_iteration": 2.467337131500244 + }, + { + "auxiliary_loss_clip": 0.01103435, + "auxiliary_loss_mlp": 0.01033344, + "balance_loss_clip": 1.02025485, + "balance_loss_mlp": 1.03593671, + "epoch": 0.9200060123252668, + "flos": 16034043265920.0, + "grad_norm": 1.8550315285188883, + "language_loss": 0.71411103, + "learning_rate": 6.670276065138814e-08, + "loss": 0.73547888, + "num_input_tokens_seen": 330082920, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.67578125, + "step": 15302, + "time_per_iteration": 2.43485426902771 + }, + { + "auxiliary_loss_clip": 0.01100893, + "auxiliary_loss_mlp": 0.01029347, + "balance_loss_clip": 1.01797462, + "balance_loss_mlp": 1.03467202, + "epoch": 0.9200661355779348, + "flos": 26864521015680.0, + "grad_norm": 1.824853642117339, + "language_loss": 0.76358056, + "learning_rate": 6.660305371021579e-08, + "loss": 0.7848829, + "num_input_tokens_seen": 330101165, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 15303, + "time_per_iteration": 3.9412145614624023 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.0102885, + "balance_loss_clip": 1.01765084, + "balance_loss_mlp": 1.03600037, + "epoch": 0.9201262588306027, + "flos": 12786749193600.0, + "grad_norm": 2.172207536480081, + "language_loss": 0.8759762, + "learning_rate": 6.650342008365006e-08, + "loss": 0.8972708, + "num_input_tokens_seen": 330118775, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 15304, + "time_per_iteration": 2.4575695991516113 + }, + { + "auxiliary_loss_clip": 0.01103607, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.01945925, + "balance_loss_mlp": 1.0359385, + "epoch": 0.9201863820832707, + "flos": 20631614428800.0, + "grad_norm": 1.9620748105275532, + "language_loss": 0.7723875, + "learning_rate": 6.64038597754677e-08, + "loss": 0.79375267, + "num_input_tokens_seen": 330135570, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.67578125, + "step": 15305, + "time_per_iteration": 3.863945484161377 + }, + { + "auxiliary_loss_clip": 0.01098868, + "auxiliary_loss_mlp": 0.0103384, + "balance_loss_clip": 1.02194321, + "balance_loss_mlp": 1.03316355, + "epoch": 0.9202465053359387, + "flos": 26395815421440.0, + "grad_norm": 2.2266234622398002, + "language_loss": 0.81643492, + "learning_rate": 6.630437278944501e-08, + "loss": 0.837762, + "num_input_tokens_seen": 330152840, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15306, + "time_per_iteration": 3.9599132537841797 + }, + { + "auxiliary_loss_clip": 0.01095421, + "auxiliary_loss_mlp": 0.01030223, + "balance_loss_clip": 1.01949441, + "balance_loss_mlp": 1.03234839, + "epoch": 0.9203066285886067, + "flos": 10488179093760.0, + "grad_norm": 1.8130468972893734, + "language_loss": 0.71801835, + "learning_rate": 6.62049591293541e-08, + "loss": 0.73927486, + "num_input_tokens_seen": 330168605, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15307, + "time_per_iteration": 2.4384212493896484 + }, + { + "auxiliary_loss_clip": 0.0110121, + "auxiliary_loss_mlp": 0.01030379, + "balance_loss_clip": 1.01840997, + "balance_loss_mlp": 1.03425837, + "epoch": 0.9203667518412746, + "flos": 19390721230080.0, + "grad_norm": 1.8060477218017867, + "language_loss": 0.78445113, + "learning_rate": 6.610561879896526e-08, + "loss": 0.80576694, + "num_input_tokens_seen": 330186160, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 15308, + "time_per_iteration": 2.4439730644226074 + }, + { + "auxiliary_loss_clip": 0.01098129, + "auxiliary_loss_mlp": 0.01028894, + "balance_loss_clip": 1.01717603, + "balance_loss_mlp": 1.0328846, + "epoch": 0.9204268750939426, + "flos": 15924982596480.0, + "grad_norm": 2.0932008233219968, + "language_loss": 0.77898622, + "learning_rate": 6.600635180204484e-08, + "loss": 0.80025649, + "num_input_tokens_seen": 330201780, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 15309, + "time_per_iteration": 2.4441962242126465 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01026816, + "balance_loss_clip": 1.01493728, + "balance_loss_mlp": 1.03330636, + "epoch": 0.9204869983466105, + "flos": 16471758401280.0, + "grad_norm": 2.4269802461242977, + "language_loss": 0.66559213, + "learning_rate": 6.590715814235781e-08, + "loss": 0.68685448, + "num_input_tokens_seen": 330219165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15310, + "time_per_iteration": 3.8965320587158203 + }, + { + "auxiliary_loss_clip": 0.01099035, + "auxiliary_loss_mlp": 0.01031641, + "balance_loss_clip": 1.0200541, + "balance_loss_mlp": 1.03259516, + "epoch": 0.9205471215992785, + "flos": 21539220307200.0, + "grad_norm": 1.6733324476894091, + "language_loss": 0.66091675, + "learning_rate": 6.580803782366495e-08, + "loss": 0.68222356, + "num_input_tokens_seen": 330238975, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 15311, + "time_per_iteration": 2.444620132446289 + }, + { + "auxiliary_loss_clip": 0.01099034, + "auxiliary_loss_mlp": 0.01034246, + "balance_loss_clip": 1.02293885, + "balance_loss_mlp": 1.03247344, + "epoch": 0.9206072448519464, + "flos": 25005892694400.0, + "grad_norm": 1.5846245764827986, + "language_loss": 0.75952655, + "learning_rate": 6.570899084972503e-08, + "loss": 0.78085929, + "num_input_tokens_seen": 330259755, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 15312, + "time_per_iteration": 2.5009913444519043 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01031198, + "balance_loss_clip": 1.02051091, + "balance_loss_mlp": 1.03418756, + "epoch": 0.9206673681046145, + "flos": 20522661500160.0, + "grad_norm": 1.935220768084578, + "language_loss": 0.7918942, + "learning_rate": 6.561001722429394e-08, + "loss": 0.81317198, + "num_input_tokens_seen": 330277660, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 15313, + "time_per_iteration": 2.477346420288086 + }, + { + "auxiliary_loss_clip": 0.01101793, + "auxiliary_loss_mlp": 0.01029069, + "balance_loss_clip": 1.01775026, + "balance_loss_mlp": 1.03461695, + "epoch": 0.9207274913572824, + "flos": 20883455660160.0, + "grad_norm": 1.6811896715223988, + "language_loss": 0.78183317, + "learning_rate": 6.55111169511251e-08, + "loss": 0.80314177, + "num_input_tokens_seen": 330295455, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 15314, + "time_per_iteration": 2.543661594390869 + }, + { + "auxiliary_loss_clip": 0.01103944, + "auxiliary_loss_mlp": 0.01032553, + "balance_loss_clip": 1.01982164, + "balance_loss_mlp": 1.03507841, + "epoch": 0.9207876146099504, + "flos": 22708256348160.0, + "grad_norm": 1.775196131409486, + "language_loss": 0.79086602, + "learning_rate": 6.541229003396864e-08, + "loss": 0.81223094, + "num_input_tokens_seen": 330315310, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 15315, + "time_per_iteration": 2.4820241928100586 + }, + { + "auxiliary_loss_clip": 0.01103595, + "auxiliary_loss_mlp": 0.01028821, + "balance_loss_clip": 1.01689386, + "balance_loss_mlp": 1.03408074, + "epoch": 0.9208477378626184, + "flos": 18507354053760.0, + "grad_norm": 1.7912978645182498, + "language_loss": 0.75935954, + "learning_rate": 6.531353647657156e-08, + "loss": 0.7806837, + "num_input_tokens_seen": 330333260, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6953125, + "step": 15316, + "time_per_iteration": 2.4458367824554443 + }, + { + "auxiliary_loss_clip": 0.01099953, + "auxiliary_loss_mlp": 0.01034019, + "balance_loss_clip": 1.02208638, + "balance_loss_mlp": 1.03305912, + "epoch": 0.9209078611152863, + "flos": 22999635475200.0, + "grad_norm": 1.7344603926154347, + "language_loss": 0.6935131, + "learning_rate": 6.521485628267931e-08, + "loss": 0.71485275, + "num_input_tokens_seen": 330352465, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15317, + "time_per_iteration": 2.461711883544922 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.01028604, + "balance_loss_clip": 1.01680803, + "balance_loss_mlp": 1.03546286, + "epoch": 0.9209679843679544, + "flos": 24061514267520.0, + "grad_norm": 1.7038666370863202, + "language_loss": 0.83504558, + "learning_rate": 6.511624945603378e-08, + "loss": 0.8563422, + "num_input_tokens_seen": 330372685, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 15318, + "time_per_iteration": 2.5033764839172363 + }, + { + "auxiliary_loss_clip": 0.01100705, + "auxiliary_loss_mlp": 0.01029982, + "balance_loss_clip": 1.01856208, + "balance_loss_mlp": 1.03522885, + "epoch": 0.9210281076206223, + "flos": 13553370190080.0, + "grad_norm": 2.0250345502149774, + "language_loss": 0.85513151, + "learning_rate": 6.501771600037354e-08, + "loss": 0.87643838, + "num_input_tokens_seen": 330388860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15319, + "time_per_iteration": 2.434962511062622 + }, + { + "auxiliary_loss_clip": 0.01021706, + "auxiliary_loss_mlp": 0.01001621, + "balance_loss_clip": 1.00066721, + "balance_loss_mlp": 1.00159764, + "epoch": 0.9210882308732903, + "flos": 71426289674880.0, + "grad_norm": 0.7706053364589017, + "language_loss": 0.56186169, + "learning_rate": 6.491925591943559e-08, + "loss": 0.58209497, + "num_input_tokens_seen": 330448735, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15320, + "time_per_iteration": 3.1476900577545166 + }, + { + "auxiliary_loss_clip": 0.01103341, + "auxiliary_loss_mlp": 0.01037848, + "balance_loss_clip": 1.02536726, + "balance_loss_mlp": 1.03501773, + "epoch": 0.9211483541259582, + "flos": 18509113820160.0, + "grad_norm": 2.0044478271622053, + "language_loss": 0.63775176, + "learning_rate": 6.482086921695384e-08, + "loss": 0.65916359, + "num_input_tokens_seen": 330465600, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 15321, + "time_per_iteration": 2.4137425422668457 + }, + { + "auxiliary_loss_clip": 0.01095255, + "auxiliary_loss_mlp": 0.01026901, + "balance_loss_clip": 1.01625538, + "balance_loss_mlp": 1.03391385, + "epoch": 0.9212084773786262, + "flos": 23258228463360.0, + "grad_norm": 1.5355415864744049, + "language_loss": 0.71481681, + "learning_rate": 6.47225558966582e-08, + "loss": 0.73603833, + "num_input_tokens_seen": 330485770, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.61328125, + "step": 15322, + "time_per_iteration": 2.5061845779418945 + }, + { + "auxiliary_loss_clip": 0.01098655, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.02018511, + "balance_loss_mlp": 1.03329921, + "epoch": 0.9212686006312941, + "flos": 16289511770880.0, + "grad_norm": 1.6914722744606074, + "language_loss": 0.70055711, + "learning_rate": 6.462431596227725e-08, + "loss": 0.72185469, + "num_input_tokens_seen": 330504255, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65625, + "step": 15323, + "time_per_iteration": 2.4158103466033936 + }, + { + "auxiliary_loss_clip": 0.01102274, + "auxiliary_loss_mlp": 0.01032101, + "balance_loss_clip": 1.01932168, + "balance_loss_mlp": 1.03389645, + "epoch": 0.9213287238839621, + "flos": 19785773986560.0, + "grad_norm": 1.7459918912498436, + "language_loss": 0.74719346, + "learning_rate": 6.452614941753597e-08, + "loss": 0.76853722, + "num_input_tokens_seen": 330520705, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6875, + "step": 15324, + "time_per_iteration": 2.424887180328369 + }, + { + "auxiliary_loss_clip": 0.01101043, + "auxiliary_loss_mlp": 0.01039368, + "balance_loss_clip": 1.02775693, + "balance_loss_mlp": 1.03482819, + "epoch": 0.92138884713663, + "flos": 21030402199680.0, + "grad_norm": 1.9858313128784937, + "language_loss": 0.71462083, + "learning_rate": 6.442805626615744e-08, + "loss": 0.73602492, + "num_input_tokens_seen": 330539245, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15325, + "time_per_iteration": 2.4648244380950928 + }, + { + "auxiliary_loss_clip": 0.01098648, + "auxiliary_loss_mlp": 0.01031857, + "balance_loss_clip": 1.020401, + "balance_loss_mlp": 1.03404129, + "epoch": 0.9214489703892981, + "flos": 28587264186240.0, + "grad_norm": 1.439709253059829, + "language_loss": 0.78404367, + "learning_rate": 6.433003651186109e-08, + "loss": 0.80534875, + "num_input_tokens_seen": 330561815, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15326, + "time_per_iteration": 2.5329742431640625 + }, + { + "auxiliary_loss_clip": 0.01103679, + "auxiliary_loss_mlp": 0.01032197, + "balance_loss_clip": 1.02008581, + "balance_loss_mlp": 1.03579211, + "epoch": 0.921509093641966, + "flos": 16361476669440.0, + "grad_norm": 3.8016467363656514, + "language_loss": 0.71438289, + "learning_rate": 6.42320901583635e-08, + "loss": 0.73574162, + "num_input_tokens_seen": 330579760, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6796875, + "step": 15327, + "time_per_iteration": 2.4995455741882324 + }, + { + "auxiliary_loss_clip": 0.01105492, + "auxiliary_loss_mlp": 0.01040397, + "balance_loss_clip": 1.02733731, + "balance_loss_mlp": 1.03710175, + "epoch": 0.921569216894634, + "flos": 26830837036800.0, + "grad_norm": 1.7374024208588212, + "language_loss": 0.78006065, + "learning_rate": 6.413421720937906e-08, + "loss": 0.80151951, + "num_input_tokens_seen": 330598545, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15328, + "time_per_iteration": 2.4673023223876953 + }, + { + "auxiliary_loss_clip": 0.01098437, + "auxiliary_loss_mlp": 0.01027792, + "balance_loss_clip": 1.01674747, + "balance_loss_mlp": 1.03321588, + "epoch": 0.921629340147302, + "flos": 24645134448000.0, + "grad_norm": 2.2530455333427994, + "language_loss": 0.71567261, + "learning_rate": 6.4036417668619e-08, + "loss": 0.73693484, + "num_input_tokens_seen": 330616700, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 15329, + "time_per_iteration": 2.495542526245117 + }, + { + "auxiliary_loss_clip": 0.01098027, + "auxiliary_loss_mlp": 0.01024122, + "balance_loss_clip": 1.01332712, + "balance_loss_mlp": 1.03311157, + "epoch": 0.9216894633999699, + "flos": 15086504442240.0, + "grad_norm": 1.6555034439482308, + "language_loss": 0.86653769, + "learning_rate": 6.393869153979192e-08, + "loss": 0.88775921, + "num_input_tokens_seen": 330633355, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15330, + "time_per_iteration": 2.410320281982422 + }, + { + "auxiliary_loss_clip": 0.01100084, + "auxiliary_loss_mlp": 0.01028037, + "balance_loss_clip": 1.01671255, + "balance_loss_mlp": 1.03359747, + "epoch": 0.921749586652638, + "flos": 19204524103680.0, + "grad_norm": 2.1488192808619555, + "language_loss": 0.75690323, + "learning_rate": 6.384103882660397e-08, + "loss": 0.77818441, + "num_input_tokens_seen": 330651470, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6640625, + "step": 15331, + "time_per_iteration": 2.4592649936676025 + }, + { + "auxiliary_loss_clip": 0.01098587, + "auxiliary_loss_mlp": 0.01027028, + "balance_loss_clip": 1.01572061, + "balance_loss_mlp": 1.0333581, + "epoch": 0.9218097099053059, + "flos": 20522446018560.0, + "grad_norm": 1.671974459244748, + "language_loss": 0.75502098, + "learning_rate": 6.374345953275794e-08, + "loss": 0.77627707, + "num_input_tokens_seen": 330669170, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 15332, + "time_per_iteration": 2.4462203979492188 + }, + { + "auxiliary_loss_clip": 0.01098277, + "auxiliary_loss_mlp": 0.01029499, + "balance_loss_clip": 1.01846027, + "balance_loss_mlp": 1.03282976, + "epoch": 0.9218698331579739, + "flos": 17348625216000.0, + "grad_norm": 1.766508202244264, + "language_loss": 0.75169802, + "learning_rate": 6.364595366195358e-08, + "loss": 0.7729758, + "num_input_tokens_seen": 330686635, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15333, + "time_per_iteration": 2.4391255378723145 + }, + { + "auxiliary_loss_clip": 0.01021523, + "auxiliary_loss_mlp": 0.01002464, + "balance_loss_clip": 1.00151014, + "balance_loss_mlp": 1.0014323, + "epoch": 0.9219299564106418, + "flos": 61958332575360.0, + "grad_norm": 0.8201974860223076, + "language_loss": 0.52913523, + "learning_rate": 6.354852121788879e-08, + "loss": 0.54937506, + "num_input_tokens_seen": 330749160, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15334, + "time_per_iteration": 3.0368025302886963 + }, + { + "auxiliary_loss_clip": 0.01096931, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.02174962, + "balance_loss_mlp": 1.03388023, + "epoch": 0.9219900796633098, + "flos": 15701761526400.0, + "grad_norm": 2.0174871878969425, + "language_loss": 0.62107778, + "learning_rate": 6.345116220425839e-08, + "loss": 0.64237422, + "num_input_tokens_seen": 330766840, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 15335, + "time_per_iteration": 2.4043402671813965 + }, + { + "auxiliary_loss_clip": 0.0109812, + "auxiliary_loss_mlp": 0.01030431, + "balance_loss_clip": 1.01859975, + "balance_loss_mlp": 1.03406358, + "epoch": 0.9220502029159777, + "flos": 24932670819840.0, + "grad_norm": 2.1329898068794906, + "language_loss": 0.71450561, + "learning_rate": 6.335387662475366e-08, + "loss": 0.73579109, + "num_input_tokens_seen": 330785585, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.640625, + "step": 15336, + "time_per_iteration": 2.475250244140625 + }, + { + "auxiliary_loss_clip": 0.01094756, + "auxiliary_loss_mlp": 0.01028396, + "balance_loss_clip": 1.01813853, + "balance_loss_mlp": 1.03188348, + "epoch": 0.9221103261686457, + "flos": 15667215621120.0, + "grad_norm": 1.8206372240538196, + "language_loss": 0.7180149, + "learning_rate": 6.325666448306433e-08, + "loss": 0.73924649, + "num_input_tokens_seen": 330800750, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.62890625, + "step": 15337, + "time_per_iteration": 2.403857707977295 + }, + { + "auxiliary_loss_clip": 0.01021361, + "auxiliary_loss_mlp": 0.01002116, + "balance_loss_clip": 1.00117433, + "balance_loss_mlp": 1.00144243, + "epoch": 0.9221704494213137, + "flos": 67516299630720.0, + "grad_norm": 0.8770991549438161, + "language_loss": 0.65320015, + "learning_rate": 6.31595257828763e-08, + "loss": 0.67343497, + "num_input_tokens_seen": 330863640, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 15338, + "time_per_iteration": 3.0122439861297607 + }, + { + "auxiliary_loss_clip": 0.01101934, + "auxiliary_loss_mlp": 0.01031265, + "balance_loss_clip": 1.01954699, + "balance_loss_mlp": 1.03547251, + "epoch": 0.9222305726739817, + "flos": 30226945155840.0, + "grad_norm": 1.611756335253548, + "language_loss": 0.67253053, + "learning_rate": 6.306246052787289e-08, + "loss": 0.69386256, + "num_input_tokens_seen": 330884675, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 15339, + "time_per_iteration": 2.51116681098938 + }, + { + "auxiliary_loss_clip": 0.01099814, + "auxiliary_loss_mlp": 0.01030808, + "balance_loss_clip": 1.01939344, + "balance_loss_mlp": 1.0349164, + "epoch": 0.9222906959266496, + "flos": 25337204766720.0, + "grad_norm": 1.823316200451707, + "language_loss": 0.71776712, + "learning_rate": 6.296546872173513e-08, + "loss": 0.73907328, + "num_input_tokens_seen": 330904125, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15340, + "time_per_iteration": 2.497661828994751 + }, + { + "auxiliary_loss_clip": 0.01098023, + "auxiliary_loss_mlp": 0.01028745, + "balance_loss_clip": 1.01765251, + "balance_loss_mlp": 1.03384233, + "epoch": 0.9223508191793176, + "flos": 27599864244480.0, + "grad_norm": 1.494128096822042, + "language_loss": 0.70278209, + "learning_rate": 6.286855036814098e-08, + "loss": 0.72404981, + "num_input_tokens_seen": 330925140, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 15341, + "time_per_iteration": 2.558868169784546 + }, + { + "auxiliary_loss_clip": 0.0109525, + "auxiliary_loss_mlp": 0.01030008, + "balance_loss_clip": 1.01931548, + "balance_loss_mlp": 1.03327823, + "epoch": 0.9224109424319856, + "flos": 27307587277440.0, + "grad_norm": 1.548602535002695, + "language_loss": 0.67397153, + "learning_rate": 6.277170547076571e-08, + "loss": 0.69522405, + "num_input_tokens_seen": 330946625, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 15342, + "time_per_iteration": 2.5003254413604736 + }, + { + "auxiliary_loss_clip": 0.01099219, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.02080154, + "balance_loss_mlp": 1.0339365, + "epoch": 0.9224710656846535, + "flos": 48208314401280.0, + "grad_norm": 6.095870438208894, + "language_loss": 0.69328499, + "learning_rate": 6.26749340332815e-08, + "loss": 0.71459371, + "num_input_tokens_seen": 330967795, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 15343, + "time_per_iteration": 2.6598129272460938 + }, + { + "auxiliary_loss_clip": 0.01022024, + "auxiliary_loss_mlp": 0.0099989, + "balance_loss_clip": 0.99891895, + "balance_loss_mlp": 1.0018754, + "epoch": 0.9225311889373216, + "flos": 66722171794560.0, + "grad_norm": 0.7264917660011667, + "language_loss": 0.51998997, + "learning_rate": 6.257823605935786e-08, + "loss": 0.54020911, + "num_input_tokens_seen": 331040850, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.20117188, + "step": 15344, + "time_per_iteration": 3.241743803024292 + }, + { + "auxiliary_loss_clip": 0.01094735, + "auxiliary_loss_mlp": 0.01029203, + "balance_loss_clip": 1.01856375, + "balance_loss_mlp": 1.03361905, + "epoch": 0.9225913121899895, + "flos": 22271295398400.0, + "grad_norm": 1.6429535121798804, + "language_loss": 0.70311445, + "learning_rate": 6.248161155266162e-08, + "loss": 0.72435379, + "num_input_tokens_seen": 331060595, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 15345, + "time_per_iteration": 3.881594181060791 + }, + { + "auxiliary_loss_clip": 0.01099254, + "auxiliary_loss_mlp": 0.01034917, + "balance_loss_clip": 1.0229665, + "balance_loss_mlp": 1.03364944, + "epoch": 0.9226514354426575, + "flos": 20082719721600.0, + "grad_norm": 1.7542089435944361, + "language_loss": 0.77480382, + "learning_rate": 6.238506051685677e-08, + "loss": 0.79614556, + "num_input_tokens_seen": 331080195, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15346, + "time_per_iteration": 2.4377188682556152 + }, + { + "auxiliary_loss_clip": 0.01105129, + "auxiliary_loss_mlp": 0.01034379, + "balance_loss_clip": 1.02243412, + "balance_loss_mlp": 1.03608787, + "epoch": 0.9227115586953254, + "flos": 16070851728000.0, + "grad_norm": 2.4232440557125776, + "language_loss": 0.75999713, + "learning_rate": 6.228858295560457e-08, + "loss": 0.78139222, + "num_input_tokens_seen": 331097645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6875, + "step": 15347, + "time_per_iteration": 3.9060075283050537 + }, + { + "auxiliary_loss_clip": 0.01095819, + "auxiliary_loss_mlp": 0.01029382, + "balance_loss_clip": 1.01887941, + "balance_loss_mlp": 1.03427565, + "epoch": 0.9227716819479934, + "flos": 20446027833600.0, + "grad_norm": 1.4881916639419828, + "language_loss": 0.76720476, + "learning_rate": 6.219217887256367e-08, + "loss": 0.7884568, + "num_input_tokens_seen": 331116830, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.6171875, + "step": 15348, + "time_per_iteration": 3.9879612922668457 + }, + { + "auxiliary_loss_clip": 0.01099795, + "auxiliary_loss_mlp": 0.01030966, + "balance_loss_clip": 1.01900303, + "balance_loss_mlp": 1.03291154, + "epoch": 0.9228318052006613, + "flos": 25007401065600.0, + "grad_norm": 1.9357360383703182, + "language_loss": 0.67522502, + "learning_rate": 6.209584827138959e-08, + "loss": 0.69653267, + "num_input_tokens_seen": 331137235, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15349, + "time_per_iteration": 2.4881527423858643 + }, + { + "auxiliary_loss_clip": 0.01098952, + "auxiliary_loss_mlp": 0.01026916, + "balance_loss_clip": 1.01580596, + "balance_loss_mlp": 1.03286695, + "epoch": 0.9228919284533293, + "flos": 12677257560960.0, + "grad_norm": 2.1804574418190135, + "language_loss": 0.86920041, + "learning_rate": 6.199959115573495e-08, + "loss": 0.89045906, + "num_input_tokens_seen": 331153155, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 15350, + "time_per_iteration": 2.4354965686798096 + }, + { + "auxiliary_loss_clip": 0.01022095, + "auxiliary_loss_mlp": 0.00999272, + "balance_loss_clip": 0.9983182, + "balance_loss_mlp": 1.00192451, + "epoch": 0.9229520517059973, + "flos": 69986162712960.0, + "grad_norm": 0.7681060822622773, + "language_loss": 0.60345185, + "learning_rate": 6.190340752924994e-08, + "loss": 0.62366551, + "num_input_tokens_seen": 331214895, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20214844, + "step": 15351, + "time_per_iteration": 2.9938981533050537 + }, + { + "auxiliary_loss_clip": 0.01099145, + "auxiliary_loss_mlp": 0.01024754, + "balance_loss_clip": 1.01364326, + "balance_loss_mlp": 1.03202951, + "epoch": 0.9230121749586653, + "flos": 14793832425600.0, + "grad_norm": 2.0623223330512737, + "language_loss": 0.78037506, + "learning_rate": 6.180729739558233e-08, + "loss": 0.80161405, + "num_input_tokens_seen": 331232185, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.671875, + "step": 15352, + "time_per_iteration": 3.9149723052978516 + }, + { + "auxiliary_loss_clip": 0.01103603, + "auxiliary_loss_mlp": 0.01034177, + "balance_loss_clip": 1.02197564, + "balance_loss_mlp": 1.03482258, + "epoch": 0.9230722982113332, + "flos": 22967208472320.0, + "grad_norm": 2.1804131199600714, + "language_loss": 0.59960139, + "learning_rate": 6.171126075837585e-08, + "loss": 0.62097919, + "num_input_tokens_seen": 331251065, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6875, + "step": 15353, + "time_per_iteration": 2.496880531311035 + }, + { + "auxiliary_loss_clip": 0.01097462, + "auxiliary_loss_mlp": 0.01027824, + "balance_loss_clip": 1.01727974, + "balance_loss_mlp": 1.03385043, + "epoch": 0.9231324214640012, + "flos": 18551452976640.0, + "grad_norm": 1.7905758764270645, + "language_loss": 0.74425894, + "learning_rate": 6.161529762127293e-08, + "loss": 0.76551175, + "num_input_tokens_seen": 331269110, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15354, + "time_per_iteration": 2.425142526626587 + }, + { + "auxiliary_loss_clip": 0.0110371, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.01714277, + "balance_loss_mlp": 1.03467274, + "epoch": 0.9231925447166691, + "flos": 22082727974400.0, + "grad_norm": 1.9630173318730952, + "language_loss": 0.64785397, + "learning_rate": 6.1519407987912e-08, + "loss": 0.66919398, + "num_input_tokens_seen": 331286555, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69140625, + "step": 15355, + "time_per_iteration": 2.4966373443603516 + }, + { + "auxiliary_loss_clip": 0.01096376, + "auxiliary_loss_mlp": 0.0103167, + "balance_loss_clip": 1.02042854, + "balance_loss_mlp": 1.03359032, + "epoch": 0.9232526679693371, + "flos": 26541145848960.0, + "grad_norm": 1.9020455750436218, + "language_loss": 0.74108565, + "learning_rate": 6.142359186192947e-08, + "loss": 0.76236618, + "num_input_tokens_seen": 331307660, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.625, + "step": 15356, + "time_per_iteration": 2.495620012283325 + }, + { + "auxiliary_loss_clip": 0.01101297, + "auxiliary_loss_mlp": 0.01032627, + "balance_loss_clip": 1.02093816, + "balance_loss_mlp": 1.03475368, + "epoch": 0.9233127912220052, + "flos": 14756664827520.0, + "grad_norm": 1.7464532837963378, + "language_loss": 0.60978168, + "learning_rate": 6.132784924695844e-08, + "loss": 0.63112092, + "num_input_tokens_seen": 331324885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 15357, + "time_per_iteration": 2.3971152305603027 + }, + { + "auxiliary_loss_clip": 0.0110148, + "auxiliary_loss_mlp": 0.01029837, + "balance_loss_clip": 1.01805329, + "balance_loss_mlp": 1.03382421, + "epoch": 0.9233729144746731, + "flos": 25261792162560.0, + "grad_norm": 1.834426423623626, + "language_loss": 0.69739604, + "learning_rate": 6.123218014662956e-08, + "loss": 0.71870929, + "num_input_tokens_seen": 331345885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.67578125, + "step": 15358, + "time_per_iteration": 2.5024566650390625 + }, + { + "auxiliary_loss_clip": 0.01099424, + "auxiliary_loss_mlp": 0.01029528, + "balance_loss_clip": 1.0182445, + "balance_loss_mlp": 1.03358769, + "epoch": 0.9234330377273411, + "flos": 27849837968640.0, + "grad_norm": 2.364215132336142, + "language_loss": 0.73011422, + "learning_rate": 6.113658456457104e-08, + "loss": 0.75140369, + "num_input_tokens_seen": 331364320, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 15359, + "time_per_iteration": 2.4889423847198486 + }, + { + "auxiliary_loss_clip": 0.01101489, + "auxiliary_loss_mlp": 0.01031117, + "balance_loss_clip": 1.01995301, + "balance_loss_mlp": 1.03558564, + "epoch": 0.923493160980009, + "flos": 24608361899520.0, + "grad_norm": 2.101856244679429, + "language_loss": 0.64447194, + "learning_rate": 6.104106250440732e-08, + "loss": 0.66579807, + "num_input_tokens_seen": 331384135, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 15360, + "time_per_iteration": 2.489089250564575 + }, + { + "auxiliary_loss_clip": 0.01021548, + "auxiliary_loss_mlp": 0.00995886, + "balance_loss_clip": 0.99485475, + "balance_loss_mlp": 1.00148213, + "epoch": 0.923553284232677, + "flos": 67700916558720.0, + "grad_norm": 0.7586585804998485, + "language_loss": 0.55154079, + "learning_rate": 6.094561396976083e-08, + "loss": 0.57171512, + "num_input_tokens_seen": 331440645, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 15361, + "time_per_iteration": 3.001129150390625 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01027298, + "balance_loss_clip": 1.01519871, + "balance_loss_mlp": 1.03305101, + "epoch": 0.9236134074853449, + "flos": 18807244704000.0, + "grad_norm": 1.6389059955723686, + "language_loss": 0.69725895, + "learning_rate": 6.085023896425112e-08, + "loss": 0.71853483, + "num_input_tokens_seen": 331459580, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15362, + "time_per_iteration": 2.4049232006073 + }, + { + "auxiliary_loss_clip": 0.0110233, + "auxiliary_loss_mlp": 0.01031802, + "balance_loss_clip": 1.01850414, + "balance_loss_mlp": 1.03439915, + "epoch": 0.923673530738013, + "flos": 27782362270080.0, + "grad_norm": 1.5321644685395488, + "language_loss": 0.7569198, + "learning_rate": 6.075493749149463e-08, + "loss": 0.77826107, + "num_input_tokens_seen": 331481560, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6796875, + "step": 15363, + "time_per_iteration": 2.5406601428985596 + }, + { + "auxiliary_loss_clip": 0.01099534, + "auxiliary_loss_mlp": 0.01027193, + "balance_loss_clip": 1.01585019, + "balance_loss_mlp": 1.03406906, + "epoch": 0.9237336539906809, + "flos": 26797117144320.0, + "grad_norm": 2.197001335564612, + "language_loss": 0.83133066, + "learning_rate": 6.065970955510514e-08, + "loss": 0.85259789, + "num_input_tokens_seen": 331499090, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15364, + "time_per_iteration": 2.4756619930267334 + }, + { + "auxiliary_loss_clip": 0.01097664, + "auxiliary_loss_mlp": 0.01026364, + "balance_loss_clip": 1.01561093, + "balance_loss_mlp": 1.03372884, + "epoch": 0.9237937772433489, + "flos": 23587708942080.0, + "grad_norm": 1.5348322828842351, + "language_loss": 0.67962128, + "learning_rate": 6.056455515869419e-08, + "loss": 0.70086157, + "num_input_tokens_seen": 331519420, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15365, + "time_per_iteration": 2.497309684753418 + }, + { + "auxiliary_loss_clip": 0.01100931, + "auxiliary_loss_mlp": 0.01030133, + "balance_loss_clip": 1.01817071, + "balance_loss_mlp": 1.03535795, + "epoch": 0.9238539004960168, + "flos": 26140562398080.0, + "grad_norm": 2.194169448976208, + "language_loss": 0.62673676, + "learning_rate": 6.046947430586913e-08, + "loss": 0.64804745, + "num_input_tokens_seen": 331538720, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15366, + "time_per_iteration": 2.5063443183898926 + }, + { + "auxiliary_loss_clip": 0.01099789, + "auxiliary_loss_mlp": 0.01026945, + "balance_loss_clip": 1.01484501, + "balance_loss_mlp": 1.03536332, + "epoch": 0.9239140237486848, + "flos": 21068000760960.0, + "grad_norm": 1.4208418043509794, + "language_loss": 0.74381047, + "learning_rate": 6.037446700023619e-08, + "loss": 0.76507783, + "num_input_tokens_seen": 331558505, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 15367, + "time_per_iteration": 2.4719345569610596 + }, + { + "auxiliary_loss_clip": 0.01094974, + "auxiliary_loss_mlp": 0.0103132, + "balance_loss_clip": 1.02102005, + "balance_loss_mlp": 1.03345788, + "epoch": 0.9239741470013527, + "flos": 24607930936320.0, + "grad_norm": 2.0793504009917445, + "language_loss": 0.64489555, + "learning_rate": 6.027953324539759e-08, + "loss": 0.66615844, + "num_input_tokens_seen": 331578440, + "router_z_loss_clip": 0.10302734, + "router_z_loss_mlp": 0.61328125, + "step": 15368, + "time_per_iteration": 2.4641342163085938 + }, + { + "auxiliary_loss_clip": 0.01102929, + "auxiliary_loss_mlp": 0.0102822, + "balance_loss_clip": 1.01631057, + "balance_loss_mlp": 1.03453827, + "epoch": 0.9240342702540207, + "flos": 24718248581760.0, + "grad_norm": 2.4636100553277895, + "language_loss": 0.74815971, + "learning_rate": 6.018467304495401e-08, + "loss": 0.76947117, + "num_input_tokens_seen": 331598945, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.68359375, + "step": 15369, + "time_per_iteration": 2.4689323902130127 + }, + { + "auxiliary_loss_clip": 0.01104162, + "auxiliary_loss_mlp": 0.01035184, + "balance_loss_clip": 1.02204156, + "balance_loss_mlp": 1.03590214, + "epoch": 0.9240943935066888, + "flos": 20849987162880.0, + "grad_norm": 1.8685080548034616, + "language_loss": 0.76351935, + "learning_rate": 6.008988640250145e-08, + "loss": 0.78491282, + "num_input_tokens_seen": 331616700, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15370, + "time_per_iteration": 2.422639846801758 + }, + { + "auxiliary_loss_clip": 0.01099737, + "auxiliary_loss_mlp": 0.01032145, + "balance_loss_clip": 1.02084386, + "balance_loss_mlp": 1.03397942, + "epoch": 0.9241545167593567, + "flos": 24462313200000.0, + "grad_norm": 2.4404674499916803, + "language_loss": 0.67358434, + "learning_rate": 5.999517332163528e-08, + "loss": 0.69490314, + "num_input_tokens_seen": 331635625, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15371, + "time_per_iteration": 2.4798216819763184 + }, + { + "auxiliary_loss_clip": 0.01021681, + "auxiliary_loss_mlp": 0.01000874, + "balance_loss_clip": 0.99990863, + "balance_loss_mlp": 1.00162053, + "epoch": 0.9242146400120247, + "flos": 61827259847040.0, + "grad_norm": 0.7221153992887761, + "language_loss": 0.57649028, + "learning_rate": 5.99005338059464e-08, + "loss": 0.59671581, + "num_input_tokens_seen": 331698595, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 15372, + "time_per_iteration": 3.0151007175445557 + }, + { + "auxiliary_loss_clip": 0.01099368, + "auxiliary_loss_mlp": 0.01030332, + "balance_loss_clip": 1.01962721, + "balance_loss_mlp": 1.03601837, + "epoch": 0.9242747632646926, + "flos": 22048397550720.0, + "grad_norm": 2.471421891520512, + "language_loss": 0.69785196, + "learning_rate": 5.98059678590237e-08, + "loss": 0.71914893, + "num_input_tokens_seen": 331717975, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15373, + "time_per_iteration": 2.436583995819092 + }, + { + "auxiliary_loss_clip": 0.01099684, + "auxiliary_loss_mlp": 0.01033575, + "balance_loss_clip": 1.02204728, + "balance_loss_mlp": 1.03429437, + "epoch": 0.9243348865173606, + "flos": 18478338842880.0, + "grad_norm": 2.1623197384255404, + "language_loss": 0.75304061, + "learning_rate": 5.971147548445299e-08, + "loss": 0.77437317, + "num_input_tokens_seen": 331737220, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 15374, + "time_per_iteration": 2.415738582611084 + }, + { + "auxiliary_loss_clip": 0.01101561, + "auxiliary_loss_mlp": 0.01031001, + "balance_loss_clip": 1.01978934, + "balance_loss_mlp": 1.03546357, + "epoch": 0.9243950097700285, + "flos": 23258767167360.0, + "grad_norm": 2.3489424736028974, + "language_loss": 0.64875305, + "learning_rate": 5.961705668581784e-08, + "loss": 0.67007864, + "num_input_tokens_seen": 331757300, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 15375, + "time_per_iteration": 2.4479691982269287 + }, + { + "auxiliary_loss_clip": 0.01099359, + "auxiliary_loss_mlp": 0.01031853, + "balance_loss_clip": 1.02046835, + "balance_loss_mlp": 1.03515816, + "epoch": 0.9244551330226966, + "flos": 29749081593600.0, + "grad_norm": 1.7783255869670582, + "language_loss": 0.66906196, + "learning_rate": 5.952271146669829e-08, + "loss": 0.69037414, + "num_input_tokens_seen": 331776995, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 15376, + "time_per_iteration": 2.4910011291503906 + }, + { + "auxiliary_loss_clip": 0.010219, + "auxiliary_loss_mlp": 0.01001278, + "balance_loss_clip": 1.00028849, + "balance_loss_mlp": 1.00179458, + "epoch": 0.9245152562753645, + "flos": 68864960609280.0, + "grad_norm": 0.6519597025269294, + "language_loss": 0.61160791, + "learning_rate": 5.94284398306717e-08, + "loss": 0.63183969, + "num_input_tokens_seen": 331845015, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20117188, + "step": 15377, + "time_per_iteration": 3.057742118835449 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01033463, + "balance_loss_clip": 1.02223337, + "balance_loss_mlp": 1.03419769, + "epoch": 0.9245753795280325, + "flos": 21579260993280.0, + "grad_norm": 1.8725219959605253, + "language_loss": 0.73735809, + "learning_rate": 5.933424178131341e-08, + "loss": 0.75868452, + "num_input_tokens_seen": 331862795, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15378, + "time_per_iteration": 2.425985097885132 + }, + { + "auxiliary_loss_clip": 0.011013, + "auxiliary_loss_mlp": 0.01030775, + "balance_loss_clip": 1.01888347, + "balance_loss_mlp": 1.03506637, + "epoch": 0.9246355027807004, + "flos": 34496077334400.0, + "grad_norm": 2.9917383599465364, + "language_loss": 0.62278056, + "learning_rate": 5.924011732219503e-08, + "loss": 0.64410132, + "num_input_tokens_seen": 331882535, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 15379, + "time_per_iteration": 2.557879686355591 + }, + { + "auxiliary_loss_clip": 0.0109822, + "auxiliary_loss_mlp": 0.0102624, + "balance_loss_clip": 1.01468313, + "balance_loss_mlp": 1.03472924, + "epoch": 0.9246956260333684, + "flos": 15953854152960.0, + "grad_norm": 2.134540215346882, + "language_loss": 0.83972025, + "learning_rate": 5.914606645688591e-08, + "loss": 0.86096483, + "num_input_tokens_seen": 331899335, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 15380, + "time_per_iteration": 2.4178035259246826 + }, + { + "auxiliary_loss_clip": 0.01101277, + "auxiliary_loss_mlp": 0.01033351, + "balance_loss_clip": 1.02084041, + "balance_loss_mlp": 1.03352189, + "epoch": 0.9247557492860363, + "flos": 23368366540800.0, + "grad_norm": 1.4769589190868633, + "language_loss": 0.73472691, + "learning_rate": 5.905208918895233e-08, + "loss": 0.75607318, + "num_input_tokens_seen": 331919030, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 15381, + "time_per_iteration": 2.455674171447754 + }, + { + "auxiliary_loss_clip": 0.01100221, + "auxiliary_loss_mlp": 0.01028933, + "balance_loss_clip": 1.01783454, + "balance_loss_mlp": 1.03460169, + "epoch": 0.9248158725387043, + "flos": 23039855729280.0, + "grad_norm": 1.6990719448021085, + "language_loss": 0.78354275, + "learning_rate": 5.8958185521958524e-08, + "loss": 0.80483425, + "num_input_tokens_seen": 331936465, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15382, + "time_per_iteration": 2.4381918907165527 + }, + { + "auxiliary_loss_clip": 0.01099044, + "auxiliary_loss_mlp": 0.01031867, + "balance_loss_clip": 1.01986253, + "balance_loss_mlp": 1.03293002, + "epoch": 0.9248759957913724, + "flos": 22522418357760.0, + "grad_norm": 1.7957184237375154, + "language_loss": 0.74939609, + "learning_rate": 5.886435545946455e-08, + "loss": 0.77070516, + "num_input_tokens_seen": 331954625, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 15383, + "time_per_iteration": 2.4508137702941895 + }, + { + "auxiliary_loss_clip": 0.01095389, + "auxiliary_loss_mlp": 0.01026924, + "balance_loss_clip": 1.01564097, + "balance_loss_mlp": 1.0316422, + "epoch": 0.9249361190440403, + "flos": 25447271016960.0, + "grad_norm": 1.7969002247576855, + "language_loss": 0.75541508, + "learning_rate": 5.8770599005028456e-08, + "loss": 0.77663815, + "num_input_tokens_seen": 331975865, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 15384, + "time_per_iteration": 2.506045341491699 + }, + { + "auxiliary_loss_clip": 0.01095577, + "auxiliary_loss_mlp": 0.01030739, + "balance_loss_clip": 1.01920581, + "balance_loss_mlp": 1.03235722, + "epoch": 0.9249962422967083, + "flos": 12378623886720.0, + "grad_norm": 1.8321722340960027, + "language_loss": 0.66197598, + "learning_rate": 5.8676916162206045e-08, + "loss": 0.6832391, + "num_input_tokens_seen": 331992760, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 15385, + "time_per_iteration": 2.413760185241699 + }, + { + "auxiliary_loss_clip": 0.01097285, + "auxiliary_loss_mlp": 0.01029213, + "balance_loss_clip": 1.01847816, + "balance_loss_mlp": 1.03289402, + "epoch": 0.9250563655493762, + "flos": 22929430343040.0, + "grad_norm": 1.8037603173155325, + "language_loss": 0.80537152, + "learning_rate": 5.85833069345496e-08, + "loss": 0.82663649, + "num_input_tokens_seen": 332011890, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 15386, + "time_per_iteration": 2.4500980377197266 + }, + { + "auxiliary_loss_clip": 0.01098949, + "auxiliary_loss_mlp": 0.01037808, + "balance_loss_clip": 1.02584577, + "balance_loss_mlp": 1.03501868, + "epoch": 0.9251164888020442, + "flos": 18478662065280.0, + "grad_norm": 1.5966888283815128, + "language_loss": 0.75251609, + "learning_rate": 5.8489771325608504e-08, + "loss": 0.7738837, + "num_input_tokens_seen": 332029485, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 15387, + "time_per_iteration": 3.821263551712036 + }, + { + "auxiliary_loss_clip": 0.01096172, + "auxiliary_loss_mlp": 0.01031177, + "balance_loss_clip": 1.02039409, + "balance_loss_mlp": 1.03313661, + "epoch": 0.9251766120547121, + "flos": 33037062796800.0, + "grad_norm": 1.5110299147108328, + "language_loss": 0.70130134, + "learning_rate": 5.839630933893014e-08, + "loss": 0.72257483, + "num_input_tokens_seen": 332052970, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 15388, + "time_per_iteration": 3.967625141143799 + }, + { + "auxiliary_loss_clip": 0.01100941, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.01554775, + "balance_loss_mlp": 1.03401148, + "epoch": 0.9252367353073802, + "flos": 24387906176640.0, + "grad_norm": 1.703661518442703, + "language_loss": 0.818995, + "learning_rate": 5.8302920978058115e-08, + "loss": 0.84027576, + "num_input_tokens_seen": 332070395, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 15389, + "time_per_iteration": 3.870365858078003 + }, + { + "auxiliary_loss_clip": 0.01107938, + "auxiliary_loss_mlp": 0.01030221, + "balance_loss_clip": 1.01763237, + "balance_loss_mlp": 1.03723955, + "epoch": 0.9252968585600481, + "flos": 18916844077440.0, + "grad_norm": 1.8133156840491251, + "language_loss": 0.7921918, + "learning_rate": 5.820960624653381e-08, + "loss": 0.81357348, + "num_input_tokens_seen": 332090185, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.70703125, + "step": 15390, + "time_per_iteration": 2.43095064163208 + }, + { + "auxiliary_loss_clip": 0.01101708, + "auxiliary_loss_mlp": 0.01039397, + "balance_loss_clip": 1.02741694, + "balance_loss_mlp": 1.03465974, + "epoch": 0.9253569818127161, + "flos": 21725345606400.0, + "grad_norm": 1.7286772201518952, + "language_loss": 0.75258297, + "learning_rate": 5.811636514789597e-08, + "loss": 0.77399403, + "num_input_tokens_seen": 332109050, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15391, + "time_per_iteration": 2.41867995262146 + }, + { + "auxiliary_loss_clip": 0.01098225, + "auxiliary_loss_mlp": 0.01030171, + "balance_loss_clip": 1.01740921, + "balance_loss_mlp": 1.03291667, + "epoch": 0.925417105065384, + "flos": 34240357434240.0, + "grad_norm": 2.128188979527296, + "language_loss": 0.52005279, + "learning_rate": 5.80231976856802e-08, + "loss": 0.54133677, + "num_input_tokens_seen": 332131180, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65234375, + "step": 15392, + "time_per_iteration": 2.5652174949645996 + }, + { + "auxiliary_loss_clip": 0.01097761, + "auxiliary_loss_mlp": 0.01027276, + "balance_loss_clip": 1.01597524, + "balance_loss_mlp": 1.03198981, + "epoch": 0.925477228318052, + "flos": 25959536830080.0, + "grad_norm": 1.964965624188704, + "language_loss": 0.77008653, + "learning_rate": 5.7930103863419454e-08, + "loss": 0.79133701, + "num_input_tokens_seen": 332149555, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15393, + "time_per_iteration": 3.954613447189331 + }, + { + "auxiliary_loss_clip": 0.01096999, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01977193, + "balance_loss_mlp": 1.03259718, + "epoch": 0.9255373515707199, + "flos": 11838240702720.0, + "grad_norm": 1.7627839503493286, + "language_loss": 0.69385219, + "learning_rate": 5.783708368464357e-08, + "loss": 0.71513402, + "num_input_tokens_seen": 332165830, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 15394, + "time_per_iteration": 2.4000730514526367 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.01026765, + "balance_loss_clip": 1.01555324, + "balance_loss_mlp": 1.03456926, + "epoch": 0.925597474823388, + "flos": 21434325615360.0, + "grad_norm": 1.7473116250665182, + "language_loss": 0.72601545, + "learning_rate": 5.7744137152879956e-08, + "loss": 0.74728626, + "num_input_tokens_seen": 332185130, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 15395, + "time_per_iteration": 2.468665361404419 + }, + { + "auxiliary_loss_clip": 0.01094627, + "auxiliary_loss_mlp": 0.01029581, + "balance_loss_clip": 1.01909065, + "balance_loss_mlp": 1.03195882, + "epoch": 0.925657598076056, + "flos": 22857573185280.0, + "grad_norm": 2.211482629648121, + "language_loss": 0.71316254, + "learning_rate": 5.7651264271653785e-08, + "loss": 0.73440462, + "num_input_tokens_seen": 332203695, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 15396, + "time_per_iteration": 2.437075614929199 + }, + { + "auxiliary_loss_clip": 0.01097691, + "auxiliary_loss_mlp": 0.01025515, + "balance_loss_clip": 1.0139879, + "balance_loss_mlp": 1.03351045, + "epoch": 0.9257177213287239, + "flos": 25704032411520.0, + "grad_norm": 1.5907781681745499, + "language_loss": 0.8724966, + "learning_rate": 5.755846504448603e-08, + "loss": 0.89372873, + "num_input_tokens_seen": 332224850, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15397, + "time_per_iteration": 2.497161865234375 + }, + { + "auxiliary_loss_clip": 0.01021793, + "auxiliary_loss_mlp": 0.01004483, + "balance_loss_clip": 1.00352311, + "balance_loss_mlp": 1.00168765, + "epoch": 0.9257778445813919, + "flos": 59592933221760.0, + "grad_norm": 0.821738619907706, + "language_loss": 0.55149096, + "learning_rate": 5.746573947489586e-08, + "loss": 0.57175368, + "num_input_tokens_seen": 332278085, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20117188, + "step": 15398, + "time_per_iteration": 2.942495107650757 + }, + { + "auxiliary_loss_clip": 0.01104006, + "auxiliary_loss_mlp": 0.01027248, + "balance_loss_clip": 1.0141114, + "balance_loss_mlp": 1.0344249, + "epoch": 0.9258379678340598, + "flos": 27709427704320.0, + "grad_norm": 1.9427050016588183, + "language_loss": 0.75920027, + "learning_rate": 5.7373087566400025e-08, + "loss": 0.78051281, + "num_input_tokens_seen": 332297875, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6953125, + "step": 15399, + "time_per_iteration": 2.491990804672241 + }, + { + "auxiliary_loss_clip": 0.01093745, + "auxiliary_loss_mlp": 0.01027786, + "balance_loss_clip": 1.01742709, + "balance_loss_mlp": 1.0315423, + "epoch": 0.9258980910867278, + "flos": 24863543095680.0, + "grad_norm": 1.5984703912789273, + "language_loss": 0.78239942, + "learning_rate": 5.7280509322510826e-08, + "loss": 0.80361474, + "num_input_tokens_seen": 332318500, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 15400, + "time_per_iteration": 2.4736785888671875 + }, + { + "auxiliary_loss_clip": 0.01021709, + "auxiliary_loss_mlp": 0.00998028, + "balance_loss_clip": 0.99707454, + "balance_loss_mlp": 1.00160527, + "epoch": 0.9259582143393957, + "flos": 63134587249920.0, + "grad_norm": 0.731962220549327, + "language_loss": 0.51344585, + "learning_rate": 5.718800474673946e-08, + "loss": 0.53364325, + "num_input_tokens_seen": 332381980, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15401, + "time_per_iteration": 3.0090858936309814 + }, + { + "auxiliary_loss_clip": 0.01096088, + "auxiliary_loss_mlp": 0.01031444, + "balance_loss_clip": 1.02058387, + "balance_loss_mlp": 1.03388858, + "epoch": 0.9260183375920638, + "flos": 24127122458880.0, + "grad_norm": 1.6172522790152049, + "language_loss": 0.8218559, + "learning_rate": 5.709557384259378e-08, + "loss": 0.8431313, + "num_input_tokens_seen": 332399510, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62109375, + "step": 15402, + "time_per_iteration": 2.4477603435516357 + }, + { + "auxiliary_loss_clip": 0.01021801, + "auxiliary_loss_mlp": 0.01002843, + "balance_loss_clip": 1.00190759, + "balance_loss_mlp": 1.0017004, + "epoch": 0.9260784608447317, + "flos": 63042872849280.0, + "grad_norm": 0.7718721766171598, + "language_loss": 0.5109669, + "learning_rate": 5.700321661357876e-08, + "loss": 0.53121334, + "num_input_tokens_seen": 332459130, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 15403, + "time_per_iteration": 3.1244301795959473 + }, + { + "auxiliary_loss_clip": 0.01021692, + "auxiliary_loss_mlp": 0.00998434, + "balance_loss_clip": 0.99748039, + "balance_loss_mlp": 1.00152445, + "epoch": 0.9261385840973997, + "flos": 70585979927040.0, + "grad_norm": 0.6837569550075934, + "language_loss": 0.58685899, + "learning_rate": 5.69109330631965e-08, + "loss": 0.60706019, + "num_input_tokens_seen": 332526555, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20117188, + "step": 15404, + "time_per_iteration": 3.094059705734253 + }, + { + "auxiliary_loss_clip": 0.01100562, + "auxiliary_loss_mlp": 0.01031398, + "balance_loss_clip": 1.01923847, + "balance_loss_mlp": 1.03395927, + "epoch": 0.9261987073500676, + "flos": 20229917656320.0, + "grad_norm": 1.9953628404049057, + "language_loss": 0.71774006, + "learning_rate": 5.681872319494596e-08, + "loss": 0.73905957, + "num_input_tokens_seen": 332544005, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15405, + "time_per_iteration": 2.4240143299102783 + }, + { + "auxiliary_loss_clip": 0.01102412, + "auxiliary_loss_mlp": 0.01035822, + "balance_loss_clip": 1.02343011, + "balance_loss_mlp": 1.03474975, + "epoch": 0.9262588306027356, + "flos": 20954163582720.0, + "grad_norm": 1.7576607341023662, + "language_loss": 0.68750131, + "learning_rate": 5.672658701232458e-08, + "loss": 0.7088837, + "num_input_tokens_seen": 332563070, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 15406, + "time_per_iteration": 2.466527223587036 + }, + { + "auxiliary_loss_clip": 0.01101722, + "auxiliary_loss_mlp": 0.01034883, + "balance_loss_clip": 1.0222826, + "balance_loss_mlp": 1.03555775, + "epoch": 0.9263189538554035, + "flos": 22158679282560.0, + "grad_norm": 2.7752973629401856, + "language_loss": 0.76403785, + "learning_rate": 5.663452451882555e-08, + "loss": 0.78540385, + "num_input_tokens_seen": 332579620, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66015625, + "step": 15407, + "time_per_iteration": 2.4367871284484863 + }, + { + "auxiliary_loss_clip": 0.01103324, + "auxiliary_loss_mlp": 0.01038757, + "balance_loss_clip": 1.02655041, + "balance_loss_mlp": 1.03313541, + "epoch": 0.9263790771080715, + "flos": 18187211111040.0, + "grad_norm": 3.383099092597422, + "language_loss": 0.72512782, + "learning_rate": 5.6542535717940096e-08, + "loss": 0.74654853, + "num_input_tokens_seen": 332597795, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 15408, + "time_per_iteration": 2.430126667022705 + }, + { + "auxiliary_loss_clip": 0.01098119, + "auxiliary_loss_mlp": 0.01028103, + "balance_loss_clip": 1.01819086, + "balance_loss_mlp": 1.03438425, + "epoch": 0.9264392003607396, + "flos": 48178545004800.0, + "grad_norm": 1.697930797649794, + "language_loss": 0.68514466, + "learning_rate": 5.645062061315675e-08, + "loss": 0.70640695, + "num_input_tokens_seen": 332620375, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.63671875, + "step": 15409, + "time_per_iteration": 2.662263870239258 + }, + { + "auxiliary_loss_clip": 0.01101595, + "auxiliary_loss_mlp": 0.01029904, + "balance_loss_clip": 1.01756573, + "balance_loss_mlp": 1.03554535, + "epoch": 0.9264993236134075, + "flos": 26389458714240.0, + "grad_norm": 2.2641730930101724, + "language_loss": 0.75665075, + "learning_rate": 5.6358779207960506e-08, + "loss": 0.77796578, + "num_input_tokens_seen": 332639510, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 15410, + "time_per_iteration": 2.495643138885498 + }, + { + "auxiliary_loss_clip": 0.01099727, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.01559091, + "balance_loss_mlp": 1.0341475, + "epoch": 0.9265594468660755, + "flos": 20920084554240.0, + "grad_norm": 1.541705061344522, + "language_loss": 0.82224798, + "learning_rate": 5.6267011505833905e-08, + "loss": 0.84351254, + "num_input_tokens_seen": 332658350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15411, + "time_per_iteration": 2.437490940093994 + }, + { + "auxiliary_loss_clip": 0.01104354, + "auxiliary_loss_mlp": 0.01034582, + "balance_loss_clip": 1.02319741, + "balance_loss_mlp": 1.03806257, + "epoch": 0.9266195701187434, + "flos": 17525017929600.0, + "grad_norm": 2.409965861262806, + "language_loss": 0.75620615, + "learning_rate": 5.617531751025728e-08, + "loss": 0.77759552, + "num_input_tokens_seen": 332676715, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 15412, + "time_per_iteration": 2.4860680103302 + }, + { + "auxiliary_loss_clip": 0.01096944, + "auxiliary_loss_mlp": 0.01028884, + "balance_loss_clip": 1.01736248, + "balance_loss_mlp": 1.03194141, + "epoch": 0.9266796933714114, + "flos": 33688733293440.0, + "grad_norm": 2.743723110858746, + "language_loss": 0.66987592, + "learning_rate": 5.6083697224707406e-08, + "loss": 0.69113421, + "num_input_tokens_seen": 332701470, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 15413, + "time_per_iteration": 2.5412187576293945 + }, + { + "auxiliary_loss_clip": 0.01100923, + "auxiliary_loss_mlp": 0.01030446, + "balance_loss_clip": 1.01838839, + "balance_loss_mlp": 1.03374481, + "epoch": 0.9267398166240793, + "flos": 18916520855040.0, + "grad_norm": 1.8837967229167019, + "language_loss": 0.76128107, + "learning_rate": 5.5992150652658167e-08, + "loss": 0.78259474, + "num_input_tokens_seen": 332719060, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 15414, + "time_per_iteration": 2.435417413711548 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01029505, + "balance_loss_clip": 1.01819217, + "balance_loss_mlp": 1.03404093, + "epoch": 0.9267999398767474, + "flos": 20478957626880.0, + "grad_norm": 1.974785668209935, + "language_loss": 0.8150264, + "learning_rate": 5.59006777975819e-08, + "loss": 0.83630508, + "num_input_tokens_seen": 332736345, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 15415, + "time_per_iteration": 2.436947822570801 + }, + { + "auxiliary_loss_clip": 0.01100241, + "auxiliary_loss_mlp": 0.01032005, + "balance_loss_clip": 1.02014947, + "balance_loss_mlp": 1.03393376, + "epoch": 0.9268600631294153, + "flos": 24789351553920.0, + "grad_norm": 1.4430461922371247, + "language_loss": 0.54157484, + "learning_rate": 5.580927866294671e-08, + "loss": 0.56289732, + "num_input_tokens_seen": 332756270, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 15416, + "time_per_iteration": 2.482398509979248 + }, + { + "auxiliary_loss_clip": 0.0109764, + "auxiliary_loss_mlp": 0.01030069, + "balance_loss_clip": 1.01903081, + "balance_loss_mlp": 1.03377366, + "epoch": 0.9269201863820833, + "flos": 18697178453760.0, + "grad_norm": 1.575334838653751, + "language_loss": 0.72061193, + "learning_rate": 5.571795325221807e-08, + "loss": 0.74188906, + "num_input_tokens_seen": 332775185, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15417, + "time_per_iteration": 2.421722173690796 + }, + { + "auxiliary_loss_clip": 0.01099529, + "auxiliary_loss_mlp": 0.01029786, + "balance_loss_clip": 1.01774013, + "balance_loss_mlp": 1.03482032, + "epoch": 0.9269803096347512, + "flos": 20923999136640.0, + "grad_norm": 2.1097210915525206, + "language_loss": 0.75657284, + "learning_rate": 5.5626701568859624e-08, + "loss": 0.77786595, + "num_input_tokens_seen": 332794320, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 15418, + "time_per_iteration": 2.4550986289978027 + }, + { + "auxiliary_loss_clip": 0.01097568, + "auxiliary_loss_mlp": 0.01028158, + "balance_loss_clip": 1.0163269, + "balance_loss_mlp": 1.03324318, + "epoch": 0.9270404328874192, + "flos": 28002710252160.0, + "grad_norm": 1.4450402960819761, + "language_loss": 0.76005769, + "learning_rate": 5.553552361633174e-08, + "loss": 0.78131491, + "num_input_tokens_seen": 332818095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 15419, + "time_per_iteration": 2.5159225463867188 + }, + { + "auxiliary_loss_clip": 0.01094814, + "auxiliary_loss_mlp": 0.01032298, + "balance_loss_clip": 1.02189064, + "balance_loss_mlp": 1.03209281, + "epoch": 0.9271005561400871, + "flos": 25889870401920.0, + "grad_norm": 2.0477489170526586, + "language_loss": 0.75719529, + "learning_rate": 5.5444419398091636e-08, + "loss": 0.77846634, + "num_input_tokens_seen": 332839860, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62890625, + "step": 15420, + "time_per_iteration": 2.5263941287994385 + }, + { + "auxiliary_loss_clip": 0.01102072, + "auxiliary_loss_mlp": 0.01030859, + "balance_loss_clip": 1.01880121, + "balance_loss_mlp": 1.03423715, + "epoch": 0.9271606793927551, + "flos": 27053914452480.0, + "grad_norm": 1.459013416866959, + "language_loss": 0.76789546, + "learning_rate": 5.535338891759389e-08, + "loss": 0.78922474, + "num_input_tokens_seen": 332861155, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 15421, + "time_per_iteration": 2.4908981323242188 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.01030277, + "balance_loss_clip": 1.01896989, + "balance_loss_mlp": 1.0345453, + "epoch": 0.9272208026454232, + "flos": 26209869690240.0, + "grad_norm": 1.9925189154946077, + "language_loss": 0.7272985, + "learning_rate": 5.526243217829041e-08, + "loss": 0.7485981, + "num_input_tokens_seen": 332881110, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 15422, + "time_per_iteration": 2.504379987716675 + }, + { + "auxiliary_loss_clip": 0.01102396, + "auxiliary_loss_mlp": 0.01034198, + "balance_loss_clip": 1.02169275, + "balance_loss_mlp": 1.03490949, + "epoch": 0.9272809258980911, + "flos": 12458453863680.0, + "grad_norm": 1.894268448401904, + "language_loss": 0.77302563, + "learning_rate": 5.517154918363065e-08, + "loss": 0.79439163, + "num_input_tokens_seen": 332899350, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 15423, + "time_per_iteration": 2.4350104331970215 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.0102826, + "balance_loss_clip": 1.01604676, + "balance_loss_mlp": 1.03420961, + "epoch": 0.9273410491507591, + "flos": 22856890826880.0, + "grad_norm": 1.646588555304309, + "language_loss": 0.75237334, + "learning_rate": 5.508073993706053e-08, + "loss": 0.77366608, + "num_input_tokens_seen": 332918105, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15424, + "time_per_iteration": 2.527062177658081 + }, + { + "auxiliary_loss_clip": 0.01021636, + "auxiliary_loss_mlp": 0.01000835, + "balance_loss_clip": 0.99980974, + "balance_loss_mlp": 1.00167572, + "epoch": 0.927401172403427, + "flos": 47665384329600.0, + "grad_norm": 0.7788753343598831, + "language_loss": 0.60629737, + "learning_rate": 5.499000444202351e-08, + "loss": 0.62652206, + "num_input_tokens_seen": 332969490, + "router_z_loss_clip": 0.01025391, + "router_z_loss_mlp": 0.19921875, + "step": 15425, + "time_per_iteration": 2.8316895961761475 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01032201, + "balance_loss_clip": 1.02046478, + "balance_loss_mlp": 1.03510499, + "epoch": 0.927461295656095, + "flos": 29972374490880.0, + "grad_norm": 4.846350561223134, + "language_loss": 0.70709521, + "learning_rate": 5.489934270196106e-08, + "loss": 0.72842896, + "num_input_tokens_seen": 332988805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.66015625, + "step": 15426, + "time_per_iteration": 2.533698797225952 + }, + { + "auxiliary_loss_clip": 0.01099514, + "auxiliary_loss_mlp": 0.01024527, + "balance_loss_clip": 1.01384544, + "balance_loss_mlp": 1.0349983, + "epoch": 0.9275214189087629, + "flos": 20375427651840.0, + "grad_norm": 1.7589746620636957, + "language_loss": 0.82876408, + "learning_rate": 5.480875472030977e-08, + "loss": 0.85000449, + "num_input_tokens_seen": 333007960, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.64453125, + "step": 15427, + "time_per_iteration": 2.4352564811706543 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.01035207, + "balance_loss_clip": 1.02391815, + "balance_loss_mlp": 1.03641236, + "epoch": 0.927581542161431, + "flos": 22383193242240.0, + "grad_norm": 1.5057993286553948, + "language_loss": 0.76877588, + "learning_rate": 5.471824050050555e-08, + "loss": 0.79014242, + "num_input_tokens_seen": 333026035, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15428, + "time_per_iteration": 3.8693413734436035 + }, + { + "auxiliary_loss_clip": 0.01096742, + "auxiliary_loss_mlp": 0.01032046, + "balance_loss_clip": 1.02022672, + "balance_loss_mlp": 1.03172433, + "epoch": 0.9276416654140989, + "flos": 23952453598080.0, + "grad_norm": 2.0403868846760447, + "language_loss": 0.74666828, + "learning_rate": 5.4627800045980555e-08, + "loss": 0.7679562, + "num_input_tokens_seen": 333045590, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15429, + "time_per_iteration": 2.4468398094177246 + }, + { + "auxiliary_loss_clip": 0.01097637, + "auxiliary_loss_mlp": 0.01032278, + "balance_loss_clip": 1.02094138, + "balance_loss_mlp": 1.03397834, + "epoch": 0.9277017886667669, + "flos": 13917719796480.0, + "grad_norm": 1.7045835703544736, + "language_loss": 0.74889922, + "learning_rate": 5.45374333601647e-08, + "loss": 0.77019835, + "num_input_tokens_seen": 333063355, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 15430, + "time_per_iteration": 3.9206631183624268 + }, + { + "auxiliary_loss_clip": 0.01098985, + "auxiliary_loss_mlp": 0.01030676, + "balance_loss_clip": 1.01789057, + "balance_loss_mlp": 1.03349423, + "epoch": 0.9277619119194348, + "flos": 35666478092160.0, + "grad_norm": 1.3411362668102724, + "language_loss": 0.76195765, + "learning_rate": 5.444714044648391e-08, + "loss": 0.78325427, + "num_input_tokens_seen": 333088045, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.65625, + "step": 15431, + "time_per_iteration": 3.985266923904419 + }, + { + "auxiliary_loss_clip": 0.01097878, + "auxiliary_loss_mlp": 0.01024488, + "balance_loss_clip": 1.01342511, + "balance_loss_mlp": 1.03424621, + "epoch": 0.9278220351721028, + "flos": 23841238112640.0, + "grad_norm": 1.6017729259542908, + "language_loss": 0.70828962, + "learning_rate": 5.4356921308363e-08, + "loss": 0.72951329, + "num_input_tokens_seen": 333108005, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.63671875, + "step": 15432, + "time_per_iteration": 2.4481770992279053 + }, + { + "auxiliary_loss_clip": 0.01101221, + "auxiliary_loss_mlp": 0.01029967, + "balance_loss_clip": 1.01845121, + "balance_loss_mlp": 1.03413773, + "epoch": 0.9278821584247707, + "flos": 15228135768960.0, + "grad_norm": 2.1590523070587095, + "language_loss": 0.82312065, + "learning_rate": 5.4266775949222354e-08, + "loss": 0.84443253, + "num_input_tokens_seen": 333124335, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.671875, + "step": 15433, + "time_per_iteration": 2.3907063007354736 + }, + { + "auxiliary_loss_clip": 0.01096167, + "auxiliary_loss_mlp": 0.0102545, + "balance_loss_clip": 1.01522768, + "balance_loss_mlp": 1.03443766, + "epoch": 0.9279422816774388, + "flos": 24681404206080.0, + "grad_norm": 1.7403608716892394, + "language_loss": 0.66221195, + "learning_rate": 5.417670437248056e-08, + "loss": 0.68342805, + "num_input_tokens_seen": 333143995, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.6171875, + "step": 15434, + "time_per_iteration": 2.459033250808716 + }, + { + "auxiliary_loss_clip": 0.01093673, + "auxiliary_loss_mlp": 0.01027627, + "balance_loss_clip": 1.01683831, + "balance_loss_mlp": 1.03230667, + "epoch": 0.9280024049301068, + "flos": 19169188099200.0, + "grad_norm": 1.8005736913066748, + "language_loss": 0.6873616, + "learning_rate": 5.40867065815529e-08, + "loss": 0.70857459, + "num_input_tokens_seen": 333162805, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.61328125, + "step": 15435, + "time_per_iteration": 3.9342024326324463 + }, + { + "auxiliary_loss_clip": 0.01099245, + "auxiliary_loss_mlp": 0.01027343, + "balance_loss_clip": 1.01565433, + "balance_loss_mlp": 1.03373933, + "epoch": 0.9280625281827747, + "flos": 11393701983360.0, + "grad_norm": 2.020400510529268, + "language_loss": 0.72055352, + "learning_rate": 5.399678257985263e-08, + "loss": 0.74181938, + "num_input_tokens_seen": 333175770, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 15436, + "time_per_iteration": 2.405715227127075 + }, + { + "auxiliary_loss_clip": 0.01098664, + "auxiliary_loss_mlp": 0.01028576, + "balance_loss_clip": 1.0170064, + "balance_loss_mlp": 1.03391147, + "epoch": 0.9281226514354427, + "flos": 24785616539520.0, + "grad_norm": 2.091605034726952, + "language_loss": 0.67294556, + "learning_rate": 5.390693237078925e-08, + "loss": 0.69421792, + "num_input_tokens_seen": 333194775, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15437, + "time_per_iteration": 2.4648404121398926 + }, + { + "auxiliary_loss_clip": 0.01102898, + "auxiliary_loss_mlp": 0.010311, + "balance_loss_clip": 1.01868415, + "balance_loss_mlp": 1.03452563, + "epoch": 0.9281827746881106, + "flos": 15083128563840.0, + "grad_norm": 2.0966698400336896, + "language_loss": 0.71116936, + "learning_rate": 5.3817155957770254e-08, + "loss": 0.73250937, + "num_input_tokens_seen": 333208920, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15438, + "time_per_iteration": 2.413299798965454 + }, + { + "auxiliary_loss_clip": 0.01100161, + "auxiliary_loss_mlp": 0.0103004, + "balance_loss_clip": 1.01855445, + "balance_loss_mlp": 1.03494883, + "epoch": 0.9282428979407786, + "flos": 24135059364480.0, + "grad_norm": 1.7797926756037903, + "language_loss": 0.64633286, + "learning_rate": 5.3727453344199366e-08, + "loss": 0.66763484, + "num_input_tokens_seen": 333229350, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 15439, + "time_per_iteration": 2.461437225341797 + }, + { + "auxiliary_loss_clip": 0.01099241, + "auxiliary_loss_mlp": 0.01027306, + "balance_loss_clip": 1.01593935, + "balance_loss_mlp": 1.03433597, + "epoch": 0.9283030211934465, + "flos": 24823215100800.0, + "grad_norm": 2.3253204491475885, + "language_loss": 0.7027396, + "learning_rate": 5.363782453347876e-08, + "loss": 0.7240051, + "num_input_tokens_seen": 333246125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15440, + "time_per_iteration": 2.4703500270843506 + }, + { + "auxiliary_loss_clip": 0.01102231, + "auxiliary_loss_mlp": 0.01035759, + "balance_loss_clip": 1.02361774, + "balance_loss_mlp": 1.03502834, + "epoch": 0.9283631444461146, + "flos": 23981037845760.0, + "grad_norm": 1.612587753570518, + "language_loss": 0.76855183, + "learning_rate": 5.354826952900682e-08, + "loss": 0.78993171, + "num_input_tokens_seen": 333263685, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 15441, + "time_per_iteration": 2.527451515197754 + }, + { + "auxiliary_loss_clip": 0.01094431, + "auxiliary_loss_mlp": 0.01026501, + "balance_loss_clip": 1.01657677, + "balance_loss_mlp": 1.03307748, + "epoch": 0.9284232676987825, + "flos": 22784530878720.0, + "grad_norm": 1.90495745755495, + "language_loss": 0.64267159, + "learning_rate": 5.345878833417949e-08, + "loss": 0.66388088, + "num_input_tokens_seen": 333282435, + "router_z_loss_clip": 0.09912109, + "router_z_loss_mlp": 0.61328125, + "step": 15442, + "time_per_iteration": 2.4639720916748047 + }, + { + "auxiliary_loss_clip": 0.01102164, + "auxiliary_loss_mlp": 0.01034514, + "balance_loss_clip": 1.02279592, + "balance_loss_mlp": 1.03435051, + "epoch": 0.9284833909514505, + "flos": 19500500171520.0, + "grad_norm": 1.7797907692602184, + "language_loss": 0.80536753, + "learning_rate": 5.3369380952390295e-08, + "loss": 0.8267343, + "num_input_tokens_seen": 333300400, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 15443, + "time_per_iteration": 2.427996873855591 + }, + { + "auxiliary_loss_clip": 0.01099981, + "auxiliary_loss_mlp": 0.01029047, + "balance_loss_clip": 1.01735878, + "balance_loss_mlp": 1.03426063, + "epoch": 0.9285435142041184, + "flos": 23185976256000.0, + "grad_norm": 1.8653829625255551, + "language_loss": 0.65230483, + "learning_rate": 5.328004738702896e-08, + "loss": 0.67359507, + "num_input_tokens_seen": 333318980, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15444, + "time_per_iteration": 2.46578049659729 + }, + { + "auxiliary_loss_clip": 0.01098711, + "auxiliary_loss_mlp": 0.01028903, + "balance_loss_clip": 1.01768517, + "balance_loss_mlp": 1.03293288, + "epoch": 0.9286036374567864, + "flos": 17675519915520.0, + "grad_norm": 1.8271492259739264, + "language_loss": 0.73367989, + "learning_rate": 5.3190787641483215e-08, + "loss": 0.75495601, + "num_input_tokens_seen": 333334135, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15445, + "time_per_iteration": 2.4109835624694824 + }, + { + "auxiliary_loss_clip": 0.0110117, + "auxiliary_loss_mlp": 0.01033091, + "balance_loss_clip": 1.02127099, + "balance_loss_mlp": 1.03590298, + "epoch": 0.9286637607094543, + "flos": 20886687884160.0, + "grad_norm": 1.5979727585178083, + "language_loss": 0.71089745, + "learning_rate": 5.3101601719138135e-08, + "loss": 0.73224002, + "num_input_tokens_seen": 333353325, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15446, + "time_per_iteration": 2.462587594985962 + }, + { + "auxiliary_loss_clip": 0.01102949, + "auxiliary_loss_mlp": 0.01028508, + "balance_loss_clip": 1.01671267, + "balance_loss_mlp": 1.03485143, + "epoch": 0.9287238839621224, + "flos": 19026012487680.0, + "grad_norm": 1.9850834781721192, + "language_loss": 0.69447434, + "learning_rate": 5.301248962337523e-08, + "loss": 0.7157889, + "num_input_tokens_seen": 333371110, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 15447, + "time_per_iteration": 2.427091598510742 + }, + { + "auxiliary_loss_clip": 0.01093107, + "auxiliary_loss_mlp": 0.0102558, + "balance_loss_clip": 1.01545882, + "balance_loss_mlp": 1.03282893, + "epoch": 0.9287840072147904, + "flos": 20557027837440.0, + "grad_norm": 1.6048988598173843, + "language_loss": 0.72284281, + "learning_rate": 5.292345135757403e-08, + "loss": 0.74402964, + "num_input_tokens_seen": 333391420, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6015625, + "step": 15448, + "time_per_iteration": 2.4651074409484863 + }, + { + "auxiliary_loss_clip": 0.01099744, + "auxiliary_loss_mlp": 0.01027458, + "balance_loss_clip": 1.01506054, + "balance_loss_mlp": 1.03431988, + "epoch": 0.9288441304674583, + "flos": 21250822008960.0, + "grad_norm": 1.5859141437991187, + "language_loss": 0.73905832, + "learning_rate": 5.283448692511072e-08, + "loss": 0.76033032, + "num_input_tokens_seen": 333410365, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.65625, + "step": 15449, + "time_per_iteration": 2.4365196228027344 + }, + { + "auxiliary_loss_clip": 0.0109955, + "auxiliary_loss_mlp": 0.01025285, + "balance_loss_clip": 1.01344728, + "balance_loss_mlp": 1.03404522, + "epoch": 0.9289042537201263, + "flos": 27669853895040.0, + "grad_norm": 2.156577692440534, + "language_loss": 0.67555118, + "learning_rate": 5.27455963293586e-08, + "loss": 0.69679958, + "num_input_tokens_seen": 333430000, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 15450, + "time_per_iteration": 2.4997141361236572 + }, + { + "auxiliary_loss_clip": 0.01099302, + "auxiliary_loss_mlp": 0.01024774, + "balance_loss_clip": 1.01307964, + "balance_loss_mlp": 1.03357685, + "epoch": 0.9289643769727942, + "flos": 19317750750720.0, + "grad_norm": 2.1026570979427026, + "language_loss": 0.72319663, + "learning_rate": 5.265677957368875e-08, + "loss": 0.7444374, + "num_input_tokens_seen": 333445800, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15451, + "time_per_iteration": 2.4257543087005615 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01033818, + "balance_loss_clip": 1.02285671, + "balance_loss_mlp": 1.03431022, + "epoch": 0.9290245002254622, + "flos": 14058058233600.0, + "grad_norm": 1.9644951555843875, + "language_loss": 0.73315656, + "learning_rate": 5.25680366614687e-08, + "loss": 0.75450063, + "num_input_tokens_seen": 333461550, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 15452, + "time_per_iteration": 2.404226064682007 + }, + { + "auxiliary_loss_clip": 0.01101725, + "auxiliary_loss_mlp": 0.01027462, + "balance_loss_clip": 1.01583314, + "balance_loss_mlp": 1.0372479, + "epoch": 0.9290846234781301, + "flos": 20047132321920.0, + "grad_norm": 1.8050090440464128, + "language_loss": 0.74203956, + "learning_rate": 5.2479367596064196e-08, + "loss": 0.76333141, + "num_input_tokens_seen": 333478835, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15453, + "time_per_iteration": 2.4306046962738037 + }, + { + "auxiliary_loss_clip": 0.01021773, + "auxiliary_loss_mlp": 0.01002626, + "balance_loss_clip": 1.00164855, + "balance_loss_mlp": 1.00152075, + "epoch": 0.9291447467307982, + "flos": 61227514460160.0, + "grad_norm": 0.8231858561820261, + "language_loss": 0.60632885, + "learning_rate": 5.2390772380837226e-08, + "loss": 0.62657285, + "num_input_tokens_seen": 333535250, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.203125, + "step": 15454, + "time_per_iteration": 2.995863437652588 + }, + { + "auxiliary_loss_clip": 0.01099994, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.01989484, + "balance_loss_mlp": 1.03345144, + "epoch": 0.9292048699834661, + "flos": 20553328736640.0, + "grad_norm": 1.4950460872620022, + "language_loss": 0.68971264, + "learning_rate": 5.230225101914709e-08, + "loss": 0.71102631, + "num_input_tokens_seen": 333553805, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15455, + "time_per_iteration": 2.4303104877471924 + }, + { + "auxiliary_loss_clip": 0.01101049, + "auxiliary_loss_mlp": 0.01029806, + "balance_loss_clip": 1.01793849, + "balance_loss_mlp": 1.03573239, + "epoch": 0.9292649932361341, + "flos": 23623655477760.0, + "grad_norm": 1.7032776336080993, + "language_loss": 0.64673263, + "learning_rate": 5.22138035143509e-08, + "loss": 0.66804117, + "num_input_tokens_seen": 333572800, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 15456, + "time_per_iteration": 2.4663398265838623 + }, + { + "auxiliary_loss_clip": 0.01100142, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.01669145, + "balance_loss_mlp": 1.03616011, + "epoch": 0.929325116488802, + "flos": 15009942602880.0, + "grad_norm": 1.6544080428744494, + "language_loss": 0.68180311, + "learning_rate": 5.2125429869802615e-08, + "loss": 0.70309204, + "num_input_tokens_seen": 333588520, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.640625, + "step": 15457, + "time_per_iteration": 2.387383460998535 + }, + { + "auxiliary_loss_clip": 0.0109956, + "auxiliary_loss_mlp": 0.01025563, + "balance_loss_clip": 1.01436925, + "balance_loss_mlp": 1.03264999, + "epoch": 0.92938523974147, + "flos": 17967365919360.0, + "grad_norm": 1.9914999600759236, + "language_loss": 0.80684668, + "learning_rate": 5.203713008885291e-08, + "loss": 0.82809794, + "num_input_tokens_seen": 333603435, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.671875, + "step": 15458, + "time_per_iteration": 2.411698341369629 + }, + { + "auxiliary_loss_clip": 0.01100132, + "auxiliary_loss_mlp": 0.01032442, + "balance_loss_clip": 1.02089047, + "balance_loss_mlp": 1.03419471, + "epoch": 0.9294453629941379, + "flos": 23003047267200.0, + "grad_norm": 1.567399109434874, + "language_loss": 0.72272772, + "learning_rate": 5.194890417485065e-08, + "loss": 0.74405348, + "num_input_tokens_seen": 333623305, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15459, + "time_per_iteration": 2.450777769088745 + }, + { + "auxiliary_loss_clip": 0.01101616, + "auxiliary_loss_mlp": 0.01028744, + "balance_loss_clip": 1.01775837, + "balance_loss_mlp": 1.03589296, + "epoch": 0.929505486246806, + "flos": 17055234927360.0, + "grad_norm": 3.1276525868113665, + "language_loss": 0.58476692, + "learning_rate": 5.1860752131141384e-08, + "loss": 0.60607052, + "num_input_tokens_seen": 333641205, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 15460, + "time_per_iteration": 2.406024932861328 + }, + { + "auxiliary_loss_clip": 0.01102163, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.01987743, + "balance_loss_mlp": 1.03494763, + "epoch": 0.9295656094994739, + "flos": 27340409329920.0, + "grad_norm": 2.5998175218554778, + "language_loss": 0.8040331, + "learning_rate": 5.177267396106733e-08, + "loss": 0.82537508, + "num_input_tokens_seen": 333659615, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15461, + "time_per_iteration": 2.478937864303589 + }, + { + "auxiliary_loss_clip": 0.01095702, + "auxiliary_loss_mlp": 0.01029192, + "balance_loss_clip": 1.01769996, + "balance_loss_mlp": 1.03278279, + "epoch": 0.9296257327521419, + "flos": 21470954509440.0, + "grad_norm": 2.0121095429582807, + "language_loss": 0.78226018, + "learning_rate": 5.168466966796869e-08, + "loss": 0.80350912, + "num_input_tokens_seen": 333678985, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.62890625, + "step": 15462, + "time_per_iteration": 2.471994400024414 + }, + { + "auxiliary_loss_clip": 0.01098138, + "auxiliary_loss_mlp": 0.01023728, + "balance_loss_clip": 1.01270103, + "balance_loss_mlp": 1.03229225, + "epoch": 0.9296858560048099, + "flos": 16362661818240.0, + "grad_norm": 1.846715465327114, + "language_loss": 0.62358242, + "learning_rate": 5.159673925518282e-08, + "loss": 0.64480114, + "num_input_tokens_seen": 333696410, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15463, + "time_per_iteration": 2.543200969696045 + }, + { + "auxiliary_loss_clip": 0.01096034, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01537442, + "balance_loss_mlp": 1.03193223, + "epoch": 0.9297459792574778, + "flos": 29858609139840.0, + "grad_norm": 1.4319692146419465, + "language_loss": 0.70946103, + "learning_rate": 5.15088827260437e-08, + "loss": 0.73068112, + "num_input_tokens_seen": 333716615, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.640625, + "step": 15464, + "time_per_iteration": 2.557159185409546 + }, + { + "auxiliary_loss_clip": 0.01098841, + "auxiliary_loss_mlp": 0.01026744, + "balance_loss_clip": 1.01610518, + "balance_loss_mlp": 1.0332737, + "epoch": 0.9298061025101458, + "flos": 15924838942080.0, + "grad_norm": 2.308666209262228, + "language_loss": 0.77049506, + "learning_rate": 5.1421100083883115e-08, + "loss": 0.79175085, + "num_input_tokens_seen": 333732800, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.65625, + "step": 15465, + "time_per_iteration": 2.412461280822754 + }, + { + "auxiliary_loss_clip": 0.01021493, + "auxiliary_loss_mlp": 0.00998557, + "balance_loss_clip": 0.99757355, + "balance_loss_mlp": 1.00142288, + "epoch": 0.9298662257628137, + "flos": 64096994304000.0, + "grad_norm": 0.6915312931850184, + "language_loss": 0.56440043, + "learning_rate": 5.133339133202952e-08, + "loss": 0.58460093, + "num_input_tokens_seen": 333799300, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20117188, + "step": 15466, + "time_per_iteration": 3.1553313732147217 + }, + { + "auxiliary_loss_clip": 0.0109893, + "auxiliary_loss_mlp": 0.01037894, + "balance_loss_clip": 1.02524638, + "balance_loss_mlp": 1.03270507, + "epoch": 0.9299263490154818, + "flos": 24280210224000.0, + "grad_norm": 1.430282713051718, + "language_loss": 0.72837657, + "learning_rate": 5.1245756473809355e-08, + "loss": 0.74974477, + "num_input_tokens_seen": 333820360, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 15467, + "time_per_iteration": 2.465402603149414 + }, + { + "auxiliary_loss_clip": 0.01101047, + "auxiliary_loss_mlp": 0.01031832, + "balance_loss_clip": 1.02004814, + "balance_loss_mlp": 1.034567, + "epoch": 0.9299864722681497, + "flos": 23294354567040.0, + "grad_norm": 1.6525069967751043, + "language_loss": 0.7171756, + "learning_rate": 5.1158195512545076e-08, + "loss": 0.73850441, + "num_input_tokens_seen": 333840415, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15468, + "time_per_iteration": 2.467027187347412 + }, + { + "auxiliary_loss_clip": 0.01100943, + "auxiliary_loss_mlp": 0.01029691, + "balance_loss_clip": 1.0181818, + "balance_loss_mlp": 1.03316689, + "epoch": 0.9300465955208177, + "flos": 21395972868480.0, + "grad_norm": 1.6614618348928094, + "language_loss": 0.7563262, + "learning_rate": 5.107070845155737e-08, + "loss": 0.77763259, + "num_input_tokens_seen": 333859910, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.67578125, + "step": 15469, + "time_per_iteration": 2.4551570415496826 + }, + { + "auxiliary_loss_clip": 0.0109919, + "auxiliary_loss_mlp": 0.01030519, + "balance_loss_clip": 1.01893151, + "balance_loss_mlp": 1.03328323, + "epoch": 0.9301067187734856, + "flos": 24571445696640.0, + "grad_norm": 3.117567963702495, + "language_loss": 0.75602072, + "learning_rate": 5.098329529416379e-08, + "loss": 0.77731776, + "num_input_tokens_seen": 333880495, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 15470, + "time_per_iteration": 3.9293041229248047 + }, + { + "auxiliary_loss_clip": 0.01098686, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.0168792, + "balance_loss_mlp": 1.03350711, + "epoch": 0.9301668420261536, + "flos": 22196960202240.0, + "grad_norm": 1.4761653609508787, + "language_loss": 0.7473954, + "learning_rate": 5.089595604367902e-08, + "loss": 0.7686609, + "num_input_tokens_seen": 333897640, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65234375, + "step": 15471, + "time_per_iteration": 2.435100793838501 + }, + { + "auxiliary_loss_clip": 0.01098709, + "auxiliary_loss_mlp": 0.01027854, + "balance_loss_clip": 1.01604009, + "balance_loss_mlp": 1.03407836, + "epoch": 0.9302269652788215, + "flos": 17747628468480.0, + "grad_norm": 2.1879069429390006, + "language_loss": 0.69004017, + "learning_rate": 5.080869070341487e-08, + "loss": 0.7113058, + "num_input_tokens_seen": 333913670, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15472, + "time_per_iteration": 3.8225207328796387 + }, + { + "auxiliary_loss_clip": 0.01094581, + "auxiliary_loss_mlp": 0.01029343, + "balance_loss_clip": 1.01865005, + "balance_loss_mlp": 1.03333116, + "epoch": 0.9302870885314896, + "flos": 19390793057280.0, + "grad_norm": 1.609251941802182, + "language_loss": 0.88353068, + "learning_rate": 5.0721499276680233e-08, + "loss": 0.9047699, + "num_input_tokens_seen": 333934105, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.61328125, + "step": 15473, + "time_per_iteration": 3.829770088195801 + }, + { + "auxiliary_loss_clip": 0.01104013, + "auxiliary_loss_mlp": 0.01033904, + "balance_loss_clip": 1.02066016, + "balance_loss_mlp": 1.03545713, + "epoch": 0.9303472117841575, + "flos": 21760286561280.0, + "grad_norm": 1.8613424502001032, + "language_loss": 0.64229345, + "learning_rate": 5.063438176678203e-08, + "loss": 0.66367269, + "num_input_tokens_seen": 333953635, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 15474, + "time_per_iteration": 2.430070161819458 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01029633, + "balance_loss_clip": 1.0180459, + "balance_loss_mlp": 1.03443456, + "epoch": 0.9304073350368255, + "flos": 19609740408960.0, + "grad_norm": 1.634183098682429, + "language_loss": 0.7463553, + "learning_rate": 5.054733817702339e-08, + "loss": 0.76764882, + "num_input_tokens_seen": 333971825, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 15475, + "time_per_iteration": 2.433985948562622 + }, + { + "auxiliary_loss_clip": 0.01097957, + "auxiliary_loss_mlp": 0.01024983, + "balance_loss_clip": 1.0143497, + "balance_loss_mlp": 1.03309751, + "epoch": 0.9304674582894935, + "flos": 30441582875520.0, + "grad_norm": 2.010592371284976, + "language_loss": 0.66876173, + "learning_rate": 5.0460368510704786e-08, + "loss": 0.68999112, + "num_input_tokens_seen": 333990120, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6484375, + "step": 15476, + "time_per_iteration": 2.505357503890991 + }, + { + "auxiliary_loss_clip": 0.01102035, + "auxiliary_loss_mlp": 0.01032612, + "balance_loss_clip": 1.02085757, + "balance_loss_mlp": 1.03647363, + "epoch": 0.9305275815421614, + "flos": 17785693906560.0, + "grad_norm": 2.173665813572123, + "language_loss": 0.68965471, + "learning_rate": 5.0373472771124914e-08, + "loss": 0.71100122, + "num_input_tokens_seen": 334007970, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15477, + "time_per_iteration": 3.904513120651245 + }, + { + "auxiliary_loss_clip": 0.01098178, + "auxiliary_loss_mlp": 0.01027662, + "balance_loss_clip": 1.01687384, + "balance_loss_mlp": 1.03475642, + "epoch": 0.9305877047948294, + "flos": 25298456970240.0, + "grad_norm": 1.8851394448643317, + "language_loss": 0.58472347, + "learning_rate": 5.0286650961578027e-08, + "loss": 0.60598183, + "num_input_tokens_seen": 334027120, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 15478, + "time_per_iteration": 2.4862332344055176 + }, + { + "auxiliary_loss_clip": 0.01104232, + "auxiliary_loss_mlp": 0.0102873, + "balance_loss_clip": 1.01574802, + "balance_loss_mlp": 1.03474784, + "epoch": 0.9306478280474973, + "flos": 16977236544000.0, + "grad_norm": 4.322822362251628, + "language_loss": 0.79143488, + "learning_rate": 5.01999030853566e-08, + "loss": 0.81276453, + "num_input_tokens_seen": 334042785, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6953125, + "step": 15479, + "time_per_iteration": 2.3997929096221924 + }, + { + "auxiliary_loss_clip": 0.01099209, + "auxiliary_loss_mlp": 0.01032056, + "balance_loss_clip": 1.02073741, + "balance_loss_mlp": 1.03325725, + "epoch": 0.9307079513001654, + "flos": 35663353608960.0, + "grad_norm": 2.4723899654075554, + "language_loss": 0.68572581, + "learning_rate": 5.0113229145750445e-08, + "loss": 0.70703846, + "num_input_tokens_seen": 334063480, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 15480, + "time_per_iteration": 2.566641092300415 + }, + { + "auxiliary_loss_clip": 0.01099075, + "auxiliary_loss_mlp": 0.01029607, + "balance_loss_clip": 1.01816869, + "balance_loss_mlp": 1.03372073, + "epoch": 0.9307680745528333, + "flos": 19208151377280.0, + "grad_norm": 1.69653427681413, + "language_loss": 0.67943531, + "learning_rate": 5.002662914604583e-08, + "loss": 0.7007221, + "num_input_tokens_seen": 334082005, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15481, + "time_per_iteration": 2.4178357124328613 + }, + { + "auxiliary_loss_clip": 0.01096176, + "auxiliary_loss_mlp": 0.01025523, + "balance_loss_clip": 1.01472914, + "balance_loss_mlp": 1.03221035, + "epoch": 0.9308281978055013, + "flos": 19062641381760.0, + "grad_norm": 2.0018171339857744, + "language_loss": 0.74707091, + "learning_rate": 4.994010308952701e-08, + "loss": 0.76828778, + "num_input_tokens_seen": 334101375, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.640625, + "step": 15482, + "time_per_iteration": 2.46037220954895 + }, + { + "auxiliary_loss_clip": 0.01094997, + "auxiliary_loss_mlp": 0.010277, + "balance_loss_clip": 1.0167743, + "balance_loss_mlp": 1.03203559, + "epoch": 0.9308883210581692, + "flos": 20521548178560.0, + "grad_norm": 1.7837659675322086, + "language_loss": 0.79909325, + "learning_rate": 4.985365097947469e-08, + "loss": 0.82032025, + "num_input_tokens_seen": 334119460, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 15483, + "time_per_iteration": 2.424943208694458 + }, + { + "auxiliary_loss_clip": 0.01098575, + "auxiliary_loss_mlp": 0.01028205, + "balance_loss_clip": 1.01651073, + "balance_loss_mlp": 1.03387845, + "epoch": 0.9309484443108372, + "flos": 13001422826880.0, + "grad_norm": 2.0304206547366777, + "language_loss": 0.74465203, + "learning_rate": 4.976727281916782e-08, + "loss": 0.76591992, + "num_input_tokens_seen": 334136065, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 15484, + "time_per_iteration": 2.427067518234253 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01030027, + "balance_loss_clip": 1.01819587, + "balance_loss_mlp": 1.03494906, + "epoch": 0.9310085675635051, + "flos": 12567765928320.0, + "grad_norm": 2.305608491408132, + "language_loss": 0.76315653, + "learning_rate": 4.968096861188087e-08, + "loss": 0.78448498, + "num_input_tokens_seen": 334153690, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6796875, + "step": 15485, + "time_per_iteration": 2.3986244201660156 + }, + { + "auxiliary_loss_clip": 0.01100485, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.01677799, + "balance_loss_mlp": 1.03375554, + "epoch": 0.9310686908161732, + "flos": 23477570864640.0, + "grad_norm": 1.7799612714984643, + "language_loss": 0.7810412, + "learning_rate": 4.959473836088723e-08, + "loss": 0.80233711, + "num_input_tokens_seen": 334171880, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 15486, + "time_per_iteration": 2.4667983055114746 + }, + { + "auxiliary_loss_clip": 0.01104748, + "auxiliary_loss_mlp": 0.01029377, + "balance_loss_clip": 1.01735497, + "balance_loss_mlp": 1.03717089, + "epoch": 0.9311288140688411, + "flos": 24170287628160.0, + "grad_norm": 2.004478258444932, + "language_loss": 0.77159125, + "learning_rate": 4.950858206945674e-08, + "loss": 0.79293251, + "num_input_tokens_seen": 334190005, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 15487, + "time_per_iteration": 2.446272134780884 + }, + { + "auxiliary_loss_clip": 0.01099239, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01362669, + "balance_loss_mlp": 1.03425145, + "epoch": 0.9311889373215091, + "flos": 35590203561600.0, + "grad_norm": 2.746205052123672, + "language_loss": 0.66514063, + "learning_rate": 4.942249974085633e-08, + "loss": 0.68639356, + "num_input_tokens_seen": 334209545, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6484375, + "step": 15488, + "time_per_iteration": 2.6267404556274414 + }, + { + "auxiliary_loss_clip": 0.01095561, + "auxiliary_loss_mlp": 0.01029729, + "balance_loss_clip": 1.01797533, + "balance_loss_mlp": 1.03325832, + "epoch": 0.9312490605741771, + "flos": 20230528187520.0, + "grad_norm": 1.8384174962011377, + "language_loss": 0.74991691, + "learning_rate": 4.933649137834983e-08, + "loss": 0.77116984, + "num_input_tokens_seen": 334228900, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.625, + "step": 15489, + "time_per_iteration": 2.465559959411621 + }, + { + "auxiliary_loss_clip": 0.01100415, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01684833, + "balance_loss_mlp": 1.0337944, + "epoch": 0.931309183826845, + "flos": 13950577762560.0, + "grad_norm": 2.563733081982058, + "language_loss": 0.80619878, + "learning_rate": 4.925055698519931e-08, + "loss": 0.82749051, + "num_input_tokens_seen": 334245500, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 15490, + "time_per_iteration": 2.4163243770599365 + }, + { + "auxiliary_loss_clip": 0.01101263, + "auxiliary_loss_mlp": 0.01032482, + "balance_loss_clip": 1.02013838, + "balance_loss_mlp": 1.03481793, + "epoch": 0.931369307079513, + "flos": 20156731695360.0, + "grad_norm": 1.5785562831132516, + "language_loss": 0.72108269, + "learning_rate": 4.9164696564663264e-08, + "loss": 0.7424202, + "num_input_tokens_seen": 334264370, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 15491, + "time_per_iteration": 2.4315860271453857 + }, + { + "auxiliary_loss_clip": 0.01095636, + "auxiliary_loss_mlp": 0.01027277, + "balance_loss_clip": 1.01641083, + "balance_loss_mlp": 1.03252959, + "epoch": 0.931429430332181, + "flos": 25338569483520.0, + "grad_norm": 1.7853730669825214, + "language_loss": 0.74627632, + "learning_rate": 4.9078910119997096e-08, + "loss": 0.76750547, + "num_input_tokens_seen": 334283905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15492, + "time_per_iteration": 2.512267827987671 + }, + { + "auxiliary_loss_clip": 0.01021891, + "auxiliary_loss_mlp": 0.00998959, + "balance_loss_clip": 0.99805337, + "balance_loss_mlp": 1.0018034, + "epoch": 0.931489553584849, + "flos": 71226193985280.0, + "grad_norm": 0.712039300089528, + "language_loss": 0.53438187, + "learning_rate": 4.899319765445442e-08, + "loss": 0.5545904, + "num_input_tokens_seen": 334339925, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20117188, + "step": 15493, + "time_per_iteration": 2.916949510574341 + }, + { + "auxiliary_loss_clip": 0.01098383, + "auxiliary_loss_mlp": 0.01027658, + "balance_loss_clip": 1.01680434, + "balance_loss_mlp": 1.03383327, + "epoch": 0.9315496768375169, + "flos": 14643653662080.0, + "grad_norm": 1.6838927054928123, + "language_loss": 0.71094936, + "learning_rate": 4.890755917128531e-08, + "loss": 0.73220974, + "num_input_tokens_seen": 334357225, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 15494, + "time_per_iteration": 2.4050588607788086 + }, + { + "auxiliary_loss_clip": 0.01100667, + "auxiliary_loss_mlp": 0.01024887, + "balance_loss_clip": 1.01355577, + "balance_loss_mlp": 1.03352082, + "epoch": 0.9316098000901849, + "flos": 28329928174080.0, + "grad_norm": 1.881542723418203, + "language_loss": 0.68522328, + "learning_rate": 4.882199467373671e-08, + "loss": 0.70647883, + "num_input_tokens_seen": 334375945, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 15495, + "time_per_iteration": 2.4895150661468506 + }, + { + "auxiliary_loss_clip": 0.01095117, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.02116537, + "balance_loss_mlp": 1.03182006, + "epoch": 0.9316699233428528, + "flos": 28512677594880.0, + "grad_norm": 1.762424325452625, + "language_loss": 0.61511773, + "learning_rate": 4.8736504165053815e-08, + "loss": 0.63638532, + "num_input_tokens_seen": 334395310, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6328125, + "step": 15496, + "time_per_iteration": 2.494763135910034 + }, + { + "auxiliary_loss_clip": 0.01099639, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01899242, + "balance_loss_mlp": 1.03399265, + "epoch": 0.9317300465955208, + "flos": 33693402061440.0, + "grad_norm": 1.5515145682874567, + "language_loss": 0.77042872, + "learning_rate": 4.865108764847825e-08, + "loss": 0.79173243, + "num_input_tokens_seen": 334416965, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15497, + "time_per_iteration": 2.5357086658477783 + }, + { + "auxiliary_loss_clip": 0.01102796, + "auxiliary_loss_mlp": 0.01033, + "balance_loss_clip": 1.02090013, + "balance_loss_mlp": 1.03550088, + "epoch": 0.9317901698481887, + "flos": 23658237296640.0, + "grad_norm": 1.9526034329415265, + "language_loss": 0.66362846, + "learning_rate": 4.856574512724898e-08, + "loss": 0.68498641, + "num_input_tokens_seen": 334435620, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15498, + "time_per_iteration": 2.467374563217163 + }, + { + "auxiliary_loss_clip": 0.01101485, + "auxiliary_loss_mlp": 0.01035741, + "balance_loss_clip": 1.02316427, + "balance_loss_mlp": 1.03501487, + "epoch": 0.9318502931008568, + "flos": 20960017499520.0, + "grad_norm": 1.7610743502537445, + "language_loss": 0.79906923, + "learning_rate": 4.8480476604602305e-08, + "loss": 0.82044148, + "num_input_tokens_seen": 334456210, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 15499, + "time_per_iteration": 2.436098337173462 + }, + { + "auxiliary_loss_clip": 0.01098432, + "auxiliary_loss_mlp": 0.01032221, + "balance_loss_clip": 1.0203414, + "balance_loss_mlp": 1.03484273, + "epoch": 0.9319104163535247, + "flos": 23441049711360.0, + "grad_norm": 1.812161869986891, + "language_loss": 0.76557505, + "learning_rate": 4.8395282083771196e-08, + "loss": 0.78688157, + "num_input_tokens_seen": 334475485, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.63671875, + "step": 15500, + "time_per_iteration": 2.46466064453125 + }, + { + "auxiliary_loss_clip": 0.01095242, + "auxiliary_loss_mlp": 0.01025287, + "balance_loss_clip": 1.01396239, + "balance_loss_mlp": 1.03133726, + "epoch": 0.9319705396061927, + "flos": 22347426274560.0, + "grad_norm": 1.6727103835965809, + "language_loss": 0.72225916, + "learning_rate": 4.8310161567987064e-08, + "loss": 0.74346447, + "num_input_tokens_seen": 334494740, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 15501, + "time_per_iteration": 2.4670472145080566 + }, + { + "auxiliary_loss_clip": 0.01101445, + "auxiliary_loss_mlp": 0.01030377, + "balance_loss_clip": 1.01876068, + "balance_loss_mlp": 1.0345726, + "epoch": 0.9320306628588607, + "flos": 20993557824000.0, + "grad_norm": 1.9501321316828164, + "language_loss": 0.6632303, + "learning_rate": 4.822511506047666e-08, + "loss": 0.6845485, + "num_input_tokens_seen": 334511910, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 15502, + "time_per_iteration": 2.4331064224243164 + }, + { + "auxiliary_loss_clip": 0.011008, + "auxiliary_loss_mlp": 0.01032297, + "balance_loss_clip": 1.02094793, + "balance_loss_mlp": 1.03379011, + "epoch": 0.9320907861115286, + "flos": 24538300421760.0, + "grad_norm": 1.4373867863425007, + "language_loss": 0.65522575, + "learning_rate": 4.814014256446586e-08, + "loss": 0.67655671, + "num_input_tokens_seen": 334533150, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 15503, + "time_per_iteration": 2.4875681400299072 + }, + { + "auxiliary_loss_clip": 0.01100687, + "auxiliary_loss_mlp": 0.01030624, + "balance_loss_clip": 1.01885748, + "balance_loss_mlp": 1.03377271, + "epoch": 0.9321509093641966, + "flos": 19785414850560.0, + "grad_norm": 1.4699248091106074, + "language_loss": 0.74906504, + "learning_rate": 4.805524408317652e-08, + "loss": 0.77037811, + "num_input_tokens_seen": 334550940, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 15504, + "time_per_iteration": 2.4550881385803223 + }, + { + "auxiliary_loss_clip": 0.01101391, + "auxiliary_loss_mlp": 0.01027238, + "balance_loss_clip": 1.0146023, + "balance_loss_mlp": 1.03532541, + "epoch": 0.9322110326168646, + "flos": 24972675592320.0, + "grad_norm": 1.9858740519405689, + "language_loss": 0.71027422, + "learning_rate": 4.797041961982762e-08, + "loss": 0.73156059, + "num_input_tokens_seen": 334570935, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 15505, + "time_per_iteration": 2.471879482269287 + }, + { + "auxiliary_loss_clip": 0.01100408, + "auxiliary_loss_mlp": 0.01029155, + "balance_loss_clip": 1.0168885, + "balance_loss_mlp": 1.03499889, + "epoch": 0.9322711558695326, + "flos": 16143642639360.0, + "grad_norm": 1.9200486869690463, + "language_loss": 0.75246066, + "learning_rate": 4.788566917763614e-08, + "loss": 0.77375627, + "num_input_tokens_seen": 334589315, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65625, + "step": 15506, + "time_per_iteration": 2.412144660949707 + }, + { + "auxiliary_loss_clip": 0.01097297, + "auxiliary_loss_mlp": 0.01023636, + "balance_loss_clip": 1.01296115, + "balance_loss_mlp": 1.03512335, + "epoch": 0.9323312791222005, + "flos": 23732428838400.0, + "grad_norm": 1.9532236160910172, + "language_loss": 0.83267069, + "learning_rate": 4.780099275981597e-08, + "loss": 0.85388005, + "num_input_tokens_seen": 334608990, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 15507, + "time_per_iteration": 2.4542086124420166 + }, + { + "auxiliary_loss_clip": 0.01099933, + "auxiliary_loss_mlp": 0.01028089, + "balance_loss_clip": 1.01634693, + "balance_loss_mlp": 1.03375268, + "epoch": 0.9323914023748685, + "flos": 20777914523520.0, + "grad_norm": 1.4480770625048591, + "language_loss": 0.67718458, + "learning_rate": 4.771639036957742e-08, + "loss": 0.69846487, + "num_input_tokens_seen": 334628655, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 15508, + "time_per_iteration": 2.483059883117676 + }, + { + "auxiliary_loss_clip": 0.01097823, + "auxiliary_loss_mlp": 0.01030103, + "balance_loss_clip": 1.01831305, + "balance_loss_mlp": 1.03426003, + "epoch": 0.9324515256275364, + "flos": 23915178259200.0, + "grad_norm": 1.6151722837664564, + "language_loss": 0.71979308, + "learning_rate": 4.7631862010129033e-08, + "loss": 0.7410723, + "num_input_tokens_seen": 334648295, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.63671875, + "step": 15509, + "time_per_iteration": 2.444472551345825 + }, + { + "auxiliary_loss_clip": 0.01099809, + "auxiliary_loss_mlp": 0.01031167, + "balance_loss_clip": 1.01989579, + "balance_loss_mlp": 1.03414321, + "epoch": 0.9325116488802044, + "flos": 18005215875840.0, + "grad_norm": 1.7453769402729238, + "language_loss": 0.74520022, + "learning_rate": 4.754740768467624e-08, + "loss": 0.76651001, + "num_input_tokens_seen": 334666280, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15510, + "time_per_iteration": 2.4823153018951416 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01026225, + "balance_loss_clip": 1.01489401, + "balance_loss_mlp": 1.03300691, + "epoch": 0.9325717721328723, + "flos": 29021603443200.0, + "grad_norm": 1.518393059285664, + "language_loss": 0.70252025, + "learning_rate": 4.746302739642161e-08, + "loss": 0.72378927, + "num_input_tokens_seen": 334688830, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.67578125, + "step": 15511, + "time_per_iteration": 2.5080928802490234 + }, + { + "auxiliary_loss_clip": 0.01100016, + "auxiliary_loss_mlp": 0.01034567, + "balance_loss_clip": 1.02306938, + "balance_loss_mlp": 1.03380418, + "epoch": 0.9326318953855404, + "flos": 21646341642240.0, + "grad_norm": 1.7360403763937744, + "language_loss": 0.78284937, + "learning_rate": 4.737872114856412e-08, + "loss": 0.80419517, + "num_input_tokens_seen": 334705205, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15512, + "time_per_iteration": 3.827505111694336 + }, + { + "auxiliary_loss_clip": 0.01097608, + "auxiliary_loss_mlp": 0.01028491, + "balance_loss_clip": 1.01638508, + "balance_loss_mlp": 1.03290629, + "epoch": 0.9326920186382083, + "flos": 26065724411520.0, + "grad_norm": 1.4383075090832378, + "language_loss": 0.80445802, + "learning_rate": 4.7294488944301436e-08, + "loss": 0.825719, + "num_input_tokens_seen": 334723830, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 15513, + "time_per_iteration": 4.101969003677368 + }, + { + "auxiliary_loss_clip": 0.01105336, + "auxiliary_loss_mlp": 0.01032016, + "balance_loss_clip": 1.01902199, + "balance_loss_mlp": 1.03549969, + "epoch": 0.9327521418908763, + "flos": 12057116227200.0, + "grad_norm": 1.8828331415899686, + "language_loss": 0.80006057, + "learning_rate": 4.721033078682768e-08, + "loss": 0.82143408, + "num_input_tokens_seen": 334740825, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.69921875, + "step": 15514, + "time_per_iteration": 3.816762685775757 + }, + { + "auxiliary_loss_clip": 0.0109669, + "auxiliary_loss_mlp": 0.01037809, + "balance_loss_clip": 1.02678835, + "balance_loss_mlp": 1.03406465, + "epoch": 0.9328122651435443, + "flos": 43834395271680.0, + "grad_norm": 1.811888338938053, + "language_loss": 0.71603918, + "learning_rate": 4.7126246679333626e-08, + "loss": 0.7373842, + "num_input_tokens_seen": 334765825, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 15515, + "time_per_iteration": 2.6306512355804443 + }, + { + "auxiliary_loss_clip": 0.01103001, + "auxiliary_loss_mlp": 0.01030438, + "balance_loss_clip": 1.01835024, + "balance_loss_mlp": 1.03492391, + "epoch": 0.9328723883962122, + "flos": 15194954580480.0, + "grad_norm": 2.3044810478032054, + "language_loss": 0.81098676, + "learning_rate": 4.704223662500806e-08, + "loss": 0.83232123, + "num_input_tokens_seen": 334782680, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 15516, + "time_per_iteration": 2.4116766452789307 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01032757, + "balance_loss_clip": 1.02131283, + "balance_loss_mlp": 1.03447771, + "epoch": 0.9329325116488802, + "flos": 20261770041600.0, + "grad_norm": 1.8676294206901967, + "language_loss": 0.8110435, + "learning_rate": 4.695830062703643e-08, + "loss": 0.8323828, + "num_input_tokens_seen": 334800160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 15517, + "time_per_iteration": 2.431884288787842 + }, + { + "auxiliary_loss_clip": 0.01099406, + "auxiliary_loss_mlp": 0.01030065, + "balance_loss_clip": 1.01764393, + "balance_loss_mlp": 1.03308225, + "epoch": 0.9329926349015482, + "flos": 13115008609920.0, + "grad_norm": 4.35442936800558, + "language_loss": 0.74301833, + "learning_rate": 4.687443868860219e-08, + "loss": 0.76431304, + "num_input_tokens_seen": 334815840, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 15518, + "time_per_iteration": 3.8434197902679443 + }, + { + "auxiliary_loss_clip": 0.01099221, + "auxiliary_loss_mlp": 0.01030838, + "balance_loss_clip": 1.01914942, + "balance_loss_mlp": 1.0343585, + "epoch": 0.9330527581542162, + "flos": 23040250778880.0, + "grad_norm": 1.9069245404025545, + "language_loss": 0.75698578, + "learning_rate": 4.679065081288458e-08, + "loss": 0.77828634, + "num_input_tokens_seen": 334834735, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 15519, + "time_per_iteration": 2.4393157958984375 + }, + { + "auxiliary_loss_clip": 0.0109868, + "auxiliary_loss_mlp": 0.01032583, + "balance_loss_clip": 1.02047777, + "balance_loss_mlp": 1.03313446, + "epoch": 0.9331128814068841, + "flos": 15559627409280.0, + "grad_norm": 2.2060561744279785, + "language_loss": 0.83241522, + "learning_rate": 4.6706937003061275e-08, + "loss": 0.85372788, + "num_input_tokens_seen": 334853490, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 15520, + "time_per_iteration": 2.4281809329986572 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01652932, + "balance_loss_mlp": 1.03303838, + "epoch": 0.9331730046595521, + "flos": 22271762275200.0, + "grad_norm": 1.737833160408125, + "language_loss": 0.762685, + "learning_rate": 4.6623297262306846e-08, + "loss": 0.78393459, + "num_input_tokens_seen": 334873675, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 15521, + "time_per_iteration": 2.45866060256958 + }, + { + "auxiliary_loss_clip": 0.01100752, + "auxiliary_loss_mlp": 0.0102937, + "balance_loss_clip": 1.01811028, + "balance_loss_mlp": 1.03619182, + "epoch": 0.93323312791222, + "flos": 15777641007360.0, + "grad_norm": 1.8418545146351015, + "language_loss": 0.77474684, + "learning_rate": 4.6539731593792545e-08, + "loss": 0.79604805, + "num_input_tokens_seen": 334890970, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 15522, + "time_per_iteration": 2.429081678390503 + }, + { + "auxiliary_loss_clip": 0.01099774, + "auxiliary_loss_mlp": 0.01027111, + "balance_loss_clip": 1.01516044, + "balance_loss_mlp": 1.0343529, + "epoch": 0.933293251164888, + "flos": 22010978557440.0, + "grad_norm": 1.8415574051505679, + "language_loss": 0.63218462, + "learning_rate": 4.6456240000687373e-08, + "loss": 0.65345347, + "num_input_tokens_seen": 334906635, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15523, + "time_per_iteration": 2.4532203674316406 + }, + { + "auxiliary_loss_clip": 0.01099046, + "auxiliary_loss_mlp": 0.01029255, + "balance_loss_clip": 1.01813912, + "balance_loss_mlp": 1.03468919, + "epoch": 0.933353374417556, + "flos": 26031358074240.0, + "grad_norm": 1.705594176735913, + "language_loss": 0.68374217, + "learning_rate": 4.63728224861577e-08, + "loss": 0.7050252, + "num_input_tokens_seen": 334926230, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 15524, + "time_per_iteration": 2.5182032585144043 + }, + { + "auxiliary_loss_clip": 0.01100133, + "auxiliary_loss_mlp": 0.01031099, + "balance_loss_clip": 1.01965547, + "balance_loss_mlp": 1.03399134, + "epoch": 0.933413497670224, + "flos": 24900100162560.0, + "grad_norm": 2.2007198193448105, + "language_loss": 0.74041969, + "learning_rate": 4.628947905336589e-08, + "loss": 0.76173198, + "num_input_tokens_seen": 334946680, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 15525, + "time_per_iteration": 2.4798765182495117 + }, + { + "auxiliary_loss_clip": 0.01096428, + "auxiliary_loss_mlp": 0.01035116, + "balance_loss_clip": 1.02354097, + "balance_loss_mlp": 1.03289247, + "epoch": 0.9334736209228919, + "flos": 23688689051520.0, + "grad_norm": 1.793849639760779, + "language_loss": 0.83958673, + "learning_rate": 4.6206209705473175e-08, + "loss": 0.86090219, + "num_input_tokens_seen": 334964785, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6328125, + "step": 15526, + "time_per_iteration": 2.4932663440704346 + }, + { + "auxiliary_loss_clip": 0.01102195, + "auxiliary_loss_mlp": 0.01026302, + "balance_loss_clip": 1.01432729, + "balance_loss_mlp": 1.03541946, + "epoch": 0.9335337441755599, + "flos": 15377344865280.0, + "grad_norm": 1.9882587404334744, + "language_loss": 0.68634391, + "learning_rate": 4.61230144456366e-08, + "loss": 0.70762885, + "num_input_tokens_seen": 334982400, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15527, + "time_per_iteration": 2.437804937362671 + }, + { + "auxiliary_loss_clip": 0.01101792, + "auxiliary_loss_mlp": 0.01028301, + "balance_loss_clip": 1.01533103, + "balance_loss_mlp": 1.03472137, + "epoch": 0.9335938674282279, + "flos": 16106726436480.0, + "grad_norm": 1.7450138693644768, + "language_loss": 0.64867574, + "learning_rate": 4.603989327701141e-08, + "loss": 0.66997665, + "num_input_tokens_seen": 334999685, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.66796875, + "step": 15528, + "time_per_iteration": 2.4500892162323 + }, + { + "auxiliary_loss_clip": 0.01100501, + "auxiliary_loss_mlp": 0.01029752, + "balance_loss_clip": 1.01744926, + "balance_loss_mlp": 1.03338695, + "epoch": 0.9336539906808958, + "flos": 18952898353920.0, + "grad_norm": 1.7859602907094914, + "language_loss": 0.75145864, + "learning_rate": 4.5956846202748867e-08, + "loss": 0.77276123, + "num_input_tokens_seen": 335019160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15529, + "time_per_iteration": 2.4640285968780518 + }, + { + "auxiliary_loss_clip": 0.01096769, + "auxiliary_loss_mlp": 0.01029427, + "balance_loss_clip": 1.01828122, + "balance_loss_mlp": 1.03274751, + "epoch": 0.9337141139335638, + "flos": 18109104986880.0, + "grad_norm": 1.785310773164946, + "language_loss": 0.62776995, + "learning_rate": 4.5873873225998674e-08, + "loss": 0.64903188, + "num_input_tokens_seen": 335037350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 15530, + "time_per_iteration": 2.455909252166748 + }, + { + "auxiliary_loss_clip": 0.0109778, + "auxiliary_loss_mlp": 0.01028522, + "balance_loss_clip": 1.01763248, + "balance_loss_mlp": 1.0340414, + "epoch": 0.9337742371862318, + "flos": 17345716214400.0, + "grad_norm": 1.8382088228922817, + "language_loss": 0.72503978, + "learning_rate": 4.5790974349907194e-08, + "loss": 0.74630278, + "num_input_tokens_seen": 335056060, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 15531, + "time_per_iteration": 2.4793055057525635 + }, + { + "auxiliary_loss_clip": 0.01099293, + "auxiliary_loss_mlp": 0.01028628, + "balance_loss_clip": 1.01671898, + "balance_loss_mlp": 1.03480935, + "epoch": 0.9338343604388998, + "flos": 29058986522880.0, + "grad_norm": 1.7387644106584443, + "language_loss": 0.70876235, + "learning_rate": 4.5708149577617925e-08, + "loss": 0.7300415, + "num_input_tokens_seen": 335075410, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 15532, + "time_per_iteration": 2.5110278129577637 + }, + { + "auxiliary_loss_clip": 0.01100897, + "auxiliary_loss_mlp": 0.01030357, + "balance_loss_clip": 1.01876354, + "balance_loss_mlp": 1.03404105, + "epoch": 0.9338944836915677, + "flos": 18660908695680.0, + "grad_norm": 1.6017505405314953, + "language_loss": 0.73168802, + "learning_rate": 4.5625398912271016e-08, + "loss": 0.75300056, + "num_input_tokens_seen": 335095190, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 15533, + "time_per_iteration": 2.456326961517334 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.01027232, + "balance_loss_clip": 1.01614547, + "balance_loss_mlp": 1.03383231, + "epoch": 0.9339546069442357, + "flos": 16617735273600.0, + "grad_norm": 1.8538496144624586, + "language_loss": 0.79222482, + "learning_rate": 4.554272235700507e-08, + "loss": 0.81347942, + "num_input_tokens_seen": 335113825, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15534, + "time_per_iteration": 2.462285280227661 + }, + { + "auxiliary_loss_clip": 0.01094139, + "auxiliary_loss_mlp": 0.0102719, + "balance_loss_clip": 1.01709914, + "balance_loss_mlp": 1.03442264, + "epoch": 0.9340147301969036, + "flos": 23693106424320.0, + "grad_norm": 1.6479187829635167, + "language_loss": 0.74347138, + "learning_rate": 4.546011991495513e-08, + "loss": 0.76468462, + "num_input_tokens_seen": 335136425, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.59375, + "step": 15535, + "time_per_iteration": 2.4844884872436523 + }, + { + "auxiliary_loss_clip": 0.01101269, + "auxiliary_loss_mlp": 0.01027396, + "balance_loss_clip": 1.01573169, + "balance_loss_mlp": 1.0350976, + "epoch": 0.9340748534495716, + "flos": 28654452576000.0, + "grad_norm": 1.9207427228488974, + "language_loss": 0.77459687, + "learning_rate": 4.537759158925292e-08, + "loss": 0.79588354, + "num_input_tokens_seen": 335157925, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 15536, + "time_per_iteration": 2.521846294403076 + }, + { + "auxiliary_loss_clip": 0.01097297, + "auxiliary_loss_mlp": 0.01026696, + "balance_loss_clip": 1.01527619, + "balance_loss_mlp": 1.03239119, + "epoch": 0.9341349767022396, + "flos": 24899633285760.0, + "grad_norm": 1.444547733533229, + "language_loss": 0.80330276, + "learning_rate": 4.5295137383028593e-08, + "loss": 0.8245427, + "num_input_tokens_seen": 335177840, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15537, + "time_per_iteration": 2.473996639251709 + }, + { + "auxiliary_loss_clip": 0.01101334, + "auxiliary_loss_mlp": 0.01031661, + "balance_loss_clip": 1.02027667, + "balance_loss_mlp": 1.0340277, + "epoch": 0.9341950999549076, + "flos": 29059525226880.0, + "grad_norm": 1.8807920821154451, + "language_loss": 0.77858669, + "learning_rate": 4.5212757299408764e-08, + "loss": 0.79991663, + "num_input_tokens_seen": 335199470, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 15538, + "time_per_iteration": 2.518378973007202 + }, + { + "auxiliary_loss_clip": 0.01097238, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.01576018, + "balance_loss_mlp": 1.03289402, + "epoch": 0.9342552232075755, + "flos": 23587062497280.0, + "grad_norm": 1.6297482884228507, + "language_loss": 0.73147398, + "learning_rate": 4.513045134151672e-08, + "loss": 0.75271827, + "num_input_tokens_seen": 335218885, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 15539, + "time_per_iteration": 2.4873478412628174 + }, + { + "auxiliary_loss_clip": 0.01096595, + "auxiliary_loss_mlp": 0.01028476, + "balance_loss_clip": 1.01805711, + "balance_loss_mlp": 1.03259134, + "epoch": 0.9343153464602435, + "flos": 36721389646080.0, + "grad_norm": 1.6257546240564933, + "language_loss": 0.64682591, + "learning_rate": 4.504821951247373e-08, + "loss": 0.66807657, + "num_input_tokens_seen": 335239485, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15540, + "time_per_iteration": 2.5722250938415527 + }, + { + "auxiliary_loss_clip": 0.01096636, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01734662, + "balance_loss_mlp": 1.03265882, + "epoch": 0.9343754697129115, + "flos": 22236498097920.0, + "grad_norm": 1.6183457842403597, + "language_loss": 0.76627016, + "learning_rate": 4.496606181539864e-08, + "loss": 0.78751922, + "num_input_tokens_seen": 335258355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 15541, + "time_per_iteration": 2.436232089996338 + }, + { + "auxiliary_loss_clip": 0.01101836, + "auxiliary_loss_mlp": 0.01031204, + "balance_loss_clip": 1.02009416, + "balance_loss_mlp": 1.03635538, + "epoch": 0.9344355929655794, + "flos": 29710333797120.0, + "grad_norm": 1.965621595224706, + "language_loss": 0.67185199, + "learning_rate": 4.4883978253406066e-08, + "loss": 0.69318235, + "num_input_tokens_seen": 335276835, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 15542, + "time_per_iteration": 2.573796272277832 + }, + { + "auxiliary_loss_clip": 0.01099558, + "auxiliary_loss_mlp": 0.01028965, + "balance_loss_clip": 1.01739001, + "balance_loss_mlp": 1.03383517, + "epoch": 0.9344957162182475, + "flos": 18880394751360.0, + "grad_norm": 1.8093294927002697, + "language_loss": 0.6968419, + "learning_rate": 4.480196882960907e-08, + "loss": 0.71812713, + "num_input_tokens_seen": 335296220, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 15543, + "time_per_iteration": 2.44272518157959 + }, + { + "auxiliary_loss_clip": 0.01101098, + "auxiliary_loss_mlp": 0.01030802, + "balance_loss_clip": 1.01826096, + "balance_loss_mlp": 1.0330987, + "epoch": 0.9345558394709154, + "flos": 27417761268480.0, + "grad_norm": 1.657022098990054, + "language_loss": 0.69621456, + "learning_rate": 4.4720033547117394e-08, + "loss": 0.71753359, + "num_input_tokens_seen": 335316335, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 15544, + "time_per_iteration": 2.5107553005218506 + }, + { + "auxiliary_loss_clip": 0.01102161, + "auxiliary_loss_mlp": 0.01033016, + "balance_loss_clip": 1.02158332, + "balance_loss_mlp": 1.03555083, + "epoch": 0.9346159627235834, + "flos": 20741285629440.0, + "grad_norm": 1.6017319238780592, + "language_loss": 0.77028668, + "learning_rate": 4.463817240903789e-08, + "loss": 0.79163849, + "num_input_tokens_seen": 335335545, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 15545, + "time_per_iteration": 2.4662442207336426 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01026866, + "balance_loss_clip": 1.01607156, + "balance_loss_mlp": 1.0343653, + "epoch": 0.9346760859762513, + "flos": 21069221823360.0, + "grad_norm": 1.5473427515527929, + "language_loss": 0.68910575, + "learning_rate": 4.455638541847495e-08, + "loss": 0.71038377, + "num_input_tokens_seen": 335355350, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.66796875, + "step": 15546, + "time_per_iteration": 2.4668684005737305 + }, + { + "auxiliary_loss_clip": 0.0109524, + "auxiliary_loss_mlp": 0.01027139, + "balance_loss_clip": 1.01644647, + "balance_loss_mlp": 1.0332005, + "epoch": 0.9347362092289193, + "flos": 29204927481600.0, + "grad_norm": 1.682178435100884, + "language_loss": 0.827672, + "learning_rate": 4.447467257852966e-08, + "loss": 0.84889573, + "num_input_tokens_seen": 335375160, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 15547, + "time_per_iteration": 2.5151429176330566 + }, + { + "auxiliary_loss_clip": 0.01095669, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.02056944, + "balance_loss_mlp": 1.03179169, + "epoch": 0.9347963324815872, + "flos": 19427350124160.0, + "grad_norm": 1.8674494270918909, + "language_loss": 0.83416784, + "learning_rate": 4.439303389230087e-08, + "loss": 0.85543656, + "num_input_tokens_seen": 335394080, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.640625, + "step": 15548, + "time_per_iteration": 2.4793310165405273 + }, + { + "auxiliary_loss_clip": 0.01101921, + "auxiliary_loss_mlp": 0.0103276, + "balance_loss_clip": 1.01983714, + "balance_loss_mlp": 1.03354442, + "epoch": 0.9348564557342552, + "flos": 36901840596480.0, + "grad_norm": 1.5491704453799409, + "language_loss": 0.6522944, + "learning_rate": 4.4311469362884326e-08, + "loss": 0.6736412, + "num_input_tokens_seen": 335414230, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 15549, + "time_per_iteration": 2.5649001598358154 + }, + { + "auxiliary_loss_clip": 0.01102455, + "auxiliary_loss_mlp": 0.01033366, + "balance_loss_clip": 1.02144527, + "balance_loss_mlp": 1.03610802, + "epoch": 0.9349165789869232, + "flos": 21690117342720.0, + "grad_norm": 2.6870852732968324, + "language_loss": 0.80190766, + "learning_rate": 4.4229978993372665e-08, + "loss": 0.82326579, + "num_input_tokens_seen": 335432890, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 15550, + "time_per_iteration": 2.447848081588745 + }, + { + "auxiliary_loss_clip": 0.01100445, + "auxiliary_loss_mlp": 0.01028313, + "balance_loss_clip": 1.01714873, + "balance_loss_mlp": 1.0355283, + "epoch": 0.9349767022395912, + "flos": 18844053166080.0, + "grad_norm": 1.7715830672341057, + "language_loss": 0.75721681, + "learning_rate": 4.4148562786856524e-08, + "loss": 0.77850437, + "num_input_tokens_seen": 335452085, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15551, + "time_per_iteration": 2.431541681289673 + }, + { + "auxiliary_loss_clip": 0.01095285, + "auxiliary_loss_mlp": 0.01029049, + "balance_loss_clip": 1.01916671, + "balance_loss_mlp": 1.03277612, + "epoch": 0.9350368254922591, + "flos": 24973429777920.0, + "grad_norm": 1.5972871846705574, + "language_loss": 0.73139381, + "learning_rate": 4.406722074642255e-08, + "loss": 0.75263715, + "num_input_tokens_seen": 335472130, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.625, + "step": 15552, + "time_per_iteration": 2.4882681369781494 + }, + { + "auxiliary_loss_clip": 0.01098605, + "auxiliary_loss_mlp": 0.01031442, + "balance_loss_clip": 1.02037311, + "balance_loss_mlp": 1.03382468, + "epoch": 0.9350969487449271, + "flos": 23070594792960.0, + "grad_norm": 2.1019402577622315, + "language_loss": 0.77461952, + "learning_rate": 4.3985952875155386e-08, + "loss": 0.79592001, + "num_input_tokens_seen": 335489970, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 15553, + "time_per_iteration": 3.8587379455566406 + }, + { + "auxiliary_loss_clip": 0.01101745, + "auxiliary_loss_mlp": 0.01034975, + "balance_loss_clip": 1.02267838, + "balance_loss_mlp": 1.03456163, + "epoch": 0.9351570719975951, + "flos": 18625177641600.0, + "grad_norm": 1.5007379284122981, + "language_loss": 0.78357017, + "learning_rate": 4.390475917613723e-08, + "loss": 0.80493736, + "num_input_tokens_seen": 335509125, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15554, + "time_per_iteration": 2.419851303100586 + }, + { + "auxiliary_loss_clip": 0.01093625, + "auxiliary_loss_mlp": 0.01026646, + "balance_loss_clip": 1.01656055, + "balance_loss_mlp": 1.03161645, + "epoch": 0.935217195250263, + "flos": 15888353702400.0, + "grad_norm": 1.7958483110944459, + "language_loss": 0.69293928, + "learning_rate": 4.382363965244695e-08, + "loss": 0.71414196, + "num_input_tokens_seen": 335525620, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.6171875, + "step": 15555, + "time_per_iteration": 3.920722007751465 + }, + { + "auxiliary_loss_clip": 0.01098789, + "auxiliary_loss_mlp": 0.01038798, + "balance_loss_clip": 1.02724099, + "balance_loss_mlp": 1.03382492, + "epoch": 0.935277318502931, + "flos": 24390312387840.0, + "grad_norm": 1.8860733218695758, + "language_loss": 0.7554931, + "learning_rate": 4.374259430715965e-08, + "loss": 0.776869, + "num_input_tokens_seen": 335547565, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15556, + "time_per_iteration": 3.848532199859619 + }, + { + "auxiliary_loss_clip": 0.01098399, + "auxiliary_loss_mlp": 0.01029598, + "balance_loss_clip": 1.01864231, + "balance_loss_mlp": 1.0332365, + "epoch": 0.935337441755599, + "flos": 27600259294080.0, + "grad_norm": 1.6008830857055418, + "language_loss": 0.72704911, + "learning_rate": 4.366162314334953e-08, + "loss": 0.74832916, + "num_input_tokens_seen": 335570285, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 15557, + "time_per_iteration": 2.486417770385742 + }, + { + "auxiliary_loss_clip": 0.01099803, + "auxiliary_loss_mlp": 0.01030957, + "balance_loss_clip": 1.01889324, + "balance_loss_mlp": 1.03413987, + "epoch": 0.935397565008267, + "flos": 20482872209280.0, + "grad_norm": 1.5664177118870293, + "language_loss": 0.63356799, + "learning_rate": 4.358072616408681e-08, + "loss": 0.65487558, + "num_input_tokens_seen": 335588600, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 15558, + "time_per_iteration": 2.415761709213257 + }, + { + "auxiliary_loss_clip": 0.01099528, + "auxiliary_loss_mlp": 0.01026906, + "balance_loss_clip": 1.0146395, + "balance_loss_mlp": 1.03434324, + "epoch": 0.9354576882609349, + "flos": 23654394541440.0, + "grad_norm": 1.8958255236053232, + "language_loss": 0.73185015, + "learning_rate": 4.34999033724388e-08, + "loss": 0.75311458, + "num_input_tokens_seen": 335606235, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.65234375, + "step": 15559, + "time_per_iteration": 2.4690446853637695 + }, + { + "auxiliary_loss_clip": 0.01096312, + "auxiliary_loss_mlp": 0.01025743, + "balance_loss_clip": 1.01587224, + "balance_loss_mlp": 1.03340673, + "epoch": 0.9355178115136029, + "flos": 36684904406400.0, + "grad_norm": 1.6349606783173563, + "language_loss": 0.63386834, + "learning_rate": 4.341915477147062e-08, + "loss": 0.6550889, + "num_input_tokens_seen": 335628240, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.62890625, + "step": 15560, + "time_per_iteration": 4.014149188995361 + }, + { + "auxiliary_loss_clip": 0.01107301, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.01869702, + "balance_loss_mlp": 1.03587627, + "epoch": 0.9355779347662708, + "flos": 14460401450880.0, + "grad_norm": 2.2699289713088557, + "language_loss": 0.6402877, + "learning_rate": 4.3338480364244034e-08, + "loss": 0.66168469, + "num_input_tokens_seen": 335643755, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.71484375, + "step": 15561, + "time_per_iteration": 2.4404451847076416 + }, + { + "auxiliary_loss_clip": 0.0109953, + "auxiliary_loss_mlp": 0.01034464, + "balance_loss_clip": 1.02250171, + "balance_loss_mlp": 1.03558934, + "epoch": 0.9356380580189388, + "flos": 23185976256000.0, + "grad_norm": 1.676704048305052, + "language_loss": 0.7533828, + "learning_rate": 4.325788015381859e-08, + "loss": 0.77472275, + "num_input_tokens_seen": 335665160, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 15562, + "time_per_iteration": 2.477750062942505 + }, + { + "auxiliary_loss_clip": 0.01021581, + "auxiliary_loss_mlp": 0.01002822, + "balance_loss_clip": 1.00183833, + "balance_loss_mlp": 1.00135922, + "epoch": 0.9356981812716068, + "flos": 67471626090240.0, + "grad_norm": 0.9485732180196381, + "language_loss": 0.62341046, + "learning_rate": 4.31773541432503e-08, + "loss": 0.64365447, + "num_input_tokens_seen": 335715240, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20214844, + "step": 15563, + "time_per_iteration": 2.8820064067840576 + }, + { + "auxiliary_loss_clip": 0.01096826, + "auxiliary_loss_mlp": 0.01032818, + "balance_loss_clip": 1.02134371, + "balance_loss_mlp": 1.03330636, + "epoch": 0.9357583045242748, + "flos": 24681619687680.0, + "grad_norm": 1.6269402183292514, + "language_loss": 0.78099597, + "learning_rate": 4.3096902335592714e-08, + "loss": 0.80229235, + "num_input_tokens_seen": 335734970, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 15564, + "time_per_iteration": 2.534823179244995 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01030752, + "balance_loss_clip": 1.01794863, + "balance_loss_mlp": 1.03237152, + "epoch": 0.9358184277769427, + "flos": 19463727623040.0, + "grad_norm": 1.7253317771488292, + "language_loss": 0.77913517, + "learning_rate": 4.301652473389694e-08, + "loss": 0.80043793, + "num_input_tokens_seen": 335753435, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.671875, + "step": 15565, + "time_per_iteration": 2.5161406993865967 + }, + { + "auxiliary_loss_clip": 0.01096793, + "auxiliary_loss_mlp": 0.01028269, + "balance_loss_clip": 1.01708746, + "balance_loss_mlp": 1.03307271, + "epoch": 0.9358785510296107, + "flos": 18916987731840.0, + "grad_norm": 2.201571134933277, + "language_loss": 0.72346658, + "learning_rate": 4.2936221341210774e-08, + "loss": 0.74471718, + "num_input_tokens_seen": 335772105, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.63671875, + "step": 15566, + "time_per_iteration": 2.4636313915252686 + }, + { + "auxiliary_loss_clip": 0.01098024, + "auxiliary_loss_mlp": 0.01028693, + "balance_loss_clip": 1.0176307, + "balance_loss_mlp": 1.03258288, + "epoch": 0.9359386742822787, + "flos": 23441265192960.0, + "grad_norm": 1.7833005232055914, + "language_loss": 0.67558104, + "learning_rate": 4.285599216057889e-08, + "loss": 0.69684815, + "num_input_tokens_seen": 335789125, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15567, + "time_per_iteration": 2.476928234100342 + }, + { + "auxiliary_loss_clip": 0.01100526, + "auxiliary_loss_mlp": 0.01031885, + "balance_loss_clip": 1.02041149, + "balance_loss_mlp": 1.03555894, + "epoch": 0.9359987975349466, + "flos": 32744067557760.0, + "grad_norm": 1.8979470567476942, + "language_loss": 0.62194836, + "learning_rate": 4.277583719504418e-08, + "loss": 0.64327252, + "num_input_tokens_seen": 335810995, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15568, + "time_per_iteration": 2.5609068870544434 + }, + { + "auxiliary_loss_clip": 0.01097511, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.02045119, + "balance_loss_mlp": 1.03235245, + "epoch": 0.9360589207876147, + "flos": 22819651401600.0, + "grad_norm": 1.8186215129656738, + "language_loss": 0.78508359, + "learning_rate": 4.269575644764556e-08, + "loss": 0.80637741, + "num_input_tokens_seen": 335830580, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15569, + "time_per_iteration": 2.534830093383789 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.01031131, + "balance_loss_clip": 1.01937127, + "balance_loss_mlp": 1.03493094, + "epoch": 0.9361190440402826, + "flos": 20885251340160.0, + "grad_norm": 2.2083929656127816, + "language_loss": 0.69096726, + "learning_rate": 4.261574992142014e-08, + "loss": 0.71229541, + "num_input_tokens_seen": 335846515, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 15570, + "time_per_iteration": 2.446850299835205 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.01032193, + "balance_loss_clip": 1.02060008, + "balance_loss_mlp": 1.032758, + "epoch": 0.9361791672929506, + "flos": 19317822577920.0, + "grad_norm": 1.7500529924071564, + "language_loss": 0.78419554, + "learning_rate": 4.2535817619401726e-08, + "loss": 0.80550903, + "num_input_tokens_seen": 335863350, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15571, + "time_per_iteration": 2.527392864227295 + }, + { + "auxiliary_loss_clip": 0.01098413, + "auxiliary_loss_mlp": 0.01030272, + "balance_loss_clip": 1.01873899, + "balance_loss_mlp": 1.03342748, + "epoch": 0.9362392905456185, + "flos": 15158182032000.0, + "grad_norm": 1.8544768580864697, + "language_loss": 0.77347147, + "learning_rate": 4.2455959544621224e-08, + "loss": 0.79475832, + "num_input_tokens_seen": 335880510, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 15572, + "time_per_iteration": 2.437767505645752 + }, + { + "auxiliary_loss_clip": 0.0109615, + "auxiliary_loss_mlp": 0.01035602, + "balance_loss_clip": 1.02450323, + "balance_loss_mlp": 1.03296947, + "epoch": 0.9362994137982865, + "flos": 22085888371200.0, + "grad_norm": 1.755875004895135, + "language_loss": 0.77844107, + "learning_rate": 4.237617570010688e-08, + "loss": 0.79975855, + "num_input_tokens_seen": 335899440, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 15573, + "time_per_iteration": 2.4989819526672363 + }, + { + "auxiliary_loss_clip": 0.01095269, + "auxiliary_loss_mlp": 0.01026582, + "balance_loss_clip": 1.01550794, + "balance_loss_mlp": 1.03273368, + "epoch": 0.9363595370509544, + "flos": 23512260424320.0, + "grad_norm": 1.629671028600837, + "language_loss": 0.74591202, + "learning_rate": 4.2296466088884044e-08, + "loss": 0.76713055, + "num_input_tokens_seen": 335919540, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.625, + "step": 15574, + "time_per_iteration": 2.4767251014709473 + }, + { + "auxiliary_loss_clip": 0.01095997, + "auxiliary_loss_mlp": 0.01031256, + "balance_loss_clip": 1.01983523, + "balance_loss_mlp": 1.03279662, + "epoch": 0.9364196603036224, + "flos": 27123473139840.0, + "grad_norm": 1.8734211277341717, + "language_loss": 0.67999518, + "learning_rate": 4.221683071397564e-08, + "loss": 0.70126772, + "num_input_tokens_seen": 335939665, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 15575, + "time_per_iteration": 2.490079164505005 + }, + { + "auxiliary_loss_clip": 0.01096514, + "auxiliary_loss_mlp": 0.01029364, + "balance_loss_clip": 1.01773548, + "balance_loss_mlp": 1.03346181, + "epoch": 0.9364797835562904, + "flos": 18479057114880.0, + "grad_norm": 2.0740128119343484, + "language_loss": 0.65354764, + "learning_rate": 4.2137269578401026e-08, + "loss": 0.67480642, + "num_input_tokens_seen": 335958580, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 15576, + "time_per_iteration": 2.4461007118225098 + }, + { + "auxiliary_loss_clip": 0.01099627, + "auxiliary_loss_mlp": 0.01025994, + "balance_loss_clip": 1.01396561, + "balance_loss_mlp": 1.03255725, + "epoch": 0.9365399068089584, + "flos": 13005552890880.0, + "grad_norm": 2.366361816458, + "language_loss": 0.75638366, + "learning_rate": 4.2057782685177566e-08, + "loss": 0.77763987, + "num_input_tokens_seen": 335974965, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15577, + "time_per_iteration": 2.4425899982452393 + }, + { + "auxiliary_loss_clip": 0.01098342, + "auxiliary_loss_mlp": 0.01029652, + "balance_loss_clip": 1.01796925, + "balance_loss_mlp": 1.03240597, + "epoch": 0.9366000300616263, + "flos": 25666433850240.0, + "grad_norm": 2.7524540234782857, + "language_loss": 0.52199161, + "learning_rate": 4.1978370037318855e-08, + "loss": 0.54327154, + "num_input_tokens_seen": 335996575, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 15578, + "time_per_iteration": 2.5211164951324463 + }, + { + "auxiliary_loss_clip": 0.01098227, + "auxiliary_loss_mlp": 0.01030924, + "balance_loss_clip": 1.02001023, + "balance_loss_mlp": 1.03396976, + "epoch": 0.9366601533142943, + "flos": 21433355948160.0, + "grad_norm": 1.560613386304835, + "language_loss": 0.70552897, + "learning_rate": 4.189903163783692e-08, + "loss": 0.72682047, + "num_input_tokens_seen": 336017265, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 15579, + "time_per_iteration": 2.52437424659729 + }, + { + "auxiliary_loss_clip": 0.01097221, + "auxiliary_loss_mlp": 0.0102686, + "balance_loss_clip": 1.01600599, + "balance_loss_mlp": 1.03381634, + "epoch": 0.9367202765669622, + "flos": 24093222998400.0, + "grad_norm": 1.8413182351344008, + "language_loss": 0.76279169, + "learning_rate": 4.181976748973959e-08, + "loss": 0.78403246, + "num_input_tokens_seen": 336035905, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15580, + "time_per_iteration": 2.4710912704467773 + }, + { + "auxiliary_loss_clip": 0.01104329, + "auxiliary_loss_mlp": 0.0102788, + "balance_loss_clip": 1.0158999, + "balance_loss_mlp": 1.03657007, + "epoch": 0.9367803998196302, + "flos": 20888842700160.0, + "grad_norm": 2.694112745852956, + "language_loss": 0.66185987, + "learning_rate": 4.1740577596033114e-08, + "loss": 0.68318188, + "num_input_tokens_seen": 336055585, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 15581, + "time_per_iteration": 2.470471143722534 + }, + { + "auxiliary_loss_clip": 0.01100513, + "auxiliary_loss_mlp": 0.01027313, + "balance_loss_clip": 1.01533818, + "balance_loss_mlp": 1.0348171, + "epoch": 0.9368405230722983, + "flos": 22564362464640.0, + "grad_norm": 1.5158338005661471, + "language_loss": 0.76600075, + "learning_rate": 4.166146195972042e-08, + "loss": 0.78727901, + "num_input_tokens_seen": 336076695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 15582, + "time_per_iteration": 2.488633632659912 + }, + { + "auxiliary_loss_clip": 0.01098016, + "auxiliary_loss_mlp": 0.01032851, + "balance_loss_clip": 1.02078104, + "balance_loss_mlp": 1.03378606, + "epoch": 0.9369006463249662, + "flos": 18880215183360.0, + "grad_norm": 1.7421816786299127, + "language_loss": 0.73751408, + "learning_rate": 4.1582420583800905e-08, + "loss": 0.7588228, + "num_input_tokens_seen": 336094740, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.640625, + "step": 15583, + "time_per_iteration": 2.494215965270996 + }, + { + "auxiliary_loss_clip": 0.0110384, + "auxiliary_loss_mlp": 0.01030693, + "balance_loss_clip": 1.01834249, + "balance_loss_mlp": 1.03549671, + "epoch": 0.9369607695776342, + "flos": 26432516142720.0, + "grad_norm": 2.4708375978600583, + "language_loss": 0.84226978, + "learning_rate": 4.1503453471272376e-08, + "loss": 0.8636151, + "num_input_tokens_seen": 336113985, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15584, + "time_per_iteration": 2.5331246852874756 + }, + { + "auxiliary_loss_clip": 0.01103426, + "auxiliary_loss_mlp": 0.01034145, + "balance_loss_clip": 1.02168131, + "balance_loss_mlp": 1.03460801, + "epoch": 0.9370208928303021, + "flos": 39567346081920.0, + "grad_norm": 1.582838620482487, + "language_loss": 0.72438812, + "learning_rate": 4.1424560625129334e-08, + "loss": 0.7457639, + "num_input_tokens_seen": 336136395, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 15585, + "time_per_iteration": 2.6119656562805176 + }, + { + "auxiliary_loss_clip": 0.01094739, + "auxiliary_loss_mlp": 0.01025263, + "balance_loss_clip": 1.01492119, + "balance_loss_mlp": 1.0317564, + "epoch": 0.9370810160829701, + "flos": 22963114321920.0, + "grad_norm": 1.8473682533271836, + "language_loss": 0.80436736, + "learning_rate": 4.134574204836316e-08, + "loss": 0.82556736, + "num_input_tokens_seen": 336156345, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 15586, + "time_per_iteration": 2.484668493270874 + }, + { + "auxiliary_loss_clip": 0.01098095, + "auxiliary_loss_mlp": 0.01030395, + "balance_loss_clip": 1.01892114, + "balance_loss_mlp": 1.03355002, + "epoch": 0.937141139335638, + "flos": 23075048079360.0, + "grad_norm": 1.689751969140814, + "language_loss": 0.76728654, + "learning_rate": 4.126699774396258e-08, + "loss": 0.78857148, + "num_input_tokens_seen": 336176760, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15587, + "time_per_iteration": 2.463247299194336 + }, + { + "auxiliary_loss_clip": 0.01102433, + "auxiliary_loss_mlp": 0.01027962, + "balance_loss_clip": 1.01624346, + "balance_loss_mlp": 1.03459477, + "epoch": 0.937201262588306, + "flos": 16356664247040.0, + "grad_norm": 1.8161578272340377, + "language_loss": 0.87579244, + "learning_rate": 4.118832771491387e-08, + "loss": 0.8970964, + "num_input_tokens_seen": 336193285, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6796875, + "step": 15588, + "time_per_iteration": 2.412489891052246 + }, + { + "auxiliary_loss_clip": 0.01095862, + "auxiliary_loss_mlp": 0.01027769, + "balance_loss_clip": 1.0171299, + "balance_loss_mlp": 1.03373146, + "epoch": 0.937261385840974, + "flos": 20194078861440.0, + "grad_norm": 1.564245116069396, + "language_loss": 0.78160763, + "learning_rate": 4.11097319642002e-08, + "loss": 0.80284393, + "num_input_tokens_seen": 336211425, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 15589, + "time_per_iteration": 2.445446014404297 + }, + { + "auxiliary_loss_clip": 0.01097837, + "auxiliary_loss_mlp": 0.01029357, + "balance_loss_clip": 1.01799083, + "balance_loss_mlp": 1.03447211, + "epoch": 0.937321509093642, + "flos": 18295948558080.0, + "grad_norm": 1.7350020828956296, + "language_loss": 0.77957153, + "learning_rate": 4.103121049480163e-08, + "loss": 0.80084348, + "num_input_tokens_seen": 336230205, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6328125, + "step": 15590, + "time_per_iteration": 2.428891897201538 + }, + { + "auxiliary_loss_clip": 0.01102134, + "auxiliary_loss_mlp": 0.01036092, + "balance_loss_clip": 1.0234977, + "balance_loss_mlp": 1.0337714, + "epoch": 0.9373816323463099, + "flos": 25884662929920.0, + "grad_norm": 1.8230621785176295, + "language_loss": 0.71332479, + "learning_rate": 4.095276330969577e-08, + "loss": 0.734707, + "num_input_tokens_seen": 336252440, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 15591, + "time_per_iteration": 2.4772777557373047 + }, + { + "auxiliary_loss_clip": 0.0110454, + "auxiliary_loss_mlp": 0.01033725, + "balance_loss_clip": 1.02017736, + "balance_loss_mlp": 1.03551292, + "epoch": 0.9374417555989779, + "flos": 27198849830400.0, + "grad_norm": 2.4539812821025895, + "language_loss": 0.54102397, + "learning_rate": 4.0874390411857804e-08, + "loss": 0.56240666, + "num_input_tokens_seen": 336273845, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.69140625, + "step": 15592, + "time_per_iteration": 2.541588068008423 + }, + { + "auxiliary_loss_clip": 0.01098357, + "auxiliary_loss_mlp": 0.01026018, + "balance_loss_clip": 1.01555693, + "balance_loss_mlp": 1.03418398, + "epoch": 0.9375018788516458, + "flos": 23621249266560.0, + "grad_norm": 3.3346969261937245, + "language_loss": 0.67238343, + "learning_rate": 4.0796091804259136e-08, + "loss": 0.69362718, + "num_input_tokens_seen": 336292790, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15593, + "time_per_iteration": 2.4413111209869385 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.010267, + "balance_loss_clip": 1.01530337, + "balance_loss_mlp": 1.03389668, + "epoch": 0.9375620021043138, + "flos": 22678774260480.0, + "grad_norm": 1.4835867748670866, + "language_loss": 0.74052262, + "learning_rate": 4.0717867489868715e-08, + "loss": 0.76178527, + "num_input_tokens_seen": 336312600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 15594, + "time_per_iteration": 2.497950315475464 + }, + { + "auxiliary_loss_clip": 0.0109474, + "auxiliary_loss_mlp": 0.01027543, + "balance_loss_clip": 1.01693356, + "balance_loss_mlp": 1.03231061, + "epoch": 0.9376221253569819, + "flos": 27560254521600.0, + "grad_norm": 1.7070302081902384, + "language_loss": 0.73724419, + "learning_rate": 4.063971747165351e-08, + "loss": 0.75846702, + "num_input_tokens_seen": 336332770, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 15595, + "time_per_iteration": 3.9548685550689697 + }, + { + "auxiliary_loss_clip": 0.01099741, + "auxiliary_loss_mlp": 0.01025968, + "balance_loss_clip": 1.01471496, + "balance_loss_mlp": 1.03330636, + "epoch": 0.9376822486096498, + "flos": 24129887806080.0, + "grad_norm": 1.9094131028649322, + "language_loss": 0.76069069, + "learning_rate": 4.056164175257626e-08, + "loss": 0.78194779, + "num_input_tokens_seen": 336351445, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15596, + "time_per_iteration": 2.472580671310425 + }, + { + "auxiliary_loss_clip": 0.0109939, + "auxiliary_loss_mlp": 0.01031065, + "balance_loss_clip": 1.01964438, + "balance_loss_mlp": 1.033849, + "epoch": 0.9377423718623178, + "flos": 22784028088320.0, + "grad_norm": 1.6688470570241747, + "language_loss": 0.78528333, + "learning_rate": 4.0483640335597926e-08, + "loss": 0.80658782, + "num_input_tokens_seen": 336368690, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 15597, + "time_per_iteration": 3.8680100440979004 + }, + { + "auxiliary_loss_clip": 0.01102727, + "auxiliary_loss_mlp": 0.01030591, + "balance_loss_clip": 1.01869369, + "balance_loss_mlp": 1.03471541, + "epoch": 0.9378024951149857, + "flos": 19168900790400.0, + "grad_norm": 1.8551083723100676, + "language_loss": 0.81072772, + "learning_rate": 4.0405713223676363e-08, + "loss": 0.83206093, + "num_input_tokens_seen": 336388165, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 15598, + "time_per_iteration": 3.8231723308563232 + }, + { + "auxiliary_loss_clip": 0.01104728, + "auxiliary_loss_mlp": 0.01031174, + "balance_loss_clip": 1.01905012, + "balance_loss_mlp": 1.03351772, + "epoch": 0.9378626183676537, + "flos": 23505508667520.0, + "grad_norm": 1.7860005158481418, + "language_loss": 0.63344586, + "learning_rate": 4.0327860419766994e-08, + "loss": 0.65480494, + "num_input_tokens_seen": 336406475, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.7109375, + "step": 15599, + "time_per_iteration": 2.4853222370147705 + }, + { + "auxiliary_loss_clip": 0.01100601, + "auxiliary_loss_mlp": 0.01030096, + "balance_loss_clip": 1.01851487, + "balance_loss_mlp": 1.03380525, + "epoch": 0.9379227416203216, + "flos": 18405655672320.0, + "grad_norm": 1.6427203157979469, + "language_loss": 0.73457086, + "learning_rate": 4.0250081926821e-08, + "loss": 0.75587785, + "num_input_tokens_seen": 336424690, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66796875, + "step": 15600, + "time_per_iteration": 2.4055838584899902 + }, + { + "auxiliary_loss_clip": 0.01097706, + "auxiliary_loss_mlp": 0.0102731, + "balance_loss_clip": 1.01668882, + "balance_loss_mlp": 1.03379583, + "epoch": 0.9379828648729897, + "flos": 17821855923840.0, + "grad_norm": 1.82264927435843, + "language_loss": 0.69327891, + "learning_rate": 4.0172377747788474e-08, + "loss": 0.71452916, + "num_input_tokens_seen": 336443055, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.640625, + "step": 15601, + "time_per_iteration": 2.4290764331817627 + }, + { + "auxiliary_loss_clip": 0.01021583, + "auxiliary_loss_mlp": 0.01004526, + "balance_loss_clip": 1.0034945, + "balance_loss_mlp": 1.00159645, + "epoch": 0.9380429881256576, + "flos": 68024399466240.0, + "grad_norm": 0.756115695258228, + "language_loss": 0.58134079, + "learning_rate": 4.009474788561573e-08, + "loss": 0.60160184, + "num_input_tokens_seen": 336510190, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.19921875, + "step": 15602, + "time_per_iteration": 4.649659156799316 + }, + { + "auxiliary_loss_clip": 0.01100223, + "auxiliary_loss_mlp": 0.01034231, + "balance_loss_clip": 1.02322853, + "balance_loss_mlp": 1.03378415, + "epoch": 0.9381031113783256, + "flos": 20776980769920.0, + "grad_norm": 2.0216671165230022, + "language_loss": 0.71774584, + "learning_rate": 4.001719234324663e-08, + "loss": 0.73909038, + "num_input_tokens_seen": 336529250, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 15603, + "time_per_iteration": 2.439192295074463 + }, + { + "auxiliary_loss_clip": 0.01091613, + "auxiliary_loss_mlp": 0.01025078, + "balance_loss_clip": 1.01448584, + "balance_loss_mlp": 1.03135061, + "epoch": 0.9381632346309935, + "flos": 19025078734080.0, + "grad_norm": 1.5724913358336257, + "language_loss": 0.7588923, + "learning_rate": 3.993971112362171e-08, + "loss": 0.78005922, + "num_input_tokens_seen": 336548530, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6015625, + "step": 15604, + "time_per_iteration": 2.430049419403076 + }, + { + "auxiliary_loss_clip": 0.01101631, + "auxiliary_loss_mlp": 0.01029293, + "balance_loss_clip": 1.01753354, + "balance_loss_mlp": 1.03493166, + "epoch": 0.9382233578836615, + "flos": 23513840622720.0, + "grad_norm": 1.820480005637361, + "language_loss": 0.65220332, + "learning_rate": 3.9862304229679734e-08, + "loss": 0.67351258, + "num_input_tokens_seen": 336568510, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 15605, + "time_per_iteration": 2.454102039337158 + }, + { + "auxiliary_loss_clip": 0.01103599, + "auxiliary_loss_mlp": 0.01032486, + "balance_loss_clip": 1.02008247, + "balance_loss_mlp": 1.03473902, + "epoch": 0.9382834811363294, + "flos": 43067882016000.0, + "grad_norm": 1.693925168028821, + "language_loss": 0.67501086, + "learning_rate": 3.9784971664355683e-08, + "loss": 0.69637167, + "num_input_tokens_seen": 336592020, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6875, + "step": 15606, + "time_per_iteration": 2.6222922801971436 + }, + { + "auxiliary_loss_clip": 0.01093903, + "auxiliary_loss_mlp": 0.01026117, + "balance_loss_clip": 1.01541805, + "balance_loss_mlp": 1.03187966, + "epoch": 0.9383436043889974, + "flos": 16436242828800.0, + "grad_norm": 1.7830574782726436, + "language_loss": 0.77636516, + "learning_rate": 3.970771343058166e-08, + "loss": 0.79756534, + "num_input_tokens_seen": 336610010, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62109375, + "step": 15607, + "time_per_iteration": 2.437866449356079 + }, + { + "auxiliary_loss_clip": 0.01099714, + "auxiliary_loss_mlp": 0.01027386, + "balance_loss_clip": 1.01631784, + "balance_loss_mlp": 1.03343678, + "epoch": 0.9384037276416655, + "flos": 20740603271040.0, + "grad_norm": 1.7371260163704862, + "language_loss": 0.82830989, + "learning_rate": 3.963052953128776e-08, + "loss": 0.84958094, + "num_input_tokens_seen": 336628520, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 15608, + "time_per_iteration": 2.4567601680755615 + }, + { + "auxiliary_loss_clip": 0.01104286, + "auxiliary_loss_mlp": 0.01033221, + "balance_loss_clip": 1.02112722, + "balance_loss_mlp": 1.03803909, + "epoch": 0.9384638508943334, + "flos": 19062677295360.0, + "grad_norm": 1.768980472763552, + "language_loss": 0.68811715, + "learning_rate": 3.9553419969400536e-08, + "loss": 0.70949221, + "num_input_tokens_seen": 336647365, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15609, + "time_per_iteration": 2.4339687824249268 + }, + { + "auxiliary_loss_clip": 0.01101203, + "auxiliary_loss_mlp": 0.01027536, + "balance_loss_clip": 1.01521575, + "balance_loss_mlp": 1.0328238, + "epoch": 0.9385239741470014, + "flos": 23404887694080.0, + "grad_norm": 1.968005818386474, + "language_loss": 0.75119251, + "learning_rate": 3.9476384747844316e-08, + "loss": 0.77247989, + "num_input_tokens_seen": 336667165, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.68359375, + "step": 15610, + "time_per_iteration": 2.455913543701172 + }, + { + "auxiliary_loss_clip": 0.01101386, + "auxiliary_loss_mlp": 0.01026005, + "balance_loss_clip": 1.01504338, + "balance_loss_mlp": 1.0345664, + "epoch": 0.9385840973996693, + "flos": 12824742804480.0, + "grad_norm": 1.7573633557024793, + "language_loss": 0.74986607, + "learning_rate": 3.939942386953987e-08, + "loss": 0.77113998, + "num_input_tokens_seen": 336684130, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66796875, + "step": 15611, + "time_per_iteration": 2.4424426555633545 + }, + { + "auxiliary_loss_clip": 0.01099404, + "auxiliary_loss_mlp": 0.01028523, + "balance_loss_clip": 1.01703119, + "balance_loss_mlp": 1.03506732, + "epoch": 0.9386442206523373, + "flos": 15486980152320.0, + "grad_norm": 2.1565540073741447, + "language_loss": 0.65710843, + "learning_rate": 3.9322537337405756e-08, + "loss": 0.67838764, + "num_input_tokens_seen": 336701520, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 15612, + "time_per_iteration": 2.460383892059326 + }, + { + "auxiliary_loss_clip": 0.01096532, + "auxiliary_loss_mlp": 0.01026706, + "balance_loss_clip": 1.01542926, + "balance_loss_mlp": 1.03311181, + "epoch": 0.9387043439050052, + "flos": 21178821196800.0, + "grad_norm": 1.7800423246546628, + "language_loss": 0.57413054, + "learning_rate": 3.924572515435742e-08, + "loss": 0.5953629, + "num_input_tokens_seen": 336720675, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 15613, + "time_per_iteration": 2.4363303184509277 + }, + { + "auxiliary_loss_clip": 0.01098477, + "auxiliary_loss_mlp": 0.01032371, + "balance_loss_clip": 1.02088487, + "balance_loss_mlp": 1.03223801, + "epoch": 0.9387644671576733, + "flos": 27668273696640.0, + "grad_norm": 3.022339916598047, + "language_loss": 0.70700508, + "learning_rate": 3.916898732330764e-08, + "loss": 0.72831357, + "num_input_tokens_seen": 336741005, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15614, + "time_per_iteration": 2.5284812450408936 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01030689, + "balance_loss_clip": 1.01882124, + "balance_loss_mlp": 1.03525591, + "epoch": 0.9388245904103412, + "flos": 18836331742080.0, + "grad_norm": 1.7543973322908877, + "language_loss": 0.81266332, + "learning_rate": 3.9092323847166544e-08, + "loss": 0.83400273, + "num_input_tokens_seen": 336757990, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 15615, + "time_per_iteration": 2.423703193664551 + }, + { + "auxiliary_loss_clip": 0.01097442, + "auxiliary_loss_mlp": 0.01025593, + "balance_loss_clip": 1.01468527, + "balance_loss_mlp": 1.03362358, + "epoch": 0.9388847136630092, + "flos": 25483828083840.0, + "grad_norm": 1.5887485146879645, + "language_loss": 0.71745086, + "learning_rate": 3.901573472884134e-08, + "loss": 0.7386812, + "num_input_tokens_seen": 336777705, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 15616, + "time_per_iteration": 2.493049144744873 + }, + { + "auxiliary_loss_clip": 0.01100207, + "auxiliary_loss_mlp": 0.01027907, + "balance_loss_clip": 1.01633799, + "balance_loss_mlp": 1.03520691, + "epoch": 0.9389448369156771, + "flos": 18734992496640.0, + "grad_norm": 1.8635102246300295, + "language_loss": 0.66588014, + "learning_rate": 3.89392199712355e-08, + "loss": 0.68716127, + "num_input_tokens_seen": 336798275, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 15617, + "time_per_iteration": 2.433169364929199 + }, + { + "auxiliary_loss_clip": 0.01104549, + "auxiliary_loss_mlp": 0.0103554, + "balance_loss_clip": 1.02229548, + "balance_loss_mlp": 1.03535593, + "epoch": 0.9390049601683451, + "flos": 21717839664000.0, + "grad_norm": 2.1232753256513264, + "language_loss": 0.73530006, + "learning_rate": 3.886277957725092e-08, + "loss": 0.75670093, + "num_input_tokens_seen": 336813835, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.69140625, + "step": 15618, + "time_per_iteration": 2.4792399406433105 + }, + { + "auxiliary_loss_clip": 0.01103237, + "auxiliary_loss_mlp": 0.01029258, + "balance_loss_clip": 1.01622224, + "balance_loss_mlp": 1.03472626, + "epoch": 0.939065083421013, + "flos": 19391224020480.0, + "grad_norm": 1.9955954128383109, + "language_loss": 0.70013475, + "learning_rate": 3.878641354978662e-08, + "loss": 0.72145975, + "num_input_tokens_seen": 336832210, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.68359375, + "step": 15619, + "time_per_iteration": 2.449866533279419 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.0102901, + "balance_loss_clip": 1.01731527, + "balance_loss_mlp": 1.03438771, + "epoch": 0.939125206673681, + "flos": 24681511946880.0, + "grad_norm": 1.6070276908213748, + "language_loss": 0.77566183, + "learning_rate": 3.8710121891737834e-08, + "loss": 0.79694998, + "num_input_tokens_seen": 336851380, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 15620, + "time_per_iteration": 2.4847350120544434 + }, + { + "auxiliary_loss_clip": 0.01095352, + "auxiliary_loss_mlp": 0.01028465, + "balance_loss_clip": 1.01688337, + "balance_loss_mlp": 1.03205025, + "epoch": 0.9391853299263491, + "flos": 16325961096960.0, + "grad_norm": 2.459938458959684, + "language_loss": 0.73743159, + "learning_rate": 3.8633904605998025e-08, + "loss": 0.75866973, + "num_input_tokens_seen": 336868525, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6328125, + "step": 15621, + "time_per_iteration": 2.4089574813842773 + }, + { + "auxiliary_loss_clip": 0.01104801, + "auxiliary_loss_mlp": 0.01034061, + "balance_loss_clip": 1.02192533, + "balance_loss_mlp": 1.03624845, + "epoch": 0.939245453179017, + "flos": 11655778590720.0, + "grad_norm": 1.931241274628396, + "language_loss": 0.66069001, + "learning_rate": 3.855776169545688e-08, + "loss": 0.6820786, + "num_input_tokens_seen": 336886200, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6875, + "step": 15622, + "time_per_iteration": 2.4193296432495117 + }, + { + "auxiliary_loss_clip": 0.01096904, + "auxiliary_loss_mlp": 0.01027358, + "balance_loss_clip": 1.01664114, + "balance_loss_mlp": 1.03303981, + "epoch": 0.939305576431685, + "flos": 23148700917120.0, + "grad_norm": 1.5757601790448577, + "language_loss": 0.71780264, + "learning_rate": 3.848169316300209e-08, + "loss": 0.73904526, + "num_input_tokens_seen": 336905815, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.640625, + "step": 15623, + "time_per_iteration": 2.4576759338378906 + }, + { + "auxiliary_loss_clip": 0.01103573, + "auxiliary_loss_mlp": 0.01028485, + "balance_loss_clip": 1.01704717, + "balance_loss_mlp": 1.03622472, + "epoch": 0.9393656996843529, + "flos": 33287790706560.0, + "grad_norm": 1.8246277533972777, + "language_loss": 0.72611034, + "learning_rate": 3.84056990115178e-08, + "loss": 0.74743092, + "num_input_tokens_seen": 336928460, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 15624, + "time_per_iteration": 2.574350357055664 + }, + { + "auxiliary_loss_clip": 0.01097672, + "auxiliary_loss_mlp": 0.01029458, + "balance_loss_clip": 1.01789486, + "balance_loss_mlp": 1.03316939, + "epoch": 0.9394258229370209, + "flos": 21689434984320.0, + "grad_norm": 1.781484108648701, + "language_loss": 0.89487529, + "learning_rate": 3.832977924388614e-08, + "loss": 0.91614664, + "num_input_tokens_seen": 336948320, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 15625, + "time_per_iteration": 2.441397190093994 + }, + { + "auxiliary_loss_clip": 0.01099705, + "auxiliary_loss_mlp": 0.01031299, + "balance_loss_clip": 1.01952124, + "balance_loss_mlp": 1.03396618, + "epoch": 0.9394859461896888, + "flos": 23874203819520.0, + "grad_norm": 2.0992089201785076, + "language_loss": 0.83631927, + "learning_rate": 3.825393386298592e-08, + "loss": 0.8576293, + "num_input_tokens_seen": 336967670, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 15626, + "time_per_iteration": 2.4755821228027344 + }, + { + "auxiliary_loss_clip": 0.01021753, + "auxiliary_loss_mlp": 0.00999666, + "balance_loss_clip": 0.9987244, + "balance_loss_mlp": 1.00174892, + "epoch": 0.9395460694423569, + "flos": 61566116993280.0, + "grad_norm": 0.77489009158345, + "language_loss": 0.56156707, + "learning_rate": 3.8178162871693284e-08, + "loss": 0.58178127, + "num_input_tokens_seen": 337028395, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20019531, + "step": 15627, + "time_per_iteration": 2.99603271484375 + }, + { + "auxiliary_loss_clip": 0.01099008, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.018857, + "balance_loss_mlp": 1.03538132, + "epoch": 0.9396061926950248, + "flos": 20995712640000.0, + "grad_norm": 2.0619040922796605, + "language_loss": 0.69850802, + "learning_rate": 3.810246627288105e-08, + "loss": 0.71979451, + "num_input_tokens_seen": 337048150, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 15628, + "time_per_iteration": 2.4771571159362793 + }, + { + "auxiliary_loss_clip": 0.01098362, + "auxiliary_loss_mlp": 0.01029035, + "balance_loss_clip": 1.01757264, + "balance_loss_mlp": 1.03369975, + "epoch": 0.9396663159476928, + "flos": 27487786832640.0, + "grad_norm": 1.5748669806960962, + "language_loss": 0.75526696, + "learning_rate": 3.8026844069420025e-08, + "loss": 0.77654099, + "num_input_tokens_seen": 337069315, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15629, + "time_per_iteration": 2.484584331512451 + }, + { + "auxiliary_loss_clip": 0.01095519, + "auxiliary_loss_mlp": 0.01027671, + "balance_loss_clip": 1.01682913, + "balance_loss_mlp": 1.03342628, + "epoch": 0.9397264392003607, + "flos": 19427457864960.0, + "grad_norm": 1.7575351495605849, + "language_loss": 0.74100959, + "learning_rate": 3.795129626417748e-08, + "loss": 0.76224142, + "num_input_tokens_seen": 337087765, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62109375, + "step": 15630, + "time_per_iteration": 2.438732862472534 + }, + { + "auxiliary_loss_clip": 0.01095471, + "auxiliary_loss_mlp": 0.01028302, + "balance_loss_clip": 1.01762676, + "balance_loss_mlp": 1.0336659, + "epoch": 0.9397865624530287, + "flos": 18004820826240.0, + "grad_norm": 1.9628728384394338, + "language_loss": 0.69608629, + "learning_rate": 3.787582286001845e-08, + "loss": 0.71732402, + "num_input_tokens_seen": 337106265, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6171875, + "step": 15631, + "time_per_iteration": 2.442594289779663 + }, + { + "auxiliary_loss_clip": 0.01098903, + "auxiliary_loss_mlp": 0.01033034, + "balance_loss_clip": 1.02210879, + "balance_loss_mlp": 1.03457558, + "epoch": 0.9398466857056966, + "flos": 22564613859840.0, + "grad_norm": 1.6711410965804296, + "language_loss": 0.7501359, + "learning_rate": 3.7800423859805086e-08, + "loss": 0.77145523, + "num_input_tokens_seen": 337126090, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 15632, + "time_per_iteration": 2.468679428100586 + }, + { + "auxiliary_loss_clip": 0.01105333, + "auxiliary_loss_mlp": 0.01032075, + "balance_loss_clip": 1.01908159, + "balance_loss_mlp": 1.03677893, + "epoch": 0.9399068089583646, + "flos": 24535678728960.0, + "grad_norm": 1.544123558907395, + "language_loss": 0.7436294, + "learning_rate": 3.772509926639622e-08, + "loss": 0.76500344, + "num_input_tokens_seen": 337145655, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.6875, + "step": 15633, + "time_per_iteration": 2.484532594680786 + }, + { + "auxiliary_loss_clip": 0.01101475, + "auxiliary_loss_mlp": 0.01035563, + "balance_loss_clip": 1.0230515, + "balance_loss_mlp": 1.03425372, + "epoch": 0.9399669322110327, + "flos": 25630343660160.0, + "grad_norm": 1.9390816781204983, + "language_loss": 0.72402227, + "learning_rate": 3.764984908264823e-08, + "loss": 0.74539268, + "num_input_tokens_seen": 337164805, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 15634, + "time_per_iteration": 2.5017216205596924 + }, + { + "auxiliary_loss_clip": 0.01101172, + "auxiliary_loss_mlp": 0.01029764, + "balance_loss_clip": 1.017694, + "balance_loss_mlp": 1.03352332, + "epoch": 0.9400270554637006, + "flos": 17089385783040.0, + "grad_norm": 1.8378932656538167, + "language_loss": 0.689273, + "learning_rate": 3.75746733114144e-08, + "loss": 0.71058238, + "num_input_tokens_seen": 337182280, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.67578125, + "step": 15635, + "time_per_iteration": 2.422240972518921 + }, + { + "auxiliary_loss_clip": 0.01096959, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01565313, + "balance_loss_mlp": 1.0343622, + "epoch": 0.9400871787163686, + "flos": 22055113393920.0, + "grad_norm": 1.6117184105576439, + "language_loss": 0.74286044, + "learning_rate": 3.7499571955545985e-08, + "loss": 0.76409698, + "num_input_tokens_seen": 337203495, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 15636, + "time_per_iteration": 2.4935927391052246 + }, + { + "auxiliary_loss_clip": 0.01101255, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.01831651, + "balance_loss_mlp": 1.03481007, + "epoch": 0.9401473019690365, + "flos": 16982767238400.0, + "grad_norm": 1.952777040568534, + "language_loss": 0.82884896, + "learning_rate": 3.7424545017890054e-08, + "loss": 0.85016298, + "num_input_tokens_seen": 337220435, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15637, + "time_per_iteration": 3.8426685333251953 + }, + { + "auxiliary_loss_clip": 0.0109996, + "auxiliary_loss_mlp": 0.01028077, + "balance_loss_clip": 1.01624513, + "balance_loss_mlp": 1.03416872, + "epoch": 0.9402074252217045, + "flos": 19681956702720.0, + "grad_norm": 2.38782195804008, + "language_loss": 0.68863559, + "learning_rate": 3.7349592501292325e-08, + "loss": 0.70991588, + "num_input_tokens_seen": 337238095, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 15638, + "time_per_iteration": 2.4199607372283936 + }, + { + "auxiliary_loss_clip": 0.01094752, + "auxiliary_loss_mlp": 0.01035807, + "balance_loss_clip": 1.02552509, + "balance_loss_mlp": 1.0335573, + "epoch": 0.9402675484743724, + "flos": 24754302858240.0, + "grad_norm": 1.5928992567887847, + "language_loss": 0.84922618, + "learning_rate": 3.727471440859498e-08, + "loss": 0.8705318, + "num_input_tokens_seen": 337256645, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.609375, + "step": 15639, + "time_per_iteration": 5.2344276905059814 + }, + { + "auxiliary_loss_clip": 0.01097979, + "auxiliary_loss_mlp": 0.01025309, + "balance_loss_clip": 1.01428199, + "balance_loss_mlp": 1.03253627, + "epoch": 0.9403276717270405, + "flos": 25558630156800.0, + "grad_norm": 1.5027378140640861, + "language_loss": 0.78141928, + "learning_rate": 3.719991074263662e-08, + "loss": 0.80265212, + "num_input_tokens_seen": 337278360, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 15640, + "time_per_iteration": 2.494884729385376 + }, + { + "auxiliary_loss_clip": 0.01100943, + "auxiliary_loss_mlp": 0.01031446, + "balance_loss_clip": 1.01984107, + "balance_loss_mlp": 1.0335753, + "epoch": 0.9403877949797084, + "flos": 26689852154880.0, + "grad_norm": 1.5522382410230178, + "language_loss": 0.74184501, + "learning_rate": 3.7125181506254544e-08, + "loss": 0.76316881, + "num_input_tokens_seen": 337302480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 15641, + "time_per_iteration": 2.5522215366363525 + }, + { + "auxiliary_loss_clip": 0.01103462, + "auxiliary_loss_mlp": 0.01028087, + "balance_loss_clip": 1.01514721, + "balance_loss_mlp": 1.03437018, + "epoch": 0.9404479182323764, + "flos": 15011666455680.0, + "grad_norm": 1.8973680252603045, + "language_loss": 0.82064319, + "learning_rate": 3.7050526702282256e-08, + "loss": 0.84195864, + "num_input_tokens_seen": 337316600, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.69140625, + "step": 15642, + "time_per_iteration": 2.452345132827759 + }, + { + "auxiliary_loss_clip": 0.01095842, + "auxiliary_loss_mlp": 0.01028838, + "balance_loss_clip": 1.01798356, + "balance_loss_mlp": 1.0321306, + "epoch": 0.9405080414850443, + "flos": 24973573432320.0, + "grad_norm": 2.054671844986166, + "language_loss": 0.6789223, + "learning_rate": 3.697594633355084e-08, + "loss": 0.70016909, + "num_input_tokens_seen": 337336895, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 15643, + "time_per_iteration": 3.902398109436035 + }, + { + "auxiliary_loss_clip": 0.01104768, + "auxiliary_loss_mlp": 0.0103627, + "balance_loss_clip": 1.02403283, + "balance_loss_mlp": 1.03681779, + "epoch": 0.9405681647377123, + "flos": 20844743777280.0, + "grad_norm": 2.174179012657807, + "language_loss": 0.76626414, + "learning_rate": 3.6901440402888226e-08, + "loss": 0.78767455, + "num_input_tokens_seen": 337355105, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6796875, + "step": 15644, + "time_per_iteration": 2.440704107284546 + }, + { + "auxiliary_loss_clip": 0.0109653, + "auxiliary_loss_mlp": 0.01028228, + "balance_loss_clip": 1.01788664, + "balance_loss_mlp": 1.03375196, + "epoch": 0.9406282879903802, + "flos": 23805578885760.0, + "grad_norm": 1.502080892073022, + "language_loss": 0.67556715, + "learning_rate": 3.682700891311974e-08, + "loss": 0.69681478, + "num_input_tokens_seen": 337374905, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62890625, + "step": 15645, + "time_per_iteration": 2.4514553546905518 + }, + { + "auxiliary_loss_clip": 0.01094594, + "auxiliary_loss_mlp": 0.01030125, + "balance_loss_clip": 1.01893699, + "balance_loss_mlp": 1.03271198, + "epoch": 0.9406884112430483, + "flos": 27674953626240.0, + "grad_norm": 2.1236359589025944, + "language_loss": 0.702784, + "learning_rate": 3.6752651867067774e-08, + "loss": 0.72403121, + "num_input_tokens_seen": 337397130, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62109375, + "step": 15646, + "time_per_iteration": 2.5117604732513428 + }, + { + "auxiliary_loss_clip": 0.01095576, + "auxiliary_loss_mlp": 0.01028688, + "balance_loss_clip": 1.01755929, + "balance_loss_mlp": 1.03207064, + "epoch": 0.9407485344957163, + "flos": 23075048079360.0, + "grad_norm": 1.6591127603989193, + "language_loss": 0.74060643, + "learning_rate": 3.667836926755208e-08, + "loss": 0.76184905, + "num_input_tokens_seen": 337418660, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 15647, + "time_per_iteration": 2.4559590816497803 + }, + { + "auxiliary_loss_clip": 0.0102153, + "auxiliary_loss_mlp": 0.00997604, + "balance_loss_clip": 0.99657249, + "balance_loss_mlp": 1.00147247, + "epoch": 0.9408086577483842, + "flos": 71014034304000.0, + "grad_norm": 0.8841622693124102, + "language_loss": 0.63519818, + "learning_rate": 3.660416111738907e-08, + "loss": 0.65538949, + "num_input_tokens_seen": 337478055, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 15648, + "time_per_iteration": 3.1404430866241455 + }, + { + "auxiliary_loss_clip": 0.01096098, + "auxiliary_loss_mlp": 0.01029583, + "balance_loss_clip": 1.01900291, + "balance_loss_mlp": 1.03372252, + "epoch": 0.9408687810010522, + "flos": 23730956380800.0, + "grad_norm": 1.3226371584068994, + "language_loss": 0.66610408, + "learning_rate": 3.653002741939337e-08, + "loss": 0.68736088, + "num_input_tokens_seen": 337499405, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 15649, + "time_per_iteration": 2.475015878677368 + }, + { + "auxiliary_loss_clip": 0.01097478, + "auxiliary_loss_mlp": 0.01026624, + "balance_loss_clip": 1.01572871, + "balance_loss_mlp": 1.03270769, + "epoch": 0.9409289042537201, + "flos": 18369314087040.0, + "grad_norm": 1.8009089263007458, + "language_loss": 0.77365673, + "learning_rate": 3.645596817637586e-08, + "loss": 0.79489779, + "num_input_tokens_seen": 337517195, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 15650, + "time_per_iteration": 2.4524431228637695 + }, + { + "auxiliary_loss_clip": 0.01100587, + "auxiliary_loss_mlp": 0.01028045, + "balance_loss_clip": 1.01703572, + "balance_loss_mlp": 1.03667188, + "epoch": 0.9409890275063881, + "flos": 23878333883520.0, + "grad_norm": 1.6843054637423838, + "language_loss": 0.74430692, + "learning_rate": 3.638198339114451e-08, + "loss": 0.76559329, + "num_input_tokens_seen": 337535245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 15651, + "time_per_iteration": 2.463660478591919 + }, + { + "auxiliary_loss_clip": 0.01097843, + "auxiliary_loss_mlp": 0.01032001, + "balance_loss_clip": 1.02029419, + "balance_loss_mlp": 1.03302097, + "epoch": 0.941049150759056, + "flos": 16545088016640.0, + "grad_norm": 1.7829551002680968, + "language_loss": 0.7249018, + "learning_rate": 3.630807306650507e-08, + "loss": 0.7462002, + "num_input_tokens_seen": 337553040, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 15652, + "time_per_iteration": 2.4381537437438965 + }, + { + "auxiliary_loss_clip": 0.01104805, + "auxiliary_loss_mlp": 0.01032666, + "balance_loss_clip": 1.02049518, + "balance_loss_mlp": 1.0356704, + "epoch": 0.9411092740117241, + "flos": 25118401069440.0, + "grad_norm": 1.691105612906213, + "language_loss": 0.66318548, + "learning_rate": 3.6234237205260645e-08, + "loss": 0.68456018, + "num_input_tokens_seen": 337574580, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69140625, + "step": 15653, + "time_per_iteration": 2.4872758388519287 + }, + { + "auxiliary_loss_clip": 0.01100084, + "auxiliary_loss_mlp": 0.01034632, + "balance_loss_clip": 1.02277052, + "balance_loss_mlp": 1.03475976, + "epoch": 0.941169397264392, + "flos": 21142264129920.0, + "grad_norm": 1.8957291513192398, + "language_loss": 0.7746827, + "learning_rate": 3.6160475810210536e-08, + "loss": 0.79602987, + "num_input_tokens_seen": 337593010, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 15654, + "time_per_iteration": 2.446638822555542 + }, + { + "auxiliary_loss_clip": 0.01103532, + "auxiliary_loss_mlp": 0.01028024, + "balance_loss_clip": 1.01635361, + "balance_loss_mlp": 1.03482795, + "epoch": 0.94122952051706, + "flos": 38508914995200.0, + "grad_norm": 1.5569925179074333, + "language_loss": 0.70128828, + "learning_rate": 3.6086788884152065e-08, + "loss": 0.7226038, + "num_input_tokens_seen": 337616170, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 15655, + "time_per_iteration": 2.6205286979675293 + }, + { + "auxiliary_loss_clip": 0.01099387, + "auxiliary_loss_mlp": 0.01029822, + "balance_loss_clip": 1.01775813, + "balance_loss_mlp": 1.03365254, + "epoch": 0.9412896437697279, + "flos": 18369206346240.0, + "grad_norm": 2.4560073984117587, + "language_loss": 0.71858692, + "learning_rate": 3.601317642987944e-08, + "loss": 0.73987901, + "num_input_tokens_seen": 337635215, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 15656, + "time_per_iteration": 2.4531502723693848 + }, + { + "auxiliary_loss_clip": 0.01098082, + "auxiliary_loss_mlp": 0.01023486, + "balance_loss_clip": 1.01273918, + "balance_loss_mlp": 1.03367221, + "epoch": 0.9413497670223959, + "flos": 25884950238720.0, + "grad_norm": 1.7752477061266863, + "language_loss": 0.77574635, + "learning_rate": 3.593963845018377e-08, + "loss": 0.79696202, + "num_input_tokens_seen": 337654195, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 15657, + "time_per_iteration": 2.463580369949341 + }, + { + "auxiliary_loss_clip": 0.01097093, + "auxiliary_loss_mlp": 0.01028309, + "balance_loss_clip": 1.0169487, + "balance_loss_mlp": 1.03154922, + "epoch": 0.9414098902750638, + "flos": 16618309891200.0, + "grad_norm": 2.647654113468961, + "language_loss": 0.84199923, + "learning_rate": 3.586617494785371e-08, + "loss": 0.86325324, + "num_input_tokens_seen": 337671810, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 15658, + "time_per_iteration": 2.4232261180877686 + }, + { + "auxiliary_loss_clip": 0.01105471, + "auxiliary_loss_mlp": 0.01032031, + "balance_loss_clip": 1.01844144, + "balance_loss_mlp": 1.03557217, + "epoch": 0.9414700135277319, + "flos": 18625033987200.0, + "grad_norm": 1.771509700042808, + "language_loss": 0.70189822, + "learning_rate": 3.5792785925675254e-08, + "loss": 0.72327328, + "num_input_tokens_seen": 337689410, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 15659, + "time_per_iteration": 2.417872667312622 + }, + { + "auxiliary_loss_clip": 0.01097087, + "auxiliary_loss_mlp": 0.01037967, + "balance_loss_clip": 1.02753043, + "balance_loss_mlp": 1.03271377, + "epoch": 0.9415301367803999, + "flos": 26280146649600.0, + "grad_norm": 1.6741789301448684, + "language_loss": 0.79718721, + "learning_rate": 3.571947138643172e-08, + "loss": 0.81853777, + "num_input_tokens_seen": 337709950, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.64453125, + "step": 15660, + "time_per_iteration": 2.473811626434326 + }, + { + "auxiliary_loss_clip": 0.01095424, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01637769, + "balance_loss_mlp": 1.03255498, + "epoch": 0.9415902600330678, + "flos": 23261388860160.0, + "grad_norm": 1.4876002398882395, + "language_loss": 0.67924452, + "learning_rate": 3.564623133290201e-08, + "loss": 0.700472, + "num_input_tokens_seen": 337731320, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 15661, + "time_per_iteration": 2.494828224182129 + }, + { + "auxiliary_loss_clip": 0.01098031, + "auxiliary_loss_mlp": 0.01028231, + "balance_loss_clip": 1.01697206, + "balance_loss_mlp": 1.03291059, + "epoch": 0.9416503832857358, + "flos": 14719138093440.0, + "grad_norm": 2.0808441328825977, + "language_loss": 0.65976989, + "learning_rate": 3.557306576786434e-08, + "loss": 0.68103254, + "num_input_tokens_seen": 337747720, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 15662, + "time_per_iteration": 2.4719059467315674 + }, + { + "auxiliary_loss_clip": 0.0102173, + "auxiliary_loss_mlp": 0.0099693, + "balance_loss_clip": 0.9958874, + "balance_loss_mlp": 1.00163436, + "epoch": 0.9417105065384037, + "flos": 70312698276480.0, + "grad_norm": 0.7998608385286157, + "language_loss": 0.59257972, + "learning_rate": 3.5499974694092935e-08, + "loss": 0.61276639, + "num_input_tokens_seen": 337806930, + "router_z_loss_clip": 0.01043701, + "router_z_loss_mlp": 0.20117188, + "step": 15663, + "time_per_iteration": 3.091102361679077 + }, + { + "auxiliary_loss_clip": 0.01104755, + "auxiliary_loss_mlp": 0.01034523, + "balance_loss_clip": 1.02186322, + "balance_loss_mlp": 1.03546786, + "epoch": 0.9417706297910717, + "flos": 34057895322240.0, + "grad_norm": 1.7691136273672572, + "language_loss": 0.66977489, + "learning_rate": 3.542695811435914e-08, + "loss": 0.69116765, + "num_input_tokens_seen": 337828100, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.69140625, + "step": 15664, + "time_per_iteration": 2.551748514175415 + }, + { + "auxiliary_loss_clip": 0.01098686, + "auxiliary_loss_mlp": 0.01029058, + "balance_loss_clip": 1.01798368, + "balance_loss_mlp": 1.03435826, + "epoch": 0.9418307530437396, + "flos": 16471614746880.0, + "grad_norm": 1.9356485075218302, + "language_loss": 0.73331189, + "learning_rate": 3.535401603143207e-08, + "loss": 0.75458938, + "num_input_tokens_seen": 337844805, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15665, + "time_per_iteration": 2.406175136566162 + }, + { + "auxiliary_loss_clip": 0.01096646, + "auxiliary_loss_mlp": 0.0103092, + "balance_loss_clip": 1.01976252, + "balance_loss_mlp": 1.03395486, + "epoch": 0.9418908762964077, + "flos": 11253543114240.0, + "grad_norm": 1.8537640215449973, + "language_loss": 0.6403262, + "learning_rate": 3.528114844807773e-08, + "loss": 0.6616019, + "num_input_tokens_seen": 337860490, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.625, + "step": 15666, + "time_per_iteration": 2.4373819828033447 + }, + { + "auxiliary_loss_clip": 0.0109862, + "auxiliary_loss_mlp": 0.01029507, + "balance_loss_clip": 1.01766348, + "balance_loss_mlp": 1.03337002, + "epoch": 0.9419509995490756, + "flos": 18438836860800.0, + "grad_norm": 1.6324582019369962, + "language_loss": 0.78879476, + "learning_rate": 3.520835536705902e-08, + "loss": 0.81007606, + "num_input_tokens_seen": 337878360, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 15667, + "time_per_iteration": 2.475374937057495 + }, + { + "auxiliary_loss_clip": 0.01096246, + "auxiliary_loss_mlp": 0.01025503, + "balance_loss_clip": 1.01497126, + "balance_loss_mlp": 1.03265738, + "epoch": 0.9420111228017436, + "flos": 20737945664640.0, + "grad_norm": 1.624394565290511, + "language_loss": 0.75196528, + "learning_rate": 3.5135636791136404e-08, + "loss": 0.77318275, + "num_input_tokens_seen": 337895635, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 15668, + "time_per_iteration": 2.4471614360809326 + }, + { + "auxiliary_loss_clip": 0.01100563, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01816332, + "balance_loss_mlp": 1.03369188, + "epoch": 0.9420712460544115, + "flos": 21141940907520.0, + "grad_norm": 1.9068150139055333, + "language_loss": 0.58626127, + "learning_rate": 3.506299272306723e-08, + "loss": 0.6075663, + "num_input_tokens_seen": 337913940, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 15669, + "time_per_iteration": 2.4526097774505615 + }, + { + "auxiliary_loss_clip": 0.01094433, + "auxiliary_loss_mlp": 0.01024409, + "balance_loss_clip": 1.01398444, + "balance_loss_mlp": 1.03261268, + "epoch": 0.9421313693070795, + "flos": 15851760721920.0, + "grad_norm": 1.5947911043474419, + "language_loss": 0.76924133, + "learning_rate": 3.4990423165606406e-08, + "loss": 0.79042977, + "num_input_tokens_seen": 337932015, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6171875, + "step": 15670, + "time_per_iteration": 2.4160544872283936 + }, + { + "auxiliary_loss_clip": 0.01100773, + "auxiliary_loss_mlp": 0.01035593, + "balance_loss_clip": 1.02370214, + "balance_loss_mlp": 1.03572822, + "epoch": 0.9421914925597474, + "flos": 32415915882240.0, + "grad_norm": 1.7512755345783233, + "language_loss": 0.65079868, + "learning_rate": 3.491792812150574e-08, + "loss": 0.67216229, + "num_input_tokens_seen": 337953345, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 15671, + "time_per_iteration": 2.55161714553833 + }, + { + "auxiliary_loss_clip": 0.01098368, + "auxiliary_loss_mlp": 0.01030287, + "balance_loss_clip": 1.018682, + "balance_loss_mlp": 1.03393149, + "epoch": 0.9422516158124155, + "flos": 19718513769600.0, + "grad_norm": 1.6476234457343555, + "language_loss": 0.79277271, + "learning_rate": 3.48455075935139e-08, + "loss": 0.81405926, + "num_input_tokens_seen": 337973685, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 15672, + "time_per_iteration": 2.447295904159546 + }, + { + "auxiliary_loss_clip": 0.01102241, + "auxiliary_loss_mlp": 0.01033911, + "balance_loss_clip": 1.02151322, + "balance_loss_mlp": 1.03375137, + "epoch": 0.9423117390650835, + "flos": 16253277926400.0, + "grad_norm": 2.0043909265560376, + "language_loss": 0.73136175, + "learning_rate": 3.47731615843776e-08, + "loss": 0.75272328, + "num_input_tokens_seen": 337989175, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.68359375, + "step": 15673, + "time_per_iteration": 2.4218337535858154 + }, + { + "auxiliary_loss_clip": 0.01097219, + "auxiliary_loss_mlp": 0.01028159, + "balance_loss_clip": 1.01602328, + "balance_loss_mlp": 1.03284574, + "epoch": 0.9423718623177514, + "flos": 31796564647680.0, + "grad_norm": 1.8096983236306534, + "language_loss": 0.70210505, + "learning_rate": 3.470089009683974e-08, + "loss": 0.72335875, + "num_input_tokens_seen": 338011800, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 15674, + "time_per_iteration": 2.529244899749756 + }, + { + "auxiliary_loss_clip": 0.01098708, + "auxiliary_loss_mlp": 0.01024471, + "balance_loss_clip": 1.01358771, + "balance_loss_mlp": 1.03351128, + "epoch": 0.9424319855704194, + "flos": 23331809473920.0, + "grad_norm": 1.6393358696114226, + "language_loss": 0.81179047, + "learning_rate": 3.462869313364125e-08, + "loss": 0.8330223, + "num_input_tokens_seen": 338032120, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15675, + "time_per_iteration": 2.4658918380737305 + }, + { + "auxiliary_loss_clip": 0.01099127, + "auxiliary_loss_mlp": 0.01025447, + "balance_loss_clip": 1.01464629, + "balance_loss_mlp": 1.03426433, + "epoch": 0.9424921088230873, + "flos": 20777627214720.0, + "grad_norm": 1.5876960969874918, + "language_loss": 0.62726951, + "learning_rate": 3.4556570697519494e-08, + "loss": 0.64851522, + "num_input_tokens_seen": 338051880, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6484375, + "step": 15676, + "time_per_iteration": 2.4417946338653564 + }, + { + "auxiliary_loss_clip": 0.01098357, + "auxiliary_loss_mlp": 0.01036136, + "balance_loss_clip": 1.02487063, + "balance_loss_mlp": 1.03403687, + "epoch": 0.9425522320757553, + "flos": 19026658932480.0, + "grad_norm": 1.8124440468935443, + "language_loss": 0.67221808, + "learning_rate": 3.448452279120984e-08, + "loss": 0.69356304, + "num_input_tokens_seen": 338069665, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 15677, + "time_per_iteration": 2.468874454498291 + }, + { + "auxiliary_loss_clip": 0.01099749, + "auxiliary_loss_mlp": 0.01033718, + "balance_loss_clip": 1.02086103, + "balance_loss_mlp": 1.03233802, + "epoch": 0.9426123553284232, + "flos": 25155353185920.0, + "grad_norm": 1.9350758720269774, + "language_loss": 0.64217019, + "learning_rate": 3.441254941744387e-08, + "loss": 0.6635049, + "num_input_tokens_seen": 338090490, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 15678, + "time_per_iteration": 4.009870290756226 + }, + { + "auxiliary_loss_clip": 0.01098418, + "auxiliary_loss_mlp": 0.01026173, + "balance_loss_clip": 1.01448464, + "balance_loss_mlp": 1.03428507, + "epoch": 0.9426724785810913, + "flos": 21179359900800.0, + "grad_norm": 1.6704050189510526, + "language_loss": 0.74096805, + "learning_rate": 3.434065057895097e-08, + "loss": 0.76221395, + "num_input_tokens_seen": 338109825, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.640625, + "step": 15679, + "time_per_iteration": 2.480060338973999 + }, + { + "auxiliary_loss_clip": 0.01102722, + "auxiliary_loss_mlp": 0.01034946, + "balance_loss_clip": 1.02322173, + "balance_loss_mlp": 1.03508186, + "epoch": 0.9427326018337592, + "flos": 14756916222720.0, + "grad_norm": 2.2968062400181757, + "language_loss": 0.7742976, + "learning_rate": 3.426882627845762e-08, + "loss": 0.7956742, + "num_input_tokens_seen": 338125790, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6796875, + "step": 15680, + "time_per_iteration": 3.8283774852752686 + }, + { + "auxiliary_loss_clip": 0.01098292, + "auxiliary_loss_mlp": 0.01032384, + "balance_loss_clip": 1.02092791, + "balance_loss_mlp": 1.03348768, + "epoch": 0.9427927250864272, + "flos": 20923640000640.0, + "grad_norm": 2.224608877845115, + "language_loss": 0.75309384, + "learning_rate": 3.419707651868742e-08, + "loss": 0.77440059, + "num_input_tokens_seen": 338145610, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 15681, + "time_per_iteration": 3.931302547454834 + }, + { + "auxiliary_loss_clip": 0.01101593, + "auxiliary_loss_mlp": 0.01031351, + "balance_loss_clip": 1.0193882, + "balance_loss_mlp": 1.035344, + "epoch": 0.9428528483390951, + "flos": 19752520970880.0, + "grad_norm": 2.642338039071416, + "language_loss": 0.65794468, + "learning_rate": 3.412540130236086e-08, + "loss": 0.67927414, + "num_input_tokens_seen": 338165960, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 15682, + "time_per_iteration": 2.472961664199829 + }, + { + "auxiliary_loss_clip": 0.01096718, + "auxiliary_loss_mlp": 0.01026149, + "balance_loss_clip": 1.01485944, + "balance_loss_mlp": 1.03221107, + "epoch": 0.9429129715917631, + "flos": 24534996370560.0, + "grad_norm": 3.7877883909728833, + "language_loss": 0.76713276, + "learning_rate": 3.405380063219665e-08, + "loss": 0.78836143, + "num_input_tokens_seen": 338187215, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 15683, + "time_per_iteration": 2.4841740131378174 + }, + { + "auxiliary_loss_clip": 0.01103642, + "auxiliary_loss_mlp": 0.01036534, + "balance_loss_clip": 1.02421999, + "balance_loss_mlp": 1.03587162, + "epoch": 0.942973094844431, + "flos": 17959824063360.0, + "grad_norm": 2.6308434304413066, + "language_loss": 0.75243759, + "learning_rate": 3.398227451090885e-08, + "loss": 0.77383941, + "num_input_tokens_seen": 338201825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 15684, + "time_per_iteration": 2.407205104827881 + }, + { + "auxiliary_loss_clip": 0.01096124, + "auxiliary_loss_mlp": 0.01024816, + "balance_loss_clip": 1.01373529, + "balance_loss_mlp": 1.033481, + "epoch": 0.9430332180970991, + "flos": 26137689310080.0, + "grad_norm": 1.5634156526409637, + "language_loss": 0.77202857, + "learning_rate": 3.391082294121017e-08, + "loss": 0.79323792, + "num_input_tokens_seen": 338220865, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.625, + "step": 15685, + "time_per_iteration": 3.919050455093384 + }, + { + "auxiliary_loss_clip": 0.01095885, + "auxiliary_loss_mlp": 0.01028054, + "balance_loss_clip": 1.01716995, + "balance_loss_mlp": 1.03258085, + "epoch": 0.943093341349767, + "flos": 23951376190080.0, + "grad_norm": 1.8298649374723515, + "language_loss": 0.75466609, + "learning_rate": 3.383944592581023e-08, + "loss": 0.77590549, + "num_input_tokens_seen": 338240160, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 15686, + "time_per_iteration": 2.4832725524902344 + }, + { + "auxiliary_loss_clip": 0.01101014, + "auxiliary_loss_mlp": 0.01026995, + "balance_loss_clip": 1.01487172, + "balance_loss_mlp": 1.03364956, + "epoch": 0.943153464602435, + "flos": 17968407413760.0, + "grad_norm": 1.8707164561983298, + "language_loss": 0.80791461, + "learning_rate": 3.376814346741575e-08, + "loss": 0.82919466, + "num_input_tokens_seen": 338259305, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15687, + "time_per_iteration": 2.447073221206665 + }, + { + "auxiliary_loss_clip": 0.01103674, + "auxiliary_loss_mlp": 0.01032562, + "balance_loss_clip": 1.01928234, + "balance_loss_mlp": 1.03497446, + "epoch": 0.943213587855103, + "flos": 14501519544960.0, + "grad_norm": 4.150535052398094, + "language_loss": 0.75942636, + "learning_rate": 3.369691556873011e-08, + "loss": 0.78078878, + "num_input_tokens_seen": 338274950, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 15688, + "time_per_iteration": 2.449028730392456 + }, + { + "auxiliary_loss_clip": 0.01095339, + "auxiliary_loss_mlp": 0.01024738, + "balance_loss_clip": 1.01338911, + "balance_loss_mlp": 1.03330553, + "epoch": 0.9432737111077709, + "flos": 28986411093120.0, + "grad_norm": 1.647047447068589, + "language_loss": 0.68151128, + "learning_rate": 3.3625762232454504e-08, + "loss": 0.70271206, + "num_input_tokens_seen": 338295585, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.62109375, + "step": 15689, + "time_per_iteration": 2.535231828689575 + }, + { + "auxiliary_loss_clip": 0.01097551, + "auxiliary_loss_mlp": 0.0103356, + "balance_loss_clip": 1.02347493, + "balance_loss_mlp": 1.03406012, + "epoch": 0.9433338343604389, + "flos": 21609066303360.0, + "grad_norm": 1.9633082824839947, + "language_loss": 0.80533433, + "learning_rate": 3.35546834612872e-08, + "loss": 0.82664549, + "num_input_tokens_seen": 338314555, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.6328125, + "step": 15690, + "time_per_iteration": 2.475369691848755 + }, + { + "auxiliary_loss_clip": 0.01098715, + "auxiliary_loss_mlp": 0.01029222, + "balance_loss_clip": 1.01812971, + "balance_loss_mlp": 1.03422964, + "epoch": 0.9433939576131068, + "flos": 33182285483520.0, + "grad_norm": 1.82445533234153, + "language_loss": 0.60167646, + "learning_rate": 3.348367925792317e-08, + "loss": 0.6229558, + "num_input_tokens_seen": 338336260, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 15691, + "time_per_iteration": 2.6009466648101807 + }, + { + "auxiliary_loss_clip": 0.0110339, + "auxiliary_loss_mlp": 0.01027182, + "balance_loss_clip": 1.01595259, + "balance_loss_mlp": 1.03676319, + "epoch": 0.9434540808657749, + "flos": 20486391742080.0, + "grad_norm": 1.6101520183489826, + "language_loss": 0.66512716, + "learning_rate": 3.341274962505514e-08, + "loss": 0.68643284, + "num_input_tokens_seen": 338354680, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15692, + "time_per_iteration": 2.4502696990966797 + }, + { + "auxiliary_loss_clip": 0.01100244, + "auxiliary_loss_mlp": 0.01028364, + "balance_loss_clip": 1.0168364, + "balance_loss_mlp": 1.03405428, + "epoch": 0.9435142041184428, + "flos": 21542955321600.0, + "grad_norm": 2.6572224401023212, + "language_loss": 0.75021255, + "learning_rate": 3.334189456537251e-08, + "loss": 0.77149868, + "num_input_tokens_seen": 338372490, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 15693, + "time_per_iteration": 2.433387517929077 + }, + { + "auxiliary_loss_clip": 0.01100061, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01798141, + "balance_loss_mlp": 1.03463674, + "epoch": 0.9435743273711108, + "flos": 25009089004800.0, + "grad_norm": 1.6938865356157475, + "language_loss": 0.72807014, + "learning_rate": 3.327111408156291e-08, + "loss": 0.74936283, + "num_input_tokens_seen": 338390870, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 15694, + "time_per_iteration": 2.5123260021209717 + }, + { + "auxiliary_loss_clip": 0.01021837, + "auxiliary_loss_mlp": 0.01003079, + "balance_loss_clip": 1.00211906, + "balance_loss_mlp": 1.00187385, + "epoch": 0.9436344506237787, + "flos": 60158707320960.0, + "grad_norm": 0.7028121553364509, + "language_loss": 0.5058524, + "learning_rate": 3.3200408176309316e-08, + "loss": 0.52610159, + "num_input_tokens_seen": 338453075, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.19921875, + "step": 15695, + "time_per_iteration": 3.097665786743164 + }, + { + "auxiliary_loss_clip": 0.01094346, + "auxiliary_loss_mlp": 0.01028873, + "balance_loss_clip": 1.01795948, + "balance_loss_mlp": 1.03261745, + "epoch": 0.9436945738764467, + "flos": 22237252283520.0, + "grad_norm": 1.6978442865454357, + "language_loss": 0.64904177, + "learning_rate": 3.312977685229335e-08, + "loss": 0.67027402, + "num_input_tokens_seen": 338471770, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6171875, + "step": 15696, + "time_per_iteration": 2.499131679534912 + }, + { + "auxiliary_loss_clip": 0.01100812, + "auxiliary_loss_mlp": 0.01028395, + "balance_loss_clip": 1.01725507, + "balance_loss_mlp": 1.03525257, + "epoch": 0.9437546971291146, + "flos": 25045179194880.0, + "grad_norm": 1.6440087246701751, + "language_loss": 0.66226554, + "learning_rate": 3.305922011219353e-08, + "loss": 0.68355763, + "num_input_tokens_seen": 338492190, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 15697, + "time_per_iteration": 2.471853733062744 + }, + { + "auxiliary_loss_clip": 0.0102159, + "auxiliary_loss_mlp": 0.01000945, + "balance_loss_clip": 0.99992609, + "balance_loss_mlp": 1.00164413, + "epoch": 0.9438148203817827, + "flos": 56790788400000.0, + "grad_norm": 0.8437845938587906, + "language_loss": 0.63223118, + "learning_rate": 3.298873795868506e-08, + "loss": 0.65245652, + "num_input_tokens_seen": 338552560, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.19921875, + "step": 15698, + "time_per_iteration": 2.9581832885742188 + }, + { + "auxiliary_loss_clip": 0.01102672, + "auxiliary_loss_mlp": 0.01035134, + "balance_loss_clip": 1.02309942, + "balance_loss_mlp": 1.03445995, + "epoch": 0.9438749436344506, + "flos": 22346384780160.0, + "grad_norm": 1.6461006250652415, + "language_loss": 0.69387424, + "learning_rate": 3.291833039444092e-08, + "loss": 0.71525228, + "num_input_tokens_seen": 338571770, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 15699, + "time_per_iteration": 2.4698126316070557 + }, + { + "auxiliary_loss_clip": 0.01094807, + "auxiliary_loss_mlp": 0.01027864, + "balance_loss_clip": 1.01686084, + "balance_loss_mlp": 1.03219104, + "epoch": 0.9439350668871186, + "flos": 13370800337280.0, + "grad_norm": 3.490240775423036, + "language_loss": 0.74452382, + "learning_rate": 3.2847997422130734e-08, + "loss": 0.76575059, + "num_input_tokens_seen": 338587310, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 15700, + "time_per_iteration": 2.4451019763946533 + }, + { + "auxiliary_loss_clip": 0.0109842, + "auxiliary_loss_mlp": 0.01030003, + "balance_loss_clip": 1.01925063, + "balance_loss_mlp": 1.03485513, + "epoch": 0.9439951901397866, + "flos": 17785334770560.0, + "grad_norm": 1.502855371588381, + "language_loss": 0.69993806, + "learning_rate": 3.2777739044421495e-08, + "loss": 0.72122228, + "num_input_tokens_seen": 338606235, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.63671875, + "step": 15701, + "time_per_iteration": 2.447377920150757 + }, + { + "auxiliary_loss_clip": 0.01102808, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.01436985, + "balance_loss_mlp": 1.03356802, + "epoch": 0.9440553133924545, + "flos": 18879568738560.0, + "grad_norm": 2.0452971801099764, + "language_loss": 0.77940154, + "learning_rate": 3.2707555263977505e-08, + "loss": 0.80068743, + "num_input_tokens_seen": 338624090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.69140625, + "step": 15702, + "time_per_iteration": 2.4038772583007812 + }, + { + "auxiliary_loss_clip": 0.0110026, + "auxiliary_loss_mlp": 0.01030959, + "balance_loss_clip": 1.01999187, + "balance_loss_mlp": 1.03408504, + "epoch": 0.9441154366451225, + "flos": 19572967860480.0, + "grad_norm": 1.7859029402689504, + "language_loss": 0.66538978, + "learning_rate": 3.2637446083460194e-08, + "loss": 0.68670201, + "num_input_tokens_seen": 338643695, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.66015625, + "step": 15703, + "time_per_iteration": 2.489464044570923 + }, + { + "auxiliary_loss_clip": 0.01102883, + "auxiliary_loss_mlp": 0.01030561, + "balance_loss_clip": 1.01824689, + "balance_loss_mlp": 1.03595018, + "epoch": 0.9441755598977905, + "flos": 30294995472000.0, + "grad_norm": 3.744799778583578, + "language_loss": 0.72917163, + "learning_rate": 3.256741150552833e-08, + "loss": 0.7505061, + "num_input_tokens_seen": 338664725, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 15704, + "time_per_iteration": 2.5033814907073975 + }, + { + "auxiliary_loss_clip": 0.01098437, + "auxiliary_loss_mlp": 0.01030263, + "balance_loss_clip": 1.01902747, + "balance_loss_mlp": 1.03447068, + "epoch": 0.9442356831504585, + "flos": 20667884186880.0, + "grad_norm": 1.907105078977413, + "language_loss": 0.7433669, + "learning_rate": 3.2497451532837336e-08, + "loss": 0.76465392, + "num_input_tokens_seen": 338683990, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 15705, + "time_per_iteration": 2.4515786170959473 + }, + { + "auxiliary_loss_clip": 0.01100917, + "auxiliary_loss_mlp": 0.01032258, + "balance_loss_clip": 1.02148199, + "balance_loss_mlp": 1.03535068, + "epoch": 0.9442958064031264, + "flos": 16107265140480.0, + "grad_norm": 2.196711266527949, + "language_loss": 0.76928145, + "learning_rate": 3.2427566168039986e-08, + "loss": 0.79061323, + "num_input_tokens_seen": 338702025, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.65234375, + "step": 15706, + "time_per_iteration": 2.4352962970733643 + }, + { + "auxiliary_loss_clip": 0.01095703, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.01473355, + "balance_loss_mlp": 1.03289199, + "epoch": 0.9443559296557944, + "flos": 20447392550400.0, + "grad_norm": 1.43451723106199, + "language_loss": 0.693344, + "learning_rate": 3.23577554137866e-08, + "loss": 0.71455884, + "num_input_tokens_seen": 338720920, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62890625, + "step": 15707, + "time_per_iteration": 2.453019380569458 + }, + { + "auxiliary_loss_clip": 0.01091425, + "auxiliary_loss_mlp": 0.01024987, + "balance_loss_clip": 1.01502132, + "balance_loss_mlp": 1.02994144, + "epoch": 0.9444160529084623, + "flos": 21610897896960.0, + "grad_norm": 1.7631234566340965, + "language_loss": 0.69443661, + "learning_rate": 3.22880192727244e-08, + "loss": 0.71560073, + "num_input_tokens_seen": 338739590, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.6171875, + "step": 15708, + "time_per_iteration": 2.4164559841156006 + }, + { + "auxiliary_loss_clip": 0.01098199, + "auxiliary_loss_mlp": 0.01030728, + "balance_loss_clip": 1.01954079, + "balance_loss_mlp": 1.03435826, + "epoch": 0.9444761761611303, + "flos": 18441781776000.0, + "grad_norm": 2.6914619241923896, + "language_loss": 0.70139289, + "learning_rate": 3.221835774749748e-08, + "loss": 0.72268212, + "num_input_tokens_seen": 338757240, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15709, + "time_per_iteration": 2.4482839107513428 + }, + { + "auxiliary_loss_clip": 0.01096914, + "auxiliary_loss_mlp": 0.01032791, + "balance_loss_clip": 1.02171016, + "balance_loss_mlp": 1.03418076, + "epoch": 0.9445362994137982, + "flos": 20957144411520.0, + "grad_norm": 9.952214688927834, + "language_loss": 0.84433717, + "learning_rate": 3.214877084074774e-08, + "loss": 0.8656342, + "num_input_tokens_seen": 338773750, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62890625, + "step": 15710, + "time_per_iteration": 2.4583065509796143 + }, + { + "auxiliary_loss_clip": 0.01103261, + "auxiliary_loss_mlp": 0.01032053, + "balance_loss_clip": 1.01973879, + "balance_loss_mlp": 1.03534627, + "epoch": 0.9445964226664663, + "flos": 20303283185280.0, + "grad_norm": 1.6435224047891799, + "language_loss": 0.71200496, + "learning_rate": 3.2079258555113956e-08, + "loss": 0.73335809, + "num_input_tokens_seen": 338792115, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 15711, + "time_per_iteration": 2.4461560249328613 + }, + { + "auxiliary_loss_clip": 0.01101943, + "auxiliary_loss_mlp": 0.01025338, + "balance_loss_clip": 1.01372731, + "balance_loss_mlp": 1.03681183, + "epoch": 0.9446565459191342, + "flos": 26396030903040.0, + "grad_norm": 1.6856508103929682, + "language_loss": 0.69301665, + "learning_rate": 3.200982089323179e-08, + "loss": 0.71428949, + "num_input_tokens_seen": 338812480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 15712, + "time_per_iteration": 2.557600736618042 + }, + { + "auxiliary_loss_clip": 0.0110423, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02303076, + "balance_loss_mlp": 1.03638661, + "epoch": 0.9447166691718022, + "flos": 16544764794240.0, + "grad_norm": 2.9247099808601393, + "language_loss": 0.71096003, + "learning_rate": 3.1940457857734246e-08, + "loss": 0.73236346, + "num_input_tokens_seen": 338829105, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 15713, + "time_per_iteration": 2.4392521381378174 + }, + { + "auxiliary_loss_clip": 0.01096369, + "auxiliary_loss_mlp": 0.01032903, + "balance_loss_clip": 1.02093422, + "balance_loss_mlp": 1.03330159, + "epoch": 0.9447767924244702, + "flos": 29164635400320.0, + "grad_norm": 1.5251847757385297, + "language_loss": 0.76915956, + "learning_rate": 3.187116945125212e-08, + "loss": 0.79045224, + "num_input_tokens_seen": 338850670, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6328125, + "step": 15714, + "time_per_iteration": 2.5028111934661865 + }, + { + "auxiliary_loss_clip": 0.01099452, + "auxiliary_loss_mlp": 0.01032222, + "balance_loss_clip": 1.02040792, + "balance_loss_mlp": 1.03315997, + "epoch": 0.9448369156771381, + "flos": 19274908803840.0, + "grad_norm": 1.7713922514994944, + "language_loss": 0.67678571, + "learning_rate": 3.1801955676412194e-08, + "loss": 0.69810236, + "num_input_tokens_seen": 338867795, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15715, + "time_per_iteration": 2.4388058185577393 + }, + { + "auxiliary_loss_clip": 0.01100087, + "auxiliary_loss_mlp": 0.01029371, + "balance_loss_clip": 1.01737273, + "balance_loss_mlp": 1.03375924, + "epoch": 0.9448970389298061, + "flos": 23841166285440.0, + "grad_norm": 3.553477247442109, + "language_loss": 0.7459079, + "learning_rate": 3.173281653583948e-08, + "loss": 0.76720244, + "num_input_tokens_seen": 338887205, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 15716, + "time_per_iteration": 2.463731288909912 + }, + { + "auxiliary_loss_clip": 0.01103368, + "auxiliary_loss_mlp": 0.01030902, + "balance_loss_clip": 1.01924956, + "balance_loss_mlp": 1.03722072, + "epoch": 0.944957162182474, + "flos": 22382259488640.0, + "grad_norm": 1.6549087243556793, + "language_loss": 0.62538469, + "learning_rate": 3.166375203215565e-08, + "loss": 0.64672738, + "num_input_tokens_seen": 338906130, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15717, + "time_per_iteration": 2.4671406745910645 + }, + { + "auxiliary_loss_clip": 0.01099863, + "auxiliary_loss_mlp": 0.01031946, + "balance_loss_clip": 1.02062726, + "balance_loss_mlp": 1.03491199, + "epoch": 0.9450172854351421, + "flos": 17383889393280.0, + "grad_norm": 1.6376281628513882, + "language_loss": 0.79284263, + "learning_rate": 3.1594762167979514e-08, + "loss": 0.8141607, + "num_input_tokens_seen": 338923045, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15718, + "time_per_iteration": 2.462629556655884 + }, + { + "auxiliary_loss_clip": 0.01021525, + "auxiliary_loss_mlp": 0.01003439, + "balance_loss_clip": 1.002545, + "balance_loss_mlp": 1.0015378, + "epoch": 0.94507740868781, + "flos": 68466352406400.0, + "grad_norm": 0.6948733429962052, + "language_loss": 0.578394, + "learning_rate": 3.152584694592719e-08, + "loss": 0.5986436, + "num_input_tokens_seen": 338987545, + "router_z_loss_clip": 0.00891113, + "router_z_loss_mlp": 0.19921875, + "step": 15719, + "time_per_iteration": 3.0780253410339355 + }, + { + "auxiliary_loss_clip": 0.01100233, + "auxiliary_loss_mlp": 0.01027292, + "balance_loss_clip": 1.015764, + "balance_loss_mlp": 1.03423667, + "epoch": 0.945137531940478, + "flos": 21142479611520.0, + "grad_norm": 1.5397760146484176, + "language_loss": 0.75893283, + "learning_rate": 3.145700636861193e-08, + "loss": 0.78020811, + "num_input_tokens_seen": 339007830, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66015625, + "step": 15720, + "time_per_iteration": 3.864163875579834 + }, + { + "auxiliary_loss_clip": 0.01096195, + "auxiliary_loss_mlp": 0.010281, + "balance_loss_clip": 1.01787829, + "balance_loss_mlp": 1.03208733, + "epoch": 0.9451976551931459, + "flos": 24533918962560.0, + "grad_norm": 1.6263485050916464, + "language_loss": 0.72628319, + "learning_rate": 3.138824043864452e-08, + "loss": 0.74752611, + "num_input_tokens_seen": 339028980, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 15721, + "time_per_iteration": 2.5096383094787598 + }, + { + "auxiliary_loss_clip": 0.01100377, + "auxiliary_loss_mlp": 0.01033389, + "balance_loss_clip": 1.0211221, + "balance_loss_mlp": 1.0353353, + "epoch": 0.9452577784458139, + "flos": 23440582834560.0, + "grad_norm": 1.7722462509073895, + "language_loss": 0.85373968, + "learning_rate": 3.131954915863244e-08, + "loss": 0.87507731, + "num_input_tokens_seen": 339047950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6484375, + "step": 15722, + "time_per_iteration": 3.9022328853607178 + }, + { + "auxiliary_loss_clip": 0.01021641, + "auxiliary_loss_mlp": 0.01002369, + "balance_loss_clip": 1.00133801, + "balance_loss_mlp": 1.0015502, + "epoch": 0.9453179016984818, + "flos": 52017686449920.0, + "grad_norm": 0.8900631949326635, + "language_loss": 0.64461863, + "learning_rate": 3.125093253118005e-08, + "loss": 0.66485882, + "num_input_tokens_seen": 339104535, + "router_z_loss_clip": 0.01031494, + "router_z_loss_mlp": 0.20117188, + "step": 15723, + "time_per_iteration": 4.41249418258667 + }, + { + "auxiliary_loss_clip": 0.01101146, + "auxiliary_loss_mlp": 0.01030443, + "balance_loss_clip": 1.01846862, + "balance_loss_mlp": 1.03444242, + "epoch": 0.9453780249511499, + "flos": 13473001509120.0, + "grad_norm": 2.04231745236359, + "language_loss": 0.73194891, + "learning_rate": 3.1182390558889715e-08, + "loss": 0.75326478, + "num_input_tokens_seen": 339122050, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15724, + "time_per_iteration": 2.4040088653564453 + }, + { + "auxiliary_loss_clip": 0.0109916, + "auxiliary_loss_mlp": 0.01025226, + "balance_loss_clip": 1.01462817, + "balance_loss_mlp": 1.03418875, + "epoch": 0.9454381482038178, + "flos": 23258515772160.0, + "grad_norm": 2.5881922825982615, + "language_loss": 0.84684968, + "learning_rate": 3.111392324436024e-08, + "loss": 0.86809349, + "num_input_tokens_seen": 339138940, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6484375, + "step": 15725, + "time_per_iteration": 2.469430446624756 + }, + { + "auxiliary_loss_clip": 0.01098906, + "auxiliary_loss_mlp": 0.01027108, + "balance_loss_clip": 1.01581264, + "balance_loss_mlp": 1.03359258, + "epoch": 0.9454982714564858, + "flos": 19496621502720.0, + "grad_norm": 1.7062685853482866, + "language_loss": 0.71106911, + "learning_rate": 3.104553059018822e-08, + "loss": 0.73232925, + "num_input_tokens_seen": 339158245, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 15726, + "time_per_iteration": 2.4455809593200684 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.01028063, + "balance_loss_clip": 1.0157423, + "balance_loss_mlp": 1.03294992, + "epoch": 0.9455583947091538, + "flos": 23258120722560.0, + "grad_norm": 1.6702126364997434, + "language_loss": 0.60863376, + "learning_rate": 3.097721259896735e-08, + "loss": 0.62989283, + "num_input_tokens_seen": 339178200, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6484375, + "step": 15727, + "time_per_iteration": 3.9425292015075684 + }, + { + "auxiliary_loss_clip": 0.01095273, + "auxiliary_loss_mlp": 0.01033964, + "balance_loss_clip": 1.023265, + "balance_loss_mlp": 1.03242397, + "epoch": 0.9456185179618217, + "flos": 17673041877120.0, + "grad_norm": 1.8004377076099485, + "language_loss": 0.81886947, + "learning_rate": 3.0908969273287566e-08, + "loss": 0.8401618, + "num_input_tokens_seen": 339193950, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 15728, + "time_per_iteration": 2.4058585166931152 + }, + { + "auxiliary_loss_clip": 0.01021632, + "auxiliary_loss_mlp": 0.00997147, + "balance_loss_clip": 0.99612808, + "balance_loss_mlp": 1.00159681, + "epoch": 0.9456786412144897, + "flos": 61415040389760.0, + "grad_norm": 0.736051651837185, + "language_loss": 0.59150136, + "learning_rate": 3.08408006157368e-08, + "loss": 0.61168915, + "num_input_tokens_seen": 339252330, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20117188, + "step": 15729, + "time_per_iteration": 2.9688003063201904 + }, + { + "auxiliary_loss_clip": 0.0109789, + "auxiliary_loss_mlp": 0.01028206, + "balance_loss_clip": 1.01618958, + "balance_loss_mlp": 1.03341413, + "epoch": 0.9457387644671577, + "flos": 18588369179520.0, + "grad_norm": 1.8776807928087538, + "language_loss": 0.762703, + "learning_rate": 3.077270662890052e-08, + "loss": 0.78396392, + "num_input_tokens_seen": 339270325, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 15730, + "time_per_iteration": 2.4220995903015137 + }, + { + "auxiliary_loss_clip": 0.0109928, + "auxiliary_loss_mlp": 0.01031121, + "balance_loss_clip": 1.01887226, + "balance_loss_mlp": 1.03324008, + "epoch": 0.9457988877198257, + "flos": 21108544237440.0, + "grad_norm": 1.4182548688654766, + "language_loss": 0.62411594, + "learning_rate": 3.070468731536047e-08, + "loss": 0.64541996, + "num_input_tokens_seen": 339291980, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 15731, + "time_per_iteration": 2.5083041191101074 + }, + { + "auxiliary_loss_clip": 0.01100342, + "auxiliary_loss_mlp": 0.01025659, + "balance_loss_clip": 1.01327908, + "balance_loss_mlp": 1.03371453, + "epoch": 0.9458590109724936, + "flos": 26688379697280.0, + "grad_norm": 1.9378943529039063, + "language_loss": 0.63918054, + "learning_rate": 3.063674267769589e-08, + "loss": 0.66044056, + "num_input_tokens_seen": 339311795, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 15732, + "time_per_iteration": 2.4719395637512207 + }, + { + "auxiliary_loss_clip": 0.01103926, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.01500165, + "balance_loss_mlp": 1.03460908, + "epoch": 0.9459191342251616, + "flos": 18661591054080.0, + "grad_norm": 1.7756445337768159, + "language_loss": 0.83968151, + "learning_rate": 3.056887271848363e-08, + "loss": 0.86099535, + "num_input_tokens_seen": 339327745, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6953125, + "step": 15733, + "time_per_iteration": 2.4443578720092773 + }, + { + "auxiliary_loss_clip": 0.01095213, + "auxiliary_loss_mlp": 0.01026767, + "balance_loss_clip": 1.01640725, + "balance_loss_mlp": 1.03294325, + "epoch": 0.9459792574778295, + "flos": 23398459159680.0, + "grad_norm": 1.4695884497585416, + "language_loss": 0.72089154, + "learning_rate": 3.0501077440297173e-08, + "loss": 0.74211133, + "num_input_tokens_seen": 339346445, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.625, + "step": 15734, + "time_per_iteration": 2.4488422870635986 + }, + { + "auxiliary_loss_clip": 0.01092681, + "auxiliary_loss_mlp": 0.01027919, + "balance_loss_clip": 1.0181973, + "balance_loss_mlp": 1.0311662, + "epoch": 0.9460393807304975, + "flos": 24392969994240.0, + "grad_norm": 1.423173253742331, + "language_loss": 0.86974919, + "learning_rate": 3.043335684570692e-08, + "loss": 0.89095521, + "num_input_tokens_seen": 339367945, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.6171875, + "step": 15735, + "time_per_iteration": 2.5103213787078857 + }, + { + "auxiliary_loss_clip": 0.0109908, + "auxiliary_loss_mlp": 0.01026303, + "balance_loss_clip": 1.01499617, + "balance_loss_mlp": 1.03345919, + "epoch": 0.9460995039831654, + "flos": 21939408708480.0, + "grad_norm": 2.0160825623367975, + "language_loss": 0.67346275, + "learning_rate": 3.036571093728102e-08, + "loss": 0.69471663, + "num_input_tokens_seen": 339386060, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15736, + "time_per_iteration": 2.4414546489715576 + }, + { + "auxiliary_loss_clip": 0.01021593, + "auxiliary_loss_mlp": 0.01002149, + "balance_loss_clip": 1.00120163, + "balance_loss_mlp": 1.0015769, + "epoch": 0.9461596272358335, + "flos": 70322466775680.0, + "grad_norm": 0.9159483735058672, + "language_loss": 0.65298235, + "learning_rate": 3.029813971758499e-08, + "loss": 0.6732198, + "num_input_tokens_seen": 339446695, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20019531, + "step": 15737, + "time_per_iteration": 3.1042568683624268 + }, + { + "auxiliary_loss_clip": 0.01021626, + "auxiliary_loss_mlp": 0.00999988, + "balance_loss_clip": 0.99901086, + "balance_loss_mlp": 1.00169897, + "epoch": 0.9462197504885014, + "flos": 58591242645120.0, + "grad_norm": 0.8017129104896167, + "language_loss": 0.58838046, + "learning_rate": 3.0230643189181225e-08, + "loss": 0.60859656, + "num_input_tokens_seen": 339510080, + "router_z_loss_clip": 0.00976562, + "router_z_loss_mlp": 0.19921875, + "step": 15738, + "time_per_iteration": 3.052255153656006 + }, + { + "auxiliary_loss_clip": 0.01095699, + "auxiliary_loss_mlp": 0.01028308, + "balance_loss_clip": 1.01794255, + "balance_loss_mlp": 1.0324806, + "epoch": 0.9462798737411694, + "flos": 23433759250560.0, + "grad_norm": 1.740585721975819, + "language_loss": 0.71850687, + "learning_rate": 3.016322135462834e-08, + "loss": 0.73974693, + "num_input_tokens_seen": 339529335, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6328125, + "step": 15739, + "time_per_iteration": 2.453784704208374 + }, + { + "auxiliary_loss_clip": 0.01099551, + "auxiliary_loss_mlp": 0.01031808, + "balance_loss_clip": 1.0200305, + "balance_loss_mlp": 1.0342207, + "epoch": 0.9463399969938374, + "flos": 25046077034880.0, + "grad_norm": 2.103403899839581, + "language_loss": 0.64150524, + "learning_rate": 3.009587421648363e-08, + "loss": 0.66281885, + "num_input_tokens_seen": 339548820, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 15740, + "time_per_iteration": 2.516693115234375 + }, + { + "auxiliary_loss_clip": 0.01096961, + "auxiliary_loss_mlp": 0.01027669, + "balance_loss_clip": 1.01670778, + "balance_loss_mlp": 1.03294837, + "epoch": 0.9464001202465053, + "flos": 24352606085760.0, + "grad_norm": 1.9948304801785786, + "language_loss": 0.66507947, + "learning_rate": 3.0028601777301045e-08, + "loss": 0.68632573, + "num_input_tokens_seen": 339566775, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 15741, + "time_per_iteration": 2.483225107192993 + }, + { + "auxiliary_loss_clip": 0.01099414, + "auxiliary_loss_mlp": 0.01026135, + "balance_loss_clip": 1.01472712, + "balance_loss_mlp": 1.03407657, + "epoch": 0.9464602434991733, + "flos": 17165444832000.0, + "grad_norm": 2.182631737231146, + "language_loss": 0.75745535, + "learning_rate": 2.9961404039630987e-08, + "loss": 0.7787109, + "num_input_tokens_seen": 339581905, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15742, + "time_per_iteration": 2.426438093185425 + }, + { + "auxiliary_loss_clip": 0.01097162, + "auxiliary_loss_mlp": 0.01027953, + "balance_loss_clip": 1.01714623, + "balance_loss_mlp": 1.0337882, + "epoch": 0.9465203667518413, + "flos": 19938107566080.0, + "grad_norm": 2.136193371359759, + "language_loss": 0.72182894, + "learning_rate": 2.989428100602187e-08, + "loss": 0.74308008, + "num_input_tokens_seen": 339599870, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15743, + "time_per_iteration": 2.426293134689331 + }, + { + "auxiliary_loss_clip": 0.0110159, + "auxiliary_loss_mlp": 0.01032513, + "balance_loss_clip": 1.0209254, + "balance_loss_mlp": 1.03408003, + "epoch": 0.9465804900045093, + "flos": 20120318282880.0, + "grad_norm": 4.529960691980935, + "language_loss": 0.79481554, + "learning_rate": 2.982723267901943e-08, + "loss": 0.81615651, + "num_input_tokens_seen": 339620250, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 15744, + "time_per_iteration": 2.4723949432373047 + }, + { + "auxiliary_loss_clip": 0.01101299, + "auxiliary_loss_mlp": 0.01033352, + "balance_loss_clip": 1.02148438, + "balance_loss_mlp": 1.0341022, + "epoch": 0.9466406132571772, + "flos": 23911622812800.0, + "grad_norm": 3.3674745996062225, + "language_loss": 0.77996051, + "learning_rate": 2.9760259061165417e-08, + "loss": 0.80130696, + "num_input_tokens_seen": 339639900, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15745, + "time_per_iteration": 2.47007155418396 + }, + { + "auxiliary_loss_clip": 0.01100036, + "auxiliary_loss_mlp": 0.01029697, + "balance_loss_clip": 1.01782942, + "balance_loss_mlp": 1.03299022, + "epoch": 0.9467007365098452, + "flos": 19933223316480.0, + "grad_norm": 1.808023282855586, + "language_loss": 0.69985926, + "learning_rate": 2.9693360155000014e-08, + "loss": 0.7211566, + "num_input_tokens_seen": 339658970, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 15746, + "time_per_iteration": 2.4556610584259033 + }, + { + "auxiliary_loss_clip": 0.01098496, + "auxiliary_loss_mlp": 0.01027124, + "balance_loss_clip": 1.01516747, + "balance_loss_mlp": 1.03419673, + "epoch": 0.9467608597625131, + "flos": 19310496203520.0, + "grad_norm": 4.04673875503708, + "language_loss": 0.56715882, + "learning_rate": 2.962653596305964e-08, + "loss": 0.58841503, + "num_input_tokens_seen": 339675600, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.640625, + "step": 15747, + "time_per_iteration": 2.4188010692596436 + }, + { + "auxiliary_loss_clip": 0.01021638, + "auxiliary_loss_mlp": 0.00999103, + "balance_loss_clip": 0.99809551, + "balance_loss_mlp": 1.0015198, + "epoch": 0.9468209830151811, + "flos": 69630252802560.0, + "grad_norm": 0.6607046285663145, + "language_loss": 0.53250241, + "learning_rate": 2.955978648787871e-08, + "loss": 0.55270982, + "num_input_tokens_seen": 339744505, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 15748, + "time_per_iteration": 3.210047960281372 + }, + { + "auxiliary_loss_clip": 0.01100624, + "auxiliary_loss_mlp": 0.01033339, + "balance_loss_clip": 1.02208531, + "balance_loss_mlp": 1.03541768, + "epoch": 0.946881106267849, + "flos": 27016639113600.0, + "grad_norm": 1.696117214299738, + "language_loss": 0.66129446, + "learning_rate": 2.9493111731988096e-08, + "loss": 0.68263412, + "num_input_tokens_seen": 339765810, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15749, + "time_per_iteration": 2.4717953205108643 + }, + { + "auxiliary_loss_clip": 0.01099175, + "auxiliary_loss_mlp": 0.01028814, + "balance_loss_clip": 1.01611233, + "balance_loss_mlp": 1.03256774, + "epoch": 0.9469412295205171, + "flos": 20190092451840.0, + "grad_norm": 1.870451209534139, + "language_loss": 0.75719225, + "learning_rate": 2.942651169791621e-08, + "loss": 0.77847207, + "num_input_tokens_seen": 339784125, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 15750, + "time_per_iteration": 2.4470083713531494 + }, + { + "auxiliary_loss_clip": 0.01099991, + "auxiliary_loss_mlp": 0.01027426, + "balance_loss_clip": 1.01614845, + "balance_loss_mlp": 1.03496587, + "epoch": 0.947001352773185, + "flos": 21324905809920.0, + "grad_norm": 3.2567403535496373, + "language_loss": 0.67666459, + "learning_rate": 2.9359986388188372e-08, + "loss": 0.6979388, + "num_input_tokens_seen": 339803450, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65234375, + "step": 15751, + "time_per_iteration": 2.4272170066833496 + }, + { + "auxiliary_loss_clip": 0.01100684, + "auxiliary_loss_mlp": 0.01026272, + "balance_loss_clip": 1.01553106, + "balance_loss_mlp": 1.03459108, + "epoch": 0.947061476025853, + "flos": 21944041562880.0, + "grad_norm": 2.233176837277438, + "language_loss": 0.65536374, + "learning_rate": 2.929353580532723e-08, + "loss": 0.6766333, + "num_input_tokens_seen": 339823215, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 15752, + "time_per_iteration": 2.4499189853668213 + }, + { + "auxiliary_loss_clip": 0.01098995, + "auxiliary_loss_mlp": 0.01027996, + "balance_loss_clip": 1.0164566, + "balance_loss_mlp": 1.03381038, + "epoch": 0.947121599278521, + "flos": 21394715892480.0, + "grad_norm": 1.595970237118896, + "language_loss": 0.71663833, + "learning_rate": 2.9227159951852764e-08, + "loss": 0.73790824, + "num_input_tokens_seen": 339842230, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 15753, + "time_per_iteration": 2.4530341625213623 + }, + { + "auxiliary_loss_clip": 0.01100937, + "auxiliary_loss_mlp": 0.01029372, + "balance_loss_clip": 1.01540065, + "balance_loss_mlp": 1.03327668, + "epoch": 0.9471817225311889, + "flos": 23075730437760.0, + "grad_norm": 1.7043436636476592, + "language_loss": 0.70336282, + "learning_rate": 2.9160858830281855e-08, + "loss": 0.72466588, + "num_input_tokens_seen": 339861640, + "router_z_loss_clip": 0.13964844, + "router_z_loss_mlp": 0.67578125, + "step": 15754, + "time_per_iteration": 2.470735549926758 + }, + { + "auxiliary_loss_clip": 0.01101539, + "auxiliary_loss_mlp": 0.01028837, + "balance_loss_clip": 1.01744604, + "balance_loss_mlp": 1.03313601, + "epoch": 0.947241845783857, + "flos": 11910744305280.0, + "grad_norm": 2.2470655637804695, + "language_loss": 0.78706431, + "learning_rate": 2.9094632443129153e-08, + "loss": 0.80836809, + "num_input_tokens_seen": 339878210, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 15755, + "time_per_iteration": 2.3971338272094727 + }, + { + "auxiliary_loss_clip": 0.01104859, + "auxiliary_loss_mlp": 0.0103399, + "balance_loss_clip": 1.01995945, + "balance_loss_mlp": 1.03432608, + "epoch": 0.9473019690365249, + "flos": 20740675098240.0, + "grad_norm": 2.5046918538507037, + "language_loss": 0.75961721, + "learning_rate": 2.9028480792904876e-08, + "loss": 0.78100568, + "num_input_tokens_seen": 339894255, + "router_z_loss_clip": 0.140625, + "router_z_loss_mlp": 0.703125, + "step": 15756, + "time_per_iteration": 2.426345109939575 + }, + { + "auxiliary_loss_clip": 0.01099898, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01795268, + "balance_loss_mlp": 1.03368378, + "epoch": 0.9473620922891929, + "flos": 17639896602240.0, + "grad_norm": 1.9870799305981186, + "language_loss": 0.74695963, + "learning_rate": 2.8962403882118347e-08, + "loss": 0.76824564, + "num_input_tokens_seen": 339912425, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6640625, + "step": 15757, + "time_per_iteration": 2.4045164585113525 + }, + { + "auxiliary_loss_clip": 0.0110339, + "auxiliary_loss_mlp": 0.01031445, + "balance_loss_clip": 1.01900578, + "balance_loss_mlp": 1.03469872, + "epoch": 0.9474222155418608, + "flos": 23550002640000.0, + "grad_norm": 2.378229571033702, + "language_loss": 0.79555655, + "learning_rate": 2.889640171327512e-08, + "loss": 0.8169049, + "num_input_tokens_seen": 339929635, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6875, + "step": 15758, + "time_per_iteration": 2.4659128189086914 + }, + { + "auxiliary_loss_clip": 0.01098428, + "auxiliary_loss_mlp": 0.01029661, + "balance_loss_clip": 1.01864612, + "balance_loss_mlp": 1.03468299, + "epoch": 0.9474823387945288, + "flos": 27089753247360.0, + "grad_norm": 1.5067261773590948, + "language_loss": 0.72213107, + "learning_rate": 2.8830474288877638e-08, + "loss": 0.74341202, + "num_input_tokens_seen": 339951200, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 15759, + "time_per_iteration": 2.5268497467041016 + }, + { + "auxiliary_loss_clip": 0.01095275, + "auxiliary_loss_mlp": 0.01028196, + "balance_loss_clip": 1.01843882, + "balance_loss_mlp": 1.03411698, + "epoch": 0.9475424620471967, + "flos": 22966526113920.0, + "grad_norm": 1.4321356635021014, + "language_loss": 0.75588179, + "learning_rate": 2.8764621611426344e-08, + "loss": 0.77711654, + "num_input_tokens_seen": 339971820, + "router_z_loss_clip": 0.09765625, + "router_z_loss_mlp": 0.61328125, + "step": 15760, + "time_per_iteration": 2.4870219230651855 + }, + { + "auxiliary_loss_clip": 0.01099685, + "auxiliary_loss_mlp": 0.0102963, + "balance_loss_clip": 1.01837659, + "balance_loss_mlp": 1.03509808, + "epoch": 0.9476025852998647, + "flos": 20047671025920.0, + "grad_norm": 1.7659943637470257, + "language_loss": 0.72967952, + "learning_rate": 2.8698843683418128e-08, + "loss": 0.75097269, + "num_input_tokens_seen": 339989420, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15761, + "time_per_iteration": 2.4621644020080566 + }, + { + "auxiliary_loss_clip": 0.01101443, + "auxiliary_loss_mlp": 0.01033264, + "balance_loss_clip": 1.02264786, + "balance_loss_mlp": 1.03763127, + "epoch": 0.9476627085525327, + "flos": 14975468524800.0, + "grad_norm": 2.0880931998012926, + "language_loss": 0.71599525, + "learning_rate": 2.863314050734722e-08, + "loss": 0.73734236, + "num_input_tokens_seen": 340006690, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.640625, + "step": 15762, + "time_per_iteration": 3.807072877883911 + }, + { + "auxiliary_loss_clip": 0.01102527, + "auxiliary_loss_mlp": 0.01035463, + "balance_loss_clip": 1.02279127, + "balance_loss_mlp": 1.03368092, + "epoch": 0.9477228318052007, + "flos": 18697788984960.0, + "grad_norm": 1.8761919798911448, + "language_loss": 0.66871512, + "learning_rate": 2.856751208570518e-08, + "loss": 0.69009507, + "num_input_tokens_seen": 340025480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 15763, + "time_per_iteration": 2.4327456951141357 + }, + { + "auxiliary_loss_clip": 0.01098893, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.02008581, + "balance_loss_mlp": 1.03249335, + "epoch": 0.9477829550578686, + "flos": 23875065745920.0, + "grad_norm": 1.6535306150383073, + "language_loss": 0.69588113, + "learning_rate": 2.8501958420980466e-08, + "loss": 0.71718317, + "num_input_tokens_seen": 340043785, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15764, + "time_per_iteration": 5.365139722824097 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.0102714, + "balance_loss_clip": 1.01689982, + "balance_loss_mlp": 1.03639555, + "epoch": 0.9478430783105366, + "flos": 22562890007040.0, + "grad_norm": 1.6364617025382917, + "language_loss": 0.70810807, + "learning_rate": 2.8436479515659306e-08, + "loss": 0.72935009, + "num_input_tokens_seen": 340064360, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.609375, + "step": 15765, + "time_per_iteration": 2.445587158203125 + }, + { + "auxiliary_loss_clip": 0.01021739, + "auxiliary_loss_mlp": 0.00999534, + "balance_loss_clip": 0.99852639, + "balance_loss_mlp": 1.00167453, + "epoch": 0.9479032015632046, + "flos": 60857885554560.0, + "grad_norm": 0.8043033372730916, + "language_loss": 0.59102297, + "learning_rate": 2.8371075372224384e-08, + "loss": 0.61123562, + "num_input_tokens_seen": 340114425, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.20117188, + "step": 15766, + "time_per_iteration": 2.8118863105773926 + }, + { + "auxiliary_loss_clip": 0.01099537, + "auxiliary_loss_mlp": 0.01034405, + "balance_loss_clip": 1.02343154, + "balance_loss_mlp": 1.03409505, + "epoch": 0.9479633248158725, + "flos": 14683873916160.0, + "grad_norm": 1.8210488332704236, + "language_loss": 0.74425805, + "learning_rate": 2.8305745993155938e-08, + "loss": 0.76559752, + "num_input_tokens_seen": 340132200, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 15767, + "time_per_iteration": 2.433103561401367 + }, + { + "auxiliary_loss_clip": 0.01103755, + "auxiliary_loss_mlp": 0.01033165, + "balance_loss_clip": 1.02080297, + "balance_loss_mlp": 1.03559554, + "epoch": 0.9480234480685406, + "flos": 20333878594560.0, + "grad_norm": 2.1086495442960587, + "language_loss": 0.73338264, + "learning_rate": 2.8240491380931096e-08, + "loss": 0.7547518, + "num_input_tokens_seen": 340149175, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6796875, + "step": 15768, + "time_per_iteration": 2.4372289180755615 + }, + { + "auxiliary_loss_clip": 0.0102187, + "auxiliary_loss_mlp": 0.01000121, + "balance_loss_clip": 0.9991194, + "balance_loss_mlp": 1.00185442, + "epoch": 0.9480835713212085, + "flos": 70293092428800.0, + "grad_norm": 0.7343542147395368, + "language_loss": 0.55284411, + "learning_rate": 2.8175311538024326e-08, + "loss": 0.57306397, + "num_input_tokens_seen": 340208155, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 15769, + "time_per_iteration": 4.592373609542847 + }, + { + "auxiliary_loss_clip": 0.01096657, + "auxiliary_loss_mlp": 0.01027843, + "balance_loss_clip": 1.01644039, + "balance_loss_mlp": 1.03143764, + "epoch": 0.9481436945738765, + "flos": 25449749055360.0, + "grad_norm": 1.342521680668915, + "language_loss": 0.77534431, + "learning_rate": 2.8110206466907428e-08, + "loss": 0.79658937, + "num_input_tokens_seen": 340229275, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 15770, + "time_per_iteration": 2.509974956512451 + }, + { + "auxiliary_loss_clip": 0.01103995, + "auxiliary_loss_mlp": 0.01034529, + "balance_loss_clip": 1.02161837, + "balance_loss_mlp": 1.0377177, + "epoch": 0.9482038178265444, + "flos": 26979902478720.0, + "grad_norm": 1.7923861457089987, + "language_loss": 0.79980707, + "learning_rate": 2.8045176170049313e-08, + "loss": 0.82119232, + "num_input_tokens_seen": 340248920, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6640625, + "step": 15771, + "time_per_iteration": 2.4819459915161133 + }, + { + "auxiliary_loss_clip": 0.01097776, + "auxiliary_loss_mlp": 0.01028361, + "balance_loss_clip": 1.017066, + "balance_loss_mlp": 1.03398848, + "epoch": 0.9482639410792124, + "flos": 17785442511360.0, + "grad_norm": 2.453520523449039, + "language_loss": 0.69694543, + "learning_rate": 2.7980220649915566e-08, + "loss": 0.71820688, + "num_input_tokens_seen": 340266775, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 15772, + "time_per_iteration": 2.425267219543457 + }, + { + "auxiliary_loss_clip": 0.01099953, + "auxiliary_loss_mlp": 0.01030819, + "balance_loss_clip": 1.01907134, + "balance_loss_mlp": 1.03535521, + "epoch": 0.9483240643318803, + "flos": 20996682307200.0, + "grad_norm": 1.5802215490409397, + "language_loss": 0.73707336, + "learning_rate": 2.7915339908969327e-08, + "loss": 0.75838113, + "num_input_tokens_seen": 340285295, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 15773, + "time_per_iteration": 2.444343328475952 + }, + { + "auxiliary_loss_clip": 0.0110056, + "auxiliary_loss_mlp": 0.01033367, + "balance_loss_clip": 1.02117205, + "balance_loss_mlp": 1.03330648, + "epoch": 0.9483841875845483, + "flos": 20083294339200.0, + "grad_norm": 2.131794605985836, + "language_loss": 0.62298661, + "learning_rate": 2.7850533949671072e-08, + "loss": 0.64432591, + "num_input_tokens_seen": 340304265, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 15774, + "time_per_iteration": 2.4462358951568604 + }, + { + "auxiliary_loss_clip": 0.01099681, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02020681, + "balance_loss_mlp": 1.03321493, + "epoch": 0.9484443108372163, + "flos": 20813645577600.0, + "grad_norm": 2.09594864312592, + "language_loss": 0.58812392, + "learning_rate": 2.7785802774478396e-08, + "loss": 0.60944426, + "num_input_tokens_seen": 340323690, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 15775, + "time_per_iteration": 2.4414901733398438 + }, + { + "auxiliary_loss_clip": 0.01102102, + "auxiliary_loss_mlp": 0.01029054, + "balance_loss_clip": 1.01669836, + "balance_loss_mlp": 1.03493381, + "epoch": 0.9485044340898843, + "flos": 36429184506240.0, + "grad_norm": 1.8455531404536583, + "language_loss": 0.61595821, + "learning_rate": 2.772114638584555e-08, + "loss": 0.63726979, + "num_input_tokens_seen": 340345830, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.671875, + "step": 15776, + "time_per_iteration": 2.5827388763427734 + }, + { + "auxiliary_loss_clip": 0.01098673, + "auxiliary_loss_mlp": 0.01030668, + "balance_loss_clip": 1.01866937, + "balance_loss_mlp": 1.03275156, + "epoch": 0.9485645573425522, + "flos": 22602535643520.0, + "grad_norm": 1.7272218804811466, + "language_loss": 0.73529625, + "learning_rate": 2.765656478622458e-08, + "loss": 0.75658965, + "num_input_tokens_seen": 340365910, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 15777, + "time_per_iteration": 2.46150279045105 + }, + { + "auxiliary_loss_clip": 0.01111124, + "auxiliary_loss_mlp": 0.01034603, + "balance_loss_clip": 1.02140033, + "balance_loss_mlp": 1.03862464, + "epoch": 0.9486246805952202, + "flos": 22017766227840.0, + "grad_norm": 2.8233320899962435, + "language_loss": 0.72577089, + "learning_rate": 2.759205797806441e-08, + "loss": 0.74722815, + "num_input_tokens_seen": 340383935, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.7265625, + "step": 15778, + "time_per_iteration": 2.467472553253174 + }, + { + "auxiliary_loss_clip": 0.01094771, + "auxiliary_loss_mlp": 0.01026604, + "balance_loss_clip": 1.01670969, + "balance_loss_mlp": 1.0343349, + "epoch": 0.9486848038478882, + "flos": 16508674604160.0, + "grad_norm": 1.8271409648319303, + "language_loss": 0.69787717, + "learning_rate": 2.7527625963810865e-08, + "loss": 0.71909094, + "num_input_tokens_seen": 340402760, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.6015625, + "step": 15779, + "time_per_iteration": 2.4266300201416016 + }, + { + "auxiliary_loss_clip": 0.01100503, + "auxiliary_loss_mlp": 0.01030815, + "balance_loss_clip": 1.01880431, + "balance_loss_mlp": 1.03467202, + "epoch": 0.9487449271005561, + "flos": 19244385221760.0, + "grad_norm": 2.8607336238305794, + "language_loss": 0.78267539, + "learning_rate": 2.7463268745907542e-08, + "loss": 0.80398858, + "num_input_tokens_seen": 340422105, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 15780, + "time_per_iteration": 2.446392774581909 + }, + { + "auxiliary_loss_clip": 0.01101438, + "auxiliary_loss_mlp": 0.01029635, + "balance_loss_clip": 1.01809514, + "balance_loss_mlp": 1.03621566, + "epoch": 0.9488050503532242, + "flos": 21762692772480.0, + "grad_norm": 1.7816825422070612, + "language_loss": 0.66119897, + "learning_rate": 2.7398986326794494e-08, + "loss": 0.68250966, + "num_input_tokens_seen": 340441160, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 15781, + "time_per_iteration": 2.4368371963500977 + }, + { + "auxiliary_loss_clip": 0.01097989, + "auxiliary_loss_mlp": 0.01031123, + "balance_loss_clip": 1.01941085, + "balance_loss_mlp": 1.03366685, + "epoch": 0.9488651736058921, + "flos": 18368919037440.0, + "grad_norm": 2.0866286325402306, + "language_loss": 0.7938571, + "learning_rate": 2.733477870890999e-08, + "loss": 0.81514817, + "num_input_tokens_seen": 340458200, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 15782, + "time_per_iteration": 2.4351727962493896 + }, + { + "auxiliary_loss_clip": 0.01021458, + "auxiliary_loss_mlp": 0.00998812, + "balance_loss_clip": 0.99779856, + "balance_loss_mlp": 1.00149429, + "epoch": 0.9489252968585601, + "flos": 70084057230720.0, + "grad_norm": 0.725186072749968, + "language_loss": 0.59841406, + "learning_rate": 2.7270645894688082e-08, + "loss": 0.61861676, + "num_input_tokens_seen": 340526420, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 15783, + "time_per_iteration": 3.145355224609375 + }, + { + "auxiliary_loss_clip": 0.01099804, + "auxiliary_loss_mlp": 0.01032698, + "balance_loss_clip": 1.0206039, + "balance_loss_mlp": 1.03343678, + "epoch": 0.948985420111228, + "flos": 27855440490240.0, + "grad_norm": 1.6893787149575912, + "language_loss": 0.74055898, + "learning_rate": 2.720658788656105e-08, + "loss": 0.76188403, + "num_input_tokens_seen": 340546325, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15784, + "time_per_iteration": 2.4882519245147705 + }, + { + "auxiliary_loss_clip": 0.01098838, + "auxiliary_loss_mlp": 0.01027468, + "balance_loss_clip": 1.01474261, + "balance_loss_mlp": 1.03261077, + "epoch": 0.949045543363896, + "flos": 24316049018880.0, + "grad_norm": 2.40873132613553, + "language_loss": 0.69824833, + "learning_rate": 2.714260468695806e-08, + "loss": 0.71951145, + "num_input_tokens_seen": 340565145, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66015625, + "step": 15785, + "time_per_iteration": 2.4379212856292725 + }, + { + "auxiliary_loss_clip": 0.01100555, + "auxiliary_loss_mlp": 0.01027966, + "balance_loss_clip": 1.01652157, + "balance_loss_mlp": 1.03367662, + "epoch": 0.9491056666165639, + "flos": 24241677909120.0, + "grad_norm": 1.5884098203702628, + "language_loss": 0.75856775, + "learning_rate": 2.707869629830495e-08, + "loss": 0.77985299, + "num_input_tokens_seen": 340585465, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 15786, + "time_per_iteration": 2.4647655487060547 + }, + { + "auxiliary_loss_clip": 0.01099885, + "auxiliary_loss_mlp": 0.01028911, + "balance_loss_clip": 1.01817656, + "balance_loss_mlp": 1.03462839, + "epoch": 0.949165789869232, + "flos": 24531261356160.0, + "grad_norm": 1.6402496308438652, + "language_loss": 0.78891599, + "learning_rate": 2.7014862723025335e-08, + "loss": 0.81020397, + "num_input_tokens_seen": 340606010, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 15787, + "time_per_iteration": 2.463150978088379 + }, + { + "auxiliary_loss_clip": 0.01098978, + "auxiliary_loss_mlp": 0.01027508, + "balance_loss_clip": 1.01666558, + "balance_loss_mlp": 1.03643632, + "epoch": 0.9492259131218999, + "flos": 22235348862720.0, + "grad_norm": 1.6263586462249067, + "language_loss": 0.76067448, + "learning_rate": 2.6951103963540388e-08, + "loss": 0.78193933, + "num_input_tokens_seen": 340626135, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 15788, + "time_per_iteration": 2.4509265422821045 + }, + { + "auxiliary_loss_clip": 0.01100348, + "auxiliary_loss_mlp": 0.01031567, + "balance_loss_clip": 1.01915097, + "balance_loss_mlp": 1.03344178, + "epoch": 0.9492860363745679, + "flos": 22966310632320.0, + "grad_norm": 2.0839053015801476, + "language_loss": 0.71524441, + "learning_rate": 2.6887420022266848e-08, + "loss": 0.73656362, + "num_input_tokens_seen": 340644870, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 15789, + "time_per_iteration": 2.432985544204712 + }, + { + "auxiliary_loss_clip": 0.0109756, + "auxiliary_loss_mlp": 0.01028031, + "balance_loss_clip": 1.01589549, + "balance_loss_mlp": 1.03416276, + "epoch": 0.9493461596272358, + "flos": 18370283754240.0, + "grad_norm": 2.063727201959523, + "language_loss": 0.73046041, + "learning_rate": 2.682381090161989e-08, + "loss": 0.75171626, + "num_input_tokens_seen": 340663695, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6328125, + "step": 15790, + "time_per_iteration": 2.4657516479492188 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.0103117, + "balance_loss_clip": 1.01891565, + "balance_loss_mlp": 1.03377855, + "epoch": 0.9494062828799038, + "flos": 20011724490240.0, + "grad_norm": 1.7938510674280357, + "language_loss": 0.77490807, + "learning_rate": 2.6760276604012033e-08, + "loss": 0.79623151, + "num_input_tokens_seen": 340682970, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 15791, + "time_per_iteration": 2.436913013458252 + }, + { + "auxiliary_loss_clip": 0.01102913, + "auxiliary_loss_mlp": 0.01028292, + "balance_loss_clip": 1.0160315, + "balance_loss_mlp": 1.03452277, + "epoch": 0.9494664061325718, + "flos": 27228583313280.0, + "grad_norm": 1.8010482249748228, + "language_loss": 0.73511958, + "learning_rate": 2.6696817131852234e-08, + "loss": 0.75643158, + "num_input_tokens_seen": 340702275, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 15792, + "time_per_iteration": 2.5013904571533203 + }, + { + "auxiliary_loss_clip": 0.0109955, + "auxiliary_loss_mlp": 0.0103204, + "balance_loss_clip": 1.02072704, + "balance_loss_mlp": 1.03471923, + "epoch": 0.9495265293852397, + "flos": 18369816877440.0, + "grad_norm": 1.858360072617374, + "language_loss": 0.78069293, + "learning_rate": 2.663343248754679e-08, + "loss": 0.80200887, + "num_input_tokens_seen": 340719060, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15793, + "time_per_iteration": 2.4309065341949463 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01028746, + "balance_loss_clip": 1.01784468, + "balance_loss_mlp": 1.03409159, + "epoch": 0.9495866526379078, + "flos": 23075766351360.0, + "grad_norm": 1.6667153863215733, + "language_loss": 0.77353388, + "learning_rate": 2.6570122673499562e-08, + "loss": 0.79481339, + "num_input_tokens_seen": 340737815, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15794, + "time_per_iteration": 2.4753899574279785 + }, + { + "auxiliary_loss_clip": 0.01102667, + "auxiliary_loss_mlp": 0.01031883, + "balance_loss_clip": 1.01900196, + "balance_loss_mlp": 1.03453398, + "epoch": 0.9496467758905757, + "flos": 17529902179200.0, + "grad_norm": 1.9706914699233502, + "language_loss": 0.60769325, + "learning_rate": 2.650688769211107e-08, + "loss": 0.62903881, + "num_input_tokens_seen": 340756035, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6796875, + "step": 15795, + "time_per_iteration": 2.4098758697509766 + }, + { + "auxiliary_loss_clip": 0.01097281, + "auxiliary_loss_mlp": 0.01032496, + "balance_loss_clip": 1.02076006, + "balance_loss_mlp": 1.03450537, + "epoch": 0.9497068991432437, + "flos": 24133910129280.0, + "grad_norm": 1.6244448011780146, + "language_loss": 0.79229355, + "learning_rate": 2.644372754577895e-08, + "loss": 0.81359136, + "num_input_tokens_seen": 340775620, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 15796, + "time_per_iteration": 2.4744715690612793 + }, + { + "auxiliary_loss_clip": 0.01099617, + "auxiliary_loss_mlp": 0.01026633, + "balance_loss_clip": 1.01453948, + "balance_loss_mlp": 1.03425527, + "epoch": 0.9497670223959116, + "flos": 20303319098880.0, + "grad_norm": 2.1269036660223186, + "language_loss": 0.75475836, + "learning_rate": 2.6380642236898398e-08, + "loss": 0.77602082, + "num_input_tokens_seen": 340794510, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 15797, + "time_per_iteration": 2.4281978607177734 + }, + { + "auxiliary_loss_clip": 0.01100771, + "auxiliary_loss_mlp": 0.01030925, + "balance_loss_clip": 1.0194509, + "balance_loss_mlp": 1.03495431, + "epoch": 0.9498271456485796, + "flos": 13698916099200.0, + "grad_norm": 5.214544570088492, + "language_loss": 0.6590659, + "learning_rate": 2.6317631767861727e-08, + "loss": 0.68038285, + "num_input_tokens_seen": 340812955, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.66015625, + "step": 15798, + "time_per_iteration": 2.4303300380706787 + }, + { + "auxiliary_loss_clip": 0.01102492, + "auxiliary_loss_mlp": 0.01032243, + "balance_loss_clip": 1.02094197, + "balance_loss_mlp": 1.0353173, + "epoch": 0.9498872689012475, + "flos": 20814004713600.0, + "grad_norm": 1.7260335815330186, + "language_loss": 0.7747848, + "learning_rate": 2.6254696141058575e-08, + "loss": 0.79613221, + "num_input_tokens_seen": 340829200, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 15799, + "time_per_iteration": 2.4504878520965576 + }, + { + "auxiliary_loss_clip": 0.01098618, + "auxiliary_loss_mlp": 0.01031288, + "balance_loss_clip": 1.01981449, + "balance_loss_mlp": 1.03534567, + "epoch": 0.9499473921539155, + "flos": 21032700670080.0, + "grad_norm": 1.758779888402255, + "language_loss": 0.70793021, + "learning_rate": 2.6191835358874814e-08, + "loss": 0.72922921, + "num_input_tokens_seen": 340848035, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15800, + "time_per_iteration": 2.4800631999969482 + }, + { + "auxiliary_loss_clip": 0.01095391, + "auxiliary_loss_mlp": 0.01028754, + "balance_loss_clip": 1.01717317, + "balance_loss_mlp": 1.03154349, + "epoch": 0.9500075154065835, + "flos": 20998693468800.0, + "grad_norm": 1.741889328340764, + "language_loss": 0.71796048, + "learning_rate": 2.6129049423694315e-08, + "loss": 0.73920196, + "num_input_tokens_seen": 340870025, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.63671875, + "step": 15801, + "time_per_iteration": 2.4760396480560303 + }, + { + "auxiliary_loss_clip": 0.01100868, + "auxiliary_loss_mlp": 0.01032172, + "balance_loss_clip": 1.02095401, + "balance_loss_mlp": 1.03574038, + "epoch": 0.9500676386592515, + "flos": 25121956515840.0, + "grad_norm": 1.525596008392139, + "language_loss": 0.8088901, + "learning_rate": 2.6066338337898508e-08, + "loss": 0.83022046, + "num_input_tokens_seen": 340892290, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15802, + "time_per_iteration": 2.5023224353790283 + }, + { + "auxiliary_loss_clip": 0.01102144, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01719987, + "balance_loss_mlp": 1.03577518, + "epoch": 0.9501277619119194, + "flos": 27523625627520.0, + "grad_norm": 1.5961562305678088, + "language_loss": 0.67818773, + "learning_rate": 2.60037021038646e-08, + "loss": 0.69949591, + "num_input_tokens_seen": 340912260, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 15803, + "time_per_iteration": 3.909879684448242 + }, + { + "auxiliary_loss_clip": 0.01098126, + "auxiliary_loss_mlp": 0.01031454, + "balance_loss_clip": 1.0200932, + "balance_loss_mlp": 1.03395629, + "epoch": 0.9501878851645874, + "flos": 20813968800000.0, + "grad_norm": 1.7025824496065405, + "language_loss": 0.76297027, + "learning_rate": 2.5941140723968247e-08, + "loss": 0.784266, + "num_input_tokens_seen": 340928930, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 15804, + "time_per_iteration": 2.43643856048584 + }, + { + "auxiliary_loss_clip": 0.01102711, + "auxiliary_loss_mlp": 0.01031644, + "balance_loss_clip": 1.020015, + "balance_loss_mlp": 1.03606462, + "epoch": 0.9502480084172553, + "flos": 18369385914240.0, + "grad_norm": 1.6211668044626601, + "language_loss": 0.73356307, + "learning_rate": 2.5878654200581775e-08, + "loss": 0.75490659, + "num_input_tokens_seen": 340946615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15805, + "time_per_iteration": 3.8646578788757324 + }, + { + "auxiliary_loss_clip": 0.01101239, + "auxiliary_loss_mlp": 0.01033939, + "balance_loss_clip": 1.02232194, + "balance_loss_mlp": 1.03600073, + "epoch": 0.9503081316699233, + "flos": 23549607590400.0, + "grad_norm": 2.0152125986211598, + "language_loss": 0.80254024, + "learning_rate": 2.5816242536074618e-08, + "loss": 0.82389206, + "num_input_tokens_seen": 340967545, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 15806, + "time_per_iteration": 4.032423257827759 + }, + { + "auxiliary_loss_clip": 0.01102997, + "auxiliary_loss_mlp": 0.01026855, + "balance_loss_clip": 1.01560807, + "balance_loss_mlp": 1.03544569, + "epoch": 0.9503682549225914, + "flos": 18040444139520.0, + "grad_norm": 2.193286707527827, + "language_loss": 0.82814157, + "learning_rate": 2.5753905732813108e-08, + "loss": 0.8494401, + "num_input_tokens_seen": 340984955, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.67578125, + "step": 15807, + "time_per_iteration": 2.414118528366089 + }, + { + "auxiliary_loss_clip": 0.01097126, + "auxiliary_loss_mlp": 0.01029199, + "balance_loss_clip": 1.01739097, + "balance_loss_mlp": 1.03243184, + "epoch": 0.9504283781752593, + "flos": 25886135387520.0, + "grad_norm": 1.7349067366850919, + "language_loss": 0.71784639, + "learning_rate": 2.5691643793161355e-08, + "loss": 0.73910964, + "num_input_tokens_seen": 341007300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 15808, + "time_per_iteration": 2.5013911724090576 + }, + { + "auxiliary_loss_clip": 0.01098372, + "auxiliary_loss_mlp": 0.01026538, + "balance_loss_clip": 1.01529086, + "balance_loss_mlp": 1.03383148, + "epoch": 0.9504885014279273, + "flos": 22124025636480.0, + "grad_norm": 1.4451667081622699, + "language_loss": 0.69974124, + "learning_rate": 2.562945671948058e-08, + "loss": 0.72099042, + "num_input_tokens_seen": 341026695, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15809, + "time_per_iteration": 2.460293769836426 + }, + { + "auxiliary_loss_clip": 0.01097419, + "auxiliary_loss_mlp": 0.01025961, + "balance_loss_clip": 1.01474309, + "balance_loss_mlp": 1.03248215, + "epoch": 0.9505486246805952, + "flos": 21615961714560.0, + "grad_norm": 1.651959109851631, + "language_loss": 0.75416887, + "learning_rate": 2.5567344514128452e-08, + "loss": 0.77540267, + "num_input_tokens_seen": 341047080, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 15810, + "time_per_iteration": 2.5347001552581787 + }, + { + "auxiliary_loss_clip": 0.01097724, + "auxiliary_loss_mlp": 0.01037805, + "balance_loss_clip": 1.02580118, + "balance_loss_mlp": 1.03223252, + "epoch": 0.9506087479332632, + "flos": 22528236360960.0, + "grad_norm": 1.397807601359488, + "language_loss": 0.79862857, + "learning_rate": 2.5505307179460643e-08, + "loss": 0.81998384, + "num_input_tokens_seen": 341067310, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.65625, + "step": 15811, + "time_per_iteration": 3.979959487915039 + }, + { + "auxiliary_loss_clip": 0.01099426, + "auxiliary_loss_mlp": 0.01030663, + "balance_loss_clip": 1.01915371, + "balance_loss_mlp": 1.03354287, + "epoch": 0.9506688711859311, + "flos": 27527360641920.0, + "grad_norm": 1.9697509553597836, + "language_loss": 0.70062947, + "learning_rate": 2.5443344717829495e-08, + "loss": 0.72193033, + "num_input_tokens_seen": 341085110, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 15812, + "time_per_iteration": 2.521512508392334 + }, + { + "auxiliary_loss_clip": 0.0110006, + "auxiliary_loss_mlp": 0.01028477, + "balance_loss_clip": 1.01730156, + "balance_loss_mlp": 1.03445292, + "epoch": 0.9507289944385992, + "flos": 19865783531520.0, + "grad_norm": 1.4891615410905001, + "language_loss": 0.65331221, + "learning_rate": 2.538145713158446e-08, + "loss": 0.67459756, + "num_input_tokens_seen": 341103190, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 15813, + "time_per_iteration": 2.4236130714416504 + }, + { + "auxiliary_loss_clip": 0.01101884, + "auxiliary_loss_mlp": 0.01035163, + "balance_loss_clip": 1.02317691, + "balance_loss_mlp": 1.03430367, + "epoch": 0.9507891176912671, + "flos": 25193274969600.0, + "grad_norm": 1.3402778954569576, + "language_loss": 0.7040152, + "learning_rate": 2.5319644423072327e-08, + "loss": 0.72538567, + "num_input_tokens_seen": 341125695, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.67578125, + "step": 15814, + "time_per_iteration": 2.5455849170684814 + }, + { + "auxiliary_loss_clip": 0.01096469, + "auxiliary_loss_mlp": 0.01027375, + "balance_loss_clip": 1.01623535, + "balance_loss_mlp": 1.03357434, + "epoch": 0.9508492409439351, + "flos": 24899561458560.0, + "grad_norm": 2.0743353797115094, + "language_loss": 0.62986439, + "learning_rate": 2.5257906594637445e-08, + "loss": 0.65110284, + "num_input_tokens_seen": 341143930, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 15815, + "time_per_iteration": 2.460432529449463 + }, + { + "auxiliary_loss_clip": 0.01097239, + "auxiliary_loss_mlp": 0.01026801, + "balance_loss_clip": 1.01574445, + "balance_loss_mlp": 1.03236914, + "epoch": 0.950909364196603, + "flos": 29784094375680.0, + "grad_norm": 1.9387627349978607, + "language_loss": 0.5886873, + "learning_rate": 2.519624364862061e-08, + "loss": 0.60992765, + "num_input_tokens_seen": 341164280, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 15816, + "time_per_iteration": 2.5585241317749023 + }, + { + "auxiliary_loss_clip": 0.01098859, + "auxiliary_loss_mlp": 0.01039133, + "balance_loss_clip": 1.02797484, + "balance_loss_mlp": 1.03374326, + "epoch": 0.950969487449271, + "flos": 24717781704960.0, + "grad_norm": 1.3857841520956902, + "language_loss": 0.73455548, + "learning_rate": 2.513465558735994e-08, + "loss": 0.75593543, + "num_input_tokens_seen": 341183670, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65234375, + "step": 15817, + "time_per_iteration": 2.4631595611572266 + }, + { + "auxiliary_loss_clip": 0.01101933, + "auxiliary_loss_mlp": 0.01034108, + "balance_loss_clip": 1.02061391, + "balance_loss_mlp": 1.03445303, + "epoch": 0.9510296107019389, + "flos": 13699167494400.0, + "grad_norm": 1.6087638355681797, + "language_loss": 0.59922737, + "learning_rate": 2.5073142413190918e-08, + "loss": 0.62058777, + "num_input_tokens_seen": 341201900, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.67578125, + "step": 15818, + "time_per_iteration": 2.4381842613220215 + }, + { + "auxiliary_loss_clip": 0.01100649, + "auxiliary_loss_mlp": 0.01029491, + "balance_loss_clip": 1.01787972, + "balance_loss_mlp": 1.03539026, + "epoch": 0.9510897339546069, + "flos": 17311852667520.0, + "grad_norm": 1.7432779059279067, + "language_loss": 0.69244868, + "learning_rate": 2.5011704128446552e-08, + "loss": 0.71375006, + "num_input_tokens_seen": 341218340, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 15819, + "time_per_iteration": 2.462388038635254 + }, + { + "auxiliary_loss_clip": 0.01102916, + "auxiliary_loss_mlp": 0.01028571, + "balance_loss_clip": 1.01690078, + "balance_loss_mlp": 1.03555536, + "epoch": 0.951149857207275, + "flos": 14793940166400.0, + "grad_norm": 1.8294910897534251, + "language_loss": 0.74143231, + "learning_rate": 2.49503407354561e-08, + "loss": 0.76274723, + "num_input_tokens_seen": 341235885, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 15820, + "time_per_iteration": 2.4434814453125 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.01032913, + "balance_loss_clip": 1.02099824, + "balance_loss_mlp": 1.0352037, + "epoch": 0.9512099804599429, + "flos": 19391152193280.0, + "grad_norm": 1.7026634144017363, + "language_loss": 0.78670204, + "learning_rate": 2.4889052236546804e-08, + "loss": 0.80805308, + "num_input_tokens_seen": 341255280, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15821, + "time_per_iteration": 2.4224560260772705 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.01455593, + "balance_loss_mlp": 1.03292096, + "epoch": 0.9512701037126109, + "flos": 36757874885760.0, + "grad_norm": 1.5339289826116789, + "language_loss": 0.71220911, + "learning_rate": 2.4827838634042586e-08, + "loss": 0.73344707, + "num_input_tokens_seen": 341279055, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 15822, + "time_per_iteration": 2.5735907554626465 + }, + { + "auxiliary_loss_clip": 0.01100231, + "auxiliary_loss_mlp": 0.01031823, + "balance_loss_clip": 1.02049828, + "balance_loss_mlp": 1.03538275, + "epoch": 0.9513302269652788, + "flos": 22638266697600.0, + "grad_norm": 1.8073194694188124, + "language_loss": 0.66159809, + "learning_rate": 2.47666999302647e-08, + "loss": 0.68291861, + "num_input_tokens_seen": 341298560, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 15823, + "time_per_iteration": 2.465412139892578 + }, + { + "auxiliary_loss_clip": 0.01097407, + "auxiliary_loss_mlp": 0.01030023, + "balance_loss_clip": 1.01931834, + "balance_loss_mlp": 1.03426194, + "epoch": 0.9513903502179468, + "flos": 22893232412160.0, + "grad_norm": 1.5566121914996327, + "language_loss": 0.76921892, + "learning_rate": 2.4705636127531292e-08, + "loss": 0.79049319, + "num_input_tokens_seen": 341316650, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15824, + "time_per_iteration": 2.4632158279418945 + }, + { + "auxiliary_loss_clip": 0.01103042, + "auxiliary_loss_mlp": 0.01029115, + "balance_loss_clip": 1.01674688, + "balance_loss_mlp": 1.03397322, + "epoch": 0.9514504734706147, + "flos": 27928626451200.0, + "grad_norm": 1.8863003514793029, + "language_loss": 0.73595691, + "learning_rate": 2.4644647228158065e-08, + "loss": 0.75727856, + "num_input_tokens_seen": 341336185, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69140625, + "step": 15825, + "time_per_iteration": 2.474846363067627 + }, + { + "auxiliary_loss_clip": 0.01021578, + "auxiliary_loss_mlp": 0.0100021, + "balance_loss_clip": 0.99922067, + "balance_loss_mlp": 1.00160623, + "epoch": 0.9515105967232828, + "flos": 67366767312000.0, + "grad_norm": 0.8541641407387539, + "language_loss": 0.53453624, + "learning_rate": 2.458373323445806e-08, + "loss": 0.55475414, + "num_input_tokens_seen": 341395795, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20019531, + "step": 15826, + "time_per_iteration": 2.9626259803771973 + }, + { + "auxiliary_loss_clip": 0.01100498, + "auxiliary_loss_mlp": 0.01035518, + "balance_loss_clip": 1.02391326, + "balance_loss_mlp": 1.03486824, + "epoch": 0.9515707199759507, + "flos": 25846525664640.0, + "grad_norm": 2.681001653375095, + "language_loss": 0.72440886, + "learning_rate": 2.452289414874076e-08, + "loss": 0.74576902, + "num_input_tokens_seen": 341415675, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 15827, + "time_per_iteration": 2.4679887294769287 + }, + { + "auxiliary_loss_clip": 0.01099346, + "auxiliary_loss_mlp": 0.01027496, + "balance_loss_clip": 1.01561654, + "balance_loss_mlp": 1.03423381, + "epoch": 0.9516308432286187, + "flos": 21828983322240.0, + "grad_norm": 1.9123918048218376, + "language_loss": 0.74679339, + "learning_rate": 2.4462129973313207e-08, + "loss": 0.76806182, + "num_input_tokens_seen": 341432990, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 15828, + "time_per_iteration": 2.444719076156616 + }, + { + "auxiliary_loss_clip": 0.01098432, + "auxiliary_loss_mlp": 0.01031714, + "balance_loss_clip": 1.02086651, + "balance_loss_mlp": 1.03533959, + "epoch": 0.9516909664812866, + "flos": 27269593666560.0, + "grad_norm": 1.4979200700178503, + "language_loss": 0.7287569, + "learning_rate": 2.440144071047978e-08, + "loss": 0.75005829, + "num_input_tokens_seen": 341454100, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 15829, + "time_per_iteration": 2.4894516468048096 + }, + { + "auxiliary_loss_clip": 0.01099572, + "auxiliary_loss_mlp": 0.01026804, + "balance_loss_clip": 1.01606297, + "balance_loss_mlp": 1.034266, + "epoch": 0.9517510897339546, + "flos": 21215342350080.0, + "grad_norm": 1.765032292908151, + "language_loss": 0.6078254, + "learning_rate": 2.4340826362541533e-08, + "loss": 0.62908912, + "num_input_tokens_seen": 341472955, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65234375, + "step": 15830, + "time_per_iteration": 2.4441115856170654 + }, + { + "auxiliary_loss_clip": 0.01102008, + "auxiliary_loss_mlp": 0.01030583, + "balance_loss_clip": 1.0181613, + "balance_loss_mlp": 1.03501642, + "epoch": 0.9518112129866225, + "flos": 18733986915840.0, + "grad_norm": 2.121238764010395, + "language_loss": 0.73090142, + "learning_rate": 2.428028693179729e-08, + "loss": 0.75222731, + "num_input_tokens_seen": 341490165, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.671875, + "step": 15831, + "time_per_iteration": 2.4257888793945312 + }, + { + "auxiliary_loss_clip": 0.01095715, + "auxiliary_loss_mlp": 0.01023011, + "balance_loss_clip": 1.01229966, + "balance_loss_mlp": 1.03229094, + "epoch": 0.9518713362392905, + "flos": 16763676232320.0, + "grad_norm": 1.8239612191411185, + "language_loss": 0.65346098, + "learning_rate": 2.4219822420542545e-08, + "loss": 0.67464817, + "num_input_tokens_seen": 341508055, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6328125, + "step": 15832, + "time_per_iteration": 2.4187471866607666 + }, + { + "auxiliary_loss_clip": 0.01099237, + "auxiliary_loss_mlp": 0.01029878, + "balance_loss_clip": 1.01906586, + "balance_loss_mlp": 1.03727329, + "epoch": 0.9519314594919586, + "flos": 15230649720960.0, + "grad_norm": 2.0408885296052803, + "language_loss": 0.77953559, + "learning_rate": 2.4159432831070135e-08, + "loss": 0.80082679, + "num_input_tokens_seen": 341526155, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 15833, + "time_per_iteration": 2.434434413909912 + }, + { + "auxiliary_loss_clip": 0.01097433, + "auxiliary_loss_mlp": 0.01030915, + "balance_loss_clip": 1.0198164, + "balance_loss_mlp": 1.03424346, + "epoch": 0.9519915827446265, + "flos": 19352943100800.0, + "grad_norm": 2.0874053061362146, + "language_loss": 0.74132979, + "learning_rate": 2.4099118165670007e-08, + "loss": 0.7626133, + "num_input_tokens_seen": 341540450, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6328125, + "step": 15834, + "time_per_iteration": 2.4627585411071777 + }, + { + "auxiliary_loss_clip": 0.01104375, + "auxiliary_loss_mlp": 0.01032918, + "balance_loss_clip": 1.02034688, + "balance_loss_mlp": 1.0350616, + "epoch": 0.9520517059972945, + "flos": 22266303408000.0, + "grad_norm": 1.998991881803621, + "language_loss": 0.76126343, + "learning_rate": 2.4038878426629216e-08, + "loss": 0.78263634, + "num_input_tokens_seen": 341557865, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 15835, + "time_per_iteration": 2.4405393600463867 + }, + { + "auxiliary_loss_clip": 0.01100091, + "auxiliary_loss_mlp": 0.0103236, + "balance_loss_clip": 1.02003992, + "balance_loss_mlp": 1.03354049, + "epoch": 0.9521118292499624, + "flos": 14862313704960.0, + "grad_norm": 1.8271796682870614, + "language_loss": 0.65903687, + "learning_rate": 2.397871361623238e-08, + "loss": 0.68036139, + "num_input_tokens_seen": 341573890, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 15836, + "time_per_iteration": 2.449270248413086 + }, + { + "auxiliary_loss_clip": 0.01097308, + "auxiliary_loss_mlp": 0.01024877, + "balance_loss_clip": 1.01343238, + "balance_loss_mlp": 1.03359866, + "epoch": 0.9521719525026304, + "flos": 23508812718720.0, + "grad_norm": 1.6042939226932975, + "language_loss": 0.70522273, + "learning_rate": 2.391862373676057e-08, + "loss": 0.72644454, + "num_input_tokens_seen": 341593770, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.63671875, + "step": 15837, + "time_per_iteration": 2.485703468322754 + }, + { + "auxiliary_loss_clip": 0.01100856, + "auxiliary_loss_mlp": 0.0103009, + "balance_loss_clip": 1.01728129, + "balance_loss_mlp": 1.03319621, + "epoch": 0.9522320757552983, + "flos": 19714922409600.0, + "grad_norm": 1.769143781079073, + "language_loss": 0.73489517, + "learning_rate": 2.3858608790492617e-08, + "loss": 0.75620466, + "num_input_tokens_seen": 341612065, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 15838, + "time_per_iteration": 2.5037167072296143 + }, + { + "auxiliary_loss_clip": 0.01099497, + "auxiliary_loss_mlp": 0.01029417, + "balance_loss_clip": 1.01803803, + "balance_loss_mlp": 1.03289866, + "epoch": 0.9522921990079664, + "flos": 25921291824000.0, + "grad_norm": 1.9893727922388904, + "language_loss": 0.78339815, + "learning_rate": 2.379866877970449e-08, + "loss": 0.80468726, + "num_input_tokens_seen": 341631365, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6640625, + "step": 15839, + "time_per_iteration": 2.4681766033172607 + }, + { + "auxiliary_loss_clip": 0.01101243, + "auxiliary_loss_mlp": 0.01033121, + "balance_loss_clip": 1.0213666, + "balance_loss_mlp": 1.03504133, + "epoch": 0.9523523222606343, + "flos": 19208115463680.0, + "grad_norm": 1.4720319477602668, + "language_loss": 0.80227256, + "learning_rate": 2.3738803706668585e-08, + "loss": 0.82361627, + "num_input_tokens_seen": 341650300, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 15840, + "time_per_iteration": 2.4581902027130127 + }, + { + "auxiliary_loss_clip": 0.01093038, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.01571536, + "balance_loss_mlp": 1.03195643, + "epoch": 0.9524124455133023, + "flos": 20921269703040.0, + "grad_norm": 1.9542207241379934, + "language_loss": 0.72871137, + "learning_rate": 2.3679013573655314e-08, + "loss": 0.74989724, + "num_input_tokens_seen": 341667680, + "router_z_loss_clip": 0.09863281, + "router_z_loss_mlp": 0.609375, + "step": 15841, + "time_per_iteration": 2.465167760848999 + }, + { + "auxiliary_loss_clip": 0.01093774, + "auxiliary_loss_mlp": 0.01024056, + "balance_loss_clip": 1.01342833, + "balance_loss_mlp": 1.03309047, + "epoch": 0.9524725687659702, + "flos": 18843550375680.0, + "grad_norm": 1.8427976481059472, + "language_loss": 0.78926313, + "learning_rate": 2.3619298382931972e-08, + "loss": 0.81044149, + "num_input_tokens_seen": 341685760, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 15842, + "time_per_iteration": 2.4842257499694824 + }, + { + "auxiliary_loss_clip": 0.01101716, + "auxiliary_loss_mlp": 0.01031806, + "balance_loss_clip": 1.02011764, + "balance_loss_mlp": 1.03695965, + "epoch": 0.9525326920186382, + "flos": 22674680110080.0, + "grad_norm": 1.8219900566748215, + "language_loss": 0.72275579, + "learning_rate": 2.3559658136762973e-08, + "loss": 0.74409097, + "num_input_tokens_seen": 341705300, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 15843, + "time_per_iteration": 2.4471869468688965 + }, + { + "auxiliary_loss_clip": 0.01101194, + "auxiliary_loss_mlp": 0.01030657, + "balance_loss_clip": 1.01852155, + "balance_loss_mlp": 1.03493142, + "epoch": 0.9525928152713061, + "flos": 22086642556800.0, + "grad_norm": 1.6296759970994528, + "language_loss": 0.78324318, + "learning_rate": 2.3500092837409612e-08, + "loss": 0.80456167, + "num_input_tokens_seen": 341724565, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 15844, + "time_per_iteration": 2.4472808837890625 + }, + { + "auxiliary_loss_clip": 0.01103251, + "auxiliary_loss_mlp": 0.01031271, + "balance_loss_clip": 1.01777697, + "balance_loss_mlp": 1.03366756, + "epoch": 0.9526529385239741, + "flos": 20704728562560.0, + "grad_norm": 1.8471796733890804, + "language_loss": 0.69943261, + "learning_rate": 2.3440602487130977e-08, + "loss": 0.72077781, + "num_input_tokens_seen": 341743605, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6953125, + "step": 15845, + "time_per_iteration": 3.8085498809814453 + }, + { + "auxiliary_loss_clip": 0.01101573, + "auxiliary_loss_mlp": 0.01031515, + "balance_loss_clip": 1.02024341, + "balance_loss_mlp": 1.03391755, + "epoch": 0.9527130617766422, + "flos": 23368043318400.0, + "grad_norm": 1.6011167896613763, + "language_loss": 0.75642556, + "learning_rate": 2.338118708818282e-08, + "loss": 0.77775645, + "num_input_tokens_seen": 341763475, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.67578125, + "step": 15846, + "time_per_iteration": 2.473083019256592 + }, + { + "auxiliary_loss_clip": 0.01100923, + "auxiliary_loss_mlp": 0.01027356, + "balance_loss_clip": 1.01582253, + "balance_loss_mlp": 1.03413308, + "epoch": 0.9527731850293101, + "flos": 18985935888000.0, + "grad_norm": 1.7664774219702817, + "language_loss": 0.78162938, + "learning_rate": 2.3321846642817998e-08, + "loss": 0.80291218, + "num_input_tokens_seen": 341781265, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.66796875, + "step": 15847, + "time_per_iteration": 3.8917007446289062 + }, + { + "auxiliary_loss_clip": 0.01096033, + "auxiliary_loss_mlp": 0.01032606, + "balance_loss_clip": 1.02181149, + "balance_loss_mlp": 1.03224957, + "epoch": 0.9528333082819781, + "flos": 19318038059520.0, + "grad_norm": 1.7206750490584977, + "language_loss": 0.77701223, + "learning_rate": 2.326258115328672e-08, + "loss": 0.7982986, + "num_input_tokens_seen": 341798825, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.63671875, + "step": 15848, + "time_per_iteration": 3.823594093322754 + }, + { + "auxiliary_loss_clip": 0.01104682, + "auxiliary_loss_mlp": 0.01039029, + "balance_loss_clip": 1.02638733, + "balance_loss_mlp": 1.03632021, + "epoch": 0.952893431534646, + "flos": 23951340276480.0, + "grad_norm": 1.6696974182974789, + "language_loss": 0.72178817, + "learning_rate": 2.320339062183674e-08, + "loss": 0.74322522, + "num_input_tokens_seen": 341819480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 15849, + "time_per_iteration": 2.4846818447113037 + }, + { + "auxiliary_loss_clip": 0.01107242, + "auxiliary_loss_mlp": 0.0103393, + "balance_loss_clip": 1.02149057, + "balance_loss_mlp": 1.03735495, + "epoch": 0.952953554787314, + "flos": 21030545854080.0, + "grad_norm": 1.6245014301779637, + "language_loss": 0.75090873, + "learning_rate": 2.314427505071226e-08, + "loss": 0.77232051, + "num_input_tokens_seen": 341838035, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.69921875, + "step": 15850, + "time_per_iteration": 2.4306790828704834 + }, + { + "auxiliary_loss_clip": 0.01099677, + "auxiliary_loss_mlp": 0.01028737, + "balance_loss_clip": 1.01789474, + "balance_loss_mlp": 1.03360927, + "epoch": 0.9530136780399819, + "flos": 22382870019840.0, + "grad_norm": 2.6338381558530766, + "language_loss": 0.72366798, + "learning_rate": 2.308523444215482e-08, + "loss": 0.7449522, + "num_input_tokens_seen": 341855895, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 15851, + "time_per_iteration": 2.4408111572265625 + }, + { + "auxiliary_loss_clip": 0.01097199, + "auxiliary_loss_mlp": 0.01025055, + "balance_loss_clip": 1.01375353, + "balance_loss_mlp": 1.03315783, + "epoch": 0.95307380129265, + "flos": 22159613036160.0, + "grad_norm": 2.0504097549488027, + "language_loss": 0.7981447, + "learning_rate": 2.3026268798403525e-08, + "loss": 0.81936717, + "num_input_tokens_seen": 341875240, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 15852, + "time_per_iteration": 2.4637861251831055 + }, + { + "auxiliary_loss_clip": 0.01099896, + "auxiliary_loss_mlp": 0.01033288, + "balance_loss_clip": 1.02134967, + "balance_loss_mlp": 1.03417897, + "epoch": 0.9531339245453179, + "flos": 44022747214080.0, + "grad_norm": 1.7073175594849501, + "language_loss": 0.59777415, + "learning_rate": 2.2967378121694138e-08, + "loss": 0.61910605, + "num_input_tokens_seen": 341901020, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 15853, + "time_per_iteration": 4.21125054359436 + }, + { + "auxiliary_loss_clip": 0.01095024, + "auxiliary_loss_mlp": 0.01027344, + "balance_loss_clip": 1.01688933, + "balance_loss_mlp": 1.03267741, + "epoch": 0.9531940477979859, + "flos": 20266690204800.0, + "grad_norm": 1.8692728037781963, + "language_loss": 0.7304824, + "learning_rate": 2.290856241425998e-08, + "loss": 0.75170606, + "num_input_tokens_seen": 341919365, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.625, + "step": 15854, + "time_per_iteration": 2.432305097579956 + }, + { + "auxiliary_loss_clip": 0.01098391, + "auxiliary_loss_mlp": 0.01028917, + "balance_loss_clip": 1.01780701, + "balance_loss_mlp": 1.03201079, + "epoch": 0.9532541710506538, + "flos": 25335732309120.0, + "grad_norm": 2.4146000582030047, + "language_loss": 0.67618144, + "learning_rate": 2.284982167833127e-08, + "loss": 0.69745457, + "num_input_tokens_seen": 341939985, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6640625, + "step": 15855, + "time_per_iteration": 2.4794416427612305 + }, + { + "auxiliary_loss_clip": 0.01098939, + "auxiliary_loss_mlp": 0.01027176, + "balance_loss_clip": 1.0162859, + "balance_loss_mlp": 1.03353429, + "epoch": 0.9533142943033218, + "flos": 26469288691200.0, + "grad_norm": 1.5011674711012832, + "language_loss": 0.76639926, + "learning_rate": 2.279115591613556e-08, + "loss": 0.78766036, + "num_input_tokens_seen": 341959255, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 15856, + "time_per_iteration": 2.4852230548858643 + }, + { + "auxiliary_loss_clip": 0.01097507, + "auxiliary_loss_mlp": 0.01029936, + "balance_loss_clip": 1.01927257, + "balance_loss_mlp": 1.03294313, + "epoch": 0.9533744175559897, + "flos": 23656944407040.0, + "grad_norm": 1.6032019491566774, + "language_loss": 0.77757066, + "learning_rate": 2.2732565129897075e-08, + "loss": 0.79884511, + "num_input_tokens_seen": 341977205, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15857, + "time_per_iteration": 2.4635865688323975 + }, + { + "auxiliary_loss_clip": 0.01021553, + "auxiliary_loss_mlp": 0.01002209, + "balance_loss_clip": 1.00120187, + "balance_loss_mlp": 1.00156283, + "epoch": 0.9534345408086577, + "flos": 61052055500160.0, + "grad_norm": 0.704565405960459, + "language_loss": 0.62570769, + "learning_rate": 2.267404932183803e-08, + "loss": 0.64594531, + "num_input_tokens_seen": 342038545, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 15858, + "time_per_iteration": 3.001497268676758 + }, + { + "auxiliary_loss_clip": 0.01097781, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.01493001, + "balance_loss_mlp": 1.03351498, + "epoch": 0.9534946640613258, + "flos": 18951677291520.0, + "grad_norm": 1.454205662994463, + "language_loss": 0.56674993, + "learning_rate": 2.2615608494177097e-08, + "loss": 0.58798563, + "num_input_tokens_seen": 342058195, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 15859, + "time_per_iteration": 2.518068313598633 + }, + { + "auxiliary_loss_clip": 0.01094952, + "auxiliary_loss_mlp": 0.01027304, + "balance_loss_clip": 1.01677203, + "balance_loss_mlp": 1.03268635, + "epoch": 0.9535547873139937, + "flos": 16654292340480.0, + "grad_norm": 2.021746638389019, + "language_loss": 0.81863093, + "learning_rate": 2.2557242649130504e-08, + "loss": 0.83985353, + "num_input_tokens_seen": 342075025, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.625, + "step": 15860, + "time_per_iteration": 2.493851900100708 + }, + { + "auxiliary_loss_clip": 0.01097997, + "auxiliary_loss_mlp": 0.01024861, + "balance_loss_clip": 1.0143348, + "balance_loss_mlp": 1.03253686, + "epoch": 0.9536149105666617, + "flos": 20667776446080.0, + "grad_norm": 1.7693188133463755, + "language_loss": 0.66683793, + "learning_rate": 2.249895178891159e-08, + "loss": 0.68806648, + "num_input_tokens_seen": 342094595, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 15861, + "time_per_iteration": 2.4849302768707275 + }, + { + "auxiliary_loss_clip": 0.01100217, + "auxiliary_loss_mlp": 0.01034093, + "balance_loss_clip": 1.02243447, + "balance_loss_mlp": 1.03482676, + "epoch": 0.9536750338193296, + "flos": 30700499086080.0, + "grad_norm": 2.2855998410592417, + "language_loss": 0.65861797, + "learning_rate": 2.244073591573037e-08, + "loss": 0.67996109, + "num_input_tokens_seen": 342115970, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 15862, + "time_per_iteration": 2.5085601806640625 + }, + { + "auxiliary_loss_clip": 0.01098858, + "auxiliary_loss_mlp": 0.010273, + "balance_loss_clip": 1.01651764, + "balance_loss_mlp": 1.03623557, + "epoch": 0.9537351570719976, + "flos": 20405484357120.0, + "grad_norm": 1.4470053207480973, + "language_loss": 0.6742301, + "learning_rate": 2.238259503179485e-08, + "loss": 0.69549167, + "num_input_tokens_seen": 342134080, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.625, + "step": 15863, + "time_per_iteration": 2.4261577129364014 + }, + { + "auxiliary_loss_clip": 0.01099259, + "auxiliary_loss_mlp": 0.01028199, + "balance_loss_clip": 1.01673734, + "balance_loss_mlp": 1.03436029, + "epoch": 0.9537952803246655, + "flos": 29929245235200.0, + "grad_norm": 1.814028155072979, + "language_loss": 0.7815752, + "learning_rate": 2.2324529139309267e-08, + "loss": 0.80284977, + "num_input_tokens_seen": 342154725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15864, + "time_per_iteration": 2.530269145965576 + }, + { + "auxiliary_loss_clip": 0.01097838, + "auxiliary_loss_mlp": 0.01026439, + "balance_loss_clip": 1.01526916, + "balance_loss_mlp": 1.03393769, + "epoch": 0.9538554035773336, + "flos": 20521404524160.0, + "grad_norm": 1.8811083442124992, + "language_loss": 0.5989036, + "learning_rate": 2.226653824047586e-08, + "loss": 0.62014639, + "num_input_tokens_seen": 342172275, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 15865, + "time_per_iteration": 2.4256134033203125 + }, + { + "auxiliary_loss_clip": 0.01098516, + "auxiliary_loss_mlp": 0.01027192, + "balance_loss_clip": 1.0161171, + "balance_loss_mlp": 1.0329715, + "epoch": 0.9539155268300015, + "flos": 18406517598720.0, + "grad_norm": 1.8653100688509543, + "language_loss": 0.69772661, + "learning_rate": 2.2208622337493765e-08, + "loss": 0.71898365, + "num_input_tokens_seen": 342190880, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 15866, + "time_per_iteration": 2.43904447555542 + }, + { + "auxiliary_loss_clip": 0.01099512, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.01943624, + "balance_loss_mlp": 1.0335021, + "epoch": 0.9539756500826695, + "flos": 26213281482240.0, + "grad_norm": 2.271920105664109, + "language_loss": 0.84857428, + "learning_rate": 2.215078143255855e-08, + "loss": 0.86988258, + "num_input_tokens_seen": 342208165, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.66015625, + "step": 15867, + "time_per_iteration": 2.460845708847046 + }, + { + "auxiliary_loss_clip": 0.01021587, + "auxiliary_loss_mlp": 0.00999883, + "balance_loss_clip": 0.9989357, + "balance_loss_mlp": 1.00168824, + "epoch": 0.9540357733353374, + "flos": 68289097766400.0, + "grad_norm": 0.7526563387722108, + "language_loss": 0.61838603, + "learning_rate": 2.2093015527864024e-08, + "loss": 0.63860077, + "num_input_tokens_seen": 342277110, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.19921875, + "step": 15868, + "time_per_iteration": 3.0988385677337646 + }, + { + "auxiliary_loss_clip": 0.01099704, + "auxiliary_loss_mlp": 0.01027007, + "balance_loss_clip": 1.01490152, + "balance_loss_mlp": 1.03455853, + "epoch": 0.9540958965880054, + "flos": 21288276915840.0, + "grad_norm": 1.7170259212083214, + "language_loss": 0.60134614, + "learning_rate": 2.2035324625600425e-08, + "loss": 0.62261331, + "num_input_tokens_seen": 342294695, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 15869, + "time_per_iteration": 2.4509079456329346 + }, + { + "auxiliary_loss_clip": 0.01099414, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.02160573, + "balance_loss_mlp": 1.03469789, + "epoch": 0.9541560198406733, + "flos": 19751407649280.0, + "grad_norm": 1.7269587427502697, + "language_loss": 0.70540398, + "learning_rate": 2.197770872795579e-08, + "loss": 0.72671461, + "num_input_tokens_seen": 342314970, + "router_z_loss_clip": 0.10009766, + "router_z_loss_mlp": 0.6484375, + "step": 15870, + "time_per_iteration": 2.494284152984619 + }, + { + "auxiliary_loss_clip": 0.01095736, + "auxiliary_loss_mlp": 0.01027683, + "balance_loss_clip": 1.01579773, + "balance_loss_mlp": 1.03193331, + "epoch": 0.9542161430933414, + "flos": 24715626888960.0, + "grad_norm": 2.5278290831513313, + "language_loss": 0.76707828, + "learning_rate": 2.1920167837114368e-08, + "loss": 0.78831249, + "num_input_tokens_seen": 342334255, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.640625, + "step": 15871, + "time_per_iteration": 2.495964527130127 + }, + { + "auxiliary_loss_clip": 0.01100681, + "auxiliary_loss_mlp": 0.01028549, + "balance_loss_clip": 1.01648521, + "balance_loss_mlp": 1.03446722, + "epoch": 0.9542762663460094, + "flos": 31065818359680.0, + "grad_norm": 1.7535486260785396, + "language_loss": 0.58022785, + "learning_rate": 2.1862701955258634e-08, + "loss": 0.60152018, + "num_input_tokens_seen": 342354730, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 15872, + "time_per_iteration": 2.5454163551330566 + }, + { + "auxiliary_loss_clip": 0.01102984, + "auxiliary_loss_mlp": 0.01029416, + "balance_loss_clip": 1.01681566, + "balance_loss_mlp": 1.03452253, + "epoch": 0.9543363895986773, + "flos": 20776729374720.0, + "grad_norm": 1.4591829449069909, + "language_loss": 0.74832845, + "learning_rate": 2.1805311084567514e-08, + "loss": 0.76965249, + "num_input_tokens_seen": 342374565, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.68359375, + "step": 15873, + "time_per_iteration": 2.4488136768341064 + }, + { + "auxiliary_loss_clip": 0.01101317, + "auxiliary_loss_mlp": 0.01031843, + "balance_loss_clip": 1.01900995, + "balance_loss_mlp": 1.03453755, + "epoch": 0.9543965128513453, + "flos": 24462744163200.0, + "grad_norm": 1.727737631306832, + "language_loss": 0.62304831, + "learning_rate": 2.1747995227217265e-08, + "loss": 0.64437991, + "num_input_tokens_seen": 342394590, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.66796875, + "step": 15874, + "time_per_iteration": 2.4801042079925537 + }, + { + "auxiliary_loss_clip": 0.01098124, + "auxiliary_loss_mlp": 0.01032916, + "balance_loss_clip": 1.02129853, + "balance_loss_mlp": 1.03405643, + "epoch": 0.9544566361040132, + "flos": 15261532439040.0, + "grad_norm": 2.237267296362062, + "language_loss": 0.89501953, + "learning_rate": 2.169075438538104e-08, + "loss": 0.91632992, + "num_input_tokens_seen": 342410445, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 15875, + "time_per_iteration": 2.391258716583252 + }, + { + "auxiliary_loss_clip": 0.01103812, + "auxiliary_loss_mlp": 0.01031982, + "balance_loss_clip": 1.01947105, + "balance_loss_mlp": 1.03519917, + "epoch": 0.9545167593566812, + "flos": 25918777872000.0, + "grad_norm": 1.930514194430758, + "language_loss": 0.67863441, + "learning_rate": 2.1633588561229765e-08, + "loss": 0.69999236, + "num_input_tokens_seen": 342430970, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 15876, + "time_per_iteration": 2.6414058208465576 + }, + { + "auxiliary_loss_clip": 0.01101042, + "auxiliary_loss_mlp": 0.01028919, + "balance_loss_clip": 1.01681304, + "balance_loss_mlp": 1.0339992, + "epoch": 0.9545768826093491, + "flos": 25628188844160.0, + "grad_norm": 1.7708559487688424, + "language_loss": 0.6911338, + "learning_rate": 2.1576497756931267e-08, + "loss": 0.7124334, + "num_input_tokens_seen": 342449505, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.671875, + "step": 15877, + "time_per_iteration": 2.509392738342285 + }, + { + "auxiliary_loss_clip": 0.01102036, + "auxiliary_loss_mlp": 0.01028312, + "balance_loss_clip": 1.01631999, + "balance_loss_mlp": 1.03537035, + "epoch": 0.9546370058620172, + "flos": 22491499726080.0, + "grad_norm": 1.672159994427175, + "language_loss": 0.70852697, + "learning_rate": 2.1519481974650035e-08, + "loss": 0.72983038, + "num_input_tokens_seen": 342470390, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 15878, + "time_per_iteration": 2.472141742706299 + }, + { + "auxiliary_loss_clip": 0.01096131, + "auxiliary_loss_mlp": 0.01026374, + "balance_loss_clip": 1.01498389, + "balance_loss_mlp": 1.03232455, + "epoch": 0.9546971291146851, + "flos": 24609582961920.0, + "grad_norm": 1.3589761302170789, + "language_loss": 0.68371421, + "learning_rate": 2.1462541216548335e-08, + "loss": 0.70493931, + "num_input_tokens_seen": 342492560, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 15879, + "time_per_iteration": 2.4805357456207275 + }, + { + "auxiliary_loss_clip": 0.01097447, + "auxiliary_loss_mlp": 0.01027897, + "balance_loss_clip": 1.01687622, + "balance_loss_mlp": 1.0332495, + "epoch": 0.9547572523673531, + "flos": 28657756627200.0, + "grad_norm": 1.9054124900066427, + "language_loss": 0.84860075, + "learning_rate": 2.1405675484785334e-08, + "loss": 0.86985421, + "num_input_tokens_seen": 342512315, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15880, + "time_per_iteration": 2.5021228790283203 + }, + { + "auxiliary_loss_clip": 0.01099262, + "auxiliary_loss_mlp": 0.01030074, + "balance_loss_clip": 1.0179683, + "balance_loss_mlp": 1.0333271, + "epoch": 0.954817375620021, + "flos": 33802606385280.0, + "grad_norm": 1.7287846320179276, + "language_loss": 0.71575916, + "learning_rate": 2.134888478151753e-08, + "loss": 0.73705256, + "num_input_tokens_seen": 342533060, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 15881, + "time_per_iteration": 2.5729317665100098 + }, + { + "auxiliary_loss_clip": 0.01099118, + "auxiliary_loss_mlp": 0.01035277, + "balance_loss_clip": 1.02368426, + "balance_loss_mlp": 1.03515661, + "epoch": 0.954877498872689, + "flos": 14428225843200.0, + "grad_norm": 1.8203154025696195, + "language_loss": 0.71242815, + "learning_rate": 2.1292169108898083e-08, + "loss": 0.7337721, + "num_input_tokens_seen": 342550830, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 15882, + "time_per_iteration": 2.4185373783111572 + }, + { + "auxiliary_loss_clip": 0.01100013, + "auxiliary_loss_mlp": 0.01030602, + "balance_loss_clip": 1.01927757, + "balance_loss_mlp": 1.03457165, + "epoch": 0.9549376221253569, + "flos": 59269447336320.0, + "grad_norm": 2.0067236869483644, + "language_loss": 0.66055608, + "learning_rate": 2.1235528469078168e-08, + "loss": 0.68186224, + "num_input_tokens_seen": 342575070, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65625, + "step": 15883, + "time_per_iteration": 2.812549114227295 + }, + { + "auxiliary_loss_clip": 0.0110374, + "auxiliary_loss_mlp": 0.01028151, + "balance_loss_clip": 1.01572978, + "balance_loss_mlp": 1.03677058, + "epoch": 0.954997745378025, + "flos": 17274397760640.0, + "grad_norm": 2.8838417421243645, + "language_loss": 0.7817893, + "learning_rate": 2.1178962864205175e-08, + "loss": 0.80310816, + "num_input_tokens_seen": 342592215, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.66796875, + "step": 15884, + "time_per_iteration": 2.4394302368164062 + }, + { + "auxiliary_loss_clip": 0.01100977, + "auxiliary_loss_mlp": 0.01025185, + "balance_loss_clip": 1.01315713, + "balance_loss_mlp": 1.03389931, + "epoch": 0.955057868630693, + "flos": 13006378903680.0, + "grad_norm": 5.22444846040725, + "language_loss": 0.776416, + "learning_rate": 2.1122472296424054e-08, + "loss": 0.79767764, + "num_input_tokens_seen": 342610030, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15885, + "time_per_iteration": 2.4130163192749023 + }, + { + "auxiliary_loss_clip": 0.01099455, + "auxiliary_loss_mlp": 0.01030292, + "balance_loss_clip": 1.01905656, + "balance_loss_mlp": 1.03313184, + "epoch": 0.9551179918833609, + "flos": 22637692080000.0, + "grad_norm": 1.971891405607794, + "language_loss": 0.69846129, + "learning_rate": 2.1066056767877317e-08, + "loss": 0.71975875, + "num_input_tokens_seen": 342626475, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 15886, + "time_per_iteration": 2.4465065002441406 + }, + { + "auxiliary_loss_clip": 0.01103927, + "auxiliary_loss_mlp": 0.01032148, + "balance_loss_clip": 1.01940477, + "balance_loss_mlp": 1.03575993, + "epoch": 0.9551781151360289, + "flos": 21542811667200.0, + "grad_norm": 1.7290024537631783, + "language_loss": 0.72445035, + "learning_rate": 2.1009716280703916e-08, + "loss": 0.7458111, + "num_input_tokens_seen": 342646645, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 15887, + "time_per_iteration": 3.847572088241577 + }, + { + "auxiliary_loss_clip": 0.01095349, + "auxiliary_loss_mlp": 0.01026638, + "balance_loss_clip": 1.01572418, + "balance_loss_mlp": 1.0323261, + "epoch": 0.9552382383886968, + "flos": 20702250524160.0, + "grad_norm": 2.2429397817873085, + "language_loss": 0.56737578, + "learning_rate": 2.0953450837040364e-08, + "loss": 0.58859569, + "num_input_tokens_seen": 342663615, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 15888, + "time_per_iteration": 2.4127004146575928 + }, + { + "auxiliary_loss_clip": 0.0102134, + "auxiliary_loss_mlp": 0.01002702, + "balance_loss_clip": 1.00170708, + "balance_loss_mlp": 1.00141358, + "epoch": 0.9552983616413648, + "flos": 67769792887680.0, + "grad_norm": 0.7089096744564684, + "language_loss": 0.57814407, + "learning_rate": 2.0897260439020514e-08, + "loss": 0.5983845, + "num_input_tokens_seen": 342728275, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 15889, + "time_per_iteration": 5.906970977783203 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.0102726, + "balance_loss_clip": 1.01538062, + "balance_loss_mlp": 1.03259969, + "epoch": 0.9553584848940327, + "flos": 21579979265280.0, + "grad_norm": 1.3242300073138324, + "language_loss": 0.66891074, + "learning_rate": 2.084114508877466e-08, + "loss": 0.69018579, + "num_input_tokens_seen": 342748860, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.67578125, + "step": 15890, + "time_per_iteration": 2.4627249240875244 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01028335, + "balance_loss_clip": 1.01665831, + "balance_loss_mlp": 1.03478599, + "epoch": 0.9554186081467008, + "flos": 24208173498240.0, + "grad_norm": 1.4510635562982561, + "language_loss": 0.74006915, + "learning_rate": 2.0785104788430874e-08, + "loss": 0.76135355, + "num_input_tokens_seen": 342769705, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 15891, + "time_per_iteration": 2.484015941619873 + }, + { + "auxiliary_loss_clip": 0.01095435, + "auxiliary_loss_mlp": 0.01028968, + "balance_loss_clip": 1.01869869, + "balance_loss_mlp": 1.03344524, + "epoch": 0.9554787313993687, + "flos": 16251554073600.0, + "grad_norm": 2.2414416282964824, + "language_loss": 0.77894902, + "learning_rate": 2.072913954011435e-08, + "loss": 0.80019307, + "num_input_tokens_seen": 342787000, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6171875, + "step": 15892, + "time_per_iteration": 2.4298534393310547 + }, + { + "auxiliary_loss_clip": 0.01097855, + "auxiliary_loss_mlp": 0.01030441, + "balance_loss_clip": 1.01835918, + "balance_loss_mlp": 1.03325903, + "epoch": 0.9555388546520367, + "flos": 23404133508480.0, + "grad_norm": 1.4859618601332218, + "language_loss": 0.69746578, + "learning_rate": 2.0673249345947386e-08, + "loss": 0.71874869, + "num_input_tokens_seen": 342807795, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 15893, + "time_per_iteration": 2.4958953857421875 + }, + { + "auxiliary_loss_clip": 0.01100591, + "auxiliary_loss_mlp": 0.01030848, + "balance_loss_clip": 1.01775336, + "balance_loss_mlp": 1.0359931, + "epoch": 0.9555989779047046, + "flos": 14794047907200.0, + "grad_norm": 1.9274290043089441, + "language_loss": 0.65745211, + "learning_rate": 2.0617434208048955e-08, + "loss": 0.67876649, + "num_input_tokens_seen": 342825490, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6484375, + "step": 15894, + "time_per_iteration": 4.018404960632324 + }, + { + "auxiliary_loss_clip": 0.01101163, + "auxiliary_loss_mlp": 0.01029588, + "balance_loss_clip": 1.01734471, + "balance_loss_mlp": 1.03446078, + "epoch": 0.9556591011573726, + "flos": 22236749493120.0, + "grad_norm": 1.9622102153443857, + "language_loss": 0.81861794, + "learning_rate": 2.056169412853581e-08, + "loss": 0.83992541, + "num_input_tokens_seen": 342844965, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 15895, + "time_per_iteration": 2.4529037475585938 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.01030613, + "balance_loss_clip": 1.01922894, + "balance_loss_mlp": 1.0347774, + "epoch": 0.9557192244100405, + "flos": 27855296835840.0, + "grad_norm": 3.12144499892649, + "language_loss": 0.72422135, + "learning_rate": 2.0506029109521593e-08, + "loss": 0.74553216, + "num_input_tokens_seen": 342865915, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 15896, + "time_per_iteration": 2.531371831893921 + }, + { + "auxiliary_loss_clip": 0.01097836, + "auxiliary_loss_mlp": 0.01027687, + "balance_loss_clip": 1.01624894, + "balance_loss_mlp": 1.03318739, + "epoch": 0.9557793476627086, + "flos": 17602800831360.0, + "grad_norm": 1.891257637063241, + "language_loss": 0.79660171, + "learning_rate": 2.045043915311706e-08, + "loss": 0.81785691, + "num_input_tokens_seen": 342884000, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 15897, + "time_per_iteration": 2.423757791519165 + }, + { + "auxiliary_loss_clip": 0.01098206, + "auxiliary_loss_mlp": 0.01032377, + "balance_loss_clip": 1.02033651, + "balance_loss_mlp": 1.03225029, + "epoch": 0.9558394709153766, + "flos": 23875496709120.0, + "grad_norm": 1.6030681434016965, + "language_loss": 0.72389764, + "learning_rate": 2.03949242614303e-08, + "loss": 0.7452035, + "num_input_tokens_seen": 342903095, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66015625, + "step": 15898, + "time_per_iteration": 2.4686954021453857 + }, + { + "auxiliary_loss_clip": 0.01021576, + "auxiliary_loss_mlp": 0.0099842, + "balance_loss_clip": 0.9974727, + "balance_loss_mlp": 1.00169182, + "epoch": 0.9558995941680445, + "flos": 53682001171200.0, + "grad_norm": 0.8420099231602695, + "language_loss": 0.52358627, + "learning_rate": 2.033948443656652e-08, + "loss": 0.54378629, + "num_input_tokens_seen": 342958155, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.19921875, + "step": 15899, + "time_per_iteration": 3.0036983489990234 + }, + { + "auxiliary_loss_clip": 0.01104279, + "auxiliary_loss_mlp": 0.01030778, + "balance_loss_clip": 1.0179863, + "balance_loss_mlp": 1.03525329, + "epoch": 0.9559597174207125, + "flos": 13764488376960.0, + "grad_norm": 2.249257849936427, + "language_loss": 0.68539107, + "learning_rate": 2.028411968062782e-08, + "loss": 0.70674157, + "num_input_tokens_seen": 342972500, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 15900, + "time_per_iteration": 2.4041638374328613 + }, + { + "auxiliary_loss_clip": 0.01100192, + "auxiliary_loss_mlp": 0.01026875, + "balance_loss_clip": 1.01519287, + "balance_loss_mlp": 1.03403616, + "epoch": 0.9560198406733804, + "flos": 19936347799680.0, + "grad_norm": 2.560017279471282, + "language_loss": 0.82855231, + "learning_rate": 2.0228829995713627e-08, + "loss": 0.849823, + "num_input_tokens_seen": 342989035, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 15901, + "time_per_iteration": 2.488877534866333 + }, + { + "auxiliary_loss_clip": 0.01021779, + "auxiliary_loss_mlp": 0.0100249, + "balance_loss_clip": 1.00149441, + "balance_loss_mlp": 1.00174415, + "epoch": 0.9560799639260484, + "flos": 57289550699520.0, + "grad_norm": 0.7081018961368435, + "language_loss": 0.54319799, + "learning_rate": 2.0173615383920485e-08, + "loss": 0.56344068, + "num_input_tokens_seen": 343051675, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20117188, + "step": 15902, + "time_per_iteration": 3.1055378913879395 + }, + { + "auxiliary_loss_clip": 0.01093723, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01904833, + "balance_loss_mlp": 1.03346229, + "epoch": 0.9561400871787163, + "flos": 18917167299840.0, + "grad_norm": 1.5555768909196643, + "language_loss": 0.85443425, + "learning_rate": 2.01184758473425e-08, + "loss": 0.87565827, + "num_input_tokens_seen": 343068895, + "router_z_loss_clip": 0.09619141, + "router_z_loss_mlp": 0.60546875, + "step": 15903, + "time_per_iteration": 2.4546353816986084 + }, + { + "auxiliary_loss_clip": 0.01097244, + "auxiliary_loss_mlp": 0.01026895, + "balance_loss_clip": 1.01644611, + "balance_loss_mlp": 1.03315914, + "epoch": 0.9562002104313844, + "flos": 18038576632320.0, + "grad_norm": 1.9837154441799645, + "language_loss": 0.80416489, + "learning_rate": 2.0063411388070217e-08, + "loss": 0.82540631, + "num_input_tokens_seen": 343087115, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 15904, + "time_per_iteration": 2.410019874572754 + }, + { + "auxiliary_loss_clip": 0.0110219, + "auxiliary_loss_mlp": 0.0103026, + "balance_loss_clip": 1.01825547, + "balance_loss_mlp": 1.03522384, + "epoch": 0.9562603336840523, + "flos": 24717673964160.0, + "grad_norm": 2.367778122852727, + "language_loss": 0.6043731, + "learning_rate": 2.0008422008191972e-08, + "loss": 0.62569761, + "num_input_tokens_seen": 343105575, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15905, + "time_per_iteration": 2.484440803527832 + }, + { + "auxiliary_loss_clip": 0.01097638, + "auxiliary_loss_mlp": 0.0102839, + "balance_loss_clip": 1.01712513, + "balance_loss_mlp": 1.0328846, + "epoch": 0.9563204569367203, + "flos": 21177205084800.0, + "grad_norm": 1.9836519503290855, + "language_loss": 0.69943386, + "learning_rate": 1.995350770979254e-08, + "loss": 0.72069418, + "num_input_tokens_seen": 343123025, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 15906, + "time_per_iteration": 2.436502456665039 + }, + { + "auxiliary_loss_clip": 0.01103642, + "auxiliary_loss_mlp": 0.01028107, + "balance_loss_clip": 1.01620412, + "balance_loss_mlp": 1.03666723, + "epoch": 0.9563805801893882, + "flos": 20229738088320.0, + "grad_norm": 1.5331239973913557, + "language_loss": 0.7067498, + "learning_rate": 1.9898668494954473e-08, + "loss": 0.72806728, + "num_input_tokens_seen": 343141625, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15907, + "time_per_iteration": 2.43973445892334 + }, + { + "auxiliary_loss_clip": 0.01097674, + "auxiliary_loss_mlp": 0.01028674, + "balance_loss_clip": 1.01680064, + "balance_loss_mlp": 1.03331208, + "epoch": 0.9564407034420562, + "flos": 25411001258880.0, + "grad_norm": 1.8625996431981158, + "language_loss": 0.7003063, + "learning_rate": 1.9843904365757447e-08, + "loss": 0.72156978, + "num_input_tokens_seen": 343161300, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 15908, + "time_per_iteration": 2.477755308151245 + }, + { + "auxiliary_loss_clip": 0.01100131, + "auxiliary_loss_mlp": 0.01029921, + "balance_loss_clip": 1.01866722, + "balance_loss_mlp": 1.03570485, + "epoch": 0.9565008266947241, + "flos": 18623884752000.0, + "grad_norm": 4.607070324323302, + "language_loss": 0.83111703, + "learning_rate": 1.978921532427802e-08, + "loss": 0.85241747, + "num_input_tokens_seen": 343177815, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.64453125, + "step": 15909, + "time_per_iteration": 2.4829578399658203 + }, + { + "auxiliary_loss_clip": 0.01098212, + "auxiliary_loss_mlp": 0.0103104, + "balance_loss_clip": 1.01955426, + "balance_loss_mlp": 1.0328548, + "epoch": 0.9565609499473922, + "flos": 24862142465280.0, + "grad_norm": 2.2246333809539465, + "language_loss": 0.6721313, + "learning_rate": 1.9734601372590086e-08, + "loss": 0.69342375, + "num_input_tokens_seen": 343198140, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 15910, + "time_per_iteration": 2.4749066829681396 + }, + { + "auxiliary_loss_clip": 0.01102469, + "auxiliary_loss_mlp": 0.010327, + "balance_loss_clip": 1.02111912, + "balance_loss_mlp": 1.03529978, + "epoch": 0.9566210732000601, + "flos": 21798459740160.0, + "grad_norm": 1.6112632328082404, + "language_loss": 0.74234146, + "learning_rate": 1.968006251276444e-08, + "loss": 0.76369315, + "num_input_tokens_seen": 343218280, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 15911, + "time_per_iteration": 2.4732449054718018 + }, + { + "auxiliary_loss_clip": 0.01098446, + "auxiliary_loss_mlp": 0.010274, + "balance_loss_clip": 1.01601529, + "balance_loss_mlp": 1.03259337, + "epoch": 0.9566811964527281, + "flos": 18697609416960.0, + "grad_norm": 2.080099232375226, + "language_loss": 0.69968218, + "learning_rate": 1.9625598746869198e-08, + "loss": 0.72094059, + "num_input_tokens_seen": 343236850, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 15912, + "time_per_iteration": 2.4121153354644775 + }, + { + "auxiliary_loss_clip": 0.01100916, + "auxiliary_loss_mlp": 0.01034899, + "balance_loss_clip": 1.02324605, + "balance_loss_mlp": 1.034904, + "epoch": 0.9567413197053961, + "flos": 13000632727680.0, + "grad_norm": 4.337452858375998, + "language_loss": 0.7253468, + "learning_rate": 1.95712100769696e-08, + "loss": 0.74670494, + "num_input_tokens_seen": 343253065, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 15913, + "time_per_iteration": 2.4348838329315186 + }, + { + "auxiliary_loss_clip": 0.01099633, + "auxiliary_loss_mlp": 0.0102546, + "balance_loss_clip": 1.01477885, + "balance_loss_mlp": 1.03494883, + "epoch": 0.956801442958064, + "flos": 19719267955200.0, + "grad_norm": 1.9142325578386978, + "language_loss": 0.73507404, + "learning_rate": 1.9516896505128444e-08, + "loss": 0.75632489, + "num_input_tokens_seen": 343270330, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.64453125, + "step": 15914, + "time_per_iteration": 2.4489924907684326 + }, + { + "auxiliary_loss_clip": 0.010974, + "auxiliary_loss_mlp": 0.01027808, + "balance_loss_clip": 1.01634026, + "balance_loss_mlp": 1.03322947, + "epoch": 0.956861566210732, + "flos": 18222834424320.0, + "grad_norm": 1.5054378463207643, + "language_loss": 0.67459226, + "learning_rate": 1.9462658033404965e-08, + "loss": 0.69584435, + "num_input_tokens_seen": 343289625, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 15915, + "time_per_iteration": 2.4519832134246826 + }, + { + "auxiliary_loss_clip": 0.01097523, + "auxiliary_loss_mlp": 0.01027131, + "balance_loss_clip": 1.01603842, + "balance_loss_mlp": 1.03358841, + "epoch": 0.9569216894634, + "flos": 22196960202240.0, + "grad_norm": 1.7543278741085384, + "language_loss": 0.64166009, + "learning_rate": 1.9408494663855967e-08, + "loss": 0.66290665, + "num_input_tokens_seen": 343309200, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 15916, + "time_per_iteration": 2.4360384941101074 + }, + { + "auxiliary_loss_clip": 0.0109159, + "auxiliary_loss_mlp": 0.01029462, + "balance_loss_clip": 1.01882899, + "balance_loss_mlp": 1.03175974, + "epoch": 0.956981812716068, + "flos": 21689291329920.0, + "grad_norm": 1.904935652985477, + "language_loss": 0.80659258, + "learning_rate": 1.935440639853536e-08, + "loss": 0.82780313, + "num_input_tokens_seen": 343326270, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.59765625, + "step": 15917, + "time_per_iteration": 2.480591297149658 + }, + { + "auxiliary_loss_clip": 0.01098198, + "auxiliary_loss_mlp": 0.01031132, + "balance_loss_clip": 1.01965833, + "balance_loss_mlp": 1.03460228, + "epoch": 0.9570419359687359, + "flos": 13990905757440.0, + "grad_norm": 1.758517813594143, + "language_loss": 0.72947186, + "learning_rate": 1.9300393239494172e-08, + "loss": 0.7507652, + "num_input_tokens_seen": 343344430, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 15918, + "time_per_iteration": 2.423100471496582 + }, + { + "auxiliary_loss_clip": 0.01021645, + "auxiliary_loss_mlp": 0.01001396, + "balance_loss_clip": 1.0004425, + "balance_loss_mlp": 1.00167096, + "epoch": 0.9571020592214039, + "flos": 65196938534400.0, + "grad_norm": 0.6313369610735284, + "language_loss": 0.53130996, + "learning_rate": 1.924645518878032e-08, + "loss": 0.55154037, + "num_input_tokens_seen": 343416155, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 15919, + "time_per_iteration": 3.1794583797454834 + }, + { + "auxiliary_loss_clip": 0.01106485, + "auxiliary_loss_mlp": 0.01035749, + "balance_loss_clip": 1.02320194, + "balance_loss_mlp": 1.03811026, + "epoch": 0.9571621824740718, + "flos": 17384068961280.0, + "grad_norm": 2.571633844940979, + "language_loss": 0.75538218, + "learning_rate": 1.919259224843972e-08, + "loss": 0.77680451, + "num_input_tokens_seen": 343431715, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.68359375, + "step": 15920, + "time_per_iteration": 2.4159767627716064 + }, + { + "auxiliary_loss_clip": 0.01102735, + "auxiliary_loss_mlp": 0.01033589, + "balance_loss_clip": 1.02107811, + "balance_loss_mlp": 1.0350672, + "epoch": 0.9572223057267398, + "flos": 14538184352640.0, + "grad_norm": 1.9063096196660445, + "language_loss": 0.7912389, + "learning_rate": 1.9138804420514298e-08, + "loss": 0.8126021, + "num_input_tokens_seen": 343450425, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.67578125, + "step": 15921, + "time_per_iteration": 2.45986008644104 + }, + { + "auxiliary_loss_clip": 0.01103353, + "auxiliary_loss_mlp": 0.01027887, + "balance_loss_clip": 1.01543534, + "balance_loss_mlp": 1.03351963, + "epoch": 0.9572824289794077, + "flos": 33947793158400.0, + "grad_norm": 2.0395877371527718, + "language_loss": 0.50749934, + "learning_rate": 1.9085091707044197e-08, + "loss": 0.52881169, + "num_input_tokens_seen": 343470445, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.69921875, + "step": 15922, + "time_per_iteration": 2.555110454559326 + }, + { + "auxiliary_loss_clip": 0.01101174, + "auxiliary_loss_mlp": 0.01032467, + "balance_loss_clip": 1.0203439, + "balance_loss_mlp": 1.03463745, + "epoch": 0.9573425522320758, + "flos": 18694915896960.0, + "grad_norm": 2.0324325155300844, + "language_loss": 0.83707559, + "learning_rate": 1.903145411006557e-08, + "loss": 0.85841203, + "num_input_tokens_seen": 343485200, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15923, + "time_per_iteration": 2.4552669525146484 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01028667, + "balance_loss_clip": 1.01791394, + "balance_loss_mlp": 1.0326041, + "epoch": 0.9574026754847437, + "flos": 28510307297280.0, + "grad_norm": 1.5161968575353546, + "language_loss": 0.74902648, + "learning_rate": 1.8977891631613008e-08, + "loss": 0.77027792, + "num_input_tokens_seen": 343505080, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 15924, + "time_per_iteration": 2.5490124225616455 + }, + { + "auxiliary_loss_clip": 0.01098896, + "auxiliary_loss_mlp": 0.01030926, + "balance_loss_clip": 1.01905894, + "balance_loss_mlp": 1.03276801, + "epoch": 0.9574627987374117, + "flos": 24352390604160.0, + "grad_norm": 2.078581919020162, + "language_loss": 0.85878658, + "learning_rate": 1.892440427371711e-08, + "loss": 0.88008475, + "num_input_tokens_seen": 343523995, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15925, + "time_per_iteration": 2.476585865020752 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.0103264, + "balance_loss_clip": 1.02014744, + "balance_loss_mlp": 1.03549838, + "epoch": 0.9575229219900797, + "flos": 23510680225920.0, + "grad_norm": 2.8194945253474297, + "language_loss": 0.75799584, + "learning_rate": 1.8870992038406474e-08, + "loss": 0.77937198, + "num_input_tokens_seen": 343542015, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 15926, + "time_per_iteration": 2.4524552822113037 + }, + { + "auxiliary_loss_clip": 0.01101507, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01854181, + "balance_loss_mlp": 1.03607941, + "epoch": 0.9575830452427476, + "flos": 22674823764480.0, + "grad_norm": 1.622745351817711, + "language_loss": 0.77535486, + "learning_rate": 1.8817654927706373e-08, + "loss": 0.79666108, + "num_input_tokens_seen": 343561680, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.65625, + "step": 15927, + "time_per_iteration": 2.5115678310394287 + }, + { + "auxiliary_loss_clip": 0.01102754, + "auxiliary_loss_mlp": 0.01032707, + "balance_loss_clip": 1.0196532, + "balance_loss_mlp": 1.03499341, + "epoch": 0.9576431684954156, + "flos": 30485250835200.0, + "grad_norm": 5.979240549227758, + "language_loss": 0.68711758, + "learning_rate": 1.8764392943639183e-08, + "loss": 0.70847225, + "num_input_tokens_seen": 343585290, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6796875, + "step": 15928, + "time_per_iteration": 4.005671739578247 + }, + { + "auxiliary_loss_clip": 0.01099738, + "auxiliary_loss_mlp": 0.01030764, + "balance_loss_clip": 1.01900387, + "balance_loss_mlp": 1.03447127, + "epoch": 0.9577032917480836, + "flos": 21687387909120.0, + "grad_norm": 1.5556334234006137, + "language_loss": 0.81790125, + "learning_rate": 1.871120608822485e-08, + "loss": 0.83920628, + "num_input_tokens_seen": 343604045, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 15929, + "time_per_iteration": 2.439286470413208 + }, + { + "auxiliary_loss_clip": 0.01103823, + "auxiliary_loss_mlp": 0.0103863, + "balance_loss_clip": 1.02678704, + "balance_loss_mlp": 1.03518653, + "epoch": 0.9577634150007516, + "flos": 29023147728000.0, + "grad_norm": 1.4266689760878288, + "language_loss": 0.72288859, + "learning_rate": 1.8658094363480202e-08, + "loss": 0.74431318, + "num_input_tokens_seen": 343626595, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6875, + "step": 15930, + "time_per_iteration": 3.916687250137329 + }, + { + "auxiliary_loss_clip": 0.01098084, + "auxiliary_loss_mlp": 0.01027954, + "balance_loss_clip": 1.01688528, + "balance_loss_mlp": 1.03421974, + "epoch": 0.9578235382534195, + "flos": 19282235178240.0, + "grad_norm": 1.3860575900403753, + "language_loss": 0.61940473, + "learning_rate": 1.8605057771419185e-08, + "loss": 0.64066511, + "num_input_tokens_seen": 343646195, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 15931, + "time_per_iteration": 3.9892077445983887 + }, + { + "auxiliary_loss_clip": 0.01097363, + "auxiliary_loss_mlp": 0.01028291, + "balance_loss_clip": 1.0176214, + "balance_loss_mlp": 1.03452408, + "epoch": 0.9578836615060875, + "flos": 13699275235200.0, + "grad_norm": 1.6941605420085752, + "language_loss": 0.68982953, + "learning_rate": 1.8552096314052633e-08, + "loss": 0.71108609, + "num_input_tokens_seen": 343663665, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.625, + "step": 15932, + "time_per_iteration": 2.398474931716919 + }, + { + "auxiliary_loss_clip": 0.01103128, + "auxiliary_loss_mlp": 0.0103345, + "balance_loss_clip": 1.02058697, + "balance_loss_mlp": 1.03450584, + "epoch": 0.9579437847587554, + "flos": 17054516655360.0, + "grad_norm": 2.9202077613156447, + "language_loss": 0.75383151, + "learning_rate": 1.849920999338961e-08, + "loss": 0.77519727, + "num_input_tokens_seen": 343682145, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6875, + "step": 15933, + "time_per_iteration": 2.4255053997039795 + }, + { + "auxiliary_loss_clip": 0.01021552, + "auxiliary_loss_mlp": 0.01001482, + "balance_loss_clip": 1.0005517, + "balance_loss_mlp": 1.00157118, + "epoch": 0.9580039080114234, + "flos": 60570887886720.0, + "grad_norm": 0.7217589216028398, + "language_loss": 0.57281023, + "learning_rate": 1.8446398811434948e-08, + "loss": 0.59304059, + "num_input_tokens_seen": 343744685, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20019531, + "step": 15934, + "time_per_iteration": 3.1378121376037598 + }, + { + "auxiliary_loss_clip": 0.01021591, + "auxiliary_loss_mlp": 0.01003298, + "balance_loss_clip": 1.00234401, + "balance_loss_mlp": 1.00169897, + "epoch": 0.9580640312640913, + "flos": 66235365745920.0, + "grad_norm": 0.913712526229365, + "language_loss": 0.65969813, + "learning_rate": 1.8393662770191277e-08, + "loss": 0.67994696, + "num_input_tokens_seen": 343801835, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 15935, + "time_per_iteration": 3.0164973735809326 + }, + { + "auxiliary_loss_clip": 0.01021566, + "auxiliary_loss_mlp": 0.00997813, + "balance_loss_clip": 0.99687093, + "balance_loss_mlp": 1.00159645, + "epoch": 0.9581241545167594, + "flos": 62218002971520.0, + "grad_norm": 0.7963829283211799, + "language_loss": 0.57069677, + "learning_rate": 1.8341001871658546e-08, + "loss": 0.59089053, + "num_input_tokens_seen": 343861515, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 15936, + "time_per_iteration": 4.516096115112305 + }, + { + "auxiliary_loss_clip": 0.0110158, + "auxiliary_loss_mlp": 0.01029217, + "balance_loss_clip": 1.01734924, + "balance_loss_mlp": 1.03508747, + "epoch": 0.9581842777694273, + "flos": 23768088065280.0, + "grad_norm": 1.7549841016566206, + "language_loss": 0.78426778, + "learning_rate": 1.8288416117833825e-08, + "loss": 0.80557573, + "num_input_tokens_seen": 343881240, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6640625, + "step": 15937, + "time_per_iteration": 2.54921555519104 + }, + { + "auxiliary_loss_clip": 0.01100478, + "auxiliary_loss_mlp": 0.0103017, + "balance_loss_clip": 1.01807022, + "balance_loss_mlp": 1.03413606, + "epoch": 0.9582444010220953, + "flos": 21213079793280.0, + "grad_norm": 2.664120819072444, + "language_loss": 0.68353987, + "learning_rate": 1.8235905510710636e-08, + "loss": 0.70484626, + "num_input_tokens_seen": 343900885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 15938, + "time_per_iteration": 2.568049907684326 + }, + { + "auxiliary_loss_clip": 0.01099803, + "auxiliary_loss_mlp": 0.01027851, + "balance_loss_clip": 1.01622176, + "balance_loss_mlp": 1.03337634, + "epoch": 0.9583045242747633, + "flos": 23805147922560.0, + "grad_norm": 2.516589048128792, + "language_loss": 0.65331376, + "learning_rate": 1.8183470052280712e-08, + "loss": 0.67459035, + "num_input_tokens_seen": 343918460, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 15939, + "time_per_iteration": 2.4567224979400635 + }, + { + "auxiliary_loss_clip": 0.01097061, + "auxiliary_loss_mlp": 0.0103265, + "balance_loss_clip": 1.02127695, + "balance_loss_mlp": 1.03251374, + "epoch": 0.9583646475274312, + "flos": 24131468004480.0, + "grad_norm": 3.037055398213982, + "language_loss": 0.73640996, + "learning_rate": 1.8131109744532025e-08, + "loss": 0.75770706, + "num_input_tokens_seen": 343938030, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.64453125, + "step": 15940, + "time_per_iteration": 2.5022995471954346 + }, + { + "auxiliary_loss_clip": 0.0110072, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01716161, + "balance_loss_mlp": 1.03483605, + "epoch": 0.9584247707800992, + "flos": 20886651970560.0, + "grad_norm": 1.6010591427701613, + "language_loss": 0.73068857, + "learning_rate": 1.8078824589450535e-08, + "loss": 0.75199318, + "num_input_tokens_seen": 343956635, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.65625, + "step": 15941, + "time_per_iteration": 2.4811065196990967 + }, + { + "auxiliary_loss_clip": 0.01099466, + "auxiliary_loss_mlp": 0.01034168, + "balance_loss_clip": 1.02316487, + "balance_loss_mlp": 1.03476393, + "epoch": 0.9584848940327672, + "flos": 26067591918720.0, + "grad_norm": 1.4889591298763103, + "language_loss": 0.71140969, + "learning_rate": 1.8026614589018442e-08, + "loss": 0.73274601, + "num_input_tokens_seen": 343976625, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 15942, + "time_per_iteration": 2.488670587539673 + }, + { + "auxiliary_loss_clip": 0.01100639, + "auxiliary_loss_mlp": 0.01030485, + "balance_loss_clip": 1.01804519, + "balance_loss_mlp": 1.03398347, + "epoch": 0.9585450172854352, + "flos": 34492988764800.0, + "grad_norm": 1.6261005015311867, + "language_loss": 0.71908909, + "learning_rate": 1.797447974521571e-08, + "loss": 0.74040031, + "num_input_tokens_seen": 343997790, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.6640625, + "step": 15943, + "time_per_iteration": 2.564479112625122 + }, + { + "auxiliary_loss_clip": 0.01102364, + "auxiliary_loss_mlp": 0.01034948, + "balance_loss_clip": 1.02263975, + "balance_loss_mlp": 1.03473973, + "epoch": 0.9586051405381031, + "flos": 23110743219840.0, + "grad_norm": 1.7040542813640263, + "language_loss": 0.6800124, + "learning_rate": 1.792242006001965e-08, + "loss": 0.7013855, + "num_input_tokens_seen": 344016935, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 15944, + "time_per_iteration": 2.5587704181671143 + }, + { + "auxiliary_loss_clip": 0.01100009, + "auxiliary_loss_mlp": 0.01033516, + "balance_loss_clip": 1.02160716, + "balance_loss_mlp": 1.0336163, + "epoch": 0.9586652637907711, + "flos": 19603994232960.0, + "grad_norm": 2.087826009089437, + "language_loss": 0.65862542, + "learning_rate": 1.7870435535403795e-08, + "loss": 0.67996073, + "num_input_tokens_seen": 344035590, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 15945, + "time_per_iteration": 2.445241689682007 + }, + { + "auxiliary_loss_clip": 0.01021444, + "auxiliary_loss_mlp": 0.01001575, + "balance_loss_clip": 1.00056767, + "balance_loss_mlp": 1.00148821, + "epoch": 0.958725387043439, + "flos": 72073327317120.0, + "grad_norm": 0.7415668424690911, + "language_loss": 0.61897564, + "learning_rate": 1.7818526173339678e-08, + "loss": 0.63920581, + "num_input_tokens_seen": 344100845, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 15946, + "time_per_iteration": 3.135841131210327 + }, + { + "auxiliary_loss_clip": 0.01096719, + "auxiliary_loss_mlp": 0.01027382, + "balance_loss_clip": 1.01627207, + "balance_loss_mlp": 1.03327739, + "epoch": 0.958785510296107, + "flos": 28911932242560.0, + "grad_norm": 1.7761369112332144, + "language_loss": 0.75568569, + "learning_rate": 1.7766691975795723e-08, + "loss": 0.7769267, + "num_input_tokens_seen": 344121780, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 15947, + "time_per_iteration": 2.4884331226348877 + }, + { + "auxiliary_loss_clip": 0.01097515, + "auxiliary_loss_mlp": 0.01025857, + "balance_loss_clip": 1.01469898, + "balance_loss_mlp": 1.03267527, + "epoch": 0.958845633548775, + "flos": 18477189607680.0, + "grad_norm": 2.398089295673863, + "language_loss": 0.70082307, + "learning_rate": 1.771493294473747e-08, + "loss": 0.72205675, + "num_input_tokens_seen": 344140150, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 15948, + "time_per_iteration": 2.450761556625366 + }, + { + "auxiliary_loss_clip": 0.01096726, + "auxiliary_loss_mlp": 0.01027812, + "balance_loss_clip": 1.01652312, + "balance_loss_mlp": 1.03256166, + "epoch": 0.958905756801443, + "flos": 24206916522240.0, + "grad_norm": 1.8775052153716447, + "language_loss": 0.78941453, + "learning_rate": 1.7663249082127574e-08, + "loss": 0.81065995, + "num_input_tokens_seen": 344158200, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 15949, + "time_per_iteration": 2.453991413116455 + }, + { + "auxiliary_loss_clip": 0.01101471, + "auxiliary_loss_mlp": 0.01033531, + "balance_loss_clip": 1.02134788, + "balance_loss_mlp": 1.03560996, + "epoch": 0.9589658800541109, + "flos": 25007939769600.0, + "grad_norm": 1.732596967369498, + "language_loss": 0.68670601, + "learning_rate": 1.761164038992602e-08, + "loss": 0.70805597, + "num_input_tokens_seen": 344174720, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 15950, + "time_per_iteration": 2.4904561042785645 + }, + { + "auxiliary_loss_clip": 0.01100004, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01883698, + "balance_loss_mlp": 1.0342288, + "epoch": 0.9590260033067789, + "flos": 23514558894720.0, + "grad_norm": 1.820609185891462, + "language_loss": 0.86225641, + "learning_rate": 1.7560106870089687e-08, + "loss": 0.88355601, + "num_input_tokens_seen": 344192580, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 15951, + "time_per_iteration": 2.4510254859924316 + }, + { + "auxiliary_loss_clip": 0.01103854, + "auxiliary_loss_mlp": 0.0103563, + "balance_loss_clip": 1.02368557, + "balance_loss_mlp": 1.03520882, + "epoch": 0.9590861265594469, + "flos": 25520349237120.0, + "grad_norm": 2.084854322647747, + "language_loss": 0.7963227, + "learning_rate": 1.7508648524572568e-08, + "loss": 0.81771755, + "num_input_tokens_seen": 344210345, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 15952, + "time_per_iteration": 2.5137405395507812 + }, + { + "auxiliary_loss_clip": 0.01100763, + "auxiliary_loss_mlp": 0.0102934, + "balance_loss_clip": 1.0173949, + "balance_loss_mlp": 1.03468966, + "epoch": 0.9591462498121148, + "flos": 21179323987200.0, + "grad_norm": 1.719452431467091, + "language_loss": 0.69882435, + "learning_rate": 1.7457265355326434e-08, + "loss": 0.72012538, + "num_input_tokens_seen": 344229540, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 15953, + "time_per_iteration": 2.469686985015869 + }, + { + "auxiliary_loss_clip": 0.01102054, + "auxiliary_loss_mlp": 0.01027827, + "balance_loss_clip": 1.01587033, + "balance_loss_mlp": 1.03441012, + "epoch": 0.9592063730647828, + "flos": 21723047136000.0, + "grad_norm": 2.317829736689624, + "language_loss": 0.57854062, + "learning_rate": 1.7405957364299285e-08, + "loss": 0.59983945, + "num_input_tokens_seen": 344247830, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6796875, + "step": 15954, + "time_per_iteration": 2.4689619541168213 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.01031747, + "balance_loss_clip": 1.01876557, + "balance_loss_mlp": 1.03452051, + "epoch": 0.9592664963174508, + "flos": 29891395278720.0, + "grad_norm": 1.965848373822375, + "language_loss": 0.74012095, + "learning_rate": 1.7354724553437117e-08, + "loss": 0.76145911, + "num_input_tokens_seen": 344267760, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.671875, + "step": 15955, + "time_per_iteration": 2.561746120452881 + }, + { + "auxiliary_loss_clip": 0.01099826, + "auxiliary_loss_mlp": 0.01033323, + "balance_loss_clip": 1.02136004, + "balance_loss_mlp": 1.03310394, + "epoch": 0.9593266195701188, + "flos": 17999613354240.0, + "grad_norm": 2.1926585969680796, + "language_loss": 0.62872529, + "learning_rate": 1.7303566924682378e-08, + "loss": 0.65005678, + "num_input_tokens_seen": 344284905, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.66796875, + "step": 15956, + "time_per_iteration": 2.5066936016082764 + }, + { + "auxiliary_loss_clip": 0.0110062, + "auxiliary_loss_mlp": 0.01032107, + "balance_loss_clip": 1.01954842, + "balance_loss_mlp": 1.03487992, + "epoch": 0.9593867428227867, + "flos": 18838271076480.0, + "grad_norm": 1.8584287929461432, + "language_loss": 0.59779477, + "learning_rate": 1.725248447997507e-08, + "loss": 0.61912203, + "num_input_tokens_seen": 344302025, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65625, + "step": 15957, + "time_per_iteration": 2.4974136352539062 + }, + { + "auxiliary_loss_clip": 0.01099795, + "auxiliary_loss_mlp": 0.0103612, + "balance_loss_clip": 1.02395439, + "balance_loss_mlp": 1.03408015, + "epoch": 0.9594468660754547, + "flos": 29567050444800.0, + "grad_norm": 1.931337896250092, + "language_loss": 0.74394608, + "learning_rate": 1.7201477221252314e-08, + "loss": 0.76530516, + "num_input_tokens_seen": 344321935, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 15958, + "time_per_iteration": 2.7763772010803223 + }, + { + "auxiliary_loss_clip": 0.01098821, + "auxiliary_loss_mlp": 0.01026509, + "balance_loss_clip": 1.01513004, + "balance_loss_mlp": 1.03337789, + "epoch": 0.9595069893281226, + "flos": 20703256104960.0, + "grad_norm": 1.470209010768804, + "language_loss": 0.74736482, + "learning_rate": 1.7150545150448116e-08, + "loss": 0.76861811, + "num_input_tokens_seen": 344340405, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 15959, + "time_per_iteration": 2.5138134956359863 + }, + { + "auxiliary_loss_clip": 0.01101982, + "auxiliary_loss_mlp": 0.0102754, + "balance_loss_clip": 1.01552415, + "balance_loss_mlp": 1.03473663, + "epoch": 0.9595671125807906, + "flos": 22453613856000.0, + "grad_norm": 1.910701656384312, + "language_loss": 0.64995688, + "learning_rate": 1.7099688269493816e-08, + "loss": 0.67125207, + "num_input_tokens_seen": 344359925, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 15960, + "time_per_iteration": 2.5808286666870117 + }, + { + "auxiliary_loss_clip": 0.01096302, + "auxiliary_loss_mlp": 0.01031218, + "balance_loss_clip": 1.01920176, + "balance_loss_mlp": 1.03344536, + "epoch": 0.9596272358334585, + "flos": 23915214172800.0, + "grad_norm": 1.850697413966108, + "language_loss": 0.77640712, + "learning_rate": 1.7048906580318544e-08, + "loss": 0.79768229, + "num_input_tokens_seen": 344379100, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.62890625, + "step": 15961, + "time_per_iteration": 2.6080851554870605 + }, + { + "auxiliary_loss_clip": 0.01097563, + "auxiliary_loss_mlp": 0.01027144, + "balance_loss_clip": 1.01583743, + "balance_loss_mlp": 1.03365684, + "epoch": 0.9596873590861266, + "flos": 17672539086720.0, + "grad_norm": 1.7282197835839996, + "language_loss": 0.76134586, + "learning_rate": 1.699820008484698e-08, + "loss": 0.78259289, + "num_input_tokens_seen": 344396895, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.640625, + "step": 15962, + "time_per_iteration": 2.4804084300994873 + }, + { + "auxiliary_loss_clip": 0.01101283, + "auxiliary_loss_mlp": 0.01030369, + "balance_loss_clip": 1.01823974, + "balance_loss_mlp": 1.03411567, + "epoch": 0.9597474823387945, + "flos": 25808532053760.0, + "grad_norm": 2.084352656854831, + "language_loss": 0.72044748, + "learning_rate": 1.6947568785002698e-08, + "loss": 0.74176401, + "num_input_tokens_seen": 344415115, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 15963, + "time_per_iteration": 2.614706516265869 + }, + { + "auxiliary_loss_clip": 0.01097072, + "auxiliary_loss_mlp": 0.01030523, + "balance_loss_clip": 1.01976991, + "balance_loss_mlp": 1.03555036, + "epoch": 0.9598076055914625, + "flos": 23768519028480.0, + "grad_norm": 1.5422937274732758, + "language_loss": 0.74315596, + "learning_rate": 1.689701268270527e-08, + "loss": 0.76443183, + "num_input_tokens_seen": 344435185, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 15964, + "time_per_iteration": 2.5884809494018555 + }, + { + "auxiliary_loss_clip": 0.01021677, + "auxiliary_loss_mlp": 0.01000233, + "balance_loss_clip": 0.99927884, + "balance_loss_mlp": 1.00162196, + "epoch": 0.9598677288441305, + "flos": 56515962464640.0, + "grad_norm": 0.87740764751359, + "language_loss": 0.57558799, + "learning_rate": 1.684653177987161e-08, + "loss": 0.59580708, + "num_input_tokens_seen": 344488950, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 15965, + "time_per_iteration": 3.032865047454834 + }, + { + "auxiliary_loss_clip": 0.0109898, + "auxiliary_loss_mlp": 0.010295, + "balance_loss_clip": 1.01854491, + "balance_loss_mlp": 1.03277349, + "epoch": 0.9599278520967984, + "flos": 22997480659200.0, + "grad_norm": 4.347845014421521, + "language_loss": 0.78900796, + "learning_rate": 1.6796126078416627e-08, + "loss": 0.81029272, + "num_input_tokens_seen": 344506740, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 15966, + "time_per_iteration": 2.4715421199798584 + }, + { + "auxiliary_loss_clip": 0.01095286, + "auxiliary_loss_mlp": 0.01027389, + "balance_loss_clip": 1.01574206, + "balance_loss_mlp": 1.0313921, + "epoch": 0.9599879753494664, + "flos": 23039676161280.0, + "grad_norm": 2.0769488159919423, + "language_loss": 0.79388767, + "learning_rate": 1.674579558025102e-08, + "loss": 0.81511444, + "num_input_tokens_seen": 344526670, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 15967, + "time_per_iteration": 2.4780547618865967 + }, + { + "auxiliary_loss_clip": 0.01104282, + "auxiliary_loss_mlp": 0.01026202, + "balance_loss_clip": 1.01377392, + "balance_loss_mlp": 1.0364691, + "epoch": 0.9600480986021344, + "flos": 16392287560320.0, + "grad_norm": 2.049759384254396, + "language_loss": 0.8052963, + "learning_rate": 1.669554028728348e-08, + "loss": 0.82660115, + "num_input_tokens_seen": 344541995, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 15968, + "time_per_iteration": 2.48009991645813 + }, + { + "auxiliary_loss_clip": 0.01103398, + "auxiliary_loss_mlp": 0.01036768, + "balance_loss_clip": 1.02361894, + "balance_loss_mlp": 1.03506923, + "epoch": 0.9601082218548024, + "flos": 24276439296000.0, + "grad_norm": 2.7147848897637794, + "language_loss": 0.67841053, + "learning_rate": 1.6645360201420044e-08, + "loss": 0.69981217, + "num_input_tokens_seen": 344559980, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.68359375, + "step": 15969, + "time_per_iteration": 2.48237681388855 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.01036471, + "balance_loss_clip": 1.02568293, + "balance_loss_mlp": 1.03579783, + "epoch": 0.9601683451074703, + "flos": 19609991804160.0, + "grad_norm": 2.6281665249354553, + "language_loss": 0.79528141, + "learning_rate": 1.6595255324563186e-08, + "loss": 0.81664926, + "num_input_tokens_seen": 344577765, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.64453125, + "step": 15970, + "time_per_iteration": 3.8477537631988525 + }, + { + "auxiliary_loss_clip": 0.01097507, + "auxiliary_loss_mlp": 0.01032057, + "balance_loss_clip": 1.02026701, + "balance_loss_mlp": 1.03499389, + "epoch": 0.9602284683601383, + "flos": 26651104358400.0, + "grad_norm": 1.5295204507537015, + "language_loss": 0.77275121, + "learning_rate": 1.654522565861316e-08, + "loss": 0.79404688, + "num_input_tokens_seen": 344597650, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.625, + "step": 15971, + "time_per_iteration": 2.501603841781616 + }, + { + "auxiliary_loss_clip": 0.01101775, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.01433444, + "balance_loss_mlp": 1.03340471, + "epoch": 0.9602885916128062, + "flos": 15554096714880.0, + "grad_norm": 2.0074410078313987, + "language_loss": 0.67119515, + "learning_rate": 1.64952712054669e-08, + "loss": 0.69247651, + "num_input_tokens_seen": 344613580, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6875, + "step": 15972, + "time_per_iteration": 3.8172669410705566 + }, + { + "auxiliary_loss_clip": 0.01098207, + "auxiliary_loss_mlp": 0.01024218, + "balance_loss_clip": 1.01264238, + "balance_loss_mlp": 1.03300917, + "epoch": 0.9603487148654742, + "flos": 16502353810560.0, + "grad_norm": 2.5640425645458174, + "language_loss": 0.76354134, + "learning_rate": 1.644539196701844e-08, + "loss": 0.7847656, + "num_input_tokens_seen": 344626910, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65234375, + "step": 15973, + "time_per_iteration": 3.7488813400268555 + }, + { + "auxiliary_loss_clip": 0.01101414, + "auxiliary_loss_mlp": 0.01037122, + "balance_loss_clip": 1.02525496, + "balance_loss_mlp": 1.03684473, + "epoch": 0.9604088381181421, + "flos": 20845354308480.0, + "grad_norm": 1.5915230941554284, + "language_loss": 0.69382858, + "learning_rate": 1.639558794515983e-08, + "loss": 0.71521389, + "num_input_tokens_seen": 344644330, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 15974, + "time_per_iteration": 2.459822177886963 + }, + { + "auxiliary_loss_clip": 0.01099172, + "auxiliary_loss_mlp": 0.01026699, + "balance_loss_clip": 1.01501679, + "balance_loss_mlp": 1.03258681, + "epoch": 0.9604689613708102, + "flos": 19683105937920.0, + "grad_norm": 1.6930712798967245, + "language_loss": 0.67391104, + "learning_rate": 1.6345859141779105e-08, + "loss": 0.69516981, + "num_input_tokens_seen": 344663910, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 15975, + "time_per_iteration": 2.4301302433013916 + }, + { + "auxiliary_loss_clip": 0.01096299, + "auxiliary_loss_mlp": 0.01028041, + "balance_loss_clip": 1.01684737, + "balance_loss_mlp": 1.03415847, + "epoch": 0.9605290846234781, + "flos": 24097568544000.0, + "grad_norm": 2.1951271711643554, + "language_loss": 0.55330515, + "learning_rate": 1.6296205558762322e-08, + "loss": 0.5745486, + "num_input_tokens_seen": 344682320, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.62109375, + "step": 15976, + "time_per_iteration": 2.530332326889038 + }, + { + "auxiliary_loss_clip": 0.01094425, + "auxiliary_loss_mlp": 0.01023604, + "balance_loss_clip": 1.01263642, + "balance_loss_mlp": 1.03126621, + "epoch": 0.9605892078761461, + "flos": 27122575299840.0, + "grad_norm": 2.1902101509492633, + "language_loss": 0.68605191, + "learning_rate": 1.624662719799219e-08, + "loss": 0.70723224, + "num_input_tokens_seen": 344701355, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 15977, + "time_per_iteration": 2.5073699951171875 + }, + { + "auxiliary_loss_clip": 0.01098235, + "auxiliary_loss_mlp": 0.01035581, + "balance_loss_clip": 1.0238564, + "balance_loss_mlp": 1.03291917, + "epoch": 0.9606493311288141, + "flos": 14136918543360.0, + "grad_norm": 2.8583973827450397, + "language_loss": 0.82103157, + "learning_rate": 1.6197124061348766e-08, + "loss": 0.84236974, + "num_input_tokens_seen": 344717980, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65234375, + "step": 15978, + "time_per_iteration": 3.9228808879852295 + }, + { + "auxiliary_loss_clip": 0.01103256, + "auxiliary_loss_mlp": 0.01029805, + "balance_loss_clip": 1.0179671, + "balance_loss_mlp": 1.03507805, + "epoch": 0.960709454381482, + "flos": 15813336147840.0, + "grad_norm": 2.21939382847535, + "language_loss": 0.83099633, + "learning_rate": 1.614769615070921e-08, + "loss": 0.85232687, + "num_input_tokens_seen": 344733480, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.68359375, + "step": 15979, + "time_per_iteration": 2.4425342082977295 + }, + { + "auxiliary_loss_clip": 0.01098986, + "auxiliary_loss_mlp": 0.01037557, + "balance_loss_clip": 1.0265305, + "balance_loss_mlp": 1.03295469, + "epoch": 0.96076957763415, + "flos": 22565403959040.0, + "grad_norm": 1.5535455683117823, + "language_loss": 0.80101836, + "learning_rate": 1.6098343467947805e-08, + "loss": 0.82238382, + "num_input_tokens_seen": 344752130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.66015625, + "step": 15980, + "time_per_iteration": 2.488734006881714 + }, + { + "auxiliary_loss_clip": 0.01100084, + "auxiliary_loss_mlp": 0.01029361, + "balance_loss_clip": 1.01745772, + "balance_loss_mlp": 1.03294373, + "epoch": 0.960829700886818, + "flos": 24681260551680.0, + "grad_norm": 1.929420179350021, + "language_loss": 0.68303668, + "learning_rate": 1.6049066014935942e-08, + "loss": 0.70433116, + "num_input_tokens_seen": 344771195, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15981, + "time_per_iteration": 2.5347800254821777 + }, + { + "auxiliary_loss_clip": 0.01097655, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.01319432, + "balance_loss_mlp": 1.03369415, + "epoch": 0.960889824139486, + "flos": 26542223256960.0, + "grad_norm": 1.3984693202200493, + "language_loss": 0.69509637, + "learning_rate": 1.5999863793542344e-08, + "loss": 0.71631587, + "num_input_tokens_seen": 344793150, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.640625, + "step": 15982, + "time_per_iteration": 2.553140163421631 + }, + { + "auxiliary_loss_clip": 0.01021661, + "auxiliary_loss_mlp": 0.00998004, + "balance_loss_clip": 0.99696726, + "balance_loss_mlp": 1.00166357, + "epoch": 0.9609499473921539, + "flos": 71114942586240.0, + "grad_norm": 0.6681934068781947, + "language_loss": 0.53323615, + "learning_rate": 1.595073680563286e-08, + "loss": 0.55343282, + "num_input_tokens_seen": 344852855, + "router_z_loss_clip": 0.01037598, + "router_z_loss_mlp": 0.20019531, + "step": 15983, + "time_per_iteration": 3.163548231124878 + }, + { + "auxiliary_loss_clip": 0.01098972, + "auxiliary_loss_mlp": 0.01036637, + "balance_loss_clip": 1.02478802, + "balance_loss_mlp": 1.03438175, + "epoch": 0.9610100706448219, + "flos": 20552466810240.0, + "grad_norm": 2.0538452317245204, + "language_loss": 0.6784721, + "learning_rate": 1.5901685053070212e-08, + "loss": 0.69982827, + "num_input_tokens_seen": 344869830, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6484375, + "step": 15984, + "time_per_iteration": 2.4459614753723145 + }, + { + "auxiliary_loss_clip": 0.01095462, + "auxiliary_loss_mlp": 0.01031702, + "balance_loss_clip": 1.02069306, + "balance_loss_mlp": 1.03341627, + "epoch": 0.9610701938974898, + "flos": 14064199459200.0, + "grad_norm": 1.6237233896189485, + "language_loss": 0.66909266, + "learning_rate": 1.5852708537714477e-08, + "loss": 0.6903643, + "num_input_tokens_seen": 344888905, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 15985, + "time_per_iteration": 2.4718329906463623 + }, + { + "auxiliary_loss_clip": 0.01100771, + "auxiliary_loss_mlp": 0.01027832, + "balance_loss_clip": 1.0163815, + "balance_loss_mlp": 1.03461182, + "epoch": 0.9611303171501578, + "flos": 20229989483520.0, + "grad_norm": 1.9043539282595943, + "language_loss": 0.78663325, + "learning_rate": 1.580380726142283e-08, + "loss": 0.80791926, + "num_input_tokens_seen": 344907160, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 15986, + "time_per_iteration": 2.5057144165039062 + }, + { + "auxiliary_loss_clip": 0.01100246, + "auxiliary_loss_mlp": 0.01029846, + "balance_loss_clip": 1.01730478, + "balance_loss_mlp": 1.0349791, + "epoch": 0.9611904404028258, + "flos": 20951075013120.0, + "grad_norm": 2.0139764735454984, + "language_loss": 0.63585907, + "learning_rate": 1.5754981226049792e-08, + "loss": 0.65716004, + "num_input_tokens_seen": 344922400, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.65234375, + "step": 15987, + "time_per_iteration": 2.484804630279541 + }, + { + "auxiliary_loss_clip": 0.01096451, + "auxiliary_loss_mlp": 0.01027283, + "balance_loss_clip": 1.01677477, + "balance_loss_mlp": 1.03409028, + "epoch": 0.9612505636554938, + "flos": 24827740214400.0, + "grad_norm": 1.6635144622564184, + "language_loss": 0.67184675, + "learning_rate": 1.5706230433446544e-08, + "loss": 0.69308412, + "num_input_tokens_seen": 344941910, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.62109375, + "step": 15988, + "time_per_iteration": 2.51582932472229 + }, + { + "auxiliary_loss_clip": 0.01098191, + "auxiliary_loss_mlp": 0.01039743, + "balance_loss_clip": 1.02879977, + "balance_loss_mlp": 1.033499, + "epoch": 0.9613106869081617, + "flos": 17164977955200.0, + "grad_norm": 2.0386399724410773, + "language_loss": 0.7444011, + "learning_rate": 1.5657554885462055e-08, + "loss": 0.76578045, + "num_input_tokens_seen": 344960020, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 15989, + "time_per_iteration": 2.4653844833374023 + }, + { + "auxiliary_loss_clip": 0.0102176, + "auxiliary_loss_mlp": 0.00998361, + "balance_loss_clip": 0.99734801, + "balance_loss_mlp": 1.00191987, + "epoch": 0.9613708101608297, + "flos": 61563818522880.0, + "grad_norm": 0.8244627726356378, + "language_loss": 0.63139147, + "learning_rate": 1.5608954583941737e-08, + "loss": 0.65159267, + "num_input_tokens_seen": 345018290, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19824219, + "step": 15990, + "time_per_iteration": 2.9341416358947754 + }, + { + "auxiliary_loss_clip": 0.01098606, + "auxiliary_loss_mlp": 0.01029602, + "balance_loss_clip": 1.01836634, + "balance_loss_mlp": 1.03330886, + "epoch": 0.9614309334134977, + "flos": 27417904922880.0, + "grad_norm": 1.7929580248033004, + "language_loss": 0.77747667, + "learning_rate": 1.5560429530729003e-08, + "loss": 0.79875869, + "num_input_tokens_seen": 345040235, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 15991, + "time_per_iteration": 2.5114362239837646 + }, + { + "auxiliary_loss_clip": 0.01104631, + "auxiliary_loss_mlp": 0.01031055, + "balance_loss_clip": 1.01856136, + "balance_loss_mlp": 1.03413033, + "epoch": 0.9614910566661656, + "flos": 22819148611200.0, + "grad_norm": 2.3248342677519145, + "language_loss": 0.84501588, + "learning_rate": 1.5511979727663493e-08, + "loss": 0.86637282, + "num_input_tokens_seen": 345054540, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 15992, + "time_per_iteration": 2.4357917308807373 + }, + { + "auxiliary_loss_clip": 0.01098966, + "auxiliary_loss_mlp": 0.01029993, + "balance_loss_clip": 1.01779771, + "balance_loss_mlp": 1.03286505, + "epoch": 0.9615511799188337, + "flos": 20667812359680.0, + "grad_norm": 2.0221041730312583, + "language_loss": 0.72067487, + "learning_rate": 1.5463605176582406e-08, + "loss": 0.7419644, + "num_input_tokens_seen": 345074035, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66015625, + "step": 15993, + "time_per_iteration": 2.458228349685669 + }, + { + "auxiliary_loss_clip": 0.01098416, + "auxiliary_loss_mlp": 0.01031599, + "balance_loss_clip": 1.0197798, + "balance_loss_mlp": 1.03211713, + "epoch": 0.9616113031715016, + "flos": 33149212035840.0, + "grad_norm": 1.425573612625333, + "language_loss": 0.68134975, + "learning_rate": 1.5415305879320716e-08, + "loss": 0.70264989, + "num_input_tokens_seen": 345099270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 15994, + "time_per_iteration": 2.574979543685913 + }, + { + "auxiliary_loss_clip": 0.01099525, + "auxiliary_loss_mlp": 0.01029522, + "balance_loss_clip": 1.01776767, + "balance_loss_mlp": 1.03487062, + "epoch": 0.9616714264241696, + "flos": 25009807276800.0, + "grad_norm": 1.8311870454132768, + "language_loss": 0.84529275, + "learning_rate": 1.5367081837709183e-08, + "loss": 0.86658323, + "num_input_tokens_seen": 345116975, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.64453125, + "step": 15995, + "time_per_iteration": 2.508324384689331 + }, + { + "auxiliary_loss_clip": 0.01101034, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.01981795, + "balance_loss_mlp": 1.03394556, + "epoch": 0.9617315496768375, + "flos": 13547480359680.0, + "grad_norm": 1.7535990840626554, + "language_loss": 0.75937271, + "learning_rate": 1.5318933053576788e-08, + "loss": 0.78070021, + "num_input_tokens_seen": 345133645, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.671875, + "step": 15996, + "time_per_iteration": 2.412166118621826 + }, + { + "auxiliary_loss_clip": 0.01097246, + "auxiliary_loss_mlp": 0.01029472, + "balance_loss_clip": 1.01758718, + "balance_loss_mlp": 1.03245521, + "epoch": 0.9617916729295055, + "flos": 11254512781440.0, + "grad_norm": 1.8443743753530013, + "language_loss": 0.76869327, + "learning_rate": 1.52708595287494e-08, + "loss": 0.78996044, + "num_input_tokens_seen": 345150740, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 15997, + "time_per_iteration": 2.427321434020996 + }, + { + "auxiliary_loss_clip": 0.0109466, + "auxiliary_loss_mlp": 0.01029654, + "balance_loss_clip": 1.01854956, + "balance_loss_mlp": 1.03262687, + "epoch": 0.9618517961821734, + "flos": 22819723228800.0, + "grad_norm": 1.5037392359064448, + "language_loss": 0.67111742, + "learning_rate": 1.522286126505001e-08, + "loss": 0.69236064, + "num_input_tokens_seen": 345170365, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.62109375, + "step": 15998, + "time_per_iteration": 2.440931797027588 + }, + { + "auxiliary_loss_clip": 0.01096743, + "auxiliary_loss_mlp": 0.0102854, + "balance_loss_clip": 1.01669025, + "balance_loss_mlp": 1.03206193, + "epoch": 0.9619119194348414, + "flos": 16617340224000.0, + "grad_norm": 1.6944994780105895, + "language_loss": 0.72642672, + "learning_rate": 1.5174938264298498e-08, + "loss": 0.74767953, + "num_input_tokens_seen": 345188930, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6484375, + "step": 15999, + "time_per_iteration": 2.439814329147339 + }, + { + "auxiliary_loss_clip": 0.0109533, + "auxiliary_loss_mlp": 0.01025115, + "balance_loss_clip": 1.01451135, + "balance_loss_mlp": 1.03301597, + "epoch": 0.9619720426875094, + "flos": 24535140024960.0, + "grad_norm": 1.856091141124966, + "language_loss": 0.65324283, + "learning_rate": 1.5127090528312514e-08, + "loss": 0.67444718, + "num_input_tokens_seen": 345209615, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.625, + "step": 16000, + "time_per_iteration": 2.460580587387085 + }, + { + "auxiliary_loss_clip": 0.01098363, + "auxiliary_loss_mlp": 0.01026136, + "balance_loss_clip": 1.01385164, + "balance_loss_mlp": 1.03337932, + "epoch": 0.9620321659401774, + "flos": 20632224960000.0, + "grad_norm": 2.318305441538345, + "language_loss": 0.75454199, + "learning_rate": 1.5079318058905723e-08, + "loss": 0.775787, + "num_input_tokens_seen": 345229175, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6484375, + "step": 16001, + "time_per_iteration": 2.437589645385742 + }, + { + "auxiliary_loss_clip": 0.01097856, + "auxiliary_loss_mlp": 0.01031715, + "balance_loss_clip": 1.0196507, + "balance_loss_mlp": 1.0329746, + "epoch": 0.9620922891928453, + "flos": 18515290959360.0, + "grad_norm": 1.501970694433986, + "language_loss": 0.68156397, + "learning_rate": 1.5031620857890447e-08, + "loss": 0.70285976, + "num_input_tokens_seen": 345247815, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 16002, + "time_per_iteration": 2.433119773864746 + }, + { + "auxiliary_loss_clip": 0.0109988, + "auxiliary_loss_mlp": 0.01027372, + "balance_loss_clip": 1.01573122, + "balance_loss_mlp": 1.03520513, + "epoch": 0.9621524124455133, + "flos": 28767391914240.0, + "grad_norm": 1.2603335889169271, + "language_loss": 0.64553183, + "learning_rate": 1.4983998927074804e-08, + "loss": 0.66680431, + "num_input_tokens_seen": 345269935, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6484375, + "step": 16003, + "time_per_iteration": 2.5247597694396973 + }, + { + "auxiliary_loss_clip": 0.01101995, + "auxiliary_loss_mlp": 0.01035834, + "balance_loss_clip": 1.02489662, + "balance_loss_mlp": 1.03565001, + "epoch": 0.9622125356981813, + "flos": 19098875226240.0, + "grad_norm": 1.7694321709596037, + "language_loss": 0.75896275, + "learning_rate": 1.493645226826512e-08, + "loss": 0.78034103, + "num_input_tokens_seen": 345288310, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6640625, + "step": 16004, + "time_per_iteration": 2.433055877685547 + }, + { + "auxiliary_loss_clip": 0.01098006, + "auxiliary_loss_mlp": 0.01028519, + "balance_loss_clip": 1.01662803, + "balance_loss_mlp": 1.03370786, + "epoch": 0.9622726589508492, + "flos": 20302816308480.0, + "grad_norm": 2.192137725260173, + "language_loss": 0.79381818, + "learning_rate": 1.4888980883263958e-08, + "loss": 0.81508344, + "num_input_tokens_seen": 345306615, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 16005, + "time_per_iteration": 2.4529757499694824 + }, + { + "auxiliary_loss_clip": 0.01095875, + "auxiliary_loss_mlp": 0.01027459, + "balance_loss_clip": 1.01653981, + "balance_loss_mlp": 1.03297102, + "epoch": 0.9623327822035173, + "flos": 54929750889600.0, + "grad_norm": 1.9248059922293024, + "language_loss": 0.67267632, + "learning_rate": 1.4841584773871652e-08, + "loss": 0.69390965, + "num_input_tokens_seen": 345331935, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 16006, + "time_per_iteration": 2.776263952255249 + }, + { + "auxiliary_loss_clip": 0.01094469, + "auxiliary_loss_mlp": 0.01031278, + "balance_loss_clip": 1.02053165, + "balance_loss_mlp": 1.03415585, + "epoch": 0.9623929054561852, + "flos": 21759029585280.0, + "grad_norm": 1.5940516998351955, + "language_loss": 0.78056121, + "learning_rate": 1.479426394188521e-08, + "loss": 0.80181879, + "num_input_tokens_seen": 345351510, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6015625, + "step": 16007, + "time_per_iteration": 2.47029972076416 + }, + { + "auxiliary_loss_clip": 0.01100629, + "auxiliary_loss_mlp": 0.0103215, + "balance_loss_clip": 1.02034807, + "balance_loss_mlp": 1.03482246, + "epoch": 0.9624530287088532, + "flos": 17931563038080.0, + "grad_norm": 2.0291702556230438, + "language_loss": 0.68004704, + "learning_rate": 1.4747018389099198e-08, + "loss": 0.70137483, + "num_input_tokens_seen": 345367750, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65625, + "step": 16008, + "time_per_iteration": 2.529724597930908 + }, + { + "auxiliary_loss_clip": 0.01102821, + "auxiliary_loss_mlp": 0.01032416, + "balance_loss_clip": 1.01991701, + "balance_loss_mlp": 1.03540087, + "epoch": 0.9625131519615211, + "flos": 23253739263360.0, + "grad_norm": 2.1805744652126657, + "language_loss": 0.72793615, + "learning_rate": 1.469984811730529e-08, + "loss": 0.74928856, + "num_input_tokens_seen": 345384790, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 16009, + "time_per_iteration": 2.4500856399536133 + }, + { + "auxiliary_loss_clip": 0.01097324, + "auxiliary_loss_mlp": 0.01031527, + "balance_loss_clip": 1.01997614, + "balance_loss_mlp": 1.03245699, + "epoch": 0.9625732752141891, + "flos": 18916628595840.0, + "grad_norm": 2.132459467969933, + "language_loss": 0.75247002, + "learning_rate": 1.4652753128292061e-08, + "loss": 0.77375853, + "num_input_tokens_seen": 345403390, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 16010, + "time_per_iteration": 2.4420454502105713 + }, + { + "auxiliary_loss_clip": 0.01105906, + "auxiliary_loss_mlp": 0.01033369, + "balance_loss_clip": 1.01913512, + "balance_loss_mlp": 1.03655696, + "epoch": 0.962633398466857, + "flos": 16252918790400.0, + "grad_norm": 1.8880874815114188, + "language_loss": 0.69513392, + "learning_rate": 1.4605733423845635e-08, + "loss": 0.71652675, + "num_input_tokens_seen": 345418685, + "router_z_loss_clip": 0.14257812, + "router_z_loss_mlp": 0.6953125, + "step": 16011, + "time_per_iteration": 2.466012954711914 + }, + { + "auxiliary_loss_clip": 0.01097648, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.01957953, + "balance_loss_mlp": 1.03402066, + "epoch": 0.962693521719525, + "flos": 54197424403200.0, + "grad_norm": 1.6867727595710786, + "language_loss": 0.68486851, + "learning_rate": 1.4558789005748585e-08, + "loss": 0.70614755, + "num_input_tokens_seen": 345442380, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.63671875, + "step": 16012, + "time_per_iteration": 4.115834474563599 + }, + { + "auxiliary_loss_clip": 0.01105856, + "auxiliary_loss_mlp": 0.01035194, + "balance_loss_clip": 1.02205706, + "balance_loss_mlp": 1.03603888, + "epoch": 0.962753644972193, + "flos": 33105795471360.0, + "grad_norm": 1.7968742145929515, + "language_loss": 0.7239725, + "learning_rate": 1.4511919875781264e-08, + "loss": 0.74538302, + "num_input_tokens_seen": 345463815, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.69921875, + "step": 16013, + "time_per_iteration": 2.5248327255249023 + }, + { + "auxiliary_loss_clip": 0.01098665, + "auxiliary_loss_mlp": 0.01030707, + "balance_loss_clip": 1.01866698, + "balance_loss_mlp": 1.03396904, + "epoch": 0.962813768224861, + "flos": 42230660837760.0, + "grad_norm": 2.303643784633636, + "language_loss": 0.63361096, + "learning_rate": 1.4465126035720698e-08, + "loss": 0.65490472, + "num_input_tokens_seen": 345484525, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6484375, + "step": 16014, + "time_per_iteration": 5.387859582901001 + }, + { + "auxiliary_loss_clip": 0.01094961, + "auxiliary_loss_mlp": 0.01026344, + "balance_loss_clip": 1.0163964, + "balance_loss_mlp": 1.03386617, + "epoch": 0.9628738914775289, + "flos": 43944677003520.0, + "grad_norm": 1.9022819757041962, + "language_loss": 0.71860576, + "learning_rate": 1.4418407487341688e-08, + "loss": 0.73981875, + "num_input_tokens_seen": 345508295, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.609375, + "step": 16015, + "time_per_iteration": 2.74052095413208 + }, + { + "auxiliary_loss_clip": 0.01097382, + "auxiliary_loss_mlp": 0.0102778, + "balance_loss_clip": 1.01629472, + "balance_loss_mlp": 1.03302288, + "epoch": 0.9629340147301969, + "flos": 15596184476160.0, + "grad_norm": 1.7223276387291737, + "language_loss": 0.77100927, + "learning_rate": 1.4371764232415707e-08, + "loss": 0.79226089, + "num_input_tokens_seen": 345525155, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.64453125, + "step": 16016, + "time_per_iteration": 2.5562946796417236 + }, + { + "auxiliary_loss_clip": 0.01021809, + "auxiliary_loss_mlp": 0.00998645, + "balance_loss_clip": 0.99765599, + "balance_loss_mlp": 1.00184894, + "epoch": 0.9629941379828649, + "flos": 62951011816320.0, + "grad_norm": 0.8230091006411403, + "language_loss": 0.6317451, + "learning_rate": 1.4325196272711337e-08, + "loss": 0.65194964, + "num_input_tokens_seen": 345578905, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 16017, + "time_per_iteration": 2.9330668449401855 + }, + { + "auxiliary_loss_clip": 0.01101685, + "auxiliary_loss_mlp": 0.01026728, + "balance_loss_clip": 1.01551044, + "balance_loss_mlp": 1.03531194, + "epoch": 0.9630542612355328, + "flos": 29899116702720.0, + "grad_norm": 2.7407998859576432, + "language_loss": 0.6571548, + "learning_rate": 1.4278703609994502e-08, + "loss": 0.67843896, + "num_input_tokens_seen": 345598965, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 16018, + "time_per_iteration": 2.4978015422821045 + }, + { + "auxiliary_loss_clip": 0.0109954, + "auxiliary_loss_mlp": 0.01031934, + "balance_loss_clip": 1.0205797, + "balance_loss_mlp": 1.03453994, + "epoch": 0.9631143844882009, + "flos": 17894575008000.0, + "grad_norm": 1.9221480896799248, + "language_loss": 0.79585052, + "learning_rate": 1.4232286246028457e-08, + "loss": 0.81716537, + "num_input_tokens_seen": 345617945, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16019, + "time_per_iteration": 3.83243465423584 + }, + { + "auxiliary_loss_clip": 0.01094234, + "auxiliary_loss_mlp": 0.0102779, + "balance_loss_clip": 1.01744246, + "balance_loss_mlp": 1.03089976, + "epoch": 0.9631745077408688, + "flos": 26139161767680.0, + "grad_norm": 1.4518833945438399, + "language_loss": 0.71567214, + "learning_rate": 1.4185944182572907e-08, + "loss": 0.73689234, + "num_input_tokens_seen": 345637920, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6328125, + "step": 16020, + "time_per_iteration": 2.4867608547210693 + }, + { + "auxiliary_loss_clip": 0.01099297, + "auxiliary_loss_mlp": 0.0102639, + "balance_loss_clip": 1.01586401, + "balance_loss_mlp": 1.03405941, + "epoch": 0.9632346309935368, + "flos": 24973645259520.0, + "grad_norm": 1.654544667826034, + "language_loss": 0.77078342, + "learning_rate": 1.4139677421385331e-08, + "loss": 0.79204035, + "num_input_tokens_seen": 345656195, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65234375, + "step": 16021, + "time_per_iteration": 2.45025372505188 + }, + { + "auxiliary_loss_clip": 0.01103509, + "auxiliary_loss_mlp": 0.01029839, + "balance_loss_clip": 1.01649964, + "balance_loss_mlp": 1.03461719, + "epoch": 0.9632947542462047, + "flos": 23617226943360.0, + "grad_norm": 1.9848746389791796, + "language_loss": 0.64672452, + "learning_rate": 1.4093485964220331e-08, + "loss": 0.66805798, + "num_input_tokens_seen": 345676700, + "router_z_loss_clip": 0.13378906, + "router_z_loss_mlp": 0.6875, + "step": 16022, + "time_per_iteration": 2.5040066242218018 + }, + { + "auxiliary_loss_clip": 0.01096934, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.01966798, + "balance_loss_mlp": 1.03360546, + "epoch": 0.9633548774988727, + "flos": 26395599939840.0, + "grad_norm": 2.0658365642461525, + "language_loss": 0.73443997, + "learning_rate": 1.4047369812829168e-08, + "loss": 0.7557137, + "num_input_tokens_seen": 345696725, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 16023, + "time_per_iteration": 2.4626638889312744 + }, + { + "auxiliary_loss_clip": 0.0109734, + "auxiliary_loss_mlp": 0.01026163, + "balance_loss_clip": 1.01538599, + "balance_loss_mlp": 1.03294301, + "epoch": 0.9634150007515406, + "flos": 23767728929280.0, + "grad_norm": 1.449705583303519, + "language_loss": 0.81280053, + "learning_rate": 1.4001328968960891e-08, + "loss": 0.83403552, + "num_input_tokens_seen": 345716245, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.64453125, + "step": 16024, + "time_per_iteration": 2.4662721157073975 + }, + { + "auxiliary_loss_clip": 0.01103249, + "auxiliary_loss_mlp": 0.01031952, + "balance_loss_clip": 1.01964998, + "balance_loss_mlp": 1.03470325, + "epoch": 0.9634751240042086, + "flos": 24135346673280.0, + "grad_norm": 1.3425470745031889, + "language_loss": 0.81449908, + "learning_rate": 1.3955363434361212e-08, + "loss": 0.83585107, + "num_input_tokens_seen": 345739060, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 16025, + "time_per_iteration": 2.571988105773926 + }, + { + "auxiliary_loss_clip": 0.01101207, + "auxiliary_loss_mlp": 0.01030748, + "balance_loss_clip": 1.01897025, + "balance_loss_mlp": 1.03413701, + "epoch": 0.9635352472568766, + "flos": 24349086552960.0, + "grad_norm": 1.8159073378750998, + "language_loss": 0.76695681, + "learning_rate": 1.3909473210773181e-08, + "loss": 0.78827643, + "num_input_tokens_seen": 345758325, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 16026, + "time_per_iteration": 2.5653977394104004 + }, + { + "auxiliary_loss_clip": 0.01100402, + "auxiliary_loss_mlp": 0.01030366, + "balance_loss_clip": 1.01804018, + "balance_loss_mlp": 1.03367102, + "epoch": 0.9635953705095446, + "flos": 23984772860160.0, + "grad_norm": 1.6861028464709051, + "language_loss": 0.63083422, + "learning_rate": 1.3863658299936965e-08, + "loss": 0.65214193, + "num_input_tokens_seen": 345778530, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.66796875, + "step": 16027, + "time_per_iteration": 2.544005870819092 + }, + { + "auxiliary_loss_clip": 0.01103438, + "auxiliary_loss_mlp": 0.0102768, + "balance_loss_clip": 1.0156163, + "balance_loss_mlp": 1.03664851, + "epoch": 0.9636554937622125, + "flos": 19828436365440.0, + "grad_norm": 2.0164451231663882, + "language_loss": 0.87208748, + "learning_rate": 1.3817918703589837e-08, + "loss": 0.89339876, + "num_input_tokens_seen": 345796535, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 16028, + "time_per_iteration": 2.5071027278900146 + }, + { + "auxiliary_loss_clip": 0.010217, + "auxiliary_loss_mlp": 0.00999046, + "balance_loss_clip": 0.99811667, + "balance_loss_mlp": 1.00170708, + "epoch": 0.9637156170148805, + "flos": 67435499986560.0, + "grad_norm": 0.7421749318599844, + "language_loss": 0.53201663, + "learning_rate": 1.3772254423466412e-08, + "loss": 0.5522241, + "num_input_tokens_seen": 345859700, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.19921875, + "step": 16029, + "time_per_iteration": 3.038540840148926 + }, + { + "auxiliary_loss_clip": 0.01101, + "auxiliary_loss_mlp": 0.01027698, + "balance_loss_clip": 1.01630187, + "balance_loss_mlp": 1.03434622, + "epoch": 0.9637757402675484, + "flos": 20300912887680.0, + "grad_norm": 1.587969426883562, + "language_loss": 0.73793781, + "learning_rate": 1.372666546129797e-08, + "loss": 0.75922477, + "num_input_tokens_seen": 345878760, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66796875, + "step": 16030, + "time_per_iteration": 2.512209892272949 + }, + { + "auxiliary_loss_clip": 0.01096772, + "auxiliary_loss_mlp": 0.01027423, + "balance_loss_clip": 1.0165571, + "balance_loss_mlp": 1.03376997, + "epoch": 0.9638358635202164, + "flos": 27234544970880.0, + "grad_norm": 2.091542472961613, + "language_loss": 0.66038525, + "learning_rate": 1.3681151818813575e-08, + "loss": 0.68162721, + "num_input_tokens_seen": 345900445, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 16031, + "time_per_iteration": 2.6668994426727295 + }, + { + "auxiliary_loss_clip": 0.01021545, + "auxiliary_loss_mlp": 0.01000717, + "balance_loss_clip": 0.99978131, + "balance_loss_mlp": 1.001513, + "epoch": 0.9638959867728845, + "flos": 70288998278400.0, + "grad_norm": 0.8415977039530823, + "language_loss": 0.60769111, + "learning_rate": 1.3635713497738955e-08, + "loss": 0.62791371, + "num_input_tokens_seen": 345961020, + "router_z_loss_clip": 0.00933838, + "router_z_loss_mlp": 0.20117188, + "step": 16032, + "time_per_iteration": 3.1774539947509766 + }, + { + "auxiliary_loss_clip": 0.01092096, + "auxiliary_loss_mlp": 0.01029981, + "balance_loss_clip": 1.01961017, + "balance_loss_mlp": 1.03172016, + "epoch": 0.9639561100255524, + "flos": 25407517639680.0, + "grad_norm": 2.336742509809211, + "language_loss": 0.66448474, + "learning_rate": 1.3590350499796954e-08, + "loss": 0.68570554, + "num_input_tokens_seen": 345980210, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.60546875, + "step": 16033, + "time_per_iteration": 2.5393478870391846 + }, + { + "auxiliary_loss_clip": 0.01099204, + "auxiliary_loss_mlp": 0.01029271, + "balance_loss_clip": 1.01792789, + "balance_loss_mlp": 1.0350368, + "epoch": 0.9640162332782204, + "flos": 18113881495680.0, + "grad_norm": 1.8687233450707268, + "language_loss": 0.6541754, + "learning_rate": 1.3545062826707976e-08, + "loss": 0.6754601, + "num_input_tokens_seen": 345998280, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 16034, + "time_per_iteration": 2.5119800567626953 + }, + { + "auxiliary_loss_clip": 0.01100794, + "auxiliary_loss_mlp": 0.01027753, + "balance_loss_clip": 1.01654732, + "balance_loss_mlp": 1.03560579, + "epoch": 0.9640763565308883, + "flos": 23440295525760.0, + "grad_norm": 2.2819152294755765, + "language_loss": 0.7378726, + "learning_rate": 1.3499850480189313e-08, + "loss": 0.75915802, + "num_input_tokens_seen": 346015545, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16035, + "time_per_iteration": 2.514049530029297 + }, + { + "auxiliary_loss_clip": 0.01102242, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01587522, + "balance_loss_mlp": 1.03745866, + "epoch": 0.9641364797835563, + "flos": 22419355259520.0, + "grad_norm": 1.9666432901090276, + "language_loss": 0.82240516, + "learning_rate": 1.3454713461955591e-08, + "loss": 0.84370238, + "num_input_tokens_seen": 346034055, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6484375, + "step": 16036, + "time_per_iteration": 2.470296859741211 + }, + { + "auxiliary_loss_clip": 0.01097949, + "auxiliary_loss_mlp": 0.01029518, + "balance_loss_clip": 1.01781738, + "balance_loss_mlp": 1.03251529, + "epoch": 0.9641966030362242, + "flos": 30622357048320.0, + "grad_norm": 1.8384642674416498, + "language_loss": 0.69920629, + "learning_rate": 1.340965177371789e-08, + "loss": 0.72048092, + "num_input_tokens_seen": 346054130, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 16037, + "time_per_iteration": 2.5022101402282715 + }, + { + "auxiliary_loss_clip": 0.01098879, + "auxiliary_loss_mlp": 0.01024612, + "balance_loss_clip": 1.01337099, + "balance_loss_mlp": 1.03303576, + "epoch": 0.9642567262888923, + "flos": 20953122088320.0, + "grad_norm": 1.8647442017039988, + "language_loss": 0.63255847, + "learning_rate": 1.3364665417185506e-08, + "loss": 0.65379345, + "num_input_tokens_seen": 346072990, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 16038, + "time_per_iteration": 2.4083030223846436 + }, + { + "auxiliary_loss_clip": 0.01101312, + "auxiliary_loss_mlp": 0.01031981, + "balance_loss_clip": 1.02006602, + "balance_loss_mlp": 1.03394938, + "epoch": 0.9643168495415602, + "flos": 22639415932800.0, + "grad_norm": 4.3592959082323715, + "language_loss": 0.70973301, + "learning_rate": 1.3319754394064187e-08, + "loss": 0.73106587, + "num_input_tokens_seen": 346093745, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.67578125, + "step": 16039, + "time_per_iteration": 2.4552149772644043 + }, + { + "auxiliary_loss_clip": 0.01099532, + "auxiliary_loss_mlp": 0.01027491, + "balance_loss_clip": 1.01546872, + "balance_loss_mlp": 1.03366244, + "epoch": 0.9643769727942282, + "flos": 20266259241600.0, + "grad_norm": 1.9258933079611011, + "language_loss": 0.72986352, + "learning_rate": 1.327491870605657e-08, + "loss": 0.75113374, + "num_input_tokens_seen": 346110115, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 16040, + "time_per_iteration": 2.485323190689087 + }, + { + "auxiliary_loss_clip": 0.01100883, + "auxiliary_loss_mlp": 0.0103019, + "balance_loss_clip": 1.01842415, + "balance_loss_mlp": 1.03421116, + "epoch": 0.9644370960468961, + "flos": 13881845088000.0, + "grad_norm": 1.8131495617267763, + "language_loss": 0.73091221, + "learning_rate": 1.3230158354863296e-08, + "loss": 0.75222296, + "num_input_tokens_seen": 346127165, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 16041, + "time_per_iteration": 2.388808488845825 + }, + { + "auxiliary_loss_clip": 0.01094729, + "auxiliary_loss_mlp": 0.01026692, + "balance_loss_clip": 1.01594496, + "balance_loss_mlp": 1.03319907, + "epoch": 0.9644972192995641, + "flos": 17238199829760.0, + "grad_norm": 1.7600846480855517, + "language_loss": 0.71910304, + "learning_rate": 1.3185473342181674e-08, + "loss": 0.74031723, + "num_input_tokens_seen": 346145950, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 16042, + "time_per_iteration": 2.4807844161987305 + }, + { + "auxiliary_loss_clip": 0.01101997, + "auxiliary_loss_mlp": 0.01028281, + "balance_loss_clip": 1.01721215, + "balance_loss_mlp": 1.03423679, + "epoch": 0.964557342552232, + "flos": 23840340272640.0, + "grad_norm": 1.86868776275858, + "language_loss": 0.80611408, + "learning_rate": 1.3140863669705683e-08, + "loss": 0.82741684, + "num_input_tokens_seen": 346165005, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6796875, + "step": 16043, + "time_per_iteration": 2.4247870445251465 + }, + { + "auxiliary_loss_clip": 0.01098965, + "auxiliary_loss_mlp": 0.01027122, + "balance_loss_clip": 1.01601148, + "balance_loss_mlp": 1.03540707, + "epoch": 0.9646174658049, + "flos": 21653129312640.0, + "grad_norm": 1.4757233148834483, + "language_loss": 0.71590781, + "learning_rate": 1.3096329339127522e-08, + "loss": 0.73716873, + "num_input_tokens_seen": 346185095, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16044, + "time_per_iteration": 2.4749693870544434 + }, + { + "auxiliary_loss_clip": 0.01096636, + "auxiliary_loss_mlp": 0.01027099, + "balance_loss_clip": 1.01526165, + "balance_loss_mlp": 1.0332067, + "epoch": 0.9646775890575681, + "flos": 17129570123520.0, + "grad_norm": 2.0142953791074323, + "language_loss": 0.69947273, + "learning_rate": 1.3051870352135397e-08, + "loss": 0.7207101, + "num_input_tokens_seen": 346202580, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 16045, + "time_per_iteration": 2.4325809478759766 + }, + { + "auxiliary_loss_clip": 0.01101043, + "auxiliary_loss_mlp": 0.01031193, + "balance_loss_clip": 1.01903403, + "balance_loss_mlp": 1.03409147, + "epoch": 0.964737712310236, + "flos": 13005732458880.0, + "grad_norm": 2.254760365933983, + "language_loss": 0.74806952, + "learning_rate": 1.3007486710415737e-08, + "loss": 0.76939189, + "num_input_tokens_seen": 346219395, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 16046, + "time_per_iteration": 2.427147626876831 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.0103333, + "balance_loss_clip": 1.0209322, + "balance_loss_mlp": 1.03498721, + "epoch": 0.964797835562904, + "flos": 24279240556800.0, + "grad_norm": 1.7329644028293205, + "language_loss": 0.62384462, + "learning_rate": 1.2963178415651199e-08, + "loss": 0.64520335, + "num_input_tokens_seen": 346239715, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.67578125, + "step": 16047, + "time_per_iteration": 2.492799997329712 + }, + { + "auxiliary_loss_clip": 0.01100779, + "auxiliary_loss_mlp": 0.01033595, + "balance_loss_clip": 1.02144754, + "balance_loss_mlp": 1.03558803, + "epoch": 0.9648579588155719, + "flos": 20522697413760.0, + "grad_norm": 1.8850176887881036, + "language_loss": 0.6955775, + "learning_rate": 1.2918945469521992e-08, + "loss": 0.71692121, + "num_input_tokens_seen": 346258500, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 16048, + "time_per_iteration": 2.4344987869262695 + }, + { + "auxiliary_loss_clip": 0.01101251, + "auxiliary_loss_mlp": 0.01028949, + "balance_loss_clip": 1.01668882, + "balance_loss_mlp": 1.03366709, + "epoch": 0.9649180820682399, + "flos": 32154844855680.0, + "grad_norm": 1.7863475924187646, + "language_loss": 0.63913882, + "learning_rate": 1.2874787873705662e-08, + "loss": 0.66044074, + "num_input_tokens_seen": 346279110, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.67578125, + "step": 16049, + "time_per_iteration": 2.4989817142486572 + }, + { + "auxiliary_loss_clip": 0.01101061, + "auxiliary_loss_mlp": 0.010269, + "balance_loss_clip": 1.01546216, + "balance_loss_mlp": 1.03558612, + "epoch": 0.9649782053209078, + "flos": 20522589672960.0, + "grad_norm": 1.6424917186742727, + "language_loss": 0.71067202, + "learning_rate": 1.2830705629876427e-08, + "loss": 0.73195171, + "num_input_tokens_seen": 346297860, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65625, + "step": 16050, + "time_per_iteration": 2.5452215671539307 + }, + { + "auxiliary_loss_clip": 0.01102281, + "auxiliary_loss_mlp": 0.01029136, + "balance_loss_clip": 1.01621413, + "balance_loss_mlp": 1.0329442, + "epoch": 0.9650383285735759, + "flos": 43067953843200.0, + "grad_norm": 1.8388027945859817, + "language_loss": 0.69875538, + "learning_rate": 1.278669873970606e-08, + "loss": 0.72006953, + "num_input_tokens_seen": 346319860, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.6953125, + "step": 16051, + "time_per_iteration": 2.636740207672119 + }, + { + "auxiliary_loss_clip": 0.01021624, + "auxiliary_loss_mlp": 0.01001844, + "balance_loss_clip": 1.00084877, + "balance_loss_mlp": 1.00160849, + "epoch": 0.9650984518262438, + "flos": 61748255882880.0, + "grad_norm": 0.8334148755985689, + "language_loss": 0.59121096, + "learning_rate": 1.2742767204863004e-08, + "loss": 0.61144561, + "num_input_tokens_seen": 346379025, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.20019531, + "step": 16052, + "time_per_iteration": 3.075615882873535 + }, + { + "auxiliary_loss_clip": 0.01095214, + "auxiliary_loss_mlp": 0.01026471, + "balance_loss_clip": 1.01512265, + "balance_loss_mlp": 1.03191876, + "epoch": 0.9651585750789118, + "flos": 29789337761280.0, + "grad_norm": 1.64808937771042, + "language_loss": 0.74442101, + "learning_rate": 1.2698911027013482e-08, + "loss": 0.76563787, + "num_input_tokens_seen": 346402250, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 16053, + "time_per_iteration": 3.8834474086761475 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01030715, + "balance_loss_clip": 1.01878238, + "balance_loss_mlp": 1.03472745, + "epoch": 0.9652186983315797, + "flos": 16873060124160.0, + "grad_norm": 2.3993755573637743, + "language_loss": 0.68056464, + "learning_rate": 1.2655130207820386e-08, + "loss": 0.70188296, + "num_input_tokens_seen": 346419555, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 16054, + "time_per_iteration": 2.4599058628082275 + }, + { + "auxiliary_loss_clip": 0.01098543, + "auxiliary_loss_mlp": 0.01031479, + "balance_loss_clip": 1.0204761, + "balance_loss_mlp": 1.03408504, + "epoch": 0.9652788215842477, + "flos": 31649761762560.0, + "grad_norm": 1.5511100121301231, + "language_loss": 0.61763877, + "learning_rate": 1.2611424748943944e-08, + "loss": 0.63893896, + "num_input_tokens_seen": 346441245, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 16055, + "time_per_iteration": 4.0541017055511475 + }, + { + "auxiliary_loss_clip": 0.01096153, + "auxiliary_loss_mlp": 0.01027762, + "balance_loss_clip": 1.01644897, + "balance_loss_mlp": 1.03382039, + "epoch": 0.9653389448369156, + "flos": 24754266944640.0, + "grad_norm": 1.899688570193355, + "language_loss": 0.76835245, + "learning_rate": 1.2567794652041719e-08, + "loss": 0.78959155, + "num_input_tokens_seen": 346460065, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.625, + "step": 16056, + "time_per_iteration": 3.974794626235962 + }, + { + "auxiliary_loss_clip": 0.01098862, + "auxiliary_loss_mlp": 0.01027586, + "balance_loss_clip": 1.01672029, + "balance_loss_mlp": 1.03289866, + "epoch": 0.9653990680895836, + "flos": 20297249700480.0, + "grad_norm": 1.5630374431073517, + "language_loss": 0.71658134, + "learning_rate": 1.2524239918767498e-08, + "loss": 0.73784578, + "num_input_tokens_seen": 346478005, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.66015625, + "step": 16057, + "time_per_iteration": 2.4625744819641113 + }, + { + "auxiliary_loss_clip": 0.01096064, + "auxiliary_loss_mlp": 0.01031729, + "balance_loss_clip": 1.02070796, + "balance_loss_mlp": 1.03262568, + "epoch": 0.9654591913422517, + "flos": 22528775064960.0, + "grad_norm": 1.7750175555369185, + "language_loss": 0.72013068, + "learning_rate": 1.2480760550773295e-08, + "loss": 0.74140859, + "num_input_tokens_seen": 346497575, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6328125, + "step": 16058, + "time_per_iteration": 2.4661831855773926 + }, + { + "auxiliary_loss_clip": 0.01097513, + "auxiliary_loss_mlp": 0.01033158, + "balance_loss_clip": 1.0217855, + "balance_loss_mlp": 1.03324616, + "epoch": 0.9655193145949196, + "flos": 26763002202240.0, + "grad_norm": 1.3805922722599118, + "language_loss": 0.74052727, + "learning_rate": 1.2437356549708011e-08, + "loss": 0.76183391, + "num_input_tokens_seen": 346520000, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 16059, + "time_per_iteration": 2.4987194538116455 + }, + { + "auxiliary_loss_clip": 0.01102874, + "auxiliary_loss_mlp": 0.01032015, + "balance_loss_clip": 1.0205586, + "balance_loss_mlp": 1.0346117, + "epoch": 0.9655794378475876, + "flos": 41970703132800.0, + "grad_norm": 1.9693634939713338, + "language_loss": 0.73338103, + "learning_rate": 1.239402791721722e-08, + "loss": 0.75472993, + "num_input_tokens_seen": 346541605, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.68359375, + "step": 16060, + "time_per_iteration": 2.691296100616455 + }, + { + "auxiliary_loss_clip": 0.01094521, + "auxiliary_loss_mlp": 0.01028413, + "balance_loss_clip": 1.0182569, + "balance_loss_mlp": 1.03331041, + "epoch": 0.9656395611002555, + "flos": 27709427704320.0, + "grad_norm": 1.5438406345380868, + "language_loss": 0.76715529, + "learning_rate": 1.2350774654944273e-08, + "loss": 0.78838468, + "num_input_tokens_seen": 346560955, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.609375, + "step": 16061, + "time_per_iteration": 4.037534952163696 + }, + { + "auxiliary_loss_clip": 0.01021687, + "auxiliary_loss_mlp": 0.00999978, + "balance_loss_clip": 0.99901831, + "balance_loss_mlp": 1.0016849, + "epoch": 0.9656996843529235, + "flos": 68968562411520.0, + "grad_norm": 0.7240380472657248, + "language_loss": 0.64163613, + "learning_rate": 1.2307596764528749e-08, + "loss": 0.66185272, + "num_input_tokens_seen": 346621615, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.20019531, + "step": 16062, + "time_per_iteration": 3.075866937637329 + }, + { + "auxiliary_loss_clip": 0.01093621, + "auxiliary_loss_mlp": 0.01025081, + "balance_loss_clip": 1.01454329, + "balance_loss_mlp": 1.03160632, + "epoch": 0.9657598076055914, + "flos": 20631327120000.0, + "grad_norm": 2.199981825340732, + "language_loss": 0.92818987, + "learning_rate": 1.226449424760867e-08, + "loss": 0.94937694, + "num_input_tokens_seen": 346637460, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 16063, + "time_per_iteration": 2.4555232524871826 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01032317, + "balance_loss_clip": 1.02054524, + "balance_loss_mlp": 1.0358125, + "epoch": 0.9658199308582595, + "flos": 20448577699200.0, + "grad_norm": 1.9157708937264109, + "language_loss": 0.81976312, + "learning_rate": 1.2221467105818062e-08, + "loss": 0.84110343, + "num_input_tokens_seen": 346655625, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 16064, + "time_per_iteration": 2.4142022132873535 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.01027678, + "balance_loss_clip": 1.01711571, + "balance_loss_mlp": 1.03634119, + "epoch": 0.9658800541109274, + "flos": 24718033100160.0, + "grad_norm": 2.24347865862691, + "language_loss": 0.843117, + "learning_rate": 1.2178515340788731e-08, + "loss": 0.8643983, + "num_input_tokens_seen": 346675220, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 16065, + "time_per_iteration": 2.4605534076690674 + }, + { + "auxiliary_loss_clip": 0.01096746, + "auxiliary_loss_mlp": 0.01027695, + "balance_loss_clip": 1.01629925, + "balance_loss_mlp": 1.03209305, + "epoch": 0.9659401773635954, + "flos": 21610035970560.0, + "grad_norm": 1.7134975276082676, + "language_loss": 0.67760193, + "learning_rate": 1.2135638954149151e-08, + "loss": 0.69884634, + "num_input_tokens_seen": 346694710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 16066, + "time_per_iteration": 2.4299395084381104 + }, + { + "auxiliary_loss_clip": 0.01097275, + "auxiliary_loss_mlp": 0.01024339, + "balance_loss_clip": 1.0133121, + "balance_loss_mlp": 1.03257763, + "epoch": 0.9660003006162633, + "flos": 20301200196480.0, + "grad_norm": 2.103530429938663, + "language_loss": 0.82447511, + "learning_rate": 1.209283794752558e-08, + "loss": 0.8456912, + "num_input_tokens_seen": 346712645, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16067, + "time_per_iteration": 2.462406873703003 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01026441, + "balance_loss_clip": 1.01503885, + "balance_loss_mlp": 1.03394961, + "epoch": 0.9660604238689313, + "flos": 24461954064000.0, + "grad_norm": 1.8120779839614523, + "language_loss": 0.68879712, + "learning_rate": 1.2050112322540496e-08, + "loss": 0.71003956, + "num_input_tokens_seen": 346732375, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.63671875, + "step": 16068, + "time_per_iteration": 2.4718844890594482 + }, + { + "auxiliary_loss_clip": 0.01093562, + "auxiliary_loss_mlp": 0.01025202, + "balance_loss_clip": 1.01563549, + "balance_loss_mlp": 1.03256798, + "epoch": 0.9661205471215992, + "flos": 19864023765120.0, + "grad_norm": 1.7691682427953708, + "language_loss": 0.67960203, + "learning_rate": 1.20074620808146e-08, + "loss": 0.70078963, + "num_input_tokens_seen": 346750430, + "router_z_loss_clip": 0.09570312, + "router_z_loss_mlp": 0.609375, + "step": 16069, + "time_per_iteration": 2.4708714485168457 + }, + { + "auxiliary_loss_clip": 0.01101825, + "auxiliary_loss_mlp": 0.01028455, + "balance_loss_clip": 1.01733899, + "balance_loss_mlp": 1.03626013, + "epoch": 0.9661806703742672, + "flos": 20557889763840.0, + "grad_norm": 1.7773485796509647, + "language_loss": 0.88872612, + "learning_rate": 1.1964887223964826e-08, + "loss": 0.91002887, + "num_input_tokens_seen": 346768455, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 16070, + "time_per_iteration": 2.436710834503174 + }, + { + "auxiliary_loss_clip": 0.01102442, + "auxiliary_loss_mlp": 0.01032598, + "balance_loss_clip": 1.02048659, + "balance_loss_mlp": 1.03738046, + "epoch": 0.9662407936269353, + "flos": 21430949736960.0, + "grad_norm": 1.7610963303021612, + "language_loss": 0.77342236, + "learning_rate": 1.1922387753605878e-08, + "loss": 0.7947728, + "num_input_tokens_seen": 346786530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6484375, + "step": 16071, + "time_per_iteration": 2.4870574474334717 + }, + { + "auxiliary_loss_clip": 0.01097326, + "auxiliary_loss_mlp": 0.01030788, + "balance_loss_clip": 1.01827133, + "balance_loss_mlp": 1.03247118, + "epoch": 0.9663009168796032, + "flos": 14902893095040.0, + "grad_norm": 1.6878615394905503, + "language_loss": 0.66351175, + "learning_rate": 1.1879963671349137e-08, + "loss": 0.68479288, + "num_input_tokens_seen": 346804635, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6484375, + "step": 16072, + "time_per_iteration": 2.4131906032562256 + }, + { + "auxiliary_loss_clip": 0.01101893, + "auxiliary_loss_mlp": 0.01029784, + "balance_loss_clip": 1.01826262, + "balance_loss_mlp": 1.03541517, + "epoch": 0.9663610401322712, + "flos": 24310877460480.0, + "grad_norm": 1.7378428887724273, + "language_loss": 0.77110088, + "learning_rate": 1.1837614978803534e-08, + "loss": 0.79241765, + "num_input_tokens_seen": 346823070, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16073, + "time_per_iteration": 2.4881250858306885 + }, + { + "auxiliary_loss_clip": 0.01103054, + "auxiliary_loss_mlp": 0.01032875, + "balance_loss_clip": 1.02058411, + "balance_loss_mlp": 1.03565359, + "epoch": 0.9664211633849391, + "flos": 17637849527040.0, + "grad_norm": 2.637485372515987, + "language_loss": 0.75828785, + "learning_rate": 1.1795341677574677e-08, + "loss": 0.77964711, + "num_input_tokens_seen": 346841180, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.67578125, + "step": 16074, + "time_per_iteration": 2.4300355911254883 + }, + { + "auxiliary_loss_clip": 0.01101171, + "auxiliary_loss_mlp": 0.01029268, + "balance_loss_clip": 1.01741314, + "balance_loss_mlp": 1.03474593, + "epoch": 0.9664812866376071, + "flos": 29789409588480.0, + "grad_norm": 1.48623421551312, + "language_loss": 0.75616717, + "learning_rate": 1.1753143769265728e-08, + "loss": 0.77747166, + "num_input_tokens_seen": 346864250, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 16075, + "time_per_iteration": 2.5188755989074707 + }, + { + "auxiliary_loss_clip": 0.01100287, + "auxiliary_loss_mlp": 0.01030547, + "balance_loss_clip": 1.01937151, + "balance_loss_mlp": 1.03515041, + "epoch": 0.966541409890275, + "flos": 14282320798080.0, + "grad_norm": 3.3539989887691215, + "language_loss": 0.78949571, + "learning_rate": 1.171102125547696e-08, + "loss": 0.81080413, + "num_input_tokens_seen": 346881955, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6484375, + "step": 16076, + "time_per_iteration": 2.3969225883483887 + }, + { + "auxiliary_loss_clip": 0.01101671, + "auxiliary_loss_mlp": 0.01036827, + "balance_loss_clip": 1.02472758, + "balance_loss_mlp": 1.035146, + "epoch": 0.9666015331429431, + "flos": 19860432405120.0, + "grad_norm": 1.5504558718428159, + "language_loss": 0.71859056, + "learning_rate": 1.166897413780532e-08, + "loss": 0.73997551, + "num_input_tokens_seen": 346900445, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 16077, + "time_per_iteration": 2.439351797103882 + }, + { + "auxiliary_loss_clip": 0.01098273, + "auxiliary_loss_mlp": 0.01032302, + "balance_loss_clip": 1.02069139, + "balance_loss_mlp": 1.03297472, + "epoch": 0.966661656395611, + "flos": 27125951178240.0, + "grad_norm": 1.7498332359336022, + "language_loss": 0.5911901, + "learning_rate": 1.1627002417845533e-08, + "loss": 0.61249584, + "num_input_tokens_seen": 346920135, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 16078, + "time_per_iteration": 2.4835944175720215 + }, + { + "auxiliary_loss_clip": 0.01102377, + "auxiliary_loss_mlp": 0.01032189, + "balance_loss_clip": 1.02033949, + "balance_loss_mlp": 1.03437603, + "epoch": 0.966721779648279, + "flos": 21508229848320.0, + "grad_norm": 1.7663356554518799, + "language_loss": 0.72015703, + "learning_rate": 1.158510609718899e-08, + "loss": 0.7415027, + "num_input_tokens_seen": 346940450, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6796875, + "step": 16079, + "time_per_iteration": 2.454651355743408 + }, + { + "auxiliary_loss_clip": 0.01095773, + "auxiliary_loss_mlp": 0.01027357, + "balance_loss_clip": 1.01631236, + "balance_loss_mlp": 1.03369761, + "epoch": 0.9667819029009469, + "flos": 23878118401920.0, + "grad_norm": 1.765653495348509, + "language_loss": 0.7217977, + "learning_rate": 1.1543285177424644e-08, + "loss": 0.743029, + "num_input_tokens_seen": 346960935, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.62109375, + "step": 16080, + "time_per_iteration": 2.4750964641571045 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.0102682, + "balance_loss_clip": 1.01583505, + "balance_loss_mlp": 1.03450656, + "epoch": 0.9668420261536149, + "flos": 21507224267520.0, + "grad_norm": 1.9225357739509432, + "language_loss": 0.73896688, + "learning_rate": 1.1501539660138115e-08, + "loss": 0.76022321, + "num_input_tokens_seen": 346980100, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.64453125, + "step": 16081, + "time_per_iteration": 2.444805145263672 + }, + { + "auxiliary_loss_clip": 0.01097756, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.01485133, + "balance_loss_mlp": 1.03251266, + "epoch": 0.9669021494062828, + "flos": 26687266375680.0, + "grad_norm": 2.1860138544574417, + "language_loss": 0.67122877, + "learning_rate": 1.145986954691236e-08, + "loss": 0.69247544, + "num_input_tokens_seen": 347001250, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65625, + "step": 16082, + "time_per_iteration": 2.478701591491699 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01560545, + "balance_loss_mlp": 1.03359115, + "epoch": 0.9669622726589508, + "flos": 29825032901760.0, + "grad_norm": 1.476092866160406, + "language_loss": 0.76806712, + "learning_rate": 1.141827483932789e-08, + "loss": 0.78930962, + "num_input_tokens_seen": 347022975, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 16083, + "time_per_iteration": 2.501891613006592 + }, + { + "auxiliary_loss_clip": 0.01099638, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.01670313, + "balance_loss_mlp": 1.03356194, + "epoch": 0.9670223959116189, + "flos": 22922499018240.0, + "grad_norm": 1.7992097460517922, + "language_loss": 0.79434943, + "learning_rate": 1.1376755538961669e-08, + "loss": 0.81562805, + "num_input_tokens_seen": 347038780, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16084, + "time_per_iteration": 2.4433937072753906 + }, + { + "auxiliary_loss_clip": 0.01101573, + "auxiliary_loss_mlp": 0.01027, + "balance_loss_clip": 1.01498938, + "balance_loss_mlp": 1.03329217, + "epoch": 0.9670825191642868, + "flos": 18624495283200.0, + "grad_norm": 2.5576364134525105, + "language_loss": 0.67727828, + "learning_rate": 1.1335311647387991e-08, + "loss": 0.69856399, + "num_input_tokens_seen": 347056705, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.68359375, + "step": 16085, + "time_per_iteration": 2.439408779144287 + }, + { + "auxiliary_loss_clip": 0.01104066, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01778781, + "balance_loss_mlp": 1.03539014, + "epoch": 0.9671426424169548, + "flos": 24497936513280.0, + "grad_norm": 1.9103794202550979, + "language_loss": 0.68926775, + "learning_rate": 1.1293943166178709e-08, + "loss": 0.71061325, + "num_input_tokens_seen": 347075710, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16086, + "time_per_iteration": 2.543067693710327 + }, + { + "auxiliary_loss_clip": 0.01100289, + "auxiliary_loss_mlp": 0.01032596, + "balance_loss_clip": 1.02028179, + "balance_loss_mlp": 1.03610826, + "epoch": 0.9672027656696227, + "flos": 20371189847040.0, + "grad_norm": 1.4406826333266698, + "language_loss": 0.78265107, + "learning_rate": 1.125265009690235e-08, + "loss": 0.80397993, + "num_input_tokens_seen": 347092325, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.640625, + "step": 16087, + "time_per_iteration": 2.479124069213867 + }, + { + "auxiliary_loss_clip": 0.01097717, + "auxiliary_loss_mlp": 0.01025073, + "balance_loss_clip": 1.014148, + "balance_loss_mlp": 1.03304863, + "epoch": 0.9672628889222907, + "flos": 18880179269760.0, + "grad_norm": 2.902915851013034, + "language_loss": 0.71206176, + "learning_rate": 1.1211432441124769e-08, + "loss": 0.73328972, + "num_input_tokens_seen": 347110595, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 16088, + "time_per_iteration": 2.453108310699463 + }, + { + "auxiliary_loss_clip": 0.01097715, + "auxiliary_loss_mlp": 0.01028177, + "balance_loss_clip": 1.01747155, + "balance_loss_mlp": 1.03437591, + "epoch": 0.9673230121749586, + "flos": 28695247447680.0, + "grad_norm": 1.6805016946049898, + "language_loss": 0.70649052, + "learning_rate": 1.117029020040916e-08, + "loss": 0.72774947, + "num_input_tokens_seen": 347131625, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 16089, + "time_per_iteration": 2.539914846420288 + }, + { + "auxiliary_loss_clip": 0.01102248, + "auxiliary_loss_mlp": 0.0103043, + "balance_loss_clip": 1.01866364, + "balance_loss_mlp": 1.03477347, + "epoch": 0.9673831354276267, + "flos": 20484452407680.0, + "grad_norm": 2.2645704786578604, + "language_loss": 0.74865729, + "learning_rate": 1.1129223376315167e-08, + "loss": 0.76998407, + "num_input_tokens_seen": 347147910, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.671875, + "step": 16090, + "time_per_iteration": 2.593626022338867 + }, + { + "auxiliary_loss_clip": 0.01103105, + "auxiliary_loss_mlp": 0.01029552, + "balance_loss_clip": 1.01780951, + "balance_loss_mlp": 1.03354084, + "epoch": 0.9674432586802946, + "flos": 26797548107520.0, + "grad_norm": 1.7972690157643232, + "language_loss": 0.69049466, + "learning_rate": 1.1088231970400653e-08, + "loss": 0.71182114, + "num_input_tokens_seen": 347168805, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6953125, + "step": 16091, + "time_per_iteration": 2.530670642852783 + }, + { + "auxiliary_loss_clip": 0.0109713, + "auxiliary_loss_mlp": 0.01032021, + "balance_loss_clip": 1.01994467, + "balance_loss_mlp": 1.0330565, + "epoch": 0.9675033819329626, + "flos": 22310941034880.0, + "grad_norm": 1.727007301138269, + "language_loss": 0.76661873, + "learning_rate": 1.1047315984219484e-08, + "loss": 0.78791022, + "num_input_tokens_seen": 347189455, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.640625, + "step": 16092, + "time_per_iteration": 2.6132729053497314 + }, + { + "auxiliary_loss_clip": 0.01100479, + "auxiliary_loss_mlp": 0.0102533, + "balance_loss_clip": 1.01423764, + "balance_loss_mlp": 1.03612089, + "epoch": 0.9675635051856305, + "flos": 12675713276160.0, + "grad_norm": 1.7194933349495616, + "language_loss": 0.76217842, + "learning_rate": 1.1006475419323313e-08, + "loss": 0.78343654, + "num_input_tokens_seen": 347206030, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.64453125, + "step": 16093, + "time_per_iteration": 2.6711509227752686 + }, + { + "auxiliary_loss_clip": 0.01097824, + "auxiliary_loss_mlp": 0.01024564, + "balance_loss_clip": 1.01199961, + "balance_loss_mlp": 1.03344226, + "epoch": 0.9676236284382985, + "flos": 24608469640320.0, + "grad_norm": 1.4864798894423341, + "language_loss": 0.68974423, + "learning_rate": 1.096571027726112e-08, + "loss": 0.71096814, + "num_input_tokens_seen": 347226250, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.64453125, + "step": 16094, + "time_per_iteration": 2.576261043548584 + }, + { + "auxiliary_loss_clip": 0.01101022, + "auxiliary_loss_mlp": 0.01026609, + "balance_loss_clip": 1.01573682, + "balance_loss_mlp": 1.0338856, + "epoch": 0.9676837516909664, + "flos": 23367145478400.0, + "grad_norm": 1.4390045199014274, + "language_loss": 0.75913978, + "learning_rate": 1.0925020559578557e-08, + "loss": 0.78041601, + "num_input_tokens_seen": 347247350, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.671875, + "step": 16095, + "time_per_iteration": 3.963588237762451 + }, + { + "auxiliary_loss_clip": 0.01104114, + "auxiliary_loss_mlp": 0.01032264, + "balance_loss_clip": 1.02028298, + "balance_loss_mlp": 1.035339, + "epoch": 0.9677438749436345, + "flos": 20486894532480.0, + "grad_norm": 2.016309233770184, + "language_loss": 0.70449293, + "learning_rate": 1.0884406267818392e-08, + "loss": 0.72585666, + "num_input_tokens_seen": 347266870, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6875, + "step": 16096, + "time_per_iteration": 2.521918773651123 + }, + { + "auxiliary_loss_clip": 0.01101756, + "auxiliary_loss_mlp": 0.01025236, + "balance_loss_clip": 1.01357687, + "balance_loss_mlp": 1.03581285, + "epoch": 0.9678039981963025, + "flos": 47555889719040.0, + "grad_norm": 2.9772334686732624, + "language_loss": 0.71572793, + "learning_rate": 1.0843867403520946e-08, + "loss": 0.73699778, + "num_input_tokens_seen": 347290120, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16097, + "time_per_iteration": 4.2033936977386475 + }, + { + "auxiliary_loss_clip": 0.01098779, + "auxiliary_loss_mlp": 0.01030471, + "balance_loss_clip": 1.01911056, + "balance_loss_mlp": 1.03425193, + "epoch": 0.9678641214489704, + "flos": 25040474513280.0, + "grad_norm": 1.6763904743398519, + "language_loss": 0.77971011, + "learning_rate": 1.0803403968223434e-08, + "loss": 0.80100262, + "num_input_tokens_seen": 347308785, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16098, + "time_per_iteration": 3.855729341506958 + }, + { + "auxiliary_loss_clip": 0.01096588, + "auxiliary_loss_mlp": 0.01027802, + "balance_loss_clip": 1.01722848, + "balance_loss_mlp": 1.03283536, + "epoch": 0.9679242447016384, + "flos": 19240937516160.0, + "grad_norm": 1.7301712267219669, + "language_loss": 0.90408123, + "learning_rate": 1.0763015963459965e-08, + "loss": 0.92532516, + "num_input_tokens_seen": 347326375, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.63671875, + "step": 16099, + "time_per_iteration": 2.468384265899658 + }, + { + "auxiliary_loss_clip": 0.01100288, + "auxiliary_loss_mlp": 0.01033081, + "balance_loss_clip": 1.02081418, + "balance_loss_mlp": 1.03329253, + "epoch": 0.9679843679543063, + "flos": 33254681345280.0, + "grad_norm": 1.6709265367884942, + "language_loss": 0.65798569, + "learning_rate": 1.0722703390762643e-08, + "loss": 0.67931938, + "num_input_tokens_seen": 347348250, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.66796875, + "step": 16100, + "time_per_iteration": 2.6282451152801514 + }, + { + "auxiliary_loss_clip": 0.01099773, + "auxiliary_loss_mlp": 0.01030447, + "balance_loss_clip": 1.01882386, + "balance_loss_mlp": 1.03416276, + "epoch": 0.9680444912069743, + "flos": 22783633038720.0, + "grad_norm": 1.474374726903324, + "language_loss": 0.73381197, + "learning_rate": 1.0682466251659584e-08, + "loss": 0.7551142, + "num_input_tokens_seen": 347367400, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16101, + "time_per_iteration": 2.5105645656585693 + }, + { + "auxiliary_loss_clip": 0.01098487, + "auxiliary_loss_mlp": 0.01028409, + "balance_loss_clip": 1.01647615, + "balance_loss_mlp": 1.03371549, + "epoch": 0.9681046144596422, + "flos": 24024095274240.0, + "grad_norm": 1.4963336382837327, + "language_loss": 0.73430026, + "learning_rate": 1.0642304547676672e-08, + "loss": 0.75556922, + "num_input_tokens_seen": 347387600, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 16102, + "time_per_iteration": 2.5259511470794678 + }, + { + "auxiliary_loss_clip": 0.01101802, + "auxiliary_loss_mlp": 0.01032381, + "balance_loss_clip": 1.01956034, + "balance_loss_mlp": 1.03549552, + "epoch": 0.9681647377123103, + "flos": 23441013797760.0, + "grad_norm": 2.31512304657473, + "language_loss": 0.77183741, + "learning_rate": 1.0602218280337139e-08, + "loss": 0.79317927, + "num_input_tokens_seen": 347406915, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6640625, + "step": 16103, + "time_per_iteration": 4.039667129516602 + }, + { + "auxiliary_loss_clip": 0.01099986, + "auxiliary_loss_mlp": 0.01024799, + "balance_loss_clip": 1.01388574, + "balance_loss_mlp": 1.03492332, + "epoch": 0.9682248609649782, + "flos": 22675075159680.0, + "grad_norm": 1.5693653808008938, + "language_loss": 0.8058641, + "learning_rate": 1.0562207451160655e-08, + "loss": 0.82711196, + "num_input_tokens_seen": 347425140, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 16104, + "time_per_iteration": 2.581583261489868 + }, + { + "auxiliary_loss_clip": 0.010947, + "auxiliary_loss_mlp": 0.0103063, + "balance_loss_clip": 1.02028215, + "balance_loss_mlp": 1.03151107, + "epoch": 0.9682849842176462, + "flos": 24428413739520.0, + "grad_norm": 1.888198110026545, + "language_loss": 0.77700287, + "learning_rate": 1.0522272061664672e-08, + "loss": 0.79825616, + "num_input_tokens_seen": 347446350, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6328125, + "step": 16105, + "time_per_iteration": 2.5465734004974365 + }, + { + "auxiliary_loss_clip": 0.01021561, + "auxiliary_loss_mlp": 0.01000898, + "balance_loss_clip": 0.99988431, + "balance_loss_mlp": 1.0015564, + "epoch": 0.9683451074703141, + "flos": 59995132784640.0, + "grad_norm": 0.8173661990945631, + "language_loss": 0.56672597, + "learning_rate": 1.0482412113363536e-08, + "loss": 0.58695054, + "num_input_tokens_seen": 347510135, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20019531, + "step": 16106, + "time_per_iteration": 3.134302854537964 + }, + { + "auxiliary_loss_clip": 0.01021505, + "auxiliary_loss_mlp": 0.00999876, + "balance_loss_clip": 0.99889243, + "balance_loss_mlp": 1.00162327, + "epoch": 0.9684052307229821, + "flos": 52696145514240.0, + "grad_norm": 0.8868946741274136, + "language_loss": 0.61609983, + "learning_rate": 1.0442627607768707e-08, + "loss": 0.63631362, + "num_input_tokens_seen": 347562505, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.19921875, + "step": 16107, + "time_per_iteration": 2.977184295654297 + }, + { + "auxiliary_loss_clip": 0.01101515, + "auxiliary_loss_mlp": 0.01034494, + "balance_loss_clip": 1.02146411, + "balance_loss_mlp": 1.03632665, + "epoch": 0.96846535397565, + "flos": 22783848520320.0, + "grad_norm": 2.080411016997331, + "language_loss": 0.73906231, + "learning_rate": 1.040291854638875e-08, + "loss": 0.76042247, + "num_input_tokens_seen": 347579150, + "router_z_loss_clip": 0.12988281, + "router_z_loss_mlp": 0.65234375, + "step": 16108, + "time_per_iteration": 2.506273031234741 + }, + { + "auxiliary_loss_clip": 0.01101743, + "auxiliary_loss_mlp": 0.0102695, + "balance_loss_clip": 1.01524949, + "balance_loss_mlp": 1.03544784, + "epoch": 0.968525477228318, + "flos": 23323980309120.0, + "grad_norm": 2.6207813838672194, + "language_loss": 0.56951755, + "learning_rate": 1.0363284930729576e-08, + "loss": 0.59080446, + "num_input_tokens_seen": 347596705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16109, + "time_per_iteration": 2.483224868774414 + }, + { + "auxiliary_loss_clip": 0.01021716, + "auxiliary_loss_mlp": 0.01003704, + "balance_loss_clip": 1.00268459, + "balance_loss_mlp": 1.00173068, + "epoch": 0.9685856004809861, + "flos": 67882947707520.0, + "grad_norm": 0.6709491279205547, + "language_loss": 0.54244637, + "learning_rate": 1.0323726762294205e-08, + "loss": 0.56270063, + "num_input_tokens_seen": 347661870, + "router_z_loss_clip": 0.01019287, + "router_z_loss_mlp": 0.20019531, + "step": 16110, + "time_per_iteration": 3.065276861190796 + }, + { + "auxiliary_loss_clip": 0.0110392, + "auxiliary_loss_mlp": 0.01037893, + "balance_loss_clip": 1.0250361, + "balance_loss_mlp": 1.035496, + "epoch": 0.968645723733654, + "flos": 33947900899200.0, + "grad_norm": 1.3847067332829404, + "language_loss": 0.62662238, + "learning_rate": 1.0284244042582325e-08, + "loss": 0.64804053, + "num_input_tokens_seen": 347684295, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 16111, + "time_per_iteration": 2.6130480766296387 + }, + { + "auxiliary_loss_clip": 0.01096411, + "auxiliary_loss_mlp": 0.0102627, + "balance_loss_clip": 1.01571369, + "balance_loss_mlp": 1.03207159, + "epoch": 0.968705846986322, + "flos": 18551488890240.0, + "grad_norm": 3.5311052248737096, + "language_loss": 0.74400336, + "learning_rate": 1.024483677309118e-08, + "loss": 0.76523018, + "num_input_tokens_seen": 347702585, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 16112, + "time_per_iteration": 2.4801442623138428 + }, + { + "auxiliary_loss_clip": 0.01095788, + "auxiliary_loss_mlp": 0.01026327, + "balance_loss_clip": 1.01544356, + "balance_loss_mlp": 1.03244877, + "epoch": 0.9687659702389899, + "flos": 17420913336960.0, + "grad_norm": 2.6238571521164777, + "language_loss": 0.66553986, + "learning_rate": 1.020550495531558e-08, + "loss": 0.68676102, + "num_input_tokens_seen": 347721810, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 16113, + "time_per_iteration": 2.4916794300079346 + }, + { + "auxiliary_loss_clip": 0.01021806, + "auxiliary_loss_mlp": 0.01000111, + "balance_loss_clip": 0.99910325, + "balance_loss_mlp": 1.00189781, + "epoch": 0.9688260934916579, + "flos": 62047176865920.0, + "grad_norm": 0.6957044667296043, + "language_loss": 0.56507289, + "learning_rate": 1.0166248590746329e-08, + "loss": 0.5852921, + "num_input_tokens_seen": 347782330, + "router_z_loss_clip": 0.0100708, + "router_z_loss_mlp": 0.19921875, + "step": 16114, + "time_per_iteration": 3.085864305496216 + }, + { + "auxiliary_loss_clip": 0.0109922, + "auxiliary_loss_mlp": 0.01031598, + "balance_loss_clip": 1.02025533, + "balance_loss_mlp": 1.03437066, + "epoch": 0.9688862167443258, + "flos": 15076520461440.0, + "grad_norm": 1.941913365229189, + "language_loss": 0.82679498, + "learning_rate": 1.0127067680872458e-08, + "loss": 0.84810317, + "num_input_tokens_seen": 347794835, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16115, + "time_per_iteration": 2.423555374145508 + }, + { + "auxiliary_loss_clip": 0.01093975, + "auxiliary_loss_mlp": 0.01025186, + "balance_loss_clip": 1.01466632, + "balance_loss_mlp": 1.03341866, + "epoch": 0.9689463399969939, + "flos": 19938215306880.0, + "grad_norm": 1.8462013242923505, + "language_loss": 0.72099042, + "learning_rate": 1.0087962227179448e-08, + "loss": 0.74218202, + "num_input_tokens_seen": 347814320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.60546875, + "step": 16116, + "time_per_iteration": 2.4753947257995605 + }, + { + "auxiliary_loss_clip": 0.01101391, + "auxiliary_loss_mlp": 0.01031805, + "balance_loss_clip": 1.01981306, + "balance_loss_mlp": 1.03498697, + "epoch": 0.9690064632496618, + "flos": 19573039687680.0, + "grad_norm": 1.9755515538352788, + "language_loss": 0.75565988, + "learning_rate": 1.0048932231150553e-08, + "loss": 0.77699178, + "num_input_tokens_seen": 347832125, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 16117, + "time_per_iteration": 2.491583824157715 + }, + { + "auxiliary_loss_clip": 0.01100278, + "auxiliary_loss_mlp": 0.01031427, + "balance_loss_clip": 1.01930988, + "balance_loss_mlp": 1.0331347, + "epoch": 0.9690665865023298, + "flos": 21872292145920.0, + "grad_norm": 1.8834984101771413, + "language_loss": 0.77285224, + "learning_rate": 1.000997769426548e-08, + "loss": 0.79416931, + "num_input_tokens_seen": 347850765, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 16118, + "time_per_iteration": 2.5223984718322754 + }, + { + "auxiliary_loss_clip": 0.01102479, + "auxiliary_loss_mlp": 0.01030735, + "balance_loss_clip": 1.01985121, + "balance_loss_mlp": 1.03636765, + "epoch": 0.9691267097549977, + "flos": 20994491577600.0, + "grad_norm": 1.8607520753959634, + "language_loss": 0.78167307, + "learning_rate": 9.971098618001272e-09, + "loss": 0.80300522, + "num_input_tokens_seen": 347870125, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.66015625, + "step": 16119, + "time_per_iteration": 2.5595972537994385 + }, + { + "auxiliary_loss_clip": 0.01095474, + "auxiliary_loss_mlp": 0.01024928, + "balance_loss_clip": 1.01487851, + "balance_loss_mlp": 1.03318226, + "epoch": 0.9691868330076657, + "flos": 24279132816000.0, + "grad_norm": 1.6605084045495015, + "language_loss": 0.75243753, + "learning_rate": 9.932295003832747e-09, + "loss": 0.77364153, + "num_input_tokens_seen": 347890615, + "router_z_loss_clip": 0.10058594, + "router_z_loss_mlp": 0.625, + "step": 16120, + "time_per_iteration": 2.6170387268066406 + }, + { + "auxiliary_loss_clip": 0.01098759, + "auxiliary_loss_mlp": 0.01028252, + "balance_loss_clip": 1.01705265, + "balance_loss_mlp": 1.03436828, + "epoch": 0.9692469562603336, + "flos": 17675699483520.0, + "grad_norm": 1.8284557302359925, + "language_loss": 0.6938538, + "learning_rate": 9.89356685323095e-09, + "loss": 0.71512389, + "num_input_tokens_seen": 347908685, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16121, + "time_per_iteration": 2.5049474239349365 + }, + { + "auxiliary_loss_clip": 0.01098484, + "auxiliary_loss_mlp": 0.01030347, + "balance_loss_clip": 1.0194633, + "balance_loss_mlp": 1.03341174, + "epoch": 0.9693070795130017, + "flos": 26834392483200.0, + "grad_norm": 1.8406321763279332, + "language_loss": 0.69080842, + "learning_rate": 9.854914167664486e-09, + "loss": 0.71209669, + "num_input_tokens_seen": 347926385, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 16122, + "time_per_iteration": 2.5901401042938232 + }, + { + "auxiliary_loss_clip": 0.01098492, + "auxiliary_loss_mlp": 0.01024638, + "balance_loss_clip": 1.01409388, + "balance_loss_mlp": 1.03305304, + "epoch": 0.9693672027656697, + "flos": 18077288515200.0, + "grad_norm": 2.0360794405813296, + "language_loss": 0.75851989, + "learning_rate": 9.81633694859907e-09, + "loss": 0.77975118, + "num_input_tokens_seen": 347945290, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.65625, + "step": 16123, + "time_per_iteration": 2.5255179405212402 + }, + { + "auxiliary_loss_clip": 0.0109954, + "auxiliary_loss_mlp": 0.01031283, + "balance_loss_clip": 1.0194335, + "balance_loss_mlp": 1.03278112, + "epoch": 0.9694273260183376, + "flos": 21763015994880.0, + "grad_norm": 1.7443605692530082, + "language_loss": 0.74463332, + "learning_rate": 9.777835197497753e-09, + "loss": 0.76594156, + "num_input_tokens_seen": 347966330, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 16124, + "time_per_iteration": 2.6022872924804688 + }, + { + "auxiliary_loss_clip": 0.01100209, + "auxiliary_loss_mlp": 0.01035364, + "balance_loss_clip": 1.02369344, + "balance_loss_mlp": 1.03369021, + "epoch": 0.9694874492710056, + "flos": 24426115269120.0, + "grad_norm": 2.339359667542991, + "language_loss": 0.73955059, + "learning_rate": 9.739408915820258e-09, + "loss": 0.76090634, + "num_input_tokens_seen": 347982590, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 16125, + "time_per_iteration": 2.5527896881103516 + }, + { + "auxiliary_loss_clip": 0.01021717, + "auxiliary_loss_mlp": 0.00999829, + "balance_loss_clip": 0.99881619, + "balance_loss_mlp": 1.00180256, + "epoch": 0.9695475725236735, + "flos": 67650748237440.0, + "grad_norm": 0.874354260024892, + "language_loss": 0.61542535, + "learning_rate": 9.70105810502364e-09, + "loss": 0.63564086, + "num_input_tokens_seen": 348043310, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 16126, + "time_per_iteration": 3.06150484085083 + }, + { + "auxiliary_loss_clip": 0.01097857, + "auxiliary_loss_mlp": 0.01031948, + "balance_loss_clip": 1.02084327, + "balance_loss_mlp": 1.03425908, + "epoch": 0.9696076957763415, + "flos": 19129326981120.0, + "grad_norm": 2.1736850591790065, + "language_loss": 0.74991131, + "learning_rate": 9.662782766562738e-09, + "loss": 0.77120936, + "num_input_tokens_seen": 348062200, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16127, + "time_per_iteration": 2.525707721710205 + }, + { + "auxiliary_loss_clip": 0.01099234, + "auxiliary_loss_mlp": 0.01032045, + "balance_loss_clip": 1.0201714, + "balance_loss_mlp": 1.03222942, + "epoch": 0.9696678190290094, + "flos": 15486836497920.0, + "grad_norm": 1.554917519282315, + "language_loss": 0.68819076, + "learning_rate": 9.62458290188839e-09, + "loss": 0.70950353, + "num_input_tokens_seen": 348080685, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16128, + "time_per_iteration": 2.5169262886047363 + }, + { + "auxiliary_loss_clip": 0.01099961, + "auxiliary_loss_mlp": 0.01030218, + "balance_loss_clip": 1.01845229, + "balance_loss_mlp": 1.03504729, + "epoch": 0.9697279422816775, + "flos": 36208692869760.0, + "grad_norm": 1.5275283180861672, + "language_loss": 0.65348375, + "learning_rate": 9.586458512449213e-09, + "loss": 0.67478549, + "num_input_tokens_seen": 348102500, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 16129, + "time_per_iteration": 2.6302707195281982 + }, + { + "auxiliary_loss_clip": 0.01103145, + "auxiliary_loss_mlp": 0.01029964, + "balance_loss_clip": 1.01782882, + "balance_loss_mlp": 1.03422213, + "epoch": 0.9697880655343454, + "flos": 25484007651840.0, + "grad_norm": 3.3085134277813424, + "language_loss": 0.63307977, + "learning_rate": 9.548409599691166e-09, + "loss": 0.6544109, + "num_input_tokens_seen": 348122515, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6875, + "step": 16130, + "time_per_iteration": 2.547057628631592 + }, + { + "auxiliary_loss_clip": 0.01103028, + "auxiliary_loss_mlp": 0.01027017, + "balance_loss_clip": 1.01538217, + "balance_loss_mlp": 1.03523135, + "epoch": 0.9698481887870134, + "flos": 15333533251200.0, + "grad_norm": 2.5471865072726056, + "language_loss": 0.69608688, + "learning_rate": 9.510436165056867e-09, + "loss": 0.71738738, + "num_input_tokens_seen": 348138775, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 16131, + "time_per_iteration": 2.4412412643432617 + }, + { + "auxiliary_loss_clip": 0.01101007, + "auxiliary_loss_mlp": 0.01031871, + "balance_loss_clip": 1.0200696, + "balance_loss_mlp": 1.03424954, + "epoch": 0.9699083120396813, + "flos": 21982250655360.0, + "grad_norm": 1.86472716215598, + "language_loss": 0.76548707, + "learning_rate": 9.472538209986058e-09, + "loss": 0.78681588, + "num_input_tokens_seen": 348157115, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 16132, + "time_per_iteration": 2.5090508460998535 + }, + { + "auxiliary_loss_clip": 0.01102566, + "auxiliary_loss_mlp": 0.01035559, + "balance_loss_clip": 1.02364969, + "balance_loss_mlp": 1.03595448, + "epoch": 0.9699684352923493, + "flos": 15664055224320.0, + "grad_norm": 3.9786212871576443, + "language_loss": 0.78581774, + "learning_rate": 9.434715735916477e-09, + "loss": 0.80719894, + "num_input_tokens_seen": 348173035, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16133, + "time_per_iteration": 2.4141860008239746 + }, + { + "auxiliary_loss_clip": 0.01095458, + "auxiliary_loss_mlp": 0.01029888, + "balance_loss_clip": 1.01899862, + "balance_loss_mlp": 1.03250551, + "epoch": 0.9700285585450172, + "flos": 21908382336000.0, + "grad_norm": 1.6269378800137178, + "language_loss": 0.64601958, + "learning_rate": 9.396968744281863e-09, + "loss": 0.66727304, + "num_input_tokens_seen": 348192960, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62890625, + "step": 16134, + "time_per_iteration": 2.532543182373047 + }, + { + "auxiliary_loss_clip": 0.0109814, + "auxiliary_loss_mlp": 0.01029389, + "balance_loss_clip": 1.01761711, + "balance_loss_mlp": 1.03281355, + "epoch": 0.9700886817976853, + "flos": 23914890950400.0, + "grad_norm": 1.9357465351704168, + "language_loss": 0.80777168, + "learning_rate": 9.359297236513519e-09, + "loss": 0.82904708, + "num_input_tokens_seen": 348212805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 16135, + "time_per_iteration": 2.4684133529663086 + }, + { + "auxiliary_loss_clip": 0.01102843, + "auxiliary_loss_mlp": 0.01031563, + "balance_loss_clip": 1.01883101, + "balance_loss_mlp": 1.03501081, + "epoch": 0.9701488050503532, + "flos": 25447845634560.0, + "grad_norm": 2.1122625470948577, + "language_loss": 0.72945958, + "learning_rate": 9.321701214040079e-09, + "loss": 0.75080359, + "num_input_tokens_seen": 348232900, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16136, + "time_per_iteration": 2.550011157989502 + }, + { + "auxiliary_loss_clip": 0.0109795, + "auxiliary_loss_mlp": 0.01026, + "balance_loss_clip": 1.01561141, + "balance_loss_mlp": 1.03390837, + "epoch": 0.9702089283030212, + "flos": 20590855470720.0, + "grad_norm": 1.5366970636246593, + "language_loss": 0.76298726, + "learning_rate": 9.28418067828729e-09, + "loss": 0.78422666, + "num_input_tokens_seen": 348253065, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 16137, + "time_per_iteration": 3.8292956352233887 + }, + { + "auxiliary_loss_clip": 0.01021726, + "auxiliary_loss_mlp": 0.01002432, + "balance_loss_clip": 1.00152612, + "balance_loss_mlp": 1.00163674, + "epoch": 0.9702690515556892, + "flos": 70651516291200.0, + "grad_norm": 1.5028472325404159, + "language_loss": 0.54901278, + "learning_rate": 9.246735630678015e-09, + "loss": 0.56925428, + "num_input_tokens_seen": 348316075, + "router_z_loss_clip": 0.0090332, + "router_z_loss_mlp": 0.20117188, + "step": 16138, + "time_per_iteration": 3.1473779678344727 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01031315, + "balance_loss_clip": 1.02032351, + "balance_loss_mlp": 1.03277564, + "epoch": 0.9703291748083571, + "flos": 35881439034240.0, + "grad_norm": 1.6714662487676892, + "language_loss": 0.70472324, + "learning_rate": 9.209366072632007e-09, + "loss": 0.72601748, + "num_input_tokens_seen": 348337605, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.65625, + "step": 16139, + "time_per_iteration": 5.382527828216553 + }, + { + "auxiliary_loss_clip": 0.01101757, + "auxiliary_loss_mlp": 0.01028848, + "balance_loss_clip": 1.01703477, + "balance_loss_mlp": 1.03570795, + "epoch": 0.9703892980610251, + "flos": 24316479982080.0, + "grad_norm": 1.5984229449176695, + "language_loss": 0.72570795, + "learning_rate": 9.172072005566134e-09, + "loss": 0.74701405, + "num_input_tokens_seen": 348359430, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 16140, + "time_per_iteration": 2.500535249710083 + }, + { + "auxiliary_loss_clip": 0.01104371, + "auxiliary_loss_mlp": 0.01036206, + "balance_loss_clip": 1.02412963, + "balance_loss_mlp": 1.03588057, + "epoch": 0.970449421313693, + "flos": 18003743418240.0, + "grad_norm": 2.360721566613716, + "language_loss": 0.67877102, + "learning_rate": 9.13485343089504e-09, + "loss": 0.70017684, + "num_input_tokens_seen": 348377890, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.68359375, + "step": 16141, + "time_per_iteration": 2.4640913009643555 + }, + { + "auxiliary_loss_clip": 0.01095646, + "auxiliary_loss_mlp": 0.01029215, + "balance_loss_clip": 1.01804554, + "balance_loss_mlp": 1.03266811, + "epoch": 0.9705095445663611, + "flos": 25337994865920.0, + "grad_norm": 1.8701728936765305, + "language_loss": 0.68670142, + "learning_rate": 9.097710350029597e-09, + "loss": 0.70795, + "num_input_tokens_seen": 348396550, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 16142, + "time_per_iteration": 2.477365493774414 + }, + { + "auxiliary_loss_clip": 0.0109703, + "auxiliary_loss_mlp": 0.01027934, + "balance_loss_clip": 1.01635337, + "balance_loss_mlp": 1.03252649, + "epoch": 0.970569667819029, + "flos": 26833602384000.0, + "grad_norm": 1.7508245521385353, + "language_loss": 0.55955529, + "learning_rate": 9.060642764378457e-09, + "loss": 0.58080494, + "num_input_tokens_seen": 348417120, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 16143, + "time_per_iteration": 2.5553791522979736 + }, + { + "auxiliary_loss_clip": 0.01100849, + "auxiliary_loss_mlp": 0.01029513, + "balance_loss_clip": 1.01848042, + "balance_loss_mlp": 1.034536, + "epoch": 0.970629791071697, + "flos": 25848644567040.0, + "grad_norm": 1.9577876836122245, + "language_loss": 0.67899948, + "learning_rate": 9.023650675347382e-09, + "loss": 0.70030308, + "num_input_tokens_seen": 348437750, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6640625, + "step": 16144, + "time_per_iteration": 4.970671892166138 + }, + { + "auxiliary_loss_clip": 0.01099463, + "auxiliary_loss_mlp": 0.01041949, + "balance_loss_clip": 1.03105295, + "balance_loss_mlp": 1.03441381, + "epoch": 0.9706899143243649, + "flos": 36540184510080.0, + "grad_norm": 1.7212412330787912, + "language_loss": 0.71903557, + "learning_rate": 8.986734084339253e-09, + "loss": 0.74044967, + "num_input_tokens_seen": 348460935, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6484375, + "step": 16145, + "time_per_iteration": 2.5942580699920654 + }, + { + "auxiliary_loss_clip": 0.01100216, + "auxiliary_loss_mlp": 0.01030004, + "balance_loss_clip": 1.01734352, + "balance_loss_mlp": 1.03212011, + "epoch": 0.9707500375770329, + "flos": 12268234414080.0, + "grad_norm": 2.8276003686197018, + "language_loss": 0.79872471, + "learning_rate": 8.949892992753395e-09, + "loss": 0.82002687, + "num_input_tokens_seen": 348474480, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16146, + "time_per_iteration": 2.401989221572876 + }, + { + "auxiliary_loss_clip": 0.01021551, + "auxiliary_loss_mlp": 0.01001342, + "balance_loss_clip": 1.00041199, + "balance_loss_mlp": 1.00153255, + "epoch": 0.9708101608297008, + "flos": 60853040196480.0, + "grad_norm": 0.7626452621026454, + "language_loss": 0.54555905, + "learning_rate": 8.91312740198713e-09, + "loss": 0.56578797, + "num_input_tokens_seen": 348541220, + "router_z_loss_clip": 0.00927734, + "router_z_loss_mlp": 0.20019531, + "step": 16147, + "time_per_iteration": 3.0902152061462402 + }, + { + "auxiliary_loss_clip": 0.01102002, + "auxiliary_loss_mlp": 0.01031656, + "balance_loss_clip": 1.01913333, + "balance_loss_mlp": 1.0341568, + "epoch": 0.9708702840823689, + "flos": 27124766029440.0, + "grad_norm": 3.8974321244687964, + "language_loss": 0.61855692, + "learning_rate": 8.876437313434682e-09, + "loss": 0.63989353, + "num_input_tokens_seen": 348559230, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6796875, + "step": 16148, + "time_per_iteration": 2.464473247528076 + }, + { + "auxiliary_loss_clip": 0.01096857, + "auxiliary_loss_mlp": 0.01036108, + "balance_loss_clip": 1.02500391, + "balance_loss_mlp": 1.03314471, + "epoch": 0.9709304073350368, + "flos": 20777699041920.0, + "grad_norm": 1.6335067139956454, + "language_loss": 0.73529303, + "learning_rate": 8.839822728487155e-09, + "loss": 0.75662261, + "num_input_tokens_seen": 348577850, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16149, + "time_per_iteration": 2.4322702884674072 + }, + { + "auxiliary_loss_clip": 0.01097685, + "auxiliary_loss_mlp": 0.01033949, + "balance_loss_clip": 1.02248096, + "balance_loss_mlp": 1.03168344, + "epoch": 0.9709905305877048, + "flos": 41934541115520.0, + "grad_norm": 3.032692446472873, + "language_loss": 0.75145626, + "learning_rate": 8.803283648533222e-09, + "loss": 0.77277255, + "num_input_tokens_seen": 348598345, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 16150, + "time_per_iteration": 2.597921848297119 + }, + { + "auxiliary_loss_clip": 0.01107309, + "auxiliary_loss_mlp": 0.01030192, + "balance_loss_clip": 1.01678681, + "balance_loss_mlp": 1.03694558, + "epoch": 0.9710506538403728, + "flos": 17165588486400.0, + "grad_norm": 2.1049306282297358, + "language_loss": 0.73670769, + "learning_rate": 8.766820074958214e-09, + "loss": 0.75808269, + "num_input_tokens_seen": 348616300, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.703125, + "step": 16151, + "time_per_iteration": 2.422347068786621 + }, + { + "auxiliary_loss_clip": 0.01098403, + "auxiliary_loss_mlp": 0.01027737, + "balance_loss_clip": 1.01606655, + "balance_loss_mlp": 1.03450835, + "epoch": 0.9711107770930407, + "flos": 21173470070400.0, + "grad_norm": 1.7013232135695202, + "language_loss": 0.74849296, + "learning_rate": 8.730432009145027e-09, + "loss": 0.76975429, + "num_input_tokens_seen": 348633845, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.63671875, + "step": 16152, + "time_per_iteration": 2.4549672603607178 + }, + { + "auxiliary_loss_clip": 0.01098407, + "auxiliary_loss_mlp": 0.01032676, + "balance_loss_clip": 1.02126741, + "balance_loss_mlp": 1.03373194, + "epoch": 0.9711709003457087, + "flos": 22237072715520.0, + "grad_norm": 1.6070864987804534, + "language_loss": 0.66789192, + "learning_rate": 8.694119452473448e-09, + "loss": 0.68920273, + "num_input_tokens_seen": 348653070, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 16153, + "time_per_iteration": 2.4515061378479004 + }, + { + "auxiliary_loss_clip": 0.01099334, + "auxiliary_loss_mlp": 0.01029741, + "balance_loss_clip": 1.01902413, + "balance_loss_mlp": 1.03360021, + "epoch": 0.9712310235983767, + "flos": 26213856099840.0, + "grad_norm": 1.5238206763450304, + "language_loss": 0.703457, + "learning_rate": 8.65788240632037e-09, + "loss": 0.72474778, + "num_input_tokens_seen": 348672145, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.65625, + "step": 16154, + "time_per_iteration": 2.478994131088257 + }, + { + "auxiliary_loss_clip": 0.01103679, + "auxiliary_loss_mlp": 0.01030383, + "balance_loss_clip": 1.01762736, + "balance_loss_mlp": 1.03658104, + "epoch": 0.9712911468510447, + "flos": 20668171495680.0, + "grad_norm": 2.2051583809409507, + "language_loss": 0.8076309, + "learning_rate": 8.621720872059812e-09, + "loss": 0.82897151, + "num_input_tokens_seen": 348690615, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 16155, + "time_per_iteration": 2.4331750869750977 + }, + { + "auxiliary_loss_clip": 0.01102255, + "auxiliary_loss_mlp": 0.01037292, + "balance_loss_clip": 1.02448285, + "balance_loss_mlp": 1.0339818, + "epoch": 0.9713512701037126, + "flos": 13552903313280.0, + "grad_norm": 2.1467214162660357, + "language_loss": 0.67530596, + "learning_rate": 8.58563485106334e-09, + "loss": 0.69670147, + "num_input_tokens_seen": 348708665, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 16156, + "time_per_iteration": 2.41339111328125 + }, + { + "auxiliary_loss_clip": 0.01100256, + "auxiliary_loss_mlp": 0.01032008, + "balance_loss_clip": 1.02119589, + "balance_loss_mlp": 1.03306246, + "epoch": 0.9714113933563806, + "flos": 25848752307840.0, + "grad_norm": 2.5197090759415994, + "language_loss": 0.90636677, + "learning_rate": 8.54962434469919e-09, + "loss": 0.92768943, + "num_input_tokens_seen": 348726105, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.671875, + "step": 16157, + "time_per_iteration": 2.470351219177246 + }, + { + "auxiliary_loss_clip": 0.01101295, + "auxiliary_loss_mlp": 0.01027685, + "balance_loss_clip": 1.0168134, + "balance_loss_mlp": 1.03504801, + "epoch": 0.9714715166090485, + "flos": 12743081233920.0, + "grad_norm": 1.914722039633016, + "language_loss": 0.72404706, + "learning_rate": 8.513689354332721e-09, + "loss": 0.74533689, + "num_input_tokens_seen": 348743360, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6640625, + "step": 16158, + "time_per_iteration": 2.4336278438568115 + }, + { + "auxiliary_loss_clip": 0.01098334, + "auxiliary_loss_mlp": 0.01036411, + "balance_loss_clip": 1.02518129, + "balance_loss_mlp": 1.03437686, + "epoch": 0.9715316398617165, + "flos": 18405547931520.0, + "grad_norm": 2.239444291406118, + "language_loss": 0.60365427, + "learning_rate": 8.477829881326836e-09, + "loss": 0.62500173, + "num_input_tokens_seen": 348759045, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 16159, + "time_per_iteration": 2.4120450019836426 + }, + { + "auxiliary_loss_clip": 0.01094573, + "auxiliary_loss_mlp": 0.01026284, + "balance_loss_clip": 1.01615119, + "balance_loss_mlp": 1.03301239, + "epoch": 0.9715917631143844, + "flos": 28913799749760.0, + "grad_norm": 1.7275999406739457, + "language_loss": 0.78775787, + "learning_rate": 8.44204592704112e-09, + "loss": 0.80896652, + "num_input_tokens_seen": 348779910, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.61328125, + "step": 16160, + "time_per_iteration": 2.476292133331299 + }, + { + "auxiliary_loss_clip": 0.01021802, + "auxiliary_loss_mlp": 0.00997801, + "balance_loss_clip": 0.99682945, + "balance_loss_mlp": 1.00181723, + "epoch": 0.9716518863670525, + "flos": 65939712900480.0, + "grad_norm": 0.7688722643219095, + "language_loss": 0.54272866, + "learning_rate": 8.406337492832704e-09, + "loss": 0.56292468, + "num_input_tokens_seen": 348838995, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16161, + "time_per_iteration": 3.047849655151367 + }, + { + "auxiliary_loss_clip": 0.01097904, + "auxiliary_loss_mlp": 0.01032667, + "balance_loss_clip": 1.02122283, + "balance_loss_mlp": 1.0354929, + "epoch": 0.9717120096197204, + "flos": 17712759340800.0, + "grad_norm": 1.7365034375945647, + "language_loss": 0.71583688, + "learning_rate": 8.3707045800554e-09, + "loss": 0.73714256, + "num_input_tokens_seen": 348858090, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.625, + "step": 16162, + "time_per_iteration": 2.3995373249053955 + }, + { + "auxiliary_loss_clip": 0.0109594, + "auxiliary_loss_mlp": 0.01027118, + "balance_loss_clip": 1.01500595, + "balance_loss_mlp": 1.03175616, + "epoch": 0.9717721328723884, + "flos": 24463426521600.0, + "grad_norm": 1.5073534732463694, + "language_loss": 0.7864207, + "learning_rate": 8.335147190060787e-09, + "loss": 0.80765128, + "num_input_tokens_seen": 348877885, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.640625, + "step": 16163, + "time_per_iteration": 2.498755931854248 + }, + { + "auxiliary_loss_clip": 0.01097248, + "auxiliary_loss_mlp": 0.01023869, + "balance_loss_clip": 1.01309824, + "balance_loss_mlp": 1.03388119, + "epoch": 0.9718322561250564, + "flos": 20776477979520.0, + "grad_norm": 2.0532833626708324, + "language_loss": 0.72809923, + "learning_rate": 8.299665324196903e-09, + "loss": 0.74931037, + "num_input_tokens_seen": 348897720, + "router_z_loss_clip": 0.10791016, + "router_z_loss_mlp": 0.6328125, + "step": 16164, + "time_per_iteration": 2.435837507247925 + }, + { + "auxiliary_loss_clip": 0.01100258, + "auxiliary_loss_mlp": 0.01034391, + "balance_loss_clip": 1.02304852, + "balance_loss_mlp": 1.03418469, + "epoch": 0.9718923793777243, + "flos": 19025904746880.0, + "grad_norm": 2.27361632758078, + "language_loss": 0.84098649, + "learning_rate": 8.264258983809114e-09, + "loss": 0.86233294, + "num_input_tokens_seen": 348915410, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.66015625, + "step": 16165, + "time_per_iteration": 2.416750907897949 + }, + { + "auxiliary_loss_clip": 0.01097846, + "auxiliary_loss_mlp": 0.01023556, + "balance_loss_clip": 1.01333344, + "balance_loss_mlp": 1.03371215, + "epoch": 0.9719525026303923, + "flos": 21871717528320.0, + "grad_norm": 1.5641110975288823, + "language_loss": 0.79189312, + "learning_rate": 8.228928170240345e-09, + "loss": 0.81310713, + "num_input_tokens_seen": 348934335, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 16166, + "time_per_iteration": 2.4303882122039795 + }, + { + "auxiliary_loss_clip": 0.01100301, + "auxiliary_loss_mlp": 0.01025286, + "balance_loss_clip": 1.01418757, + "balance_loss_mlp": 1.03471613, + "epoch": 0.9720126258830603, + "flos": 14429303251200.0, + "grad_norm": 1.6876740333466311, + "language_loss": 0.70820624, + "learning_rate": 8.193672884830195e-09, + "loss": 0.72946215, + "num_input_tokens_seen": 348952405, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 16167, + "time_per_iteration": 2.394996166229248 + }, + { + "auxiliary_loss_clip": 0.01099049, + "auxiliary_loss_mlp": 0.01033231, + "balance_loss_clip": 1.02233577, + "balance_loss_mlp": 1.03586316, + "epoch": 0.9720727491357283, + "flos": 26251167352320.0, + "grad_norm": 1.4769510374268846, + "language_loss": 0.75561023, + "learning_rate": 8.158493128915812e-09, + "loss": 0.77693301, + "num_input_tokens_seen": 348973580, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6328125, + "step": 16168, + "time_per_iteration": 2.4698002338409424 + }, + { + "auxiliary_loss_clip": 0.01101935, + "auxiliary_loss_mlp": 0.01033782, + "balance_loss_clip": 1.02172387, + "balance_loss_mlp": 1.03548265, + "epoch": 0.9721328723883962, + "flos": 22674105492480.0, + "grad_norm": 2.5452954441624596, + "language_loss": 0.72678661, + "learning_rate": 8.123388903830797e-09, + "loss": 0.74814385, + "num_input_tokens_seen": 348992035, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6640625, + "step": 16169, + "time_per_iteration": 2.4278223514556885 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.0103247, + "balance_loss_clip": 1.02016115, + "balance_loss_mlp": 1.03232074, + "epoch": 0.9721929956410642, + "flos": 28074172360320.0, + "grad_norm": 1.704456285146014, + "language_loss": 0.57650185, + "learning_rate": 8.088360210906309e-09, + "loss": 0.59783065, + "num_input_tokens_seen": 349013160, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 16170, + "time_per_iteration": 2.481048583984375 + }, + { + "auxiliary_loss_clip": 0.01100814, + "auxiliary_loss_mlp": 0.01028453, + "balance_loss_clip": 1.01628804, + "balance_loss_mlp": 1.03412509, + "epoch": 0.9722531188937321, + "flos": 20996251344000.0, + "grad_norm": 1.7511216977437811, + "language_loss": 0.71781224, + "learning_rate": 8.053407051471062e-09, + "loss": 0.73910493, + "num_input_tokens_seen": 349033485, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.66796875, + "step": 16171, + "time_per_iteration": 2.434035301208496 + }, + { + "auxiliary_loss_clip": 0.01098692, + "auxiliary_loss_mlp": 0.01035091, + "balance_loss_clip": 1.02387321, + "balance_loss_mlp": 1.03350592, + "epoch": 0.9723132421464001, + "flos": 16070600332800.0, + "grad_norm": 1.5945737465594831, + "language_loss": 0.684237, + "learning_rate": 8.018529426850218e-09, + "loss": 0.70557481, + "num_input_tokens_seen": 349051705, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16172, + "time_per_iteration": 2.4013216495513916 + }, + { + "auxiliary_loss_clip": 0.01097294, + "auxiliary_loss_mlp": 0.01026054, + "balance_loss_clip": 1.01489615, + "balance_loss_mlp": 1.03379488, + "epoch": 0.972373365399068, + "flos": 27745769289600.0, + "grad_norm": 1.7859971927682219, + "language_loss": 0.86250716, + "learning_rate": 7.983727338366274e-09, + "loss": 0.88374066, + "num_input_tokens_seen": 349070825, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6328125, + "step": 16173, + "time_per_iteration": 2.462507486343384 + }, + { + "auxiliary_loss_clip": 0.01105205, + "auxiliary_loss_mlp": 0.0102972, + "balance_loss_clip": 1.01646948, + "balance_loss_mlp": 1.03571761, + "epoch": 0.9724334886517361, + "flos": 23002939526400.0, + "grad_norm": 1.8689935114845415, + "language_loss": 0.64200556, + "learning_rate": 7.949000787339289e-09, + "loss": 0.66335481, + "num_input_tokens_seen": 349089730, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6953125, + "step": 16174, + "time_per_iteration": 2.4644393920898438 + }, + { + "auxiliary_loss_clip": 0.01098429, + "auxiliary_loss_mlp": 0.01025972, + "balance_loss_clip": 1.01502836, + "balance_loss_mlp": 1.03431141, + "epoch": 0.972493611904404, + "flos": 25447055535360.0, + "grad_norm": 1.3755488882571432, + "language_loss": 0.77686203, + "learning_rate": 7.914349775085538e-09, + "loss": 0.79810601, + "num_input_tokens_seen": 349111315, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 16175, + "time_per_iteration": 2.4805030822753906 + }, + { + "auxiliary_loss_clip": 0.01098893, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01919389, + "balance_loss_mlp": 1.03381467, + "epoch": 0.972553735157072, + "flos": 16983054547200.0, + "grad_norm": 2.3866480046960525, + "language_loss": 0.56767201, + "learning_rate": 7.879774302919307e-09, + "loss": 0.58897483, + "num_input_tokens_seen": 349129495, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 16176, + "time_per_iteration": 2.4352569580078125 + }, + { + "auxiliary_loss_clip": 0.01100519, + "auxiliary_loss_mlp": 0.01029542, + "balance_loss_clip": 1.01862288, + "balance_loss_mlp": 1.0360986, + "epoch": 0.97261385840974, + "flos": 26104651776000.0, + "grad_norm": 2.0671972006538066, + "language_loss": 0.72051632, + "learning_rate": 7.845274372151545e-09, + "loss": 0.74181688, + "num_input_tokens_seen": 349148850, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 16177, + "time_per_iteration": 2.479685068130493 + }, + { + "auxiliary_loss_clip": 0.0109915, + "auxiliary_loss_mlp": 0.01029489, + "balance_loss_clip": 1.01790774, + "balance_loss_mlp": 1.0325197, + "epoch": 0.9726739816624079, + "flos": 25447881548160.0, + "grad_norm": 1.6206813566846388, + "language_loss": 0.68881011, + "learning_rate": 7.810849984090984e-09, + "loss": 0.71009654, + "num_input_tokens_seen": 349167620, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 16178, + "time_per_iteration": 3.8625214099884033 + }, + { + "auxiliary_loss_clip": 0.01100578, + "auxiliary_loss_mlp": 0.01033252, + "balance_loss_clip": 1.02159894, + "balance_loss_mlp": 1.03372669, + "epoch": 0.972734104915076, + "flos": 29014923513600.0, + "grad_norm": 1.8858437543885507, + "language_loss": 0.67199779, + "learning_rate": 7.776501140042358e-09, + "loss": 0.69333607, + "num_input_tokens_seen": 349185845, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66796875, + "step": 16179, + "time_per_iteration": 2.4749538898468018 + }, + { + "auxiliary_loss_clip": 0.0109792, + "auxiliary_loss_mlp": 0.01031186, + "balance_loss_clip": 1.01961064, + "balance_loss_mlp": 1.0341022, + "epoch": 0.9727942281677439, + "flos": 23437637919360.0, + "grad_norm": 2.0869429380245843, + "language_loss": 0.77196532, + "learning_rate": 7.742227841308624e-09, + "loss": 0.7932564, + "num_input_tokens_seen": 349204525, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.640625, + "step": 16180, + "time_per_iteration": 3.8608553409576416 + }, + { + "auxiliary_loss_clip": 0.01102292, + "auxiliary_loss_mlp": 0.01030323, + "balance_loss_clip": 1.0184679, + "balance_loss_mlp": 1.03368819, + "epoch": 0.9728543514204119, + "flos": 31724599749120.0, + "grad_norm": 1.5558124846538366, + "language_loss": 0.76269901, + "learning_rate": 7.708030089189188e-09, + "loss": 0.78402507, + "num_input_tokens_seen": 349228075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.6875, + "step": 16181, + "time_per_iteration": 3.876532793045044 + }, + { + "auxiliary_loss_clip": 0.01097363, + "auxiliary_loss_mlp": 0.01030041, + "balance_loss_clip": 1.01892495, + "balance_loss_mlp": 1.0327394, + "epoch": 0.9729144746730798, + "flos": 16289368116480.0, + "grad_norm": 1.486326372174707, + "language_loss": 0.63157636, + "learning_rate": 7.67390788498079e-09, + "loss": 0.65285045, + "num_input_tokens_seen": 349246990, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 16182, + "time_per_iteration": 2.413458824157715 + }, + { + "auxiliary_loss_clip": 0.01101279, + "auxiliary_loss_mlp": 0.01033844, + "balance_loss_clip": 1.0226686, + "balance_loss_mlp": 1.03501475, + "epoch": 0.9729745979257478, + "flos": 25041408266880.0, + "grad_norm": 1.6223683298394753, + "language_loss": 0.62082142, + "learning_rate": 7.639861229977507e-09, + "loss": 0.64217269, + "num_input_tokens_seen": 349265890, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6640625, + "step": 16183, + "time_per_iteration": 2.4679312705993652 + }, + { + "auxiliary_loss_clip": 0.01097049, + "auxiliary_loss_mlp": 0.010324, + "balance_loss_clip": 1.02033019, + "balance_loss_mlp": 1.0336585, + "epoch": 0.9730347211784157, + "flos": 22638733574400.0, + "grad_norm": 1.61421361964316, + "language_loss": 0.77789152, + "learning_rate": 7.605890125470527e-09, + "loss": 0.79918599, + "num_input_tokens_seen": 349285275, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6328125, + "step": 16184, + "time_per_iteration": 2.443528652191162 + }, + { + "auxiliary_loss_clip": 0.01096093, + "auxiliary_loss_mlp": 0.01028266, + "balance_loss_clip": 1.01678014, + "balance_loss_mlp": 1.03245926, + "epoch": 0.9730948444310837, + "flos": 10998613313280.0, + "grad_norm": 2.093845903397055, + "language_loss": 0.79169863, + "learning_rate": 7.571994572747709e-09, + "loss": 0.81294221, + "num_input_tokens_seen": 349301515, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.63671875, + "step": 16185, + "time_per_iteration": 2.3952207565307617 + }, + { + "auxiliary_loss_clip": 0.0110047, + "auxiliary_loss_mlp": 0.01029137, + "balance_loss_clip": 1.01785386, + "balance_loss_mlp": 1.03438795, + "epoch": 0.9731549676837516, + "flos": 16799479113600.0, + "grad_norm": 1.7127735335933876, + "language_loss": 0.77540267, + "learning_rate": 7.538174573094469e-09, + "loss": 0.79669875, + "num_input_tokens_seen": 349319590, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.66015625, + "step": 16186, + "time_per_iteration": 3.935059070587158 + }, + { + "auxiliary_loss_clip": 0.01096754, + "auxiliary_loss_mlp": 0.01029157, + "balance_loss_clip": 1.01746225, + "balance_loss_mlp": 1.03295803, + "epoch": 0.9732150909364197, + "flos": 21141761339520.0, + "grad_norm": 1.5472928038095195, + "language_loss": 0.65344584, + "learning_rate": 7.504430127793337e-09, + "loss": 0.67470491, + "num_input_tokens_seen": 349339230, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 16187, + "time_per_iteration": 2.4246881008148193 + }, + { + "auxiliary_loss_clip": 0.01096472, + "auxiliary_loss_mlp": 0.01030691, + "balance_loss_clip": 1.01973534, + "balance_loss_mlp": 1.03258657, + "epoch": 0.9732752141890876, + "flos": 33727337435520.0, + "grad_norm": 1.6847924527129516, + "language_loss": 0.80288476, + "learning_rate": 7.47076123812418e-09, + "loss": 0.8241564, + "num_input_tokens_seen": 349361155, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 16188, + "time_per_iteration": 2.5142602920532227 + }, + { + "auxiliary_loss_clip": 0.01095375, + "auxiliary_loss_mlp": 0.01026037, + "balance_loss_clip": 1.01564157, + "balance_loss_mlp": 1.03211975, + "epoch": 0.9733353374417556, + "flos": 23404384903680.0, + "grad_norm": 1.8691744941970168, + "language_loss": 0.78207177, + "learning_rate": 7.437167905363084e-09, + "loss": 0.80328584, + "num_input_tokens_seen": 349379335, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 16189, + "time_per_iteration": 2.514826536178589 + }, + { + "auxiliary_loss_clip": 0.01096117, + "auxiliary_loss_mlp": 0.01027574, + "balance_loss_clip": 1.01595116, + "balance_loss_mlp": 1.03183913, + "epoch": 0.9733954606944236, + "flos": 39165792963840.0, + "grad_norm": 1.7703842346307654, + "language_loss": 0.5137412, + "learning_rate": 7.403650130784367e-09, + "loss": 0.53497809, + "num_input_tokens_seen": 349401575, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 16190, + "time_per_iteration": 2.5810770988464355 + }, + { + "auxiliary_loss_clip": 0.01099538, + "auxiliary_loss_mlp": 0.01026326, + "balance_loss_clip": 1.01520967, + "balance_loss_mlp": 1.03426313, + "epoch": 0.9734555839470915, + "flos": 21981819692160.0, + "grad_norm": 1.6597335248752023, + "language_loss": 0.80833918, + "learning_rate": 7.3702079156590105e-09, + "loss": 0.82959783, + "num_input_tokens_seen": 349420650, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 16191, + "time_per_iteration": 2.429949998855591 + }, + { + "auxiliary_loss_clip": 0.01096727, + "auxiliary_loss_mlp": 0.01027303, + "balance_loss_clip": 1.01659191, + "balance_loss_mlp": 1.03181481, + "epoch": 0.9735157071997596, + "flos": 16575539771520.0, + "grad_norm": 1.762042207505243, + "language_loss": 0.82737201, + "learning_rate": 7.336841261255111e-09, + "loss": 0.84861231, + "num_input_tokens_seen": 349436830, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6484375, + "step": 16192, + "time_per_iteration": 2.4036996364593506 + }, + { + "auxiliary_loss_clip": 0.0110223, + "auxiliary_loss_mlp": 0.01026845, + "balance_loss_clip": 1.0156033, + "balance_loss_mlp": 1.0369916, + "epoch": 0.9735758304524275, + "flos": 20223237726720.0, + "grad_norm": 1.7584855932220518, + "language_loss": 0.75289583, + "learning_rate": 7.303550168837658e-09, + "loss": 0.77418661, + "num_input_tokens_seen": 349454325, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16193, + "time_per_iteration": 2.4855713844299316 + }, + { + "auxiliary_loss_clip": 0.01096028, + "auxiliary_loss_mlp": 0.01032399, + "balance_loss_clip": 1.02176595, + "balance_loss_mlp": 1.03300667, + "epoch": 0.9736359537050955, + "flos": 23653353047040.0, + "grad_norm": 1.7149640266068487, + "language_loss": 0.85318899, + "learning_rate": 7.270334639669417e-09, + "loss": 0.87447321, + "num_input_tokens_seen": 349470230, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 16194, + "time_per_iteration": 2.447998523712158 + }, + { + "auxiliary_loss_clip": 0.01098878, + "auxiliary_loss_mlp": 0.01031872, + "balance_loss_clip": 1.020082, + "balance_loss_mlp": 1.03576303, + "epoch": 0.9736960769577634, + "flos": 15560202026880.0, + "grad_norm": 1.502245025606998, + "language_loss": 0.75605994, + "learning_rate": 7.237194675009828e-09, + "loss": 0.77736747, + "num_input_tokens_seen": 349486250, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6328125, + "step": 16195, + "time_per_iteration": 2.4004433155059814 + }, + { + "auxiliary_loss_clip": 0.010217, + "auxiliary_loss_mlp": 0.00999257, + "balance_loss_clip": 0.99829692, + "balance_loss_mlp": 1.00176632, + "epoch": 0.9737562002104314, + "flos": 65351783088000.0, + "grad_norm": 0.7095880579147238, + "language_loss": 0.52472728, + "learning_rate": 7.204130276115439e-09, + "loss": 0.54493684, + "num_input_tokens_seen": 349545865, + "router_z_loss_clip": 0.00958252, + "router_z_loss_mlp": 0.19921875, + "step": 16196, + "time_per_iteration": 2.985597610473633 + }, + { + "auxiliary_loss_clip": 0.01098243, + "auxiliary_loss_mlp": 0.01029046, + "balance_loss_clip": 1.0176909, + "balance_loss_mlp": 1.03375459, + "epoch": 0.9738163234630993, + "flos": 27196730928000.0, + "grad_norm": 2.152959147231462, + "language_loss": 0.76202309, + "learning_rate": 7.171141444240136e-09, + "loss": 0.78329599, + "num_input_tokens_seen": 349566080, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6484375, + "step": 16197, + "time_per_iteration": 2.5381948947906494 + }, + { + "auxiliary_loss_clip": 0.01101638, + "auxiliary_loss_mlp": 0.01029388, + "balance_loss_clip": 1.01759267, + "balance_loss_mlp": 1.03324556, + "epoch": 0.9738764467157673, + "flos": 21069365477760.0, + "grad_norm": 1.689607432579003, + "language_loss": 0.67603827, + "learning_rate": 7.13822818063492e-09, + "loss": 0.69734848, + "num_input_tokens_seen": 349585665, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.68359375, + "step": 16198, + "time_per_iteration": 2.473280191421509 + }, + { + "auxiliary_loss_clip": 0.01098612, + "auxiliary_loss_mlp": 0.01026496, + "balance_loss_clip": 1.0147835, + "balance_loss_mlp": 1.03299856, + "epoch": 0.9739365699684353, + "flos": 21361211481600.0, + "grad_norm": 1.7654996276877126, + "language_loss": 0.7798543, + "learning_rate": 7.10539048654768e-09, + "loss": 0.80110532, + "num_input_tokens_seen": 349605125, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 16199, + "time_per_iteration": 2.4409027099609375 + }, + { + "auxiliary_loss_clip": 0.0109881, + "auxiliary_loss_mlp": 0.01029353, + "balance_loss_clip": 1.01776588, + "balance_loss_mlp": 1.03432035, + "epoch": 0.9739966932211033, + "flos": 21902061542400.0, + "grad_norm": 1.6572282578499644, + "language_loss": 0.79276037, + "learning_rate": 7.072628363223865e-09, + "loss": 0.81404197, + "num_input_tokens_seen": 349623360, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 16200, + "time_per_iteration": 2.4363887310028076 + }, + { + "auxiliary_loss_clip": 0.01105141, + "auxiliary_loss_mlp": 0.01034258, + "balance_loss_clip": 1.02201521, + "balance_loss_mlp": 1.0352422, + "epoch": 0.9740568164737712, + "flos": 24827345164800.0, + "grad_norm": 2.133720131745559, + "language_loss": 0.68253577, + "learning_rate": 7.039941811905592e-09, + "loss": 0.70392972, + "num_input_tokens_seen": 349644390, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.69921875, + "step": 16201, + "time_per_iteration": 2.4559359550476074 + }, + { + "auxiliary_loss_clip": 0.01098547, + "auxiliary_loss_mlp": 0.01028902, + "balance_loss_clip": 1.01724362, + "balance_loss_mlp": 1.03254795, + "epoch": 0.9741169397264392, + "flos": 23623583650560.0, + "grad_norm": 1.4435582632035373, + "language_loss": 0.7252574, + "learning_rate": 7.0073308338325364e-09, + "loss": 0.74653184, + "num_input_tokens_seen": 349663200, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16202, + "time_per_iteration": 2.464578866958618 + }, + { + "auxiliary_loss_clip": 0.01101296, + "auxiliary_loss_mlp": 0.01029669, + "balance_loss_clip": 1.0178498, + "balance_loss_mlp": 1.03444588, + "epoch": 0.9741770629791072, + "flos": 18841144164480.0, + "grad_norm": 2.6014538346880083, + "language_loss": 0.72974175, + "learning_rate": 6.974795430241265e-09, + "loss": 0.75105143, + "num_input_tokens_seen": 349681975, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66796875, + "step": 16203, + "time_per_iteration": 2.439260959625244 + }, + { + "auxiliary_loss_clip": 0.01100295, + "auxiliary_loss_mlp": 0.01029685, + "balance_loss_clip": 1.01812792, + "balance_loss_mlp": 1.03454626, + "epoch": 0.9742371862317751, + "flos": 22346241125760.0, + "grad_norm": 1.6487633714089436, + "language_loss": 0.77325201, + "learning_rate": 6.942335602365235e-09, + "loss": 0.79455173, + "num_input_tokens_seen": 349701185, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 16204, + "time_per_iteration": 2.4581048488616943 + }, + { + "auxiliary_loss_clip": 0.0110164, + "auxiliary_loss_mlp": 0.01032522, + "balance_loss_clip": 1.0203265, + "balance_loss_mlp": 1.03502774, + "epoch": 0.9742973094844432, + "flos": 21762764599680.0, + "grad_norm": 2.45174500530448, + "language_loss": 0.79808879, + "learning_rate": 6.909951351435905e-09, + "loss": 0.81943041, + "num_input_tokens_seen": 349720360, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 16205, + "time_per_iteration": 2.419422149658203 + }, + { + "auxiliary_loss_clip": 0.01098434, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01800013, + "balance_loss_mlp": 1.034006, + "epoch": 0.9743574327371111, + "flos": 26248725227520.0, + "grad_norm": 1.5139645709473997, + "language_loss": 0.74249279, + "learning_rate": 6.87764267868074e-09, + "loss": 0.76376915, + "num_input_tokens_seen": 349741040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16206, + "time_per_iteration": 2.495774030685425 + }, + { + "auxiliary_loss_clip": 0.0109835, + "auxiliary_loss_mlp": 0.01029482, + "balance_loss_clip": 1.0172745, + "balance_loss_mlp": 1.03183436, + "epoch": 0.9744175559897791, + "flos": 12349321367040.0, + "grad_norm": 2.9366709312982087, + "language_loss": 0.84325778, + "learning_rate": 6.8454095853252015e-09, + "loss": 0.86453605, + "num_input_tokens_seen": 349758895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6640625, + "step": 16207, + "time_per_iteration": 2.4933202266693115 + }, + { + "auxiliary_loss_clip": 0.0109755, + "auxiliary_loss_mlp": 0.01031029, + "balance_loss_clip": 1.0196625, + "balance_loss_mlp": 1.0328232, + "epoch": 0.974477679242447, + "flos": 28397834835840.0, + "grad_norm": 1.5653170726435226, + "language_loss": 0.70784497, + "learning_rate": 6.813252072591425e-09, + "loss": 0.7291308, + "num_input_tokens_seen": 349779740, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 16208, + "time_per_iteration": 2.504995822906494 + }, + { + "auxiliary_loss_clip": 0.0109343, + "auxiliary_loss_mlp": 0.01022533, + "balance_loss_clip": 1.01287138, + "balance_loss_mlp": 1.03384209, + "epoch": 0.974537802495115, + "flos": 17785370684160.0, + "grad_norm": 1.828450111947416, + "language_loss": 0.77404773, + "learning_rate": 6.781170141698878e-09, + "loss": 0.79520738, + "num_input_tokens_seen": 349796820, + "router_z_loss_clip": 0.09667969, + "router_z_loss_mlp": 0.59765625, + "step": 16209, + "time_per_iteration": 2.4571237564086914 + }, + { + "auxiliary_loss_clip": 0.01100923, + "auxiliary_loss_mlp": 0.01029847, + "balance_loss_clip": 1.01828933, + "balance_loss_mlp": 1.03378117, + "epoch": 0.9745979257477829, + "flos": 23842315520640.0, + "grad_norm": 1.6228002539298978, + "language_loss": 0.78707743, + "learning_rate": 6.749163793864144e-09, + "loss": 0.80838501, + "num_input_tokens_seen": 349816550, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.671875, + "step": 16210, + "time_per_iteration": 2.4974353313446045 + }, + { + "auxiliary_loss_clip": 0.01099743, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02377987, + "balance_loss_mlp": 1.03362608, + "epoch": 0.9746580490004509, + "flos": 27016172236800.0, + "grad_norm": 2.5001536656047536, + "language_loss": 0.78155959, + "learning_rate": 6.7172330303009176e-09, + "loss": 0.80290616, + "num_input_tokens_seen": 349834350, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 16211, + "time_per_iteration": 2.7016804218292236 + }, + { + "auxiliary_loss_clip": 0.01104973, + "auxiliary_loss_mlp": 0.01029219, + "balance_loss_clip": 1.01664877, + "balance_loss_mlp": 1.03550124, + "epoch": 0.9747181722531189, + "flos": 19792022952960.0, + "grad_norm": 2.0555509454208583, + "language_loss": 0.78118324, + "learning_rate": 6.685377852219787e-09, + "loss": 0.80252516, + "num_input_tokens_seen": 349853460, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 16212, + "time_per_iteration": 2.567605495452881 + }, + { + "auxiliary_loss_clip": 0.01097708, + "auxiliary_loss_mlp": 0.01031276, + "balance_loss_clip": 1.02040458, + "balance_loss_mlp": 1.03407741, + "epoch": 0.9747782955057869, + "flos": 31430598929280.0, + "grad_norm": 1.6196122257396004, + "language_loss": 0.80419701, + "learning_rate": 6.653598260829118e-09, + "loss": 0.82548684, + "num_input_tokens_seen": 349874830, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 16213, + "time_per_iteration": 2.589813709259033 + }, + { + "auxiliary_loss_clip": 0.01097874, + "auxiliary_loss_mlp": 0.01024364, + "balance_loss_clip": 1.01280618, + "balance_loss_mlp": 1.03220224, + "epoch": 0.9748384187584548, + "flos": 15961288268160.0, + "grad_norm": 2.0508933879729083, + "language_loss": 0.6626724, + "learning_rate": 6.6218942573335044e-09, + "loss": 0.68389475, + "num_input_tokens_seen": 349893690, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.65625, + "step": 16214, + "time_per_iteration": 2.497565746307373 + }, + { + "auxiliary_loss_clip": 0.01101576, + "auxiliary_loss_mlp": 0.01028842, + "balance_loss_clip": 1.01662934, + "balance_loss_mlp": 1.03467321, + "epoch": 0.9748985420111228, + "flos": 20558715776640.0, + "grad_norm": 1.967600227233748, + "language_loss": 0.74463314, + "learning_rate": 6.5902658429355386e-09, + "loss": 0.76593733, + "num_input_tokens_seen": 349912480, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.66796875, + "step": 16215, + "time_per_iteration": 2.4592413902282715 + }, + { + "auxiliary_loss_clip": 0.01097336, + "auxiliary_loss_mlp": 0.01029205, + "balance_loss_clip": 1.01774275, + "balance_loss_mlp": 1.0326885, + "epoch": 0.9749586652637908, + "flos": 36721605127680.0, + "grad_norm": 1.7764212034166489, + "language_loss": 0.67058563, + "learning_rate": 6.558713018834483e-09, + "loss": 0.69185102, + "num_input_tokens_seen": 349932470, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6484375, + "step": 16216, + "time_per_iteration": 2.5700504779815674 + }, + { + "auxiliary_loss_clip": 0.01101316, + "auxiliary_loss_mlp": 0.0103098, + "balance_loss_clip": 1.01882648, + "balance_loss_mlp": 1.03412616, + "epoch": 0.9750187885164587, + "flos": 10999223844480.0, + "grad_norm": 1.9686853012013303, + "language_loss": 0.71478593, + "learning_rate": 6.527235786226937e-09, + "loss": 0.7361089, + "num_input_tokens_seen": 349949060, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.671875, + "step": 16217, + "time_per_iteration": 2.426276922225952 + }, + { + "auxiliary_loss_clip": 0.01098896, + "auxiliary_loss_mlp": 0.01027681, + "balance_loss_clip": 1.01612353, + "balance_loss_mlp": 1.03396559, + "epoch": 0.9750789117691268, + "flos": 25739512070400.0, + "grad_norm": 1.6447941805042985, + "language_loss": 0.78255022, + "learning_rate": 6.495834146306167e-09, + "loss": 0.80381596, + "num_input_tokens_seen": 349968010, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 16218, + "time_per_iteration": 2.473839282989502 + }, + { + "auxiliary_loss_clip": 0.010964, + "auxiliary_loss_mlp": 0.01029917, + "balance_loss_clip": 1.0179069, + "balance_loss_mlp": 1.03256178, + "epoch": 0.9751390350217947, + "flos": 13333955961600.0, + "grad_norm": 2.5361788769162237, + "language_loss": 0.7754612, + "learning_rate": 6.464508100263222e-09, + "loss": 0.79672432, + "num_input_tokens_seen": 349985270, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.640625, + "step": 16219, + "time_per_iteration": 2.432041645050049 + }, + { + "auxiliary_loss_clip": 0.01101469, + "auxiliary_loss_mlp": 0.01031565, + "balance_loss_clip": 1.0201565, + "balance_loss_mlp": 1.03447962, + "epoch": 0.9751991582744627, + "flos": 22820621068800.0, + "grad_norm": 1.630736434232842, + "language_loss": 0.81259847, + "learning_rate": 6.433257649285817e-09, + "loss": 0.83392882, + "num_input_tokens_seen": 350003935, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 16220, + "time_per_iteration": 3.9300906658172607 + }, + { + "auxiliary_loss_clip": 0.01096566, + "auxiliary_loss_mlp": 0.01025122, + "balance_loss_clip": 1.01427376, + "balance_loss_mlp": 1.03236842, + "epoch": 0.9752592815271306, + "flos": 19646189735040.0, + "grad_norm": 1.8176068692721052, + "language_loss": 0.74883264, + "learning_rate": 6.402082794559227e-09, + "loss": 0.77004945, + "num_input_tokens_seen": 350023595, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 16221, + "time_per_iteration": 2.437368869781494 + }, + { + "auxiliary_loss_clip": 0.01095184, + "auxiliary_loss_mlp": 0.0103033, + "balance_loss_clip": 1.01846254, + "balance_loss_mlp": 1.03186214, + "epoch": 0.9753194047797986, + "flos": 26690462686080.0, + "grad_norm": 1.7853777713397307, + "language_loss": 0.66434538, + "learning_rate": 6.370983537265395e-09, + "loss": 0.68560052, + "num_input_tokens_seen": 350045920, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6328125, + "step": 16222, + "time_per_iteration": 3.9962925910949707 + }, + { + "auxiliary_loss_clip": 0.01097085, + "auxiliary_loss_mlp": 0.01029431, + "balance_loss_clip": 1.01823103, + "balance_loss_mlp": 1.03272462, + "epoch": 0.9753795280324665, + "flos": 23221779137280.0, + "grad_norm": 1.7075208088690872, + "language_loss": 0.87882102, + "learning_rate": 6.3399598785836004e-09, + "loss": 0.90008616, + "num_input_tokens_seen": 350063925, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16223, + "time_per_iteration": 3.828974723815918 + }, + { + "auxiliary_loss_clip": 0.01096799, + "auxiliary_loss_mlp": 0.01027868, + "balance_loss_clip": 1.01694274, + "balance_loss_mlp": 1.03273821, + "epoch": 0.9754396512851345, + "flos": 19463835363840.0, + "grad_norm": 1.585066530051905, + "language_loss": 0.74491924, + "learning_rate": 6.309011819690457e-09, + "loss": 0.76616585, + "num_input_tokens_seen": 350080900, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.640625, + "step": 16224, + "time_per_iteration": 2.4636449813842773 + }, + { + "auxiliary_loss_clip": 0.01021478, + "auxiliary_loss_mlp": 0.01000107, + "balance_loss_clip": 0.99912339, + "balance_loss_mlp": 1.00153255, + "epoch": 0.9754997745378025, + "flos": 68459313340800.0, + "grad_norm": 0.8110453726438787, + "language_loss": 0.59165817, + "learning_rate": 6.278139361759249e-09, + "loss": 0.61187404, + "num_input_tokens_seen": 350144550, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.19921875, + "step": 16225, + "time_per_iteration": 3.0413310527801514 + }, + { + "auxiliary_loss_clip": 0.0109838, + "auxiliary_loss_mlp": 0.01033685, + "balance_loss_clip": 1.02271152, + "balance_loss_mlp": 1.03436458, + "epoch": 0.9755598977904705, + "flos": 26395168976640.0, + "grad_norm": 1.9025953600544858, + "language_loss": 0.68856502, + "learning_rate": 6.247342505960818e-09, + "loss": 0.7098856, + "num_input_tokens_seen": 350164050, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 16226, + "time_per_iteration": 2.4803082942962646 + }, + { + "auxiliary_loss_clip": 0.01099306, + "auxiliary_loss_mlp": 0.01036518, + "balance_loss_clip": 1.02476442, + "balance_loss_mlp": 1.0345875, + "epoch": 0.9756200210431384, + "flos": 16617663446400.0, + "grad_norm": 1.6327416633061216, + "language_loss": 0.82874024, + "learning_rate": 6.216621253462894e-09, + "loss": 0.85009849, + "num_input_tokens_seen": 350181350, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6484375, + "step": 16227, + "time_per_iteration": 2.4154109954833984 + }, + { + "auxiliary_loss_clip": 0.01097787, + "auxiliary_loss_mlp": 0.01023816, + "balance_loss_clip": 1.01299191, + "balance_loss_mlp": 1.03345346, + "epoch": 0.9756801442958064, + "flos": 23623044946560.0, + "grad_norm": 1.7854759830371416, + "language_loss": 0.78148073, + "learning_rate": 6.185975605430549e-09, + "loss": 0.80269676, + "num_input_tokens_seen": 350199765, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.640625, + "step": 16228, + "time_per_iteration": 3.987762451171875 + }, + { + "auxiliary_loss_clip": 0.01021739, + "auxiliary_loss_mlp": 0.01001601, + "balance_loss_clip": 1.00065923, + "balance_loss_mlp": 1.00168419, + "epoch": 0.9757402675484744, + "flos": 61625799440640.0, + "grad_norm": 0.8430180611412167, + "language_loss": 0.55817699, + "learning_rate": 6.155405563025962e-09, + "loss": 0.57841039, + "num_input_tokens_seen": 350256420, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20117188, + "step": 16229, + "time_per_iteration": 2.996128797531128 + }, + { + "auxiliary_loss_clip": 0.0109885, + "auxiliary_loss_mlp": 0.01028544, + "balance_loss_clip": 1.01672459, + "balance_loss_mlp": 1.03362441, + "epoch": 0.9758003908011423, + "flos": 24058964401920.0, + "grad_norm": 1.8129769312332171, + "language_loss": 0.74995404, + "learning_rate": 6.124911127407984e-09, + "loss": 0.77122796, + "num_input_tokens_seen": 350276270, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 16230, + "time_per_iteration": 2.4800798892974854 + }, + { + "auxiliary_loss_clip": 0.01095174, + "auxiliary_loss_mlp": 0.01030635, + "balance_loss_clip": 1.02002525, + "balance_loss_mlp": 1.03384256, + "epoch": 0.9758605140538104, + "flos": 17493093717120.0, + "grad_norm": 1.922364806166091, + "language_loss": 0.71574152, + "learning_rate": 6.094492299733245e-09, + "loss": 0.73699963, + "num_input_tokens_seen": 350295000, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.61328125, + "step": 16231, + "time_per_iteration": 2.4648971557617188 + }, + { + "auxiliary_loss_clip": 0.01102814, + "auxiliary_loss_mlp": 0.01029614, + "balance_loss_clip": 1.01759779, + "balance_loss_mlp": 1.03584027, + "epoch": 0.9759206373064783, + "flos": 24826950115200.0, + "grad_norm": 1.9642211900856055, + "language_loss": 0.76472759, + "learning_rate": 6.064149081155267e-09, + "loss": 0.78605187, + "num_input_tokens_seen": 350314980, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 16232, + "time_per_iteration": 2.526571035385132 + }, + { + "auxiliary_loss_clip": 0.01021867, + "auxiliary_loss_mlp": 0.00998904, + "balance_loss_clip": 0.99791414, + "balance_loss_mlp": 1.00185037, + "epoch": 0.9759807605591463, + "flos": 68161182456960.0, + "grad_norm": 0.7369935606950053, + "language_loss": 0.5375663, + "learning_rate": 6.033881472824465e-09, + "loss": 0.55777407, + "num_input_tokens_seen": 350371985, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.20019531, + "step": 16233, + "time_per_iteration": 2.921182632446289 + }, + { + "auxiliary_loss_clip": 0.01097578, + "auxiliary_loss_mlp": 0.01030462, + "balance_loss_clip": 1.01871395, + "balance_loss_mlp": 1.03226352, + "epoch": 0.9760408838118142, + "flos": 18989239939200.0, + "grad_norm": 1.641864888997356, + "language_loss": 0.71351594, + "learning_rate": 6.003689475888807e-09, + "loss": 0.73479629, + "num_input_tokens_seen": 350390590, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.65234375, + "step": 16234, + "time_per_iteration": 2.4427335262298584 + }, + { + "auxiliary_loss_clip": 0.01101418, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.01523733, + "balance_loss_mlp": 1.03317451, + "epoch": 0.9761010070644822, + "flos": 17125978763520.0, + "grad_norm": 2.664507778412489, + "language_loss": 0.79045486, + "learning_rate": 5.973573091493156e-09, + "loss": 0.8117435, + "num_input_tokens_seen": 350403770, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 16235, + "time_per_iteration": 2.3964405059814453 + }, + { + "auxiliary_loss_clip": 0.01098094, + "auxiliary_loss_mlp": 0.01032535, + "balance_loss_clip": 1.01980996, + "balance_loss_mlp": 1.03295994, + "epoch": 0.9761611303171501, + "flos": 22052599441920.0, + "grad_norm": 1.9655441572234078, + "language_loss": 0.76884139, + "learning_rate": 5.943532320779265e-09, + "loss": 0.79014766, + "num_input_tokens_seen": 350421870, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.65234375, + "step": 16236, + "time_per_iteration": 2.4456753730773926 + }, + { + "auxiliary_loss_clip": 0.01097739, + "auxiliary_loss_mlp": 0.01026086, + "balance_loss_clip": 1.0151124, + "balance_loss_mlp": 1.03347445, + "epoch": 0.9762212535698181, + "flos": 21757521214080.0, + "grad_norm": 1.6942809100848069, + "language_loss": 0.75669736, + "learning_rate": 5.913567164886446e-09, + "loss": 0.77793556, + "num_input_tokens_seen": 350440025, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 16237, + "time_per_iteration": 2.4447457790374756 + }, + { + "auxiliary_loss_clip": 0.01098982, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.01958013, + "balance_loss_mlp": 1.0322814, + "epoch": 0.9762813768224861, + "flos": 25921615046400.0, + "grad_norm": 1.7060238297486066, + "language_loss": 0.72860193, + "learning_rate": 5.8836776249509e-09, + "loss": 0.74991488, + "num_input_tokens_seen": 350459435, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.66796875, + "step": 16238, + "time_per_iteration": 2.4894490242004395 + }, + { + "auxiliary_loss_clip": 0.01099347, + "auxiliary_loss_mlp": 0.01028506, + "balance_loss_clip": 1.01638234, + "balance_loss_mlp": 1.03404987, + "epoch": 0.9763415000751541, + "flos": 24051853509120.0, + "grad_norm": 2.001677162599297, + "language_loss": 0.83721536, + "learning_rate": 5.8538637021063875e-09, + "loss": 0.85849392, + "num_input_tokens_seen": 350472655, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.65234375, + "step": 16239, + "time_per_iteration": 2.416748285293579 + }, + { + "auxiliary_loss_clip": 0.01100769, + "auxiliary_loss_mlp": 0.01029281, + "balance_loss_clip": 1.017277, + "balance_loss_mlp": 1.03516018, + "epoch": 0.976401623327822, + "flos": 17018677860480.0, + "grad_norm": 5.101623514128856, + "language_loss": 0.59312123, + "learning_rate": 5.824125397483115e-09, + "loss": 0.61442178, + "num_input_tokens_seen": 350488160, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 16240, + "time_per_iteration": 2.459441661834717 + }, + { + "auxiliary_loss_clip": 0.01099723, + "auxiliary_loss_mlp": 0.01028806, + "balance_loss_clip": 1.01747513, + "balance_loss_mlp": 1.03588104, + "epoch": 0.97646174658049, + "flos": 16106941918080.0, + "grad_norm": 1.7088989821507206, + "language_loss": 0.82588184, + "learning_rate": 5.7944627122088474e-09, + "loss": 0.84716713, + "num_input_tokens_seen": 350506065, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 16241, + "time_per_iteration": 2.4329097270965576 + }, + { + "auxiliary_loss_clip": 0.01099206, + "auxiliary_loss_mlp": 0.01030425, + "balance_loss_clip": 1.01948154, + "balance_loss_mlp": 1.03410602, + "epoch": 0.9765218698331579, + "flos": 21252725429760.0, + "grad_norm": 1.704721207895292, + "language_loss": 0.83693302, + "learning_rate": 5.764875647408463e-09, + "loss": 0.85822928, + "num_input_tokens_seen": 350524495, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6484375, + "step": 16242, + "time_per_iteration": 2.4511871337890625 + }, + { + "auxiliary_loss_clip": 0.01100525, + "auxiliary_loss_mlp": 0.01025666, + "balance_loss_clip": 1.01453757, + "balance_loss_mlp": 1.03459525, + "epoch": 0.9765819930858259, + "flos": 18588045957120.0, + "grad_norm": 1.7182851933408332, + "language_loss": 0.7538594, + "learning_rate": 5.7353642042037294e-09, + "loss": 0.77512127, + "num_input_tokens_seen": 350544185, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.66015625, + "step": 16243, + "time_per_iteration": 2.4299659729003906 + }, + { + "auxiliary_loss_clip": 0.01098903, + "auxiliary_loss_mlp": 0.01037217, + "balance_loss_clip": 1.02515912, + "balance_loss_mlp": 1.03315616, + "epoch": 0.976642116338494, + "flos": 20266833859200.0, + "grad_norm": 1.7738669375659515, + "language_loss": 0.69590539, + "learning_rate": 5.705928383713754e-09, + "loss": 0.71726656, + "num_input_tokens_seen": 350562675, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 16244, + "time_per_iteration": 2.440574884414673 + }, + { + "auxiliary_loss_clip": 0.01102847, + "auxiliary_loss_mlp": 0.01028651, + "balance_loss_clip": 1.01634228, + "balance_loss_mlp": 1.0365603, + "epoch": 0.9767022395911619, + "flos": 25550477769600.0, + "grad_norm": 1.8209615931858283, + "language_loss": 0.83484882, + "learning_rate": 5.676568187055197e-09, + "loss": 0.8561638, + "num_input_tokens_seen": 350581535, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6640625, + "step": 16245, + "time_per_iteration": 2.491964340209961 + }, + { + "auxiliary_loss_clip": 0.01096014, + "auxiliary_loss_mlp": 0.01025395, + "balance_loss_clip": 1.01445746, + "balance_loss_mlp": 1.03261781, + "epoch": 0.9767623628438299, + "flos": 21762656858880.0, + "grad_norm": 1.3301173974373028, + "language_loss": 0.78354228, + "learning_rate": 5.647283615340726e-09, + "loss": 0.80475634, + "num_input_tokens_seen": 350601615, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 16246, + "time_per_iteration": 2.439201831817627 + }, + { + "auxiliary_loss_clip": 0.01092456, + "auxiliary_loss_mlp": 0.01027534, + "balance_loss_clip": 1.01759207, + "balance_loss_mlp": 1.03315675, + "epoch": 0.9768224860964978, + "flos": 15851114277120.0, + "grad_norm": 1.4733048814539074, + "language_loss": 0.73865449, + "learning_rate": 5.6180746696812275e-09, + "loss": 0.75985444, + "num_input_tokens_seen": 350619580, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.59375, + "step": 16247, + "time_per_iteration": 2.414113759994507 + }, + { + "auxiliary_loss_clip": 0.01099436, + "auxiliary_loss_mlp": 0.01032501, + "balance_loss_clip": 1.02071714, + "balance_loss_mlp": 1.03369868, + "epoch": 0.9768826093491658, + "flos": 25151151294720.0, + "grad_norm": 1.5425114729506917, + "language_loss": 0.79912806, + "learning_rate": 5.58894135118404e-09, + "loss": 0.82044744, + "num_input_tokens_seen": 350640015, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 16248, + "time_per_iteration": 2.5048165321350098 + }, + { + "auxiliary_loss_clip": 0.01106324, + "auxiliary_loss_mlp": 0.01040654, + "balance_loss_clip": 1.02789831, + "balance_loss_mlp": 1.03783882, + "epoch": 0.9769427326018337, + "flos": 22967028904320.0, + "grad_norm": 2.093419696491283, + "language_loss": 0.79174924, + "learning_rate": 5.559883660954278e-09, + "loss": 0.81321901, + "num_input_tokens_seen": 350659155, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.68359375, + "step": 16249, + "time_per_iteration": 2.5967299938201904 + }, + { + "auxiliary_loss_clip": 0.01096074, + "auxiliary_loss_mlp": 0.01031147, + "balance_loss_clip": 1.01956034, + "balance_loss_mlp": 1.0337956, + "epoch": 0.9770028558545018, + "flos": 15264297786240.0, + "grad_norm": 2.0929836799102626, + "language_loss": 0.66912627, + "learning_rate": 5.530901600093507e-09, + "loss": 0.69039845, + "num_input_tokens_seen": 350676615, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.625, + "step": 16250, + "time_per_iteration": 2.4212594032287598 + }, + { + "auxiliary_loss_clip": 0.01021381, + "auxiliary_loss_mlp": 0.01003741, + "balance_loss_clip": 1.00278687, + "balance_loss_mlp": 1.00140762, + "epoch": 0.9770629791071697, + "flos": 71450348808960.0, + "grad_norm": 0.775802092014466, + "language_loss": 0.59881055, + "learning_rate": 5.501995169700846e-09, + "loss": 0.61906171, + "num_input_tokens_seen": 350736805, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 16251, + "time_per_iteration": 3.131605863571167 + }, + { + "auxiliary_loss_clip": 0.010985, + "auxiliary_loss_mlp": 0.0102849, + "balance_loss_clip": 1.01706934, + "balance_loss_mlp": 1.03328323, + "epoch": 0.9771231023598377, + "flos": 22412854897920.0, + "grad_norm": 1.8145393283670994, + "language_loss": 0.78657669, + "learning_rate": 5.473164370872307e-09, + "loss": 0.80784655, + "num_input_tokens_seen": 350753600, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16252, + "time_per_iteration": 2.491278886795044 + }, + { + "auxiliary_loss_clip": 0.0109682, + "auxiliary_loss_mlp": 0.01029378, + "balance_loss_clip": 1.01803493, + "balance_loss_mlp": 1.03293729, + "epoch": 0.9771832256125056, + "flos": 19025940660480.0, + "grad_norm": 2.687078327620969, + "language_loss": 0.64509666, + "learning_rate": 5.444409204701461e-09, + "loss": 0.66635859, + "num_input_tokens_seen": 350771225, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.640625, + "step": 16253, + "time_per_iteration": 2.489243507385254 + }, + { + "auxiliary_loss_clip": 0.01102295, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.0163399, + "balance_loss_mlp": 1.03592074, + "epoch": 0.9772433488651736, + "flos": 17822143232640.0, + "grad_norm": 1.9709127512699236, + "language_loss": 0.76572144, + "learning_rate": 5.415729672278324e-09, + "loss": 0.78703684, + "num_input_tokens_seen": 350789100, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.66015625, + "step": 16254, + "time_per_iteration": 2.4342849254608154 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.01031471, + "balance_loss_clip": 1.019485, + "balance_loss_mlp": 1.03458083, + "epoch": 0.9773034721178415, + "flos": 37629785623680.0, + "grad_norm": 1.9349490825165467, + "language_loss": 0.64068961, + "learning_rate": 5.387125774690471e-09, + "loss": 0.66202497, + "num_input_tokens_seen": 350811085, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 16255, + "time_per_iteration": 2.597590446472168 + }, + { + "auxiliary_loss_clip": 0.01103968, + "auxiliary_loss_mlp": 0.0103261, + "balance_loss_clip": 1.01989055, + "balance_loss_mlp": 1.03523302, + "epoch": 0.9773635953705095, + "flos": 20302457172480.0, + "grad_norm": 1.5707961835740387, + "language_loss": 0.75804067, + "learning_rate": 5.358597513023033e-09, + "loss": 0.77940643, + "num_input_tokens_seen": 350831065, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16256, + "time_per_iteration": 2.464634895324707 + }, + { + "auxiliary_loss_clip": 0.01098529, + "auxiliary_loss_mlp": 0.01031073, + "balance_loss_clip": 1.01894903, + "balance_loss_mlp": 1.03593969, + "epoch": 0.9774237186231776, + "flos": 22309253095680.0, + "grad_norm": 3.747088169064556, + "language_loss": 0.77749127, + "learning_rate": 5.330144888357369e-09, + "loss": 0.7987873, + "num_input_tokens_seen": 350849675, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.625, + "step": 16257, + "time_per_iteration": 2.530625820159912 + }, + { + "auxiliary_loss_clip": 0.0109999, + "auxiliary_loss_mlp": 0.01029963, + "balance_loss_clip": 1.0184536, + "balance_loss_mlp": 1.03522408, + "epoch": 0.9774838418758455, + "flos": 24204905360640.0, + "grad_norm": 1.717206349978081, + "language_loss": 0.75214601, + "learning_rate": 5.301767901772391e-09, + "loss": 0.77344555, + "num_input_tokens_seen": 350868955, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 16258, + "time_per_iteration": 2.5173375606536865 + }, + { + "auxiliary_loss_clip": 0.01021907, + "auxiliary_loss_mlp": 0.0100158, + "balance_loss_clip": 1.00060833, + "balance_loss_mlp": 1.00197577, + "epoch": 0.9775439651285135, + "flos": 66357139829760.0, + "grad_norm": 0.6857941213607871, + "language_loss": 0.59782362, + "learning_rate": 5.273466554344353e-09, + "loss": 0.61805856, + "num_input_tokens_seen": 350935110, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16259, + "time_per_iteration": 3.1181235313415527 + }, + { + "auxiliary_loss_clip": 0.01103425, + "auxiliary_loss_mlp": 0.01031298, + "balance_loss_clip": 1.01912701, + "balance_loss_mlp": 1.03543591, + "epoch": 0.9776040883811814, + "flos": 22601565976320.0, + "grad_norm": 1.7500488558402083, + "language_loss": 0.7345553, + "learning_rate": 5.2452408471461705e-09, + "loss": 0.75590253, + "num_input_tokens_seen": 350953220, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.6796875, + "step": 16260, + "time_per_iteration": 2.5194666385650635 + }, + { + "auxiliary_loss_clip": 0.01100608, + "auxiliary_loss_mlp": 0.01031989, + "balance_loss_clip": 1.02000213, + "balance_loss_mlp": 1.0345335, + "epoch": 0.9776642116338494, + "flos": 18442176825600.0, + "grad_norm": 1.9560228584534347, + "language_loss": 0.79390025, + "learning_rate": 5.2170907812485456e-09, + "loss": 0.81522614, + "num_input_tokens_seen": 350971915, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 16261, + "time_per_iteration": 2.494131088256836 + }, + { + "auxiliary_loss_clip": 0.01100642, + "auxiliary_loss_mlp": 0.0102413, + "balance_loss_clip": 1.01245975, + "balance_loss_mlp": 1.03458381, + "epoch": 0.9777243348865173, + "flos": 22638446265600.0, + "grad_norm": 2.38180088162508, + "language_loss": 0.74037927, + "learning_rate": 5.189016357718845e-09, + "loss": 0.76162702, + "num_input_tokens_seen": 350990470, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16262, + "time_per_iteration": 4.020437240600586 + }, + { + "auxiliary_loss_clip": 0.01101829, + "auxiliary_loss_mlp": 0.01029855, + "balance_loss_clip": 1.01671231, + "balance_loss_mlp": 1.03477502, + "epoch": 0.9777844581391854, + "flos": 31321394605440.0, + "grad_norm": 2.209424338913731, + "language_loss": 0.700001, + "learning_rate": 5.16101757762133e-09, + "loss": 0.72131789, + "num_input_tokens_seen": 351010755, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.671875, + "step": 16263, + "time_per_iteration": 2.6097006797790527 + }, + { + "auxiliary_loss_clip": 0.01100862, + "auxiliary_loss_mlp": 0.01026256, + "balance_loss_clip": 1.01556909, + "balance_loss_mlp": 1.03478503, + "epoch": 0.9778445813918533, + "flos": 23039101543680.0, + "grad_norm": 1.690080180410736, + "language_loss": 0.66416574, + "learning_rate": 5.133094442018038e-09, + "loss": 0.6854369, + "num_input_tokens_seen": 351029965, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.66015625, + "step": 16264, + "time_per_iteration": 5.414909362792969 + }, + { + "auxiliary_loss_clip": 0.01104855, + "auxiliary_loss_mlp": 0.01032168, + "balance_loss_clip": 1.018942, + "balance_loss_mlp": 1.03560305, + "epoch": 0.9779047046445213, + "flos": 17566351505280.0, + "grad_norm": 2.017252489595847, + "language_loss": 0.72986895, + "learning_rate": 5.105246951967679e-09, + "loss": 0.75123918, + "num_input_tokens_seen": 351046205, + "router_z_loss_clip": 0.13183594, + "router_z_loss_mlp": 0.69140625, + "step": 16265, + "time_per_iteration": 2.4533677101135254 + }, + { + "auxiliary_loss_clip": 0.01095698, + "auxiliary_loss_mlp": 0.01029327, + "balance_loss_clip": 1.01807976, + "balance_loss_mlp": 1.03230691, + "epoch": 0.9779648278971892, + "flos": 20741141975040.0, + "grad_norm": 1.8239082705051328, + "language_loss": 0.68785274, + "learning_rate": 5.077475108526297e-09, + "loss": 0.70910293, + "num_input_tokens_seen": 351065390, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 16266, + "time_per_iteration": 2.5144505500793457 + }, + { + "auxiliary_loss_clip": 0.01096607, + "auxiliary_loss_mlp": 0.01028265, + "balance_loss_clip": 1.01792407, + "balance_loss_mlp": 1.03445101, + "epoch": 0.9780249511498572, + "flos": 21026954494080.0, + "grad_norm": 1.6346367496457415, + "language_loss": 0.86829478, + "learning_rate": 5.049778912747049e-09, + "loss": 0.88954347, + "num_input_tokens_seen": 351084355, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.62109375, + "step": 16267, + "time_per_iteration": 2.467357635498047 + }, + { + "auxiliary_loss_clip": 0.0110114, + "auxiliary_loss_mlp": 0.01027299, + "balance_loss_clip": 1.01483595, + "balance_loss_mlp": 1.03381491, + "epoch": 0.9780850744025251, + "flos": 30774223751040.0, + "grad_norm": 1.7912126481892603, + "language_loss": 0.70019847, + "learning_rate": 5.022158365679985e-09, + "loss": 0.72148287, + "num_input_tokens_seen": 351105870, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.671875, + "step": 16268, + "time_per_iteration": 2.611461639404297 + }, + { + "auxiliary_loss_clip": 0.01100318, + "auxiliary_loss_mlp": 0.01025392, + "balance_loss_clip": 1.01431131, + "balance_loss_mlp": 1.03440547, + "epoch": 0.9781451976551931, + "flos": 20302995876480.0, + "grad_norm": 1.8029387675380926, + "language_loss": 0.73841709, + "learning_rate": 4.994613468372711e-09, + "loss": 0.75967419, + "num_input_tokens_seen": 351124760, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.66015625, + "step": 16269, + "time_per_iteration": 2.492509365081787 + }, + { + "auxiliary_loss_clip": 0.01100361, + "auxiliary_loss_mlp": 0.01027825, + "balance_loss_clip": 1.0153085, + "balance_loss_mlp": 1.03405917, + "epoch": 0.9782053209078612, + "flos": 24316479982080.0, + "grad_norm": 1.7343347922609937, + "language_loss": 0.70707202, + "learning_rate": 4.967144221869501e-09, + "loss": 0.72835386, + "num_input_tokens_seen": 351142820, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6640625, + "step": 16270, + "time_per_iteration": 4.16719651222229 + }, + { + "auxiliary_loss_clip": 0.01100199, + "auxiliary_loss_mlp": 0.01030147, + "balance_loss_clip": 1.0187391, + "balance_loss_mlp": 1.0348208, + "epoch": 0.9782654441605291, + "flos": 32489425065600.0, + "grad_norm": 1.6302831298633103, + "language_loss": 0.63994282, + "learning_rate": 4.939750627212191e-09, + "loss": 0.6612463, + "num_input_tokens_seen": 351164805, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16271, + "time_per_iteration": 2.630716562271118 + }, + { + "auxiliary_loss_clip": 0.01097487, + "auxiliary_loss_mlp": 0.0102891, + "balance_loss_clip": 1.01719773, + "balance_loss_mlp": 1.03479195, + "epoch": 0.9783255674131971, + "flos": 26979076465920.0, + "grad_norm": 1.4434562656033578, + "language_loss": 0.70372558, + "learning_rate": 4.912432685439505e-09, + "loss": 0.72498953, + "num_input_tokens_seen": 351187005, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.62890625, + "step": 16272, + "time_per_iteration": 2.5594773292541504 + }, + { + "auxiliary_loss_clip": 0.0110247, + "auxiliary_loss_mlp": 0.01033208, + "balance_loss_clip": 1.02101922, + "balance_loss_mlp": 1.03539801, + "epoch": 0.978385690665865, + "flos": 23112251591040.0, + "grad_norm": 2.7207036655043733, + "language_loss": 0.66597646, + "learning_rate": 4.88519039758728e-09, + "loss": 0.68733323, + "num_input_tokens_seen": 351208450, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 16273, + "time_per_iteration": 2.5560595989227295 + }, + { + "auxiliary_loss_clip": 0.01099094, + "auxiliary_loss_mlp": 0.01023905, + "balance_loss_clip": 1.01173985, + "balance_loss_mlp": 1.03361404, + "epoch": 0.978445813918533, + "flos": 25409672455680.0, + "grad_norm": 1.8374122302756553, + "language_loss": 0.74000204, + "learning_rate": 4.85802376468869e-09, + "loss": 0.76123202, + "num_input_tokens_seen": 351229585, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65625, + "step": 16274, + "time_per_iteration": 2.5694611072540283 + }, + { + "auxiliary_loss_clip": 0.01101012, + "auxiliary_loss_mlp": 0.01029244, + "balance_loss_clip": 1.01815104, + "balance_loss_mlp": 1.03633726, + "epoch": 0.9785059371712009, + "flos": 23550218121600.0, + "grad_norm": 1.7449518961905144, + "language_loss": 0.7771135, + "learning_rate": 4.830932787773579e-09, + "loss": 0.79841614, + "num_input_tokens_seen": 351249525, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 16275, + "time_per_iteration": 2.5744869709014893 + }, + { + "auxiliary_loss_clip": 0.0110169, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.01628375, + "balance_loss_mlp": 1.03521442, + "epoch": 0.978566060423869, + "flos": 34351177870080.0, + "grad_norm": 1.4726802158178436, + "language_loss": 0.70957249, + "learning_rate": 4.803917467869567e-09, + "loss": 0.73087335, + "num_input_tokens_seen": 351272530, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 16276, + "time_per_iteration": 2.72546648979187 + }, + { + "auxiliary_loss_clip": 0.01095364, + "auxiliary_loss_mlp": 0.01029867, + "balance_loss_clip": 1.01873851, + "balance_loss_mlp": 1.03249002, + "epoch": 0.9786261836765369, + "flos": 11618862387840.0, + "grad_norm": 1.8218394979557164, + "language_loss": 0.859927, + "learning_rate": 4.776977806000726e-09, + "loss": 0.88117933, + "num_input_tokens_seen": 351288530, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.62890625, + "step": 16277, + "time_per_iteration": 2.542083740234375 + }, + { + "auxiliary_loss_clip": 0.01098208, + "auxiliary_loss_mlp": 0.01026865, + "balance_loss_clip": 1.01536143, + "balance_loss_mlp": 1.0346185, + "epoch": 0.9786863069292049, + "flos": 17420949250560.0, + "grad_norm": 1.852013929689249, + "language_loss": 0.70972097, + "learning_rate": 4.7501138031891264e-09, + "loss": 0.73097163, + "num_input_tokens_seen": 351305890, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6328125, + "step": 16278, + "time_per_iteration": 2.493483066558838 + }, + { + "auxiliary_loss_clip": 0.01096145, + "auxiliary_loss_mlp": 0.01027994, + "balance_loss_clip": 1.01613855, + "balance_loss_mlp": 1.03165531, + "epoch": 0.9787464301818728, + "flos": 20844923345280.0, + "grad_norm": 1.8162844777370935, + "language_loss": 0.84460557, + "learning_rate": 4.723325460453065e-09, + "loss": 0.86584687, + "num_input_tokens_seen": 351325010, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.64453125, + "step": 16279, + "time_per_iteration": 2.6132097244262695 + }, + { + "auxiliary_loss_clip": 0.01098514, + "auxiliary_loss_mlp": 0.01029625, + "balance_loss_clip": 1.01753092, + "balance_loss_mlp": 1.03278434, + "epoch": 0.9788065534345408, + "flos": 18222942165120.0, + "grad_norm": 1.8612686985344382, + "language_loss": 0.78869414, + "learning_rate": 4.696612778808395e-09, + "loss": 0.8099755, + "num_input_tokens_seen": 351343060, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65625, + "step": 16280, + "time_per_iteration": 2.5324976444244385 + }, + { + "auxiliary_loss_clip": 0.01096797, + "auxiliary_loss_mlp": 0.01031557, + "balance_loss_clip": 1.02069163, + "balance_loss_mlp": 1.03460717, + "epoch": 0.9788666766872087, + "flos": 21578219498880.0, + "grad_norm": 1.7034724956942773, + "language_loss": 0.7950545, + "learning_rate": 4.669975759268085e-09, + "loss": 0.81633806, + "num_input_tokens_seen": 351363260, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.62109375, + "step": 16281, + "time_per_iteration": 2.543025493621826 + }, + { + "auxiliary_loss_clip": 0.01099177, + "auxiliary_loss_mlp": 0.01030593, + "balance_loss_clip": 1.01827884, + "balance_loss_mlp": 1.03329802, + "epoch": 0.9789267999398767, + "flos": 24900495212160.0, + "grad_norm": 1.6099254109579124, + "language_loss": 0.80462193, + "learning_rate": 4.643414402842216e-09, + "loss": 0.82591969, + "num_input_tokens_seen": 351382610, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 16282, + "time_per_iteration": 2.525231122970581 + }, + { + "auxiliary_loss_clip": 0.01100012, + "auxiliary_loss_mlp": 0.0103584, + "balance_loss_clip": 1.02403867, + "balance_loss_mlp": 1.03417039, + "epoch": 0.9789869231925448, + "flos": 19573111514880.0, + "grad_norm": 2.3091950927100813, + "language_loss": 0.83399373, + "learning_rate": 4.616928710538204e-09, + "loss": 0.85535228, + "num_input_tokens_seen": 351401075, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65625, + "step": 16283, + "time_per_iteration": 2.5172736644744873 + }, + { + "auxiliary_loss_clip": 0.01098196, + "auxiliary_loss_mlp": 0.01031053, + "balance_loss_clip": 1.01948416, + "balance_loss_mlp": 1.03322864, + "epoch": 0.9790470464452127, + "flos": 16796641939200.0, + "grad_norm": 1.8868446346869174, + "language_loss": 0.7178874, + "learning_rate": 4.590518683360134e-09, + "loss": 0.73917985, + "num_input_tokens_seen": 351419275, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6484375, + "step": 16284, + "time_per_iteration": 2.4635121822357178 + }, + { + "auxiliary_loss_clip": 0.01098539, + "auxiliary_loss_mlp": 0.01034979, + "balance_loss_clip": 1.02436984, + "balance_loss_mlp": 1.03568172, + "epoch": 0.9791071696978807, + "flos": 18369350000640.0, + "grad_norm": 1.8562252978598333, + "language_loss": 0.64642346, + "learning_rate": 4.56418432230965e-09, + "loss": 0.66775858, + "num_input_tokens_seen": 351437375, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.62890625, + "step": 16285, + "time_per_iteration": 2.4629716873168945 + }, + { + "auxiliary_loss_clip": 0.0109894, + "auxiliary_loss_mlp": 0.0103082, + "balance_loss_clip": 1.01941144, + "balance_loss_mlp": 1.03462458, + "epoch": 0.9791672929505486, + "flos": 24170323541760.0, + "grad_norm": 1.979309410905623, + "language_loss": 0.70627666, + "learning_rate": 4.537925628385286e-09, + "loss": 0.72757423, + "num_input_tokens_seen": 351457810, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 16286, + "time_per_iteration": 2.519150972366333 + }, + { + "auxiliary_loss_clip": 0.01095816, + "auxiliary_loss_mlp": 0.01030402, + "balance_loss_clip": 1.01952446, + "balance_loss_mlp": 1.03219485, + "epoch": 0.9792274162032166, + "flos": 24354114456960.0, + "grad_norm": 1.38347830602051, + "language_loss": 0.58299065, + "learning_rate": 4.511742602582691e-09, + "loss": 0.60425282, + "num_input_tokens_seen": 351478825, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 16287, + "time_per_iteration": 2.5372016429901123 + }, + { + "auxiliary_loss_clip": 0.01100034, + "auxiliary_loss_mlp": 0.01033961, + "balance_loss_clip": 1.02208781, + "balance_loss_mlp": 1.03487289, + "epoch": 0.9792875394558845, + "flos": 26395779507840.0, + "grad_norm": 1.7701406115909017, + "language_loss": 0.81316799, + "learning_rate": 4.485635245894626e-09, + "loss": 0.83450794, + "num_input_tokens_seen": 351498785, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.65234375, + "step": 16288, + "time_per_iteration": 2.513692617416382 + }, + { + "auxiliary_loss_clip": 0.01098614, + "auxiliary_loss_mlp": 0.01024558, + "balance_loss_clip": 1.01289308, + "balance_loss_mlp": 1.03259087, + "epoch": 0.9793476627085526, + "flos": 28148004766080.0, + "grad_norm": 1.490724806273456, + "language_loss": 0.71809161, + "learning_rate": 4.459603559311631e-09, + "loss": 0.73932338, + "num_input_tokens_seen": 351520235, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.66015625, + "step": 16289, + "time_per_iteration": 2.5497584342956543 + }, + { + "auxiliary_loss_clip": 0.01099152, + "auxiliary_loss_mlp": 0.01036871, + "balance_loss_clip": 1.02502751, + "balance_loss_mlp": 1.03522253, + "epoch": 0.9794077859612205, + "flos": 16763927627520.0, + "grad_norm": 2.421114759913103, + "language_loss": 0.7523073, + "learning_rate": 4.43364754382003e-09, + "loss": 0.77366757, + "num_input_tokens_seen": 351538900, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 16290, + "time_per_iteration": 2.478057861328125 + }, + { + "auxiliary_loss_clip": 0.01100685, + "auxiliary_loss_mlp": 0.01031841, + "balance_loss_clip": 1.01924038, + "balance_loss_mlp": 1.03389645, + "epoch": 0.9794679092138885, + "flos": 19280834547840.0, + "grad_norm": 1.6419877933765765, + "language_loss": 0.67298269, + "learning_rate": 4.4077672004048105e-09, + "loss": 0.69430792, + "num_input_tokens_seen": 351558715, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66796875, + "step": 16291, + "time_per_iteration": 2.508269786834717 + }, + { + "auxiliary_loss_clip": 0.01102554, + "auxiliary_loss_mlp": 0.01028221, + "balance_loss_clip": 1.01628256, + "balance_loss_mlp": 1.03450608, + "epoch": 0.9795280324665564, + "flos": 32156640535680.0, + "grad_norm": 1.7353074100910213, + "language_loss": 0.62683344, + "learning_rate": 4.3819625300467456e-09, + "loss": 0.64814121, + "num_input_tokens_seen": 351578450, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 16292, + "time_per_iteration": 2.581599235534668 + }, + { + "auxiliary_loss_clip": 0.01101072, + "auxiliary_loss_mlp": 0.01032017, + "balance_loss_clip": 1.02097225, + "balance_loss_mlp": 1.03556764, + "epoch": 0.9795881557192244, + "flos": 19060953442560.0, + "grad_norm": 2.0682160456993226, + "language_loss": 0.73132885, + "learning_rate": 4.356233533724829e-09, + "loss": 0.75265968, + "num_input_tokens_seen": 351597195, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 16293, + "time_per_iteration": 2.483751058578491 + }, + { + "auxiliary_loss_clip": 0.01100266, + "auxiliary_loss_mlp": 0.01027342, + "balance_loss_clip": 1.01558185, + "balance_loss_mlp": 1.03350306, + "epoch": 0.9796482789718923, + "flos": 28329928174080.0, + "grad_norm": 2.0559306956335948, + "language_loss": 0.83788204, + "learning_rate": 4.330580212414503e-09, + "loss": 0.85915816, + "num_input_tokens_seen": 351617460, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.66796875, + "step": 16294, + "time_per_iteration": 2.550323009490967 + }, + { + "auxiliary_loss_clip": 0.01095885, + "auxiliary_loss_mlp": 0.01028893, + "balance_loss_clip": 1.01821804, + "balance_loss_mlp": 1.03391075, + "epoch": 0.9797084022245603, + "flos": 17967976450560.0, + "grad_norm": 2.290419249779841, + "language_loss": 0.71717238, + "learning_rate": 4.305002567088767e-09, + "loss": 0.73842019, + "num_input_tokens_seen": 351635900, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6171875, + "step": 16295, + "time_per_iteration": 2.4508378505706787 + }, + { + "auxiliary_loss_clip": 0.01105244, + "auxiliary_loss_mlp": 0.0103715, + "balance_loss_clip": 1.02547288, + "balance_loss_mlp": 1.03634858, + "epoch": 0.9797685254772284, + "flos": 20266726118400.0, + "grad_norm": 1.6649015681944959, + "language_loss": 0.80663395, + "learning_rate": 4.2795005987170674e-09, + "loss": 0.82805789, + "num_input_tokens_seen": 351655400, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6875, + "step": 16296, + "time_per_iteration": 2.5006003379821777 + }, + { + "auxiliary_loss_clip": 0.01096989, + "auxiliary_loss_mlp": 0.01032481, + "balance_loss_clip": 1.02125096, + "balance_loss_mlp": 1.03309369, + "epoch": 0.9798286487298963, + "flos": 26907147480960.0, + "grad_norm": 1.7126573036341362, + "language_loss": 0.75474179, + "learning_rate": 4.254074308266853e-09, + "loss": 0.77603638, + "num_input_tokens_seen": 351675505, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.640625, + "step": 16297, + "time_per_iteration": 2.536893844604492 + }, + { + "auxiliary_loss_clip": 0.01102165, + "auxiliary_loss_mlp": 0.01034134, + "balance_loss_clip": 1.02261257, + "balance_loss_mlp": 1.03400278, + "epoch": 0.9798887719825643, + "flos": 27161071701120.0, + "grad_norm": 1.6213586947116383, + "language_loss": 0.78397214, + "learning_rate": 4.228723696702019e-09, + "loss": 0.80533516, + "num_input_tokens_seen": 351697920, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6796875, + "step": 16298, + "time_per_iteration": 2.525702953338623 + }, + { + "auxiliary_loss_clip": 0.01094464, + "auxiliary_loss_mlp": 0.01024465, + "balance_loss_clip": 1.01362884, + "balance_loss_mlp": 1.03258538, + "epoch": 0.9799488952352322, + "flos": 20668422890880.0, + "grad_norm": 1.4638188813410706, + "language_loss": 0.72470737, + "learning_rate": 4.203448764984019e-09, + "loss": 0.7458967, + "num_input_tokens_seen": 351717615, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6171875, + "step": 16299, + "time_per_iteration": 2.480396032333374 + }, + { + "auxiliary_loss_clip": 0.01100875, + "auxiliary_loss_mlp": 0.01032194, + "balance_loss_clip": 1.01994538, + "balance_loss_mlp": 1.03388453, + "epoch": 0.9800090184879002, + "flos": 21981209160960.0, + "grad_norm": 1.9941262161166102, + "language_loss": 0.89518666, + "learning_rate": 4.178249514071419e-09, + "loss": 0.91651738, + "num_input_tokens_seen": 351735260, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 16300, + "time_per_iteration": 2.4887144565582275 + }, + { + "auxiliary_loss_clip": 0.01101113, + "auxiliary_loss_mlp": 0.01029738, + "balance_loss_clip": 1.01779962, + "balance_loss_mlp": 1.03375816, + "epoch": 0.9800691417405681, + "flos": 21288420570240.0, + "grad_norm": 2.0314895800326758, + "language_loss": 0.77960867, + "learning_rate": 4.1531259449194555e-09, + "loss": 0.80091715, + "num_input_tokens_seen": 351755800, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.671875, + "step": 16301, + "time_per_iteration": 2.5138540267944336 + }, + { + "auxiliary_loss_clip": 0.01100158, + "auxiliary_loss_mlp": 0.01035508, + "balance_loss_clip": 1.02306259, + "balance_loss_mlp": 1.03404641, + "epoch": 0.9801292649932362, + "flos": 18439878355200.0, + "grad_norm": 2.000398501552176, + "language_loss": 0.75482309, + "learning_rate": 4.128078058480921e-09, + "loss": 0.77617979, + "num_input_tokens_seen": 351774790, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6640625, + "step": 16302, + "time_per_iteration": 2.4900062084198 + }, + { + "auxiliary_loss_clip": 0.01098806, + "auxiliary_loss_mlp": 0.01028927, + "balance_loss_clip": 1.01688099, + "balance_loss_mlp": 1.03401518, + "epoch": 0.9801893882459041, + "flos": 25046364343680.0, + "grad_norm": 1.6848878091694153, + "language_loss": 0.79394841, + "learning_rate": 4.103105855705724e-09, + "loss": 0.81522572, + "num_input_tokens_seen": 351792855, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6484375, + "step": 16303, + "time_per_iteration": 4.0379838943481445 + }, + { + "auxiliary_loss_clip": 0.01102546, + "auxiliary_loss_mlp": 0.01030204, + "balance_loss_clip": 1.01812756, + "balance_loss_mlp": 1.03442729, + "epoch": 0.9802495114985721, + "flos": 18511484117760.0, + "grad_norm": 2.015058455965645, + "language_loss": 0.82887793, + "learning_rate": 4.078209337540883e-09, + "loss": 0.85020542, + "num_input_tokens_seen": 351811450, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6796875, + "step": 16304, + "time_per_iteration": 2.5293853282928467 + }, + { + "auxiliary_loss_clip": 0.01095069, + "auxiliary_loss_mlp": 0.01026967, + "balance_loss_clip": 1.0165664, + "balance_loss_mlp": 1.03351498, + "epoch": 0.98030963475124, + "flos": 21469841187840.0, + "grad_norm": 1.8806572396287222, + "language_loss": 0.70294923, + "learning_rate": 4.053388504930089e-09, + "loss": 0.72416955, + "num_input_tokens_seen": 351831960, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6171875, + "step": 16305, + "time_per_iteration": 3.9920012950897217 + }, + { + "auxiliary_loss_clip": 0.01101609, + "auxiliary_loss_mlp": 0.01030105, + "balance_loss_clip": 1.01850629, + "balance_loss_mlp": 1.03499484, + "epoch": 0.980369758003908, + "flos": 20412272027520.0, + "grad_norm": 2.542876871636166, + "language_loss": 0.71830386, + "learning_rate": 4.028643358815032e-09, + "loss": 0.73962104, + "num_input_tokens_seen": 351851585, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66796875, + "step": 16306, + "time_per_iteration": 3.8759777545928955 + }, + { + "auxiliary_loss_clip": 0.01094312, + "auxiliary_loss_mlp": 0.01028834, + "balance_loss_clip": 1.01834977, + "balance_loss_mlp": 1.03180003, + "epoch": 0.9804298812565759, + "flos": 23399177431680.0, + "grad_norm": 1.6763796973864105, + "language_loss": 0.73249525, + "learning_rate": 4.00397390013385e-09, + "loss": 0.75372672, + "num_input_tokens_seen": 351871085, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.625, + "step": 16307, + "time_per_iteration": 2.4559872150421143 + }, + { + "auxiliary_loss_clip": 0.01094645, + "auxiliary_loss_mlp": 0.01030867, + "balance_loss_clip": 1.02071619, + "balance_loss_mlp": 1.03398371, + "epoch": 0.980490004509244, + "flos": 23292666627840.0, + "grad_norm": 1.4089016879713172, + "language_loss": 0.74952251, + "learning_rate": 3.979380129822018e-09, + "loss": 0.77077764, + "num_input_tokens_seen": 351891775, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.609375, + "step": 16308, + "time_per_iteration": 2.4483864307403564 + }, + { + "auxiliary_loss_clip": 0.01021734, + "auxiliary_loss_mlp": 0.01000005, + "balance_loss_clip": 0.99902195, + "balance_loss_mlp": 1.00172949, + "epoch": 0.980550127761912, + "flos": 56051027798400.0, + "grad_norm": 0.75357779305897, + "language_loss": 0.5785439, + "learning_rate": 3.954862048811902e-09, + "loss": 0.59876132, + "num_input_tokens_seen": 351946770, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.20019531, + "step": 16309, + "time_per_iteration": 2.9689579010009766 + }, + { + "auxiliary_loss_clip": 0.01098952, + "auxiliary_loss_mlp": 0.0103338, + "balance_loss_clip": 1.02141714, + "balance_loss_mlp": 1.0326581, + "epoch": 0.9806102510145799, + "flos": 25333290184320.0, + "grad_norm": 1.6693234656111071, + "language_loss": 0.6591835, + "learning_rate": 3.930419658033646e-09, + "loss": 0.68050683, + "num_input_tokens_seen": 351966155, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6640625, + "step": 16310, + "time_per_iteration": 2.468170642852783 + }, + { + "auxiliary_loss_clip": 0.01021706, + "auxiliary_loss_mlp": 0.0100009, + "balance_loss_clip": 0.99913657, + "balance_loss_mlp": 1.00166667, + "epoch": 0.9806703742672479, + "flos": 67274837429760.0, + "grad_norm": 0.8200525059067886, + "language_loss": 0.54590946, + "learning_rate": 3.906052958413841e-09, + "loss": 0.56612742, + "num_input_tokens_seen": 352031655, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.20019531, + "step": 16311, + "time_per_iteration": 4.64594030380249 + }, + { + "auxiliary_loss_clip": 0.01098424, + "auxiliary_loss_mlp": 0.01023662, + "balance_loss_clip": 1.01312995, + "balance_loss_mlp": 1.03379786, + "epoch": 0.9807304975199158, + "flos": 25228970110080.0, + "grad_norm": 1.5926022897704035, + "language_loss": 0.7984302, + "learning_rate": 3.881761950876638e-09, + "loss": 0.81965107, + "num_input_tokens_seen": 352051920, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 16312, + "time_per_iteration": 2.4821081161499023 + }, + { + "auxiliary_loss_clip": 0.01097906, + "auxiliary_loss_mlp": 0.01026927, + "balance_loss_clip": 1.01600158, + "balance_loss_mlp": 1.03469324, + "epoch": 0.9807906207725838, + "flos": 17456392995840.0, + "grad_norm": 1.9862258679310378, + "language_loss": 0.62852752, + "learning_rate": 3.8575466363430785e-09, + "loss": 0.64977586, + "num_input_tokens_seen": 352069315, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.6328125, + "step": 16313, + "time_per_iteration": 2.4287753105163574 + }, + { + "auxiliary_loss_clip": 0.01098817, + "auxiliary_loss_mlp": 0.01028705, + "balance_loss_clip": 1.01709437, + "balance_loss_mlp": 1.0344764, + "epoch": 0.9808507440252517, + "flos": 21032413361280.0, + "grad_norm": 1.8027464664706034, + "language_loss": 0.72543561, + "learning_rate": 3.833407015731316e-09, + "loss": 0.7467109, + "num_input_tokens_seen": 352089480, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.64453125, + "step": 16314, + "time_per_iteration": 2.450726270675659 + }, + { + "auxiliary_loss_clip": 0.01021599, + "auxiliary_loss_mlp": 0.01000108, + "balance_loss_clip": 0.99912471, + "balance_loss_mlp": 1.0017128, + "epoch": 0.9809108672779198, + "flos": 64044491598720.0, + "grad_norm": 0.6974943747069026, + "language_loss": 0.51689386, + "learning_rate": 3.80934308995684e-09, + "loss": 0.53711092, + "num_input_tokens_seen": 352150000, + "router_z_loss_clip": 0.00982666, + "router_z_loss_mlp": 0.19921875, + "step": 16315, + "time_per_iteration": 3.039893388748169 + }, + { + "auxiliary_loss_clip": 0.01097985, + "auxiliary_loss_mlp": 0.01028059, + "balance_loss_clip": 1.01716948, + "balance_loss_mlp": 1.03282857, + "epoch": 0.9809709905305877, + "flos": 22780616296320.0, + "grad_norm": 1.702080149406472, + "language_loss": 0.69737405, + "learning_rate": 3.785354859932033e-09, + "loss": 0.71863449, + "num_input_tokens_seen": 352170990, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.65234375, + "step": 16316, + "time_per_iteration": 2.518357038497925 + }, + { + "auxiliary_loss_clip": 0.01100266, + "auxiliary_loss_mlp": 0.01026664, + "balance_loss_clip": 1.01516604, + "balance_loss_mlp": 1.03365529, + "epoch": 0.9810311137832557, + "flos": 37013415217920.0, + "grad_norm": 1.87109155525106, + "language_loss": 0.55548424, + "learning_rate": 3.76144232656661e-09, + "loss": 0.57675356, + "num_input_tokens_seen": 352195335, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16317, + "time_per_iteration": 2.6049306392669678 + }, + { + "auxiliary_loss_clip": 0.01098549, + "auxiliary_loss_mlp": 0.01027948, + "balance_loss_clip": 1.01721954, + "balance_loss_mlp": 1.03464651, + "epoch": 0.9810912370359236, + "flos": 18916305373440.0, + "grad_norm": 1.5355444284157869, + "language_loss": 0.73103517, + "learning_rate": 3.737605490767404e-09, + "loss": 0.75230014, + "num_input_tokens_seen": 352214170, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 16318, + "time_per_iteration": 2.5125892162323 + }, + { + "auxiliary_loss_clip": 0.01096692, + "auxiliary_loss_mlp": 0.01025119, + "balance_loss_clip": 1.01424098, + "balance_loss_mlp": 1.03356123, + "epoch": 0.9811513602885916, + "flos": 18441602208000.0, + "grad_norm": 2.1192338472210173, + "language_loss": 0.82084936, + "learning_rate": 3.7138443534383555e-09, + "loss": 0.84206748, + "num_input_tokens_seen": 352231470, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 16319, + "time_per_iteration": 2.447026014328003 + }, + { + "auxiliary_loss_clip": 0.01021624, + "auxiliary_loss_mlp": 0.01003034, + "balance_loss_clip": 1.00209188, + "balance_loss_mlp": 1.00163507, + "epoch": 0.9812114835412595, + "flos": 68058945371520.0, + "grad_norm": 0.7434937814270395, + "language_loss": 0.53610063, + "learning_rate": 3.6901589154803014e-09, + "loss": 0.55634713, + "num_input_tokens_seen": 352291770, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20019531, + "step": 16320, + "time_per_iteration": 2.943744659423828 + }, + { + "auxiliary_loss_clip": 0.01099346, + "auxiliary_loss_mlp": 0.01031389, + "balance_loss_clip": 1.01992095, + "balance_loss_mlp": 1.03373194, + "epoch": 0.9812716067939276, + "flos": 25373007648000.0, + "grad_norm": 6.500748558217768, + "language_loss": 0.73322588, + "learning_rate": 3.6665491777914116e-09, + "loss": 0.75453323, + "num_input_tokens_seen": 352310735, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65625, + "step": 16321, + "time_per_iteration": 2.4934024810791016 + }, + { + "auxiliary_loss_clip": 0.01100443, + "auxiliary_loss_mlp": 0.01031922, + "balance_loss_clip": 1.02043045, + "balance_loss_mlp": 1.03733802, + "epoch": 0.9813317300465956, + "flos": 22856818999680.0, + "grad_norm": 2.271671638374391, + "language_loss": 0.78664875, + "learning_rate": 3.6430151412669698e-09, + "loss": 0.80797231, + "num_input_tokens_seen": 352329545, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6328125, + "step": 16322, + "time_per_iteration": 2.46575927734375 + }, + { + "auxiliary_loss_clip": 0.01097688, + "auxiliary_loss_mlp": 0.010344, + "balance_loss_clip": 1.0227654, + "balance_loss_mlp": 1.03237772, + "epoch": 0.9813918532992635, + "flos": 23586954756480.0, + "grad_norm": 1.5064148787884066, + "language_loss": 0.80583704, + "learning_rate": 3.619556806799595e-09, + "loss": 0.82715797, + "num_input_tokens_seen": 352352080, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16323, + "time_per_iteration": 2.514381170272827 + }, + { + "auxiliary_loss_clip": 0.01101495, + "auxiliary_loss_mlp": 0.01030439, + "balance_loss_clip": 1.0194416, + "balance_loss_mlp": 1.03495967, + "epoch": 0.9814519765519315, + "flos": 19606328616960.0, + "grad_norm": 2.143238321382065, + "language_loss": 0.8492884, + "learning_rate": 3.596174175278799e-09, + "loss": 0.87060773, + "num_input_tokens_seen": 352366455, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6640625, + "step": 16324, + "time_per_iteration": 2.4203484058380127 + }, + { + "auxiliary_loss_clip": 0.01099194, + "auxiliary_loss_mlp": 0.01029615, + "balance_loss_clip": 1.01754546, + "balance_loss_mlp": 1.03411317, + "epoch": 0.9815120998045994, + "flos": 33946284787200.0, + "grad_norm": 1.4738035008515573, + "language_loss": 0.74333966, + "learning_rate": 3.5728672475909827e-09, + "loss": 0.76462775, + "num_input_tokens_seen": 352386090, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.65234375, + "step": 16325, + "time_per_iteration": 2.67201828956604 + }, + { + "auxiliary_loss_clip": 0.01094665, + "auxiliary_loss_mlp": 0.01031985, + "balance_loss_clip": 1.02152395, + "balance_loss_mlp": 1.03282595, + "epoch": 0.9815722230572674, + "flos": 20850023076480.0, + "grad_norm": 1.6682468721270072, + "language_loss": 0.76755691, + "learning_rate": 3.5496360246201063e-09, + "loss": 0.78882343, + "num_input_tokens_seen": 352404000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.6171875, + "step": 16326, + "time_per_iteration": 2.4386472702026367 + }, + { + "auxiliary_loss_clip": 0.01101179, + "auxiliary_loss_mlp": 0.01025364, + "balance_loss_clip": 1.01325238, + "balance_loss_mlp": 1.0354383, + "epoch": 0.9816323463099353, + "flos": 22894525301760.0, + "grad_norm": 1.7666613399101891, + "language_loss": 0.67005306, + "learning_rate": 3.5264805072470205e-09, + "loss": 0.69131851, + "num_input_tokens_seen": 352423540, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66015625, + "step": 16327, + "time_per_iteration": 2.5084118843078613 + }, + { + "auxiliary_loss_clip": 0.01104489, + "auxiliary_loss_mlp": 0.0103366, + "balance_loss_clip": 1.02107108, + "balance_loss_mlp": 1.03541327, + "epoch": 0.9816924695626034, + "flos": 31539444117120.0, + "grad_norm": 1.5963313544140023, + "language_loss": 0.73459053, + "learning_rate": 3.5034006963501337e-09, + "loss": 0.75597197, + "num_input_tokens_seen": 352445530, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.69140625, + "step": 16328, + "time_per_iteration": 2.513953685760498 + }, + { + "auxiliary_loss_clip": 0.01105032, + "auxiliary_loss_mlp": 0.01034872, + "balance_loss_clip": 1.02234316, + "balance_loss_mlp": 1.03475928, + "epoch": 0.9817525928152713, + "flos": 21506901045120.0, + "grad_norm": 1.6386198679453556, + "language_loss": 0.80848616, + "learning_rate": 3.4803965928040802e-09, + "loss": 0.82988524, + "num_input_tokens_seen": 352466325, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.703125, + "step": 16329, + "time_per_iteration": 2.4811136722564697 + }, + { + "auxiliary_loss_clip": 0.0110134, + "auxiliary_loss_mlp": 0.0102829, + "balance_loss_clip": 1.01591635, + "balance_loss_mlp": 1.03310025, + "epoch": 0.9818127160679393, + "flos": 25550513683200.0, + "grad_norm": 2.098740991949754, + "language_loss": 0.76318562, + "learning_rate": 3.4574681974817168e-09, + "loss": 0.78448194, + "num_input_tokens_seen": 352485505, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.68359375, + "step": 16330, + "time_per_iteration": 2.515571117401123 + }, + { + "auxiliary_loss_clip": 0.01107157, + "auxiliary_loss_mlp": 0.01029104, + "balance_loss_clip": 1.01529956, + "balance_loss_mlp": 1.0347935, + "epoch": 0.9818728393206072, + "flos": 28803661672320.0, + "grad_norm": 2.273811022859368, + "language_loss": 0.66393799, + "learning_rate": 3.434615511252126e-09, + "loss": 0.68530059, + "num_input_tokens_seen": 352505360, + "router_z_loss_clip": 0.13867188, + "router_z_loss_mlp": 0.7265625, + "step": 16331, + "time_per_iteration": 2.5182230472564697 + }, + { + "auxiliary_loss_clip": 0.01097163, + "auxiliary_loss_mlp": 0.01028643, + "balance_loss_clip": 1.01743126, + "balance_loss_mlp": 1.0327661, + "epoch": 0.9819329625732752, + "flos": 23222246014080.0, + "grad_norm": 1.6306236809447248, + "language_loss": 0.73071647, + "learning_rate": 3.411838534981948e-09, + "loss": 0.75197458, + "num_input_tokens_seen": 352524035, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6484375, + "step": 16332, + "time_per_iteration": 2.554363965988159 + }, + { + "auxiliary_loss_clip": 0.01098885, + "auxiliary_loss_mlp": 0.01025465, + "balance_loss_clip": 1.01507628, + "balance_loss_mlp": 1.03489494, + "epoch": 0.9819930858259431, + "flos": 17530440883200.0, + "grad_norm": 1.5999608499133222, + "language_loss": 0.76807606, + "learning_rate": 3.389137269534936e-09, + "loss": 0.78931957, + "num_input_tokens_seen": 352543210, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.640625, + "step": 16333, + "time_per_iteration": 2.406327724456787 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01030122, + "balance_loss_clip": 1.01864195, + "balance_loss_mlp": 1.03352439, + "epoch": 0.9820532090786112, + "flos": 12529915971840.0, + "grad_norm": 2.3386346748180293, + "language_loss": 0.73073453, + "learning_rate": 3.366511715771958e-09, + "loss": 0.75202054, + "num_input_tokens_seen": 352559770, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 16334, + "time_per_iteration": 2.4535202980041504 + }, + { + "auxiliary_loss_clip": 0.01100827, + "auxiliary_loss_mlp": 0.01035443, + "balance_loss_clip": 1.02373648, + "balance_loss_mlp": 1.0337584, + "epoch": 0.9821133323312792, + "flos": 18840174497280.0, + "grad_norm": 1.889228221782078, + "language_loss": 0.78478283, + "learning_rate": 3.3439618745509934e-09, + "loss": 0.80614549, + "num_input_tokens_seen": 352577690, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.671875, + "step": 16335, + "time_per_iteration": 2.455636739730835 + }, + { + "auxiliary_loss_clip": 0.01102519, + "auxiliary_loss_mlp": 0.0103492, + "balance_loss_clip": 1.02213502, + "balance_loss_mlp": 1.03396535, + "epoch": 0.9821734555839471, + "flos": 34824013528320.0, + "grad_norm": 2.3440495087057447, + "language_loss": 0.64146876, + "learning_rate": 3.3214877467271362e-09, + "loss": 0.66284317, + "num_input_tokens_seen": 352598850, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.68359375, + "step": 16336, + "time_per_iteration": 2.595341444015503 + }, + { + "auxiliary_loss_clip": 0.0110517, + "auxiliary_loss_mlp": 0.01035567, + "balance_loss_clip": 1.02200055, + "balance_loss_mlp": 1.03517807, + "epoch": 0.9822335788366151, + "flos": 17128169493120.0, + "grad_norm": 1.9355169649892972, + "language_loss": 0.73395228, + "learning_rate": 3.299089333152372e-09, + "loss": 0.75535965, + "num_input_tokens_seen": 352616130, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.69921875, + "step": 16337, + "time_per_iteration": 2.4344332218170166 + }, + { + "auxiliary_loss_clip": 0.01100641, + "auxiliary_loss_mlp": 0.01027728, + "balance_loss_clip": 1.01541948, + "balance_loss_mlp": 1.03356791, + "epoch": 0.982293702089283, + "flos": 20813250528000.0, + "grad_norm": 1.6801667863354321, + "language_loss": 0.72507012, + "learning_rate": 3.2767666346764645e-09, + "loss": 0.74635381, + "num_input_tokens_seen": 352636885, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 16338, + "time_per_iteration": 2.4975478649139404 + }, + { + "auxiliary_loss_clip": 0.01096358, + "auxiliary_loss_mlp": 0.01030229, + "balance_loss_clip": 1.01844525, + "balance_loss_mlp": 1.03190184, + "epoch": 0.982353825341951, + "flos": 24680829588480.0, + "grad_norm": 1.8281230536728026, + "language_loss": 0.81268263, + "learning_rate": 3.2545196521454045e-09, + "loss": 0.83394849, + "num_input_tokens_seen": 352657905, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.64453125, + "step": 16339, + "time_per_iteration": 2.4743876457214355 + }, + { + "auxiliary_loss_clip": 0.01094696, + "auxiliary_loss_mlp": 0.01031203, + "balance_loss_clip": 1.02068281, + "balance_loss_mlp": 1.03254604, + "epoch": 0.982413948594619, + "flos": 20850489953280.0, + "grad_norm": 1.7405720603242414, + "language_loss": 0.62341028, + "learning_rate": 3.232348386403405e-09, + "loss": 0.64466929, + "num_input_tokens_seen": 352676320, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62109375, + "step": 16340, + "time_per_iteration": 2.478207588195801 + }, + { + "auxiliary_loss_clip": 0.01102859, + "auxiliary_loss_mlp": 0.01032409, + "balance_loss_clip": 1.02079773, + "balance_loss_mlp": 1.03538668, + "epoch": 0.982474071847287, + "flos": 15377380778880.0, + "grad_norm": 2.6974854416597287, + "language_loss": 0.85674942, + "learning_rate": 3.2102528382904613e-09, + "loss": 0.87810206, + "num_input_tokens_seen": 352692665, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.67578125, + "step": 16341, + "time_per_iteration": 2.4368467330932617 + }, + { + "auxiliary_loss_clip": 0.01096331, + "auxiliary_loss_mlp": 0.01026724, + "balance_loss_clip": 1.01563168, + "balance_loss_mlp": 1.03398645, + "epoch": 0.9825341950999549, + "flos": 23774732081280.0, + "grad_norm": 1.426488361267362, + "language_loss": 0.66898513, + "learning_rate": 3.188233008645014e-09, + "loss": 0.69021565, + "num_input_tokens_seen": 352716130, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 16342, + "time_per_iteration": 2.5108721256256104 + }, + { + "auxiliary_loss_clip": 0.01099917, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.01293254, + "balance_loss_mlp": 1.03381848, + "epoch": 0.9825943183526229, + "flos": 22746285872640.0, + "grad_norm": 1.5238109255321661, + "language_loss": 0.77271879, + "learning_rate": 3.16628889830195e-09, + "loss": 0.79396409, + "num_input_tokens_seen": 352734705, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16343, + "time_per_iteration": 2.487384557723999 + }, + { + "auxiliary_loss_clip": 0.01097522, + "auxiliary_loss_mlp": 0.01030497, + "balance_loss_clip": 1.02021468, + "balance_loss_mlp": 1.03368938, + "epoch": 0.9826544416052908, + "flos": 27709966408320.0, + "grad_norm": 1.5085133090122882, + "language_loss": 0.7517612, + "learning_rate": 3.1444205080932707e-09, + "loss": 0.77304137, + "num_input_tokens_seen": 352756225, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.63671875, + "step": 16344, + "time_per_iteration": 2.538987159729004 + }, + { + "auxiliary_loss_clip": 0.01098149, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01962423, + "balance_loss_mlp": 1.03374767, + "epoch": 0.9827145648579588, + "flos": 26941657472640.0, + "grad_norm": 2.501007535333455, + "language_loss": 0.66638464, + "learning_rate": 3.122627838848313e-09, + "loss": 0.68768132, + "num_input_tokens_seen": 352776210, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.64453125, + "step": 16345, + "time_per_iteration": 3.9340567588806152 + }, + { + "auxiliary_loss_clip": 0.01092782, + "auxiliary_loss_mlp": 0.01026635, + "balance_loss_clip": 1.01665115, + "balance_loss_mlp": 1.03144765, + "epoch": 0.9827746881106267, + "flos": 21866545969920.0, + "grad_norm": 1.3402147417907175, + "language_loss": 0.79547799, + "learning_rate": 3.1009108913933045e-09, + "loss": 0.81667221, + "num_input_tokens_seen": 352795455, + "router_z_loss_clip": 0.09960938, + "router_z_loss_mlp": 0.61328125, + "step": 16346, + "time_per_iteration": 2.4895803928375244 + }, + { + "auxiliary_loss_clip": 0.01104024, + "auxiliary_loss_mlp": 0.01030984, + "balance_loss_clip": 1.0188067, + "balance_loss_mlp": 1.03385854, + "epoch": 0.9828348113632948, + "flos": 20850777262080.0, + "grad_norm": 1.8224916412255767, + "language_loss": 0.74978042, + "learning_rate": 3.079269666552031e-09, + "loss": 0.7711305, + "num_input_tokens_seen": 352812895, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.703125, + "step": 16347, + "time_per_iteration": 3.956122398376465 + }, + { + "auxiliary_loss_clip": 0.01095315, + "auxiliary_loss_mlp": 0.01032494, + "balance_loss_clip": 1.02184844, + "balance_loss_mlp": 1.03214502, + "epoch": 0.9828949346159628, + "flos": 34569227381760.0, + "grad_norm": 1.9399061780009854, + "language_loss": 0.66402197, + "learning_rate": 3.0577041651449474e-09, + "loss": 0.68530005, + "num_input_tokens_seen": 352835470, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 16348, + "time_per_iteration": 3.9791600704193115 + }, + { + "auxiliary_loss_clip": 0.01099713, + "auxiliary_loss_mlp": 0.01026653, + "balance_loss_clip": 1.01496458, + "balance_loss_mlp": 1.03385162, + "epoch": 0.9829550578686307, + "flos": 24457464864000.0, + "grad_norm": 1.8906340007069518, + "language_loss": 0.69143182, + "learning_rate": 3.0362143879898437e-09, + "loss": 0.71269548, + "num_input_tokens_seen": 352854295, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.65625, + "step": 16349, + "time_per_iteration": 2.517441987991333 + }, + { + "auxiliary_loss_clip": 0.0109294, + "auxiliary_loss_mlp": 0.01028225, + "balance_loss_clip": 1.01758015, + "balance_loss_mlp": 1.03203154, + "epoch": 0.9830151811212987, + "flos": 16910084067840.0, + "grad_norm": 1.9423722053932548, + "language_loss": 0.76204872, + "learning_rate": 3.0148003359014018e-09, + "loss": 0.78326035, + "num_input_tokens_seen": 352869695, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.609375, + "step": 16350, + "time_per_iteration": 2.423643112182617 + }, + { + "auxiliary_loss_clip": 0.01099798, + "auxiliary_loss_mlp": 0.01029763, + "balance_loss_clip": 1.0178715, + "balance_loss_mlp": 1.03397298, + "epoch": 0.9830753043739666, + "flos": 21288312829440.0, + "grad_norm": 2.1876266892296283, + "language_loss": 0.84113282, + "learning_rate": 2.9934620096920826e-09, + "loss": 0.86242843, + "num_input_tokens_seen": 352887430, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 16351, + "time_per_iteration": 2.446887969970703 + }, + { + "auxiliary_loss_clip": 0.01098309, + "auxiliary_loss_mlp": 0.01025634, + "balance_loss_clip": 1.01421404, + "balance_loss_mlp": 1.03314865, + "epoch": 0.9831354276266346, + "flos": 31723522341120.0, + "grad_norm": 1.6852483245981495, + "language_loss": 0.68510699, + "learning_rate": 2.972199410170795e-09, + "loss": 0.70634645, + "num_input_tokens_seen": 352907555, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16352, + "time_per_iteration": 2.5545663833618164 + }, + { + "auxiliary_loss_clip": 0.01096103, + "auxiliary_loss_mlp": 0.01029225, + "balance_loss_clip": 1.01824594, + "balance_loss_mlp": 1.03311467, + "epoch": 0.9831955508793025, + "flos": 21619050284160.0, + "grad_norm": 1.4338701194753172, + "language_loss": 0.66359127, + "learning_rate": 2.951012538143782e-09, + "loss": 0.68484455, + "num_input_tokens_seen": 352928670, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.62890625, + "step": 16353, + "time_per_iteration": 3.974562883377075 + }, + { + "auxiliary_loss_clip": 0.01095175, + "auxiliary_loss_mlp": 0.01029938, + "balance_loss_clip": 1.01926327, + "balance_loss_mlp": 1.03227568, + "epoch": 0.9832556741319706, + "flos": 22968214053120.0, + "grad_norm": 1.470289829422996, + "language_loss": 0.74282354, + "learning_rate": 2.9299013944144025e-09, + "loss": 0.76407468, + "num_input_tokens_seen": 352948345, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.62890625, + "step": 16354, + "time_per_iteration": 2.4706337451934814 + }, + { + "auxiliary_loss_clip": 0.01096804, + "auxiliary_loss_mlp": 0.0102677, + "balance_loss_clip": 1.01560569, + "balance_loss_mlp": 1.03276682, + "epoch": 0.9833157973846385, + "flos": 21323900229120.0, + "grad_norm": 1.9586241896566348, + "language_loss": 0.77517724, + "learning_rate": 2.9088659797835702e-09, + "loss": 0.796413, + "num_input_tokens_seen": 352967250, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.640625, + "step": 16355, + "time_per_iteration": 2.4410529136657715 + }, + { + "auxiliary_loss_clip": 0.0109692, + "auxiliary_loss_mlp": 0.01028798, + "balance_loss_clip": 1.01734829, + "balance_loss_mlp": 1.03296006, + "epoch": 0.9833759206373065, + "flos": 21068719032960.0, + "grad_norm": 2.282707470444189, + "language_loss": 0.73298937, + "learning_rate": 2.8879062950484256e-09, + "loss": 0.75424653, + "num_input_tokens_seen": 352984725, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.640625, + "step": 16356, + "time_per_iteration": 2.435739517211914 + }, + { + "auxiliary_loss_clip": 0.0109747, + "auxiliary_loss_mlp": 0.01029947, + "balance_loss_clip": 1.01794899, + "balance_loss_mlp": 1.0338732, + "epoch": 0.9834360438899744, + "flos": 18697322108160.0, + "grad_norm": 1.6040337726439833, + "language_loss": 0.75952339, + "learning_rate": 2.8670223410041104e-09, + "loss": 0.7807976, + "num_input_tokens_seen": 353003480, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.63671875, + "step": 16357, + "time_per_iteration": 2.448345899581909 + }, + { + "auxiliary_loss_clip": 0.01097463, + "auxiliary_loss_mlp": 0.01022689, + "balance_loss_clip": 1.01070881, + "balance_loss_mlp": 1.03399682, + "epoch": 0.9834961671426424, + "flos": 21105240186240.0, + "grad_norm": 9.487408286348185, + "language_loss": 0.80191135, + "learning_rate": 2.846214118442436e-09, + "loss": 0.82311285, + "num_input_tokens_seen": 353021425, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6328125, + "step": 16358, + "time_per_iteration": 2.4538917541503906 + }, + { + "auxiliary_loss_clip": 0.01098016, + "auxiliary_loss_mlp": 0.01026763, + "balance_loss_clip": 1.01573038, + "balance_loss_mlp": 1.03340781, + "epoch": 0.9835562903953103, + "flos": 26687625511680.0, + "grad_norm": 3.160438132632366, + "language_loss": 0.67664564, + "learning_rate": 2.8254816281523263e-09, + "loss": 0.69789338, + "num_input_tokens_seen": 353039870, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16359, + "time_per_iteration": 2.5928866863250732 + }, + { + "auxiliary_loss_clip": 0.01096367, + "auxiliary_loss_mlp": 0.01027638, + "balance_loss_clip": 1.01712346, + "balance_loss_mlp": 1.03287399, + "epoch": 0.9836164136479784, + "flos": 22090162089600.0, + "grad_norm": 2.767710883229253, + "language_loss": 0.6986711, + "learning_rate": 2.804824870920264e-09, + "loss": 0.71991116, + "num_input_tokens_seen": 353059750, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.63671875, + "step": 16360, + "time_per_iteration": 2.546980381011963 + }, + { + "auxiliary_loss_clip": 0.01099201, + "auxiliary_loss_mlp": 0.010292, + "balance_loss_clip": 1.01731467, + "balance_loss_mlp": 1.03346205, + "epoch": 0.9836765369006463, + "flos": 23878405710720.0, + "grad_norm": 1.6812441062486845, + "language_loss": 0.84103167, + "learning_rate": 2.7842438475293996e-09, + "loss": 0.86231565, + "num_input_tokens_seen": 353079940, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.65625, + "step": 16361, + "time_per_iteration": 2.464859962463379 + }, + { + "auxiliary_loss_clip": 0.01098239, + "auxiliary_loss_mlp": 0.0102529, + "balance_loss_clip": 1.01420975, + "balance_loss_mlp": 1.03314137, + "epoch": 0.9837366601533143, + "flos": 25845017293440.0, + "grad_norm": 1.7057424485209642, + "language_loss": 0.7577697, + "learning_rate": 2.76373855876022e-09, + "loss": 0.77900505, + "num_input_tokens_seen": 353099990, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6484375, + "step": 16362, + "time_per_iteration": 2.5125908851623535 + }, + { + "auxiliary_loss_clip": 0.01099486, + "auxiliary_loss_mlp": 0.01032079, + "balance_loss_clip": 1.02058172, + "balance_loss_mlp": 1.03428173, + "epoch": 0.9837967834059823, + "flos": 21358015171200.0, + "grad_norm": 1.6643095210607834, + "language_loss": 0.71448255, + "learning_rate": 2.7433090053901043e-09, + "loss": 0.73579824, + "num_input_tokens_seen": 353118710, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65234375, + "step": 16363, + "time_per_iteration": 2.432748556137085 + }, + { + "auxiliary_loss_clip": 0.01094357, + "auxiliary_loss_mlp": 0.01027605, + "balance_loss_clip": 1.0168941, + "balance_loss_mlp": 1.03287041, + "epoch": 0.9838569066586502, + "flos": 18515793749760.0, + "grad_norm": 1.7356464514395182, + "language_loss": 0.63440335, + "learning_rate": 2.7229551881937653e-09, + "loss": 0.65562296, + "num_input_tokens_seen": 353136415, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.61328125, + "step": 16364, + "time_per_iteration": 2.4572789669036865 + }, + { + "auxiliary_loss_clip": 0.01099675, + "auxiliary_loss_mlp": 0.01031483, + "balance_loss_clip": 1.02130258, + "balance_loss_mlp": 1.03466845, + "epoch": 0.9839170299113182, + "flos": 22452392793600.0, + "grad_norm": 1.4850865495256305, + "language_loss": 0.74915314, + "learning_rate": 2.702677107943252e-09, + "loss": 0.77046472, + "num_input_tokens_seen": 353154650, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6484375, + "step": 16365, + "time_per_iteration": 2.4729011058807373 + }, + { + "auxiliary_loss_clip": 0.01096935, + "auxiliary_loss_mlp": 0.01026662, + "balance_loss_clip": 1.01504469, + "balance_loss_mlp": 1.03303897, + "epoch": 0.9839771531639862, + "flos": 27892320779520.0, + "grad_norm": 7.143933962867107, + "language_loss": 0.76209521, + "learning_rate": 2.6824747654072832e-09, + "loss": 0.78333127, + "num_input_tokens_seen": 353174065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.63671875, + "step": 16366, + "time_per_iteration": 2.5000736713409424 + }, + { + "auxiliary_loss_clip": 0.01095723, + "auxiliary_loss_mlp": 0.01025155, + "balance_loss_clip": 1.01459885, + "balance_loss_mlp": 1.03223205, + "epoch": 0.9840372764166542, + "flos": 28214510797440.0, + "grad_norm": 1.9814409544348766, + "language_loss": 0.77052504, + "learning_rate": 2.662348161352357e-09, + "loss": 0.79173386, + "num_input_tokens_seen": 353193560, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.6328125, + "step": 16367, + "time_per_iteration": 2.542595624923706 + }, + { + "auxiliary_loss_clip": 0.01099313, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.0192219, + "balance_loss_mlp": 1.03569198, + "epoch": 0.9840973996693221, + "flos": 23403989854080.0, + "grad_norm": 1.745180491052293, + "language_loss": 0.61363411, + "learning_rate": 2.642297296540974e-09, + "loss": 0.63494116, + "num_input_tokens_seen": 353213525, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.63671875, + "step": 16368, + "time_per_iteration": 2.4790031909942627 + }, + { + "auxiliary_loss_clip": 0.01094785, + "auxiliary_loss_mlp": 0.01030327, + "balance_loss_clip": 1.01990819, + "balance_loss_mlp": 1.03288722, + "epoch": 0.9841575229219901, + "flos": 21395865127680.0, + "grad_norm": 1.5270024677807728, + "language_loss": 0.65519226, + "learning_rate": 2.6223221717340816e-09, + "loss": 0.67644334, + "num_input_tokens_seen": 353234000, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.62109375, + "step": 16369, + "time_per_iteration": 2.454857110977173 + }, + { + "auxiliary_loss_clip": 0.0110139, + "auxiliary_loss_mlp": 0.01032035, + "balance_loss_clip": 1.02021563, + "balance_loss_mlp": 1.03482819, + "epoch": 0.984217646174658, + "flos": 24464072966400.0, + "grad_norm": 1.4214427672990262, + "language_loss": 0.68732488, + "learning_rate": 2.6024227876886295e-09, + "loss": 0.70865911, + "num_input_tokens_seen": 353254940, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.6640625, + "step": 16370, + "time_per_iteration": 2.517896890640259 + }, + { + "auxiliary_loss_clip": 0.01099591, + "auxiliary_loss_mlp": 0.01033771, + "balance_loss_clip": 1.02143872, + "balance_loss_mlp": 1.03292727, + "epoch": 0.984277769427326, + "flos": 16435057680000.0, + "grad_norm": 1.8084581245849027, + "language_loss": 0.73778242, + "learning_rate": 2.582599145159792e-09, + "loss": 0.75911605, + "num_input_tokens_seen": 353272590, + "router_z_loss_clip": 0.12353516, + "router_z_loss_mlp": 0.6640625, + "step": 16371, + "time_per_iteration": 2.464282274246216 + }, + { + "auxiliary_loss_clip": 0.01021782, + "auxiliary_loss_mlp": 0.01000386, + "balance_loss_clip": 0.99944443, + "balance_loss_mlp": 1.00176942, + "epoch": 0.9843378926799939, + "flos": 64530615288960.0, + "grad_norm": 0.7761667176847223, + "language_loss": 0.65162444, + "learning_rate": 2.562851244898745e-09, + "loss": 0.67184615, + "num_input_tokens_seen": 353334380, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.20117188, + "step": 16372, + "time_per_iteration": 3.0799262523651123 + }, + { + "auxiliary_loss_clip": 0.01097301, + "auxiliary_loss_mlp": 0.01025244, + "balance_loss_clip": 1.01412201, + "balance_loss_mlp": 1.03277588, + "epoch": 0.984398015932662, + "flos": 17382811985280.0, + "grad_norm": 1.9497041934294832, + "language_loss": 0.70436323, + "learning_rate": 2.5431790876544456e-09, + "loss": 0.72558868, + "num_input_tokens_seen": 353351640, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 16373, + "time_per_iteration": 2.434091091156006 + }, + { + "auxiliary_loss_clip": 0.01097529, + "auxiliary_loss_mlp": 0.01027896, + "balance_loss_clip": 1.01660144, + "balance_loss_mlp": 1.0344733, + "epoch": 0.9844581391853299, + "flos": 23879088069120.0, + "grad_norm": 1.5923709110598652, + "language_loss": 0.81572837, + "learning_rate": 2.523582674173186e-09, + "loss": 0.83698261, + "num_input_tokens_seen": 353372555, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.62890625, + "step": 16374, + "time_per_iteration": 2.488692045211792 + }, + { + "auxiliary_loss_clip": 0.01101403, + "auxiliary_loss_mlp": 0.01031314, + "balance_loss_clip": 1.02025104, + "balance_loss_mlp": 1.0355829, + "epoch": 0.9845182624379979, + "flos": 19865352568320.0, + "grad_norm": 1.6741401712819997, + "language_loss": 0.69374293, + "learning_rate": 2.504062005197927e-09, + "loss": 0.71507013, + "num_input_tokens_seen": 353391385, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65625, + "step": 16375, + "time_per_iteration": 2.4871280193328857 + }, + { + "auxiliary_loss_clip": 0.01101374, + "auxiliary_loss_mlp": 0.01033035, + "balance_loss_clip": 1.02090538, + "balance_loss_mlp": 1.03388441, + "epoch": 0.9845783856906659, + "flos": 28254659224320.0, + "grad_norm": 1.7060471688472025, + "language_loss": 0.8095867, + "learning_rate": 2.484617081468521e-09, + "loss": 0.83093083, + "num_input_tokens_seen": 353411630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 16376, + "time_per_iteration": 2.512218713760376 + }, + { + "auxiliary_loss_clip": 0.01096059, + "auxiliary_loss_mlp": 0.01030868, + "balance_loss_clip": 1.0195905, + "balance_loss_mlp": 1.03252149, + "epoch": 0.9846385089433338, + "flos": 28328383889280.0, + "grad_norm": 1.469956165284788, + "language_loss": 0.62223607, + "learning_rate": 2.4652479037228224e-09, + "loss": 0.64350533, + "num_input_tokens_seen": 353432895, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 16377, + "time_per_iteration": 2.5331015586853027 + }, + { + "auxiliary_loss_clip": 0.01100529, + "auxiliary_loss_mlp": 0.01035171, + "balance_loss_clip": 1.02324438, + "balance_loss_mlp": 1.03450775, + "epoch": 0.9846986321960018, + "flos": 24316767290880.0, + "grad_norm": 1.6192423407924923, + "language_loss": 0.728405, + "learning_rate": 2.445954472695133e-09, + "loss": 0.74976194, + "num_input_tokens_seen": 353454195, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 16378, + "time_per_iteration": 2.4707744121551514 + }, + { + "auxiliary_loss_clip": 0.01099505, + "auxiliary_loss_mlp": 0.01030908, + "balance_loss_clip": 1.01964295, + "balance_loss_mlp": 1.0338167, + "epoch": 0.9847587554486698, + "flos": 27271999877760.0, + "grad_norm": 1.6532810502944137, + "language_loss": 0.71028608, + "learning_rate": 2.426736789116868e-09, + "loss": 0.73159021, + "num_input_tokens_seen": 353475125, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 16379, + "time_per_iteration": 2.561509132385254 + }, + { + "auxiliary_loss_clip": 0.01101135, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01918983, + "balance_loss_mlp": 1.03502083, + "epoch": 0.9848188787013378, + "flos": 16542717719040.0, + "grad_norm": 1.9173817203854187, + "language_loss": 0.68630135, + "learning_rate": 2.407594853716999e-09, + "loss": 0.7076205, + "num_input_tokens_seen": 353493265, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.66015625, + "step": 16380, + "time_per_iteration": 2.483130931854248 + }, + { + "auxiliary_loss_clip": 0.01102739, + "auxiliary_loss_mlp": 0.01033361, + "balance_loss_clip": 1.0218395, + "balance_loss_mlp": 1.03448987, + "epoch": 0.9848790019540057, + "flos": 20193647898240.0, + "grad_norm": 1.9554023143101786, + "language_loss": 0.7881375, + "learning_rate": 2.38852866722139e-09, + "loss": 0.80949849, + "num_input_tokens_seen": 353511650, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.68359375, + "step": 16381, + "time_per_iteration": 2.4630978107452393 + }, + { + "auxiliary_loss_clip": 0.01098406, + "auxiliary_loss_mlp": 0.01026778, + "balance_loss_clip": 1.015733, + "balance_loss_mlp": 1.03296387, + "epoch": 0.9849391252066737, + "flos": 28259723041920.0, + "grad_norm": 1.4047755211177806, + "language_loss": 0.82333148, + "learning_rate": 2.3695382303527965e-09, + "loss": 0.84458339, + "num_input_tokens_seen": 353534035, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 16382, + "time_per_iteration": 2.5435211658477783 + }, + { + "auxiliary_loss_clip": 0.01102482, + "auxiliary_loss_mlp": 0.01029941, + "balance_loss_clip": 1.01768613, + "balance_loss_mlp": 1.03403974, + "epoch": 0.9849992484593416, + "flos": 22454942659200.0, + "grad_norm": 1.7630013745134487, + "language_loss": 0.74086952, + "learning_rate": 2.3506235438315316e-09, + "loss": 0.76219374, + "num_input_tokens_seen": 353549950, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.68359375, + "step": 16383, + "time_per_iteration": 2.461627244949341 + }, + { + "auxiliary_loss_clip": 0.0110084, + "auxiliary_loss_mlp": 0.01029632, + "balance_loss_clip": 1.01839042, + "balance_loss_mlp": 1.03497994, + "epoch": 0.9850593717120096, + "flos": 34497190656000.0, + "grad_norm": 1.4221795490292306, + "language_loss": 0.65806353, + "learning_rate": 2.3317846083750203e-09, + "loss": 0.67936826, + "num_input_tokens_seen": 353573745, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65625, + "step": 16384, + "time_per_iteration": 2.594108819961548 + }, + { + "auxiliary_loss_clip": 0.01104674, + "auxiliary_loss_mlp": 0.01033399, + "balance_loss_clip": 1.01976156, + "balance_loss_mlp": 1.03614712, + "epoch": 0.9851194949646775, + "flos": 38837282152320.0, + "grad_norm": 1.6568999819680295, + "language_loss": 0.69966209, + "learning_rate": 2.313021424697359e-09, + "loss": 0.72104275, + "num_input_tokens_seen": 353595335, + "router_z_loss_clip": 0.13671875, + "router_z_loss_mlp": 0.6875, + "step": 16385, + "time_per_iteration": 2.6449928283691406 + }, + { + "auxiliary_loss_clip": 0.01102637, + "auxiliary_loss_mlp": 0.01032623, + "balance_loss_clip": 1.02161956, + "balance_loss_mlp": 1.03761828, + "epoch": 0.9851796182173456, + "flos": 17712436118400.0, + "grad_norm": 1.929214822007236, + "language_loss": 0.81081849, + "learning_rate": 2.294333993509978e-09, + "loss": 0.83217108, + "num_input_tokens_seen": 353614270, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16386, + "time_per_iteration": 2.4518470764160156 + }, + { + "auxiliary_loss_clip": 0.01100031, + "auxiliary_loss_mlp": 0.01029256, + "balance_loss_clip": 1.01721561, + "balance_loss_mlp": 1.03449285, + "epoch": 0.9852397414700135, + "flos": 27454318335360.0, + "grad_norm": 1.7433612430328327, + "language_loss": 0.67459857, + "learning_rate": 2.2757223155216442e-09, + "loss": 0.6958915, + "num_input_tokens_seen": 353634900, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65625, + "step": 16387, + "time_per_iteration": 3.853261947631836 + }, + { + "auxiliary_loss_clip": 0.01092752, + "auxiliary_loss_mlp": 0.0102984, + "balance_loss_clip": 1.01898646, + "balance_loss_mlp": 1.03189099, + "epoch": 0.9852998647226815, + "flos": 18296702743680.0, + "grad_norm": 1.6794156400657199, + "language_loss": 0.73679399, + "learning_rate": 2.257186391438237e-09, + "loss": 0.75801992, + "num_input_tokens_seen": 353652890, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.609375, + "step": 16388, + "time_per_iteration": 2.4796459674835205 + }, + { + "auxiliary_loss_clip": 0.01096542, + "auxiliary_loss_mlp": 0.01029303, + "balance_loss_clip": 1.01828194, + "balance_loss_mlp": 1.03178144, + "epoch": 0.9853599879753495, + "flos": 19642562461440.0, + "grad_norm": 1.854112159676643, + "language_loss": 0.8199439, + "learning_rate": 2.238726221962528e-09, + "loss": 0.84120238, + "num_input_tokens_seen": 353671295, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16389, + "time_per_iteration": 3.912445068359375 + }, + { + "auxiliary_loss_clip": 0.01097312, + "auxiliary_loss_mlp": 0.01025459, + "balance_loss_clip": 1.01399732, + "balance_loss_mlp": 1.03325129, + "epoch": 0.9854201112280174, + "flos": 23841956384640.0, + "grad_norm": 2.001006106345854, + "language_loss": 0.67084408, + "learning_rate": 2.2203418077946234e-09, + "loss": 0.6920718, + "num_input_tokens_seen": 353690560, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.640625, + "step": 16390, + "time_per_iteration": 2.4856414794921875 + }, + { + "auxiliary_loss_clip": 0.01100101, + "auxiliary_loss_mlp": 0.01032358, + "balance_loss_clip": 1.02013946, + "balance_loss_mlp": 1.03467011, + "epoch": 0.9854802344806854, + "flos": 30080573233920.0, + "grad_norm": 1.7503280437691784, + "language_loss": 0.77223754, + "learning_rate": 2.2020331496312994e-09, + "loss": 0.79356205, + "num_input_tokens_seen": 353710660, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65625, + "step": 16391, + "time_per_iteration": 2.5048904418945312 + }, + { + "auxiliary_loss_clip": 0.01093654, + "auxiliary_loss_mlp": 0.0102981, + "balance_loss_clip": 1.01943874, + "balance_loss_mlp": 1.03313243, + "epoch": 0.9855403577333534, + "flos": 21907412668800.0, + "grad_norm": 1.8244273189308011, + "language_loss": 0.68202817, + "learning_rate": 2.1838002481673333e-09, + "loss": 0.70326281, + "num_input_tokens_seen": 353730440, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.60546875, + "step": 16392, + "time_per_iteration": 2.4745800495147705 + }, + { + "auxiliary_loss_clip": 0.01102623, + "auxiliary_loss_mlp": 0.01026369, + "balance_loss_clip": 1.01361322, + "balance_loss_mlp": 1.03380561, + "epoch": 0.9856004809860214, + "flos": 15413794191360.0, + "grad_norm": 2.00746487685818, + "language_loss": 0.55832624, + "learning_rate": 2.1656431040937286e-09, + "loss": 0.57961619, + "num_input_tokens_seen": 353748360, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16393, + "time_per_iteration": 2.4202845096588135 + }, + { + "auxiliary_loss_clip": 0.01105775, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01867962, + "balance_loss_mlp": 1.03597665, + "epoch": 0.9856606042386893, + "flos": 13653201064320.0, + "grad_norm": 5.137455131585941, + "language_loss": 0.79335487, + "learning_rate": 2.1475617180990444e-09, + "loss": 0.81472552, + "num_input_tokens_seen": 353760880, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6953125, + "step": 16394, + "time_per_iteration": 2.3870041370391846 + }, + { + "auxiliary_loss_clip": 0.01102304, + "auxiliary_loss_mlp": 0.01031803, + "balance_loss_clip": 1.01954222, + "balance_loss_mlp": 1.03348887, + "epoch": 0.9857207274913573, + "flos": 23479151063040.0, + "grad_norm": 1.4204577915939423, + "language_loss": 0.76103747, + "learning_rate": 2.129556090869178e-09, + "loss": 0.78237855, + "num_input_tokens_seen": 353782255, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6875, + "step": 16395, + "time_per_iteration": 4.123412370681763 + }, + { + "auxiliary_loss_clip": 0.01097875, + "auxiliary_loss_mlp": 0.01026877, + "balance_loss_clip": 1.01574266, + "balance_loss_mlp": 1.0336237, + "epoch": 0.9857808507440252, + "flos": 21065486808960.0, + "grad_norm": 1.8472320376349611, + "language_loss": 0.75438356, + "learning_rate": 2.1116262230866933e-09, + "loss": 0.77563113, + "num_input_tokens_seen": 353803580, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.64453125, + "step": 16396, + "time_per_iteration": 2.518141508102417 + }, + { + "auxiliary_loss_clip": 0.01097784, + "auxiliary_loss_mlp": 0.01027321, + "balance_loss_clip": 1.01606131, + "balance_loss_mlp": 1.03416276, + "epoch": 0.9858409739966932, + "flos": 25301365971840.0, + "grad_norm": 1.5636907431654377, + "language_loss": 0.70736861, + "learning_rate": 2.0937721154317133e-09, + "loss": 0.72861964, + "num_input_tokens_seen": 353824200, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.63671875, + "step": 16397, + "time_per_iteration": 2.497194528579712 + }, + { + "auxiliary_loss_clip": 0.01096257, + "auxiliary_loss_mlp": 0.0103081, + "balance_loss_clip": 1.02008724, + "balance_loss_mlp": 1.03538656, + "epoch": 0.9859010972493611, + "flos": 20558751690240.0, + "grad_norm": 1.9966865874098016, + "language_loss": 0.71433568, + "learning_rate": 2.0759937685810304e-09, + "loss": 0.73560631, + "num_input_tokens_seen": 353843350, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.609375, + "step": 16398, + "time_per_iteration": 2.4541091918945312 + }, + { + "auxiliary_loss_clip": 0.01097831, + "auxiliary_loss_mlp": 0.01026569, + "balance_loss_clip": 1.0156436, + "balance_loss_mlp": 1.03418803, + "epoch": 0.9859612205020292, + "flos": 24754985216640.0, + "grad_norm": 1.8623253931763133, + "language_loss": 0.73714447, + "learning_rate": 2.058291183208771e-09, + "loss": 0.7583884, + "num_input_tokens_seen": 353864520, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 16399, + "time_per_iteration": 2.503669261932373 + }, + { + "auxiliary_loss_clip": 0.01098469, + "auxiliary_loss_mlp": 0.0102508, + "balance_loss_clip": 1.01344514, + "balance_loss_mlp": 1.03257656, + "epoch": 0.9860213437546971, + "flos": 21105850717440.0, + "grad_norm": 2.080462229184556, + "language_loss": 0.58062029, + "learning_rate": 2.0406643599863993e-09, + "loss": 0.60185581, + "num_input_tokens_seen": 353882240, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 16400, + "time_per_iteration": 2.4521939754486084 + }, + { + "auxiliary_loss_clip": 0.01105515, + "auxiliary_loss_mlp": 0.01028696, + "balance_loss_clip": 1.01613116, + "balance_loss_mlp": 1.03492236, + "epoch": 0.9860814670073651, + "flos": 19136078737920.0, + "grad_norm": 1.567548227092974, + "language_loss": 0.80283344, + "learning_rate": 2.023113299582491e-09, + "loss": 0.8241756, + "num_input_tokens_seen": 353901590, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.70703125, + "step": 16401, + "time_per_iteration": 2.4489645957946777 + }, + { + "auxiliary_loss_clip": 0.01097463, + "auxiliary_loss_mlp": 0.01034925, + "balance_loss_clip": 1.02253318, + "balance_loss_mlp": 1.03393412, + "epoch": 0.9861415902600331, + "flos": 17237050594560.0, + "grad_norm": 1.8340920908807528, + "language_loss": 0.77850628, + "learning_rate": 2.005638002662069e-09, + "loss": 0.79983014, + "num_input_tokens_seen": 353918785, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6328125, + "step": 16402, + "time_per_iteration": 2.440742015838623 + }, + { + "auxiliary_loss_clip": 0.01101709, + "auxiliary_loss_mlp": 0.01030608, + "balance_loss_clip": 1.01902652, + "balance_loss_mlp": 1.0353204, + "epoch": 0.986201713512701, + "flos": 27782577751680.0, + "grad_norm": 1.6254994621551133, + "language_loss": 0.69982457, + "learning_rate": 1.9882384698881596e-09, + "loss": 0.72114778, + "num_input_tokens_seen": 353940390, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.6640625, + "step": 16403, + "time_per_iteration": 2.531202554702759 + }, + { + "auxiliary_loss_clip": 0.01095747, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01799786, + "balance_loss_mlp": 1.03178513, + "epoch": 0.986261836765369, + "flos": 28730403884160.0, + "grad_norm": 3.811739920354137, + "language_loss": 0.74388409, + "learning_rate": 1.9709147019204566e-09, + "loss": 0.76512915, + "num_input_tokens_seen": 353962180, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 16404, + "time_per_iteration": 2.4980599880218506 + }, + { + "auxiliary_loss_clip": 0.01099419, + "auxiliary_loss_mlp": 0.01025517, + "balance_loss_clip": 1.01446664, + "balance_loss_mlp": 1.03424644, + "epoch": 0.986321960018037, + "flos": 34313471568000.0, + "grad_norm": 2.2063331228177026, + "language_loss": 0.7017042, + "learning_rate": 1.953666699415768e-09, + "loss": 0.72295356, + "num_input_tokens_seen": 353984305, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 16405, + "time_per_iteration": 2.5640861988067627 + }, + { + "auxiliary_loss_clip": 0.01098905, + "auxiliary_loss_mlp": 0.01032199, + "balance_loss_clip": 1.02158928, + "balance_loss_mlp": 1.03562474, + "epoch": 0.986382083270705, + "flos": 25189755436800.0, + "grad_norm": 1.9063114595152784, + "language_loss": 0.69724238, + "learning_rate": 1.93649446302846e-09, + "loss": 0.71855342, + "num_input_tokens_seen": 354004495, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 16406, + "time_per_iteration": 2.495396375656128 + }, + { + "auxiliary_loss_clip": 0.0109845, + "auxiliary_loss_mlp": 0.01033359, + "balance_loss_clip": 1.0219388, + "balance_loss_mlp": 1.03573644, + "epoch": 0.9864422065233729, + "flos": 11025904671360.0, + "grad_norm": 3.422415674377729, + "language_loss": 0.74666607, + "learning_rate": 1.9193979934095663e-09, + "loss": 0.76798415, + "num_input_tokens_seen": 354015985, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.62890625, + "step": 16407, + "time_per_iteration": 2.4423317909240723 + }, + { + "auxiliary_loss_clip": 0.01096271, + "auxiliary_loss_mlp": 0.01029776, + "balance_loss_clip": 1.01847541, + "balance_loss_mlp": 1.03227949, + "epoch": 0.9865023297760409, + "flos": 16545590807040.0, + "grad_norm": 2.0280682887317667, + "language_loss": 0.77168655, + "learning_rate": 1.9023772912072357e-09, + "loss": 0.79294705, + "num_input_tokens_seen": 354033260, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 16408, + "time_per_iteration": 2.516061782836914 + }, + { + "auxiliary_loss_clip": 0.01101714, + "auxiliary_loss_mlp": 0.01029351, + "balance_loss_clip": 1.01701331, + "balance_loss_mlp": 1.03434014, + "epoch": 0.9865624530287088, + "flos": 18880179269760.0, + "grad_norm": 1.6301786211339495, + "language_loss": 0.67791158, + "learning_rate": 1.8854323570669515e-09, + "loss": 0.69922221, + "num_input_tokens_seen": 354052825, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.671875, + "step": 16409, + "time_per_iteration": 2.4589552879333496 + }, + { + "auxiliary_loss_clip": 0.0102164, + "auxiliary_loss_mlp": 0.00999411, + "balance_loss_clip": 0.99843997, + "balance_loss_mlp": 1.00167465, + "epoch": 0.9866225762813768, + "flos": 68887798680960.0, + "grad_norm": 0.8043087350098772, + "language_loss": 0.61067098, + "learning_rate": 1.8685631916313118e-09, + "loss": 0.63088149, + "num_input_tokens_seen": 354113920, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16410, + "time_per_iteration": 3.1711127758026123 + }, + { + "auxiliary_loss_clip": 0.01099821, + "auxiliary_loss_mlp": 0.01029496, + "balance_loss_clip": 1.0180335, + "balance_loss_mlp": 1.03321028, + "epoch": 0.9866826995340447, + "flos": 29023111814400.0, + "grad_norm": 2.895263925191816, + "language_loss": 0.66438043, + "learning_rate": 1.8517697955400258e-09, + "loss": 0.68567365, + "num_input_tokens_seen": 354134210, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 16411, + "time_per_iteration": 2.529005765914917 + }, + { + "auxiliary_loss_clip": 0.01021481, + "auxiliary_loss_mlp": 0.01002904, + "balance_loss_clip": 1.00196195, + "balance_loss_mlp": 1.00161529, + "epoch": 0.9867428227867128, + "flos": 65376814867200.0, + "grad_norm": 0.7221177127578288, + "language_loss": 0.56282055, + "learning_rate": 1.8350521694299182e-09, + "loss": 0.58306438, + "num_input_tokens_seen": 354198010, + "router_z_loss_clip": 0.00939941, + "router_z_loss_mlp": 0.19921875, + "step": 16412, + "time_per_iteration": 3.1342015266418457 + }, + { + "auxiliary_loss_clip": 0.01102714, + "auxiliary_loss_mlp": 0.01031697, + "balance_loss_clip": 1.01949, + "balance_loss_mlp": 1.0351032, + "epoch": 0.9868029460393807, + "flos": 26506312634880.0, + "grad_norm": 1.5575339262302221, + "language_loss": 0.73079598, + "learning_rate": 1.818410313934926e-09, + "loss": 0.75214005, + "num_input_tokens_seen": 354220000, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.67578125, + "step": 16413, + "time_per_iteration": 2.504788637161255 + }, + { + "auxiliary_loss_clip": 0.01098204, + "auxiliary_loss_mlp": 0.01026072, + "balance_loss_clip": 1.01468778, + "balance_loss_mlp": 1.03174376, + "epoch": 0.9868630692920487, + "flos": 22967280299520.0, + "grad_norm": 1.7588665592045418, + "language_loss": 0.71731371, + "learning_rate": 1.8018442296858782e-09, + "loss": 0.7385565, + "num_input_tokens_seen": 354240910, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6640625, + "step": 16414, + "time_per_iteration": 2.459226369857788 + }, + { + "auxiliary_loss_clip": 0.01097317, + "auxiliary_loss_mlp": 0.01030344, + "balance_loss_clip": 1.01959157, + "balance_loss_mlp": 1.03502083, + "epoch": 0.9869231925447167, + "flos": 19828687760640.0, + "grad_norm": 1.8732802081959814, + "language_loss": 0.70089632, + "learning_rate": 1.7853539173111608e-09, + "loss": 0.72217298, + "num_input_tokens_seen": 354259430, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 16415, + "time_per_iteration": 2.4389734268188477 + }, + { + "auxiliary_loss_clip": 0.01093108, + "auxiliary_loss_mlp": 0.01024293, + "balance_loss_clip": 1.01405334, + "balance_loss_mlp": 1.03190827, + "epoch": 0.9869833157973846, + "flos": 20195228096640.0, + "grad_norm": 1.436678023164937, + "language_loss": 0.75416452, + "learning_rate": 1.7689393774362737e-09, + "loss": 0.77533853, + "num_input_tokens_seen": 354279490, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.61328125, + "step": 16416, + "time_per_iteration": 2.4589333534240723 + }, + { + "auxiliary_loss_clip": 0.01098366, + "auxiliary_loss_mlp": 0.01030917, + "balance_loss_clip": 1.01936018, + "balance_loss_mlp": 1.03430641, + "epoch": 0.9870434390500527, + "flos": 16099507802880.0, + "grad_norm": 2.5382717093176907, + "language_loss": 0.70592904, + "learning_rate": 1.7526006106833858e-09, + "loss": 0.72722185, + "num_input_tokens_seen": 354295080, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.640625, + "step": 16417, + "time_per_iteration": 2.411501169204712 + }, + { + "auxiliary_loss_clip": 0.01104486, + "auxiliary_loss_mlp": 0.01031386, + "balance_loss_clip": 1.01942897, + "balance_loss_mlp": 1.036502, + "epoch": 0.9871035623027206, + "flos": 21760753438080.0, + "grad_norm": 1.3664906698719754, + "language_loss": 0.70402956, + "learning_rate": 1.7363376176720013e-09, + "loss": 0.72538829, + "num_input_tokens_seen": 354314610, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6796875, + "step": 16418, + "time_per_iteration": 2.460721731185913 + }, + { + "auxiliary_loss_clip": 0.0102156, + "auxiliary_loss_mlp": 0.0099861, + "balance_loss_clip": 0.99766254, + "balance_loss_mlp": 1.00155318, + "epoch": 0.9871636855553886, + "flos": 70219583245440.0, + "grad_norm": 0.6547409033160193, + "language_loss": 0.53709066, + "learning_rate": 1.7201503990189603e-09, + "loss": 0.55729234, + "num_input_tokens_seen": 354383115, + "router_z_loss_clip": 0.00946045, + "router_z_loss_mlp": 0.20019531, + "step": 16419, + "time_per_iteration": 3.1816153526306152 + }, + { + "auxiliary_loss_clip": 0.01102162, + "auxiliary_loss_mlp": 0.01033112, + "balance_loss_clip": 1.0203383, + "balance_loss_mlp": 1.03322339, + "epoch": 0.9872238088080565, + "flos": 25045825639680.0, + "grad_norm": 1.8013568120326042, + "language_loss": 0.78115129, + "learning_rate": 1.7040389553382162e-09, + "loss": 0.80250394, + "num_input_tokens_seen": 354403115, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.69140625, + "step": 16420, + "time_per_iteration": 2.513073682785034 + }, + { + "auxiliary_loss_clip": 0.01100847, + "auxiliary_loss_mlp": 0.01026668, + "balance_loss_clip": 1.01531947, + "balance_loss_mlp": 1.03717494, + "epoch": 0.9872839320607245, + "flos": 19465846525440.0, + "grad_norm": 1.6534906629377784, + "language_loss": 0.70953268, + "learning_rate": 1.6880032872403916e-09, + "loss": 0.73080778, + "num_input_tokens_seen": 354424520, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.63671875, + "step": 16421, + "time_per_iteration": 2.478576898574829 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01034617, + "balance_loss_clip": 1.02187395, + "balance_loss_mlp": 1.03412378, + "epoch": 0.9873440553133924, + "flos": 26942914448640.0, + "grad_norm": 1.9879851730587292, + "language_loss": 0.82305312, + "learning_rate": 1.6720433953338886e-09, + "loss": 0.8444199, + "num_input_tokens_seen": 354444800, + "router_z_loss_clip": 0.12792969, + "router_z_loss_mlp": 0.6796875, + "step": 16422, + "time_per_iteration": 2.5021886825561523 + }, + { + "auxiliary_loss_clip": 0.01098518, + "auxiliary_loss_mlp": 0.01025282, + "balance_loss_clip": 1.01423693, + "balance_loss_mlp": 1.03484821, + "epoch": 0.9874041785660604, + "flos": 19062210418560.0, + "grad_norm": 1.7189782586075049, + "language_loss": 0.86038244, + "learning_rate": 1.656159280223779e-09, + "loss": 0.88162035, + "num_input_tokens_seen": 354464590, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.63671875, + "step": 16423, + "time_per_iteration": 2.485748052597046 + }, + { + "auxiliary_loss_clip": 0.01102025, + "auxiliary_loss_mlp": 0.0102766, + "balance_loss_clip": 1.016186, + "balance_loss_mlp": 1.03556752, + "epoch": 0.9874643018187284, + "flos": 21105814803840.0, + "grad_norm": 2.8325177104829575, + "language_loss": 0.70638502, + "learning_rate": 1.6403509425122475e-09, + "loss": 0.72768188, + "num_input_tokens_seen": 354484145, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 16424, + "time_per_iteration": 2.4713919162750244 + }, + { + "auxiliary_loss_clip": 0.01099037, + "auxiliary_loss_mlp": 0.01029034, + "balance_loss_clip": 1.0172801, + "balance_loss_mlp": 1.03294778, + "epoch": 0.9875244250713964, + "flos": 24426043441920.0, + "grad_norm": 1.9805103537761688, + "language_loss": 0.80257469, + "learning_rate": 1.6246183827990366e-09, + "loss": 0.8238554, + "num_input_tokens_seen": 354502475, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 16425, + "time_per_iteration": 2.477077007293701 + }, + { + "auxiliary_loss_clip": 0.01100313, + "auxiliary_loss_mlp": 0.01030155, + "balance_loss_clip": 1.01812065, + "balance_loss_mlp": 1.03379631, + "epoch": 0.9875845483240643, + "flos": 25117610970240.0, + "grad_norm": 1.9649003604605135, + "language_loss": 0.79694617, + "learning_rate": 1.6089616016803364e-09, + "loss": 0.81825078, + "num_input_tokens_seen": 354521855, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6640625, + "step": 16426, + "time_per_iteration": 2.47365403175354 + }, + { + "auxiliary_loss_clip": 0.01100099, + "auxiliary_loss_mlp": 0.01032306, + "balance_loss_clip": 1.02130342, + "balance_loss_mlp": 1.03583455, + "epoch": 0.9876446715767323, + "flos": 16581788737920.0, + "grad_norm": 1.919968888964341, + "language_loss": 0.84918183, + "learning_rate": 1.593380599750338e-09, + "loss": 0.87050593, + "num_input_tokens_seen": 354539535, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 16427, + "time_per_iteration": 2.4481728076934814 + }, + { + "auxiliary_loss_clip": 0.01097771, + "auxiliary_loss_mlp": 0.01031649, + "balance_loss_clip": 1.02047956, + "balance_loss_mlp": 1.03433597, + "epoch": 0.9877047948294003, + "flos": 21616141282560.0, + "grad_norm": 1.808835451302429, + "language_loss": 0.70217133, + "learning_rate": 1.577875377599458e-09, + "loss": 0.72346556, + "num_input_tokens_seen": 354557430, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.6328125, + "step": 16428, + "time_per_iteration": 3.8493616580963135 + }, + { + "auxiliary_loss_clip": 0.01096844, + "auxiliary_loss_mlp": 0.01031684, + "balance_loss_clip": 1.02038908, + "balance_loss_mlp": 1.03368545, + "epoch": 0.9877649180820682, + "flos": 21178497974400.0, + "grad_norm": 1.9142096733438485, + "language_loss": 0.79910493, + "learning_rate": 1.5624459358158926e-09, + "loss": 0.82039022, + "num_input_tokens_seen": 354574735, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6328125, + "step": 16429, + "time_per_iteration": 2.4763388633728027 + }, + { + "auxiliary_loss_clip": 0.01097307, + "auxiliary_loss_mlp": 0.01026506, + "balance_loss_clip": 1.01552033, + "balance_loss_mlp": 1.03279054, + "epoch": 0.9878250413347363, + "flos": 39749233576320.0, + "grad_norm": 1.512921455158019, + "language_loss": 0.61957049, + "learning_rate": 1.5470922749845073e-09, + "loss": 0.64080858, + "num_input_tokens_seen": 354597050, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16430, + "time_per_iteration": 2.6241238117218018 + }, + { + "auxiliary_loss_clip": 0.01099232, + "auxiliary_loss_mlp": 0.01031942, + "balance_loss_clip": 1.02070665, + "balance_loss_mlp": 1.03415561, + "epoch": 0.9878851645874042, + "flos": 29425634599680.0, + "grad_norm": 1.4649628696556245, + "language_loss": 0.72812045, + "learning_rate": 1.531814395687725e-09, + "loss": 0.74943221, + "num_input_tokens_seen": 354619095, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.65234375, + "step": 16431, + "time_per_iteration": 5.483947038650513 + }, + { + "auxiliary_loss_clip": 0.01099159, + "auxiliary_loss_mlp": 0.01031109, + "balance_loss_clip": 1.01914072, + "balance_loss_mlp": 1.03423476, + "epoch": 0.9879452878400722, + "flos": 15806261168640.0, + "grad_norm": 2.237292179691481, + "language_loss": 0.81017017, + "learning_rate": 1.5166122985048602e-09, + "loss": 0.83147275, + "num_input_tokens_seen": 354633790, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.6484375, + "step": 16432, + "time_per_iteration": 2.508455753326416 + }, + { + "auxiliary_loss_clip": 0.0109516, + "auxiliary_loss_mlp": 0.01030559, + "balance_loss_clip": 1.02015758, + "balance_loss_mlp": 1.0320065, + "epoch": 0.9880054110927401, + "flos": 22233912318720.0, + "grad_norm": 1.573439627141087, + "language_loss": 0.80520278, + "learning_rate": 1.5014859840123405e-09, + "loss": 0.82645994, + "num_input_tokens_seen": 354653180, + "router_z_loss_clip": 0.10400391, + "router_z_loss_mlp": 0.6328125, + "step": 16433, + "time_per_iteration": 2.4864025115966797 + }, + { + "auxiliary_loss_clip": 0.01096891, + "auxiliary_loss_mlp": 0.01032717, + "balance_loss_clip": 1.0212785, + "balance_loss_mlp": 1.03420711, + "epoch": 0.9880655343454081, + "flos": 28763836467840.0, + "grad_norm": 2.5851273192510784, + "language_loss": 0.64777255, + "learning_rate": 1.4864354527837075e-09, + "loss": 0.66906863, + "num_input_tokens_seen": 354669900, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.625, + "step": 16434, + "time_per_iteration": 2.5199642181396484 + }, + { + "auxiliary_loss_clip": 0.010991, + "auxiliary_loss_mlp": 0.01029842, + "balance_loss_clip": 1.017802, + "balance_loss_mlp": 1.03237224, + "epoch": 0.988125657598076, + "flos": 32853379622400.0, + "grad_norm": 1.5409302130170526, + "language_loss": 0.69133604, + "learning_rate": 1.4714607053896154e-09, + "loss": 0.71262544, + "num_input_tokens_seen": 354693165, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.66796875, + "step": 16435, + "time_per_iteration": 2.568521499633789 + }, + { + "auxiliary_loss_clip": 0.01100032, + "auxiliary_loss_mlp": 0.01034501, + "balance_loss_clip": 1.02277112, + "balance_loss_mlp": 1.03586268, + "epoch": 0.988185780850744, + "flos": 19390685316480.0, + "grad_norm": 1.8027754000031349, + "language_loss": 0.75371569, + "learning_rate": 1.4565617423980548e-09, + "loss": 0.77506101, + "num_input_tokens_seen": 354711915, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.640625, + "step": 16436, + "time_per_iteration": 3.9403867721557617 + }, + { + "auxiliary_loss_clip": 0.01100488, + "auxiliary_loss_mlp": 0.01029817, + "balance_loss_clip": 1.01765811, + "balance_loss_mlp": 1.03544521, + "epoch": 0.988245904103412, + "flos": 22528415928960.0, + "grad_norm": 2.2611545636080606, + "language_loss": 0.74154097, + "learning_rate": 1.4417385643741286e-09, + "loss": 0.76284397, + "num_input_tokens_seen": 354729135, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.65234375, + "step": 16437, + "time_per_iteration": 2.4243505001068115 + }, + { + "auxiliary_loss_clip": 0.01094253, + "auxiliary_loss_mlp": 0.01028978, + "balance_loss_clip": 1.01777816, + "balance_loss_mlp": 1.03196597, + "epoch": 0.98830602735608, + "flos": 28659193171200.0, + "grad_norm": 1.57503861235398, + "language_loss": 0.60063571, + "learning_rate": 1.4269911718796103e-09, + "loss": 0.62186807, + "num_input_tokens_seen": 354752530, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62109375, + "step": 16438, + "time_per_iteration": 2.519336223602295 + }, + { + "auxiliary_loss_clip": 0.01098767, + "auxiliary_loss_mlp": 0.01028757, + "balance_loss_clip": 1.01675236, + "balance_loss_mlp": 1.03432131, + "epoch": 0.9883661506087479, + "flos": 20996035862400.0, + "grad_norm": 1.9343189571400579, + "language_loss": 0.71689999, + "learning_rate": 1.4123195654738295e-09, + "loss": 0.73817527, + "num_input_tokens_seen": 354771135, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.64453125, + "step": 16439, + "time_per_iteration": 2.429018974304199 + }, + { + "auxiliary_loss_clip": 0.01097636, + "auxiliary_loss_mlp": 0.01031286, + "balance_loss_clip": 1.01943684, + "balance_loss_mlp": 1.03360188, + "epoch": 0.9884262738614159, + "flos": 32706109860480.0, + "grad_norm": 1.524025495504474, + "language_loss": 0.60003507, + "learning_rate": 1.3977237457134528e-09, + "loss": 0.6213243, + "num_input_tokens_seen": 354791800, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.640625, + "step": 16440, + "time_per_iteration": 2.5483033657073975 + }, + { + "auxiliary_loss_clip": 0.0110006, + "auxiliary_loss_mlp": 0.0102916, + "balance_loss_clip": 1.01797223, + "balance_loss_mlp": 1.03258252, + "epoch": 0.9884863971140839, + "flos": 17564699479680.0, + "grad_norm": 2.3344058610035954, + "language_loss": 0.75737202, + "learning_rate": 1.3832037131513707e-09, + "loss": 0.77866423, + "num_input_tokens_seen": 354809200, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.67578125, + "step": 16441, + "time_per_iteration": 2.4178669452667236 + }, + { + "auxiliary_loss_clip": 0.01098798, + "auxiliary_loss_mlp": 0.01026943, + "balance_loss_clip": 1.01528406, + "balance_loss_mlp": 1.03330851, + "epoch": 0.9885465203667518, + "flos": 40552519380480.0, + "grad_norm": 2.8695336185475675, + "language_loss": 0.68061352, + "learning_rate": 1.3687594683386982e-09, + "loss": 0.70187092, + "num_input_tokens_seen": 354829945, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16442, + "time_per_iteration": 2.6177468299865723 + }, + { + "auxiliary_loss_clip": 0.01098647, + "auxiliary_loss_mlp": 0.01028235, + "balance_loss_clip": 1.01696944, + "balance_loss_mlp": 1.03386927, + "epoch": 0.9886066436194199, + "flos": 13807976768640.0, + "grad_norm": 2.091114681520994, + "language_loss": 0.74713242, + "learning_rate": 1.3543910118227753e-09, + "loss": 0.76840127, + "num_input_tokens_seen": 354845055, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 16443, + "time_per_iteration": 2.386375904083252 + }, + { + "auxiliary_loss_clip": 0.01100478, + "auxiliary_loss_mlp": 0.0102655, + "balance_loss_clip": 1.01426518, + "balance_loss_mlp": 1.03359616, + "epoch": 0.9886667668720878, + "flos": 23325129544320.0, + "grad_norm": 1.669549874757317, + "language_loss": 0.73382336, + "learning_rate": 1.3400983441487213e-09, + "loss": 0.75509363, + "num_input_tokens_seen": 354864680, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66796875, + "step": 16444, + "time_per_iteration": 2.4635965824127197 + }, + { + "auxiliary_loss_clip": 0.01099254, + "auxiliary_loss_mlp": 0.01029277, + "balance_loss_clip": 1.01845896, + "balance_loss_mlp": 1.03630447, + "epoch": 0.9887268901247558, + "flos": 22706029704960.0, + "grad_norm": 1.8001985272034744, + "language_loss": 0.69300127, + "learning_rate": 1.325881465858547e-09, + "loss": 0.71428657, + "num_input_tokens_seen": 354885685, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.62890625, + "step": 16445, + "time_per_iteration": 2.4620094299316406 + }, + { + "auxiliary_loss_clip": 0.01100718, + "auxiliary_loss_mlp": 0.01024609, + "balance_loss_clip": 1.01272345, + "balance_loss_mlp": 1.03484011, + "epoch": 0.9887870133774237, + "flos": 13041283944960.0, + "grad_norm": 3.924942005630186, + "language_loss": 0.60178292, + "learning_rate": 1.311740377491155e-09, + "loss": 0.62303621, + "num_input_tokens_seen": 354901505, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66015625, + "step": 16446, + "time_per_iteration": 2.4125113487243652 + }, + { + "auxiliary_loss_clip": 0.01097171, + "auxiliary_loss_mlp": 0.01031894, + "balance_loss_clip": 1.02105784, + "balance_loss_mlp": 1.03262711, + "epoch": 0.9888471366300917, + "flos": 15158864390400.0, + "grad_norm": 2.0039707237348914, + "language_loss": 0.7062999, + "learning_rate": 1.297675079582783e-09, + "loss": 0.72759056, + "num_input_tokens_seen": 354920060, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.64453125, + "step": 16447, + "time_per_iteration": 2.428260564804077 + }, + { + "auxiliary_loss_clip": 0.01097888, + "auxiliary_loss_mlp": 0.01026915, + "balance_loss_clip": 1.0161624, + "balance_loss_mlp": 1.03387737, + "epoch": 0.9889072598827596, + "flos": 25118796119040.0, + "grad_norm": 1.8113206322833593, + "language_loss": 0.83943892, + "learning_rate": 1.2836855726667818e-09, + "loss": 0.8606869, + "num_input_tokens_seen": 354938690, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.640625, + "step": 16448, + "time_per_iteration": 2.495060443878174 + }, + { + "auxiliary_loss_clip": 0.01095164, + "auxiliary_loss_mlp": 0.01024451, + "balance_loss_clip": 1.0142343, + "balance_loss_mlp": 1.03284883, + "epoch": 0.9889673831354276, + "flos": 16728663450240.0, + "grad_norm": 1.6079355233530224, + "language_loss": 0.7015419, + "learning_rate": 1.26977185727406e-09, + "loss": 0.72273797, + "num_input_tokens_seen": 354956955, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.625, + "step": 16449, + "time_per_iteration": 2.5300204753875732 + }, + { + "auxiliary_loss_clip": 0.01100835, + "auxiliary_loss_mlp": 0.0102704, + "balance_loss_clip": 1.01573348, + "balance_loss_mlp": 1.03388381, + "epoch": 0.9890275063880956, + "flos": 35585175657600.0, + "grad_norm": 2.23472735213177, + "language_loss": 0.74104172, + "learning_rate": 1.25593393393153e-09, + "loss": 0.76232046, + "num_input_tokens_seen": 354976800, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.671875, + "step": 16450, + "time_per_iteration": 2.5563437938690186 + }, + { + "auxiliary_loss_clip": 0.0110011, + "auxiliary_loss_mlp": 0.01030912, + "balance_loss_clip": 1.01878238, + "balance_loss_mlp": 1.03238416, + "epoch": 0.9890876296407636, + "flos": 18952359649920.0, + "grad_norm": 2.0590555415637493, + "language_loss": 0.79410666, + "learning_rate": 1.242171803164549e-09, + "loss": 0.81541693, + "num_input_tokens_seen": 354996625, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 16451, + "time_per_iteration": 2.4256367683410645 + }, + { + "auxiliary_loss_clip": 0.01099003, + "auxiliary_loss_mlp": 0.01036805, + "balance_loss_clip": 1.02487206, + "balance_loss_mlp": 1.0322752, + "epoch": 0.9891477528934315, + "flos": 23769309127680.0, + "grad_norm": 1.9104470417388077, + "language_loss": 0.6977967, + "learning_rate": 1.2284854654946996e-09, + "loss": 0.71915483, + "num_input_tokens_seen": 355014535, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16452, + "time_per_iteration": 2.4743566513061523 + }, + { + "auxiliary_loss_clip": 0.01096477, + "auxiliary_loss_mlp": 0.01023142, + "balance_loss_clip": 1.01255608, + "balance_loss_mlp": 1.03474927, + "epoch": 0.9892078761460995, + "flos": 20772922533120.0, + "grad_norm": 1.8258741386751924, + "language_loss": 0.73913336, + "learning_rate": 1.2148749214409004e-09, + "loss": 0.7603296, + "num_input_tokens_seen": 355033280, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6171875, + "step": 16453, + "time_per_iteration": 2.4414727687835693 + }, + { + "auxiliary_loss_clip": 0.01098548, + "auxiliary_loss_mlp": 0.01034916, + "balance_loss_clip": 1.0235076, + "balance_loss_mlp": 1.0325352, + "epoch": 0.9892679993987675, + "flos": 23367827836800.0, + "grad_norm": 2.157949205702443, + "language_loss": 0.69432741, + "learning_rate": 1.2013401715191828e-09, + "loss": 0.71566206, + "num_input_tokens_seen": 355053320, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.66015625, + "step": 16454, + "time_per_iteration": 2.470310926437378 + }, + { + "auxiliary_loss_clip": 0.01095091, + "auxiliary_loss_mlp": 0.01029019, + "balance_loss_clip": 1.01800466, + "balance_loss_mlp": 1.03340435, + "epoch": 0.9893281226514354, + "flos": 22705419173760.0, + "grad_norm": 1.9739379495934455, + "language_loss": 0.75967795, + "learning_rate": 1.1878812162433583e-09, + "loss": 0.78091908, + "num_input_tokens_seen": 355070230, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6171875, + "step": 16455, + "time_per_iteration": 2.481872797012329 + }, + { + "auxiliary_loss_clip": 0.01096584, + "auxiliary_loss_mlp": 0.01024963, + "balance_loss_clip": 1.01354313, + "balance_loss_mlp": 1.03333139, + "epoch": 0.9893882459041035, + "flos": 21796664060160.0, + "grad_norm": 1.6907179188654564, + "language_loss": 0.65590852, + "learning_rate": 1.1744980561230188e-09, + "loss": 0.67712402, + "num_input_tokens_seen": 355090125, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.6328125, + "step": 16456, + "time_per_iteration": 2.4589202404022217 + }, + { + "auxiliary_loss_clip": 0.01101842, + "auxiliary_loss_mlp": 0.01026745, + "balance_loss_clip": 1.01516438, + "balance_loss_mlp": 1.03582501, + "epoch": 0.9894483691567714, + "flos": 18113773754880.0, + "grad_norm": 1.6581165178501178, + "language_loss": 0.7385301, + "learning_rate": 1.161190691666203e-09, + "loss": 0.75981599, + "num_input_tokens_seen": 355107890, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 16457, + "time_per_iteration": 2.409771203994751 + }, + { + "auxiliary_loss_clip": 0.01100137, + "auxiliary_loss_mlp": 0.01026342, + "balance_loss_clip": 1.0151962, + "balance_loss_mlp": 1.03518713, + "epoch": 0.9895084924094394, + "flos": 31211615664000.0, + "grad_norm": 2.0309095473748253, + "language_loss": 0.68817085, + "learning_rate": 1.1479591233773954e-09, + "loss": 0.70943564, + "num_input_tokens_seen": 355126340, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6484375, + "step": 16458, + "time_per_iteration": 2.502516269683838 + }, + { + "auxiliary_loss_clip": 0.01095123, + "auxiliary_loss_mlp": 0.0102721, + "balance_loss_clip": 1.01597428, + "balance_loss_mlp": 1.03228736, + "epoch": 0.9895686156621073, + "flos": 19678042120320.0, + "grad_norm": 1.6352367515725288, + "language_loss": 0.79176056, + "learning_rate": 1.1348033517581956e-09, + "loss": 0.81298381, + "num_input_tokens_seen": 355144025, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.62890625, + "step": 16459, + "time_per_iteration": 2.4237966537475586 + }, + { + "auxiliary_loss_clip": 0.01098841, + "auxiliary_loss_mlp": 0.01031369, + "balance_loss_clip": 1.02002609, + "balance_loss_mlp": 1.03269553, + "epoch": 0.9896287389147753, + "flos": 23581675457280.0, + "grad_norm": 1.972457674640829, + "language_loss": 0.71052337, + "learning_rate": 1.1217233773075373e-09, + "loss": 0.73182547, + "num_input_tokens_seen": 355163125, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 16460, + "time_per_iteration": 2.437721014022827 + }, + { + "auxiliary_loss_clip": 0.01100449, + "auxiliary_loss_mlp": 0.0102517, + "balance_loss_clip": 1.01348758, + "balance_loss_mlp": 1.03307641, + "epoch": 0.9896888621674432, + "flos": 29605331364480.0, + "grad_norm": 1.5309126749149615, + "language_loss": 0.87348777, + "learning_rate": 1.1087192005214685e-09, + "loss": 0.89474398, + "num_input_tokens_seen": 355184060, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.671875, + "step": 16461, + "time_per_iteration": 2.491044759750366 + }, + { + "auxiliary_loss_clip": 0.01098778, + "auxiliary_loss_mlp": 0.01030685, + "balance_loss_clip": 1.01894927, + "balance_loss_mlp": 1.03391469, + "epoch": 0.9897489854201112, + "flos": 23695045758720.0, + "grad_norm": 2.5517214726118924, + "language_loss": 0.63009971, + "learning_rate": 1.09579082189315e-09, + "loss": 0.65139437, + "num_input_tokens_seen": 355204505, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 16462, + "time_per_iteration": 2.462428569793701 + }, + { + "auxiliary_loss_clip": 0.01101282, + "auxiliary_loss_mlp": 0.01028389, + "balance_loss_clip": 1.01751733, + "balance_loss_mlp": 1.03612256, + "epoch": 0.9898091086727792, + "flos": 13225146687360.0, + "grad_norm": 1.5403701602068196, + "language_loss": 0.72850609, + "learning_rate": 1.0829382419126343e-09, + "loss": 0.74980283, + "num_input_tokens_seen": 355223055, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.65234375, + "step": 16463, + "time_per_iteration": 2.419002056121826 + }, + { + "auxiliary_loss_clip": 0.01097709, + "auxiliary_loss_mlp": 0.0102841, + "balance_loss_clip": 1.01629817, + "balance_loss_mlp": 1.03312826, + "epoch": 0.9898692319254472, + "flos": 22930400010240.0, + "grad_norm": 1.7490706399263698, + "language_loss": 0.70085156, + "learning_rate": 1.0701614610675314e-09, + "loss": 0.72211272, + "num_input_tokens_seen": 355242000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.64453125, + "step": 16464, + "time_per_iteration": 2.4515798091888428 + }, + { + "auxiliary_loss_clip": 0.01099892, + "auxiliary_loss_mlp": 0.01029848, + "balance_loss_clip": 1.01814127, + "balance_loss_mlp": 1.03332138, + "epoch": 0.9899293551781151, + "flos": 12458346122880.0, + "grad_norm": 2.0677528811720993, + "language_loss": 0.73172307, + "learning_rate": 1.0574604798421204e-09, + "loss": 0.75302052, + "num_input_tokens_seen": 355260175, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16465, + "time_per_iteration": 2.5083959102630615 + }, + { + "auxiliary_loss_clip": 0.01095532, + "auxiliary_loss_mlp": 0.01030929, + "balance_loss_clip": 1.02051592, + "balance_loss_mlp": 1.03158927, + "epoch": 0.9899894784307831, + "flos": 26871129118080.0, + "grad_norm": 1.8661862242505183, + "language_loss": 0.86434472, + "learning_rate": 1.0448352987182386e-09, + "loss": 0.88560927, + "num_input_tokens_seen": 355281930, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.640625, + "step": 16466, + "time_per_iteration": 2.516256809234619 + }, + { + "auxiliary_loss_clip": 0.01099567, + "auxiliary_loss_mlp": 0.01023106, + "balance_loss_clip": 1.01206732, + "balance_loss_mlp": 1.03457332, + "epoch": 0.990049601683451, + "flos": 21542093395200.0, + "grad_norm": 1.786963509372796, + "language_loss": 0.71397775, + "learning_rate": 1.0322859181743915e-09, + "loss": 0.73520446, + "num_input_tokens_seen": 355301555, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16467, + "time_per_iteration": 2.479933738708496 + }, + { + "auxiliary_loss_clip": 0.0109761, + "auxiliary_loss_mlp": 0.01027504, + "balance_loss_clip": 1.0166316, + "balance_loss_mlp": 1.03384113, + "epoch": 0.990109724936119, + "flos": 28771809287040.0, + "grad_norm": 1.2832078969195513, + "language_loss": 0.6496833, + "learning_rate": 1.019812338686643e-09, + "loss": 0.67093444, + "num_input_tokens_seen": 355324925, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.63671875, + "step": 16468, + "time_per_iteration": 2.5504212379455566 + }, + { + "auxiliary_loss_clip": 0.01102252, + "auxiliary_loss_mlp": 0.0102875, + "balance_loss_clip": 1.01737785, + "balance_loss_mlp": 1.03365922, + "epoch": 0.9901698481887871, + "flos": 29274270687360.0, + "grad_norm": 1.6640074455423066, + "language_loss": 0.61527658, + "learning_rate": 1.0074145607281704e-09, + "loss": 0.63658667, + "num_input_tokens_seen": 355343875, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.68359375, + "step": 16469, + "time_per_iteration": 2.4935665130615234 + }, + { + "auxiliary_loss_clip": 0.01101977, + "auxiliary_loss_mlp": 0.01026957, + "balance_loss_clip": 1.01494646, + "balance_loss_mlp": 1.0341469, + "epoch": 0.990229971441455, + "flos": 15959025711360.0, + "grad_norm": 2.656192366065704, + "language_loss": 0.70006144, + "learning_rate": 9.950925847685976e-10, + "loss": 0.72135079, + "num_input_tokens_seen": 355358835, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 16470, + "time_per_iteration": 3.8489348888397217 + }, + { + "auxiliary_loss_clip": 0.01021289, + "auxiliary_loss_mlp": 0.0100051, + "balance_loss_clip": 0.99951476, + "balance_loss_mlp": 1.00131559, + "epoch": 0.990290094694123, + "flos": 69780287911680.0, + "grad_norm": 0.6697157586994648, + "language_loss": 0.55488944, + "learning_rate": 9.828464112755509e-10, + "loss": 0.57510746, + "num_input_tokens_seen": 355431225, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 16471, + "time_per_iteration": 3.2679431438446045 + }, + { + "auxiliary_loss_clip": 0.0110021, + "auxiliary_loss_mlp": 0.01030858, + "balance_loss_clip": 1.01922309, + "balance_loss_mlp": 1.03532815, + "epoch": 0.9903502179467909, + "flos": 16252451913600.0, + "grad_norm": 7.34429735890774, + "language_loss": 0.83630276, + "learning_rate": 9.706760407131032e-10, + "loss": 0.85761344, + "num_input_tokens_seen": 355448250, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6484375, + "step": 16472, + "time_per_iteration": 3.8357088565826416 + }, + { + "auxiliary_loss_clip": 0.01098877, + "auxiliary_loss_mlp": 0.01026717, + "balance_loss_clip": 1.01551747, + "balance_loss_mlp": 1.03430748, + "epoch": 0.9904103411994589, + "flos": 21688393489920.0, + "grad_norm": 1.9915784555897358, + "language_loss": 0.8572318, + "learning_rate": 9.585814735431075e-10, + "loss": 0.87848771, + "num_input_tokens_seen": 355467040, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.64453125, + "step": 16473, + "time_per_iteration": 3.857786178588867 + }, + { + "auxiliary_loss_clip": 0.01096247, + "auxiliary_loss_mlp": 0.01028542, + "balance_loss_clip": 1.01799798, + "balance_loss_mlp": 1.03188705, + "epoch": 0.9904704644521268, + "flos": 25739440243200.0, + "grad_norm": 1.6814272861208508, + "language_loss": 0.84478509, + "learning_rate": 9.465627102240859e-10, + "loss": 0.86603308, + "num_input_tokens_seen": 355487825, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.64453125, + "step": 16474, + "time_per_iteration": 2.5232393741607666 + }, + { + "auxiliary_loss_clip": 0.01094689, + "auxiliary_loss_mlp": 0.010346, + "balance_loss_clip": 1.02355504, + "balance_loss_mlp": 1.02992404, + "epoch": 0.9905305877047949, + "flos": 21908346422400.0, + "grad_norm": 1.7352826066562033, + "language_loss": 0.76060629, + "learning_rate": 9.346197512116738e-10, + "loss": 0.78189915, + "num_input_tokens_seen": 355507445, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16475, + "time_per_iteration": 2.490631341934204 + }, + { + "auxiliary_loss_clip": 0.01096946, + "auxiliary_loss_mlp": 0.01030426, + "balance_loss_clip": 1.01866627, + "balance_loss_mlp": 1.03057003, + "epoch": 0.9905907109574628, + "flos": 21392417422080.0, + "grad_norm": 1.4602935837993765, + "language_loss": 0.7602495, + "learning_rate": 9.227525969588423e-10, + "loss": 0.78152329, + "num_input_tokens_seen": 355527205, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 16476, + "time_per_iteration": 2.4616878032684326 + }, + { + "auxiliary_loss_clip": 0.01102543, + "auxiliary_loss_mlp": 0.01026095, + "balance_loss_clip": 1.01298785, + "balance_loss_mlp": 1.03349328, + "epoch": 0.9906508342101308, + "flos": 20521620005760.0, + "grad_norm": 2.177621847035773, + "language_loss": 0.67150068, + "learning_rate": 9.109612479154538e-10, + "loss": 0.69278705, + "num_input_tokens_seen": 355544740, + "router_z_loss_clip": 0.13085938, + "router_z_loss_mlp": 0.6875, + "step": 16477, + "time_per_iteration": 2.4365179538726807 + }, + { + "auxiliary_loss_clip": 0.01104342, + "auxiliary_loss_mlp": 0.01031681, + "balance_loss_clip": 1.01949191, + "balance_loss_mlp": 1.03697991, + "epoch": 0.9907109574627987, + "flos": 21361211481600.0, + "grad_norm": 2.8661825575863564, + "language_loss": 0.71520579, + "learning_rate": 8.992457045289282e-10, + "loss": 0.73656601, + "num_input_tokens_seen": 355564385, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.671875, + "step": 16478, + "time_per_iteration": 4.016811847686768 + }, + { + "auxiliary_loss_clip": 0.01099828, + "auxiliary_loss_mlp": 0.01036886, + "balance_loss_clip": 1.02429771, + "balance_loss_mlp": 1.03379405, + "epoch": 0.9907710807154667, + "flos": 17338605321600.0, + "grad_norm": 2.5601168693900895, + "language_loss": 0.81092632, + "learning_rate": 8.876059672433545e-10, + "loss": 0.83229345, + "num_input_tokens_seen": 355579260, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.66015625, + "step": 16479, + "time_per_iteration": 2.383894920349121 + }, + { + "auxiliary_loss_clip": 0.01100107, + "auxiliary_loss_mlp": 0.01029723, + "balance_loss_clip": 1.01847529, + "balance_loss_mlp": 1.03430843, + "epoch": 0.9908312039681346, + "flos": 28621881918720.0, + "grad_norm": 1.5250465793653611, + "language_loss": 0.6613217, + "learning_rate": 8.760420364999355e-10, + "loss": 0.68261993, + "num_input_tokens_seen": 355599790, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.65625, + "step": 16480, + "time_per_iteration": 2.484576463699341 + }, + { + "auxiliary_loss_clip": 0.01096725, + "auxiliary_loss_mlp": 0.01027987, + "balance_loss_clip": 1.01623344, + "balance_loss_mlp": 1.03243065, + "epoch": 0.9908913272208026, + "flos": 35770654512000.0, + "grad_norm": 1.6952711839759178, + "language_loss": 0.72282261, + "learning_rate": 8.645539127374313e-10, + "loss": 0.7440697, + "num_input_tokens_seen": 355620925, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.640625, + "step": 16481, + "time_per_iteration": 2.59816837310791 + }, + { + "auxiliary_loss_clip": 0.01097429, + "auxiliary_loss_mlp": 0.01024732, + "balance_loss_clip": 1.0137589, + "balance_loss_mlp": 1.03415012, + "epoch": 0.9909514504734707, + "flos": 19902196944000.0, + "grad_norm": 1.6583039054588096, + "language_loss": 0.77450025, + "learning_rate": 8.531415963912713e-10, + "loss": 0.79572183, + "num_input_tokens_seen": 355639165, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 16482, + "time_per_iteration": 2.4760994911193848 + }, + { + "auxiliary_loss_clip": 0.01100414, + "auxiliary_loss_mlp": 0.01027222, + "balance_loss_clip": 1.01600456, + "balance_loss_mlp": 1.03429222, + "epoch": 0.9910115737261386, + "flos": 20004793165440.0, + "grad_norm": 1.9891098013725752, + "language_loss": 0.75464189, + "learning_rate": 8.418050878944427e-10, + "loss": 0.77591825, + "num_input_tokens_seen": 355657320, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6640625, + "step": 16483, + "time_per_iteration": 2.438983917236328 + }, + { + "auxiliary_loss_clip": 0.01021514, + "auxiliary_loss_mlp": 0.01001794, + "balance_loss_clip": 1.00084007, + "balance_loss_mlp": 1.00166059, + "epoch": 0.9910716969788066, + "flos": 70688432494080.0, + "grad_norm": 0.6785624181259259, + "language_loss": 0.5365091, + "learning_rate": 8.305443876768237e-10, + "loss": 0.55674213, + "num_input_tokens_seen": 355726370, + "router_z_loss_clip": 0.00952148, + "router_z_loss_mlp": 0.19921875, + "step": 16484, + "time_per_iteration": 3.1859169006347656 + }, + { + "auxiliary_loss_clip": 0.01095081, + "auxiliary_loss_mlp": 0.0102748, + "balance_loss_clip": 1.01633358, + "balance_loss_mlp": 1.03326261, + "epoch": 0.9911318202314745, + "flos": 21434038306560.0, + "grad_norm": 2.132839577495112, + "language_loss": 0.81778204, + "learning_rate": 8.19359496165184e-10, + "loss": 0.83900762, + "num_input_tokens_seen": 355745840, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.6171875, + "step": 16485, + "time_per_iteration": 2.5153956413269043 + }, + { + "auxiliary_loss_clip": 0.01098037, + "auxiliary_loss_mlp": 0.01031579, + "balance_loss_clip": 1.01986718, + "balance_loss_mlp": 1.0349462, + "epoch": 0.9911919434841425, + "flos": 19826820253440.0, + "grad_norm": 1.9487999720953917, + "language_loss": 0.81256086, + "learning_rate": 8.082504137836288e-10, + "loss": 0.833857, + "num_input_tokens_seen": 355763385, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6328125, + "step": 16486, + "time_per_iteration": 2.467226982116699 + }, + { + "auxiliary_loss_clip": 0.01099412, + "auxiliary_loss_mlp": 0.0102555, + "balance_loss_clip": 1.01433814, + "balance_loss_mlp": 1.03397942, + "epoch": 0.9912520667368104, + "flos": 41719364691840.0, + "grad_norm": 1.3991972197643134, + "language_loss": 0.65814865, + "learning_rate": 7.972171409538209e-10, + "loss": 0.6793983, + "num_input_tokens_seen": 355786075, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.65625, + "step": 16487, + "time_per_iteration": 2.687784433364868 + }, + { + "auxiliary_loss_clip": 0.01095741, + "auxiliary_loss_mlp": 0.01028615, + "balance_loss_clip": 1.01798725, + "balance_loss_mlp": 1.03274322, + "epoch": 0.9913121899894785, + "flos": 23769668263680.0, + "grad_norm": 1.9951977920658592, + "language_loss": 0.7668978, + "learning_rate": 7.862596780936481e-10, + "loss": 0.78814131, + "num_input_tokens_seen": 355806295, + "router_z_loss_clip": 0.10644531, + "router_z_loss_mlp": 0.6328125, + "step": 16488, + "time_per_iteration": 2.4734110832214355 + }, + { + "auxiliary_loss_clip": 0.01102628, + "auxiliary_loss_mlp": 0.01026587, + "balance_loss_clip": 1.01442766, + "balance_loss_mlp": 1.03430152, + "epoch": 0.9913723132421464, + "flos": 23769668263680.0, + "grad_norm": 3.1534533227338946, + "language_loss": 0.68729866, + "learning_rate": 7.753780256190001e-10, + "loss": 0.70859075, + "num_input_tokens_seen": 355825730, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 16489, + "time_per_iteration": 2.467500686645508 + }, + { + "auxiliary_loss_clip": 0.01021406, + "auxiliary_loss_mlp": 0.01001161, + "balance_loss_clip": 1.00017166, + "balance_loss_mlp": 1.0014714, + "epoch": 0.9914324364948144, + "flos": 71267419820160.0, + "grad_norm": 0.6086143053932209, + "language_loss": 0.5259285, + "learning_rate": 7.645721839424357e-10, + "loss": 0.54615414, + "num_input_tokens_seen": 355891545, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 16490, + "time_per_iteration": 3.135390520095825 + }, + { + "auxiliary_loss_clip": 0.0110438, + "auxiliary_loss_mlp": 0.01032782, + "balance_loss_clip": 1.02026534, + "balance_loss_mlp": 1.03562975, + "epoch": 0.9914925597474823, + "flos": 23695440808320.0, + "grad_norm": 2.374861870755257, + "language_loss": 0.75565469, + "learning_rate": 7.538421534734052e-10, + "loss": 0.7770263, + "num_input_tokens_seen": 355909920, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6875, + "step": 16491, + "time_per_iteration": 2.4634666442871094 + }, + { + "auxiliary_loss_clip": 0.01105664, + "auxiliary_loss_mlp": 0.01027723, + "balance_loss_clip": 1.01532578, + "balance_loss_mlp": 1.03756356, + "epoch": 0.9915526830001503, + "flos": 13433822749440.0, + "grad_norm": 2.19239694282831, + "language_loss": 0.69975454, + "learning_rate": 7.431879346191383e-10, + "loss": 0.72108841, + "num_input_tokens_seen": 355923130, + "router_z_loss_clip": 0.12402344, + "router_z_loss_mlp": 0.6796875, + "step": 16492, + "time_per_iteration": 2.4108922481536865 + }, + { + "auxiliary_loss_clip": 0.01098681, + "auxiliary_loss_mlp": 0.01029599, + "balance_loss_clip": 1.01737428, + "balance_loss_mlp": 1.03337646, + "epoch": 0.9916128062528182, + "flos": 20740962407040.0, + "grad_norm": 1.8422382567938989, + "language_loss": 0.68127316, + "learning_rate": 7.326095277837563e-10, + "loss": 0.70255595, + "num_input_tokens_seen": 355941960, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.65234375, + "step": 16493, + "time_per_iteration": 2.4719016551971436 + }, + { + "auxiliary_loss_clip": 0.0110235, + "auxiliary_loss_mlp": 0.0103173, + "balance_loss_clip": 1.01974964, + "balance_loss_mlp": 1.03438842, + "epoch": 0.9916729295054862, + "flos": 22487082353280.0, + "grad_norm": 1.6813995805543638, + "language_loss": 0.71178663, + "learning_rate": 7.221069333678276e-10, + "loss": 0.73312747, + "num_input_tokens_seen": 355961640, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.6796875, + "step": 16494, + "time_per_iteration": 2.480767250061035 + }, + { + "auxiliary_loss_clip": 0.01100167, + "auxiliary_loss_mlp": 0.01030677, + "balance_loss_clip": 1.01807642, + "balance_loss_mlp": 1.03380418, + "epoch": 0.9917330527581543, + "flos": 14792467708800.0, + "grad_norm": 2.011490694936815, + "language_loss": 0.67974186, + "learning_rate": 7.116801517701443e-10, + "loss": 0.70105028, + "num_input_tokens_seen": 355977980, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.6640625, + "step": 16495, + "time_per_iteration": 2.40712308883667 + }, + { + "auxiliary_loss_clip": 0.01021311, + "auxiliary_loss_mlp": 0.01000436, + "balance_loss_clip": 0.99946463, + "balance_loss_mlp": 1.00145388, + "epoch": 0.9917931760108222, + "flos": 59191595585280.0, + "grad_norm": 0.7222922542161719, + "language_loss": 0.53426856, + "learning_rate": 7.013291833859458e-10, + "loss": 0.55448598, + "num_input_tokens_seen": 356042900, + "router_z_loss_clip": 0.00970459, + "router_z_loss_mlp": 0.19921875, + "step": 16496, + "time_per_iteration": 3.1515696048736572 + }, + { + "auxiliary_loss_clip": 0.01100625, + "auxiliary_loss_mlp": 0.01029087, + "balance_loss_clip": 1.01697552, + "balance_loss_mlp": 1.03456199, + "epoch": 0.9918532992634902, + "flos": 26761637485440.0, + "grad_norm": 1.5120941987850633, + "language_loss": 0.71478045, + "learning_rate": 6.91054028607585e-10, + "loss": 0.73607767, + "num_input_tokens_seen": 356063000, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6640625, + "step": 16497, + "time_per_iteration": 2.478241205215454 + }, + { + "auxiliary_loss_clip": 0.01102575, + "auxiliary_loss_mlp": 0.01031936, + "balance_loss_clip": 1.01924014, + "balance_loss_mlp": 1.03378558, + "epoch": 0.9919134225161581, + "flos": 14975719920000.0, + "grad_norm": 3.1482345724475196, + "language_loss": 0.82058042, + "learning_rate": 6.808546878249721e-10, + "loss": 0.8419255, + "num_input_tokens_seen": 356078130, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6875, + "step": 16498, + "time_per_iteration": 2.4786031246185303 + }, + { + "auxiliary_loss_clip": 0.01101575, + "auxiliary_loss_mlp": 0.01034974, + "balance_loss_clip": 1.02315426, + "balance_loss_mlp": 1.03460526, + "epoch": 0.9919735457688261, + "flos": 27818201064960.0, + "grad_norm": 1.560350707366415, + "language_loss": 0.68127578, + "learning_rate": 6.707311614246869e-10, + "loss": 0.70264125, + "num_input_tokens_seen": 356101655, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 16499, + "time_per_iteration": 2.5106289386749268 + }, + { + "auxiliary_loss_clip": 0.01102115, + "auxiliary_loss_mlp": 0.01027117, + "balance_loss_clip": 1.015625, + "balance_loss_mlp": 1.03552794, + "epoch": 0.992033669021494, + "flos": 22562782266240.0, + "grad_norm": 2.0283619276595632, + "language_loss": 0.82292485, + "learning_rate": 6.606834497904223e-10, + "loss": 0.84421718, + "num_input_tokens_seen": 356121425, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6640625, + "step": 16500, + "time_per_iteration": 2.448037624359131 + }, + { + "auxiliary_loss_clip": 0.01102063, + "auxiliary_loss_mlp": 0.0102921, + "balance_loss_clip": 1.01717544, + "balance_loss_mlp": 1.03475332, + "epoch": 0.9920937922741621, + "flos": 25374587846400.0, + "grad_norm": 1.8118228269565941, + "language_loss": 0.81654167, + "learning_rate": 6.507115533036511e-10, + "loss": 0.83785439, + "num_input_tokens_seen": 356140710, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.671875, + "step": 16501, + "time_per_iteration": 2.4804978370666504 + }, + { + "auxiliary_loss_clip": 0.0109966, + "auxiliary_loss_mlp": 0.0102834, + "balance_loss_clip": 1.01602578, + "balance_loss_mlp": 1.03350425, + "epoch": 0.99215391552683, + "flos": 22054466949120.0, + "grad_norm": 1.8447298881035472, + "language_loss": 0.77077162, + "learning_rate": 6.408154723420711e-10, + "loss": 0.79205161, + "num_input_tokens_seen": 356159835, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.66015625, + "step": 16502, + "time_per_iteration": 2.4736790657043457 + }, + { + "auxiliary_loss_clip": 0.01100851, + "auxiliary_loss_mlp": 0.01030936, + "balance_loss_clip": 1.01821661, + "balance_loss_mlp": 1.03371501, + "epoch": 0.992214038779498, + "flos": 15413937845760.0, + "grad_norm": 2.5520061200092914, + "language_loss": 0.71432996, + "learning_rate": 6.309952072811597e-10, + "loss": 0.73564786, + "num_input_tokens_seen": 356177555, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.671875, + "step": 16503, + "time_per_iteration": 2.425827980041504 + }, + { + "auxiliary_loss_clip": 0.01021585, + "auxiliary_loss_mlp": 0.01000758, + "balance_loss_clip": 0.99979258, + "balance_loss_mlp": 1.00154912, + "epoch": 0.9922741620321659, + "flos": 62014498467840.0, + "grad_norm": 0.6296146740963268, + "language_loss": 0.55068082, + "learning_rate": 6.212507584932858e-10, + "loss": 0.57090425, + "num_input_tokens_seen": 356244975, + "router_z_loss_clip": 0.00964355, + "router_z_loss_mlp": 0.20117188, + "step": 16504, + "time_per_iteration": 3.140615940093994 + }, + { + "auxiliary_loss_clip": 0.01096978, + "auxiliary_loss_mlp": 0.01022605, + "balance_loss_clip": 1.01209641, + "balance_loss_mlp": 1.03265607, + "epoch": 0.9923342852848339, + "flos": 17165480745600.0, + "grad_norm": 1.8235699601231674, + "language_loss": 0.69573104, + "learning_rate": 6.115821263481536e-10, + "loss": 0.71692687, + "num_input_tokens_seen": 356262605, + "router_z_loss_clip": 0.10498047, + "router_z_loss_mlp": 0.64453125, + "step": 16505, + "time_per_iteration": 2.4154937267303467 + }, + { + "auxiliary_loss_clip": 0.0110206, + "auxiliary_loss_mlp": 0.01030605, + "balance_loss_clip": 1.01768327, + "balance_loss_mlp": 1.03386188, + "epoch": 0.9923944085375018, + "flos": 23183210908800.0, + "grad_norm": 2.1541396755304576, + "language_loss": 0.65518022, + "learning_rate": 6.019893112119146e-10, + "loss": 0.67650688, + "num_input_tokens_seen": 356278935, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.68359375, + "step": 16506, + "time_per_iteration": 2.4460935592651367 + }, + { + "auxiliary_loss_clip": 0.01097659, + "auxiliary_loss_mlp": 0.01025784, + "balance_loss_clip": 1.01382756, + "balance_loss_mlp": 1.03309059, + "epoch": 0.9924545317901698, + "flos": 20813861059200.0, + "grad_norm": 1.8368921898892858, + "language_loss": 0.62782621, + "learning_rate": 5.924723134487219e-10, + "loss": 0.64906067, + "num_input_tokens_seen": 356295675, + "router_z_loss_clip": 0.11962891, + "router_z_loss_mlp": 0.64453125, + "step": 16507, + "time_per_iteration": 2.442676544189453 + }, + { + "auxiliary_loss_clip": 0.01100781, + "auxiliary_loss_mlp": 0.01032877, + "balance_loss_clip": 1.02104521, + "balance_loss_mlp": 1.03471947, + "epoch": 0.9925146550428379, + "flos": 20083437993600.0, + "grad_norm": 2.6591700136294723, + "language_loss": 0.72890103, + "learning_rate": 5.830311334193983e-10, + "loss": 0.75023758, + "num_input_tokens_seen": 356312885, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.66015625, + "step": 16508, + "time_per_iteration": 2.443885564804077 + }, + { + "auxiliary_loss_clip": 0.01099478, + "auxiliary_loss_mlp": 0.01029116, + "balance_loss_clip": 1.01685596, + "balance_loss_mlp": 1.03313184, + "epoch": 0.9925747782955058, + "flos": 24973717086720.0, + "grad_norm": 1.56405746018773, + "language_loss": 0.70219529, + "learning_rate": 5.736657714818793e-10, + "loss": 0.7234813, + "num_input_tokens_seen": 356334070, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.6640625, + "step": 16509, + "time_per_iteration": 2.4716854095458984 + }, + { + "auxiliary_loss_clip": 0.01099172, + "auxiliary_loss_mlp": 0.01031187, + "balance_loss_clip": 1.01902127, + "balance_loss_mlp": 1.03302801, + "epoch": 0.9926349015481738, + "flos": 60472526492160.0, + "grad_norm": 1.5891444400263024, + "language_loss": 0.68136442, + "learning_rate": 5.643762279912146e-10, + "loss": 0.70266795, + "num_input_tokens_seen": 356359410, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 16510, + "time_per_iteration": 2.8066964149475098 + }, + { + "auxiliary_loss_clip": 0.01101235, + "auxiliary_loss_mlp": 0.01032713, + "balance_loss_clip": 1.02104878, + "balance_loss_mlp": 1.03479171, + "epoch": 0.9926950248008417, + "flos": 20741716592640.0, + "grad_norm": 2.3909482824040054, + "language_loss": 0.81199002, + "learning_rate": 5.551625032997886e-10, + "loss": 0.83332956, + "num_input_tokens_seen": 356378345, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 16511, + "time_per_iteration": 2.4441933631896973 + }, + { + "auxiliary_loss_clip": 0.01096436, + "auxiliary_loss_mlp": 0.01027142, + "balance_loss_clip": 1.01612711, + "balance_loss_mlp": 1.03253233, + "epoch": 0.9927551480535097, + "flos": 24352965221760.0, + "grad_norm": 1.7454834669895913, + "language_loss": 0.91386062, + "learning_rate": 5.460245977570998e-10, + "loss": 0.93509638, + "num_input_tokens_seen": 356397345, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.640625, + "step": 16512, + "time_per_iteration": 2.5027287006378174 + }, + { + "auxiliary_loss_clip": 0.01021781, + "auxiliary_loss_mlp": 0.00998781, + "balance_loss_clip": 0.99777997, + "balance_loss_mlp": 1.00178456, + "epoch": 0.9928152713061776, + "flos": 71275572207360.0, + "grad_norm": 0.6952001936871817, + "language_loss": 0.55215639, + "learning_rate": 5.369625117095378e-10, + "loss": 0.57236201, + "num_input_tokens_seen": 356459160, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20019531, + "step": 16513, + "time_per_iteration": 4.509139776229858 + }, + { + "auxiliary_loss_clip": 0.01098001, + "auxiliary_loss_mlp": 0.01028915, + "balance_loss_clip": 1.01759601, + "balance_loss_mlp": 1.03360546, + "epoch": 0.9928753945588457, + "flos": 57809499045120.0, + "grad_norm": 1.6679269237242005, + "language_loss": 0.65108931, + "learning_rate": 5.279762455006054e-10, + "loss": 0.67235851, + "num_input_tokens_seen": 356486405, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.64453125, + "step": 16514, + "time_per_iteration": 4.345771789550781 + }, + { + "auxiliary_loss_clip": 0.01101706, + "auxiliary_loss_mlp": 0.01028023, + "balance_loss_clip": 1.01589918, + "balance_loss_mlp": 1.03492641, + "epoch": 0.9929355178115136, + "flos": 19568981450880.0, + "grad_norm": 1.9155425038175011, + "language_loss": 0.73504049, + "learning_rate": 5.190657994713632e-10, + "loss": 0.75633776, + "num_input_tokens_seen": 356502905, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.66796875, + "step": 16515, + "time_per_iteration": 2.4671685695648193 + }, + { + "auxiliary_loss_clip": 0.01100338, + "auxiliary_loss_mlp": 0.01027438, + "balance_loss_clip": 1.01605916, + "balance_loss_mlp": 1.03494489, + "epoch": 0.9929956410641816, + "flos": 22964658606720.0, + "grad_norm": 1.5036430835752834, + "language_loss": 0.77072322, + "learning_rate": 5.102311739593191e-10, + "loss": 0.79200089, + "num_input_tokens_seen": 356523830, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65625, + "step": 16516, + "time_per_iteration": 2.5214271545410156 + }, + { + "auxiliary_loss_clip": 0.01096027, + "auxiliary_loss_mlp": 0.01028354, + "balance_loss_clip": 1.01744592, + "balance_loss_mlp": 1.03210688, + "epoch": 0.9930557643168495, + "flos": 22566409539840.0, + "grad_norm": 1.5405677122582522, + "language_loss": 0.78079957, + "learning_rate": 5.014723692997602e-10, + "loss": 0.80204338, + "num_input_tokens_seen": 356543965, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.640625, + "step": 16517, + "time_per_iteration": 2.4683997631073 + }, + { + "auxiliary_loss_clip": 0.01104403, + "auxiliary_loss_mlp": 0.01037293, + "balance_loss_clip": 1.02406716, + "balance_loss_mlp": 1.03569436, + "epoch": 0.9931158875695175, + "flos": 17201032231680.0, + "grad_norm": 2.467869272528166, + "language_loss": 0.67826927, + "learning_rate": 4.927893858248655e-10, + "loss": 0.69968623, + "num_input_tokens_seen": 356561530, + "router_z_loss_clip": 0.1328125, + "router_z_loss_mlp": 0.6875, + "step": 16518, + "time_per_iteration": 2.5322189331054688 + }, + { + "auxiliary_loss_clip": 0.01021417, + "auxiliary_loss_mlp": 0.01004028, + "balance_loss_clip": 1.00303864, + "balance_loss_mlp": 1.00145912, + "epoch": 0.9931760108221854, + "flos": 63711204278400.0, + "grad_norm": 0.7302871663277747, + "language_loss": 0.5342353, + "learning_rate": 4.84182223863483e-10, + "loss": 0.55448973, + "num_input_tokens_seen": 356616845, + "router_z_loss_clip": 0.0098877, + "router_z_loss_mlp": 0.19921875, + "step": 16519, + "time_per_iteration": 2.9954869747161865 + }, + { + "auxiliary_loss_clip": 0.01098347, + "auxiliary_loss_mlp": 0.01027735, + "balance_loss_clip": 1.01674438, + "balance_loss_mlp": 1.03470135, + "epoch": 0.9932361340748534, + "flos": 15304805349120.0, + "grad_norm": 1.7167044262737383, + "language_loss": 0.59850049, + "learning_rate": 4.756508837426842e-10, + "loss": 0.61976135, + "num_input_tokens_seen": 356633560, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6328125, + "step": 16520, + "time_per_iteration": 4.069928884506226 + }, + { + "auxiliary_loss_clip": 0.01100252, + "auxiliary_loss_mlp": 0.01030874, + "balance_loss_clip": 1.01937616, + "balance_loss_mlp": 1.03505707, + "epoch": 0.9932962573275215, + "flos": 36064906727040.0, + "grad_norm": 1.6631510235112372, + "language_loss": 0.61730212, + "learning_rate": 4.671953657853223e-10, + "loss": 0.63861334, + "num_input_tokens_seen": 356657600, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.65234375, + "step": 16521, + "time_per_iteration": 2.5684220790863037 + }, + { + "auxiliary_loss_clip": 0.01103581, + "auxiliary_loss_mlp": 0.01035571, + "balance_loss_clip": 1.02342343, + "balance_loss_mlp": 1.03605843, + "epoch": 0.9933563805801894, + "flos": 21470523546240.0, + "grad_norm": 1.6476989131279343, + "language_loss": 0.74009991, + "learning_rate": 4.5881567031225145e-10, + "loss": 0.76149142, + "num_input_tokens_seen": 356675880, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 16522, + "time_per_iteration": 2.436829090118408 + }, + { + "auxiliary_loss_clip": 0.01098481, + "auxiliary_loss_mlp": 0.01028712, + "balance_loss_clip": 1.01750588, + "balance_loss_mlp": 1.03452504, + "epoch": 0.9934165038328574, + "flos": 23986532626560.0, + "grad_norm": 1.4777823452126528, + "language_loss": 0.7283901, + "learning_rate": 4.5051179764143964e-10, + "loss": 0.74966204, + "num_input_tokens_seen": 356696000, + "router_z_loss_clip": 0.11181641, + "router_z_loss_mlp": 0.640625, + "step": 16523, + "time_per_iteration": 2.479238748550415 + }, + { + "auxiliary_loss_clip": 0.01097479, + "auxiliary_loss_mlp": 0.01030754, + "balance_loss_clip": 1.01968598, + "balance_loss_mlp": 1.0324173, + "epoch": 0.9934766270855253, + "flos": 21907807718400.0, + "grad_norm": 1.53854867890714, + "language_loss": 0.70717901, + "learning_rate": 4.422837480875241e-10, + "loss": 0.72846133, + "num_input_tokens_seen": 356716845, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.65234375, + "step": 16524, + "time_per_iteration": 2.444234609603882 + }, + { + "auxiliary_loss_clip": 0.01099088, + "auxiliary_loss_mlp": 0.01030783, + "balance_loss_clip": 1.01931524, + "balance_loss_mlp": 1.03416014, + "epoch": 0.9935367503381933, + "flos": 17129139160320.0, + "grad_norm": 1.790482995534708, + "language_loss": 0.79615587, + "learning_rate": 4.341315219624775e-10, + "loss": 0.81745458, + "num_input_tokens_seen": 356732100, + "router_z_loss_clip": 0.11474609, + "router_z_loss_mlp": 0.6484375, + "step": 16525, + "time_per_iteration": 2.4063704013824463 + }, + { + "auxiliary_loss_clip": 0.01098221, + "auxiliary_loss_mlp": 0.0102579, + "balance_loss_clip": 1.01411915, + "balance_loss_mlp": 1.03453732, + "epoch": 0.9935968735908612, + "flos": 22346241125760.0, + "grad_norm": 1.733872474173661, + "language_loss": 0.74672413, + "learning_rate": 4.2605511957582995e-10, + "loss": 0.76796424, + "num_input_tokens_seen": 356751480, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.63671875, + "step": 16526, + "time_per_iteration": 2.438570499420166 + }, + { + "auxiliary_loss_clip": 0.01095722, + "auxiliary_loss_mlp": 0.01027199, + "balance_loss_clip": 1.01614881, + "balance_loss_mlp": 1.03288567, + "epoch": 0.9936569968435293, + "flos": 29460539640960.0, + "grad_norm": 2.7286855441513405, + "language_loss": 0.72363502, + "learning_rate": 4.180545412333369e-10, + "loss": 0.74486423, + "num_input_tokens_seen": 356772650, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.625, + "step": 16527, + "time_per_iteration": 2.5294551849365234 + }, + { + "auxiliary_loss_clip": 0.01099494, + "auxiliary_loss_mlp": 0.01029487, + "balance_loss_clip": 1.01776278, + "balance_loss_mlp": 1.03302014, + "epoch": 0.9937171200961972, + "flos": 16544046522240.0, + "grad_norm": 2.11286094081821, + "language_loss": 0.76350486, + "learning_rate": 4.1012978723875547e-10, + "loss": 0.78479469, + "num_input_tokens_seen": 356788510, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6640625, + "step": 16528, + "time_per_iteration": 2.3937737941741943 + }, + { + "auxiliary_loss_clip": 0.01100352, + "auxiliary_loss_mlp": 0.01029172, + "balance_loss_clip": 1.01672101, + "balance_loss_mlp": 1.03344357, + "epoch": 0.9937772433488652, + "flos": 24390276474240.0, + "grad_norm": 2.2799255140164227, + "language_loss": 0.66841036, + "learning_rate": 4.022808578922898e-10, + "loss": 0.68970561, + "num_input_tokens_seen": 356809115, + "router_z_loss_clip": 0.12451172, + "router_z_loss_mlp": 0.671875, + "step": 16529, + "time_per_iteration": 2.4863250255584717 + }, + { + "auxiliary_loss_clip": 0.01104753, + "auxiliary_loss_mlp": 0.01031521, + "balance_loss_clip": 1.01808, + "balance_loss_mlp": 1.03608668, + "epoch": 0.9938373666015331, + "flos": 15669909141120.0, + "grad_norm": 2.9215652266283447, + "language_loss": 0.65546691, + "learning_rate": 3.9450775349170186e-10, + "loss": 0.6768297, + "num_input_tokens_seen": 356826410, + "router_z_loss_clip": 0.13476562, + "router_z_loss_mlp": 0.6875, + "step": 16530, + "time_per_iteration": 2.4260799884796143 + }, + { + "auxiliary_loss_clip": 0.01101104, + "auxiliary_loss_mlp": 0.01028039, + "balance_loss_clip": 1.01732826, + "balance_loss_mlp": 1.03536391, + "epoch": 0.9938974898542011, + "flos": 19496190539520.0, + "grad_norm": 2.3806943394415585, + "language_loss": 0.71338522, + "learning_rate": 3.8681047433186676e-10, + "loss": 0.73467672, + "num_input_tokens_seen": 356844990, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.65625, + "step": 16531, + "time_per_iteration": 2.4470114707946777 + }, + { + "auxiliary_loss_clip": 0.01102468, + "auxiliary_loss_mlp": 0.01028393, + "balance_loss_clip": 1.01645386, + "balance_loss_mlp": 1.03573895, + "epoch": 0.993957613106869, + "flos": 26906896085760.0, + "grad_norm": 1.6847640127704915, + "language_loss": 0.74276376, + "learning_rate": 3.791890207045512e-10, + "loss": 0.7640723, + "num_input_tokens_seen": 356866530, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.66796875, + "step": 16532, + "time_per_iteration": 2.4952268600463867 + }, + { + "auxiliary_loss_clip": 0.01093194, + "auxiliary_loss_mlp": 0.01028665, + "balance_loss_clip": 1.01831806, + "balance_loss_mlp": 1.03290677, + "epoch": 0.994017736359537, + "flos": 14939593816320.0, + "grad_norm": 1.6183140191849457, + "language_loss": 0.70227963, + "learning_rate": 3.7164339289885717e-10, + "loss": 0.72349823, + "num_input_tokens_seen": 356884660, + "router_z_loss_clip": 0.10351562, + "router_z_loss_mlp": 0.6015625, + "step": 16533, + "time_per_iteration": 2.4223129749298096 + }, + { + "auxiliary_loss_clip": 0.01100959, + "auxiliary_loss_mlp": 0.01028446, + "balance_loss_clip": 1.01636457, + "balance_loss_mlp": 1.03366101, + "epoch": 0.9940778596122051, + "flos": 15377883569280.0, + "grad_norm": 2.495911763822692, + "language_loss": 0.84326804, + "learning_rate": 3.641735912007782e-10, + "loss": 0.86456203, + "num_input_tokens_seen": 356900895, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.671875, + "step": 16534, + "time_per_iteration": 2.421475410461426 + }, + { + "auxiliary_loss_clip": 0.01093977, + "auxiliary_loss_mlp": 0.01026367, + "balance_loss_clip": 1.01544189, + "balance_loss_mlp": 1.03271604, + "epoch": 0.994137982864873, + "flos": 25228108183680.0, + "grad_norm": 1.3811361665058717, + "language_loss": 0.65835977, + "learning_rate": 3.567796158934211e-10, + "loss": 0.67956328, + "num_input_tokens_seen": 356920985, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.61328125, + "step": 16535, + "time_per_iteration": 2.4900805950164795 + }, + { + "auxiliary_loss_clip": 0.01098474, + "auxiliary_loss_mlp": 0.01026792, + "balance_loss_clip": 1.01655209, + "balance_loss_mlp": 1.03532779, + "epoch": 0.994198106117541, + "flos": 18442140912000.0, + "grad_norm": 1.5464040694380152, + "language_loss": 0.64858508, + "learning_rate": 3.4946146725767235e-10, + "loss": 0.66983771, + "num_input_tokens_seen": 356939800, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6328125, + "step": 16536, + "time_per_iteration": 2.4714372158050537 + }, + { + "auxiliary_loss_clip": 0.01097217, + "auxiliary_loss_mlp": 0.01027643, + "balance_loss_clip": 1.01602042, + "balance_loss_mlp": 1.03327465, + "epoch": 0.9942582293702089, + "flos": 16654112772480.0, + "grad_norm": 2.6471187803341554, + "language_loss": 0.78560811, + "learning_rate": 3.4221914557064357e-10, + "loss": 0.80685669, + "num_input_tokens_seen": 356957780, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.640625, + "step": 16537, + "time_per_iteration": 2.4144296646118164 + }, + { + "auxiliary_loss_clip": 0.01104051, + "auxiliary_loss_mlp": 0.01030261, + "balance_loss_clip": 1.0181433, + "balance_loss_mlp": 1.03436911, + "epoch": 0.9943183526228769, + "flos": 21944580266880.0, + "grad_norm": 1.5433486041621642, + "language_loss": 0.68369782, + "learning_rate": 3.35052651107004e-10, + "loss": 0.70504093, + "num_input_tokens_seen": 356979185, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.6953125, + "step": 16538, + "time_per_iteration": 2.524678945541382 + }, + { + "auxiliary_loss_clip": 0.01094358, + "auxiliary_loss_mlp": 0.01029791, + "balance_loss_clip": 1.01909792, + "balance_loss_mlp": 1.03162956, + "epoch": 0.9943784758755448, + "flos": 23842566915840.0, + "grad_norm": 1.8615400061160121, + "language_loss": 0.75088692, + "learning_rate": 3.2796198413853614e-10, + "loss": 0.7721284, + "num_input_tokens_seen": 356997735, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.625, + "step": 16539, + "time_per_iteration": 2.646847724914551 + }, + { + "auxiliary_loss_clip": 0.01099417, + "auxiliary_loss_mlp": 0.01031493, + "balance_loss_clip": 1.01966715, + "balance_loss_mlp": 1.03417706, + "epoch": 0.9944385991282129, + "flos": 21469984842240.0, + "grad_norm": 2.7147810890236146, + "language_loss": 0.70484149, + "learning_rate": 3.209471449341361e-10, + "loss": 0.72615063, + "num_input_tokens_seen": 357015660, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.65234375, + "step": 16540, + "time_per_iteration": 2.4886109828948975 + }, + { + "auxiliary_loss_clip": 0.0109585, + "auxiliary_loss_mlp": 0.0102303, + "balance_loss_clip": 1.01286149, + "balance_loss_mlp": 1.03206122, + "epoch": 0.9944987223808808, + "flos": 22927024131840.0, + "grad_norm": 2.939680166237685, + "language_loss": 0.75353402, + "learning_rate": 3.140081337600353e-10, + "loss": 0.77472281, + "num_input_tokens_seen": 357034800, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.63671875, + "step": 16541, + "time_per_iteration": 2.484328508377075 + }, + { + "auxiliary_loss_clip": 0.01098166, + "auxiliary_loss_mlp": 0.01034178, + "balance_loss_clip": 1.02246594, + "balance_loss_mlp": 1.03233027, + "epoch": 0.9945588456335488, + "flos": 22383013674240.0, + "grad_norm": 1.7114515319062655, + "language_loss": 0.76576352, + "learning_rate": 3.0714495087891255e-10, + "loss": 0.78708696, + "num_input_tokens_seen": 357053785, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.65625, + "step": 16542, + "time_per_iteration": 2.5461788177490234 + }, + { + "auxiliary_loss_clip": 0.01102537, + "auxiliary_loss_mlp": 0.01027632, + "balance_loss_clip": 1.01534152, + "balance_loss_mlp": 1.03482652, + "epoch": 0.9946189688862167, + "flos": 21397517153280.0, + "grad_norm": 2.0429647911980595, + "language_loss": 0.74317372, + "learning_rate": 3.0035759655122615e-10, + "loss": 0.76447541, + "num_input_tokens_seen": 357072025, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.6796875, + "step": 16543, + "time_per_iteration": 2.449420690536499 + }, + { + "auxiliary_loss_clip": 0.01101837, + "auxiliary_loss_mlp": 0.01028271, + "balance_loss_clip": 1.01610529, + "balance_loss_mlp": 1.03407598, + "epoch": 0.9946790921388847, + "flos": 12416545670400.0, + "grad_norm": 2.3037711031230663, + "language_loss": 0.81437778, + "learning_rate": 2.9364607103454785e-10, + "loss": 0.83567894, + "num_input_tokens_seen": 357086960, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6796875, + "step": 16544, + "time_per_iteration": 2.399273157119751 + }, + { + "auxiliary_loss_clip": 0.01097832, + "auxiliary_loss_mlp": 0.01028072, + "balance_loss_clip": 1.0168128, + "balance_loss_mlp": 1.03306675, + "epoch": 0.9947392153915526, + "flos": 19058295836160.0, + "grad_norm": 1.8738786872300168, + "language_loss": 0.78694546, + "learning_rate": 2.870103745831187e-10, + "loss": 0.80820447, + "num_input_tokens_seen": 357105095, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6484375, + "step": 16545, + "time_per_iteration": 2.5227584838867188 + }, + { + "auxiliary_loss_clip": 0.01102736, + "auxiliary_loss_mlp": 0.0102799, + "balance_loss_clip": 1.01636696, + "balance_loss_mlp": 1.03555512, + "epoch": 0.9947993386442207, + "flos": 27308808339840.0, + "grad_norm": 1.6947418455255971, + "language_loss": 0.72397494, + "learning_rate": 2.8045050744873733e-10, + "loss": 0.74528217, + "num_input_tokens_seen": 357125065, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.671875, + "step": 16546, + "time_per_iteration": 2.496741533279419 + }, + { + "auxiliary_loss_clip": 0.01096724, + "auxiliary_loss_mlp": 0.01031903, + "balance_loss_clip": 1.02099562, + "balance_loss_mlp": 1.03320479, + "epoch": 0.9948594618968887, + "flos": 20806498771200.0, + "grad_norm": 1.8069969669289252, + "language_loss": 0.77381766, + "learning_rate": 2.739664698798716e-10, + "loss": 0.79510397, + "num_input_tokens_seen": 357141600, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.63671875, + "step": 16547, + "time_per_iteration": 2.4704644680023193 + }, + { + "auxiliary_loss_clip": 0.01099595, + "auxiliary_loss_mlp": 0.01028365, + "balance_loss_clip": 1.01754653, + "balance_loss_mlp": 1.0343287, + "epoch": 0.9949195851495566, + "flos": 23292953936640.0, + "grad_norm": 2.307915611679892, + "language_loss": 0.69766366, + "learning_rate": 2.67558262122769e-10, + "loss": 0.71894336, + "num_input_tokens_seen": 357157880, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.65234375, + "step": 16548, + "time_per_iteration": 2.475226879119873 + }, + { + "auxiliary_loss_clip": 0.01099971, + "auxiliary_loss_mlp": 0.01028963, + "balance_loss_clip": 1.01779294, + "balance_loss_mlp": 1.03472638, + "epoch": 0.9949797084022246, + "flos": 18515470527360.0, + "grad_norm": 1.796397865727554, + "language_loss": 0.75069898, + "learning_rate": 2.6122588442012427e-10, + "loss": 0.77198833, + "num_input_tokens_seen": 357176705, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65234375, + "step": 16549, + "time_per_iteration": 2.4197475910186768 + }, + { + "auxiliary_loss_clip": 0.01102879, + "auxiliary_loss_mlp": 0.01028852, + "balance_loss_clip": 1.0167048, + "balance_loss_mlp": 1.03556645, + "epoch": 0.9950398316548925, + "flos": 30407719328640.0, + "grad_norm": 1.530797233853168, + "language_loss": 0.74324614, + "learning_rate": 2.5496933701241177e-10, + "loss": 0.76456344, + "num_input_tokens_seen": 357197630, + "router_z_loss_clip": 0.12109375, + "router_z_loss_mlp": 0.67578125, + "step": 16550, + "time_per_iteration": 2.498002529144287 + }, + { + "auxiliary_loss_clip": 0.0109823, + "auxiliary_loss_mlp": 0.01025093, + "balance_loss_clip": 1.01437664, + "balance_loss_mlp": 1.0334301, + "epoch": 0.9950999549075605, + "flos": 19900868140800.0, + "grad_norm": 1.8793121441998941, + "language_loss": 0.77961928, + "learning_rate": 2.4878862013655297e-10, + "loss": 0.80085254, + "num_input_tokens_seen": 357215445, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6484375, + "step": 16551, + "time_per_iteration": 2.436533212661743 + }, + { + "auxiliary_loss_clip": 0.01093554, + "auxiliary_loss_mlp": 0.01028132, + "balance_loss_clip": 1.01789141, + "balance_loss_mlp": 1.03328931, + "epoch": 0.9951600781602284, + "flos": 17603555016960.0, + "grad_norm": 1.3682339429908787, + "language_loss": 0.6663608, + "learning_rate": 2.426837340270271e-10, + "loss": 0.68757761, + "num_input_tokens_seen": 357234285, + "router_z_loss_clip": 0.10253906, + "router_z_loss_mlp": 0.6015625, + "step": 16552, + "time_per_iteration": 2.432891368865967 + }, + { + "auxiliary_loss_clip": 0.01101166, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.0161804, + "balance_loss_mlp": 1.0346899, + "epoch": 0.9952202014128965, + "flos": 28950715952640.0, + "grad_norm": 1.3823867523664939, + "language_loss": 0.81442159, + "learning_rate": 2.3665467891520465e-10, + "loss": 0.835711, + "num_input_tokens_seen": 357257565, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.6640625, + "step": 16553, + "time_per_iteration": 2.5027167797088623 + }, + { + "auxiliary_loss_clip": 0.0102153, + "auxiliary_loss_mlp": 0.01000995, + "balance_loss_clip": 1.0, + "balance_loss_mlp": 1.00160635, + "epoch": 0.9952803246655644, + "flos": 70810386145920.0, + "grad_norm": 0.72272292860588, + "language_loss": 0.57358015, + "learning_rate": 2.3070145503001348e-10, + "loss": 0.59380531, + "num_input_tokens_seen": 357320205, + "router_z_loss_clip": 0.00994873, + "router_z_loss_mlp": 0.19921875, + "step": 16554, + "time_per_iteration": 4.518311500549316 + }, + { + "auxiliary_loss_clip": 0.01099816, + "auxiliary_loss_mlp": 0.01027983, + "balance_loss_clip": 1.01674795, + "balance_loss_mlp": 1.0338006, + "epoch": 0.9953404479182324, + "flos": 21799070271360.0, + "grad_norm": 1.9148999032298457, + "language_loss": 0.76987743, + "learning_rate": 2.24824062597051e-10, + "loss": 0.79115546, + "num_input_tokens_seen": 357340695, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.66015625, + "step": 16555, + "time_per_iteration": 2.476464033126831 + }, + { + "auxiliary_loss_clip": 0.01098218, + "auxiliary_loss_mlp": 0.01029639, + "balance_loss_clip": 1.0180037, + "balance_loss_mlp": 1.0328294, + "epoch": 0.9954005711709003, + "flos": 21937397546880.0, + "grad_norm": 1.7456669794456254, + "language_loss": 0.85952592, + "learning_rate": 2.1902250183902793e-10, + "loss": 0.88080448, + "num_input_tokens_seen": 357357505, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65625, + "step": 16556, + "time_per_iteration": 5.243689060211182 + }, + { + "auxiliary_loss_clip": 0.01097284, + "auxiliary_loss_mlp": 0.01029195, + "balance_loss_clip": 1.01753592, + "balance_loss_mlp": 1.03397655, + "epoch": 0.9954606944235683, + "flos": 19354559212800.0, + "grad_norm": 1.9763730454405837, + "language_loss": 0.73122305, + "learning_rate": 2.132967729762125e-10, + "loss": 0.7524879, + "num_input_tokens_seen": 357375395, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6328125, + "step": 16557, + "time_per_iteration": 2.430027484893799 + }, + { + "auxiliary_loss_clip": 0.01098502, + "auxiliary_loss_mlp": 0.01030148, + "balance_loss_clip": 1.01929998, + "balance_loss_mlp": 1.03515077, + "epoch": 0.9955208176762362, + "flos": 30518611591680.0, + "grad_norm": 1.817461567879454, + "language_loss": 0.76426727, + "learning_rate": 2.0764687622554233e-10, + "loss": 0.78555375, + "num_input_tokens_seen": 357397375, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.6328125, + "step": 16558, + "time_per_iteration": 2.5219368934631348 + }, + { + "auxiliary_loss_clip": 0.01099064, + "auxiliary_loss_mlp": 0.01031328, + "balance_loss_clip": 1.01932991, + "balance_loss_mlp": 1.03307915, + "epoch": 0.9955809409289043, + "flos": 30008249199360.0, + "grad_norm": 2.2312910836598854, + "language_loss": 0.63569021, + "learning_rate": 2.0207281180129044e-10, + "loss": 0.6569941, + "num_input_tokens_seen": 357418880, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.66015625, + "step": 16559, + "time_per_iteration": 2.5117738246917725 + }, + { + "auxiliary_loss_clip": 0.01097276, + "auxiliary_loss_mlp": 0.01024759, + "balance_loss_clip": 1.01383388, + "balance_loss_mlp": 1.03360701, + "epoch": 0.9956410641815723, + "flos": 21543278544000.0, + "grad_norm": 1.7426870102822973, + "language_loss": 0.73885131, + "learning_rate": 1.965745799148433e-10, + "loss": 0.76007164, + "num_input_tokens_seen": 357438310, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.63671875, + "step": 16560, + "time_per_iteration": 2.4488108158111572 + }, + { + "auxiliary_loss_clip": 0.0109778, + "auxiliary_loss_mlp": 0.01028515, + "balance_loss_clip": 1.01738095, + "balance_loss_mlp": 1.03397381, + "epoch": 0.9957011874342402, + "flos": 21689470897920.0, + "grad_norm": 1.7983304898046564, + "language_loss": 0.78763914, + "learning_rate": 1.9115218077470073e-10, + "loss": 0.80890214, + "num_input_tokens_seen": 357457155, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.63671875, + "step": 16561, + "time_per_iteration": 3.9634294509887695 + }, + { + "auxiliary_loss_clip": 0.01096518, + "auxiliary_loss_mlp": 0.01027682, + "balance_loss_clip": 1.01694107, + "balance_loss_mlp": 1.03466511, + "epoch": 0.9957613106869082, + "flos": 17702667619200.0, + "grad_norm": 3.643161069547379, + "language_loss": 0.65290403, + "learning_rate": 1.8580561458647614e-10, + "loss": 0.67414606, + "num_input_tokens_seen": 357468060, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.6171875, + "step": 16562, + "time_per_iteration": 2.406337022781372 + }, + { + "auxiliary_loss_clip": 0.01102342, + "auxiliary_loss_mlp": 0.01037884, + "balance_loss_clip": 1.02503276, + "balance_loss_mlp": 1.03487086, + "epoch": 0.9958214339395761, + "flos": 30555994671360.0, + "grad_norm": 1.7113343441863529, + "language_loss": 0.64638877, + "learning_rate": 1.805348815528962e-10, + "loss": 0.66779101, + "num_input_tokens_seen": 357489665, + "router_z_loss_clip": 0.12890625, + "router_z_loss_mlp": 0.67578125, + "step": 16563, + "time_per_iteration": 2.5361878871917725 + }, + { + "auxiliary_loss_clip": 0.01096492, + "auxiliary_loss_mlp": 0.01030567, + "balance_loss_clip": 1.01859283, + "balance_loss_mlp": 1.03311706, + "epoch": 0.9958815571922441, + "flos": 24169174306560.0, + "grad_norm": 1.5892081199071135, + "language_loss": 0.64616358, + "learning_rate": 1.7533998187380105e-10, + "loss": 0.66743422, + "num_input_tokens_seen": 357511975, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6328125, + "step": 16564, + "time_per_iteration": 2.5013222694396973 + }, + { + "auxiliary_loss_clip": 0.01097598, + "auxiliary_loss_mlp": 0.0102463, + "balance_loss_clip": 1.01393127, + "balance_loss_mlp": 1.03449404, + "epoch": 0.995941680444912, + "flos": 15487016065920.0, + "grad_norm": 1.8072037893855308, + "language_loss": 0.74071467, + "learning_rate": 1.7022091574636633e-10, + "loss": 0.76193696, + "num_input_tokens_seen": 357529345, + "router_z_loss_clip": 0.10693359, + "router_z_loss_mlp": 0.6328125, + "step": 16565, + "time_per_iteration": 2.428020715713501 + }, + { + "auxiliary_loss_clip": 0.01098477, + "auxiliary_loss_mlp": 0.01027986, + "balance_loss_clip": 1.01667905, + "balance_loss_mlp": 1.03254855, + "epoch": 0.9960018036975801, + "flos": 18621227145600.0, + "grad_norm": 1.7652422017737324, + "language_loss": 0.79023802, + "learning_rate": 1.6517768336443694e-10, + "loss": 0.8115027, + "num_input_tokens_seen": 357547615, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.66015625, + "step": 16566, + "time_per_iteration": 2.4422249794006348 + }, + { + "auxiliary_loss_clip": 0.01097067, + "auxiliary_loss_mlp": 0.01027779, + "balance_loss_clip": 1.01725245, + "balance_loss_mlp": 1.0328474, + "epoch": 0.996061926950248, + "flos": 20084120352000.0, + "grad_norm": 1.6111958794194645, + "language_loss": 0.70903325, + "learning_rate": 1.6021028491941535e-10, + "loss": 0.73028171, + "num_input_tokens_seen": 357567380, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.640625, + "step": 16567, + "time_per_iteration": 2.4520092010498047 + }, + { + "auxiliary_loss_clip": 0.01102031, + "auxiliary_loss_mlp": 0.01030294, + "balance_loss_clip": 1.01803339, + "balance_loss_mlp": 1.03490436, + "epoch": 0.996122050202916, + "flos": 24347829576960.0, + "grad_norm": 2.148548690092107, + "language_loss": 0.78551197, + "learning_rate": 1.5531872059959538e-10, + "loss": 0.80683523, + "num_input_tokens_seen": 357586435, + "router_z_loss_clip": 0.12255859, + "router_z_loss_mlp": 0.671875, + "step": 16568, + "time_per_iteration": 2.478513717651367 + }, + { + "auxiliary_loss_clip": 0.01095234, + "auxiliary_loss_mlp": 0.01026304, + "balance_loss_clip": 1.01617098, + "balance_loss_mlp": 1.03350139, + "epoch": 0.9961821734555839, + "flos": 24199302839040.0, + "grad_norm": 1.7226250915847214, + "language_loss": 0.81869441, + "learning_rate": 1.5050299059060634e-10, + "loss": 0.83990985, + "num_input_tokens_seen": 357604720, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6171875, + "step": 16569, + "time_per_iteration": 2.458204507827759 + }, + { + "auxiliary_loss_clip": 0.01098365, + "auxiliary_loss_mlp": 0.01029328, + "balance_loss_clip": 1.01803255, + "balance_loss_mlp": 1.03522015, + "epoch": 0.9962422967082519, + "flos": 22633741584000.0, + "grad_norm": 1.6857476071497695, + "language_loss": 0.70389342, + "learning_rate": 1.457630950747468e-10, + "loss": 0.72517037, + "num_input_tokens_seen": 357622345, + "router_z_loss_clip": 0.11279297, + "router_z_loss_mlp": 0.6328125, + "step": 16570, + "time_per_iteration": 2.4504377841949463 + }, + { + "auxiliary_loss_clip": 0.01098766, + "auxiliary_loss_mlp": 0.01026096, + "balance_loss_clip": 1.01469421, + "balance_loss_mlp": 1.0342046, + "epoch": 0.9963024199609198, + "flos": 26396030903040.0, + "grad_norm": 1.498051028683254, + "language_loss": 0.74896741, + "learning_rate": 1.4109903423209502e-10, + "loss": 0.77021599, + "num_input_tokens_seen": 357642710, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.64453125, + "step": 16571, + "time_per_iteration": 2.4885144233703613 + }, + { + "auxiliary_loss_clip": 0.01098144, + "auxiliary_loss_mlp": 0.01030562, + "balance_loss_clip": 1.018659, + "balance_loss_mlp": 1.0332427, + "epoch": 0.9963625432135879, + "flos": 16581537342720.0, + "grad_norm": 4.293039734836271, + "language_loss": 0.79286802, + "learning_rate": 1.3651080823939843e-10, + "loss": 0.81415516, + "num_input_tokens_seen": 357659870, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 16572, + "time_per_iteration": 2.422837495803833 + }, + { + "auxiliary_loss_clip": 0.01098765, + "auxiliary_loss_mlp": 0.01031228, + "balance_loss_clip": 1.01985526, + "balance_loss_mlp": 1.03418255, + "epoch": 0.9964226664662559, + "flos": 26468534505600.0, + "grad_norm": 1.7068316194851922, + "language_loss": 0.70099813, + "learning_rate": 1.3199841727074e-10, + "loss": 0.72229803, + "num_input_tokens_seen": 357677075, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.6484375, + "step": 16573, + "time_per_iteration": 2.474600076675415 + }, + { + "auxiliary_loss_clip": 0.0110281, + "auxiliary_loss_mlp": 0.01032815, + "balance_loss_clip": 1.02057242, + "balance_loss_mlp": 1.03448069, + "epoch": 0.9964827897189238, + "flos": 27448320764160.0, + "grad_norm": 3.1405745847261892, + "language_loss": 0.63359118, + "learning_rate": 1.275618614968721e-10, + "loss": 0.6549474, + "num_input_tokens_seen": 357696715, + "router_z_loss_clip": 0.12207031, + "router_z_loss_mlp": 0.68359375, + "step": 16574, + "time_per_iteration": 2.49106502532959 + }, + { + "auxiliary_loss_clip": 0.01105742, + "auxiliary_loss_mlp": 0.01031393, + "balance_loss_clip": 1.01886439, + "balance_loss_mlp": 1.03643692, + "epoch": 0.9965429129715918, + "flos": 11721566350080.0, + "grad_norm": 2.227500988407702, + "language_loss": 0.76397538, + "learning_rate": 1.2320114108654856e-10, + "loss": 0.78534675, + "num_input_tokens_seen": 357712345, + "router_z_loss_clip": 0.125, + "router_z_loss_mlp": 0.6953125, + "step": 16575, + "time_per_iteration": 2.4262399673461914 + }, + { + "auxiliary_loss_clip": 0.01098555, + "auxiliary_loss_mlp": 0.01026648, + "balance_loss_clip": 1.01506138, + "balance_loss_mlp": 1.03410196, + "epoch": 0.9966030362242597, + "flos": 19756004590080.0, + "grad_norm": 4.577550443890641, + "language_loss": 0.70150673, + "learning_rate": 1.1891625620474855e-10, + "loss": 0.72275877, + "num_input_tokens_seen": 357731815, + "router_z_loss_clip": 0.11572266, + "router_z_loss_mlp": 0.64453125, + "step": 16576, + "time_per_iteration": 2.451935291290283 + }, + { + "auxiliary_loss_clip": 0.01096621, + "auxiliary_loss_mlp": 0.01025454, + "balance_loss_clip": 1.01396775, + "balance_loss_mlp": 1.03349376, + "epoch": 0.9966631594769277, + "flos": 23915178259200.0, + "grad_norm": 1.6323404485871098, + "language_loss": 0.71913862, + "learning_rate": 1.1470720701400871e-10, + "loss": 0.74035937, + "num_input_tokens_seen": 357751640, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.62890625, + "step": 16577, + "time_per_iteration": 2.4703454971313477 + }, + { + "auxiliary_loss_clip": 0.01097745, + "auxiliary_loss_mlp": 0.01031675, + "balance_loss_clip": 1.02063036, + "balance_loss_mlp": 1.03272855, + "epoch": 0.9967232827295956, + "flos": 15559591495680.0, + "grad_norm": 2.0651068777650927, + "language_loss": 0.78223175, + "learning_rate": 1.1057399367397912e-10, + "loss": 0.80352592, + "num_input_tokens_seen": 357769850, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16578, + "time_per_iteration": 2.426480770111084 + }, + { + "auxiliary_loss_clip": 0.01099677, + "auxiliary_loss_mlp": 0.01027559, + "balance_loss_clip": 1.01651478, + "balance_loss_mlp": 1.03442752, + "epoch": 0.9967834059822637, + "flos": 20813035046400.0, + "grad_norm": 1.8452406195625735, + "language_loss": 0.76049864, + "learning_rate": 1.0651661634142328e-10, + "loss": 0.78177106, + "num_input_tokens_seen": 357789550, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65234375, + "step": 16579, + "time_per_iteration": 2.432826042175293 + }, + { + "auxiliary_loss_clip": 0.01102874, + "auxiliary_loss_mlp": 0.01036654, + "balance_loss_clip": 1.02306962, + "balance_loss_mlp": 1.03675032, + "epoch": 0.9968435292349316, + "flos": 36719234830080.0, + "grad_norm": 2.4609323658511135, + "language_loss": 0.69146717, + "learning_rate": 1.0253507516999604e-10, + "loss": 0.71286243, + "num_input_tokens_seen": 357809525, + "router_z_loss_clip": 0.13574219, + "router_z_loss_mlp": 0.66015625, + "step": 16580, + "time_per_iteration": 2.581434726715088 + }, + { + "auxiliary_loss_clip": 0.01098839, + "auxiliary_loss_mlp": 0.01026817, + "balance_loss_clip": 1.01583827, + "balance_loss_mlp": 1.03371871, + "epoch": 0.9969036524875996, + "flos": 26760919213440.0, + "grad_norm": 1.8507015089446737, + "language_loss": 0.79869235, + "learning_rate": 9.862937031113184e-11, + "loss": 0.81994891, + "num_input_tokens_seen": 357829795, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16581, + "time_per_iteration": 2.494203567504883 + }, + { + "auxiliary_loss_clip": 0.01096077, + "auxiliary_loss_mlp": 0.01026754, + "balance_loss_clip": 1.01657975, + "balance_loss_mlp": 1.03305769, + "epoch": 0.9969637757402675, + "flos": 24827237424000.0, + "grad_norm": 1.767690643264238, + "language_loss": 0.80186617, + "learning_rate": 9.479950191249031e-11, + "loss": 0.82309449, + "num_input_tokens_seen": 357851655, + "router_z_loss_clip": 0.1015625, + "router_z_loss_mlp": 0.6328125, + "step": 16582, + "time_per_iteration": 2.5011415481567383 + }, + { + "auxiliary_loss_clip": 0.0109477, + "auxiliary_loss_mlp": 0.0102737, + "balance_loss_clip": 1.01649189, + "balance_loss_mlp": 1.03291821, + "epoch": 0.9970238989929355, + "flos": 23038742407680.0, + "grad_norm": 1.569716612794735, + "language_loss": 0.60461831, + "learning_rate": 9.104547011951069e-11, + "loss": 0.62583971, + "num_input_tokens_seen": 357871205, + "router_z_loss_clip": 0.10888672, + "router_z_loss_mlp": 0.6171875, + "step": 16583, + "time_per_iteration": 2.4676523208618164 + }, + { + "auxiliary_loss_clip": 0.01099003, + "auxiliary_loss_mlp": 0.01034011, + "balance_loss_clip": 1.02270365, + "balance_loss_mlp": 1.03365004, + "epoch": 0.9970840222456034, + "flos": 25298816106240.0, + "grad_norm": 1.7140467406862439, + "language_loss": 0.77781087, + "learning_rate": 8.736727507452357e-11, + "loss": 0.79914105, + "num_input_tokens_seen": 357892145, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.65234375, + "step": 16584, + "time_per_iteration": 2.4774999618530273 + }, + { + "auxiliary_loss_clip": 0.01094708, + "auxiliary_loss_mlp": 0.01026725, + "balance_loss_clip": 1.01621103, + "balance_loss_mlp": 1.03186727, + "epoch": 0.9971441454982715, + "flos": 21615602578560.0, + "grad_norm": 1.4787158618998437, + "language_loss": 0.69567794, + "learning_rate": 8.376491691697297e-11, + "loss": 0.71689224, + "num_input_tokens_seen": 357911205, + "router_z_loss_clip": 0.10546875, + "router_z_loss_mlp": 0.62890625, + "step": 16585, + "time_per_iteration": 2.44138240814209 + }, + { + "auxiliary_loss_clip": 0.01098109, + "auxiliary_loss_mlp": 0.01029706, + "balance_loss_clip": 1.018399, + "balance_loss_mlp": 1.03426003, + "epoch": 0.9972042687509394, + "flos": 14975612179200.0, + "grad_norm": 2.2707359935417797, + "language_loss": 0.81493002, + "learning_rate": 8.023839578363834e-11, + "loss": 0.83620816, + "num_input_tokens_seen": 357928190, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.640625, + "step": 16586, + "time_per_iteration": 2.421546697616577 + }, + { + "auxiliary_loss_clip": 0.01099853, + "auxiliary_loss_mlp": 0.01030952, + "balance_loss_clip": 1.01990139, + "balance_loss_mlp": 1.03359437, + "epoch": 0.9972643920036074, + "flos": 25806664546560.0, + "grad_norm": 1.5732795893174074, + "language_loss": 0.778898, + "learning_rate": 7.678771180796851e-11, + "loss": 0.80020607, + "num_input_tokens_seen": 357946985, + "router_z_loss_clip": 0.11083984, + "router_z_loss_mlp": 0.6640625, + "step": 16587, + "time_per_iteration": 2.4762072563171387 + }, + { + "auxiliary_loss_clip": 0.01102564, + "auxiliary_loss_mlp": 0.01032709, + "balance_loss_clip": 1.02090156, + "balance_loss_mlp": 1.03507805, + "epoch": 0.9973245152562754, + "flos": 23326242865920.0, + "grad_norm": 1.9015065345921054, + "language_loss": 0.72213399, + "learning_rate": 7.341286512074773e-11, + "loss": 0.74348676, + "num_input_tokens_seen": 357966720, + "router_z_loss_clip": 0.11816406, + "router_z_loss_mlp": 0.671875, + "step": 16588, + "time_per_iteration": 2.4634549617767334 + }, + { + "auxiliary_loss_clip": 0.01104899, + "auxiliary_loss_mlp": 0.01025375, + "balance_loss_clip": 1.01331103, + "balance_loss_mlp": 1.03560162, + "epoch": 0.9973846385089433, + "flos": 12166212810240.0, + "grad_norm": 2.669215149095081, + "language_loss": 0.82404584, + "learning_rate": 7.011385585031781e-11, + "loss": 0.8453486, + "num_input_tokens_seen": 357981375, + "router_z_loss_clip": 0.12060547, + "router_z_loss_mlp": 0.6953125, + "step": 16589, + "time_per_iteration": 2.4119436740875244 + }, + { + "auxiliary_loss_clip": 0.01103307, + "auxiliary_loss_mlp": 0.01033409, + "balance_loss_clip": 1.02065957, + "balance_loss_mlp": 1.0352869, + "epoch": 0.9974447617616113, + "flos": 20045157073920.0, + "grad_norm": 1.9948573332908004, + "language_loss": 0.70658422, + "learning_rate": 6.689068412168986e-11, + "loss": 0.72795141, + "num_input_tokens_seen": 358000290, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16590, + "time_per_iteration": 2.4617862701416016 + }, + { + "auxiliary_loss_clip": 0.01102056, + "auxiliary_loss_mlp": 0.01026363, + "balance_loss_clip": 1.01451993, + "balance_loss_mlp": 1.03504336, + "epoch": 0.9975048850142793, + "flos": 32014614159360.0, + "grad_norm": 1.7363819209230626, + "language_loss": 0.63469762, + "learning_rate": 6.374335005676634e-11, + "loss": 0.65598178, + "num_input_tokens_seen": 358022075, + "router_z_loss_clip": 0.11865234, + "router_z_loss_mlp": 0.671875, + "step": 16591, + "time_per_iteration": 2.520002841949463 + }, + { + "auxiliary_loss_clip": 0.0109632, + "auxiliary_loss_mlp": 0.01026939, + "balance_loss_clip": 1.01592362, + "balance_loss_mlp": 1.0312531, + "epoch": 0.9975650082669473, + "flos": 36933728895360.0, + "grad_norm": 2.943724599512804, + "language_loss": 0.7296713, + "learning_rate": 6.067185377522933e-11, + "loss": 0.75090384, + "num_input_tokens_seen": 358043940, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.6484375, + "step": 16592, + "time_per_iteration": 2.5603761672973633 + }, + { + "auxiliary_loss_clip": 0.01100374, + "auxiliary_loss_mlp": 0.01028734, + "balance_loss_clip": 1.01680732, + "balance_loss_mlp": 1.03396702, + "epoch": 0.9976251315196152, + "flos": 16472117537280.0, + "grad_norm": 1.5542464002042724, + "language_loss": 0.85096574, + "learning_rate": 5.767619539343016e-11, + "loss": 0.87225676, + "num_input_tokens_seen": 358062720, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6640625, + "step": 16593, + "time_per_iteration": 2.429603099822998 + }, + { + "auxiliary_loss_clip": 0.01095115, + "auxiliary_loss_mlp": 0.01027135, + "balance_loss_clip": 1.01640582, + "balance_loss_mlp": 1.03307366, + "epoch": 0.9976852547722832, + "flos": 19646836179840.0, + "grad_norm": 1.6719903303496852, + "language_loss": 0.69481122, + "learning_rate": 5.4756375024833656e-11, + "loss": 0.71603376, + "num_input_tokens_seen": 358081560, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.62109375, + "step": 16594, + "time_per_iteration": 2.43540620803833 + }, + { + "auxiliary_loss_clip": 0.0110208, + "auxiliary_loss_mlp": 0.01024833, + "balance_loss_clip": 1.01338243, + "balance_loss_mlp": 1.03504872, + "epoch": 0.9977453780249511, + "flos": 20448434044800.0, + "grad_norm": 2.1178675700771166, + "language_loss": 0.72752357, + "learning_rate": 5.1912392780462113e-11, + "loss": 0.74879265, + "num_input_tokens_seen": 358099065, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.671875, + "step": 16595, + "time_per_iteration": 3.8551931381225586 + }, + { + "auxiliary_loss_clip": 0.01021727, + "auxiliary_loss_mlp": 0.01002197, + "balance_loss_clip": 1.00119519, + "balance_loss_mlp": 1.00168085, + "epoch": 0.9978055012776191, + "flos": 65455097581440.0, + "grad_norm": 0.7875605849663777, + "language_loss": 0.60373664, + "learning_rate": 4.9144248768007156e-11, + "loss": 0.62397587, + "num_input_tokens_seen": 358156095, + "router_z_loss_clip": 0.01000977, + "router_z_loss_mlp": 0.20117188, + "step": 16596, + "time_per_iteration": 2.9350359439849854 + }, + { + "auxiliary_loss_clip": 0.01099895, + "auxiliary_loss_mlp": 0.01029813, + "balance_loss_clip": 1.0180645, + "balance_loss_mlp": 1.03539467, + "epoch": 0.997865624530287, + "flos": 20631506688000.0, + "grad_norm": 3.029028334744603, + "language_loss": 0.77209026, + "learning_rate": 4.645194309227385e-11, + "loss": 0.79338735, + "num_input_tokens_seen": 358175230, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.64453125, + "step": 16597, + "time_per_iteration": 3.942023515701294 + }, + { + "auxiliary_loss_clip": 0.01099625, + "auxiliary_loss_mlp": 0.01028582, + "balance_loss_clip": 1.01690507, + "balance_loss_mlp": 1.03322697, + "epoch": 0.9979257477829551, + "flos": 29387102284800.0, + "grad_norm": 2.635377639422332, + "language_loss": 0.82367396, + "learning_rate": 4.383547585562475e-11, + "loss": 0.84495604, + "num_input_tokens_seen": 358197075, + "router_z_loss_clip": 0.11669922, + "router_z_loss_mlp": 0.6640625, + "step": 16598, + "time_per_iteration": 3.864666700363159 + }, + { + "auxiliary_loss_clip": 0.011023, + "auxiliary_loss_mlp": 0.01037017, + "balance_loss_clip": 1.02425003, + "balance_loss_mlp": 1.03410494, + "epoch": 0.997985871035623, + "flos": 22635070387200.0, + "grad_norm": 2.796907251913606, + "language_loss": 0.65109944, + "learning_rate": 4.129484715709175e-11, + "loss": 0.67249256, + "num_input_tokens_seen": 358215925, + "router_z_loss_clip": 0.12695312, + "router_z_loss_mlp": 0.6796875, + "step": 16599, + "time_per_iteration": 2.4528656005859375 + }, + { + "auxiliary_loss_clip": 0.01021765, + "auxiliary_loss_mlp": 0.0100066, + "balance_loss_clip": 0.99964696, + "balance_loss_mlp": 1.00174737, + "epoch": 0.998045994288291, + "flos": 61806968663040.0, + "grad_norm": 0.8527082276246827, + "language_loss": 0.62352717, + "learning_rate": 3.8830057093264256e-11, + "loss": 0.64375138, + "num_input_tokens_seen": 358269035, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.20019531, + "step": 16600, + "time_per_iteration": 2.9641520977020264 + }, + { + "auxiliary_loss_clip": 0.01098012, + "auxiliary_loss_mlp": 0.01028294, + "balance_loss_clip": 1.01810217, + "balance_loss_mlp": 1.03394604, + "epoch": 0.998106117540959, + "flos": 19245534456960.0, + "grad_norm": 1.8751595217258485, + "language_loss": 0.78435218, + "learning_rate": 3.644110575717896e-11, + "loss": 0.80561531, + "num_input_tokens_seen": 358287680, + "router_z_loss_clip": 0.10205078, + "router_z_loss_mlp": 0.640625, + "step": 16601, + "time_per_iteration": 2.4237499237060547 + }, + { + "auxiliary_loss_clip": 0.01103104, + "auxiliary_loss_mlp": 0.01029889, + "balance_loss_clip": 1.01854587, + "balance_loss_mlp": 1.0346154, + "epoch": 0.9981662407936269, + "flos": 21106209853440.0, + "grad_norm": 1.7881542375847135, + "language_loss": 0.82285678, + "learning_rate": 3.412799323987414e-11, + "loss": 0.84418672, + "num_input_tokens_seen": 358304080, + "router_z_loss_clip": 0.11328125, + "router_z_loss_mlp": 0.6875, + "step": 16602, + "time_per_iteration": 2.4423704147338867 + }, + { + "auxiliary_loss_clip": 0.01101136, + "auxiliary_loss_mlp": 0.01034599, + "balance_loss_clip": 1.02311897, + "balance_loss_mlp": 1.03557825, + "epoch": 0.998226364046295, + "flos": 24316839118080.0, + "grad_norm": 2.4534705060674966, + "language_loss": 0.62488025, + "learning_rate": 3.189071962883538e-11, + "loss": 0.64623755, + "num_input_tokens_seen": 358323670, + "router_z_loss_clip": 0.11425781, + "router_z_loss_mlp": 0.65234375, + "step": 16603, + "time_per_iteration": 3.9773411750793457 + }, + { + "auxiliary_loss_clip": 0.01099863, + "auxiliary_loss_mlp": 0.01026234, + "balance_loss_clip": 1.01466465, + "balance_loss_mlp": 1.0336225, + "epoch": 0.9982864872989629, + "flos": 23836389776640.0, + "grad_norm": 2.745471042635087, + "language_loss": 0.71030104, + "learning_rate": 2.972928500866168e-11, + "loss": 0.73156202, + "num_input_tokens_seen": 358341980, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.66015625, + "step": 16604, + "time_per_iteration": 2.465850353240967 + }, + { + "auxiliary_loss_clip": 0.01097913, + "auxiliary_loss_mlp": 0.01025948, + "balance_loss_clip": 1.01399732, + "balance_loss_mlp": 1.03297126, + "epoch": 0.9983466105516309, + "flos": 18333116156160.0, + "grad_norm": 1.5031183797260619, + "language_loss": 0.64503157, + "learning_rate": 2.7643689461953613e-11, + "loss": 0.66627014, + "num_input_tokens_seen": 358360400, + "router_z_loss_clip": 0.11914062, + "router_z_loss_mlp": 0.6484375, + "step": 16605, + "time_per_iteration": 2.513810873031616 + }, + { + "auxiliary_loss_clip": 0.01096491, + "auxiliary_loss_mlp": 0.01025316, + "balance_loss_clip": 1.01468289, + "balance_loss_mlp": 1.03334665, + "epoch": 0.9984067338042988, + "flos": 17236763285760.0, + "grad_norm": 1.7165174426414616, + "language_loss": 0.71259665, + "learning_rate": 2.5633933067092938e-11, + "loss": 0.73381472, + "num_input_tokens_seen": 358378990, + "router_z_loss_clip": 0.10595703, + "router_z_loss_mlp": 0.6328125, + "step": 16606, + "time_per_iteration": 2.466052770614624 + }, + { + "auxiliary_loss_clip": 0.01100332, + "auxiliary_loss_mlp": 0.01027992, + "balance_loss_clip": 1.01647663, + "balance_loss_mlp": 1.03490186, + "epoch": 0.9984668570569668, + "flos": 20667884186880.0, + "grad_norm": 2.2852105791991284, + "language_loss": 0.81897211, + "learning_rate": 2.370001590090709e-11, + "loss": 0.84025532, + "num_input_tokens_seen": 358395970, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 16607, + "time_per_iteration": 2.420513868331909 + }, + { + "auxiliary_loss_clip": 0.01098993, + "auxiliary_loss_mlp": 0.01030097, + "balance_loss_clip": 1.01795566, + "balance_loss_mlp": 1.03150964, + "epoch": 0.9985269803096347, + "flos": 30262532555520.0, + "grad_norm": 1.5818378676370355, + "language_loss": 0.67044789, + "learning_rate": 2.184193803622669e-11, + "loss": 0.69173878, + "num_input_tokens_seen": 358417355, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.67578125, + "step": 16608, + "time_per_iteration": 2.5308494567871094 + }, + { + "auxiliary_loss_clip": 0.0110093, + "auxiliary_loss_mlp": 0.01028257, + "balance_loss_clip": 1.01673532, + "balance_loss_mlp": 1.03548384, + "epoch": 0.9985871035623027, + "flos": 10560970005120.0, + "grad_norm": 1.9858303042603545, + "language_loss": 0.80386388, + "learning_rate": 2.0059699543883978e-11, + "loss": 0.82515574, + "num_input_tokens_seen": 358434345, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.65625, + "step": 16609, + "time_per_iteration": 2.4074668884277344 + }, + { + "auxiliary_loss_clip": 0.01098865, + "auxiliary_loss_mlp": 0.01033806, + "balance_loss_clip": 1.02219498, + "balance_loss_mlp": 1.03368092, + "epoch": 0.9986472268149706, + "flos": 16873455173760.0, + "grad_norm": 1.4407637215037619, + "language_loss": 0.625763, + "learning_rate": 1.8353300491158462e-11, + "loss": 0.64708972, + "num_input_tokens_seen": 358452870, + "router_z_loss_clip": 0.11621094, + "router_z_loss_mlp": 0.65234375, + "step": 16610, + "time_per_iteration": 2.4297850131988525 + }, + { + "auxiliary_loss_clip": 0.01098855, + "auxiliary_loss_mlp": 0.01033769, + "balance_loss_clip": 1.02262878, + "balance_loss_mlp": 1.03305161, + "epoch": 0.9987073500676387, + "flos": 22054538776320.0, + "grad_norm": 2.0766037550542165, + "language_loss": 0.67106199, + "learning_rate": 1.672274094288717e-11, + "loss": 0.69238824, + "num_input_tokens_seen": 358472210, + "router_z_loss_clip": 0.11132812, + "router_z_loss_mlp": 0.65625, + "step": 16611, + "time_per_iteration": 2.44804310798645 + }, + { + "auxiliary_loss_clip": 0.01098691, + "auxiliary_loss_mlp": 0.01032391, + "balance_loss_clip": 1.02062511, + "balance_loss_mlp": 1.03359318, + "epoch": 0.9987674733203066, + "flos": 30482880537600.0, + "grad_norm": 1.3989424316298207, + "language_loss": 0.69802946, + "learning_rate": 1.5168020961020544e-11, + "loss": 0.71934032, + "num_input_tokens_seen": 358493840, + "router_z_loss_clip": 0.1171875, + "router_z_loss_mlp": 0.6484375, + "step": 16612, + "time_per_iteration": 2.50903582572937 + }, + { + "auxiliary_loss_clip": 0.01096405, + "auxiliary_loss_mlp": 0.01026749, + "balance_loss_clip": 1.01591265, + "balance_loss_mlp": 1.03385317, + "epoch": 0.9988275965729746, + "flos": 27745230585600.0, + "grad_norm": 1.4328824978933166, + "language_loss": 0.74061179, + "learning_rate": 1.3689140604400407e-11, + "loss": 0.76184332, + "num_input_tokens_seen": 358515060, + "router_z_loss_clip": 0.10839844, + "router_z_loss_mlp": 0.625, + "step": 16613, + "time_per_iteration": 2.4886481761932373 + }, + { + "auxiliary_loss_clip": 0.01100248, + "auxiliary_loss_mlp": 0.01029146, + "balance_loss_clip": 1.01697528, + "balance_loss_mlp": 1.03376675, + "epoch": 0.9988877198256426, + "flos": 17524191916800.0, + "grad_norm": 1.906372378951036, + "language_loss": 0.73438096, + "learning_rate": 1.2286099928981996e-11, + "loss": 0.7556749, + "num_input_tokens_seen": 358528200, + "router_z_loss_clip": 0.12158203, + "router_z_loss_mlp": 0.6640625, + "step": 16614, + "time_per_iteration": 2.400599718093872 + }, + { + "auxiliary_loss_clip": 0.01098843, + "auxiliary_loss_mlp": 0.0103049, + "balance_loss_clip": 1.01953483, + "balance_loss_mlp": 1.03430223, + "epoch": 0.9989478430783105, + "flos": 20996502739200.0, + "grad_norm": 2.131115079088909, + "language_loss": 0.72789717, + "learning_rate": 1.0958898988278065e-11, + "loss": 0.74919045, + "num_input_tokens_seen": 358548360, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.64453125, + "step": 16615, + "time_per_iteration": 2.446946382522583 + }, + { + "auxiliary_loss_clip": 0.01101987, + "auxiliary_loss_mlp": 0.01028575, + "balance_loss_clip": 1.01719022, + "balance_loss_mlp": 1.03495038, + "epoch": 0.9990079663309785, + "flos": 13370620769280.0, + "grad_norm": 2.2411165544017155, + "language_loss": 0.77020514, + "learning_rate": 9.70753783247069e-12, + "loss": 0.79151082, + "num_input_tokens_seen": 358566270, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.671875, + "step": 16616, + "time_per_iteration": 2.412311553955078 + }, + { + "auxiliary_loss_clip": 0.01099813, + "auxiliary_loss_mlp": 0.01026941, + "balance_loss_clip": 1.01558042, + "balance_loss_mlp": 1.03469288, + "epoch": 0.9990680895836465, + "flos": 17310236555520.0, + "grad_norm": 1.899199296891262, + "language_loss": 0.83130789, + "learning_rate": 8.532016508855378e-12, + "loss": 0.85257542, + "num_input_tokens_seen": 358584710, + "router_z_loss_clip": 0.11376953, + "router_z_loss_mlp": 0.65234375, + "step": 16617, + "time_per_iteration": 2.410187005996704 + }, + { + "auxiliary_loss_clip": 0.01098748, + "auxiliary_loss_mlp": 0.01027064, + "balance_loss_clip": 1.01609123, + "balance_loss_mlp": 1.03413057, + "epoch": 0.9991282128363145, + "flos": 24207993930240.0, + "grad_norm": 1.546516443425981, + "language_loss": 0.78751385, + "learning_rate": 7.43233506206309e-12, + "loss": 0.80877197, + "num_input_tokens_seen": 358606750, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.6484375, + "step": 16618, + "time_per_iteration": 2.492341995239258 + }, + { + "auxiliary_loss_clip": 0.01096564, + "auxiliary_loss_mlp": 0.01028554, + "balance_loss_clip": 1.01755726, + "balance_loss_mlp": 1.0325985, + "epoch": 0.9991883360889824, + "flos": 21175301664000.0, + "grad_norm": 1.6586785536525817, + "language_loss": 0.75025094, + "learning_rate": 6.408493534060255e-12, + "loss": 0.77150214, + "num_input_tokens_seen": 358624675, + "router_z_loss_clip": 0.10986328, + "router_z_loss_mlp": 0.640625, + "step": 16619, + "time_per_iteration": 2.425902843475342 + }, + { + "auxiliary_loss_clip": 0.0109568, + "auxiliary_loss_mlp": 0.01024454, + "balance_loss_clip": 1.01400542, + "balance_loss_mlp": 1.03293276, + "epoch": 0.9992484593416504, + "flos": 19901155449600.0, + "grad_norm": 2.192896469394689, + "language_loss": 0.86634326, + "learning_rate": 5.460491963260594e-12, + "loss": 0.88754463, + "num_input_tokens_seen": 358640715, + "router_z_loss_clip": 0.10449219, + "router_z_loss_mlp": 0.625, + "step": 16620, + "time_per_iteration": 2.4410362243652344 + }, + { + "auxiliary_loss_clip": 0.01094412, + "auxiliary_loss_mlp": 0.01023895, + "balance_loss_clip": 1.01315451, + "balance_loss_mlp": 1.03186941, + "epoch": 0.9993085825943183, + "flos": 24857832833280.0, + "grad_norm": 2.495339856007808, + "language_loss": 0.72616214, + "learning_rate": 4.58833038607942e-12, + "loss": 0.74734521, + "num_input_tokens_seen": 358659630, + "router_z_loss_clip": 0.10742188, + "router_z_loss_mlp": 0.625, + "step": 16621, + "time_per_iteration": 2.4640777111053467 + }, + { + "auxiliary_loss_clip": 0.01021492, + "auxiliary_loss_mlp": 0.01001851, + "balance_loss_clip": 1.00083733, + "balance_loss_mlp": 1.00158083, + "epoch": 0.9993687058469863, + "flos": 71284478780160.0, + "grad_norm": 0.742951217082793, + "language_loss": 0.56556338, + "learning_rate": 3.79200883515729e-12, + "loss": 0.58579683, + "num_input_tokens_seen": 358727840, + "router_z_loss_clip": 0.01013184, + "router_z_loss_mlp": 0.19921875, + "step": 16622, + "time_per_iteration": 3.2356338500976562 + }, + { + "auxiliary_loss_clip": 0.01099663, + "auxiliary_loss_mlp": 0.01026565, + "balance_loss_clip": 1.01551473, + "balance_loss_mlp": 1.03389001, + "epoch": 0.9994288290996542, + "flos": 12199573566720.0, + "grad_norm": 1.8117566744046234, + "language_loss": 0.71488571, + "learning_rate": 3.071527340914315e-12, + "loss": 0.736148, + "num_input_tokens_seen": 358744125, + "router_z_loss_clip": 0.11035156, + "router_z_loss_mlp": 0.65625, + "step": 16623, + "time_per_iteration": 2.4421582221984863 + }, + { + "auxiliary_loss_clip": 0.01097804, + "auxiliary_loss_mlp": 0.01029483, + "balance_loss_clip": 1.0171864, + "balance_loss_mlp": 1.03373384, + "epoch": 0.9994889523523223, + "flos": 17889942153600.0, + "grad_norm": 2.5657378240797284, + "language_loss": 0.75026071, + "learning_rate": 2.4268859304399368e-12, + "loss": 0.77153361, + "num_input_tokens_seen": 358761420, + "router_z_loss_clip": 0.12304688, + "router_z_loss_mlp": 0.640625, + "step": 16624, + "time_per_iteration": 2.433236598968506 + }, + { + "auxiliary_loss_clip": 0.01097329, + "auxiliary_loss_mlp": 0.01028905, + "balance_loss_clip": 1.01688886, + "balance_loss_mlp": 1.03219914, + "epoch": 0.9995490756049902, + "flos": 26578888064640.0, + "grad_norm": 1.446379729117076, + "language_loss": 0.73516172, + "learning_rate": 1.8580846286031514e-12, + "loss": 0.75642407, + "num_input_tokens_seen": 358782600, + "router_z_loss_clip": 0.12011719, + "router_z_loss_mlp": 0.65234375, + "step": 16625, + "time_per_iteration": 2.4915857315063477 + }, + { + "auxiliary_loss_clip": 0.01095797, + "auxiliary_loss_mlp": 0.01031437, + "balance_loss_clip": 1.02048731, + "balance_loss_mlp": 1.03293371, + "epoch": 0.9996091988576582, + "flos": 22200048771840.0, + "grad_norm": 2.480327678913786, + "language_loss": 0.76776922, + "learning_rate": 1.3651234567202408e-12, + "loss": 0.78904152, + "num_input_tokens_seen": 358801220, + "router_z_loss_clip": 0.109375, + "router_z_loss_mlp": 0.62890625, + "step": 16626, + "time_per_iteration": 2.4423091411590576 + }, + { + "auxiliary_loss_clip": 0.01097122, + "auxiliary_loss_mlp": 0.01031383, + "balance_loss_clip": 1.02014768, + "balance_loss_mlp": 1.03396559, + "epoch": 0.9996693221103262, + "flos": 27373195468800.0, + "grad_norm": 1.7033964365476697, + "language_loss": 0.82272637, + "learning_rate": 9.480024334429515e-13, + "loss": 0.84401143, + "num_input_tokens_seen": 358819190, + "router_z_loss_clip": 0.11230469, + "router_z_loss_mlp": 0.6328125, + "step": 16627, + "time_per_iteration": 2.486177444458008 + }, + { + "auxiliary_loss_clip": 0.0110276, + "auxiliary_loss_mlp": 0.01033814, + "balance_loss_clip": 1.02122557, + "balance_loss_mlp": 1.03508389, + "epoch": 0.9997294453629941, + "flos": 26870410846080.0, + "grad_norm": 2.0058857890661663, + "language_loss": 0.71033239, + "learning_rate": 6.067215747584952e-13, + "loss": 0.73169816, + "num_input_tokens_seen": 358839850, + "router_z_loss_clip": 0.12597656, + "router_z_loss_mlp": 0.67578125, + "step": 16628, + "time_per_iteration": 2.4887261390686035 + }, + { + "auxiliary_loss_clip": 0.01099628, + "auxiliary_loss_mlp": 0.01029216, + "balance_loss_clip": 1.01746821, + "balance_loss_mlp": 1.03319097, + "epoch": 0.9997895686156621, + "flos": 23476996247040.0, + "grad_norm": 1.6302666729818955, + "language_loss": 0.7536037, + "learning_rate": 3.4128089332341456e-13, + "loss": 0.77489209, + "num_input_tokens_seen": 358859805, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.6640625, + "step": 16629, + "time_per_iteration": 2.4460158348083496 + }, + { + "auxiliary_loss_clip": 0.01102553, + "auxiliary_loss_mlp": 0.01031537, + "balance_loss_clip": 1.01974154, + "balance_loss_mlp": 1.03498006, + "epoch": 0.9998496918683301, + "flos": 20224961579520.0, + "grad_norm": 2.824647811709247, + "language_loss": 0.60427022, + "learning_rate": 1.5168039935176126e-13, + "loss": 0.62561107, + "num_input_tokens_seen": 358877900, + "router_z_loss_clip": 0.11767578, + "router_z_loss_mlp": 0.67578125, + "step": 16630, + "time_per_iteration": 2.4238274097442627 + }, + { + "auxiliary_loss_clip": 0.01100925, + "auxiliary_loss_mlp": 0.0102694, + "balance_loss_clip": 1.01544178, + "balance_loss_mlp": 1.03468835, + "epoch": 0.9999098151209981, + "flos": 21652913831040.0, + "grad_norm": 2.316151523286849, + "language_loss": 0.60503012, + "learning_rate": 3.792010017100722e-14, + "loss": 0.62630868, + "num_input_tokens_seen": 358897285, + "router_z_loss_clip": 0.11523438, + "router_z_loss_mlp": 0.6640625, + "step": 16631, + "time_per_iteration": 2.4369335174560547 + }, + { + "auxiliary_loss_clip": 0.0109617, + "auxiliary_loss_mlp": 0.01023912, + "balance_loss_clip": 1.01379728, + "balance_loss_mlp": 1.03328824, + "epoch": 0.999969938373666, + "flos": 11544599018880.0, + "grad_norm": 1.9730275609263277, + "language_loss": 0.72405601, + "learning_rate": 0.0, + "loss": 0.74525678, + "num_input_tokens_seen": 358911570, + "router_z_loss_clip": 0.10107422, + "router_z_loss_mlp": 0.62890625, + "step": 16632, + "time_per_iteration": 2.38352370262146 + }, + { + "epoch": 0.999969938373666, + "num_input_tokens_seen": 358911570, + "step": 16632, + "total_flos": 1.3992168911420785e+18, + "train_loss": 0.6051551120310033, + "train_runtime": 35922.4643, + "train_samples_per_second": 18.52, + "train_steps_per_second": 0.463 + } + ], + "logging_steps": 1.0, + "max_steps": 16632, + "num_input_tokens_seen": 358911570, + "num_train_epochs": 1, + "save_steps": 3328, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3992168911420785e+18, + "train_batch_size": 5, + "trial_name": null, + "trial_params": null +} diff --git a/sft/Full_new_smoe_sigmoidgating/training_args.bin b/sft/Full_new_smoe_sigmoidgating/training_args.bin new file mode 100644 index 0000000000000000000000000000000000000000..6b9a73eb97a1ef37776f0d97a0590d802e6f8d5a --- /dev/null +++ b/sft/Full_new_smoe_sigmoidgating/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a0c59c7a64d6e018f6d41a91f3e718772a260e91597586a7ce64cd9f7d3d0c6 +size 7992